MachineBlockPlacement.cpp source code [llvm_projects/llvm/lib/CodeGen/MachineBlockPlacement.cpp]

1	//===- MachineBlockPlacement.cpp - Basic Block Code Layout optimization ---===//
2	//
3	// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4	// See https://llvm.org/LICENSE.txt for license information.
5	// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6	//
7	//===----------------------------------------------------------------------===//
8	//
9	// This file implements basic block placement transformations using the CFG
10	// structure and branch probability estimates.
11	//
12	// The pass strives to preserve the structure of the CFG (that is, retain
13	// a topological ordering of basic blocks) in the absence of a strong* signal*
14	// to the contrary from probabilities. However, within the CFG structure, it
15	// attempts to choose an ordering which favors placing more likely sequences of
16	// blocks adjacent to each other.
17	//
18	// The algorithm works from the inner-most loop within a function outward, and
19	// at each stage walks through the basic blocks, trying to coalesce them into
20	// sequential chains where allowed by the CFG (or demanded by heavy
21	// probabilities). Finally, it walks the blocks in topological order, and the
22	// first time it reaches a chain of basic blocks, it schedules them in the
23	// function in-order.
24	//
25	//===----------------------------------------------------------------------===//
26
27	#include "llvm/CodeGen/MachineBlockPlacement.h"
28	#include "BranchFolding.h"
29	#include "llvm/ADT/ArrayRef.h"
30	#include "llvm/ADT/DenseMap.h"
31	#include "llvm/ADT/STLExtras.h"
32	#include "llvm/ADT/SetVector.h"
33	#include "llvm/ADT/SmallPtrSet.h"
34	#include "llvm/ADT/SmallVector.h"
35	#include "llvm/ADT/Statistic.h"
36	#include "llvm/Analysis/BlockFrequencyInfoImpl.h"
37	#include "llvm/Analysis/ProfileSummaryInfo.h"
38	#include "llvm/CodeGen/MBFIWrapper.h"
39	#include "llvm/CodeGen/MachineBasicBlock.h"
40	#include "llvm/CodeGen/MachineBlockFrequencyInfo.h"
41	#include "llvm/CodeGen/MachineBranchProbabilityInfo.h"
42	#include "llvm/CodeGen/MachineFunction.h"
43	#include "llvm/CodeGen/MachineFunctionPass.h"
44	#include "llvm/CodeGen/MachineLoopInfo.h"
45	#include "llvm/CodeGen/MachinePostDominators.h"
46	#include "llvm/CodeGen/MachineSizeOpts.h"
47	#include "llvm/CodeGen/TailDuplicator.h"
48	#include "llvm/CodeGen/TargetInstrInfo.h"
49	#include "llvm/CodeGen/TargetLowering.h"
50	#include "llvm/CodeGen/TargetPassConfig.h"
51	#include "llvm/CodeGen/TargetSubtargetInfo.h"
52	#include "llvm/IR/DebugLoc.h"
53	#include "llvm/IR/Function.h"
54	#include "llvm/IR/PrintPasses.h"
55	#include "llvm/InitializePasses.h"
56	#include "llvm/Pass.h"
57	#include "llvm/Support/Allocator.h"
58	#include "llvm/Support/BlockFrequency.h"
59	#include "llvm/Support/BranchProbability.h"
60	#include "llvm/Support/CodeGen.h"
61	#include "llvm/Support/CommandLine.h"
62	#include "llvm/Support/Compiler.h"
63	#include "llvm/Support/Debug.h"
64	#include "llvm/Support/raw_ostream.h"
65	#include "llvm/Target/TargetMachine.h"
66	#include "llvm/Transforms/Utils/CodeLayout.h"
67	#include <algorithm>
68	#include <cassert>
69	#include <cstdint>
70	#include <iterator>
71	#include <memory>
72	#include <string>
73	#include <tuple>
74	#include <utility>
75	#include <vector>
76
77	using namespace llvm;
78
79	#define DEBUG_TYPE "block-placement"
80
81	STATISTIC(NumCondBranches, "Number of conditional branches");
82	STATISTIC(NumUncondBranches, "Number of unconditional branches");
83	STATISTIC(CondBranchTakenFreq,
84	"Potential frequency of taking conditional branches");
85	STATISTIC(UncondBranchTakenFreq,
86	"Potential frequency of taking unconditional branches");
87
88	static cl::opt<unsigned> AlignAllBlock(
89	"align-all-blocks",
90	cl::desc ("Force the alignment of all blocks in the function in log2 format "
91	"(e.g 4 means align on 16B boundaries)."),
92	cl::init(Val: `0`), cl::Hidden);
93
94	static cl::opt<unsigned> AlignAllNonFallThruBlocks(
95	"align-all-nofallthru-blocks",
96	cl::desc ("Force the alignment of all blocks that have no fall-through "
97	"predecessors (i.e. don't add nops that are executed). In log2 "
98	"format (e.g 4 means align on 16B boundaries)."),
99	cl::init(Val: `0`), cl::Hidden);
100
101	static cl::opt<unsigned> MaxBytesForAlignmentOverride(
102	"max-bytes-for-alignment",
103	cl::desc ("Forces the maximum bytes allowed to be emitted when padding for "
104	"alignment"),
105	cl::init(Val: `0`), cl::Hidden);
106
107	static cl::opt<unsigned> PredecessorLimit(
108	"block-placement-predecessor-limit",
109	cl::desc ("For blocks with more predecessors, certain layout optimizations"
110	"will be disabled to prevent quadratic compile time."),
111	cl::init(Val: `1000`), cl::Hidden);
112
113	// FIXME: Find a good default for this flag and remove the flag.
114	static cl::opt<unsigned> ExitBlockBias(
115	"block-placement-exit-block-bias",
116	cl::desc ("Block frequency percentage a loop exit block needs "
117	"over the original exit to be considered the new exit."),
118	cl::init(Val: `0`), cl::Hidden);
119
120	// Definition:
121	// - Outlining: placement of a basic block outside the chain or hot path.
122
123	static cl::opt<unsigned> LoopToColdBlockRatio(
124	"loop-to-cold-block-ratio",
125	cl::desc ("Outline loop blocks from loop chain if (frequency of loop) / "
126	"(frequency of block) is greater than this ratio"),
127	cl::init(Val: `5`), cl::Hidden);
128
129	static cl::opt<bool>
130	ForceLoopColdBlock("force-loop-cold-block",
131	cl::desc ("Force outlining cold blocks from loops."),
132	cl::init(Val: false), cl::Hidden);
133
134	static cl::opt<bool>
135	PreciseRotationCost("precise-rotation-cost",
136	cl::desc ("Model the cost of loop rotation more "
137	"precisely by using profile data."),
138	cl::init(Val: false), cl::Hidden);
139
140	static cl::opt<bool>
141	ForcePreciseRotationCost("force-precise-rotation-cost",
142	cl::desc ("Force the use of precise cost "
143	"loop rotation strategy."),
144	cl::init(Val: false), cl::Hidden);
145
146	static cl::opt<unsigned> MisfetchCost(
147	"misfetch-cost",
148	cl::desc ("Cost that models the probabilistic risk of an instruction "
149	"misfetch due to a jump comparing to falling through, whose cost "
150	"is zero."),
151	cl::init(Val: `1`), cl::Hidden);
152
153	static cl::opt<unsigned> JumpInstCost("jump-inst-cost",
154	cl::desc ("Cost of jump instructions."),
155	cl::init(Val: `1`), cl::Hidden);
156	static cl::opt<bool>
157	TailDupPlacement("tail-dup-placement",
158	cl::desc ("Perform tail duplication during placement. "
159	"Creates more fallthrough opportunities in "
160	"outline branches."),
161	cl::init(Val: true), cl::Hidden);
162
163	static cl::opt<bool>
164	BranchFoldPlacement("branch-fold-placement",
165	cl::desc ("Perform branch folding during placement. "
166	"Reduces code size."),
167	cl::init(Val: true), cl::Hidden);
168
169	// Heuristic for tail duplication.
170	static cl::opt<unsigned> TailDupPlacementThreshold(
171	"tail-dup-placement-threshold",
172	cl::desc ("Instruction cutoff for tail duplication during layout. "
173	"Tail merging during layout is forced to have a threshold "
174	"that won't conflict."),
175	cl::init(Val: `2`), cl::Hidden);
176
177	// Heuristic for aggressive tail duplication.
178	static cl::opt<unsigned> TailDupPlacementAggressiveThreshold(
179	"tail-dup-placement-aggressive-threshold",
180	cl::desc ("Instruction cutoff for aggressive tail duplication during "
181	"layout. Used at -O3. Tail merging during layout is forced to "
182	"have a threshold that won't conflict."),
183	cl::init(Val: `4`), cl::Hidden);
184
185	// Heuristic for tail duplication.
186	static cl::opt<unsigned> TailDupPlacementPenalty(
187	"tail-dup-placement-penalty",
188	cl::desc (
189	"Cost penalty for blocks that can avoid breaking CFG by copying. "
190	"Copying can increase fallthrough, but it also increases icache "
191	"pressure. This parameter controls the penalty to account for that. "
192	"Percent as integer."),
193	cl::init(Val: `2`), cl::Hidden);
194
195	// Heuristic for tail duplication if profile count is used in cost model.
196	static cl::opt<unsigned> TailDupProfilePercentThreshold(
197	"tail-dup-profile-percent-threshold",
198	cl::desc ("If profile count information is used in tail duplication cost "
199	"model, the gained fall through number from tail duplication "
200	"should be at least this percent of hot count."),
201	cl::init(Val: `50`), cl::Hidden);
202
203	// Heuristic for triangle chains.
204	static cl::opt<unsigned> TriangleChainCount(
205	"triangle-chain-count",
206	cl::desc ("Number of triangle-shaped-CFG's that need to be in a row for the "
207	"triangle tail duplication heuristic to kick in. 0 to disable."),
208	cl::init(Val: `2`), cl::Hidden);
209
210	// Use case: When block layout is visualized after MBP pass, the basic blocks
211	// are labeled in layout order; meanwhile blocks could be numbered in a
212	// different order. It's hard to map between the graph and pass output.
213	// With this option on, the basic blocks are renumbered in function layout
214	// order. For debugging only.
215	static cl::opt<bool> RenumberBlocksBeforeView(
216	"renumber-blocks-before-view",
217	cl::desc (
218	"If true, basic blocks are re-numbered before MBP layout is printed "
219	"into a dot graph. Only used when a function is being printed."),
220	cl::init(Val: false), cl::Hidden);
221
222	static cl::opt<unsigned> ExtTspBlockPlacementMaxBlocks(
223	"ext-tsp-block-placement-max-blocks",
224	cl::desc ("Maximum number of basic blocks in a function to run ext-TSP "
225	"block placement."),
226	cl::init(UINT_MAX), cl::Hidden);
227
228	// Apply the ext-tsp algorithm minimizing the size of a binary.
229	static cl::opt<bool>
230	ApplyExtTspForSize("apply-ext-tsp-for-size", cl::init(Val: false), cl::Hidden,
231	cl::desc ("Use ext-tsp for size-aware block placement."));
232
233	namespace llvm {
234	extern cl::opt<bool> EnableExtTspBlockPlacement;
235	extern cl::opt<bool> ApplyExtTspWithoutProfile;
236	extern cl::opt<unsigned> StaticLikelyProb;
237	extern cl::opt<unsigned> ProfileLikelyProb;
238
239	// Internal option used to control BFI display only after MBP pass.
240	// Defined in CodeGen/MachineBlockFrequencyInfo.cpp:
241	// -view-block-layout-with-bfi=
242	extern cl::opt<GVDAGType> ViewBlockLayoutWithBFI;
243
244	// Command line option to specify the name of the function for CFG dump
245	// Defined in Analysis/BlockFrequencyInfo.cpp: -view-bfi-func-name=
246	extern cl::opt<std::string> ViewBlockFreqFuncName;
247	} // namespace llvm
248
249	namespace {
250
251	class BlockChain;
252
253	/// Type for our function-wide basic block -> block chain mapping.
254	using BlockToChainMapType = DenseMap<const MachineBasicBlock , BlockChain >;
255
256	/// A chain of blocks which will be laid out contiguously.
257	///
258	/// This is the datastructure representing a chain of consecutive blocks that
259	/// are profitable to layout together in order to maximize fallthrough
260	/// probabilities and code locality. We also can use a block chain to represent
261	/// a sequence of basic blocks which have some external (correctness)
262	/// requirement for sequential layout.
263	///
264	/// Chains can be built around a single basic block and can be merged to grow
265	/// them. They participate in a block-to-chain mapping, which is updated
266	/// automatically as chains are merged together.
267	class BlockChain {
268	/// The sequence of blocks belonging to this chain.
269	///
270	/// This is the sequence of blocks for a particular chain. These will be laid
271	/// out in-order within the function.
272	SmallVector<MachineBasicBlock *, `4`> Blocks;
273
274	/// A handle to the function-wide basic block to block chain mapping.
275	///
276	/// This is retained in each block chain to simplify the computation of child
277	/// block chains for SCC-formation and iteration. We store the edges to child
278	/// basic blocks, and map them back to their associated chains using this
279	/// structure.
280	BlockToChainMapType &BlockToChain;
281
282	public:
283	/// Construct a new BlockChain.
284	///
285	/// This builds a new block chain representing a single basic block in the
286	/// function. It also registers itself as the chain that block participates
287	/// in with the BlockToChain mapping.
288	BlockChain(BlockToChainMapType &BlockToChain, MachineBasicBlock *BB)
289	: Blocks (`1`, BB), BlockToChain(BlockToChain) {
290	assert(BB && "Cannot create a chain with a null basic block");
291	BlockToChain [BB] = this;
292	}
293
294	/// Iterator over blocks within the chain.
295	using iterator = SmallVectorImpl<MachineBasicBlock *>::iterator;
296	using const_iterator = SmallVectorImpl<MachineBasicBlock *>::const_iterator;
297
298	/// Beginning of blocks within the chain.
299	iterator begin() { return Blocks.begin(); }
300	const_iterator begin() const { return Blocks.begin(); }
301
302	/// End of blocks within the chain.
303	iterator end() { return Blocks.end(); }
304	const_iterator end() const { return Blocks.end(); }
305
306	bool remove(MachineBasicBlock *BB) {
307	for (iterator i = begin(); i != end(); ++i) {
308	if (*i == BB) {
309	Blocks.erase(CI: i);
310	return true;
311	}
312	}
313	return false;
314	}
315
316	/// Merge a block chain into this one.
317	///
318	/// This routine merges a block chain into this one. It takes care of forming
319	/// a contiguous sequence of basic blocks, updating the edge list, and
320	/// updating the block -> chain mapping. It does not free or tear down the
321	/// old chain, but the old chain's block list is no longer valid.
322	void merge(MachineBasicBlock BB, BlockChain Chain) {
323	assert(BB && "Can't merge a null block.");
324	assert(!Blocks.empty() && "Can't merge into an empty chain.");
325
326	// Fast path in case we don't have a chain already.
327	if (!Chain) {
328	assert(!BlockToChain[BB] &&
329	"Passed chain is null, but BB has entry in BlockToChain.");
330	Blocks.push_back(Elt: BB);
331	BlockToChain [BB] = this;
332	return;
333	}
334
335	assert(BB == *Chain->begin() && "Passed BB is not head of Chain.");
336	assert(Chain->begin() != Chain->end());
337
338	// Update the incoming blocks to point to this chain, and add them to the
339	// chain structure.
340	for (MachineBasicBlock ChainBB : Chain) {
341	Blocks.push_back(Elt: ChainBB);
342	assert(BlockToChain[ChainBB] == Chain && "Incoming blocks not in chain.");
343	BlockToChain [ChainBB] = this;
344	}
345	}
346
347	#ifndef NDEBUG
348	/// Dump the blocks in this chain.
349	LLVM_DUMP_METHOD void dump() {
350	for (MachineBasicBlock MBB : this)
351	MBB->dump();
352	}
353	#endif // NDEBUG
354
355	/// Count of predecessors of any block within the chain which have not
356	/// yet been scheduled. In general, we will delay scheduling this chain
357	/// until those predecessors are scheduled (or we find a sufficiently good
358	/// reason to override this heuristic.) Note that when forming loop chains,
359	/// blocks outside the loop are ignored and treated as if they were already
360	/// scheduled.
361	///
362	/// Note: This field is reinitialized multiple times - once for each loop,
363	/// and then once for the function as a whole.
364	unsigned UnscheduledPredecessors = `0`;
365	};
366
367	class MachineBlockPlacement {
368	/// A type for a block filter set.
369	using BlockFilterSet = SmallSetVector<const MachineBasicBlock *, `16`>;
370
371	/// Pair struct containing basic block and taildup profitability
372	struct BlockAndTailDupResult {
373	MachineBasicBlock BB = nullptr*;
374	bool ShouldTailDup;
375	};
376
377	/// Triple struct containing edge weight and the edge.
378	struct WeightedEdge {
379	BlockFrequency Weight;
380	MachineBasicBlock Src = nullptr*;
381	MachineBasicBlock Dest = nullptr*;
382	};
383
384	/// work lists of blocks that are ready to be laid out
385	SmallVector<MachineBasicBlock *, `16`> BlockWorkList;
386	SmallVector<MachineBasicBlock *, `16`> EHPadWorkList;
387
388	/// Edges that have already been computed as optimal.
389	DenseMap<const MachineBasicBlock *, BlockAndTailDupResult> ComputedEdges;
390
391	/// Machine Function
392	MachineFunction F = nullptr*;
393
394	/// A handle to the branch probability pass.
395	const MachineBranchProbabilityInfo MBPI = nullptr*;
396
397	/// A handle to the function-wide block frequency pass.
398	std::unique_ptr<MBFIWrapper> MBFI;
399
400	/// A handle to the loop info.
401	MachineLoopInfo MLI = nullptr*;
402
403	/// Preferred loop exit.
404	/// Member variable for convenience. It may be removed by duplication deep
405	/// in the call stack.
406	MachineBasicBlock PreferredLoopExit = nullptr*;
407
408	/// A handle to the target's instruction info.
409	const TargetInstrInfo TII = nullptr*;
410
411	/// A handle to the target's lowering info.
412	const TargetLoweringBase TLI = nullptr*;
413
414	/// A handle to the post dominator tree.
415	MachinePostDominatorTree MPDT = nullptr*;
416
417	ProfileSummaryInfo PSI = nullptr*;
418
419	// Tail merging is also determined based on
420	// whether structured CFG is required.
421	bool AllowTailMerge;
422
423	CodeGenOptLevel OptLevel;
424
425	/// Duplicator used to duplicate tails during placement.
426	///
427	/// Placement decisions can open up new tail duplication opportunities, but
428	/// since tail duplication affects placement decisions of later blocks, it
429	/// must be done inline.
430	TailDuplicator TailDup;
431
432	/// Partial tail duplication threshold.
433	BlockFrequency DupThreshold;
434
435	unsigned TailDupSize;
436
437	/// True: use block profile count to compute tail duplication cost.
438	/// False: use block frequency to compute tail duplication cost.
439	bool UseProfileCount = false;
440
441	/// Allocator and owner of BlockChain structures.
442	///
443	/// We build BlockChains lazily while processing the loop structure of
444	/// a function. To reduce malloc traffic, we allocate them using this
445	/// slab-like allocator, and destroy them after the pass completes. An
446	/// important guarantee is that this allocator produces stable pointers to
447	/// the chains.
448	SpecificBumpPtrAllocator<BlockChain> ChainAllocator;
449
450	/// Function wide BasicBlock to BlockChain mapping.
451	///
452	/// This mapping allows efficiently moving from any given basic block to the
453	/// BlockChain it participates in, if any. We use it to, among other things,
454	/// allow implicitly defining edges between chains as the existing edges
455	/// between basic blocks.
456	DenseMap<const MachineBasicBlock , BlockChain > BlockToChain;
457
458	#ifndef NDEBUG
459	/// The set of basic blocks that have terminators that cannot be fully
460	/// analyzed. These basic blocks cannot be re-ordered safely by
461	/// MachineBlockPlacement, and we must preserve physical layout of these
462	/// blocks and their successors through the pass.
463	SmallPtrSet<MachineBasicBlock *, `4`> BlocksWithUnanalyzableExits;
464	#endif
465
466	/// Get block profile count or frequency according to UseProfileCount.
467	/// The return value is used to model tail duplication cost.
468	BlockFrequency getBlockCountOrFrequency(const MachineBasicBlock *BB) {
469	if (UseProfileCount) {
470	auto Count = MBFI ->getBlockProfileCount(MBB: BB);
471	if (Count)
472	return BlockFrequency (*Count);
473	else
474	return BlockFrequency (`0`);
475	} else
476	return MBFI ->getBlockFreq(MBB: BB);
477	}
478
479	/// Scale the DupThreshold according to basic block size.
480	BlockFrequency scaleThreshold(MachineBasicBlock *BB);
481	void initTailDupThreshold();
482
483	/// Decrease the UnscheduledPredecessors count for all blocks in chain, and
484	/// if the count goes to 0, add them to the appropriate work list.
485	void markChainSuccessors(const BlockChain &Chain,
486	const MachineBasicBlock *LoopHeaderBB,
487	const BlockFilterSet BlockFilter = nullptr*);
488
489	/// Decrease the UnscheduledPredecessors count for a single block, and
490	/// if the count goes to 0, add them to the appropriate work list.
491	void markBlockSuccessors(const BlockChain &Chain, const MachineBasicBlock *BB,
492	const MachineBasicBlock *LoopHeaderBB,
493	const BlockFilterSet BlockFilter = nullptr*);
494
495	BranchProbability
496	collectViableSuccessors(const MachineBasicBlock BB, const* BlockChain &Chain,
497	const BlockFilterSet *BlockFilter,
498	SmallVector<MachineBasicBlock *, `4`> &Successors);
499	bool isBestSuccessor(MachineBasicBlock BB, MachineBasicBlock Pred,
500	BlockFilterSet *BlockFilter);
501	void findDuplicateCandidates(SmallVectorImpl<MachineBasicBlock *> &Candidates,
502	MachineBasicBlock *BB,
503	BlockFilterSet *BlockFilter);
504	bool repeatedlyTailDuplicateBlock(
505	MachineBasicBlock BB, MachineBasicBlock &LPred,
506	const MachineBasicBlock *LoopHeaderBB, BlockChain &Chain,
507	BlockFilterSet *BlockFilter,
508	MachineFunction::iterator &PrevUnplacedBlockIt,
509	BlockFilterSet::iterator &PrevUnplacedBlockInFilterIt);
510	bool
511	maybeTailDuplicateBlock(MachineBasicBlock BB, MachineBasicBlock LPred,
512	BlockChain &Chain, BlockFilterSet *BlockFilter,
513	MachineFunction::iterator &PrevUnplacedBlockIt,
514	BlockFilterSet::iterator &PrevUnplacedBlockInFilterIt,
515	bool &DuplicatedToLPred);
516	bool hasBetterLayoutPredecessor(const MachineBasicBlock *BB,
517	const MachineBasicBlock *Succ,
518	const BlockChain &SuccChain,
519	BranchProbability SuccProb,
520	BranchProbability RealSuccProb,
521	const BlockChain &Chain,
522	const BlockFilterSet *BlockFilter);
523	BlockAndTailDupResult selectBestSuccessor(const MachineBasicBlock *BB,
524	const BlockChain &Chain,
525	const BlockFilterSet *BlockFilter);
526	MachineBasicBlock *
527	selectBestCandidateBlock(const BlockChain &Chain,
528	SmallVectorImpl<MachineBasicBlock *> &WorkList);
529	MachineBasicBlock *
530	getFirstUnplacedBlock(const BlockChain &PlacedChain,
531	MachineFunction::iterator &PrevUnplacedBlockIt);
532	MachineBasicBlock *
533	getFirstUnplacedBlock(const BlockChain &PlacedChain,
534	BlockFilterSet::iterator &PrevUnplacedBlockInFilterIt,
535	const BlockFilterSet *BlockFilter);
536
537	/// Add a basic block to the work list if it is appropriate.
538	///
539	/// If the optional parameter BlockFilter is provided, only MBB
540	/// present in the set will be added to the worklist. If nullptr
541	/// is provided, no filtering occurs.
542	void fillWorkLists(const MachineBasicBlock *MBB,
543	SmallPtrSetImpl<BlockChain *> &UpdatedPreds,
544	const BlockFilterSet *BlockFilter);
545
546	void buildChain(const MachineBasicBlock *BB, BlockChain &Chain,
547	BlockFilterSet BlockFilter = nullptr*);
548	bool canMoveBottomBlockToTop(const MachineBasicBlock *BottomBlock,
549	const MachineBasicBlock *OldTop);
550	bool hasViableTopFallthrough(const MachineBasicBlock *Top,
551	const BlockFilterSet &LoopBlockSet);
552	BlockFrequency TopFallThroughFreq(const MachineBasicBlock *Top,
553	const BlockFilterSet &LoopBlockSet);
554	BlockFrequency FallThroughGains(const MachineBasicBlock *NewTop,
555	const MachineBasicBlock *OldTop,
556	const MachineBasicBlock *ExitBB,
557	const BlockFilterSet &LoopBlockSet);
558	MachineBasicBlock findBestLoopTopHelper(MachineBasicBlock OldTop,
559	const MachineLoop &L,
560	const BlockFilterSet &LoopBlockSet);
561	MachineBasicBlock findBestLoopTop(const* MachineLoop &L,
562	const BlockFilterSet &LoopBlockSet);
563	MachineBasicBlock findBestLoopExit(const* MachineLoop &L,
564	const BlockFilterSet &LoopBlockSet,
565	BlockFrequency &ExitFreq);
566	BlockFilterSet collectLoopBlockSet(const MachineLoop &L);
567	void buildLoopChains(const MachineLoop &L);
568	void rotateLoop(BlockChain &LoopChain, const MachineBasicBlock *ExitingBB,
569	BlockFrequency ExitFreq, const BlockFilterSet &LoopBlockSet);
570	void rotateLoopWithProfile(BlockChain &LoopChain, const MachineLoop &L,
571	const BlockFilterSet &LoopBlockSet);
572	void buildCFGChains();
573	void optimizeBranches();
574	void alignBlocks();
575	/// Returns true if a block should be tail-duplicated to increase fallthrough
576	/// opportunities.
577	bool shouldTailDuplicate(MachineBasicBlock *BB);
578	/// Check the edge frequencies to see if tail duplication will increase
579	/// fallthroughs.
580	bool isProfitableToTailDup(const MachineBasicBlock *BB,
581	const MachineBasicBlock *Succ,
582	BranchProbability QProb, const BlockChain &Chain,
583	const BlockFilterSet *BlockFilter);
584
585	/// Check for a trellis layout.
586	bool isTrellis(const MachineBasicBlock *BB,
587	const SmallVectorImpl<MachineBasicBlock *> &ViableSuccs,
588	const BlockChain &Chain, const BlockFilterSet *BlockFilter);
589
590	/// Get the best successor given a trellis layout.
591	BlockAndTailDupResult getBestTrellisSuccessor(
592	const MachineBasicBlock *BB,
593	const SmallVectorImpl<MachineBasicBlock *> &ViableSuccs,
594	BranchProbability AdjustedSumProb, const BlockChain &Chain,
595	const BlockFilterSet *BlockFilter);
596
597	/// Get the best pair of non-conflicting edges.
598	static std::pair<WeightedEdge, WeightedEdge> getBestNonConflictingEdges(
599	const MachineBasicBlock *BB,
600	MutableArrayRef<SmallVector<WeightedEdge, `8`>> Edges);
601
602	/// Returns true if a block can tail duplicate into all unplaced
603	/// predecessors. Filters based on loop.
604	bool canTailDuplicateUnplacedPreds(const MachineBasicBlock *BB,
605	MachineBasicBlock *Succ,
606	const BlockChain &Chain,
607	const BlockFilterSet *BlockFilter);
608
609	/// Find chains of triangles to tail-duplicate where a global analysis works,
610	/// but a local analysis would not find them.
611	void precomputeTriangleChains();
612
613	/// Apply a post-processing step optimizing block placement.
614	void applyExtTsp(bool OptForSize);
615
616	/// Modify the existing block placement in the function and adjust all jumps.
617	void assignBlockOrder(const std::vector<const MachineBasicBlock *> &NewOrder);
618
619	/// Create a single CFG chain from the current block order.
620	void createCFGChainExtTsp();
621
622	public:
623	MachineBlockPlacement(const MachineBranchProbabilityInfo *MBPI,
624	MachineLoopInfo MLI, ProfileSummaryInfo PSI,
625	std::unique_ptr<MBFIWrapper> MBFI,
626	MachinePostDominatorTree MPDT, bool* AllowTailMerge)
627	: MBPI(MBPI), MBFI (std::move(MBFI)), MLI(MLI), MPDT(MPDT), PSI(PSI),
628	AllowTailMerge(AllowTailMerge) {};
629
630	bool run(MachineFunction &F);
631
632	static bool allowTailDupPlacement(MachineFunction &MF) {
633	return TailDupPlacement && !MF.getTarget().requiresStructuredCFG();
634	}
635	};
636
637	class MachineBlockPlacementLegacy : public MachineFunctionPass {
638	public:
639	static char ID; // Pass identification, replacement for typeid
640
641	MachineBlockPlacementLegacy() : MachineFunctionPass (ID) {
642	initializeMachineBlockPlacementLegacyPass(*PassRegistry::getPassRegistry());
643	}
644
645	bool runOnMachineFunction(MachineFunction &MF) override {
646	if (skipFunction(F: MF.getFunction()))
647	return false;
648
649	auto *MBPI =
650	&getAnalysis<MachineBranchProbabilityInfoWrapperPass>().getMBPI();
651	auto MBFI = std::make_unique<MBFIWrapper>(
652	args&: getAnalysis<MachineBlockFrequencyInfoWrapperPass>().getMBFI());
653	auto *MLI = &getAnalysis<MachineLoopInfoWrapperPass>().getLI();
654	auto *MPDT = MachineBlockPlacement::allowTailDupPlacement(MF)
655	? &getAnalysis<MachinePostDominatorTreeWrapperPass>()
656	.getPostDomTree()
657	: nullptr;
658	auto *PSI = &getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI();
659	auto *PassConfig = &getAnalysis<TargetPassConfig>();
660	bool AllowTailMerge = PassConfig->getEnableTailMerge();
661	return MachineBlockPlacement (MBPI, MLI, PSI, std::move(MBFI), MPDT,
662	AllowTailMerge)
663	.run(F&: MF);
664	}
665
666	void getAnalysisUsage(AnalysisUsage &AU) const override {
667	AU.addRequired<MachineBranchProbabilityInfoWrapperPass>();
668	AU.addRequired<MachineBlockFrequencyInfoWrapperPass>();
669	if (TailDupPlacement)
670	AU.addRequired<MachinePostDominatorTreeWrapperPass>();
671	AU.addRequired<MachineLoopInfoWrapperPass>();
672	AU.addRequired<ProfileSummaryInfoWrapperPass>();
673	AU.addRequired<TargetPassConfig>();
674	MachineFunctionPass::getAnalysisUsage(AU);
675	}
676	};
677
678	} // end anonymous namespace
679
680	char MachineBlockPlacementLegacy::ID = `0`;
681
682	char &llvm::MachineBlockPlacementID = MachineBlockPlacementLegacy::ID;
683
684	INITIALIZE_PASS_BEGIN(MachineBlockPlacementLegacy, DEBUG_TYPE,
685	"Branch Probability Basic Block Placement", false, false)
686	INITIALIZE_PASS_DEPENDENCY(MachineBranchProbabilityInfoWrapperPass)
687	INITIALIZE_PASS_DEPENDENCY(MachineBlockFrequencyInfoWrapperPass)
688	INITIALIZE_PASS_DEPENDENCY(MachinePostDominatorTreeWrapperPass)
689	INITIALIZE_PASS_DEPENDENCY(MachineLoopInfoWrapperPass)
690	INITIALIZE_PASS_DEPENDENCY(ProfileSummaryInfoWrapperPass)
691	INITIALIZE_PASS_END(MachineBlockPlacementLegacy, DEBUG_TYPE,
692	"Branch Probability Basic Block Placement", false, false)
693
694	#ifndef NDEBUG
695	/// Helper to print the name of a MBB.
696	///
697	/// Only used by debug logging.
698	static std::string getBlockName(const MachineBasicBlock *BB) {
699	std::string Result;
700	raw_string_ostream OS(Result);
701	OS << printMBBReference(*BB);
702	OS << " ('" << BB->getName() << "')";
703	OS.flush();
704	return Result;
705	}
706	#endif
707
708	/// Mark a chain's successors as having one fewer preds.
709	///
710	/// When a chain is being merged into the "placed" chain, this routine will
711	/// quickly walk the successors of each block in the chain and mark them as
712	/// having one fewer active predecessor. It also adds any successors of this
713	/// chain which reach the zero-predecessor state to the appropriate worklist.
714	void MachineBlockPlacement::markChainSuccessors(
715	const BlockChain &Chain, const MachineBasicBlock *LoopHeaderBB,
716	const BlockFilterSet *BlockFilter) {
717	// Walk all the blocks in this chain, marking their successors as having
718	// a predecessor placed.
719	for (MachineBasicBlock *MBB : Chain) {
720	markBlockSuccessors(Chain, BB: MBB, LoopHeaderBB, BlockFilter);
721	}
722	}
723
724	/// Mark a single block's successors as having one fewer preds.
725	///
726	/// Under normal circumstances, this is only called by markChainSuccessors,
727	/// but if a block that was to be placed is completely tail-duplicated away,
728	/// and was duplicated into the chain end, we need to redo markBlockSuccessors
729	/// for just that block.
730	void MachineBlockPlacement::markBlockSuccessors(
731	const BlockChain &Chain, const MachineBasicBlock *MBB,
732	const MachineBasicBlock LoopHeaderBB, const* BlockFilterSet *BlockFilter) {
733	// Add any successors for which this is the only un-placed in-loop
734	// predecessor to the worklist as a viable candidate for CFG-neutral
735	// placement. No subsequent placement of this block will violate the CFG
736	// shape, so we get to use heuristics to choose a favorable placement.
737	for (MachineBasicBlock *Succ : MBB->successors()) {
738	if (BlockFilter && !BlockFilter->count(key: Succ))
739	continue;
740	BlockChain &SuccChain = *BlockToChain [Succ];
741	// Disregard edges within a fixed chain, or edges to the loop header.
742	if (&Chain == &SuccChain \|\| Succ == LoopHeaderBB)
743	continue;
744
745	// This is a cross-chain edge that is within the loop, so decrement the
746	// loop predecessor count of the destination chain.
747	if (SuccChain.UnscheduledPredecessors == `0` \|\|
748	--SuccChain.UnscheduledPredecessors > `0`)
749	continue;
750
751	auto NewBB = SuccChain.begin();
752	if (NewBB->isEHPad())
753	EHPadWorkList.push_back(Elt: NewBB);
754	else
755	BlockWorkList.push_back(Elt: NewBB);
756	}
757	}
758
759	/// This helper function collects the set of successors of block
760	/// \p BB that are allowed to be its layout successors, and return
761	/// the total branch probability of edges from \p BB to those
762	/// blocks.
763	BranchProbability MachineBlockPlacement::collectViableSuccessors(
764	const MachineBasicBlock BB, const* BlockChain &Chain,
765	const BlockFilterSet *BlockFilter,
766	SmallVector<MachineBasicBlock *, `4`> &Successors) {
767	// Adjust edge probabilities by excluding edges pointing to blocks that is
768	// either not in BlockFilter or is already in the current chain. Consider the
769	// following CFG:
770	//
771	// --->A
772	// \| / \
773	// \| B C
774	// \| \ / \
775	// ----D E
776	//
777	// Assume A->C is very hot (>90%), and C->D has a 50% probability, then after
778	// A->C is chosen as a fall-through, D won't be selected as a successor of C
779	// due to CFG constraint (the probability of C->D is not greater than
780	// HotProb to break topo-order). If we exclude E that is not in BlockFilter
781	// when calculating the probability of C->D, D will be selected and we
782	// will get A C D B as the layout of this loop.
783	auto AdjustedSumProb = BranchProbability::getOne();
784	for (MachineBasicBlock *Succ : BB->successors()) {
785	bool SkipSucc = false;
786	if (Succ->isEHPad() \|\| (BlockFilter && !BlockFilter->count(key: Succ))) {
787	SkipSucc = true;
788	} else {
789	BlockChain *SuccChain = BlockToChain [Succ];
790	if (SuccChain == &Chain) {
791	SkipSucc = true;
792	} else if (Succ != *SuccChain->begin()) {
793	LLVM_DEBUG(dbgs() << " " << getBlockName(Succ)
794	<< " -> Mid chain!\n");
795	continue;
796	}
797	}
798	if (SkipSucc)
799	AdjustedSumProb -= MBPI->getEdgeProbability(Src: BB, Dst: Succ);
800	else
801	Successors.push_back(Elt: Succ);
802	}
803
804	return AdjustedSumProb;
805	}
806
807	/// The helper function returns the branch probability that is adjusted
808	/// or normalized over the new total \p AdjustedSumProb.
809	static BranchProbability
810	getAdjustedProbability(BranchProbability OrigProb,
811	BranchProbability AdjustedSumProb) {
812	BranchProbability SuccProb;
813	uint32_t SuccProbN = OrigProb.getNumerator();
814	uint32_t SuccProbD = AdjustedSumProb.getNumerator();
815	if (SuccProbN >= SuccProbD)
816	SuccProb = BranchProbability::getOne();
817	else
818	SuccProb = BranchProbability (SuccProbN, SuccProbD);
819
820	return SuccProb;
821	}
822
823	/// Check if \p BB has exactly the successors in \p Successors.
824	static bool
825	hasSameSuccessors(MachineBasicBlock &BB,
826	SmallPtrSetImpl<const MachineBasicBlock *> &Successors) {
827	if (BB.succ_size() != Successors.size())
828	return false;
829	// We don't want to count self-loops
830	if (Successors.count(Ptr: &BB))
831	return false;
832	for (MachineBasicBlock *Succ : BB.successors())
833	if (!Successors.count(Ptr: Succ))
834	return false;
835	return true;
836	}
837
838	/// Check if a block should be tail duplicated to increase fallthrough
839	/// opportunities.
840	/// \p BB Block to check.
841	bool MachineBlockPlacement::shouldTailDuplicate(MachineBasicBlock *BB) {
842	// Blocks with single successors don't create additional fallthrough
843	// opportunities. Don't duplicate them. TODO: When conditional exits are
844	// analyzable, allow them to be duplicated.
845	bool IsSimple = TailDup.isSimpleBB(TailBB: BB);
846
847	if (BB->succ_size() == `1`)
848	return false;
849	return TailDup.shouldTailDuplicate(IsSimple, TailBB&: *BB);
850	}
851
852	/// Compare 2 BlockFrequency's with a small penalty for \p A.
853	/// In order to be conservative, we apply a X% penalty to account for
854	/// increased icache pressure and static heuristics. For small frequencies
855	/// we use only the numerators to improve accuracy. For simplicity, we assume
856	/// the penalty is less than 100%
857	/// TODO(iteratee): Use 64-bit fixed point edge frequencies everywhere.
858	static bool greaterWithBias(BlockFrequency A, BlockFrequency B,
859	BlockFrequency EntryFreq) {
860	BranchProbability ThresholdProb(TailDupPlacementPenalty, `100`);
861	BlockFrequency Gain = A - B;
862	return (Gain / ThresholdProb) >= EntryFreq;
863	}
864
865	/// Check the edge frequencies to see if tail duplication will increase
866	/// fallthroughs. It only makes sense to call this function when
867	/// \p Succ would not be chosen otherwise. Tail duplication of \p Succ is
868	/// always locally profitable if we would have picked \p Succ without
869	/// considering duplication.
870	bool MachineBlockPlacement::isProfitableToTailDup(
871	const MachineBasicBlock BB, const* MachineBasicBlock *Succ,
872	BranchProbability QProb, const BlockChain &Chain,
873	const BlockFilterSet *BlockFilter) {
874	// We need to do a probability calculation to make sure this is profitable.
875	// First: does succ have a successor that post-dominates? This affects the
876	// calculation. The 2 relevant cases are:
877	// BB BB
878	// \| \Qout \| \Qout
879	// P\| C \|P C
880	// = C' = C'
881	// \| /Qin \| /Qin
882	// \| / \| /
883	// Succ Succ
884	// / \ \| \ V
885	// U/ =V \|U \
886	// / \ = D
887	// D E \| /
888	// \| /
889	// \|/
890	// PDom
891	// '=' : Branch taken for that CFG edge
892	// In the second case, Placing Succ while duplicating it into C prevents the
893	// fallthrough of Succ into either D or PDom, because they now have C as an
894	// unplaced predecessor
895
896	// Start by figuring out which case we fall into
897	MachineBasicBlock PDom = nullptr*;
898	SmallVector<MachineBasicBlock *, `4`> SuccSuccs;
899	// Only scan the relevant successors
900	auto AdjustedSuccSumProb =
901	collectViableSuccessors(BB: Succ, Chain, BlockFilter, Successors&: SuccSuccs);
902	BranchProbability PProb = MBPI->getEdgeProbability(Src: BB, Dst: Succ);
903	auto BBFreq = MBFI ->getBlockFreq(MBB: BB);
904	auto SuccFreq = MBFI ->getBlockFreq(MBB: Succ);
905	BlockFrequency P = BBFreq * PProb;
906	BlockFrequency Qout = BBFreq * QProb;
907	BlockFrequency EntryFreq = MBFI ->getEntryFreq();
908	// If there are no more successors, it is profitable to copy, as it strictly
909	// increases fallthrough.
910	if (SuccSuccs.size() == `0`)
911	return greaterWithBias(A: P, B: Qout, EntryFreq);
912
913	auto BestSuccSucc = BranchProbability::getZero();
914	// Find the PDom or the best Succ if no PDom exists.
915	for (MachineBasicBlock *SuccSucc : SuccSuccs) {
916	auto Prob = MBPI->getEdgeProbability(Src: Succ, Dst: SuccSucc);
917	if (Prob > BestSuccSucc)
918	BestSuccSucc = Prob;
919	if (PDom == nullptr)
920	if (MPDT->dominates(A: SuccSucc, B: Succ)) {
921	PDom = SuccSucc;
922	break;
923	}
924	}
925	// For the comparisons, we need to know Succ's best incoming edge that isn't
926	// from BB.
927	auto SuccBestPred = BlockFrequency (`0`);
928	for (MachineBasicBlock *SuccPred : Succ->predecessors()) {
929	if (SuccPred == Succ \|\| SuccPred == BB \|\|
930	BlockToChain [SuccPred] == &Chain \|\|
931	(BlockFilter && !BlockFilter->count(key: SuccPred)))
932	continue;
933	auto Freq =
934	MBFI ->getBlockFreq(MBB: SuccPred) * MBPI->getEdgeProbability(Src: SuccPred, Dst: Succ);
935	if (Freq > SuccBestPred)
936	SuccBestPred = Freq;
937	}
938	// Qin is Succ's best unplaced incoming edge that isn't BB
939	BlockFrequency Qin = SuccBestPred;
940	// If it doesn't have a post-dominating successor, here is the calculation:
941	// BB BB
942	// \| \Qout \| \
943	// P\| C \| =
944	// = C' \| C
945	// \| /Qin \| \|
946	// \| / \| C' (+Succ)
947	// Succ Succ /\|
948	// / \ \| \/ \|
949	// U/ =V \| == \|
950	// / \ \| / \\|
951	// D E D E
952	// '=' : Branch taken for that CFG edge
953	// Cost in the first case is: P + V
954	// For this calculation, we always assume P > Qout. If Qout > P
955	// The result of this function will be ignored at the caller.
956	// Let F = SuccFreq - Qin
957	// Cost in the second case is: Qout + min(Qin, F) U + max(Qin, F) * V*
958
959	if (PDom == nullptr \|\| !Succ->isSuccessor(MBB: PDom)) {
960	BranchProbability UProb = BestSuccSucc;
961	BranchProbability VProb = AdjustedSuccSumProb - UProb;
962	BlockFrequency F = SuccFreq - Qin;
963	BlockFrequency V = SuccFreq * VProb;
964	BlockFrequency QinU = std::min(a: Qin, b: F) * UProb;
965	BlockFrequency BaseCost = P + V;
966	BlockFrequency DupCost = Qout + QinU + std::max(a: Qin, b: F) * VProb;
967	return greaterWithBias(A: BaseCost, B: DupCost, EntryFreq);
968	}
969	BranchProbability UProb = MBPI->getEdgeProbability(Src: Succ, Dst: PDom);
970	BranchProbability VProb = AdjustedSuccSumProb - UProb;
971	BlockFrequency U = SuccFreq * UProb;
972	BlockFrequency V = SuccFreq * VProb;
973	BlockFrequency F = SuccFreq - Qin;
974	// If there is a post-dominating successor, here is the calculation:
975	// BB BB BB BB
976	// \| \Qout \| \ \| \Qout \| \
977	// \|P C \| = \|P C \| =
978	// = C' \|P C = C' \|P C
979	// \| /Qin \| \| \| /Qin \| \|
980	// \| / \| C' (+Succ) \| / \| C' (+Succ)
981	// Succ Succ /\| Succ Succ /\|
982	// \| \ V \| \/ \| \| \ V \| \/ \|
983	// \|U \ \|U /\ =? \|U = \|U /\ \|
984	// = D = = =?\| \| D \| = =\|
985	// \| / \|/ D \| / \|/ D
986	// \| / \| / \| = \| /
987	// \|/ \| / \|/ \| =
988	// Dom Dom Dom Dom
989	// '=' : Branch taken for that CFG edge
990	// The cost for taken branches in the first case is P + U
991	// Let F = SuccFreq - Qin
992	// The cost in the second case (assuming independence), given the layout:
993	// BB, Succ, (C+Succ), D, Dom or the layout:
994	// BB, Succ, D, Dom, (C+Succ)
995	// is Qout + max(F, Qin) U + min(F, Qin)*
996	// compare P + U vs Qout + P U + Qin.*
997	//
998	// The 3rd and 4th cases cover when Dom would be chosen to follow Succ.
999	//
1000	// For the 3rd case, the cost is P + 2 V*
1001	// For the 4th case, the cost is Qout + min(Qin, F) U + max(Qin, F) * V + V*
1002	// We choose 4 over 3 when (P + V) > Qout + min(Qin, F) U + max(Qin, F) * V*
1003	if (UProb > AdjustedSuccSumProb / `2` &&
1004	!hasBetterLayoutPredecessor(BB: Succ, Succ: PDom, SuccChain: *BlockToChain [PDom], SuccProb: UProb, RealSuccProb: UProb,
1005	Chain, BlockFilter))
1006	// Cases 3 & 4
1007	return greaterWithBias(
1008	A: (P + V), B: (Qout + std::max(a: Qin, b: F) * VProb + std::min(a: Qin, b: F) * UProb),
1009	EntryFreq);
1010	// Cases 1 & 2
1011	return greaterWithBias(A: (P + U),
1012	B: (Qout + std::min(a: Qin, b: F) * AdjustedSuccSumProb +
1013	std::max(a: Qin, b: F) * UProb),
1014	EntryFreq);
1015	}
1016
1017	/// Check for a trellis layout. \p BB is the upper part of a trellis if its
1018	/// successors form the lower part of a trellis. A successor set S forms the
1019	/// lower part of a trellis if all of the predecessors of S are either in S or
1020	/// have all of S as successors. We ignore trellises where BB doesn't have 2
1021	/// successors because for fewer than 2, it's trivial, and for 3 or greater they
1022	/// are very uncommon and complex to compute optimally. Allowing edges within S
1023	/// is not strictly a trellis, but the same algorithm works, so we allow it.
1024	bool MachineBlockPlacement::isTrellis(
1025	const MachineBasicBlock *BB,
1026	const SmallVectorImpl<MachineBasicBlock *> &ViableSuccs,
1027	const BlockChain &Chain, const BlockFilterSet *BlockFilter) {
1028	// Technically BB could form a trellis with branching factor higher than 2.
1029	// But that's extremely uncommon.
1030	if (BB->succ_size() != `2` \|\| ViableSuccs.size() != `2`)
1031	return false;
1032
1033	SmallPtrSet<const MachineBasicBlock *, `2`> Successors(llvm::from_range,
1034	BB->successors());
1035	// To avoid reviewing the same predecessors twice.
1036	SmallPtrSet<const MachineBasicBlock *, `8`> SeenPreds;
1037
1038	for (MachineBasicBlock *Succ : ViableSuccs) {
1039	// Compile-time optimization: runtime is quadratic in the number of
1040	// predecessors. For such uncommon cases, exit early.
1041	if (Succ->pred_size() > PredecessorLimit)
1042	return false;
1043
1044	int PredCount = `0`;
1045	for (auto *SuccPred : Succ->predecessors()) {
1046	// Allow triangle successors, but don't count them.
1047	if (Successors.count(Ptr: SuccPred)) {
1048	// Make sure that it is actually a triangle.
1049	for (MachineBasicBlock *CheckSucc : SuccPred->successors())
1050	if (!Successors.count(Ptr: CheckSucc))
1051	return false;
1052	continue;
1053	}
1054	const BlockChain *PredChain = BlockToChain [SuccPred];
1055	if (SuccPred == BB \|\| (BlockFilter && !BlockFilter->count(key: SuccPred)) \|\|
1056	PredChain == &Chain \|\| PredChain == BlockToChain [Succ])
1057	continue;
1058	++PredCount;
1059	// Perform the successor check only once.
1060	if (!SeenPreds.insert(Ptr: SuccPred).second)
1061	continue;
1062	if (!hasSameSuccessors(BB&: *SuccPred, Successors))
1063	return false;
1064	}
1065	// If one of the successors has only BB as a predecessor, it is not a
1066	// trellis.
1067	if (PredCount < `1`)
1068	return false;
1069	}
1070	return true;
1071	}
1072
1073	/// Pick the highest total weight pair of edges that can both be laid out.
1074	/// The edges in \p Edges[0] are assumed to have a different destination than
1075	/// the edges in \p Edges[1]. Simple counting shows that the best pair is either
1076	/// the individual highest weight edges to the 2 different destinations, or in
1077	/// case of a conflict, one of them should be replaced with a 2nd best edge.
1078	std::pair<MachineBlockPlacement::WeightedEdge,
1079	MachineBlockPlacement::WeightedEdge>
1080	MachineBlockPlacement::getBestNonConflictingEdges(
1081	const MachineBasicBlock *BB,
1082	MutableArrayRef<SmallVector<MachineBlockPlacement::WeightedEdge, `8`>>
1083	Edges) {
1084	// Sort the edges, and then for each successor, find the best incoming
1085	// predecessor. If the best incoming predecessors aren't the same,
1086	// then that is clearly the best layout. If there is a conflict, one of the
1087	// successors will have to fallthrough from the second best predecessor. We
1088	// compare which combination is better overall.
1089
1090	// Sort for highest frequency.
1091	auto Cmp = [](WeightedEdge A, WeightedEdge B) { return A.Weight > B.Weight; };
1092
1093	llvm::stable_sort(Range&: Edges [`0`], C: Cmp);
1094	llvm::stable_sort(Range&: Edges [`1`], C: Cmp);
1095	auto BestA = Edges [`0`].begin();
1096	auto BestB = Edges [`1`].begin();
1097	// Arrange for the correct answer to be in BestA and BestB
1098	// If the 2 best edges don't conflict, the answer is already there.
1099	if (BestA->Src == BestB->Src) {
1100	// Compare the total fallthrough of (Best + Second Best) for both pairs
1101	auto SecondBestA = std::next(x: BestA);
1102	auto SecondBestB = std::next(x: BestB);
1103	BlockFrequency BestAScore = BestA->Weight + SecondBestB->Weight;
1104	BlockFrequency BestBScore = BestB->Weight + SecondBestA->Weight;
1105	if (BestAScore < BestBScore)
1106	BestA = SecondBestA;
1107	else
1108	BestB = SecondBestB;
1109	}
1110	// Arrange for the BB edge to be in BestA if it exists.
1111	if (BestB->Src == BB)
1112	std::swap(a&: BestA, b&: BestB);
1113	return std::make_pair(x&: BestA, y&: BestB);
1114	}
1115
1116	/// Get the best successor from \p BB based on \p BB being part of a trellis.
1117	/// We only handle trellises with 2 successors, so the algorithm is
1118	/// straightforward: Find the best pair of edges that don't conflict. We find
1119	/// the best incoming edge for each successor in the trellis. If those conflict,
1120	/// we consider which of them should be replaced with the second best.
1121	/// Upon return the two best edges will be in \p BestEdges. If one of the edges
1122	/// comes from \p BB, it will be in \p BestEdges[0]
1123	MachineBlockPlacement::BlockAndTailDupResult
1124	MachineBlockPlacement::getBestTrellisSuccessor(
1125	const MachineBasicBlock *BB,
1126	const SmallVectorImpl<MachineBasicBlock *> &ViableSuccs,
1127	BranchProbability AdjustedSumProb, const BlockChain &Chain,
1128	const BlockFilterSet *BlockFilter) {
1129
1130	BlockAndTailDupResult Result = {.BB: nullptr, .ShouldTailDup: false};
1131	SmallPtrSet<const MachineBasicBlock *, `4`> Successors(llvm::from_range,
1132	BB->successors());
1133
1134	// We assume size 2 because it's common. For general n, we would have to do
1135	// the Hungarian algorithm, but it's not worth the complexity because more
1136	// than 2 successors is fairly uncommon, and a trellis even more so.
1137	if (Successors.size() != `2` \|\| ViableSuccs.size() != `2`)
1138	return Result;
1139
1140	// Collect the edge frequencies of all edges that form the trellis.
1141	SmallVector<WeightedEdge, `8`> Edges[`2`];
1142	int SuccIndex = `0`;
1143	for (auto *Succ : ViableSuccs) {
1144	for (MachineBasicBlock *SuccPred : Succ->predecessors()) {
1145	// Skip any placed predecessors that are not BB
1146	if (SuccPred != BB) {
1147	if (BlockFilter && !BlockFilter->count(key: SuccPred))
1148	continue;
1149	const BlockChain *SuccPredChain = BlockToChain [SuccPred];
1150	if (SuccPredChain == &Chain \|\| SuccPredChain == BlockToChain [Succ])
1151	continue;
1152	}
1153	BlockFrequency EdgeFreq = MBFI ->getBlockFreq(MBB: SuccPred) *
1154	MBPI->getEdgeProbability(Src: SuccPred, Dst: Succ);
1155	Edges[SuccIndex].push_back(Elt: {.Weight: EdgeFreq, .Src: SuccPred, .Dest: Succ});
1156	}
1157	++SuccIndex;
1158	}
1159
1160	// Pick the best combination of 2 edges from all the edges in the trellis.
1161	WeightedEdge BestA, BestB;
1162	std::tie(args&: BestA, args&: BestB) = getBestNonConflictingEdges(BB, Edges);
1163
1164	if (BestA.Src != BB) {
1165	// If we have a trellis, and BB doesn't have the best fallthrough edges,
1166	// we shouldn't choose any successor. We've already looked and there's a
1167	// better fallthrough edge for all the successors.
1168	LLVM_DEBUG(dbgs() << "Trellis, but not one of the chosen edges.\n");
1169	return Result;
1170	}
1171
1172	// Did we pick the triangle edge? If tail-duplication is profitable, do
1173	// that instead. Otherwise merge the triangle edge now while we know it is
1174	// optimal.
1175	if (BestA.Dest == BestB.Src) {
1176	// The edges are BB->Succ1->Succ2, and we're looking to see if BB->Succ2
1177	// would be better.
1178	MachineBasicBlock *Succ1 = BestA.Dest;
1179	MachineBasicBlock *Succ2 = BestB.Dest;
1180	// Check to see if tail-duplication would be profitable.
1181	if (allowTailDupPlacement(MF&: *F) && shouldTailDuplicate(BB: Succ2) &&
1182	canTailDuplicateUnplacedPreds(BB, Succ: Succ2, Chain, BlockFilter) &&
1183	isProfitableToTailDup(BB, Succ: Succ2, QProb: MBPI->getEdgeProbability(Src: BB, Dst: Succ1),
1184	Chain, BlockFilter)) {
1185	LLVM_DEBUG(BranchProbability Succ2Prob = getAdjustedProbability(
1186	MBPI->getEdgeProbability(BB, Succ2), AdjustedSumProb);
1187	dbgs() << " Selected: " << getBlockName(Succ2)
1188	<< ", probability: " << Succ2Prob
1189	<< " (Tail Duplicate)\n");
1190	Result.BB = Succ2;
1191	Result.ShouldTailDup = true;
1192	return Result;
1193	}
1194	}
1195	// We have already computed the optimal edge for the other side of the
1196	// trellis.
1197	ComputedEdges [BestB.Src] = {.BB: BestB.Dest, .ShouldTailDup: false};
1198
1199	auto TrellisSucc = BestA.Dest;
1200	LLVM_DEBUG(BranchProbability SuccProb = getAdjustedProbability(
1201	MBPI->getEdgeProbability(BB, TrellisSucc), AdjustedSumProb);
1202	dbgs() << " Selected: " << getBlockName(TrellisSucc)
1203	<< ", probability: " << SuccProb << " (Trellis)\n");
1204	Result.BB = TrellisSucc;
1205	return Result;
1206	}
1207
1208	/// When the option allowTailDupPlacement() is on, this method checks if the
1209	/// fallthrough candidate block \p Succ (of block \p BB) can be tail-duplicated
1210	/// into all of its unplaced, unfiltered predecessors, that are not BB.
1211	bool MachineBlockPlacement::canTailDuplicateUnplacedPreds(
1212	const MachineBasicBlock BB, MachineBasicBlock Succ,
1213	const BlockChain &Chain, const BlockFilterSet *BlockFilter) {
1214	if (!shouldTailDuplicate(BB: Succ))
1215	return false;
1216
1217	// The result of canTailDuplicate.
1218	bool Duplicate = true;
1219	// Number of possible duplication.
1220	unsigned int NumDup = `0`;
1221
1222	// For CFG checking.
1223	SmallPtrSet<const MachineBasicBlock *, `4`> Successors(llvm::from_range,
1224	BB->successors());
1225	for (MachineBasicBlock *Pred : Succ->predecessors()) {
1226	// Make sure all unplaced and unfiltered predecessors can be
1227	// tail-duplicated into.
1228	// Skip any blocks that are already placed or not in this loop.
1229	if (Pred == BB \|\| (BlockFilter && !BlockFilter->count(key: Pred)) \|\|
1230	(BlockToChain [Pred] == &Chain && !Succ->succ_empty()))
1231	continue;
1232	if (!TailDup.canTailDuplicate(TailBB: Succ, PredBB: Pred)) {
1233	if (Successors.size() > `1` && hasSameSuccessors(BB&: *Pred, Successors))
1234	// This will result in a trellis after tail duplication, so we don't
1235	// need to copy Succ into this predecessor. In the presence
1236	// of a trellis tail duplication can continue to be profitable.
1237	// For example:
1238	// A A
1239	// \|\ \|\
1240	// \| \ \| \
1241	// \| C \| C+BB
1242	// \| / \| \|
1243	// \|/ \| \|
1244	// BB => BB \|
1245	// \|\ \|\/\|
1246	// \| \ \|/\\|
1247	// \| D \| D
1248	// \| / \| /
1249	// \|/ \|/
1250	// Succ Succ
1251	//
1252	// After BB was duplicated into C, the layout looks like the one on the
1253	// right. BB and C now have the same successors. When considering
1254	// whether Succ can be duplicated into all its unplaced predecessors, we
1255	// ignore C.
1256	// We can do this because C already has a profitable fallthrough, namely
1257	// D. TODO(iteratee): ignore sufficiently cold predecessors for
1258	// duplication and for this test.
1259	//
1260	// This allows trellises to be laid out in 2 separate chains
1261	// (A,B,Succ,...) and later (C,D,...) This is a reasonable heuristic
1262	// because it allows the creation of 2 fallthrough paths with links
1263	// between them, and we correctly identify the best layout for these
1264	// CFGs. We want to extend trellises that the user created in addition
1265	// to trellises created by tail-duplication, so we just look for the
1266	// CFG.
1267	continue;
1268	Duplicate = false;
1269	continue;
1270	}
1271	NumDup++;
1272	}
1273
1274	// No possible duplication in current filter set.
1275	if (NumDup == `0`)
1276	return false;
1277
1278	// If profile information is available, findDuplicateCandidates can do more
1279	// precise benefit analysis.
1280	if (F->getFunction().hasProfileData())
1281	return true;
1282
1283	// This is mainly for function exit BB.
1284	// The integrated tail duplication is really designed for increasing
1285	// fallthrough from predecessors from Succ to its successors. We may need
1286	// other machanism to handle different cases.
1287	if (Succ->succ_empty())
1288	return true;
1289
1290	// Plus the already placed predecessor.
1291	NumDup++;
1292
1293	// If the duplication candidate has more unplaced predecessors than
1294	// successors, the extra duplication can't bring more fallthrough.
1295	//
1296	// Pred1 Pred2 Pred3
1297	// \ \| /
1298	// \ \| /
1299	// \ \| /
1300	// Dup
1301	// / \
1302	// / \
1303	// Succ1 Succ2
1304	//
1305	// In this example Dup has 2 successors and 3 predecessors, duplication of Dup
1306	// can increase the fallthrough from Pred1 to Succ1 and from Pred2 to Succ2,
1307	// but the duplication into Pred3 can't increase fallthrough.
1308	//
1309	// A small number of extra duplication may not hurt too much. We need a better
1310	// heuristic to handle it.
1311	if ((NumDup > Succ->succ_size()) \|\| !Duplicate)
1312	return false;
1313
1314	return true;
1315	}
1316
1317	/// Find chains of triangles where we believe it would be profitable to
1318	/// tail-duplicate them all, but a local analysis would not find them.
1319	/// There are 3 ways this can be profitable:
1320	/// 1) The post-dominators marked 50% are actually taken 55% (This shrinks with
1321	/// longer chains)
1322	/// 2) The chains are statically correlated. Branch probabilities have a very
1323	/// U-shaped distribution.
1324	/// [http://nrs.harvard.edu/urn-3:HUL.InstRepos:24015805]
1325	/// If the branches in a chain are likely to be from the same side of the
1326	/// distribution as their predecessor, but are independent at runtime, this
1327	/// transformation is profitable. (Because the cost of being wrong is a small
1328	/// fixed cost, unlike the standard triangle layout where the cost of being
1329	/// wrong scales with the # of triangles.)
1330	/// 3) The chains are dynamically correlated. If the probability that a previous
1331	/// branch was taken positively influences whether the next branch will be
1332	/// taken
1333	/// We believe that 2 and 3 are common enough to justify the small margin in 1.
1334	void MachineBlockPlacement::precomputeTriangleChains() {
1335	struct TriangleChain {
1336	std::vector<MachineBasicBlock *> Edges;
1337
1338	TriangleChain(MachineBasicBlock src, MachineBasicBlock dst)
1339	: Edges ({src, dst}) {}
1340
1341	void append(MachineBasicBlock *dst) {
1342	assert(getKey()->isSuccessor(dst) &&
1343	"Attempting to append a block that is not a successor.");
1344	Edges.push_back(x: dst);
1345	}
1346
1347	unsigned count() const { return Edges.size() - `1`; }
1348
1349	MachineBasicBlock getKey() const* { return Edges.back(); }
1350	};
1351
1352	if (TriangleChainCount == `0`)
1353	return;
1354
1355	LLVM_DEBUG(dbgs() << "Pre-computing triangle chains.\n");
1356	// Map from last block to the chain that contains it. This allows us to extend
1357	// chains as we find new triangles.
1358	DenseMap<const MachineBasicBlock *, TriangleChain> TriangleChainMap;
1359	for (MachineBasicBlock &BB : *F) {
1360	// If BB doesn't have 2 successors, it doesn't start a triangle.
1361	if (BB.succ_size() != `2`)
1362	continue;
1363	MachineBasicBlock PDom = nullptr*;
1364	for (MachineBasicBlock *Succ : BB.successors()) {
1365	if (!MPDT->dominates(A: Succ, B: &BB))
1366	continue;
1367	PDom = Succ;
1368	break;
1369	}
1370	// If BB doesn't have a post-dominating successor, it doesn't form a
1371	// triangle.
1372	if (PDom == nullptr)
1373	continue;
1374	// If PDom has a hint that it is low probability, skip this triangle.
1375	if (MBPI->getEdgeProbability(Src: &BB, Dst: PDom) < BranchProbability (`50`, `100`))
1376	continue;
1377	// If PDom isn't eligible for duplication, this isn't the kind of triangle
1378	// we're looking for.
1379	if (!shouldTailDuplicate(BB: PDom))
1380	continue;
1381	bool CanTailDuplicate = true;
1382	// If PDom can't tail-duplicate into it's non-BB predecessors, then this
1383	// isn't the kind of triangle we're looking for.
1384	for (MachineBasicBlock *Pred : PDom->predecessors()) {
1385	if (Pred == &BB)
1386	continue;
1387	if (!TailDup.canTailDuplicate(TailBB: PDom, PredBB: Pred)) {
1388	CanTailDuplicate = false;
1389	break;
1390	}
1391	}
1392	// If we can't tail-duplicate PDom to its predecessors, then skip this
1393	// triangle.
1394	if (!CanTailDuplicate)
1395	continue;
1396
1397	// Now we have an interesting triangle. Insert it if it's not part of an
1398	// existing chain.
1399	// Note: This cannot be replaced with a call insert() or emplace() because
1400	// the find key is BB, but the insert/emplace key is PDom.
1401	auto Found = TriangleChainMap.find(Val: &BB);
1402	// If it is, remove the chain from the map, grow it, and put it back in the
1403	// map with the end as the new key.
1404	if (Found != TriangleChainMap.end()) {
1405	TriangleChain Chain = std::move(Found ->second);
1406	TriangleChainMap.erase(I: Found);
1407	Chain.append(dst: PDom);
1408	TriangleChainMap.insert(KV: std::make_pair(x: Chain.getKey(), y: std::move(Chain)));
1409	} else {
1410	auto InsertResult = TriangleChainMap.try_emplace(Key: PDom, Args: &BB, Args&: PDom);
1411	assert(InsertResult.second && "Block seen twice.");
1412	(void)InsertResult;
1413	}
1414	}
1415
1416	// Iterating over a DenseMap is safe here, because the only thing in the body
1417	// of the loop is inserting into another DenseMap (ComputedEdges).
1418	// ComputedEdges is never iterated, so this doesn't lead to non-determinism.
1419	for (auto &ChainPair : TriangleChainMap) {
1420	TriangleChain &Chain = ChainPair.second;
1421	// Benchmarking has shown that due to branch correlation duplicating 2 or
1422	// more triangles is profitable, despite the calculations assuming
1423	// independence.
1424	if (Chain.count() < TriangleChainCount)
1425	continue;
1426	MachineBasicBlock *dst = Chain.Edges.back();
1427	Chain.Edges.pop_back();
1428	for (MachineBasicBlock *src : reverse(C&: Chain.Edges)) {
1429	LLVM_DEBUG(dbgs() << "Marking edge: " << getBlockName(src) << "->"
1430	<< getBlockName(dst)
1431	<< " as pre-computed based on triangles.\n");
1432
1433	auto InsertResult = ComputedEdges.insert(KV: {src, {.BB: dst, .ShouldTailDup: true}});
1434	assert(InsertResult.second && "Block seen twice.");
1435	(void)InsertResult;
1436
1437	dst = src;
1438	}
1439	}
1440	}
1441
1442	// When profile is not present, return the StaticLikelyProb.
1443	// When profile is available, we need to handle the triangle-shape CFG.
1444	static BranchProbability
1445	getLayoutSuccessorProbThreshold(const MachineBasicBlock *BB) {
1446	if (!BB->getParent()->getFunction().hasProfileData())
1447	return BranchProbability (StaticLikelyProb, `100`);
1448	if (BB->succ_size() == `2`) {
1449	const MachineBasicBlock Succ1 = BB->succ_begin();
1450	const MachineBasicBlock Succ2 = (BB->succ_begin() + `1`);
1451	if (Succ1->isSuccessor(MBB: Succ2) \|\| Succ2->isSuccessor(MBB: Succ1)) {
1452	/ See case 1 below for the cost analysis. For BB->Succ to*
1453	* be taken with smaller cost, the following needs to hold:
1454	* Prob(BB->Succ) > 2 * Prob(BB->Pred)
1455	* So the threshold T in the calculation below
1456	* (1-T) * Prob(BB->Succ) > T * Prob(BB->Pred)
1457	* So T / (1 - T) = 2, Yielding T = 2/3
1458	* Also adding user specified branch bias, we have
1459	* T = (2/3)*(ProfileLikelyProb/50)
1460	* = (2*ProfileLikelyProb)/150)
1461	*/
1462	return BranchProbability (`2` * ProfileLikelyProb, `150`);
1463	}
1464	}
1465	return BranchProbability (ProfileLikelyProb, `100`);
1466	}
1467
1468	/// Checks to see if the layout candidate block \p Succ has a better layout
1469	/// predecessor than \c BB. If yes, returns true.
1470	/// \p SuccProb: The probability adjusted for only remaining blocks.
1471	/// Only used for logging
1472	/// \p RealSuccProb: The un-adjusted probability.
1473	/// \p Chain: The chain that BB belongs to and Succ is being considered for.
1474	/// \p BlockFilter: if non-null, the set of blocks that make up the loop being
1475	/// considered
1476	bool MachineBlockPlacement::hasBetterLayoutPredecessor(
1477	const MachineBasicBlock BB, const* MachineBasicBlock *Succ,
1478	const BlockChain &SuccChain, BranchProbability SuccProb,
1479	BranchProbability RealSuccProb, const BlockChain &Chain,
1480	const BlockFilterSet *BlockFilter) {
1481
1482	// There isn't a better layout when there are no unscheduled predecessors.
1483	if (SuccChain.UnscheduledPredecessors == `0`)
1484	return false;
1485
1486	// Compile-time optimization: runtime is quadratic in the number of
1487	// predecessors. For such uncommon cases, exit early.
1488	if (Succ->pred_size() > PredecessorLimit)
1489	return false;
1490
1491	// There are two basic scenarios here:
1492	// -------------------------------------
1493	// Case 1: triangular shape CFG (if-then):
1494	// BB
1495	// \| \
1496	// \| \
1497	// \| Pred
1498	// \| /
1499	// Succ
1500	// In this case, we are evaluating whether to select edge -> Succ, e.g.
1501	// set Succ as the layout successor of BB. Picking Succ as BB's
1502	// successor breaks the CFG constraints (FIXME: define these constraints).
1503	// With this layout, Pred BB
1504	// is forced to be outlined, so the overall cost will be cost of the
1505	// branch taken from BB to Pred, plus the cost of back taken branch
1506	// from Pred to Succ, as well as the additional cost associated
1507	// with the needed unconditional jump instruction from Pred To Succ.
1508
1509	// The cost of the topological order layout is the taken branch cost
1510	// from BB to Succ, so to make BB->Succ a viable candidate, the following
1511	// must hold:
1512	// 2 freq(BB->Pred) * taken_branch_cost + unconditional_jump_cost*
1513	// < freq(BB->Succ) taken_branch_cost.*
1514	// Ignoring unconditional jump cost, we get
1515	// freq(BB->Succ) > 2 freq(BB->Pred), i.e.,*
1516	// prob(BB->Succ) > 2 prob(BB->Pred)*
1517	//
1518	// When real profile data is available, we can precisely compute the
1519	// probability threshold that is needed for edge BB->Succ to be considered.
1520	// Without profile data, the heuristic requires the branch bias to be
1521	// a lot larger to make sure the signal is very strong (e.g. 80% default).
1522	// -----------------------------------------------------------------
1523	// Case 2: diamond like CFG (if-then-else):
1524	// S
1525	// / \
1526	// \| \
1527	// BB Pred
1528	// \ /
1529	// Succ
1530	// ..
1531	//
1532	// The current block is BB and edge BB->Succ is now being evaluated.
1533	// Note that edge S->BB was previously already selected because
1534	// prob(S->BB) > prob(S->Pred).
1535	// At this point, 2 blocks can be placed after BB: Pred or Succ. If we
1536	// choose Pred, we will have a topological ordering as shown on the left
1537	// in the picture below. If we choose Succ, we have the solution as shown
1538	// on the right:
1539	//
1540	// topo-order:
1541	//
1542	// S----- ---S
1543	// \| \| \| \|
1544	// ---BB \| \| BB
1545	// \| \| \| \|
1546	// \| Pred-- \| Succ--
1547	// \| \| \| \|
1548	// ---Succ ---Pred--
1549	//
1550	// cost = freq(S->Pred) + freq(BB->Succ) cost = 2 freq (S->Pred)*
1551	// = freq(S->Pred) + freq(S->BB)
1552	//
1553	// If we have profile data (i.e, branch probabilities can be trusted), the
1554	// cost (number of taken branches) with layout S->BB->Succ->Pred is 2 *
1555	// freq(S->Pred) while the cost of topo order is freq(S->Pred) + freq(S->BB).
1556	// We know Prob(S->BB) > Prob(S->Pred), so freq(S->BB) > freq(S->Pred), which
1557	// means the cost of topological order is greater.
1558	// When profile data is not available, however, we need to be more
1559	// conservative. If the branch prediction is wrong, breaking the topo-order
1560	// will actually yield a layout with large cost. For this reason, we need
1561	// strong biased branch at block S with Prob(S->BB) in order to select
1562	// BB->Succ. This is equivalent to looking the CFG backward with backward
1563	// edge: Prob(Succ->BB) needs to >= HotProb in order to be selected (without
1564	// profile data).
1565	// --------------------------------------------------------------------------
1566	// Case 3: forked diamond
1567	// S
1568	// / \
1569	// / \
1570	// BB Pred
1571	// \| \ / \|
1572	// \| \ / \|
1573	// \| X \|
1574	// \| / \ \|
1575	// \| / \ \|
1576	// S1 S2
1577	//
1578	// The current block is BB and edge BB->S1 is now being evaluated.
1579	// As above S->BB was already selected because
1580	// prob(S->BB) > prob(S->Pred). Assume that prob(BB->S1) >= prob(BB->S2).
1581	//
1582	// topo-order:
1583	//
1584	// S-------\| ---S
1585	// \| \| \| \|
1586	// ---BB \| \| BB
1587	// \| \| \| \|
1588	// \| Pred----\| \| S1----
1589	// \| \| \| \|
1590	// --(S1 or S2) ---Pred--
1591	// \|
1592	// S2
1593	//
1594	// topo-cost = freq(S->Pred) + freq(BB->S1) + freq(BB->S2)
1595	// + min(freq(Pred->S1), freq(Pred->S2))
1596	// Non-topo-order cost:
1597	// non-topo-cost = 2 freq(S->Pred) + freq(BB->S2).*
1598	// To be conservative, we can assume that min(freq(Pred->S1), freq(Pred->S2))
1599	// is 0. Then the non topo layout is better when
1600	// freq(S->Pred) < freq(BB->S1).
1601	// This is exactly what is checked below.
1602	// Note there are other shapes that apply (Pred may not be a single block,
1603	// but they all fit this general pattern.)
1604	BranchProbability HotProb = getLayoutSuccessorProbThreshold(BB);
1605
1606	// Make sure that a hot successor doesn't have a globally more
1607	// important predecessor.
1608	BlockFrequency CandidateEdgeFreq = MBFI ->getBlockFreq(MBB: BB) * RealSuccProb;
1609	bool BadCFGConflict = false;
1610
1611	for (MachineBasicBlock *Pred : Succ->predecessors()) {
1612	BlockChain *PredChain = BlockToChain [Pred];
1613	if (Pred == Succ \|\| PredChain == &SuccChain \|\|
1614	(BlockFilter && !BlockFilter->count(key: Pred)) \|\| PredChain == &Chain \|\|
1615	Pred != *std::prev(x: PredChain->end()) \|\|
1616	// This check is redundant except for look ahead. This function is
1617	// called for lookahead by isProfitableToTailDup when BB hasn't been
1618	// placed yet.
1619	(Pred == BB))
1620	continue;
1621	// Do backward checking.
1622	// For all cases above, we need a backward checking to filter out edges that
1623	// are not 'strongly' biased.
1624	// BB Pred
1625	// \ /
1626	// Succ
1627	// We select edge BB->Succ if
1628	// freq(BB->Succ) > freq(Succ) HotProb*
1629	// i.e. freq(BB->Succ) > freq(BB->Succ) * HotProb + freq(Pred->Succ) *
1630	// HotProb
1631	// i.e. freq((BB->Succ) (1 - HotProb) > freq(Pred->Succ) * HotProb*
1632	// Case 1 is covered too, because the first equation reduces to:
1633	// prob(BB->Succ) > HotProb. (freq(Succ) = freq(BB) for a triangle)
1634	BlockFrequency PredEdgeFreq =
1635	MBFI ->getBlockFreq(MBB: Pred) * MBPI->getEdgeProbability(Src: Pred, Dst: Succ);
1636	if (PredEdgeFreq * HotProb >= CandidateEdgeFreq * HotProb.getCompl()) {
1637	BadCFGConflict = true;
1638	break;
1639	}
1640	}
1641
1642	if (BadCFGConflict) {
1643	LLVM_DEBUG(dbgs() << " Not a candidate: " << getBlockName(Succ) << " -> "
1644	<< SuccProb << " (prob) (non-cold CFG conflict)\n");
1645	return true;
1646	}
1647
1648	return false;
1649	}
1650
1651	/// Select the best successor for a block.
1652	///
1653	/// This looks across all successors of a particular block and attempts to
1654	/// select the "best" one to be the layout successor. It only considers direct
1655	/// successors which also pass the block filter. It will attempt to avoid
1656	/// breaking CFG structure, but cave and break such structures in the case of
1657	/// very hot successor edges.
1658	///
1659	/// \returns The best successor block found, or null if none are viable, along
1660	/// with a boolean indicating if tail duplication is necessary.
1661	MachineBlockPlacement::BlockAndTailDupResult
1662	MachineBlockPlacement::selectBestSuccessor(const MachineBasicBlock *BB,
1663	const BlockChain &Chain,
1664	const BlockFilterSet *BlockFilter) {
1665	const BranchProbability HotProb(StaticLikelyProb, `100`);
1666
1667	BlockAndTailDupResult BestSucc = {.BB: nullptr, .ShouldTailDup: false};
1668	auto BestProb = BranchProbability::getZero();
1669
1670	SmallVector<MachineBasicBlock *, `4`> Successors;
1671	auto AdjustedSumProb =
1672	collectViableSuccessors(BB, Chain, BlockFilter, Successors);
1673
1674	LLVM_DEBUG(dbgs() << "Selecting best successor for: " << getBlockName(BB)
1675	<< "\n");
1676
1677	// if we already precomputed the best successor for BB, return that if still
1678	// applicable.
1679	auto FoundEdge = ComputedEdges.find(Val: BB);
1680	if (FoundEdge != ComputedEdges.end()) {
1681	MachineBasicBlock *Succ = FoundEdge ->second.BB;
1682	ComputedEdges.erase(I: FoundEdge);
1683	BlockChain *SuccChain = BlockToChain [Succ];
1684	if (BB->isSuccessor(MBB: Succ) && (!BlockFilter \|\| BlockFilter->count(key: Succ)) &&
1685	SuccChain != &Chain && Succ == *SuccChain->begin())
1686	return FoundEdge ->second;
1687	}
1688
1689	// if BB is part of a trellis, Use the trellis to determine the optimal
1690	// fallthrough edges
1691	if (isTrellis(BB, ViableSuccs: Successors, Chain, BlockFilter))
1692	return getBestTrellisSuccessor(BB, ViableSuccs: Successors, AdjustedSumProb, Chain,
1693	BlockFilter);
1694
1695	// For blocks with CFG violations, we may be able to lay them out anyway with
1696	// tail-duplication. We keep this vector so we can perform the probability
1697	// calculations the minimum number of times.
1698	SmallVector<std::pair<BranchProbability, MachineBasicBlock *>, `4`>
1699	DupCandidates;
1700	for (MachineBasicBlock *Succ : Successors) {
1701	auto RealSuccProb = MBPI->getEdgeProbability(Src: BB, Dst: Succ);
1702	BranchProbability SuccProb =
1703	getAdjustedProbability(OrigProb: RealSuccProb, AdjustedSumProb);
1704
1705	BlockChain &SuccChain = *BlockToChain [Succ];
1706	// Skip the edge \c BB->Succ if block \c Succ has a better layout
1707	// predecessor that yields lower global cost.
1708	if (hasBetterLayoutPredecessor(BB, Succ, SuccChain, SuccProb, RealSuccProb,
1709	Chain, BlockFilter)) {
1710	// If tail duplication would make Succ profitable, place it.
1711	if (allowTailDupPlacement(MF&: *F) && shouldTailDuplicate(BB: Succ))
1712	DupCandidates.emplace_back(Args&: SuccProb, Args&: Succ);
1713	continue;
1714	}
1715
1716	LLVM_DEBUG(
1717	dbgs() << " Candidate: " << getBlockName(Succ)
1718	<< ", probability: " << SuccProb
1719	<< (SuccChain.UnscheduledPredecessors != `0` ? " (CFG break)" : "")
1720	<< "\n");
1721
1722	if (BestSucc.BB && BestProb >= SuccProb) {
1723	LLVM_DEBUG(dbgs() << " Not the best candidate, continuing\n");
1724	continue;
1725	}
1726
1727	LLVM_DEBUG(dbgs() << " Setting it as best candidate\n");
1728	BestSucc.BB = Succ;
1729	BestProb = SuccProb;
1730	}
1731	// Handle the tail duplication candidates in order of decreasing probability.
1732	// Stop at the first one that is profitable. Also stop if they are less
1733	// profitable than BestSucc. Position is important because we preserve it and
1734	// prefer first best match. Here we aren't comparing in order, so we capture
1735	// the position instead.
1736	llvm::stable_sort(Range&: DupCandidates,
1737	C: [](std::tuple<BranchProbability, MachineBasicBlock *> L,
1738	std::tuple<BranchProbability, MachineBasicBlock *> R) {
1739	return std::get<`0`>(t&: L) > std::get<`0`>(t&: R);
1740	});
1741	for (auto &Tup : DupCandidates) {
1742	BranchProbability DupProb;
1743	MachineBasicBlock *Succ;
1744	std::tie(args&: DupProb, args&: Succ) = Tup;
1745	if (DupProb < BestProb)
1746	break;
1747	if (canTailDuplicateUnplacedPreds(BB, Succ, Chain, BlockFilter) &&
1748	(isProfitableToTailDup(BB, Succ, QProb: BestProb, Chain, BlockFilter))) {
1749	LLVM_DEBUG(dbgs() << " Candidate: " << getBlockName(Succ)
1750	<< ", probability: " << DupProb
1751	<< " (Tail Duplicate)\n");
1752	BestSucc.BB = Succ;
1753	BestSucc.ShouldTailDup = true;
1754	break;
1755	}
1756	}
1757
1758	if (BestSucc.BB)
1759	LLVM_DEBUG(dbgs() << " Selected: " << getBlockName(BestSucc.BB) << "\n");
1760
1761	return BestSucc;
1762	}
1763
1764	/// Select the best block from a worklist.
1765	///
1766	/// This looks through the provided worklist as a list of candidate basic
1767	/// blocks and select the most profitable one to place. The definition of
1768	/// profitable only really makes sense in the context of a loop. This returns
1769	/// the most frequently visited block in the worklist, which in the case of
1770	/// a loop, is the one most desirable to be physically close to the rest of the
1771	/// loop body in order to improve i-cache behavior.
1772	///
1773	/// \returns The best block found, or null if none are viable.
1774	MachineBasicBlock *MachineBlockPlacement::selectBestCandidateBlock(
1775	const BlockChain &Chain, SmallVectorImpl<MachineBasicBlock *> &WorkList) {
1776	// Once we need to walk the worklist looking for a candidate, cleanup the
1777	// worklist of already placed entries.
1778	// FIXME: If this shows up on profiles, it could be folded (at the cost of
1779	// some code complexity) into the loop below.
1780	llvm::erase_if(C&: WorkList, P: [&](MachineBasicBlock *BB) {
1781	return BlockToChain.lookup(Val: BB) == &Chain;
1782	});
1783
1784	if (WorkList.empty())
1785	return nullptr;
1786
1787	bool IsEHPad = WorkList [`0`]->isEHPad();
1788
1789	MachineBasicBlock BestBlock = nullptr*;
1790	BlockFrequency BestFreq;
1791	for (MachineBasicBlock *MBB : WorkList) {
1792	assert(MBB->isEHPad() == IsEHPad &&
1793	"EHPad mismatch between block and work list.");
1794
1795	BlockChain &SuccChain = *BlockToChain [MBB];
1796	if (&SuccChain == &Chain)
1797	continue;
1798
1799	assert(SuccChain.UnscheduledPredecessors == `0` &&
1800	"Found CFG-violating block");
1801
1802	BlockFrequency CandidateFreq = MBFI ->getBlockFreq(MBB);
1803	LLVM_DEBUG(dbgs() << " " << getBlockName(MBB) << " -> "
1804	<< printBlockFreq(MBFI->getMBFI(), CandidateFreq)
1805	<< " (freq)\n");
1806
1807	// For ehpad, we layout the least probable first as to avoid jumping back
1808	// from least probable landingpads to more probable ones.
1809	//
1810	// FIXME: Using probability is probably (!) not the best way to achieve
1811	// this. We should probably have a more principled approach to layout
1812	// cleanup code.
1813	//
1814	// The goal is to get:
1815	//
1816	// +--------------------------+
1817	// \| V
1818	// InnerLp -> InnerCleanup OuterLp -> OuterCleanup -> Resume
1819	//
1820	// Rather than:
1821	//
1822	// +-------------------------------------+
1823	// V \|
1824	// OuterLp -> OuterCleanup -> Resume InnerLp -> InnerCleanup
1825	if (BestBlock && (IsEHPad ^ (BestFreq >= CandidateFreq)))
1826	continue;
1827
1828	BestBlock = MBB;
1829	BestFreq = CandidateFreq;
1830	}
1831
1832	return BestBlock;
1833	}
1834
1835	/// Retrieve the first unplaced basic block in the entire function.
1836	///
1837	/// This routine is called when we are unable to use the CFG to walk through
1838	/// all of the basic blocks and form a chain due to unnatural loops in the CFG.
1839	/// We walk through the function's blocks in order, starting from the
1840	/// LastUnplacedBlockIt. We update this iterator on each call to avoid
1841	/// re-scanning the entire sequence on repeated calls to this routine.
1842	MachineBasicBlock *MachineBlockPlacement::getFirstUnplacedBlock(
1843	const BlockChain &PlacedChain,
1844	MachineFunction::iterator &PrevUnplacedBlockIt) {
1845
1846	for (MachineFunction::iterator I = PrevUnplacedBlockIt, E = F->end(); I != E;
1847	++I) {
1848	if (BlockChain Chain = BlockToChain [&I]; Chain != &PlacedChain) {
1849	PrevUnplacedBlockIt = I;
1850	// Now select the head of the chain to which the unplaced block belongs
1851	// as the block to place. This will force the entire chain to be placed,
1852	// and satisfies the requirements of merging chains.
1853	return *Chain->begin();
1854	}
1855	}
1856	return nullptr;
1857	}
1858
1859	/// Retrieve the first unplaced basic block among the blocks in BlockFilter.
1860	///
1861	/// This is similar to getFirstUnplacedBlock for the entire function, but since
1862	/// the size of BlockFilter is typically far less than the number of blocks in
1863	/// the entire function, iterating through the BlockFilter is more efficient.
1864	/// When processing the entire funciton, using the version without BlockFilter
1865	/// has a complexity of #(loops in function) #(blocks in function), while this*
1866	/// version has a complexity of sum(#(loops in block) foreach block in function)
1867	/// which is always smaller. For long function mostly sequential in structure,
1868	/// the complexity is amortized to 1 #(blocks in function).*
1869	MachineBasicBlock *MachineBlockPlacement::getFirstUnplacedBlock(
1870	const BlockChain &PlacedChain,
1871	BlockFilterSet::iterator &PrevUnplacedBlockInFilterIt,
1872	const BlockFilterSet *BlockFilter) {
1873	assert(BlockFilter);
1874	for (; PrevUnplacedBlockInFilterIt != BlockFilter->end();
1875	++PrevUnplacedBlockInFilterIt) {
1876	BlockChain C = BlockToChain [PrevUnplacedBlockInFilterIt];
1877	if (C != &PlacedChain) {
1878	return *C->begin();
1879	}
1880	}
1881	return nullptr;
1882	}
1883
1884	void MachineBlockPlacement::fillWorkLists(
1885	const MachineBasicBlock MBB, SmallPtrSetImpl<BlockChain > &UpdatedPreds,
1886	const BlockFilterSet BlockFilter = nullptr*) {
1887	BlockChain &Chain = *BlockToChain [MBB];
1888	if (!UpdatedPreds.insert(Ptr: &Chain).second)
1889	return;
1890
1891	assert(
1892	Chain.UnscheduledPredecessors == `0` &&
1893	"Attempting to place block with unscheduled predecessors in worklist.");
1894	for (MachineBasicBlock *ChainBB : Chain) {
1895	assert(BlockToChain[ChainBB] == &Chain &&
1896	"Block in chain doesn't match BlockToChain map.");
1897	for (MachineBasicBlock *Pred : ChainBB->predecessors()) {
1898	if (BlockFilter && !BlockFilter->count(key: Pred))
1899	continue;
1900	if (BlockToChain [Pred] == &Chain)
1901	continue;
1902	++Chain.UnscheduledPredecessors;
1903	}
1904	}
1905
1906	if (Chain.UnscheduledPredecessors != `0`)
1907	return;
1908
1909	MachineBasicBlock BB = Chain.begin();
1910	if (BB->isEHPad())
1911	EHPadWorkList.push_back(Elt: BB);
1912	else
1913	BlockWorkList.push_back(Elt: BB);
1914	}
1915
1916	void MachineBlockPlacement::buildChain(const MachineBasicBlock *HeadBB,
1917	BlockChain &Chain,
1918	BlockFilterSet *BlockFilter) {
1919	assert(HeadBB && "BB must not be null.\n");
1920	assert(BlockToChain[HeadBB] == &Chain && "BlockToChainMap mis-match.\n");
1921	MachineFunction::iterator PrevUnplacedBlockIt = F->begin();
1922	BlockFilterSet::iterator PrevUnplacedBlockInFilterIt;
1923	if (BlockFilter)
1924	PrevUnplacedBlockInFilterIt = BlockFilter->begin();
1925
1926	const MachineBasicBlock *LoopHeaderBB = HeadBB;
1927	markChainSuccessors(Chain, LoopHeaderBB, BlockFilter);
1928	MachineBasicBlock BB = std::prev(x: Chain.end());
1929	while (true) {
1930	assert(BB && "null block found at end of chain in loop.");
1931	assert(BlockToChain[BB] == &Chain && "BlockToChainMap mis-match in loop.");
1932	assert(*std::prev(Chain.end()) == BB && "BB Not found at end of chain.");
1933
1934	// Look for the best viable successor if there is one to place immediately
1935	// after this block.
1936	auto Result = selectBestSuccessor(BB, Chain, BlockFilter);
1937	MachineBasicBlock *BestSucc = Result.BB;
1938	bool ShouldTailDup = Result.ShouldTailDup;
1939	if (allowTailDupPlacement(MF&: *F))
1940	ShouldTailDup \|= (BestSucc && canTailDuplicateUnplacedPreds(
1941	BB, Succ: BestSucc, Chain, BlockFilter));
1942
1943	// If an immediate successor isn't available, look for the best viable
1944	// block among those we've identified as not violating the loop's CFG at
1945	// this point. This won't be a fallthrough, but it will increase locality.
1946	if (!BestSucc)
1947	BestSucc = selectBestCandidateBlock(Chain, WorkList&: BlockWorkList);
1948	if (!BestSucc)
1949	BestSucc = selectBestCandidateBlock(Chain, WorkList&: EHPadWorkList);
1950
1951	if (!BestSucc) {
1952	if (BlockFilter)
1953	BestSucc = getFirstUnplacedBlock(PlacedChain: Chain, PrevUnplacedBlockInFilterIt,
1954	BlockFilter);
1955	else
1956	BestSucc = getFirstUnplacedBlock(PlacedChain: Chain, PrevUnplacedBlockIt);
1957	if (!BestSucc)
1958	break;
1959
1960	LLVM_DEBUG(dbgs() << "Unnatural loop CFG detected, forcibly merging the "
1961	"layout successor until the CFG reduces\n");
1962	}
1963
1964	// Placement may have changed tail duplication opportunities.
1965	// Check for that now.
1966	if (allowTailDupPlacement(MF&: *F) && BestSucc && ShouldTailDup) {
1967	repeatedlyTailDuplicateBlock(BB: BestSucc, LPred&: BB, LoopHeaderBB, Chain,
1968	BlockFilter, PrevUnplacedBlockIt,
1969	PrevUnplacedBlockInFilterIt);
1970	// If the chosen successor was duplicated into BB, don't bother laying
1971	// it out, just go round the loop again with BB as the chain end.
1972	if (!BB->isSuccessor(MBB: BestSucc))
1973	continue;
1974	}
1975
1976	// Place this block, updating the datastructures to reflect its placement.
1977	BlockChain &SuccChain = *BlockToChain [BestSucc];
1978	// Zero out UnscheduledPredecessors for the successor we're about to merge
1979	// in case we selected a successor that didn't fit naturally into the CFG.
1980	SuccChain.UnscheduledPredecessors = `0`;
1981	LLVM_DEBUG(dbgs() << "Merging from " << getBlockName(BB) << " to "
1982	<< getBlockName(BestSucc) << "\n");
1983	markChainSuccessors(Chain: SuccChain, LoopHeaderBB, BlockFilter);
1984	Chain.merge(BB: BestSucc, Chain: &SuccChain);
1985	BB = *std::prev(x: Chain.end());
1986	}
1987
1988	LLVM_DEBUG(dbgs() << "Finished forming chain for header block "
1989	<< getBlockName(*Chain.begin()) << "\n");
1990	}
1991
1992	// If bottom of block BB has only one successor OldTop, in most cases it is
1993	// profitable to move it before OldTop, except the following case:
1994	//
1995	// -->OldTop<-
1996	// \| . \|
1997	// \| . \|
1998	// \| . \|
1999	// ---Pred \|
2000	// \| \|
2001	// BB-----
2002	//
2003	// If BB is moved before OldTop, Pred needs a taken branch to BB, and it can't
2004	// layout the other successor below it, so it can't reduce taken branch.
2005	// In this case we keep its original layout.
2006	bool MachineBlockPlacement::canMoveBottomBlockToTop(
2007	const MachineBasicBlock BottomBlock, const* MachineBasicBlock *OldTop) {
2008	if (BottomBlock->pred_size() != `1`)
2009	return true;
2010	MachineBasicBlock Pred = BottomBlock->pred_begin();
2011	if (Pred->succ_size() != `2`)
2012	return true;
2013
2014	MachineBasicBlock OtherBB = Pred->succ_begin();
2015	if (OtherBB == BottomBlock)
2016	OtherBB = *Pred->succ_rbegin();
2017	if (OtherBB == OldTop)
2018	return false;
2019
2020	return true;
2021	}
2022
2023	// Find out the possible fall through frequence to the top of a loop.
2024	BlockFrequency
2025	MachineBlockPlacement::TopFallThroughFreq(const MachineBasicBlock *Top,
2026	const BlockFilterSet &LoopBlockSet) {
2027	BlockFrequency MaxFreq = BlockFrequency (`0`);
2028	for (MachineBasicBlock *Pred : Top->predecessors()) {
2029	BlockChain *PredChain = BlockToChain [Pred];
2030	if (!LoopBlockSet.count(key: Pred) &&
2031	(!PredChain \|\| Pred == *std::prev(x: PredChain->end()))) {
2032	// Found a Pred block can be placed before Top.
2033	// Check if Top is the best successor of Pred.
2034	auto TopProb = MBPI->getEdgeProbability(Src: Pred, Dst: Top);
2035	bool TopOK = true;
2036	for (MachineBasicBlock *Succ : Pred->successors()) {
2037	auto SuccProb = MBPI->getEdgeProbability(Src: Pred, Dst: Succ);
2038	BlockChain *SuccChain = BlockToChain [Succ];
2039	// Check if Succ can be placed after Pred.
2040	// Succ should not be in any chain, or it is the head of some chain.
2041	if (!LoopBlockSet.count(key: Succ) && (SuccProb > TopProb) &&
2042	(!SuccChain \|\| Succ == *SuccChain->begin())) {
2043	TopOK = false;
2044	break;
2045	}
2046	}
2047	if (TopOK) {
2048	BlockFrequency EdgeFreq =
2049	MBFI ->getBlockFreq(MBB: Pred) * MBPI->getEdgeProbability(Src: Pred, Dst: Top);
2050	if (EdgeFreq > MaxFreq)
2051	MaxFreq = EdgeFreq;
2052	}
2053	}
2054	}
2055	return MaxFreq;
2056	}
2057
2058	// Compute the fall through gains when move NewTop before OldTop.
2059	//
2060	// In following diagram, edges marked as "-" are reduced fallthrough, edges
2061	// marked as "+" are increased fallthrough, this function computes
2062	//
2063	// SUM(increased fallthrough) - SUM(decreased fallthrough)
2064	//
2065	// \|
2066	// \| -
2067	// V
2068	// --->OldTop
2069	// \| .
2070	// \| .
2071	// +\| . +
2072	// \| Pred --->
2073	// \| \|-
2074	// \| V
2075	// --- NewTop <---
2076	// \|-
2077	// V
2078	//
2079	BlockFrequency MachineBlockPlacement::FallThroughGains(
2080	const MachineBasicBlock NewTop, const* MachineBasicBlock *OldTop,
2081	const MachineBasicBlock ExitBB, const* BlockFilterSet &LoopBlockSet) {
2082	BlockFrequency FallThrough2Top = TopFallThroughFreq(Top: OldTop, LoopBlockSet);
2083	BlockFrequency FallThrough2Exit = BlockFrequency (`0`);
2084	if (ExitBB)
2085	FallThrough2Exit =
2086	MBFI ->getBlockFreq(MBB: NewTop) * MBPI->getEdgeProbability(Src: NewTop, Dst: ExitBB);
2087	BlockFrequency BackEdgeFreq =
2088	MBFI ->getBlockFreq(MBB: NewTop) * MBPI->getEdgeProbability(Src: NewTop, Dst: OldTop);
2089
2090	// Find the best Pred of NewTop.
2091	MachineBasicBlock BestPred = nullptr*;
2092	BlockFrequency FallThroughFromPred = BlockFrequency (`0`);
2093	for (MachineBasicBlock *Pred : NewTop->predecessors()) {
2094	if (!LoopBlockSet.count(key: Pred))
2095	continue;
2096	BlockChain *PredChain = BlockToChain [Pred];
2097	if (!PredChain \|\| Pred == *std::prev(x: PredChain->end())) {
2098	BlockFrequency EdgeFreq =
2099	MBFI ->getBlockFreq(MBB: Pred) * MBPI->getEdgeProbability(Src: Pred, Dst: NewTop);
2100	if (EdgeFreq > FallThroughFromPred) {
2101	FallThroughFromPred = EdgeFreq;
2102	BestPred = Pred;
2103	}
2104	}
2105	}
2106
2107	// If NewTop is not placed after Pred, another successor can be placed
2108	// after Pred.
2109	BlockFrequency NewFreq = BlockFrequency (`0`);
2110	if (BestPred) {
2111	for (MachineBasicBlock *Succ : BestPred->successors()) {
2112	if ((Succ == NewTop) \|\| (Succ == BestPred) \|\| !LoopBlockSet.count(key: Succ))
2113	continue;
2114	if (ComputedEdges.contains(Val: Succ))
2115	continue;
2116	BlockChain *SuccChain = BlockToChain [Succ];
2117	if ((SuccChain && (Succ != *SuccChain->begin())) \|\|
2118	(SuccChain == BlockToChain [BestPred]))
2119	continue;
2120	BlockFrequency EdgeFreq = MBFI ->getBlockFreq(MBB: BestPred) *
2121	MBPI->getEdgeProbability(Src: BestPred, Dst: Succ);
2122	if (EdgeFreq > NewFreq)
2123	NewFreq = EdgeFreq;
2124	}
2125	BlockFrequency OrigEdgeFreq = MBFI ->getBlockFreq(MBB: BestPred) *
2126	MBPI->getEdgeProbability(Src: BestPred, Dst: NewTop);
2127	if (NewFreq > OrigEdgeFreq) {
2128	// If NewTop is not the best successor of Pred, then Pred doesn't
2129	// fallthrough to NewTop. So there is no FallThroughFromPred and
2130	// NewFreq.
2131	NewFreq = BlockFrequency (`0`);
2132	FallThroughFromPred = BlockFrequency (`0`);
2133	}
2134	}
2135
2136	BlockFrequency Result = BlockFrequency (`0`);
2137	BlockFrequency Gains = BackEdgeFreq + NewFreq;
2138	BlockFrequency Lost =
2139	FallThrough2Top + FallThrough2Exit + FallThroughFromPred;
2140	if (Gains > Lost)
2141	Result = Gains - Lost;
2142	return Result;
2143	}
2144
2145	/// Helper function of findBestLoopTop. Find the best loop top block
2146	/// from predecessors of old top.
2147	///
2148	/// Look for a block which is strictly better than the old top for laying
2149	/// out before the old top of the loop. This looks for only two patterns:
2150	///
2151	/// 1. a block has only one successor, the old loop top
2152	///
2153	/// Because such a block will always result in an unconditional jump,
2154	/// rotating it in front of the old top is always profitable.
2155	///
2156	/// 2. a block has two successors, one is old top, another is exit
2157	/// and it has more than one predecessors
2158	///
2159	/// If it is below one of its predecessors P, only P can fall through to
2160	/// it, all other predecessors need a jump to it, and another conditional
2161	/// jump to loop header. If it is moved before loop header, all its
2162	/// predecessors jump to it, then fall through to loop header. So all its
2163	/// predecessors except P can reduce one taken branch.
2164	/// At the same time, move it before old top increases the taken branch
2165	/// to loop exit block, so the reduced taken branch will be compared with
2166	/// the increased taken branch to the loop exit block.
2167	MachineBasicBlock *MachineBlockPlacement::findBestLoopTopHelper(
2168	MachineBasicBlock OldTop, const* MachineLoop &L,
2169	const BlockFilterSet &LoopBlockSet) {
2170	// Check that the header hasn't been fused with a preheader block due to
2171	// crazy branches. If it has, we need to start with the header at the top to
2172	// prevent pulling the preheader into the loop body.
2173	BlockChain &HeaderChain = *BlockToChain [OldTop];
2174	if (!LoopBlockSet.count(key: *HeaderChain.begin()))
2175	return OldTop;
2176	if (OldTop != *HeaderChain.begin())
2177	return OldTop;
2178
2179	LLVM_DEBUG(dbgs() << "Finding best loop top for: " << getBlockName(OldTop)
2180	<< "\n");
2181
2182	BlockFrequency BestGains = BlockFrequency (`0`);
2183	MachineBasicBlock BestPred = nullptr*;
2184	for (MachineBasicBlock *Pred : OldTop->predecessors()) {
2185	if (!LoopBlockSet.count(key: Pred))
2186	continue;
2187	if (Pred == L.getHeader())
2188	continue;
2189	LLVM_DEBUG(dbgs() << " old top pred: " << getBlockName(Pred) << ", has "
2190	<< Pred->succ_size() << " successors, "
2191	<< printBlockFreq(MBFI->getMBFI(), *Pred) << " freq\n");
2192	if (Pred->succ_size() > `2`)
2193	continue;
2194
2195	MachineBasicBlock OtherBB = nullptr*;
2196	if (Pred->succ_size() == `2`) {
2197	OtherBB = *Pred->succ_begin();
2198	if (OtherBB == OldTop)
2199	OtherBB = *Pred->succ_rbegin();
2200	}
2201
2202	if (!canMoveBottomBlockToTop(BottomBlock: Pred, OldTop))
2203	continue;
2204
2205	BlockFrequency Gains =
2206	FallThroughGains(NewTop: Pred, OldTop, ExitBB: OtherBB, LoopBlockSet);
2207	if ((Gains > BlockFrequency (`0`)) &&
2208	(Gains > BestGains \|\|
2209	((Gains == BestGains) && Pred->isLayoutSuccessor(MBB: OldTop)))) {
2210	BestPred = Pred;
2211	BestGains = Gains;
2212	}
2213	}
2214
2215	// If no direct predecessor is fine, just use the loop header.
2216	if (!BestPred) {
2217	LLVM_DEBUG(dbgs() << " final top unchanged\n");
2218	return OldTop;
2219	}
2220
2221	// Walk backwards through any straight line of predecessors.
2222	while (BestPred->pred_size() == `1` &&
2223	(*BestPred->pred_begin())->succ_size() == `1` &&
2224	*BestPred->pred_begin() != L.getHeader())
2225	BestPred = *BestPred->pred_begin();
2226
2227	LLVM_DEBUG(dbgs() << " final top: " << getBlockName(BestPred) << "\n");
2228	return BestPred;
2229	}
2230
2231	/// Find the best loop top block for layout.
2232	///
2233	/// This function iteratively calls findBestLoopTopHelper, until no new better
2234	/// BB can be found.
2235	MachineBasicBlock *
2236	MachineBlockPlacement::findBestLoopTop(const MachineLoop &L,
2237	const BlockFilterSet &LoopBlockSet) {
2238	// Placing the latch block before the header may introduce an extra branch
2239	// that skips this block the first time the loop is executed, which we want
2240	// to avoid when optimising for size.
2241	// FIXME: in theory there is a case that does not introduce a new branch,
2242	// i.e. when the layout predecessor does not fallthrough to the loop header.
2243	// In practice this never happens though: there always seems to be a preheader
2244	// that can fallthrough and that is also placed before the header.
2245	if (llvm::shouldOptimizeForSize(MBB: L.getHeader(), PSI, MBFIWrapper: MBFI.get()))
2246	return L.getHeader();
2247
2248	MachineBasicBlock OldTop = nullptr*;
2249	MachineBasicBlock *NewTop = L.getHeader();
2250	while (NewTop != OldTop) {
2251	OldTop = NewTop;
2252	NewTop = findBestLoopTopHelper(OldTop, L, LoopBlockSet);
2253	if (NewTop != OldTop)
2254	ComputedEdges [NewTop] = {.BB: OldTop, .ShouldTailDup: false};
2255	}
2256	return NewTop;
2257	}
2258
2259	/// Find the best loop exiting block for layout.
2260	///
2261	/// This routine implements the logic to analyze the loop looking for the best
2262	/// block to layout at the top of the loop. Typically this is done to maximize
2263	/// fallthrough opportunities.
2264	MachineBasicBlock *
2265	MachineBlockPlacement::findBestLoopExit(const MachineLoop &L,
2266	const BlockFilterSet &LoopBlockSet,
2267	BlockFrequency &ExitFreq) {
2268	// We don't want to layout the loop linearly in all cases. If the loop header
2269	// is just a normal basic block in the loop, we want to look for what block
2270	// within the loop is the best one to layout at the top. However, if the loop
2271	// header has be pre-merged into a chain due to predecessors not having
2272	// analyzable branches, and* the predecessor it is merged with is not part*
2273	// of the loop, rotating the header into the middle of the loop will create
2274	// a non-contiguous range of blocks which is Very Bad. So start with the
2275	// header and only rotate if safe.
2276	BlockChain &HeaderChain = *BlockToChain [L.getHeader()];
2277	if (!LoopBlockSet.count(key: *HeaderChain.begin()))
2278	return nullptr;
2279
2280	BlockFrequency BestExitEdgeFreq;
2281	unsigned BestExitLoopDepth = `0`;
2282	MachineBasicBlock ExitingBB = nullptr*;
2283	// If there are exits to outer loops, loop rotation can severely limit
2284	// fallthrough opportunities unless it selects such an exit. Keep a set of
2285	// blocks where rotating to exit with that block will reach an outer loop.
2286	SmallPtrSet<MachineBasicBlock *, `4`> BlocksExitingToOuterLoop;
2287
2288	LLVM_DEBUG(dbgs() << "Finding best loop exit for: "
2289	<< getBlockName(L.getHeader()) << "\n");
2290	for (MachineBasicBlock *MBB : L.getBlocks()) {
2291	BlockChain &Chain = *BlockToChain [MBB];
2292	// Ensure that this block is at the end of a chain; otherwise it could be
2293	// mid-way through an inner loop or a successor of an unanalyzable branch.
2294	if (MBB != *std::prev(x: Chain.end()))
2295	continue;
2296
2297	// Now walk the successors. We need to establish whether this has a viable
2298	// exiting successor and whether it has a viable non-exiting successor.
2299	// We store the old exiting state and restore it if a viable looping
2300	// successor isn't found.
2301	MachineBasicBlock *OldExitingBB = ExitingBB;
2302	BlockFrequency OldBestExitEdgeFreq = BestExitEdgeFreq;
2303	bool HasLoopingSucc = false;
2304	for (MachineBasicBlock *Succ : MBB->successors()) {
2305	if (Succ->isEHPad())
2306	continue;
2307	if (Succ == MBB)
2308	continue;
2309	BlockChain &SuccChain = *BlockToChain [Succ];
2310	// Don't split chains, either this chain or the successor's chain.
2311	if (&Chain == &SuccChain) {
2312	LLVM_DEBUG(dbgs() << " exiting: " << getBlockName(MBB) << " -> "
2313	<< getBlockName(Succ) << " (chain conflict)\n");
2314	continue;
2315	}
2316
2317	auto SuccProb = MBPI->getEdgeProbability(Src: MBB, Dst: Succ);
2318	if (LoopBlockSet.count(key: Succ)) {
2319	LLVM_DEBUG(dbgs() << " looping: " << getBlockName(MBB) << " -> "
2320	<< getBlockName(Succ) << " (" << SuccProb << ")\n");
2321	HasLoopingSucc = true;
2322	continue;
2323	}
2324
2325	unsigned SuccLoopDepth = `0`;
2326	if (MachineLoop *ExitLoop = MLI->getLoopFor(BB: Succ)) {
2327	SuccLoopDepth = ExitLoop->getLoopDepth();
2328	if (ExitLoop->contains(L: &L))
2329	BlocksExitingToOuterLoop.insert(Ptr: MBB);
2330	}
2331
2332	BlockFrequency ExitEdgeFreq = MBFI ->getBlockFreq(MBB) * SuccProb;
2333	LLVM_DEBUG(
2334	dbgs() << " exiting: " << getBlockName(MBB) << " -> "
2335	<< getBlockName(Succ) << " [L:" << SuccLoopDepth << "] ("
2336	<< printBlockFreq(MBFI->getMBFI(), ExitEdgeFreq) << ")\n");
2337	// Note that we bias this toward an existing layout successor to retain
2338	// incoming order in the absence of better information. The exit must have
2339	// a frequency higher than the current exit before we consider breaking
2340	// the layout.
2341	BranchProbability Bias(`100` - ExitBlockBias, `100`);
2342	if (!ExitingBB \|\| SuccLoopDepth > BestExitLoopDepth \|\|
2343	ExitEdgeFreq > BestExitEdgeFreq \|\|
2344	(MBB->isLayoutSuccessor(MBB: Succ) &&
2345	!(ExitEdgeFreq < BestExitEdgeFreq * Bias))) {
2346	BestExitEdgeFreq = ExitEdgeFreq;
2347	ExitingBB = MBB;
2348	}
2349	}
2350
2351	if (!HasLoopingSucc) {
2352	// Restore the old exiting state, no viable looping successor was found.
2353	ExitingBB = OldExitingBB;
2354	BestExitEdgeFreq = OldBestExitEdgeFreq;
2355	}
2356	}
2357	// Without a candidate exiting block or with only a single block in the
2358	// loop, just use the loop header to layout the loop.
2359	if (!ExitingBB) {
2360	LLVM_DEBUG(
2361	dbgs() << " No other candidate exit blocks, using loop header\n");
2362	return nullptr;
2363	}
2364	if (L.getNumBlocks() == `1`) {
2365	LLVM_DEBUG(dbgs() << " Loop has 1 block, using loop header as exit\n");
2366	return nullptr;
2367	}
2368
2369	// Also, if we have exit blocks which lead to outer loops but didn't select
2370	// one of them as the exiting block we are rotating toward, disable loop
2371	// rotation altogether.
2372	if (!BlocksExitingToOuterLoop.empty() &&
2373	!BlocksExitingToOuterLoop.count(Ptr: ExitingBB))
2374	return nullptr;
2375
2376	LLVM_DEBUG(dbgs() << " Best exiting block: " << getBlockName(ExitingBB)
2377	<< "\n");
2378	ExitFreq = BestExitEdgeFreq;
2379	return ExitingBB;
2380	}
2381
2382	/// Check if there is a fallthrough to loop header Top.
2383	///
2384	/// 1. Look for a Pred that can be layout before Top.
2385	/// 2. Check if Top is the most possible successor of Pred.
2386	bool MachineBlockPlacement::hasViableTopFallthrough(
2387	const MachineBasicBlock Top, const* BlockFilterSet &LoopBlockSet) {
2388	for (MachineBasicBlock *Pred : Top->predecessors()) {
2389	BlockChain *PredChain = BlockToChain [Pred];
2390	if (!LoopBlockSet.count(key: Pred) &&
2391	(!PredChain \|\| Pred == *std::prev(x: PredChain->end()))) {
2392	// Found a Pred block can be placed before Top.
2393	// Check if Top is the best successor of Pred.
2394	auto TopProb = MBPI->getEdgeProbability(Src: Pred, Dst: Top);
2395	bool TopOK = true;
2396	for (MachineBasicBlock *Succ : Pred->successors()) {
2397	auto SuccProb = MBPI->getEdgeProbability(Src: Pred, Dst: Succ);
2398	BlockChain *SuccChain = BlockToChain [Succ];
2399	// Check if Succ can be placed after Pred.
2400	// Succ should not be in any chain, or it is the head of some chain.
2401	if ((!SuccChain \|\| Succ == *SuccChain->begin()) && SuccProb > TopProb) {
2402	TopOK = false;
2403	break;
2404	}
2405	}
2406	if (TopOK)
2407	return true;
2408	}
2409	}
2410	return false;
2411	}
2412
2413	/// Attempt to rotate an exiting block to the bottom of the loop.
2414	///
2415	/// Once we have built a chain, try to rotate it to line up the hot exit block
2416	/// with fallthrough out of the loop if doing so doesn't introduce unnecessary
2417	/// branches. For example, if the loop has fallthrough into its header and out
2418	/// of its bottom already, don't rotate it.
2419	void MachineBlockPlacement::rotateLoop(BlockChain &LoopChain,
2420	const MachineBasicBlock *ExitingBB,
2421	BlockFrequency ExitFreq,
2422	const BlockFilterSet &LoopBlockSet) {
2423	if (!ExitingBB)
2424	return;
2425
2426	MachineBasicBlock Top = LoopChain.begin();
2427	MachineBasicBlock Bottom = std::prev(x: LoopChain.end());
2428
2429	// If ExitingBB is already the last one in a chain then nothing to do.
2430	if (Bottom == ExitingBB)
2431	return;
2432
2433	// The entry block should always be the first BB in a function.
2434	if (Top->isEntryBlock())
2435	return;
2436
2437	bool ViableTopFallthrough = hasViableTopFallthrough(Top, LoopBlockSet);
2438
2439	// If the header has viable fallthrough, check whether the current loop
2440	// bottom is a viable exiting block. If so, bail out as rotating will
2441	// introduce an unnecessary branch.
2442	if (ViableTopFallthrough) {
2443	for (MachineBasicBlock *Succ : Bottom->successors()) {
2444	BlockChain *SuccChain = BlockToChain [Succ];
2445	if (!LoopBlockSet.count(key: Succ) &&
2446	(!SuccChain \|\| Succ == *SuccChain->begin()))
2447	return;
2448	}
2449
2450	// Rotate will destroy the top fallthrough, we need to ensure the new exit
2451	// frequency is larger than top fallthrough.
2452	BlockFrequency FallThrough2Top = TopFallThroughFreq(Top, LoopBlockSet);
2453	if (FallThrough2Top >= ExitFreq)
2454	return;
2455	}
2456
2457	BlockChain::iterator ExitIt = llvm::find(Range&: LoopChain, Val: ExitingBB);
2458	if (ExitIt == LoopChain.end())
2459	return;
2460
2461	// Rotating a loop exit to the bottom when there is a fallthrough to top
2462	// trades the entry fallthrough for an exit fallthrough.
2463	// If there is no bottom->top edge, but the chosen exit block does have
2464	// a fallthrough, we break that fallthrough for nothing in return.
2465
2466	// Let's consider an example. We have a built chain of basic blocks
2467	// B1, B2, ..., Bn, where Bk is a ExitingBB - chosen exit block.
2468	// By doing a rotation we get
2469	// Bk+1, ..., Bn, B1, ..., Bk
2470	// Break of fallthrough to B1 is compensated by a fallthrough from Bk.
2471	// If we had a fallthrough Bk -> Bk+1 it is broken now.
2472	// It might be compensated by fallthrough Bn -> B1.
2473	// So we have a condition to avoid creation of extra branch by loop rotation.
2474	// All below must be true to avoid loop rotation:
2475	// If there is a fallthrough to top (B1)
2476	// There was fallthrough from chosen exit block (Bk) to next one (Bk+1)
2477	// There is no fallthrough from bottom (Bn) to top (B1).
2478	// Please note that there is no exit fallthrough from Bn because we checked it
2479	// above.
2480	if (ViableTopFallthrough) {
2481	assert(std::next(ExitIt) != LoopChain.end() &&
2482	"Exit should not be last BB");
2483	MachineBasicBlock NextBlockInChain = std::next(x: ExitIt);
2484	if (ExitingBB->isSuccessor(MBB: NextBlockInChain))
2485	if (!Bottom->isSuccessor(MBB: Top))
2486	return;
2487	}
2488
2489	LLVM_DEBUG(dbgs() << "Rotating loop to put exit " << getBlockName(ExitingBB)
2490	<< " at bottom\n");
2491	std::rotate(first: LoopChain.begin(), middle: std::next(x: ExitIt), last: LoopChain.end());
2492	}
2493
2494	/// Attempt to rotate a loop based on profile data to reduce branch cost.
2495	///
2496	/// With profile data, we can determine the cost in terms of missed fall through
2497	/// opportunities when rotating a loop chain and select the best rotation.
2498	/// Basically, there are three kinds of cost to consider for each rotation:
2499	/// 1. The possibly missed fall through edge (if it exists) from BB out of
2500	/// the loop to the loop header.
2501	/// 2. The possibly missed fall through edges (if they exist) from the loop
2502	/// exits to BB out of the loop.
2503	/// 3. The missed fall through edge (if it exists) from the last BB to the
2504	/// first BB in the loop chain.
2505	/// Therefore, the cost for a given rotation is the sum of costs listed above.
2506	/// We select the best rotation with the smallest cost.
2507	void MachineBlockPlacement::rotateLoopWithProfile(
2508	BlockChain &LoopChain, const MachineLoop &L,
2509	const BlockFilterSet &LoopBlockSet) {
2510	auto RotationPos = LoopChain.end();
2511	MachineBasicBlock ChainHeaderBB = LoopChain.begin();
2512
2513	// The entry block should always be the first BB in a function.
2514	if (ChainHeaderBB->isEntryBlock())
2515	return;
2516
2517	BlockFrequency SmallestRotationCost = BlockFrequency::max();
2518
2519	// A utility lambda that scales up a block frequency by dividing it by a
2520	// branch probability which is the reciprocal of the scale.
2521	auto ScaleBlockFrequency = [](BlockFrequency Freq,
2522	unsigned Scale) -> BlockFrequency {
2523	if (Scale == `0`)
2524	return BlockFrequency (`0`);
2525	// Use operator / between BlockFrequency and BranchProbability to implement
2526	// saturating multiplication.
2527	return Freq / BranchProbability (`1`, Scale);
2528	};
2529
2530	// Compute the cost of the missed fall-through edge to the loop header if the
2531	// chain head is not the loop header. As we only consider natural loops with
2532	// single header, this computation can be done only once.
2533	BlockFrequency HeaderFallThroughCost(`0`);
2534	for (auto *Pred : ChainHeaderBB->predecessors()) {
2535	BlockChain *PredChain = BlockToChain [Pred];
2536	if (!LoopBlockSet.count(key: Pred) &&
2537	(!PredChain \|\| Pred == *std::prev(x: PredChain->end()))) {
2538	auto EdgeFreq = MBFI ->getBlockFreq(MBB: Pred) *
2539	MBPI->getEdgeProbability(Src: Pred, Dst: ChainHeaderBB);
2540	auto FallThruCost = ScaleBlockFrequency (EdgeFreq, MisfetchCost);
2541	// If the predecessor has only an unconditional jump to the header, we
2542	// need to consider the cost of this jump.
2543	if (Pred->succ_size() == `1`)
2544	FallThruCost += ScaleBlockFrequency (EdgeFreq, JumpInstCost);
2545	HeaderFallThroughCost = std::max(a: HeaderFallThroughCost, b: FallThruCost);
2546	}
2547	}
2548
2549	// Here we collect all exit blocks in the loop, and for each exit we find out
2550	// its hottest exit edge. For each loop rotation, we define the loop exit cost
2551	// as the sum of frequencies of exit edges we collect here, excluding the exit
2552	// edge from the tail of the loop chain.
2553	SmallVector<std::pair<MachineBasicBlock *, BlockFrequency>, `4`> ExitsWithFreq;
2554	for (auto *BB : LoopChain) {
2555	auto LargestExitEdgeProb = BranchProbability::getZero();
2556	for (auto *Succ : BB->successors()) {
2557	BlockChain *SuccChain = BlockToChain [Succ];
2558	if (!LoopBlockSet.count(key: Succ) &&
2559	(!SuccChain \|\| Succ == *SuccChain->begin())) {
2560	auto SuccProb = MBPI->getEdgeProbability(Src: BB, Dst: Succ);
2561	LargestExitEdgeProb = std::max(a: LargestExitEdgeProb, b: SuccProb);
2562	}
2563	}
2564	if (LargestExitEdgeProb > BranchProbability::getZero()) {
2565	auto ExitFreq = MBFI ->getBlockFreq(MBB: BB) * LargestExitEdgeProb;
2566	ExitsWithFreq.emplace_back(Args&: BB, Args&: ExitFreq);
2567	}
2568	}
2569
2570	// In this loop we iterate every block in the loop chain and calculate the
2571	// cost assuming the block is the head of the loop chain. When the loop ends,
2572	// we should have found the best candidate as the loop chain's head.
2573	for (auto Iter = LoopChain.begin(), TailIter = std::prev(x: LoopChain.end()),
2574	EndIter = LoopChain.end();
2575	Iter != EndIter; Iter++, TailIter++) {
2576	// TailIter is used to track the tail of the loop chain if the block we are
2577	// checking (pointed by Iter) is the head of the chain.
2578	if (TailIter == LoopChain.end())
2579	TailIter = LoopChain.begin();
2580
2581	auto TailBB = *TailIter;
2582
2583	// Calculate the cost by putting this BB to the top.
2584	BlockFrequency Cost = BlockFrequency (`0`);
2585
2586	// If the current BB is the loop header, we need to take into account the
2587	// cost of the missed fall through edge from outside of the loop to the
2588	// header.
2589	if (Iter != LoopChain.begin())
2590	Cost += HeaderFallThroughCost;
2591
2592	// Collect the loop exit cost by summing up frequencies of all exit edges
2593	// except the one from the chain tail.
2594	for (auto &ExitWithFreq : ExitsWithFreq)
2595	if (TailBB != ExitWithFreq.first)
2596	Cost += ExitWithFreq.second;
2597
2598	// The cost of breaking the once fall-through edge from the tail to the top
2599	// of the loop chain. Here we need to consider three cases:
2600	// 1. If the tail node has only one successor, then we will get an
2601	// additional jmp instruction. So the cost here is (MisfetchCost +
2602	// JumpInstCost) tail node frequency.*
2603	// 2. If the tail node has two successors, then we may still get an
2604	// additional jmp instruction if the layout successor after the loop
2605	// chain is not its CFG successor. Note that the more frequently executed
2606	// jmp instruction will be put ahead of the other one. Assume the
2607	// frequency of those two branches are x and y, where x is the frequency
2608	// of the edge to the chain head, then the cost will be
2609	// (x MisfetechCost + min(x, y) * JumpInstCost) * tail node frequency.*
2610	// 3. If the tail node has more than two successors (this rarely happens),
2611	// we won't consider any additional cost.
2612	if (TailBB->isSuccessor(MBB: *Iter)) {
2613	auto TailBBFreq = MBFI ->getBlockFreq(MBB: TailBB);
2614	if (TailBB->succ_size() == `1`)
2615	Cost += ScaleBlockFrequency (TailBBFreq, MisfetchCost + JumpInstCost);
2616	else if (TailBB->succ_size() == `2`) {
2617	auto TailToHeadProb = MBPI->getEdgeProbability(Src: TailBB, Dst: *Iter);
2618	auto TailToHeadFreq = TailBBFreq * TailToHeadProb;
2619	auto ColderEdgeFreq = TailToHeadProb > BranchProbability (`1`, `2`)
2620	? TailBBFreq * TailToHeadProb.getCompl()
2621	: TailToHeadFreq;
2622	Cost += ScaleBlockFrequency (TailToHeadFreq, MisfetchCost) +
2623	ScaleBlockFrequency (ColderEdgeFreq, JumpInstCost);
2624	}
2625	}
2626
2627	LLVM_DEBUG(dbgs() << "The cost of loop rotation by making "
2628	<< getBlockName(*Iter) << " to the top: "
2629	<< printBlockFreq(MBFI->getMBFI(), Cost) << "\n");
2630
2631	if (Cost < SmallestRotationCost) {
2632	SmallestRotationCost = Cost;
2633	RotationPos = Iter;
2634	}
2635	}
2636
2637	if (RotationPos != LoopChain.end()) {
2638	LLVM_DEBUG(dbgs() << "Rotate loop by making " << getBlockName(*RotationPos)
2639	<< " to the top\n");
2640	std::rotate(first: LoopChain.begin(), middle: RotationPos, last: LoopChain.end());
2641	}
2642	}
2643
2644	/// Collect blocks in the given loop that are to be placed.
2645	///
2646	/// When profile data is available, exclude cold blocks from the returned set;
2647	/// otherwise, collect all blocks in the loop.
2648	MachineBlockPlacement::BlockFilterSet
2649	MachineBlockPlacement::collectLoopBlockSet(const MachineLoop &L) {
2650	// Collect the blocks in a set ordered by block number, as this gives the same
2651	// order as they appear in the function.
2652	struct MBBCompare {
2653	bool operator()(const MachineBasicBlock *X,
2654	const MachineBasicBlock Y) const* {
2655	return X->getNumber() < Y->getNumber();
2656	}
2657	};
2658	std::set<const MachineBasicBlock *, MBBCompare> LoopBlockSet;
2659
2660	// Filter cold blocks off from LoopBlockSet when profile data is available.
2661	// Collect the sum of frequencies of incoming edges to the loop header from
2662	// outside. If we treat the loop as a super block, this is the frequency of
2663	// the loop. Then for each block in the loop, we calculate the ratio between
2664	// its frequency and the frequency of the loop block. When it is too small,
2665	// don't add it to the loop chain. If there are outer loops, then this block
2666	// will be merged into the first outer loop chain for which this block is not
2667	// cold anymore. This needs precise profile data and we only do this when
2668	// profile data is available.
2669	if (F->getFunction().hasProfileData() \|\| ForceLoopColdBlock) {
2670	BlockFrequency LoopFreq(`0`);
2671	for (auto *LoopPred : L.getHeader()->predecessors())
2672	if (!L.contains(BB: LoopPred))
2673	LoopFreq += MBFI ->getBlockFreq(MBB: LoopPred) *
2674	MBPI->getEdgeProbability(Src: LoopPred, Dst: L.getHeader());
2675
2676	for (MachineBasicBlock *LoopBB : L.getBlocks()) {
2677	if (LoopBlockSet.count(x: LoopBB))
2678	continue;
2679	auto Freq = MBFI ->getBlockFreq(MBB: LoopBB).getFrequency();
2680	if (Freq == `0` \|\| LoopFreq.getFrequency() / Freq > LoopToColdBlockRatio)
2681	continue;
2682	BlockChain *Chain = BlockToChain [LoopBB];
2683	for (MachineBasicBlock ChainBB : Chain)
2684	LoopBlockSet.insert(x: ChainBB);
2685	}
2686	} else
2687	LoopBlockSet.insert(first: L.block_begin(), last: L.block_end());
2688
2689	// Copy the blocks into a BlockFilterSet, as iterating it is faster than
2690	// std::set. We will only remove blocks and never insert them, which will
2691	// preserve the ordering.
2692	BlockFilterSet Ret(LoopBlockSet.begin(), LoopBlockSet.end());
2693	return Ret;
2694	}
2695
2696	/// Forms basic block chains from the natural loop structures.
2697	///
2698	/// These chains are designed to preserve the existing structure* of the code*
2699	/// as much as possible. We can then stitch the chains together in a way which
2700	/// both preserves the topological structure and minimizes taken conditional
2701	/// branches.
2702	void MachineBlockPlacement::buildLoopChains(const MachineLoop &L) {
2703	// First recurse through any nested loops, building chains for those inner
2704	// loops.
2705	for (const MachineLoop *InnerLoop : L)
2706	buildLoopChains(L: *InnerLoop);
2707
2708	assert(BlockWorkList.empty() &&
2709	"BlockWorkList not empty when starting to build loop chains.");
2710	assert(EHPadWorkList.empty() &&
2711	"EHPadWorkList not empty when starting to build loop chains.");
2712	BlockFilterSet LoopBlockSet = collectLoopBlockSet(L);
2713
2714	// Check if we have profile data for this function. If yes, we will rotate
2715	// this loop by modeling costs more precisely which requires the profile data
2716	// for better layout.
2717	bool RotateLoopWithProfile =
2718	ForcePreciseRotationCost \|\|
2719	(PreciseRotationCost && F->getFunction().hasProfileData());
2720
2721	// First check to see if there is an obviously preferable top block for the
2722	// loop. This will default to the header, but may end up as one of the
2723	// predecessors to the header if there is one which will result in strictly
2724	// fewer branches in the loop body.
2725	MachineBasicBlock *LoopTop = findBestLoopTop(L, LoopBlockSet);
2726
2727	// If we selected just the header for the loop top, look for a potentially
2728	// profitable exit block in the event that rotating the loop can eliminate
2729	// branches by placing an exit edge at the bottom.
2730	//
2731	// Loops are processed innermost to uttermost, make sure we clear
2732	// PreferredLoopExit before processing a new loop.
2733	PreferredLoopExit = nullptr;
2734	BlockFrequency ExitFreq;
2735	if (!RotateLoopWithProfile && LoopTop == L.getHeader())
2736	PreferredLoopExit = findBestLoopExit(L, LoopBlockSet, ExitFreq);
2737
2738	BlockChain &LoopChain = *BlockToChain [LoopTop];
2739
2740	// FIXME: This is a really lame way of walking the chains in the loop: we
2741	// walk the blocks, and use a set to prevent visiting a particular chain
2742	// twice.
2743	SmallPtrSet<BlockChain *, `4`> UpdatedPreds;
2744	assert(LoopChain.UnscheduledPredecessors == `0` &&
2745	"LoopChain should not have unscheduled predecessors.");
2746	UpdatedPreds.insert(Ptr: &LoopChain);
2747
2748	for (const MachineBasicBlock *LoopBB : LoopBlockSet)
2749	fillWorkLists(MBB: LoopBB, UpdatedPreds, BlockFilter: &LoopBlockSet);
2750
2751	buildChain(HeadBB: LoopTop, Chain&: LoopChain, BlockFilter: &LoopBlockSet);
2752
2753	if (RotateLoopWithProfile)
2754	rotateLoopWithProfile(LoopChain, L, LoopBlockSet);
2755	else
2756	rotateLoop(LoopChain, ExitingBB: PreferredLoopExit, ExitFreq, LoopBlockSet);
2757
2758	LLVM_DEBUG({
2759	// Crash at the end so we get all of the debugging output first.
2760	bool BadLoop = false;
2761	if (LoopChain.UnscheduledPredecessors) {
2762	BadLoop = true;
2763	dbgs() << "Loop chain contains a block without its preds placed!\n"
2764	<< " Loop header: " << getBlockName(*L.block_begin()) << "\n"
2765	<< " Chain header: " << getBlockName(*LoopChain.begin()) << "\n";
2766	}
2767	for (MachineBasicBlock *ChainBB : LoopChain) {
2768	dbgs() << " ... " << getBlockName(ChainBB) << "\n";
2769	if (!LoopBlockSet.remove(ChainBB)) {
2770	// We don't mark the loop as bad here because there are real situations
2771	// where this can occur. For example, with an unanalyzable fallthrough
2772	// from a loop block to a non-loop block or vice versa.
2773	dbgs() << "Loop chain contains a block not contained by the loop!\n"
2774	<< " Loop header: " << getBlockName(*L.block_begin()) << "\n"
2775	<< " Chain header: " << getBlockName(*LoopChain.begin()) << "\n"
2776	<< " Bad block: " << getBlockName(ChainBB) << "\n";
2777	}
2778	}
2779
2780	if (!LoopBlockSet.empty()) {
2781	BadLoop = true;
2782	for (const MachineBasicBlock *LoopBB : LoopBlockSet)
2783	dbgs() << "Loop contains blocks never placed into a chain!\n"
2784	<< " Loop header: " << getBlockName(*L.block_begin()) << "\n"
2785	<< " Chain header: " << getBlockName(*LoopChain.begin()) << "\n"
2786	<< " Bad block: " << getBlockName(LoopBB) << "\n";
2787	}
2788	assert(!BadLoop && "Detected problems with the placement of this loop.");
2789	});
2790
2791	BlockWorkList.clear();
2792	EHPadWorkList.clear();
2793	}
2794
2795	void MachineBlockPlacement::buildCFGChains() {
2796	// Ensure that every BB in the function has an associated chain to simplify
2797	// the assumptions of the remaining algorithm.
2798	SmallVector<MachineOperand, `4`> Cond; // For analyzeBranch.
2799	for (MachineFunction::iterator FI = F->begin(), FE = F->end(); FI != FE;
2800	++FI) {
2801	MachineBasicBlock BB = &FI;
2802	BlockChain *Chain =
2803	new (ChainAllocator.Allocate()) BlockChain (BlockToChain, BB);
2804	// Also, merge any blocks which we cannot reason about and must preserve
2805	// the exact fallthrough behavior for.
2806	while (true) {
2807	Cond.clear();
2808	MachineBasicBlock TBB = nullptr, FBB = nullptr; // For analyzeBranch.
2809	if (!TII->analyzeBranch(MBB&: *BB, TBB, FBB, Cond) \|\| !FI ->canFallThrough())
2810	break;
2811
2812	MachineFunction::iterator NextFI = std::next(x: FI);
2813	MachineBasicBlock NextBB = &NextFI;
2814	// Ensure that the layout successor is a viable block, as we know that
2815	// fallthrough is a possibility.
2816	assert(NextFI != FE && "Can't fallthrough past the last block.");
2817	LLVM_DEBUG(dbgs() << "Pre-merging due to unanalyzable fallthrough: "
2818	<< getBlockName(BB) << " -> " << getBlockName(NextBB)
2819	<< "\n");
2820	Chain->merge(BB: NextBB, Chain: nullptr);
2821	#ifndef NDEBUG
2822	BlocksWithUnanalyzableExits.insert(&*BB);
2823	#endif
2824	FI = NextFI;
2825	BB = NextBB;
2826	}
2827	}
2828
2829	// Build any loop-based chains.
2830	PreferredLoopExit = nullptr;
2831	for (MachineLoop L : MLI)
2832	buildLoopChains(L: *L);
2833
2834	assert(BlockWorkList.empty() &&
2835	"BlockWorkList should be empty before building final chain.");
2836	assert(EHPadWorkList.empty() &&
2837	"EHPadWorkList should be empty before building final chain.");
2838
2839	SmallPtrSet<BlockChain *, `4`> UpdatedPreds;
2840	for (MachineBasicBlock &MBB : *F)
2841	fillWorkLists(MBB: &MBB, UpdatedPreds);
2842
2843	BlockChain &FunctionChain = *BlockToChain [&F->front()];
2844	buildChain(HeadBB: &F->front(), Chain&: FunctionChain);
2845
2846	#ifndef NDEBUG
2847	using FunctionBlockSetType = SmallPtrSet<MachineBasicBlock *, `16`>;
2848	#endif
2849	LLVM_DEBUG({
2850	// Crash at the end so we get all of the debugging output first.
2851	bool BadFunc = false;
2852	FunctionBlockSetType FunctionBlockSet;
2853	for (MachineBasicBlock &MBB : *F)
2854	FunctionBlockSet.insert(&MBB);
2855
2856	for (MachineBasicBlock *ChainBB : FunctionChain)
2857	if (!FunctionBlockSet.erase(ChainBB)) {
2858	BadFunc = true;
2859	dbgs() << "Function chain contains a block not in the function!\n"
2860	<< " Bad block: " << getBlockName(ChainBB) << "\n";
2861	}
2862
2863	if (!FunctionBlockSet.empty()) {
2864	BadFunc = true;
2865	for (MachineBasicBlock *RemainingBB : FunctionBlockSet)
2866	dbgs() << "Function contains blocks never placed into a chain!\n"
2867	<< " Bad block: " << getBlockName(RemainingBB) << "\n";
2868	}
2869	assert(!BadFunc && "Detected problems with the block placement.");
2870	});
2871
2872	// Remember original layout ordering, so we can update terminators after
2873	// reordering to point to the original layout successor.
2874	SmallVector<MachineBasicBlock *, `4`> OriginalLayoutSuccessors(
2875	F->getNumBlockIDs());
2876	{
2877	MachineBasicBlock LastMBB = nullptr*;
2878	for (auto &MBB : *F) {
2879	if (LastMBB != nullptr)
2880	OriginalLayoutSuccessors [LastMBB->getNumber()] = &MBB;
2881	LastMBB = &MBB;
2882	}
2883	OriginalLayoutSuccessors [F->back().getNumber()] = nullptr;
2884	}
2885
2886	// Splice the blocks into place.
2887	MachineFunction::iterator InsertPos = F->begin();
2888	LLVM_DEBUG(dbgs() << "[MBP] Function: " << F->getName() << "\n");
2889	for (MachineBasicBlock *ChainBB : FunctionChain) {
2890	LLVM_DEBUG(dbgs() << (ChainBB == *FunctionChain.begin() ? "Placing chain "
2891	: " ... ")
2892	<< getBlockName(ChainBB) << "\n");
2893	if (InsertPos != MachineFunction::iterator (ChainBB))
2894	F->splice(InsertPt: InsertPos, MBB: ChainBB);
2895	else
2896	++InsertPos;
2897
2898	// Update the terminator of the previous block.
2899	if (ChainBB == *FunctionChain.begin())
2900	continue;
2901	MachineBasicBlock PrevBB = &std::prev(x: MachineFunction::iterator (ChainBB));
2902
2903	// FIXME: It would be awesome of updateTerminator would just return rather
2904	// than assert when the branch cannot be analyzed in order to remove this
2905	// boiler plate.
2906	Cond.clear();
2907	MachineBasicBlock TBB = nullptr, FBB = nullptr; // For analyzeBranch.
2908
2909	#ifndef NDEBUG
2910	if (!BlocksWithUnanalyzableExits.count(PrevBB)) {
2911	// Given the exact block placement we chose, we may actually not _need_ to
2912	// be able to edit PrevBB's terminator sequence, but not being _able_ to
2913	// do that at this point is a bug.
2914	assert((!TII->analyzeBranch(*PrevBB, TBB, FBB, Cond) \|\|
2915	!PrevBB->canFallThrough()) &&
2916	"Unexpected block with un-analyzable fallthrough!");
2917	Cond.clear();
2918	TBB = FBB = nullptr;
2919	}
2920	#endif
2921
2922	// The "PrevBB" is not yet updated to reflect current code layout, so,
2923	// o. it may fall-through to a block without explicit "goto" instruction
2924	// before layout, and no longer fall-through it after layout; or
2925	// o. just opposite.
2926	//
2927	// analyzeBranch() may return erroneous value for FBB when these two
2928	// situations take place. For the first scenario FBB is mistakenly set NULL;
2929	// for the 2nd scenario, the FBB, which is expected to be NULL, is
2930	// mistakenly pointing to "BI".*
2931	// Thus, if the future change needs to use FBB before the layout is set, it
2932	// has to correct FBB first by using the code similar to the following:
2933	//
2934	// if (!Cond.empty() && (!FBB \|\| FBB == ChainBB)) {
2935	// PrevBB->updateTerminator();
2936	// Cond.clear();
2937	// TBB = FBB = nullptr;
2938	// if (TII->analyzeBranch(PrevBB, TBB, FBB, Cond)) {*
2939	// // FIXME: This should never take place.
2940	// TBB = FBB = nullptr;
2941	// }
2942	// }
2943	if (!TII->analyzeBranch(MBB&: *PrevBB, TBB, FBB, Cond)) {
2944	PrevBB->updateTerminator(PreviousLayoutSuccessor: OriginalLayoutSuccessors [PrevBB->getNumber()]);
2945	}
2946	}
2947
2948	// Fixup the last block.
2949	Cond.clear();
2950	MachineBasicBlock TBB = nullptr, FBB = nullptr; // For analyzeBranch.
2951	if (!TII->analyzeBranch(MBB&: F->back(), TBB, FBB, Cond)) {
2952	MachineBasicBlock *PrevBB = &F->back();
2953	PrevBB->updateTerminator(PreviousLayoutSuccessor: OriginalLayoutSuccessors [PrevBB->getNumber()]);
2954	}
2955
2956	BlockWorkList.clear();
2957	EHPadWorkList.clear();
2958	}
2959
2960	void MachineBlockPlacement::optimizeBranches() {
2961	BlockChain &FunctionChain = *BlockToChain [&F->front()];
2962	SmallVector<MachineOperand, `4`> Cond;
2963
2964	// Now that all the basic blocks in the chain have the proper layout,
2965	// make a final call to analyzeBranch with AllowModify set.
2966	// Indeed, the target may be able to optimize the branches in a way we
2967	// cannot because all branches may not be analyzable.
2968	// E.g., the target may be able to remove an unconditional branch to
2969	// a fallthrough when it occurs after predicated terminators.
2970	for (MachineBasicBlock *ChainBB : FunctionChain) {
2971	Cond.clear();
2972	MachineBasicBlock TBB = nullptr, FBB = nullptr;
2973	if (TII->analyzeBranch(MBB&: ChainBB, TBB, FBB, Cond, /AllowModify/* true))
2974	continue;
2975	if (!TBB \|\| !FBB \|\| Cond.empty())
2976	continue;
2977	// If we are optimizing for size we do not consider the runtime performance.
2978	// Instead, we retain the original branch condition so we have more uniform
2979	// instructions which will benefit ICF.
2980	if (llvm::shouldOptimizeForSize(MBB: ChainBB, PSI, MBFIWrapper: MBFI.get()))
2981	continue;
2982	// If ChainBB has a two-way branch, try to re-order the branches
2983	// such that we branch to the successor with higher probability first.
2984	if (MBPI->getEdgeProbability(Src: ChainBB, Dst: TBB) >=
2985	MBPI->getEdgeProbability(Src: ChainBB, Dst: FBB))
2986	continue;
2987	if (TII->reverseBranchCondition(Cond))
2988	continue;
2989	LLVM_DEBUG(dbgs() << "Reverse order of the two branches: "
2990	<< getBlockName(ChainBB) << "\n");
2991	LLVM_DEBUG(dbgs() << " " << getBlockName(TBB) << " < " << getBlockName(FBB)
2992	<< "\n");
2993	auto Dl = ChainBB->findBranchDebugLoc();
2994	TII->removeBranch(MBB&: *ChainBB);
2995	TII->insertBranch(MBB&: *ChainBB, TBB: FBB, FBB: TBB, Cond, DL: Dl);
2996	}
2997	}
2998
2999	void MachineBlockPlacement::alignBlocks() {
3000	// Walk through the backedges of the function now that we have fully laid out
3001	// the basic blocks and align the destination of each backedge. We don't rely
3002	// exclusively on the loop info here so that we can align backedges in
3003	// unnatural CFGs and backedges that were introduced purely because of the
3004	// loop rotations done during this layout pass.
3005	if (!AlignAllBlock && !AlignAllNonFallThruBlocks) {
3006	if (F->getFunction().hasMinSize() \|\|
3007	(F->getFunction().hasOptSize() && !TLI->alignLoopsWithOptSize()))
3008	return;
3009	}
3010
3011	BlockChain &FunctionChain = *BlockToChain [&F->front()];
3012	// Empty chain.
3013	if (FunctionChain.begin() == FunctionChain.end())
3014	return;
3015
3016	const BranchProbability ColdProb(`1`, `5`); // 20%
3017	BlockFrequency EntryFreq = MBFI ->getBlockFreq(MBB: &F->front());
3018	BlockFrequency WeightedEntryFreq = EntryFreq * ColdProb;
3019	for (MachineBasicBlock *ChainBB : FunctionChain) {
3020	if (ChainBB == *FunctionChain.begin())
3021	continue;
3022
3023	// Don't align non-looping basic blocks. These are unlikely to execute
3024	// enough times to matter in practice. Note that we'll still handle
3025	// unnatural CFGs inside of a natural outer loop (the common case) and
3026	// rotated loops.
3027	MachineLoop *L = MLI->getLoopFor(BB: ChainBB);
3028	if (!L)
3029	continue;
3030
3031	const Align TLIAlign = TLI->getPrefLoopAlignment(ML: L);
3032	unsigned MDAlign = `1`;
3033	MDNode *LoopID = L->getLoopID();
3034	if (LoopID) {
3035	for (const MDOperand &MDO : llvm::drop_begin(RangeOrContainer: LoopID->operands())) {
3036	MDNode *MD = dyn_cast<MDNode>(Val: MDO);
3037	if (MD == nullptr)
3038	continue;
3039	MDString *S = dyn_cast<MDString>(Val: MD->getOperand(I: `0`));
3040	if (S == nullptr)
3041	continue;
3042	if (S->getString() == "llvm.loop.align") {
3043	assert(MD->getNumOperands() == `2` &&
3044	"per-loop align metadata should have two operands.");
3045	MDAlign =
3046	mdconst::extract<ConstantInt>(MD: MD->getOperand(I: `1`))->getZExtValue();
3047	assert(MDAlign >= `1` && "per-loop align value must be positive.");
3048	}
3049	}
3050	}
3051
3052	// Use max of the TLIAlign and MDAlign
3053	const Align LoopAlign = std::max(a: TLIAlign, b: Align (MDAlign));
3054	if (LoopAlign == `1`)
3055	continue; // Don't care about loop alignment.
3056
3057	// If the block is cold relative to the function entry don't waste space
3058	// aligning it.
3059	BlockFrequency Freq = MBFI ->getBlockFreq(MBB: ChainBB);
3060	if (Freq < WeightedEntryFreq)
3061	continue;
3062
3063	// If the block is cold relative to its loop header, don't align it
3064	// regardless of what edges into the block exist.
3065	MachineBasicBlock *LoopHeader = L->getHeader();
3066	BlockFrequency LoopHeaderFreq = MBFI ->getBlockFreq(MBB: LoopHeader);
3067	if (Freq < (LoopHeaderFreq * ColdProb))
3068	continue;
3069
3070	// If the global profiles indicates so, don't align it.
3071	if (llvm::shouldOptimizeForSize(MBB: ChainBB, PSI, MBFIWrapper: MBFI.get()) &&
3072	!TLI->alignLoopsWithOptSize())
3073	continue;
3074
3075	// Check for the existence of a non-layout predecessor which would benefit
3076	// from aligning this block.
3077	MachineBasicBlock *LayoutPred =
3078	&*std::prev(x: MachineFunction::iterator (ChainBB));
3079
3080	auto DetermineMaxAlignmentPadding = [&]() {
3081	// Set the maximum bytes allowed to be emitted for alignment.
3082	unsigned MaxBytes;
3083	if (MaxBytesForAlignmentOverride.getNumOccurrences() > `0`)
3084	MaxBytes = MaxBytesForAlignmentOverride;
3085	else
3086	MaxBytes = TLI->getMaxPermittedBytesForAlignment(MBB: ChainBB);
3087	ChainBB->setMaxBytesForAlignment(MaxBytes);
3088	};
3089
3090	// Force alignment if all the predecessors are jumps. We already checked
3091	// that the block isn't cold above.
3092	if (!LayoutPred->isSuccessor(MBB: ChainBB)) {
3093	ChainBB->setAlignment(LoopAlign);
3094	DetermineMaxAlignmentPadding ();
3095	continue;
3096	}
3097
3098	// Align this block if the layout predecessor's edge into this block is
3099	// cold relative to the block. When this is true, other predecessors make up
3100	// all of the hot entries into the block and thus alignment is likely to be
3101	// important.
3102	BranchProbability LayoutProb =
3103	MBPI->getEdgeProbability(Src: LayoutPred, Dst: ChainBB);
3104	BlockFrequency LayoutEdgeFreq = MBFI ->getBlockFreq(MBB: LayoutPred) * LayoutProb;
3105	if (LayoutEdgeFreq <= (Freq * ColdProb)) {
3106	ChainBB->setAlignment(LoopAlign);
3107	DetermineMaxAlignmentPadding ();
3108	}
3109	}
3110
3111	const bool HasMaxBytesOverride =
3112	MaxBytesForAlignmentOverride.getNumOccurrences() > `0`;
3113
3114	if (AlignAllBlock)
3115	// Align all of the blocks in the function to a specific alignment.
3116	for (MachineBasicBlock &MBB : *F) {
3117	if (HasMaxBytesOverride)
3118	MBB.setAlignment(A: Align (`1ULL` << AlignAllBlock),
3119	MaxBytes: MaxBytesForAlignmentOverride);
3120	else
3121	MBB.setAlignment(Align (`1ULL` << AlignAllBlock));
3122	}
3123	else if (AlignAllNonFallThruBlocks) {
3124	// Align all of the blocks that have no fall-through predecessors to a
3125	// specific alignment.
3126	for (auto MBI = std::next(x: F->begin()), MBE = F->end(); MBI != MBE; ++MBI) {
3127	auto LayoutPred = std::prev(x: MBI);
3128	if (!LayoutPred ->isSuccessor(MBB: &*MBI)) {
3129	if (HasMaxBytesOverride)
3130	MBI ->setAlignment(A: Align (`1ULL` << AlignAllNonFallThruBlocks),
3131	MaxBytes: MaxBytesForAlignmentOverride);
3132	else
3133	MBI ->setAlignment(Align (`1ULL` << AlignAllNonFallThruBlocks));
3134	}
3135	}
3136	}
3137	}
3138
3139	/// Tail duplicate \p BB into (some) predecessors if profitable, repeating if
3140	/// it was duplicated into its chain predecessor and removed.
3141	/// \p BB - Basic block that may be duplicated.
3142	///
3143	/// \p LPred - Chosen layout predecessor of \p BB.
3144	/// Updated to be the chain end if LPred is removed.
3145	/// \p Chain - Chain to which \p LPred belongs, and \p BB will belong.
3146	/// \p BlockFilter - Set of blocks that belong to the loop being laid out.
3147	/// Used to identify which blocks to update predecessor
3148	/// counts.
3149	/// \p PrevUnplacedBlockIt - Iterator pointing to the last block that was
3150	/// chosen in the given order due to unnatural CFG
3151	/// only needed if \p BB is removed and
3152	/// \p PrevUnplacedBlockIt pointed to \p BB.
3153	/// @return true if \p BB was removed.
3154	bool MachineBlockPlacement::repeatedlyTailDuplicateBlock(
3155	MachineBasicBlock BB, MachineBasicBlock &LPred,
3156	const MachineBasicBlock *LoopHeaderBB, BlockChain &Chain,
3157	BlockFilterSet *BlockFilter, MachineFunction::iterator &PrevUnplacedBlockIt,
3158	BlockFilterSet::iterator &PrevUnplacedBlockInFilterIt) {
3159	bool Removed, DuplicatedToLPred;
3160	bool DuplicatedToOriginalLPred;
3161	Removed = maybeTailDuplicateBlock(
3162	BB, LPred, Chain, BlockFilter, PrevUnplacedBlockIt,
3163	PrevUnplacedBlockInFilterIt, DuplicatedToLPred);
3164	if (!Removed)
3165	return false;
3166	DuplicatedToOriginalLPred = DuplicatedToLPred;
3167	// Iteratively try to duplicate again. It can happen that a block that is
3168	// duplicated into is still small enough to be duplicated again.
3169	// No need to call markBlockSuccessors in this case, as the blocks being
3170	// duplicated from here on are already scheduled.
3171	while (DuplicatedToLPred && Removed) {
3172	MachineBasicBlock DupBB, DupPred;
3173	// The removal callback causes Chain.end() to be updated when a block is
3174	// removed. On the first pass through the loop, the chain end should be the
3175	// same as it was on function entry. On subsequent passes, because we are
3176	// duplicating the block at the end of the chain, if it is removed the
3177	// chain will have shrunk by one block.
3178	BlockChain::iterator ChainEnd = Chain.end();
3179	DupBB = *(--ChainEnd);
3180	// Now try to duplicate again.
3181	if (ChainEnd == Chain.begin())
3182	break;
3183	DupPred = *std::prev(x: ChainEnd);
3184	Removed = maybeTailDuplicateBlock(
3185	BB: DupBB, LPred: DupPred, Chain, BlockFilter, PrevUnplacedBlockIt,
3186	PrevUnplacedBlockInFilterIt, DuplicatedToLPred);
3187	}
3188	// If BB was duplicated into LPred, it is now scheduled. But because it was
3189	// removed, markChainSuccessors won't be called for its chain. Instead we
3190	// call markBlockSuccessors for LPred to achieve the same effect. This must go
3191	// at the end because repeating the tail duplication can increase the number
3192	// of unscheduled predecessors.
3193	LPred = *std::prev(x: Chain.end());
3194	if (DuplicatedToOriginalLPred)
3195	markBlockSuccessors(Chain, MBB: LPred, LoopHeaderBB, BlockFilter);
3196	return true;
3197	}
3198
3199	/// Tail duplicate \p BB into (some) predecessors if profitable.
3200	/// \p BB - Basic block that may be duplicated
3201	/// \p LPred - Chosen layout predecessor of \p BB
3202	/// \p Chain - Chain to which \p LPred belongs, and \p BB will belong.
3203	/// \p BlockFilter - Set of blocks that belong to the loop being laid out.
3204	/// Used to identify which blocks to update predecessor
3205	/// counts.
3206	/// \p PrevUnplacedBlockIt - Iterator pointing to the last block that was
3207	/// chosen in the given order due to unnatural CFG
3208	/// only needed if \p BB is removed and
3209	/// \p PrevUnplacedBlockIt pointed to \p BB.
3210	/// \p DuplicatedToLPred - True if the block was duplicated into LPred.
3211	/// \return - True if the block was duplicated into all preds and removed.
3212	bool MachineBlockPlacement::maybeTailDuplicateBlock(
3213	MachineBasicBlock BB, MachineBasicBlock LPred, BlockChain &Chain,
3214	BlockFilterSet *BlockFilter, MachineFunction::iterator &PrevUnplacedBlockIt,
3215	BlockFilterSet::iterator &PrevUnplacedBlockInFilterIt,
3216	bool &DuplicatedToLPred) {
3217	DuplicatedToLPred = false;
3218	if (!shouldTailDuplicate(BB))
3219	return false;
3220
3221	LLVM_DEBUG(dbgs() << "Redoing tail duplication for Succ#" << BB->getNumber()
3222	<< "\n");
3223
3224	// This has to be a callback because none of it can be done after
3225	// BB is deleted.
3226	bool Removed = false;
3227	auto RemovalCallback = [&](MachineBasicBlock *RemBB) {
3228	// Signal to outer function
3229	Removed = true;
3230
3231	// Remove from the Chain and Chain Map
3232	if (auto It = BlockToChain.find(Val: RemBB); It != BlockToChain.end()) {
3233	It ->second->remove(BB: RemBB);
3234	BlockToChain.erase(I: It);
3235	}
3236
3237	// Handle the unplaced block iterator
3238	if (&(*PrevUnplacedBlockIt) == RemBB) {
3239	PrevUnplacedBlockIt ++;
3240	}
3241
3242	// Handle the Work Lists
3243	if (RemBB->isEHPad()) {
3244	llvm::erase(C&: EHPadWorkList, V: RemBB);
3245	} else {
3246	llvm::erase(C&: BlockWorkList, V: RemBB);
3247	}
3248
3249	// Handle the filter set
3250	if (BlockFilter) {
3251	auto It = llvm::find(Range&: *BlockFilter, Val: RemBB);
3252	// Erase RemBB from BlockFilter, and keep PrevUnplacedBlockInFilterIt
3253	// pointing to the same element as before.
3254	if (It != BlockFilter->end()) {
3255	if (It < PrevUnplacedBlockInFilterIt) {
3256	const MachineBasicBlock PrevBB = PrevUnplacedBlockInFilterIt;
3257	// BlockFilter is a SmallVector so all elements after RemBB are
3258	// shifted to the front by 1 after its deletion.
3259	auto Distance = PrevUnplacedBlockInFilterIt - It - `1`;
3260	PrevUnplacedBlockInFilterIt = BlockFilter->erase(I: It) + Distance;
3261	assert(*PrevUnplacedBlockInFilterIt == PrevBB);
3262	(void)PrevBB;
3263	} else if (It == PrevUnplacedBlockInFilterIt)
3264	// The block pointed by PrevUnplacedBlockInFilterIt is erased, we
3265	// have to set it to the next element.
3266	PrevUnplacedBlockInFilterIt = BlockFilter->erase(I: It);
3267	else
3268	BlockFilter->erase(I: It);
3269	}
3270	}
3271
3272	// Remove the block from loop info.
3273	MLI->removeBlock(BB: RemBB);
3274	if (RemBB == PreferredLoopExit)
3275	PreferredLoopExit = nullptr;
3276
3277	LLVM_DEBUG(dbgs() << "TailDuplicator deleted block: " << getBlockName(RemBB)
3278	<< "\n");
3279	};
3280	auto RemovalCallbackRef =
3281	function_ref<void(MachineBasicBlock *)>(RemovalCallback);
3282
3283	SmallVector<MachineBasicBlock *, `8`> DuplicatedPreds;
3284	bool IsSimple = TailDup.isSimpleBB(TailBB: BB);
3285	SmallVector<MachineBasicBlock *, `8`> CandidatePreds;
3286	SmallVectorImpl<MachineBasicBlock > CandidatePtr = nullptr;
3287	if (F->getFunction().hasProfileData()) {
3288	// We can do partial duplication with precise profile information.
3289	findDuplicateCandidates(Candidates&: CandidatePreds, BB, BlockFilter);
3290	if (CandidatePreds.size() == `0`)
3291	return false;
3292	if (CandidatePreds.size() < BB->pred_size())
3293	CandidatePtr = &CandidatePreds;
3294	}
3295	TailDup.tailDuplicateAndUpdate(IsSimple, MBB: BB, ForcedLayoutPred: LPred, DuplicatedPreds: &DuplicatedPreds,
3296	RemovalCallback: &RemovalCallbackRef, CandidatePtr);
3297
3298	// Update UnscheduledPredecessors to reflect tail-duplication.
3299	DuplicatedToLPred = false;
3300	for (MachineBasicBlock *Pred : DuplicatedPreds) {
3301	// We're only looking for unscheduled predecessors that match the filter.
3302	BlockChain *PredChain = BlockToChain [Pred];
3303	if (Pred == LPred)
3304	DuplicatedToLPred = true;
3305	if (Pred == LPred \|\| (BlockFilter && !BlockFilter->count(key: Pred)) \|\|
3306	PredChain == &Chain)
3307	continue;
3308	for (MachineBasicBlock *NewSucc : Pred->successors()) {
3309	if (BlockFilter && !BlockFilter->count(key: NewSucc))
3310	continue;
3311	BlockChain *NewChain = BlockToChain [NewSucc];
3312	if (NewChain != &Chain && NewChain != PredChain)
3313	NewChain->UnscheduledPredecessors++;
3314	}
3315	}
3316	return Removed;
3317	}
3318
3319	// Count the number of actual machine instructions.
3320	static uint64_t countMBBInstruction(MachineBasicBlock *MBB) {
3321	uint64_t InstrCount = `0`;
3322	for (MachineInstr &MI : *MBB) {
3323	if (!MI.isPHI() && !MI.isMetaInstruction())
3324	InstrCount += `1`;
3325	}
3326	return InstrCount;
3327	}
3328
3329	// The size cost of duplication is the instruction size of the duplicated block.
3330	// So we should scale the threshold accordingly. But the instruction size is not
3331	// available on all targets, so we use the number of instructions instead.
3332	BlockFrequency MachineBlockPlacement::scaleThreshold(MachineBasicBlock *BB) {
3333	return BlockFrequency (DupThreshold.getFrequency() * countMBBInstruction(MBB: BB));
3334	}
3335
3336	// Returns true if BB is Pred's best successor.
3337	bool MachineBlockPlacement::isBestSuccessor(MachineBasicBlock *BB,
3338	MachineBasicBlock *Pred,
3339	BlockFilterSet *BlockFilter) {
3340	if (BB == Pred)
3341	return false;
3342	if (BlockFilter && !BlockFilter->count(key: Pred))
3343	return false;
3344	BlockChain *PredChain = BlockToChain [Pred];
3345	if (PredChain && (Pred != *std::prev(x: PredChain->end())))
3346	return false;
3347
3348	// Find the successor with largest probability excluding BB.
3349	BranchProbability BestProb = BranchProbability::getZero();
3350	for (MachineBasicBlock *Succ : Pred->successors())
3351	if (Succ != BB) {
3352	if (BlockFilter && !BlockFilter->count(key: Succ))
3353	continue;
3354	BlockChain *SuccChain = BlockToChain [Succ];
3355	if (SuccChain && (Succ != *SuccChain->begin()))
3356	continue;
3357	BranchProbability SuccProb = MBPI->getEdgeProbability(Src: Pred, Dst: Succ);
3358	if (SuccProb > BestProb)
3359	BestProb = SuccProb;
3360	}
3361
3362	BranchProbability BBProb = MBPI->getEdgeProbability(Src: Pred, Dst: BB);
3363	if (BBProb <= BestProb)
3364	return false;
3365
3366	// Compute the number of reduced taken branches if Pred falls through to BB
3367	// instead of another successor. Then compare it with threshold.
3368	BlockFrequency PredFreq = getBlockCountOrFrequency(BB: Pred);
3369	BlockFrequency Gain = PredFreq * (BBProb - BestProb);
3370	return Gain > scaleThreshold(BB);
3371	}
3372
3373	// Find out the predecessors of BB and BB can be beneficially duplicated into
3374	// them.
3375	void MachineBlockPlacement::findDuplicateCandidates(
3376	SmallVectorImpl<MachineBasicBlock > &Candidates, MachineBasicBlock BB,
3377	BlockFilterSet *BlockFilter) {
3378	MachineBasicBlock Fallthrough = nullptr*;
3379	BranchProbability DefaultBranchProb = BranchProbability::getZero();
3380	BlockFrequency BBDupThreshold(scaleThreshold(BB));
3381	SmallVector<MachineBasicBlock *, `8`> Preds(BB->predecessors());
3382	SmallVector<MachineBasicBlock *, `8`> Succs(BB->successors());
3383
3384	// Sort for highest frequency.
3385	auto CmpSucc = [&](MachineBasicBlock A, MachineBasicBlock B) {
3386	return MBPI->getEdgeProbability(Src: BB, Dst: A) > MBPI->getEdgeProbability(Src: BB, Dst: B);
3387	};
3388	auto CmpPred = [&](MachineBasicBlock A, MachineBasicBlock B) {
3389	return MBFI ->getBlockFreq(MBB: A) > MBFI ->getBlockFreq(MBB: B);
3390	};
3391	llvm::stable_sort(Range&: Succs, C: CmpSucc);
3392	llvm::stable_sort(Range&: Preds, C: CmpPred);
3393
3394	auto SuccIt = Succs.begin();
3395	if (SuccIt != Succs.end()) {
3396	DefaultBranchProb = MBPI->getEdgeProbability(Src: BB, Dst: *SuccIt).getCompl();
3397	}
3398
3399	// For each predecessors of BB, compute the benefit of duplicating BB,
3400	// if it is larger than the threshold, add it into Candidates.
3401	//
3402	// If we have following control flow.
3403	//
3404	// PB1 PB2 PB3 PB4
3405	// \ \| / /\
3406	// \ \| / / \
3407	// \ \|/ / \
3408	// BB----/ OB
3409	// /\
3410	// / \
3411	// SB1 SB2
3412	//
3413	// And it can be partially duplicated as
3414	//
3415	// PB2+BB
3416	// \| PB1 PB3 PB4
3417	// \| \| / /\
3418	// \| \| / / \
3419	// \| \|/ / \
3420	// \| BB----/ OB
3421	// \|\ /\|
3422	// \| X \|
3423	// \|/ \\|
3424	// SB2 SB1
3425	//
3426	// The benefit of duplicating into a predecessor is defined as
3427	// Orig_taken_branch - Duplicated_taken_branch
3428	//
3429	// The Orig_taken_branch is computed with the assumption that predecessor
3430	// jumps to BB and the most possible successor is laid out after BB.
3431	//
3432	// The Duplicated_taken_branch is computed with the assumption that BB is
3433	// duplicated into PB, and one successor is layout after it (SB1 for PB1 and
3434	// SB2 for PB2 in our case). If there is no available successor, the combined
3435	// block jumps to all BB's successor, like PB3 in this example.
3436	//
3437	// If a predecessor has multiple successors, so BB can't be duplicated into
3438	// it. But it can beneficially fall through to BB, and duplicate BB into other
3439	// predecessors.
3440	for (MachineBasicBlock *Pred : Preds) {
3441	BlockFrequency PredFreq = getBlockCountOrFrequency(BB: Pred);
3442
3443	if (!TailDup.canTailDuplicate(TailBB: BB, PredBB: Pred)) {
3444	// BB can't be duplicated into Pred, but it is possible to be layout
3445	// below Pred.
3446	if (!Fallthrough && isBestSuccessor(BB, Pred, BlockFilter)) {
3447	Fallthrough = Pred;
3448	if (SuccIt != Succs.end())
3449	SuccIt++;
3450	}
3451	continue;
3452	}
3453
3454	BlockFrequency OrigCost = PredFreq + PredFreq * DefaultBranchProb;
3455	BlockFrequency DupCost;
3456	if (SuccIt == Succs.end()) {
3457	// Jump to all successors;
3458	if (Succs.size() > `0`)
3459	DupCost += PredFreq;
3460	} else {
3461	// Fallthrough to SuccIt, jump to all other successors;*
3462	DupCost += PredFreq;
3463	DupCost -= PredFreq * MBPI->getEdgeProbability(Src: BB, Dst: *SuccIt);
3464	}
3465
3466	assert(OrigCost >= DupCost);
3467	OrigCost -= DupCost;
3468	if (OrigCost > BBDupThreshold) {
3469	Candidates.push_back(Elt: Pred);
3470	if (SuccIt != Succs.end())
3471	SuccIt++;
3472	}
3473	}
3474
3475	// No predecessors can optimally fallthrough to BB.
3476	// So we can change one duplication into fallthrough.
3477	if (!Fallthrough) {
3478	if ((Candidates.size() < Preds.size()) && (Candidates.size() > `0`)) {
3479	Candidates [`0`] = Candidates.back();
3480	Candidates.pop_back();
3481	}
3482	}
3483	}
3484
3485	void MachineBlockPlacement::initTailDupThreshold() {
3486	DupThreshold = BlockFrequency (`0`);
3487	if (F->getFunction().hasProfileData()) {
3488	// We prefer to use prifile count.
3489	uint64_t HotThreshold = PSI->getOrCompHotCountThreshold();
3490	if (HotThreshold != UINT64_MAX) {
3491	UseProfileCount = true;
3492	DupThreshold =
3493	BlockFrequency (HotThreshold * TailDupProfilePercentThreshold / `100`);
3494	} else {
3495	// Profile count is not available, we can use block frequency instead.
3496	BlockFrequency MaxFreq = BlockFrequency (`0`);
3497	for (MachineBasicBlock &MBB : *F) {
3498	BlockFrequency Freq = MBFI ->getBlockFreq(MBB: &MBB);
3499	if (Freq > MaxFreq)
3500	MaxFreq = Freq;
3501	}
3502
3503	BranchProbability ThresholdProb(TailDupPlacementPenalty, `100`);
3504	DupThreshold = BlockFrequency(MaxFreq * ThresholdProb);
3505	UseProfileCount = false;
3506	}
3507	}
3508
3509	TailDupSize = TailDupPlacementThreshold;
3510	// If only the aggressive threshold is explicitly set, use it.
3511	if (TailDupPlacementAggressiveThreshold.getNumOccurrences() != `0` &&
3512	TailDupPlacementThreshold.getNumOccurrences() == `0`)
3513	TailDupSize = TailDupPlacementAggressiveThreshold;
3514
3515	// For aggressive optimization, we can adjust some thresholds to be less
3516	// conservative.
3517	if (OptLevel >= CodeGenOptLevel::Aggressive) {
3518	// At O3 we should be more willing to copy blocks for tail duplication. This
3519	// increases size pressure, so we only do it at O3
3520	// Do this unless only the regular threshold is explicitly set.
3521	if (TailDupPlacementThreshold.getNumOccurrences() == `0` \|\|
3522	TailDupPlacementAggressiveThreshold.getNumOccurrences() != `0`)
3523	TailDupSize = TailDupPlacementAggressiveThreshold;
3524	}
3525
3526	// If there's no threshold provided through options, query the target
3527	// information for a threshold instead.
3528	if (TailDupPlacementThreshold.getNumOccurrences() == `0` &&
3529	(OptLevel < CodeGenOptLevel::Aggressive \|\|
3530	TailDupPlacementAggressiveThreshold.getNumOccurrences() == `0`))
3531	TailDupSize = TII->getTailDuplicateSize(OptLevel);
3532	}
3533
3534	PreservedAnalyses
3535	MachineBlockPlacementPass::run(MachineFunction &MF,
3536	MachineFunctionAnalysisManager &MFAM) {
3537	auto *MBPI = &MFAM.getResult<MachineBranchProbabilityAnalysis>(IR&: MF);
3538	auto MBFI = std::make_unique<MBFIWrapper>(
3539	args&: MFAM.getResult<MachineBlockFrequencyAnalysis>(IR&: MF));
3540	auto *MLI = &MFAM.getResult<MachineLoopAnalysis>(IR&: MF);
3541	auto *MPDT = MachineBlockPlacement::allowTailDupPlacement(MF)
3542	? &MFAM.getResult<MachinePostDominatorTreeAnalysis>(IR&: MF)
3543	: nullptr;
3544	auto *PSI = MFAM.getResult<ModuleAnalysisManagerMachineFunctionProxy>(IR&: MF)
3545	.getCachedResult<ProfileSummaryAnalysis>(
3546	IR&: *MF.getFunction().getParent());
3547	if (!PSI)
3548	report_fatal_error(reason: "MachineBlockPlacement requires ProfileSummaryAnalysis",
3549	gen_crash_diag: false);
3550
3551	MachineBlockPlacement MBP(MBPI, MLI, PSI, std::move(MBFI), MPDT,
3552	AllowTailMerge);
3553
3554	if (!MBP.run(F&: MF))
3555	return PreservedAnalyses::all();
3556
3557	return getMachineFunctionPassPreservedAnalyses();
3558	}
3559
3560	void MachineBlockPlacementPass::printPipeline(
3561	raw_ostream &OS,
3562	function_ref<StringRef(StringRef)> MapClassName2PassName) const {
3563	OS << MapClassName2PassName (name());
3564	if (!AllowTailMerge)
3565	OS << "<no-tail-merge>";
3566	}
3567
3568	bool MachineBlockPlacement::run(MachineFunction &MF) {
3569
3570	// Check for single-block functions and skip them.
3571	if (std::next(x: MF.begin()) == MF.end())
3572	return false;
3573
3574	F = &MF;
3575	OptLevel = F->getTarget().getOptLevel();
3576
3577	TII = MF.getSubtarget().getInstrInfo();
3578	TLI = MF.getSubtarget().getTargetLowering();
3579
3580	// Initialize PreferredLoopExit to nullptr here since it may never be set if
3581	// there are no MachineLoops.
3582	PreferredLoopExit = nullptr;
3583
3584	assert(BlockToChain.empty() &&
3585	"BlockToChain map should be empty before starting placement.");
3586	assert(ComputedEdges.empty() &&
3587	"Computed Edge map should be empty before starting placement.");
3588
3589	// Initialize tail duplication thresholds.
3590	initTailDupThreshold();
3591
3592	const bool OptForSize =
3593	llvm::shouldOptimizeForSize(MF: &MF, PSI, BFI: &MBFI ->getMBFI());
3594	// Determine whether to use ext-tsp for perf/size optimization. The method
3595	// is beneficial only for instances with at least 3 basic blocks and it can be
3596	// disabled for huge functions (exceeding a certain size).
3597	bool UseExtTspForPerf = false;
3598	bool UseExtTspForSize = false;
3599	if (`3` <= MF.size() && MF.size() <= ExtTspBlockPlacementMaxBlocks) {
3600	UseExtTspForPerf =
3601	EnableExtTspBlockPlacement &&
3602	(ApplyExtTspWithoutProfile \|\| MF.getFunction().hasProfileData());
3603	UseExtTspForSize = OptForSize && ApplyExtTspForSize;
3604	}
3605
3606	// Apply tail duplication.
3607	if (allowTailDupPlacement(MF&: *F)) {
3608	if (OptForSize)
3609	TailDupSize = `1`;
3610	const bool PreRegAlloc = false;
3611	TailDup.initMF(MF, PreRegAlloc, MBPI, MBFI: MBFI.get(), PSI,
3612	/ LayoutMode / true, TailDupSize);
3613	if (!UseExtTspForSize)
3614	precomputeTriangleChains();
3615	}
3616
3617	// Run the main block placement.
3618	if (!UseExtTspForSize)
3619	buildCFGChains();
3620
3621	// Changing the layout can create new tail merging opportunities.
3622	// TailMerge can create jump into if branches that make CFG irreducible for
3623	// HW that requires structured CFG.
3624	const bool EnableTailMerge = !MF.getTarget().requiresStructuredCFG() &&
3625	AllowTailMerge && BranchFoldPlacement &&
3626	MF.size() > `3`;
3627	// No tail merging opportunities if the block number is less than four.
3628	if (EnableTailMerge) {
3629	const unsigned TailMergeSize = TailDupSize + `1`;
3630	BranchFolder BF(/DefaultEnableTailMerge=/true, /CommonHoist=/false,
3631	MBFI, MBPI, PSI, TailMergeSize);
3632
3633	if (BF.OptimizeFunction(MF, tii: TII, tri: MF.getSubtarget().getRegisterInfo(), mli: MLI,
3634	/AfterPlacement=/true)) {
3635	// Must redo the post-dominator tree if blocks were changed.
3636	if (MPDT)
3637	MPDT->recalculate(Func&: MF);
3638	if (!UseExtTspForSize) {
3639	// Redo the layout if tail merging creates/removes/moves blocks.
3640	BlockToChain.clear();
3641	ComputedEdges.clear();
3642	ChainAllocator.DestroyAll();
3643	buildCFGChains();
3644	}
3645	}
3646	}
3647
3648	// Apply a post-processing optimizing block placement:
3649	// - find a new placement and modify the layout of the blocks in the function;
3650	// - re-create CFG chains so that we can optimizeBranches and alignBlocks.
3651	if (UseExtTspForPerf \|\| UseExtTspForSize) {
3652	assert(
3653	!(UseExtTspForPerf && UseExtTspForSize) &&
3654	"UseExtTspForPerf and UseExtTspForSize can not be set simultaneously");
3655	applyExtTsp(/OptForSize=/UseExtTspForSize);
3656	createCFGChainExtTsp();
3657	}
3658
3659	optimizeBranches();
3660	alignBlocks();
3661
3662	BlockToChain.clear();
3663	ComputedEdges.clear();
3664	ChainAllocator.DestroyAll();
3665
3666	// View the function.
3667	if (ViewBlockLayoutWithBFI != GVDT_None &&
3668	(ViewBlockFreqFuncName.empty() \|\|
3669	F->getFunction().getName() == ViewBlockFreqFuncName)) {
3670	if (RenumberBlocksBeforeView)
3671	MF.RenumberBlocks();
3672	MBFI ->view(Name: "MBP." + MF.getName(), isSimple: false);
3673	}
3674
3675	// We always return true as we have no way to track whether the final order
3676	// differs from the original order.
3677	return true;
3678	}
3679
3680	void MachineBlockPlacement::applyExtTsp(bool OptForSize) {
3681	// Prepare data; blocks are indexed by their index in the current ordering.
3682	DenseMap<const MachineBasicBlock *, uint64_t> BlockIndex;
3683	BlockIndex.reserve(NumEntries: F->size());
3684	std::vector<const MachineBasicBlock *> CurrentBlockOrder;
3685	CurrentBlockOrder.reserve(n: F->size());
3686	size_t NumBlocks = `0`;
3687	for (const MachineBasicBlock &MBB : *F) {
3688	BlockIndex [&MBB] = NumBlocks++;
3689	CurrentBlockOrder.push_back(x: &MBB);
3690	}
3691
3692	SmallVector<uint64_t, `0`> BlockCounts(F->size());
3693	SmallVector<uint64_t, `0`> BlockSizes(F->size());
3694	SmallVector<codelayout::EdgeCount, `0`> JumpCounts;
3695	SmallVector<MachineOperand, `4`> Cond; // For analyzeBranch.
3696	SmallVector<const MachineBasicBlock *, `4`> Succs;
3697	for (MachineBasicBlock &MBB : *F) {
3698	// Getting the block frequency.
3699	BlockFrequency BlockFreq = MBFI ->getBlockFreq(MBB: &MBB);
3700	BlockCounts [BlockIndex [&MBB]] = OptForSize ? `1` : BlockFreq.getFrequency();
3701	// Getting the block size:
3702	// - approximate the size of an instruction by 4 bytes, and
3703	// - ignore debug instructions.
3704	// Note: getting the exact size of each block is target-dependent and can be
3705	// done by extending the interface of MCCodeEmitter. Experimentally we do
3706	// not see a perf improvement with the exact block sizes.
3707	auto NonDbgInsts =
3708	instructionsWithoutDebug(It: MBB.instr_begin(), End: MBB.instr_end());
3709	size_t NumInsts = std::distance(first: NonDbgInsts.begin(), last: NonDbgInsts.end());
3710	BlockSizes [BlockIndex [&MBB]] = `4` * NumInsts;
3711
3712	// Getting jump frequencies.
3713	if (OptForSize) {
3714	Cond.clear();
3715	MachineBasicBlock TBB = nullptr, FBB = nullptr; // For analyzeBranch.
3716	if (TII->analyzeBranch(MBB, TBB, FBB, Cond))
3717	continue;
3718
3719	const MachineBasicBlock *FTB = MBB.getFallThrough();
3720	// Succs is a collection of distinct destinations of the block reachable
3721	// from MBB via a jump instruction; initialize the list using the three
3722	// (non-necessarily distinct) blocks, FTB, TBB, and FBB.
3723	Succs.clear();
3724	if (TBB && TBB != FTB)
3725	Succs.push_back(Elt: TBB);
3726	if (FBB && FBB != FTB)
3727	Succs.push_back(Elt: FBB);
3728	if (FTB)
3729	Succs.push_back(Elt: FTB);
3730	// Absolute magnitude of non-zero counts does not matter for the
3731	// optimization; prioritize slightly jumps with a single successor, since
3732	// the corresponding jump instruction will be removed from the binary.
3733	const uint64_t Freq = Succs.size() == `1` ? `110` : `100`;
3734	for (const MachineBasicBlock *Succ : Succs)
3735	JumpCounts.push_back(Elt: {.src: BlockIndex [&MBB], .dst: BlockIndex [Succ], .count: Freq});
3736	} else {
3737	for (MachineBasicBlock *Succ : MBB.successors()) {
3738	auto EP = MBPI->getEdgeProbability(Src: &MBB, Dst: Succ);
3739	BlockFrequency JumpFreq = BlockFreq * EP;
3740	JumpCounts.push_back(
3741	Elt: {.src: BlockIndex [&MBB], .dst: BlockIndex [Succ], .count: JumpFreq.getFrequency()});
3742	}
3743	}
3744	}
3745
3746	LLVM_DEBUG(dbgs() << "Applying ext-tsp layout for \|V\| = " << F->size()
3747	<< " with profile = " << F->getFunction().hasProfileData()
3748	<< " (" << F->getName() << ")" << "\n");
3749
3750	const double OrgScore = calcExtTspScore(NodeSizes: BlockSizes, EdgeCounts: JumpCounts);
3751	LLVM_DEBUG(dbgs() << format(" original layout score: %0.2f\n", OrgScore));
3752
3753	// Run the layout algorithm.
3754	auto NewOrder = computeExtTspLayout(NodeSizes: BlockSizes, NodeCounts: BlockCounts, EdgeCounts: JumpCounts);
3755	std::vector<const MachineBasicBlock *> NewBlockOrder;
3756	NewBlockOrder.reserve(n: F->size());
3757	for (uint64_t Node : NewOrder) {
3758	NewBlockOrder.push_back(x: CurrentBlockOrder [Node]);
3759	}
3760	const double OptScore = calcExtTspScore(Order: NewOrder, NodeSizes: BlockSizes, EdgeCounts: JumpCounts);
3761	LLVM_DEBUG(dbgs() << format(" optimized layout score: %0.2f\n", OptScore));
3762
3763	// If the optimization is unsuccessful, fall back to the original block order.
3764	if (OptForSize && OrgScore > OptScore)
3765	assignBlockOrder(NewOrder: CurrentBlockOrder);
3766	else
3767	assignBlockOrder(NewOrder: NewBlockOrder);
3768	}
3769
3770	void MachineBlockPlacement::assignBlockOrder(
3771	const std::vector<const MachineBasicBlock *> &NewBlockOrder) {
3772	assert(F->size() == NewBlockOrder.size() && "Incorrect size of block order");
3773	F->RenumberBlocks();
3774	// At this point, we possibly removed blocks from the function, so we can't
3775	// renumber the domtree. At this point, we don't need it anymore, though.
3776	// TODO: move this to the point where the dominator tree is actually
3777	// invalidated (i.e., where blocks are removed without updating the domtree).
3778	MPDT = nullptr;
3779
3780	bool HasChanges = false;
3781	for (size_t I = `0`; I < NewBlockOrder.size(); I++) {
3782	if (NewBlockOrder [I] != F->getBlockNumbered(N: I)) {
3783	HasChanges = true;
3784	break;
3785	}
3786	}
3787	// Stop early if the new block order is identical to the existing one.
3788	if (!HasChanges)
3789	return;
3790
3791	SmallVector<MachineBasicBlock *, `4`> PrevFallThroughs(F->getNumBlockIDs());
3792	for (auto &MBB : *F) {
3793	PrevFallThroughs [MBB.getNumber()] = MBB.getFallThrough();
3794	}
3795
3796	// Sort basic blocks in the function according to the computed order.
3797	DenseMap<const MachineBasicBlock *, size_t> NewIndex;
3798	for (const MachineBasicBlock *MBB : NewBlockOrder) {
3799	NewIndex [MBB] = NewIndex.size();
3800	}
3801	F->sort(comp: [&](MachineBasicBlock &L, MachineBasicBlock &R) {
3802	return NewIndex [&L] < NewIndex [&R];
3803	});
3804
3805	// Update basic block branches by inserting explicit fallthrough branches
3806	// when required and re-optimize branches when possible.
3807	const TargetInstrInfo *TII = F->getSubtarget().getInstrInfo();
3808	SmallVector<MachineOperand, `4`> Cond;
3809	for (auto &MBB : *F) {
3810	MachineFunction::iterator NextMBB = std::next(x: MBB.getIterator());
3811	MachineFunction::iterator EndIt = MBB.getParent()->end();
3812	auto *FTMBB = PrevFallThroughs [MBB.getNumber()];
3813	// If this block had a fallthrough before we need an explicit unconditional
3814	// branch to that block if the fallthrough block is not adjacent to the
3815	// block in the new order.
3816	if (FTMBB && (NextMBB == EndIt \|\| &*NextMBB != FTMBB)) {
3817	TII->insertUnconditionalBranch(MBB, DestBB: FTMBB, DL: MBB.findBranchDebugLoc());
3818	}
3819
3820	// It might be possible to optimize branches by flipping the condition.
3821	Cond.clear();
3822	MachineBasicBlock TBB = nullptr, FBB = nullptr;
3823	if (TII->analyzeBranch(MBB, TBB, FBB, Cond))
3824	continue;
3825	MBB.updateTerminator(PreviousLayoutSuccessor: FTMBB);
3826	}
3827	}
3828
3829	void MachineBlockPlacement::createCFGChainExtTsp() {
3830	BlockToChain.clear();
3831	ComputedEdges.clear();
3832	ChainAllocator.DestroyAll();
3833
3834	MachineBasicBlock *HeadBB = &F->front();
3835	BlockChain *FunctionChain =
3836	new (ChainAllocator.Allocate()) BlockChain (BlockToChain, HeadBB);
3837
3838	for (MachineBasicBlock &MBB : *F) {
3839	if (HeadBB == &MBB)
3840	continue; // Ignore head of the chain
3841	FunctionChain->merge(BB: &MBB, Chain: nullptr);
3842	}
3843	}
3844
3845	namespace {
3846
3847	/// A pass to compute block placement statistics.
3848	///
3849	/// A separate pass to compute interesting statistics for evaluating block
3850	/// placement. This is separate from the actual placement pass so that they can
3851	/// be computed in the absence of any placement transformations or when using
3852	/// alternative placement strategies.
3853	class MachineBlockPlacementStats {
3854	/// A handle to the branch probability pass.
3855	const MachineBranchProbabilityInfo *MBPI;
3856
3857	/// A handle to the function-wide block frequency pass.
3858	const MachineBlockFrequencyInfo *MBFI;
3859
3860	public:
3861	MachineBlockPlacementStats(const MachineBranchProbabilityInfo *MBPI,
3862	const MachineBlockFrequencyInfo *MBFI)
3863	: MBPI(MBPI), MBFI(MBFI) {}
3864	bool run(MachineFunction &MF);
3865	};
3866
3867	class MachineBlockPlacementStatsLegacy : public MachineFunctionPass {
3868	public:
3869	static char ID; // Pass identification, replacement for typeid
3870
3871	MachineBlockPlacementStatsLegacy() : MachineFunctionPass (ID) {
3872	initializeMachineBlockPlacementStatsLegacyPass(
3873	*PassRegistry::getPassRegistry());
3874	}
3875
3876	bool runOnMachineFunction(MachineFunction &F) override {
3877	auto *MBPI =
3878	&getAnalysis<MachineBranchProbabilityInfoWrapperPass>().getMBPI();
3879	auto *MBFI = &getAnalysis<MachineBlockFrequencyInfoWrapperPass>().getMBFI();
3880	return MachineBlockPlacementStats (MBPI, MBFI).run(MF&: F);
3881	}
3882
3883	void getAnalysisUsage(AnalysisUsage &AU) const override {
3884	AU.addRequired<MachineBranchProbabilityInfoWrapperPass>();
3885	AU.addRequired<MachineBlockFrequencyInfoWrapperPass>();
3886	AU.setPreservesAll();
3887	MachineFunctionPass::getAnalysisUsage(AU);
3888	}
3889	};
3890
3891	} // end anonymous namespace
3892
3893	char MachineBlockPlacementStatsLegacy::ID = `0`;
3894
3895	char &llvm::MachineBlockPlacementStatsID = MachineBlockPlacementStatsLegacy::ID;
3896
3897	INITIALIZE_PASS_BEGIN(MachineBlockPlacementStatsLegacy, "block-placement-stats",
3898	"Basic Block Placement Stats", false, false)
3899	INITIALIZE_PASS_DEPENDENCY(MachineBranchProbabilityInfoWrapperPass)
3900	INITIALIZE_PASS_DEPENDENCY(MachineBlockFrequencyInfoWrapperPass)
3901	INITIALIZE_PASS_END(MachineBlockPlacementStatsLegacy, "block-placement-stats",
3902	"Basic Block Placement Stats", false, false)
3903
3904	PreservedAnalyses
3905	MachineBlockPlacementStatsPass::run(MachineFunction &MF,
3906	MachineFunctionAnalysisManager &MFAM) {
3907	auto &MBPI = MFAM.getResult<MachineBranchProbabilityAnalysis>(IR&: MF);
3908	auto &MBFI = MFAM.getResult<MachineBlockFrequencyAnalysis>(IR&: MF);
3909
3910	MachineBlockPlacementStats (&MBPI, &MBFI).run(MF);
3911	return PreservedAnalyses::all();
3912	}
3913
3914	bool MachineBlockPlacementStats::run(MachineFunction &F) {
3915	// Check for single-block functions and skip them.
3916	if (std::next(x: F.begin()) == F.end())
3917	return false;
3918
3919	if (!isFunctionInPrintList(FunctionName: F.getName()))
3920	return false;
3921
3922	for (MachineBasicBlock &MBB : F) {
3923	BlockFrequency BlockFreq = MBFI->getBlockFreq(MBB: &MBB);
3924	Statistic &NumBranches =
3925	(MBB.succ_size() > `1`) ? NumCondBranches : NumUncondBranches;
3926	Statistic &BranchTakenFreq =
3927	(MBB.succ_size() > `1`) ? CondBranchTakenFreq : UncondBranchTakenFreq;
3928	for (MachineBasicBlock *Succ : MBB.successors()) {
3929	// Skip if this successor is a fallthrough.
3930	if (MBB.isLayoutSuccessor(MBB: Succ))
3931	continue;
3932
3933	BlockFrequency EdgeFreq =
3934	BlockFreq * MBPI->getEdgeProbability(Src: &MBB, Dst: Succ);
3935	++NumBranches;
3936	BranchTakenFreq += EdgeFreq.getFrequency();
3937	}
3938	}
3939
3940	return false;
3941	}
3942

Browse the source code of llvm_projects/llvm/lib/CodeGen/MachineBlockPlacement.cpp