MachineBlockPlacement.cpp source code [llvm_projects/llvm/lib/CodeGen/MachineBlockPlacement.cpp]

1	//===- MachineBlockPlacement.cpp - Basic Block Code Layout optimization ---===//
2	//
3	// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4	// See https://llvm.org/LICENSE.txt for license information.
5	// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6	//
7	//===----------------------------------------------------------------------===//
8	//
9	// This file implements basic block placement transformations using the CFG
10	// structure and branch probability estimates.
11	//
12	// The pass strives to preserve the structure of the CFG (that is, retain
13	// a topological ordering of basic blocks) in the absence of a strong* signal*
14	// to the contrary from probabilities. However, within the CFG structure, it
15	// attempts to choose an ordering which favors placing more likely sequences of
16	// blocks adjacent to each other.
17	//
18	// The algorithm works from the inner-most loop within a function outward, and
19	// at each stage walks through the basic blocks, trying to coalesce them into
20	// sequential chains where allowed by the CFG (or demanded by heavy
21	// probabilities). Finally, it walks the blocks in topological order, and the
22	// first time it reaches a chain of basic blocks, it schedules them in the
23	// function in-order.
24	//
25	//===----------------------------------------------------------------------===//
26
27	#include "llvm/CodeGen/MachineBlockPlacement.h"
28	#include "BranchFolding.h"
29	#include "llvm/ADT/ArrayRef.h"
30	#include "llvm/ADT/DenseMap.h"
31	#include "llvm/ADT/STLExtras.h"
32	#include "llvm/ADT/SetVector.h"
33	#include "llvm/ADT/SmallPtrSet.h"
34	#include "llvm/ADT/SmallVector.h"
35	#include "llvm/ADT/Statistic.h"
36	#include "llvm/Analysis/BlockFrequencyInfoImpl.h"
37	#include "llvm/Analysis/ProfileSummaryInfo.h"
38	#include "llvm/CodeGen/MBFIWrapper.h"
39	#include "llvm/CodeGen/MachineBasicBlock.h"
40	#include "llvm/CodeGen/MachineBlockFrequencyInfo.h"
41	#include "llvm/CodeGen/MachineBranchProbabilityInfo.h"
42	#include "llvm/CodeGen/MachineFunction.h"
43	#include "llvm/CodeGen/MachineFunctionPass.h"
44	#include "llvm/CodeGen/MachineLoopInfo.h"
45	#include "llvm/CodeGen/MachinePostDominators.h"
46	#include "llvm/CodeGen/MachineSizeOpts.h"
47	#include "llvm/CodeGen/TailDuplicator.h"
48	#include "llvm/CodeGen/TargetInstrInfo.h"
49	#include "llvm/CodeGen/TargetLowering.h"
50	#include "llvm/CodeGen/TargetPassConfig.h"
51	#include "llvm/CodeGen/TargetSubtargetInfo.h"
52	#include "llvm/IR/DebugLoc.h"
53	#include "llvm/IR/Function.h"
54	#include "llvm/IR/PrintPasses.h"
55	#include "llvm/InitializePasses.h"
56	#include "llvm/Pass.h"
57	#include "llvm/Support/Allocator.h"
58	#include "llvm/Support/BlockFrequency.h"
59	#include "llvm/Support/BranchProbability.h"
60	#include "llvm/Support/CodeGen.h"
61	#include "llvm/Support/CommandLine.h"
62	#include "llvm/Support/Compiler.h"
63	#include "llvm/Support/Debug.h"
64	#include "llvm/Support/raw_ostream.h"
65	#include "llvm/Target/TargetMachine.h"
66	#include "llvm/Transforms/Utils/CodeLayout.h"
67	#include <algorithm>
68	#include <cassert>
69	#include <cstdint>
70	#include <iterator>
71	#include <memory>
72	#include <string>
73	#include <tuple>
74	#include <utility>
75	#include <vector>
76
77	using namespace llvm;
78
79	#define DEBUG_TYPE "block-placement"
80
81	STATISTIC(NumCondBranches, "Number of conditional branches");
82	STATISTIC(NumUncondBranches, "Number of unconditional branches");
83	STATISTIC(CondBranchTakenFreq,
84	"Potential frequency of taking conditional branches");
85	STATISTIC(UncondBranchTakenFreq,
86	"Potential frequency of taking unconditional branches");
87
88	static cl::opt<unsigned> AlignAllBlock(
89	"align-all-blocks",
90	cl::desc ("Force the alignment of all blocks in the function in log2 format "
91	"(e.g 4 means align on 16B boundaries)."),
92	cl::init(Val: `0`), cl::Hidden);
93
94	static cl::opt<unsigned> AlignAllNonFallThruBlocks(
95	"align-all-nofallthru-blocks",
96	cl::desc ("Force the alignment of all blocks that have no fall-through "
97	"predecessors (i.e. don't add nops that are executed). In log2 "
98	"format (e.g 4 means align on 16B boundaries)."),
99	cl::init(Val: `0`), cl::Hidden);
100
101	static cl::opt<unsigned> MaxBytesForAlignmentOverride(
102	"max-bytes-for-alignment",
103	cl::desc ("Forces the maximum bytes allowed to be emitted when padding for "
104	"alignment"),
105	cl::init(Val: `0`), cl::Hidden);
106
107	static cl::opt<unsigned> PredecessorLimit(
108	"block-placement-predecessor-limit",
109	cl::desc ("For blocks with more predecessors, certain layout optimizations"
110	"will be disabled to prevent quadratic compile time."),
111	cl::init(Val: `1000`), cl::Hidden);
112
113	// FIXME: Find a good default for this flag and remove the flag.
114	static cl::opt<unsigned> ExitBlockBias(
115	"block-placement-exit-block-bias",
116	cl::desc ("Block frequency percentage a loop exit block needs "
117	"over the original exit to be considered the new exit."),
118	cl::init(Val: `0`), cl::Hidden);
119
120	// Definition:
121	// - Outlining: placement of a basic block outside the chain or hot path.
122
123	static cl::opt<unsigned> LoopToColdBlockRatio(
124	"loop-to-cold-block-ratio",
125	cl::desc ("Outline loop blocks from loop chain if (frequency of loop) / "
126	"(frequency of block) is greater than this ratio"),
127	cl::init(Val: `5`), cl::Hidden);
128
129	static cl::opt<bool>
130	ForceLoopColdBlock("force-loop-cold-block",
131	cl::desc ("Force outlining cold blocks from loops."),
132	cl::init(Val: false), cl::Hidden);
133
134	static cl::opt<bool>
135	PreciseRotationCost("precise-rotation-cost",
136	cl::desc ("Model the cost of loop rotation more "
137	"precisely by using profile data."),
138	cl::init(Val: false), cl::Hidden);
139
140	static cl::opt<bool>
141	ForcePreciseRotationCost("force-precise-rotation-cost",
142	cl::desc ("Force the use of precise cost "
143	"loop rotation strategy."),
144	cl::init(Val: false), cl::Hidden);
145
146	static cl::opt<unsigned> MisfetchCost(
147	"misfetch-cost",
148	cl::desc ("Cost that models the probabilistic risk of an instruction "
149	"misfetch due to a jump comparing to falling through, whose cost "
150	"is zero."),
151	cl::init(Val: `1`), cl::Hidden);
152
153	static cl::opt<unsigned> JumpInstCost("jump-inst-cost",
154	cl::desc ("Cost of jump instructions."),
155	cl::init(Val: `1`), cl::Hidden);
156	static cl::opt<bool>
157	TailDupPlacement("tail-dup-placement",
158	cl::desc ("Perform tail duplication during placement. "
159	"Creates more fallthrough opportunities in "
160	"outline branches."),
161	cl::init(Val: true), cl::Hidden);
162
163	static cl::opt<bool>
164	BranchFoldPlacement("branch-fold-placement",
165	cl::desc ("Perform branch folding during placement. "
166	"Reduces code size."),
167	cl::init(Val: true), cl::Hidden);
168
169	// Heuristic for tail duplication.
170	static cl::opt<unsigned> TailDupPlacementThreshold(
171	"tail-dup-placement-threshold",
172	cl::desc ("Instruction cutoff for tail duplication during layout. "
173	"Tail merging during layout is forced to have a threshold "
174	"that won't conflict."),
175	cl::init(Val: `2`), cl::Hidden);
176
177	// Heuristic for aggressive tail duplication.
178	static cl::opt<unsigned> TailDupPlacementAggressiveThreshold(
179	"tail-dup-placement-aggressive-threshold",
180	cl::desc ("Instruction cutoff for aggressive tail duplication during "
181	"layout. Used at -O3. Tail merging during layout is forced to "
182	"have a threshold that won't conflict."),
183	cl::init(Val: `4`), cl::Hidden);
184
185	// Heuristic for tail duplication.
186	static cl::opt<unsigned> TailDupPlacementPenalty(
187	"tail-dup-placement-penalty",
188	cl::desc (
189	"Cost penalty for blocks that can avoid breaking CFG by copying. "
190	"Copying can increase fallthrough, but it also increases icache "
191	"pressure. This parameter controls the penalty to account for that. "
192	"Percent as integer."),
193	cl::init(Val: `2`), cl::Hidden);
194
195	// Heuristic for tail duplication if profile count is used in cost model.
196	static cl::opt<unsigned> TailDupProfilePercentThreshold(
197	"tail-dup-profile-percent-threshold",
198	cl::desc ("If profile count information is used in tail duplication cost "
199	"model, the gained fall through number from tail duplication "
200	"should be at least this percent of hot count."),
201	cl::init(Val: `50`), cl::Hidden);
202
203	// Heuristic for triangle chains.
204	static cl::opt<unsigned> TriangleChainCount(
205	"triangle-chain-count",
206	cl::desc ("Number of triangle-shaped-CFG's that need to be in a row for the "
207	"triangle tail duplication heuristic to kick in. 0 to disable."),
208	cl::init(Val: `2`), cl::Hidden);
209
210	// Use case: When block layout is visualized after MBP pass, the basic blocks
211	// are labeled in layout order; meanwhile blocks could be numbered in a
212	// different order. It's hard to map between the graph and pass output.
213	// With this option on, the basic blocks are renumbered in function layout
214	// order. For debugging only.
215	static cl::opt<bool> RenumberBlocksBeforeView(
216	"renumber-blocks-before-view",
217	cl::desc (
218	"If true, basic blocks are re-numbered before MBP layout is printed "
219	"into a dot graph. Only used when a function is being printed."),
220	cl::init(Val: false), cl::Hidden);
221
222	static cl::opt<unsigned> ExtTspBlockPlacementMaxBlocks(
223	"ext-tsp-block-placement-max-blocks",
224	cl::desc ("Maximum number of basic blocks in a function to run ext-TSP "
225	"block placement."),
226	cl::init(UINT_MAX), cl::Hidden);
227
228	// Apply the ext-tsp algorithm minimizing the size of a binary.
229	static cl::opt<bool>
230	ApplyExtTspForSize("apply-ext-tsp-for-size", cl::init(Val: false), cl::Hidden,
231	cl::desc ("Use ext-tsp for size-aware block placement."));
232
233	namespace llvm {
234	extern cl::opt<bool> EnableExtTspBlockPlacement;
235	extern cl::opt<bool> ApplyExtTspWithoutProfile;
236	extern cl::opt<unsigned> StaticLikelyProb;
237	extern cl::opt<unsigned> ProfileLikelyProb;
238
239	// Internal option used to control BFI display only after MBP pass.
240	// Defined in CodeGen/MachineBlockFrequencyInfo.cpp:
241	// -view-block-layout-with-bfi=
242	extern cl::opt<GVDAGType> ViewBlockLayoutWithBFI;
243
244	// Command line option to specify the name of the function for CFG dump
245	// Defined in Analysis/BlockFrequencyInfo.cpp: -view-bfi-func-name=
246	extern cl::opt<std::string> ViewBlockFreqFuncName;
247	} // namespace llvm
248
249	namespace {
250
251	class BlockChain;
252
253	/// Type for our function-wide basic block -> block chain mapping.
254	using BlockToChainMapType = DenseMap<const MachineBasicBlock , BlockChain >;
255
256	/// A chain of blocks which will be laid out contiguously.
257	///
258	/// This is the datastructure representing a chain of consecutive blocks that
259	/// are profitable to layout together in order to maximize fallthrough
260	/// probabilities and code locality. We also can use a block chain to represent
261	/// a sequence of basic blocks which have some external (correctness)
262	/// requirement for sequential layout.
263	///
264	/// Chains can be built around a single basic block and can be merged to grow
265	/// them. They participate in a block-to-chain mapping, which is updated
266	/// automatically as chains are merged together.
267	class BlockChain {
268	/// The sequence of blocks belonging to this chain.
269	///
270	/// This is the sequence of blocks for a particular chain. These will be laid
271	/// out in-order within the function.
272	SmallVector<MachineBasicBlock *, `4`> Blocks;
273
274	/// A handle to the function-wide basic block to block chain mapping.
275	///
276	/// This is retained in each block chain to simplify the computation of child
277	/// block chains for SCC-formation and iteration. We store the edges to child
278	/// basic blocks, and map them back to their associated chains using this
279	/// structure.
280	BlockToChainMapType &BlockToChain;
281
282	public:
283	/// Construct a new BlockChain.
284	///
285	/// This builds a new block chain representing a single basic block in the
286	/// function. It also registers itself as the chain that block participates
287	/// in with the BlockToChain mapping.
288	BlockChain(BlockToChainMapType &BlockToChain, MachineBasicBlock *BB)
289	: Blocks (`1`, BB), BlockToChain(BlockToChain) {
290	assert(BB && "Cannot create a chain with a null basic block");
291	BlockToChain [BB] = this;
292	}
293
294	/// Iterator over blocks within the chain.
295	using iterator = SmallVectorImpl<MachineBasicBlock *>::iterator;
296	using const_iterator = SmallVectorImpl<MachineBasicBlock *>::const_iterator;
297
298	/// Beginning of blocks within the chain.
299	iterator begin() { return Blocks.begin(); }
300	const_iterator begin() const { return Blocks.begin(); }
301
302	/// End of blocks within the chain.
303	iterator end() { return Blocks.end(); }
304	const_iterator end() const { return Blocks.end(); }
305
306	bool remove(MachineBasicBlock *BB) {
307	for (iterator i = begin(); i != end(); ++i) {
308	if (*i == BB) {
309	Blocks.erase(CI: i);
310	return true;
311	}
312	}
313	return false;
314	}
315
316	/// Merge a block chain into this one.
317	///
318	/// This routine merges a block chain into this one. It takes care of forming
319	/// a contiguous sequence of basic blocks, updating the edge list, and
320	/// updating the block -> chain mapping. It does not free or tear down the
321	/// old chain, but the old chain's block list is no longer valid.
322	void merge(MachineBasicBlock BB, BlockChain Chain) {
323	assert(BB && "Can't merge a null block.");
324	assert(!Blocks.empty() && "Can't merge into an empty chain.");
325
326	// Fast path in case we don't have a chain already.
327	if (!Chain) {
328	assert(!BlockToChain[BB] &&
329	"Passed chain is null, but BB has entry in BlockToChain.");
330	Blocks.push_back(Elt: BB);
331	BlockToChain [BB] = this;
332	return;
333	}
334
335	assert(BB == *Chain->begin() && "Passed BB is not head of Chain.");
336	assert(Chain->begin() != Chain->end());
337
338	// Update the incoming blocks to point to this chain, and add them to the
339	// chain structure.
340	for (MachineBasicBlock ChainBB : Chain) {
341	Blocks.push_back(Elt: ChainBB);
342	assert(BlockToChain[ChainBB] == Chain && "Incoming blocks not in chain.");
343	BlockToChain [ChainBB] = this;
344	}
345	}
346
347	#ifndef NDEBUG
348	/// Dump the blocks in this chain.
349	LLVM_DUMP_METHOD void dump() {
350	for (MachineBasicBlock MBB : this)
351	MBB->dump();
352	}
353	#endif // NDEBUG
354
355	/// Count of predecessors of any block within the chain which have not
356	/// yet been scheduled. In general, we will delay scheduling this chain
357	/// until those predecessors are scheduled (or we find a sufficiently good
358	/// reason to override this heuristic.) Note that when forming loop chains,
359	/// blocks outside the loop are ignored and treated as if they were already
360	/// scheduled.
361	///
362	/// Note: This field is reinitialized multiple times - once for each loop,
363	/// and then once for the function as a whole.
364	unsigned UnscheduledPredecessors = `0`;
365	};
366
367	class MachineBlockPlacement {
368	/// A type for a block filter set.
369	using BlockFilterSet = SmallSetVector<const MachineBasicBlock *, `16`>;
370
371	/// Pair struct containing basic block and taildup profitability
372	struct BlockAndTailDupResult {
373	MachineBasicBlock BB = nullptr*;
374	bool ShouldTailDup;
375	};
376
377	/// Triple struct containing edge weight and the edge.
378	struct WeightedEdge {
379	BlockFrequency Weight;
380	MachineBasicBlock Src = nullptr*;
381	MachineBasicBlock Dest = nullptr*;
382	};
383
384	/// work lists of blocks that are ready to be laid out
385	SmallVector<MachineBasicBlock *, `16`> BlockWorkList;
386	SmallVector<MachineBasicBlock *, `16`> EHPadWorkList;
387
388	/// Edges that have already been computed as optimal.
389	DenseMap<const MachineBasicBlock *, BlockAndTailDupResult> ComputedEdges;
390
391	/// Machine Function
392	MachineFunction F = nullptr*;
393
394	/// A handle to the branch probability pass.
395	const MachineBranchProbabilityInfo MBPI = nullptr*;
396
397	/// A handle to the function-wide block frequency pass.
398	std::unique_ptr<MBFIWrapper> MBFI;
399
400	/// A handle to the loop info.
401	MachineLoopInfo MLI = nullptr*;
402
403	/// Preferred loop exit.
404	/// Member variable for convenience. It may be removed by duplication deep
405	/// in the call stack.
406	MachineBasicBlock PreferredLoopExit = nullptr*;
407
408	/// A handle to the target's instruction info.
409	const TargetInstrInfo TII = nullptr*;
410
411	/// A handle to the target's lowering info.
412	const TargetLoweringBase TLI = nullptr*;
413
414	/// A handle to the post dominator tree.
415	MachinePostDominatorTree MPDT = nullptr*;
416
417	ProfileSummaryInfo PSI = nullptr*;
418
419	// Tail merging is also determined based on
420	// whether structured CFG is required.
421	bool AllowTailMerge;
422
423	CodeGenOptLevel OptLevel;
424
425	/// Duplicator used to duplicate tails during placement.
426	///
427	/// Placement decisions can open up new tail duplication opportunities, but
428	/// since tail duplication affects placement decisions of later blocks, it
429	/// must be done inline.
430	TailDuplicator TailDup;
431
432	/// Partial tail duplication threshold.
433	BlockFrequency DupThreshold;
434
435	unsigned TailDupSize;
436
437	/// True: use block profile count to compute tail duplication cost.
438	/// False: use block frequency to compute tail duplication cost.
439	bool UseProfileCount = false;
440
441	/// Allocator and owner of BlockChain structures.
442	///
443	/// We build BlockChains lazily while processing the loop structure of
444	/// a function. To reduce malloc traffic, we allocate them using this
445	/// slab-like allocator, and destroy them after the pass completes. An
446	/// important guarantee is that this allocator produces stable pointers to
447	/// the chains.
448	SpecificBumpPtrAllocator<BlockChain> ChainAllocator;
449
450	/// Function wide BasicBlock to BlockChain mapping.
451	///
452	/// This mapping allows efficiently moving from any given basic block to the
453	/// BlockChain it participates in, if any. We use it to, among other things,
454	/// allow implicitly defining edges between chains as the existing edges
455	/// between basic blocks.
456	DenseMap<const MachineBasicBlock , BlockChain > BlockToChain;
457
458	#ifndef NDEBUG
459	/// The set of basic blocks that have terminators that cannot be fully
460	/// analyzed. These basic blocks cannot be re-ordered safely by
461	/// MachineBlockPlacement, and we must preserve physical layout of these
462	/// blocks and their successors through the pass.
463	SmallPtrSet<MachineBasicBlock *, `4`> BlocksWithUnanalyzableExits;
464	#endif
465
466	/// Get block profile count or frequency according to UseProfileCount.
467	/// The return value is used to model tail duplication cost.
468	BlockFrequency getBlockCountOrFrequency(const MachineBasicBlock *BB) {
469	if (UseProfileCount) {
470	auto Count = MBFI ->getBlockProfileCount(MBB: BB);
471	if (Count)
472	return BlockFrequency (*Count);
473	else
474	return BlockFrequency (`0`);
475	} else
476	return MBFI ->getBlockFreq(MBB: BB);
477	}
478
479	/// Scale the DupThreshold according to basic block size.
480	BlockFrequency scaleThreshold(MachineBasicBlock *BB);
481	void initTailDupThreshold();
482
483	/// Decrease the UnscheduledPredecessors count for all blocks in chain, and
484	/// if the count goes to 0, add them to the appropriate work list.
485	void markChainSuccessors(const BlockChain &Chain,
486	const MachineBasicBlock *LoopHeaderBB,
487	const BlockFilterSet BlockFilter = nullptr*);
488
489	/// Decrease the UnscheduledPredecessors count for a single block, and
490	/// if the count goes to 0, add them to the appropriate work list.
491	void markBlockSuccessors(const BlockChain &Chain, const MachineBasicBlock *BB,
492	const MachineBasicBlock *LoopHeaderBB,
493	const BlockFilterSet BlockFilter = nullptr*);
494
495	BranchProbability
496	collectViableSuccessors(const MachineBasicBlock BB, const* BlockChain &Chain,
497	const BlockFilterSet *BlockFilter,
498	SmallVector<MachineBasicBlock *, `4`> &Successors);
499	bool isBestSuccessor(MachineBasicBlock BB, MachineBasicBlock Pred,
500	BlockFilterSet *BlockFilter);
501	void findDuplicateCandidates(SmallVectorImpl<MachineBasicBlock *> &Candidates,
502	MachineBasicBlock *BB,
503	BlockFilterSet *BlockFilter);
504	bool repeatedlyTailDuplicateBlock(
505	MachineBasicBlock BB, MachineBasicBlock &LPred,
506	const MachineBasicBlock *LoopHeaderBB, BlockChain &Chain,
507	BlockFilterSet *BlockFilter,
508	MachineFunction::iterator &PrevUnplacedBlockIt,
509	BlockFilterSet::iterator &PrevUnplacedBlockInFilterIt);
510	bool
511	maybeTailDuplicateBlock(MachineBasicBlock BB, MachineBasicBlock LPred,
512	BlockChain &Chain, BlockFilterSet *BlockFilter,
513	MachineFunction::iterator &PrevUnplacedBlockIt,
514	BlockFilterSet::iterator &PrevUnplacedBlockInFilterIt,
515	bool &DuplicatedToLPred);
516	bool hasBetterLayoutPredecessor(const MachineBasicBlock *BB,
517	const MachineBasicBlock *Succ,
518	const BlockChain &SuccChain,
519	BranchProbability SuccProb,
520	BranchProbability RealSuccProb,
521	const BlockChain &Chain,
522	const BlockFilterSet *BlockFilter);
523	BlockAndTailDupResult selectBestSuccessor(const MachineBasicBlock *BB,
524	const BlockChain &Chain,
525	const BlockFilterSet *BlockFilter);
526	MachineBasicBlock *
527	selectBestCandidateBlock(const BlockChain &Chain,
528	SmallVectorImpl<MachineBasicBlock *> &WorkList);
529	MachineBasicBlock *
530	getFirstUnplacedBlock(const BlockChain &PlacedChain,
531	MachineFunction::iterator &PrevUnplacedBlockIt);
532	MachineBasicBlock *
533	getFirstUnplacedBlock(const BlockChain &PlacedChain,
534	BlockFilterSet::iterator &PrevUnplacedBlockInFilterIt,
535	const BlockFilterSet *BlockFilter);
536
537	/// Add a basic block to the work list if it is appropriate.
538	///
539	/// If the optional parameter BlockFilter is provided, only MBB
540	/// present in the set will be added to the worklist. If nullptr
541	/// is provided, no filtering occurs.
542	void fillWorkLists(const MachineBasicBlock *MBB,
543	SmallPtrSetImpl<BlockChain *> &UpdatedPreds,
544	const BlockFilterSet *BlockFilter);
545
546	void buildChain(const MachineBasicBlock *BB, BlockChain &Chain,
547	BlockFilterSet BlockFilter = nullptr*);
548	bool canMoveBottomBlockToTop(const MachineBasicBlock *BottomBlock,
549	const MachineBasicBlock *OldTop);
550	bool hasViableTopFallthrough(const MachineBasicBlock *Top,
551	const BlockFilterSet &LoopBlockSet);
552	BlockFrequency TopFallThroughFreq(const MachineBasicBlock *Top,
553	const BlockFilterSet &LoopBlockSet);
554	BlockFrequency FallThroughGains(const MachineBasicBlock *NewTop,
555	const MachineBasicBlock *OldTop,
556	const MachineBasicBlock *ExitBB,
557	const BlockFilterSet &LoopBlockSet);
558	MachineBasicBlock findBestLoopTopHelper(MachineBasicBlock OldTop,
559	const MachineLoop &L,
560	const BlockFilterSet &LoopBlockSet);
561	MachineBasicBlock findBestLoopTop(const* MachineLoop &L,
562	const BlockFilterSet &LoopBlockSet);
563	MachineBasicBlock findBestLoopExit(const* MachineLoop &L,
564	const BlockFilterSet &LoopBlockSet,
565	BlockFrequency &ExitFreq);
566	BlockFilterSet collectLoopBlockSet(const MachineLoop &L);
567	void buildLoopChains(const MachineLoop &L);
568	void rotateLoop(BlockChain &LoopChain, const MachineBasicBlock *ExitingBB,
569	BlockFrequency ExitFreq, const BlockFilterSet &LoopBlockSet);
570	void rotateLoopWithProfile(BlockChain &LoopChain, const MachineLoop &L,
571	const BlockFilterSet &LoopBlockSet);
572	void buildCFGChains();
573	void optimizeBranches();
574	void alignBlocks();
575	/// Returns true if a block should be tail-duplicated to increase fallthrough
576	/// opportunities.
577	bool shouldTailDuplicate(MachineBasicBlock *BB);
578	/// Check the edge frequencies to see if tail duplication will increase
579	/// fallthroughs.
580	bool isProfitableToTailDup(const MachineBasicBlock *BB,
581	const MachineBasicBlock *Succ,
582	BranchProbability QProb, const BlockChain &Chain,
583	const BlockFilterSet *BlockFilter);
584
585	/// Check for a trellis layout.
586	bool isTrellis(const MachineBasicBlock *BB,
587	const SmallVectorImpl<MachineBasicBlock *> &ViableSuccs,
588	const BlockChain &Chain, const BlockFilterSet *BlockFilter);
589
590	/// Get the best successor given a trellis layout.
591	BlockAndTailDupResult getBestTrellisSuccessor(
592	const MachineBasicBlock *BB,
593	const SmallVectorImpl<MachineBasicBlock *> &ViableSuccs,
594	BranchProbability AdjustedSumProb, const BlockChain &Chain,
595	const BlockFilterSet *BlockFilter);
596
597	/// Get the best pair of non-conflicting edges.
598	static std::pair<WeightedEdge, WeightedEdge> getBestNonConflictingEdges(
599	const MachineBasicBlock *BB,
600	MutableArrayRef<SmallVector<WeightedEdge, `8`>> Edges);
601
602	/// Returns true if a block can tail duplicate into all unplaced
603	/// predecessors. Filters based on loop.
604	bool canTailDuplicateUnplacedPreds(const MachineBasicBlock *BB,
605	MachineBasicBlock *Succ,
606	const BlockChain &Chain,
607	const BlockFilterSet *BlockFilter);
608
609	/// Find chains of triangles to tail-duplicate where a global analysis works,
610	/// but a local analysis would not find them.
611	void precomputeTriangleChains();
612
613	/// Apply a post-processing step optimizing block placement.
614	void applyExtTsp(bool OptForSize);
615
616	/// Modify the existing block placement in the function and adjust all jumps.
617	void assignBlockOrder(const std::vector<const MachineBasicBlock *> &NewOrder);
618
619	/// Create a single CFG chain from the current block order.
620	void createCFGChainExtTsp();
621
622	public:
623	MachineBlockPlacement(const MachineBranchProbabilityInfo *MBPI,
624	MachineLoopInfo MLI, ProfileSummaryInfo PSI,
625	std::unique_ptr<MBFIWrapper> MBFI,
626	MachinePostDominatorTree MPDT, bool* AllowTailMerge)
627	: MBPI(MBPI), MBFI (std::move(MBFI)), MLI(MLI), MPDT(MPDT), PSI(PSI),
628	AllowTailMerge(AllowTailMerge) {};
629
630	bool run(MachineFunction &F);
631
632	static bool allowTailDupPlacement(MachineFunction &MF) {
633	return TailDupPlacement && !MF.getTarget().requiresStructuredCFG();
634	}
635	};
636
637	class MachineBlockPlacementLegacy : public MachineFunctionPass {
638	public:
639	static char ID; // Pass identification, replacement for typeid
640
641	MachineBlockPlacementLegacy() : MachineFunctionPass (ID) {}
642
643	bool runOnMachineFunction(MachineFunction &MF) override {
644	if (skipFunction(F: MF.getFunction()))
645	return false;
646
647	auto *MBPI =
648	&getAnalysis<MachineBranchProbabilityInfoWrapperPass>().getMBPI();
649	auto MBFI = std::make_unique<MBFIWrapper>(
650	args&: getAnalysis<MachineBlockFrequencyInfoWrapperPass>().getMBFI());
651	auto *MLI = &getAnalysis<MachineLoopInfoWrapperPass>().getLI();
652	auto *MPDT = MachineBlockPlacement::allowTailDupPlacement(MF)
653	? &getAnalysis<MachinePostDominatorTreeWrapperPass>()
654	.getPostDomTree()
655	: nullptr;
656	auto *PSI = &getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI();
657	auto *PassConfig = &getAnalysis<TargetPassConfig>();
658	bool AllowTailMerge = PassConfig->getEnableTailMerge();
659	return MachineBlockPlacement (MBPI, MLI, PSI, std::move(MBFI), MPDT,
660	AllowTailMerge)
661	.run(F&: MF);
662	}
663
664	void getAnalysisUsage(AnalysisUsage &AU) const override {
665	AU.addRequired<MachineBranchProbabilityInfoWrapperPass>();
666	AU.addRequired<MachineBlockFrequencyInfoWrapperPass>();
667	if (TailDupPlacement)
668	AU.addRequired<MachinePostDominatorTreeWrapperPass>();
669	AU.addRequired<MachineLoopInfoWrapperPass>();
670	AU.addRequired<ProfileSummaryInfoWrapperPass>();
671	AU.addRequired<TargetPassConfig>();
672	MachineFunctionPass::getAnalysisUsage(AU);
673	}
674	};
675
676	} // end anonymous namespace
677
678	char MachineBlockPlacementLegacy::ID = `0`;
679
680	char &llvm::MachineBlockPlacementID = MachineBlockPlacementLegacy::ID;
681
682	INITIALIZE_PASS_BEGIN(MachineBlockPlacementLegacy, DEBUG_TYPE,
683	"Branch Probability Basic Block Placement", false, false)
684	INITIALIZE_PASS_DEPENDENCY(MachineBranchProbabilityInfoWrapperPass)
685	INITIALIZE_PASS_DEPENDENCY(MachineBlockFrequencyInfoWrapperPass)
686	INITIALIZE_PASS_DEPENDENCY(MachinePostDominatorTreeWrapperPass)
687	INITIALIZE_PASS_DEPENDENCY(MachineLoopInfoWrapperPass)
688	INITIALIZE_PASS_DEPENDENCY(ProfileSummaryInfoWrapperPass)
689	INITIALIZE_PASS_END(MachineBlockPlacementLegacy, DEBUG_TYPE,
690	"Branch Probability Basic Block Placement", false, false)
691
692	#ifndef NDEBUG
693	/// Helper to print the name of a MBB.
694	///
695	/// Only used by debug logging.
696	static std::string getBlockName(const MachineBasicBlock *BB) {
697	std::string Result;
698	raw_string_ostream OS(Result);
699	OS << printMBBReference(*BB);
700	OS << " ('" << BB->getName() << "')";
701	return Result;
702	}
703	#endif
704
705	/// Mark a chain's successors as having one fewer preds.
706	///
707	/// When a chain is being merged into the "placed" chain, this routine will
708	/// quickly walk the successors of each block in the chain and mark them as
709	/// having one fewer active predecessor. It also adds any successors of this
710	/// chain which reach the zero-predecessor state to the appropriate worklist.
711	void MachineBlockPlacement::markChainSuccessors(
712	const BlockChain &Chain, const MachineBasicBlock *LoopHeaderBB,
713	const BlockFilterSet *BlockFilter) {
714	// Walk all the blocks in this chain, marking their successors as having
715	// a predecessor placed.
716	for (MachineBasicBlock *MBB : Chain) {
717	markBlockSuccessors(Chain, BB: MBB, LoopHeaderBB, BlockFilter);
718	}
719	}
720
721	/// Mark a single block's successors as having one fewer preds.
722	///
723	/// Under normal circumstances, this is only called by markChainSuccessors,
724	/// but if a block that was to be placed is completely tail-duplicated away,
725	/// and was duplicated into the chain end, we need to redo markBlockSuccessors
726	/// for just that block.
727	void MachineBlockPlacement::markBlockSuccessors(
728	const BlockChain &Chain, const MachineBasicBlock *MBB,
729	const MachineBasicBlock LoopHeaderBB, const* BlockFilterSet *BlockFilter) {
730	// Add any successors for which this is the only un-placed in-loop
731	// predecessor to the worklist as a viable candidate for CFG-neutral
732	// placement. No subsequent placement of this block will violate the CFG
733	// shape, so we get to use heuristics to choose a favorable placement.
734	for (MachineBasicBlock *Succ : MBB->successors()) {
735	if (BlockFilter && !BlockFilter->count(key: Succ))
736	continue;
737	BlockChain &SuccChain = *BlockToChain [Succ];
738	// Disregard edges within a fixed chain, or edges to the loop header.
739	if (&Chain == &SuccChain \|\| Succ == LoopHeaderBB)
740	continue;
741
742	// This is a cross-chain edge that is within the loop, so decrement the
743	// loop predecessor count of the destination chain.
744	if (SuccChain.UnscheduledPredecessors == `0` \|\|
745	--SuccChain.UnscheduledPredecessors > `0`)
746	continue;
747
748	auto NewBB = SuccChain.begin();
749	if (NewBB->isEHPad())
750	EHPadWorkList.push_back(Elt: NewBB);
751	else
752	BlockWorkList.push_back(Elt: NewBB);
753	}
754	}
755
756	/// This helper function collects the set of successors of block
757	/// \p BB that are allowed to be its layout successors, and return
758	/// the total branch probability of edges from \p BB to those
759	/// blocks.
760	BranchProbability MachineBlockPlacement::collectViableSuccessors(
761	const MachineBasicBlock BB, const* BlockChain &Chain,
762	const BlockFilterSet *BlockFilter,
763	SmallVector<MachineBasicBlock *, `4`> &Successors) {
764	// Adjust edge probabilities by excluding edges pointing to blocks that is
765	// either not in BlockFilter or is already in the current chain. Consider the
766	// following CFG:
767	//
768	// --->A
769	// \| / \
770	// \| B C
771	// \| \ / \
772	// ----D E
773	//
774	// Assume A->C is very hot (>90%), and C->D has a 50% probability, then after
775	// A->C is chosen as a fall-through, D won't be selected as a successor of C
776	// due to CFG constraint (the probability of C->D is not greater than
777	// HotProb to break topo-order). If we exclude E that is not in BlockFilter
778	// when calculating the probability of C->D, D will be selected and we
779	// will get A C D B as the layout of this loop.
780	auto AdjustedSumProb = BranchProbability::getOne();
781	for (MachineBasicBlock *Succ : BB->successors()) {
782	bool SkipSucc = false;
783	if (Succ->isEHPad() \|\| (BlockFilter && !BlockFilter->count(key: Succ))) {
784	SkipSucc = true;
785	} else {
786	BlockChain *SuccChain = BlockToChain [Succ];
787	if (SuccChain == &Chain) {
788	SkipSucc = true;
789	} else if (Succ != *SuccChain->begin()) {
790	LLVM_DEBUG(dbgs() << " " << getBlockName(Succ)
791	<< " -> Mid chain!\n");
792	continue;
793	}
794	}
795	if (SkipSucc)
796	AdjustedSumProb -= MBPI->getEdgeProbability(Src: BB, Dst: Succ);
797	else
798	Successors.push_back(Elt: Succ);
799	}
800
801	return AdjustedSumProb;
802	}
803
804	/// The helper function returns the branch probability that is adjusted
805	/// or normalized over the new total \p AdjustedSumProb.
806	static BranchProbability
807	getAdjustedProbability(BranchProbability OrigProb,
808	BranchProbability AdjustedSumProb) {
809	BranchProbability SuccProb;
810	uint32_t SuccProbN = OrigProb.getNumerator();
811	uint32_t SuccProbD = AdjustedSumProb.getNumerator();
812	if (SuccProbN >= SuccProbD)
813	SuccProb = BranchProbability::getOne();
814	else
815	SuccProb = BranchProbability (SuccProbN, SuccProbD);
816
817	return SuccProb;
818	}
819
820	/// Check if \p BB has exactly the successors in \p Successors.
821	static bool
822	hasSameSuccessors(MachineBasicBlock &BB,
823	SmallPtrSetImpl<const MachineBasicBlock *> &Successors) {
824	if (BB.succ_size() != Successors.size())
825	return false;
826	// We don't want to count self-loops
827	if (Successors.count(Ptr: &BB))
828	return false;
829	for (MachineBasicBlock *Succ : BB.successors())
830	if (!Successors.count(Ptr: Succ))
831	return false;
832	return true;
833	}
834
835	/// Check if a block should be tail duplicated to increase fallthrough
836	/// opportunities.
837	/// \p BB Block to check.
838	bool MachineBlockPlacement::shouldTailDuplicate(MachineBasicBlock *BB) {
839	// Blocks with single successors don't create additional fallthrough
840	// opportunities. Don't duplicate them. TODO: When conditional exits are
841	// analyzable, allow them to be duplicated.
842	bool IsSimple = TailDup.isSimpleBB(TailBB: BB);
843
844	if (BB->succ_size() == `1`)
845	return false;
846	return TailDup.shouldTailDuplicate(IsSimple, TailBB&: *BB);
847	}
848
849	/// Compare 2 BlockFrequency's with a small penalty for \p A.
850	/// In order to be conservative, we apply a X% penalty to account for
851	/// increased icache pressure and static heuristics. For small frequencies
852	/// we use only the numerators to improve accuracy. For simplicity, we assume
853	/// the penalty is less than 100%
854	/// TODO(iteratee): Use 64-bit fixed point edge frequencies everywhere.
855	static bool greaterWithBias(BlockFrequency A, BlockFrequency B,
856	BlockFrequency EntryFreq) {
857	BranchProbability ThresholdProb(TailDupPlacementPenalty, `100`);
858	BlockFrequency Gain = A - B;
859	return (Gain / ThresholdProb) >= EntryFreq;
860	}
861
862	/// Check the edge frequencies to see if tail duplication will increase
863	/// fallthroughs. It only makes sense to call this function when
864	/// \p Succ would not be chosen otherwise. Tail duplication of \p Succ is
865	/// always locally profitable if we would have picked \p Succ without
866	/// considering duplication.
867	bool MachineBlockPlacement::isProfitableToTailDup(
868	const MachineBasicBlock BB, const* MachineBasicBlock *Succ,
869	BranchProbability QProb, const BlockChain &Chain,
870	const BlockFilterSet *BlockFilter) {
871	// We need to do a probability calculation to make sure this is profitable.
872	// First: does succ have a successor that post-dominates? This affects the
873	// calculation. The 2 relevant cases are:
874	// BB BB
875	// \| \Qout \| \Qout
876	// P\| C \|P C
877	// = C' = C'
878	// \| /Qin \| /Qin
879	// \| / \| /
880	// Succ Succ
881	// / \ \| \ V
882	// U/ =V \|U \
883	// / \ = D
884	// D E \| /
885	// \| /
886	// \|/
887	// PDom
888	// '=' : Branch taken for that CFG edge
889	// In the second case, Placing Succ while duplicating it into C prevents the
890	// fallthrough of Succ into either D or PDom, because they now have C as an
891	// unplaced predecessor
892
893	// Start by figuring out which case we fall into
894	MachineBasicBlock PDom = nullptr*;
895	SmallVector<MachineBasicBlock *, `4`> SuccSuccs;
896	// Only scan the relevant successors
897	auto AdjustedSuccSumProb =
898	collectViableSuccessors(BB: Succ, Chain, BlockFilter, Successors&: SuccSuccs);
899	BranchProbability PProb = MBPI->getEdgeProbability(Src: BB, Dst: Succ);
900	auto BBFreq = MBFI ->getBlockFreq(MBB: BB);
901	auto SuccFreq = MBFI ->getBlockFreq(MBB: Succ);
902	BlockFrequency P = BBFreq * PProb;
903	BlockFrequency Qout = BBFreq * QProb;
904	BlockFrequency EntryFreq = MBFI ->getEntryFreq();
905	// If there are no more successors, it is profitable to copy, as it strictly
906	// increases fallthrough.
907	if (SuccSuccs.size() == `0`)
908	return greaterWithBias(A: P, B: Qout, EntryFreq);
909
910	auto BestSuccSucc = BranchProbability::getZero();
911	// Find the PDom or the best Succ if no PDom exists.
912	for (MachineBasicBlock *SuccSucc : SuccSuccs) {
913	auto Prob = MBPI->getEdgeProbability(Src: Succ, Dst: SuccSucc);
914	if (Prob > BestSuccSucc)
915	BestSuccSucc = Prob;
916	if (PDom == nullptr)
917	if (MPDT->dominates(A: SuccSucc, B: Succ)) {
918	PDom = SuccSucc;
919	break;
920	}
921	}
922	// For the comparisons, we need to know Succ's best incoming edge that isn't
923	// from BB.
924	auto SuccBestPred = BlockFrequency (`0`);
925	for (MachineBasicBlock *SuccPred : Succ->predecessors()) {
926	if (SuccPred == Succ \|\| SuccPred == BB \|\|
927	BlockToChain [SuccPred] == &Chain \|\|
928	(BlockFilter && !BlockFilter->count(key: SuccPred)))
929	continue;
930	auto Freq =
931	MBFI ->getBlockFreq(MBB: SuccPred) * MBPI->getEdgeProbability(Src: SuccPred, Dst: Succ);
932	if (Freq > SuccBestPred)
933	SuccBestPred = Freq;
934	}
935	// Qin is Succ's best unplaced incoming edge that isn't BB
936	BlockFrequency Qin = SuccBestPred;
937	// If it doesn't have a post-dominating successor, here is the calculation:
938	// BB BB
939	// \| \Qout \| \
940	// P\| C \| =
941	// = C' \| C
942	// \| /Qin \| \|
943	// \| / \| C' (+Succ)
944	// Succ Succ /\|
945	// / \ \| \/ \|
946	// U/ =V \| == \|
947	// / \ \| / \\|
948	// D E D E
949	// '=' : Branch taken for that CFG edge
950	// Cost in the first case is: P + V
951	// For this calculation, we always assume P > Qout. If Qout > P
952	// The result of this function will be ignored at the caller.
953	// Let F = SuccFreq - Qin
954	// Cost in the second case is: Qout + min(Qin, F) U + max(Qin, F) * V*
955
956	if (PDom == nullptr \|\| !Succ->isSuccessor(MBB: PDom)) {
957	BranchProbability UProb = BestSuccSucc;
958	BranchProbability VProb = AdjustedSuccSumProb - UProb;
959	BlockFrequency F = SuccFreq - Qin;
960	BlockFrequency V = SuccFreq * VProb;
961	BlockFrequency QinU = std::min(a: Qin, b: F) * UProb;
962	BlockFrequency BaseCost = P + V;
963	BlockFrequency DupCost = Qout + QinU + std::max(a: Qin, b: F) * VProb;
964	return greaterWithBias(A: BaseCost, B: DupCost, EntryFreq);
965	}
966	BranchProbability UProb = MBPI->getEdgeProbability(Src: Succ, Dst: PDom);
967	BranchProbability VProb = AdjustedSuccSumProb - UProb;
968	BlockFrequency U = SuccFreq * UProb;
969	BlockFrequency V = SuccFreq * VProb;
970	BlockFrequency F = SuccFreq - Qin;
971	// If there is a post-dominating successor, here is the calculation:
972	// BB BB BB BB
973	// \| \Qout \| \ \| \Qout \| \
974	// \|P C \| = \|P C \| =
975	// = C' \|P C = C' \|P C
976	// \| /Qin \| \| \| /Qin \| \|
977	// \| / \| C' (+Succ) \| / \| C' (+Succ)
978	// Succ Succ /\| Succ Succ /\|
979	// \| \ V \| \/ \| \| \ V \| \/ \|
980	// \|U \ \|U /\ =? \|U = \|U /\ \|
981	// = D = = =?\| \| D \| = =\|
982	// \| / \|/ D \| / \|/ D
983	// \| / \| / \| = \| /
984	// \|/ \| / \|/ \| =
985	// Dom Dom Dom Dom
986	// '=' : Branch taken for that CFG edge
987	// The cost for taken branches in the first case is P + U
988	// Let F = SuccFreq - Qin
989	// The cost in the second case (assuming independence), given the layout:
990	// BB, Succ, (C+Succ), D, Dom or the layout:
991	// BB, Succ, D, Dom, (C+Succ)
992	// is Qout + max(F, Qin) U + min(F, Qin)*
993	// compare P + U vs Qout + P U + Qin.*
994	//
995	// The 3rd and 4th cases cover when Dom would be chosen to follow Succ.
996	//
997	// For the 3rd case, the cost is P + 2 V*
998	// For the 4th case, the cost is Qout + min(Qin, F) U + max(Qin, F) * V + V*
999	// We choose 4 over 3 when (P + V) > Qout + min(Qin, F) U + max(Qin, F) * V*
1000	if (UProb > AdjustedSuccSumProb / `2` &&
1001	!hasBetterLayoutPredecessor(BB: Succ, Succ: PDom, SuccChain: *BlockToChain [PDom], SuccProb: UProb, RealSuccProb: UProb,
1002	Chain, BlockFilter))
1003	// Cases 3 & 4
1004	return greaterWithBias(
1005	A: (P + V), B: (Qout + std::max(a: Qin, b: F) * VProb + std::min(a: Qin, b: F) * UProb),
1006	EntryFreq);
1007	// Cases 1 & 2
1008	return greaterWithBias(A: (P + U),
1009	B: (Qout + std::min(a: Qin, b: F) * AdjustedSuccSumProb +
1010	std::max(a: Qin, b: F) * UProb),
1011	EntryFreq);
1012	}
1013
1014	/// Check for a trellis layout. \p BB is the upper part of a trellis if its
1015	/// successors form the lower part of a trellis. A successor set S forms the
1016	/// lower part of a trellis if all of the predecessors of S are either in S or
1017	/// have all of S as successors. We ignore trellises where BB doesn't have 2
1018	/// successors because for fewer than 2, it's trivial, and for 3 or greater they
1019	/// are very uncommon and complex to compute optimally. Allowing edges within S
1020	/// is not strictly a trellis, but the same algorithm works, so we allow it.
1021	bool MachineBlockPlacement::isTrellis(
1022	const MachineBasicBlock *BB,
1023	const SmallVectorImpl<MachineBasicBlock *> &ViableSuccs,
1024	const BlockChain &Chain, const BlockFilterSet *BlockFilter) {
1025	// Technically BB could form a trellis with branching factor higher than 2.
1026	// But that's extremely uncommon.
1027	if (BB->succ_size() != `2` \|\| ViableSuccs.size() != `2`)
1028	return false;
1029
1030	SmallPtrSet<const MachineBasicBlock *, `2`> Successors(llvm::from_range,
1031	BB->successors());
1032	// To avoid reviewing the same predecessors twice.
1033	SmallPtrSet<const MachineBasicBlock *, `8`> SeenPreds;
1034
1035	for (MachineBasicBlock *Succ : ViableSuccs) {
1036	// Compile-time optimization: runtime is quadratic in the number of
1037	// predecessors. For such uncommon cases, exit early.
1038	if (Succ->pred_size() > PredecessorLimit)
1039	return false;
1040
1041	int PredCount = `0`;
1042	for (auto *SuccPred : Succ->predecessors()) {
1043	// Allow triangle successors, but don't count them.
1044	if (Successors.count(Ptr: SuccPred)) {
1045	// Make sure that it is actually a triangle.
1046	for (MachineBasicBlock *CheckSucc : SuccPred->successors())
1047	if (!Successors.count(Ptr: CheckSucc))
1048	return false;
1049	continue;
1050	}
1051	const BlockChain *PredChain = BlockToChain [SuccPred];
1052	if (SuccPred == BB \|\| (BlockFilter && !BlockFilter->count(key: SuccPred)) \|\|
1053	PredChain == &Chain \|\| PredChain == BlockToChain [Succ])
1054	continue;
1055	++PredCount;
1056	// Perform the successor check only once.
1057	if (!SeenPreds.insert(Ptr: SuccPred).second)
1058	continue;
1059	if (!hasSameSuccessors(BB&: *SuccPred, Successors))
1060	return false;
1061	}
1062	// If one of the successors has only BB as a predecessor, it is not a
1063	// trellis.
1064	if (PredCount < `1`)
1065	return false;
1066	}
1067	return true;
1068	}
1069
1070	/// Pick the highest total weight pair of edges that can both be laid out.
1071	/// The edges in \p Edges[0] are assumed to have a different destination than
1072	/// the edges in \p Edges[1]. Simple counting shows that the best pair is either
1073	/// the individual highest weight edges to the 2 different destinations, or in
1074	/// case of a conflict, one of them should be replaced with a 2nd best edge.
1075	std::pair<MachineBlockPlacement::WeightedEdge,
1076	MachineBlockPlacement::WeightedEdge>
1077	MachineBlockPlacement::getBestNonConflictingEdges(
1078	const MachineBasicBlock *BB,
1079	MutableArrayRef<SmallVector<MachineBlockPlacement::WeightedEdge, `8`>>
1080	Edges) {
1081	// Sort the edges, and then for each successor, find the best incoming
1082	// predecessor. If the best incoming predecessors aren't the same,
1083	// then that is clearly the best layout. If there is a conflict, one of the
1084	// successors will have to fallthrough from the second best predecessor. We
1085	// compare which combination is better overall.
1086
1087	// Sort for highest frequency.
1088	auto Cmp = [](WeightedEdge A, WeightedEdge B) { return A.Weight > B.Weight; };
1089
1090	llvm::stable_sort(Range&: Edges [`0`], C: Cmp);
1091	llvm::stable_sort(Range&: Edges [`1`], C: Cmp);
1092	auto BestA = Edges [`0`].begin();
1093	auto BestB = Edges [`1`].begin();
1094	// Arrange for the correct answer to be in BestA and BestB
1095	// If the 2 best edges don't conflict, the answer is already there.
1096	if (BestA->Src == BestB->Src) {
1097	// Compare the total fallthrough of (Best + Second Best) for both pairs
1098	auto SecondBestA = std::next(x: BestA);
1099	auto SecondBestB = std::next(x: BestB);
1100	BlockFrequency BestAScore = BestA->Weight + SecondBestB->Weight;
1101	BlockFrequency BestBScore = BestB->Weight + SecondBestA->Weight;
1102	if (BestAScore < BestBScore)
1103	BestA = SecondBestA;
1104	else
1105	BestB = SecondBestB;
1106	}
1107	// Arrange for the BB edge to be in BestA if it exists.
1108	if (BestB->Src == BB)
1109	std::swap(a&: BestA, b&: BestB);
1110	return std::make_pair(x&: BestA, y&: BestB);
1111	}
1112
1113	/// Get the best successor from \p BB based on \p BB being part of a trellis.
1114	/// We only handle trellises with 2 successors, so the algorithm is
1115	/// straightforward: Find the best pair of edges that don't conflict. We find
1116	/// the best incoming edge for each successor in the trellis. If those conflict,
1117	/// we consider which of them should be replaced with the second best.
1118	/// Upon return the two best edges will be in \p BestEdges. If one of the edges
1119	/// comes from \p BB, it will be in \p BestEdges[0]
1120	MachineBlockPlacement::BlockAndTailDupResult
1121	MachineBlockPlacement::getBestTrellisSuccessor(
1122	const MachineBasicBlock *BB,
1123	const SmallVectorImpl<MachineBasicBlock *> &ViableSuccs,
1124	BranchProbability AdjustedSumProb, const BlockChain &Chain,
1125	const BlockFilterSet *BlockFilter) {
1126
1127	BlockAndTailDupResult Result = {.BB: nullptr, .ShouldTailDup: false};
1128	SmallPtrSet<const MachineBasicBlock *, `4`> Successors(llvm::from_range,
1129	BB->successors());
1130
1131	// We assume size 2 because it's common. For general n, we would have to do
1132	// the Hungarian algorithm, but it's not worth the complexity because more
1133	// than 2 successors is fairly uncommon, and a trellis even more so.
1134	if (Successors.size() != `2` \|\| ViableSuccs.size() != `2`)
1135	return Result;
1136
1137	// Collect the edge frequencies of all edges that form the trellis.
1138	SmallVector<WeightedEdge, `8`> Edges[`2`];
1139	int SuccIndex = `0`;
1140	for (auto *Succ : ViableSuccs) {
1141	for (MachineBasicBlock *SuccPred : Succ->predecessors()) {
1142	// Skip any placed predecessors that are not BB
1143	if (SuccPred != BB) {
1144	if (BlockFilter && !BlockFilter->count(key: SuccPred))
1145	continue;
1146	const BlockChain *SuccPredChain = BlockToChain [SuccPred];
1147	if (SuccPredChain == &Chain \|\| SuccPredChain == BlockToChain [Succ])
1148	continue;
1149	}
1150	BlockFrequency EdgeFreq = MBFI ->getBlockFreq(MBB: SuccPred) *
1151	MBPI->getEdgeProbability(Src: SuccPred, Dst: Succ);
1152	Edges[SuccIndex].push_back(Elt: {.Weight: EdgeFreq, .Src: SuccPred, .Dest: Succ});
1153	}
1154	++SuccIndex;
1155	}
1156
1157	// Pick the best combination of 2 edges from all the edges in the trellis.
1158	WeightedEdge BestA, BestB;
1159	std::tie(args&: BestA, args&: BestB) = getBestNonConflictingEdges(BB, Edges);
1160
1161	if (BestA.Src != BB) {
1162	// If we have a trellis, and BB doesn't have the best fallthrough edges,
1163	// we shouldn't choose any successor. We've already looked and there's a
1164	// better fallthrough edge for all the successors.
1165	LLVM_DEBUG(dbgs() << "Trellis, but not one of the chosen edges.\n");
1166	return Result;
1167	}
1168
1169	// Did we pick the triangle edge? If tail-duplication is profitable, do
1170	// that instead. Otherwise merge the triangle edge now while we know it is
1171	// optimal.
1172	if (BestA.Dest == BestB.Src) {
1173	// The edges are BB->Succ1->Succ2, and we're looking to see if BB->Succ2
1174	// would be better.
1175	MachineBasicBlock *Succ1 = BestA.Dest;
1176	MachineBasicBlock *Succ2 = BestB.Dest;
1177	// Check to see if tail-duplication would be profitable.
1178	if (allowTailDupPlacement(MF&: *F) && shouldTailDuplicate(BB: Succ2) &&
1179	canTailDuplicateUnplacedPreds(BB, Succ: Succ2, Chain, BlockFilter) &&
1180	isProfitableToTailDup(BB, Succ: Succ2, QProb: MBPI->getEdgeProbability(Src: BB, Dst: Succ1),
1181	Chain, BlockFilter)) {
1182	LLVM_DEBUG(BranchProbability Succ2Prob = getAdjustedProbability(
1183	MBPI->getEdgeProbability(BB, Succ2), AdjustedSumProb);
1184	dbgs() << " Selected: " << getBlockName(Succ2)
1185	<< ", probability: " << Succ2Prob
1186	<< " (Tail Duplicate)\n");
1187	Result.BB = Succ2;
1188	Result.ShouldTailDup = true;
1189	return Result;
1190	}
1191	}
1192	// We have already computed the optimal edge for the other side of the
1193	// trellis.
1194	ComputedEdges [BestB.Src] = {.BB: BestB.Dest, .ShouldTailDup: false};
1195
1196	auto TrellisSucc = BestA.Dest;
1197	LLVM_DEBUG(BranchProbability SuccProb = getAdjustedProbability(
1198	MBPI->getEdgeProbability(BB, TrellisSucc), AdjustedSumProb);
1199	dbgs() << " Selected: " << getBlockName(TrellisSucc)
1200	<< ", probability: " << SuccProb << " (Trellis)\n");
1201	Result.BB = TrellisSucc;
1202	return Result;
1203	}
1204
1205	/// When the option allowTailDupPlacement() is on, this method checks if the
1206	/// fallthrough candidate block \p Succ (of block \p BB) can be tail-duplicated
1207	/// into all of its unplaced, unfiltered predecessors, that are not BB.
1208	bool MachineBlockPlacement::canTailDuplicateUnplacedPreds(
1209	const MachineBasicBlock BB, MachineBasicBlock Succ,
1210	const BlockChain &Chain, const BlockFilterSet *BlockFilter) {
1211	if (!shouldTailDuplicate(BB: Succ))
1212	return false;
1213
1214	// The result of canTailDuplicate.
1215	bool Duplicate = true;
1216	// Number of possible duplication.
1217	unsigned int NumDup = `0`;
1218
1219	// For CFG checking.
1220	SmallPtrSet<const MachineBasicBlock *, `4`> Successors(llvm::from_range,
1221	BB->successors());
1222	for (MachineBasicBlock *Pred : Succ->predecessors()) {
1223	// Make sure all unplaced and unfiltered predecessors can be
1224	// tail-duplicated into.
1225	// Skip any blocks that are already placed or not in this loop.
1226	if (Pred == BB \|\| (BlockFilter && !BlockFilter->count(key: Pred)) \|\|
1227	(BlockToChain [Pred] == &Chain && !Succ->succ_empty()))
1228	continue;
1229	if (!TailDup.canTailDuplicate(TailBB: Succ, PredBB: Pred)) {
1230	if (Successors.size() > `1` && hasSameSuccessors(BB&: *Pred, Successors))
1231	// This will result in a trellis after tail duplication, so we don't
1232	// need to copy Succ into this predecessor. In the presence
1233	// of a trellis tail duplication can continue to be profitable.
1234	// For example:
1235	// A A
1236	// \|\ \|\
1237	// \| \ \| \
1238	// \| C \| C+BB
1239	// \| / \| \|
1240	// \|/ \| \|
1241	// BB => BB \|
1242	// \|\ \|\/\|
1243	// \| \ \|/\\|
1244	// \| D \| D
1245	// \| / \| /
1246	// \|/ \|/
1247	// Succ Succ
1248	//
1249	// After BB was duplicated into C, the layout looks like the one on the
1250	// right. BB and C now have the same successors. When considering
1251	// whether Succ can be duplicated into all its unplaced predecessors, we
1252	// ignore C.
1253	// We can do this because C already has a profitable fallthrough, namely
1254	// D. TODO(iteratee): ignore sufficiently cold predecessors for
1255	// duplication and for this test.
1256	//
1257	// This allows trellises to be laid out in 2 separate chains
1258	// (A,B,Succ,...) and later (C,D,...) This is a reasonable heuristic
1259	// because it allows the creation of 2 fallthrough paths with links
1260	// between them, and we correctly identify the best layout for these
1261	// CFGs. We want to extend trellises that the user created in addition
1262	// to trellises created by tail-duplication, so we just look for the
1263	// CFG.
1264	continue;
1265	Duplicate = false;
1266	continue;
1267	}
1268	NumDup++;
1269	}
1270
1271	// No possible duplication in current filter set.
1272	if (NumDup == `0`)
1273	return false;
1274
1275	// If profile information is available, findDuplicateCandidates can do more
1276	// precise benefit analysis.
1277	if (F->getFunction().hasProfileData())
1278	return true;
1279
1280	// This is mainly for function exit BB.
1281	// The integrated tail duplication is really designed for increasing
1282	// fallthrough from predecessors from Succ to its successors. We may need
1283	// other machanism to handle different cases.
1284	if (Succ->succ_empty())
1285	return true;
1286
1287	// Plus the already placed predecessor.
1288	NumDup++;
1289
1290	// If the duplication candidate has more unplaced predecessors than
1291	// successors, the extra duplication can't bring more fallthrough.
1292	//
1293	// Pred1 Pred2 Pred3
1294	// \ \| /
1295	// \ \| /
1296	// \ \| /
1297	// Dup
1298	// / \
1299	// / \
1300	// Succ1 Succ2
1301	//
1302	// In this example Dup has 2 successors and 3 predecessors, duplication of Dup
1303	// can increase the fallthrough from Pred1 to Succ1 and from Pred2 to Succ2,
1304	// but the duplication into Pred3 can't increase fallthrough.
1305	//
1306	// A small number of extra duplication may not hurt too much. We need a better
1307	// heuristic to handle it.
1308	if ((NumDup > Succ->succ_size()) \|\| !Duplicate)
1309	return false;
1310
1311	return true;
1312	}
1313
1314	/// Find chains of triangles where we believe it would be profitable to
1315	/// tail-duplicate them all, but a local analysis would not find them.
1316	/// There are 3 ways this can be profitable:
1317	/// 1) The post-dominators marked 50% are actually taken 55% (This shrinks with
1318	/// longer chains)
1319	/// 2) The chains are statically correlated. Branch probabilities have a very
1320	/// U-shaped distribution.
1321	/// [http://nrs.harvard.edu/urn-3:HUL.InstRepos:24015805]
1322	/// If the branches in a chain are likely to be from the same side of the
1323	/// distribution as their predecessor, but are independent at runtime, this
1324	/// transformation is profitable. (Because the cost of being wrong is a small
1325	/// fixed cost, unlike the standard triangle layout where the cost of being
1326	/// wrong scales with the # of triangles.)
1327	/// 3) The chains are dynamically correlated. If the probability that a previous
1328	/// branch was taken positively influences whether the next branch will be
1329	/// taken
1330	/// We believe that 2 and 3 are common enough to justify the small margin in 1.
1331	void MachineBlockPlacement::precomputeTriangleChains() {
1332	struct TriangleChain {
1333	std::vector<MachineBasicBlock *> Edges;
1334
1335	TriangleChain(MachineBasicBlock src, MachineBasicBlock dst)
1336	: Edges ({src, dst}) {}
1337
1338	void append(MachineBasicBlock *dst) {
1339	assert(getKey()->isSuccessor(dst) &&
1340	"Attempting to append a block that is not a successor.");
1341	Edges.push_back(x: dst);
1342	}
1343
1344	unsigned count() const { return Edges.size() - `1`; }
1345
1346	MachineBasicBlock getKey() const* { return Edges.back(); }
1347	};
1348
1349	if (TriangleChainCount == `0`)
1350	return;
1351
1352	LLVM_DEBUG(dbgs() << "Pre-computing triangle chains.\n");
1353	// Map from last block to the chain that contains it. This allows us to extend
1354	// chains as we find new triangles.
1355	DenseMap<const MachineBasicBlock *, TriangleChain> TriangleChainMap;
1356	for (MachineBasicBlock &BB : *F) {
1357	// If BB doesn't have 2 successors, it doesn't start a triangle.
1358	if (BB.succ_size() != `2`)
1359	continue;
1360	MachineBasicBlock PDom = nullptr*;
1361	for (MachineBasicBlock *Succ : BB.successors()) {
1362	if (!MPDT->dominates(A: Succ, B: &BB))
1363	continue;
1364	PDom = Succ;
1365	break;
1366	}
1367	// If BB doesn't have a post-dominating successor, it doesn't form a
1368	// triangle.
1369	if (PDom == nullptr)
1370	continue;
1371	// If PDom has a hint that it is low probability, skip this triangle.
1372	if (MBPI->getEdgeProbability(Src: &BB, Dst: PDom) < BranchProbability (`50`, `100`))
1373	continue;
1374	// If PDom isn't eligible for duplication, this isn't the kind of triangle
1375	// we're looking for.
1376	if (!shouldTailDuplicate(BB: PDom))
1377	continue;
1378	bool CanTailDuplicate = true;
1379	// If PDom can't tail-duplicate into it's non-BB predecessors, then this
1380	// isn't the kind of triangle we're looking for.
1381	for (MachineBasicBlock *Pred : PDom->predecessors()) {
1382	if (Pred == &BB)
1383	continue;
1384	if (!TailDup.canTailDuplicate(TailBB: PDom, PredBB: Pred)) {
1385	CanTailDuplicate = false;
1386	break;
1387	}
1388	}
1389	// If we can't tail-duplicate PDom to its predecessors, then skip this
1390	// triangle.
1391	if (!CanTailDuplicate)
1392	continue;
1393
1394	// Now we have an interesting triangle. Insert it if it's not part of an
1395	// existing chain.
1396	// Note: This cannot be replaced with a call insert() or emplace() because
1397	// the find key is BB, but the insert/emplace key is PDom.
1398	auto Found = TriangleChainMap.find(Val: &BB);
1399	// If it is, remove the chain from the map, grow it, and put it back in the
1400	// map with the end as the new key.
1401	if (Found != TriangleChainMap.end()) {
1402	TriangleChain Chain = std::move(Found ->second);
1403	TriangleChainMap.erase(I: Found);
1404	Chain.append(dst: PDom);
1405	TriangleChainMap.insert(KV: std::make_pair(x: Chain.getKey(), y: std::move(Chain)));
1406	} else {
1407	auto InsertResult = TriangleChainMap.try_emplace(Key: PDom, Args: &BB, Args&: PDom);
1408	assert(InsertResult.second && "Block seen twice.");
1409	(void)InsertResult;
1410	}
1411	}
1412
1413	// Iterating over a DenseMap is safe here, because the only thing in the body
1414	// of the loop is inserting into another DenseMap (ComputedEdges).
1415	// ComputedEdges is never iterated, so this doesn't lead to non-determinism.
1416	for (auto &ChainPair : TriangleChainMap) {
1417	TriangleChain &Chain = ChainPair.second;
1418	// Benchmarking has shown that due to branch correlation duplicating 2 or
1419	// more triangles is profitable, despite the calculations assuming
1420	// independence.
1421	if (Chain.count() < TriangleChainCount)
1422	continue;
1423	MachineBasicBlock *dst = Chain.Edges.back();
1424	Chain.Edges.pop_back();
1425	for (MachineBasicBlock *src : reverse(C&: Chain.Edges)) {
1426	LLVM_DEBUG(dbgs() << "Marking edge: " << getBlockName(src) << "->"
1427	<< getBlockName(dst)
1428	<< " as pre-computed based on triangles.\n");
1429
1430	auto InsertResult = ComputedEdges.insert(KV: {src, {.BB: dst, .ShouldTailDup: true}});
1431	assert(InsertResult.second && "Block seen twice.");
1432	(void)InsertResult;
1433
1434	dst = src;
1435	}
1436	}
1437	}
1438
1439	// When profile is not present, return the StaticLikelyProb.
1440	// When profile is available, we need to handle the triangle-shape CFG.
1441	static BranchProbability
1442	getLayoutSuccessorProbThreshold(const MachineBasicBlock *BB) {
1443	if (!BB->getParent()->getFunction().hasProfileData())
1444	return BranchProbability (StaticLikelyProb, `100`);
1445	if (BB->succ_size() == `2`) {
1446	const MachineBasicBlock Succ1 = BB->succ_begin();
1447	const MachineBasicBlock Succ2 = (BB->succ_begin() + `1`);
1448	if (Succ1->isSuccessor(MBB: Succ2) \|\| Succ2->isSuccessor(MBB: Succ1)) {
1449	/ See case 1 below for the cost analysis. For BB->Succ to*
1450	* be taken with smaller cost, the following needs to hold:
1451	* Prob(BB->Succ) > 2 * Prob(BB->Pred)
1452	* So the threshold T in the calculation below
1453	* (1-T) * Prob(BB->Succ) > T * Prob(BB->Pred)
1454	* So T / (1 - T) = 2, Yielding T = 2/3
1455	* Also adding user specified branch bias, we have
1456	* T = (2/3)*(ProfileLikelyProb/50)
1457	* = (2*ProfileLikelyProb)/150)
1458	*/
1459	return BranchProbability (`2` * ProfileLikelyProb, `150`);
1460	}
1461	}
1462	return BranchProbability (ProfileLikelyProb, `100`);
1463	}
1464
1465	/// Checks to see if the layout candidate block \p Succ has a better layout
1466	/// predecessor than \c BB. If yes, returns true.
1467	/// \p SuccProb: The probability adjusted for only remaining blocks.
1468	/// Only used for logging
1469	/// \p RealSuccProb: The un-adjusted probability.
1470	/// \p Chain: The chain that BB belongs to and Succ is being considered for.
1471	/// \p BlockFilter: if non-null, the set of blocks that make up the loop being
1472	/// considered
1473	bool MachineBlockPlacement::hasBetterLayoutPredecessor(
1474	const MachineBasicBlock BB, const* MachineBasicBlock *Succ,
1475	const BlockChain &SuccChain, BranchProbability SuccProb,
1476	BranchProbability RealSuccProb, const BlockChain &Chain,
1477	const BlockFilterSet *BlockFilter) {
1478
1479	// There isn't a better layout when there are no unscheduled predecessors.
1480	if (SuccChain.UnscheduledPredecessors == `0`)
1481	return false;
1482
1483	// Compile-time optimization: runtime is quadratic in the number of
1484	// predecessors. For such uncommon cases, exit early.
1485	if (Succ->pred_size() > PredecessorLimit)
1486	return false;
1487
1488	// There are two basic scenarios here:
1489	// -------------------------------------
1490	// Case 1: triangular shape CFG (if-then):
1491	// BB
1492	// \| \
1493	// \| \
1494	// \| Pred
1495	// \| /
1496	// Succ
1497	// In this case, we are evaluating whether to select edge -> Succ, e.g.
1498	// set Succ as the layout successor of BB. Picking Succ as BB's
1499	// successor breaks the CFG constraints (FIXME: define these constraints).
1500	// With this layout, Pred BB
1501	// is forced to be outlined, so the overall cost will be cost of the
1502	// branch taken from BB to Pred, plus the cost of back taken branch
1503	// from Pred to Succ, as well as the additional cost associated
1504	// with the needed unconditional jump instruction from Pred To Succ.
1505
1506	// The cost of the topological order layout is the taken branch cost
1507	// from BB to Succ, so to make BB->Succ a viable candidate, the following
1508	// must hold:
1509	// 2 freq(BB->Pred) * taken_branch_cost + unconditional_jump_cost*
1510	// < freq(BB->Succ) taken_branch_cost.*
1511	// Ignoring unconditional jump cost, we get
1512	// freq(BB->Succ) > 2 freq(BB->Pred), i.e.,*
1513	// prob(BB->Succ) > 2 prob(BB->Pred)*
1514	//
1515	// When real profile data is available, we can precisely compute the
1516	// probability threshold that is needed for edge BB->Succ to be considered.
1517	// Without profile data, the heuristic requires the branch bias to be
1518	// a lot larger to make sure the signal is very strong (e.g. 80% default).
1519	// -----------------------------------------------------------------
1520	// Case 2: diamond like CFG (if-then-else):
1521	// S
1522	// / \
1523	// \| \
1524	// BB Pred
1525	// \ /
1526	// Succ
1527	// ..
1528	//
1529	// The current block is BB and edge BB->Succ is now being evaluated.
1530	// Note that edge S->BB was previously already selected because
1531	// prob(S->BB) > prob(S->Pred).
1532	// At this point, 2 blocks can be placed after BB: Pred or Succ. If we
1533	// choose Pred, we will have a topological ordering as shown on the left
1534	// in the picture below. If we choose Succ, we have the solution as shown
1535	// on the right:
1536	//
1537	// topo-order:
1538	//
1539	// S----- ---S
1540	// \| \| \| \|
1541	// ---BB \| \| BB
1542	// \| \| \| \|
1543	// \| Pred-- \| Succ--
1544	// \| \| \| \|
1545	// ---Succ ---Pred--
1546	//
1547	// cost = freq(S->Pred) + freq(BB->Succ) cost = 2 freq (S->Pred)*
1548	// = freq(S->Pred) + freq(S->BB)
1549	//
1550	// If we have profile data (i.e, branch probabilities can be trusted), the
1551	// cost (number of taken branches) with layout S->BB->Succ->Pred is 2 *
1552	// freq(S->Pred) while the cost of topo order is freq(S->Pred) + freq(S->BB).
1553	// We know Prob(S->BB) > Prob(S->Pred), so freq(S->BB) > freq(S->Pred), which
1554	// means the cost of topological order is greater.
1555	// When profile data is not available, however, we need to be more
1556	// conservative. If the branch prediction is wrong, breaking the topo-order
1557	// will actually yield a layout with large cost. For this reason, we need
1558	// strong biased branch at block S with Prob(S->BB) in order to select
1559	// BB->Succ. This is equivalent to looking the CFG backward with backward
1560	// edge: Prob(Succ->BB) needs to >= HotProb in order to be selected (without
1561	// profile data).
1562	// --------------------------------------------------------------------------
1563	// Case 3: forked diamond
1564	// S
1565	// / \
1566	// / \
1567	// BB Pred
1568	// \| \ / \|
1569	// \| \ / \|
1570	// \| X \|
1571	// \| / \ \|
1572	// \| / \ \|
1573	// S1 S2
1574	//
1575	// The current block is BB and edge BB->S1 is now being evaluated.
1576	// As above S->BB was already selected because
1577	// prob(S->BB) > prob(S->Pred). Assume that prob(BB->S1) >= prob(BB->S2).
1578	//
1579	// topo-order:
1580	//
1581	// S-------\| ---S
1582	// \| \| \| \|
1583	// ---BB \| \| BB
1584	// \| \| \| \|
1585	// \| Pred----\| \| S1----
1586	// \| \| \| \|
1587	// --(S1 or S2) ---Pred--
1588	// \|
1589	// S2
1590	//
1591	// topo-cost = freq(S->Pred) + freq(BB->S1) + freq(BB->S2)
1592	// + min(freq(Pred->S1), freq(Pred->S2))
1593	// Non-topo-order cost:
1594	// non-topo-cost = 2 freq(S->Pred) + freq(BB->S2).*
1595	// To be conservative, we can assume that min(freq(Pred->S1), freq(Pred->S2))
1596	// is 0. Then the non topo layout is better when
1597	// freq(S->Pred) < freq(BB->S1).
1598	// This is exactly what is checked below.
1599	// Note there are other shapes that apply (Pred may not be a single block,
1600	// but they all fit this general pattern.)
1601	BranchProbability HotProb = getLayoutSuccessorProbThreshold(BB);
1602
1603	// Make sure that a hot successor doesn't have a globally more
1604	// important predecessor.
1605	BlockFrequency CandidateEdgeFreq = MBFI ->getBlockFreq(MBB: BB) * RealSuccProb;
1606	bool BadCFGConflict = false;
1607
1608	for (MachineBasicBlock *Pred : Succ->predecessors()) {
1609	BlockChain *PredChain = BlockToChain [Pred];
1610	if (Pred == Succ \|\| PredChain == &SuccChain \|\|
1611	(BlockFilter && !BlockFilter->count(key: Pred)) \|\| PredChain == &Chain \|\|
1612	Pred != *std::prev(x: PredChain->end()) \|\|
1613	// This check is redundant except for look ahead. This function is
1614	// called for lookahead by isProfitableToTailDup when BB hasn't been
1615	// placed yet.
1616	(Pred == BB))
1617	continue;
1618	// Do backward checking.
1619	// For all cases above, we need a backward checking to filter out edges that
1620	// are not 'strongly' biased.
1621	// BB Pred
1622	// \ /
1623	// Succ
1624	// We select edge BB->Succ if
1625	// freq(BB->Succ) > freq(Succ) HotProb*
1626	// i.e. freq(BB->Succ) > freq(BB->Succ) * HotProb + freq(Pred->Succ) *
1627	// HotProb
1628	// i.e. freq((BB->Succ) (1 - HotProb) > freq(Pred->Succ) * HotProb*
1629	// Case 1 is covered too, because the first equation reduces to:
1630	// prob(BB->Succ) > HotProb. (freq(Succ) = freq(BB) for a triangle)
1631	BlockFrequency PredEdgeFreq =
1632	MBFI ->getBlockFreq(MBB: Pred) * MBPI->getEdgeProbability(Src: Pred, Dst: Succ);
1633	if (PredEdgeFreq * HotProb >= CandidateEdgeFreq * HotProb.getCompl()) {
1634	BadCFGConflict = true;
1635	break;
1636	}
1637	}
1638
1639	if (BadCFGConflict) {
1640	LLVM_DEBUG(dbgs() << " Not a candidate: " << getBlockName(Succ) << " -> "
1641	<< SuccProb << " (prob) (non-cold CFG conflict)\n");
1642	return true;
1643	}
1644
1645	return false;
1646	}
1647
1648	/// Select the best successor for a block.
1649	///
1650	/// This looks across all successors of a particular block and attempts to
1651	/// select the "best" one to be the layout successor. It only considers direct
1652	/// successors which also pass the block filter. It will attempt to avoid
1653	/// breaking CFG structure, but cave and break such structures in the case of
1654	/// very hot successor edges.
1655	///
1656	/// \returns The best successor block found, or null if none are viable, along
1657	/// with a boolean indicating if tail duplication is necessary.
1658	MachineBlockPlacement::BlockAndTailDupResult
1659	MachineBlockPlacement::selectBestSuccessor(const MachineBasicBlock *BB,
1660	const BlockChain &Chain,
1661	const BlockFilterSet *BlockFilter) {
1662	const BranchProbability HotProb(StaticLikelyProb, `100`);
1663
1664	BlockAndTailDupResult BestSucc = {.BB: nullptr, .ShouldTailDup: false};
1665	auto BestProb = BranchProbability::getZero();
1666
1667	SmallVector<MachineBasicBlock *, `4`> Successors;
1668	auto AdjustedSumProb =
1669	collectViableSuccessors(BB, Chain, BlockFilter, Successors);
1670
1671	LLVM_DEBUG(dbgs() << "Selecting best successor for: " << getBlockName(BB)
1672	<< "\n");
1673
1674	// if we already precomputed the best successor for BB, return that if still
1675	// applicable.
1676	auto FoundEdge = ComputedEdges.find(Val: BB);
1677	if (FoundEdge != ComputedEdges.end()) {
1678	MachineBasicBlock *Succ = FoundEdge ->second.BB;
1679	ComputedEdges.erase(I: FoundEdge);
1680	BlockChain *SuccChain = BlockToChain [Succ];
1681	if (BB->isSuccessor(MBB: Succ) && (!BlockFilter \|\| BlockFilter->count(key: Succ)) &&
1682	SuccChain != &Chain && Succ == *SuccChain->begin())
1683	return FoundEdge ->second;
1684	}
1685
1686	// if BB is part of a trellis, Use the trellis to determine the optimal
1687	// fallthrough edges
1688	if (isTrellis(BB, ViableSuccs: Successors, Chain, BlockFilter))
1689	return getBestTrellisSuccessor(BB, ViableSuccs: Successors, AdjustedSumProb, Chain,
1690	BlockFilter);
1691
1692	// For blocks with CFG violations, we may be able to lay them out anyway with
1693	// tail-duplication. We keep this vector so we can perform the probability
1694	// calculations the minimum number of times.
1695	SmallVector<std::pair<BranchProbability, MachineBasicBlock *>, `4`>
1696	DupCandidates;
1697	for (MachineBasicBlock *Succ : Successors) {
1698	auto RealSuccProb = MBPI->getEdgeProbability(Src: BB, Dst: Succ);
1699	BranchProbability SuccProb =
1700	getAdjustedProbability(OrigProb: RealSuccProb, AdjustedSumProb);
1701
1702	BlockChain &SuccChain = *BlockToChain [Succ];
1703	// Skip the edge \c BB->Succ if block \c Succ has a better layout
1704	// predecessor that yields lower global cost.
1705	if (hasBetterLayoutPredecessor(BB, Succ, SuccChain, SuccProb, RealSuccProb,
1706	Chain, BlockFilter)) {
1707	// If tail duplication would make Succ profitable, place it.
1708	if (allowTailDupPlacement(MF&: *F) && shouldTailDuplicate(BB: Succ))
1709	DupCandidates.emplace_back(Args&: SuccProb, Args&: Succ);
1710	continue;
1711	}
1712
1713	LLVM_DEBUG(
1714	dbgs() << " Candidate: " << getBlockName(Succ)
1715	<< ", probability: " << SuccProb
1716	<< (SuccChain.UnscheduledPredecessors != `0` ? " (CFG break)" : "")
1717	<< "\n");
1718
1719	if (BestSucc.BB && BestProb >= SuccProb) {
1720	LLVM_DEBUG(dbgs() << " Not the best candidate, continuing\n");
1721	continue;
1722	}
1723
1724	LLVM_DEBUG(dbgs() << " Setting it as best candidate\n");
1725	BestSucc.BB = Succ;
1726	BestProb = SuccProb;
1727	}
1728	// Handle the tail duplication candidates in order of decreasing probability.
1729	// Stop at the first one that is profitable. Also stop if they are less
1730	// profitable than BestSucc. Position is important because we preserve it and
1731	// prefer first best match. Here we aren't comparing in order, so we capture
1732	// the position instead.
1733	llvm::stable_sort(Range&: DupCandidates,
1734	C: [](std::tuple<BranchProbability, MachineBasicBlock *> L,
1735	std::tuple<BranchProbability, MachineBasicBlock *> R) {
1736	return std::get<`0`>(t&: L) > std::get<`0`>(t&: R);
1737	});
1738	for (auto &Tup : DupCandidates) {
1739	BranchProbability DupProb;
1740	MachineBasicBlock *Succ;
1741	std::tie(args&: DupProb, args&: Succ) = Tup;
1742	if (DupProb < BestProb)
1743	break;
1744	if (canTailDuplicateUnplacedPreds(BB, Succ, Chain, BlockFilter) &&
1745	(isProfitableToTailDup(BB, Succ, QProb: BestProb, Chain, BlockFilter))) {
1746	LLVM_DEBUG(dbgs() << " Candidate: " << getBlockName(Succ)
1747	<< ", probability: " << DupProb
1748	<< " (Tail Duplicate)\n");
1749	BestSucc.BB = Succ;
1750	BestSucc.ShouldTailDup = true;
1751	break;
1752	}
1753	}
1754
1755	if (BestSucc.BB)
1756	LLVM_DEBUG(dbgs() << " Selected: " << getBlockName(BestSucc.BB) << "\n");
1757
1758	return BestSucc;
1759	}
1760
1761	/// Select the best block from a worklist.
1762	///
1763	/// This looks through the provided worklist as a list of candidate basic
1764	/// blocks and select the most profitable one to place. The definition of
1765	/// profitable only really makes sense in the context of a loop. This returns
1766	/// the most frequently visited block in the worklist, which in the case of
1767	/// a loop, is the one most desirable to be physically close to the rest of the
1768	/// loop body in order to improve i-cache behavior.
1769	///
1770	/// \returns The best block found, or null if none are viable.
1771	MachineBasicBlock *MachineBlockPlacement::selectBestCandidateBlock(
1772	const BlockChain &Chain, SmallVectorImpl<MachineBasicBlock *> &WorkList) {
1773	// Once we need to walk the worklist looking for a candidate, cleanup the
1774	// worklist of already placed entries.
1775	// FIXME: If this shows up on profiles, it could be folded (at the cost of
1776	// some code complexity) into the loop below.
1777	llvm::erase_if(C&: WorkList, P: [&](MachineBasicBlock *BB) {
1778	return BlockToChain.lookup(Val: BB) == &Chain;
1779	});
1780
1781	if (WorkList.empty())
1782	return nullptr;
1783
1784	bool IsEHPad = WorkList [`0`]->isEHPad();
1785
1786	MachineBasicBlock BestBlock = nullptr*;
1787	BlockFrequency BestFreq;
1788	for (MachineBasicBlock *MBB : WorkList) {
1789	assert(MBB->isEHPad() == IsEHPad &&
1790	"EHPad mismatch between block and work list.");
1791
1792	BlockChain &SuccChain = *BlockToChain [MBB];
1793	if (&SuccChain == &Chain)
1794	continue;
1795
1796	assert(SuccChain.UnscheduledPredecessors == `0` &&
1797	"Found CFG-violating block");
1798
1799	BlockFrequency CandidateFreq = MBFI ->getBlockFreq(MBB);
1800	LLVM_DEBUG(dbgs() << " " << getBlockName(MBB) << " -> "
1801	<< printBlockFreq(MBFI->getMBFI(), CandidateFreq)
1802	<< " (freq)\n");
1803
1804	// For ehpad, we layout the least probable first as to avoid jumping back
1805	// from least probable landingpads to more probable ones.
1806	//
1807	// FIXME: Using probability is probably (!) not the best way to achieve
1808	// this. We should probably have a more principled approach to layout
1809	// cleanup code.
1810	//
1811	// The goal is to get:
1812	//
1813	// +--------------------------+
1814	// \| V
1815	// InnerLp -> InnerCleanup OuterLp -> OuterCleanup -> Resume
1816	//
1817	// Rather than:
1818	//
1819	// +-------------------------------------+
1820	// V \|
1821	// OuterLp -> OuterCleanup -> Resume InnerLp -> InnerCleanup
1822	if (BestBlock && (IsEHPad ^ (BestFreq >= CandidateFreq)))
1823	continue;
1824
1825	BestBlock = MBB;
1826	BestFreq = CandidateFreq;
1827	}
1828
1829	return BestBlock;
1830	}
1831
1832	/// Retrieve the first unplaced basic block in the entire function.
1833	///
1834	/// This routine is called when we are unable to use the CFG to walk through
1835	/// all of the basic blocks and form a chain due to unnatural loops in the CFG.
1836	/// We walk through the function's blocks in order, starting from the
1837	/// LastUnplacedBlockIt. We update this iterator on each call to avoid
1838	/// re-scanning the entire sequence on repeated calls to this routine.
1839	MachineBasicBlock *MachineBlockPlacement::getFirstUnplacedBlock(
1840	const BlockChain &PlacedChain,
1841	MachineFunction::iterator &PrevUnplacedBlockIt) {
1842
1843	for (MachineFunction::iterator I = PrevUnplacedBlockIt, E = F->end(); I != E;
1844	++I) {
1845	if (BlockChain Chain = BlockToChain [&I]; Chain != &PlacedChain) {
1846	PrevUnplacedBlockIt = I;
1847	// Now select the head of the chain to which the unplaced block belongs
1848	// as the block to place. This will force the entire chain to be placed,
1849	// and satisfies the requirements of merging chains.
1850	return *Chain->begin();
1851	}
1852	}
1853	return nullptr;
1854	}
1855
1856	/// Retrieve the first unplaced basic block among the blocks in BlockFilter.
1857	///
1858	/// This is similar to getFirstUnplacedBlock for the entire function, but since
1859	/// the size of BlockFilter is typically far less than the number of blocks in
1860	/// the entire function, iterating through the BlockFilter is more efficient.
1861	/// When processing the entire funciton, using the version without BlockFilter
1862	/// has a complexity of #(loops in function) #(blocks in function), while this*
1863	/// version has a complexity of sum(#(loops in block) foreach block in function)
1864	/// which is always smaller. For long function mostly sequential in structure,
1865	/// the complexity is amortized to 1 #(blocks in function).*
1866	MachineBasicBlock *MachineBlockPlacement::getFirstUnplacedBlock(
1867	const BlockChain &PlacedChain,
1868	BlockFilterSet::iterator &PrevUnplacedBlockInFilterIt,
1869	const BlockFilterSet *BlockFilter) {
1870	assert(BlockFilter);
1871	for (; PrevUnplacedBlockInFilterIt != BlockFilter->end();
1872	++PrevUnplacedBlockInFilterIt) {
1873	BlockChain C = BlockToChain [PrevUnplacedBlockInFilterIt];
1874	if (C != &PlacedChain) {
1875	return *C->begin();
1876	}
1877	}
1878	return nullptr;
1879	}
1880
1881	void MachineBlockPlacement::fillWorkLists(
1882	const MachineBasicBlock MBB, SmallPtrSetImpl<BlockChain > &UpdatedPreds,
1883	const BlockFilterSet BlockFilter = nullptr*) {
1884	BlockChain &Chain = *BlockToChain [MBB];
1885	if (!UpdatedPreds.insert(Ptr: &Chain).second)
1886	return;
1887
1888	assert(
1889	Chain.UnscheduledPredecessors == `0` &&
1890	"Attempting to place block with unscheduled predecessors in worklist.");
1891	for (MachineBasicBlock *ChainBB : Chain) {
1892	assert(BlockToChain[ChainBB] == &Chain &&
1893	"Block in chain doesn't match BlockToChain map.");
1894	for (MachineBasicBlock *Pred : ChainBB->predecessors()) {
1895	if (BlockFilter && !BlockFilter->count(key: Pred))
1896	continue;
1897	if (BlockToChain [Pred] == &Chain)
1898	continue;
1899	++Chain.UnscheduledPredecessors;
1900	}
1901	}
1902
1903	if (Chain.UnscheduledPredecessors != `0`)
1904	return;
1905
1906	MachineBasicBlock BB = Chain.begin();
1907	if (BB->isEHPad())
1908	EHPadWorkList.push_back(Elt: BB);
1909	else
1910	BlockWorkList.push_back(Elt: BB);
1911	}
1912
1913	void MachineBlockPlacement::buildChain(const MachineBasicBlock *HeadBB,
1914	BlockChain &Chain,
1915	BlockFilterSet *BlockFilter) {
1916	assert(HeadBB && "BB must not be null.\n");
1917	assert(BlockToChain[HeadBB] == &Chain && "BlockToChainMap mis-match.\n");
1918	MachineFunction::iterator PrevUnplacedBlockIt = F->begin();
1919	BlockFilterSet::iterator PrevUnplacedBlockInFilterIt;
1920	if (BlockFilter)
1921	PrevUnplacedBlockInFilterIt = BlockFilter->begin();
1922
1923	const MachineBasicBlock *LoopHeaderBB = HeadBB;
1924	markChainSuccessors(Chain, LoopHeaderBB, BlockFilter);
1925	MachineBasicBlock BB = std::prev(x: Chain.end());
1926	while (true) {
1927	assert(BB && "null block found at end of chain in loop.");
1928	assert(BlockToChain[BB] == &Chain && "BlockToChainMap mis-match in loop.");
1929	assert(*std::prev(Chain.end()) == BB && "BB Not found at end of chain.");
1930
1931	// Look for the best viable successor if there is one to place immediately
1932	// after this block.
1933	auto Result = selectBestSuccessor(BB, Chain, BlockFilter);
1934	MachineBasicBlock *BestSucc = Result.BB;
1935	bool ShouldTailDup = Result.ShouldTailDup;
1936	if (allowTailDupPlacement(MF&: *F))
1937	ShouldTailDup \|= (BestSucc && canTailDuplicateUnplacedPreds(
1938	BB, Succ: BestSucc, Chain, BlockFilter));
1939
1940	// If an immediate successor isn't available, look for the best viable
1941	// block among those we've identified as not violating the loop's CFG at
1942	// this point. This won't be a fallthrough, but it will increase locality.
1943	if (!BestSucc)
1944	BestSucc = selectBestCandidateBlock(Chain, WorkList&: BlockWorkList);
1945	if (!BestSucc)
1946	BestSucc = selectBestCandidateBlock(Chain, WorkList&: EHPadWorkList);
1947
1948	if (!BestSucc) {
1949	if (BlockFilter)
1950	BestSucc = getFirstUnplacedBlock(PlacedChain: Chain, PrevUnplacedBlockInFilterIt,
1951	BlockFilter);
1952	else
1953	BestSucc = getFirstUnplacedBlock(PlacedChain: Chain, PrevUnplacedBlockIt);
1954	if (!BestSucc)
1955	break;
1956
1957	LLVM_DEBUG(dbgs() << "Unnatural loop CFG detected, forcibly merging the "
1958	"layout successor until the CFG reduces\n");
1959	}
1960
1961	// Placement may have changed tail duplication opportunities.
1962	// Check for that now.
1963	if (allowTailDupPlacement(MF&: *F) && BestSucc && ShouldTailDup) {
1964	repeatedlyTailDuplicateBlock(BB: BestSucc, LPred&: BB, LoopHeaderBB, Chain,
1965	BlockFilter, PrevUnplacedBlockIt,
1966	PrevUnplacedBlockInFilterIt);
1967	// If the chosen successor was duplicated into BB, don't bother laying
1968	// it out, just go round the loop again with BB as the chain end.
1969	if (!BB->isSuccessor(MBB: BestSucc))
1970	continue;
1971	}
1972
1973	// Place this block, updating the datastructures to reflect its placement.
1974	BlockChain &SuccChain = *BlockToChain [BestSucc];
1975	// Zero out UnscheduledPredecessors for the successor we're about to merge
1976	// in case we selected a successor that didn't fit naturally into the CFG.
1977	SuccChain.UnscheduledPredecessors = `0`;
1978	LLVM_DEBUG(dbgs() << "Merging from " << getBlockName(BB) << " to "
1979	<< getBlockName(BestSucc) << "\n");
1980	markChainSuccessors(Chain: SuccChain, LoopHeaderBB, BlockFilter);
1981	Chain.merge(BB: BestSucc, Chain: &SuccChain);
1982	BB = *std::prev(x: Chain.end());
1983	}
1984
1985	LLVM_DEBUG(dbgs() << "Finished forming chain for header block "
1986	<< getBlockName(*Chain.begin()) << "\n");
1987	}
1988
1989	// If bottom of block BB has only one successor OldTop, in most cases it is
1990	// profitable to move it before OldTop, except the following case:
1991	//
1992	// -->OldTop<-
1993	// \| . \|
1994	// \| . \|
1995	// \| . \|
1996	// ---Pred \|
1997	// \| \|
1998	// BB-----
1999	//
2000	// If BB is moved before OldTop, Pred needs a taken branch to BB, and it can't
2001	// layout the other successor below it, so it can't reduce taken branch.
2002	// In this case we keep its original layout.
2003	bool MachineBlockPlacement::canMoveBottomBlockToTop(
2004	const MachineBasicBlock BottomBlock, const* MachineBasicBlock *OldTop) {
2005	if (BottomBlock->pred_size() != `1`)
2006	return true;
2007	MachineBasicBlock Pred = BottomBlock->pred_begin();
2008	if (Pred->succ_size() != `2`)
2009	return true;
2010
2011	MachineBasicBlock OtherBB = Pred->succ_begin();
2012	if (OtherBB == BottomBlock)
2013	OtherBB = *Pred->succ_rbegin();
2014	if (OtherBB == OldTop)
2015	return false;
2016
2017	return true;
2018	}
2019
2020	// Find out the possible fall through frequence to the top of a loop.
2021	BlockFrequency
2022	MachineBlockPlacement::TopFallThroughFreq(const MachineBasicBlock *Top,
2023	const BlockFilterSet &LoopBlockSet) {
2024	BlockFrequency MaxFreq = BlockFrequency (`0`);
2025	for (MachineBasicBlock *Pred : Top->predecessors()) {
2026	BlockChain *PredChain = BlockToChain [Pred];
2027	if (!LoopBlockSet.count(key: Pred) &&
2028	(!PredChain \|\| Pred == *std::prev(x: PredChain->end()))) {
2029	// Found a Pred block can be placed before Top.
2030	// Check if Top is the best successor of Pred.
2031	auto TopProb = MBPI->getEdgeProbability(Src: Pred, Dst: Top);
2032	bool TopOK = true;
2033	for (MachineBasicBlock *Succ : Pred->successors()) {
2034	auto SuccProb = MBPI->getEdgeProbability(Src: Pred, Dst: Succ);
2035	BlockChain *SuccChain = BlockToChain [Succ];
2036	// Check if Succ can be placed after Pred.
2037	// Succ should not be in any chain, or it is the head of some chain.
2038	if (!LoopBlockSet.count(key: Succ) && (SuccProb > TopProb) &&
2039	(!SuccChain \|\| Succ == *SuccChain->begin())) {
2040	TopOK = false;
2041	break;
2042	}
2043	}
2044	if (TopOK) {
2045	BlockFrequency EdgeFreq =
2046	MBFI ->getBlockFreq(MBB: Pred) * MBPI->getEdgeProbability(Src: Pred, Dst: Top);
2047	if (EdgeFreq > MaxFreq)
2048	MaxFreq = EdgeFreq;
2049	}
2050	}
2051	}
2052	return MaxFreq;
2053	}
2054
2055	// Compute the fall through gains when move NewTop before OldTop.
2056	//
2057	// In following diagram, edges marked as "-" are reduced fallthrough, edges
2058	// marked as "+" are increased fallthrough, this function computes
2059	//
2060	// SUM(increased fallthrough) - SUM(decreased fallthrough)
2061	//
2062	// \|
2063	// \| -
2064	// V
2065	// --->OldTop
2066	// \| .
2067	// \| .
2068	// +\| . +
2069	// \| Pred --->
2070	// \| \|-
2071	// \| V
2072	// --- NewTop <---
2073	// \|-
2074	// V
2075	//
2076	BlockFrequency MachineBlockPlacement::FallThroughGains(
2077	const MachineBasicBlock NewTop, const* MachineBasicBlock *OldTop,
2078	const MachineBasicBlock ExitBB, const* BlockFilterSet &LoopBlockSet) {
2079	BlockFrequency FallThrough2Top = TopFallThroughFreq(Top: OldTop, LoopBlockSet);
2080	BlockFrequency FallThrough2Exit = BlockFrequency (`0`);
2081	if (ExitBB)
2082	FallThrough2Exit =
2083	MBFI ->getBlockFreq(MBB: NewTop) * MBPI->getEdgeProbability(Src: NewTop, Dst: ExitBB);
2084	BlockFrequency BackEdgeFreq =
2085	MBFI ->getBlockFreq(MBB: NewTop) * MBPI->getEdgeProbability(Src: NewTop, Dst: OldTop);
2086
2087	// Find the best Pred of NewTop.
2088	MachineBasicBlock BestPred = nullptr*;
2089	BlockFrequency FallThroughFromPred = BlockFrequency (`0`);
2090	for (MachineBasicBlock *Pred : NewTop->predecessors()) {
2091	if (!LoopBlockSet.count(key: Pred))
2092	continue;
2093	BlockChain *PredChain = BlockToChain [Pred];
2094	if (!PredChain \|\| Pred == *std::prev(x: PredChain->end())) {
2095	BlockFrequency EdgeFreq =
2096	MBFI ->getBlockFreq(MBB: Pred) * MBPI->getEdgeProbability(Src: Pred, Dst: NewTop);
2097	if (EdgeFreq > FallThroughFromPred) {
2098	FallThroughFromPred = EdgeFreq;
2099	BestPred = Pred;
2100	}
2101	}
2102	}
2103
2104	// If NewTop is not placed after Pred, another successor can be placed
2105	// after Pred.
2106	BlockFrequency NewFreq = BlockFrequency (`0`);
2107	if (BestPred) {
2108	for (MachineBasicBlock *Succ : BestPred->successors()) {
2109	if ((Succ == NewTop) \|\| (Succ == BestPred) \|\| !LoopBlockSet.count(key: Succ))
2110	continue;
2111	if (ComputedEdges.contains(Val: Succ))
2112	continue;
2113	BlockChain *SuccChain = BlockToChain [Succ];
2114	if ((SuccChain && (Succ != *SuccChain->begin())) \|\|
2115	(SuccChain == BlockToChain [BestPred]))
2116	continue;
2117	BlockFrequency EdgeFreq = MBFI ->getBlockFreq(MBB: BestPred) *
2118	MBPI->getEdgeProbability(Src: BestPred, Dst: Succ);
2119	if (EdgeFreq > NewFreq)
2120	NewFreq = EdgeFreq;
2121	}
2122	BlockFrequency OrigEdgeFreq = MBFI ->getBlockFreq(MBB: BestPred) *
2123	MBPI->getEdgeProbability(Src: BestPred, Dst: NewTop);
2124	if (NewFreq > OrigEdgeFreq) {
2125	// If NewTop is not the best successor of Pred, then Pred doesn't
2126	// fallthrough to NewTop. So there is no FallThroughFromPred and
2127	// NewFreq.
2128	NewFreq = BlockFrequency (`0`);
2129	FallThroughFromPred = BlockFrequency (`0`);
2130	}
2131	}
2132
2133	BlockFrequency Result = BlockFrequency (`0`);
2134	BlockFrequency Gains = BackEdgeFreq + NewFreq;
2135	BlockFrequency Lost =
2136	FallThrough2Top + FallThrough2Exit + FallThroughFromPred;
2137	if (Gains > Lost)
2138	Result = Gains - Lost;
2139	return Result;
2140	}
2141
2142	/// Helper function of findBestLoopTop. Find the best loop top block
2143	/// from predecessors of old top.
2144	///
2145	/// Look for a block which is strictly better than the old top for laying
2146	/// out before the old top of the loop. This looks for only two patterns:
2147	///
2148	/// 1. a block has only one successor, the old loop top
2149	///
2150	/// Because such a block will always result in an unconditional jump,
2151	/// rotating it in front of the old top is always profitable.
2152	///
2153	/// 2. a block has two successors, one is old top, another is exit
2154	/// and it has more than one predecessors
2155	///
2156	/// If it is below one of its predecessors P, only P can fall through to
2157	/// it, all other predecessors need a jump to it, and another conditional
2158	/// jump to loop header. If it is moved before loop header, all its
2159	/// predecessors jump to it, then fall through to loop header. So all its
2160	/// predecessors except P can reduce one taken branch.
2161	/// At the same time, move it before old top increases the taken branch
2162	/// to loop exit block, so the reduced taken branch will be compared with
2163	/// the increased taken branch to the loop exit block.
2164	MachineBasicBlock *MachineBlockPlacement::findBestLoopTopHelper(
2165	MachineBasicBlock OldTop, const* MachineLoop &L,
2166	const BlockFilterSet &LoopBlockSet) {
2167	// Check that the header hasn't been fused with a preheader block due to
2168	// crazy branches. If it has, we need to start with the header at the top to
2169	// prevent pulling the preheader into the loop body.
2170	BlockChain &HeaderChain = *BlockToChain [OldTop];
2171	if (!LoopBlockSet.count(key: *HeaderChain.begin()))
2172	return OldTop;
2173	if (OldTop != *HeaderChain.begin())
2174	return OldTop;
2175
2176	LLVM_DEBUG(dbgs() << "Finding best loop top for: " << getBlockName(OldTop)
2177	<< "\n");
2178
2179	BlockFrequency BestGains = BlockFrequency (`0`);
2180	MachineBasicBlock BestPred = nullptr*;
2181	for (MachineBasicBlock *Pred : OldTop->predecessors()) {
2182	if (!LoopBlockSet.count(key: Pred))
2183	continue;
2184	if (Pred == L.getHeader())
2185	continue;
2186	LLVM_DEBUG(dbgs() << " old top pred: " << getBlockName(Pred) << ", has "
2187	<< Pred->succ_size() << " successors, "
2188	<< printBlockFreq(MBFI->getMBFI(), *Pred) << " freq\n");
2189	if (Pred->succ_size() > `2`)
2190	continue;
2191
2192	MachineBasicBlock OtherBB = nullptr*;
2193	if (Pred->succ_size() == `2`) {
2194	OtherBB = *Pred->succ_begin();
2195	if (OtherBB == OldTop)
2196	OtherBB = *Pred->succ_rbegin();
2197	}
2198
2199	if (!canMoveBottomBlockToTop(BottomBlock: Pred, OldTop))
2200	continue;
2201
2202	BlockFrequency Gains =
2203	FallThroughGains(NewTop: Pred, OldTop, ExitBB: OtherBB, LoopBlockSet);
2204	if ((Gains > BlockFrequency (`0`)) &&
2205	(Gains > BestGains \|\|
2206	((Gains == BestGains) && Pred->isLayoutSuccessor(MBB: OldTop)))) {
2207	BestPred = Pred;
2208	BestGains = Gains;
2209	}
2210	}
2211
2212	// If no direct predecessor is fine, just use the loop header.
2213	if (!BestPred) {
2214	LLVM_DEBUG(dbgs() << " final top unchanged\n");
2215	return OldTop;
2216	}
2217
2218	// Walk backwards through any straight line of predecessors.
2219	while (BestPred->pred_size() == `1` &&
2220	(*BestPred->pred_begin())->succ_size() == `1` &&
2221	*BestPred->pred_begin() != L.getHeader())
2222	BestPred = *BestPred->pred_begin();
2223
2224	LLVM_DEBUG(dbgs() << " final top: " << getBlockName(BestPred) << "\n");
2225	return BestPred;
2226	}
2227
2228	/// Find the best loop top block for layout.
2229	///
2230	/// This function iteratively calls findBestLoopTopHelper, until no new better
2231	/// BB can be found.
2232	MachineBasicBlock *
2233	MachineBlockPlacement::findBestLoopTop(const MachineLoop &L,
2234	const BlockFilterSet &LoopBlockSet) {
2235	// Placing the latch block before the header may introduce an extra branch
2236	// that skips this block the first time the loop is executed, which we want
2237	// to avoid when optimising for size.
2238	// FIXME: in theory there is a case that does not introduce a new branch,
2239	// i.e. when the layout predecessor does not fallthrough to the loop header.
2240	// In practice this never happens though: there always seems to be a preheader
2241	// that can fallthrough and that is also placed before the header.
2242	if (llvm::shouldOptimizeForSize(MBB: L.getHeader(), PSI, MBFIWrapper: MBFI.get()))
2243	return L.getHeader();
2244
2245	MachineBasicBlock OldTop = nullptr*;
2246	MachineBasicBlock *NewTop = L.getHeader();
2247	while (NewTop != OldTop) {
2248	OldTop = NewTop;
2249	NewTop = findBestLoopTopHelper(OldTop, L, LoopBlockSet);
2250	if (NewTop != OldTop)
2251	ComputedEdges [NewTop] = {.BB: OldTop, .ShouldTailDup: false};
2252	}
2253	return NewTop;
2254	}
2255
2256	/// Find the best loop exiting block for layout.
2257	///
2258	/// This routine implements the logic to analyze the loop looking for the best
2259	/// block to layout at the top of the loop. Typically this is done to maximize
2260	/// fallthrough opportunities.
2261	MachineBasicBlock *
2262	MachineBlockPlacement::findBestLoopExit(const MachineLoop &L,
2263	const BlockFilterSet &LoopBlockSet,
2264	BlockFrequency &ExitFreq) {
2265	// We don't want to layout the loop linearly in all cases. If the loop header
2266	// is just a normal basic block in the loop, we want to look for what block
2267	// within the loop is the best one to layout at the top. However, if the loop
2268	// header has be pre-merged into a chain due to predecessors not having
2269	// analyzable branches, and* the predecessor it is merged with is not part*
2270	// of the loop, rotating the header into the middle of the loop will create
2271	// a non-contiguous range of blocks which is Very Bad. So start with the
2272	// header and only rotate if safe.
2273	BlockChain &HeaderChain = *BlockToChain [L.getHeader()];
2274	if (!LoopBlockSet.count(key: *HeaderChain.begin()))
2275	return nullptr;
2276
2277	BlockFrequency BestExitEdgeFreq;
2278	unsigned BestExitLoopDepth = `0`;
2279	MachineBasicBlock ExitingBB = nullptr*;
2280	// If there are exits to outer loops, loop rotation can severely limit
2281	// fallthrough opportunities unless it selects such an exit. Keep a set of
2282	// blocks where rotating to exit with that block will reach an outer loop.
2283	SmallPtrSet<MachineBasicBlock *, `4`> BlocksExitingToOuterLoop;
2284
2285	LLVM_DEBUG(dbgs() << "Finding best loop exit for: "
2286	<< getBlockName(L.getHeader()) << "\n");
2287	for (MachineBasicBlock *MBB : L.getBlocks()) {
2288	BlockChain &Chain = *BlockToChain [MBB];
2289	// Ensure that this block is at the end of a chain; otherwise it could be
2290	// mid-way through an inner loop or a successor of an unanalyzable branch.
2291	if (MBB != *std::prev(x: Chain.end()))
2292	continue;
2293
2294	// Now walk the successors. We need to establish whether this has a viable
2295	// exiting successor and whether it has a viable non-exiting successor.
2296	// We store the old exiting state and restore it if a viable looping
2297	// successor isn't found.
2298	MachineBasicBlock *OldExitingBB = ExitingBB;
2299	BlockFrequency OldBestExitEdgeFreq = BestExitEdgeFreq;
2300	bool HasLoopingSucc = false;
2301	for (MachineBasicBlock *Succ : MBB->successors()) {
2302	if (Succ->isEHPad())
2303	continue;
2304	if (Succ == MBB)
2305	continue;
2306	BlockChain &SuccChain = *BlockToChain [Succ];
2307	// Don't split chains, either this chain or the successor's chain.
2308	if (&Chain == &SuccChain) {
2309	LLVM_DEBUG(dbgs() << " exiting: " << getBlockName(MBB) << " -> "
2310	<< getBlockName(Succ) << " (chain conflict)\n");
2311	continue;
2312	}
2313
2314	auto SuccProb = MBPI->getEdgeProbability(Src: MBB, Dst: Succ);
2315	if (LoopBlockSet.count(key: Succ)) {
2316	LLVM_DEBUG(dbgs() << " looping: " << getBlockName(MBB) << " -> "
2317	<< getBlockName(Succ) << " (" << SuccProb << ")\n");
2318	HasLoopingSucc = true;
2319	continue;
2320	}
2321
2322	unsigned SuccLoopDepth = `0`;
2323	if (MachineLoop *ExitLoop = MLI->getLoopFor(BB: Succ)) {
2324	SuccLoopDepth = ExitLoop->getLoopDepth();
2325	if (ExitLoop->contains(L: &L))
2326	BlocksExitingToOuterLoop.insert(Ptr: MBB);
2327	}
2328
2329	BlockFrequency ExitEdgeFreq = MBFI ->getBlockFreq(MBB) * SuccProb;
2330	LLVM_DEBUG(
2331	dbgs() << " exiting: " << getBlockName(MBB) << " -> "
2332	<< getBlockName(Succ) << " [L:" << SuccLoopDepth << "] ("
2333	<< printBlockFreq(MBFI->getMBFI(), ExitEdgeFreq) << ")\n");
2334	// Note that we bias this toward an existing layout successor to retain
2335	// incoming order in the absence of better information. The exit must have
2336	// a frequency higher than the current exit before we consider breaking
2337	// the layout.
2338	BranchProbability Bias(`100` - ExitBlockBias, `100`);
2339	if (!ExitingBB \|\| SuccLoopDepth > BestExitLoopDepth \|\|
2340	ExitEdgeFreq > BestExitEdgeFreq \|\|
2341	(MBB->isLayoutSuccessor(MBB: Succ) &&
2342	!(ExitEdgeFreq < BestExitEdgeFreq * Bias))) {
2343	BestExitEdgeFreq = ExitEdgeFreq;
2344	ExitingBB = MBB;
2345	}
2346	}
2347
2348	if (!HasLoopingSucc) {
2349	// Restore the old exiting state, no viable looping successor was found.
2350	ExitingBB = OldExitingBB;
2351	BestExitEdgeFreq = OldBestExitEdgeFreq;
2352	}
2353	}
2354	// Without a candidate exiting block or with only a single block in the
2355	// loop, just use the loop header to layout the loop.
2356	if (!ExitingBB) {
2357	LLVM_DEBUG(
2358	dbgs() << " No other candidate exit blocks, using loop header\n");
2359	return nullptr;
2360	}
2361	if (L.getNumBlocks() == `1`) {
2362	LLVM_DEBUG(dbgs() << " Loop has 1 block, using loop header as exit\n");
2363	return nullptr;
2364	}
2365
2366	// Also, if we have exit blocks which lead to outer loops but didn't select
2367	// one of them as the exiting block we are rotating toward, disable loop
2368	// rotation altogether.
2369	if (!BlocksExitingToOuterLoop.empty() &&
2370	!BlocksExitingToOuterLoop.count(Ptr: ExitingBB))
2371	return nullptr;
2372
2373	LLVM_DEBUG(dbgs() << " Best exiting block: " << getBlockName(ExitingBB)
2374	<< "\n");
2375	ExitFreq = BestExitEdgeFreq;
2376	return ExitingBB;
2377	}
2378
2379	/// Check if there is a fallthrough to loop header Top.
2380	///
2381	/// 1. Look for a Pred that can be layout before Top.
2382	/// 2. Check if Top is the most possible successor of Pred.
2383	bool MachineBlockPlacement::hasViableTopFallthrough(
2384	const MachineBasicBlock Top, const* BlockFilterSet &LoopBlockSet) {
2385	for (MachineBasicBlock *Pred : Top->predecessors()) {
2386	BlockChain *PredChain = BlockToChain [Pred];
2387	if (!LoopBlockSet.count(key: Pred) &&
2388	(!PredChain \|\| Pred == *std::prev(x: PredChain->end()))) {
2389	// Found a Pred block can be placed before Top.
2390	// Check if Top is the best successor of Pred.
2391	auto TopProb = MBPI->getEdgeProbability(Src: Pred, Dst: Top);
2392	bool TopOK = true;
2393	for (MachineBasicBlock *Succ : Pred->successors()) {
2394	auto SuccProb = MBPI->getEdgeProbability(Src: Pred, Dst: Succ);
2395	BlockChain *SuccChain = BlockToChain [Succ];
2396	// Check if Succ can be placed after Pred.
2397	// Succ should not be in any chain, or it is the head of some chain.
2398	if ((!SuccChain \|\| Succ == *SuccChain->begin()) && SuccProb > TopProb) {
2399	TopOK = false;
2400	break;
2401	}
2402	}
2403	if (TopOK)
2404	return true;
2405	}
2406	}
2407	return false;
2408	}
2409
2410	/// Attempt to rotate an exiting block to the bottom of the loop.
2411	///
2412	/// Once we have built a chain, try to rotate it to line up the hot exit block
2413	/// with fallthrough out of the loop if doing so doesn't introduce unnecessary
2414	/// branches. For example, if the loop has fallthrough into its header and out
2415	/// of its bottom already, don't rotate it.
2416	void MachineBlockPlacement::rotateLoop(BlockChain &LoopChain,
2417	const MachineBasicBlock *ExitingBB,
2418	BlockFrequency ExitFreq,
2419	const BlockFilterSet &LoopBlockSet) {
2420	if (!ExitingBB)
2421	return;
2422
2423	MachineBasicBlock Top = LoopChain.begin();
2424	MachineBasicBlock Bottom = std::prev(x: LoopChain.end());
2425
2426	// If ExitingBB is already the last one in a chain then nothing to do.
2427	if (Bottom == ExitingBB)
2428	return;
2429
2430	// The entry block should always be the first BB in a function.
2431	if (Top->isEntryBlock())
2432	return;
2433
2434	bool ViableTopFallthrough = hasViableTopFallthrough(Top, LoopBlockSet);
2435
2436	// If the header has viable fallthrough, check whether the current loop
2437	// bottom is a viable exiting block. If so, bail out as rotating will
2438	// introduce an unnecessary branch.
2439	if (ViableTopFallthrough) {
2440	for (MachineBasicBlock *Succ : Bottom->successors()) {
2441	BlockChain *SuccChain = BlockToChain [Succ];
2442	if (!LoopBlockSet.count(key: Succ) &&
2443	(!SuccChain \|\| Succ == *SuccChain->begin()))
2444	return;
2445	}
2446
2447	// Rotate will destroy the top fallthrough, we need to ensure the new exit
2448	// frequency is larger than top fallthrough.
2449	BlockFrequency FallThrough2Top = TopFallThroughFreq(Top, LoopBlockSet);
2450	if (FallThrough2Top >= ExitFreq)
2451	return;
2452	}
2453
2454	BlockChain::iterator ExitIt = llvm::find(Range&: LoopChain, Val: ExitingBB);
2455	if (ExitIt == LoopChain.end())
2456	return;
2457
2458	// Rotating a loop exit to the bottom when there is a fallthrough to top
2459	// trades the entry fallthrough for an exit fallthrough.
2460	// If there is no bottom->top edge, but the chosen exit block does have
2461	// a fallthrough, we break that fallthrough for nothing in return.
2462
2463	// Let's consider an example. We have a built chain of basic blocks
2464	// B1, B2, ..., Bn, where Bk is a ExitingBB - chosen exit block.
2465	// By doing a rotation we get
2466	// Bk+1, ..., Bn, B1, ..., Bk
2467	// Break of fallthrough to B1 is compensated by a fallthrough from Bk.
2468	// If we had a fallthrough Bk -> Bk+1 it is broken now.
2469	// It might be compensated by fallthrough Bn -> B1.
2470	// So we have a condition to avoid creation of extra branch by loop rotation.
2471	// All below must be true to avoid loop rotation:
2472	// If there is a fallthrough to top (B1)
2473	// There was fallthrough from chosen exit block (Bk) to next one (Bk+1)
2474	// There is no fallthrough from bottom (Bn) to top (B1).
2475	// Please note that there is no exit fallthrough from Bn because we checked it
2476	// above.
2477	if (ViableTopFallthrough) {
2478	assert(std::next(ExitIt) != LoopChain.end() &&
2479	"Exit should not be last BB");
2480	MachineBasicBlock NextBlockInChain = std::next(x: ExitIt);
2481	if (ExitingBB->isSuccessor(MBB: NextBlockInChain))
2482	if (!Bottom->isSuccessor(MBB: Top))
2483	return;
2484	}
2485
2486	LLVM_DEBUG(dbgs() << "Rotating loop to put exit " << getBlockName(ExitingBB)
2487	<< " at bottom\n");
2488	std::rotate(first: LoopChain.begin(), middle: std::next(x: ExitIt), last: LoopChain.end());
2489	}
2490
2491	/// Attempt to rotate a loop based on profile data to reduce branch cost.
2492	///
2493	/// With profile data, we can determine the cost in terms of missed fall through
2494	/// opportunities when rotating a loop chain and select the best rotation.
2495	/// Basically, there are three kinds of cost to consider for each rotation:
2496	/// 1. The possibly missed fall through edge (if it exists) from BB out of
2497	/// the loop to the loop header.
2498	/// 2. The possibly missed fall through edges (if they exist) from the loop
2499	/// exits to BB out of the loop.
2500	/// 3. The missed fall through edge (if it exists) from the last BB to the
2501	/// first BB in the loop chain.
2502	/// Therefore, the cost for a given rotation is the sum of costs listed above.
2503	/// We select the best rotation with the smallest cost.
2504	void MachineBlockPlacement::rotateLoopWithProfile(
2505	BlockChain &LoopChain, const MachineLoop &L,
2506	const BlockFilterSet &LoopBlockSet) {
2507	auto RotationPos = LoopChain.end();
2508	MachineBasicBlock ChainHeaderBB = LoopChain.begin();
2509
2510	// The entry block should always be the first BB in a function.
2511	if (ChainHeaderBB->isEntryBlock())
2512	return;
2513
2514	BlockFrequency SmallestRotationCost = BlockFrequency::max();
2515
2516	// A utility lambda that scales up a block frequency by dividing it by a
2517	// branch probability which is the reciprocal of the scale.
2518	auto ScaleBlockFrequency = [](BlockFrequency Freq,
2519	unsigned Scale) -> BlockFrequency {
2520	if (Scale == `0`)
2521	return BlockFrequency (`0`);
2522	// Use operator / between BlockFrequency and BranchProbability to implement
2523	// saturating multiplication.
2524	return Freq / BranchProbability (`1`, Scale);
2525	};
2526
2527	// Compute the cost of the missed fall-through edge to the loop header if the
2528	// chain head is not the loop header. As we only consider natural loops with
2529	// single header, this computation can be done only once.
2530	BlockFrequency HeaderFallThroughCost(`0`);
2531	for (auto *Pred : ChainHeaderBB->predecessors()) {
2532	BlockChain *PredChain = BlockToChain [Pred];
2533	if (!LoopBlockSet.count(key: Pred) &&
2534	(!PredChain \|\| Pred == *std::prev(x: PredChain->end()))) {
2535	auto EdgeFreq = MBFI ->getBlockFreq(MBB: Pred) *
2536	MBPI->getEdgeProbability(Src: Pred, Dst: ChainHeaderBB);
2537	auto FallThruCost = ScaleBlockFrequency (EdgeFreq, MisfetchCost);
2538	// If the predecessor has only an unconditional jump to the header, we
2539	// need to consider the cost of this jump.
2540	if (Pred->succ_size() == `1`)
2541	FallThruCost += ScaleBlockFrequency (EdgeFreq, JumpInstCost);
2542	HeaderFallThroughCost = std::max(a: HeaderFallThroughCost, b: FallThruCost);
2543	}
2544	}
2545
2546	// Here we collect all exit blocks in the loop, and for each exit we find out
2547	// its hottest exit edge. For each loop rotation, we define the loop exit cost
2548	// as the sum of frequencies of exit edges we collect here, excluding the exit
2549	// edge from the tail of the loop chain.
2550	SmallVector<std::pair<MachineBasicBlock *, BlockFrequency>, `4`> ExitsWithFreq;
2551	for (auto *BB : LoopChain) {
2552	auto LargestExitEdgeProb = BranchProbability::getZero();
2553	for (auto *Succ : BB->successors()) {
2554	BlockChain *SuccChain = BlockToChain [Succ];
2555	if (!LoopBlockSet.count(key: Succ) &&
2556	(!SuccChain \|\| Succ == *SuccChain->begin())) {
2557	auto SuccProb = MBPI->getEdgeProbability(Src: BB, Dst: Succ);
2558	LargestExitEdgeProb = std::max(a: LargestExitEdgeProb, b: SuccProb);
2559	}
2560	}
2561	if (LargestExitEdgeProb > BranchProbability::getZero()) {
2562	auto ExitFreq = MBFI ->getBlockFreq(MBB: BB) * LargestExitEdgeProb;
2563	ExitsWithFreq.emplace_back(Args&: BB, Args&: ExitFreq);
2564	}
2565	}
2566
2567	// In this loop we iterate every block in the loop chain and calculate the
2568	// cost assuming the block is the head of the loop chain. When the loop ends,
2569	// we should have found the best candidate as the loop chain's head.
2570	for (auto Iter = LoopChain.begin(), TailIter = std::prev(x: LoopChain.end()),
2571	EndIter = LoopChain.end();
2572	Iter != EndIter; Iter++, TailIter++) {
2573	// TailIter is used to track the tail of the loop chain if the block we are
2574	// checking (pointed by Iter) is the head of the chain.
2575	if (TailIter == LoopChain.end())
2576	TailIter = LoopChain.begin();
2577
2578	auto TailBB = *TailIter;
2579
2580	// Calculate the cost by putting this BB to the top.
2581	BlockFrequency Cost = BlockFrequency (`0`);
2582
2583	// If the current BB is the loop header, we need to take into account the
2584	// cost of the missed fall through edge from outside of the loop to the
2585	// header.
2586	if (Iter != LoopChain.begin())
2587	Cost += HeaderFallThroughCost;
2588
2589	// Collect the loop exit cost by summing up frequencies of all exit edges
2590	// except the one from the chain tail.
2591	for (auto &ExitWithFreq : ExitsWithFreq)
2592	if (TailBB != ExitWithFreq.first)
2593	Cost += ExitWithFreq.second;
2594
2595	// The cost of breaking the once fall-through edge from the tail to the top
2596	// of the loop chain. Here we need to consider three cases:
2597	// 1. If the tail node has only one successor, then we will get an
2598	// additional jmp instruction. So the cost here is (MisfetchCost +
2599	// JumpInstCost) tail node frequency.*
2600	// 2. If the tail node has two successors, then we may still get an
2601	// additional jmp instruction if the layout successor after the loop
2602	// chain is not its CFG successor. Note that the more frequently executed
2603	// jmp instruction will be put ahead of the other one. Assume the
2604	// frequency of those two branches are x and y, where x is the frequency
2605	// of the edge to the chain head, then the cost will be
2606	// (x MisfetechCost + min(x, y) * JumpInstCost) * tail node frequency.*
2607	// 3. If the tail node has more than two successors (this rarely happens),
2608	// we won't consider any additional cost.
2609	if (TailBB->isSuccessor(MBB: *Iter)) {
2610	auto TailBBFreq = MBFI ->getBlockFreq(MBB: TailBB);
2611	if (TailBB->succ_size() == `1`)
2612	Cost += ScaleBlockFrequency (TailBBFreq, MisfetchCost + JumpInstCost);
2613	else if (TailBB->succ_size() == `2`) {
2614	auto TailToHeadProb = MBPI->getEdgeProbability(Src: TailBB, Dst: *Iter);
2615	auto TailToHeadFreq = TailBBFreq * TailToHeadProb;
2616	auto ColderEdgeFreq = TailToHeadProb > BranchProbability (`1`, `2`)
2617	? TailBBFreq * TailToHeadProb.getCompl()
2618	: TailToHeadFreq;
2619	Cost += ScaleBlockFrequency (TailToHeadFreq, MisfetchCost) +
2620	ScaleBlockFrequency (ColderEdgeFreq, JumpInstCost);
2621	}
2622	}
2623
2624	LLVM_DEBUG(dbgs() << "The cost of loop rotation by making "
2625	<< getBlockName(*Iter) << " to the top: "
2626	<< printBlockFreq(MBFI->getMBFI(), Cost) << "\n");
2627
2628	if (Cost < SmallestRotationCost) {
2629	SmallestRotationCost = Cost;
2630	RotationPos = Iter;
2631	}
2632	}
2633
2634	if (RotationPos != LoopChain.end()) {
2635	LLVM_DEBUG(dbgs() << "Rotate loop by making " << getBlockName(*RotationPos)
2636	<< " to the top\n");
2637	std::rotate(first: LoopChain.begin(), middle: RotationPos, last: LoopChain.end());
2638	}
2639	}
2640
2641	/// Collect blocks in the given loop that are to be placed.
2642	///
2643	/// When profile data is available, exclude cold blocks from the returned set;
2644	/// otherwise, collect all blocks in the loop.
2645	MachineBlockPlacement::BlockFilterSet
2646	MachineBlockPlacement::collectLoopBlockSet(const MachineLoop &L) {
2647	// Collect the blocks in a set ordered by block number, as this gives the same
2648	// order as they appear in the function.
2649	struct MBBCompare {
2650	bool operator()(const MachineBasicBlock *X,
2651	const MachineBasicBlock Y) const* {
2652	return X->getNumber() < Y->getNumber();
2653	}
2654	};
2655	std::set<const MachineBasicBlock *, MBBCompare> LoopBlockSet;
2656
2657	// Filter cold blocks off from LoopBlockSet when profile data is available.
2658	// Collect the sum of frequencies of incoming edges to the loop header from
2659	// outside. If we treat the loop as a super block, this is the frequency of
2660	// the loop. Then for each block in the loop, we calculate the ratio between
2661	// its frequency and the frequency of the loop block. When it is too small,
2662	// don't add it to the loop chain. If there are outer loops, then this block
2663	// will be merged into the first outer loop chain for which this block is not
2664	// cold anymore. This needs precise profile data and we only do this when
2665	// profile data is available.
2666	if (F->getFunction().hasProfileData() \|\| ForceLoopColdBlock) {
2667	BlockFrequency LoopFreq(`0`);
2668	for (auto *LoopPred : L.getHeader()->predecessors())
2669	if (!L.contains(BB: LoopPred))
2670	LoopFreq += MBFI ->getBlockFreq(MBB: LoopPred) *
2671	MBPI->getEdgeProbability(Src: LoopPred, Dst: L.getHeader());
2672
2673	for (MachineBasicBlock *LoopBB : L.getBlocks()) {
2674	if (LoopBlockSet.count(x: LoopBB))
2675	continue;
2676	auto Freq = MBFI ->getBlockFreq(MBB: LoopBB).getFrequency();
2677	if (Freq == `0` \|\| LoopFreq.getFrequency() / Freq > LoopToColdBlockRatio)
2678	continue;
2679	BlockChain *Chain = BlockToChain [LoopBB];
2680	for (MachineBasicBlock ChainBB : Chain)
2681	LoopBlockSet.insert(x: ChainBB);
2682	}
2683	} else
2684	LoopBlockSet.insert(first: L.block_begin(), last: L.block_end());
2685
2686	// Copy the blocks into a BlockFilterSet, as iterating it is faster than
2687	// std::set. We will only remove blocks and never insert them, which will
2688	// preserve the ordering.
2689	BlockFilterSet Ret(LoopBlockSet.begin(), LoopBlockSet.end());
2690	return Ret;
2691	}
2692
2693	/// Forms basic block chains from the natural loop structures.
2694	///
2695	/// These chains are designed to preserve the existing structure* of the code*
2696	/// as much as possible. We can then stitch the chains together in a way which
2697	/// both preserves the topological structure and minimizes taken conditional
2698	/// branches.
2699	void MachineBlockPlacement::buildLoopChains(const MachineLoop &L) {
2700	// First recurse through any nested loops, building chains for those inner
2701	// loops.
2702	for (const MachineLoop *InnerLoop : L)
2703	buildLoopChains(L: *InnerLoop);
2704
2705	assert(BlockWorkList.empty() &&
2706	"BlockWorkList not empty when starting to build loop chains.");
2707	assert(EHPadWorkList.empty() &&
2708	"EHPadWorkList not empty when starting to build loop chains.");
2709	BlockFilterSet LoopBlockSet = collectLoopBlockSet(L);
2710
2711	// Check if we have profile data for this function. If yes, we will rotate
2712	// this loop by modeling costs more precisely which requires the profile data
2713	// for better layout.
2714	bool RotateLoopWithProfile =
2715	ForcePreciseRotationCost \|\|
2716	(PreciseRotationCost && F->getFunction().hasProfileData());
2717
2718	// First check to see if there is an obviously preferable top block for the
2719	// loop. This will default to the header, but may end up as one of the
2720	// predecessors to the header if there is one which will result in strictly
2721	// fewer branches in the loop body.
2722	MachineBasicBlock *LoopTop = findBestLoopTop(L, LoopBlockSet);
2723
2724	// If we selected just the header for the loop top, look for a potentially
2725	// profitable exit block in the event that rotating the loop can eliminate
2726	// branches by placing an exit edge at the bottom.
2727	//
2728	// Loops are processed innermost to uttermost, make sure we clear
2729	// PreferredLoopExit before processing a new loop.
2730	PreferredLoopExit = nullptr;
2731	BlockFrequency ExitFreq;
2732	if (!RotateLoopWithProfile && LoopTop == L.getHeader())
2733	PreferredLoopExit = findBestLoopExit(L, LoopBlockSet, ExitFreq);
2734
2735	BlockChain &LoopChain = *BlockToChain [LoopTop];
2736
2737	// FIXME: This is a really lame way of walking the chains in the loop: we
2738	// walk the blocks, and use a set to prevent visiting a particular chain
2739	// twice.
2740	SmallPtrSet<BlockChain *, `4`> UpdatedPreds;
2741	assert(LoopChain.UnscheduledPredecessors == `0` &&
2742	"LoopChain should not have unscheduled predecessors.");
2743	UpdatedPreds.insert(Ptr: &LoopChain);
2744
2745	for (const MachineBasicBlock *LoopBB : LoopBlockSet)
2746	fillWorkLists(MBB: LoopBB, UpdatedPreds, BlockFilter: &LoopBlockSet);
2747
2748	buildChain(HeadBB: LoopTop, Chain&: LoopChain, BlockFilter: &LoopBlockSet);
2749
2750	if (RotateLoopWithProfile)
2751	rotateLoopWithProfile(LoopChain, L, LoopBlockSet);
2752	else
2753	rotateLoop(LoopChain, ExitingBB: PreferredLoopExit, ExitFreq, LoopBlockSet);
2754
2755	LLVM_DEBUG({
2756	// Crash at the end so we get all of the debugging output first.
2757	bool BadLoop = false;
2758	if (LoopChain.UnscheduledPredecessors) {
2759	BadLoop = true;
2760	dbgs() << "Loop chain contains a block without its preds placed!\n"
2761	<< " Loop header: " << getBlockName(*L.block_begin()) << "\n"
2762	<< " Chain header: " << getBlockName(*LoopChain.begin()) << "\n";
2763	}
2764	for (MachineBasicBlock *ChainBB : LoopChain) {
2765	dbgs() << " ... " << getBlockName(ChainBB) << "\n";
2766	if (!LoopBlockSet.remove(ChainBB)) {
2767	// We don't mark the loop as bad here because there are real situations
2768	// where this can occur. For example, with an unanalyzable fallthrough
2769	// from a loop block to a non-loop block or vice versa.
2770	dbgs() << "Loop chain contains a block not contained by the loop!\n"
2771	<< " Loop header: " << getBlockName(*L.block_begin()) << "\n"
2772	<< " Chain header: " << getBlockName(*LoopChain.begin()) << "\n"
2773	<< " Bad block: " << getBlockName(ChainBB) << "\n";
2774	}
2775	}
2776
2777	if (!LoopBlockSet.empty()) {
2778	BadLoop = true;
2779	for (const MachineBasicBlock *LoopBB : LoopBlockSet)
2780	dbgs() << "Loop contains blocks never placed into a chain!\n"
2781	<< " Loop header: " << getBlockName(*L.block_begin()) << "\n"
2782	<< " Chain header: " << getBlockName(*LoopChain.begin()) << "\n"
2783	<< " Bad block: " << getBlockName(LoopBB) << "\n";
2784	}
2785	assert(!BadLoop && "Detected problems with the placement of this loop.");
2786	});
2787
2788	BlockWorkList.clear();
2789	EHPadWorkList.clear();
2790	}
2791
2792	void MachineBlockPlacement::buildCFGChains() {
2793	// Ensure that every BB in the function has an associated chain to simplify
2794	// the assumptions of the remaining algorithm.
2795	SmallVector<MachineOperand, `4`> Cond; // For analyzeBranch.
2796	for (MachineFunction::iterator FI = F->begin(), FE = F->end(); FI != FE;
2797	++FI) {
2798	MachineBasicBlock BB = &FI;
2799	BlockChain *Chain =
2800	new (ChainAllocator.Allocate()) BlockChain (BlockToChain, BB);
2801	// Also, merge any blocks which we cannot reason about and must preserve
2802	// the exact fallthrough behavior for.
2803	while (true) {
2804	Cond.clear();
2805	MachineBasicBlock TBB = nullptr, FBB = nullptr; // For analyzeBranch.
2806	if (!TII->analyzeBranch(MBB&: *BB, TBB, FBB, Cond) \|\| !FI ->canFallThrough())
2807	break;
2808
2809	MachineFunction::iterator NextFI = std::next(x: FI);
2810	MachineBasicBlock NextBB = &NextFI;
2811	// Ensure that the layout successor is a viable block, as we know that
2812	// fallthrough is a possibility.
2813	assert(NextFI != FE && "Can't fallthrough past the last block.");
2814	LLVM_DEBUG(dbgs() << "Pre-merging due to unanalyzable fallthrough: "
2815	<< getBlockName(BB) << " -> " << getBlockName(NextBB)
2816	<< "\n");
2817	Chain->merge(BB: NextBB, Chain: nullptr);
2818	#ifndef NDEBUG
2819	BlocksWithUnanalyzableExits.insert(&*BB);
2820	#endif
2821	FI = NextFI;
2822	BB = NextBB;
2823	}
2824	}
2825
2826	// Build any loop-based chains.
2827	PreferredLoopExit = nullptr;
2828	for (MachineLoop L : MLI)
2829	buildLoopChains(L: *L);
2830
2831	assert(BlockWorkList.empty() &&
2832	"BlockWorkList should be empty before building final chain.");
2833	assert(EHPadWorkList.empty() &&
2834	"EHPadWorkList should be empty before building final chain.");
2835
2836	SmallPtrSet<BlockChain *, `4`> UpdatedPreds;
2837	for (MachineBasicBlock &MBB : *F)
2838	fillWorkLists(MBB: &MBB, UpdatedPreds);
2839
2840	BlockChain &FunctionChain = *BlockToChain [&F->front()];
2841	buildChain(HeadBB: &F->front(), Chain&: FunctionChain);
2842
2843	#ifndef NDEBUG
2844	using FunctionBlockSetType = SmallPtrSet<MachineBasicBlock *, `16`>;
2845	#endif
2846	LLVM_DEBUG({
2847	// Crash at the end so we get all of the debugging output first.
2848	bool BadFunc = false;
2849	FunctionBlockSetType FunctionBlockSet;
2850	for (MachineBasicBlock &MBB : *F)
2851	FunctionBlockSet.insert(&MBB);
2852
2853	for (MachineBasicBlock *ChainBB : FunctionChain)
2854	if (!FunctionBlockSet.erase(ChainBB)) {
2855	BadFunc = true;
2856	dbgs() << "Function chain contains a block not in the function!\n"
2857	<< " Bad block: " << getBlockName(ChainBB) << "\n";
2858	}
2859
2860	if (!FunctionBlockSet.empty()) {
2861	BadFunc = true;
2862	for (MachineBasicBlock *RemainingBB : FunctionBlockSet)
2863	dbgs() << "Function contains blocks never placed into a chain!\n"
2864	<< " Bad block: " << getBlockName(RemainingBB) << "\n";
2865	}
2866	assert(!BadFunc && "Detected problems with the block placement.");
2867	});
2868
2869	// Remember original layout ordering, so we can update terminators after
2870	// reordering to point to the original layout successor.
2871	SmallVector<MachineBasicBlock *, `4`> OriginalLayoutSuccessors(
2872	F->getNumBlockIDs());
2873	{
2874	MachineBasicBlock LastMBB = nullptr*;
2875	for (auto &MBB : *F) {
2876	if (LastMBB != nullptr)
2877	OriginalLayoutSuccessors [LastMBB->getNumber()] = &MBB;
2878	LastMBB = &MBB;
2879	}
2880	OriginalLayoutSuccessors [F->back().getNumber()] = nullptr;
2881	}
2882
2883	// Splice the blocks into place.
2884	MachineFunction::iterator InsertPos = F->begin();
2885	LLVM_DEBUG(dbgs() << "[MBP] Function: " << F->getName() << "\n");
2886	for (MachineBasicBlock *ChainBB : FunctionChain) {
2887	LLVM_DEBUG(dbgs() << (ChainBB == *FunctionChain.begin() ? "Placing chain "
2888	: " ... ")
2889	<< getBlockName(ChainBB) << "\n");
2890	if (InsertPos != MachineFunction::iterator (ChainBB))
2891	F->splice(InsertPt: InsertPos, MBB: ChainBB);
2892	else
2893	++InsertPos;
2894
2895	// Update the terminator of the previous block.
2896	if (ChainBB == *FunctionChain.begin())
2897	continue;
2898	MachineBasicBlock PrevBB = &std::prev(x: MachineFunction::iterator (ChainBB));
2899
2900	// FIXME: It would be awesome of updateTerminator would just return rather
2901	// than assert when the branch cannot be analyzed in order to remove this
2902	// boiler plate.
2903	Cond.clear();
2904	MachineBasicBlock TBB = nullptr, FBB = nullptr; // For analyzeBranch.
2905
2906	#ifndef NDEBUG
2907	if (!BlocksWithUnanalyzableExits.count(PrevBB)) {
2908	// Given the exact block placement we chose, we may actually not _need_ to
2909	// be able to edit PrevBB's terminator sequence, but not being _able_ to
2910	// do that at this point is a bug.
2911	assert((!TII->analyzeBranch(*PrevBB, TBB, FBB, Cond) \|\|
2912	!PrevBB->canFallThrough()) &&
2913	"Unexpected block with un-analyzable fallthrough!");
2914	Cond.clear();
2915	TBB = FBB = nullptr;
2916	}
2917	#endif
2918
2919	// The "PrevBB" is not yet updated to reflect current code layout, so,
2920	// o. it may fall-through to a block without explicit "goto" instruction
2921	// before layout, and no longer fall-through it after layout; or
2922	// o. just opposite.
2923	//
2924	// analyzeBranch() may return erroneous value for FBB when these two
2925	// situations take place. For the first scenario FBB is mistakenly set NULL;
2926	// for the 2nd scenario, the FBB, which is expected to be NULL, is
2927	// mistakenly pointing to "BI".*
2928	// Thus, if the future change needs to use FBB before the layout is set, it
2929	// has to correct FBB first by using the code similar to the following:
2930	//
2931	// if (!Cond.empty() && (!FBB \|\| FBB == ChainBB)) {
2932	// PrevBB->updateTerminator();
2933	// Cond.clear();
2934	// TBB = FBB = nullptr;
2935	// if (TII->analyzeBranch(PrevBB, TBB, FBB, Cond)) {*
2936	// // FIXME: This should never take place.
2937	// TBB = FBB = nullptr;
2938	// }
2939	// }
2940	if (!TII->analyzeBranch(MBB&: *PrevBB, TBB, FBB, Cond)) {
2941	PrevBB->updateTerminator(PreviousLayoutSuccessor: OriginalLayoutSuccessors [PrevBB->getNumber()]);
2942	}
2943	}
2944
2945	// Fixup the last block.
2946	Cond.clear();
2947	MachineBasicBlock TBB = nullptr, FBB = nullptr; // For analyzeBranch.
2948	if (!TII->analyzeBranch(MBB&: F->back(), TBB, FBB, Cond)) {
2949	MachineBasicBlock *PrevBB = &F->back();
2950	PrevBB->updateTerminator(PreviousLayoutSuccessor: OriginalLayoutSuccessors [PrevBB->getNumber()]);
2951	}
2952
2953	BlockWorkList.clear();
2954	EHPadWorkList.clear();
2955	}
2956
2957	void MachineBlockPlacement::optimizeBranches() {
2958	BlockChain &FunctionChain = *BlockToChain [&F->front()];
2959	SmallVector<MachineOperand, `4`> Cond;
2960
2961	// Now that all the basic blocks in the chain have the proper layout,
2962	// make a final call to analyzeBranch with AllowModify set.
2963	// Indeed, the target may be able to optimize the branches in a way we
2964	// cannot because all branches may not be analyzable.
2965	// E.g., the target may be able to remove an unconditional branch to
2966	// a fallthrough when it occurs after predicated terminators.
2967	for (MachineBasicBlock *ChainBB : FunctionChain) {
2968	Cond.clear();
2969	MachineBasicBlock TBB = nullptr, FBB = nullptr;
2970	if (TII->analyzeBranch(MBB&: ChainBB, TBB, FBB, Cond, /AllowModify/* true))
2971	continue;
2972	if (!TBB \|\| !FBB \|\| Cond.empty())
2973	continue;
2974	// If we are optimizing for size we do not consider the runtime performance.
2975	// Instead, we retain the original branch condition so we have more uniform
2976	// instructions which will benefit ICF.
2977	if (llvm::shouldOptimizeForSize(MBB: ChainBB, PSI, MBFIWrapper: MBFI.get()))
2978	continue;
2979	// If ChainBB has a two-way branch, try to re-order the branches
2980	// such that we branch to the successor with higher probability first.
2981	if (MBPI->getEdgeProbability(Src: ChainBB, Dst: TBB) >=
2982	MBPI->getEdgeProbability(Src: ChainBB, Dst: FBB))
2983	continue;
2984	if (TII->reverseBranchCondition(Cond))
2985	continue;
2986	LLVM_DEBUG(dbgs() << "Reverse order of the two branches: "
2987	<< getBlockName(ChainBB) << "\n");
2988	LLVM_DEBUG(dbgs() << " " << getBlockName(TBB) << " < " << getBlockName(FBB)
2989	<< "\n");
2990	auto Dl = ChainBB->findBranchDebugLoc();
2991	TII->removeBranch(MBB&: *ChainBB);
2992	TII->insertBranch(MBB&: *ChainBB, TBB: FBB, FBB: TBB, Cond, DL: Dl);
2993	}
2994	}
2995
2996	void MachineBlockPlacement::alignBlocks() {
2997	// Walk through the backedges of the function now that we have fully laid out
2998	// the basic blocks and align the destination of each backedge. We don't rely
2999	// exclusively on the loop info here so that we can align backedges in
3000	// unnatural CFGs and backedges that were introduced purely because of the
3001	// loop rotations done during this layout pass.
3002	if (!AlignAllBlock && !AlignAllNonFallThruBlocks) {
3003	if (F->getFunction().hasMinSize() \|\|
3004	(F->getFunction().hasOptSize() && !TLI->alignLoopsWithOptSize()))
3005	return;
3006	}
3007
3008	BlockChain &FunctionChain = *BlockToChain [&F->front()];
3009	// Empty chain.
3010	if (FunctionChain.begin() == FunctionChain.end())
3011	return;
3012
3013	const BranchProbability ColdProb(`1`, `5`); // 20%
3014	BlockFrequency EntryFreq = MBFI ->getBlockFreq(MBB: &F->front());
3015	BlockFrequency WeightedEntryFreq = EntryFreq * ColdProb;
3016	for (MachineBasicBlock *ChainBB : FunctionChain) {
3017	if (ChainBB == *FunctionChain.begin())
3018	continue;
3019
3020	// Don't align non-looping basic blocks. These are unlikely to execute
3021	// enough times to matter in practice. Note that we'll still handle
3022	// unnatural CFGs inside of a natural outer loop (the common case) and
3023	// rotated loops.
3024	MachineLoop *L = MLI->getLoopFor(BB: ChainBB);
3025	if (!L)
3026	continue;
3027
3028	const Align TLIAlign = TLI->getPrefLoopAlignment(ML: L);
3029	unsigned MDAlign = `1`;
3030	MDNode *LoopID = L->getLoopID();
3031	if (LoopID) {
3032	for (const MDOperand &MDO : llvm::drop_begin(RangeOrContainer: LoopID->operands())) {
3033	MDNode *MD = dyn_cast<MDNode>(Val: MDO);
3034	if (MD == nullptr)
3035	continue;
3036	MDString *S = dyn_cast<MDString>(Val: MD->getOperand(I: `0`));
3037	if (S == nullptr)
3038	continue;
3039	if (S->getString() == "llvm.loop.align") {
3040	assert(MD->getNumOperands() == `2` &&
3041	"per-loop align metadata should have two operands.");
3042	MDAlign =
3043	mdconst::extract<ConstantInt>(MD: MD->getOperand(I: `1`))->getZExtValue();
3044	assert(MDAlign >= `1` && "per-loop align value must be positive.");
3045	}
3046	}
3047	}
3048
3049	// Use max of the TLIAlign and MDAlign
3050	const Align LoopAlign = std::max(a: TLIAlign, b: Align (MDAlign));
3051	if (LoopAlign == `1`)
3052	continue; // Don't care about loop alignment.
3053
3054	// If the block is cold relative to the function entry don't waste space
3055	// aligning it.
3056	BlockFrequency Freq = MBFI ->getBlockFreq(MBB: ChainBB);
3057	if (Freq < WeightedEntryFreq)
3058	continue;
3059
3060	// If the block is cold relative to its loop header, don't align it
3061	// regardless of what edges into the block exist.
3062	MachineBasicBlock *LoopHeader = L->getHeader();
3063	BlockFrequency LoopHeaderFreq = MBFI ->getBlockFreq(MBB: LoopHeader);
3064	if (Freq < (LoopHeaderFreq * ColdProb))
3065	continue;
3066
3067	// If the global profiles indicates so, don't align it.
3068	if (llvm::shouldOptimizeForSize(MBB: ChainBB, PSI, MBFIWrapper: MBFI.get()) &&
3069	!TLI->alignLoopsWithOptSize())
3070	continue;
3071
3072	// Check for the existence of a non-layout predecessor which would benefit
3073	// from aligning this block.
3074	MachineBasicBlock *LayoutPred =
3075	&*std::prev(x: MachineFunction::iterator (ChainBB));
3076
3077	auto DetermineMaxAlignmentPadding = [&]() {
3078	// Set the maximum bytes allowed to be emitted for alignment.
3079	unsigned MaxBytes;
3080	if (MaxBytesForAlignmentOverride.getNumOccurrences() > `0`)
3081	MaxBytes = MaxBytesForAlignmentOverride;
3082	else
3083	MaxBytes = TLI->getMaxPermittedBytesForAlignment(MBB: ChainBB);
3084	ChainBB->setMaxBytesForAlignment(MaxBytes);
3085	};
3086
3087	// Force alignment if all the predecessors are jumps. We already checked
3088	// that the block isn't cold above.
3089	if (!LayoutPred->isSuccessor(MBB: ChainBB)) {
3090	ChainBB->setAlignment(LoopAlign);
3091	DetermineMaxAlignmentPadding ();
3092	continue;
3093	}
3094
3095	// Align this block if the layout predecessor's edge into this block is
3096	// cold relative to the block. When this is true, other predecessors make up
3097	// all of the hot entries into the block and thus alignment is likely to be
3098	// important.
3099	BranchProbability LayoutProb =
3100	MBPI->getEdgeProbability(Src: LayoutPred, Dst: ChainBB);
3101	BlockFrequency LayoutEdgeFreq = MBFI ->getBlockFreq(MBB: LayoutPred) * LayoutProb;
3102	if (LayoutEdgeFreq <= (Freq * ColdProb)) {
3103	ChainBB->setAlignment(LoopAlign);
3104	DetermineMaxAlignmentPadding ();
3105	}
3106	}
3107
3108	const bool HasMaxBytesOverride =
3109	MaxBytesForAlignmentOverride.getNumOccurrences() > `0`;
3110
3111	if (AlignAllBlock)
3112	// Align all of the blocks in the function to a specific alignment.
3113	for (MachineBasicBlock &MBB : *F) {
3114	if (HasMaxBytesOverride)
3115	MBB.setAlignment(A: Align (`1ULL` << AlignAllBlock),
3116	MaxBytes: MaxBytesForAlignmentOverride);
3117	else
3118	MBB.setAlignment(Align (`1ULL` << AlignAllBlock));
3119	}
3120	else if (AlignAllNonFallThruBlocks) {
3121	// Align all of the blocks that have no fall-through predecessors to a
3122	// specific alignment.
3123	for (auto MBI = std::next(x: F->begin()), MBE = F->end(); MBI != MBE; ++MBI) {
3124	auto LayoutPred = std::prev(x: MBI);
3125	if (!LayoutPred ->isSuccessor(MBB: &*MBI)) {
3126	if (HasMaxBytesOverride)
3127	MBI ->setAlignment(A: Align (`1ULL` << AlignAllNonFallThruBlocks),
3128	MaxBytes: MaxBytesForAlignmentOverride);
3129	else
3130	MBI ->setAlignment(Align (`1ULL` << AlignAllNonFallThruBlocks));
3131	}
3132	}
3133	}
3134	}
3135
3136	/// Tail duplicate \p BB into (some) predecessors if profitable, repeating if
3137	/// it was duplicated into its chain predecessor and removed.
3138	/// \p BB - Basic block that may be duplicated.
3139	///
3140	/// \p LPred - Chosen layout predecessor of \p BB.
3141	/// Updated to be the chain end if LPred is removed.
3142	/// \p Chain - Chain to which \p LPred belongs, and \p BB will belong.
3143	/// \p BlockFilter - Set of blocks that belong to the loop being laid out.
3144	/// Used to identify which blocks to update predecessor
3145	/// counts.
3146	/// \p PrevUnplacedBlockIt - Iterator pointing to the last block that was
3147	/// chosen in the given order due to unnatural CFG
3148	/// only needed if \p BB is removed and
3149	/// \p PrevUnplacedBlockIt pointed to \p BB.
3150	/// @return true if \p BB was removed.
3151	bool MachineBlockPlacement::repeatedlyTailDuplicateBlock(
3152	MachineBasicBlock BB, MachineBasicBlock &LPred,
3153	const MachineBasicBlock *LoopHeaderBB, BlockChain &Chain,
3154	BlockFilterSet *BlockFilter, MachineFunction::iterator &PrevUnplacedBlockIt,
3155	BlockFilterSet::iterator &PrevUnplacedBlockInFilterIt) {
3156	bool Removed, DuplicatedToLPred;
3157	bool DuplicatedToOriginalLPred;
3158	Removed = maybeTailDuplicateBlock(
3159	BB, LPred, Chain, BlockFilter, PrevUnplacedBlockIt,
3160	PrevUnplacedBlockInFilterIt, DuplicatedToLPred);
3161	if (!Removed)
3162	return false;
3163	DuplicatedToOriginalLPred = DuplicatedToLPred;
3164	// Iteratively try to duplicate again. It can happen that a block that is
3165	// duplicated into is still small enough to be duplicated again.
3166	// No need to call markBlockSuccessors in this case, as the blocks being
3167	// duplicated from here on are already scheduled.
3168	while (DuplicatedToLPred && Removed) {
3169	MachineBasicBlock DupBB, DupPred;
3170	// The removal callback causes Chain.end() to be updated when a block is
3171	// removed. On the first pass through the loop, the chain end should be the
3172	// same as it was on function entry. On subsequent passes, because we are
3173	// duplicating the block at the end of the chain, if it is removed the
3174	// chain will have shrunk by one block.
3175	BlockChain::iterator ChainEnd = Chain.end();
3176	DupBB = *(--ChainEnd);
3177	// Now try to duplicate again.
3178	if (ChainEnd == Chain.begin())
3179	break;
3180	DupPred = *std::prev(x: ChainEnd);
3181	Removed = maybeTailDuplicateBlock(
3182	BB: DupBB, LPred: DupPred, Chain, BlockFilter, PrevUnplacedBlockIt,
3183	PrevUnplacedBlockInFilterIt, DuplicatedToLPred);
3184	}
3185	// If BB was duplicated into LPred, it is now scheduled. But because it was
3186	// removed, markChainSuccessors won't be called for its chain. Instead we
3187	// call markBlockSuccessors for LPred to achieve the same effect. This must go
3188	// at the end because repeating the tail duplication can increase the number
3189	// of unscheduled predecessors.
3190	LPred = *std::prev(x: Chain.end());
3191	if (DuplicatedToOriginalLPred)
3192	markBlockSuccessors(Chain, MBB: LPred, LoopHeaderBB, BlockFilter);
3193	return true;
3194	}
3195
3196	/// Tail duplicate \p BB into (some) predecessors if profitable.
3197	/// \p BB - Basic block that may be duplicated
3198	/// \p LPred - Chosen layout predecessor of \p BB
3199	/// \p Chain - Chain to which \p LPred belongs, and \p BB will belong.
3200	/// \p BlockFilter - Set of blocks that belong to the loop being laid out.
3201	/// Used to identify which blocks to update predecessor
3202	/// counts.
3203	/// \p PrevUnplacedBlockIt - Iterator pointing to the last block that was
3204	/// chosen in the given order due to unnatural CFG
3205	/// only needed if \p BB is removed and
3206	/// \p PrevUnplacedBlockIt pointed to \p BB.
3207	/// \p DuplicatedToLPred - True if the block was duplicated into LPred.
3208	/// \return - True if the block was duplicated into all preds and removed.
3209	bool MachineBlockPlacement::maybeTailDuplicateBlock(
3210	MachineBasicBlock BB, MachineBasicBlock LPred, BlockChain &Chain,
3211	BlockFilterSet *BlockFilter, MachineFunction::iterator &PrevUnplacedBlockIt,
3212	BlockFilterSet::iterator &PrevUnplacedBlockInFilterIt,
3213	bool &DuplicatedToLPred) {
3214	DuplicatedToLPred = false;
3215	if (!shouldTailDuplicate(BB))
3216	return false;
3217
3218	LLVM_DEBUG(dbgs() << "Redoing tail duplication for Succ#" << BB->getNumber()
3219	<< "\n");
3220
3221	// This has to be a callback because none of it can be done after
3222	// BB is deleted.
3223	bool Removed = false;
3224	auto RemovalCallback = [&](MachineBasicBlock *RemBB) {
3225	// Signal to outer function
3226	Removed = true;
3227
3228	// Remove from the Chain and Chain Map
3229	if (auto It = BlockToChain.find(Val: RemBB); It != BlockToChain.end()) {
3230	It ->second->remove(BB: RemBB);
3231	BlockToChain.erase(I: It);
3232	}
3233
3234	// Handle the unplaced block iterator
3235	if (&(*PrevUnplacedBlockIt) == RemBB) {
3236	PrevUnplacedBlockIt ++;
3237	}
3238
3239	// Handle the Work Lists
3240	if (RemBB->isEHPad()) {
3241	llvm::erase(C&: EHPadWorkList, V: RemBB);
3242	} else {
3243	llvm::erase(C&: BlockWorkList, V: RemBB);
3244	}
3245
3246	// Handle the filter set
3247	if (BlockFilter) {
3248	auto It = llvm::find(Range&: *BlockFilter, Val: RemBB);
3249	// Erase RemBB from BlockFilter, and keep PrevUnplacedBlockInFilterIt
3250	// pointing to the same element as before.
3251	if (It != BlockFilter->end()) {
3252	if (It < PrevUnplacedBlockInFilterIt) {
3253	const MachineBasicBlock PrevBB = PrevUnplacedBlockInFilterIt;
3254	// BlockFilter is a SmallVector so all elements after RemBB are
3255	// shifted to the front by 1 after its deletion.
3256	auto Distance = PrevUnplacedBlockInFilterIt - It - `1`;
3257	PrevUnplacedBlockInFilterIt = BlockFilter->erase(I: It) + Distance;
3258	assert(*PrevUnplacedBlockInFilterIt == PrevBB);
3259	(void)PrevBB;
3260	} else if (It == PrevUnplacedBlockInFilterIt)
3261	// The block pointed by PrevUnplacedBlockInFilterIt is erased, we
3262	// have to set it to the next element.
3263	PrevUnplacedBlockInFilterIt = BlockFilter->erase(I: It);
3264	else
3265	BlockFilter->erase(I: It);
3266	}
3267	}
3268
3269	// Remove the block from loop info.
3270	MLI->removeBlock(BB: RemBB);
3271	if (RemBB == PreferredLoopExit)
3272	PreferredLoopExit = nullptr;
3273
3274	LLVM_DEBUG(dbgs() << "TailDuplicator deleted block: " << getBlockName(RemBB)
3275	<< "\n");
3276	};
3277	auto RemovalCallbackRef =
3278	function_ref<void(MachineBasicBlock *)>(RemovalCallback);
3279
3280	SmallVector<MachineBasicBlock *, `8`> DuplicatedPreds;
3281	bool IsSimple = TailDup.isSimpleBB(TailBB: BB);
3282	SmallVector<MachineBasicBlock *, `8`> CandidatePreds;
3283	SmallVectorImpl<MachineBasicBlock > CandidatePtr = nullptr;
3284	if (F->getFunction().hasProfileData()) {
3285	// We can do partial duplication with precise profile information.
3286	findDuplicateCandidates(Candidates&: CandidatePreds, BB, BlockFilter);
3287	if (CandidatePreds.size() == `0`)
3288	return false;
3289	if (CandidatePreds.size() < BB->pred_size())
3290	CandidatePtr = &CandidatePreds;
3291	}
3292	TailDup.tailDuplicateAndUpdate(IsSimple, MBB: BB, ForcedLayoutPred: LPred, DuplicatedPreds: &DuplicatedPreds,
3293	RemovalCallback: &RemovalCallbackRef, CandidatePtr);
3294
3295	// Update UnscheduledPredecessors to reflect tail-duplication.
3296	DuplicatedToLPred = false;
3297	for (MachineBasicBlock *Pred : DuplicatedPreds) {
3298	// We're only looking for unscheduled predecessors that match the filter.
3299	BlockChain *PredChain = BlockToChain [Pred];
3300	if (Pred == LPred)
3301	DuplicatedToLPred = true;
3302	if (Pred == LPred \|\| (BlockFilter && !BlockFilter->count(key: Pred)) \|\|
3303	PredChain == &Chain)
3304	continue;
3305	for (MachineBasicBlock *NewSucc : Pred->successors()) {
3306	if (BlockFilter && !BlockFilter->count(key: NewSucc))
3307	continue;
3308	BlockChain *NewChain = BlockToChain [NewSucc];
3309	if (NewChain != &Chain && NewChain != PredChain)
3310	NewChain->UnscheduledPredecessors++;
3311	}
3312	}
3313	return Removed;
3314	}
3315
3316	// Count the number of actual machine instructions.
3317	static uint64_t countMBBInstruction(MachineBasicBlock *MBB) {
3318	uint64_t InstrCount = `0`;
3319	for (MachineInstr &MI : *MBB) {
3320	if (!MI.isPHI() && !MI.isMetaInstruction())
3321	InstrCount += `1`;
3322	}
3323	return InstrCount;
3324	}
3325
3326	// The size cost of duplication is the instruction size of the duplicated block.
3327	// So we should scale the threshold accordingly. But the instruction size is not
3328	// available on all targets, so we use the number of instructions instead.
3329	BlockFrequency MachineBlockPlacement::scaleThreshold(MachineBasicBlock *BB) {
3330	return BlockFrequency (DupThreshold.getFrequency() * countMBBInstruction(MBB: BB));
3331	}
3332
3333	// Returns true if BB is Pred's best successor.
3334	bool MachineBlockPlacement::isBestSuccessor(MachineBasicBlock *BB,
3335	MachineBasicBlock *Pred,
3336	BlockFilterSet *BlockFilter) {
3337	if (BB == Pred)
3338	return false;
3339	if (BlockFilter && !BlockFilter->count(key: Pred))
3340	return false;
3341	BlockChain *PredChain = BlockToChain [Pred];
3342	if (PredChain && (Pred != *std::prev(x: PredChain->end())))
3343	return false;
3344
3345	// Find the successor with largest probability excluding BB.
3346	BranchProbability BestProb = BranchProbability::getZero();
3347	for (MachineBasicBlock *Succ : Pred->successors())
3348	if (Succ != BB) {
3349	if (BlockFilter && !BlockFilter->count(key: Succ))
3350	continue;
3351	BlockChain *SuccChain = BlockToChain [Succ];
3352	if (SuccChain && (Succ != *SuccChain->begin()))
3353	continue;
3354	BranchProbability SuccProb = MBPI->getEdgeProbability(Src: Pred, Dst: Succ);
3355	if (SuccProb > BestProb)
3356	BestProb = SuccProb;
3357	}
3358
3359	BranchProbability BBProb = MBPI->getEdgeProbability(Src: Pred, Dst: BB);
3360	if (BBProb <= BestProb)
3361	return false;
3362
3363	// Compute the number of reduced taken branches if Pred falls through to BB
3364	// instead of another successor. Then compare it with threshold.
3365	BlockFrequency PredFreq = getBlockCountOrFrequency(BB: Pred);
3366	BlockFrequency Gain = PredFreq * (BBProb - BestProb);
3367	return Gain > scaleThreshold(BB);
3368	}
3369
3370	// Find out the predecessors of BB and BB can be beneficially duplicated into
3371	// them.
3372	void MachineBlockPlacement::findDuplicateCandidates(
3373	SmallVectorImpl<MachineBasicBlock > &Candidates, MachineBasicBlock BB,
3374	BlockFilterSet *BlockFilter) {
3375	MachineBasicBlock Fallthrough = nullptr*;
3376	BranchProbability DefaultBranchProb = BranchProbability::getZero();
3377	BlockFrequency BBDupThreshold(scaleThreshold(BB));
3378	SmallVector<MachineBasicBlock *, `8`> Preds(BB->predecessors());
3379	SmallVector<MachineBasicBlock *, `8`> Succs(BB->successors());
3380
3381	// Sort for highest frequency.
3382	auto CmpSucc = [&](MachineBasicBlock A, MachineBasicBlock B) {
3383	return MBPI->getEdgeProbability(Src: BB, Dst: A) > MBPI->getEdgeProbability(Src: BB, Dst: B);
3384	};
3385	auto CmpPred = [&](MachineBasicBlock A, MachineBasicBlock B) {
3386	return MBFI ->getBlockFreq(MBB: A) > MBFI ->getBlockFreq(MBB: B);
3387	};
3388	llvm::stable_sort(Range&: Succs, C: CmpSucc);
3389	llvm::stable_sort(Range&: Preds, C: CmpPred);
3390
3391	auto SuccIt = Succs.begin();
3392	if (SuccIt != Succs.end()) {
3393	DefaultBranchProb = MBPI->getEdgeProbability(Src: BB, Dst: *SuccIt).getCompl();
3394	}
3395
3396	// For each predecessors of BB, compute the benefit of duplicating BB,
3397	// if it is larger than the threshold, add it into Candidates.
3398	//
3399	// If we have following control flow.
3400	//
3401	// PB1 PB2 PB3 PB4
3402	// \ \| / /\
3403	// \ \| / / \
3404	// \ \|/ / \
3405	// BB----/ OB
3406	// /\
3407	// / \
3408	// SB1 SB2
3409	//
3410	// And it can be partially duplicated as
3411	//
3412	// PB2+BB
3413	// \| PB1 PB3 PB4
3414	// \| \| / /\
3415	// \| \| / / \
3416	// \| \|/ / \
3417	// \| BB----/ OB
3418	// \|\ /\|
3419	// \| X \|
3420	// \|/ \\|
3421	// SB2 SB1
3422	//
3423	// The benefit of duplicating into a predecessor is defined as
3424	// Orig_taken_branch - Duplicated_taken_branch
3425	//
3426	// The Orig_taken_branch is computed with the assumption that predecessor
3427	// jumps to BB and the most possible successor is laid out after BB.
3428	//
3429	// The Duplicated_taken_branch is computed with the assumption that BB is
3430	// duplicated into PB, and one successor is layout after it (SB1 for PB1 and
3431	// SB2 for PB2 in our case). If there is no available successor, the combined
3432	// block jumps to all BB's successor, like PB3 in this example.
3433	//
3434	// If a predecessor has multiple successors, so BB can't be duplicated into
3435	// it. But it can beneficially fall through to BB, and duplicate BB into other
3436	// predecessors.
3437	for (MachineBasicBlock *Pred : Preds) {
3438	BlockFrequency PredFreq = getBlockCountOrFrequency(BB: Pred);
3439
3440	if (!TailDup.canTailDuplicate(TailBB: BB, PredBB: Pred)) {
3441	// BB can't be duplicated into Pred, but it is possible to be layout
3442	// below Pred.
3443	if (!Fallthrough && isBestSuccessor(BB, Pred, BlockFilter)) {
3444	Fallthrough = Pred;
3445	if (SuccIt != Succs.end())
3446	SuccIt++;
3447	}
3448	continue;
3449	}
3450
3451	BlockFrequency OrigCost = PredFreq + PredFreq * DefaultBranchProb;
3452	BlockFrequency DupCost;
3453	if (SuccIt == Succs.end()) {
3454	// Jump to all successors;
3455	if (Succs.size() > `0`)
3456	DupCost += PredFreq;
3457	} else {
3458	// Fallthrough to SuccIt, jump to all other successors;*
3459	DupCost += PredFreq;
3460	DupCost -= PredFreq * MBPI->getEdgeProbability(Src: BB, Dst: *SuccIt);
3461	}
3462
3463	assert(OrigCost >= DupCost);
3464	OrigCost -= DupCost;
3465	if (OrigCost > BBDupThreshold) {
3466	Candidates.push_back(Elt: Pred);
3467	if (SuccIt != Succs.end())
3468	SuccIt++;
3469	}
3470	}
3471
3472	// No predecessors can optimally fallthrough to BB.
3473	// So we can change one duplication into fallthrough.
3474	if (!Fallthrough) {
3475	if ((Candidates.size() < Preds.size()) && (Candidates.size() > `0`)) {
3476	Candidates [`0`] = Candidates.back();
3477	Candidates.pop_back();
3478	}
3479	}
3480	}
3481
3482	void MachineBlockPlacement::initTailDupThreshold() {
3483	DupThreshold = BlockFrequency (`0`);
3484	if (F->getFunction().hasProfileData()) {
3485	// We prefer to use prifile count.
3486	uint64_t HotThreshold = PSI->getOrCompHotCountThreshold();
3487	if (HotThreshold != UINT64_MAX) {
3488	UseProfileCount = true;
3489	DupThreshold =
3490	BlockFrequency (HotThreshold * TailDupProfilePercentThreshold / `100`);
3491	} else {
3492	// Profile count is not available, we can use block frequency instead.
3493	BlockFrequency MaxFreq = BlockFrequency (`0`);
3494	for (MachineBasicBlock &MBB : *F) {
3495	BlockFrequency Freq = MBFI ->getBlockFreq(MBB: &MBB);
3496	if (Freq > MaxFreq)
3497	MaxFreq = Freq;
3498	}
3499
3500	BranchProbability ThresholdProb(TailDupPlacementPenalty, `100`);
3501	DupThreshold = BlockFrequency(MaxFreq * ThresholdProb);
3502	UseProfileCount = false;
3503	}
3504	}
3505
3506	TailDupSize = TailDupPlacementThreshold;
3507	// If only the aggressive threshold is explicitly set, use it.
3508	if (TailDupPlacementAggressiveThreshold.getNumOccurrences() != `0` &&
3509	TailDupPlacementThreshold.getNumOccurrences() == `0`)
3510	TailDupSize = TailDupPlacementAggressiveThreshold;
3511
3512	// For aggressive optimization, we can adjust some thresholds to be less
3513	// conservative.
3514	if (OptLevel >= CodeGenOptLevel::Aggressive) {
3515	// At O3 we should be more willing to copy blocks for tail duplication. This
3516	// increases size pressure, so we only do it at O3
3517	// Do this unless only the regular threshold is explicitly set.
3518	if (TailDupPlacementThreshold.getNumOccurrences() == `0` \|\|
3519	TailDupPlacementAggressiveThreshold.getNumOccurrences() != `0`)
3520	TailDupSize = TailDupPlacementAggressiveThreshold;
3521	}
3522
3523	// If there's no threshold provided through options, query the target
3524	// information for a threshold instead.
3525	if (TailDupPlacementThreshold.getNumOccurrences() == `0` &&
3526	(OptLevel < CodeGenOptLevel::Aggressive \|\|
3527	TailDupPlacementAggressiveThreshold.getNumOccurrences() == `0`))
3528	TailDupSize = TII->getTailDuplicateSize(OptLevel);
3529	}
3530
3531	PreservedAnalyses
3532	MachineBlockPlacementPass::run(MachineFunction &MF,
3533	MachineFunctionAnalysisManager &MFAM) {
3534	auto *MBPI = &MFAM.getResult<MachineBranchProbabilityAnalysis>(IR&: MF);
3535	auto MBFI = std::make_unique<MBFIWrapper>(
3536	args&: MFAM.getResult<MachineBlockFrequencyAnalysis>(IR&: MF));
3537	auto *MLI = &MFAM.getResult<MachineLoopAnalysis>(IR&: MF);
3538	auto *MPDT = MachineBlockPlacement::allowTailDupPlacement(MF)
3539	? &MFAM.getResult<MachinePostDominatorTreeAnalysis>(IR&: MF)
3540	: nullptr;
3541	auto *MDT = MFAM.getCachedResult<MachineDominatorTreeAnalysis>(IR&: MF);
3542	auto *PSI = MFAM.getResult<ModuleAnalysisManagerMachineFunctionProxy>(IR&: MF)
3543	.getCachedResult<ProfileSummaryAnalysis>(
3544	IR&: *MF.getFunction().getParent());
3545	if (!PSI)
3546	report_fatal_error(reason: "MachineBlockPlacement requires ProfileSummaryAnalysis",
3547	gen_crash_diag: false);
3548	MachineBlockPlacement MBP(MBPI, MLI, PSI, std::move(MBFI), MPDT,
3549	AllowTailMerge);
3550
3551	if (MBP.run(F&: MF))
3552	return getMachineFunctionPassPreservedAnalyses();
3553
3554	if (MDT)
3555	MDT->updateBlockNumbers();
3556	if (MPDT)
3557	MPDT->updateBlockNumbers();
3558	return PreservedAnalyses::all();
3559	}
3560
3561	void MachineBlockPlacementPass::printPipeline(
3562	raw_ostream &OS,
3563	function_ref<StringRef(StringRef)> MapClassName2PassName) const {
3564	OS << MapClassName2PassName (name());
3565	if (!AllowTailMerge)
3566	OS << "<no-tail-merge>";
3567	}
3568
3569	bool MachineBlockPlacement::run(MachineFunction &MF) {
3570
3571	// Check for single-block functions and skip them.
3572	if (std::next(x: MF.begin()) == MF.end())
3573	return false;
3574
3575	F = &MF;
3576	OptLevel = F->getTarget().getOptLevel();
3577
3578	TII = MF.getSubtarget().getInstrInfo();
3579	TLI = MF.getSubtarget().getTargetLowering();
3580
3581	// Initialize PreferredLoopExit to nullptr here since it may never be set if
3582	// there are no MachineLoops.
3583	PreferredLoopExit = nullptr;
3584
3585	assert(BlockToChain.empty() &&
3586	"BlockToChain map should be empty before starting placement.");
3587	assert(ComputedEdges.empty() &&
3588	"Computed Edge map should be empty before starting placement.");
3589
3590	// Initialize tail duplication thresholds.
3591	initTailDupThreshold();
3592
3593	const bool OptForSize =
3594	llvm::shouldOptimizeForSize(MF: &MF, PSI, BFI: &MBFI ->getMBFI());
3595	// Determine whether to use ext-tsp for perf/size optimization. The method
3596	// is beneficial only for instances with at least 3 basic blocks and it can be
3597	// disabled for huge functions (exceeding a certain size).
3598	bool UseExtTspForPerf = false;
3599	bool UseExtTspForSize = false;
3600	if (`3` <= MF.size() && MF.size() <= ExtTspBlockPlacementMaxBlocks) {
3601	UseExtTspForPerf =
3602	EnableExtTspBlockPlacement &&
3603	(ApplyExtTspWithoutProfile \|\| MF.getFunction().hasProfileData());
3604	UseExtTspForSize = OptForSize && ApplyExtTspForSize;
3605	}
3606
3607	// Apply tail duplication.
3608	if (allowTailDupPlacement(MF&: *F)) {
3609	if (OptForSize)
3610	TailDupSize = `1`;
3611	const bool PreRegAlloc = false;
3612	TailDup.initMF(MF, PreRegAlloc, MBPI, MBFI: MBFI.get(), PSI,
3613	/ LayoutMode / true, TailDupSize);
3614	if (!UseExtTspForSize)
3615	precomputeTriangleChains();
3616	}
3617
3618	// Run the main block placement.
3619	if (!UseExtTspForSize)
3620	buildCFGChains();
3621
3622	// Changing the layout can create new tail merging opportunities.
3623	// TailMerge can create jump into if branches that make CFG irreducible for
3624	// HW that requires structured CFG.
3625	const bool EnableTailMerge = !MF.getTarget().requiresStructuredCFG() &&
3626	AllowTailMerge && BranchFoldPlacement &&
3627	MF.size() > `3`;
3628	// No tail merging opportunities if the block number is less than four.
3629	if (EnableTailMerge) {
3630	const unsigned TailMergeSize = TailDupSize + `1`;
3631	BranchFolder BF(/DefaultEnableTailMerge=/true, /CommonHoist=/false,
3632	MBFI, MBPI, PSI, TailMergeSize);
3633
3634	if (BF.OptimizeFunction(MF, tii: TII, tri: MF.getSubtarget().getRegisterInfo(), mli: MLI,
3635	/AfterPlacement=/true)) {
3636	// Must redo the post-dominator tree if blocks were changed.
3637	if (MPDT)
3638	MPDT->recalculate(Func&: MF);
3639	if (!UseExtTspForSize) {
3640	// Redo the layout if tail merging creates/removes/moves blocks.
3641	BlockToChain.clear();
3642	ComputedEdges.clear();
3643	ChainAllocator.DestroyAll();
3644	buildCFGChains();
3645	}
3646	}
3647	}
3648
3649	// Apply a post-processing optimizing block placement:
3650	// - find a new placement and modify the layout of the blocks in the function;
3651	// - re-create CFG chains so that we can optimizeBranches and alignBlocks.
3652	if (UseExtTspForPerf \|\| UseExtTspForSize) {
3653	assert(
3654	!(UseExtTspForPerf && UseExtTspForSize) &&
3655	"UseExtTspForPerf and UseExtTspForSize can not be set simultaneously");
3656	applyExtTsp(/OptForSize=/UseExtTspForSize);
3657	createCFGChainExtTsp();
3658	}
3659
3660	optimizeBranches();
3661	alignBlocks();
3662
3663	BlockToChain.clear();
3664	ComputedEdges.clear();
3665	ChainAllocator.DestroyAll();
3666
3667	// View the function.
3668	if (ViewBlockLayoutWithBFI != GVDT_None &&
3669	(ViewBlockFreqFuncName.empty() \|\|
3670	F->getFunction().getName() == ViewBlockFreqFuncName)) {
3671	if (RenumberBlocksBeforeView)
3672	MF.RenumberBlocks();
3673	MBFI ->view(Name: "MBP." + MF.getName(), isSimple: false);
3674	}
3675
3676	// We always return true as we have no way to track whether the final order
3677	// differs from the original order.
3678	return true;
3679	}
3680
3681	void MachineBlockPlacement::applyExtTsp(bool OptForSize) {
3682	// Prepare data; blocks are indexed by their index in the current ordering.
3683	DenseMap<const MachineBasicBlock *, uint64_t> BlockIndex;
3684	BlockIndex.reserve(NumEntries: F->size());
3685	std::vector<const MachineBasicBlock *> CurrentBlockOrder;
3686	CurrentBlockOrder.reserve(n: F->size());
3687	size_t NumBlocks = `0`;
3688	for (const MachineBasicBlock &MBB : *F) {
3689	BlockIndex [&MBB] = NumBlocks++;
3690	CurrentBlockOrder.push_back(x: &MBB);
3691	}
3692
3693	SmallVector<uint64_t, `0`> BlockCounts(F->size());
3694	SmallVector<uint64_t, `0`> BlockSizes(F->size());
3695	SmallVector<codelayout::EdgeCount, `0`> JumpCounts;
3696	SmallVector<MachineOperand, `4`> Cond; // For analyzeBranch.
3697	SmallVector<const MachineBasicBlock *, `4`> Succs;
3698	for (MachineBasicBlock &MBB : *F) {
3699	// Getting the block frequency.
3700	BlockFrequency BlockFreq = MBFI ->getBlockFreq(MBB: &MBB);
3701	BlockCounts [BlockIndex [&MBB]] = OptForSize ? `1` : BlockFreq.getFrequency();
3702	// Getting the block size:
3703	// - approximate the size of an instruction by 4 bytes, and
3704	// - ignore debug instructions.
3705	// Note: getting the exact size of each block is target-dependent and can be
3706	// done by extending the interface of MCCodeEmitter. Experimentally we do
3707	// not see a perf improvement with the exact block sizes.
3708	auto NonDbgInsts =
3709	instructionsWithoutDebug(It: MBB.instr_begin(), End: MBB.instr_end());
3710	size_t NumInsts = std::distance(first: NonDbgInsts.begin(), last: NonDbgInsts.end());
3711	BlockSizes [BlockIndex [&MBB]] = `4` * NumInsts;
3712
3713	// Getting jump frequencies.
3714	if (OptForSize) {
3715	Cond.clear();
3716	MachineBasicBlock TBB = nullptr, FBB = nullptr; // For analyzeBranch.
3717	if (TII->analyzeBranch(MBB, TBB, FBB, Cond))
3718	continue;
3719
3720	const MachineBasicBlock *FTB = MBB.getFallThrough();
3721	// Succs is a collection of distinct destinations of the block reachable
3722	// from MBB via a jump instruction; initialize the list using the three
3723	// (non-necessarily distinct) blocks, FTB, TBB, and FBB.
3724	Succs.clear();
3725	if (TBB && TBB != FTB)
3726	Succs.push_back(Elt: TBB);
3727	if (FBB && FBB != FTB)
3728	Succs.push_back(Elt: FBB);
3729	if (FTB)
3730	Succs.push_back(Elt: FTB);
3731	// Absolute magnitude of non-zero counts does not matter for the
3732	// optimization; prioritize slightly jumps with a single successor, since
3733	// the corresponding jump instruction will be removed from the binary.
3734	const uint64_t Freq = Succs.size() == `1` ? `110` : `100`;
3735	for (const MachineBasicBlock *Succ : Succs)
3736	JumpCounts.push_back(Elt: {.src: BlockIndex [&MBB], .dst: BlockIndex [Succ], .count: Freq});
3737	} else {
3738	for (MachineBasicBlock *Succ : MBB.successors()) {
3739	auto EP = MBPI->getEdgeProbability(Src: &MBB, Dst: Succ);
3740	BlockFrequency JumpFreq = BlockFreq * EP;
3741	JumpCounts.push_back(
3742	Elt: {.src: BlockIndex [&MBB], .dst: BlockIndex [Succ], .count: JumpFreq.getFrequency()});
3743	}
3744	}
3745	}
3746
3747	LLVM_DEBUG(dbgs() << "Applying ext-tsp layout for \|V\| = " << F->size()
3748	<< " with profile = " << F->getFunction().hasProfileData()
3749	<< " (" << F->getName() << ")" << "\n");
3750
3751	const double OrgScore = calcExtTspScore(NodeSizes: BlockSizes, EdgeCounts: JumpCounts);
3752	LLVM_DEBUG(dbgs() << format(" original layout score: %0.2f\n", OrgScore));
3753
3754	// Run the layout algorithm.
3755	auto NewOrder = computeExtTspLayout(NodeSizes: BlockSizes, NodeCounts: BlockCounts, EdgeCounts: JumpCounts);
3756	std::vector<const MachineBasicBlock *> NewBlockOrder;
3757	NewBlockOrder.reserve(n: F->size());
3758	for (uint64_t Node : NewOrder) {
3759	NewBlockOrder.push_back(x: CurrentBlockOrder [Node]);
3760	}
3761	const double OptScore = calcExtTspScore(Order: NewOrder, NodeSizes: BlockSizes, EdgeCounts: JumpCounts);
3762	LLVM_DEBUG(dbgs() << format(" optimized layout score: %0.2f\n", OptScore));
3763
3764	// If the optimization is unsuccessful, fall back to the original block order.
3765	if (OptForSize && OrgScore > OptScore)
3766	assignBlockOrder(NewOrder: CurrentBlockOrder);
3767	else
3768	assignBlockOrder(NewOrder: NewBlockOrder);
3769	}
3770
3771	void MachineBlockPlacement::assignBlockOrder(
3772	const std::vector<const MachineBasicBlock *> &NewBlockOrder) {
3773	assert(F->size() == NewBlockOrder.size() && "Incorrect size of block order");
3774	F->RenumberBlocks();
3775	// At this point, we possibly removed blocks from the function, so we can't
3776	// renumber the domtree. At this point, we don't need it anymore, though.
3777	// TODO: move this to the point where the dominator tree is actually
3778	// invalidated (i.e., where blocks are removed without updating the domtree).
3779	MPDT = nullptr;
3780
3781	bool HasChanges = false;
3782	for (size_t I = `0`; I < NewBlockOrder.size(); I++) {
3783	if (NewBlockOrder [I] != F->getBlockNumbered(N: I)) {
3784	HasChanges = true;
3785	break;
3786	}
3787	}
3788	// Stop early if the new block order is identical to the existing one.
3789	if (!HasChanges)
3790	return;
3791
3792	SmallVector<MachineBasicBlock *, `4`> PrevFallThroughs(F->getNumBlockIDs());
3793	for (auto &MBB : *F) {
3794	PrevFallThroughs [MBB.getNumber()] = MBB.getFallThrough();
3795	}
3796
3797	// Sort basic blocks in the function according to the computed order.
3798	DenseMap<const MachineBasicBlock *, size_t> NewIndex;
3799	for (const MachineBasicBlock *MBB : NewBlockOrder) {
3800	NewIndex [MBB] = NewIndex.size();
3801	}
3802	F->sort(comp: [&](MachineBasicBlock &L, MachineBasicBlock &R) {
3803	return NewIndex [&L] < NewIndex [&R];
3804	});
3805
3806	// Update basic block branches by inserting explicit fallthrough branches
3807	// when required and re-optimize branches when possible.
3808	const TargetInstrInfo *TII = F->getSubtarget().getInstrInfo();
3809	SmallVector<MachineOperand, `4`> Cond;
3810	for (auto &MBB : *F) {
3811	MachineFunction::iterator NextMBB = std::next(x: MBB.getIterator());
3812	MachineFunction::iterator EndIt = MBB.getParent()->end();
3813	auto *FTMBB = PrevFallThroughs [MBB.getNumber()];
3814	// If this block had a fallthrough before we need an explicit unconditional
3815	// branch to that block if the fallthrough block is not adjacent to the
3816	// block in the new order.
3817	if (FTMBB && (NextMBB == EndIt \|\| &*NextMBB != FTMBB)) {
3818	TII->insertUnconditionalBranch(MBB, DestBB: FTMBB, DL: MBB.findBranchDebugLoc());
3819	}
3820
3821	// It might be possible to optimize branches by flipping the condition.
3822	Cond.clear();
3823	MachineBasicBlock TBB = nullptr, FBB = nullptr;
3824	if (TII->analyzeBranch(MBB, TBB, FBB, Cond))
3825	continue;
3826	MBB.updateTerminator(PreviousLayoutSuccessor: FTMBB);
3827	}
3828	}
3829
3830	void MachineBlockPlacement::createCFGChainExtTsp() {
3831	BlockToChain.clear();
3832	ComputedEdges.clear();
3833	ChainAllocator.DestroyAll();
3834
3835	MachineBasicBlock *HeadBB = &F->front();
3836	BlockChain *FunctionChain =
3837	new (ChainAllocator.Allocate()) BlockChain (BlockToChain, HeadBB);
3838
3839	for (MachineBasicBlock &MBB : *F) {
3840	if (HeadBB == &MBB)
3841	continue; // Ignore head of the chain
3842	FunctionChain->merge(BB: &MBB, Chain: nullptr);
3843	}
3844	}
3845
3846	namespace {
3847
3848	/// A pass to compute block placement statistics.
3849	///
3850	/// A separate pass to compute interesting statistics for evaluating block
3851	/// placement. This is separate from the actual placement pass so that they can
3852	/// be computed in the absence of any placement transformations or when using
3853	/// alternative placement strategies.
3854	class MachineBlockPlacementStats {
3855	/// A handle to the branch probability pass.
3856	const MachineBranchProbabilityInfo *MBPI;
3857
3858	/// A handle to the function-wide block frequency pass.
3859	const MachineBlockFrequencyInfo *MBFI;
3860
3861	public:
3862	MachineBlockPlacementStats(const MachineBranchProbabilityInfo *MBPI,
3863	const MachineBlockFrequencyInfo *MBFI)
3864	: MBPI(MBPI), MBFI(MBFI) {}
3865	bool run(MachineFunction &MF);
3866	};
3867
3868	class MachineBlockPlacementStatsLegacy : public MachineFunctionPass {
3869	public:
3870	static char ID; // Pass identification, replacement for typeid
3871
3872	MachineBlockPlacementStatsLegacy() : MachineFunctionPass (ID) {}
3873
3874	bool runOnMachineFunction(MachineFunction &F) override {
3875	auto *MBPI =
3876	&getAnalysis<MachineBranchProbabilityInfoWrapperPass>().getMBPI();
3877	auto *MBFI = &getAnalysis<MachineBlockFrequencyInfoWrapperPass>().getMBFI();
3878	return MachineBlockPlacementStats (MBPI, MBFI).run(MF&: F);
3879	}
3880
3881	void getAnalysisUsage(AnalysisUsage &AU) const override {
3882	AU.addRequired<MachineBranchProbabilityInfoWrapperPass>();
3883	AU.addRequired<MachineBlockFrequencyInfoWrapperPass>();
3884	AU.setPreservesAll();
3885	MachineFunctionPass::getAnalysisUsage(AU);
3886	}
3887	};
3888
3889	} // end anonymous namespace
3890
3891	char MachineBlockPlacementStatsLegacy::ID = `0`;
3892
3893	char &llvm::MachineBlockPlacementStatsID = MachineBlockPlacementStatsLegacy::ID;
3894
3895	INITIALIZE_PASS_BEGIN(MachineBlockPlacementStatsLegacy, "block-placement-stats",
3896	"Basic Block Placement Stats", false, false)
3897	INITIALIZE_PASS_DEPENDENCY(MachineBranchProbabilityInfoWrapperPass)
3898	INITIALIZE_PASS_DEPENDENCY(MachineBlockFrequencyInfoWrapperPass)
3899	INITIALIZE_PASS_END(MachineBlockPlacementStatsLegacy, "block-placement-stats",
3900	"Basic Block Placement Stats", false, false)
3901
3902	PreservedAnalyses
3903	MachineBlockPlacementStatsPass::run(MachineFunction &MF,
3904	MachineFunctionAnalysisManager &MFAM) {
3905	auto &MBPI = MFAM.getResult<MachineBranchProbabilityAnalysis>(IR&: MF);
3906	auto &MBFI = MFAM.getResult<MachineBlockFrequencyAnalysis>(IR&: MF);
3907
3908	MachineBlockPlacementStats (&MBPI, &MBFI).run(MF);
3909	return PreservedAnalyses::all();
3910	}
3911
3912	bool MachineBlockPlacementStats::run(MachineFunction &F) {
3913	// Check for single-block functions and skip them.
3914	if (std::next(x: F.begin()) == F.end())
3915	return false;
3916
3917	if (!isFunctionInPrintList(FunctionName: F.getName()))
3918	return false;
3919
3920	for (MachineBasicBlock &MBB : F) {
3921	BlockFrequency BlockFreq = MBFI->getBlockFreq(MBB: &MBB);
3922	Statistic &NumBranches =
3923	(MBB.succ_size() > `1`) ? NumCondBranches : NumUncondBranches;
3924	Statistic &BranchTakenFreq =
3925	(MBB.succ_size() > `1`) ? CondBranchTakenFreq : UncondBranchTakenFreq;
3926	for (MachineBasicBlock *Succ : MBB.successors()) {
3927	// Skip if this successor is a fallthrough.
3928	if (MBB.isLayoutSuccessor(MBB: Succ))
3929	continue;
3930
3931	BlockFrequency EdgeFreq =
3932	BlockFreq * MBPI->getEdgeProbability(Src: &MBB, Dst: Succ);
3933	++NumBranches;
3934	BranchTakenFreq += EdgeFreq.getFrequency();
3935	}
3936	}
3937
3938	return false;
3939	}
3940

Browse the source code of llvm_projects/llvm/lib/CodeGen/MachineBlockPlacement.cpp