LoopDistribute.cpp source code [llvm_projects/llvm/lib/Transforms/Scalar/LoopDistribute.cpp]

1	//===- LoopDistribute.cpp - Loop Distribution Pass ------------------------===//
2	//
3	// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4	// See https://llvm.org/LICENSE.txt for license information.
5	// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6	//
7	//===----------------------------------------------------------------------===//
8	//
9	// This file implements the Loop Distribution Pass. Its main focus is to
10	// distribute loops that cannot be vectorized due to dependence cycles. It
11	// tries to isolate the offending dependences into a new loop allowing
12	// vectorization of the remaining parts.
13	//
14	// For dependence analysis, the pass uses the LoopVectorizer's
15	// LoopAccessAnalysis. Because this analysis presumes no change in the order of
16	// memory operations, special care is taken to preserve the lexical order of
17	// these operations.
18	//
19	// Similarly to the Vectorizer, the pass also supports loop versioning to
20	// run-time disambiguate potentially overlapping arrays.
21	//
22	//===----------------------------------------------------------------------===//
23
24	#include "llvm/Transforms/Scalar/LoopDistribute.h"
25	#include "llvm/ADT/DenseMap.h"
26	#include "llvm/ADT/DepthFirstIterator.h"
27	#include "llvm/ADT/EquivalenceClasses.h"
28	#include "llvm/ADT/STLExtras.h"
29	#include "llvm/ADT/SetVector.h"
30	#include "llvm/ADT/SmallVector.h"
31	#include "llvm/ADT/Statistic.h"
32	#include "llvm/ADT/StringRef.h"
33	#include "llvm/ADT/Twine.h"
34	#include "llvm/ADT/iterator_range.h"
35	#include "llvm/Analysis/AssumptionCache.h"
36	#include "llvm/Analysis/GlobalsModRef.h"
37	#include "llvm/Analysis/LoopAccessAnalysis.h"
38	#include "llvm/Analysis/LoopAnalysisManager.h"
39	#include "llvm/Analysis/LoopInfo.h"
40	#include "llvm/Analysis/OptimizationRemarkEmitter.h"
41	#include "llvm/Analysis/ScalarEvolution.h"
42	#include "llvm/Analysis/TargetLibraryInfo.h"
43	#include "llvm/Analysis/TargetTransformInfo.h"
44	#include "llvm/IR/BasicBlock.h"
45	#include "llvm/IR/Constants.h"
46	#include "llvm/IR/DiagnosticInfo.h"
47	#include "llvm/IR/Dominators.h"
48	#include "llvm/IR/Function.h"
49	#include "llvm/IR/Instruction.h"
50	#include "llvm/IR/Instructions.h"
51	#include "llvm/IR/LLVMContext.h"
52	#include "llvm/IR/Metadata.h"
53	#include "llvm/IR/PassManager.h"
54	#include "llvm/IR/Value.h"
55	#include "llvm/Support/Casting.h"
56	#include "llvm/Support/CommandLine.h"
57	#include "llvm/Support/Debug.h"
58	#include "llvm/Support/raw_ostream.h"
59	#include "llvm/Transforms/Utils/BasicBlockUtils.h"
60	#include "llvm/Transforms/Utils/Cloning.h"
61	#include "llvm/Transforms/Utils/LoopUtils.h"
62	#include "llvm/Transforms/Utils/LoopVersioning.h"
63	#include "llvm/Transforms/Utils/ValueMapper.h"
64	#include <cassert>
65	#include <functional>
66	#include <list>
67	#include <tuple>
68	#include <utility>
69
70	using namespace llvm;
71
72	#define LDIST_NAME "loop-distribute"
73	#define DEBUG_TYPE LDIST_NAME
74
75	/// @{
76	/// Metadata attribute names
77	static const char *const LLVMLoopDistributeFollowupAll =
78	"llvm.loop.distribute.followup_all";
79	static const char *const LLVMLoopDistributeFollowupCoincident =
80	"llvm.loop.distribute.followup_coincident";
81	static const char *const LLVMLoopDistributeFollowupSequential =
82	"llvm.loop.distribute.followup_sequential";
83	static const char *const LLVMLoopDistributeFollowupFallback =
84	"llvm.loop.distribute.followup_fallback";
85	/// @}
86
87	static cl::opt<bool>
88	LDistVerify("loop-distribute-verify", cl::Hidden,
89	cl::desc ("Turn on DominatorTree and LoopInfo verification "
90	"after Loop Distribution"),
91	cl::init(Val: false));
92
93	static cl::opt<bool> DistributeNonIfConvertible(
94	"loop-distribute-non-if-convertible", cl::Hidden,
95	cl::desc ("Whether to distribute into a loop that may not be "
96	"if-convertible by the loop vectorizer"),
97	cl::init(Val: false));
98
99	static cl::opt<unsigned> DistributeSCEVCheckThreshold(
100	"loop-distribute-scev-check-threshold", cl::init(Val: `8`), cl::Hidden,
101	cl::desc ("The maximum number of SCEV checks allowed for Loop "
102	"Distribution"));
103
104	static cl::opt<unsigned> PragmaDistributeSCEVCheckThreshold(
105	"loop-distribute-scev-check-threshold-with-pragma", cl::init(Val: `128`),
106	cl::Hidden,
107	cl::desc ("The maximum number of SCEV checks allowed for Loop "
108	"Distribution for loop marked with #pragma clang loop "
109	"distribute(enable)"));
110
111	static cl::opt<bool> EnableLoopDistribute(
112	"enable-loop-distribute", cl::Hidden,
113	cl::desc ("Enable the new, experimental LoopDistribution Pass"),
114	cl::init(Val: false));
115
116	STATISTIC(NumLoopsDistributed, "Number of loops distributed");
117
118	namespace {
119
120	/// Maintains the set of instructions of the loop for a partition before
121	/// cloning. After cloning, it hosts the new loop.
122	class InstPartition {
123	using InstructionSet = SmallSetVector<Instruction *, `8`>;
124
125	public:
126	InstPartition(Instruction I, Loop L, bool DepCycle = false)
127	: DepCycle(DepCycle), OrigLoop(L) {
128	Set.insert(X: I);
129	}
130
131	/// Returns whether this partition contains a dependence cycle.
132	bool hasDepCycle() const { return DepCycle; }
133
134	/// Adds an instruction to this partition.
135	void add(Instruction *I) { Set.insert(X: I); }
136
137	/// Collection accessors.
138	InstructionSet::iterator begin() { return Set.begin(); }
139	InstructionSet::iterator end() { return Set.end(); }
140	InstructionSet::const_iterator begin() const { return Set.begin(); }
141	InstructionSet::const_iterator end() const { return Set.end(); }
142	bool empty() const { return Set.empty(); }
143
144	/// Moves this partition into \p Other. This partition becomes empty
145	/// after this.
146	void moveTo(InstPartition &Other) {
147	Other.Set.insert(Start: Set.begin(), End: Set.end());
148	Set.clear();
149	Other.DepCycle \|= DepCycle;
150	}
151
152	/// Populates the partition with a transitive closure of all the
153	/// instructions that the seeded instructions dependent on.
154	void populateUsedSet() {
155	// FIXME: We currently don't use control-dependence but simply include all
156	// blocks (possibly empty at the end) and let simplifycfg mostly clean this
157	// up.
158	for (auto *B : OrigLoop->getBlocks())
159	Set.insert(X: B->getTerminator());
160
161	// Follow the use-def chains to form a transitive closure of all the
162	// instructions that the originally seeded instructions depend on.
163	SmallVector<Instruction *, `8`> Worklist(Set.begin(), Set.end());
164	while (!Worklist.empty()) {
165	Instruction *I = Worklist.pop_back_val();
166	// Insert instructions from the loop that we depend on.
167	for (Value *V : I->operand_values()) {
168	auto *I = dyn_cast<Instruction>(Val: V);
169	if (I && OrigLoop->contains(BB: I->getParent()) && Set.insert(X: I))
170	Worklist.push_back(Elt: I);
171	}
172	}
173	}
174
175	/// Clones the original loop.
176	///
177	/// Updates LoopInfo and DominatorTree using the information that block \p
178	/// LoopDomBB dominates the loop.
179	Loop cloneLoopWithPreheader(BasicBlock InsertBefore, BasicBlock *LoopDomBB,
180	unsigned Index, LoopInfo *LI,
181	DominatorTree *DT) {
182	ClonedLoop = ::cloneLoopWithPreheader(Before: InsertBefore, LoopDomBB, OrigLoop,
183	VMap, NameSuffix: Twine (".ldist") + Twine (Index),
184	LI, DT, Blocks&: ClonedLoopBlocks);
185	return ClonedLoop;
186	}
187
188	/// The cloned loop. If this partition is mapped to the original loop,
189	/// this is null.
190	const Loop getClonedLoop() const* { return ClonedLoop; }
191
192	/// Returns the loop where this partition ends up after distribution.
193	/// If this partition is mapped to the original loop then use the block from
194	/// the loop.
195	Loop getDistributedLoop() const* {
196	return ClonedLoop ? ClonedLoop : OrigLoop;
197	}
198
199	/// The VMap that is populated by cloning and then used in
200	/// remapinstruction to remap the cloned instructions.
201	ValueToValueMapTy &getVMap() { return VMap; }
202
203	/// Remaps the cloned instructions using VMap.
204	void remapInstructions() {
205	remapInstructionsInBlocks(Blocks: ClonedLoopBlocks, VMap);
206	}
207
208	/// Based on the set of instructions selected for this partition,
209	/// removes the unnecessary ones.
210	void removeUnusedInsts() {
211	SmallVector<Instruction *, `8`> Unused;
212
213	for (auto *Block : OrigLoop->getBlocks())
214	for (auto &Inst : *Block)
215	if (!Set.count(key: &Inst)) {
216	Instruction *NewInst = &Inst;
217	if (!VMap.empty())
218	NewInst = cast<Instruction>(Val&: VMap [NewInst]);
219
220	assert(!isa<BranchInst>(NewInst) &&
221	"Branches are marked used early on");
222	Unused.push_back(Elt: NewInst);
223	}
224
225	// Delete the instructions backwards, as it has a reduced likelihood of
226	// having to update as many def-use and use-def chains.
227	for (auto *Inst : reverse(C&: Unused)) {
228	if (!Inst->use_empty())
229	Inst->replaceAllUsesWith(V: PoisonValue::get(T: Inst->getType()));
230	Inst->eraseFromParent();
231	}
232	}
233
234	void print(raw_ostream &OS) const {
235	OS << (DepCycle ? " (cycle)\n" : "\n");
236	for (auto *I : Set)
237	// Prefix with the block name.
238	OS << " " << I->getParent()->getName() << ":" << *I << "\n";
239	}
240
241	void printBlocks(raw_ostream &OS) const {
242	for (auto *BB : getDistributedLoop()->getBlocks())
243	OS << *BB;
244	}
245
246	private:
247	/// Instructions from OrigLoop selected for this partition.
248	InstructionSet Set;
249
250	/// Whether this partition contains a dependence cycle.
251	bool DepCycle;
252
253	/// The original loop.
254	Loop *OrigLoop;
255
256	/// The cloned loop. If this partition is mapped to the original loop,
257	/// this is null.
258	Loop ClonedLoop = nullptr*;
259
260	/// The blocks of ClonedLoop including the preheader. If this
261	/// partition is mapped to the original loop, this is empty.
262	SmallVector<BasicBlock *, `8`> ClonedLoopBlocks;
263
264	/// These gets populated once the set of instructions have been
265	/// finalized. If this partition is mapped to the original loop, these are not
266	/// set.
267	ValueToValueMapTy VMap;
268	};
269
270	/// Holds the set of Partitions. It populates them, merges them and then
271	/// clones the loops.
272	class InstPartitionContainer {
273	using InstToPartitionIdT = DenseMap<Instruction , int*>;
274
275	public:
276	InstPartitionContainer(Loop L, LoopInfo LI, DominatorTree *DT)
277	: L(L), LI(LI), DT(DT) {}
278
279	/// Returns the number of partitions.
280	unsigned getSize() const { return PartitionContainer.size(); }
281
282	/// Adds \p Inst into the current partition if that is marked to
283	/// contain cycles. Otherwise start a new partition for it.
284	void addToCyclicPartition(Instruction *Inst) {
285	// If the current partition is non-cyclic. Start a new one.
286	if (PartitionContainer.empty() \|\| !PartitionContainer.back().hasDepCycle())
287	PartitionContainer.emplace_back(args&: Inst, args&: L, /DepCycle=/args: true);
288	else
289	PartitionContainer.back().add(I: Inst);
290	}
291
292	/// Adds \p Inst into a partition that is not marked to contain
293	/// dependence cycles.
294	///
295	// Initially we isolate memory instructions into as many partitions as
296	// possible, then later we may merge them back together.
297	void addToNewNonCyclicPartition(Instruction *Inst) {
298	PartitionContainer.emplace_back(args&: Inst, args&: L);
299	}
300
301	/// Merges adjacent non-cyclic partitions.
302	///
303	/// The idea is that we currently only want to isolate the non-vectorizable
304	/// partition. We could later allow more distribution among these partition
305	/// too.
306	void mergeAdjacentNonCyclic() {
307	mergeAdjacentPartitionsIf(
308	Predicate: [](const InstPartition P) { return* !P->hasDepCycle(); });
309	}
310
311	/// If a partition contains only conditional stores, we won't vectorize
312	/// it. Try to merge it with a previous cyclic partition.
313	void mergeNonIfConvertible() {
314	mergeAdjacentPartitionsIf(Predicate: [&](const InstPartition *Partition) {
315	if (Partition->hasDepCycle())
316	return true;
317
318	// Now, check if all stores are conditional in this partition.
319	bool seenStore = false;
320
321	for (auto Inst : Partition)
322	if (isa<StoreInst>(Val: Inst)) {
323	seenStore = true;
324	if (!LoopAccessInfo::blockNeedsPredication(BB: Inst->getParent(), TheLoop: L, DT))
325	return false;
326	}
327	return seenStore;
328	});
329	}
330
331	/// Merges the partitions according to various heuristics.
332	void mergeBeforePopulating() {
333	mergeAdjacentNonCyclic();
334	if (!DistributeNonIfConvertible)
335	mergeNonIfConvertible();
336	}
337
338	/// Merges partitions in order to ensure that no loads are duplicated.
339	///
340	/// We can't duplicate loads because that could potentially reorder them.
341	/// LoopAccessAnalysis provides dependency information with the context that
342	/// the order of memory operation is preserved.
343	///
344	/// Return if any partitions were merged.
345	bool mergeToAvoidDuplicatedLoads() {
346	using LoadToPartitionT = DenseMap<Instruction , InstPartition >;
347	using ToBeMergedT = EquivalenceClasses<InstPartition *>;
348
349	LoadToPartitionT LoadToPartition;
350	ToBeMergedT ToBeMerged;
351
352	// Step through the partitions and create equivalence between partitions
353	// that contain the same load. Also put partitions in between them in the
354	// same equivalence class to avoid reordering of memory operations.
355	for (PartitionContainerT::iterator I = PartitionContainer.begin(),
356	E = PartitionContainer.end();
357	I != E; ++I) {
358	auto PartI = &I;
359
360	// If a load occurs in two partitions PartI and PartJ, merge all
361	// partitions (PartI, PartJ] into PartI.
362	for (Instruction Inst : PartI)
363	if (isa<LoadInst>(Val: Inst)) {
364	bool NewElt;
365	LoadToPartitionT::iterator LoadToPart;
366
367	std::tie(args&: LoadToPart, args&: NewElt) =
368	LoadToPartition.insert(KV: std::make_pair(x&: Inst, y&: PartI));
369	if (!NewElt) {
370	LLVM_DEBUG(
371	dbgs()
372	<< "LDist: Merging partitions due to this load in multiple "
373	<< "partitions: " << PartI << ", " << LoadToPart->second << "\n"
374	<< *Inst << "\n");
375
376	auto PartJ = I;
377	do {
378	--PartJ;
379	ToBeMerged.unionSets(V1: PartI, V2: &*PartJ);
380	} while (&*PartJ != LoadToPart ->second);
381	}
382	}
383	}
384	if (ToBeMerged.empty())
385	return false;
386
387	// Merge the member of an equivalence class into its class leader. This
388	// makes the members empty.
389	for (ToBeMergedT::iterator I = ToBeMerged.begin(), E = ToBeMerged.end();
390	I != E; ++I) {
391	if (!I ->isLeader())
392	continue;
393
394	auto PartI = I ->getData();
395	for (auto *PartJ : make_range(x: std::next(x: ToBeMerged.member_begin(I)),
396	y: ToBeMerged.member_end())) {
397	PartJ->moveTo(Other&: *PartI);
398	}
399	}
400
401	// Remove the empty partitions.
402	PartitionContainer.remove_if(
403	[](const InstPartition &P) { return P.empty(); });
404
405	return true;
406	}
407
408	/// Sets up the mapping between instructions to partitions. If the
409	/// instruction is duplicated across multiple partitions, set the entry to -1.
410	void setupPartitionIdOnInstructions() {
411	int PartitionID = `0`;
412	for (const auto &Partition : PartitionContainer) {
413	for (Instruction *Inst : Partition) {
414	bool NewElt;
415	InstToPartitionIdT::iterator Iter;
416
417	std::tie(args&: Iter, args&: NewElt) =
418	InstToPartitionId.insert(KV: std::make_pair(x&: Inst, y&: PartitionID));
419	if (!NewElt)
420	Iter ->second = -`1`;
421	}
422	++PartitionID;
423	}
424	}
425
426	/// Populates the partition with everything that the seeding
427	/// instructions require.
428	void populateUsedSet() {
429	for (auto &P : PartitionContainer)
430	P.populateUsedSet();
431	}
432
433	/// This performs the main chunk of the work of cloning the loops for
434	/// the partitions.
435	void cloneLoops() {
436	BasicBlock *OrigPH = L->getLoopPreheader();
437	// At this point the predecessor of the preheader is either the memcheck
438	// block or the top part of the original preheader.
439	BasicBlock *Pred = OrigPH->getSinglePredecessor();
440	assert(Pred && "Preheader does not have a single predecessor");
441	BasicBlock *ExitBlock = L->getExitBlock();
442	assert(ExitBlock && "No single exit block");
443	Loop *NewLoop;
444
445	assert(!PartitionContainer.empty() && "at least two partitions expected");
446	// We're cloning the preheader along with the loop so we already made sure
447	// it was empty.
448	assert(&*OrigPH->begin() == OrigPH->getTerminator() &&
449	"preheader not empty");
450
451	// Preserve the original loop ID for use after the transformation.
452	MDNode *OrigLoopID = L->getLoopID();
453
454	// Create a loop for each partition except the last. Clone the original
455	// loop before PH along with adding a preheader for the cloned loop. Then
456	// update PH to point to the newly added preheader.
457	BasicBlock *TopPH = OrigPH;
458	unsigned Index = getSize() - `1`;
459	for (auto &Part : llvm::drop_begin(RangeOrContainer: llvm::reverse(C&: PartitionContainer))) {
460	NewLoop = Part.cloneLoopWithPreheader(InsertBefore: TopPH, LoopDomBB: Pred, Index, LI, DT);
461
462	Part.getVMap()[ExitBlock] = TopPH;
463	Part.remapInstructions();
464	setNewLoopID(OrigLoopID, Part: &Part);
465	--Index;
466	TopPH = NewLoop->getLoopPreheader();
467	}
468	Pred->getTerminator()->replaceUsesOfWith(From: OrigPH, To: TopPH);
469
470	// Also set a new loop ID for the last loop.
471	setNewLoopID(OrigLoopID, Part: &PartitionContainer.back());
472
473	// Now go in forward order and update the immediate dominator for the
474	// preheaders with the exiting block of the previous loop. Dominance
475	// within the loop is updated in cloneLoopWithPreheader.
476	for (auto Curr = PartitionContainer.cbegin(),
477	Next = std::next(x: PartitionContainer.cbegin()),
478	E = PartitionContainer.cend();
479	Next != E; ++Curr, ++Next)
480	DT->changeImmediateDominator(
481	BB: Next ->getDistributedLoop()->getLoopPreheader(),
482	NewBB: Curr ->getDistributedLoop()->getExitingBlock());
483	}
484
485	/// Removes the dead instructions from the cloned loops.
486	void removeUnusedInsts() {
487	for (auto &Partition : PartitionContainer)
488	Partition.removeUnusedInsts();
489	}
490
491	/// For each memory pointer, it computes the partitionId the pointer is
492	/// used in.
493	///
494	/// This returns an array of int where the I-th entry corresponds to I-th
495	/// entry in LAI.getRuntimePointerCheck(). If the pointer is used in multiple
496	/// partitions its entry is set to -1.
497	SmallVector<int, `8`>
498	computePartitionSetForPointers(const LoopAccessInfo &LAI) {
499	const RuntimePointerChecking *RtPtrCheck = LAI.getRuntimePointerChecking();
500
501	unsigned N = RtPtrCheck->Pointers.size();
502	SmallVector<int, `8`> PtrToPartitions(N);
503	for (unsigned I = `0`; I < N; ++I) {
504	Value *Ptr = RtPtrCheck->Pointers [I].PointerValue;
505	auto Instructions =
506	LAI.getInstructionsForAccess(Ptr, isWrite: RtPtrCheck->Pointers [I].IsWritePtr);
507
508	int &Partition = PtrToPartitions [I];
509	// First set it to uninitialized.
510	Partition = -`2`;
511	for (Instruction *Inst : Instructions) {
512	// Note that this could be -1 if Inst is duplicated across multiple
513	// partitions.
514	int ThisPartition = this->InstToPartitionId [Inst];
515	if (Partition == -`2`)
516	Partition = ThisPartition;
517	// -1 means belonging to multiple partitions.
518	else if (Partition == -`1`)
519	break;
520	else if (Partition != (int)ThisPartition)
521	Partition = -`1`;
522	}
523	assert(Partition != -`2` && "Pointer not belonging to any partition");
524	}
525
526	return PtrToPartitions;
527	}
528
529	void print(raw_ostream &OS) const {
530	unsigned Index = `0`;
531	for (const auto &P : PartitionContainer) {
532	OS << "LDist: Partition " << Index++ << ":";
533	P.print(OS);
534	}
535	}
536
537	void dump() const { print(OS&: dbgs()); }
538
539	#ifndef NDEBUG
540	friend raw_ostream &operator<<(raw_ostream &OS,
541	const InstPartitionContainer &Partitions) {
542	Partitions.print(OS);
543	return OS;
544	}
545	#endif
546
547	void printBlocks(raw_ostream &OS) const {
548	unsigned Index = `0`;
549	for (const auto &P : PartitionContainer) {
550	OS << "LDist: Partition " << Index++ << ":";
551	P.printBlocks(OS);
552	}
553	}
554
555	private:
556	using PartitionContainerT = std::list<InstPartition>;
557
558	/// List of partitions.
559	PartitionContainerT PartitionContainer;
560
561	/// Mapping from Instruction to partition Id. If the instruction
562	/// belongs to multiple partitions the entry contains -1.
563	InstToPartitionIdT InstToPartitionId;
564
565	Loop *L;
566	LoopInfo *LI;
567	DominatorTree *DT;
568
569	/// The control structure to merge adjacent partitions if both satisfy
570	/// the \p Predicate.
571	template <class UnaryPredicate>
572	void mergeAdjacentPartitionsIf(UnaryPredicate Predicate) {
573	InstPartition PrevMatch = nullptr*;
574	for (auto I = PartitionContainer.begin(); I != PartitionContainer.end();) {
575	auto DoesMatch = Predicate(&*I);
576	if (PrevMatch == nullptr && DoesMatch) {
577	PrevMatch = &*I;
578	++I;
579	} else if (PrevMatch != nullptr && DoesMatch) {
580	I ->moveTo(Other&: *PrevMatch);
581	I = PartitionContainer.erase(position: I);
582	} else {
583	PrevMatch = nullptr;
584	++I;
585	}
586	}
587	}
588
589	/// Assign new LoopIDs for the partition's cloned loop.
590	void setNewLoopID(MDNode OrigLoopID, InstPartition Part) {
591	std::optional<MDNode *> PartitionID = makeFollowupLoopID(
592	OrigLoopID,
593	FollowupAttrs: {LLVMLoopDistributeFollowupAll,
594	Part->hasDepCycle() ? LLVMLoopDistributeFollowupSequential
595	: LLVMLoopDistributeFollowupCoincident});
596	if (PartitionID) {
597	Loop *NewLoop = Part->getDistributedLoop();
598	NewLoop->setLoopID(*PartitionID);
599	}
600	}
601	};
602
603	/// For each memory instruction, this class maintains difference of the
604	/// number of unsafe dependences that start out from this instruction minus
605	/// those that end here.
606	///
607	/// By traversing the memory instructions in program order and accumulating this
608	/// number, we know whether any unsafe dependence crosses over a program point.
609	class MemoryInstructionDependences {
610	using Dependence = MemoryDepChecker::Dependence;
611
612	public:
613	struct Entry {
614	Instruction *Inst;
615	unsigned NumUnsafeDependencesStartOrEnd = `0`;
616
617	Entry(Instruction *Inst) : Inst(Inst) {}
618	};
619
620	using AccessesType = SmallVector<Entry, `8`>;
621
622	AccessesType::const_iterator begin() const { return Accesses.begin(); }
623	AccessesType::const_iterator end() const { return Accesses.end(); }
624
625	MemoryInstructionDependences(
626	const SmallVectorImpl<Instruction *> &Instructions,
627	const SmallVectorImpl<Dependence> &Dependences) {
628	Accesses.append(in_start: Instructions.begin(), in_end: Instructions.end());
629
630	LLVM_DEBUG(dbgs() << "LDist: Backward dependences:\n");
631	for (const auto &Dep : Dependences)
632	if (Dep.isPossiblyBackward()) {
633	// Note that the designations source and destination follow the program
634	// order, i.e. source is always first. (The direction is given by the
635	// DepType.)
636	++Accesses [Dep.Source].NumUnsafeDependencesStartOrEnd;
637	--Accesses [Dep.Destination].NumUnsafeDependencesStartOrEnd;
638
639	LLVM_DEBUG(Dep.print(dbgs(), `2`, Instructions));
640	}
641	}
642
643	private:
644	AccessesType Accesses;
645	};
646
647	/// The actual class performing the per-loop work.
648	class LoopDistributeForLoop {
649	public:
650	LoopDistributeForLoop(Loop L, Function F, LoopInfo LI, DominatorTree DT,
651	ScalarEvolution *SE, LoopAccessInfoManager &LAIs,
652	OptimizationRemarkEmitter *ORE)
653	: L(L), F(F), LI(LI), DT(DT), SE(SE), LAIs(LAIs), ORE(ORE) {
654	setForced();
655	}
656
657	/// Try to distribute an inner-most loop.
658	bool processLoop() {
659	assert(L->isInnermost() && "Only process inner loops.");
660
661	LLVM_DEBUG(dbgs() << "\nLDist: Checking a loop in '"
662	<< L->getHeader()->getParent()->getName() << "' from "
663	<< L->getLocStr() << "\n");
664
665	// Having a single exit block implies there's also one exiting block.
666	if (!L->getExitBlock())
667	return fail(RemarkName: "MultipleExitBlocks", Message: "multiple exit blocks");
668	if (!L->isLoopSimplifyForm())
669	return fail(RemarkName: "NotLoopSimplifyForm",
670	Message: "loop is not in loop-simplify form");
671	if (!L->isRotatedForm())
672	return fail(RemarkName: "NotBottomTested", Message: "loop is not bottom tested");
673
674	BasicBlock *PH = L->getLoopPreheader();
675
676	LAI = &LAIs.getInfo(L&: *L);
677
678	// Currently, we only distribute to isolate the part of the loop with
679	// dependence cycles to enable partial vectorization.
680	if (LAI->canVectorizeMemory())
681	return fail(RemarkName: "MemOpsCanBeVectorized",
682	Message: "memory operations are safe for vectorization");
683
684	auto *Dependences = LAI->getDepChecker().getDependences();
685	if (!Dependences \|\| Dependences->empty())
686	return fail(RemarkName: "NoUnsafeDeps", Message: "no unsafe dependences to isolate");
687
688	LLVM_DEBUG(dbgs() << "LDist: Found a candidate loop: "
689	<< L->getHeader()->getName() << "\n");
690
691	InstPartitionContainer Partitions(L, LI, DT);
692
693	// First, go through each memory operation and assign them to consecutive
694	// partitions (the order of partitions follows program order). Put those
695	// with unsafe dependences into "cyclic" partition otherwise put each store
696	// in its own "non-cyclic" partition (we'll merge these later).
697	//
698	// Note that a memory operation (e.g. Load2 below) at a program point that
699	// has an unsafe dependence (Store3->Load1) spanning over it must be
700	// included in the same cyclic partition as the dependent operations. This
701	// is to preserve the original program order after distribution. E.g.:
702	//
703	// NumUnsafeDependencesStartOrEnd NumUnsafeDependencesActive
704	// Load1 -. 1 0->1
705	// Load2 \| /Unsafe/ 0 1
706	// Store3 -' -1 1->0
707	// Load4 0 0
708	//
709	// NumUnsafeDependencesActive > 0 indicates this situation and in this case
710	// we just keep assigning to the same cyclic partition until
711	// NumUnsafeDependencesActive reaches 0.
712	const MemoryDepChecker &DepChecker = LAI->getDepChecker();
713	MemoryInstructionDependences MID(DepChecker.getMemoryInstructions(),
714	*Dependences);
715
716	int NumUnsafeDependencesActive = `0`;
717	for (const auto &InstDep : MID) {
718	Instruction *I = InstDep.Inst;
719	// We update NumUnsafeDependencesActive post-instruction, catch the
720	// start of a dependence directly via NumUnsafeDependencesStartOrEnd.
721	if (NumUnsafeDependencesActive \|\|
722	InstDep.NumUnsafeDependencesStartOrEnd > `0`)
723	Partitions.addToCyclicPartition(Inst: I);
724	else
725	Partitions.addToNewNonCyclicPartition(Inst: I);
726	NumUnsafeDependencesActive += InstDep.NumUnsafeDependencesStartOrEnd;
727	assert(NumUnsafeDependencesActive >= `0` &&
728	"Negative number of dependences active");
729	}
730
731	// Add partitions for values used outside. These partitions can be out of
732	// order from the original program order. This is OK because if the
733	// partition uses a load we will merge this partition with the original
734	// partition of the load that we set up in the previous loop (see
735	// mergeToAvoidDuplicatedLoads).
736	auto DefsUsedOutside = findDefsUsedOutsideOfLoop(L);
737	for (auto *Inst : DefsUsedOutside)
738	Partitions.addToNewNonCyclicPartition(Inst);
739
740	LLVM_DEBUG(dbgs() << "LDist: Seeded partitions:\n" << Partitions);
741	if (Partitions.getSize() < `2`)
742	return fail(RemarkName: "CantIsolateUnsafeDeps",
743	Message: "cannot isolate unsafe dependencies");
744
745	// Run the merge heuristics: Merge non-cyclic adjacent partitions since we
746	// should be able to vectorize these together.
747	Partitions.mergeBeforePopulating();
748	LLVM_DEBUG(dbgs() << "LDist: Merged partitions:\n" << Partitions);
749	if (Partitions.getSize() < `2`)
750	return fail(RemarkName: "CantIsolateUnsafeDeps",
751	Message: "cannot isolate unsafe dependencies");
752
753	// Now, populate the partitions with non-memory operations.
754	Partitions.populateUsedSet();
755	LLVM_DEBUG(dbgs() << "LDist: Populated partitions:\n" << Partitions);
756
757	// In order to preserve original lexical order for loads, keep them in the
758	// partition that we set up in the MemoryInstructionDependences loop.
759	if (Partitions.mergeToAvoidDuplicatedLoads()) {
760	LLVM_DEBUG(dbgs() << "LDist: Partitions merged to ensure unique loads:\n"
761	<< Partitions);
762	if (Partitions.getSize() < `2`)
763	return fail(RemarkName: "CantIsolateUnsafeDeps",
764	Message: "cannot isolate unsafe dependencies");
765	}
766
767	// Don't distribute the loop if we need too many SCEV run-time checks, or
768	// any if it's illegal.
769	const SCEVPredicate &Pred = LAI->getPSE().getPredicate();
770	if (LAI->hasConvergentOp() && !Pred.isAlwaysTrue()) {
771	return fail(RemarkName: "RuntimeCheckWithConvergent",
772	Message: "may not insert runtime check with convergent operation");
773	}
774
775	if (Pred.getComplexity() > (IsForced.value_or(u: false)
776	? PragmaDistributeSCEVCheckThreshold
777	: DistributeSCEVCheckThreshold))
778	return fail(RemarkName: "TooManySCEVRuntimeChecks",
779	Message: "too many SCEV run-time checks needed.\n");
780
781	if (!IsForced.value_or(u: false) && hasDisableAllTransformsHint(L))
782	return fail(RemarkName: "HeuristicDisabled", Message: "distribution heuristic disabled");
783
784	LLVM_DEBUG(dbgs() << "LDist: Distributing loop: "
785	<< L->getHeader()->getName() << "\n");
786	// We're done forming the partitions set up the reverse mapping from
787	// instructions to partitions.
788	Partitions.setupPartitionIdOnInstructions();
789
790	// If we need run-time checks, version the loop now.
791	auto PtrToPartition = Partitions.computePartitionSetForPointers(LAI: *LAI);
792	const auto *RtPtrChecking = LAI->getRuntimePointerChecking();
793	const auto &AllChecks = RtPtrChecking->getChecks();
794	auto Checks = includeOnlyCrossPartitionChecks(AllChecks, PtrToPartition,
795	RtPtrChecking);
796
797	if (LAI->hasConvergentOp() && !Checks.empty()) {
798	return fail(RemarkName: "RuntimeCheckWithConvergent",
799	Message: "may not insert runtime check with convergent operation");
800	}
801
802	// To keep things simple have an empty preheader before we version or clone
803	// the loop. (Also split if this has no predecessor, i.e. entry, because we
804	// rely on PH having a predecessor.)
805	if (!PH->getSinglePredecessor() \|\| &*PH->begin() != PH->getTerminator())
806	SplitBlock(Old: PH, SplitPt: PH->getTerminator(), DT, LI);
807
808	if (!Pred.isAlwaysTrue() \|\| !Checks.empty()) {
809	assert(!LAI->hasConvergentOp() && "inserting illegal loop versioning");
810
811	MDNode *OrigLoopID = L->getLoopID();
812
813	LLVM_DEBUG(dbgs() << "LDist: Pointers:\n");
814	LLVM_DEBUG(LAI->getRuntimePointerChecking()->printChecks(dbgs(), Checks));
815	LoopVersioning LVer(*LAI, Checks, L, LI, DT, SE);
816	LVer.versionLoop(DefsUsedOutside);
817	LVer.annotateLoopWithNoAlias();
818
819	// The unversioned loop will not be changed, so we inherit all attributes
820	// from the original loop, but remove the loop distribution metadata to
821	// avoid to distribute it again.
822	MDNode UnversionedLoopID = makeFollowupLoopID(
823	OrigLoopID,
824	FollowupAttrs: {LLVMLoopDistributeFollowupAll, LLVMLoopDistributeFollowupFallback},
825	InheritOptionsAttrsPrefix: "llvm.loop.distribute.", AlwaysNew: true);
826	LVer.getNonVersionedLoop()->setLoopID(UnversionedLoopID);
827	}
828
829	// Create identical copies of the original loop for each partition and hook
830	// them up sequentially.
831	Partitions.cloneLoops();
832
833	// Now, we remove the instruction from each loop that don't belong to that
834	// partition.
835	Partitions.removeUnusedInsts();
836	LLVM_DEBUG(dbgs() << "LDist: After removing unused Instrs:\n");
837	LLVM_DEBUG(Partitions.printBlocks(dbgs()));
838
839	if (LDistVerify) {
840	LI->verify(DomTree: *DT);
841	assert(DT->verify(DominatorTree::VerificationLevel::Fast));
842	}
843
844	++NumLoopsDistributed;
845	// Report the success.
846	ORE->emit(RemarkBuilder: [&]() {
847	return OptimizationRemark (LDIST_NAME, "Distribute", L->getStartLoc(),
848	L->getHeader())
849	<< "distributed loop";
850	});
851	return true;
852	}
853
854	/// Provide diagnostics then \return with false.
855	bool fail(StringRef RemarkName, StringRef Message) {
856	LLVMContext &Ctx = F->getContext();
857	bool Forced = isForced().value_or(u: false);
858
859	LLVM_DEBUG(dbgs() << "LDist: Skipping; " << Message << "\n");
860
861	// With Rpass-missed report that distribution failed.
862	ORE->emit(RemarkBuilder: [&]() {
863	return OptimizationRemarkMissed (LDIST_NAME, "NotDistributed",
864	L->getStartLoc(), L->getHeader())
865	<< "loop not distributed: use -Rpass-analysis=loop-distribute for "
866	"more "
867	"info";
868	});
869
870	// With Rpass-analysis report why. This is on by default if distribution
871	// was requested explicitly.
872	ORE->emit(OptDiag&: OptimizationRemarkAnalysis (
873	Forced ? OptimizationRemarkAnalysis::AlwaysPrint : LDIST_NAME,
874	RemarkName, L->getStartLoc(), L->getHeader())
875	<< "loop not distributed: " << Message);
876
877	// Also issue a warning if distribution was requested explicitly but it
878	// failed.
879	if (Forced)
880	Ctx.diagnose(DI: DiagnosticInfoOptimizationFailure (
881	*F, L->getStartLoc(), "loop not distributed: failed "
882	"explicitly specified loop distribution"));
883
884	return false;
885	}
886
887	/// Return if distribution forced to be enabled/disabled for the loop.
888	///
889	/// If the optional has a value, it indicates whether distribution was forced
890	/// to be enabled (true) or disabled (false). If the optional has no value
891	/// distribution was not forced either way.
892	const std::optional<bool> &isForced() const { return IsForced; }
893
894	private:
895	/// Filter out checks between pointers from the same partition.
896	///
897	/// \p PtrToPartition contains the partition number for pointers. Partition
898	/// number -1 means that the pointer is used in multiple partitions. In this
899	/// case we can't safely omit the check.
900	SmallVector<RuntimePointerCheck, `4`> includeOnlyCrossPartitionChecks(
901	const SmallVectorImpl<RuntimePointerCheck> &AllChecks,
902	const SmallVectorImpl<int> &PtrToPartition,
903	const RuntimePointerChecking *RtPtrChecking) {
904	SmallVector<RuntimePointerCheck, `4`> Checks;
905
906	copy_if(Range: AllChecks, Out: std::back_inserter(x&: Checks),
907	P: [&](const RuntimePointerCheck &Check) {
908	for (unsigned PtrIdx1 : Check.first->Members)
909	for (unsigned PtrIdx2 : Check.second->Members)
910	// Only include this check if there is a pair of pointers
911	// that require checking and the pointers fall into
912	// separate partitions.
913	//
914	// (Note that we already know at this point that the two
915	// pointer groups need checking but it doesn't follow
916	// that each pair of pointers within the two groups need
917	// checking as well.
918	//
919	// In other words we don't want to include a check just
920	// because there is a pair of pointers between the two
921	// pointer groups that require checks and a different
922	// pair whose pointers fall into different partitions.)
923	if (RtPtrChecking->needsChecking(I: PtrIdx1, J: PtrIdx2) &&
924	!RuntimePointerChecking::arePointersInSamePartition(
925	PtrToPartition, PtrIdx1, PtrIdx2))
926	return true;
927	return false;
928	});
929
930	return Checks;
931	}
932
933	/// Check whether the loop metadata is forcing distribution to be
934	/// enabled/disabled.
935	void setForced() {
936	std::optional<const MDOperand *> Value =
937	findStringMetadataForLoop(TheLoop: L, Name: "llvm.loop.distribute.enable");
938	if (!Value)
939	return;
940
941	const MDOperand Op = Value;
942	assert(Op && mdconst::hasa<ConstantInt>(*Op) && "invalid metadata");
943	IsForced = mdconst::extract<ConstantInt>(MD: *Op)->getZExtValue();
944	}
945
946	Loop *L;
947	Function *F;
948
949	// Analyses used.
950	LoopInfo *LI;
951	const LoopAccessInfo LAI = nullptr*;
952	DominatorTree *DT;
953	ScalarEvolution *SE;
954	LoopAccessInfoManager &LAIs;
955	OptimizationRemarkEmitter *ORE;
956
957	/// Indicates whether distribution is forced to be enabled/disabled for
958	/// the loop.
959	///
960	/// If the optional has a value, it indicates whether distribution was forced
961	/// to be enabled (true) or disabled (false). If the optional has no value
962	/// distribution was not forced either way.
963	std::optional<bool> IsForced;
964	};
965
966	} // end anonymous namespace
967
968	static bool runImpl(Function &F, LoopInfo LI, DominatorTree DT,
969	ScalarEvolution SE, OptimizationRemarkEmitter ORE,
970	LoopAccessInfoManager &LAIs) {
971	// Build up a worklist of inner-loops to distribute. This is necessary as the
972	// act of distributing a loop creates new loops and can invalidate iterators
973	// across the loops.
974	SmallVector<Loop *, `8`> Worklist;
975
976	for (Loop TopLevelLoop : LI)
977	for (Loop *L : depth_first(G: TopLevelLoop))
978	// We only handle inner-most loops.
979	if (L->isInnermost())
980	Worklist.push_back(Elt: L);
981
982	// Now walk the identified inner loops.
983	bool Changed = false;
984	for (Loop *L : Worklist) {
985	LoopDistributeForLoop LDL(L, &F, LI, DT, SE, LAIs, ORE);
986
987	// If distribution was forced for the specific loop to be
988	// enabled/disabled, follow that. Otherwise use the global flag.
989	if (LDL.isForced().value_or(u&: EnableLoopDistribute))
990	Changed \|= LDL.processLoop();
991	}
992
993	// Process each loop nest in the function.
994	return Changed;
995	}
996
997	PreservedAnalyses LoopDistributePass::run(Function &F,
998	FunctionAnalysisManager &AM) {
999	auto &LI = AM.getResult<LoopAnalysis>(IR&: F);
1000	auto &DT = AM.getResult<DominatorTreeAnalysis>(IR&: F);
1001	auto &SE = AM.getResult<ScalarEvolutionAnalysis>(IR&: F);
1002	auto &ORE = AM.getResult<OptimizationRemarkEmitterAnalysis>(IR&: F);
1003
1004	LoopAccessInfoManager &LAIs = AM.getResult<LoopAccessAnalysis>(IR&: F);
1005	bool Changed = runImpl(F, LI: &LI, DT: &DT, SE: &SE, ORE: &ORE, LAIs);
1006	if (!Changed)
1007	return PreservedAnalyses::all();
1008	PreservedAnalyses PA;
1009	PA.preserve<LoopAnalysis>();
1010	PA.preserve<DominatorTreeAnalysis>();
1011	return PA;
1012	}
1013

Browse the source code of llvm_projects/llvm/lib/Transforms/Scalar/LoopDistribute.cpp