1 | //===- LoopDistribute.cpp - Loop Distribution Pass ------------------------===// |
2 | // |
3 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
4 | // See https://llvm.org/LICENSE.txt for license information. |
5 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
6 | // |
7 | //===----------------------------------------------------------------------===// |
8 | // |
9 | // This file implements the Loop Distribution Pass. Its main focus is to |
10 | // distribute loops that cannot be vectorized due to dependence cycles. It |
11 | // tries to isolate the offending dependences into a new loop allowing |
12 | // vectorization of the remaining parts. |
13 | // |
14 | // For dependence analysis, the pass uses the LoopVectorizer's |
15 | // LoopAccessAnalysis. Because this analysis presumes no change in the order of |
16 | // memory operations, special care is taken to preserve the lexical order of |
17 | // these operations. |
18 | // |
19 | // Similarly to the Vectorizer, the pass also supports loop versioning to |
20 | // run-time disambiguate potentially overlapping arrays. |
21 | // |
22 | //===----------------------------------------------------------------------===// |
23 | |
24 | #include "llvm/Transforms/Scalar/LoopDistribute.h" |
25 | #include "llvm/ADT/DenseMap.h" |
26 | #include "llvm/ADT/DepthFirstIterator.h" |
27 | #include "llvm/ADT/EquivalenceClasses.h" |
28 | #include "llvm/ADT/STLExtras.h" |
29 | #include "llvm/ADT/SetVector.h" |
30 | #include "llvm/ADT/SmallVector.h" |
31 | #include "llvm/ADT/Statistic.h" |
32 | #include "llvm/ADT/StringRef.h" |
33 | #include "llvm/ADT/Twine.h" |
34 | #include "llvm/ADT/iterator_range.h" |
35 | #include "llvm/Analysis/AssumptionCache.h" |
36 | #include "llvm/Analysis/GlobalsModRef.h" |
37 | #include "llvm/Analysis/LoopAccessAnalysis.h" |
38 | #include "llvm/Analysis/LoopAnalysisManager.h" |
39 | #include "llvm/Analysis/LoopInfo.h" |
40 | #include "llvm/Analysis/OptimizationRemarkEmitter.h" |
41 | #include "llvm/Analysis/ScalarEvolution.h" |
42 | #include "llvm/Analysis/TargetLibraryInfo.h" |
43 | #include "llvm/Analysis/TargetTransformInfo.h" |
44 | #include "llvm/IR/BasicBlock.h" |
45 | #include "llvm/IR/Constants.h" |
46 | #include "llvm/IR/DiagnosticInfo.h" |
47 | #include "llvm/IR/Dominators.h" |
48 | #include "llvm/IR/Function.h" |
49 | #include "llvm/IR/Instruction.h" |
50 | #include "llvm/IR/Instructions.h" |
51 | #include "llvm/IR/LLVMContext.h" |
52 | #include "llvm/IR/Metadata.h" |
53 | #include "llvm/IR/PassManager.h" |
54 | #include "llvm/IR/Value.h" |
55 | #include "llvm/Support/Casting.h" |
56 | #include "llvm/Support/CommandLine.h" |
57 | #include "llvm/Support/Debug.h" |
58 | #include "llvm/Support/raw_ostream.h" |
59 | #include "llvm/Transforms/Utils/BasicBlockUtils.h" |
60 | #include "llvm/Transforms/Utils/Cloning.h" |
61 | #include "llvm/Transforms/Utils/LoopUtils.h" |
62 | #include "llvm/Transforms/Utils/LoopVersioning.h" |
63 | #include "llvm/Transforms/Utils/ValueMapper.h" |
64 | #include <cassert> |
65 | #include <functional> |
66 | #include <list> |
67 | #include <tuple> |
68 | #include <utility> |
69 | |
70 | using namespace llvm; |
71 | |
72 | #define LDIST_NAME "loop-distribute" |
73 | #define DEBUG_TYPE LDIST_NAME |
74 | |
75 | /// @{ |
76 | /// Metadata attribute names |
77 | static const char *const LLVMLoopDistributeFollowupAll = |
78 | "llvm.loop.distribute.followup_all" ; |
79 | static const char *const LLVMLoopDistributeFollowupCoincident = |
80 | "llvm.loop.distribute.followup_coincident" ; |
81 | static const char *const LLVMLoopDistributeFollowupSequential = |
82 | "llvm.loop.distribute.followup_sequential" ; |
83 | static const char *const LLVMLoopDistributeFollowupFallback = |
84 | "llvm.loop.distribute.followup_fallback" ; |
85 | /// @} |
86 | |
87 | static cl::opt<bool> |
88 | LDistVerify("loop-distribute-verify" , cl::Hidden, |
89 | cl::desc("Turn on DominatorTree and LoopInfo verification " |
90 | "after Loop Distribution" ), |
91 | cl::init(Val: false)); |
92 | |
93 | static cl::opt<bool> DistributeNonIfConvertible( |
94 | "loop-distribute-non-if-convertible" , cl::Hidden, |
95 | cl::desc("Whether to distribute into a loop that may not be " |
96 | "if-convertible by the loop vectorizer" ), |
97 | cl::init(Val: false)); |
98 | |
99 | static cl::opt<unsigned> DistributeSCEVCheckThreshold( |
100 | "loop-distribute-scev-check-threshold" , cl::init(Val: 8), cl::Hidden, |
101 | cl::desc("The maximum number of SCEV checks allowed for Loop " |
102 | "Distribution" )); |
103 | |
104 | static cl::opt<unsigned> PragmaDistributeSCEVCheckThreshold( |
105 | "loop-distribute-scev-check-threshold-with-pragma" , cl::init(Val: 128), |
106 | cl::Hidden, |
107 | cl::desc("The maximum number of SCEV checks allowed for Loop " |
108 | "Distribution for loop marked with #pragma clang loop " |
109 | "distribute(enable)" )); |
110 | |
111 | static cl::opt<bool> EnableLoopDistribute( |
112 | "enable-loop-distribute" , cl::Hidden, |
113 | cl::desc("Enable the new, experimental LoopDistribution Pass" ), |
114 | cl::init(Val: false)); |
115 | |
116 | STATISTIC(NumLoopsDistributed, "Number of loops distributed" ); |
117 | |
118 | namespace { |
119 | |
120 | /// Maintains the set of instructions of the loop for a partition before |
121 | /// cloning. After cloning, it hosts the new loop. |
122 | class InstPartition { |
123 | using InstructionSet = SmallSetVector<Instruction *, 8>; |
124 | |
125 | public: |
126 | InstPartition(Instruction *I, Loop *L, bool DepCycle = false) |
127 | : DepCycle(DepCycle), OrigLoop(L) { |
128 | Set.insert(X: I); |
129 | } |
130 | |
131 | /// Returns whether this partition contains a dependence cycle. |
132 | bool hasDepCycle() const { return DepCycle; } |
133 | |
134 | /// Adds an instruction to this partition. |
135 | void add(Instruction *I) { Set.insert(X: I); } |
136 | |
137 | /// Collection accessors. |
138 | InstructionSet::iterator begin() { return Set.begin(); } |
139 | InstructionSet::iterator end() { return Set.end(); } |
140 | InstructionSet::const_iterator begin() const { return Set.begin(); } |
141 | InstructionSet::const_iterator end() const { return Set.end(); } |
142 | bool empty() const { return Set.empty(); } |
143 | |
144 | /// Moves this partition into \p Other. This partition becomes empty |
145 | /// after this. |
146 | void moveTo(InstPartition &Other) { |
147 | Other.Set.insert(Start: Set.begin(), End: Set.end()); |
148 | Set.clear(); |
149 | Other.DepCycle |= DepCycle; |
150 | } |
151 | |
152 | /// Populates the partition with a transitive closure of all the |
153 | /// instructions that the seeded instructions dependent on. |
154 | void populateUsedSet() { |
155 | // FIXME: We currently don't use control-dependence but simply include all |
156 | // blocks (possibly empty at the end) and let simplifycfg mostly clean this |
157 | // up. |
158 | for (auto *B : OrigLoop->getBlocks()) |
159 | Set.insert(X: B->getTerminator()); |
160 | |
161 | // Follow the use-def chains to form a transitive closure of all the |
162 | // instructions that the originally seeded instructions depend on. |
163 | SmallVector<Instruction *, 8> Worklist(Set.begin(), Set.end()); |
164 | while (!Worklist.empty()) { |
165 | Instruction *I = Worklist.pop_back_val(); |
166 | // Insert instructions from the loop that we depend on. |
167 | for (Value *V : I->operand_values()) { |
168 | auto *I = dyn_cast<Instruction>(Val: V); |
169 | if (I && OrigLoop->contains(BB: I->getParent()) && Set.insert(X: I)) |
170 | Worklist.push_back(Elt: I); |
171 | } |
172 | } |
173 | } |
174 | |
175 | /// Clones the original loop. |
176 | /// |
177 | /// Updates LoopInfo and DominatorTree using the information that block \p |
178 | /// LoopDomBB dominates the loop. |
179 | Loop *(BasicBlock *InsertBefore, BasicBlock *LoopDomBB, |
180 | unsigned Index, LoopInfo *LI, |
181 | DominatorTree *DT) { |
182 | ClonedLoop = ::cloneLoopWithPreheader(Before: InsertBefore, LoopDomBB, OrigLoop, |
183 | VMap, NameSuffix: Twine(".ldist" ) + Twine(Index), |
184 | LI, DT, Blocks&: ClonedLoopBlocks); |
185 | return ClonedLoop; |
186 | } |
187 | |
188 | /// The cloned loop. If this partition is mapped to the original loop, |
189 | /// this is null. |
190 | const Loop *getClonedLoop() const { return ClonedLoop; } |
191 | |
192 | /// Returns the loop where this partition ends up after distribution. |
193 | /// If this partition is mapped to the original loop then use the block from |
194 | /// the loop. |
195 | Loop *getDistributedLoop() const { |
196 | return ClonedLoop ? ClonedLoop : OrigLoop; |
197 | } |
198 | |
199 | /// The VMap that is populated by cloning and then used in |
200 | /// remapinstruction to remap the cloned instructions. |
201 | ValueToValueMapTy &getVMap() { return VMap; } |
202 | |
203 | /// Remaps the cloned instructions using VMap. |
204 | void remapInstructions() { |
205 | remapInstructionsInBlocks(Blocks: ClonedLoopBlocks, VMap); |
206 | } |
207 | |
208 | /// Based on the set of instructions selected for this partition, |
209 | /// removes the unnecessary ones. |
210 | void removeUnusedInsts() { |
211 | SmallVector<Instruction *, 8> Unused; |
212 | |
213 | for (auto *Block : OrigLoop->getBlocks()) |
214 | for (auto &Inst : *Block) |
215 | if (!Set.count(key: &Inst)) { |
216 | Instruction *NewInst = &Inst; |
217 | if (!VMap.empty()) |
218 | NewInst = cast<Instruction>(Val&: VMap[NewInst]); |
219 | |
220 | assert(!isa<BranchInst>(NewInst) && |
221 | "Branches are marked used early on" ); |
222 | Unused.push_back(Elt: NewInst); |
223 | } |
224 | |
225 | // Delete the instructions backwards, as it has a reduced likelihood of |
226 | // having to update as many def-use and use-def chains. |
227 | for (auto *Inst : reverse(C&: Unused)) { |
228 | if (!Inst->use_empty()) |
229 | Inst->replaceAllUsesWith(V: PoisonValue::get(T: Inst->getType())); |
230 | Inst->eraseFromParent(); |
231 | } |
232 | } |
233 | |
234 | void print(raw_ostream &OS) const { |
235 | OS << (DepCycle ? " (cycle)\n" : "\n" ); |
236 | for (auto *I : Set) |
237 | // Prefix with the block name. |
238 | OS << " " << I->getParent()->getName() << ":" << *I << "\n" ; |
239 | } |
240 | |
241 | void printBlocks(raw_ostream &OS) const { |
242 | for (auto *BB : getDistributedLoop()->getBlocks()) |
243 | OS << *BB; |
244 | } |
245 | |
246 | private: |
247 | /// Instructions from OrigLoop selected for this partition. |
248 | InstructionSet Set; |
249 | |
250 | /// Whether this partition contains a dependence cycle. |
251 | bool DepCycle; |
252 | |
253 | /// The original loop. |
254 | Loop *OrigLoop; |
255 | |
256 | /// The cloned loop. If this partition is mapped to the original loop, |
257 | /// this is null. |
258 | Loop *ClonedLoop = nullptr; |
259 | |
260 | /// The blocks of ClonedLoop including the preheader. If this |
261 | /// partition is mapped to the original loop, this is empty. |
262 | SmallVector<BasicBlock *, 8> ClonedLoopBlocks; |
263 | |
264 | /// These gets populated once the set of instructions have been |
265 | /// finalized. If this partition is mapped to the original loop, these are not |
266 | /// set. |
267 | ValueToValueMapTy VMap; |
268 | }; |
269 | |
270 | /// Holds the set of Partitions. It populates them, merges them and then |
271 | /// clones the loops. |
272 | class InstPartitionContainer { |
273 | using InstToPartitionIdT = DenseMap<Instruction *, int>; |
274 | |
275 | public: |
276 | InstPartitionContainer(Loop *L, LoopInfo *LI, DominatorTree *DT) |
277 | : L(L), LI(LI), DT(DT) {} |
278 | |
279 | /// Returns the number of partitions. |
280 | unsigned getSize() const { return PartitionContainer.size(); } |
281 | |
282 | /// Adds \p Inst into the current partition if that is marked to |
283 | /// contain cycles. Otherwise start a new partition for it. |
284 | void addToCyclicPartition(Instruction *Inst) { |
285 | // If the current partition is non-cyclic. Start a new one. |
286 | if (PartitionContainer.empty() || !PartitionContainer.back().hasDepCycle()) |
287 | PartitionContainer.emplace_back(args&: Inst, args&: L, /*DepCycle=*/args: true); |
288 | else |
289 | PartitionContainer.back().add(I: Inst); |
290 | } |
291 | |
292 | /// Adds \p Inst into a partition that is not marked to contain |
293 | /// dependence cycles. |
294 | /// |
295 | // Initially we isolate memory instructions into as many partitions as |
296 | // possible, then later we may merge them back together. |
297 | void addToNewNonCyclicPartition(Instruction *Inst) { |
298 | PartitionContainer.emplace_back(args&: Inst, args&: L); |
299 | } |
300 | |
301 | /// Merges adjacent non-cyclic partitions. |
302 | /// |
303 | /// The idea is that we currently only want to isolate the non-vectorizable |
304 | /// partition. We could later allow more distribution among these partition |
305 | /// too. |
306 | void mergeAdjacentNonCyclic() { |
307 | mergeAdjacentPartitionsIf( |
308 | Predicate: [](const InstPartition *P) { return !P->hasDepCycle(); }); |
309 | } |
310 | |
311 | /// If a partition contains only conditional stores, we won't vectorize |
312 | /// it. Try to merge it with a previous cyclic partition. |
313 | void mergeNonIfConvertible() { |
314 | mergeAdjacentPartitionsIf(Predicate: [&](const InstPartition *Partition) { |
315 | if (Partition->hasDepCycle()) |
316 | return true; |
317 | |
318 | // Now, check if all stores are conditional in this partition. |
319 | bool seenStore = false; |
320 | |
321 | for (auto *Inst : *Partition) |
322 | if (isa<StoreInst>(Val: Inst)) { |
323 | seenStore = true; |
324 | if (!LoopAccessInfo::blockNeedsPredication(BB: Inst->getParent(), TheLoop: L, DT)) |
325 | return false; |
326 | } |
327 | return seenStore; |
328 | }); |
329 | } |
330 | |
331 | /// Merges the partitions according to various heuristics. |
332 | void mergeBeforePopulating() { |
333 | mergeAdjacentNonCyclic(); |
334 | if (!DistributeNonIfConvertible) |
335 | mergeNonIfConvertible(); |
336 | } |
337 | |
338 | /// Merges partitions in order to ensure that no loads are duplicated. |
339 | /// |
340 | /// We can't duplicate loads because that could potentially reorder them. |
341 | /// LoopAccessAnalysis provides dependency information with the context that |
342 | /// the order of memory operation is preserved. |
343 | /// |
344 | /// Return if any partitions were merged. |
345 | bool mergeToAvoidDuplicatedLoads() { |
346 | using LoadToPartitionT = DenseMap<Instruction *, InstPartition *>; |
347 | using ToBeMergedT = EquivalenceClasses<InstPartition *>; |
348 | |
349 | LoadToPartitionT LoadToPartition; |
350 | ToBeMergedT ToBeMerged; |
351 | |
352 | // Step through the partitions and create equivalence between partitions |
353 | // that contain the same load. Also put partitions in between them in the |
354 | // same equivalence class to avoid reordering of memory operations. |
355 | for (PartitionContainerT::iterator I = PartitionContainer.begin(), |
356 | E = PartitionContainer.end(); |
357 | I != E; ++I) { |
358 | auto *PartI = &*I; |
359 | |
360 | // If a load occurs in two partitions PartI and PartJ, merge all |
361 | // partitions (PartI, PartJ] into PartI. |
362 | for (Instruction *Inst : *PartI) |
363 | if (isa<LoadInst>(Val: Inst)) { |
364 | bool NewElt; |
365 | LoadToPartitionT::iterator LoadToPart; |
366 | |
367 | std::tie(args&: LoadToPart, args&: NewElt) = |
368 | LoadToPartition.insert(KV: std::make_pair(x&: Inst, y&: PartI)); |
369 | if (!NewElt) { |
370 | LLVM_DEBUG( |
371 | dbgs() |
372 | << "LDist: Merging partitions due to this load in multiple " |
373 | << "partitions: " << PartI << ", " << LoadToPart->second << "\n" |
374 | << *Inst << "\n" ); |
375 | |
376 | auto PartJ = I; |
377 | do { |
378 | --PartJ; |
379 | ToBeMerged.unionSets(V1: PartI, V2: &*PartJ); |
380 | } while (&*PartJ != LoadToPart->second); |
381 | } |
382 | } |
383 | } |
384 | if (ToBeMerged.empty()) |
385 | return false; |
386 | |
387 | // Merge the member of an equivalence class into its class leader. This |
388 | // makes the members empty. |
389 | for (ToBeMergedT::iterator I = ToBeMerged.begin(), E = ToBeMerged.end(); |
390 | I != E; ++I) { |
391 | if (!I->isLeader()) |
392 | continue; |
393 | |
394 | auto PartI = I->getData(); |
395 | for (auto *PartJ : make_range(x: std::next(x: ToBeMerged.member_begin(I)), |
396 | y: ToBeMerged.member_end())) { |
397 | PartJ->moveTo(Other&: *PartI); |
398 | } |
399 | } |
400 | |
401 | // Remove the empty partitions. |
402 | PartitionContainer.remove_if( |
403 | pred: [](const InstPartition &P) { return P.empty(); }); |
404 | |
405 | return true; |
406 | } |
407 | |
408 | /// Sets up the mapping between instructions to partitions. If the |
409 | /// instruction is duplicated across multiple partitions, set the entry to -1. |
410 | void setupPartitionIdOnInstructions() { |
411 | int PartitionID = 0; |
412 | for (const auto &Partition : PartitionContainer) { |
413 | for (Instruction *Inst : Partition) { |
414 | bool NewElt; |
415 | InstToPartitionIdT::iterator Iter; |
416 | |
417 | std::tie(args&: Iter, args&: NewElt) = |
418 | InstToPartitionId.insert(KV: std::make_pair(x&: Inst, y&: PartitionID)); |
419 | if (!NewElt) |
420 | Iter->second = -1; |
421 | } |
422 | ++PartitionID; |
423 | } |
424 | } |
425 | |
426 | /// Populates the partition with everything that the seeding |
427 | /// instructions require. |
428 | void populateUsedSet() { |
429 | for (auto &P : PartitionContainer) |
430 | P.populateUsedSet(); |
431 | } |
432 | |
433 | /// This performs the main chunk of the work of cloning the loops for |
434 | /// the partitions. |
435 | void cloneLoops() { |
436 | BasicBlock *OrigPH = L->getLoopPreheader(); |
437 | // At this point the predecessor of the preheader is either the memcheck |
438 | // block or the top part of the original preheader. |
439 | BasicBlock *Pred = OrigPH->getSinglePredecessor(); |
440 | assert(Pred && "Preheader does not have a single predecessor" ); |
441 | BasicBlock *ExitBlock = L->getExitBlock(); |
442 | assert(ExitBlock && "No single exit block" ); |
443 | Loop *NewLoop; |
444 | |
445 | assert(!PartitionContainer.empty() && "at least two partitions expected" ); |
446 | // We're cloning the preheader along with the loop so we already made sure |
447 | // it was empty. |
448 | assert(&*OrigPH->begin() == OrigPH->getTerminator() && |
449 | "preheader not empty" ); |
450 | |
451 | // Preserve the original loop ID for use after the transformation. |
452 | MDNode *OrigLoopID = L->getLoopID(); |
453 | |
454 | // Create a loop for each partition except the last. Clone the original |
455 | // loop before PH along with adding a preheader for the cloned loop. Then |
456 | // update PH to point to the newly added preheader. |
457 | BasicBlock *TopPH = OrigPH; |
458 | unsigned Index = getSize() - 1; |
459 | for (auto &Part : llvm::drop_begin(RangeOrContainer: llvm::reverse(C&: PartitionContainer))) { |
460 | NewLoop = Part.cloneLoopWithPreheader(InsertBefore: TopPH, LoopDomBB: Pred, Index, LI, DT); |
461 | |
462 | Part.getVMap()[ExitBlock] = TopPH; |
463 | Part.remapInstructions(); |
464 | setNewLoopID(OrigLoopID, Part: &Part); |
465 | --Index; |
466 | TopPH = NewLoop->getLoopPreheader(); |
467 | } |
468 | Pred->getTerminator()->replaceUsesOfWith(From: OrigPH, To: TopPH); |
469 | |
470 | // Also set a new loop ID for the last loop. |
471 | setNewLoopID(OrigLoopID, Part: &PartitionContainer.back()); |
472 | |
473 | // Now go in forward order and update the immediate dominator for the |
474 | // preheaders with the exiting block of the previous loop. Dominance |
475 | // within the loop is updated in cloneLoopWithPreheader. |
476 | for (auto Curr = PartitionContainer.cbegin(), |
477 | Next = std::next(x: PartitionContainer.cbegin()), |
478 | E = PartitionContainer.cend(); |
479 | Next != E; ++Curr, ++Next) |
480 | DT->changeImmediateDominator( |
481 | BB: Next->getDistributedLoop()->getLoopPreheader(), |
482 | NewBB: Curr->getDistributedLoop()->getExitingBlock()); |
483 | } |
484 | |
485 | /// Removes the dead instructions from the cloned loops. |
486 | void removeUnusedInsts() { |
487 | for (auto &Partition : PartitionContainer) |
488 | Partition.removeUnusedInsts(); |
489 | } |
490 | |
491 | /// For each memory pointer, it computes the partitionId the pointer is |
492 | /// used in. |
493 | /// |
494 | /// This returns an array of int where the I-th entry corresponds to I-th |
495 | /// entry in LAI.getRuntimePointerCheck(). If the pointer is used in multiple |
496 | /// partitions its entry is set to -1. |
497 | SmallVector<int, 8> |
498 | computePartitionSetForPointers(const LoopAccessInfo &LAI) { |
499 | const RuntimePointerChecking *RtPtrCheck = LAI.getRuntimePointerChecking(); |
500 | |
501 | unsigned N = RtPtrCheck->Pointers.size(); |
502 | SmallVector<int, 8> PtrToPartitions(N); |
503 | for (unsigned I = 0; I < N; ++I) { |
504 | Value *Ptr = RtPtrCheck->Pointers[I].PointerValue; |
505 | auto Instructions = |
506 | LAI.getInstructionsForAccess(Ptr, isWrite: RtPtrCheck->Pointers[I].IsWritePtr); |
507 | |
508 | int &Partition = PtrToPartitions[I]; |
509 | // First set it to uninitialized. |
510 | Partition = -2; |
511 | for (Instruction *Inst : Instructions) { |
512 | // Note that this could be -1 if Inst is duplicated across multiple |
513 | // partitions. |
514 | int ThisPartition = this->InstToPartitionId[Inst]; |
515 | if (Partition == -2) |
516 | Partition = ThisPartition; |
517 | // -1 means belonging to multiple partitions. |
518 | else if (Partition == -1) |
519 | break; |
520 | else if (Partition != (int)ThisPartition) |
521 | Partition = -1; |
522 | } |
523 | assert(Partition != -2 && "Pointer not belonging to any partition" ); |
524 | } |
525 | |
526 | return PtrToPartitions; |
527 | } |
528 | |
529 | void print(raw_ostream &OS) const { |
530 | unsigned Index = 0; |
531 | for (const auto &P : PartitionContainer) { |
532 | OS << "LDist: Partition " << Index++ << ":" ; |
533 | P.print(OS); |
534 | } |
535 | } |
536 | |
537 | void dump() const { print(OS&: dbgs()); } |
538 | |
539 | #ifndef NDEBUG |
540 | friend raw_ostream &operator<<(raw_ostream &OS, |
541 | const InstPartitionContainer &Partitions) { |
542 | Partitions.print(OS); |
543 | return OS; |
544 | } |
545 | #endif |
546 | |
547 | void printBlocks(raw_ostream &OS) const { |
548 | unsigned Index = 0; |
549 | for (const auto &P : PartitionContainer) { |
550 | OS << "LDist: Partition " << Index++ << ":" ; |
551 | P.printBlocks(OS); |
552 | } |
553 | } |
554 | |
555 | private: |
556 | using PartitionContainerT = std::list<InstPartition>; |
557 | |
558 | /// List of partitions. |
559 | PartitionContainerT PartitionContainer; |
560 | |
561 | /// Mapping from Instruction to partition Id. If the instruction |
562 | /// belongs to multiple partitions the entry contains -1. |
563 | InstToPartitionIdT InstToPartitionId; |
564 | |
565 | Loop *L; |
566 | LoopInfo *LI; |
567 | DominatorTree *DT; |
568 | |
569 | /// The control structure to merge adjacent partitions if both satisfy |
570 | /// the \p Predicate. |
571 | template <class UnaryPredicate> |
572 | void mergeAdjacentPartitionsIf(UnaryPredicate Predicate) { |
573 | InstPartition *PrevMatch = nullptr; |
574 | for (auto I = PartitionContainer.begin(); I != PartitionContainer.end();) { |
575 | auto DoesMatch = Predicate(&*I); |
576 | if (PrevMatch == nullptr && DoesMatch) { |
577 | PrevMatch = &*I; |
578 | ++I; |
579 | } else if (PrevMatch != nullptr && DoesMatch) { |
580 | I->moveTo(Other&: *PrevMatch); |
581 | I = PartitionContainer.erase(position: I); |
582 | } else { |
583 | PrevMatch = nullptr; |
584 | ++I; |
585 | } |
586 | } |
587 | } |
588 | |
589 | /// Assign new LoopIDs for the partition's cloned loop. |
590 | void setNewLoopID(MDNode *OrigLoopID, InstPartition *Part) { |
591 | std::optional<MDNode *> PartitionID = makeFollowupLoopID( |
592 | OrigLoopID, |
593 | FollowupAttrs: {LLVMLoopDistributeFollowupAll, |
594 | Part->hasDepCycle() ? LLVMLoopDistributeFollowupSequential |
595 | : LLVMLoopDistributeFollowupCoincident}); |
596 | if (PartitionID) { |
597 | Loop *NewLoop = Part->getDistributedLoop(); |
598 | NewLoop->setLoopID(*PartitionID); |
599 | } |
600 | } |
601 | }; |
602 | |
603 | /// For each memory instruction, this class maintains difference of the |
604 | /// number of unsafe dependences that start out from this instruction minus |
605 | /// those that end here. |
606 | /// |
607 | /// By traversing the memory instructions in program order and accumulating this |
608 | /// number, we know whether any unsafe dependence crosses over a program point. |
609 | class MemoryInstructionDependences { |
610 | using Dependence = MemoryDepChecker::Dependence; |
611 | |
612 | public: |
613 | struct Entry { |
614 | Instruction *Inst; |
615 | unsigned NumUnsafeDependencesStartOrEnd = 0; |
616 | |
617 | Entry(Instruction *Inst) : Inst(Inst) {} |
618 | }; |
619 | |
620 | using AccessesType = SmallVector<Entry, 8>; |
621 | |
622 | AccessesType::const_iterator begin() const { return Accesses.begin(); } |
623 | AccessesType::const_iterator end() const { return Accesses.end(); } |
624 | |
625 | MemoryInstructionDependences( |
626 | const SmallVectorImpl<Instruction *> &Instructions, |
627 | const SmallVectorImpl<Dependence> &Dependences) { |
628 | Accesses.append(in_start: Instructions.begin(), in_end: Instructions.end()); |
629 | |
630 | LLVM_DEBUG(dbgs() << "LDist: Backward dependences:\n" ); |
631 | for (const auto &Dep : Dependences) |
632 | if (Dep.isPossiblyBackward()) { |
633 | // Note that the designations source and destination follow the program |
634 | // order, i.e. source is always first. (The direction is given by the |
635 | // DepType.) |
636 | ++Accesses[Dep.Source].NumUnsafeDependencesStartOrEnd; |
637 | --Accesses[Dep.Destination].NumUnsafeDependencesStartOrEnd; |
638 | |
639 | LLVM_DEBUG(Dep.print(dbgs(), 2, Instructions)); |
640 | } |
641 | } |
642 | |
643 | private: |
644 | AccessesType Accesses; |
645 | }; |
646 | |
647 | /// The actual class performing the per-loop work. |
648 | class LoopDistributeForLoop { |
649 | public: |
650 | LoopDistributeForLoop(Loop *L, Function *F, LoopInfo *LI, DominatorTree *DT, |
651 | ScalarEvolution *SE, LoopAccessInfoManager &LAIs, |
652 | OptimizationRemarkEmitter *ORE) |
653 | : L(L), F(F), LI(LI), DT(DT), SE(SE), LAIs(LAIs), ORE(ORE) { |
654 | setForced(); |
655 | } |
656 | |
657 | /// Try to distribute an inner-most loop. |
658 | bool processLoop() { |
659 | assert(L->isInnermost() && "Only process inner loops." ); |
660 | |
661 | LLVM_DEBUG(dbgs() << "\nLDist: Checking a loop in '" |
662 | << L->getHeader()->getParent()->getName() << "' from " |
663 | << L->getLocStr() << "\n" ); |
664 | |
665 | // Having a single exit block implies there's also one exiting block. |
666 | if (!L->getExitBlock()) |
667 | return fail(RemarkName: "MultipleExitBlocks" , Message: "multiple exit blocks" ); |
668 | if (!L->isLoopSimplifyForm()) |
669 | return fail(RemarkName: "NotLoopSimplifyForm" , |
670 | Message: "loop is not in loop-simplify form" ); |
671 | if (!L->isRotatedForm()) |
672 | return fail(RemarkName: "NotBottomTested" , Message: "loop is not bottom tested" ); |
673 | |
674 | BasicBlock *PH = L->getLoopPreheader(); |
675 | |
676 | LAI = &LAIs.getInfo(L&: *L); |
677 | |
678 | // Currently, we only distribute to isolate the part of the loop with |
679 | // dependence cycles to enable partial vectorization. |
680 | if (LAI->canVectorizeMemory()) |
681 | return fail(RemarkName: "MemOpsCanBeVectorized" , |
682 | Message: "memory operations are safe for vectorization" ); |
683 | |
684 | auto *Dependences = LAI->getDepChecker().getDependences(); |
685 | if (!Dependences || Dependences->empty()) |
686 | return fail(RemarkName: "NoUnsafeDeps" , Message: "no unsafe dependences to isolate" ); |
687 | |
688 | LLVM_DEBUG(dbgs() << "LDist: Found a candidate loop: " |
689 | << L->getHeader()->getName() << "\n" ); |
690 | |
691 | InstPartitionContainer Partitions(L, LI, DT); |
692 | |
693 | // First, go through each memory operation and assign them to consecutive |
694 | // partitions (the order of partitions follows program order). Put those |
695 | // with unsafe dependences into "cyclic" partition otherwise put each store |
696 | // in its own "non-cyclic" partition (we'll merge these later). |
697 | // |
698 | // Note that a memory operation (e.g. Load2 below) at a program point that |
699 | // has an unsafe dependence (Store3->Load1) spanning over it must be |
700 | // included in the same cyclic partition as the dependent operations. This |
701 | // is to preserve the original program order after distribution. E.g.: |
702 | // |
703 | // NumUnsafeDependencesStartOrEnd NumUnsafeDependencesActive |
704 | // Load1 -. 1 0->1 |
705 | // Load2 | /Unsafe/ 0 1 |
706 | // Store3 -' -1 1->0 |
707 | // Load4 0 0 |
708 | // |
709 | // NumUnsafeDependencesActive > 0 indicates this situation and in this case |
710 | // we just keep assigning to the same cyclic partition until |
711 | // NumUnsafeDependencesActive reaches 0. |
712 | const MemoryDepChecker &DepChecker = LAI->getDepChecker(); |
713 | MemoryInstructionDependences MID(DepChecker.getMemoryInstructions(), |
714 | *Dependences); |
715 | |
716 | int NumUnsafeDependencesActive = 0; |
717 | for (const auto &InstDep : MID) { |
718 | Instruction *I = InstDep.Inst; |
719 | // We update NumUnsafeDependencesActive post-instruction, catch the |
720 | // start of a dependence directly via NumUnsafeDependencesStartOrEnd. |
721 | if (NumUnsafeDependencesActive || |
722 | InstDep.NumUnsafeDependencesStartOrEnd > 0) |
723 | Partitions.addToCyclicPartition(Inst: I); |
724 | else |
725 | Partitions.addToNewNonCyclicPartition(Inst: I); |
726 | NumUnsafeDependencesActive += InstDep.NumUnsafeDependencesStartOrEnd; |
727 | assert(NumUnsafeDependencesActive >= 0 && |
728 | "Negative number of dependences active" ); |
729 | } |
730 | |
731 | // Add partitions for values used outside. These partitions can be out of |
732 | // order from the original program order. This is OK because if the |
733 | // partition uses a load we will merge this partition with the original |
734 | // partition of the load that we set up in the previous loop (see |
735 | // mergeToAvoidDuplicatedLoads). |
736 | auto DefsUsedOutside = findDefsUsedOutsideOfLoop(L); |
737 | for (auto *Inst : DefsUsedOutside) |
738 | Partitions.addToNewNonCyclicPartition(Inst); |
739 | |
740 | LLVM_DEBUG(dbgs() << "LDist: Seeded partitions:\n" << Partitions); |
741 | if (Partitions.getSize() < 2) |
742 | return fail(RemarkName: "CantIsolateUnsafeDeps" , |
743 | Message: "cannot isolate unsafe dependencies" ); |
744 | |
745 | // Run the merge heuristics: Merge non-cyclic adjacent partitions since we |
746 | // should be able to vectorize these together. |
747 | Partitions.mergeBeforePopulating(); |
748 | LLVM_DEBUG(dbgs() << "LDist: Merged partitions:\n" << Partitions); |
749 | if (Partitions.getSize() < 2) |
750 | return fail(RemarkName: "CantIsolateUnsafeDeps" , |
751 | Message: "cannot isolate unsafe dependencies" ); |
752 | |
753 | // Now, populate the partitions with non-memory operations. |
754 | Partitions.populateUsedSet(); |
755 | LLVM_DEBUG(dbgs() << "LDist: Populated partitions:\n" << Partitions); |
756 | |
757 | // In order to preserve original lexical order for loads, keep them in the |
758 | // partition that we set up in the MemoryInstructionDependences loop. |
759 | if (Partitions.mergeToAvoidDuplicatedLoads()) { |
760 | LLVM_DEBUG(dbgs() << "LDist: Partitions merged to ensure unique loads:\n" |
761 | << Partitions); |
762 | if (Partitions.getSize() < 2) |
763 | return fail(RemarkName: "CantIsolateUnsafeDeps" , |
764 | Message: "cannot isolate unsafe dependencies" ); |
765 | } |
766 | |
767 | // Don't distribute the loop if we need too many SCEV run-time checks, or |
768 | // any if it's illegal. |
769 | const SCEVPredicate &Pred = LAI->getPSE().getPredicate(); |
770 | if (LAI->hasConvergentOp() && !Pred.isAlwaysTrue()) { |
771 | return fail(RemarkName: "RuntimeCheckWithConvergent" , |
772 | Message: "may not insert runtime check with convergent operation" ); |
773 | } |
774 | |
775 | if (Pred.getComplexity() > (IsForced.value_or(u: false) |
776 | ? PragmaDistributeSCEVCheckThreshold |
777 | : DistributeSCEVCheckThreshold)) |
778 | return fail(RemarkName: "TooManySCEVRuntimeChecks" , |
779 | Message: "too many SCEV run-time checks needed.\n" ); |
780 | |
781 | if (!IsForced.value_or(u: false) && hasDisableAllTransformsHint(L)) |
782 | return fail(RemarkName: "HeuristicDisabled" , Message: "distribution heuristic disabled" ); |
783 | |
784 | LLVM_DEBUG(dbgs() << "LDist: Distributing loop: " |
785 | << L->getHeader()->getName() << "\n" ); |
786 | // We're done forming the partitions set up the reverse mapping from |
787 | // instructions to partitions. |
788 | Partitions.setupPartitionIdOnInstructions(); |
789 | |
790 | // If we need run-time checks, version the loop now. |
791 | auto PtrToPartition = Partitions.computePartitionSetForPointers(LAI: *LAI); |
792 | const auto *RtPtrChecking = LAI->getRuntimePointerChecking(); |
793 | const auto &AllChecks = RtPtrChecking->getChecks(); |
794 | auto Checks = includeOnlyCrossPartitionChecks(AllChecks, PtrToPartition, |
795 | RtPtrChecking); |
796 | |
797 | if (LAI->hasConvergentOp() && !Checks.empty()) { |
798 | return fail(RemarkName: "RuntimeCheckWithConvergent" , |
799 | Message: "may not insert runtime check with convergent operation" ); |
800 | } |
801 | |
802 | // To keep things simple have an empty preheader before we version or clone |
803 | // the loop. (Also split if this has no predecessor, i.e. entry, because we |
804 | // rely on PH having a predecessor.) |
805 | if (!PH->getSinglePredecessor() || &*PH->begin() != PH->getTerminator()) |
806 | SplitBlock(Old: PH, SplitPt: PH->getTerminator(), DT, LI); |
807 | |
808 | if (!Pred.isAlwaysTrue() || !Checks.empty()) { |
809 | assert(!LAI->hasConvergentOp() && "inserting illegal loop versioning" ); |
810 | |
811 | MDNode *OrigLoopID = L->getLoopID(); |
812 | |
813 | LLVM_DEBUG(dbgs() << "LDist: Pointers:\n" ); |
814 | LLVM_DEBUG(LAI->getRuntimePointerChecking()->printChecks(dbgs(), Checks)); |
815 | LoopVersioning LVer(*LAI, Checks, L, LI, DT, SE); |
816 | LVer.versionLoop(DefsUsedOutside); |
817 | LVer.annotateLoopWithNoAlias(); |
818 | |
819 | // The unversioned loop will not be changed, so we inherit all attributes |
820 | // from the original loop, but remove the loop distribution metadata to |
821 | // avoid to distribute it again. |
822 | MDNode *UnversionedLoopID = *makeFollowupLoopID( |
823 | OrigLoopID, |
824 | FollowupAttrs: {LLVMLoopDistributeFollowupAll, LLVMLoopDistributeFollowupFallback}, |
825 | InheritOptionsAttrsPrefix: "llvm.loop.distribute." , AlwaysNew: true); |
826 | LVer.getNonVersionedLoop()->setLoopID(UnversionedLoopID); |
827 | } |
828 | |
829 | // Create identical copies of the original loop for each partition and hook |
830 | // them up sequentially. |
831 | Partitions.cloneLoops(); |
832 | |
833 | // Now, we remove the instruction from each loop that don't belong to that |
834 | // partition. |
835 | Partitions.removeUnusedInsts(); |
836 | LLVM_DEBUG(dbgs() << "LDist: After removing unused Instrs:\n" ); |
837 | LLVM_DEBUG(Partitions.printBlocks(dbgs())); |
838 | |
839 | if (LDistVerify) { |
840 | LI->verify(DomTree: *DT); |
841 | assert(DT->verify(DominatorTree::VerificationLevel::Fast)); |
842 | } |
843 | |
844 | ++NumLoopsDistributed; |
845 | // Report the success. |
846 | ORE->emit(RemarkBuilder: [&]() { |
847 | return OptimizationRemark(LDIST_NAME, "Distribute" , L->getStartLoc(), |
848 | L->getHeader()) |
849 | << "distributed loop" ; |
850 | }); |
851 | return true; |
852 | } |
853 | |
854 | /// Provide diagnostics then \return with false. |
855 | bool fail(StringRef , StringRef Message) { |
856 | LLVMContext &Ctx = F->getContext(); |
857 | bool Forced = isForced().value_or(u: false); |
858 | |
859 | LLVM_DEBUG(dbgs() << "LDist: Skipping; " << Message << "\n" ); |
860 | |
861 | // With Rpass-missed report that distribution failed. |
862 | ORE->emit(RemarkBuilder: [&]() { |
863 | return OptimizationRemarkMissed(LDIST_NAME, "NotDistributed" , |
864 | L->getStartLoc(), L->getHeader()) |
865 | << "loop not distributed: use -Rpass-analysis=loop-distribute for " |
866 | "more " |
867 | "info" ; |
868 | }); |
869 | |
870 | // With Rpass-analysis report why. This is on by default if distribution |
871 | // was requested explicitly. |
872 | ORE->emit(OptDiag&: OptimizationRemarkAnalysis( |
873 | Forced ? OptimizationRemarkAnalysis::AlwaysPrint : LDIST_NAME, |
874 | RemarkName, L->getStartLoc(), L->getHeader()) |
875 | << "loop not distributed: " << Message); |
876 | |
877 | // Also issue a warning if distribution was requested explicitly but it |
878 | // failed. |
879 | if (Forced) |
880 | Ctx.diagnose(DI: DiagnosticInfoOptimizationFailure( |
881 | *F, L->getStartLoc(), "loop not distributed: failed " |
882 | "explicitly specified loop distribution" )); |
883 | |
884 | return false; |
885 | } |
886 | |
887 | /// Return if distribution forced to be enabled/disabled for the loop. |
888 | /// |
889 | /// If the optional has a value, it indicates whether distribution was forced |
890 | /// to be enabled (true) or disabled (false). If the optional has no value |
891 | /// distribution was not forced either way. |
892 | const std::optional<bool> &isForced() const { return IsForced; } |
893 | |
894 | private: |
895 | /// Filter out checks between pointers from the same partition. |
896 | /// |
897 | /// \p PtrToPartition contains the partition number for pointers. Partition |
898 | /// number -1 means that the pointer is used in multiple partitions. In this |
899 | /// case we can't safely omit the check. |
900 | SmallVector<RuntimePointerCheck, 4> includeOnlyCrossPartitionChecks( |
901 | const SmallVectorImpl<RuntimePointerCheck> &AllChecks, |
902 | const SmallVectorImpl<int> &PtrToPartition, |
903 | const RuntimePointerChecking *RtPtrChecking) { |
904 | SmallVector<RuntimePointerCheck, 4> Checks; |
905 | |
906 | copy_if(Range: AllChecks, Out: std::back_inserter(x&: Checks), |
907 | P: [&](const RuntimePointerCheck &Check) { |
908 | for (unsigned PtrIdx1 : Check.first->Members) |
909 | for (unsigned PtrIdx2 : Check.second->Members) |
910 | // Only include this check if there is a pair of pointers |
911 | // that require checking and the pointers fall into |
912 | // separate partitions. |
913 | // |
914 | // (Note that we already know at this point that the two |
915 | // pointer groups need checking but it doesn't follow |
916 | // that each pair of pointers within the two groups need |
917 | // checking as well. |
918 | // |
919 | // In other words we don't want to include a check just |
920 | // because there is a pair of pointers between the two |
921 | // pointer groups that require checks and a different |
922 | // pair whose pointers fall into different partitions.) |
923 | if (RtPtrChecking->needsChecking(I: PtrIdx1, J: PtrIdx2) && |
924 | !RuntimePointerChecking::arePointersInSamePartition( |
925 | PtrToPartition, PtrIdx1, PtrIdx2)) |
926 | return true; |
927 | return false; |
928 | }); |
929 | |
930 | return Checks; |
931 | } |
932 | |
933 | /// Check whether the loop metadata is forcing distribution to be |
934 | /// enabled/disabled. |
935 | void setForced() { |
936 | std::optional<const MDOperand *> Value = |
937 | findStringMetadataForLoop(TheLoop: L, Name: "llvm.loop.distribute.enable" ); |
938 | if (!Value) |
939 | return; |
940 | |
941 | const MDOperand *Op = *Value; |
942 | assert(Op && mdconst::hasa<ConstantInt>(*Op) && "invalid metadata" ); |
943 | IsForced = mdconst::extract<ConstantInt>(MD: *Op)->getZExtValue(); |
944 | } |
945 | |
946 | Loop *L; |
947 | Function *F; |
948 | |
949 | // Analyses used. |
950 | LoopInfo *LI; |
951 | const LoopAccessInfo *LAI = nullptr; |
952 | DominatorTree *DT; |
953 | ScalarEvolution *SE; |
954 | LoopAccessInfoManager &LAIs; |
955 | OptimizationRemarkEmitter *ORE; |
956 | |
957 | /// Indicates whether distribution is forced to be enabled/disabled for |
958 | /// the loop. |
959 | /// |
960 | /// If the optional has a value, it indicates whether distribution was forced |
961 | /// to be enabled (true) or disabled (false). If the optional has no value |
962 | /// distribution was not forced either way. |
963 | std::optional<bool> IsForced; |
964 | }; |
965 | |
966 | } // end anonymous namespace |
967 | |
968 | static bool (Function &F, LoopInfo *LI, DominatorTree *DT, |
969 | ScalarEvolution *SE, OptimizationRemarkEmitter *ORE, |
970 | LoopAccessInfoManager &LAIs) { |
971 | // Build up a worklist of inner-loops to distribute. This is necessary as the |
972 | // act of distributing a loop creates new loops and can invalidate iterators |
973 | // across the loops. |
974 | SmallVector<Loop *, 8> Worklist; |
975 | |
976 | for (Loop *TopLevelLoop : *LI) |
977 | for (Loop *L : depth_first(G: TopLevelLoop)) |
978 | // We only handle inner-most loops. |
979 | if (L->isInnermost()) |
980 | Worklist.push_back(Elt: L); |
981 | |
982 | // Now walk the identified inner loops. |
983 | bool Changed = false; |
984 | for (Loop *L : Worklist) { |
985 | LoopDistributeForLoop LDL(L, &F, LI, DT, SE, LAIs, ORE); |
986 | |
987 | // If distribution was forced for the specific loop to be |
988 | // enabled/disabled, follow that. Otherwise use the global flag. |
989 | if (LDL.isForced().value_or(u&: EnableLoopDistribute)) |
990 | Changed |= LDL.processLoop(); |
991 | } |
992 | |
993 | // Process each loop nest in the function. |
994 | return Changed; |
995 | } |
996 | |
997 | PreservedAnalyses LoopDistributePass::run(Function &F, |
998 | FunctionAnalysisManager &AM) { |
999 | auto &LI = AM.getResult<LoopAnalysis>(IR&: F); |
1000 | auto &DT = AM.getResult<DominatorTreeAnalysis>(IR&: F); |
1001 | auto &SE = AM.getResult<ScalarEvolutionAnalysis>(IR&: F); |
1002 | auto &ORE = AM.getResult<OptimizationRemarkEmitterAnalysis>(IR&: F); |
1003 | |
1004 | LoopAccessInfoManager &LAIs = AM.getResult<LoopAccessAnalysis>(IR&: F); |
1005 | bool Changed = runImpl(F, LI: &LI, DT: &DT, SE: &SE, ORE: &ORE, LAIs); |
1006 | if (!Changed) |
1007 | return PreservedAnalyses::all(); |
1008 | PreservedAnalyses PA; |
1009 | PA.preserve<LoopAnalysis>(); |
1010 | PA.preserve<DominatorTreeAnalysis>(); |
1011 | return PA; |
1012 | } |
1013 | |