1 | //===-- VPlanTransforms.cpp - Utility VPlan to VPlan transforms -----------===// |
2 | // |
3 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
4 | // See https://llvm.org/LICENSE.txt for license information. |
5 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
6 | // |
7 | //===----------------------------------------------------------------------===// |
8 | /// |
9 | /// \file |
10 | /// This file implements a set of utility VPlan to VPlan transformations. |
11 | /// |
12 | //===----------------------------------------------------------------------===// |
13 | |
14 | #include "VPlanTransforms.h" |
15 | #include "VPRecipeBuilder.h" |
16 | #include "VPlanAnalysis.h" |
17 | #include "VPlanCFG.h" |
18 | #include "VPlanDominatorTree.h" |
19 | #include "VPlanPatternMatch.h" |
20 | #include "llvm/ADT/PostOrderIterator.h" |
21 | #include "llvm/ADT/STLExtras.h" |
22 | #include "llvm/ADT/SetVector.h" |
23 | #include "llvm/Analysis/IVDescriptors.h" |
24 | #include "llvm/Analysis/VectorUtils.h" |
25 | #include "llvm/IR/Intrinsics.h" |
26 | #include "llvm/IR/PatternMatch.h" |
27 | |
28 | using namespace llvm; |
29 | |
30 | void VPlanTransforms::VPInstructionsToVPRecipes( |
31 | VPlanPtr &Plan, |
32 | function_ref<const InductionDescriptor *(PHINode *)> |
33 | GetIntOrFpInductionDescriptor, |
34 | ScalarEvolution &SE, const TargetLibraryInfo &TLI) { |
35 | |
36 | ReversePostOrderTraversal<VPBlockDeepTraversalWrapper<VPBlockBase *>> RPOT( |
37 | Plan->getVectorLoopRegion()); |
38 | for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(Range: RPOT)) { |
39 | // Skip blocks outside region |
40 | if (!VPBB->getParent()) |
41 | break; |
42 | VPRecipeBase *Term = VPBB->getTerminator(); |
43 | auto EndIter = Term ? Term->getIterator() : VPBB->end(); |
44 | // Introduce each ingredient into VPlan. |
45 | for (VPRecipeBase &Ingredient : |
46 | make_early_inc_range(Range: make_range(x: VPBB->begin(), y: EndIter))) { |
47 | |
48 | VPValue *VPV = Ingredient.getVPSingleValue(); |
49 | Instruction *Inst = cast<Instruction>(Val: VPV->getUnderlyingValue()); |
50 | |
51 | VPRecipeBase *NewRecipe = nullptr; |
52 | if (auto *VPPhi = dyn_cast<VPWidenPHIRecipe>(Val: &Ingredient)) { |
53 | auto *Phi = cast<PHINode>(Val: VPPhi->getUnderlyingValue()); |
54 | const auto *II = GetIntOrFpInductionDescriptor(Phi); |
55 | if (!II) |
56 | continue; |
57 | |
58 | VPValue *Start = Plan->getOrAddLiveIn(V: II->getStartValue()); |
59 | VPValue *Step = |
60 | vputils::getOrCreateVPValueForSCEVExpr(Plan&: *Plan, Expr: II->getStep(), SE); |
61 | NewRecipe = new VPWidenIntOrFpInductionRecipe(Phi, Start, Step, *II); |
62 | } else { |
63 | assert(isa<VPInstruction>(&Ingredient) && |
64 | "only VPInstructions expected here" ); |
65 | assert(!isa<PHINode>(Inst) && "phis should be handled above" ); |
66 | // Create VPWidenMemoryRecipe for loads and stores. |
67 | if (LoadInst *Load = dyn_cast<LoadInst>(Val: Inst)) { |
68 | NewRecipe = new VPWidenLoadRecipe( |
69 | *Load, Ingredient.getOperand(N: 0), nullptr /*Mask*/, |
70 | false /*Consecutive*/, false /*Reverse*/, |
71 | Ingredient.getDebugLoc()); |
72 | } else if (StoreInst *Store = dyn_cast<StoreInst>(Val: Inst)) { |
73 | NewRecipe = new VPWidenStoreRecipe( |
74 | *Store, Ingredient.getOperand(N: 1), Ingredient.getOperand(N: 0), |
75 | nullptr /*Mask*/, false /*Consecutive*/, false /*Reverse*/, |
76 | Ingredient.getDebugLoc()); |
77 | } else if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Val: Inst)) { |
78 | NewRecipe = new VPWidenGEPRecipe(GEP, Ingredient.operands()); |
79 | } else if (CallInst *CI = dyn_cast<CallInst>(Val: Inst)) { |
80 | NewRecipe = new VPWidenCallRecipe( |
81 | CI, Ingredient.operands(), getVectorIntrinsicIDForCall(CI, TLI: &TLI), |
82 | CI->getDebugLoc()); |
83 | } else if (SelectInst *SI = dyn_cast<SelectInst>(Val: Inst)) { |
84 | NewRecipe = new VPWidenSelectRecipe(*SI, Ingredient.operands()); |
85 | } else if (auto *CI = dyn_cast<CastInst>(Val: Inst)) { |
86 | NewRecipe = new VPWidenCastRecipe( |
87 | CI->getOpcode(), Ingredient.getOperand(N: 0), CI->getType(), *CI); |
88 | } else { |
89 | NewRecipe = new VPWidenRecipe(*Inst, Ingredient.operands()); |
90 | } |
91 | } |
92 | |
93 | NewRecipe->insertBefore(InsertPos: &Ingredient); |
94 | if (NewRecipe->getNumDefinedValues() == 1) |
95 | VPV->replaceAllUsesWith(New: NewRecipe->getVPSingleValue()); |
96 | else |
97 | assert(NewRecipe->getNumDefinedValues() == 0 && |
               "Only recipes with zero or one defined values expected");
99 | Ingredient.eraseFromParent(); |
100 | } |
101 | } |
102 | } |
103 | |
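/// Try to sink recipes that define operands used inside replicate regions
/// into the region's "then" block, so they only execute when the region's
/// mask is active. Recipes with users outside the target block are cloned if
/// all such users only use their first lane. Returns true if any recipe was
/// sunk.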
104 | static bool sinkScalarOperands(VPlan &Plan) { |
105 | auto Iter = vp_depth_first_deep(G: Plan.getEntry()); |
106 | bool Changed = false; |
107 | // First, collect the operands of all recipes in replicate blocks as seeds for |
108 | // sinking. |
109 | SetVector<std::pair<VPBasicBlock *, VPSingleDefRecipe *>> WorkList; |
110 | for (VPRegionBlock *VPR : VPBlockUtils::blocksOnly<VPRegionBlock>(Range: Iter)) { |
111 | VPBasicBlock *EntryVPBB = VPR->getEntryBasicBlock(); |
112 | if (!VPR->isReplicator() || EntryVPBB->getSuccessors().size() != 2) |
113 | continue; |
114 | VPBasicBlock *VPBB = dyn_cast<VPBasicBlock>(Val: EntryVPBB->getSuccessors()[0]); |
115 | if (!VPBB || VPBB->getSingleSuccessor() != VPR->getExitingBasicBlock()) |
116 | continue; |
117 | for (auto &Recipe : *VPBB) { |
118 | for (VPValue *Op : Recipe.operands()) |
119 | if (auto *Def = |
120 | dyn_cast_or_null<VPSingleDefRecipe>(Val: Op->getDefiningRecipe())) |
121 | WorkList.insert(X: std::make_pair(x&: VPBB, y&: Def)); |
122 | } |
123 | } |
124 | |
125 | bool ScalarVFOnly = Plan.hasScalarVFOnly(); |
126 | // Try to sink each replicate or scalar IV steps recipe in the worklist. |
127 | for (unsigned I = 0; I != WorkList.size(); ++I) { |
128 | VPBasicBlock *SinkTo; |
129 | VPSingleDefRecipe *SinkCandidate; |
130 | std::tie(args&: SinkTo, args&: SinkCandidate) = WorkList[I]; |
131 | if (SinkCandidate->getParent() == SinkTo || |
132 | SinkCandidate->mayHaveSideEffects() || |
133 | SinkCandidate->mayReadOrWriteMemory()) |
134 | continue; |
135 | if (auto *RepR = dyn_cast<VPReplicateRecipe>(Val: SinkCandidate)) { |
136 | if (!ScalarVFOnly && RepR->isUniform()) |
137 | continue; |
138 | } else if (!isa<VPScalarIVStepsRecipe>(Val: SinkCandidate)) |
139 | continue; |
140 | |
141 | bool NeedsDuplicating = false; |
142 | // All recipe users of the sink candidate must be in the same block SinkTo |
    // or all users outside of SinkTo must be uniform-after-vectorization
    // (i.e., only first lane is used). In the latter case, we need to
    // duplicate SinkCandidate.
146 | auto CanSinkWithUser = [SinkTo, &NeedsDuplicating, |
147 | SinkCandidate](VPUser *U) { |
148 | auto *UI = dyn_cast<VPRecipeBase>(Val: U); |
149 | if (!UI) |
150 | return false; |
151 | if (UI->getParent() == SinkTo) |
152 | return true; |
153 | NeedsDuplicating = UI->onlyFirstLaneUsed(Op: SinkCandidate); |
      // We only know how to duplicate VPReplicateRecipes for now.
155 | return NeedsDuplicating && isa<VPReplicateRecipe>(Val: SinkCandidate); |
156 | }; |
157 | if (!all_of(Range: SinkCandidate->users(), P: CanSinkWithUser)) |
158 | continue; |
159 | |
160 | if (NeedsDuplicating) { |
161 | if (ScalarVFOnly) |
162 | continue; |
163 | Instruction *I = SinkCandidate->getUnderlyingInstr(); |
164 | auto *Clone = new VPReplicateRecipe(I, SinkCandidate->operands(), true); |
165 | // TODO: add ".cloned" suffix to name of Clone's VPValue. |
166 | |
167 | Clone->insertBefore(InsertPos: SinkCandidate); |
168 | SinkCandidate->replaceUsesWithIf(New: Clone, ShouldReplace: [SinkTo](VPUser &U, unsigned) { |
169 | return cast<VPRecipeBase>(Val: &U)->getParent() != SinkTo; |
170 | }); |
171 | } |
172 | SinkCandidate->moveBefore(BB&: *SinkTo, I: SinkTo->getFirstNonPhi()); |
173 | for (VPValue *Op : SinkCandidate->operands()) |
174 | if (auto *Def = |
175 | dyn_cast_or_null<VPSingleDefRecipe>(Val: Op->getDefiningRecipe())) |
176 | WorkList.insert(X: std::make_pair(x&: SinkTo, y&: Def)); |
177 | Changed = true; |
178 | } |
179 | return Changed; |
180 | } |
181 | |
182 | /// If \p R is a region with a VPBranchOnMaskRecipe in the entry block, return |
183 | /// the mask. |
static VPValue *getPredicatedMask(VPRegionBlock *R) {
185 | auto *EntryBB = dyn_cast<VPBasicBlock>(Val: R->getEntry()); |
186 | if (!EntryBB || EntryBB->size() != 1 || |
187 | !isa<VPBranchOnMaskRecipe>(Val: EntryBB->begin())) |
188 | return nullptr; |
189 | |
190 | return cast<VPBranchOnMaskRecipe>(Val: &*EntryBB->begin())->getOperand(N: 0); |
191 | } |
192 | |
193 | /// If \p R is a triangle region, return the 'then' block of the triangle. |
194 | static VPBasicBlock *getPredicatedThenBlock(VPRegionBlock *R) { |
195 | auto *EntryBB = cast<VPBasicBlock>(Val: R->getEntry()); |
196 | if (EntryBB->getNumSuccessors() != 2) |
197 | return nullptr; |
198 | |
199 | auto *Succ0 = dyn_cast<VPBasicBlock>(Val: EntryBB->getSuccessors()[0]); |
200 | auto *Succ1 = dyn_cast<VPBasicBlock>(Val: EntryBB->getSuccessors()[1]); |
201 | if (!Succ0 || !Succ1) |
202 | return nullptr; |
203 | |
204 | if (Succ0->getNumSuccessors() + Succ1->getNumSuccessors() != 1) |
205 | return nullptr; |
206 | if (Succ0->getSingleSuccessor() == Succ1) |
207 | return Succ0; |
208 | if (Succ1->getSingleSuccessor() == Succ0) |
209 | return Succ1; |
210 | return nullptr; |
211 | } |
212 | |
// Merge replicate regions into their successor region, if a replicate region
214 | // is connected to a successor replicate region with the same predicate by a |
215 | // single, empty VPBasicBlock. |
216 | static bool mergeReplicateRegionsIntoSuccessors(VPlan &Plan) { |
217 | SetVector<VPRegionBlock *> DeletedRegions; |
218 | |
219 | // Collect replicate regions followed by an empty block, followed by another |
  // replicate region with matching masks, to process up front. This is to avoid
221 | // iterator invalidation issues while merging regions. |
222 | SmallVector<VPRegionBlock *, 8> WorkList; |
223 | for (VPRegionBlock *Region1 : VPBlockUtils::blocksOnly<VPRegionBlock>( |
224 | Range: vp_depth_first_deep(G: Plan.getEntry()))) { |
225 | if (!Region1->isReplicator()) |
226 | continue; |
227 | auto *MiddleBasicBlock = |
228 | dyn_cast_or_null<VPBasicBlock>(Val: Region1->getSingleSuccessor()); |
229 | if (!MiddleBasicBlock || !MiddleBasicBlock->empty()) |
230 | continue; |
231 | |
232 | auto *Region2 = |
233 | dyn_cast_or_null<VPRegionBlock>(Val: MiddleBasicBlock->getSingleSuccessor()); |
234 | if (!Region2 || !Region2->isReplicator()) |
235 | continue; |
236 | |
237 | VPValue *Mask1 = getPredicatedMask(R: Region1); |
238 | VPValue *Mask2 = getPredicatedMask(R: Region2); |
239 | if (!Mask1 || Mask1 != Mask2) |
240 | continue; |
241 | |
    assert(Mask1 && Mask2 && "both regions must have conditions");
243 | WorkList.push_back(Elt: Region1); |
244 | } |
245 | |
246 | // Move recipes from Region1 to its successor region, if both are triangles. |
247 | for (VPRegionBlock *Region1 : WorkList) { |
248 | if (DeletedRegions.contains(key: Region1)) |
249 | continue; |
250 | auto *MiddleBasicBlock = cast<VPBasicBlock>(Val: Region1->getSingleSuccessor()); |
251 | auto *Region2 = cast<VPRegionBlock>(Val: MiddleBasicBlock->getSingleSuccessor()); |
252 | |
253 | VPBasicBlock *Then1 = getPredicatedThenBlock(R: Region1); |
254 | VPBasicBlock *Then2 = getPredicatedThenBlock(R: Region2); |
255 | if (!Then1 || !Then2) |
256 | continue; |
257 | |
258 | // Note: No fusion-preventing memory dependencies are expected in either |
259 | // region. Such dependencies should be rejected during earlier dependence |
260 | // checks, which guarantee accesses can be re-ordered for vectorization. |
261 | // |
262 | // Move recipes to the successor region. |
263 | for (VPRecipeBase &ToMove : make_early_inc_range(Range: reverse(C&: *Then1))) |
264 | ToMove.moveBefore(BB&: *Then2, I: Then2->getFirstNonPhi()); |
265 | |
266 | auto *Merge1 = cast<VPBasicBlock>(Val: Then1->getSingleSuccessor()); |
267 | auto *Merge2 = cast<VPBasicBlock>(Val: Then2->getSingleSuccessor()); |
268 | |
269 | // Move VPPredInstPHIRecipes from the merge block to the successor region's |
270 | // merge block. Update all users inside the successor region to use the |
271 | // original values. |
272 | for (VPRecipeBase &Phi1ToMove : make_early_inc_range(Range: reverse(C&: *Merge1))) { |
273 | VPValue *PredInst1 = |
274 | cast<VPPredInstPHIRecipe>(Val: &Phi1ToMove)->getOperand(N: 0); |
275 | VPValue *Phi1ToMoveV = Phi1ToMove.getVPSingleValue(); |
276 | Phi1ToMoveV->replaceUsesWithIf(New: PredInst1, ShouldReplace: [Then2](VPUser &U, unsigned) { |
277 | auto *UI = dyn_cast<VPRecipeBase>(Val: &U); |
278 | return UI && UI->getParent() == Then2; |
279 | }); |
280 | |
281 | // Remove phi recipes that are unused after merging the regions. |
282 | if (Phi1ToMove.getVPSingleValue()->getNumUsers() == 0) { |
283 | Phi1ToMove.eraseFromParent(); |
284 | continue; |
285 | } |
286 | Phi1ToMove.moveBefore(BB&: *Merge2, I: Merge2->begin()); |
287 | } |
288 | |
289 | // Finally, remove the first region. |
290 | for (VPBlockBase *Pred : make_early_inc_range(Range&: Region1->getPredecessors())) { |
291 | VPBlockUtils::disconnectBlocks(From: Pred, To: Region1); |
292 | VPBlockUtils::connectBlocks(From: Pred, To: MiddleBasicBlock); |
293 | } |
294 | VPBlockUtils::disconnectBlocks(From: Region1, To: MiddleBasicBlock); |
295 | DeletedRegions.insert(X: Region1); |
296 | } |
297 | |
298 | for (VPRegionBlock *ToDelete : DeletedRegions) |
299 | delete ToDelete; |
300 | return !DeletedRegions.empty(); |
301 | } |
302 | |
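/// Wrap the predicated VPReplicateRecipe \p PredRecipe in a new replicate
/// region: an entry block with a BranchOnMask, an "if" block holding an
/// unpredicated clone of the recipe, and a "continue" block with a
/// VPPredInstPHIRecipe for any users of the original recipe's result.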
303 | static VPRegionBlock *createReplicateRegion(VPReplicateRecipe *PredRecipe, |
304 | VPlan &Plan) { |
305 | Instruction *Instr = PredRecipe->getUnderlyingInstr(); |
306 | // Build the triangular if-then region. |
307 | std::string RegionName = (Twine("pred." ) + Instr->getOpcodeName()).str(); |
308 | assert(Instr->getParent() && "Predicated instruction not in any basic block" ); |
309 | auto *BlockInMask = PredRecipe->getMask(); |
310 | auto *BOMRecipe = new VPBranchOnMaskRecipe(BlockInMask); |
311 | auto *Entry = new VPBasicBlock(Twine(RegionName) + ".entry" , BOMRecipe); |
312 | |
313 | // Replace predicated replicate recipe with a replicate recipe without a |
314 | // mask but in the replicate region. |
315 | auto *RecipeWithoutMask = new VPReplicateRecipe( |
316 | PredRecipe->getUnderlyingInstr(), |
317 | make_range(x: PredRecipe->op_begin(), y: std::prev(x: PredRecipe->op_end())), |
318 | PredRecipe->isUniform()); |
319 | auto *Pred = new VPBasicBlock(Twine(RegionName) + ".if" , RecipeWithoutMask); |
320 | |
321 | VPPredInstPHIRecipe *PHIRecipe = nullptr; |
322 | if (PredRecipe->getNumUsers() != 0) { |
323 | PHIRecipe = new VPPredInstPHIRecipe(RecipeWithoutMask); |
324 | PredRecipe->replaceAllUsesWith(New: PHIRecipe); |
325 | PHIRecipe->setOperand(I: 0, New: RecipeWithoutMask); |
326 | } |
327 | PredRecipe->eraseFromParent(); |
328 | auto *Exiting = new VPBasicBlock(Twine(RegionName) + ".continue" , PHIRecipe); |
329 | VPRegionBlock *Region = new VPRegionBlock(Entry, Exiting, RegionName, true); |
330 | |
331 | // Note: first set Entry as region entry and then connect successors starting |
332 | // from it in order, to propagate the "parent" of each VPBasicBlock. |
333 | VPBlockUtils::insertTwoBlocksAfter(IfTrue: Pred, IfFalse: Exiting, BlockPtr: Entry); |
334 | VPBlockUtils::connectBlocks(From: Pred, To: Exiting); |
335 | |
336 | return Region; |
337 | } |
338 | |
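/// Split blocks at each predicated VPReplicateRecipe and hook in a replicate
/// region that conditionally executes the recipe, connecting the region
/// between the original block and the newly split-off successor.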
339 | static void addReplicateRegions(VPlan &Plan) { |
340 | SmallVector<VPReplicateRecipe *> WorkList; |
341 | for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>( |
342 | Range: vp_depth_first_deep(G: Plan.getEntry()))) { |
343 | for (VPRecipeBase &R : *VPBB) |
344 | if (auto *RepR = dyn_cast<VPReplicateRecipe>(Val: &R)) { |
345 | if (RepR->isPredicated()) |
346 | WorkList.push_back(Elt: RepR); |
347 | } |
348 | } |
349 | |
350 | unsigned BBNum = 0; |
351 | for (VPReplicateRecipe *RepR : WorkList) { |
352 | VPBasicBlock *CurrentBlock = RepR->getParent(); |
353 | VPBasicBlock *SplitBlock = CurrentBlock->splitAt(SplitAt: RepR->getIterator()); |
354 | |
355 | BasicBlock *OrigBB = RepR->getUnderlyingInstr()->getParent(); |
356 | SplitBlock->setName( |
357 | OrigBB->hasName() ? OrigBB->getName() + "." + Twine(BBNum++) : "" ); |
358 | // Record predicated instructions for above packing optimizations. |
359 | VPBlockBase *Region = createReplicateRegion(PredRecipe: RepR, Plan); |
360 | Region->setParent(CurrentBlock->getParent()); |
361 | VPBlockUtils::disconnectBlocks(From: CurrentBlock, To: SplitBlock); |
362 | VPBlockUtils::connectBlocks(From: CurrentBlock, To: Region); |
363 | VPBlockUtils::connectBlocks(From: Region, To: SplitBlock); |
364 | } |
365 | } |
366 | |
367 | /// Remove redundant VPBasicBlocks by merging them into their predecessor if |
368 | /// the predecessor has a single successor. |
369 | static bool mergeBlocksIntoPredecessors(VPlan &Plan) { |
370 | SmallVector<VPBasicBlock *> WorkList; |
371 | for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>( |
372 | Range: vp_depth_first_deep(G: Plan.getEntry()))) { |
373 | // Don't fold the exit block of the Plan into its single predecessor for |
374 | // now. |
375 | // TODO: Remove restriction once more of the skeleton is modeled in VPlan. |
376 | if (VPBB->getNumSuccessors() == 0 && !VPBB->getParent()) |
377 | continue; |
378 | auto *PredVPBB = |
379 | dyn_cast_or_null<VPBasicBlock>(Val: VPBB->getSinglePredecessor()); |
380 | if (!PredVPBB || PredVPBB->getNumSuccessors() != 1) |
381 | continue; |
382 | WorkList.push_back(Elt: VPBB); |
383 | } |
384 | |
385 | for (VPBasicBlock *VPBB : WorkList) { |
386 | VPBasicBlock *PredVPBB = cast<VPBasicBlock>(Val: VPBB->getSinglePredecessor()); |
387 | for (VPRecipeBase &R : make_early_inc_range(Range&: *VPBB)) |
388 | R.moveBefore(BB&: *PredVPBB, I: PredVPBB->end()); |
389 | VPBlockUtils::disconnectBlocks(From: PredVPBB, To: VPBB); |
390 | auto *ParentRegion = cast_or_null<VPRegionBlock>(Val: VPBB->getParent()); |
391 | if (ParentRegion && ParentRegion->getExiting() == VPBB) |
392 | ParentRegion->setExiting(PredVPBB); |
393 | for (auto *Succ : to_vector(Range: VPBB->successors())) { |
394 | VPBlockUtils::disconnectBlocks(From: VPBB, To: Succ); |
395 | VPBlockUtils::connectBlocks(From: PredVPBB, To: Succ); |
396 | } |
397 | delete VPBB; |
398 | } |
399 | return !WorkList.empty(); |
400 | } |
401 | |
402 | void VPlanTransforms::createAndOptimizeReplicateRegions(VPlan &Plan) { |
403 | // Convert masked VPReplicateRecipes to if-then region blocks. |
404 | addReplicateRegions(Plan); |
405 | |
406 | bool ShouldSimplify = true; |
407 | while (ShouldSimplify) { |
408 | ShouldSimplify = sinkScalarOperands(Plan); |
409 | ShouldSimplify |= mergeReplicateRegionsIntoSuccessors(Plan); |
410 | ShouldSimplify |= mergeBlocksIntoPredecessors(Plan); |
411 | } |
412 | } |
413 | |
414 | /// Remove redundant casts of inductions. |
415 | /// |
416 | /// Such redundant casts are casts of induction variables that can be ignored, |
417 | /// because we already proved that the casted phi is equal to the uncasted phi |
418 | /// in the vectorized loop. There is no need to vectorize the cast - the same |
419 | /// value can be used for both the phi and casts in the vector loop. |
420 | static void removeRedundantInductionCasts(VPlan &Plan) { |
421 | for (auto &Phi : Plan.getVectorLoopRegion()->getEntryBasicBlock()->phis()) { |
422 | auto *IV = dyn_cast<VPWidenIntOrFpInductionRecipe>(Val: &Phi); |
423 | if (!IV || IV->getTruncInst()) |
424 | continue; |
425 | |
426 | // A sequence of IR Casts has potentially been recorded for IV, which |
427 | // *must be bypassed* when the IV is vectorized, because the vectorized IV |
428 | // will produce the desired casted value. This sequence forms a def-use |
429 | // chain and is provided in reverse order, ending with the cast that uses |
430 | // the IV phi. Search for the recipe of the last cast in the chain and |
431 | // replace it with the original IV. Note that only the final cast is |
432 | // expected to have users outside the cast-chain and the dead casts left |
433 | // over will be cleaned up later. |
434 | auto &Casts = IV->getInductionDescriptor().getCastInsts(); |
435 | VPValue *FindMyCast = IV; |
436 | for (Instruction *IRCast : reverse(C: Casts)) { |
437 | VPSingleDefRecipe *FoundUserCast = nullptr; |
438 | for (auto *U : FindMyCast->users()) { |
439 | auto *UserCast = dyn_cast<VPSingleDefRecipe>(Val: U); |
440 | if (UserCast && UserCast->getUnderlyingValue() == IRCast) { |
441 | FoundUserCast = UserCast; |
442 | break; |
443 | } |
444 | } |
445 | FindMyCast = FoundUserCast; |
446 | } |
447 | FindMyCast->replaceAllUsesWith(New: IV); |
448 | } |
449 | } |
450 | |
451 | /// Try to replace VPWidenCanonicalIVRecipes with a widened canonical IV |
452 | /// recipe, if it exists. |
453 | static void removeRedundantCanonicalIVs(VPlan &Plan) { |
454 | VPCanonicalIVPHIRecipe *CanonicalIV = Plan.getCanonicalIV(); |
455 | VPWidenCanonicalIVRecipe *WidenNewIV = nullptr; |
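  // Find a VPWidenCanonicalIVRecipe among the canonical IV's users, if any.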
456 | for (VPUser *U : CanonicalIV->users()) { |
457 | WidenNewIV = dyn_cast<VPWidenCanonicalIVRecipe>(Val: U); |
458 | if (WidenNewIV) |
459 | break; |
460 | } |
461 | |
462 | if (!WidenNewIV) |
463 | return; |
464 | |
  VPBasicBlock *HeaderVPBB = Plan.getVectorLoopRegion()->getEntryBasicBlock();
466 | for (VPRecipeBase &Phi : HeaderVPBB->phis()) { |
467 | auto *WidenOriginalIV = dyn_cast<VPWidenIntOrFpInductionRecipe>(Val: &Phi); |
468 | |
469 | if (!WidenOriginalIV || !WidenOriginalIV->isCanonical()) |
470 | continue; |
471 | |
472 | // Replace WidenNewIV with WidenOriginalIV if WidenOriginalIV provides |
473 | // everything WidenNewIV's users need. That is, WidenOriginalIV will |
474 | // generate a vector phi or all users of WidenNewIV demand the first lane |
475 | // only. |
476 | if (any_of(Range: WidenOriginalIV->users(), |
477 | P: [WidenOriginalIV](VPUser *U) { |
478 | return !U->usesScalars(Op: WidenOriginalIV); |
479 | }) || |
480 | vputils::onlyFirstLaneUsed(Def: WidenNewIV)) { |
481 | WidenNewIV->replaceAllUsesWith(New: WidenOriginalIV); |
482 | WidenNewIV->eraseFromParent(); |
483 | return; |
484 | } |
485 | } |
486 | } |
487 | |
488 | /// Returns true if \p R is dead and can be removed. |
489 | static bool isDeadRecipe(VPRecipeBase &R) { |
490 | using namespace llvm::PatternMatch; |
491 | // Do remove conditional assume instructions as their conditions may be |
492 | // flattened. |
493 | auto *RepR = dyn_cast<VPReplicateRecipe>(Val: &R); |
494 | bool IsConditionalAssume = |
495 | RepR && RepR->isPredicated() && |
496 | match(V: RepR->getUnderlyingInstr(), P: m_Intrinsic<Intrinsic::assume>()); |
497 | if (IsConditionalAssume) |
498 | return true; |
499 | |
500 | if (R.mayHaveSideEffects()) |
501 | return false; |
502 | |
503 | // Recipe is dead if no user keeps the recipe alive. |
504 | return all_of(Range: R.definedValues(), |
505 | P: [](VPValue *V) { return V->getNumUsers() == 0; }); |
506 | } |
507 | |
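/// Erase dead recipes from \p Plan. Blocks are visited in reverse post-order
/// and recipes within a block in reverse, so chains of dead recipes are
/// removed in a single sweep.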
508 | static void removeDeadRecipes(VPlan &Plan) { |
509 | ReversePostOrderTraversal<VPBlockDeepTraversalWrapper<VPBlockBase *>> RPOT( |
510 | Plan.getEntry()); |
511 | |
512 | for (VPBasicBlock *VPBB : reverse(C: VPBlockUtils::blocksOnly<VPBasicBlock>(Range: RPOT))) { |
513 | // The recipes in the block are processed in reverse order, to catch chains |
514 | // of dead recipes. |
515 | for (VPRecipeBase &R : make_early_inc_range(Range: reverse(C&: *VPBB))) { |
516 | if (isDeadRecipe(R)) |
517 | R.eraseFromParent(); |
518 | } |
519 | } |
520 | } |
521 | |
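/// Create recipes computing the scalar steps of an induction with start value
/// \p StartV and step \p Step, inserted at \p IP in the header block. A
/// VPDerivedIVRecipe is introduced when the induction does not match the
/// canonical IV, and truncates are added as needed for \p TruncI and for a
/// step type wider than the result type. Returns the resulting
/// VPScalarIVStepsRecipe.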
522 | static VPScalarIVStepsRecipe * |
523 | createScalarIVSteps(VPlan &Plan, InductionDescriptor::InductionKind Kind, |
524 | Instruction::BinaryOps InductionOpcode, |
525 | FPMathOperator *FPBinOp, ScalarEvolution &SE, |
526 | Instruction *TruncI, VPValue *StartV, VPValue *Step, |
527 | VPBasicBlock::iterator IP) { |
  VPBasicBlock *HeaderVPBB = Plan.getVectorLoopRegion()->getEntryBasicBlock();
529 | VPCanonicalIVPHIRecipe *CanonicalIV = Plan.getCanonicalIV(); |
530 | VPSingleDefRecipe *BaseIV = CanonicalIV; |
531 | if (!CanonicalIV->isCanonical(Kind, Start: StartV, Step)) { |
532 | BaseIV = new VPDerivedIVRecipe(Kind, FPBinOp, StartV, CanonicalIV, Step); |
533 | HeaderVPBB->insert(Recipe: BaseIV, InsertPt: IP); |
534 | } |
535 | |
536 | // Truncate base induction if needed. |
537 | VPTypeAnalysis TypeInfo(Plan.getCanonicalIV()->getScalarType(), |
538 | SE.getContext()); |
539 | Type *ResultTy = TypeInfo.inferScalarType(V: BaseIV); |
540 | if (TruncI) { |
541 | Type *TruncTy = TruncI->getType(); |
542 | assert(ResultTy->getScalarSizeInBits() > TruncTy->getScalarSizeInBits() && |
543 | "Not truncating." ); |
544 | assert(ResultTy->isIntegerTy() && "Truncation requires an integer type" ); |
545 | BaseIV = new VPScalarCastRecipe(Instruction::Trunc, BaseIV, TruncTy); |
546 | HeaderVPBB->insert(Recipe: BaseIV, InsertPt: IP); |
547 | ResultTy = TruncTy; |
548 | } |
549 | |
550 | // Truncate step if needed. |
551 | Type *StepTy = TypeInfo.inferScalarType(V: Step); |
552 | if (ResultTy != StepTy) { |
553 | assert(StepTy->getScalarSizeInBits() > ResultTy->getScalarSizeInBits() && |
554 | "Not truncating." ); |
555 | assert(StepTy->isIntegerTy() && "Truncation requires an integer type" ); |
556 | Step = new VPScalarCastRecipe(Instruction::Trunc, Step, ResultTy); |
    auto *VecPreheader =
        cast<VPBasicBlock>(HeaderVPBB->getSingleHierarchicalPredecessor());
559 | VecPreheader->appendRecipe(Recipe: Step->getDefiningRecipe()); |
560 | } |
561 | |
562 | VPScalarIVStepsRecipe *Steps = new VPScalarIVStepsRecipe( |
563 | BaseIV, Step, InductionOpcode, |
564 | FPBinOp ? FPBinOp->getFastMathFlags() : FastMathFlags()); |
565 | HeaderVPBB->insert(Recipe: Steps, InsertPt: IP); |
566 | return Steps; |
567 | } |
568 | |
569 | /// Legalize VPWidenPointerInductionRecipe, by replacing it with a PtrAdd |
570 | /// (IndStart, ScalarIVSteps (0, Step)) if only its scalar values are used, as |
571 | /// VPWidenPointerInductionRecipe will generate vectors only. If some users |
/// require vectors while others require scalars, the scalar uses need to extract
573 | /// the scalars from the generated vectors (Note that this is different to how |
574 | /// int/fp inductions are handled). Also optimize VPWidenIntOrFpInductionRecipe, |
575 | /// if any of its users needs scalar values, by providing them scalar steps |
576 | /// built on the canonical scalar IV and update the original IV's users. This is |
/// an optional optimization to reduce the need for vector extracts.
578 | static void legalizeAndOptimizeInductions(VPlan &Plan, ScalarEvolution &SE) { |
579 | SmallVector<VPRecipeBase *> ToRemove; |
  VPBasicBlock *HeaderVPBB = Plan.getVectorLoopRegion()->getEntryBasicBlock();
581 | bool HasOnlyVectorVFs = !Plan.hasVF(VF: ElementCount::getFixed(MinVal: 1)); |
582 | VPBasicBlock::iterator InsertPt = HeaderVPBB->getFirstNonPhi(); |
583 | for (VPRecipeBase &Phi : HeaderVPBB->phis()) { |
584 | // Replace wide pointer inductions which have only their scalars used by |
585 | // PtrAdd(IndStart, ScalarIVSteps (0, Step)). |
586 | if (auto *PtrIV = dyn_cast<VPWidenPointerInductionRecipe>(Val: &Phi)) { |
587 | if (!PtrIV->onlyScalarsGenerated(IsScalable: Plan.hasScalableVF())) |
588 | continue; |
589 | |
590 | const InductionDescriptor &ID = PtrIV->getInductionDescriptor(); |
591 | VPValue *StartV = |
592 | Plan.getOrAddLiveIn(V: ConstantInt::get(Ty: ID.getStep()->getType(), V: 0)); |
593 | VPValue *StepV = PtrIV->getOperand(N: 1); |
594 | VPScalarIVStepsRecipe *Steps = createScalarIVSteps( |
595 | Plan, Kind: InductionDescriptor::IK_IntInduction, InductionOpcode: Instruction::Add, FPBinOp: nullptr, |
596 | SE, TruncI: nullptr, StartV, Step: StepV, IP: InsertPt); |
597 | |
598 | auto *Recipe = new VPInstruction(VPInstruction::PtrAdd, |
599 | {PtrIV->getStartValue(), Steps}, |
600 | PtrIV->getDebugLoc(), "next.gep" ); |
601 | |
602 | Recipe->insertAfter(InsertPos: Steps); |
603 | PtrIV->replaceAllUsesWith(New: Recipe); |
604 | continue; |
605 | } |
606 | |
607 | // Replace widened induction with scalar steps for users that only use |
608 | // scalars. |
609 | auto *WideIV = dyn_cast<VPWidenIntOrFpInductionRecipe>(Val: &Phi); |
610 | if (!WideIV) |
611 | continue; |
612 | if (HasOnlyVectorVFs && none_of(Range: WideIV->users(), P: [WideIV](VPUser *U) { |
613 | return U->usesScalars(Op: WideIV); |
614 | })) |
615 | continue; |
616 | |
617 | const InductionDescriptor &ID = WideIV->getInductionDescriptor(); |
618 | VPScalarIVStepsRecipe *Steps = createScalarIVSteps( |
619 | Plan, Kind: ID.getKind(), InductionOpcode: ID.getInductionOpcode(), |
620 | FPBinOp: dyn_cast_or_null<FPMathOperator>(Val: ID.getInductionBinOp()), SE, |
621 | TruncI: WideIV->getTruncInst(), StartV: WideIV->getStartValue(), Step: WideIV->getStepValue(), |
622 | IP: InsertPt); |
623 | |
624 | // Update scalar users of IV to use Step instead. |
625 | if (!HasOnlyVectorVFs) |
626 | WideIV->replaceAllUsesWith(New: Steps); |
627 | else |
628 | WideIV->replaceUsesWithIf(New: Steps, ShouldReplace: [WideIV](VPUser &U, unsigned) { |
629 | return U.usesScalars(Op: WideIV); |
630 | }); |
631 | } |
632 | } |
633 | |
/// Remove redundant ExpandSCEVRecipes in \p Plan's entry block by replacing
635 | /// them with already existing recipes expanding the same SCEV expression. |
636 | static void removeRedundantExpandSCEVRecipes(VPlan &Plan) { |
637 | DenseMap<const SCEV *, VPValue *> SCEV2VPV; |
638 | |
639 | for (VPRecipeBase &R : |
640 | make_early_inc_range(Range&: *Plan.getEntry()->getEntryBasicBlock())) { |
641 | auto *ExpR = dyn_cast<VPExpandSCEVRecipe>(Val: &R); |
642 | if (!ExpR) |
643 | continue; |
644 | |
645 | auto I = SCEV2VPV.insert(KV: {ExpR->getSCEV(), ExpR}); |
646 | if (I.second) |
647 | continue; |
648 | ExpR->replaceAllUsesWith(New: I.first->second); |
649 | ExpR->eraseFromParent(); |
650 | } |
651 | } |
652 | |
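/// Erase the defining recipe of \p V if it is dead, then transitively do the
/// same for the defining recipes of its operands.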
653 | static void recursivelyDeleteDeadRecipes(VPValue *V) { |
654 | SmallVector<VPValue *> WorkList; |
655 | SmallPtrSet<VPValue *, 8> Seen; |
656 | WorkList.push_back(Elt: V); |
657 | |
658 | while (!WorkList.empty()) { |
659 | VPValue *Cur = WorkList.pop_back_val(); |
660 | if (!Seen.insert(Ptr: Cur).second) |
661 | continue; |
662 | VPRecipeBase *R = Cur->getDefiningRecipe(); |
663 | if (!R) |
664 | continue; |
665 | if (!isDeadRecipe(R&: *R)) |
666 | continue; |
667 | WorkList.append(in_start: R->op_begin(), in_end: R->op_end()); |
668 | R->eraseFromParent(); |
669 | } |
670 | } |
671 | |
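/// When the trip count is known to be no larger than BestVF * BestUF, the
/// vector loop body executes exactly once, so the exiting branch-on-count (or
/// branch-on-cond of a negated active-lane-mask) can be replaced by an
/// unconditional branch-on-true; operands of the old terminator that become
/// dead are removed.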
672 | void VPlanTransforms::optimizeForVFAndUF(VPlan &Plan, ElementCount BestVF, |
673 | unsigned BestUF, |
674 | PredicatedScalarEvolution &PSE) { |
675 | assert(Plan.hasVF(BestVF) && "BestVF is not available in Plan" ); |
676 | assert(Plan.hasUF(BestUF) && "BestUF is not available in Plan" ); |
677 | VPBasicBlock *ExitingVPBB = |
678 | Plan.getVectorLoopRegion()->getExitingBasicBlock(); |
679 | auto *Term = &ExitingVPBB->back(); |
680 | // Try to simplify the branch condition if TC <= VF * UF when preparing to |
681 | // execute the plan for the main vector loop. We only do this if the |
682 | // terminator is: |
683 | // 1. BranchOnCount, or |
684 | // 2. BranchOnCond where the input is Not(ActiveLaneMask). |
685 | using namespace llvm::VPlanPatternMatch; |
686 | if (!match(V: Term, P: m_BranchOnCount(Op0: m_VPValue(), Op1: m_VPValue())) && |
687 | !match(V: Term, |
688 | P: m_BranchOnCond(Op0: m_Not(Op0: m_ActiveLaneMask(Op0: m_VPValue(), Op1: m_VPValue()))))) |
689 | return; |
690 | |
691 | Type *IdxTy = |
692 | Plan.getCanonicalIV()->getStartValue()->getLiveInIRValue()->getType(); |
693 | const SCEV *TripCount = createTripCountSCEV(IdxTy, PSE); |
694 | ScalarEvolution &SE = *PSE.getSE(); |
695 | ElementCount NumElements = BestVF.multiplyCoefficientBy(RHS: BestUF); |
696 | const SCEV *C = SE.getElementCount(Ty: TripCount->getType(), EC: NumElements); |
697 | if (TripCount->isZero() || |
698 | !SE.isKnownPredicate(Pred: CmpInst::ICMP_ULE, LHS: TripCount, RHS: C)) |
699 | return; |
700 | |
701 | LLVMContext &Ctx = SE.getContext(); |
702 | auto *BOC = |
703 | new VPInstruction(VPInstruction::BranchOnCond, |
704 | {Plan.getOrAddLiveIn(V: ConstantInt::getTrue(Context&: Ctx))}); |
705 | |
706 | SmallVector<VPValue *> PossiblyDead(Term->operands()); |
707 | Term->eraseFromParent(); |
708 | for (VPValue *Op : PossiblyDead) |
709 | recursivelyDeleteDeadRecipes(V: Op); |
710 | ExitingVPBB->appendRecipe(Recipe: BOC); |
711 | Plan.setVF(BestVF); |
712 | Plan.setUF(BestUF); |
713 | // TODO: Further simplifications are possible |
714 | // 1. Replace inductions with constants. |
715 | // 2. Replace vector loop region with VPBasicBlock. |
716 | } |
717 | |
718 | #ifndef NDEBUG |
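// Return the replicate region that recipe R is contained in, if any. Only
// used in assertions below.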
719 | static VPRegionBlock *GetReplicateRegion(VPRecipeBase *R) { |
720 | auto *Region = dyn_cast_or_null<VPRegionBlock>(R->getParent()->getParent()); |
721 | if (Region && Region->isReplicator()) { |
722 | assert(Region->getNumSuccessors() == 1 && |
723 | Region->getNumPredecessors() == 1 && "Expected SESE region!" ); |
724 | assert(R->getParent()->size() == 1 && |
725 | "A recipe in an original replicator region must be the only " |
726 | "recipe in its block" ); |
727 | return Region; |
728 | } |
729 | return nullptr; |
730 | } |
731 | #endif |
732 | |
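/// Returns true if \p A properly dominates \p B: recipes in the same block
/// are ordered by their position in the block, recipes in different blocks by
/// the VPlan dominator tree of their parent blocks.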
733 | static bool properlyDominates(const VPRecipeBase *A, const VPRecipeBase *B, |
734 | VPDominatorTree &VPDT) { |
735 | if (A == B) |
736 | return false; |
737 | |
738 | auto LocalComesBefore = [](const VPRecipeBase *A, const VPRecipeBase *B) { |
739 | for (auto &R : *A->getParent()) { |
740 | if (&R == A) |
741 | return true; |
742 | if (&R == B) |
743 | return false; |
744 | } |
745 | llvm_unreachable("recipe not found" ); |
746 | }; |
747 | const VPBlockBase *ParentA = A->getParent(); |
748 | const VPBlockBase *ParentB = B->getParent(); |
749 | if (ParentA == ParentB) |
750 | return LocalComesBefore(A, B); |
751 | |
752 | assert(!GetReplicateRegion(const_cast<VPRecipeBase *>(A)) && |
753 | "No replicate regions expected at this point" ); |
754 | assert(!GetReplicateRegion(const_cast<VPRecipeBase *>(B)) && |
755 | "No replicate regions expected at this point" ); |
756 | return VPDT.properlyDominates(A: ParentA, B: ParentB); |
757 | } |
758 | |
759 | /// Sink users of \p FOR after the recipe defining the previous value \p |
760 | /// Previous of the recurrence. \returns true if all users of \p FOR could be |
761 | /// re-arranged as needed or false if it is not possible. |
762 | static bool |
763 | sinkRecurrenceUsersAfterPrevious(VPFirstOrderRecurrencePHIRecipe *FOR, |
764 | VPRecipeBase *Previous, |
765 | VPDominatorTree &VPDT) { |
766 | // Collect recipes that need sinking. |
767 | SmallVector<VPRecipeBase *> WorkList; |
768 | SmallPtrSet<VPRecipeBase *, 8> Seen; |
769 | Seen.insert(Ptr: Previous); |
770 | auto TryToPushSinkCandidate = [&](VPRecipeBase *SinkCandidate) { |
771 | // The previous value must not depend on the users of the recurrence phi. In |
772 | // that case, FOR is not a fixed order recurrence. |
773 | if (SinkCandidate == Previous) |
774 | return false; |
775 | |
776 | if (isa<VPHeaderPHIRecipe>(Val: SinkCandidate) || |
777 | !Seen.insert(Ptr: SinkCandidate).second || |
778 | properlyDominates(A: Previous, B: SinkCandidate, VPDT)) |
779 | return true; |
780 | |
781 | if (SinkCandidate->mayHaveSideEffects()) |
782 | return false; |
783 | |
784 | WorkList.push_back(Elt: SinkCandidate); |
785 | return true; |
786 | }; |
787 | |
788 | // Recursively sink users of FOR after Previous. |
789 | WorkList.push_back(Elt: FOR); |
790 | for (unsigned I = 0; I != WorkList.size(); ++I) { |
791 | VPRecipeBase *Current = WorkList[I]; |
792 | assert(Current->getNumDefinedValues() == 1 && |
793 | "only recipes with a single defined value expected" ); |
794 | |
795 | for (VPUser *User : Current->getVPSingleValue()->users()) { |
796 | if (auto *R = dyn_cast<VPRecipeBase>(Val: User)) |
797 | if (!TryToPushSinkCandidate(R)) |
798 | return false; |
799 | } |
800 | } |
801 | |
802 | // Keep recipes to sink ordered by dominance so earlier instructions are |
803 | // processed first. |
804 | sort(C&: WorkList, Comp: [&VPDT](const VPRecipeBase *A, const VPRecipeBase *B) { |
805 | return properlyDominates(A, B, VPDT); |
806 | }); |
807 | |
808 | for (VPRecipeBase *SinkCandidate : WorkList) { |
809 | if (SinkCandidate == FOR) |
810 | continue; |
811 | |
812 | SinkCandidate->moveAfter(MovePos: Previous); |
813 | Previous = SinkCandidate; |
814 | } |
815 | return true; |
816 | } |
817 | |
818 | bool VPlanTransforms::adjustFixedOrderRecurrences(VPlan &Plan, |
819 | VPBuilder &LoopBuilder) { |
820 | VPDominatorTree VPDT; |
821 | VPDT.recalculate(Func&: Plan); |
822 | |
823 | SmallVector<VPFirstOrderRecurrencePHIRecipe *> RecurrencePhis; |
824 | for (VPRecipeBase &R : |
825 | Plan.getVectorLoopRegion()->getEntry()->getEntryBasicBlock()->phis()) |
826 | if (auto *FOR = dyn_cast<VPFirstOrderRecurrencePHIRecipe>(Val: &R)) |
827 | RecurrencePhis.push_back(Elt: FOR); |
828 | |
829 | VPBasicBlock *MiddleVPBB = |
830 | cast<VPBasicBlock>(Val: Plan.getVectorLoopRegion()->getSingleSuccessor()); |
831 | VPBuilder MiddleBuilder; |
  // Set the insert point so new recipes are inserted before the terminator
  // and, if present, before the compare feeding the terminator.
834 | if (auto *Term = |
835 | dyn_cast_or_null<VPInstruction>(Val: MiddleVPBB->getTerminator())) { |
836 | if (auto *Cmp = dyn_cast<VPInstruction>(Val: Term->getOperand(N: 0))) |
837 | MiddleBuilder.setInsertPoint(Cmp); |
838 | else |
839 | MiddleBuilder.setInsertPoint(Term); |
840 | } else |
841 | MiddleBuilder.setInsertPoint(MiddleVPBB); |
842 | |
843 | for (VPFirstOrderRecurrencePHIRecipe *FOR : RecurrencePhis) { |
844 | SmallPtrSet<VPFirstOrderRecurrencePHIRecipe *, 4> SeenPhis; |
845 | VPRecipeBase *Previous = FOR->getBackedgeValue()->getDefiningRecipe(); |
846 | // Fixed-order recurrences do not contain cycles, so this loop is guaranteed |
847 | // to terminate. |
848 | while (auto *PrevPhi = |
849 | dyn_cast_or_null<VPFirstOrderRecurrencePHIRecipe>(Val: Previous)) { |
850 | assert(PrevPhi->getParent() == FOR->getParent()); |
851 | assert(SeenPhis.insert(PrevPhi).second); |
852 | Previous = PrevPhi->getBackedgeValue()->getDefiningRecipe(); |
853 | } |
854 | |
855 | if (!sinkRecurrenceUsersAfterPrevious(FOR, Previous, VPDT)) |
856 | return false; |
857 | |
858 | // Introduce a recipe to combine the incoming and previous values of a |
859 | // fixed-order recurrence. |
860 | VPBasicBlock *InsertBlock = Previous->getParent(); |
861 | if (isa<VPHeaderPHIRecipe>(Val: Previous)) |
862 | LoopBuilder.setInsertPoint(TheBB: InsertBlock, IP: InsertBlock->getFirstNonPhi()); |
863 | else |
864 | LoopBuilder.setInsertPoint(TheBB: InsertBlock, |
865 | IP: std::next(x: Previous->getIterator())); |
866 | |
867 | auto *RecurSplice = cast<VPInstruction>( |
868 | Val: LoopBuilder.createNaryOp(Opcode: VPInstruction::FirstOrderRecurrenceSplice, |
869 | Operands: {FOR, FOR->getBackedgeValue()})); |
870 | |
871 | FOR->replaceAllUsesWith(New: RecurSplice); |
872 | // Set the first operand of RecurSplice to FOR again, after replacing |
873 | // all users. |
874 | RecurSplice->setOperand(I: 0, New: FOR); |
875 | |
876 | // This is the second phase of vectorizing first-order recurrences. An |
877 | // overview of the transformation is described below. Suppose we have the |
878 | // following loop with some use after the loop of the last a[i-1], |
879 | // |
880 | // for (int i = 0; i < n; ++i) { |
881 | // t = a[i - 1]; |
882 | // b[i] = a[i] - t; |
883 | // } |
884 | // use t; |
885 | // |
886 | // There is a first-order recurrence on "a". For this loop, the shorthand |
887 | // scalar IR looks like: |
888 | // |
889 | // scalar.ph: |
890 | // s_init = a[-1] |
891 | // br scalar.body |
892 | // |
893 | // scalar.body: |
894 | // i = phi [0, scalar.ph], [i+1, scalar.body] |
895 | // s1 = phi [s_init, scalar.ph], [s2, scalar.body] |
896 | // s2 = a[i] |
897 | // b[i] = s2 - s1 |
898 | // br cond, scalar.body, exit.block |
899 | // |
900 | // exit.block: |
901 | // use = lcssa.phi [s1, scalar.body] |
902 | // |
    // In this example, s1 is a recurrence because its value depends on the
904 | // previous iteration. In the first phase of vectorization, we created a |
905 | // vector phi v1 for s1. We now complete the vectorization and produce the |
906 | // shorthand vector IR shown below (for VF = 4, UF = 1). |
907 | // |
908 | // vector.ph: |
909 | // v_init = vector(..., ..., ..., a[-1]) |
910 | // br vector.body |
911 | // |
912 | // vector.body |
913 | // i = phi [0, vector.ph], [i+4, vector.body] |
914 | // v1 = phi [v_init, vector.ph], [v2, vector.body] |
915 | // v2 = a[i, i+1, i+2, i+3]; |
916 | // v3 = vector(v1(3), v2(0, 1, 2)) |
917 | // b[i, i+1, i+2, i+3] = v2 - v3 |
918 | // br cond, vector.body, middle.block |
919 | // |
920 | // middle.block: |
921 | // s_penultimate = v2(2) = v3(3) |
922 | // s_resume = v2(3) |
923 | // br cond, scalar.ph, exit.block |
924 | // |
925 | // scalar.ph: |
926 | // s_init' = phi [s_resume, middle.block], [s_init, otherwise] |
927 | // br scalar.body |
928 | // |
929 | // scalar.body: |
930 | // i = phi [0, scalar.ph], [i+1, scalar.body] |
931 | // s1 = phi [s_init', scalar.ph], [s2, scalar.body] |
932 | // s2 = a[i] |
933 | // b[i] = s2 - s1 |
934 | // br cond, scalar.body, exit.block |
935 | // |
936 | // exit.block: |
937 | // lo = lcssa.phi [s1, scalar.body], [s.penultimate, middle.block] |
938 | // |
939 | // After execution completes the vector loop, we extract the next value of |
940 | // the recurrence (x) to use as the initial value in the scalar loop. This |
941 | // is modeled by ExtractFromEnd. |
942 | Type *IntTy = Plan.getCanonicalIV()->getScalarType(); |
943 | |
944 | // Extract the penultimate value of the recurrence and update VPLiveOut |
945 | // users of the recurrence splice. Note that the extract of the final value |
946 | // used to resume in the scalar loop is created earlier during VPlan |
947 | // construction. |
948 | auto *Penultimate = cast<VPInstruction>(Val: MiddleBuilder.createNaryOp( |
949 | Opcode: VPInstruction::ExtractFromEnd, |
950 | Operands: {FOR->getBackedgeValue(), |
951 | Plan.getOrAddLiveIn(V: ConstantInt::get(Ty: IntTy, V: 2))}, |
952 | Inst: {}, Name: "vector.recur.extract.for.phi" )); |
953 | RecurSplice->replaceUsesWithIf( |
954 | New: Penultimate, ShouldReplace: [](VPUser &U, unsigned) { return isa<VPLiveOut>(Val: &U); }); |
955 | } |
956 | return true; |
957 | } |
958 | |
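/// Collect the users of \p V and, transitively, the users of values defined
/// by those users, stopping at header phi recipes to avoid following
/// backedge cycles.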
959 | static SmallVector<VPUser *> collectUsersRecursively(VPValue *V) { |
960 | SetVector<VPUser *> Users(V->user_begin(), V->user_end()); |
961 | for (unsigned I = 0; I != Users.size(); ++I) { |
962 | VPRecipeBase *Cur = dyn_cast<VPRecipeBase>(Val: Users[I]); |
963 | if (!Cur || isa<VPHeaderPHIRecipe>(Val: Cur)) |
964 | continue; |
965 | for (VPValue *V : Cur->definedValues()) |
966 | Users.insert(Start: V->user_begin(), End: V->user_end()); |
967 | } |
968 | return Users.takeVector(); |
969 | } |
970 | |
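/// Drop poison-generating (wrap) flags from recipes feeding integer Add/Mul
/// reduction phis, since the vectorized reduction may otherwise introduce
/// poison in intermediate values.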
971 | void VPlanTransforms::clearReductionWrapFlags(VPlan &Plan) { |
972 | for (VPRecipeBase &R : |
973 | Plan.getVectorLoopRegion()->getEntryBasicBlock()->phis()) { |
974 | auto *PhiR = dyn_cast<VPReductionPHIRecipe>(Val: &R); |
975 | if (!PhiR) |
976 | continue; |
977 | const RecurrenceDescriptor &RdxDesc = PhiR->getRecurrenceDescriptor(); |
978 | RecurKind RK = RdxDesc.getRecurrenceKind(); |
979 | if (RK != RecurKind::Add && RK != RecurKind::Mul) |
980 | continue; |
981 | |
982 | for (VPUser *U : collectUsersRecursively(V: PhiR)) |
983 | if (auto *RecWithFlags = dyn_cast<VPRecipeWithIRFlags>(Val: U)) { |
984 | RecWithFlags->dropPoisonGeneratingFlags(); |
985 | } |
986 | } |
987 | } |
988 | |
989 | /// Try to simplify recipe \p R. |
990 | static void simplifyRecipe(VPRecipeBase &R, VPTypeAnalysis &TypeInfo) { |
991 | using namespace llvm::VPlanPatternMatch; |
992 | // Try to remove redundant blend recipes. |
993 | if (auto *Blend = dyn_cast<VPBlendRecipe>(Val: &R)) { |
994 | VPValue *Inc0 = Blend->getIncomingValue(Idx: 0); |
995 | for (unsigned I = 1; I != Blend->getNumIncomingValues(); ++I) |
996 | if (Inc0 != Blend->getIncomingValue(Idx: I) && |
997 | !match(V: Blend->getMask(Idx: I), P: m_False())) |
998 | return; |
999 | Blend->replaceAllUsesWith(New: Inc0); |
1000 | Blend->eraseFromParent(); |
1001 | return; |
1002 | } |
1003 | |
1004 | VPValue *A; |
1005 | if (match(V: &R, P: m_Trunc(Op0: m_ZExtOrSExt(Op0: m_VPValue(V&: A))))) { |
1006 | VPValue *Trunc = R.getVPSingleValue(); |
1007 | Type *TruncTy = TypeInfo.inferScalarType(V: Trunc); |
1008 | Type *ATy = TypeInfo.inferScalarType(V: A); |
1009 | if (TruncTy == ATy) { |
1010 | Trunc->replaceAllUsesWith(New: A); |
1011 | } else { |
1012 | // Don't replace a scalarizing recipe with a widened cast. |
1013 | if (isa<VPReplicateRecipe>(Val: &R)) |
1014 | return; |
1015 | if (ATy->getScalarSizeInBits() < TruncTy->getScalarSizeInBits()) { |
1016 | |
1017 | unsigned ExtOpcode = match(V: R.getOperand(N: 0), P: m_SExt(Op0: m_VPValue())) |
1018 | ? Instruction::SExt |
1019 | : Instruction::ZExt; |
1020 | auto *VPC = |
1021 | new VPWidenCastRecipe(Instruction::CastOps(ExtOpcode), A, TruncTy); |
1022 | if (auto *UnderlyingExt = R.getOperand(N: 0)->getUnderlyingValue()) { |
1023 | // UnderlyingExt has distinct return type, used to retain legacy cost. |
1024 | VPC->setUnderlyingValue(UnderlyingExt); |
1025 | } |
1026 | VPC->insertBefore(InsertPos: &R); |
1027 | Trunc->replaceAllUsesWith(New: VPC); |
1028 | } else if (ATy->getScalarSizeInBits() > TruncTy->getScalarSizeInBits()) { |
1029 | auto *VPC = new VPWidenCastRecipe(Instruction::Trunc, A, TruncTy); |
1030 | VPC->insertBefore(InsertPos: &R); |
1031 | Trunc->replaceAllUsesWith(New: VPC); |
1032 | } |
1033 | } |
1034 | #ifndef NDEBUG |
    // Verify that the cached type info for both A and its users is still
    // accurate by comparing it to freshly computed types.
1037 | VPTypeAnalysis TypeInfo2( |
1038 | R.getParent()->getPlan()->getCanonicalIV()->getScalarType(), |
1039 | TypeInfo.getContext()); |
1040 | assert(TypeInfo.inferScalarType(A) == TypeInfo2.inferScalarType(A)); |
1041 | for (VPUser *U : A->users()) { |
1042 | auto *R = dyn_cast<VPRecipeBase>(U); |
1043 | if (!R) |
1044 | continue; |
1045 | for (VPValue *VPV : R->definedValues()) |
1046 | assert(TypeInfo.inferScalarType(VPV) == TypeInfo2.inferScalarType(VPV)); |
1047 | } |
1048 | #endif |
1049 | } |
1050 | |
1051 | // Simplify (X && Y) || (X && !Y) -> X. |
1052 | // TODO: Split up into simpler, modular combines: (X && Y) || (X && Z) into X |
1053 | // && (Y || Z) and (X || !X) into true. This requires queuing newly created |
1054 | // recipes to be visited during simplification. |
1055 | VPValue *X, *Y, *X1, *Y1; |
1056 | if (match(V: &R, |
1057 | P: m_c_BinaryOr(Op0: m_LogicalAnd(Op0: m_VPValue(V&: X), Op1: m_VPValue(V&: Y)), |
1058 | Op1: m_LogicalAnd(Op0: m_VPValue(V&: X1), Op1: m_Not(Op0: m_VPValue(V&: Y1))))) && |
1059 | X == X1 && Y == Y1) { |
1060 | R.getVPSingleValue()->replaceAllUsesWith(New: X); |
1061 | return; |
1062 | } |
1063 | |
1064 | if (match(V: &R, P: m_c_Mul(Op0: m_VPValue(V&: A), Op1: m_SpecificInt(V: 1)))) |
1065 | return R.getVPSingleValue()->replaceAllUsesWith(New: A); |
1066 | } |
1067 | |
1068 | /// Try to simplify the recipes in \p Plan. |
1069 | static void simplifyRecipes(VPlan &Plan, LLVMContext &Ctx) { |
1070 | ReversePostOrderTraversal<VPBlockDeepTraversalWrapper<VPBlockBase *>> RPOT( |
1071 | Plan.getEntry()); |
1072 | VPTypeAnalysis TypeInfo(Plan.getCanonicalIV()->getScalarType(), Ctx); |
1073 | for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(Range: RPOT)) { |
1074 | for (VPRecipeBase &R : make_early_inc_range(Range&: *VPBB)) { |
1075 | simplifyRecipe(R, TypeInfo); |
1076 | } |
1077 | } |
1078 | } |
1079 | |
1080 | void VPlanTransforms::truncateToMinimalBitwidths( |
1081 | VPlan &Plan, const MapVector<Instruction *, uint64_t> &MinBWs, |
1082 | LLVMContext &Ctx) { |
1083 | #ifndef NDEBUG |
1084 | // Count the processed recipes and cross check the count later with MinBWs |
1085 | // size, to make sure all entries in MinBWs have been handled. |
1086 | unsigned NumProcessedRecipes = 0; |
1087 | #endif |
1088 | // Keep track of created truncates, so they can be re-used. Note that we |
  // cannot use RAUW after creating a new truncate, as this could make
1090 | // other uses have different types for their operands, making them invalidly |
1091 | // typed. |
1092 | DenseMap<VPValue *, VPWidenCastRecipe *> ProcessedTruncs; |
1093 | VPTypeAnalysis TypeInfo(Plan.getCanonicalIV()->getScalarType(), Ctx); |
1094 | VPBasicBlock *PH = Plan.getEntry(); |
1095 | for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>( |
1096 | Range: vp_depth_first_deep(G: Plan.getVectorLoopRegion()))) { |
1097 | for (VPRecipeBase &R : make_early_inc_range(Range&: *VPBB)) { |
1098 | if (!isa<VPWidenRecipe, VPWidenCastRecipe, VPReplicateRecipe, |
1099 | VPWidenSelectRecipe, VPWidenLoadRecipe>(Val: &R)) |
1100 | continue; |
1101 | |
1102 | VPValue *ResultVPV = R.getVPSingleValue(); |
1103 | auto *UI = cast_or_null<Instruction>(Val: ResultVPV->getUnderlyingValue()); |
1104 | unsigned NewResSizeInBits = MinBWs.lookup(Key: UI); |
1105 | if (!NewResSizeInBits) |
1106 | continue; |
1107 | |
1108 | #ifndef NDEBUG |
1109 | NumProcessedRecipes++; |
1110 | #endif |
1111 | // If the value wasn't vectorized, we must maintain the original scalar |
1112 | // type. Skip those here, after incrementing NumProcessedRecipes. Also |
1113 | // skip casts which do not need to be handled explicitly here, as |
1114 | // redundant casts will be removed during recipe simplification. |
1115 | if (isa<VPReplicateRecipe, VPWidenCastRecipe>(Val: &R)) { |
1116 | #ifndef NDEBUG |
1117 | // If any of the operands is a live-in and not used by VPWidenRecipe or |
1118 | // VPWidenSelectRecipe, but in MinBWs, make sure it is counted as |
1119 | // processed as well. When MinBWs is currently constructed, there is no |
1120 | // information about whether recipes are widened or replicated and in |
        // case they are replicated the operands are not truncated. Counting
        // them here ensures we do not miss any recipes in MinBWs.
1123 | // TODO: Remove once the analysis is done on VPlan. |
1124 | for (VPValue *Op : R.operands()) { |
1125 | if (!Op->isLiveIn()) |
1126 | continue; |
1127 | auto *UV = dyn_cast_or_null<Instruction>(Op->getUnderlyingValue()); |
1128 | if (UV && MinBWs.contains(UV) && !ProcessedTruncs.contains(Op) && |
1129 | all_of(Op->users(), [](VPUser *U) { |
1130 | return !isa<VPWidenRecipe, VPWidenSelectRecipe>(U); |
1131 | })) { |
1132 | // Add an entry to ProcessedTruncs to avoid counting the same |
1133 | // operand multiple times. |
1134 | ProcessedTruncs[Op] = nullptr; |
1135 | NumProcessedRecipes += 1; |
1136 | } |
1137 | } |
1138 | #endif |
1139 | continue; |
1140 | } |
1141 | |
1142 | Type *OldResTy = TypeInfo.inferScalarType(V: ResultVPV); |
1143 | unsigned OldResSizeInBits = OldResTy->getScalarSizeInBits(); |
1144 | assert(OldResTy->isIntegerTy() && "only integer types supported" ); |
1145 | (void)OldResSizeInBits; |
1146 | |
1147 | auto *NewResTy = IntegerType::get(C&: Ctx, NumBits: NewResSizeInBits); |
1148 | |
1149 | // Any wrapping introduced by shrinking this operation shouldn't be |
1150 | // considered undefined behavior. So, we can't unconditionally copy |
1151 | // arithmetic wrapping flags to VPW. |
1152 | if (auto *VPW = dyn_cast<VPRecipeWithIRFlags>(Val: &R)) |
1153 | VPW->dropPoisonGeneratingFlags(); |
1154 | |
1155 | using namespace llvm::VPlanPatternMatch; |
1156 | if (OldResSizeInBits != NewResSizeInBits && |
1157 | !match(V: &R, P: m_Binary<Instruction::ICmp>(Op0: m_VPValue(), Op1: m_VPValue()))) { |
1158 | // Extend result to original width. |
1159 | auto *Ext = |
1160 | new VPWidenCastRecipe(Instruction::ZExt, ResultVPV, OldResTy); |
1161 | Ext->insertAfter(InsertPos: &R); |
1162 | ResultVPV->replaceAllUsesWith(New: Ext); |
1163 | Ext->setOperand(I: 0, New: ResultVPV); |
1164 | assert(OldResSizeInBits > NewResSizeInBits && "Nothing to shrink?" ); |
1165 | } else |
1166 | assert( |
1167 | match(&R, m_Binary<Instruction::ICmp>(m_VPValue(), m_VPValue())) && |
1168 | "Only ICmps should not need extending the result." ); |
1169 | |
1170 | assert(!isa<VPWidenStoreRecipe>(&R) && "stores cannot be narrowed" ); |
1171 | if (isa<VPWidenLoadRecipe>(Val: &R)) |
1172 | continue; |
1173 | |
1174 | // Shrink operands by introducing truncates as needed. |
1175 | unsigned StartIdx = isa<VPWidenSelectRecipe>(Val: &R) ? 1 : 0; |
1176 | for (unsigned Idx = StartIdx; Idx != R.getNumOperands(); ++Idx) { |
1177 | auto *Op = R.getOperand(N: Idx); |
1178 | unsigned OpSizeInBits = |
1179 | TypeInfo.inferScalarType(V: Op)->getScalarSizeInBits(); |
1180 | if (OpSizeInBits == NewResSizeInBits) |
1181 | continue; |
1182 | assert(OpSizeInBits > NewResSizeInBits && "nothing to truncate" ); |
1183 | auto [ProcessedIter, IterIsEmpty] = |
1184 | ProcessedTruncs.insert(KV: {Op, nullptr}); |
1185 | VPWidenCastRecipe *NewOp = |
1186 | IterIsEmpty |
1187 | ? new VPWidenCastRecipe(Instruction::Trunc, Op, NewResTy) |
1188 | : ProcessedIter->second; |
1189 | R.setOperand(I: Idx, New: NewOp); |
1190 | if (!IterIsEmpty) |
1191 | continue; |
1192 | ProcessedIter->second = NewOp; |
1193 | if (!Op->isLiveIn()) { |
1194 | NewOp->insertBefore(InsertPos: &R); |
1195 | } else { |
1196 | PH->appendRecipe(Recipe: NewOp); |
1197 | #ifndef NDEBUG |
1198 | auto *OpInst = dyn_cast<Instruction>(Op->getLiveInIRValue()); |
1199 | bool IsContained = MinBWs.contains(OpInst); |
1200 | NumProcessedRecipes += IsContained; |
1201 | #endif |
1202 | } |
1203 | } |
1204 | |
1205 | } |
1206 | } |
1207 | |
1208 | assert(MinBWs.size() == NumProcessedRecipes && |
1209 | "some entries in MinBWs haven't been processed" ); |
1210 | } |
1211 | |
1212 | void VPlanTransforms::optimize(VPlan &Plan, ScalarEvolution &SE) { |
1213 | removeRedundantCanonicalIVs(Plan); |
1214 | removeRedundantInductionCasts(Plan); |
1215 | |
1216 | simplifyRecipes(Plan, Ctx&: SE.getContext()); |
1217 | legalizeAndOptimizeInductions(Plan, SE); |
1218 | removeDeadRecipes(Plan); |
1219 | |
1220 | createAndOptimizeReplicateRegions(Plan); |
1221 | |
1222 | removeRedundantExpandSCEVRecipes(Plan); |
1223 | mergeBlocksIntoPredecessors(Plan); |
1224 | } |
1225 | |
1226 | // Add a VPActiveLaneMaskPHIRecipe and related recipes to \p Plan and replace |
1227 | // the loop terminator with a branch-on-cond recipe with the negated |
1228 | // active-lane-mask as operand. Note that this turns the loop into an |
1229 | // uncountable one. Only the existing terminator is replaced, all other existing |
1230 | // recipes/users remain unchanged, except for poison-generating flags being |
1231 | // dropped from the canonical IV increment. Return the created |
1232 | // VPActiveLaneMaskPHIRecipe. |
1233 | // |
1234 | // The function uses the following definitions: |
1235 | // |
1236 | // %TripCount = DataWithControlFlowWithoutRuntimeCheck ? |
1237 | // calculate-trip-count-minus-VF (original TC) : original TC |
1238 | // %IncrementValue = DataWithControlFlowWithoutRuntimeCheck ? |
1239 | // CanonicalIVPhi : CanonicalIVIncrement |
1240 | // %StartV is the canonical induction start value. |
1241 | // |
1242 | // The function adds the following recipes: |
1243 | // |
1244 | // vector.ph: |
1245 | // %TripCount = calculate-trip-count-minus-VF (original TC) |
1246 | // [if DataWithControlFlowWithoutRuntimeCheck] |
1247 | // %EntryInc = canonical-iv-increment-for-part %StartV |
1248 | // %EntryALM = active-lane-mask %EntryInc, %TripCount |
1249 | // |
1250 | // vector.body: |
1251 | // ... |
1252 | // %P = active-lane-mask-phi [ %EntryALM, %vector.ph ], [ %ALM, %vector.body ] |
1253 | // ... |
1254 | // %InLoopInc = canonical-iv-increment-for-part %IncrementValue |
1255 | // %ALM = active-lane-mask %InLoopInc, TripCount |
1256 | // %Negated = Not %ALM |
1257 | // branch-on-cond %Negated |
1258 | // |
1259 | static VPActiveLaneMaskPHIRecipe *addVPLaneMaskPhiAndUpdateExitBranch( |
1260 | VPlan &Plan, bool DataAndControlFlowWithoutRuntimeCheck) { |
1261 | VPRegionBlock *TopRegion = Plan.getVectorLoopRegion(); |
1262 | VPBasicBlock *EB = TopRegion->getExitingBasicBlock(); |
1263 | auto *CanonicalIVPHI = Plan.getCanonicalIV(); |
1264 | VPValue *StartV = CanonicalIVPHI->getStartValue(); |
1265 | |
1266 | auto *CanonicalIVIncrement = |
1267 | cast<VPInstruction>(Val: CanonicalIVPHI->getBackedgeValue()); |
1268 | // TODO: Check if dropping the flags is needed if |
1269 | // !DataAndControlFlowWithoutRuntimeCheck. |
1270 | CanonicalIVIncrement->dropPoisonGeneratingFlags(); |
1271 | DebugLoc DL = CanonicalIVIncrement->getDebugLoc(); |
1272 | // We can't use StartV directly in the ActiveLaneMask VPInstruction, since |
1273 | // we have to take unrolling into account. Each part needs to start at |
1274 | // Part * VF |
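// (for example, with UF 2: part 0 starts at StartV and part 1 at StartV + VF).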
auto *VecPreheader = cast<VPBasicBlock>(Val: TopRegion->getSinglePredecessor());
1276 | VPBuilder Builder(VecPreheader); |
1277 | |
1278 | // Create the ActiveLaneMask instruction using the correct start values. |
1279 | VPValue *TC = Plan.getTripCount(); |
1280 | |
1281 | VPValue *TripCount, *IncrementValue; |
1282 | if (!DataAndControlFlowWithoutRuntimeCheck) { |
1283 | // When the loop is guarded by a runtime overflow check for the loop |
1284 | // induction variable increment by VF, we can increment the value before |
// the get.active.lane.mask intrinsic and use the unmodified trip count.
1286 | IncrementValue = CanonicalIVIncrement; |
1287 | TripCount = TC; |
1288 | } else { |
1289 | // When avoiding a runtime check, the active.lane.mask inside the loop |
1290 | // uses a modified trip count and the induction variable increment is |
1291 | // done after the active.lane.mask intrinsic is called. |
1292 | IncrementValue = CanonicalIVPHI; |
1293 | TripCount = Builder.createNaryOp(Opcode: VPInstruction::CalculateTripCountMinusVF, |
1294 | Operands: {TC}, DL); |
1295 | } |
1296 | auto *EntryIncrement = Builder.createOverflowingOp( |
1297 | Opcode: VPInstruction::CanonicalIVIncrementForPart, Operands: {StartV}, WrapFlags: {false, false}, DL, |
1298 | Name: "index.part.next" ); |
1299 | |
1300 | // Create the active lane mask instruction in the VPlan preheader. |
1301 | auto *EntryALM = |
1302 | Builder.createNaryOp(Opcode: VPInstruction::ActiveLaneMask, Operands: {EntryIncrement, TC}, |
1303 | DL, Name: "active.lane.mask.entry" ); |
1304 | |
1305 | // Now create the ActiveLaneMaskPhi recipe in the main loop using the |
1306 | // preheader ActiveLaneMask instruction. |
auto *LaneMaskPhi = new VPActiveLaneMaskPHIRecipe(EntryALM, DebugLoc());
1308 | LaneMaskPhi->insertAfter(InsertPos: CanonicalIVPHI); |
1309 | |
1310 | // Create the active lane mask for the next iteration of the loop before the |
1311 | // original terminator. |
1312 | VPRecipeBase *OriginalTerminator = EB->getTerminator(); |
1313 | Builder.setInsertPoint(OriginalTerminator); |
1314 | auto *InLoopIncrement = |
1315 | Builder.createOverflowingOp(Opcode: VPInstruction::CanonicalIVIncrementForPart, |
1316 | Operands: {IncrementValue}, WrapFlags: {false, false}, DL); |
1317 | auto *ALM = Builder.createNaryOp(Opcode: VPInstruction::ActiveLaneMask, |
1318 | Operands: {InLoopIncrement, TripCount}, DL, |
1319 | Name: "active.lane.mask.next" ); |
1320 | LaneMaskPhi->addOperand(Operand: ALM); |
1321 | |
1322 | // Replace the original terminator with BranchOnCond. We have to invert the |
1323 | // mask here because a true condition means jumping to the exit block. |
1324 | auto *NotMask = Builder.createNot(Operand: ALM, DL); |
1325 | Builder.createNaryOp(Opcode: VPInstruction::BranchOnCond, Operands: {NotMask}, DL); |
1326 | OriginalTerminator->eraseFromParent(); |
1327 | return LaneMaskPhi; |
1328 | } |
1329 | |
1330 | /// Collect all VPValues representing a header mask through the (ICMP_ULE, |
1331 | /// WideCanonicalIV, backedge-taken-count) pattern. |
1332 | /// TODO: Introduce explicit recipe for header-mask instead of searching |
1333 | /// for the header-mask pattern manually. |
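/// A collected mask has, e.g., the following shape (illustrative value names):
///   %header.mask = icmp ule %wide.canonical.iv, %backedge.taken.count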
static SmallVector<VPValue *> collectAllHeaderMasks(VPlan &Plan) {
1335 | SmallVector<VPValue *> WideCanonicalIVs; |
1336 | auto *FoundWidenCanonicalIVUser = |
1337 | find_if(Range: Plan.getCanonicalIV()->users(), |
1338 | P: [](VPUser *U) { return isa<VPWidenCanonicalIVRecipe>(Val: U); }); |
1339 | assert(count_if(Plan.getCanonicalIV()->users(), |
1340 | [](VPUser *U) { return isa<VPWidenCanonicalIVRecipe>(U); }) <= |
1341 | 1 && |
1342 | "Must have at most one VPWideCanonicalIVRecipe" ); |
1343 | if (FoundWidenCanonicalIVUser != Plan.getCanonicalIV()->users().end()) { |
1344 | auto *WideCanonicalIV = |
1345 | cast<VPWidenCanonicalIVRecipe>(Val: *FoundWidenCanonicalIVUser); |
1346 | WideCanonicalIVs.push_back(Elt: WideCanonicalIV); |
1347 | } |
1348 | |
1349 | // Also include VPWidenIntOrFpInductionRecipes that represent a widened |
1350 | // version of the canonical induction. |
VPBasicBlock *HeaderVPBB = Plan.getVectorLoopRegion()->getEntryBasicBlock();
1352 | for (VPRecipeBase &Phi : HeaderVPBB->phis()) { |
1353 | auto *WidenOriginalIV = dyn_cast<VPWidenIntOrFpInductionRecipe>(Val: &Phi); |
1354 | if (WidenOriginalIV && WidenOriginalIV->isCanonical()) |
1355 | WideCanonicalIVs.push_back(Elt: WidenOriginalIV); |
1356 | } |
1357 | |
// Walk users of wide canonical IVs and collect all compares of the form
// (ICMP_ULE, WideCanonicalIV, backedge-taken-count).
SmallVector<VPValue *> HeaderMasks;
1361 | for (auto *Wide : WideCanonicalIVs) { |
1362 | for (VPUser *U : SmallVector<VPUser *>(Wide->users())) { |
auto *HeaderMask = dyn_cast<VPInstruction>(Val: U);
1364 | if (!HeaderMask || !vputils::isHeaderMask(V: HeaderMask, Plan)) |
1365 | continue; |
1366 | |
1367 | assert(HeaderMask->getOperand(0) == Wide && |
1368 | "WidenCanonicalIV must be the first operand of the compare" ); |
1369 | HeaderMasks.push_back(Elt: HeaderMask); |
1370 | } |
1371 | } |
1372 | return HeaderMasks; |
1373 | } |
1374 | |
1375 | void VPlanTransforms::addActiveLaneMask( |
1376 | VPlan &Plan, bool UseActiveLaneMaskForControlFlow, |
1377 | bool DataAndControlFlowWithoutRuntimeCheck) { |
1378 | assert((!DataAndControlFlowWithoutRuntimeCheck || |
1379 | UseActiveLaneMaskForControlFlow) && |
1380 | "DataAndControlFlowWithoutRuntimeCheck implies " |
1381 | "UseActiveLaneMaskForControlFlow" ); |
1382 | |
1383 | auto FoundWidenCanonicalIVUser = |
1384 | find_if(Range: Plan.getCanonicalIV()->users(), |
1385 | P: [](VPUser *U) { return isa<VPWidenCanonicalIVRecipe>(Val: U); }); |
1386 | assert(FoundWidenCanonicalIVUser && |
1387 | "Must have widened canonical IV when tail folding!" ); |
1388 | auto *WideCanonicalIV = |
1389 | cast<VPWidenCanonicalIVRecipe>(Val: *FoundWidenCanonicalIVUser); |
1390 | VPSingleDefRecipe *LaneMask; |
1391 | if (UseActiveLaneMaskForControlFlow) { |
1392 | LaneMask = addVPLaneMaskPhiAndUpdateExitBranch( |
1393 | Plan, DataAndControlFlowWithoutRuntimeCheck); |
1394 | } else { |
1395 | VPBuilder B = VPBuilder::getToInsertAfter(R: WideCanonicalIV); |
1396 | LaneMask = B.createNaryOp(Opcode: VPInstruction::ActiveLaneMask, |
1397 | Operands: {WideCanonicalIV, Plan.getTripCount()}, Inst: nullptr, |
1398 | Name: "active.lane.mask" ); |
1399 | } |
1400 | |
1401 | // Walk users of WideCanonicalIV and replace all compares of the form |
1402 | // (ICMP_ULE, WideCanonicalIV, backedge-taken-count) with an |
1403 | // active-lane-mask. |
for (VPValue *HeaderMask : collectAllHeaderMasks(Plan))
1405 | HeaderMask->replaceAllUsesWith(New: LaneMask); |
1406 | } |
1407 | |
1408 | /// Add a VPEVLBasedIVPHIRecipe and related recipes to \p Plan and |
/// replace all uses of VPCanonicalIVPHIRecipe, except for the canonical IV
/// increment, with a VPEVLBasedIVPHIRecipe. After this transformation, the
/// VPCanonicalIVPHIRecipe is only used to count the loop iterations.
1412 | /// |
1413 | /// The function uses the following definitions: |
1414 | /// %StartV is the canonical induction start value. |
1415 | /// |
1416 | /// The function adds the following recipes: |
1417 | /// |
1418 | /// vector.ph: |
1419 | /// ... |
1420 | /// |
1421 | /// vector.body: |
1422 | /// ... |
1423 | /// %EVLPhi = EXPLICIT-VECTOR-LENGTH-BASED-IV-PHI [ %StartV, %vector.ph ], |
1424 | /// [ %NextEVLIV, %vector.body ] |
1425 | /// %VPEVL = EXPLICIT-VECTOR-LENGTH %EVLPhi, original TC |
1426 | /// ... |
/// %NextEVLIV = add IVSize (cast i32 %VPEVL to IVSize), %EVLPhi
1428 | /// ... |
1429 | /// |
1430 | bool VPlanTransforms::tryAddExplicitVectorLength(VPlan &Plan) { |
VPBasicBlock *Header = Plan.getVectorLoopRegion()->getEntryBasicBlock();
1432 | // The transform updates all users of inductions to work based on EVL, instead |
1433 | // of the VF directly. At the moment, widened inductions cannot be updated, so |
1434 | // bail out if the plan contains any. |
1435 | bool ContainsWidenInductions = any_of(Range: Header->phis(), P: [](VPRecipeBase &Phi) { |
1436 | return isa<VPWidenIntOrFpInductionRecipe, VPWidenPointerInductionRecipe>( |
1437 | Val: &Phi); |
1438 | }); |
1439 | // FIXME: Remove this once we can transform (select header_mask, true_value, |
1440 | // false_value) into vp.merge. |
1441 | bool ContainsOutloopReductions = |
1442 | any_of(Range: Header->phis(), P: [&](VPRecipeBase &Phi) { |
1443 | auto *R = dyn_cast<VPReductionPHIRecipe>(Val: &Phi); |
1444 | return R && !R->isInLoop(); |
1445 | }); |
1446 | if (ContainsWidenInductions || ContainsOutloopReductions) |
1447 | return false; |
1448 | |
1449 | auto *CanonicalIVPHI = Plan.getCanonicalIV(); |
1450 | VPValue *StartV = CanonicalIVPHI->getStartValue(); |
1451 | |
1452 | // Create the ExplicitVectorLengthPhi recipe in the main loop. |
1453 | auto *EVLPhi = new VPEVLBasedIVPHIRecipe(StartV, DebugLoc()); |
1454 | EVLPhi->insertAfter(InsertPos: CanonicalIVPHI); |
1455 | auto *VPEVL = new VPInstruction(VPInstruction::ExplicitVectorLength, |
1456 | {EVLPhi, Plan.getTripCount()}); |
1457 | VPEVL->insertBefore(BB&: *Header, IP: Header->getFirstNonPhi()); |
1458 | |
1459 | auto *CanonicalIVIncrement = |
1460 | cast<VPInstruction>(Val: CanonicalIVPHI->getBackedgeValue()); |
1461 | VPSingleDefRecipe *OpVPEVL = VPEVL; |
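// The explicit vector length is computed as an i32 (see the sketch above), so
// cast it to the canonical IV's scalar type before adding it to the EVL-based
// IV if the sizes differ.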
1462 | if (unsigned IVSize = CanonicalIVPHI->getScalarType()->getScalarSizeInBits(); |
1463 | IVSize != 32) { |
1464 | OpVPEVL = new VPScalarCastRecipe(IVSize < 32 ? Instruction::Trunc |
1465 | : Instruction::ZExt, |
1466 | OpVPEVL, CanonicalIVPHI->getScalarType()); |
1467 | OpVPEVL->insertBefore(InsertPos: CanonicalIVIncrement); |
1468 | } |
1469 | auto *NextEVLIV = |
1470 | new VPInstruction(Instruction::Add, {OpVPEVL, EVLPhi}, |
1471 | {CanonicalIVIncrement->hasNoUnsignedWrap(), |
1472 | CanonicalIVIncrement->hasNoSignedWrap()}, |
1473 | CanonicalIVIncrement->getDebugLoc(), "index.evl.next" ); |
1474 | NextEVLIV->insertBefore(InsertPos: CanonicalIVIncrement); |
1475 | EVLPhi->addOperand(Operand: NextEVLIV); |
1476 | |
for (VPValue *HeaderMask : collectAllHeaderMasks(Plan)) {
1478 | for (VPUser *U : collectUsersRecursively(V: HeaderMask)) { |
1479 | VPRecipeBase *NewRecipe = nullptr; |
1480 | auto *CurRecipe = dyn_cast<VPRecipeBase>(Val: U); |
1481 | if (!CurRecipe) |
1482 | continue; |
1483 | |
1484 | auto GetNewMask = [&](VPValue *OrigMask) -> VPValue * { |
1485 | assert(OrigMask && "Unmasked recipe when folding tail" ); |
1486 | return HeaderMask == OrigMask ? nullptr : OrigMask; |
1487 | }; |
1488 | if (auto *MemR = dyn_cast<VPWidenMemoryRecipe>(Val: CurRecipe)) { |
1489 | VPValue *NewMask = GetNewMask(MemR->getMask()); |
1490 | if (auto *L = dyn_cast<VPWidenLoadRecipe>(Val: MemR)) |
1491 | NewRecipe = new VPWidenLoadEVLRecipe(L, VPEVL, NewMask); |
1492 | else if (auto *S = dyn_cast<VPWidenStoreRecipe>(Val: MemR)) |
1493 | NewRecipe = new VPWidenStoreEVLRecipe(S, VPEVL, NewMask); |
1494 | else |
1495 | llvm_unreachable("unsupported recipe" ); |
1496 | } else if (auto *RedR = dyn_cast<VPReductionRecipe>(Val: CurRecipe)) { |
1497 | NewRecipe = new VPReductionEVLRecipe(RedR, VPEVL, |
1498 | GetNewMask(RedR->getCondOp())); |
1499 | } |
1500 | |
1501 | if (NewRecipe) { |
1502 | [[maybe_unused]] unsigned NumDefVal = NewRecipe->getNumDefinedValues(); |
1503 | assert(NumDefVal == CurRecipe->getNumDefinedValues() && |
1504 | "New recipe must define the same number of values as the " |
1505 | "original." ); |
1506 | assert( |
1507 | NumDefVal <= 1 && |
1508 | "Only supports recipes with a single definition or without users." ); |
1509 | NewRecipe->insertBefore(InsertPos: CurRecipe); |
1510 | if (isa<VPSingleDefRecipe, VPWidenLoadEVLRecipe>(Val: NewRecipe)) { |
1511 | VPValue *CurVPV = CurRecipe->getVPSingleValue(); |
1512 | CurVPV->replaceAllUsesWith(New: NewRecipe->getVPSingleValue()); |
1513 | } |
1514 | CurRecipe->eraseFromParent(); |
1515 | } |
1516 | } |
1517 | recursivelyDeleteDeadRecipes(V: HeaderMask); |
1518 | } |
1519 | // Replace all uses of VPCanonicalIVPHIRecipe by |
1520 | // VPEVLBasedIVPHIRecipe except for the canonical IV increment. |
1521 | CanonicalIVPHI->replaceAllUsesWith(New: EVLPhi); |
1522 | CanonicalIVIncrement->setOperand(I: 0, New: CanonicalIVPHI); |
1523 | // TODO: support unroll factor > 1. |
1524 | Plan.setUF(1); |
1525 | return true; |
1526 | } |
1527 | |
1528 | void VPlanTransforms::dropPoisonGeneratingRecipes( |
1529 | VPlan &Plan, function_ref<bool(BasicBlock *)> BlockNeedsPredication) { |
1530 | // Collect recipes in the backward slice of `Root` that may generate a poison |
1531 | // value that is used after vectorization. |
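// For example, an 'inbounds' GEP feeding the address of a consecutive widened
// load that was predicated in the original loop could yield poison for lanes
// that did not execute originally; such flags are therefore dropped below.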
1532 | SmallPtrSet<VPRecipeBase *, 16> Visited; |
1533 | auto collectPoisonGeneratingInstrsInBackwardSlice([&](VPRecipeBase *Root) { |
1534 | SmallVector<VPRecipeBase *, 16> Worklist; |
1535 | Worklist.push_back(Elt: Root); |
1536 | |
1537 | // Traverse the backward slice of Root through its use-def chain. |
1538 | while (!Worklist.empty()) { |
1539 | VPRecipeBase *CurRec = Worklist.back(); |
1540 | Worklist.pop_back(); |
1541 | |
1542 | if (!Visited.insert(Ptr: CurRec).second) |
1543 | continue; |
1544 | |
1545 | // Prune search if we find another recipe generating a widen memory |
1546 | // instruction. Widen memory instructions involved in address computation |
1547 | // will lead to gather/scatter instructions, which don't need to be |
1548 | // handled. |
1549 | if (isa<VPWidenMemoryRecipe>(Val: CurRec) || isa<VPInterleaveRecipe>(Val: CurRec) || |
1550 | isa<VPScalarIVStepsRecipe>(Val: CurRec) || isa<VPHeaderPHIRecipe>(Val: CurRec)) |
1551 | continue; |
1552 | |
1553 | // This recipe contributes to the address computation of a widen |
1554 | // load/store. If the underlying instruction has poison-generating flags, |
1555 | // drop them directly. |
1556 | if (auto *RecWithFlags = dyn_cast<VPRecipeWithIRFlags>(Val: CurRec)) { |
1557 | VPValue *A, *B; |
1558 | using namespace llvm::VPlanPatternMatch; |
1559 | // Dropping disjoint from an OR may yield incorrect results, as some |
1560 | // analysis may have converted it to an Add implicitly (e.g. SCEV used |
1561 | // for dependence analysis). Instead, replace it with an equivalent Add. |
1562 | // This is possible as all users of the disjoint OR only access lanes |
1563 | // where the operands are disjoint or poison otherwise. |
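// For instance, (or disjoint %a, %b) is rewritten below into (add %a, %b),
// which produces the same value for those lanes.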
1564 | if (match(V: RecWithFlags, P: m_BinaryOr(Op0: m_VPValue(V&: A), Op1: m_VPValue(V&: B))) && |
1565 | RecWithFlags->isDisjoint()) { |
1566 | VPBuilder Builder(RecWithFlags); |
1567 | VPInstruction *New = Builder.createOverflowingOp( |
1568 | Opcode: Instruction::Add, Operands: {A, B}, WrapFlags: {false, false}, |
1569 | DL: RecWithFlags->getDebugLoc()); |
1570 | New->setUnderlyingValue(RecWithFlags->getUnderlyingValue()); |
1571 | RecWithFlags->replaceAllUsesWith(New); |
1572 | RecWithFlags->eraseFromParent(); |
1573 | CurRec = New; |
1574 | } else |
1575 | RecWithFlags->dropPoisonGeneratingFlags(); |
1576 | } else { |
1577 | Instruction *Instr = dyn_cast_or_null<Instruction>( |
1578 | Val: CurRec->getVPSingleValue()->getUnderlyingValue()); |
1579 | (void)Instr; |
1580 | assert((!Instr || !Instr->hasPoisonGeneratingFlags()) && |
1581 | "found instruction with poison generating flags not covered by " |
1582 | "VPRecipeWithIRFlags" ); |
1583 | } |
1584 | |
1585 | // Add new definitions to the worklist. |
1586 | for (VPValue *operand : CurRec->operands()) |
1587 | if (VPRecipeBase *OpDef = operand->getDefiningRecipe()) |
1588 | Worklist.push_back(Elt: OpDef); |
1589 | } |
1590 | }); |
1591 | |
1592 | // Traverse all the recipes in the VPlan and collect the poison-generating |
// recipes in the backward slice starting at the address of a
// VPWidenMemoryRecipe or VPInterleaveRecipe.
1595 | auto Iter = vp_depth_first_deep(G: Plan.getEntry()); |
1596 | for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(Range: Iter)) { |
1597 | for (VPRecipeBase &Recipe : *VPBB) { |
1598 | if (auto *WidenRec = dyn_cast<VPWidenMemoryRecipe>(Val: &Recipe)) { |
1599 | Instruction &UnderlyingInstr = WidenRec->getIngredient(); |
1600 | VPRecipeBase *AddrDef = WidenRec->getAddr()->getDefiningRecipe(); |
1601 | if (AddrDef && WidenRec->isConsecutive() && |
1602 | BlockNeedsPredication(UnderlyingInstr.getParent())) |
1603 | collectPoisonGeneratingInstrsInBackwardSlice(AddrDef); |
1604 | } else if (auto *InterleaveRec = dyn_cast<VPInterleaveRecipe>(Val: &Recipe)) { |
1605 | VPRecipeBase *AddrDef = InterleaveRec->getAddr()->getDefiningRecipe(); |
1606 | if (AddrDef) { |
1607 | // Check if any member of the interleave group needs predication. |
1608 | const InterleaveGroup<Instruction> *InterGroup = |
1609 | InterleaveRec->getInterleaveGroup(); |
1610 | bool NeedPredication = false; |
1611 | for (int I = 0, NumMembers = InterGroup->getNumMembers(); |
1612 | I < NumMembers; ++I) { |
1613 | Instruction *Member = InterGroup->getMember(Index: I); |
1614 | if (Member) |
1615 | NeedPredication |= BlockNeedsPredication(Member->getParent()); |
1616 | } |
1617 | |
1618 | if (NeedPredication) |
1619 | collectPoisonGeneratingInstrsInBackwardSlice(AddrDef); |
1620 | } |
1621 | } |
1622 | } |
1623 | } |
1624 | } |
1625 | |