1 | //===-- VPlanTransforms.cpp - Utility VPlan to VPlan transforms -----------===// |
2 | // |
3 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
4 | // See https://llvm.org/LICENSE.txt for license information. |
5 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
6 | // |
7 | //===----------------------------------------------------------------------===// |
8 | /// |
9 | /// \file |
10 | /// This file implements a set of utility VPlan to VPlan transformations. |
11 | /// |
12 | //===----------------------------------------------------------------------===// |
13 | |
14 | #include "VPlanTransforms.h" |
15 | #include "VPRecipeBuilder.h" |
16 | #include "VPlan.h" |
17 | #include "VPlanAnalysis.h" |
18 | #include "VPlanCFG.h" |
19 | #include "VPlanDominatorTree.h" |
20 | #include "VPlanHelpers.h" |
21 | #include "VPlanPatternMatch.h" |
22 | #include "VPlanUtils.h" |
23 | #include "VPlanVerifier.h" |
24 | #include "llvm/ADT/APInt.h" |
25 | #include "llvm/ADT/PostOrderIterator.h" |
26 | #include "llvm/ADT/STLExtras.h" |
27 | #include "llvm/ADT/SetVector.h" |
28 | #include "llvm/ADT/TypeSwitch.h" |
29 | #include "llvm/Analysis/IVDescriptors.h" |
30 | #include "llvm/Analysis/InstSimplifyFolder.h" |
31 | #include "llvm/Analysis/LoopInfo.h" |
32 | #include "llvm/Analysis/VectorUtils.h" |
33 | #include "llvm/IR/Intrinsics.h" |
34 | #include "llvm/IR/MDBuilder.h" |
35 | #include "llvm/IR/PatternMatch.h" |
36 | #include "llvm/Support/Casting.h" |
37 | #include "llvm/Support/TypeSize.h" |
38 | |
39 | using namespace llvm; |
40 | |
41 | bool VPlanTransforms::tryToConvertVPInstructionsToVPRecipes( |
42 | VPlanPtr &Plan, |
43 | function_ref<const InductionDescriptor *(PHINode *)> |
44 | GetIntOrFpInductionDescriptor, |
45 | ScalarEvolution &SE, const TargetLibraryInfo &TLI) { |
46 | |
47 | ReversePostOrderTraversal<VPBlockDeepTraversalWrapper<VPBlockBase *>> RPOT( |
48 | Plan->getVectorLoopRegion()); |
49 | for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(Range: RPOT)) { |
50 | // Skip blocks outside region |
51 | if (!VPBB->getParent()) |
52 | break; |
53 | VPRecipeBase *Term = VPBB->getTerminator(); |
54 | auto EndIter = Term ? Term->getIterator() : VPBB->end(); |
55 | // Introduce each ingredient into VPlan. |
56 | for (VPRecipeBase &Ingredient : |
57 | make_early_inc_range(Range: make_range(x: VPBB->begin(), y: EndIter))) { |
58 | |
59 | VPValue *VPV = Ingredient.getVPSingleValue(); |
60 | if (!VPV->getUnderlyingValue()) |
61 | continue; |
62 | |
63 | Instruction *Inst = cast<Instruction>(Val: VPV->getUnderlyingValue()); |
64 | |
65 | VPRecipeBase *NewRecipe = nullptr; |
66 | if (auto *VPPhi = dyn_cast<VPWidenPHIRecipe>(Val: &Ingredient)) { |
67 | auto *Phi = cast<PHINode>(Val: VPPhi->getUnderlyingValue()); |
68 | const auto *II = GetIntOrFpInductionDescriptor(Phi); |
69 | if (!II) |
70 | continue; |
71 | |
72 | VPValue *Start = Plan->getOrAddLiveIn(V: II->getStartValue()); |
73 | VPValue *Step = |
74 | vputils::getOrCreateVPValueForSCEVExpr(Plan&: *Plan, Expr: II->getStep(), SE); |
75 | NewRecipe = new VPWidenIntOrFpInductionRecipe( |
76 | Phi, Start, Step, &Plan->getVF(), *II, Ingredient.getDebugLoc()); |
77 | } else { |
78 | assert(isa<VPInstruction>(&Ingredient) && |
79 | "only VPInstructions expected here" ); |
80 | assert(!isa<PHINode>(Inst) && "phis should be handled above" ); |
81 | // Create VPWidenMemoryRecipe for loads and stores. |
82 | if (LoadInst *Load = dyn_cast<LoadInst>(Val: Inst)) { |
83 | NewRecipe = new VPWidenLoadRecipe( |
84 | *Load, Ingredient.getOperand(N: 0), nullptr /*Mask*/, |
85 | false /*Consecutive*/, false /*Reverse*/, VPIRMetadata(*Load), |
86 | Ingredient.getDebugLoc()); |
87 | } else if (StoreInst *Store = dyn_cast<StoreInst>(Val: Inst)) { |
88 | NewRecipe = new VPWidenStoreRecipe( |
89 | *Store, Ingredient.getOperand(N: 1), Ingredient.getOperand(N: 0), |
90 | nullptr /*Mask*/, false /*Consecutive*/, false /*Reverse*/, |
91 | VPIRMetadata(*Store), Ingredient.getDebugLoc()); |
92 | } else if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Val: Inst)) { |
93 | NewRecipe = new VPWidenGEPRecipe(GEP, Ingredient.operands()); |
94 | } else if (CallInst *CI = dyn_cast<CallInst>(Val: Inst)) { |
95 | Intrinsic::ID VectorID = getVectorIntrinsicIDForCall(CI, TLI: &TLI); |
96 | if (VectorID == Intrinsic::not_intrinsic) |
97 | return false; |
98 | NewRecipe = new VPWidenIntrinsicRecipe( |
99 | *CI, getVectorIntrinsicIDForCall(CI, TLI: &TLI), |
100 | {Ingredient.op_begin(), Ingredient.op_end() - 1}, CI->getType(), |
101 | CI->getDebugLoc()); |
102 | } else if (SelectInst *SI = dyn_cast<SelectInst>(Val: Inst)) { |
103 | NewRecipe = new VPWidenSelectRecipe(*SI, Ingredient.operands()); |
104 | } else if (auto *CI = dyn_cast<CastInst>(Val: Inst)) { |
105 | NewRecipe = new VPWidenCastRecipe( |
106 | CI->getOpcode(), Ingredient.getOperand(N: 0), CI->getType(), *CI); |
107 | } else { |
108 | NewRecipe = new VPWidenRecipe(*Inst, Ingredient.operands()); |
109 | } |
110 | } |
111 | |
112 | NewRecipe->insertBefore(InsertPos: &Ingredient); |
113 | if (NewRecipe->getNumDefinedValues() == 1) |
114 | VPV->replaceAllUsesWith(New: NewRecipe->getVPSingleValue()); |
115 | else |
        assert(NewRecipe->getNumDefinedValues() == 0 &&
               "Only recipes with zero or one defined values expected");
118 | Ingredient.eraseFromParent(); |
119 | } |
120 | } |
121 | return true; |
122 | } |
123 | |
124 | static bool sinkScalarOperands(VPlan &Plan) { |
125 | auto Iter = vp_depth_first_deep(G: Plan.getEntry()); |
126 | bool Changed = false; |
127 | // First, collect the operands of all recipes in replicate blocks as seeds for |
128 | // sinking. |
129 | SetVector<std::pair<VPBasicBlock *, VPSingleDefRecipe *>> WorkList; |
130 | for (VPRegionBlock *VPR : VPBlockUtils::blocksOnly<VPRegionBlock>(Range: Iter)) { |
131 | VPBasicBlock *EntryVPBB = VPR->getEntryBasicBlock(); |
132 | if (!VPR->isReplicator() || EntryVPBB->getSuccessors().size() != 2) |
133 | continue; |
134 | VPBasicBlock *VPBB = dyn_cast<VPBasicBlock>(Val: EntryVPBB->getSuccessors()[0]); |
135 | if (!VPBB || VPBB->getSingleSuccessor() != VPR->getExitingBasicBlock()) |
136 | continue; |
137 | for (auto &Recipe : *VPBB) { |
138 | for (VPValue *Op : Recipe.operands()) |
139 | if (auto *Def = |
140 | dyn_cast_or_null<VPSingleDefRecipe>(Val: Op->getDefiningRecipe())) |
141 | WorkList.insert(X: std::make_pair(x&: VPBB, y&: Def)); |
142 | } |
143 | } |
144 | |
145 | bool ScalarVFOnly = Plan.hasScalarVFOnly(); |
146 | // Try to sink each replicate or scalar IV steps recipe in the worklist. |
147 | for (unsigned I = 0; I != WorkList.size(); ++I) { |
148 | VPBasicBlock *SinkTo; |
149 | VPSingleDefRecipe *SinkCandidate; |
150 | std::tie(args&: SinkTo, args&: SinkCandidate) = WorkList[I]; |
151 | if (SinkCandidate->getParent() == SinkTo || |
152 | SinkCandidate->mayHaveSideEffects() || |
153 | SinkCandidate->mayReadOrWriteMemory()) |
154 | continue; |
155 | if (auto *RepR = dyn_cast<VPReplicateRecipe>(Val: SinkCandidate)) { |
156 | if (!ScalarVFOnly && RepR->isSingleScalar()) |
157 | continue; |
158 | } else if (!isa<VPScalarIVStepsRecipe>(Val: SinkCandidate)) |
159 | continue; |
160 | |
161 | bool NeedsDuplicating = false; |
    // All recipe users of the sink candidate must be in the same block SinkTo
    // or all users outside of SinkTo must be uniform-after-vectorization
    // (i.e., only the first lane is used). In the latter case, we need to
    // duplicate SinkCandidate.
166 | auto CanSinkWithUser = [SinkTo, &NeedsDuplicating, |
167 | SinkCandidate](VPUser *U) { |
168 | auto *UI = cast<VPRecipeBase>(Val: U); |
169 | if (UI->getParent() == SinkTo) |
170 | return true; |
171 | NeedsDuplicating = UI->onlyFirstLaneUsed(Op: SinkCandidate); |
172 | // We only know how to duplicate VPReplicateRecipes and |
173 | // VPScalarIVStepsRecipes for now. |
174 | return NeedsDuplicating && |
175 | isa<VPReplicateRecipe, VPScalarIVStepsRecipe>(Val: SinkCandidate); |
176 | }; |
177 | if (!all_of(Range: SinkCandidate->users(), P: CanSinkWithUser)) |
178 | continue; |
179 | |
180 | if (NeedsDuplicating) { |
181 | if (ScalarVFOnly) |
182 | continue; |
183 | VPSingleDefRecipe *Clone; |
184 | if (auto *SinkCandidateRepR = |
185 | dyn_cast<VPReplicateRecipe>(Val: SinkCandidate)) { |
186 | // TODO: Handle converting to uniform recipes as separate transform, |
187 | // then cloning should be sufficient here. |
188 | Instruction *I = SinkCandidate->getUnderlyingInstr(); |
189 | Clone = new VPReplicateRecipe(I, SinkCandidate->operands(), true, |
190 | nullptr /*Mask*/, *SinkCandidateRepR); |
191 | // TODO: add ".cloned" suffix to name of Clone's VPValue. |
192 | } else { |
193 | Clone = SinkCandidate->clone(); |
194 | } |
195 | |
196 | Clone->insertBefore(InsertPos: SinkCandidate); |
197 | SinkCandidate->replaceUsesWithIf(New: Clone, ShouldReplace: [SinkTo](VPUser &U, unsigned) { |
198 | return cast<VPRecipeBase>(Val: &U)->getParent() != SinkTo; |
199 | }); |
200 | } |
201 | SinkCandidate->moveBefore(BB&: *SinkTo, I: SinkTo->getFirstNonPhi()); |
202 | for (VPValue *Op : SinkCandidate->operands()) |
203 | if (auto *Def = |
204 | dyn_cast_or_null<VPSingleDefRecipe>(Val: Op->getDefiningRecipe())) |
205 | WorkList.insert(X: std::make_pair(x&: SinkTo, y&: Def)); |
206 | Changed = true; |
207 | } |
208 | return Changed; |
209 | } |
210 | |
211 | /// If \p R is a region with a VPBranchOnMaskRecipe in the entry block, return |
212 | /// the mask. |
static VPValue *getPredicatedMask(VPRegionBlock *R) {
214 | auto *EntryBB = dyn_cast<VPBasicBlock>(Val: R->getEntry()); |
215 | if (!EntryBB || EntryBB->size() != 1 || |
216 | !isa<VPBranchOnMaskRecipe>(Val: EntryBB->begin())) |
217 | return nullptr; |
218 | |
219 | return cast<VPBranchOnMaskRecipe>(Val: &*EntryBB->begin())->getOperand(N: 0); |
220 | } |
221 | |
222 | /// If \p R is a triangle region, return the 'then' block of the triangle. |
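/// For illustration, a triangle region has the shape below: the entry block
/// branches both to the 'then' block and directly to the merge block, and
/// 'then' falls through to the merge block.
///
///     entry
///     /   \
///   then   |
///     \   /
///     merge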
223 | static VPBasicBlock *getPredicatedThenBlock(VPRegionBlock *R) { |
224 | auto *EntryBB = cast<VPBasicBlock>(Val: R->getEntry()); |
225 | if (EntryBB->getNumSuccessors() != 2) |
226 | return nullptr; |
227 | |
228 | auto *Succ0 = dyn_cast<VPBasicBlock>(Val: EntryBB->getSuccessors()[0]); |
229 | auto *Succ1 = dyn_cast<VPBasicBlock>(Val: EntryBB->getSuccessors()[1]); |
230 | if (!Succ0 || !Succ1) |
231 | return nullptr; |
232 | |
233 | if (Succ0->getNumSuccessors() + Succ1->getNumSuccessors() != 1) |
234 | return nullptr; |
235 | if (Succ0->getSingleSuccessor() == Succ1) |
236 | return Succ0; |
237 | if (Succ1->getSingleSuccessor() == Succ0) |
238 | return Succ1; |
239 | return nullptr; |
240 | } |
241 | |
// Merge replicate regions into their successor region, if a replicate region
// is connected to a successor replicate region with the same predicate by a
// single, empty VPBasicBlock.
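// For example (masks and block names are illustrative):
//
//   region1 (mask %m)  ->  empty.block  ->  region2 (mask %m)
//
// The recipes from region1's 'then' block are moved into region2's 'then'
// block, region1's predecessors are connected to the empty block, and region1
// is disconnected from the CFG.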
245 | static bool mergeReplicateRegionsIntoSuccessors(VPlan &Plan) { |
246 | SmallPtrSet<VPRegionBlock *, 4> TransformedRegions; |
247 | |
248 | // Collect replicate regions followed by an empty block, followed by another |
249 | // replicate region with matching masks to process front. This is to avoid |
250 | // iterator invalidation issues while merging regions. |
251 | SmallVector<VPRegionBlock *, 8> WorkList; |
252 | for (VPRegionBlock *Region1 : VPBlockUtils::blocksOnly<VPRegionBlock>( |
253 | Range: vp_depth_first_deep(G: Plan.getEntry()))) { |
254 | if (!Region1->isReplicator()) |
255 | continue; |
256 | auto *MiddleBasicBlock = |
257 | dyn_cast_or_null<VPBasicBlock>(Val: Region1->getSingleSuccessor()); |
258 | if (!MiddleBasicBlock || !MiddleBasicBlock->empty()) |
259 | continue; |
260 | |
261 | auto *Region2 = |
262 | dyn_cast_or_null<VPRegionBlock>(Val: MiddleBasicBlock->getSingleSuccessor()); |
263 | if (!Region2 || !Region2->isReplicator()) |
264 | continue; |
265 | |
266 | VPValue *Mask1 = getPredicatedMask(R: Region1); |
267 | VPValue *Mask2 = getPredicatedMask(R: Region2); |
268 | if (!Mask1 || Mask1 != Mask2) |
269 | continue; |
270 | |
    assert(Mask1 && Mask2 && "both regions must have conditions");
272 | WorkList.push_back(Elt: Region1); |
273 | } |
274 | |
275 | // Move recipes from Region1 to its successor region, if both are triangles. |
276 | for (VPRegionBlock *Region1 : WorkList) { |
277 | if (TransformedRegions.contains(Ptr: Region1)) |
278 | continue; |
279 | auto *MiddleBasicBlock = cast<VPBasicBlock>(Val: Region1->getSingleSuccessor()); |
280 | auto *Region2 = cast<VPRegionBlock>(Val: MiddleBasicBlock->getSingleSuccessor()); |
281 | |
282 | VPBasicBlock *Then1 = getPredicatedThenBlock(R: Region1); |
283 | VPBasicBlock *Then2 = getPredicatedThenBlock(R: Region2); |
284 | if (!Then1 || !Then2) |
285 | continue; |
286 | |
287 | // Note: No fusion-preventing memory dependencies are expected in either |
288 | // region. Such dependencies should be rejected during earlier dependence |
289 | // checks, which guarantee accesses can be re-ordered for vectorization. |
290 | // |
291 | // Move recipes to the successor region. |
292 | for (VPRecipeBase &ToMove : make_early_inc_range(Range: reverse(C&: *Then1))) |
293 | ToMove.moveBefore(BB&: *Then2, I: Then2->getFirstNonPhi()); |
294 | |
295 | auto *Merge1 = cast<VPBasicBlock>(Val: Then1->getSingleSuccessor()); |
296 | auto *Merge2 = cast<VPBasicBlock>(Val: Then2->getSingleSuccessor()); |
297 | |
298 | // Move VPPredInstPHIRecipes from the merge block to the successor region's |
299 | // merge block. Update all users inside the successor region to use the |
300 | // original values. |
301 | for (VPRecipeBase &Phi1ToMove : make_early_inc_range(Range: reverse(C&: *Merge1))) { |
302 | VPValue *PredInst1 = |
303 | cast<VPPredInstPHIRecipe>(Val: &Phi1ToMove)->getOperand(N: 0); |
304 | VPValue *Phi1ToMoveV = Phi1ToMove.getVPSingleValue(); |
305 | Phi1ToMoveV->replaceUsesWithIf(New: PredInst1, ShouldReplace: [Then2](VPUser &U, unsigned) { |
306 | return cast<VPRecipeBase>(Val: &U)->getParent() == Then2; |
307 | }); |
308 | |
309 | // Remove phi recipes that are unused after merging the regions. |
310 | if (Phi1ToMove.getVPSingleValue()->getNumUsers() == 0) { |
311 | Phi1ToMove.eraseFromParent(); |
312 | continue; |
313 | } |
314 | Phi1ToMove.moveBefore(BB&: *Merge2, I: Merge2->begin()); |
315 | } |
316 | |
317 | // Remove the dead recipes in Region1's entry block. |
318 | for (VPRecipeBase &R : |
319 | make_early_inc_range(Range: reverse(C&: *Region1->getEntryBasicBlock()))) |
320 | R.eraseFromParent(); |
321 | |
322 | // Finally, remove the first region. |
323 | for (VPBlockBase *Pred : make_early_inc_range(Range&: Region1->getPredecessors())) { |
324 | VPBlockUtils::disconnectBlocks(From: Pred, To: Region1); |
325 | VPBlockUtils::connectBlocks(From: Pred, To: MiddleBasicBlock); |
326 | } |
327 | VPBlockUtils::disconnectBlocks(From: Region1, To: MiddleBasicBlock); |
328 | TransformedRegions.insert(Ptr: Region1); |
329 | } |
330 | |
331 | return !TransformedRegions.empty(); |
332 | } |
333 | |
334 | static VPRegionBlock *createReplicateRegion(VPReplicateRecipe *PredRecipe, |
335 | VPlan &Plan) { |
336 | Instruction *Instr = PredRecipe->getUnderlyingInstr(); |
337 | // Build the triangular if-then region. |
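  // The region built below forms a triangle (block names are derived from
  // RegionName below):
  //
  //   pred.<opcode>.entry       (VPBranchOnMaskRecipe on the block-in mask)
  //    |            \
  //    |      pred.<opcode>.if  (the replicate recipe without its mask)
  //    |            /
  //   pred.<opcode>.continue    (VPPredInstPHIRecipe, if the result has users)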
338 | std::string RegionName = (Twine("pred." ) + Instr->getOpcodeName()).str(); |
339 | assert(Instr->getParent() && "Predicated instruction not in any basic block" ); |
340 | auto *BlockInMask = PredRecipe->getMask(); |
341 | auto *MaskDef = BlockInMask->getDefiningRecipe(); |
342 | auto *BOMRecipe = new VPBranchOnMaskRecipe( |
343 | BlockInMask, MaskDef ? MaskDef->getDebugLoc() : DebugLoc()); |
344 | auto *Entry = |
345 | Plan.createVPBasicBlock(Name: Twine(RegionName) + ".entry" , Recipe: BOMRecipe); |
346 | |
347 | // Replace predicated replicate recipe with a replicate recipe without a |
348 | // mask but in the replicate region. |
349 | auto *RecipeWithoutMask = new VPReplicateRecipe( |
350 | PredRecipe->getUnderlyingInstr(), |
351 | make_range(x: PredRecipe->op_begin(), y: std::prev(x: PredRecipe->op_end())), |
352 | PredRecipe->isSingleScalar(), nullptr /*Mask*/, *PredRecipe); |
353 | auto *Pred = |
354 | Plan.createVPBasicBlock(Name: Twine(RegionName) + ".if" , Recipe: RecipeWithoutMask); |
355 | |
356 | VPPredInstPHIRecipe *PHIRecipe = nullptr; |
357 | if (PredRecipe->getNumUsers() != 0) { |
358 | PHIRecipe = new VPPredInstPHIRecipe(RecipeWithoutMask, |
359 | RecipeWithoutMask->getDebugLoc()); |
360 | PredRecipe->replaceAllUsesWith(New: PHIRecipe); |
361 | PHIRecipe->setOperand(I: 0, New: RecipeWithoutMask); |
362 | } |
363 | PredRecipe->eraseFromParent(); |
364 | auto *Exiting = |
365 | Plan.createVPBasicBlock(Name: Twine(RegionName) + ".continue" , Recipe: PHIRecipe); |
366 | VPRegionBlock *Region = |
367 | Plan.createVPRegionBlock(Entry, Exiting, Name: RegionName, IsReplicator: true); |
368 | |
369 | // Note: first set Entry as region entry and then connect successors starting |
370 | // from it in order, to propagate the "parent" of each VPBasicBlock. |
371 | VPBlockUtils::insertTwoBlocksAfter(IfTrue: Pred, IfFalse: Exiting, BlockPtr: Entry); |
372 | VPBlockUtils::connectBlocks(From: Pred, To: Exiting); |
373 | |
374 | return Region; |
375 | } |
376 | |
377 | static void addReplicateRegions(VPlan &Plan) { |
378 | SmallVector<VPReplicateRecipe *> WorkList; |
379 | for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>( |
380 | Range: vp_depth_first_deep(G: Plan.getEntry()))) { |
381 | for (VPRecipeBase &R : *VPBB) |
382 | if (auto *RepR = dyn_cast<VPReplicateRecipe>(Val: &R)) { |
383 | if (RepR->isPredicated()) |
384 | WorkList.push_back(Elt: RepR); |
385 | } |
386 | } |
387 | |
388 | unsigned BBNum = 0; |
389 | for (VPReplicateRecipe *RepR : WorkList) { |
390 | VPBasicBlock *CurrentBlock = RepR->getParent(); |
391 | VPBasicBlock *SplitBlock = CurrentBlock->splitAt(SplitAt: RepR->getIterator()); |
392 | |
393 | BasicBlock *OrigBB = RepR->getUnderlyingInstr()->getParent(); |
394 | SplitBlock->setName( |
395 | OrigBB->hasName() ? OrigBB->getName() + "." + Twine(BBNum++) : "" ); |
396 | // Record predicated instructions for above packing optimizations. |
397 | VPRegionBlock *Region = createReplicateRegion(PredRecipe: RepR, Plan); |
398 | Region->setParent(CurrentBlock->getParent()); |
399 | VPBlockUtils::insertOnEdge(From: CurrentBlock, To: SplitBlock, BlockPtr: Region); |
400 | |
401 | VPRegionBlock *ParentRegion = Region->getParent(); |
402 | if (ParentRegion && ParentRegion->getExiting() == CurrentBlock) |
403 | ParentRegion->setExiting(SplitBlock); |
404 | } |
405 | } |
406 | |
407 | /// Remove redundant VPBasicBlocks by merging them into their predecessor if |
408 | /// the predecessor has a single successor. |
409 | static bool mergeBlocksIntoPredecessors(VPlan &Plan) { |
410 | SmallVector<VPBasicBlock *> WorkList; |
411 | for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>( |
412 | Range: vp_depth_first_deep(G: Plan.getEntry()))) { |
413 | // Don't fold the blocks in the skeleton of the Plan into their single |
414 | // predecessors for now. |
415 | // TODO: Remove restriction once more of the skeleton is modeled in VPlan. |
416 | if (!VPBB->getParent()) |
417 | continue; |
418 | auto *PredVPBB = |
419 | dyn_cast_or_null<VPBasicBlock>(Val: VPBB->getSinglePredecessor()); |
420 | if (!PredVPBB || PredVPBB->getNumSuccessors() != 1 || |
421 | isa<VPIRBasicBlock>(Val: PredVPBB)) |
422 | continue; |
423 | WorkList.push_back(Elt: VPBB); |
424 | } |
425 | |
426 | for (VPBasicBlock *VPBB : WorkList) { |
427 | VPBasicBlock *PredVPBB = cast<VPBasicBlock>(Val: VPBB->getSinglePredecessor()); |
428 | for (VPRecipeBase &R : make_early_inc_range(Range&: *VPBB)) |
429 | R.moveBefore(BB&: *PredVPBB, I: PredVPBB->end()); |
430 | VPBlockUtils::disconnectBlocks(From: PredVPBB, To: VPBB); |
431 | auto *ParentRegion = VPBB->getParent(); |
432 | if (ParentRegion && ParentRegion->getExiting() == VPBB) |
433 | ParentRegion->setExiting(PredVPBB); |
434 | for (auto *Succ : to_vector(Range: VPBB->successors())) { |
435 | VPBlockUtils::disconnectBlocks(From: VPBB, To: Succ); |
436 | VPBlockUtils::connectBlocks(From: PredVPBB, To: Succ); |
437 | } |
438 | // VPBB is now dead and will be cleaned up when the plan gets destroyed. |
439 | } |
440 | return !WorkList.empty(); |
441 | } |
442 | |
443 | void VPlanTransforms::createAndOptimizeReplicateRegions(VPlan &Plan) { |
444 | // Convert masked VPReplicateRecipes to if-then region blocks. |
445 | addReplicateRegions(Plan); |
446 | |
447 | bool ShouldSimplify = true; |
448 | while (ShouldSimplify) { |
449 | ShouldSimplify = sinkScalarOperands(Plan); |
450 | ShouldSimplify |= mergeReplicateRegionsIntoSuccessors(Plan); |
451 | ShouldSimplify |= mergeBlocksIntoPredecessors(Plan); |
452 | } |
453 | } |
454 | |
455 | /// Remove redundant casts of inductions. |
456 | /// |
457 | /// Such redundant casts are casts of induction variables that can be ignored, |
458 | /// because we already proved that the casted phi is equal to the uncasted phi |
459 | /// in the vectorized loop. There is no need to vectorize the cast - the same |
460 | /// value can be used for both the phi and casts in the vector loop. |
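/// For example (IR names are illustrative), given a widened induction %iv with
/// a recorded cast chain
///   %trunc = trunc i64 %iv to i32
///   %ext   = sext i32 %trunc to i64
/// all users of the final cast %ext are redirected to %iv itself; the cast
/// recipes become dead and are cleaned up later.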
461 | static void removeRedundantInductionCasts(VPlan &Plan) { |
462 | for (auto &Phi : Plan.getVectorLoopRegion()->getEntryBasicBlock()->phis()) { |
463 | auto *IV = dyn_cast<VPWidenIntOrFpInductionRecipe>(Val: &Phi); |
464 | if (!IV || IV->getTruncInst()) |
465 | continue; |
466 | |
467 | // A sequence of IR Casts has potentially been recorded for IV, which |
468 | // *must be bypassed* when the IV is vectorized, because the vectorized IV |
469 | // will produce the desired casted value. This sequence forms a def-use |
470 | // chain and is provided in reverse order, ending with the cast that uses |
471 | // the IV phi. Search for the recipe of the last cast in the chain and |
472 | // replace it with the original IV. Note that only the final cast is |
473 | // expected to have users outside the cast-chain and the dead casts left |
474 | // over will be cleaned up later. |
475 | auto &Casts = IV->getInductionDescriptor().getCastInsts(); |
476 | VPValue *FindMyCast = IV; |
477 | for (Instruction *IRCast : reverse(C: Casts)) { |
478 | VPSingleDefRecipe *FoundUserCast = nullptr; |
479 | for (auto *U : FindMyCast->users()) { |
480 | auto *UserCast = dyn_cast<VPSingleDefRecipe>(Val: U); |
481 | if (UserCast && UserCast->getUnderlyingValue() == IRCast) { |
482 | FoundUserCast = UserCast; |
483 | break; |
484 | } |
485 | } |
486 | FindMyCast = FoundUserCast; |
487 | } |
488 | FindMyCast->replaceAllUsesWith(New: IV); |
489 | } |
490 | } |
491 | |
492 | /// Try to replace VPWidenCanonicalIVRecipes with a widened canonical IV |
493 | /// recipe, if it exists. |
494 | static void removeRedundantCanonicalIVs(VPlan &Plan) { |
495 | VPCanonicalIVPHIRecipe *CanonicalIV = Plan.getCanonicalIV(); |
496 | VPWidenCanonicalIVRecipe *WidenNewIV = nullptr; |
497 | for (VPUser *U : CanonicalIV->users()) { |
498 | WidenNewIV = dyn_cast<VPWidenCanonicalIVRecipe>(Val: U); |
499 | if (WidenNewIV) |
500 | break; |
501 | } |
502 | |
503 | if (!WidenNewIV) |
504 | return; |
505 | |
  VPBasicBlock *HeaderVPBB = Plan.getVectorLoopRegion()->getEntryBasicBlock();
507 | for (VPRecipeBase &Phi : HeaderVPBB->phis()) { |
508 | auto *WidenOriginalIV = dyn_cast<VPWidenIntOrFpInductionRecipe>(Val: &Phi); |
509 | |
510 | if (!WidenOriginalIV || !WidenOriginalIV->isCanonical()) |
511 | continue; |
512 | |
513 | // Replace WidenNewIV with WidenOriginalIV if WidenOriginalIV provides |
514 | // everything WidenNewIV's users need. That is, WidenOriginalIV will |
515 | // generate a vector phi or all users of WidenNewIV demand the first lane |
516 | // only. |
517 | if (any_of(Range: WidenOriginalIV->users(), |
518 | P: [WidenOriginalIV](VPUser *U) { |
519 | return !U->usesScalars(Op: WidenOriginalIV); |
520 | }) || |
521 | vputils::onlyFirstLaneUsed(Def: WidenNewIV)) { |
522 | WidenNewIV->replaceAllUsesWith(New: WidenOriginalIV); |
523 | WidenNewIV->eraseFromParent(); |
524 | return; |
525 | } |
526 | } |
527 | } |
528 | |
529 | /// Returns true if \p R is dead and can be removed. |
530 | static bool isDeadRecipe(VPRecipeBase &R) { |
531 | using namespace llvm::PatternMatch; |
532 | // Do remove conditional assume instructions as their conditions may be |
533 | // flattened. |
534 | auto *RepR = dyn_cast<VPReplicateRecipe>(Val: &R); |
535 | bool IsConditionalAssume = |
536 | RepR && RepR->isPredicated() && |
537 | match(V: RepR->getUnderlyingInstr(), P: m_Intrinsic<Intrinsic::assume>()); |
538 | if (IsConditionalAssume) |
539 | return true; |
540 | |
541 | if (R.mayHaveSideEffects()) |
542 | return false; |
543 | |
544 | // Recipe is dead if no user keeps the recipe alive. |
545 | return all_of(Range: R.definedValues(), |
546 | P: [](VPValue *V) { return V->getNumUsers() == 0; }); |
547 | } |
548 | |
549 | void VPlanTransforms::removeDeadRecipes(VPlan &Plan) { |
550 | ReversePostOrderTraversal<VPBlockDeepTraversalWrapper<VPBlockBase *>> RPOT( |
551 | Plan.getEntry()); |
552 | |
553 | for (VPBasicBlock *VPBB : reverse(C: VPBlockUtils::blocksOnly<VPBasicBlock>(Range: RPOT))) { |
554 | // The recipes in the block are processed in reverse order, to catch chains |
555 | // of dead recipes. |
556 | for (VPRecipeBase &R : make_early_inc_range(Range: reverse(C&: *VPBB))) { |
557 | if (isDeadRecipe(R)) |
558 | R.eraseFromParent(); |
559 | } |
560 | } |
561 | } |
562 | |
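/// Create a sequence of recipes computing the scalar steps of an induction
/// described by \p Kind, \p InductionOpcode and \p FPBinOp, starting at
/// \p StartV and advancing by \p Step, built on top of the canonical IV. If
/// \p TruncI is non-null, the derived IV is truncated to \p TruncI's type; the
/// step is truncated to the result type if needed.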
563 | static VPScalarIVStepsRecipe * |
564 | createScalarIVSteps(VPlan &Plan, InductionDescriptor::InductionKind Kind, |
565 | Instruction::BinaryOps InductionOpcode, |
566 | FPMathOperator *FPBinOp, Instruction *TruncI, |
567 | VPValue *StartV, VPValue *Step, DebugLoc DL, |
568 | VPBuilder &Builder) { |
  VPBasicBlock *HeaderVPBB = Plan.getVectorLoopRegion()->getEntryBasicBlock();
570 | VPCanonicalIVPHIRecipe *CanonicalIV = Plan.getCanonicalIV(); |
571 | VPSingleDefRecipe *BaseIV = Builder.createDerivedIV( |
572 | Kind, FPBinOp, Start: StartV, Current: CanonicalIV, Step, Name: "offset.idx" ); |
573 | |
574 | // Truncate base induction if needed. |
575 | Type *CanonicalIVType = CanonicalIV->getScalarType(); |
576 | VPTypeAnalysis TypeInfo(CanonicalIVType); |
577 | Type *ResultTy = TypeInfo.inferScalarType(V: BaseIV); |
578 | if (TruncI) { |
579 | Type *TruncTy = TruncI->getType(); |
580 | assert(ResultTy->getScalarSizeInBits() > TruncTy->getScalarSizeInBits() && |
581 | "Not truncating." ); |
582 | assert(ResultTy->isIntegerTy() && "Truncation requires an integer type" ); |
583 | BaseIV = Builder.createScalarCast(Opcode: Instruction::Trunc, Op: BaseIV, ResultTy: TruncTy, DL); |
584 | ResultTy = TruncTy; |
585 | } |
586 | |
587 | // Truncate step if needed. |
588 | Type *StepTy = TypeInfo.inferScalarType(V: Step); |
589 | if (ResultTy != StepTy) { |
590 | assert(StepTy->getScalarSizeInBits() > ResultTy->getScalarSizeInBits() && |
591 | "Not truncating." ); |
592 | assert(StepTy->isIntegerTy() && "Truncation requires an integer type" ); |
    auto *VecPreheader =
        cast<VPBasicBlock>(HeaderVPBB->getSingleHierarchicalPredecessor());
595 | VPBuilder::InsertPointGuard Guard(Builder); |
596 | Builder.setInsertPoint(VecPreheader); |
597 | Step = Builder.createScalarCast(Opcode: Instruction::Trunc, Op: Step, ResultTy, DL); |
598 | } |
599 | return Builder.createScalarIVSteps(InductionOpcode, FPBinOp, IV: BaseIV, Step, |
600 | VF: &Plan.getVF(), DL); |
601 | } |
602 | |
603 | static SmallVector<VPUser *> collectUsersRecursively(VPValue *V) { |
604 | SetVector<VPUser *> Users(llvm::from_range, V->users()); |
605 | for (unsigned I = 0; I != Users.size(); ++I) { |
606 | VPRecipeBase *Cur = cast<VPRecipeBase>(Val: Users[I]); |
607 | if (isa<VPHeaderPHIRecipe>(Val: Cur)) |
608 | continue; |
609 | for (VPValue *V : Cur->definedValues()) |
610 | Users.insert_range(R: V->users()); |
611 | } |
612 | return Users.takeVector(); |
613 | } |
614 | |
615 | /// Legalize VPWidenPointerInductionRecipe, by replacing it with a PtrAdd |
616 | /// (IndStart, ScalarIVSteps (0, Step)) if only its scalar values are used, as |
617 | /// VPWidenPointerInductionRecipe will generate vectors only. If some users |
/// require vectors while others require scalars, the scalar uses need to extract
619 | /// the scalars from the generated vectors (Note that this is different to how |
620 | /// int/fp inductions are handled). Legalize extract-from-ends using uniform |
621 | /// VPReplicateRecipe of wide inductions to use regular VPReplicateRecipe, so |
622 | /// the correct end value is available. Also optimize |
623 | /// VPWidenIntOrFpInductionRecipe, if any of its users needs scalar values, by |
624 | /// providing them scalar steps built on the canonical scalar IV and update the |
/// original IV's users. This is an optional optimization to reduce the need
/// for vector extracts.
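/// For example (names are illustrative), a pointer induction with start %ptr
/// and step %step whose users only need scalar values is rewritten to
///   %steps    = scalar-iv-steps 0, %step
///   %next.gep = ptradd %ptr, %steps
/// instead of generating a vector of pointers per iteration.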
627 | static void legalizeAndOptimizeInductions(VPlan &Plan) { |
628 | using namespace llvm::VPlanPatternMatch; |
  VPBasicBlock *HeaderVPBB = Plan.getVectorLoopRegion()->getEntryBasicBlock();
630 | bool HasOnlyVectorVFs = !Plan.hasScalarVFOnly(); |
631 | VPBuilder Builder(HeaderVPBB, HeaderVPBB->getFirstNonPhi()); |
632 | for (VPRecipeBase &Phi : HeaderVPBB->phis()) { |
633 | auto *PhiR = dyn_cast<VPWidenInductionRecipe>(Val: &Phi); |
634 | if (!PhiR) |
635 | continue; |
636 | |
637 | // Try to narrow wide and replicating recipes to uniform recipes, based on |
638 | // VPlan analysis. |
639 | // TODO: Apply to all recipes in the future, to replace legacy uniformity |
640 | // analysis. |
641 | auto Users = collectUsersRecursively(V: PhiR); |
642 | for (VPUser *U : reverse(C&: Users)) { |
643 | auto *Def = dyn_cast<VPSingleDefRecipe>(Val: U); |
644 | auto *RepR = dyn_cast<VPReplicateRecipe>(Val: U); |
645 | // Skip recipes that shouldn't be narrowed. |
646 | if (!Def || !isa<VPReplicateRecipe, VPWidenRecipe>(Val: Def) || |
647 | Def->getNumUsers() == 0 || !Def->getUnderlyingValue() || |
648 | (RepR && (RepR->isSingleScalar() || RepR->isPredicated()))) |
649 | continue; |
650 | |
      // Skip recipes that may have lanes other than the first one used.
652 | if (!vputils::isSingleScalar(VPV: Def) && !vputils::onlyFirstLaneUsed(Def)) |
653 | continue; |
654 | |
655 | auto *Clone = new VPReplicateRecipe(Def->getUnderlyingInstr(), |
656 | Def->operands(), /*IsUniform*/ true); |
657 | Clone->insertAfter(InsertPos: Def); |
658 | Def->replaceAllUsesWith(New: Clone); |
659 | } |
660 | |
661 | // Replace wide pointer inductions which have only their scalars used by |
662 | // PtrAdd(IndStart, ScalarIVSteps (0, Step)). |
663 | if (auto *PtrIV = dyn_cast<VPWidenPointerInductionRecipe>(Val: &Phi)) { |
664 | if (!PtrIV->onlyScalarsGenerated(IsScalable: Plan.hasScalableVF())) |
665 | continue; |
666 | |
667 | const InductionDescriptor &ID = PtrIV->getInductionDescriptor(); |
668 | VPValue *StartV = |
669 | Plan.getOrAddLiveIn(V: ConstantInt::get(Ty: ID.getStep()->getType(), V: 0)); |
670 | VPValue *StepV = PtrIV->getOperand(N: 1); |
671 | VPScalarIVStepsRecipe *Steps = createScalarIVSteps( |
672 | Plan, Kind: InductionDescriptor::IK_IntInduction, InductionOpcode: Instruction::Add, FPBinOp: nullptr, |
673 | TruncI: nullptr, StartV, Step: StepV, DL: PtrIV->getDebugLoc(), Builder); |
674 | |
675 | VPValue *PtrAdd = Builder.createPtrAdd(Ptr: PtrIV->getStartValue(), Offset: Steps, |
676 | DL: PtrIV->getDebugLoc(), Name: "next.gep" ); |
677 | |
678 | PtrIV->replaceAllUsesWith(New: PtrAdd); |
679 | continue; |
680 | } |
681 | |
682 | // Replace widened induction with scalar steps for users that only use |
683 | // scalars. |
684 | auto *WideIV = cast<VPWidenIntOrFpInductionRecipe>(Val: &Phi); |
685 | if (HasOnlyVectorVFs && none_of(Range: WideIV->users(), P: [WideIV](VPUser *U) { |
686 | return U->usesScalars(Op: WideIV); |
687 | })) |
688 | continue; |
689 | |
690 | const InductionDescriptor &ID = WideIV->getInductionDescriptor(); |
691 | VPScalarIVStepsRecipe *Steps = createScalarIVSteps( |
692 | Plan, Kind: ID.getKind(), InductionOpcode: ID.getInductionOpcode(), |
693 | FPBinOp: dyn_cast_or_null<FPMathOperator>(Val: ID.getInductionBinOp()), |
694 | TruncI: WideIV->getTruncInst(), StartV: WideIV->getStartValue(), Step: WideIV->getStepValue(), |
695 | DL: WideIV->getDebugLoc(), Builder); |
696 | |
697 | // Update scalar users of IV to use Step instead. |
698 | if (!HasOnlyVectorVFs) |
699 | WideIV->replaceAllUsesWith(New: Steps); |
700 | else |
701 | WideIV->replaceUsesWithIf(New: Steps, ShouldReplace: [WideIV](VPUser &U, unsigned) { |
702 | return U.usesScalars(Op: WideIV); |
703 | }); |
704 | } |
705 | } |
706 | |
707 | /// Check if \p VPV is an untruncated wide induction, either before or after the |
/// increment. If so, return the header IV (before the increment), otherwise
709 | /// return null. |
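/// For example (names are illustrative), for a wide induction %iv with step
/// %step and increment %iv.next = add %iv, %step, both %iv and %iv.next are
/// considered optimizable and the recipe defining %iv is returned.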
710 | static VPWidenInductionRecipe *getOptimizableIVOf(VPValue *VPV) { |
711 | auto *WideIV = dyn_cast<VPWidenInductionRecipe>(Val: VPV); |
712 | if (WideIV) { |
713 | // VPV itself is a wide induction, separately compute the end value for exit |
714 | // users if it is not a truncated IV. |
715 | auto *IntOrFpIV = dyn_cast<VPWidenIntOrFpInductionRecipe>(Val: WideIV); |
716 | return (IntOrFpIV && IntOrFpIV->getTruncInst()) ? nullptr : WideIV; |
717 | } |
718 | |
719 | // Check if VPV is an optimizable induction increment. |
720 | VPRecipeBase *Def = VPV->getDefiningRecipe(); |
721 | if (!Def || Def->getNumOperands() != 2) |
722 | return nullptr; |
723 | WideIV = dyn_cast<VPWidenInductionRecipe>(Val: Def->getOperand(N: 0)); |
724 | if (!WideIV) |
725 | WideIV = dyn_cast<VPWidenInductionRecipe>(Val: Def->getOperand(N: 1)); |
726 | if (!WideIV) |
727 | return nullptr; |
728 | |
729 | auto IsWideIVInc = [&]() { |
730 | using namespace VPlanPatternMatch; |
731 | auto &ID = WideIV->getInductionDescriptor(); |
732 | |
733 | // Check if VPV increments the induction by the induction step. |
734 | VPValue *IVStep = WideIV->getStepValue(); |
735 | switch (ID.getInductionOpcode()) { |
736 | case Instruction::Add: |
737 | return match(V: VPV, P: m_c_Binary<Instruction::Add>(Op0: m_Specific(VPV: WideIV), |
738 | Op1: m_Specific(VPV: IVStep))); |
739 | case Instruction::FAdd: |
740 | return match(V: VPV, P: m_c_Binary<Instruction::FAdd>(Op0: m_Specific(VPV: WideIV), |
741 | Op1: m_Specific(VPV: IVStep))); |
742 | case Instruction::FSub: |
743 | return match(V: VPV, P: m_Binary<Instruction::FSub>(Op0: m_Specific(VPV: WideIV), |
744 | Op1: m_Specific(VPV: IVStep))); |
745 | case Instruction::Sub: { |
746 | // IVStep will be the negated step of the subtraction. Check if Step == -1 |
747 | // * IVStep. |
748 | VPValue *Step; |
749 | if (!match(V: VPV, |
750 | P: m_Binary<Instruction::Sub>(Op0: m_VPValue(), Op1: m_VPValue(V&: Step))) || |
751 | !Step->isLiveIn() || !IVStep->isLiveIn()) |
752 | return false; |
753 | auto *StepCI = dyn_cast<ConstantInt>(Val: Step->getLiveInIRValue()); |
754 | auto *IVStepCI = dyn_cast<ConstantInt>(Val: IVStep->getLiveInIRValue()); |
755 | return StepCI && IVStepCI && |
756 | StepCI->getValue() == (-1 * IVStepCI->getValue()); |
757 | } |
758 | default: |
759 | return ID.getKind() == InductionDescriptor::IK_PtrInduction && |
760 | match(V: VPV, P: m_GetElementPtr(Op0: m_Specific(VPV: WideIV), |
761 | Op1: m_Specific(VPV: WideIV->getStepValue()))); |
762 | } |
763 | llvm_unreachable("should have been covered by switch above" ); |
764 | }; |
765 | return IsWideIVInc() ? WideIV : nullptr; |
766 | } |
767 | |
768 | /// Attempts to optimize the induction variable exit values for users in the |
769 | /// early exit block. |
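/// The candidate exit value is expected to look like (names are illustrative)
///   extractelement %incoming, (first-active-lane %mask)
/// with %incoming being a wide induction or its increment; it is then
/// recomputed from the canonical IV plus the first active lane, avoiding the
/// vector extract.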
770 | static VPValue *optimizeEarlyExitInductionUser(VPlan &Plan, |
771 | VPTypeAnalysis &TypeInfo, |
772 | VPBlockBase *PredVPBB, |
773 | VPValue *Op) { |
774 | using namespace VPlanPatternMatch; |
775 | |
776 | VPValue *Incoming, *Mask; |
777 | if (!match(V: Op, P: m_VPInstruction<Instruction::ExtractElement>( |
778 | Op0: m_VPValue(V&: Incoming), |
779 | Op1: m_VPInstruction<VPInstruction::FirstActiveLane>( |
780 | Op0: m_VPValue(V&: Mask))))) |
781 | return nullptr; |
782 | |
783 | auto *WideIV = getOptimizableIVOf(VPV: Incoming); |
784 | if (!WideIV) |
785 | return nullptr; |
786 | |
787 | auto *WideIntOrFp = dyn_cast<VPWidenIntOrFpInductionRecipe>(Val: WideIV); |
788 | if (WideIntOrFp && WideIntOrFp->getTruncInst()) |
789 | return nullptr; |
790 | |
791 | // Calculate the final index. |
792 | VPValue *EndValue = Plan.getCanonicalIV(); |
793 | auto CanonicalIVType = Plan.getCanonicalIV()->getScalarType(); |
794 | VPBuilder B(cast<VPBasicBlock>(Val: PredVPBB)); |
795 | |
796 | DebugLoc DL = cast<VPInstruction>(Val: Op)->getDebugLoc(); |
797 | VPValue *FirstActiveLane = |
798 | B.createNaryOp(Opcode: VPInstruction::FirstActiveLane, Operands: Mask, DL); |
799 | Type *FirstActiveLaneType = TypeInfo.inferScalarType(V: FirstActiveLane); |
800 | FirstActiveLane = B.createScalarZExtOrTrunc(Op: FirstActiveLane, ResultTy: CanonicalIVType, |
801 | SrcTy: FirstActiveLaneType, DL); |
802 | EndValue = B.createNaryOp(Opcode: Instruction::Add, Operands: {EndValue, FirstActiveLane}, DL); |
803 | |
804 | // `getOptimizableIVOf()` always returns the pre-incremented IV, so if it |
805 | // changed it means the exit is using the incremented value, so we need to |
806 | // add the step. |
807 | if (Incoming != WideIV) { |
808 | VPValue *One = Plan.getOrAddLiveIn(V: ConstantInt::get(Ty: CanonicalIVType, V: 1)); |
809 | EndValue = B.createNaryOp(Opcode: Instruction::Add, Operands: {EndValue, One}, DL); |
810 | } |
811 | |
812 | if (!WideIntOrFp || !WideIntOrFp->isCanonical()) { |
813 | const InductionDescriptor &ID = WideIV->getInductionDescriptor(); |
814 | VPValue *Start = WideIV->getStartValue(); |
815 | VPValue *Step = WideIV->getStepValue(); |
816 | EndValue = B.createDerivedIV( |
817 | Kind: ID.getKind(), FPBinOp: dyn_cast_or_null<FPMathOperator>(Val: ID.getInductionBinOp()), |
818 | Start, Current: EndValue, Step); |
819 | } |
820 | |
821 | return EndValue; |
822 | } |
823 | |
824 | /// Attempts to optimize the induction variable exit values for users in the |
825 | /// exit block coming from the latch in the original scalar loop. |
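/// The candidate exit value is expected to look like (names are illustrative)
///   extract-last-element %incoming
/// with %incoming being a wide induction or its increment; it is replaced by
/// the pre-computed end value, minus one step if the un-incremented IV is the
/// exiting value.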
826 | static VPValue * |
827 | optimizeLatchExitInductionUser(VPlan &Plan, VPTypeAnalysis &TypeInfo, |
828 | VPBlockBase *PredVPBB, VPValue *Op, |
829 | DenseMap<VPValue *, VPValue *> &EndValues) { |
830 | using namespace VPlanPatternMatch; |
831 | |
832 | VPValue *Incoming; |
833 | if (!match(V: Op, P: m_VPInstruction<VPInstruction::ExtractLastElement>( |
834 | Op0: m_VPValue(V&: Incoming)))) |
835 | return nullptr; |
836 | |
837 | auto *WideIV = getOptimizableIVOf(VPV: Incoming); |
838 | if (!WideIV) |
839 | return nullptr; |
840 | |
841 | VPValue *EndValue = EndValues.lookup(Val: WideIV); |
842 | assert(EndValue && "end value must have been pre-computed" ); |
843 | |
844 | // `getOptimizableIVOf()` always returns the pre-incremented IV, so if it |
845 | // changed it means the exit is using the incremented value, so we don't |
846 | // need to subtract the step. |
847 | if (Incoming != WideIV) |
848 | return EndValue; |
849 | |
850 | // Otherwise, subtract the step from the EndValue. |
851 | VPBuilder B(cast<VPBasicBlock>(Val: PredVPBB)->getTerminator()); |
852 | VPValue *Step = WideIV->getStepValue(); |
853 | Type *ScalarTy = TypeInfo.inferScalarType(V: WideIV); |
854 | if (ScalarTy->isIntegerTy()) |
855 | return B.createNaryOp(Opcode: Instruction::Sub, Operands: {EndValue, Step}, Inst: {}, Name: "ind.escape" ); |
856 | if (ScalarTy->isPointerTy()) { |
857 | auto *Zero = Plan.getOrAddLiveIn( |
858 | V: ConstantInt::get(Ty: Step->getLiveInIRValue()->getType(), V: 0)); |
859 | return B.createPtrAdd(Ptr: EndValue, |
860 | Offset: B.createNaryOp(Opcode: Instruction::Sub, Operands: {Zero, Step}), DL: {}, |
861 | Name: "ind.escape" ); |
862 | } |
863 | if (ScalarTy->isFloatingPointTy()) { |
864 | const auto &ID = WideIV->getInductionDescriptor(); |
865 | return B.createNaryOp( |
866 | Opcode: ID.getInductionBinOp()->getOpcode() == Instruction::FAdd |
867 | ? Instruction::FSub |
868 | : Instruction::FAdd, |
869 | Operands: {EndValue, Step}, Flags: {ID.getInductionBinOp()->getFastMathFlags()}); |
870 | } |
871 | llvm_unreachable("all possible induction types must be handled" ); |
872 | return nullptr; |
873 | } |
874 | |
875 | void VPlanTransforms::optimizeInductionExitUsers( |
876 | VPlan &Plan, DenseMap<VPValue *, VPValue *> &EndValues) { |
877 | VPBlockBase *MiddleVPBB = Plan.getMiddleBlock(); |
878 | VPTypeAnalysis TypeInfo(Plan.getCanonicalIV()->getScalarType()); |
879 | for (VPIRBasicBlock *ExitVPBB : Plan.getExitBlocks()) { |
880 | for (VPRecipeBase &R : ExitVPBB->phis()) { |
881 | auto *ExitIRI = cast<VPIRPhi>(Val: &R); |
882 | |
883 | for (auto [Idx, PredVPBB] : enumerate(First&: ExitVPBB->getPredecessors())) { |
884 | VPValue *Escape = nullptr; |
885 | if (PredVPBB == MiddleVPBB) |
886 | Escape = optimizeLatchExitInductionUser( |
887 | Plan, TypeInfo, PredVPBB, Op: ExitIRI->getOperand(N: Idx), EndValues); |
888 | else |
889 | Escape = optimizeEarlyExitInductionUser(Plan, TypeInfo, PredVPBB, |
890 | Op: ExitIRI->getOperand(N: Idx)); |
891 | if (Escape) |
892 | ExitIRI->setOperand(I: Idx, New: Escape); |
893 | } |
894 | } |
895 | } |
896 | } |
897 | |
/// Remove redundant ExpandSCEVRecipes in \p Plan's entry block by replacing
899 | /// them with already existing recipes expanding the same SCEV expression. |
900 | static void removeRedundantExpandSCEVRecipes(VPlan &Plan) { |
901 | DenseMap<const SCEV *, VPValue *> SCEV2VPV; |
902 | |
903 | for (VPRecipeBase &R : |
904 | make_early_inc_range(Range&: *Plan.getEntry()->getEntryBasicBlock())) { |
905 | auto *ExpR = dyn_cast<VPExpandSCEVRecipe>(Val: &R); |
906 | if (!ExpR) |
907 | continue; |
908 | |
909 | auto I = SCEV2VPV.insert(KV: {ExpR->getSCEV(), ExpR}); |
910 | if (I.second) |
911 | continue; |
912 | ExpR->replaceAllUsesWith(New: I.first->second); |
913 | ExpR->eraseFromParent(); |
914 | } |
915 | } |
916 | |
917 | static void recursivelyDeleteDeadRecipes(VPValue *V) { |
918 | SmallVector<VPValue *> WorkList; |
919 | SmallPtrSet<VPValue *, 8> Seen; |
920 | WorkList.push_back(Elt: V); |
921 | |
922 | while (!WorkList.empty()) { |
923 | VPValue *Cur = WorkList.pop_back_val(); |
924 | if (!Seen.insert(Ptr: Cur).second) |
925 | continue; |
926 | VPRecipeBase *R = Cur->getDefiningRecipe(); |
927 | if (!R) |
928 | continue; |
929 | if (!isDeadRecipe(R&: *R)) |
930 | continue; |
931 | WorkList.append(in_start: R->op_begin(), in_end: R->op_end()); |
932 | R->eraseFromParent(); |
933 | } |
934 | } |
935 | |
936 | /// Try to fold \p R using InstSimplifyFolder. Will succeed and return a |
937 | /// non-nullptr Value for a handled \p Opcode if corresponding \p Operands are |
938 | /// foldable live-ins. |
939 | static Value *tryToFoldLiveIns(const VPRecipeBase &R, unsigned Opcode, |
940 | ArrayRef<VPValue *> Operands, |
941 | const DataLayout &DL, VPTypeAnalysis &TypeInfo) { |
942 | SmallVector<Value *, 4> Ops; |
943 | for (VPValue *Op : Operands) { |
944 | if (!Op->isLiveIn() || !Op->getLiveInIRValue()) |
945 | return nullptr; |
946 | Ops.push_back(Elt: Op->getLiveInIRValue()); |
947 | } |
948 | |
949 | InstSimplifyFolder Folder(DL); |
950 | if (Instruction::isBinaryOp(Opcode)) |
951 | return Folder.FoldBinOp(Opc: static_cast<Instruction::BinaryOps>(Opcode), LHS: Ops[0], |
952 | RHS: Ops[1]); |
953 | if (Instruction::isCast(Opcode)) |
954 | return Folder.FoldCast(Op: static_cast<Instruction::CastOps>(Opcode), V: Ops[0], |
955 | DestTy: TypeInfo.inferScalarType(V: R.getVPSingleValue())); |
956 | switch (Opcode) { |
957 | case VPInstruction::LogicalAnd: |
958 | return Folder.FoldSelect(C: Ops[0], True: Ops[1], |
959 | False: ConstantInt::getNullValue(Ty: Ops[1]->getType())); |
960 | case VPInstruction::Not: |
961 | return Folder.FoldBinOp(Opc: Instruction::BinaryOps::Xor, LHS: Ops[0], |
962 | RHS: Constant::getAllOnesValue(Ty: Ops[0]->getType())); |
963 | case Instruction::Select: |
964 | return Folder.FoldSelect(C: Ops[0], True: Ops[1], False: Ops[2]); |
965 | case Instruction::ICmp: |
966 | case Instruction::FCmp: |
967 | return Folder.FoldCmp(P: cast<VPRecipeWithIRFlags>(Val: R).getPredicate(), LHS: Ops[0], |
968 | RHS: Ops[1]); |
969 | case Instruction::GetElementPtr: { |
970 | auto &RFlags = cast<VPRecipeWithIRFlags>(Val: R); |
971 | auto *GEP = cast<GetElementPtrInst>(Val: RFlags.getUnderlyingInstr()); |
972 | return Folder.FoldGEP(Ty: GEP->getSourceElementType(), Ptr: Ops[0], IdxList: drop_begin(RangeOrContainer&: Ops), |
973 | NW: RFlags.getGEPNoWrapFlags()); |
974 | } |
975 | case VPInstruction::PtrAdd: |
976 | return Folder.FoldGEP(Ty: IntegerType::getInt8Ty(C&: TypeInfo.getContext()), Ptr: Ops[0], |
977 | IdxList: Ops[1], |
978 | NW: cast<VPRecipeWithIRFlags>(Val: R).getGEPNoWrapFlags()); |
979 | case Instruction::InsertElement: |
980 | return Folder.FoldInsertElement(Vec: Ops[0], NewElt: Ops[1], Idx: Ops[2]); |
981 | case Instruction::ExtractElement: |
982 | return Folder.FoldExtractElement(Vec: Ops[0], Idx: Ops[1]); |
983 | } |
984 | return nullptr; |
985 | } |
986 | |
987 | /// Try to simplify recipe \p R. |
988 | static void simplifyRecipe(VPRecipeBase &R, VPTypeAnalysis &TypeInfo) { |
989 | using namespace llvm::VPlanPatternMatch; |
990 | VPlan *Plan = R.getParent()->getPlan(); |
991 | |
992 | auto *Def = dyn_cast<VPSingleDefRecipe>(Val: &R); |
993 | if (!Def) |
994 | return; |
995 | |
996 | // Simplification of live-in IR values for SingleDef recipes using |
997 | // InstSimplifyFolder. |
998 | if (TypeSwitch<VPRecipeBase *, bool>(&R) |
999 | .Case<VPInstruction, VPWidenRecipe, VPWidenCastRecipe, |
1000 | VPReplicateRecipe>(caseFn: [&](auto *I) { |
1001 | const DataLayout &DL = |
1002 | Plan->getScalarHeader()->getIRBasicBlock()->getDataLayout(); |
1003 | Value *V = tryToFoldLiveIns(*I, I->getOpcode(), I->operands(), DL, |
1004 | TypeInfo); |
1005 | if (V) |
1006 | I->replaceAllUsesWith(Plan->getOrAddLiveIn(V)); |
1007 | return V; |
1008 | }) |
1009 | .Default(defaultFn: [](auto *) { return false; })) |
1010 | return; |
1011 | |
1012 | // Fold PredPHI LiveIn -> LiveIn. |
1013 | if (auto *PredPHI = dyn_cast<VPPredInstPHIRecipe>(Val: &R)) { |
1014 | VPValue *Op = PredPHI->getOperand(N: 0); |
1015 | if (Op->isLiveIn()) |
1016 | PredPHI->replaceAllUsesWith(New: Op); |
1017 | } |
1018 | |
1019 | VPValue *A; |
1020 | if (match(V: Def, P: m_Trunc(Op0: m_ZExtOrSExt(Op0: m_VPValue(V&: A))))) { |
1021 | Type *TruncTy = TypeInfo.inferScalarType(V: Def); |
1022 | Type *ATy = TypeInfo.inferScalarType(V: A); |
1023 | if (TruncTy == ATy) { |
1024 | Def->replaceAllUsesWith(New: A); |
1025 | } else { |
1026 | // Don't replace a scalarizing recipe with a widened cast. |
1027 | if (isa<VPReplicateRecipe>(Val: Def)) |
1028 | return; |
1029 | if (ATy->getScalarSizeInBits() < TruncTy->getScalarSizeInBits()) { |
1030 | |
1031 | unsigned ExtOpcode = match(V: R.getOperand(N: 0), P: m_SExt(Op0: m_VPValue())) |
1032 | ? Instruction::SExt |
1033 | : Instruction::ZExt; |
1034 | auto *VPC = |
1035 | new VPWidenCastRecipe(Instruction::CastOps(ExtOpcode), A, TruncTy); |
1036 | if (auto *UnderlyingExt = R.getOperand(N: 0)->getUnderlyingValue()) { |
1037 | // UnderlyingExt has distinct return type, used to retain legacy cost. |
1038 | VPC->setUnderlyingValue(UnderlyingExt); |
1039 | } |
1040 | VPC->insertBefore(InsertPos: &R); |
1041 | Def->replaceAllUsesWith(New: VPC); |
1042 | } else if (ATy->getScalarSizeInBits() > TruncTy->getScalarSizeInBits()) { |
1043 | auto *VPC = new VPWidenCastRecipe(Instruction::Trunc, A, TruncTy); |
1044 | VPC->insertBefore(InsertPos: &R); |
1045 | Def->replaceAllUsesWith(New: VPC); |
1046 | } |
1047 | } |
1048 | #ifndef NDEBUG |
    // Verify that the cached type info for both A and its users is still
    // accurate by comparing it to freshly computed types.
1051 | VPTypeAnalysis TypeInfo2(Plan->getCanonicalIV()->getScalarType()); |
1052 | assert(TypeInfo.inferScalarType(A) == TypeInfo2.inferScalarType(A)); |
1053 | for (VPUser *U : A->users()) { |
1054 | auto *R = cast<VPRecipeBase>(U); |
1055 | for (VPValue *VPV : R->definedValues()) |
1056 | assert(TypeInfo.inferScalarType(VPV) == TypeInfo2.inferScalarType(VPV)); |
1057 | } |
1058 | #endif |
1059 | } |
1060 | |
1061 | // Simplify (X && Y) || (X && !Y) -> X. |
1062 | // TODO: Split up into simpler, modular combines: (X && Y) || (X && Z) into X |
1063 | // && (Y || Z) and (X || !X) into true. This requires queuing newly created |
1064 | // recipes to be visited during simplification. |
1065 | VPValue *X, *Y; |
1066 | if (match(V: Def, |
1067 | P: m_c_BinaryOr(Op0: m_LogicalAnd(Op0: m_VPValue(V&: X), Op1: m_VPValue(V&: Y)), |
1068 | Op1: m_LogicalAnd(Op0: m_Deferred(V: X), Op1: m_Not(Op0: m_Deferred(V: Y)))))) { |
1069 | Def->replaceAllUsesWith(New: X); |
1070 | Def->eraseFromParent(); |
1071 | return; |
1072 | } |
1073 | |
1074 | // OR x, 1 -> 1. |
1075 | if (match(V: Def, P: m_c_BinaryOr(Op0: m_VPValue(V&: X), Op1: m_AllOnes()))) { |
1076 | Def->replaceAllUsesWith(New: Def->getOperand(N: 0) == X ? Def->getOperand(N: 1) |
1077 | : Def->getOperand(N: 0)); |
1078 | Def->eraseFromParent(); |
1079 | return; |
1080 | } |
1081 | |
1082 | if (match(V: Def, P: m_Select(Op0: m_VPValue(), Op1: m_VPValue(V&: X), Op2: m_Deferred(V: X)))) |
1083 | return Def->replaceAllUsesWith(New: X); |
1084 | |
1085 | if (match(V: Def, P: m_c_Mul(Op0: m_VPValue(V&: A), Op1: m_SpecificInt(V: 1)))) |
1086 | return Def->replaceAllUsesWith(New: A); |
1087 | |
1088 | if (match(V: Def, P: m_Not(Op0: m_VPValue(V&: A)))) { |
1089 | if (match(V: A, P: m_Not(Op0: m_VPValue(V&: A)))) |
1090 | return Def->replaceAllUsesWith(New: A); |
1091 | |
1092 | // Try to fold Not into compares by adjusting the predicate in-place. |
1093 | if (isa<VPWidenRecipe>(Val: A) && A->getNumUsers() == 1) { |
1094 | auto *WideCmp = cast<VPWidenRecipe>(Val: A); |
1095 | if (WideCmp->getOpcode() == Instruction::ICmp || |
1096 | WideCmp->getOpcode() == Instruction::FCmp) { |
1097 | WideCmp->setPredicate( |
1098 | CmpInst::getInversePredicate(pred: WideCmp->getPredicate())); |
1099 | Def->replaceAllUsesWith(New: WideCmp); |
1100 | // If WideCmp doesn't have a debug location, use the one from the |
1101 | // negation, to preserve the location. |
1102 | if (!WideCmp->getDebugLoc() && R.getDebugLoc()) |
1103 | WideCmp->setDebugLoc(R.getDebugLoc()); |
1104 | } |
1105 | } |
1106 | } |
1107 | |
  // Remove redundant DerivedIVs, that is 0 + A * 1 -> A and 0 + 0 * x -> 0.
1109 | if ((match(V: Def, |
1110 | P: m_DerivedIV(Op0: m_SpecificInt(V: 0), Op1: m_VPValue(V&: A), Op2: m_SpecificInt(V: 1))) || |
1111 | match(V: Def, |
1112 | P: m_DerivedIV(Op0: m_SpecificInt(V: 0), Op1: m_SpecificInt(V: 0), Op2: m_VPValue()))) && |
1113 | TypeInfo.inferScalarType(V: Def->getOperand(N: 1)) == |
1114 | TypeInfo.inferScalarType(V: Def)) |
1115 | return Def->replaceAllUsesWith(New: Def->getOperand(N: 1)); |
1116 | |
1117 | if (match(V: Def, P: m_VPInstruction<VPInstruction::WideIVStep>( |
1118 | Op0: m_VPValue(V&: X), Op1: m_SpecificInt(V: 1)))) { |
1119 | Type *WideStepTy = TypeInfo.inferScalarType(V: Def); |
1120 | if (TypeInfo.inferScalarType(V: X) != WideStepTy) |
1121 | X = VPBuilder(Def).createWidenCast(Opcode: Instruction::Trunc, Op: X, ResultTy: WideStepTy); |
1122 | Def->replaceAllUsesWith(New: X); |
1123 | return; |
1124 | } |
1125 | |
1126 | // For i1 vp.merges produced by AnyOf reductions: |
1127 | // vp.merge true, (or x, y), x, evl -> vp.merge y, true, x, evl |
1128 | if (match(V: Def, P: m_Intrinsic<Intrinsic::vp_merge>(Op0: m_True(), Op1: m_VPValue(V&: A), |
1129 | Op2: m_VPValue(V&: X), Op3: m_VPValue())) && |
1130 | match(V: A, P: m_c_BinaryOr(Op0: m_Specific(VPV: X), Op1: m_VPValue(V&: Y))) && |
1131 | TypeInfo.inferScalarType(V: R.getVPSingleValue())->isIntegerTy(Bitwidth: 1)) { |
1132 | Def->setOperand(I: 1, New: Def->getOperand(N: 0)); |
1133 | Def->setOperand(I: 0, New: Y); |
1134 | return; |
1135 | } |
1136 | |
1137 | if (auto *Phi = dyn_cast<VPFirstOrderRecurrencePHIRecipe>(Val: Def)) { |
1138 | if (Phi->getOperand(N: 0) == Phi->getOperand(N: 1)) |
1139 | Def->replaceAllUsesWith(New: Phi->getOperand(N: 0)); |
1140 | return; |
1141 | } |
1142 | |
1143 | // Look through ExtractLastElement (BuildVector ....). |
1144 | if (match(V: &R, P: m_VPInstruction<VPInstruction::ExtractLastElement>( |
1145 | Op0: m_BuildVector()))) { |
1146 | auto *BuildVector = cast<VPInstruction>(Val: R.getOperand(N: 0)); |
1147 | Def->replaceAllUsesWith( |
1148 | New: BuildVector->getOperand(N: BuildVector->getNumOperands() - 1)); |
1149 | return; |
1150 | } |
1151 | |
1152 | // Look through ExtractPenultimateElement (BuildVector ....). |
1153 | if (match(V: &R, P: m_VPInstruction<VPInstruction::ExtractPenultimateElement>( |
1154 | Op0: m_BuildVector()))) { |
1155 | auto *BuildVector = cast<VPInstruction>(Val: R.getOperand(N: 0)); |
1156 | Def->replaceAllUsesWith( |
1157 | New: BuildVector->getOperand(N: BuildVector->getNumOperands() - 2)); |
1158 | return; |
1159 | } |
1160 | |
1161 | // Some simplifications can only be applied after unrolling. Perform them |
1162 | // below. |
1163 | if (!Plan->isUnrolled()) |
1164 | return; |
1165 | |
1166 | // VPScalarIVSteps for part 0 can be replaced by their start value, if only |
1167 | // the first lane is demanded. |
1168 | if (auto *Steps = dyn_cast<VPScalarIVStepsRecipe>(Val: Def)) { |
1169 | if (Steps->isPart0() && vputils::onlyFirstLaneUsed(Def: Steps)) { |
1170 | Steps->replaceAllUsesWith(New: Steps->getOperand(N: 0)); |
1171 | return; |
1172 | } |
1173 | } |
1174 | // Simplify redundant ReductionStartVector recipes after unrolling. |
1175 | VPValue *StartV; |
1176 | if (match(V: Def, P: m_VPInstruction<VPInstruction::ReductionStartVector>( |
1177 | Op0: m_VPValue(V&: StartV), Op1: m_VPValue(), Op2: m_VPValue()))) { |
1178 | Def->replaceUsesWithIf(New: StartV, ShouldReplace: [](const VPUser &U, unsigned Idx) { |
1179 | auto *PhiR = dyn_cast<VPReductionPHIRecipe>(Val: &U); |
1180 | return PhiR && PhiR->isInLoop(); |
1181 | }); |
1182 | return; |
1183 | } |
1184 | |
1185 | if (match(V: Def, |
1186 | P: m_VPInstruction<VPInstruction::ExtractLastElement>( |
1187 | Op0: m_VPInstruction<VPInstruction::Broadcast>(Op0: m_VPValue(V&: A))))) { |
1188 | Def->replaceAllUsesWith(New: A); |
1189 | return; |
1190 | } |
1191 | |
1192 | VPInstruction *OpVPI; |
1193 | if (match(V: Def, P: m_VPInstruction<VPInstruction::ExtractLastElement>( |
1194 | Op0: m_VPInstruction(V&: OpVPI))) && |
1195 | OpVPI->isVectorToScalar()) { |
1196 | Def->replaceAllUsesWith(New: OpVPI); |
1197 | return; |
1198 | } |
1199 | } |
1200 | |
1201 | void VPlanTransforms::simplifyRecipes(VPlan &Plan, Type &CanonicalIVTy) { |
1202 | ReversePostOrderTraversal<VPBlockDeepTraversalWrapper<VPBlockBase *>> RPOT( |
1203 | Plan.getEntry()); |
1204 | VPTypeAnalysis TypeInfo(&CanonicalIVTy); |
1205 | for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(Range: RPOT)) { |
1206 | for (VPRecipeBase &R : make_early_inc_range(Range&: *VPBB)) { |
1207 | simplifyRecipe(R, TypeInfo); |
1208 | } |
1209 | } |
1210 | } |
1211 | |
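/// Convert wide and replicating recipes that are known to produce a single
/// scalar value, and whose results are only used as scalars, into
/// single-scalar VPReplicateRecipes, so the value is computed once instead of
/// per lane.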
1212 | static void narrowToSingleScalarRecipes(VPlan &Plan) { |
1213 | if (Plan.hasScalarVFOnly()) |
1214 | return; |
1215 | |
1216 | // Try to narrow wide and replicating recipes to single scalar recipes, |
1217 | // based on VPlan analysis. Only process blocks in the loop region for now, |
1218 | // without traversing into nested regions, as recipes in replicate regions |
1219 | // cannot be converted yet. |
1220 | for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>( |
1221 | Range: vp_depth_first_shallow(G: Plan.getVectorLoopRegion()->getEntry()))) { |
1222 | for (VPRecipeBase &R : make_early_inc_range(Range: reverse(C&: *VPBB))) { |
1223 | if (!isa<VPWidenRecipe, VPWidenSelectRecipe, VPReplicateRecipe>(Val: &R)) |
1224 | continue; |
1225 | auto *RepR = dyn_cast<VPReplicateRecipe>(Val: &R); |
1226 | if (RepR && (RepR->isSingleScalar() || RepR->isPredicated())) |
1227 | continue; |
1228 | |
1229 | auto *RepOrWidenR = cast<VPSingleDefRecipe>(Val: &R); |
1230 | // Skip recipes that aren't single scalars or don't have only their |
1231 | // scalar results used. In the latter case, we would introduce extra |
1232 | // broadcasts. |
1233 | if (!vputils::isSingleScalar(VPV: RepOrWidenR) || |
1234 | any_of(Range: RepOrWidenR->users(), P: [RepOrWidenR](VPUser *U) { |
1235 | return !U->usesScalars(Op: RepOrWidenR); |
1236 | })) |
1237 | continue; |
1238 | |
1239 | auto *Clone = new VPReplicateRecipe(RepOrWidenR->getUnderlyingInstr(), |
1240 | RepOrWidenR->operands(), |
1241 | true /*IsSingleScalar*/); |
1242 | Clone->insertBefore(InsertPos: RepOrWidenR); |
1243 | RepOrWidenR->replaceAllUsesWith(New: Clone); |
1244 | } |
1245 | } |
1246 | } |
1247 | |
1248 | /// Normalize and simplify VPBlendRecipes. Should be run after simplifyRecipes |
1249 | /// to make sure the masks are simplified. |
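/// For example (illustrative, in pseudo VPlan notation), a non-normalized
///   BLEND %res = %a/%m.a %b/%m.b
/// where %m.a is only used by the blend is normalized to
///   BLEND %res = %a %b/%m.b
/// so that %m.a becomes dead, and a normalized
///   BLEND %res = %a %b/Not(%m)
/// is rewritten as
///   BLEND %res = %b %a/%m.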
1250 | static void simplifyBlends(VPlan &Plan) { |
1251 | using namespace llvm::VPlanPatternMatch; |
1252 | for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>( |
1253 | Range: vp_depth_first_shallow(G: Plan.getVectorLoopRegion()->getEntry()))) { |
1254 | for (VPRecipeBase &R : make_early_inc_range(Range&: *VPBB)) { |
1255 | auto *Blend = dyn_cast<VPBlendRecipe>(Val: &R); |
1256 | if (!Blend) |
1257 | continue; |
1258 | |
1259 | // Try to remove redundant blend recipes. |
1260 | SmallPtrSet<VPValue *, 4> UniqueValues; |
1261 | if (Blend->isNormalized() || !match(V: Blend->getMask(Idx: 0), P: m_False())) |
1262 | UniqueValues.insert(Ptr: Blend->getIncomingValue(Idx: 0)); |
1263 | for (unsigned I = 1; I != Blend->getNumIncomingValues(); ++I) |
1264 | if (!match(V: Blend->getMask(Idx: I), P: m_False())) |
1265 | UniqueValues.insert(Ptr: Blend->getIncomingValue(Idx: I)); |
1266 | |
1267 | if (UniqueValues.size() == 1) { |
1268 | Blend->replaceAllUsesWith(New: *UniqueValues.begin()); |
1269 | Blend->eraseFromParent(); |
1270 | continue; |
1271 | } |
1272 | |
1273 | if (Blend->isNormalized()) |
1274 | continue; |
1275 | |
1276 | // Normalize the blend so its first incoming value is used as the initial |
1277 | // value with the others blended into it. |
1278 | |
1279 | unsigned StartIndex = 0; |
1280 | for (unsigned I = 0; I != Blend->getNumIncomingValues(); ++I) { |
// If a value's mask is used only by the blend then it can be deadcoded.
1282 | // TODO: Find the most expensive mask that can be deadcoded, or a mask |
1283 | // that's used by multiple blends where it can be removed from them all. |
1284 | VPValue *Mask = Blend->getMask(Idx: I); |
1285 | if (Mask->getNumUsers() == 1 && !match(V: Mask, P: m_False())) { |
1286 | StartIndex = I; |
1287 | break; |
1288 | } |
1289 | } |
1290 | |
1291 | SmallVector<VPValue *, 4> OperandsWithMask; |
1292 | OperandsWithMask.push_back(Elt: Blend->getIncomingValue(Idx: StartIndex)); |
1293 | |
1294 | for (unsigned I = 0; I != Blend->getNumIncomingValues(); ++I) { |
1295 | if (I == StartIndex) |
1296 | continue; |
1297 | OperandsWithMask.push_back(Elt: Blend->getIncomingValue(Idx: I)); |
1298 | OperandsWithMask.push_back(Elt: Blend->getMask(Idx: I)); |
1299 | } |
1300 | |
1301 | auto *NewBlend = new VPBlendRecipe( |
1302 | cast<PHINode>(Val: Blend->getUnderlyingValue()), OperandsWithMask); |
1303 | NewBlend->insertBefore(InsertPos: &R); |
1304 | |
1305 | VPValue *DeadMask = Blend->getMask(Idx: StartIndex); |
1306 | Blend->replaceAllUsesWith(New: NewBlend); |
1307 | Blend->eraseFromParent(); |
1308 | recursivelyDeleteDeadRecipes(V: DeadMask); |
1309 | |
1310 | /// Simplify BLEND %a, %b, Not(%mask) -> BLEND %b, %a, %mask. |
1311 | VPValue *NewMask; |
1312 | if (NewBlend->getNumOperands() == 3 && |
1313 | match(V: NewBlend->getMask(Idx: 1), P: m_Not(Op0: m_VPValue(V&: NewMask)))) { |
1314 | VPValue *Inc0 = NewBlend->getOperand(N: 0); |
1315 | VPValue *Inc1 = NewBlend->getOperand(N: 1); |
1316 | VPValue *OldMask = NewBlend->getOperand(N: 2); |
1317 | NewBlend->setOperand(I: 0, New: Inc1); |
1318 | NewBlend->setOperand(I: 1, New: Inc0); |
1319 | NewBlend->setOperand(I: 2, New: NewMask); |
1320 | if (OldMask->getNumUsers() == 0) |
1321 | cast<VPInstruction>(Val: OldMask)->eraseFromParent(); |
1322 | } |
1323 | } |
1324 | } |
1325 | } |
1326 | |
1327 | /// Optimize the width of vector induction variables in \p Plan based on a known |
1328 | /// constant Trip Count, \p BestVF and \p BestUF. |
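/// For example (illustrative), with a known trip count of 100, VF=4 and UF=2,
/// the trip count rounded up to a multiple of VF*UF=8 is 104, so all induction
/// values stay below 104 and fit in 8 bits; the wide canonical IV and the
/// truncated backedge-taken-count it is compared against can then use i8.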
1329 | static bool optimizeVectorInductionWidthForTCAndVFUF(VPlan &Plan, |
1330 | ElementCount BestVF, |
1331 | unsigned BestUF) { |
1332 | // Only proceed if we have not completely removed the vector region. |
1333 | if (!Plan.getVectorLoopRegion()) |
1334 | return false; |
1335 | |
1336 | if (!Plan.getTripCount()->isLiveIn()) |
1337 | return false; |
1338 | auto *TC = dyn_cast_if_present<ConstantInt>( |
1339 | Val: Plan.getTripCount()->getUnderlyingValue()); |
1340 | if (!TC || !BestVF.isFixed()) |
1341 | return false; |
1342 | |
1343 | // Calculate the minimum power-of-2 bit width that can fit the known TC, VF |
1344 | // and UF. Returns at least 8. |
1345 | auto ComputeBitWidth = [](APInt TC, uint64_t Align) { |
1346 | APInt AlignedTC = |
1347 | Align * APIntOps::RoundingUDiv(A: TC, B: APInt(TC.getBitWidth(), Align), |
1348 | RM: APInt::Rounding::UP); |
1349 | APInt MaxVal = AlignedTC - 1; |
1350 | return std::max<unsigned>(a: PowerOf2Ceil(A: MaxVal.getActiveBits()), b: 8); |
1351 | }; |
1352 | unsigned NewBitWidth = |
1353 | ComputeBitWidth(TC->getValue(), BestVF.getKnownMinValue() * BestUF); |
1354 | |
1355 | LLVMContext &Ctx = Plan.getCanonicalIV()->getScalarType()->getContext(); |
1356 | auto *NewIVTy = IntegerType::get(C&: Ctx, NumBits: NewBitWidth); |
1357 | |
1358 | bool MadeChange = false; |
1359 | |
VPBasicBlock *HeaderVPBB = Plan.getVectorLoopRegion()->getEntryBasicBlock();
1361 | for (VPRecipeBase &Phi : HeaderVPBB->phis()) { |
1362 | auto *WideIV = dyn_cast<VPWidenIntOrFpInductionRecipe>(Val: &Phi); |
1363 | |
1364 | // Currently only handle canonical IVs as it is trivial to replace the start |
1365 | // and stop values, and we currently only perform the optimization when the |
1366 | // IV has a single use. |
1367 | if (!WideIV || !WideIV->isCanonical() || |
1368 | WideIV->hasMoreThanOneUniqueUser() || |
1369 | NewIVTy == WideIV->getScalarType()) |
1370 | continue; |
1371 | |
1372 | // Currently only handle cases where the single user is a header-mask |
1373 | // comparison with the backedge-taken-count. |
1374 | using namespace VPlanPatternMatch; |
1375 | if (!match( |
1376 | U: *WideIV->user_begin(), |
1377 | P: m_Binary<Instruction::ICmp>( |
1378 | Op0: m_Specific(VPV: WideIV), |
1379 | Op1: m_Broadcast(Op0: m_Specific(VPV: Plan.getOrCreateBackedgeTakenCount()))))) |
1380 | continue; |
1381 | |
1382 | // Update IV operands and comparison bound to use new narrower type. |
1383 | auto *NewStart = Plan.getOrAddLiveIn(V: ConstantInt::get(Ty: NewIVTy, V: 0)); |
1384 | WideIV->setStartValue(NewStart); |
1385 | auto *NewStep = Plan.getOrAddLiveIn(V: ConstantInt::get(Ty: NewIVTy, V: 1)); |
1386 | WideIV->setStepValue(NewStep); |
1387 | |
1388 | auto *NewBTC = new VPWidenCastRecipe( |
1389 | Instruction::Trunc, Plan.getOrCreateBackedgeTakenCount(), NewIVTy); |
1390 | Plan.getVectorPreheader()->appendRecipe(Recipe: NewBTC); |
1391 | auto *Cmp = cast<VPInstruction>(Val: *WideIV->user_begin()); |
1392 | Cmp->setOperand(I: 1, New: NewBTC); |
1393 | |
1394 | MadeChange = true; |
1395 | } |
1396 | |
1397 | return MadeChange; |
1398 | } |
1399 | |
/// Return true if \p Cond is known to be true for the given \p BestVF and \p
1401 | /// BestUF. |
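/// For example (illustrative), if the trip count is known to be 8 and the plan
/// is restricted to VF=4 and UF=2, the latch compare of the canonical IV's
/// backedge value against the vector trip count is known to be true.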
1402 | static bool isConditionTrueViaVFAndUF(VPValue *Cond, VPlan &Plan, |
1403 | ElementCount BestVF, unsigned BestUF, |
1404 | ScalarEvolution &SE) { |
1405 | using namespace llvm::VPlanPatternMatch; |
1406 | if (match(V: Cond, P: m_Binary<Instruction::Or>(Op0: m_VPValue(), Op1: m_VPValue()))) |
1407 | return any_of(Range: Cond->getDefiningRecipe()->operands(), P: [&Plan, BestVF, BestUF, |
1408 | &SE](VPValue *C) { |
1409 | return isConditionTrueViaVFAndUF(Cond: C, Plan, BestVF, BestUF, SE); |
1410 | }); |
1411 | |
1412 | auto *CanIV = Plan.getCanonicalIV(); |
1413 | if (!match(V: Cond, P: m_Binary<Instruction::ICmp>( |
1414 | Op0: m_Specific(VPV: CanIV->getBackedgeValue()), |
1415 | Op1: m_Specific(VPV: &Plan.getVectorTripCount()))) || |
1416 | cast<VPRecipeWithIRFlags>(Val: Cond->getDefiningRecipe())->getPredicate() != |
1417 | CmpInst::ICMP_EQ) |
1418 | return false; |
1419 | |
1420 | // The compare checks CanIV + VFxUF == vector trip count. The vector trip |
1421 | // count is not conveniently available as SCEV so far, so we compare directly |
1422 | // against the original trip count. This is stricter than necessary, as we |
1423 | // will only return true if the trip count == vector trip count. |
1424 | // TODO: Use SCEV for vector trip count once available, to cover cases where |
1425 | // vector trip count == UF * VF, but original trip count != UF * VF. |
1426 | const SCEV *TripCount = |
1427 | vputils::getSCEVExprForVPValue(V: Plan.getTripCount(), SE); |
1428 | assert(!isa<SCEVCouldNotCompute>(TripCount) && |
1429 | "Trip count SCEV must be computable" ); |
1430 | ElementCount NumElements = BestVF.multiplyCoefficientBy(RHS: BestUF); |
1431 | const SCEV *C = SE.getElementCount(Ty: TripCount->getType(), EC: NumElements); |
1432 | return SE.isKnownPredicate(Pred: CmpInst::ICMP_EQ, LHS: TripCount, RHS: C); |
1433 | } |
1434 | |
1435 | /// Try to simplify the branch condition of \p Plan. This may restrict the |
1436 | /// resulting plan to \p BestVF and \p BestUF. |
1437 | static bool simplifyBranchConditionForVFAndUF(VPlan &Plan, ElementCount BestVF, |
1438 | unsigned BestUF, |
1439 | PredicatedScalarEvolution &PSE) { |
1440 | VPRegionBlock *VectorRegion = Plan.getVectorLoopRegion(); |
1441 | VPBasicBlock *ExitingVPBB = VectorRegion->getExitingBasicBlock(); |
1442 | auto *Term = &ExitingVPBB->back(); |
1443 | VPValue *Cond; |
1444 | ScalarEvolution &SE = *PSE.getSE(); |
1445 | using namespace llvm::VPlanPatternMatch; |
1446 | if (match(V: Term, P: m_BranchOnCount(Op0: m_VPValue(), Op1: m_VPValue())) || |
1447 | match(V: Term, P: m_BranchOnCond( |
1448 | Op0: m_Not(Op0: m_ActiveLaneMask(Op0: m_VPValue(), Op1: m_VPValue()))))) { |
1449 | // Try to simplify the branch condition if TC <= VF * UF when the latch |
1450 | // terminator is BranchOnCount or BranchOnCond where the input is |
1451 | // Not(ActiveLaneMask). |
1452 | const SCEV *TripCount = |
1453 | vputils::getSCEVExprForVPValue(V: Plan.getTripCount(), SE); |
1454 | assert(!isa<SCEVCouldNotCompute>(TripCount) && |
1455 | "Trip count SCEV must be computable" ); |
1456 | ElementCount NumElements = BestVF.multiplyCoefficientBy(RHS: BestUF); |
1457 | const SCEV *C = SE.getElementCount(Ty: TripCount->getType(), EC: NumElements); |
1458 | if (TripCount->isZero() || |
1459 | !SE.isKnownPredicate(Pred: CmpInst::ICMP_ULE, LHS: TripCount, RHS: C)) |
1460 | return false; |
1461 | } else if (match(V: Term, P: m_BranchOnCond(Op0: m_VPValue(V&: Cond)))) { |
1462 | // For BranchOnCond, check if we can prove the condition to be true using VF |
1463 | // and UF. |
1464 | if (!isConditionTrueViaVFAndUF(Cond, Plan, BestVF, BestUF, SE)) |
1465 | return false; |
1466 | } else { |
1467 | return false; |
1468 | } |
1469 | |
1470 | // The vector loop region only executes once. If possible, completely remove |
1471 | // the region, otherwise replace the terminator controlling the latch with |
1472 | // (BranchOnCond true). |
auto *Header = cast<VPBasicBlock>(Val: VectorRegion->getEntry());
1474 | auto *CanIVTy = Plan.getCanonicalIV()->getScalarType(); |
1475 | if (all_of( |
1476 | Range: Header->phis(), |
1477 | P: IsaPred<VPCanonicalIVPHIRecipe, VPFirstOrderRecurrencePHIRecipe>)) { |
for (VPRecipeBase &HeaderR : make_early_inc_range(Range: Header->phis())) {
auto *HeaderPhiR = cast<VPHeaderPHIRecipe>(Val: &HeaderR);
1480 | HeaderPhiR->replaceAllUsesWith(New: HeaderPhiR->getStartValue()); |
1481 | HeaderPhiR->eraseFromParent(); |
1482 | } |
1483 | |
VPBlockBase *Preheader = VectorRegion->getSinglePredecessor();
1485 | VPBlockBase *Exit = VectorRegion->getSingleSuccessor(); |
1486 | VPBlockUtils::disconnectBlocks(From: Preheader, To: VectorRegion); |
1487 | VPBlockUtils::disconnectBlocks(From: VectorRegion, To: Exit); |
1488 | |
1489 | for (VPBlockBase *B : vp_depth_first_shallow(G: VectorRegion->getEntry())) |
1490 | B->setParent(nullptr); |
1491 | |
1492 | VPBlockUtils::connectBlocks(From: Preheader, To: Header); |
1493 | VPBlockUtils::connectBlocks(From: ExitingVPBB, To: Exit); |
1494 | VPlanTransforms::simplifyRecipes(Plan, CanonicalIVTy&: *CanIVTy); |
1495 | } else { |
1496 | // The vector region contains header phis for which we cannot remove the |
1497 | // loop region yet. |
1498 | LLVMContext &Ctx = SE.getContext(); |
1499 | auto *BOC = new VPInstruction( |
1500 | VPInstruction::BranchOnCond, |
1501 | {Plan.getOrAddLiveIn(V: ConstantInt::getTrue(Context&: Ctx))}, Term->getDebugLoc()); |
1502 | ExitingVPBB->appendRecipe(Recipe: BOC); |
1503 | } |
1504 | |
1505 | Term->eraseFromParent(); |
1506 | |
1507 | return true; |
1508 | } |
1509 | |
1510 | void VPlanTransforms::optimizeForVFAndUF(VPlan &Plan, ElementCount BestVF, |
1511 | unsigned BestUF, |
1512 | PredicatedScalarEvolution &PSE) { |
1513 | assert(Plan.hasVF(BestVF) && "BestVF is not available in Plan" ); |
1514 | assert(Plan.hasUF(BestUF) && "BestUF is not available in Plan" ); |
1515 | |
1516 | bool MadeChange = |
1517 | simplifyBranchConditionForVFAndUF(Plan, BestVF, BestUF, PSE); |
1518 | MadeChange |= optimizeVectorInductionWidthForTCAndVFUF(Plan, BestVF, BestUF); |
1519 | |
1520 | if (MadeChange) { |
1521 | Plan.setVF(BestVF); |
1522 | assert(Plan.getUF() == BestUF && "BestUF must match the Plan's UF" ); |
1523 | } |
1524 | // TODO: Further simplifications are possible |
1525 | // 1. Replace inductions with constants. |
1526 | // 2. Replace vector loop region with VPBasicBlock. |
1527 | } |
1528 | |
1529 | /// Sink users of \p FOR after the recipe defining the previous value \p |
1530 | /// Previous of the recurrence. \returns true if all users of \p FOR could be |
1531 | /// re-arranged as needed or false if it is not possible. |
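/// For example (illustrative), given
///   %for  = first-order-recurrence-phi [ %init ], [ %prev ]
///   %use  = add %for, 1
///   %prev = load %ptr
/// %use is sunk after %prev, so that the splice of %for and %prev created
/// later dominates all users of %for.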
1532 | static bool |
1533 | sinkRecurrenceUsersAfterPrevious(VPFirstOrderRecurrencePHIRecipe *FOR, |
1534 | VPRecipeBase *Previous, |
1535 | VPDominatorTree &VPDT) { |
1536 | // Collect recipes that need sinking. |
1537 | SmallVector<VPRecipeBase *> WorkList; |
1538 | SmallPtrSet<VPRecipeBase *, 8> Seen; |
1539 | Seen.insert(Ptr: Previous); |
1540 | auto TryToPushSinkCandidate = [&](VPRecipeBase *SinkCandidate) { |
// The previous value must not depend on the users of the recurrence phi; if
// it did, FOR would not be a fixed-order recurrence.
1543 | if (SinkCandidate == Previous) |
1544 | return false; |
1545 | |
1546 | if (isa<VPHeaderPHIRecipe>(Val: SinkCandidate) || |
1547 | !Seen.insert(Ptr: SinkCandidate).second || |
1548 | VPDT.properlyDominates(A: Previous, B: SinkCandidate)) |
1549 | return true; |
1550 | |
1551 | if (SinkCandidate->mayHaveSideEffects()) |
1552 | return false; |
1553 | |
1554 | WorkList.push_back(Elt: SinkCandidate); |
1555 | return true; |
1556 | }; |
1557 | |
1558 | // Recursively sink users of FOR after Previous. |
1559 | WorkList.push_back(Elt: FOR); |
1560 | for (unsigned I = 0; I != WorkList.size(); ++I) { |
1561 | VPRecipeBase *Current = WorkList[I]; |
1562 | assert(Current->getNumDefinedValues() == 1 && |
1563 | "only recipes with a single defined value expected" ); |
1564 | |
1565 | for (VPUser *User : Current->getVPSingleValue()->users()) { |
1566 | if (!TryToPushSinkCandidate(cast<VPRecipeBase>(Val: User))) |
1567 | return false; |
1568 | } |
1569 | } |
1570 | |
1571 | // Keep recipes to sink ordered by dominance so earlier instructions are |
1572 | // processed first. |
1573 | sort(C&: WorkList, Comp: [&VPDT](const VPRecipeBase *A, const VPRecipeBase *B) { |
1574 | return VPDT.properlyDominates(A, B); |
1575 | }); |
1576 | |
1577 | for (VPRecipeBase *SinkCandidate : WorkList) { |
1578 | if (SinkCandidate == FOR) |
1579 | continue; |
1580 | |
1581 | SinkCandidate->moveAfter(MovePos: Previous); |
1582 | Previous = SinkCandidate; |
1583 | } |
1584 | return true; |
1585 | } |
1586 | |
1587 | /// Try to hoist \p Previous and its operands before all users of \p FOR. |
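/// For example (illustrative), if a user of the recurrence phi \p FOR cannot
/// be sunk past \p Previous (e.g. because it has side effects), \p Previous
/// and the recipes defining its operands are instead moved before the earliest
/// user of \p FOR, provided \p Previous neither reads memory nor has side
/// effects and the other hoisted recipes have no side effects.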
1588 | static bool hoistPreviousBeforeFORUsers(VPFirstOrderRecurrencePHIRecipe *FOR, |
1589 | VPRecipeBase *Previous, |
1590 | VPDominatorTree &VPDT) { |
1591 | if (Previous->mayHaveSideEffects() || Previous->mayReadFromMemory()) |
1592 | return false; |
1593 | |
1594 | // Collect recipes that need hoisting. |
1595 | SmallVector<VPRecipeBase *> HoistCandidates; |
1596 | SmallPtrSet<VPRecipeBase *, 8> Visited; |
1597 | VPRecipeBase *HoistPoint = nullptr; |
1598 | // Find the closest hoist point by looking at all users of FOR and selecting |
1599 | // the recipe dominating all other users. |
1600 | for (VPUser *U : FOR->users()) { |
1601 | auto *R = cast<VPRecipeBase>(Val: U); |
1602 | if (!HoistPoint || VPDT.properlyDominates(A: R, B: HoistPoint)) |
1603 | HoistPoint = R; |
1604 | } |
1605 | assert(all_of(FOR->users(), |
1606 | [&VPDT, HoistPoint](VPUser *U) { |
1607 | auto *R = cast<VPRecipeBase>(U); |
1608 | return HoistPoint == R || |
1609 | VPDT.properlyDominates(HoistPoint, R); |
1610 | }) && |
1611 | "HoistPoint must dominate all users of FOR" ); |
1612 | |
1613 | auto NeedsHoisting = [HoistPoint, &VPDT, |
1614 | &Visited](VPValue *HoistCandidateV) -> VPRecipeBase * { |
1615 | VPRecipeBase *HoistCandidate = HoistCandidateV->getDefiningRecipe(); |
1616 | if (!HoistCandidate) |
1617 | return nullptr; |
1618 | VPRegionBlock *EnclosingLoopRegion = |
1619 | HoistCandidate->getParent()->getEnclosingLoopRegion(); |
1620 | assert((!HoistCandidate->getParent()->getParent() || |
1621 | HoistCandidate->getParent()->getParent() == EnclosingLoopRegion) && |
1622 | "CFG in VPlan should still be flat, without replicate regions" ); |
1623 | // Hoist candidate was already visited, no need to hoist. |
1624 | if (!Visited.insert(Ptr: HoistCandidate).second) |
1625 | return nullptr; |
1626 | |
// The candidate is defined outside the loop region or is a header phi; it
// already dominates the users of FOR without hoisting.
1629 | if (!EnclosingLoopRegion || isa<VPHeaderPHIRecipe>(Val: HoistCandidate)) |
1630 | return nullptr; |
1631 | |
1632 | // If we reached a recipe that dominates HoistPoint, we don't need to |
1633 | // hoist the recipe. |
1634 | if (VPDT.properlyDominates(A: HoistCandidate, B: HoistPoint)) |
1635 | return nullptr; |
1636 | return HoistCandidate; |
1637 | }; |
1638 | auto CanHoist = [&](VPRecipeBase *HoistCandidate) { |
1639 | // Avoid hoisting candidates with side-effects, as we do not yet analyze |
1640 | // associated dependencies. |
1641 | return !HoistCandidate->mayHaveSideEffects(); |
1642 | }; |
1643 | |
1644 | if (!NeedsHoisting(Previous->getVPSingleValue())) |
1645 | return true; |
1646 | |
1647 | // Recursively try to hoist Previous and its operands before all users of FOR. |
1648 | HoistCandidates.push_back(Elt: Previous); |
1649 | |
1650 | for (unsigned I = 0; I != HoistCandidates.size(); ++I) { |
1651 | VPRecipeBase *Current = HoistCandidates[I]; |
1652 | assert(Current->getNumDefinedValues() == 1 && |
1653 | "only recipes with a single defined value expected" ); |
1654 | if (!CanHoist(Current)) |
1655 | return false; |
1656 | |
1657 | for (VPValue *Op : Current->operands()) { |
1658 | // If we reach FOR, it means the original Previous depends on some other |
1659 | // recurrence that in turn depends on FOR. If that is the case, we would |
1660 | // also need to hoist recipes involving the other FOR, which may break |
1661 | // dependencies. |
1662 | if (Op == FOR) |
1663 | return false; |
1664 | |
1665 | if (auto *R = NeedsHoisting(Op)) |
1666 | HoistCandidates.push_back(Elt: R); |
1667 | } |
1668 | } |
1669 | |
1670 | // Order recipes to hoist by dominance so earlier instructions are processed |
1671 | // first. |
1672 | sort(C&: HoistCandidates, Comp: [&VPDT](const VPRecipeBase *A, const VPRecipeBase *B) { |
1673 | return VPDT.properlyDominates(A, B); |
1674 | }); |
1675 | |
1676 | for (VPRecipeBase *HoistCandidate : HoistCandidates) { |
1677 | HoistCandidate->moveBefore(BB&: *HoistPoint->getParent(), |
1678 | I: HoistPoint->getIterator()); |
1679 | } |
1680 | |
1681 | return true; |
1682 | } |
1683 | |
1684 | bool VPlanTransforms::adjustFixedOrderRecurrences(VPlan &Plan, |
1685 | VPBuilder &LoopBuilder) { |
1686 | VPDominatorTree VPDT; |
1687 | VPDT.recalculate(Func&: Plan); |
1688 | |
1689 | SmallVector<VPFirstOrderRecurrencePHIRecipe *> RecurrencePhis; |
1690 | for (VPRecipeBase &R : |
1691 | Plan.getVectorLoopRegion()->getEntry()->getEntryBasicBlock()->phis()) |
1692 | if (auto *FOR = dyn_cast<VPFirstOrderRecurrencePHIRecipe>(Val: &R)) |
1693 | RecurrencePhis.push_back(Elt: FOR); |
1694 | |
1695 | for (VPFirstOrderRecurrencePHIRecipe *FOR : RecurrencePhis) { |
1696 | SmallPtrSet<VPFirstOrderRecurrencePHIRecipe *, 4> SeenPhis; |
1697 | VPRecipeBase *Previous = FOR->getBackedgeValue()->getDefiningRecipe(); |
1698 | // Fixed-order recurrences do not contain cycles, so this loop is guaranteed |
1699 | // to terminate. |
1700 | while (auto *PrevPhi = |
1701 | dyn_cast_or_null<VPFirstOrderRecurrencePHIRecipe>(Val: Previous)) { |
1702 | assert(PrevPhi->getParent() == FOR->getParent()); |
1703 | assert(SeenPhis.insert(PrevPhi).second); |
1704 | Previous = PrevPhi->getBackedgeValue()->getDefiningRecipe(); |
1705 | } |
1706 | |
1707 | if (!sinkRecurrenceUsersAfterPrevious(FOR, Previous, VPDT) && |
1708 | !hoistPreviousBeforeFORUsers(FOR, Previous, VPDT)) |
1709 | return false; |
1710 | |
1711 | // Introduce a recipe to combine the incoming and previous values of a |
1712 | // fixed-order recurrence. |
1713 | VPBasicBlock *InsertBlock = Previous->getParent(); |
1714 | if (isa<VPHeaderPHIRecipe>(Val: Previous)) |
1715 | LoopBuilder.setInsertPoint(TheBB: InsertBlock, IP: InsertBlock->getFirstNonPhi()); |
1716 | else |
1717 | LoopBuilder.setInsertPoint(TheBB: InsertBlock, |
1718 | IP: std::next(x: Previous->getIterator())); |
1719 | |
1720 | auto *RecurSplice = |
1721 | LoopBuilder.createNaryOp(Opcode: VPInstruction::FirstOrderRecurrenceSplice, |
1722 | Operands: {FOR, FOR->getBackedgeValue()}); |
1723 | |
1724 | FOR->replaceAllUsesWith(New: RecurSplice); |
1725 | // Set the first operand of RecurSplice to FOR again, after replacing |
1726 | // all users. |
1727 | RecurSplice->setOperand(I: 0, New: FOR); |
1728 | } |
1729 | return true; |
1730 | } |
1731 | |
1732 | void VPlanTransforms::clearReductionWrapFlags(VPlan &Plan) { |
1733 | for (VPRecipeBase &R : |
1734 | Plan.getVectorLoopRegion()->getEntryBasicBlock()->phis()) { |
1735 | auto *PhiR = dyn_cast<VPReductionPHIRecipe>(Val: &R); |
1736 | if (!PhiR) |
1737 | continue; |
1738 | const RecurrenceDescriptor &RdxDesc = PhiR->getRecurrenceDescriptor(); |
1739 | RecurKind RK = RdxDesc.getRecurrenceKind(); |
1740 | if (RK != RecurKind::Add && RK != RecurKind::Mul) |
1741 | continue; |
1742 | |
1743 | for (VPUser *U : collectUsersRecursively(V: PhiR)) |
1744 | if (auto *RecWithFlags = dyn_cast<VPRecipeWithIRFlags>(Val: U)) { |
1745 | RecWithFlags->dropPoisonGeneratingFlags(); |
1746 | } |
1747 | } |
1748 | } |
1749 | |
1750 | /// Move loop-invariant recipes out of the vector loop region in \p Plan. |
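/// For example (illustrative), a recipe computing an address purely from
/// values defined outside the loop region is moved to the vector preheader,
/// provided it is not an alloca, is not a phi and neither reads nor writes
/// memory.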
1751 | static void licm(VPlan &Plan) { |
VPBasicBlock *Preheader = Plan.getVectorPreheader();
1753 | |
1754 | // Return true if we do not know how to (mechanically) hoist a given recipe |
1755 | // out of a loop region. Does not address legality concerns such as aliasing |
1756 | // or speculation safety. |
1757 | auto CannotHoistRecipe = [](VPRecipeBase &R) { |
1758 | // Allocas cannot be hoisted. |
1759 | auto *RepR = dyn_cast<VPReplicateRecipe>(Val: &R); |
1760 | return RepR && RepR->getOpcode() == Instruction::Alloca; |
1761 | }; |
1762 | |
1763 | // Hoist any loop invariant recipes from the vector loop region to the |
// preheader. Perform a shallow traversal of the vector loop region, to
1765 | // exclude recipes in replicate regions. |
1766 | VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion(); |
1767 | for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>( |
1768 | Range: vp_depth_first_shallow(G: LoopRegion->getEntry()))) { |
1769 | for (VPRecipeBase &R : make_early_inc_range(Range&: *VPBB)) { |
1770 | if (CannotHoistRecipe(R)) |
1771 | continue; |
1772 | // TODO: Relax checks in the future, e.g. we could also hoist reads, if |
1773 | // their memory location is not modified in the vector loop. |
1774 | if (R.mayHaveSideEffects() || R.mayReadFromMemory() || R.isPhi() || |
1775 | any_of(Range: R.operands(), P: [](VPValue *Op) { |
1776 | return !Op->isDefinedOutsideLoopRegions(); |
1777 | })) |
1778 | continue; |
1779 | R.moveBefore(BB&: *Preheader, I: Preheader->end()); |
1780 | } |
1781 | } |
1782 | } |
1783 | |
1784 | void VPlanTransforms::truncateToMinimalBitwidths( |
1785 | VPlan &Plan, const MapVector<Instruction *, uint64_t> &MinBWs) { |
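// Shrink operations to the bit widths recorded in MinBWs. For example
// (illustrative), if MinBWs records that a widened i32 add only needs 8 bits,
// its operands are truncated to i8, the add executes on i8 and its result is
// zero-extended back to i32 for the original users.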
1786 | // Keep track of created truncates, so they can be re-used. Note that we |
// cannot use RAUW after creating a new truncate, as this could make
1788 | // other uses have different types for their operands, making them invalidly |
1789 | // typed. |
1790 | DenseMap<VPValue *, VPWidenCastRecipe *> ProcessedTruncs; |
1791 | Type *CanonicalIVType = Plan.getCanonicalIV()->getScalarType(); |
1792 | VPTypeAnalysis TypeInfo(CanonicalIVType); |
1793 | VPBasicBlock *PH = Plan.getVectorPreheader(); |
1794 | for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>( |
1795 | Range: vp_depth_first_deep(G: Plan.getVectorLoopRegion()))) { |
1796 | for (VPRecipeBase &R : make_early_inc_range(Range&: *VPBB)) { |
1797 | if (!isa<VPWidenRecipe, VPWidenCastRecipe, VPReplicateRecipe, |
1798 | VPWidenSelectRecipe, VPWidenLoadRecipe, VPWidenIntrinsicRecipe>( |
1799 | Val: &R)) |
1800 | continue; |
1801 | |
1802 | VPValue *ResultVPV = R.getVPSingleValue(); |
1803 | auto *UI = cast_or_null<Instruction>(Val: ResultVPV->getUnderlyingValue()); |
1804 | unsigned NewResSizeInBits = MinBWs.lookup(Key: UI); |
1805 | if (!NewResSizeInBits) |
1806 | continue; |
1807 | |
1808 | // If the value wasn't vectorized, we must maintain the original scalar |
// type. Skip those here. Also skip casts which do not need to be handled
// explicitly here, as redundant casts will be removed during recipe
// simplification.
1812 | if (isa<VPReplicateRecipe, VPWidenCastRecipe>(Val: &R)) |
1813 | continue; |
1814 | |
1815 | Type *OldResTy = TypeInfo.inferScalarType(V: ResultVPV); |
1816 | unsigned OldResSizeInBits = OldResTy->getScalarSizeInBits(); |
1817 | assert(OldResTy->isIntegerTy() && "only integer types supported" ); |
1818 | (void)OldResSizeInBits; |
1819 | |
1820 | LLVMContext &Ctx = CanonicalIVType->getContext(); |
1821 | auto *NewResTy = IntegerType::get(C&: Ctx, NumBits: NewResSizeInBits); |
1822 | |
1823 | // Any wrapping introduced by shrinking this operation shouldn't be |
1824 | // considered undefined behavior. So, we can't unconditionally copy |
1825 | // arithmetic wrapping flags to VPW. |
1826 | if (auto *VPW = dyn_cast<VPRecipeWithIRFlags>(Val: &R)) |
1827 | VPW->dropPoisonGeneratingFlags(); |
1828 | |
1829 | using namespace llvm::VPlanPatternMatch; |
1830 | if (OldResSizeInBits != NewResSizeInBits && |
1831 | !match(V: &R, P: m_Binary<Instruction::ICmp>(Op0: m_VPValue(), Op1: m_VPValue()))) { |
1832 | // Extend result to original width. |
1833 | auto *Ext = |
1834 | new VPWidenCastRecipe(Instruction::ZExt, ResultVPV, OldResTy); |
1835 | Ext->insertAfter(InsertPos: &R); |
1836 | ResultVPV->replaceAllUsesWith(New: Ext); |
1837 | Ext->setOperand(I: 0, New: ResultVPV); |
1838 | assert(OldResSizeInBits > NewResSizeInBits && "Nothing to shrink?" ); |
1839 | } else { |
1840 | assert( |
1841 | match(&R, m_Binary<Instruction::ICmp>(m_VPValue(), m_VPValue())) && |
1842 | "Only ICmps should not need extending the result." ); |
1843 | } |
1844 | |
1845 | assert(!isa<VPWidenStoreRecipe>(&R) && "stores cannot be narrowed" ); |
1846 | if (isa<VPWidenLoadRecipe, VPWidenIntrinsicRecipe>(Val: &R)) |
1847 | continue; |
1848 | |
1849 | // Shrink operands by introducing truncates as needed. |
1850 | unsigned StartIdx = isa<VPWidenSelectRecipe>(Val: &R) ? 1 : 0; |
1851 | for (unsigned Idx = StartIdx; Idx != R.getNumOperands(); ++Idx) { |
1852 | auto *Op = R.getOperand(N: Idx); |
1853 | unsigned OpSizeInBits = |
1854 | TypeInfo.inferScalarType(V: Op)->getScalarSizeInBits(); |
1855 | if (OpSizeInBits == NewResSizeInBits) |
1856 | continue; |
1857 | assert(OpSizeInBits > NewResSizeInBits && "nothing to truncate" ); |
1858 | auto [ProcessedIter, IterIsEmpty] = ProcessedTruncs.try_emplace(Key: Op); |
1859 | VPWidenCastRecipe *NewOp = |
1860 | IterIsEmpty |
1861 | ? new VPWidenCastRecipe(Instruction::Trunc, Op, NewResTy) |
1862 | : ProcessedIter->second; |
1863 | R.setOperand(I: Idx, New: NewOp); |
1864 | if (!IterIsEmpty) |
1865 | continue; |
1866 | ProcessedIter->second = NewOp; |
1867 | if (!Op->isLiveIn()) { |
1868 | NewOp->insertBefore(InsertPos: &R); |
1869 | } else { |
1870 | PH->appendRecipe(Recipe: NewOp); |
1871 | } |
1872 | } |
1873 | |
1874 | } |
1875 | } |
1876 | } |
1877 | |
1878 | /// Remove BranchOnCond recipes with true or false conditions together with |
1879 | /// removing dead edges to their successors. |
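/// For example (illustrative), a block ending in
///   BranchOnCond true
/// keeps the edge to its first successor; the block's incoming values are
/// removed from phis in the second successor, the edge to it is disconnected
/// and the branch itself is erased.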
1880 | static void removeBranchOnConst(VPlan &Plan) { |
1881 | using namespace llvm::VPlanPatternMatch; |
1882 | for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>( |
1883 | Range: vp_depth_first_shallow(G: Plan.getEntry()))) { |
1884 | VPValue *Cond; |
1885 | if (VPBB->getNumSuccessors() != 2 || VPBB == Plan.getEntry() || |
1886 | !match(V: &VPBB->back(), P: m_BranchOnCond(Op0: m_VPValue(V&: Cond)))) |
1887 | continue; |
1888 | |
1889 | unsigned RemovedIdx; |
1890 | if (match(V: Cond, P: m_True())) |
1891 | RemovedIdx = 1; |
1892 | else if (match(V: Cond, P: m_False())) |
1893 | RemovedIdx = 0; |
1894 | else |
1895 | continue; |
1896 | |
1897 | VPBasicBlock *RemovedSucc = |
1898 | cast<VPBasicBlock>(Val: VPBB->getSuccessors()[RemovedIdx]); |
1899 | assert(count(RemovedSucc->getPredecessors(), VPBB) == 1 && |
1900 | "There must be a single edge between VPBB and its successor" ); |
// Values coming from VPBB into phi recipes of RemovedSucc are removed from
1902 | // these recipes. |
1903 | for (VPRecipeBase &R : RemovedSucc->phis()) { |
1904 | auto *Phi = cast<VPPhiAccessors>(Val: &R); |
1905 | assert((!isa<VPIRPhi>(&R) || RemovedSucc->getNumPredecessors() == 1) && |
1906 | "VPIRPhis must have a single predecessor" ); |
1907 | Phi->removeIncomingValueFor(IncomingBlock: VPBB); |
1908 | } |
1909 | // Disconnect blocks and remove the terminator. RemovedSucc will be deleted |
1910 | // automatically on VPlan destruction if it becomes unreachable. |
1911 | VPBlockUtils::disconnectBlocks(From: VPBB, To: RemovedSucc); |
1912 | VPBB->back().eraseFromParent(); |
1913 | } |
1914 | } |
1915 | |
1916 | void VPlanTransforms::optimize(VPlan &Plan) { |
1917 | runPass(Fn: removeRedundantCanonicalIVs, Plan); |
1918 | runPass(Fn: removeRedundantInductionCasts, Plan); |
1919 | |
1920 | runPass(Fn: simplifyRecipes, Plan, Args&: *Plan.getCanonicalIV()->getScalarType()); |
1921 | runPass(Fn: simplifyBlends, Plan); |
1922 | runPass(Fn: removeDeadRecipes, Plan); |
1923 | runPass(Fn: narrowToSingleScalarRecipes, Plan); |
1924 | runPass(Fn: legalizeAndOptimizeInductions, Plan); |
1925 | runPass(Fn: removeRedundantExpandSCEVRecipes, Plan); |
1926 | runPass(Fn: simplifyRecipes, Plan, Args&: *Plan.getCanonicalIV()->getScalarType()); |
1927 | runPass(Fn: removeBranchOnConst, Plan); |
1928 | runPass(Fn: removeDeadRecipes, Plan); |
1929 | |
1930 | runPass(Fn: createAndOptimizeReplicateRegions, Plan); |
1931 | runPass(Transform: mergeBlocksIntoPredecessors, Plan); |
1932 | runPass(Fn: licm, Plan); |
1933 | } |
1934 | |
1935 | // Add a VPActiveLaneMaskPHIRecipe and related recipes to \p Plan and replace |
1936 | // the loop terminator with a branch-on-cond recipe with the negated |
1937 | // active-lane-mask as operand. Note that this turns the loop into an |
1938 | // uncountable one. Only the existing terminator is replaced, all other existing |
1939 | // recipes/users remain unchanged, except for poison-generating flags being |
1940 | // dropped from the canonical IV increment. Return the created |
1941 | // VPActiveLaneMaskPHIRecipe. |
1942 | // |
1943 | // The function uses the following definitions: |
1944 | // |
1945 | // %TripCount = DataWithControlFlowWithoutRuntimeCheck ? |
1946 | // calculate-trip-count-minus-VF (original TC) : original TC |
1947 | // %IncrementValue = DataWithControlFlowWithoutRuntimeCheck ? |
1948 | // CanonicalIVPhi : CanonicalIVIncrement |
1949 | // %StartV is the canonical induction start value. |
1950 | // |
1951 | // The function adds the following recipes: |
1952 | // |
1953 | // vector.ph: |
1954 | // %TripCount = calculate-trip-count-minus-VF (original TC) |
1955 | // [if DataWithControlFlowWithoutRuntimeCheck] |
1956 | // %EntryInc = canonical-iv-increment-for-part %StartV |
1957 | // %EntryALM = active-lane-mask %EntryInc, %TripCount |
1958 | // |
1959 | // vector.body: |
1960 | // ... |
1961 | // %P = active-lane-mask-phi [ %EntryALM, %vector.ph ], [ %ALM, %vector.body ] |
1962 | // ... |
1963 | // %InLoopInc = canonical-iv-increment-for-part %IncrementValue |
1964 | // %ALM = active-lane-mask %InLoopInc, TripCount |
1965 | // %Negated = Not %ALM |
1966 | // branch-on-cond %Negated |
1967 | // |
1968 | static VPActiveLaneMaskPHIRecipe *addVPLaneMaskPhiAndUpdateExitBranch( |
1969 | VPlan &Plan, bool DataAndControlFlowWithoutRuntimeCheck) { |
1970 | VPRegionBlock *TopRegion = Plan.getVectorLoopRegion(); |
1971 | VPBasicBlock *EB = TopRegion->getExitingBasicBlock(); |
1972 | auto *CanonicalIVPHI = Plan.getCanonicalIV(); |
1973 | VPValue *StartV = CanonicalIVPHI->getStartValue(); |
1974 | |
1975 | auto *CanonicalIVIncrement = |
1976 | cast<VPInstruction>(Val: CanonicalIVPHI->getBackedgeValue()); |
1977 | // TODO: Check if dropping the flags is needed if |
1978 | // !DataAndControlFlowWithoutRuntimeCheck. |
1979 | CanonicalIVIncrement->dropPoisonGeneratingFlags(); |
1980 | DebugLoc DL = CanonicalIVIncrement->getDebugLoc(); |
1981 | // We can't use StartV directly in the ActiveLaneMask VPInstruction, since |
1982 | // we have to take unrolling into account. Each part needs to start at |
1983 | // Part * VF |
auto *VecPreheader = Plan.getVectorPreheader();
1985 | VPBuilder Builder(VecPreheader); |
1986 | |
1987 | // Create the ActiveLaneMask instruction using the correct start values. |
1988 | VPValue *TC = Plan.getTripCount(); |
1989 | |
1990 | VPValue *TripCount, *IncrementValue; |
1991 | if (!DataAndControlFlowWithoutRuntimeCheck) { |
1992 | // When the loop is guarded by a runtime overflow check for the loop |
1993 | // induction variable increment by VF, we can increment the value before |
1994 | // the get.active.lane mask and use the unmodified tripcount. |
1995 | IncrementValue = CanonicalIVIncrement; |
1996 | TripCount = TC; |
1997 | } else { |
1998 | // When avoiding a runtime check, the active.lane.mask inside the loop |
1999 | // uses a modified trip count and the induction variable increment is |
2000 | // done after the active.lane.mask intrinsic is called. |
2001 | IncrementValue = CanonicalIVPHI; |
2002 | TripCount = Builder.createNaryOp(Opcode: VPInstruction::CalculateTripCountMinusVF, |
2003 | Operands: {TC}, DL); |
2004 | } |
2005 | auto *EntryIncrement = Builder.createOverflowingOp( |
2006 | Opcode: VPInstruction::CanonicalIVIncrementForPart, Operands: {StartV}, WrapFlags: {false, false}, DL, |
2007 | Name: "index.part.next" ); |
2008 | |
2009 | // Create the active lane mask instruction in the VPlan preheader. |
2010 | auto *EntryALM = |
2011 | Builder.createNaryOp(Opcode: VPInstruction::ActiveLaneMask, Operands: {EntryIncrement, TC}, |
2012 | DL, Name: "active.lane.mask.entry" ); |
2013 | |
2014 | // Now create the ActiveLaneMaskPhi recipe in the main loop using the |
2015 | // preheader ActiveLaneMask instruction. |
2016 | auto *LaneMaskPhi = new VPActiveLaneMaskPHIRecipe(EntryALM, DebugLoc()); |
2017 | LaneMaskPhi->insertAfter(InsertPos: CanonicalIVPHI); |
2018 | |
2019 | // Create the active lane mask for the next iteration of the loop before the |
2020 | // original terminator. |
2021 | VPRecipeBase *OriginalTerminator = EB->getTerminator(); |
2022 | Builder.setInsertPoint(OriginalTerminator); |
2023 | auto *InLoopIncrement = |
2024 | Builder.createOverflowingOp(Opcode: VPInstruction::CanonicalIVIncrementForPart, |
2025 | Operands: {IncrementValue}, WrapFlags: {false, false}, DL); |
2026 | auto *ALM = Builder.createNaryOp(Opcode: VPInstruction::ActiveLaneMask, |
2027 | Operands: {InLoopIncrement, TripCount}, DL, |
2028 | Name: "active.lane.mask.next" ); |
2029 | LaneMaskPhi->addOperand(Operand: ALM); |
2030 | |
2031 | // Replace the original terminator with BranchOnCond. We have to invert the |
2032 | // mask here because a true condition means jumping to the exit block. |
2033 | auto *NotMask = Builder.createNot(Operand: ALM, DL); |
2034 | Builder.createNaryOp(Opcode: VPInstruction::BranchOnCond, Operands: {NotMask}, DL); |
2035 | OriginalTerminator->eraseFromParent(); |
2036 | return LaneMaskPhi; |
2037 | } |
2038 | |
2039 | /// Collect all VPValues representing a header mask through the (ICMP_ULE, |
2040 | /// WideCanonicalIV, backedge-taken-count) pattern. |
2041 | /// TODO: Introduce explicit recipe for header-mask instead of searching |
2042 | /// for the header-mask pattern manually. |
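/// For example (illustrative), in pseudo VPlan notation a header mask is
///   %wide.iv     = widen-canonical-induction %canonical.iv
///   %header.mask = icmp ule %wide.iv, %backedge.taken.count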
static SmallVector<VPValue *> collectAllHeaderMasks(VPlan &Plan) {
2044 | SmallVector<VPValue *> WideCanonicalIVs; |
2045 | auto *FoundWidenCanonicalIVUser = |
2046 | find_if(Range: Plan.getCanonicalIV()->users(), |
2047 | P: [](VPUser *U) { return isa<VPWidenCanonicalIVRecipe>(Val: U); }); |
2048 | assert(count_if(Plan.getCanonicalIV()->users(), |
2049 | [](VPUser *U) { return isa<VPWidenCanonicalIVRecipe>(U); }) <= |
2050 | 1 && |
2051 | "Must have at most one VPWideCanonicalIVRecipe" ); |
2052 | if (FoundWidenCanonicalIVUser != Plan.getCanonicalIV()->users().end()) { |
2053 | auto *WideCanonicalIV = |
2054 | cast<VPWidenCanonicalIVRecipe>(Val: *FoundWidenCanonicalIVUser); |
2055 | WideCanonicalIVs.push_back(Elt: WideCanonicalIV); |
2056 | } |
2057 | |
2058 | // Also include VPWidenIntOrFpInductionRecipes that represent a widened |
2059 | // version of the canonical induction. |
VPBasicBlock *HeaderVPBB = Plan.getVectorLoopRegion()->getEntryBasicBlock();
2061 | for (VPRecipeBase &Phi : HeaderVPBB->phis()) { |
2062 | auto *WidenOriginalIV = dyn_cast<VPWidenIntOrFpInductionRecipe>(Val: &Phi); |
2063 | if (WidenOriginalIV && WidenOriginalIV->isCanonical()) |
2064 | WideCanonicalIVs.push_back(Elt: WidenOriginalIV); |
2065 | } |
2066 | |
// Walk users of wide canonical IVs and collect all compares of the form
2068 | // (ICMP_ULE, WideCanonicalIV, backedge-taken-count). |
SmallVector<VPValue *> HeaderMasks;
2070 | for (auto *Wide : WideCanonicalIVs) { |
2071 | for (VPUser *U : SmallVector<VPUser *>(Wide->users())) { |
auto *HeaderMask = dyn_cast<VPInstruction>(Val: U);
2073 | if (!HeaderMask || !vputils::isHeaderMask(V: HeaderMask, Plan)) |
2074 | continue; |
2075 | |
2076 | assert(HeaderMask->getOperand(0) == Wide && |
2077 | "WidenCanonicalIV must be the first operand of the compare" ); |
2078 | HeaderMasks.push_back(Elt: HeaderMask); |
2079 | } |
2080 | } |
2081 | return HeaderMasks; |
2082 | } |
2083 | |
2084 | void VPlanTransforms::addActiveLaneMask( |
2085 | VPlan &Plan, bool UseActiveLaneMaskForControlFlow, |
2086 | bool DataAndControlFlowWithoutRuntimeCheck) { |
2087 | assert((!DataAndControlFlowWithoutRuntimeCheck || |
2088 | UseActiveLaneMaskForControlFlow) && |
2089 | "DataAndControlFlowWithoutRuntimeCheck implies " |
2090 | "UseActiveLaneMaskForControlFlow" ); |
2091 | |
2092 | auto *FoundWidenCanonicalIVUser = |
2093 | find_if(Range: Plan.getCanonicalIV()->users(), |
2094 | P: [](VPUser *U) { return isa<VPWidenCanonicalIVRecipe>(Val: U); }); |
2095 | assert(FoundWidenCanonicalIVUser && |
2096 | "Must have widened canonical IV when tail folding!" ); |
2097 | auto *WideCanonicalIV = |
2098 | cast<VPWidenCanonicalIVRecipe>(Val: *FoundWidenCanonicalIVUser); |
2099 | VPSingleDefRecipe *LaneMask; |
2100 | if (UseActiveLaneMaskForControlFlow) { |
2101 | LaneMask = addVPLaneMaskPhiAndUpdateExitBranch( |
2102 | Plan, DataAndControlFlowWithoutRuntimeCheck); |
2103 | } else { |
2104 | VPBuilder B = VPBuilder::getToInsertAfter(R: WideCanonicalIV); |
2105 | LaneMask = B.createNaryOp(Opcode: VPInstruction::ActiveLaneMask, |
2106 | Operands: {WideCanonicalIV, Plan.getTripCount()}, Inst: nullptr, |
2107 | Name: "active.lane.mask" ); |
2108 | } |
2109 | |
2110 | // Walk users of WideCanonicalIV and replace all compares of the form |
2111 | // (ICMP_ULE, WideCanonicalIV, backedge-taken-count) with an |
2112 | // active-lane-mask. |
for (VPValue *HeaderMask : collectAllHeaderMasks(Plan))
2114 | HeaderMask->replaceAllUsesWith(New: LaneMask); |
2115 | } |
2116 | |
2117 | /// Try to optimize a \p CurRecipe masked by \p HeaderMask to a corresponding |
2118 | /// EVL-based recipe without the header mask. Returns nullptr if no EVL-based |
2119 | /// recipe could be created. |
2120 | /// \p HeaderMask Header Mask. |
/// \p CurRecipe Recipe to be transformed.
2122 | /// \p TypeInfo VPlan-based type analysis. |
2123 | /// \p AllOneMask The vector mask parameter of vector-predication intrinsics. |
2124 | /// \p EVL The explicit vector length parameter of vector-predication |
2125 | /// intrinsics. |
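/// For example (illustrative), a load masked only by the header mask
///   WIDEN %l = load %addr, mask = %header.mask
/// becomes an EVL-based load with the header mask dropped
///   WIDEN %l = vp.load %addr, all-true, %evl
/// and a select whose condition is the header mask becomes a vp.merge.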
static VPRecipeBase *optimizeMaskToEVL(VPValue *HeaderMask,
2127 | VPRecipeBase &CurRecipe, |
2128 | VPTypeAnalysis &TypeInfo, |
2129 | VPValue &AllOneMask, VPValue &EVL) { |
2130 | using namespace llvm::VPlanPatternMatch; |
2131 | auto GetNewMask = [&](VPValue *OrigMask) -> VPValue * { |
2132 | assert(OrigMask && "Unmasked recipe when folding tail" ); |
2133 | // HeaderMask will be handled using EVL. |
2134 | VPValue *Mask; |
2135 | if (match(V: OrigMask, P: m_LogicalAnd(Op0: m_Specific(VPV: HeaderMask), Op1: m_VPValue(V&: Mask)))) |
2136 | return Mask; |
2137 | return HeaderMask == OrigMask ? nullptr : OrigMask; |
2138 | }; |
2139 | |
2140 | return TypeSwitch<VPRecipeBase *, VPRecipeBase *>(&CurRecipe) |
2141 | .Case<VPWidenLoadRecipe>(caseFn: [&](VPWidenLoadRecipe *L) { |
2142 | VPValue *NewMask = GetNewMask(L->getMask()); |
2143 | return new VPWidenLoadEVLRecipe(*L, EVL, NewMask); |
2144 | }) |
2145 | .Case<VPWidenStoreRecipe>(caseFn: [&](VPWidenStoreRecipe *S) { |
2146 | VPValue *NewMask = GetNewMask(S->getMask()); |
2147 | return new VPWidenStoreEVLRecipe(*S, EVL, NewMask); |
2148 | }) |
2149 | .Case<VPReductionRecipe>(caseFn: [&](VPReductionRecipe *Red) { |
2150 | VPValue *NewMask = GetNewMask(Red->getCondOp()); |
2151 | return new VPReductionEVLRecipe(*Red, EVL, NewMask); |
2152 | }) |
2153 | .Case<VPInstruction>(caseFn: [&](VPInstruction *VPI) -> VPRecipeBase * { |
2154 | VPValue *LHS, *RHS; |
2155 | // Transform select with a header mask condition |
2156 | // select(header_mask, LHS, RHS) |
2157 | // into vector predication merge. |
2158 | // vp.merge(all-true, LHS, RHS, EVL) |
2159 | if (!match(V: VPI, P: m_Select(Op0: m_Specific(VPV: HeaderMask), Op1: m_VPValue(V&: LHS), |
2160 | Op2: m_VPValue(V&: RHS)))) |
2161 | return nullptr; |
2162 | // Use all true as the condition because this transformation is |
2163 | // limited to selects whose condition is a header mask. |
2164 | return new VPWidenIntrinsicRecipe( |
2165 | Intrinsic::vp_merge, {&AllOneMask, LHS, RHS, &EVL}, |
2166 | TypeInfo.inferScalarType(V: LHS), VPI->getDebugLoc()); |
2167 | }) |
2168 | .Default(defaultFn: [&](VPRecipeBase *R) { return nullptr; }); |
2169 | } |
2170 | |
2171 | /// Replace recipes with their EVL variants. |
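/// For example (illustrative), memory and reduction recipes masked by the
/// header mask are rewritten via optimizeMaskToEVL, and a
/// FirstOrderRecurrenceSplice becomes a call to
/// llvm.experimental.vp.splice(%for, %val, -1, all-true, %prev.evl, %evl).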
2172 | static void transformRecipestoEVLRecipes(VPlan &Plan, VPValue &EVL) { |
2173 | Type *CanonicalIVType = Plan.getCanonicalIV()->getScalarType(); |
2174 | VPTypeAnalysis TypeInfo(CanonicalIVType); |
2175 | LLVMContext &Ctx = CanonicalIVType->getContext(); |
2176 | VPValue *AllOneMask = Plan.getOrAddLiveIn(V: ConstantInt::getTrue(Context&: Ctx)); |
2177 | VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion(); |
VPBasicBlock *Header = LoopRegion->getEntryBasicBlock();
2179 | |
2180 | assert(all_of(Plan.getVF().users(), |
2181 | IsaPred<VPVectorEndPointerRecipe, VPScalarIVStepsRecipe, |
2182 | VPWidenIntOrFpInductionRecipe>) && |
2183 | "User of VF that we can't transform to EVL." ); |
2184 | Plan.getVF().replaceAllUsesWith(New: &EVL); |
2185 | |
2186 | // Defer erasing recipes till the end so that we don't invalidate the |
2187 | // VPTypeAnalysis cache. |
2188 | SmallVector<VPRecipeBase *> ToErase; |
2189 | |
2190 | // Create a scalar phi to track the previous EVL if fixed-order recurrence is |
2191 | // contained. |
2192 | bool ContainsFORs = |
2193 | any_of(Range: Header->phis(), P: IsaPred<VPFirstOrderRecurrencePHIRecipe>); |
2194 | if (ContainsFORs) { |
2195 | // TODO: Use VPInstruction::ExplicitVectorLength to get maximum EVL. |
2196 | VPValue *MaxEVL = &Plan.getVF(); |
// Emit a VPScalarCastRecipe in the preheader if VF is not a 32-bit integer.
2198 | VPBuilder Builder(LoopRegion->getPreheaderVPBB()); |
2199 | MaxEVL = Builder.createScalarZExtOrTrunc(Op: MaxEVL, ResultTy: Type::getInt32Ty(C&: Ctx), |
2200 | SrcTy: TypeInfo.inferScalarType(V: MaxEVL), |
2201 | DL: DebugLoc()); |
2202 | |
2203 | Builder.setInsertPoint(TheBB: Header, IP: Header->getFirstNonPhi()); |
2204 | VPValue *PrevEVL = |
2205 | Builder.createScalarPhi(IncomingValues: {MaxEVL, &EVL}, DL: DebugLoc(), Name: "prev.evl" ); |
2206 | |
2207 | for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>( |
2208 | Range: vp_depth_first_deep(G: Plan.getVectorLoopRegion()->getEntry()))) { |
2209 | for (VPRecipeBase &R : *VPBB) { |
2210 | using namespace VPlanPatternMatch; |
2211 | VPValue *V1, *V2; |
2212 | if (!match(V: &R, |
2213 | P: m_VPInstruction<VPInstruction::FirstOrderRecurrenceSplice>( |
2214 | Op0: m_VPValue(V&: V1), Op1: m_VPValue(V&: V2)))) |
2215 | continue; |
2216 | VPValue *Imm = Plan.getOrAddLiveIn( |
2217 | V: ConstantInt::getSigned(Ty: Type::getInt32Ty(C&: Ctx), V: -1)); |
2218 | VPWidenIntrinsicRecipe *VPSplice = new VPWidenIntrinsicRecipe( |
2219 | Intrinsic::experimental_vp_splice, |
2220 | {V1, V2, Imm, AllOneMask, PrevEVL, &EVL}, |
2221 | TypeInfo.inferScalarType(V: R.getVPSingleValue()), R.getDebugLoc()); |
2222 | VPSplice->insertBefore(InsertPos: &R); |
2223 | R.getVPSingleValue()->replaceAllUsesWith(New: VPSplice); |
2224 | ToErase.push_back(Elt: &R); |
2225 | } |
2226 | } |
2227 | } |
2228 | |
2229 | // Try to optimize header mask recipes away to their EVL variants. |
for (VPValue *HeaderMask : collectAllHeaderMasks(Plan)) {
2231 | for (VPUser *U : collectUsersRecursively(V: HeaderMask)) { |
2232 | auto *CurRecipe = cast<VPRecipeBase>(Val: U); |
2233 | VPRecipeBase *EVLRecipe = |
2234 | optimizeMaskToEVL(HeaderMask, CurRecipe&: *CurRecipe, TypeInfo, AllOneMask&: *AllOneMask, EVL); |
2235 | if (!EVLRecipe) |
2236 | continue; |
2237 | |
2238 | [[maybe_unused]] unsigned NumDefVal = EVLRecipe->getNumDefinedValues(); |
2239 | assert(NumDefVal == CurRecipe->getNumDefinedValues() && |
2240 | "New recipe must define the same number of values as the " |
2241 | "original." ); |
2242 | assert( |
2243 | NumDefVal <= 1 && |
2244 | "Only supports recipes with a single definition or without users." ); |
2245 | EVLRecipe->insertBefore(InsertPos: CurRecipe); |
2246 | if (isa<VPSingleDefRecipe, VPWidenLoadEVLRecipe>(Val: EVLRecipe)) { |
2247 | VPValue *CurVPV = CurRecipe->getVPSingleValue(); |
2248 | CurVPV->replaceAllUsesWith(New: EVLRecipe->getVPSingleValue()); |
2249 | } |
2250 | ToErase.push_back(Elt: CurRecipe); |
2251 | } |
2252 | } |
2253 | |
2254 | for (VPRecipeBase *R : reverse(C&: ToErase)) { |
2255 | SmallVector<VPValue *> PossiblyDead(R->operands()); |
2256 | R->eraseFromParent(); |
2257 | for (VPValue *Op : PossiblyDead) |
2258 | recursivelyDeleteDeadRecipes(V: Op); |
2259 | } |
2260 | } |
2261 | |
2262 | /// Add a VPEVLBasedIVPHIRecipe and related recipes to \p Plan and |
2263 | /// replaces all uses except the canonical IV increment of |
2264 | /// VPCanonicalIVPHIRecipe with a VPEVLBasedIVPHIRecipe. VPCanonicalIVPHIRecipe |
2265 | /// is used only for loop iterations counting after this transformation. |
2266 | /// |
2267 | /// The function uses the following definitions: |
2268 | /// %StartV is the canonical induction start value. |
2269 | /// |
2270 | /// The function adds the following recipes: |
2271 | /// |
2272 | /// vector.ph: |
2273 | /// ... |
2274 | /// |
2275 | /// vector.body: |
2276 | /// ... |
2277 | /// %EVLPhi = EXPLICIT-VECTOR-LENGTH-BASED-IV-PHI [ %StartV, %vector.ph ], |
2278 | /// [ %NextEVLIV, %vector.body ] |
2279 | /// %AVL = sub original TC, %EVLPhi |
2280 | /// %VPEVL = EXPLICIT-VECTOR-LENGTH %AVL |
2281 | /// ... |
/// %NextEVLIV = add IVSize (cast i32 %VPEVL to IVSize), %EVLPhi
2283 | /// ... |
2284 | /// |
2285 | /// If MaxSafeElements is provided, the function adds the following recipes: |
2286 | /// vector.ph: |
2287 | /// ... |
2288 | /// |
2289 | /// vector.body: |
2290 | /// ... |
2291 | /// %EVLPhi = EXPLICIT-VECTOR-LENGTH-BASED-IV-PHI [ %StartV, %vector.ph ], |
2292 | /// [ %NextEVLIV, %vector.body ] |
2293 | /// %AVL = sub original TC, %EVLPhi |
2294 | /// %cmp = cmp ult %AVL, MaxSafeElements |
2295 | /// %SAFE_AVL = select %cmp, %AVL, MaxSafeElements |
2296 | /// %VPEVL = EXPLICIT-VECTOR-LENGTH %SAFE_AVL |
2297 | /// ... |
2298 | /// %NextEVLIV = add IVSize (cast i32 %VPEVL to IVSize), %EVLPhi |
2299 | /// ... |
2300 | /// |
2301 | bool VPlanTransforms::tryAddExplicitVectorLength( |
2302 | VPlan &Plan, const std::optional<unsigned> &MaxSafeElements) { |
VPBasicBlock *Header = Plan.getVectorLoopRegion()->getEntryBasicBlock();
2304 | // The transform updates all users of inductions to work based on EVL, instead |
2305 | // of the VF directly. At the moment, widened pointer inductions cannot be |
2306 | // updated, so bail out if the plan contains any. |
2307 | bool ContainsWidenPointerInductions = |
2308 | any_of(Range: Header->phis(), P: IsaPred<VPWidenPointerInductionRecipe>); |
2309 | if (ContainsWidenPointerInductions) |
2310 | return false; |
2311 | |
2312 | auto *CanonicalIVPHI = Plan.getCanonicalIV(); |
2313 | auto *CanIVTy = CanonicalIVPHI->getScalarType(); |
2314 | VPValue *StartV = CanonicalIVPHI->getStartValue(); |
2315 | |
2316 | // Create the ExplicitVectorLengthPhi recipe in the main loop. |
2317 | auto *EVLPhi = new VPEVLBasedIVPHIRecipe(StartV, DebugLoc()); |
2318 | EVLPhi->insertAfter(InsertPos: CanonicalIVPHI); |
2319 | VPBuilder Builder(Header, Header->getFirstNonPhi()); |
2320 | // Compute original TC - IV as the AVL (application vector length). |
2321 | VPValue *AVL = Builder.createNaryOp( |
2322 | Opcode: Instruction::Sub, Operands: {Plan.getTripCount(), EVLPhi}, DL: DebugLoc(), Name: "avl" ); |
2323 | if (MaxSafeElements) { |
2324 | // Support for MaxSafeDist for correct loop emission. |
2325 | VPValue *AVLSafe = |
2326 | Plan.getOrAddLiveIn(V: ConstantInt::get(Ty: CanIVTy, V: *MaxSafeElements)); |
2327 | VPValue *Cmp = Builder.createICmp(Pred: ICmpInst::ICMP_ULT, A: AVL, B: AVLSafe); |
2328 | AVL = Builder.createSelect(Cond: Cmp, TrueVal: AVL, FalseVal: AVLSafe, DL: DebugLoc(), Name: "safe_avl" ); |
2329 | } |
2330 | auto *VPEVL = Builder.createNaryOp(Opcode: VPInstruction::ExplicitVectorLength, Operands: AVL, |
2331 | DL: DebugLoc()); |
2332 | |
2333 | auto *CanonicalIVIncrement = |
2334 | cast<VPInstruction>(Val: CanonicalIVPHI->getBackedgeValue()); |
2335 | Builder.setInsertPoint(CanonicalIVIncrement); |
2336 | VPValue *OpVPEVL = VPEVL; |
2337 | |
2338 | auto *I32Ty = Type::getInt32Ty(C&: CanIVTy->getContext()); |
2339 | OpVPEVL = Builder.createScalarZExtOrTrunc( |
2340 | Op: OpVPEVL, ResultTy: CanIVTy, SrcTy: I32Ty, DL: CanonicalIVIncrement->getDebugLoc()); |
2341 | |
2342 | auto *NextEVLIV = Builder.createOverflowingOp( |
2343 | Opcode: Instruction::Add, Operands: {OpVPEVL, EVLPhi}, |
2344 | WrapFlags: {CanonicalIVIncrement->hasNoUnsignedWrap(), |
2345 | CanonicalIVIncrement->hasNoSignedWrap()}, |
2346 | DL: CanonicalIVIncrement->getDebugLoc(), Name: "index.evl.next" ); |
2347 | EVLPhi->addOperand(Operand: NextEVLIV); |
2348 | |
2349 | transformRecipestoEVLRecipes(Plan, EVL&: *VPEVL); |
2350 | |
2351 | // Replace all uses of VPCanonicalIVPHIRecipe by |
2352 | // VPEVLBasedIVPHIRecipe except for the canonical IV increment. |
2353 | CanonicalIVPHI->replaceAllUsesWith(New: EVLPhi); |
2354 | CanonicalIVIncrement->setOperand(I: 0, New: CanonicalIVPHI); |
2355 | // TODO: support unroll factor > 1. |
2356 | Plan.setUF(1); |
2357 | return true; |
2358 | } |
2359 | |
2360 | void VPlanTransforms::dropPoisonGeneratingRecipes( |
2361 | VPlan &Plan, |
2362 | const std::function<bool(BasicBlock *)> &BlockNeedsPredication) { |
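// For example (illustrative), a GEP with "inbounds" feeding the address of a
// consecutive widened load in a block that requires predication has the flag
// dropped, so that addresses computed for lanes that were masked off in the
// original loop cannot introduce poison.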
2363 | // Collect recipes in the backward slice of `Root` that may generate a poison |
2364 | // value that is used after vectorization. |
2365 | SmallPtrSet<VPRecipeBase *, 16> Visited; |
2366 | auto CollectPoisonGeneratingInstrsInBackwardSlice([&](VPRecipeBase *Root) { |
2367 | SmallVector<VPRecipeBase *, 16> Worklist; |
2368 | Worklist.push_back(Elt: Root); |
2369 | |
2370 | // Traverse the backward slice of Root through its use-def chain. |
2371 | while (!Worklist.empty()) { |
2372 | VPRecipeBase *CurRec = Worklist.pop_back_val(); |
2373 | |
2374 | if (!Visited.insert(Ptr: CurRec).second) |
2375 | continue; |
2376 | |
2377 | // Prune search if we find another recipe generating a widen memory |
2378 | // instruction. Widen memory instructions involved in address computation |
2379 | // will lead to gather/scatter instructions, which don't need to be |
2380 | // handled. |
2381 | if (isa<VPWidenMemoryRecipe, VPInterleaveRecipe, VPScalarIVStepsRecipe, |
2382 | VPHeaderPHIRecipe>(Val: CurRec)) |
2383 | continue; |
2384 | |
2385 | // This recipe contributes to the address computation of a widen |
2386 | // load/store. If the underlying instruction has poison-generating flags, |
2387 | // drop them directly. |
2388 | if (auto *RecWithFlags = dyn_cast<VPRecipeWithIRFlags>(Val: CurRec)) { |
2389 | VPValue *A, *B; |
2390 | using namespace llvm::VPlanPatternMatch; |
2391 | // Dropping disjoint from an OR may yield incorrect results, as some |
2392 | // analysis may have converted it to an Add implicitly (e.g. SCEV used |
2393 | // for dependence analysis). Instead, replace it with an equivalent Add. |
2394 | // This is possible as all users of the disjoint OR only access lanes |
2395 | // where the operands are disjoint or poison otherwise. |
2396 | if (match(V: RecWithFlags, P: m_BinaryOr(Op0: m_VPValue(V&: A), Op1: m_VPValue(V&: B))) && |
2397 | RecWithFlags->isDisjoint()) { |
2398 | VPBuilder Builder(RecWithFlags); |
2399 | VPInstruction *New = Builder.createOverflowingOp( |
2400 | Opcode: Instruction::Add, Operands: {A, B}, WrapFlags: {false, false}, |
2401 | DL: RecWithFlags->getDebugLoc()); |
2402 | New->setUnderlyingValue(RecWithFlags->getUnderlyingValue()); |
2403 | RecWithFlags->replaceAllUsesWith(New); |
2404 | RecWithFlags->eraseFromParent(); |
2405 | CurRec = New; |
2406 | } else |
2407 | RecWithFlags->dropPoisonGeneratingFlags(); |
2408 | } else { |
2409 | Instruction *Instr = dyn_cast_or_null<Instruction>( |
2410 | Val: CurRec->getVPSingleValue()->getUnderlyingValue()); |
2411 | (void)Instr; |
2412 | assert((!Instr || !Instr->hasPoisonGeneratingFlags()) && |
2413 | "found instruction with poison generating flags not covered by " |
2414 | "VPRecipeWithIRFlags" ); |
2415 | } |
2416 | |
2417 | // Add new definitions to the worklist. |
2418 | for (VPValue *Operand : CurRec->operands()) |
2419 | if (VPRecipeBase *OpDef = Operand->getDefiningRecipe()) |
2420 | Worklist.push_back(Elt: OpDef); |
2421 | } |
2422 | }); |
2423 | |
2424 | // Traverse all the recipes in the VPlan and collect the poison-generating |
2425 | // recipes in the backward slice starting at the address of a VPWidenRecipe or |
2426 | // VPInterleaveRecipe. |
2427 | auto Iter = vp_depth_first_deep(G: Plan.getEntry()); |
2428 | for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(Range: Iter)) { |
2429 | for (VPRecipeBase &Recipe : *VPBB) { |
2430 | if (auto *WidenRec = dyn_cast<VPWidenMemoryRecipe>(Val: &Recipe)) { |
2431 | Instruction &UnderlyingInstr = WidenRec->getIngredient(); |
2432 | VPRecipeBase *AddrDef = WidenRec->getAddr()->getDefiningRecipe(); |
2433 | if (AddrDef && WidenRec->isConsecutive() && |
2434 | BlockNeedsPredication(UnderlyingInstr.getParent())) |
2435 | CollectPoisonGeneratingInstrsInBackwardSlice(AddrDef); |
2436 | } else if (auto *InterleaveRec = dyn_cast<VPInterleaveRecipe>(Val: &Recipe)) { |
2437 | VPRecipeBase *AddrDef = InterleaveRec->getAddr()->getDefiningRecipe(); |
2438 | if (AddrDef) { |
2439 | // Check if any member of the interleave group needs predication. |
2440 | const InterleaveGroup<Instruction> *InterGroup = |
2441 | InterleaveRec->getInterleaveGroup(); |
2442 | bool NeedPredication = false; |
2443 | for (int I = 0, NumMembers = InterGroup->getNumMembers(); |
2444 | I < NumMembers; ++I) { |
2445 | Instruction *Member = InterGroup->getMember(Index: I); |
2446 | if (Member) |
2447 | NeedPredication |= BlockNeedsPredication(Member->getParent()); |
2448 | } |
2449 | |
2450 | if (NeedPredication) |
2451 | CollectPoisonGeneratingInstrsInBackwardSlice(AddrDef); |
2452 | } |
2453 | } |
2454 | } |
2455 | } |
2456 | } |
2457 | |
2458 | void VPlanTransforms::createInterleaveGroups( |
2459 | VPlan &Plan, |
2460 | const SmallPtrSetImpl<const InterleaveGroup<Instruction> *> |
2461 | &InterleaveGroups, |
2462 | VPRecipeBuilder &RecipeBuilder, const bool &ScalarEpilogueAllowed) { |
2463 | if (InterleaveGroups.empty()) |
2464 | return; |
2465 | |
2466 | // Interleave memory: for each Interleave Group we marked earlier as relevant |
2467 | // for this VPlan, replace the Recipes widening its memory instructions with a |
2468 | // single VPInterleaveRecipe at its insertion point. |
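// For example (illustrative): two VPWidenLoadRecipes forming a factor-2 load
// group are replaced by one VPInterleaveRecipe that defines two results, one
// per member; users of each original widened load are redirected to the
// corresponding result below.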
2469 | VPDominatorTree VPDT; |
2470 | VPDT.recalculate(Func&: Plan); |
2471 | for (const auto *IG : InterleaveGroups) { |
2472 | SmallVector<VPValue *, 4> StoredValues; |
2473 | for (unsigned i = 0; i < IG->getFactor(); ++i) |
2474 | if (auto *SI = dyn_cast_or_null<StoreInst>(Val: IG->getMember(Index: i))) { |
2475 | auto *StoreR = cast<VPWidenStoreRecipe>(Val: RecipeBuilder.getRecipe(I: SI)); |
2476 | StoredValues.push_back(Elt: StoreR->getStoredValue()); |
2477 | } |
2478 | |
2479 | bool NeedsMaskForGaps = |
2480 | IG->requiresScalarEpilogue() && !ScalarEpilogueAllowed; |
2481 | |
2482 | Instruction *IRInsertPos = IG->getInsertPos(); |
2483 | auto *InsertPos = |
2484 | cast<VPWidenMemoryRecipe>(Val: RecipeBuilder.getRecipe(I: IRInsertPos)); |
2485 | |
2486 | bool InBounds = false; |
2487 | if (auto *Gep = dyn_cast<GetElementPtrInst>( |
2488 | Val: getLoadStorePointerOperand(V: IRInsertPos)->stripPointerCasts())) |
2489 | InBounds = Gep->isInBounds(); |
2490 | |
2491 | // Get or create the start address for the interleave group. |
2492 | auto *Start = |
2493 | cast<VPWidenMemoryRecipe>(Val: RecipeBuilder.getRecipe(I: IG->getMember(Index: 0))); |
2494 | VPValue *Addr = Start->getAddr(); |
2495 | VPRecipeBase *AddrDef = Addr->getDefiningRecipe(); |
2496 | if (AddrDef && !VPDT.properlyDominates(A: AddrDef, B: InsertPos)) { |
2497 | // We cannot re-use the address of member zero because it does not |
2498 | // dominate the insert position. Instead, use the address of the insert |
2499 | // position and create a PtrAdd adjusting it to the address of member |
2500 | // zero. |
2501 | // TODO: Hoist Addr's defining recipe (and any operands as needed) to |
2502 | // InsertPos or sink loads above zero members to join it. |
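// For example (illustrative): if the insert position is member 2 of an i32
// group, Offset = 4 * 2 = 8 bytes and the PtrAdd below applies -8 to step
// from the insert position's address back to the address of member 0.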
2503 | assert(IG->getIndex(IRInsertPos) != 0 && |
2504 | "index of insert position shouldn't be zero" ); |
2505 | auto &DL = IRInsertPos->getDataLayout(); |
2506 | APInt Offset(32, |
2507 | DL.getTypeAllocSize(Ty: getLoadStoreType(I: IRInsertPos)) * |
2508 | IG->getIndex(Instr: IRInsertPos), |
2509 | /*IsSigned=*/true); |
2510 | VPValue *OffsetVPV = Plan.getOrAddLiveIn( |
2511 | V: ConstantInt::get(Context&: IRInsertPos->getParent()->getContext(), V: -Offset)); |
2512 | VPBuilder B(InsertPos); |
2513 | Addr = InBounds ? B.createInBoundsPtrAdd(Ptr: InsertPos->getAddr(), Offset: OffsetVPV) |
2514 | : B.createPtrAdd(Ptr: InsertPos->getAddr(), Offset: OffsetVPV); |
2515 | } |
2516 | // If the group is reverse, adjust the index to refer to the last vector |
2517 | // lane instead of the first. We adjust the index from the first vector |
2518 | // lane, rather than directly getting the pointer for lane VF - 1, because |
2519 | // the pointer operand of the interleaved access is supposed to be uniform. |
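// For example (illustrative): for a reverse group with factor 4, the
// VPVectorEndPointerRecipe below is created with the negated factor, -4.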
2520 | if (IG->isReverse()) { |
2521 | auto *ReversePtr = new VPVectorEndPointerRecipe( |
2522 | Addr, &Plan.getVF(), getLoadStoreType(I: IRInsertPos), |
2523 | -(int64_t)IG->getFactor(), |
2524 | InBounds ? GEPNoWrapFlags::inBounds() : GEPNoWrapFlags::none(), |
2525 | InsertPos->getDebugLoc()); |
2526 | ReversePtr->insertBefore(InsertPos); |
2527 | Addr = ReversePtr; |
2528 | } |
2529 | auto *VPIG = new VPInterleaveRecipe(IG, Addr, StoredValues, |
2530 | InsertPos->getMask(), NeedsMaskForGaps, InsertPos->getDebugLoc()); |
2531 | VPIG->insertBefore(InsertPos); |
2532 | |
2533 | unsigned J = 0; |
2534 | for (unsigned i = 0; i < IG->getFactor(); ++i) |
2535 | if (Instruction *Member = IG->getMember(Index: i)) { |
2536 | VPRecipeBase *MemberR = RecipeBuilder.getRecipe(I: Member); |
2537 | if (!Member->getType()->isVoidTy()) { |
2538 | VPValue *OriginalV = MemberR->getVPSingleValue(); |
2539 | OriginalV->replaceAllUsesWith(New: VPIG->getVPValue(I: J)); |
2540 | J++; |
2541 | } |
2542 | MemberR->eraseFromParent(); |
2543 | } |
2544 | } |
2545 | } |
2546 | |
2547 | /// Expand a VPWidenIntOrFpInduction into executable recipes, for the initial |
2548 | /// value, phi and backedge value. In the following example: |
2549 | /// |
2550 | /// vector.ph: |
2551 | /// Successor(s): vector loop |
2552 | /// |
2553 | /// <x1> vector loop: { |
2554 | /// vector.body: |
2555 | /// WIDEN-INDUCTION %i = phi %start, %step, %vf |
2556 | /// ... |
2557 | /// EMIT branch-on-count ... |
2558 | /// No successors |
2559 | /// } |
2560 | /// |
2561 | /// WIDEN-INDUCTION will get expanded to: |
2562 | /// |
2563 | /// vector.ph: |
2564 | /// ... |
2565 | /// vp<%induction.start> = ... |
2566 | /// vp<%induction.increment> = ... |
2567 | /// |
2568 | /// Successor(s): vector loop |
2569 | /// |
2570 | /// <x1> vector loop: { |
2571 | /// vector.body: |
2572 | /// ir<%i> = WIDEN-PHI vp<%induction.start>, vp<%vec.ind.next> |
2573 | /// ... |
2574 | /// vp<%vec.ind.next> = add ir<%i>, vp<%induction.increment> |
2575 | /// EMIT branch-on-count ... |
2576 | /// No successors |
2577 | /// } |
2578 | static void |
2579 | expandVPWidenIntOrFpInduction(VPWidenIntOrFpInductionRecipe *WidenIVR, |
2580 | VPTypeAnalysis &TypeInfo) { |
2581 | VPlan *Plan = WidenIVR->getParent()->getPlan(); |
2582 | VPValue *Start = WidenIVR->getStartValue(); |
2583 | VPValue *Step = WidenIVR->getStepValue(); |
2584 | VPValue *VF = WidenIVR->getVFValue(); |
2585 | DebugLoc DL = WidenIVR->getDebugLoc(); |
2586 | |
2587 | // The value from the original loop to which we are mapping the new induction |
2588 | // variable. |
2589 | Type *Ty = TypeInfo.inferScalarType(V: WidenIVR); |
2590 | |
2591 | const InductionDescriptor &ID = WidenIVR->getInductionDescriptor(); |
2592 | Instruction::BinaryOps AddOp; |
2593 | Instruction::BinaryOps MulOp; |
2594 | // FIXME: The newly created binary instructions should contain nsw/nuw |
2595 | // flags, which can be found from the original scalar operations. |
2596 | VPIRFlags Flags; |
2597 | if (ID.getKind() == InductionDescriptor::IK_IntInduction) { |
2598 | AddOp = Instruction::Add; |
2599 | MulOp = Instruction::Mul; |
2600 | } else { |
2601 | AddOp = ID.getInductionOpcode(); |
2602 | MulOp = Instruction::FMul; |
2603 | Flags = ID.getInductionBinOp()->getFastMathFlags(); |
2604 | } |
2605 | |
2606 | // If the phi is truncated, truncate the start and step values. |
2607 | VPBuilder Builder(Plan->getVectorPreheader()); |
2608 | Type *StepTy = TypeInfo.inferScalarType(V: Step); |
2609 | if (Ty->getScalarSizeInBits() < StepTy->getScalarSizeInBits()) { |
2610 | assert(StepTy->isIntegerTy() && "Truncation requires an integer type" ); |
2611 | Step = Builder.createScalarCast(Opcode: Instruction::Trunc, Op: Step, ResultTy: Ty, DL); |
2612 | Start = Builder.createScalarCast(Opcode: Instruction::Trunc, Op: Start, ResultTy: Ty, DL); |
2613 | StepTy = Ty; |
2614 | } |
2615 | |
2616 | // Construct the initial value of the vector IV in the vector loop preheader. |
2617 | Type *IVIntTy = |
2618 | IntegerType::get(C&: StepTy->getContext(), NumBits: StepTy->getScalarSizeInBits()); |
2619 | VPValue *Init = Builder.createNaryOp(Opcode: VPInstruction::StepVector, Operands: {}, ResultTy: IVIntTy); |
2620 | if (StepTy->isFloatingPointTy()) |
2621 | Init = Builder.createWidenCast(Opcode: Instruction::UIToFP, Op: Init, ResultTy: StepTy); |
2622 | |
2623 | VPValue *SplatStart = Builder.createNaryOp(Opcode: VPInstruction::Broadcast, Operands: Start); |
2624 | VPValue *SplatStep = Builder.createNaryOp(Opcode: VPInstruction::Broadcast, Operands: Step); |
2625 | |
2626 | Init = Builder.createNaryOp(Opcode: MulOp, Operands: {Init, SplatStep}, Flags); |
2627 | Init = |
2628 | Builder.createNaryOp(Opcode: AddOp, Operands: {SplatStart, Init}, Flags, DL: {}, Name: "induction" ); |
2629 | |
2630 | // Create the widened phi of the vector IV. |
2631 | auto *WidePHI = new VPWidenPHIRecipe(WidenIVR->getPHINode(), nullptr, |
2632 | WidenIVR->getDebugLoc(), "vec.ind" ); |
2633 | WidePHI->addOperand(Operand: Init); |
2634 | WidePHI->insertBefore(InsertPos: WidenIVR); |
2635 | |
2636 | // Create the backedge value for the vector IV. |
2637 | VPValue *Inc; |
2638 | VPValue *Prev; |
2639 | // If unrolled, use the increment and prev value from the operands. |
2640 | if (auto *SplatVF = WidenIVR->getSplatVFValue()) { |
2641 | Inc = SplatVF; |
2642 | Prev = WidenIVR->getLastUnrolledPartOperand(); |
2643 | } else { |
2644 | if (VPRecipeBase *R = VF->getDefiningRecipe()) |
2645 | Builder.setInsertPoint(TheBB: R->getParent(), IP: std::next(x: R->getIterator())); |
2646 | // Multiply the vectorization factor by the step using integer or |
2647 | // floating-point arithmetic as appropriate. |
2648 | if (StepTy->isFloatingPointTy()) |
2649 | VF = Builder.createScalarCast(Opcode: Instruction::CastOps::UIToFP, Op: VF, ResultTy: StepTy, |
2650 | DL); |
2651 | else |
2652 | VF = Builder.createScalarZExtOrTrunc(Op: VF, ResultTy: StepTy, |
2653 | SrcTy: TypeInfo.inferScalarType(V: VF), DL); |
2654 | |
2655 | Inc = Builder.createNaryOp(Opcode: MulOp, Operands: {Step, VF}, Flags); |
2656 | Inc = Builder.createNaryOp(Opcode: VPInstruction::Broadcast, Operands: Inc); |
2657 | Prev = WidePHI; |
2658 | } |
2659 | |
2660 | VPBasicBlock *ExitingBB = Plan->getVectorLoopRegion()->getExitingBasicBlock(); |
2661 | Builder.setInsertPoint(TheBB: ExitingBB, IP: ExitingBB->getTerminator()->getIterator()); |
2662 | auto *Next = Builder.createNaryOp(Opcode: AddOp, Operands: {Prev, Inc}, Flags, |
2663 | DL: WidenIVR->getDebugLoc(), Name: "vec.ind.next" ); |
2664 | |
2665 | WidePHI->addOperand(Operand: Next); |
2666 | |
2667 | WidenIVR->replaceAllUsesWith(New: WidePHI); |
2668 | } |
2669 | |
2670 | void VPlanTransforms::dissolveLoopRegions(VPlan &Plan) { |
// Replace loop regions with explicit CFG.
2672 | SmallVector<VPRegionBlock *> LoopRegions; |
2673 | for (VPRegionBlock *R : VPBlockUtils::blocksOnly<VPRegionBlock>( |
2674 | Range: vp_depth_first_deep(G: Plan.getEntry()))) { |
2675 | if (!R->isReplicator()) |
2676 | LoopRegions.push_back(Elt: R); |
2677 | } |
2678 | for (VPRegionBlock *R : LoopRegions) |
2679 | R->dissolveToCFGLoop(); |
2680 | } |
2681 | |
2682 | void VPlanTransforms::convertToConcreteRecipes(VPlan &Plan, |
2683 | Type &CanonicalIVTy) { |
2684 | using namespace llvm::VPlanPatternMatch; |
2685 | |
2686 | VPTypeAnalysis TypeInfo(&CanonicalIVTy); |
2687 | SmallVector<VPRecipeBase *> ToRemove; |
2688 | for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>( |
2689 | Range: vp_depth_first_deep(G: Plan.getEntry()))) { |
2690 | for (VPRecipeBase &R : make_early_inc_range(Range&: *VPBB)) { |
2691 | if (auto *PhiR = dyn_cast<VPEVLBasedIVPHIRecipe>(Val: &R)) { |
2692 | auto *ScalarR = VPBuilder(PhiR).createScalarPhi( |
2693 | IncomingValues: {PhiR->getStartValue(), PhiR->getBackedgeValue()}, |
2694 | DL: PhiR->getDebugLoc(), Name: "evl.based.iv" ); |
2695 | PhiR->replaceAllUsesWith(New: ScalarR); |
2696 | ToRemove.push_back(Elt: PhiR); |
2697 | continue; |
2698 | } |
2699 | |
2700 | if (auto *WidenIVR = dyn_cast<VPWidenIntOrFpInductionRecipe>(Val: &R)) { |
2701 | expandVPWidenIntOrFpInduction(WidenIVR, TypeInfo); |
2702 | ToRemove.push_back(Elt: WidenIVR); |
2703 | continue; |
2704 | } |
2705 | |
2706 | if (auto *Expr = dyn_cast<VPExpressionRecipe>(Val: &R)) { |
2707 | Expr->decompose(); |
2708 | ToRemove.push_back(Elt: Expr); |
2709 | } |
2710 | |
2711 | VPValue *VectorStep; |
2712 | VPValue *ScalarStep; |
2713 | if (!match(V: &R, P: m_VPInstruction<VPInstruction::WideIVStep>( |
2714 | Op0: m_VPValue(V&: VectorStep), Op1: m_VPValue(V&: ScalarStep)))) |
2715 | continue; |
2716 | |
2717 | // Expand WideIVStep. |
2718 | auto *VPI = cast<VPInstruction>(Val: &R); |
2719 | VPBuilder Builder(VPI); |
2720 | Type *IVTy = TypeInfo.inferScalarType(V: VPI); |
2721 | if (TypeInfo.inferScalarType(V: VectorStep) != IVTy) { |
2722 | Instruction::CastOps CastOp = IVTy->isFloatingPointTy() |
2723 | ? Instruction::UIToFP |
2724 | : Instruction::Trunc; |
2725 | VectorStep = Builder.createWidenCast(Opcode: CastOp, Op: VectorStep, ResultTy: IVTy); |
2726 | } |
2727 | |
2728 | [[maybe_unused]] auto *ConstStep = |
2729 | ScalarStep->isLiveIn() |
2730 | ? dyn_cast<ConstantInt>(Val: ScalarStep->getLiveInIRValue()) |
2731 | : nullptr; |
2732 | assert(!ConstStep || ConstStep->getValue() != 1); |
2733 | (void)ConstStep; |
2734 | if (TypeInfo.inferScalarType(V: ScalarStep) != IVTy) { |
2735 | ScalarStep = |
2736 | Builder.createWidenCast(Opcode: Instruction::Trunc, Op: ScalarStep, ResultTy: IVTy); |
2737 | } |
2738 | |
2739 | VPIRFlags Flags; |
2740 | if (IVTy->isFloatingPointTy()) |
2741 | Flags = {VPI->getFastMathFlags()}; |
2742 | |
2743 | unsigned MulOpc = |
2744 | IVTy->isFloatingPointTy() ? Instruction::FMul : Instruction::Mul; |
2745 | VPInstruction *Mul = Builder.createNaryOp( |
2746 | Opcode: MulOpc, Operands: {VectorStep, ScalarStep}, Flags, DL: R.getDebugLoc()); |
2747 | VectorStep = Mul; |
2748 | VPI->replaceAllUsesWith(New: VectorStep); |
2749 | ToRemove.push_back(Elt: VPI); |
2750 | } |
2751 | } |
2752 | |
2753 | for (VPRecipeBase *R : ToRemove) |
2754 | R->eraseFromParent(); |
2755 | } |
2756 | |
2757 | void VPlanTransforms::handleUncountableEarlyExit( |
2758 | VPBasicBlock *EarlyExitingVPBB, VPBasicBlock *EarlyExitVPBB, VPlan &Plan, |
2759 | VPBasicBlock *, VPBasicBlock *LatchVPBB, VFRange &Range) { |
2760 | using namespace llvm::VPlanPatternMatch; |
2761 | |
2762 | VPBlockBase *MiddleVPBB = LatchVPBB->getSuccessors()[0]; |
2763 | if (!EarlyExitVPBB->getSinglePredecessor() && |
2764 | EarlyExitVPBB->getPredecessors()[1] == MiddleVPBB) { |
2765 | assert(EarlyExitVPBB->getNumPredecessors() == 2 && |
2766 | EarlyExitVPBB->getPredecessors()[0] == EarlyExitingVPBB && |
2767 | "unsupported early exit VPBB" ); |
2768 | // Early exit operand should always be last phi operand. If EarlyExitVPBB |
2769 | // has two predecessors and EarlyExitingVPBB is the first, swap the operands |
2770 | // of the phis. |
2771 | for (VPRecipeBase &R : EarlyExitVPBB->phis()) |
2772 | cast<VPIRPhi>(Val: &R)->swapOperands(); |
2773 | } |
2774 | |
2775 | VPBuilder Builder(LatchVPBB->getTerminator()); |
2776 | VPBlockBase *TrueSucc = EarlyExitingVPBB->getSuccessors()[0]; |
2777 | assert( |
2778 | match(EarlyExitingVPBB->getTerminator(), m_BranchOnCond(m_VPValue())) && |
2779 | "Terminator must be be BranchOnCond" ); |
2780 | VPValue *CondOfEarlyExitingVPBB = |
2781 | EarlyExitingVPBB->getTerminator()->getOperand(N: 0); |
2782 | auto *CondToEarlyExit = TrueSucc == EarlyExitVPBB |
2783 | ? CondOfEarlyExitingVPBB |
2784 | : Builder.createNot(Operand: CondOfEarlyExitingVPBB); |
2785 | |
2786 | // Split the middle block and have it conditionally branch to the early exit |
2787 | // block if CondToEarlyExit. |
2788 | VPValue *IsEarlyExitTaken = |
2789 | Builder.createNaryOp(Opcode: VPInstruction::AnyOf, Operands: {CondToEarlyExit}); |
2790 | VPBasicBlock *NewMiddle = Plan.createVPBasicBlock(Name: "middle.split" ); |
2791 | VPBasicBlock *VectorEarlyExitVPBB = |
2792 | Plan.createVPBasicBlock(Name: "vector.early.exit" ); |
2793 | VPBlockUtils::insertOnEdge(From: LatchVPBB, To: MiddleVPBB, BlockPtr: NewMiddle); |
2794 | VPBlockUtils::connectBlocks(From: NewMiddle, To: VectorEarlyExitVPBB); |
2795 | NewMiddle->swapSuccessors(); |
2796 | |
2797 | VPBlockUtils::connectBlocks(From: VectorEarlyExitVPBB, To: EarlyExitVPBB); |
2798 | |
2799 | // Update the exit phis in the early exit block. |
2800 | VPBuilder MiddleBuilder(NewMiddle); |
2801 | VPBuilder EarlyExitB(VectorEarlyExitVPBB); |
2802 | for (VPRecipeBase &R : EarlyExitVPBB->phis()) { |
2803 | auto *ExitIRI = cast<VPIRPhi>(Val: &R); |
2804 | // Early exit operand should always be last, i.e., 0 if EarlyExitVPBB has |
2805 | // a single predecessor and 1 if it has two. |
2806 | unsigned EarlyExitIdx = ExitIRI->getNumOperands() - 1; |
2807 | if (ExitIRI->getNumOperands() != 1) { |
2808 | // The first of two operands corresponds to the latch exit, via MiddleVPBB |
2809 | // predecessor. Extract its last lane. |
2810 | ExitIRI->extractLastLaneOfFirstOperand(Builder&: MiddleBuilder); |
2811 | } |
2812 | |
2813 | VPValue *IncomingFromEarlyExit = ExitIRI->getOperand(N: EarlyExitIdx); |
2814 | auto IsVector = [](ElementCount VF) { return VF.isVector(); }; |
// When the VFs are vectors, an extract is needed to get the incoming value
// from the early exit. When the range contains a scalar VF, clamp the range
// to the scalar VF to prevent mis-compilation for a range containing both
// scalar and vector VFs.
2819 | if (!IncomingFromEarlyExit->isLiveIn() && |
2820 | LoopVectorizationPlanner::getDecisionAndClampRange(Predicate: IsVector, Range)) { |
2821 | // Update the incoming value from the early exit. |
2822 | VPValue *FirstActiveLane = EarlyExitB.createNaryOp( |
2823 | Opcode: VPInstruction::FirstActiveLane, Operands: {CondToEarlyExit}, Inst: nullptr, |
2824 | Name: "first.active.lane" ); |
2825 | IncomingFromEarlyExit = EarlyExitB.createNaryOp( |
2826 | Opcode: Instruction::ExtractElement, Operands: {IncomingFromEarlyExit, FirstActiveLane}, |
2827 | Inst: nullptr, Name: "early.exit.value" ); |
2828 | ExitIRI->setOperand(I: EarlyExitIdx, New: IncomingFromEarlyExit); |
2829 | } |
2830 | } |
2831 | MiddleBuilder.createNaryOp(Opcode: VPInstruction::BranchOnCond, Operands: {IsEarlyExitTaken}); |
2832 | |
2833 | // Replace the condition controlling the non-early exit from the vector loop |
2834 | // with one exiting if either the original condition of the vector latch is |
2835 | // true or the early exit has been taken. |
2836 | auto *LatchExitingBranch = cast<VPInstruction>(Val: LatchVPBB->getTerminator()); |
2837 | assert(LatchExitingBranch->getOpcode() == VPInstruction::BranchOnCount && |
2838 | "Unexpected terminator" ); |
2839 | auto *IsLatchExitTaken = |
2840 | Builder.createICmp(Pred: CmpInst::ICMP_EQ, A: LatchExitingBranch->getOperand(N: 0), |
2841 | B: LatchExitingBranch->getOperand(N: 1)); |
2842 | auto *AnyExitTaken = Builder.createNaryOp( |
2843 | Opcode: Instruction::Or, Operands: {IsEarlyExitTaken, IsLatchExitTaken}); |
2844 | Builder.createNaryOp(Opcode: VPInstruction::BranchOnCond, Operands: AnyExitTaken); |
2845 | LatchExitingBranch->eraseFromParent(); |
2846 | } |
2847 | |
/// This function tries to convert extended in-loop reductions to
2849 | /// VPExpressionRecipe and clamp the \p Range if it is beneficial and |
2850 | /// valid. The created recipe must be decomposed to its constituent |
2851 | /// recipes before execution. |
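/// Pattern handled (illustrative sketch):
///   reduce(ext(A)) -> VPExpressionRecipe bundling the extend and the
///   reduction, costed via TTI::getExtendedReductionCost.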
2852 | static VPExpressionRecipe * |
2853 | tryToMatchAndCreateExtendedReduction(VPReductionRecipe *Red, VPCostContext &Ctx, |
2854 | VFRange &Range) { |
2855 | using namespace VPlanPatternMatch; |
2856 | |
2857 | Type *RedTy = Ctx.Types.inferScalarType(V: Red); |
2858 | VPValue *VecOp = Red->getVecOp(); |
2859 | |
2860 | // Clamp the range if using extended-reduction is profitable. |
2861 | auto IsExtendedRedValidAndClampRange = [&](unsigned Opcode, bool isZExt, |
2862 | Type *SrcTy) -> bool { |
2863 | return LoopVectorizationPlanner::getDecisionAndClampRange( |
2864 | Predicate: [&](ElementCount VF) { |
2865 | auto *SrcVecTy = cast<VectorType>(Val: toVectorTy(Scalar: SrcTy, EC: VF)); |
2866 | TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; |
2867 | InstructionCost ExtRedCost = Ctx.TTI.getExtendedReductionCost( |
2868 | Opcode, IsUnsigned: isZExt, ResTy: RedTy, Ty: SrcVecTy, FMF: Red->getFastMathFlags(), |
2869 | CostKind); |
2870 | InstructionCost ExtCost = |
2871 | cast<VPWidenCastRecipe>(Val: VecOp)->computeCost(VF, Ctx); |
2872 | InstructionCost RedCost = Red->computeCost(VF, Ctx); |
2873 | return ExtRedCost.isValid() && ExtRedCost < ExtCost + RedCost; |
2874 | }, |
2875 | Range); |
2876 | }; |
2877 | |
2878 | VPValue *A; |
// Match reduce(ext).
2880 | if (match(V: VecOp, P: m_ZExtOrSExt(Op0: m_VPValue(V&: A))) && |
2881 | IsExtendedRedValidAndClampRange( |
2882 | RecurrenceDescriptor::getOpcode(Kind: Red->getRecurrenceKind()), |
2883 | cast<VPWidenCastRecipe>(Val: VecOp)->getOpcode() == |
2884 | Instruction::CastOps::ZExt, |
2885 | Ctx.Types.inferScalarType(V: A))) |
2886 | return new VPExpressionRecipe(cast<VPWidenCastRecipe>(Val: VecOp), Red); |
2887 | |
2888 | return nullptr; |
2889 | } |
2890 | |
/// This function tries to convert extended in-loop reductions to
2892 | /// VPExpressionRecipe and clamp the \p Range if it is beneficial |
2893 | /// and valid. The created VPExpressionRecipe must be decomposed to its |
2894 | /// constituent recipes before execution. Patterns of the |
2895 | /// VPExpressionRecipe: |
2896 | /// reduce.add(mul(...)), |
2897 | /// reduce.add(mul(ext(A), ext(B))), |
2898 | /// reduce.add(ext(mul(ext(A), ext(B)))). |
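/// For the last pattern the outer extend is folded away (illustrative
/// sketch):
///   reduce.add(ext(mul(ext(A), ext(B))))
///     -> reduce.add(mul(ext'(A), ext'(B)))
/// where ext' extends A and B directly to the outer extend's result type.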
2899 | static VPExpressionRecipe * |
2900 | tryToMatchAndCreateMulAccumulateReduction(VPReductionRecipe *Red, |
2901 | VPCostContext &Ctx, VFRange &Range) { |
2902 | using namespace VPlanPatternMatch; |
2903 | |
2904 | unsigned Opcode = RecurrenceDescriptor::getOpcode(Kind: Red->getRecurrenceKind()); |
2905 | if (Opcode != Instruction::Add) |
2906 | return nullptr; |
2907 | |
2908 | Type *RedTy = Ctx.Types.inferScalarType(V: Red); |
2909 | |
2910 | // Clamp the range if using multiply-accumulate-reduction is profitable. |
2911 | auto IsMulAccValidAndClampRange = |
2912 | [&](bool isZExt, VPWidenRecipe *Mul, VPWidenCastRecipe *Ext0, |
2913 | VPWidenCastRecipe *Ext1, VPWidenCastRecipe *OuterExt) -> bool { |
2914 | return LoopVectorizationPlanner::getDecisionAndClampRange( |
2915 | Predicate: [&](ElementCount VF) { |
2916 | TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; |
2917 | Type *SrcTy = |
2918 | Ext0 ? Ctx.Types.inferScalarType(V: Ext0->getOperand(N: 0)) : RedTy; |
2919 | auto *SrcVecTy = cast<VectorType>(Val: toVectorTy(Scalar: SrcTy, EC: VF)); |
2920 | InstructionCost MulAccCost = |
2921 | Ctx.TTI.getMulAccReductionCost(IsUnsigned: isZExt, ResTy: RedTy, Ty: SrcVecTy, CostKind); |
2922 | InstructionCost MulCost = Mul->computeCost(VF, Ctx); |
2923 | InstructionCost RedCost = Red->computeCost(VF, Ctx); |
2924 | InstructionCost ExtCost = 0; |
2925 | if (Ext0) |
2926 | ExtCost += Ext0->computeCost(VF, Ctx); |
2927 | if (Ext1) |
2928 | ExtCost += Ext1->computeCost(VF, Ctx); |
2929 | if (OuterExt) |
2930 | ExtCost += OuterExt->computeCost(VF, Ctx); |
2931 | |
2932 | return MulAccCost.isValid() && |
2933 | MulAccCost < ExtCost + MulCost + RedCost; |
2934 | }, |
2935 | Range); |
2936 | }; |
2937 | |
2938 | VPValue *VecOp = Red->getVecOp(); |
2939 | VPValue *A, *B; |
2940 | // Try to match reduce.add(mul(...)). |
2941 | if (match(V: VecOp, P: m_Mul(Op0: m_VPValue(V&: A), Op1: m_VPValue(V&: B)))) { |
2942 | auto *RecipeA = |
2943 | dyn_cast_if_present<VPWidenCastRecipe>(Val: A->getDefiningRecipe()); |
2944 | auto *RecipeB = |
2945 | dyn_cast_if_present<VPWidenCastRecipe>(Val: B->getDefiningRecipe()); |
2946 | auto *Mul = cast<VPWidenRecipe>(Val: VecOp->getDefiningRecipe()); |
2947 | |
2948 | // Match reduce.add(mul(ext, ext)). |
2949 | if (RecipeA && RecipeB && |
2950 | (RecipeA->getOpcode() == RecipeB->getOpcode() || A == B) && |
2951 | match(V: RecipeA, P: m_ZExtOrSExt(Op0: m_VPValue())) && |
2952 | match(V: RecipeB, P: m_ZExtOrSExt(Op0: m_VPValue())) && |
2953 | IsMulAccValidAndClampRange(RecipeA->getOpcode() == |
2954 | Instruction::CastOps::ZExt, |
2955 | Mul, RecipeA, RecipeB, nullptr)) { |
2956 | return new VPExpressionRecipe(RecipeA, RecipeB, Mul, Red); |
2957 | } |
2958 | // Match reduce.add(mul). |
2959 | if (IsMulAccValidAndClampRange(true, Mul, nullptr, nullptr, nullptr)) |
2960 | return new VPExpressionRecipe(Mul, Red); |
2961 | } |
2962 | // Match reduce.add(ext(mul(ext(A), ext(B)))). |
// All extend recipes must have the same opcode, or A == B, in which case the
// expression can be transformed to reduce.add(zext(mul(sext(A), sext(B)))).
2965 | if (match(V: VecOp, P: m_ZExtOrSExt(Op0: m_Mul(Op0: m_ZExtOrSExt(Op0: m_VPValue()), |
2966 | Op1: m_ZExtOrSExt(Op0: m_VPValue()))))) { |
2967 | auto *Ext = cast<VPWidenCastRecipe>(Val: VecOp->getDefiningRecipe()); |
2968 | auto *Mul = cast<VPWidenRecipe>(Val: Ext->getOperand(N: 0)->getDefiningRecipe()); |
2969 | auto *Ext0 = |
2970 | cast<VPWidenCastRecipe>(Val: Mul->getOperand(N: 0)->getDefiningRecipe()); |
2971 | auto *Ext1 = |
2972 | cast<VPWidenCastRecipe>(Val: Mul->getOperand(N: 1)->getDefiningRecipe()); |
2973 | if ((Ext->getOpcode() == Ext0->getOpcode() || Ext0 == Ext1) && |
2974 | Ext0->getOpcode() == Ext1->getOpcode() && |
2975 | IsMulAccValidAndClampRange(Ext0->getOpcode() == |
2976 | Instruction::CastOps::ZExt, |
2977 | Mul, Ext0, Ext1, Ext)) { |
2978 | auto *NewExt0 = new VPWidenCastRecipe( |
2979 | Ext0->getOpcode(), Ext0->getOperand(N: 0), Ext->getResultType(), *Ext0, |
2980 | Ext0->getDebugLoc()); |
2981 | NewExt0->insertBefore(InsertPos: Ext0); |
2982 | |
2983 | VPWidenCastRecipe *NewExt1 = NewExt0; |
2984 | if (Ext0 != Ext1) { |
2985 | NewExt1 = new VPWidenCastRecipe(Ext1->getOpcode(), Ext1->getOperand(N: 0), |
2986 | Ext->getResultType(), *Ext1, |
2987 | Ext1->getDebugLoc()); |
2988 | NewExt1->insertBefore(InsertPos: Ext1); |
2989 | } |
2990 | Mul->setOperand(I: 0, New: NewExt0); |
2991 | Mul->setOperand(I: 1, New: NewExt1); |
2992 | Red->setOperand(I: 1, New: Mul); |
2993 | return new VPExpressionRecipe(NewExt0, NewExt1, Mul, Red); |
2994 | } |
2995 | } |
2996 | return nullptr; |
2997 | } |
2998 | |
/// This function tries to create abstract recipes from the reduction recipe
/// for subsequent optimizations and cost estimation.
3001 | static void tryToCreateAbstractReductionRecipe(VPReductionRecipe *Red, |
3002 | VPCostContext &Ctx, |
3003 | VFRange &Range) { |
3004 | VPExpressionRecipe *AbstractR = nullptr; |
3005 | auto IP = std::next(x: Red->getIterator()); |
3006 | auto *VPBB = Red->getParent(); |
3007 | if (auto *MulAcc = tryToMatchAndCreateMulAccumulateReduction(Red, Ctx, Range)) |
3008 | AbstractR = MulAcc; |
3009 | else if (auto *ExtRed = tryToMatchAndCreateExtendedReduction(Red, Ctx, Range)) |
3010 | AbstractR = ExtRed; |
3011 | // Cannot create abstract inloop reduction recipes. |
3012 | if (!AbstractR) |
3013 | return; |
3014 | |
3015 | AbstractR->insertBefore(BB&: *VPBB, IP); |
3016 | Red->replaceAllUsesWith(New: AbstractR); |
3017 | } |
3018 | |
3019 | void VPlanTransforms::convertToAbstractRecipes(VPlan &Plan, VPCostContext &Ctx, |
3020 | VFRange &Range) { |
3021 | for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>( |
3022 | Range: vp_depth_first_deep(G: Plan.getVectorLoopRegion()))) { |
3023 | for (VPRecipeBase &R : make_early_inc_range(Range&: *VPBB)) { |
3024 | if (auto *Red = dyn_cast<VPReductionRecipe>(Val: &R)) |
3025 | tryToCreateAbstractReductionRecipe(Red, Ctx, Range); |
3026 | } |
3027 | } |
3028 | } |
3029 | |
3030 | void VPlanTransforms::materializeBroadcasts(VPlan &Plan) { |
3031 | if (Plan.hasScalarVFOnly()) |
3032 | return; |
3033 | |
3034 | #ifndef NDEBUG |
3035 | VPDominatorTree VPDT; |
3036 | VPDT.recalculate(Plan); |
3037 | #endif |
3038 | |
3039 | SmallVector<VPValue *> VPValues; |
3040 | if (Plan.getOrCreateBackedgeTakenCount()->getNumUsers() > 0) |
3041 | VPValues.push_back(Elt: Plan.getOrCreateBackedgeTakenCount()); |
3042 | append_range(C&: VPValues, R: Plan.getLiveIns()); |
3043 | for (VPRecipeBase &R : *Plan.getEntry()) |
3044 | append_range(C&: VPValues, R: R.definedValues()); |
3045 | |
auto *VectorPreheader = Plan.getVectorPreheader();
3047 | for (VPValue *VPV : VPValues) { |
3048 | if (all_of(Range: VPV->users(), |
3049 | P: [VPV](VPUser *U) { return U->usesScalars(Op: VPV); }) || |
3050 | (VPV->isLiveIn() && VPV->getLiveInIRValue() && |
3051 | isa<Constant>(Val: VPV->getLiveInIRValue()))) |
3052 | continue; |
3053 | |
3054 | // Add explicit broadcast at the insert point that dominates all users. |
3055 | VPBasicBlock *HoistBlock = VectorPreheader; |
3056 | VPBasicBlock::iterator HoistPoint = VectorPreheader->end(); |
3057 | for (VPUser *User : VPV->users()) { |
3058 | if (User->usesScalars(Op: VPV)) |
3059 | continue; |
3060 | if (cast<VPRecipeBase>(Val: User)->getParent() == VectorPreheader) |
3061 | HoistPoint = HoistBlock->begin(); |
3062 | else |
3063 | assert(VPDT.dominates(VectorPreheader, |
3064 | cast<VPRecipeBase>(User)->getParent()) && |
3065 | "All users must be in the vector preheader or dominated by it" ); |
3066 | } |
3067 | |
3068 | VPBuilder Builder(cast<VPBasicBlock>(Val: HoistBlock), HoistPoint); |
3069 | auto *Broadcast = Builder.createNaryOp(Opcode: VPInstruction::Broadcast, Operands: {VPV}); |
3070 | VPV->replaceUsesWithIf(New: Broadcast, |
3071 | ShouldReplace: [VPV, Broadcast](VPUser &U, unsigned Idx) { |
3072 | return Broadcast != &U && !U.usesScalars(Op: VPV); |
3073 | }); |
3074 | } |
3075 | } |
3076 | |
/// Returns true if \p OpV is a VPWidenLoadRecipe or VPInterleaveRecipe that
/// can be converted to a narrower recipe. \p OpV is used by a wide recipe
/// \p WideMember that feeds a store interleave group at index \p Idx;
/// \p WideMember0 is the recipe feeding the same interleave group at index 0.
/// A VPWidenLoadRecipe can be narrowed to an index-independent load if it
/// feeds all wide ops at all indices (\p OpV must be the operand at index
/// \p OpIdx for both the recipe at lane 0, \p WideMember0, and \p WideMember).
/// A VPInterleaveRecipe can be narrowed to a wide load, if \p OpV is defined
/// at \p Idx of a load interleave group.
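/// For example (illustrative): an unmasked VPWidenLoadRecipe that is operand
/// \p OpIdx of both \p WideMember0 and \p WideMember satisfies the check and
/// can later be narrowed to a single uniform scalar load.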
3086 | static bool canNarrowLoad(VPWidenRecipe *WideMember0, VPWidenRecipe *WideMember, |
3087 | unsigned OpIdx, VPValue *OpV, unsigned Idx) { |
3088 | auto *DefR = OpV->getDefiningRecipe(); |
3089 | if (!DefR) |
3090 | return WideMember0->getOperand(N: OpIdx) == OpV; |
3091 | if (auto *W = dyn_cast<VPWidenLoadRecipe>(Val: DefR)) |
3092 | return !W->getMask() && WideMember0->getOperand(N: OpIdx) == OpV; |
3093 | |
3094 | if (auto *IR = dyn_cast<VPInterleaveRecipe>(Val: DefR)) |
3095 | return IR->getInterleaveGroup()->getFactor() == |
3096 | IR->getInterleaveGroup()->getNumMembers() && |
3097 | IR->getVPValue(I: Idx) == OpV; |
3098 | return false; |
3099 | } |
3100 | |
/// Returns true if \p InterleaveR is a full interleave group with factor and
/// number of members both equal to \p VF. The interleave group must also
/// access the full vector width \p VectorRegWidth.
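/// For example (illustrative): with VF = 4 and i32 members, a group of 4
/// members has GroupSize = 32 * 4 = 128 bits and is only accepted if
/// \p VectorRegWidth is also 128.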
3104 | static bool isConsecutiveInterleaveGroup(VPInterleaveRecipe *InterleaveR, |
3105 | unsigned VF, VPTypeAnalysis &TypeInfo, |
3106 | unsigned VectorRegWidth) { |
3107 | if (!InterleaveR) |
3108 | return false; |
3109 | |
3110 | Type *GroupElementTy = nullptr; |
3111 | if (InterleaveR->getStoredValues().empty()) { |
3112 | GroupElementTy = TypeInfo.inferScalarType(V: InterleaveR->getVPValue(I: 0)); |
3113 | if (!all_of(Range: InterleaveR->definedValues(), |
3114 | P: [&TypeInfo, GroupElementTy](VPValue *Op) { |
3115 | return TypeInfo.inferScalarType(V: Op) == GroupElementTy; |
3116 | })) |
3117 | return false; |
3118 | } else { |
3119 | GroupElementTy = |
3120 | TypeInfo.inferScalarType(V: InterleaveR->getStoredValues()[0]); |
3121 | if (!all_of(Range: InterleaveR->getStoredValues(), |
3122 | P: [&TypeInfo, GroupElementTy](VPValue *Op) { |
3123 | return TypeInfo.inferScalarType(V: Op) == GroupElementTy; |
3124 | })) |
3125 | return false; |
3126 | } |
3127 | |
3128 | unsigned GroupSize = GroupElementTy->getScalarSizeInBits() * VF; |
3129 | auto IG = InterleaveR->getInterleaveGroup(); |
3130 | return IG->getFactor() == VF && IG->getNumMembers() == VF && |
3131 | GroupSize == VectorRegWidth; |
3132 | } |
3133 | |
/// Returns true if \p VPV is a narrow VPValue.
3135 | static bool isAlreadyNarrow(VPValue *VPV) { return VPV->isLiveIn(); } |
3136 | |
3137 | void VPlanTransforms::narrowInterleaveGroups(VPlan &Plan, ElementCount VF, |
3138 | unsigned VectorRegWidth) { |
3139 | using namespace llvm::VPlanPatternMatch; |
3140 | VPRegionBlock *VectorLoop = Plan.getVectorLoopRegion(); |
3141 | if (VF.isScalable() || !VectorLoop) |
3142 | return; |
3143 | |
3144 | VPCanonicalIVPHIRecipe *CanonicalIV = Plan.getCanonicalIV(); |
3145 | Type *CanonicalIVType = CanonicalIV->getScalarType(); |
3146 | VPTypeAnalysis TypeInfo(CanonicalIVType); |
3147 | |
3148 | unsigned FixedVF = VF.getFixedValue(); |
3149 | SmallVector<VPInterleaveRecipe *> StoreGroups; |
3150 | for (auto &R : *VectorLoop->getEntryBasicBlock()) { |
3151 | if (isa<VPCanonicalIVPHIRecipe>(Val: &R) || |
3152 | match(V: &R, P: m_BranchOnCount(Op0: m_VPValue(), Op1: m_VPValue()))) |
3153 | continue; |
3154 | |
3155 | // Bail out on recipes not supported at the moment: |
3156 | // * phi recipes other than the canonical induction |
3157 | // * recipes writing to memory except interleave groups |
3158 | // Only support plans with a canonical induction phi. |
3159 | if (R.isPhi()) |
3160 | return; |
3161 | |
3162 | auto *InterleaveR = dyn_cast<VPInterleaveRecipe>(Val: &R); |
3163 | if (R.mayWriteToMemory() && !InterleaveR) |
3164 | return; |
3165 | |
3166 | // Do not narrow interleave groups if there are VectorPointer recipes and |
3167 | // the plan was unrolled. The recipe implicitly uses VF from |
3168 | // VPTransformState. |
3169 | // TODO: Remove restriction once the VF for the VectorPointer offset is |
3170 | // modeled explicitly as operand. |
3171 | if (isa<VPVectorPointerRecipe>(Val: &R) && Plan.getUF() > 1) |
3172 | return; |
3173 | |
3174 | // All other ops are allowed, but we reject uses that cannot be converted |
3175 | // when checking all allowed consumers (store interleave groups) below. |
3176 | if (!InterleaveR) |
3177 | continue; |
3178 | |
3179 | // Bail out on non-consecutive interleave groups. |
3180 | if (!isConsecutiveInterleaveGroup(InterleaveR, VF: FixedVF, TypeInfo, |
3181 | VectorRegWidth)) |
3182 | return; |
3183 | |
3184 | // Skip read interleave groups. |
3185 | if (InterleaveR->getStoredValues().empty()) |
3186 | continue; |
3187 | |
3188 | // Narrow interleave groups, if all operands are already matching narrow |
3189 | // ops. |
3190 | auto *Member0 = InterleaveR->getStoredValues()[0]; |
3191 | if (isAlreadyNarrow(VPV: Member0) && |
3192 | all_of(Range: InterleaveR->getStoredValues(), |
3193 | P: [Member0](VPValue *VPV) { return Member0 == VPV; })) { |
3194 | StoreGroups.push_back(Elt: InterleaveR); |
3195 | continue; |
3196 | } |
3197 | |
3198 | // For now, we only support full interleave groups storing load interleave |
3199 | // groups. |
3200 | if (all_of(Range: enumerate(First: InterleaveR->getStoredValues()), P: [](auto Op) { |
3201 | VPRecipeBase *DefR = Op.value()->getDefiningRecipe(); |
3202 | if (!DefR) |
3203 | return false; |
3204 | auto *IR = dyn_cast<VPInterleaveRecipe>(Val: DefR); |
3205 | return IR && |
3206 | IR->getInterleaveGroup()->getFactor() == |
3207 | IR->getInterleaveGroup()->getNumMembers() && |
3208 | IR->getVPValue(Op.index()) == Op.value(); |
3209 | })) { |
3210 | StoreGroups.push_back(Elt: InterleaveR); |
3211 | continue; |
3212 | } |
3213 | |
// Check if all values feeding InterleaveR are matching wide recipes whose
// operands can be narrowed.
3216 | auto *WideMember0 = dyn_cast_or_null<VPWidenRecipe>( |
3217 | Val: InterleaveR->getStoredValues()[0]->getDefiningRecipe()); |
3218 | if (!WideMember0) |
3219 | return; |
3220 | for (const auto &[I, V] : enumerate(First: InterleaveR->getStoredValues())) { |
3221 | auto *R = dyn_cast_or_null<VPWidenRecipe>(Val: V->getDefiningRecipe()); |
3222 | if (!R || R->getOpcode() != WideMember0->getOpcode() || |
3223 | R->getNumOperands() > 2) |
3224 | return; |
3225 | if (any_of(Range: enumerate(First: R->operands()), |
3226 | P: [WideMember0, Idx = I, R](const auto &P) { |
3227 | const auto &[OpIdx, OpV] = P; |
3228 | return !canNarrowLoad(WideMember0, R, OpIdx, OpV, Idx); |
3229 | })) |
3230 | return; |
3231 | } |
3232 | StoreGroups.push_back(Elt: InterleaveR); |
3233 | } |
3234 | |
3235 | if (StoreGroups.empty()) |
3236 | return; |
3237 | |
// Narrow the operand feeding a store group: a load interleave group becomes a
// single wide load, and a wide load becomes a uniform scalar load.
3239 | auto NarrowOp = [](VPValue *V) -> VPValue * { |
3240 | auto *R = V->getDefiningRecipe(); |
3241 | if (!R) |
3242 | return V; |
3243 | if (auto *LoadGroup = dyn_cast<VPInterleaveRecipe>(Val: R)) { |
3244 | // Narrow interleave group to wide load, as transformed VPlan will only |
3245 | // process one original iteration. |
3246 | auto *L = new VPWidenLoadRecipe( |
3247 | *cast<LoadInst>(Val: LoadGroup->getInterleaveGroup()->getInsertPos()), |
3248 | LoadGroup->getAddr(), LoadGroup->getMask(), /*Consecutive=*/true, |
3249 | /*Reverse=*/false, {}, LoadGroup->getDebugLoc()); |
3250 | L->insertBefore(InsertPos: LoadGroup); |
3251 | return L; |
3252 | } |
3253 | |
3254 | auto *WideLoad = cast<VPWidenLoadRecipe>(Val: R); |
3255 | |
3256 | // Narrow wide load to uniform scalar load, as transformed VPlan will only |
3257 | // process one original iteration. |
3258 | auto *N = new VPReplicateRecipe(&WideLoad->getIngredient(), |
3259 | WideLoad->operands(), /*IsUniform*/ true, |
3260 | /*Mask*/ nullptr, *WideLoad); |
3261 | N->insertBefore(InsertPos: WideLoad); |
3262 | return N; |
3263 | }; |
3264 | |
3265 | // Narrow operation tree rooted at store groups. |
3266 | for (auto *StoreGroup : StoreGroups) { |
3267 | VPValue *Res = nullptr; |
3268 | VPValue *Member0 = StoreGroup->getStoredValues()[0]; |
3269 | if (isAlreadyNarrow(VPV: Member0)) { |
3270 | Res = Member0; |
3271 | } else if (auto *WideMember0 = |
3272 | dyn_cast<VPWidenRecipe>(Val: Member0->getDefiningRecipe())) { |
3273 | for (unsigned Idx = 0, E = WideMember0->getNumOperands(); Idx != E; ++Idx) |
3274 | WideMember0->setOperand(I: Idx, New: NarrowOp(WideMember0->getOperand(N: Idx))); |
3275 | Res = WideMember0; |
3276 | } else { |
3277 | Res = NarrowOp(Member0); |
3278 | } |
3279 | |
3280 | auto *S = new VPWidenStoreRecipe( |
3281 | *cast<StoreInst>(Val: StoreGroup->getInterleaveGroup()->getInsertPos()), |
3282 | StoreGroup->getAddr(), Res, nullptr, /*Consecutive=*/true, |
3283 | /*Reverse=*/false, {}, StoreGroup->getDebugLoc()); |
3284 | S->insertBefore(InsertPos: StoreGroup); |
3285 | StoreGroup->eraseFromParent(); |
3286 | } |
3287 | |
3288 | // Adjust induction to reflect that the transformed plan only processes one |
3289 | // original iteration. |
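// For example (illustrative): with UF = 1, the canonical IV increment becomes
// 1 and all users of the plan's VF see the constant 1, so each vector
// iteration of the narrowed plan corresponds to one original iteration.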
3290 | auto *CanIV = Plan.getCanonicalIV(); |
3291 | auto *Inc = cast<VPInstruction>(Val: CanIV->getBackedgeValue()); |
3292 | Inc->setOperand(I: 1, New: Plan.getOrAddLiveIn(V: ConstantInt::get( |
3293 | Ty: CanIV->getScalarType(), V: 1 * Plan.getUF()))); |
3294 | Plan.getVF().replaceAllUsesWith( |
3295 | New: Plan.getOrAddLiveIn(V: ConstantInt::get(Ty: CanIV->getScalarType(), V: 1))); |
3296 | removeDeadRecipes(Plan); |
3297 | } |
3298 | |
/// Add branch weight metadata if \p Plan's middle block is terminated by a
/// BranchOnCond recipe.
3301 | void VPlanTransforms::addBranchWeightToMiddleTerminator( |
3302 | VPlan &Plan, ElementCount VF, std::optional<unsigned> VScaleForTuning) { |
3303 | VPBasicBlock *MiddleVPBB = Plan.getMiddleBlock(); |
3304 | auto *MiddleTerm = |
3305 | dyn_cast_or_null<VPInstruction>(Val: MiddleVPBB->getTerminator()); |
3306 | // Only add branch metadata if there is a (conditional) terminator. |
3307 | if (!MiddleTerm) |
3308 | return; |
3309 | |
3310 | assert(MiddleTerm->getOpcode() == VPInstruction::BranchOnCond && |
3311 | "must have a BranchOnCond" ); |
// Assume that `TripCount % VectorStep` is equally distributed.
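// For example (illustrative): with UF = 2 and a fixed VF of 4, VectorStep is
// 8 and the weights added below are {1, 7}.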
3313 | unsigned VectorStep = Plan.getUF() * VF.getKnownMinValue(); |
3314 | if (VF.isScalable() && VScaleForTuning.has_value()) |
3315 | VectorStep *= *VScaleForTuning; |
3316 | assert(VectorStep > 0 && "trip count should not be zero" ); |
3317 | MDBuilder MDB(Plan.getScalarHeader()->getIRBasicBlock()->getContext()); |
3318 | MDNode *BranchWeights = |
3319 | MDB.createBranchWeights(Weights: {1, VectorStep - 1}, /*IsExpected=*/false); |
3320 | MiddleTerm->addMetadata(Kind: LLVMContext::MD_prof, Node: BranchWeights); |
3321 | } |
3322 | |