//===-- VPlanUnroll.cpp - VPlan unroller ----------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
///
/// \file
/// This file implements explicit unrolling for VPlans.
///
//===----------------------------------------------------------------------===//

#include "VPRecipeBuilder.h"
#include "VPlan.h"
#include "VPlanAnalysis.h"
#include "VPlanCFG.h"
#include "VPlanHelpers.h"
#include "VPlanPatternMatch.h"
#include "VPlanTransforms.h"
#include "VPlanUtils.h"
#include "llvm/ADT/PostOrderIterator.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/ScopeExit.h"
#include "llvm/Analysis/IVDescriptors.h"
#include "llvm/IR/Intrinsics.h"

using namespace llvm;
using namespace llvm::VPlanPatternMatch;

namespace {

/// Helper to hold state needed for unrolling. It holds the Plan to unroll by
/// UF. It also holds copies of VPValues across UF-1 unroll parts to facilitate
/// the unrolling transformation, where the original VPValues are retained for
/// part zero.
class UnrollState {
  /// Plan to unroll.
  VPlan &Plan;
  /// Unroll factor to unroll by.
  const unsigned UF;
  /// Analysis for types.
  VPTypeAnalysis TypeInfo;

  /// Unrolling may create recipes that should not be unrolled themselves.
  /// Those are tracked in ToSkip.
  SmallPtrSet<VPRecipeBase *, 8> ToSkip;

  // Associate with each VPValue of part 0 its unrolled instances of parts 1,
  // ..., UF-1.
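  // For example (illustrative), with UF = 3 a value %v defined in part 0 maps
  // to {%v.1, %v.2}, the copies created for parts 1 and 2.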
  DenseMap<VPValue *, SmallVector<VPValue *>> VPV2Parts;

  /// Unroll replicate region \p VPR by cloning the region UF - 1 times.
  void unrollReplicateRegionByUF(VPRegionBlock *VPR);

  /// Unroll recipe \p R by cloning it UF - 1 times, unless it is uniform
  /// across all parts.
  void unrollRecipeByUF(VPRecipeBase &R);

  /// Unroll header phi recipe \p R. How exactly the recipe gets unrolled
  /// depends on the concrete header phi. Inserts newly created recipes at \p
  /// InsertPtForPhi.
  void unrollHeaderPHIByUF(VPHeaderPHIRecipe *R,
                           VPBasicBlock::iterator InsertPtForPhi);

  /// Unroll a widen induction recipe \p IV. This introduces recipes to compute
  /// the induction steps for each part.
  void unrollWidenInductionByUF(VPWidenIntOrFpInductionRecipe *IV,
                                VPBasicBlock::iterator InsertPtForPhi);

  VPValue *getConstantVPV(unsigned Part) {
    Type *CanIVIntTy = Plan.getCanonicalIV()->getScalarType();
    return Plan.getOrAddLiveIn(ConstantInt::get(CanIVIntTy, Part));
  }

public:
  UnrollState(VPlan &Plan, unsigned UF, LLVMContext &Ctx)
      : Plan(Plan), UF(UF), TypeInfo(Plan.getCanonicalIV()->getScalarType()) {}

  void unrollBlock(VPBlockBase *VPB);

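  /// Return the VPValue to use for \p V in unroll part \p Part: \p V itself
  /// for part 0 and live-ins, otherwise the copy recorded for that part via
  /// addRecipeForPart.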
  VPValue *getValueForPart(VPValue *V, unsigned Part) {
    if (Part == 0 || V->isLiveIn())
      return V;
    assert((VPV2Parts.contains(V) && VPV2Parts[V].size() >= Part) &&
           "accessed value does not exist");
    return VPV2Parts[V][Part - 1];
  }

  /// Given a single original recipe \p OrigR (of part zero), and its copy \p
  /// CopyR for part \p Part, map every VPValue defined by \p OrigR to its
  /// corresponding VPValue defined by \p CopyR.
  void addRecipeForPart(VPRecipeBase *OrigR, VPRecipeBase *CopyR,
                        unsigned Part) {
    for (const auto &[Idx, VPV] : enumerate(OrigR->definedValues())) {
      auto Ins = VPV2Parts.insert({VPV, {}});
      assert(Ins.first->second.size() == Part - 1 && "earlier parts not set");
      Ins.first->second.push_back(CopyR->getVPValue(Idx));
    }
  }

  /// Given a uniform recipe \p R, add it for all parts.
  void addUniformForAllParts(VPSingleDefRecipe *R) {
    auto Ins = VPV2Parts.insert({R, {}});
    assert(Ins.second && "uniform value already added");
    for (unsigned Part = 0; Part != UF; ++Part)
      Ins.first->second.push_back(R);
  }

  bool contains(VPValue *VPV) const { return VPV2Parts.contains(VPV); }

  /// Update \p R's operand at \p OpIdx with its corresponding VPValue for part
  /// \p Part.
  void remapOperand(VPRecipeBase *R, unsigned OpIdx, unsigned Part) {
    auto *Op = R->getOperand(OpIdx);
    R->setOperand(OpIdx, getValueForPart(Op, Part));
  }

  /// Update \p R's operands with their corresponding VPValues for part
  /// \p Part.
  void remapOperands(VPRecipeBase *R, unsigned Part) {
    for (const auto &[OpIdx, Op] : enumerate(R->operands()))
      R->setOperand(OpIdx, getValueForPart(Op, Part));
  }
};
} // namespace

void UnrollState::unrollReplicateRegionByUF(VPRegionBlock *VPR) {
  VPBlockBase *InsertPt = VPR->getSingleSuccessor();
  for (unsigned Part = 1; Part != UF; ++Part) {
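    // Clone the whole replicate region for this part and insert it before the
    // original region's single successor, so the parts end up laid out in
    // order 0 .. UF-1.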
    auto *Copy = VPR->clone();
    VPBlockUtils::insertBlockBefore(Copy, InsertPt);

    auto PartI = vp_depth_first_shallow(Copy->getEntry());
    auto Part0 = vp_depth_first_shallow(VPR->getEntry());
    for (const auto &[PartIVPBB, Part0VPBB] :
         zip(VPBlockUtils::blocksOnly<VPBasicBlock>(PartI),
             VPBlockUtils::blocksOnly<VPBasicBlock>(Part0))) {
      for (const auto &[PartIR, Part0R] : zip(*PartIVPBB, *Part0VPBB)) {
        remapOperands(&PartIR, Part);
        if (auto *ScalarIVSteps = dyn_cast<VPScalarIVStepsRecipe>(&PartIR)) {
          ScalarIVSteps->addOperand(getConstantVPV(Part));
        }

        addRecipeForPart(&Part0R, &PartIR, Part);
      }
    }
  }
}

void UnrollState::unrollWidenInductionByUF(
    VPWidenIntOrFpInductionRecipe *IV, VPBasicBlock::iterator InsertPtForPhi) {
  VPBasicBlock *PH = cast<VPBasicBlock>(
      IV->getParent()->getEnclosingLoopRegion()->getSinglePredecessor());
  Type *IVTy = TypeInfo.inferScalarType(IV);
  auto &ID = IV->getInductionDescriptor();
  VPIRFlags Flags;
  if (isa_and_present<FPMathOperator>(ID.getInductionBinOp()))
    Flags = ID.getInductionBinOp()->getFastMathFlags();

  VPValue *ScalarStep = IV->getStepValue();
  VPBuilder Builder(PH);
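  // Materialize the wide step once in the preheader; conceptually it is the
  // scalar step scaled by VF, and it is reused below to advance each unrolled
  // part.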
  VPInstruction *VectorStep = Builder.createNaryOp(
      VPInstruction::WideIVStep, {&Plan.getVF(), ScalarStep}, IVTy, Flags,
      IV->getDebugLoc());

  ToSkip.insert(VectorStep);

  // Now create recipes to compute the induction steps for part 1 .. UF. Part 0
  // remains the header phi. Parts > 0 are computed by adding Step to the
  // previous part. The header phi recipe will get 2 new operands: the step
  // value for a single part and the last part, used to compute the backedge
  // value during VPWidenIntOrFpInductionRecipe::execute.
  //   %Part.0 = VPWidenIntOrFpInductionRecipe %Start, %ScalarStep, %VectorStep, %Part.3
  //   %Part.1 = %Part.0 + %VectorStep
  //   %Part.2 = %Part.1 + %VectorStep
  //   %Part.3 = %Part.2 + %VectorStep
  //
  // The newly added recipes are added to ToSkip to avoid interleaving them
  // again.
  VPValue *Prev = IV;
  Builder.setInsertPoint(IV->getParent(), InsertPtForPhi);
  unsigned AddOpc =
      IVTy->isFloatingPointTy() ? ID.getInductionOpcode() : Instruction::Add;
  for (unsigned Part = 1; Part != UF; ++Part) {
    std::string Name =
        Part > 1 ? "step.add." + std::to_string(Part) : "step.add";

    VPInstruction *Add = Builder.createNaryOp(AddOpc,
                                              {
                                                  Prev,
                                                  VectorStep,
                                              },
                                              Flags, IV->getDebugLoc(), Name);
    ToSkip.insert(Add);
    addRecipeForPart(IV, Add, Part);
    Prev = Add;
  }
  IV->addOperand(VectorStep);
  IV->addOperand(Prev);
}

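/// Unroll header phi \p R by creating a copy per additional part, e.g.
/// (illustrative) with UF = 2 a reduction phi %rdx gets a copy %rdx.1 for part
/// 1, inserted directly after it; how its operands are set up depends on the
/// concrete phi kind.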
void UnrollState::unrollHeaderPHIByUF(VPHeaderPHIRecipe *R,
                                      VPBasicBlock::iterator InsertPtForPhi) {
  // First-order recurrences pass a single vector or scalar through their
  // header phis, irrespective of interleaving.
  if (isa<VPFirstOrderRecurrencePHIRecipe>(R))
    return;

  // Generate step vectors for each unrolled part.
  if (auto *IV = dyn_cast<VPWidenIntOrFpInductionRecipe>(R)) {
    unrollWidenInductionByUF(IV, InsertPtForPhi);
    return;
  }

  auto *RdxPhi = dyn_cast<VPReductionPHIRecipe>(R);
  if (RdxPhi && RdxPhi->isOrdered())
    return;

  auto InsertPt = std::next(R->getIterator());
  for (unsigned Part = 1; Part != UF; ++Part) {
    VPRecipeBase *Copy = R->clone();
    Copy->insertBefore(*R->getParent(), InsertPt);
    addRecipeForPart(R, Copy, Part);
    if (isa<VPWidenPointerInductionRecipe>(R)) {
      Copy->addOperand(R);
      Copy->addOperand(getConstantVPV(Part));
    } else if (RdxPhi) {
      // If the start value is a ReductionStartVector, use the identity value
      // (second operand) for unrolled parts. If the scaling factor is > 1,
      // create a new ReductionStartVector with the scale factor and both
      // operands set to the identity value.
      if (auto *VPI = dyn_cast<VPInstruction>(RdxPhi->getStartValue())) {
        assert(VPI->getOpcode() == VPInstruction::ReductionStartVector &&
               "unexpected start VPInstruction");
        if (Part != 1)
          continue;
        VPValue *StartV;
        if (match(VPI->getOperand(2), m_SpecificInt(1))) {
          StartV = VPI->getOperand(1);
        } else {
          auto *C = VPI->clone();
          C->setOperand(0, C->getOperand(1));
          C->insertAfter(VPI);
          StartV = C;
        }
        for (unsigned Part = 1; Part != UF; ++Part)
          VPV2Parts[VPI][Part - 1] = StartV;
      }
      Copy->addOperand(getConstantVPV(Part));
    } else {
      assert(isa<VPActiveLaneMaskPHIRecipe>(R) &&
             "unexpected header phi recipe not needing unrolled part");
    }
  }
}

/// Handle non-header-phi recipes.
void UnrollState::unrollRecipeByUF(VPRecipeBase &R) {
  if (match(&R, m_BranchOnCond(m_VPValue())) ||
      match(&R, m_BranchOnCount(m_VPValue(), m_VPValue())))
    return;

  if (auto *VPI = dyn_cast<VPInstruction>(&R)) {
    if (vputils::onlyFirstPartUsed(VPI)) {
      addUniformForAllParts(VPI);
      return;
    }
  }
  if (auto *RepR = dyn_cast<VPReplicateRecipe>(&R)) {
    if (isa<StoreInst>(RepR->getUnderlyingValue()) &&
        RepR->getOperand(1)->isDefinedOutsideLoopRegions()) {
      // Stores to an invariant address only need to store the last part.
      remapOperands(&R, UF - 1);
      return;
    }
    if (auto *II = dyn_cast<IntrinsicInst>(RepR->getUnderlyingValue())) {
      if (II->getIntrinsicID() == Intrinsic::experimental_noalias_scope_decl) {
        addUniformForAllParts(RepR);
        return;
      }
    }
  }

  // Unroll non-uniform recipes.
  auto InsertPt = std::next(R.getIterator());
  VPBasicBlock &VPBB = *R.getParent();
  for (unsigned Part = 1; Part != UF; ++Part) {
    VPRecipeBase *Copy = R.clone();
    Copy->insertBefore(VPBB, InsertPt);
    addRecipeForPart(&R, Copy, Part);

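    // For first-order recurrence splices, the copy for part Part combines the
    // spliced operand's values of parts Part - 1 and Part, e.g. (illustrative,
    // UF = 2) the part-1 splice uses parts 0 and 1 of %incoming.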
    VPValue *Op;
    if (match(&R, m_VPInstruction<VPInstruction::FirstOrderRecurrenceSplice>(
                      m_VPValue(), m_VPValue(Op)))) {
      Copy->setOperand(0, getValueForPart(Op, Part - 1));
      Copy->setOperand(1, getValueForPart(Op, Part));
      continue;
    }
    if (auto *Red = dyn_cast<VPReductionRecipe>(&R)) {
      auto *Phi = dyn_cast<VPReductionPHIRecipe>(R.getOperand(0));
      if (Phi && Phi->isOrdered()) {
        auto &Parts = VPV2Parts[Phi];
        if (Part == 1) {
          Parts.clear();
          Parts.push_back(Red);
        }
        Parts.push_back(Copy->getVPSingleValue());
        Phi->setOperand(1, Copy->getVPSingleValue());
      }
    }
    remapOperands(Copy, Part);

    // Add operand indicating the part to generate code for, to recipes still
    // requiring it.
    if (isa<VPScalarIVStepsRecipe, VPWidenCanonicalIVRecipe,
            VPVectorPointerRecipe, VPVectorEndPointerRecipe>(Copy) ||
        match(Copy,
              m_VPInstruction<VPInstruction::CanonicalIVIncrementForPart>(
                  m_VPValue())))
      Copy->addOperand(getConstantVPV(Part));

    if (isa<VPVectorPointerRecipe, VPVectorEndPointerRecipe>(R))
      Copy->setOperand(0, R.getOperand(0));
  }
}

void UnrollState::unrollBlock(VPBlockBase *VPB) {
  auto *VPR = dyn_cast<VPRegionBlock>(VPB);
  if (VPR) {
    if (VPR->isReplicator())
      return unrollReplicateRegionByUF(VPR);

    // Traverse blocks in region in RPO to ensure defs are visited before uses
    // across blocks.
    ReversePostOrderTraversal<VPBlockShallowTraversalWrapper<VPBlockBase *>>
        RPOT(VPR->getEntry());
    for (VPBlockBase *VPB : RPOT)
      unrollBlock(VPB);
    return;
  }

  // VPB is a VPBasicBlock; unroll it, i.e., unroll its recipes.
  auto *VPBB = cast<VPBasicBlock>(VPB);
  auto InsertPtForPhi = VPBB->getFirstNonPhi();
  for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {
    if (ToSkip.contains(&R) || isa<VPIRInstruction>(&R))
      continue;

    // Add all VPValues for all parts to AnyOf, FirstActiveLane and
    // Compute*Result, which combine all parts to compute the final value.
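    // For example (illustrative, UF = 2), ComputeReductionResult(%rdx.phi,
    // %red) becomes ComputeReductionResult(%rdx.phi, %red, %red.1), with the
    // part-1 value appended as an extra operand.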
    VPValue *Op1;
    if (match(&R, m_VPInstruction<VPInstruction::AnyOf>(m_VPValue(Op1))) ||
        match(&R, m_VPInstruction<VPInstruction::FirstActiveLane>(
                      m_VPValue(Op1))) ||
        match(&R, m_VPInstruction<VPInstruction::ComputeAnyOfResult>(
                      m_VPValue(), m_VPValue(), m_VPValue(Op1))) ||
        match(&R, m_VPInstruction<VPInstruction::ComputeReductionResult>(
                      m_VPValue(), m_VPValue(Op1))) ||
        match(&R, m_VPInstruction<VPInstruction::ComputeFindIVResult>(
                      m_VPValue(), m_VPValue(), m_VPValue(), m_VPValue(Op1)))) {
      addUniformForAllParts(cast<VPInstruction>(&R));
      for (unsigned Part = 1; Part != UF; ++Part)
        R.addOperand(getValueForPart(Op1, Part));
      continue;
    }
    VPValue *Op0;
    if (match(&R, m_VPInstruction<VPInstruction::ExtractLastElement>(
                      m_VPValue(Op0))) ||
        match(&R, m_VPInstruction<VPInstruction::ExtractPenultimateElement>(
                      m_VPValue(Op0)))) {
      addUniformForAllParts(cast<VPSingleDefRecipe>(&R));
      if (Plan.hasScalarVFOnly()) {
        auto *I = cast<VPInstruction>(&R);
        // Extracting from end with VF = 1 implies retrieving the last or
        // penultimate scalar part (UF-1 or UF-2).
        unsigned Offset =
            I->getOpcode() == VPInstruction::ExtractLastElement ? 1 : 2;
        I->replaceAllUsesWith(getValueForPart(Op0, UF - Offset));
        R.eraseFromParent();
      } else {
        // Otherwise we extract from the last part.
        remapOperands(&R, UF - 1);
      }
      continue;
    }

    auto *SingleDef = dyn_cast<VPSingleDefRecipe>(&R);
    if (SingleDef && vputils::isUniformAcrossVFsAndUFs(SingleDef)) {
      addUniformForAllParts(SingleDef);
      continue;
    }

    if (auto *H = dyn_cast<VPHeaderPHIRecipe>(&R)) {
      unrollHeaderPHIByUF(H, InsertPtForPhi);
      continue;
    }

    unrollRecipeByUF(R);
  }
}

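/// Explicitly unroll \p Plan by \p UF: recipes for part 0 stay in place, and
/// copies for parts 1 .. UF-1 are created next to them with operands remapped
/// to the matching part, e.g. (illustrative, UF = 2) a widened load %l is
/// followed by a copy %l.1 reading the next VF elements.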
void VPlanTransforms::unrollByUF(VPlan &Plan, unsigned UF, LLVMContext &Ctx) {
  assert(UF > 0 && "Unroll factor must be positive");
  Plan.setUF(UF);
  auto Cleanup = make_scope_exit([&Plan]() {
    auto Iter = vp_depth_first_deep(Plan.getEntry());
    // Remove recipes that are redundant after unrolling.
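    // In particular, a CanonicalIVIncrementForPart that never received a part
    // operand computes part 0 and is a no-op, so it is folded into its single
    // operand.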
    for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(Iter)) {
      for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {
        auto *VPI = dyn_cast<VPInstruction>(&R);
        if (VPI &&
            VPI->getOpcode() == VPInstruction::CanonicalIVIncrementForPart &&
            VPI->getNumOperands() == 1) {
          VPI->replaceAllUsesWith(VPI->getOperand(0));
          VPI->eraseFromParent();
        }
      }
    }
  });
  if (UF == 1) {
    return;
  }

  UnrollState Unroller(Plan, UF, Ctx);

  // Iterate over all blocks in the plan starting from Entry, and unroll
  // recipes inside them. This includes the vector preheader and middle blocks,
  // which may set up or post-process per-part values.
  ReversePostOrderTraversal<VPBlockShallowTraversalWrapper<VPBlockBase *>> RPOT(
      Plan.getEntry());
  for (VPBlockBase *VPB : RPOT)
    Unroller.unrollBlock(VPB);

  unsigned Part = 1;
  // Remap operands of cloned header phis to update backedge values. The header
  // phis cloned during unrolling are just after the header phi for part 0.
  // Reset Part to 1 when reaching the first (part 0) recipe of a block.
  for (VPRecipeBase &H :
       Plan.getVectorLoopRegion()->getEntryBasicBlock()->phis()) {
    // The second operand of Fixed Order Recurrence phis, feeding the spliced
    // value across the backedge, needs to remap to the last part of the
    // spliced value.
    if (isa<VPFirstOrderRecurrencePHIRecipe>(&H)) {
      Unroller.remapOperand(&H, 1, UF - 1);
      continue;
    }
    if (Unroller.contains(H.getVPSingleValue()) ||
        isa<VPWidenPointerInductionRecipe>(&H)) {
      Part = 1;
      continue;
    }
    Unroller.remapOperands(&H, Part);
    Part++;
  }

  VPlanTransforms::removeDeadRecipes(Plan);
}

/// Create a single-scalar clone of \p RepR for lane \p Lane.
static VPReplicateRecipe *cloneForLane(VPlan &Plan, VPBuilder &Builder,
                                       Type *IdxTy, VPReplicateRecipe *RepR,
                                       VPLane Lane) {
  // Collect the operands at Lane, creating extracts as needed.
  SmallVector<VPValue *> NewOps;
  for (VPValue *Op : RepR->operands()) {
    if (vputils::isSingleScalar(Op)) {
      NewOps.push_back(Op);
      continue;
    }
    if (Lane.getKind() == VPLane::Kind::ScalableLast) {
      NewOps.push_back(
          Builder.createNaryOp(VPInstruction::ExtractLastElement, {Op}));
      continue;
    }
    // Look through buildvector to avoid unnecessary extracts.
    if (match(Op, m_BuildVector())) {
      NewOps.push_back(
          cast<VPInstruction>(Op)->getOperand(Lane.getKnownLane()));
      continue;
    }
    VPValue *Idx =
        Plan.getOrAddLiveIn(ConstantInt::get(IdxTy, Lane.getKnownLane()));
    VPValue *Ext = Builder.createNaryOp(Instruction::ExtractElement, {Op, Idx});
    NewOps.push_back(Ext);
  }

  auto *New =
      new VPReplicateRecipe(RepR->getUnderlyingInstr(), NewOps,
                            /*IsSingleScalar=*/true, /*Mask=*/nullptr, *RepR);
  New->insertBefore(RepR);
  return New;
}

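/// Replace each non-single-scalar VPReplicateRecipe with VF single-scalar
/// clones, one per lane, extracting lane values from vector operands as
/// needed. For example (illustrative, VF = 2):
///   %r = replicate call @f(%v)
/// becomes
///   %r.0 = call @f(extractelement %v, 0)
///   %r.1 = call @f(extractelement %v, 1)
///   %r.v = buildvector %r.0, %r.1
/// where the BuildVector is only created when %r has users; users demanding
/// only lane 0 are wired directly to %r.0.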
void VPlanTransforms::replicateByVF(VPlan &Plan, ElementCount VF) {
  Type *IdxTy = IntegerType::get(
      Plan.getScalarHeader()->getIRBasicBlock()->getContext(), 32);

  // Visit all VPBBs outside the loop region and directly inside the top-level
  // loop region.
  auto VPBBsOutsideLoopRegion = VPBlockUtils::blocksOnly<VPBasicBlock>(
      vp_depth_first_shallow(Plan.getEntry()));
  auto VPBBsInsideLoopRegion = VPBlockUtils::blocksOnly<VPBasicBlock>(
      vp_depth_first_shallow(Plan.getVectorLoopRegion()->getEntry()));
  auto VPBBsToUnroll =
      concat<VPBasicBlock *>(VPBBsOutsideLoopRegion, VPBBsInsideLoopRegion);
  for (VPBasicBlock *VPBB : VPBBsToUnroll) {
    for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {
      auto *RepR = dyn_cast<VPReplicateRecipe>(&R);
      if (!RepR || RepR->isSingleScalar())
        continue;

      VPBuilder Builder(RepR);
      if (RepR->getNumUsers() == 0) {
        if (isa<StoreInst>(RepR->getUnderlyingInstr()) &&
            vputils::isSingleScalar(RepR->getOperand(1))) {
          // Stores to invariant addresses need to store the last lane only.
          cloneForLane(Plan, Builder, IdxTy, RepR,
                       VPLane::getLastLaneForVF(VF));
        } else {
          // Create single-scalar versions of RepR for all lanes.
          for (unsigned I = 0; I != VF.getKnownMinValue(); ++I)
            cloneForLane(Plan, Builder, IdxTy, RepR, VPLane(I));
        }
        RepR->eraseFromParent();
        continue;
      }
      // Create single-scalar versions of RepR for all lanes.
      SmallVector<VPValue *> LaneDefs;
      for (unsigned I = 0; I != VF.getKnownMinValue(); ++I)
        LaneDefs.push_back(cloneForLane(Plan, Builder, IdxTy, RepR, VPLane(I)));

      // Users that only demand the first lane can use the definition for lane
      // 0.
      RepR->replaceUsesWithIf(LaneDefs[0], [RepR](VPUser &U, unsigned) {
        return U.onlyFirstLaneUsed(RepR);
      });

      // If needed, create a Build(Struct)Vector recipe to insert the scalar
      // lane values into a vector.
      Type *ResTy = RepR->getUnderlyingInstr()->getType();
      VPValue *VecRes = Builder.createNaryOp(
          ResTy->isStructTy() ? VPInstruction::BuildStructVector
                              : VPInstruction::BuildVector,
          LaneDefs);
      RepR->replaceAllUsesWith(VecRes);
      RepR->eraseFromParent();
    }
  }
}