1 | //===- VPlanRecipes.cpp - Implementations for VPlan recipes ---------------===// |
2 | // |
3 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
4 | // See https://llvm.org/LICENSE.txt for license information. |
5 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
6 | // |
7 | //===----------------------------------------------------------------------===// |
8 | /// |
9 | /// \file |
10 | /// This file contains implementations for different VPlan recipes. |
11 | /// |
12 | //===----------------------------------------------------------------------===// |
13 | |
14 | #include "VPlan.h" |
15 | #include "VPlanAnalysis.h" |
16 | #include "llvm/ADT/STLExtras.h" |
17 | #include "llvm/ADT/SmallVector.h" |
18 | #include "llvm/ADT/Twine.h" |
19 | #include "llvm/Analysis/IVDescriptors.h" |
20 | #include "llvm/IR/BasicBlock.h" |
21 | #include "llvm/IR/IRBuilder.h" |
22 | #include "llvm/IR/Instruction.h" |
23 | #include "llvm/IR/Instructions.h" |
24 | #include "llvm/IR/Type.h" |
25 | #include "llvm/IR/Value.h" |
26 | #include "llvm/Support/Casting.h" |
27 | #include "llvm/Support/CommandLine.h" |
28 | #include "llvm/Support/Debug.h" |
29 | #include "llvm/Support/raw_ostream.h" |
30 | #include "llvm/Transforms/Utils/BasicBlockUtils.h" |
31 | #include "llvm/Transforms/Utils/LoopUtils.h" |
32 | #include "llvm/Transforms/Utils/ScalarEvolutionExpander.h" |
33 | #include <cassert> |
34 | |
35 | using namespace llvm; |
36 | |
37 | using VectorParts = SmallVector<Value *, 2>; |
38 | |
39 | namespace llvm { |
40 | extern cl::opt<bool> EnableVPlanNativePath; |
41 | } |
42 | extern cl::opt<unsigned> ForceTargetInstructionCost; |
43 | |
44 | #define LV_NAME "loop-vectorize" |
45 | #define DEBUG_TYPE LV_NAME |
46 | |
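// Returns true if this recipe may write to memory. Recipe kinds not handled
// explicitly below are conservatively assumed to write.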
47 | bool VPRecipeBase::mayWriteToMemory() const { |
48 | switch (getVPDefID()) { |
49 | case VPInterleaveSC: |
50 | return cast<VPInterleaveRecipe>(Val: this)->getNumStoreOperands() > 0; |
51 | case VPWidenStoreEVLSC: |
52 | case VPWidenStoreSC: |
53 | return true; |
54 | case VPReplicateSC: |
55 | return cast<Instruction>(Val: getVPSingleValue()->getUnderlyingValue()) |
56 | ->mayWriteToMemory(); |
57 | case VPWidenCallSC: |
58 | return !cast<VPWidenCallRecipe>(Val: this) |
59 | ->getCalledScalarFunction() |
60 | ->onlyReadsMemory(); |
61 | case VPBranchOnMaskSC: |
62 | case VPScalarIVStepsSC: |
63 | case VPPredInstPHISC: |
64 | return false; |
65 | case VPBlendSC: |
66 | case VPReductionEVLSC: |
67 | case VPReductionSC: |
68 | case VPWidenCanonicalIVSC: |
69 | case VPWidenCastSC: |
70 | case VPWidenGEPSC: |
71 | case VPWidenIntOrFpInductionSC: |
72 | case VPWidenLoadEVLSC: |
73 | case VPWidenLoadSC: |
74 | case VPWidenPHISC: |
75 | case VPWidenSC: |
76 | case VPWidenSelectSC: { |
77 | const Instruction *I = |
78 | dyn_cast_or_null<Instruction>(Val: getVPSingleValue()->getUnderlyingValue()); |
79 | (void)I; |
80 | assert((!I || !I->mayWriteToMemory()) && |
81 | "underlying instruction may write to memory" ); |
82 | return false; |
83 | } |
84 | default: |
85 | return true; |
86 | } |
87 | } |
88 | |
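// Returns true if this recipe may read from memory. As with mayWriteToMemory,
// unhandled recipe kinds are conservatively assumed to read.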
89 | bool VPRecipeBase::mayReadFromMemory() const { |
90 | switch (getVPDefID()) { |
91 | case VPWidenLoadEVLSC: |
92 | case VPWidenLoadSC: |
93 | return true; |
94 | case VPReplicateSC: |
95 | return cast<Instruction>(Val: getVPSingleValue()->getUnderlyingValue()) |
96 | ->mayReadFromMemory(); |
97 | case VPWidenCallSC: |
98 | return !cast<VPWidenCallRecipe>(Val: this) |
99 | ->getCalledScalarFunction() |
100 | ->onlyWritesMemory(); |
101 | case VPBranchOnMaskSC: |
102 | case VPPredInstPHISC: |
103 | case VPScalarIVStepsSC: |
104 | case VPWidenStoreEVLSC: |
105 | case VPWidenStoreSC: |
106 | return false; |
107 | case VPBlendSC: |
108 | case VPReductionEVLSC: |
109 | case VPReductionSC: |
110 | case VPWidenCanonicalIVSC: |
111 | case VPWidenCastSC: |
112 | case VPWidenGEPSC: |
113 | case VPWidenIntOrFpInductionSC: |
114 | case VPWidenPHISC: |
115 | case VPWidenSC: |
116 | case VPWidenSelectSC: { |
117 | const Instruction *I = |
118 | dyn_cast_or_null<Instruction>(Val: getVPSingleValue()->getUnderlyingValue()); |
119 | (void)I; |
120 | assert((!I || !I->mayReadFromMemory()) && |
121 | "underlying instruction may read from memory" ); |
122 | return false; |
123 | } |
124 | default: |
125 | return true; |
126 | } |
127 | } |
128 | |
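// Returns true if executing this recipe may have side-effects, e.g. writing
// to memory, throwing, or not returning; unhandled recipe kinds are
// conservatively assumed to have side-effects.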
129 | bool VPRecipeBase::mayHaveSideEffects() const { |
130 | switch (getVPDefID()) { |
131 | case VPDerivedIVSC: |
132 | case VPPredInstPHISC: |
133 | case VPScalarCastSC: |
134 | return false; |
135 | case VPInstructionSC: |
136 | switch (cast<VPInstruction>(Val: this)->getOpcode()) { |
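// These VPInstruction opcodes only compute values and have no side-effects.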
137 | case Instruction::Or: |
138 | case Instruction::ICmp: |
139 | case Instruction::Select: |
140 | case VPInstruction::Not: |
141 | case VPInstruction::CalculateTripCountMinusVF: |
142 | case VPInstruction::CanonicalIVIncrementForPart: |
143 | case VPInstruction::ExtractFromEnd: |
144 | case VPInstruction::FirstOrderRecurrenceSplice: |
145 | case VPInstruction::LogicalAnd: |
146 | case VPInstruction::PtrAdd: |
147 | return false; |
148 | default: |
149 | return true; |
150 | } |
151 | case VPWidenCallSC: { |
152 | Function *Fn = cast<VPWidenCallRecipe>(Val: this)->getCalledScalarFunction(); |
153 | return mayWriteToMemory() || !Fn->doesNotThrow() || !Fn->willReturn(); |
154 | } |
155 | case VPBlendSC: |
156 | case VPReductionEVLSC: |
157 | case VPReductionSC: |
158 | case VPScalarIVStepsSC: |
159 | case VPWidenCanonicalIVSC: |
160 | case VPWidenCastSC: |
161 | case VPWidenGEPSC: |
162 | case VPWidenIntOrFpInductionSC: |
163 | case VPWidenPHISC: |
164 | case VPWidenPointerInductionSC: |
165 | case VPWidenSC: |
166 | case VPWidenSelectSC: { |
167 | const Instruction *I = |
168 | dyn_cast_or_null<Instruction>(Val: getVPSingleValue()->getUnderlyingValue()); |
169 | (void)I; |
170 | assert((!I || !I->mayHaveSideEffects()) && |
171 | "underlying instruction has side-effects" ); |
172 | return false; |
173 | } |
174 | case VPInterleaveSC: |
175 | return mayWriteToMemory(); |
176 | case VPWidenLoadEVLSC: |
177 | case VPWidenLoadSC: |
178 | case VPWidenStoreEVLSC: |
179 | case VPWidenStoreSC: |
180 | assert( |
181 | cast<VPWidenMemoryRecipe>(this)->getIngredient().mayHaveSideEffects() == |
182 | mayWriteToMemory() && |
183 | "mayHaveSideffects result for ingredient differs from this " |
184 | "implementation" ); |
185 | return mayWriteToMemory(); |
186 | case VPReplicateSC: { |
187 | auto *R = cast<VPReplicateRecipe>(Val: this); |
188 | return R->getUnderlyingInstr()->mayHaveSideEffects(); |
189 | } |
190 | default: |
191 | return true; |
192 | } |
193 | } |
194 | |
195 | void VPLiveOut::fixPhi(VPlan &Plan, VPTransformState &State) { |
196 | VPValue *ExitValue = getOperand(N: 0); |
197 | auto Lane = vputils::isUniformAfterVectorization(VPV: ExitValue) |
198 | ? VPLane::getFirstLane() |
199 | : VPLane::getLastLaneForVF(VF: State.VF); |
200 | VPBasicBlock *MiddleVPBB = |
201 | cast<VPBasicBlock>(Val: Plan.getVectorLoopRegion()->getSingleSuccessor()); |
202 | VPRecipeBase *ExitingRecipe = ExitValue->getDefiningRecipe(); |
203 | auto *ExitingVPBB = ExitingRecipe ? ExitingRecipe->getParent() : nullptr; |
204 | // Values leaving the vector loop reach live-out phis in the exit block |
205 | // via the middle block. |
206 | auto *PredVPBB = !ExitingVPBB || ExitingVPBB->getEnclosingLoopRegion() |
207 | ? MiddleVPBB |
208 | : ExitingVPBB; |
209 | BasicBlock *PredBB = State.CFG.VPBB2IRBB[PredVPBB]; |
210 | // Set insertion point in PredBB in case an extract needs to be generated. |
211 | // TODO: Model extracts explicitly. |
212 | State.Builder.SetInsertPoint(TheBB: PredBB, IP: PredBB->getFirstNonPHIIt()); |
213 | Value *V = State.get(Def: ExitValue, Instance: VPIteration(State.UF - 1, Lane)); |
214 | if (Phi->getBasicBlockIndex(BB: PredBB) != -1) |
215 | Phi->setIncomingValueForBlock(BB: PredBB, V); |
216 | else |
217 | Phi->addIncoming(V, BB: PredBB); |
218 | } |
219 | |
220 | #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) |
221 | void VPLiveOut::print(raw_ostream &O, VPSlotTracker &SlotTracker) const { |
222 | O << "Live-out " ; |
223 | getPhi()->printAsOperand(O); |
224 | O << " = " ; |
225 | getOperand(0)->printAsOperand(O, SlotTracker); |
226 | O << "\n" ; |
227 | } |
228 | #endif |
229 | |
230 | void VPRecipeBase::insertBefore(VPRecipeBase *InsertPos) { |
231 | assert(!Parent && "Recipe already in some VPBasicBlock" ); |
232 | assert(InsertPos->getParent() && |
233 | "Insertion position not in any VPBasicBlock" ); |
234 | InsertPos->getParent()->insert(Recipe: this, InsertPt: InsertPos->getIterator()); |
235 | } |
236 | |
237 | void VPRecipeBase::insertBefore(VPBasicBlock &BB, |
238 | iplist<VPRecipeBase>::iterator I) { |
239 | assert(!Parent && "Recipe already in some VPBasicBlock" ); |
240 | assert(I == BB.end() || I->getParent() == &BB); |
241 | BB.insert(Recipe: this, InsertPt: I); |
242 | } |
243 | |
244 | void VPRecipeBase::insertAfter(VPRecipeBase *InsertPos) { |
245 | assert(!Parent && "Recipe already in some VPBasicBlock" ); |
246 | assert(InsertPos->getParent() && |
247 | "Insertion position not in any VPBasicBlock" ); |
248 | InsertPos->getParent()->insert(Recipe: this, InsertPt: std::next(x: InsertPos->getIterator())); |
249 | } |
250 | |
251 | void VPRecipeBase::removeFromParent() { |
252 | assert(getParent() && "Recipe not in any VPBasicBlock" ); |
253 | getParent()->getRecipeList().remove(IT: getIterator()); |
254 | Parent = nullptr; |
255 | } |
256 | |
257 | iplist<VPRecipeBase>::iterator VPRecipeBase::eraseFromParent() { |
258 | assert(getParent() && "Recipe not in any VPBasicBlock" ); |
259 | return getParent()->getRecipeList().erase(where: getIterator()); |
260 | } |
261 | |
262 | void VPRecipeBase::moveAfter(VPRecipeBase *InsertPos) { |
263 | removeFromParent(); |
264 | insertAfter(InsertPos); |
265 | } |
266 | |
267 | void VPRecipeBase::moveBefore(VPBasicBlock &BB, |
268 | iplist<VPRecipeBase>::iterator I) { |
269 | removeFromParent(); |
270 | insertBefore(BB, I); |
271 | } |
272 | |
273 | /// Return the underlying instruction to be used for computing \p R's cost via |
274 | /// the legacy cost model. Return nullptr if there's no suitable instruction. |
275 | static Instruction *getInstructionForCost(const VPRecipeBase *R) { |
276 | if (auto *S = dyn_cast<VPSingleDefRecipe>(Val: R)) |
277 | return dyn_cast_or_null<Instruction>(Val: S->getUnderlyingValue()); |
278 | if (auto *IG = dyn_cast<VPInterleaveRecipe>(Val: R)) |
279 | return IG->getInsertPos(); |
280 | if (auto *WidenMem = dyn_cast<VPWidenMemoryRecipe>(Val: R)) |
281 | return &WidenMem->getIngredient(); |
282 | return nullptr; |
283 | } |
284 | |
285 | InstructionCost VPRecipeBase::cost(ElementCount VF, VPCostContext &Ctx) { |
286 | if (auto *UI = getInstructionForCost(R: this)) |
287 | if (Ctx.skipCostComputation(UI, IsVector: VF.isVector())) |
288 | return 0; |
289 | |
290 | InstructionCost RecipeCost = computeCost(VF, Ctx); |
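// If ForceTargetInstructionCost was specified on the command line, override
// the computed cost, but keep invalid costs invalid.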
291 | if (ForceTargetInstructionCost.getNumOccurrences() > 0 && |
292 | RecipeCost.isValid()) |
293 | RecipeCost = InstructionCost(ForceTargetInstructionCost); |
294 | |
295 | LLVM_DEBUG({ |
296 | dbgs() << "Cost of " << RecipeCost << " for VF " << VF << ": " ; |
297 | dump(); |
298 | }); |
299 | return RecipeCost; |
300 | } |
301 | |
302 | InstructionCost VPRecipeBase::computeCost(ElementCount VF, |
303 | VPCostContext &Ctx) const { |
304 | // Compute the cost of the recipe by falling back to the legacy cost model |
305 | // using the underlying instruction. If there is no underlying instruction, |
306 | // return 0. |
307 | Instruction *UI = getInstructionForCost(R: this); |
308 | if (UI && isa<VPReplicateRecipe>(Val: this)) { |
309 | // VPReplicateRecipes may be cloned as part of an existing VPlan-to-VPlan |
310 | // transform; avoid computing their cost multiple times for now. |
311 | Ctx.SkipCostComputation.insert(Ptr: UI); |
312 | } |
313 | return UI ? Ctx.getLegacyCost(UI, VF) : 0; |
314 | } |
315 | |
316 | FastMathFlags VPRecipeWithIRFlags::getFastMathFlags() const { |
317 | assert(OpType == OperationType::FPMathOp && |
318 | "recipe doesn't have fast math flags" ); |
319 | FastMathFlags Res; |
320 | Res.setAllowReassoc(FMFs.AllowReassoc); |
321 | Res.setNoNaNs(FMFs.NoNaNs); |
322 | Res.setNoInfs(FMFs.NoInfs); |
323 | Res.setNoSignedZeros(FMFs.NoSignedZeros); |
324 | Res.setAllowReciprocal(FMFs.AllowReciprocal); |
325 | Res.setAllowContract(FMFs.AllowContract); |
326 | Res.setApproxFunc(FMFs.ApproxFunc); |
327 | return Res; |
328 | } |
329 | |
330 | VPInstruction::VPInstruction(unsigned Opcode, CmpInst::Predicate Pred, |
331 | VPValue *A, VPValue *B, DebugLoc DL, |
332 | const Twine &Name) |
333 | : VPRecipeWithIRFlags(VPDef::VPInstructionSC, ArrayRef<VPValue *>({A, B}), |
334 | Pred, DL), |
335 | Opcode(Opcode), Name(Name.str()) { |
336 | assert(Opcode == Instruction::ICmp && |
337 | "only ICmp predicates supported at the moment" ); |
338 | } |
339 | |
340 | VPInstruction::VPInstruction(unsigned Opcode, |
341 | std::initializer_list<VPValue *> Operands, |
342 | FastMathFlags FMFs, DebugLoc DL, const Twine &Name) |
343 | : VPRecipeWithIRFlags(VPDef::VPInstructionSC, Operands, FMFs, DL), |
344 | Opcode(Opcode), Name(Name.str()) { |
345 | // Make sure the VPInstruction is a floating-point operation. |
346 | assert(isFPMathOp() && "this op can't take fast-math flags" ); |
347 | } |
348 | |
349 | bool VPInstruction::doesGeneratePerAllLanes() const { |
350 | return Opcode == VPInstruction::PtrAdd && !vputils::onlyFirstLaneUsed(Def: this); |
351 | } |
352 | |
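// Returns true if this VPInstruction can produce its result as a single
// scalar value for the first lane, rather than a full vector.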
353 | bool VPInstruction::canGenerateScalarForFirstLane() const { |
354 | if (Instruction::isBinaryOp(Opcode: getOpcode())) |
355 | return true; |
356 | if (isSingleScalar() || isVectorToScalar()) |
357 | return true; |
358 | switch (Opcode) { |
359 | case Instruction::ICmp: |
360 | case VPInstruction::BranchOnCond: |
361 | case VPInstruction::BranchOnCount: |
362 | case VPInstruction::CalculateTripCountMinusVF: |
363 | case VPInstruction::CanonicalIVIncrementForPart: |
364 | case VPInstruction::PtrAdd: |
365 | case VPInstruction::ExplicitVectorLength: |
366 | return true; |
367 | default: |
368 | return false; |
369 | } |
370 | } |
371 | |
372 | Value *VPInstruction::generatePerLane(VPTransformState &State, |
373 | const VPIteration &Lane) { |
374 | IRBuilderBase &Builder = State.Builder; |
375 | |
376 | assert(getOpcode() == VPInstruction::PtrAdd && |
377 | "only PtrAdd opcodes are supported for now" ); |
378 | return Builder.CreatePtrAdd(Ptr: State.get(Def: getOperand(N: 0), Instance: Lane), |
379 | Offset: State.get(Def: getOperand(N: 1), Instance: Lane), Name); |
380 | } |
381 | |
382 | Value *VPInstruction::generatePerPart(VPTransformState &State, unsigned Part) { |
383 | IRBuilderBase &Builder = State.Builder; |
384 | |
385 | if (Instruction::isBinaryOp(Opcode: getOpcode())) { |
386 | bool OnlyFirstLaneUsed = vputils::onlyFirstLaneUsed(Def: this); |
387 | Value *A = State.get(Def: getOperand(N: 0), Part, IsScalar: OnlyFirstLaneUsed); |
388 | Value *B = State.get(Def: getOperand(N: 1), Part, IsScalar: OnlyFirstLaneUsed); |
389 | auto *Res = |
390 | Builder.CreateBinOp(Opc: (Instruction::BinaryOps)getOpcode(), LHS: A, RHS: B, Name); |
391 | if (auto *I = dyn_cast<Instruction>(Val: Res)) |
392 | setFlags(I); |
393 | return Res; |
394 | } |
395 | |
396 | switch (getOpcode()) { |
397 | case VPInstruction::Not: { |
398 | Value *A = State.get(Def: getOperand(N: 0), Part); |
399 | return Builder.CreateNot(V: A, Name); |
400 | } |
401 | case Instruction::ICmp: { |
402 | bool OnlyFirstLaneUsed = vputils::onlyFirstLaneUsed(Def: this); |
403 | Value *A = State.get(Def: getOperand(N: 0), Part, IsScalar: OnlyFirstLaneUsed); |
404 | Value *B = State.get(Def: getOperand(N: 1), Part, IsScalar: OnlyFirstLaneUsed); |
405 | return Builder.CreateCmp(Pred: getPredicate(), LHS: A, RHS: B, Name); |
406 | } |
407 | case Instruction::Select: { |
408 | Value *Cond = State.get(Def: getOperand(N: 0), Part); |
409 | Value *Op1 = State.get(Def: getOperand(N: 1), Part); |
410 | Value *Op2 = State.get(Def: getOperand(N: 2), Part); |
411 | return Builder.CreateSelect(C: Cond, True: Op1, False: Op2, Name); |
412 | } |
413 | case VPInstruction::ActiveLaneMask: { |
414 | // Get first lane of vector induction variable. |
415 | Value *VIVElem0 = State.get(Def: getOperand(N: 0), Instance: VPIteration(Part, 0)); |
416 | // Get the original loop tripcount. |
417 | Value *ScalarTC = State.get(Def: getOperand(N: 1), Instance: VPIteration(Part, 0)); |
418 | |
419 | // If this part of the active lane mask is scalar, generate the CMP directly |
420 | // to avoid unnecessary extracts. |
421 | if (State.VF.isScalar()) |
422 | return Builder.CreateCmp(Pred: CmpInst::Predicate::ICMP_ULT, LHS: VIVElem0, RHS: ScalarTC, |
423 | Name); |
424 | |
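// Otherwise create llvm.get.active.lane.mask(VIVElem0, ScalarTC); lane I of
// the result is true iff VIVElem0 + I is less than ScalarTC.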
425 | auto *Int1Ty = Type::getInt1Ty(C&: Builder.getContext()); |
426 | auto *PredTy = VectorType::get(ElementType: Int1Ty, EC: State.VF); |
427 | return Builder.CreateIntrinsic(ID: Intrinsic::get_active_lane_mask, |
428 | Types: {PredTy, ScalarTC->getType()}, |
429 | Args: {VIVElem0, ScalarTC}, FMFSource: nullptr, Name); |
430 | } |
431 | case VPInstruction::FirstOrderRecurrenceSplice: { |
432 | // Generate code to combine the previous and current values in vector v3. |
433 | // |
434 | // vector.ph: |
435 | // v_init = vector(..., ..., ..., a[-1]) |
436 | // br vector.body |
437 | // |
438 | // vector.body |
439 | // i = phi [0, vector.ph], [i+4, vector.body] |
440 | // v1 = phi [v_init, vector.ph], [v2, vector.body] |
441 | // v2 = a[i, i+1, i+2, i+3]; |
442 | // v3 = vector(v1(3), v2(0, 1, 2)) |
443 | |
444 | // For the first part, use the recurrence phi (v1), otherwise v2. |
445 | auto *V1 = State.get(Def: getOperand(N: 0), Part: 0); |
446 | Value *PartMinus1 = Part == 0 ? V1 : State.get(Def: getOperand(N: 1), Part: Part - 1); |
447 | if (!PartMinus1->getType()->isVectorTy()) |
448 | return PartMinus1; |
449 | Value *V2 = State.get(Def: getOperand(N: 1), Part); |
450 | return Builder.CreateVectorSplice(V1: PartMinus1, V2, Imm: -1, Name); |
451 | } |
452 | case VPInstruction::CalculateTripCountMinusVF: { |
453 | if (Part != 0) |
454 | return State.get(Def: this, Part: 0, /*IsScalar*/ true); |
455 | |
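// Compute Step = VF * UF and return ScalarTC > Step ? ScalarTC - Step : 0.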
456 | Value *ScalarTC = State.get(Def: getOperand(N: 0), Instance: {0, 0}); |
457 | Value *Step = |
458 | createStepForVF(B&: Builder, Ty: ScalarTC->getType(), VF: State.VF, Step: State.UF); |
459 | Value *Sub = Builder.CreateSub(LHS: ScalarTC, RHS: Step); |
460 | Value *Cmp = Builder.CreateICmp(P: CmpInst::Predicate::ICMP_UGT, LHS: ScalarTC, RHS: Step); |
461 | Value *Zero = ConstantInt::get(Ty: ScalarTC->getType(), V: 0); |
462 | return Builder.CreateSelect(C: Cmp, True: Sub, False: Zero); |
463 | } |
464 | case VPInstruction::ExplicitVectorLength: { |
465 | // Compute EVL |
466 | auto GetEVL = [=](VPTransformState &State, Value *AVL) { |
467 | assert(AVL->getType()->isIntegerTy() && |
468 | "Requested vector length should be an integer." ); |
469 | |
470 | // TODO: Add support for MaxSafeDist for correct loop emission. |
471 | assert(State.VF.isScalable() && "Expected scalable vector factor." ); |
472 | Value *VFArg = State.Builder.getInt32(C: State.VF.getKnownMinValue()); |
473 | |
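// The assert above guarantees a scalable VF here; the final 'true' operand
// marks the requested vector length as scalable.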
474 | Value *EVL = State.Builder.CreateIntrinsic( |
475 | RetTy: State.Builder.getInt32Ty(), ID: Intrinsic::experimental_get_vector_length, |
476 | Args: {AVL, VFArg, State.Builder.getTrue()}); |
477 | return EVL; |
478 | }; |
479 | // TODO: Restructure this code with an explicit remainder loop, vsetvli can |
480 | // be outside of the main loop. |
481 | assert(Part == 0 && "No unrolling expected for predicated vectorization." ); |
482 | // Compute VTC - IV as the AVL (requested vector length). |
483 | Value *Index = State.get(Def: getOperand(N: 0), Instance: VPIteration(0, 0)); |
484 | Value *TripCount = State.get(Def: getOperand(N: 1), Instance: VPIteration(0, 0)); |
485 | Value *AVL = State.Builder.CreateSub(LHS: TripCount, RHS: Index); |
486 | Value *EVL = GetEVL(State, AVL); |
487 | return EVL; |
488 | } |
489 | case VPInstruction::CanonicalIVIncrementForPart: { |
490 | auto *IV = State.get(Def: getOperand(N: 0), Instance: VPIteration(0, 0)); |
491 | if (Part == 0) |
492 | return IV; |
493 | |
494 | // The canonical IV is incremented by the vectorization factor (num of SIMD |
495 | // elements) times the unroll part. |
496 | Value *Step = createStepForVF(B&: Builder, Ty: IV->getType(), VF: State.VF, Step: Part); |
497 | return Builder.CreateAdd(LHS: IV, RHS: Step, Name, HasNUW: hasNoUnsignedWrap(), |
498 | HasNSW: hasNoSignedWrap()); |
499 | } |
500 | case VPInstruction::BranchOnCond: { |
501 | if (Part != 0) |
502 | return nullptr; |
503 | |
504 | Value *Cond = State.get(Def: getOperand(N: 0), Instance: VPIteration(Part, 0)); |
505 | // Replace the temporary unreachable terminator with a new conditional |
506 | // branch, hooking it up to backward destination for exiting blocks now and |
507 | // to forward destination(s) later when they are created. |
508 | BranchInst *CondBr = |
509 | Builder.CreateCondBr(Cond, True: Builder.GetInsertBlock(), False: nullptr); |
510 | CondBr->setSuccessor(idx: 0, NewSucc: nullptr); |
511 | Builder.GetInsertBlock()->getTerminator()->eraseFromParent(); |
512 | |
513 | if (!getParent()->isExiting()) |
514 | return CondBr; |
515 | |
516 | VPRegionBlock *ParentRegion = getParent()->getParent(); |
517 | VPBasicBlock *Header = ParentRegion->getEntryBasicBlock(); |
518 | CondBr->setSuccessor(idx: 1, NewSucc: State.CFG.VPBB2IRBB[Header]); |
519 | return CondBr; |
520 | } |
521 | case VPInstruction::BranchOnCount: { |
522 | if (Part != 0) |
523 | return nullptr; |
524 | // First create the compare. |
525 | Value *IV = State.get(Def: getOperand(N: 0), Part, /*IsScalar*/ true); |
526 | Value *TC = State.get(Def: getOperand(N: 1), Part, /*IsScalar*/ true); |
527 | Value *Cond = Builder.CreateICmpEQ(LHS: IV, RHS: TC); |
528 | |
529 | // Now create the branch. |
530 | auto *Plan = getParent()->getPlan(); |
531 | VPRegionBlock *TopRegion = Plan->getVectorLoopRegion(); |
532 | VPBasicBlock *Header = TopRegion->getEntry()->getEntryBasicBlock(); |
533 | |
534 | // Replace the temporary unreachable terminator with a new conditional |
535 | // branch, hooking it up to backward destination (the header) now and to the |
536 | // forward destination (the exit/middle block) later when it is created. |
537 | // Note that CreateCondBr expects a valid BB as first argument, so we need |
538 | // to set it to nullptr later. |
539 | BranchInst *CondBr = Builder.CreateCondBr(Cond, True: Builder.GetInsertBlock(), |
540 | False: State.CFG.VPBB2IRBB[Header]); |
541 | CondBr->setSuccessor(idx: 0, NewSucc: nullptr); |
542 | Builder.GetInsertBlock()->getTerminator()->eraseFromParent(); |
543 | return CondBr; |
544 | } |
545 | case VPInstruction::ComputeReductionResult: { |
546 | if (Part != 0) |
547 | return State.get(Def: this, Part: 0, /*IsScalar*/ true); |
548 | |
549 | // FIXME: The cross-recipe dependency on VPReductionPHIRecipe is temporary |
550 | // and will be removed by breaking up the recipe further. |
551 | auto *PhiR = cast<VPReductionPHIRecipe>(Val: getOperand(N: 0)); |
552 | auto *OrigPhi = cast<PHINode>(Val: PhiR->getUnderlyingValue()); |
553 | // Get its reduction variable descriptor. |
554 | const RecurrenceDescriptor &RdxDesc = PhiR->getRecurrenceDescriptor(); |
555 | |
556 | RecurKind RK = RdxDesc.getRecurrenceKind(); |
557 | |
558 | VPValue *LoopExitingDef = getOperand(N: 1); |
559 | Type *PhiTy = OrigPhi->getType(); |
560 | VectorParts RdxParts(State.UF); |
561 | for (unsigned Part = 0; Part < State.UF; ++Part) |
562 | RdxParts[Part] = State.get(Def: LoopExitingDef, Part, IsScalar: PhiR->isInLoop()); |
563 | |
564 | // If the vector reduction can be performed in a smaller type, we truncate |
565 | // then extend the loop exit value to enable InstCombine to evaluate the |
566 | // entire expression in the smaller type. |
567 | // TODO: Handle this in truncateToMinBW. |
568 | if (State.VF.isVector() && PhiTy != RdxDesc.getRecurrenceType()) { |
569 | Type *RdxVecTy = VectorType::get(ElementType: RdxDesc.getRecurrenceType(), EC: State.VF); |
570 | for (unsigned Part = 0; Part < State.UF; ++Part) |
571 | RdxParts[Part] = Builder.CreateTrunc(V: RdxParts[Part], DestTy: RdxVecTy); |
572 | } |
573 | // Reduce all of the unrolled parts into a single vector. |
574 | Value *ReducedPartRdx = RdxParts[0]; |
575 | unsigned Op = RecurrenceDescriptor::getOpcode(Kind: RK); |
576 | if (RecurrenceDescriptor::isAnyOfRecurrenceKind(Kind: RK)) |
577 | Op = Instruction::Or; |
578 | |
579 | if (PhiR->isOrdered()) { |
580 | ReducedPartRdx = RdxParts[State.UF - 1]; |
581 | } else { |
582 | // Floating-point operations should have some FMF to enable the reduction. |
583 | IRBuilderBase::FastMathFlagGuard FMFG(Builder); |
584 | Builder.setFastMathFlags(RdxDesc.getFastMathFlags()); |
585 | for (unsigned Part = 1; Part < State.UF; ++Part) { |
586 | Value *RdxPart = RdxParts[Part]; |
587 | if (Op != Instruction::ICmp && Op != Instruction::FCmp) |
588 | ReducedPartRdx = Builder.CreateBinOp( |
589 | Opc: (Instruction::BinaryOps)Op, LHS: RdxPart, RHS: ReducedPartRdx, Name: "bin.rdx" ); |
590 | else |
591 | ReducedPartRdx = createMinMaxOp(Builder, RK, Left: ReducedPartRdx, Right: RdxPart); |
592 | } |
593 | } |
594 | |
595 | // Create the reduction after the loop. Note that inloop reductions create |
596 | // the target reduction in the loop using a Reduction recipe. |
597 | if ((State.VF.isVector() || |
598 | RecurrenceDescriptor::isAnyOfRecurrenceKind(Kind: RK)) && |
599 | !PhiR->isInLoop()) { |
600 | ReducedPartRdx = |
601 | createTargetReduction(B&: Builder, Desc: RdxDesc, Src: ReducedPartRdx, OrigPhi); |
602 | // If the reduction can be performed in a smaller type, we need to extend |
603 | // the reduction to the wider type before we branch to the original loop. |
604 | if (PhiTy != RdxDesc.getRecurrenceType()) |
605 | ReducedPartRdx = RdxDesc.isSigned() |
606 | ? Builder.CreateSExt(V: ReducedPartRdx, DestTy: PhiTy) |
607 | : Builder.CreateZExt(V: ReducedPartRdx, DestTy: PhiTy); |
608 | } |
609 | |
610 | // If there were stores of the reduction value to a uniform memory address |
611 | // inside the loop, create the final store here. |
612 | if (StoreInst *SI = RdxDesc.IntermediateStore) { |
613 | auto *NewSI = Builder.CreateAlignedStore( |
614 | Val: ReducedPartRdx, Ptr: SI->getPointerOperand(), Align: SI->getAlign()); |
615 | propagateMetadata(I: NewSI, VL: SI); |
616 | } |
617 | |
618 | return ReducedPartRdx; |
619 | } |
620 | case VPInstruction::ExtractFromEnd: { |
621 | if (Part != 0) |
622 | return State.get(Def: this, Part: 0, /*IsScalar*/ true); |
623 | |
624 | auto *CI = cast<ConstantInt>(Val: getOperand(N: 1)->getLiveInIRValue()); |
625 | unsigned Offset = CI->getZExtValue(); |
626 | assert(Offset > 0 && "Offset from end must be positive" ); |
627 | Value *Res; |
628 | if (State.VF.isVector()) { |
629 | assert(Offset <= State.VF.getKnownMinValue() && |
630 | "invalid offset to extract from" ); |
631 | // Extract lane VF - Offset from the operand. |
632 | Res = State.get( |
633 | Def: getOperand(N: 0), |
634 | Instance: VPIteration(State.UF - 1, VPLane::getLaneFromEnd(VF: State.VF, Offset))); |
635 | } else { |
636 | assert(Offset <= State.UF && "invalid offset to extract from" ); |
637 | // When loop is unrolled without vectorizing, retrieve UF - Offset. |
638 | Res = State.get(Def: getOperand(N: 0), Part: State.UF - Offset); |
639 | } |
640 | if (isa<ExtractElementInst>(Val: Res)) |
641 | Res->setName(Name); |
642 | return Res; |
643 | } |
644 | case VPInstruction::LogicalAnd: { |
645 | Value *A = State.get(Def: getOperand(N: 0), Part); |
646 | Value *B = State.get(Def: getOperand(N: 1), Part); |
647 | return Builder.CreateLogicalAnd(Cond1: A, Cond2: B, Name); |
648 | } |
649 | case VPInstruction::PtrAdd: { |
650 | assert(vputils::onlyFirstLaneUsed(this) && |
651 | "can only generate first lane for PtrAdd" ); |
652 | Value *Ptr = State.get(Def: getOperand(N: 0), Part, /* IsScalar */ true); |
653 | Value *Addend = State.get(Def: getOperand(N: 1), Part, /* IsScalar */ true); |
654 | return Builder.CreatePtrAdd(Ptr, Offset: Addend, Name); |
655 | } |
656 | case VPInstruction::ResumePhi: { |
657 | if (Part != 0) |
658 | return State.get(Def: this, Part: 0, /*IsScalar*/ true); |
659 | Value *IncomingFromVPlanPred = |
660 | State.get(Def: getOperand(N: 0), Part, /* IsScalar */ true); |
661 | Value *IncomingFromOtherPreds = |
662 | State.get(Def: getOperand(N: 1), Part, /* IsScalar */ true); |
663 | auto *NewPhi = |
664 | Builder.CreatePHI(Ty: IncomingFromOtherPreds->getType(), NumReservedValues: 2, Name); |
665 | BasicBlock *VPlanPred = |
666 | State.CFG |
667 | .VPBB2IRBB[cast<VPBasicBlock>(Val: getParent()->getSinglePredecessor())]; |
668 | NewPhi->addIncoming(V: IncomingFromVPlanPred, BB: VPlanPred); |
669 | for (auto *OtherPred : predecessors(BB: Builder.GetInsertBlock())) { |
670 | assert(OtherPred != VPlanPred && |
671 | "VPlan predecessors should not be connected yet" ); |
672 | NewPhi->addIncoming(V: IncomingFromOtherPreds, BB: OtherPred); |
673 | } |
674 | return NewPhi; |
675 | } |
676 | |
677 | default: |
678 | llvm_unreachable("Unsupported opcode for instruction" ); |
679 | } |
680 | } |
681 | |
682 | bool VPInstruction::isVectorToScalar() const { |
683 | return getOpcode() == VPInstruction::ExtractFromEnd || |
684 | getOpcode() == VPInstruction::ComputeReductionResult; |
685 | } |
686 | |
687 | bool VPInstruction::isSingleScalar() const { |
688 | return getOpcode() == VPInstruction::ResumePhi; |
689 | } |
690 | |
691 | #if !defined(NDEBUG) |
692 | bool VPInstruction::isFPMathOp() const { |
693 | // Inspired by FPMathOperator::classof. Notable differences are that we don't |
694 | // support Call and PHI opcodes here yet. |
695 | return Opcode == Instruction::FAdd || Opcode == Instruction::FMul || |
696 | Opcode == Instruction::FNeg || Opcode == Instruction::FSub || |
697 | Opcode == Instruction::FDiv || Opcode == Instruction::FRem || |
698 | Opcode == Instruction::FCmp || Opcode == Instruction::Select; |
699 | } |
700 | #endif |
701 | |
702 | void VPInstruction::execute(VPTransformState &State) { |
703 | assert(!State.Instance && "VPInstruction executing an Instance" ); |
704 | IRBuilderBase::FastMathFlagGuard FMFGuard(State.Builder); |
705 | assert((hasFastMathFlags() == isFPMathOp() || |
706 | getOpcode() == Instruction::Select) && |
707 | "Recipe not a FPMathOp but has fast-math flags?" ); |
708 | if (hasFastMathFlags()) |
709 | State.Builder.setFastMathFlags(getFastMathFlags()); |
710 | State.setDebugLocFrom(getDebugLoc()); |
711 | bool GeneratesPerFirstLaneOnly = canGenerateScalarForFirstLane() && |
712 | (vputils::onlyFirstLaneUsed(Def: this) || |
713 | isVectorToScalar() || isSingleScalar()); |
714 | bool GeneratesPerAllLanes = doesGeneratePerAllLanes(); |
715 | bool OnlyFirstPartUsed = vputils::onlyFirstPartUsed(Def: this); |
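// Generate a value for each unrolled part. If only the first part is used,
// later parts simply re-use the value generated for part 0.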
716 | for (unsigned Part = 0; Part < State.UF; ++Part) { |
717 | if (GeneratesPerAllLanes) { |
718 | for (unsigned Lane = 0, NumLanes = State.VF.getKnownMinValue(); |
719 | Lane != NumLanes; ++Lane) { |
720 | Value *GeneratedValue = generatePerLane(State, Lane: VPIteration(Part, Lane)); |
721 | assert(GeneratedValue && "generatePerLane must produce a value" ); |
722 | State.set(Def: this, V: GeneratedValue, Instance: VPIteration(Part, Lane)); |
723 | } |
724 | continue; |
725 | } |
726 | |
727 | if (Part != 0 && OnlyFirstPartUsed && hasResult()) { |
728 | Value *Part0 = State.get(Def: this, Part: 0, /*IsScalar*/ GeneratesPerFirstLaneOnly); |
729 | State.set(Def: this, V: Part0, Part, |
730 | /*IsScalar*/ GeneratesPerFirstLaneOnly); |
731 | continue; |
732 | } |
733 | |
734 | Value *GeneratedValue = generatePerPart(State, Part); |
735 | if (!hasResult()) |
736 | continue; |
737 | assert(GeneratedValue && "generatePerPart must produce a value" ); |
738 | assert((GeneratedValue->getType()->isVectorTy() == |
739 | !GeneratesPerFirstLaneOnly || |
740 | State.VF.isScalar()) && |
741 | "scalar value but not only first lane defined" ); |
742 | State.set(Def: this, V: GeneratedValue, Part, |
743 | /*IsScalar*/ GeneratesPerFirstLaneOnly); |
744 | } |
745 | } |
746 | |
747 | bool VPInstruction::onlyFirstLaneUsed(const VPValue *Op) const { |
748 | assert(is_contained(operands(), Op) && "Op must be an operand of the recipe" ); |
749 | if (Instruction::isBinaryOp(Opcode: getOpcode())) |
750 | return vputils::onlyFirstLaneUsed(Def: this); |
751 | |
752 | switch (getOpcode()) { |
753 | default: |
754 | return false; |
755 | case Instruction::ICmp: |
756 | case VPInstruction::PtrAdd: |
757 | // TODO: Cover additional opcodes. |
758 | return vputils::onlyFirstLaneUsed(Def: this); |
759 | case VPInstruction::ActiveLaneMask: |
760 | case VPInstruction::ExplicitVectorLength: |
761 | case VPInstruction::CalculateTripCountMinusVF: |
762 | case VPInstruction::CanonicalIVIncrementForPart: |
763 | case VPInstruction::BranchOnCount: |
764 | case VPInstruction::BranchOnCond: |
765 | case VPInstruction::ResumePhi: |
766 | return true; |
767 | }; |
768 | llvm_unreachable("switch should return" ); |
769 | } |
770 | |
771 | bool VPInstruction::onlyFirstPartUsed(const VPValue *Op) const { |
772 | assert(is_contained(operands(), Op) && "Op must be an operand of the recipe" ); |
773 | if (Instruction::isBinaryOp(Opcode: getOpcode())) |
774 | return vputils::onlyFirstPartUsed(Def: this); |
775 | |
776 | switch (getOpcode()) { |
777 | default: |
778 | return false; |
779 | case Instruction::ICmp: |
780 | case Instruction::Select: |
781 | return vputils::onlyFirstPartUsed(Def: this); |
782 | case VPInstruction::BranchOnCount: |
783 | case VPInstruction::BranchOnCond: |
784 | case VPInstruction::CanonicalIVIncrementForPart: |
785 | return true; |
786 | }; |
787 | llvm_unreachable("switch should return" ); |
788 | } |
789 | |
790 | #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) |
791 | void VPInstruction::dump() const { |
792 | VPSlotTracker SlotTracker(getParent()->getPlan()); |
793 | print(dbgs(), "" , SlotTracker); |
794 | } |
795 | |
796 | void VPInstruction::print(raw_ostream &O, const Twine &Indent, |
797 | VPSlotTracker &SlotTracker) const { |
798 | O << Indent << "EMIT " ; |
799 | |
800 | if (hasResult()) { |
801 | printAsOperand(O, SlotTracker); |
802 | O << " = " ; |
803 | } |
804 | |
805 | switch (getOpcode()) { |
806 | case VPInstruction::Not: |
807 | O << "not" ; |
808 | break; |
809 | case VPInstruction::SLPLoad: |
810 | O << "combined load" ; |
811 | break; |
812 | case VPInstruction::SLPStore: |
813 | O << "combined store" ; |
814 | break; |
815 | case VPInstruction::ActiveLaneMask: |
816 | O << "active lane mask" ; |
817 | break; |
818 | case VPInstruction::ResumePhi: |
819 | O << "resume-phi" ; |
820 | break; |
821 | case VPInstruction::ExplicitVectorLength: |
822 | O << "EXPLICIT-VECTOR-LENGTH" ; |
823 | break; |
824 | case VPInstruction::FirstOrderRecurrenceSplice: |
825 | O << "first-order splice" ; |
826 | break; |
827 | case VPInstruction::BranchOnCond: |
828 | O << "branch-on-cond" ; |
829 | break; |
830 | case VPInstruction::CalculateTripCountMinusVF: |
831 | O << "TC > VF ? TC - VF : 0" ; |
832 | break; |
833 | case VPInstruction::CanonicalIVIncrementForPart: |
834 | O << "VF * Part +" ; |
835 | break; |
836 | case VPInstruction::BranchOnCount: |
837 | O << "branch-on-count" ; |
838 | break; |
839 | case VPInstruction::ExtractFromEnd: |
840 | O << "extract-from-end" ; |
841 | break; |
842 | case VPInstruction::ComputeReductionResult: |
843 | O << "compute-reduction-result" ; |
844 | break; |
845 | case VPInstruction::LogicalAnd: |
846 | O << "logical-and" ; |
847 | break; |
848 | case VPInstruction::PtrAdd: |
849 | O << "ptradd" ; |
850 | break; |
851 | default: |
852 | O << Instruction::getOpcodeName(getOpcode()); |
853 | } |
854 | |
855 | printFlags(O); |
856 | printOperands(O, SlotTracker); |
857 | |
858 | if (auto DL = getDebugLoc()) { |
859 | O << ", !dbg " ; |
860 | DL.print(O); |
861 | } |
862 | } |
863 | #endif |
864 | |
865 | void VPWidenCallRecipe::execute(VPTransformState &State) { |
866 | assert(State.VF.isVector() && "not widening" ); |
867 | Function *CalledScalarFn = getCalledScalarFunction(); |
868 | assert(!isDbgInfoIntrinsic(CalledScalarFn->getIntrinsicID()) && |
869 | "DbgInfoIntrinsic should have been dropped during VPlan construction" ); |
870 | State.setDebugLocFrom(getDebugLoc()); |
871 | |
872 | bool UseIntrinsic = VectorIntrinsicID != Intrinsic::not_intrinsic; |
873 | FunctionType *VFTy = nullptr; |
874 | if (Variant) |
875 | VFTy = Variant->getFunctionType(); |
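// Generate one call per unrolled part, calling either the vector intrinsic
// or the provided vectorized function variant.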
876 | for (unsigned Part = 0; Part < State.UF; ++Part) { |
877 | SmallVector<Type *, 2> TysForDecl; |
878 | // Add return type if intrinsic is overloaded on it. |
879 | if (UseIntrinsic && |
880 | isVectorIntrinsicWithOverloadTypeAtArg(ID: VectorIntrinsicID, OpdIdx: -1)) |
881 | TysForDecl.push_back(Elt: VectorType::get( |
882 | ElementType: CalledScalarFn->getReturnType()->getScalarType(), EC: State.VF)); |
883 | SmallVector<Value *, 4> Args; |
884 | for (const auto &I : enumerate(First: arg_operands())) { |
885 | // Some intrinsics have a scalar argument - don't replace it with a |
886 | // vector. |
887 | Value *Arg; |
888 | if (UseIntrinsic && |
889 | isVectorIntrinsicWithScalarOpAtArg(ID: VectorIntrinsicID, ScalarOpdIdx: I.index())) |
890 | Arg = State.get(Def: I.value(), Instance: VPIteration(0, 0)); |
891 | // Some vectorized function variants may also take a scalar argument, |
892 | // e.g. linear parameters for pointers. This needs to be the scalar value |
893 | // from the start of the respective part when interleaving. |
894 | else if (VFTy && !VFTy->getParamType(i: I.index())->isVectorTy()) |
895 | Arg = State.get(Def: I.value(), Instance: VPIteration(Part, 0)); |
896 | else |
897 | Arg = State.get(Def: I.value(), Part); |
898 | if (UseIntrinsic && |
899 | isVectorIntrinsicWithOverloadTypeAtArg(ID: VectorIntrinsicID, OpdIdx: I.index())) |
900 | TysForDecl.push_back(Elt: Arg->getType()); |
901 | Args.push_back(Elt: Arg); |
902 | } |
903 | |
904 | Function *VectorF; |
905 | if (UseIntrinsic) { |
906 | // Use vector version of the intrinsic. |
907 | Module *M = State.Builder.GetInsertBlock()->getModule(); |
908 | VectorF = Intrinsic::getDeclaration(M, id: VectorIntrinsicID, Tys: TysForDecl); |
909 | assert(VectorF && "Can't retrieve vector intrinsic." ); |
910 | } else { |
911 | #ifndef NDEBUG |
912 | assert(Variant != nullptr && "Can't create vector function." ); |
913 | #endif |
914 | VectorF = Variant; |
915 | } |
916 | |
917 | auto *CI = cast_or_null<CallInst>(Val: getUnderlyingInstr()); |
918 | SmallVector<OperandBundleDef, 1> OpBundles; |
919 | if (CI) |
920 | CI->getOperandBundlesAsDefs(Defs&: OpBundles); |
921 | |
922 | CallInst *V = State.Builder.CreateCall(Callee: VectorF, Args, OpBundles); |
923 | |
924 | if (isa<FPMathOperator>(Val: V)) |
925 | V->copyFastMathFlags(I: CI); |
926 | |
927 | if (!V->getType()->isVoidTy()) |
928 | State.set(Def: this, V, Part); |
929 | State.addMetadata(To: V, From: CI); |
930 | } |
931 | } |
932 | |
933 | #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) |
934 | void VPWidenCallRecipe::print(raw_ostream &O, const Twine &Indent, |
935 | VPSlotTracker &SlotTracker) const { |
936 | O << Indent << "WIDEN-CALL " ; |
937 | |
938 | Function *CalledFn = getCalledScalarFunction(); |
939 | if (CalledFn->getReturnType()->isVoidTy()) |
940 | O << "void " ; |
941 | else { |
942 | printAsOperand(O, SlotTracker); |
943 | O << " = " ; |
944 | } |
945 | |
946 | O << "call @" << CalledFn->getName() << "(" ; |
947 | interleaveComma(arg_operands(), O, [&O, &SlotTracker](VPValue *Op) { |
948 | Op->printAsOperand(O, SlotTracker); |
949 | }); |
950 | O << ")" ; |
951 | |
952 | if (VectorIntrinsicID) |
953 | O << " (using vector intrinsic)" ; |
954 | else { |
955 | O << " (using library function" ; |
956 | if (Variant->hasName()) |
957 | O << ": " << Variant->getName(); |
958 | O << ")" ; |
959 | } |
960 | } |
961 | |
962 | void VPWidenSelectRecipe::print(raw_ostream &O, const Twine &Indent, |
963 | VPSlotTracker &SlotTracker) const { |
964 | O << Indent << "WIDEN-SELECT " ; |
965 | printAsOperand(O, SlotTracker); |
966 | O << " = select " ; |
967 | getOperand(0)->printAsOperand(O, SlotTracker); |
968 | O << ", " ; |
969 | getOperand(1)->printAsOperand(O, SlotTracker); |
970 | O << ", " ; |
971 | getOperand(2)->printAsOperand(O, SlotTracker); |
972 | O << (isInvariantCond() ? " (condition is loop invariant)" : "" ); |
973 | } |
974 | #endif |
975 | |
976 | void VPWidenSelectRecipe::execute(VPTransformState &State) { |
977 | State.setDebugLocFrom(getDebugLoc()); |
978 | |
979 | // The condition can be loop invariant but still defined inside the |
980 | // loop. This means that we can't just use the original 'cond' value. |
981 | // We have to take the 'vectorized' value and pick the first lane. |
982 | // Instcombine will make this a no-op. |
983 | auto *InvarCond = |
984 | isInvariantCond() ? State.get(Def: getCond(), Instance: VPIteration(0, 0)) : nullptr; |
985 | |
986 | for (unsigned Part = 0; Part < State.UF; ++Part) { |
987 | Value *Cond = InvarCond ? InvarCond : State.get(Def: getCond(), Part); |
988 | Value *Op0 = State.get(Def: getOperand(N: 1), Part); |
989 | Value *Op1 = State.get(Def: getOperand(N: 2), Part); |
990 | Value *Sel = State.Builder.CreateSelect(C: Cond, True: Op0, False: Op1); |
991 | State.set(Def: this, V: Sel, Part); |
992 | State.addMetadata(To: Sel, From: dyn_cast_or_null<Instruction>(Val: getUnderlyingValue())); |
993 | } |
994 | } |
995 | |
996 | VPRecipeWithIRFlags::FastMathFlagsTy::FastMathFlagsTy( |
997 | const FastMathFlags &FMF) { |
998 | AllowReassoc = FMF.allowReassoc(); |
999 | NoNaNs = FMF.noNaNs(); |
1000 | NoInfs = FMF.noInfs(); |
1001 | NoSignedZeros = FMF.noSignedZeros(); |
1002 | AllowReciprocal = FMF.allowReciprocal(); |
1003 | AllowContract = FMF.allowContract(); |
1004 | ApproxFunc = FMF.approxFunc(); |
1005 | } |
1006 | |
1007 | #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) |
1008 | void VPRecipeWithIRFlags::printFlags(raw_ostream &O) const { |
1009 | switch (OpType) { |
1010 | case OperationType::Cmp: |
1011 | O << " " << CmpInst::getPredicateName(getPredicate()); |
1012 | break; |
1013 | case OperationType::DisjointOp: |
1014 | if (DisjointFlags.IsDisjoint) |
1015 | O << " disjoint" ; |
1016 | break; |
1017 | case OperationType::PossiblyExactOp: |
1018 | if (ExactFlags.IsExact) |
1019 | O << " exact" ; |
1020 | break; |
1021 | case OperationType::OverflowingBinOp: |
1022 | if (WrapFlags.HasNUW) |
1023 | O << " nuw" ; |
1024 | if (WrapFlags.HasNSW) |
1025 | O << " nsw" ; |
1026 | break; |
1027 | case OperationType::FPMathOp: |
1028 | getFastMathFlags().print(O); |
1029 | break; |
1030 | case OperationType::GEPOp: |
1031 | if (GEPFlags.IsInBounds) |
1032 | O << " inbounds" ; |
1033 | break; |
1034 | case OperationType::NonNegOp: |
1035 | if (NonNegFlags.NonNeg) |
1036 | O << " nneg" ; |
1037 | break; |
1038 | case OperationType::Other: |
1039 | break; |
1040 | } |
1041 | if (getNumOperands() > 0) |
1042 | O << " " ; |
1043 | } |
1044 | #endif |
1045 | |
1046 | void VPWidenRecipe::execute(VPTransformState &State) { |
1047 | State.setDebugLocFrom(getDebugLoc()); |
1048 | auto &Builder = State.Builder; |
1049 | switch (Opcode) { |
1050 | case Instruction::Call: |
1051 | case Instruction::Br: |
1052 | case Instruction::PHI: |
1053 | case Instruction::GetElementPtr: |
1054 | case Instruction::Select: |
1055 | llvm_unreachable("This instruction is handled by a different recipe." ); |
1056 | case Instruction::UDiv: |
1057 | case Instruction::SDiv: |
1058 | case Instruction::SRem: |
1059 | case Instruction::URem: |
1060 | case Instruction::Add: |
1061 | case Instruction::FAdd: |
1062 | case Instruction::Sub: |
1063 | case Instruction::FSub: |
1064 | case Instruction::FNeg: |
1065 | case Instruction::Mul: |
1066 | case Instruction::FMul: |
1067 | case Instruction::FDiv: |
1068 | case Instruction::FRem: |
1069 | case Instruction::Shl: |
1070 | case Instruction::LShr: |
1071 | case Instruction::AShr: |
1072 | case Instruction::And: |
1073 | case Instruction::Or: |
1074 | case Instruction::Xor: { |
1075 | // Just widen unops and binops. |
1076 | for (unsigned Part = 0; Part < State.UF; ++Part) { |
1077 | SmallVector<Value *, 2> Ops; |
1078 | for (VPValue *VPOp : operands()) |
1079 | Ops.push_back(Elt: State.get(Def: VPOp, Part)); |
1080 | |
1081 | Value *V = Builder.CreateNAryOp(Opc: Opcode, Ops); |
1082 | |
1083 | if (auto *VecOp = dyn_cast<Instruction>(Val: V)) |
1084 | setFlags(VecOp); |
1085 | |
1086 | // Use this vector value for all users of the original instruction. |
1087 | State.set(Def: this, V, Part); |
1088 | State.addMetadata(To: V, From: dyn_cast_or_null<Instruction>(Val: getUnderlyingValue())); |
1089 | } |
1090 | |
1091 | break; |
1092 | } |
1093 | case Instruction::Freeze: { |
1094 | for (unsigned Part = 0; Part < State.UF; ++Part) { |
1095 | Value *Op = State.get(Def: getOperand(N: 0), Part); |
1096 | |
1097 | Value *Freeze = Builder.CreateFreeze(V: Op); |
1098 | State.set(Def: this, V: Freeze, Part); |
1099 | } |
1100 | break; |
1101 | } |
1102 | case Instruction::ICmp: |
1103 | case Instruction::FCmp: { |
1104 | // Widen compares. Generate vector compares. |
1105 | bool FCmp = Opcode == Instruction::FCmp; |
1106 | for (unsigned Part = 0; Part < State.UF; ++Part) { |
1107 | Value *A = State.get(Def: getOperand(N: 0), Part); |
1108 | Value *B = State.get(Def: getOperand(N: 1), Part); |
1109 | Value *C = nullptr; |
1110 | if (FCmp) { |
1111 | // Propagate fast math flags. |
1112 | IRBuilder<>::FastMathFlagGuard FMFG(Builder); |
1113 | if (auto *I = dyn_cast_or_null<Instruction>(Val: getUnderlyingValue())) |
1114 | Builder.setFastMathFlags(I->getFastMathFlags()); |
1115 | C = Builder.CreateFCmp(P: getPredicate(), LHS: A, RHS: B); |
1116 | } else { |
1117 | C = Builder.CreateICmp(P: getPredicate(), LHS: A, RHS: B); |
1118 | } |
1119 | State.set(Def: this, V: C, Part); |
1120 | State.addMetadata(To: C, From: dyn_cast_or_null<Instruction>(Val: getUnderlyingValue())); |
1121 | } |
1122 | |
1123 | break; |
1124 | } |
1125 | default: |
1126 | // This instruction is not vectorized by simple widening. |
1127 | LLVM_DEBUG(dbgs() << "LV: Found an unhandled opcode : " |
1128 | << Instruction::getOpcodeName(Opcode)); |
1129 | llvm_unreachable("Unhandled instruction!" ); |
1130 | } // end of switch. |
1131 | |
1132 | #if !defined(NDEBUG) |
1133 | // Verify that VPlan type inference results agree with the type of the |
1134 | // generated values. |
1135 | for (unsigned Part = 0; Part < State.UF; ++Part) { |
1136 | assert(VectorType::get(State.TypeAnalysis.inferScalarType(this), |
1137 | State.VF) == State.get(this, Part)->getType() && |
1138 | "inferred type and type from generated instructions do not match" ); |
1139 | } |
1140 | #endif |
1141 | } |
1142 | |
1143 | #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) |
1144 | void VPWidenRecipe::print(raw_ostream &O, const Twine &Indent, |
1145 | VPSlotTracker &SlotTracker) const { |
1146 | O << Indent << "WIDEN " ; |
1147 | printAsOperand(O, SlotTracker); |
1148 | O << " = " << Instruction::getOpcodeName(Opcode); |
1149 | printFlags(O); |
1150 | printOperands(O, SlotTracker); |
1151 | } |
1152 | #endif |
1153 | |
1154 | void VPWidenCastRecipe::execute(VPTransformState &State) { |
1155 | State.setDebugLocFrom(getDebugLoc()); |
1156 | auto &Builder = State.Builder; |
1157 | // Vectorize casts. |
1158 | assert(State.VF.isVector() && "Not vectorizing?" ); |
1159 | Type *DestTy = VectorType::get(ElementType: getResultType(), EC: State.VF); |
1160 | VPValue *Op = getOperand(N: 0); |
1161 | for (unsigned Part = 0; Part < State.UF; ++Part) { |
1162 | if (Part > 0 && Op->isLiveIn()) { |
1163 | // FIXME: Remove once explicit unrolling is implemented using VPlan. |
1164 | State.set(Def: this, V: State.get(Def: this, Part: 0), Part); |
1165 | continue; |
1166 | } |
1167 | Value *A = State.get(Def: Op, Part); |
1168 | Value *Cast = Builder.CreateCast(Op: Instruction::CastOps(Opcode), V: A, DestTy); |
1169 | State.set(Def: this, V: Cast, Part); |
1170 | State.addMetadata(To: Cast, From: cast_or_null<Instruction>(Val: getUnderlyingValue())); |
1171 | } |
1172 | } |
1173 | |
1174 | #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) |
1175 | void VPWidenCastRecipe::print(raw_ostream &O, const Twine &Indent, |
1176 | VPSlotTracker &SlotTracker) const { |
1177 | O << Indent << "WIDEN-CAST " ; |
1178 | printAsOperand(O, SlotTracker); |
1179 | O << " = " << Instruction::getOpcodeName(Opcode) << " " ; |
1180 | printFlags(O); |
1181 | printOperands(O, SlotTracker); |
1182 | O << " to " << *getResultType(); |
1183 | } |
1184 | #endif |
1185 | |
1186 | /// This function adds |
1187 | /// (StartIdx * Step, (StartIdx + 1) * Step, (StartIdx + 2) * Step, ...) |
1188 | /// to each vector element of \p Val. The sequence starts at \p StartIdx. |
1189 | /// \p BinOp is relevant for FP induction variables. |
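/// For example, with an integer induction, StartIdx = 0, VF = 4 and step s,
/// the result is Val + <0, s, 2 * s, 3 * s>.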
1190 | static Value *getStepVector(Value *Val, Value *StartIdx, Value *Step, |
1191 | Instruction::BinaryOps BinOp, ElementCount VF, |
1192 | IRBuilderBase &Builder) { |
1193 | assert(VF.isVector() && "only vector VFs are supported" ); |
1194 | |
1195 | // Create and check the types. |
1196 | auto *ValVTy = cast<VectorType>(Val: Val->getType()); |
1197 | ElementCount VLen = ValVTy->getElementCount(); |
1198 | |
1199 | Type *STy = Val->getType()->getScalarType(); |
1200 | assert((STy->isIntegerTy() || STy->isFloatingPointTy()) && |
1201 | "Induction Step must be an integer or FP" ); |
1202 | assert(Step->getType() == STy && "Step has wrong type" ); |
1203 | |
1204 | SmallVector<Constant *, 8> Indices; |
1205 | |
1206 | // Create a vector of consecutive numbers from zero to VF. |
1207 | VectorType *InitVecValVTy = ValVTy; |
1208 | if (STy->isFloatingPointTy()) { |
1209 | Type *InitVecValSTy = |
1210 | IntegerType::get(C&: STy->getContext(), NumBits: STy->getScalarSizeInBits()); |
1211 | InitVecValVTy = VectorType::get(ElementType: InitVecValSTy, EC: VLen); |
1212 | } |
1213 | Value *InitVec = Builder.CreateStepVector(DstType: InitVecValVTy); |
1214 | |
1215 | // Splat the StartIdx |
1216 | Value *StartIdxSplat = Builder.CreateVectorSplat(EC: VLen, V: StartIdx); |
1217 | |
1218 | if (STy->isIntegerTy()) { |
1219 | InitVec = Builder.CreateAdd(LHS: InitVec, RHS: StartIdxSplat); |
1220 | Step = Builder.CreateVectorSplat(EC: VLen, V: Step); |
1221 | assert(Step->getType() == Val->getType() && "Invalid step vec" ); |
1222 | // FIXME: The newly created binary instructions should contain nsw/nuw |
1223 | // flags, which can be found from the original scalar operations. |
1224 | Step = Builder.CreateMul(LHS: InitVec, RHS: Step); |
1225 | return Builder.CreateAdd(LHS: Val, RHS: Step, Name: "induction" ); |
1226 | } |
1227 | |
1228 | // Floating point induction. |
1229 | assert((BinOp == Instruction::FAdd || BinOp == Instruction::FSub) && |
1230 | "Binary Opcode should be specified for FP induction" ); |
1231 | InitVec = Builder.CreateUIToFP(V: InitVec, DestTy: ValVTy); |
1232 | InitVec = Builder.CreateFAdd(L: InitVec, R: StartIdxSplat); |
1233 | |
1234 | Step = Builder.CreateVectorSplat(EC: VLen, V: Step); |
1235 | Value *MulOp = Builder.CreateFMul(L: InitVec, R: Step); |
1236 | return Builder.CreateBinOp(Opc: BinOp, LHS: Val, RHS: MulOp, Name: "induction" ); |
1237 | } |
1238 | |
1239 | /// A helper function that returns an integer or floating-point constant with |
1240 | /// value C. |
1241 | static Constant *getSignedIntOrFpConstant(Type *Ty, int64_t C) { |
1242 | return Ty->isIntegerTy() ? ConstantInt::getSigned(Ty, V: C) |
1243 | : ConstantFP::get(Ty, V: C); |
1244 | } |
1245 | |
1246 | static Value *getRuntimeVFAsFloat(IRBuilderBase &B, Type *FTy, |
1247 | ElementCount VF) { |
1248 | assert(FTy->isFloatingPointTy() && "Expected floating point type!" ); |
1249 | Type *IntTy = IntegerType::get(C&: FTy->getContext(), NumBits: FTy->getScalarSizeInBits()); |
1250 | Value *RuntimeVF = getRuntimeVF(B, Ty: IntTy, VF); |
1251 | return B.CreateUIToFP(V: RuntimeVF, DestTy: FTy); |
1252 | } |
1253 | |
1254 | void VPWidenIntOrFpInductionRecipe::execute(VPTransformState &State) { |
1255 | assert(!State.Instance && "Int or FP induction being replicated." ); |
1256 | |
1257 | Value *Start = getStartValue()->getLiveInIRValue(); |
1258 | const InductionDescriptor &ID = getInductionDescriptor(); |
1259 | TruncInst *Trunc = getTruncInst(); |
1260 | IRBuilderBase &Builder = State.Builder; |
1261 | assert(IV->getType() == ID.getStartValue()->getType() && "Types must match" ); |
1262 | assert(State.VF.isVector() && "must have vector VF" ); |
1263 | |
1264 | // The value from the original loop to which we are mapping the new induction |
1265 | // variable. |
1266 | Instruction *EntryVal = Trunc ? cast<Instruction>(Val: Trunc) : IV; |
1267 | |
1268 | // Fast-math-flags propagate from the original induction instruction. |
1269 | IRBuilder<>::FastMathFlagGuard FMFG(Builder); |
1270 | if (ID.getInductionBinOp() && isa<FPMathOperator>(Val: ID.getInductionBinOp())) |
1271 | Builder.setFastMathFlags(ID.getInductionBinOp()->getFastMathFlags()); |
1272 | |
1273 | // Now do the actual transformations, and start with fetching the step value. |
1274 | Value *Step = State.get(Def: getStepValue(), Instance: VPIteration(0, 0)); |
1275 | |
1276 | assert((isa<PHINode>(EntryVal) || isa<TruncInst>(EntryVal)) && |
1277 | "Expected either an induction phi-node or a truncate of it!" ); |
1278 | |
1279 | // Construct the initial value of the vector IV in the vector loop preheader |
1280 | auto CurrIP = Builder.saveIP(); |
1281 | BasicBlock *VectorPH = State.CFG.getPreheaderBBFor(R: this); |
1282 | Builder.SetInsertPoint(VectorPH->getTerminator()); |
1283 | if (isa<TruncInst>(Val: EntryVal)) { |
1284 | assert(Start->getType()->isIntegerTy() && |
1285 | "Truncation requires an integer type" ); |
1286 | auto *TruncType = cast<IntegerType>(Val: EntryVal->getType()); |
1287 | Step = Builder.CreateTrunc(V: Step, DestTy: TruncType); |
1288 | Start = Builder.CreateCast(Op: Instruction::Trunc, V: Start, DestTy: TruncType); |
1289 | } |
1290 | |
1291 | Value *Zero = getSignedIntOrFpConstant(Ty: Start->getType(), C: 0); |
1292 | Value *SplatStart = Builder.CreateVectorSplat(EC: State.VF, V: Start); |
1293 | Value *SteppedStart = getStepVector( |
1294 | Val: SplatStart, StartIdx: Zero, Step, BinOp: ID.getInductionOpcode(), VF: State.VF, Builder&: State.Builder); |
1295 | |
1296 | // We create vector phi nodes for both integer and floating-point induction |
1297 | // variables. Here, we determine the kind of arithmetic we will perform. |
1298 | Instruction::BinaryOps AddOp; |
1299 | Instruction::BinaryOps MulOp; |
1300 | if (Step->getType()->isIntegerTy()) { |
1301 | AddOp = Instruction::Add; |
1302 | MulOp = Instruction::Mul; |
1303 | } else { |
1304 | AddOp = ID.getInductionOpcode(); |
1305 | MulOp = Instruction::FMul; |
1306 | } |
1307 | |
1308 | // Multiply the vectorization factor by the step using integer or |
1309 | // floating-point arithmetic as appropriate. |
1310 | Type *StepType = Step->getType(); |
1311 | Value *RuntimeVF; |
1312 | if (Step->getType()->isFloatingPointTy()) |
1313 | RuntimeVF = getRuntimeVFAsFloat(B&: Builder, FTy: StepType, VF: State.VF); |
1314 | else |
1315 | RuntimeVF = getRuntimeVF(B&: Builder, Ty: StepType, VF: State.VF); |
1316 | Value *Mul = Builder.CreateBinOp(Opc: MulOp, LHS: Step, RHS: RuntimeVF); |
1317 | |
1318 | // Create a vector splat to use in the induction update. |
1319 | // |
1320 | // FIXME: If the step is non-constant, we create the vector splat with |
1321 | // IRBuilder. IRBuilder can constant-fold the multiply, but it doesn't |
1322 | // handle a constant vector splat. |
1323 | Value *SplatVF = isa<Constant>(Val: Mul) |
1324 | ? ConstantVector::getSplat(EC: State.VF, Elt: cast<Constant>(Val: Mul)) |
1325 | : Builder.CreateVectorSplat(EC: State.VF, V: Mul); |
1326 | Builder.restoreIP(IP: CurrIP); |
1327 | |
1328 | // We may need to add the step a number of times, depending on the unroll |
1329 | // factor. The last of those goes into the PHI. |
1330 | PHINode *VecInd = PHINode::Create(Ty: SteppedStart->getType(), NumReservedValues: 2, NameStr: "vec.ind" ); |
1331 | VecInd->insertBefore(InsertPos: State.CFG.PrevBB->getFirstInsertionPt()); |
1332 | VecInd->setDebugLoc(EntryVal->getDebugLoc()); |
1333 | Instruction *LastInduction = VecInd; |
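// Part 0 uses the phi itself; each subsequent part adds SplatVF (VF * Step)
// to the previous part's value. The final add feeds the phi's backedge.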
1334 | for (unsigned Part = 0; Part < State.UF; ++Part) { |
1335 | State.set(Def: this, V: LastInduction, Part); |
1336 | |
1337 | if (isa<TruncInst>(Val: EntryVal)) |
1338 | State.addMetadata(To: LastInduction, From: EntryVal); |
1339 | |
1340 | LastInduction = cast<Instruction>( |
1341 | Val: Builder.CreateBinOp(Opc: AddOp, LHS: LastInduction, RHS: SplatVF, Name: "step.add" )); |
1342 | LastInduction->setDebugLoc(EntryVal->getDebugLoc()); |
1343 | } |
1344 | |
1345 | LastInduction->setName("vec.ind.next" ); |
1346 | VecInd->addIncoming(V: SteppedStart, BB: VectorPH); |
1347 | // Add induction update using an incorrect block temporarily. The phi node |
1348 | // will be fixed after VPlan execution. Note that at this point the latch |
1349 | // block cannot be used, as it does not exist yet. |
1350 | // TODO: Model increment value in VPlan, by turning the recipe into a |
1351 | // multi-def and a subclass of VPHeaderPHIRecipe. |
1352 | VecInd->addIncoming(V: LastInduction, BB: VectorPH); |
1353 | } |
1354 | |
1355 | #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) |
1356 | void VPWidenIntOrFpInductionRecipe::print(raw_ostream &O, const Twine &Indent, |
1357 | VPSlotTracker &SlotTracker) const { |
1358 | O << Indent << "WIDEN-INDUCTION" ; |
1359 | if (getTruncInst()) { |
1360 | O << "\\l\"" ; |
1361 | O << " +\n" << Indent << "\" " << VPlanIngredient(IV) << "\\l\"" ; |
1362 | O << " +\n" << Indent << "\" " ; |
1363 | getVPValue(0)->printAsOperand(O, SlotTracker); |
1364 | } else |
1365 | O << " " << VPlanIngredient(IV); |
1366 | |
1367 | O << ", " ; |
1368 | getStepValue()->printAsOperand(O, SlotTracker); |
1369 | } |
1370 | #endif |
1371 | |
1372 | bool VPWidenIntOrFpInductionRecipe::isCanonical() const { |
1373 | // The step may be defined by a recipe in the preheader (e.g. if it requires |
1374 | // SCEV expansion), but for the canonical induction the step is required to be |
1375 | // 1, which is represented as live-in. |
1376 | if (getStepValue()->getDefiningRecipe()) |
1377 | return false; |
1378 | auto *StepC = dyn_cast<ConstantInt>(Val: getStepValue()->getLiveInIRValue()); |
1379 | auto *StartC = dyn_cast<ConstantInt>(Val: getStartValue()->getLiveInIRValue()); |
1380 | auto *CanIV = cast<VPCanonicalIVPHIRecipe>(Val: &*getParent()->begin()); |
1381 | return StartC && StartC->isZero() && StepC && StepC->isOne() && |
1382 | getScalarType() == CanIV->getScalarType(); |
1383 | } |
1384 | |
1385 | #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) |
1386 | void VPDerivedIVRecipe::print(raw_ostream &O, const Twine &Indent, |
1387 | VPSlotTracker &SlotTracker) const { |
1388 | O << Indent; |
1389 | printAsOperand(O, SlotTracker); |
O << " = DERIVED-IV " ;
1391 | getStartValue()->printAsOperand(O, SlotTracker); |
1392 | O << " + " ; |
1393 | getOperand(1)->printAsOperand(O, SlotTracker); |
1394 | O << " * " ; |
1395 | getStepValue()->printAsOperand(O, SlotTracker); |
1396 | } |
1397 | #endif |
1398 | |
1399 | void VPScalarIVStepsRecipe::execute(VPTransformState &State) { |
1400 | // Fast-math-flags propagate from the original induction instruction. |
1401 | IRBuilder<>::FastMathFlagGuard FMFG(State.Builder); |
1402 | if (hasFastMathFlags()) |
1403 | State.Builder.setFastMathFlags(getFastMathFlags()); |
1404 | |
// Compute scalar induction steps. BaseIV is the scalar induction variable on
// which to base the steps; Step is the size of the step.
1407 | |
1408 | Value *BaseIV = State.get(Def: getOperand(N: 0), Instance: VPIteration(0, 0)); |
1409 | Value *Step = State.get(Def: getStepValue(), Instance: VPIteration(0, 0)); |
1410 | IRBuilderBase &Builder = State.Builder; |
1411 | |
1412 | // Ensure step has the same type as that of scalar IV. |
1413 | Type *BaseIVTy = BaseIV->getType()->getScalarType(); |
1414 | assert(BaseIVTy == Step->getType() && "Types of BaseIV and Step must match!" ); |
1415 | |
1416 | // We build scalar steps for both integer and floating-point induction |
1417 | // variables. Here, we determine the kind of arithmetic we will perform. |
1418 | Instruction::BinaryOps AddOp; |
1419 | Instruction::BinaryOps MulOp; |
1420 | if (BaseIVTy->isIntegerTy()) { |
1421 | AddOp = Instruction::Add; |
1422 | MulOp = Instruction::Mul; |
1423 | } else { |
1424 | AddOp = InductionOpcode; |
1425 | MulOp = Instruction::FMul; |
1426 | } |
1427 | |
1428 | // Determine the number of scalars we need to generate for each unroll |
1429 | // iteration. |
1430 | bool FirstLaneOnly = vputils::onlyFirstLaneUsed(Def: this); |
1431 | // Compute the scalar steps and save the results in State. |
1432 | Type *IntStepTy = |
1433 | IntegerType::get(C&: BaseIVTy->getContext(), NumBits: BaseIVTy->getScalarSizeInBits()); |
1434 | Type *VecIVTy = nullptr; |
1435 | Value *UnitStepVec = nullptr, *SplatStep = nullptr, *SplatIV = nullptr; |
1436 | if (!FirstLaneOnly && State.VF.isScalable()) { |
1437 | VecIVTy = VectorType::get(ElementType: BaseIVTy, EC: State.VF); |
1438 | UnitStepVec = |
1439 | Builder.CreateStepVector(DstType: VectorType::get(ElementType: IntStepTy, EC: State.VF)); |
1440 | SplatStep = Builder.CreateVectorSplat(EC: State.VF, V: Step); |
1441 | SplatIV = Builder.CreateVectorSplat(EC: State.VF, V: BaseIV); |
1442 | } |
1443 | |
1444 | unsigned StartPart = 0; |
1445 | unsigned EndPart = State.UF; |
1446 | unsigned StartLane = 0; |
1447 | unsigned EndLane = FirstLaneOnly ? 1 : State.VF.getKnownMinValue(); |
1448 | if (State.Instance) { |
1449 | StartPart = State.Instance->Part; |
1450 | EndPart = StartPart + 1; |
1451 | StartLane = State.Instance->Lane.getKnownLane(); |
1452 | EndLane = StartLane + 1; |
1453 | } |
1454 | for (unsigned Part = StartPart; Part < EndPart; ++Part) { |
1455 | Value *StartIdx0 = createStepForVF(B&: Builder, Ty: IntStepTy, VF: State.VF, Step: Part); |
1456 | |
1457 | if (!FirstLaneOnly && State.VF.isScalable()) { |
1458 | auto *SplatStartIdx = Builder.CreateVectorSplat(EC: State.VF, V: StartIdx0); |
1459 | auto *InitVec = Builder.CreateAdd(LHS: SplatStartIdx, RHS: UnitStepVec); |
1460 | if (BaseIVTy->isFloatingPointTy()) |
1461 | InitVec = Builder.CreateSIToFP(V: InitVec, DestTy: VecIVTy); |
1462 | auto *Mul = Builder.CreateBinOp(Opc: MulOp, LHS: InitVec, RHS: SplatStep); |
1463 | auto *Add = Builder.CreateBinOp(Opc: AddOp, LHS: SplatIV, RHS: Mul); |
1464 | State.set(Def: this, V: Add, Part); |
// It's also useful to record the per-lane values for the known minimum
// number of elements, so we do that below. This improves code quality
// when, for example, extracting the first element.
1468 | } |
1469 | |
1470 | if (BaseIVTy->isFloatingPointTy()) |
1471 | StartIdx0 = Builder.CreateSIToFP(V: StartIdx0, DestTy: BaseIVTy); |
1472 | |
1473 | for (unsigned Lane = StartLane; Lane < EndLane; ++Lane) { |
1474 | Value *StartIdx = Builder.CreateBinOp( |
1475 | Opc: AddOp, LHS: StartIdx0, RHS: getSignedIntOrFpConstant(Ty: BaseIVTy, C: Lane)); |
1476 | // The step returned by `createStepForVF` is a runtime-evaluated value |
1477 | // when VF is scalable. Otherwise, it should be folded into a Constant. |
1478 | assert((State.VF.isScalable() || isa<Constant>(StartIdx)) && |
1479 | "Expected StartIdx to be folded to a constant when VF is not " |
1480 | "scalable" ); |
1481 | auto *Mul = Builder.CreateBinOp(Opc: MulOp, LHS: StartIdx, RHS: Step); |
1482 | auto *Add = Builder.CreateBinOp(Opc: AddOp, LHS: BaseIV, RHS: Mul); |
1483 | State.set(Def: this, V: Add, Instance: VPIteration(Part, Lane)); |
1484 | } |
1485 | } |
1486 | } |
1487 | |
1488 | #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) |
1489 | void VPScalarIVStepsRecipe::print(raw_ostream &O, const Twine &Indent, |
1490 | VPSlotTracker &SlotTracker) const { |
1491 | O << Indent; |
1492 | printAsOperand(O, SlotTracker); |
1493 | O << " = SCALAR-STEPS " ; |
1494 | printOperands(O, SlotTracker); |
1495 | } |
1496 | #endif |
1497 | |
1498 | void VPWidenGEPRecipe::execute(VPTransformState &State) { |
1499 | assert(State.VF.isVector() && "not widening" ); |
1500 | auto *GEP = cast<GetElementPtrInst>(Val: getUnderlyingInstr()); |
1501 | // Construct a vector GEP by widening the operands of the scalar GEP as |
1502 | // necessary. We mark the vector GEP 'inbounds' if appropriate. A GEP |
1503 | // results in a vector of pointers when at least one operand of the GEP |
1504 | // is vector-typed. Thus, to keep the representation compact, we only use |
1505 | // vector-typed operands for loop-varying values. |
1506 | |
1507 | if (areAllOperandsInvariant()) { |
1508 | // If we are vectorizing, but the GEP has only loop-invariant operands, |
1509 | // the GEP we build (by only using vector-typed operands for |
1510 | // loop-varying values) would be a scalar pointer. Thus, to ensure we |
1511 | // produce a vector of pointers, we need to either arbitrarily pick an |
1512 | // operand to broadcast, or broadcast a clone of the original GEP. |
1513 | // Here, we broadcast a clone of the original. |
1514 | // |
1515 | // TODO: If at some point we decide to scalarize instructions having |
1516 | // loop-invariant operands, this special case will no longer be |
1517 | // required. We would add the scalarization decision to |
1518 | // collectLoopScalars() and teach getVectorValue() to broadcast |
1519 | // the lane-zero scalar value. |
1520 | SmallVector<Value *> Ops; |
1521 | for (unsigned I = 0, E = getNumOperands(); I != E; I++) |
1522 | Ops.push_back(Elt: State.get(Def: getOperand(N: I), Instance: VPIteration(0, 0))); |
1523 | |
1524 | auto *NewGEP = |
1525 | State.Builder.CreateGEP(Ty: GEP->getSourceElementType(), Ptr: Ops[0], |
1526 | IdxList: ArrayRef(Ops).drop_front(), Name: "" , NW: isInBounds()); |
1527 | for (unsigned Part = 0; Part < State.UF; ++Part) { |
1528 | Value *EntryPart = State.Builder.CreateVectorSplat(EC: State.VF, V: NewGEP); |
1529 | State.set(Def: this, V: EntryPart, Part); |
1530 | State.addMetadata(To: EntryPart, From: GEP); |
1531 | } |
1532 | } else { |
1533 | // If the GEP has at least one loop-varying operand, we are sure to |
1534 | // produce a vector of pointers. But if we are only unrolling, we want |
1535 | // to produce a scalar GEP for each unroll part. Thus, the GEP we |
1536 | // produce with the code below will be scalar (if VF == 1) or vector |
1537 | // (otherwise). Note that for the unroll-only case, we still maintain |
1538 | // values in the vector mapping with initVector, as we do for other |
1539 | // instructions. |
1540 | for (unsigned Part = 0; Part < State.UF; ++Part) { |
1541 | // The pointer operand of the new GEP. If it's loop-invariant, we |
1542 | // won't broadcast it. |
1543 | auto *Ptr = isPointerLoopInvariant() |
1544 | ? State.get(Def: getOperand(N: 0), Instance: VPIteration(0, 0)) |
1545 | : State.get(Def: getOperand(N: 0), Part); |
1546 | |
1547 | // Collect all the indices for the new GEP. If any index is |
1548 | // loop-invariant, we won't broadcast it. |
1549 | SmallVector<Value *, 4> Indices; |
1550 | for (unsigned I = 1, E = getNumOperands(); I < E; I++) { |
1551 | VPValue *Operand = getOperand(N: I); |
1552 | if (isIndexLoopInvariant(I: I - 1)) |
1553 | Indices.push_back(Elt: State.get(Def: Operand, Instance: VPIteration(0, 0))); |
1554 | else |
1555 | Indices.push_back(Elt: State.get(Def: Operand, Part)); |
1556 | } |
1557 | |
1558 | // Create the new GEP. Note that this GEP may be a scalar if VF == 1, |
1559 | // but it should be a vector, otherwise. |
1560 | auto *NewGEP = State.Builder.CreateGEP(Ty: GEP->getSourceElementType(), Ptr, |
1561 | IdxList: Indices, Name: "" , NW: isInBounds()); |
1562 | assert((State.VF.isScalar() || NewGEP->getType()->isVectorTy()) && |
1563 | "NewGEP is not a pointer vector" ); |
1564 | State.set(Def: this, V: NewGEP, Part); |
1565 | State.addMetadata(To: NewGEP, From: GEP); |
1566 | } |
1567 | } |
1568 | } |
1569 | |
1570 | #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) |
1571 | void VPWidenGEPRecipe::print(raw_ostream &O, const Twine &Indent, |
1572 | VPSlotTracker &SlotTracker) const { |
1573 | O << Indent << "WIDEN-GEP " ; |
1574 | O << (isPointerLoopInvariant() ? "Inv" : "Var" ); |
1575 | for (size_t I = 0; I < getNumOperands() - 1; ++I) |
1576 | O << "[" << (isIndexLoopInvariant(I) ? "Inv" : "Var" ) << "]" ; |
1577 | |
1578 | O << " " ; |
1579 | printAsOperand(O, SlotTracker); |
1580 | O << " = getelementptr" ; |
1581 | printFlags(O); |
1582 | printOperands(O, SlotTracker); |
1583 | } |
1584 | #endif |
1585 | |
void VPVectorPointerRecipe::execute(VPTransformState &State) {
1587 | auto &Builder = State.Builder; |
1588 | State.setDebugLocFrom(getDebugLoc()); |
1589 | for (unsigned Part = 0; Part < State.UF; ++Part) { |
1590 | // Calculate the pointer for the specific unroll-part. |
1591 | Value *PartPtr = nullptr; |
1592 | // Use i32 for the gep index type when the value is constant, |
1593 | // or query DataLayout for a more suitable index type otherwise. |
1594 | const DataLayout &DL = |
1595 | Builder.GetInsertBlock()->getDataLayout(); |
1596 | Type *IndexTy = State.VF.isScalable() && (IsReverse || Part > 0) |
1597 | ? DL.getIndexType(PtrTy: IndexedTy->getPointerTo()) |
1598 | : Builder.getInt32Ty(); |
1599 | Value *Ptr = State.get(Def: getOperand(N: 0), Instance: VPIteration(0, 0)); |
1600 | bool InBounds = isInBounds(); |
1601 | if (IsReverse) { |
1602 | // If the address is consecutive but reversed, then the |
1603 | // wide store needs to start at the last vector element. |
1604 | // RunTimeVF = VScale * VF.getKnownMinValue() |
1605 | // For fixed-width VScale is 1, then RunTimeVF = VF.getKnownMinValue() |
1606 | Value *RunTimeVF = getRuntimeVF(B&: Builder, Ty: IndexTy, VF: State.VF); |
1607 | // NumElt = -Part * RunTimeVF |
1608 | Value *NumElt = Builder.CreateMul( |
1609 | LHS: ConstantInt::get(Ty: IndexTy, V: -(int64_t)Part), RHS: RunTimeVF); |
1610 | // LastLane = 1 - RunTimeVF |
1611 | Value *LastLane = |
1612 | Builder.CreateSub(LHS: ConstantInt::get(Ty: IndexTy, V: 1), RHS: RunTimeVF); |
1613 | PartPtr = Builder.CreateGEP(Ty: IndexedTy, Ptr, IdxList: NumElt, Name: "" , NW: InBounds); |
1614 | PartPtr = Builder.CreateGEP(Ty: IndexedTy, Ptr: PartPtr, IdxList: LastLane, Name: "" , NW: InBounds); |
1615 | } else { |
1616 | Value *Increment = createStepForVF(B&: Builder, Ty: IndexTy, VF: State.VF, Step: Part); |
1617 | PartPtr = Builder.CreateGEP(Ty: IndexedTy, Ptr, IdxList: Increment, Name: "" , NW: InBounds); |
1618 | } |
1619 | |
1620 | State.set(Def: this, V: PartPtr, Part, /*IsScalar*/ true); |
1621 | } |
1622 | } |
1623 | |
1624 | #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) |
1625 | void VPVectorPointerRecipe::print(raw_ostream &O, const Twine &Indent, |
1626 | VPSlotTracker &SlotTracker) const { |
1627 | O << Indent; |
1628 | printAsOperand(O, SlotTracker); |
1629 | O << " = vector-pointer " ; |
1630 | if (IsReverse) |
1631 | O << "(reverse) " ; |
1632 | |
1633 | printOperands(O, SlotTracker); |
1634 | } |
1635 | #endif |
1636 | |
1637 | void VPBlendRecipe::execute(VPTransformState &State) { |
1638 | State.setDebugLocFrom(getDebugLoc()); |
1639 | // We know that all PHIs in non-header blocks are converted into |
1640 | // selects, so we don't have to worry about the insertion order and we |
1641 | // can just use the builder. |
1642 | // At this point we generate the predication tree. There may be |
1643 | // duplications since this is a simple recursive scan, but future |
1644 | // optimizations will clean it up. |
1645 | |
1646 | unsigned NumIncoming = getNumIncomingValues(); |
1647 | |
1648 | // Generate a sequence of selects of the form: |
1649 | // SELECT(Mask3, In3, |
1650 | // SELECT(Mask2, In2, |
1651 | // SELECT(Mask1, In1, |
1652 | // In0))) |
// Note that Mask0 is never used: lanes for which no path reaches this phi are
// essentially undef and are taken from In0.
1655 | VectorParts Entry(State.UF); |
1656 | bool OnlyFirstLaneUsed = vputils::onlyFirstLaneUsed(Def: this); |
1657 | for (unsigned In = 0; In < NumIncoming; ++In) { |
1658 | for (unsigned Part = 0; Part < State.UF; ++Part) { |
1659 | // We might have single edge PHIs (blocks) - use an identity |
1660 | // 'select' for the first PHI operand. |
1661 | Value *In0 = State.get(Def: getIncomingValue(Idx: In), Part, IsScalar: OnlyFirstLaneUsed); |
1662 | if (In == 0) |
1663 | Entry[Part] = In0; // Initialize with the first incoming value. |
1664 | else { |
1665 | // Select between the current value and the previous incoming edge |
1666 | // based on the incoming mask. |
1667 | Value *Cond = State.get(Def: getMask(Idx: In), Part, IsScalar: OnlyFirstLaneUsed); |
1668 | Entry[Part] = |
1669 | State.Builder.CreateSelect(C: Cond, True: In0, False: Entry[Part], Name: "predphi" ); |
1670 | } |
1671 | } |
1672 | } |
1673 | for (unsigned Part = 0; Part < State.UF; ++Part) |
1674 | State.set(Def: this, V: Entry[Part], Part, IsScalar: OnlyFirstLaneUsed); |
1675 | } |
1676 | |
1677 | #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) |
1678 | void VPBlendRecipe::print(raw_ostream &O, const Twine &Indent, |
1679 | VPSlotTracker &SlotTracker) const { |
1680 | O << Indent << "BLEND " ; |
1681 | printAsOperand(O, SlotTracker); |
1682 | O << " =" ; |
1683 | if (getNumIncomingValues() == 1) { |
1684 | // Not a User of any mask: not really blending, this is a |
1685 | // single-predecessor phi. |
1686 | O << " " ; |
1687 | getIncomingValue(0)->printAsOperand(O, SlotTracker); |
1688 | } else { |
1689 | for (unsigned I = 0, E = getNumIncomingValues(); I < E; ++I) { |
1690 | O << " " ; |
1691 | getIncomingValue(I)->printAsOperand(O, SlotTracker); |
1692 | if (I == 0) |
1693 | continue; |
1694 | O << "/" ; |
1695 | getMask(I)->printAsOperand(O, SlotTracker); |
1696 | } |
1697 | } |
1698 | } |
1699 | #endif |
1700 | |
1701 | void VPReductionRecipe::execute(VPTransformState &State) { |
1702 | assert(!State.Instance && "Reduction being replicated." ); |
1703 | Value *PrevInChain = State.get(Def: getChainOp(), Part: 0, /*IsScalar*/ true); |
1704 | RecurKind Kind = RdxDesc.getRecurrenceKind(); |
1705 | // Propagate the fast-math flags carried by the underlying instruction. |
1706 | IRBuilderBase::FastMathFlagGuard FMFGuard(State.Builder); |
1707 | State.Builder.setFastMathFlags(RdxDesc.getFastMathFlags()); |
1708 | for (unsigned Part = 0; Part < State.UF; ++Part) { |
1709 | Value *NewVecOp = State.get(Def: getVecOp(), Part); |
1710 | if (VPValue *Cond = getCondOp()) { |
1711 | Value *NewCond = State.get(Def: Cond, Part, IsScalar: State.VF.isScalar()); |
1712 | VectorType *VecTy = dyn_cast<VectorType>(Val: NewVecOp->getType()); |
1713 | Type *ElementTy = VecTy ? VecTy->getElementType() : NewVecOp->getType(); |
1714 | Value *Iden = RdxDesc.getRecurrenceIdentity(K: Kind, Tp: ElementTy, |
1715 | FMF: RdxDesc.getFastMathFlags()); |
1716 | if (State.VF.isVector()) { |
1717 | Iden = State.Builder.CreateVectorSplat(EC: VecTy->getElementCount(), V: Iden); |
1718 | } |
1719 | |
1720 | Value *Select = State.Builder.CreateSelect(C: NewCond, True: NewVecOp, False: Iden); |
1721 | NewVecOp = Select; |
1722 | } |
1723 | Value *NewRed; |
1724 | Value *NextInChain; |
1725 | if (IsOrdered) { |
1726 | if (State.VF.isVector()) |
1727 | NewRed = createOrderedReduction(B&: State.Builder, Desc: RdxDesc, Src: NewVecOp, |
1728 | Start: PrevInChain); |
1729 | else |
1730 | NewRed = State.Builder.CreateBinOp( |
1731 | Opc: (Instruction::BinaryOps)RdxDesc.getOpcode(Kind), LHS: PrevInChain, |
1732 | RHS: NewVecOp); |
1733 | PrevInChain = NewRed; |
1734 | } else { |
1735 | PrevInChain = State.get(Def: getChainOp(), Part, /*IsScalar*/ true); |
1736 | NewRed = createTargetReduction(B&: State.Builder, Desc: RdxDesc, Src: NewVecOp); |
1737 | } |
1738 | if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) { |
1739 | NextInChain = createMinMaxOp(Builder&: State.Builder, RK: RdxDesc.getRecurrenceKind(), |
1740 | Left: NewRed, Right: PrevInChain); |
1741 | } else if (IsOrdered) |
1742 | NextInChain = NewRed; |
1743 | else |
1744 | NextInChain = State.Builder.CreateBinOp( |
1745 | Opc: (Instruction::BinaryOps)RdxDesc.getOpcode(Kind), LHS: NewRed, RHS: PrevInChain); |
1746 | State.set(Def: this, V: NextInChain, Part, /*IsScalar*/ true); |
1747 | } |
1748 | } |
1749 | |
1750 | void VPReductionEVLRecipe::execute(VPTransformState &State) { |
1751 | assert(!State.Instance && "Reduction being replicated." ); |
1752 | assert(State.UF == 1 && |
1753 | "Expected only UF == 1 when vectorizing with explicit vector length." ); |
1754 | |
1755 | auto &Builder = State.Builder; |
1756 | // Propagate the fast-math flags carried by the underlying instruction. |
1757 | IRBuilderBase::FastMathFlagGuard FMFGuard(Builder); |
1758 | const RecurrenceDescriptor &RdxDesc = getRecurrenceDescriptor(); |
1759 | Builder.setFastMathFlags(RdxDesc.getFastMathFlags()); |
1760 | |
1761 | RecurKind Kind = RdxDesc.getRecurrenceKind(); |
1762 | Value *Prev = State.get(Def: getChainOp(), Part: 0, /*IsScalar*/ true); |
1763 | Value *VecOp = State.get(Def: getVecOp(), Part: 0); |
1764 | Value *EVL = State.get(Def: getEVL(), Instance: VPIteration(0, 0)); |
1765 | |
1766 | VectorBuilder VBuilder(Builder); |
1767 | VBuilder.setEVL(EVL); |
1768 | Value *Mask; |
1769 | // TODO: move the all-true mask generation into VectorBuilder. |
1770 | if (VPValue *CondOp = getCondOp()) |
1771 | Mask = State.get(Def: CondOp, Part: 0); |
1772 | else |
1773 | Mask = Builder.CreateVectorSplat(EC: State.VF, V: Builder.getTrue()); |
1774 | VBuilder.setMask(Mask); |
1775 | |
1776 | Value *NewRed; |
1777 | if (isOrdered()) { |
1778 | NewRed = createOrderedReduction(VB&: VBuilder, Desc: RdxDesc, Src: VecOp, Start: Prev); |
1779 | } else { |
1780 | NewRed = createSimpleTargetReduction(VB&: VBuilder, Src: VecOp, Desc: RdxDesc); |
1781 | if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) |
1782 | NewRed = createMinMaxOp(Builder, RK: Kind, Left: NewRed, Right: Prev); |
1783 | else |
1784 | NewRed = Builder.CreateBinOp( |
1785 | Opc: (Instruction::BinaryOps)RdxDesc.getOpcode(Kind), LHS: NewRed, RHS: Prev); |
1786 | } |
1787 | State.set(Def: this, V: NewRed, Part: 0, /*IsScalar*/ true); |
1788 | } |
1789 | |
1790 | #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) |
1791 | void VPReductionRecipe::print(raw_ostream &O, const Twine &Indent, |
1792 | VPSlotTracker &SlotTracker) const { |
1793 | O << Indent << "REDUCE " ; |
1794 | printAsOperand(O, SlotTracker); |
1795 | O << " = " ; |
1796 | getChainOp()->printAsOperand(O, SlotTracker); |
1797 | O << " +" ; |
1798 | if (isa<FPMathOperator>(getUnderlyingInstr())) |
1799 | O << getUnderlyingInstr()->getFastMathFlags(); |
1800 | O << " reduce." << Instruction::getOpcodeName(RdxDesc.getOpcode()) << " (" ; |
1801 | getVecOp()->printAsOperand(O, SlotTracker); |
1802 | if (isConditional()) { |
1803 | O << ", " ; |
1804 | getCondOp()->printAsOperand(O, SlotTracker); |
1805 | } |
1806 | O << ")" ; |
1807 | if (RdxDesc.IntermediateStore) |
1808 | O << " (with final reduction value stored in invariant address sank " |
1809 | "outside of loop)" ; |
1810 | } |
1811 | |
1812 | void VPReductionEVLRecipe::print(raw_ostream &O, const Twine &Indent, |
1813 | VPSlotTracker &SlotTracker) const { |
1814 | const RecurrenceDescriptor &RdxDesc = getRecurrenceDescriptor(); |
1815 | O << Indent << "REDUCE " ; |
1816 | printAsOperand(O, SlotTracker); |
1817 | O << " = " ; |
1818 | getChainOp()->printAsOperand(O, SlotTracker); |
1819 | O << " +" ; |
1820 | if (isa<FPMathOperator>(getUnderlyingInstr())) |
1821 | O << getUnderlyingInstr()->getFastMathFlags(); |
1822 | O << " vp.reduce." << Instruction::getOpcodeName(RdxDesc.getOpcode()) << " (" ; |
1823 | getVecOp()->printAsOperand(O, SlotTracker); |
1824 | O << ", " ; |
1825 | getEVL()->printAsOperand(O, SlotTracker); |
1826 | if (isConditional()) { |
1827 | O << ", " ; |
1828 | getCondOp()->printAsOperand(O, SlotTracker); |
1829 | } |
1830 | O << ")" ; |
1831 | if (RdxDesc.IntermediateStore) |
1832 | O << " (with final reduction value stored in invariant address sank " |
1833 | "outside of loop)" ; |
1834 | } |
1835 | #endif |
1836 | |
1837 | bool VPReplicateRecipe::shouldPack() const { |
1838 | // Find if the recipe is used by a widened recipe via an intervening |
1839 | // VPPredInstPHIRecipe. In this case, also pack the scalar values in a vector. |
1840 | return any_of(Range: users(), P: [](const VPUser *U) { |
1841 | if (auto *PredR = dyn_cast<VPPredInstPHIRecipe>(Val: U)) |
1842 | return any_of(Range: PredR->users(), P: [PredR](const VPUser *U) { |
1843 | return !U->usesScalars(Op: PredR); |
1844 | }); |
1845 | return false; |
1846 | }); |
1847 | } |
1848 | |
1849 | #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) |
1850 | void VPReplicateRecipe::print(raw_ostream &O, const Twine &Indent, |
1851 | VPSlotTracker &SlotTracker) const { |
1852 | O << Indent << (IsUniform ? "CLONE " : "REPLICATE " ); |
1853 | |
1854 | if (!getUnderlyingInstr()->getType()->isVoidTy()) { |
1855 | printAsOperand(O, SlotTracker); |
1856 | O << " = " ; |
1857 | } |
1858 | if (auto *CB = dyn_cast<CallBase>(getUnderlyingInstr())) { |
1859 | O << "call" ; |
1860 | printFlags(O); |
1861 | O << "@" << CB->getCalledFunction()->getName() << "(" ; |
1862 | interleaveComma(make_range(op_begin(), op_begin() + (getNumOperands() - 1)), |
1863 | O, [&O, &SlotTracker](VPValue *Op) { |
1864 | Op->printAsOperand(O, SlotTracker); |
1865 | }); |
1866 | O << ")" ; |
1867 | } else { |
1868 | O << Instruction::getOpcodeName(getUnderlyingInstr()->getOpcode()); |
1869 | printFlags(O); |
1870 | printOperands(O, SlotTracker); |
1871 | } |
1872 | |
1873 | if (shouldPack()) |
1874 | O << " (S->V)" ; |
1875 | } |
1876 | #endif |
1877 | |
1878 | /// Checks if \p C is uniform across all VFs and UFs. It is considered as such |
1879 | /// if it is either defined outside the vector region or its operand is known to |
1880 | /// be uniform across all VFs and UFs (e.g. VPDerivedIV or VPCanonicalIVPHI). |
1881 | /// TODO: Uniformity should be associated with a VPValue and there should be a |
1882 | /// generic way to check. |
1883 | static bool isUniformAcrossVFsAndUFs(VPScalarCastRecipe *C) { |
1884 | return C->isDefinedOutsideVectorRegions() || |
1885 | isa<VPDerivedIVRecipe>(Val: C->getOperand(N: 0)) || |
1886 | isa<VPCanonicalIVPHIRecipe>(Val: C->getOperand(N: 0)); |
1887 | } |
1888 | |
Value *VPScalarCastRecipe::generate(VPTransformState &State, unsigned Part) {
1890 | assert(vputils::onlyFirstLaneUsed(this) && |
1891 | "Codegen only implemented for first lane." ); |
1892 | switch (Opcode) { |
1893 | case Instruction::SExt: |
1894 | case Instruction::ZExt: |
1895 | case Instruction::Trunc: { |
1896 | // Note: SExt/ZExt not used yet. |
1897 | Value *Op = State.get(Def: getOperand(N: 0), Instance: VPIteration(Part, 0)); |
1898 | return State.Builder.CreateCast(Op: Instruction::CastOps(Opcode), V: Op, DestTy: ResultTy); |
1899 | } |
1900 | default: |
1901 | llvm_unreachable("opcode not implemented yet" ); |
1902 | } |
1903 | } |
1904 | |
void VPScalarCastRecipe::execute(VPTransformState &State) {
1906 | bool IsUniformAcrossVFsAndUFs = isUniformAcrossVFsAndUFs(C: this); |
1907 | for (unsigned Part = 0; Part != State.UF; ++Part) { |
1908 | Value *Res; |
1909 | // Only generate a single instance, if the recipe is uniform across UFs and |
1910 | // VFs. |
1911 | if (Part > 0 && IsUniformAcrossVFsAndUFs) |
1912 | Res = State.get(Def: this, Instance: VPIteration(0, 0)); |
1913 | else |
1914 | Res = generate(State, Part); |
1915 | State.set(Def: this, V: Res, Instance: VPIteration(Part, 0)); |
1916 | } |
1917 | } |
1918 | |
1919 | #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) |
void VPScalarCastRecipe::print(raw_ostream &O, const Twine &Indent,
1921 | VPSlotTracker &SlotTracker) const { |
1922 | O << Indent << "SCALAR-CAST " ; |
1923 | printAsOperand(O, SlotTracker); |
1924 | O << " = " << Instruction::getOpcodeName(Opcode) << " " ; |
1925 | printOperands(O, SlotTracker); |
1926 | O << " to " << *ResultTy; |
1927 | } |
1928 | #endif |
1929 | |
1930 | void VPBranchOnMaskRecipe::execute(VPTransformState &State) { |
1931 | assert(State.Instance && "Branch on Mask works only on single instance." ); |
1932 | |
1933 | unsigned Part = State.Instance->Part; |
1934 | unsigned Lane = State.Instance->Lane.getKnownLane(); |
1935 | |
1936 | Value *ConditionBit = nullptr; |
1937 | VPValue *BlockInMask = getMask(); |
1938 | if (BlockInMask) { |
1939 | ConditionBit = State.get(Def: BlockInMask, Part); |
1940 | if (ConditionBit->getType()->isVectorTy()) |
1941 | ConditionBit = State.Builder.CreateExtractElement( |
1942 | Vec: ConditionBit, Idx: State.Builder.getInt32(C: Lane)); |
1943 | } else // Block in mask is all-one. |
1944 | ConditionBit = State.Builder.getTrue(); |
1945 | |
1946 | // Replace the temporary unreachable terminator with a new conditional branch, |
1947 | // whose two destinations will be set later when they are created. |
1948 | auto *CurrentTerminator = State.CFG.PrevBB->getTerminator(); |
1949 | assert(isa<UnreachableInst>(CurrentTerminator) && |
1950 | "Expected to replace unreachable terminator with conditional branch." ); |
1951 | auto *CondBr = BranchInst::Create(IfTrue: State.CFG.PrevBB, IfFalse: nullptr, Cond: ConditionBit); |
1952 | CondBr->setSuccessor(idx: 0, NewSucc: nullptr); |
1953 | ReplaceInstWithInst(From: CurrentTerminator, To: CondBr); |
1954 | } |
1955 | |
1956 | void VPPredInstPHIRecipe::execute(VPTransformState &State) { |
1957 | assert(State.Instance && "Predicated instruction PHI works per instance." ); |
1958 | Instruction *ScalarPredInst = |
1959 | cast<Instruction>(Val: State.get(Def: getOperand(N: 0), Instance: *State.Instance)); |
1960 | BasicBlock *PredicatedBB = ScalarPredInst->getParent(); |
1961 | BasicBlock *PredicatingBB = PredicatedBB->getSinglePredecessor(); |
1962 | assert(PredicatingBB && "Predicated block has no single predecessor." ); |
1963 | assert(isa<VPReplicateRecipe>(getOperand(0)) && |
1964 | "operand must be VPReplicateRecipe" ); |
1965 | |
1966 | // By current pack/unpack logic we need to generate only a single phi node: if |
1967 | // a vector value for the predicated instruction exists at this point it means |
1968 | // the instruction has vector users only, and a phi for the vector value is |
1969 | // needed. In this case the recipe of the predicated instruction is marked to |
1970 | // also do that packing, thereby "hoisting" the insert-element sequence. |
1971 | // Otherwise, a phi node for the scalar value is needed. |
1972 | unsigned Part = State.Instance->Part; |
1973 | if (State.hasVectorValue(Def: getOperand(N: 0), Part)) { |
1974 | Value *VectorValue = State.get(Def: getOperand(N: 0), Part); |
1975 | InsertElementInst *IEI = cast<InsertElementInst>(Val: VectorValue); |
1976 | PHINode *VPhi = State.Builder.CreatePHI(Ty: IEI->getType(), NumReservedValues: 2); |
1977 | VPhi->addIncoming(V: IEI->getOperand(i_nocapture: 0), BB: PredicatingBB); // Unmodified vector. |
1978 | VPhi->addIncoming(V: IEI, BB: PredicatedBB); // New vector with inserted element. |
1979 | if (State.hasVectorValue(Def: this, Part)) |
1980 | State.reset(Def: this, V: VPhi, Part); |
1981 | else |
1982 | State.set(Def: this, V: VPhi, Part); |
1983 | // NOTE: Currently we need to update the value of the operand, so the next |
1984 | // predicated iteration inserts its generated value in the correct vector. |
1985 | State.reset(Def: getOperand(N: 0), V: VPhi, Part); |
1986 | } else { |
1987 | Type *PredInstType = getOperand(N: 0)->getUnderlyingValue()->getType(); |
1988 | PHINode *Phi = State.Builder.CreatePHI(Ty: PredInstType, NumReservedValues: 2); |
1989 | Phi->addIncoming(V: PoisonValue::get(T: ScalarPredInst->getType()), |
1990 | BB: PredicatingBB); |
1991 | Phi->addIncoming(V: ScalarPredInst, BB: PredicatedBB); |
1992 | if (State.hasScalarValue(Def: this, Instance: *State.Instance)) |
1993 | State.reset(Def: this, V: Phi, Instance: *State.Instance); |
1994 | else |
1995 | State.set(Def: this, V: Phi, Instance: *State.Instance); |
1996 | // NOTE: Currently we need to update the value of the operand, so the next |
1997 | // predicated iteration inserts its generated value in the correct vector. |
1998 | State.reset(Def: getOperand(N: 0), V: Phi, Instance: *State.Instance); |
1999 | } |
2000 | } |
2001 | |
2002 | #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) |
2003 | void VPPredInstPHIRecipe::print(raw_ostream &O, const Twine &Indent, |
2004 | VPSlotTracker &SlotTracker) const { |
2005 | O << Indent << "PHI-PREDICATED-INSTRUCTION " ; |
2006 | printAsOperand(O, SlotTracker); |
2007 | O << " = " ; |
2008 | printOperands(O, SlotTracker); |
2009 | } |
2010 | |
2011 | void VPWidenLoadRecipe::print(raw_ostream &O, const Twine &Indent, |
2012 | VPSlotTracker &SlotTracker) const { |
2013 | O << Indent << "WIDEN " ; |
2014 | printAsOperand(O, SlotTracker); |
2015 | O << " = load " ; |
2016 | printOperands(O, SlotTracker); |
2017 | } |
2018 | |
2019 | void VPWidenLoadEVLRecipe::print(raw_ostream &O, const Twine &Indent, |
2020 | VPSlotTracker &SlotTracker) const { |
2021 | O << Indent << "WIDEN " ; |
2022 | printAsOperand(O, SlotTracker); |
2023 | O << " = vp.load " ; |
2024 | printOperands(O, SlotTracker); |
2025 | } |
2026 | |
2027 | void VPWidenStoreRecipe::print(raw_ostream &O, const Twine &Indent, |
2028 | VPSlotTracker &SlotTracker) const { |
2029 | O << Indent << "WIDEN store " ; |
2030 | printOperands(O, SlotTracker); |
2031 | } |
2032 | |
2033 | void VPWidenStoreEVLRecipe::print(raw_ostream &O, const Twine &Indent, |
2034 | VPSlotTracker &SlotTracker) const { |
2035 | O << Indent << "WIDEN vp.store " ; |
2036 | printOperands(O, SlotTracker); |
2037 | } |
2038 | #endif |
2039 | |
2040 | static Value *createBitOrPointerCast(IRBuilderBase &Builder, Value *V, |
2041 | VectorType *DstVTy, const DataLayout &DL) { |
// Verify that V is a vector type with the same number of elements as DstVTy.
2043 | auto VF = DstVTy->getElementCount(); |
2044 | auto *SrcVecTy = cast<VectorType>(Val: V->getType()); |
2045 | assert(VF == SrcVecTy->getElementCount() && "Vector dimensions do not match" ); |
2046 | Type *SrcElemTy = SrcVecTy->getElementType(); |
2047 | Type *DstElemTy = DstVTy->getElementType(); |
2048 | assert((DL.getTypeSizeInBits(SrcElemTy) == DL.getTypeSizeInBits(DstElemTy)) && |
2049 | "Vector elements must have same size" ); |
2050 | |
2051 | // Do a direct cast if element types are castable. |
2052 | if (CastInst::isBitOrNoopPointerCastable(SrcTy: SrcElemTy, DestTy: DstElemTy, DL)) { |
2053 | return Builder.CreateBitOrPointerCast(V, DestTy: DstVTy); |
2054 | } |
// V cannot be cast directly to the desired vector type. This may happen when
// V is a floating point vector but DstVTy is a vector of pointers, or
// vice-versa. Handle this with a two-step bitcast through an intermediate
// integer type, i.e. Ptr <-> Int <-> Float.
2059 | assert((DstElemTy->isPointerTy() != SrcElemTy->isPointerTy()) && |
2060 | "Only one type should be a pointer type" ); |
2061 | assert((DstElemTy->isFloatingPointTy() != SrcElemTy->isFloatingPointTy()) && |
2062 | "Only one type should be a floating point type" ); |
2063 | Type *IntTy = |
2064 | IntegerType::getIntNTy(C&: V->getContext(), N: DL.getTypeSizeInBits(Ty: SrcElemTy)); |
2065 | auto *VecIntTy = VectorType::get(ElementType: IntTy, EC: VF); |
2066 | Value *CastVal = Builder.CreateBitOrPointerCast(V, DestTy: VecIntTy); |
2067 | return Builder.CreateBitOrPointerCast(V: CastVal, DestTy: DstVTy); |
2068 | } |
2069 | |
2070 | /// Return a vector containing interleaved elements from multiple |
2071 | /// smaller input vectors. |
2072 | static Value *interleaveVectors(IRBuilderBase &Builder, ArrayRef<Value *> Vals, |
2073 | const Twine &Name) { |
2074 | unsigned Factor = Vals.size(); |
2075 | assert(Factor > 1 && "Tried to interleave invalid number of vectors" ); |
2076 | |
2077 | VectorType *VecTy = cast<VectorType>(Val: Vals[0]->getType()); |
2078 | #ifndef NDEBUG |
2079 | for (Value *Val : Vals) |
2080 | assert(Val->getType() == VecTy && "Tried to interleave mismatched types" ); |
2081 | #endif |
2082 | |
2083 | // Scalable vectors cannot use arbitrary shufflevectors (only splats), so |
2084 | // must use intrinsics to interleave. |
2085 | if (VecTy->isScalableTy()) { |
2086 | VectorType *WideVecTy = VectorType::getDoubleElementsVectorType(VTy: VecTy); |
2087 | return Builder.CreateIntrinsic(RetTy: WideVecTy, ID: Intrinsic::vector_interleave2, |
2088 | Args: Vals, |
2089 | /*FMFSource=*/nullptr, Name); |
2090 | } |
2091 | |
2092 | // Fixed length. Start by concatenating all vectors into a wide vector. |
2093 | Value *WideVec = concatenateVectors(Builder, Vecs: Vals); |
2094 | |
2095 | // Interleave the elements into the wide vector. |
2096 | const unsigned NumElts = VecTy->getElementCount().getFixedValue(); |
2097 | return Builder.CreateShuffleVector( |
2098 | V: WideVec, Mask: createInterleaveMask(VF: NumElts, NumVecs: Factor), Name); |
2099 | } |
2100 | |
2101 | // Try to vectorize the interleave group that \p Instr belongs to. |
2102 | // |
2103 | // E.g. Translate following interleaved load group (factor = 3): |
2104 | // for (i = 0; i < N; i+=3) { |
2105 | // R = Pic[i]; // Member of index 0 |
2106 | // G = Pic[i+1]; // Member of index 1 |
2107 | // B = Pic[i+2]; // Member of index 2 |
2108 | // ... // do something to R, G, B |
2109 | // } |
2110 | // To: |
2111 | // %wide.vec = load <12 x i32> ; Read 4 tuples of R,G,B |
2112 | // %R.vec = shuffle %wide.vec, poison, <0, 3, 6, 9> ; R elements |
2113 | // %G.vec = shuffle %wide.vec, poison, <1, 4, 7, 10> ; G elements |
2114 | // %B.vec = shuffle %wide.vec, poison, <2, 5, 8, 11> ; B elements |
2115 | // |
2116 | // Or translate following interleaved store group (factor = 3): |
2117 | // for (i = 0; i < N; i+=3) { |
2118 | // ... do something to R, G, B |
2119 | // Pic[i] = R; // Member of index 0 |
2120 | // Pic[i+1] = G; // Member of index 1 |
2121 | // Pic[i+2] = B; // Member of index 2 |
2122 | // } |
2123 | // To: |
2124 | // %R_G.vec = shuffle %R.vec, %G.vec, <0, 1, 2, ..., 7> |
2125 | // %B_U.vec = shuffle %B.vec, poison, <0, 1, 2, 3, u, u, u, u> |
2126 | // %interleaved.vec = shuffle %R_G.vec, %B_U.vec, |
2127 | // <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11> ; Interleave R,G,B elements |
2128 | // store <12 x i32> %interleaved.vec ; Write 4 tuples of R,G,B |
2129 | void VPInterleaveRecipe::execute(VPTransformState &State) { |
2130 | assert(!State.Instance && "Interleave group being replicated." ); |
2131 | const InterleaveGroup<Instruction> *Group = IG; |
2132 | Instruction *Instr = Group->getInsertPos(); |
2133 | |
2134 | // Prepare for the vector type of the interleaved load/store. |
2135 | Type *ScalarTy = getLoadStoreType(I: Instr); |
2136 | unsigned InterleaveFactor = Group->getFactor(); |
2137 | auto *VecTy = VectorType::get(ElementType: ScalarTy, EC: State.VF * InterleaveFactor); |
2138 | |
2139 | // Prepare for the new pointers. |
2140 | SmallVector<Value *, 2> AddrParts; |
2141 | unsigned Index = Group->getIndex(Instr); |
2142 | |
2143 | // TODO: extend the masked interleaved-group support to reversed access. |
2144 | VPValue *BlockInMask = getMask(); |
2145 | assert((!BlockInMask || !Group->isReverse()) && |
2146 | "Reversed masked interleave-group not supported." ); |
2147 | |
2148 | Value *Idx; |
2149 | // If the group is reverse, adjust the index to refer to the last vector lane |
2150 | // instead of the first. We adjust the index from the first vector lane, |
2151 | // rather than directly getting the pointer for lane VF - 1, because the |
2152 | // pointer operand of the interleaved access is supposed to be uniform. For |
2153 | // uniform instructions, we're only required to generate a value for the |
2154 | // first vector lane in each unroll iteration. |
2155 | if (Group->isReverse()) { |
2156 | Value *RuntimeVF = |
2157 | getRuntimeVF(B&: State.Builder, Ty: State.Builder.getInt32Ty(), VF: State.VF); |
2158 | Idx = State.Builder.CreateSub(LHS: RuntimeVF, RHS: State.Builder.getInt32(C: 1)); |
2159 | Idx = State.Builder.CreateMul(LHS: Idx, |
2160 | RHS: State.Builder.getInt32(C: Group->getFactor())); |
2161 | Idx = State.Builder.CreateAdd(LHS: Idx, RHS: State.Builder.getInt32(C: Index)); |
2162 | Idx = State.Builder.CreateNeg(V: Idx); |
2163 | } else |
2164 | Idx = State.Builder.getInt32(C: -Index); |
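// E.g. (illustrative) for a reverse group with factor 3, member index 1 and a
// fixed VF of 4, Idx is -((4 - 1) * 3 + 1) = -10.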
2165 | |
2166 | VPValue *Addr = getAddr(); |
2167 | for (unsigned Part = 0; Part < State.UF; Part++) { |
2168 | Value *AddrPart = State.get(Def: Addr, Instance: VPIteration(Part, 0)); |
2169 | if (auto *I = dyn_cast<Instruction>(Val: AddrPart)) |
2170 | State.setDebugLocFrom(I->getDebugLoc()); |
2171 | |
// Note that the current instruction could be a member at any index. Adjust
// the address to point to the member of index 0.
2174 | // |
2175 | // E.g. a = A[i+1]; // Member of index 1 (Current instruction) |
2176 | // b = A[i]; // Member of index 0 |
2177 | // Current pointer is pointed to A[i+1], adjust it to A[i]. |
2178 | // |
2179 | // E.g. A[i+1] = a; // Member of index 1 |
2180 | // A[i] = b; // Member of index 0 |
2181 | // A[i+2] = c; // Member of index 2 (Current instruction) |
2182 | // Current pointer is pointed to A[i+2], adjust it to A[i]. |
2183 | |
2184 | bool InBounds = false; |
2185 | if (auto *gep = dyn_cast<GetElementPtrInst>(Val: AddrPart->stripPointerCasts())) |
2186 | InBounds = gep->isInBounds(); |
2187 | AddrPart = State.Builder.CreateGEP(Ty: ScalarTy, Ptr: AddrPart, IdxList: Idx, Name: "" , NW: InBounds); |
2188 | AddrParts.push_back(Elt: AddrPart); |
2189 | } |
2190 | |
2191 | State.setDebugLocFrom(Instr->getDebugLoc()); |
2192 | Value *PoisonVec = PoisonValue::get(T: VecTy); |
2193 | |
2194 | auto CreateGroupMask = [&BlockInMask, &State, &InterleaveFactor]( |
2195 | unsigned Part, Value *MaskForGaps) -> Value * { |
2196 | if (State.VF.isScalable()) { |
2197 | assert(!MaskForGaps && "Interleaved groups with gaps are not supported." ); |
2198 | assert(InterleaveFactor == 2 && |
2199 | "Unsupported deinterleave factor for scalable vectors" ); |
2200 | auto *BlockInMaskPart = State.get(Def: BlockInMask, Part); |
2201 | SmallVector<Value *, 2> Ops = {BlockInMaskPart, BlockInMaskPart}; |
2202 | auto *MaskTy = VectorType::get(ElementType: State.Builder.getInt1Ty(), |
2203 | NumElements: State.VF.getKnownMinValue() * 2, Scalable: true); |
2204 | return State.Builder.CreateIntrinsic( |
2205 | RetTy: MaskTy, ID: Intrinsic::vector_interleave2, Args: Ops, |
2206 | /*FMFSource=*/nullptr, Name: "interleaved.mask" ); |
2207 | } |
2208 | |
2209 | if (!BlockInMask) |
2210 | return MaskForGaps; |
2211 | |
2212 | Value *BlockInMaskPart = State.get(Def: BlockInMask, Part); |
2213 | Value *ShuffledMask = State.Builder.CreateShuffleVector( |
2214 | V: BlockInMaskPart, |
2215 | Mask: createReplicatedMask(ReplicationFactor: InterleaveFactor, VF: State.VF.getKnownMinValue()), |
2216 | Name: "interleaved.mask" ); |
2217 | return MaskForGaps ? State.Builder.CreateBinOp(Opc: Instruction::And, |
2218 | LHS: ShuffledMask, RHS: MaskForGaps) |
2219 | : ShuffledMask; |
2220 | }; |
2221 | |
2222 | const DataLayout &DL = Instr->getDataLayout(); |
2223 | // Vectorize the interleaved load group. |
2224 | if (isa<LoadInst>(Val: Instr)) { |
2225 | Value *MaskForGaps = nullptr; |
2226 | if (NeedsMaskForGaps) { |
2227 | MaskForGaps = createBitMaskForGaps(Builder&: State.Builder, |
2228 | VF: State.VF.getKnownMinValue(), Group: *Group); |
2229 | assert(MaskForGaps && "Mask for Gaps is required but it is null" ); |
2230 | } |
2231 | |
2232 | // For each unroll part, create a wide load for the group. |
2233 | SmallVector<Value *, 2> NewLoads; |
2234 | for (unsigned Part = 0; Part < State.UF; Part++) { |
2235 | Instruction *NewLoad; |
2236 | if (BlockInMask || MaskForGaps) { |
2237 | Value *GroupMask = CreateGroupMask(Part, MaskForGaps); |
2238 | NewLoad = State.Builder.CreateMaskedLoad(Ty: VecTy, Ptr: AddrParts[Part], |
2239 | Alignment: Group->getAlign(), Mask: GroupMask, |
2240 | PassThru: PoisonVec, Name: "wide.masked.vec" ); |
2241 | } else |
2242 | NewLoad = State.Builder.CreateAlignedLoad( |
2243 | Ty: VecTy, Ptr: AddrParts[Part], Align: Group->getAlign(), Name: "wide.vec" ); |
2244 | Group->addMetadata(NewInst: NewLoad); |
2245 | NewLoads.push_back(Elt: NewLoad); |
2246 | } |
2247 | |
2248 | ArrayRef<VPValue *> VPDefs = definedValues(); |
2249 | const DataLayout &DL = State.CFG.PrevBB->getDataLayout(); |
2250 | if (VecTy->isScalableTy()) { |
2251 | assert(InterleaveFactor == 2 && |
2252 | "Unsupported deinterleave factor for scalable vectors" ); |
2253 | |
2254 | for (unsigned Part = 0; Part < State.UF; ++Part) { |
2255 | // Scalable vectors cannot use arbitrary shufflevectors (only splats), |
2256 | // so must use intrinsics to deinterleave. |
2257 | Value *DI = State.Builder.CreateIntrinsic( |
2258 | ID: Intrinsic::vector_deinterleave2, Types: VecTy, Args: NewLoads[Part], |
2259 | /*FMFSource=*/nullptr, Name: "strided.vec" ); |
2260 | unsigned J = 0; |
2261 | for (unsigned I = 0; I < InterleaveFactor; ++I) { |
2262 | Instruction *Member = Group->getMember(Index: I); |
2263 | |
2264 | if (!Member) |
2265 | continue; |
2266 | |
2267 | Value *StridedVec = State.Builder.CreateExtractValue(Agg: DI, Idxs: I); |
// If this member has a different type, cast the result to that type.
2269 | if (Member->getType() != ScalarTy) { |
2270 | VectorType *OtherVTy = VectorType::get(ElementType: Member->getType(), EC: State.VF); |
2271 | StridedVec = |
2272 | createBitOrPointerCast(Builder&: State.Builder, V: StridedVec, DstVTy: OtherVTy, DL); |
2273 | } |
2274 | |
2275 | if (Group->isReverse()) |
2276 | StridedVec = |
2277 | State.Builder.CreateVectorReverse(V: StridedVec, Name: "reverse" ); |
2278 | |
2279 | State.set(Def: VPDefs[J], V: StridedVec, Part); |
2280 | ++J; |
2281 | } |
2282 | } |
2283 | |
2284 | return; |
2285 | } |
2286 | |
2287 | // For each member in the group, shuffle out the appropriate data from the |
2288 | // wide loads. |
2289 | unsigned J = 0; |
2290 | for (unsigned I = 0; I < InterleaveFactor; ++I) { |
2291 | Instruction *Member = Group->getMember(Index: I); |
2292 | |
2293 | // Skip the gaps in the group. |
2294 | if (!Member) |
2295 | continue; |
2296 | |
2297 | auto StrideMask = |
2298 | createStrideMask(Start: I, Stride: InterleaveFactor, VF: State.VF.getKnownMinValue()); |
2299 | for (unsigned Part = 0; Part < State.UF; Part++) { |
2300 | Value *StridedVec = State.Builder.CreateShuffleVector( |
2301 | V: NewLoads[Part], Mask: StrideMask, Name: "strided.vec" ); |
2302 | |
// If this member has a different type, cast the result to that type.
2304 | if (Member->getType() != ScalarTy) { |
2305 | assert(!State.VF.isScalable() && "VF is assumed to be non scalable." ); |
2306 | VectorType *OtherVTy = VectorType::get(ElementType: Member->getType(), EC: State.VF); |
2307 | StridedVec = |
2308 | createBitOrPointerCast(Builder&: State.Builder, V: StridedVec, DstVTy: OtherVTy, DL); |
2309 | } |
2310 | |
2311 | if (Group->isReverse()) |
2312 | StridedVec = State.Builder.CreateVectorReverse(V: StridedVec, Name: "reverse" ); |
2313 | |
2314 | State.set(Def: VPDefs[J], V: StridedVec, Part); |
2315 | } |
2316 | ++J; |
2317 | } |
2318 | return; |
2319 | } |
2320 | |
2321 | // The sub vector type for current instruction. |
2322 | auto *SubVT = VectorType::get(ElementType: ScalarTy, EC: State.VF); |
2323 | |
2324 | // Vectorize the interleaved store group. |
2325 | Value *MaskForGaps = |
2326 | createBitMaskForGaps(Builder&: State.Builder, VF: State.VF.getKnownMinValue(), Group: *Group); |
2327 | assert((!MaskForGaps || !State.VF.isScalable()) && |
2328 | "masking gaps for scalable vectors is not yet supported." ); |
2329 | ArrayRef<VPValue *> StoredValues = getStoredValues(); |
2330 | for (unsigned Part = 0; Part < State.UF; Part++) { |
2331 | // Collect the stored vector from each member. |
2332 | SmallVector<Value *, 4> StoredVecs; |
2333 | unsigned StoredIdx = 0; |
2334 | for (unsigned i = 0; i < InterleaveFactor; i++) { |
2335 | assert((Group->getMember(i) || MaskForGaps) && |
2336 | "Fail to get a member from an interleaved store group" ); |
2337 | Instruction *Member = Group->getMember(Index: i); |
2338 | |
2339 | // Skip the gaps in the group. |
2340 | if (!Member) { |
2341 | Value *Undef = PoisonValue::get(T: SubVT); |
2342 | StoredVecs.push_back(Elt: Undef); |
2343 | continue; |
2344 | } |
2345 | |
2346 | Value *StoredVec = State.get(Def: StoredValues[StoredIdx], Part); |
2347 | ++StoredIdx; |
2348 | |
2349 | if (Group->isReverse()) |
2350 | StoredVec = State.Builder.CreateVectorReverse(V: StoredVec, Name: "reverse" ); |
2351 | |
// If this member has a different type, cast it to a unified type.
2354 | if (StoredVec->getType() != SubVT) |
2355 | StoredVec = createBitOrPointerCast(Builder&: State.Builder, V: StoredVec, DstVTy: SubVT, DL); |
2356 | |
2357 | StoredVecs.push_back(Elt: StoredVec); |
2358 | } |
2359 | |
2360 | // Interleave all the smaller vectors into one wider vector. |
2361 | Value *IVec = |
2362 | interleaveVectors(Builder&: State.Builder, Vals: StoredVecs, Name: "interleaved.vec" ); |
2363 | Instruction *NewStoreInstr; |
2364 | if (BlockInMask || MaskForGaps) { |
2365 | Value *GroupMask = CreateGroupMask(Part, MaskForGaps); |
2366 | NewStoreInstr = State.Builder.CreateMaskedStore( |
2367 | Val: IVec, Ptr: AddrParts[Part], Alignment: Group->getAlign(), Mask: GroupMask); |
2368 | } else |
2369 | NewStoreInstr = State.Builder.CreateAlignedStore(Val: IVec, Ptr: AddrParts[Part], |
2370 | Align: Group->getAlign()); |
2371 | |
2372 | Group->addMetadata(NewInst: NewStoreInstr); |
2373 | } |
2374 | } |
2375 | |
2376 | #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) |
2377 | void VPInterleaveRecipe::print(raw_ostream &O, const Twine &Indent, |
2378 | VPSlotTracker &SlotTracker) const { |
2379 | O << Indent << "INTERLEAVE-GROUP with factor " << IG->getFactor() << " at " ; |
2380 | IG->getInsertPos()->printAsOperand(O, false); |
2381 | O << ", " ; |
2382 | getAddr()->printAsOperand(O, SlotTracker); |
2383 | VPValue *Mask = getMask(); |
2384 | if (Mask) { |
2385 | O << ", " ; |
2386 | Mask->printAsOperand(O, SlotTracker); |
2387 | } |
2388 | |
2389 | unsigned OpIdx = 0; |
2390 | for (unsigned i = 0; i < IG->getFactor(); ++i) { |
2391 | if (!IG->getMember(i)) |
2392 | continue; |
2393 | if (getNumStoreOperands() > 0) { |
2394 | O << "\n" << Indent << " store " ; |
2395 | getOperand(1 + OpIdx)->printAsOperand(O, SlotTracker); |
2396 | O << " to index " << i; |
2397 | } else { |
2398 | O << "\n" << Indent << " " ; |
2399 | getVPValue(OpIdx)->printAsOperand(O, SlotTracker); |
2400 | O << " = load from index " << i; |
2401 | } |
2402 | ++OpIdx; |
2403 | } |
2404 | } |
2405 | #endif |
2406 | |
2407 | void VPCanonicalIVPHIRecipe::execute(VPTransformState &State) { |
2408 | Value *Start = getStartValue()->getLiveInIRValue(); |
2409 | PHINode *EntryPart = PHINode::Create(Ty: Start->getType(), NumReservedValues: 2, NameStr: "index" ); |
2410 | EntryPart->insertBefore(InsertPos: State.CFG.PrevBB->getFirstInsertionPt()); |
2411 | |
2412 | BasicBlock *VectorPH = State.CFG.getPreheaderBBFor(R: this); |
2413 | EntryPart->addIncoming(V: Start, BB: VectorPH); |
2414 | EntryPart->setDebugLoc(getDebugLoc()); |
2415 | for (unsigned Part = 0, UF = State.UF; Part < UF; ++Part) |
2416 | State.set(Def: this, V: EntryPart, Part, /*IsScalar*/ true); |
2417 | } |
2418 | |
2419 | #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) |
2420 | void VPCanonicalIVPHIRecipe::print(raw_ostream &O, const Twine &Indent, |
2421 | VPSlotTracker &SlotTracker) const { |
2422 | O << Indent << "EMIT " ; |
2423 | printAsOperand(O, SlotTracker); |
2424 | O << " = CANONICAL-INDUCTION " ; |
2425 | printOperands(O, SlotTracker); |
2426 | } |
2427 | #endif |
2428 | |
2429 | bool VPCanonicalIVPHIRecipe::isCanonical( |
2430 | InductionDescriptor::InductionKind Kind, VPValue *Start, |
2431 | VPValue *Step) const { |
2432 | // Must be an integer induction. |
2433 | if (Kind != InductionDescriptor::IK_IntInduction) |
2434 | return false; |
2435 | // Start must match the start value of this canonical induction. |
2436 | if (Start != getStartValue()) |
2437 | return false; |
2438 | |
2439 | // If the step is defined by a recipe, it is not a ConstantInt. |
2440 | if (Step->getDefiningRecipe()) |
2441 | return false; |
2442 | |
2443 | ConstantInt *StepC = dyn_cast<ConstantInt>(Val: Step->getLiveInIRValue()); |
2444 | return StepC && StepC->isOne(); |
2445 | } |
2446 | |
2447 | bool VPWidenPointerInductionRecipe::onlyScalarsGenerated(bool IsScalable) { |
2448 | return IsScalarAfterVectorization && |
2449 | (!IsScalable || vputils::onlyFirstLaneUsed(Def: this)); |
2450 | } |
2451 | |
2452 | #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) |
2453 | void VPWidenPointerInductionRecipe::print(raw_ostream &O, const Twine &Indent, |
2454 | VPSlotTracker &SlotTracker) const { |
2455 | O << Indent << "EMIT " ; |
2456 | printAsOperand(O, SlotTracker); |
2457 | O << " = WIDEN-POINTER-INDUCTION " ; |
2458 | getStartValue()->printAsOperand(O, SlotTracker); |
2459 | O << ", " << *IndDesc.getStep(); |
2460 | } |
2461 | #endif |
2462 | |
2463 | void VPExpandSCEVRecipe::execute(VPTransformState &State) { |
2464 | assert(!State.Instance && "cannot be used in per-lane" ); |
2465 | const DataLayout &DL = State.CFG.PrevBB->getDataLayout(); |
2466 | SCEVExpander Exp(SE, DL, "induction" ); |
2467 | |
2468 | Value *Res = Exp.expandCodeFor(SH: Expr, Ty: Expr->getType(), |
2469 | I: &*State.Builder.GetInsertPoint()); |
2470 | assert(!State.ExpandedSCEVs.contains(Expr) && |
2471 | "Same SCEV expanded multiple times" ); |
2472 | State.ExpandedSCEVs[Expr] = Res; |
2473 | for (unsigned Part = 0, UF = State.UF; Part < UF; ++Part) |
2474 | State.set(Def: this, V: Res, Instance: {Part, 0}); |
2475 | } |
2476 | |
2477 | #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) |
2478 | void VPExpandSCEVRecipe::print(raw_ostream &O, const Twine &Indent, |
2479 | VPSlotTracker &SlotTracker) const { |
2480 | O << Indent << "EMIT " ; |
2481 | getVPSingleValue()->printAsOperand(O, SlotTracker); |
2482 | O << " = EXPAND SCEV " << *Expr; |
2483 | } |
2484 | #endif |
2485 | |
2486 | void VPWidenCanonicalIVRecipe::execute(VPTransformState &State) { |
2487 | Value *CanonicalIV = State.get(Def: getOperand(N: 0), Part: 0, /*IsScalar*/ true); |
2488 | Type *STy = CanonicalIV->getType(); |
2489 | IRBuilder<> Builder(State.CFG.PrevBB->getTerminator()); |
2490 | ElementCount VF = State.VF; |
2491 | Value *VStart = VF.isScalar() |
2492 | ? CanonicalIV |
2493 | : Builder.CreateVectorSplat(EC: VF, V: CanonicalIV, Name: "broadcast" ); |
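// E.g. with VF=4 and UF=2 (illustrative), part 0 below is the splat of the
// canonical IV plus <0, 1, 2, 3> and part 1 is the splat plus <4, 5, 6, 7>.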
2494 | for (unsigned Part = 0, UF = State.UF; Part < UF; ++Part) { |
2495 | Value *VStep = createStepForVF(B&: Builder, Ty: STy, VF, Step: Part); |
2496 | if (VF.isVector()) { |
2497 | VStep = Builder.CreateVectorSplat(EC: VF, V: VStep); |
2498 | VStep = |
2499 | Builder.CreateAdd(LHS: VStep, RHS: Builder.CreateStepVector(DstType: VStep->getType())); |
2500 | } |
2501 | Value *CanonicalVectorIV = Builder.CreateAdd(LHS: VStart, RHS: VStep, Name: "vec.iv" ); |
2502 | State.set(Def: this, V: CanonicalVectorIV, Part); |
2503 | } |
2504 | } |
2505 | |
2506 | #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) |
2507 | void VPWidenCanonicalIVRecipe::print(raw_ostream &O, const Twine &Indent, |
2508 | VPSlotTracker &SlotTracker) const { |
2509 | O << Indent << "EMIT " ; |
2510 | printAsOperand(O, SlotTracker); |
2511 | O << " = WIDEN-CANONICAL-INDUCTION " ; |
2512 | printOperands(O, SlotTracker); |
2513 | } |
2514 | #endif |
2515 | |
2516 | void VPFirstOrderRecurrencePHIRecipe::execute(VPTransformState &State) { |
2517 | auto &Builder = State.Builder; |
2518 | // Create a vector from the initial value. |
2519 | auto *VectorInit = getStartValue()->getLiveInIRValue(); |
2520 | |
2521 | Type *VecTy = State.VF.isScalar() |
2522 | ? VectorInit->getType() |
2523 | : VectorType::get(ElementType: VectorInit->getType(), EC: State.VF); |
2524 | |
2525 | BasicBlock *VectorPH = State.CFG.getPreheaderBBFor(R: this); |
2526 | if (State.VF.isVector()) { |
2527 | auto *IdxTy = Builder.getInt32Ty(); |
2528 | auto *One = ConstantInt::get(Ty: IdxTy, V: 1); |
2529 | IRBuilder<>::InsertPointGuard Guard(Builder); |
2530 | Builder.SetInsertPoint(VectorPH->getTerminator()); |
2531 | auto *RuntimeVF = getRuntimeVF(B&: Builder, Ty: IdxTy, VF: State.VF); |
2532 | auto *LastIdx = Builder.CreateSub(LHS: RuntimeVF, RHS: One); |
2533 | VectorInit = Builder.CreateInsertElement( |
2534 | Vec: PoisonValue::get(T: VecTy), NewElt: VectorInit, Idx: LastIdx, Name: "vector.recur.init" ); |
2535 | } |
2536 | |
2537 | // Create a phi node for the new recurrence. |
2538 | PHINode *EntryPart = PHINode::Create(Ty: VecTy, NumReservedValues: 2, NameStr: "vector.recur" ); |
2539 | EntryPart->insertBefore(InsertPos: State.CFG.PrevBB->getFirstInsertionPt()); |
2540 | EntryPart->addIncoming(V: VectorInit, BB: VectorPH); |
2541 | State.set(Def: this, V: EntryPart, Part: 0); |
2542 | } |
2543 | |
2544 | #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) |
2545 | void VPFirstOrderRecurrencePHIRecipe::print(raw_ostream &O, const Twine &Indent, |
2546 | VPSlotTracker &SlotTracker) const { |
2547 | O << Indent << "FIRST-ORDER-RECURRENCE-PHI " ; |
2548 | printAsOperand(O, SlotTracker); |
2549 | O << " = phi " ; |
2550 | printOperands(O, SlotTracker); |
2551 | } |
2552 | #endif |
2553 | |
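     | // Create the header phi(s) for a reduction. Scalar VFs and in-loop |
     | // reductions use scalar phis, and ordered reductions create a single phi |
     | // regardless of UF. Only part 0 is seeded with the start value; the |
     | // remaining parts start from the reduction identity. |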
2554 | void VPReductionPHIRecipe::execute(VPTransformState &State) { |
2555 | auto &Builder = State.Builder; |
2556 | |
2557 | // Reductions do not have to start at zero. They can start with |
2558 | // any loop-invariant value. |
2559 | VPValue *StartVPV = getStartValue(); |
2560 | Value *StartV = StartVPV->getLiveInIRValue(); |
2561 | |
2562 | // In order to support recurrences we need to be able to vectorize Phi nodes. |
2563 | // Phi nodes have cycles, so we need to vectorize them in two stages. This is |
2564 | // stage #1: We create a new vector PHI node with no incoming edges. We'll use |
2565 | // this value when we vectorize all of the instructions that use the PHI. |
2566 | bool ScalarPHI = State.VF.isScalar() || IsInLoop; |
2567 | Type *VecTy = ScalarPHI ? StartV->getType() |
2568 | : VectorType::get(ElementType: StartV->getType(), EC: State.VF); |
2569 | |
2570 | BasicBlock *HeaderBB = State.CFG.PrevBB; |
2571 | assert(State.CurrentVectorLoop->getHeader() == HeaderBB && |
2572 | "recipe must be in the vector loop header" ); |
2573 | unsigned LastPartForNewPhi = isOrdered() ? 1 : State.UF; |
2574 | for (unsigned Part = 0; Part < LastPartForNewPhi; ++Part) { |
2575 | Instruction *EntryPart = PHINode::Create(Ty: VecTy, NumReservedValues: 2, NameStr: "vec.phi" ); |
2576 | EntryPart->insertBefore(InsertPos: HeaderBB->getFirstInsertionPt()); |
2577 | State.set(Def: this, V: EntryPart, Part, IsScalar: IsInLoop); |
2578 | } |
2579 | |
2580 | BasicBlock *VectorPH = State.CFG.getPreheaderBBFor(R: this); |
2581 | |
2582 | Value *Iden = nullptr; |
2583 | RecurKind RK = RdxDesc.getRecurrenceKind(); |
2584 | if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind: RK) || |
2585 | RecurrenceDescriptor::isAnyOfRecurrenceKind(Kind: RK)) { |
2586 | // MinMax and AnyOf reductions have the start value as their identity. |
2587 | if (ScalarPHI) { |
2588 | Iden = StartV; |
2589 | } else { |
2590 | IRBuilderBase::InsertPointGuard IPBuilder(Builder); |
2591 | Builder.SetInsertPoint(VectorPH->getTerminator()); |
2592 | StartV = Iden = |
2593 | Builder.CreateVectorSplat(EC: State.VF, V: StartV, Name: "minmax.ident" ); |
2594 | } |
2595 | } else { |
2596 | Iden = RdxDesc.getRecurrenceIdentity(K: RK, Tp: VecTy->getScalarType(), |
2597 | FMF: RdxDesc.getFastMathFlags()); |
2598 | |
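     | // For vector phis, splat the identity and, in the preheader, insert the |
     | // start value into lane 0 so only that lane carries the start value. |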
2599 | if (!ScalarPHI) { |
2600 | Iden = Builder.CreateVectorSplat(EC: State.VF, V: Iden); |
2601 | IRBuilderBase::InsertPointGuard IPBuilder(Builder); |
2602 | Builder.SetInsertPoint(VectorPH->getTerminator()); |
2603 | Constant *Zero = Builder.getInt32(C: 0); |
2604 | StartV = Builder.CreateInsertElement(Vec: Iden, NewElt: StartV, Idx: Zero); |
2605 | } |
2606 | } |
2607 | |
2608 | for (unsigned Part = 0; Part < LastPartForNewPhi; ++Part) { |
2609 | Value *EntryPart = State.get(Def: this, Part, IsScalar: IsInLoop); |
2610 | // Make sure to add the reduction start value only to the |
2611 | // first unroll part. |
2612 | Value *StartVal = (Part == 0) ? StartV : Iden; |
2613 | cast<PHINode>(Val: EntryPart)->addIncoming(V: StartVal, BB: VectorPH); |
2614 | } |
2615 | } |
2616 | |
2617 | #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) |
2618 | void VPReductionPHIRecipe::print(raw_ostream &O, const Twine &Indent, |
2619 | VPSlotTracker &SlotTracker) const { |
2620 | O << Indent << "WIDEN-REDUCTION-PHI " ; |
2621 | |
2622 | printAsOperand(O, SlotTracker); |
2623 | O << " = phi " ; |
2624 | printOperands(O, SlotTracker); |
2625 | } |
2626 | #endif |
2627 | |
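     | // Create an empty vector phi for a widened phi in the VPlan-native path; |
     | // no incoming values are added here. |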
2628 | void VPWidenPHIRecipe::execute(VPTransformState &State) { |
2629 | assert(EnableVPlanNativePath && |
2630 | "Non-native vplans are not expected to have VPWidenPHIRecipes." ); |
2631 | |
2632 | Value *Op0 = State.get(Def: getOperand(N: 0), Part: 0); |
2633 | Type *VecTy = Op0->getType(); |
2634 | Value *VecPhi = State.Builder.CreatePHI(Ty: VecTy, NumReservedValues: 2, Name: "vec.phi" ); |
2635 | State.set(Def: this, V: VecPhi, Part: 0); |
2636 | } |
2637 | |
2638 | #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) |
2639 | void VPWidenPHIRecipe::print(raw_ostream &O, const Twine &Indent, |
2640 | VPSlotTracker &SlotTracker) const { |
2641 | O << Indent << "WIDEN-PHI " ; |
2642 | |
2643 | auto *OriginalPhi = cast<PHINode>(getUnderlyingValue()); |
2644 | // Unless all incoming values are modeled in VPlan, print the original PHI |
2645 | // directly. |
2646 | // TODO: Remove once all VPWidenPHIRecipe instances keep all relevant incoming |
2647 | // values as VPValues. |
2648 | if (getNumOperands() != OriginalPhi->getNumOperands()) { |
2649 | O << VPlanIngredient(OriginalPhi); |
2650 | return; |
2651 | } |
2652 | |
2653 | printAsOperand(O, SlotTracker); |
2654 | O << " = phi " ; |
2655 | printOperands(O, SlotTracker); |
2656 | } |
2657 | #endif |
2658 | |
2659 | // TODO: It would be good to use the existing VPWidenPHIRecipe instead and |
2660 | // remove VPActiveLaneMaskPHIRecipe. |
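     | // Create one header phi per unroll part for the active lane mask, seeded |
     | // with the start mask from the preheader; the updated mask for the |
     | // backedge is added later. |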
2661 | void VPActiveLaneMaskPHIRecipe::execute(VPTransformState &State) { |
2662 | BasicBlock *VectorPH = State.CFG.getPreheaderBBFor(R: this); |
2663 | for (unsigned Part = 0, UF = State.UF; Part < UF; ++Part) { |
2664 | Value *StartMask = State.get(Def: getOperand(N: 0), Part); |
2665 | PHINode *EntryPart = |
2666 | State.Builder.CreatePHI(Ty: StartMask->getType(), NumReservedValues: 2, Name: "active.lane.mask" ); |
2667 | EntryPart->addIncoming(V: StartMask, BB: VectorPH); |
2668 | EntryPart->setDebugLoc(getDebugLoc()); |
2669 | State.set(Def: this, V: EntryPart, Part); |
2670 | } |
2671 | } |
2672 | |
2673 | #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) |
2674 | void VPActiveLaneMaskPHIRecipe::print(raw_ostream &O, const Twine &Indent, |
2675 | VPSlotTracker &SlotTracker) const { |
2676 | O << Indent << "ACTIVE-LANE-MASK-PHI " ; |
2677 | |
2678 | printAsOperand(O, SlotTracker); |
2679 | O << " = phi " ; |
2680 | printOperands(O, SlotTracker); |
2681 | } |
2682 | #endif |
2683 | |
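     | // Create the scalar phi for the explicit-vector-length based IV. EVL |
     | // vectorization requires an unroll factor of 1, so only part 0 is set. |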
2684 | void VPEVLBasedIVPHIRecipe::execute(VPTransformState &State) { |
2685 | BasicBlock *VectorPH = State.CFG.getPreheaderBBFor(R: this); |
2686 | assert(State.UF == 1 && "Expected unroll factor 1 for VP vectorization." ); |
2687 | Value *Start = State.get(Def: getOperand(N: 0), Instance: VPIteration(0, 0)); |
2688 | PHINode *EntryPart = |
2689 | State.Builder.CreatePHI(Ty: Start->getType(), NumReservedValues: 2, Name: "evl.based.iv" ); |
2690 | EntryPart->addIncoming(V: Start, BB: VectorPH); |
2691 | EntryPart->setDebugLoc(getDebugLoc()); |
2692 | State.set(Def: this, V: EntryPart, Part: 0, /*IsScalar=*/true); |
2693 | } |
2694 | |
2695 | #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) |
2696 | void VPEVLBasedIVPHIRecipe::print(raw_ostream &O, const Twine &Indent, |
2697 | VPSlotTracker &SlotTracker) const { |
2698 | O << Indent << "EXPLICIT-VECTOR-LENGTH-BASED-IV-PHI " ; |
2699 | |
2700 | printAsOperand(O, SlotTracker); |
2701 | O << " = phi " ; |
2702 | printOperands(O, SlotTracker); |
2703 | } |
2704 | #endif |
2705 | |