1 | //===- VPlanRecipes.cpp - Implementations for VPlan recipes ---------------===// |
2 | // |
3 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
4 | // See https://llvm.org/LICENSE.txt for license information. |
5 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
6 | // |
7 | //===----------------------------------------------------------------------===// |
8 | /// |
9 | /// \file |
10 | /// This file contains implementations for different VPlan recipes. |
11 | /// |
12 | //===----------------------------------------------------------------------===// |
13 | |
14 | #include "LoopVectorizationPlanner.h" |
15 | #include "VPlan.h" |
16 | #include "VPlanAnalysis.h" |
17 | #include "VPlanHelpers.h" |
18 | #include "VPlanPatternMatch.h" |
19 | #include "VPlanUtils.h" |
20 | #include "llvm/ADT/STLExtras.h" |
21 | #include "llvm/ADT/SmallVector.h" |
22 | #include "llvm/ADT/Twine.h" |
23 | #include "llvm/Analysis/AssumptionCache.h" |
24 | #include "llvm/Analysis/IVDescriptors.h" |
25 | #include "llvm/Analysis/LoopInfo.h" |
26 | #include "llvm/IR/BasicBlock.h" |
27 | #include "llvm/IR/IRBuilder.h" |
28 | #include "llvm/IR/Instruction.h" |
29 | #include "llvm/IR/Instructions.h" |
30 | #include "llvm/IR/Intrinsics.h" |
31 | #include "llvm/IR/Type.h" |
32 | #include "llvm/IR/Value.h" |
33 | #include "llvm/Support/Casting.h" |
34 | #include "llvm/Support/CommandLine.h" |
35 | #include "llvm/Support/Debug.h" |
36 | #include "llvm/Support/raw_ostream.h" |
37 | #include "llvm/Transforms/Utils/BasicBlockUtils.h" |
38 | #include "llvm/Transforms/Utils/LoopUtils.h" |
39 | #include "llvm/Transforms/Utils/LoopVersioning.h" |
40 | #include "llvm/Transforms/Utils/ScalarEvolutionExpander.h" |
41 | #include <cassert> |
42 | |
43 | using namespace llvm; |
44 | |
45 | using VectorParts = SmallVector<Value *, 2>; |
46 | |
47 | #define LV_NAME "loop-vectorize" |
48 | #define DEBUG_TYPE LV_NAME |
49 | |
50 | bool VPRecipeBase::mayWriteToMemory() const { |
51 | switch (getVPDefID()) { |
52 | case VPExpressionSC: |
53 | return cast<VPExpressionRecipe>(Val: this)->mayReadOrWriteMemory(); |
54 | case VPInstructionSC: |
55 | return cast<VPInstruction>(Val: this)->opcodeMayReadOrWriteFromMemory(); |
56 | case VPInterleaveSC: |
57 | return cast<VPInterleaveRecipe>(Val: this)->getNumStoreOperands() > 0; |
58 | case VPWidenStoreEVLSC: |
59 | case VPWidenStoreSC: |
60 | return true; |
61 | case VPReplicateSC: |
62 | return cast<Instruction>(Val: getVPSingleValue()->getUnderlyingValue()) |
63 | ->mayWriteToMemory(); |
64 | case VPWidenCallSC: |
65 | return !cast<VPWidenCallRecipe>(Val: this) |
66 | ->getCalledScalarFunction() |
67 | ->onlyReadsMemory(); |
68 | case VPWidenIntrinsicSC: |
69 | return cast<VPWidenIntrinsicRecipe>(Val: this)->mayWriteToMemory(); |
70 | case VPBranchOnMaskSC: |
71 | case VPFirstOrderRecurrencePHISC: |
72 | case VPScalarIVStepsSC: |
73 | case VPPredInstPHISC: |
74 | return false; |
75 | case VPBlendSC: |
76 | case VPReductionEVLSC: |
77 | case VPReductionSC: |
78 | case VPVectorPointerSC: |
79 | case VPWidenCanonicalIVSC: |
80 | case VPWidenCastSC: |
81 | case VPWidenGEPSC: |
82 | case VPWidenIntOrFpInductionSC: |
83 | case VPWidenLoadEVLSC: |
84 | case VPWidenLoadSC: |
85 | case VPWidenPHISC: |
86 | case VPWidenSC: |
87 | case VPWidenSelectSC: { |
88 | const Instruction *I = |
89 | dyn_cast_or_null<Instruction>(Val: getVPSingleValue()->getUnderlyingValue()); |
90 | (void)I; |
91 | assert((!I || !I->mayWriteToMemory()) && |
92 | "underlying instruction may write to memory" ); |
93 | return false; |
94 | } |
95 | default: |
96 | return true; |
97 | } |
98 | } |
99 | |
100 | bool VPRecipeBase::mayReadFromMemory() const { |
101 | switch (getVPDefID()) { |
102 | case VPExpressionSC: |
103 | return cast<VPExpressionRecipe>(Val: this)->mayReadOrWriteMemory(); |
104 | case VPInstructionSC: |
105 | return cast<VPInstruction>(Val: this)->opcodeMayReadOrWriteFromMemory(); |
106 | case VPWidenLoadEVLSC: |
107 | case VPWidenLoadSC: |
108 | return true; |
109 | case VPReplicateSC: |
110 | return cast<Instruction>(Val: getVPSingleValue()->getUnderlyingValue()) |
111 | ->mayReadFromMemory(); |
112 | case VPWidenCallSC: |
113 | return !cast<VPWidenCallRecipe>(Val: this) |
114 | ->getCalledScalarFunction() |
115 | ->onlyWritesMemory(); |
116 | case VPWidenIntrinsicSC: |
117 | return cast<VPWidenIntrinsicRecipe>(Val: this)->mayReadFromMemory(); |
118 | case VPBranchOnMaskSC: |
119 | case VPFirstOrderRecurrencePHISC: |
120 | case VPPredInstPHISC: |
121 | case VPScalarIVStepsSC: |
122 | case VPWidenStoreEVLSC: |
123 | case VPWidenStoreSC: |
124 | return false; |
125 | case VPBlendSC: |
126 | case VPReductionEVLSC: |
127 | case VPReductionSC: |
128 | case VPVectorPointerSC: |
129 | case VPWidenCanonicalIVSC: |
130 | case VPWidenCastSC: |
131 | case VPWidenGEPSC: |
132 | case VPWidenIntOrFpInductionSC: |
133 | case VPWidenPHISC: |
134 | case VPWidenSC: |
135 | case VPWidenSelectSC: { |
136 | const Instruction *I = |
137 | dyn_cast_or_null<Instruction>(Val: getVPSingleValue()->getUnderlyingValue()); |
138 | (void)I; |
139 | assert((!I || !I->mayReadFromMemory()) && |
140 | "underlying instruction may read from memory" ); |
141 | return false; |
142 | } |
143 | default: |
144 | return true; |
145 | } |
146 | } |
147 | |
148 | bool VPRecipeBase::mayHaveSideEffects() const { |
149 | switch (getVPDefID()) { |
150 | case VPExpressionSC: |
151 | return cast<VPExpressionRecipe>(Val: this)->mayHaveSideEffects(); |
152 | case VPDerivedIVSC: |
153 | case VPFirstOrderRecurrencePHISC: |
154 | case VPPredInstPHISC: |
155 | case VPVectorEndPointerSC: |
156 | return false; |
157 | case VPInstructionSC: |
158 | return mayWriteToMemory(); |
159 | case VPWidenCallSC: { |
160 | Function *Fn = cast<VPWidenCallRecipe>(Val: this)->getCalledScalarFunction(); |
161 | return mayWriteToMemory() || !Fn->doesNotThrow() || !Fn->willReturn(); |
162 | } |
163 | case VPWidenIntrinsicSC: |
164 | return cast<VPWidenIntrinsicRecipe>(Val: this)->mayHaveSideEffects(); |
165 | case VPBlendSC: |
166 | case VPReductionEVLSC: |
167 | case VPReductionSC: |
168 | case VPScalarIVStepsSC: |
169 | case VPVectorPointerSC: |
170 | case VPWidenCanonicalIVSC: |
171 | case VPWidenCastSC: |
172 | case VPWidenGEPSC: |
173 | case VPWidenIntOrFpInductionSC: |
174 | case VPWidenPHISC: |
175 | case VPWidenPointerInductionSC: |
176 | case VPWidenSC: |
177 | case VPWidenSelectSC: { |
178 | const Instruction *I = |
179 | dyn_cast_or_null<Instruction>(Val: getVPSingleValue()->getUnderlyingValue()); |
180 | (void)I; |
181 | assert((!I || !I->mayHaveSideEffects()) && |
182 | "underlying instruction has side-effects" ); |
183 | return false; |
184 | } |
185 | case VPInterleaveSC: |
186 | return mayWriteToMemory(); |
187 | case VPWidenLoadEVLSC: |
188 | case VPWidenLoadSC: |
189 | case VPWidenStoreEVLSC: |
190 | case VPWidenStoreSC: |
191 | assert( |
192 | cast<VPWidenMemoryRecipe>(this)->getIngredient().mayHaveSideEffects() == |
193 | mayWriteToMemory() && |
194 | "mayHaveSideEffects result for ingredient differs from this " |
195 | "implementation" ); |
196 | return mayWriteToMemory(); |
197 | case VPReplicateSC: { |
198 | auto *R = cast<VPReplicateRecipe>(Val: this); |
199 | return R->getUnderlyingInstr()->mayHaveSideEffects(); |
200 | } |
201 | default: |
202 | return true; |
203 | } |
204 | } |
205 | |
206 | void VPRecipeBase::insertBefore(VPRecipeBase *InsertPos) { |
207 | assert(!Parent && "Recipe already in some VPBasicBlock" ); |
208 | assert(InsertPos->getParent() && |
209 | "Insertion position not in any VPBasicBlock" ); |
210 | InsertPos->getParent()->insert(Recipe: this, InsertPt: InsertPos->getIterator()); |
211 | } |
212 | |
213 | void VPRecipeBase::insertBefore(VPBasicBlock &BB, |
214 | iplist<VPRecipeBase>::iterator I) { |
215 | assert(!Parent && "Recipe already in some VPBasicBlock" ); |
216 | assert(I == BB.end() || I->getParent() == &BB); |
217 | BB.insert(Recipe: this, InsertPt: I); |
218 | } |
219 | |
220 | void VPRecipeBase::insertAfter(VPRecipeBase *InsertPos) { |
221 | assert(!Parent && "Recipe already in some VPBasicBlock" ); |
222 | assert(InsertPos->getParent() && |
223 | "Insertion position not in any VPBasicBlock" ); |
224 | InsertPos->getParent()->insert(Recipe: this, InsertPt: std::next(x: InsertPos->getIterator())); |
225 | } |
226 | |
227 | void VPRecipeBase::removeFromParent() { |
228 | assert(getParent() && "Recipe not in any VPBasicBlock" ); |
229 | getParent()->getRecipeList().remove(IT: getIterator()); |
230 | Parent = nullptr; |
231 | } |
232 | |
233 | iplist<VPRecipeBase>::iterator VPRecipeBase::eraseFromParent() { |
234 | assert(getParent() && "Recipe not in any VPBasicBlock" ); |
235 | return getParent()->getRecipeList().erase(where: getIterator()); |
236 | } |
237 | |
238 | void VPRecipeBase::moveAfter(VPRecipeBase *InsertPos) { |
239 | removeFromParent(); |
240 | insertAfter(InsertPos); |
241 | } |
242 | |
243 | void VPRecipeBase::moveBefore(VPBasicBlock &BB, |
244 | iplist<VPRecipeBase>::iterator I) { |
245 | removeFromParent(); |
246 | insertBefore(BB, I); |
247 | } |
248 | |
249 | InstructionCost VPRecipeBase::cost(ElementCount VF, VPCostContext &Ctx) { |
250 | // Get the underlying instruction for the recipe, if there is one. It is used |
251 | // to |
252 | // * decide if cost computation should be skipped for this recipe, |
253 | // * apply forced target instruction cost. |
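| // E.g. passing -force-target-instruction-cost=1 (assuming that is the |
| // cl::opt backing ForceTargetInstructionCost) makes every recipe with an |
| // underlying instruction report a cost of 1. |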
254 | Instruction *UI = nullptr; |
255 | if (auto *S = dyn_cast<VPSingleDefRecipe>(Val: this)) |
256 | UI = dyn_cast_or_null<Instruction>(Val: S->getUnderlyingValue()); |
257 | else if (auto *IG = dyn_cast<VPInterleaveRecipe>(Val: this)) |
258 | UI = IG->getInsertPos(); |
259 | else if (auto *WidenMem = dyn_cast<VPWidenMemoryRecipe>(Val: this)) |
260 | UI = &WidenMem->getIngredient(); |
261 | |
262 | InstructionCost RecipeCost; |
263 | if (UI && Ctx.skipCostComputation(UI, IsVector: VF.isVector())) { |
264 | RecipeCost = 0; |
265 | } else { |
266 | RecipeCost = computeCost(VF, Ctx); |
267 | if (UI && ForceTargetInstructionCost.getNumOccurrences() > 0 && |
268 | RecipeCost.isValid()) |
269 | RecipeCost = InstructionCost(ForceTargetInstructionCost); |
270 | } |
271 | |
272 | LLVM_DEBUG({ |
273 | dbgs() << "Cost of " << RecipeCost << " for VF " << VF << ": " ; |
274 | dump(); |
275 | }); |
276 | return RecipeCost; |
277 | } |
278 | |
279 | InstructionCost VPRecipeBase::computeCost(ElementCount VF, |
280 | VPCostContext &Ctx) const { |
281 | llvm_unreachable("subclasses should implement computeCost" ); |
282 | } |
283 | |
284 | bool VPRecipeBase::isPhi() const { |
285 | return (getVPDefID() >= VPFirstPHISC && getVPDefID() <= VPLastPHISC) || |
286 | (isa<VPInstruction>(Val: this) && |
287 | cast<VPInstruction>(Val: this)->getOpcode() == Instruction::PHI) || |
288 | isa<VPIRPhi>(Val: this); |
289 | } |
290 | |
291 | bool VPRecipeBase::isScalarCast() const { |
292 | auto *VPI = dyn_cast<VPInstruction>(Val: this); |
293 | return VPI && Instruction::isCast(Opcode: VPI->getOpcode()); |
294 | } |
295 | |
296 | InstructionCost |
297 | VPPartialReductionRecipe::computeCost(ElementCount VF, |
298 | VPCostContext &Ctx) const { |
299 | std::optional<unsigned> Opcode; |
300 | VPValue *Op = getOperand(N: 0); |
301 | VPRecipeBase *OpR = Op->getDefiningRecipe(); |
302 | |
303 | // If the partial reduction is predicated, operand 1 is a select; look through it. |
304 | using namespace llvm::VPlanPatternMatch; |
305 | if (match(V: getOperand(N: 1), P: m_Select(Op0: m_VPValue(), Op1: m_VPValue(V&: Op), Op2: m_VPValue()))) { |
306 | OpR = Op->getDefiningRecipe(); |
307 | } |
308 | |
309 | Type *InputTypeA = nullptr, *InputTypeB = nullptr; |
310 | TTI::PartialReductionExtendKind ExtAType = TTI::PR_None, |
311 | ExtBType = TTI::PR_None; |
312 | |
313 | auto GetExtendKind = [](VPRecipeBase *R) { |
314 | if (!R) |
315 | return TTI::PR_None; |
316 | auto *WidenCastR = dyn_cast<VPWidenCastRecipe>(Val: R); |
317 | if (!WidenCastR) |
318 | return TTI::PR_None; |
319 | if (WidenCastR->getOpcode() == Instruction::CastOps::ZExt) |
320 | return TTI::PR_ZeroExtend; |
321 | if (WidenCastR->getOpcode() == Instruction::CastOps::SExt) |
322 | return TTI::PR_SignExtend; |
323 | return TTI::PR_None; |
324 | }; |
325 | |
326 | // Pick out the opcode and type/extend information from a widen recipe, |
327 | // looking through a negation (sub 0, x) if present. |
328 | auto HandleWiden = [&](VPWidenRecipe *Widen) { |
329 | if (match(V: Widen, |
330 | P: m_Binary<Instruction::Sub>(Op0: m_SpecificInt(V: 0), Op1: m_VPValue(V&: Op)))) { |
331 | Widen = dyn_cast<VPWidenRecipe>(Val: Op->getDefiningRecipe()); |
332 | } |
333 | Opcode = Widen->getOpcode(); |
334 | VPRecipeBase *ExtAR = Widen->getOperand(N: 0)->getDefiningRecipe(); |
335 | VPRecipeBase *ExtBR = Widen->getOperand(N: 1)->getDefiningRecipe(); |
336 | InputTypeA = Ctx.Types.inferScalarType(V: ExtAR ? ExtAR->getOperand(N: 0) |
337 | : Widen->getOperand(N: 0)); |
338 | InputTypeB = Ctx.Types.inferScalarType(V: ExtBR ? ExtBR->getOperand(N: 0) |
339 | : Widen->getOperand(N: 1)); |
340 | ExtAType = GetExtendKind(ExtAR); |
341 | ExtBType = GetExtendKind(ExtBR); |
342 | }; |
343 | |
344 | if (isa<VPWidenCastRecipe>(Val: OpR)) { |
345 | InputTypeA = Ctx.Types.inferScalarType(V: OpR->getOperand(N: 0)); |
346 | ExtAType = GetExtendKind(OpR); |
347 | } else if (isa<VPReductionPHIRecipe>(Val: OpR)) { |
348 | auto RedPhiOp1R = getOperand(N: 1)->getDefiningRecipe(); |
349 | if (isa<VPWidenCastRecipe>(Val: RedPhiOp1R)) { |
350 | InputTypeA = Ctx.Types.inferScalarType(V: RedPhiOp1R->getOperand(N: 0)); |
351 | ExtAType = GetExtendKind(RedPhiOp1R); |
352 | } else if (auto Widen = dyn_cast<VPWidenRecipe>(Val: RedPhiOp1R)) |
353 | HandleWiden(Widen); |
354 | } else if (auto Widen = dyn_cast<VPWidenRecipe>(Val: OpR)) { |
355 | HandleWiden(Widen); |
356 | } else if (auto Reduction = dyn_cast<VPPartialReductionRecipe>(Val: OpR)) { |
357 | return Reduction->computeCost(VF, Ctx); |
358 | } |
359 | auto *PhiType = Ctx.Types.inferScalarType(V: getOperand(N: 1)); |
360 | return Ctx.TTI.getPartialReductionCost(Opcode: getOpcode(), InputTypeA, InputTypeB, |
361 | AccumType: PhiType, VF, OpAExtend: ExtAType, OpBExtend: ExtBType, |
362 | BinOp: Opcode, CostKind: Ctx.CostKind); |
363 | } |
364 | |
365 | void VPPartialReductionRecipe::execute(VPTransformState &State) { |
366 | auto &Builder = State.Builder; |
367 | |
368 | assert(getOpcode() == Instruction::Add && |
369 | "Unhandled partial reduction opcode" ); |
370 | |
371 | Value *BinOpVal = State.get(Def: getOperand(N: 1)); |
372 | Value *PhiVal = State.get(Def: getOperand(N: 0)); |
373 | assert(PhiVal && BinOpVal && "Phi and Mul must be set" ); |
374 | |
375 | Type *RetTy = PhiVal->getType(); |
376 | |
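| // The emitted call has roughly the form (illustrative types, e.g. a |
| // <4 x i32> accumulator with a <16 x i32> input): |
| //   %partial.reduce = call <4 x i32> |
| //       @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32( |
| //           <4 x i32> %phi, <16 x i32> %binop) |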
377 | CallInst *V = Builder.CreateIntrinsic( |
378 | RetTy, ID: Intrinsic::experimental_vector_partial_reduce_add, |
379 | Args: {PhiVal, BinOpVal}, FMFSource: nullptr, Name: "partial.reduce" ); |
380 | |
381 | State.set(Def: this, V); |
382 | } |
383 | |
384 | #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) |
385 | void VPPartialReductionRecipe::print(raw_ostream &O, const Twine &Indent, |
386 | VPSlotTracker &SlotTracker) const { |
387 | O << Indent << "PARTIAL-REDUCE " ; |
388 | printAsOperand(O, SlotTracker); |
389 | O << " = " << Instruction::getOpcodeName(getOpcode()) << " " ; |
390 | printOperands(O, SlotTracker); |
391 | } |
392 | #endif |
393 | |
394 | FastMathFlags VPIRFlags::getFastMathFlags() const { |
395 | assert(OpType == OperationType::FPMathOp && |
396 | "recipe doesn't have fast math flags" ); |
397 | FastMathFlags Res; |
398 | Res.setAllowReassoc(FMFs.AllowReassoc); |
399 | Res.setNoNaNs(FMFs.NoNaNs); |
400 | Res.setNoInfs(FMFs.NoInfs); |
401 | Res.setNoSignedZeros(FMFs.NoSignedZeros); |
402 | Res.setAllowReciprocal(FMFs.AllowReciprocal); |
403 | Res.setAllowContract(FMFs.AllowContract); |
404 | Res.setApproxFunc(FMFs.ApproxFunc); |
405 | return Res; |
406 | } |
407 | |
408 | #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) |
409 | void VPSingleDefRecipe::dump() const { VPDef::dump(); } |
410 | #endif |
411 | |
412 | template <unsigned PartOpIdx> |
413 | VPValue * |
414 | VPUnrollPartAccessor<PartOpIdx>::getUnrollPartOperand(VPUser &U) const { |
415 | if (U.getNumOperands() == PartOpIdx + 1) |
416 | return U.getOperand(N: PartOpIdx); |
417 | return nullptr; |
418 | } |
419 | |
420 | template <unsigned PartOpIdx> |
421 | unsigned VPUnrollPartAccessor<PartOpIdx>::getUnrollPart(VPUser &U) const { |
422 | if (auto *UnrollPartOp = getUnrollPartOperand(U)) |
423 | return cast<ConstantInt>(UnrollPartOp->getLiveInIRValue())->getZExtValue(); |
424 | return 0; |
425 | } |
426 | |
427 | namespace llvm { |
428 | template class VPUnrollPartAccessor<2>; |
429 | template class VPUnrollPartAccessor<3>; |
430 | } // namespace llvm |
431 | |
432 | VPInstruction::VPInstruction(unsigned Opcode, ArrayRef<VPValue *> Operands, |
433 | const VPIRFlags &Flags, DebugLoc DL, |
434 | const Twine &Name) |
435 | : VPRecipeWithIRFlags(VPDef::VPInstructionSC, Operands, Flags, DL), |
436 | VPIRMetadata(), Opcode(Opcode), Name(Name.str()) { |
437 | assert(flagsValidForOpcode(getOpcode()) && |
438 | "Set flags not supported for the provided opcode" ); |
439 | assert((getNumOperandsForOpcode(Opcode) == -1u || |
440 | getNumOperandsForOpcode(Opcode) == getNumOperands()) && |
441 | "number of operands does not match opcode" ); |
442 | } |
443 | |
444 | #ifndef NDEBUG |
445 | unsigned VPInstruction::getNumOperandsForOpcode(unsigned Opcode) { |
446 | if (Instruction::isUnaryOp(Opcode) || Instruction::isCast(Opcode)) |
447 | return 1; |
448 | |
449 | if (Instruction::isBinaryOp(Opcode)) |
450 | return 2; |
451 | |
452 | switch (Opcode) { |
453 | case VPInstruction::StepVector: |
454 | return 0; |
455 | case Instruction::Alloca: |
456 | case Instruction::ExtractValue: |
457 | case Instruction::Freeze: |
458 | case Instruction::Load: |
459 | case VPInstruction::AnyOf: |
460 | case VPInstruction::BranchOnCond: |
461 | case VPInstruction::CalculateTripCountMinusVF: |
462 | case VPInstruction::CanonicalIVIncrementForPart: |
463 | case VPInstruction::ExplicitVectorLength: |
464 | case VPInstruction::ExtractLastElement: |
465 | case VPInstruction::ExtractPenultimateElement: |
466 | case VPInstruction::FirstActiveLane: |
467 | case VPInstruction::Not: |
468 | return 1; |
469 | case Instruction::ICmp: |
470 | case Instruction::FCmp: |
471 | case Instruction::Store: |
472 | case VPInstruction::ActiveLaneMask: |
473 | case VPInstruction::BranchOnCount: |
474 | case VPInstruction::ComputeReductionResult: |
475 | case VPInstruction::FirstOrderRecurrenceSplice: |
476 | case VPInstruction::LogicalAnd: |
477 | case VPInstruction::PtrAdd: |
478 | case VPInstruction::WideIVStep: |
479 | return 2; |
480 | case Instruction::Select: |
481 | case VPInstruction::ComputeAnyOfResult: |
482 | case VPInstruction::ReductionStartVector: |
483 | return 3; |
484 | case VPInstruction::ComputeFindIVResult: |
485 | return 4; |
486 | case Instruction::Call: |
487 | case Instruction::GetElementPtr: |
488 | case Instruction::PHI: |
489 | case Instruction::Switch: |
490 | // Cannot determine the number of operands from the opcode. |
491 | return -1u; |
492 | } |
493 | llvm_unreachable("all cases should be handled above" ); |
494 | } |
495 | #endif |
496 | |
497 | bool VPInstruction::doesGeneratePerAllLanes() const { |
498 | return Opcode == VPInstruction::PtrAdd && !vputils::onlyFirstLaneUsed(Def: this); |
499 | } |
500 | |
501 | bool VPInstruction::canGenerateScalarForFirstLane() const { |
502 | if (Instruction::isBinaryOp(Opcode: getOpcode()) || Instruction::isCast(Opcode: getOpcode())) |
503 | return true; |
504 | if (isSingleScalar() || isVectorToScalar()) |
505 | return true; |
506 | switch (Opcode) { |
507 | case Instruction::Freeze: |
508 | case Instruction::ICmp: |
509 | case Instruction::PHI: |
510 | case Instruction::Select: |
511 | case VPInstruction::BranchOnCond: |
512 | case VPInstruction::BranchOnCount: |
513 | case VPInstruction::CalculateTripCountMinusVF: |
514 | case VPInstruction::CanonicalIVIncrementForPart: |
515 | case VPInstruction::PtrAdd: |
516 | case VPInstruction::ExplicitVectorLength: |
517 | case VPInstruction::AnyOf: |
518 | return true; |
519 | default: |
520 | return false; |
521 | } |
522 | } |
523 | |
524 | Value *VPInstruction::generatePerLane(VPTransformState &State, |
525 | const VPLane &Lane) { |
526 | IRBuilderBase &Builder = State.Builder; |
527 | |
528 | assert(getOpcode() == VPInstruction::PtrAdd && |
529 | "only PtrAdd opcodes are supported for now" ); |
530 | return Builder.CreatePtrAdd(Ptr: State.get(Def: getOperand(N: 0), Lane), |
531 | Offset: State.get(Def: getOperand(N: 1), Lane), Name); |
532 | } |
533 | |
534 | /// Create a conditional branch using \p Cond branching to the successors of \p |
535 | /// VPBB. Note that the first successor is always forward (i.e. not created yet) |
536 | /// while the second successor may already have been created (if it is a header |
537 | /// block and VPBB is a latch). |
538 | static BranchInst *createCondBranch(Value *Cond, VPBasicBlock *VPBB, |
539 | VPTransformState &State) { |
540 | // Replace the temporary unreachable terminator with a new conditional |
541 | // branch, hooking it up to backward destination (header) for latch blocks |
542 | // now, and to forward destination(s) later when they are created. |
543 | // Second successor may be backwards - iff it is already in VPBB2IRBB. |
544 | VPBasicBlock *SecondVPSucc = cast<VPBasicBlock>(Val: VPBB->getSuccessors()[1]); |
545 | BasicBlock *SecondIRSucc = State.CFG.VPBB2IRBB.lookup(Val: SecondVPSucc); |
546 | BasicBlock *IRBB = State.CFG.VPBB2IRBB[VPBB]; |
547 | BranchInst *CondBr = State.Builder.CreateCondBr(Cond, True: IRBB, False: SecondIRSucc); |
548 | // First successor is always forward, reset it to nullptr |
549 | CondBr->setSuccessor(idx: 0, NewSucc: nullptr); |
550 | IRBB->getTerminator()->eraseFromParent(); |
551 | return CondBr; |
552 | } |
553 | |
554 | Value *VPInstruction::generate(VPTransformState &State) { |
555 | IRBuilderBase &Builder = State.Builder; |
556 | |
557 | if (Instruction::isBinaryOp(Opcode: getOpcode())) { |
558 | bool OnlyFirstLaneUsed = vputils::onlyFirstLaneUsed(Def: this); |
559 | Value *A = State.get(Def: getOperand(N: 0), IsScalar: OnlyFirstLaneUsed); |
560 | Value *B = State.get(Def: getOperand(N: 1), IsScalar: OnlyFirstLaneUsed); |
561 | auto *Res = |
562 | Builder.CreateBinOp(Opc: (Instruction::BinaryOps)getOpcode(), LHS: A, RHS: B, Name); |
563 | if (auto *I = dyn_cast<Instruction>(Val: Res)) |
564 | applyFlags(I&: *I); |
565 | return Res; |
566 | } |
567 | |
568 | switch (getOpcode()) { |
569 | case VPInstruction::Not: { |
570 | Value *A = State.get(Def: getOperand(N: 0)); |
571 | return Builder.CreateNot(V: A, Name); |
572 | } |
573 | case Instruction::ExtractElement: { |
574 | assert(State.VF.isVector() && "Only extract elements from vectors" ); |
575 | if (getOperand(N: 1)->isLiveIn()) { |
576 | unsigned IdxToExtract = |
577 | cast<ConstantInt>(Val: getOperand(N: 1)->getLiveInIRValue())->getZExtValue(); |
578 | return State.get(Def: getOperand(N: 0), Lane: VPLane(IdxToExtract)); |
579 | } |
580 | Value *Vec = State.get(Def: getOperand(N: 0)); |
581 | Value *Idx = State.get(Def: getOperand(N: 1), /*IsScalar=*/true); |
582 | return Builder.CreateExtractElement(Vec, Idx, Name); |
583 | } |
584 | case Instruction::Freeze: { |
585 | Value *Op = State.get(Def: getOperand(N: 0), IsScalar: vputils::onlyFirstLaneUsed(Def: this)); |
586 | return Builder.CreateFreeze(V: Op, Name); |
587 | } |
588 | case Instruction::ICmp: { |
589 | bool OnlyFirstLaneUsed = vputils::onlyFirstLaneUsed(Def: this); |
590 | Value *A = State.get(Def: getOperand(N: 0), IsScalar: OnlyFirstLaneUsed); |
591 | Value *B = State.get(Def: getOperand(N: 1), IsScalar: OnlyFirstLaneUsed); |
592 | return Builder.CreateCmp(Pred: getPredicate(), LHS: A, RHS: B, Name); |
593 | } |
594 | case Instruction::PHI: { |
595 | llvm_unreachable("should be handled by VPPhi::execute" ); |
596 | } |
597 | case Instruction::Select: { |
598 | bool OnlyFirstLaneUsed = vputils::onlyFirstLaneUsed(Def: this); |
599 | Value *Cond = State.get(Def: getOperand(N: 0), IsScalar: OnlyFirstLaneUsed); |
600 | Value *Op1 = State.get(Def: getOperand(N: 1), IsScalar: OnlyFirstLaneUsed); |
601 | Value *Op2 = State.get(Def: getOperand(N: 2), IsScalar: OnlyFirstLaneUsed); |
602 | return Builder.CreateSelect(C: Cond, True: Op1, False: Op2, Name); |
603 | } |
604 | case VPInstruction::ActiveLaneMask: { |
605 | // Get first lane of vector induction variable. |
606 | Value *VIVElem0 = State.get(Def: getOperand(N: 0), Lane: VPLane(0)); |
607 | // Get the original loop tripcount. |
608 | Value *ScalarTC = State.get(Def: getOperand(N: 1), Lane: VPLane(0)); |
609 | |
610 | // If this part of the active lane mask is scalar, generate the CMP directly |
611 | // to avoid unnecessary extracts. |
612 | if (State.VF.isScalar()) |
613 | return Builder.CreateCmp(Pred: CmpInst::Predicate::ICMP_ULT, LHS: VIVElem0, RHS: ScalarTC, |
614 | Name); |
615 | |
616 | auto *Int1Ty = Type::getInt1Ty(C&: Builder.getContext()); |
617 | auto *PredTy = VectorType::get(ElementType: Int1Ty, EC: State.VF); |
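| // For the vector case this emits roughly (illustrative, VF=4 with an i64 |
| // trip count): |
| //   %alm = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i64(i64 %index, i64 %tc) |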
618 | return Builder.CreateIntrinsic(ID: Intrinsic::get_active_lane_mask, |
619 | Types: {PredTy, ScalarTC->getType()}, |
620 | Args: {VIVElem0, ScalarTC}, FMFSource: nullptr, Name); |
621 | } |
622 | case VPInstruction::FirstOrderRecurrenceSplice: { |
623 | // Generate code to combine the previous and current values in vector v3. |
624 | // |
625 | // vector.ph: |
626 | // v_init = vector(..., ..., ..., a[-1]) |
627 | // br vector.body |
628 | // |
629 | // vector.body |
630 | // i = phi [0, vector.ph], [i+4, vector.body] |
631 | // v1 = phi [v_init, vector.ph], [v2, vector.body] |
632 | // v2 = a[i, i+1, i+2, i+3]; |
633 | // v3 = vector(v1(3), v2(0, 1, 2)) |
634 | |
635 | auto *V1 = State.get(Def: getOperand(N: 0)); |
636 | if (!V1->getType()->isVectorTy()) |
637 | return V1; |
638 | Value *V2 = State.get(Def: getOperand(N: 1)); |
639 | return Builder.CreateVectorSplice(V1, V2, Imm: -1, Name); |
640 | } |
641 | case VPInstruction::CalculateTripCountMinusVF: { |
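| // Computes max(TC - VF * UF, 0) without underflow, i.e. |
| //   TC > Step ? TC - Step : 0, with Step = VF * UF. |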
642 | unsigned UF = getParent()->getPlan()->getUF(); |
643 | Value *ScalarTC = State.get(Def: getOperand(N: 0), Lane: VPLane(0)); |
644 | Value *Step = createStepForVF(B&: Builder, Ty: ScalarTC->getType(), VF: State.VF, Step: UF); |
645 | Value *Sub = Builder.CreateSub(LHS: ScalarTC, RHS: Step); |
646 | Value *Cmp = Builder.CreateICmp(P: CmpInst::Predicate::ICMP_UGT, LHS: ScalarTC, RHS: Step); |
647 | Value *Zero = ConstantInt::get(Ty: ScalarTC->getType(), V: 0); |
648 | return Builder.CreateSelect(C: Cmp, True: Sub, False: Zero); |
649 | } |
650 | case VPInstruction::ExplicitVectorLength: { |
651 | // TODO: Restructure this code with an explicit remainder loop, vsetvli can |
652 | // be outside of the main loop. |
653 | Value *AVL = State.get(Def: getOperand(N: 0), /*IsScalar*/ true); |
654 | // Compute EVL |
655 | assert(AVL->getType()->isIntegerTy() && |
656 | "Requested vector length should be an integer." ); |
657 | |
658 | assert(State.VF.isScalable() && "Expected scalable vector factor." ); |
659 | Value *VFArg = State.Builder.getInt32(C: State.VF.getKnownMinValue()); |
660 | |
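| // The emitted call has roughly the form (illustrative, scalable VF of 4 |
| // with an i64 AVL): |
| //   %evl = call i32 @llvm.experimental.get.vector.length.i64(i64 %avl, i32 4, i1 true) |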
661 | Value *EVL = State.Builder.CreateIntrinsic( |
662 | RetTy: State.Builder.getInt32Ty(), ID: Intrinsic::experimental_get_vector_length, |
663 | Args: {AVL, VFArg, State.Builder.getTrue()}); |
664 | return EVL; |
665 | } |
666 | case VPInstruction::CanonicalIVIncrementForPart: { |
667 | unsigned Part = getUnrollPart(U&: *this); |
668 | auto *IV = State.get(Def: getOperand(N: 0), Lane: VPLane(0)); |
669 | assert(Part != 0 && "Must have a positive part" ); |
670 | // The canonical IV is incremented by the vectorization factor (num of |
671 | // SIMD elements) times the unroll part. |
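| // E.g. with VF=4 and unroll part 2 the increment is 8 for fixed VFs, or |
| // 8 * vscale for scalable VFs. |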
672 | Value *Step = createStepForVF(B&: Builder, Ty: IV->getType(), VF: State.VF, Step: Part); |
673 | return Builder.CreateAdd(LHS: IV, RHS: Step, Name, HasNUW: hasNoUnsignedWrap(), |
674 | HasNSW: hasNoSignedWrap()); |
675 | } |
676 | case VPInstruction::BranchOnCond: { |
677 | Value *Cond = State.get(Def: getOperand(N: 0), Lane: VPLane(0)); |
678 | auto *Br = createCondBranch(Cond, VPBB: getParent(), State); |
679 | applyMetadata(I&: *Br); |
680 | return Br; |
681 | } |
682 | case VPInstruction::BranchOnCount: { |
683 | // First create the compare. |
684 | Value *IV = State.get(Def: getOperand(N: 0), /*IsScalar*/ true); |
685 | Value *TC = State.get(Def: getOperand(N: 1), /*IsScalar*/ true); |
686 | Value *Cond = Builder.CreateICmpEQ(LHS: IV, RHS: TC); |
687 | return createCondBranch(Cond, VPBB: getParent(), State); |
688 | } |
689 | case VPInstruction::Broadcast: { |
690 | return Builder.CreateVectorSplat( |
691 | EC: State.VF, V: State.get(Def: getOperand(N: 0), /*IsScalar*/ true), Name: "broadcast" ); |
692 | } |
693 | case VPInstruction::BuildStructVector: { |
694 | // For struct types, we need to build a new 'wide' struct type, where each |
695 | // element is widened, i.e., we create a struct of vectors. |
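| // E.g. for VF=4 a scalar struct { i32, float } becomes |
| // { <4 x i32>, <4 x float> }, with operand L filling lane L of each member |
| // vector. |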
696 | auto *StructTy = |
697 | cast<StructType>(Val: State.TypeAnalysis.inferScalarType(V: getOperand(N: 0))); |
698 | Value *Res = PoisonValue::get(T: toVectorizedTy(Ty: StructTy, EC: State.VF)); |
699 | for (const auto &[LaneIndex, Op] : enumerate(First: operands())) { |
700 | for (unsigned FieldIndex = 0; FieldIndex != StructTy->getNumElements(); |
701 | FieldIndex++) { |
702 | Value *ScalarValue = |
703 | Builder.CreateExtractValue(Agg: State.get(Def: Op, IsScalar: true), Idxs: FieldIndex); |
704 | Value *VectorValue = Builder.CreateExtractValue(Agg: Res, Idxs: FieldIndex); |
705 | VectorValue = |
706 | Builder.CreateInsertElement(Vec: VectorValue, NewElt: ScalarValue, Idx: LaneIndex); |
707 | Res = Builder.CreateInsertValue(Agg: Res, Val: VectorValue, Idxs: FieldIndex); |
708 | } |
709 | } |
710 | return Res; |
711 | } |
712 | case VPInstruction::BuildVector: { |
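| // Packs the scalar operands into a single fixed-width vector, e.g. four i32 |
| // operands become one <4 x i32> built up with insertelement. |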
713 | auto *ScalarTy = State.TypeAnalysis.inferScalarType(V: getOperand(N: 0)); |
714 | auto NumOfElements = ElementCount::getFixed(MinVal: getNumOperands()); |
715 | Value *Res = PoisonValue::get(T: toVectorizedTy(Ty: ScalarTy, EC: NumOfElements)); |
716 | for (const auto &[Idx, Op] : enumerate(First: operands())) |
717 | Res = State.Builder.CreateInsertElement(Vec: Res, NewElt: State.get(Def: Op, IsScalar: true), |
718 | Idx: State.Builder.getInt32(C: Idx)); |
719 | return Res; |
720 | } |
721 | case VPInstruction::ReductionStartVector: { |
722 | if (State.VF.isScalar()) |
723 | return State.get(Def: getOperand(N: 0), IsScalar: true); |
724 | IRBuilderBase::FastMathFlagGuard FMFG(Builder); |
725 | Builder.setFastMathFlags(getFastMathFlags()); |
726 | // If this start vector is scaled then it should produce a vector with fewer |
727 | // elements than the VF. |
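| // E.g. for VF=8 with a scale factor of 2 (operand 2), this splats the |
| // neutral value (operand 1) into a 4-element vector and inserts the start |
| // value (operand 0) into lane 0. |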
728 | ElementCount VF = State.VF.divideCoefficientBy( |
729 | RHS: cast<ConstantInt>(Val: getOperand(N: 2)->getLiveInIRValue())->getZExtValue()); |
730 | auto *Iden = Builder.CreateVectorSplat(EC: VF, V: State.get(Def: getOperand(N: 1), IsScalar: true)); |
731 | Constant *Zero = Builder.getInt32(C: 0); |
732 | return Builder.CreateInsertElement(Vec: Iden, NewElt: State.get(Def: getOperand(N: 0), IsScalar: true), |
733 | Idx: Zero); |
734 | } |
735 | case VPInstruction::ComputeAnyOfResult: { |
736 | // FIXME: The cross-recipe dependency on VPReductionPHIRecipe is temporary |
737 | // and will be removed by breaking up the recipe further. |
738 | auto *PhiR = cast<VPReductionPHIRecipe>(Val: getOperand(N: 0)); |
739 | auto *OrigPhi = cast<PHINode>(Val: PhiR->getUnderlyingValue()); |
740 | Value *ReducedPartRdx = State.get(Def: getOperand(N: 2)); |
741 | for (unsigned Idx = 3; Idx < getNumOperands(); ++Idx) |
742 | ReducedPartRdx = Builder.CreateBinOp( |
743 | Opc: (Instruction::BinaryOps)RecurrenceDescriptor::getOpcode( |
744 | Kind: RecurKind::AnyOf), |
745 | LHS: State.get(Def: getOperand(N: Idx)), RHS: ReducedPartRdx, Name: "bin.rdx" ); |
746 | return createAnyOfReduction(B&: Builder, Src: ReducedPartRdx, |
747 | InitVal: State.get(Def: getOperand(N: 1), Lane: VPLane(0)), OrigPhi); |
748 | } |
749 | case VPInstruction::ComputeFindIVResult: { |
750 | // FIXME: The cross-recipe dependency on VPReductionPHIRecipe is temporary |
751 | // and will be removed by breaking up the recipe further. |
752 | auto *PhiR = cast<VPReductionPHIRecipe>(Val: getOperand(N: 0)); |
753 | // Get its reduction variable descriptor. |
754 | const RecurrenceDescriptor &RdxDesc = PhiR->getRecurrenceDescriptor(); |
755 | RecurKind RK = RdxDesc.getRecurrenceKind(); |
756 | assert(RecurrenceDescriptor::isFindIVRecurrenceKind(RK) && |
757 | "Unexpected reduction kind" ); |
758 | assert(!PhiR->isInLoop() && |
759 | "In-loop FindLastIV reduction is not supported yet" ); |
760 | |
761 | // The recipe's operands are the reduction phi, the start value, the |
762 | // sentinel value, followed by one operand for each part of the reduction. |
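| // E.g. with UF=2 the operands are (phi, start, sentinel, part0, part1). |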
763 | unsigned UF = getNumOperands() - 3; |
764 | Value *ReducedPartRdx = State.get(Def: getOperand(N: 3)); |
765 | RecurKind MinMaxKind; |
766 | bool IsSigned = RecurrenceDescriptor::isSignedRecurrenceKind(Kind: RK); |
767 | if (RecurrenceDescriptor::isFindLastIVRecurrenceKind(Kind: RK)) { |
768 | MinMaxKind = IsSigned ? RecurKind::SMax : RecurKind::UMax; |
769 | } else { |
770 | assert(RecurrenceDescriptor::isFindFirstIVRecurrenceKind(RK) && |
771 | "Kind must either be FindLastIV or FindFirstIV" ); |
772 | assert(IsSigned && "Only FindFirstIV with SMax is currently supported" ); |
773 | MinMaxKind = RecurKind::SMin; |
774 | } |
775 | for (unsigned Part = 1; Part < UF; ++Part) |
776 | ReducedPartRdx = createMinMaxOp(Builder, RK: MinMaxKind, Left: ReducedPartRdx, |
777 | Right: State.get(Def: getOperand(N: 3 + Part))); |
778 | |
779 | Value *Start = State.get(Def: getOperand(N: 1), IsScalar: true); |
780 | Value *Sentinel = getOperand(N: 2)->getLiveInIRValue(); |
781 | return createFindLastIVReduction(B&: Builder, Src: ReducedPartRdx, RdxKind: RK, Start, |
782 | Sentinel); |
783 | } |
784 | case VPInstruction::ComputeReductionResult: { |
785 | // FIXME: The cross-recipe dependency on VPReductionPHIRecipe is temporary |
786 | // and will be removed by breaking up the recipe further. |
787 | auto *PhiR = cast<VPReductionPHIRecipe>(Val: getOperand(N: 0)); |
788 | // Get its reduction variable descriptor. |
789 | const RecurrenceDescriptor &RdxDesc = PhiR->getRecurrenceDescriptor(); |
790 | |
791 | RecurKind RK = RdxDesc.getRecurrenceKind(); |
792 | assert(!RecurrenceDescriptor::isFindIVRecurrenceKind(RK) && |
793 | "should be handled by ComputeFindIVResult" ); |
794 | |
795 | // The recipe's operands are the reduction phi, followed by one operand for |
796 | // each part of the reduction. |
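| // E.g. with UF=2 the operands are (phi, part0, part1). |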
797 | unsigned UF = getNumOperands() - 1; |
798 | VectorParts RdxParts(UF); |
799 | for (unsigned Part = 0; Part < UF; ++Part) |
800 | RdxParts[Part] = State.get(Def: getOperand(N: 1 + Part), IsScalar: PhiR->isInLoop()); |
801 | |
802 | IRBuilderBase::FastMathFlagGuard FMFG(Builder); |
803 | if (hasFastMathFlags()) |
804 | Builder.setFastMathFlags(getFastMathFlags()); |
805 | |
806 | // Reduce all of the unrolled parts into a single vector. |
807 | Value *ReducedPartRdx = RdxParts[0]; |
808 | if (PhiR->isOrdered()) { |
809 | ReducedPartRdx = RdxParts[UF - 1]; |
810 | } else { |
811 | // Floating-point operations should have some FMF to enable the reduction. |
812 | for (unsigned Part = 1; Part < UF; ++Part) { |
813 | Value *RdxPart = RdxParts[Part]; |
814 | if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind: RK)) |
815 | ReducedPartRdx = createMinMaxOp(Builder, RK, Left: ReducedPartRdx, Right: RdxPart); |
816 | else |
817 | ReducedPartRdx = |
818 | Builder.CreateBinOp(Opc: (Instruction::BinaryOps)RdxDesc.getOpcode(), |
819 | LHS: RdxPart, RHS: ReducedPartRdx, Name: "bin.rdx" ); |
820 | } |
821 | } |
822 | |
823 | // Create the reduction after the loop. Note that inloop reductions create |
824 | // the target reduction in the loop using a Reduction recipe. |
825 | if (State.VF.isVector() && !PhiR->isInLoop()) { |
826 | // TODO: Support in-order reductions based on the recurrence descriptor. |
827 | // All ops in the reduction inherit fast-math-flags from the recurrence |
828 | // descriptor. |
829 | ReducedPartRdx = createSimpleReduction(B&: Builder, Src: ReducedPartRdx, RdxKind: RK); |
830 | } |
831 | |
832 | return ReducedPartRdx; |
833 | } |
834 | case VPInstruction::ExtractLastElement: |
835 | case VPInstruction::ExtractPenultimateElement: { |
836 | unsigned Offset = getOpcode() == VPInstruction::ExtractLastElement ? 1 : 2; |
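| // E.g. for VF=4, ExtractLastElement reads lane 3 and |
| // ExtractPenultimateElement reads lane 2 of the operand. |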
837 | Value *Res; |
838 | if (State.VF.isVector()) { |
839 | assert(Offset <= State.VF.getKnownMinValue() && |
840 | "invalid offset to extract from" ); |
841 | // Extract lane VF - Offset from the operand. |
842 | Res = State.get(Def: getOperand(N: 0), Lane: VPLane::getLaneFromEnd(VF: State.VF, Offset)); |
843 | } else { |
844 | assert(Offset <= 1 && "invalid offset to extract from" ); |
845 | Res = State.get(Def: getOperand(N: 0)); |
846 | } |
847 | if (isa<ExtractElementInst>(Val: Res)) |
848 | Res->setName(Name); |
849 | return Res; |
850 | } |
851 | case VPInstruction::LogicalAnd: { |
852 | Value *A = State.get(Def: getOperand(N: 0)); |
853 | Value *B = State.get(Def: getOperand(N: 1)); |
854 | return Builder.CreateLogicalAnd(Cond1: A, Cond2: B, Name); |
855 | } |
856 | case VPInstruction::PtrAdd: { |
857 | assert(vputils::onlyFirstLaneUsed(this) && |
858 | "can only generate first lane for PtrAdd" ); |
859 | Value *Ptr = State.get(Def: getOperand(N: 0), Lane: VPLane(0)); |
860 | Value *Addend = State.get(Def: getOperand(N: 1), Lane: VPLane(0)); |
861 | return Builder.CreatePtrAdd(Ptr, Offset: Addend, Name, NW: getGEPNoWrapFlags()); |
862 | } |
863 | case VPInstruction::AnyOf: { |
864 | Value *Res = State.get(Def: getOperand(N: 0)); |
865 | for (VPValue *Op : drop_begin(RangeOrContainer: operands())) |
866 | Res = Builder.CreateOr(LHS: Res, RHS: State.get(Def: Op)); |
867 | return Builder.CreateOrReduce(Src: Res); |
868 | } |
869 | case VPInstruction::FirstActiveLane: { |
870 | if (getNumOperands() == 1) { |
871 | Value *Mask = State.get(Def: getOperand(N: 0)); |
872 | return Builder.CreateCountTrailingZeroElems(ResTy: Builder.getInt64Ty(), Mask, |
873 | ZeroIsPoison: true, Name); |
874 | } |
875 | // If there are multiple operands, create a chain of selects to pick the |
876 | // first operand with an active lane and add the number of lanes of the |
877 | // preceding operands. |
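| // E.g. for two mask operands m0, m1 and VF=4 this computes |
| //   cttz(m0) != 4 ? cttz(m0) : 4 + cttz(m1) |
| // using llvm.experimental.cttz.elts for each mask. |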
878 | Value *RuntimeVF = |
879 | getRuntimeVF(B&: State.Builder, Ty: State.Builder.getInt64Ty(), VF: State.VF); |
880 | unsigned LastOpIdx = getNumOperands() - 1; |
881 | Value *Res = nullptr; |
882 | for (int Idx = LastOpIdx; Idx >= 0; --Idx) { |
883 | Value *TrailingZeros = Builder.CreateCountTrailingZeroElems( |
884 | ResTy: Builder.getInt64Ty(), Mask: State.get(Def: getOperand(N: Idx)), ZeroIsPoison: true, Name); |
885 | Value *Current = Builder.CreateAdd( |
886 | LHS: Builder.CreateMul(LHS: RuntimeVF, RHS: Builder.getInt64(C: Idx)), RHS: TrailingZeros); |
887 | if (Res) { |
888 | Value *Cmp = Builder.CreateICmpNE(LHS: TrailingZeros, RHS: RuntimeVF); |
889 | Res = Builder.CreateSelect(C: Cmp, True: Current, False: Res); |
890 | } else { |
891 | Res = Current; |
892 | } |
893 | } |
894 | |
895 | return Res; |
896 | } |
897 | default: |
898 | llvm_unreachable("Unsupported opcode for instruction" ); |
899 | } |
900 | } |
901 | |
902 | InstructionCost VPInstruction::computeCost(ElementCount VF, |
903 | VPCostContext &Ctx) const { |
904 | if (Instruction::isBinaryOp(Opcode: getOpcode())) { |
905 | Type *ResTy = Ctx.Types.inferScalarType(V: this); |
906 | if (!vputils::onlyFirstLaneUsed(Def: this)) |
907 | ResTy = toVectorTy(Scalar: ResTy, EC: VF); |
908 | |
909 | if (!getUnderlyingValue()) { |
910 | switch (getOpcode()) { |
911 | case Instruction::FMul: |
912 | return Ctx.TTI.getArithmeticInstrCost(Opcode: getOpcode(), Ty: ResTy, CostKind: Ctx.CostKind); |
913 | default: |
914 | // TODO: Compute cost for VPInstructions without underlying values once |
915 | // the legacy cost model has been retired. |
916 | return 0; |
917 | } |
918 | } |
919 | |
920 | assert(!doesGeneratePerAllLanes() && |
921 | "Should only generate a vector value or single scalar, not scalars " |
922 | "for all lanes." ); |
923 | return Ctx.TTI.getArithmeticInstrCost(Opcode: getOpcode(), Ty: ResTy, CostKind: Ctx.CostKind); |
924 | } |
925 | |
926 | switch (getOpcode()) { |
927 | case Instruction::ExtractElement: { |
928 | // Add on the cost of extracting the element. |
929 | auto *VecTy = toVectorTy(Scalar: Ctx.Types.inferScalarType(V: getOperand(N: 0)), EC: VF); |
930 | return Ctx.TTI.getVectorInstrCost(Opcode: Instruction::ExtractElement, Val: VecTy, |
931 | CostKind: Ctx.CostKind); |
932 | } |
933 | case VPInstruction::AnyOf: { |
934 | auto *VecTy = toVectorTy(Scalar: Ctx.Types.inferScalarType(V: this), EC: VF); |
935 | return Ctx.TTI.getArithmeticReductionCost( |
936 | Opcode: Instruction::Or, Ty: cast<VectorType>(Val: VecTy), FMF: std::nullopt, CostKind: Ctx.CostKind); |
937 | } |
938 | case VPInstruction::FirstActiveLane: { |
939 | // Calculate the cost of determining the lane index. |
940 | auto *PredTy = toVectorTy(Scalar: Ctx.Types.inferScalarType(V: getOperand(N: 0)), EC: VF); |
941 | IntrinsicCostAttributes Attrs(Intrinsic::experimental_cttz_elts, |
942 | Type::getInt64Ty(C&: Ctx.LLVMCtx), |
943 | {PredTy, Type::getInt1Ty(C&: Ctx.LLVMCtx)}); |
944 | return Ctx.TTI.getIntrinsicInstrCost(ICA: Attrs, CostKind: Ctx.CostKind); |
945 | } |
946 | case VPInstruction::FirstOrderRecurrenceSplice: { |
947 | assert(VF.isVector() && "Scalar FirstOrderRecurrenceSplice?" ); |
948 | SmallVector<int> Mask(VF.getKnownMinValue()); |
949 | std::iota(first: Mask.begin(), last: Mask.end(), value: VF.getKnownMinValue() - 1); |
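| // E.g. for VF=4 the splice mask is <3, 4, 5, 6>: the last lane of the |
| // previous value followed by the first three lanes of the current one. |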
950 | Type *VectorTy = toVectorTy(Scalar: Ctx.Types.inferScalarType(V: this), EC: VF); |
951 | |
952 | return Ctx.TTI.getShuffleCost(Kind: TargetTransformInfo::SK_Splice, |
953 | DstTy: cast<VectorType>(Val: VectorTy), |
954 | SrcTy: cast<VectorType>(Val: VectorTy), Mask, |
955 | CostKind: Ctx.CostKind, Index: VF.getKnownMinValue() - 1); |
956 | } |
957 | case VPInstruction::ActiveLaneMask: { |
958 | Type *ArgTy = Ctx.Types.inferScalarType(V: getOperand(N: 0)); |
959 | Type *RetTy = toVectorTy(Scalar: Type::getInt1Ty(C&: Ctx.LLVMCtx), EC: VF); |
960 | IntrinsicCostAttributes Attrs(Intrinsic::get_active_lane_mask, RetTy, |
961 | {ArgTy, ArgTy}); |
962 | return Ctx.TTI.getIntrinsicInstrCost(ICA: Attrs, CostKind: Ctx.CostKind); |
963 | } |
964 | case VPInstruction::ExplicitVectorLength: { |
965 | Type *Arg0Ty = Ctx.Types.inferScalarType(V: getOperand(N: 0)); |
966 | Type *I32Ty = Type::getInt32Ty(C&: Ctx.LLVMCtx); |
967 | Type *I1Ty = Type::getInt1Ty(C&: Ctx.LLVMCtx); |
968 | IntrinsicCostAttributes Attrs(Intrinsic::experimental_get_vector_length, |
969 | I32Ty, {Arg0Ty, I32Ty, I1Ty}); |
970 | return Ctx.TTI.getIntrinsicInstrCost(ICA: Attrs, CostKind: Ctx.CostKind); |
971 | } |
972 | case VPInstruction::ExtractPenultimateElement: |
973 | if (VF == ElementCount::getScalable(MinVal: 1)) |
974 | return InstructionCost::getInvalid(); |
975 | LLVM_FALLTHROUGH; |
976 | default: |
977 | // TODO: Compute cost for other VPInstructions once the legacy cost model |
978 | // has been retired. |
979 | assert(!getUnderlyingValue() && |
980 | "unexpected VPInstruction with underlying value"); |
981 | return 0; |
982 | } |
983 | } |
984 | |
985 | bool VPInstruction::isVectorToScalar() const { |
986 | return getOpcode() == VPInstruction::ExtractLastElement || |
987 | getOpcode() == VPInstruction::ExtractPenultimateElement || |
988 | getOpcode() == Instruction::ExtractElement || |
989 | getOpcode() == VPInstruction::FirstActiveLane || |
990 | getOpcode() == VPInstruction::ComputeAnyOfResult || |
991 | getOpcode() == VPInstruction::ComputeFindIVResult || |
992 | getOpcode() == VPInstruction::ComputeReductionResult || |
993 | getOpcode() == VPInstruction::AnyOf; |
994 | } |
995 | |
996 | bool VPInstruction::isSingleScalar() const { |
997 | return getOpcode() == Instruction::PHI || isScalarCast(); |
998 | } |
999 | |
1000 | void VPInstruction::execute(VPTransformState &State) { |
1001 | assert(!State.Lane && "VPInstruction executing a Lane"); |
1002 | IRBuilderBase::FastMathFlagGuard FMFGuard(State.Builder); |
1003 | assert(flagsValidForOpcode(getOpcode()) && |
1004 | "Set flags not supported for the provided opcode" ); |
1005 | if (hasFastMathFlags()) |
1006 | State.Builder.setFastMathFlags(getFastMathFlags()); |
1007 | bool GeneratesPerFirstLaneOnly = canGenerateScalarForFirstLane() && |
1008 | (vputils::onlyFirstLaneUsed(Def: this) || |
1009 | isVectorToScalar() || isSingleScalar()); |
1010 | bool GeneratesPerAllLanes = doesGeneratePerAllLanes(); |
1011 | if (GeneratesPerAllLanes) { |
1012 | for (unsigned Lane = 0, NumLanes = State.VF.getFixedValue(); |
1013 | Lane != NumLanes; ++Lane) { |
1014 | Value *GeneratedValue = generatePerLane(State, Lane: VPLane(Lane)); |
1015 | assert(GeneratedValue && "generatePerLane must produce a value" ); |
1016 | State.set(Def: this, V: GeneratedValue, Lane: VPLane(Lane)); |
1017 | } |
1018 | return; |
1019 | } |
1020 | |
1021 | Value *GeneratedValue = generate(State); |
1022 | if (!hasResult()) |
1023 | return; |
1024 | assert(GeneratedValue && "generate must produce a value" ); |
1025 | assert((((GeneratedValue->getType()->isVectorTy() || |
1026 | GeneratedValue->getType()->isStructTy()) == |
1027 | !GeneratesPerFirstLaneOnly) || |
1028 | State.VF.isScalar()) && |
1029 | "scalar value but not only first lane defined" ); |
1030 | State.set(Def: this, V: GeneratedValue, |
1031 | /*IsScalar*/ GeneratesPerFirstLaneOnly); |
1032 | } |
1033 | |
1034 | bool VPInstruction::opcodeMayReadOrWriteFromMemory() const { |
1035 | if (Instruction::isBinaryOp(Opcode: getOpcode()) || Instruction::isCast(Opcode: getOpcode())) |
1036 | return false; |
1037 | switch (getOpcode()) { |
1038 | case Instruction::ExtractElement: |
1039 | case Instruction::Freeze: |
1040 | case Instruction::ICmp: |
1041 | case Instruction::Select: |
1042 | case VPInstruction::AnyOf: |
1043 | case VPInstruction::BuildStructVector: |
1044 | case VPInstruction::BuildVector: |
1045 | case VPInstruction::CalculateTripCountMinusVF: |
1046 | case VPInstruction::CanonicalIVIncrementForPart: |
1047 | case VPInstruction::ExtractLastElement: |
1048 | case VPInstruction::ExtractPenultimateElement: |
1049 | case VPInstruction::FirstActiveLane: |
1050 | case VPInstruction::FirstOrderRecurrenceSplice: |
1051 | case VPInstruction::LogicalAnd: |
1052 | case VPInstruction::Not: |
1053 | case VPInstruction::PtrAdd: |
1054 | case VPInstruction::WideIVStep: |
1055 | case VPInstruction::StepVector: |
1056 | case VPInstruction::ReductionStartVector: |
1057 | return false; |
1058 | default: |
1059 | return true; |
1060 | } |
1061 | } |
1062 | |
1063 | bool VPInstruction::onlyFirstLaneUsed(const VPValue *Op) const { |
1064 | assert(is_contained(operands(), Op) && "Op must be an operand of the recipe" ); |
1065 | if (Instruction::isBinaryOp(Opcode: getOpcode()) || Instruction::isCast(Opcode: getOpcode())) |
1066 | return vputils::onlyFirstLaneUsed(Def: this); |
1067 | |
1068 | switch (getOpcode()) { |
1069 | default: |
1070 | return false; |
1071 | case Instruction::ExtractElement: |
1072 | return Op == getOperand(N: 1); |
1073 | case Instruction::PHI: |
1074 | return true; |
1075 | case Instruction::ICmp: |
1076 | case Instruction::Select: |
1077 | case Instruction::Or: |
1078 | case Instruction::Freeze: |
1079 | // TODO: Cover additional opcodes. |
1080 | return vputils::onlyFirstLaneUsed(Def: this); |
1081 | case VPInstruction::ActiveLaneMask: |
1082 | case VPInstruction::ExplicitVectorLength: |
1083 | case VPInstruction::CalculateTripCountMinusVF: |
1084 | case VPInstruction::CanonicalIVIncrementForPart: |
1085 | case VPInstruction::BranchOnCount: |
1086 | case VPInstruction::BranchOnCond: |
1087 | case VPInstruction::Broadcast: |
1088 | case VPInstruction::ReductionStartVector: |
1089 | return true; |
1090 | case VPInstruction::PtrAdd: |
1091 | return Op == getOperand(N: 0) || vputils::onlyFirstLaneUsed(Def: this); |
1092 | case VPInstruction::ComputeAnyOfResult: |
1093 | case VPInstruction::ComputeFindIVResult: |
1094 | return Op == getOperand(N: 1); |
1095 | }; |
1096 | llvm_unreachable("switch should return" ); |
1097 | } |
1098 | |
1099 | bool VPInstruction::onlyFirstPartUsed(const VPValue *Op) const { |
1100 | assert(is_contained(operands(), Op) && "Op must be an operand of the recipe" ); |
1101 | if (Instruction::isBinaryOp(Opcode: getOpcode())) |
1102 | return vputils::onlyFirstPartUsed(Def: this); |
1103 | |
1104 | switch (getOpcode()) { |
1105 | default: |
1106 | return false; |
1107 | case Instruction::ICmp: |
1108 | case Instruction::Select: |
1109 | return vputils::onlyFirstPartUsed(Def: this); |
1110 | case VPInstruction::BranchOnCount: |
1111 | case VPInstruction::BranchOnCond: |
1112 | case VPInstruction::CanonicalIVIncrementForPart: |
1113 | return true; |
1114 | }; |
1115 | llvm_unreachable("switch should return" ); |
1116 | } |
1117 | |
1118 | #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) |
1119 | void VPInstruction::dump() const { |
1120 | VPSlotTracker SlotTracker(getParent()->getPlan()); |
1121 | print(dbgs(), "" , SlotTracker); |
1122 | } |
1123 | |
1124 | void VPInstruction::print(raw_ostream &O, const Twine &Indent, |
1125 | VPSlotTracker &SlotTracker) const { |
1126 | O << Indent << "EMIT" << (isSingleScalar() ? "-SCALAR" : "" ) << " " ; |
1127 | |
1128 | if (hasResult()) { |
1129 | printAsOperand(O, SlotTracker); |
1130 | O << " = " ; |
1131 | } |
1132 | |
1133 | switch (getOpcode()) { |
1134 | case VPInstruction::Not: |
1135 | O << "not" ; |
1136 | break; |
1137 | case VPInstruction::SLPLoad: |
1138 | O << "combined load" ; |
1139 | break; |
1140 | case VPInstruction::SLPStore: |
1141 | O << "combined store" ; |
1142 | break; |
1143 | case VPInstruction::ActiveLaneMask: |
1144 | O << "active lane mask" ; |
1145 | break; |
1146 | case VPInstruction::ExplicitVectorLength: |
1147 | O << "EXPLICIT-VECTOR-LENGTH" ; |
1148 | break; |
1149 | case VPInstruction::FirstOrderRecurrenceSplice: |
1150 | O << "first-order splice" ; |
1151 | break; |
1152 | case VPInstruction::BranchOnCond: |
1153 | O << "branch-on-cond" ; |
1154 | break; |
1155 | case VPInstruction::CalculateTripCountMinusVF: |
1156 | O << "TC > VF ? TC - VF : 0" ; |
1157 | break; |
1158 | case VPInstruction::CanonicalIVIncrementForPart: |
1159 | O << "VF * Part +" ; |
1160 | break; |
1161 | case VPInstruction::BranchOnCount: |
1162 | O << "branch-on-count" ; |
1163 | break; |
1164 | case VPInstruction::Broadcast: |
1165 | O << "broadcast" ; |
1166 | break; |
1167 | case VPInstruction::BuildStructVector: |
1168 | O << "buildstructvector" ; |
1169 | break; |
1170 | case VPInstruction::BuildVector: |
1171 | O << "buildvector" ; |
1172 | break; |
1173 | case VPInstruction::ExtractLastElement: |
1174 | O << "extract-last-element" ; |
1175 | break; |
1176 | case VPInstruction::ExtractPenultimateElement: |
1177 | O << "extract-penultimate-element" ; |
1178 | break; |
1179 | case VPInstruction::ComputeAnyOfResult: |
1180 | O << "compute-anyof-result" ; |
1181 | break; |
1182 | case VPInstruction::ComputeFindIVResult: |
1183 | O << "compute-find-iv-result" ; |
1184 | break; |
1185 | case VPInstruction::ComputeReductionResult: |
1186 | O << "compute-reduction-result" ; |
1187 | break; |
1188 | case VPInstruction::LogicalAnd: |
1189 | O << "logical-and" ; |
1190 | break; |
1191 | case VPInstruction::PtrAdd: |
1192 | O << "ptradd" ; |
1193 | break; |
1194 | case VPInstruction::AnyOf: |
1195 | O << "any-of" ; |
1196 | break; |
1197 | case VPInstruction::FirstActiveLane: |
1198 | O << "first-active-lane" ; |
1199 | break; |
1200 | case VPInstruction::ReductionStartVector: |
1201 | O << "reduction-start-vector" ; |
1202 | break; |
1203 | default: |
1204 | O << Instruction::getOpcodeName(getOpcode()); |
1205 | } |
1206 | |
1207 | printFlags(O); |
1208 | printOperands(O, SlotTracker); |
1209 | |
1210 | if (auto DL = getDebugLoc()) { |
1211 | O << ", !dbg " ; |
1212 | DL.print(O); |
1213 | } |
1214 | } |
1215 | #endif |
1216 | |
1217 | void VPInstructionWithType::execute(VPTransformState &State) { |
1218 | State.setDebugLocFrom(getDebugLoc()); |
1219 | if (isScalarCast()) { |
1220 | Value *Op = State.get(Def: getOperand(N: 0), Lane: VPLane(0)); |
1221 | Value *Cast = State.Builder.CreateCast(Op: Instruction::CastOps(getOpcode()), |
1222 | V: Op, DestTy: ResultTy); |
1223 | State.set(Def: this, V: Cast, Lane: VPLane(0)); |
1224 | return; |
1225 | } |
1226 | switch (getOpcode()) { |
1227 | case VPInstruction::StepVector: { |
1228 | Value *StepVector = |
1229 | State.Builder.CreateStepVector(DstType: VectorType::get(ElementType: ResultTy, EC: State.VF)); |
1230 | State.set(Def: this, V: StepVector); |
1231 | break; |
1232 | } |
1233 | default: |
1234 | llvm_unreachable("opcode not implemented yet" ); |
1235 | } |
1236 | } |
1237 | |
1238 | #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) |
1239 | void VPInstructionWithType::print(raw_ostream &O, const Twine &Indent, |
1240 | VPSlotTracker &SlotTracker) const { |
1241 | O << Indent << "EMIT" << (isSingleScalar() ? "-SCALAR" : "" ) << " " ; |
1242 | printAsOperand(O, SlotTracker); |
1243 | O << " = " ; |
1244 | |
1245 | switch (getOpcode()) { |
1246 | case VPInstruction::WideIVStep: |
1247 | O << "wide-iv-step " ; |
1248 | printOperands(O, SlotTracker); |
1249 | break; |
1250 | case VPInstruction::StepVector: |
1251 | O << "step-vector " << *ResultTy; |
1252 | break; |
1253 | default: |
1254 | assert(Instruction::isCast(getOpcode()) && "unhandled opcode" ); |
1255 | O << Instruction::getOpcodeName(getOpcode()) << " " ; |
1256 | printOperands(O, SlotTracker); |
1257 | O << " to " << *ResultTy; |
1258 | } |
1259 | } |
1260 | #endif |
1261 | |
1262 | void VPPhi::execute(VPTransformState &State) { |
1263 | State.setDebugLocFrom(getDebugLoc()); |
1264 | PHINode *NewPhi = State.Builder.CreatePHI( |
1265 | Ty: State.TypeAnalysis.inferScalarType(V: this), NumReservedValues: 2, Name: getName()); |
1266 | unsigned NumIncoming = getNumIncoming(); |
1267 | if (getParent() != getParent()->getPlan()->getScalarPreheader()) { |
1268 | // TODO: Fixup all incoming values of header phis once recipes defining them |
1269 | // are introduced. |
1270 | NumIncoming = 1; |
1271 | } |
1272 | for (unsigned Idx = 0; Idx != NumIncoming; ++Idx) { |
1273 | Value *IncV = State.get(Def: getIncomingValue(Idx), Lane: VPLane(0)); |
1274 | BasicBlock *PredBB = State.CFG.VPBB2IRBB.at(Val: getIncomingBlock(Idx)); |
1275 | NewPhi->addIncoming(V: IncV, BB: PredBB); |
1276 | } |
1277 | State.set(Def: this, V: NewPhi, Lane: VPLane(0)); |
1278 | } |
1279 | |
1280 | #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) |
1281 | void VPPhi::print(raw_ostream &O, const Twine &Indent, |
1282 | VPSlotTracker &SlotTracker) const { |
1283 | O << Indent << "EMIT" << (isSingleScalar() ? "-SCALAR" : "" ) << " " ; |
1284 | printAsOperand(O, SlotTracker); |
1285 | O << " = phi " ; |
1286 | printPhiOperands(O, SlotTracker); |
1287 | } |
1288 | #endif |
1289 | |
1290 | VPIRInstruction *VPIRInstruction::create(Instruction &I) { |
1291 | if (auto *Phi = dyn_cast<PHINode>(Val: &I)) |
1292 | return new VPIRPhi(*Phi); |
1293 | return new VPIRInstruction(I); |
1294 | } |
1295 | |
1296 | void VPIRInstruction::execute(VPTransformState &State) { |
1297 | assert(!isa<VPIRPhi>(this) && getNumOperands() == 0 && |
1298 | "PHINodes must be handled by VPIRPhi" ); |
1299 | // Advance the insert point after the wrapped IR instruction. This allows |
1300 | // interleaving VPIRInstructions and other recipes. |
1301 | State.Builder.SetInsertPoint(TheBB: I.getParent(), IP: std::next(x: I.getIterator())); |
1302 | } |
1303 | |
1304 | InstructionCost VPIRInstruction::computeCost(ElementCount VF, |
1305 | VPCostContext &Ctx) const { |
1306 | // The recipe wraps an existing IR instruction on the border of VPlan's scope, |
1307 | // hence it does not contribute to the cost-modeling for the VPlan. |
1308 | return 0; |
1309 | } |
1310 | |
1311 | void VPIRInstruction::extractLastLaneOfFirstOperand(VPBuilder &Builder) { |
1312 | assert(isa<PHINode>(getInstruction()) && |
1313 | "can only update exiting operands to phi nodes" ); |
1314 | assert(getNumOperands() > 0 && "must have at least one operand" ); |
1315 | VPValue *Exiting = getOperand(N: 0); |
1316 | if (Exiting->isLiveIn()) |
1317 | return; |
1318 | |
1319 | Exiting = Builder.createNaryOp(Opcode: VPInstruction::ExtractLastElement, Operands: {Exiting}); |
1320 | setOperand(I: 0, New: Exiting); |
1321 | } |
1322 | |
1323 | #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) |
1324 | void VPIRInstruction::print(raw_ostream &O, const Twine &Indent, |
1325 | VPSlotTracker &SlotTracker) const { |
1326 | O << Indent << "IR " << I; |
1327 | } |
1328 | #endif |
1329 | |
1330 | void VPIRPhi::execute(VPTransformState &State) { |
1331 | PHINode *Phi = &getIRPhi(); |
1332 | for (const auto &[Idx, Op] : enumerate(First: operands())) { |
1333 | VPValue *ExitValue = Op; |
1334 | auto Lane = vputils::isSingleScalar(VPV: ExitValue) |
1335 | ? VPLane::getFirstLane() |
1336 | : VPLane::getLastLaneForVF(VF: State.VF); |
1337 | VPBlockBase *Pred = getParent()->getPredecessors()[Idx]; |
1338 | auto *PredVPBB = Pred->getExitingBasicBlock(); |
1339 | BasicBlock *PredBB = State.CFG.VPBB2IRBB[PredVPBB]; |
1340 | // Set insertion point in PredBB in case an extract needs to be generated. |
1341 | // TODO: Model extracts explicitly. |
1342 | State.Builder.SetInsertPoint(TheBB: PredBB, IP: PredBB->getFirstNonPHIIt()); |
1343 | Value *V = State.get(Def: ExitValue, Lane: VPLane(Lane)); |
1344 | // If the phi has no incoming value for PredBB yet, add a new incoming
1345 | // value. Otherwise update the existing incoming value for PredBB.
1346 | if (Phi->getBasicBlockIndex(BB: PredBB) == -1) |
1347 | Phi->addIncoming(V, BB: PredBB); |
1348 | else |
1349 | Phi->setIncomingValueForBlock(BB: PredBB, V); |
1350 | } |
1351 | |
1352 | // Advance the insert point after the wrapped IR instruction. This allows |
1353 | // interleaving VPIRInstructions and other recipes. |
1354 | State.Builder.SetInsertPoint(TheBB: Phi->getParent(), IP: std::next(x: Phi->getIterator())); |
1355 | } |
1356 | |
1357 | void VPPhiAccessors::removeIncomingValueFor(VPBlockBase *IncomingBlock) const { |
1358 | VPRecipeBase *R = const_cast<VPRecipeBase *>(getAsRecipe()); |
1359 | assert(R->getNumOperands() == R->getParent()->getNumPredecessors() && |
1360 | "Number of phi operands must match number of predecessors" ); |
1361 | unsigned Position = R->getParent()->getIndexForPredecessor(Pred: IncomingBlock); |
1362 | R->removeOperand(Idx: Position); |
1363 | } |
1364 | |
1365 | #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) |
1366 | void VPPhiAccessors::printPhiOperands(raw_ostream &O, |
1367 | VPSlotTracker &SlotTracker) const { |
1368 | interleaveComma(enumerate(getAsRecipe()->operands()), O, |
1369 | [this, &O, &SlotTracker](auto Op) { |
1370 | O << "[ " ; |
1371 | Op.value()->printAsOperand(O, SlotTracker); |
1372 | O << ", " ; |
1373 | getIncomingBlock(Op.index())->printAsOperand(O); |
1374 | O << " ]" ; |
1375 | }); |
1376 | } |
1377 | #endif |
1378 | |
1379 | #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) |
1380 | void VPIRPhi::print(raw_ostream &O, const Twine &Indent, |
1381 | VPSlotTracker &SlotTracker) const { |
1382 | VPIRInstruction::print(O, Indent, SlotTracker); |
1383 | |
1384 | if (getNumOperands() != 0) { |
1385 | O << " (extra operand" << (getNumOperands() > 1 ? "s" : "" ) << ": " ; |
1386 | interleaveComma( |
1387 | enumerate(operands()), O, [this, &O, &SlotTracker](auto Op) { |
1388 | Op.value()->printAsOperand(O, SlotTracker); |
1389 | O << " from " ; |
1390 | getParent()->getPredecessors()[Op.index()]->printAsOperand(O); |
1391 | }); |
1392 | O << ")" ; |
1393 | } |
1394 | } |
1395 | #endif |
1396 | |
1397 | VPIRMetadata::VPIRMetadata(Instruction &I, LoopVersioning *LVer) |
1398 | : VPIRMetadata(I) { |
1399 | if (!LVer || !isa<LoadInst, StoreInst>(Val: &I)) |
1400 | return; |
1401 | const auto &[AliasScopeMD, NoAliasMD] = LVer->getNoAliasMetadataFor(OrigInst: &I); |
1402 | if (AliasScopeMD) |
1403 | Metadata.emplace_back(Args: LLVMContext::MD_alias_scope, Args: AliasScopeMD); |
1404 | if (NoAliasMD) |
1405 | Metadata.emplace_back(Args: LLVMContext::MD_noalias, Args: NoAliasMD); |
1406 | } |
1407 | |
1408 | void VPIRMetadata::applyMetadata(Instruction &I) const { |
1409 | for (const auto &[Kind, Node] : Metadata) |
1410 | I.setMetadata(KindID: Kind, Node); |
1411 | } |
1412 | |
1413 | void VPWidenCallRecipe::execute(VPTransformState &State) { |
1414 | assert(State.VF.isVector() && "not widening" ); |
1415 | assert(Variant != nullptr && "Can't create vector function." ); |
1416 | |
1417 | FunctionType *VFTy = Variant->getFunctionType(); |
1418 | // Collect the call arguments, keeping scalar values for the variant's scalar parameters.
1419 | SmallVector<Value *, 4> Args; |
1420 | for (const auto &I : enumerate(First: args())) { |
1421 | Value *Arg; |
1422 | // Some vectorized function variants may also take a scalar argument, |
1423 | // e.g. linear parameters for pointers. This needs to be the scalar value |
1424 | // from the start of the respective part when interleaving. |
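     | // Illustrative sketch (assumption, not from this file): for a variant taking
     | // (ptr %p linear, <VF x i32> %x), %p receives the scalar pointer for the
     | // current part while %x receives the widened vector value.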
1425 | if (!VFTy->getParamType(i: I.index())->isVectorTy()) |
1426 | Arg = State.get(Def: I.value(), Lane: VPLane(0)); |
1427 | else |
1428 | Arg = State.get(Def: I.value(), IsScalar: onlyFirstLaneUsed(Op: I.value())); |
1429 | Args.push_back(Elt: Arg); |
1430 | } |
1431 | |
1432 | auto *CI = cast_or_null<CallInst>(Val: getUnderlyingValue()); |
1433 | SmallVector<OperandBundleDef, 1> OpBundles; |
1434 | if (CI) |
1435 | CI->getOperandBundlesAsDefs(Defs&: OpBundles); |
1436 | |
1437 | CallInst *V = State.Builder.CreateCall(Callee: Variant, Args, OpBundles); |
1438 | applyFlags(I&: *V); |
1439 | applyMetadata(I&: *V); |
1440 | V->setCallingConv(Variant->getCallingConv()); |
1441 | |
1442 | if (!V->getType()->isVoidTy()) |
1443 | State.set(Def: this, V); |
1444 | } |
1445 | |
1446 | InstructionCost VPWidenCallRecipe::computeCost(ElementCount VF, |
1447 | VPCostContext &Ctx) const { |
1448 | return Ctx.TTI.getCallInstrCost(F: nullptr, RetTy: Variant->getReturnType(), |
1449 | Tys: Variant->getFunctionType()->params(), |
1450 | CostKind: Ctx.CostKind); |
1451 | } |
1452 | |
1453 | #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) |
1454 | void VPWidenCallRecipe::print(raw_ostream &O, const Twine &Indent, |
1455 | VPSlotTracker &SlotTracker) const { |
1456 | O << Indent << "WIDEN-CALL " ; |
1457 | |
1458 | Function *CalledFn = getCalledScalarFunction(); |
1459 | if (CalledFn->getReturnType()->isVoidTy()) |
1460 | O << "void " ; |
1461 | else { |
1462 | printAsOperand(O, SlotTracker); |
1463 | O << " = " ; |
1464 | } |
1465 | |
1466 | O << "call" ; |
1467 | printFlags(O); |
1468 | O << " @" << CalledFn->getName() << "(" ; |
1469 | interleaveComma(args(), O, [&O, &SlotTracker](VPValue *Op) { |
1470 | Op->printAsOperand(O, SlotTracker); |
1471 | }); |
1472 | O << ")" ; |
1473 | |
1474 | O << " (using library function" ; |
1475 | if (Variant->hasName()) |
1476 | O << ": " << Variant->getName(); |
1477 | O << ")" ; |
1478 | } |
1479 | #endif |
1480 | |
1481 | void VPWidenIntrinsicRecipe::execute(VPTransformState &State) { |
1482 | assert(State.VF.isVector() && "not widening" ); |
1483 | |
1484 | SmallVector<Type *, 2> TysForDecl; |
1485 | // Add return type if intrinsic is overloaded on it. |
1486 | if (isVectorIntrinsicWithOverloadTypeAtArg(ID: VectorIntrinsicID, OpdIdx: -1, TTI: State.TTI)) |
1487 | TysForDecl.push_back(Elt: VectorType::get(ElementType: getResultType(), EC: State.VF)); |
1488 | SmallVector<Value *, 4> Args; |
1489 | for (const auto &I : enumerate(First: operands())) { |
1490 | // Some intrinsics have a scalar argument - don't replace it with a |
1491 | // vector. |
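     | // For example (illustrative): llvm.powi keeps its integer exponent scalar and
     | // is overloaded on the widened types, so for VF = 4 the declaration used is
     | //   declare <4 x float> @llvm.powi.v4f32.i32(<4 x float>, i32)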
1492 | Value *Arg; |
1493 | if (isVectorIntrinsicWithScalarOpAtArg(ID: VectorIntrinsicID, ScalarOpdIdx: I.index(), |
1494 | TTI: State.TTI)) |
1495 | Arg = State.get(Def: I.value(), Lane: VPLane(0)); |
1496 | else |
1497 | Arg = State.get(Def: I.value(), IsScalar: onlyFirstLaneUsed(Op: I.value())); |
1498 | if (isVectorIntrinsicWithOverloadTypeAtArg(ID: VectorIntrinsicID, OpdIdx: I.index(), |
1499 | TTI: State.TTI)) |
1500 | TysForDecl.push_back(Elt: Arg->getType()); |
1501 | Args.push_back(Elt: Arg); |
1502 | } |
1503 | |
1504 | // Use vector version of the intrinsic. |
1505 | Module *M = State.Builder.GetInsertBlock()->getModule(); |
1506 | Function *VectorF = |
1507 | Intrinsic::getOrInsertDeclaration(M, id: VectorIntrinsicID, Tys: TysForDecl); |
1508 | assert(VectorF && |
1509 | "Can't retrieve vector intrinsic or vector-predication intrinsics." ); |
1510 | |
1511 | auto *CI = cast_or_null<CallInst>(Val: getUnderlyingValue()); |
1512 | SmallVector<OperandBundleDef, 1> OpBundles; |
1513 | if (CI) |
1514 | CI->getOperandBundlesAsDefs(Defs&: OpBundles); |
1515 | |
1516 | CallInst *V = State.Builder.CreateCall(Callee: VectorF, Args, OpBundles); |
1517 | |
1518 | applyFlags(I&: *V); |
1519 | applyMetadata(I&: *V); |
1520 | |
1521 | if (!V->getType()->isVoidTy()) |
1522 | State.set(Def: this, V); |
1523 | } |
1524 | |
1525 | InstructionCost VPWidenIntrinsicRecipe::computeCost(ElementCount VF, |
1526 | VPCostContext &Ctx) const { |
1527 | // Some backends analyze intrinsic arguments to determine cost. Use the |
1528 | // underlying value for the operand if it has one. Otherwise try to use the |
1529 | // operand of the underlying call instruction, if there is one. Otherwise |
1530 | // clear Arguments. |
1531 | // TODO: Rework TTI interface to be independent of concrete IR values. |
1532 | SmallVector<const Value *> Arguments; |
1533 | for (const auto &[Idx, Op] : enumerate(First: operands())) { |
1534 | auto *V = Op->getUnderlyingValue(); |
1535 | if (!V) { |
1536 | if (auto *UI = dyn_cast_or_null<CallBase>(Val: getUnderlyingValue())) { |
1537 | Arguments.push_back(Elt: UI->getArgOperand(i: Idx)); |
1538 | continue; |
1539 | } |
1540 | Arguments.clear(); |
1541 | break; |
1542 | } |
1543 | Arguments.push_back(Elt: V); |
1544 | } |
1545 | |
1546 | Type *RetTy = toVectorizedTy(Ty: Ctx.Types.inferScalarType(V: this), EC: VF); |
1547 | SmallVector<Type *> ParamTys; |
1548 | for (unsigned I = 0; I != getNumOperands(); ++I) |
1549 | ParamTys.push_back( |
1550 | Elt: toVectorTy(Scalar: Ctx.Types.inferScalarType(V: getOperand(N: I)), EC: VF)); |
1551 | |
1552 | // TODO: Rework TTI interface to avoid reliance on underlying IntrinsicInst. |
1553 | FastMathFlags FMF = hasFastMathFlags() ? getFastMathFlags() : FastMathFlags(); |
1554 | IntrinsicCostAttributes CostAttrs( |
1555 | VectorIntrinsicID, RetTy, Arguments, ParamTys, FMF, |
1556 | dyn_cast_or_null<IntrinsicInst>(Val: getUnderlyingValue()), |
1557 | InstructionCost::getInvalid(), &Ctx.TLI); |
1558 | return Ctx.TTI.getIntrinsicInstrCost(ICA: CostAttrs, CostKind: Ctx.CostKind); |
1559 | } |
1560 | |
1561 | StringRef VPWidenIntrinsicRecipe::getIntrinsicName() const { |
1562 | return Intrinsic::getBaseName(id: VectorIntrinsicID); |
1563 | } |
1564 | |
1565 | bool VPWidenIntrinsicRecipe::onlyFirstLaneUsed(const VPValue *Op) const { |
1566 | assert(is_contained(operands(), Op) && "Op must be an operand of the recipe" ); |
1567 | return all_of(Range: enumerate(First: operands()), P: [this, &Op](const auto &X) { |
1568 | auto [Idx, V] = X; |
1569 | return V != Op || isVectorIntrinsicWithScalarOpAtArg(getVectorIntrinsicID(), |
1570 | Idx, nullptr); |
1571 | }); |
1572 | } |
1573 | |
1574 | #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) |
1575 | void VPWidenIntrinsicRecipe::print(raw_ostream &O, const Twine &Indent, |
1576 | VPSlotTracker &SlotTracker) const { |
1577 | O << Indent << "WIDEN-INTRINSIC " ; |
1578 | if (ResultTy->isVoidTy()) { |
1579 | O << "void " ; |
1580 | } else { |
1581 | printAsOperand(O, SlotTracker); |
1582 | O << " = " ; |
1583 | } |
1584 | |
1585 | O << "call" ; |
1586 | printFlags(O); |
1587 | O << getIntrinsicName() << "(" ; |
1588 | |
1589 | interleaveComma(operands(), O, [&O, &SlotTracker](VPValue *Op) { |
1590 | Op->printAsOperand(O, SlotTracker); |
1591 | }); |
1592 | O << ")" ; |
1593 | } |
1594 | #endif |
1595 | |
1596 | void VPHistogramRecipe::execute(VPTransformState &State) { |
1597 | IRBuilderBase &Builder = State.Builder; |
1598 | |
1599 | Value *Address = State.get(Def: getOperand(N: 0)); |
1600 | Value *IncAmt = State.get(Def: getOperand(N: 1), /*IsScalar=*/true); |
1601 | VectorType *VTy = cast<VectorType>(Val: Address->getType()); |
1602 | |
1603 | // The histogram intrinsic requires a mask even if the recipe doesn't; |
1604 | // if the mask operand was omitted then all lanes should be executed and |
1605 | // we just need to synthesize an all-true mask. |
1606 | Value *Mask = nullptr; |
1607 | if (VPValue *VPMask = getMask()) |
1608 | Mask = State.get(Def: VPMask); |
1609 | else |
1610 | Mask = |
1611 | Builder.CreateVectorSplat(EC: VTy->getElementCount(), V: Builder.getInt1(V: 1)); |
1612 | |
1613 | // If this is a subtract, we want to invert the increment amount. We may |
1614 | // add a separate intrinsic in future, but for now we'll try this. |
1615 | if (Opcode == Instruction::Sub) |
1616 | IncAmt = Builder.CreateNeg(V: IncAmt); |
1617 | else |
1618 | assert(Opcode == Instruction::Add && "only add or sub supported for now" ); |
1619 | |
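     | // Illustrative shape of the emitted call (assuming VF = 4 and i32 increments):
     | //   call void @llvm.experimental.vector.histogram.add.v4p0.i32(
     | //       <4 x ptr> %buckets, i32 %inc, <4 x i1> %mask)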
1620 | State.Builder.CreateIntrinsic(ID: Intrinsic::experimental_vector_histogram_add, |
1621 | Types: {VTy, IncAmt->getType()}, |
1622 | Args: {Address, IncAmt, Mask}); |
1623 | } |
1624 | |
1625 | InstructionCost VPHistogramRecipe::computeCost(ElementCount VF, |
1626 | VPCostContext &Ctx) const { |
1627 | // FIXME: Take the gather and scatter into account as well. For now we're |
1628 | // generating the same cost as the fallback path, but we'll likely |
1629 | // need to create a new TTI method for determining the cost, including |
1630 | // whether we can use base + vec-of-smaller-indices or just |
1631 | // vec-of-pointers. |
1632 | assert(VF.isVector() && "Invalid VF for histogram cost" ); |
1633 | Type *AddressTy = Ctx.Types.inferScalarType(V: getOperand(N: 0)); |
1634 | VPValue *IncAmt = getOperand(N: 1); |
1635 | Type *IncTy = Ctx.Types.inferScalarType(V: IncAmt); |
1636 | VectorType *VTy = VectorType::get(ElementType: IncTy, EC: VF); |
1637 | |
1638 | // Assume that a non-constant update value (or a constant != 1) requires |
1639 | // a multiply, and add that into the cost. |
1640 | InstructionCost MulCost = |
1641 | Ctx.TTI.getArithmeticInstrCost(Opcode: Instruction::Mul, Ty: VTy, CostKind: Ctx.CostKind); |
1642 | if (IncAmt->isLiveIn()) { |
1643 | ConstantInt *CI = dyn_cast<ConstantInt>(Val: IncAmt->getLiveInIRValue()); |
1644 | |
1645 | if (CI && CI->getZExtValue() == 1) |
1646 | MulCost = TTI::TCC_Free; |
1647 | } |
1648 | |
1649 | // Find the cost of the histogram operation itself. |
1650 | Type *PtrTy = VectorType::get(ElementType: AddressTy, EC: VF); |
1651 | Type *MaskTy = VectorType::get(ElementType: Type::getInt1Ty(C&: Ctx.LLVMCtx), EC: VF); |
1652 | IntrinsicCostAttributes ICA(Intrinsic::experimental_vector_histogram_add, |
1653 | Type::getVoidTy(C&: Ctx.LLVMCtx), |
1654 | {PtrTy, IncTy, MaskTy}); |
1655 | |
1656 | // Add the costs together with the add/sub operation. |
1657 | return Ctx.TTI.getIntrinsicInstrCost(ICA, CostKind: Ctx.CostKind) + MulCost + |
1658 | Ctx.TTI.getArithmeticInstrCost(Opcode, Ty: VTy, CostKind: Ctx.CostKind); |
1659 | } |
1660 | |
1661 | #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) |
1662 | void VPHistogramRecipe::print(raw_ostream &O, const Twine &Indent, |
1663 | VPSlotTracker &SlotTracker) const { |
1664 | O << Indent << "WIDEN-HISTOGRAM buckets: " ; |
1665 | getOperand(0)->printAsOperand(O, SlotTracker); |
1666 | |
1667 | if (Opcode == Instruction::Sub) |
1668 | O << ", dec: " ; |
1669 | else { |
1670 | assert(Opcode == Instruction::Add); |
1671 | O << ", inc: " ; |
1672 | } |
1673 | getOperand(1)->printAsOperand(O, SlotTracker); |
1674 | |
1675 | if (VPValue *Mask = getMask()) { |
1676 | O << ", mask: " ; |
1677 | Mask->printAsOperand(O, SlotTracker); |
1678 | } |
1679 | } |
1680 | |
1681 | void VPWidenSelectRecipe::print(raw_ostream &O, const Twine &Indent, |
1682 | VPSlotTracker &SlotTracker) const { |
1683 | O << Indent << "WIDEN-SELECT " ; |
1684 | printAsOperand(O, SlotTracker); |
1685 | O << " = select " ; |
1686 | printFlags(O); |
1687 | getOperand(0)->printAsOperand(O, SlotTracker); |
1688 | O << ", " ; |
1689 | getOperand(1)->printAsOperand(O, SlotTracker); |
1690 | O << ", " ; |
1691 | getOperand(2)->printAsOperand(O, SlotTracker); |
1692 | O << (isInvariantCond() ? " (condition is loop invariant)" : "" ); |
1693 | } |
1694 | #endif |
1695 | |
1696 | void VPWidenSelectRecipe::execute(VPTransformState &State) { |
1697 | // The condition can be loop invariant but still defined inside the |
1698 | // loop. This means that we can't just use the original 'cond' value. |
1699 | // We have to take the 'vectorized' value and pick the first lane. |
1700 | // Instcombine will make this a no-op. |
1701 | auto *InvarCond = |
1702 | isInvariantCond() ? State.get(Def: getCond(), Lane: VPLane(0)) : nullptr; |
1703 | |
1704 | Value *Cond = InvarCond ? InvarCond : State.get(Def: getCond()); |
1705 | Value *Op0 = State.get(Def: getOperand(N: 1)); |
1706 | Value *Op1 = State.get(Def: getOperand(N: 2)); |
1707 | Value *Sel = State.Builder.CreateSelect(C: Cond, True: Op0, False: Op1); |
1708 | State.set(Def: this, V: Sel); |
1709 | if (auto *I = dyn_cast<Instruction>(Val: Sel)) { |
1710 | if (isa<FPMathOperator>(Val: I)) |
1711 | applyFlags(I&: *I); |
1712 | applyMetadata(I&: *I); |
1713 | } |
1714 | } |
1715 | |
1716 | InstructionCost VPWidenSelectRecipe::computeCost(ElementCount VF, |
1717 | VPCostContext &Ctx) const { |
1718 | SelectInst *SI = cast<SelectInst>(Val: getUnderlyingValue()); |
1719 | bool ScalarCond = getOperand(N: 0)->isDefinedOutsideLoopRegions(); |
1720 | Type *ScalarTy = Ctx.Types.inferScalarType(V: this); |
1721 | Type *VectorTy = toVectorTy(Scalar: Ctx.Types.inferScalarType(V: this), EC: VF); |
1722 | |
1723 | VPValue *Op0, *Op1; |
1724 | using namespace llvm::VPlanPatternMatch; |
1725 | if (!ScalarCond && ScalarTy->getScalarSizeInBits() == 1 && |
1726 | (match(V: this, P: m_LogicalAnd(Op0: m_VPValue(V&: Op0), Op1: m_VPValue(V&: Op1))) || |
1727 | match(V: this, P: m_LogicalOr(Op0: m_VPValue(V&: Op0), Op1: m_VPValue(V&: Op1))))) { |
1728 | // select x, y, false --> x & y |
1729 | // select x, true, y --> x | y |
1730 | const auto [Op1VK, Op1VP] = Ctx.getOperandInfo(V: Op0); |
1731 | const auto [Op2VK, Op2VP] = Ctx.getOperandInfo(V: Op1); |
1732 | |
1733 | SmallVector<const Value *, 2> Operands; |
1734 | if (all_of(Range: operands(), |
1735 | P: [](VPValue *Op) { return Op->getUnderlyingValue(); })) |
1736 | Operands.append(in_start: SI->op_begin(), in_end: SI->op_end()); |
1737 | bool IsLogicalOr = match(V: this, P: m_LogicalOr(Op0: m_VPValue(V&: Op0), Op1: m_VPValue(V&: Op1))); |
1738 | return Ctx.TTI.getArithmeticInstrCost( |
1739 | Opcode: IsLogicalOr ? Instruction::Or : Instruction::And, Ty: VectorTy, |
1740 | CostKind: Ctx.CostKind, Opd1Info: {.Kind: Op1VK, .Properties: Op1VP}, Opd2Info: {.Kind: Op2VK, .Properties: Op2VP}, Args: Operands, CxtI: SI); |
1741 | } |
1742 | |
1743 | Type *CondTy = Ctx.Types.inferScalarType(V: getOperand(N: 0)); |
1744 | if (!ScalarCond) |
1745 | CondTy = VectorType::get(ElementType: CondTy, EC: VF); |
1746 | |
1747 | CmpInst::Predicate Pred = CmpInst::BAD_ICMP_PREDICATE; |
1748 | if (auto *Cmp = dyn_cast<CmpInst>(Val: SI->getCondition())) |
1749 | Pred = Cmp->getPredicate(); |
1750 | return Ctx.TTI.getCmpSelInstrCost( |
1751 | Opcode: Instruction::Select, ValTy: VectorTy, CondTy, VecPred: Pred, CostKind: Ctx.CostKind, |
1752 | Op1Info: {.Kind: TTI::OK_AnyValue, .Properties: TTI::OP_None}, Op2Info: {.Kind: TTI::OK_AnyValue, .Properties: TTI::OP_None}, I: SI); |
1753 | } |
1754 | |
1755 | VPIRFlags::FastMathFlagsTy::FastMathFlagsTy(const FastMathFlags &FMF) { |
1756 | AllowReassoc = FMF.allowReassoc(); |
1757 | NoNaNs = FMF.noNaNs(); |
1758 | NoInfs = FMF.noInfs(); |
1759 | NoSignedZeros = FMF.noSignedZeros(); |
1760 | AllowReciprocal = FMF.allowReciprocal(); |
1761 | AllowContract = FMF.allowContract(); |
1762 | ApproxFunc = FMF.approxFunc(); |
1763 | } |
1764 | |
1765 | #if !defined(NDEBUG) |
1766 | bool VPIRFlags::flagsValidForOpcode(unsigned Opcode) const { |
1767 | switch (OpType) { |
1768 | case OperationType::OverflowingBinOp: |
1769 | return Opcode == Instruction::Add || Opcode == Instruction::Sub || |
1770 | Opcode == Instruction::Mul || |
1771 | Opcode == VPInstruction::CanonicalIVIncrementForPart;
1772 | case OperationType::DisjointOp: |
1773 | return Opcode == Instruction::Or; |
1774 | case OperationType::PossiblyExactOp: |
1775 | return Opcode == Instruction::AShr; |
1776 | case OperationType::GEPOp: |
1777 | return Opcode == Instruction::GetElementPtr || |
1778 | Opcode == VPInstruction::PtrAdd; |
1779 | case OperationType::FPMathOp: |
1780 | return Opcode == Instruction::FAdd || Opcode == Instruction::FMul || |
1781 | Opcode == Instruction::FSub || Opcode == Instruction::FNeg || |
1782 | Opcode == Instruction::FDiv || Opcode == Instruction::FRem || |
1783 | Opcode == Instruction::FCmp || Opcode == Instruction::Select || |
1784 | Opcode == VPInstruction::WideIVStep || |
1785 | Opcode == VPInstruction::ReductionStartVector || |
1786 | Opcode == VPInstruction::ComputeReductionResult; |
1787 | case OperationType::NonNegOp: |
1788 | return Opcode == Instruction::ZExt;
1790 | case OperationType::Cmp: |
1791 | return Opcode == Instruction::ICmp; |
1792 | case OperationType::Other: |
1793 | return true; |
1794 | } |
1795 | llvm_unreachable("Unknown OperationType enum" ); |
1796 | } |
1797 | #endif |
1798 | |
1799 | #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) |
1800 | void VPIRFlags::printFlags(raw_ostream &O) const { |
1801 | switch (OpType) { |
1802 | case OperationType::Cmp: |
1803 | O << " " << CmpInst::getPredicateName(getPredicate()); |
1804 | break; |
1805 | case OperationType::DisjointOp: |
1806 | if (DisjointFlags.IsDisjoint) |
1807 | O << " disjoint" ; |
1808 | break; |
1809 | case OperationType::PossiblyExactOp: |
1810 | if (ExactFlags.IsExact) |
1811 | O << " exact" ; |
1812 | break; |
1813 | case OperationType::OverflowingBinOp: |
1814 | if (WrapFlags.HasNUW) |
1815 | O << " nuw" ; |
1816 | if (WrapFlags.HasNSW) |
1817 | O << " nsw" ; |
1818 | break; |
1819 | case OperationType::FPMathOp: |
1820 | getFastMathFlags().print(O); |
1821 | break; |
1822 | case OperationType::GEPOp: |
1823 | if (GEPFlags.isInBounds()) |
1824 | O << " inbounds" ; |
1825 | else if (GEPFlags.hasNoUnsignedSignedWrap()) |
1826 | O << " nusw" ; |
1827 | if (GEPFlags.hasNoUnsignedWrap()) |
1828 | O << " nuw" ; |
1829 | break; |
1830 | case OperationType::NonNegOp: |
1831 | if (NonNegFlags.NonNeg) |
1832 | O << " nneg" ; |
1833 | break; |
1834 | case OperationType::Other: |
1835 | break; |
1836 | } |
1837 | O << " " ; |
1838 | } |
1839 | #endif |
1840 | |
1841 | void VPWidenRecipe::execute(VPTransformState &State) { |
1842 | auto &Builder = State.Builder; |
1843 | switch (Opcode) { |
1844 | case Instruction::Call: |
1845 | case Instruction::Br: |
1846 | case Instruction::PHI: |
1847 | case Instruction::GetElementPtr: |
1848 | case Instruction::Select: |
1849 | llvm_unreachable("This instruction is handled by a different recipe." ); |
1850 | case Instruction::UDiv: |
1851 | case Instruction::SDiv: |
1852 | case Instruction::SRem: |
1853 | case Instruction::URem: |
1854 | case Instruction::Add: |
1855 | case Instruction::FAdd: |
1856 | case Instruction::Sub: |
1857 | case Instruction::FSub: |
1858 | case Instruction::FNeg: |
1859 | case Instruction::Mul: |
1860 | case Instruction::FMul: |
1861 | case Instruction::FDiv: |
1862 | case Instruction::FRem: |
1863 | case Instruction::Shl: |
1864 | case Instruction::LShr: |
1865 | case Instruction::AShr: |
1866 | case Instruction::And: |
1867 | case Instruction::Or: |
1868 | case Instruction::Xor: { |
1869 | // Just widen unops and binops. |
1870 | SmallVector<Value *, 2> Ops; |
1871 | for (VPValue *VPOp : operands()) |
1872 | Ops.push_back(Elt: State.get(Def: VPOp)); |
1873 | |
1874 | Value *V = Builder.CreateNAryOp(Opc: Opcode, Ops); |
1875 | |
1876 | if (auto *VecOp = dyn_cast<Instruction>(Val: V)) { |
1877 | applyFlags(I&: *VecOp); |
1878 | applyMetadata(I&: *VecOp); |
1879 | } |
1880 | |
1881 | // Use this vector value for all users of the original instruction. |
1882 | State.set(Def: this, V); |
1883 | break; |
1884 | } |
1885 | case Instruction::ExtractValue: { |
1886 | assert(getNumOperands() == 2 && "expected single level extractvalue" ); |
1887 | Value *Op = State.get(Def: getOperand(N: 0)); |
1888 | auto *CI = cast<ConstantInt>(Val: getOperand(N: 1)->getLiveInIRValue()); |
1889 | Value *Extract = Builder.CreateExtractValue(Agg: Op, Idxs: CI->getZExtValue());
1890 | State.set(Def: this, V: Extract); |
1891 | break; |
1892 | } |
1893 | case Instruction::Freeze: { |
1894 | Value *Op = State.get(Def: getOperand(N: 0)); |
1895 | Value *Freeze = Builder.CreateFreeze(V: Op); |
1896 | State.set(Def: this, V: Freeze); |
1897 | break; |
1898 | } |
1899 | case Instruction::ICmp: |
1900 | case Instruction::FCmp: { |
1901 | // Widen compares. Generate vector compares. |
1902 | bool FCmp = Opcode == Instruction::FCmp; |
1903 | Value *A = State.get(Def: getOperand(N: 0)); |
1904 | Value *B = State.get(Def: getOperand(N: 1)); |
1905 | Value *C = nullptr; |
1906 | if (FCmp) { |
1907 | // Propagate fast math flags. |
1908 | C = Builder.CreateFCmpFMF( |
1909 | P: getPredicate(), LHS: A, RHS: B, |
1910 | FMFSource: dyn_cast_or_null<Instruction>(Val: getUnderlyingValue())); |
1911 | } else { |
1912 | C = Builder.CreateICmp(P: getPredicate(), LHS: A, RHS: B); |
1913 | } |
1914 | if (auto *I = dyn_cast<Instruction>(Val: C)) |
1915 | applyMetadata(I&: *I); |
1916 | State.set(Def: this, V: C); |
1917 | break; |
1918 | } |
1919 | default: |
1920 | // This instruction is not vectorized by simple widening. |
1921 | LLVM_DEBUG(dbgs() << "LV: Found an unhandled opcode : " |
1922 | << Instruction::getOpcodeName(Opcode)); |
1923 | llvm_unreachable("Unhandled instruction!" ); |
1924 | } // end of switch. |
1925 | |
1926 | #if !defined(NDEBUG) |
1927 | // Verify that VPlan type inference results agree with the type of the |
1928 | // generated values. |
1929 | assert(VectorType::get(State.TypeAnalysis.inferScalarType(this), State.VF) == |
1930 | State.get(this)->getType() && |
1931 | "inferred type and type from generated instructions do not match" ); |
1932 | #endif |
1933 | } |
1934 | |
1935 | InstructionCost VPWidenRecipe::computeCost(ElementCount VF, |
1936 | VPCostContext &Ctx) const { |
1937 | switch (Opcode) { |
1938 | case Instruction::FNeg: { |
1939 | Type *VectorTy = toVectorTy(Scalar: Ctx.Types.inferScalarType(V: this), EC: VF); |
1940 | return Ctx.TTI.getArithmeticInstrCost( |
1941 | Opcode, Ty: VectorTy, CostKind: Ctx.CostKind, |
1942 | Opd1Info: {.Kind: TargetTransformInfo::OK_AnyValue, .Properties: TargetTransformInfo::OP_None}, |
1943 | Opd2Info: {.Kind: TargetTransformInfo::OK_AnyValue, .Properties: TargetTransformInfo::OP_None}); |
1944 | } |
1945 | |
1946 | case Instruction::UDiv: |
1947 | case Instruction::SDiv: |
1948 | case Instruction::SRem: |
1949 | case Instruction::URem: |
1950 | // More complex computation; let the legacy cost model handle this for now.
1951 | return Ctx.getLegacyCost(UI: cast<Instruction>(Val: getUnderlyingValue()), VF); |
1952 | case Instruction::Add: |
1953 | case Instruction::FAdd: |
1954 | case Instruction::Sub: |
1955 | case Instruction::FSub: |
1956 | case Instruction::Mul: |
1957 | case Instruction::FMul: |
1958 | case Instruction::FDiv: |
1959 | case Instruction::FRem: |
1960 | case Instruction::Shl: |
1961 | case Instruction::LShr: |
1962 | case Instruction::AShr: |
1963 | case Instruction::And: |
1964 | case Instruction::Or: |
1965 | case Instruction::Xor: { |
1966 | VPValue *RHS = getOperand(N: 1); |
1967 | // Certain instructions can be cheaper to vectorize if they have a constant |
1968 | // second vector operand. One example of this are shifts on x86. |
1969 | TargetTransformInfo::OperandValueInfo RHSInfo = Ctx.getOperandInfo(V: RHS); |
1970 | |
1971 | if (RHSInfo.Kind == TargetTransformInfo::OK_AnyValue && |
1972 | getOperand(N: 1)->isDefinedOutsideLoopRegions()) |
1973 | RHSInfo.Kind = TargetTransformInfo::OK_UniformValue; |
1974 | Type *VectorTy = toVectorTy(Scalar: Ctx.Types.inferScalarType(V: this), EC: VF); |
1975 | Instruction *CtxI = dyn_cast_or_null<Instruction>(Val: getUnderlyingValue()); |
1976 | |
1977 | SmallVector<const Value *, 4> Operands; |
1978 | if (CtxI) |
1979 | Operands.append(in_start: CtxI->value_op_begin(), in_end: CtxI->value_op_end()); |
1980 | return Ctx.TTI.getArithmeticInstrCost( |
1981 | Opcode, Ty: VectorTy, CostKind: Ctx.CostKind, |
1982 | Opd1Info: {.Kind: TargetTransformInfo::OK_AnyValue, .Properties: TargetTransformInfo::OP_None}, |
1983 | Opd2Info: RHSInfo, Args: Operands, CxtI: CtxI, TLibInfo: &Ctx.TLI); |
1984 | } |
1985 | case Instruction::Freeze: { |
1986 | // The cost of freeze is not modeled; conservatively assume it matches a 'mul'.
1987 | Type *VectorTy = toVectorTy(Scalar: Ctx.Types.inferScalarType(V: this), EC: VF); |
1988 | return Ctx.TTI.getArithmeticInstrCost(Opcode: Instruction::Mul, Ty: VectorTy, |
1989 | CostKind: Ctx.CostKind); |
1990 | } |
1991 | case Instruction::ExtractValue: { |
1992 | return Ctx.TTI.getInsertExtractValueCost(Opcode: Instruction::ExtractValue, |
1993 | CostKind: Ctx.CostKind); |
1994 | } |
1995 | case Instruction::ICmp: |
1996 | case Instruction::FCmp: { |
1997 | Instruction *CtxI = dyn_cast_or_null<Instruction>(Val: getUnderlyingValue()); |
1998 | Type *VectorTy = toVectorTy(Scalar: Ctx.Types.inferScalarType(V: getOperand(N: 0)), EC: VF); |
1999 | return Ctx.TTI.getCmpSelInstrCost( |
2000 | Opcode, ValTy: VectorTy, CondTy: CmpInst::makeCmpResultType(opnd_type: VectorTy), VecPred: getPredicate(), |
2001 | CostKind: Ctx.CostKind, Op1Info: {.Kind: TTI::OK_AnyValue, .Properties: TTI::OP_None}, |
2002 | Op2Info: {.Kind: TTI::OK_AnyValue, .Properties: TTI::OP_None}, I: CtxI); |
2003 | } |
2004 | default: |
2005 | llvm_unreachable("Unsupported opcode for instruction" ); |
2006 | } |
2007 | } |
2008 | |
2009 | #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) |
2010 | void VPWidenRecipe::print(raw_ostream &O, const Twine &Indent, |
2011 | VPSlotTracker &SlotTracker) const { |
2012 | O << Indent << "WIDEN " ; |
2013 | printAsOperand(O, SlotTracker); |
2014 | O << " = " << Instruction::getOpcodeName(Opcode); |
2015 | printFlags(O); |
2016 | printOperands(O, SlotTracker); |
2017 | } |
2018 | #endif |
2019 | |
2020 | void VPWidenCastRecipe::execute(VPTransformState &State) { |
2021 | auto &Builder = State.Builder; |
2022 | // Vectorize casts.
2023 | assert(State.VF.isVector() && "Not vectorizing?" ); |
2024 | Type *DestTy = VectorType::get(ElementType: getResultType(), EC: State.VF); |
2025 | VPValue *Op = getOperand(N: 0); |
2026 | Value *A = State.get(Def: Op); |
2027 | Value *Cast = Builder.CreateCast(Op: Instruction::CastOps(Opcode), V: A, DestTy); |
2028 | State.set(Def: this, V: Cast); |
2029 | if (auto *CastOp = dyn_cast<Instruction>(Val: Cast)) { |
2030 | applyFlags(I&: *CastOp); |
2031 | applyMetadata(I&: *CastOp); |
2032 | } |
2033 | } |
2034 | |
2035 | InstructionCost VPWidenCastRecipe::computeCost(ElementCount VF, |
2036 | VPCostContext &Ctx) const { |
2037 | // TODO: In some cases, VPWidenCastRecipes are created but not considered in |
2038 | // the legacy cost model, including truncates/extends when evaluating a |
2039 | // reduction in a smaller type. |
2040 | if (!getUnderlyingValue()) |
2041 | return 0; |
2042 | // Computes the CastContextHint from a recipe that may access memory.
2043 | auto ComputeCCH = [&](const VPRecipeBase *R) -> TTI::CastContextHint { |
2044 | if (VF.isScalar()) |
2045 | return TTI::CastContextHint::Normal; |
2046 | if (isa<VPInterleaveRecipe>(Val: R)) |
2047 | return TTI::CastContextHint::Interleave; |
2048 | if (const auto *ReplicateRecipe = dyn_cast<VPReplicateRecipe>(Val: R)) |
2049 | return ReplicateRecipe->isPredicated() ? TTI::CastContextHint::Masked |
2050 | : TTI::CastContextHint::Normal; |
2051 | const auto *WidenMemoryRecipe = dyn_cast<VPWidenMemoryRecipe>(Val: R); |
2052 | if (WidenMemoryRecipe == nullptr) |
2053 | return TTI::CastContextHint::None; |
2054 | if (!WidenMemoryRecipe->isConsecutive()) |
2055 | return TTI::CastContextHint::GatherScatter; |
2056 | if (WidenMemoryRecipe->isReverse()) |
2057 | return TTI::CastContextHint::Reversed; |
2058 | if (WidenMemoryRecipe->isMasked()) |
2059 | return TTI::CastContextHint::Masked; |
2060 | return TTI::CastContextHint::Normal; |
2061 | }; |
2062 | |
2063 | VPValue *Operand = getOperand(N: 0); |
2064 | TTI::CastContextHint CCH = TTI::CastContextHint::None; |
2065 | // For Trunc/FPTrunc, get the context from the only user. |
2066 | if ((Opcode == Instruction::Trunc || Opcode == Instruction::FPTrunc) && |
2067 | !hasMoreThanOneUniqueUser() && getNumUsers() > 0) { |
2068 | if (auto *StoreRecipe = dyn_cast<VPRecipeBase>(Val: *user_begin())) |
2069 | CCH = ComputeCCH(StoreRecipe); |
2070 | } |
2071 | // For Z/Sext, get the context from the operand. |
2072 | else if (Opcode == Instruction::ZExt || Opcode == Instruction::SExt || |
2073 | Opcode == Instruction::FPExt) { |
2074 | if (Operand->isLiveIn()) |
2075 | CCH = TTI::CastContextHint::Normal; |
2076 | else if (Operand->getDefiningRecipe()) |
2077 | CCH = ComputeCCH(Operand->getDefiningRecipe()); |
2078 | } |
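     | // E.g. (illustrative): a sign-extend whose operand is a consecutive, masked
     | // widened load gets CastContextHint::Masked, letting targets cost it as part
     | // of an extending masked load rather than as a standalone cast.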
2079 | |
2080 | auto *SrcTy = |
2081 | cast<VectorType>(Val: toVectorTy(Scalar: Ctx.Types.inferScalarType(V: Operand), EC: VF)); |
2082 | auto *DestTy = cast<VectorType>(Val: toVectorTy(Scalar: getResultType(), EC: VF)); |
2083 | // Arm TTI will use the underlying instruction to determine the cost. |
2084 | return Ctx.TTI.getCastInstrCost( |
2085 | Opcode, Dst: DestTy, Src: SrcTy, CCH, CostKind: Ctx.CostKind, |
2086 | I: dyn_cast_if_present<Instruction>(Val: getUnderlyingValue())); |
2087 | } |
2088 | |
2089 | #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) |
2090 | void VPWidenCastRecipe::print(raw_ostream &O, const Twine &Indent, |
2091 | VPSlotTracker &SlotTracker) const { |
2092 | O << Indent << "WIDEN-CAST " ; |
2093 | printAsOperand(O, SlotTracker); |
2094 | O << " = " << Instruction::getOpcodeName(Opcode); |
2095 | printFlags(O); |
2096 | printOperands(O, SlotTracker); |
2097 | O << " to " << *getResultType(); |
2098 | } |
2099 | #endif |
2100 | |
2101 | InstructionCost VPHeaderPHIRecipe::computeCost(ElementCount VF,
2102 | VPCostContext &Ctx) const { |
2103 | return Ctx.TTI.getCFInstrCost(Opcode: Instruction::PHI, CostKind: Ctx.CostKind); |
2104 | } |
2105 | |
2106 | /// A helper function that returns an integer or floating-point constant with |
2107 | /// value C. |
2108 | static Constant *getSignedIntOrFpConstant(Type *Ty, int64_t C) { |
2109 | return Ty->isIntegerTy() ? ConstantInt::getSigned(Ty, V: C) |
2110 | : ConstantFP::get(Ty, V: C); |
2111 | } |
2112 | |
2113 | #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) |
2114 | void VPWidenIntOrFpInductionRecipe::print(raw_ostream &O, const Twine &Indent, |
2115 | VPSlotTracker &SlotTracker) const { |
2116 | O << Indent; |
2117 | printAsOperand(O, SlotTracker); |
2118 | O << " = WIDEN-INDUCTION " ; |
2119 | printOperands(O, SlotTracker); |
2120 | |
2121 | if (auto *TI = getTruncInst()) |
2122 | O << " (truncated to " << *TI->getType() << ")" ; |
2123 | } |
2124 | #endif |
2125 | |
2126 | bool VPWidenIntOrFpInductionRecipe::isCanonical() const { |
2127 | // The step may be defined by a recipe in the preheader (e.g. if it requires |
2128 | // SCEV expansion), but for the canonical induction the step is required to be |
2129 | // 1, which is represented as live-in. |
2130 | if (getStepValue()->getDefiningRecipe()) |
2131 | return false; |
2132 | auto *StepC = dyn_cast<ConstantInt>(Val: getStepValue()->getLiveInIRValue()); |
2133 | auto *StartC = dyn_cast<ConstantInt>(Val: getStartValue()->getLiveInIRValue()); |
2134 | auto *CanIV = cast<VPCanonicalIVPHIRecipe>(Val: &*getParent()->begin()); |
2135 | return StartC && StartC->isZero() && StepC && StepC->isOne() && |
2136 | getScalarType() == CanIV->getScalarType(); |
2137 | } |
2138 | |
2139 | #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) |
2140 | void VPDerivedIVRecipe::print(raw_ostream &O, const Twine &Indent, |
2141 | VPSlotTracker &SlotTracker) const { |
2142 | O << Indent; |
2143 | printAsOperand(O, SlotTracker); |
2144 | O << " = DERIVED-IV " ; |
2145 | getStartValue()->printAsOperand(O, SlotTracker); |
2146 | O << " + " ; |
2147 | getOperand(1)->printAsOperand(O, SlotTracker); |
2148 | O << " * " ; |
2149 | getStepValue()->printAsOperand(O, SlotTracker); |
2150 | } |
2151 | #endif |
2152 | |
2153 | void VPScalarIVStepsRecipe::execute(VPTransformState &State) { |
2154 | // Fast-math-flags propagate from the original induction instruction. |
2155 | IRBuilder<>::FastMathFlagGuard FMFG(State.Builder); |
2156 | if (hasFastMathFlags()) |
2157 | State.Builder.setFastMathFlags(getFastMathFlags()); |
2158 | |
2159 | // Compute scalar induction steps: BaseIV is the scalar induction value on
2160 | // which to base the steps, and Step is the size of each step.
2161 | |
2162 | Value *BaseIV = State.get(Def: getOperand(N: 0), Lane: VPLane(0)); |
2163 | Value *Step = State.get(Def: getStepValue(), Lane: VPLane(0)); |
2164 | IRBuilderBase &Builder = State.Builder; |
2165 | |
2166 | // Ensure step has the same type as that of scalar IV. |
2167 | Type *BaseIVTy = BaseIV->getType()->getScalarType(); |
2168 | assert(BaseIVTy == Step->getType() && "Types of BaseIV and Step must match!" ); |
2169 | |
2170 | // We build scalar steps for both integer and floating-point induction |
2171 | // variables. Here, we determine the kind of arithmetic we will perform. |
2172 | Instruction::BinaryOps AddOp; |
2173 | Instruction::BinaryOps MulOp; |
2174 | if (BaseIVTy->isIntegerTy()) { |
2175 | AddOp = Instruction::Add; |
2176 | MulOp = Instruction::Mul; |
2177 | } else { |
2178 | AddOp = InductionOpcode; |
2179 | MulOp = Instruction::FMul; |
2180 | } |
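     | // Each generated lane L below is therefore BaseIV AddOp ((StartIdx0 + L) MulOp
     | // Step); e.g. for an integer IV, lane L = BaseIV + (StartIdx0 + L) * Step.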
2181 | |
2182 | // Determine the number of scalars we need to generate for each unroll |
2183 | // iteration. |
2184 | bool FirstLaneOnly = vputils::onlyFirstLaneUsed(Def: this); |
2185 | // Compute the scalar steps and save the results in State. |
2186 | Type *IntStepTy = |
2187 | IntegerType::get(C&: BaseIVTy->getContext(), NumBits: BaseIVTy->getScalarSizeInBits()); |
2188 | Type *VecIVTy = nullptr; |
2189 | Value *UnitStepVec = nullptr, *SplatStep = nullptr, *SplatIV = nullptr; |
2190 | if (!FirstLaneOnly && State.VF.isScalable()) { |
2191 | VecIVTy = VectorType::get(ElementType: BaseIVTy, EC: State.VF); |
2192 | UnitStepVec = |
2193 | Builder.CreateStepVector(DstType: VectorType::get(ElementType: IntStepTy, EC: State.VF)); |
2194 | SplatStep = Builder.CreateVectorSplat(EC: State.VF, V: Step); |
2195 | SplatIV = Builder.CreateVectorSplat(EC: State.VF, V: BaseIV); |
2196 | } |
2197 | |
2198 | unsigned StartLane = 0; |
2199 | unsigned EndLane = FirstLaneOnly ? 1 : State.VF.getKnownMinValue(); |
2200 | if (State.Lane) { |
2201 | StartLane = State.Lane->getKnownLane(); |
2202 | EndLane = StartLane + 1; |
2203 | } |
2204 | Value *StartIdx0; |
2205 | if (getUnrollPart(U&: *this) == 0) |
2206 | StartIdx0 = ConstantInt::get(Ty: IntStepTy, V: 0); |
2207 | else { |
2208 | StartIdx0 = State.get(Def: getOperand(N: 2), IsScalar: true); |
2209 | if (getUnrollPart(U&: *this) != 1) { |
2210 | StartIdx0 = |
2211 | Builder.CreateMul(LHS: StartIdx0, RHS: ConstantInt::get(Ty: StartIdx0->getType(), |
2212 | V: getUnrollPart(U&: *this))); |
2213 | } |
2214 | StartIdx0 = Builder.CreateSExtOrTrunc(V: StartIdx0, DestTy: IntStepTy); |
2215 | } |
2216 | |
2217 | if (!FirstLaneOnly && State.VF.isScalable()) { |
2218 | auto *SplatStartIdx = Builder.CreateVectorSplat(EC: State.VF, V: StartIdx0); |
2219 | auto *InitVec = Builder.CreateAdd(LHS: SplatStartIdx, RHS: UnitStepVec); |
2220 | if (BaseIVTy->isFloatingPointTy()) |
2221 | InitVec = Builder.CreateSIToFP(V: InitVec, DestTy: VecIVTy); |
2222 | auto *Mul = Builder.CreateBinOp(Opc: MulOp, LHS: InitVec, RHS: SplatStep); |
2223 | auto *Add = Builder.CreateBinOp(Opc: AddOp, LHS: SplatIV, RHS: Mul); |
2224 | State.set(Def: this, V: Add); |
2225 | // It's useful to also record the per-lane values for the known minimum
2226 | // number of elements, so we do that below. This improves code quality when
2227 | // extracting the first element, for example.
2228 | } |
2229 | |
2230 | if (BaseIVTy->isFloatingPointTy()) |
2231 | StartIdx0 = Builder.CreateSIToFP(V: StartIdx0, DestTy: BaseIVTy); |
2232 | |
2233 | for (unsigned Lane = StartLane; Lane < EndLane; ++Lane) { |
2234 | Value *StartIdx = Builder.CreateBinOp( |
2235 | Opc: AddOp, LHS: StartIdx0, RHS: getSignedIntOrFpConstant(Ty: BaseIVTy, C: Lane)); |
2236 | // The step returned by `createStepForVF` is a runtime-evaluated value |
2237 | // when VF is scalable. Otherwise, it should be folded into a Constant. |
2238 | assert((State.VF.isScalable() || isa<Constant>(StartIdx)) && |
2239 | "Expected StartIdx to be folded to a constant when VF is not " |
2240 | "scalable" ); |
2241 | auto *Mul = Builder.CreateBinOp(Opc: MulOp, LHS: StartIdx, RHS: Step); |
2242 | auto *Add = Builder.CreateBinOp(Opc: AddOp, LHS: BaseIV, RHS: Mul); |
2243 | State.set(Def: this, V: Add, Lane: VPLane(Lane)); |
2244 | } |
2245 | } |
2246 | |
2247 | #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) |
2248 | void VPScalarIVStepsRecipe::print(raw_ostream &O, const Twine &Indent, |
2249 | VPSlotTracker &SlotTracker) const { |
2250 | O << Indent; |
2251 | printAsOperand(O, SlotTracker); |
2252 | O << " = SCALAR-STEPS " ; |
2253 | printOperands(O, SlotTracker); |
2254 | } |
2255 | #endif |
2256 | |
2257 | void VPWidenGEPRecipe::execute(VPTransformState &State) { |
2258 | assert(State.VF.isVector() && "not widening" ); |
2259 | auto *GEP = cast<GetElementPtrInst>(Val: getUnderlyingInstr()); |
2260 | // Construct a vector GEP by widening the operands of the scalar GEP as |
2261 | // necessary. We mark the vector GEP 'inbounds' if appropriate. A GEP |
2262 | // results in a vector of pointers when at least one operand of the GEP |
2263 | // is vector-typed. Thus, to keep the representation compact, we only use |
2264 | // vector-typed operands for loop-varying values. |
2265 | |
2266 | if (areAllOperandsInvariant()) { |
2267 | // If we are vectorizing, but the GEP has only loop-invariant operands, |
2268 | // the GEP we build (by only using vector-typed operands for |
2269 | // loop-varying values) would be a scalar pointer. Thus, to ensure we |
2270 | // produce a vector of pointers, we need to either arbitrarily pick an |
2271 | // operand to broadcast, or broadcast a clone of the original GEP. |
2272 | // Here, we broadcast a clone of the original. |
2273 | // |
2274 | // TODO: If at some point we decide to scalarize instructions having |
2275 | // loop-invariant operands, this special case will no longer be |
2276 | // required. We would add the scalarization decision to |
2277 | // collectLoopScalars() and teach getVectorValue() to broadcast |
2278 | // the lane-zero scalar value. |
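     | // Illustrative result (assumption, fixed VF = 4): a single scalar GEP such as
     | //   %g = getelementptr i32, ptr %base, i64 %idx
     | // is emitted and then splatted to <4 x ptr> (insertelement + shufflevector).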
2279 | SmallVector<Value *> Ops; |
2280 | for (unsigned I = 0, E = getNumOperands(); I != E; I++) |
2281 | Ops.push_back(Elt: State.get(Def: getOperand(N: I), Lane: VPLane(0))); |
2282 | |
2283 | auto *NewGEP = State.Builder.CreateGEP(Ty: GEP->getSourceElementType(), Ptr: Ops[0], |
2284 | IdxList: ArrayRef(Ops).drop_front(), Name: "" , |
2285 | NW: getGEPNoWrapFlags()); |
2286 | Value *Splat = State.Builder.CreateVectorSplat(EC: State.VF, V: NewGEP); |
2287 | State.set(Def: this, V: Splat); |
2288 | } else { |
2289 | // If the GEP has at least one loop-varying operand, we are sure to |
2290 | // produce a vector of pointers unless VF is scalar. |
2291 | // The pointer operand of the new GEP. If it's loop-invariant, we |
2292 | // won't broadcast it. |
2293 | auto *Ptr = isPointerLoopInvariant() ? State.get(Def: getOperand(N: 0), Lane: VPLane(0)) |
2294 | : State.get(Def: getOperand(N: 0)); |
2295 | |
2296 | // Collect all the indices for the new GEP. If any index is |
2297 | // loop-invariant, we won't broadcast it. |
2298 | SmallVector<Value *, 4> Indices; |
2299 | for (unsigned I = 1, E = getNumOperands(); I < E; I++) { |
2300 | VPValue *Operand = getOperand(N: I); |
2301 | if (isIndexLoopInvariant(I: I - 1)) |
2302 | Indices.push_back(Elt: State.get(Def: Operand, Lane: VPLane(0))); |
2303 | else |
2304 | Indices.push_back(Elt: State.get(Def: Operand)); |
2305 | } |
2306 | |
2307 | // Create the new GEP. Note that this GEP may be a scalar if VF == 1, |
2308 | // but otherwise it should be a vector.
2309 | auto *NewGEP = State.Builder.CreateGEP(Ty: GEP->getSourceElementType(), Ptr, |
2310 | IdxList: Indices, Name: "" , NW: getGEPNoWrapFlags()); |
2311 | assert((State.VF.isScalar() || NewGEP->getType()->isVectorTy()) && |
2312 | "NewGEP is not a pointer vector" ); |
2313 | State.set(Def: this, V: NewGEP); |
2314 | } |
2315 | } |
2316 | |
2317 | #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) |
2318 | void VPWidenGEPRecipe::print(raw_ostream &O, const Twine &Indent, |
2319 | VPSlotTracker &SlotTracker) const { |
2320 | O << Indent << "WIDEN-GEP " ; |
2321 | O << (isPointerLoopInvariant() ? "Inv" : "Var" ); |
2322 | for (size_t I = 0; I < getNumOperands() - 1; ++I) |
2323 | O << "[" << (isIndexLoopInvariant(I) ? "Inv" : "Var" ) << "]" ; |
2324 | |
2325 | O << " " ; |
2326 | printAsOperand(O, SlotTracker); |
2327 | O << " = getelementptr" ; |
2328 | printFlags(O); |
2329 | printOperands(O, SlotTracker); |
2330 | } |
2331 | #endif |
2332 | |
2333 | static Type *getGEPIndexTy(bool IsScalable, bool IsReverse, bool IsUnitStride, |
2334 | unsigned CurrentPart, IRBuilderBase &Builder) { |
2335 | // Use i32 for the gep index type when the value is constant, |
2336 | // or query DataLayout for a more suitable index type otherwise. |
2337 | const DataLayout &DL = Builder.GetInsertBlock()->getDataLayout(); |
2338 | return !IsUnitStride || (IsScalable && (IsReverse || CurrentPart > 0)) |
2339 | ? DL.getIndexType(PtrTy: Builder.getPtrTy(AddrSpace: 0)) |
2340 | : Builder.getInt32Ty(); |
2341 | } |
2342 | |
2343 | void VPVectorEndPointerRecipe::execute(VPTransformState &State) { |
2344 | auto &Builder = State.Builder; |
2345 | unsigned CurrentPart = getUnrollPart(U&: *this); |
2346 | bool IsUnitStride = Stride == 1 || Stride == -1; |
2347 | Type *IndexTy = getGEPIndexTy(IsScalable: State.VF.isScalable(), /*IsReverse*/ true, |
2348 | IsUnitStride, CurrentPart, Builder); |
2349 | |
2350 | // The wide store needs to start at the last vector element. |
2351 | Value *RunTimeVF = State.get(Def: getVFValue(), Lane: VPLane(0)); |
2352 | if (IndexTy != RunTimeVF->getType()) |
2353 | RunTimeVF = Builder.CreateZExtOrTrunc(V: RunTimeVF, DestTy: IndexTy); |
2354 | // NumElt = Stride * CurrentPart * RunTimeVF |
2355 | Value *NumElt = Builder.CreateMul( |
2356 | LHS: ConstantInt::get(Ty: IndexTy, V: Stride * (int64_t)CurrentPart), RHS: RunTimeVF); |
2357 | // LastLane = Stride * (RunTimeVF - 1) |
2358 | Value *LastLane = Builder.CreateSub(LHS: RunTimeVF, RHS: ConstantInt::get(Ty: IndexTy, V: 1)); |
2359 | if (Stride != 1) |
2360 | LastLane = Builder.CreateMul(LHS: ConstantInt::get(Ty: IndexTy, V: Stride), RHS: LastLane); |
2361 | Value *Ptr = State.get(Def: getOperand(N: 0), Lane: VPLane(0)); |
2362 | Value *ResultPtr = |
2363 | Builder.CreateGEP(Ty: IndexedTy, Ptr, IdxList: NumElt, Name: "" , NW: getGEPNoWrapFlags()); |
2364 | ResultPtr = Builder.CreateGEP(Ty: IndexedTy, Ptr: ResultPtr, IdxList: LastLane, Name: "" , |
2365 | NW: getGEPNoWrapFlags()); |
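     | // Worked example (illustrative): with Stride = -1, CurrentPart = 1 and a fixed
     | // RunTimeVF of 4, NumElt = -4 and LastLane = -3, so ResultPtr ends up at
     | // Ptr - 7 elements, the lowest-addressed lane of that reversed part.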
2366 | |
2367 | State.set(Def: this, V: ResultPtr, /*IsScalar*/ true); |
2368 | } |
2369 | |
2370 | #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) |
2371 | void VPVectorEndPointerRecipe::print(raw_ostream &O, const Twine &Indent, |
2372 | VPSlotTracker &SlotTracker) const { |
2373 | O << Indent; |
2374 | printAsOperand(O, SlotTracker); |
2375 | O << " = vector-end-pointer" ; |
2376 | printFlags(O); |
2377 | printOperands(O, SlotTracker); |
2378 | } |
2379 | #endif |
2380 | |
2381 | void VPVectorPointerRecipe::execute(VPTransformState &State) { |
2382 | auto &Builder = State.Builder; |
2383 | unsigned CurrentPart = getUnrollPart(U&: *this); |
2384 | Type *IndexTy = getGEPIndexTy(IsScalable: State.VF.isScalable(), /*IsReverse*/ false, |
2385 | /*IsUnitStride*/ true, CurrentPart, Builder); |
2386 | Value *Ptr = State.get(Def: getOperand(N: 0), Lane: VPLane(0)); |
2387 | |
2388 | Value *Increment = createStepForVF(B&: Builder, Ty: IndexTy, VF: State.VF, Step: CurrentPart); |
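     | // E.g. (illustrative): for a fixed VF of 4 and CurrentPart = 1 the increment is
     | // the constant 4, so this part's pointer is Ptr + 4 elements; for scalable VF
     | // the increment is a runtime multiple of vscale.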
2389 | Value *ResultPtr = |
2390 | Builder.CreateGEP(Ty: IndexedTy, Ptr, IdxList: Increment, Name: "" , NW: getGEPNoWrapFlags()); |
2391 | |
2392 | State.set(Def: this, V: ResultPtr, /*IsScalar*/ true); |
2393 | } |
2394 | |
2395 | #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) |
2396 | void VPVectorPointerRecipe::print(raw_ostream &O, const Twine &Indent, |
2397 | VPSlotTracker &SlotTracker) const { |
2398 | O << Indent; |
2399 | printAsOperand(O, SlotTracker); |
2400 | O << " = vector-pointer " ; |
2401 | |
2402 | printOperands(O, SlotTracker); |
2403 | } |
2404 | #endif |
2405 | |
2406 | void VPBlendRecipe::execute(VPTransformState &State) { |
2407 | assert(isNormalized() && "Expected blend to be normalized!" ); |
2408 | // We know that all PHIs in non-header blocks are converted into |
2409 | // selects, so we don't have to worry about the insertion order and we |
2410 | // can just use the builder. |
2411 | // At this point we generate the predication tree. There may be |
2412 | // duplications since this is a simple recursive scan, but future |
2413 | // optimizations will clean it up. |
2414 | |
2415 | unsigned NumIncoming = getNumIncomingValues(); |
2416 | |
2417 | // Generate a sequence of selects of the form: |
2418 | // SELECT(Mask3, In3, |
2419 | // SELECT(Mask2, In2, |
2420 | // SELECT(Mask1, In1, |
2421 | // In0))) |
2422 | // Note that Mask0 is never used: lanes for which no path reaches this phi are
2423 | // essentially undef and are taken from In0.
2424 | bool OnlyFirstLaneUsed = vputils::onlyFirstLaneUsed(Def: this); |
2425 | Value *Result = nullptr; |
2426 | for (unsigned In = 0; In < NumIncoming; ++In) { |
2427 | // We might have single edge PHIs (blocks) - use an identity |
2428 | // 'select' for the first PHI operand. |
2429 | Value *In0 = State.get(Def: getIncomingValue(Idx: In), IsScalar: OnlyFirstLaneUsed); |
2430 | if (In == 0) |
2431 | Result = In0; // Initialize with the first incoming value. |
2432 | else { |
2433 | // Select between the current value and the previous incoming edge |
2434 | // based on the incoming mask. |
2435 | Value *Cond = State.get(Def: getMask(Idx: In), IsScalar: OnlyFirstLaneUsed); |
2436 | Result = State.Builder.CreateSelect(C: Cond, True: In0, False: Result, Name: "predphi" ); |
2437 | } |
2438 | } |
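     | // E.g. (illustrative, VF = 4, i32 values) the emitted chain has the shape:
     | //   %predphi1 = select <4 x i1> %mask1, <4 x i32> %in1, <4 x i32> %in0
     | //   %predphi  = select <4 x i1> %mask2, <4 x i32> %in2, <4 x i32> %predphi1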
2439 | State.set(Def: this, V: Result, IsScalar: OnlyFirstLaneUsed); |
2440 | } |
2441 | |
2442 | InstructionCost VPBlendRecipe::computeCost(ElementCount VF, |
2443 | VPCostContext &Ctx) const { |
2444 | // Handle cases where only the first lane is used the same way as the legacy |
2445 | // cost model. |
2446 | if (vputils::onlyFirstLaneUsed(Def: this)) |
2447 | return Ctx.TTI.getCFInstrCost(Opcode: Instruction::PHI, CostKind: Ctx.CostKind); |
2448 | |
2449 | Type *ResultTy = toVectorTy(Scalar: Ctx.Types.inferScalarType(V: this), EC: VF); |
2450 | Type *CmpTy = toVectorTy(Scalar: Type::getInt1Ty(C&: Ctx.Types.getContext()), EC: VF); |
2451 | return (getNumIncomingValues() - 1) * |
2452 | Ctx.TTI.getCmpSelInstrCost(Opcode: Instruction::Select, ValTy: ResultTy, CondTy: CmpTy, |
2453 | VecPred: CmpInst::BAD_ICMP_PREDICATE, CostKind: Ctx.CostKind); |
2454 | } |
2455 | |
2456 | #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) |
2457 | void VPBlendRecipe::print(raw_ostream &O, const Twine &Indent, |
2458 | VPSlotTracker &SlotTracker) const { |
2459 | O << Indent << "BLEND " ; |
2460 | printAsOperand(O, SlotTracker); |
2461 | O << " =" ; |
2462 | if (getNumIncomingValues() == 1) { |
2463 | // Not a User of any mask: not really blending, this is a |
2464 | // single-predecessor phi. |
2465 | O << " " ; |
2466 | getIncomingValue(0)->printAsOperand(O, SlotTracker); |
2467 | } else { |
2468 | for (unsigned I = 0, E = getNumIncomingValues(); I < E; ++I) { |
2469 | O << " " ; |
2470 | getIncomingValue(I)->printAsOperand(O, SlotTracker); |
2471 | if (I == 0) |
2472 | continue; |
2473 | O << "/" ; |
2474 | getMask(I)->printAsOperand(O, SlotTracker); |
2475 | } |
2476 | } |
2477 | } |
2478 | #endif |
2479 | |
2480 | void VPReductionRecipe::execute(VPTransformState &State) { |
2481 | assert(!State.Lane && "Reduction being replicated." ); |
2482 | Value *PrevInChain = State.get(Def: getChainOp(), /*IsScalar*/ true); |
2483 | RecurKind Kind = getRecurrenceKind(); |
2484 | assert(!RecurrenceDescriptor::isAnyOfRecurrenceKind(Kind) && |
2485 | "In-loop AnyOf reductions aren't currently supported" ); |
2486 | // Propagate the fast-math flags carried by the underlying instruction. |
2487 | IRBuilderBase::FastMathFlagGuard FMFGuard(State.Builder); |
2488 | State.Builder.setFastMathFlags(getFastMathFlags()); |
2489 | Value *NewVecOp = State.get(Def: getVecOp()); |
2490 | if (VPValue *Cond = getCondOp()) { |
2491 | Value *NewCond = State.get(Def: Cond, IsScalar: State.VF.isScalar()); |
2492 | VectorType *VecTy = dyn_cast<VectorType>(Val: NewVecOp->getType()); |
2493 | Type *ElementTy = VecTy ? VecTy->getElementType() : NewVecOp->getType(); |
2494 | |
2495 | Value *Start = getRecurrenceIdentity(K: Kind, Tp: ElementTy, FMF: getFastMathFlags()); |
2496 | if (State.VF.isVector()) |
2497 | Start = State.Builder.CreateVectorSplat(EC: VecTy->getElementCount(), V: Start); |
2498 | |
2499 | Value *Select = State.Builder.CreateSelect(C: NewCond, True: NewVecOp, False: Start); |
2500 | NewVecOp = Select; |
2501 | } |
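     | // E.g. (illustrative) for a conditional integer add reduction with VF = 4:
     | //   %sel = select <4 x i1> %cond, <4 x i32> %vec, <4 x i32> zeroinitializer
     | // so masked-off lanes contribute the reduction's identity value (0 for add).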
2502 | Value *NewRed; |
2503 | Value *NextInChain; |
2504 | if (IsOrdered) { |
2505 | if (State.VF.isVector()) |
2506 | NewRed = |
2507 | createOrderedReduction(B&: State.Builder, RdxKind: Kind, Src: NewVecOp, Start: PrevInChain); |
2508 | else |
2509 | NewRed = State.Builder.CreateBinOp( |
2510 | Opc: (Instruction::BinaryOps)RecurrenceDescriptor::getOpcode(Kind), |
2511 | LHS: PrevInChain, RHS: NewVecOp); |
2512 | PrevInChain = NewRed; |
2513 | NextInChain = NewRed; |
2514 | } else { |
2515 | PrevInChain = State.get(Def: getChainOp(), /*IsScalar*/ true); |
2516 | NewRed = createSimpleReduction(B&: State.Builder, Src: NewVecOp, RdxKind: Kind); |
2517 | if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) |
2518 | NextInChain = createMinMaxOp(Builder&: State.Builder, RK: Kind, Left: NewRed, Right: PrevInChain); |
2519 | else |
2520 | NextInChain = State.Builder.CreateBinOp( |
2521 | Opc: (Instruction::BinaryOps)RecurrenceDescriptor::getOpcode(Kind), LHS: NewRed, |
2522 | RHS: PrevInChain); |
2523 | } |
2524 | State.set(Def: this, V: NextInChain, /*IsScalar*/ true); |
2525 | } |
2526 | |
2527 | void VPReductionEVLRecipe::execute(VPTransformState &State) { |
2528 | assert(!State.Lane && "Reduction being replicated." ); |
2529 | |
2530 | auto &Builder = State.Builder; |
2531 | // Propagate the fast-math flags carried by the underlying instruction. |
2532 | IRBuilderBase::FastMathFlagGuard FMFGuard(Builder); |
2533 | Builder.setFastMathFlags(getFastMathFlags()); |
2534 | |
2535 | RecurKind Kind = getRecurrenceKind(); |
2536 | Value *Prev = State.get(Def: getChainOp(), /*IsScalar*/ true); |
2537 | Value *VecOp = State.get(Def: getVecOp()); |
2538 | Value *EVL = State.get(Def: getEVL(), Lane: VPLane(0)); |
2539 | |
2540 | Value *Mask; |
2541 | if (VPValue *CondOp = getCondOp()) |
2542 | Mask = State.get(Def: CondOp); |
2543 | else |
2544 | Mask = Builder.CreateVectorSplat(EC: State.VF, V: Builder.getTrue()); |
2545 | |
2546 | Value *NewRed; |
2547 | if (isOrdered()) { |
2548 | NewRed = createOrderedReduction(B&: Builder, RdxKind: Kind, Src: VecOp, Start: Prev, Mask, EVL); |
2549 | } else { |
2550 | NewRed = createSimpleReduction(B&: Builder, Src: VecOp, RdxKind: Kind, Mask, EVL); |
2551 | if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) |
2552 | NewRed = createMinMaxOp(Builder, RK: Kind, Left: NewRed, Right: Prev); |
2553 | else |
2554 | NewRed = Builder.CreateBinOp( |
2555 | Opc: (Instruction::BinaryOps)RecurrenceDescriptor::getOpcode(Kind), LHS: NewRed, |
2556 | RHS: Prev); |
2557 | } |
2558 | State.set(Def: this, V: NewRed, /*IsScalar*/ true); |
2559 | } |
2560 | |
2561 | InstructionCost VPReductionRecipe::computeCost(ElementCount VF, |
2562 | VPCostContext &Ctx) const { |
2563 | RecurKind RdxKind = getRecurrenceKind(); |
2564 | Type *ElementTy = Ctx.Types.inferScalarType(V: this); |
2565 | auto *VectorTy = cast<VectorType>(Val: toVectorTy(Scalar: ElementTy, EC: VF)); |
2566 | unsigned Opcode = RecurrenceDescriptor::getOpcode(Kind: RdxKind); |
2567 | FastMathFlags FMFs = getFastMathFlags(); |
2568 | std::optional<FastMathFlags> OptionalFMF = |
2569 | ElementTy->isFloatingPointTy() ? std::make_optional(t&: FMFs) : std::nullopt; |
2570 | |
2571 | // TODO: Support any-of reductions. |
2572 | assert( |
2573 | (!RecurrenceDescriptor::isAnyOfRecurrenceKind(RdxKind) || |
2574 | ForceTargetInstructionCost.getNumOccurrences() > 0) && |
2575 | "Any-of reduction not implemented in VPlan-based cost model currently." ); |
2576 | |
  // Note that TTI is expected to model the cost of moving the result to a
  // scalar register and the cost of the BinOp in getMinMaxReductionCost().
2579 | if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind: RdxKind)) { |
2580 | Intrinsic::ID Id = getMinMaxReductionIntrinsicOp(RK: RdxKind); |
2581 | return Ctx.TTI.getMinMaxReductionCost(IID: Id, Ty: VectorTy, FMF: FMFs, CostKind: Ctx.CostKind); |
2582 | } |
2583 | |
  // Note that TTI is expected to model the cost of moving the result to a
  // scalar register and the cost of the BinOp in getArithmeticReductionCost().
2586 | return Ctx.TTI.getArithmeticReductionCost(Opcode, Ty: VectorTy, FMF: OptionalFMF, |
2587 | CostKind: Ctx.CostKind); |
2588 | } |
2589 | |
2590 | VPExpressionRecipe::VPExpressionRecipe( |
2591 | ExpressionTypes ExpressionType, |
2592 | ArrayRef<VPSingleDefRecipe *> ExpressionRecipes) |
2593 | : VPSingleDefRecipe(VPDef::VPExpressionSC, {}, {}), |
2594 | ExpressionRecipes(SetVector<VPSingleDefRecipe *>( |
2595 | ExpressionRecipes.begin(), ExpressionRecipes.end()) |
2596 | .takeVector()), |
2597 | ExpressionType(ExpressionType) { |
2598 | assert(!ExpressionRecipes.empty() && "Nothing to combine?" ); |
2599 | assert( |
2600 | none_of(ExpressionRecipes, |
2601 | [](VPSingleDefRecipe *R) { return R->mayHaveSideEffects(); }) && |
2602 | "expression cannot contain recipes with side-effects" ); |
2603 | |
2604 | // Maintain a copy of the expression recipes as a set of users. |
2605 | SmallPtrSet<VPUser *, 4> ExpressionRecipesAsSetOfUsers; |
2606 | for (auto *R : ExpressionRecipes) |
2607 | ExpressionRecipesAsSetOfUsers.insert(Ptr: R); |
2608 | |
2609 | // Recipes in the expression, except the last one, must only be used by |
2610 | // (other) recipes inside the expression. If there are other users, external |
2611 | // to the expression, use a clone of the recipe for external users. |
2612 | for (VPSingleDefRecipe *R : ExpressionRecipes) { |
2613 | if (R != ExpressionRecipes.back() && |
2614 | any_of(Range: R->users(), P: [&ExpressionRecipesAsSetOfUsers](VPUser *U) { |
2615 | return !ExpressionRecipesAsSetOfUsers.contains(Ptr: U); |
2616 | })) { |
      // There are users outside of the expression. Clone the recipe and use the
      // clone for those external users.
2619 | VPSingleDefRecipe *CopyForExtUsers = R->clone(); |
2620 | R->replaceUsesWithIf(New: CopyForExtUsers, ShouldReplace: [&ExpressionRecipesAsSetOfUsers]( |
2621 | VPUser &U, unsigned) { |
2622 | return !ExpressionRecipesAsSetOfUsers.contains(Ptr: &U); |
2623 | }); |
2624 | CopyForExtUsers->insertBefore(InsertPos: R); |
2625 | } |
2626 | if (R->getParent()) |
2627 | R->removeFromParent(); |
2628 | } |
2629 | |
2630 | // Internalize all external operands to the expression recipes. To do so, |
2631 | // create new temporary VPValues for all operands defined by a recipe outside |
2632 | // the expression. The original operands are added as operands of the |
2633 | // VPExpressionRecipe itself. |
2634 | for (auto *R : ExpressionRecipes) { |
2635 | for (const auto &[Idx, Op] : enumerate(First: R->operands())) { |
2636 | auto *Def = Op->getDefiningRecipe(); |
2637 | if (Def && ExpressionRecipesAsSetOfUsers.contains(Ptr: Def)) |
2638 | continue; |
2639 | addOperand(Operand: Op); |
2640 | LiveInPlaceholders.push_back(Elt: new VPValue()); |
2641 | R->setOperand(I: Idx, New: LiveInPlaceholders.back()); |
2642 | } |
2643 | } |
2644 | } |
2645 | |
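// Unbundle the expression: re-insert the contained recipes before this recipe,
// hook their temporary live-in placeholders back up to the original operands,
// and forward all users of the expression to its final recipe.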
2646 | void VPExpressionRecipe::decompose() { |
2647 | for (auto *R : ExpressionRecipes) |
2648 | R->insertBefore(InsertPos: this); |
2649 | |
2650 | for (const auto &[Idx, Op] : enumerate(First: operands())) |
2651 | LiveInPlaceholders[Idx]->replaceAllUsesWith(New: Op); |
2652 | |
2653 | replaceAllUsesWith(New: ExpressionRecipes.back()); |
2654 | ExpressionRecipes.clear(); |
2655 | } |
2656 | |
2657 | InstructionCost VPExpressionRecipe::computeCost(ElementCount VF, |
2658 | VPCostContext &Ctx) const { |
2659 | Type *RedTy = Ctx.Types.inferScalarType(V: this); |
2660 | auto *SrcVecTy = cast<VectorType>( |
2661 | Val: toVectorTy(Scalar: Ctx.Types.inferScalarType(V: getOperand(N: 0)), EC: VF)); |
2662 | assert(RedTy->isIntegerTy() && |
2663 | "VPExpressionRecipe only supports integer types currently." ); |
2664 | switch (ExpressionType) { |
2665 | case ExpressionTypes::ExtendedReduction: { |
2666 | unsigned Opcode = RecurrenceDescriptor::getOpcode( |
2667 | Kind: cast<VPReductionRecipe>(Val: ExpressionRecipes[1])->getRecurrenceKind()); |
2668 | return Ctx.TTI.getExtendedReductionCost( |
2669 | Opcode, |
2670 | IsUnsigned: cast<VPWidenCastRecipe>(Val: ExpressionRecipes.front())->getOpcode() == |
2671 | Instruction::ZExt, |
2672 | ResTy: RedTy, Ty: SrcVecTy, FMF: std::nullopt, CostKind: Ctx.CostKind); |
2673 | } |
2674 | case ExpressionTypes::MulAccReduction: |
2675 | return Ctx.TTI.getMulAccReductionCost(IsUnsigned: false, ResTy: RedTy, Ty: SrcVecTy, CostKind: Ctx.CostKind); |
2676 | |
2677 | case ExpressionTypes::ExtMulAccReduction: |
2678 | return Ctx.TTI.getMulAccReductionCost( |
2679 | IsUnsigned: cast<VPWidenCastRecipe>(Val: ExpressionRecipes.front())->getOpcode() == |
2680 | Instruction::ZExt, |
2681 | ResTy: RedTy, Ty: SrcVecTy, CostKind: Ctx.CostKind); |
2682 | } |
2683 | llvm_unreachable("Unknown VPExpressionRecipe::ExpressionTypes enum" ); |
2684 | } |
2685 | |
2686 | bool VPExpressionRecipe::mayReadOrWriteMemory() const { |
2687 | return any_of(Range: ExpressionRecipes, P: [](VPSingleDefRecipe *R) { |
2688 | return R->mayReadFromMemory() || R->mayWriteToMemory(); |
2689 | }); |
2690 | } |
2691 | |
2692 | bool VPExpressionRecipe::mayHaveSideEffects() const { |
2693 | assert( |
2694 | none_of(ExpressionRecipes, |
2695 | [](VPSingleDefRecipe *R) { return R->mayHaveSideEffects(); }) && |
2696 | "expression cannot contain recipes with side-effects" ); |
2697 | return false; |
2698 | } |
2699 | |
2700 | #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) |
2701 | |
2702 | void VPExpressionRecipe::print(raw_ostream &O, const Twine &Indent, |
2703 | VPSlotTracker &SlotTracker) const { |
2704 | O << Indent << "EXPRESSION " ; |
2705 | printAsOperand(O, SlotTracker); |
2706 | O << " = " ; |
2707 | auto *Red = cast<VPReductionRecipe>(ExpressionRecipes.back()); |
2708 | unsigned Opcode = RecurrenceDescriptor::getOpcode(Red->getRecurrenceKind()); |
2709 | |
2710 | switch (ExpressionType) { |
2711 | case ExpressionTypes::ExtendedReduction: { |
2712 | getOperand(1)->printAsOperand(O, SlotTracker); |
2713 | O << " +" ; |
2714 | O << " reduce." << Instruction::getOpcodeName(Opcode) << " (" ; |
2715 | getOperand(0)->printAsOperand(O, SlotTracker); |
2716 | Red->printFlags(O); |
2717 | |
2718 | auto *Ext0 = cast<VPWidenCastRecipe>(ExpressionRecipes[0]); |
2719 | O << Instruction::getOpcodeName(Ext0->getOpcode()) << " to " |
2720 | << *Ext0->getResultType(); |
2721 | if (Red->isConditional()) { |
2722 | O << ", " ; |
2723 | Red->getCondOp()->printAsOperand(O, SlotTracker); |
2724 | } |
2725 | O << ")" ; |
2726 | break; |
2727 | } |
2728 | case ExpressionTypes::MulAccReduction: |
2729 | case ExpressionTypes::ExtMulAccReduction: { |
2730 | getOperand(getNumOperands() - 1)->printAsOperand(O, SlotTracker); |
2731 | O << " + " ; |
2732 | O << "reduce." |
2733 | << Instruction::getOpcodeName( |
2734 | RecurrenceDescriptor::getOpcode(Red->getRecurrenceKind())) |
2735 | << " (" ; |
2736 | O << "mul" ; |
2737 | bool IsExtended = ExpressionType == ExpressionTypes::ExtMulAccReduction; |
2738 | auto *Mul = cast<VPWidenRecipe>(IsExtended ? ExpressionRecipes[2] |
2739 | : ExpressionRecipes[0]); |
2740 | Mul->printFlags(O); |
2741 | if (IsExtended) |
2742 | O << "(" ; |
2743 | getOperand(0)->printAsOperand(O, SlotTracker); |
2744 | if (IsExtended) { |
2745 | auto *Ext0 = cast<VPWidenCastRecipe>(ExpressionRecipes[0]); |
2746 | O << " " << Instruction::getOpcodeName(Ext0->getOpcode()) << " to " |
2747 | << *Ext0->getResultType() << "), (" ; |
2748 | } else { |
2749 | O << ", " ; |
2750 | } |
2751 | getOperand(1)->printAsOperand(O, SlotTracker); |
2752 | if (IsExtended) { |
2753 | auto *Ext1 = cast<VPWidenCastRecipe>(ExpressionRecipes[1]); |
2754 | O << " " << Instruction::getOpcodeName(Ext1->getOpcode()) << " to " |
2755 | << *Ext1->getResultType() << ")" ; |
2756 | } |
2757 | if (Red->isConditional()) { |
2758 | O << ", " ; |
2759 | Red->getCondOp()->printAsOperand(O, SlotTracker); |
2760 | } |
2761 | O << ")" ; |
2762 | break; |
2763 | } |
2764 | } |
2765 | } |
2766 | |
2767 | void VPReductionRecipe::print(raw_ostream &O, const Twine &Indent, |
2768 | VPSlotTracker &SlotTracker) const { |
2769 | O << Indent << "REDUCE " ; |
2770 | printAsOperand(O, SlotTracker); |
2771 | O << " = " ; |
2772 | getChainOp()->printAsOperand(O, SlotTracker); |
2773 | O << " +" ; |
2774 | printFlags(O); |
2775 | O << " reduce." |
2776 | << Instruction::getOpcodeName( |
2777 | RecurrenceDescriptor::getOpcode(getRecurrenceKind())) |
2778 | << " (" ; |
2779 | getVecOp()->printAsOperand(O, SlotTracker); |
2780 | if (isConditional()) { |
2781 | O << ", " ; |
2782 | getCondOp()->printAsOperand(O, SlotTracker); |
2783 | } |
2784 | O << ")" ; |
2785 | } |
2786 | |
2787 | void VPReductionEVLRecipe::print(raw_ostream &O, const Twine &Indent, |
2788 | VPSlotTracker &SlotTracker) const { |
2789 | O << Indent << "REDUCE " ; |
2790 | printAsOperand(O, SlotTracker); |
2791 | O << " = " ; |
2792 | getChainOp()->printAsOperand(O, SlotTracker); |
2793 | O << " +" ; |
2794 | printFlags(O); |
2795 | O << " vp.reduce." |
2796 | << Instruction::getOpcodeName( |
2797 | RecurrenceDescriptor::getOpcode(getRecurrenceKind())) |
2798 | << " (" ; |
2799 | getVecOp()->printAsOperand(O, SlotTracker); |
2800 | O << ", " ; |
2801 | getEVL()->printAsOperand(O, SlotTracker); |
2802 | if (isConditional()) { |
2803 | O << ", " ; |
2804 | getCondOp()->printAsOperand(O, SlotTracker); |
2805 | } |
2806 | O << ")" ; |
2807 | } |
2808 | |
2809 | #endif |
2810 | |
2811 | /// A helper function to scalarize a single Instruction in the innermost loop. |
2812 | /// Generates a sequence of scalar instances for lane \p Lane. Uses the VPValue |
2813 | /// operands from \p RepRecipe instead of \p Instr's operands. |
2814 | static void scalarizeInstruction(const Instruction *Instr, |
2815 | VPReplicateRecipe *RepRecipe, |
2816 | const VPLane &Lane, VPTransformState &State) { |
2817 | assert((!Instr->getType()->isAggregateType() || |
2818 | canVectorizeTy(Instr->getType())) && |
2819 | "Expected vectorizable or non-aggregate type." ); |
2820 | |
  // Does this instruction return a value?
2822 | bool IsVoidRetTy = Instr->getType()->isVoidTy(); |
2823 | |
2824 | Instruction *Cloned = Instr->clone(); |
2825 | if (!IsVoidRetTy) { |
2826 | Cloned->setName(Instr->getName() + ".cloned" ); |
2827 | #if !defined(NDEBUG) |
2828 | // Verify that VPlan type inference results agree with the type of the |
2829 | // generated values. |
2830 | assert(State.TypeAnalysis.inferScalarType(RepRecipe) == Cloned->getType() && |
2831 | "inferred type and type from generated instructions do not match" ); |
2832 | #endif |
2833 | } |
2834 | |
2835 | RepRecipe->applyFlags(I&: *Cloned); |
2836 | RepRecipe->applyMetadata(I&: *Cloned); |
2837 | |
2838 | if (auto DL = RepRecipe->getDebugLoc()) |
2839 | State.setDebugLocFrom(DL); |
2840 | |
2841 | // Replace the operands of the cloned instructions with their scalar |
2842 | // equivalents in the new loop. |
2843 | for (const auto &I : enumerate(First: RepRecipe->operands())) { |
2844 | auto InputLane = Lane; |
2845 | VPValue *Operand = I.value(); |
2846 | if (vputils::isSingleScalar(VPV: Operand)) |
2847 | InputLane = VPLane::getFirstLane(); |
2848 | Cloned->setOperand(i: I.index(), Val: State.get(Def: Operand, Lane: InputLane)); |
2849 | } |
2850 | |
2851 | // Place the cloned scalar in the new loop. |
2852 | State.Builder.Insert(I: Cloned); |
2853 | |
2854 | State.set(Def: RepRecipe, V: Cloned, Lane); |
2855 | |
  // If we just cloned a new assumption, add it to the assumption cache.
2857 | if (auto *II = dyn_cast<AssumeInst>(Val: Cloned)) |
2858 | State.AC->registerAssumption(CI: II); |
2859 | |
2860 | assert( |
2861 | (RepRecipe->getParent()->getParent() || |
2862 | !RepRecipe->getParent()->getPlan()->getVectorLoopRegion() || |
2863 | all_of(RepRecipe->operands(), |
2864 | [](VPValue *Op) { return Op->isDefinedOutsideLoopRegions(); })) && |
2865 | "Expected a recipe is either within a region or all of its operands " |
2866 | "are defined outside the vectorized region." ); |
2867 | } |
2868 | |
2869 | void VPReplicateRecipe::execute(VPTransformState &State) { |
2870 | Instruction *UI = getUnderlyingInstr(); |
2871 | |
2872 | if (!State.Lane) { |
2873 | assert(IsSingleScalar && "VPReplicateRecipes outside replicate regions " |
2874 | "must have already been unrolled" ); |
2875 | scalarizeInstruction(Instr: UI, RepRecipe: this, Lane: VPLane(0), State); |
2876 | return; |
2877 | } |
2878 | |
2879 | assert((State.VF.isScalar() || !isSingleScalar()) && |
2880 | "uniform recipe shouldn't be predicated" ); |
2881 | assert(!State.VF.isScalable() && "Can't scalarize a scalable vector" ); |
2882 | scalarizeInstruction(Instr: UI, RepRecipe: this, Lane: *State.Lane, State); |
  // Insert the scalar instance, packing it into a vector.
2884 | if (State.VF.isVector() && shouldPack()) { |
2885 | Value *WideValue = |
2886 | State.Lane->isFirstLane() |
2887 | ? PoisonValue::get(T: VectorType::get(ElementType: UI->getType(), EC: State.VF)) |
2888 | : State.get(Def: this); |
2889 | State.set(Def: this, V: State.packScalarIntoVectorizedValue(Def: this, WideValue, |
2890 | Lane: *State.Lane)); |
2891 | } |
2892 | } |
2893 | |
2894 | bool VPReplicateRecipe::shouldPack() const { |
  // Check whether the recipe is used by a widened recipe via an intervening
  // VPPredInstPHIRecipe. In this case, also pack the scalar values in a vector.
2897 | return any_of(Range: users(), P: [](const VPUser *U) { |
2898 | if (auto *PredR = dyn_cast<VPPredInstPHIRecipe>(Val: U)) |
2899 | return any_of(Range: PredR->users(), P: [PredR](const VPUser *U) { |
2900 | return !U->usesScalars(Op: PredR); |
2901 | }); |
2902 | return false; |
2903 | }); |
2904 | } |
2905 | |
2906 | InstructionCost VPReplicateRecipe::computeCost(ElementCount VF, |
2907 | VPCostContext &Ctx) const { |
2908 | Instruction *UI = cast<Instruction>(Val: getUnderlyingValue()); |
  // VPReplicateRecipe may be cloned as part of an existing VPlan-to-VPlan
  // transform; avoid computing its cost multiple times for now.
2911 | Ctx.SkipCostComputation.insert(Ptr: UI); |
2912 | |
2913 | TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; |
2914 | Type *ResultTy = Ctx.Types.inferScalarType(V: this); |
2915 | switch (UI->getOpcode()) { |
2916 | case Instruction::GetElementPtr: |
2917 | // We mark this instruction as zero-cost because the cost of GEPs in |
2918 | // vectorized code depends on whether the corresponding memory instruction |
2919 | // is scalarized or not. Therefore, we handle GEPs with the memory |
2920 | // instruction cost. |
2921 | return 0; |
2922 | case Instruction::Add: |
2923 | case Instruction::Sub: |
2924 | case Instruction::FAdd: |
2925 | case Instruction::FSub: |
2926 | case Instruction::Mul: |
2927 | case Instruction::FMul: |
2928 | case Instruction::FDiv: |
2929 | case Instruction::FRem: |
2930 | case Instruction::Shl: |
2931 | case Instruction::LShr: |
2932 | case Instruction::AShr: |
2933 | case Instruction::And: |
2934 | case Instruction::Or: |
2935 | case Instruction::Xor: { |
2936 | auto Op2Info = Ctx.getOperandInfo(V: getOperand(N: 1)); |
2937 | SmallVector<const Value *, 4> Operands(UI->operand_values()); |
2938 | return Ctx.TTI.getArithmeticInstrCost( |
2939 | Opcode: UI->getOpcode(), Ty: ResultTy, CostKind, |
2940 | Opd1Info: {.Kind: TargetTransformInfo::OK_AnyValue, .Properties: TargetTransformInfo::OP_None}, |
2941 | Opd2Info: Op2Info, Args: Operands, CxtI: UI, TLibInfo: &Ctx.TLI) * |
2942 | (isSingleScalar() ? 1 : VF.getFixedValue()); |
2943 | } |
2944 | } |
2945 | |
2946 | return Ctx.getLegacyCost(UI, VF); |
2947 | } |
2948 | |
2949 | #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) |
2950 | void VPReplicateRecipe::print(raw_ostream &O, const Twine &Indent, |
2951 | VPSlotTracker &SlotTracker) const { |
2952 | O << Indent << (IsSingleScalar ? "CLONE " : "REPLICATE " ); |
2953 | |
2954 | if (!getUnderlyingInstr()->getType()->isVoidTy()) { |
2955 | printAsOperand(O, SlotTracker); |
2956 | O << " = " ; |
2957 | } |
2958 | if (auto *CB = dyn_cast<CallBase>(getUnderlyingInstr())) { |
2959 | O << "call" ; |
2960 | printFlags(O); |
2961 | O << "@" << CB->getCalledFunction()->getName() << "(" ; |
2962 | interleaveComma(make_range(op_begin(), op_begin() + (getNumOperands() - 1)), |
2963 | O, [&O, &SlotTracker](VPValue *Op) { |
2964 | Op->printAsOperand(O, SlotTracker); |
2965 | }); |
2966 | O << ")" ; |
2967 | } else { |
2968 | O << Instruction::getOpcodeName(getUnderlyingInstr()->getOpcode()); |
2969 | printFlags(O); |
2970 | printOperands(O, SlotTracker); |
2971 | } |
2972 | |
2973 | if (shouldPack()) |
2974 | O << " (S->V)" ; |
2975 | } |
2976 | #endif |
2977 | |
2978 | void VPBranchOnMaskRecipe::execute(VPTransformState &State) { |
2979 | assert(State.Lane && "Branch on Mask works only on single instance." ); |
2980 | |
2981 | VPValue *BlockInMask = getOperand(N: 0); |
2982 | Value *ConditionBit = State.get(Def: BlockInMask, Lane: *State.Lane); |
2983 | |
2984 | // Replace the temporary unreachable terminator with a new conditional branch, |
2985 | // whose two destinations will be set later when they are created. |
2986 | auto *CurrentTerminator = State.CFG.PrevBB->getTerminator(); |
2987 | assert(isa<UnreachableInst>(CurrentTerminator) && |
2988 | "Expected to replace unreachable terminator with conditional branch." ); |
2989 | auto CondBr = |
2990 | State.Builder.CreateCondBr(Cond: ConditionBit, True: State.CFG.PrevBB, False: nullptr); |
2991 | CondBr->setSuccessor(idx: 0, NewSucc: nullptr); |
2992 | CurrentTerminator->eraseFromParent(); |
2993 | } |
2994 | |
2995 | InstructionCost VPBranchOnMaskRecipe::computeCost(ElementCount VF, |
2996 | VPCostContext &Ctx) const { |
2997 | // The legacy cost model doesn't assign costs to branches for individual |
2998 | // replicate regions. Match the current behavior in the VPlan cost model for |
2999 | // now. |
3000 | return 0; |
3001 | } |
3002 | |
3003 | void VPPredInstPHIRecipe::execute(VPTransformState &State) { |
3004 | assert(State.Lane && "Predicated instruction PHI works per instance." ); |
3005 | Instruction *ScalarPredInst = |
3006 | cast<Instruction>(Val: State.get(Def: getOperand(N: 0), Lane: *State.Lane)); |
3007 | BasicBlock *PredicatedBB = ScalarPredInst->getParent(); |
3008 | BasicBlock *PredicatingBB = PredicatedBB->getSinglePredecessor(); |
3009 | assert(PredicatingBB && "Predicated block has no single predecessor." ); |
3010 | assert(isa<VPReplicateRecipe>(getOperand(0)) && |
3011 | "operand must be VPReplicateRecipe" ); |
3012 | |
3013 | // By current pack/unpack logic we need to generate only a single phi node: if |
3014 | // a vector value for the predicated instruction exists at this point it means |
3015 | // the instruction has vector users only, and a phi for the vector value is |
3016 | // needed. In this case the recipe of the predicated instruction is marked to |
3017 | // also do that packing, thereby "hoisting" the insert-element sequence. |
3018 | // Otherwise, a phi node for the scalar value is needed. |
3019 | if (State.hasVectorValue(Def: getOperand(N: 0))) { |
3020 | Value *VectorValue = State.get(Def: getOperand(N: 0)); |
3021 | InsertElementInst *IEI = cast<InsertElementInst>(Val: VectorValue); |
3022 | PHINode *VPhi = State.Builder.CreatePHI(Ty: IEI->getType(), NumReservedValues: 2); |
3023 | VPhi->addIncoming(V: IEI->getOperand(i_nocapture: 0), BB: PredicatingBB); // Unmodified vector. |
3024 | VPhi->addIncoming(V: IEI, BB: PredicatedBB); // New vector with inserted element. |
3025 | if (State.hasVectorValue(Def: this)) |
3026 | State.reset(Def: this, V: VPhi); |
3027 | else |
3028 | State.set(Def: this, V: VPhi); |
3029 | // NOTE: Currently we need to update the value of the operand, so the next |
3030 | // predicated iteration inserts its generated value in the correct vector. |
3031 | State.reset(Def: getOperand(N: 0), V: VPhi); |
3032 | } else { |
3033 | if (vputils::onlyFirstLaneUsed(Def: this) && !State.Lane->isFirstLane()) |
3034 | return; |
3035 | |
3036 | Type *PredInstType = State.TypeAnalysis.inferScalarType(V: getOperand(N: 0)); |
3037 | PHINode *Phi = State.Builder.CreatePHI(Ty: PredInstType, NumReservedValues: 2); |
3038 | Phi->addIncoming(V: PoisonValue::get(T: ScalarPredInst->getType()), |
3039 | BB: PredicatingBB); |
3040 | Phi->addIncoming(V: ScalarPredInst, BB: PredicatedBB); |
3041 | if (State.hasScalarValue(Def: this, Lane: *State.Lane)) |
3042 | State.reset(Def: this, V: Phi, Lane: *State.Lane); |
3043 | else |
3044 | State.set(Def: this, V: Phi, Lane: *State.Lane); |
3045 | // NOTE: Currently we need to update the value of the operand, so the next |
3046 | // predicated iteration inserts its generated value in the correct vector. |
3047 | State.reset(Def: getOperand(N: 0), V: Phi, Lane: *State.Lane); |
3048 | } |
3049 | } |
3050 | |
3051 | #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) |
3052 | void VPPredInstPHIRecipe::print(raw_ostream &O, const Twine &Indent, |
3053 | VPSlotTracker &SlotTracker) const { |
3054 | O << Indent << "PHI-PREDICATED-INSTRUCTION " ; |
3055 | printAsOperand(O, SlotTracker); |
3056 | O << " = " ; |
3057 | printOperands(O, SlotTracker); |
3058 | } |
3059 | #endif |
3060 | |
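// Compute the cost of a widened memory recipe: a gather/scatter cost for
// non-consecutive accesses, otherwise a (masked) memory op cost, plus a
// reverse shuffle when the access is reversed.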
3061 | InstructionCost VPWidenMemoryRecipe::computeCost(ElementCount VF, |
3062 | VPCostContext &Ctx) const { |
3063 | Type *Ty = toVectorTy(Scalar: getLoadStoreType(I: &Ingredient), EC: VF); |
3064 | const Align Alignment = |
3065 | getLoadStoreAlignment(I: const_cast<Instruction *>(&Ingredient)); |
3066 | unsigned AS = cast<PointerType>(Val: Ctx.Types.inferScalarType(V: getAddr())) |
3067 | ->getAddressSpace(); |
3068 | unsigned Opcode = isa<VPWidenLoadRecipe, VPWidenLoadEVLRecipe>(Val: this) |
3069 | ? Instruction::Load |
3070 | : Instruction::Store; |
3071 | |
3072 | if (!Consecutive) { |
    // TODO: Using the original IR may not be accurate.
    // Currently, the ARM backend uses the underlying IR to calculate the
    // gather/scatter instruction cost.
3076 | const Value *Ptr = getLoadStorePointerOperand(V: &Ingredient); |
    assert(!Reverse &&
           "Non-consecutive memory accesses should not be reversed.");
3079 | return Ctx.TTI.getAddressComputationCost(Ty) + |
3080 | Ctx.TTI.getGatherScatterOpCost(Opcode, DataTy: Ty, Ptr, VariableMask: IsMasked, Alignment, |
3081 | CostKind: Ctx.CostKind, I: &Ingredient); |
3082 | } |
3083 | |
3084 | InstructionCost Cost = 0; |
3085 | if (IsMasked) { |
3086 | Cost += |
3087 | Ctx.TTI.getMaskedMemoryOpCost(Opcode, Src: Ty, Alignment, AddressSpace: AS, CostKind: Ctx.CostKind); |
3088 | } else { |
3089 | TTI::OperandValueInfo OpInfo = Ctx.getOperandInfo( |
3090 | V: isa<VPWidenLoadRecipe, VPWidenLoadEVLRecipe>(Val: this) ? getOperand(N: 0) |
3091 | : getOperand(N: 1)); |
3092 | Cost += Ctx.TTI.getMemoryOpCost(Opcode, Src: Ty, Alignment, AddressSpace: AS, CostKind: Ctx.CostKind, |
3093 | OpdInfo: OpInfo, I: &Ingredient); |
3094 | } |
3095 | if (!Reverse) |
3096 | return Cost; |
3097 | |
  return Cost + Ctx.TTI.getShuffleCost(
                    Kind: TargetTransformInfo::SK_Reverse, DstTy: cast<VectorType>(Val: Ty),
                    SrcTy: cast<VectorType>(Val: Ty), Mask: {}, CostKind: Ctx.CostKind, Index: 0);
3101 | } |
3102 | |
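// Generate a wide load for this recipe: a masked gather for non-consecutive
// addresses, a masked load when a mask is present, or a plain aligned load
// otherwise. For reverse accesses both the mask and the loaded value are
// reversed.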
3103 | void VPWidenLoadRecipe::execute(VPTransformState &State) { |
3104 | Type *ScalarDataTy = getLoadStoreType(I: &Ingredient); |
3105 | auto *DataTy = VectorType::get(ElementType: ScalarDataTy, EC: State.VF); |
3106 | const Align Alignment = getLoadStoreAlignment(I: &Ingredient); |
3107 | bool CreateGather = !isConsecutive(); |
3108 | |
3109 | auto &Builder = State.Builder; |
3110 | Value *Mask = nullptr; |
3111 | if (auto *VPMask = getMask()) { |
    // Mask reversal is only needed for real (non-all-one) masks; the reverse
    // of an implicit all-one (null) mask is still an all-one mask.
3114 | Mask = State.get(Def: VPMask); |
3115 | if (isReverse()) |
3116 | Mask = Builder.CreateVectorReverse(V: Mask, Name: "reverse" ); |
3117 | } |
3118 | |
3119 | Value *Addr = State.get(Def: getAddr(), /*IsScalar*/ !CreateGather); |
3120 | Value *NewLI; |
3121 | if (CreateGather) { |
3122 | NewLI = Builder.CreateMaskedGather(Ty: DataTy, Ptrs: Addr, Alignment, Mask, PassThru: nullptr, |
3123 | Name: "wide.masked.gather" ); |
3124 | } else if (Mask) { |
3125 | NewLI = |
3126 | Builder.CreateMaskedLoad(Ty: DataTy, Ptr: Addr, Alignment, Mask, |
3127 | PassThru: PoisonValue::get(T: DataTy), Name: "wide.masked.load" ); |
3128 | } else { |
3129 | NewLI = Builder.CreateAlignedLoad(Ty: DataTy, Ptr: Addr, Align: Alignment, Name: "wide.load" ); |
3130 | } |
3131 | applyMetadata(I&: *cast<Instruction>(Val: NewLI)); |
3132 | if (Reverse) |
3133 | NewLI = Builder.CreateVectorReverse(V: NewLI, Name: "reverse" ); |
3134 | State.set(Def: this, V: NewLI); |
3135 | } |
3136 | |
3137 | #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) |
3138 | void VPWidenLoadRecipe::print(raw_ostream &O, const Twine &Indent, |
3139 | VPSlotTracker &SlotTracker) const { |
3140 | O << Indent << "WIDEN " ; |
3141 | printAsOperand(O, SlotTracker); |
3142 | O << " = load " ; |
3143 | printOperands(O, SlotTracker); |
3144 | } |
3145 | #endif |
3146 | |
/// Use an all-true mask for the reverse rather than the actual mask, as this
/// avoids a dependence without affecting the result.
3149 | static Instruction *createReverseEVL(IRBuilderBase &Builder, Value *Operand, |
3150 | Value *EVL, const Twine &Name) { |
3151 | VectorType *ValTy = cast<VectorType>(Val: Operand->getType()); |
3152 | Value *AllTrueMask = |
3153 | Builder.CreateVectorSplat(EC: ValTy->getElementCount(), V: Builder.getTrue()); |
3154 | return Builder.CreateIntrinsic(RetTy: ValTy, ID: Intrinsic::experimental_vp_reverse, |
3155 | Args: {Operand, AllTrueMask, EVL}, FMFSource: nullptr, Name); |
3156 | } |
3157 | |
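// Generate a VP load for this recipe, using the explicit vector length to
// limit the number of active lanes: vp.gather for non-consecutive addresses
// and vp.load otherwise, with the mask and result reversed via
// experimental_vp_reverse for reverse accesses.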
3158 | void VPWidenLoadEVLRecipe::execute(VPTransformState &State) { |
3159 | Type *ScalarDataTy = getLoadStoreType(I: &Ingredient); |
3160 | auto *DataTy = VectorType::get(ElementType: ScalarDataTy, EC: State.VF); |
3161 | const Align Alignment = getLoadStoreAlignment(I: &Ingredient); |
3162 | bool CreateGather = !isConsecutive(); |
3163 | |
3164 | auto &Builder = State.Builder; |
3165 | CallInst *NewLI; |
3166 | Value *EVL = State.get(Def: getEVL(), Lane: VPLane(0)); |
3167 | Value *Addr = State.get(Def: getAddr(), IsScalar: !CreateGather); |
3168 | Value *Mask = nullptr; |
3169 | if (VPValue *VPMask = getMask()) { |
3170 | Mask = State.get(Def: VPMask); |
3171 | if (isReverse()) |
3172 | Mask = createReverseEVL(Builder, Operand: Mask, EVL, Name: "vp.reverse.mask" ); |
3173 | } else { |
3174 | Mask = Builder.CreateVectorSplat(EC: State.VF, V: Builder.getTrue()); |
3175 | } |
3176 | |
3177 | if (CreateGather) { |
3178 | NewLI = |
3179 | Builder.CreateIntrinsic(RetTy: DataTy, ID: Intrinsic::vp_gather, Args: {Addr, Mask, EVL}, |
3180 | FMFSource: nullptr, Name: "wide.masked.gather" ); |
3181 | } else { |
3182 | NewLI = Builder.CreateIntrinsic(RetTy: DataTy, ID: Intrinsic::vp_load, |
3183 | Args: {Addr, Mask, EVL}, FMFSource: nullptr, Name: "vp.op.load" ); |
3184 | } |
3185 | NewLI->addParamAttr( |
3186 | ArgNo: 0, Attr: Attribute::getWithAlignment(Context&: NewLI->getContext(), Alignment)); |
3187 | applyMetadata(I&: *NewLI); |
3188 | Instruction *Res = NewLI; |
3189 | if (isReverse()) |
3190 | Res = createReverseEVL(Builder, Operand: Res, EVL, Name: "vp.reverse" ); |
3191 | State.set(Def: this, V: Res); |
3192 | } |
3193 | |
3194 | InstructionCost VPWidenLoadEVLRecipe::computeCost(ElementCount VF, |
3195 | VPCostContext &Ctx) const { |
3196 | if (!Consecutive || IsMasked) |
3197 | return VPWidenMemoryRecipe::computeCost(VF, Ctx); |
3198 | |
  // We need to use getMaskedMemoryOpCost() instead of getMemoryOpCost() here
  // because the EVL recipes use EVL to replace the tail mask, whereas the
  // legacy model always accounts for the cost of the mask.
  // TODO: Use getMemoryOpCost() instead of getMaskedMemoryOpCost() once we no
  // longer need to compare against the legacy cost model.
3204 | Type *Ty = toVectorTy(Scalar: getLoadStoreType(I: &Ingredient), EC: VF); |
3205 | const Align Alignment = |
3206 | getLoadStoreAlignment(I: const_cast<Instruction *>(&Ingredient)); |
3207 | unsigned AS = |
3208 | getLoadStoreAddressSpace(I: const_cast<Instruction *>(&Ingredient)); |
3209 | InstructionCost Cost = Ctx.TTI.getMaskedMemoryOpCost( |
3210 | Opcode: Instruction::Load, Src: Ty, Alignment, AddressSpace: AS, CostKind: Ctx.CostKind); |
3211 | if (!Reverse) |
3212 | return Cost; |
3213 | |
3214 | return Cost + Ctx.TTI.getShuffleCost( |
3215 | Kind: TargetTransformInfo::SK_Reverse, DstTy: cast<VectorType>(Val: Ty), |
3216 | SrcTy: cast<VectorType>(Val: Ty), Mask: {}, CostKind: Ctx.CostKind, Index: 0); |
3217 | } |
3218 | |
3219 | #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) |
3220 | void VPWidenLoadEVLRecipe::print(raw_ostream &O, const Twine &Indent, |
3221 | VPSlotTracker &SlotTracker) const { |
3222 | O << Indent << "WIDEN " ; |
3223 | printAsOperand(O, SlotTracker); |
3224 | O << " = vp.load " ; |
3225 | printOperands(O, SlotTracker); |
3226 | } |
3227 | #endif |
3228 | |
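// Generate a wide store for this recipe: a masked scatter for non-consecutive
// addresses, a masked store when a mask is present, or a plain aligned store
// otherwise. For reverse accesses the mask and the stored value are reversed
// first.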
3229 | void VPWidenStoreRecipe::execute(VPTransformState &State) { |
3230 | VPValue *StoredVPValue = getStoredValue(); |
3231 | bool CreateScatter = !isConsecutive(); |
3232 | const Align Alignment = getLoadStoreAlignment(I: &Ingredient); |
3233 | |
3234 | auto &Builder = State.Builder; |
3235 | |
3236 | Value *Mask = nullptr; |
3237 | if (auto *VPMask = getMask()) { |
    // Mask reversal is only needed for real (non-all-one) masks; the reverse
    // of an implicit all-one (null) mask is still an all-one mask.
3240 | Mask = State.get(Def: VPMask); |
3241 | if (isReverse()) |
3242 | Mask = Builder.CreateVectorReverse(V: Mask, Name: "reverse" ); |
3243 | } |
3244 | |
3245 | Value *StoredVal = State.get(Def: StoredVPValue); |
3246 | if (isReverse()) { |
3247 | // If we store to reverse consecutive memory locations, then we need |
3248 | // to reverse the order of elements in the stored value. |
3249 | StoredVal = Builder.CreateVectorReverse(V: StoredVal, Name: "reverse" ); |
3250 | // We don't want to update the value in the map as it might be used in |
3251 | // another expression. So don't call resetVectorValue(StoredVal). |
3252 | } |
3253 | Value *Addr = State.get(Def: getAddr(), /*IsScalar*/ !CreateScatter); |
3254 | Instruction *NewSI = nullptr; |
3255 | if (CreateScatter) |
3256 | NewSI = Builder.CreateMaskedScatter(Val: StoredVal, Ptrs: Addr, Alignment, Mask); |
3257 | else if (Mask) |
3258 | NewSI = Builder.CreateMaskedStore(Val: StoredVal, Ptr: Addr, Alignment, Mask); |
3259 | else |
3260 | NewSI = Builder.CreateAlignedStore(Val: StoredVal, Ptr: Addr, Align: Alignment); |
3261 | applyMetadata(I&: *NewSI); |
3262 | } |
3263 | |
3264 | #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) |
3265 | void VPWidenStoreRecipe::print(raw_ostream &O, const Twine &Indent, |
3266 | VPSlotTracker &SlotTracker) const { |
3267 | O << Indent << "WIDEN store " ; |
3268 | printOperands(O, SlotTracker); |
3269 | } |
3270 | #endif |
3271 | |
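// Generate a VP store for this recipe, limited to the explicit vector length:
// vp.scatter for non-consecutive addresses and vp.store otherwise, reversing
// the stored value and mask for reverse accesses.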
3272 | void VPWidenStoreEVLRecipe::execute(VPTransformState &State) { |
3273 | VPValue *StoredValue = getStoredValue(); |
3274 | bool CreateScatter = !isConsecutive(); |
3275 | const Align Alignment = getLoadStoreAlignment(I: &Ingredient); |
3276 | |
3277 | auto &Builder = State.Builder; |
3278 | |
3279 | CallInst *NewSI = nullptr; |
3280 | Value *StoredVal = State.get(Def: StoredValue); |
3281 | Value *EVL = State.get(Def: getEVL(), Lane: VPLane(0)); |
3282 | if (isReverse()) |
3283 | StoredVal = createReverseEVL(Builder, Operand: StoredVal, EVL, Name: "vp.reverse" ); |
3284 | Value *Mask = nullptr; |
3285 | if (VPValue *VPMask = getMask()) { |
3286 | Mask = State.get(Def: VPMask); |
3287 | if (isReverse()) |
3288 | Mask = createReverseEVL(Builder, Operand: Mask, EVL, Name: "vp.reverse.mask" ); |
3289 | } else { |
3290 | Mask = Builder.CreateVectorSplat(EC: State.VF, V: Builder.getTrue()); |
3291 | } |
3292 | Value *Addr = State.get(Def: getAddr(), IsScalar: !CreateScatter); |
3293 | if (CreateScatter) { |
3294 | NewSI = Builder.CreateIntrinsic(RetTy: Type::getVoidTy(C&: EVL->getContext()), |
3295 | ID: Intrinsic::vp_scatter, |
3296 | Args: {StoredVal, Addr, Mask, EVL}); |
3297 | } else { |
3298 | NewSI = Builder.CreateIntrinsic(RetTy: Type::getVoidTy(C&: EVL->getContext()), |
3299 | ID: Intrinsic::vp_store, |
3300 | Args: {StoredVal, Addr, Mask, EVL}); |
3301 | } |
3302 | NewSI->addParamAttr( |
3303 | ArgNo: 1, Attr: Attribute::getWithAlignment(Context&: NewSI->getContext(), Alignment)); |
3304 | applyMetadata(I&: *NewSI); |
3305 | } |
3306 | |
3307 | InstructionCost VPWidenStoreEVLRecipe::computeCost(ElementCount VF, |
3308 | VPCostContext &Ctx) const { |
3309 | if (!Consecutive || IsMasked) |
3310 | return VPWidenMemoryRecipe::computeCost(VF, Ctx); |
3311 | |
  // We need to use getMaskedMemoryOpCost() instead of getMemoryOpCost() here
  // because the EVL recipes use EVL to replace the tail mask, whereas the
  // legacy model always accounts for the cost of the mask.
  // TODO: Use getMemoryOpCost() instead of getMaskedMemoryOpCost() once we no
  // longer need to compare against the legacy cost model.
3317 | Type *Ty = toVectorTy(Scalar: getLoadStoreType(I: &Ingredient), EC: VF); |
3318 | const Align Alignment = |
3319 | getLoadStoreAlignment(I: const_cast<Instruction *>(&Ingredient)); |
3320 | unsigned AS = |
3321 | getLoadStoreAddressSpace(I: const_cast<Instruction *>(&Ingredient)); |
3322 | InstructionCost Cost = Ctx.TTI.getMaskedMemoryOpCost( |
3323 | Opcode: Instruction::Store, Src: Ty, Alignment, AddressSpace: AS, CostKind: Ctx.CostKind); |
3324 | if (!Reverse) |
3325 | return Cost; |
3326 | |
3327 | return Cost + Ctx.TTI.getShuffleCost( |
3328 | Kind: TargetTransformInfo::SK_Reverse, DstTy: cast<VectorType>(Val: Ty), |
3329 | SrcTy: cast<VectorType>(Val: Ty), Mask: {}, CostKind: Ctx.CostKind, Index: 0); |
3330 | } |
3331 | |
3332 | #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) |
3333 | void VPWidenStoreEVLRecipe::print(raw_ostream &O, const Twine &Indent, |
3334 | VPSlotTracker &SlotTracker) const { |
3335 | O << Indent << "WIDEN vp.store " ; |
3336 | printOperands(O, SlotTracker); |
3337 | } |
3338 | #endif |
3339 | |
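/// Cast \p V, a vector with the same element count as \p DstVTy, to \p DstVTy.
/// Uses a direct bit-or-noop-pointer cast when the element types allow it, and
/// otherwise goes through an intermediate integer vector
/// (Ptr <-> Int <-> Float).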
3340 | static Value *createBitOrPointerCast(IRBuilderBase &Builder, Value *V, |
3341 | VectorType *DstVTy, const DataLayout &DL) { |
3342 | // Verify that V is a vector type with same number of elements as DstVTy. |
3343 | auto VF = DstVTy->getElementCount(); |
3344 | auto *SrcVecTy = cast<VectorType>(Val: V->getType()); |
3345 | assert(VF == SrcVecTy->getElementCount() && "Vector dimensions do not match" ); |
3346 | Type *SrcElemTy = SrcVecTy->getElementType(); |
3347 | Type *DstElemTy = DstVTy->getElementType(); |
3348 | assert((DL.getTypeSizeInBits(SrcElemTy) == DL.getTypeSizeInBits(DstElemTy)) && |
3349 | "Vector elements must have same size" ); |
3350 | |
3351 | // Do a direct cast if element types are castable. |
3352 | if (CastInst::isBitOrNoopPointerCastable(SrcTy: SrcElemTy, DestTy: DstElemTy, DL)) { |
3353 | return Builder.CreateBitOrPointerCast(V, DestTy: DstVTy); |
3354 | } |
3355 | // V cannot be directly casted to desired vector type. |
3356 | // May happen when V is a floating point vector but DstVTy is a vector of |
3357 | // pointers or vice-versa. Handle this using a two-step bitcast using an |
3358 | // intermediate Integer type for the bitcast i.e. Ptr <-> Int <-> Float. |
3359 | assert((DstElemTy->isPointerTy() != SrcElemTy->isPointerTy()) && |
3360 | "Only one type should be a pointer type" ); |
3361 | assert((DstElemTy->isFloatingPointTy() != SrcElemTy->isFloatingPointTy()) && |
3362 | "Only one type should be a floating point type" ); |
3363 | Type *IntTy = |
3364 | IntegerType::getIntNTy(C&: V->getContext(), N: DL.getTypeSizeInBits(Ty: SrcElemTy)); |
3365 | auto *VecIntTy = VectorType::get(ElementType: IntTy, EC: VF); |
3366 | Value *CastVal = Builder.CreateBitOrPointerCast(V, DestTy: VecIntTy); |
3367 | return Builder.CreateBitOrPointerCast(V: CastVal, DestTy: DstVTy); |
3368 | } |
3369 | |
3370 | /// Return a vector containing interleaved elements from multiple |
3371 | /// smaller input vectors. |
3372 | static Value *interleaveVectors(IRBuilderBase &Builder, ArrayRef<Value *> Vals, |
3373 | const Twine &Name) { |
3374 | unsigned Factor = Vals.size(); |
3375 | assert(Factor > 1 && "Tried to interleave invalid number of vectors" ); |
3376 | |
3377 | VectorType *VecTy = cast<VectorType>(Val: Vals[0]->getType()); |
3378 | #ifndef NDEBUG |
3379 | for (Value *Val : Vals) |
3380 | assert(Val->getType() == VecTy && "Tried to interleave mismatched types" ); |
3381 | #endif |
3382 | |
  // Scalable vectors cannot use arbitrary shufflevectors (only splats), so we
  // must use intrinsics to interleave.
3385 | if (VecTy->isScalableTy()) { |
3386 | assert(Factor <= 8 && "Unsupported interleave factor for scalable vectors" ); |
3387 | VectorType *InterleaveTy = |
3388 | VectorType::get(ElementType: VecTy->getElementType(), |
3389 | EC: VecTy->getElementCount().multiplyCoefficientBy(RHS: Factor)); |
3390 | return Builder.CreateIntrinsic(RetTy: InterleaveTy, |
3391 | ID: getInterleaveIntrinsicID(Factor), Args: Vals, |
3392 | /*FMFSource=*/nullptr, Name); |
3393 | } |
3394 | |
3395 | // Fixed length. Start by concatenating all vectors into a wide vector. |
3396 | Value *WideVec = concatenateVectors(Builder, Vecs: Vals); |
3397 | |
3398 | // Interleave the elements into the wide vector. |
3399 | const unsigned NumElts = VecTy->getElementCount().getFixedValue(); |
3400 | return Builder.CreateShuffleVector( |
3401 | V: WideVec, Mask: createInterleaveMask(VF: NumElts, NumVecs: Factor), Name); |
3402 | } |
3403 | |
3404 | // Try to vectorize the interleave group that \p Instr belongs to. |
3405 | // |
3406 | // E.g. Translate following interleaved load group (factor = 3): |
3407 | // for (i = 0; i < N; i+=3) { |
3408 | // R = Pic[i]; // Member of index 0 |
3409 | // G = Pic[i+1]; // Member of index 1 |
3410 | // B = Pic[i+2]; // Member of index 2 |
3411 | // ... // do something to R, G, B |
3412 | // } |
3413 | // To: |
3414 | // %wide.vec = load <12 x i32> ; Read 4 tuples of R,G,B |
3415 | // %R.vec = shuffle %wide.vec, poison, <0, 3, 6, 9> ; R elements |
3416 | // %G.vec = shuffle %wide.vec, poison, <1, 4, 7, 10> ; G elements |
3417 | // %B.vec = shuffle %wide.vec, poison, <2, 5, 8, 11> ; B elements |
3418 | // |
3419 | // Or translate following interleaved store group (factor = 3): |
3420 | // for (i = 0; i < N; i+=3) { |
3421 | // ... do something to R, G, B |
3422 | // Pic[i] = R; // Member of index 0 |
3423 | // Pic[i+1] = G; // Member of index 1 |
3424 | // Pic[i+2] = B; // Member of index 2 |
3425 | // } |
3426 | // To: |
3427 | // %R_G.vec = shuffle %R.vec, %G.vec, <0, 1, 2, ..., 7> |
3428 | // %B_U.vec = shuffle %B.vec, poison, <0, 1, 2, 3, u, u, u, u> |
3429 | // %interleaved.vec = shuffle %R_G.vec, %B_U.vec, |
3430 | // <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11> ; Interleave R,G,B elements |
3431 | // store <12 x i32> %interleaved.vec ; Write 4 tuples of R,G,B |
3432 | void VPInterleaveRecipe::execute(VPTransformState &State) { |
3433 | assert(!State.Lane && "Interleave group being replicated." ); |
3434 | const InterleaveGroup<Instruction> *Group = IG; |
3435 | Instruction *Instr = Group->getInsertPos(); |
3436 | |
3437 | // Prepare for the vector type of the interleaved load/store. |
3438 | Type *ScalarTy = getLoadStoreType(I: Instr); |
3439 | unsigned InterleaveFactor = Group->getFactor(); |
3440 | auto *VecTy = VectorType::get(ElementType: ScalarTy, EC: State.VF * InterleaveFactor); |
3441 | |
3442 | VPValue *BlockInMask = getMask(); |
3443 | VPValue *Addr = getAddr(); |
3444 | Value *ResAddr = State.get(Def: Addr, Lane: VPLane(0)); |
3445 | Value *PoisonVec = PoisonValue::get(T: VecTy); |
3446 | |
3447 | auto CreateGroupMask = [&BlockInMask, &State, |
3448 | &InterleaveFactor](Value *MaskForGaps) -> Value * { |
3449 | if (State.VF.isScalable()) { |
3450 | assert(!MaskForGaps && "Interleaved groups with gaps are not supported." ); |
3451 | assert(InterleaveFactor <= 8 && |
3452 | "Unsupported deinterleave factor for scalable vectors" ); |
3453 | auto *ResBlockInMask = State.get(Def: BlockInMask); |
3454 | SmallVector<Value *> Ops(InterleaveFactor, ResBlockInMask); |
3455 | return interleaveVectors(Builder&: State.Builder, Vals: Ops, Name: "interleaved.mask" ); |
3456 | } |
3457 | |
3458 | if (!BlockInMask) |
3459 | return MaskForGaps; |
3460 | |
3461 | Value *ResBlockInMask = State.get(Def: BlockInMask); |
3462 | Value *ShuffledMask = State.Builder.CreateShuffleVector( |
3463 | V: ResBlockInMask, |
3464 | Mask: createReplicatedMask(ReplicationFactor: InterleaveFactor, VF: State.VF.getFixedValue()), |
3465 | Name: "interleaved.mask" ); |
3466 | return MaskForGaps ? State.Builder.CreateBinOp(Opc: Instruction::And, |
3467 | LHS: ShuffledMask, RHS: MaskForGaps) |
3468 | : ShuffledMask; |
3469 | }; |
3470 | |
3471 | const DataLayout &DL = Instr->getDataLayout(); |
3472 | // Vectorize the interleaved load group. |
3473 | if (isa<LoadInst>(Val: Instr)) { |
3474 | Value *MaskForGaps = nullptr; |
3475 | if (NeedsMaskForGaps) { |
3476 | MaskForGaps = |
3477 | createBitMaskForGaps(Builder&: State.Builder, VF: State.VF.getFixedValue(), Group: *Group); |
3478 | assert(MaskForGaps && "Mask for Gaps is required but it is null" ); |
3479 | } |
3480 | |
3481 | Instruction *NewLoad; |
3482 | if (BlockInMask || MaskForGaps) { |
3483 | Value *GroupMask = CreateGroupMask(MaskForGaps); |
3484 | NewLoad = State.Builder.CreateMaskedLoad(Ty: VecTy, Ptr: ResAddr, |
3485 | Alignment: Group->getAlign(), Mask: GroupMask, |
3486 | PassThru: PoisonVec, Name: "wide.masked.vec" ); |
3487 | } else |
3488 | NewLoad = State.Builder.CreateAlignedLoad(Ty: VecTy, Ptr: ResAddr, |
3489 | Align: Group->getAlign(), Name: "wide.vec" ); |
3490 | Group->addMetadata(NewInst: NewLoad); |
3491 | |
3492 | ArrayRef<VPValue *> VPDefs = definedValues(); |
3493 | const DataLayout &DL = State.CFG.PrevBB->getDataLayout(); |
3494 | if (VecTy->isScalableTy()) { |
      // Scalable vectors cannot use arbitrary shufflevectors (only splats),
      // so we must use intrinsics to deinterleave.
3497 | assert(InterleaveFactor <= 8 && |
3498 | "Unsupported deinterleave factor for scalable vectors" ); |
3499 | Value *Deinterleave = State.Builder.CreateIntrinsic( |
3500 | ID: getDeinterleaveIntrinsicID(Factor: InterleaveFactor), Types: NewLoad->getType(), |
3501 | Args: NewLoad, |
3502 | /*FMFSource=*/nullptr, Name: "strided.vec" ); |
3503 | |
3504 | for (unsigned I = 0, J = 0; I < InterleaveFactor; ++I) { |
3505 | Instruction *Member = Group->getMember(Index: I); |
3506 | Value *StridedVec = State.Builder.CreateExtractValue(Agg: Deinterleave, Idxs: I); |
3507 | if (!Member) { |
          // There is no member at this index (a gap in the group), so the
          // extracted value is unused; remove it.
3509 | cast<Instruction>(Val: StridedVec)->eraseFromParent(); |
3510 | continue; |
3511 | } |
        // If this member has a different type, cast the result to that type.
3513 | if (Member->getType() != ScalarTy) { |
3514 | VectorType *OtherVTy = VectorType::get(ElementType: Member->getType(), EC: State.VF); |
3515 | StridedVec = |
3516 | createBitOrPointerCast(Builder&: State.Builder, V: StridedVec, DstVTy: OtherVTy, DL); |
3517 | } |
3518 | |
3519 | if (Group->isReverse()) |
3520 | StridedVec = State.Builder.CreateVectorReverse(V: StridedVec, Name: "reverse" ); |
3521 | |
3522 | State.set(Def: VPDefs[J], V: StridedVec); |
3523 | ++J; |
3524 | } |
3525 | |
3526 | return; |
3527 | } |
3528 | assert(!State.VF.isScalable() && "VF is assumed to be non scalable." ); |
3529 | |
3530 | // For each member in the group, shuffle out the appropriate data from the |
3531 | // wide loads. |
3532 | unsigned J = 0; |
3533 | for (unsigned I = 0; I < InterleaveFactor; ++I) { |
3534 | Instruction *Member = Group->getMember(Index: I); |
3535 | |
3536 | // Skip the gaps in the group. |
3537 | if (!Member) |
3538 | continue; |
3539 | |
3540 | auto StrideMask = |
3541 | createStrideMask(Start: I, Stride: InterleaveFactor, VF: State.VF.getFixedValue()); |
3542 | Value *StridedVec = |
3543 | State.Builder.CreateShuffleVector(V: NewLoad, Mask: StrideMask, Name: "strided.vec" ); |
3544 | |
      // If this member has a different type, cast the result to that type.
3546 | if (Member->getType() != ScalarTy) { |
3547 | VectorType *OtherVTy = VectorType::get(ElementType: Member->getType(), EC: State.VF); |
3548 | StridedVec = |
3549 | createBitOrPointerCast(Builder&: State.Builder, V: StridedVec, DstVTy: OtherVTy, DL); |
3550 | } |
3551 | |
3552 | if (Group->isReverse()) |
3553 | StridedVec = State.Builder.CreateVectorReverse(V: StridedVec, Name: "reverse" ); |
3554 | |
3555 | State.set(Def: VPDefs[J], V: StridedVec); |
3556 | ++J; |
3557 | } |
3558 | return; |
3559 | } |
3560 | |
  // The sub-vector type for the current instruction.
3562 | auto *SubVT = VectorType::get(ElementType: ScalarTy, EC: State.VF); |
3563 | |
3564 | // Vectorize the interleaved store group. |
3565 | Value *MaskForGaps = |
3566 | createBitMaskForGaps(Builder&: State.Builder, VF: State.VF.getKnownMinValue(), Group: *Group); |
3567 | assert((!MaskForGaps || !State.VF.isScalable()) && |
3568 | "masking gaps for scalable vectors is not yet supported." ); |
3569 | ArrayRef<VPValue *> StoredValues = getStoredValues(); |
3570 | // Collect the stored vector from each member. |
3571 | SmallVector<Value *, 4> StoredVecs; |
3572 | unsigned StoredIdx = 0; |
3573 | for (unsigned i = 0; i < InterleaveFactor; i++) { |
    assert((Group->getMember(i) || MaskForGaps) &&
           "Failed to get a member from an interleaved store group");
3576 | Instruction *Member = Group->getMember(Index: i); |
3577 | |
3578 | // Skip the gaps in the group. |
3579 | if (!Member) { |
3580 | Value *Undef = PoisonValue::get(T: SubVT); |
3581 | StoredVecs.push_back(Elt: Undef); |
3582 | continue; |
3583 | } |
3584 | |
3585 | Value *StoredVec = State.get(Def: StoredValues[StoredIdx]); |
3586 | ++StoredIdx; |
3587 | |
3588 | if (Group->isReverse()) |
3589 | StoredVec = State.Builder.CreateVectorReverse(V: StoredVec, Name: "reverse" ); |
3590 | |
    // If this member has a different type, cast it to a unified type.
3593 | if (StoredVec->getType() != SubVT) |
3594 | StoredVec = createBitOrPointerCast(Builder&: State.Builder, V: StoredVec, DstVTy: SubVT, DL); |
3595 | |
3596 | StoredVecs.push_back(Elt: StoredVec); |
3597 | } |
3598 | |
3599 | // Interleave all the smaller vectors into one wider vector. |
3600 | Value *IVec = interleaveVectors(Builder&: State.Builder, Vals: StoredVecs, Name: "interleaved.vec" ); |
3601 | Instruction *NewStoreInstr; |
3602 | if (BlockInMask || MaskForGaps) { |
3603 | Value *GroupMask = CreateGroupMask(MaskForGaps); |
3604 | NewStoreInstr = State.Builder.CreateMaskedStore( |
3605 | Val: IVec, Ptr: ResAddr, Alignment: Group->getAlign(), Mask: GroupMask); |
3606 | } else |
3607 | NewStoreInstr = |
3608 | State.Builder.CreateAlignedStore(Val: IVec, Ptr: ResAddr, Align: Group->getAlign()); |
3609 | |
3610 | Group->addMetadata(NewInst: NewStoreInstr); |
3611 | } |
3612 | |
3613 | #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) |
3614 | void VPInterleaveRecipe::print(raw_ostream &O, const Twine &Indent, |
3615 | VPSlotTracker &SlotTracker) const { |
3616 | O << Indent << "INTERLEAVE-GROUP with factor " << IG->getFactor() << " at " ; |
3617 | IG->getInsertPos()->printAsOperand(O, false); |
3618 | O << ", " ; |
3619 | getAddr()->printAsOperand(O, SlotTracker); |
3620 | VPValue *Mask = getMask(); |
3621 | if (Mask) { |
3622 | O << ", " ; |
3623 | Mask->printAsOperand(O, SlotTracker); |
3624 | } |
3625 | |
3626 | unsigned OpIdx = 0; |
3627 | for (unsigned i = 0; i < IG->getFactor(); ++i) { |
3628 | if (!IG->getMember(i)) |
3629 | continue; |
3630 | if (getNumStoreOperands() > 0) { |
3631 | O << "\n" << Indent << " store " ; |
3632 | getOperand(1 + OpIdx)->printAsOperand(O, SlotTracker); |
3633 | O << " to index " << i; |
3634 | } else { |
3635 | O << "\n" << Indent << " " ; |
3636 | getVPValue(OpIdx)->printAsOperand(O, SlotTracker); |
3637 | O << " = load from index " << i; |
3638 | } |
3639 | ++OpIdx; |
3640 | } |
3641 | } |
3642 | #endif |
3643 | |
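// Compute the cost of the whole interleave group at the insert position, using
// the target's interleaved-memory-op cost plus a reverse shuffle per member
// for reversed groups.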
3644 | InstructionCost VPInterleaveRecipe::computeCost(ElementCount VF, |
3645 | VPCostContext &Ctx) const { |
3646 | Instruction *InsertPos = getInsertPos(); |
  // Find the VPValue index of the insert position within the interleave group,
  // skipping over gaps.
3648 | unsigned InsertPosIdx = 0; |
  for (unsigned Idx = 0; Idx < IG->getFactor(); ++Idx)
3650 | if (auto *Member = IG->getMember(Index: Idx)) { |
3651 | if (Member == InsertPos) |
3652 | break; |
3653 | InsertPosIdx++; |
3654 | } |
3655 | Type *ValTy = Ctx.Types.inferScalarType( |
3656 | V: getNumDefinedValues() > 0 ? getVPValue(I: InsertPosIdx) |
3657 | : getStoredValues()[InsertPosIdx]); |
3658 | auto *VectorTy = cast<VectorType>(Val: toVectorTy(Scalar: ValTy, EC: VF)); |
3659 | unsigned AS = getLoadStoreAddressSpace(I: InsertPos); |
3660 | |
3661 | unsigned InterleaveFactor = IG->getFactor(); |
3662 | auto *WideVecTy = VectorType::get(ElementType: ValTy, EC: VF * InterleaveFactor); |
3663 | |
3664 | // Holds the indices of existing members in the interleaved group. |
3665 | SmallVector<unsigned, 4> Indices; |
3666 | for (unsigned IF = 0; IF < InterleaveFactor; IF++) |
3667 | if (IG->getMember(Index: IF)) |
3668 | Indices.push_back(Elt: IF); |
3669 | |
3670 | // Calculate the cost of the whole interleaved group. |
3671 | InstructionCost Cost = Ctx.TTI.getInterleavedMemoryOpCost( |
3672 | Opcode: InsertPos->getOpcode(), VecTy: WideVecTy, Factor: IG->getFactor(), Indices, |
3673 | Alignment: IG->getAlign(), AddressSpace: AS, CostKind: Ctx.CostKind, UseMaskForCond: getMask(), UseMaskForGaps: NeedsMaskForGaps); |
3674 | |
3675 | if (!IG->isReverse()) |
3676 | return Cost; |
3677 | |
3678 | return Cost + IG->getNumMembers() * |
3679 | Ctx.TTI.getShuffleCost(Kind: TargetTransformInfo::SK_Reverse, |
3680 | DstTy: VectorTy, SrcTy: VectorTy, Mask: {}, CostKind: Ctx.CostKind, |
3681 | Index: 0); |
3682 | } |
3683 | |
3684 | #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) |
3685 | void VPCanonicalIVPHIRecipe::print(raw_ostream &O, const Twine &Indent, |
3686 | VPSlotTracker &SlotTracker) const { |
3687 | O << Indent << "EMIT " ; |
3688 | printAsOperand(O, SlotTracker); |
3689 | O << " = CANONICAL-INDUCTION " ; |
3690 | printOperands(O, SlotTracker); |
3691 | } |
3692 | #endif |
3693 | |
3694 | bool VPWidenPointerInductionRecipe::onlyScalarsGenerated(bool IsScalable) { |
3695 | return IsScalarAfterVectorization && |
3696 | (!IsScalable || vputils::onlyFirstLaneUsed(Def: this)); |
3697 | } |
3698 | |
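// Expand a widened pointer induction: a shared scalar pointer phi (created for
// part 0 and reused by the other unrolled parts) that is advanced by a GEP
// once per vector iteration, plus per-part vector GEPs computing the lane
// addresses from the phi, the runtime VF and the step value.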
3699 | void VPWidenPointerInductionRecipe::execute(VPTransformState &State) { |
3700 | assert(getInductionDescriptor().getKind() == |
3701 | InductionDescriptor::IK_PtrInduction && |
3702 | "Not a pointer induction according to InductionDescriptor!" ); |
3703 | assert(State.TypeAnalysis.inferScalarType(this)->isPointerTy() && |
3704 | "Unexpected type." ); |
3705 | assert(!onlyScalarsGenerated(State.VF.isScalable()) && |
3706 | "Recipe should have been replaced" ); |
3707 | |
3708 | unsigned CurrentPart = getUnrollPart(U&: *this); |
3709 | |
3710 | // Build a pointer phi |
3711 | Value *ScalarStartValue = getStartValue()->getLiveInIRValue(); |
3712 | Type *ScStValueType = ScalarStartValue->getType(); |
3713 | |
3714 | BasicBlock *VectorPH = |
3715 | State.CFG.VPBB2IRBB.at(Val: getParent()->getCFGPredecessor(Idx: 0)); |
3716 | PHINode *NewPointerPhi = nullptr; |
3717 | if (CurrentPart == 0) { |
3718 | IRBuilder<>::InsertPointGuard Guard(State.Builder); |
3719 | if (State.Builder.GetInsertPoint() != |
3720 | State.Builder.GetInsertBlock()->getFirstNonPHIIt()) |
3721 | State.Builder.SetInsertPoint( |
3722 | State.Builder.GetInsertBlock()->getFirstNonPHIIt()); |
3723 | NewPointerPhi = State.Builder.CreatePHI(Ty: ScStValueType, NumReservedValues: 2, Name: "pointer.phi" ); |
3724 | NewPointerPhi->addIncoming(V: ScalarStartValue, BB: VectorPH); |
3725 | NewPointerPhi->setDebugLoc(getDebugLoc()); |
3726 | } else { |
3727 | // The recipe has been unrolled. In that case, fetch the single pointer phi |
3728 | // shared among all unrolled parts of the recipe. |
3729 | auto *GEP = |
3730 | cast<GetElementPtrInst>(Val: State.get(Def: getFirstUnrolledPartOperand())); |
3731 | NewPointerPhi = cast<PHINode>(Val: GEP->getPointerOperand()); |
3732 | } |
3733 | |
3734 | // A pointer induction, performed by using a gep |
3735 | BasicBlock::iterator InductionLoc = State.Builder.GetInsertPoint(); |
3736 | Value *ScalarStepValue = State.get(Def: getStepValue(), Lane: VPLane(0)); |
3737 | Type *PhiType = State.TypeAnalysis.inferScalarType(V: getStepValue()); |
3738 | Value *RuntimeVF = getRuntimeVF(B&: State.Builder, Ty: PhiType, VF: State.VF); |
3739 | // Add induction update using an incorrect block temporarily. The phi node |
3740 | // will be fixed after VPlan execution. Note that at this point the latch |
3741 | // block cannot be used, as it does not exist yet. |
3742 | // TODO: Model increment value in VPlan, by turning the recipe into a |
3743 | // multi-def and a subclass of VPHeaderPHIRecipe. |
3744 | if (CurrentPart == 0) { |
3745 | // The recipe represents the first part of the pointer induction. Create the |
3746 | // GEP to increment the phi across all unrolled parts. |
3747 | Value *NumUnrolledElems = State.get(Def: getOperand(N: 2), IsScalar: true); |
3748 | |
3749 | Value *InductionGEP = GetElementPtrInst::Create( |
3750 | PointeeType: State.Builder.getInt8Ty(), Ptr: NewPointerPhi, |
3751 | IdxList: State.Builder.CreateMul( |
3752 | LHS: ScalarStepValue, |
3753 | RHS: State.Builder.CreateTrunc(V: NumUnrolledElems, DestTy: PhiType)), |
3754 | NameStr: "ptr.ind" , InsertBefore: InductionLoc); |
3755 | |
3756 | NewPointerPhi->addIncoming(V: InductionGEP, BB: VectorPH); |
3757 | } |
3758 | |
3759 | // Create actual address geps that use the pointer phi as base and a |
3760 | // vectorized version of the step value (<step*0, ..., step*N>) as offset. |
3761 | Type *VecPhiType = VectorType::get(ElementType: PhiType, EC: State.VF); |
3762 | Value *StartOffsetScalar = State.Builder.CreateMul( |
3763 | LHS: RuntimeVF, RHS: ConstantInt::get(Ty: PhiType, V: CurrentPart)); |
3764 | Value *StartOffset = |
3765 | State.Builder.CreateVectorSplat(EC: State.VF, V: StartOffsetScalar); |
  // Add a vector of consecutive numbers <0, 1, ..., VF-1> to the start offset.
3767 | StartOffset = State.Builder.CreateAdd( |
3768 | LHS: StartOffset, RHS: State.Builder.CreateStepVector(DstType: VecPhiType)); |
3769 | |
3770 | assert(ScalarStepValue == State.get(getOperand(1), VPLane(0)) && |
3771 | "scalar step must be the same across all parts" ); |
3772 | Value *GEP = State.Builder.CreateGEP( |
3773 | Ty: State.Builder.getInt8Ty(), Ptr: NewPointerPhi, |
3774 | IdxList: State.Builder.CreateMul(LHS: StartOffset, RHS: State.Builder.CreateVectorSplat( |
3775 | EC: State.VF, V: ScalarStepValue)), |
3776 | Name: "vector.gep" ); |
3777 | State.set(Def: this, V: GEP); |
3778 | } |
3779 | |
3780 | #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) |
3781 | void VPWidenPointerInductionRecipe::print(raw_ostream &O, const Twine &Indent, |
3782 | VPSlotTracker &SlotTracker) const { |
3783 | assert((getNumOperands() == 3 || getNumOperands() == 5) && |
3784 | "unexpected number of operands" ); |
3785 | O << Indent << "EMIT " ; |
3786 | printAsOperand(O, SlotTracker); |
3787 | O << " = WIDEN-POINTER-INDUCTION " ; |
3788 | getStartValue()->printAsOperand(O, SlotTracker); |
3789 | O << ", " ; |
3790 | getStepValue()->printAsOperand(O, SlotTracker); |
3791 | O << ", " ; |
3792 | getOperand(2)->printAsOperand(O, SlotTracker); |
3793 | if (getNumOperands() == 5) { |
3794 | O << ", " ; |
3795 | getOperand(3)->printAsOperand(O, SlotTracker); |
3796 | O << ", " ; |
3797 | getOperand(4)->printAsOperand(O, SlotTracker); |
3798 | } |
3799 | } |
3800 | #endif |
3801 | |
3802 | void VPExpandSCEVRecipe::execute(VPTransformState &State) { |
assert(!State.Lane && "cannot be used in per-lane execution");
3804 | const DataLayout &DL = SE.getDataLayout(); |
3805 | SCEVExpander Exp(SE, DL, "induction" , /*PreserveLCSSA=*/true); |
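// Expand the SCEV expression to concrete IR right before the current insert
// point; the result is a single scalar, recorded for lane 0.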
3806 | Value *Res = Exp.expandCodeFor(SH: Expr, Ty: Expr->getType(), |
3807 | I: &*State.Builder.GetInsertPoint()); |
3808 | State.set(Def: this, V: Res, Lane: VPLane(0)); |
3809 | } |
3810 | |
3811 | #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) |
3812 | void VPExpandSCEVRecipe::print(raw_ostream &O, const Twine &Indent, |
3813 | VPSlotTracker &SlotTracker) const { |
3814 | O << Indent << "EMIT " ; |
3815 | printAsOperand(O, SlotTracker); |
3816 | O << " = EXPAND SCEV " << *Expr; |
3817 | } |
3818 | #endif |
3819 | |
3820 | void VPWidenCanonicalIVRecipe::execute(VPTransformState &State) { |
3821 | Value *CanonicalIV = State.get(Def: getOperand(N: 0), /*IsScalar*/ true); |
3822 | Type *STy = CanonicalIV->getType(); |
3823 | IRBuilder<> Builder(State.CFG.PrevBB->getTerminator()); |
3824 | ElementCount VF = State.VF; |
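// For unroll part P the widened IV is splat(CanonicalIV) + <P * VF, P * VF + 1,
// ..., P * VF + VF - 1>; for VF == 1 it is simply the scalar canonical IV.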
3825 | Value *VStart = VF.isScalar() |
3826 | ? CanonicalIV |
3827 | : Builder.CreateVectorSplat(EC: VF, V: CanonicalIV, Name: "broadcast" ); |
3828 | Value *VStep = createStepForVF(B&: Builder, Ty: STy, VF, Step: getUnrollPart(U&: *this)); |
3829 | if (VF.isVector()) { |
3830 | VStep = Builder.CreateVectorSplat(EC: VF, V: VStep); |
3831 | VStep = |
3832 | Builder.CreateAdd(LHS: VStep, RHS: Builder.CreateStepVector(DstType: VStep->getType())); |
3833 | } |
3834 | Value *CanonicalVectorIV = Builder.CreateAdd(LHS: VStart, RHS: VStep, Name: "vec.iv" ); |
3835 | State.set(Def: this, V: CanonicalVectorIV); |
3836 | } |
3837 | |
3838 | #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) |
3839 | void VPWidenCanonicalIVRecipe::print(raw_ostream &O, const Twine &Indent, |
3840 | VPSlotTracker &SlotTracker) const { |
3841 | O << Indent << "EMIT " ; |
3842 | printAsOperand(O, SlotTracker); |
3843 | O << " = WIDEN-CANONICAL-INDUCTION " ; |
3844 | printOperands(O, SlotTracker); |
3845 | } |
3846 | #endif |
3847 | |
3848 | void VPFirstOrderRecurrencePHIRecipe::execute(VPTransformState &State) { |
3849 | auto &Builder = State.Builder; |
3850 | // Create a vector from the initial value. |
3851 | auto *VectorInit = getStartValue()->getLiveInIRValue(); |
3852 | |
3853 | Type *VecTy = State.VF.isScalar() |
3854 | ? VectorInit->getType() |
3855 | : VectorType::get(ElementType: VectorInit->getType(), EC: State.VF); |
3856 | |
3857 | BasicBlock *VectorPH = |
3858 | State.CFG.VPBB2IRBB.at(Val: getParent()->getCFGPredecessor(Idx: 0)); |
3859 | if (State.VF.isVector()) { |
3860 | auto *IdxTy = Builder.getInt32Ty(); |
3861 | auto *One = ConstantInt::get(Ty: IdxTy, V: 1); |
3862 | IRBuilder<>::InsertPointGuard Guard(Builder); |
3863 | Builder.SetInsertPoint(VectorPH->getTerminator()); |
3864 | auto *RuntimeVF = getRuntimeVF(B&: Builder, Ty: IdxTy, VF: State.VF); |
3865 | auto *LastIdx = Builder.CreateSub(LHS: RuntimeVF, RHS: One); |
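// Only the last lane of the initial value is observed: the recurrence splice
// for the first vector iteration reads the trailing lane of this vector, so
// insert the start value into lane VF - 1 of a poison vector.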
3866 | VectorInit = Builder.CreateInsertElement( |
3867 | Vec: PoisonValue::get(T: VecTy), NewElt: VectorInit, Idx: LastIdx, Name: "vector.recur.init" ); |
3868 | } |
3869 | |
3870 | // Create a phi node for the new recurrence. |
3871 | PHINode *Phi = PHINode::Create(Ty: VecTy, NumReservedValues: 2, NameStr: "vector.recur" ); |
3872 | Phi->insertBefore(InsertPos: State.CFG.PrevBB->getFirstInsertionPt()); |
3873 | Phi->addIncoming(V: VectorInit, BB: VectorPH); |
3874 | State.set(Def: this, V: Phi); |
3875 | } |
3876 | |
3877 | InstructionCost |
3878 | VPFirstOrderRecurrencePHIRecipe::computeCost(ElementCount VF, |
3879 | VPCostContext &Ctx) const { |
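// At VF == 1 the recurrence stays a scalar phi, which is costed as a plain PHI
// node. For vector VFs the phi itself is modeled as free here; the work of
// forming the recurrence is expected to be carried by the splice of its users.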
3880 | if (VF.isScalar()) |
3881 | return Ctx.TTI.getCFInstrCost(Opcode: Instruction::PHI, CostKind: Ctx.CostKind); |
3882 | |
3883 | return 0; |
3884 | } |
3885 | |
3886 | #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) |
3887 | void VPFirstOrderRecurrencePHIRecipe::print(raw_ostream &O, const Twine &Indent, |
3888 | VPSlotTracker &SlotTracker) const { |
3889 | O << Indent << "FIRST-ORDER-RECURRENCE-PHI " ; |
3890 | printAsOperand(O, SlotTracker); |
3891 | O << " = phi " ; |
3892 | printOperands(O, SlotTracker); |
3893 | } |
3894 | #endif |
3895 | |
3896 | void VPReductionPHIRecipe::execute(VPTransformState &State) { |
// Reductions do not have to start at zero. They can start with any
// loop-invariant value.
3899 | VPValue *StartVPV = getStartValue(); |
3900 | |
3901 | // In order to support recurrences we need to be able to vectorize Phi nodes. |
3902 | // Phi nodes have cycles, so we need to vectorize them in two stages. This is |
3903 | // stage #1: We create a new vector PHI node with no incoming edges. We'll use |
3904 | // this value when we vectorize all of the instructions that use the PHI. |
3905 | BasicBlock *VectorPH = |
3906 | State.CFG.VPBB2IRBB.at(Val: getParent()->getCFGPredecessor(Idx: 0)); |
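// In-loop reductions keep the reduction value scalar inside the loop, so they
// (like the VF == 1 case) use a scalar phi and start value; otherwise the phi
// and start value are widened to vectors.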
3907 | bool ScalarPHI = State.VF.isScalar() || IsInLoop; |
3908 | Value *StartV = State.get(Def: StartVPV, IsScalar: ScalarPHI); |
3909 | Type *VecTy = StartV->getType(); |
3910 | |
BasicBlock *HeaderBB = State.CFG.PrevBB;
3912 | assert(State.CurrentParentLoop->getHeader() == HeaderBB && |
3913 | "recipe must be in the vector loop header" ); |
3914 | auto *Phi = PHINode::Create(Ty: VecTy, NumReservedValues: 2, NameStr: "vec.phi" ); |
3915 | Phi->insertBefore(InsertPos: HeaderBB->getFirstInsertionPt()); |
3916 | State.set(Def: this, V: Phi, IsScalar: IsInLoop); |
3917 | |
3918 | Phi->addIncoming(V: StartV, BB: VectorPH); |
3919 | } |
3920 | |
3921 | #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) |
3922 | void VPReductionPHIRecipe::print(raw_ostream &O, const Twine &Indent, |
3923 | VPSlotTracker &SlotTracker) const { |
3924 | O << Indent << "WIDEN-REDUCTION-PHI " ; |
3925 | |
3926 | printAsOperand(O, SlotTracker); |
3927 | O << " = phi " ; |
3928 | printOperands(O, SlotTracker); |
3929 | if (VFScaleFactor != 1) |
3930 | O << " (VF scaled by 1/" << VFScaleFactor << ")" ; |
3931 | } |
3932 | #endif |
3933 | |
3934 | void VPWidenPHIRecipe::execute(VPTransformState &State) { |
3935 | Value *Op0 = State.get(Def: getOperand(N: 0)); |
3936 | Type *VecTy = Op0->getType(); |
3937 | Instruction *VecPhi = State.Builder.CreatePHI(Ty: VecTy, NumReservedValues: 2, Name); |
3938 | // Manually move it with the other PHIs in case PHI recipes above this one |
3939 | // also inserted non-phi instructions. |
3940 | // TODO: Remove once VPWidenPointerInductionRecipe is also expanded in |
3941 | // convertToConcreteRecipes. |
3942 | VecPhi->moveBefore(InsertPos: State.Builder.GetInsertBlock()->getFirstNonPHIIt()); |
3943 | State.set(Def: this, V: VecPhi); |
3944 | } |
3945 | |
3946 | #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) |
3947 | void VPWidenPHIRecipe::print(raw_ostream &O, const Twine &Indent, |
3948 | VPSlotTracker &SlotTracker) const { |
3949 | O << Indent << "WIDEN-PHI " ; |
3950 | |
3951 | printAsOperand(O, SlotTracker); |
3952 | O << " = phi " ; |
3953 | printPhiOperands(O, SlotTracker); |
3954 | } |
3955 | #endif |
3956 | |
3957 | // TODO: It would be good to use the existing VPWidenPHIRecipe instead and |
3958 | // remove VPActiveLaneMaskPHIRecipe. |
3959 | void VPActiveLaneMaskPHIRecipe::execute(VPTransformState &State) { |
3960 | BasicBlock *VectorPH = |
3961 | State.CFG.VPBB2IRBB.at(Val: getParent()->getCFGPredecessor(Idx: 0)); |
3962 | Value *StartMask = State.get(Def: getOperand(N: 0)); |
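// Create the mask phi in the loop header. Only the incoming value from the
// preheader is set here; the value coming from the latch is filled in after
// the vector loop body has been generated.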
3963 | PHINode *Phi = |
3964 | State.Builder.CreatePHI(Ty: StartMask->getType(), NumReservedValues: 2, Name: "active.lane.mask" ); |
3965 | Phi->addIncoming(V: StartMask, BB: VectorPH); |
3966 | State.set(Def: this, V: Phi); |
3967 | } |
3968 | |
3969 | #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) |
3970 | void VPActiveLaneMaskPHIRecipe::print(raw_ostream &O, const Twine &Indent, |
3971 | VPSlotTracker &SlotTracker) const { |
3972 | O << Indent << "ACTIVE-LANE-MASK-PHI " ; |
3973 | |
3974 | printAsOperand(O, SlotTracker); |
3975 | O << " = phi " ; |
3976 | printOperands(O, SlotTracker); |
3977 | } |
3978 | #endif |
3979 | |
3980 | #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) |
3981 | void VPEVLBasedIVPHIRecipe::print(raw_ostream &O, const Twine &Indent, |
3982 | VPSlotTracker &SlotTracker) const { |
3983 | O << Indent << "EXPLICIT-VECTOR-LENGTH-BASED-IV-PHI " ; |
3984 | |
3985 | printAsOperand(O, SlotTracker); |
3986 | O << " = phi " ; |
3987 | printOperands(O, SlotTracker); |
3988 | } |
3989 | #endif |
3990 | |