| 1 | //===- VPlanRecipes.cpp - Implementations for VPlan recipes ---------------===// |
| 2 | // |
| 3 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
| 4 | // See https://llvm.org/LICENSE.txt for license information. |
| 5 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
| 6 | // |
| 7 | //===----------------------------------------------------------------------===// |
| 8 | /// |
| 9 | /// \file |
| 10 | /// This file contains implementations for different VPlan recipes. |
| 11 | /// |
| 12 | //===----------------------------------------------------------------------===// |
| 13 | |
| 14 | #include "LoopVectorizationPlanner.h" |
| 15 | #include "VPlan.h" |
| 16 | #include "VPlanAnalysis.h" |
| 17 | #include "VPlanHelpers.h" |
| 18 | #include "VPlanPatternMatch.h" |
| 19 | #include "VPlanUtils.h" |
| 20 | #include "llvm/ADT/STLExtras.h" |
| 21 | #include "llvm/ADT/SmallVector.h" |
| 22 | #include "llvm/ADT/Twine.h" |
| 23 | #include "llvm/Analysis/AssumptionCache.h" |
| 24 | #include "llvm/Analysis/IVDescriptors.h" |
| 25 | #include "llvm/Analysis/LoopInfo.h" |
| 26 | #include "llvm/IR/BasicBlock.h" |
| 27 | #include "llvm/IR/IRBuilder.h" |
| 28 | #include "llvm/IR/Instruction.h" |
| 29 | #include "llvm/IR/Instructions.h" |
| 30 | #include "llvm/IR/Intrinsics.h" |
| 31 | #include "llvm/IR/Type.h" |
| 32 | #include "llvm/IR/Value.h" |
| 33 | #include "llvm/Support/Casting.h" |
| 34 | #include "llvm/Support/CommandLine.h" |
| 35 | #include "llvm/Support/Debug.h" |
| 36 | #include "llvm/Support/raw_ostream.h" |
| 37 | #include "llvm/Transforms/Utils/BasicBlockUtils.h" |
| 38 | #include "llvm/Transforms/Utils/LoopUtils.h" |
| 39 | #include "llvm/Transforms/Utils/LoopVersioning.h" |
| 40 | #include "llvm/Transforms/Utils/ScalarEvolutionExpander.h" |
| 41 | #include <cassert> |
| 42 | |
| 43 | using namespace llvm; |
| 44 | |
| 45 | using VectorParts = SmallVector<Value *, 2>; |
| 46 | |
| 47 | #define LV_NAME "loop-vectorize" |
| 48 | #define DEBUG_TYPE LV_NAME |
| 49 | |
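|  | // Conservatively determine whether executing a recipe may write to memory. |
|  | // Recipes without an explicit case fall through to the default and are |
|  | // treated as possibly writing, trading precision for correctness. |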
| 50 | bool VPRecipeBase::mayWriteToMemory() const { |
| 51 | switch (getVPDefID()) { |
| 52 | case VPExpressionSC: |
| 53 | return cast<VPExpressionRecipe>(Val: this)->mayReadOrWriteMemory(); |
| 54 | case VPInstructionSC: |
| 55 | return cast<VPInstruction>(Val: this)->opcodeMayReadOrWriteFromMemory(); |
| 56 | case VPInterleaveSC: |
| 57 | return cast<VPInterleaveRecipe>(Val: this)->getNumStoreOperands() > 0; |
| 58 | case VPWidenStoreEVLSC: |
| 59 | case VPWidenStoreSC: |
| 60 | return true; |
| 61 | case VPReplicateSC: |
| 62 | return cast<Instruction>(Val: getVPSingleValue()->getUnderlyingValue()) |
| 63 | ->mayWriteToMemory(); |
| 64 | case VPWidenCallSC: |
| 65 | return !cast<VPWidenCallRecipe>(Val: this) |
| 66 | ->getCalledScalarFunction() |
| 67 | ->onlyReadsMemory(); |
| 68 | case VPWidenIntrinsicSC: |
| 69 | return cast<VPWidenIntrinsicRecipe>(Val: this)->mayWriteToMemory(); |
| 70 | case VPBranchOnMaskSC: |
| 71 | case VPFirstOrderRecurrencePHISC: |
| 72 | case VPScalarIVStepsSC: |
| 73 | case VPPredInstPHISC: |
| 74 | return false; |
| 75 | case VPBlendSC: |
| 76 | case VPReductionEVLSC: |
| 77 | case VPReductionSC: |
| 78 | case VPVectorPointerSC: |
| 79 | case VPWidenCanonicalIVSC: |
| 80 | case VPWidenCastSC: |
| 81 | case VPWidenGEPSC: |
| 82 | case VPWidenIntOrFpInductionSC: |
| 83 | case VPWidenLoadEVLSC: |
| 84 | case VPWidenLoadSC: |
| 85 | case VPWidenPHISC: |
| 86 | case VPWidenSC: |
| 87 | case VPWidenSelectSC: { |
| 88 | const Instruction *I = |
| 89 | dyn_cast_or_null<Instruction>(Val: getVPSingleValue()->getUnderlyingValue()); |
| 90 | (void)I; |
| 91 | assert((!I || !I->mayWriteToMemory()) && |
| 92 | "underlying instruction may write to memory" ); |
| 93 | return false; |
| 94 | } |
| 95 | default: |
| 96 | return true; |
| 97 | } |
| 98 | } |
| 99 | |
| 100 | bool VPRecipeBase::mayReadFromMemory() const { |
| 101 | switch (getVPDefID()) { |
| 102 | case VPExpressionSC: |
| 103 | return cast<VPExpressionRecipe>(Val: this)->mayReadOrWriteMemory(); |
| 104 | case VPInstructionSC: |
| 105 | return cast<VPInstruction>(Val: this)->opcodeMayReadOrWriteFromMemory(); |
| 106 | case VPWidenLoadEVLSC: |
| 107 | case VPWidenLoadSC: |
| 108 | return true; |
| 109 | case VPReplicateSC: |
| 110 | return cast<Instruction>(Val: getVPSingleValue()->getUnderlyingValue()) |
| 111 | ->mayReadFromMemory(); |
| 112 | case VPWidenCallSC: |
| 113 | return !cast<VPWidenCallRecipe>(Val: this) |
| 114 | ->getCalledScalarFunction() |
| 115 | ->onlyWritesMemory(); |
| 116 | case VPWidenIntrinsicSC: |
| 117 | return cast<VPWidenIntrinsicRecipe>(Val: this)->mayReadFromMemory(); |
| 118 | case VPBranchOnMaskSC: |
| 119 | case VPFirstOrderRecurrencePHISC: |
| 120 | case VPPredInstPHISC: |
| 121 | case VPScalarIVStepsSC: |
| 122 | case VPWidenStoreEVLSC: |
| 123 | case VPWidenStoreSC: |
| 124 | return false; |
| 125 | case VPBlendSC: |
| 126 | case VPReductionEVLSC: |
| 127 | case VPReductionSC: |
| 128 | case VPVectorPointerSC: |
| 129 | case VPWidenCanonicalIVSC: |
| 130 | case VPWidenCastSC: |
| 131 | case VPWidenGEPSC: |
| 132 | case VPWidenIntOrFpInductionSC: |
| 133 | case VPWidenPHISC: |
| 134 | case VPWidenSC: |
| 135 | case VPWidenSelectSC: { |
| 136 | const Instruction *I = |
| 137 | dyn_cast_or_null<Instruction>(Val: getVPSingleValue()->getUnderlyingValue()); |
| 138 | (void)I; |
| 139 | assert((!I || !I->mayReadFromMemory()) && |
| 140 | "underlying instruction may read from memory" ); |
| 141 | return false; |
| 142 | } |
| 143 | default: |
| 144 | return true; |
| 145 | } |
| 146 | } |
| 147 | |
| 148 | bool VPRecipeBase::mayHaveSideEffects() const { |
| 149 | switch (getVPDefID()) { |
| 150 | case VPExpressionSC: |
| 151 | return cast<VPExpressionRecipe>(Val: this)->mayHaveSideEffects(); |
| 152 | case VPDerivedIVSC: |
| 153 | case VPFirstOrderRecurrencePHISC: |
| 154 | case VPPredInstPHISC: |
| 155 | case VPVectorEndPointerSC: |
| 156 | return false; |
| 157 | case VPInstructionSC: |
| 158 | return mayWriteToMemory(); |
| 159 | case VPWidenCallSC: { |
| 160 | Function *Fn = cast<VPWidenCallRecipe>(Val: this)->getCalledScalarFunction(); |
| 161 | return mayWriteToMemory() || !Fn->doesNotThrow() || !Fn->willReturn(); |
| 162 | } |
| 163 | case VPWidenIntrinsicSC: |
| 164 | return cast<VPWidenIntrinsicRecipe>(Val: this)->mayHaveSideEffects(); |
| 165 | case VPBlendSC: |
| 166 | case VPReductionEVLSC: |
| 167 | case VPReductionSC: |
| 168 | case VPScalarIVStepsSC: |
| 169 | case VPVectorPointerSC: |
| 170 | case VPWidenCanonicalIVSC: |
| 171 | case VPWidenCastSC: |
| 172 | case VPWidenGEPSC: |
| 173 | case VPWidenIntOrFpInductionSC: |
| 174 | case VPWidenPHISC: |
| 175 | case VPWidenPointerInductionSC: |
| 176 | case VPWidenSC: |
| 177 | case VPWidenSelectSC: { |
| 178 | const Instruction *I = |
| 179 | dyn_cast_or_null<Instruction>(Val: getVPSingleValue()->getUnderlyingValue()); |
| 180 | (void)I; |
| 181 | assert((!I || !I->mayHaveSideEffects()) && |
| 182 | "underlying instruction has side-effects" ); |
| 183 | return false; |
| 184 | } |
| 185 | case VPInterleaveSC: |
| 186 | return mayWriteToMemory(); |
| 187 | case VPWidenLoadEVLSC: |
| 188 | case VPWidenLoadSC: |
| 189 | case VPWidenStoreEVLSC: |
| 190 | case VPWidenStoreSC: |
| 191 | assert( |
| 192 | cast<VPWidenMemoryRecipe>(this)->getIngredient().mayHaveSideEffects() == |
| 193 | mayWriteToMemory() && |
| 194 | "mayHaveSideffects result for ingredient differs from this " |
| 195 | "implementation" ); |
| 196 | return mayWriteToMemory(); |
| 197 | case VPReplicateSC: { |
| 198 | auto *R = cast<VPReplicateRecipe>(Val: this); |
| 199 | return R->getUnderlyingInstr()->mayHaveSideEffects(); |
| 200 | } |
| 201 | default: |
| 202 | return true; |
| 203 | } |
| 204 | } |
| 205 | |
| 206 | void VPRecipeBase::insertBefore(VPRecipeBase *InsertPos) { |
| 207 | assert(!Parent && "Recipe already in some VPBasicBlock" ); |
| 208 | assert(InsertPos->getParent() && |
| 209 | "Insertion position not in any VPBasicBlock" ); |
| 210 | InsertPos->getParent()->insert(Recipe: this, InsertPt: InsertPos->getIterator()); |
| 211 | } |
| 212 | |
| 213 | void VPRecipeBase::insertBefore(VPBasicBlock &BB, |
| 214 | iplist<VPRecipeBase>::iterator I) { |
| 215 | assert(!Parent && "Recipe already in some VPBasicBlock" ); |
| 216 | assert(I == BB.end() || I->getParent() == &BB); |
| 217 | BB.insert(Recipe: this, InsertPt: I); |
| 218 | } |
| 219 | |
| 220 | void VPRecipeBase::insertAfter(VPRecipeBase *InsertPos) { |
| 221 | assert(!Parent && "Recipe already in some VPBasicBlock" ); |
| 222 | assert(InsertPos->getParent() && |
| 223 | "Insertion position not in any VPBasicBlock" ); |
| 224 | InsertPos->getParent()->insert(Recipe: this, InsertPt: std::next(x: InsertPos->getIterator())); |
| 225 | } |
| 226 | |
| 227 | void VPRecipeBase::removeFromParent() { |
| 228 | assert(getParent() && "Recipe not in any VPBasicBlock" ); |
| 229 | getParent()->getRecipeList().remove(IT: getIterator()); |
| 230 | Parent = nullptr; |
| 231 | } |
| 232 | |
| 233 | iplist<VPRecipeBase>::iterator VPRecipeBase::eraseFromParent() { |
| 234 | assert(getParent() && "Recipe not in any VPBasicBlock" ); |
| 235 | return getParent()->getRecipeList().erase(where: getIterator()); |
| 236 | } |
| 237 | |
| 238 | void VPRecipeBase::moveAfter(VPRecipeBase *InsertPos) { |
| 239 | removeFromParent(); |
| 240 | insertAfter(InsertPos); |
| 241 | } |
| 242 | |
| 243 | void VPRecipeBase::moveBefore(VPBasicBlock &BB, |
| 244 | iplist<VPRecipeBase>::iterator I) { |
| 245 | removeFromParent(); |
| 246 | insertBefore(BB, I); |
| 247 | } |
| 248 | |
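|  | // Compute the cost of a recipe for the given VF, skipping recipes whose |
|  | // underlying instruction the cost context says to skip and honoring |
|  | // -force-target-instruction-cost when it is set. |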
| 249 | InstructionCost VPRecipeBase::cost(ElementCount VF, VPCostContext &Ctx) { |
| 250 | // Get the underlying instruction for the recipe, if there is one. It is used |
| 251 | // to |
| 252 | // * decide if cost computation should be skipped for this recipe, |
| 253 | // * apply forced target instruction cost. |
| 254 | Instruction *UI = nullptr; |
| 255 | if (auto *S = dyn_cast<VPSingleDefRecipe>(Val: this)) |
| 256 | UI = dyn_cast_or_null<Instruction>(Val: S->getUnderlyingValue()); |
| 257 | else if (auto *IG = dyn_cast<VPInterleaveRecipe>(Val: this)) |
| 258 | UI = IG->getInsertPos(); |
| 259 | else if (auto *WidenMem = dyn_cast<VPWidenMemoryRecipe>(Val: this)) |
| 260 | UI = &WidenMem->getIngredient(); |
| 261 | |
| 262 | InstructionCost RecipeCost; |
| 263 | if (UI && Ctx.skipCostComputation(UI, IsVector: VF.isVector())) { |
| 264 | RecipeCost = 0; |
| 265 | } else { |
| 266 | RecipeCost = computeCost(VF, Ctx); |
| 267 | if (UI && ForceTargetInstructionCost.getNumOccurrences() > 0 && |
| 268 | RecipeCost.isValid()) |
| 269 | RecipeCost = InstructionCost(ForceTargetInstructionCost); |
| 270 | } |
| 271 | |
| 272 | LLVM_DEBUG({ |
| 273 | dbgs() << "Cost of " << RecipeCost << " for VF " << VF << ": " ; |
| 274 | dump(); |
| 275 | }); |
| 276 | return RecipeCost; |
| 277 | } |
| 278 | |
| 279 | InstructionCost VPRecipeBase::computeCost(ElementCount VF, |
| 280 | VPCostContext &Ctx) const { |
| 281 | llvm_unreachable("subclasses should implement computeCost" ); |
| 282 | } |
| 283 | |
| 284 | bool VPRecipeBase::isPhi() const { |
| 285 | return (getVPDefID() >= VPFirstPHISC && getVPDefID() <= VPLastPHISC) || |
| 286 | (isa<VPInstruction>(Val: this) && |
| 287 | cast<VPInstruction>(Val: this)->getOpcode() == Instruction::PHI) || |
| 288 | isa<VPIRPhi>(Val: this); |
| 289 | } |
| 290 | |
| 291 | bool VPRecipeBase::isScalarCast() const { |
| 292 | auto *VPI = dyn_cast<VPInstruction>(Val: this); |
| 293 | return VPI && Instruction::isCast(Opcode: VPI->getOpcode()); |
| 294 | } |
| 295 | |
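|  | // Gather the accumulator type, input types, extend kinds and inner binary |
|  | // opcode feeding the partial reduction, so the target hook |
|  | // getPartialReductionCost can be queried with the full shape of the pattern. |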
| 296 | InstructionCost |
| 297 | VPPartialReductionRecipe::computeCost(ElementCount VF, |
| 298 | VPCostContext &Ctx) const { |
| 299 | std::optional<unsigned> Opcode; |
| 300 | VPValue *Op = getOperand(N: 0); |
| 301 | VPRecipeBase *OpR = Op->getDefiningRecipe(); |
| 302 | |
| 303 | // If the partial reduction is predicated, a select will be operand 0 |
| 304 | using namespace llvm::VPlanPatternMatch; |
| 305 | if (match(V: getOperand(N: 1), P: m_Select(Op0: m_VPValue(), Op1: m_VPValue(V&: Op), Op2: m_VPValue()))) { |
| 306 | OpR = Op->getDefiningRecipe(); |
| 307 | } |
| 308 | |
| 309 | Type *InputTypeA = nullptr, *InputTypeB = nullptr; |
| 310 | TTI::PartialReductionExtendKind ExtAType = TTI::PR_None, |
| 311 | ExtBType = TTI::PR_None; |
| 312 | |
| 313 | auto GetExtendKind = [](VPRecipeBase *R) { |
| 314 | if (!R) |
| 315 | return TTI::PR_None; |
| 316 | auto *WidenCastR = dyn_cast<VPWidenCastRecipe>(Val: R); |
| 317 | if (!WidenCastR) |
| 318 | return TTI::PR_None; |
| 319 | if (WidenCastR->getOpcode() == Instruction::CastOps::ZExt) |
| 320 | return TTI::PR_ZeroExtend; |
| 321 | if (WidenCastR->getOpcode() == Instruction::CastOps::SExt) |
| 322 | return TTI::PR_SignExtend; |
| 323 | return TTI::PR_None; |
| 324 | }; |
| 325 | |
| 326 | // Pick out the opcode and type/extension information from a widen recipe, |
| 327 | // looking through a negation (sub 0, x) feeding the reduction if present. |
| 328 | auto HandleWiden = [&](VPWidenRecipe *Widen) { |
| 329 | if (match(V: Widen, |
| 330 | P: m_Binary<Instruction::Sub>(Op0: m_SpecificInt(V: 0), Op1: m_VPValue(V&: Op)))) { |
| 331 | Widen = dyn_cast<VPWidenRecipe>(Val: Op->getDefiningRecipe()); |
| 332 | } |
| 333 | Opcode = Widen->getOpcode(); |
| 334 | VPRecipeBase *ExtAR = Widen->getOperand(N: 0)->getDefiningRecipe(); |
| 335 | VPRecipeBase *ExtBR = Widen->getOperand(N: 1)->getDefiningRecipe(); |
| 336 | InputTypeA = Ctx.Types.inferScalarType(V: ExtAR ? ExtAR->getOperand(N: 0) |
| 337 | : Widen->getOperand(N: 0)); |
| 338 | InputTypeB = Ctx.Types.inferScalarType(V: ExtBR ? ExtBR->getOperand(N: 0) |
| 339 | : Widen->getOperand(N: 1)); |
| 340 | ExtAType = GetExtendKind(ExtAR); |
| 341 | ExtBType = GetExtendKind(ExtBR); |
| 342 | }; |
| 343 | |
| 344 | if (isa<VPWidenCastRecipe>(Val: OpR)) { |
| 345 | InputTypeA = Ctx.Types.inferScalarType(V: OpR->getOperand(N: 0)); |
| 346 | ExtAType = GetExtendKind(OpR); |
| 347 | } else if (isa<VPReductionPHIRecipe>(Val: OpR)) { |
| 348 | auto RedPhiOp1R = getOperand(N: 1)->getDefiningRecipe(); |
| 349 | if (isa<VPWidenCastRecipe>(Val: RedPhiOp1R)) { |
| 350 | InputTypeA = Ctx.Types.inferScalarType(V: RedPhiOp1R->getOperand(N: 0)); |
| 351 | ExtAType = GetExtendKind(RedPhiOp1R); |
| 352 | } else if (auto Widen = dyn_cast<VPWidenRecipe>(Val: RedPhiOp1R)) |
| 353 | HandleWiden(Widen); |
| 354 | } else if (auto Widen = dyn_cast<VPWidenRecipe>(Val: OpR)) { |
| 355 | HandleWiden(Widen); |
| 356 | } else if (auto Reduction = dyn_cast<VPPartialReductionRecipe>(Val: OpR)) { |
| 357 | return Reduction->computeCost(VF, Ctx); |
| 358 | } |
| 359 | auto *PhiType = Ctx.Types.inferScalarType(V: getOperand(N: 1)); |
| 360 | return Ctx.TTI.getPartialReductionCost(Opcode: getOpcode(), InputTypeA, InputTypeB, |
| 361 | AccumType: PhiType, VF, OpAExtend: ExtAType, OpBExtend: ExtBType, |
| 362 | BinOp: Opcode, CostKind: Ctx.CostKind); |
| 363 | } |
| 364 | |
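|  | // A partial reduction adds the wider bin-op vector into the narrower |
|  | // accumulator, e.g. for a <4 x i32> accumulator and a <16 x i32> input this |
|  | // emits roughly: |
|  | //   %partial.reduce = call <4 x i32> |
|  | //       @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32( |
|  | //       <4 x i32> %phi, <16 x i32> %bin.op) |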
| 365 | void VPPartialReductionRecipe::execute(VPTransformState &State) { |
| 366 | auto &Builder = State.Builder; |
| 367 | |
| 368 | assert(getOpcode() == Instruction::Add && |
| 369 | "Unhandled partial reduction opcode" ); |
| 370 | |
| 371 | Value *BinOpVal = State.get(Def: getOperand(N: 1)); |
| 372 | Value *PhiVal = State.get(Def: getOperand(N: 0)); |
| 373 | assert(PhiVal && BinOpVal && "Phi and Mul must be set" ); |
| 374 | |
| 375 | Type *RetTy = PhiVal->getType(); |
| 376 | |
| 377 | CallInst *V = Builder.CreateIntrinsic( |
| 378 | RetTy, ID: Intrinsic::experimental_vector_partial_reduce_add, |
| 379 | Args: {PhiVal, BinOpVal}, FMFSource: nullptr, Name: "partial.reduce" ); |
| 380 | |
| 381 | State.set(Def: this, V); |
| 382 | } |
| 383 | |
| 384 | #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) |
| 385 | void VPPartialReductionRecipe::print(raw_ostream &O, const Twine &Indent, |
| 386 | VPSlotTracker &SlotTracker) const { |
| 387 | O << Indent << "PARTIAL-REDUCE " ; |
| 388 | printAsOperand(O, SlotTracker); |
| 389 | O << " = " << Instruction::getOpcodeName(getOpcode()) << " " ; |
| 390 | printOperands(O, SlotTracker); |
| 391 | } |
| 392 | #endif |
| 393 | |
| 394 | FastMathFlags VPIRFlags::getFastMathFlags() const { |
| 395 | assert(OpType == OperationType::FPMathOp && |
| 396 | "recipe doesn't have fast math flags" ); |
| 397 | FastMathFlags Res; |
| 398 | Res.setAllowReassoc(FMFs.AllowReassoc); |
| 399 | Res.setNoNaNs(FMFs.NoNaNs); |
| 400 | Res.setNoInfs(FMFs.NoInfs); |
| 401 | Res.setNoSignedZeros(FMFs.NoSignedZeros); |
| 402 | Res.setAllowReciprocal(FMFs.AllowReciprocal); |
| 403 | Res.setAllowContract(FMFs.AllowContract); |
| 404 | Res.setApproxFunc(FMFs.ApproxFunc); |
| 405 | return Res; |
| 406 | } |
| 407 | |
| 408 | #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) |
| 409 | void VPSingleDefRecipe::dump() const { VPDef::dump(); } |
| 410 | #endif |
| 411 | |
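|  | // Accessors for the optional unroll-part operand that unrolling appends to a |
|  | // recipe; recipes without the extra operand belong to part 0. |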
| 412 | template <unsigned PartOpIdx> |
| 413 | VPValue * |
| 414 | VPUnrollPartAccessor<PartOpIdx>::getUnrollPartOperand(VPUser &U) const { |
| 415 | if (U.getNumOperands() == PartOpIdx + 1) |
| 416 | return U.getOperand(N: PartOpIdx); |
| 417 | return nullptr; |
| 418 | } |
| 419 | |
| 420 | template <unsigned PartOpIdx> |
| 421 | unsigned VPUnrollPartAccessor<PartOpIdx>::getUnrollPart(VPUser &U) const { |
| 422 | if (auto *UnrollPartOp = getUnrollPartOperand(U)) |
| 423 | return cast<ConstantInt>(UnrollPartOp->getLiveInIRValue())->getZExtValue(); |
| 424 | return 0; |
| 425 | } |
| 426 | |
| 427 | namespace llvm { |
| 428 | template class VPUnrollPartAccessor<2>; |
| 429 | template class VPUnrollPartAccessor<3>; |
| 430 | } // namespace llvm |
| 431 | |
| 432 | VPInstruction::VPInstruction(unsigned Opcode, ArrayRef<VPValue *> Operands, |
| 433 | const VPIRFlags &Flags, DebugLoc DL, |
| 434 | const Twine &Name) |
| 435 | : VPRecipeWithIRFlags(VPDef::VPInstructionSC, Operands, Flags, DL), |
| 436 | VPIRMetadata(), Opcode(Opcode), Name(Name.str()) { |
| 437 | assert(flagsValidForOpcode(getOpcode()) && |
| 438 | "Set flags not supported for the provided opcode" ); |
| 439 | assert((getNumOperandsForOpcode(Opcode) == -1u || |
| 440 | getNumOperandsForOpcode(Opcode) == getNumOperands()) && |
| 441 | "number of operands does not match opcode" ); |
| 442 | } |
| 443 | |
| 444 | #ifndef NDEBUG |
| 445 | unsigned VPInstruction::getNumOperandsForOpcode(unsigned Opcode) { |
| 446 | if (Instruction::isUnaryOp(Opcode) || Instruction::isCast(Opcode)) |
| 447 | return 1; |
| 448 | |
| 449 | if (Instruction::isBinaryOp(Opcode)) |
| 450 | return 2; |
| 451 | |
| 452 | switch (Opcode) { |
| 453 | case VPInstruction::StepVector: |
| 454 | return 0; |
| 455 | case Instruction::Alloca: |
| 456 | case Instruction::ExtractValue: |
| 457 | case Instruction::Freeze: |
| 458 | case Instruction::Load: |
| 459 | case VPInstruction::AnyOf: |
| 460 | case VPInstruction::BranchOnCond: |
| 461 | case VPInstruction::CalculateTripCountMinusVF: |
| 462 | case VPInstruction::CanonicalIVIncrementForPart: |
| 463 | case VPInstruction::ExplicitVectorLength: |
| 464 | case VPInstruction::ExtractLastElement: |
| 465 | case VPInstruction::ExtractPenultimateElement: |
| 466 | case VPInstruction::FirstActiveLane: |
| 467 | case VPInstruction::Not: |
| 468 | return 1; |
| 469 | case Instruction::ICmp: |
| 470 | case Instruction::FCmp: |
| 471 | case Instruction::Store: |
| 472 | case VPInstruction::ActiveLaneMask: |
| 473 | case VPInstruction::BranchOnCount: |
| 474 | case VPInstruction::ComputeReductionResult: |
| 475 | case VPInstruction::FirstOrderRecurrenceSplice: |
| 476 | case VPInstruction::LogicalAnd: |
| 477 | case VPInstruction::PtrAdd: |
| 478 | case VPInstruction::WideIVStep: |
| 479 | return 2; |
| 480 | case Instruction::Select: |
| 481 | case VPInstruction::ComputeAnyOfResult: |
| 482 | case VPInstruction::ReductionStartVector: |
| 483 | return 3; |
| 484 | case VPInstruction::ComputeFindIVResult: |
| 485 | return 4; |
| 486 | case Instruction::Call: |
| 487 | case Instruction::GetElementPtr: |
| 488 | case Instruction::PHI: |
| 489 | case Instruction::Switch: |
| 490 | // Cannot determine the number of operands from the opcode. |
| 491 | return -1u; |
| 492 | } |
| 493 | llvm_unreachable("all cases should be handled above" ); |
| 494 | } |
| 495 | #endif |
| 496 | |
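|  | // A VPInstruction generates per-lane scalars only for PtrAdds used beyond |
|  | // the first lane; most other opcodes can produce a single scalar when just |
|  | // the first lane is demanded. |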
| 497 | bool VPInstruction::doesGeneratePerAllLanes() const { |
| 498 | return Opcode == VPInstruction::PtrAdd && !vputils::onlyFirstLaneUsed(Def: this); |
| 499 | } |
| 500 | |
| 501 | bool VPInstruction::canGenerateScalarForFirstLane() const { |
| 502 | if (Instruction::isBinaryOp(Opcode: getOpcode()) || Instruction::isCast(Opcode: getOpcode())) |
| 503 | return true; |
| 504 | if (isSingleScalar() || isVectorToScalar()) |
| 505 | return true; |
| 506 | switch (Opcode) { |
| 507 | case Instruction::Freeze: |
| 508 | case Instruction::ICmp: |
| 509 | case Instruction::PHI: |
| 510 | case Instruction::Select: |
| 511 | case VPInstruction::BranchOnCond: |
| 512 | case VPInstruction::BranchOnCount: |
| 513 | case VPInstruction::CalculateTripCountMinusVF: |
| 514 | case VPInstruction::CanonicalIVIncrementForPart: |
| 515 | case VPInstruction::PtrAdd: |
| 516 | case VPInstruction::ExplicitVectorLength: |
| 517 | case VPInstruction::AnyOf: |
| 518 | return true; |
| 519 | default: |
| 520 | return false; |
| 521 | } |
| 522 | } |
| 523 | |
| 524 | Value *VPInstruction::generatePerLane(VPTransformState &State, |
| 525 | const VPLane &Lane) { |
| 526 | IRBuilderBase &Builder = State.Builder; |
| 527 | |
| 528 | assert(getOpcode() == VPInstruction::PtrAdd && |
| 529 | "only PtrAdd opcodes are supported for now" ); |
| 530 | return Builder.CreatePtrAdd(Ptr: State.get(Def: getOperand(N: 0), Lane), |
| 531 | Offset: State.get(Def: getOperand(N: 1), Lane), Name); |
| 532 | } |
| 533 | |
| 534 | /// Create a conditional branch using \p Cond branching to the successors of \p |
| 535 | /// VPBB. Note that the first successor is always forward (i.e. not created yet) |
| 536 | /// while the second successor may already have been created (if it is a header |
| 537 | /// block and VPBB is a latch). |
| 538 | static BranchInst *createCondBranch(Value *Cond, VPBasicBlock *VPBB, |
| 539 | VPTransformState &State) { |
| 540 | // Replace the temporary unreachable terminator with a new conditional |
| 541 | // branch, hooking it up to backward destination (header) for latch blocks |
| 542 | // now, and to forward destination(s) later when they are created. |
| 543 | // Second successor may be backwards - iff it is already in VPBB2IRBB. |
| 544 | VPBasicBlock *SecondVPSucc = cast<VPBasicBlock>(Val: VPBB->getSuccessors()[1]); |
| 545 | BasicBlock *SecondIRSucc = State.CFG.VPBB2IRBB.lookup(Val: SecondVPSucc); |
| 546 | BasicBlock *IRBB = State.CFG.VPBB2IRBB[VPBB]; |
| 547 | BranchInst *CondBr = State.Builder.CreateCondBr(Cond, True: IRBB, False: SecondIRSucc); |
| 548 | // The first successor is always forward; reset it to nullptr for now. |
| 549 | CondBr->setSuccessor(idx: 0, NewSucc: nullptr); |
| 550 | IRBB->getTerminator()->eraseFromParent(); |
| 551 | return CondBr; |
| 552 | } |
| 553 | |
| 554 | Value *VPInstruction::generate(VPTransformState &State) { |
| 555 | IRBuilderBase &Builder = State.Builder; |
| 556 | |
| 557 | if (Instruction::isBinaryOp(Opcode: getOpcode())) { |
| 558 | bool OnlyFirstLaneUsed = vputils::onlyFirstLaneUsed(Def: this); |
| 559 | Value *A = State.get(Def: getOperand(N: 0), IsScalar: OnlyFirstLaneUsed); |
| 560 | Value *B = State.get(Def: getOperand(N: 1), IsScalar: OnlyFirstLaneUsed); |
| 561 | auto *Res = |
| 562 | Builder.CreateBinOp(Opc: (Instruction::BinaryOps)getOpcode(), LHS: A, RHS: B, Name); |
| 563 | if (auto *I = dyn_cast<Instruction>(Val: Res)) |
| 564 | applyFlags(I&: *I); |
| 565 | return Res; |
| 566 | } |
| 567 | |
| 568 | switch (getOpcode()) { |
| 569 | case VPInstruction::Not: { |
| 570 | Value *A = State.get(Def: getOperand(N: 0)); |
| 571 | return Builder.CreateNot(V: A, Name); |
| 572 | } |
| 573 | case Instruction::ExtractElement: { |
| 574 | assert(State.VF.isVector() && "Only extract elements from vectors" ); |
| 575 | if (getOperand(N: 1)->isLiveIn()) { |
| 576 | unsigned IdxToExtract = |
| 577 | cast<ConstantInt>(Val: getOperand(N: 1)->getLiveInIRValue())->getZExtValue(); |
| 578 | return State.get(Def: getOperand(N: 0), Lane: VPLane(IdxToExtract)); |
| 579 | } |
| 580 | Value *Vec = State.get(Def: getOperand(N: 0)); |
| 581 | Value *Idx = State.get(Def: getOperand(N: 1), /*IsScalar=*/true); |
| 582 | return Builder.CreateExtractElement(Vec, Idx, Name); |
| 583 | } |
| 584 | case Instruction::Freeze: { |
| 585 | Value *Op = State.get(Def: getOperand(N: 0), IsScalar: vputils::onlyFirstLaneUsed(Def: this)); |
| 586 | return Builder.CreateFreeze(V: Op, Name); |
| 587 | } |
| 588 | case Instruction::ICmp: { |
| 589 | bool OnlyFirstLaneUsed = vputils::onlyFirstLaneUsed(Def: this); |
| 590 | Value *A = State.get(Def: getOperand(N: 0), IsScalar: OnlyFirstLaneUsed); |
| 591 | Value *B = State.get(Def: getOperand(N: 1), IsScalar: OnlyFirstLaneUsed); |
| 592 | return Builder.CreateCmp(Pred: getPredicate(), LHS: A, RHS: B, Name); |
| 593 | } |
| 594 | case Instruction::PHI: { |
| 595 | llvm_unreachable("should be handled by VPPhi::execute" ); |
| 596 | } |
| 597 | case Instruction::Select: { |
| 598 | bool OnlyFirstLaneUsed = vputils::onlyFirstLaneUsed(Def: this); |
| 599 | Value *Cond = State.get(Def: getOperand(N: 0), IsScalar: OnlyFirstLaneUsed); |
| 600 | Value *Op1 = State.get(Def: getOperand(N: 1), IsScalar: OnlyFirstLaneUsed); |
| 601 | Value *Op2 = State.get(Def: getOperand(N: 2), IsScalar: OnlyFirstLaneUsed); |
| 602 | return Builder.CreateSelect(C: Cond, True: Op1, False: Op2, Name); |
| 603 | } |
| 604 | case VPInstruction::ActiveLaneMask: { |
| 605 | // Get first lane of vector induction variable. |
| 606 | Value *VIVElem0 = State.get(Def: getOperand(N: 0), Lane: VPLane(0)); |
| 607 | // Get the original loop tripcount. |
| 608 | Value *ScalarTC = State.get(Def: getOperand(N: 1), Lane: VPLane(0)); |
| 609 | |
| 610 | // If this part of the active lane mask is scalar, generate the CMP directly |
| 611 | // to avoid unnecessary extracts. |
| 612 | if (State.VF.isScalar()) |
| 613 | return Builder.CreateCmp(Pred: CmpInst::Predicate::ICMP_ULT, LHS: VIVElem0, RHS: ScalarTC, |
| 614 | Name); |
| 615 | |
| 616 | auto *Int1Ty = Type::getInt1Ty(C&: Builder.getContext()); |
| 617 | auto *PredTy = VectorType::get(ElementType: Int1Ty, EC: State.VF); |
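|  | // e.g. for VF=4 and an i64 trip count this emits roughly: |
|  | //   %alm = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i64(i64 %iv, i64 %tc) |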
| 618 | return Builder.CreateIntrinsic(ID: Intrinsic::get_active_lane_mask, |
| 619 | Types: {PredTy, ScalarTC->getType()}, |
| 620 | Args: {VIVElem0, ScalarTC}, FMFSource: nullptr, Name); |
| 621 | } |
| 622 | case VPInstruction::FirstOrderRecurrenceSplice: { |
| 623 | // Generate code to combine the previous and current values in vector v3. |
| 624 | // |
| 625 | // vector.ph: |
| 626 | // v_init = vector(..., ..., ..., a[-1]) |
| 627 | // br vector.body |
| 628 | // |
| 629 | // vector.body |
| 630 | // i = phi [0, vector.ph], [i+4, vector.body] |
| 631 | // v1 = phi [v_init, vector.ph], [v2, vector.body] |
| 632 | // v2 = a[i, i+1, i+2, i+3]; |
| 633 | // v3 = vector(v1(3), v2(0, 1, 2)) |
| 634 | |
| 635 | auto *V1 = State.get(Def: getOperand(N: 0)); |
| 636 | if (!V1->getType()->isVectorTy()) |
| 637 | return V1; |
| 638 | Value *V2 = State.get(Def: getOperand(N: 1)); |
| 639 | return Builder.CreateVectorSplice(V1, V2, Imm: -1, Name); |
| 640 | } |
| 641 | case VPInstruction::CalculateTripCountMinusVF: { |
| 642 | unsigned UF = getParent()->getPlan()->getUF(); |
| 643 | Value *ScalarTC = State.get(Def: getOperand(N: 0), Lane: VPLane(0)); |
| 644 | Value *Step = createStepForVF(B&: Builder, Ty: ScalarTC->getType(), VF: State.VF, Step: UF); |
| 645 | Value *Sub = Builder.CreateSub(LHS: ScalarTC, RHS: Step); |
| 646 | Value *Cmp = Builder.CreateICmp(P: CmpInst::Predicate::ICMP_UGT, LHS: ScalarTC, RHS: Step); |
| 647 | Value *Zero = ConstantInt::get(Ty: ScalarTC->getType(), V: 0); |
| 648 | return Builder.CreateSelect(C: Cmp, True: Sub, False: Zero); |
| 649 | } |
| 650 | case VPInstruction::ExplicitVectorLength: { |
| 651 | // TODO: Restructure this code with an explicit remainder loop, so that the |
| 652 | // vsetvli can be moved outside of the main loop. |
| 653 | Value *AVL = State.get(Def: getOperand(N: 0), /*IsScalar*/ true); |
| 654 | // Compute EVL |
| 655 | assert(AVL->getType()->isIntegerTy() && |
| 656 | "Requested vector length should be an integer." ); |
| 657 | |
| 658 | assert(State.VF.isScalable() && "Expected scalable vector factor." ); |
| 659 | Value *VFArg = State.Builder.getInt32(C: State.VF.getKnownMinValue()); |
| 660 | |
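|  | // e.g. for a scalable VF of 4 this emits roughly: |
|  | //   %evl = call i32 @llvm.experimental.get.vector.length.i64( |
|  | //       i64 %avl, i32 4, i1 true) |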
| 661 | Value *EVL = State.Builder.CreateIntrinsic( |
| 662 | RetTy: State.Builder.getInt32Ty(), ID: Intrinsic::experimental_get_vector_length, |
| 663 | Args: {AVL, VFArg, State.Builder.getTrue()}); |
| 664 | return EVL; |
| 665 | } |
| 666 | case VPInstruction::CanonicalIVIncrementForPart: { |
| 667 | unsigned Part = getUnrollPart(U&: *this); |
| 668 | auto *IV = State.get(Def: getOperand(N: 0), Lane: VPLane(0)); |
| 669 | assert(Part != 0 && "Must have a positive part" ); |
| 670 | // The canonical IV is incremented by the vectorization factor (num of |
| 671 | // SIMD elements) times the unroll part. |
| 672 | Value *Step = createStepForVF(B&: Builder, Ty: IV->getType(), VF: State.VF, Step: Part); |
| 673 | return Builder.CreateAdd(LHS: IV, RHS: Step, Name, HasNUW: hasNoUnsignedWrap(), |
| 674 | HasNSW: hasNoSignedWrap()); |
| 675 | } |
| 676 | case VPInstruction::BranchOnCond: { |
| 677 | Value *Cond = State.get(Def: getOperand(N: 0), Lane: VPLane(0)); |
| 678 | auto *Br = createCondBranch(Cond, VPBB: getParent(), State); |
| 679 | applyMetadata(I&: *Br); |
| 680 | return Br; |
| 681 | } |
| 682 | case VPInstruction::BranchOnCount: { |
| 683 | // First create the compare. |
| 684 | Value *IV = State.get(Def: getOperand(N: 0), /*IsScalar*/ true); |
| 685 | Value *TC = State.get(Def: getOperand(N: 1), /*IsScalar*/ true); |
| 686 | Value *Cond = Builder.CreateICmpEQ(LHS: IV, RHS: TC); |
| 687 | return createCondBranch(Cond, VPBB: getParent(), State); |
| 688 | } |
| 689 | case VPInstruction::Broadcast: { |
| 690 | return Builder.CreateVectorSplat( |
| 691 | EC: State.VF, V: State.get(Def: getOperand(N: 0), /*IsScalar*/ true), Name: "broadcast" ); |
| 692 | } |
| 693 | case VPInstruction::BuildStructVector: { |
| 694 | // For struct types, we need to build a new 'wide' struct type, where each |
| 695 | // element is widened, i.e., we create a struct of vectors. |
| 696 | auto *StructTy = |
| 697 | cast<StructType>(Val: State.TypeAnalysis.inferScalarType(V: getOperand(N: 0))); |
| 698 | Value *Res = PoisonValue::get(T: toVectorizedTy(Ty: StructTy, EC: State.VF)); |
| 699 | for (const auto &[LaneIndex, Op] : enumerate(First: operands())) { |
| 700 | for (unsigned FieldIndex = 0; FieldIndex != StructTy->getNumElements(); |
| 701 | FieldIndex++) { |
| 702 | Value *ScalarValue = |
| 703 | Builder.CreateExtractValue(Agg: State.get(Def: Op, IsScalar: true), Idxs: FieldIndex); |
| 704 | Value *VectorValue = Builder.CreateExtractValue(Agg: Res, Idxs: FieldIndex); |
| 705 | VectorValue = |
| 706 | Builder.CreateInsertElement(Vec: VectorValue, NewElt: ScalarValue, Idx: LaneIndex); |
| 707 | Res = Builder.CreateInsertValue(Agg: Res, Val: VectorValue, Idxs: FieldIndex); |
| 708 | } |
| 709 | } |
| 710 | return Res; |
| 711 | } |
| 712 | case VPInstruction::BuildVector: { |
| 713 | auto *ScalarTy = State.TypeAnalysis.inferScalarType(V: getOperand(N: 0)); |
| 714 | auto NumOfElements = ElementCount::getFixed(MinVal: getNumOperands()); |
| 715 | Value *Res = PoisonValue::get(T: toVectorizedTy(Ty: ScalarTy, EC: NumOfElements)); |
| 716 | for (const auto &[Idx, Op] : enumerate(First: operands())) |
| 717 | Res = State.Builder.CreateInsertElement(Vec: Res, NewElt: State.get(Def: Op, IsScalar: true), |
| 718 | Idx: State.Builder.getInt32(C: Idx)); |
| 719 | return Res; |
| 720 | } |
| 721 | case VPInstruction::ReductionStartVector: { |
| 722 | if (State.VF.isScalar()) |
| 723 | return State.get(Def: getOperand(N: 0), IsScalar: true); |
| 724 | IRBuilderBase::FastMathFlagGuard FMFG(Builder); |
| 725 | Builder.setFastMathFlags(getFastMathFlags()); |
| 726 | // If this start vector is scaled then it should produce a vector with fewer |
| 727 | // elements than the VF. |
| 728 | ElementCount VF = State.VF.divideCoefficientBy( |
| 729 | RHS: cast<ConstantInt>(Val: getOperand(N: 2)->getLiveInIRValue())->getZExtValue()); |
| 730 | auto *Iden = Builder.CreateVectorSplat(EC: VF, V: State.get(Def: getOperand(N: 1), IsScalar: true)); |
| 731 | Constant *Zero = Builder.getInt32(C: 0); |
| 732 | return Builder.CreateInsertElement(Vec: Iden, NewElt: State.get(Def: getOperand(N: 0), IsScalar: true), |
| 733 | Idx: Zero); |
| 734 | } |
| 735 | case VPInstruction::ComputeAnyOfResult: { |
| 736 | // FIXME: The cross-recipe dependency on VPReductionPHIRecipe is temporary |
| 737 | // and will be removed by breaking up the recipe further. |
| 738 | auto *PhiR = cast<VPReductionPHIRecipe>(Val: getOperand(N: 0)); |
| 739 | auto *OrigPhi = cast<PHINode>(Val: PhiR->getUnderlyingValue()); |
| 740 | Value *ReducedPartRdx = State.get(Def: getOperand(N: 2)); |
| 741 | for (unsigned Idx = 3; Idx < getNumOperands(); ++Idx) |
| 742 | ReducedPartRdx = Builder.CreateBinOp( |
| 743 | Opc: (Instruction::BinaryOps)RecurrenceDescriptor::getOpcode( |
| 744 | Kind: RecurKind::AnyOf), |
| 745 | LHS: State.get(Def: getOperand(N: Idx)), RHS: ReducedPartRdx, Name: "bin.rdx" ); |
| 746 | return createAnyOfReduction(B&: Builder, Src: ReducedPartRdx, |
| 747 | InitVal: State.get(Def: getOperand(N: 1), Lane: VPLane(0)), OrigPhi); |
| 748 | } |
| 749 | case VPInstruction::ComputeFindIVResult: { |
| 750 | // FIXME: The cross-recipe dependency on VPReductionPHIRecipe is temporary |
| 751 | // and will be removed by breaking up the recipe further. |
| 752 | auto *PhiR = cast<VPReductionPHIRecipe>(Val: getOperand(N: 0)); |
| 753 | // Get its reduction variable descriptor. |
| 754 | const RecurrenceDescriptor &RdxDesc = PhiR->getRecurrenceDescriptor(); |
| 755 | RecurKind RK = RdxDesc.getRecurrenceKind(); |
| 756 | assert(RecurrenceDescriptor::isFindIVRecurrenceKind(RK) && |
| 757 | "Unexpected reduction kind" ); |
| 758 | assert(!PhiR->isInLoop() && |
| 759 | "In-loop FindLastIV reduction is not supported yet" ); |
| 760 | |
| 761 | // The recipe's operands are the reduction phi, the start value, the |
| 762 | // sentinel value, followed by one operand for each part of the reduction. |
| 763 | unsigned UF = getNumOperands() - 3; |
| 764 | Value *ReducedPartRdx = State.get(Def: getOperand(N: 3)); |
| 765 | RecurKind MinMaxKind; |
| 766 | bool IsSigned = RecurrenceDescriptor::isSignedRecurrenceKind(Kind: RK); |
| 767 | if (RecurrenceDescriptor::isFindLastIVRecurrenceKind(Kind: RK)) { |
| 768 | MinMaxKind = IsSigned ? RecurKind::SMax : RecurKind::UMax; |
| 769 | } else { |
| 770 | assert(RecurrenceDescriptor::isFindFirstIVRecurrenceKind(RK) && |
| 771 | "Kind must either be FindLastIV or FindFirstIV" ); |
| 772 | assert(IsSigned && "Only FindFirstIV with SMax is currently supported" ); |
| 773 | MinMaxKind = RecurKind::SMin; |
| 774 | } |
| 775 | for (unsigned Part = 1; Part < UF; ++Part) |
| 776 | ReducedPartRdx = createMinMaxOp(Builder, RK: MinMaxKind, Left: ReducedPartRdx, |
| 777 | Right: State.get(Def: getOperand(N: 3 + Part))); |
| 778 | |
| 779 | Value *Start = State.get(Def: getOperand(N: 1), IsScalar: true); |
| 780 | Value *Sentinel = getOperand(N: 2)->getLiveInIRValue(); |
| 781 | return createFindLastIVReduction(B&: Builder, Src: ReducedPartRdx, RdxKind: RK, Start, |
| 782 | Sentinel); |
| 783 | } |
| 784 | case VPInstruction::ComputeReductionResult: { |
| 785 | // FIXME: The cross-recipe dependency on VPReductionPHIRecipe is temporary |
| 786 | // and will be removed by breaking up the recipe further. |
| 787 | auto *PhiR = cast<VPReductionPHIRecipe>(Val: getOperand(N: 0)); |
| 788 | // Get its reduction variable descriptor. |
| 789 | const RecurrenceDescriptor &RdxDesc = PhiR->getRecurrenceDescriptor(); |
| 790 | |
| 791 | RecurKind RK = RdxDesc.getRecurrenceKind(); |
| 792 | assert(!RecurrenceDescriptor::isFindIVRecurrenceKind(RK) && |
| 793 | "should be handled by ComputeFindIVResult" ); |
| 794 | |
| 795 | // The recipe's operands are the reduction phi, followed by one operand for |
| 796 | // each part of the reduction. |
| 797 | unsigned UF = getNumOperands() - 1; |
| 798 | VectorParts RdxParts(UF); |
| 799 | for (unsigned Part = 0; Part < UF; ++Part) |
| 800 | RdxParts[Part] = State.get(Def: getOperand(N: 1 + Part), IsScalar: PhiR->isInLoop()); |
| 801 | |
| 802 | IRBuilderBase::FastMathFlagGuard FMFG(Builder); |
| 803 | if (hasFastMathFlags()) |
| 804 | Builder.setFastMathFlags(getFastMathFlags()); |
| 805 | |
| 806 | // Reduce all of the unrolled parts into a single vector. |
| 807 | Value *ReducedPartRdx = RdxParts[0]; |
| 808 | if (PhiR->isOrdered()) { |
| 809 | ReducedPartRdx = RdxParts[UF - 1]; |
| 810 | } else { |
| 811 | // Floating-point operations should have some FMF to enable the reduction. |
| 812 | for (unsigned Part = 1; Part < UF; ++Part) { |
| 813 | Value *RdxPart = RdxParts[Part]; |
| 814 | if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind: RK)) |
| 815 | ReducedPartRdx = createMinMaxOp(Builder, RK, Left: ReducedPartRdx, Right: RdxPart); |
| 816 | else |
| 817 | ReducedPartRdx = |
| 818 | Builder.CreateBinOp(Opc: (Instruction::BinaryOps)RdxDesc.getOpcode(), |
| 819 | LHS: RdxPart, RHS: ReducedPartRdx, Name: "bin.rdx" ); |
| 820 | } |
| 821 | } |
| 822 | |
| 823 | // Create the reduction after the loop. Note that in-loop reductions create |
| 824 | // the target reduction in the loop using a Reduction recipe. |
| 825 | if (State.VF.isVector() && !PhiR->isInLoop()) { |
| 826 | // TODO: Support in-order reductions based on the recurrence descriptor. |
| 827 | // All ops in the reduction inherit fast-math-flags from the recurrence |
| 828 | // descriptor. |
| 829 | ReducedPartRdx = createSimpleReduction(B&: Builder, Src: ReducedPartRdx, RdxKind: RK); |
| 830 | } |
| 831 | |
| 832 | return ReducedPartRdx; |
| 833 | } |
| 834 | case VPInstruction::ExtractLastElement: |
| 835 | case VPInstruction::ExtractPenultimateElement: { |
| 836 | unsigned Offset = getOpcode() == VPInstruction::ExtractLastElement ? 1 : 2; |
| 837 | Value *Res; |
| 838 | if (State.VF.isVector()) { |
| 839 | assert(Offset <= State.VF.getKnownMinValue() && |
| 840 | "invalid offset to extract from" ); |
| 841 | // Extract lane VF - Offset from the operand. |
| 842 | Res = State.get(Def: getOperand(N: 0), Lane: VPLane::getLaneFromEnd(VF: State.VF, Offset)); |
| 843 | } else { |
| 844 | assert(Offset <= 1 && "invalid offset to extract from" ); |
| 845 | Res = State.get(Def: getOperand(N: 0)); |
| 846 | } |
| 847 | if (isa<ExtractElementInst>(Val: Res)) |
| 848 | Res->setName(Name); |
| 849 | return Res; |
| 850 | } |
| 851 | case VPInstruction::LogicalAnd: { |
| 852 | Value *A = State.get(Def: getOperand(N: 0)); |
| 853 | Value *B = State.get(Def: getOperand(N: 1)); |
| 854 | return Builder.CreateLogicalAnd(Cond1: A, Cond2: B, Name); |
| 855 | } |
| 856 | case VPInstruction::PtrAdd: { |
| 857 | assert(vputils::onlyFirstLaneUsed(this) && |
| 858 | "can only generate first lane for PtrAdd" ); |
| 859 | Value *Ptr = State.get(Def: getOperand(N: 0), Lane: VPLane(0)); |
| 860 | Value *Addend = State.get(Def: getOperand(N: 1), Lane: VPLane(0)); |
| 861 | return Builder.CreatePtrAdd(Ptr, Offset: Addend, Name, NW: getGEPNoWrapFlags()); |
| 862 | } |
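|  | // OR all mask operands together and reduce the result to a single i1 that |
|  | // is true if any lane of any operand is set. |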
| 863 | case VPInstruction::AnyOf: { |
| 864 | Value *Res = State.get(Def: getOperand(N: 0)); |
| 865 | for (VPValue *Op : drop_begin(RangeOrContainer: operands())) |
| 866 | Res = Builder.CreateOr(LHS: Res, RHS: State.get(Def: Op)); |
| 867 | return Builder.CreateOrReduce(Src: Res); |
| 868 | } |
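|  | // Compute the index of the first active lane; for a single mask operand |
|  | // this lowers to a count-trailing-zero-elements intrinsic, roughly: |
|  | //   %first = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> %m, i1 true) |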
| 869 | case VPInstruction::FirstActiveLane: { |
| 870 | if (getNumOperands() == 1) { |
| 871 | Value *Mask = State.get(Def: getOperand(N: 0)); |
| 872 | return Builder.CreateCountTrailingZeroElems(ResTy: Builder.getInt64Ty(), Mask, |
| 873 | ZeroIsPoison: true, Name); |
| 874 | } |
| 875 | // If there are multiple operands, create a chain of selects to pick the |
| 876 | // first operand with an active lane and add the number of lanes of the |
| 877 | // preceding operands. |
| 878 | Value *RuntimeVF = |
| 879 | getRuntimeVF(B&: State.Builder, Ty: State.Builder.getInt64Ty(), VF: State.VF); |
| 880 | unsigned LastOpIdx = getNumOperands() - 1; |
| 881 | Value *Res = nullptr; |
| 882 | for (int Idx = LastOpIdx; Idx >= 0; --Idx) { |
| 883 | Value *TrailingZeros = Builder.CreateCountTrailingZeroElems( |
| 884 | ResTy: Builder.getInt64Ty(), Mask: State.get(Def: getOperand(N: Idx)), ZeroIsPoison: true, Name); |
| 885 | Value *Current = Builder.CreateAdd( |
| 886 | LHS: Builder.CreateMul(LHS: RuntimeVF, RHS: Builder.getInt64(C: Idx)), RHS: TrailingZeros); |
| 887 | if (Res) { |
| 888 | Value *Cmp = Builder.CreateICmpNE(LHS: TrailingZeros, RHS: RuntimeVF); |
| 889 | Res = Builder.CreateSelect(C: Cmp, True: Current, False: Res); |
| 890 | } else { |
| 891 | Res = Current; |
| 892 | } |
| 893 | } |
| 894 | |
| 895 | return Res; |
| 896 | } |
| 897 | default: |
| 898 | llvm_unreachable("Unsupported opcode for instruction" ); |
| 899 | } |
| 900 | } |
| 901 | |
| 902 | InstructionCost VPInstruction::computeCost(ElementCount VF, |
| 903 | VPCostContext &Ctx) const { |
| 904 | if (Instruction::isBinaryOp(Opcode: getOpcode())) { |
| 905 | Type *ResTy = Ctx.Types.inferScalarType(V: this); |
| 906 | if (!vputils::onlyFirstLaneUsed(Def: this)) |
| 907 | ResTy = toVectorTy(Scalar: ResTy, EC: VF); |
| 908 | |
| 909 | if (!getUnderlyingValue()) { |
| 910 | switch (getOpcode()) { |
| 911 | case Instruction::FMul: |
| 912 | return Ctx.TTI.getArithmeticInstrCost(Opcode: getOpcode(), Ty: ResTy, CostKind: Ctx.CostKind); |
| 913 | default: |
| 914 | // TODO: Compute cost for VPInstructions without underlying values once |
| 915 | // the legacy cost model has been retired. |
| 916 | return 0; |
| 917 | } |
| 918 | } |
| 919 | |
| 920 | assert(!doesGeneratePerAllLanes() && |
| 921 | "Should only generate a vector value or single scalar, not scalars " |
| 922 | "for all lanes." ); |
| 923 | return Ctx.TTI.getArithmeticInstrCost(Opcode: getOpcode(), Ty: ResTy, CostKind: Ctx.CostKind); |
| 924 | } |
| 925 | |
| 926 | switch (getOpcode()) { |
| 927 | case Instruction::ExtractElement: { |
| 928 | // Add on the cost of extracting the element. |
| 929 | auto *VecTy = toVectorTy(Scalar: Ctx.Types.inferScalarType(V: getOperand(N: 0)), EC: VF); |
| 930 | return Ctx.TTI.getVectorInstrCost(Opcode: Instruction::ExtractElement, Val: VecTy, |
| 931 | CostKind: Ctx.CostKind); |
| 932 | } |
| 933 | case VPInstruction::AnyOf: { |
| 934 | auto *VecTy = toVectorTy(Scalar: Ctx.Types.inferScalarType(V: this), EC: VF); |
| 935 | return Ctx.TTI.getArithmeticReductionCost( |
| 936 | Opcode: Instruction::Or, Ty: cast<VectorType>(Val: VecTy), FMF: std::nullopt, CostKind: Ctx.CostKind); |
| 937 | } |
| 938 | case VPInstruction::FirstActiveLane: { |
| 939 | // Calculate the cost of determining the lane index. |
| 940 | auto *PredTy = toVectorTy(Scalar: Ctx.Types.inferScalarType(V: getOperand(N: 0)), EC: VF); |
| 941 | IntrinsicCostAttributes Attrs(Intrinsic::experimental_cttz_elts, |
| 942 | Type::getInt64Ty(C&: Ctx.LLVMCtx), |
| 943 | {PredTy, Type::getInt1Ty(C&: Ctx.LLVMCtx)}); |
| 944 | return Ctx.TTI.getIntrinsicInstrCost(ICA: Attrs, CostKind: Ctx.CostKind); |
| 945 | } |
| 946 | case VPInstruction::FirstOrderRecurrenceSplice: { |
| 947 | assert(VF.isVector() && "Scalar FirstOrderRecurrenceSplice?" ); |
| 948 | SmallVector<int> Mask(VF.getKnownMinValue()); |
| 949 | std::iota(first: Mask.begin(), last: Mask.end(), value: VF.getKnownMinValue() - 1); |
| 950 | Type *VectorTy = toVectorTy(Scalar: Ctx.Types.inferScalarType(V: this), EC: VF); |
| 951 | |
| 952 | return Ctx.TTI.getShuffleCost(Kind: TargetTransformInfo::SK_Splice, |
| 953 | DstTy: cast<VectorType>(Val: VectorTy), |
| 954 | SrcTy: cast<VectorType>(Val: VectorTy), Mask, |
| 955 | CostKind: Ctx.CostKind, Index: VF.getKnownMinValue() - 1); |
| 956 | } |
| 957 | case VPInstruction::ActiveLaneMask: { |
| 958 | Type *ArgTy = Ctx.Types.inferScalarType(V: getOperand(N: 0)); |
| 959 | Type *RetTy = toVectorTy(Scalar: Type::getInt1Ty(C&: Ctx.LLVMCtx), EC: VF); |
| 960 | IntrinsicCostAttributes Attrs(Intrinsic::get_active_lane_mask, RetTy, |
| 961 | {ArgTy, ArgTy}); |
| 962 | return Ctx.TTI.getIntrinsicInstrCost(ICA: Attrs, CostKind: Ctx.CostKind); |
| 963 | } |
| 964 | case VPInstruction::ExplicitVectorLength: { |
| 965 | Type *Arg0Ty = Ctx.Types.inferScalarType(V: getOperand(N: 0)); |
| 966 | Type *I32Ty = Type::getInt32Ty(C&: Ctx.LLVMCtx); |
| 967 | Type *I1Ty = Type::getInt1Ty(C&: Ctx.LLVMCtx); |
| 968 | IntrinsicCostAttributes Attrs(Intrinsic::experimental_get_vector_length, |
| 969 | I32Ty, {Arg0Ty, I32Ty, I1Ty}); |
| 970 | return Ctx.TTI.getIntrinsicInstrCost(ICA: Attrs, CostKind: Ctx.CostKind); |
| 971 | } |
| 972 | case VPInstruction::ExtractPenultimateElement: |
| 973 | if (VF == ElementCount::getScalable(MinVal: 1)) |
| 974 | return InstructionCost::getInvalid(); |
| 975 | LLVM_FALLTHROUGH; |
| 976 | default: |
| 977 | // TODO: Compute cost other VPInstructions once the legacy cost model has |
| 978 | // been retired. |
| 979 | assert(!getUnderlyingValue() && |
| 980 | "unexpected VPInstruction witht underlying value" ); |
| 981 | return 0; |
| 982 | } |
| 983 | } |
| 984 | |
| 985 | bool VPInstruction::isVectorToScalar() const { |
| 986 | return getOpcode() == VPInstruction::ExtractLastElement || |
| 987 | getOpcode() == VPInstruction::ExtractPenultimateElement || |
| 988 | getOpcode() == Instruction::ExtractElement || |
| 989 | getOpcode() == VPInstruction::FirstActiveLane || |
| 990 | getOpcode() == VPInstruction::ComputeAnyOfResult || |
| 991 | getOpcode() == VPInstruction::ComputeFindIVResult || |
| 992 | getOpcode() == VPInstruction::ComputeReductionResult || |
| 993 | getOpcode() == VPInstruction::AnyOf; |
| 994 | } |
| 995 | |
| 996 | bool VPInstruction::isSingleScalar() const { |
| 997 | return getOpcode() == Instruction::PHI || isScalarCast(); |
| 998 | } |
| 999 | |
| 1000 | void VPInstruction::execute(VPTransformState &State) { |
| 1001 | assert(!State.Lane && "VPInstruction executing a Lane" ); |
| 1002 | IRBuilderBase::FastMathFlagGuard FMFGuard(State.Builder); |
| 1003 | assert(flagsValidForOpcode(getOpcode()) && |
| 1004 | "Set flags not supported for the provided opcode" ); |
| 1005 | if (hasFastMathFlags()) |
| 1006 | State.Builder.setFastMathFlags(getFastMathFlags()); |
| 1007 | bool GeneratesPerFirstLaneOnly = canGenerateScalarForFirstLane() && |
| 1008 | (vputils::onlyFirstLaneUsed(Def: this) || |
| 1009 | isVectorToScalar() || isSingleScalar()); |
| 1010 | bool GeneratesPerAllLanes = doesGeneratePerAllLanes(); |
| 1011 | if (GeneratesPerAllLanes) { |
| 1012 | for (unsigned Lane = 0, NumLanes = State.VF.getFixedValue(); |
| 1013 | Lane != NumLanes; ++Lane) { |
| 1014 | Value *GeneratedValue = generatePerLane(State, Lane: VPLane(Lane)); |
| 1015 | assert(GeneratedValue && "generatePerLane must produce a value" ); |
| 1016 | State.set(Def: this, V: GeneratedValue, Lane: VPLane(Lane)); |
| 1017 | } |
| 1018 | return; |
| 1019 | } |
| 1020 | |
| 1021 | Value *GeneratedValue = generate(State); |
| 1022 | if (!hasResult()) |
| 1023 | return; |
| 1024 | assert(GeneratedValue && "generate must produce a value" ); |
| 1025 | assert((((GeneratedValue->getType()->isVectorTy() || |
| 1026 | GeneratedValue->getType()->isStructTy()) == |
| 1027 | !GeneratesPerFirstLaneOnly) || |
| 1028 | State.VF.isScalar()) && |
| 1029 | "scalar value but not only first lane defined" ); |
| 1030 | State.set(Def: this, V: GeneratedValue, |
| 1031 | /*IsScalar*/ GeneratesPerFirstLaneOnly); |
| 1032 | } |
| 1033 | |
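|  | // Opcodes not explicitly listed below are conservatively assumed to read or |
|  | // write memory. |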
| 1034 | bool VPInstruction::opcodeMayReadOrWriteFromMemory() const { |
| 1035 | if (Instruction::isBinaryOp(Opcode: getOpcode()) || Instruction::isCast(Opcode: getOpcode())) |
| 1036 | return false; |
| 1037 | switch (getOpcode()) { |
| 1038 | case Instruction::ExtractElement: |
| 1039 | case Instruction::Freeze: |
| 1040 | case Instruction::ICmp: |
| 1041 | case Instruction::Select: |
| 1042 | case VPInstruction::AnyOf: |
| 1043 | case VPInstruction::BuildStructVector: |
| 1044 | case VPInstruction::BuildVector: |
| 1045 | case VPInstruction::CalculateTripCountMinusVF: |
| 1046 | case VPInstruction::CanonicalIVIncrementForPart: |
| 1047 | case VPInstruction::ExtractLastElement: |
| 1048 | case VPInstruction::ExtractPenultimateElement: |
| 1049 | case VPInstruction::FirstActiveLane: |
| 1050 | case VPInstruction::FirstOrderRecurrenceSplice: |
| 1051 | case VPInstruction::LogicalAnd: |
| 1052 | case VPInstruction::Not: |
| 1053 | case VPInstruction::PtrAdd: |
| 1054 | case VPInstruction::WideIVStep: |
| 1055 | case VPInstruction::StepVector: |
| 1056 | case VPInstruction::ReductionStartVector: |
| 1057 | return false; |
| 1058 | default: |
| 1059 | return true; |
| 1060 | } |
| 1061 | } |
| 1062 | |
| 1063 | bool VPInstruction::onlyFirstLaneUsed(const VPValue *Op) const { |
| 1064 | assert(is_contained(operands(), Op) && "Op must be an operand of the recipe" ); |
| 1065 | if (Instruction::isBinaryOp(Opcode: getOpcode()) || Instruction::isCast(Opcode: getOpcode())) |
| 1066 | return vputils::onlyFirstLaneUsed(Def: this); |
| 1067 | |
| 1068 | switch (getOpcode()) { |
| 1069 | default: |
| 1070 | return false; |
| 1071 | case Instruction::ExtractElement: |
| 1072 | return Op == getOperand(N: 1); |
| 1073 | case Instruction::PHI: |
| 1074 | return true; |
| 1075 | case Instruction::ICmp: |
| 1076 | case Instruction::Select: |
| 1077 | case Instruction::Or: |
| 1078 | case Instruction::Freeze: |
| 1079 | // TODO: Cover additional opcodes. |
| 1080 | return vputils::onlyFirstLaneUsed(Def: this); |
| 1081 | case VPInstruction::ActiveLaneMask: |
| 1082 | case VPInstruction::ExplicitVectorLength: |
| 1083 | case VPInstruction::CalculateTripCountMinusVF: |
| 1084 | case VPInstruction::CanonicalIVIncrementForPart: |
| 1085 | case VPInstruction::BranchOnCount: |
| 1086 | case VPInstruction::BranchOnCond: |
| 1087 | case VPInstruction::Broadcast: |
| 1088 | case VPInstruction::ReductionStartVector: |
| 1089 | return true; |
| 1090 | case VPInstruction::PtrAdd: |
| 1091 | return Op == getOperand(N: 0) || vputils::onlyFirstLaneUsed(Def: this); |
| 1092 | case VPInstruction::ComputeAnyOfResult: |
| 1093 | case VPInstruction::ComputeFindIVResult: |
| 1094 | return Op == getOperand(N: 1); |
| 1095 | }; |
| 1096 | llvm_unreachable("switch should return" ); |
| 1097 | } |
| 1098 | |
| 1099 | bool VPInstruction::onlyFirstPartUsed(const VPValue *Op) const { |
| 1100 | assert(is_contained(operands(), Op) && "Op must be an operand of the recipe" ); |
| 1101 | if (Instruction::isBinaryOp(Opcode: getOpcode())) |
| 1102 | return vputils::onlyFirstPartUsed(Def: this); |
| 1103 | |
| 1104 | switch (getOpcode()) { |
| 1105 | default: |
| 1106 | return false; |
| 1107 | case Instruction::ICmp: |
| 1108 | case Instruction::Select: |
| 1109 | return vputils::onlyFirstPartUsed(Def: this); |
| 1110 | case VPInstruction::BranchOnCount: |
| 1111 | case VPInstruction::BranchOnCond: |
| 1112 | case VPInstruction::CanonicalIVIncrementForPart: |
| 1113 | return true; |
| 1114 | }; |
| 1115 | llvm_unreachable("switch should return" ); |
| 1116 | } |
| 1117 | |
| 1118 | #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) |
| 1119 | void VPInstruction::dump() const { |
| 1120 | VPSlotTracker SlotTracker(getParent()->getPlan()); |
| 1121 | print(dbgs(), "" , SlotTracker); |
| 1122 | } |
| 1123 | |
| 1124 | void VPInstruction::print(raw_ostream &O, const Twine &Indent, |
| 1125 | VPSlotTracker &SlotTracker) const { |
| 1126 | O << Indent << "EMIT" << (isSingleScalar() ? "-SCALAR" : "" ) << " " ; |
| 1127 | |
| 1128 | if (hasResult()) { |
| 1129 | printAsOperand(O, SlotTracker); |
| 1130 | O << " = " ; |
| 1131 | } |
| 1132 | |
| 1133 | switch (getOpcode()) { |
| 1134 | case VPInstruction::Not: |
| 1135 | O << "not" ; |
| 1136 | break; |
| 1137 | case VPInstruction::SLPLoad: |
| 1138 | O << "combined load" ; |
| 1139 | break; |
| 1140 | case VPInstruction::SLPStore: |
| 1141 | O << "combined store" ; |
| 1142 | break; |
| 1143 | case VPInstruction::ActiveLaneMask: |
| 1144 | O << "active lane mask" ; |
| 1145 | break; |
| 1146 | case VPInstruction::ExplicitVectorLength: |
| 1147 | O << "EXPLICIT-VECTOR-LENGTH" ; |
| 1148 | break; |
| 1149 | case VPInstruction::FirstOrderRecurrenceSplice: |
| 1150 | O << "first-order splice" ; |
| 1151 | break; |
| 1152 | case VPInstruction::BranchOnCond: |
| 1153 | O << "branch-on-cond" ; |
| 1154 | break; |
| 1155 | case VPInstruction::CalculateTripCountMinusVF: |
| 1156 | O << "TC > VF ? TC - VF : 0" ; |
| 1157 | break; |
| 1158 | case VPInstruction::CanonicalIVIncrementForPart: |
| 1159 | O << "VF * Part +" ; |
| 1160 | break; |
| 1161 | case VPInstruction::BranchOnCount: |
| 1162 | O << "branch-on-count" ; |
| 1163 | break; |
| 1164 | case VPInstruction::Broadcast: |
| 1165 | O << "broadcast" ; |
| 1166 | break; |
| 1167 | case VPInstruction::BuildStructVector: |
| 1168 | O << "buildstructvector" ; |
| 1169 | break; |
| 1170 | case VPInstruction::BuildVector: |
| 1171 | O << "buildvector" ; |
| 1172 | break; |
| 1173 | case VPInstruction::ExtractLastElement: |
| 1174 | O << "extract-last-element" ; |
| 1175 | break; |
| 1176 | case VPInstruction::ExtractPenultimateElement: |
| 1177 | O << "extract-penultimate-element" ; |
| 1178 | break; |
| 1179 | case VPInstruction::ComputeAnyOfResult: |
| 1180 | O << "compute-anyof-result" ; |
| 1181 | break; |
| 1182 | case VPInstruction::ComputeFindIVResult: |
| 1183 | O << "compute-find-iv-result" ; |
| 1184 | break; |
| 1185 | case VPInstruction::ComputeReductionResult: |
| 1186 | O << "compute-reduction-result" ; |
| 1187 | break; |
| 1188 | case VPInstruction::LogicalAnd: |
| 1189 | O << "logical-and" ; |
| 1190 | break; |
| 1191 | case VPInstruction::PtrAdd: |
| 1192 | O << "ptradd" ; |
| 1193 | break; |
| 1194 | case VPInstruction::AnyOf: |
| 1195 | O << "any-of" ; |
| 1196 | break; |
| 1197 | case VPInstruction::FirstActiveLane: |
| 1198 | O << "first-active-lane" ; |
| 1199 | break; |
| 1200 | case VPInstruction::ReductionStartVector: |
| 1201 | O << "reduction-start-vector" ; |
| 1202 | break; |
| 1203 | default: |
| 1204 | O << Instruction::getOpcodeName(getOpcode()); |
| 1205 | } |
| 1206 | |
| 1207 | printFlags(O); |
| 1208 | printOperands(O, SlotTracker); |
| 1209 | |
| 1210 | if (auto DL = getDebugLoc()) { |
| 1211 | O << ", !dbg " ; |
| 1212 | DL.print(O); |
| 1213 | } |
| 1214 | } |
| 1215 | #endif |
| 1216 | |
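|  | // Scalar casts produce a single scalar per part; the only vector-producing |
|  | // typed opcode handled here is StepVector. |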
| 1217 | void VPInstructionWithType::execute(VPTransformState &State) { |
| 1218 | State.setDebugLocFrom(getDebugLoc()); |
| 1219 | if (isScalarCast()) { |
| 1220 | Value *Op = State.get(Def: getOperand(N: 0), Lane: VPLane(0)); |
| 1221 | Value *Cast = State.Builder.CreateCast(Op: Instruction::CastOps(getOpcode()), |
| 1222 | V: Op, DestTy: ResultTy); |
| 1223 | State.set(Def: this, V: Cast, Lane: VPLane(0)); |
| 1224 | return; |
| 1225 | } |
| 1226 | switch (getOpcode()) { |
| 1227 | case VPInstruction::StepVector: { |
| 1228 | Value *StepVector = |
| 1229 | State.Builder.CreateStepVector(DstType: VectorType::get(ElementType: ResultTy, EC: State.VF)); |
| 1230 | State.set(Def: this, V: StepVector); |
| 1231 | break; |
| 1232 | } |
| 1233 | default: |
| 1234 | llvm_unreachable("opcode not implemented yet" ); |
| 1235 | } |
| 1236 | } |
| 1237 | |
| 1238 | #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) |
| 1239 | void VPInstructionWithType::print(raw_ostream &O, const Twine &Indent, |
| 1240 | VPSlotTracker &SlotTracker) const { |
| 1241 | O << Indent << "EMIT" << (isSingleScalar() ? "-SCALAR" : "" ) << " " ; |
| 1242 | printAsOperand(O, SlotTracker); |
| 1243 | O << " = " ; |
| 1244 | |
| 1245 | switch (getOpcode()) { |
| 1246 | case VPInstruction::WideIVStep: |
| 1247 | O << "wide-iv-step " ; |
| 1248 | printOperands(O, SlotTracker); |
| 1249 | break; |
| 1250 | case VPInstruction::StepVector: |
| 1251 | O << "step-vector " << *ResultTy; |
| 1252 | break; |
| 1253 | default: |
| 1254 | assert(Instruction::isCast(getOpcode()) && "unhandled opcode" ); |
| 1255 | O << Instruction::getOpcodeName(getOpcode()) << " " ; |
| 1256 | printOperands(O, SlotTracker); |
| 1257 | O << " to " << *ResultTy; |
| 1258 | } |
| 1259 | } |
| 1260 | #endif |
| 1261 | |
| 1262 | void VPPhi::execute(VPTransformState &State) { |
| 1263 | State.setDebugLocFrom(getDebugLoc()); |
| 1264 | PHINode *NewPhi = State.Builder.CreatePHI( |
| 1265 | Ty: State.TypeAnalysis.inferScalarType(V: this), NumReservedValues: 2, Name: getName()); |
| 1266 | unsigned NumIncoming = getNumIncoming(); |
| 1267 | if (getParent() != getParent()->getPlan()->getScalarPreheader()) { |
| 1268 | // TODO: Fixup all incoming values of header phis once recipes defining them |
| 1269 | // are introduced. |
| 1270 | NumIncoming = 1; |
| 1271 | } |
| 1272 | for (unsigned Idx = 0; Idx != NumIncoming; ++Idx) { |
| 1273 | Value *IncV = State.get(Def: getIncomingValue(Idx), Lane: VPLane(0)); |
| 1274 | BasicBlock *PredBB = State.CFG.VPBB2IRBB.at(Val: getIncomingBlock(Idx)); |
| 1275 | NewPhi->addIncoming(V: IncV, BB: PredBB); |
| 1276 | } |
| 1277 | State.set(Def: this, V: NewPhi, Lane: VPLane(0)); |
| 1278 | } |
| 1279 | |
| 1280 | #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) |
| 1281 | void VPPhi::print(raw_ostream &O, const Twine &Indent, |
| 1282 | VPSlotTracker &SlotTracker) const { |
| 1283 | O << Indent << "EMIT" << (isSingleScalar() ? "-SCALAR" : "" ) << " " ; |
| 1284 | printAsOperand(O, SlotTracker); |
| 1285 | O << " = phi " ; |
| 1286 | printPhiOperands(O, SlotTracker); |
| 1287 | } |
| 1288 | #endif |
| 1289 | |
| 1290 | VPIRInstruction *VPIRInstruction::create(Instruction &I) {
| 1291 | if (auto *Phi = dyn_cast<PHINode>(Val: &I)) |
| 1292 | return new VPIRPhi(*Phi); |
| 1293 | return new VPIRInstruction(I); |
| 1294 | } |
| 1295 | |
| 1296 | void VPIRInstruction::execute(VPTransformState &State) { |
| 1297 | assert(!isa<VPIRPhi>(this) && getNumOperands() == 0 && |
| 1298 | "PHINodes must be handled by VPIRPhi" ); |
| 1299 | // Advance the insert point after the wrapped IR instruction. This allows |
| 1300 | // interleaving VPIRInstructions and other recipes. |
| 1301 | State.Builder.SetInsertPoint(TheBB: I.getParent(), IP: std::next(x: I.getIterator())); |
| 1302 | } |
| 1303 | |
| 1304 | InstructionCost VPIRInstruction::computeCost(ElementCount VF, |
| 1305 | VPCostContext &Ctx) const { |
| 1306 | // The recipe wraps an existing IR instruction on the border of VPlan's scope, |
| 1307 | // hence it does not contribute to the cost-modeling for the VPlan. |
| 1308 | return 0; |
| 1309 | } |
| 1310 | |
| 1311 | void VPIRInstruction::extractLastLaneOfFirstOperand(VPBuilder &Builder) { |
| 1312 | assert(isa<PHINode>(getInstruction()) && |
| 1313 | "can only update exiting operands to phi nodes" ); |
| 1314 | assert(getNumOperands() > 0 && "must have at least one operand" ); |
| 1315 | VPValue *Exiting = getOperand(N: 0); |
| 1316 | if (Exiting->isLiveIn()) |
| 1317 | return; |
| 1318 | |
| 1319 | Exiting = Builder.createNaryOp(Opcode: VPInstruction::ExtractLastElement, Operands: {Exiting}); |
| 1320 | setOperand(I: 0, New: Exiting); |
| 1321 | } |
| 1322 | |
| 1323 | #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) |
| 1324 | void VPIRInstruction::print(raw_ostream &O, const Twine &Indent, |
| 1325 | VPSlotTracker &SlotTracker) const { |
| 1326 | O << Indent << "IR " << I; |
| 1327 | } |
| 1328 | #endif |
| 1329 | |
| 1330 | void VPIRPhi::execute(VPTransformState &State) { |
| 1331 | PHINode *Phi = &getIRPhi(); |
| 1332 | for (const auto &[Idx, Op] : enumerate(First: operands())) { |
| 1333 | VPValue *ExitValue = Op; |
| 1334 | auto Lane = vputils::isSingleScalar(VPV: ExitValue) |
| 1335 | ? VPLane::getFirstLane() |
| 1336 | : VPLane::getLastLaneForVF(VF: State.VF); |
| 1337 | VPBlockBase *Pred = getParent()->getPredecessors()[Idx]; |
| 1338 | auto *PredVPBB = Pred->getExitingBasicBlock(); |
| 1339 | BasicBlock *PredBB = State.CFG.VPBB2IRBB[PredVPBB]; |
| 1340 | // Set insertion point in PredBB in case an extract needs to be generated. |
| 1341 | // TODO: Model extracts explicitly. |
| 1342 | State.Builder.SetInsertPoint(TheBB: PredBB, IP: PredBB->getFirstNonPHIIt()); |
| 1343 | Value *V = State.get(Def: ExitValue, Lane: VPLane(Lane)); |
| 1344 | // If there is no existing block for PredBB in the phi, add a new incoming |
| 1345 | // value. Otherwise update the existing incoming value for PredBB. |
| 1346 | if (Phi->getBasicBlockIndex(BB: PredBB) == -1) |
| 1347 | Phi->addIncoming(V, BB: PredBB); |
| 1348 | else |
| 1349 | Phi->setIncomingValueForBlock(BB: PredBB, V); |
| 1350 | } |
| 1351 | |
| 1352 | // Advance the insert point after the wrapped IR instruction. This allows |
| 1353 | // interleaving VPIRInstructions and other recipes. |
| 1354 | State.Builder.SetInsertPoint(TheBB: Phi->getParent(), IP: std::next(x: Phi->getIterator())); |
| 1355 | } |
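|      | // Sketch of the effect of VPIRPhi::execute above (names hypothetical): a
|      | // loop-varying exit value feeding the IR phi is read from the last lane of its
|      | // widened value (an extract emitted in the predecessor block), while a
|      | // single-scalar value is simply read from lane 0.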
| 1356 | |
| 1357 | void VPPhiAccessors::removeIncomingValueFor(VPBlockBase *IncomingBlock) const { |
| 1358 | VPRecipeBase *R = const_cast<VPRecipeBase *>(getAsRecipe()); |
| 1359 | assert(R->getNumOperands() == R->getParent()->getNumPredecessors() && |
| 1360 | "Number of phi operands must match number of predecessors" ); |
| 1361 | unsigned Position = R->getParent()->getIndexForPredecessor(Pred: IncomingBlock); |
| 1362 | R->removeOperand(Idx: Position); |
| 1363 | } |
| 1364 | |
| 1365 | #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) |
| 1366 | void VPPhiAccessors::printPhiOperands(raw_ostream &O, |
| 1367 | VPSlotTracker &SlotTracker) const { |
| 1368 | interleaveComma(enumerate(getAsRecipe()->operands()), O, |
| 1369 | [this, &O, &SlotTracker](auto Op) { |
| 1370 | O << "[ " ; |
| 1371 | Op.value()->printAsOperand(O, SlotTracker); |
| 1372 | O << ", " ; |
| 1373 | getIncomingBlock(Op.index())->printAsOperand(O); |
| 1374 | O << " ]" ; |
| 1375 | }); |
| 1376 | } |
| 1377 | #endif |
| 1378 | |
| 1379 | #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) |
| 1380 | void VPIRPhi::print(raw_ostream &O, const Twine &Indent, |
| 1381 | VPSlotTracker &SlotTracker) const { |
| 1382 | VPIRInstruction::print(O, Indent, SlotTracker); |
| 1383 | |
| 1384 | if (getNumOperands() != 0) { |
| 1385 | O << " (extra operand" << (getNumOperands() > 1 ? "s" : "" ) << ": " ; |
| 1386 | interleaveComma( |
| 1387 | enumerate(operands()), O, [this, &O, &SlotTracker](auto Op) { |
| 1388 | Op.value()->printAsOperand(O, SlotTracker); |
| 1389 | O << " from " ; |
| 1390 | getParent()->getPredecessors()[Op.index()]->printAsOperand(O); |
| 1391 | }); |
| 1392 | O << ")" ; |
| 1393 | } |
| 1394 | } |
| 1395 | #endif |
| 1396 | |
| 1397 | VPIRMetadata::VPIRMetadata(Instruction &I, LoopVersioning *LVer) |
| 1398 | : VPIRMetadata(I) { |
| 1399 | if (!LVer || !isa<LoadInst, StoreInst>(Val: &I)) |
| 1400 | return; |
| 1401 | const auto &[AliasScopeMD, NoAliasMD] = LVer->getNoAliasMetadataFor(OrigInst: &I); |
| 1402 | if (AliasScopeMD) |
| 1403 | Metadata.emplace_back(Args: LLVMContext::MD_alias_scope, Args: AliasScopeMD); |
| 1404 | if (NoAliasMD) |
| 1405 | Metadata.emplace_back(Args: LLVMContext::MD_noalias, Args: NoAliasMD); |
| 1406 | } |
| 1407 | |
| 1408 | void VPIRMetadata::applyMetadata(Instruction &I) const { |
| 1409 | for (const auto &[Kind, Node] : Metadata) |
| 1410 | I.setMetadata(KindID: Kind, Node); |
| 1411 | } |
| 1412 | |
| 1413 | void VPWidenCallRecipe::execute(VPTransformState &State) { |
| 1414 | assert(State.VF.isVector() && "not widening" ); |
| 1415 | assert(Variant != nullptr && "Can't create vector function." ); |
| 1416 | |
| 1417 | FunctionType *VFTy = Variant->getFunctionType(); |
| 1418 | // Collect the call arguments, using a scalar value where the variant expects one.
| 1419 | SmallVector<Value *, 4> Args; |
| 1420 | for (const auto &I : enumerate(First: args())) { |
| 1421 | Value *Arg; |
| 1422 | // Some vectorized function variants may also take a scalar argument, |
| 1423 | // e.g. linear parameters for pointers. This needs to be the scalar value |
| 1424 | // from the start of the respective part when interleaving. |
| 1425 | if (!VFTy->getParamType(i: I.index())->isVectorTy()) |
| 1426 | Arg = State.get(Def: I.value(), Lane: VPLane(0)); |
| 1427 | else |
| 1428 | Arg = State.get(Def: I.value(), IsScalar: onlyFirstLaneUsed(Op: I.value())); |
| 1429 | Args.push_back(Elt: Arg); |
| 1430 | } |
| 1431 | |
| 1432 | auto *CI = cast_or_null<CallInst>(Val: getUnderlyingValue()); |
| 1433 | SmallVector<OperandBundleDef, 1> OpBundles; |
| 1434 | if (CI) |
| 1435 | CI->getOperandBundlesAsDefs(Defs&: OpBundles); |
| 1436 | |
| 1437 | CallInst *V = State.Builder.CreateCall(Callee: Variant, Args, OpBundles); |
| 1438 | applyFlags(I&: *V); |
| 1439 | applyMetadata(I&: *V); |
| 1440 | V->setCallingConv(Variant->getCallingConv()); |
| 1441 | |
| 1442 | if (!V->getType()->isVoidTy()) |
| 1443 | State.set(Def: this, V); |
| 1444 | } |
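|      | // Illustrative example for the widened call above (hypothetical names, VF=4):
|      | //   %r = call double @foo(double %x, i64 %n)
|      | // becomes, for a vector variant whose second parameter is a linear scalar,
|      | //   %wide.r = call <4 x double> @foo_v4(<4 x double> %wide.x, i64 %n)
|      | // where %n stays scalar because the variant's parameter type is not a vector.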
| 1445 | |
| 1446 | InstructionCost VPWidenCallRecipe::computeCost(ElementCount VF, |
| 1447 | VPCostContext &Ctx) const { |
| 1448 | return Ctx.TTI.getCallInstrCost(F: nullptr, RetTy: Variant->getReturnType(), |
| 1449 | Tys: Variant->getFunctionType()->params(), |
| 1450 | CostKind: Ctx.CostKind); |
| 1451 | } |
| 1452 | |
| 1453 | #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) |
| 1454 | void VPWidenCallRecipe::print(raw_ostream &O, const Twine &Indent, |
| 1455 | VPSlotTracker &SlotTracker) const { |
| 1456 | O << Indent << "WIDEN-CALL " ; |
| 1457 | |
| 1458 | Function *CalledFn = getCalledScalarFunction(); |
| 1459 | if (CalledFn->getReturnType()->isVoidTy()) |
| 1460 | O << "void " ; |
| 1461 | else { |
| 1462 | printAsOperand(O, SlotTracker); |
| 1463 | O << " = " ; |
| 1464 | } |
| 1465 | |
| 1466 | O << "call" ; |
| 1467 | printFlags(O); |
| 1468 | O << " @" << CalledFn->getName() << "(" ; |
| 1469 | interleaveComma(args(), O, [&O, &SlotTracker](VPValue *Op) { |
| 1470 | Op->printAsOperand(O, SlotTracker); |
| 1471 | }); |
| 1472 | O << ")" ; |
| 1473 | |
| 1474 | O << " (using library function" ; |
| 1475 | if (Variant->hasName()) |
| 1476 | O << ": " << Variant->getName(); |
| 1477 | O << ")" ; |
| 1478 | } |
| 1479 | #endif |
| 1480 | |
| 1481 | void VPWidenIntrinsicRecipe::execute(VPTransformState &State) { |
| 1482 | assert(State.VF.isVector() && "not widening" ); |
| 1483 | |
| 1484 | SmallVector<Type *, 2> TysForDecl; |
| 1485 | // Add return type if intrinsic is overloaded on it. |
| 1486 | if (isVectorIntrinsicWithOverloadTypeAtArg(ID: VectorIntrinsicID, OpdIdx: -1, TTI: State.TTI)) |
| 1487 | TysForDecl.push_back(Elt: VectorType::get(ElementType: getResultType(), EC: State.VF)); |
| 1488 | SmallVector<Value *, 4> Args; |
| 1489 | for (const auto &I : enumerate(First: operands())) { |
| 1490 | // Some intrinsics have a scalar argument - don't replace it with a |
| 1491 | // vector. |
| 1492 | Value *Arg; |
| 1493 | if (isVectorIntrinsicWithScalarOpAtArg(ID: VectorIntrinsicID, ScalarOpdIdx: I.index(), |
| 1494 | TTI: State.TTI)) |
| 1495 | Arg = State.get(Def: I.value(), Lane: VPLane(0)); |
| 1496 | else |
| 1497 | Arg = State.get(Def: I.value(), IsScalar: onlyFirstLaneUsed(Op: I.value())); |
| 1498 | if (isVectorIntrinsicWithOverloadTypeAtArg(ID: VectorIntrinsicID, OpdIdx: I.index(), |
| 1499 | TTI: State.TTI)) |
| 1500 | TysForDecl.push_back(Elt: Arg->getType()); |
| 1501 | Args.push_back(Elt: Arg); |
| 1502 | } |
| 1503 | |
| 1504 | // Use vector version of the intrinsic. |
| 1505 | Module *M = State.Builder.GetInsertBlock()->getModule(); |
| 1506 | Function *VectorF = |
| 1507 | Intrinsic::getOrInsertDeclaration(M, id: VectorIntrinsicID, Tys: TysForDecl); |
| 1508 | assert(VectorF && |
| 1509 | "Can't retrieve vector intrinsic or vector-predication intrinsics." ); |
| 1510 | |
| 1511 | auto *CI = cast_or_null<CallInst>(Val: getUnderlyingValue()); |
| 1512 | SmallVector<OperandBundleDef, 1> OpBundles; |
| 1513 | if (CI) |
| 1514 | CI->getOperandBundlesAsDefs(Defs&: OpBundles); |
| 1515 | |
| 1516 | CallInst *V = State.Builder.CreateCall(Callee: VectorF, Args, OpBundles); |
| 1517 | |
| 1518 | applyFlags(I&: *V); |
| 1519 | applyMetadata(I&: *V); |
| 1520 | |
| 1521 | if (!V->getType()->isVoidTy()) |
| 1522 | State.set(Def: this, V); |
| 1523 | } |
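|      | // Illustrative example for the intrinsic widening above (assuming VF=4): a
|      | // scalar llvm.fabs.f32 call becomes
|      | //   %wide = call <4 x float> @llvm.fabs.v4f32(<4 x float> %wide.op)
|      | // with <4 x float> the only entry in TysForDecl, since the intrinsic is
|      | // overloaded on its result type.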
| 1524 | |
| 1525 | InstructionCost VPWidenIntrinsicRecipe::computeCost(ElementCount VF, |
| 1526 | VPCostContext &Ctx) const { |
| 1527 | // Some backends analyze intrinsic arguments to determine cost. Use the |
| 1528 | // underlying value for the operand if it has one. Otherwise try to use the |
| 1529 | // operand of the underlying call instruction, if there is one. Otherwise |
| 1530 | // clear Arguments. |
| 1531 | // TODO: Rework TTI interface to be independent of concrete IR values. |
| 1532 | SmallVector<const Value *> Arguments; |
| 1533 | for (const auto &[Idx, Op] : enumerate(First: operands())) { |
| 1534 | auto *V = Op->getUnderlyingValue(); |
| 1535 | if (!V) { |
| 1536 | if (auto *UI = dyn_cast_or_null<CallBase>(Val: getUnderlyingValue())) { |
| 1537 | Arguments.push_back(Elt: UI->getArgOperand(i: Idx)); |
| 1538 | continue; |
| 1539 | } |
| 1540 | Arguments.clear(); |
| 1541 | break; |
| 1542 | } |
| 1543 | Arguments.push_back(Elt: V); |
| 1544 | } |
| 1545 | |
| 1546 | Type *RetTy = toVectorizedTy(Ty: Ctx.Types.inferScalarType(V: this), EC: VF); |
| 1547 | SmallVector<Type *> ParamTys; |
| 1548 | for (unsigned I = 0; I != getNumOperands(); ++I) |
| 1549 | ParamTys.push_back( |
| 1550 | Elt: toVectorTy(Scalar: Ctx.Types.inferScalarType(V: getOperand(N: I)), EC: VF)); |
| 1551 | |
| 1552 | // TODO: Rework TTI interface to avoid reliance on underlying IntrinsicInst. |
| 1553 | FastMathFlags FMF = hasFastMathFlags() ? getFastMathFlags() : FastMathFlags(); |
| 1554 | IntrinsicCostAttributes CostAttrs( |
| 1555 | VectorIntrinsicID, RetTy, Arguments, ParamTys, FMF, |
| 1556 | dyn_cast_or_null<IntrinsicInst>(Val: getUnderlyingValue()), |
| 1557 | InstructionCost::getInvalid(), &Ctx.TLI); |
| 1558 | return Ctx.TTI.getIntrinsicInstrCost(ICA: CostAttrs, CostKind: Ctx.CostKind); |
| 1559 | } |
| 1560 | |
| 1561 | StringRef VPWidenIntrinsicRecipe::getIntrinsicName() const { |
| 1562 | return Intrinsic::getBaseName(id: VectorIntrinsicID); |
| 1563 | } |
| 1564 | |
| 1565 | bool VPWidenIntrinsicRecipe::onlyFirstLaneUsed(const VPValue *Op) const { |
| 1566 | assert(is_contained(operands(), Op) && "Op must be an operand of the recipe" ); |
| 1567 | return all_of(Range: enumerate(First: operands()), P: [this, &Op](const auto &X) { |
| 1568 | auto [Idx, V] = X; |
| 1569 | return V != Op || isVectorIntrinsicWithScalarOpAtArg(getVectorIntrinsicID(), |
| 1570 | Idx, nullptr); |
| 1571 | }); |
| 1572 | } |
| 1573 | |
| 1574 | #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) |
| 1575 | void VPWidenIntrinsicRecipe::print(raw_ostream &O, const Twine &Indent, |
| 1576 | VPSlotTracker &SlotTracker) const { |
| 1577 | O << Indent << "WIDEN-INTRINSIC " ; |
| 1578 | if (ResultTy->isVoidTy()) { |
| 1579 | O << "void " ; |
| 1580 | } else { |
| 1581 | printAsOperand(O, SlotTracker); |
| 1582 | O << " = " ; |
| 1583 | } |
| 1584 | |
| 1585 | O << "call" ; |
| 1586 | printFlags(O); |
| 1587 | O << getIntrinsicName() << "(" ; |
| 1588 | |
| 1589 | interleaveComma(operands(), O, [&O, &SlotTracker](VPValue *Op) { |
| 1590 | Op->printAsOperand(O, SlotTracker); |
| 1591 | }); |
| 1592 | O << ")" ; |
| 1593 | } |
| 1594 | #endif |
| 1595 | |
| 1596 | void VPHistogramRecipe::execute(VPTransformState &State) { |
| 1597 | IRBuilderBase &Builder = State.Builder; |
| 1598 | |
| 1599 | Value *Address = State.get(Def: getOperand(N: 0)); |
| 1600 | Value *IncAmt = State.get(Def: getOperand(N: 1), /*IsScalar=*/true); |
| 1601 | VectorType *VTy = cast<VectorType>(Val: Address->getType()); |
| 1602 | |
| 1603 | // The histogram intrinsic requires a mask even if the recipe doesn't; |
| 1604 | // if the mask operand was omitted then all lanes should be executed and |
| 1605 | // we just need to synthesize an all-true mask. |
| 1606 | Value *Mask = nullptr; |
| 1607 | if (VPValue *VPMask = getMask()) |
| 1608 | Mask = State.get(Def: VPMask); |
| 1609 | else |
| 1610 | Mask = |
| 1611 | Builder.CreateVectorSplat(EC: VTy->getElementCount(), V: Builder.getInt1(V: 1)); |
| 1612 | |
| 1613 | // If this is a subtract, we want to invert the increment amount. We may |
| 1614 | // add a separate intrinsic in future, but for now we'll try this. |
| 1615 | if (Opcode == Instruction::Sub) |
| 1616 | IncAmt = Builder.CreateNeg(V: IncAmt); |
| 1617 | else |
| 1618 | assert(Opcode == Instruction::Add && "only add or sub supported for now" ); |
| 1619 | |
| 1620 | State.Builder.CreateIntrinsic(ID: Intrinsic::experimental_vector_histogram_add, |
| 1621 | Types: {VTy, IncAmt->getType()}, |
| 1622 | Args: {Address, IncAmt, Mask}); |
| 1623 | } |
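|      | // Illustrative example for the lowering above (assuming VF=4, unmasked recipe,
|      | // increment 1, hypothetical names): roughly
|      | //   call void @llvm.experimental.vector.histogram.add.v4p0.i32(
|      | //       <4 x ptr> %buckets, i32 1, <4 x i1> splat (i1 true))
|      | // A subtract is expressed by negating the increment before the call.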
| 1624 | |
| 1625 | InstructionCost VPHistogramRecipe::computeCost(ElementCount VF, |
| 1626 | VPCostContext &Ctx) const { |
| 1627 | // FIXME: Take the gather and scatter into account as well. For now we're |
| 1628 | // generating the same cost as the fallback path, but we'll likely |
| 1629 | // need to create a new TTI method for determining the cost, including |
| 1630 | // whether we can use base + vec-of-smaller-indices or just |
| 1631 | // vec-of-pointers. |
| 1632 | assert(VF.isVector() && "Invalid VF for histogram cost" ); |
| 1633 | Type *AddressTy = Ctx.Types.inferScalarType(V: getOperand(N: 0)); |
| 1634 | VPValue *IncAmt = getOperand(N: 1); |
| 1635 | Type *IncTy = Ctx.Types.inferScalarType(V: IncAmt); |
| 1636 | VectorType *VTy = VectorType::get(ElementType: IncTy, EC: VF); |
| 1637 | |
| 1638 | // Assume that a non-constant update value (or a constant != 1) requires |
| 1639 | // a multiply, and add that into the cost. |
| 1640 | InstructionCost MulCost = |
| 1641 | Ctx.TTI.getArithmeticInstrCost(Opcode: Instruction::Mul, Ty: VTy, CostKind: Ctx.CostKind); |
| 1642 | if (IncAmt->isLiveIn()) { |
| 1643 | ConstantInt *CI = dyn_cast<ConstantInt>(Val: IncAmt->getLiveInIRValue()); |
| 1644 | |
| 1645 | if (CI && CI->getZExtValue() == 1) |
| 1646 | MulCost = TTI::TCC_Free; |
| 1647 | } |
| 1648 | |
| 1649 | // Find the cost of the histogram operation itself. |
| 1650 | Type *PtrTy = VectorType::get(ElementType: AddressTy, EC: VF); |
| 1651 | Type *MaskTy = VectorType::get(ElementType: Type::getInt1Ty(C&: Ctx.LLVMCtx), EC: VF); |
| 1652 | IntrinsicCostAttributes ICA(Intrinsic::experimental_vector_histogram_add, |
| 1653 | Type::getVoidTy(C&: Ctx.LLVMCtx), |
| 1654 | {PtrTy, IncTy, MaskTy}); |
| 1655 | |
| 1656 | // Add the costs together with the add/sub operation. |
| 1657 | return Ctx.TTI.getIntrinsicInstrCost(ICA, CostKind: Ctx.CostKind) + MulCost + |
| 1658 | Ctx.TTI.getArithmeticInstrCost(Opcode, Ty: VTy, CostKind: Ctx.CostKind); |
| 1659 | } |
| 1660 | |
| 1661 | #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) |
| 1662 | void VPHistogramRecipe::print(raw_ostream &O, const Twine &Indent, |
| 1663 | VPSlotTracker &SlotTracker) const { |
| 1664 | O << Indent << "WIDEN-HISTOGRAM buckets: " ; |
| 1665 | getOperand(0)->printAsOperand(O, SlotTracker); |
| 1666 | |
| 1667 | if (Opcode == Instruction::Sub) |
| 1668 | O << ", dec: " ; |
| 1669 | else { |
| 1670 | assert(Opcode == Instruction::Add); |
| 1671 | O << ", inc: " ; |
| 1672 | } |
| 1673 | getOperand(1)->printAsOperand(O, SlotTracker); |
| 1674 | |
| 1675 | if (VPValue *Mask = getMask()) { |
| 1676 | O << ", mask: " ; |
| 1677 | Mask->printAsOperand(O, SlotTracker); |
| 1678 | } |
| 1679 | } |
| 1680 | |
| 1681 | void VPWidenSelectRecipe::print(raw_ostream &O, const Twine &Indent, |
| 1682 | VPSlotTracker &SlotTracker) const { |
| 1683 | O << Indent << "WIDEN-SELECT " ; |
| 1684 | printAsOperand(O, SlotTracker); |
| 1685 | O << " = select " ; |
| 1686 | printFlags(O); |
| 1687 | getOperand(0)->printAsOperand(O, SlotTracker); |
| 1688 | O << ", " ; |
| 1689 | getOperand(1)->printAsOperand(O, SlotTracker); |
| 1690 | O << ", " ; |
| 1691 | getOperand(2)->printAsOperand(O, SlotTracker); |
| 1692 | O << (isInvariantCond() ? " (condition is loop invariant)" : "" ); |
| 1693 | } |
| 1694 | #endif |
| 1695 | |
| 1696 | void VPWidenSelectRecipe::execute(VPTransformState &State) { |
| 1697 | // The condition can be loop invariant but still defined inside the |
| 1698 | // loop. This means that we can't just use the original 'cond' value. |
| 1699 | // We have to take the 'vectorized' value and pick the first lane. |
| 1700 | // Instcombine will make this a no-op. |
| 1701 | auto *InvarCond = |
| 1702 | isInvariantCond() ? State.get(Def: getCond(), Lane: VPLane(0)) : nullptr; |
| 1703 | |
| 1704 | Value *Cond = InvarCond ? InvarCond : State.get(Def: getCond()); |
| 1705 | Value *Op0 = State.get(Def: getOperand(N: 1)); |
| 1706 | Value *Op1 = State.get(Def: getOperand(N: 2)); |
| 1707 | Value *Sel = State.Builder.CreateSelect(C: Cond, True: Op0, False: Op1); |
| 1708 | State.set(Def: this, V: Sel); |
| 1709 | if (auto *I = dyn_cast<Instruction>(Val: Sel)) { |
| 1710 | if (isa<FPMathOperator>(Val: I)) |
| 1711 | applyFlags(I&: *I); |
| 1712 | applyMetadata(I&: *I); |
| 1713 | } |
| 1714 | } |
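|      | // Illustrative example for the select widening above (assuming VF=4): a
|      | // loop-varying condition yields
|      | //   %wide = select <4 x i1> %wide.cond, <4 x i32> %a, <4 x i32> %b
|      | // while an invariant condition stays scalar (lane 0):
|      | //   %wide = select i1 %cond, <4 x i32> %a, <4 x i32> %b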
| 1715 | |
| 1716 | InstructionCost VPWidenSelectRecipe::computeCost(ElementCount VF, |
| 1717 | VPCostContext &Ctx) const { |
| 1718 | SelectInst *SI = cast<SelectInst>(Val: getUnderlyingValue()); |
| 1719 | bool ScalarCond = getOperand(N: 0)->isDefinedOutsideLoopRegions(); |
| 1720 | Type *ScalarTy = Ctx.Types.inferScalarType(V: this); |
| 1721 | Type *VectorTy = toVectorTy(Scalar: Ctx.Types.inferScalarType(V: this), EC: VF); |
| 1722 | |
| 1723 | VPValue *Op0, *Op1; |
| 1724 | using namespace llvm::VPlanPatternMatch; |
| 1725 | if (!ScalarCond && ScalarTy->getScalarSizeInBits() == 1 && |
| 1726 | (match(V: this, P: m_LogicalAnd(Op0: m_VPValue(V&: Op0), Op1: m_VPValue(V&: Op1))) || |
| 1727 | match(V: this, P: m_LogicalOr(Op0: m_VPValue(V&: Op0), Op1: m_VPValue(V&: Op1))))) { |
| 1728 | // select x, y, false --> x & y |
| 1729 | // select x, true, y --> x | y |
| 1730 | const auto [Op1VK, Op1VP] = Ctx.getOperandInfo(V: Op0); |
| 1731 | const auto [Op2VK, Op2VP] = Ctx.getOperandInfo(V: Op1); |
| 1732 | |
| 1733 | SmallVector<const Value *, 2> Operands; |
| 1734 | if (all_of(Range: operands(), |
| 1735 | P: [](VPValue *Op) { return Op->getUnderlyingValue(); })) |
| 1736 | Operands.append(in_start: SI->op_begin(), in_end: SI->op_end()); |
| 1737 | bool IsLogicalOr = match(V: this, P: m_LogicalOr(Op0: m_VPValue(V&: Op0), Op1: m_VPValue(V&: Op1))); |
| 1738 | return Ctx.TTI.getArithmeticInstrCost( |
| 1739 | Opcode: IsLogicalOr ? Instruction::Or : Instruction::And, Ty: VectorTy, |
| 1740 | CostKind: Ctx.CostKind, Opd1Info: {.Kind: Op1VK, .Properties: Op1VP}, Opd2Info: {.Kind: Op2VK, .Properties: Op2VP}, Args: Operands, CxtI: SI); |
| 1741 | } |
| 1742 | |
| 1743 | Type *CondTy = Ctx.Types.inferScalarType(V: getOperand(N: 0)); |
| 1744 | if (!ScalarCond) |
| 1745 | CondTy = VectorType::get(ElementType: CondTy, EC: VF); |
| 1746 | |
| 1747 | CmpInst::Predicate Pred = CmpInst::BAD_ICMP_PREDICATE; |
| 1748 | if (auto *Cmp = dyn_cast<CmpInst>(Val: SI->getCondition())) |
| 1749 | Pred = Cmp->getPredicate(); |
| 1750 | return Ctx.TTI.getCmpSelInstrCost( |
| 1751 | Opcode: Instruction::Select, ValTy: VectorTy, CondTy, VecPred: Pred, CostKind: Ctx.CostKind, |
| 1752 | Op1Info: {.Kind: TTI::OK_AnyValue, .Properties: TTI::OP_None}, Op2Info: {.Kind: TTI::OK_AnyValue, .Properties: TTI::OP_None}, I: SI); |
| 1753 | } |
| 1754 | |
| 1755 | VPIRFlags::FastMathFlagsTy::FastMathFlagsTy(const FastMathFlags &FMF) { |
| 1756 | AllowReassoc = FMF.allowReassoc(); |
| 1757 | NoNaNs = FMF.noNaNs(); |
| 1758 | NoInfs = FMF.noInfs(); |
| 1759 | NoSignedZeros = FMF.noSignedZeros(); |
| 1760 | AllowReciprocal = FMF.allowReciprocal(); |
| 1761 | AllowContract = FMF.allowContract(); |
| 1762 | ApproxFunc = FMF.approxFunc(); |
| 1763 | } |
| 1764 | |
| 1765 | #if !defined(NDEBUG) |
| 1766 | bool VPIRFlags::flagsValidForOpcode(unsigned Opcode) const { |
| 1767 | switch (OpType) { |
| 1768 | case OperationType::OverflowingBinOp: |
| 1769 | return Opcode == Instruction::Add || Opcode == Instruction::Sub || |
| 1770 | Opcode == Instruction::Mul || |
| 1771 | Opcode == VPInstruction::CanonicalIVIncrementForPart;
| 1772 | case OperationType::DisjointOp: |
| 1773 | return Opcode == Instruction::Or; |
| 1774 | case OperationType::PossiblyExactOp: |
| 1775 | return Opcode == Instruction::AShr; |
| 1776 | case OperationType::GEPOp: |
| 1777 | return Opcode == Instruction::GetElementPtr || |
| 1778 | Opcode == VPInstruction::PtrAdd; |
| 1779 | case OperationType::FPMathOp: |
| 1780 | return Opcode == Instruction::FAdd || Opcode == Instruction::FMul || |
| 1781 | Opcode == Instruction::FSub || Opcode == Instruction::FNeg || |
| 1782 | Opcode == Instruction::FDiv || Opcode == Instruction::FRem || |
| 1783 | Opcode == Instruction::FCmp || Opcode == Instruction::Select || |
| 1784 | Opcode == VPInstruction::WideIVStep || |
| 1785 | Opcode == VPInstruction::ReductionStartVector || |
| 1786 | Opcode == VPInstruction::ComputeReductionResult; |
| 1787 | case OperationType::NonNegOp: |
| 1788 | return Opcode == Instruction::ZExt;
| 1790 | case OperationType::Cmp: |
| 1791 | return Opcode == Instruction::ICmp; |
| 1792 | case OperationType::Other: |
| 1793 | return true; |
| 1794 | } |
| 1795 | llvm_unreachable("Unknown OperationType enum" ); |
| 1796 | } |
| 1797 | #endif |
| 1798 | |
| 1799 | #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) |
| 1800 | void VPIRFlags::printFlags(raw_ostream &O) const { |
| 1801 | switch (OpType) { |
| 1802 | case OperationType::Cmp: |
| 1803 | O << " " << CmpInst::getPredicateName(getPredicate()); |
| 1804 | break; |
| 1805 | case OperationType::DisjointOp: |
| 1806 | if (DisjointFlags.IsDisjoint) |
| 1807 | O << " disjoint" ; |
| 1808 | break; |
| 1809 | case OperationType::PossiblyExactOp: |
| 1810 | if (ExactFlags.IsExact) |
| 1811 | O << " exact" ; |
| 1812 | break; |
| 1813 | case OperationType::OverflowingBinOp: |
| 1814 | if (WrapFlags.HasNUW) |
| 1815 | O << " nuw" ; |
| 1816 | if (WrapFlags.HasNSW) |
| 1817 | O << " nsw" ; |
| 1818 | break; |
| 1819 | case OperationType::FPMathOp: |
| 1820 | getFastMathFlags().print(O); |
| 1821 | break; |
| 1822 | case OperationType::GEPOp: |
| 1823 | if (GEPFlags.isInBounds()) |
| 1824 | O << " inbounds" ; |
| 1825 | else if (GEPFlags.hasNoUnsignedSignedWrap()) |
| 1826 | O << " nusw" ; |
| 1827 | if (GEPFlags.hasNoUnsignedWrap()) |
| 1828 | O << " nuw" ; |
| 1829 | break; |
| 1830 | case OperationType::NonNegOp: |
| 1831 | if (NonNegFlags.NonNeg) |
| 1832 | O << " nneg" ; |
| 1833 | break; |
| 1834 | case OperationType::Other: |
| 1835 | break; |
| 1836 | } |
| 1837 | O << " " ; |
| 1838 | } |
| 1839 | #endif |
| 1840 | |
| 1841 | void VPWidenRecipe::execute(VPTransformState &State) { |
| 1842 | auto &Builder = State.Builder; |
| 1843 | switch (Opcode) { |
| 1844 | case Instruction::Call: |
| 1845 | case Instruction::Br: |
| 1846 | case Instruction::PHI: |
| 1847 | case Instruction::GetElementPtr: |
| 1848 | case Instruction::Select: |
| 1849 | llvm_unreachable("This instruction is handled by a different recipe." ); |
| 1850 | case Instruction::UDiv: |
| 1851 | case Instruction::SDiv: |
| 1852 | case Instruction::SRem: |
| 1853 | case Instruction::URem: |
| 1854 | case Instruction::Add: |
| 1855 | case Instruction::FAdd: |
| 1856 | case Instruction::Sub: |
| 1857 | case Instruction::FSub: |
| 1858 | case Instruction::FNeg: |
| 1859 | case Instruction::Mul: |
| 1860 | case Instruction::FMul: |
| 1861 | case Instruction::FDiv: |
| 1862 | case Instruction::FRem: |
| 1863 | case Instruction::Shl: |
| 1864 | case Instruction::LShr: |
| 1865 | case Instruction::AShr: |
| 1866 | case Instruction::And: |
| 1867 | case Instruction::Or: |
| 1868 | case Instruction::Xor: { |
| 1869 | // Just widen unops and binops. |
| 1870 | SmallVector<Value *, 2> Ops; |
| 1871 | for (VPValue *VPOp : operands()) |
| 1872 | Ops.push_back(Elt: State.get(Def: VPOp)); |
| 1873 | |
| 1874 | Value *V = Builder.CreateNAryOp(Opc: Opcode, Ops); |
| 1875 | |
| 1876 | if (auto *VecOp = dyn_cast<Instruction>(Val: V)) { |
| 1877 | applyFlags(I&: *VecOp); |
| 1878 | applyMetadata(I&: *VecOp); |
| 1879 | } |
| 1880 | |
| 1881 | // Use this vector value for all users of the original instruction. |
| 1882 | State.set(Def: this, V); |
| 1883 | break; |
| 1884 | } |
| 1885 | case Instruction::ExtractValue: { |
| 1886 | assert(getNumOperands() == 2 && "expected single level extractvalue" ); |
| 1887 | Value *Op = State.get(Def: getOperand(N: 0)); |
| 1888 | auto *CI = cast<ConstantInt>(Val: getOperand(N: 1)->getLiveInIRValue()); |
| 1889 | Value *Extract = Builder.CreateExtractValue(Agg: Op, Idxs: CI->getZExtValue());
| 1890 | State.set(Def: this, V: Extract); |
| 1891 | break; |
| 1892 | } |
| 1893 | case Instruction::Freeze: { |
| 1894 | Value *Op = State.get(Def: getOperand(N: 0)); |
| 1895 | Value *Freeze = Builder.CreateFreeze(V: Op); |
| 1896 | State.set(Def: this, V: Freeze); |
| 1897 | break; |
| 1898 | } |
| 1899 | case Instruction::ICmp: |
| 1900 | case Instruction::FCmp: { |
| 1901 | // Widen compares. Generate vector compares. |
| 1902 | bool FCmp = Opcode == Instruction::FCmp; |
| 1903 | Value *A = State.get(Def: getOperand(N: 0)); |
| 1904 | Value *B = State.get(Def: getOperand(N: 1)); |
| 1905 | Value *C = nullptr; |
| 1906 | if (FCmp) { |
| 1907 | // Propagate fast math flags. |
| 1908 | C = Builder.CreateFCmpFMF( |
| 1909 | P: getPredicate(), LHS: A, RHS: B, |
| 1910 | FMFSource: dyn_cast_or_null<Instruction>(Val: getUnderlyingValue())); |
| 1911 | } else { |
| 1912 | C = Builder.CreateICmp(P: getPredicate(), LHS: A, RHS: B); |
| 1913 | } |
| 1914 | if (auto *I = dyn_cast<Instruction>(Val: C)) |
| 1915 | applyMetadata(I&: *I); |
| 1916 | State.set(Def: this, V: C); |
| 1917 | break; |
| 1918 | } |
| 1919 | default: |
| 1920 | // This instruction is not vectorized by simple widening. |
| 1921 | LLVM_DEBUG(dbgs() << "LV: Found an unhandled opcode : " |
| 1922 | << Instruction::getOpcodeName(Opcode)); |
| 1923 | llvm_unreachable("Unhandled instruction!" ); |
| 1924 | } // end of switch. |
| 1925 | |
| 1926 | #if !defined(NDEBUG) |
| 1927 | // Verify that VPlan type inference results agree with the type of the |
| 1928 | // generated values. |
| 1929 | assert(VectorType::get(State.TypeAnalysis.inferScalarType(this), State.VF) == |
| 1930 | State.get(this)->getType() && |
| 1931 | "inferred type and type from generated instructions do not match" ); |
| 1932 | #endif |
| 1933 | } |
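|      | // Illustrative example for the widening above (assuming VF=4): a scalar binop
|      | //   %r = add nsw i32 %a, %b
|      | // becomes
|      | //   %wide.r = add nsw <4 x i32> %wide.a, %wide.b
|      | // with flags and metadata re-applied via applyFlags/applyMetadata.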
| 1934 | |
| 1935 | InstructionCost VPWidenRecipe::computeCost(ElementCount VF, |
| 1936 | VPCostContext &Ctx) const { |
| 1937 | switch (Opcode) { |
| 1938 | case Instruction::FNeg: { |
| 1939 | Type *VectorTy = toVectorTy(Scalar: Ctx.Types.inferScalarType(V: this), EC: VF); |
| 1940 | return Ctx.TTI.getArithmeticInstrCost( |
| 1941 | Opcode, Ty: VectorTy, CostKind: Ctx.CostKind, |
| 1942 | Opd1Info: {.Kind: TargetTransformInfo::OK_AnyValue, .Properties: TargetTransformInfo::OP_None}, |
| 1943 | Opd2Info: {.Kind: TargetTransformInfo::OK_AnyValue, .Properties: TargetTransformInfo::OP_None}); |
| 1944 | } |
| 1945 | |
| 1946 | case Instruction::UDiv: |
| 1947 | case Instruction::SDiv: |
| 1948 | case Instruction::SRem: |
| 1949 | case Instruction::URem: |
| 1950 | // More complex computation, let the legacy cost-model handle this for now. |
| 1951 | return Ctx.getLegacyCost(UI: cast<Instruction>(Val: getUnderlyingValue()), VF); |
| 1952 | case Instruction::Add: |
| 1953 | case Instruction::FAdd: |
| 1954 | case Instruction::Sub: |
| 1955 | case Instruction::FSub: |
| 1956 | case Instruction::Mul: |
| 1957 | case Instruction::FMul: |
| 1958 | case Instruction::FDiv: |
| 1959 | case Instruction::FRem: |
| 1960 | case Instruction::Shl: |
| 1961 | case Instruction::LShr: |
| 1962 | case Instruction::AShr: |
| 1963 | case Instruction::And: |
| 1964 | case Instruction::Or: |
| 1965 | case Instruction::Xor: { |
| 1966 | VPValue *RHS = getOperand(N: 1); |
| 1967 | // Certain instructions can be cheaper to vectorize if they have a constant |
| 1968 | // second vector operand. One example of this are shifts on x86. |
| 1969 | TargetTransformInfo::OperandValueInfo RHSInfo = Ctx.getOperandInfo(V: RHS); |
| 1970 | |
| 1971 | if (RHSInfo.Kind == TargetTransformInfo::OK_AnyValue && |
| 1972 | getOperand(N: 1)->isDefinedOutsideLoopRegions()) |
| 1973 | RHSInfo.Kind = TargetTransformInfo::OK_UniformValue; |
| 1974 | Type *VectorTy = toVectorTy(Scalar: Ctx.Types.inferScalarType(V: this), EC: VF); |
| 1975 | Instruction *CtxI = dyn_cast_or_null<Instruction>(Val: getUnderlyingValue()); |
| 1976 | |
| 1977 | SmallVector<const Value *, 4> Operands; |
| 1978 | if (CtxI) |
| 1979 | Operands.append(in_start: CtxI->value_op_begin(), in_end: CtxI->value_op_end()); |
| 1980 | return Ctx.TTI.getArithmeticInstrCost( |
| 1981 | Opcode, Ty: VectorTy, CostKind: Ctx.CostKind, |
| 1982 | Opd1Info: {.Kind: TargetTransformInfo::OK_AnyValue, .Properties: TargetTransformInfo::OP_None}, |
| 1983 | Opd2Info: RHSInfo, Args: Operands, CxtI: CtxI, TLibInfo: &Ctx.TLI); |
| 1984 | } |
| 1985 | case Instruction::Freeze: { |
| 1986 | // Freeze has no dedicated cost entry; conservatively cost it like a 'mul'.
| 1987 | Type *VectorTy = toVectorTy(Scalar: Ctx.Types.inferScalarType(V: this), EC: VF); |
| 1988 | return Ctx.TTI.getArithmeticInstrCost(Opcode: Instruction::Mul, Ty: VectorTy, |
| 1989 | CostKind: Ctx.CostKind); |
| 1990 | } |
| 1991 | case Instruction::ExtractValue: { |
| 1992 | return Ctx.TTI.getInsertExtractValueCost(Opcode: Instruction::ExtractValue, |
| 1993 | CostKind: Ctx.CostKind); |
| 1994 | } |
| 1995 | case Instruction::ICmp: |
| 1996 | case Instruction::FCmp: { |
| 1997 | Instruction *CtxI = dyn_cast_or_null<Instruction>(Val: getUnderlyingValue()); |
| 1998 | Type *VectorTy = toVectorTy(Scalar: Ctx.Types.inferScalarType(V: getOperand(N: 0)), EC: VF); |
| 1999 | return Ctx.TTI.getCmpSelInstrCost( |
| 2000 | Opcode, ValTy: VectorTy, CondTy: CmpInst::makeCmpResultType(opnd_type: VectorTy), VecPred: getPredicate(), |
| 2001 | CostKind: Ctx.CostKind, Op1Info: {.Kind: TTI::OK_AnyValue, .Properties: TTI::OP_None}, |
| 2002 | Op2Info: {.Kind: TTI::OK_AnyValue, .Properties: TTI::OP_None}, I: CtxI); |
| 2003 | } |
| 2004 | default: |
| 2005 | llvm_unreachable("Unsupported opcode for instruction" ); |
| 2006 | } |
| 2007 | } |
| 2008 | |
| 2009 | #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) |
| 2010 | void VPWidenRecipe::print(raw_ostream &O, const Twine &Indent, |
| 2011 | VPSlotTracker &SlotTracker) const { |
| 2012 | O << Indent << "WIDEN " ; |
| 2013 | printAsOperand(O, SlotTracker); |
| 2014 | O << " = " << Instruction::getOpcodeName(Opcode); |
| 2015 | printFlags(O); |
| 2016 | printOperands(O, SlotTracker); |
| 2017 | } |
| 2018 | #endif |
| 2019 | |
| 2020 | void VPWidenCastRecipe::execute(VPTransformState &State) { |
| 2021 | auto &Builder = State.Builder; |
| 2022 | // Vectorize the cast.
| 2023 | assert(State.VF.isVector() && "Not vectorizing?" ); |
| 2024 | Type *DestTy = VectorType::get(ElementType: getResultType(), EC: State.VF); |
| 2025 | VPValue *Op = getOperand(N: 0); |
| 2026 | Value *A = State.get(Def: Op); |
| 2027 | Value *Cast = Builder.CreateCast(Op: Instruction::CastOps(Opcode), V: A, DestTy); |
| 2028 | State.set(Def: this, V: Cast); |
| 2029 | if (auto *CastOp = dyn_cast<Instruction>(Val: Cast)) { |
| 2030 | applyFlags(I&: *CastOp); |
| 2031 | applyMetadata(I&: *CastOp); |
| 2032 | } |
| 2033 | } |
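|      | // Illustrative example for the cast widening above (assuming VF=4):
|      | //   %e = zext i8 %x to i32   becomes   %wide.e = zext <4 x i8> %wide.x to <4 x i32>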
| 2034 | |
| 2035 | InstructionCost VPWidenCastRecipe::computeCost(ElementCount VF, |
| 2036 | VPCostContext &Ctx) const { |
| 2037 | // TODO: In some cases, VPWidenCastRecipes are created but not considered in |
| 2038 | // the legacy cost model, including truncates/extends when evaluating a |
| 2039 | // reduction in a smaller type. |
| 2040 | if (!getUnderlyingValue()) |
| 2041 | return 0; |
| 2042 | // Computes the CastContextHint from a recipe that may access memory.
| 2043 | auto ComputeCCH = [&](const VPRecipeBase *R) -> TTI::CastContextHint { |
| 2044 | if (VF.isScalar()) |
| 2045 | return TTI::CastContextHint::Normal; |
| 2046 | if (isa<VPInterleaveRecipe>(Val: R)) |
| 2047 | return TTI::CastContextHint::Interleave; |
| 2048 | if (const auto *ReplicateRecipe = dyn_cast<VPReplicateRecipe>(Val: R)) |
| 2049 | return ReplicateRecipe->isPredicated() ? TTI::CastContextHint::Masked |
| 2050 | : TTI::CastContextHint::Normal; |
| 2051 | const auto *WidenMemoryRecipe = dyn_cast<VPWidenMemoryRecipe>(Val: R); |
| 2052 | if (WidenMemoryRecipe == nullptr) |
| 2053 | return TTI::CastContextHint::None; |
| 2054 | if (!WidenMemoryRecipe->isConsecutive()) |
| 2055 | return TTI::CastContextHint::GatherScatter; |
| 2056 | if (WidenMemoryRecipe->isReverse()) |
| 2057 | return TTI::CastContextHint::Reversed; |
| 2058 | if (WidenMemoryRecipe->isMasked()) |
| 2059 | return TTI::CastContextHint::Masked; |
| 2060 | return TTI::CastContextHint::Normal; |
| 2061 | }; |
| 2062 | |
| 2063 | VPValue *Operand = getOperand(N: 0); |
| 2064 | TTI::CastContextHint CCH = TTI::CastContextHint::None; |
| 2065 | // For Trunc/FPTrunc, get the context from the only user. |
| 2066 | if ((Opcode == Instruction::Trunc || Opcode == Instruction::FPTrunc) && |
| 2067 | !hasMoreThanOneUniqueUser() && getNumUsers() > 0) { |
| 2068 | if (auto *StoreRecipe = dyn_cast<VPRecipeBase>(Val: *user_begin())) |
| 2069 | CCH = ComputeCCH(StoreRecipe); |
| 2070 | } |
| 2071 | // For Z/Sext, get the context from the operand. |
| 2072 | else if (Opcode == Instruction::ZExt || Opcode == Instruction::SExt || |
| 2073 | Opcode == Instruction::FPExt) { |
| 2074 | if (Operand->isLiveIn()) |
| 2075 | CCH = TTI::CastContextHint::Normal; |
| 2076 | else if (Operand->getDefiningRecipe()) |
| 2077 | CCH = ComputeCCH(Operand->getDefiningRecipe()); |
| 2078 | } |
| 2079 | |
| 2080 | auto *SrcTy = |
| 2081 | cast<VectorType>(Val: toVectorTy(Scalar: Ctx.Types.inferScalarType(V: Operand), EC: VF)); |
| 2082 | auto *DestTy = cast<VectorType>(Val: toVectorTy(Scalar: getResultType(), EC: VF)); |
| 2083 | // Arm TTI will use the underlying instruction to determine the cost. |
| 2084 | return Ctx.TTI.getCastInstrCost( |
| 2085 | Opcode, Dst: DestTy, Src: SrcTy, CCH, CostKind: Ctx.CostKind, |
| 2086 | I: dyn_cast_if_present<Instruction>(Val: getUnderlyingValue())); |
| 2087 | } |
| 2088 | |
| 2089 | #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) |
| 2090 | void VPWidenCastRecipe::print(raw_ostream &O, const Twine &Indent, |
| 2091 | VPSlotTracker &SlotTracker) const { |
| 2092 | O << Indent << "WIDEN-CAST " ; |
| 2093 | printAsOperand(O, SlotTracker); |
| 2094 | O << " = " << Instruction::getOpcodeName(Opcode); |
| 2095 | printFlags(O); |
| 2096 | printOperands(O, SlotTracker); |
| 2097 | O << " to " << *getResultType(); |
| 2098 | } |
| 2099 | #endif |
| 2100 | |
| 2101 | InstructionCost VPHeaderPHIRecipe::computeCost(ElementCount VF,
| 2102 | VPCostContext &Ctx) const { |
| 2103 | return Ctx.TTI.getCFInstrCost(Opcode: Instruction::PHI, CostKind: Ctx.CostKind); |
| 2104 | } |
| 2105 | |
| 2106 | /// A helper function that returns a signed integer or floating-point constant
| 2107 | /// with value C.
| 2108 | static Constant *getSignedIntOrFpConstant(Type *Ty, int64_t C) { |
| 2109 | return Ty->isIntegerTy() ? ConstantInt::getSigned(Ty, V: C) |
| 2110 | : ConstantFP::get(Ty, V: C); |
| 2111 | } |
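|      | // For example, getSignedIntOrFpConstant(i32, -1) yields i32 -1, while for a
|      | // floating-point type the same call yields e.g. float -1.0.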
| 2112 | |
| 2113 | #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) |
| 2114 | void VPWidenIntOrFpInductionRecipe::print(raw_ostream &O, const Twine &Indent, |
| 2115 | VPSlotTracker &SlotTracker) const { |
| 2116 | O << Indent; |
| 2117 | printAsOperand(O, SlotTracker); |
| 2118 | O << " = WIDEN-INDUCTION " ; |
| 2119 | printOperands(O, SlotTracker); |
| 2120 | |
| 2121 | if (auto *TI = getTruncInst()) |
| 2122 | O << " (truncated to " << *TI->getType() << ")" ; |
| 2123 | } |
| 2124 | #endif |
| 2125 | |
| 2126 | bool VPWidenIntOrFpInductionRecipe::isCanonical() const { |
| 2127 | // The step may be defined by a recipe in the preheader (e.g. if it requires |
| 2128 | // SCEV expansion), but for the canonical induction the step is required to be |
| 2129 | // 1, which is represented as live-in. |
| 2130 | if (getStepValue()->getDefiningRecipe()) |
| 2131 | return false; |
| 2132 | auto *StepC = dyn_cast<ConstantInt>(Val: getStepValue()->getLiveInIRValue()); |
| 2133 | auto *StartC = dyn_cast<ConstantInt>(Val: getStartValue()->getLiveInIRValue()); |
| 2134 | auto *CanIV = cast<VPCanonicalIVPHIRecipe>(Val: &*getParent()->begin()); |
| 2135 | return StartC && StartC->isZero() && StepC && StepC->isOne() && |
| 2136 | getScalarType() == CanIV->getScalarType(); |
| 2137 | } |
| 2138 | |
| 2139 | #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) |
| 2140 | void VPDerivedIVRecipe::print(raw_ostream &O, const Twine &Indent, |
| 2141 | VPSlotTracker &SlotTracker) const { |
| 2142 | O << Indent; |
| 2143 | printAsOperand(O, SlotTracker); |
| 2144 | O << " = DERIVED-IV " ; |
| 2145 | getStartValue()->printAsOperand(O, SlotTracker); |
| 2146 | O << " + " ; |
| 2147 | getOperand(1)->printAsOperand(O, SlotTracker); |
| 2148 | O << " * " ; |
| 2149 | getStepValue()->printAsOperand(O, SlotTracker); |
| 2150 | } |
| 2151 | #endif |
| 2152 | |
| 2153 | void VPScalarIVStepsRecipe::execute(VPTransformState &State) { |
| 2154 | // Fast-math-flags propagate from the original induction instruction. |
| 2155 | IRBuilder<>::FastMathFlagGuard FMFG(State.Builder); |
| 2156 | if (hasFastMathFlags()) |
| 2157 | State.Builder.setFastMathFlags(getFastMathFlags()); |
| 2158 | |
| 2159 | // Compute scalar induction steps: operand 0 provides the scalar induction
| 2160 | // value to base the steps on, and getStepValue() provides the step size.
| 2161 | |
| 2162 | Value *BaseIV = State.get(Def: getOperand(N: 0), Lane: VPLane(0)); |
| 2163 | Value *Step = State.get(Def: getStepValue(), Lane: VPLane(0)); |
| 2164 | IRBuilderBase &Builder = State.Builder; |
| 2165 | |
| 2166 | // Ensure step has the same type as that of scalar IV. |
| 2167 | Type *BaseIVTy = BaseIV->getType()->getScalarType(); |
| 2168 | assert(BaseIVTy == Step->getType() && "Types of BaseIV and Step must match!" ); |
| 2169 | |
| 2170 | // We build scalar steps for both integer and floating-point induction |
| 2171 | // variables. Here, we determine the kind of arithmetic we will perform. |
| 2172 | Instruction::BinaryOps AddOp; |
| 2173 | Instruction::BinaryOps MulOp; |
| 2174 | if (BaseIVTy->isIntegerTy()) { |
| 2175 | AddOp = Instruction::Add; |
| 2176 | MulOp = Instruction::Mul; |
| 2177 | } else { |
| 2178 | AddOp = InductionOpcode; |
| 2179 | MulOp = Instruction::FMul; |
| 2180 | } |
| 2181 | |
| 2182 | // Determine the number of scalars we need to generate for each unroll |
| 2183 | // iteration. |
| 2184 | bool FirstLaneOnly = vputils::onlyFirstLaneUsed(Def: this); |
| 2185 | // Compute the scalar steps and save the results in State. |
| 2186 | Type *IntStepTy = |
| 2187 | IntegerType::get(C&: BaseIVTy->getContext(), NumBits: BaseIVTy->getScalarSizeInBits()); |
| 2188 | Type *VecIVTy = nullptr; |
| 2189 | Value *UnitStepVec = nullptr, *SplatStep = nullptr, *SplatIV = nullptr; |
| 2190 | if (!FirstLaneOnly && State.VF.isScalable()) { |
| 2191 | VecIVTy = VectorType::get(ElementType: BaseIVTy, EC: State.VF); |
| 2192 | UnitStepVec = |
| 2193 | Builder.CreateStepVector(DstType: VectorType::get(ElementType: IntStepTy, EC: State.VF)); |
| 2194 | SplatStep = Builder.CreateVectorSplat(EC: State.VF, V: Step); |
| 2195 | SplatIV = Builder.CreateVectorSplat(EC: State.VF, V: BaseIV); |
| 2196 | } |
| 2197 | |
| 2198 | unsigned StartLane = 0; |
| 2199 | unsigned EndLane = FirstLaneOnly ? 1 : State.VF.getKnownMinValue(); |
| 2200 | if (State.Lane) { |
| 2201 | StartLane = State.Lane->getKnownLane(); |
| 2202 | EndLane = StartLane + 1; |
| 2203 | } |
| 2204 | Value *StartIdx0; |
| 2205 | if (getUnrollPart(U&: *this) == 0) |
| 2206 | StartIdx0 = ConstantInt::get(Ty: IntStepTy, V: 0); |
| 2207 | else { |
| 2208 | StartIdx0 = State.get(Def: getOperand(N: 2), IsScalar: true); |
| 2209 | if (getUnrollPart(U&: *this) != 1) { |
| 2210 | StartIdx0 = |
| 2211 | Builder.CreateMul(LHS: StartIdx0, RHS: ConstantInt::get(Ty: StartIdx0->getType(), |
| 2212 | V: getUnrollPart(U&: *this))); |
| 2213 | } |
| 2214 | StartIdx0 = Builder.CreateSExtOrTrunc(V: StartIdx0, DestTy: IntStepTy); |
| 2215 | } |
| 2216 | |
| 2217 | if (!FirstLaneOnly && State.VF.isScalable()) { |
| 2218 | auto *SplatStartIdx = Builder.CreateVectorSplat(EC: State.VF, V: StartIdx0); |
| 2219 | auto *InitVec = Builder.CreateAdd(LHS: SplatStartIdx, RHS: UnitStepVec); |
| 2220 | if (BaseIVTy->isFloatingPointTy()) |
| 2221 | InitVec = Builder.CreateSIToFP(V: InitVec, DestTy: VecIVTy); |
| 2222 | auto *Mul = Builder.CreateBinOp(Opc: MulOp, LHS: InitVec, RHS: SplatStep); |
| 2223 | auto *Add = Builder.CreateBinOp(Opc: AddOp, LHS: SplatIV, RHS: Mul); |
| 2224 | State.set(Def: this, V: Add); |
| 2225 | // It's useful to record the lane values too for the known minimum number |
| 2226 | // of elements so we do those below. This improves the code quality when |
| 2227 | // trying to extract the first element, for example. |
| 2228 | } |
| 2229 | |
| 2230 | if (BaseIVTy->isFloatingPointTy()) |
| 2231 | StartIdx0 = Builder.CreateSIToFP(V: StartIdx0, DestTy: BaseIVTy); |
| 2232 | |
| 2233 | for (unsigned Lane = StartLane; Lane < EndLane; ++Lane) { |
| 2234 | Value *StartIdx = Builder.CreateBinOp( |
| 2235 | Opc: AddOp, LHS: StartIdx0, RHS: getSignedIntOrFpConstant(Ty: BaseIVTy, C: Lane)); |
| 2236 | // The step returned by `createStepForVF` is a runtime-evaluated value |
| 2237 | // when VF is scalable. Otherwise, it should be folded into a Constant. |
| 2238 | assert((State.VF.isScalable() || isa<Constant>(StartIdx)) && |
| 2239 | "Expected StartIdx to be folded to a constant when VF is not " |
| 2240 | "scalable" ); |
| 2241 | auto *Mul = Builder.CreateBinOp(Opc: MulOp, LHS: StartIdx, RHS: Step); |
| 2242 | auto *Add = Builder.CreateBinOp(Opc: AddOp, LHS: BaseIV, RHS: Mul); |
| 2243 | State.set(Def: this, V: Add, Lane: VPLane(Lane)); |
| 2244 | } |
| 2245 | } |
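|      | // Illustrative example for the scalar steps above (assuming a fixed VF of 4,
|      | // unroll part 0, integer IV): with BaseIV = %iv and Step = %s the per-lane
|      | // values are %iv + 0*%s, %iv + 1*%s, %iv + 2*%s and %iv + 3*%s; with a
|      | // scalable VF the whole vector BaseIV + (StartIdx0 + stepvector) * Step is
|      | // recorded as well.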
| 2246 | |
| 2247 | #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) |
| 2248 | void VPScalarIVStepsRecipe::print(raw_ostream &O, const Twine &Indent, |
| 2249 | VPSlotTracker &SlotTracker) const { |
| 2250 | O << Indent; |
| 2251 | printAsOperand(O, SlotTracker); |
| 2252 | O << " = SCALAR-STEPS " ; |
| 2253 | printOperands(O, SlotTracker); |
| 2254 | } |
| 2255 | #endif |
| 2256 | |
| 2257 | void VPWidenGEPRecipe::execute(VPTransformState &State) { |
| 2258 | assert(State.VF.isVector() && "not widening" ); |
| 2259 | auto *GEP = cast<GetElementPtrInst>(Val: getUnderlyingInstr()); |
| 2260 | // Construct a vector GEP by widening the operands of the scalar GEP as |
| 2261 | // necessary. We mark the vector GEP 'inbounds' if appropriate. A GEP |
| 2262 | // results in a vector of pointers when at least one operand of the GEP |
| 2263 | // is vector-typed. Thus, to keep the representation compact, we only use |
| 2264 | // vector-typed operands for loop-varying values. |
| 2265 | |
| 2266 | if (areAllOperandsInvariant()) { |
| 2267 | // If we are vectorizing, but the GEP has only loop-invariant operands, |
| 2268 | // the GEP we build (by only using vector-typed operands for |
| 2269 | // loop-varying values) would be a scalar pointer. Thus, to ensure we |
| 2270 | // produce a vector of pointers, we need to either arbitrarily pick an |
| 2271 | // operand to broadcast, or broadcast a clone of the original GEP. |
| 2272 | // Here, we broadcast a clone of the original. |
| 2273 | // |
| 2274 | // TODO: If at some point we decide to scalarize instructions having |
| 2275 | // loop-invariant operands, this special case will no longer be |
| 2276 | // required. We would add the scalarization decision to |
| 2277 | // collectLoopScalars() and teach getVectorValue() to broadcast |
| 2278 | // the lane-zero scalar value. |
| 2279 | SmallVector<Value *> Ops; |
| 2280 | for (unsigned I = 0, E = getNumOperands(); I != E; I++) |
| 2281 | Ops.push_back(Elt: State.get(Def: getOperand(N: I), Lane: VPLane(0))); |
| 2282 | |
| 2283 | auto *NewGEP = State.Builder.CreateGEP(Ty: GEP->getSourceElementType(), Ptr: Ops[0], |
| 2284 | IdxList: ArrayRef(Ops).drop_front(), Name: "" , |
| 2285 | NW: getGEPNoWrapFlags()); |
| 2286 | Value *Splat = State.Builder.CreateVectorSplat(EC: State.VF, V: NewGEP); |
| 2287 | State.set(Def: this, V: Splat); |
| 2288 | } else { |
| 2289 | // If the GEP has at least one loop-varying operand, we are sure to |
| 2290 | // produce a vector of pointers unless VF is scalar. |
| 2291 | // The pointer operand of the new GEP. If it's loop-invariant, we |
| 2292 | // won't broadcast it. |
| 2293 | auto *Ptr = isPointerLoopInvariant() ? State.get(Def: getOperand(N: 0), Lane: VPLane(0)) |
| 2294 | : State.get(Def: getOperand(N: 0)); |
| 2295 | |
| 2296 | // Collect all the indices for the new GEP. If any index is |
| 2297 | // loop-invariant, we won't broadcast it. |
| 2298 | SmallVector<Value *, 4> Indices; |
| 2299 | for (unsigned I = 1, E = getNumOperands(); I < E; I++) { |
| 2300 | VPValue *Operand = getOperand(N: I); |
| 2301 | if (isIndexLoopInvariant(I: I - 1)) |
| 2302 | Indices.push_back(Elt: State.get(Def: Operand, Lane: VPLane(0))); |
| 2303 | else |
| 2304 | Indices.push_back(Elt: State.get(Def: Operand)); |
| 2305 | } |
| 2306 | |
| 2307 | // Create the new GEP. Note that this GEP may be a scalar if VF == 1, |
| 2308 | // but it should be a vector, otherwise. |
| 2309 | auto *NewGEP = State.Builder.CreateGEP(Ty: GEP->getSourceElementType(), Ptr, |
| 2310 | IdxList: Indices, Name: "" , NW: getGEPNoWrapFlags()); |
| 2311 | assert((State.VF.isScalar() || NewGEP->getType()->isVectorTy()) && |
| 2312 | "NewGEP is not a pointer vector" ); |
| 2313 | State.set(Def: this, V: NewGEP); |
| 2314 | } |
| 2315 | } |
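|      | // Illustrative example for the GEP widening above (assuming VF=4, hypothetical
|      | // names): an invariant base with a loop-varying index becomes
|      | //   %wide.gep = getelementptr i32, ptr %base, <4 x i64> %wide.idx
|      | // yielding a vector of pointers; if every operand is invariant, the scalar GEP
|      | // result is broadcast with a vector splat instead.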
| 2316 | |
| 2317 | #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) |
| 2318 | void VPWidenGEPRecipe::print(raw_ostream &O, const Twine &Indent, |
| 2319 | VPSlotTracker &SlotTracker) const { |
| 2320 | O << Indent << "WIDEN-GEP " ; |
| 2321 | O << (isPointerLoopInvariant() ? "Inv" : "Var" ); |
| 2322 | for (size_t I = 0; I < getNumOperands() - 1; ++I) |
| 2323 | O << "[" << (isIndexLoopInvariant(I) ? "Inv" : "Var" ) << "]" ; |
| 2324 | |
| 2325 | O << " " ; |
| 2326 | printAsOperand(O, SlotTracker); |
| 2327 | O << " = getelementptr" ; |
| 2328 | printFlags(O); |
| 2329 | printOperands(O, SlotTracker); |
| 2330 | } |
| 2331 | #endif |
| 2332 | |
| 2333 | static Type *getGEPIndexTy(bool IsScalable, bool IsReverse, bool IsUnitStride, |
| 2334 | unsigned CurrentPart, IRBuilderBase &Builder) { |
| 2335 | // Use i32 for the GEP index type when the offset is a compile-time constant
| 2336 | // (unit stride, no runtime scalable-VF term); otherwise query DataLayout for a more suitable index type.
| 2337 | const DataLayout &DL = Builder.GetInsertBlock()->getDataLayout(); |
| 2338 | return !IsUnitStride || (IsScalable && (IsReverse || CurrentPart > 0)) |
| 2339 | ? DL.getIndexType(PtrTy: Builder.getPtrTy(AddrSpace: 0)) |
| 2340 | : Builder.getInt32Ty(); |
| 2341 | } |
| 2342 | |
| 2343 | void VPVectorEndPointerRecipe::execute(VPTransformState &State) { |
| 2344 | auto &Builder = State.Builder; |
| 2345 | unsigned CurrentPart = getUnrollPart(U&: *this); |
| 2346 | bool IsUnitStride = Stride == 1 || Stride == -1; |
| 2347 | Type *IndexTy = getGEPIndexTy(IsScalable: State.VF.isScalable(), /*IsReverse*/ true, |
| 2348 | IsUnitStride, CurrentPart, Builder); |
| 2349 | |
| 2350 | // The reversed wide load or store needs to start at the last vector element.
| 2351 | Value *RunTimeVF = State.get(Def: getVFValue(), Lane: VPLane(0)); |
| 2352 | if (IndexTy != RunTimeVF->getType()) |
| 2353 | RunTimeVF = Builder.CreateZExtOrTrunc(V: RunTimeVF, DestTy: IndexTy); |
| 2354 | // NumElt = Stride * CurrentPart * RunTimeVF |
| 2355 | Value *NumElt = Builder.CreateMul( |
| 2356 | LHS: ConstantInt::get(Ty: IndexTy, V: Stride * (int64_t)CurrentPart), RHS: RunTimeVF); |
| 2357 | // LastLane = Stride * (RunTimeVF - 1) |
| 2358 | Value *LastLane = Builder.CreateSub(LHS: RunTimeVF, RHS: ConstantInt::get(Ty: IndexTy, V: 1)); |
| 2359 | if (Stride != 1) |
| 2360 | LastLane = Builder.CreateMul(LHS: ConstantInt::get(Ty: IndexTy, V: Stride), RHS: LastLane); |
| 2361 | Value *Ptr = State.get(Def: getOperand(N: 0), Lane: VPLane(0)); |
| 2362 | Value *ResultPtr = |
| 2363 | Builder.CreateGEP(Ty: IndexedTy, Ptr, IdxList: NumElt, Name: "" , NW: getGEPNoWrapFlags()); |
| 2364 | ResultPtr = Builder.CreateGEP(Ty: IndexedTy, Ptr: ResultPtr, IdxList: LastLane, Name: "" , |
| 2365 | NW: getGEPNoWrapFlags()); |
| 2366 | |
| 2367 | State.set(Def: this, V: ResultPtr, /*IsScalar*/ true); |
| 2368 | } |
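|      | // Worked example for the end-pointer computation above (assuming a fixed VF of
|      | // 4, Stride = -1, unroll part 0): RunTimeVF = 4, so NumElt = -1*0*4 = 0 and
|      | // LastLane = -(4 - 1) = -3; the result is the input pointer advanced by -3
|      | // elements, i.e. where the reversed wide access for this part starts.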
| 2369 | |
| 2370 | #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) |
| 2371 | void VPVectorEndPointerRecipe::print(raw_ostream &O, const Twine &Indent, |
| 2372 | VPSlotTracker &SlotTracker) const { |
| 2373 | O << Indent; |
| 2374 | printAsOperand(O, SlotTracker); |
| 2375 | O << " = vector-end-pointer" ; |
| 2376 | printFlags(O); |
| 2377 | printOperands(O, SlotTracker); |
| 2378 | } |
| 2379 | #endif |
| 2380 | |
| 2381 | void VPVectorPointerRecipe::execute(VPTransformState &State) { |
| 2382 | auto &Builder = State.Builder; |
| 2383 | unsigned CurrentPart = getUnrollPart(U&: *this); |
| 2384 | Type *IndexTy = getGEPIndexTy(IsScalable: State.VF.isScalable(), /*IsReverse*/ false, |
| 2385 | /*IsUnitStride*/ true, CurrentPart, Builder); |
| 2386 | Value *Ptr = State.get(Def: getOperand(N: 0), Lane: VPLane(0)); |
| 2387 | |
| 2388 | Value *Increment = createStepForVF(B&: Builder, Ty: IndexTy, VF: State.VF, Step: CurrentPart); |
| 2389 | Value *ResultPtr = |
| 2390 | Builder.CreateGEP(Ty: IndexedTy, Ptr, IdxList: Increment, Name: "" , NW: getGEPNoWrapFlags()); |
| 2391 | |
| 2392 | State.set(Def: this, V: ResultPtr, /*IsScalar*/ true); |
| 2393 | } |
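|      | // Illustrative example for the pointer computation above (assuming a fixed VF
|      | // of 4 and unroll part 1, with IndexedTy == i32): the increment folds to the
|      | // constant 4, emitting
|      | //   %part1.ptr = getelementptr i32, ptr %ptr, i32 4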
| 2394 | |
| 2395 | #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) |
| 2396 | void VPVectorPointerRecipe::print(raw_ostream &O, const Twine &Indent, |
| 2397 | VPSlotTracker &SlotTracker) const { |
| 2398 | O << Indent; |
| 2399 | printAsOperand(O, SlotTracker); |
| 2400 | O << " = vector-pointer " ; |
| 2401 | |
| 2402 | printOperands(O, SlotTracker); |
| 2403 | } |
| 2404 | #endif |
| 2405 | |
| 2406 | void VPBlendRecipe::execute(VPTransformState &State) { |
| 2407 | assert(isNormalized() && "Expected blend to be normalized!" ); |
| 2408 | // We know that all PHIs in non-header blocks are converted into |
| 2409 | // selects, so we don't have to worry about the insertion order and we |
| 2410 | // can just use the builder. |
| 2411 | // At this point we generate the predication tree. There may be |
| 2412 | // duplications since this is a simple recursive scan, but future |
| 2413 | // optimizations will clean it up. |
| 2414 | |
| 2415 | unsigned NumIncoming = getNumIncomingValues(); |
| 2416 | |
| 2417 | // Generate a sequence of selects of the form: |
| 2418 | // SELECT(Mask3, In3, |
| 2419 | // SELECT(Mask2, In2, |
| 2420 | // SELECT(Mask1, In1, |
| 2421 | // In0))) |
| 2422 | // Note that Mask0 is never used: lanes for which no path reaches this phi are
| 2423 | // essentially undef and take their value from In0.
| 2424 | bool OnlyFirstLaneUsed = vputils::onlyFirstLaneUsed(Def: this); |
| 2425 | Value *Result = nullptr; |
| 2426 | for (unsigned In = 0; In < NumIncoming; ++In) { |
| 2427 | // We might have single edge PHIs (blocks) - use an identity |
| 2428 | // 'select' for the first PHI operand. |
| 2429 | Value *In0 = State.get(Def: getIncomingValue(Idx: In), IsScalar: OnlyFirstLaneUsed); |
| 2430 | if (In == 0) |
| 2431 | Result = In0; // Initialize with the first incoming value. |
| 2432 | else { |
| 2433 | // Select between the current value and the previous incoming edge |
| 2434 | // based on the incoming mask. |
| 2435 | Value *Cond = State.get(Def: getMask(Idx: In), IsScalar: OnlyFirstLaneUsed); |
| 2436 | Result = State.Builder.CreateSelect(C: Cond, True: In0, False: Result, Name: "predphi" ); |
| 2437 | } |
| 2438 | } |
| 2439 | State.set(Def: this, V: Result, IsScalar: OnlyFirstLaneUsed); |
| 2440 | } |
| 2441 | |
| 2442 | InstructionCost VPBlendRecipe::computeCost(ElementCount VF, |
| 2443 | VPCostContext &Ctx) const { |
| 2444 | // Handle cases where only the first lane is used the same way as the legacy |
| 2445 | // cost model. |
| 2446 | if (vputils::onlyFirstLaneUsed(Def: this)) |
| 2447 | return Ctx.TTI.getCFInstrCost(Opcode: Instruction::PHI, CostKind: Ctx.CostKind); |
| 2448 | |
| 2449 | Type *ResultTy = toVectorTy(Scalar: Ctx.Types.inferScalarType(V: this), EC: VF); |
| 2450 | Type *CmpTy = toVectorTy(Scalar: Type::getInt1Ty(C&: Ctx.Types.getContext()), EC: VF); |
| 2451 | return (getNumIncomingValues() - 1) * |
| 2452 | Ctx.TTI.getCmpSelInstrCost(Opcode: Instruction::Select, ValTy: ResultTy, CondTy: CmpTy, |
| 2453 | VecPred: CmpInst::BAD_ICMP_PREDICATE, CostKind: Ctx.CostKind); |
| 2454 | } |
| 2455 | |
| 2456 | #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) |
| 2457 | void VPBlendRecipe::print(raw_ostream &O, const Twine &Indent, |
| 2458 | VPSlotTracker &SlotTracker) const { |
| 2459 | O << Indent << "BLEND " ; |
| 2460 | printAsOperand(O, SlotTracker); |
| 2461 | O << " =" ; |
| 2462 | if (getNumIncomingValues() == 1) { |
| 2463 | // Not a User of any mask: not really blending, this is a |
| 2464 | // single-predecessor phi. |
| 2465 | O << " " ; |
| 2466 | getIncomingValue(0)->printAsOperand(O, SlotTracker); |
| 2467 | } else { |
| 2468 | for (unsigned I = 0, E = getNumIncomingValues(); I < E; ++I) { |
| 2469 | O << " " ; |
| 2470 | getIncomingValue(I)->printAsOperand(O, SlotTracker); |
| 2471 | if (I == 0) |
| 2472 | continue; |
| 2473 | O << "/" ; |
| 2474 | getMask(I)->printAsOperand(O, SlotTracker); |
| 2475 | } |
| 2476 | } |
| 2477 | } |
| 2478 | #endif |
| 2479 | |
| 2480 | void VPReductionRecipe::execute(VPTransformState &State) { |
| 2481 | assert(!State.Lane && "Reduction being replicated." ); |
| 2482 | Value *PrevInChain = State.get(Def: getChainOp(), /*IsScalar*/ true); |
| 2483 | RecurKind Kind = getRecurrenceKind(); |
| 2484 | assert(!RecurrenceDescriptor::isAnyOfRecurrenceKind(Kind) && |
| 2485 | "In-loop AnyOf reductions aren't currently supported" ); |
| 2486 | // Propagate the fast-math flags carried by the underlying instruction. |
| 2487 | IRBuilderBase::FastMathFlagGuard FMFGuard(State.Builder); |
| 2488 | State.Builder.setFastMathFlags(getFastMathFlags()); |
| 2489 | Value *NewVecOp = State.get(Def: getVecOp()); |
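  // For conditional (masked) in-loop reductions, lanes where the condition is
  // false are replaced with the reduction identity so they do not affect the
  // result; e.g. for an integer add reduction the identity is 0, giving
  // roughly:
  //   %sel = select <4 x i1> %cond, <4 x i32> %vec, <4 x i32> zeroinitializer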
| 2490 | if (VPValue *Cond = getCondOp()) { |
| 2491 | Value *NewCond = State.get(Def: Cond, IsScalar: State.VF.isScalar()); |
| 2492 | VectorType *VecTy = dyn_cast<VectorType>(Val: NewVecOp->getType()); |
| 2493 | Type *ElementTy = VecTy ? VecTy->getElementType() : NewVecOp->getType(); |
| 2494 | |
| 2495 | Value *Start = getRecurrenceIdentity(K: Kind, Tp: ElementTy, FMF: getFastMathFlags()); |
| 2496 | if (State.VF.isVector()) |
| 2497 | Start = State.Builder.CreateVectorSplat(EC: VecTy->getElementCount(), V: Start); |
| 2498 | |
| 2499 | Value *Select = State.Builder.CreateSelect(C: NewCond, True: NewVecOp, False: Start); |
| 2500 | NewVecOp = Select; |
| 2501 | } |
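  // Ordered (in-order FP) reductions fold the vector operand directly into the
  // scalar chain value; unordered reductions first reduce the vector operand
  // and then combine the result with the previous chain value using the
  // reduction's binary op (or a min/max op).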
| 2502 | Value *NewRed; |
| 2503 | Value *NextInChain; |
| 2504 | if (IsOrdered) { |
| 2505 | if (State.VF.isVector()) |
| 2506 | NewRed = |
| 2507 | createOrderedReduction(B&: State.Builder, RdxKind: Kind, Src: NewVecOp, Start: PrevInChain); |
| 2508 | else |
| 2509 | NewRed = State.Builder.CreateBinOp( |
| 2510 | Opc: (Instruction::BinaryOps)RecurrenceDescriptor::getOpcode(Kind), |
| 2511 | LHS: PrevInChain, RHS: NewVecOp); |
| 2512 | PrevInChain = NewRed; |
| 2513 | NextInChain = NewRed; |
| 2514 | } else { |
| 2515 | PrevInChain = State.get(Def: getChainOp(), /*IsScalar*/ true); |
| 2516 | NewRed = createSimpleReduction(B&: State.Builder, Src: NewVecOp, RdxKind: Kind); |
| 2517 | if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) |
| 2518 | NextInChain = createMinMaxOp(Builder&: State.Builder, RK: Kind, Left: NewRed, Right: PrevInChain); |
| 2519 | else |
| 2520 | NextInChain = State.Builder.CreateBinOp( |
| 2521 | Opc: (Instruction::BinaryOps)RecurrenceDescriptor::getOpcode(Kind), LHS: NewRed, |
| 2522 | RHS: PrevInChain); |
| 2523 | } |
| 2524 | State.set(Def: this, V: NextInChain, /*IsScalar*/ true); |
| 2525 | } |
| 2526 | |
| 2527 | void VPReductionEVLRecipe::execute(VPTransformState &State) { |
| 2528 | assert(!State.Lane && "Reduction being replicated." ); |
| 2529 | |
| 2530 | auto &Builder = State.Builder; |
| 2531 | // Propagate the fast-math flags carried by the underlying instruction. |
| 2532 | IRBuilderBase::FastMathFlagGuard FMFGuard(Builder); |
| 2533 | Builder.setFastMathFlags(getFastMathFlags()); |
| 2534 | |
| 2535 | RecurKind Kind = getRecurrenceKind(); |
| 2536 | Value *Prev = State.get(Def: getChainOp(), /*IsScalar*/ true); |
| 2537 | Value *VecOp = State.get(Def: getVecOp()); |
| 2538 | Value *EVL = State.get(Def: getEVL(), Lane: VPLane(0)); |
| 2539 | |
| 2540 | Value *Mask; |
| 2541 | if (VPValue *CondOp = getCondOp()) |
| 2542 | Mask = State.get(Def: CondOp); |
| 2543 | else |
| 2544 | Mask = Builder.CreateVectorSplat(EC: State.VF, V: Builder.getTrue()); |
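  // The VP reduction intrinsics require a mask operand; when the recipe has no
  // condition, use an all-true mask so every lane up to EVL participates.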
| 2545 | |
| 2546 | Value *NewRed; |
| 2547 | if (isOrdered()) { |
| 2548 | NewRed = createOrderedReduction(B&: Builder, RdxKind: Kind, Src: VecOp, Start: Prev, Mask, EVL); |
| 2549 | } else { |
| 2550 | NewRed = createSimpleReduction(B&: Builder, Src: VecOp, RdxKind: Kind, Mask, EVL); |
| 2551 | if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) |
| 2552 | NewRed = createMinMaxOp(Builder, RK: Kind, Left: NewRed, Right: Prev); |
| 2553 | else |
| 2554 | NewRed = Builder.CreateBinOp( |
| 2555 | Opc: (Instruction::BinaryOps)RecurrenceDescriptor::getOpcode(Kind), LHS: NewRed, |
| 2556 | RHS: Prev); |
| 2557 | } |
| 2558 | State.set(Def: this, V: NewRed, /*IsScalar*/ true); |
| 2559 | } |
| 2560 | |
| 2561 | InstructionCost VPReductionRecipe::computeCost(ElementCount VF, |
| 2562 | VPCostContext &Ctx) const { |
| 2563 | RecurKind RdxKind = getRecurrenceKind(); |
| 2564 | Type *ElementTy = Ctx.Types.inferScalarType(V: this); |
| 2565 | auto *VectorTy = cast<VectorType>(Val: toVectorTy(Scalar: ElementTy, EC: VF)); |
| 2566 | unsigned Opcode = RecurrenceDescriptor::getOpcode(Kind: RdxKind); |
| 2567 | FastMathFlags FMFs = getFastMathFlags(); |
| 2568 | std::optional<FastMathFlags> OptionalFMF = |
| 2569 | ElementTy->isFloatingPointTy() ? std::make_optional(t&: FMFs) : std::nullopt; |
| 2570 | |
| 2571 | // TODO: Support any-of reductions. |
| 2572 | assert( |
| 2573 | (!RecurrenceDescriptor::isAnyOfRecurrenceKind(RdxKind) || |
| 2574 | ForceTargetInstructionCost.getNumOccurrences() > 0) && |
| 2575 | "Any-of reduction not implemented in VPlan-based cost model currently." ); |
| 2576 | |
  // Note that TTI should model the cost of moving the result to a scalar
  // register and the BinOp cost in getMinMaxReductionCost().
| 2579 | if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind: RdxKind)) { |
| 2580 | Intrinsic::ID Id = getMinMaxReductionIntrinsicOp(RK: RdxKind); |
| 2581 | return Ctx.TTI.getMinMaxReductionCost(IID: Id, Ty: VectorTy, FMF: FMFs, CostKind: Ctx.CostKind); |
| 2582 | } |
| 2583 | |
  // Note that TTI should model the cost of moving the result to a scalar
  // register and the BinOp cost in getArithmeticReductionCost().
| 2586 | return Ctx.TTI.getArithmeticReductionCost(Opcode, Ty: VectorTy, FMF: OptionalFMF, |
| 2587 | CostKind: Ctx.CostKind); |
| 2588 | } |
| 2589 | |
| 2590 | VPExpressionRecipe::VPExpressionRecipe( |
| 2591 | ExpressionTypes ExpressionType, |
| 2592 | ArrayRef<VPSingleDefRecipe *> ExpressionRecipes) |
| 2593 | : VPSingleDefRecipe(VPDef::VPExpressionSC, {}, {}), |
| 2594 | ExpressionRecipes(SetVector<VPSingleDefRecipe *>( |
| 2595 | ExpressionRecipes.begin(), ExpressionRecipes.end()) |
| 2596 | .takeVector()), |
| 2597 | ExpressionType(ExpressionType) { |
| 2598 | assert(!ExpressionRecipes.empty() && "Nothing to combine?" ); |
| 2599 | assert( |
| 2600 | none_of(ExpressionRecipes, |
| 2601 | [](VPSingleDefRecipe *R) { return R->mayHaveSideEffects(); }) && |
| 2602 | "expression cannot contain recipes with side-effects" ); |
| 2603 | |
| 2604 | // Maintain a copy of the expression recipes as a set of users. |
| 2605 | SmallPtrSet<VPUser *, 4> ExpressionRecipesAsSetOfUsers; |
| 2606 | for (auto *R : ExpressionRecipes) |
| 2607 | ExpressionRecipesAsSetOfUsers.insert(Ptr: R); |
| 2608 | |
| 2609 | // Recipes in the expression, except the last one, must only be used by |
| 2610 | // (other) recipes inside the expression. If there are other users, external |
| 2611 | // to the expression, use a clone of the recipe for external users. |
| 2612 | for (VPSingleDefRecipe *R : ExpressionRecipes) { |
| 2613 | if (R != ExpressionRecipes.back() && |
| 2614 | any_of(Range: R->users(), P: [&ExpressionRecipesAsSetOfUsers](VPUser *U) { |
| 2615 | return !ExpressionRecipesAsSetOfUsers.contains(Ptr: U); |
| 2616 | })) { |
      // There are users outside of the expression. Clone the recipe and use the
      // clone for those external users.
| 2619 | VPSingleDefRecipe *CopyForExtUsers = R->clone(); |
| 2620 | R->replaceUsesWithIf(New: CopyForExtUsers, ShouldReplace: [&ExpressionRecipesAsSetOfUsers]( |
| 2621 | VPUser &U, unsigned) { |
| 2622 | return !ExpressionRecipesAsSetOfUsers.contains(Ptr: &U); |
| 2623 | }); |
| 2624 | CopyForExtUsers->insertBefore(InsertPos: R); |
| 2625 | } |
| 2626 | if (R->getParent()) |
| 2627 | R->removeFromParent(); |
| 2628 | } |
| 2629 | |
| 2630 | // Internalize all external operands to the expression recipes. To do so, |
| 2631 | // create new temporary VPValues for all operands defined by a recipe outside |
| 2632 | // the expression. The original operands are added as operands of the |
| 2633 | // VPExpressionRecipe itself. |
| 2634 | for (auto *R : ExpressionRecipes) { |
| 2635 | for (const auto &[Idx, Op] : enumerate(First: R->operands())) { |
| 2636 | auto *Def = Op->getDefiningRecipe(); |
| 2637 | if (Def && ExpressionRecipesAsSetOfUsers.contains(Ptr: Def)) |
| 2638 | continue; |
| 2639 | addOperand(Operand: Op); |
| 2640 | LiveInPlaceholders.push_back(Elt: new VPValue()); |
| 2641 | R->setOperand(I: Idx, New: LiveInPlaceholders.back()); |
| 2642 | } |
| 2643 | } |
| 2644 | } |
| 2645 | |
| 2646 | void VPExpressionRecipe::decompose() { |
| 2647 | for (auto *R : ExpressionRecipes) |
| 2648 | R->insertBefore(InsertPos: this); |
| 2649 | |
| 2650 | for (const auto &[Idx, Op] : enumerate(First: operands())) |
| 2651 | LiveInPlaceholders[Idx]->replaceAllUsesWith(New: Op); |
| 2652 | |
| 2653 | replaceAllUsesWith(New: ExpressionRecipes.back()); |
| 2654 | ExpressionRecipes.clear(); |
| 2655 | } |
| 2656 | |
| 2657 | InstructionCost VPExpressionRecipe::computeCost(ElementCount VF, |
| 2658 | VPCostContext &Ctx) const { |
| 2659 | Type *RedTy = Ctx.Types.inferScalarType(V: this); |
| 2660 | auto *SrcVecTy = cast<VectorType>( |
| 2661 | Val: toVectorTy(Scalar: Ctx.Types.inferScalarType(V: getOperand(N: 0)), EC: VF)); |
| 2662 | assert(RedTy->isIntegerTy() && |
| 2663 | "VPExpressionRecipe only supports integer types currently." ); |
| 2664 | switch (ExpressionType) { |
| 2665 | case ExpressionTypes::ExtendedReduction: { |
| 2666 | unsigned Opcode = RecurrenceDescriptor::getOpcode( |
| 2667 | Kind: cast<VPReductionRecipe>(Val: ExpressionRecipes[1])->getRecurrenceKind()); |
| 2668 | return Ctx.TTI.getExtendedReductionCost( |
| 2669 | Opcode, |
| 2670 | IsUnsigned: cast<VPWidenCastRecipe>(Val: ExpressionRecipes.front())->getOpcode() == |
| 2671 | Instruction::ZExt, |
| 2672 | ResTy: RedTy, Ty: SrcVecTy, FMF: std::nullopt, CostKind: Ctx.CostKind); |
| 2673 | } |
| 2674 | case ExpressionTypes::MulAccReduction: |
| 2675 | return Ctx.TTI.getMulAccReductionCost(IsUnsigned: false, ResTy: RedTy, Ty: SrcVecTy, CostKind: Ctx.CostKind); |
| 2676 | |
| 2677 | case ExpressionTypes::ExtMulAccReduction: |
| 2678 | return Ctx.TTI.getMulAccReductionCost( |
| 2679 | IsUnsigned: cast<VPWidenCastRecipe>(Val: ExpressionRecipes.front())->getOpcode() == |
| 2680 | Instruction::ZExt, |
| 2681 | ResTy: RedTy, Ty: SrcVecTy, CostKind: Ctx.CostKind); |
| 2682 | } |
| 2683 | llvm_unreachable("Unknown VPExpressionRecipe::ExpressionTypes enum" ); |
| 2684 | } |
| 2685 | |
| 2686 | bool VPExpressionRecipe::mayReadOrWriteMemory() const { |
| 2687 | return any_of(Range: ExpressionRecipes, P: [](VPSingleDefRecipe *R) { |
| 2688 | return R->mayReadFromMemory() || R->mayWriteToMemory(); |
| 2689 | }); |
| 2690 | } |
| 2691 | |
| 2692 | bool VPExpressionRecipe::mayHaveSideEffects() const { |
| 2693 | assert( |
| 2694 | none_of(ExpressionRecipes, |
| 2695 | [](VPSingleDefRecipe *R) { return R->mayHaveSideEffects(); }) && |
| 2696 | "expression cannot contain recipes with side-effects" ); |
| 2697 | return false; |
| 2698 | } |
| 2699 | |
| 2700 | #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) |
| 2701 | |
| 2702 | void VPExpressionRecipe::print(raw_ostream &O, const Twine &Indent, |
| 2703 | VPSlotTracker &SlotTracker) const { |
| 2704 | O << Indent << "EXPRESSION " ; |
| 2705 | printAsOperand(O, SlotTracker); |
| 2706 | O << " = " ; |
| 2707 | auto *Red = cast<VPReductionRecipe>(ExpressionRecipes.back()); |
| 2708 | unsigned Opcode = RecurrenceDescriptor::getOpcode(Red->getRecurrenceKind()); |
| 2709 | |
| 2710 | switch (ExpressionType) { |
| 2711 | case ExpressionTypes::ExtendedReduction: { |
| 2712 | getOperand(1)->printAsOperand(O, SlotTracker); |
| 2713 | O << " +" ; |
| 2714 | O << " reduce." << Instruction::getOpcodeName(Opcode) << " (" ; |
| 2715 | getOperand(0)->printAsOperand(O, SlotTracker); |
| 2716 | Red->printFlags(O); |
| 2717 | |
| 2718 | auto *Ext0 = cast<VPWidenCastRecipe>(ExpressionRecipes[0]); |
| 2719 | O << Instruction::getOpcodeName(Ext0->getOpcode()) << " to " |
| 2720 | << *Ext0->getResultType(); |
| 2721 | if (Red->isConditional()) { |
| 2722 | O << ", " ; |
| 2723 | Red->getCondOp()->printAsOperand(O, SlotTracker); |
| 2724 | } |
| 2725 | O << ")" ; |
| 2726 | break; |
| 2727 | } |
| 2728 | case ExpressionTypes::MulAccReduction: |
| 2729 | case ExpressionTypes::ExtMulAccReduction: { |
| 2730 | getOperand(getNumOperands() - 1)->printAsOperand(O, SlotTracker); |
| 2731 | O << " + " ; |
| 2732 | O << "reduce." |
| 2733 | << Instruction::getOpcodeName( |
| 2734 | RecurrenceDescriptor::getOpcode(Red->getRecurrenceKind())) |
| 2735 | << " (" ; |
| 2736 | O << "mul" ; |
| 2737 | bool IsExtended = ExpressionType == ExpressionTypes::ExtMulAccReduction; |
| 2738 | auto *Mul = cast<VPWidenRecipe>(IsExtended ? ExpressionRecipes[2] |
| 2739 | : ExpressionRecipes[0]); |
| 2740 | Mul->printFlags(O); |
| 2741 | if (IsExtended) |
| 2742 | O << "(" ; |
| 2743 | getOperand(0)->printAsOperand(O, SlotTracker); |
| 2744 | if (IsExtended) { |
| 2745 | auto *Ext0 = cast<VPWidenCastRecipe>(ExpressionRecipes[0]); |
| 2746 | O << " " << Instruction::getOpcodeName(Ext0->getOpcode()) << " to " |
| 2747 | << *Ext0->getResultType() << "), (" ; |
| 2748 | } else { |
| 2749 | O << ", " ; |
| 2750 | } |
| 2751 | getOperand(1)->printAsOperand(O, SlotTracker); |
| 2752 | if (IsExtended) { |
| 2753 | auto *Ext1 = cast<VPWidenCastRecipe>(ExpressionRecipes[1]); |
| 2754 | O << " " << Instruction::getOpcodeName(Ext1->getOpcode()) << " to " |
| 2755 | << *Ext1->getResultType() << ")" ; |
| 2756 | } |
| 2757 | if (Red->isConditional()) { |
| 2758 | O << ", " ; |
| 2759 | Red->getCondOp()->printAsOperand(O, SlotTracker); |
| 2760 | } |
| 2761 | O << ")" ; |
| 2762 | break; |
| 2763 | } |
| 2764 | } |
| 2765 | } |
| 2766 | |
| 2767 | void VPReductionRecipe::print(raw_ostream &O, const Twine &Indent, |
| 2768 | VPSlotTracker &SlotTracker) const { |
| 2769 | O << Indent << "REDUCE " ; |
| 2770 | printAsOperand(O, SlotTracker); |
| 2771 | O << " = " ; |
| 2772 | getChainOp()->printAsOperand(O, SlotTracker); |
| 2773 | O << " +" ; |
| 2774 | printFlags(O); |
| 2775 | O << " reduce." |
| 2776 | << Instruction::getOpcodeName( |
| 2777 | RecurrenceDescriptor::getOpcode(getRecurrenceKind())) |
| 2778 | << " (" ; |
| 2779 | getVecOp()->printAsOperand(O, SlotTracker); |
| 2780 | if (isConditional()) { |
| 2781 | O << ", " ; |
| 2782 | getCondOp()->printAsOperand(O, SlotTracker); |
| 2783 | } |
| 2784 | O << ")" ; |
| 2785 | } |
| 2786 | |
| 2787 | void VPReductionEVLRecipe::print(raw_ostream &O, const Twine &Indent, |
| 2788 | VPSlotTracker &SlotTracker) const { |
| 2789 | O << Indent << "REDUCE " ; |
| 2790 | printAsOperand(O, SlotTracker); |
| 2791 | O << " = " ; |
| 2792 | getChainOp()->printAsOperand(O, SlotTracker); |
| 2793 | O << " +" ; |
| 2794 | printFlags(O); |
| 2795 | O << " vp.reduce." |
| 2796 | << Instruction::getOpcodeName( |
| 2797 | RecurrenceDescriptor::getOpcode(getRecurrenceKind())) |
| 2798 | << " (" ; |
| 2799 | getVecOp()->printAsOperand(O, SlotTracker); |
| 2800 | O << ", " ; |
| 2801 | getEVL()->printAsOperand(O, SlotTracker); |
| 2802 | if (isConditional()) { |
| 2803 | O << ", " ; |
| 2804 | getCondOp()->printAsOperand(O, SlotTracker); |
| 2805 | } |
| 2806 | O << ")" ; |
| 2807 | } |
| 2808 | |
| 2809 | #endif |
| 2810 | |
| 2811 | /// A helper function to scalarize a single Instruction in the innermost loop. |
| 2812 | /// Generates a sequence of scalar instances for lane \p Lane. Uses the VPValue |
| 2813 | /// operands from \p RepRecipe instead of \p Instr's operands. |
| 2814 | static void scalarizeInstruction(const Instruction *Instr, |
| 2815 | VPReplicateRecipe *RepRecipe, |
| 2816 | const VPLane &Lane, VPTransformState &State) { |
| 2817 | assert((!Instr->getType()->isAggregateType() || |
| 2818 | canVectorizeTy(Instr->getType())) && |
| 2819 | "Expected vectorizable or non-aggregate type." ); |
| 2820 | |
  // Does this instruction return a value?
| 2822 | bool IsVoidRetTy = Instr->getType()->isVoidTy(); |
| 2823 | |
| 2824 | Instruction *Cloned = Instr->clone(); |
| 2825 | if (!IsVoidRetTy) { |
| 2826 | Cloned->setName(Instr->getName() + ".cloned" ); |
| 2827 | #if !defined(NDEBUG) |
| 2828 | // Verify that VPlan type inference results agree with the type of the |
| 2829 | // generated values. |
| 2830 | assert(State.TypeAnalysis.inferScalarType(RepRecipe) == Cloned->getType() && |
| 2831 | "inferred type and type from generated instructions do not match" ); |
| 2832 | #endif |
| 2833 | } |
| 2834 | |
| 2835 | RepRecipe->applyFlags(I&: *Cloned); |
| 2836 | RepRecipe->applyMetadata(I&: *Cloned); |
| 2837 | |
| 2838 | if (auto DL = RepRecipe->getDebugLoc()) |
| 2839 | State.setDebugLocFrom(DL); |
| 2840 | |
| 2841 | // Replace the operands of the cloned instructions with their scalar |
| 2842 | // equivalents in the new loop. |
| 2843 | for (const auto &I : enumerate(First: RepRecipe->operands())) { |
| 2844 | auto InputLane = Lane; |
| 2845 | VPValue *Operand = I.value(); |
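    // Uniform (single-scalar) operands only have a lane-0 value, so always
    // read lane 0 for them, regardless of the lane being generated.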
| 2846 | if (vputils::isSingleScalar(VPV: Operand)) |
| 2847 | InputLane = VPLane::getFirstLane(); |
| 2848 | Cloned->setOperand(i: I.index(), Val: State.get(Def: Operand, Lane: InputLane)); |
| 2849 | } |
| 2850 | |
| 2851 | // Place the cloned scalar in the new loop. |
| 2852 | State.Builder.Insert(I: Cloned); |
| 2853 | |
| 2854 | State.set(Def: RepRecipe, V: Cloned, Lane); |
| 2855 | |
  // If we just cloned a new assumption, add it to the assumption cache.
| 2857 | if (auto *II = dyn_cast<AssumeInst>(Val: Cloned)) |
| 2858 | State.AC->registerAssumption(CI: II); |
| 2859 | |
| 2860 | assert( |
| 2861 | (RepRecipe->getParent()->getParent() || |
| 2862 | !RepRecipe->getParent()->getPlan()->getVectorLoopRegion() || |
| 2863 | all_of(RepRecipe->operands(), |
| 2864 | [](VPValue *Op) { return Op->isDefinedOutsideLoopRegions(); })) && |
| 2865 | "Expected a recipe is either within a region or all of its operands " |
| 2866 | "are defined outside the vectorized region." ); |
| 2867 | } |
| 2868 | |
| 2869 | void VPReplicateRecipe::execute(VPTransformState &State) { |
| 2870 | Instruction *UI = getUnderlyingInstr(); |
| 2871 | |
| 2872 | if (!State.Lane) { |
| 2873 | assert(IsSingleScalar && "VPReplicateRecipes outside replicate regions " |
| 2874 | "must have already been unrolled" ); |
| 2875 | scalarizeInstruction(Instr: UI, RepRecipe: this, Lane: VPLane(0), State); |
| 2876 | return; |
| 2877 | } |
| 2878 | |
| 2879 | assert((State.VF.isScalar() || !isSingleScalar()) && |
| 2880 | "uniform recipe shouldn't be predicated" ); |
| 2881 | assert(!State.VF.isScalable() && "Can't scalarize a scalable vector" ); |
| 2882 | scalarizeInstruction(Instr: UI, RepRecipe: this, Lane: *State.Lane, State); |
  // Insert the scalar instance, packing it into a vector if needed.
| 2884 | if (State.VF.isVector() && shouldPack()) { |
| 2885 | Value *WideValue = |
| 2886 | State.Lane->isFirstLane() |
| 2887 | ? PoisonValue::get(T: VectorType::get(ElementType: UI->getType(), EC: State.VF)) |
| 2888 | : State.get(Def: this); |
| 2889 | State.set(Def: this, V: State.packScalarIntoVectorizedValue(Def: this, WideValue, |
| 2890 | Lane: *State.Lane)); |
| 2891 | } |
| 2892 | } |
| 2893 | |
| 2894 | bool VPReplicateRecipe::shouldPack() const { |
| 2895 | // Find if the recipe is used by a widened recipe via an intervening |
| 2896 | // VPPredInstPHIRecipe. In this case, also pack the scalar values in a vector. |
| 2897 | return any_of(Range: users(), P: [](const VPUser *U) { |
| 2898 | if (auto *PredR = dyn_cast<VPPredInstPHIRecipe>(Val: U)) |
| 2899 | return any_of(Range: PredR->users(), P: [PredR](const VPUser *U) { |
| 2900 | return !U->usesScalars(Op: PredR); |
| 2901 | }); |
| 2902 | return false; |
| 2903 | }); |
| 2904 | } |
| 2905 | |
| 2906 | InstructionCost VPReplicateRecipe::computeCost(ElementCount VF, |
| 2907 | VPCostContext &Ctx) const { |
| 2908 | Instruction *UI = cast<Instruction>(Val: getUnderlyingValue()); |
  // A VPReplicateRecipe may be cloned as part of an existing VPlan-to-VPlan
  // transform; avoid computing its cost multiple times for now.
| 2911 | Ctx.SkipCostComputation.insert(Ptr: UI); |
| 2912 | |
| 2913 | TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; |
| 2914 | Type *ResultTy = Ctx.Types.inferScalarType(V: this); |
| 2915 | switch (UI->getOpcode()) { |
| 2916 | case Instruction::GetElementPtr: |
| 2917 | // We mark this instruction as zero-cost because the cost of GEPs in |
| 2918 | // vectorized code depends on whether the corresponding memory instruction |
| 2919 | // is scalarized or not. Therefore, we handle GEPs with the memory |
| 2920 | // instruction cost. |
| 2921 | return 0; |
| 2922 | case Instruction::Add: |
| 2923 | case Instruction::Sub: |
| 2924 | case Instruction::FAdd: |
| 2925 | case Instruction::FSub: |
| 2926 | case Instruction::Mul: |
| 2927 | case Instruction::FMul: |
| 2928 | case Instruction::FDiv: |
| 2929 | case Instruction::FRem: |
| 2930 | case Instruction::Shl: |
| 2931 | case Instruction::LShr: |
| 2932 | case Instruction::AShr: |
| 2933 | case Instruction::And: |
| 2934 | case Instruction::Or: |
| 2935 | case Instruction::Xor: { |
| 2936 | auto Op2Info = Ctx.getOperandInfo(V: getOperand(N: 1)); |
| 2937 | SmallVector<const Value *, 4> Operands(UI->operand_values()); |
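    // Scalarized arithmetic is costed as one scalar instruction per generated
    // lane (VF instances), or a single instance for single-scalar recipes.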
| 2938 | return Ctx.TTI.getArithmeticInstrCost( |
| 2939 | Opcode: UI->getOpcode(), Ty: ResultTy, CostKind, |
| 2940 | Opd1Info: {.Kind: TargetTransformInfo::OK_AnyValue, .Properties: TargetTransformInfo::OP_None}, |
| 2941 | Opd2Info: Op2Info, Args: Operands, CxtI: UI, TLibInfo: &Ctx.TLI) * |
| 2942 | (isSingleScalar() ? 1 : VF.getFixedValue()); |
| 2943 | } |
| 2944 | } |
| 2945 | |
| 2946 | return Ctx.getLegacyCost(UI, VF); |
| 2947 | } |
| 2948 | |
| 2949 | #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) |
| 2950 | void VPReplicateRecipe::print(raw_ostream &O, const Twine &Indent, |
| 2951 | VPSlotTracker &SlotTracker) const { |
| 2952 | O << Indent << (IsSingleScalar ? "CLONE " : "REPLICATE " ); |
| 2953 | |
| 2954 | if (!getUnderlyingInstr()->getType()->isVoidTy()) { |
| 2955 | printAsOperand(O, SlotTracker); |
| 2956 | O << " = " ; |
| 2957 | } |
| 2958 | if (auto *CB = dyn_cast<CallBase>(getUnderlyingInstr())) { |
| 2959 | O << "call" ; |
| 2960 | printFlags(O); |
| 2961 | O << "@" << CB->getCalledFunction()->getName() << "(" ; |
| 2962 | interleaveComma(make_range(op_begin(), op_begin() + (getNumOperands() - 1)), |
| 2963 | O, [&O, &SlotTracker](VPValue *Op) { |
| 2964 | Op->printAsOperand(O, SlotTracker); |
| 2965 | }); |
| 2966 | O << ")" ; |
| 2967 | } else { |
| 2968 | O << Instruction::getOpcodeName(getUnderlyingInstr()->getOpcode()); |
| 2969 | printFlags(O); |
| 2970 | printOperands(O, SlotTracker); |
| 2971 | } |
| 2972 | |
| 2973 | if (shouldPack()) |
| 2974 | O << " (S->V)" ; |
| 2975 | } |
| 2976 | #endif |
| 2977 | |
| 2978 | void VPBranchOnMaskRecipe::execute(VPTransformState &State) { |
| 2979 | assert(State.Lane && "Branch on Mask works only on single instance." ); |
| 2980 | |
| 2981 | VPValue *BlockInMask = getOperand(N: 0); |
| 2982 | Value *ConditionBit = State.get(Def: BlockInMask, Lane: *State.Lane); |
| 2983 | |
| 2984 | // Replace the temporary unreachable terminator with a new conditional branch, |
| 2985 | // whose two destinations will be set later when they are created. |
| 2986 | auto *CurrentTerminator = State.CFG.PrevBB->getTerminator(); |
| 2987 | assert(isa<UnreachableInst>(CurrentTerminator) && |
| 2988 | "Expected to replace unreachable terminator with conditional branch." ); |
| 2989 | auto CondBr = |
| 2990 | State.Builder.CreateCondBr(Cond: ConditionBit, True: State.CFG.PrevBB, False: nullptr); |
| 2991 | CondBr->setSuccessor(idx: 0, NewSucc: nullptr); |
| 2992 | CurrentTerminator->eraseFromParent(); |
| 2993 | } |
| 2994 | |
| 2995 | InstructionCost VPBranchOnMaskRecipe::computeCost(ElementCount VF, |
| 2996 | VPCostContext &Ctx) const { |
| 2997 | // The legacy cost model doesn't assign costs to branches for individual |
| 2998 | // replicate regions. Match the current behavior in the VPlan cost model for |
| 2999 | // now. |
| 3000 | return 0; |
| 3001 | } |
| 3002 | |
| 3003 | void VPPredInstPHIRecipe::execute(VPTransformState &State) { |
| 3004 | assert(State.Lane && "Predicated instruction PHI works per instance." ); |
| 3005 | Instruction *ScalarPredInst = |
| 3006 | cast<Instruction>(Val: State.get(Def: getOperand(N: 0), Lane: *State.Lane)); |
| 3007 | BasicBlock *PredicatedBB = ScalarPredInst->getParent(); |
| 3008 | BasicBlock *PredicatingBB = PredicatedBB->getSinglePredecessor(); |
| 3009 | assert(PredicatingBB && "Predicated block has no single predecessor." ); |
| 3010 | assert(isa<VPReplicateRecipe>(getOperand(0)) && |
| 3011 | "operand must be VPReplicateRecipe" ); |
| 3012 | |
| 3013 | // By current pack/unpack logic we need to generate only a single phi node: if |
| 3014 | // a vector value for the predicated instruction exists at this point it means |
| 3015 | // the instruction has vector users only, and a phi for the vector value is |
| 3016 | // needed. In this case the recipe of the predicated instruction is marked to |
| 3017 | // also do that packing, thereby "hoisting" the insert-element sequence. |
| 3018 | // Otherwise, a phi node for the scalar value is needed. |
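  // E.g. the vector case produces a phi roughly of the form (operand names are
  // illustrative):
  //   %vphi = phi <4 x i32> [ %unmodified.vec, %predicating.bb ],
  //                         [ %vec.with.inserted.elt, %predicated.bb ]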
| 3019 | if (State.hasVectorValue(Def: getOperand(N: 0))) { |
| 3020 | Value *VectorValue = State.get(Def: getOperand(N: 0)); |
| 3021 | InsertElementInst *IEI = cast<InsertElementInst>(Val: VectorValue); |
| 3022 | PHINode *VPhi = State.Builder.CreatePHI(Ty: IEI->getType(), NumReservedValues: 2); |
| 3023 | VPhi->addIncoming(V: IEI->getOperand(i_nocapture: 0), BB: PredicatingBB); // Unmodified vector. |
| 3024 | VPhi->addIncoming(V: IEI, BB: PredicatedBB); // New vector with inserted element. |
| 3025 | if (State.hasVectorValue(Def: this)) |
| 3026 | State.reset(Def: this, V: VPhi); |
| 3027 | else |
| 3028 | State.set(Def: this, V: VPhi); |
| 3029 | // NOTE: Currently we need to update the value of the operand, so the next |
| 3030 | // predicated iteration inserts its generated value in the correct vector. |
| 3031 | State.reset(Def: getOperand(N: 0), V: VPhi); |
| 3032 | } else { |
| 3033 | if (vputils::onlyFirstLaneUsed(Def: this) && !State.Lane->isFirstLane()) |
| 3034 | return; |
| 3035 | |
| 3036 | Type *PredInstType = State.TypeAnalysis.inferScalarType(V: getOperand(N: 0)); |
| 3037 | PHINode *Phi = State.Builder.CreatePHI(Ty: PredInstType, NumReservedValues: 2); |
| 3038 | Phi->addIncoming(V: PoisonValue::get(T: ScalarPredInst->getType()), |
| 3039 | BB: PredicatingBB); |
| 3040 | Phi->addIncoming(V: ScalarPredInst, BB: PredicatedBB); |
| 3041 | if (State.hasScalarValue(Def: this, Lane: *State.Lane)) |
| 3042 | State.reset(Def: this, V: Phi, Lane: *State.Lane); |
| 3043 | else |
| 3044 | State.set(Def: this, V: Phi, Lane: *State.Lane); |
| 3045 | // NOTE: Currently we need to update the value of the operand, so the next |
| 3046 | // predicated iteration inserts its generated value in the correct vector. |
| 3047 | State.reset(Def: getOperand(N: 0), V: Phi, Lane: *State.Lane); |
| 3048 | } |
| 3049 | } |
| 3050 | |
| 3051 | #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) |
| 3052 | void VPPredInstPHIRecipe::print(raw_ostream &O, const Twine &Indent, |
| 3053 | VPSlotTracker &SlotTracker) const { |
| 3054 | O << Indent << "PHI-PREDICATED-INSTRUCTION " ; |
| 3055 | printAsOperand(O, SlotTracker); |
| 3056 | O << " = " ; |
| 3057 | printOperands(O, SlotTracker); |
| 3058 | } |
| 3059 | #endif |
| 3060 | |
| 3061 | InstructionCost VPWidenMemoryRecipe::computeCost(ElementCount VF, |
| 3062 | VPCostContext &Ctx) const { |
| 3063 | Type *Ty = toVectorTy(Scalar: getLoadStoreType(I: &Ingredient), EC: VF); |
| 3064 | const Align Alignment = |
| 3065 | getLoadStoreAlignment(I: const_cast<Instruction *>(&Ingredient)); |
| 3066 | unsigned AS = cast<PointerType>(Val: Ctx.Types.inferScalarType(V: getAddr())) |
| 3067 | ->getAddressSpace(); |
| 3068 | unsigned Opcode = isa<VPWidenLoadRecipe, VPWidenLoadEVLRecipe>(Val: this) |
| 3069 | ? Instruction::Load |
| 3070 | : Instruction::Store; |
| 3071 | |
| 3072 | if (!Consecutive) { |
    // TODO: Using the original IR may not be accurate.
    // Currently, the ARM backend uses the underlying IR to calculate the
    // gather/scatter instruction cost.
| 3076 | const Value *Ptr = getLoadStorePointerOperand(V: &Ingredient); |
| 3077 | assert(!Reverse && |
| 3078 | "Inconsecutive memory access should not have the order." ); |
| 3079 | return Ctx.TTI.getAddressComputationCost(Ty) + |
| 3080 | Ctx.TTI.getGatherScatterOpCost(Opcode, DataTy: Ty, Ptr, VariableMask: IsMasked, Alignment, |
| 3081 | CostKind: Ctx.CostKind, I: &Ingredient); |
| 3082 | } |
| 3083 | |
| 3084 | InstructionCost Cost = 0; |
| 3085 | if (IsMasked) { |
| 3086 | Cost += |
| 3087 | Ctx.TTI.getMaskedMemoryOpCost(Opcode, Src: Ty, Alignment, AddressSpace: AS, CostKind: Ctx.CostKind); |
| 3088 | } else { |
| 3089 | TTI::OperandValueInfo OpInfo = Ctx.getOperandInfo( |
| 3090 | V: isa<VPWidenLoadRecipe, VPWidenLoadEVLRecipe>(Val: this) ? getOperand(N: 0) |
| 3091 | : getOperand(N: 1)); |
| 3092 | Cost += Ctx.TTI.getMemoryOpCost(Opcode, Src: Ty, Alignment, AddressSpace: AS, CostKind: Ctx.CostKind, |
| 3093 | OpdInfo: OpInfo, I: &Ingredient); |
| 3094 | } |
| 3095 | if (!Reverse) |
| 3096 | return Cost; |
| 3097 | |
| 3098 | return Cost += Ctx.TTI.getShuffleCost( |
| 3099 | Kind: TargetTransformInfo::SK_Reverse, DstTy: cast<VectorType>(Val: Ty), |
| 3100 | SrcTy: cast<VectorType>(Val: Ty), Mask: {}, CostKind: Ctx.CostKind, Index: 0); |
| 3101 | } |
| 3102 | |
| 3103 | void VPWidenLoadRecipe::execute(VPTransformState &State) { |
| 3104 | Type *ScalarDataTy = getLoadStoreType(I: &Ingredient); |
| 3105 | auto *DataTy = VectorType::get(ElementType: ScalarDataTy, EC: State.VF); |
| 3106 | const Align Alignment = getLoadStoreAlignment(I: &Ingredient); |
| 3107 | bool CreateGather = !isConsecutive(); |
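  // Non-consecutive accesses are widened to a gather; consecutive accesses use
  // a wide (possibly masked) load, reversed afterwards if the access is
  // reverse-consecutive.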
| 3108 | |
| 3109 | auto &Builder = State.Builder; |
| 3110 | Value *Mask = nullptr; |
| 3111 | if (auto *VPMask = getMask()) { |
    // Mask reversal is only needed for non-all-one masks; an all-one mask is
    // represented by a null VPValue, and reversing it would yield the same
    // all-one (null) mask.
| 3114 | Mask = State.get(Def: VPMask); |
| 3115 | if (isReverse()) |
| 3116 | Mask = Builder.CreateVectorReverse(V: Mask, Name: "reverse" ); |
| 3117 | } |
| 3118 | |
| 3119 | Value *Addr = State.get(Def: getAddr(), /*IsScalar*/ !CreateGather); |
| 3120 | Value *NewLI; |
| 3121 | if (CreateGather) { |
| 3122 | NewLI = Builder.CreateMaskedGather(Ty: DataTy, Ptrs: Addr, Alignment, Mask, PassThru: nullptr, |
| 3123 | Name: "wide.masked.gather" ); |
| 3124 | } else if (Mask) { |
| 3125 | NewLI = |
| 3126 | Builder.CreateMaskedLoad(Ty: DataTy, Ptr: Addr, Alignment, Mask, |
| 3127 | PassThru: PoisonValue::get(T: DataTy), Name: "wide.masked.load" ); |
| 3128 | } else { |
| 3129 | NewLI = Builder.CreateAlignedLoad(Ty: DataTy, Ptr: Addr, Align: Alignment, Name: "wide.load" ); |
| 3130 | } |
| 3131 | applyMetadata(I&: *cast<Instruction>(Val: NewLI)); |
| 3132 | if (Reverse) |
| 3133 | NewLI = Builder.CreateVectorReverse(V: NewLI, Name: "reverse" ); |
| 3134 | State.set(Def: this, V: NewLI); |
| 3135 | } |
| 3136 | |
| 3137 | #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) |
| 3138 | void VPWidenLoadRecipe::print(raw_ostream &O, const Twine &Indent, |
| 3139 | VPSlotTracker &SlotTracker) const { |
| 3140 | O << Indent << "WIDEN " ; |
| 3141 | printAsOperand(O, SlotTracker); |
| 3142 | O << " = load " ; |
| 3143 | printOperands(O, SlotTracker); |
| 3144 | } |
| 3145 | #endif |
| 3146 | |
/// Use an all-true mask for the reverse rather than the actual mask, as it
/// avoids a dependence without affecting the result.
| 3149 | static Instruction *createReverseEVL(IRBuilderBase &Builder, Value *Operand, |
| 3150 | Value *EVL, const Twine &Name) { |
| 3151 | VectorType *ValTy = cast<VectorType>(Val: Operand->getType()); |
| 3152 | Value *AllTrueMask = |
| 3153 | Builder.CreateVectorSplat(EC: ValTy->getElementCount(), V: Builder.getTrue()); |
| 3154 | return Builder.CreateIntrinsic(RetTy: ValTy, ID: Intrinsic::experimental_vp_reverse, |
| 3155 | Args: {Operand, AllTrueMask, EVL}, FMFSource: nullptr, Name); |
| 3156 | } |
| 3157 | |
| 3158 | void VPWidenLoadEVLRecipe::execute(VPTransformState &State) { |
| 3159 | Type *ScalarDataTy = getLoadStoreType(I: &Ingredient); |
| 3160 | auto *DataTy = VectorType::get(ElementType: ScalarDataTy, EC: State.VF); |
| 3161 | const Align Alignment = getLoadStoreAlignment(I: &Ingredient); |
| 3162 | bool CreateGather = !isConsecutive(); |
| 3163 | |
| 3164 | auto &Builder = State.Builder; |
| 3165 | CallInst *NewLI; |
| 3166 | Value *EVL = State.get(Def: getEVL(), Lane: VPLane(0)); |
| 3167 | Value *Addr = State.get(Def: getAddr(), IsScalar: !CreateGather); |
| 3168 | Value *Mask = nullptr; |
| 3169 | if (VPValue *VPMask = getMask()) { |
| 3170 | Mask = State.get(Def: VPMask); |
| 3171 | if (isReverse()) |
| 3172 | Mask = createReverseEVL(Builder, Operand: Mask, EVL, Name: "vp.reverse.mask" ); |
| 3173 | } else { |
| 3174 | Mask = Builder.CreateVectorSplat(EC: State.VF, V: Builder.getTrue()); |
| 3175 | } |
| 3176 | |
| 3177 | if (CreateGather) { |
| 3178 | NewLI = |
| 3179 | Builder.CreateIntrinsic(RetTy: DataTy, ID: Intrinsic::vp_gather, Args: {Addr, Mask, EVL}, |
| 3180 | FMFSource: nullptr, Name: "wide.masked.gather" ); |
| 3181 | } else { |
| 3182 | NewLI = Builder.CreateIntrinsic(RetTy: DataTy, ID: Intrinsic::vp_load, |
| 3183 | Args: {Addr, Mask, EVL}, FMFSource: nullptr, Name: "vp.op.load" ); |
| 3184 | } |
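  // At this point NewLI is a vp.load or vp.gather call, e.g. (assuming i32
  // elements and a scalable VF of 4):
  //   %vp.op.load = call <vscale x 4 x i32> @llvm.vp.load.nxv4i32.p0(
  //       ptr %addr, <vscale x 4 x i1> %mask, i32 %evl)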
| 3185 | NewLI->addParamAttr( |
| 3186 | ArgNo: 0, Attr: Attribute::getWithAlignment(Context&: NewLI->getContext(), Alignment)); |
| 3187 | applyMetadata(I&: *NewLI); |
| 3188 | Instruction *Res = NewLI; |
| 3189 | if (isReverse()) |
| 3190 | Res = createReverseEVL(Builder, Operand: Res, EVL, Name: "vp.reverse" ); |
| 3191 | State.set(Def: this, V: Res); |
| 3192 | } |
| 3193 | |
| 3194 | InstructionCost VPWidenLoadEVLRecipe::computeCost(ElementCount VF, |
| 3195 | VPCostContext &Ctx) const { |
| 3196 | if (!Consecutive || IsMasked) |
| 3197 | return VPWidenMemoryRecipe::computeCost(VF, Ctx); |
| 3198 | |
  // We need to use getMaskedMemoryOpCost() instead of getMemoryOpCost() here
  // because the EVL recipes use EVL to replace the tail mask, while the legacy
  // model always accounts for the cost of the mask.
  // TODO: Use getMemoryOpCost() instead of getMaskedMemoryOpCost() once we no
  // longer need to compare against the legacy cost model.
| 3204 | Type *Ty = toVectorTy(Scalar: getLoadStoreType(I: &Ingredient), EC: VF); |
| 3205 | const Align Alignment = |
| 3206 | getLoadStoreAlignment(I: const_cast<Instruction *>(&Ingredient)); |
| 3207 | unsigned AS = |
| 3208 | getLoadStoreAddressSpace(I: const_cast<Instruction *>(&Ingredient)); |
| 3209 | InstructionCost Cost = Ctx.TTI.getMaskedMemoryOpCost( |
| 3210 | Opcode: Instruction::Load, Src: Ty, Alignment, AddressSpace: AS, CostKind: Ctx.CostKind); |
| 3211 | if (!Reverse) |
| 3212 | return Cost; |
| 3213 | |
| 3214 | return Cost + Ctx.TTI.getShuffleCost( |
| 3215 | Kind: TargetTransformInfo::SK_Reverse, DstTy: cast<VectorType>(Val: Ty), |
| 3216 | SrcTy: cast<VectorType>(Val: Ty), Mask: {}, CostKind: Ctx.CostKind, Index: 0); |
| 3217 | } |
| 3218 | |
| 3219 | #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) |
| 3220 | void VPWidenLoadEVLRecipe::print(raw_ostream &O, const Twine &Indent, |
| 3221 | VPSlotTracker &SlotTracker) const { |
| 3222 | O << Indent << "WIDEN " ; |
| 3223 | printAsOperand(O, SlotTracker); |
| 3224 | O << " = vp.load " ; |
| 3225 | printOperands(O, SlotTracker); |
| 3226 | } |
| 3227 | #endif |
| 3228 | |
| 3229 | void VPWidenStoreRecipe::execute(VPTransformState &State) { |
| 3230 | VPValue *StoredVPValue = getStoredValue(); |
| 3231 | bool CreateScatter = !isConsecutive(); |
| 3232 | const Align Alignment = getLoadStoreAlignment(I: &Ingredient); |
| 3233 | |
| 3234 | auto &Builder = State.Builder; |
| 3235 | |
| 3236 | Value *Mask = nullptr; |
| 3237 | if (auto *VPMask = getMask()) { |
    // Mask reversal is only needed for non-all-one masks; an all-one mask is
    // represented by a null VPValue, and reversing it would yield the same
    // all-one (null) mask.
| 3240 | Mask = State.get(Def: VPMask); |
| 3241 | if (isReverse()) |
| 3242 | Mask = Builder.CreateVectorReverse(V: Mask, Name: "reverse" ); |
| 3243 | } |
| 3244 | |
| 3245 | Value *StoredVal = State.get(Def: StoredVPValue); |
| 3246 | if (isReverse()) { |
| 3247 | // If we store to reverse consecutive memory locations, then we need |
| 3248 | // to reverse the order of elements in the stored value. |
| 3249 | StoredVal = Builder.CreateVectorReverse(V: StoredVal, Name: "reverse" ); |
| 3250 | // We don't want to update the value in the map as it might be used in |
| 3251 | // another expression. So don't call resetVectorValue(StoredVal). |
| 3252 | } |
| 3253 | Value *Addr = State.get(Def: getAddr(), /*IsScalar*/ !CreateScatter); |
| 3254 | Instruction *NewSI = nullptr; |
| 3255 | if (CreateScatter) |
| 3256 | NewSI = Builder.CreateMaskedScatter(Val: StoredVal, Ptrs: Addr, Alignment, Mask); |
| 3257 | else if (Mask) |
| 3258 | NewSI = Builder.CreateMaskedStore(Val: StoredVal, Ptr: Addr, Alignment, Mask); |
| 3259 | else |
| 3260 | NewSI = Builder.CreateAlignedStore(Val: StoredVal, Ptr: Addr, Align: Alignment); |
| 3261 | applyMetadata(I&: *NewSI); |
| 3262 | } |
| 3263 | |
| 3264 | #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) |
| 3265 | void VPWidenStoreRecipe::print(raw_ostream &O, const Twine &Indent, |
| 3266 | VPSlotTracker &SlotTracker) const { |
| 3267 | O << Indent << "WIDEN store " ; |
| 3268 | printOperands(O, SlotTracker); |
| 3269 | } |
| 3270 | #endif |
| 3271 | |
| 3272 | void VPWidenStoreEVLRecipe::execute(VPTransformState &State) { |
| 3273 | VPValue *StoredValue = getStoredValue(); |
| 3274 | bool CreateScatter = !isConsecutive(); |
| 3275 | const Align Alignment = getLoadStoreAlignment(I: &Ingredient); |
| 3276 | |
| 3277 | auto &Builder = State.Builder; |
| 3278 | |
| 3279 | CallInst *NewSI = nullptr; |
| 3280 | Value *StoredVal = State.get(Def: StoredValue); |
| 3281 | Value *EVL = State.get(Def: getEVL(), Lane: VPLane(0)); |
| 3282 | if (isReverse()) |
| 3283 | StoredVal = createReverseEVL(Builder, Operand: StoredVal, EVL, Name: "vp.reverse" ); |
| 3284 | Value *Mask = nullptr; |
| 3285 | if (VPValue *VPMask = getMask()) { |
| 3286 | Mask = State.get(Def: VPMask); |
| 3287 | if (isReverse()) |
| 3288 | Mask = createReverseEVL(Builder, Operand: Mask, EVL, Name: "vp.reverse.mask" ); |
| 3289 | } else { |
| 3290 | Mask = Builder.CreateVectorSplat(EC: State.VF, V: Builder.getTrue()); |
| 3291 | } |
| 3292 | Value *Addr = State.get(Def: getAddr(), IsScalar: !CreateScatter); |
| 3293 | if (CreateScatter) { |
| 3294 | NewSI = Builder.CreateIntrinsic(RetTy: Type::getVoidTy(C&: EVL->getContext()), |
| 3295 | ID: Intrinsic::vp_scatter, |
| 3296 | Args: {StoredVal, Addr, Mask, EVL}); |
| 3297 | } else { |
| 3298 | NewSI = Builder.CreateIntrinsic(RetTy: Type::getVoidTy(C&: EVL->getContext()), |
| 3299 | ID: Intrinsic::vp_store, |
| 3300 | Args: {StoredVal, Addr, Mask, EVL}); |
| 3301 | } |
| 3302 | NewSI->addParamAttr( |
| 3303 | ArgNo: 1, Attr: Attribute::getWithAlignment(Context&: NewSI->getContext(), Alignment)); |
| 3304 | applyMetadata(I&: *NewSI); |
| 3305 | } |
| 3306 | |
| 3307 | InstructionCost VPWidenStoreEVLRecipe::computeCost(ElementCount VF, |
| 3308 | VPCostContext &Ctx) const { |
| 3309 | if (!Consecutive || IsMasked) |
| 3310 | return VPWidenMemoryRecipe::computeCost(VF, Ctx); |
| 3311 | |
  // We need to use getMaskedMemoryOpCost() instead of getMemoryOpCost() here
  // because the EVL recipes use EVL to replace the tail mask, while the legacy
  // model always accounts for the cost of the mask.
  // TODO: Use getMemoryOpCost() instead of getMaskedMemoryOpCost() once we no
  // longer need to compare against the legacy cost model.
| 3317 | Type *Ty = toVectorTy(Scalar: getLoadStoreType(I: &Ingredient), EC: VF); |
| 3318 | const Align Alignment = |
| 3319 | getLoadStoreAlignment(I: const_cast<Instruction *>(&Ingredient)); |
| 3320 | unsigned AS = |
| 3321 | getLoadStoreAddressSpace(I: const_cast<Instruction *>(&Ingredient)); |
| 3322 | InstructionCost Cost = Ctx.TTI.getMaskedMemoryOpCost( |
| 3323 | Opcode: Instruction::Store, Src: Ty, Alignment, AddressSpace: AS, CostKind: Ctx.CostKind); |
| 3324 | if (!Reverse) |
| 3325 | return Cost; |
| 3326 | |
| 3327 | return Cost + Ctx.TTI.getShuffleCost( |
| 3328 | Kind: TargetTransformInfo::SK_Reverse, DstTy: cast<VectorType>(Val: Ty), |
| 3329 | SrcTy: cast<VectorType>(Val: Ty), Mask: {}, CostKind: Ctx.CostKind, Index: 0); |
| 3330 | } |
| 3331 | |
| 3332 | #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) |
| 3333 | void VPWidenStoreEVLRecipe::print(raw_ostream &O, const Twine &Indent, |
| 3334 | VPSlotTracker &SlotTracker) const { |
| 3335 | O << Indent << "WIDEN vp.store " ; |
| 3336 | printOperands(O, SlotTracker); |
| 3337 | } |
| 3338 | #endif |
| 3339 | |
| 3340 | static Value *createBitOrPointerCast(IRBuilderBase &Builder, Value *V, |
| 3341 | VectorType *DstVTy, const DataLayout &DL) { |
  // Verify that V is a vector type with the same number of elements as DstVTy.
| 3343 | auto VF = DstVTy->getElementCount(); |
| 3344 | auto *SrcVecTy = cast<VectorType>(Val: V->getType()); |
| 3345 | assert(VF == SrcVecTy->getElementCount() && "Vector dimensions do not match" ); |
| 3346 | Type *SrcElemTy = SrcVecTy->getElementType(); |
| 3347 | Type *DstElemTy = DstVTy->getElementType(); |
| 3348 | assert((DL.getTypeSizeInBits(SrcElemTy) == DL.getTypeSizeInBits(DstElemTy)) && |
| 3349 | "Vector elements must have same size" ); |
| 3350 | |
| 3351 | // Do a direct cast if element types are castable. |
| 3352 | if (CastInst::isBitOrNoopPointerCastable(SrcTy: SrcElemTy, DestTy: DstElemTy, DL)) { |
| 3353 | return Builder.CreateBitOrPointerCast(V, DestTy: DstVTy); |
| 3354 | } |
  // V cannot be directly cast to the desired vector type. This may happen when
  // V is a floating point vector but DstVTy is a vector of pointers, or
  // vice-versa. Handle this with a two-step bitcast through an intermediate
  // integer type, i.e. Ptr <-> Int <-> Float.
| 3359 | assert((DstElemTy->isPointerTy() != SrcElemTy->isPointerTy()) && |
| 3360 | "Only one type should be a pointer type" ); |
| 3361 | assert((DstElemTy->isFloatingPointTy() != SrcElemTy->isFloatingPointTy()) && |
| 3362 | "Only one type should be a floating point type" ); |
| 3363 | Type *IntTy = |
| 3364 | IntegerType::getIntNTy(C&: V->getContext(), N: DL.getTypeSizeInBits(Ty: SrcElemTy)); |
| 3365 | auto *VecIntTy = VectorType::get(ElementType: IntTy, EC: VF); |
| 3366 | Value *CastVal = Builder.CreateBitOrPointerCast(V, DestTy: VecIntTy); |
| 3367 | return Builder.CreateBitOrPointerCast(V: CastVal, DestTy: DstVTy); |
| 3368 | } |
| 3369 | |
| 3370 | /// Return a vector containing interleaved elements from multiple |
| 3371 | /// smaller input vectors. |
| 3372 | static Value *interleaveVectors(IRBuilderBase &Builder, ArrayRef<Value *> Vals, |
| 3373 | const Twine &Name) { |
| 3374 | unsigned Factor = Vals.size(); |
| 3375 | assert(Factor > 1 && "Tried to interleave invalid number of vectors" ); |
| 3376 | |
| 3377 | VectorType *VecTy = cast<VectorType>(Val: Vals[0]->getType()); |
| 3378 | #ifndef NDEBUG |
| 3379 | for (Value *Val : Vals) |
| 3380 | assert(Val->getType() == VecTy && "Tried to interleave mismatched types" ); |
| 3381 | #endif |
| 3382 | |
| 3383 | // Scalable vectors cannot use arbitrary shufflevectors (only splats), so |
| 3384 | // must use intrinsics to interleave. |
| 3385 | if (VecTy->isScalableTy()) { |
| 3386 | assert(Factor <= 8 && "Unsupported interleave factor for scalable vectors" ); |
| 3387 | VectorType *InterleaveTy = |
| 3388 | VectorType::get(ElementType: VecTy->getElementType(), |
| 3389 | EC: VecTy->getElementCount().multiplyCoefficientBy(RHS: Factor)); |
| 3390 | return Builder.CreateIntrinsic(RetTy: InterleaveTy, |
| 3391 | ID: getInterleaveIntrinsicID(Factor), Args: Vals, |
| 3392 | /*FMFSource=*/nullptr, Name); |
| 3393 | } |
| 3394 | |
| 3395 | // Fixed length. Start by concatenating all vectors into a wide vector. |
| 3396 | Value *WideVec = concatenateVectors(Builder, Vecs: Vals); |
| 3397 | |
| 3398 | // Interleave the elements into the wide vector. |
| 3399 | const unsigned NumElts = VecTy->getElementCount().getFixedValue(); |
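  // E.g. for Factor = 2 and NumElts = 4 the interleave mask is
  // <0, 4, 1, 5, 2, 6, 3, 7>, i.e. (A0, B0, A1, B1, A2, B2, A3, B3).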
| 3400 | return Builder.CreateShuffleVector( |
| 3401 | V: WideVec, Mask: createInterleaveMask(VF: NumElts, NumVecs: Factor), Name); |
| 3402 | } |
| 3403 | |
| 3404 | // Try to vectorize the interleave group that \p Instr belongs to. |
| 3405 | // |
| 3406 | // E.g. Translate following interleaved load group (factor = 3): |
| 3407 | // for (i = 0; i < N; i+=3) { |
| 3408 | // R = Pic[i]; // Member of index 0 |
| 3409 | // G = Pic[i+1]; // Member of index 1 |
| 3410 | // B = Pic[i+2]; // Member of index 2 |
| 3411 | // ... // do something to R, G, B |
| 3412 | // } |
| 3413 | // To: |
| 3414 | // %wide.vec = load <12 x i32> ; Read 4 tuples of R,G,B |
| 3415 | // %R.vec = shuffle %wide.vec, poison, <0, 3, 6, 9> ; R elements |
| 3416 | // %G.vec = shuffle %wide.vec, poison, <1, 4, 7, 10> ; G elements |
| 3417 | // %B.vec = shuffle %wide.vec, poison, <2, 5, 8, 11> ; B elements |
| 3418 | // |
| 3419 | // Or translate following interleaved store group (factor = 3): |
| 3420 | // for (i = 0; i < N; i+=3) { |
| 3421 | // ... do something to R, G, B |
| 3422 | // Pic[i] = R; // Member of index 0 |
| 3423 | // Pic[i+1] = G; // Member of index 1 |
| 3424 | // Pic[i+2] = B; // Member of index 2 |
| 3425 | // } |
| 3426 | // To: |
| 3427 | // %R_G.vec = shuffle %R.vec, %G.vec, <0, 1, 2, ..., 7> |
| 3428 | // %B_U.vec = shuffle %B.vec, poison, <0, 1, 2, 3, u, u, u, u> |
| 3429 | // %interleaved.vec = shuffle %R_G.vec, %B_U.vec, |
| 3430 | // <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11> ; Interleave R,G,B elements |
| 3431 | // store <12 x i32> %interleaved.vec ; Write 4 tuples of R,G,B |
| 3432 | void VPInterleaveRecipe::execute(VPTransformState &State) { |
| 3433 | assert(!State.Lane && "Interleave group being replicated." ); |
| 3434 | const InterleaveGroup<Instruction> *Group = IG; |
| 3435 | Instruction *Instr = Group->getInsertPos(); |
| 3436 | |
| 3437 | // Prepare for the vector type of the interleaved load/store. |
| 3438 | Type *ScalarTy = getLoadStoreType(I: Instr); |
| 3439 | unsigned InterleaveFactor = Group->getFactor(); |
| 3440 | auto *VecTy = VectorType::get(ElementType: ScalarTy, EC: State.VF * InterleaveFactor); |
| 3441 | |
| 3442 | VPValue *BlockInMask = getMask(); |
| 3443 | VPValue *Addr = getAddr(); |
| 3444 | Value *ResAddr = State.get(Def: Addr, Lane: VPLane(0)); |
| 3445 | Value *PoisonVec = PoisonValue::get(T: VecTy); |
| 3446 | |
| 3447 | auto CreateGroupMask = [&BlockInMask, &State, |
| 3448 | &InterleaveFactor](Value *MaskForGaps) -> Value * { |
| 3449 | if (State.VF.isScalable()) { |
| 3450 | assert(!MaskForGaps && "Interleaved groups with gaps are not supported." ); |
| 3451 | assert(InterleaveFactor <= 8 && |
| 3452 | "Unsupported deinterleave factor for scalable vectors" ); |
| 3453 | auto *ResBlockInMask = State.get(Def: BlockInMask); |
| 3454 | SmallVector<Value *> Ops(InterleaveFactor, ResBlockInMask); |
| 3455 | return interleaveVectors(Builder&: State.Builder, Vals: Ops, Name: "interleaved.mask" ); |
| 3456 | } |
| 3457 | |
| 3458 | if (!BlockInMask) |
| 3459 | return MaskForGaps; |
| 3460 | |
| 3461 | Value *ResBlockInMask = State.get(Def: BlockInMask); |
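    // E.g. for InterleaveFactor = 3 and VF = 4 the replicated mask is
    // <0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3>, so each block-mask lane guards all
    // members of its group.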
| 3462 | Value *ShuffledMask = State.Builder.CreateShuffleVector( |
| 3463 | V: ResBlockInMask, |
| 3464 | Mask: createReplicatedMask(ReplicationFactor: InterleaveFactor, VF: State.VF.getFixedValue()), |
| 3465 | Name: "interleaved.mask" ); |
| 3466 | return MaskForGaps ? State.Builder.CreateBinOp(Opc: Instruction::And, |
| 3467 | LHS: ShuffledMask, RHS: MaskForGaps) |
| 3468 | : ShuffledMask; |
| 3469 | }; |
| 3470 | |
| 3471 | const DataLayout &DL = Instr->getDataLayout(); |
| 3472 | // Vectorize the interleaved load group. |
| 3473 | if (isa<LoadInst>(Val: Instr)) { |
| 3474 | Value *MaskForGaps = nullptr; |
| 3475 | if (NeedsMaskForGaps) { |
| 3476 | MaskForGaps = |
| 3477 | createBitMaskForGaps(Builder&: State.Builder, VF: State.VF.getFixedValue(), Group: *Group); |
| 3478 | assert(MaskForGaps && "Mask for Gaps is required but it is null" ); |
| 3479 | } |
| 3480 | |
| 3481 | Instruction *NewLoad; |
| 3482 | if (BlockInMask || MaskForGaps) { |
| 3483 | Value *GroupMask = CreateGroupMask(MaskForGaps); |
| 3484 | NewLoad = State.Builder.CreateMaskedLoad(Ty: VecTy, Ptr: ResAddr, |
| 3485 | Alignment: Group->getAlign(), Mask: GroupMask, |
| 3486 | PassThru: PoisonVec, Name: "wide.masked.vec" ); |
| 3487 | } else |
| 3488 | NewLoad = State.Builder.CreateAlignedLoad(Ty: VecTy, Ptr: ResAddr, |
| 3489 | Align: Group->getAlign(), Name: "wide.vec" ); |
| 3490 | Group->addMetadata(NewInst: NewLoad); |
| 3491 | |
| 3492 | ArrayRef<VPValue *> VPDefs = definedValues(); |
| 3493 | const DataLayout &DL = State.CFG.PrevBB->getDataLayout(); |
| 3494 | if (VecTy->isScalableTy()) { |
| 3495 | // Scalable vectors cannot use arbitrary shufflevectors (only splats), |
| 3496 | // so must use intrinsics to deinterleave. |
| 3497 | assert(InterleaveFactor <= 8 && |
| 3498 | "Unsupported deinterleave factor for scalable vectors" ); |
| 3499 | Value *Deinterleave = State.Builder.CreateIntrinsic( |
| 3500 | ID: getDeinterleaveIntrinsicID(Factor: InterleaveFactor), Types: NewLoad->getType(), |
| 3501 | Args: NewLoad, |
| 3502 | /*FMFSource=*/nullptr, Name: "strided.vec" ); |
| 3503 | |
| 3504 | for (unsigned I = 0, J = 0; I < InterleaveFactor; ++I) { |
| 3505 | Instruction *Member = Group->getMember(Index: I); |
| 3506 | Value *StridedVec = State.Builder.CreateExtractValue(Agg: Deinterleave, Idxs: I); |
| 3507 | if (!Member) { |
| 3508 | // This value is not needed as it's not used |
| 3509 | cast<Instruction>(Val: StridedVec)->eraseFromParent(); |
| 3510 | continue; |
| 3511 | } |
        // If this member has a different type, cast the extracted value to
        // that type.
| 3513 | if (Member->getType() != ScalarTy) { |
| 3514 | VectorType *OtherVTy = VectorType::get(ElementType: Member->getType(), EC: State.VF); |
| 3515 | StridedVec = |
| 3516 | createBitOrPointerCast(Builder&: State.Builder, V: StridedVec, DstVTy: OtherVTy, DL); |
| 3517 | } |
| 3518 | |
| 3519 | if (Group->isReverse()) |
| 3520 | StridedVec = State.Builder.CreateVectorReverse(V: StridedVec, Name: "reverse" ); |
| 3521 | |
| 3522 | State.set(Def: VPDefs[J], V: StridedVec); |
| 3523 | ++J; |
| 3524 | } |
| 3525 | |
| 3526 | return; |
| 3527 | } |
| 3528 | assert(!State.VF.isScalable() && "VF is assumed to be non scalable." ); |
| 3529 | |
| 3530 | // For each member in the group, shuffle out the appropriate data from the |
| 3531 | // wide loads. |
| 3532 | unsigned J = 0; |
| 3533 | for (unsigned I = 0; I < InterleaveFactor; ++I) { |
| 3534 | Instruction *Member = Group->getMember(Index: I); |
| 3535 | |
| 3536 | // Skip the gaps in the group. |
| 3537 | if (!Member) |
| 3538 | continue; |
| 3539 | |
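      // E.g. for I = 1, InterleaveFactor = 3 and VF = 4 the stride mask is
      // <1, 4, 7, 10>, extracting this member's elements from the wide load.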
| 3540 | auto StrideMask = |
| 3541 | createStrideMask(Start: I, Stride: InterleaveFactor, VF: State.VF.getFixedValue()); |
| 3542 | Value *StridedVec = |
| 3543 | State.Builder.CreateShuffleVector(V: NewLoad, Mask: StrideMask, Name: "strided.vec" ); |
| 3544 | |
      // If this member has a different type, cast the extracted value to that
      // type.
| 3546 | if (Member->getType() != ScalarTy) { |
| 3547 | VectorType *OtherVTy = VectorType::get(ElementType: Member->getType(), EC: State.VF); |
| 3548 | StridedVec = |
| 3549 | createBitOrPointerCast(Builder&: State.Builder, V: StridedVec, DstVTy: OtherVTy, DL); |
| 3550 | } |
| 3551 | |
| 3552 | if (Group->isReverse()) |
| 3553 | StridedVec = State.Builder.CreateVectorReverse(V: StridedVec, Name: "reverse" ); |
| 3554 | |
| 3555 | State.set(Def: VPDefs[J], V: StridedVec); |
| 3556 | ++J; |
| 3557 | } |
| 3558 | return; |
| 3559 | } |
| 3560 | |
  // The sub-vector type for the current instruction.
| 3562 | auto *SubVT = VectorType::get(ElementType: ScalarTy, EC: State.VF); |
| 3563 | |
| 3564 | // Vectorize the interleaved store group. |
| 3565 | Value *MaskForGaps = |
| 3566 | createBitMaskForGaps(Builder&: State.Builder, VF: State.VF.getKnownMinValue(), Group: *Group); |
| 3567 | assert((!MaskForGaps || !State.VF.isScalable()) && |
| 3568 | "masking gaps for scalable vectors is not yet supported." ); |
| 3569 | ArrayRef<VPValue *> StoredValues = getStoredValues(); |
| 3570 | // Collect the stored vector from each member. |
| 3571 | SmallVector<Value *, 4> StoredVecs; |
| 3572 | unsigned StoredIdx = 0; |
| 3573 | for (unsigned i = 0; i < InterleaveFactor; i++) { |
| 3574 | assert((Group->getMember(i) || MaskForGaps) && |
| 3575 | "Fail to get a member from an interleaved store group" ); |
| 3576 | Instruction *Member = Group->getMember(Index: i); |
| 3577 | |
| 3578 | // Skip the gaps in the group. |
| 3579 | if (!Member) { |
| 3580 | Value *Undef = PoisonValue::get(T: SubVT); |
| 3581 | StoredVecs.push_back(Elt: Undef); |
| 3582 | continue; |
| 3583 | } |
| 3584 | |
| 3585 | Value *StoredVec = State.get(Def: StoredValues[StoredIdx]); |
| 3586 | ++StoredIdx; |
| 3587 | |
| 3588 | if (Group->isReverse()) |
| 3589 | StoredVec = State.Builder.CreateVectorReverse(V: StoredVec, Name: "reverse" ); |
| 3590 | |
    // If this member has a different type, cast it to the unified sub-vector
    // type.
| 3592 | |
| 3593 | if (StoredVec->getType() != SubVT) |
| 3594 | StoredVec = createBitOrPointerCast(Builder&: State.Builder, V: StoredVec, DstVTy: SubVT, DL); |
| 3595 | |
| 3596 | StoredVecs.push_back(Elt: StoredVec); |
| 3597 | } |
| 3598 | |
| 3599 | // Interleave all the smaller vectors into one wider vector. |
| 3600 | Value *IVec = interleaveVectors(Builder&: State.Builder, Vals: StoredVecs, Name: "interleaved.vec" ); |
| 3601 | Instruction *NewStoreInstr; |
| 3602 | if (BlockInMask || MaskForGaps) { |
| 3603 | Value *GroupMask = CreateGroupMask(MaskForGaps); |
| 3604 | NewStoreInstr = State.Builder.CreateMaskedStore( |
| 3605 | Val: IVec, Ptr: ResAddr, Alignment: Group->getAlign(), Mask: GroupMask); |
| 3606 | } else |
| 3607 | NewStoreInstr = |
| 3608 | State.Builder.CreateAlignedStore(Val: IVec, Ptr: ResAddr, Align: Group->getAlign()); |
| 3609 | |
| 3610 | Group->addMetadata(NewInst: NewStoreInstr); |
| 3611 | } |
| 3612 | |
| 3613 | #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) |
| 3614 | void VPInterleaveRecipe::print(raw_ostream &O, const Twine &Indent, |
| 3615 | VPSlotTracker &SlotTracker) const { |
| 3616 | O << Indent << "INTERLEAVE-GROUP with factor " << IG->getFactor() << " at " ; |
| 3617 | IG->getInsertPos()->printAsOperand(O, false); |
| 3618 | O << ", " ; |
| 3619 | getAddr()->printAsOperand(O, SlotTracker); |
| 3620 | VPValue *Mask = getMask(); |
| 3621 | if (Mask) { |
| 3622 | O << ", " ; |
| 3623 | Mask->printAsOperand(O, SlotTracker); |
| 3624 | } |
| 3625 | |
| 3626 | unsigned OpIdx = 0; |
| 3627 | for (unsigned i = 0; i < IG->getFactor(); ++i) { |
| 3628 | if (!IG->getMember(i)) |
| 3629 | continue; |
| 3630 | if (getNumStoreOperands() > 0) { |
| 3631 | O << "\n" << Indent << " store " ; |
| 3632 | getOperand(1 + OpIdx)->printAsOperand(O, SlotTracker); |
| 3633 | O << " to index " << i; |
| 3634 | } else { |
| 3635 | O << "\n" << Indent << " " ; |
| 3636 | getVPValue(OpIdx)->printAsOperand(O, SlotTracker); |
| 3637 | O << " = load from index " << i; |
| 3638 | } |
| 3639 | ++OpIdx; |
| 3640 | } |
| 3641 | } |
| 3642 | #endif |
| 3643 | |
| 3644 | InstructionCost VPInterleaveRecipe::computeCost(ElementCount VF, |
| 3645 | VPCostContext &Ctx) const { |
| 3646 | Instruction *InsertPos = getInsertPos(); |
| 3647 | // Find the VPValue index of the interleave group. We need to skip gaps. |
| 3648 | unsigned InsertPosIdx = 0; |
  for (unsigned Idx = 0; Idx < IG->getFactor(); ++Idx)
| 3650 | if (auto *Member = IG->getMember(Index: Idx)) { |
| 3651 | if (Member == InsertPos) |
| 3652 | break; |
| 3653 | InsertPosIdx++; |
| 3654 | } |
| 3655 | Type *ValTy = Ctx.Types.inferScalarType( |
| 3656 | V: getNumDefinedValues() > 0 ? getVPValue(I: InsertPosIdx) |
| 3657 | : getStoredValues()[InsertPosIdx]); |
| 3658 | auto *VectorTy = cast<VectorType>(Val: toVectorTy(Scalar: ValTy, EC: VF)); |
| 3659 | unsigned AS = getLoadStoreAddressSpace(I: InsertPos); |
| 3660 | |
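|      | // The memory cost is computed for a single wide access covering the whole
|      | // group, i.e. a vector with VF * InterleaveFactor elements.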
| 3661 | unsigned InterleaveFactor = IG->getFactor(); |
| 3662 | auto *WideVecTy = VectorType::get(ElementType: ValTy, EC: VF * InterleaveFactor); |
| 3663 | |
| 3664 | // Holds the indices of existing members in the interleaved group. |
| 3665 | SmallVector<unsigned, 4> Indices; |
| 3666 | for (unsigned IF = 0; IF < InterleaveFactor; IF++) |
| 3667 | if (IG->getMember(Index: IF)) |
| 3668 | Indices.push_back(Elt: IF); |
| 3669 | |
| 3670 | // Calculate the cost of the whole interleaved group. |
| 3671 | InstructionCost Cost = Ctx.TTI.getInterleavedMemoryOpCost( |
| 3672 | Opcode: InsertPos->getOpcode(), VecTy: WideVecTy, Factor: IG->getFactor(), Indices, |
| 3673 | Alignment: IG->getAlign(), AddressSpace: AS, CostKind: Ctx.CostKind, UseMaskForCond: getMask(), UseMaskForGaps: NeedsMaskForGaps); |
| 3674 | |
| 3675 | if (!IG->isReverse()) |
| 3676 | return Cost; |
| 3677 | |
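|      | // Reversed groups additionally pay for one reverse shuffle per member to
|      | // restore the original lane order.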
| 3678 | return Cost + IG->getNumMembers() * |
| 3679 | Ctx.TTI.getShuffleCost(Kind: TargetTransformInfo::SK_Reverse, |
| 3680 | DstTy: VectorTy, SrcTy: VectorTy, Mask: {}, CostKind: Ctx.CostKind, |
| 3681 | Index: 0); |
| 3682 | } |
| 3683 | |
| 3684 | #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) |
| 3685 | void VPCanonicalIVPHIRecipe::print(raw_ostream &O, const Twine &Indent, |
| 3686 | VPSlotTracker &SlotTracker) const { |
| 3687 | O << Indent << "EMIT " ; |
| 3688 | printAsOperand(O, SlotTracker); |
| 3689 | O << " = CANONICAL-INDUCTION " ; |
| 3690 | printOperands(O, SlotTracker); |
| 3691 | } |
| 3692 | #endif |
| 3693 | |
| 3694 | bool VPWidenPointerInductionRecipe::onlyScalarsGenerated(bool IsScalable) { |
| 3695 | return IsScalarAfterVectorization && |
| 3696 | (!IsScalable || vputils::onlyFirstLaneUsed(Def: this)); |
| 3697 | } |
| 3698 | |
| 3699 | void VPWidenPointerInductionRecipe::execute(VPTransformState &State) { |
| 3700 | assert(getInductionDescriptor().getKind() == |
| 3701 | InductionDescriptor::IK_PtrInduction && |
| 3702 | "Not a pointer induction according to InductionDescriptor!" ); |
| 3703 | assert(State.TypeAnalysis.inferScalarType(this)->isPointerTy() && |
| 3704 | "Unexpected type." ); |
| 3705 | assert(!onlyScalarsGenerated(State.VF.isScalable()) && |
| 3706 | "Recipe should have been replaced" ); |
| 3707 | |
| 3708 | unsigned CurrentPart = getUnrollPart(U&: *this); |
| 3709 | |
| 3710 | // Build a pointer phi |
| 3711 | Value *ScalarStartValue = getStartValue()->getLiveInIRValue(); |
| 3712 | Type *ScStValueType = ScalarStartValue->getType(); |
| 3713 | |
| 3714 | BasicBlock *VectorPH = |
| 3715 | State.CFG.VPBB2IRBB.at(Val: getParent()->getCFGPredecessor(Idx: 0)); |
| 3716 | PHINode *NewPointerPhi = nullptr; |
| 3717 | if (CurrentPart == 0) { |
| 3718 | IRBuilder<>::InsertPointGuard Guard(State.Builder); |
| 3719 | if (State.Builder.GetInsertPoint() != |
| 3720 | State.Builder.GetInsertBlock()->getFirstNonPHIIt()) |
| 3721 | State.Builder.SetInsertPoint( |
| 3722 | State.Builder.GetInsertBlock()->getFirstNonPHIIt()); |
| 3723 | NewPointerPhi = State.Builder.CreatePHI(Ty: ScStValueType, NumReservedValues: 2, Name: "pointer.phi" ); |
| 3724 | NewPointerPhi->addIncoming(V: ScalarStartValue, BB: VectorPH); |
| 3725 | NewPointerPhi->setDebugLoc(getDebugLoc()); |
| 3726 | } else { |
| 3727 | // The recipe has been unrolled. In that case, fetch the single pointer phi |
| 3728 | // shared among all unrolled parts of the recipe. |
| 3729 | auto *GEP = |
| 3730 | cast<GetElementPtrInst>(Val: State.get(Def: getFirstUnrolledPartOperand())); |
| 3731 | NewPointerPhi = cast<PHINode>(Val: GEP->getPointerOperand()); |
| 3732 | } |
| 3733 | |
| 3734 | // The pointer induction itself is performed by using a GEP.
| 3735 | BasicBlock::iterator InductionLoc = State.Builder.GetInsertPoint(); |
| 3736 | Value *ScalarStepValue = State.get(Def: getStepValue(), Lane: VPLane(0)); |
| 3737 | Type *PhiType = State.TypeAnalysis.inferScalarType(V: getStepValue()); |
| 3738 | Value *RuntimeVF = getRuntimeVF(B&: State.Builder, Ty: PhiType, VF: State.VF); |
| 3739 | // Add induction update using an incorrect block temporarily. The phi node |
| 3740 | // will be fixed after VPlan execution. Note that at this point the latch |
| 3741 | // block cannot be used, as it does not exist yet. |
| 3742 | // TODO: Model increment value in VPlan, by turning the recipe into a |
| 3743 | // multi-def and a subclass of VPHeaderPHIRecipe. |
| 3744 | if (CurrentPart == 0) { |
| 3745 | // The recipe represents the first part of the pointer induction. Create the |
| 3746 | // GEP to increment the phi across all unrolled parts. |
| 3747 | Value *NumUnrolledElems = State.get(Def: getOperand(N: 2), IsScalar: true); |
| 3748 | |
| 3749 | Value *InductionGEP = GetElementPtrInst::Create( |
| 3750 | PointeeType: State.Builder.getInt8Ty(), Ptr: NewPointerPhi, |
| 3751 | IdxList: State.Builder.CreateMul( |
| 3752 | LHS: ScalarStepValue, |
| 3753 | RHS: State.Builder.CreateTrunc(V: NumUnrolledElems, DestTy: PhiType)), |
| 3754 | NameStr: "ptr.ind" , InsertBefore: InductionLoc); |
| 3755 | |
| 3756 | NewPointerPhi->addIncoming(V: InductionGEP, BB: VectorPH); |
| 3757 | } |
| 3758 | |
| 3759 | // Create actual address geps that use the pointer phi as base and a |
| 3760 | // vectorized version of the step value (<step*0, ..., step*N>) as offset. |
| 3761 | Type *VecPhiType = VectorType::get(ElementType: PhiType, EC: State.VF); |
| 3762 | Value *StartOffsetScalar = State.Builder.CreateMul( |
| 3763 | LHS: RuntimeVF, RHS: ConstantInt::get(Ty: PhiType, V: CurrentPart)); |
| 3764 | Value *StartOffset = |
| 3765 | State.Builder.CreateVectorSplat(EC: State.VF, V: StartOffsetScalar); |
| 3766 | // Create a vector of consecutive numbers from 0 to VF - 1.
| 3767 | StartOffset = State.Builder.CreateAdd( |
| 3768 | LHS: StartOffset, RHS: State.Builder.CreateStepVector(DstType: VecPhiType)); |
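|      | // For unroll part P the offsets are now <P*VF+0, P*VF+1, ..., P*VF+(VF-1)>;
|      | // they are scaled by the scalar step below to form per-lane pointer offsets.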
| 3769 | |
| 3770 | assert(ScalarStepValue == State.get(getOperand(1), VPLane(0)) && |
| 3771 | "scalar step must be the same across all parts" ); |
| 3772 | Value *GEP = State.Builder.CreateGEP( |
| 3773 | Ty: State.Builder.getInt8Ty(), Ptr: NewPointerPhi, |
| 3774 | IdxList: State.Builder.CreateMul(LHS: StartOffset, RHS: State.Builder.CreateVectorSplat( |
| 3775 | EC: State.VF, V: ScalarStepValue)), |
| 3776 | Name: "vector.gep" ); |
| 3777 | State.set(Def: this, V: GEP); |
| 3778 | } |
| 3779 | |
| 3780 | #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) |
| 3781 | void VPWidenPointerInductionRecipe::print(raw_ostream &O, const Twine &Indent, |
| 3782 | VPSlotTracker &SlotTracker) const { |
| 3783 | assert((getNumOperands() == 3 || getNumOperands() == 5) && |
| 3784 | "unexpected number of operands" ); |
| 3785 | O << Indent << "EMIT " ; |
| 3786 | printAsOperand(O, SlotTracker); |
| 3787 | O << " = WIDEN-POINTER-INDUCTION " ; |
| 3788 | getStartValue()->printAsOperand(O, SlotTracker); |
| 3789 | O << ", " ; |
| 3790 | getStepValue()->printAsOperand(O, SlotTracker); |
| 3791 | O << ", " ; |
| 3792 | getOperand(2)->printAsOperand(O, SlotTracker); |
| 3793 | if (getNumOperands() == 5) { |
| 3794 | O << ", " ; |
| 3795 | getOperand(3)->printAsOperand(O, SlotTracker); |
| 3796 | O << ", " ; |
| 3797 | getOperand(4)->printAsOperand(O, SlotTracker); |
| 3798 | } |
| 3799 | } |
| 3800 | #endif |
| 3801 | |
| 3802 | void VPExpandSCEVRecipe::execute(VPTransformState &State) { |
| 3803 | assert(!State.Lane && "cannot be used in per-lane execution" );
| 3804 | const DataLayout &DL = SE.getDataLayout(); |
| 3805 | SCEVExpander Exp(SE, DL, "induction" , /*PreserveLCSSA=*/true); |
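|      | // Expand the SCEV at the current insert point. The result is a single
|      | // scalar, so only lane 0 of this recipe is defined.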
| 3806 | Value *Res = Exp.expandCodeFor(SH: Expr, Ty: Expr->getType(), |
| 3807 | I: &*State.Builder.GetInsertPoint()); |
| 3808 | State.set(Def: this, V: Res, Lane: VPLane(0)); |
| 3809 | } |
| 3810 | |
| 3811 | #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) |
| 3812 | void VPExpandSCEVRecipe::print(raw_ostream &O, const Twine &Indent, |
| 3813 | VPSlotTracker &SlotTracker) const { |
| 3814 | O << Indent << "EMIT " ; |
| 3815 | printAsOperand(O, SlotTracker); |
| 3816 | O << " = EXPAND SCEV " << *Expr; |
| 3817 | } |
| 3818 | #endif |
| 3819 | |
| 3820 | void VPWidenCanonicalIVRecipe::execute(VPTransformState &State) { |
| 3821 | Value *CanonicalIV = State.get(Def: getOperand(N: 0), /*IsScalar*/ true); |
| 3822 | Type *STy = CanonicalIV->getType(); |
| 3823 | IRBuilder<> Builder(State.CFG.PrevBB->getTerminator()); |
| 3824 | ElementCount VF = State.VF; |
| 3825 | Value *VStart = VF.isScalar() |
| 3826 | ? CanonicalIV |
| 3827 | : Builder.CreateVectorSplat(EC: VF, V: CanonicalIV, Name: "broadcast" ); |
| 3828 | Value *VStep = createStepForVF(B&: Builder, Ty: STy, VF, Step: getUnrollPart(U&: *this)); |
| 3829 | if (VF.isVector()) { |
| 3830 | VStep = Builder.CreateVectorSplat(EC: VF, V: VStep); |
| 3831 | VStep = |
| 3832 | Builder.CreateAdd(LHS: VStep, RHS: Builder.CreateStepVector(DstType: VStep->getType())); |
| 3833 | } |
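|      | // E.g., for VF 4 and unroll part 1 this yields <IV+4, IV+5, IV+6, IV+7>.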
| 3834 | Value *CanonicalVectorIV = Builder.CreateAdd(LHS: VStart, RHS: VStep, Name: "vec.iv" ); |
| 3835 | State.set(Def: this, V: CanonicalVectorIV); |
| 3836 | } |
| 3837 | |
| 3838 | #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) |
| 3839 | void VPWidenCanonicalIVRecipe::print(raw_ostream &O, const Twine &Indent, |
| 3840 | VPSlotTracker &SlotTracker) const { |
| 3841 | O << Indent << "EMIT " ; |
| 3842 | printAsOperand(O, SlotTracker); |
| 3843 | O << " = WIDEN-CANONICAL-INDUCTION " ; |
| 3844 | printOperands(O, SlotTracker); |
| 3845 | } |
| 3846 | #endif |
| 3847 | |
| 3848 | void VPFirstOrderRecurrencePHIRecipe::execute(VPTransformState &State) { |
| 3849 | auto &Builder = State.Builder; |
| 3850 | // Create a vector from the initial value. |
| 3851 | auto *VectorInit = getStartValue()->getLiveInIRValue(); |
| 3852 | |
| 3853 | Type *VecTy = State.VF.isScalar() |
| 3854 | ? VectorInit->getType() |
| 3855 | : VectorType::get(ElementType: VectorInit->getType(), EC: State.VF); |
| 3856 | |
| 3857 | BasicBlock *VectorPH = |
| 3858 | State.CFG.VPBB2IRBB.at(Val: getParent()->getCFGPredecessor(Idx: 0)); |
| 3859 | if (State.VF.isVector()) { |
| 3860 | auto *IdxTy = Builder.getInt32Ty(); |
| 3861 | auto *One = ConstantInt::get(Ty: IdxTy, V: 1); |
| 3862 | IRBuilder<>::InsertPointGuard Guard(Builder); |
| 3863 | Builder.SetInsertPoint(VectorPH->getTerminator()); |
| 3864 | auto *RuntimeVF = getRuntimeVF(B&: Builder, Ty: IdxTy, VF: State.VF); |
| 3865 | auto *LastIdx = Builder.CreateSub(LHS: RuntimeVF, RHS: One); |
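|      | // Only the last lane of the initial vector is used by the recurrence in the
|      | // first vector iteration; the remaining lanes can stay poison.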
| 3866 | VectorInit = Builder.CreateInsertElement( |
| 3867 | Vec: PoisonValue::get(T: VecTy), NewElt: VectorInit, Idx: LastIdx, Name: "vector.recur.init" ); |
| 3868 | } |
| 3869 | |
| 3870 | // Create a phi node for the new recurrence. |
| 3871 | PHINode *Phi = PHINode::Create(Ty: VecTy, NumReservedValues: 2, NameStr: "vector.recur" ); |
| 3872 | Phi->insertBefore(InsertPos: State.CFG.PrevBB->getFirstInsertionPt()); |
| 3873 | Phi->addIncoming(V: VectorInit, BB: VectorPH); |
| 3874 | State.set(Def: this, V: Phi); |
| 3875 | } |
| 3876 | |
| 3877 | InstructionCost |
| 3878 | VPFirstOrderRecurrencePHIRecipe::computeCost(ElementCount VF, |
| 3879 | VPCostContext &Ctx) const { |
| 3880 | if (VF.isScalar()) |
| 3881 | return Ctx.TTI.getCFInstrCost(Opcode: Instruction::PHI, CostKind: Ctx.CostKind); |
| 3882 | |
| 3883 | return 0; |
| 3884 | } |
| 3885 | |
| 3886 | #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) |
| 3887 | void VPFirstOrderRecurrencePHIRecipe::print(raw_ostream &O, const Twine &Indent, |
| 3888 | VPSlotTracker &SlotTracker) const { |
| 3889 | O << Indent << "FIRST-ORDER-RECURRENCE-PHI " ; |
| 3890 | printAsOperand(O, SlotTracker); |
| 3891 | O << " = phi " ; |
| 3892 | printOperands(O, SlotTracker); |
| 3893 | } |
| 3894 | #endif |
| 3895 | |
| 3896 | void VPReductionPHIRecipe::execute(VPTransformState &State) { |
| 3897 | // Reductions do not have to start at zero. They can start with |
| 3898 | // any loop-invariant value.
| 3899 | VPValue *StartVPV = getStartValue(); |
| 3900 | |
| 3901 | // In order to support recurrences we need to be able to vectorize Phi nodes. |
| 3902 | // Phi nodes have cycles, so we need to vectorize them in two stages. This is |
| 3903 | // stage #1: We create a new vector PHI node with no incoming edges. We'll use |
| 3904 | // this value when we vectorize all of the instructions that use the PHI. |
| 3905 | BasicBlock *VectorPH = |
| 3906 | State.CFG.VPBB2IRBB.at(Val: getParent()->getCFGPredecessor(Idx: 0)); |
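|      | // In-loop reductions and scalar VFs keep a scalar accumulator phi; otherwise
|      | // a vector phi holding one partial reduction value per lane is created.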
| 3907 | bool ScalarPHI = State.VF.isScalar() || IsInLoop; |
| 3908 | Value *StartV = State.get(Def: StartVPV, IsScalar: ScalarPHI); |
| 3909 | Type *VecTy = StartV->getType(); |
| 3910 | |
| 3911 | BasicBlock *HeaderBB = State.CFG.PrevBB;
| 3912 | assert(State.CurrentParentLoop->getHeader() == HeaderBB && |
| 3913 | "recipe must be in the vector loop header" ); |
| 3914 | auto *Phi = PHINode::Create(Ty: VecTy, NumReservedValues: 2, NameStr: "vec.phi" ); |
| 3915 | Phi->insertBefore(InsertPos: HeaderBB->getFirstInsertionPt()); |
| 3916 | State.set(Def: this, V: Phi, IsScalar: IsInLoop); |
| 3917 | |
| 3918 | Phi->addIncoming(V: StartV, BB: VectorPH); |
| 3919 | } |
| 3920 | |
| 3921 | #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) |
| 3922 | void VPReductionPHIRecipe::print(raw_ostream &O, const Twine &Indent, |
| 3923 | VPSlotTracker &SlotTracker) const { |
| 3924 | O << Indent << "WIDEN-REDUCTION-PHI " ; |
| 3925 | |
| 3926 | printAsOperand(O, SlotTracker); |
| 3927 | O << " = phi " ; |
| 3928 | printOperands(O, SlotTracker); |
| 3929 | if (VFScaleFactor != 1) |
| 3930 | O << " (VF scaled by 1/" << VFScaleFactor << ")" ; |
| 3931 | } |
| 3932 | #endif |
| 3933 | |
| 3934 | void VPWidenPHIRecipe::execute(VPTransformState &State) { |
| 3935 | Value *Op0 = State.get(Def: getOperand(N: 0)); |
| 3936 | Type *VecTy = Op0->getType(); |
| 3937 | Instruction *VecPhi = State.Builder.CreatePHI(Ty: VecTy, NumReservedValues: 2, Name); |
| 3938 | // Manually move it with the other PHIs in case PHI recipes above this one |
| 3939 | // also inserted non-phi instructions. |
| 3940 | // TODO: Remove once VPWidenPointerInductionRecipe is also expanded in |
| 3941 | // convertToConcreteRecipes. |
| 3942 | VecPhi->moveBefore(InsertPos: State.Builder.GetInsertBlock()->getFirstNonPHIIt()); |
| 3943 | State.set(Def: this, V: VecPhi); |
| 3944 | } |
| 3945 | |
| 3946 | #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) |
| 3947 | void VPWidenPHIRecipe::print(raw_ostream &O, const Twine &Indent, |
| 3948 | VPSlotTracker &SlotTracker) const { |
| 3949 | O << Indent << "WIDEN-PHI " ; |
| 3950 | |
| 3951 | printAsOperand(O, SlotTracker); |
| 3952 | O << " = phi " ; |
| 3953 | printPhiOperands(O, SlotTracker); |
| 3954 | } |
| 3955 | #endif |
| 3956 | |
| 3957 | // TODO: It would be good to use the existing VPWidenPHIRecipe instead and |
| 3958 | // remove VPActiveLaneMaskPHIRecipe. |
| 3959 | void VPActiveLaneMaskPHIRecipe::execute(VPTransformState &State) { |
| 3960 | BasicBlock *VectorPH = |
| 3961 | State.CFG.VPBB2IRBB.at(Val: getParent()->getCFGPredecessor(Idx: 0)); |
| 3962 | Value *StartMask = State.get(Def: getOperand(N: 0)); |
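|      | // Only the incoming value from the preheader is added here; the value coming
|      | // from the latch is filled in once the mask for the next iteration exists.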
| 3963 | PHINode *Phi = |
| 3964 | State.Builder.CreatePHI(Ty: StartMask->getType(), NumReservedValues: 2, Name: "active.lane.mask" ); |
| 3965 | Phi->addIncoming(V: StartMask, BB: VectorPH); |
| 3966 | State.set(Def: this, V: Phi); |
| 3967 | } |
| 3968 | |
| 3969 | #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) |
| 3970 | void VPActiveLaneMaskPHIRecipe::print(raw_ostream &O, const Twine &Indent, |
| 3971 | VPSlotTracker &SlotTracker) const { |
| 3972 | O << Indent << "ACTIVE-LANE-MASK-PHI " ; |
| 3973 | |
| 3974 | printAsOperand(O, SlotTracker); |
| 3975 | O << " = phi " ; |
| 3976 | printOperands(O, SlotTracker); |
| 3977 | } |
| 3978 | #endif |
| 3979 | |
| 3980 | #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) |
| 3981 | void VPEVLBasedIVPHIRecipe::print(raw_ostream &O, const Twine &Indent, |
| 3982 | VPSlotTracker &SlotTracker) const { |
| 3983 | O << Indent << "EXPLICIT-VECTOR-LENGTH-BASED-IV-PHI " ; |
| 3984 | |
| 3985 | printAsOperand(O, SlotTracker); |
| 3986 | O << " = phi " ; |
| 3987 | printOperands(O, SlotTracker); |
| 3988 | } |
| 3989 | #endif |
| 3990 | |