| 1 | //===- VPlanRecipes.cpp - Implementations for VPlan recipes ---------------===// |
| 2 | // |
| 3 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
| 4 | // See https://llvm.org/LICENSE.txt for license information. |
| 5 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
| 6 | // |
| 7 | //===----------------------------------------------------------------------===// |
| 8 | /// |
| 9 | /// \file |
| 10 | /// This file contains implementations for different VPlan recipes. |
| 11 | /// |
| 12 | //===----------------------------------------------------------------------===// |
| 13 | |
| 14 | #include "LoopVectorizationPlanner.h" |
| 15 | #include "VPlan.h" |
| 16 | #include "VPlanAnalysis.h" |
| 17 | #include "VPlanHelpers.h" |
| 18 | #include "VPlanPatternMatch.h" |
| 19 | #include "VPlanUtils.h" |
| 20 | #include "llvm/ADT/STLExtras.h" |
| 21 | #include "llvm/ADT/SmallVector.h" |
| 22 | #include "llvm/ADT/Twine.h" |
| 23 | #include "llvm/Analysis/AssumptionCache.h" |
| 24 | #include "llvm/Analysis/IVDescriptors.h" |
| 25 | #include "llvm/Analysis/LoopInfo.h" |
| 26 | #include "llvm/Analysis/ScalarEvolutionExpressions.h" |
| 27 | #include "llvm/IR/BasicBlock.h" |
| 28 | #include "llvm/IR/IRBuilder.h" |
| 29 | #include "llvm/IR/Instruction.h" |
| 30 | #include "llvm/IR/Instructions.h" |
| 31 | #include "llvm/IR/Intrinsics.h" |
| 32 | #include "llvm/IR/Type.h" |
| 33 | #include "llvm/IR/Value.h" |
| 34 | #include "llvm/Support/Casting.h" |
| 35 | #include "llvm/Support/CommandLine.h" |
| 36 | #include "llvm/Support/Debug.h" |
| 37 | #include "llvm/Support/raw_ostream.h" |
| 38 | #include "llvm/Transforms/Utils/BasicBlockUtils.h" |
| 39 | #include "llvm/Transforms/Utils/LoopUtils.h" |
| 40 | #include <cassert> |
| 41 | |
| 42 | using namespace llvm; |
| 43 | using namespace llvm::VPlanPatternMatch; |
| 44 | |
| 45 | using VectorParts = SmallVector<Value *, 2>; |
| 46 | |
| 47 | #define LV_NAME "loop-vectorize" |
| 48 | #define DEBUG_TYPE LV_NAME |
| 49 | |
| 50 | bool VPRecipeBase::mayWriteToMemory() const { |
| 51 | switch (getVPRecipeID()) { |
| 52 | case VPExpressionSC: |
| 53 | return cast<VPExpressionRecipe>(Val: this)->mayReadOrWriteMemory(); |
| 54 | case VPInstructionSC: { |
| 55 | auto *VPI = cast<VPInstruction>(Val: this); |
| 56 | // Loads read from memory but don't write to memory. |
| 57 | if (VPI->getOpcode() == Instruction::Load) |
| 58 | return false; |
| 59 | return VPI->opcodeMayReadOrWriteFromMemory(); |
| 60 | } |
| 61 | case VPInterleaveEVLSC: |
| 62 | case VPInterleaveSC: |
| 63 | return cast<VPInterleaveBase>(Val: this)->getNumStoreOperands() > 0; |
| 64 | case VPWidenStoreEVLSC: |
| 65 | case VPWidenStoreSC: |
| 66 | return true; |
| 67 | case VPReplicateSC: |
| 68 | return cast<Instruction>(Val: getVPSingleValue()->getUnderlyingValue()) |
| 69 | ->mayWriteToMemory(); |
| 70 | case VPWidenCallSC: |
| 71 | return !cast<VPWidenCallRecipe>(Val: this) |
| 72 | ->getCalledScalarFunction() |
| 73 | ->onlyReadsMemory(); |
| 74 | case VPWidenIntrinsicSC: |
| 75 | return cast<VPWidenIntrinsicRecipe>(Val: this)->mayWriteToMemory(); |
| 76 | case VPActiveLaneMaskPHISC: |
| 77 | case VPCanonicalIVPHISC: |
| 78 | case VPBranchOnMaskSC: |
| 79 | case VPDerivedIVSC: |
| 80 | case VPFirstOrderRecurrencePHISC: |
| 81 | case VPReductionPHISC: |
| 82 | case VPScalarIVStepsSC: |
| 83 | case VPPredInstPHISC: |
| 84 | return false; |
| 85 | case VPBlendSC: |
| 86 | case VPReductionEVLSC: |
| 87 | case VPReductionSC: |
| 88 | case VPVectorPointerSC: |
| 89 | case VPWidenCanonicalIVSC: |
| 90 | case VPWidenCastSC: |
| 91 | case VPWidenGEPSC: |
| 92 | case VPWidenIntOrFpInductionSC: |
| 93 | case VPWidenLoadEVLSC: |
| 94 | case VPWidenLoadSC: |
| 95 | case VPWidenPHISC: |
| 96 | case VPWidenPointerInductionSC: |
| 97 | case VPWidenSC: { |
| 98 | const Instruction *I = |
| 99 | dyn_cast_or_null<Instruction>(Val: getVPSingleValue()->getUnderlyingValue()); |
| 100 | (void)I; |
| 101 | assert((!I || !I->mayWriteToMemory()) && |
| 102 | "underlying instruction may write to memory" ); |
| 103 | return false; |
| 104 | } |
| 105 | default: |
| 106 | return true; |
| 107 | } |
| 108 | } |
| 109 | |
| 110 | bool VPRecipeBase::mayReadFromMemory() const { |
| 111 | switch (getVPRecipeID()) { |
| 112 | case VPExpressionSC: |
| 113 | return cast<VPExpressionRecipe>(Val: this)->mayReadOrWriteMemory(); |
| 114 | case VPInstructionSC: |
| 115 | return cast<VPInstruction>(Val: this)->opcodeMayReadOrWriteFromMemory(); |
| 116 | case VPWidenLoadEVLSC: |
| 117 | case VPWidenLoadSC: |
| 118 | return true; |
| 119 | case VPReplicateSC: |
| 120 | return cast<Instruction>(Val: getVPSingleValue()->getUnderlyingValue()) |
| 121 | ->mayReadFromMemory(); |
| 122 | case VPWidenCallSC: |
| 123 | return !cast<VPWidenCallRecipe>(Val: this) |
| 124 | ->getCalledScalarFunction() |
| 125 | ->onlyWritesMemory(); |
| 126 | case VPWidenIntrinsicSC: |
| 127 | return cast<VPWidenIntrinsicRecipe>(Val: this)->mayReadFromMemory(); |
| 128 | case VPBranchOnMaskSC: |
| 129 | case VPDerivedIVSC: |
| 130 | case VPFirstOrderRecurrencePHISC: |
| 131 | case VPReductionPHISC: |
| 132 | case VPPredInstPHISC: |
| 133 | case VPScalarIVStepsSC: |
| 134 | case VPWidenStoreEVLSC: |
| 135 | case VPWidenStoreSC: |
| 136 | return false; |
| 137 | case VPBlendSC: |
| 138 | case VPReductionEVLSC: |
| 139 | case VPReductionSC: |
| 140 | case VPVectorPointerSC: |
| 141 | case VPWidenCanonicalIVSC: |
| 142 | case VPWidenCastSC: |
| 143 | case VPWidenGEPSC: |
| 144 | case VPWidenIntOrFpInductionSC: |
| 145 | case VPWidenPHISC: |
| 146 | case VPWidenPointerInductionSC: |
| 147 | case VPWidenSC: { |
| 148 | const Instruction *I = |
| 149 | dyn_cast_or_null<Instruction>(Val: getVPSingleValue()->getUnderlyingValue()); |
| 150 | (void)I; |
| 151 | assert((!I || !I->mayReadFromMemory()) && |
| 152 | "underlying instruction may read from memory" ); |
| 153 | return false; |
| 154 | } |
| 155 | default: |
| 156 | // FIXME: Return false if the recipe represents an interleaved store. |
| 157 | return true; |
| 158 | } |
| 159 | } |
| 160 | |
| 161 | bool VPRecipeBase::mayHaveSideEffects() const { |
| 162 | switch (getVPRecipeID()) { |
| 163 | case VPExpressionSC: |
| 164 | return cast<VPExpressionRecipe>(Val: this)->mayHaveSideEffects(); |
| 165 | case VPActiveLaneMaskPHISC: |
| 166 | case VPDerivedIVSC: |
| 167 | case VPFirstOrderRecurrencePHISC: |
| 168 | case VPReductionPHISC: |
| 169 | case VPPredInstPHISC: |
| 170 | case VPVectorEndPointerSC: |
| 171 | return false; |
| 172 | case VPInstructionSC: { |
| 173 | auto *VPI = cast<VPInstruction>(Val: this); |
| 174 | return mayWriteToMemory() || |
| 175 | VPI->getOpcode() == VPInstruction::BranchOnCount || |
| 176 | VPI->getOpcode() == VPInstruction::BranchOnCond || |
| 177 | VPI->getOpcode() == VPInstruction::BranchOnTwoConds; |
| 178 | } |
| 179 | case VPWidenCallSC: { |
| 180 | Function *Fn = cast<VPWidenCallRecipe>(Val: this)->getCalledScalarFunction(); |
| 181 | return mayWriteToMemory() || !Fn->doesNotThrow() || !Fn->willReturn(); |
| 182 | } |
| 183 | case VPWidenIntrinsicSC: |
| 184 | return cast<VPWidenIntrinsicRecipe>(Val: this)->mayHaveSideEffects(); |
| 185 | case VPBlendSC: |
| 186 | case VPReductionEVLSC: |
| 187 | case VPReductionSC: |
| 188 | case VPScalarIVStepsSC: |
| 189 | case VPVectorPointerSC: |
| 190 | case VPWidenCanonicalIVSC: |
| 191 | case VPWidenCastSC: |
| 192 | case VPWidenGEPSC: |
| 193 | case VPWidenIntOrFpInductionSC: |
| 194 | case VPWidenPHISC: |
| 195 | case VPWidenPointerInductionSC: |
| 196 | case VPWidenSC: { |
| 197 | const Instruction *I = |
| 198 | dyn_cast_or_null<Instruction>(Val: getVPSingleValue()->getUnderlyingValue()); |
| 199 | (void)I; |
| 200 | assert((!I || !I->mayHaveSideEffects()) && |
| 201 | "underlying instruction has side-effects" ); |
| 202 | return false; |
| 203 | } |
| 204 | case VPInterleaveEVLSC: |
| 205 | case VPInterleaveSC: |
| 206 | return mayWriteToMemory(); |
| 207 | case VPWidenLoadEVLSC: |
| 208 | case VPWidenLoadSC: |
| 209 | case VPWidenStoreEVLSC: |
| 210 | case VPWidenStoreSC: |
| 211 | assert( |
| 212 | cast<VPWidenMemoryRecipe>(this)->getIngredient().mayHaveSideEffects() == |
| 213 | mayWriteToMemory() && |
| 214 | "mayHaveSideEffects result for ingredient differs from this " |
| 215 | "implementation" ); |
| 216 | return mayWriteToMemory(); |
| 217 | case VPReplicateSC: { |
| 218 | auto *R = cast<VPReplicateRecipe>(Val: this); |
| 219 | return R->getUnderlyingInstr()->mayHaveSideEffects(); |
| 220 | } |
| 221 | default: |
| 222 | return true; |
| 223 | } |
| 224 | } |
| 225 | |
| 226 | void VPRecipeBase::insertBefore(VPRecipeBase *InsertPos) { |
| 227 | assert(!Parent && "Recipe already in some VPBasicBlock" ); |
| 228 | assert(InsertPos->getParent() && |
| 229 | "Insertion position not in any VPBasicBlock" ); |
| 230 | InsertPos->getParent()->insert(Recipe: this, InsertPt: InsertPos->getIterator()); |
| 231 | } |
| 232 | |
| 233 | void VPRecipeBase::insertBefore(VPBasicBlock &BB, |
| 234 | iplist<VPRecipeBase>::iterator I) { |
| 235 | assert(!Parent && "Recipe already in some VPBasicBlock" ); |
| 236 | assert(I == BB.end() || I->getParent() == &BB); |
| 237 | BB.insert(Recipe: this, InsertPt: I); |
| 238 | } |
| 239 | |
| 240 | void VPRecipeBase::insertAfter(VPRecipeBase *InsertPos) { |
| 241 | assert(!Parent && "Recipe already in some VPBasicBlock" ); |
| 242 | assert(InsertPos->getParent() && |
| 243 | "Insertion position not in any VPBasicBlock" ); |
| 244 | InsertPos->getParent()->insert(Recipe: this, InsertPt: std::next(x: InsertPos->getIterator())); |
| 245 | } |
| 246 | |
| 247 | void VPRecipeBase::removeFromParent() { |
| 248 | assert(getParent() && "Recipe not in any VPBasicBlock" ); |
| 249 | getParent()->getRecipeList().remove(IT: getIterator()); |
| 250 | Parent = nullptr; |
| 251 | } |
| 252 | |
| 253 | iplist<VPRecipeBase>::iterator VPRecipeBase::eraseFromParent() { |
| 254 | assert(getParent() && "Recipe not in any VPBasicBlock" ); |
| 255 | return getParent()->getRecipeList().erase(where: getIterator()); |
| 256 | } |
| 257 | |
| 258 | void VPRecipeBase::moveAfter(VPRecipeBase *InsertPos) { |
| 259 | removeFromParent(); |
| 260 | insertAfter(InsertPos); |
| 261 | } |
| 262 | |
| 263 | void VPRecipeBase::moveBefore(VPBasicBlock &BB, |
| 264 | iplist<VPRecipeBase>::iterator I) { |
| 265 | removeFromParent(); |
| 266 | insertBefore(BB, I); |
| 267 | } |
| 268 | |
| 269 | InstructionCost VPRecipeBase::cost(ElementCount VF, VPCostContext &Ctx) { |
| 270 | // Get the underlying instruction for the recipe, if there is one. It is used |
| 271 | // to |
| 272 | // * decide if cost computation should be skipped for this recipe, |
| 273 | // * apply forced target instruction cost. |
| 274 | Instruction *UI = nullptr; |
| 275 | if (auto *S = dyn_cast<VPSingleDefRecipe>(Val: this)) |
| 276 | UI = dyn_cast_or_null<Instruction>(Val: S->getUnderlyingValue()); |
| 277 | else if (auto *IG = dyn_cast<VPInterleaveBase>(Val: this)) |
| 278 | UI = IG->getInsertPos(); |
| 279 | else if (auto *WidenMem = dyn_cast<VPWidenMemoryRecipe>(Val: this)) |
| 280 | UI = &WidenMem->getIngredient(); |
| 281 | |
| 282 | InstructionCost RecipeCost; |
| 283 | if (UI && Ctx.skipCostComputation(UI, IsVector: VF.isVector())) { |
| 284 | RecipeCost = 0; |
| 285 | } else { |
| 286 | RecipeCost = computeCost(VF, Ctx); |
| 287 | if (ForceTargetInstructionCost.getNumOccurrences() > 0 && |
| 288 | RecipeCost.isValid()) { |
| 289 | if (UI) |
| 290 | RecipeCost = InstructionCost(ForceTargetInstructionCost); |
| 291 | else |
| 292 | RecipeCost = InstructionCost(0); |
| 293 | } |
| 294 | } |
| 295 | |
| 296 | LLVM_DEBUG({ |
| 297 | dbgs() << "Cost of " << RecipeCost << " for VF " << VF << ": " ; |
| 298 | dump(); |
| 299 | }); |
| 300 | return RecipeCost; |
| 301 | } |
| 302 | |
| 303 | InstructionCost VPRecipeBase::computeCost(ElementCount VF, |
| 304 | VPCostContext &Ctx) const { |
| 305 | llvm_unreachable("subclasses should implement computeCost" ); |
| 306 | } |
| 307 | |
| 308 | bool VPRecipeBase::isPhi() const { |
| 309 | return (getVPRecipeID() >= VPFirstPHISC && getVPRecipeID() <= VPLastPHISC) || |
| 310 | isa<VPPhi, VPIRPhi>(Val: this); |
| 311 | } |
| 312 | |
| 313 | bool VPRecipeBase::isScalarCast() const { |
| 314 | auto *VPI = dyn_cast<VPInstruction>(Val: this); |
| 315 | return VPI && Instruction::isCast(Opcode: VPI->getOpcode()); |
| 316 | } |
| 317 | |
| 318 | void VPIRFlags::intersectFlags(const VPIRFlags &Other) { |
| 319 | assert(OpType == Other.OpType && "OpType must match" ); |
| 320 | switch (OpType) { |
| 321 | case OperationType::OverflowingBinOp: |
| 322 | WrapFlags.HasNUW &= Other.WrapFlags.HasNUW; |
| 323 | WrapFlags.HasNSW &= Other.WrapFlags.HasNSW; |
| 324 | break; |
| 325 | case OperationType::Trunc: |
| 326 | TruncFlags.HasNUW &= Other.TruncFlags.HasNUW; |
| 327 | TruncFlags.HasNSW &= Other.TruncFlags.HasNSW; |
| 328 | break; |
| 329 | case OperationType::DisjointOp: |
| 330 | DisjointFlags.IsDisjoint &= Other.DisjointFlags.IsDisjoint; |
| 331 | break; |
| 332 | case OperationType::PossiblyExactOp: |
| 333 | ExactFlags.IsExact &= Other.ExactFlags.IsExact; |
| 334 | break; |
| 335 | case OperationType::GEPOp: |
| 336 | GEPFlags &= Other.GEPFlags; |
| 337 | break; |
| 338 | case OperationType::FPMathOp: |
| 339 | case OperationType::FCmp: |
| 340 | assert((OpType != OperationType::FCmp || |
| 341 | FCmpFlags.Pred == Other.FCmpFlags.Pred) && |
| 342 | "Cannot drop CmpPredicate" ); |
| 343 | getFMFsRef().NoNaNs &= Other.getFMFsRef().NoNaNs; |
| 344 | getFMFsRef().NoInfs &= Other.getFMFsRef().NoInfs; |
| 345 | break; |
| 346 | case OperationType::NonNegOp: |
| 347 | NonNegFlags.NonNeg &= Other.NonNegFlags.NonNeg; |
| 348 | break; |
| 349 | case OperationType::Cmp: |
| 350 | assert(CmpPredicate == Other.CmpPredicate && "Cannot drop CmpPredicate" ); |
| 351 | break; |
| 352 | case OperationType::ReductionOp: |
| 353 | assert(ReductionFlags.Kind == Other.ReductionFlags.Kind && |
| 354 | "Cannot change RecurKind" ); |
| 355 | assert(ReductionFlags.IsOrdered == Other.ReductionFlags.IsOrdered && |
| 356 | "Cannot change IsOrdered" ); |
| 357 | assert(ReductionFlags.IsInLoop == Other.ReductionFlags.IsInLoop && |
| 358 | "Cannot change IsInLoop" ); |
| 359 | getFMFsRef().NoNaNs &= Other.getFMFsRef().NoNaNs; |
| 360 | getFMFsRef().NoInfs &= Other.getFMFsRef().NoInfs; |
| 361 | break; |
| 362 | case OperationType::Other: |
| 363 | assert(AllFlags == Other.AllFlags && "Cannot drop other flags" ); |
| 364 | break; |
| 365 | } |
| 366 | } |
| 367 | |
| 368 | FastMathFlags VPIRFlags::getFastMathFlags() const { |
| 369 | assert((OpType == OperationType::FPMathOp || OpType == OperationType::FCmp || |
| 370 | OpType == OperationType::ReductionOp) && |
| 371 | "recipe doesn't have fast math flags" ); |
| 372 | const FastMathFlagsTy &F = getFMFsRef(); |
| 373 | FastMathFlags Res; |
| 374 | Res.setAllowReassoc(F.AllowReassoc); |
| 375 | Res.setNoNaNs(F.NoNaNs); |
| 376 | Res.setNoInfs(F.NoInfs); |
| 377 | Res.setNoSignedZeros(F.NoSignedZeros); |
| 378 | Res.setAllowReciprocal(F.AllowReciprocal); |
| 379 | Res.setAllowContract(F.AllowContract); |
| 380 | Res.setApproxFunc(F.ApproxFunc); |
| 381 | return Res; |
| 382 | } |
| 383 | |
| 384 | #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) |
| 385 | void VPSingleDefRecipe::dump() const { VPRecipeBase::dump(); } |
| 386 | |
| 387 | void VPRecipeBase::print(raw_ostream &O, const Twine &Indent, |
| 388 | VPSlotTracker &SlotTracker) const { |
| 389 | printRecipe(O, Indent, SlotTracker); |
| 390 | if (auto DL = getDebugLoc()) { |
| 391 | O << ", !dbg " ; |
| 392 | DL.print(O); |
| 393 | } |
| 394 | |
| 395 | if (auto *Metadata = dyn_cast<VPIRMetadata>(this)) |
| 396 | Metadata->print(O, SlotTracker); |
| 397 | } |
| 398 | #endif |
| 399 | |
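| | // Return the unroll-part operand of \p U (operand PartOpIdx) if one has been |
| | // appended, i.e. if \p U has exactly PartOpIdx + 1 operands; otherwise return |
| | // nullptr. |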
| 400 | template <unsigned PartOpIdx> |
| 401 | VPValue * |
| 402 | VPUnrollPartAccessor<PartOpIdx>::getUnrollPartOperand(const VPUser &U) const { |
| 403 | if (U.getNumOperands() == PartOpIdx + 1) |
| 404 | return U.getOperand(N: PartOpIdx); |
| 405 | return nullptr; |
| 406 | } |
| 407 | |
| 408 | template <unsigned PartOpIdx> |
| 409 | unsigned VPUnrollPartAccessor<PartOpIdx>::getUnrollPart(const VPUser &U) const { |
| 410 | if (auto *UnrollPartOp = getUnrollPartOperand(U)) |
| 411 | return cast<VPConstantInt>(UnrollPartOp)->getZExtValue(); |
| 412 | return 0; |
| 413 | } |
| 414 | |
| 415 | namespace llvm { |
| 416 | template class VPUnrollPartAccessor<1>; |
| 417 | template class VPUnrollPartAccessor<2>; |
| 418 | template class VPUnrollPartAccessor<3>; |
| 419 | } // namespace llvm |
| 420 | |
| 421 | VPInstruction::VPInstruction(unsigned Opcode, ArrayRef<VPValue *> Operands, |
| 422 | const VPIRFlags &Flags, const VPIRMetadata &MD, |
| 423 | DebugLoc DL, const Twine &Name) |
| 424 | : VPRecipeWithIRFlags(VPRecipeBase::VPInstructionSC, Operands, Flags, DL), |
| 425 | VPIRMetadata(MD), Opcode(Opcode), Name(Name.str()) { |
| 426 | assert(flagsValidForOpcode(getOpcode()) && |
| 427 | "Set flags not supported for the provided opcode" ); |
| 428 | assert(hasRequiredFlagsForOpcode(getOpcode()) && |
| 429 | "Opcode requires specific flags to be set" ); |
| 430 | assert((getNumOperandsForOpcode(Opcode) == -1u || |
| 431 | getNumOperandsForOpcode(Opcode) == getNumOperands()) && |
| 432 | "number of operands does not match opcode" ); |
| 433 | } |
| 434 | |
| 435 | #ifndef NDEBUG |
| 436 | unsigned VPInstruction::getNumOperandsForOpcode(unsigned Opcode) { |
| 437 | if (Instruction::isUnaryOp(Opcode) || Instruction::isCast(Opcode)) |
| 438 | return 1; |
| 439 | |
| 440 | if (Instruction::isBinaryOp(Opcode)) |
| 441 | return 2; |
| 442 | |
| 443 | switch (Opcode) { |
| 444 | case VPInstruction::StepVector: |
| 445 | case VPInstruction::VScale: |
| 446 | return 0; |
| 447 | case Instruction::Alloca: |
| 448 | case Instruction::ExtractValue: |
| 449 | case Instruction::Freeze: |
| 450 | case Instruction::Load: |
| 451 | case VPInstruction::BranchOnCond: |
| 452 | case VPInstruction::Broadcast: |
| 453 | case VPInstruction::BuildStructVector: |
| 454 | case VPInstruction::BuildVector: |
| 455 | case VPInstruction::CalculateTripCountMinusVF: |
| 456 | case VPInstruction::CanonicalIVIncrementForPart: |
| 457 | case VPInstruction::ComputeReductionResult: |
| 458 | case VPInstruction::ExplicitVectorLength: |
| 459 | case VPInstruction::ExtractLastLane: |
| 460 | case VPInstruction::ExtractLastPart: |
| 461 | case VPInstruction::ExtractPenultimateElement: |
| 462 | case VPInstruction::Not: |
| 463 | case VPInstruction::ResumeForEpilogue: |
| 464 | case VPInstruction::Reverse: |
| 465 | case VPInstruction::Unpack: |
| 466 | return 1; |
| 467 | case Instruction::ICmp: |
| 468 | case Instruction::FCmp: |
| 469 | case Instruction::ExtractElement: |
| 470 | case Instruction::Store: |
| 471 | case VPInstruction::BranchOnCount: |
| 472 | case VPInstruction::BranchOnTwoConds: |
| 473 | case VPInstruction::FirstOrderRecurrenceSplice: |
| 474 | case VPInstruction::LogicalAnd: |
| 475 | case VPInstruction::PtrAdd: |
| 476 | case VPInstruction::WidePtrAdd: |
| 477 | case VPInstruction::WideIVStep: |
| 478 | return 2; |
| 479 | case Instruction::Select: |
| 480 | case VPInstruction::ActiveLaneMask: |
| 481 | case VPInstruction::ComputeAnyOfResult: |
| 482 | case VPInstruction::ReductionStartVector: |
| 483 | case VPInstruction::ExtractLastActive: |
| 484 | return 3; |
| 485 | case Instruction::Call: |
| 486 | case Instruction::GetElementPtr: |
| 487 | case Instruction::PHI: |
| 488 | case Instruction::Switch: |
| 489 | case VPInstruction::AnyOf: |
| 490 | case VPInstruction::FirstActiveLane: |
| 491 | case VPInstruction::LastActiveLane: |
| 492 | case VPInstruction::SLPLoad: |
| 493 | case VPInstruction::SLPStore: |
| 494 | case VPInstruction::ExtractLane: |
| 495 | // Cannot determine the number of operands from the opcode. |
| 496 | return -1u; |
| 497 | } |
| 498 | llvm_unreachable("all cases should be handled above" ); |
| 499 | } |
| 500 | #endif |
| 501 | |
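| | // PtrAdd is the only opcode that generates one scalar per lane, and only when |
| | // lanes other than the first are used. |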
| 502 | bool VPInstruction::doesGeneratePerAllLanes() const { |
| 503 | return Opcode == VPInstruction::PtrAdd && !vputils::onlyFirstLaneUsed(Def: this); |
| 504 | } |
| 505 | |
| 506 | bool VPInstruction::canGenerateScalarForFirstLane() const { |
| 507 | if (Instruction::isBinaryOp(Opcode: getOpcode()) || Instruction::isCast(Opcode: getOpcode())) |
| 508 | return true; |
| 509 | if (isSingleScalar() || isVectorToScalar()) |
| 510 | return true; |
| 511 | switch (Opcode) { |
| 512 | case Instruction::Freeze: |
| 513 | case Instruction::ICmp: |
| 514 | case Instruction::PHI: |
| 515 | case Instruction::Select: |
| 516 | case VPInstruction::BranchOnCond: |
| 517 | case VPInstruction::BranchOnTwoConds: |
| 518 | case VPInstruction::BranchOnCount: |
| 519 | case VPInstruction::CalculateTripCountMinusVF: |
| 520 | case VPInstruction::CanonicalIVIncrementForPart: |
| 521 | case VPInstruction::PtrAdd: |
| 522 | case VPInstruction::ExplicitVectorLength: |
| 523 | case VPInstruction::AnyOf: |
| 524 | case VPInstruction::Not: |
| 525 | return true; |
| 526 | default: |
| 527 | return false; |
| 528 | } |
| 529 | } |
| 530 | |
| 531 | Value *VPInstruction::generate(VPTransformState &State) { |
| 532 | IRBuilderBase &Builder = State.Builder; |
| 533 | |
| 534 | if (Instruction::isBinaryOp(Opcode: getOpcode())) { |
| 535 | bool OnlyFirstLaneUsed = vputils::onlyFirstLaneUsed(Def: this); |
| 536 | Value *A = State.get(Def: getOperand(N: 0), IsScalar: OnlyFirstLaneUsed); |
| 537 | Value *B = State.get(Def: getOperand(N: 1), IsScalar: OnlyFirstLaneUsed); |
| 538 | auto *Res = |
| 539 | Builder.CreateBinOp(Opc: (Instruction::BinaryOps)getOpcode(), LHS: A, RHS: B, Name); |
| 540 | if (auto *I = dyn_cast<Instruction>(Val: Res)) |
| 541 | applyFlags(I&: *I); |
| 542 | return Res; |
| 543 | } |
| 544 | |
| 545 | switch (getOpcode()) { |
| 546 | case VPInstruction::Not: { |
| 547 | bool OnlyFirstLaneUsed = vputils::onlyFirstLaneUsed(Def: this); |
| 548 | Value *A = State.get(Def: getOperand(N: 0), IsScalar: OnlyFirstLaneUsed); |
| 549 | return Builder.CreateNot(V: A, Name); |
| 550 | } |
| 551 | case Instruction::ExtractElement: { |
| 552 | assert(State.VF.isVector() && "Only extract elements from vectors" ); |
| 553 | if (auto *Idx = dyn_cast<VPConstantInt>(Val: getOperand(N: 1))) |
| 554 | return State.get(Def: getOperand(N: 0), Lane: VPLane(Idx->getZExtValue())); |
| 555 | Value *Vec = State.get(Def: getOperand(N: 0)); |
| 556 | Value *Idx = State.get(Def: getOperand(N: 1), /*IsScalar=*/true); |
| 557 | return Builder.CreateExtractElement(Vec, Idx, Name); |
| 558 | } |
| 559 | case Instruction::Freeze: { |
| 560 | Value *Op = State.get(Def: getOperand(N: 0), IsScalar: vputils::onlyFirstLaneUsed(Def: this)); |
| 561 | return Builder.CreateFreeze(V: Op, Name); |
| 562 | } |
| 563 | case Instruction::FCmp: |
| 564 | case Instruction::ICmp: { |
| 565 | bool OnlyFirstLaneUsed = vputils::onlyFirstLaneUsed(Def: this); |
| 566 | Value *A = State.get(Def: getOperand(N: 0), IsScalar: OnlyFirstLaneUsed); |
| 567 | Value *B = State.get(Def: getOperand(N: 1), IsScalar: OnlyFirstLaneUsed); |
| 568 | return Builder.CreateCmp(Pred: getPredicate(), LHS: A, RHS: B, Name); |
| 569 | } |
| 570 | case Instruction::PHI: { |
| 571 | llvm_unreachable("should be handled by VPPhi::execute" ); |
| 572 | } |
| 573 | case Instruction::Select: { |
| 574 | bool OnlyFirstLaneUsed = vputils::onlyFirstLaneUsed(Def: this); |
| 575 | Value *Cond = |
| 576 | State.get(Def: getOperand(N: 0), |
| 577 | IsScalar: OnlyFirstLaneUsed || vputils::isSingleScalar(VPV: getOperand(N: 0))); |
| 578 | Value *Op1 = State.get(Def: getOperand(N: 1), IsScalar: OnlyFirstLaneUsed); |
| 579 | Value *Op2 = State.get(Def: getOperand(N: 2), IsScalar: OnlyFirstLaneUsed); |
| 580 | return Builder.CreateSelect(C: Cond, True: Op1, False: Op2, Name); |
| 581 | } |
| 582 | case VPInstruction::ActiveLaneMask: { |
| 583 | // Get first lane of vector induction variable. |
| 584 | Value *VIVElem0 = State.get(Def: getOperand(N: 0), Lane: VPLane(0)); |
| 585 | // Get the original loop tripcount. |
| 586 | Value *ScalarTC = State.get(Def: getOperand(N: 1), Lane: VPLane(0)); |
| 587 | |
| 588 | // If this part of the active lane mask is scalar, generate the CMP directly |
| 589 | // to avoid unnecessary extracts. |
| 590 | if (State.VF.isScalar()) |
| 591 | return Builder.CreateCmp(Pred: CmpInst::Predicate::ICMP_ULT, LHS: VIVElem0, RHS: ScalarTC, |
| 592 | Name); |
| 593 | |
| 594 | ElementCount EC = State.VF.multiplyCoefficientBy( |
| 595 | RHS: cast<VPConstantInt>(Val: getOperand(N: 2))->getZExtValue()); |
| 596 | auto *PredTy = VectorType::get(ElementType: Builder.getInt1Ty(), EC); |
| 597 | return Builder.CreateIntrinsic(ID: Intrinsic::get_active_lane_mask, |
| 598 | Types: {PredTy, ScalarTC->getType()}, |
| 599 | Args: {VIVElem0, ScalarTC}, FMFSource: nullptr, Name); |
| 600 | } |
| 601 | case VPInstruction::FirstOrderRecurrenceSplice: { |
| 602 | // Generate code to combine the previous and current values in vector v3. |
| 603 | // |
| 604 | // vector.ph: |
| 605 | // v_init = vector(..., ..., ..., a[-1]) |
| 606 | // br vector.body |
| 607 | // |
| 608 | // vector.body |
| 609 | // i = phi [0, vector.ph], [i+4, vector.body] |
| 610 | // v1 = phi [v_init, vector.ph], [v2, vector.body] |
| 611 | // v2 = a[i, i+1, i+2, i+3]; |
| 612 | // v3 = vector(v1(3), v2(0, 1, 2)) |
| 613 | |
| 614 | auto *V1 = State.get(Def: getOperand(N: 0)); |
| 615 | if (!V1->getType()->isVectorTy()) |
| 616 | return V1; |
| 617 | Value *V2 = State.get(Def: getOperand(N: 1)); |
| 618 | return Builder.CreateVectorSpliceRight(V1, V2, Offset: 1, Name); |
| 619 | } |
| 620 | case VPInstruction::CalculateTripCountMinusVF: { |
| 621 | unsigned UF = getParent()->getPlan()->getUF(); |
| 622 | Value *ScalarTC = State.get(Def: getOperand(N: 0), Lane: VPLane(0)); |
| 623 | Value *Step = createStepForVF(B&: Builder, Ty: ScalarTC->getType(), VF: State.VF, Step: UF); |
| 624 | Value *Sub = Builder.CreateSub(LHS: ScalarTC, RHS: Step); |
| 625 | Value *Cmp = Builder.CreateICmp(P: CmpInst::Predicate::ICMP_UGT, LHS: ScalarTC, RHS: Step); |
| 626 | Value *Zero = ConstantInt::getNullValue(Ty: ScalarTC->getType()); |
| 627 | return Builder.CreateSelect(C: Cmp, True: Sub, False: Zero); |
| 628 | } |
| 629 | case VPInstruction::ExplicitVectorLength: { |
| 630 | // TODO: Restructure this code with an explicit remainder loop, vsetvli can |
| 631 | // be outside of the main loop. |
| 632 | Value *AVL = State.get(Def: getOperand(N: 0), /*IsScalar*/ true); |
| 633 | // Compute EVL |
| 634 | assert(AVL->getType()->isIntegerTy() && |
| 635 | "Requested vector length should be an integer." ); |
| 636 | |
| 637 | assert(State.VF.isScalable() && "Expected scalable vector factor." ); |
| 638 | Value *VFArg = Builder.getInt32(C: State.VF.getKnownMinValue()); |
| 639 | |
| 640 | Value *EVL = Builder.CreateIntrinsic( |
| 641 | RetTy: Builder.getInt32Ty(), ID: Intrinsic::experimental_get_vector_length, |
| 642 | Args: {AVL, VFArg, Builder.getTrue()}); |
| 643 | return EVL; |
| 644 | } |
| 645 | case VPInstruction::CanonicalIVIncrementForPart: { |
| 646 | unsigned Part = getUnrollPart(U: *this); |
| 647 | auto *IV = State.get(Def: getOperand(N: 0), Lane: VPLane(0)); |
| 648 | assert(Part != 0 && "Must have a positive part" ); |
| 649 | // The canonical IV is incremented by the vectorization factor (num of |
| 650 | // SIMD elements) times the unroll part. |
| 651 | Value *Step = createStepForVF(B&: Builder, Ty: IV->getType(), VF: State.VF, Step: Part); |
| 652 | return Builder.CreateAdd(LHS: IV, RHS: Step, Name, HasNUW: hasNoUnsignedWrap(), |
| 653 | HasNSW: hasNoSignedWrap()); |
| 654 | } |
| 655 | case VPInstruction::BranchOnCond: { |
| 656 | Value *Cond = State.get(Def: getOperand(N: 0), Lane: VPLane(0)); |
| 657 | // Replace the temporary unreachable terminator with a new conditional |
| 658 | // branch, hooking it up to backward destination for latch blocks now, and |
| 659 | // to forward destination(s) later when they are created. |
| 660 | // Second successor may be backwards - iff it is already in VPBB2IRBB. |
| 661 | VPBasicBlock *SecondVPSucc = |
| 662 | cast<VPBasicBlock>(Val: getParent()->getSuccessors()[1]); |
| 663 | BasicBlock *SecondIRSucc = State.CFG.VPBB2IRBB.lookup(Val: SecondVPSucc); |
| 664 | BasicBlock *IRBB = State.CFG.VPBB2IRBB[getParent()]; |
| 665 | auto *Br = Builder.CreateCondBr(Cond, True: IRBB, False: SecondIRSucc); |
| 666 | // First successor is always forward, reset it to nullptr. |
| 667 | Br->setSuccessor(idx: 0, NewSucc: nullptr); |
| 668 | IRBB->getTerminator()->eraseFromParent(); |
| 669 | applyMetadata(I&: *Br); |
| 670 | return Br; |
| 671 | } |
| 672 | case VPInstruction::Broadcast: { |
| 673 | return Builder.CreateVectorSplat( |
| 674 | EC: State.VF, V: State.get(Def: getOperand(N: 0), /*IsScalar*/ true), Name: "broadcast" ); |
| 675 | } |
| 676 | case VPInstruction::BuildStructVector: { |
| 677 | // For struct types, we need to build a new 'wide' struct type, where each |
| 678 | // element is widened, i.e., we create a struct of vectors. |
| 679 | auto *StructTy = |
| 680 | cast<StructType>(Val: State.TypeAnalysis.inferScalarType(V: getOperand(N: 0))); |
| 681 | Value *Res = PoisonValue::get(T: toVectorizedTy(Ty: StructTy, EC: State.VF)); |
| 682 | for (const auto &[LaneIndex, Op] : enumerate(First: operands())) { |
| 683 | for (unsigned FieldIndex = 0; FieldIndex != StructTy->getNumElements(); |
| 684 | FieldIndex++) { |
| 685 | Value *ScalarValue = |
| 686 | Builder.CreateExtractValue(Agg: State.get(Def: Op, IsScalar: true), Idxs: FieldIndex); |
| 687 | Value *VectorValue = Builder.CreateExtractValue(Agg: Res, Idxs: FieldIndex); |
| 688 | VectorValue = |
| 689 | Builder.CreateInsertElement(Vec: VectorValue, NewElt: ScalarValue, Idx: LaneIndex); |
| 690 | Res = Builder.CreateInsertValue(Agg: Res, Val: VectorValue, Idxs: FieldIndex); |
| 691 | } |
| 692 | } |
| 693 | return Res; |
| 694 | } |
| 695 | case VPInstruction::BuildVector: { |
| 696 | auto *ScalarTy = State.TypeAnalysis.inferScalarType(V: getOperand(N: 0)); |
| 697 | auto NumOfElements = ElementCount::getFixed(MinVal: getNumOperands()); |
| 698 | Value *Res = PoisonValue::get(T: toVectorizedTy(Ty: ScalarTy, EC: NumOfElements)); |
| 699 | for (const auto &[Idx, Op] : enumerate(First: operands())) |
| 700 | Res = Builder.CreateInsertElement(Vec: Res, NewElt: State.get(Def: Op, IsScalar: true), |
| 701 | Idx: Builder.getInt32(C: Idx)); |
| 702 | return Res; |
| 703 | } |
| 704 | case VPInstruction::ReductionStartVector: { |
| 705 | if (State.VF.isScalar()) |
| 706 | return State.get(Def: getOperand(N: 0), IsScalar: true); |
| 707 | IRBuilderBase::FastMathFlagGuard FMFG(Builder); |
| 708 | Builder.setFastMathFlags(getFastMathFlags()); |
| 709 | // If this start vector is scaled then it should produce a vector with fewer |
| 710 | // elements than the VF. |
| 711 | ElementCount VF = State.VF.divideCoefficientBy( |
| 712 | RHS: cast<VPConstantInt>(Val: getOperand(N: 2))->getZExtValue()); |
| 713 | auto *Iden = Builder.CreateVectorSplat(EC: VF, V: State.get(Def: getOperand(N: 1), IsScalar: true)); |
| 714 | return Builder.CreateInsertElement(Vec: Iden, NewElt: State.get(Def: getOperand(N: 0), IsScalar: true), |
| 715 | Idx: Builder.getInt32(C: 0)); |
| 716 | } |
| 717 | case VPInstruction::ComputeAnyOfResult: { |
| 718 | Value *Start = State.get(Def: getOperand(N: 0), Lane: VPLane(0)); |
| 719 | Value *NewVal = State.get(Def: getOperand(N: 1), Lane: VPLane(0)); |
| 720 | Value *ReducedResult = State.get(Def: getOperand(N: 2)); |
| 721 | for (unsigned Idx = 3; Idx < getNumOperands(); ++Idx) |
| 722 | ReducedResult = |
| 723 | Builder.CreateBinOp(Opc: Instruction::Or, LHS: State.get(Def: getOperand(N: Idx)), |
| 724 | RHS: ReducedResult, Name: "bin.rdx" ); |
| 725 | // If any predicate is true it means that we want to select the new value. |
| 726 | if (ReducedResult->getType()->isVectorTy()) |
| 727 | ReducedResult = Builder.CreateOrReduce(Src: ReducedResult); |
| 728 | // The compares in the loop may yield poison, which propagates through the |
| 729 | // bitwise ORs. Freeze it here before the condition is used. |
| 730 | ReducedResult = Builder.CreateFreeze(V: ReducedResult); |
| 731 | return Builder.CreateSelect(C: ReducedResult, True: NewVal, False: Start, Name: "rdx.select" ); |
| 732 | } |
| 733 | case VPInstruction::ComputeReductionResult: { |
| 734 | RecurKind RK = getRecurKind(); |
| 735 | bool IsOrdered = isReductionOrdered(); |
| 736 | bool IsInLoop = isReductionInLoop(); |
| 737 | assert(!RecurrenceDescriptor::isFindIVRecurrenceKind(RK) && |
| 738 | "FindIV should use min/max reduction kinds" ); |
| 739 | |
| 740 | // The recipe may have multiple operands to be reduced together. |
| 741 | unsigned NumOperandsToReduce = getNumOperands(); |
| 742 | VectorParts RdxParts(NumOperandsToReduce); |
| 743 | for (unsigned Part = 0; Part < NumOperandsToReduce; ++Part) |
| 744 | RdxParts[Part] = State.get(Def: getOperand(N: Part), IsScalar: IsInLoop); |
| 745 | |
| 746 | IRBuilderBase::FastMathFlagGuard FMFG(Builder); |
| 747 | if (hasFastMathFlags()) |
| 748 | Builder.setFastMathFlags(getFastMathFlags()); |
| 749 | |
| 750 | // Reduce multiple operands into one. |
| 751 | Value *ReducedPartRdx = RdxParts[0]; |
| 752 | if (IsOrdered) { |
| 753 | ReducedPartRdx = RdxParts[NumOperandsToReduce - 1]; |
| 754 | } else { |
| 755 | // Floating-point operations should have some FMF to enable the reduction. |
| 756 | for (unsigned Part = 1; Part < NumOperandsToReduce; ++Part) { |
| 757 | Value *RdxPart = RdxParts[Part]; |
| 758 | if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind: RK)) |
| 759 | ReducedPartRdx = createMinMaxOp(Builder, RK, Left: ReducedPartRdx, Right: RdxPart); |
| 760 | else { |
| 761 | // For sub-recurrences, each part's reduction variable is already |
| 762 | // negative, so we need to do: reduce.add(-acc_uf0 + -acc_uf1) |
| 763 | Instruction::BinaryOps Opcode = |
| 764 | RK == RecurKind::Sub |
| 765 | ? Instruction::Add |
| 766 | : (Instruction::BinaryOps)RecurrenceDescriptor::getOpcode(Kind: RK); |
| 767 | ReducedPartRdx = |
| 768 | Builder.CreateBinOp(Opc: Opcode, LHS: RdxPart, RHS: ReducedPartRdx, Name: "bin.rdx" ); |
| 769 | } |
| 770 | } |
| 771 | } |
| 772 | |
| 773 | // Create the reduction after the loop. Note that inloop reductions create |
| 774 | // the target reduction in the loop using a Reduction recipe. |
| 775 | if (State.VF.isVector() && !IsInLoop) { |
| 776 | // TODO: Support in-order reductions based on the recurrence descriptor. |
| 777 | // All ops in the reduction inherit fast-math-flags from the recurrence |
| 778 | // descriptor. |
| 779 | ReducedPartRdx = createSimpleReduction(B&: Builder, Src: ReducedPartRdx, RdxKind: RK); |
| 780 | } |
| 781 | |
| 782 | return ReducedPartRdx; |
| 783 | } |
| 784 | case VPInstruction::ExtractLastLane: |
| 785 | case VPInstruction::ExtractPenultimateElement: { |
| 786 | unsigned Offset = |
| 787 | getOpcode() == VPInstruction::ExtractPenultimateElement ? 2 : 1; |
| 788 | Value *Res; |
| 789 | if (State.VF.isVector()) { |
| 790 | assert(Offset <= State.VF.getKnownMinValue() && |
| 791 | "invalid offset to extract from" ); |
| 792 | // Extract lane VF - Offset from the operand. |
| 793 | Res = State.get(Def: getOperand(N: 0), Lane: VPLane::getLaneFromEnd(VF: State.VF, Offset)); |
| 794 | } else { |
| 795 | // TODO: Remove ExtractLastLane for scalar VFs. |
| 796 | assert(Offset <= 1 && "invalid offset to extract from" ); |
| 797 | Res = State.get(Def: getOperand(N: 0)); |
| 798 | } |
| 799 | if (isa<ExtractElementInst>(Val: Res)) |
| 800 | Res->setName(Name); |
| 801 | return Res; |
| 802 | } |
| 803 | case VPInstruction::LogicalAnd: { |
| 804 | Value *A = State.get(Def: getOperand(N: 0)); |
| 805 | Value *B = State.get(Def: getOperand(N: 1)); |
| 806 | return Builder.CreateLogicalAnd(Cond1: A, Cond2: B, Name); |
| 807 | } |
| 808 | case VPInstruction::PtrAdd: { |
| 809 | assert(vputils::onlyFirstLaneUsed(this) && |
| 810 | "can only generate first lane for PtrAdd" ); |
| 811 | Value *Ptr = State.get(Def: getOperand(N: 0), Lane: VPLane(0)); |
| 812 | Value *Addend = State.get(Def: getOperand(N: 1), Lane: VPLane(0)); |
| 813 | return Builder.CreatePtrAdd(Ptr, Offset: Addend, Name, NW: getGEPNoWrapFlags()); |
| 814 | } |
| 815 | case VPInstruction::WidePtrAdd: { |
| 816 | Value *Ptr = |
| 817 | State.get(Def: getOperand(N: 0), IsScalar: vputils::isSingleScalar(VPV: getOperand(N: 0))); |
| 818 | Value *Addend = State.get(Def: getOperand(N: 1)); |
| 819 | return Builder.CreatePtrAdd(Ptr, Offset: Addend, Name, NW: getGEPNoWrapFlags()); |
| 820 | } |
| 821 | case VPInstruction::AnyOf: { |
| 822 | Value *Res = Builder.CreateFreeze(V: State.get(Def: getOperand(N: 0))); |
| 823 | for (VPValue *Op : drop_begin(RangeOrContainer: operands())) |
| 824 | Res = Builder.CreateOr(LHS: Res, RHS: Builder.CreateFreeze(V: State.get(Def: Op))); |
| 825 | return State.VF.isScalar() ? Res : Builder.CreateOrReduce(Src: Res); |
| 826 | } |
| 827 | case VPInstruction::ExtractLane: { |
| 828 | assert(getNumOperands() != 2 && "ExtractLane from single source should be " |
| 829 | "simplified to ExtractElement." ); |
| 830 | Value *LaneToExtract = State.get(Def: getOperand(N: 0), IsScalar: true); |
| 831 | Type *IdxTy = State.TypeAnalysis.inferScalarType(V: getOperand(N: 0)); |
| 832 | Value *Res = nullptr; |
| 833 | Value *RuntimeVF = getRuntimeVF(B&: Builder, Ty: IdxTy, VF: State.VF); |
| 834 | |
| 835 | for (unsigned Idx = 1; Idx != getNumOperands(); ++Idx) { |
| 836 | Value *VectorStart = |
| 837 | Builder.CreateMul(LHS: RuntimeVF, RHS: ConstantInt::get(Ty: IdxTy, V: Idx - 1)); |
| 838 | Value *VectorIdx = Idx == 1 |
| 839 | ? LaneToExtract |
| 840 | : Builder.CreateSub(LHS: LaneToExtract, RHS: VectorStart); |
| 841 | Value *Ext = State.VF.isScalar() |
| 842 | ? State.get(Def: getOperand(N: Idx)) |
| 843 | : Builder.CreateExtractElement( |
| 844 | Vec: State.get(Def: getOperand(N: Idx)), Idx: VectorIdx); |
| 845 | if (Res) { |
| 846 | Value *Cmp = Builder.CreateICmpUGE(LHS: LaneToExtract, RHS: VectorStart); |
| 847 | Res = Builder.CreateSelect(C: Cmp, True: Ext, False: Res); |
| 848 | } else { |
| 849 | Res = Ext; |
| 850 | } |
| 851 | } |
| 852 | return Res; |
| 853 | } |
| 854 | case VPInstruction::FirstActiveLane: { |
| 855 | if (getNumOperands() == 1) { |
| 856 | Value *Mask = State.get(Def: getOperand(N: 0)); |
| 857 | return Builder.CreateCountTrailingZeroElems(ResTy: Builder.getInt64Ty(), Mask, |
| 858 | /*ZeroIsPoison=*/false, Name); |
| 859 | } |
| 860 | // If there are multiple operands, create a chain of selects to pick the |
| 861 | // first operand with an active lane and add the number of lanes of the |
| 862 | // preceding operands. |
| 863 | Value *RuntimeVF = getRuntimeVF(B&: Builder, Ty: Builder.getInt64Ty(), VF: State.VF); |
| 864 | unsigned LastOpIdx = getNumOperands() - 1; |
| 865 | Value *Res = nullptr; |
| 866 | for (int Idx = LastOpIdx; Idx >= 0; --Idx) { |
| 867 | Value *TrailingZeros = |
| 868 | State.VF.isScalar() |
| 869 | ? Builder.CreateZExt( |
| 870 | V: Builder.CreateICmpEQ(LHS: State.get(Def: getOperand(N: Idx)), |
| 871 | RHS: Builder.getFalse()), |
| 872 | DestTy: Builder.getInt64Ty()) |
| 873 | : Builder.CreateCountTrailingZeroElems( |
| 874 | ResTy: Builder.getInt64Ty(), Mask: State.get(Def: getOperand(N: Idx)), |
| 875 | /*ZeroIsPoison=*/false, Name); |
| 876 | Value *Current = Builder.CreateAdd( |
| 877 | LHS: Builder.CreateMul(LHS: RuntimeVF, RHS: Builder.getInt64(C: Idx)), RHS: TrailingZeros); |
| 878 | if (Res) { |
| 879 | Value *Cmp = Builder.CreateICmpNE(LHS: TrailingZeros, RHS: RuntimeVF); |
| 880 | Res = Builder.CreateSelect(C: Cmp, True: Current, False: Res); |
| 881 | } else { |
| 882 | Res = Current; |
| 883 | } |
| 884 | } |
| 885 | |
| 886 | return Res; |
| 887 | } |
| 888 | case VPInstruction::ResumeForEpilogue: |
| 889 | return State.get(Def: getOperand(N: 0), IsScalar: true); |
| 890 | case VPInstruction::Reverse: |
| 891 | return Builder.CreateVectorReverse(V: State.get(Def: getOperand(N: 0)), Name: "reverse" ); |
| 892 | case VPInstruction::ExtractLastActive: { |
| 893 | Value *Data = State.get(Def: getOperand(N: 0)); |
| 894 | Value *Mask = State.get(Def: getOperand(N: 1)); |
| 895 | Value *Default = State.get(Def: getOperand(N: 2), /*IsScalar=*/true); |
| 896 | Type *VTy = Data->getType(); |
| 897 | return Builder.CreateIntrinsic( |
| 898 | ID: Intrinsic::experimental_vector_extract_last_active, Types: {VTy}, |
| 899 | Args: {Data, Mask, Default}); |
| 900 | } |
| 901 | default: |
| 902 | llvm_unreachable("Unsupported opcode for instruction" ); |
| 903 | } |
| 904 | } |
| 905 | |
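| | // Query TTI for the cost of \p Opcode applied to this recipe's result type, |
| | // widened to a vector type when \p VF is a vector. |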
| 906 | InstructionCost VPRecipeWithIRFlags::getCostForRecipeWithOpcode( |
| 907 | unsigned Opcode, ElementCount VF, VPCostContext &Ctx) const { |
| 908 | Type *ScalarTy = Ctx.Types.inferScalarType(V: this); |
| 909 | Type *ResultTy = VF.isVector() ? toVectorTy(Scalar: ScalarTy, EC: VF) : ScalarTy; |
| 910 | switch (Opcode) { |
| 911 | case Instruction::FNeg: |
| 912 | return Ctx.TTI.getArithmeticInstrCost(Opcode, Ty: ResultTy, CostKind: Ctx.CostKind); |
| 913 | case Instruction::UDiv: |
| 914 | case Instruction::SDiv: |
| 915 | case Instruction::SRem: |
| 916 | case Instruction::URem: |
| 917 | case Instruction::Add: |
| 918 | case Instruction::FAdd: |
| 919 | case Instruction::Sub: |
| 920 | case Instruction::FSub: |
| 921 | case Instruction::Mul: |
| 922 | case Instruction::FMul: |
| 923 | case Instruction::FDiv: |
| 924 | case Instruction::FRem: |
| 925 | case Instruction::Shl: |
| 926 | case Instruction::LShr: |
| 927 | case Instruction::AShr: |
| 928 | case Instruction::And: |
| 929 | case Instruction::Or: |
| 930 | case Instruction::Xor: { |
| 931 | TargetTransformInfo::OperandValueInfo RHSInfo = { |
| 932 | .Kind: TargetTransformInfo::OK_AnyValue, .Properties: TargetTransformInfo::OP_None}; |
| 933 | |
| 934 | if (VF.isVector()) { |
| 935 | // Certain instructions can be cheaper to vectorize if they have a |
| 936 | // constant second vector operand. One example of this are shifts on x86. |
| 937 | VPValue *RHS = getOperand(N: 1); |
| 938 | RHSInfo = Ctx.getOperandInfo(V: RHS); |
| 939 | |
| 940 | if (RHSInfo.Kind == TargetTransformInfo::OK_AnyValue && |
| 941 | getOperand(N: 1)->isDefinedOutsideLoopRegions()) |
| 942 | RHSInfo.Kind = TargetTransformInfo::OK_UniformValue; |
| 943 | } |
| 944 | |
| 945 | Instruction *CtxI = dyn_cast_or_null<Instruction>(Val: getUnderlyingValue()); |
| 946 | SmallVector<const Value *, 4> Operands; |
| 947 | if (CtxI) |
| 948 | Operands.append(in_start: CtxI->value_op_begin(), in_end: CtxI->value_op_end()); |
| 949 | return Ctx.TTI.getArithmeticInstrCost( |
| 950 | Opcode, Ty: ResultTy, CostKind: Ctx.CostKind, |
| 951 | Opd1Info: {.Kind: TargetTransformInfo::OK_AnyValue, .Properties: TargetTransformInfo::OP_None}, |
| 952 | Opd2Info: RHSInfo, Args: Operands, CxtI: CtxI, TLibInfo: &Ctx.TLI); |
| 953 | } |
| 954 | case Instruction::Freeze: |
| 955 | // This opcode is unknown. Assume that it is the same as 'mul'. |
| 956 | return Ctx.TTI.getArithmeticInstrCost(Opcode: Instruction::Mul, Ty: ResultTy, |
| 957 | CostKind: Ctx.CostKind); |
| 958 | case Instruction::ExtractValue: |
| 959 | return Ctx.TTI.getInsertExtractValueCost(Opcode: Instruction::ExtractValue, |
| 960 | CostKind: Ctx.CostKind); |
| 961 | case Instruction::ICmp: |
| 962 | case Instruction::FCmp: { |
| 963 | Type *ScalarOpTy = Ctx.Types.inferScalarType(V: getOperand(N: 0)); |
| 964 | Type *OpTy = VF.isVector() ? toVectorTy(Scalar: ScalarOpTy, EC: VF) : ScalarOpTy; |
| 965 | Instruction *CtxI = dyn_cast_or_null<Instruction>(Val: getUnderlyingValue()); |
| 966 | return Ctx.TTI.getCmpSelInstrCost( |
| 967 | Opcode, ValTy: OpTy, CondTy: CmpInst::makeCmpResultType(opnd_type: OpTy), VecPred: getPredicate(), |
| 968 | CostKind: Ctx.CostKind, Op1Info: {.Kind: TTI::OK_AnyValue, .Properties: TTI::OP_None}, |
| 969 | Op2Info: {.Kind: TTI::OK_AnyValue, .Properties: TTI::OP_None}, I: CtxI); |
| 970 | } |
| 971 | case Instruction::BitCast: { |
| 972 | Type *ScalarTy = Ctx.Types.inferScalarType(V: this); |
| 973 | if (ScalarTy->isPointerTy()) |
| 974 | return 0; |
| 975 | [[fallthrough]]; |
| 976 | } |
| 977 | case Instruction::SExt: |
| 978 | case Instruction::ZExt: |
| 979 | case Instruction::FPToUI: |
| 980 | case Instruction::FPToSI: |
| 981 | case Instruction::FPExt: |
| 982 | case Instruction::PtrToInt: |
| 983 | case Instruction::PtrToAddr: |
| 984 | case Instruction::IntToPtr: |
| 985 | case Instruction::SIToFP: |
| 986 | case Instruction::UIToFP: |
| 987 | case Instruction::Trunc: |
| 988 | case Instruction::FPTrunc: |
| 989 | case Instruction::AddrSpaceCast: { |
| 990 | // Computes the CastContextHint from a recipe that may access memory. |
| 991 | auto ComputeCCH = [&](const VPRecipeBase *R) -> TTI::CastContextHint { |
| 992 | if (isa<VPInterleaveBase>(Val: R)) |
| 993 | return TTI::CastContextHint::Interleave; |
| 994 | if (const auto *ReplicateRecipe = dyn_cast<VPReplicateRecipe>(Val: R)) { |
| 995 | // Only compute CCH for memory operations, matching the legacy model |
| 996 | // which only considers loads/stores for cast context hints. |
| 997 | auto *UI = cast<Instruction>(Val: ReplicateRecipe->getUnderlyingValue()); |
| 998 | if (!isa<LoadInst, StoreInst>(Val: UI)) |
| 999 | return TTI::CastContextHint::None; |
| 1000 | return ReplicateRecipe->isPredicated() ? TTI::CastContextHint::Masked |
| 1001 | : TTI::CastContextHint::Normal; |
| 1002 | } |
| 1003 | const auto *WidenMemoryRecipe = dyn_cast<VPWidenMemoryRecipe>(Val: R); |
| 1004 | if (WidenMemoryRecipe == nullptr) |
| 1005 | return TTI::CastContextHint::None; |
| 1006 | if (VF.isScalar()) |
| 1007 | return TTI::CastContextHint::Normal; |
| 1008 | if (!WidenMemoryRecipe->isConsecutive()) |
| 1009 | return TTI::CastContextHint::GatherScatter; |
| 1010 | if (WidenMemoryRecipe->isReverse()) |
| 1011 | return TTI::CastContextHint::Reversed; |
| 1012 | if (WidenMemoryRecipe->isMasked()) |
| 1013 | return TTI::CastContextHint::Masked; |
| 1014 | return TTI::CastContextHint::Normal; |
| 1015 | }; |
| 1016 | |
| 1017 | VPValue *Operand = getOperand(N: 0); |
| 1018 | TTI::CastContextHint CCH = TTI::CastContextHint::None; |
| 1019 | // For Trunc/FPTrunc, get the context from the only user. |
| 1020 | if (Opcode == Instruction::Trunc || Opcode == Instruction::FPTrunc) { |
| 1021 | auto GetOnlyUser = [](const VPSingleDefRecipe *R) -> VPRecipeBase * { |
| 1022 | if (R->getNumUsers() == 0 || R->hasMoreThanOneUniqueUser()) |
| 1023 | return nullptr; |
| 1024 | return dyn_cast<VPRecipeBase>(Val: *R->user_begin()); |
| 1025 | }; |
| 1026 | if (VPRecipeBase *Recipe = GetOnlyUser(this)) { |
| 1027 | if (match(V: Recipe, P: m_Reverse(Op0: m_VPValue()))) |
| 1028 | Recipe = GetOnlyUser(cast<VPInstruction>(Val: Recipe)); |
| 1029 | if (Recipe) |
| 1030 | CCH = ComputeCCH(Recipe); |
| 1031 | } |
| 1032 | } |
| 1033 | // For Z/Sext, get the context from the operand. |
| 1034 | else if (Opcode == Instruction::ZExt || Opcode == Instruction::SExt || |
| 1035 | Opcode == Instruction::FPExt) { |
| 1036 | if (auto *Recipe = Operand->getDefiningRecipe()) { |
| 1037 | VPValue *ReverseOp; |
| 1038 | if (match(V: Recipe, P: m_Reverse(Op0: m_VPValue(V&: ReverseOp)))) |
| 1039 | Recipe = ReverseOp->getDefiningRecipe(); |
| 1040 | if (Recipe) |
| 1041 | CCH = ComputeCCH(Recipe); |
| 1042 | } |
| 1043 | } |
| 1044 | |
| 1045 | auto *ScalarSrcTy = Ctx.Types.inferScalarType(V: Operand); |
| 1046 | Type *SrcTy = VF.isVector() ? toVectorTy(Scalar: ScalarSrcTy, EC: VF) : ScalarSrcTy; |
| 1047 | // Arm TTI will use the underlying instruction to determine the cost. |
| 1048 | return Ctx.TTI.getCastInstrCost( |
| 1049 | Opcode, Dst: ResultTy, Src: SrcTy, CCH, CostKind: Ctx.CostKind, |
| 1050 | I: dyn_cast_if_present<Instruction>(Val: getUnderlyingValue())); |
| 1051 | } |
| 1052 | case Instruction::Select: { |
| 1053 | SelectInst *SI = cast_or_null<SelectInst>(Val: getUnderlyingValue()); |
| 1054 | bool IsScalarCond = getOperand(N: 0)->isDefinedOutsideLoopRegions(); |
| 1055 | Type *ScalarTy = Ctx.Types.inferScalarType(V: this); |
| 1056 | |
| 1057 | VPValue *Op0, *Op1; |
| 1058 | bool IsLogicalAnd = |
| 1059 | match(V: this, P: m_LogicalAnd(Op0: m_VPValue(V&: Op0), Op1: m_VPValue(V&: Op1))); |
| 1060 | bool IsLogicalOr = match(V: this, P: m_LogicalOr(Op0: m_VPValue(V&: Op0), Op1: m_VPValue(V&: Op1))); |
| 1061 | // Also match the inverted forms: |
| 1062 | // select x, false, y --> !x & y (still AND) |
| 1063 | // select x, y, true --> !x | y (still OR) |
| 1064 | IsLogicalAnd |= |
| 1065 | match(V: this, P: m_Select(Op0: m_VPValue(V&: Op0), Op1: m_False(), Op2: m_VPValue(V&: Op1))); |
| 1066 | IsLogicalOr |= |
| 1067 | match(V: this, P: m_Select(Op0: m_VPValue(V&: Op0), Op1: m_VPValue(V&: Op1), Op2: m_True())); |
| 1068 | |
| 1069 | if (!IsScalarCond && ScalarTy->getScalarSizeInBits() == 1 && |
| 1070 | (IsLogicalAnd || IsLogicalOr)) { |
| 1071 | // select x, y, false --> x & y |
| 1072 | // select x, true, y --> x | y |
| 1073 | const auto [Op1VK, Op1VP] = Ctx.getOperandInfo(V: Op0); |
| 1074 | const auto [Op2VK, Op2VP] = Ctx.getOperandInfo(V: Op1); |
| 1075 | |
| 1076 | SmallVector<const Value *, 2> Operands; |
| 1077 | if (SI && all_of(Range: operands(), |
| 1078 | P: [](VPValue *Op) { return Op->getUnderlyingValue(); })) |
| 1079 | append_range(C&: Operands, R: SI->operands()); |
| 1080 | return Ctx.TTI.getArithmeticInstrCost( |
| 1081 | Opcode: IsLogicalOr ? Instruction::Or : Instruction::And, Ty: ResultTy, |
| 1082 | CostKind: Ctx.CostKind, Opd1Info: {.Kind: Op1VK, .Properties: Op1VP}, Opd2Info: {.Kind: Op2VK, .Properties: Op2VP}, Args: Operands, CxtI: SI); |
| 1083 | } |
| 1084 | |
| 1085 | Type *CondTy = Ctx.Types.inferScalarType(V: getOperand(N: 0)); |
| 1086 | if (!IsScalarCond) |
| 1087 | CondTy = VectorType::get(ElementType: CondTy, EC: VF); |
| 1088 | |
| 1089 | llvm::CmpPredicate Pred; |
| 1090 | if (!match(V: getOperand(N: 0), P: m_Cmp(Pred, Op0: m_VPValue(), Op1: m_VPValue()))) |
| 1091 | if (auto *CondIRV = dyn_cast<VPIRValue>(Val: getOperand(N: 0))) |
| 1092 | if (auto *Cmp = dyn_cast<CmpInst>(Val: CondIRV->getValue())) |
| 1093 | Pred = Cmp->getPredicate(); |
| 1094 | Type *VectorTy = toVectorTy(Scalar: Ctx.Types.inferScalarType(V: this), EC: VF); |
| 1095 | return Ctx.TTI.getCmpSelInstrCost( |
| 1096 | Opcode: Instruction::Select, ValTy: VectorTy, CondTy, VecPred: Pred, CostKind: Ctx.CostKind, |
| 1097 | Op1Info: {.Kind: TTI::OK_AnyValue, .Properties: TTI::OP_None}, Op2Info: {.Kind: TTI::OK_AnyValue, .Properties: TTI::OP_None}, I: SI); |
| 1098 | } |
| 1099 | } |
| 1100 | llvm_unreachable("called for unsupported opcode" ); |
| 1101 | } |
| 1102 | |
| 1103 | InstructionCost VPInstruction::computeCost(ElementCount VF, |
| 1104 | VPCostContext &Ctx) const { |
| 1105 | if (Instruction::isBinaryOp(Opcode: getOpcode())) { |
| 1106 | if (!getUnderlyingValue() && getOpcode() != Instruction::FMul) { |
| 1107 | // TODO: Compute cost for VPInstructions without underlying values once |
| 1108 | // the legacy cost model has been retired. |
| 1109 | return 0; |
| 1110 | } |
| 1111 | |
| 1112 | assert(!doesGeneratePerAllLanes() && |
| 1113 | "Should only generate a vector value or single scalar, not scalars " |
| 1114 | "for all lanes." ); |
| 1115 | return getCostForRecipeWithOpcode( |
| 1116 | Opcode: getOpcode(), |
| 1117 | VF: vputils::onlyFirstLaneUsed(Def: this) ? ElementCount::getFixed(MinVal: 1) : VF, Ctx); |
| 1118 | } |
| 1119 | |
| 1120 | switch (getOpcode()) { |
| 1121 | case Instruction::Select: { |
| 1122 | llvm::CmpPredicate Pred = CmpInst::BAD_ICMP_PREDICATE; |
| 1123 | match(V: getOperand(N: 0), P: m_Cmp(Pred, Op0: m_VPValue(), Op1: m_VPValue())); |
| 1124 | auto *CondTy = Ctx.Types.inferScalarType(V: getOperand(N: 0)); |
| 1125 | auto *VecTy = Ctx.Types.inferScalarType(V: getOperand(N: 1)); |
| 1126 | if (!vputils::onlyFirstLaneUsed(Def: this)) { |
| 1127 | CondTy = toVectorTy(Scalar: CondTy, EC: VF); |
| 1128 | VecTy = toVectorTy(Scalar: VecTy, EC: VF); |
| 1129 | } |
| 1130 | return Ctx.TTI.getCmpSelInstrCost(Opcode: Instruction::Select, ValTy: VecTy, CondTy, VecPred: Pred, |
| 1131 | CostKind: Ctx.CostKind); |
| 1132 | } |
| 1133 | case Instruction::ExtractElement: |
| 1134 | case VPInstruction::ExtractLane: { |
| 1135 | if (VF.isScalar()) { |
| 1136 | // ExtractLane with VF=1 handles extracting across multiple |
| 1137 | // parts. |
| 1138 | return 0; |
| 1139 | } |
| 1140 | |
| 1141 | // Add on the cost of extracting the element. |
| 1142 | auto *VecTy = toVectorTy(Scalar: Ctx.Types.inferScalarType(V: getOperand(N: 0)), EC: VF); |
| 1143 | return Ctx.TTI.getVectorInstrCost(Opcode: Instruction::ExtractElement, Val: VecTy, |
| 1144 | CostKind: Ctx.CostKind); |
| 1145 | } |
| 1146 | case VPInstruction::AnyOf: { |
| 1147 | auto *VecTy = toVectorTy(Scalar: Ctx.Types.inferScalarType(V: this), EC: VF); |
| 1148 | return Ctx.TTI.getArithmeticReductionCost( |
| 1149 | Opcode: Instruction::Or, Ty: cast<VectorType>(Val: VecTy), FMF: std::nullopt, CostKind: Ctx.CostKind); |
| 1150 | } |
| 1151 | case VPInstruction::FirstActiveLane: { |
| 1152 | Type *ScalarTy = Ctx.Types.inferScalarType(V: getOperand(N: 0)); |
| 1153 | if (VF.isScalar()) |
| 1154 | return Ctx.TTI.getCmpSelInstrCost(Opcode: Instruction::ICmp, ValTy: ScalarTy, |
| 1155 | CondTy: CmpInst::makeCmpResultType(opnd_type: ScalarTy), |
| 1156 | VecPred: CmpInst::ICMP_EQ, CostKind: Ctx.CostKind); |
| 1157 | // Calculate the cost of determining the lane index. |
| 1158 | auto *PredTy = toVectorTy(Scalar: ScalarTy, EC: VF); |
| 1159 | IntrinsicCostAttributes Attrs(Intrinsic::experimental_cttz_elts, |
| 1160 | Type::getInt64Ty(C&: Ctx.LLVMCtx), |
| 1161 | {PredTy, Type::getInt1Ty(C&: Ctx.LLVMCtx)}); |
| 1162 | return Ctx.TTI.getIntrinsicInstrCost(ICA: Attrs, CostKind: Ctx.CostKind); |
| 1163 | } |
| 1164 | case VPInstruction::LastActiveLane: { |
| 1165 | Type *ScalarTy = Ctx.Types.inferScalarType(V: getOperand(N: 0)); |
| 1166 | if (VF.isScalar()) |
| 1167 | return Ctx.TTI.getCmpSelInstrCost(Opcode: Instruction::ICmp, ValTy: ScalarTy, |
| 1168 | CondTy: CmpInst::makeCmpResultType(opnd_type: ScalarTy), |
| 1169 | VecPred: CmpInst::ICMP_EQ, CostKind: Ctx.CostKind); |
| 1170 | // Calculate the cost of determining the lane index: NOT + cttz_elts + SUB. |
| 1171 | auto *PredTy = toVectorTy(Scalar: ScalarTy, EC: VF); |
| 1172 | IntrinsicCostAttributes Attrs(Intrinsic::experimental_cttz_elts, |
| 1173 | Type::getInt64Ty(C&: Ctx.LLVMCtx), |
| 1174 | {PredTy, Type::getInt1Ty(C&: Ctx.LLVMCtx)}); |
| 1175 | InstructionCost Cost = Ctx.TTI.getIntrinsicInstrCost(ICA: Attrs, CostKind: Ctx.CostKind); |
| 1176 | // Add cost of NOT operation on the predicate. |
| 1177 | Cost += Ctx.TTI.getArithmeticInstrCost( |
| 1178 | Opcode: Instruction::Xor, Ty: PredTy, CostKind: Ctx.CostKind, |
| 1179 | Opd1Info: {.Kind: TargetTransformInfo::OK_AnyValue, .Properties: TargetTransformInfo::OP_None}, |
| 1180 | Opd2Info: {.Kind: TargetTransformInfo::OK_UniformConstantValue, |
| 1181 | .Properties: TargetTransformInfo::OP_None}); |
| 1182 | // Add cost of SUB operation on the index. |
| 1183 | Cost += Ctx.TTI.getArithmeticInstrCost( |
| 1184 | Opcode: Instruction::Sub, Ty: Type::getInt64Ty(C&: Ctx.LLVMCtx), CostKind: Ctx.CostKind); |
| 1185 | return Cost; |
| 1186 | } |
| 1187 | case VPInstruction::ExtractLastActive: { |
| 1188 | Type *ScalarTy = Ctx.Types.inferScalarType(V: this); |
| 1189 | Type *VecTy = toVectorTy(Scalar: ScalarTy, EC: VF); |
| 1190 | Type *MaskTy = toVectorTy(Scalar: Type::getInt1Ty(C&: Ctx.LLVMCtx), EC: VF); |
| 1191 | IntrinsicCostAttributes ICA( |
| 1192 | Intrinsic::experimental_vector_extract_last_active, ScalarTy, |
| 1193 | {VecTy, MaskTy, ScalarTy}); |
| 1194 | return Ctx.TTI.getIntrinsicInstrCost(ICA, CostKind: Ctx.CostKind); |
| 1195 | } |
| 1196 | case VPInstruction::FirstOrderRecurrenceSplice: { |
| 1197 | assert(VF.isVector() && "Scalar FirstOrderRecurrenceSplice?" ); |
| 1198 | SmallVector<int> Mask(VF.getKnownMinValue()); |
| 1199 | std::iota(first: Mask.begin(), last: Mask.end(), value: VF.getKnownMinValue() - 1); |
| 1200 | Type *VectorTy = toVectorTy(Scalar: Ctx.Types.inferScalarType(V: this), EC: VF); |
| 1201 | |
| 1202 | return Ctx.TTI.getShuffleCost(Kind: TargetTransformInfo::SK_Splice, |
| 1203 | DstTy: cast<VectorType>(Val: VectorTy), |
| 1204 | SrcTy: cast<VectorType>(Val: VectorTy), Mask, |
| 1205 | CostKind: Ctx.CostKind, Index: VF.getKnownMinValue() - 1); |
| 1206 | } |
| 1207 | case VPInstruction::ActiveLaneMask: { |
| 1208 | Type *ArgTy = Ctx.Types.inferScalarType(V: getOperand(N: 0)); |
| 1209 | unsigned Multiplier = cast<VPConstantInt>(Val: getOperand(N: 2))->getZExtValue(); |
| 1210 | Type *RetTy = toVectorTy(Scalar: Type::getInt1Ty(C&: Ctx.LLVMCtx), EC: VF * Multiplier); |
| 1211 | IntrinsicCostAttributes Attrs(Intrinsic::get_active_lane_mask, RetTy, |
| 1212 | {ArgTy, ArgTy}); |
| 1213 | return Ctx.TTI.getIntrinsicInstrCost(ICA: Attrs, CostKind: Ctx.CostKind); |
| 1214 | } |
| 1215 | case VPInstruction::ExplicitVectorLength: { |
| 1216 | Type *Arg0Ty = Ctx.Types.inferScalarType(V: getOperand(N: 0)); |
| 1217 | Type *I32Ty = Type::getInt32Ty(C&: Ctx.LLVMCtx); |
| 1218 | Type *I1Ty = Type::getInt1Ty(C&: Ctx.LLVMCtx); |
| 1219 | IntrinsicCostAttributes Attrs(Intrinsic::experimental_get_vector_length, |
| 1220 | I32Ty, {Arg0Ty, I32Ty, I1Ty}); |
| 1221 | return Ctx.TTI.getIntrinsicInstrCost(ICA: Attrs, CostKind: Ctx.CostKind); |
| 1222 | } |
| 1223 | case VPInstruction::Reverse: { |
| 1224 | assert(VF.isVector() && "Reverse operation must be vector type" ); |
| 1225 | auto *VectorTy = cast<VectorType>( |
| 1226 | Val: toVectorTy(Scalar: Ctx.Types.inferScalarType(V: getOperand(N: 0)), EC: VF)); |
| 1227 | return Ctx.TTI.getShuffleCost(Kind: TargetTransformInfo::SK_Reverse, DstTy: VectorTy, |
| 1228 | SrcTy: VectorTy, /*Mask=*/{}, CostKind: Ctx.CostKind, |
| 1229 | /*Index=*/0); |
| 1230 | } |
| 1231 | case VPInstruction::ExtractLastLane: { |
| 1232 | // Add on the cost of extracting the element. |
| 1233 | auto *VecTy = toVectorTy(Scalar: Ctx.Types.inferScalarType(V: getOperand(N: 0)), EC: VF); |
| 1234 | return Ctx.TTI.getIndexedVectorInstrCostFromEnd(Opcode: Instruction::ExtractElement, |
| 1235 | Val: VecTy, CostKind: Ctx.CostKind, Index: 0); |
| 1236 | } |
| 1237 | case VPInstruction::ExtractPenultimateElement: |
| 1238 | if (VF == ElementCount::getScalable(MinVal: 1)) |
| 1239 | return InstructionCost::getInvalid(); |
| 1240 | [[fallthrough]]; |
| 1241 | default: |
// TODO: Compute cost for other VPInstructions once the legacy cost model has
// been retired.
assert(!getUnderlyingValue() &&
"unexpected VPInstruction with underlying value");
| 1246 | return 0; |
| 1247 | } |
| 1248 | } |
| 1249 | |
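// Vector-to-scalar opcodes take vector operands but define a single scalar
// value, e.g. lane extracts and reduction-result computations.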
| 1250 | bool VPInstruction::isVectorToScalar() const { |
| 1251 | return getOpcode() == VPInstruction::ExtractLastLane || |
| 1252 | getOpcode() == VPInstruction::ExtractPenultimateElement || |
| 1253 | getOpcode() == Instruction::ExtractElement || |
| 1254 | getOpcode() == VPInstruction::ExtractLane || |
| 1255 | getOpcode() == VPInstruction::FirstActiveLane || |
| 1256 | getOpcode() == VPInstruction::LastActiveLane || |
| 1257 | getOpcode() == VPInstruction::ComputeAnyOfResult || |
| 1258 | getOpcode() == VPInstruction::ExtractLastActive || |
| 1259 | getOpcode() == VPInstruction::ComputeReductionResult || |
| 1260 | getOpcode() == VPInstruction::AnyOf; |
| 1261 | } |
| 1262 | |
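// Opcodes that always define a single scalar, regardless of VF; scalar casts
// are covered by isScalarCast().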
| 1263 | bool VPInstruction::isSingleScalar() const { |
| 1264 | switch (getOpcode()) { |
| 1265 | case Instruction::PHI: |
| 1266 | case VPInstruction::ExplicitVectorLength: |
| 1267 | case VPInstruction::ResumeForEpilogue: |
| 1268 | case VPInstruction::VScale: |
| 1269 | return true; |
| 1270 | default: |
| 1271 | return isScalarCast(); |
| 1272 | } |
| 1273 | } |
| 1274 | |
| 1275 | void VPInstruction::execute(VPTransformState &State) { |
assert(!State.Lane && "VPInstruction executing a Lane");
| 1277 | IRBuilderBase::FastMathFlagGuard FMFGuard(State.Builder); |
| 1278 | assert(flagsValidForOpcode(getOpcode()) && |
| 1279 | "Set flags not supported for the provided opcode" ); |
| 1280 | assert(hasRequiredFlagsForOpcode(getOpcode()) && |
| 1281 | "Opcode requires specific flags to be set" ); |
| 1282 | if (hasFastMathFlags()) |
| 1283 | State.Builder.setFastMathFlags(getFastMathFlags()); |
| 1284 | Value *GeneratedValue = generate(State); |
| 1285 | if (!hasResult()) |
| 1286 | return; |
| 1287 | assert(GeneratedValue && "generate must produce a value" ); |
| 1288 | bool GeneratesPerFirstLaneOnly = canGenerateScalarForFirstLane() && |
| 1289 | (vputils::onlyFirstLaneUsed(Def: this) || |
| 1290 | isVectorToScalar() || isSingleScalar()); |
| 1291 | assert((((GeneratedValue->getType()->isVectorTy() || |
| 1292 | GeneratedValue->getType()->isStructTy()) == |
| 1293 | !GeneratesPerFirstLaneOnly) || |
| 1294 | State.VF.isScalar()) && |
| 1295 | "scalar value but not only first lane defined" ); |
| 1296 | State.set(Def: this, V: GeneratedValue, |
| 1297 | /*IsScalar*/ GeneratesPerFirstLaneOnly); |
| 1298 | } |
| 1299 | |
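// Conservatively assume unknown opcodes may access memory; binary ops, casts
// and the opcodes listed below are known not to read or write memory.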
| 1300 | bool VPInstruction::opcodeMayReadOrWriteFromMemory() const { |
| 1301 | if (Instruction::isBinaryOp(Opcode: getOpcode()) || Instruction::isCast(Opcode: getOpcode())) |
| 1302 | return false; |
| 1303 | switch (getOpcode()) { |
| 1304 | case Instruction::GetElementPtr: |
| 1305 | case Instruction::ExtractElement: |
| 1306 | case Instruction::Freeze: |
| 1307 | case Instruction::FCmp: |
| 1308 | case Instruction::ICmp: |
| 1309 | case Instruction::Select: |
| 1310 | case Instruction::PHI: |
| 1311 | case VPInstruction::AnyOf: |
| 1312 | case VPInstruction::BranchOnCond: |
| 1313 | case VPInstruction::BranchOnTwoConds: |
| 1314 | case VPInstruction::BranchOnCount: |
| 1315 | case VPInstruction::Broadcast: |
| 1316 | case VPInstruction::BuildStructVector: |
| 1317 | case VPInstruction::BuildVector: |
| 1318 | case VPInstruction::CalculateTripCountMinusVF: |
| 1319 | case VPInstruction::CanonicalIVIncrementForPart: |
| 1320 | case VPInstruction::ExtractLane: |
| 1321 | case VPInstruction::ExtractLastLane: |
| 1322 | case VPInstruction::ExtractLastPart: |
| 1323 | case VPInstruction::ExtractPenultimateElement: |
| 1324 | case VPInstruction::ActiveLaneMask: |
| 1325 | case VPInstruction::ExplicitVectorLength: |
| 1326 | case VPInstruction::FirstActiveLane: |
| 1327 | case VPInstruction::LastActiveLane: |
| 1328 | case VPInstruction::ExtractLastActive: |
| 1329 | case VPInstruction::FirstOrderRecurrenceSplice: |
| 1330 | case VPInstruction::LogicalAnd: |
| 1331 | case VPInstruction::Not: |
| 1332 | case VPInstruction::PtrAdd: |
| 1333 | case VPInstruction::WideIVStep: |
| 1334 | case VPInstruction::WidePtrAdd: |
| 1335 | case VPInstruction::StepVector: |
| 1336 | case VPInstruction::ReductionStartVector: |
| 1337 | case VPInstruction::Reverse: |
| 1338 | case VPInstruction::VScale: |
| 1339 | case VPInstruction::Unpack: |
| 1340 | return false; |
| 1341 | default: |
| 1342 | return true; |
| 1343 | } |
| 1344 | } |
| 1345 | |
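// Returns true if only the first lane of operand Op is used, either because
// the opcode inherently uses a single lane of that operand or because only
// the first lane of this recipe's result is demanded.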
| 1346 | bool VPInstruction::usesFirstLaneOnly(const VPValue *Op) const { |
| 1347 | assert(is_contained(operands(), Op) && "Op must be an operand of the recipe" ); |
| 1348 | if (Instruction::isBinaryOp(Opcode: getOpcode()) || Instruction::isCast(Opcode: getOpcode())) |
| 1349 | return vputils::onlyFirstLaneUsed(Def: this); |
| 1350 | |
| 1351 | switch (getOpcode()) { |
| 1352 | default: |
| 1353 | return false; |
| 1354 | case Instruction::ExtractElement: |
| 1355 | return Op == getOperand(N: 1); |
| 1356 | case Instruction::PHI: |
| 1357 | return true; |
| 1358 | case Instruction::FCmp: |
| 1359 | case Instruction::ICmp: |
| 1360 | case Instruction::Select: |
| 1361 | case Instruction::Or: |
| 1362 | case Instruction::Freeze: |
| 1363 | case VPInstruction::Not: |
| 1364 | // TODO: Cover additional opcodes. |
| 1365 | return vputils::onlyFirstLaneUsed(Def: this); |
| 1366 | case VPInstruction::ActiveLaneMask: |
| 1367 | case VPInstruction::ExplicitVectorLength: |
| 1368 | case VPInstruction::CalculateTripCountMinusVF: |
| 1369 | case VPInstruction::CanonicalIVIncrementForPart: |
| 1370 | case VPInstruction::BranchOnCount: |
| 1371 | case VPInstruction::BranchOnCond: |
| 1372 | case VPInstruction::Broadcast: |
| 1373 | case VPInstruction::ReductionStartVector: |
| 1374 | return true; |
| 1375 | case VPInstruction::BuildStructVector: |
| 1376 | case VPInstruction::BuildVector: |
// Before replicating by VF, Build(Struct)Vector has a single operand and uses
// all of its lanes; after replicating, it has one operand per lane and only
// the first lane of each operand is used.
| 1380 | return getNumOperands() > 1; |
| 1381 | case VPInstruction::PtrAdd: |
| 1382 | return Op == getOperand(N: 0) || vputils::onlyFirstLaneUsed(Def: this); |
| 1383 | case VPInstruction::WidePtrAdd: |
| 1384 | // WidePtrAdd supports scalar and vector base addresses. |
| 1385 | return false; |
| 1386 | case VPInstruction::ComputeAnyOfResult: |
| 1387 | return Op == getOperand(N: 0) || Op == getOperand(N: 1); |
| 1388 | case VPInstruction::ExtractLane: |
| 1389 | return Op == getOperand(N: 0); |
}
| 1391 | llvm_unreachable("switch should return" ); |
| 1392 | } |
| 1393 | |
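// Analogous to usesFirstLaneOnly, but at the granularity of unrolled parts
// rather than vector lanes.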
| 1394 | bool VPInstruction::usesFirstPartOnly(const VPValue *Op) const { |
| 1395 | assert(is_contained(operands(), Op) && "Op must be an operand of the recipe" ); |
| 1396 | if (Instruction::isBinaryOp(Opcode: getOpcode())) |
| 1397 | return vputils::onlyFirstPartUsed(Def: this); |
| 1398 | |
| 1399 | switch (getOpcode()) { |
| 1400 | default: |
| 1401 | return false; |
| 1402 | case Instruction::FCmp: |
| 1403 | case Instruction::ICmp: |
| 1404 | case Instruction::Select: |
| 1405 | return vputils::onlyFirstPartUsed(Def: this); |
| 1406 | case VPInstruction::BranchOnCount: |
| 1407 | case VPInstruction::BranchOnCond: |
| 1408 | case VPInstruction::BranchOnTwoConds: |
| 1409 | case VPInstruction::CanonicalIVIncrementForPart: |
| 1410 | return true; |
}
| 1412 | llvm_unreachable("switch should return" ); |
| 1413 | } |
| 1414 | |
| 1415 | #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) |
| 1416 | void VPInstruction::dump() const { |
| 1417 | VPSlotTracker SlotTracker(getParent()->getPlan()); |
| 1418 | printRecipe(dbgs(), "" , SlotTracker); |
| 1419 | } |
| 1420 | |
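// Prints the recipe in VPlan-dump form, e.g. (illustrative):
//   EMIT vp<%3> = not vp<%2>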
| 1421 | void VPInstruction::printRecipe(raw_ostream &O, const Twine &Indent, |
| 1422 | VPSlotTracker &SlotTracker) const { |
| 1423 | O << Indent << "EMIT" << (isSingleScalar() ? "-SCALAR" : "" ) << " " ; |
| 1424 | |
| 1425 | if (hasResult()) { |
| 1426 | printAsOperand(O, SlotTracker); |
| 1427 | O << " = " ; |
| 1428 | } |
| 1429 | |
| 1430 | switch (getOpcode()) { |
| 1431 | case VPInstruction::Not: |
| 1432 | O << "not" ; |
| 1433 | break; |
| 1434 | case VPInstruction::SLPLoad: |
| 1435 | O << "combined load" ; |
| 1436 | break; |
| 1437 | case VPInstruction::SLPStore: |
| 1438 | O << "combined store" ; |
| 1439 | break; |
| 1440 | case VPInstruction::ActiveLaneMask: |
| 1441 | O << "active lane mask" ; |
| 1442 | break; |
| 1443 | case VPInstruction::ExplicitVectorLength: |
| 1444 | O << "EXPLICIT-VECTOR-LENGTH" ; |
| 1445 | break; |
| 1446 | case VPInstruction::FirstOrderRecurrenceSplice: |
| 1447 | O << "first-order splice" ; |
| 1448 | break; |
| 1449 | case VPInstruction::BranchOnCond: |
| 1450 | O << "branch-on-cond" ; |
| 1451 | break; |
| 1452 | case VPInstruction::BranchOnTwoConds: |
| 1453 | O << "branch-on-two-conds" ; |
| 1454 | break; |
| 1455 | case VPInstruction::CalculateTripCountMinusVF: |
| 1456 | O << "TC > VF ? TC - VF : 0" ; |
| 1457 | break; |
| 1458 | case VPInstruction::CanonicalIVIncrementForPart: |
| 1459 | O << "VF * Part +" ; |
| 1460 | break; |
| 1461 | case VPInstruction::BranchOnCount: |
| 1462 | O << "branch-on-count" ; |
| 1463 | break; |
| 1464 | case VPInstruction::Broadcast: |
| 1465 | O << "broadcast" ; |
| 1466 | break; |
| 1467 | case VPInstruction::BuildStructVector: |
| 1468 | O << "buildstructvector" ; |
| 1469 | break; |
| 1470 | case VPInstruction::BuildVector: |
| 1471 | O << "buildvector" ; |
| 1472 | break; |
| 1473 | case VPInstruction::ExtractLane: |
| 1474 | O << "extract-lane" ; |
| 1475 | break; |
| 1476 | case VPInstruction::ExtractLastLane: |
| 1477 | O << "extract-last-lane" ; |
| 1478 | break; |
| 1479 | case VPInstruction::ExtractLastPart: |
| 1480 | O << "extract-last-part" ; |
| 1481 | break; |
| 1482 | case VPInstruction::ExtractPenultimateElement: |
| 1483 | O << "extract-penultimate-element" ; |
| 1484 | break; |
| 1485 | case VPInstruction::ComputeAnyOfResult: |
| 1486 | O << "compute-anyof-result" ; |
| 1487 | break; |
| 1488 | case VPInstruction::ComputeReductionResult: |
| 1489 | O << "compute-reduction-result" ; |
| 1490 | break; |
| 1491 | case VPInstruction::LogicalAnd: |
| 1492 | O << "logical-and" ; |
| 1493 | break; |
| 1494 | case VPInstruction::PtrAdd: |
| 1495 | O << "ptradd" ; |
| 1496 | break; |
| 1497 | case VPInstruction::WidePtrAdd: |
| 1498 | O << "wide-ptradd" ; |
| 1499 | break; |
| 1500 | case VPInstruction::AnyOf: |
| 1501 | O << "any-of" ; |
| 1502 | break; |
| 1503 | case VPInstruction::FirstActiveLane: |
| 1504 | O << "first-active-lane" ; |
| 1505 | break; |
| 1506 | case VPInstruction::LastActiveLane: |
| 1507 | O << "last-active-lane" ; |
| 1508 | break; |
| 1509 | case VPInstruction::ReductionStartVector: |
| 1510 | O << "reduction-start-vector" ; |
| 1511 | break; |
| 1512 | case VPInstruction::ResumeForEpilogue: |
| 1513 | O << "resume-for-epilogue" ; |
| 1514 | break; |
| 1515 | case VPInstruction::Reverse: |
| 1516 | O << "reverse" ; |
| 1517 | break; |
| 1518 | case VPInstruction::Unpack: |
| 1519 | O << "unpack" ; |
| 1520 | break; |
| 1521 | case VPInstruction::ExtractLastActive: |
| 1522 | O << "extract-last-active" ; |
| 1523 | break; |
| 1524 | default: |
| 1525 | O << Instruction::getOpcodeName(getOpcode()); |
| 1526 | } |
| 1527 | |
| 1528 | printFlags(O); |
| 1529 | printOperands(O, SlotTracker); |
| 1530 | } |
| 1531 | #endif |
| 1532 | |
| 1533 | void VPInstructionWithType::execute(VPTransformState &State) { |
| 1534 | State.setDebugLocFrom(getDebugLoc()); |
| 1535 | if (isScalarCast()) { |
| 1536 | Value *Op = State.get(Def: getOperand(N: 0), Lane: VPLane(0)); |
| 1537 | Value *Cast = State.Builder.CreateCast(Op: Instruction::CastOps(getOpcode()), |
| 1538 | V: Op, DestTy: ResultTy); |
| 1539 | State.set(Def: this, V: Cast, Lane: VPLane(0)); |
| 1540 | return; |
| 1541 | } |
| 1542 | switch (getOpcode()) { |
| 1543 | case VPInstruction::StepVector: { |
| 1544 | Value *StepVector = |
| 1545 | State.Builder.CreateStepVector(DstType: VectorType::get(ElementType: ResultTy, EC: State.VF)); |
| 1546 | State.set(Def: this, V: StepVector); |
| 1547 | break; |
| 1548 | } |
| 1549 | case VPInstruction::VScale: { |
| 1550 | Value *VScale = State.Builder.CreateVScale(Ty: ResultTy); |
| 1551 | State.set(Def: this, V: VScale, IsScalar: true); |
| 1552 | break; |
| 1553 | } |
| 1554 | |
| 1555 | default: |
| 1556 | llvm_unreachable("opcode not implemented yet" ); |
| 1557 | } |
| 1558 | } |
| 1559 | |
| 1560 | #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) |
| 1561 | void VPInstructionWithType::printRecipe(raw_ostream &O, const Twine &Indent, |
| 1562 | VPSlotTracker &SlotTracker) const { |
| 1563 | O << Indent << "EMIT" << (isSingleScalar() ? "-SCALAR" : "" ) << " " ; |
| 1564 | printAsOperand(O, SlotTracker); |
| 1565 | O << " = " ; |
| 1566 | |
| 1567 | switch (getOpcode()) { |
| 1568 | case VPInstruction::WideIVStep: |
| 1569 | O << "wide-iv-step " ; |
| 1570 | printOperands(O, SlotTracker); |
| 1571 | break; |
| 1572 | case VPInstruction::StepVector: |
| 1573 | O << "step-vector " << *ResultTy; |
| 1574 | break; |
| 1575 | case VPInstruction::VScale: |
| 1576 | O << "vscale " << *ResultTy; |
| 1577 | break; |
| 1578 | default: |
| 1579 | assert(Instruction::isCast(getOpcode()) && "unhandled opcode" ); |
| 1580 | O << Instruction::getOpcodeName(getOpcode()) << " " ; |
| 1581 | printOperands(O, SlotTracker); |
| 1582 | O << " to " << *ResultTy; |
| 1583 | } |
| 1584 | } |
| 1585 | #endif |
| 1586 | |
| 1587 | void VPPhi::execute(VPTransformState &State) { |
| 1588 | State.setDebugLocFrom(getDebugLoc()); |
| 1589 | PHINode *NewPhi = State.Builder.CreatePHI( |
| 1590 | Ty: State.TypeAnalysis.inferScalarType(V: this), NumReservedValues: 2, Name: getName()); |
| 1591 | unsigned NumIncoming = getNumIncoming(); |
| 1592 | if (getParent() != getParent()->getPlan()->getScalarPreheader()) { |
| 1593 | // TODO: Fixup all incoming values of header phis once recipes defining them |
| 1594 | // are introduced. |
| 1595 | NumIncoming = 1; |
| 1596 | } |
| 1597 | for (unsigned Idx = 0; Idx != NumIncoming; ++Idx) { |
| 1598 | Value *IncV = State.get(Def: getIncomingValue(Idx), Lane: VPLane(0)); |
| 1599 | BasicBlock *PredBB = State.CFG.VPBB2IRBB.at(Val: getIncomingBlock(Idx)); |
| 1600 | NewPhi->addIncoming(V: IncV, BB: PredBB); |
| 1601 | } |
| 1602 | State.set(Def: this, V: NewPhi, Lane: VPLane(0)); |
| 1603 | } |
| 1604 | |
| 1605 | #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) |
| 1606 | void VPPhi::printRecipe(raw_ostream &O, const Twine &Indent, |
| 1607 | VPSlotTracker &SlotTracker) const { |
| 1608 | O << Indent << "EMIT" << (isSingleScalar() ? "-SCALAR" : "" ) << " " ; |
| 1609 | printAsOperand(O, SlotTracker); |
| 1610 | O << " = phi " ; |
| 1611 | printPhiOperands(O, SlotTracker); |
| 1612 | } |
| 1613 | #endif |
| 1614 | |
VPIRInstruction *VPIRInstruction::create(Instruction &I) {
| 1616 | if (auto *Phi = dyn_cast<PHINode>(Val: &I)) |
| 1617 | return new VPIRPhi(*Phi); |
| 1618 | return new VPIRInstruction(I); |
| 1619 | } |
| 1620 | |
| 1621 | void VPIRInstruction::execute(VPTransformState &State) { |
| 1622 | assert(!isa<VPIRPhi>(this) && getNumOperands() == 0 && |
| 1623 | "PHINodes must be handled by VPIRPhi" ); |
| 1624 | // Advance the insert point after the wrapped IR instruction. This allows |
| 1625 | // interleaving VPIRInstructions and other recipes. |
| 1626 | State.Builder.SetInsertPoint(TheBB: I.getParent(), IP: std::next(x: I.getIterator())); |
| 1627 | } |
| 1628 | |
| 1629 | InstructionCost VPIRInstruction::computeCost(ElementCount VF, |
| 1630 | VPCostContext &Ctx) const { |
| 1631 | // The recipe wraps an existing IR instruction on the border of VPlan's scope, |
| 1632 | // hence it does not contribute to the cost-modeling for the VPlan. |
| 1633 | return 0; |
| 1634 | } |
| 1635 | |
| 1636 | void VPIRInstruction::extractLastLaneOfLastPartOfFirstOperand( |
| 1637 | VPBuilder &Builder) { |
| 1638 | assert(isa<PHINode>(getInstruction()) && |
| 1639 | "can only update exiting operands to phi nodes" ); |
| 1640 | assert(getNumOperands() > 0 && "must have at least one operand" ); |
| 1641 | VPValue *Exiting = getOperand(N: 0); |
| 1642 | if (isa<VPIRValue>(Val: Exiting)) |
| 1643 | return; |
| 1644 | |
| 1645 | Exiting = Builder.createNaryOp(Opcode: VPInstruction::ExtractLastPart, Operands: Exiting); |
| 1646 | Exiting = Builder.createNaryOp(Opcode: VPInstruction::ExtractLastLane, Operands: Exiting); |
| 1647 | setOperand(I: 0, New: Exiting); |
| 1648 | } |
| 1649 | |
| 1650 | #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) |
| 1651 | void VPIRInstruction::printRecipe(raw_ostream &O, const Twine &Indent, |
| 1652 | VPSlotTracker &SlotTracker) const { |
| 1653 | O << Indent << "IR " << I; |
| 1654 | } |
| 1655 | #endif |
| 1656 | |
| 1657 | void VPIRPhi::execute(VPTransformState &State) { |
| 1658 | PHINode *Phi = &getIRPhi(); |
| 1659 | for (const auto &[Idx, Op] : enumerate(First: operands())) { |
| 1660 | VPValue *ExitValue = Op; |
| 1661 | auto Lane = vputils::isSingleScalar(VPV: ExitValue) |
| 1662 | ? VPLane::getFirstLane() |
| 1663 | : VPLane::getLastLaneForVF(VF: State.VF); |
| 1664 | VPBlockBase *Pred = getParent()->getPredecessors()[Idx]; |
| 1665 | auto *PredVPBB = Pred->getExitingBasicBlock(); |
| 1666 | BasicBlock *PredBB = State.CFG.VPBB2IRBB[PredVPBB]; |
| 1667 | // Set insertion point in PredBB in case an extract needs to be generated. |
| 1668 | // TODO: Model extracts explicitly. |
| 1669 | State.Builder.SetInsertPoint(TheBB: PredBB, IP: PredBB->getFirstNonPHIIt()); |
| 1670 | Value *V = State.get(Def: ExitValue, Lane: VPLane(Lane)); |
| 1671 | // If there is no existing block for PredBB in the phi, add a new incoming |
| 1672 | // value. Otherwise update the existing incoming value for PredBB. |
| 1673 | if (Phi->getBasicBlockIndex(BB: PredBB) == -1) |
| 1674 | Phi->addIncoming(V, BB: PredBB); |
| 1675 | else |
| 1676 | Phi->setIncomingValueForBlock(BB: PredBB, V); |
| 1677 | } |
| 1678 | |
| 1679 | // Advance the insert point after the wrapped IR instruction. This allows |
| 1680 | // interleaving VPIRInstructions and other recipes. |
| 1681 | State.Builder.SetInsertPoint(TheBB: Phi->getParent(), IP: std::next(x: Phi->getIterator())); |
| 1682 | } |
| 1683 | |
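// Remove the operand (incoming value) corresponding to IncomingBlock, using
// the block's predecessor index.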
| 1684 | void VPPhiAccessors::removeIncomingValueFor(VPBlockBase *IncomingBlock) const { |
| 1685 | VPRecipeBase *R = const_cast<VPRecipeBase *>(getAsRecipe()); |
| 1686 | assert(R->getNumOperands() == R->getParent()->getNumPredecessors() && |
| 1687 | "Number of phi operands must match number of predecessors" ); |
| 1688 | unsigned Position = R->getParent()->getIndexForPredecessor(Pred: IncomingBlock); |
| 1689 | R->removeOperand(Idx: Position); |
| 1690 | } |
| 1691 | |
| 1692 | #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) |
| 1693 | void VPPhiAccessors::printPhiOperands(raw_ostream &O, |
| 1694 | VPSlotTracker &SlotTracker) const { |
| 1695 | interleaveComma(enumerate(getAsRecipe()->operands()), O, |
| 1696 | [this, &O, &SlotTracker](auto Op) { |
| 1697 | O << "[ " ; |
| 1698 | Op.value()->printAsOperand(O, SlotTracker); |
| 1699 | O << ", " ; |
| 1700 | getIncomingBlock(Op.index())->printAsOperand(O); |
| 1701 | O << " ]" ; |
| 1702 | }); |
| 1703 | } |
| 1704 | #endif |
| 1705 | |
| 1706 | #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) |
| 1707 | void VPIRPhi::printRecipe(raw_ostream &O, const Twine &Indent, |
| 1708 | VPSlotTracker &SlotTracker) const { |
| 1709 | VPIRInstruction::printRecipe(O, Indent, SlotTracker); |
| 1710 | |
| 1711 | if (getNumOperands() != 0) { |
| 1712 | O << " (extra operand" << (getNumOperands() > 1 ? "s" : "" ) << ": " ; |
| 1713 | interleaveComma(incoming_values_and_blocks(), O, |
| 1714 | [&O, &SlotTracker](auto Op) { |
| 1715 | std::get<0>(Op)->printAsOperand(O, SlotTracker); |
| 1716 | O << " from " ; |
| 1717 | std::get<1>(Op)->printAsOperand(O); |
| 1718 | }); |
| 1719 | O << ")" ; |
| 1720 | } |
| 1721 | } |
| 1722 | #endif |
| 1723 | |
| 1724 | void VPIRMetadata::applyMetadata(Instruction &I) const { |
| 1725 | for (const auto &[Kind, Node] : Metadata) |
| 1726 | I.setMetadata(KindID: Kind, Node); |
| 1727 | } |
| 1728 | |
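// Keep only the metadata (kind, node) pairs present in both this recipe and
// Other.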
| 1729 | void VPIRMetadata::intersect(const VPIRMetadata &Other) { |
| 1730 | SmallVector<std::pair<unsigned, MDNode *>> MetadataIntersection; |
| 1731 | for (const auto &[KindA, MDA] : Metadata) { |
| 1732 | for (const auto &[KindB, MDB] : Other.Metadata) { |
| 1733 | if (KindA == KindB && MDA == MDB) { |
| 1734 | MetadataIntersection.emplace_back(Args: KindA, Args: MDA); |
| 1735 | break; |
| 1736 | } |
| 1737 | } |
| 1738 | } |
| 1739 | Metadata = std::move(MetadataIntersection); |
| 1740 | } |
| 1741 | |
| 1742 | #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) |
| 1743 | void VPIRMetadata::print(raw_ostream &O, VPSlotTracker &SlotTracker) const { |
| 1744 | const Module *M = SlotTracker.getModule(); |
| 1745 | if (Metadata.empty() || !M) |
| 1746 | return; |
| 1747 | |
| 1748 | ArrayRef<StringRef> MDNames = SlotTracker.getMDNames(); |
| 1749 | O << " (" ; |
| 1750 | interleaveComma(Metadata, O, [&](const auto &KindNodePair) { |
| 1751 | auto [Kind, Node] = KindNodePair; |
| 1752 | assert(Kind < MDNames.size() && !MDNames[Kind].empty() && |
| 1753 | "Unexpected unnamed metadata kind" ); |
| 1754 | O << "!" << MDNames[Kind] << " " ; |
| 1755 | Node->printAsOperand(O, M); |
| 1756 | }); |
| 1757 | O << ")" ; |
| 1758 | } |
| 1759 | #endif |
| 1760 | |
| 1761 | void VPWidenCallRecipe::execute(VPTransformState &State) { |
| 1762 | assert(State.VF.isVector() && "not widening" ); |
| 1763 | assert(Variant != nullptr && "Can't create vector function." ); |
| 1764 | |
| 1765 | FunctionType *VFTy = Variant->getFunctionType(); |
// Collect the arguments for the widened call.
| 1767 | SmallVector<Value *, 4> Args; |
| 1768 | for (const auto &I : enumerate(First: args())) { |
| 1769 | Value *Arg; |
| 1770 | // Some vectorized function variants may also take a scalar argument, |
| 1771 | // e.g. linear parameters for pointers. This needs to be the scalar value |
| 1772 | // from the start of the respective part when interleaving. |
| 1773 | if (!VFTy->getParamType(i: I.index())->isVectorTy()) |
| 1774 | Arg = State.get(Def: I.value(), Lane: VPLane(0)); |
| 1775 | else |
| 1776 | Arg = State.get(Def: I.value(), IsScalar: usesFirstLaneOnly(Op: I.value())); |
| 1777 | Args.push_back(Elt: Arg); |
| 1778 | } |
| 1779 | |
| 1780 | auto *CI = cast_or_null<CallInst>(Val: getUnderlyingValue()); |
| 1781 | SmallVector<OperandBundleDef, 1> OpBundles; |
| 1782 | if (CI) |
| 1783 | CI->getOperandBundlesAsDefs(Defs&: OpBundles); |
| 1784 | |
| 1785 | CallInst *V = State.Builder.CreateCall(Callee: Variant, Args, OpBundles); |
| 1786 | applyFlags(I&: *V); |
| 1787 | applyMetadata(I&: *V); |
| 1788 | V->setCallingConv(Variant->getCallingConv()); |
| 1789 | |
| 1790 | if (!V->getType()->isVoidTy()) |
| 1791 | State.set(Def: this, V); |
| 1792 | } |
| 1793 | |
| 1794 | InstructionCost VPWidenCallRecipe::computeCost(ElementCount VF, |
| 1795 | VPCostContext &Ctx) const { |
| 1796 | return Ctx.TTI.getCallInstrCost(F: nullptr, RetTy: Variant->getReturnType(), |
| 1797 | Tys: Variant->getFunctionType()->params(), |
| 1798 | CostKind: Ctx.CostKind); |
| 1799 | } |
| 1800 | |
| 1801 | #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) |
| 1802 | void VPWidenCallRecipe::printRecipe(raw_ostream &O, const Twine &Indent, |
| 1803 | VPSlotTracker &SlotTracker) const { |
| 1804 | O << Indent << "WIDEN-CALL " ; |
| 1805 | |
| 1806 | Function *CalledFn = getCalledScalarFunction(); |
| 1807 | if (CalledFn->getReturnType()->isVoidTy()) |
| 1808 | O << "void " ; |
| 1809 | else { |
| 1810 | printAsOperand(O, SlotTracker); |
| 1811 | O << " = " ; |
| 1812 | } |
| 1813 | |
| 1814 | O << "call" ; |
| 1815 | printFlags(O); |
| 1816 | O << " @" << CalledFn->getName() << "(" ; |
| 1817 | interleaveComma(args(), O, [&O, &SlotTracker](VPValue *Op) { |
| 1818 | Op->printAsOperand(O, SlotTracker); |
| 1819 | }); |
| 1820 | O << ")" ; |
| 1821 | |
| 1822 | O << " (using library function" ; |
| 1823 | if (Variant->hasName()) |
| 1824 | O << ": " << Variant->getName(); |
| 1825 | O << ")" ; |
| 1826 | } |
| 1827 | #endif |
| 1828 | |
| 1829 | void VPWidenIntrinsicRecipe::execute(VPTransformState &State) { |
| 1830 | assert(State.VF.isVector() && "not widening" ); |
| 1831 | |
| 1832 | SmallVector<Type *, 2> TysForDecl; |
| 1833 | // Add return type if intrinsic is overloaded on it. |
| 1834 | if (isVectorIntrinsicWithOverloadTypeAtArg(ID: VectorIntrinsicID, OpdIdx: -1, |
| 1835 | TTI: State.TTI)) { |
| 1836 | Type *RetTy = toVectorizedTy(Ty: getResultType(), EC: State.VF); |
| 1837 | ArrayRef<Type *> ContainedTys = getContainedTypes(Ty: RetTy); |
| 1838 | for (auto [Idx, Ty] : enumerate(First&: ContainedTys)) { |
| 1839 | if (isVectorIntrinsicWithStructReturnOverloadAtField(ID: VectorIntrinsicID, |
| 1840 | RetIdx: Idx, TTI: State.TTI)) |
| 1841 | TysForDecl.push_back(Elt: Ty); |
| 1842 | } |
| 1843 | } |
| 1844 | SmallVector<Value *, 4> Args; |
| 1845 | for (const auto &I : enumerate(First: operands())) { |
| 1846 | // Some intrinsics have a scalar argument - don't replace it with a |
| 1847 | // vector. |
| 1848 | Value *Arg; |
| 1849 | if (isVectorIntrinsicWithScalarOpAtArg(ID: VectorIntrinsicID, ScalarOpdIdx: I.index(), |
| 1850 | TTI: State.TTI)) |
| 1851 | Arg = State.get(Def: I.value(), Lane: VPLane(0)); |
| 1852 | else |
| 1853 | Arg = State.get(Def: I.value(), IsScalar: usesFirstLaneOnly(Op: I.value())); |
| 1854 | if (isVectorIntrinsicWithOverloadTypeAtArg(ID: VectorIntrinsicID, OpdIdx: I.index(), |
| 1855 | TTI: State.TTI)) |
| 1856 | TysForDecl.push_back(Elt: Arg->getType()); |
| 1857 | Args.push_back(Elt: Arg); |
| 1858 | } |
| 1859 | |
| 1860 | // Use vector version of the intrinsic. |
| 1861 | Module *M = State.Builder.GetInsertBlock()->getModule(); |
| 1862 | Function *VectorF = |
| 1863 | Intrinsic::getOrInsertDeclaration(M, id: VectorIntrinsicID, Tys: TysForDecl); |
| 1864 | assert(VectorF && |
| 1865 | "Can't retrieve vector intrinsic or vector-predication intrinsics." ); |
| 1866 | |
| 1867 | auto *CI = cast_or_null<CallInst>(Val: getUnderlyingValue()); |
| 1868 | SmallVector<OperandBundleDef, 1> OpBundles; |
| 1869 | if (CI) |
| 1870 | CI->getOperandBundlesAsDefs(Defs&: OpBundles); |
| 1871 | |
| 1872 | CallInst *V = State.Builder.CreateCall(Callee: VectorF, Args, OpBundles); |
| 1873 | |
| 1874 | applyFlags(I&: *V); |
| 1875 | applyMetadata(I&: *V); |
| 1876 | |
| 1877 | if (!V->getType()->isVoidTy()) |
| 1878 | State.set(Def: this, V); |
| 1879 | } |
| 1880 | |
| 1881 | /// Compute the cost for the intrinsic \p ID with \p Operands, produced by \p R. |
| 1882 | static InstructionCost getCostForIntrinsics(Intrinsic::ID ID, |
| 1883 | ArrayRef<const VPValue *> Operands, |
| 1884 | const VPRecipeWithIRFlags &R, |
| 1885 | ElementCount VF, |
| 1886 | VPCostContext &Ctx) { |
| 1887 | // Some backends analyze intrinsic arguments to determine cost. Use the |
| 1888 | // underlying value for the operand if it has one. Otherwise try to use the |
| 1889 | // operand of the underlying call instruction, if there is one. Otherwise |
| 1890 | // clear Arguments. |
| 1891 | // TODO: Rework TTI interface to be independent of concrete IR values. |
| 1892 | SmallVector<const Value *> Arguments; |
| 1893 | for (const auto &[Idx, Op] : enumerate(First&: Operands)) { |
| 1894 | auto *V = Op->getUnderlyingValue(); |
| 1895 | if (!V) { |
| 1896 | if (auto *UI = dyn_cast_or_null<CallBase>(Val: R.getUnderlyingValue())) { |
| 1897 | Arguments.push_back(Elt: UI->getArgOperand(i: Idx)); |
| 1898 | continue; |
| 1899 | } |
| 1900 | Arguments.clear(); |
| 1901 | break; |
| 1902 | } |
| 1903 | Arguments.push_back(Elt: V); |
| 1904 | } |
| 1905 | |
| 1906 | Type *ScalarRetTy = Ctx.Types.inferScalarType(V: &R); |
| 1907 | Type *RetTy = VF.isVector() ? toVectorizedTy(Ty: ScalarRetTy, EC: VF) : ScalarRetTy; |
| 1908 | SmallVector<Type *> ParamTys; |
| 1909 | for (const VPValue *Op : Operands) { |
| 1910 | ParamTys.push_back(Elt: VF.isVector() |
| 1911 | ? toVectorTy(Scalar: Ctx.Types.inferScalarType(V: Op), EC: VF) |
| 1912 | : Ctx.Types.inferScalarType(V: Op)); |
| 1913 | } |
| 1914 | |
| 1915 | // TODO: Rework TTI interface to avoid reliance on underlying IntrinsicInst. |
| 1916 | FastMathFlags FMF = |
| 1917 | R.hasFastMathFlags() ? R.getFastMathFlags() : FastMathFlags(); |
| 1918 | IntrinsicCostAttributes CostAttrs( |
| 1919 | ID, RetTy, Arguments, ParamTys, FMF, |
| 1920 | dyn_cast_or_null<IntrinsicInst>(Val: R.getUnderlyingValue()), |
| 1921 | InstructionCost::getInvalid(), &Ctx.TLI); |
| 1922 | return Ctx.TTI.getIntrinsicInstrCost(ICA: CostAttrs, CostKind: Ctx.CostKind); |
| 1923 | } |
| 1924 | |
| 1925 | InstructionCost VPWidenIntrinsicRecipe::computeCost(ElementCount VF, |
| 1926 | VPCostContext &Ctx) const { |
| 1927 | SmallVector<const VPValue *> ArgOps(operands()); |
| 1928 | return getCostForIntrinsics(ID: VectorIntrinsicID, Operands: ArgOps, R: *this, VF, Ctx); |
| 1929 | } |
| 1930 | |
| 1931 | StringRef VPWidenIntrinsicRecipe::getIntrinsicName() const { |
| 1932 | return Intrinsic::getBaseName(id: VectorIntrinsicID); |
| 1933 | } |
| 1934 | |
| 1935 | bool VPWidenIntrinsicRecipe::usesFirstLaneOnly(const VPValue *Op) const { |
| 1936 | assert(is_contained(operands(), Op) && "Op must be an operand of the recipe" ); |
| 1937 | return all_of(Range: enumerate(First: operands()), P: [this, &Op](const auto &X) { |
| 1938 | auto [Idx, V] = X; |
| 1939 | return V != Op || isVectorIntrinsicWithScalarOpAtArg(getVectorIntrinsicID(), |
| 1940 | Idx, nullptr); |
| 1941 | }); |
| 1942 | } |
| 1943 | |
| 1944 | #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) |
| 1945 | void VPWidenIntrinsicRecipe::printRecipe(raw_ostream &O, const Twine &Indent, |
| 1946 | VPSlotTracker &SlotTracker) const { |
| 1947 | O << Indent << "WIDEN-INTRINSIC " ; |
| 1948 | if (ResultTy->isVoidTy()) { |
| 1949 | O << "void " ; |
| 1950 | } else { |
| 1951 | printAsOperand(O, SlotTracker); |
| 1952 | O << " = " ; |
| 1953 | } |
| 1954 | |
| 1955 | O << "call" ; |
| 1956 | printFlags(O); |
| 1957 | O << getIntrinsicName() << "(" ; |
| 1958 | |
| 1959 | interleaveComma(operands(), O, [&O, &SlotTracker](VPValue *Op) { |
| 1960 | Op->printAsOperand(O, SlotTracker); |
| 1961 | }); |
| 1962 | O << ")" ; |
| 1963 | } |
| 1964 | #endif |
| 1965 | |
| 1966 | void VPHistogramRecipe::execute(VPTransformState &State) { |
| 1967 | IRBuilderBase &Builder = State.Builder; |
| 1968 | |
| 1969 | Value *Address = State.get(Def: getOperand(N: 0)); |
| 1970 | Value *IncAmt = State.get(Def: getOperand(N: 1), /*IsScalar=*/true); |
| 1971 | VectorType *VTy = cast<VectorType>(Val: Address->getType()); |
| 1972 | |
| 1973 | // The histogram intrinsic requires a mask even if the recipe doesn't; |
| 1974 | // if the mask operand was omitted then all lanes should be executed and |
| 1975 | // we just need to synthesize an all-true mask. |
| 1976 | Value *Mask = nullptr; |
| 1977 | if (VPValue *VPMask = getMask()) |
| 1978 | Mask = State.get(Def: VPMask); |
| 1979 | else |
| 1980 | Mask = |
| 1981 | Builder.CreateVectorSplat(EC: VTy->getElementCount(), V: Builder.getInt1(V: 1)); |
| 1982 | |
| 1983 | // If this is a subtract, we want to invert the increment amount. We may |
| 1984 | // add a separate intrinsic in future, but for now we'll try this. |
| 1985 | if (Opcode == Instruction::Sub) |
| 1986 | IncAmt = Builder.CreateNeg(V: IncAmt); |
| 1987 | else |
| 1988 | assert(Opcode == Instruction::Add && "only add or sub supported for now" ); |
| 1989 | |
| 1990 | State.Builder.CreateIntrinsic(ID: Intrinsic::experimental_vector_histogram_add, |
| 1991 | Types: {VTy, IncAmt->getType()}, |
| 1992 | Args: {Address, IncAmt, Mask}); |
| 1993 | } |
| 1994 | |
| 1995 | InstructionCost VPHistogramRecipe::computeCost(ElementCount VF, |
| 1996 | VPCostContext &Ctx) const { |
| 1997 | // FIXME: Take the gather and scatter into account as well. For now we're |
| 1998 | // generating the same cost as the fallback path, but we'll likely |
| 1999 | // need to create a new TTI method for determining the cost, including |
| 2000 | // whether we can use base + vec-of-smaller-indices or just |
| 2001 | // vec-of-pointers. |
| 2002 | assert(VF.isVector() && "Invalid VF for histogram cost" ); |
| 2003 | Type *AddressTy = Ctx.Types.inferScalarType(V: getOperand(N: 0)); |
| 2004 | VPValue *IncAmt = getOperand(N: 1); |
| 2005 | Type *IncTy = Ctx.Types.inferScalarType(V: IncAmt); |
| 2006 | VectorType *VTy = VectorType::get(ElementType: IncTy, EC: VF); |
| 2007 | |
| 2008 | // Assume that a non-constant update value (or a constant != 1) requires |
| 2009 | // a multiply, and add that into the cost. |
| 2010 | InstructionCost MulCost = |
| 2011 | Ctx.TTI.getArithmeticInstrCost(Opcode: Instruction::Mul, Ty: VTy, CostKind: Ctx.CostKind); |
| 2012 | if (auto *CI = dyn_cast<VPConstantInt>(Val: IncAmt)) |
| 2013 | if (CI->isOne()) |
| 2014 | MulCost = TTI::TCC_Free; |
| 2015 | |
| 2016 | // Find the cost of the histogram operation itself. |
| 2017 | Type *PtrTy = VectorType::get(ElementType: AddressTy, EC: VF); |
| 2018 | Type *MaskTy = VectorType::get(ElementType: Type::getInt1Ty(C&: Ctx.LLVMCtx), EC: VF); |
| 2019 | IntrinsicCostAttributes ICA(Intrinsic::experimental_vector_histogram_add, |
| 2020 | Type::getVoidTy(C&: Ctx.LLVMCtx), |
| 2021 | {PtrTy, IncTy, MaskTy}); |
| 2022 | |
| 2023 | // Add the costs together with the add/sub operation. |
| 2024 | return Ctx.TTI.getIntrinsicInstrCost(ICA, CostKind: Ctx.CostKind) + MulCost + |
| 2025 | Ctx.TTI.getArithmeticInstrCost(Opcode, Ty: VTy, CostKind: Ctx.CostKind); |
| 2026 | } |
| 2027 | |
| 2028 | #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) |
| 2029 | void VPHistogramRecipe::printRecipe(raw_ostream &O, const Twine &Indent, |
| 2030 | VPSlotTracker &SlotTracker) const { |
| 2031 | O << Indent << "WIDEN-HISTOGRAM buckets: " ; |
| 2032 | getOperand(0)->printAsOperand(O, SlotTracker); |
| 2033 | |
| 2034 | if (Opcode == Instruction::Sub) |
| 2035 | O << ", dec: " ; |
| 2036 | else { |
| 2037 | assert(Opcode == Instruction::Add); |
| 2038 | O << ", inc: " ; |
| 2039 | } |
| 2040 | getOperand(1)->printAsOperand(O, SlotTracker); |
| 2041 | |
| 2042 | if (VPValue *Mask = getMask()) { |
| 2043 | O << ", mask: " ; |
| 2044 | Mask->printAsOperand(O, SlotTracker); |
| 2045 | } |
| 2046 | } |
| 2047 | #endif |
| 2048 | |
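// Unpack IR-level fast-math flags into the packed VPIRFlags representation.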
| 2049 | VPIRFlags::FastMathFlagsTy::FastMathFlagsTy(const FastMathFlags &FMF) { |
| 2050 | AllowReassoc = FMF.allowReassoc(); |
| 2051 | NoNaNs = FMF.noNaNs(); |
| 2052 | NoInfs = FMF.noInfs(); |
| 2053 | NoSignedZeros = FMF.noSignedZeros(); |
| 2054 | AllowReciprocal = FMF.allowReciprocal(); |
| 2055 | AllowContract = FMF.allowContract(); |
| 2056 | ApproxFunc = FMF.approxFunc(); |
| 2057 | } |
| 2058 | |
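// Returns conservative default flags for Opcode, with all poison-generating
// and fast-math flags cleared. Compares and reduction results have no
// sensible default and must provide explicit flags.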
| 2059 | VPIRFlags VPIRFlags::getDefaultFlags(unsigned Opcode) { |
| 2060 | switch (Opcode) { |
| 2061 | case Instruction::Add: |
| 2062 | case Instruction::Sub: |
| 2063 | case Instruction::Mul: |
| 2064 | case Instruction::Shl: |
| 2065 | case VPInstruction::CanonicalIVIncrementForPart: |
| 2066 | return WrapFlagsTy(false, false); |
| 2067 | case Instruction::Trunc: |
| 2068 | return TruncFlagsTy(false, false); |
| 2069 | case Instruction::Or: |
| 2070 | return DisjointFlagsTy(false); |
| 2071 | case Instruction::AShr: |
| 2072 | case Instruction::LShr: |
| 2073 | case Instruction::UDiv: |
| 2074 | case Instruction::SDiv: |
| 2075 | return ExactFlagsTy(false); |
| 2076 | case Instruction::GetElementPtr: |
| 2077 | case VPInstruction::PtrAdd: |
| 2078 | case VPInstruction::WidePtrAdd: |
| 2079 | return GEPNoWrapFlags::none(); |
| 2080 | case Instruction::ZExt: |
| 2081 | case Instruction::UIToFP: |
| 2082 | return NonNegFlagsTy(false); |
| 2083 | case Instruction::FAdd: |
| 2084 | case Instruction::FSub: |
| 2085 | case Instruction::FMul: |
| 2086 | case Instruction::FDiv: |
| 2087 | case Instruction::FRem: |
| 2088 | case Instruction::FNeg: |
| 2089 | case Instruction::FPExt: |
| 2090 | case Instruction::FPTrunc: |
| 2091 | return FastMathFlags(); |
| 2092 | case Instruction::ICmp: |
| 2093 | case Instruction::FCmp: |
| 2094 | case VPInstruction::ComputeReductionResult: |
| 2095 | llvm_unreachable("opcode requires explicit flags" ); |
| 2096 | default: |
| 2097 | return VPIRFlags(); |
| 2098 | } |
| 2099 | } |
| 2100 | |
| 2101 | #if !defined(NDEBUG) |
| 2102 | bool VPIRFlags::flagsValidForOpcode(unsigned Opcode) const { |
| 2103 | switch (OpType) { |
| 2104 | case OperationType::OverflowingBinOp: |
| 2105 | return Opcode == Instruction::Add || Opcode == Instruction::Sub || |
| 2106 | Opcode == Instruction::Mul || Opcode == Instruction::Shl || |
Opcode == VPInstruction::CanonicalIVIncrementForPart;
| 2108 | case OperationType::Trunc: |
| 2109 | return Opcode == Instruction::Trunc; |
| 2110 | case OperationType::DisjointOp: |
| 2111 | return Opcode == Instruction::Or; |
| 2112 | case OperationType::PossiblyExactOp: |
| 2113 | return Opcode == Instruction::AShr || Opcode == Instruction::LShr || |
| 2114 | Opcode == Instruction::UDiv || Opcode == Instruction::SDiv; |
| 2115 | case OperationType::GEPOp: |
| 2116 | return Opcode == Instruction::GetElementPtr || |
| 2117 | Opcode == VPInstruction::PtrAdd || |
| 2118 | Opcode == VPInstruction::WidePtrAdd; |
| 2119 | case OperationType::FPMathOp: |
| 2120 | return Opcode == Instruction::Call || Opcode == Instruction::FAdd || |
| 2121 | Opcode == Instruction::FMul || Opcode == Instruction::FSub || |
| 2122 | Opcode == Instruction::FNeg || Opcode == Instruction::FDiv || |
| 2123 | Opcode == Instruction::FRem || Opcode == Instruction::FPExt || |
| 2124 | Opcode == Instruction::FPTrunc || Opcode == Instruction::Select || |
| 2125 | Opcode == VPInstruction::WideIVStep || |
| 2126 | Opcode == VPInstruction::ReductionStartVector; |
| 2127 | case OperationType::FCmp: |
| 2128 | return Opcode == Instruction::FCmp; |
| 2129 | case OperationType::NonNegOp: |
| 2130 | return Opcode == Instruction::ZExt || Opcode == Instruction::UIToFP; |
| 2131 | case OperationType::Cmp: |
| 2132 | return Opcode == Instruction::FCmp || Opcode == Instruction::ICmp; |
| 2133 | case OperationType::ReductionOp: |
| 2134 | return Opcode == VPInstruction::ComputeReductionResult; |
| 2135 | case OperationType::Other: |
| 2136 | return true; |
| 2137 | } |
| 2138 | llvm_unreachable("Unknown OperationType enum" ); |
| 2139 | } |
| 2140 | |
| 2141 | bool VPIRFlags::hasRequiredFlagsForOpcode(unsigned Opcode) const { |
| 2142 | // Handle opcodes without default flags. |
| 2143 | if (Opcode == Instruction::ICmp) |
| 2144 | return OpType == OperationType::Cmp; |
| 2145 | if (Opcode == Instruction::FCmp) |
| 2146 | return OpType == OperationType::FCmp; |
| 2147 | if (Opcode == VPInstruction::ComputeReductionResult) |
| 2148 | return OpType == OperationType::ReductionOp; |
| 2149 | |
| 2150 | OperationType Required = getDefaultFlags(Opcode).OpType; |
| 2151 | return Required == OperationType::Other || Required == OpType; |
| 2152 | } |
| 2153 | #endif |
| 2154 | |
| 2155 | #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) |
| 2156 | void VPIRFlags::printFlags(raw_ostream &O) const { |
| 2157 | switch (OpType) { |
| 2158 | case OperationType::Cmp: |
| 2159 | O << " " << CmpInst::getPredicateName(getPredicate()); |
| 2160 | break; |
| 2161 | case OperationType::FCmp: |
| 2162 | O << " " << CmpInst::getPredicateName(getPredicate()); |
| 2163 | getFastMathFlags().print(O); |
| 2164 | break; |
| 2165 | case OperationType::DisjointOp: |
| 2166 | if (DisjointFlags.IsDisjoint) |
| 2167 | O << " disjoint" ; |
| 2168 | break; |
| 2169 | case OperationType::PossiblyExactOp: |
| 2170 | if (ExactFlags.IsExact) |
| 2171 | O << " exact" ; |
| 2172 | break; |
| 2173 | case OperationType::OverflowingBinOp: |
| 2174 | if (WrapFlags.HasNUW) |
| 2175 | O << " nuw" ; |
| 2176 | if (WrapFlags.HasNSW) |
| 2177 | O << " nsw" ; |
| 2178 | break; |
| 2179 | case OperationType::Trunc: |
| 2180 | if (TruncFlags.HasNUW) |
| 2181 | O << " nuw" ; |
| 2182 | if (TruncFlags.HasNSW) |
| 2183 | O << " nsw" ; |
| 2184 | break; |
| 2185 | case OperationType::FPMathOp: |
| 2186 | getFastMathFlags().print(O); |
| 2187 | break; |
| 2188 | case OperationType::GEPOp: |
| 2189 | if (GEPFlags.isInBounds()) |
| 2190 | O << " inbounds" ; |
| 2191 | else if (GEPFlags.hasNoUnsignedSignedWrap()) |
| 2192 | O << " nusw" ; |
| 2193 | if (GEPFlags.hasNoUnsignedWrap()) |
| 2194 | O << " nuw" ; |
| 2195 | break; |
| 2196 | case OperationType::NonNegOp: |
| 2197 | if (NonNegFlags.NonNeg) |
| 2198 | O << " nneg" ; |
| 2199 | break; |
| 2200 | case OperationType::ReductionOp: { |
| 2201 | RecurKind RK = getRecurKind(); |
| 2202 | O << " (" ; |
| 2203 | switch (RK) { |
| 2204 | case RecurKind::AnyOf: |
| 2205 | O << "any-of" ; |
| 2206 | break; |
| 2207 | case RecurKind::SMax: |
| 2208 | O << "smax" ; |
| 2209 | break; |
| 2210 | case RecurKind::SMin: |
| 2211 | O << "smin" ; |
| 2212 | break; |
| 2213 | case RecurKind::UMax: |
| 2214 | O << "umax" ; |
| 2215 | break; |
| 2216 | case RecurKind::UMin: |
| 2217 | O << "umin" ; |
| 2218 | break; |
| 2219 | case RecurKind::FMinNum: |
| 2220 | O << "fminnum" ; |
| 2221 | break; |
| 2222 | case RecurKind::FMaxNum: |
| 2223 | O << "fmaxnum" ; |
| 2224 | break; |
| 2225 | case RecurKind::FMinimum: |
| 2226 | O << "fminimum" ; |
| 2227 | break; |
| 2228 | case RecurKind::FMaximum: |
| 2229 | O << "fmaximum" ; |
| 2230 | break; |
| 2231 | case RecurKind::FMinimumNum: |
| 2232 | O << "fminimumnum" ; |
| 2233 | break; |
| 2234 | case RecurKind::FMaximumNum: |
| 2235 | O << "fmaximumnum" ; |
| 2236 | break; |
| 2237 | default: |
| 2238 | O << Instruction::getOpcodeName(RecurrenceDescriptor::getOpcode(RK)); |
| 2239 | break; |
| 2240 | } |
| 2241 | if (isReductionInLoop()) |
| 2242 | O << ", in-loop" ; |
| 2243 | if (isReductionOrdered()) |
| 2244 | O << ", ordered" ; |
| 2245 | O << ")" ; |
| 2246 | getFastMathFlags().print(O); |
| 2247 | break; |
| 2248 | } |
| 2249 | case OperationType::Other: |
| 2250 | break; |
| 2251 | } |
| 2252 | O << " " ; |
| 2253 | } |
| 2254 | #endif |
| 2255 | |
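// Widen the scalar opcode: generate one vector instruction on the vectorized
// operands and record it as this recipe's result.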
| 2256 | void VPWidenRecipe::execute(VPTransformState &State) { |
| 2257 | auto &Builder = State.Builder; |
| 2258 | switch (Opcode) { |
| 2259 | case Instruction::Call: |
| 2260 | case Instruction::Br: |
| 2261 | case Instruction::PHI: |
| 2262 | case Instruction::GetElementPtr: |
| 2263 | llvm_unreachable("This instruction is handled by a different recipe." ); |
| 2264 | case Instruction::UDiv: |
| 2265 | case Instruction::SDiv: |
| 2266 | case Instruction::SRem: |
| 2267 | case Instruction::URem: |
| 2268 | case Instruction::Add: |
| 2269 | case Instruction::FAdd: |
| 2270 | case Instruction::Sub: |
| 2271 | case Instruction::FSub: |
| 2272 | case Instruction::FNeg: |
| 2273 | case Instruction::Mul: |
| 2274 | case Instruction::FMul: |
| 2275 | case Instruction::FDiv: |
| 2276 | case Instruction::FRem: |
| 2277 | case Instruction::Shl: |
| 2278 | case Instruction::LShr: |
| 2279 | case Instruction::AShr: |
| 2280 | case Instruction::And: |
| 2281 | case Instruction::Or: |
| 2282 | case Instruction::Xor: { |
| 2283 | // Just widen unops and binops. |
| 2284 | SmallVector<Value *, 2> Ops; |
| 2285 | for (VPValue *VPOp : operands()) |
| 2286 | Ops.push_back(Elt: State.get(Def: VPOp)); |
| 2287 | |
| 2288 | Value *V = Builder.CreateNAryOp(Opc: Opcode, Ops); |
| 2289 | |
| 2290 | if (auto *VecOp = dyn_cast<Instruction>(Val: V)) { |
| 2291 | applyFlags(I&: *VecOp); |
| 2292 | applyMetadata(I&: *VecOp); |
| 2293 | } |
| 2294 | |
| 2295 | // Use this vector value for all users of the original instruction. |
| 2296 | State.set(Def: this, V); |
| 2297 | break; |
| 2298 | } |
| 2299 | case Instruction::ExtractValue: { |
| 2300 | assert(getNumOperands() == 2 && "expected single level extractvalue" ); |
| 2301 | Value *Op = State.get(Def: getOperand(N: 0)); |
Value *Extract = Builder.CreateExtractValue(
| 2303 | Agg: Op, Idxs: cast<VPConstantInt>(Val: getOperand(N: 1))->getZExtValue()); |
| 2304 | State.set(Def: this, V: Extract); |
| 2305 | break; |
| 2306 | } |
| 2307 | case Instruction::Freeze: { |
| 2308 | Value *Op = State.get(Def: getOperand(N: 0)); |
| 2309 | Value *Freeze = Builder.CreateFreeze(V: Op); |
| 2310 | State.set(Def: this, V: Freeze); |
| 2311 | break; |
| 2312 | } |
| 2313 | case Instruction::ICmp: |
| 2314 | case Instruction::FCmp: { |
| 2315 | // Widen compares. Generate vector compares. |
| 2316 | bool FCmp = Opcode == Instruction::FCmp; |
| 2317 | Value *A = State.get(Def: getOperand(N: 0)); |
| 2318 | Value *B = State.get(Def: getOperand(N: 1)); |
| 2319 | Value *C = nullptr; |
| 2320 | if (FCmp) { |
| 2321 | C = Builder.CreateFCmp(P: getPredicate(), LHS: A, RHS: B); |
| 2322 | } else { |
| 2323 | C = Builder.CreateICmp(P: getPredicate(), LHS: A, RHS: B); |
| 2324 | } |
| 2325 | if (auto *I = dyn_cast<Instruction>(Val: C)) { |
| 2326 | applyFlags(I&: *I); |
| 2327 | applyMetadata(I&: *I); |
| 2328 | } |
| 2329 | State.set(Def: this, V: C); |
| 2330 | break; |
| 2331 | } |
| 2332 | case Instruction::Select: { |
| 2333 | VPValue *CondOp = getOperand(N: 0); |
| 2334 | Value *Cond = State.get(Def: CondOp, IsScalar: vputils::isSingleScalar(VPV: CondOp)); |
| 2335 | Value *Op0 = State.get(Def: getOperand(N: 1)); |
| 2336 | Value *Op1 = State.get(Def: getOperand(N: 2)); |
| 2337 | Value *Sel = State.Builder.CreateSelect(C: Cond, True: Op0, False: Op1); |
| 2338 | State.set(Def: this, V: Sel); |
| 2339 | if (auto *I = dyn_cast<Instruction>(Val: Sel)) { |
| 2340 | if (isa<FPMathOperator>(Val: I)) |
| 2341 | applyFlags(I&: *I); |
| 2342 | applyMetadata(I&: *I); |
| 2343 | } |
| 2344 | break; |
| 2345 | } |
| 2346 | default: |
| 2347 | // This instruction is not vectorized by simple widening. |
LLVM_DEBUG(dbgs() << "LV: Found an unhandled opcode: "
| 2349 | << Instruction::getOpcodeName(Opcode)); |
| 2350 | llvm_unreachable("Unhandled instruction!" ); |
| 2351 | } // end of switch. |
| 2352 | |
| 2353 | #if !defined(NDEBUG) |
| 2354 | // Verify that VPlan type inference results agree with the type of the |
| 2355 | // generated values. |
| 2356 | assert(VectorType::get(State.TypeAnalysis.inferScalarType(this), State.VF) == |
| 2357 | State.get(this)->getType() && |
| 2358 | "inferred type and type from generated instructions do not match" ); |
| 2359 | #endif |
| 2360 | } |
| 2361 | |
| 2362 | InstructionCost VPWidenRecipe::computeCost(ElementCount VF, |
| 2363 | VPCostContext &Ctx) const { |
| 2364 | switch (Opcode) { |
| 2365 | case Instruction::UDiv: |
| 2366 | case Instruction::SDiv: |
| 2367 | case Instruction::SRem: |
| 2368 | case Instruction::URem: |
| 2369 | // If the div/rem operation isn't safe to speculate and requires |
| 2370 | // predication, then the only way we can even create a vplan is to insert |
| 2371 | // a select on the second input operand to ensure we use the value of 1 |
| 2372 | // for the inactive lanes. The select will be costed separately. |
| 2373 | case Instruction::FNeg: |
| 2374 | case Instruction::Add: |
| 2375 | case Instruction::FAdd: |
| 2376 | case Instruction::Sub: |
| 2377 | case Instruction::FSub: |
| 2378 | case Instruction::Mul: |
| 2379 | case Instruction::FMul: |
| 2380 | case Instruction::FDiv: |
| 2381 | case Instruction::FRem: |
| 2382 | case Instruction::Shl: |
| 2383 | case Instruction::LShr: |
| 2384 | case Instruction::AShr: |
| 2385 | case Instruction::And: |
| 2386 | case Instruction::Or: |
| 2387 | case Instruction::Xor: |
| 2388 | case Instruction::Freeze: |
| 2389 | case Instruction::ExtractValue: |
| 2390 | case Instruction::ICmp: |
| 2391 | case Instruction::FCmp: |
| 2392 | case Instruction::Select: |
| 2393 | return getCostForRecipeWithOpcode(Opcode: getOpcode(), VF, Ctx); |
| 2394 | default: |
| 2395 | llvm_unreachable("Unsupported opcode for instruction" ); |
| 2396 | } |
| 2397 | } |
| 2398 | |
| 2399 | #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) |
| 2400 | void VPWidenRecipe::printRecipe(raw_ostream &O, const Twine &Indent, |
| 2401 | VPSlotTracker &SlotTracker) const { |
| 2402 | O << Indent << "WIDEN " ; |
| 2403 | printAsOperand(O, SlotTracker); |
| 2404 | O << " = " << Instruction::getOpcodeName(Opcode); |
| 2405 | printFlags(O); |
| 2406 | printOperands(O, SlotTracker); |
| 2407 | } |
| 2408 | #endif |
| 2409 | |
| 2410 | void VPWidenCastRecipe::execute(VPTransformState &State) { |
| 2411 | auto &Builder = State.Builder; |
// Vectorize casts.
| 2413 | assert(State.VF.isVector() && "Not vectorizing?" ); |
| 2414 | Type *DestTy = VectorType::get(ElementType: getResultType(), EC: State.VF); |
| 2415 | VPValue *Op = getOperand(N: 0); |
| 2416 | Value *A = State.get(Def: Op); |
| 2417 | Value *Cast = Builder.CreateCast(Op: Instruction::CastOps(Opcode), V: A, DestTy); |
| 2418 | State.set(Def: this, V: Cast); |
| 2419 | if (auto *CastOp = dyn_cast<Instruction>(Val: Cast)) { |
| 2420 | applyFlags(I&: *CastOp); |
| 2421 | applyMetadata(I&: *CastOp); |
| 2422 | } |
| 2423 | } |
| 2424 | |
| 2425 | InstructionCost VPWidenCastRecipe::computeCost(ElementCount VF, |
| 2426 | VPCostContext &Ctx) const { |
| 2427 | // TODO: In some cases, VPWidenCastRecipes are created but not considered in |
| 2428 | // the legacy cost model, including truncates/extends when evaluating a |
| 2429 | // reduction in a smaller type. |
| 2430 | if (!getUnderlyingValue()) |
| 2431 | return 0; |
| 2432 | return getCostForRecipeWithOpcode(Opcode: getOpcode(), VF, Ctx); |
| 2433 | } |
| 2434 | |
| 2435 | #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) |
| 2436 | void VPWidenCastRecipe::printRecipe(raw_ostream &O, const Twine &Indent, |
| 2437 | VPSlotTracker &SlotTracker) const { |
| 2438 | O << Indent << "WIDEN-CAST " ; |
| 2439 | printAsOperand(O, SlotTracker); |
| 2440 | O << " = " << Instruction::getOpcodeName(Opcode); |
| 2441 | printFlags(O); |
| 2442 | printOperands(O, SlotTracker); |
| 2443 | O << " to " << *getResultType(); |
| 2444 | } |
| 2445 | #endif |
| 2446 | |
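// Header phis are costed as a plain PHI control-flow instruction via TTI.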
InstructionCost VPHeaderPHIRecipe::computeCost(ElementCount VF,
| 2448 | VPCostContext &Ctx) const { |
| 2449 | return Ctx.TTI.getCFInstrCost(Opcode: Instruction::PHI, CostKind: Ctx.CostKind); |
| 2450 | } |
| 2451 | |
| 2452 | #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) |
| 2453 | void VPWidenIntOrFpInductionRecipe::printRecipe( |
| 2454 | raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const { |
| 2455 | O << Indent; |
| 2456 | printAsOperand(O, SlotTracker); |
| 2457 | O << " = WIDEN-INDUCTION" ; |
| 2458 | printFlags(O); |
| 2459 | printOperands(O, SlotTracker); |
| 2460 | |
| 2461 | if (auto *TI = getTruncInst()) |
| 2462 | O << " (truncated to " << *TI->getType() << ")" ; |
| 2463 | } |
| 2464 | #endif |
| 2465 | |
| 2466 | bool VPWidenIntOrFpInductionRecipe::isCanonical() const { |
| 2467 | // The step may be defined by a recipe in the preheader (e.g. if it requires |
| 2468 | // SCEV expansion), but for the canonical induction the step is required to be |
| 2469 | // 1, which is represented as live-in. |
| 2470 | auto *StepC = dyn_cast<VPConstantInt>(Val: getStepValue()); |
| 2471 | auto *StartC = dyn_cast<VPConstantInt>(Val: getStartValue()); |
| 2472 | return StartC && StartC->isZero() && StepC && StepC->isOne() && |
| 2473 | getScalarType() == getRegion()->getCanonicalIVType(); |
| 2474 | } |
| 2475 | |
| 2476 | void VPDerivedIVRecipe::execute(VPTransformState &State) { |
| 2477 | assert(!State.Lane && "VPDerivedIVRecipe being replicated." ); |
| 2478 | |
| 2479 | // Fast-math-flags propagate from the original induction instruction. |
| 2480 | IRBuilder<>::FastMathFlagGuard FMFG(State.Builder); |
| 2481 | if (FPBinOp) |
| 2482 | State.Builder.setFastMathFlags(FPBinOp->getFastMathFlags()); |
| 2483 | |
| 2484 | Value *Step = State.get(Def: getStepValue(), Lane: VPLane(0)); |
| 2485 | Value *Index = State.get(Def: getOperand(N: 1), Lane: VPLane(0)); |
| 2486 | Value *DerivedIV = emitTransformedIndex( |
| 2487 | B&: State.Builder, Index, StartValue: getStartValue()->getLiveInIRValue(), Step, InductionKind: Kind, |
| 2488 | InductionBinOp: cast_if_present<BinaryOperator>(Val: FPBinOp)); |
| 2489 | DerivedIV->setName(Name); |
| 2490 | State.set(Def: this, V: DerivedIV, Lane: VPLane(0)); |
| 2491 | } |
| 2492 | |
| 2493 | #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) |
| 2494 | void VPDerivedIVRecipe::printRecipe(raw_ostream &O, const Twine &Indent, |
| 2495 | VPSlotTracker &SlotTracker) const { |
| 2496 | O << Indent; |
| 2497 | printAsOperand(O, SlotTracker); |
| 2498 | O << " = DERIVED-IV " ; |
| 2499 | getStartValue()->printAsOperand(O, SlotTracker); |
| 2500 | O << " + " ; |
| 2501 | getOperand(1)->printAsOperand(O, SlotTracker); |
| 2502 | O << " * " ; |
| 2503 | getStepValue()->printAsOperand(O, SlotTracker); |
| 2504 | } |
| 2505 | #endif |
| 2506 | |
| 2507 | void VPScalarIVStepsRecipe::execute(VPTransformState &State) { |
| 2508 | // Fast-math-flags propagate from the original induction instruction. |
| 2509 | IRBuilder<>::FastMathFlagGuard FMFG(State.Builder); |
| 2510 | if (hasFastMathFlags()) |
| 2511 | State.Builder.setFastMathFlags(getFastMathFlags()); |
| 2512 | |
// Compute scalar induction steps: BaseIV is the scalar induction variable on
// which to base the steps and Step is the size of the step.
| 2515 | |
| 2516 | Value *BaseIV = State.get(Def: getOperand(N: 0), Lane: VPLane(0)); |
| 2517 | Value *Step = State.get(Def: getStepValue(), Lane: VPLane(0)); |
| 2518 | IRBuilderBase &Builder = State.Builder; |
| 2519 | |
| 2520 | // Ensure step has the same type as that of scalar IV. |
| 2521 | Type *BaseIVTy = BaseIV->getType()->getScalarType(); |
| 2522 | assert(BaseIVTy == Step->getType() && "Types of BaseIV and Step must match!" ); |
| 2523 | |
| 2524 | // We build scalar steps for both integer and floating-point induction |
| 2525 | // variables. Here, we determine the kind of arithmetic we will perform. |
| 2526 | Instruction::BinaryOps AddOp; |
| 2527 | Instruction::BinaryOps MulOp; |
| 2528 | if (BaseIVTy->isIntegerTy()) { |
| 2529 | AddOp = Instruction::Add; |
| 2530 | MulOp = Instruction::Mul; |
| 2531 | } else { |
| 2532 | AddOp = InductionOpcode; |
| 2533 | MulOp = Instruction::FMul; |
| 2534 | } |
| 2535 | |
| 2536 | // Determine the number of scalars we need to generate. |
| 2537 | bool FirstLaneOnly = vputils::onlyFirstLaneUsed(Def: this); |
| 2538 | // Compute the scalar steps and save the results in State. |
| 2539 | |
| 2540 | unsigned StartLane = 0; |
| 2541 | unsigned EndLane = FirstLaneOnly ? 1 : State.VF.getKnownMinValue(); |
| 2542 | if (State.Lane) { |
| 2543 | StartLane = State.Lane->getKnownLane(); |
| 2544 | EndLane = StartLane + 1; |
| 2545 | } |
| 2546 | Value *StartIdx0 = getStartIndex() ? State.get(Def: getStartIndex(), IsScalar: true) |
| 2547 | : Constant::getNullValue(Ty: BaseIVTy); |
| 2548 | |
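  // e.g. (illustrative) for an integer IV with VF=4, lanes 0..3 each get:
  //   %startidx.l = add %startidx0, l
  //   %mul.l      = mul %startidx.l, %step
  //   %step.l     = add %baseiv, %mul.l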
| 2549 | for (unsigned Lane = StartLane; Lane < EndLane; ++Lane) { |
    // It is okay if the induction variable type cannot hold the lane number;
    // we expect truncation in this case.
| 2552 | Constant *LaneValue = |
| 2553 | BaseIVTy->isIntegerTy() |
| 2554 | ? ConstantInt::get(Ty: BaseIVTy, V: Lane, /*IsSigned=*/false, |
| 2555 | /*ImplicitTrunc=*/true) |
| 2556 | : ConstantFP::get(Ty: BaseIVTy, V: Lane); |
| 2557 | Value *StartIdx = Builder.CreateBinOp(Opc: AddOp, LHS: StartIdx0, RHS: LaneValue); |
| 2558 | // The step returned by `createStepForVF` is a runtime-evaluated value |
| 2559 | // when VF is scalable. Otherwise, it should be folded into a Constant. |
| 2560 | assert((State.VF.isScalable() || isa<Constant>(StartIdx)) && |
| 2561 | "Expected StartIdx to be folded to a constant when VF is not " |
| 2562 | "scalable" ); |
| 2563 | auto *Mul = Builder.CreateBinOp(Opc: MulOp, LHS: StartIdx, RHS: Step); |
| 2564 | auto *Add = Builder.CreateBinOp(Opc: AddOp, LHS: BaseIV, RHS: Mul); |
| 2565 | State.set(Def: this, V: Add, Lane: VPLane(Lane)); |
| 2566 | } |
| 2567 | } |
| 2568 | |
| 2569 | #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) |
| 2570 | void VPScalarIVStepsRecipe::printRecipe(raw_ostream &O, const Twine &Indent, |
| 2571 | VPSlotTracker &SlotTracker) const { |
| 2572 | O << Indent; |
| 2573 | printAsOperand(O, SlotTracker); |
| 2574 | O << " = SCALAR-STEPS " ; |
| 2575 | printOperands(O, SlotTracker); |
| 2576 | } |
| 2577 | #endif |
| 2578 | |
| 2579 | bool VPWidenGEPRecipe::usesFirstLaneOnly(const VPValue *Op) const { |
| 2580 | assert(is_contained(operands(), Op) && "Op must be an operand of the recipe" ); |
| 2581 | return vputils::isSingleScalar(VPV: Op); |
| 2582 | } |
| 2583 | |
| 2584 | void VPWidenGEPRecipe::execute(VPTransformState &State) { |
| 2585 | assert(State.VF.isVector() && "not widening" ); |
| 2586 | // Construct a vector GEP by widening the operands of the scalar GEP as |
| 2587 | // necessary. We mark the vector GEP 'inbounds' if appropriate. A GEP |
| 2588 | // results in a vector of pointers when at least one operand of the GEP |
| 2589 | // is vector-typed. Thus, to keep the representation compact, we only use |
| 2590 | // vector-typed operands for loop-varying values. |
| 2591 | |
| 2592 | bool AllOperandsAreInvariant = all_of(Range: operands(), P: [](VPValue *Op) { |
| 2593 | return Op->isDefinedOutsideLoopRegions(); |
| 2594 | }); |
| 2595 | if (AllOperandsAreInvariant) { |
| 2596 | // If we are vectorizing, but the GEP has only loop-invariant operands, |
| 2597 | // the GEP we build (by only using vector-typed operands for |
| 2598 | // loop-varying values) would be a scalar pointer. Thus, to ensure we |
| 2599 | // produce a vector of pointers, we need to either arbitrarily pick an |
| 2600 | // operand to broadcast, or broadcast a clone of the original GEP. |
| 2601 | // Here, we broadcast a clone of the original. |
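    // E.g. (illustrative) this emits a scalar getelementptr from the
    // first-lane values of all operands and then splats it to a vector of
    // pointers of width VF.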
| 2602 | |
| 2603 | SmallVector<Value *> Ops; |
| 2604 | for (unsigned I = 0, E = getNumOperands(); I != E; I++) |
| 2605 | Ops.push_back(Elt: State.get(Def: getOperand(N: I), Lane: VPLane(0))); |
| 2606 | |
| 2607 | auto *NewGEP = |
| 2608 | State.Builder.CreateGEP(Ty: getSourceElementType(), Ptr: Ops[0], IdxList: drop_begin(RangeOrContainer&: Ops), |
| 2609 | Name: "" , NW: getGEPNoWrapFlags()); |
| 2610 | Value *Splat = State.Builder.CreateVectorSplat(EC: State.VF, V: NewGEP); |
| 2611 | State.set(Def: this, V: Splat); |
| 2612 | return; |
| 2613 | } |
| 2614 | |
| 2615 | // If the GEP has at least one loop-varying operand, we are sure to |
| 2616 | // produce a vector of pointers unless VF is scalar. |
| 2617 | // The pointer operand of the new GEP. If it's loop-invariant, we |
| 2618 | // won't broadcast it. |
| 2619 | auto *Ptr = State.get(Def: getOperand(N: 0), IsScalar: isPointerLoopInvariant()); |
| 2620 | |
| 2621 | // Collect all the indices for the new GEP. If any index is |
| 2622 | // loop-invariant, we won't broadcast it. |
| 2623 | SmallVector<Value *, 4> Indices; |
| 2624 | for (unsigned I = 1, E = getNumOperands(); I < E; I++) { |
| 2625 | VPValue *Operand = getOperand(N: I); |
| 2626 | Indices.push_back(Elt: State.get(Def: Operand, IsScalar: isIndexLoopInvariant(I: I - 1))); |
| 2627 | } |
| 2628 | |
  // Create the new GEP. Note that this GEP may be a scalar if VF == 1, but it
  // should be a vector otherwise.
| 2631 | auto *NewGEP = State.Builder.CreateGEP(Ty: getSourceElementType(), Ptr, IdxList: Indices, |
| 2632 | Name: "" , NW: getGEPNoWrapFlags()); |
| 2633 | assert((State.VF.isScalar() || NewGEP->getType()->isVectorTy()) && |
| 2634 | "NewGEP is not a pointer vector" ); |
| 2635 | State.set(Def: this, V: NewGEP); |
| 2636 | } |
| 2637 | |
| 2638 | #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) |
| 2639 | void VPWidenGEPRecipe::printRecipe(raw_ostream &O, const Twine &Indent, |
| 2640 | VPSlotTracker &SlotTracker) const { |
| 2641 | O << Indent << "WIDEN-GEP " ; |
| 2642 | O << (isPointerLoopInvariant() ? "Inv" : "Var" ); |
| 2643 | for (size_t I = 0; I < getNumOperands() - 1; ++I) |
| 2644 | O << "[" << (isIndexLoopInvariant(I) ? "Inv" : "Var" ) << "]" ; |
| 2645 | |
| 2646 | O << " " ; |
| 2647 | printAsOperand(O, SlotTracker); |
| 2648 | O << " = getelementptr" ; |
| 2649 | printFlags(O); |
| 2650 | printOperands(O, SlotTracker); |
| 2651 | } |
| 2652 | #endif |
| 2653 | |
| 2654 | void VPVectorEndPointerRecipe::execute(VPTransformState &State) { |
| 2655 | auto &Builder = State.Builder; |
| 2656 | unsigned CurrentPart = getUnrollPart(U: *this); |
| 2657 | const DataLayout &DL = Builder.GetInsertBlock()->getDataLayout(); |
| 2658 | Type *IndexTy = DL.getIndexType(PtrTy: State.TypeAnalysis.inferScalarType(V: this)); |
| 2659 | |
| 2660 | // The wide store needs to start at the last vector element. |
| 2661 | Value *RunTimeVF = State.get(Def: getVFValue(), Lane: VPLane(0)); |
| 2662 | if (IndexTy != RunTimeVF->getType()) |
| 2663 | RunTimeVF = Builder.CreateZExtOrTrunc(V: RunTimeVF, DestTy: IndexTy); |
| 2664 | // NumElt = Stride * CurrentPart * RunTimeVF |
| 2665 | Value *NumElt = Builder.CreateMul( |
| 2666 | LHS: ConstantInt::getSigned(Ty: IndexTy, V: Stride * (int64_t)CurrentPart), |
| 2667 | RHS: RunTimeVF); |
| 2668 | // LastLane = Stride * (RunTimeVF - 1) |
| 2669 | Value *LastLane = Builder.CreateSub(LHS: RunTimeVF, RHS: ConstantInt::get(Ty: IndexTy, V: 1)); |
| 2670 | if (Stride != 1) |
| 2671 | LastLane = |
| 2672 | Builder.CreateMul(LHS: ConstantInt::getSigned(Ty: IndexTy, V: Stride), RHS: LastLane); |
| 2673 | Value *Ptr = State.get(Def: getOperand(N: 0), Lane: VPLane(0)); |
| 2674 | Value *ResultPtr = Builder.CreateGEP(Ty: getSourceElementType(), Ptr, IdxList: NumElt, Name: "" , |
| 2675 | NW: getGEPNoWrapFlags()); |
| 2676 | ResultPtr = Builder.CreateGEP(Ty: getSourceElementType(), Ptr: ResultPtr, IdxList: LastLane, Name: "" , |
| 2677 | NW: getGEPNoWrapFlags()); |
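  // Overall: ResultPtr = Ptr + Stride * CurrentPart * VF + Stride * (VF - 1),
  // in units of the source element type; for reverse (negative-stride)
  // accesses this is the lowest address touched by this unrolled part, where
  // the wide memory access starts.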
| 2678 | |
| 2679 | State.set(Def: this, V: ResultPtr, /*IsScalar*/ true); |
| 2680 | } |
| 2681 | |
| 2682 | #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) |
| 2683 | void VPVectorEndPointerRecipe::printRecipe(raw_ostream &O, const Twine &Indent, |
| 2684 | VPSlotTracker &SlotTracker) const { |
| 2685 | O << Indent; |
| 2686 | printAsOperand(O, SlotTracker); |
| 2687 | O << " = vector-end-pointer" ; |
| 2688 | printFlags(O); |
| 2689 | printOperands(O, SlotTracker); |
| 2690 | } |
| 2691 | #endif |
| 2692 | |
| 2693 | void VPVectorPointerRecipe::execute(VPTransformState &State) { |
| 2694 | auto &Builder = State.Builder; |
| 2695 | assert(getOffset() && |
| 2696 | "Expected prior simplification of recipe without offset" ); |
| 2697 | Value *Ptr = State.get(Def: getOperand(N: 0), Lane: VPLane(0)); |
| 2698 | Value *Offset = State.get(Def: getOffset(), IsScalar: true); |
| 2699 | Value *ResultPtr = Builder.CreateGEP(Ty: getSourceElementType(), Ptr, IdxList: Offset, Name: "" , |
| 2700 | NW: getGEPNoWrapFlags()); |
| 2701 | State.set(Def: this, V: ResultPtr, /*IsScalar*/ true); |
| 2702 | } |
| 2703 | |
| 2704 | #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) |
| 2705 | void VPVectorPointerRecipe::printRecipe(raw_ostream &O, const Twine &Indent, |
| 2706 | VPSlotTracker &SlotTracker) const { |
| 2707 | O << Indent; |
| 2708 | printAsOperand(O, SlotTracker); |
| 2709 | O << " = vector-pointer" ; |
| 2710 | printFlags(O); |
| 2711 | printOperands(O, SlotTracker); |
| 2712 | } |
| 2713 | #endif |
| 2714 | |
| 2715 | InstructionCost VPBlendRecipe::computeCost(ElementCount VF, |
| 2716 | VPCostContext &Ctx) const { |
| 2717 | // A blend will be expanded to a select VPInstruction, which will generate a |
| 2718 | // scalar select if only the first lane is used. |
| 2719 | if (vputils::onlyFirstLaneUsed(Def: this)) |
| 2720 | VF = ElementCount::getFixed(MinVal: 1); |
| 2721 | |
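  // A blend of N incoming values lowers to a chain of N - 1 selects, hence the
  // cost computed below.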
| 2722 | Type *ResultTy = toVectorTy(Scalar: Ctx.Types.inferScalarType(V: this), EC: VF); |
| 2723 | Type *CmpTy = toVectorTy(Scalar: Type::getInt1Ty(C&: Ctx.Types.getContext()), EC: VF); |
| 2724 | return (getNumIncomingValues() - 1) * |
| 2725 | Ctx.TTI.getCmpSelInstrCost(Opcode: Instruction::Select, ValTy: ResultTy, CondTy: CmpTy, |
| 2726 | VecPred: CmpInst::BAD_ICMP_PREDICATE, CostKind: Ctx.CostKind); |
| 2727 | } |
| 2728 | |
| 2729 | #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) |
| 2730 | void VPBlendRecipe::printRecipe(raw_ostream &O, const Twine &Indent, |
| 2731 | VPSlotTracker &SlotTracker) const { |
| 2732 | O << Indent << "BLEND " ; |
| 2733 | printAsOperand(O, SlotTracker); |
| 2734 | O << " =" ; |
| 2735 | if (getNumIncomingValues() == 1) { |
| 2736 | // Not a User of any mask: not really blending, this is a |
| 2737 | // single-predecessor phi. |
| 2738 | O << " " ; |
| 2739 | getIncomingValue(0)->printAsOperand(O, SlotTracker); |
| 2740 | } else { |
| 2741 | for (unsigned I = 0, E = getNumIncomingValues(); I < E; ++I) { |
| 2742 | O << " " ; |
| 2743 | getIncomingValue(I)->printAsOperand(O, SlotTracker); |
| 2744 | if (I == 0) |
| 2745 | continue; |
| 2746 | O << "/" ; |
| 2747 | getMask(I)->printAsOperand(O, SlotTracker); |
| 2748 | } |
| 2749 | } |
| 2750 | } |
| 2751 | #endif |
| 2752 | |
| 2753 | void VPReductionRecipe::execute(VPTransformState &State) { |
| 2754 | assert(!State.Lane && "Reduction being replicated." ); |
| 2755 | RecurKind Kind = getRecurrenceKind(); |
| 2756 | assert(!RecurrenceDescriptor::isAnyOfRecurrenceKind(Kind) && |
| 2757 | "In-loop AnyOf reductions aren't currently supported" ); |
| 2758 | // Propagate the fast-math flags carried by the underlying instruction. |
| 2759 | IRBuilderBase::FastMathFlagGuard FMFGuard(State.Builder); |
| 2760 | State.Builder.setFastMathFlags(getFastMathFlags()); |
| 2761 | Value *NewVecOp = State.get(Def: getVecOp()); |
| 2762 | if (VPValue *Cond = getCondOp()) { |
| 2763 | Value *NewCond = State.get(Def: Cond, IsScalar: State.VF.isScalar()); |
| 2764 | VectorType *VecTy = dyn_cast<VectorType>(Val: NewVecOp->getType()); |
| 2765 | Type *ElementTy = VecTy ? VecTy->getElementType() : NewVecOp->getType(); |
| 2766 | |
| 2767 | Value *Start = getRecurrenceIdentity(K: Kind, Tp: ElementTy, FMF: getFastMathFlags()); |
| 2768 | if (State.VF.isVector()) |
| 2769 | Start = State.Builder.CreateVectorSplat(EC: VecTy->getElementCount(), V: Start); |
| 2770 | |
| 2771 | Value *Select = State.Builder.CreateSelect(C: NewCond, True: NewVecOp, False: Start); |
| 2772 | NewVecOp = Select; |
| 2773 | } |
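  // Three in-loop forms are generated below: ordered (strict FP) reductions
  // chain a vector reduce onto the previous scalar value, partial reductions
  // emit a vector.partial.reduce.{add,fadd} intrinsic on the vector chain, and
  // unordered in-loop reductions reduce the vector and combine the result with
  // the previous value via a min/max or binary op.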
| 2774 | Value *NewRed; |
| 2775 | Value *NextInChain; |
| 2776 | if (isOrdered()) { |
| 2777 | Value *PrevInChain = State.get(Def: getChainOp(), /*IsScalar*/ true); |
| 2778 | if (State.VF.isVector()) |
| 2779 | NewRed = |
| 2780 | createOrderedReduction(B&: State.Builder, RdxKind: Kind, Src: NewVecOp, Start: PrevInChain); |
| 2781 | else |
| 2782 | NewRed = State.Builder.CreateBinOp( |
| 2783 | Opc: (Instruction::BinaryOps)RecurrenceDescriptor::getOpcode(Kind), |
| 2784 | LHS: PrevInChain, RHS: NewVecOp); |
| 2785 | PrevInChain = NewRed; |
| 2786 | NextInChain = NewRed; |
| 2787 | } else if (isPartialReduction()) { |
| 2788 | assert((Kind == RecurKind::Add || Kind == RecurKind::FAdd) && |
| 2789 | "Unexpected partial reduction kind" ); |
| 2790 | Value *PrevInChain = State.get(Def: getChainOp(), /*IsScalar*/ false); |
| 2791 | NewRed = State.Builder.CreateIntrinsic( |
| 2792 | RetTy: PrevInChain->getType(), |
| 2793 | ID: Kind == RecurKind::Add ? Intrinsic::vector_partial_reduce_add |
| 2794 | : Intrinsic::vector_partial_reduce_fadd, |
| 2795 | Args: {PrevInChain, NewVecOp}, FMFSource: State.Builder.getFastMathFlags(), |
| 2796 | Name: "partial.reduce" ); |
| 2797 | PrevInChain = NewRed; |
| 2798 | NextInChain = NewRed; |
| 2799 | } else { |
| 2800 | assert(isInLoop() && |
| 2801 | "The reduction must either be ordered, partial or in-loop" ); |
| 2802 | Value *PrevInChain = State.get(Def: getChainOp(), /*IsScalar*/ true); |
| 2803 | NewRed = createSimpleReduction(B&: State.Builder, Src: NewVecOp, RdxKind: Kind); |
| 2804 | if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) |
| 2805 | NextInChain = createMinMaxOp(Builder&: State.Builder, RK: Kind, Left: NewRed, Right: PrevInChain); |
| 2806 | else |
| 2807 | NextInChain = State.Builder.CreateBinOp( |
| 2808 | Opc: (Instruction::BinaryOps)RecurrenceDescriptor::getOpcode(Kind), |
| 2809 | LHS: PrevInChain, RHS: NewRed); |
| 2810 | } |
| 2811 | State.set(Def: this, V: NextInChain, /*IsScalar*/ !isPartialReduction()); |
| 2812 | } |
| 2813 | |
| 2814 | void VPReductionEVLRecipe::execute(VPTransformState &State) { |
| 2815 | assert(!State.Lane && "Reduction being replicated." ); |
| 2816 | |
| 2817 | auto &Builder = State.Builder; |
| 2818 | // Propagate the fast-math flags carried by the underlying instruction. |
| 2819 | IRBuilderBase::FastMathFlagGuard FMFGuard(Builder); |
| 2820 | Builder.setFastMathFlags(getFastMathFlags()); |
| 2821 | |
| 2822 | RecurKind Kind = getRecurrenceKind(); |
| 2823 | Value *Prev = State.get(Def: getChainOp(), /*IsScalar*/ true); |
| 2824 | Value *VecOp = State.get(Def: getVecOp()); |
| 2825 | Value *EVL = State.get(Def: getEVL(), Lane: VPLane(0)); |
| 2826 | |
| 2827 | Value *Mask; |
| 2828 | if (VPValue *CondOp = getCondOp()) |
| 2829 | Mask = State.get(Def: CondOp); |
| 2830 | else |
| 2831 | Mask = Builder.CreateVectorSplat(EC: State.VF, V: Builder.getTrue()); |
| 2832 | |
| 2833 | Value *NewRed; |
| 2834 | if (isOrdered()) { |
| 2835 | NewRed = createOrderedReduction(B&: Builder, RdxKind: Kind, Src: VecOp, Start: Prev, Mask, EVL); |
| 2836 | } else { |
| 2837 | NewRed = createSimpleReduction(B&: Builder, Src: VecOp, RdxKind: Kind, Mask, EVL); |
| 2838 | if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) |
| 2839 | NewRed = createMinMaxOp(Builder, RK: Kind, Left: NewRed, Right: Prev); |
| 2840 | else |
| 2841 | NewRed = Builder.CreateBinOp( |
| 2842 | Opc: (Instruction::BinaryOps)RecurrenceDescriptor::getOpcode(Kind), LHS: NewRed, |
| 2843 | RHS: Prev); |
| 2844 | } |
| 2845 | State.set(Def: this, V: NewRed, /*IsScalar*/ true); |
| 2846 | } |
| 2847 | |
| 2848 | InstructionCost VPReductionRecipe::computeCost(ElementCount VF, |
| 2849 | VPCostContext &Ctx) const { |
| 2850 | RecurKind RdxKind = getRecurrenceKind(); |
| 2851 | Type *ElementTy = Ctx.Types.inferScalarType(V: this); |
| 2852 | auto *VectorTy = cast<VectorType>(Val: toVectorTy(Scalar: ElementTy, EC: VF)); |
| 2853 | unsigned Opcode = RecurrenceDescriptor::getOpcode(Kind: RdxKind); |
| 2854 | FastMathFlags FMFs = getFastMathFlags(); |
| 2855 | std::optional<FastMathFlags> OptionalFMF = |
| 2856 | ElementTy->isFloatingPointTy() ? std::make_optional(t&: FMFs) : std::nullopt; |
| 2857 | |
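  // Partial reductions are costed via the partial-reduction TTI hook; a
  // conditional partial reduction additionally pays for the select on the
  // vector input.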
| 2858 | if (isPartialReduction()) { |
| 2859 | InstructionCost CondCost = 0; |
| 2860 | if (isConditional()) { |
| 2861 | CmpInst::Predicate Pred = CmpInst::BAD_ICMP_PREDICATE; |
| 2862 | auto *CondTy = cast<VectorType>( |
| 2863 | Val: toVectorTy(Scalar: Ctx.Types.inferScalarType(V: getCondOp()), EC: VF)); |
| 2864 | CondCost = Ctx.TTI.getCmpSelInstrCost(Opcode: Instruction::Select, ValTy: VectorTy, |
| 2865 | CondTy, VecPred: Pred, CostKind: Ctx.CostKind); |
| 2866 | } |
| 2867 | return CondCost + Ctx.TTI.getPartialReductionCost( |
| 2868 | Opcode, InputTypeA: ElementTy, InputTypeB: ElementTy, AccumType: ElementTy, VF, |
| 2869 | OpAExtend: TTI::PR_None, OpBExtend: TTI::PR_None, BinOp: {}, CostKind: Ctx.CostKind, |
| 2870 | FMF: OptionalFMF); |
| 2871 | } |
| 2872 | |
| 2873 | // TODO: Support any-of reductions. |
| 2874 | assert( |
| 2875 | (!RecurrenceDescriptor::isAnyOfRecurrenceKind(RdxKind) || |
| 2876 | ForceTargetInstructionCost.getNumOccurrences() > 0) && |
| 2877 | "Any-of reduction not implemented in VPlan-based cost model currently." ); |
| 2878 | |
  // Note that TTI should model the cost of moving the result to a scalar
  // register and the BinOp cost in getMinMaxReductionCost().
| 2881 | if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind: RdxKind)) { |
| 2882 | Intrinsic::ID Id = getMinMaxReductionIntrinsicOp(RK: RdxKind); |
| 2883 | return Ctx.TTI.getMinMaxReductionCost(IID: Id, Ty: VectorTy, FMF: FMFs, CostKind: Ctx.CostKind); |
| 2884 | } |
| 2885 | |
  // Note that TTI should model the cost of moving the result to a scalar
  // register and the BinOp cost in getArithmeticReductionCost().
| 2888 | return Ctx.TTI.getArithmeticReductionCost(Opcode, Ty: VectorTy, FMF: OptionalFMF, |
| 2889 | CostKind: Ctx.CostKind); |
| 2890 | } |
| 2891 | |
| 2892 | VPExpressionRecipe::VPExpressionRecipe( |
| 2893 | ExpressionTypes ExpressionType, |
| 2894 | ArrayRef<VPSingleDefRecipe *> ExpressionRecipes) |
| 2895 | : VPSingleDefRecipe(VPRecipeBase::VPExpressionSC, {}, {}), |
| 2896 | ExpressionRecipes(ExpressionRecipes), ExpressionType(ExpressionType) { |
| 2897 | assert(!ExpressionRecipes.empty() && "Nothing to combine?" ); |
| 2898 | assert( |
| 2899 | none_of(ExpressionRecipes, |
| 2900 | [](VPSingleDefRecipe *R) { return R->mayHaveSideEffects(); }) && |
| 2901 | "expression cannot contain recipes with side-effects" ); |
| 2902 | |
| 2903 | // Maintain a copy of the expression recipes as a set of users. |
| 2904 | SmallPtrSet<VPUser *, 4> ExpressionRecipesAsSetOfUsers; |
| 2905 | for (auto *R : ExpressionRecipes) |
| 2906 | ExpressionRecipesAsSetOfUsers.insert(Ptr: R); |
| 2907 | |
| 2908 | // Recipes in the expression, except the last one, must only be used by |
| 2909 | // (other) recipes inside the expression. If there are other users, external |
| 2910 | // to the expression, use a clone of the recipe for external users. |
| 2911 | for (VPSingleDefRecipe *R : reverse(C&: ExpressionRecipes)) { |
| 2912 | if (R != ExpressionRecipes.back() && |
| 2913 | any_of(Range: R->users(), P: [&ExpressionRecipesAsSetOfUsers](VPUser *U) { |
| 2914 | return !ExpressionRecipesAsSetOfUsers.contains(Ptr: U); |
| 2915 | })) { |
      // There are users outside of the expression. Clone the recipe and use
      // the clone for those external users.
| 2918 | VPSingleDefRecipe *CopyForExtUsers = R->clone(); |
| 2919 | R->replaceUsesWithIf(New: CopyForExtUsers, ShouldReplace: [&ExpressionRecipesAsSetOfUsers]( |
| 2920 | VPUser &U, unsigned) { |
| 2921 | return !ExpressionRecipesAsSetOfUsers.contains(Ptr: &U); |
| 2922 | }); |
| 2923 | CopyForExtUsers->insertBefore(InsertPos: R); |
| 2924 | } |
| 2925 | if (R->getParent()) |
| 2926 | R->removeFromParent(); |
| 2927 | } |
| 2928 | |
| 2929 | // Internalize all external operands to the expression recipes. To do so, |
| 2930 | // create new temporary VPValues for all operands defined by a recipe outside |
| 2931 | // the expression. The original operands are added as operands of the |
| 2932 | // VPExpressionRecipe itself. |
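  // E.g. (illustrative) for a bundled extend + reduce, the extend's source
  // defined outside the bundle becomes an operand of the VPExpressionRecipe,
  // and its uses inside the bundle are rerouted to a placeholder
  // VPSymbolicValue.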
| 2933 | for (auto *R : ExpressionRecipes) { |
| 2934 | for (const auto &[Idx, Op] : enumerate(First: R->operands())) { |
| 2935 | auto *Def = Op->getDefiningRecipe(); |
| 2936 | if (Def && ExpressionRecipesAsSetOfUsers.contains(Ptr: Def)) |
| 2937 | continue; |
| 2938 | addOperand(Operand: Op); |
| 2939 | LiveInPlaceholders.push_back(Elt: new VPSymbolicValue()); |
| 2940 | } |
| 2941 | } |
| 2942 | |
| 2943 | // Replace each external operand with the first one created for it in |
| 2944 | // LiveInPlaceholders. |
| 2945 | for (auto *R : ExpressionRecipes) |
| 2946 | for (auto const &[LiveIn, Tmp] : zip(t: operands(), u&: LiveInPlaceholders)) |
| 2947 | R->replaceUsesOfWith(From: LiveIn, To: Tmp); |
| 2948 | } |
| 2949 | |
| 2950 | void VPExpressionRecipe::decompose() { |
| 2951 | for (auto *R : ExpressionRecipes) |
| 2952 | // Since the list could contain duplicates, make sure the recipe hasn't |
| 2953 | // already been inserted. |
| 2954 | if (!R->getParent()) |
| 2955 | R->insertBefore(InsertPos: this); |
| 2956 | |
| 2957 | for (const auto &[Idx, Op] : enumerate(First: operands())) |
| 2958 | LiveInPlaceholders[Idx]->replaceAllUsesWith(New: Op); |
| 2959 | |
| 2960 | replaceAllUsesWith(New: ExpressionRecipes.back()); |
| 2961 | ExpressionRecipes.clear(); |
| 2962 | } |
| 2963 | |
| 2964 | InstructionCost VPExpressionRecipe::computeCost(ElementCount VF, |
| 2965 | VPCostContext &Ctx) const { |
| 2966 | Type *RedTy = Ctx.Types.inferScalarType(V: this); |
| 2967 | auto *SrcVecTy = cast<VectorType>( |
| 2968 | Val: toVectorTy(Scalar: Ctx.Types.inferScalarType(V: getOperand(N: 0)), EC: VF)); |
| 2969 | unsigned Opcode = RecurrenceDescriptor::getOpcode( |
| 2970 | Kind: cast<VPReductionRecipe>(Val: ExpressionRecipes.back())->getRecurrenceKind()); |
| 2971 | switch (ExpressionType) { |
| 2972 | case ExpressionTypes::ExtendedReduction: { |
| 2973 | unsigned Opcode = RecurrenceDescriptor::getOpcode( |
| 2974 | Kind: cast<VPReductionRecipe>(Val: ExpressionRecipes[1])->getRecurrenceKind()); |
| 2975 | auto *ExtR = cast<VPWidenCastRecipe>(Val: ExpressionRecipes[0]); |
| 2976 | auto *RedR = cast<VPReductionRecipe>(Val: ExpressionRecipes.back()); |
| 2977 | |
| 2978 | if (RedR->isPartialReduction()) |
| 2979 | return Ctx.TTI.getPartialReductionCost( |
| 2980 | Opcode, InputTypeA: Ctx.Types.inferScalarType(V: getOperand(N: 0)), InputTypeB: nullptr, AccumType: RedTy, VF, |
| 2981 | OpAExtend: TargetTransformInfo::getPartialReductionExtendKind(CastOpc: ExtR->getOpcode()), |
| 2982 | OpBExtend: TargetTransformInfo::PR_None, BinOp: std::nullopt, CostKind: Ctx.CostKind, |
| 2983 | FMF: RedTy->isFloatingPointTy() ? std::optional{RedR->getFastMathFlags()} |
| 2984 | : std::nullopt); |
| 2985 | else if (!RedTy->isFloatingPointTy()) |
| 2986 | // TTI::getExtendedReductionCost only supports integer types. |
| 2987 | return Ctx.TTI.getExtendedReductionCost( |
| 2988 | Opcode, IsUnsigned: ExtR->getOpcode() == Instruction::ZExt, ResTy: RedTy, Ty: SrcVecTy, |
| 2989 | FMF: std::nullopt, CostKind: Ctx.CostKind); |
| 2990 | else |
| 2991 | return InstructionCost::getInvalid(); |
| 2992 | } |
| 2993 | case ExpressionTypes::MulAccReduction: |
| 2994 | return Ctx.TTI.getMulAccReductionCost(IsUnsigned: false, RedOpcode: Opcode, ResTy: RedTy, Ty: SrcVecTy, |
| 2995 | CostKind: Ctx.CostKind); |
| 2996 | |
| 2997 | case ExpressionTypes::ExtNegatedMulAccReduction: |
| 2998 | assert(Opcode == Instruction::Add && "Unexpected opcode" ); |
| 2999 | Opcode = Instruction::Sub; |
| 3000 | [[fallthrough]]; |
| 3001 | case ExpressionTypes::ExtMulAccReduction: { |
| 3002 | auto *RedR = cast<VPReductionRecipe>(Val: ExpressionRecipes.back()); |
| 3003 | if (RedR->isPartialReduction()) { |
| 3004 | auto *Ext0R = cast<VPWidenCastRecipe>(Val: ExpressionRecipes[0]); |
| 3005 | auto *Ext1R = cast<VPWidenCastRecipe>(Val: ExpressionRecipes[1]); |
| 3006 | auto *Mul = cast<VPWidenRecipe>(Val: ExpressionRecipes[2]); |
| 3007 | return Ctx.TTI.getPartialReductionCost( |
| 3008 | Opcode, InputTypeA: Ctx.Types.inferScalarType(V: getOperand(N: 0)), |
| 3009 | InputTypeB: Ctx.Types.inferScalarType(V: getOperand(N: 1)), AccumType: RedTy, VF, |
| 3010 | OpAExtend: TargetTransformInfo::getPartialReductionExtendKind( |
| 3011 | CastOpc: Ext0R->getOpcode()), |
| 3012 | OpBExtend: TargetTransformInfo::getPartialReductionExtendKind( |
| 3013 | CastOpc: Ext1R->getOpcode()), |
| 3014 | BinOp: Mul->getOpcode(), CostKind: Ctx.CostKind, |
| 3015 | FMF: RedTy->isFloatingPointTy() ? std::optional{RedR->getFastMathFlags()} |
| 3016 | : std::nullopt); |
| 3017 | } |
| 3018 | return Ctx.TTI.getMulAccReductionCost( |
| 3019 | IsUnsigned: cast<VPWidenCastRecipe>(Val: ExpressionRecipes.front())->getOpcode() == |
| 3020 | Instruction::ZExt, |
| 3021 | RedOpcode: Opcode, ResTy: RedTy, Ty: SrcVecTy, CostKind: Ctx.CostKind); |
| 3022 | } |
| 3023 | } |
| 3024 | llvm_unreachable("Unknown VPExpressionRecipe::ExpressionTypes enum" ); |
| 3025 | } |
| 3026 | |
| 3027 | bool VPExpressionRecipe::mayReadOrWriteMemory() const { |
| 3028 | return any_of(Range: ExpressionRecipes, P: [](VPSingleDefRecipe *R) { |
| 3029 | return R->mayReadFromMemory() || R->mayWriteToMemory(); |
| 3030 | }); |
| 3031 | } |
| 3032 | |
| 3033 | bool VPExpressionRecipe::mayHaveSideEffects() const { |
| 3034 | assert( |
| 3035 | none_of(ExpressionRecipes, |
| 3036 | [](VPSingleDefRecipe *R) { return R->mayHaveSideEffects(); }) && |
| 3037 | "expression cannot contain recipes with side-effects" ); |
| 3038 | return false; |
| 3039 | } |
| 3040 | |
| 3041 | bool VPExpressionRecipe::isSingleScalar() const { |
| 3042 | // Cannot use vputils::isSingleScalar(), because all external operands |
| 3043 | // of the expression will be live-ins while bundled. |
| 3044 | auto *RR = dyn_cast<VPReductionRecipe>(Val: ExpressionRecipes.back()); |
| 3045 | return RR && !RR->isPartialReduction(); |
| 3046 | } |
| 3047 | |
| 3048 | #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) |
| 3049 | |
| 3050 | void VPExpressionRecipe::printRecipe(raw_ostream &O, const Twine &Indent, |
| 3051 | VPSlotTracker &SlotTracker) const { |
| 3052 | O << Indent << "EXPRESSION " ; |
| 3053 | printAsOperand(O, SlotTracker); |
| 3054 | O << " = " ; |
| 3055 | auto *Red = cast<VPReductionRecipe>(ExpressionRecipes.back()); |
| 3056 | unsigned Opcode = RecurrenceDescriptor::getOpcode(Red->getRecurrenceKind()); |
| 3057 | |
| 3058 | switch (ExpressionType) { |
| 3059 | case ExpressionTypes::ExtendedReduction: { |
| 3060 | getOperand(1)->printAsOperand(O, SlotTracker); |
| 3061 | O << " + " << (Red->isPartialReduction() ? "partial." : "" ) << "reduce." ; |
| 3062 | O << Instruction::getOpcodeName(Opcode) << " (" ; |
| 3063 | getOperand(0)->printAsOperand(O, SlotTracker); |
| 3064 | Red->printFlags(O); |
| 3065 | |
| 3066 | auto *Ext0 = cast<VPWidenCastRecipe>(ExpressionRecipes[0]); |
| 3067 | O << Instruction::getOpcodeName(Ext0->getOpcode()) << " to " |
| 3068 | << *Ext0->getResultType(); |
| 3069 | if (Red->isConditional()) { |
| 3070 | O << ", " ; |
| 3071 | Red->getCondOp()->printAsOperand(O, SlotTracker); |
| 3072 | } |
| 3073 | O << ")" ; |
| 3074 | break; |
| 3075 | } |
| 3076 | case ExpressionTypes::ExtNegatedMulAccReduction: { |
| 3077 | getOperand(getNumOperands() - 1)->printAsOperand(O, SlotTracker); |
| 3078 | O << " + " << (Red->isPartialReduction() ? "partial." : "" ) << "reduce." ; |
| 3079 | O << Instruction::getOpcodeName( |
| 3080 | RecurrenceDescriptor::getOpcode(Red->getRecurrenceKind())) |
| 3081 | << " (sub (0, mul" ; |
| 3082 | auto *Mul = cast<VPWidenRecipe>(ExpressionRecipes[2]); |
| 3083 | Mul->printFlags(O); |
| 3084 | O << "(" ; |
| 3085 | getOperand(0)->printAsOperand(O, SlotTracker); |
| 3086 | auto *Ext0 = cast<VPWidenCastRecipe>(ExpressionRecipes[0]); |
| 3087 | O << " " << Instruction::getOpcodeName(Ext0->getOpcode()) << " to " |
| 3088 | << *Ext0->getResultType() << "), (" ; |
| 3089 | getOperand(1)->printAsOperand(O, SlotTracker); |
| 3090 | auto *Ext1 = cast<VPWidenCastRecipe>(ExpressionRecipes[1]); |
| 3091 | O << " " << Instruction::getOpcodeName(Ext1->getOpcode()) << " to " |
| 3092 | << *Ext1->getResultType() << ")" ; |
| 3093 | if (Red->isConditional()) { |
| 3094 | O << ", " ; |
| 3095 | Red->getCondOp()->printAsOperand(O, SlotTracker); |
| 3096 | } |
| 3097 | O << "))" ; |
| 3098 | break; |
| 3099 | } |
| 3100 | case ExpressionTypes::MulAccReduction: |
| 3101 | case ExpressionTypes::ExtMulAccReduction: { |
| 3102 | getOperand(getNumOperands() - 1)->printAsOperand(O, SlotTracker); |
| 3103 | O << " + " << (Red->isPartialReduction() ? "partial." : "" ) << "reduce." ; |
| 3104 | O << Instruction::getOpcodeName( |
| 3105 | RecurrenceDescriptor::getOpcode(Red->getRecurrenceKind())) |
| 3106 | << " (" ; |
| 3107 | O << "mul" ; |
| 3108 | bool IsExtended = ExpressionType == ExpressionTypes::ExtMulAccReduction; |
| 3109 | auto *Mul = cast<VPWidenRecipe>(IsExtended ? ExpressionRecipes[2] |
| 3110 | : ExpressionRecipes[0]); |
| 3111 | Mul->printFlags(O); |
| 3112 | if (IsExtended) |
| 3113 | O << "(" ; |
| 3114 | getOperand(0)->printAsOperand(O, SlotTracker); |
| 3115 | if (IsExtended) { |
| 3116 | auto *Ext0 = cast<VPWidenCastRecipe>(ExpressionRecipes[0]); |
| 3117 | O << " " << Instruction::getOpcodeName(Ext0->getOpcode()) << " to " |
| 3118 | << *Ext0->getResultType() << "), (" ; |
| 3119 | } else { |
| 3120 | O << ", " ; |
| 3121 | } |
| 3122 | getOperand(1)->printAsOperand(O, SlotTracker); |
| 3123 | if (IsExtended) { |
| 3124 | auto *Ext1 = cast<VPWidenCastRecipe>(ExpressionRecipes[1]); |
| 3125 | O << " " << Instruction::getOpcodeName(Ext1->getOpcode()) << " to " |
| 3126 | << *Ext1->getResultType() << ")" ; |
| 3127 | } |
| 3128 | if (Red->isConditional()) { |
| 3129 | O << ", " ; |
| 3130 | Red->getCondOp()->printAsOperand(O, SlotTracker); |
| 3131 | } |
| 3132 | O << ")" ; |
| 3133 | break; |
| 3134 | } |
| 3135 | } |
| 3136 | } |
| 3137 | |
| 3138 | void VPReductionRecipe::printRecipe(raw_ostream &O, const Twine &Indent, |
| 3139 | VPSlotTracker &SlotTracker) const { |
| 3140 | if (isPartialReduction()) |
| 3141 | O << Indent << "PARTIAL-REDUCE " ; |
| 3142 | else |
| 3143 | O << Indent << "REDUCE " ; |
| 3144 | printAsOperand(O, SlotTracker); |
| 3145 | O << " = " ; |
| 3146 | getChainOp()->printAsOperand(O, SlotTracker); |
| 3147 | O << " +" ; |
| 3148 | printFlags(O); |
| 3149 | O << " reduce." |
| 3150 | << Instruction::getOpcodeName( |
| 3151 | RecurrenceDescriptor::getOpcode(getRecurrenceKind())) |
| 3152 | << " (" ; |
| 3153 | getVecOp()->printAsOperand(O, SlotTracker); |
| 3154 | if (isConditional()) { |
| 3155 | O << ", " ; |
| 3156 | getCondOp()->printAsOperand(O, SlotTracker); |
| 3157 | } |
| 3158 | O << ")" ; |
| 3159 | } |
| 3160 | |
| 3161 | void VPReductionEVLRecipe::printRecipe(raw_ostream &O, const Twine &Indent, |
| 3162 | VPSlotTracker &SlotTracker) const { |
| 3163 | O << Indent << "REDUCE " ; |
| 3164 | printAsOperand(O, SlotTracker); |
| 3165 | O << " = " ; |
| 3166 | getChainOp()->printAsOperand(O, SlotTracker); |
| 3167 | O << " +" ; |
| 3168 | printFlags(O); |
| 3169 | O << " vp.reduce." |
| 3170 | << Instruction::getOpcodeName( |
| 3171 | RecurrenceDescriptor::getOpcode(getRecurrenceKind())) |
| 3172 | << " (" ; |
| 3173 | getVecOp()->printAsOperand(O, SlotTracker); |
| 3174 | O << ", " ; |
| 3175 | getEVL()->printAsOperand(O, SlotTracker); |
| 3176 | if (isConditional()) { |
| 3177 | O << ", " ; |
| 3178 | getCondOp()->printAsOperand(O, SlotTracker); |
| 3179 | } |
| 3180 | O << ")" ; |
| 3181 | } |
| 3182 | |
| 3183 | #endif |
| 3184 | |
| 3185 | /// A helper function to scalarize a single Instruction in the innermost loop. |
| 3186 | /// Generates a sequence of scalar instances for lane \p Lane. Uses the VPValue |
| 3187 | /// operands from \p RepRecipe instead of \p Instr's operands. |
| 3188 | static void scalarizeInstruction(const Instruction *Instr, |
| 3189 | VPReplicateRecipe *RepRecipe, |
| 3190 | const VPLane &Lane, VPTransformState &State) { |
| 3191 | assert((!Instr->getType()->isAggregateType() || |
| 3192 | canVectorizeTy(Instr->getType())) && |
| 3193 | "Expected vectorizable or non-aggregate type." ); |
| 3194 | |
  // Does this instruction return a value?
| 3196 | bool IsVoidRetTy = Instr->getType()->isVoidTy(); |
| 3197 | |
| 3198 | Instruction *Cloned = Instr->clone(); |
| 3199 | if (!IsVoidRetTy) { |
| 3200 | Cloned->setName(Instr->getName() + ".cloned" ); |
| 3201 | Type *ResultTy = State.TypeAnalysis.inferScalarType(V: RepRecipe); |
| 3202 | // The operands of the replicate recipe may have been narrowed, resulting in |
| 3203 | // a narrower result type. Update the type of the cloned instruction to the |
| 3204 | // correct type. |
| 3205 | if (ResultTy != Cloned->getType()) |
| 3206 | Cloned->mutateType(Ty: ResultTy); |
| 3207 | } |
| 3208 | |
| 3209 | RepRecipe->applyFlags(I&: *Cloned); |
| 3210 | RepRecipe->applyMetadata(I&: *Cloned); |
| 3211 | |
| 3212 | if (RepRecipe->hasPredicate()) |
| 3213 | cast<CmpInst>(Val: Cloned)->setPredicate(RepRecipe->getPredicate()); |
| 3214 | |
| 3215 | if (auto DL = RepRecipe->getDebugLoc()) |
| 3216 | State.setDebugLocFrom(DL); |
| 3217 | |
| 3218 | // Replace the operands of the cloned instructions with their scalar |
| 3219 | // equivalents in the new loop. |
| 3220 | for (const auto &I : enumerate(First: RepRecipe->operands())) { |
| 3221 | auto InputLane = Lane; |
| 3222 | VPValue *Operand = I.value(); |
| 3223 | if (vputils::isSingleScalar(VPV: Operand)) |
| 3224 | InputLane = VPLane::getFirstLane(); |
| 3225 | Cloned->setOperand(i: I.index(), Val: State.get(Def: Operand, Lane: InputLane)); |
| 3226 | } |
| 3227 | |
| 3228 | // Place the cloned scalar in the new loop. |
| 3229 | State.Builder.Insert(I: Cloned); |
| 3230 | |
| 3231 | State.set(Def: RepRecipe, V: Cloned, Lane); |
| 3232 | |
  // If we just cloned a new assumption, add it to the assumption cache.
| 3234 | if (auto *II = dyn_cast<AssumeInst>(Val: Cloned)) |
| 3235 | State.AC->registerAssumption(CI: II); |
| 3236 | |
| 3237 | assert( |
| 3238 | (RepRecipe->getRegion() || |
| 3239 | !RepRecipe->getParent()->getPlan()->getVectorLoopRegion() || |
| 3240 | all_of(RepRecipe->operands(), |
| 3241 | [](VPValue *Op) { return Op->isDefinedOutsideLoopRegions(); })) && |
| 3242 | "Expected a recipe is either within a region or all of its operands " |
| 3243 | "are defined outside the vectorized region." ); |
| 3244 | } |
| 3245 | |
| 3246 | void VPReplicateRecipe::execute(VPTransformState &State) { |
| 3247 | Instruction *UI = getUnderlyingInstr(); |
| 3248 | |
| 3249 | if (!State.Lane) { |
| 3250 | assert(IsSingleScalar && "VPReplicateRecipes outside replicate regions " |
| 3251 | "must have already been unrolled" ); |
| 3252 | scalarizeInstruction(Instr: UI, RepRecipe: this, Lane: VPLane(0), State); |
| 3253 | return; |
| 3254 | } |
| 3255 | |
| 3256 | assert((State.VF.isScalar() || !isSingleScalar()) && |
| 3257 | "uniform recipe shouldn't be predicated" ); |
| 3258 | assert(!State.VF.isScalable() && "Can't scalarize a scalable vector" ); |
| 3259 | scalarizeInstruction(Instr: UI, RepRecipe: this, Lane: *State.Lane, State); |
| 3260 | // Insert scalar instance packing it into a vector. |
| 3261 | if (State.VF.isVector() && shouldPack()) { |
| 3262 | Value *WideValue = |
| 3263 | State.Lane->isFirstLane() |
| 3264 | ? PoisonValue::get(T: toVectorizedTy(Ty: UI->getType(), EC: State.VF)) |
| 3265 | : State.get(Def: this); |
| 3266 | State.set(Def: this, V: State.packScalarIntoVectorizedValue(Def: this, WideValue, |
| 3267 | Lane: *State.Lane)); |
| 3268 | } |
| 3269 | } |
| 3270 | |
| 3271 | bool VPReplicateRecipe::shouldPack() const { |
| 3272 | // Find if the recipe is used by a widened recipe via an intervening |
| 3273 | // VPPredInstPHIRecipe. In this case, also pack the scalar values in a vector. |
| 3274 | return any_of(Range: users(), P: [](const VPUser *U) { |
| 3275 | if (auto *PredR = dyn_cast<VPPredInstPHIRecipe>(Val: U)) |
| 3276 | return !vputils::onlyScalarValuesUsed(Def: PredR); |
| 3277 | return false; |
| 3278 | }); |
| 3279 | } |
| 3280 | |
| 3281 | /// Returns a SCEV expression for \p Ptr if it is a pointer computation for |
| 3282 | /// which the legacy cost model computes a SCEV expression when computing the |
/// address cost. Computing SCEVs for VPValues is incomplete and returns
/// SCEVCouldNotCompute in cases where the legacy cost model can compute SCEVs;
/// in those cases we fall back to the legacy cost model. Otherwise return
/// nullptr.
| 3286 | static const SCEV *getAddressAccessSCEV(const VPValue *Ptr, |
| 3287 | PredicatedScalarEvolution &PSE, |
| 3288 | const Loop *L) { |
| 3289 | const SCEV *Addr = vputils::getSCEVExprForVPValue(V: Ptr, PSE, L); |
| 3290 | if (isa<SCEVCouldNotCompute>(Val: Addr)) |
| 3291 | return Addr; |
| 3292 | |
| 3293 | return vputils::isAddressSCEVForCost(Addr, SE&: *PSE.getSE(), L) ? Addr : nullptr; |
| 3294 | } |
| 3295 | |
| 3296 | /// Returns true if \p V is used as part of the address of another load or |
| 3297 | /// store. |
| 3298 | static bool isUsedByLoadStoreAddress(const VPUser *V) { |
| 3299 | SmallPtrSet<const VPUser *, 4> Seen; |
| 3300 | SmallVector<const VPUser *> WorkList = {V}; |
| 3301 | |
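  // Walk V and its transitive users, checking whether any visited value is
  // consumed as the address of a load, store, or interleave group.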
| 3302 | while (!WorkList.empty()) { |
| 3303 | auto *Cur = dyn_cast<VPSingleDefRecipe>(Val: WorkList.pop_back_val()); |
| 3304 | if (!Cur || !Seen.insert(Ptr: Cur).second) |
| 3305 | continue; |
| 3306 | |
| 3307 | auto *Blend = dyn_cast<VPBlendRecipe>(Val: Cur); |
| 3308 | // Skip blends that use V only through a compare by checking if any incoming |
| 3309 | // value was already visited. |
| 3310 | if (Blend && none_of(Range: seq<unsigned>(Begin: 0, End: Blend->getNumIncomingValues()), |
| 3311 | P: [&](unsigned I) { |
| 3312 | return Seen.contains( |
| 3313 | Ptr: Blend->getIncomingValue(Idx: I)->getDefiningRecipe()); |
| 3314 | })) |
| 3315 | continue; |
| 3316 | |
| 3317 | for (VPUser *U : Cur->users()) { |
| 3318 | if (auto *InterleaveR = dyn_cast<VPInterleaveBase>(Val: U)) |
| 3319 | if (InterleaveR->getAddr() == Cur) |
| 3320 | return true; |
| 3321 | if (auto *RepR = dyn_cast<VPReplicateRecipe>(Val: U)) { |
| 3322 | if (RepR->getOpcode() == Instruction::Load && |
| 3323 | RepR->getOperand(N: 0) == Cur) |
| 3324 | return true; |
| 3325 | if (RepR->getOpcode() == Instruction::Store && |
| 3326 | RepR->getOperand(N: 1) == Cur) |
| 3327 | return true; |
| 3328 | } |
| 3329 | if (auto *MemR = dyn_cast<VPWidenMemoryRecipe>(Val: U)) { |
| 3330 | if (MemR->getAddr() == Cur && MemR->isConsecutive()) |
| 3331 | return true; |
| 3332 | } |
| 3333 | } |
| 3334 | |
    // The legacy cost model only supports scalarized loads/stores with phi
    // addresses if the phi is directly used as the load/store address. Don't
    // traverse further through blends.
| 3338 | if (Blend) |
| 3339 | continue; |
| 3340 | |
| 3341 | append_range(C&: WorkList, R: Cur->users()); |
| 3342 | } |
| 3343 | return false; |
| 3344 | } |
| 3345 | |
| 3346 | InstructionCost VPReplicateRecipe::computeCost(ElementCount VF, |
| 3347 | VPCostContext &Ctx) const { |
| 3348 | Instruction *UI = cast<Instruction>(Val: getUnderlyingValue()); |
| 3349 | // VPReplicateRecipe may be cloned as part of an existing VPlan-to-VPlan |
| 3350 | // transform, avoid computing their cost multiple times for now. |
| 3351 | Ctx.SkipCostComputation.insert(Ptr: UI); |
| 3352 | |
| 3353 | if (VF.isScalable() && !isSingleScalar()) |
| 3354 | return InstructionCost::getInvalid(); |
| 3355 | |
| 3356 | switch (UI->getOpcode()) { |
| 3357 | case Instruction::Alloca: |
| 3358 | if (VF.isScalable()) |
| 3359 | return InstructionCost::getInvalid(); |
| 3360 | return Ctx.TTI.getArithmeticInstrCost( |
| 3361 | Opcode: Instruction::Mul, Ty: Ctx.Types.inferScalarType(V: this), CostKind: Ctx.CostKind); |
| 3362 | case Instruction::GetElementPtr: |
| 3363 | // We mark this instruction as zero-cost because the cost of GEPs in |
| 3364 | // vectorized code depends on whether the corresponding memory instruction |
| 3365 | // is scalarized or not. Therefore, we handle GEPs with the memory |
| 3366 | // instruction cost. |
| 3367 | return 0; |
| 3368 | case Instruction::Call: { |
| 3369 | auto *CalledFn = |
| 3370 | cast<Function>(Val: getOperand(N: getNumOperands() - 1)->getLiveInIRValue()); |
| 3371 | |
| 3372 | SmallVector<const VPValue *> ArgOps(drop_end(RangeOrContainer: operands())); |
| 3373 | SmallVector<Type *, 4> Tys; |
| 3374 | for (const VPValue *ArgOp : ArgOps) |
| 3375 | Tys.push_back(Elt: Ctx.Types.inferScalarType(V: ArgOp)); |
| 3376 | |
| 3377 | if (CalledFn->isIntrinsic()) |
| 3378 | // Various pseudo-intrinsics with costs of 0 are scalarized instead of |
| 3379 | // vectorized via VPWidenIntrinsicRecipe. Return 0 for them early. |
| 3380 | switch (CalledFn->getIntrinsicID()) { |
| 3381 | case Intrinsic::assume: |
| 3382 | case Intrinsic::lifetime_end: |
| 3383 | case Intrinsic::lifetime_start: |
| 3384 | case Intrinsic::sideeffect: |
| 3385 | case Intrinsic::pseudoprobe: |
| 3386 | case Intrinsic::experimental_noalias_scope_decl: { |
| 3387 | assert(getCostForIntrinsics(CalledFn->getIntrinsicID(), ArgOps, *this, |
| 3388 | ElementCount::getFixed(1), Ctx) == 0 && |
| 3389 | "scalarizing intrinsic should be free" ); |
| 3390 | return InstructionCost(0); |
| 3391 | } |
| 3392 | default: |
| 3393 | break; |
| 3394 | } |
| 3395 | |
| 3396 | Type *ResultTy = Ctx.Types.inferScalarType(V: this); |
| 3397 | InstructionCost ScalarCallCost = |
| 3398 | Ctx.TTI.getCallInstrCost(F: CalledFn, RetTy: ResultTy, Tys, CostKind: Ctx.CostKind); |
| 3399 | if (isSingleScalar()) { |
| 3400 | if (CalledFn->isIntrinsic()) |
| 3401 | ScalarCallCost = std::min( |
| 3402 | a: ScalarCallCost, |
| 3403 | b: getCostForIntrinsics(ID: CalledFn->getIntrinsicID(), Operands: ArgOps, R: *this, |
| 3404 | VF: ElementCount::getFixed(MinVal: 1), Ctx)); |
| 3405 | return ScalarCallCost; |
| 3406 | } |
| 3407 | |
| 3408 | return ScalarCallCost * VF.getFixedValue() + |
| 3409 | Ctx.getScalarizationOverhead(ResultTy, Operands: ArgOps, VF); |
| 3410 | } |
| 3411 | case Instruction::Add: |
| 3412 | case Instruction::Sub: |
| 3413 | case Instruction::FAdd: |
| 3414 | case Instruction::FSub: |
| 3415 | case Instruction::Mul: |
| 3416 | case Instruction::FMul: |
| 3417 | case Instruction::FDiv: |
| 3418 | case Instruction::FRem: |
| 3419 | case Instruction::Shl: |
| 3420 | case Instruction::LShr: |
| 3421 | case Instruction::AShr: |
| 3422 | case Instruction::And: |
| 3423 | case Instruction::Or: |
| 3424 | case Instruction::Xor: |
| 3425 | case Instruction::ICmp: |
| 3426 | case Instruction::FCmp: |
| 3427 | return getCostForRecipeWithOpcode(Opcode: getOpcode(), VF: ElementCount::getFixed(MinVal: 1), |
| 3428 | Ctx) * |
| 3429 | (isSingleScalar() ? 1 : VF.getFixedValue()); |
| 3430 | case Instruction::SDiv: |
| 3431 | case Instruction::UDiv: |
| 3432 | case Instruction::SRem: |
| 3433 | case Instruction::URem: { |
| 3434 | InstructionCost ScalarCost = |
| 3435 | getCostForRecipeWithOpcode(Opcode: getOpcode(), VF: ElementCount::getFixed(MinVal: 1), Ctx); |
| 3436 | if (isSingleScalar()) |
| 3437 | return ScalarCost; |
| 3438 | |
    // If any of the operands comes from a different replicate region and has
    // its cost skipped, it may have been forced to scalar. Fall back to the
    // legacy cost model to avoid a cost mismatch.
| 3442 | if (any_of(Range: operands(), P: [&Ctx, VF](VPValue *Op) { |
| 3443 | auto *PredR = dyn_cast<VPPredInstPHIRecipe>(Val: Op); |
| 3444 | if (!PredR) |
| 3445 | return false; |
| 3446 | return Ctx.skipCostComputation( |
| 3447 | UI: dyn_cast_or_null<Instruction>( |
| 3448 | Val: PredR->getOperand(N: 0)->getUnderlyingValue()), |
| 3449 | IsVector: VF.isVector()); |
| 3450 | })) |
| 3451 | break; |
| 3452 | |
| 3453 | ScalarCost = ScalarCost * VF.getFixedValue() + |
| 3454 | Ctx.getScalarizationOverhead(ResultTy: Ctx.Types.inferScalarType(V: this), |
| 3455 | Operands: to_vector(Range: operands()), VF); |
| 3456 | // If the recipe is not predicated (i.e. not in a replicate region), return |
| 3457 | // the scalar cost. Otherwise handle predicated cost. |
| 3458 | if (!getRegion()->isReplicator()) |
| 3459 | return ScalarCost; |
| 3460 | |
| 3461 | // Account for the phi nodes that we will create. |
| 3462 | ScalarCost += VF.getFixedValue() * |
| 3463 | Ctx.TTI.getCFInstrCost(Opcode: Instruction::PHI, CostKind: Ctx.CostKind); |
| 3464 | // Scale the cost by the probability of executing the predicated blocks. |
| 3465 | // This assumes the predicated block for each vector lane is equally |
| 3466 | // likely. |
| 3467 | ScalarCost /= Ctx.getPredBlockCostDivisor(BB: UI->getParent()); |
| 3468 | return ScalarCost; |
| 3469 | } |
| 3470 | case Instruction::Load: |
| 3471 | case Instruction::Store: { |
| 3472 | // TODO: See getMemInstScalarizationCost for how to handle replicating and |
| 3473 | // predicated cases. |
| 3474 | const VPRegionBlock *ParentRegion = getRegion(); |
| 3475 | if (ParentRegion && ParentRegion->isReplicator()) |
| 3476 | break; |
| 3477 | |
| 3478 | bool IsLoad = UI->getOpcode() == Instruction::Load; |
| 3479 | const VPValue *PtrOp = getOperand(N: !IsLoad); |
| 3480 | const SCEV *PtrSCEV = getAddressAccessSCEV(Ptr: PtrOp, PSE&: Ctx.PSE, L: Ctx.L); |
| 3481 | if (isa_and_nonnull<SCEVCouldNotCompute>(Val: PtrSCEV)) |
| 3482 | break; |
| 3483 | |
| 3484 | Type *ValTy = Ctx.Types.inferScalarType(V: IsLoad ? this : getOperand(N: 0)); |
| 3485 | Type *ScalarPtrTy = Ctx.Types.inferScalarType(V: PtrOp); |
| 3486 | const Align Alignment = getLoadStoreAlignment(I: UI); |
| 3487 | unsigned AS = cast<PointerType>(Val: ScalarPtrTy)->getAddressSpace(); |
| 3488 | TTI::OperandValueInfo OpInfo = TTI::getOperandInfo(V: UI->getOperand(i: 0)); |
| 3489 | InstructionCost ScalarMemOpCost = Ctx.TTI.getMemoryOpCost( |
| 3490 | Opcode: UI->getOpcode(), Src: ValTy, Alignment, AddressSpace: AS, CostKind: Ctx.CostKind, OpdInfo: OpInfo); |
| 3491 | |
| 3492 | Type *PtrTy = isSingleScalar() ? ScalarPtrTy : toVectorTy(Scalar: ScalarPtrTy, EC: VF); |
| 3493 | bool PreferVectorizedAddressing = Ctx.TTI.prefersVectorizedAddressing(); |
| 3494 | bool UsedByLoadStoreAddress = |
| 3495 | !PreferVectorizedAddressing && isUsedByLoadStoreAddress(V: this); |
| 3496 | InstructionCost ScalarCost = |
| 3497 | ScalarMemOpCost + |
| 3498 | Ctx.TTI.getAddressComputationCost( |
| 3499 | PtrTy, SE: UsedByLoadStoreAddress ? nullptr : Ctx.PSE.getSE(), Ptr: PtrSCEV, |
| 3500 | CostKind: Ctx.CostKind); |
| 3501 | if (isSingleScalar()) |
| 3502 | return ScalarCost; |
| 3503 | |
| 3504 | SmallVector<const VPValue *> OpsToScalarize; |
| 3505 | Type *ResultTy = Type::getVoidTy(C&: PtrTy->getContext()); |
    // Set ResultTy and OpsToScalarize if scalarization is needed. Currently we
    // don't assign scalarization overhead in general if the target prefers
    // vectorized addressing or the loaded value is used as part of the address
    // of another load or store.
| 3510 | if (!UsedByLoadStoreAddress) { |
| 3511 | bool EfficientVectorLoadStore = |
| 3512 | Ctx.TTI.supportsEfficientVectorElementLoadStore(); |
| 3513 | if (!(IsLoad && !PreferVectorizedAddressing) && |
| 3514 | !(!IsLoad && EfficientVectorLoadStore)) |
| 3515 | append_range(C&: OpsToScalarize, R: operands()); |
| 3516 | |
| 3517 | if (!EfficientVectorLoadStore) |
| 3518 | ResultTy = Ctx.Types.inferScalarType(V: this); |
| 3519 | } |
| 3520 | |
| 3521 | TTI::VectorInstrContext VIC = |
| 3522 | IsLoad ? TTI::VectorInstrContext::Load : TTI::VectorInstrContext::Store; |
| 3523 | return (ScalarCost * VF.getFixedValue()) + |
| 3524 | Ctx.getScalarizationOverhead(ResultTy, Operands: OpsToScalarize, VF, VIC, |
| 3525 | AlwaysIncludeReplicatingR: true); |
| 3526 | } |
| 3527 | case Instruction::SExt: |
| 3528 | case Instruction::ZExt: |
| 3529 | case Instruction::FPToUI: |
| 3530 | case Instruction::FPToSI: |
| 3531 | case Instruction::FPExt: |
| 3532 | case Instruction::PtrToInt: |
| 3533 | case Instruction::PtrToAddr: |
| 3534 | case Instruction::IntToPtr: |
| 3535 | case Instruction::SIToFP: |
| 3536 | case Instruction::UIToFP: |
| 3537 | case Instruction::Trunc: |
| 3538 | case Instruction::FPTrunc: |
| 3539 | case Instruction::AddrSpaceCast: { |
| 3540 | return getCostForRecipeWithOpcode(Opcode: getOpcode(), VF: ElementCount::getFixed(MinVal: 1), |
| 3541 | Ctx) * |
| 3542 | (isSingleScalar() ? 1 : VF.getFixedValue()); |
| 3543 | } |
| 3544 | case Instruction::ExtractValue: |
| 3545 | case Instruction::InsertValue: |
| 3546 | return Ctx.TTI.getInsertExtractValueCost(Opcode: getOpcode(), CostKind: Ctx.CostKind); |
| 3547 | } |
| 3548 | |
| 3549 | return Ctx.getLegacyCost(UI, VF); |
| 3550 | } |
| 3551 | |
| 3552 | #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) |
| 3553 | void VPReplicateRecipe::printRecipe(raw_ostream &O, const Twine &Indent, |
| 3554 | VPSlotTracker &SlotTracker) const { |
| 3555 | O << Indent << (IsSingleScalar ? "CLONE " : "REPLICATE " ); |
| 3556 | |
| 3557 | if (!getUnderlyingInstr()->getType()->isVoidTy()) { |
| 3558 | printAsOperand(O, SlotTracker); |
| 3559 | O << " = " ; |
| 3560 | } |
| 3561 | if (auto *CB = dyn_cast<CallBase>(getUnderlyingInstr())) { |
| 3562 | O << "call" ; |
| 3563 | printFlags(O); |
| 3564 | O << "@" << CB->getCalledFunction()->getName() << "(" ; |
| 3565 | interleaveComma(make_range(op_begin(), op_begin() + (getNumOperands() - 1)), |
| 3566 | O, [&O, &SlotTracker](VPValue *Op) { |
| 3567 | Op->printAsOperand(O, SlotTracker); |
| 3568 | }); |
| 3569 | O << ")" ; |
| 3570 | } else { |
| 3571 | O << Instruction::getOpcodeName(getUnderlyingInstr()->getOpcode()); |
| 3572 | printFlags(O); |
| 3573 | printOperands(O, SlotTracker); |
| 3574 | } |
| 3575 | |
| 3576 | if (shouldPack()) |
| 3577 | O << " (S->V)" ; |
| 3578 | } |
| 3579 | #endif |
| 3580 | |
| 3581 | void VPBranchOnMaskRecipe::execute(VPTransformState &State) { |
| 3582 | assert(State.Lane && "Branch on Mask works only on single instance." ); |
| 3583 | |
| 3584 | VPValue *BlockInMask = getOperand(N: 0); |
| 3585 | Value *ConditionBit = State.get(Def: BlockInMask, Lane: *State.Lane); |
| 3586 | |
| 3587 | // Replace the temporary unreachable terminator with a new conditional branch, |
| 3588 | // whose two destinations will be set later when they are created. |
| 3589 | auto *CurrentTerminator = State.CFG.PrevBB->getTerminator(); |
| 3590 | assert(isa<UnreachableInst>(CurrentTerminator) && |
| 3591 | "Expected to replace unreachable terminator with conditional branch." ); |
| 3592 | auto CondBr = |
| 3593 | State.Builder.CreateCondBr(Cond: ConditionBit, True: State.CFG.PrevBB, False: nullptr); |
| 3594 | CondBr->setSuccessor(idx: 0, NewSucc: nullptr); |
| 3595 | CurrentTerminator->eraseFromParent(); |
| 3596 | } |
| 3597 | |
| 3598 | InstructionCost VPBranchOnMaskRecipe::computeCost(ElementCount VF, |
| 3599 | VPCostContext &Ctx) const { |
| 3600 | // The legacy cost model doesn't assign costs to branches for individual |
| 3601 | // replicate regions. Match the current behavior in the VPlan cost model for |
| 3602 | // now. |
| 3603 | return 0; |
| 3604 | } |
| 3605 | |
| 3606 | void VPPredInstPHIRecipe::execute(VPTransformState &State) { |
| 3607 | assert(State.Lane && "Predicated instruction PHI works per instance." ); |
| 3608 | Instruction *ScalarPredInst = |
| 3609 | cast<Instruction>(Val: State.get(Def: getOperand(N: 0), Lane: *State.Lane)); |
| 3610 | BasicBlock *PredicatedBB = ScalarPredInst->getParent(); |
| 3611 | BasicBlock *PredicatingBB = PredicatedBB->getSinglePredecessor(); |
| 3612 | assert(PredicatingBB && "Predicated block has no single predecessor." ); |
| 3613 | assert(isa<VPReplicateRecipe>(getOperand(0)) && |
| 3614 | "operand must be VPReplicateRecipe" ); |
| 3615 | |
| 3616 | // By current pack/unpack logic we need to generate only a single phi node: if |
| 3617 | // a vector value for the predicated instruction exists at this point it means |
| 3618 | // the instruction has vector users only, and a phi for the vector value is |
| 3619 | // needed. In this case the recipe of the predicated instruction is marked to |
| 3620 | // also do that packing, thereby "hoisting" the insert-element sequence. |
| 3621 | // Otherwise, a phi node for the scalar value is needed. |
| 3622 | if (State.hasVectorValue(Def: getOperand(N: 0))) { |
| 3623 | auto *VecI = cast<Instruction>(Val: State.get(Def: getOperand(N: 0))); |
| 3624 | assert((isa<InsertElementInst, InsertValueInst>(VecI)) && |
| 3625 | "Packed operands must generate an insertelement or insertvalue" ); |
| 3626 | |
    // If VecI is a struct, it will be a sequence like:
    //   %1 = insertvalue %unmodified, %x, 0
    //   %2 = insertvalue %1, %y, 1
    //   %VecI = insertvalue %2, %z, 2
    // To get the unmodified vector we need to look through the chain.
| 3632 | if (auto *StructTy = dyn_cast<StructType>(Val: VecI->getType())) |
| 3633 | for (unsigned I = 0; I < StructTy->getNumContainedTypes() - 1; I++) |
| 3634 | VecI = cast<InsertValueInst>(Val: VecI->getOperand(i: 0)); |
| 3635 | |
| 3636 | PHINode *VPhi = State.Builder.CreatePHI(Ty: VecI->getType(), NumReservedValues: 2); |
| 3637 | VPhi->addIncoming(V: VecI->getOperand(i: 0), BB: PredicatingBB); // Unmodified vector. |
| 3638 | VPhi->addIncoming(V: VecI, BB: PredicatedBB); // New vector with inserted element. |
| 3639 | if (State.hasVectorValue(Def: this)) |
| 3640 | State.reset(Def: this, V: VPhi); |
| 3641 | else |
| 3642 | State.set(Def: this, V: VPhi); |
| 3643 | // NOTE: Currently we need to update the value of the operand, so the next |
| 3644 | // predicated iteration inserts its generated value in the correct vector. |
| 3645 | State.reset(Def: getOperand(N: 0), V: VPhi); |
| 3646 | } else { |
| 3647 | if (vputils::onlyFirstLaneUsed(Def: this) && !State.Lane->isFirstLane()) |
| 3648 | return; |
| 3649 | |
| 3650 | Type *PredInstType = State.TypeAnalysis.inferScalarType(V: getOperand(N: 0)); |
| 3651 | PHINode *Phi = State.Builder.CreatePHI(Ty: PredInstType, NumReservedValues: 2); |
| 3652 | Phi->addIncoming(V: PoisonValue::get(T: ScalarPredInst->getType()), |
| 3653 | BB: PredicatingBB); |
| 3654 | Phi->addIncoming(V: ScalarPredInst, BB: PredicatedBB); |
| 3655 | if (State.hasScalarValue(Def: this, Lane: *State.Lane)) |
| 3656 | State.reset(Def: this, V: Phi, Lane: *State.Lane); |
| 3657 | else |
| 3658 | State.set(Def: this, V: Phi, Lane: *State.Lane); |
| 3659 | // NOTE: Currently we need to update the value of the operand, so the next |
| 3660 | // predicated iteration inserts its generated value in the correct vector. |
| 3661 | State.reset(Def: getOperand(N: 0), V: Phi, Lane: *State.Lane); |
| 3662 | } |
| 3663 | } |
| 3664 | |
| 3665 | #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) |
| 3666 | void VPPredInstPHIRecipe::printRecipe(raw_ostream &O, const Twine &Indent, |
| 3667 | VPSlotTracker &SlotTracker) const { |
| 3668 | O << Indent << "PHI-PREDICATED-INSTRUCTION " ; |
| 3669 | printAsOperand(O, SlotTracker); |
| 3670 | O << " = " ; |
| 3671 | printOperands(O, SlotTracker); |
| 3672 | } |
| 3673 | #endif |
| 3674 | |
| 3675 | InstructionCost VPWidenMemoryRecipe::computeCost(ElementCount VF, |
| 3676 | VPCostContext &Ctx) const { |
| 3677 | Type *Ty = toVectorTy(Scalar: getLoadStoreType(I: &Ingredient), EC: VF); |
| 3678 | unsigned AS = cast<PointerType>(Val: Ctx.Types.inferScalarType(V: getAddr())) |
| 3679 | ->getAddressSpace(); |
| 3680 | unsigned Opcode = isa<VPWidenLoadRecipe, VPWidenLoadEVLRecipe>(Val: this) |
| 3681 | ? Instruction::Load |
| 3682 | : Instruction::Store; |
| 3683 | |
| 3684 | if (!Consecutive) { |
| 3685 | // TODO: Using the original IR may not be accurate. |
| 3686 | // Currently, ARM will use the underlying IR to calculate gather/scatter |
| 3687 | // instruction cost. |
| 3688 | assert(!Reverse && |
| 3689 | "Inconsecutive memory access should not have the order." ); |
| 3690 | |
| 3691 | const Value *Ptr = getLoadStorePointerOperand(V: &Ingredient); |
| 3692 | Type *PtrTy = Ptr->getType(); |
| 3693 | |
    // If the address value is uniform across all lanes, then the address can
    // be calculated with a scalar type and broadcast.
| 3696 | if (!vputils::isSingleScalar(VPV: getAddr())) |
| 3697 | PtrTy = toVectorTy(Scalar: PtrTy, EC: VF); |
| 3698 | |
| 3699 | unsigned IID = isa<VPWidenLoadRecipe>(Val: this) ? Intrinsic::masked_gather |
| 3700 | : isa<VPWidenStoreRecipe>(Val: this) ? Intrinsic::masked_scatter |
| 3701 | : isa<VPWidenLoadEVLRecipe>(Val: this) ? Intrinsic::vp_gather |
| 3702 | : Intrinsic::vp_scatter; |
| 3703 | return Ctx.TTI.getAddressComputationCost(PtrTy, SE: nullptr, Ptr: nullptr, |
| 3704 | CostKind: Ctx.CostKind) + |
| 3705 | Ctx.TTI.getMemIntrinsicInstrCost( |
| 3706 | MICA: MemIntrinsicCostAttributes(IID, Ty, Ptr, IsMasked, Alignment, |
| 3707 | &Ingredient), |
| 3708 | CostKind: Ctx.CostKind); |
| 3709 | } |
| 3710 | |
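  // Consecutive accesses: masked accesses are costed as masked load/store
  // intrinsics, unmasked ones as a plain wide memory operation.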
| 3711 | InstructionCost Cost = 0; |
| 3712 | if (IsMasked) { |
| 3713 | unsigned IID = isa<VPWidenLoadRecipe>(Val: this) ? Intrinsic::masked_load |
| 3714 | : Intrinsic::masked_store; |
| 3715 | Cost += Ctx.TTI.getMemIntrinsicInstrCost( |
| 3716 | MICA: MemIntrinsicCostAttributes(IID, Ty, Alignment, AS), CostKind: Ctx.CostKind); |
| 3717 | } else { |
| 3718 | TTI::OperandValueInfo OpInfo = Ctx.getOperandInfo( |
| 3719 | V: isa<VPWidenLoadRecipe, VPWidenLoadEVLRecipe>(Val: this) ? getOperand(N: 0) |
| 3720 | : getOperand(N: 1)); |
| 3721 | Cost += Ctx.TTI.getMemoryOpCost(Opcode, Src: Ty, Alignment, AddressSpace: AS, CostKind: Ctx.CostKind, |
| 3722 | OpdInfo: OpInfo, I: &Ingredient); |
| 3723 | } |
| 3724 | return Cost; |
| 3725 | } |
| 3726 | |
| 3727 | void VPWidenLoadRecipe::execute(VPTransformState &State) { |
| 3728 | Type *ScalarDataTy = getLoadStoreType(I: &Ingredient); |
| 3729 | auto *DataTy = VectorType::get(ElementType: ScalarDataTy, EC: State.VF); |
| 3730 | bool CreateGather = !isConsecutive(); |
| 3731 | |
| 3732 | auto &Builder = State.Builder; |
| 3733 | Value *Mask = nullptr; |
| 3734 | if (auto *VPMask = getMask()) { |
| 3735 | // Mask reversal is only needed for non-null masks; a null mask represents an |
| 3736 | // all-one mask, and the reverse of an all-one mask is itself. |
| 3737 | Mask = State.get(Def: VPMask); |
| 3738 | if (isReverse()) |
| 3739 | Mask = Builder.CreateVectorReverse(V: Mask, Name: "reverse" ); |
| 3740 | } |
| 3741 | |
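|  | // A consecutive load uses a single scalar pointer; a gather needs a vector |
|  | // of pointers, so request the vector value in that case. |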
| 3742 | Value *Addr = State.get(Def: getAddr(), /*IsScalar*/ !CreateGather); |
| 3743 | Value *NewLI; |
| 3744 | if (CreateGather) { |
| 3745 | NewLI = Builder.CreateMaskedGather(Ty: DataTy, Ptrs: Addr, Alignment, Mask, PassThru: nullptr, |
| 3746 | Name: "wide.masked.gather" ); |
| 3747 | } else if (Mask) { |
| 3748 | NewLI = |
| 3749 | Builder.CreateMaskedLoad(Ty: DataTy, Ptr: Addr, Alignment, Mask, |
| 3750 | PassThru: PoisonValue::get(T: DataTy), Name: "wide.masked.load" ); |
| 3751 | } else { |
| 3752 | NewLI = Builder.CreateAlignedLoad(Ty: DataTy, Ptr: Addr, Align: Alignment, Name: "wide.load" ); |
| 3753 | } |
| 3754 | applyMetadata(I&: *cast<Instruction>(Val: NewLI)); |
| 3755 | State.set(Def: this, V: NewLI); |
| 3756 | } |
| 3757 | |
| 3758 | #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) |
| 3759 | void VPWidenLoadRecipe::printRecipe(raw_ostream &O, const Twine &Indent, |
| 3760 | VPSlotTracker &SlotTracker) const { |
| 3761 | O << Indent << "WIDEN " ; |
| 3762 | printAsOperand(O, SlotTracker); |
| 3763 | O << " = load " ; |
| 3764 | printOperands(O, SlotTracker); |
| 3765 | } |
| 3766 | #endif |
| 3767 | |
| 3768 | /// Use an all-true mask for the reverse rather than the actual mask, as this |
| 3769 | /// avoids a dependence without affecting the result. |
| 3770 | static Instruction *createReverseEVL(IRBuilderBase &Builder, Value *Operand, |
| 3771 | Value *EVL, const Twine &Name) { |
| 3772 | VectorType *ValTy = cast<VectorType>(Val: Operand->getType()); |
| 3773 | Value *AllTrueMask = |
| 3774 | Builder.CreateVectorSplat(EC: ValTy->getElementCount(), V: Builder.getTrue()); |
| 3775 | return Builder.CreateIntrinsic(RetTy: ValTy, ID: Intrinsic::experimental_vp_reverse, |
| 3776 | Args: {Operand, AllTrueMask, EVL}, FMFSource: nullptr, Name); |
| 3777 | } |
| 3778 | |
| 3779 | void VPWidenLoadEVLRecipe::execute(VPTransformState &State) { |
| 3780 | Type *ScalarDataTy = getLoadStoreType(I: &Ingredient); |
| 3781 | auto *DataTy = VectorType::get(ElementType: ScalarDataTy, EC: State.VF); |
| 3782 | bool CreateGather = !isConsecutive(); |
| 3783 | |
| 3784 | auto &Builder = State.Builder; |
| 3785 | CallInst *NewLI; |
| 3786 | Value *EVL = State.get(Def: getEVL(), Lane: VPLane(0)); |
| 3787 | Value *Addr = State.get(Def: getAddr(), IsScalar: !CreateGather); |
| 3788 | Value *Mask = nullptr; |
| 3789 | if (VPValue *VPMask = getMask()) { |
| 3790 | Mask = State.get(Def: VPMask); |
| 3791 | if (isReverse()) |
| 3792 | Mask = createReverseEVL(Builder, Operand: Mask, EVL, Name: "vp.reverse.mask" ); |
| 3793 | } else { |
| 3794 | Mask = Builder.CreateVectorSplat(EC: State.VF, V: Builder.getTrue()); |
| 3795 | } |
| 3796 | |
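|  | // vp.gather and vp.load take the mask and the explicit vector length (EVL) |
|  | // as operands instead of relying on separate masked variants. |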
| 3797 | if (CreateGather) { |
| 3798 | NewLI = |
| 3799 | Builder.CreateIntrinsic(RetTy: DataTy, ID: Intrinsic::vp_gather, Args: {Addr, Mask, EVL}, |
| 3800 | FMFSource: nullptr, Name: "wide.masked.gather" ); |
| 3801 | } else { |
| 3802 | NewLI = Builder.CreateIntrinsic(RetTy: DataTy, ID: Intrinsic::vp_load, |
| 3803 | Args: {Addr, Mask, EVL}, FMFSource: nullptr, Name: "vp.op.load" ); |
| 3804 | } |
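|  | // The vp.* intrinsics do not encode alignment in the operation itself, so |
|  | // attach it as a parameter attribute on the pointer argument. |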
| 3805 | NewLI->addParamAttr( |
| 3806 | ArgNo: 0, Attr: Attribute::getWithAlignment(Context&: NewLI->getContext(), Alignment)); |
| 3807 | applyMetadata(I&: *NewLI); |
| 3808 | Instruction *Res = NewLI; |
| 3809 | State.set(Def: this, V: Res); |
| 3810 | } |
| 3811 | |
| 3812 | InstructionCost VPWidenLoadEVLRecipe::computeCost(ElementCount VF, |
| 3813 | VPCostContext &Ctx) const { |
| 3814 | if (!Consecutive || IsMasked) |
| 3815 | return VPWidenMemoryRecipe::computeCost(VF, Ctx); |
| 3816 | |
| 3817 | // We need to use getMemIntrinsicInstrCost() instead of getMemoryOpCost() here |
| 3818 | // because the EVL recipes use the EVL to replace the tail mask, while the |
| 3819 | // legacy model always accounts for the cost of the mask. |
| 3820 | // TODO: Use getMemoryOpCost() instead of getMemIntrinsicInstrCost() once we no |
| 3821 | // longer need to compare against the legacy cost model. |
| 3822 | Type *Ty = toVectorTy(Scalar: getLoadStoreType(I: &Ingredient), EC: VF); |
| 3823 | unsigned AS = cast<PointerType>(Val: Ctx.Types.inferScalarType(V: getAddr())) |
| 3824 | ->getAddressSpace(); |
| 3825 | return Ctx.TTI.getMemIntrinsicInstrCost( |
| 3826 | MICA: MemIntrinsicCostAttributes(Intrinsic::vp_load, Ty, Alignment, AS), |
| 3827 | CostKind: Ctx.CostKind); |
| 3828 | } |
| 3829 | |
| 3830 | #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) |
| 3831 | void VPWidenLoadEVLRecipe::printRecipe(raw_ostream &O, const Twine &Indent, |
| 3832 | VPSlotTracker &SlotTracker) const { |
| 3833 | O << Indent << "WIDEN " ; |
| 3834 | printAsOperand(O, SlotTracker); |
| 3835 | O << " = vp.load " ; |
| 3836 | printOperands(O, SlotTracker); |
| 3837 | } |
| 3838 | #endif |
| 3839 | |
| 3840 | void VPWidenStoreRecipe::execute(VPTransformState &State) { |
| 3841 | VPValue *StoredVPValue = getStoredValue(); |
| 3842 | bool CreateScatter = !isConsecutive(); |
| 3843 | |
| 3844 | auto &Builder = State.Builder; |
| 3845 | |
| 3846 | Value *Mask = nullptr; |
| 3847 | if (auto *VPMask = getMask()) { |
| 3848 | // Mask reversal is only needed for non-null masks; a null mask represents an |
| 3849 | // all-one mask, and the reverse of an all-one mask is itself. |
| 3850 | Mask = State.get(Def: VPMask); |
| 3851 | if (isReverse()) |
| 3852 | Mask = Builder.CreateVectorReverse(V: Mask, Name: "reverse" ); |
| 3853 | } |
| 3854 | |
| 3855 | Value *StoredVal = State.get(Def: StoredVPValue); |
| 3856 | Value *Addr = State.get(Def: getAddr(), /*IsScalar*/ !CreateScatter); |
| 3857 | Instruction *NewSI = nullptr; |
| 3858 | if (CreateScatter) |
| 3859 | NewSI = Builder.CreateMaskedScatter(Val: StoredVal, Ptrs: Addr, Alignment, Mask); |
| 3860 | else if (Mask) |
| 3861 | NewSI = Builder.CreateMaskedStore(Val: StoredVal, Ptr: Addr, Alignment, Mask); |
| 3862 | else |
| 3863 | NewSI = Builder.CreateAlignedStore(Val: StoredVal, Ptr: Addr, Align: Alignment); |
| 3864 | applyMetadata(I&: *NewSI); |
| 3865 | } |
| 3866 | |
| 3867 | #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) |
| 3868 | void VPWidenStoreRecipe::printRecipe(raw_ostream &O, const Twine &Indent, |
| 3869 | VPSlotTracker &SlotTracker) const { |
| 3870 | O << Indent << "WIDEN store " ; |
| 3871 | printOperands(O, SlotTracker); |
| 3872 | } |
| 3873 | #endif |
| 3874 | |
| 3875 | void VPWidenStoreEVLRecipe::execute(VPTransformState &State) { |
| 3876 | VPValue *StoredValue = getStoredValue(); |
| 3877 | bool CreateScatter = !isConsecutive(); |
| 3878 | |
| 3879 | auto &Builder = State.Builder; |
| 3880 | |
| 3881 | CallInst *NewSI = nullptr; |
| 3882 | Value *StoredVal = State.get(Def: StoredValue); |
| 3883 | Value *EVL = State.get(Def: getEVL(), Lane: VPLane(0)); |
| 3884 | Value *Mask = nullptr; |
| 3885 | if (VPValue *VPMask = getMask()) { |
| 3886 | Mask = State.get(Def: VPMask); |
| 3887 | if (isReverse()) |
| 3888 | Mask = createReverseEVL(Builder, Operand: Mask, EVL, Name: "vp.reverse.mask" ); |
| 3889 | } else { |
| 3890 | Mask = Builder.CreateVectorSplat(EC: State.VF, V: Builder.getTrue()); |
| 3891 | } |
| 3892 | Value *Addr = State.get(Def: getAddr(), IsScalar: !CreateScatter); |
| 3893 | if (CreateScatter) { |
| 3894 | NewSI = Builder.CreateIntrinsic(RetTy: Type::getVoidTy(C&: EVL->getContext()), |
| 3895 | ID: Intrinsic::vp_scatter, |
| 3896 | Args: {StoredVal, Addr, Mask, EVL}); |
| 3897 | } else { |
| 3898 | NewSI = Builder.CreateIntrinsic(RetTy: Type::getVoidTy(C&: EVL->getContext()), |
| 3899 | ID: Intrinsic::vp_store, |
| 3900 | Args: {StoredVal, Addr, Mask, EVL}); |
| 3901 | } |
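|  | // As for the EVL load, convey the alignment via a parameter attribute on the |
|  | // pointer operand (operand 1 of vp.store/vp.scatter). |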
| 3902 | NewSI->addParamAttr( |
| 3903 | ArgNo: 1, Attr: Attribute::getWithAlignment(Context&: NewSI->getContext(), Alignment)); |
| 3904 | applyMetadata(I&: *NewSI); |
| 3905 | } |
| 3906 | |
| 3907 | InstructionCost VPWidenStoreEVLRecipe::computeCost(ElementCount VF, |
| 3908 | VPCostContext &Ctx) const { |
| 3909 | if (!Consecutive || IsMasked) |
| 3910 | return VPWidenMemoryRecipe::computeCost(VF, Ctx); |
| 3911 | |
| 3912 | // We need to use getMemIntrinsicInstrCost() instead of getMemoryOpCost() here |
| 3913 | // because the EVL recipes use the EVL to replace the tail mask, while the |
| 3914 | // legacy model always accounts for the cost of the mask. |
| 3915 | // TODO: Use getMemoryOpCost() instead of getMemIntrinsicInstrCost() once we no |
| 3916 | // longer need to compare against the legacy cost model. |
| 3917 | Type *Ty = toVectorTy(Scalar: getLoadStoreType(I: &Ingredient), EC: VF); |
| 3918 | unsigned AS = cast<PointerType>(Val: Ctx.Types.inferScalarType(V: getAddr())) |
| 3919 | ->getAddressSpace(); |
| 3920 | return Ctx.TTI.getMemIntrinsicInstrCost( |
| 3921 | MICA: MemIntrinsicCostAttributes(Intrinsic::vp_store, Ty, Alignment, AS), |
| 3922 | CostKind: Ctx.CostKind); |
| 3923 | } |
| 3924 | |
| 3925 | #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) |
| 3926 | void VPWidenStoreEVLRecipe::printRecipe(raw_ostream &O, const Twine &Indent, |
| 3927 | VPSlotTracker &SlotTracker) const { |
| 3928 | O << Indent << "WIDEN vp.store " ; |
| 3929 | printOperands(O, SlotTracker); |
| 3930 | } |
| 3931 | #endif |
| 3932 | |
| 3933 | static Value *createBitOrPointerCast(IRBuilderBase &Builder, Value *V, |
| 3934 | VectorType *DstVTy, const DataLayout &DL) { |
| 3935 | // Verify that V is a vector type with same number of elements as DstVTy. |
| 3936 | auto VF = DstVTy->getElementCount(); |
| 3937 | auto *SrcVecTy = cast<VectorType>(Val: V->getType()); |
| 3938 | assert(VF == SrcVecTy->getElementCount() && "Vector dimensions do not match" ); |
| 3939 | Type *SrcElemTy = SrcVecTy->getElementType(); |
| 3940 | Type *DstElemTy = DstVTy->getElementType(); |
| 3941 | assert((DL.getTypeSizeInBits(SrcElemTy) == DL.getTypeSizeInBits(DstElemTy)) && |
| 3942 | "Vector elements must have same size" ); |
| 3943 | |
| 3944 | // Do a direct cast if element types are castable. |
| 3945 | if (CastInst::isBitOrNoopPointerCastable(SrcTy: SrcElemTy, DestTy: DstElemTy, DL)) { |
| 3946 | return Builder.CreateBitOrPointerCast(V, DestTy: DstVTy); |
| 3947 | } |
| 3948 | // V cannot be directly cast to the desired vector type. This may happen when |
| 3949 | // V is a floating-point vector but DstVTy is a vector of pointers, or |
| 3950 | // vice-versa. Handle this with a two-step cast through an intermediate |
| 3951 | // integer type, i.e. Ptr <-> Int <-> Float. |
| 3952 | assert((DstElemTy->isPointerTy() != SrcElemTy->isPointerTy()) && |
| 3953 | "Only one type should be a pointer type" ); |
| 3954 | assert((DstElemTy->isFloatingPointTy() != SrcElemTy->isFloatingPointTy()) && |
| 3955 | "Only one type should be a floating point type" ); |
| 3956 | Type *IntTy = |
| 3957 | IntegerType::getIntNTy(C&: V->getContext(), N: DL.getTypeSizeInBits(Ty: SrcElemTy)); |
| 3958 | auto *VecIntTy = VectorType::get(ElementType: IntTy, EC: VF); |
| 3959 | Value *CastVal = Builder.CreateBitOrPointerCast(V, DestTy: VecIntTy); |
| 3960 | return Builder.CreateBitOrPointerCast(V: CastVal, DestTy: DstVTy); |
| 3961 | } |
| 3962 | |
| 3963 | /// Return a vector containing interleaved elements from multiple |
| 3964 | /// smaller input vectors. |
| 3965 | static Value *interleaveVectors(IRBuilderBase &Builder, ArrayRef<Value *> Vals, |
| 3966 | const Twine &Name) { |
| 3967 | unsigned Factor = Vals.size(); |
| 3968 | assert(Factor > 1 && "Tried to interleave invalid number of vectors" ); |
| 3969 | |
| 3970 | VectorType *VecTy = cast<VectorType>(Val: Vals[0]->getType()); |
| 3971 | #ifndef NDEBUG |
| 3972 | for (Value *Val : Vals) |
| 3973 | assert(Val->getType() == VecTy && "Tried to interleave mismatched types" ); |
| 3974 | #endif |
| 3975 | |
| 3976 | // Scalable vectors cannot use arbitrary shufflevectors (only splats), so |
| 3977 | // must use intrinsics to interleave. |
| 3978 | if (VecTy->isScalableTy()) { |
| 3979 | assert(Factor <= 8 && "Unsupported interleave factor for scalable vectors" ); |
| 3980 | return Builder.CreateVectorInterleave(Ops: Vals, Name); |
| 3981 | } |
| 3982 | |
| 3983 | // Fixed length. Start by concatenating all vectors into a wide vector. |
| 3984 | Value *WideVec = concatenateVectors(Builder, Vecs: Vals); |
| 3985 | |
| 3986 | // Interleave the elements into the wide vector. |
| 3987 | const unsigned NumElts = VecTy->getElementCount().getFixedValue(); |
| 3988 | return Builder.CreateShuffleVector( |
| 3989 | V: WideVec, Mask: createInterleaveMask(VF: NumElts, NumVecs: Factor), Name); |
| 3990 | } |
| 3991 | |
| 3992 | // Try to vectorize the interleave group that the recipe's insert position belongs to. |
| 3993 | // |
| 3994 | // E.g. Translate following interleaved load group (factor = 3): |
| 3995 | // for (i = 0; i < N; i+=3) { |
| 3996 | // R = Pic[i]; // Member of index 0 |
| 3997 | // G = Pic[i+1]; // Member of index 1 |
| 3998 | // B = Pic[i+2]; // Member of index 2 |
| 3999 | // ... // do something to R, G, B |
| 4000 | // } |
| 4001 | // To: |
| 4002 | // %wide.vec = load <12 x i32> ; Read 4 tuples of R,G,B |
| 4003 | // %R.vec = shuffle %wide.vec, poison, <0, 3, 6, 9> ; R elements |
| 4004 | // %G.vec = shuffle %wide.vec, poison, <1, 4, 7, 10> ; G elements |
| 4005 | // %B.vec = shuffle %wide.vec, poison, <2, 5, 8, 11> ; B elements |
| 4006 | // |
| 4007 | // Or translate following interleaved store group (factor = 3): |
| 4008 | // for (i = 0; i < N; i+=3) { |
| 4009 | // ... do something to R, G, B |
| 4010 | // Pic[i] = R; // Member of index 0 |
| 4011 | // Pic[i+1] = G; // Member of index 1 |
| 4012 | // Pic[i+2] = B; // Member of index 2 |
| 4013 | // } |
| 4014 | // To: |
| 4015 | // %R_G.vec = shuffle %R.vec, %G.vec, <0, 1, 2, ..., 7> |
| 4016 | // %B_U.vec = shuffle %B.vec, poison, <0, 1, 2, 3, u, u, u, u> |
| 4017 | // %interleaved.vec = shuffle %R_G.vec, %B_U.vec, |
| 4018 | // <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11> ; Interleave R,G,B elements |
| 4019 | // store <12 x i32> %interleaved.vec ; Write 4 tuples of R,G,B |
| 4020 | void VPInterleaveRecipe::execute(VPTransformState &State) { |
| 4021 | assert(!State.Lane && "Interleave group being replicated." ); |
| 4022 | assert((!needsMaskForGaps() || !State.VF.isScalable()) && |
| 4023 | "Masking gaps for scalable vectors is not yet supported." ); |
| 4024 | const InterleaveGroup<Instruction> *Group = getInterleaveGroup(); |
| 4025 | Instruction *Instr = Group->getInsertPos(); |
| 4026 | |
| 4027 | // Prepare for the vector type of the interleaved load/store. |
| 4028 | Type *ScalarTy = getLoadStoreType(I: Instr); |
| 4029 | unsigned InterleaveFactor = Group->getFactor(); |
| 4030 | auto *VecTy = VectorType::get(ElementType: ScalarTy, EC: State.VF * InterleaveFactor); |
| 4031 | |
| 4032 | VPValue *BlockInMask = getMask(); |
| 4033 | VPValue *Addr = getAddr(); |
| 4034 | Value *ResAddr = State.get(Def: Addr, Lane: VPLane(0)); |
| 4035 | |
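|  | // Build the mask for the whole wide access: replicate the block mask across |
|  | // the group members and, for fixed-width VFs, combine it with the gap mask. |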
| 4036 | auto CreateGroupMask = [&BlockInMask, &State, |
| 4037 | &InterleaveFactor](Value *MaskForGaps) -> Value * { |
| 4038 | if (State.VF.isScalable()) { |
| 4039 | assert(!MaskForGaps && "Interleaved groups with gaps are not supported." ); |
| 4040 | assert(InterleaveFactor <= 8 && |
| 4041 | "Unsupported interleave factor for scalable vectors" ); |
| 4042 | auto *ResBlockInMask = State.get(Def: BlockInMask); |
| 4043 | SmallVector<Value *> Ops(InterleaveFactor, ResBlockInMask); |
| 4044 | return interleaveVectors(Builder&: State.Builder, Vals: Ops, Name: "interleaved.mask" ); |
| 4045 | } |
| 4046 | |
| 4047 | if (!BlockInMask) |
| 4048 | return MaskForGaps; |
| 4049 | |
| 4050 | Value *ResBlockInMask = State.get(Def: BlockInMask); |
| 4051 | Value *ShuffledMask = State.Builder.CreateShuffleVector( |
| 4052 | V: ResBlockInMask, |
| 4053 | Mask: createReplicatedMask(ReplicationFactor: InterleaveFactor, VF: State.VF.getFixedValue()), |
| 4054 | Name: "interleaved.mask" ); |
| 4055 | return MaskForGaps ? State.Builder.CreateBinOp(Opc: Instruction::And, |
| 4056 | LHS: ShuffledMask, RHS: MaskForGaps) |
| 4057 | : ShuffledMask; |
| 4058 | }; |
| 4059 | |
| 4060 | const DataLayout &DL = Instr->getDataLayout(); |
| 4061 | // Vectorize the interleaved load group. |
| 4062 | if (isa<LoadInst>(Val: Instr)) { |
| 4063 | Value *MaskForGaps = nullptr; |
| 4064 | if (needsMaskForGaps()) { |
| 4065 | MaskForGaps = |
| 4066 | createBitMaskForGaps(Builder&: State.Builder, VF: State.VF.getFixedValue(), Group: *Group); |
| 4067 | assert(MaskForGaps && "Mask for Gaps is required but it is null" ); |
| 4068 | } |
| 4069 | |
| 4070 | Instruction *NewLoad; |
| 4071 | if (BlockInMask || MaskForGaps) { |
| 4072 | Value *GroupMask = CreateGroupMask(MaskForGaps); |
| 4073 | Value *PoisonVec = PoisonValue::get(T: VecTy); |
| 4074 | NewLoad = State.Builder.CreateMaskedLoad(Ty: VecTy, Ptr: ResAddr, |
| 4075 | Alignment: Group->getAlign(), Mask: GroupMask, |
| 4076 | PassThru: PoisonVec, Name: "wide.masked.vec" ); |
| 4077 | } else |
| 4078 | NewLoad = State.Builder.CreateAlignedLoad(Ty: VecTy, Ptr: ResAddr, |
| 4079 | Align: Group->getAlign(), Name: "wide.vec" ); |
| 4080 | applyMetadata(I&: *NewLoad); |
| 4081 | // TODO: Also manage existing metadata using VPIRMetadata. |
| 4082 | Group->addMetadata(NewInst: NewLoad); |
| 4083 | |
| 4084 | ArrayRef<VPRecipeValue *> VPDefs = definedValues(); |
| 4085 | if (VecTy->isScalableTy()) { |
| 4086 | // Scalable vectors cannot use arbitrary shufflevectors (only splats), |
| 4087 | // so must use intrinsics to deinterleave. |
| 4088 | assert(InterleaveFactor <= 8 && |
| 4089 | "Unsupported deinterleave factor for scalable vectors" ); |
| 4090 | NewLoad = State.Builder.CreateIntrinsic( |
| 4091 | ID: Intrinsic::getDeinterleaveIntrinsicID(Factor: InterleaveFactor), |
| 4092 | Types: NewLoad->getType(), Args: NewLoad, |
| 4093 | /*FMFSource=*/nullptr, Name: "strided.vec" ); |
| 4094 | } |
| 4095 | |
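|  | // Extract the vector for one member: scalable VFs read it from the struct |
|  | // returned by the deinterleave intrinsic, fixed VFs use a strided shuffle. |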
| 4096 | auto CreateStridedVector = [&InterleaveFactor, &State, |
| 4097 | &NewLoad](unsigned Index) -> Value * { |
| 4098 | assert(Index < InterleaveFactor && "Illegal group index" ); |
| 4099 | if (State.VF.isScalable()) |
| 4100 | return State.Builder.CreateExtractValue(Agg: NewLoad, Idxs: Index); |
| 4101 | |
| 4102 | // For fixed length VF, use shuffle to extract the sub-vectors from the |
| 4103 | // wide load. |
| 4104 | auto StrideMask = |
| 4105 | createStrideMask(Start: Index, Stride: InterleaveFactor, VF: State.VF.getFixedValue()); |
| 4106 | return State.Builder.CreateShuffleVector(V: NewLoad, Mask: StrideMask, |
| 4107 | Name: "strided.vec" ); |
| 4108 | }; |
| 4109 | |
| 4110 | for (unsigned I = 0, J = 0; I < InterleaveFactor; ++I) { |
| 4111 | Instruction *Member = Group->getMember(Index: I); |
| 4112 | |
| 4113 | // Skip the gaps in the group. |
| 4114 | if (!Member) |
| 4115 | continue; |
| 4116 | |
| 4117 | Value *StridedVec = CreateStridedVector(I); |
| 4118 | |
| 4119 | // If this member has a different type, cast the result to that type. |
| 4120 | if (Member->getType() != ScalarTy) { |
| 4121 | VectorType *OtherVTy = VectorType::get(ElementType: Member->getType(), EC: State.VF); |
| 4122 | StridedVec = |
| 4123 | createBitOrPointerCast(Builder&: State.Builder, V: StridedVec, DstVTy: OtherVTy, DL); |
| 4124 | } |
| 4125 | |
| 4126 | if (Group->isReverse()) |
| 4127 | StridedVec = State.Builder.CreateVectorReverse(V: StridedVec, Name: "reverse" ); |
| 4128 | |
| 4129 | State.set(Def: VPDefs[J], V: StridedVec); |
| 4130 | ++J; |
| 4131 | } |
| 4132 | return; |
| 4133 | } |
| 4134 | |
| 4135 | // The sub-vector type for the current instruction. |
| 4136 | auto *SubVT = VectorType::get(ElementType: ScalarTy, EC: State.VF); |
| 4137 | |
| 4138 | // Vectorize the interleaved store group. |
| 4139 | Value *MaskForGaps = |
| 4140 | createBitMaskForGaps(Builder&: State.Builder, VF: State.VF.getKnownMinValue(), Group: *Group); |
| 4141 | assert(((MaskForGaps != nullptr) == needsMaskForGaps()) && |
| 4142 | "Mismatch between NeedsMaskForGaps and MaskForGaps" ); |
| 4143 | ArrayRef<VPValue *> StoredValues = getStoredValues(); |
| 4144 | // Collect the stored vector from each member. |
| 4145 | SmallVector<Value *, 4> StoredVecs; |
| 4146 | unsigned StoredIdx = 0; |
| 4147 | for (unsigned i = 0; i < InterleaveFactor; i++) { |
| 4148 | assert((Group->getMember(i) || MaskForGaps) && |
| 4149 | "Failed to get a member from an interleaved store group" ); |
| 4150 | Instruction *Member = Group->getMember(Index: i); |
| 4151 | |
| 4152 | // Skip the gaps in the group. |
| 4153 | if (!Member) { |
| 4154 | Value *Undef = PoisonValue::get(T: SubVT); |
| 4155 | StoredVecs.push_back(Elt: Undef); |
| 4156 | continue; |
| 4157 | } |
| 4158 | |
| 4159 | Value *StoredVec = State.get(Def: StoredValues[StoredIdx]); |
| 4160 | ++StoredIdx; |
| 4161 | |
| 4162 | if (Group->isReverse()) |
| 4163 | StoredVec = State.Builder.CreateVectorReverse(V: StoredVec, Name: "reverse" ); |
| 4164 | |
| 4165 | // If this member has a different type, cast it to a unified type. |
| 4167 | if (StoredVec->getType() != SubVT) |
| 4168 | StoredVec = createBitOrPointerCast(Builder&: State.Builder, V: StoredVec, DstVTy: SubVT, DL); |
| 4169 | |
| 4170 | StoredVecs.push_back(Elt: StoredVec); |
| 4171 | } |
| 4172 | |
| 4173 | // Interleave all the smaller vectors into one wider vector. |
| 4174 | Value *IVec = interleaveVectors(Builder&: State.Builder, Vals: StoredVecs, Name: "interleaved.vec" ); |
| 4175 | Instruction *NewStoreInstr; |
| 4176 | if (BlockInMask || MaskForGaps) { |
| 4177 | Value *GroupMask = CreateGroupMask(MaskForGaps); |
| 4178 | NewStoreInstr = State.Builder.CreateMaskedStore( |
| 4179 | Val: IVec, Ptr: ResAddr, Alignment: Group->getAlign(), Mask: GroupMask); |
| 4180 | } else |
| 4181 | NewStoreInstr = |
| 4182 | State.Builder.CreateAlignedStore(Val: IVec, Ptr: ResAddr, Align: Group->getAlign()); |
| 4183 | |
| 4184 | applyMetadata(I&: *NewStoreInstr); |
| 4185 | // TODO: Also manage existing metadata using VPIRMetadata. |
| 4186 | Group->addMetadata(NewInst: NewStoreInstr); |
| 4187 | } |
| 4188 | |
| 4189 | #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) |
| 4190 | void VPInterleaveRecipe::printRecipe(raw_ostream &O, const Twine &Indent, |
| 4191 | VPSlotTracker &SlotTracker) const { |
| 4192 | const InterleaveGroup<Instruction> *IG = getInterleaveGroup(); |
| 4193 | O << Indent << "INTERLEAVE-GROUP with factor " << IG->getFactor() << " at " ; |
| 4194 | IG->getInsertPos()->printAsOperand(O, false); |
| 4195 | O << ", " ; |
| 4196 | getAddr()->printAsOperand(O, SlotTracker); |
| 4197 | VPValue *Mask = getMask(); |
| 4198 | if (Mask) { |
| 4199 | O << ", " ; |
| 4200 | Mask->printAsOperand(O, SlotTracker); |
| 4201 | } |
| 4202 | |
| 4203 | unsigned OpIdx = 0; |
| 4204 | for (unsigned i = 0; i < IG->getFactor(); ++i) { |
| 4205 | if (!IG->getMember(i)) |
| 4206 | continue; |
| 4207 | if (getNumStoreOperands() > 0) { |
| 4208 | O << "\n" << Indent << " store " ; |
| 4209 | getOperand(1 + OpIdx)->printAsOperand(O, SlotTracker); |
| 4210 | O << " to index " << i; |
| 4211 | } else { |
| 4212 | O << "\n" << Indent << " " ; |
| 4213 | getVPValue(OpIdx)->printAsOperand(O, SlotTracker); |
| 4214 | O << " = load from index " << i; |
| 4215 | } |
| 4216 | ++OpIdx; |
| 4217 | } |
| 4218 | } |
| 4219 | #endif |
| 4220 | |
| 4221 | void VPInterleaveEVLRecipe::execute(VPTransformState &State) { |
| 4222 | assert(!State.Lane && "Interleave group being replicated." ); |
| 4223 | assert(State.VF.isScalable() && |
| 4224 | "Only support scalable VF for EVL tail-folding." ); |
| 4225 | assert(!needsMaskForGaps() && |
| 4226 | "Masking gaps for scalable vectors is not yet supported." ); |
| 4227 | const InterleaveGroup<Instruction> *Group = getInterleaveGroup(); |
| 4228 | Instruction *Instr = Group->getInsertPos(); |
| 4229 | |
| 4230 | // Prepare for the vector type of the interleaved load/store. |
| 4231 | Type *ScalarTy = getLoadStoreType(I: Instr); |
| 4232 | unsigned InterleaveFactor = Group->getFactor(); |
| 4233 | assert(InterleaveFactor <= 8 && |
| 4234 | "Unsupported deinterleave/interleave factor for scalable vectors" ); |
| 4235 | ElementCount WideVF = State.VF * InterleaveFactor; |
| 4236 | auto *VecTy = VectorType::get(ElementType: ScalarTy, EC: WideVF); |
| 4237 | |
| 4238 | VPValue *Addr = getAddr(); |
| 4239 | Value *ResAddr = State.get(Def: Addr, Lane: VPLane(0)); |
| 4240 | Value *EVL = State.get(Def: getEVL(), Lane: VPLane(0)); |
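|  | // The EVL counts scalar iterations, so the wide vp.load/vp.store operates on |
|  | // EVL * InterleaveFactor elements. |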
| 4241 | Value *InterleaveEVL = State.Builder.CreateMul( |
| 4242 | LHS: EVL, RHS: ConstantInt::get(Ty: EVL->getType(), V: InterleaveFactor), Name: "interleave.evl" , |
| 4243 | /* NUW= */ HasNUW: true, /* NSW= */ HasNSW: true); |
| 4244 | LLVMContext &Ctx = State.Builder.getContext(); |
| 4245 | |
| 4246 | Value *GroupMask = nullptr; |
| 4247 | if (VPValue *BlockInMask = getMask()) { |
| 4248 | SmallVector<Value *> Ops(InterleaveFactor, State.get(Def: BlockInMask)); |
| 4249 | GroupMask = interleaveVectors(Builder&: State.Builder, Vals: Ops, Name: "interleaved.mask" ); |
| 4250 | } else { |
| 4251 | GroupMask = |
| 4252 | State.Builder.CreateVectorSplat(EC: WideVF, V: State.Builder.getTrue()); |
| 4253 | } |
| 4254 | |
| 4255 | // Vectorize the interleaved load group. |
| 4256 | if (isa<LoadInst>(Val: Instr)) { |
| 4257 | CallInst *NewLoad = State.Builder.CreateIntrinsic( |
| 4258 | RetTy: VecTy, ID: Intrinsic::vp_load, Args: {ResAddr, GroupMask, InterleaveEVL}, FMFSource: nullptr, |
| 4259 | Name: "wide.vp.load" ); |
| 4260 | NewLoad->addParamAttr(ArgNo: 0, |
| 4261 | Attr: Attribute::getWithAlignment(Context&: Ctx, Alignment: Group->getAlign())); |
| 4262 | |
| 4263 | applyMetadata(I&: *NewLoad); |
| 4264 | // TODO: Also manage existing metadata using VPIRMetadata. |
| 4265 | Group->addMetadata(NewInst: NewLoad); |
| 4266 | |
| 4267 | // Scalable vectors cannot use arbitrary shufflevectors (only splats), |
| 4268 | // so must use intrinsics to deinterleave. |
| 4269 | NewLoad = State.Builder.CreateIntrinsic( |
| 4270 | ID: Intrinsic::getDeinterleaveIntrinsicID(Factor: InterleaveFactor), |
| 4271 | Types: NewLoad->getType(), Args: NewLoad, |
| 4272 | /*FMFSource=*/nullptr, Name: "strided.vec" ); |
| 4273 | |
| 4274 | const DataLayout &DL = Instr->getDataLayout(); |
| 4275 | for (unsigned I = 0, J = 0; I < InterleaveFactor; ++I) { |
| 4276 | Instruction *Member = Group->getMember(Index: I); |
| 4277 | // Skip the gaps in the group. |
| 4278 | if (!Member) |
| 4279 | continue; |
| 4280 | |
| 4281 | Value *StridedVec = State.Builder.CreateExtractValue(Agg: NewLoad, Idxs: I); |
| 4282 | // If this member has different type, cast the result type. |
| 4283 | if (Member->getType() != ScalarTy) { |
| 4284 | VectorType *OtherVTy = VectorType::get(ElementType: Member->getType(), EC: State.VF); |
| 4285 | StridedVec = |
| 4286 | createBitOrPointerCast(Builder&: State.Builder, V: StridedVec, DstVTy: OtherVTy, DL); |
| 4287 | } |
| 4288 | |
| 4289 | State.set(Def: getVPValue(I: J), V: StridedVec); |
| 4290 | ++J; |
| 4291 | } |
| 4292 | return; |
| 4293 | } // End of the interleaved load case. |
| 4294 | |
| 4295 | // The sub-vector type for the current instruction. |
| 4296 | auto *SubVT = VectorType::get(ElementType: ScalarTy, EC: State.VF); |
| 4297 | // Vectorize the interleaved store group. |
| 4298 | ArrayRef<VPValue *> StoredValues = getStoredValues(); |
| 4299 | // Collect the stored vector from each member. |
| 4300 | SmallVector<Value *, 4> StoredVecs; |
| 4301 | const DataLayout &DL = Instr->getDataLayout(); |
| 4302 | for (unsigned I = 0, StoredIdx = 0; I < InterleaveFactor; I++) { |
| 4303 | Instruction *Member = Group->getMember(Index: I); |
| 4304 | // Skip the gaps in the group. |
| 4305 | if (!Member) { |
| 4306 | StoredVecs.push_back(Elt: PoisonValue::get(T: SubVT)); |
| 4307 | continue; |
| 4308 | } |
| 4309 | |
| 4310 | Value *StoredVec = State.get(Def: StoredValues[StoredIdx]); |
| 4311 | // If this member has a different type, cast it to a unified type. |
| 4312 | if (StoredVec->getType() != SubVT) |
| 4313 | StoredVec = createBitOrPointerCast(Builder&: State.Builder, V: StoredVec, DstVTy: SubVT, DL); |
| 4314 | |
| 4315 | StoredVecs.push_back(Elt: StoredVec); |
| 4316 | ++StoredIdx; |
| 4317 | } |
| 4318 | |
| 4319 | // Interleave all the smaller vectors into one wider vector. |
| 4320 | Value *IVec = interleaveVectors(Builder&: State.Builder, Vals: StoredVecs, Name: "interleaved.vec" ); |
| 4321 | CallInst *NewStore = |
| 4322 | State.Builder.CreateIntrinsic(RetTy: Type::getVoidTy(C&: Ctx), ID: Intrinsic::vp_store, |
| 4323 | Args: {IVec, ResAddr, GroupMask, InterleaveEVL}); |
| 4324 | NewStore->addParamAttr(ArgNo: 1, |
| 4325 | Attr: Attribute::getWithAlignment(Context&: Ctx, Alignment: Group->getAlign())); |
| 4326 | |
| 4327 | applyMetadata(I&: *NewStore); |
| 4328 | // TODO: Also manage existing metadata using VPIRMetadata. |
| 4329 | Group->addMetadata(NewInst: NewStore); |
| 4330 | } |
| 4331 | |
| 4332 | #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) |
| 4333 | void VPInterleaveEVLRecipe::printRecipe(raw_ostream &O, const Twine &Indent, |
| 4334 | VPSlotTracker &SlotTracker) const { |
| 4335 | const InterleaveGroup<Instruction> *IG = getInterleaveGroup(); |
| 4336 | O << Indent << "INTERLEAVE-GROUP with factor " << IG->getFactor() << " at " ; |
| 4337 | IG->getInsertPos()->printAsOperand(O, false); |
| 4338 | O << ", " ; |
| 4339 | getAddr()->printAsOperand(O, SlotTracker); |
| 4340 | O << ", " ; |
| 4341 | getEVL()->printAsOperand(O, SlotTracker); |
| 4342 | if (VPValue *Mask = getMask()) { |
| 4343 | O << ", " ; |
| 4344 | Mask->printAsOperand(O, SlotTracker); |
| 4345 | } |
| 4346 | |
| 4347 | unsigned OpIdx = 0; |
| 4348 | for (unsigned i = 0; i < IG->getFactor(); ++i) { |
| 4349 | if (!IG->getMember(i)) |
| 4350 | continue; |
| 4351 | if (getNumStoreOperands() > 0) { |
| 4352 | O << "\n" << Indent << " vp.store " ; |
| 4353 | getOperand(2 + OpIdx)->printAsOperand(O, SlotTracker); |
| 4354 | O << " to index " << i; |
| 4355 | } else { |
| 4356 | O << "\n" << Indent << " " ; |
| 4357 | getVPValue(OpIdx)->printAsOperand(O, SlotTracker); |
| 4358 | O << " = vp.load from index " << i; |
| 4359 | } |
| 4360 | ++OpIdx; |
| 4361 | } |
| 4362 | } |
| 4363 | #endif |
| 4364 | |
| 4365 | InstructionCost VPInterleaveBase::computeCost(ElementCount VF, |
| 4366 | VPCostContext &Ctx) const { |
| 4367 | Instruction *InsertPos = getInsertPos(); |
| 4368 | // Find the VPValue index of the insert position within the group, skipping gaps. |
| 4369 | unsigned InsertPosIdx = 0; |
| 4370 | for (unsigned Idx = 0; Idx < IG->getFactor(); ++Idx) |
| 4371 | if (auto *Member = IG->getMember(Index: Idx)) { |
| 4372 | if (Member == InsertPos) |
| 4373 | break; |
| 4374 | InsertPosIdx++; |
| 4375 | } |
| 4376 | Type *ValTy = Ctx.Types.inferScalarType( |
| 4377 | V: getNumDefinedValues() > 0 ? getVPValue(I: InsertPosIdx) |
| 4378 | : getStoredValues()[InsertPosIdx]); |
| 4379 | auto *VectorTy = cast<VectorType>(Val: toVectorTy(Scalar: ValTy, EC: VF)); |
| 4380 | unsigned AS = cast<PointerType>(Val: Ctx.Types.inferScalarType(V: getAddr())) |
| 4381 | ->getAddressSpace(); |
| 4382 | |
| 4383 | unsigned InterleaveFactor = IG->getFactor(); |
| 4384 | auto *WideVecTy = VectorType::get(ElementType: ValTy, EC: VF * InterleaveFactor); |
| 4385 | |
| 4386 | // Holds the indices of existing members in the interleaved group. |
| 4387 | SmallVector<unsigned, 4> Indices; |
| 4388 | for (unsigned IF = 0; IF < InterleaveFactor; IF++) |
| 4389 | if (IG->getMember(Index: IF)) |
| 4390 | Indices.push_back(Elt: IF); |
| 4391 | |
| 4392 | // Calculate the cost of the whole interleaved group. |
| 4393 | InstructionCost Cost = Ctx.TTI.getInterleavedMemoryOpCost( |
| 4394 | Opcode: InsertPos->getOpcode(), VecTy: WideVecTy, Factor: IG->getFactor(), Indices, |
| 4395 | Alignment: IG->getAlign(), AddressSpace: AS, CostKind: Ctx.CostKind, UseMaskForCond: getMask(), UseMaskForGaps: NeedsMaskForGaps); |
| 4396 | |
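|  | // Reversed groups additionally pay a reverse shuffle for each member. |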
| 4397 | if (!IG->isReverse()) |
| 4398 | return Cost; |
| 4399 | |
| 4400 | return Cost + IG->getNumMembers() * |
| 4401 | Ctx.TTI.getShuffleCost(Kind: TargetTransformInfo::SK_Reverse, |
| 4402 | DstTy: VectorTy, SrcTy: VectorTy, Mask: {}, CostKind: Ctx.CostKind, |
| 4403 | Index: 0); |
| 4404 | } |
| 4405 | |
| 4406 | #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) |
| 4407 | void VPCanonicalIVPHIRecipe::printRecipe(raw_ostream &O, const Twine &Indent, |
| 4408 | VPSlotTracker &SlotTracker) const { |
| 4409 | O << Indent << "EMIT " ; |
| 4410 | printAsOperand(O, SlotTracker); |
| 4411 | O << " = CANONICAL-INDUCTION " ; |
| 4412 | printOperands(O, SlotTracker); |
| 4413 | } |
| 4414 | #endif |
| 4415 | |
| 4416 | bool VPWidenPointerInductionRecipe::onlyScalarsGenerated(bool IsScalable) { |
| 4417 | return vputils::onlyScalarValuesUsed(Def: this) && |
| 4418 | (!IsScalable || vputils::onlyFirstLaneUsed(Def: this)); |
| 4419 | } |
| 4420 | |
| 4421 | #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) |
| 4422 | void VPWidenPointerInductionRecipe::printRecipe( |
| 4423 | raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const { |
| 4424 | assert((getNumOperands() == 3 || getNumOperands() == 5) && |
| 4425 | "unexpected number of operands" ); |
| 4426 | O << Indent << "EMIT " ; |
| 4427 | printAsOperand(O, SlotTracker); |
| 4428 | O << " = WIDEN-POINTER-INDUCTION " ; |
| 4429 | getStartValue()->printAsOperand(O, SlotTracker); |
| 4430 | O << ", " ; |
| 4431 | getStepValue()->printAsOperand(O, SlotTracker); |
| 4432 | O << ", " ; |
| 4433 | getOperand(2)->printAsOperand(O, SlotTracker); |
| 4434 | if (getNumOperands() == 5) { |
| 4435 | O << ", " ; |
| 4436 | getOperand(3)->printAsOperand(O, SlotTracker); |
| 4437 | O << ", " ; |
| 4438 | getOperand(4)->printAsOperand(O, SlotTracker); |
| 4439 | } |
| 4440 | } |
| 4441 | |
| 4442 | void VPExpandSCEVRecipe::printRecipe(raw_ostream &O, const Twine &Indent, |
| 4443 | VPSlotTracker &SlotTracker) const { |
| 4444 | O << Indent << "EMIT " ; |
| 4445 | printAsOperand(O, SlotTracker); |
| 4446 | O << " = EXPAND SCEV " << *Expr; |
| 4447 | } |
| 4448 | #endif |
| 4449 | |
| 4450 | void VPWidenCanonicalIVRecipe::execute(VPTransformState &State) { |
| 4451 | Value *CanonicalIV = State.get(Def: getOperand(N: 0), /*IsScalar*/ true); |
| 4452 | Type *STy = CanonicalIV->getType(); |
| 4453 | IRBuilder<> Builder(State.CFG.PrevBB->getTerminator()); |
| 4454 | ElementCount VF = State.VF; |
| 4455 | Value *VStart = VF.isScalar() |
| 4456 | ? CanonicalIV |
| 4457 | : Builder.CreateVectorSplat(EC: VF, V: CanonicalIV, Name: "broadcast" ); |
| 4458 | Value *VStep = createStepForVF(B&: Builder, Ty: STy, VF, Step: getUnrollPart(U: *this)); |
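|  | // For vector VFs, broadcast the step and add a step vector so that each lane |
|  | // holds its own induction value. |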
| 4459 | if (VF.isVector()) { |
| 4460 | VStep = Builder.CreateVectorSplat(EC: VF, V: VStep); |
| 4461 | VStep = |
| 4462 | Builder.CreateAdd(LHS: VStep, RHS: Builder.CreateStepVector(DstType: VStep->getType())); |
| 4463 | } |
| 4464 | Value *CanonicalVectorIV = Builder.CreateAdd(LHS: VStart, RHS: VStep, Name: "vec.iv" ); |
| 4465 | State.set(Def: this, V: CanonicalVectorIV); |
| 4466 | } |
| 4467 | |
| 4468 | #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) |
| 4469 | void VPWidenCanonicalIVRecipe::printRecipe(raw_ostream &O, const Twine &Indent, |
| 4470 | VPSlotTracker &SlotTracker) const { |
| 4471 | O << Indent << "EMIT " ; |
| 4472 | printAsOperand(O, SlotTracker); |
| 4473 | O << " = WIDEN-CANONICAL-INDUCTION " ; |
| 4474 | printOperands(O, SlotTracker); |
| 4475 | } |
| 4476 | #endif |
| 4477 | |
| 4478 | void VPFirstOrderRecurrencePHIRecipe::execute(VPTransformState &State) { |
| 4479 | auto &Builder = State.Builder; |
| 4480 | // Create a vector from the initial value. |
| 4481 | auto *VectorInit = getStartValue()->getLiveInIRValue(); |
| 4482 | |
| 4483 | Type *VecTy = State.VF.isScalar() |
| 4484 | ? VectorInit->getType() |
| 4485 | : VectorType::get(ElementType: VectorInit->getType(), EC: State.VF); |
| 4486 | |
| 4487 | BasicBlock *VectorPH = |
| 4488 | State.CFG.VPBB2IRBB.at(Val: getParent()->getCFGPredecessor(Idx: 0)); |
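|  | // For vector VFs, place the scalar start value in the last lane of a poison |
|  | // vector; the first-order recurrence splice shifts it in as the previous |
|  | // value for lane 0 of the first iteration. |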
| 4489 | if (State.VF.isVector()) { |
| 4490 | auto *IdxTy = Builder.getInt32Ty(); |
| 4491 | auto *One = ConstantInt::get(Ty: IdxTy, V: 1); |
| 4492 | IRBuilder<>::InsertPointGuard Guard(Builder); |
| 4493 | Builder.SetInsertPoint(VectorPH->getTerminator()); |
| 4494 | auto *RuntimeVF = getRuntimeVF(B&: Builder, Ty: IdxTy, VF: State.VF); |
| 4495 | auto *LastIdx = Builder.CreateSub(LHS: RuntimeVF, RHS: One); |
| 4496 | VectorInit = Builder.CreateInsertElement( |
| 4497 | Vec: PoisonValue::get(T: VecTy), NewElt: VectorInit, Idx: LastIdx, Name: "vector.recur.init" ); |
| 4498 | } |
| 4499 | |
| 4500 | // Create a phi node for the new recurrence. |
| 4501 | PHINode *Phi = PHINode::Create(Ty: VecTy, NumReservedValues: 2, NameStr: "vector.recur" ); |
| 4502 | Phi->insertBefore(InsertPos: State.CFG.PrevBB->getFirstInsertionPt()); |
| 4503 | Phi->addIncoming(V: VectorInit, BB: VectorPH); |
| 4504 | State.set(Def: this, V: Phi); |
| 4505 | } |
| 4506 | |
| 4507 | InstructionCost |
| 4508 | VPFirstOrderRecurrencePHIRecipe::computeCost(ElementCount VF, |
| 4509 | VPCostContext &Ctx) const { |
| 4510 | if (VF.isScalar()) |
| 4511 | return Ctx.TTI.getCFInstrCost(Opcode: Instruction::PHI, CostKind: Ctx.CostKind); |
| 4512 | |
| 4513 | return 0; |
| 4514 | } |
| 4515 | |
| 4516 | #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) |
| 4517 | void VPFirstOrderRecurrencePHIRecipe::printRecipe( |
| 4518 | raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const { |
| 4519 | O << Indent << "FIRST-ORDER-RECURRENCE-PHI " ; |
| 4520 | printAsOperand(O, SlotTracker); |
| 4521 | O << " = phi " ; |
| 4522 | printOperands(O, SlotTracker); |
| 4523 | } |
| 4524 | #endif |
| 4525 | |
| 4526 | void VPReductionPHIRecipe::execute(VPTransformState &State) { |
| 4527 | // Reductions do not have to start at zero. They can start with |
| 4528 | // any loop invariant values. |
| 4529 | VPValue *StartVPV = getStartValue(); |
| 4530 | |
| 4531 | // In order to support recurrences we need to be able to vectorize Phi nodes. |
| 4532 | // Phi nodes have cycles, so we need to vectorize them in two stages. This is |
| 4533 | // stage #1: We create a new vector PHI node with no incoming edges. We'll use |
| 4534 | // this value when we vectorize all of the instructions that use the PHI. |
| 4535 | BasicBlock *VectorPH = |
| 4536 | State.CFG.VPBB2IRBB.at(Val: getParent()->getCFGPredecessor(Idx: 0)); |
| 4537 | bool ScalarPHI = State.VF.isScalar() || isInLoop(); |
| 4538 | Value *StartV = State.get(Def: StartVPV, IsScalar: ScalarPHI); |
| 4539 | Type *VecTy = StartV->getType(); |
| 4540 | |
| 4541 | BasicBlock *HeaderBB = State.CFG.PrevBB; |
| 4542 | assert(State.CurrentParentLoop->getHeader() == HeaderBB && |
| 4543 | "recipe must be in the vector loop header" ); |
| 4544 | auto *Phi = PHINode::Create(Ty: VecTy, NumReservedValues: 2, NameStr: "vec.phi" ); |
| 4545 | Phi->insertBefore(InsertPos: HeaderBB->getFirstInsertionPt()); |
| 4546 | State.set(Def: this, V: Phi, IsScalar: isInLoop()); |
| 4547 | |
| 4548 | Phi->addIncoming(V: StartV, BB: VectorPH); |
| 4549 | } |
| 4550 | |
| 4551 | #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) |
| 4552 | void VPReductionPHIRecipe::printRecipe(raw_ostream &O, const Twine &Indent, |
| 4553 | VPSlotTracker &SlotTracker) const { |
| 4554 | O << Indent << "WIDEN-REDUCTION-PHI " ; |
| 4555 | |
| 4556 | printAsOperand(O, SlotTracker); |
| 4557 | O << " = phi " ; |
| 4558 | printOperands(O, SlotTracker); |
| 4559 | if (getVFScaleFactor() > 1) |
| 4560 | O << " (VF scaled by 1/" << getVFScaleFactor() << ")" ; |
| 4561 | } |
| 4562 | #endif |
| 4563 | |
| 4564 | void VPWidenPHIRecipe::execute(VPTransformState &State) { |
| 4565 | Value *Op0 = State.get(Def: getOperand(N: 0)); |
| 4566 | Type *VecTy = Op0->getType(); |
| 4567 | Instruction *VecPhi = State.Builder.CreatePHI(Ty: VecTy, NumReservedValues: 2, Name); |
| 4568 | State.set(Def: this, V: VecPhi); |
| 4569 | } |
| 4570 | |
| 4571 | InstructionCost VPWidenPHIRecipe::computeCost(ElementCount VF, |
| 4572 | VPCostContext &Ctx) const { |
| 4573 | return Ctx.TTI.getCFInstrCost(Opcode: Instruction::PHI, CostKind: Ctx.CostKind); |
| 4574 | } |
| 4575 | |
| 4576 | #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) |
| 4577 | void VPWidenPHIRecipe::printRecipe(raw_ostream &O, const Twine &Indent, |
| 4578 | VPSlotTracker &SlotTracker) const { |
| 4579 | O << Indent << "WIDEN-PHI " ; |
| 4580 | |
| 4581 | printAsOperand(O, SlotTracker); |
| 4582 | O << " = phi " ; |
| 4583 | printPhiOperands(O, SlotTracker); |
| 4584 | } |
| 4585 | #endif |
| 4586 | |
| 4587 | void VPActiveLaneMaskPHIRecipe::execute(VPTransformState &State) { |
| 4588 | BasicBlock *VectorPH = |
| 4589 | State.CFG.VPBB2IRBB.at(Val: getParent()->getCFGPredecessor(Idx: 0)); |
| 4590 | Value *StartMask = State.get(Def: getOperand(N: 0)); |
| 4591 | PHINode *Phi = |
| 4592 | State.Builder.CreatePHI(Ty: StartMask->getType(), NumReservedValues: 2, Name: "active.lane.mask" ); |
| 4593 | Phi->addIncoming(V: StartMask, BB: VectorPH); |
| 4594 | State.set(Def: this, V: Phi); |
| 4595 | } |
| 4596 | |
| 4597 | #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) |
| 4598 | void VPActiveLaneMaskPHIRecipe::printRecipe(raw_ostream &O, const Twine &Indent, |
| 4599 | VPSlotTracker &SlotTracker) const { |
| 4600 | O << Indent << "ACTIVE-LANE-MASK-PHI " ; |
| 4601 | |
| 4602 | printAsOperand(O, SlotTracker); |
| 4603 | O << " = phi " ; |
| 4604 | printOperands(O, SlotTracker); |
| 4605 | } |
| 4606 | #endif |
| 4607 | |
| 4608 | #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) |
| 4609 | void VPEVLBasedIVPHIRecipe::printRecipe(raw_ostream &O, const Twine &Indent, |
| 4610 | VPSlotTracker &SlotTracker) const { |
| 4611 | O << Indent << "EXPLICIT-VECTOR-LENGTH-BASED-IV-PHI " ; |
| 4612 | |
| 4613 | printAsOperand(O, SlotTracker); |
| 4614 | O << " = phi " ; |
| 4615 | printOperands(O, SlotTracker); |
| 4616 | } |
| 4617 | #endif |
| 4618 | |