| 1 | //===- VPlanRecipes.cpp - Implementations for VPlan recipes ---------------===// |
| 2 | // |
| 3 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
| 4 | // See https://llvm.org/LICENSE.txt for license information. |
| 5 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
| 6 | // |
| 7 | //===----------------------------------------------------------------------===// |
| 8 | /// |
| 9 | /// \file |
| 10 | /// This file contains implementations for different VPlan recipes. |
| 11 | /// |
| 12 | //===----------------------------------------------------------------------===// |
| 13 | |
| 14 | #include "LoopVectorizationPlanner.h" |
| 15 | #include "VPlan.h" |
| 16 | #include "VPlanAnalysis.h" |
| 17 | #include "VPlanHelpers.h" |
| 18 | #include "VPlanPatternMatch.h" |
| 19 | #include "VPlanUtils.h" |
| 20 | #include "llvm/ADT/STLExtras.h" |
| 21 | #include "llvm/ADT/SmallVector.h" |
| 22 | #include "llvm/ADT/Twine.h" |
| 23 | #include "llvm/Analysis/AssumptionCache.h" |
| 24 | #include "llvm/Analysis/IVDescriptors.h" |
| 25 | #include "llvm/Analysis/LoopInfo.h" |
| 26 | #include "llvm/Analysis/ScalarEvolutionExpressions.h" |
| 27 | #include "llvm/IR/BasicBlock.h" |
| 28 | #include "llvm/IR/IRBuilder.h" |
| 29 | #include "llvm/IR/Instruction.h" |
| 30 | #include "llvm/IR/Instructions.h" |
| 31 | #include "llvm/IR/Intrinsics.h" |
| 32 | #include "llvm/IR/Type.h" |
| 33 | #include "llvm/IR/Value.h" |
| 34 | #include "llvm/Support/Casting.h" |
| 35 | #include "llvm/Support/CommandLine.h" |
| 36 | #include "llvm/Support/Debug.h" |
| 37 | #include "llvm/Support/raw_ostream.h" |
| 38 | #include "llvm/Transforms/Utils/BasicBlockUtils.h" |
| 39 | #include "llvm/Transforms/Utils/LoopUtils.h" |
| 40 | #include <cassert> |
| 41 | |
| 42 | using namespace llvm; |
| 43 | using namespace llvm::VPlanPatternMatch; |
| 44 | |
| 45 | using VectorParts = SmallVector<Value *, 2>; |
| 46 | |
| 47 | #define LV_NAME "loop-vectorize" |
| 48 | #define DEBUG_TYPE LV_NAME |
| 49 | |
/// Conservatively return true if executing this recipe may write to memory.
/// Recipe kinds known to never write return false; any recipe ID not listed
/// falls through to the conservative default of true.
bool VPRecipeBase::mayWriteToMemory() const {
  switch (getVPRecipeID()) {
  case VPExpressionSC:
    // Expression recipes only expose combined read-or-write information, so
    // this is conservative for pure reads.
    return cast<VPExpressionRecipe>(Val: this)->mayReadOrWriteMemory();
  case VPInstructionSC: {
    auto *VPI = cast<VPInstruction>(Val: this);
    // Loads read from memory but don't write to memory.
    if (VPI->getOpcode() == Instruction::Load)
      return false;
    return VPI->opcodeMayReadOrWriteFromMemory();
  }
  case VPInterleaveEVLSC:
  case VPInterleaveSC:
    // An interleave group writes iff it has at least one store member.
    return cast<VPInterleaveBase>(Val: this)->getNumStoreOperands() > 0;
  case VPWidenStoreEVLSC:
  case VPWidenStoreSC:
    return true;
  case VPReplicateSC:
    // Replicated recipes defer to their underlying scalar instruction.
    return cast<Instruction>(Val: getVPSingleValue()->getUnderlyingValue())
        ->mayWriteToMemory();
  case VPWidenCallSC:
    // A widened call may write unless the callee is known read-only.
    return !cast<VPWidenCallRecipe>(Val: this)
                ->getCalledScalarFunction()
                ->onlyReadsMemory();
  case VPWidenIntrinsicSC:
    return cast<VPWidenIntrinsicRecipe>(Val: this)->mayWriteToMemory();
  case VPActiveLaneMaskPHISC:
  case VPCanonicalIVPHISC:
  case VPCurrentIterationPHISC:
  case VPBranchOnMaskSC:
  case VPDerivedIVSC:
  case VPFirstOrderRecurrencePHISC:
  case VPReductionPHISC:
  case VPScalarIVStepsSC:
  case VPPredInstPHISC:
    return false;
  case VPBlendSC:
  case VPReductionEVLSC:
  case VPReductionSC:
  case VPVectorPointerSC:
  case VPWidenCanonicalIVSC:
  case VPWidenCastSC:
  case VPWidenGEPSC:
  case VPWidenIntOrFpInductionSC:
  case VPWidenLoadEVLSC:
  case VPWidenLoadSC:
  case VPWidenPHISC:
  case VPWidenPointerInductionSC:
  case VPWidenSC: {
    // These recipe kinds never write; in debug builds cross-check against the
    // underlying IR instruction, if one is attached.
    const Instruction *I =
        dyn_cast_or_null<Instruction>(Val: getVPSingleValue()->getUnderlyingValue());
    (void)I;
    assert((!I || !I->mayWriteToMemory()) &&
           "underlying instruction may write to memory");
    return false;
  }
  default:
    // Unknown recipes: conservatively assume they may write.
    return true;
  }
}
| 110 | |
/// Conservatively return true if executing this recipe may read from memory.
/// Recipe kinds known to never read return false; any recipe ID not listed
/// falls through to the conservative default of true.
bool VPRecipeBase::mayReadFromMemory() const {
  switch (getVPRecipeID()) {
  case VPExpressionSC:
    // Expression recipes only expose combined read-or-write information, so
    // this is conservative for pure writes.
    return cast<VPExpressionRecipe>(Val: this)->mayReadOrWriteMemory();
  case VPInstructionSC:
    return cast<VPInstruction>(Val: this)->opcodeMayReadOrWriteFromMemory();
  case VPWidenLoadEVLSC:
  case VPWidenLoadSC:
    return true;
  case VPReplicateSC:
    // Replicated recipes defer to their underlying scalar instruction.
    return cast<Instruction>(Val: getVPSingleValue()->getUnderlyingValue())
        ->mayReadFromMemory();
  case VPWidenCallSC:
    // A widened call may read unless the callee is known write-only.
    return !cast<VPWidenCallRecipe>(Val: this)
                ->getCalledScalarFunction()
                ->onlyWritesMemory();
  case VPWidenIntrinsicSC:
    return cast<VPWidenIntrinsicRecipe>(Val: this)->mayReadFromMemory();
  case VPBranchOnMaskSC:
  case VPDerivedIVSC:
  case VPFirstOrderRecurrencePHISC:
  case VPReductionPHISC:
  case VPPredInstPHISC:
  case VPScalarIVStepsSC:
  case VPWidenStoreEVLSC:
  case VPWidenStoreSC:
    return false;
  case VPBlendSC:
  case VPReductionEVLSC:
  case VPReductionSC:
  case VPVectorPointerSC:
  case VPWidenCanonicalIVSC:
  case VPWidenCastSC:
  case VPWidenGEPSC:
  case VPWidenIntOrFpInductionSC:
  case VPWidenPHISC:
  case VPWidenPointerInductionSC:
  case VPWidenSC: {
    // These recipe kinds never read; in debug builds cross-check against the
    // underlying IR instruction, if one is attached.
    const Instruction *I =
        dyn_cast_or_null<Instruction>(Val: getVPSingleValue()->getUnderlyingValue());
    (void)I;
    assert((!I || !I->mayReadFromMemory()) &&
           "underlying instruction may read from memory");
    return false;
  }
  default:
    // FIXME: Return false if the recipe represents an interleaved store.
    return true;
  }
}
| 161 | |
| 162 | bool VPRecipeBase::mayHaveSideEffects() const { |
| 163 | switch (getVPRecipeID()) { |
| 164 | case VPExpressionSC: |
| 165 | return cast<VPExpressionRecipe>(Val: this)->mayHaveSideEffects(); |
| 166 | case VPActiveLaneMaskPHISC: |
| 167 | case VPDerivedIVSC: |
| 168 | case VPFirstOrderRecurrencePHISC: |
| 169 | case VPReductionPHISC: |
| 170 | case VPPredInstPHISC: |
| 171 | case VPVectorEndPointerSC: |
| 172 | return false; |
| 173 | case VPInstructionSC: { |
| 174 | auto *VPI = cast<VPInstruction>(Val: this); |
| 175 | return mayWriteToMemory() || |
| 176 | VPI->getOpcode() == VPInstruction::BranchOnCount || |
| 177 | VPI->getOpcode() == VPInstruction::BranchOnCond || |
| 178 | VPI->getOpcode() == VPInstruction::BranchOnTwoConds; |
| 179 | } |
| 180 | case VPWidenCallSC: { |
| 181 | Function *Fn = cast<VPWidenCallRecipe>(Val: this)->getCalledScalarFunction(); |
| 182 | return mayWriteToMemory() || !Fn->doesNotThrow() || !Fn->willReturn(); |
| 183 | } |
| 184 | case VPWidenIntrinsicSC: |
| 185 | return cast<VPWidenIntrinsicRecipe>(Val: this)->mayHaveSideEffects(); |
| 186 | case VPBlendSC: |
| 187 | case VPReductionEVLSC: |
| 188 | case VPReductionSC: |
| 189 | case VPScalarIVStepsSC: |
| 190 | case VPVectorPointerSC: |
| 191 | case VPWidenCanonicalIVSC: |
| 192 | case VPWidenCastSC: |
| 193 | case VPWidenGEPSC: |
| 194 | case VPWidenIntOrFpInductionSC: |
| 195 | case VPWidenPHISC: |
| 196 | case VPWidenPointerInductionSC: |
| 197 | case VPWidenSC: { |
| 198 | const Instruction *I = |
| 199 | dyn_cast_or_null<Instruction>(Val: getVPSingleValue()->getUnderlyingValue()); |
| 200 | (void)I; |
| 201 | assert((!I || !I->mayHaveSideEffects()) && |
| 202 | "underlying instruction has side-effects" ); |
| 203 | return false; |
| 204 | } |
| 205 | case VPInterleaveEVLSC: |
| 206 | case VPInterleaveSC: |
| 207 | return mayWriteToMemory(); |
| 208 | case VPWidenLoadEVLSC: |
| 209 | case VPWidenLoadSC: |
| 210 | case VPWidenStoreEVLSC: |
| 211 | case VPWidenStoreSC: |
| 212 | assert( |
| 213 | cast<VPWidenMemoryRecipe>(this)->getIngredient().mayHaveSideEffects() == |
| 214 | mayWriteToMemory() && |
| 215 | "mayHaveSideffects result for ingredient differs from this " |
| 216 | "implementation" ); |
| 217 | return mayWriteToMemory(); |
| 218 | case VPReplicateSC: { |
| 219 | auto *R = cast<VPReplicateRecipe>(Val: this); |
| 220 | return R->getUnderlyingInstr()->mayHaveSideEffects(); |
| 221 | } |
| 222 | default: |
| 223 | return true; |
| 224 | } |
| 225 | } |
| 226 | |
| 227 | void VPRecipeBase::insertBefore(VPRecipeBase *InsertPos) { |
| 228 | assert(!Parent && "Recipe already in some VPBasicBlock" ); |
| 229 | assert(InsertPos->getParent() && |
| 230 | "Insertion position not in any VPBasicBlock" ); |
| 231 | InsertPos->getParent()->insert(Recipe: this, InsertPt: InsertPos->getIterator()); |
| 232 | } |
| 233 | |
| 234 | void VPRecipeBase::insertBefore(VPBasicBlock &BB, |
| 235 | iplist<VPRecipeBase>::iterator I) { |
| 236 | assert(!Parent && "Recipe already in some VPBasicBlock" ); |
| 237 | assert(I == BB.end() || I->getParent() == &BB); |
| 238 | BB.insert(Recipe: this, InsertPt: I); |
| 239 | } |
| 240 | |
| 241 | void VPRecipeBase::insertAfter(VPRecipeBase *InsertPos) { |
| 242 | assert(!Parent && "Recipe already in some VPBasicBlock" ); |
| 243 | assert(InsertPos->getParent() && |
| 244 | "Insertion position not in any VPBasicBlock" ); |
| 245 | InsertPos->getParent()->insert(Recipe: this, InsertPt: std::next(x: InsertPos->getIterator())); |
| 246 | } |
| 247 | |
| 248 | void VPRecipeBase::removeFromParent() { |
| 249 | assert(getParent() && "Recipe not in any VPBasicBlock" ); |
| 250 | getParent()->getRecipeList().remove(IT: getIterator()); |
| 251 | Parent = nullptr; |
| 252 | } |
| 253 | |
| 254 | iplist<VPRecipeBase>::iterator VPRecipeBase::eraseFromParent() { |
| 255 | assert(getParent() && "Recipe not in any VPBasicBlock" ); |
| 256 | return getParent()->getRecipeList().erase(where: getIterator()); |
| 257 | } |
| 258 | |
| 259 | void VPRecipeBase::moveAfter(VPRecipeBase *InsertPos) { |
| 260 | removeFromParent(); |
| 261 | insertAfter(InsertPos); |
| 262 | } |
| 263 | |
| 264 | void VPRecipeBase::moveBefore(VPBasicBlock &BB, |
| 265 | iplist<VPRecipeBase>::iterator I) { |
| 266 | removeFromParent(); |
| 267 | insertBefore(BB, I); |
| 268 | } |
| 269 | |
/// Compute (and debug-print) the cost of this recipe for vectorization factor
/// \p VF, honoring the cost model's skip set and the forced per-instruction
/// cost override.
InstructionCost VPRecipeBase::cost(ElementCount VF, VPCostContext &Ctx) {
  // Get the underlying instruction for the recipe, if there is one. It is used
  // to
  // * decide if cost computation should be skipped for this recipe,
  // * apply forced target instruction cost.
  Instruction *UI = nullptr;
  if (auto *S = dyn_cast<VPSingleDefRecipe>(Val: this))
    UI = dyn_cast_or_null<Instruction>(Val: S->getUnderlyingValue());
  else if (auto *IG = dyn_cast<VPInterleaveBase>(Val: this))
    UI = IG->getInsertPos();
  else if (auto *WidenMem = dyn_cast<VPWidenMemoryRecipe>(Val: this))
    UI = &WidenMem->getIngredient();

  InstructionCost RecipeCost;
  if (UI && Ctx.skipCostComputation(UI, IsVector: VF.isVector())) {
    RecipeCost = 0;
  } else {
    RecipeCost = computeCost(VF, Ctx);
    // ForceTargetInstructionCost is a command-line override (defined elsewhere
    // in this file / translation unit); it only applies to recipes that have
    // an underlying instruction, others are forced to zero.
    if (ForceTargetInstructionCost.getNumOccurrences() > 0 &&
        RecipeCost.isValid()) {
      if (UI)
        RecipeCost = InstructionCost(ForceTargetInstructionCost);
      else
        RecipeCost = InstructionCost(0);
    }
  }

  LLVM_DEBUG({
    dbgs() << "Cost of " << RecipeCost << " for VF " << VF << ": ";
    dump();
  });
  return RecipeCost;
}
| 303 | |
/// Base implementation of the per-recipe cost hook; every concrete recipe
/// kind whose cost is queried must override this.
InstructionCost VPRecipeBase::computeCost(ElementCount VF,
                                          VPCostContext &Ctx) const {
  llvm_unreachable("subclasses should implement computeCost");
}
| 308 | |
| 309 | bool VPRecipeBase::isPhi() const { |
| 310 | return (getVPRecipeID() >= VPFirstPHISC && getVPRecipeID() <= VPLastPHISC) || |
| 311 | isa<VPPhi, VPIRPhi>(Val: this); |
| 312 | } |
| 313 | |
| 314 | bool VPRecipeBase::isScalarCast() const { |
| 315 | auto *VPI = dyn_cast<VPInstruction>(Val: this); |
| 316 | return VPI && Instruction::isCast(Opcode: VPI->getOpcode()); |
| 317 | } |
| 318 | |
/// Intersect this recipe's IR flags with \p Other's, keeping only the flags
/// that are set in both. Both must have the same operation type; properties
/// that cannot legally differ (compare predicates, reduction kind/ordering)
/// are asserted equal rather than merged.
void VPIRFlags::intersectFlags(const VPIRFlags &Other) {
  assert(OpType == Other.OpType && "OpType must match");
  switch (OpType) {
  case OperationType::OverflowingBinOp:
    // Keep nuw/nsw only if present on both sides.
    WrapFlags.HasNUW &= Other.WrapFlags.HasNUW;
    WrapFlags.HasNSW &= Other.WrapFlags.HasNSW;
    break;
  case OperationType::Trunc:
    TruncFlags.HasNUW &= Other.TruncFlags.HasNUW;
    TruncFlags.HasNSW &= Other.TruncFlags.HasNSW;
    break;
  case OperationType::DisjointOp:
    DisjointFlags.IsDisjoint &= Other.DisjointFlags.IsDisjoint;
    break;
  case OperationType::PossiblyExactOp:
    ExactFlags.IsExact &= Other.ExactFlags.IsExact;
    break;
  case OperationType::GEPOp:
    GEPFlags &= Other.GEPFlags;
    break;
  case OperationType::FPMathOp:
  case OperationType::FCmp:
    // FCmp additionally carries a predicate that must already agree.
    assert((OpType != OperationType::FCmp ||
            FCmpFlags.Pred == Other.FCmpFlags.Pred) &&
           "Cannot drop CmpPredicate");
    // Note: only nnan/ninf are intersected here; the remaining fast-math
    // flags are left unchanged.
    getFMFsRef().NoNaNs &= Other.getFMFsRef().NoNaNs;
    getFMFsRef().NoInfs &= Other.getFMFsRef().NoInfs;
    break;
  case OperationType::NonNegOp:
    NonNegFlags.NonNeg &= Other.NonNegFlags.NonNeg;
    break;
  case OperationType::Cmp:
    // Integer compares have no mergeable flags; predicates must agree.
    assert(CmpPredicate == Other.CmpPredicate && "Cannot drop CmpPredicate");
    break;
  case OperationType::ReductionOp:
    // Reduction identity (kind/ordered/in-loop) must match; only the nnan and
    // ninf fast-math flags are intersected.
    assert(ReductionFlags.Kind == Other.ReductionFlags.Kind &&
           "Cannot change RecurKind");
    assert(ReductionFlags.IsOrdered == Other.ReductionFlags.IsOrdered &&
           "Cannot change IsOrdered");
    assert(ReductionFlags.IsInLoop == Other.ReductionFlags.IsInLoop &&
           "Cannot change IsInLoop");
    getFMFsRef().NoNaNs &= Other.getFMFsRef().NoNaNs;
    getFMFsRef().NoInfs &= Other.getFMFsRef().NoInfs;
    break;
  case OperationType::Other:
    // Untyped flag payloads cannot be merged, only verified identical.
    assert(AllFlags == Other.AllFlags && "Cannot drop other flags");
    break;
  }
}
| 368 | |
| 369 | FastMathFlags VPIRFlags::getFastMathFlags() const { |
| 370 | assert((OpType == OperationType::FPMathOp || OpType == OperationType::FCmp || |
| 371 | OpType == OperationType::ReductionOp || |
| 372 | OpType == OperationType::Other) && |
| 373 | "recipe doesn't have fast math flags" ); |
| 374 | if (OpType == OperationType::Other) |
| 375 | return FastMathFlags(); |
| 376 | const FastMathFlagsTy &F = getFMFsRef(); |
| 377 | FastMathFlags Res; |
| 378 | Res.setAllowReassoc(F.AllowReassoc); |
| 379 | Res.setNoNaNs(F.NoNaNs); |
| 380 | Res.setNoInfs(F.NoInfs); |
| 381 | Res.setNoSignedZeros(F.NoSignedZeros); |
| 382 | Res.setAllowReciprocal(F.AllowReciprocal); |
| 383 | Res.setAllowContract(F.AllowContract); |
| 384 | Res.setApproxFunc(F.ApproxFunc); |
| 385 | return Res; |
| 386 | } |
| 387 | |
| 388 | #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) |
// Debug dump: simply forwards to the base-class implementation.
void VPSingleDefRecipe::dump() const { VPRecipeBase::dump(); }
| 390 | |
/// Print this recipe to \p O: the recipe-specific body first, then a trailing
/// "!dbg" debug location (if present) and any attached IR metadata.
void VPRecipeBase::print(raw_ostream &O, const Twine &Indent,
                         VPSlotTracker &SlotTracker) const {
  printRecipe(O, Indent, SlotTracker);
  if (auto DL = getDebugLoc()) {
    O << ", !dbg ";
    DL.print(O);
  }

  // Recipes that also derive from VPIRMetadata print their metadata last.
  if (auto *Metadata = dyn_cast<VPIRMetadata>(this))
    Metadata->print(O, SlotTracker);
}
| 402 | #endif |
| 403 | |
/// Return \p U's operand at index PartOpIdx (the unroll-part operand) if \p U
/// has exactly PartOpIdx + 1 operands, i.e. the part operand was appended;
/// otherwise return nullptr.
template <unsigned PartOpIdx>
VPValue *
VPUnrollPartAccessor<PartOpIdx>::getUnrollPartOperand(const VPUser &U) const {
  if (U.getNumOperands() == PartOpIdx + 1)
    return U.getOperand(N: PartOpIdx);
  return nullptr;
}
| 411 | |
/// Return the unroll part encoded in \p U's unroll-part operand (a constant
/// int), or 0 if \p U carries no such operand.
template <unsigned PartOpIdx>
unsigned VPUnrollPartAccessor<PartOpIdx>::getUnrollPart(const VPUser &U) const {
  if (auto *UnrollPartOp = getUnrollPartOperand(U))
    return cast<VPConstantInt>(UnrollPartOp)->getZExtValue();
  return 0;
}
| 418 | |
namespace llvm {
// Explicit instantiations for the unroll-part operand indices used by the
// recipe classes in this file.
template class VPUnrollPartAccessor<1>;
template class VPUnrollPartAccessor<2>;
template class VPUnrollPartAccessor<3>;
}
| 424 | |
/// Construct a VPInstruction with the given opcode, operand list, IR flags,
/// metadata, debug location and (optional) name for the generated IR value.
/// Debug-build asserts verify the flags and operand count are consistent
/// with the opcode.
VPInstruction::VPInstruction(unsigned Opcode, ArrayRef<VPValue *> Operands,
                             const VPIRFlags &Flags, const VPIRMetadata &MD,
                             DebugLoc DL, const Twine &Name)
    : VPRecipeWithIRFlags(VPRecipeBase::VPInstructionSC, Operands, Flags, DL),
      VPIRMetadata(MD), Opcode(Opcode), Name(Name.str()) {
  assert(flagsValidForOpcode(getOpcode()) &&
         "Set flags not supported for the provided opcode");
  assert(hasRequiredFlagsForOpcode(getOpcode()) &&
         "Opcode requires specific flags to be set");
  // getNumOperandsForOpcode() returns -1u when the count cannot be derived
  // from the opcode alone; otherwise it must match the actual operand count.
  assert((getNumOperandsForOpcode() == -1u ||
          getNumOperandsForOpcode() == getNumOperands()) &&
         "number of operands does not match opcode");
}
| 438 | |
/// Return the number of operands determined by this instruction's opcode, or
/// -1u when the opcode alone does not fix the operand count (variadic or
/// context-dependent opcodes).
unsigned VPInstruction::getNumOperandsForOpcode() const {
  // All IR unary ops and casts take a single operand.
  if (Instruction::isUnaryOp(Opcode) || Instruction::isCast(Opcode))
    return 1;

  // All IR binary ops take two operands.
  if (Instruction::isBinaryOp(Opcode))
    return 2;

  switch (Opcode) {
  case VPInstruction::StepVector:
  case VPInstruction::VScale:
    return 0;
  case Instruction::Alloca:
  case Instruction::ExtractValue:
  case Instruction::Freeze:
  case Instruction::Load:
  case VPInstruction::BranchOnCond:
  case VPInstruction::Broadcast:
  case VPInstruction::ExplicitVectorLength:
  case VPInstruction::ExtractLastLane:
  case VPInstruction::ExtractLastPart:
  case VPInstruction::ExtractPenultimateElement:
  case VPInstruction::Not:
  case VPInstruction::ResumeForEpilogue:
  case VPInstruction::Reverse:
  case VPInstruction::Unpack:
    return 1;
  case Instruction::ICmp:
  case Instruction::FCmp:
  case Instruction::ExtractElement:
  case Instruction::Store:
  case VPInstruction::BranchOnCount:
  case VPInstruction::BranchOnTwoConds:
  case VPInstruction::ExitingIVValue:
  case VPInstruction::FirstOrderRecurrenceSplice:
  case VPInstruction::LogicalAnd:
  case VPInstruction::LogicalOr:
  case VPInstruction::PtrAdd:
  case VPInstruction::WidePtrAdd:
  case VPInstruction::WideIVStep:
  case VPInstruction::CalculateTripCountMinusVF:
    return 2;
  case Instruction::Select:
  case VPInstruction::ActiveLaneMask:
  case VPInstruction::ReductionStartVector:
  case VPInstruction::ExtractLastActive:
    return 3;
  case Instruction::Call: {
    // For unmasked calls, the last argument will the called function. Use that
    // to compute the number of operands without the mask.
    VPValue *LastOp = getOperand(N: getNumOperands() - 1);
    if (isa<VPIRValue>(Val: LastOp) && isa<Function>(Val: LastOp->getLiveInIRValue()))
      return getNumOperands();
    // Otherwise the last operand is a mask; exclude it from the count.
    return getNumOperands() - 1;
  }
  case Instruction::GetElementPtr:
  case Instruction::PHI:
  case Instruction::Switch:
  case VPInstruction::AnyOf:
  case VPInstruction::BuildStructVector:
  case VPInstruction::BuildVector:
  case VPInstruction::CanonicalIVIncrementForPart:
  case VPInstruction::ComputeAnyOfResult:
  case VPInstruction::ComputeReductionResult:
  case VPInstruction::FirstActiveLane:
  case VPInstruction::LastActiveLane:
  case VPInstruction::SLPLoad:
  case VPInstruction::SLPStore:
  case VPInstruction::ExtractLane:
    // Cannot determine the number of operands from the opcode.
    return -1u;
  }
  llvm_unreachable("all cases should be handled above");
}
| 512 | |
| 513 | bool VPInstruction::doesGeneratePerAllLanes() const { |
| 514 | return Opcode == VPInstruction::PtrAdd && !vputils::onlyFirstLaneUsed(Def: this); |
| 515 | } |
| 516 | |
/// Return true if this instruction can produce a scalar value for the first
/// lane only (instead of a full vector), which holds for IR unary/binary/cast
/// opcodes, single-scalar or vector-to-scalar instructions, and the opcodes
/// listed below.
bool VPInstruction::canGenerateScalarForFirstLane() const {
  if (Instruction::isBinaryOp(Opcode: getOpcode()) || Instruction::isCast(Opcode: getOpcode()))
    return true;
  if (isSingleScalar() || isVectorToScalar())
    return true;
  switch (Opcode) {
  case Instruction::Freeze:
  case Instruction::ICmp:
  case Instruction::PHI:
  case Instruction::Select:
  case VPInstruction::BranchOnCond:
  case VPInstruction::BranchOnTwoConds:
  case VPInstruction::BranchOnCount:
  case VPInstruction::CalculateTripCountMinusVF:
  case VPInstruction::CanonicalIVIncrementForPart:
  case VPInstruction::PtrAdd:
  case VPInstruction::ExplicitVectorLength:
  case VPInstruction::AnyOf:
  case VPInstruction::Not:
    return true;
  default:
    // All other opcodes require full vector values.
    return false;
  }
}
| 541 | |
| 542 | Value *VPInstruction::generate(VPTransformState &State) { |
| 543 | IRBuilderBase &Builder = State.Builder; |
| 544 | |
| 545 | if (Instruction::isBinaryOp(Opcode: getOpcode())) { |
| 546 | bool OnlyFirstLaneUsed = vputils::onlyFirstLaneUsed(Def: this); |
| 547 | Value *A = State.get(Def: getOperand(N: 0), IsScalar: OnlyFirstLaneUsed); |
| 548 | Value *B = State.get(Def: getOperand(N: 1), IsScalar: OnlyFirstLaneUsed); |
| 549 | auto *Res = |
| 550 | Builder.CreateBinOp(Opc: (Instruction::BinaryOps)getOpcode(), LHS: A, RHS: B, Name); |
| 551 | if (auto *I = dyn_cast<Instruction>(Val: Res)) |
| 552 | applyFlags(I&: *I); |
| 553 | return Res; |
| 554 | } |
| 555 | |
| 556 | switch (getOpcode()) { |
| 557 | case VPInstruction::Not: { |
| 558 | bool OnlyFirstLaneUsed = vputils::onlyFirstLaneUsed(Def: this); |
| 559 | Value *A = State.get(Def: getOperand(N: 0), IsScalar: OnlyFirstLaneUsed); |
| 560 | return Builder.CreateNot(V: A, Name); |
| 561 | } |
| 562 | case Instruction::ExtractElement: { |
| 563 | assert(State.VF.isVector() && "Only extract elements from vectors" ); |
| 564 | if (auto *Idx = dyn_cast<VPConstantInt>(Val: getOperand(N: 1))) |
| 565 | return State.get(Def: getOperand(N: 0), Lane: VPLane(Idx->getZExtValue())); |
| 566 | Value *Vec = State.get(Def: getOperand(N: 0)); |
| 567 | Value *Idx = State.get(Def: getOperand(N: 1), /*IsScalar=*/true); |
| 568 | return Builder.CreateExtractElement(Vec, Idx, Name); |
| 569 | } |
| 570 | case Instruction::Freeze: { |
| 571 | Value *Op = State.get(Def: getOperand(N: 0), IsScalar: vputils::onlyFirstLaneUsed(Def: this)); |
| 572 | return Builder.CreateFreeze(V: Op, Name); |
| 573 | } |
| 574 | case Instruction::FCmp: |
| 575 | case Instruction::ICmp: { |
| 576 | bool OnlyFirstLaneUsed = vputils::onlyFirstLaneUsed(Def: this); |
| 577 | Value *A = State.get(Def: getOperand(N: 0), IsScalar: OnlyFirstLaneUsed); |
| 578 | Value *B = State.get(Def: getOperand(N: 1), IsScalar: OnlyFirstLaneUsed); |
| 579 | return Builder.CreateCmp(Pred: getPredicate(), LHS: A, RHS: B, Name); |
| 580 | } |
| 581 | case Instruction::PHI: { |
| 582 | llvm_unreachable("should be handled by VPPhi::execute" ); |
| 583 | } |
| 584 | case Instruction::Select: { |
| 585 | bool OnlyFirstLaneUsed = vputils::onlyFirstLaneUsed(Def: this); |
| 586 | Value *Cond = |
| 587 | State.get(Def: getOperand(N: 0), |
| 588 | IsScalar: OnlyFirstLaneUsed || vputils::isSingleScalar(VPV: getOperand(N: 0))); |
| 589 | Value *Op1 = State.get(Def: getOperand(N: 1), IsScalar: OnlyFirstLaneUsed); |
| 590 | Value *Op2 = State.get(Def: getOperand(N: 2), IsScalar: OnlyFirstLaneUsed); |
| 591 | return Builder.CreateSelectFMF(C: Cond, True: Op1, False: Op2, FMFSource: getFastMathFlags(), Name); |
| 592 | } |
| 593 | case VPInstruction::ActiveLaneMask: { |
| 594 | // Get first lane of vector induction variable. |
| 595 | Value *VIVElem0 = State.get(Def: getOperand(N: 0), Lane: VPLane(0)); |
| 596 | // Get the original loop tripcount. |
| 597 | Value *ScalarTC = State.get(Def: getOperand(N: 1), Lane: VPLane(0)); |
| 598 | |
| 599 | // If this part of the active lane mask is scalar, generate the CMP directly |
| 600 | // to avoid unnecessary extracts. |
| 601 | if (State.VF.isScalar()) |
| 602 | return Builder.CreateCmp(Pred: CmpInst::Predicate::ICMP_ULT, LHS: VIVElem0, RHS: ScalarTC, |
| 603 | Name); |
| 604 | |
| 605 | ElementCount EC = State.VF.multiplyCoefficientBy( |
| 606 | RHS: cast<VPConstantInt>(Val: getOperand(N: 2))->getZExtValue()); |
| 607 | auto *PredTy = VectorType::get(ElementType: Builder.getInt1Ty(), EC); |
| 608 | return Builder.CreateIntrinsic(ID: Intrinsic::get_active_lane_mask, |
| 609 | Types: {PredTy, ScalarTC->getType()}, |
| 610 | Args: {VIVElem0, ScalarTC}, FMFSource: nullptr, Name); |
| 611 | } |
| 612 | case VPInstruction::FirstOrderRecurrenceSplice: { |
| 613 | // Generate code to combine the previous and current values in vector v3. |
| 614 | // |
| 615 | // vector.ph: |
| 616 | // v_init = vector(..., ..., ..., a[-1]) |
| 617 | // br vector.body |
| 618 | // |
| 619 | // vector.body |
| 620 | // i = phi [0, vector.ph], [i+4, vector.body] |
| 621 | // v1 = phi [v_init, vector.ph], [v2, vector.body] |
| 622 | // v2 = a[i, i+1, i+2, i+3]; |
| 623 | // v3 = vector(v1(3), v2(0, 1, 2)) |
| 624 | |
| 625 | auto *V1 = State.get(Def: getOperand(N: 0)); |
| 626 | if (!V1->getType()->isVectorTy()) |
| 627 | return V1; |
| 628 | Value *V2 = State.get(Def: getOperand(N: 1)); |
| 629 | return Builder.CreateVectorSpliceRight(V1, V2, Offset: 1, Name); |
| 630 | } |
| 631 | case VPInstruction::CalculateTripCountMinusVF: { |
| 632 | Value *ScalarTC = State.get(Def: getOperand(N: 0), Lane: VPLane(0)); |
| 633 | Value *VFxUF = State.get(Def: getOperand(N: 1), Lane: VPLane(0)); |
| 634 | Value *Sub = Builder.CreateSub(LHS: ScalarTC, RHS: VFxUF); |
| 635 | Value *Cmp = |
| 636 | Builder.CreateICmp(P: CmpInst::Predicate::ICMP_UGT, LHS: ScalarTC, RHS: VFxUF); |
| 637 | Value *Zero = ConstantInt::getNullValue(Ty: ScalarTC->getType()); |
| 638 | return Builder.CreateSelect(C: Cmp, True: Sub, False: Zero); |
| 639 | } |
| 640 | case VPInstruction::ExplicitVectorLength: { |
| 641 | // TODO: Restructure this code with an explicit remainder loop, vsetvli can |
| 642 | // be outside of the main loop. |
| 643 | Value *AVL = State.get(Def: getOperand(N: 0), /*IsScalar*/ true); |
| 644 | // Compute EVL |
| 645 | assert(AVL->getType()->isIntegerTy() && |
| 646 | "Requested vector length should be an integer." ); |
| 647 | |
| 648 | assert(State.VF.isScalable() && "Expected scalable vector factor." ); |
| 649 | Value *VFArg = Builder.getInt32(C: State.VF.getKnownMinValue()); |
| 650 | |
| 651 | Value *EVL = Builder.CreateIntrinsic( |
| 652 | RetTy: Builder.getInt32Ty(), ID: Intrinsic::experimental_get_vector_length, |
| 653 | Args: {AVL, VFArg, Builder.getTrue()}); |
| 654 | return EVL; |
| 655 | } |
| 656 | case VPInstruction::CanonicalIVIncrementForPart: { |
| 657 | auto *IV = State.get(Def: getOperand(N: 0), Lane: VPLane(0)); |
| 658 | auto *VFxPart = State.get(Def: getOperand(N: 1), Lane: VPLane(0)); |
| 659 | // The canonical IV is incremented by the vectorization factor (num of |
| 660 | // SIMD elements) times the unroll part. |
| 661 | return Builder.CreateAdd(LHS: IV, RHS: VFxPart, Name, HasNUW: hasNoUnsignedWrap(), |
| 662 | HasNSW: hasNoSignedWrap()); |
| 663 | } |
| 664 | case VPInstruction::BranchOnCond: { |
| 665 | Value *Cond = State.get(Def: getOperand(N: 0), Lane: VPLane(0)); |
| 666 | // Replace the temporary unreachable terminator with a new conditional |
| 667 | // branch, hooking it up to backward destination for latch blocks now, and |
| 668 | // to forward destination(s) later when they are created. |
| 669 | // Second successor may be backwards - iff it is already in VPBB2IRBB. |
| 670 | VPBasicBlock *SecondVPSucc = |
| 671 | cast<VPBasicBlock>(Val: getParent()->getSuccessors()[1]); |
| 672 | BasicBlock *SecondIRSucc = State.CFG.VPBB2IRBB.lookup(Val: SecondVPSucc); |
| 673 | BasicBlock *IRBB = State.CFG.VPBB2IRBB[getParent()]; |
| 674 | auto *Br = Builder.CreateCondBr(Cond, True: IRBB, False: SecondIRSucc); |
| 675 | // First successor is always forward, reset it to nullptr. |
| 676 | Br->setSuccessor(idx: 0, NewSucc: nullptr); |
| 677 | IRBB->getTerminator()->eraseFromParent(); |
| 678 | applyMetadata(I&: *Br); |
| 679 | return Br; |
| 680 | } |
| 681 | case VPInstruction::Broadcast: { |
| 682 | return Builder.CreateVectorSplat( |
| 683 | EC: State.VF, V: State.get(Def: getOperand(N: 0), /*IsScalar*/ true), Name: "broadcast" ); |
| 684 | } |
| 685 | case VPInstruction::BuildStructVector: { |
| 686 | // For struct types, we need to build a new 'wide' struct type, where each |
| 687 | // element is widened, i.e., we create a struct of vectors. |
| 688 | auto *StructTy = |
| 689 | cast<StructType>(Val: State.TypeAnalysis.inferScalarType(V: getOperand(N: 0))); |
| 690 | Value *Res = PoisonValue::get(T: toVectorizedTy(Ty: StructTy, EC: State.VF)); |
| 691 | for (const auto &[LaneIndex, Op] : enumerate(First: operands())) { |
| 692 | for (unsigned FieldIndex = 0; FieldIndex != StructTy->getNumElements(); |
| 693 | FieldIndex++) { |
| 694 | Value *ScalarValue = |
| 695 | Builder.CreateExtractValue(Agg: State.get(Def: Op, IsScalar: true), Idxs: FieldIndex); |
| 696 | Value *VectorValue = Builder.CreateExtractValue(Agg: Res, Idxs: FieldIndex); |
| 697 | VectorValue = |
| 698 | Builder.CreateInsertElement(Vec: VectorValue, NewElt: ScalarValue, Idx: LaneIndex); |
| 699 | Res = Builder.CreateInsertValue(Agg: Res, Val: VectorValue, Idxs: FieldIndex); |
| 700 | } |
| 701 | } |
| 702 | return Res; |
| 703 | } |
| 704 | case VPInstruction::BuildVector: { |
| 705 | auto *ScalarTy = State.TypeAnalysis.inferScalarType(V: getOperand(N: 0)); |
| 706 | auto NumOfElements = ElementCount::getFixed(MinVal: getNumOperands()); |
| 707 | Value *Res = PoisonValue::get(T: toVectorizedTy(Ty: ScalarTy, EC: NumOfElements)); |
| 708 | for (const auto &[Idx, Op] : enumerate(First: operands())) |
| 709 | Res = Builder.CreateInsertElement(Vec: Res, NewElt: State.get(Def: Op, IsScalar: true), |
| 710 | Idx: Builder.getInt32(C: Idx)); |
| 711 | return Res; |
| 712 | } |
| 713 | case VPInstruction::ReductionStartVector: { |
| 714 | if (State.VF.isScalar()) |
| 715 | return State.get(Def: getOperand(N: 0), IsScalar: true); |
| 716 | IRBuilderBase::FastMathFlagGuard FMFG(Builder); |
| 717 | Builder.setFastMathFlags(getFastMathFlags()); |
| 718 | // If this start vector is scaled then it should produce a vector with fewer |
| 719 | // elements than the VF. |
| 720 | ElementCount VF = State.VF.divideCoefficientBy( |
| 721 | RHS: cast<VPConstantInt>(Val: getOperand(N: 2))->getZExtValue()); |
| 722 | auto *Iden = Builder.CreateVectorSplat(EC: VF, V: State.get(Def: getOperand(N: 1), IsScalar: true)); |
| 723 | return Builder.CreateInsertElement(Vec: Iden, NewElt: State.get(Def: getOperand(N: 0), IsScalar: true), |
| 724 | Idx: Builder.getInt32(C: 0)); |
| 725 | } |
| 726 | case VPInstruction::ComputeAnyOfResult: { |
| 727 | Value *Start = State.get(Def: getOperand(N: 0), Lane: VPLane(0)); |
| 728 | Value *NewVal = State.get(Def: getOperand(N: 1), Lane: VPLane(0)); |
| 729 | Value *ReducedResult = State.get(Def: getOperand(N: 2)); |
| 730 | for (unsigned Idx = 3; Idx < getNumOperands(); ++Idx) |
| 731 | ReducedResult = |
| 732 | Builder.CreateBinOp(Opc: Instruction::Or, LHS: State.get(Def: getOperand(N: Idx)), |
| 733 | RHS: ReducedResult, Name: "bin.rdx" ); |
| 734 | // If any predicate is true it means that we want to select the new value. |
| 735 | if (ReducedResult->getType()->isVectorTy()) |
| 736 | ReducedResult = Builder.CreateOrReduce(Src: ReducedResult); |
| 737 | // The compares in the loop may yield poison, which propagates through the |
| 738 | // bitwise ORs. Freeze it here before the condition is used. |
| 739 | ReducedResult = Builder.CreateFreeze(V: ReducedResult); |
| 740 | return Builder.CreateSelect(C: ReducedResult, True: NewVal, False: Start, Name: "rdx.select" ); |
| 741 | } |
| 742 | case VPInstruction::ComputeReductionResult: { |
| 743 | RecurKind RK = getRecurKind(); |
| 744 | bool IsOrdered = isReductionOrdered(); |
| 745 | bool IsInLoop = isReductionInLoop(); |
| 746 | assert(!RecurrenceDescriptor::isFindIVRecurrenceKind(RK) && |
| 747 | "FindIV should use min/max reduction kinds" ); |
| 748 | |
| 749 | // The recipe may have multiple operands to be reduced together. |
| 750 | unsigned NumOperandsToReduce = getNumOperands(); |
| 751 | VectorParts RdxParts(NumOperandsToReduce); |
| 752 | for (unsigned Part = 0; Part < NumOperandsToReduce; ++Part) |
| 753 | RdxParts[Part] = State.get(Def: getOperand(N: Part), IsScalar: IsInLoop); |
| 754 | |
| 755 | IRBuilderBase::FastMathFlagGuard FMFG(Builder); |
| 756 | Builder.setFastMathFlags(getFastMathFlags()); |
| 757 | |
| 758 | // Reduce multiple operands into one. |
| 759 | Value *ReducedPartRdx = RdxParts[0]; |
| 760 | if (IsOrdered) { |
| 761 | ReducedPartRdx = RdxParts[NumOperandsToReduce - 1]; |
| 762 | } else { |
| 763 | // Floating-point operations should have some FMF to enable the reduction. |
| 764 | for (unsigned Part = 1; Part < NumOperandsToReduce; ++Part) { |
| 765 | Value *RdxPart = RdxParts[Part]; |
| 766 | if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind: RK)) |
| 767 | ReducedPartRdx = createMinMaxOp(Builder, RK, Left: ReducedPartRdx, Right: RdxPart); |
| 768 | else { |
| 769 | // For sub-recurrences, each part's reduction variable is already |
| 770 | // negative, we need to do: reduce.add(-acc_uf0 + -acc_uf1) |
| 771 | Instruction::BinaryOps Opcode = |
| 772 | RK == RecurKind::Sub |
| 773 | ? Instruction::Add |
| 774 | : (Instruction::BinaryOps)RecurrenceDescriptor::getOpcode(Kind: RK); |
| 775 | ReducedPartRdx = |
| 776 | Builder.CreateBinOp(Opc: Opcode, LHS: RdxPart, RHS: ReducedPartRdx, Name: "bin.rdx" ); |
| 777 | } |
| 778 | } |
| 779 | } |
| 780 | |
| 781 | // Create the reduction after the loop. Note that inloop reductions create |
| 782 | // the target reduction in the loop using a Reduction recipe. |
| 783 | if (State.VF.isVector() && !IsInLoop) { |
| 784 | // TODO: Support in-order reductions based on the recurrence descriptor. |
| 785 | // All ops in the reduction inherit fast-math-flags from the recurrence |
| 786 | // descriptor. |
| 787 | ReducedPartRdx = createSimpleReduction(B&: Builder, Src: ReducedPartRdx, RdxKind: RK); |
| 788 | } |
| 789 | |
| 790 | return ReducedPartRdx; |
| 791 | } |
| 792 | case VPInstruction::ExtractLastLane: |
| 793 | case VPInstruction::ExtractPenultimateElement: { |
| 794 | unsigned Offset = |
| 795 | getOpcode() == VPInstruction::ExtractPenultimateElement ? 2 : 1; |
| 796 | Value *Res; |
| 797 | if (State.VF.isVector()) { |
| 798 | assert(Offset <= State.VF.getKnownMinValue() && |
| 799 | "invalid offset to extract from" ); |
| 800 | // Extract lane VF - Offset from the operand. |
| 801 | Res = State.get(Def: getOperand(N: 0), Lane: VPLane::getLaneFromEnd(VF: State.VF, Offset)); |
| 802 | } else { |
| 803 | // TODO: Remove ExtractLastLane for scalar VFs. |
| 804 | assert(Offset <= 1 && "invalid offset to extract from" ); |
| 805 | Res = State.get(Def: getOperand(N: 0)); |
| 806 | } |
| 807 | if (isa<ExtractElementInst>(Val: Res)) |
| 808 | Res->setName(Name); |
| 809 | return Res; |
| 810 | } |
| 811 | case VPInstruction::LogicalAnd: { |
| 812 | Value *A = State.get(Def: getOperand(N: 0)); |
| 813 | Value *B = State.get(Def: getOperand(N: 1)); |
| 814 | return Builder.CreateLogicalAnd(Cond1: A, Cond2: B, Name); |
| 815 | } |
| 816 | case VPInstruction::LogicalOr: { |
| 817 | Value *A = State.get(Def: getOperand(N: 0)); |
| 818 | Value *B = State.get(Def: getOperand(N: 1)); |
| 819 | return Builder.CreateLogicalOr(Cond1: A, Cond2: B, Name); |
| 820 | } |
| 821 | case VPInstruction::PtrAdd: { |
| 822 | assert((State.VF.isScalar() || vputils::onlyFirstLaneUsed(this)) && |
| 823 | "can only generate first lane for PtrAdd" ); |
| 824 | Value *Ptr = State.get(Def: getOperand(N: 0), Lane: VPLane(0)); |
| 825 | Value *Addend = State.get(Def: getOperand(N: 1), Lane: VPLane(0)); |
| 826 | return Builder.CreatePtrAdd(Ptr, Offset: Addend, Name, NW: getGEPNoWrapFlags()); |
| 827 | } |
| 828 | case VPInstruction::WidePtrAdd: { |
| 829 | Value *Ptr = |
| 830 | State.get(Def: getOperand(N: 0), IsScalar: vputils::isSingleScalar(VPV: getOperand(N: 0))); |
| 831 | Value *Addend = State.get(Def: getOperand(N: 1)); |
| 832 | return Builder.CreatePtrAdd(Ptr, Offset: Addend, Name, NW: getGEPNoWrapFlags()); |
| 833 | } |
| 834 | case VPInstruction::AnyOf: { |
| 835 | Value *Res = Builder.CreateFreeze(V: State.get(Def: getOperand(N: 0))); |
| 836 | for (VPValue *Op : drop_begin(RangeOrContainer: operands())) |
| 837 | Res = Builder.CreateOr(LHS: Res, RHS: Builder.CreateFreeze(V: State.get(Def: Op))); |
| 838 | return State.VF.isScalar() ? Res : Builder.CreateOrReduce(Src: Res); |
| 839 | } |
| 840 | case VPInstruction::ExtractLane: { |
| 841 | assert(getNumOperands() != 2 && "ExtractLane from single source should be " |
| 842 | "simplified to ExtractElement." ); |
| 843 | Value * = State.get(Def: getOperand(N: 0), IsScalar: true); |
| 844 | Type *IdxTy = State.TypeAnalysis.inferScalarType(V: getOperand(N: 0)); |
| 845 | Value *Res = nullptr; |
| 846 | Value *RuntimeVF = getRuntimeVF(B&: Builder, Ty: IdxTy, VF: State.VF); |
| 847 | |
| 848 | for (unsigned Idx = 1; Idx != getNumOperands(); ++Idx) { |
| 849 | Value *VectorStart = |
| 850 | Builder.CreateMul(LHS: RuntimeVF, RHS: ConstantInt::get(Ty: IdxTy, V: Idx - 1)); |
| 851 | Value *VectorIdx = Idx == 1 |
| 852 | ? LaneToExtract |
| 853 | : Builder.CreateSub(LHS: LaneToExtract, RHS: VectorStart); |
| 854 | Value *Ext = State.VF.isScalar() |
| 855 | ? State.get(Def: getOperand(N: Idx)) |
| 856 | : Builder.CreateExtractElement( |
| 857 | Vec: State.get(Def: getOperand(N: Idx)), Idx: VectorIdx); |
| 858 | if (Res) { |
| 859 | Value *Cmp = Builder.CreateICmpUGE(LHS: LaneToExtract, RHS: VectorStart); |
| 860 | Res = Builder.CreateSelect(C: Cmp, True: Ext, False: Res); |
| 861 | } else { |
| 862 | Res = Ext; |
| 863 | } |
| 864 | } |
| 865 | return Res; |
| 866 | } |
| 867 | case VPInstruction::FirstActiveLane: { |
| 868 | if (getNumOperands() == 1) { |
| 869 | Value *Mask = State.get(Def: getOperand(N: 0)); |
| 870 | return Builder.CreateCountTrailingZeroElems(ResTy: Builder.getInt64Ty(), Mask, |
| 871 | /*ZeroIsPoison=*/false, Name); |
| 872 | } |
| 873 | // If there are multiple operands, create a chain of selects to pick the |
| 874 | // first operand with an active lane and add the number of lanes of the |
| 875 | // preceding operands. |
| 876 | Value *RuntimeVF = getRuntimeVF(B&: Builder, Ty: Builder.getInt64Ty(), VF: State.VF); |
| 877 | unsigned LastOpIdx = getNumOperands() - 1; |
| 878 | Value *Res = nullptr; |
| 879 | for (int Idx = LastOpIdx; Idx >= 0; --Idx) { |
| 880 | Value *TrailingZeros = |
| 881 | State.VF.isScalar() |
| 882 | ? Builder.CreateZExt( |
| 883 | V: Builder.CreateICmpEQ(LHS: State.get(Def: getOperand(N: Idx)), |
| 884 | RHS: Builder.getFalse()), |
| 885 | DestTy: Builder.getInt64Ty()) |
| 886 | : Builder.CreateCountTrailingZeroElems( |
| 887 | ResTy: Builder.getInt64Ty(), Mask: State.get(Def: getOperand(N: Idx)), |
| 888 | /*ZeroIsPoison=*/false, Name); |
| 889 | Value *Current = Builder.CreateAdd( |
| 890 | LHS: Builder.CreateMul(LHS: RuntimeVF, RHS: Builder.getInt64(C: Idx)), RHS: TrailingZeros); |
| 891 | if (Res) { |
| 892 | Value *Cmp = Builder.CreateICmpNE(LHS: TrailingZeros, RHS: RuntimeVF); |
| 893 | Res = Builder.CreateSelect(C: Cmp, True: Current, False: Res); |
| 894 | } else { |
| 895 | Res = Current; |
| 896 | } |
| 897 | } |
| 898 | |
| 899 | return Res; |
| 900 | } |
| 901 | case VPInstruction::ResumeForEpilogue: |
| 902 | return State.get(Def: getOperand(N: 0), IsScalar: true); |
| 903 | case VPInstruction::Reverse: |
| 904 | return Builder.CreateVectorReverse(V: State.get(Def: getOperand(N: 0)), Name: "reverse" ); |
| 905 | case VPInstruction::ExtractLastActive: { |
| 906 | Value *Data = State.get(Def: getOperand(N: 0)); |
| 907 | Value *Mask = State.get(Def: getOperand(N: 1)); |
| 908 | Value *Default = State.get(Def: getOperand(N: 2), /*IsScalar=*/true); |
| 909 | Type *VTy = Data->getType(); |
| 910 | return Builder.CreateIntrinsic( |
| 911 | ID: Intrinsic::experimental_vector_extract_last_active, Types: {VTy}, |
| 912 | Args: {Data, Mask, Default}); |
| 913 | } |
| 914 | default: |
| 915 | llvm_unreachable("Unsupported opcode for instruction" ); |
| 916 | } |
| 917 | } |
| 918 | |
| 919 | InstructionCost VPRecipeWithIRFlags::getCostForRecipeWithOpcode( |
| 920 | unsigned Opcode, ElementCount VF, VPCostContext &Ctx) const { |
| 921 | Type *ScalarTy = Ctx.Types.inferScalarType(V: this); |
| 922 | Type *ResultTy = VF.isVector() ? toVectorTy(Scalar: ScalarTy, EC: VF) : ScalarTy; |
| 923 | switch (Opcode) { |
| 924 | case Instruction::FNeg: |
| 925 | return Ctx.TTI.getArithmeticInstrCost(Opcode, Ty: ResultTy, CostKind: Ctx.CostKind); |
| 926 | case Instruction::UDiv: |
| 927 | case Instruction::SDiv: |
| 928 | case Instruction::SRem: |
| 929 | case Instruction::URem: |
| 930 | case Instruction::Add: |
| 931 | case Instruction::FAdd: |
| 932 | case Instruction::Sub: |
| 933 | case Instruction::FSub: |
| 934 | case Instruction::Mul: |
| 935 | case Instruction::FMul: |
| 936 | case Instruction::FDiv: |
| 937 | case Instruction::FRem: |
| 938 | case Instruction::Shl: |
| 939 | case Instruction::LShr: |
| 940 | case Instruction::AShr: |
| 941 | case Instruction::And: |
| 942 | case Instruction::Or: |
| 943 | case Instruction::Xor: { |
| 944 | TargetTransformInfo::OperandValueInfo RHSInfo = { |
| 945 | .Kind: TargetTransformInfo::OK_AnyValue, .Properties: TargetTransformInfo::OP_None}; |
| 946 | |
| 947 | if (VF.isVector()) { |
| 948 | // Certain instructions can be cheaper to vectorize if they have a |
| 949 | // constant second vector operand. One example of this are shifts on x86. |
| 950 | VPValue *RHS = getOperand(N: 1); |
| 951 | RHSInfo = Ctx.getOperandInfo(V: RHS); |
| 952 | |
| 953 | if (RHSInfo.Kind == TargetTransformInfo::OK_AnyValue && |
| 954 | getOperand(N: 1)->isDefinedOutsideLoopRegions()) |
| 955 | RHSInfo.Kind = TargetTransformInfo::OK_UniformValue; |
| 956 | } |
| 957 | |
| 958 | Instruction *CtxI = dyn_cast_or_null<Instruction>(Val: getUnderlyingValue()); |
| 959 | SmallVector<const Value *, 4> Operands; |
| 960 | if (CtxI) |
| 961 | Operands.append(in_start: CtxI->value_op_begin(), in_end: CtxI->value_op_end()); |
| 962 | return Ctx.TTI.getArithmeticInstrCost( |
| 963 | Opcode, Ty: ResultTy, CostKind: Ctx.CostKind, |
| 964 | Opd1Info: {.Kind: TargetTransformInfo::OK_AnyValue, .Properties: TargetTransformInfo::OP_None}, |
| 965 | Opd2Info: RHSInfo, Args: Operands, CxtI: CtxI, TLibInfo: &Ctx.TLI); |
| 966 | } |
| 967 | case Instruction::Freeze: |
| 968 | // This opcode is unknown. Assume that it is the same as 'mul'. |
| 969 | return Ctx.TTI.getArithmeticInstrCost(Opcode: Instruction::Mul, Ty: ResultTy, |
| 970 | CostKind: Ctx.CostKind); |
| 971 | case Instruction::ExtractValue: |
| 972 | return Ctx.TTI.getInsertExtractValueCost(Opcode: Instruction::ExtractValue, |
| 973 | CostKind: Ctx.CostKind); |
| 974 | case Instruction::ICmp: |
| 975 | case Instruction::FCmp: { |
| 976 | Type *ScalarOpTy = Ctx.Types.inferScalarType(V: getOperand(N: 0)); |
| 977 | Type *OpTy = VF.isVector() ? toVectorTy(Scalar: ScalarOpTy, EC: VF) : ScalarOpTy; |
| 978 | Instruction *CtxI = dyn_cast_or_null<Instruction>(Val: getUnderlyingValue()); |
| 979 | return Ctx.TTI.getCmpSelInstrCost( |
| 980 | Opcode, ValTy: OpTy, CondTy: CmpInst::makeCmpResultType(opnd_type: OpTy), VecPred: getPredicate(), |
| 981 | CostKind: Ctx.CostKind, Op1Info: {.Kind: TTI::OK_AnyValue, .Properties: TTI::OP_None}, |
| 982 | Op2Info: {.Kind: TTI::OK_AnyValue, .Properties: TTI::OP_None}, I: CtxI); |
| 983 | } |
| 984 | case Instruction::BitCast: { |
| 985 | Type *ScalarTy = Ctx.Types.inferScalarType(V: this); |
| 986 | if (ScalarTy->isPointerTy()) |
| 987 | return 0; |
| 988 | [[fallthrough]]; |
| 989 | } |
| 990 | case Instruction::SExt: |
| 991 | case Instruction::ZExt: |
| 992 | case Instruction::FPToUI: |
| 993 | case Instruction::FPToSI: |
| 994 | case Instruction::FPExt: |
| 995 | case Instruction::PtrToInt: |
| 996 | case Instruction::PtrToAddr: |
| 997 | case Instruction::IntToPtr: |
| 998 | case Instruction::SIToFP: |
| 999 | case Instruction::UIToFP: |
| 1000 | case Instruction::Trunc: |
| 1001 | case Instruction::FPTrunc: |
| 1002 | case Instruction::AddrSpaceCast: { |
| 1003 | // Computes the CastContextHint from a recipe that may access memory. |
| 1004 | auto ComputeCCH = [&](const VPRecipeBase *R) -> TTI::CastContextHint { |
| 1005 | if (isa<VPInterleaveBase>(Val: R)) |
| 1006 | return TTI::CastContextHint::Interleave; |
| 1007 | if (const auto *ReplicateRecipe = dyn_cast<VPReplicateRecipe>(Val: R)) { |
| 1008 | // Only compute CCH for memory operations, matching the legacy model |
| 1009 | // which only considers loads/stores for cast context hints. |
| 1010 | auto *UI = cast<Instruction>(Val: ReplicateRecipe->getUnderlyingValue()); |
| 1011 | if (!isa<LoadInst, StoreInst>(Val: UI)) |
| 1012 | return TTI::CastContextHint::None; |
| 1013 | return ReplicateRecipe->isPredicated() ? TTI::CastContextHint::Masked |
| 1014 | : TTI::CastContextHint::Normal; |
| 1015 | } |
| 1016 | const auto *WidenMemoryRecipe = dyn_cast<VPWidenMemoryRecipe>(Val: R); |
| 1017 | if (WidenMemoryRecipe == nullptr) |
| 1018 | return TTI::CastContextHint::None; |
| 1019 | if (VF.isScalar()) |
| 1020 | return TTI::CastContextHint::Normal; |
| 1021 | if (!WidenMemoryRecipe->isConsecutive()) |
| 1022 | return TTI::CastContextHint::GatherScatter; |
| 1023 | if (WidenMemoryRecipe->isReverse()) |
| 1024 | return TTI::CastContextHint::Reversed; |
| 1025 | if (WidenMemoryRecipe->isMasked()) |
| 1026 | return TTI::CastContextHint::Masked; |
| 1027 | return TTI::CastContextHint::Normal; |
| 1028 | }; |
| 1029 | |
| 1030 | VPValue *Operand = getOperand(N: 0); |
| 1031 | TTI::CastContextHint CCH = TTI::CastContextHint::None; |
| 1032 | // For Trunc/FPTrunc, get the context from the only user. |
| 1033 | if (Opcode == Instruction::Trunc || Opcode == Instruction::FPTrunc) { |
| 1034 | auto GetOnlyUser = [](const VPSingleDefRecipe *R) -> VPRecipeBase * { |
| 1035 | if (R->getNumUsers() == 0 || R->hasMoreThanOneUniqueUser()) |
| 1036 | return nullptr; |
| 1037 | return dyn_cast<VPRecipeBase>(Val: *R->user_begin()); |
| 1038 | }; |
| 1039 | if (VPRecipeBase *Recipe = GetOnlyUser(this)) { |
| 1040 | if (match(V: Recipe, P: m_Reverse(Op0: m_VPValue()))) |
| 1041 | Recipe = GetOnlyUser(cast<VPInstruction>(Val: Recipe)); |
| 1042 | if (Recipe) |
| 1043 | CCH = ComputeCCH(Recipe); |
| 1044 | } |
| 1045 | } |
| 1046 | // For Z/Sext, get the context from the operand. |
| 1047 | else if (Opcode == Instruction::ZExt || Opcode == Instruction::SExt || |
| 1048 | Opcode == Instruction::FPExt) { |
| 1049 | if (auto *Recipe = Operand->getDefiningRecipe()) { |
| 1050 | VPValue *ReverseOp; |
| 1051 | if (match(V: Recipe, P: m_Reverse(Op0: m_VPValue(V&: ReverseOp)))) |
| 1052 | Recipe = ReverseOp->getDefiningRecipe(); |
| 1053 | if (Recipe) |
| 1054 | CCH = ComputeCCH(Recipe); |
| 1055 | } |
| 1056 | } |
| 1057 | |
| 1058 | auto *ScalarSrcTy = Ctx.Types.inferScalarType(V: Operand); |
| 1059 | Type *SrcTy = VF.isVector() ? toVectorTy(Scalar: ScalarSrcTy, EC: VF) : ScalarSrcTy; |
| 1060 | // Arm TTI will use the underlying instruction to determine the cost. |
| 1061 | return Ctx.TTI.getCastInstrCost( |
| 1062 | Opcode, Dst: ResultTy, Src: SrcTy, CCH, CostKind: Ctx.CostKind, |
| 1063 | I: dyn_cast_if_present<Instruction>(Val: getUnderlyingValue())); |
| 1064 | } |
| 1065 | case Instruction::Select: { |
| 1066 | SelectInst *SI = cast_or_null<SelectInst>(Val: getUnderlyingValue()); |
| 1067 | bool IsScalarCond = getOperand(N: 0)->isDefinedOutsideLoopRegions(); |
| 1068 | Type *ScalarTy = Ctx.Types.inferScalarType(V: this); |
| 1069 | |
| 1070 | VPValue *Op0, *Op1; |
| 1071 | bool IsLogicalAnd = |
| 1072 | match(V: this, P: m_c_LogicalAnd(Op0: m_VPValue(V&: Op0), Op1: m_VPValue(V&: Op1))); |
| 1073 | bool IsLogicalOr = |
| 1074 | match(V: this, P: m_c_LogicalOr(Op0: m_VPValue(V&: Op0), Op1: m_VPValue(V&: Op1))); |
| 1075 | // Also match the inverted forms: |
| 1076 | // select x, false, y --> !x & y (still AND) |
| 1077 | // select x, y, true --> !x | y (still OR) |
| 1078 | IsLogicalAnd |= |
| 1079 | match(V: this, P: m_Select(Op0: m_VPValue(V&: Op0), Op1: m_False(), Op2: m_VPValue(V&: Op1))); |
| 1080 | IsLogicalOr |= |
| 1081 | match(V: this, P: m_Select(Op0: m_VPValue(V&: Op0), Op1: m_VPValue(V&: Op1), Op2: m_True())); |
| 1082 | |
| 1083 | if (!IsScalarCond && ScalarTy->getScalarSizeInBits() == 1 && |
| 1084 | (IsLogicalAnd || IsLogicalOr)) { |
| 1085 | // select x, y, false --> x & y |
| 1086 | // select x, true, y --> x | y |
| 1087 | const auto [Op1VK, Op1VP] = Ctx.getOperandInfo(V: Op0); |
| 1088 | const auto [Op2VK, Op2VP] = Ctx.getOperandInfo(V: Op1); |
| 1089 | |
| 1090 | SmallVector<const Value *, 2> Operands; |
| 1091 | if (SI && all_of(Range: operands(), |
| 1092 | P: [](VPValue *Op) { return Op->getUnderlyingValue(); })) |
| 1093 | append_range(C&: Operands, R: SI->operands()); |
| 1094 | return Ctx.TTI.getArithmeticInstrCost( |
| 1095 | Opcode: IsLogicalOr ? Instruction::Or : Instruction::And, Ty: ResultTy, |
| 1096 | CostKind: Ctx.CostKind, Opd1Info: {.Kind: Op1VK, .Properties: Op1VP}, Opd2Info: {.Kind: Op2VK, .Properties: Op2VP}, Args: Operands, CxtI: SI); |
| 1097 | } |
| 1098 | |
| 1099 | Type *CondTy = Ctx.Types.inferScalarType(V: getOperand(N: 0)); |
| 1100 | if (!IsScalarCond) |
| 1101 | CondTy = VectorType::get(ElementType: CondTy, EC: VF); |
| 1102 | |
| 1103 | llvm::CmpPredicate Pred; |
| 1104 | if (!match(V: getOperand(N: 0), P: m_Cmp(Pred, Op0: m_VPValue(), Op1: m_VPValue()))) |
| 1105 | if (auto *CondIRV = dyn_cast<VPIRValue>(Val: getOperand(N: 0))) |
| 1106 | if (auto *Cmp = dyn_cast<CmpInst>(Val: CondIRV->getValue())) |
| 1107 | Pred = Cmp->getPredicate(); |
| 1108 | Type *VectorTy = toVectorTy(Scalar: Ctx.Types.inferScalarType(V: this), EC: VF); |
| 1109 | return Ctx.TTI.getCmpSelInstrCost( |
| 1110 | Opcode: Instruction::Select, ValTy: VectorTy, CondTy, VecPred: Pred, CostKind: Ctx.CostKind, |
| 1111 | Op1Info: {.Kind: TTI::OK_AnyValue, .Properties: TTI::OP_None}, Op2Info: {.Kind: TTI::OK_AnyValue, .Properties: TTI::OP_None}, I: SI); |
| 1112 | } |
| 1113 | } |
| 1114 | llvm_unreachable("called for unsupported opcode" ); |
| 1115 | } |
| 1116 | |
| 1117 | InstructionCost VPInstruction::computeCost(ElementCount VF, |
| 1118 | VPCostContext &Ctx) const { |
| 1119 | if (Instruction::isBinaryOp(Opcode: getOpcode())) { |
| 1120 | if (!getUnderlyingValue() && getOpcode() != Instruction::FMul) { |
| 1121 | // TODO: Compute cost for VPInstructions without underlying values once |
| 1122 | // the legacy cost model has been retired. |
| 1123 | return 0; |
| 1124 | } |
| 1125 | |
| 1126 | assert(!doesGeneratePerAllLanes() && |
| 1127 | "Should only generate a vector value or single scalar, not scalars " |
| 1128 | "for all lanes." ); |
| 1129 | return getCostForRecipeWithOpcode( |
| 1130 | Opcode: getOpcode(), |
| 1131 | VF: vputils::onlyFirstLaneUsed(Def: this) ? ElementCount::getFixed(MinVal: 1) : VF, Ctx); |
| 1132 | } |
| 1133 | |
| 1134 | switch (getOpcode()) { |
| 1135 | case Instruction::Select: { |
| 1136 | llvm::CmpPredicate Pred = CmpInst::BAD_ICMP_PREDICATE; |
| 1137 | match(V: getOperand(N: 0), P: m_Cmp(Pred, Op0: m_VPValue(), Op1: m_VPValue())); |
| 1138 | auto *CondTy = Ctx.Types.inferScalarType(V: getOperand(N: 0)); |
| 1139 | auto *VecTy = Ctx.Types.inferScalarType(V: getOperand(N: 1)); |
| 1140 | if (!vputils::onlyFirstLaneUsed(Def: this)) { |
| 1141 | CondTy = toVectorTy(Scalar: CondTy, EC: VF); |
| 1142 | VecTy = toVectorTy(Scalar: VecTy, EC: VF); |
| 1143 | } |
| 1144 | return Ctx.TTI.getCmpSelInstrCost(Opcode: Instruction::Select, ValTy: VecTy, CondTy, VecPred: Pred, |
| 1145 | CostKind: Ctx.CostKind); |
| 1146 | } |
| 1147 | case Instruction::ExtractElement: |
| 1148 | case VPInstruction::ExtractLane: { |
| 1149 | if (VF.isScalar()) { |
| 1150 | // ExtractLane with VF=1 takes care of handling extracting across multiple |
| 1151 | // parts. |
| 1152 | return 0; |
| 1153 | } |
| 1154 | |
| 1155 | // Add on the cost of extracting the element. |
| 1156 | auto *VecTy = toVectorTy(Scalar: Ctx.Types.inferScalarType(V: getOperand(N: 0)), EC: VF); |
| 1157 | return Ctx.TTI.getVectorInstrCost(Opcode: Instruction::ExtractElement, Val: VecTy, |
| 1158 | CostKind: Ctx.CostKind); |
| 1159 | } |
| 1160 | case VPInstruction::AnyOf: { |
| 1161 | auto *VecTy = toVectorTy(Scalar: Ctx.Types.inferScalarType(V: this), EC: VF); |
| 1162 | return Ctx.TTI.getArithmeticReductionCost( |
| 1163 | Opcode: Instruction::Or, Ty: cast<VectorType>(Val: VecTy), FMF: std::nullopt, CostKind: Ctx.CostKind); |
| 1164 | } |
| 1165 | case VPInstruction::FirstActiveLane: { |
| 1166 | Type *ScalarTy = Ctx.Types.inferScalarType(V: getOperand(N: 0)); |
| 1167 | if (VF.isScalar()) |
| 1168 | return Ctx.TTI.getCmpSelInstrCost(Opcode: Instruction::ICmp, ValTy: ScalarTy, |
| 1169 | CondTy: CmpInst::makeCmpResultType(opnd_type: ScalarTy), |
| 1170 | VecPred: CmpInst::ICMP_EQ, CostKind: Ctx.CostKind); |
| 1171 | // Calculate the cost of determining the lane index. |
| 1172 | auto *PredTy = toVectorTy(Scalar: ScalarTy, EC: VF); |
| 1173 | IntrinsicCostAttributes Attrs(Intrinsic::experimental_cttz_elts, |
| 1174 | Type::getInt64Ty(C&: Ctx.LLVMCtx), |
| 1175 | {PredTy, Type::getInt1Ty(C&: Ctx.LLVMCtx)}); |
| 1176 | return Ctx.TTI.getIntrinsicInstrCost(ICA: Attrs, CostKind: Ctx.CostKind); |
| 1177 | } |
| 1178 | case VPInstruction::LastActiveLane: { |
| 1179 | Type *ScalarTy = Ctx.Types.inferScalarType(V: getOperand(N: 0)); |
| 1180 | if (VF.isScalar()) |
| 1181 | return Ctx.TTI.getCmpSelInstrCost(Opcode: Instruction::ICmp, ValTy: ScalarTy, |
| 1182 | CondTy: CmpInst::makeCmpResultType(opnd_type: ScalarTy), |
| 1183 | VecPred: CmpInst::ICMP_EQ, CostKind: Ctx.CostKind); |
| 1184 | // Calculate the cost of determining the lane index: NOT + cttz_elts + SUB. |
| 1185 | auto *PredTy = toVectorTy(Scalar: ScalarTy, EC: VF); |
| 1186 | IntrinsicCostAttributes Attrs(Intrinsic::experimental_cttz_elts, |
| 1187 | Type::getInt64Ty(C&: Ctx.LLVMCtx), |
| 1188 | {PredTy, Type::getInt1Ty(C&: Ctx.LLVMCtx)}); |
| 1189 | InstructionCost Cost = Ctx.TTI.getIntrinsicInstrCost(ICA: Attrs, CostKind: Ctx.CostKind); |
| 1190 | // Add cost of NOT operation on the predicate. |
| 1191 | Cost += Ctx.TTI.getArithmeticInstrCost( |
| 1192 | Opcode: Instruction::Xor, Ty: PredTy, CostKind: Ctx.CostKind, |
| 1193 | Opd1Info: {.Kind: TargetTransformInfo::OK_AnyValue, .Properties: TargetTransformInfo::OP_None}, |
| 1194 | Opd2Info: {.Kind: TargetTransformInfo::OK_UniformConstantValue, |
| 1195 | .Properties: TargetTransformInfo::OP_None}); |
| 1196 | // Add cost of SUB operation on the index. |
| 1197 | Cost += Ctx.TTI.getArithmeticInstrCost( |
| 1198 | Opcode: Instruction::Sub, Ty: Type::getInt64Ty(C&: Ctx.LLVMCtx), CostKind: Ctx.CostKind); |
| 1199 | return Cost; |
| 1200 | } |
| 1201 | case VPInstruction::ExtractLastActive: { |
| 1202 | Type *ScalarTy = Ctx.Types.inferScalarType(V: this); |
| 1203 | Type *VecTy = toVectorTy(Scalar: ScalarTy, EC: VF); |
| 1204 | Type *MaskTy = toVectorTy(Scalar: Type::getInt1Ty(C&: Ctx.LLVMCtx), EC: VF); |
| 1205 | IntrinsicCostAttributes ICA( |
| 1206 | Intrinsic::experimental_vector_extract_last_active, ScalarTy, |
| 1207 | {VecTy, MaskTy, ScalarTy}); |
| 1208 | return Ctx.TTI.getIntrinsicInstrCost(ICA, CostKind: Ctx.CostKind); |
| 1209 | } |
| 1210 | case VPInstruction::FirstOrderRecurrenceSplice: { |
| 1211 | assert(VF.isVector() && "Scalar FirstOrderRecurrenceSplice?" ); |
| 1212 | SmallVector<int> Mask(VF.getKnownMinValue()); |
| 1213 | std::iota(first: Mask.begin(), last: Mask.end(), value: VF.getKnownMinValue() - 1); |
| 1214 | Type *VectorTy = toVectorTy(Scalar: Ctx.Types.inferScalarType(V: this), EC: VF); |
| 1215 | |
| 1216 | return Ctx.TTI.getShuffleCost(Kind: TargetTransformInfo::SK_Splice, |
| 1217 | DstTy: cast<VectorType>(Val: VectorTy), |
| 1218 | SrcTy: cast<VectorType>(Val: VectorTy), Mask, |
| 1219 | CostKind: Ctx.CostKind, Index: VF.getKnownMinValue() - 1); |
| 1220 | } |
| 1221 | case VPInstruction::ActiveLaneMask: { |
| 1222 | Type *ArgTy = Ctx.Types.inferScalarType(V: getOperand(N: 0)); |
| 1223 | unsigned Multiplier = cast<VPConstantInt>(Val: getOperand(N: 2))->getZExtValue(); |
| 1224 | Type *RetTy = toVectorTy(Scalar: Type::getInt1Ty(C&: Ctx.LLVMCtx), EC: VF * Multiplier); |
| 1225 | IntrinsicCostAttributes Attrs(Intrinsic::get_active_lane_mask, RetTy, |
| 1226 | {ArgTy, ArgTy}); |
| 1227 | return Ctx.TTI.getIntrinsicInstrCost(ICA: Attrs, CostKind: Ctx.CostKind); |
| 1228 | } |
| 1229 | case VPInstruction::ExplicitVectorLength: { |
| 1230 | Type *Arg0Ty = Ctx.Types.inferScalarType(V: getOperand(N: 0)); |
| 1231 | Type *I32Ty = Type::getInt32Ty(C&: Ctx.LLVMCtx); |
| 1232 | Type *I1Ty = Type::getInt1Ty(C&: Ctx.LLVMCtx); |
| 1233 | IntrinsicCostAttributes Attrs(Intrinsic::experimental_get_vector_length, |
| 1234 | I32Ty, {Arg0Ty, I32Ty, I1Ty}); |
| 1235 | return Ctx.TTI.getIntrinsicInstrCost(ICA: Attrs, CostKind: Ctx.CostKind); |
| 1236 | } |
| 1237 | case VPInstruction::Reverse: { |
| 1238 | assert(VF.isVector() && "Reverse operation must be vector type" ); |
| 1239 | auto *VectorTy = cast<VectorType>( |
| 1240 | Val: toVectorTy(Scalar: Ctx.Types.inferScalarType(V: getOperand(N: 0)), EC: VF)); |
| 1241 | return Ctx.TTI.getShuffleCost(Kind: TargetTransformInfo::SK_Reverse, DstTy: VectorTy, |
| 1242 | SrcTy: VectorTy, /*Mask=*/{}, CostKind: Ctx.CostKind, |
| 1243 | /*Index=*/0); |
| 1244 | } |
| 1245 | case VPInstruction::ExtractLastLane: { |
| 1246 | // Add on the cost of extracting the element. |
| 1247 | auto *VecTy = toVectorTy(Scalar: Ctx.Types.inferScalarType(V: getOperand(N: 0)), EC: VF); |
| 1248 | return Ctx.TTI.getIndexedVectorInstrCostFromEnd(Opcode: Instruction::ExtractElement, |
| 1249 | Val: VecTy, CostKind: Ctx.CostKind, Index: 0); |
| 1250 | } |
| 1251 | case VPInstruction::ExtractPenultimateElement: |
| 1252 | if (VF == ElementCount::getScalable(MinVal: 1)) |
| 1253 | return InstructionCost::getInvalid(); |
| 1254 | [[fallthrough]]; |
| 1255 | default: |
| 1256 | // TODO: Compute cost other VPInstructions once the legacy cost model has |
| 1257 | // been retired. |
| 1258 | assert(!getUnderlyingValue() && |
| 1259 | "unexpected VPInstruction witht underlying value" ); |
| 1260 | return 0; |
| 1261 | } |
| 1262 | } |
| 1263 | |
| 1264 | bool VPInstruction::isVectorToScalar() const { |
| 1265 | return getOpcode() == VPInstruction::ExtractLastLane || |
| 1266 | getOpcode() == VPInstruction::ExtractPenultimateElement || |
| 1267 | getOpcode() == Instruction::ExtractElement || |
| 1268 | getOpcode() == VPInstruction::ExtractLane || |
| 1269 | getOpcode() == VPInstruction::FirstActiveLane || |
| 1270 | getOpcode() == VPInstruction::LastActiveLane || |
| 1271 | getOpcode() == VPInstruction::ComputeAnyOfResult || |
| 1272 | getOpcode() == VPInstruction::ExtractLastActive || |
| 1273 | getOpcode() == VPInstruction::ComputeReductionResult || |
| 1274 | getOpcode() == VPInstruction::AnyOf; |
| 1275 | } |
| 1276 | |
| 1277 | bool VPInstruction::isSingleScalar() const { |
| 1278 | switch (getOpcode()) { |
| 1279 | case Instruction::PHI: |
| 1280 | case VPInstruction::ExplicitVectorLength: |
| 1281 | case VPInstruction::ResumeForEpilogue: |
| 1282 | case VPInstruction::VScale: |
| 1283 | return true; |
| 1284 | default: |
| 1285 | return isScalarCast(); |
| 1286 | } |
| 1287 | } |
| 1288 | |
| 1289 | void VPInstruction::execute(VPTransformState &State) { |
| 1290 | assert(!isMasked() && "cannot execute masked VPInstruction" ); |
| 1291 | assert(!State.Lane && "VPInstruction executing an Lane" ); |
| 1292 | IRBuilderBase::FastMathFlagGuard FMFGuard(State.Builder); |
| 1293 | assert(flagsValidForOpcode(getOpcode()) && |
| 1294 | "Set flags not supported for the provided opcode" ); |
| 1295 | assert(hasRequiredFlagsForOpcode(getOpcode()) && |
| 1296 | "Opcode requires specific flags to be set" ); |
| 1297 | if (hasFastMathFlags()) |
| 1298 | State.Builder.setFastMathFlags(getFastMathFlags()); |
| 1299 | Value *GeneratedValue = generate(State); |
| 1300 | if (!hasResult()) |
| 1301 | return; |
| 1302 | assert(GeneratedValue && "generate must produce a value" ); |
| 1303 | bool GeneratesPerFirstLaneOnly = canGenerateScalarForFirstLane() && |
| 1304 | (vputils::onlyFirstLaneUsed(Def: this) || |
| 1305 | isVectorToScalar() || isSingleScalar()); |
| 1306 | assert((((GeneratedValue->getType()->isVectorTy() || |
| 1307 | GeneratedValue->getType()->isStructTy()) == |
| 1308 | !GeneratesPerFirstLaneOnly) || |
| 1309 | State.VF.isScalar()) && |
| 1310 | "scalar value but not only first lane defined" ); |
| 1311 | State.set(Def: this, V: GeneratedValue, |
| 1312 | /*IsScalar*/ GeneratesPerFirstLaneOnly); |
| 1313 | } |
| 1314 | |
// Returns true if this VPInstruction's opcode may read from or write to
// memory. Binary ops, casts, and the explicitly listed opcodes are pure
// computations on their operands; any opcode not listed is conservatively
// assumed to touch memory.
bool VPInstruction::opcodeMayReadOrWriteFromMemory() const {
  // Binary ops and casts never access memory.
  if (Instruction::isBinaryOp(Opcode: getOpcode()) || Instruction::isCast(Opcode: getOpcode()))
    return false;
  switch (getOpcode()) {
  // The opcodes below only move, compare, select, or recombine values and
  // therefore do not read or write memory.
  case Instruction::GetElementPtr:
  case Instruction::ExtractElement:
  case Instruction::Freeze:
  case Instruction::FCmp:
  case Instruction::ICmp:
  case Instruction::Select:
  case Instruction::PHI:
  case VPInstruction::AnyOf:
  case VPInstruction::BranchOnCond:
  case VPInstruction::BranchOnTwoConds:
  case VPInstruction::BranchOnCount:
  case VPInstruction::Broadcast:
  case VPInstruction::BuildStructVector:
  case VPInstruction::BuildVector:
  case VPInstruction::CalculateTripCountMinusVF:
  case VPInstruction::CanonicalIVIncrementForPart:
  case VPInstruction::ExtractLane:
  case VPInstruction::ExtractLastLane:
  case VPInstruction::ExtractLastPart:
  case VPInstruction::ExtractPenultimateElement:
  case VPInstruction::ActiveLaneMask:
  case VPInstruction::ExitingIVValue:
  case VPInstruction::ExplicitVectorLength:
  case VPInstruction::FirstActiveLane:
  case VPInstruction::LastActiveLane:
  case VPInstruction::ExtractLastActive:
  case VPInstruction::FirstOrderRecurrenceSplice:
  case VPInstruction::LogicalAnd:
  case VPInstruction::LogicalOr:
  case VPInstruction::Not:
  case VPInstruction::PtrAdd:
  case VPInstruction::WideIVStep:
  case VPInstruction::WidePtrAdd:
  case VPInstruction::StepVector:
  case VPInstruction::ReductionStartVector:
  case VPInstruction::Reverse:
  case VPInstruction::VScale:
  case VPInstruction::Unpack:
    return false;
  default:
    // Conservative default: unknown opcodes may access memory.
    return true;
  }
}
| 1362 | |
| 1363 | bool VPInstruction::usesFirstLaneOnly(const VPValue *Op) const { |
| 1364 | assert(is_contained(operands(), Op) && "Op must be an operand of the recipe" ); |
| 1365 | if (Instruction::isBinaryOp(Opcode: getOpcode()) || Instruction::isCast(Opcode: getOpcode())) |
| 1366 | return vputils::onlyFirstLaneUsed(Def: this); |
| 1367 | |
| 1368 | switch (getOpcode()) { |
| 1369 | default: |
| 1370 | return false; |
| 1371 | case Instruction::ExtractElement: |
| 1372 | return Op == getOperand(N: 1); |
| 1373 | case Instruction::PHI: |
| 1374 | return true; |
| 1375 | case Instruction::FCmp: |
| 1376 | case Instruction::ICmp: |
| 1377 | case Instruction::Select: |
| 1378 | case Instruction::Or: |
| 1379 | case Instruction::Freeze: |
| 1380 | case VPInstruction::Not: |
| 1381 | // TODO: Cover additional opcodes. |
| 1382 | return vputils::onlyFirstLaneUsed(Def: this); |
| 1383 | case VPInstruction::ActiveLaneMask: |
| 1384 | case VPInstruction::ExplicitVectorLength: |
| 1385 | case VPInstruction::CalculateTripCountMinusVF: |
| 1386 | case VPInstruction::CanonicalIVIncrementForPart: |
| 1387 | case VPInstruction::BranchOnCount: |
| 1388 | case VPInstruction::BranchOnCond: |
| 1389 | case VPInstruction::Broadcast: |
| 1390 | case VPInstruction::ReductionStartVector: |
| 1391 | return true; |
| 1392 | case VPInstruction::BuildStructVector: |
| 1393 | case VPInstruction::BuildVector: |
| 1394 | // Before replicating by VF, Build(Struct)Vector uses all lanes of the |
| 1395 | // operand, after replicating its operands only the first lane is used. |
| 1396 | // Before replicating, it will have only a single operand. |
| 1397 | return getNumOperands() > 1; |
| 1398 | case VPInstruction::PtrAdd: |
| 1399 | return Op == getOperand(N: 0) || vputils::onlyFirstLaneUsed(Def: this); |
| 1400 | case VPInstruction::WidePtrAdd: |
| 1401 | // WidePtrAdd supports scalar and vector base addresses. |
| 1402 | return false; |
| 1403 | case VPInstruction::ComputeAnyOfResult: |
| 1404 | return Op == getOperand(N: 0) || Op == getOperand(N: 1); |
| 1405 | case VPInstruction::ExitingIVValue: |
| 1406 | case VPInstruction::ExtractLane: |
| 1407 | return Op == getOperand(N: 0); |
| 1408 | }; |
| 1409 | llvm_unreachable("switch should return" ); |
| 1410 | } |
| 1411 | |
| 1412 | bool VPInstruction::usesFirstPartOnly(const VPValue *Op) const { |
| 1413 | assert(is_contained(operands(), Op) && "Op must be an operand of the recipe" ); |
| 1414 | if (Instruction::isBinaryOp(Opcode: getOpcode())) |
| 1415 | return vputils::onlyFirstPartUsed(Def: this); |
| 1416 | |
| 1417 | switch (getOpcode()) { |
| 1418 | default: |
| 1419 | return false; |
| 1420 | case Instruction::FCmp: |
| 1421 | case Instruction::ICmp: |
| 1422 | case Instruction::Select: |
| 1423 | return vputils::onlyFirstPartUsed(Def: this); |
| 1424 | case VPInstruction::BranchOnCount: |
| 1425 | case VPInstruction::BranchOnCond: |
| 1426 | case VPInstruction::BranchOnTwoConds: |
| 1427 | case VPInstruction::CanonicalIVIncrementForPart: |
| 1428 | return true; |
| 1429 | }; |
| 1430 | llvm_unreachable("switch should return" ); |
| 1431 | } |
| 1432 | |
| 1433 | #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) |
| 1434 | void VPInstruction::dump() const { |
| 1435 | VPSlotTracker SlotTracker(getParent()->getPlan()); |
| 1436 | printRecipe(dbgs(), "" , SlotTracker); |
| 1437 | } |
| 1438 | |
// Print this VPInstruction to \p O at indentation \p Indent, using
// \p SlotTracker to name VPValue operands. Emits "EMIT" (or "EMIT-SCALAR"
// for single-scalar recipes), the result (if any), a textual opcode name,
// flags, and operands.
void VPInstruction::printRecipe(raw_ostream &O, const Twine &Indent,
                                VPSlotTracker &SlotTracker) const {
  O << Indent << "EMIT" << (isSingleScalar() ? "-SCALAR" : "") << " ";

  // Opcodes without a result (e.g. branches) omit the "<res> = " prefix.
  if (hasResult()) {
    printAsOperand(O, SlotTracker);
    O << " = ";
  }

  // Map VPInstruction-specific opcodes to their textual names; LLVM IR
  // opcodes fall through to Instruction::getOpcodeName below.
  switch (getOpcode()) {
  case VPInstruction::Not:
    O << "not";
    break;
  case VPInstruction::SLPLoad:
    O << "combined load";
    break;
  case VPInstruction::SLPStore:
    O << "combined store";
    break;
  case VPInstruction::ActiveLaneMask:
    O << "active lane mask";
    break;
  case VPInstruction::ExplicitVectorLength:
    O << "EXPLICIT-VECTOR-LENGTH";
    break;
  case VPInstruction::FirstOrderRecurrenceSplice:
    O << "first-order splice";
    break;
  case VPInstruction::BranchOnCond:
    O << "branch-on-cond";
    break;
  case VPInstruction::BranchOnTwoConds:
    O << "branch-on-two-conds";
    break;
  case VPInstruction::CalculateTripCountMinusVF:
    O << "TC > VF ? TC - VF : 0";
    break;
  case VPInstruction::CanonicalIVIncrementForPart:
    O << "VF * Part +";
    break;
  case VPInstruction::BranchOnCount:
    O << "branch-on-count";
    break;
  case VPInstruction::Broadcast:
    O << "broadcast";
    break;
  case VPInstruction::BuildStructVector:
    O << "buildstructvector";
    break;
  case VPInstruction::BuildVector:
    O << "buildvector";
    break;
  case VPInstruction::ExitingIVValue:
    O << "exiting-iv-value";
    break;
  case VPInstruction::ExtractLane:
    O << "extract-lane";
    break;
  case VPInstruction::ExtractLastLane:
    O << "extract-last-lane";
    break;
  case VPInstruction::ExtractLastPart:
    O << "extract-last-part";
    break;
  case VPInstruction::ExtractPenultimateElement:
    O << "extract-penultimate-element";
    break;
  case VPInstruction::ComputeAnyOfResult:
    O << "compute-anyof-result";
    break;
  case VPInstruction::ComputeReductionResult:
    O << "compute-reduction-result";
    break;
  case VPInstruction::LogicalAnd:
    O << "logical-and";
    break;
  case VPInstruction::LogicalOr:
    O << "logical-or";
    break;
  case VPInstruction::PtrAdd:
    O << "ptradd";
    break;
  case VPInstruction::WidePtrAdd:
    O << "wide-ptradd";
    break;
  case VPInstruction::AnyOf:
    O << "any-of";
    break;
  case VPInstruction::FirstActiveLane:
    O << "first-active-lane";
    break;
  case VPInstruction::LastActiveLane:
    O << "last-active-lane";
    break;
  case VPInstruction::ReductionStartVector:
    O << "reduction-start-vector";
    break;
  case VPInstruction::ResumeForEpilogue:
    O << "resume-for-epilogue";
    break;
  case VPInstruction::Reverse:
    O << "reverse";
    break;
  case VPInstruction::Unpack:
    O << "unpack";
    break;
  case VPInstruction::ExtractLastActive:
    O << "extract-last-active";
    break;
  default:
    O << Instruction::getOpcodeName(getOpcode());
  }

  printFlags(O);
  printOperands(O, SlotTracker);
}
| 1555 | #endif |
| 1556 | |
// Generate IR for this typed VPInstruction. Scalar casts produce a single
// scalar cast on lane 0; otherwise only StepVector and VScale are currently
// implemented.
void VPInstructionWithType::execute(VPTransformState &State) {
  State.setDebugLocFrom(getDebugLoc());
  if (isScalarCast()) {
    // Cast the lane-0 scalar of operand 0 to ResultTy and record the result
    // as the lane-0 value of this def.
    Value *Op = State.get(Def: getOperand(N: 0), Lane: VPLane(0));
    Value *Cast = State.Builder.CreateCast(Op: Instruction::CastOps(getOpcode()),
                                           V: Op, DestTy: ResultTy);
    State.set(Def: this, V: Cast, Lane: VPLane(0));
    return;
  }
  switch (getOpcode()) {
  case VPInstruction::StepVector: {
    // Materialize <0, 1, 2, ...> of element type ResultTy, widened to the
    // current VF.
    Value *StepVector =
        State.Builder.CreateStepVector(DstType: VectorType::get(ElementType: ResultTy, EC: State.VF));
    State.set(Def: this, V: StepVector);
    break;
  }
  case VPInstruction::VScale: {
    // vscale is a single scalar value, hence IsScalar is set.
    Value *VScale = State.Builder.CreateVScale(Ty: ResultTy);
    State.set(Def: this, V: VScale, IsScalar: true);
    break;
  }

  default:
    llvm_unreachable("opcode not implemented yet");
  }
}
| 1583 | |
| 1584 | #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) |
// Print this typed VPInstruction, including the result type for opcodes
// where it is not implied by the operands.
void VPInstructionWithType::printRecipe(raw_ostream &O, const Twine &Indent,
                                        VPSlotTracker &SlotTracker) const {
  O << Indent << "EMIT" << (isSingleScalar() ? "-SCALAR" : "") << " ";
  printAsOperand(O, SlotTracker);
  O << " = ";

  switch (getOpcode()) {
  case VPInstruction::WideIVStep:
    O << "wide-iv-step ";
    printOperands(O, SlotTracker);
    break;
  case VPInstruction::StepVector:
    O << "step-vector " << *ResultTy;
    break;
  case VPInstruction::VScale:
    O << "vscale " << *ResultTy;
    break;
  default:
    // Remaining opcodes are scalar casts: "<op> <operands> to <type>".
    assert(Instruction::isCast(getOpcode()) && "unhandled opcode");
    O << Instruction::getOpcodeName(getOpcode()) << " ";
    printOperands(O, SlotTracker);
    O << " to " << *ResultTy;
  }
}
| 1609 | #endif |
| 1610 | |
// Generate a scalar IR phi for this VPPhi, taking lane 0 of each incoming
// value.
void VPPhi::execute(VPTransformState &State) {
  State.setDebugLocFrom(getDebugLoc());
  PHINode *NewPhi = State.Builder.CreatePHI(
      Ty: State.TypeAnalysis.inferScalarType(V: this), NumReservedValues: 2, Name: getName());
  unsigned NumIncoming = getNumIncoming();
  if (getParent() != getParent()->getPlan()->getScalarPreheader()) {
    // TODO: Fixup all incoming values of header phis once recipes defining them
    // are introduced.
    NumIncoming = 1;
  }
  for (unsigned Idx = 0; Idx != NumIncoming; ++Idx) {
    // Map each VPlan incoming (value, block) pair to its IR counterpart.
    Value *IncV = State.get(Def: getIncomingValue(Idx), Lane: VPLane(0));
    BasicBlock *PredBB = State.CFG.VPBB2IRBB.at(Val: getIncomingBlock(Idx));
    NewPhi->addIncoming(V: IncV, BB: PredBB);
  }
  State.set(Def: this, V: NewPhi, Lane: VPLane(0));
}
| 1628 | |
| 1629 | #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) |
// Print this VPPhi as "EMIT[-SCALAR] <res> = phi[flags] [ v, bb ], ...".
void VPPhi::printRecipe(raw_ostream &O, const Twine &Indent,
                        VPSlotTracker &SlotTracker) const {
  O << Indent << "EMIT" << (isSingleScalar() ? "-SCALAR" : "") << " ";
  printAsOperand(O, SlotTracker);
  O << " = phi";
  printFlags(O);
  // Print operands paired with their incoming blocks.
  printPhiOperands(O, SlotTracker);
}
| 1638 | #endif |
| 1639 | |
| 1640 | VPIRInstruction *VPIRInstruction ::create(Instruction &I) { |
| 1641 | if (auto *Phi = dyn_cast<PHINode>(Val: &I)) |
| 1642 | return new VPIRPhi(*Phi); |
| 1643 | return new VPIRInstruction(I); |
| 1644 | } |
| 1645 | |
// "Executing" a wrapped IR instruction emits no new code; it only moves the
// insert point past the instruction.
void VPIRInstruction::execute(VPTransformState &State) {
  assert(!isa<VPIRPhi>(this) && getNumOperands() == 0 &&
         "PHINodes must be handled by VPIRPhi");
  // Advance the insert point after the wrapped IR instruction. This allows
  // interleaving VPIRInstructions and other recipes.
  State.Builder.SetInsertPoint(TheBB: I.getParent(), IP: std::next(x: I.getIterator()));
}
| 1653 | |
// Cost of a wrapped, pre-existing IR instruction: always free.
InstructionCost VPIRInstruction::computeCost(ElementCount VF,
                                             VPCostContext &Ctx) const {
  // The recipe wraps an existing IR instruction on the border of VPlan's scope,
  // hence it does not contribute to the cost-modeling for the VPlan.
  return 0;
}
| 1660 | |
| 1661 | #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) |
// Print the wrapped IR instruction verbatim, prefixed with "IR ".
void VPIRInstruction::printRecipe(raw_ostream &O, const Twine &Indent,
                                  VPSlotTracker &SlotTracker) const {
  O << Indent << "IR " << I;
}
| 1666 | #endif |
| 1667 | |
// Fix up the wrapped IR phi: for each VPlan operand, set or add the incoming
// value for the corresponding predecessor block. Single-scalar exit values
// use lane 0; otherwise the last lane for the current VF is extracted.
void VPIRPhi::execute(VPTransformState &State) {
  PHINode *Phi = &getIRPhi();
  for (const auto &[Idx, Op] : enumerate(First: operands())) {
    VPValue *ExitValue = Op;
    auto Lane = vputils::isSingleScalar(VPV: ExitValue)
                    ? VPLane::getFirstLane()
                    : VPLane::getLastLaneForVF(VF: State.VF);
    // Operand Idx corresponds to predecessor Idx of this block.
    VPBlockBase *Pred = getParent()->getPredecessors()[Idx];
    auto *PredVPBB = Pred->getExitingBasicBlock();
    BasicBlock *PredBB = State.CFG.VPBB2IRBB[PredVPBB];
    // Set insertion point in PredBB in case an extract needs to be generated.
    // TODO: Model extracts explicitly.
    State.Builder.SetInsertPoint(PredBB->getTerminator());
    Value *V = State.get(Def: ExitValue, Lane: VPLane(Lane));
    // If there is no existing block for PredBB in the phi, add a new incoming
    // value. Otherwise update the existing incoming value for PredBB.
    if (Phi->getBasicBlockIndex(BB: PredBB) == -1)
      Phi->addIncoming(V, BB: PredBB);
    else
      Phi->setIncomingValueForBlock(BB: PredBB, V);
  }

  // Advance the insert point after the wrapped IR instruction. This allows
  // interleaving VPIRInstructions and other recipes.
  State.Builder.SetInsertPoint(TheBB: Phi->getParent(), IP: std::next(x: Phi->getIterator()));
}
| 1694 | |
| 1695 | void VPPhiAccessors::removeIncomingValueFor(VPBlockBase *IncomingBlock) const { |
| 1696 | VPRecipeBase *R = const_cast<VPRecipeBase *>(getAsRecipe()); |
| 1697 | assert(R->getNumOperands() == R->getParent()->getNumPredecessors() && |
| 1698 | "Number of phi operands must match number of predecessors" ); |
| 1699 | unsigned Position = R->getParent()->getIndexForPredecessor(Pred: IncomingBlock); |
| 1700 | R->removeOperand(Idx: Position); |
| 1701 | } |
| 1702 | |
| 1703 | VPValue * |
| 1704 | VPPhiAccessors::getIncomingValueForBlock(const VPBasicBlock *VPBB) const { |
| 1705 | VPRecipeBase *R = const_cast<VPRecipeBase *>(getAsRecipe()); |
| 1706 | return getIncomingValue(Idx: R->getParent()->getIndexForPredecessor(Pred: VPBB)); |
| 1707 | } |
| 1708 | |
| 1709 | void VPPhiAccessors::setIncomingValueForBlock(const VPBasicBlock *VPBB, |
| 1710 | VPValue *V) const { |
| 1711 | VPRecipeBase *R = const_cast<VPRecipeBase *>(getAsRecipe()); |
| 1712 | R->setOperand(I: R->getParent()->getIndexForPredecessor(Pred: VPBB), New: V); |
| 1713 | } |
| 1714 | |
| 1715 | #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) |
// Print the phi's operands as a comma-separated list of "[ value, block ]"
// pairs, pairing operand i with incoming block i.
void VPPhiAccessors::printPhiOperands(raw_ostream &O,
                                      VPSlotTracker &SlotTracker) const {
  interleaveComma(enumerate(getAsRecipe()->operands()), O,
                  [this, &O, &SlotTracker](auto Op) {
                    O << "[ ";
                    Op.value()->printAsOperand(O, SlotTracker);
                    O << ", ";
                    getIncomingBlock(Op.index())->printAsOperand(O);
                    O << " ]";
                  });
}
| 1727 | #endif |
| 1728 | |
| 1729 | #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) |
// Print the wrapped IR phi, then any VPlan-managed incoming operands as an
// "(extra operand[s]: v from bb, ...)" suffix.
void VPIRPhi::printRecipe(raw_ostream &O, const Twine &Indent,
                          VPSlotTracker &SlotTracker) const {
  VPIRInstruction::printRecipe(O, Indent, SlotTracker);

  if (getNumOperands() != 0) {
    O << " (extra operand" << (getNumOperands() > 1 ? "s" : "") << ": ";
    interleaveComma(incoming_values_and_blocks(), O,
                    [&O, &SlotTracker](auto Op) {
                      std::get<0>(Op)->printAsOperand(O, SlotTracker);
                      O << " from ";
                      std::get<1>(Op)->printAsOperand(O);
                    });
    O << ")";
  }
}
| 1745 | #endif |
| 1746 | |
| 1747 | void VPIRMetadata::applyMetadata(Instruction &I) const { |
| 1748 | for (const auto &[Kind, Node] : Metadata) |
| 1749 | I.setMetadata(KindID: Kind, Node); |
| 1750 | } |
| 1751 | |
| 1752 | void VPIRMetadata::intersect(const VPIRMetadata &Other) { |
| 1753 | SmallVector<std::pair<unsigned, MDNode *>> MetadataIntersection; |
| 1754 | for (const auto &[KindA, MDA] : Metadata) { |
| 1755 | for (const auto &[KindB, MDB] : Other.Metadata) { |
| 1756 | if (KindA == KindB && MDA == MDB) { |
| 1757 | MetadataIntersection.emplace_back(Args: KindA, Args: MDA); |
| 1758 | break; |
| 1759 | } |
| 1760 | } |
| 1761 | } |
| 1762 | Metadata = std::move(MetadataIntersection); |
| 1763 | } |
| 1764 | |
| 1765 | #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) |
// Print the attached metadata as " (!name node, ...)". Prints nothing if
// there is no metadata or the slot tracker has no module (needed to resolve
// kind names and node operands).
void VPIRMetadata::print(raw_ostream &O, VPSlotTracker &SlotTracker) const {
  const Module *M = SlotTracker.getModule();
  if (Metadata.empty() || !M)
    return;

  ArrayRef<StringRef> MDNames = SlotTracker.getMDNames();
  O << " (";
  interleaveComma(Metadata, O, [&](const auto &KindNodePair) {
    auto [Kind, Node] = KindNodePair;
    assert(Kind < MDNames.size() && !MDNames[Kind].empty() &&
           "Unexpected unnamed metadata kind");
    O << "!" << MDNames[Kind] << " ";
    Node->printAsOperand(O, M);
  });
  O << ")";
}
| 1782 | #endif |
| 1783 | |
// Generate a call to the vectorized function variant. Arguments whose
// parameter type in the variant is scalar are passed as lane-0 scalars;
// all others as vectors (or first-lane scalars when only that is used).
void VPWidenCallRecipe::execute(VPTransformState &State) {
  assert(State.VF.isVector() && "not widening");
  assert(Variant != nullptr && "Can't create vector function.");

  FunctionType *VFTy = Variant->getFunctionType();
  // Add return type if intrinsic is overloaded on it.
  SmallVector<Value *, 4> Args;
  for (const auto &I : enumerate(First: args())) {
    Value *Arg;
    // Some vectorized function variants may also take a scalar argument,
    // e.g. linear parameters for pointers. This needs to be the scalar value
    // from the start of the respective part when interleaving.
    if (!VFTy->getParamType(i: I.index())->isVectorTy())
      Arg = State.get(Def: I.value(), Lane: VPLane(0));
    else
      Arg = State.get(Def: I.value(), IsScalar: usesFirstLaneOnly(Op: I.value()));
    Args.push_back(Elt: Arg);
  }

  // Preserve operand bundles from the original scalar call, if any.
  auto *CI = cast_or_null<CallInst>(Val: getUnderlyingValue());
  SmallVector<OperandBundleDef, 1> OpBundles;
  if (CI)
    CI->getOperandBundlesAsDefs(Defs&: OpBundles);

  CallInst *V = State.Builder.CreateCall(Callee: Variant, Args, OpBundles);
  applyFlags(I&: *V);
  applyMetadata(I&: *V);
  V->setCallingConv(Variant->getCallingConv());

  // Void calls define no VPlan value.
  if (!V->getType()->isVoidTy())
    State.set(Def: this, V);
}
| 1816 | |
// Cost of calling the vectorized function variant, queried from TTI using
// the variant's own signature.
InstructionCost VPWidenCallRecipe::computeCost(ElementCount VF,
                                               VPCostContext &Ctx) const {
  return Ctx.TTI.getCallInstrCost(F: nullptr, RetTy: Variant->getReturnType(),
                                  Tys: Variant->getFunctionType()->params(),
                                  CostKind: Ctx.CostKind);
}
| 1823 | |
| 1824 | #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) |
// Print as "WIDEN-CALL [<res> = ]call[flags] @scalar_fn(args) (using library
// function[: variant])".
void VPWidenCallRecipe::printRecipe(raw_ostream &O, const Twine &Indent,
                                    VPSlotTracker &SlotTracker) const {
  O << Indent << "WIDEN-CALL ";

  Function *CalledFn = getCalledScalarFunction();
  if (CalledFn->getReturnType()->isVoidTy())
    O << "void ";
  else {
    printAsOperand(O, SlotTracker);
    O << " = ";
  }

  O << "call";
  printFlags(O);
  O << " @" << CalledFn->getName() << "(";
  interleaveComma(args(), O, [&O, &SlotTracker](VPValue *Op) {
    Op->printAsOperand(O, SlotTracker);
  });
  O << ")";

  O << " (using library function";
  if (Variant->hasName())
    O << ": " << Variant->getName();
  O << ")";
}
| 1850 | #endif |
| 1851 | |
// Generate a call to the vector version of the intrinsic: collect overload
// types (return and/or argument positions as TTI dictates), widen or keep
// scalar each operand, and emit the call with the original call's operand
// bundles, flags, and metadata.
void VPWidenIntrinsicRecipe::execute(VPTransformState &State) {
  assert(State.VF.isVector() && "not widening");

  SmallVector<Type *, 2> TysForDecl;
  // Add return type if intrinsic is overloaded on it.
  if (isVectorIntrinsicWithOverloadTypeAtArg(ID: VectorIntrinsicID, OpdIdx: -1,
                                             TTI: State.TTI)) {
    Type *RetTy = toVectorizedTy(Ty: getResultType(), EC: State.VF);
    // For struct returns, only the overloaded fields contribute to the
    // declaration's type list.
    ArrayRef<Type *> ContainedTys = getContainedTypes(Ty: RetTy);
    for (auto [Idx, Ty] : enumerate(First&: ContainedTys)) {
      if (isVectorIntrinsicWithStructReturnOverloadAtField(ID: VectorIntrinsicID,
                                                           RetIdx: Idx, TTI: State.TTI))
        TysForDecl.push_back(Elt: Ty);
    }
  }
  SmallVector<Value *, 4> Args;
  for (const auto &I : enumerate(First: operands())) {
    // Some intrinsics have a scalar argument - don't replace it with a
    // vector.
    Value *Arg;
    if (isVectorIntrinsicWithScalarOpAtArg(ID: VectorIntrinsicID, ScalarOpdIdx: I.index(),
                                           TTI: State.TTI))
      Arg = State.get(Def: I.value(), Lane: VPLane(0));
    else
      Arg = State.get(Def: I.value(), IsScalar: usesFirstLaneOnly(Op: I.value()));
    if (isVectorIntrinsicWithOverloadTypeAtArg(ID: VectorIntrinsicID, OpdIdx: I.index(),
                                               TTI: State.TTI))
      TysForDecl.push_back(Elt: Arg->getType());
    Args.push_back(Elt: Arg);
  }

  // Use vector version of the intrinsic.
  Module *M = State.Builder.GetInsertBlock()->getModule();
  Function *VectorF =
      Intrinsic::getOrInsertDeclaration(M, id: VectorIntrinsicID, Tys: TysForDecl);
  assert(VectorF &&
         "Can't retrieve vector intrinsic or vector-predication intrinsics.");

  // Preserve operand bundles from the original scalar call, if any.
  auto *CI = cast_or_null<CallInst>(Val: getUnderlyingValue());
  SmallVector<OperandBundleDef, 1> OpBundles;
  if (CI)
    CI->getOperandBundlesAsDefs(Defs&: OpBundles);

  CallInst *V = State.Builder.CreateCall(Callee: VectorF, Args, OpBundles);

  applyFlags(I&: *V);
  applyMetadata(I&: *V);

  // Void intrinsics define no VPlan value.
  if (!V->getType()->isVoidTy())
    State.set(Def: this, V);
}
| 1903 | |
| 1904 | /// Compute the cost for the intrinsic \p ID with \p Operands, produced by \p R. |
| 1905 | static InstructionCost getCostForIntrinsics(Intrinsic::ID ID, |
| 1906 | ArrayRef<const VPValue *> Operands, |
| 1907 | const VPRecipeWithIRFlags &R, |
| 1908 | ElementCount VF, |
| 1909 | VPCostContext &Ctx) { |
| 1910 | // Some backends analyze intrinsic arguments to determine cost. Use the |
| 1911 | // underlying value for the operand if it has one. Otherwise try to use the |
| 1912 | // operand of the underlying call instruction, if there is one. Otherwise |
| 1913 | // clear Arguments. |
| 1914 | // TODO: Rework TTI interface to be independent of concrete IR values. |
| 1915 | SmallVector<const Value *> Arguments; |
| 1916 | for (const auto &[Idx, Op] : enumerate(First&: Operands)) { |
| 1917 | auto *V = Op->getUnderlyingValue(); |
| 1918 | if (!V) { |
| 1919 | if (auto *UI = dyn_cast_or_null<CallBase>(Val: R.getUnderlyingValue())) { |
| 1920 | Arguments.push_back(Elt: UI->getArgOperand(i: Idx)); |
| 1921 | continue; |
| 1922 | } |
| 1923 | Arguments.clear(); |
| 1924 | break; |
| 1925 | } |
| 1926 | Arguments.push_back(Elt: V); |
| 1927 | } |
| 1928 | |
| 1929 | Type *ScalarRetTy = Ctx.Types.inferScalarType(V: &R); |
| 1930 | Type *RetTy = VF.isVector() ? toVectorizedTy(Ty: ScalarRetTy, EC: VF) : ScalarRetTy; |
| 1931 | SmallVector<Type *> ParamTys; |
| 1932 | for (const VPValue *Op : Operands) { |
| 1933 | ParamTys.push_back(Elt: VF.isVector() |
| 1934 | ? toVectorTy(Scalar: Ctx.Types.inferScalarType(V: Op), EC: VF) |
| 1935 | : Ctx.Types.inferScalarType(V: Op)); |
| 1936 | } |
| 1937 | |
| 1938 | // TODO: Rework TTI interface to avoid reliance on underlying IntrinsicInst. |
| 1939 | IntrinsicCostAttributes CostAttrs( |
| 1940 | ID, RetTy, Arguments, ParamTys, R.getFastMathFlags(), |
| 1941 | dyn_cast_or_null<IntrinsicInst>(Val: R.getUnderlyingValue()), |
| 1942 | InstructionCost::getInvalid(), &Ctx.TLI); |
| 1943 | return Ctx.TTI.getIntrinsicInstrCost(ICA: CostAttrs, CostKind: Ctx.CostKind); |
| 1944 | } |
| 1945 | |
| 1946 | InstructionCost VPWidenIntrinsicRecipe::computeCost(ElementCount VF, |
| 1947 | VPCostContext &Ctx) const { |
| 1948 | SmallVector<const VPValue *> ArgOps(operands()); |
| 1949 | return getCostForIntrinsics(ID: VectorIntrinsicID, Operands: ArgOps, R: *this, VF, Ctx); |
| 1950 | } |
| 1951 | |
// Return the base name of the intrinsic being widened.
StringRef VPWidenIntrinsicRecipe::getIntrinsicName() const {
  return Intrinsic::getBaseName(id: VectorIntrinsicID);
}
| 1955 | |
| 1956 | bool VPWidenIntrinsicRecipe::usesFirstLaneOnly(const VPValue *Op) const { |
| 1957 | assert(is_contained(operands(), Op) && "Op must be an operand of the recipe" ); |
| 1958 | return all_of(Range: enumerate(First: operands()), P: [this, &Op](const auto &X) { |
| 1959 | auto [Idx, V] = X; |
| 1960 | return V != Op || isVectorIntrinsicWithScalarOpAtArg(getVectorIntrinsicID(), |
| 1961 | Idx, nullptr); |
| 1962 | }); |
| 1963 | } |
| 1964 | |
| 1965 | #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) |
// Print as "WIDEN-INTRINSIC [<res> = ]call[flags]<name>(operands)".
void VPWidenIntrinsicRecipe::printRecipe(raw_ostream &O, const Twine &Indent,
                                         VPSlotTracker &SlotTracker) const {
  O << Indent << "WIDEN-INTRINSIC ";
  if (ResultTy->isVoidTy()) {
    O << "void ";
  } else {
    printAsOperand(O, SlotTracker);
    O << " = ";
  }

  O << "call";
  printFlags(O);
  O << getIntrinsicName() << "(";

  interleaveComma(operands(), O, [&O, &SlotTracker](VPValue *Op) {
    Op->printAsOperand(O, SlotTracker);
  });
  O << ")";
}
| 1985 | #endif |
| 1986 | |
// Generate a call to llvm.experimental.vector.histogram.add for this recipe.
// Operand 0 is the vector of bucket addresses, operand 1 the scalar
// increment; a subtract is implemented by negating the increment.
void VPHistogramRecipe::execute(VPTransformState &State) {
  IRBuilderBase &Builder = State.Builder;

  Value *Address = State.get(Def: getOperand(N: 0));
  Value *IncAmt = State.get(Def: getOperand(N: 1), /*IsScalar=*/true);
  VectorType *VTy = cast<VectorType>(Val: Address->getType());

  // The histogram intrinsic requires a mask even if the recipe doesn't;
  // if the mask operand was omitted then all lanes should be executed and
  // we just need to synthesize an all-true mask.
  Value *Mask = nullptr;
  if (VPValue *VPMask = getMask())
    Mask = State.get(Def: VPMask);
  else
    Mask =
        Builder.CreateVectorSplat(EC: VTy->getElementCount(), V: Builder.getInt1(V: 1));

  // If this is a subtract, we want to invert the increment amount. We may
  // add a separate intrinsic in future, but for now we'll try this.
  if (Opcode == Instruction::Sub)
    IncAmt = Builder.CreateNeg(V: IncAmt);
  else
    assert(Opcode == Instruction::Add && "only add or sub supported for now");

  State.Builder.CreateIntrinsic(ID: Intrinsic::experimental_vector_histogram_add,
                                Types: {VTy, IncAmt->getType()},
                                Args: {Address, IncAmt, Mask});
}
| 2015 | |
// Cost of the histogram operation: the intrinsic itself, plus a multiply for
// non-unit increments, plus the add/sub update.
InstructionCost VPHistogramRecipe::computeCost(ElementCount VF,
                                               VPCostContext &Ctx) const {
  // FIXME: Take the gather and scatter into account as well. For now we're
  //        generating the same cost as the fallback path, but we'll likely
  //        need to create a new TTI method for determining the cost, including
  //        whether we can use base + vec-of-smaller-indices or just
  //        vec-of-pointers.
  assert(VF.isVector() && "Invalid VF for histogram cost");
  Type *AddressTy = Ctx.Types.inferScalarType(V: getOperand(N: 0));
  VPValue *IncAmt = getOperand(N: 1);
  Type *IncTy = Ctx.Types.inferScalarType(V: IncAmt);
  VectorType *VTy = VectorType::get(ElementType: IncTy, EC: VF);

  // Assume that a non-constant update value (or a constant != 1) requires
  // a multiply, and add that into the cost.
  InstructionCost MulCost =
      Ctx.TTI.getArithmeticInstrCost(Opcode: Instruction::Mul, Ty: VTy, CostKind: Ctx.CostKind);
  if (auto *CI = dyn_cast<VPConstantInt>(Val: IncAmt))
    if (CI->isOne())
      MulCost = TTI::TCC_Free;

  // Find the cost of the histogram operation itself.
  Type *PtrTy = VectorType::get(ElementType: AddressTy, EC: VF);
  Type *MaskTy = VectorType::get(ElementType: Type::getInt1Ty(C&: Ctx.LLVMCtx), EC: VF);
  IntrinsicCostAttributes ICA(Intrinsic::experimental_vector_histogram_add,
                              Type::getVoidTy(C&: Ctx.LLVMCtx),
                              {PtrTy, IncTy, MaskTy});

  // Add the costs together with the add/sub operation.
  return Ctx.TTI.getIntrinsicInstrCost(ICA, CostKind: Ctx.CostKind) + MulCost +
         Ctx.TTI.getArithmeticInstrCost(Opcode, Ty: VTy, CostKind: Ctx.CostKind);
}
| 2048 | |
| 2049 | #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) |
// Print as "WIDEN-HISTOGRAM buckets: b, inc|dec: i[, mask: m]".
void VPHistogramRecipe::printRecipe(raw_ostream &O, const Twine &Indent,
                                    VPSlotTracker &SlotTracker) const {
  O << Indent << "WIDEN-HISTOGRAM buckets: ";
  getOperand(0)->printAsOperand(O, SlotTracker);

  // Sub prints as a decrement, Add as an increment.
  if (Opcode == Instruction::Sub)
    O << ", dec: ";
  else {
    assert(Opcode == Instruction::Add);
    O << ", inc: ";
  }
  getOperand(1)->printAsOperand(O, SlotTracker);

  if (VPValue *Mask = getMask()) {
    O << ", mask: ";
    Mask->printAsOperand(O, SlotTracker);
  }
}
| 2068 | #endif |
| 2069 | |
| 2070 | VPIRFlags::FastMathFlagsTy::FastMathFlagsTy(const FastMathFlags &FMF) { |
| 2071 | AllowReassoc = FMF.allowReassoc(); |
| 2072 | NoNaNs = FMF.noNaNs(); |
| 2073 | NoInfs = FMF.noInfs(); |
| 2074 | NoSignedZeros = FMF.noSignedZeros(); |
| 2075 | AllowReciprocal = FMF.allowReciprocal(); |
| 2076 | AllowContract = FMF.allowContract(); |
| 2077 | ApproxFunc = FMF.approxFunc(); |
| 2078 | } |
| 2079 | |
// Return the conservative ("all off") flag set appropriate for \p Opcode:
// each opcode family maps to its flag category with every flag cleared.
// Opcodes whose flags carry required information (compares, reduction
// results) must be given explicit flags and assert here.
VPIRFlags VPIRFlags::getDefaultFlags(unsigned Opcode) {
  switch (Opcode) {
  case Instruction::Add:
  case Instruction::Sub:
  case Instruction::Mul:
  case Instruction::Shl:
  case VPInstruction::CanonicalIVIncrementForPart:
    // No nuw, no nsw.
    return WrapFlagsTy(false, false);
  case Instruction::Trunc:
    return TruncFlagsTy(false, false);
  case Instruction::Or:
    return DisjointFlagsTy(false);
  case Instruction::AShr:
  case Instruction::LShr:
  case Instruction::UDiv:
  case Instruction::SDiv:
    return ExactFlagsTy(false);
  case Instruction::GetElementPtr:
  case VPInstruction::PtrAdd:
  case VPInstruction::WidePtrAdd:
    return GEPNoWrapFlags::none();
  case Instruction::ZExt:
  case Instruction::UIToFP:
    return NonNegFlagsTy(false);
  case Instruction::FAdd:
  case Instruction::FSub:
  case Instruction::FMul:
  case Instruction::FDiv:
  case Instruction::FRem:
  case Instruction::FNeg:
  case Instruction::FPExt:
  case Instruction::FPTrunc:
    // Default-constructed FastMathFlags has all FMF bits off.
    return FastMathFlags();
  case Instruction::ICmp:
  case Instruction::FCmp:
  case VPInstruction::ComputeReductionResult:
    // A predicate (or recurrence kind) is mandatory; there is no sensible
    // default.
    llvm_unreachable("opcode requires explicit flags");
  default:
    return VPIRFlags();
  }
}
| 2121 | |
| 2122 | #if !defined(NDEBUG) |
| 2123 | bool VPIRFlags::flagsValidForOpcode(unsigned Opcode) const { |
| 2124 | switch (OpType) { |
| 2125 | case OperationType::OverflowingBinOp: |
| 2126 | return Opcode == Instruction::Add || Opcode == Instruction::Sub || |
| 2127 | Opcode == Instruction::Mul || Opcode == Instruction::Shl || |
| 2128 | Opcode == VPInstruction::VPInstruction::CanonicalIVIncrementForPart; |
| 2129 | case OperationType::Trunc: |
| 2130 | return Opcode == Instruction::Trunc; |
| 2131 | case OperationType::DisjointOp: |
| 2132 | return Opcode == Instruction::Or; |
| 2133 | case OperationType::PossiblyExactOp: |
| 2134 | return Opcode == Instruction::AShr || Opcode == Instruction::LShr || |
| 2135 | Opcode == Instruction::UDiv || Opcode == Instruction::SDiv; |
| 2136 | case OperationType::GEPOp: |
| 2137 | return Opcode == Instruction::GetElementPtr || |
| 2138 | Opcode == VPInstruction::PtrAdd || |
| 2139 | Opcode == VPInstruction::WidePtrAdd; |
| 2140 | case OperationType::FPMathOp: |
| 2141 | return Opcode == Instruction::Call || Opcode == Instruction::FAdd || |
| 2142 | Opcode == Instruction::FMul || Opcode == Instruction::FSub || |
| 2143 | Opcode == Instruction::FNeg || Opcode == Instruction::FDiv || |
| 2144 | Opcode == Instruction::FRem || Opcode == Instruction::FPExt || |
| 2145 | Opcode == Instruction::FPTrunc || Opcode == Instruction::PHI || |
| 2146 | Opcode == Instruction::Select || |
| 2147 | Opcode == VPInstruction::WideIVStep || |
| 2148 | Opcode == VPInstruction::ReductionStartVector; |
| 2149 | case OperationType::FCmp: |
| 2150 | return Opcode == Instruction::FCmp; |
| 2151 | case OperationType::NonNegOp: |
| 2152 | return Opcode == Instruction::ZExt || Opcode == Instruction::UIToFP; |
| 2153 | case OperationType::Cmp: |
| 2154 | return Opcode == Instruction::FCmp || Opcode == Instruction::ICmp; |
| 2155 | case OperationType::ReductionOp: |
| 2156 | return Opcode == VPInstruction::ComputeReductionResult; |
| 2157 | case OperationType::Other: |
| 2158 | return true; |
| 2159 | } |
| 2160 | llvm_unreachable("Unknown OperationType enum" ); |
| 2161 | } |
| 2162 | |
| 2163 | bool VPIRFlags::hasRequiredFlagsForOpcode(unsigned Opcode) const { |
| 2164 | // Handle opcodes without default flags. |
| 2165 | if (Opcode == Instruction::ICmp) |
| 2166 | return OpType == OperationType::Cmp; |
| 2167 | if (Opcode == Instruction::FCmp) |
| 2168 | return OpType == OperationType::FCmp; |
| 2169 | if (Opcode == VPInstruction::ComputeReductionResult) |
| 2170 | return OpType == OperationType::ReductionOp; |
| 2171 | |
| 2172 | OperationType Required = getDefaultFlags(Opcode).OpType; |
| 2173 | return Required == OperationType::Other || Required == OpType; |
| 2174 | } |
| 2175 | #endif |
| 2176 | |
| 2177 | #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) |
/// Print the flags carried by this VPIRFlags to \p O for textual VPlan
/// output, mirroring LLVM IR flag syntax (e.g. " nuw", " exact", " nneg").
void VPIRFlags::printFlags(raw_ostream &O) const {
  switch (OpType) {
  case OperationType::Cmp:
    // Integer compares print only their predicate.
    O << " " << CmpInst::getPredicateName(getPredicate());
    break;
  case OperationType::FCmp:
    // FP compares print their predicate followed by fast-math flags.
    O << " " << CmpInst::getPredicateName(getPredicate());
    getFastMathFlags().print(O);
    break;
  case OperationType::DisjointOp:
    if (DisjointFlags.IsDisjoint)
      O << " disjoint" ;
    break;
  case OperationType::PossiblyExactOp:
    if (ExactFlags.IsExact)
      O << " exact" ;
    break;
  case OperationType::OverflowingBinOp:
    if (WrapFlags.HasNUW)
      O << " nuw" ;
    if (WrapFlags.HasNSW)
      O << " nsw" ;
    break;
  case OperationType::Trunc:
    if (TruncFlags.HasNUW)
      O << " nuw" ;
    if (TruncFlags.HasNSW)
      O << " nsw" ;
    break;
  case OperationType::FPMathOp:
    getFastMathFlags().print(O);
    break;
  case OperationType::GEPOp:
    // Print at most one of "inbounds"/"nusw"; "nuw" is independent.
    if (GEPFlags.isInBounds())
      O << " inbounds" ;
    else if (GEPFlags.hasNoUnsignedSignedWrap())
      O << " nusw" ;
    if (GEPFlags.hasNoUnsignedWrap())
      O << " nuw" ;
    break;
  case OperationType::NonNegOp:
    if (NonNegFlags.NonNeg)
      O << " nneg" ;
    break;
  case OperationType::ReductionOp: {
    // Reductions print their recurrence kind in parentheses, plus optional
    // in-loop/ordered markers, followed by fast-math flags.
    RecurKind RK = getRecurKind();
    O << " (" ;
    switch (RK) {
    case RecurKind::AnyOf:
      O << "any-of" ;
      break;
    case RecurKind::SMax:
      O << "smax" ;
      break;
    case RecurKind::SMin:
      O << "smin" ;
      break;
    case RecurKind::UMax:
      O << "umax" ;
      break;
    case RecurKind::UMin:
      O << "umin" ;
      break;
    case RecurKind::FMinNum:
      O << "fminnum" ;
      break;
    case RecurKind::FMaxNum:
      O << "fmaxnum" ;
      break;
    case RecurKind::FMinimum:
      O << "fminimum" ;
      break;
    case RecurKind::FMaximum:
      O << "fmaximum" ;
      break;
    case RecurKind::FMinimumNum:
      O << "fminimumnum" ;
      break;
    case RecurKind::FMaximumNum:
      O << "fmaximumnum" ;
      break;
    default:
      // Remaining kinds print their underlying instruction opcode name.
      O << Instruction::getOpcodeName(RecurrenceDescriptor::getOpcode(RK));
      break;
    }
    if (isReductionInLoop())
      O << ", in-loop" ;
    if (isReductionOrdered())
      O << ", ordered" ;
    O << ")" ;
    getFastMathFlags().print(O);
    break;
  }
  case OperationType::Other:
    break;
  }
  // Trailing separator before whatever the caller prints next.
  O << " " ;
}
| 2276 | #endif |
| 2277 | |
| 2278 | void VPWidenRecipe::execute(VPTransformState &State) { |
| 2279 | auto &Builder = State.Builder; |
| 2280 | switch (Opcode) { |
| 2281 | case Instruction::Call: |
| 2282 | case Instruction::Br: |
| 2283 | case Instruction::PHI: |
| 2284 | case Instruction::GetElementPtr: |
| 2285 | llvm_unreachable("This instruction is handled by a different recipe." ); |
| 2286 | case Instruction::UDiv: |
| 2287 | case Instruction::SDiv: |
| 2288 | case Instruction::SRem: |
| 2289 | case Instruction::URem: |
| 2290 | case Instruction::Add: |
| 2291 | case Instruction::FAdd: |
| 2292 | case Instruction::Sub: |
| 2293 | case Instruction::FSub: |
| 2294 | case Instruction::FNeg: |
| 2295 | case Instruction::Mul: |
| 2296 | case Instruction::FMul: |
| 2297 | case Instruction::FDiv: |
| 2298 | case Instruction::FRem: |
| 2299 | case Instruction::Shl: |
| 2300 | case Instruction::LShr: |
| 2301 | case Instruction::AShr: |
| 2302 | case Instruction::And: |
| 2303 | case Instruction::Or: |
| 2304 | case Instruction::Xor: { |
| 2305 | // Just widen unops and binops. |
| 2306 | SmallVector<Value *, 2> Ops; |
| 2307 | for (VPValue *VPOp : operands()) |
| 2308 | Ops.push_back(Elt: State.get(Def: VPOp)); |
| 2309 | |
| 2310 | Value *V = Builder.CreateNAryOp(Opc: Opcode, Ops); |
| 2311 | |
| 2312 | if (auto *VecOp = dyn_cast<Instruction>(Val: V)) { |
| 2313 | applyFlags(I&: *VecOp); |
| 2314 | applyMetadata(I&: *VecOp); |
| 2315 | } |
| 2316 | |
| 2317 | // Use this vector value for all users of the original instruction. |
| 2318 | State.set(Def: this, V); |
| 2319 | break; |
| 2320 | } |
| 2321 | case Instruction::ExtractValue: { |
| 2322 | assert(getNumOperands() == 2 && "expected single level extractvalue" ); |
| 2323 | Value *Op = State.get(Def: getOperand(N: 0)); |
| 2324 | Value * = Builder.CreateExtractValue( |
| 2325 | Agg: Op, Idxs: cast<VPConstantInt>(Val: getOperand(N: 1))->getZExtValue()); |
| 2326 | State.set(Def: this, V: Extract); |
| 2327 | break; |
| 2328 | } |
| 2329 | case Instruction::Freeze: { |
| 2330 | Value *Op = State.get(Def: getOperand(N: 0)); |
| 2331 | Value *Freeze = Builder.CreateFreeze(V: Op); |
| 2332 | State.set(Def: this, V: Freeze); |
| 2333 | break; |
| 2334 | } |
| 2335 | case Instruction::ICmp: |
| 2336 | case Instruction::FCmp: { |
| 2337 | // Widen compares. Generate vector compares. |
| 2338 | bool FCmp = Opcode == Instruction::FCmp; |
| 2339 | Value *A = State.get(Def: getOperand(N: 0)); |
| 2340 | Value *B = State.get(Def: getOperand(N: 1)); |
| 2341 | Value *C = nullptr; |
| 2342 | if (FCmp) { |
| 2343 | C = Builder.CreateFCmp(P: getPredicate(), LHS: A, RHS: B); |
| 2344 | } else { |
| 2345 | C = Builder.CreateICmp(P: getPredicate(), LHS: A, RHS: B); |
| 2346 | } |
| 2347 | if (auto *I = dyn_cast<Instruction>(Val: C)) { |
| 2348 | applyFlags(I&: *I); |
| 2349 | applyMetadata(I&: *I); |
| 2350 | } |
| 2351 | State.set(Def: this, V: C); |
| 2352 | break; |
| 2353 | } |
| 2354 | case Instruction::Select: { |
| 2355 | VPValue *CondOp = getOperand(N: 0); |
| 2356 | Value *Cond = State.get(Def: CondOp, IsScalar: vputils::isSingleScalar(VPV: CondOp)); |
| 2357 | Value *Op0 = State.get(Def: getOperand(N: 1)); |
| 2358 | Value *Op1 = State.get(Def: getOperand(N: 2)); |
| 2359 | Value *Sel = State.Builder.CreateSelect(C: Cond, True: Op0, False: Op1); |
| 2360 | State.set(Def: this, V: Sel); |
| 2361 | if (auto *I = dyn_cast<Instruction>(Val: Sel)) { |
| 2362 | if (isa<FPMathOperator>(Val: I)) |
| 2363 | applyFlags(I&: *I); |
| 2364 | applyMetadata(I&: *I); |
| 2365 | } |
| 2366 | break; |
| 2367 | } |
| 2368 | default: |
| 2369 | // This instruction is not vectorized by simple widening. |
| 2370 | LLVM_DEBUG(dbgs() << "LV: Found an unhandled opcode : " |
| 2371 | << Instruction::getOpcodeName(Opcode)); |
| 2372 | llvm_unreachable("Unhandled instruction!" ); |
| 2373 | } // end of switch. |
| 2374 | |
| 2375 | #if !defined(NDEBUG) |
| 2376 | // Verify that VPlan type inference results agree with the type of the |
| 2377 | // generated values. |
| 2378 | assert(VectorType::get(State.TypeAnalysis.inferScalarType(this), State.VF) == |
| 2379 | State.get(this)->getType() && |
| 2380 | "inferred type and type from generated instructions do not match" ); |
| 2381 | #endif |
| 2382 | } |
| 2383 | |
/// Compute the cost of the widened operation at vectorization factor \p VF by
/// delegating to the shared opcode-based cost helper. Opcodes not handled by
/// VPWidenRecipe::execute are unreachable.
InstructionCost VPWidenRecipe::computeCost(ElementCount VF,
                                           VPCostContext &Ctx) const {
  switch (Opcode) {
  case Instruction::UDiv:
  case Instruction::SDiv:
  case Instruction::SRem:
  case Instruction::URem:
    // If the div/rem operation isn't safe to speculate and requires
    // predication, then the only way we can even create a vplan is to insert
    // a select on the second input operand to ensure we use the value of 1
    // for the inactive lanes. The select will be costed separately.
  case Instruction::FNeg:
  case Instruction::Add:
  case Instruction::FAdd:
  case Instruction::Sub:
  case Instruction::FSub:
  case Instruction::Mul:
  case Instruction::FMul:
  case Instruction::FDiv:
  case Instruction::FRem:
  case Instruction::Shl:
  case Instruction::LShr:
  case Instruction::AShr:
  case Instruction::And:
  case Instruction::Or:
  case Instruction::Xor:
  case Instruction::Freeze:
  case Instruction::ExtractValue:
  case Instruction::ICmp:
  case Instruction::FCmp:
  case Instruction::Select:
    return getCostForRecipeWithOpcode(Opcode: getOpcode(), VF, Ctx);
  default:
    llvm_unreachable("Unsupported opcode for instruction" );
  }
}
| 2420 | |
| 2421 | #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) |
/// Print the recipe in the form "WIDEN <result> = <opcode><flags> <operands>".
void VPWidenRecipe::printRecipe(raw_ostream &O, const Twine &Indent,
                                VPSlotTracker &SlotTracker) const {
  O << Indent << "WIDEN " ;
  printAsOperand(O, SlotTracker);
  O << " = " << Instruction::getOpcodeName(Opcode);
  printFlags(O);
  printOperands(O, SlotTracker);
}
| 2430 | #endif |
| 2431 | |
| 2432 | void VPWidenCastRecipe::execute(VPTransformState &State) { |
| 2433 | auto &Builder = State.Builder; |
| 2434 | /// Vectorize casts. |
| 2435 | assert(State.VF.isVector() && "Not vectorizing?" ); |
| 2436 | Type *DestTy = VectorType::get(ElementType: getResultType(), EC: State.VF); |
| 2437 | VPValue *Op = getOperand(N: 0); |
| 2438 | Value *A = State.get(Def: Op); |
| 2439 | Value *Cast = Builder.CreateCast(Op: Instruction::CastOps(Opcode), V: A, DestTy); |
| 2440 | State.set(Def: this, V: Cast); |
| 2441 | if (auto *CastOp = dyn_cast<Instruction>(Val: Cast)) { |
| 2442 | applyFlags(I&: *CastOp); |
| 2443 | applyMetadata(I&: *CastOp); |
| 2444 | } |
| 2445 | } |
| 2446 | |
| 2447 | InstructionCost VPWidenCastRecipe::computeCost(ElementCount VF, |
| 2448 | VPCostContext &Ctx) const { |
| 2449 | // TODO: In some cases, VPWidenCastRecipes are created but not considered in |
| 2450 | // the legacy cost model, including truncates/extends when evaluating a |
| 2451 | // reduction in a smaller type. |
| 2452 | if (!getUnderlyingValue()) |
| 2453 | return 0; |
| 2454 | return getCostForRecipeWithOpcode(Opcode: getOpcode(), VF, Ctx); |
| 2455 | } |
| 2456 | |
| 2457 | #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) |
/// Print the recipe in the form
/// "WIDEN-CAST <result> = <opcode><flags> <operands> to <type>".
void VPWidenCastRecipe::printRecipe(raw_ostream &O, const Twine &Indent,
                                    VPSlotTracker &SlotTracker) const {
  O << Indent << "WIDEN-CAST " ;
  printAsOperand(O, SlotTracker);
  O << " = " << Instruction::getOpcodeName(Opcode);
  printFlags(O);
  printOperands(O, SlotTracker);
  O << " to " << *getResultType();
}
| 2467 | #endif |
| 2468 | |
| 2469 | InstructionCost VPHeaderPHIRecipe::(ElementCount VF, |
| 2470 | VPCostContext &Ctx) const { |
| 2471 | return Ctx.TTI.getCFInstrCost(Opcode: Instruction::PHI, CostKind: Ctx.CostKind); |
| 2472 | } |
| 2473 | |
| 2474 | #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) |
/// Print the recipe as "<result> = WIDEN-INDUCTION<flags> <operands>", with an
/// optional trailing truncation note when the induction is truncated.
void VPWidenIntOrFpInductionRecipe::printRecipe(
    raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const {
  O << Indent;
  printAsOperand(O, SlotTracker);
  O << " = WIDEN-INDUCTION" ;
  printFlags(O);
  printOperands(O, SlotTracker);

  if (auto *TI = getTruncInst())
    O << " (truncated to " << *TI->getType() << ")" ;
}
| 2486 | #endif |
| 2487 | |
| 2488 | bool VPWidenIntOrFpInductionRecipe::isCanonical() const { |
| 2489 | // The step may be defined by a recipe in the preheader (e.g. if it requires |
| 2490 | // SCEV expansion), but for the canonical induction the step is required to be |
| 2491 | // 1, which is represented as live-in. |
| 2492 | auto *StepC = dyn_cast<VPConstantInt>(Val: getStepValue()); |
| 2493 | auto *StartC = dyn_cast<VPConstantInt>(Val: getStartValue()); |
| 2494 | return StartC && StartC->isZero() && StepC && StepC->isOne() && |
| 2495 | getScalarType() == getRegion()->getCanonicalIVType(); |
| 2496 | } |
| 2497 | |
/// Materialize the derived IV as a single scalar (lane 0): the index operand
/// transformed by start value and step via emitTransformedIndex.
void VPDerivedIVRecipe::execute(VPTransformState &State) {
  assert(!State.Lane && "VPDerivedIVRecipe being replicated." );

  // Fast-math-flags propagate from the original induction instruction.
  IRBuilder<>::FastMathFlagGuard FMFG(State.Builder);
  if (FPBinOp)
    State.Builder.setFastMathFlags(FPBinOp->getFastMathFlags());

  // Both step and index are single scalars; only lane 0 is read.
  Value *Step = State.get(Def: getStepValue(), Lane: VPLane(0));
  Value *Index = State.get(Def: getOperand(N: 1), Lane: VPLane(0));
  Value *DerivedIV = emitTransformedIndex(
      B&: State.Builder, Index, StartValue: getStartValue()->getLiveInIRValue(), Step, InductionKind: Kind,
      InductionBinOp: cast_if_present<BinaryOperator>(Val: FPBinOp));
  DerivedIV->setName(Name);
  State.set(Def: this, V: DerivedIV, Lane: VPLane(0));
}
| 2514 | |
| 2515 | #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) |
/// Print the recipe as "<result> = DERIVED-IV <start> + <index> * <step>".
void VPDerivedIVRecipe::printRecipe(raw_ostream &O, const Twine &Indent,
                                    VPSlotTracker &SlotTracker) const {
  O << Indent;
  printAsOperand(O, SlotTracker);
  O << " = DERIVED-IV " ;
  getStartValue()->printAsOperand(O, SlotTracker);
  O << " + " ;
  getOperand(1)->printAsOperand(O, SlotTracker);
  O << " * " ;
  getStepValue()->printAsOperand(O, SlotTracker);
}
| 2527 | #endif |
| 2528 | |
/// Generate per-lane scalar induction steps of the form
/// BaseIV + (StartIdx0 + Lane) * Step, using integer or FP arithmetic
/// depending on the IV type, and record each lane's value in State.
void VPScalarIVStepsRecipe::execute(VPTransformState &State) {
  // Fast-math-flags propagate from the original induction instruction.
  IRBuilder<>::FastMathFlagGuard FMFG(State.Builder);
  State.Builder.setFastMathFlags(getFastMathFlags());

  /// Compute scalar induction steps. \p ScalarIV is the scalar induction
  /// variable on which to base the steps, \p Step is the size of the step.

  Value *BaseIV = State.get(Def: getOperand(N: 0), Lane: VPLane(0));
  Value *Step = State.get(Def: getStepValue(), Lane: VPLane(0));
  IRBuilderBase &Builder = State.Builder;

  // Ensure step has the same type as that of scalar IV.
  Type *BaseIVTy = BaseIV->getType()->getScalarType();
  assert(BaseIVTy == Step->getType() && "Types of BaseIV and Step must match!" );

  // We build scalar steps for both integer and floating-point induction
  // variables. Here, we determine the kind of arithmetic we will perform.
  Instruction::BinaryOps AddOp;
  Instruction::BinaryOps MulOp;
  if (BaseIVTy->isIntegerTy()) {
    AddOp = Instruction::Add;
    MulOp = Instruction::Mul;
  } else {
    // For FP inductions the add opcode comes from the original induction
    // (e.g. FAdd or FSub).
    AddOp = InductionOpcode;
    MulOp = Instruction::FMul;
  }

  // Determine the number of scalars we need to generate.
  bool FirstLaneOnly = vputils::onlyFirstLaneUsed(Def: this);
  // Compute the scalar steps and save the results in State.

  // When replicating a specific lane (State.Lane is set), generate only that
  // lane; otherwise generate lanes [0, VF) or just lane 0 if only the first
  // lane is used.
  unsigned StartLane = 0;
  unsigned EndLane = FirstLaneOnly ? 1 : State.VF.getKnownMinValue();
  if (State.Lane) {
    StartLane = State.Lane->getKnownLane();
    EndLane = StartLane + 1;
  }
  // Base index for lane 0; defaults to zero when no start index is given.
  Value *StartIdx0 = getStartIndex() ? State.get(Def: getStartIndex(), IsScalar: true)
                                     : Constant::getNullValue(Ty: BaseIVTy);

  for (unsigned Lane = StartLane; Lane < EndLane; ++Lane) {
    // It is okay if the induction variable type cannot hold the lane number,
    // we expect truncation in this case.
    Constant *LaneValue =
        BaseIVTy->isIntegerTy()
            ? ConstantInt::get(Ty: BaseIVTy, V: Lane, /*IsSigned=*/false,
                               /*ImplicitTrunc=*/true)
            : ConstantFP::get(Ty: BaseIVTy, V: Lane);
    Value *StartIdx = Builder.CreateBinOp(Opc: AddOp, LHS: StartIdx0, RHS: LaneValue);
    // The step returned by `createStepForVF` is a runtime-evaluated value
    // when VF is scalable. Otherwise, it should be folded into a Constant.
    assert((State.VF.isScalable() || isa<Constant>(StartIdx)) &&
           "Expected StartIdx to be folded to a constant when VF is not "
           "scalable" );
    auto *Mul = Builder.CreateBinOp(Opc: MulOp, LHS: StartIdx, RHS: Step);
    auto *Add = Builder.CreateBinOp(Opc: AddOp, LHS: BaseIV, RHS: Mul);
    State.set(Def: this, V: Add, Lane: VPLane(Lane));
  }
}
| 2589 | |
| 2590 | #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) |
/// Print the recipe as "<result> = SCALAR-STEPS <operands>".
void VPScalarIVStepsRecipe::printRecipe(raw_ostream &O, const Twine &Indent,
                                        VPSlotTracker &SlotTracker) const {
  O << Indent;
  printAsOperand(O, SlotTracker);
  O << " = SCALAR-STEPS " ;
  printOperands(O, SlotTracker);
}
| 2598 | #endif |
| 2599 | |
/// Only the first lane of \p Op is used when the operand is a single scalar,
/// i.e. uniform across all lanes.
bool VPWidenGEPRecipe::usesFirstLaneOnly(const VPValue *Op) const {
  assert(is_contained(operands(), Op) && "Op must be an operand of the recipe" );
  return vputils::isSingleScalar(VPV: Op);
}
| 2604 | |
/// Generate a widened GEP producing a vector of pointers, using scalar
/// (lane-0) values for loop-invariant operands to keep the IR compact.
void VPWidenGEPRecipe::execute(VPTransformState &State) {
  assert(State.VF.isVector() && "not widening" );
  // Construct a vector GEP by widening the operands of the scalar GEP as
  // necessary. We mark the vector GEP 'inbounds' if appropriate. A GEP
  // results in a vector of pointers when at least one operand of the GEP
  // is vector-typed. Thus, to keep the representation compact, we only use
  // vector-typed operands for loop-varying values.

  bool AllOperandsAreInvariant = all_of(Range: operands(), P: [](VPValue *Op) {
    return Op->isDefinedOutsideLoopRegions();
  });
  if (AllOperandsAreInvariant) {
    // If we are vectorizing, but the GEP has only loop-invariant operands,
    // the GEP we build (by only using vector-typed operands for
    // loop-varying values) would be a scalar pointer. Thus, to ensure we
    // produce a vector of pointers, we need to either arbitrarily pick an
    // operand to broadcast, or broadcast a clone of the original GEP.
    // Here, we broadcast a clone of the original.

    // Gather lane-0 (scalar) values of all operands.
    SmallVector<Value *> Ops;
    for (unsigned I = 0, E = getNumOperands(); I != E; I++)
      Ops.push_back(Elt: State.get(Def: getOperand(N: I), Lane: VPLane(0)));

    auto *NewGEP =
        State.Builder.CreateGEP(Ty: getSourceElementType(), Ptr: Ops[0], IdxList: drop_begin(RangeOrContainer&: Ops),
                                Name: "" , NW: getGEPNoWrapFlags());
    // Broadcast the scalar GEP to all lanes.
    Value *Splat = State.Builder.CreateVectorSplat(EC: State.VF, V: NewGEP);
    State.set(Def: this, V: Splat);
    return;
  }

  // If the GEP has at least one loop-varying operand, we are sure to
  // produce a vector of pointers unless VF is scalar.
  // The pointer operand of the new GEP. If it's loop-invariant, we
  // won't broadcast it.
  auto *Ptr = State.get(Def: getOperand(N: 0), IsScalar: isPointerLoopInvariant());

  // Collect all the indices for the new GEP. If any index is
  // loop-invariant, we won't broadcast it.
  SmallVector<Value *, 4> Indices;
  for (unsigned I = 1, E = getNumOperands(); I < E; I++) {
    VPValue *Operand = getOperand(N: I);
    Indices.push_back(Elt: State.get(Def: Operand, IsScalar: isIndexLoopInvariant(I: I - 1)));
  }

  // Create the new GEP. Note that this GEP may be a scalar if VF == 1,
  // but it should be a vector, otherwise.
  auto *NewGEP = State.Builder.CreateGEP(Ty: getSourceElementType(), Ptr, IdxList: Indices,
                                         Name: "" , NW: getGEPNoWrapFlags());
  assert((State.VF.isScalar() || NewGEP->getType()->isVectorTy()) &&
         "NewGEP is not a pointer vector" );
  State.set(Def: this, V: NewGEP);
}
| 2658 | |
| 2659 | #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) |
/// Print the recipe as "WIDEN-GEP Inv|Var[Inv|Var]... <result> =
/// getelementptr<flags> <operands>", annotating pointer and each index as
/// loop-invariant or varying.
void VPWidenGEPRecipe::printRecipe(raw_ostream &O, const Twine &Indent,
                                   VPSlotTracker &SlotTracker) const {
  O << Indent << "WIDEN-GEP " ;
  O << (isPointerLoopInvariant() ? "Inv" : "Var" );
  for (size_t I = 0; I < getNumOperands() - 1; ++I)
    O << "[" << (isIndexLoopInvariant(I) ? "Inv" : "Var" ) << "]" ;

  O << " " ;
  printAsOperand(O, SlotTracker);
  O << " = getelementptr" ;
  printFlags(O);
  printOperands(O, SlotTracker);
}
| 2673 | #endif |
| 2674 | |
/// Materialize the offset operand for part \p Part as VPlan recipes:
/// Offset = Stride * (VF - 1) + Part * Stride * VF, computed in the pointer's
/// index type, and append it as a new operand.
void VPVectorEndPointerRecipe::materializeOffset(unsigned Part) {
  assert(!getOffset() && "Unexpected offset operand" );
  VPBuilder Builder(this);
  VPlan &Plan = *getParent()->getPlan();
  VPValue *VFVal = getVFValue();
  VPTypeAnalysis TypeInfo(Plan);
  const DataLayout &DL =
      Plan.getScalarHeader()->getIRBasicBlock()->getDataLayout();
  // All arithmetic is done in the index type of the pointer operand.
  Type *IndexTy = DL.getIndexType(PtrTy: TypeInfo.inferScalarType(V: getPointer()));
  VPValue *Stride =
      Plan.getConstantInt(Ty: IndexTy, Val: getStride(), /*IsSigned=*/true);
  Type *VFTy = TypeInfo.inferScalarType(V: VFVal);
  // VF may have a different width; adjust it to the index type.
  VPValue *VF = Builder.createScalarZExtOrTrunc(Op: VFVal, ResultTy: IndexTy, SrcTy: VFTy,
                                                DL: DebugLoc::getUnknown());

  // Offset for Part0 = Offset0 = Stride * (VF - 1).
  VPInstruction *VFMinusOne =
      Builder.createSub(LHS: VF, RHS: Plan.getConstantInt(Ty: IndexTy, Val: 1u),
                        DL: DebugLoc::getUnknown(), Name: "" , WrapFlags: {true, true});
  VPInstruction *Offset0 =
      Builder.createOverflowingOp(Opcode: Instruction::Mul, Operands: {VFMinusOne, Stride});

  // Offset for PartN = Offset0 + Part * Stride * VF.
  VPValue *PartxStride =
      Plan.getConstantInt(Ty: IndexTy, Val: Part * getStride(), /*IsSigned=*/true);
  VPValue *Offset = Builder.createAdd(
      LHS: Offset0,
      RHS: Builder.createOverflowingOp(Opcode: Instruction::Mul, Operands: {PartxStride, VF}));
  addOperand(Operand: Offset);
}
| 2705 | |
| 2706 | void VPVectorEndPointerRecipe::execute(VPTransformState &State) { |
| 2707 | auto &Builder = State.Builder; |
| 2708 | assert(getOffset() && "Expected prior materialization of offset" ); |
| 2709 | Value *Ptr = State.get(Def: getPointer(), IsScalar: true); |
| 2710 | Value *Offset = State.get(Def: getOffset(), IsScalar: true); |
| 2711 | Value *ResultPtr = Builder.CreateGEP(Ty: getSourceElementType(), Ptr, IdxList: Offset, Name: "" , |
| 2712 | NW: getGEPNoWrapFlags()); |
| 2713 | State.set(Def: this, V: ResultPtr, /*IsScalar*/ true); |
| 2714 | } |
| 2715 | |
| 2716 | #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) |
/// Print the recipe as "<result> = vector-end-pointer<flags> <operands>".
void VPVectorEndPointerRecipe::printRecipe(raw_ostream &O, const Twine &Indent,
                                           VPSlotTracker &SlotTracker) const {
  O << Indent;
  printAsOperand(O, SlotTracker);
  O << " = vector-end-pointer" ;
  printFlags(O);
  printOperands(O, SlotTracker);
}
| 2725 | #endif |
| 2726 | |
| 2727 | void VPVectorPointerRecipe::execute(VPTransformState &State) { |
| 2728 | auto &Builder = State.Builder; |
| 2729 | assert(getOffset() && |
| 2730 | "Expected prior simplification of recipe without offset" ); |
| 2731 | Value *Ptr = State.get(Def: getOperand(N: 0), Lane: VPLane(0)); |
| 2732 | Value *Offset = State.get(Def: getOffset(), IsScalar: true); |
| 2733 | Value *ResultPtr = Builder.CreateGEP(Ty: getSourceElementType(), Ptr, IdxList: Offset, Name: "" , |
| 2734 | NW: getGEPNoWrapFlags()); |
| 2735 | State.set(Def: this, V: ResultPtr, /*IsScalar*/ true); |
| 2736 | } |
| 2737 | |
| 2738 | #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) |
/// Print the recipe as "<result> = vector-pointer<flags> <operands>".
void VPVectorPointerRecipe::printRecipe(raw_ostream &O, const Twine &Indent,
                                        VPSlotTracker &SlotTracker) const {
  O << Indent;
  printAsOperand(O, SlotTracker);
  O << " = vector-pointer" ;
  printFlags(O);
  printOperands(O, SlotTracker);
}
| 2747 | #endif |
| 2748 | |
| 2749 | InstructionCost VPBlendRecipe::computeCost(ElementCount VF, |
| 2750 | VPCostContext &Ctx) const { |
| 2751 | // A blend will be expanded to a select VPInstruction, which will generate a |
| 2752 | // scalar select if only the first lane is used. |
| 2753 | if (vputils::onlyFirstLaneUsed(Def: this)) |
| 2754 | VF = ElementCount::getFixed(MinVal: 1); |
| 2755 | |
| 2756 | Type *ResultTy = toVectorTy(Scalar: Ctx.Types.inferScalarType(V: this), EC: VF); |
| 2757 | Type *CmpTy = toVectorTy(Scalar: Type::getInt1Ty(C&: Ctx.Types.getContext()), EC: VF); |
| 2758 | return (getNumIncomingValues() - 1) * |
| 2759 | Ctx.TTI.getCmpSelInstrCost(Opcode: Instruction::Select, ValTy: ResultTy, CondTy: CmpTy, |
| 2760 | VecPred: CmpInst::BAD_ICMP_PREDICATE, CostKind: Ctx.CostKind); |
| 2761 | } |
| 2762 | |
| 2763 | #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) |
/// Print the recipe as "BLEND <result> =<flags> <value>[/<mask>]...". A
/// single-incoming blend prints just its value; otherwise each incoming value
/// is printed with its mask (the first mask is omitted for normalized blends).
void VPBlendRecipe::printRecipe(raw_ostream &O, const Twine &Indent,
                                VPSlotTracker &SlotTracker) const {
  O << Indent << "BLEND " ;
  printAsOperand(O, SlotTracker);
  O << " =" ;
  printFlags(O);
  if (getNumIncomingValues() == 1) {
    // Not a User of any mask: not really blending, this is a
    // single-predecessor phi.
    getIncomingValue(0)->printAsOperand(O, SlotTracker);
  } else {
    for (unsigned I = 0, E = getNumIncomingValues(); I < E; ++I) {
      if (I != 0)
        O << " " ;
      getIncomingValue(I)->printAsOperand(O, SlotTracker);
      if (I == 0 && isNormalized())
        continue;
      O << "/" ;
      getMask(I)->printAsOperand(O, SlotTracker);
    }
  }
}
| 2786 | #endif |
| 2787 | |
/// Generate IR for an ordered, partial or in-loop reduction: optionally mask
/// the vector operand with the recurrence identity, reduce it, and combine
/// with the chain value from the previous iteration.
void VPReductionRecipe::execute(VPTransformState &State) {
  assert(!State.Lane && "Reduction being replicated." );
  RecurKind Kind = getRecurrenceKind();
  assert(!RecurrenceDescriptor::isAnyOfRecurrenceKind(Kind) &&
         "In-loop AnyOf reductions aren't currently supported" );
  // Propagate the fast-math flags carried by the underlying instruction.
  IRBuilderBase::FastMathFlagGuard FMFGuard(State.Builder);
  State.Builder.setFastMathFlags(getFastMathFlags());
  Value *NewVecOp = State.get(Def: getVecOp());
  if (VPValue *Cond = getCondOp()) {
    // Predicated reduction: replace inactive lanes with the recurrence
    // identity so they do not affect the reduced value.
    Value *NewCond = State.get(Def: Cond, IsScalar: State.VF.isScalar());
    VectorType *VecTy = dyn_cast<VectorType>(Val: NewVecOp->getType());
    Type *ElementTy = VecTy ? VecTy->getElementType() : NewVecOp->getType();

    Value *Start = getRecurrenceIdentity(K: Kind, Tp: ElementTy, FMF: getFastMathFlags());
    if (State.VF.isVector())
      Start = State.Builder.CreateVectorSplat(EC: VecTy->getElementCount(), V: Start);

    Value *Select = State.Builder.CreateSelect(C: NewCond, True: NewVecOp, False: Start);
    NewVecOp = Select;
  }
  Value *NewRed;
  Value *NextInChain;
  if (isOrdered()) {
    // Ordered (strict FP) reduction: chain value is a scalar folded in
    // sequence; for scalar VF this degenerates to a plain binop.
    Value *PrevInChain = State.get(Def: getChainOp(), /*IsScalar*/ true);
    if (State.VF.isVector())
      NewRed =
          createOrderedReduction(B&: State.Builder, RdxKind: Kind, Src: NewVecOp, Start: PrevInChain);
    else
      NewRed = State.Builder.CreateBinOp(
          Opc: (Instruction::BinaryOps)RecurrenceDescriptor::getOpcode(Kind),
          LHS: PrevInChain, RHS: NewVecOp);
    PrevInChain = NewRed;
    NextInChain = NewRed;
  } else if (isPartialReduction()) {
    // Partial reduction: accumulate into a (narrower) vector chain via the
    // vector.partial.reduce.{add,fadd} intrinsics.
    assert((Kind == RecurKind::Add || Kind == RecurKind::FAdd) &&
           "Unexpected partial reduction kind" );
    Value *PrevInChain = State.get(Def: getChainOp(), /*IsScalar*/ false);
    NewRed = State.Builder.CreateIntrinsic(
        RetTy: PrevInChain->getType(),
        ID: Kind == RecurKind::Add ? Intrinsic::vector_partial_reduce_add
                                : Intrinsic::vector_partial_reduce_fadd,
        Args: {PrevInChain, NewVecOp}, FMFSource: State.Builder.getFastMathFlags(),
        Name: "partial.reduce" );
    PrevInChain = NewRed;
    NextInChain = NewRed;
  } else {
    // In-loop reduction: reduce the vector to a scalar, then combine with the
    // scalar chain value (min/max via a select, otherwise a binop).
    assert(isInLoop() &&
           "The reduction must either be ordered, partial or in-loop" );
    Value *PrevInChain = State.get(Def: getChainOp(), /*IsScalar*/ true);
    NewRed = createSimpleReduction(B&: State.Builder, Src: NewVecOp, RdxKind: Kind);
    if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind))
      NextInChain = createMinMaxOp(Builder&: State.Builder, RK: Kind, Left: NewRed, Right: PrevInChain);
    else
      NextInChain = State.Builder.CreateBinOp(
          Opc: (Instruction::BinaryOps)RecurrenceDescriptor::getOpcode(Kind),
          LHS: PrevInChain, RHS: NewRed);
  }
  State.set(Def: this, V: NextInChain, /*IsScalar*/ !isPartialReduction());
}
| 2848 | |
// Generate a vector-predicated (EVL-based) reduction: reduce the vector
// operand under a mask and explicit vector length, then combine the result
// with the incoming scalar chain value.
void VPReductionEVLRecipe::execute(VPTransformState &State) {
  assert(!State.Lane && "Reduction being replicated." );

  auto &Builder = State.Builder;
  // Propagate the fast-math flags carried by the underlying instruction.
  IRBuilderBase::FastMathFlagGuard FMFGuard(Builder);
  Builder.setFastMathFlags(getFastMathFlags());

  RecurKind Kind = getRecurrenceKind();
  // The chain value is a single scalar; the EVL is taken from lane 0 of its
  // operand.
  Value *Prev = State.get(Def: getChainOp(), /*IsScalar*/ true);
  Value *VecOp = State.get(Def: getVecOp());
  Value *EVL = State.get(Def: getEVL(), Lane: VPLane(0));

  // Use the condition operand as the reduction mask when present; otherwise
  // reduce all lanes via a splatted all-true mask.
  Value *Mask;
  if (VPValue *CondOp = getCondOp())
    Mask = State.get(Def: CondOp);
  else
    Mask = Builder.CreateVectorSplat(EC: State.VF, V: Builder.getTrue());

  Value *NewRed;
  if (isOrdered()) {
    // Ordered (strict) reductions fold the previous chain value into the
    // reduction itself to preserve evaluation order.
    NewRed = createOrderedReduction(B&: Builder, RdxKind: Kind, Src: VecOp, Start: Prev, Mask, EVL);
  } else {
    // Unordered: reduce the vector first, then combine with the chain value
    // using a min/max op or the reduction's binary opcode.
    NewRed = createSimpleReduction(B&: Builder, Src: VecOp, RdxKind: Kind, Mask, EVL);
    if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind))
      NewRed = createMinMaxOp(Builder, RK: Kind, Left: NewRed, Right: Prev);
    else
      NewRed = Builder.CreateBinOp(
          Opc: (Instruction::BinaryOps)RecurrenceDescriptor::getOpcode(Kind), LHS: NewRed,
          RHS: Prev);
  }
  // The result of the reduction is a single scalar value.
  State.set(Def: this, V: NewRed, /*IsScalar*/ true);
}
| 2882 | |
// Compute the cost of this reduction recipe for vectorization factor \p VF,
// dispatching to the TTI hook matching the reduction's flavor (partial,
// min/max, or plain arithmetic reduction).
InstructionCost VPReductionRecipe::computeCost(ElementCount VF,
                                               VPCostContext &Ctx) const {
  RecurKind RdxKind = getRecurrenceKind();
  Type *ElementTy = Ctx.Types.inferScalarType(V: this);
  auto *VectorTy = cast<VectorType>(Val: toVectorTy(Scalar: ElementTy, EC: VF));
  unsigned Opcode = RecurrenceDescriptor::getOpcode(Kind: RdxKind);
  FastMathFlags FMFs = getFastMathFlags();
  // Fast-math flags are only meaningful for floating-point reductions.
  std::optional<FastMathFlags> OptionalFMF =
      ElementTy->isFloatingPointTy() ? std::make_optional(t&: FMFs) : std::nullopt;

  if (isPartialReduction()) {
    // A conditional partial reduction additionally materializes a select
    // between the vector operand and the reduction identity; account for it.
    InstructionCost CondCost = 0;
    if (isConditional()) {
      CmpInst::Predicate Pred = CmpInst::BAD_ICMP_PREDICATE;
      auto *CondTy = cast<VectorType>(
          Val: toVectorTy(Scalar: Ctx.Types.inferScalarType(V: getCondOp()), EC: VF));
      CondCost = Ctx.TTI.getCmpSelInstrCost(Opcode: Instruction::Select, ValTy: VectorTy,
                                            CondTy, VecPred: Pred, CostKind: Ctx.CostKind);
    }
    return CondCost + Ctx.TTI.getPartialReductionCost(
                          Opcode, InputTypeA: ElementTy, InputTypeB: ElementTy, AccumType: ElementTy, VF,
                          OpAExtend: TTI::PR_None, OpBExtend: TTI::PR_None, BinOp: {}, CostKind: Ctx.CostKind,
                          FMF: OptionalFMF);
  }

  // TODO: Support any-of reductions.
  assert(
      (!RecurrenceDescriptor::isAnyOfRecurrenceKind(RdxKind) ||
       ForceTargetInstructionCost.getNumOccurrences() > 0) &&
      "Any-of reduction not implemented in VPlan-based cost model currently." );

  // Note that TTI should model the cost of moving result to the scalar register
  // and the BinOp cost in the getMinMaxReductionCost().
  if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind: RdxKind)) {
    Intrinsic::ID Id = getMinMaxReductionIntrinsicOp(RK: RdxKind);
    return Ctx.TTI.getMinMaxReductionCost(IID: Id, Ty: VectorTy, FMF: FMFs, CostKind: Ctx.CostKind);
  }

  // Note that TTI should model the cost of moving result to the scalar register
  // and the BinOp cost in the getArithmeticReductionCost().
  return Ctx.TTI.getArithmeticReductionCost(Opcode, Ty: VectorTy, FMF: OptionalFMF,
                                            CostKind: Ctx.CostKind);
}
| 2926 | |
// Bundle \p ExpressionRecipes into a single composite recipe. The bundled
// recipes are detached from their parent blocks, external users of interior
// results get clones, and operands defined outside the expression are routed
// through placeholder values so the bundle behaves as one opaque recipe.
VPExpressionRecipe::VPExpressionRecipe(
    ExpressionTypes ExpressionType,
    ArrayRef<VPSingleDefRecipe *> ExpressionRecipes)
    : VPSingleDefRecipe(VPRecipeBase::VPExpressionSC, {}, {}),
      ExpressionRecipes(ExpressionRecipes), ExpressionType(ExpressionType) {
  assert(!ExpressionRecipes.empty() && "Nothing to combine?" );
  assert(
      none_of(ExpressionRecipes,
              [](VPSingleDefRecipe *R) { return R->mayHaveSideEffects(); }) &&
      "expression cannot contain recipes with side-effects" );

  // Maintain a copy of the expression recipes as a set of users.
  SmallPtrSet<VPUser *, 4> ExpressionRecipesAsSetOfUsers;
  for (auto *R : ExpressionRecipes)
    ExpressionRecipesAsSetOfUsers.insert(Ptr: R);

  // Recipes in the expression, except the last one, must only be used by
  // (other) recipes inside the expression. If there are other users, external
  // to the expression, use a clone of the recipe for external users.
  for (VPSingleDefRecipe *R : reverse(C&: ExpressionRecipes)) {
    if (R != ExpressionRecipes.back() &&
        any_of(Range: R->users(), P: [&ExpressionRecipesAsSetOfUsers](VPUser *U) {
          return !ExpressionRecipesAsSetOfUsers.contains(Ptr: U);
        })) {
      // There are users outside of the expression. Clone the recipe and use the
      // clone for those external users.
      VPSingleDefRecipe *CopyForExtUsers = R->clone();
      R->replaceUsesWithIf(New: CopyForExtUsers, ShouldReplace: [&ExpressionRecipesAsSetOfUsers](
                                                  VPUser &U, unsigned) {
        return !ExpressionRecipesAsSetOfUsers.contains(Ptr: &U);
      });
      CopyForExtUsers->insertBefore(InsertPos: R);
    }
    // Detach the recipe from its block; it now lives only inside the bundle.
    if (R->getParent())
      R->removeFromParent();
  }

  // Internalize all external operands to the expression recipes. To do so,
  // create new temporary VPValues for all operands defined by a recipe outside
  // the expression. The original operands are added as operands of the
  // VPExpressionRecipe itself.
  for (auto *R : ExpressionRecipes) {
    for (const auto &[Idx, Op] : enumerate(First: R->operands())) {
      auto *Def = Op->getDefiningRecipe();
      if (Def && ExpressionRecipesAsSetOfUsers.contains(Ptr: Def))
        continue;
      addOperand(Operand: Op);
      LiveInPlaceholders.push_back(Elt: new VPSymbolicValue());
    }
  }

  // Replace each external operand with the first one created for it in
  // LiveInPlaceholders.
  for (auto *R : ExpressionRecipes)
    for (auto const &[LiveIn, Tmp] : zip(t: operands(), u&: LiveInPlaceholders))
      R->replaceUsesOfWith(From: LiveIn, To: Tmp);
}
| 2984 | |
| 2985 | void VPExpressionRecipe::decompose() { |
| 2986 | for (auto *R : ExpressionRecipes) |
| 2987 | // Since the list could contain duplicates, make sure the recipe hasn't |
| 2988 | // already been inserted. |
| 2989 | if (!R->getParent()) |
| 2990 | R->insertBefore(InsertPos: this); |
| 2991 | |
| 2992 | for (const auto &[Idx, Op] : enumerate(First: operands())) |
| 2993 | LiveInPlaceholders[Idx]->replaceAllUsesWith(New: Op); |
| 2994 | |
| 2995 | replaceAllUsesWith(New: ExpressionRecipes.back()); |
| 2996 | ExpressionRecipes.clear(); |
| 2997 | } |
| 2998 | |
| 2999 | InstructionCost VPExpressionRecipe::computeCost(ElementCount VF, |
| 3000 | VPCostContext &Ctx) const { |
| 3001 | Type *RedTy = Ctx.Types.inferScalarType(V: this); |
| 3002 | auto *SrcVecTy = cast<VectorType>( |
| 3003 | Val: toVectorTy(Scalar: Ctx.Types.inferScalarType(V: getOperand(N: 0)), EC: VF)); |
| 3004 | unsigned Opcode = RecurrenceDescriptor::getOpcode( |
| 3005 | Kind: cast<VPReductionRecipe>(Val: ExpressionRecipes.back())->getRecurrenceKind()); |
| 3006 | switch (ExpressionType) { |
| 3007 | case ExpressionTypes::ExtendedReduction: { |
| 3008 | unsigned Opcode = RecurrenceDescriptor::getOpcode( |
| 3009 | Kind: cast<VPReductionRecipe>(Val: ExpressionRecipes[1])->getRecurrenceKind()); |
| 3010 | auto *ExtR = cast<VPWidenCastRecipe>(Val: ExpressionRecipes[0]); |
| 3011 | auto *RedR = cast<VPReductionRecipe>(Val: ExpressionRecipes.back()); |
| 3012 | |
| 3013 | if (RedR->isPartialReduction()) |
| 3014 | return Ctx.TTI.getPartialReductionCost( |
| 3015 | Opcode, InputTypeA: Ctx.Types.inferScalarType(V: getOperand(N: 0)), InputTypeB: nullptr, AccumType: RedTy, VF, |
| 3016 | OpAExtend: TargetTransformInfo::getPartialReductionExtendKind(CastOpc: ExtR->getOpcode()), |
| 3017 | OpBExtend: TargetTransformInfo::PR_None, BinOp: std::nullopt, CostKind: Ctx.CostKind, |
| 3018 | FMF: RedTy->isFloatingPointTy() ? std::optional{RedR->getFastMathFlags()} |
| 3019 | : std::nullopt); |
| 3020 | else if (!RedTy->isFloatingPointTy()) |
| 3021 | // TTI::getExtendedReductionCost only supports integer types. |
| 3022 | return Ctx.TTI.getExtendedReductionCost( |
| 3023 | Opcode, IsUnsigned: ExtR->getOpcode() == Instruction::ZExt, ResTy: RedTy, Ty: SrcVecTy, |
| 3024 | FMF: std::nullopt, CostKind: Ctx.CostKind); |
| 3025 | else |
| 3026 | return InstructionCost::getInvalid(); |
| 3027 | } |
| 3028 | case ExpressionTypes::MulAccReduction: |
| 3029 | return Ctx.TTI.getMulAccReductionCost(IsUnsigned: false, RedOpcode: Opcode, ResTy: RedTy, Ty: SrcVecTy, |
| 3030 | CostKind: Ctx.CostKind); |
| 3031 | |
| 3032 | case ExpressionTypes::ExtNegatedMulAccReduction: |
| 3033 | assert(Opcode == Instruction::Add && "Unexpected opcode" ); |
| 3034 | Opcode = Instruction::Sub; |
| 3035 | [[fallthrough]]; |
| 3036 | case ExpressionTypes::ExtMulAccReduction: { |
| 3037 | auto *RedR = cast<VPReductionRecipe>(Val: ExpressionRecipes.back()); |
| 3038 | if (RedR->isPartialReduction()) { |
| 3039 | auto *Ext0R = cast<VPWidenCastRecipe>(Val: ExpressionRecipes[0]); |
| 3040 | auto *Ext1R = cast<VPWidenCastRecipe>(Val: ExpressionRecipes[1]); |
| 3041 | auto *Mul = cast<VPWidenRecipe>(Val: ExpressionRecipes[2]); |
| 3042 | return Ctx.TTI.getPartialReductionCost( |
| 3043 | Opcode, InputTypeA: Ctx.Types.inferScalarType(V: getOperand(N: 0)), |
| 3044 | InputTypeB: Ctx.Types.inferScalarType(V: getOperand(N: 1)), AccumType: RedTy, VF, |
| 3045 | OpAExtend: TargetTransformInfo::getPartialReductionExtendKind( |
| 3046 | CastOpc: Ext0R->getOpcode()), |
| 3047 | OpBExtend: TargetTransformInfo::getPartialReductionExtendKind( |
| 3048 | CastOpc: Ext1R->getOpcode()), |
| 3049 | BinOp: Mul->getOpcode(), CostKind: Ctx.CostKind, |
| 3050 | FMF: RedTy->isFloatingPointTy() ? std::optional{RedR->getFastMathFlags()} |
| 3051 | : std::nullopt); |
| 3052 | } |
| 3053 | return Ctx.TTI.getMulAccReductionCost( |
| 3054 | IsUnsigned: cast<VPWidenCastRecipe>(Val: ExpressionRecipes.front())->getOpcode() == |
| 3055 | Instruction::ZExt, |
| 3056 | RedOpcode: Opcode, ResTy: RedTy, Ty: SrcVecTy, CostKind: Ctx.CostKind); |
| 3057 | } |
| 3058 | } |
| 3059 | llvm_unreachable("Unknown VPExpressionRecipe::ExpressionTypes enum" ); |
| 3060 | } |
| 3061 | |
| 3062 | bool VPExpressionRecipe::mayReadOrWriteMemory() const { |
| 3063 | return any_of(Range: ExpressionRecipes, P: [](VPSingleDefRecipe *R) { |
| 3064 | return R->mayReadFromMemory() || R->mayWriteToMemory(); |
| 3065 | }); |
| 3066 | } |
| 3067 | |
| 3068 | bool VPExpressionRecipe::mayHaveSideEffects() const { |
| 3069 | assert( |
| 3070 | none_of(ExpressionRecipes, |
| 3071 | [](VPSingleDefRecipe *R) { return R->mayHaveSideEffects(); }) && |
| 3072 | "expression cannot contain recipes with side-effects" ); |
| 3073 | return false; |
| 3074 | } |
| 3075 | |
| 3076 | bool VPExpressionRecipe::isSingleScalar() const { |
| 3077 | // Cannot use vputils::isSingleScalar(), because all external operands |
| 3078 | // of the expression will be live-ins while bundled. |
| 3079 | auto *RR = dyn_cast<VPReductionRecipe>(Val: ExpressionRecipes.back()); |
| 3080 | return RR && !RR->isPartialReduction(); |
| 3081 | } |
| 3082 | |
| 3083 | #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) |
| 3084 | |
// Debug-print the bundled expression in a flattened, human-readable form,
// with one layout per expression type.
void VPExpressionRecipe::printRecipe(raw_ostream &O, const Twine &Indent,
                                     VPSlotTracker &SlotTracker) const {
  O << Indent << "EXPRESSION " ;
  printAsOperand(O, SlotTracker);
  O << " = " ;
  auto *Red = cast<VPReductionRecipe>(ExpressionRecipes.back());
  unsigned Opcode = RecurrenceDescriptor::getOpcode(Red->getRecurrenceKind());

  switch (ExpressionType) {
  case ExpressionTypes::ExtendedReduction: {
    // <chain> + reduce.<op> (<vec> <ext> to <ty>[, <cond>])
    getOperand(1)->printAsOperand(O, SlotTracker);
    O << " + " << (Red->isPartialReduction() ? "partial." : "" ) << "reduce." ;
    O << Instruction::getOpcodeName(Opcode) << " (" ;
    getOperand(0)->printAsOperand(O, SlotTracker);
    Red->printFlags(O);

    auto *Ext0 = cast<VPWidenCastRecipe>(ExpressionRecipes[0]);
    O << Instruction::getOpcodeName(Ext0->getOpcode()) << " to "
      << *Ext0->getResultType();
    if (Red->isConditional()) {
      O << ", " ;
      Red->getCondOp()->printAsOperand(O, SlotTracker);
    }
    O << ")" ;
    break;
  }
  case ExpressionTypes::ExtNegatedMulAccReduction: {
    // <chain> + reduce.<op> (sub (0, mul (<a> ext to <ty>), (<b> ext to
    // <ty>)[, <cond>]))
    getOperand(getNumOperands() - 1)->printAsOperand(O, SlotTracker);
    O << " + " << (Red->isPartialReduction() ? "partial." : "" ) << "reduce." ;
    O << Instruction::getOpcodeName(
             RecurrenceDescriptor::getOpcode(Red->getRecurrenceKind()))
      << " (sub (0, mul" ;
    auto *Mul = cast<VPWidenRecipe>(ExpressionRecipes[2]);
    Mul->printFlags(O);
    O << "(" ;
    getOperand(0)->printAsOperand(O, SlotTracker);
    auto *Ext0 = cast<VPWidenCastRecipe>(ExpressionRecipes[0]);
    O << " " << Instruction::getOpcodeName(Ext0->getOpcode()) << " to "
      << *Ext0->getResultType() << "), (" ;
    getOperand(1)->printAsOperand(O, SlotTracker);
    auto *Ext1 = cast<VPWidenCastRecipe>(ExpressionRecipes[1]);
    O << " " << Instruction::getOpcodeName(Ext1->getOpcode()) << " to "
      << *Ext1->getResultType() << ")" ;
    if (Red->isConditional()) {
      O << ", " ;
      Red->getCondOp()->printAsOperand(O, SlotTracker);
    }
    O << "))" ;
    break;
  }
  case ExpressionTypes::MulAccReduction:
  case ExpressionTypes::ExtMulAccReduction: {
    // <chain> + reduce.<op> (mul [(]<a>[ ext to <ty>), (]<b>[ ext to
    // <ty>)][, <cond>]) — extends only printed for ExtMulAccReduction.
    getOperand(getNumOperands() - 1)->printAsOperand(O, SlotTracker);
    O << " + " << (Red->isPartialReduction() ? "partial." : "" ) << "reduce." ;
    O << Instruction::getOpcodeName(
             RecurrenceDescriptor::getOpcode(Red->getRecurrenceKind()))
      << " (" ;
    O << "mul" ;
    bool IsExtended = ExpressionType == ExpressionTypes::ExtMulAccReduction;
    auto *Mul = cast<VPWidenRecipe>(IsExtended ? ExpressionRecipes[2]
                                               : ExpressionRecipes[0]);
    Mul->printFlags(O);
    if (IsExtended)
      O << "(" ;
    getOperand(0)->printAsOperand(O, SlotTracker);
    if (IsExtended) {
      auto *Ext0 = cast<VPWidenCastRecipe>(ExpressionRecipes[0]);
      O << " " << Instruction::getOpcodeName(Ext0->getOpcode()) << " to "
        << *Ext0->getResultType() << "), (" ;
    } else {
      O << ", " ;
    }
    getOperand(1)->printAsOperand(O, SlotTracker);
    if (IsExtended) {
      auto *Ext1 = cast<VPWidenCastRecipe>(ExpressionRecipes[1]);
      O << " " << Instruction::getOpcodeName(Ext1->getOpcode()) << " to "
        << *Ext1->getResultType() << ")" ;
    }
    if (Red->isConditional()) {
      O << ", " ;
      Red->getCondOp()->printAsOperand(O, SlotTracker);
    }
    O << ")" ;
    break;
  }
  }
}
| 3172 | |
// Debug-print as:
//   [PARTIAL-]REDUCE <res> = <chain> +<flags> reduce.<op> (<vec>[, <cond>])
void VPReductionRecipe::printRecipe(raw_ostream &O, const Twine &Indent,
                                    VPSlotTracker &SlotTracker) const {
  if (isPartialReduction())
    O << Indent << "PARTIAL-REDUCE " ;
  else
    O << Indent << "REDUCE " ;
  printAsOperand(O, SlotTracker);
  O << " = " ;
  getChainOp()->printAsOperand(O, SlotTracker);
  O << " +" ;
  printFlags(O);
  O << " reduce."
    << Instruction::getOpcodeName(
           RecurrenceDescriptor::getOpcode(getRecurrenceKind()))
    << " (" ;
  getVecOp()->printAsOperand(O, SlotTracker);
  if (isConditional()) {
    O << ", " ;
    getCondOp()->printAsOperand(O, SlotTracker);
  }
  O << ")" ;
}
| 3195 | |
// Debug-print as:
//   REDUCE <res> = <chain> +<flags> vp.reduce.<op> (<vec>, <evl>[, <cond>])
void VPReductionEVLRecipe::printRecipe(raw_ostream &O, const Twine &Indent,
                                       VPSlotTracker &SlotTracker) const {
  O << Indent << "REDUCE " ;
  printAsOperand(O, SlotTracker);
  O << " = " ;
  getChainOp()->printAsOperand(O, SlotTracker);
  O << " +" ;
  printFlags(O);
  O << " vp.reduce."
    << Instruction::getOpcodeName(
           RecurrenceDescriptor::getOpcode(getRecurrenceKind()))
    << " (" ;
  getVecOp()->printAsOperand(O, SlotTracker);
  O << ", " ;
  getEVL()->printAsOperand(O, SlotTracker);
  if (isConditional()) {
    O << ", " ;
    getCondOp()->printAsOperand(O, SlotTracker);
  }
  O << ")" ;
}
| 3217 | |
| 3218 | #endif |
| 3219 | |
/// A helper function to scalarize a single Instruction in the innermost loop.
/// Generates a sequence of scalar instances for lane \p Lane. Uses the VPValue
/// operands from \p RepRecipe instead of \p Instr's operands.
static void scalarizeInstruction(const Instruction *Instr,
                                 VPReplicateRecipe *RepRecipe,
                                 const VPLane &Lane, VPTransformState &State) {
  assert((!Instr->getType()->isAggregateType() ||
          canVectorizeTy(Instr->getType())) &&
         "Expected vectorizable or non-aggregate type." );

  // Does this instruction return a value ?
  bool IsVoidRetTy = Instr->getType()->isVoidTy();

  Instruction *Cloned = Instr->clone();
  if (!IsVoidRetTy) {
    Cloned->setName(Instr->getName() + ".cloned" );
    Type *ResultTy = State.TypeAnalysis.inferScalarType(V: RepRecipe);
    // The operands of the replicate recipe may have been narrowed, resulting in
    // a narrower result type. Update the type of the cloned instruction to the
    // correct type.
    if (ResultTy != Cloned->getType())
      Cloned->mutateType(Ty: ResultTy);
  }

  // Transfer the recipe's flags and metadata onto the clone.
  RepRecipe->applyFlags(I&: *Cloned);
  RepRecipe->applyMetadata(I&: *Cloned);

  // For compares, restore the predicate stored on the recipe.
  if (RepRecipe->hasPredicate())
    cast<CmpInst>(Val: Cloned)->setPredicate(RepRecipe->getPredicate());

  if (auto DL = RepRecipe->getDebugLoc())
    State.setDebugLocFrom(DL);

  // Replace the operands of the cloned instructions with their scalar
  // equivalents in the new loop.
  for (const auto &I : enumerate(First: RepRecipe->operands())) {
    auto InputLane = Lane;
    VPValue *Operand = I.value();
    // Single-scalar operands only have a lane-0 value; always read that one.
    if (vputils::isSingleScalar(VPV: Operand))
      InputLane = VPLane::getFirstLane();
    Cloned->setOperand(i: I.index(), Val: State.get(Def: Operand, Lane: InputLane));
  }

  // Place the cloned scalar in the new loop.
  State.Builder.Insert(I: Cloned);

  State.set(Def: RepRecipe, V: Cloned, Lane);

  // If we just cloned a new assumption, add it to the assumption cache.
  if (auto *II = dyn_cast<AssumeInst>(Val: Cloned))
    State.AC->registerAssumption(CI: II);

  assert(
      (RepRecipe->getRegion() ||
       !RepRecipe->getParent()->getPlan()->getVectorLoopRegion() ||
       all_of(RepRecipe->operands(),
              [](VPValue *Op) { return Op->isDefinedOutsideLoopRegions(); })) &&
      "Expected a recipe is either within a region or all of its operands "
      "are defined outside the vectorized region." );
}
| 3280 | |
// Emit scalar instance(s) of the underlying instruction: a single instance
// when no lane is set (uniform case), otherwise the instance for the current
// lane, optionally packed back into a vector value.
void VPReplicateRecipe::execute(VPTransformState &State) {
  Instruction *UI = getUnderlyingInstr();

  // No specific lane requested: generate the single uniform scalar instance.
  if (!State.Lane) {
    assert(IsSingleScalar && "VPReplicateRecipes outside replicate regions "
                             "must have already been unrolled" );
    scalarizeInstruction(Instr: UI, RepRecipe: this, Lane: VPLane(0), State);
    return;
  }

  assert((State.VF.isScalar() || !isSingleScalar()) &&
         "uniform recipe shouldn't be predicated" );
  assert(!State.VF.isScalable() && "Can't scalarize a scalable vector" );
  scalarizeInstruction(Instr: UI, RepRecipe: this, Lane: *State.Lane, State);
  // Insert scalar instance packing it into a vector.
  if (State.VF.isVector() && shouldPack()) {
    // Start from poison on the first lane; later lanes extend the vector
    // built up by the previous lanes.
    Value *WideValue =
        State.Lane->isFirstLane()
            ? PoisonValue::get(T: toVectorizedTy(Ty: UI->getType(), EC: State.VF))
            : State.get(Def: this);
    State.set(Def: this, V: State.packScalarIntoVectorizedValue(Def: this, WideValue,
                                                      Lane: *State.Lane));
  }
}
| 3305 | |
| 3306 | bool VPReplicateRecipe::shouldPack() const { |
| 3307 | // Find if the recipe is used by a widened recipe via an intervening |
| 3308 | // VPPredInstPHIRecipe. In this case, also pack the scalar values in a vector. |
| 3309 | return any_of(Range: users(), P: [](const VPUser *U) { |
| 3310 | if (auto *PredR = dyn_cast<VPPredInstPHIRecipe>(Val: U)) |
| 3311 | return !vputils::onlyScalarValuesUsed(Def: PredR); |
| 3312 | return false; |
| 3313 | }); |
| 3314 | } |
| 3315 | |
| 3316 | /// Returns a SCEV expression for \p Ptr if it is a pointer computation for |
| 3317 | /// which the legacy cost model computes a SCEV expression when computing the |
| 3318 | /// address cost. Computing SCEVs for VPValues is incomplete and returns |
| 3319 | /// SCEVCouldNotCompute in cases the legacy cost model can compute SCEVs. In |
| 3320 | /// those cases we fall back to the legacy cost model. Otherwise return nullptr. |
| 3321 | static const SCEV *getAddressAccessSCEV(const VPValue *Ptr, |
| 3322 | PredicatedScalarEvolution &PSE, |
| 3323 | const Loop *L) { |
| 3324 | const SCEV *Addr = vputils::getSCEVExprForVPValue(V: Ptr, PSE, L); |
| 3325 | if (isa<SCEVCouldNotCompute>(Val: Addr)) |
| 3326 | return Addr; |
| 3327 | |
| 3328 | return vputils::isAddressSCEVForCost(Addr, SE&: *PSE.getSE(), L) ? Addr : nullptr; |
| 3329 | } |
| 3330 | |
/// Returns true if \p V is used as part of the address of another load or
/// store.
static bool isUsedByLoadStoreAddress(const VPUser *V) {
  SmallPtrSet<const VPUser *, 4> Seen;
  SmallVector<const VPUser *> WorkList = {V};

  // Walk the transitive users of V, looking for a memory recipe that consumes
  // a visited value as its address.
  while (!WorkList.empty()) {
    auto *Cur = dyn_cast<VPSingleDefRecipe>(Val: WorkList.pop_back_val());
    if (!Cur || !Seen.insert(Ptr: Cur).second)
      continue;

    auto *Blend = dyn_cast<VPBlendRecipe>(Val: Cur);
    // Skip blends that use V only through a compare by checking if any incoming
    // value was already visited.
    if (Blend && none_of(Range: seq<unsigned>(Begin: 0, End: Blend->getNumIncomingValues()),
                         P: [&](unsigned I) {
                           return Seen.contains(
                               Ptr: Blend->getIncomingValue(Idx: I)->getDefiningRecipe());
                         }))
      continue;

    for (VPUser *U : Cur->users()) {
      // Interleave groups: the value is an address if it is the addr operand.
      if (auto *InterleaveR = dyn_cast<VPInterleaveBase>(Val: U))
        if (InterleaveR->getAddr() == Cur)
          return true;
      if (auto *RepR = dyn_cast<VPReplicateRecipe>(Val: U)) {
        // Replicated loads take the address as operand 0, stores as operand 1.
        if (RepR->getOpcode() == Instruction::Load &&
            RepR->getOperand(N: 0) == Cur)
          return true;
        if (RepR->getOpcode() == Instruction::Store &&
            RepR->getOperand(N: 1) == Cur)
          return true;
      }
      if (auto *MemR = dyn_cast<VPWidenMemoryRecipe>(Val: U)) {
        if (MemR->getAddr() == Cur && MemR->isConsecutive())
          return true;
      }
    }

    // The legacy cost model only supports scalarization loads/stores with phi
    // addresses, if the phi is directly used as load/store address. Don't
    // traverse further for Blends.
    if (Blend)
      continue;

    append_range(C&: WorkList, R: Cur->users());
  }
  return false;
}
| 3380 | |
| 3381 | InstructionCost VPReplicateRecipe::computeCost(ElementCount VF, |
| 3382 | VPCostContext &Ctx) const { |
| 3383 | Instruction *UI = cast<Instruction>(Val: getUnderlyingValue()); |
| 3384 | // VPReplicateRecipe may be cloned as part of an existing VPlan-to-VPlan |
| 3385 | // transform, avoid computing their cost multiple times for now. |
| 3386 | Ctx.SkipCostComputation.insert(Ptr: UI); |
| 3387 | |
| 3388 | if (VF.isScalable() && !isSingleScalar()) |
| 3389 | return InstructionCost::getInvalid(); |
| 3390 | |
| 3391 | switch (UI->getOpcode()) { |
| 3392 | case Instruction::Alloca: |
| 3393 | if (VF.isScalable()) |
| 3394 | return InstructionCost::getInvalid(); |
| 3395 | return Ctx.TTI.getArithmeticInstrCost( |
| 3396 | Opcode: Instruction::Mul, Ty: Ctx.Types.inferScalarType(V: this), CostKind: Ctx.CostKind); |
| 3397 | case Instruction::GetElementPtr: |
| 3398 | // We mark this instruction as zero-cost because the cost of GEPs in |
| 3399 | // vectorized code depends on whether the corresponding memory instruction |
| 3400 | // is scalarized or not. Therefore, we handle GEPs with the memory |
| 3401 | // instruction cost. |
| 3402 | return 0; |
| 3403 | case Instruction::Call: { |
| 3404 | auto *CalledFn = |
| 3405 | cast<Function>(Val: getOperand(N: getNumOperands() - 1)->getLiveInIRValue()); |
| 3406 | |
| 3407 | SmallVector<const VPValue *> ArgOps(drop_end(RangeOrContainer: operands())); |
| 3408 | SmallVector<Type *, 4> Tys; |
| 3409 | for (const VPValue *ArgOp : ArgOps) |
| 3410 | Tys.push_back(Elt: Ctx.Types.inferScalarType(V: ArgOp)); |
| 3411 | |
| 3412 | if (CalledFn->isIntrinsic()) |
| 3413 | // Various pseudo-intrinsics with costs of 0 are scalarized instead of |
| 3414 | // vectorized via VPWidenIntrinsicRecipe. Return 0 for them early. |
| 3415 | switch (CalledFn->getIntrinsicID()) { |
| 3416 | case Intrinsic::assume: |
| 3417 | case Intrinsic::lifetime_end: |
| 3418 | case Intrinsic::lifetime_start: |
| 3419 | case Intrinsic::sideeffect: |
| 3420 | case Intrinsic::pseudoprobe: |
| 3421 | case Intrinsic::experimental_noalias_scope_decl: { |
| 3422 | assert(getCostForIntrinsics(CalledFn->getIntrinsicID(), ArgOps, *this, |
| 3423 | ElementCount::getFixed(1), Ctx) == 0 && |
| 3424 | "scalarizing intrinsic should be free" ); |
| 3425 | return InstructionCost(0); |
| 3426 | } |
| 3427 | default: |
| 3428 | break; |
| 3429 | } |
| 3430 | |
| 3431 | Type *ResultTy = Ctx.Types.inferScalarType(V: this); |
| 3432 | InstructionCost ScalarCallCost = |
| 3433 | Ctx.TTI.getCallInstrCost(F: CalledFn, RetTy: ResultTy, Tys, CostKind: Ctx.CostKind); |
| 3434 | if (isSingleScalar()) { |
| 3435 | if (CalledFn->isIntrinsic()) |
| 3436 | ScalarCallCost = std::min( |
| 3437 | a: ScalarCallCost, |
| 3438 | b: getCostForIntrinsics(ID: CalledFn->getIntrinsicID(), Operands: ArgOps, R: *this, |
| 3439 | VF: ElementCount::getFixed(MinVal: 1), Ctx)); |
| 3440 | return ScalarCallCost; |
| 3441 | } |
| 3442 | |
| 3443 | return ScalarCallCost * VF.getFixedValue() + |
| 3444 | Ctx.getScalarizationOverhead(ResultTy, Operands: ArgOps, VF); |
| 3445 | } |
| 3446 | case Instruction::Add: |
| 3447 | case Instruction::Sub: |
| 3448 | case Instruction::FAdd: |
| 3449 | case Instruction::FSub: |
| 3450 | case Instruction::Mul: |
| 3451 | case Instruction::FMul: |
| 3452 | case Instruction::FDiv: |
| 3453 | case Instruction::FRem: |
| 3454 | case Instruction::Shl: |
| 3455 | case Instruction::LShr: |
| 3456 | case Instruction::AShr: |
| 3457 | case Instruction::And: |
| 3458 | case Instruction::Or: |
| 3459 | case Instruction::Xor: |
| 3460 | case Instruction::ICmp: |
| 3461 | case Instruction::FCmp: |
| 3462 | return getCostForRecipeWithOpcode(Opcode: getOpcode(), VF: ElementCount::getFixed(MinVal: 1), |
| 3463 | Ctx) * |
| 3464 | (isSingleScalar() ? 1 : VF.getFixedValue()); |
| 3465 | case Instruction::SDiv: |
| 3466 | case Instruction::UDiv: |
| 3467 | case Instruction::SRem: |
| 3468 | case Instruction::URem: { |
| 3469 | InstructionCost ScalarCost = |
| 3470 | getCostForRecipeWithOpcode(Opcode: getOpcode(), VF: ElementCount::getFixed(MinVal: 1), Ctx); |
| 3471 | if (isSingleScalar()) |
| 3472 | return ScalarCost; |
| 3473 | |
| 3474 | // If any of the operands is from a different replicate region and has its |
| 3475 | // cost skipped, it may have been forced to scalar. Fall back to legacy cost |
| 3476 | // model to avoid cost mis-match. |
| 3477 | if (any_of(Range: operands(), P: [&Ctx, VF](VPValue *Op) { |
| 3478 | auto *PredR = dyn_cast<VPPredInstPHIRecipe>(Val: Op); |
| 3479 | if (!PredR) |
| 3480 | return false; |
| 3481 | return Ctx.skipCostComputation( |
| 3482 | UI: dyn_cast_or_null<Instruction>( |
| 3483 | Val: PredR->getOperand(N: 0)->getUnderlyingValue()), |
| 3484 | IsVector: VF.isVector()); |
| 3485 | })) |
| 3486 | break; |
| 3487 | |
| 3488 | ScalarCost = ScalarCost * VF.getFixedValue() + |
| 3489 | Ctx.getScalarizationOverhead(ResultTy: Ctx.Types.inferScalarType(V: this), |
| 3490 | Operands: to_vector(Range: operands()), VF); |
| 3491 | // If the recipe is not predicated (i.e. not in a replicate region), return |
| 3492 | // the scalar cost. Otherwise handle predicated cost. |
| 3493 | if (!getRegion()->isReplicator()) |
| 3494 | return ScalarCost; |
| 3495 | |
| 3496 | // Account for the phi nodes that we will create. |
| 3497 | ScalarCost += VF.getFixedValue() * |
| 3498 | Ctx.TTI.getCFInstrCost(Opcode: Instruction::PHI, CostKind: Ctx.CostKind); |
| 3499 | // Scale the cost by the probability of executing the predicated blocks. |
| 3500 | // This assumes the predicated block for each vector lane is equally |
| 3501 | // likely. |
| 3502 | ScalarCost /= Ctx.getPredBlockCostDivisor(BB: UI->getParent()); |
| 3503 | return ScalarCost; |
| 3504 | } |
| 3505 | case Instruction::Load: |
| 3506 | case Instruction::Store: { |
| 3507 | bool IsLoad = UI->getOpcode() == Instruction::Load; |
| 3508 | const VPValue *PtrOp = getOperand(N: !IsLoad); |
| 3509 | const SCEV *PtrSCEV = getAddressAccessSCEV(Ptr: PtrOp, PSE&: Ctx.PSE, L: Ctx.L); |
| 3510 | if (isa_and_nonnull<SCEVCouldNotCompute>(Val: PtrSCEV)) |
| 3511 | break; |
| 3512 | |
| 3513 | Type *ValTy = Ctx.Types.inferScalarType(V: IsLoad ? this : getOperand(N: 0)); |
| 3514 | Type *ScalarPtrTy = Ctx.Types.inferScalarType(V: PtrOp); |
| 3515 | const Align Alignment = getLoadStoreAlignment(I: UI); |
| 3516 | unsigned AS = cast<PointerType>(Val: ScalarPtrTy)->getAddressSpace(); |
| 3517 | TTI::OperandValueInfo OpInfo = TTI::getOperandInfo(V: UI->getOperand(i: 0)); |
| 3518 | bool PreferVectorizedAddressing = Ctx.TTI.prefersVectorizedAddressing(); |
| 3519 | bool UsedByLoadStoreAddress = |
| 3520 | !PreferVectorizedAddressing && isUsedByLoadStoreAddress(V: this); |
| 3521 | InstructionCost ScalarMemOpCost = Ctx.TTI.getMemoryOpCost( |
| 3522 | Opcode: UI->getOpcode(), Src: ValTy, Alignment, AddressSpace: AS, CostKind: Ctx.CostKind, OpdInfo: OpInfo, |
| 3523 | I: UsedByLoadStoreAddress ? UI : nullptr); |
| 3524 | |
| 3525 | Type *PtrTy = isSingleScalar() ? ScalarPtrTy : toVectorTy(Scalar: ScalarPtrTy, EC: VF); |
| 3526 | InstructionCost ScalarCost = |
| 3527 | ScalarMemOpCost + |
| 3528 | Ctx.TTI.getAddressComputationCost( |
| 3529 | PtrTy, SE: UsedByLoadStoreAddress ? nullptr : Ctx.PSE.getSE(), Ptr: PtrSCEV, |
| 3530 | CostKind: Ctx.CostKind); |
| 3531 | if (isSingleScalar()) |
| 3532 | return ScalarCost; |
| 3533 | |
| 3534 | SmallVector<const VPValue *> OpsToScalarize; |
| 3535 | Type *ResultTy = Type::getVoidTy(C&: PtrTy->getContext()); |
| 3536 | // Set ResultTy and OpsToScalarize, if scalarization is needed. Currently we |
| 3537 | // don't assign scalarization overhead in general, if the target prefers |
| 3538 | // vectorized addressing or the loaded value is used as part of an address |
| 3539 | // of another load or store. |
| 3540 | if (!UsedByLoadStoreAddress) { |
| 3541 | bool EfficientVectorLoadStore = |
| 3542 | Ctx.TTI.supportsEfficientVectorElementLoadStore(); |
| 3543 | if (!(IsLoad && !PreferVectorizedAddressing) && |
| 3544 | !(!IsLoad && EfficientVectorLoadStore)) |
| 3545 | append_range(C&: OpsToScalarize, R: operands()); |
| 3546 | |
| 3547 | if (!EfficientVectorLoadStore) |
| 3548 | ResultTy = Ctx.Types.inferScalarType(V: this); |
| 3549 | } |
| 3550 | |
| 3551 | TTI::VectorInstrContext VIC = |
| 3552 | IsLoad ? TTI::VectorInstrContext::Load : TTI::VectorInstrContext::Store; |
| 3553 | InstructionCost Cost = |
| 3554 | (ScalarCost * VF.getFixedValue()) + |
| 3555 | Ctx.getScalarizationOverhead(ResultTy, Operands: OpsToScalarize, VF, VIC, AlwaysIncludeReplicatingR: true); |
| 3556 | |
| 3557 | const VPRegionBlock *ParentRegion = getRegion(); |
| 3558 | if (ParentRegion && ParentRegion->isReplicator()) { |
| 3559 | // TODO: Handle loop-invariant pointers in predicated blocks. For now, |
| 3560 | // fall back to the legacy cost model. |
| 3561 | if (!PtrSCEV || Ctx.PSE.getSE()->isLoopInvariant(S: PtrSCEV, L: Ctx.L)) |
| 3562 | break; |
| 3563 | Cost /= Ctx.getPredBlockCostDivisor(BB: UI->getParent()); |
| 3564 | Cost += Ctx.TTI.getCFInstrCost(Opcode: Instruction::Br, CostKind: Ctx.CostKind); |
| 3565 | |
| 3566 | auto *VecI1Ty = VectorType::get( |
| 3567 | ElementType: IntegerType::getInt1Ty(C&: Ctx.L->getHeader()->getContext()), EC: VF); |
| 3568 | Cost += Ctx.TTI.getScalarizationOverhead( |
| 3569 | Ty: VecI1Ty, DemandedElts: APInt::getAllOnes(numBits: VF.getFixedValue()), |
| 3570 | /*Insert=*/false, /*Extract=*/true, CostKind: Ctx.CostKind); |
| 3571 | |
| 3572 | if (Ctx.useEmulatedMaskMemRefHack(R: this, VF)) { |
| 3573 | // Artificially setting to a high enough value to practically disable |
| 3574 | // vectorization with such operations. |
| 3575 | return 3000000; |
| 3576 | } |
| 3577 | } |
| 3578 | return Cost; |
| 3579 | } |
| 3580 | case Instruction::SExt: |
| 3581 | case Instruction::ZExt: |
| 3582 | case Instruction::FPToUI: |
| 3583 | case Instruction::FPToSI: |
| 3584 | case Instruction::FPExt: |
| 3585 | case Instruction::PtrToInt: |
| 3586 | case Instruction::PtrToAddr: |
| 3587 | case Instruction::IntToPtr: |
| 3588 | case Instruction::SIToFP: |
| 3589 | case Instruction::UIToFP: |
| 3590 | case Instruction::Trunc: |
| 3591 | case Instruction::FPTrunc: |
| 3592 | case Instruction::AddrSpaceCast: { |
| 3593 | return getCostForRecipeWithOpcode(Opcode: getOpcode(), VF: ElementCount::getFixed(MinVal: 1), |
| 3594 | Ctx) * |
| 3595 | (isSingleScalar() ? 1 : VF.getFixedValue()); |
| 3596 | } |
| 3597 | case Instruction::ExtractValue: |
| 3598 | case Instruction::InsertValue: |
| 3599 | return Ctx.TTI.getInsertExtractValueCost(Opcode: getOpcode(), CostKind: Ctx.CostKind); |
| 3600 | } |
| 3601 | |
| 3602 | return Ctx.getLegacyCost(UI, VF); |
| 3603 | } |
| 3604 | |
| 3605 | #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) |
| 3606 | void VPReplicateRecipe::printRecipe(raw_ostream &O, const Twine &Indent, |
| 3607 | VPSlotTracker &SlotTracker) const { |
| 3608 | O << Indent << (IsSingleScalar ? "CLONE " : "REPLICATE " ); |
| 3609 | |
| 3610 | if (!getUnderlyingInstr()->getType()->isVoidTy()) { |
| 3611 | printAsOperand(O, SlotTracker); |
| 3612 | O << " = " ; |
| 3613 | } |
| 3614 | if (auto *CB = dyn_cast<CallBase>(getUnderlyingInstr())) { |
| 3615 | O << "call" ; |
| 3616 | printFlags(O); |
| 3617 | O << "@" << CB->getCalledFunction()->getName() << "(" ; |
| 3618 | interleaveComma(make_range(op_begin(), op_begin() + (getNumOperands() - 1)), |
| 3619 | O, [&O, &SlotTracker](VPValue *Op) { |
| 3620 | Op->printAsOperand(O, SlotTracker); |
| 3621 | }); |
| 3622 | O << ")" ; |
| 3623 | } else { |
| 3624 | O << Instruction::getOpcodeName(getUnderlyingInstr()->getOpcode()); |
| 3625 | printFlags(O); |
| 3626 | printOperands(O, SlotTracker); |
| 3627 | } |
| 3628 | |
| 3629 | if (shouldPack()) |
| 3630 | O << " (S->V)" ; |
| 3631 | } |
| 3632 | #endif |
| 3633 | |
void VPBranchOnMaskRecipe::execute(VPTransformState &State) {
  assert(State.Lane && "Branch on Mask works only on single instance.");

  // Extract the mask bit for the lane currently being generated; it becomes
  // the branch condition.
  VPValue *BlockInMask = getOperand(N: 0);
  Value *ConditionBit = State.get(Def: BlockInMask, Lane: *State.Lane);

  // Replace the temporary unreachable terminator with a new conditional branch,
  // whose two destinations will be set later when they are created.
  auto *CurrentTerminator = State.CFG.PrevBB->getTerminator();
  assert(isa<UnreachableInst>(CurrentTerminator) &&
         "Expected to replace unreachable terminator with conditional branch.");
  // CreateCondBr requires a non-null true successor, so pass PrevBB as a
  // placeholder and immediately clear it; both successors are patched in once
  // the target blocks exist.
  auto CondBr =
      State.Builder.CreateCondBr(Cond: ConditionBit, True: State.CFG.PrevBB, False: nullptr);
  CondBr->setSuccessor(idx: 0, NewSucc: nullptr);
  CurrentTerminator->eraseFromParent();
}
| 3650 | |
InstructionCost VPBranchOnMaskRecipe::computeCost(ElementCount VF,
                                                  VPCostContext &Ctx) const {
  // Cost of the conditional branch guarding a replicate region.
  // The legacy cost model doesn't assign costs to branches for individual
  // replicate regions. Match the current behavior in the VPlan cost model for
  // now.
  return 0;
}
| 3658 | |
void VPPredInstPHIRecipe::execute(VPTransformState &State) {
  // Create, for the current lane, a phi merging the value computed in the
  // predicated block with the value arriving from the predicating block.
  assert(State.Lane && "Predicated instruction PHI works per instance.");
  Instruction *ScalarPredInst =
      cast<Instruction>(Val: State.get(Def: getOperand(N: 0), Lane: *State.Lane));
  BasicBlock *PredicatedBB = ScalarPredInst->getParent();
  BasicBlock *PredicatingBB = PredicatedBB->getSinglePredecessor();
  assert(PredicatingBB && "Predicated block has no single predecessor.");
  assert(isa<VPReplicateRecipe>(getOperand(0)) &&
         "operand must be VPReplicateRecipe");

  // By current pack/unpack logic we need to generate only a single phi node: if
  // a vector value for the predicated instruction exists at this point it means
  // the instruction has vector users only, and a phi for the vector value is
  // needed. In this case the recipe of the predicated instruction is marked to
  // also do that packing, thereby "hoisting" the insert-element sequence.
  // Otherwise, a phi node for the scalar value is needed.
  if (State.hasVectorValue(Def: getOperand(N: 0))) {
    auto *VecI = cast<Instruction>(Val: State.get(Def: getOperand(N: 0)));
    assert((isa<InsertElementInst, InsertValueInst>(VecI)) &&
           "Packed operands must generate an insertelement or insertvalue");

    // If VectorI is a struct, it will be a sequence like:
    //   %1 = insertvalue %unmodified, %x, 0
    //   %2 = insertvalue %1, %y, 1
    //   %VectorI = insertvalue %2, %z, 2
    // To get the unmodified vector we need to look through the chain.
    if (auto *StructTy = dyn_cast<StructType>(Val: VecI->getType()))
      for (unsigned I = 0; I < StructTy->getNumContainedTypes() - 1; I++)
        VecI = cast<InsertValueInst>(Val: VecI->getOperand(i: 0));

    // Merge the unmodified vector (incoming from the predicating block) with
    // the vector holding this lane's inserted element (from the predicated
    // block).
    PHINode *VPhi = State.Builder.CreatePHI(Ty: VecI->getType(), NumReservedValues: 2);
    VPhi->addIncoming(V: VecI->getOperand(i: 0), BB: PredicatingBB); // Unmodified vector.
    VPhi->addIncoming(V: VecI, BB: PredicatedBB); // New vector with inserted element.
    if (State.hasVectorValue(Def: this))
      State.reset(Def: this, V: VPhi);
    else
      State.set(Def: this, V: VPhi);
    // NOTE: Currently we need to update the value of the operand, so the next
    // predicated iteration inserts its generated value in the correct vector.
    State.reset(Def: getOperand(N: 0), V: VPhi);
  } else {
    if (vputils::onlyFirstLaneUsed(Def: this) && !State.Lane->isFirstLane())
      return;

    // Scalar case: poison flows in when the lane was masked off, the computed
    // scalar when the predicated block executed.
    Type *PredInstType = State.TypeAnalysis.inferScalarType(V: getOperand(N: 0));
    PHINode *Phi = State.Builder.CreatePHI(Ty: PredInstType, NumReservedValues: 2);
    Phi->addIncoming(V: PoisonValue::get(T: ScalarPredInst->getType()),
                     BB: PredicatingBB);
    Phi->addIncoming(V: ScalarPredInst, BB: PredicatedBB);
    if (State.hasScalarValue(Def: this, Lane: *State.Lane))
      State.reset(Def: this, V: Phi, Lane: *State.Lane);
    else
      State.set(Def: this, V: Phi, Lane: *State.Lane);
    // NOTE: Currently we need to update the value of the operand, so the next
    // predicated iteration inserts its generated value in the correct vector.
    State.reset(Def: getOperand(N: 0), V: Phi, Lane: *State.Lane);
  }
}
| 3717 | |
| 3718 | #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) |
void VPPredInstPHIRecipe::printRecipe(raw_ostream &O, const Twine &Indent,
                                      VPSlotTracker &SlotTracker) const {
  // Debug printer: "<result> = PHI-PREDICATED-INSTRUCTION <operands>".
  O << Indent << "PHI-PREDICATED-INSTRUCTION ";
  printAsOperand(O, SlotTracker);
  O << " = ";
  printOperands(O, SlotTracker);
}
| 3726 | #endif |
| 3727 | |
InstructionCost VPWidenMemoryRecipe::computeCost(ElementCount VF,
                                                 VPCostContext &Ctx) const {
  // Cost of a widened memory access: a gather/scatter when the access is not
  // consecutive, otherwise a (possibly masked) wide load/store.
  Type *Ty = toVectorTy(Scalar: getLoadStoreType(I: &Ingredient), EC: VF);
  unsigned AS = cast<PointerType>(Val: Ctx.Types.inferScalarType(V: getAddr()))
                    ->getAddressSpace();
  unsigned Opcode = isa<VPWidenLoadRecipe, VPWidenLoadEVLRecipe>(Val: this)
                        ? Instruction::Load
                        : Instruction::Store;

  if (!Consecutive) {
    // TODO: Using the original IR may not be accurate.
    // Currently, ARM will use the underlying IR to calculate gather/scatter
    // instruction cost.
    assert(!Reverse &&
           "Inconsecutive memory access should not have the order.");

    const Value *Ptr = getLoadStorePointerOperand(V: &Ingredient);
    Type *PtrTy = Ptr->getType();

    // If the address value is uniform across all lanes, then the address can be
    // calculated with scalar type and broadcast.
    if (!vputils::isSingleScalar(VPV: getAddr()))
      PtrTy = toVectorTy(Scalar: PtrTy, EC: VF);

    // Select the intrinsic matching load vs. store and plain vs. EVL
    // (vector-predicated) recipe.
    unsigned IID = isa<VPWidenLoadRecipe>(Val: this) ? Intrinsic::masked_gather
               : isa<VPWidenStoreRecipe>(Val: this) ? Intrinsic::masked_scatter
               : isa<VPWidenLoadEVLRecipe>(Val: this) ? Intrinsic::vp_gather
                                         : Intrinsic::vp_scatter;
    return Ctx.TTI.getAddressComputationCost(PtrTy, SE: nullptr, Ptr: nullptr,
                                             CostKind: Ctx.CostKind) +
           Ctx.TTI.getMemIntrinsicInstrCost(
               MICA: MemIntrinsicCostAttributes(IID, Ty, Ptr, IsMasked, Alignment,
                                          &Ingredient),
               CostKind: Ctx.CostKind);
  }

  InstructionCost Cost = 0;
  if (IsMasked) {
    // Masked consecutive access: cost the masked load/store intrinsic.
    unsigned IID = isa<VPWidenLoadRecipe>(Val: this) ? Intrinsic::masked_load
                                             : Intrinsic::masked_store;
    Cost += Ctx.TTI.getMemIntrinsicInstrCost(
        MICA: MemIntrinsicCostAttributes(IID, Ty, Alignment, AS), CostKind: Ctx.CostKind);
  } else {
    // Unmasked access: query operand info (value operand for stores, operand 0
    // for loads) so the target can refine the memory-op cost.
    TTI::OperandValueInfo OpInfo = Ctx.getOperandInfo(
        V: isa<VPWidenLoadRecipe, VPWidenLoadEVLRecipe>(Val: this) ? getOperand(N: 0)
                                                      : getOperand(N: 1));
    Cost += Ctx.TTI.getMemoryOpCost(Opcode, Src: Ty, Alignment, AddressSpace: AS, CostKind: Ctx.CostKind,
                                    OpdInfo: OpInfo, I: &Ingredient);
  }
  return Cost;
}
| 3779 | |
void VPWidenLoadRecipe::execute(VPTransformState &State) {
  // Emit the widened load: a masked gather for non-consecutive accesses,
  // otherwise a (possibly masked) wide load.
  Type *ScalarDataTy = getLoadStoreType(I: &Ingredient);
  auto *DataTy = VectorType::get(ElementType: ScalarDataTy, EC: State.VF);
  bool CreateGather = !isConsecutive();

  auto &Builder = State.Builder;
  Value *Mask = nullptr;
  if (auto *VPMask = getMask()) {
    // Mask reversal is only needed for non-all-one (null) masks, as reverse
    // of a null all-one mask is a null mask.
    Mask = State.get(Def: VPMask);
    if (isReverse())
      Mask = Builder.CreateVectorReverse(V: Mask, Name: "reverse");
  }

  // Consecutive accesses use a scalar base pointer; gathers need a vector of
  // pointers, one per lane.
  Value *Addr = State.get(Def: getAddr(), /*IsScalar*/ !CreateGather);
  Value *NewLI;
  if (CreateGather) {
    NewLI = Builder.CreateMaskedGather(Ty: DataTy, Ptrs: Addr, Alignment, Mask, PassThru: nullptr,
                                       Name: "wide.masked.gather");
  } else if (Mask) {
    NewLI =
        Builder.CreateMaskedLoad(Ty: DataTy, Ptr: Addr, Alignment, Mask,
                                 PassThru: PoisonValue::get(T: DataTy), Name: "wide.masked.load");
  } else {
    NewLI = Builder.CreateAlignedLoad(Ty: DataTy, Ptr: Addr, Align: Alignment, Name: "wide.load");
  }
  // Transfer recipe metadata onto the generated instruction and record the
  // result for users of this recipe.
  applyMetadata(I&: *cast<Instruction>(Val: NewLI));
  State.set(Def: this, V: NewLI);
}
| 3810 | |
| 3811 | #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) |
void VPWidenLoadRecipe::printRecipe(raw_ostream &O, const Twine &Indent,
                                    VPSlotTracker &SlotTracker) const {
  // Debug printer: "<result> = WIDEN load <operands>".
  O << Indent << "WIDEN ";
  printAsOperand(O, SlotTracker);
  O << " = load ";
  printOperands(O, SlotTracker);
}
| 3819 | #endif |
| 3820 | |
| 3821 | /// Use all-true mask for reverse rather than actual mask, as it avoids a |
| 3822 | /// dependence w/o affecting the result. |
| 3823 | static Instruction *createReverseEVL(IRBuilderBase &Builder, Value *Operand, |
| 3824 | Value *EVL, const Twine &Name) { |
| 3825 | VectorType *ValTy = cast<VectorType>(Val: Operand->getType()); |
| 3826 | Value *AllTrueMask = |
| 3827 | Builder.CreateVectorSplat(EC: ValTy->getElementCount(), V: Builder.getTrue()); |
| 3828 | return Builder.CreateIntrinsic(RetTy: ValTy, ID: Intrinsic::experimental_vp_reverse, |
| 3829 | Args: {Operand, AllTrueMask, EVL}, FMFSource: nullptr, Name); |
| 3830 | } |
| 3831 | |
| 3832 | void VPWidenLoadEVLRecipe::execute(VPTransformState &State) { |
| 3833 | Type *ScalarDataTy = getLoadStoreType(I: &Ingredient); |
| 3834 | auto *DataTy = VectorType::get(ElementType: ScalarDataTy, EC: State.VF); |
| 3835 | bool CreateGather = !isConsecutive(); |
| 3836 | |
| 3837 | auto &Builder = State.Builder; |
| 3838 | CallInst *NewLI; |
| 3839 | Value *EVL = State.get(Def: getEVL(), Lane: VPLane(0)); |
| 3840 | Value *Addr = State.get(Def: getAddr(), IsScalar: !CreateGather); |
| 3841 | Value *Mask = nullptr; |
| 3842 | if (VPValue *VPMask = getMask()) { |
| 3843 | Mask = State.get(Def: VPMask); |
| 3844 | if (isReverse()) |
| 3845 | Mask = createReverseEVL(Builder, Operand: Mask, EVL, Name: "vp.reverse.mask" ); |
| 3846 | } else { |
| 3847 | Mask = Builder.CreateVectorSplat(EC: State.VF, V: Builder.getTrue()); |
| 3848 | } |
| 3849 | |
| 3850 | if (CreateGather) { |
| 3851 | NewLI = |
| 3852 | Builder.CreateIntrinsic(RetTy: DataTy, ID: Intrinsic::vp_gather, Args: {Addr, Mask, EVL}, |
| 3853 | FMFSource: nullptr, Name: "wide.masked.gather" ); |
| 3854 | } else { |
| 3855 | NewLI = Builder.CreateIntrinsic(RetTy: DataTy, ID: Intrinsic::vp_load, |
| 3856 | Args: {Addr, Mask, EVL}, FMFSource: nullptr, Name: "vp.op.load" ); |
| 3857 | } |
| 3858 | NewLI->addParamAttr( |
| 3859 | ArgNo: 0, Attr: Attribute::getWithAlignment(Context&: NewLI->getContext(), Alignment)); |
| 3860 | applyMetadata(I&: *NewLI); |
| 3861 | Instruction *Res = NewLI; |
| 3862 | State.set(Def: this, V: Res); |
| 3863 | } |
| 3864 | |
InstructionCost VPWidenLoadEVLRecipe::computeCost(ElementCount VF,
                                                  VPCostContext &Ctx) const {
  // Non-consecutive or masked EVL loads are costed like regular widened
  // memory recipes.
  if (!Consecutive || IsMasked)
    return VPWidenMemoryRecipe::computeCost(VF, Ctx);

  // We need to use the getMemIntrinsicInstrCost() instead of getMemoryOpCost()
  // here because the EVL recipes using EVL to replace the tail mask. But in the
  // legacy model, it will always calculate the cost of mask.
  // TODO: Using getMemoryOpCost() instead of getMemIntrinsicInstrCost when we
  // don't need to compare to the legacy cost model.
  Type *Ty = toVectorTy(Scalar: getLoadStoreType(I: &Ingredient), EC: VF);
  unsigned AS = cast<PointerType>(Val: Ctx.Types.inferScalarType(V: getAddr()))
                    ->getAddressSpace();
  return Ctx.TTI.getMemIntrinsicInstrCost(
      MICA: MemIntrinsicCostAttributes(Intrinsic::vp_load, Ty, Alignment, AS),
      CostKind: Ctx.CostKind);
}
| 3882 | |
| 3883 | #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) |
void VPWidenLoadEVLRecipe::printRecipe(raw_ostream &O, const Twine &Indent,
                                       VPSlotTracker &SlotTracker) const {
  // Debug printer: "<result> = WIDEN vp.load <operands>".
  O << Indent << "WIDEN ";
  printAsOperand(O, SlotTracker);
  O << " = vp.load ";
  printOperands(O, SlotTracker);
}
| 3891 | #endif |
| 3892 | |
void VPWidenStoreRecipe::execute(VPTransformState &State) {
  // Emit the widened store: a masked scatter for non-consecutive accesses,
  // otherwise a (possibly masked) wide store.
  VPValue *StoredVPValue = getStoredValue();
  bool CreateScatter = !isConsecutive();

  auto &Builder = State.Builder;

  Value *Mask = nullptr;
  if (auto *VPMask = getMask()) {
    // Mask reversal is only needed for non-all-one (null) masks, as reverse
    // of a null all-one mask is a null mask.
    Mask = State.get(Def: VPMask);
    if (isReverse())
      Mask = Builder.CreateVectorReverse(V: Mask, Name: "reverse");
  }

  Value *StoredVal = State.get(Def: StoredVPValue);
  // Consecutive accesses use a scalar base pointer; scatters need a vector of
  // pointers, one per lane.
  Value *Addr = State.get(Def: getAddr(), /*IsScalar*/ !CreateScatter);
  Instruction *NewSI = nullptr;
  if (CreateScatter)
    NewSI = Builder.CreateMaskedScatter(Val: StoredVal, Ptrs: Addr, Alignment, Mask);
  else if (Mask)
    NewSI = Builder.CreateMaskedStore(Val: StoredVal, Ptr: Addr, Alignment, Mask);
  else
    NewSI = Builder.CreateAlignedStore(Val: StoredVal, Ptr: Addr, Align: Alignment);
  // Stores produce no value; only transfer the recipe metadata.
  applyMetadata(I&: *NewSI);
}
| 3919 | |
| 3920 | #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) |
void VPWidenStoreRecipe::printRecipe(raw_ostream &O, const Twine &Indent,
                                     VPSlotTracker &SlotTracker) const {
  // Debug printer: "WIDEN store <operands>" (stores define no result).
  O << Indent << "WIDEN store ";
  printOperands(O, SlotTracker);
}
| 3926 | #endif |
| 3927 | |
void VPWidenStoreEVLRecipe::execute(VPTransformState &State) {
  // Emit an explicit-vector-length store: llvm.vp.store for consecutive
  // accesses, llvm.vp.scatter otherwise. The EVL operand bounds the number of
  // active lanes in place of a tail mask.
  VPValue *StoredValue = getStoredValue();
  bool CreateScatter = !isConsecutive();

  auto &Builder = State.Builder;

  CallInst *NewSI = nullptr;
  Value *StoredVal = State.get(Def: StoredValue);
  Value *EVL = State.get(Def: getEVL(), Lane: VPLane(0));
  Value *Mask = nullptr;
  if (VPValue *VPMask = getMask()) {
    Mask = State.get(Def: VPMask);
    if (isReverse())
      Mask = createReverseEVL(Builder, Operand: Mask, EVL, Name: "vp.reverse.mask");
  } else {
    // VP intrinsics always take a mask; synthesize an all-true one.
    Mask = Builder.CreateVectorSplat(EC: State.VF, V: Builder.getTrue());
  }
  Value *Addr = State.get(Def: getAddr(), IsScalar: !CreateScatter);
  if (CreateScatter) {
    NewSI = Builder.CreateIntrinsic(RetTy: Type::getVoidTy(C&: EVL->getContext()),
                                    ID: Intrinsic::vp_scatter,
                                    Args: {StoredVal, Addr, Mask, EVL});
  } else {
    NewSI = Builder.CreateIntrinsic(RetTy: Type::getVoidTy(C&: EVL->getContext()),
                                    ID: Intrinsic::vp_store,
                                    Args: {StoredVal, Addr, Mask, EVL});
  }
  // The pointer is parameter 1 of both intrinsics; attach the alignment
  // attribute there.
  NewSI->addParamAttr(
      ArgNo: 1, Attr: Attribute::getWithAlignment(Context&: NewSI->getContext(), Alignment));
  applyMetadata(I&: *NewSI);
}
| 3959 | |
InstructionCost VPWidenStoreEVLRecipe::computeCost(ElementCount VF,
                                                   VPCostContext &Ctx) const {
  // Non-consecutive or masked EVL stores are costed like regular widened
  // memory recipes.
  if (!Consecutive || IsMasked)
    return VPWidenMemoryRecipe::computeCost(VF, Ctx);

  // We need to use the getMemIntrinsicInstrCost() instead of getMemoryOpCost()
  // here because the EVL recipes using EVL to replace the tail mask. But in the
  // legacy model, it will always calculate the cost of mask.
  // TODO: Using getMemoryOpCost() instead of getMemIntrinsicInstrCost when we
  // don't need to compare to the legacy cost model.
  Type *Ty = toVectorTy(Scalar: getLoadStoreType(I: &Ingredient), EC: VF);
  unsigned AS = cast<PointerType>(Val: Ctx.Types.inferScalarType(V: getAddr()))
                    ->getAddressSpace();
  return Ctx.TTI.getMemIntrinsicInstrCost(
      MICA: MemIntrinsicCostAttributes(Intrinsic::vp_store, Ty, Alignment, AS),
      CostKind: Ctx.CostKind);
}
| 3977 | |
| 3978 | #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) |
void VPWidenStoreEVLRecipe::printRecipe(raw_ostream &O, const Twine &Indent,
                                        VPSlotTracker &SlotTracker) const {
  // Debug printer: "WIDEN vp.store <operands>" (stores define no result).
  O << Indent << "WIDEN vp.store ";
  printOperands(O, SlotTracker);
}
| 3984 | #endif |
| 3985 | |
| 3986 | static Value *createBitOrPointerCast(IRBuilderBase &Builder, Value *V, |
| 3987 | VectorType *DstVTy, const DataLayout &DL) { |
| 3988 | // Verify that V is a vector type with same number of elements as DstVTy. |
| 3989 | auto VF = DstVTy->getElementCount(); |
| 3990 | auto *SrcVecTy = cast<VectorType>(Val: V->getType()); |
| 3991 | assert(VF == SrcVecTy->getElementCount() && "Vector dimensions do not match" ); |
| 3992 | Type *SrcElemTy = SrcVecTy->getElementType(); |
| 3993 | Type *DstElemTy = DstVTy->getElementType(); |
| 3994 | assert((DL.getTypeSizeInBits(SrcElemTy) == DL.getTypeSizeInBits(DstElemTy)) && |
| 3995 | "Vector elements must have same size" ); |
| 3996 | |
| 3997 | // Do a direct cast if element types are castable. |
| 3998 | if (CastInst::isBitOrNoopPointerCastable(SrcTy: SrcElemTy, DestTy: DstElemTy, DL)) { |
| 3999 | return Builder.CreateBitOrPointerCast(V, DestTy: DstVTy); |
| 4000 | } |
| 4001 | // V cannot be directly casted to desired vector type. |
| 4002 | // May happen when V is a floating point vector but DstVTy is a vector of |
| 4003 | // pointers or vice-versa. Handle this using a two-step bitcast using an |
| 4004 | // intermediate Integer type for the bitcast i.e. Ptr <-> Int <-> Float. |
| 4005 | assert((DstElemTy->isPointerTy() != SrcElemTy->isPointerTy()) && |
| 4006 | "Only one type should be a pointer type" ); |
| 4007 | assert((DstElemTy->isFloatingPointTy() != SrcElemTy->isFloatingPointTy()) && |
| 4008 | "Only one type should be a floating point type" ); |
| 4009 | Type *IntTy = |
| 4010 | IntegerType::getIntNTy(C&: V->getContext(), N: DL.getTypeSizeInBits(Ty: SrcElemTy)); |
| 4011 | auto *VecIntTy = VectorType::get(ElementType: IntTy, EC: VF); |
| 4012 | Value *CastVal = Builder.CreateBitOrPointerCast(V, DestTy: VecIntTy); |
| 4013 | return Builder.CreateBitOrPointerCast(V: CastVal, DestTy: DstVTy); |
| 4014 | } |
| 4015 | |
| 4016 | /// Return a vector containing interleaved elements from multiple |
| 4017 | /// smaller input vectors. |
| 4018 | static Value *interleaveVectors(IRBuilderBase &Builder, ArrayRef<Value *> Vals, |
| 4019 | const Twine &Name) { |
| 4020 | unsigned Factor = Vals.size(); |
| 4021 | assert(Factor > 1 && "Tried to interleave invalid number of vectors" ); |
| 4022 | |
| 4023 | VectorType *VecTy = cast<VectorType>(Val: Vals[0]->getType()); |
| 4024 | #ifndef NDEBUG |
| 4025 | for (Value *Val : Vals) |
| 4026 | assert(Val->getType() == VecTy && "Tried to interleave mismatched types" ); |
| 4027 | #endif |
| 4028 | |
| 4029 | // Scalable vectors cannot use arbitrary shufflevectors (only splats), so |
| 4030 | // must use intrinsics to interleave. |
| 4031 | if (VecTy->isScalableTy()) { |
| 4032 | assert(Factor <= 8 && "Unsupported interleave factor for scalable vectors" ); |
| 4033 | return Builder.CreateVectorInterleave(Ops: Vals, Name); |
| 4034 | } |
| 4035 | |
| 4036 | // Fixed length. Start by concatenating all vectors into a wide vector. |
| 4037 | Value *WideVec = concatenateVectors(Builder, Vecs: Vals); |
| 4038 | |
| 4039 | // Interleave the elements into the wide vector. |
| 4040 | const unsigned NumElts = VecTy->getElementCount().getFixedValue(); |
| 4041 | return Builder.CreateShuffleVector( |
| 4042 | V: WideVec, Mask: createInterleaveMask(VF: NumElts, NumVecs: Factor), Name); |
| 4043 | } |
| 4044 | |
| 4045 | // Try to vectorize the interleave group that \p Instr belongs to. |
| 4046 | // |
| 4047 | // E.g. Translate following interleaved load group (factor = 3): |
| 4048 | // for (i = 0; i < N; i+=3) { |
| 4049 | // R = Pic[i]; // Member of index 0 |
| 4050 | // G = Pic[i+1]; // Member of index 1 |
| 4051 | // B = Pic[i+2]; // Member of index 2 |
| 4052 | // ... // do something to R, G, B |
| 4053 | // } |
| 4054 | // To: |
| 4055 | // %wide.vec = load <12 x i32> ; Read 4 tuples of R,G,B |
| 4056 | // %R.vec = shuffle %wide.vec, poison, <0, 3, 6, 9> ; R elements |
| 4057 | // %G.vec = shuffle %wide.vec, poison, <1, 4, 7, 10> ; G elements |
| 4058 | // %B.vec = shuffle %wide.vec, poison, <2, 5, 8, 11> ; B elements |
| 4059 | // |
| 4060 | // Or translate following interleaved store group (factor = 3): |
| 4061 | // for (i = 0; i < N; i+=3) { |
| 4062 | // ... do something to R, G, B |
| 4063 | // Pic[i] = R; // Member of index 0 |
| 4064 | // Pic[i+1] = G; // Member of index 1 |
| 4065 | // Pic[i+2] = B; // Member of index 2 |
| 4066 | // } |
| 4067 | // To: |
| 4068 | // %R_G.vec = shuffle %R.vec, %G.vec, <0, 1, 2, ..., 7> |
| 4069 | // %B_U.vec = shuffle %B.vec, poison, <0, 1, 2, 3, u, u, u, u> |
| 4070 | // %interleaved.vec = shuffle %R_G.vec, %B_U.vec, |
| 4071 | // <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11> ; Interleave R,G,B elements |
| 4072 | // store <12 x i32> %interleaved.vec ; Write 4 tuples of R,G,B |
| 4073 | void VPInterleaveRecipe::execute(VPTransformState &State) { |
| 4074 | assert(!State.Lane && "Interleave group being replicated." ); |
| 4075 | assert((!needsMaskForGaps() || !State.VF.isScalable()) && |
| 4076 | "Masking gaps for scalable vectors is not yet supported." ); |
| 4077 | const InterleaveGroup<Instruction> *Group = getInterleaveGroup(); |
| 4078 | Instruction *Instr = Group->getInsertPos(); |
| 4079 | |
| 4080 | // Prepare for the vector type of the interleaved load/store. |
| 4081 | Type *ScalarTy = getLoadStoreType(I: Instr); |
| 4082 | unsigned InterleaveFactor = Group->getFactor(); |
| 4083 | auto *VecTy = VectorType::get(ElementType: ScalarTy, EC: State.VF * InterleaveFactor); |
| 4084 | |
| 4085 | VPValue *BlockInMask = getMask(); |
| 4086 | VPValue *Addr = getAddr(); |
| 4087 | Value *ResAddr = State.get(Def: Addr, Lane: VPLane(0)); |
| 4088 | |
| 4089 | auto CreateGroupMask = [&BlockInMask, &State, |
| 4090 | &InterleaveFactor](Value *MaskForGaps) -> Value * { |
| 4091 | if (State.VF.isScalable()) { |
| 4092 | assert(!MaskForGaps && "Interleaved groups with gaps are not supported." ); |
| 4093 | assert(InterleaveFactor <= 8 && |
| 4094 | "Unsupported deinterleave factor for scalable vectors" ); |
| 4095 | auto *ResBlockInMask = State.get(Def: BlockInMask); |
| 4096 | SmallVector<Value *> Ops(InterleaveFactor, ResBlockInMask); |
| 4097 | return interleaveVectors(Builder&: State.Builder, Vals: Ops, Name: "interleaved.mask" ); |
| 4098 | } |
| 4099 | |
| 4100 | if (!BlockInMask) |
| 4101 | return MaskForGaps; |
| 4102 | |
| 4103 | Value *ResBlockInMask = State.get(Def: BlockInMask); |
| 4104 | Value *ShuffledMask = State.Builder.CreateShuffleVector( |
| 4105 | V: ResBlockInMask, |
| 4106 | Mask: createReplicatedMask(ReplicationFactor: InterleaveFactor, VF: State.VF.getFixedValue()), |
| 4107 | Name: "interleaved.mask" ); |
| 4108 | return MaskForGaps ? State.Builder.CreateBinOp(Opc: Instruction::And, |
| 4109 | LHS: ShuffledMask, RHS: MaskForGaps) |
| 4110 | : ShuffledMask; |
| 4111 | }; |
| 4112 | |
| 4113 | const DataLayout &DL = Instr->getDataLayout(); |
| 4114 | // Vectorize the interleaved load group. |
| 4115 | if (isa<LoadInst>(Val: Instr)) { |
| 4116 | Value *MaskForGaps = nullptr; |
| 4117 | if (needsMaskForGaps()) { |
| 4118 | MaskForGaps = |
| 4119 | createBitMaskForGaps(Builder&: State.Builder, VF: State.VF.getFixedValue(), Group: *Group); |
| 4120 | assert(MaskForGaps && "Mask for Gaps is required but it is null" ); |
| 4121 | } |
| 4122 | |
| 4123 | Instruction *NewLoad; |
| 4124 | if (BlockInMask || MaskForGaps) { |
| 4125 | Value *GroupMask = CreateGroupMask(MaskForGaps); |
| 4126 | Value *PoisonVec = PoisonValue::get(T: VecTy); |
| 4127 | NewLoad = State.Builder.CreateMaskedLoad(Ty: VecTy, Ptr: ResAddr, |
| 4128 | Alignment: Group->getAlign(), Mask: GroupMask, |
| 4129 | PassThru: PoisonVec, Name: "wide.masked.vec" ); |
| 4130 | } else |
| 4131 | NewLoad = State.Builder.CreateAlignedLoad(Ty: VecTy, Ptr: ResAddr, |
| 4132 | Align: Group->getAlign(), Name: "wide.vec" ); |
| 4133 | applyMetadata(I&: *NewLoad); |
| 4134 | // TODO: Also manage existing metadata using VPIRMetadata. |
| 4135 | Group->addMetadata(NewInst: NewLoad); |
| 4136 | |
| 4137 | ArrayRef<VPRecipeValue *> VPDefs = definedValues(); |
| 4138 | if (VecTy->isScalableTy()) { |
| 4139 | // Scalable vectors cannot use arbitrary shufflevectors (only splats), |
| 4140 | // so must use intrinsics to deinterleave. |
| 4141 | assert(InterleaveFactor <= 8 && |
| 4142 | "Unsupported deinterleave factor for scalable vectors" ); |
| 4143 | NewLoad = State.Builder.CreateIntrinsic( |
| 4144 | ID: Intrinsic::getDeinterleaveIntrinsicID(Factor: InterleaveFactor), |
| 4145 | Types: NewLoad->getType(), Args: NewLoad, |
| 4146 | /*FMFSource=*/nullptr, Name: "strided.vec" ); |
| 4147 | } |
| 4148 | |
| 4149 | auto CreateStridedVector = [&InterleaveFactor, &State, |
| 4150 | &NewLoad](unsigned Index) -> Value * { |
| 4151 | assert(Index < InterleaveFactor && "Illegal group index" ); |
| 4152 | if (State.VF.isScalable()) |
| 4153 | return State.Builder.CreateExtractValue(Agg: NewLoad, Idxs: Index); |
| 4154 | |
| 4155 | // For fixed length VF, use shuffle to extract the sub-vectors from the |
| 4156 | // wide load. |
| 4157 | auto StrideMask = |
| 4158 | createStrideMask(Start: Index, Stride: InterleaveFactor, VF: State.VF.getFixedValue()); |
| 4159 | return State.Builder.CreateShuffleVector(V: NewLoad, Mask: StrideMask, |
| 4160 | Name: "strided.vec" ); |
| 4161 | }; |
| 4162 | |
| 4163 | for (unsigned I = 0, J = 0; I < InterleaveFactor; ++I) { |
| 4164 | Instruction *Member = Group->getMember(Index: I); |
| 4165 | |
| 4166 | // Skip the gaps in the group. |
| 4167 | if (!Member) |
| 4168 | continue; |
| 4169 | |
| 4170 | Value *StridedVec = CreateStridedVector(I); |
| 4171 | |
| 4172 | // If this member has different type, cast the result type. |
| 4173 | if (Member->getType() != ScalarTy) { |
| 4174 | VectorType *OtherVTy = VectorType::get(ElementType: Member->getType(), EC: State.VF); |
| 4175 | StridedVec = |
| 4176 | createBitOrPointerCast(Builder&: State.Builder, V: StridedVec, DstVTy: OtherVTy, DL); |
| 4177 | } |
| 4178 | |
| 4179 | if (Group->isReverse()) |
| 4180 | StridedVec = State.Builder.CreateVectorReverse(V: StridedVec, Name: "reverse" ); |
| 4181 | |
| 4182 | State.set(Def: VPDefs[J], V: StridedVec); |
| 4183 | ++J; |
| 4184 | } |
| 4185 | return; |
| 4186 | } |
| 4187 | |
| 4188 | // The sub vector type for current instruction. |
| 4189 | auto *SubVT = VectorType::get(ElementType: ScalarTy, EC: State.VF); |
| 4190 | |
| 4191 | // Vectorize the interleaved store group. |
| 4192 | Value *MaskForGaps = |
| 4193 | createBitMaskForGaps(Builder&: State.Builder, VF: State.VF.getKnownMinValue(), Group: *Group); |
| 4194 | assert(((MaskForGaps != nullptr) == needsMaskForGaps()) && |
| 4195 | "Mismatch between NeedsMaskForGaps and MaskForGaps" ); |
| 4196 | ArrayRef<VPValue *> StoredValues = getStoredValues(); |
| 4197 | // Collect the stored vector from each member. |
| 4198 | SmallVector<Value *, 4> StoredVecs; |
| 4199 | unsigned StoredIdx = 0; |
| 4200 | for (unsigned i = 0; i < InterleaveFactor; i++) { |
| 4201 | assert((Group->getMember(i) || MaskForGaps) && |
| 4202 | "Fail to get a member from an interleaved store group" ); |
| 4203 | Instruction *Member = Group->getMember(Index: i); |
| 4204 | |
| 4205 | // Skip the gaps in the group. |
| 4206 | if (!Member) { |
| 4207 | Value *Undef = PoisonValue::get(T: SubVT); |
| 4208 | StoredVecs.push_back(Elt: Undef); |
| 4209 | continue; |
| 4210 | } |
| 4211 | |
| 4212 | Value *StoredVec = State.get(Def: StoredValues[StoredIdx]); |
| 4213 | ++StoredIdx; |
| 4214 | |
| 4215 | if (Group->isReverse()) |
| 4216 | StoredVec = State.Builder.CreateVectorReverse(V: StoredVec, Name: "reverse" ); |
| 4217 | |
| 4218 | // If this member has different type, cast it to a unified type. |
| 4219 | |
| 4220 | if (StoredVec->getType() != SubVT) |
| 4221 | StoredVec = createBitOrPointerCast(Builder&: State.Builder, V: StoredVec, DstVTy: SubVT, DL); |
| 4222 | |
| 4223 | StoredVecs.push_back(Elt: StoredVec); |
| 4224 | } |
| 4225 | |
| 4226 | // Interleave all the smaller vectors into one wider vector. |
| 4227 | Value *IVec = interleaveVectors(Builder&: State.Builder, Vals: StoredVecs, Name: "interleaved.vec" ); |
| 4228 | Instruction *NewStoreInstr; |
| 4229 | if (BlockInMask || MaskForGaps) { |
| 4230 | Value *GroupMask = CreateGroupMask(MaskForGaps); |
| 4231 | NewStoreInstr = State.Builder.CreateMaskedStore( |
| 4232 | Val: IVec, Ptr: ResAddr, Alignment: Group->getAlign(), Mask: GroupMask); |
| 4233 | } else |
| 4234 | NewStoreInstr = |
| 4235 | State.Builder.CreateAlignedStore(Val: IVec, Ptr: ResAddr, Align: Group->getAlign()); |
| 4236 | |
| 4237 | applyMetadata(I&: *NewStoreInstr); |
| 4238 | // TODO: Also manage existing metadata using VPIRMetadata. |
| 4239 | Group->addMetadata(NewInst: NewStoreInstr); |
| 4240 | } |
| 4241 | |
| 4242 | #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) |
| 4243 | void VPInterleaveRecipe::printRecipe(raw_ostream &O, const Twine &Indent, |
| 4244 | VPSlotTracker &SlotTracker) const { |
| 4245 | const InterleaveGroup<Instruction> *IG = getInterleaveGroup(); |
| 4246 | O << Indent << "INTERLEAVE-GROUP with factor " << IG->getFactor() << " at " ; |
| 4247 | IG->getInsertPos()->printAsOperand(O, false); |
| 4248 | O << ", " ; |
| 4249 | getAddr()->printAsOperand(O, SlotTracker); |
| 4250 | VPValue *Mask = getMask(); |
| 4251 | if (Mask) { |
| 4252 | O << ", " ; |
| 4253 | Mask->printAsOperand(O, SlotTracker); |
| 4254 | } |
| 4255 | |
| 4256 | unsigned OpIdx = 0; |
| 4257 | for (unsigned i = 0; i < IG->getFactor(); ++i) { |
| 4258 | if (!IG->getMember(i)) |
| 4259 | continue; |
| 4260 | if (getNumStoreOperands() > 0) { |
| 4261 | O << "\n" << Indent << " store " ; |
| 4262 | getOperand(1 + OpIdx)->printAsOperand(O, SlotTracker); |
| 4263 | O << " to index " << i; |
| 4264 | } else { |
| 4265 | O << "\n" << Indent << " " ; |
| 4266 | getVPValue(OpIdx)->printAsOperand(O, SlotTracker); |
| 4267 | O << " = load from index " << i; |
| 4268 | } |
| 4269 | ++OpIdx; |
| 4270 | } |
| 4271 | } |
| 4272 | #endif |
| 4273 | |
// Generate an EVL-based interleaved load/store group using vp.load/vp.store
// plus [de]interleave intrinsics (scalable vectors only).
void VPInterleaveEVLRecipe::execute(VPTransformState &State) {
  assert(!State.Lane && "Interleave group being replicated." );
  assert(State.VF.isScalable() &&
         "Only support scalable VF for EVL tail-folding." );
  assert(!needsMaskForGaps() &&
         "Masking gaps for scalable vectors is not yet supported." );
  const InterleaveGroup<Instruction> *Group = getInterleaveGroup();
  Instruction *Instr = Group->getInsertPos();

  // Prepare for the vector type of the interleaved load/store.
  Type *ScalarTy = getLoadStoreType(I: Instr);
  unsigned InterleaveFactor = Group->getFactor();
  assert(InterleaveFactor <= 8 &&
         "Unsupported deinterleave/interleave factor for scalable vectors" );
  ElementCount WideVF = State.VF * InterleaveFactor;
  auto *VecTy = VectorType::get(ElementType: ScalarTy, EC: WideVF);

  VPValue *Addr = getAddr();
  Value *ResAddr = State.get(Def: Addr, Lane: VPLane(0));
  Value *EVL = State.get(Def: getEVL(), Lane: VPLane(0));
  // The wide memory operation touches InterleaveFactor elements per lane, so
  // scale the explicit vector length accordingly.
  Value *InterleaveEVL = State.Builder.CreateMul(
      LHS: EVL, RHS: ConstantInt::get(Ty: EVL->getType(), V: InterleaveFactor), Name: "interleave.evl" ,
      /* NUW= */ HasNUW: true, /* NSW= */ HasNSW: true);
  LLVMContext &Ctx = State.Builder.getContext();

  // Mask for the wide access: interleave InterleaveFactor copies of the block
  // mask, or splat true when the group is unmasked.
  Value *GroupMask = nullptr;
  if (VPValue *BlockInMask = getMask()) {
    SmallVector<Value *> Ops(InterleaveFactor, State.get(Def: BlockInMask));
    GroupMask = interleaveVectors(Builder&: State.Builder, Vals: Ops, Name: "interleaved.mask" );
  } else {
    GroupMask =
        State.Builder.CreateVectorSplat(EC: WideVF, V: State.Builder.getTrue());
  }

  // Vectorize the interleaved load group.
  if (isa<LoadInst>(Val: Instr)) {
    CallInst *NewLoad = State.Builder.CreateIntrinsic(
        RetTy: VecTy, ID: Intrinsic::vp_load, Args: {ResAddr, GroupMask, InterleaveEVL}, FMFSource: nullptr,
        Name: "wide.vp.load" );
    // Carry the group's alignment on the pointer argument of the intrinsic.
    NewLoad->addParamAttr(ArgNo: 0,
                          Attr: Attribute::getWithAlignment(Context&: Ctx, Alignment: Group->getAlign()));

    applyMetadata(I&: *NewLoad);
    // TODO: Also manage existing metadata using VPIRMetadata.
    Group->addMetadata(NewInst: NewLoad);

    // Scalable vectors cannot use arbitrary shufflevectors (only splats),
    // so must use intrinsics to deinterleave.
    NewLoad = State.Builder.CreateIntrinsic(
        ID: Intrinsic::getDeinterleaveIntrinsicID(Factor: InterleaveFactor),
        Types: NewLoad->getType(), Args: NewLoad,
        /*FMFSource=*/nullptr, Name: "strided.vec" );

    const DataLayout &DL = Instr->getDataLayout();
    // J only advances for real members, so defined values stay densely
    // numbered even when the group has gaps.
    for (unsigned I = 0, J = 0; I < InterleaveFactor; ++I) {
      Instruction *Member = Group->getMember(Index: I);
      // Skip the gaps in the group.
      if (!Member)
        continue;

      Value *StridedVec = State.Builder.CreateExtractValue(Agg: NewLoad, Idxs: I);
      // If this member has different type, cast the result type.
      if (Member->getType() != ScalarTy) {
        VectorType *OtherVTy = VectorType::get(ElementType: Member->getType(), EC: State.VF);
        StridedVec =
            createBitOrPointerCast(Builder&: State.Builder, V: StridedVec, DstVTy: OtherVTy, DL);
      }

      State.set(Def: getVPValue(I: J), V: StridedVec);
      ++J;
    }
    return;
  } // End for interleaved load.

  // The sub vector type for current instruction.
  auto *SubVT = VectorType::get(ElementType: ScalarTy, EC: State.VF);
  // Vectorize the interleaved store group.
  ArrayRef<VPValue *> StoredValues = getStoredValues();
  // Collect the stored vector from each member.
  SmallVector<Value *, 4> StoredVecs;
  const DataLayout &DL = Instr->getDataLayout();
  for (unsigned I = 0, StoredIdx = 0; I < InterleaveFactor; I++) {
    Instruction *Member = Group->getMember(Index: I);
    // Skip the gaps in the group.
    if (!Member) {
      // Gaps still contribute a (poison) sub-vector so the interleave keeps
      // the factor's shape.
      StoredVecs.push_back(Elt: PoisonValue::get(T: SubVT));
      continue;
    }

    Value *StoredVec = State.get(Def: StoredValues[StoredIdx]);
    // If this member has different type, cast it to a unified type.
    if (StoredVec->getType() != SubVT)
      StoredVec = createBitOrPointerCast(Builder&: State.Builder, V: StoredVec, DstVTy: SubVT, DL);

    StoredVecs.push_back(Elt: StoredVec);
    ++StoredIdx;
  }

  // Interleave all the smaller vectors into one wider vector.
  Value *IVec = interleaveVectors(Builder&: State.Builder, Vals: StoredVecs, Name: "interleaved.vec" );
  CallInst *NewStore =
      State.Builder.CreateIntrinsic(RetTy: Type::getVoidTy(C&: Ctx), ID: Intrinsic::vp_store,
                                    Args: {IVec, ResAddr, GroupMask, InterleaveEVL});
  // Carry the group's alignment on the pointer argument of the intrinsic.
  NewStore->addParamAttr(ArgNo: 1,
                         Attr: Attribute::getWithAlignment(Context&: Ctx, Alignment: Group->getAlign()));

  applyMetadata(I&: *NewStore);
  // TODO: Also manage existing metadata using VPIRMetadata.
  Group->addMetadata(NewInst: NewStore);
}
| 4384 | |
| 4385 | #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) |
| 4386 | void VPInterleaveEVLRecipe::printRecipe(raw_ostream &O, const Twine &Indent, |
| 4387 | VPSlotTracker &SlotTracker) const { |
| 4388 | const InterleaveGroup<Instruction> *IG = getInterleaveGroup(); |
| 4389 | O << Indent << "INTERLEAVE-GROUP with factor " << IG->getFactor() << " at " ; |
| 4390 | IG->getInsertPos()->printAsOperand(O, false); |
| 4391 | O << ", " ; |
| 4392 | getAddr()->printAsOperand(O, SlotTracker); |
| 4393 | O << ", " ; |
| 4394 | getEVL()->printAsOperand(O, SlotTracker); |
| 4395 | if (VPValue *Mask = getMask()) { |
| 4396 | O << ", " ; |
| 4397 | Mask->printAsOperand(O, SlotTracker); |
| 4398 | } |
| 4399 | |
| 4400 | unsigned OpIdx = 0; |
| 4401 | for (unsigned i = 0; i < IG->getFactor(); ++i) { |
| 4402 | if (!IG->getMember(i)) |
| 4403 | continue; |
| 4404 | if (getNumStoreOperands() > 0) { |
| 4405 | O << "\n" << Indent << " vp.store " ; |
| 4406 | getOperand(2 + OpIdx)->printAsOperand(O, SlotTracker); |
| 4407 | O << " to index " << i; |
| 4408 | } else { |
| 4409 | O << "\n" << Indent << " " ; |
| 4410 | getVPValue(OpIdx)->printAsOperand(O, SlotTracker); |
| 4411 | O << " = vp.load from index " << i; |
| 4412 | } |
| 4413 | ++OpIdx; |
| 4414 | } |
| 4415 | } |
| 4416 | #endif |
| 4417 | |
| 4418 | InstructionCost VPInterleaveBase::computeCost(ElementCount VF, |
| 4419 | VPCostContext &Ctx) const { |
| 4420 | Instruction *InsertPos = getInsertPos(); |
| 4421 | // Find the VPValue index of the interleave group. We need to skip gaps. |
| 4422 | unsigned InsertPosIdx = 0; |
| 4423 | for (unsigned Idx = 0; IG->getFactor(); ++Idx) |
| 4424 | if (auto *Member = IG->getMember(Index: Idx)) { |
| 4425 | if (Member == InsertPos) |
| 4426 | break; |
| 4427 | InsertPosIdx++; |
| 4428 | } |
| 4429 | Type *ValTy = Ctx.Types.inferScalarType( |
| 4430 | V: getNumDefinedValues() > 0 ? getVPValue(I: InsertPosIdx) |
| 4431 | : getStoredValues()[InsertPosIdx]); |
| 4432 | auto *VectorTy = cast<VectorType>(Val: toVectorTy(Scalar: ValTy, EC: VF)); |
| 4433 | unsigned AS = cast<PointerType>(Val: Ctx.Types.inferScalarType(V: getAddr())) |
| 4434 | ->getAddressSpace(); |
| 4435 | |
| 4436 | unsigned InterleaveFactor = IG->getFactor(); |
| 4437 | auto *WideVecTy = VectorType::get(ElementType: ValTy, EC: VF * InterleaveFactor); |
| 4438 | |
| 4439 | // Holds the indices of existing members in the interleaved group. |
| 4440 | SmallVector<unsigned, 4> Indices; |
| 4441 | for (unsigned IF = 0; IF < InterleaveFactor; IF++) |
| 4442 | if (IG->getMember(Index: IF)) |
| 4443 | Indices.push_back(Elt: IF); |
| 4444 | |
| 4445 | // Calculate the cost of the whole interleaved group. |
| 4446 | InstructionCost Cost = Ctx.TTI.getInterleavedMemoryOpCost( |
| 4447 | Opcode: InsertPos->getOpcode(), VecTy: WideVecTy, Factor: IG->getFactor(), Indices, |
| 4448 | Alignment: IG->getAlign(), AddressSpace: AS, CostKind: Ctx.CostKind, UseMaskForCond: getMask(), UseMaskForGaps: NeedsMaskForGaps); |
| 4449 | |
| 4450 | if (!IG->isReverse()) |
| 4451 | return Cost; |
| 4452 | |
| 4453 | return Cost + IG->getNumMembers() * |
| 4454 | Ctx.TTI.getShuffleCost(Kind: TargetTransformInfo::SK_Reverse, |
| 4455 | DstTy: VectorTy, SrcTy: VectorTy, Mask: {}, CostKind: Ctx.CostKind, |
| 4456 | Index: 0); |
| 4457 | } |
| 4458 | |
| 4459 | #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) |
// Print as "EMIT <result> = CANONICAL-INDUCTION <operands>".
void VPCanonicalIVPHIRecipe::printRecipe(raw_ostream &O, const Twine &Indent,
                                         VPSlotTracker &SlotTracker) const {
  O << Indent << "EMIT " ;
  printAsOperand(O, SlotTracker);
  O << " = CANONICAL-INDUCTION " ;
  printOperands(O, SlotTracker);
}
| 4467 | #endif |
| 4468 | |
| 4469 | bool VPWidenPointerInductionRecipe::onlyScalarsGenerated(bool IsScalable) { |
| 4470 | return vputils::onlyScalarValuesUsed(Def: this) && |
| 4471 | (!IsScalable || vputils::onlyFirstLaneUsed(Def: this)); |
| 4472 | } |
| 4473 | |
| 4474 | #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) |
| 4475 | void VPWidenPointerInductionRecipe::printRecipe( |
| 4476 | raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const { |
| 4477 | assert((getNumOperands() == 3 || getNumOperands() == 5) && |
| 4478 | "unexpected number of operands" ); |
| 4479 | O << Indent << "EMIT " ; |
| 4480 | printAsOperand(O, SlotTracker); |
| 4481 | O << " = WIDEN-POINTER-INDUCTION " ; |
| 4482 | getStartValue()->printAsOperand(O, SlotTracker); |
| 4483 | O << ", " ; |
| 4484 | getStepValue()->printAsOperand(O, SlotTracker); |
| 4485 | O << ", " ; |
| 4486 | getOperand(2)->printAsOperand(O, SlotTracker); |
| 4487 | if (getNumOperands() == 5) { |
| 4488 | O << ", " ; |
| 4489 | getOperand(3)->printAsOperand(O, SlotTracker); |
| 4490 | O << ", " ; |
| 4491 | getOperand(4)->printAsOperand(O, SlotTracker); |
| 4492 | } |
| 4493 | } |
| 4494 | |
// Print as "EMIT <result> = EXPAND SCEV <expr>", streaming the SCEV
// expression itself rather than VPlan operands.
void VPExpandSCEVRecipe::printRecipe(raw_ostream &O, const Twine &Indent,
                                     VPSlotTracker &SlotTracker) const {
  O << Indent << "EMIT " ;
  printAsOperand(O, SlotTracker);
  O << " = EXPAND SCEV " << *Expr;
}
| 4501 | #endif |
| 4502 | |
// Widen the scalar canonical IV into a vector IV: splat the scalar value
// across all lanes and add a per-lane step offset.
void VPWidenCanonicalIVRecipe::execute(VPTransformState &State) {
  Value *CanonicalIV = State.get(Def: getOperand(N: 0), /*IsScalar*/ true);
  Type *STy = CanonicalIV->getType();
  // Emit into the current IR block, just before its terminator.
  IRBuilder<> Builder(State.CFG.PrevBB->getTerminator());
  ElementCount VF = State.VF;
  // For scalar VF the canonical IV is used directly, no splat needed.
  Value *VStart = VF.isScalar()
                      ? CanonicalIV
                      : Builder.CreateVectorSplat(EC: VF, V: CanonicalIV, Name: "broadcast" );
  // The step is scaled by the unroll part so each unrolled copy covers a
  // distinct range of lanes.
  Value *VStep = createStepForVF(B&: Builder, Ty: STy, VF, Step: getUnrollPart(U: *this));
  if (VF.isVector()) {
    // Per-lane offsets: splat the step and add <0, 1, 2, ...>.
    VStep = Builder.CreateVectorSplat(EC: VF, V: VStep);
    VStep =
        Builder.CreateAdd(LHS: VStep, RHS: Builder.CreateStepVector(DstType: VStep->getType()));
  }
  Value *CanonicalVectorIV = Builder.CreateAdd(LHS: VStart, RHS: VStep, Name: "vec.iv" );
  State.set(Def: this, V: CanonicalVectorIV);
}
| 4520 | |
| 4521 | #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) |
// Print as "EMIT <result> = WIDEN-CANONICAL-INDUCTION <operands>".
void VPWidenCanonicalIVRecipe::printRecipe(raw_ostream &O, const Twine &Indent,
                                           VPSlotTracker &SlotTracker) const {
  O << Indent << "EMIT " ;
  printAsOperand(O, SlotTracker);
  O << " = WIDEN-CANONICAL-INDUCTION " ;
  printOperands(O, SlotTracker);
}
| 4529 | #endif |
| 4530 | |
// Create the header phi for a first-order recurrence, seeded in the
// preheader with the live-in initial value placed in the last vector lane.
void VPFirstOrderRecurrencePHIRecipe::execute(VPTransformState &State) {
  auto &Builder = State.Builder;
  // Create a vector from the initial value.
  auto *VectorInit = getStartValue()->getLiveInIRValue();

  // Scalar VF keeps the recurrence scalar; otherwise widen to VF lanes.
  Type *VecTy = State.VF.isScalar()
                    ? VectorInit->getType()
                    : VectorType::get(ElementType: VectorInit->getType(), EC: State.VF);

  // The start value flows in from the recipe's first CFG predecessor.
  BasicBlock *VectorPH =
      State.CFG.VPBB2IRBB.at(Val: getParent()->getCFGPredecessor(Idx: 0));
  if (State.VF.isVector()) {
    auto *IdxTy = Builder.getInt32Ty();
    auto *One = ConstantInt::get(Ty: IdxTy, V: 1);
    // Insert the init value into the last lane of a poison vector, emitting
    // in the preheader; the guard restores the builder position afterwards.
    IRBuilder<>::InsertPointGuard Guard(Builder);
    Builder.SetInsertPoint(VectorPH->getTerminator());
    // Last lane index is RuntimeVF - 1 (RuntimeVF may be a runtime value for
    // scalable vectors).
    auto *RuntimeVF = getRuntimeVF(B&: Builder, Ty: IdxTy, VF: State.VF);
    auto *LastIdx = Builder.CreateSub(LHS: RuntimeVF, RHS: One);
    VectorInit = Builder.CreateInsertElement(
        Vec: PoisonValue::get(T: VecTy), NewElt: VectorInit, Idx: LastIdx, Name: "vector.recur.init" );
  }

  // Create a phi node for the new recurrence.
  PHINode *Phi = PHINode::Create(Ty: VecTy, NumReservedValues: 2, NameStr: "vector.recur" );
  Phi->insertBefore(InsertPos: State.CFG.PrevBB->getFirstInsertionPt());
  // Only the preheader incoming value is wired up here.
  Phi->addIncoming(V: VectorInit, BB: VectorPH);
  State.set(Def: this, V: Phi);
}
| 4559 | |
| 4560 | InstructionCost |
| 4561 | VPFirstOrderRecurrencePHIRecipe::computeCost(ElementCount VF, |
| 4562 | VPCostContext &Ctx) const { |
| 4563 | if (VF.isScalar()) |
| 4564 | return Ctx.TTI.getCFInstrCost(Opcode: Instruction::PHI, CostKind: Ctx.CostKind); |
| 4565 | |
| 4566 | return 0; |
| 4567 | } |
| 4568 | |
| 4569 | #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) |
// Print as "FIRST-ORDER-RECURRENCE-PHI <result> = phi <operands>".
void VPFirstOrderRecurrencePHIRecipe::printRecipe(
    raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const {
  O << Indent << "FIRST-ORDER-RECURRENCE-PHI " ;
  printAsOperand(O, SlotTracker);
  O << " = phi " ;
  printOperands(O, SlotTracker);
}
| 4577 | #endif |
| 4578 | |
| 4579 | void VPReductionPHIRecipe::execute(VPTransformState &State) { |
| 4580 | // Reductions do not have to start at zero. They can start with |
| 4581 | // any loop invariant values. |
| 4582 | VPValue *StartVPV = getStartValue(); |
| 4583 | |
| 4584 | // In order to support recurrences we need to be able to vectorize Phi nodes. |
| 4585 | // Phi nodes have cycles, so we need to vectorize them in two stages. This is |
| 4586 | // stage #1: We create a new vector PHI node with no incoming edges. We'll use |
| 4587 | // this value when we vectorize all of the instructions that use the PHI. |
| 4588 | BasicBlock *VectorPH = |
| 4589 | State.CFG.VPBB2IRBB.at(Val: getParent()->getCFGPredecessor(Idx: 0)); |
| 4590 | bool ScalarPHI = State.VF.isScalar() || isInLoop(); |
| 4591 | Value *StartV = State.get(Def: StartVPV, IsScalar: ScalarPHI); |
| 4592 | Type *VecTy = StartV->getType(); |
| 4593 | |
| 4594 | BasicBlock * = State.CFG.PrevBB; |
| 4595 | assert(State.CurrentParentLoop->getHeader() == HeaderBB && |
| 4596 | "recipe must be in the vector loop header" ); |
| 4597 | auto *Phi = PHINode::Create(Ty: VecTy, NumReservedValues: 2, NameStr: "vec.phi" ); |
| 4598 | Phi->insertBefore(InsertPos: HeaderBB->getFirstInsertionPt()); |
| 4599 | State.set(Def: this, V: Phi, IsScalar: isInLoop()); |
| 4600 | |
| 4601 | Phi->addIncoming(V: StartV, BB: VectorPH); |
| 4602 | } |
| 4603 | |
| 4604 | #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) |
// Print as "WIDEN-REDUCTION-PHI <result> = phi<flags> <operands>", with a
// trailing note when the recipe's VF is scaled down.
void VPReductionPHIRecipe::printRecipe(raw_ostream &O, const Twine &Indent,
                                       VPSlotTracker &SlotTracker) const {
  O << Indent << "WIDEN-REDUCTION-PHI " ;

  printAsOperand(O, SlotTracker);
  O << " = phi" ;
  // Recipe flags (printFlags emits its own separators) come before operands.
  printFlags(O);
  printOperands(O, SlotTracker);
  if (getVFScaleFactor() > 1)
    O << " (VF scaled by 1/" << getVFScaleFactor() << ")" ;
}
| 4616 | #endif |
| 4617 | |
| 4618 | void VPWidenPHIRecipe::execute(VPTransformState &State) { |
| 4619 | Value *Op0 = State.get(Def: getOperand(N: 0)); |
| 4620 | Type *VecTy = Op0->getType(); |
| 4621 | Instruction *VecPhi = State.Builder.CreatePHI(Ty: VecTy, NumReservedValues: 2, Name); |
| 4622 | State.set(Def: this, V: VecPhi); |
| 4623 | } |
| 4624 | |
// A widened phi lowers to a single IR phi; delegate its cost to TTI.
InstructionCost VPWidenPHIRecipe::computeCost(ElementCount VF,
                                              VPCostContext &Ctx) const {
  return Ctx.TTI.getCFInstrCost(Opcode: Instruction::PHI, CostKind: Ctx.CostKind);
}
| 4629 | |
| 4630 | #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) |
// Print as "WIDEN-PHI <result> = phi <operands>", using the phi-specific
// operand printer (printPhiOperands) rather than the generic one.
void VPWidenPHIRecipe::printRecipe(raw_ostream &O, const Twine &Indent,
                                   VPSlotTracker &SlotTracker) const {
  O << Indent << "WIDEN-PHI " ;

  printAsOperand(O, SlotTracker);
  O << " = phi " ;
  printPhiOperands(O, SlotTracker);
}
| 4639 | #endif |
| 4640 | |
| 4641 | void VPActiveLaneMaskPHIRecipe::execute(VPTransformState &State) { |
| 4642 | BasicBlock *VectorPH = |
| 4643 | State.CFG.VPBB2IRBB.at(Val: getParent()->getCFGPredecessor(Idx: 0)); |
| 4644 | Value *StartMask = State.get(Def: getOperand(N: 0)); |
| 4645 | PHINode *Phi = |
| 4646 | State.Builder.CreatePHI(Ty: StartMask->getType(), NumReservedValues: 2, Name: "active.lane.mask" ); |
| 4647 | Phi->addIncoming(V: StartMask, BB: VectorPH); |
| 4648 | State.set(Def: this, V: Phi); |
| 4649 | } |
| 4650 | |
| 4651 | #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) |
// Print as "ACTIVE-LANE-MASK-PHI <result> = phi <operands>".
void VPActiveLaneMaskPHIRecipe::printRecipe(raw_ostream &O, const Twine &Indent,
                                            VPSlotTracker &SlotTracker) const {
  O << Indent << "ACTIVE-LANE-MASK-PHI " ;

  printAsOperand(O, SlotTracker);
  O << " = phi " ;
  printOperands(O, SlotTracker);
}
| 4660 | #endif |
| 4661 | |
| 4662 | #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) |
// Print as "CURRENT-ITERATION-PHI <result> = phi <operands>".
void VPCurrentIterationPHIRecipe::printRecipe(
    raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const {
  O << Indent << "CURRENT-ITERATION-PHI " ;

  printAsOperand(O, SlotTracker);
  O << " = phi " ;
  printOperands(O, SlotTracker);
}
| 4671 | #endif |
| 4672 | |