| 1 | //===- VPlanRecipes.cpp - Implementations for VPlan recipes ---------------===// |
| 2 | // |
| 3 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
| 4 | // See https://llvm.org/LICENSE.txt for license information. |
| 5 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
| 6 | // |
| 7 | //===----------------------------------------------------------------------===// |
| 8 | /// |
| 9 | /// \file |
| 10 | /// This file contains implementations for different VPlan recipes. |
| 11 | /// |
| 12 | //===----------------------------------------------------------------------===// |
| 13 | |
| 14 | #include "LoopVectorizationPlanner.h" |
| 15 | #include "VPlan.h" |
| 16 | #include "VPlanHelpers.h" |
| 17 | #include "VPlanPatternMatch.h" |
| 18 | #include "VPlanUtils.h" |
| 19 | #include "llvm/ADT/STLExtras.h" |
| 20 | #include "llvm/ADT/SmallVector.h" |
| 21 | #include "llvm/ADT/SmallVectorExtras.h" |
| 22 | #include "llvm/ADT/Twine.h" |
| 23 | #include "llvm/Analysis/AssumptionCache.h" |
| 24 | #include "llvm/Analysis/IVDescriptors.h" |
| 25 | #include "llvm/Analysis/LoopInfo.h" |
| 26 | #include "llvm/Analysis/ScalarEvolutionExpressions.h" |
| 27 | #include "llvm/IR/BasicBlock.h" |
| 28 | #include "llvm/IR/IRBuilder.h" |
| 29 | #include "llvm/IR/Instruction.h" |
| 30 | #include "llvm/IR/Instructions.h" |
| 31 | #include "llvm/IR/Intrinsics.h" |
| 32 | #include "llvm/IR/Type.h" |
| 33 | #include "llvm/IR/Value.h" |
| 34 | #include "llvm/Support/Casting.h" |
| 35 | #include "llvm/Support/CommandLine.h" |
| 36 | #include "llvm/Support/Debug.h" |
| 37 | #include "llvm/Support/raw_ostream.h" |
| 38 | #include "llvm/Transforms/Utils/BasicBlockUtils.h" |
| 39 | #include "llvm/Transforms/Utils/LoopUtils.h" |
| 40 | #include <cassert> |
| 41 | |
| 42 | using namespace llvm; |
| 43 | using namespace llvm::VPlanPatternMatch; |
| 44 | |
| 45 | using VectorParts = SmallVector<Value *, 2>; |
| 46 | |
| 47 | #define LV_NAME "loop-vectorize" |
| 48 | #define DEBUG_TYPE LV_NAME |
| 49 | |
| 50 | #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) |
| 51 | // It is sometimes necessary to disable printing of metadata in tests in order |
| 52 | // to avoid non-deterministic behaviour due to metadata introduced by VPlan |
| 53 | // that wasn't present in the original scalar IR. |
| 54 | static cl::opt<bool> VPlanPrintMetadata( |
| 55 | "vplan-print-metadata" , cl::init(true), cl::Hidden, |
| 56 | cl::desc("Controls the printing of recipe metadata when debugging." )); |
| 57 | #endif |
| 58 | |
| 59 | bool VPRecipeBase::mayWriteToMemory() const { |
| 60 | switch (getVPRecipeID()) { |
| 61 | case VPExpressionSC: |
| 62 | return cast<VPExpressionRecipe>(Val: this)->mayReadOrWriteMemory(); |
| 63 | case VPInstructionSC: { |
| 64 | auto *VPI = cast<VPInstruction>(Val: this); |
| 65 | // Loads read from memory but don't write to memory. |
| 66 | if (VPI->getOpcode() == Instruction::Load) |
| 67 | return false; |
| 68 | return VPI->opcodeMayReadOrWriteFromMemory(); |
| 69 | } |
| 70 | case VPInterleaveEVLSC: |
| 71 | case VPInterleaveSC: |
| 72 | return cast<VPInterleaveBase>(Val: this)->getNumStoreOperands() > 0; |
| 73 | case VPWidenStoreEVLSC: |
| 74 | case VPWidenStoreSC: |
| 75 | return true; |
| 76 | case VPReplicateSC: |
| 77 | return cast<Instruction>(Val: getVPSingleValue()->getUnderlyingValue()) |
| 78 | ->mayWriteToMemory(); |
| 79 | case VPWidenCallSC: |
| 80 | return !cast<VPWidenCallRecipe>(Val: this) |
| 81 | ->getCalledScalarFunction() |
| 82 | ->onlyReadsMemory(); |
| 83 | case VPWidenMemIntrinsicSC: |
| 84 | case VPWidenIntrinsicSC: |
| 85 | return cast<VPWidenIntrinsicRecipe>(Val: this)->mayWriteToMemory(); |
| 86 | case VPActiveLaneMaskPHISC: |
| 87 | case VPCurrentIterationPHISC: |
| 88 | case VPBranchOnMaskSC: |
| 89 | case VPDerivedIVSC: |
| 90 | case VPFirstOrderRecurrencePHISC: |
| 91 | case VPReductionPHISC: |
| 92 | case VPScalarIVStepsSC: |
| 93 | case VPPredInstPHISC: |
| 94 | return false; |
| 95 | case VPBlendSC: |
| 96 | case VPReductionEVLSC: |
| 97 | case VPReductionSC: |
| 98 | case VPVectorPointerSC: |
| 99 | case VPWidenCanonicalIVSC: |
| 100 | case VPWidenCastSC: |
| 101 | case VPWidenGEPSC: |
| 102 | case VPWidenIntOrFpInductionSC: |
| 103 | case VPWidenLoadEVLSC: |
| 104 | case VPWidenLoadSC: |
| 105 | case VPWidenPHISC: |
| 106 | case VPWidenPointerInductionSC: |
| 107 | case VPWidenSC: { |
| 108 | const Instruction *I = |
| 109 | dyn_cast_or_null<Instruction>(Val: getVPSingleValue()->getUnderlyingValue()); |
| 110 | (void)I; |
| 111 | assert((!I || !I->mayWriteToMemory()) && |
| 112 | "underlying instruction may write to memory" ); |
| 113 | return false; |
| 114 | } |
| 115 | default: |
| 116 | return true; |
| 117 | } |
| 118 | } |
| 119 | |
| 120 | bool VPRecipeBase::mayReadFromMemory() const { |
| 121 | switch (getVPRecipeID()) { |
| 122 | case VPExpressionSC: |
| 123 | return cast<VPExpressionRecipe>(Val: this)->mayReadOrWriteMemory(); |
| 124 | case VPInstructionSC: |
| 125 | return cast<VPInstruction>(Val: this)->opcodeMayReadOrWriteFromMemory(); |
| 126 | case VPWidenLoadEVLSC: |
| 127 | case VPWidenLoadSC: |
| 128 | return true; |
| 129 | case VPReplicateSC: |
| 130 | return cast<Instruction>(Val: getVPSingleValue()->getUnderlyingValue()) |
| 131 | ->mayReadFromMemory(); |
| 132 | case VPWidenCallSC: |
| 133 | return !cast<VPWidenCallRecipe>(Val: this) |
| 134 | ->getCalledScalarFunction() |
| 135 | ->onlyWritesMemory(); |
| 136 | case VPWidenMemIntrinsicSC: |
| 137 | case VPWidenIntrinsicSC: |
| 138 | return cast<VPWidenIntrinsicRecipe>(Val: this)->mayReadFromMemory(); |
| 139 | case VPBranchOnMaskSC: |
| 140 | case VPDerivedIVSC: |
| 141 | case VPCurrentIterationPHISC: |
| 142 | case VPFirstOrderRecurrencePHISC: |
| 143 | case VPReductionPHISC: |
| 144 | case VPPredInstPHISC: |
| 145 | case VPScalarIVStepsSC: |
| 146 | case VPWidenStoreEVLSC: |
| 147 | case VPWidenStoreSC: |
| 148 | return false; |
| 149 | case VPBlendSC: |
| 150 | case VPReductionEVLSC: |
| 151 | case VPReductionSC: |
| 152 | case VPVectorPointerSC: |
| 153 | case VPWidenCanonicalIVSC: |
| 154 | case VPWidenCastSC: |
| 155 | case VPWidenGEPSC: |
| 156 | case VPWidenIntOrFpInductionSC: |
| 157 | case VPWidenPHISC: |
| 158 | case VPWidenPointerInductionSC: |
| 159 | case VPWidenSC: { |
| 160 | const Instruction *I = |
| 161 | dyn_cast_or_null<Instruction>(Val: getVPSingleValue()->getUnderlyingValue()); |
| 162 | (void)I; |
| 163 | assert((!I || !I->mayReadFromMemory()) && |
| 164 | "underlying instruction may read from memory" ); |
| 165 | return false; |
| 166 | } |
| 167 | default: |
| 168 | // FIXME: Return false if the recipe represents an interleaved store. |
| 169 | return true; |
| 170 | } |
| 171 | } |
| 172 | |
| 173 | bool VPRecipeBase::mayHaveSideEffects() const { |
| 174 | switch (getVPRecipeID()) { |
| 175 | case VPExpressionSC: |
| 176 | return cast<VPExpressionRecipe>(Val: this)->mayHaveSideEffects(); |
| 177 | case VPActiveLaneMaskPHISC: |
| 178 | case VPDerivedIVSC: |
| 179 | case VPCurrentIterationPHISC: |
| 180 | case VPFirstOrderRecurrencePHISC: |
| 181 | case VPReductionPHISC: |
| 182 | case VPPredInstPHISC: |
| 183 | case VPVectorEndPointerSC: |
| 184 | return false; |
| 185 | case VPInstructionSC: { |
| 186 | auto *VPI = cast<VPInstruction>(Val: this); |
| 187 | return mayWriteToMemory() || |
| 188 | VPI->getOpcode() == VPInstruction::BranchOnCount || |
| 189 | VPI->getOpcode() == VPInstruction::BranchOnCond || |
| 190 | VPI->getOpcode() == VPInstruction::BranchOnTwoConds; |
| 191 | } |
| 192 | case VPWidenCallSC: { |
| 193 | Function *Fn = cast<VPWidenCallRecipe>(Val: this)->getCalledScalarFunction(); |
| 194 | return mayWriteToMemory() || !Fn->doesNotThrow() || !Fn->willReturn(); |
| 195 | } |
| 196 | case VPWidenMemIntrinsicSC: |
| 197 | case VPWidenIntrinsicSC: |
| 198 | return cast<VPWidenIntrinsicRecipe>(Val: this)->mayHaveSideEffects(); |
| 199 | case VPBlendSC: |
| 200 | case VPReductionEVLSC: |
| 201 | case VPReductionSC: |
| 202 | case VPScalarIVStepsSC: |
| 203 | case VPVectorPointerSC: |
| 204 | case VPWidenCanonicalIVSC: |
| 205 | case VPWidenCastSC: |
| 206 | case VPWidenGEPSC: |
| 207 | case VPWidenIntOrFpInductionSC: |
| 208 | case VPWidenPHISC: |
| 209 | case VPWidenPointerInductionSC: |
| 210 | case VPWidenSC: { |
| 211 | const Instruction *I = |
| 212 | dyn_cast_or_null<Instruction>(Val: getVPSingleValue()->getUnderlyingValue()); |
| 213 | (void)I; |
| 214 | assert((!I || !I->mayHaveSideEffects()) && |
| 215 | "underlying instruction has side-effects" ); |
| 216 | return false; |
| 217 | } |
| 218 | case VPInterleaveEVLSC: |
| 219 | case VPInterleaveSC: |
| 220 | return mayWriteToMemory(); |
| 221 | case VPWidenLoadEVLSC: |
| 222 | case VPWidenLoadSC: |
| 223 | case VPWidenStoreEVLSC: |
| 224 | case VPWidenStoreSC: |
| 225 | assert( |
| 226 | cast<VPWidenMemoryRecipe>(this)->getIngredient().mayHaveSideEffects() == |
| 227 | mayWriteToMemory() && |
| 228 | "mayHaveSideffects result for ingredient differs from this " |
| 229 | "implementation" ); |
| 230 | return mayWriteToMemory(); |
| 231 | case VPReplicateSC: { |
| 232 | auto *R = cast<VPReplicateRecipe>(Val: this); |
| 233 | return R->getUnderlyingInstr()->mayHaveSideEffects(); |
| 234 | } |
| 235 | default: |
| 236 | return true; |
| 237 | } |
| 238 | } |
| 239 | |
| 240 | bool VPRecipeBase::isSafeToSpeculativelyExecute() const { |
| 241 | switch (getVPRecipeID()) { |
| 242 | default: |
| 243 | return false; |
| 244 | case VPInstructionSC: { |
| 245 | unsigned Opcode = cast<VPInstruction>(Val: this)->getOpcode(); |
| 246 | if (Instruction::isCast(Opcode)) |
| 247 | return true; |
| 248 | |
| 249 | switch (Opcode) { |
| 250 | default: |
| 251 | return false; |
| 252 | case Instruction::Add: |
| 253 | case Instruction::Sub: |
| 254 | case Instruction::Mul: |
| 255 | case Instruction::GetElementPtr: |
| 256 | return true; |
| 257 | } |
| 258 | } |
| 259 | } |
| 260 | } |
| 261 | |
| 262 | void VPRecipeBase::insertBefore(VPRecipeBase *InsertPos) { |
| 263 | assert(!Parent && "Recipe already in some VPBasicBlock" ); |
| 264 | assert(InsertPos->getParent() && |
| 265 | "Insertion position not in any VPBasicBlock" ); |
| 266 | InsertPos->getParent()->insert(Recipe: this, InsertPt: InsertPos->getIterator()); |
| 267 | } |
| 268 | |
| 269 | void VPRecipeBase::insertBefore(VPBasicBlock &BB, |
| 270 | iplist<VPRecipeBase>::iterator I) { |
| 271 | assert(!Parent && "Recipe already in some VPBasicBlock" ); |
| 272 | assert(I == BB.end() || I->getParent() == &BB); |
| 273 | BB.insert(Recipe: this, InsertPt: I); |
| 274 | } |
| 275 | |
| 276 | void VPRecipeBase::insertAfter(VPRecipeBase *InsertPos) { |
| 277 | assert(!Parent && "Recipe already in some VPBasicBlock" ); |
| 278 | assert(InsertPos->getParent() && |
| 279 | "Insertion position not in any VPBasicBlock" ); |
| 280 | InsertPos->getParent()->insert(Recipe: this, InsertPt: std::next(x: InsertPos->getIterator())); |
| 281 | } |
| 282 | |
| 283 | void VPRecipeBase::removeFromParent() { |
| 284 | assert(getParent() && "Recipe not in any VPBasicBlock" ); |
| 285 | getParent()->getRecipeList().remove(IT: getIterator()); |
| 286 | Parent = nullptr; |
| 287 | } |
| 288 | |
| 289 | iplist<VPRecipeBase>::iterator VPRecipeBase::eraseFromParent() { |
| 290 | assert(getParent() && "Recipe not in any VPBasicBlock" ); |
| 291 | return getParent()->getRecipeList().erase(where: getIterator()); |
| 292 | } |
| 293 | |
| 294 | void VPRecipeBase::moveAfter(VPRecipeBase *InsertPos) { |
| 295 | removeFromParent(); |
| 296 | insertAfter(InsertPos); |
| 297 | } |
| 298 | |
| 299 | void VPRecipeBase::moveBefore(VPBasicBlock &BB, |
| 300 | iplist<VPRecipeBase>::iterator I) { |
| 301 | removeFromParent(); |
| 302 | insertBefore(BB, I); |
| 303 | } |
| 304 | |
| 305 | InstructionCost VPRecipeBase::cost(ElementCount VF, VPCostContext &Ctx) { |
| 306 | // Get the underlying instruction for the recipe, if there is one. It is used |
| 307 | // to |
| 308 | // * decide if cost computation should be skipped for this recipe, |
| 309 | // * apply forced target instruction cost. |
| 310 | Instruction *UI = nullptr; |
| 311 | if (auto *S = dyn_cast<VPSingleDefRecipe>(Val: this)) |
| 312 | UI = dyn_cast_or_null<Instruction>(Val: S->getUnderlyingValue()); |
| 313 | else if (auto *IG = dyn_cast<VPInterleaveBase>(Val: this)) |
| 314 | UI = IG->getInsertPos(); |
| 315 | else if (auto *WidenMem = dyn_cast<VPWidenMemoryRecipe>(Val: this)) |
| 316 | UI = &WidenMem->getIngredient(); |
| 317 | |
| 318 | InstructionCost RecipeCost; |
| 319 | if (UI && Ctx.skipCostComputation(UI, IsVector: VF.isVector())) { |
| 320 | RecipeCost = 0; |
| 321 | } else { |
| 322 | RecipeCost = computeCost(VF, Ctx); |
| 323 | if (ForceTargetInstructionCost.getNumOccurrences() > 0 && |
| 324 | RecipeCost.isValid()) { |
| 325 | if (UI) |
| 326 | RecipeCost = InstructionCost(ForceTargetInstructionCost); |
| 327 | else |
| 328 | RecipeCost = InstructionCost(0); |
| 329 | } |
| 330 | } |
| 331 | |
| 332 | LLVM_DEBUG({ |
| 333 | dbgs() << "Cost of " << RecipeCost << " for VF " << VF << ": " ; |
| 334 | dump(); |
| 335 | }); |
| 336 | return RecipeCost; |
| 337 | } |
| 338 | |
| 339 | InstructionCost VPRecipeBase::computeCost(ElementCount VF, |
| 340 | VPCostContext &Ctx) const { |
| 341 | llvm_unreachable("subclasses should implement computeCost" ); |
| 342 | } |
| 343 | |
| 344 | bool VPRecipeBase::isPhi() const { |
| 345 | return (getVPRecipeID() >= VPFirstPHISC && getVPRecipeID() <= VPLastPHISC) || |
| 346 | isa<VPPhi, VPIRPhi>(Val: this); |
| 347 | } |
| 348 | |
| 349 | void VPIRFlags::intersectFlags(const VPIRFlags &Other) { |
| 350 | assert(OpType == Other.OpType && "OpType must match" ); |
| 351 | switch (OpType) { |
| 352 | case OperationType::OverflowingBinOp: |
| 353 | WrapFlags.HasNUW &= Other.WrapFlags.HasNUW; |
| 354 | WrapFlags.HasNSW &= Other.WrapFlags.HasNSW; |
| 355 | break; |
| 356 | case OperationType::Trunc: |
| 357 | TruncFlags.HasNUW &= Other.TruncFlags.HasNUW; |
| 358 | TruncFlags.HasNSW &= Other.TruncFlags.HasNSW; |
| 359 | break; |
| 360 | case OperationType::DisjointOp: |
| 361 | DisjointFlags.IsDisjoint &= Other.DisjointFlags.IsDisjoint; |
| 362 | break; |
| 363 | case OperationType::PossiblyExactOp: |
| 364 | ExactFlags.IsExact &= Other.ExactFlags.IsExact; |
| 365 | break; |
| 366 | case OperationType::GEPOp: |
| 367 | GEPFlagsStorage &= Other.GEPFlagsStorage; |
| 368 | break; |
| 369 | case OperationType::FPMathOp: |
| 370 | case OperationType::FCmp: |
| 371 | assert((OpType != OperationType::FCmp || |
| 372 | FCmpFlags.CmpPredStorage == Other.FCmpFlags.CmpPredStorage) && |
| 373 | "Cannot drop CmpPredicate" ); |
| 374 | getFMFsRef().NoNaNs &= Other.getFMFsRef().NoNaNs; |
| 375 | getFMFsRef().NoInfs &= Other.getFMFsRef().NoInfs; |
| 376 | break; |
| 377 | case OperationType::NonNegOp: |
| 378 | NonNegFlags.NonNeg &= Other.NonNegFlags.NonNeg; |
| 379 | break; |
| 380 | case OperationType::Cmp: |
| 381 | assert(CmpPredStorage == Other.CmpPredStorage && |
| 382 | "Cannot drop CmpPredicate" ); |
| 383 | break; |
| 384 | case OperationType::ReductionOp: |
| 385 | assert(ReductionFlags.Kind == Other.ReductionFlags.Kind && |
| 386 | "Cannot change RecurKind" ); |
| 387 | assert(ReductionFlags.IsOrdered == Other.ReductionFlags.IsOrdered && |
| 388 | "Cannot change IsOrdered" ); |
| 389 | assert(ReductionFlags.IsInLoop == Other.ReductionFlags.IsInLoop && |
| 390 | "Cannot change IsInLoop" ); |
| 391 | getFMFsRef().NoNaNs &= Other.getFMFsRef().NoNaNs; |
| 392 | getFMFsRef().NoInfs &= Other.getFMFsRef().NoInfs; |
| 393 | break; |
| 394 | case OperationType::Other: |
| 395 | break; |
| 396 | } |
| 397 | } |
| 398 | |
| 399 | FastMathFlags VPIRFlags::getFastMathFlagsOrNone() const { |
| 400 | if (!hasFastMathFlags()) |
| 401 | return {}; |
| 402 | const FastMathFlagsTy &F = getFMFsRef(); |
| 403 | FastMathFlags Res; |
| 404 | Res.setAllowReassoc(F.AllowReassoc); |
| 405 | Res.setNoNaNs(F.NoNaNs); |
| 406 | Res.setNoInfs(F.NoInfs); |
| 407 | Res.setNoSignedZeros(F.NoSignedZeros); |
| 408 | Res.setAllowReciprocal(F.AllowReciprocal); |
| 409 | Res.setAllowContract(F.AllowContract); |
| 410 | Res.setApproxFunc(F.ApproxFunc); |
| 411 | return Res; |
| 412 | } |
| 413 | |
| 414 | #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) |
| 415 | void VPSingleDefRecipe::dump() const { VPRecipeBase::dump(); } |
| 416 | |
| 417 | void VPRecipeBase::print(raw_ostream &O, const Twine &Indent, |
| 418 | VPSlotTracker &SlotTracker) const { |
| 419 | printRecipe(O, Indent, SlotTracker); |
| 420 | if (auto DL = getDebugLoc()) { |
| 421 | O << ", !dbg " ; |
| 422 | DL.print(O); |
| 423 | } |
| 424 | |
| 425 | if (auto *Metadata = dyn_cast<VPIRMetadata>(this)) |
| 426 | Metadata->print(O, SlotTracker); |
| 427 | } |
| 428 | #endif |
| 429 | |
| 430 | VPExpandSCEVRecipe::VPExpandSCEVRecipe(const SCEV *Expr) |
| 431 | : VPSingleDefRecipe(VPRecipeBase::VPExpandSCEVSC, {}, Expr->getType()), |
| 432 | Expr(Expr) {} |
| 433 | |
| 434 | /// For call VPInstruction operands, return the operand index of the called |
| 435 | /// function. The function is either the last operand (for unmasked calls) or |
| 436 | /// the second-to-last operand (for masked calls). |
| 437 | static unsigned getCalledFnOperandIndex(ArrayRef<VPValue *> Operands) { |
| 438 | unsigned NumOps = Operands.size(); |
| 439 | auto *LastOp = dyn_cast<VPIRValue>(Val: Operands[NumOps - 1]); |
| 440 | if (LastOp && isa<Function>(Val: LastOp->getValue())) |
| 441 | return NumOps - 1; |
| 442 | assert(isa<Function>(cast<VPIRValue>(Operands[NumOps - 2])->getValue()) && |
| 443 | "expected function operand" ); |
| 444 | return NumOps - 2; |
| 445 | } |
| 446 | |
| 447 | /// For call VPInstruction operands, return the called function. |
| 448 | static Function *getCalledFunction(ArrayRef<VPValue *> Operands) { |
| 449 | unsigned Idx = getCalledFnOperandIndex(Operands); |
| 450 | return cast<Function>(Val: cast<VPIRValue>(Val: Operands[Idx])->getValue()); |
| 451 | } |
| 452 | |
| 453 | Type *llvm::computeScalarTypeForInstruction(unsigned Opcode, |
| 454 | ArrayRef<VPValue *> Operands) { |
| 455 | assert(!Operands.empty() && |
| 456 | "zero-operand VPInstruction opcodes must pass explicit ResultTy" ); |
| 457 | // Assert operand \p Idx (if present and typed) has type \p ExpectedTy. |
| 458 | [[maybe_unused]] auto AssertOperandType = [&Operands](unsigned Idx, |
| 459 | Type *ExpectedTy) { |
| 460 | if (!ExpectedTy || Operands.size() <= Idx) |
| 461 | return; |
| 462 | [[maybe_unused]] Type *OpTy = Operands[Idx]->getScalarType(); |
| 463 | assert((!OpTy || OpTy == ExpectedTy) && |
| 464 | "different types inferred for different operands" ); |
| 465 | }; |
| 466 | |
| 467 | Type *Op0Ty = Operands[0]->getScalarType(); |
| 468 | LLVMContext &Ctx = Op0Ty->getContext(); |
| 469 | switch (Opcode) { |
| 470 | case VPInstruction::BranchOnCond: |
| 471 | assert(Op0Ty->isIntegerTy(1) && "expected bool condition" ); |
| 472 | return Type::getVoidTy(C&: Ctx); |
| 473 | case VPInstruction::BranchOnTwoConds: |
| 474 | assert(Op0Ty->isIntegerTy(1) && "expected bool condition" ); |
| 475 | AssertOperandType(1, IntegerType::get(C&: Ctx, NumBits: 1)); |
| 476 | return Type::getVoidTy(C&: Ctx); |
| 477 | case VPInstruction::BranchOnCount: |
| 478 | assert(Op0Ty->isIntegerTy() && "expected integer operand" ); |
| 479 | AssertOperandType(1, Op0Ty); |
| 480 | return Type::getVoidTy(C&: Ctx); |
| 481 | case VPInstruction::CalculateTripCountMinusVF: |
| 482 | case VPInstruction::CanonicalIVIncrementForPart: |
| 483 | assert(Op0Ty->isIntegerTy() && "expected integer operand" ); |
| 484 | for (unsigned Idx = 1; Idx != Operands.size(); ++Idx) |
| 485 | AssertOperandType(Idx, Op0Ty); |
| 486 | return Op0Ty; |
| 487 | case Instruction::Switch: |
| 488 | for (unsigned Idx = 1; Idx != Operands.size(); ++Idx) |
| 489 | AssertOperandType(Idx, Op0Ty); |
| 490 | return Type::getVoidTy(C&: Ctx); |
| 491 | case Instruction::Store: |
| 492 | return Type::getVoidTy(C&: Ctx); |
| 493 | case Instruction::ICmp: |
| 494 | assert(Op0Ty->isIntOrPtrTy() && "expected integer or pointer operand" ); |
| 495 | AssertOperandType(1, Op0Ty); |
| 496 | return IntegerType::get(C&: Ctx, NumBits: 1); |
| 497 | case Instruction::FCmp: |
| 498 | assert(Op0Ty->isFloatingPointTy() && "expected floating-point operand" ); |
| 499 | AssertOperandType(1, Op0Ty); |
| 500 | return IntegerType::get(C&: Ctx, NumBits: 1); |
| 501 | case VPInstruction::ActiveLaneMask: |
| 502 | assert(Op0Ty->isIntegerTy() && "expected integer operand" ); |
| 503 | AssertOperandType(1, Op0Ty); |
| 504 | return IntegerType::get(C&: Ctx, NumBits: 1); |
| 505 | case VPInstruction::MaskedCond: |
| 506 | assert(Op0Ty->isIntegerTy(1) && "expected bool operand" ); |
| 507 | return IntegerType::get(C&: Ctx, NumBits: 1); |
| 508 | case VPInstruction::LogicalAnd: |
| 509 | case VPInstruction::LogicalOr: |
| 510 | assert(Op0Ty->isIntegerTy(1) && "expected bool operand" ); |
| 511 | AssertOperandType(1, Op0Ty); |
| 512 | return IntegerType::get(C&: Ctx, NumBits: 1); |
| 513 | case VPInstruction::AnyOf: |
| 514 | assert(Op0Ty->isIntegerTy(1) && "expected bool operand" ); |
| 515 | for (unsigned Idx = 1; Idx != Operands.size(); ++Idx) |
| 516 | AssertOperandType(Idx, Op0Ty); |
| 517 | return IntegerType::get(C&: Ctx, NumBits: 1); |
| 518 | case VPInstruction::ExplicitVectorLength: |
| 519 | assert(Op0Ty->isIntegerTy() && "expected integer operand" ); |
| 520 | return IntegerType::get(C&: Ctx, NumBits: 32); |
| 521 | case Instruction::Select: { |
| 522 | assert((!Op0Ty || Op0Ty->isIntegerTy(1)) && |
| 523 | "select condition must be bool" ); |
| 524 | Type *Op1Ty = Operands[1]->getScalarType(); |
| 525 | AssertOperandType(2, Op1Ty); |
| 526 | return Op1Ty; |
| 527 | } |
| 528 | case Instruction::InsertElement: |
| 529 | // The inserted scalar (operand 1) must match the vector element type; |
| 530 | // operand 2 must be an integer. |
| 531 | AssertOperandType(1, Op0Ty); |
| 532 | assert(Operands[2]->getScalarType()->isIntegerTy() && |
| 533 | "expected integer operand" ); |
| 534 | return Op0Ty; |
| 535 | case VPInstruction::ReductionStartVector: |
| 536 | // The start value and the identity value (operands 0 and 1) fill the same |
| 537 | // vector and must match in type; operand 2 is the scaling factor. |
| 538 | AssertOperandType(1, Op0Ty); |
| 539 | return Op0Ty; |
| 540 | case VPInstruction::ExtractLane: { |
| 541 | assert(Operands.size() >= 2 && "ExtractLane requires a lane operand and " |
| 542 | "at least one source vector operand" ); |
| 543 | // Operand 0 is the lane index, used for integer arithmetic. |
| 544 | assert(Op0Ty->isIntegerTy() && "expected integer operand" ); |
| 545 | Type *Op1Ty = Operands[1]->getScalarType(); |
| 546 | for (unsigned Idx = 2; Idx != Operands.size(); ++Idx) |
| 547 | AssertOperandType(Idx, Op1Ty); |
| 548 | return Op1Ty; |
| 549 | } |
| 550 | case VPInstruction::PtrAdd: |
| 551 | case VPInstruction::WidePtrAdd: |
| 552 | assert(Operands[0]->getScalarType()->isPointerTy() && |
| 553 | "expected pointer operand" ); |
| 554 | assert(Operands[1]->getScalarType()->isIntegerTy() && |
| 555 | "expected integer operand" ); |
| 556 | return Op0Ty; |
| 557 | case Instruction::ExtractValue: { |
| 558 | assert(Operands.size() == 2 && "expected single level extractvalue" ); |
| 559 | auto *StructTy = cast<StructType>(Val: Op0Ty); |
| 560 | return StructTy->getTypeAtIndex( |
| 561 | N: cast<VPConstantInt>(Val: Operands[1])->getZExtValue()); |
| 562 | } |
| 563 | case VPInstruction::FirstActiveLane: |
| 564 | case VPInstruction::LastActiveLane: |
| 565 | case VPInstruction::NumActiveLanes: |
| 566 | case VPInstruction::IncomingAliasMask: |
| 567 | case Instruction::Load: |
| 568 | case Instruction::Alloca: |
| 569 | llvm_unreachable("type must be passed explicitly" ); |
| 570 | case Instruction::Call: |
| 571 | return getCalledFunction(Operands)->getReturnType(); |
| 572 | default: |
| 573 | break; |
| 574 | } |
| 575 | |
| 576 | // Opcodes that require all operands to share the same scalar type as the |
| 577 | // result. |
| 578 | bool AllOperandsSameType = |
| 579 | Instruction::isBinaryOp(Opcode) || |
| 580 | is_contained(Set: {VPInstruction::FirstOrderRecurrenceSplice, |
| 581 | VPInstruction::BuildVector, |
| 582 | VPInstruction::BuildStructVector}, |
| 583 | Element: Opcode); |
| 584 | if (AllOperandsSameType) |
| 585 | for (unsigned Idx = 1; Idx != Operands.size(); ++Idx) |
| 586 | AssertOperandType(Idx, Op0Ty); |
| 587 | |
| 588 | return Op0Ty; |
| 589 | } |
| 590 | |
| 591 | Type *VPReplicateRecipe::computeScalarType(const Instruction *I, |
| 592 | ArrayRef<VPValue *> Operands) { |
| 593 | unsigned Opcode = I->getOpcode(); |
| 594 | if (Instruction::isCast(Opcode) || |
| 595 | is_contained(Range: ArrayRef<unsigned>({Instruction::ExtractValue, |
| 596 | Instruction::Load, Instruction::Alloca}), |
| 597 | Element: Opcode)) |
| 598 | return I->getType(); |
| 599 | return computeScalarTypeForInstruction(Opcode, Operands); |
| 600 | } |
| 601 | |
| 602 | VPInstruction::VPInstruction(unsigned Opcode, ArrayRef<VPValue *> Operands, |
| 603 | const VPIRFlags &Flags, const VPIRMetadata &MD, |
| 604 | DebugLoc DL, const Twine &Name, Type *ResultTy) |
| 605 | : VPRecipeWithIRFlags( |
| 606 | VPRecipeBase::VPInstructionSC, Operands, |
| 607 | ResultTy ? ResultTy |
| 608 | : computeScalarTypeForInstruction(Opcode, Operands), |
| 609 | Flags, DL), |
| 610 | VPIRMetadata(MD), Opcode(Opcode), Name(Name.str()) { |
| 611 | assert(flagsValidForOpcode(getOpcode()) && |
| 612 | "Set flags not supported for the provided opcode" ); |
| 613 | assert(hasRequiredFlagsForOpcode(getOpcode()) && |
| 614 | "Opcode requires specific flags to be set" ); |
| 615 | assert((getNumOperandsForOpcode() == -1u || |
| 616 | getNumOperandsForOpcode() == getNumOperands() || |
| 617 | (isMasked() && getNumOperandsForOpcode() + 1 == getNumOperands())) && |
| 618 | "number of operands does not match opcode" ); |
| 619 | } |
| 620 | |
| 621 | unsigned VPInstruction::getNumOperandsForOpcode() const { |
| 622 | if (Instruction::isUnaryOp(Opcode) || Instruction::isCast(Opcode)) |
| 623 | return 1; |
| 624 | |
| 625 | if (Instruction::isBinaryOp(Opcode)) |
| 626 | return 2; |
| 627 | |
| 628 | switch (Opcode) { |
| 629 | case VPInstruction::StepVector: |
| 630 | case VPInstruction::VScale: |
| 631 | case VPInstruction::IncomingAliasMask: |
| 632 | return 0; |
| 633 | case Instruction::Alloca: |
| 634 | case Instruction::ExtractValue: |
| 635 | case Instruction::Freeze: |
| 636 | case Instruction::Load: |
| 637 | case VPInstruction::BranchOnCond: |
| 638 | case VPInstruction::Broadcast: |
| 639 | case VPInstruction::ExitingIVValue: |
| 640 | case VPInstruction::ExplicitVectorLength: |
| 641 | case VPInstruction::ExtractLastLane: |
| 642 | case VPInstruction::ExtractLastPart: |
| 643 | case VPInstruction::ExtractPenultimateElement: |
| 644 | case VPInstruction::MaskedCond: |
| 645 | case VPInstruction::Not: |
| 646 | case VPInstruction::Reverse: |
| 647 | case VPInstruction::Unpack: |
| 648 | case VPInstruction::NumActiveLanes: |
| 649 | return 1; |
| 650 | case Instruction::ICmp: |
| 651 | case Instruction::FCmp: |
| 652 | case Instruction::ExtractElement: |
| 653 | case Instruction::Store: |
| 654 | case VPInstruction::BranchOnCount: |
| 655 | case VPInstruction::BranchOnTwoConds: |
| 656 | case VPInstruction::FirstOrderRecurrenceSplice: |
| 657 | case VPInstruction::LogicalAnd: |
| 658 | case VPInstruction::LogicalOr: |
| 659 | case VPInstruction::PtrAdd: |
| 660 | case VPInstruction::WidePtrAdd: |
| 661 | case VPInstruction::WideIVStep: |
| 662 | case VPInstruction::CalculateTripCountMinusVF: |
| 663 | case VPInstruction::ResumeForEpilogue: |
| 664 | return 2; |
| 665 | case Instruction::InsertElement: |
| 666 | case Instruction::Select: |
| 667 | case VPInstruction::ActiveLaneMask: |
| 668 | case VPInstruction::ReductionStartVector: |
| 669 | return 3; |
| 670 | case Instruction::Call: |
| 671 | return getCalledFnOperandIndex(Operands: ArrayRef<VPValue *>(op_begin(), op_end())) + |
| 672 | 1; |
| 673 | case Instruction::GetElementPtr: |
| 674 | case Instruction::PHI: |
| 675 | case Instruction::Switch: |
| 676 | case VPInstruction::AnyOf: |
| 677 | case VPInstruction::BuildStructVector: |
| 678 | case VPInstruction::BuildVector: |
| 679 | case VPInstruction::CanonicalIVIncrementForPart: |
| 680 | case VPInstruction::ComputeReductionResult: |
| 681 | case VPInstruction::FirstActiveLane: |
| 682 | case VPInstruction::LastActiveLane: |
| 683 | case VPInstruction::ExtractLane: |
| 684 | case VPInstruction::ExtractLastActive: |
| 685 | // Cannot determine the number of operands from the opcode. |
| 686 | return -1u; |
| 687 | } |
| 688 | llvm_unreachable("all cases should be handled above" ); |
| 689 | } |
| 690 | |
| 691 | bool VPInstruction::doesGeneratePerAllLanes() const { |
| 692 | return Opcode == VPInstruction::PtrAdd && !vputils::onlyFirstLaneUsed(Def: this); |
| 693 | } |
| 694 | |
| 695 | bool VPInstruction::canGenerateScalarForFirstLane() const { |
| 696 | if (Instruction::isBinaryOp(Opcode: getOpcode()) || Instruction::isCast(Opcode: getOpcode())) |
| 697 | return true; |
| 698 | if (isSingleScalar() || isVectorToScalar()) |
| 699 | return true; |
| 700 | switch (Opcode) { |
| 701 | case Instruction::Freeze: |
| 702 | case Instruction::ICmp: |
| 703 | case Instruction::PHI: |
| 704 | case Instruction::Select: |
| 705 | case VPInstruction::BranchOnCond: |
| 706 | case VPInstruction::BranchOnTwoConds: |
| 707 | case VPInstruction::BranchOnCount: |
| 708 | case VPInstruction::CalculateTripCountMinusVF: |
| 709 | case VPInstruction::CanonicalIVIncrementForPart: |
| 710 | case VPInstruction::PtrAdd: |
| 711 | case VPInstruction::ExplicitVectorLength: |
| 712 | case VPInstruction::AnyOf: |
| 713 | case VPInstruction::Not: |
| 714 | return true; |
| 715 | default: |
| 716 | return false; |
| 717 | } |
| 718 | } |
| 719 | |
| 720 | static Instruction::BinaryOps getSubRecurOpcode(RecurKind Kind) { |
| 721 | if (Kind == RecurKind::Sub) |
| 722 | return Instruction::Add; |
| 723 | if (Kind == RecurKind::FSub) |
| 724 | return Instruction::FAdd; |
| 725 | llvm_unreachable("RecurKind should be Sub/FSub." ); |
| 726 | } |
| 727 | |
| 728 | Value *VPInstruction::generate(VPTransformState &State) { |
| 729 | IRBuilderBase &Builder = State.Builder; |
| 730 | |
| 731 | if (Instruction::isBinaryOp(Opcode: getOpcode())) { |
| 732 | bool OnlyFirstLaneUsed = vputils::onlyFirstLaneUsed(Def: this); |
| 733 | Value *A = State.get(Def: getOperand(N: 0), IsScalar: OnlyFirstLaneUsed); |
| 734 | Value *B = State.get(Def: getOperand(N: 1), IsScalar: OnlyFirstLaneUsed); |
| 735 | auto *Res = |
| 736 | Builder.CreateBinOp(Opc: (Instruction::BinaryOps)getOpcode(), LHS: A, RHS: B, Name); |
| 737 | if (auto *I = dyn_cast<Instruction>(Val: Res)) |
| 738 | applyFlags(I&: *I); |
| 739 | return Res; |
| 740 | } |
| 741 | |
| 742 | switch (getOpcode()) { |
| 743 | case VPInstruction::Not: { |
| 744 | bool OnlyFirstLaneUsed = vputils::onlyFirstLaneUsed(Def: this); |
| 745 | Value *A = State.get(Def: getOperand(N: 0), IsScalar: OnlyFirstLaneUsed); |
| 746 | return Builder.CreateNot(V: A, Name); |
| 747 | } |
| 748 | case Instruction::ExtractElement: { |
| 749 | assert(State.VF.isVector() && "Only extract elements from vectors" ); |
| 750 | if (auto *Idx = dyn_cast<VPConstantInt>(Val: getOperand(N: 1))) |
| 751 | return State.get(Def: getOperand(N: 0), Lane: VPLane(Idx->getZExtValue())); |
| 752 | Value *Vec = State.get(Def: getOperand(N: 0)); |
| 753 | Value *Idx = State.get(Def: getOperand(N: 1), /*IsScalar=*/true); |
| 754 | return Builder.CreateExtractElement(Vec, Idx, Name); |
| 755 | } |
| 756 | case Instruction::InsertElement: { |
| 757 | assert(State.VF.isVector() && "Can only insert elements into vectors" ); |
| 758 | Value *Vec = State.get(Def: getOperand(N: 0), /*IsScalar=*/false); |
| 759 | Value *Elt = State.get(Def: getOperand(N: 1), /*IsScalar=*/true); |
| 760 | Value *Idx = State.get(Def: getOperand(N: 2), /*IsScalar=*/true); |
| 761 | return Builder.CreateInsertElement(Vec, NewElt: Elt, Idx, Name); |
| 762 | } |
| 763 | case Instruction::Freeze: { |
| 764 | Value *Op = State.get(Def: getOperand(N: 0), IsScalar: vputils::onlyFirstLaneUsed(Def: this)); |
| 765 | return Builder.CreateFreeze(V: Op, Name); |
| 766 | } |
| 767 | case Instruction::FCmp: |
| 768 | case Instruction::ICmp: { |
| 769 | bool OnlyFirstLaneUsed = vputils::onlyFirstLaneUsed(Def: this); |
| 770 | Value *A = State.get(Def: getOperand(N: 0), IsScalar: OnlyFirstLaneUsed); |
| 771 | Value *B = State.get(Def: getOperand(N: 1), IsScalar: OnlyFirstLaneUsed); |
| 772 | return Builder.CreateCmp(Pred: getPredicate(), LHS: A, RHS: B, Name); |
| 773 | } |
| 774 | case Instruction::PHI: { |
| 775 | llvm_unreachable("should be handled by VPPhi::execute" ); |
| 776 | } |
| 777 | case Instruction::Select: { |
| 778 | bool OnlyFirstLaneUsed = vputils::onlyFirstLaneUsed(Def: this); |
| 779 | Value *Cond = |
| 780 | State.get(Def: getOperand(N: 0), |
| 781 | IsScalar: OnlyFirstLaneUsed || vputils::isSingleScalar(VPV: getOperand(N: 0))); |
| 782 | Value *Op1 = State.get(Def: getOperand(N: 1), IsScalar: OnlyFirstLaneUsed); |
| 783 | Value *Op2 = State.get(Def: getOperand(N: 2), IsScalar: OnlyFirstLaneUsed); |
| 784 | return Builder.CreateSelectFMF(C: Cond, True: Op1, False: Op2, FMFSource: getFastMathFlagsOrNone(), |
| 785 | Name); |
| 786 | } |
| 787 | case VPInstruction::ActiveLaneMask: { |
| 788 | // Get first lane of vector induction variable. |
| 789 | Value *VIVElem0 = State.get(Def: getOperand(N: 0), Lane: VPLane(0)); |
| 790 | // Get the original loop tripcount. |
| 791 | Value *ScalarTC = State.get(Def: getOperand(N: 1), Lane: VPLane(0)); |
| 792 | |
| 793 | // If this part of the active lane mask is scalar, generate the CMP directly |
| 794 | // to avoid unnecessary extracts. |
| 795 | if (State.VF.isScalar()) |
| 796 | return Builder.CreateCmp(Pred: CmpInst::Predicate::ICMP_ULT, LHS: VIVElem0, RHS: ScalarTC, |
| 797 | Name); |
| 798 | |
| 799 | ElementCount EC = State.VF.multiplyCoefficientBy( |
| 800 | RHS: cast<VPConstantInt>(Val: getOperand(N: 2))->getZExtValue()); |
| 801 | auto *PredTy = VectorType::get(ElementType: Builder.getInt1Ty(), EC); |
| 802 | return Builder.CreateIntrinsic(ID: Intrinsic::get_active_lane_mask, |
| 803 | OverloadTypes: {PredTy, ScalarTC->getType()}, |
| 804 | Args: {VIVElem0, ScalarTC}, FMFSource: nullptr, Name); |
| 805 | } |
| 806 | case VPInstruction::NumActiveLanes: { |
| 807 | Value *Op = State.get(Def: getOperand(N: 0)); |
| 808 | auto *VecTy = cast<VectorType>(Val: Op->getType()); |
| 809 | assert(VecTy->getScalarSizeInBits() == 1 && |
| 810 | "NumActiveLanes only implemented for i1 vectors" ); |
| 811 | |
| 812 | Type *Ty = getScalarType(); |
| 813 | Value *ZExt = Builder.CreateCast( |
| 814 | Op: Instruction::ZExt, V: Op, DestTy: VectorType::get(ElementType: Ty, EC: VecTy->getElementCount())); |
| 815 | Value *NumActive = |
| 816 | Builder.CreateUnaryIntrinsic(ID: Intrinsic::vector_reduce_add, Op: ZExt); |
| 817 | return NumActive; |
| 818 | } |
| 819 | case VPInstruction::FirstOrderRecurrenceSplice: { |
| 820 | // Generate code to combine the previous and current values in vector v3. |
| 821 | // |
| 822 | // vector.ph: |
| 823 | // v_init = vector(..., ..., ..., a[-1]) |
| 824 | // br vector.body |
| 825 | // |
| 826 | // vector.body |
| 827 | // i = phi [0, vector.ph], [i+4, vector.body] |
| 828 | // v1 = phi [v_init, vector.ph], [v2, vector.body] |
| 829 | // v2 = a[i, i+1, i+2, i+3]; |
| 830 | // v3 = vector(v1(3), v2(0, 1, 2)) |
| 831 | |
| 832 | auto *V1 = State.get(Def: getOperand(N: 0)); |
| 833 | if (!V1->getType()->isVectorTy()) |
| 834 | return V1; |
| 835 | Value *V2 = State.get(Def: getOperand(N: 1)); |
| 836 | return Builder.CreateVectorSpliceRight(V1, V2, Offset: 1, Name); |
| 837 | } |
| 838 | case VPInstruction::CalculateTripCountMinusVF: { |
| 839 | Value *ScalarTC = State.get(Def: getOperand(N: 0), Lane: VPLane(0)); |
| 840 | Value *VFxUF = State.get(Def: getOperand(N: 1), Lane: VPLane(0)); |
| 841 | Value *Sub = Builder.CreateSub(LHS: ScalarTC, RHS: VFxUF); |
| 842 | Value *Cmp = |
| 843 | Builder.CreateICmp(P: CmpInst::Predicate::ICMP_UGT, LHS: ScalarTC, RHS: VFxUF); |
| 844 | Value *Zero = ConstantInt::getNullValue(Ty: ScalarTC->getType()); |
| 845 | return Builder.CreateSelect(C: Cmp, True: Sub, False: Zero); |
| 846 | } |
| 847 | case VPInstruction::ExplicitVectorLength: { |
| 848 | // TODO: Restructure this code with an explicit remainder loop, vsetvli can |
| 849 | // be outside of the main loop. |
| 850 | Value *AVL = State.get(Def: getOperand(N: 0), /*IsScalar*/ true); |
| 851 | // Compute EVL |
| 852 | assert(AVL->getType()->isIntegerTy() && |
| 853 | "Requested vector length should be an integer." ); |
| 854 | |
| 855 | assert(State.VF.isScalable() && "Expected scalable vector factor." ); |
| 856 | Value *VFArg = Builder.getInt32(C: State.VF.getKnownMinValue()); |
| 857 | |
| 858 | Value *EVL = Builder.CreateIntrinsic( |
| 859 | RetTy: Builder.getInt32Ty(), ID: Intrinsic::experimental_get_vector_length, |
| 860 | Args: {AVL, VFArg, Builder.getTrue()}); |
| 861 | return EVL; |
| 862 | } |
| 863 | case VPInstruction::BranchOnCond: { |
| 864 | Value *Cond = State.get(Def: getOperand(N: 0), Lane: VPLane(0)); |
| 865 | // Replace the temporary unreachable terminator with a new conditional |
| 866 | // branch, hooking it up to backward destination for latch blocks now, and |
| 867 | // to forward destination(s) later when they are created. |
| 868 | // Second successor may be backwards - iff it is already in VPBB2IRBB. |
| 869 | VPBasicBlock *SecondVPSucc = |
| 870 | cast<VPBasicBlock>(Val: getParent()->getSuccessors()[1]); |
| 871 | BasicBlock *SecondIRSucc = State.CFG.VPBB2IRBB.lookup(Val: SecondVPSucc); |
| 872 | BasicBlock *IRBB = State.CFG.VPBB2IRBB[getParent()]; |
| 873 | auto *Br = Builder.CreateCondBr(Cond, True: IRBB, False: SecondIRSucc); |
| 874 | // First successor is always forward, reset it to nullptr. |
| 875 | Br->setSuccessor(idx: 0, NewSucc: nullptr); |
| 876 | IRBB->getTerminator()->eraseFromParent(); |
| 877 | applyMetadata(I&: *Br); |
| 878 | return Br; |
| 879 | } |
| 880 | case VPInstruction::Broadcast: { |
| 881 | return Builder.CreateVectorSplat( |
| 882 | EC: State.VF, V: State.get(Def: getOperand(N: 0), /*IsScalar*/ true), Name: "broadcast" ); |
| 883 | } |
| 884 | case VPInstruction::BuildStructVector: { |
| 885 | // For struct types, we need to build a new 'wide' struct type, where each |
| 886 | // element is widened, i.e., we create a struct of vectors. |
| 887 | auto *StructTy = cast<StructType>(Val: getOperand(N: 0)->getScalarType()); |
| 888 | Value *Res = PoisonValue::get(T: toVectorizedTy(Ty: StructTy, EC: State.VF)); |
| 889 | for (const auto &[LaneIndex, Op] : enumerate(First: operands())) { |
| 890 | for (unsigned FieldIndex = 0; FieldIndex != StructTy->getNumElements(); |
| 891 | FieldIndex++) { |
| 892 | Value *ScalarValue = |
| 893 | Builder.CreateExtractValue(Agg: State.get(Def: Op, IsScalar: true), Idxs: FieldIndex); |
| 894 | Value *VectorValue = Builder.CreateExtractValue(Agg: Res, Idxs: FieldIndex); |
| 895 | VectorValue = |
| 896 | Builder.CreateInsertElement(Vec: VectorValue, NewElt: ScalarValue, Idx: LaneIndex); |
| 897 | Res = Builder.CreateInsertValue(Agg: Res, Val: VectorValue, Idxs: FieldIndex); |
| 898 | } |
| 899 | } |
| 900 | return Res; |
| 901 | } |
| 902 | case VPInstruction::BuildVector: { |
| 903 | auto *ScalarTy = getOperand(N: 0)->getScalarType(); |
| 904 | auto NumOfElements = ElementCount::getFixed(MinVal: getNumOperands()); |
| 905 | Value *Res = PoisonValue::get(T: toVectorizedTy(Ty: ScalarTy, EC: NumOfElements)); |
| 906 | for (const auto &[Idx, Op] : enumerate(First: operands())) |
| 907 | Res = Builder.CreateInsertElement(Vec: Res, NewElt: State.get(Def: Op, IsScalar: true), |
| 908 | Idx: Builder.getInt32(C: Idx)); |
| 909 | return Res; |
| 910 | } |
| 911 | case VPInstruction::ReductionStartVector: { |
| 912 | if (State.VF.isScalar()) |
| 913 | return State.get(Def: getOperand(N: 0), IsScalar: true); |
| 914 | IRBuilderBase::FastMathFlagGuard FMFG(Builder); |
| 915 | Builder.setFastMathFlags(getFastMathFlagsOrNone()); |
| 916 | // If this start vector is scaled then it should produce a vector with fewer |
| 917 | // elements than the VF. |
| 918 | ElementCount VF = State.VF.divideCoefficientBy( |
| 919 | RHS: cast<VPConstantInt>(Val: getOperand(N: 2))->getZExtValue()); |
| 920 | auto *Iden = Builder.CreateVectorSplat(EC: VF, V: State.get(Def: getOperand(N: 1), IsScalar: true)); |
| 921 | return Builder.CreateInsertElement(Vec: Iden, NewElt: State.get(Def: getOperand(N: 0), IsScalar: true), |
| 922 | Idx: Builder.getInt32(C: 0)); |
| 923 | } |
| 924 | case VPInstruction::ComputeReductionResult: { |
| 925 | RecurKind RK = getRecurKind(); |
| 926 | bool IsOrdered = isReductionOrdered(); |
| 927 | bool IsInLoop = isReductionInLoop(); |
| 928 | assert(!RecurrenceDescriptor::isFindIVRecurrenceKind(RK) && |
| 929 | "FindIV should use min/max reduction kinds" ); |
| 930 | |
| 931 | // The recipe may have multiple operands to be reduced together. |
| 932 | unsigned NumOperandsToReduce = getNumOperands(); |
| 933 | VectorParts RdxParts(NumOperandsToReduce); |
| 934 | for (unsigned Part = 0; Part < NumOperandsToReduce; ++Part) |
| 935 | RdxParts[Part] = State.get(Def: getOperand(N: Part), IsScalar: IsInLoop); |
| 936 | |
| 937 | IRBuilderBase::FastMathFlagGuard FMFG(Builder); |
| 938 | Builder.setFastMathFlags(getFastMathFlagsOrNone()); |
| 939 | |
| 940 | // Reduce multiple operands into one. |
| 941 | Value *ReducedPartRdx = RdxParts[0]; |
| 942 | if (IsOrdered) { |
| 943 | ReducedPartRdx = RdxParts[NumOperandsToReduce - 1]; |
| 944 | } else { |
| 945 | // Floating-point operations should have some FMF to enable the reduction. |
| 946 | for (unsigned Part = 1; Part < NumOperandsToReduce; ++Part) { |
| 947 | Value *RdxPart = RdxParts[Part]; |
| 948 | if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind: RK)) |
| 949 | ReducedPartRdx = createMinMaxOp(Builder, RK, Left: ReducedPartRdx, Right: RdxPart); |
| 950 | else { |
| 951 | // For sub-recurrences, each part's reduction variable is already |
| 952 | // negative, we need to do: reduce.add(-acc_uf0 + -acc_uf1) |
| 953 | Instruction::BinaryOps Opcode = |
| 954 | RecurrenceDescriptor::isSubRecurrenceKind(Kind: RK) |
| 955 | ? getSubRecurOpcode(Kind: RK) |
| 956 | : (Instruction::BinaryOps)RecurrenceDescriptor::getOpcode(Kind: RK); |
| 957 | ReducedPartRdx = |
| 958 | Builder.CreateBinOp(Opc: Opcode, LHS: RdxPart, RHS: ReducedPartRdx, Name: "bin.rdx" ); |
| 959 | } |
| 960 | } |
| 961 | } |
| 962 | |
| 963 | // Create the reduction after the loop. Note that inloop reductions create |
| 964 | // the target reduction in the loop using a Reduction recipe. |
| 965 | if (State.VF.isVector() && !IsInLoop) { |
| 966 | // TODO: Support in-order reductions based on the recurrence descriptor. |
| 967 | // All ops in the reduction inherit fast-math-flags from the recurrence |
| 968 | // descriptor. |
| 969 | ReducedPartRdx = createSimpleReduction(B&: Builder, Src: ReducedPartRdx, RdxKind: RK); |
| 970 | } |
| 971 | |
| 972 | return ReducedPartRdx; |
| 973 | } |
| 974 | case VPInstruction::ExtractLastLane: |
| 975 | case VPInstruction::ExtractPenultimateElement: { |
| 976 | unsigned Offset = |
| 977 | getOpcode() == VPInstruction::ExtractPenultimateElement ? 2 : 1; |
| 978 | Value *Res; |
| 979 | if (State.VF.isVector()) { |
| 980 | assert(Offset <= State.VF.getKnownMinValue() && |
| 981 | "invalid offset to extract from" ); |
| 982 | // Extract lane VF - Offset from the operand. |
| 983 | Res = State.get(Def: getOperand(N: 0), Lane: VPLane::getLaneFromEnd(VF: State.VF, Offset)); |
| 984 | } else { |
| 985 | // TODO: Remove ExtractLastLane for scalar VFs. |
| 986 | assert(Offset <= 1 && "invalid offset to extract from" ); |
| 987 | Res = State.get(Def: getOperand(N: 0)); |
| 988 | } |
| 989 | if (isa<ExtractElementInst>(Val: Res)) |
| 990 | Res->setName(Name); |
| 991 | return Res; |
| 992 | } |
| 993 | case VPInstruction::LogicalAnd: { |
| 994 | Value *A = State.get(Def: getOperand(N: 0)); |
| 995 | Value *B = State.get(Def: getOperand(N: 1)); |
| 996 | return Builder.CreateLogicalAnd(Cond1: A, Cond2: B, Name); |
| 997 | } |
| 998 | case VPInstruction::LogicalOr: { |
| 999 | Value *A = State.get(Def: getOperand(N: 0)); |
| 1000 | Value *B = State.get(Def: getOperand(N: 1)); |
| 1001 | return Builder.CreateLogicalOr(Cond1: A, Cond2: B, Name); |
| 1002 | } |
| 1003 | case VPInstruction::PtrAdd: { |
| 1004 | assert((State.VF.isScalar() || vputils::onlyFirstLaneUsed(this)) && |
| 1005 | "can only generate first lane for PtrAdd" ); |
| 1006 | Value *Ptr = State.get(Def: getOperand(N: 0), Lane: VPLane(0)); |
| 1007 | Value *Addend = State.get(Def: getOperand(N: 1), Lane: VPLane(0)); |
| 1008 | return Builder.CreatePtrAdd(Ptr, Offset: Addend, Name, NW: getGEPNoWrapFlags()); |
| 1009 | } |
| 1010 | case VPInstruction::WidePtrAdd: { |
| 1011 | Value *Ptr = |
| 1012 | State.get(Def: getOperand(N: 0), IsScalar: vputils::isSingleScalar(VPV: getOperand(N: 0))); |
| 1013 | Value *Addend = State.get(Def: getOperand(N: 1)); |
| 1014 | return Builder.CreatePtrAdd(Ptr, Offset: Addend, Name, NW: getGEPNoWrapFlags()); |
| 1015 | } |
| 1016 | case VPInstruction::AnyOf: { |
| 1017 | Value *Res = Builder.CreateFreeze(V: State.get(Def: getOperand(N: 0))); |
| 1018 | for (VPValue *Op : drop_begin(RangeOrContainer: operands())) |
| 1019 | Res = Builder.CreateOr(LHS: Res, RHS: Builder.CreateFreeze(V: State.get(Def: Op))); |
| 1020 | return State.VF.isScalar() ? Res : Builder.CreateOrReduce(Src: Res); |
| 1021 | } |
| 1022 | case VPInstruction::ExtractLane: { |
| 1023 | assert(getNumOperands() != 2 && "ExtractLane from single source should be " |
| 1024 | "simplified to ExtractElement." ); |
| 1025 | Value * = State.get(Def: getOperand(N: 0), IsScalar: true); |
| 1026 | Type *IdxTy = getOperand(N: 0)->getScalarType(); |
| 1027 | Value *Res = nullptr; |
| 1028 | Value *RuntimeVF = getRuntimeVF(B&: Builder, Ty: IdxTy, VF: State.VF); |
| 1029 | |
| 1030 | for (unsigned Idx = 1; Idx != getNumOperands(); ++Idx) { |
| 1031 | Value *VectorStart = |
| 1032 | Builder.CreateMul(LHS: RuntimeVF, RHS: ConstantInt::get(Ty: IdxTy, V: Idx - 1)); |
| 1033 | Value *VectorIdx = Idx == 1 |
| 1034 | ? LaneToExtract |
| 1035 | : Builder.CreateSub(LHS: LaneToExtract, RHS: VectorStart); |
| 1036 | Value *Ext = State.VF.isScalar() |
| 1037 | ? State.get(Def: getOperand(N: Idx)) |
| 1038 | : Builder.CreateExtractElement( |
| 1039 | Vec: State.get(Def: getOperand(N: Idx)), Idx: VectorIdx); |
| 1040 | if (Res) { |
| 1041 | Value *Cmp = Builder.CreateICmpUGE(LHS: LaneToExtract, RHS: VectorStart); |
| 1042 | Res = Builder.CreateSelect(C: Cmp, True: Ext, False: Res); |
| 1043 | } else { |
| 1044 | Res = Ext; |
| 1045 | } |
| 1046 | } |
| 1047 | return Res; |
| 1048 | } |
| 1049 | case VPInstruction::FirstActiveLane: { |
| 1050 | Type *Ty = this->getScalarType(); |
| 1051 | if (getNumOperands() == 1) { |
| 1052 | Value *Mask = State.get(Def: getOperand(N: 0)); |
| 1053 | return Builder.CreateCountTrailingZeroElems(ResTy: Ty, Mask, |
| 1054 | /*ZeroIsPoison=*/false, Name); |
| 1055 | } |
| 1056 | // If there are multiple operands, create a chain of selects to pick the |
| 1057 | // first operand with an active lane and add the number of lanes of the |
| 1058 | // preceding operands. |
| 1059 | Value *RuntimeVF = getRuntimeVF(B&: Builder, Ty, VF: State.VF); |
| 1060 | unsigned LastOpIdx = getNumOperands() - 1; |
| 1061 | Value *Res = nullptr; |
| 1062 | for (int Idx = LastOpIdx; Idx >= 0; --Idx) { |
| 1063 | Value *TrailingZeros = |
| 1064 | State.VF.isScalar() |
| 1065 | ? Builder.CreateZExt( |
| 1066 | V: Builder.CreateICmpEQ(LHS: State.get(Def: getOperand(N: Idx)), |
| 1067 | RHS: Builder.getFalse()), |
| 1068 | DestTy: Ty) |
| 1069 | : Builder.CreateCountTrailingZeroElems( |
| 1070 | ResTy: Ty, Mask: State.get(Def: getOperand(N: Idx)), |
| 1071 | /*ZeroIsPoison=*/false, Name); |
| 1072 | Value *Current = Builder.CreateAdd( |
| 1073 | LHS: Builder.CreateMul(LHS: RuntimeVF, RHS: ConstantInt::get(Ty, V: Idx)), |
| 1074 | RHS: TrailingZeros); |
| 1075 | if (Res) { |
| 1076 | Value *Cmp = Builder.CreateICmpNE(LHS: TrailingZeros, RHS: RuntimeVF); |
| 1077 | Res = Builder.CreateSelect(C: Cmp, True: Current, False: Res); |
| 1078 | } else { |
| 1079 | Res = Current; |
| 1080 | } |
| 1081 | } |
| 1082 | |
| 1083 | return Res; |
| 1084 | } |
| 1085 | case VPInstruction::ResumeForEpilogue: |
| 1086 | return State.get(Def: getOperand(N: 0), IsScalar: true); |
| 1087 | case VPInstruction::Reverse: |
| 1088 | return Builder.CreateVectorReverse(V: State.get(Def: getOperand(N: 0)), Name: "reverse" ); |
| 1089 | case VPInstruction::ExtractLastActive: { |
| 1090 | Value *Result = State.get(Def: getOperand(N: 0), /*IsScalar=*/true); |
| 1091 | for (unsigned Idx = 1; Idx < getNumOperands(); Idx += 2) { |
| 1092 | Value *Data = State.get(Def: getOperand(N: Idx)); |
| 1093 | Value *Mask = State.get(Def: getOperand(N: Idx + 1)); |
| 1094 | Type *VTy = Data->getType(); |
| 1095 | |
| 1096 | if (State.VF.isScalar()) |
| 1097 | Result = Builder.CreateSelect(C: Mask, True: Data, False: Result); |
| 1098 | else |
| 1099 | Result = Builder.CreateIntrinsic( |
| 1100 | ID: Intrinsic::experimental_vector_extract_last_active, OverloadTypes: {VTy}, |
| 1101 | Args: {Data, Mask, Result}); |
| 1102 | } |
| 1103 | |
| 1104 | return Result; |
| 1105 | } |
| 1106 | default: |
| 1107 | llvm_unreachable("Unsupported opcode for instruction" ); |
| 1108 | } |
| 1109 | } |
| 1110 | |
| 1111 | InstructionCost VPRecipeWithIRFlags::getCostForRecipeWithOpcode( |
| 1112 | unsigned Opcode, ElementCount VF, VPCostContext &Ctx) const { |
| 1113 | Type *ScalarTy = this->getScalarType(); |
| 1114 | Type *ResultTy = VF.isVector() ? toVectorTy(Scalar: ScalarTy, EC: VF) : ScalarTy; |
| 1115 | switch (Opcode) { |
| 1116 | case Instruction::FNeg: |
| 1117 | return Ctx.TTI.getArithmeticInstrCost(Opcode, Ty: ResultTy, CostKind: Ctx.CostKind); |
| 1118 | case Instruction::UDiv: |
| 1119 | case Instruction::SDiv: |
| 1120 | case Instruction::SRem: |
| 1121 | case Instruction::URem: |
| 1122 | case Instruction::Add: |
| 1123 | case Instruction::FAdd: |
| 1124 | case Instruction::Sub: |
| 1125 | case Instruction::FSub: |
| 1126 | case Instruction::Mul: |
| 1127 | case Instruction::FMul: |
| 1128 | case Instruction::FDiv: |
| 1129 | case Instruction::FRem: |
| 1130 | case Instruction::Shl: |
| 1131 | case Instruction::LShr: |
| 1132 | case Instruction::AShr: |
| 1133 | case Instruction::And: |
| 1134 | case Instruction::Or: |
| 1135 | case Instruction::Xor: { |
| 1136 | // Certain instructions can be cheaper if they have a constant second |
| 1137 | // operand. One example of this are shifts on x86. |
| 1138 | VPValue *RHS = getOperand(N: 1); |
| 1139 | TargetTransformInfo::OperandValueInfo RHSInfo = Ctx.getOperandInfo(V: RHS); |
| 1140 | |
| 1141 | if (RHSInfo.Kind == TargetTransformInfo::OK_AnyValue && |
| 1142 | getOperand(N: 1)->isDefinedOutsideLoopRegions()) |
| 1143 | RHSInfo.Kind = TargetTransformInfo::OK_UniformValue; |
| 1144 | |
| 1145 | Instruction *CtxI = dyn_cast_or_null<Instruction>(Val: getUnderlyingValue()); |
| 1146 | SmallVector<const Value *, 4> Operands; |
| 1147 | if (CtxI) |
| 1148 | Operands.append(in_start: CtxI->value_op_begin(), in_end: CtxI->value_op_end()); |
| 1149 | return Ctx.TTI.getArithmeticInstrCost( |
| 1150 | Opcode, Ty: ResultTy, CostKind: Ctx.CostKind, |
| 1151 | Opd1Info: {.Kind: TargetTransformInfo::OK_AnyValue, .Properties: TargetTransformInfo::OP_None}, |
| 1152 | Opd2Info: RHSInfo, Args: Operands, CxtI: CtxI, TLibInfo: &Ctx.TLI); |
| 1153 | } |
| 1154 | case Instruction::Freeze: |
| 1155 | // NOTE: The only way to ask for the cost is via getInstructionCost, which |
| 1156 | // requires the actual vector instruction. Instead, both here and in the |
| 1157 | // LoopVectorizationCostModel::getInstructionCost the costs mirror the |
| 1158 | // current behaviour in llvm/Analysis/TargetTransformInfoImpl.h to keep |
| 1159 | // them in sync. |
| 1160 | return TTI::TCC_Free; |
| 1161 | case Instruction::ExtractValue: |
| 1162 | return Ctx.TTI.getInsertExtractValueCost(Opcode: Instruction::ExtractValue, |
| 1163 | CostKind: Ctx.CostKind); |
| 1164 | case Instruction::ICmp: |
| 1165 | case Instruction::FCmp: { |
| 1166 | Type *ScalarOpTy = getOperand(N: 0)->getScalarType(); |
| 1167 | Type *OpTy = VF.isVector() ? toVectorTy(Scalar: ScalarOpTy, EC: VF) : ScalarOpTy; |
| 1168 | Instruction *CtxI = dyn_cast_or_null<Instruction>(Val: getUnderlyingValue()); |
| 1169 | return Ctx.TTI.getCmpSelInstrCost( |
| 1170 | Opcode, ValTy: OpTy, CondTy: CmpInst::makeCmpResultType(opnd_type: OpTy), VecPred: getPredicate(), |
| 1171 | CostKind: Ctx.CostKind, Op1Info: {.Kind: TTI::OK_AnyValue, .Properties: TTI::OP_None}, |
| 1172 | Op2Info: {.Kind: TTI::OK_AnyValue, .Properties: TTI::OP_None}, I: CtxI); |
| 1173 | } |
| 1174 | case Instruction::BitCast: { |
| 1175 | Type *ScalarTy = this->getScalarType(); |
| 1176 | if (ScalarTy->isPointerTy()) |
| 1177 | return 0; |
| 1178 | [[fallthrough]]; |
| 1179 | } |
| 1180 | case Instruction::SExt: |
| 1181 | case Instruction::ZExt: |
| 1182 | case Instruction::FPToUI: |
| 1183 | case Instruction::FPToSI: |
| 1184 | case Instruction::FPExt: |
| 1185 | case Instruction::PtrToInt: |
| 1186 | case Instruction::PtrToAddr: |
| 1187 | case Instruction::IntToPtr: |
| 1188 | case Instruction::SIToFP: |
| 1189 | case Instruction::UIToFP: |
| 1190 | case Instruction::Trunc: |
| 1191 | case Instruction::FPTrunc: |
| 1192 | case Instruction::AddrSpaceCast: { |
| 1193 | // Computes the CastContextHint from a recipe that may access memory. |
| 1194 | auto ComputeCCH = [&](const VPRecipeBase *R) -> TTI::CastContextHint { |
| 1195 | if (isa<VPInterleaveBase>(Val: R)) |
| 1196 | return TTI::CastContextHint::Interleave; |
| 1197 | if (const auto *ReplicateRecipe = dyn_cast<VPReplicateRecipe>(Val: R)) { |
| 1198 | // Only compute CCH for memory operations, matching the legacy model |
| 1199 | // which only considers loads/stores for cast context hints. |
| 1200 | auto *UI = cast<Instruction>(Val: ReplicateRecipe->getUnderlyingValue()); |
| 1201 | if (!isa<LoadInst, StoreInst>(Val: UI)) |
| 1202 | return TTI::CastContextHint::None; |
| 1203 | return ReplicateRecipe->isPredicated() ? TTI::CastContextHint::Masked |
| 1204 | : TTI::CastContextHint::Normal; |
| 1205 | } |
| 1206 | const auto *WidenMemoryRecipe = dyn_cast<VPWidenMemoryRecipe>(Val: R); |
| 1207 | if (WidenMemoryRecipe == nullptr) |
| 1208 | return TTI::CastContextHint::None; |
| 1209 | if (VF.isScalar()) |
| 1210 | return TTI::CastContextHint::Normal; |
| 1211 | if (!WidenMemoryRecipe->isConsecutive()) |
| 1212 | return TTI::CastContextHint::GatherScatter; |
| 1213 | if (WidenMemoryRecipe->isMasked()) |
| 1214 | return TTI::CastContextHint::Masked; |
| 1215 | return TTI::CastContextHint::Normal; |
| 1216 | }; |
| 1217 | |
| 1218 | VPValue *Operand = getOperand(N: 0); |
| 1219 | TTI::CastContextHint CCH = TTI::CastContextHint::None; |
| 1220 | bool IsReverse = false; |
| 1221 | // For Trunc/FPTrunc, get the context from the only user. |
| 1222 | if (Opcode == Instruction::Trunc || Opcode == Instruction::FPTrunc) { |
| 1223 | if (auto *Recipe = cast_or_null<VPRecipeBase>(Val: getSingleUser())) { |
| 1224 | if (match(V: Recipe, |
| 1225 | P: m_CombineOr( |
| 1226 | Ps: m_Reverse(Op0: m_VPValue()), |
| 1227 | Ps: m_Intrinsic<Intrinsic::experimental_vp_reverse>()))) { |
| 1228 | IsReverse = true; |
| 1229 | Recipe = cast_or_null<VPRecipeBase>( |
| 1230 | Val: Recipe->getVPSingleValue()->getSingleUser()); |
| 1231 | } |
| 1232 | if (Recipe) |
| 1233 | CCH = ComputeCCH(Recipe); |
| 1234 | } |
| 1235 | } |
| 1236 | // For Z/Sext, get the context from the operand. |
| 1237 | else if (Opcode == Instruction::ZExt || Opcode == Instruction::SExt || |
| 1238 | Opcode == Instruction::FPExt) { |
| 1239 | if (auto *Recipe = Operand->getDefiningRecipe()) { |
| 1240 | VPValue *ReverseOp; |
| 1241 | if (match(V: Recipe, |
| 1242 | P: m_CombineOr(Ps: m_Reverse(Op0: m_VPValue(V&: ReverseOp)), |
| 1243 | Ps: m_Intrinsic<Intrinsic::experimental_vp_reverse>( |
| 1244 | Op0: m_VPValue(V&: ReverseOp))))) { |
| 1245 | Recipe = ReverseOp->getDefiningRecipe(); |
| 1246 | IsReverse = true; |
| 1247 | } |
| 1248 | if (Recipe) |
| 1249 | CCH = ComputeCCH(Recipe); |
| 1250 | } |
| 1251 | } |
| 1252 | if (IsReverse && CCH != TTI::CastContextHint::None) |
| 1253 | CCH = TTI::CastContextHint::Reversed; |
| 1254 | |
| 1255 | auto *ScalarSrcTy = Operand->getScalarType(); |
| 1256 | Type *SrcTy = VF.isVector() ? toVectorTy(Scalar: ScalarSrcTy, EC: VF) : ScalarSrcTy; |
| 1257 | // Arm TTI will use the underlying instruction to determine the cost. |
| 1258 | return Ctx.TTI.getCastInstrCost( |
| 1259 | Opcode, Dst: ResultTy, Src: SrcTy, CCH, CostKind: Ctx.CostKind, |
| 1260 | I: dyn_cast_if_present<Instruction>(Val: getUnderlyingValue())); |
| 1261 | } |
| 1262 | case Instruction::Select: { |
| 1263 | SelectInst *SI = cast_or_null<SelectInst>(Val: getUnderlyingValue()); |
| 1264 | bool IsScalarCond = getOperand(N: 0)->isDefinedOutsideLoopRegions(); |
| 1265 | Type *ScalarTy = this->getScalarType(); |
| 1266 | |
| 1267 | VPValue *Op0, *Op1; |
| 1268 | bool IsLogicalAnd = |
| 1269 | match(V: this, P: m_c_LogicalAnd(Op0: m_VPValue(V&: Op0), Op1: m_VPValue(V&: Op1))); |
| 1270 | bool IsLogicalOr = |
| 1271 | match(V: this, P: m_c_LogicalOr(Op0: m_VPValue(V&: Op0), Op1: m_VPValue(V&: Op1))); |
| 1272 | // Also match the inverted forms: |
| 1273 | // select x, false, y --> !x & y (still AND) |
| 1274 | // select x, y, true --> !x | y (still OR) |
| 1275 | IsLogicalAnd |= |
| 1276 | match(V: this, P: m_Select(Op0: m_VPValue(V&: Op0), Op1: m_False(), Op2: m_VPValue(V&: Op1))); |
| 1277 | IsLogicalOr |= |
| 1278 | match(V: this, P: m_Select(Op0: m_VPValue(V&: Op0), Op1: m_VPValue(V&: Op1), Op2: m_True())); |
| 1279 | |
| 1280 | if (!IsScalarCond && ScalarTy->getScalarSizeInBits() == 1 && |
| 1281 | (IsLogicalAnd || IsLogicalOr)) { |
| 1282 | // select x, y, false --> x & y |
| 1283 | // select x, true, y --> x | y |
| 1284 | const auto [Op1VK, Op1VP] = Ctx.getOperandInfo(V: Op0); |
| 1285 | const auto [Op2VK, Op2VP] = Ctx.getOperandInfo(V: Op1); |
| 1286 | |
| 1287 | SmallVector<const Value *, 2> Operands; |
| 1288 | if (SI && all_of(Range: operands(), |
| 1289 | P: [](VPValue *Op) { return Op->getUnderlyingValue(); })) |
| 1290 | append_range(C&: Operands, R: SI->operands()); |
| 1291 | return Ctx.TTI.getArithmeticInstrCost( |
| 1292 | Opcode: IsLogicalOr ? Instruction::Or : Instruction::And, Ty: ResultTy, |
| 1293 | CostKind: Ctx.CostKind, Opd1Info: {.Kind: Op1VK, .Properties: Op1VP}, Opd2Info: {.Kind: Op2VK, .Properties: Op2VP}, Args: Operands, CxtI: SI); |
| 1294 | } |
| 1295 | |
| 1296 | Type *CondTy = getOperand(N: 0)->getScalarType(); |
| 1297 | if (!IsScalarCond && VF.isVector()) |
| 1298 | CondTy = VectorType::get(ElementType: CondTy, EC: VF); |
| 1299 | |
| 1300 | llvm::CmpPredicate Pred; |
| 1301 | if (!match(V: getOperand(N: 0), P: m_Cmp(Pred, Op0: m_VPValue(), Op1: m_VPValue()))) |
| 1302 | if (auto *CondIRV = dyn_cast<VPIRValue>(Val: getOperand(N: 0))) |
| 1303 | if (auto *Cmp = dyn_cast<CmpInst>(Val: CondIRV->getValue())) |
| 1304 | Pred = Cmp->getPredicate(); |
| 1305 | Type *VectorTy = toVectorTy(Scalar: this->getScalarType(), EC: VF); |
| 1306 | return Ctx.TTI.getCmpSelInstrCost( |
| 1307 | Opcode: Instruction::Select, ValTy: VectorTy, CondTy, VecPred: Pred, CostKind: Ctx.CostKind, |
| 1308 | Op1Info: {.Kind: TTI::OK_AnyValue, .Properties: TTI::OP_None}, Op2Info: {.Kind: TTI::OK_AnyValue, .Properties: TTI::OP_None}, I: SI); |
| 1309 | } |
| 1310 | } |
| 1311 | llvm_unreachable("called for unsupported opcode" ); |
| 1312 | } |
| 1313 | |
| 1314 | InstructionCost VPInstruction::computeCost(ElementCount VF, |
| 1315 | VPCostContext &Ctx) const { |
| 1316 | if (Instruction::isBinaryOp(Opcode: getOpcode())) { |
| 1317 | if (!getUnderlyingValue() && getOpcode() != Instruction::FMul) { |
| 1318 | // TODO: Compute cost for VPInstructions without underlying values once |
| 1319 | // the legacy cost model has been retired. |
| 1320 | return 0; |
| 1321 | } |
| 1322 | |
| 1323 | assert(!doesGeneratePerAllLanes() && |
| 1324 | "Should only generate a vector value or single scalar, not scalars " |
| 1325 | "for all lanes." ); |
| 1326 | return getCostForRecipeWithOpcode( |
| 1327 | Opcode: getOpcode(), |
| 1328 | VF: vputils::onlyFirstLaneUsed(Def: this) ? ElementCount::getFixed(MinVal: 1) : VF, Ctx); |
| 1329 | } |
| 1330 | |
| 1331 | switch (getOpcode()) { |
| 1332 | case Instruction::Select: { |
| 1333 | llvm::CmpPredicate Pred = CmpInst::BAD_ICMP_PREDICATE; |
| 1334 | match(V: getOperand(N: 0), P: m_Cmp(Pred, Op0: m_VPValue(), Op1: m_VPValue())); |
| 1335 | auto *CondTy = getOperand(N: 0)->getScalarType(); |
| 1336 | auto *VecTy = getOperand(N: 1)->getScalarType(); |
| 1337 | if (!vputils::onlyFirstLaneUsed(Def: this)) { |
| 1338 | CondTy = toVectorTy(Scalar: CondTy, EC: VF); |
| 1339 | VecTy = toVectorTy(Scalar: VecTy, EC: VF); |
| 1340 | } |
| 1341 | return Ctx.TTI.getCmpSelInstrCost(Opcode: Instruction::Select, ValTy: VecTy, CondTy, VecPred: Pred, |
| 1342 | CostKind: Ctx.CostKind); |
| 1343 | } |
| 1344 | case Instruction::ExtractElement: |
| 1345 | case VPInstruction::ExtractLane: { |
| 1346 | if (VF.isScalar()) { |
| 1347 | // ExtractLane with VF=1 takes care of handling extracting across multiple |
| 1348 | // parts. |
| 1349 | return 0; |
| 1350 | } |
| 1351 | |
| 1352 | // Add on the cost of extracting the element. |
| 1353 | auto *VecTy = toVectorTy(Scalar: getOperand(N: 0)->getScalarType(), EC: VF); |
| 1354 | return Ctx.TTI.getVectorInstrCost(Opcode: Instruction::ExtractElement, Val: VecTy, |
| 1355 | CostKind: Ctx.CostKind); |
| 1356 | } |
| 1357 | case VPInstruction::AnyOf: { |
| 1358 | auto *VecTy = toVectorTy(Scalar: this->getScalarType(), EC: VF); |
| 1359 | return Ctx.TTI.getArithmeticReductionCost( |
| 1360 | Opcode: Instruction::Or, Ty: cast<VectorType>(Val: VecTy), FMF: std::nullopt, CostKind: Ctx.CostKind); |
| 1361 | } |
| 1362 | case VPInstruction::FirstActiveLane: { |
| 1363 | Type *Ty = this->getScalarType(); |
| 1364 | Type *ScalarTy = getOperand(N: 0)->getScalarType(); |
| 1365 | if (VF.isScalar()) |
| 1366 | return Ctx.TTI.getCmpSelInstrCost(Opcode: Instruction::ICmp, ValTy: ScalarTy, |
| 1367 | CondTy: CmpInst::makeCmpResultType(opnd_type: ScalarTy), |
| 1368 | VecPred: CmpInst::ICMP_EQ, CostKind: Ctx.CostKind); |
| 1369 | // Calculate the cost of determining the lane index. |
| 1370 | auto *PredTy = toVectorTy(Scalar: ScalarTy, EC: VF); |
| 1371 | IntrinsicCostAttributes Attrs(Intrinsic::experimental_cttz_elts, Ty, |
| 1372 | {PredTy, Type::getInt1Ty(C&: Ctx.LLVMCtx)}); |
| 1373 | return Ctx.TTI.getIntrinsicInstrCost(ICA: Attrs, CostKind: Ctx.CostKind); |
| 1374 | } |
| 1375 | case VPInstruction::LastActiveLane: { |
| 1376 | Type *Ty = this->getScalarType(); |
| 1377 | Type *ScalarTy = getOperand(N: 0)->getScalarType(); |
| 1378 | if (VF.isScalar()) |
| 1379 | return Ctx.TTI.getCmpSelInstrCost(Opcode: Instruction::ICmp, ValTy: ScalarTy, |
| 1380 | CondTy: CmpInst::makeCmpResultType(opnd_type: ScalarTy), |
| 1381 | VecPred: CmpInst::ICMP_EQ, CostKind: Ctx.CostKind); |
| 1382 | // Calculate the cost of determining the lane index: NOT + cttz_elts + SUB. |
| 1383 | auto *PredTy = toVectorTy(Scalar: ScalarTy, EC: VF); |
| 1384 | IntrinsicCostAttributes Attrs(Intrinsic::experimental_cttz_elts, Ty, |
| 1385 | {PredTy, Type::getInt1Ty(C&: Ctx.LLVMCtx)}); |
| 1386 | InstructionCost Cost = Ctx.TTI.getIntrinsicInstrCost(ICA: Attrs, CostKind: Ctx.CostKind); |
| 1387 | // Add cost of NOT operation on the predicate. |
| 1388 | Cost += Ctx.TTI.getArithmeticInstrCost( |
| 1389 | Opcode: Instruction::Xor, Ty: PredTy, CostKind: Ctx.CostKind, |
| 1390 | Opd1Info: {.Kind: TargetTransformInfo::OK_AnyValue, .Properties: TargetTransformInfo::OP_None}, |
| 1391 | Opd2Info: {.Kind: TargetTransformInfo::OK_UniformConstantValue, |
| 1392 | .Properties: TargetTransformInfo::OP_None}); |
| 1393 | // Add cost of SUB operation on the index. |
| 1394 | Cost += Ctx.TTI.getArithmeticInstrCost(Opcode: Instruction::Sub, Ty, CostKind: Ctx.CostKind); |
| 1395 | return Cost; |
| 1396 | } |
| 1397 | case VPInstruction::ExtractLastActive: { |
| 1398 | Type *ScalarTy = this->getScalarType(); |
| 1399 | Type *VecTy = toVectorTy(Scalar: ScalarTy, EC: VF); |
| 1400 | Type *MaskTy = toVectorTy(Scalar: Type::getInt1Ty(C&: Ctx.LLVMCtx), EC: VF); |
| 1401 | IntrinsicCostAttributes ICA( |
| 1402 | Intrinsic::experimental_vector_extract_last_active, ScalarTy, |
| 1403 | {VecTy, MaskTy, ScalarTy}); |
| 1404 | return Ctx.TTI.getIntrinsicInstrCost(ICA, CostKind: Ctx.CostKind); |
| 1405 | } |
| 1406 | case VPInstruction::FirstOrderRecurrenceSplice: { |
| 1407 | assert(VF.isVector() && "Scalar FirstOrderRecurrenceSplice?" ); |
| 1408 | Type *VectorTy = toVectorTy(Scalar: this->getScalarType(), EC: VF); |
| 1409 | return Ctx.TTI.getShuffleCost( |
| 1410 | Kind: TargetTransformInfo::SK_Splice, DstTy: cast<VectorType>(Val: VectorTy), |
| 1411 | SrcTy: cast<VectorType>(Val: VectorTy), Mask: {}, CostKind: Ctx.CostKind, Index: -1); |
| 1412 | } |
| 1413 | case VPInstruction::ActiveLaneMask: { |
| 1414 | Type *ArgTy = getOperand(N: 0)->getScalarType(); |
| 1415 | unsigned Multiplier = cast<VPConstantInt>(Val: getOperand(N: 2))->getZExtValue(); |
| 1416 | Type *RetTy = toVectorTy(Scalar: Type::getInt1Ty(C&: Ctx.LLVMCtx), EC: VF * Multiplier); |
| 1417 | IntrinsicCostAttributes Attrs(Intrinsic::get_active_lane_mask, RetTy, |
| 1418 | {ArgTy, ArgTy}); |
| 1419 | return Ctx.TTI.getIntrinsicInstrCost(ICA: Attrs, CostKind: Ctx.CostKind); |
| 1420 | } |
| 1421 | case VPInstruction::ExplicitVectorLength: { |
| 1422 | Type *Arg0Ty = getOperand(N: 0)->getScalarType(); |
| 1423 | Type *I32Ty = Type::getInt32Ty(C&: Ctx.LLVMCtx); |
| 1424 | Type *I1Ty = Type::getInt1Ty(C&: Ctx.LLVMCtx); |
| 1425 | IntrinsicCostAttributes Attrs(Intrinsic::experimental_get_vector_length, |
| 1426 | I32Ty, {Arg0Ty, I32Ty, I1Ty}); |
| 1427 | return Ctx.TTI.getIntrinsicInstrCost(ICA: Attrs, CostKind: Ctx.CostKind); |
| 1428 | } |
| 1429 | case VPInstruction::Reverse: { |
| 1430 | assert(VF.isVector() && "Reverse operation must be vector type" ); |
| 1431 | Type *EltTy = this->getScalarType(); |
| 1432 | // Skip the reverse operation cost for the mask. |
| 1433 | // FIXME: Remove this once redundant mask reverse operations can be |
| 1434 | // eliminated by VPlanTransforms::cse before cost computation. |
| 1435 | if (EltTy->isIntegerTy(BitWidth: 1)) |
| 1436 | return 0; |
| 1437 | auto *VectorTy = cast<VectorType>(Val: toVectorTy(Scalar: EltTy, EC: VF)); |
| 1438 | return Ctx.TTI.getShuffleCost(Kind: TargetTransformInfo::SK_Reverse, DstTy: VectorTy, |
| 1439 | SrcTy: VectorTy, /*Mask=*/{}, CostKind: Ctx.CostKind, |
| 1440 | /*Index=*/0); |
| 1441 | } |
| 1442 | case VPInstruction::ExtractLastLane: { |
| 1443 | // Add on the cost of extracting the element. |
| 1444 | auto *VecTy = toVectorTy(Scalar: getOperand(N: 0)->getScalarType(), EC: VF); |
| 1445 | return Ctx.TTI.getIndexedVectorInstrCostFromEnd(Opcode: Instruction::ExtractElement, |
| 1446 | Val: VecTy, CostKind: Ctx.CostKind, Index: 0); |
| 1447 | } |
| 1448 | case VPInstruction::Not: { |
| 1449 | Type *ValTy = this->getScalarType(); |
| 1450 | // InstCombine will fold `xor` to the conditional branch. |
| 1451 | if (auto *U = const_cast<VPUser *>(getSingleUser())) |
| 1452 | if (match(U, P: m_BranchOnCond(Op0: m_VPValue()))) |
| 1453 | return 0; |
| 1454 | if (!vputils::onlyFirstLaneUsed(Def: this)) |
| 1455 | ValTy = toVectorTy(Scalar: ValTy, EC: VF); |
| 1456 | return Ctx.TTI.getArithmeticInstrCost(Opcode: Instruction::Xor, Ty: ValTy, |
| 1457 | CostKind: Ctx.CostKind); |
| 1458 | } |
| 1459 | case VPInstruction::BranchOnCount: { |
| 1460 | // If TC <= VF then this is just a branch. |
| 1461 | // FIXME: Removing the branch happens in simplifyBranchConditionForVFAndUF |
| 1462 | // where it checks TC <= VF * UF, but we don't know UF yet. This means in |
| 1463 | // some cases we get a cost that's too high due to counting a cmp that |
| 1464 | // later gets removed. |
| 1465 | // FIXME: The compare could also be removed if TC = M * vscale, |
| 1466 | // VF = N * vscale, and M <= N. Detecting that would require having the |
| 1467 | // trip count as a SCEV though. |
| 1468 | Value *TC = getParent()->getPlan()->getTripCount()->getUnderlyingValue(); |
| 1469 | ConstantInt *TCConst = dyn_cast_if_present<ConstantInt>(Val: TC); |
| 1470 | if (TCConst && TCConst->getValue().ule(RHS: VF.getKnownMinValue())) |
| 1471 | return 0; |
| 1472 | // Otherwise BranchOnCount generates ICmpEQ followed by a branch. |
| 1473 | Type *ValTy = getOperand(N: 0)->getScalarType(); |
| 1474 | return Ctx.TTI.getCmpSelInstrCost(Opcode: Instruction::ICmp, ValTy, |
| 1475 | CondTy: CmpInst::makeCmpResultType(opnd_type: ValTy), |
| 1476 | VecPred: CmpInst::ICMP_EQ, CostKind: Ctx.CostKind); |
| 1477 | } |
| 1478 | case Instruction::FCmp: |
| 1479 | case Instruction::ICmp: |
| 1480 | return getCostForRecipeWithOpcode( |
| 1481 | Opcode: getOpcode(), |
| 1482 | VF: vputils::onlyFirstLaneUsed(Def: this) ? ElementCount::getFixed(MinVal: 1) : VF, Ctx); |
| 1483 | case VPInstruction::ExtractPenultimateElement: |
| 1484 | if (VF == ElementCount::getScalable(MinVal: 1)) |
| 1485 | return InstructionCost::getInvalid(); |
| 1486 | [[fallthrough]]; |
| 1487 | default: |
| 1488 | // TODO: Compute cost other VPInstructions once the legacy cost model has |
| 1489 | // been retired. |
| 1490 | assert(!getUnderlyingValue() && |
| 1491 | "unexpected VPInstruction witht underlying value" ); |
| 1492 | return 0; |
| 1493 | } |
| 1494 | } |
| 1495 | |
| 1496 | bool VPInstruction::isVectorToScalar() const { |
| 1497 | return getOpcode() == VPInstruction::ExtractLastLane || |
| 1498 | getOpcode() == VPInstruction::ExtractPenultimateElement || |
| 1499 | getOpcode() == Instruction::ExtractElement || |
| 1500 | getOpcode() == VPInstruction::ExtractLane || |
| 1501 | getOpcode() == VPInstruction::FirstActiveLane || |
| 1502 | getOpcode() == VPInstruction::LastActiveLane || |
| 1503 | getOpcode() == VPInstruction::ExtractLastActive || |
| 1504 | getOpcode() == VPInstruction::ComputeReductionResult || |
| 1505 | getOpcode() == VPInstruction::AnyOf || |
| 1506 | getOpcode() == VPInstruction::NumActiveLanes; |
| 1507 | } |
| 1508 | |
| 1509 | bool VPInstruction::isSingleScalar() const { |
| 1510 | switch (getOpcode()) { |
| 1511 | case Instruction::Load: |
| 1512 | case Instruction::PHI: |
| 1513 | case VPInstruction::ExplicitVectorLength: |
| 1514 | case VPInstruction::ResumeForEpilogue: |
| 1515 | case VPInstruction::VScale: |
| 1516 | return true; |
| 1517 | default: |
| 1518 | return Instruction::isCast(Opcode: getOpcode()); |
| 1519 | } |
| 1520 | } |
| 1521 | |
| 1522 | void VPInstruction::addOperand(VPValue *Op) { |
| 1523 | #ifndef NDEBUG |
| 1524 | Type *Ty = Op->getScalarType(); |
| 1525 | switch (getOpcode()) { |
| 1526 | case VPInstruction::AnyOf: |
| 1527 | case VPInstruction::FirstActiveLane: |
| 1528 | case VPInstruction::LastActiveLane: |
| 1529 | assert(Ty == getOperand(0)->getScalarType() && |
| 1530 | "types of operand 0 and new operand must match" ); |
| 1531 | break; |
| 1532 | case VPInstruction::ComputeReductionResult: |
| 1533 | case VPInstruction::BuildVector: |
| 1534 | case VPInstruction::BuildStructVector: |
| 1535 | assert(Ty == getOperand(0)->getScalarType() && |
| 1536 | "appended operand must match operand 0's scalar type" ); |
| 1537 | break; |
| 1538 | case VPInstruction::ExtractLane: |
| 1539 | assert(Ty == getOperand(1)->getScalarType() && |
| 1540 | "appended operand must match operand 1's scalar type" ); |
| 1541 | break; |
| 1542 | case VPInstruction::ExtractLastActive: { |
| 1543 | // The recipe is constructed with 3 operands (result, data, mask). Extra |
| 1544 | // operands beyond that are appended in (data, mask) pairs. |
| 1545 | constexpr unsigned NumInitialOperands = 3; |
| 1546 | assert(getNumOperands() >= NumInitialOperands && |
| 1547 | "ExtractLastActive must have at least the initial 3 operands" ); |
| 1548 | bool IsMaskSlot = ((getNumOperands() - NumInitialOperands) & 1u) == 1u; |
| 1549 | assert((IsMaskSlot ? Ty->isIntegerTy(1) |
| 1550 | : Ty == getOperand(1)->getScalarType()) && |
| 1551 | "ExtractLastActive expects alternating data/mask operands " |
| 1552 | "matching operand 1's type and i1, respectively" ); |
| 1553 | break; |
| 1554 | } |
| 1555 | default: |
| 1556 | llvm_unreachable("opcode does not support growing the operand list " |
| 1557 | "outside of construction" ); |
| 1558 | } |
| 1559 | #endif |
| 1560 | VPUser::addOperand(Operand: Op); |
| 1561 | } |
| 1562 | |
| 1563 | void VPInstruction::execute(VPTransformState &State) { |
| 1564 | assert(!isMasked() && "cannot execute masked VPInstruction" ); |
| 1565 | IRBuilderBase::FastMathFlagGuard FMFGuard(State.Builder); |
| 1566 | assert(flagsValidForOpcode(getOpcode()) && |
| 1567 | "Set flags not supported for the provided opcode" ); |
| 1568 | assert(hasRequiredFlagsForOpcode(getOpcode()) && |
| 1569 | "Opcode requires specific flags to be set" ); |
| 1570 | State.Builder.setFastMathFlags(getFastMathFlagsOrNone()); |
| 1571 | Value *GeneratedValue = generate(State); |
| 1572 | if (!hasResult()) |
| 1573 | return; |
| 1574 | assert(GeneratedValue && "generate must produce a value" ); |
| 1575 | bool GeneratesPerFirstLaneOnly = canGenerateScalarForFirstLane() && |
| 1576 | (vputils::onlyFirstLaneUsed(Def: this) || |
| 1577 | isVectorToScalar() || isSingleScalar()); |
| 1578 | assert((((GeneratedValue->getType()->isVectorTy() || |
| 1579 | GeneratedValue->getType()->isStructTy()) == |
| 1580 | !GeneratesPerFirstLaneOnly) || |
| 1581 | State.VF.isScalar()) && |
| 1582 | "scalar value but not only first lane defined" ); |
| 1583 | State.set(Def: this, V: GeneratedValue, |
| 1584 | /*IsScalar*/ GeneratesPerFirstLaneOnly); |
| 1585 | if (getOpcode() == VPInstruction::ResumeForEpilogue || |
| 1586 | getOpcode() == Instruction::Freeze) { |
| 1587 | // FIXME: This is a workaround to enable reliable updates of the scalar loop |
| 1588 | // resume phis, and to let epilogue vectorization recover the frozen |
| 1589 | // reduction start from the main plan. Must be removed once epilogue |
| 1590 | // vectorization explicitly connects VPlans. |
| 1591 | setUnderlyingValue(GeneratedValue); |
| 1592 | } |
| 1593 | } |
| 1594 | |
| 1595 | bool VPInstruction::opcodeMayReadOrWriteFromMemory() const { |
| 1596 | if (Instruction::isBinaryOp(Opcode: getOpcode()) || |
| 1597 | Instruction::isUnaryOp(Opcode: getOpcode()) || Instruction::isCast(Opcode: getOpcode())) |
| 1598 | return false; |
| 1599 | switch (getOpcode()) { |
| 1600 | case Instruction::ExtractValue: |
| 1601 | case Instruction::InsertValue: |
| 1602 | case Instruction::GetElementPtr: |
| 1603 | case Instruction::ExtractElement: |
| 1604 | case Instruction::InsertElement: |
| 1605 | case Instruction::Freeze: |
| 1606 | case Instruction::FCmp: |
| 1607 | case Instruction::ICmp: |
| 1608 | case Instruction::Select: |
| 1609 | case Instruction::PHI: |
| 1610 | case VPInstruction::AnyOf: |
| 1611 | case VPInstruction::BranchOnCond: |
| 1612 | case VPInstruction::BranchOnTwoConds: |
| 1613 | case VPInstruction::BranchOnCount: |
| 1614 | case VPInstruction::Broadcast: |
| 1615 | case VPInstruction::BuildStructVector: |
| 1616 | case VPInstruction::BuildVector: |
| 1617 | case VPInstruction::CalculateTripCountMinusVF: |
| 1618 | case VPInstruction::CanonicalIVIncrementForPart: |
| 1619 | case VPInstruction::ExtractLane: |
| 1620 | case VPInstruction::ExtractLastLane: |
| 1621 | case VPInstruction::ExtractLastPart: |
| 1622 | case VPInstruction::ExtractPenultimateElement: |
| 1623 | case VPInstruction::ActiveLaneMask: |
| 1624 | case VPInstruction::IncomingAliasMask: |
| 1625 | case VPInstruction::ExitingIVValue: |
| 1626 | case VPInstruction::ExplicitVectorLength: |
| 1627 | case VPInstruction::FirstActiveLane: |
| 1628 | case VPInstruction::LastActiveLane: |
| 1629 | case VPInstruction::ExtractLastActive: |
| 1630 | case VPInstruction::FirstOrderRecurrenceSplice: |
| 1631 | case VPInstruction::LogicalAnd: |
| 1632 | case VPInstruction::LogicalOr: |
| 1633 | case VPInstruction::MaskedCond: |
| 1634 | case VPInstruction::Not: |
| 1635 | case VPInstruction::PtrAdd: |
| 1636 | case VPInstruction::WideIVStep: |
| 1637 | case VPInstruction::WidePtrAdd: |
| 1638 | case VPInstruction::StepVector: |
| 1639 | case VPInstruction::ReductionStartVector: |
| 1640 | case VPInstruction::Reverse: |
| 1641 | case VPInstruction::VScale: |
| 1642 | case VPInstruction::Unpack: |
| 1643 | return false; |
| 1644 | case Instruction::Call: |
| 1645 | return !getCalledFunction(Operands: ArrayRef<VPValue *>(op_begin(), op_end())) |
| 1646 | ->doesNotAccessMemory(); |
| 1647 | default: |
| 1648 | return true; |
| 1649 | } |
| 1650 | } |
| 1651 | |
| 1652 | bool VPInstruction::usesFirstLaneOnly(const VPValue *Op) const { |
| 1653 | assert(is_contained(operands(), Op) && "Op must be an operand of the recipe" ); |
| 1654 | if (Instruction::isBinaryOp(Opcode: getOpcode()) || Instruction::isCast(Opcode: getOpcode())) |
| 1655 | return vputils::onlyFirstLaneUsed(Def: this); |
| 1656 | |
| 1657 | switch (getOpcode()) { |
| 1658 | default: |
| 1659 | return false; |
| 1660 | case Instruction::ExtractElement: |
| 1661 | return Op == getOperand(N: 1); |
| 1662 | case Instruction::InsertElement: |
| 1663 | return Op == getOperand(N: 1) || Op == getOperand(N: 2); |
| 1664 | case Instruction::PHI: |
| 1665 | return true; |
| 1666 | case Instruction::FCmp: |
| 1667 | case Instruction::ICmp: |
| 1668 | case Instruction::Select: |
| 1669 | case Instruction::Or: |
| 1670 | case Instruction::Freeze: |
| 1671 | case VPInstruction::Not: |
| 1672 | // TODO: Cover additional opcodes. |
| 1673 | return vputils::onlyFirstLaneUsed(Def: this); |
| 1674 | case Instruction::Load: |
| 1675 | case VPInstruction::ActiveLaneMask: |
| 1676 | case VPInstruction::ExplicitVectorLength: |
| 1677 | case VPInstruction::CalculateTripCountMinusVF: |
| 1678 | case VPInstruction::CanonicalIVIncrementForPart: |
| 1679 | case VPInstruction::BranchOnCount: |
| 1680 | case VPInstruction::BranchOnCond: |
| 1681 | case VPInstruction::BranchOnTwoConds: |
| 1682 | case VPInstruction::Broadcast: |
| 1683 | case VPInstruction::ReductionStartVector: |
| 1684 | case VPInstruction::ResumeForEpilogue: |
| 1685 | return true; |
| 1686 | case VPInstruction::BuildStructVector: |
| 1687 | case VPInstruction::BuildVector: |
| 1688 | // Before replicating by VF, Build(Struct)Vector uses all lanes of the |
| 1689 | // operand, after replicating its operands only the first lane is used. |
| 1690 | // Before replicating, it will have only a single operand. |
| 1691 | return getNumOperands() > 1; |
| 1692 | case VPInstruction::PtrAdd: |
| 1693 | return Op == getOperand(N: 0) || vputils::onlyFirstLaneUsed(Def: this); |
| 1694 | case VPInstruction::WidePtrAdd: |
| 1695 | // WidePtrAdd supports scalar and vector base addresses. |
| 1696 | return false; |
| 1697 | case VPInstruction::ExitingIVValue: |
| 1698 | case VPInstruction::ExtractLane: |
| 1699 | return Op == getOperand(N: 0); |
| 1700 | }; |
| 1701 | llvm_unreachable("switch should return" ); |
| 1702 | } |
| 1703 | |
| 1704 | bool VPInstruction::usesFirstPartOnly(const VPValue *Op) const { |
| 1705 | assert(is_contained(operands(), Op) && "Op must be an operand of the recipe" ); |
| 1706 | if (Instruction::isBinaryOp(Opcode: getOpcode())) |
| 1707 | return vputils::onlyFirstPartUsed(Def: this); |
| 1708 | |
| 1709 | switch (getOpcode()) { |
| 1710 | default: |
| 1711 | return false; |
| 1712 | case Instruction::FCmp: |
| 1713 | case Instruction::ICmp: |
| 1714 | case Instruction::Select: |
| 1715 | return vputils::onlyFirstPartUsed(Def: this); |
| 1716 | case VPInstruction::BranchOnCount: |
| 1717 | case VPInstruction::BranchOnCond: |
| 1718 | case VPInstruction::BranchOnTwoConds: |
| 1719 | case VPInstruction::CanonicalIVIncrementForPart: |
| 1720 | return true; |
| 1721 | }; |
| 1722 | llvm_unreachable("switch should return" ); |
| 1723 | } |
| 1724 | |
| 1725 | #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) |
| 1726 | void VPInstruction::dump() const { |
| 1727 | VPSlotTracker SlotTracker(getParent()->getPlan()); |
| 1728 | printRecipe(dbgs(), "" , SlotTracker); |
| 1729 | } |
| 1730 | |
| 1731 | void VPInstruction::printRecipe(raw_ostream &O, const Twine &Indent, |
| 1732 | VPSlotTracker &SlotTracker) const { |
| 1733 | O << Indent << "EMIT" << (isSingleScalar() ? "-SCALAR" : "" ) << " " ; |
| 1734 | |
| 1735 | if (hasResult()) { |
| 1736 | printAsOperand(O, SlotTracker); |
| 1737 | O << " = " ; |
| 1738 | } |
| 1739 | |
| 1740 | switch (getOpcode()) { |
| 1741 | case VPInstruction::Not: |
| 1742 | O << "not" ; |
| 1743 | break; |
| 1744 | case VPInstruction::ActiveLaneMask: |
| 1745 | O << "active lane mask" ; |
| 1746 | break; |
| 1747 | case VPInstruction::IncomingAliasMask: |
| 1748 | O << "incoming-alias-mask" ; |
| 1749 | break; |
| 1750 | case VPInstruction::ExplicitVectorLength: |
| 1751 | O << "EXPLICIT-VECTOR-LENGTH" ; |
| 1752 | break; |
| 1753 | case VPInstruction::FirstOrderRecurrenceSplice: |
| 1754 | O << "first-order splice" ; |
| 1755 | break; |
| 1756 | case VPInstruction::BranchOnCond: |
| 1757 | O << "branch-on-cond" ; |
| 1758 | break; |
| 1759 | case VPInstruction::BranchOnTwoConds: |
| 1760 | O << "branch-on-two-conds" ; |
| 1761 | break; |
| 1762 | case VPInstruction::CalculateTripCountMinusVF: |
| 1763 | O << "TC > VF ? TC - VF : 0" ; |
| 1764 | break; |
| 1765 | case VPInstruction::CanonicalIVIncrementForPart: |
| 1766 | O << "VF * Part +" ; |
| 1767 | break; |
| 1768 | case VPInstruction::BranchOnCount: |
| 1769 | O << "branch-on-count" ; |
| 1770 | break; |
| 1771 | case VPInstruction::Broadcast: |
| 1772 | O << "broadcast" ; |
| 1773 | break; |
| 1774 | case VPInstruction::BuildStructVector: |
| 1775 | O << "buildstructvector" ; |
| 1776 | break; |
| 1777 | case VPInstruction::BuildVector: |
| 1778 | O << "buildvector" ; |
| 1779 | break; |
| 1780 | case VPInstruction::ExitingIVValue: |
| 1781 | O << "exiting-iv-value" ; |
| 1782 | break; |
| 1783 | case VPInstruction::MaskedCond: |
| 1784 | O << "masked-cond" ; |
| 1785 | break; |
| 1786 | case VPInstruction::ExtractLane: |
| 1787 | O << "extract-lane" ; |
| 1788 | break; |
| 1789 | case VPInstruction::ExtractLastLane: |
| 1790 | O << "extract-last-lane" ; |
| 1791 | break; |
| 1792 | case VPInstruction::ExtractLastPart: |
| 1793 | O << "extract-last-part" ; |
| 1794 | break; |
| 1795 | case VPInstruction::ExtractPenultimateElement: |
| 1796 | O << "extract-penultimate-element" ; |
| 1797 | break; |
| 1798 | case VPInstruction::ComputeReductionResult: |
| 1799 | O << "compute-reduction-result" ; |
| 1800 | break; |
| 1801 | case VPInstruction::LogicalAnd: |
| 1802 | O << "logical-and" ; |
| 1803 | break; |
| 1804 | case VPInstruction::LogicalOr: |
| 1805 | O << "logical-or" ; |
| 1806 | break; |
| 1807 | case VPInstruction::PtrAdd: |
| 1808 | O << "ptradd" ; |
| 1809 | break; |
| 1810 | case VPInstruction::WidePtrAdd: |
| 1811 | O << "wide-ptradd" ; |
| 1812 | break; |
| 1813 | case VPInstruction::AnyOf: |
| 1814 | O << "any-of" ; |
| 1815 | break; |
| 1816 | case VPInstruction::FirstActiveLane: |
| 1817 | O << "first-active-lane" ; |
| 1818 | break; |
| 1819 | case VPInstruction::LastActiveLane: |
| 1820 | O << "last-active-lane" ; |
| 1821 | break; |
| 1822 | case VPInstruction::ReductionStartVector: |
| 1823 | O << "reduction-start-vector" ; |
| 1824 | break; |
| 1825 | case VPInstruction::ResumeForEpilogue: |
| 1826 | O << "resume-for-epilogue" ; |
| 1827 | break; |
| 1828 | case VPInstruction::Reverse: |
| 1829 | O << "reverse" ; |
| 1830 | break; |
| 1831 | case VPInstruction::Unpack: |
| 1832 | O << "unpack" ; |
| 1833 | break; |
| 1834 | case VPInstruction::ExtractLastActive: |
| 1835 | O << "extract-last-active" ; |
| 1836 | break; |
| 1837 | case VPInstruction::NumActiveLanes: |
| 1838 | O << "num-active-lanes" ; |
| 1839 | break; |
| 1840 | default: |
| 1841 | O << Instruction::getOpcodeName(getOpcode()); |
| 1842 | } |
| 1843 | |
| 1844 | printFlags(O); |
| 1845 | printOperands(O, SlotTracker); |
| 1846 | } |
| 1847 | #endif |
| 1848 | |
| 1849 | void VPInstructionWithType::execute(VPTransformState &State) { |
| 1850 | Type *ResultTy = getResultType(); |
| 1851 | if (Instruction::isCast(Opcode: getOpcode())) { |
| 1852 | Value *Op = State.get(Def: getOperand(N: 0), Lane: VPLane(0)); |
| 1853 | Value *Cast = State.Builder.CreateCast(Op: Instruction::CastOps(getOpcode()), |
| 1854 | V: Op, DestTy: ResultTy); |
| 1855 | if (auto *CastOp = dyn_cast<Instruction>(Val: Cast)) { |
| 1856 | applyFlags(I&: *CastOp); |
| 1857 | applyMetadata(I&: *CastOp); |
| 1858 | } |
| 1859 | State.set(Def: this, V: Cast, Lane: VPLane(0)); |
| 1860 | return; |
| 1861 | } |
| 1862 | switch (getOpcode()) { |
| 1863 | case VPInstruction::StepVector: { |
| 1864 | Value *StepVector = |
| 1865 | State.Builder.CreateStepVector(DstType: VectorType::get(ElementType: ResultTy, EC: State.VF)); |
| 1866 | State.set(Def: this, V: StepVector); |
| 1867 | break; |
| 1868 | } |
| 1869 | case VPInstruction::VScale: { |
| 1870 | Value *VScale = State.Builder.CreateVScale(Ty: ResultTy); |
| 1871 | State.set(Def: this, V: VScale, IsScalar: true); |
| 1872 | break; |
| 1873 | } |
| 1874 | |
| 1875 | default: |
| 1876 | llvm_unreachable("opcode not implemented yet" ); |
| 1877 | } |
| 1878 | } |
| 1879 | |
| 1880 | InstructionCost VPInstructionWithType::computeCost(ElementCount VF, |
| 1881 | VPCostContext &Ctx) const { |
| 1882 | // NOTE: At the moment it seems only possible to expose this path for |
| 1883 | // the trunc, zext and sext opcodes. However, isScalarCast also covers |
| 1884 | // int<>fp conversions, bitcasts, ptr<>int conversions, etc. |
| 1885 | if (Instruction::isCast(Opcode: getOpcode())) |
| 1886 | return getCostForRecipeWithOpcode(Opcode: getOpcode(), VF: ElementCount::getFixed(MinVal: 1), |
| 1887 | Ctx); |
| 1888 | |
| 1889 | switch (getOpcode()) { |
| 1890 | case VPInstruction::VScale: { |
| 1891 | Type *Ty = this->getScalarType(); |
| 1892 | ArrayRef<Type *> Tys; |
| 1893 | IntrinsicCostAttributes Attrs(Intrinsic::vscale, Ty, Tys); |
| 1894 | return Ctx.TTI.getIntrinsicInstrCost(ICA: Attrs, CostKind: Ctx.CostKind); |
| 1895 | } |
| 1896 | case VPInstruction::StepVector: |
| 1897 | // TODO: This isn't quite right since even if the step-vector is hoisted |
| 1898 | // out of the loop it has a non-zero cost in the middle block, etc. |
| 1899 | // Once the stepvector is correctly hoisted out of the vector loop by the |
| 1900 | // licm transform we can add the cost here so that it doesn't incorrectly |
| 1901 | // affect the choice of VF. |
| 1902 | return 0; |
| 1903 | default: |
| 1904 | // Although VPInstructionWithType is also used for |
| 1905 | // VPInstruction::WideIVStep it isn't currently possible to expose cases |
| 1906 | // where the cost is queried. |
| 1907 | llvm_unreachable("Unhandled opcode" ); |
| 1908 | } |
| 1909 | return 0; |
| 1910 | } |
| 1911 | |
| 1912 | #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) |
| 1913 | void VPInstructionWithType::printRecipe(raw_ostream &O, const Twine &Indent, |
| 1914 | VPSlotTracker &SlotTracker) const { |
| 1915 | O << Indent << "EMIT" << (isSingleScalar() ? "-SCALAR" : "" ) << " " ; |
| 1916 | printAsOperand(O, SlotTracker); |
| 1917 | O << " = " ; |
| 1918 | |
| 1919 | Type *ResultTy = getResultType(); |
| 1920 | switch (getOpcode()) { |
| 1921 | case VPInstruction::WideIVStep: |
| 1922 | O << "wide-iv-step " ; |
| 1923 | printOperands(O, SlotTracker); |
| 1924 | break; |
| 1925 | case VPInstruction::StepVector: |
| 1926 | O << "step-vector " << *ResultTy; |
| 1927 | break; |
| 1928 | case VPInstruction::VScale: |
| 1929 | O << "vscale " << *ResultTy; |
| 1930 | break; |
| 1931 | case Instruction::Load: |
| 1932 | O << "load " ; |
| 1933 | printOperands(O, SlotTracker); |
| 1934 | break; |
| 1935 | default: |
| 1936 | assert(Instruction::isCast(getOpcode()) && "unhandled opcode" ); |
| 1937 | O << Instruction::getOpcodeName(getOpcode()); |
| 1938 | printFlags(O); |
| 1939 | printOperands(O, SlotTracker); |
| 1940 | O << " to " << *ResultTy; |
| 1941 | } |
| 1942 | } |
| 1943 | #endif |
| 1944 | |
| 1945 | /// Shared execute logic for VPPhi and VPWidenPHIRecipe. Creates a PHI node, |
| 1946 | /// adds incoming values, and stores the result in State. For header phis, only |
| 1947 | /// the preheader incoming value is added; the backedge is fixed up later by |
| 1948 | /// VPlan::execute(). |
| 1949 | static void executePhiRecipe(VPSingleDefRecipe *R, VPPhiAccessors &Phi, |
| 1950 | VPTransformState &State, bool IsScalar, |
| 1951 | const Twine &Name) { |
| 1952 | unsigned NumIncoming = VPBlockUtils::isHeader(VPB: R->getParent(), VPDT: State.VPDT) |
| 1953 | ? 1 |
| 1954 | : Phi.getNumIncoming(); |
| 1955 | Value *FirstInc = State.get(Def: Phi.getIncomingValue(Idx: 0), IsScalar); |
| 1956 | PHINode *NewPhi = State.Builder.CreatePHI(Ty: FirstInc->getType(), NumReservedValues: 2, Name); |
| 1957 | NewPhi->addIncoming(V: FirstInc, |
| 1958 | BB: State.CFG.VPBB2IRBB.at(Val: Phi.getIncomingBlock(Idx: 0))); |
| 1959 | for (unsigned Idx = 1; Idx != NumIncoming; ++Idx) |
| 1960 | NewPhi->addIncoming(V: State.get(Def: Phi.getIncomingValue(Idx), IsScalar), |
| 1961 | BB: State.CFG.VPBB2IRBB.at(Val: Phi.getIncomingBlock(Idx))); |
| 1962 | State.set(Def: R, V: NewPhi, IsScalar); |
| 1963 | } |
| 1964 | |
| 1965 | void VPPhi::execute(VPTransformState &State) { |
| 1966 | executePhiRecipe(R: this, Phi&: *this, State, /*IsScalar=*/true, Name: getName()); |
| 1967 | } |
| 1968 | |
| 1969 | #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) |
| 1970 | void VPPhi::printRecipe(raw_ostream &O, const Twine &Indent, |
| 1971 | VPSlotTracker &SlotTracker) const { |
| 1972 | O << Indent << "EMIT" << (isSingleScalar() ? "-SCALAR" : "" ) << " " ; |
| 1973 | printAsOperand(O, SlotTracker); |
| 1974 | O << " = phi" ; |
| 1975 | printFlags(O); |
| 1976 | printPhiOperands(O, SlotTracker); |
| 1977 | } |
| 1978 | #endif |
| 1979 | |
| 1980 | VPIRInstruction *VPIRInstruction ::create(Instruction &I) { |
| 1981 | if (auto *Phi = dyn_cast<PHINode>(Val: &I)) |
| 1982 | return new VPIRPhi(*Phi); |
| 1983 | return new VPIRInstruction(I); |
| 1984 | } |
| 1985 | |
| 1986 | void VPIRInstruction::execute(VPTransformState &State) { |
| 1987 | assert(!isa<VPIRPhi>(this) && getNumOperands() == 0 && |
| 1988 | "PHINodes must be handled by VPIRPhi" ); |
| 1989 | // Advance the insert point after the wrapped IR instruction. This allows |
| 1990 | // interleaving VPIRInstructions and other recipes. |
| 1991 | State.Builder.SetInsertPoint(TheBB: I.getParent(), IP: std::next(x: I.getIterator())); |
| 1992 | } |
| 1993 | |
| 1994 | InstructionCost VPIRInstruction::computeCost(ElementCount VF, |
| 1995 | VPCostContext &Ctx) const { |
| 1996 | // The recipe wraps an existing IR instruction on the border of VPlan's scope, |
| 1997 | // hence it does not contribute to the cost-modeling for the VPlan. |
| 1998 | return 0; |
| 1999 | } |
| 2000 | |
| 2001 | #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) |
| 2002 | void VPIRInstruction::printRecipe(raw_ostream &O, const Twine &Indent, |
| 2003 | VPSlotTracker &SlotTracker) const { |
| 2004 | O << Indent << "IR " << I; |
| 2005 | } |
| 2006 | #endif |
| 2007 | |
| 2008 | void VPIRPhi::execute(VPTransformState &State) { |
| 2009 | PHINode *Phi = &getIRPhi(); |
| 2010 | for (const auto &[Idx, Op] : enumerate(First: operands())) { |
| 2011 | VPValue *ExitValue = Op; |
| 2012 | auto Lane = vputils::isSingleScalar(VPV: ExitValue) |
| 2013 | ? VPLane::getFirstLane() |
| 2014 | : VPLane::getLastLaneForVF(VF: State.VF); |
| 2015 | VPBlockBase *Pred = getParent()->getPredecessors()[Idx]; |
| 2016 | auto *PredVPBB = Pred->getExitingBasicBlock(); |
| 2017 | BasicBlock *PredBB = State.CFG.VPBB2IRBB[PredVPBB]; |
| 2018 | // Set insertion point in PredBB in case an extract needs to be generated. |
| 2019 | // TODO: Model extracts explicitly. |
| 2020 | State.Builder.SetInsertPoint(PredBB->getTerminator()); |
| 2021 | Value *V = State.get(Def: ExitValue, Lane: VPLane(Lane)); |
| 2022 | // If there is no existing block for PredBB in the phi, add a new incoming |
| 2023 | // value. Otherwise update the existing incoming value for PredBB. |
| 2024 | if (Phi->getBasicBlockIndex(BB: PredBB) == -1) |
| 2025 | Phi->addIncoming(V, BB: PredBB); |
| 2026 | else |
| 2027 | Phi->setIncomingValueForBlock(BB: PredBB, V); |
| 2028 | } |
| 2029 | |
| 2030 | // Advance the insert point after the wrapped IR instruction. This allows |
| 2031 | // interleaving VPIRInstructions and other recipes. |
| 2032 | State.Builder.SetInsertPoint(TheBB: Phi->getParent(), IP: std::next(x: Phi->getIterator())); |
| 2033 | } |
| 2034 | |
| 2035 | void VPPhiAccessors::removeIncomingValueFor(VPBlockBase *IncomingBlock) const { |
| 2036 | VPRecipeBase *R = const_cast<VPRecipeBase *>(getAsRecipe()); |
| 2037 | assert(R->getNumOperands() == R->getParent()->getNumPredecessors() && |
| 2038 | "Number of phi operands must match number of predecessors" ); |
| 2039 | unsigned Position = R->getParent()->getIndexForPredecessor(Pred: IncomingBlock); |
| 2040 | R->removeOperand(Idx: Position); |
| 2041 | } |
| 2042 | |
| 2043 | VPValue * |
| 2044 | VPPhiAccessors::getIncomingValueForBlock(const VPBasicBlock *VPBB) const { |
| 2045 | VPRecipeBase *R = const_cast<VPRecipeBase *>(getAsRecipe()); |
| 2046 | return getIncomingValue(Idx: R->getParent()->getIndexForPredecessor(Pred: VPBB)); |
| 2047 | } |
| 2048 | |
| 2049 | void VPPhiAccessors::setIncomingValueForBlock(const VPBasicBlock *VPBB, |
| 2050 | VPValue *V) const { |
| 2051 | VPRecipeBase *R = const_cast<VPRecipeBase *>(getAsRecipe()); |
| 2052 | R->setOperand(I: R->getParent()->getIndexForPredecessor(Pred: VPBB), New: V); |
| 2053 | } |
| 2054 | |
| 2055 | #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) |
| 2056 | void VPPhiAccessors::printPhiOperands(raw_ostream &O, |
| 2057 | VPSlotTracker &SlotTracker) const { |
| 2058 | interleaveComma(enumerate(getAsRecipe()->operands()), O, |
| 2059 | [this, &O, &SlotTracker](auto Op) { |
| 2060 | O << "[ " ; |
| 2061 | Op.value()->printAsOperand(O, SlotTracker); |
| 2062 | O << ", " ; |
| 2063 | getIncomingBlock(Op.index())->printAsOperand(O); |
| 2064 | O << " ]" ; |
| 2065 | }); |
| 2066 | } |
| 2067 | #endif |
| 2068 | |
| 2069 | #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) |
| 2070 | void VPIRPhi::printRecipe(raw_ostream &O, const Twine &Indent, |
| 2071 | VPSlotTracker &SlotTracker) const { |
| 2072 | VPIRInstruction::printRecipe(O, Indent, SlotTracker); |
| 2073 | |
| 2074 | if (getNumOperands() != 0) { |
| 2075 | O << " (extra operand" << (getNumOperands() > 1 ? "s" : "" ) << ": " ; |
| 2076 | interleaveComma(incoming_values_and_blocks(), O, |
| 2077 | [&O, &SlotTracker](auto Op) { |
| 2078 | std::get<0>(Op)->printAsOperand(O, SlotTracker); |
| 2079 | O << " from " ; |
| 2080 | std::get<1>(Op)->printAsOperand(O); |
| 2081 | }); |
| 2082 | O << ")" ; |
| 2083 | } |
| 2084 | } |
| 2085 | #endif |
| 2086 | |
| 2087 | void VPIRMetadata::applyMetadata(Instruction &I) const { |
| 2088 | for (const auto &[Kind, Node] : Metadata) |
| 2089 | I.setMetadata(KindID: Kind, Node); |
| 2090 | } |
| 2091 | |
| 2092 | void VPIRMetadata::intersect(const VPIRMetadata &Other) { |
| 2093 | SmallVector<std::pair<unsigned, MDNode *>> MetadataIntersection; |
| 2094 | for (const auto &[KindA, MDA] : Metadata) { |
| 2095 | for (const auto &[KindB, MDB] : Other.Metadata) { |
| 2096 | if (KindA == KindB && MDA == MDB) { |
| 2097 | MetadataIntersection.emplace_back(Args: KindA, Args: MDA); |
| 2098 | break; |
| 2099 | } |
| 2100 | } |
| 2101 | } |
| 2102 | Metadata = std::move(MetadataIntersection); |
| 2103 | } |
| 2104 | |
| 2105 | #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) |
| 2106 | void VPIRMetadata::print(raw_ostream &O, VPSlotTracker &SlotTracker) const { |
| 2107 | const Module *M = SlotTracker.getModule(); |
| 2108 | if (Metadata.empty() || !M || !VPlanPrintMetadata) |
| 2109 | return; |
| 2110 | |
| 2111 | ArrayRef<StringRef> MDNames = SlotTracker.getMDNames(); |
| 2112 | O << " (" ; |
| 2113 | interleaveComma(Metadata, O, [&](const auto &KindNodePair) { |
| 2114 | auto [Kind, Node] = KindNodePair; |
| 2115 | assert(Kind < MDNames.size() && !MDNames[Kind].empty() && |
| 2116 | "Unexpected unnamed metadata kind" ); |
| 2117 | O << "!" << MDNames[Kind] << " " ; |
| 2118 | Node->printAsOperand(O, M); |
| 2119 | }); |
| 2120 | O << ")" ; |
| 2121 | } |
| 2122 | #endif |
| 2123 | |
| 2124 | void VPWidenCallRecipe::execute(VPTransformState &State) { |
| 2125 | assert(State.VF.isVector() && "not widening" ); |
| 2126 | assert(Variant != nullptr && "Can't create vector function." ); |
| 2127 | |
| 2128 | FunctionType *VFTy = Variant->getFunctionType(); |
| 2129 | // Add return type if intrinsic is overloaded on it. |
| 2130 | SmallVector<Value *, 4> Args; |
| 2131 | for (const auto &I : enumerate(First: args())) { |
| 2132 | Value *Arg; |
| 2133 | // Some vectorized function variants may also take a scalar argument, |
| 2134 | // e.g. linear parameters for pointers. This needs to be the scalar value |
| 2135 | // from the start of the respective part when interleaving. |
| 2136 | if (!VFTy->getParamType(i: I.index())->isVectorTy()) |
| 2137 | Arg = State.get(Def: I.value(), Lane: VPLane(0)); |
| 2138 | else |
| 2139 | Arg = State.get(Def: I.value(), IsScalar: usesFirstLaneOnly(Op: I.value())); |
| 2140 | Args.push_back(Elt: Arg); |
| 2141 | } |
| 2142 | |
| 2143 | auto *CI = cast_or_null<CallInst>(Val: getUnderlyingValue()); |
| 2144 | SmallVector<OperandBundleDef, 1> OpBundles; |
| 2145 | if (CI) |
| 2146 | CI->getOperandBundlesAsDefs(Defs&: OpBundles); |
| 2147 | |
| 2148 | CallInst *V = State.Builder.CreateCall(Callee: Variant, Args, OpBundles); |
| 2149 | applyFlags(I&: *V); |
| 2150 | applyMetadata(I&: *V); |
| 2151 | V->setCallingConv(Variant->getCallingConv()); |
| 2152 | |
| 2153 | if (!V->getType()->isVoidTy()) |
| 2154 | State.set(Def: this, V); |
| 2155 | } |
| 2156 | |
| 2157 | InstructionCost VPWidenCallRecipe::computeCost(ElementCount VF, |
| 2158 | VPCostContext &Ctx) const { |
| 2159 | assert(getVectorizedTypeVF(Variant->getReturnType()) == VF && |
| 2160 | "Variant return type must match VF" ); |
| 2161 | return computeCallCost(Variant, Ctx); |
| 2162 | } |
| 2163 | |
| 2164 | InstructionCost VPWidenCallRecipe::computeCallCost(Function *Variant, |
| 2165 | VPCostContext &Ctx) { |
| 2166 | return Ctx.TTI.getCallInstrCost(F: nullptr, RetTy: Variant->getReturnType(), |
| 2167 | Tys: Variant->getFunctionType()->params(), |
| 2168 | CostKind: Ctx.CostKind); |
| 2169 | } |
| 2170 | |
| 2171 | bool VPWidenCallRecipe::usesFirstLaneOnly(const VPValue *Op) const { |
| 2172 | assert(is_contained(operands(), Op) && "Op must be an operand of the recipe" ); |
| 2173 | assert(Variant && "Variant not set" ); |
| 2174 | FunctionType *VFTy = Variant->getFunctionType(); |
| 2175 | return all_of(Range: enumerate(First: args()), P: [VFTy, &Op](const auto &Arg) { |
| 2176 | auto [Idx, V] = Arg; |
| 2177 | Type *ArgTy = VFTy->getParamType(i: Idx); |
| 2178 | return V != Op || ArgTy->isIntegerTy() || ArgTy->isFloatingPointTy() || |
| 2179 | ArgTy->isPointerTy() || ArgTy->isByteTy(); |
| 2180 | }); |
| 2181 | } |
| 2182 | |
| 2183 | #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) |
| 2184 | void VPWidenCallRecipe::printRecipe(raw_ostream &O, const Twine &Indent, |
| 2185 | VPSlotTracker &SlotTracker) const { |
| 2186 | O << Indent << "WIDEN-CALL " ; |
| 2187 | |
| 2188 | Function *CalledFn = getCalledScalarFunction(); |
| 2189 | if (CalledFn->getReturnType()->isVoidTy()) |
| 2190 | O << "void " ; |
| 2191 | else { |
| 2192 | printAsOperand(O, SlotTracker); |
| 2193 | O << " = " ; |
| 2194 | } |
| 2195 | |
| 2196 | O << "call" ; |
| 2197 | printFlags(O); |
| 2198 | O << " @" << CalledFn->getName() << "(" ; |
| 2199 | interleaveComma(args(), O, [&O, &SlotTracker](VPValue *Op) { |
| 2200 | Op->printAsOperand(O, SlotTracker); |
| 2201 | }); |
| 2202 | O << ")" ; |
| 2203 | |
| 2204 | O << " (using library function" ; |
| 2205 | if (Variant->hasName()) |
| 2206 | O << ": " << Variant->getName(); |
| 2207 | O << ")" ; |
| 2208 | } |
| 2209 | #endif |
| 2210 | |
| 2211 | CallInst *VPWidenIntrinsicRecipe::createVectorCall(VPTransformState &State) { |
| 2212 | assert(State.VF.isVector() && "not widening" ); |
| 2213 | |
| 2214 | SmallVector<Type *, 2> TysForDecl; |
| 2215 | // Add return type if intrinsic is overloaded on it. |
| 2216 | if (isVectorIntrinsicWithOverloadTypeAtArg(ID: VectorIntrinsicID, OpdIdx: -1, |
| 2217 | TTI: State.TTI)) { |
| 2218 | Type *RetTy = toVectorizedTy(Ty: getScalarType(), EC: State.VF); |
| 2219 | ArrayRef<Type *> ContainedTys = getContainedTypes(Ty: RetTy); |
| 2220 | for (auto [Idx, Ty] : enumerate(First&: ContainedTys)) { |
| 2221 | if (isVectorIntrinsicWithStructReturnOverloadAtField(ID: VectorIntrinsicID, |
| 2222 | RetIdx: Idx, TTI: State.TTI)) |
| 2223 | TysForDecl.push_back(Elt: Ty); |
| 2224 | } |
| 2225 | } |
| 2226 | SmallVector<Value *, 4> Args; |
| 2227 | for (const auto &I : enumerate(First: operands())) { |
| 2228 | // Some intrinsics have a scalar argument - don't replace it with a |
| 2229 | // vector. |
| 2230 | Value *Arg; |
| 2231 | if (isVectorIntrinsicWithScalarOpAtArg(ID: VectorIntrinsicID, ScalarOpdIdx: I.index(), |
| 2232 | TTI: State.TTI)) |
| 2233 | Arg = State.get(Def: I.value(), Lane: VPLane(0)); |
| 2234 | else |
| 2235 | Arg = State.get(Def: I.value(), IsScalar: usesFirstLaneOnly(Op: I.value())); |
| 2236 | if (isVectorIntrinsicWithOverloadTypeAtArg(ID: VectorIntrinsicID, OpdIdx: I.index(), |
| 2237 | TTI: State.TTI)) |
| 2238 | TysForDecl.push_back(Elt: Arg->getType()); |
| 2239 | Args.push_back(Elt: Arg); |
| 2240 | } |
| 2241 | |
| 2242 | // Use vector version of the intrinsic. |
| 2243 | Module *M = State.Builder.GetInsertBlock()->getModule(); |
| 2244 | Function *VectorF = |
| 2245 | Intrinsic::getOrInsertDeclaration(M, id: VectorIntrinsicID, OverloadTys: TysForDecl); |
| 2246 | assert(VectorF && |
| 2247 | "Can't retrieve vector intrinsic or vector-predication intrinsics." ); |
| 2248 | |
| 2249 | auto *CI = cast_or_null<CallInst>(Val: getUnderlyingValue()); |
| 2250 | SmallVector<OperandBundleDef, 1> OpBundles; |
| 2251 | if (CI) |
| 2252 | CI->getOperandBundlesAsDefs(Defs&: OpBundles); |
| 2253 | |
| 2254 | CallInst *V = State.Builder.CreateCall(Callee: VectorF, Args, OpBundles); |
| 2255 | |
| 2256 | applyFlags(I&: *V); |
| 2257 | applyMetadata(I&: *V); |
| 2258 | |
| 2259 | return V; |
| 2260 | } |
| 2261 | |
| 2262 | void VPWidenIntrinsicRecipe::execute(VPTransformState &State) { |
| 2263 | CallInst *V = createVectorCall(State); |
| 2264 | if (!V->getType()->isVoidTy()) |
| 2265 | State.set(Def: this, V); |
| 2266 | } |
| 2267 | |
| 2268 | InstructionCost VPWidenIntrinsicRecipe::computeCallCost( |
| 2269 | Intrinsic::ID ID, ArrayRef<const VPValue *> Operands, |
| 2270 | const VPRecipeWithIRFlags &R, ElementCount VF, VPCostContext &Ctx) { |
| 2271 | Type *ScalarRetTy = R.getScalarType(); |
| 2272 | // Skip the reverse operation cost for the mask. |
| 2273 | // FIXME: Remove this once redundant mask reverse operations can be eliminated |
| 2274 | // by VPlanTransforms::cse before cost computation. |
| 2275 | if (ID == Intrinsic::experimental_vp_reverse && ScalarRetTy->isIntegerTy(BitWidth: 1)) |
| 2276 | return InstructionCost(0); |
| 2277 | |
| 2278 | // Some backends analyze intrinsic arguments to determine cost. Use the |
| 2279 | // underlying value for the operand if it has one. Otherwise try to use the |
| 2280 | // operand of the underlying call instruction, if there is one. Otherwise |
| 2281 | // clear Arguments. |
| 2282 | // TODO: Rework TTI interface to be independent of concrete IR values. |
| 2283 | SmallVector<const Value *> Arguments; |
| 2284 | for (const auto &[Idx, Op] : enumerate(First&: Operands)) { |
| 2285 | auto *V = Op->getUnderlyingValue(); |
| 2286 | if (!V) { |
| 2287 | if (auto *UI = dyn_cast_or_null<CallBase>(Val: R.getUnderlyingValue())) { |
| 2288 | Arguments.push_back(Elt: UI->getArgOperand(i: Idx)); |
| 2289 | continue; |
| 2290 | } |
| 2291 | Arguments.clear(); |
| 2292 | break; |
| 2293 | } |
| 2294 | Arguments.push_back(Elt: V); |
| 2295 | } |
| 2296 | |
| 2297 | Type *RetTy = VF.isVector() ? toVectorizedTy(Ty: ScalarRetTy, EC: VF) : ScalarRetTy; |
| 2298 | SmallVector<Type *> ParamTys = |
| 2299 | map_to_vector(C&: Operands, F: [&](const VPValue *Op) { |
| 2300 | return toVectorTy(Scalar: Op->getScalarType(), EC: VF); |
| 2301 | }); |
| 2302 | |
| 2303 | // TODO: Rework TTI interface to avoid reliance on underlying IntrinsicInst. |
| 2304 | IntrinsicCostAttributes CostAttrs( |
| 2305 | ID, RetTy, Arguments, ParamTys, R.getFastMathFlagsOrNone(), |
| 2306 | dyn_cast_or_null<IntrinsicInst>(Val: R.getUnderlyingValue()), |
| 2307 | InstructionCost::getInvalid()); |
| 2308 | return Ctx.TTI.getIntrinsicInstrCost(ICA: CostAttrs, CostKind: Ctx.CostKind); |
| 2309 | } |
| 2310 | |
| 2311 | InstructionCost VPWidenIntrinsicRecipe::computeCost(ElementCount VF, |
| 2312 | VPCostContext &Ctx) const { |
| 2313 | SmallVector<const VPValue *> ArgOps(operands()); |
| 2314 | return computeCallCost(ID: VectorIntrinsicID, Operands: ArgOps, R: *this, VF, Ctx); |
| 2315 | } |
| 2316 | |
| 2317 | StringRef VPWidenIntrinsicRecipe::getIntrinsicName() const { |
| 2318 | return Intrinsic::getBaseName(id: VectorIntrinsicID); |
| 2319 | } |
| 2320 | |
| 2321 | bool VPWidenIntrinsicRecipe::usesFirstLaneOnly(const VPValue *Op) const { |
| 2322 | assert(is_contained(operands(), Op) && "Op must be an operand of the recipe" ); |
| 2323 | return all_of(Range: enumerate(First: operands()), P: [this, &Op](const auto &X) { |
| 2324 | auto [Idx, V] = X; |
| 2325 | return V != Op || isVectorIntrinsicWithScalarOpAtArg(getVectorIntrinsicID(), |
| 2326 | Idx, nullptr); |
| 2327 | }); |
| 2328 | } |
| 2329 | |
| 2330 | #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) |
| 2331 | void VPWidenIntrinsicRecipe::printRecipe(raw_ostream &O, const Twine &Indent, |
| 2332 | VPSlotTracker &SlotTracker) const { |
| 2333 | O << Indent << "WIDEN-INTRINSIC " ; |
| 2334 | if (getScalarType()->isVoidTy()) { |
| 2335 | O << "void " ; |
| 2336 | } else { |
| 2337 | printAsOperand(O, SlotTracker); |
| 2338 | O << " = " ; |
| 2339 | } |
| 2340 | |
| 2341 | O << "call" ; |
| 2342 | printFlags(O); |
| 2343 | O << getIntrinsicName() << "(" ; |
| 2344 | |
| 2345 | interleaveComma(operands(), O, [&O, &SlotTracker](VPValue *Op) { |
| 2346 | Op->printAsOperand(O, SlotTracker); |
| 2347 | }); |
| 2348 | O << ")" ; |
| 2349 | } |
| 2350 | #endif |
| 2351 | |
| 2352 | void VPWidenMemIntrinsicRecipe::execute(VPTransformState &State) { |
| 2353 | CallInst *MemI = createVectorCall(State); |
| 2354 | MemI->addParamAttr( |
| 2355 | ArgNo: 0, Attr: Attribute::getWithAlignment(Context&: MemI->getContext(), Alignment)); |
| 2356 | State.set(Def: this, V: MemI); |
| 2357 | } |
| 2358 | |
| 2359 | InstructionCost VPWidenMemIntrinsicRecipe::computeMemIntrinsicCost( |
| 2360 | Intrinsic::ID IID, Type *Ty, bool IsMasked, Align Alignment, |
| 2361 | VPCostContext &Ctx) { |
| 2362 | return Ctx.TTI.getMemIntrinsicInstrCost( |
| 2363 | MICA: MemIntrinsicCostAttributes(IID, Ty, /*Ptr=*/nullptr, IsMasked, Alignment), |
| 2364 | CostKind: Ctx.CostKind); |
| 2365 | } |
| 2366 | |
| 2367 | InstructionCost |
| 2368 | VPWidenMemIntrinsicRecipe::computeCost(ElementCount VF, |
| 2369 | VPCostContext &Ctx) const { |
| 2370 | Type *Ty = toVectorTy(Scalar: getScalarType(), EC: VF); |
| 2371 | return computeMemIntrinsicCost(IID: getVectorIntrinsicID(), Ty, |
| 2372 | IsMasked: !match(V: getOperand(N: 2), P: m_True()), Alignment, |
| 2373 | Ctx); |
| 2374 | } |
| 2375 | |
| 2376 | void VPHistogramRecipe::execute(VPTransformState &State) { |
| 2377 | IRBuilderBase &Builder = State.Builder; |
| 2378 | |
| 2379 | Value *Address = State.get(Def: getOperand(N: 0)); |
| 2380 | Value *IncAmt = State.get(Def: getOperand(N: 1), /*IsScalar=*/true); |
| 2381 | VectorType *VTy = cast<VectorType>(Val: Address->getType()); |
| 2382 | |
| 2383 | // The histogram intrinsic requires a mask even if the recipe doesn't; |
| 2384 | // if the mask operand was omitted then all lanes should be executed and |
| 2385 | // we just need to synthesize an all-true mask. |
| 2386 | Value *Mask = nullptr; |
| 2387 | if (VPValue *VPMask = getMask()) |
| 2388 | Mask = State.get(Def: VPMask); |
| 2389 | else |
| 2390 | Mask = |
| 2391 | Builder.CreateVectorSplat(EC: VTy->getElementCount(), V: Builder.getInt1(V: 1)); |
| 2392 | |
| 2393 | // If this is a subtract, we want to invert the increment amount. We may |
| 2394 | // add a separate intrinsic in future, but for now we'll try this. |
| 2395 | if (Opcode == Instruction::Sub) |
| 2396 | IncAmt = Builder.CreateNeg(V: IncAmt); |
| 2397 | else |
| 2398 | assert(Opcode == Instruction::Add && "only add or sub supported for now" ); |
| 2399 | |
| 2400 | Instruction *HistogramInst = State.Builder.CreateIntrinsicWithoutFolding( |
| 2401 | ID: Intrinsic::experimental_vector_histogram_add, OverloadTypes: {VTy, IncAmt->getType()}, |
| 2402 | Args: {Address, IncAmt, Mask}); |
| 2403 | applyMetadata(I&: *HistogramInst); |
| 2404 | } |
| 2405 | |
| 2406 | InstructionCost VPHistogramRecipe::computeCost(ElementCount VF, |
| 2407 | VPCostContext &Ctx) const { |
| 2408 | // FIXME: Take the gather and scatter into account as well. For now we're |
| 2409 | // generating the same cost as the fallback path, but we'll likely |
| 2410 | // need to create a new TTI method for determining the cost, including |
| 2411 | // whether we can use base + vec-of-smaller-indices or just |
| 2412 | // vec-of-pointers. |
| 2413 | assert(VF.isVector() && "Invalid VF for histogram cost" ); |
| 2414 | Type *AddressTy = getOperand(N: 0)->getScalarType(); |
| 2415 | VPValue *IncAmt = getOperand(N: 1); |
| 2416 | Type *IncTy = IncAmt->getScalarType(); |
| 2417 | VectorType *VTy = VectorType::get(ElementType: IncTy, EC: VF); |
| 2418 | |
| 2419 | // Assume that a non-constant update value (or a constant != 1) requires |
| 2420 | // a multiply, and add that into the cost. |
| 2421 | InstructionCost MulCost = |
| 2422 | Ctx.TTI.getArithmeticInstrCost(Opcode: Instruction::Mul, Ty: VTy, CostKind: Ctx.CostKind); |
| 2423 | if (match(V: IncAmt, P: m_One())) |
| 2424 | MulCost = TTI::TCC_Free; |
| 2425 | |
| 2426 | // Find the cost of the histogram operation itself. |
| 2427 | Type *PtrTy = VectorType::get(ElementType: AddressTy, EC: VF); |
| 2428 | Type *MaskTy = VectorType::get(ElementType: Type::getInt1Ty(C&: Ctx.LLVMCtx), EC: VF); |
| 2429 | IntrinsicCostAttributes ICA(Intrinsic::experimental_vector_histogram_add, |
| 2430 | Type::getVoidTy(C&: Ctx.LLVMCtx), |
| 2431 | {PtrTy, IncTy, MaskTy}); |
| 2432 | |
| 2433 | // Add the costs together with the add/sub operation. |
| 2434 | return Ctx.TTI.getIntrinsicInstrCost(ICA, CostKind: Ctx.CostKind) + MulCost + |
| 2435 | Ctx.TTI.getArithmeticInstrCost(Opcode, Ty: VTy, CostKind: Ctx.CostKind); |
| 2436 | } |
| 2437 | |
| 2438 | #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) |
| 2439 | void VPHistogramRecipe::printRecipe(raw_ostream &O, const Twine &Indent, |
| 2440 | VPSlotTracker &SlotTracker) const { |
| 2441 | O << Indent << "WIDEN-HISTOGRAM buckets: " ; |
| 2442 | getOperand(0)->printAsOperand(O, SlotTracker); |
| 2443 | |
| 2444 | if (Opcode == Instruction::Sub) |
| 2445 | O << ", dec: " ; |
| 2446 | else { |
| 2447 | assert(Opcode == Instruction::Add); |
| 2448 | O << ", inc: " ; |
| 2449 | } |
| 2450 | getOperand(1)->printAsOperand(O, SlotTracker); |
| 2451 | |
| 2452 | if (VPValue *Mask = getMask()) { |
| 2453 | O << ", mask: " ; |
| 2454 | Mask->printAsOperand(O, SlotTracker); |
| 2455 | } |
| 2456 | } |
| 2457 | #endif |
| 2458 | |
| 2459 | VPIRFlags::FastMathFlagsTy::FastMathFlagsTy(const FastMathFlags &FMF) { |
| 2460 | AllowReassoc = FMF.allowReassoc(); |
| 2461 | NoNaNs = FMF.noNaNs(); |
| 2462 | NoInfs = FMF.noInfs(); |
| 2463 | NoSignedZeros = FMF.noSignedZeros(); |
| 2464 | AllowReciprocal = FMF.allowReciprocal(); |
| 2465 | AllowContract = FMF.allowContract(); |
| 2466 | ApproxFunc = FMF.approxFunc(); |
| 2467 | } |
| 2468 | |
| 2469 | VPIRFlags VPIRFlags::getDefaultFlags(unsigned Opcode) { |
| 2470 | switch (Opcode) { |
| 2471 | case Instruction::Add: |
| 2472 | case Instruction::Sub: |
| 2473 | case Instruction::Mul: |
| 2474 | case Instruction::Shl: |
| 2475 | case VPInstruction::CanonicalIVIncrementForPart: |
| 2476 | return WrapFlagsTy(false, false); |
| 2477 | case Instruction::Trunc: |
| 2478 | return TruncFlagsTy(false, false); |
| 2479 | case Instruction::Or: |
| 2480 | return DisjointFlagsTy(false); |
| 2481 | case Instruction::AShr: |
| 2482 | case Instruction::LShr: |
| 2483 | case Instruction::UDiv: |
| 2484 | case Instruction::SDiv: |
| 2485 | return ExactFlagsTy(false); |
| 2486 | case Instruction::GetElementPtr: |
| 2487 | case VPInstruction::PtrAdd: |
| 2488 | case VPInstruction::WidePtrAdd: |
| 2489 | return GEPNoWrapFlags::none(); |
| 2490 | case Instruction::ZExt: |
| 2491 | case Instruction::UIToFP: |
| 2492 | return NonNegFlagsTy(false); |
| 2493 | case Instruction::FAdd: |
| 2494 | case Instruction::FSub: |
| 2495 | case Instruction::FMul: |
| 2496 | case Instruction::FDiv: |
| 2497 | case Instruction::FRem: |
| 2498 | case Instruction::FNeg: |
| 2499 | case Instruction::FPExt: |
| 2500 | case Instruction::FPTrunc: |
| 2501 | return FastMathFlags(); |
| 2502 | case Instruction::ICmp: |
| 2503 | case Instruction::FCmp: |
| 2504 | case VPInstruction::ComputeReductionResult: |
| 2505 | llvm_unreachable("opcode requires explicit flags" ); |
| 2506 | default: |
| 2507 | return VPIRFlags(); |
| 2508 | } |
| 2509 | } |
| 2510 | |
| 2511 | #if !defined(NDEBUG) |
| 2512 | bool VPIRFlags::flagsValidForOpcode(unsigned Opcode) const { |
| 2513 | switch (OpType) { |
| 2514 | case OperationType::OverflowingBinOp: |
| 2515 | return Opcode == Instruction::Add || Opcode == Instruction::Sub || |
| 2516 | Opcode == Instruction::Mul || Opcode == Instruction::Shl || |
| 2517 | Opcode == VPInstruction::VPInstruction::CanonicalIVIncrementForPart; |
| 2518 | case OperationType::Trunc: |
| 2519 | return Opcode == Instruction::Trunc; |
| 2520 | case OperationType::DisjointOp: |
| 2521 | return Opcode == Instruction::Or; |
| 2522 | case OperationType::PossiblyExactOp: |
| 2523 | return Opcode == Instruction::AShr || Opcode == Instruction::LShr || |
| 2524 | Opcode == Instruction::UDiv || Opcode == Instruction::SDiv; |
| 2525 | case OperationType::GEPOp: |
| 2526 | return Opcode == Instruction::GetElementPtr || |
| 2527 | Opcode == VPInstruction::PtrAdd || |
| 2528 | Opcode == VPInstruction::WidePtrAdd; |
| 2529 | case OperationType::FPMathOp: |
| 2530 | return Opcode == Instruction::Call || Opcode == Instruction::FAdd || |
| 2531 | Opcode == Instruction::FMul || Opcode == Instruction::FSub || |
| 2532 | Opcode == Instruction::FNeg || Opcode == Instruction::FDiv || |
| 2533 | Opcode == Instruction::FRem || Opcode == Instruction::FPExt || |
| 2534 | Opcode == Instruction::FPTrunc || Opcode == Instruction::PHI || |
| 2535 | Opcode == Instruction::Select || Opcode == Instruction::SIToFP || |
| 2536 | Opcode == Instruction::UIToFP || |
| 2537 | Opcode == VPInstruction::WideIVStep || |
| 2538 | Opcode == VPInstruction::ReductionStartVector; |
| 2539 | case OperationType::FCmp: |
| 2540 | return Opcode == Instruction::FCmp; |
| 2541 | case OperationType::NonNegOp: |
| 2542 | return Opcode == Instruction::ZExt || Opcode == Instruction::UIToFP; |
| 2543 | case OperationType::Cmp: |
| 2544 | return Opcode == Instruction::FCmp || Opcode == Instruction::ICmp; |
| 2545 | case OperationType::ReductionOp: |
| 2546 | return Opcode == VPInstruction::ComputeReductionResult; |
| 2547 | case OperationType::Other: |
| 2548 | return true; |
| 2549 | } |
| 2550 | llvm_unreachable("Unknown OperationType enum" ); |
| 2551 | } |
| 2552 | |
| 2553 | bool VPIRFlags::hasRequiredFlagsForOpcode(unsigned Opcode) const { |
| 2554 | // Handle opcodes without default flags. |
| 2555 | if (Opcode == Instruction::ICmp) |
| 2556 | return OpType == OperationType::Cmp; |
| 2557 | if (Opcode == Instruction::FCmp) |
| 2558 | return OpType == OperationType::FCmp; |
| 2559 | if (Opcode == VPInstruction::ComputeReductionResult) |
| 2560 | return OpType == OperationType::ReductionOp; |
| 2561 | |
| 2562 | OperationType Required = getDefaultFlags(Opcode).OpType; |
| 2563 | return Required == OperationType::Other || Required == OpType; |
| 2564 | } |
| 2565 | #endif |
| 2566 | |
| 2567 | #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) |
| 2568 | static void printRecurrenceKind(raw_ostream &OS, const RecurKind &Kind) { |
| 2569 | switch (Kind) { |
| 2570 | case RecurKind::None: |
| 2571 | OS << "none" ; |
| 2572 | break; |
| 2573 | case RecurKind::Add: |
| 2574 | OS << "add" ; |
| 2575 | break; |
| 2576 | case RecurKind::Sub: |
| 2577 | OS << "sub" ; |
| 2578 | break; |
| 2579 | case RecurKind::AddChainWithSubs: |
| 2580 | OS << "add-chain-with-subs" ; |
| 2581 | break; |
| 2582 | case RecurKind::Mul: |
| 2583 | OS << "mul" ; |
| 2584 | break; |
| 2585 | case RecurKind::Or: |
| 2586 | OS << "or" ; |
| 2587 | break; |
| 2588 | case RecurKind::And: |
| 2589 | OS << "and" ; |
| 2590 | break; |
| 2591 | case RecurKind::Xor: |
| 2592 | OS << "xor" ; |
| 2593 | break; |
| 2594 | case RecurKind::SMin: |
| 2595 | OS << "smin" ; |
| 2596 | break; |
| 2597 | case RecurKind::SMax: |
| 2598 | OS << "smax" ; |
| 2599 | break; |
| 2600 | case RecurKind::UMin: |
| 2601 | OS << "umin" ; |
| 2602 | break; |
| 2603 | case RecurKind::UMax: |
| 2604 | OS << "umax" ; |
| 2605 | break; |
| 2606 | case RecurKind::FAdd: |
| 2607 | OS << "fadd" ; |
| 2608 | break; |
| 2609 | case RecurKind::FAddChainWithSubs: |
| 2610 | OS << "fadd-chain-with-subs" ; |
| 2611 | break; |
| 2612 | case RecurKind::FSub: |
| 2613 | OS << "fsub" ; |
| 2614 | break; |
| 2615 | case RecurKind::FMul: |
| 2616 | OS << "fmul" ; |
| 2617 | break; |
| 2618 | case RecurKind::FMin: |
| 2619 | OS << "fmin" ; |
| 2620 | break; |
| 2621 | case RecurKind::FMax: |
| 2622 | OS << "fmax" ; |
| 2623 | break; |
| 2624 | case RecurKind::FMinNum: |
| 2625 | OS << "fminnum" ; |
| 2626 | break; |
| 2627 | case RecurKind::FMaxNum: |
| 2628 | OS << "fmaxnum" ; |
| 2629 | break; |
| 2630 | case RecurKind::FMinimum: |
| 2631 | OS << "fminimum" ; |
| 2632 | break; |
| 2633 | case RecurKind::FMaximum: |
| 2634 | OS << "fmaximum" ; |
| 2635 | break; |
| 2636 | case RecurKind::FMinimumNum: |
| 2637 | OS << "fminimumnum" ; |
| 2638 | break; |
| 2639 | case RecurKind::FMaximumNum: |
| 2640 | OS << "fmaximumnum" ; |
| 2641 | break; |
| 2642 | case RecurKind::FMulAdd: |
| 2643 | OS << "fmuladd" ; |
| 2644 | break; |
| 2645 | case RecurKind::AnyOf: |
| 2646 | OS << "any-of" ; |
| 2647 | break; |
| 2648 | case RecurKind::FindIV: |
| 2649 | OS << "find-iv" ; |
| 2650 | break; |
| 2651 | case RecurKind::FindLast: |
| 2652 | OS << "find-last" ; |
| 2653 | break; |
| 2654 | } |
| 2655 | } |
| 2656 | |
| 2657 | void VPIRFlags::printFlags(raw_ostream &O) const { |
| 2658 | switch (OpType) { |
| 2659 | case OperationType::Cmp: |
| 2660 | O << " " << CmpInst::getPredicateName(getPredicate()); |
| 2661 | break; |
| 2662 | case OperationType::FCmp: |
| 2663 | O << " " << CmpInst::getPredicateName(getPredicate()); |
| 2664 | getFastMathFlagsOrNone().print(O); |
| 2665 | break; |
| 2666 | case OperationType::DisjointOp: |
| 2667 | if (DisjointFlags.IsDisjoint) |
| 2668 | O << " disjoint" ; |
| 2669 | break; |
| 2670 | case OperationType::PossiblyExactOp: |
| 2671 | if (ExactFlags.IsExact) |
| 2672 | O << " exact" ; |
| 2673 | break; |
| 2674 | case OperationType::OverflowingBinOp: |
| 2675 | if (WrapFlags.HasNUW) |
| 2676 | O << " nuw" ; |
| 2677 | if (WrapFlags.HasNSW) |
| 2678 | O << " nsw" ; |
| 2679 | break; |
| 2680 | case OperationType::Trunc: |
| 2681 | if (TruncFlags.HasNUW) |
| 2682 | O << " nuw" ; |
| 2683 | if (TruncFlags.HasNSW) |
| 2684 | O << " nsw" ; |
| 2685 | break; |
| 2686 | case OperationType::FPMathOp: |
| 2687 | getFastMathFlagsOrNone().print(O); |
| 2688 | break; |
| 2689 | case OperationType::GEPOp: { |
| 2690 | GEPNoWrapFlags Flags = getGEPNoWrapFlags(); |
| 2691 | if (Flags.isInBounds()) |
| 2692 | O << " inbounds" ; |
| 2693 | else if (Flags.hasNoUnsignedSignedWrap()) |
| 2694 | O << " nusw" ; |
| 2695 | if (Flags.hasNoUnsignedWrap()) |
| 2696 | O << " nuw" ; |
| 2697 | break; |
| 2698 | } |
| 2699 | case OperationType::NonNegOp: |
| 2700 | if (NonNegFlags.NonNeg) |
| 2701 | O << " nneg" ; |
| 2702 | break; |
| 2703 | case OperationType::ReductionOp: { |
| 2704 | O << " (" ; |
| 2705 | printRecurrenceKind(O, getRecurKind()); |
| 2706 | if (isReductionInLoop()) |
| 2707 | O << ", in-loop" ; |
| 2708 | if (isReductionOrdered()) |
| 2709 | O << ", ordered" ; |
| 2710 | O << ")" ; |
| 2711 | getFastMathFlagsOrNone().print(O); |
| 2712 | break; |
| 2713 | } |
| 2714 | case OperationType::Other: |
| 2715 | break; |
| 2716 | } |
| 2717 | O << " " ; |
| 2718 | } |
| 2719 | #endif |
| 2720 | |
| 2721 | void VPWidenRecipe::execute(VPTransformState &State) { |
| 2722 | auto &Builder = State.Builder; |
| 2723 | switch (Opcode) { |
| 2724 | case Instruction::Call: |
| 2725 | case Instruction::UncondBr: |
| 2726 | case Instruction::CondBr: |
| 2727 | case Instruction::PHI: |
| 2728 | case Instruction::GetElementPtr: |
| 2729 | llvm_unreachable("This instruction is handled by a different recipe." ); |
| 2730 | case Instruction::UDiv: |
| 2731 | case Instruction::SDiv: |
| 2732 | case Instruction::SRem: |
| 2733 | case Instruction::URem: |
| 2734 | case Instruction::Add: |
| 2735 | case Instruction::FAdd: |
| 2736 | case Instruction::Sub: |
| 2737 | case Instruction::FSub: |
| 2738 | case Instruction::FNeg: |
| 2739 | case Instruction::Mul: |
| 2740 | case Instruction::FMul: |
| 2741 | case Instruction::FDiv: |
| 2742 | case Instruction::FRem: |
| 2743 | case Instruction::Shl: |
| 2744 | case Instruction::LShr: |
| 2745 | case Instruction::AShr: |
| 2746 | case Instruction::And: |
| 2747 | case Instruction::Or: |
| 2748 | case Instruction::Xor: { |
| 2749 | // Just widen unops and binops. |
| 2750 | SmallVector<Value *, 2> Ops; |
| 2751 | for (VPValue *VPOp : operands()) |
| 2752 | Ops.push_back(Elt: State.get(Def: VPOp)); |
| 2753 | |
| 2754 | Value *V = Builder.CreateNAryOp(Opc: Opcode, Ops); |
| 2755 | |
| 2756 | if (auto *VecOp = dyn_cast<Instruction>(Val: V)) { |
| 2757 | applyFlags(I&: *VecOp); |
| 2758 | applyMetadata(I&: *VecOp); |
| 2759 | } |
| 2760 | |
| 2761 | // Use this vector value for all users of the original instruction. |
| 2762 | State.set(Def: this, V); |
| 2763 | break; |
| 2764 | } |
| 2765 | case Instruction::ExtractValue: { |
| 2766 | assert(getNumOperands() == 2 && "expected single level extractvalue" ); |
| 2767 | Value *Op = State.get(Def: getOperand(N: 0)); |
| 2768 | Value * = Builder.CreateExtractValue( |
| 2769 | Agg: Op, Idxs: cast<VPConstantInt>(Val: getOperand(N: 1))->getZExtValue()); |
| 2770 | State.set(Def: this, V: Extract); |
| 2771 | break; |
| 2772 | } |
| 2773 | case Instruction::Freeze: { |
| 2774 | Value *Op = State.get(Def: getOperand(N: 0)); |
| 2775 | Value *Freeze = Builder.CreateFreeze(V: Op); |
| 2776 | State.set(Def: this, V: Freeze); |
| 2777 | break; |
| 2778 | } |
| 2779 | case Instruction::ICmp: |
| 2780 | case Instruction::FCmp: { |
| 2781 | // Widen compares. Generate vector compares. |
| 2782 | bool FCmp = Opcode == Instruction::FCmp; |
| 2783 | Value *A = State.get(Def: getOperand(N: 0)); |
| 2784 | Value *B = State.get(Def: getOperand(N: 1)); |
| 2785 | Value *C = nullptr; |
| 2786 | if (FCmp) { |
| 2787 | C = Builder.CreateFCmp(P: getPredicate(), LHS: A, RHS: B); |
| 2788 | } else { |
| 2789 | C = Builder.CreateICmp(P: getPredicate(), LHS: A, RHS: B); |
| 2790 | } |
| 2791 | if (auto *I = dyn_cast<Instruction>(Val: C)) { |
| 2792 | applyFlags(I&: *I); |
| 2793 | applyMetadata(I&: *I); |
| 2794 | } |
| 2795 | State.set(Def: this, V: C); |
| 2796 | break; |
| 2797 | } |
| 2798 | case Instruction::Select: { |
| 2799 | VPValue *CondOp = getOperand(N: 0); |
| 2800 | Value *Cond = State.get(Def: CondOp, IsScalar: vputils::isSingleScalar(VPV: CondOp)); |
| 2801 | Value *Op0 = State.get(Def: getOperand(N: 1)); |
| 2802 | Value *Op1 = State.get(Def: getOperand(N: 2)); |
| 2803 | Value *Sel = State.Builder.CreateSelect(C: Cond, True: Op0, False: Op1); |
| 2804 | State.set(Def: this, V: Sel); |
| 2805 | if (auto *I = dyn_cast<Instruction>(Val: Sel)) { |
| 2806 | if (isa<FPMathOperator>(Val: I)) |
| 2807 | applyFlags(I&: *I); |
| 2808 | applyMetadata(I&: *I); |
| 2809 | } |
| 2810 | break; |
| 2811 | } |
| 2812 | default: |
| 2813 | // This instruction is not vectorized by simple widening. |
| 2814 | LLVM_DEBUG(dbgs() << "LV: Found an unhandled opcode : " |
| 2815 | << Instruction::getOpcodeName(Opcode)); |
| 2816 | llvm_unreachable("Unhandled instruction!" ); |
| 2817 | } // end of switch. |
| 2818 | |
| 2819 | #if !defined(NDEBUG) |
| 2820 | // Verify that VPlan type inference results agree with the type of the |
| 2821 | // generated values. |
| 2822 | assert(VectorType::get(this->getScalarType(), State.VF) == |
| 2823 | State.get(this)->getType() && |
| 2824 | "inferred type and type from generated instructions do not match" ); |
| 2825 | #endif |
| 2826 | } |
| 2827 | |
| 2828 | InstructionCost VPWidenRecipe::computeCost(ElementCount VF, |
| 2829 | VPCostContext &Ctx) const { |
| 2830 | switch (Opcode) { |
| 2831 | case Instruction::UDiv: |
| 2832 | case Instruction::SDiv: |
| 2833 | case Instruction::SRem: |
| 2834 | case Instruction::URem: |
| 2835 | // If the div/rem operation isn't safe to speculate and requires |
| 2836 | // predication, then the only way we can even create a vplan is to insert |
| 2837 | // a select on the second input operand to ensure we use the value of 1 |
| 2838 | // for the inactive lanes. The select will be costed separately. |
| 2839 | case Instruction::FNeg: |
| 2840 | case Instruction::Add: |
| 2841 | case Instruction::FAdd: |
| 2842 | case Instruction::Sub: |
| 2843 | case Instruction::FSub: |
| 2844 | case Instruction::Mul: |
| 2845 | case Instruction::FMul: |
| 2846 | case Instruction::FDiv: |
| 2847 | case Instruction::FRem: |
| 2848 | case Instruction::Shl: |
| 2849 | case Instruction::LShr: |
| 2850 | case Instruction::AShr: |
| 2851 | case Instruction::And: |
| 2852 | case Instruction::Or: |
| 2853 | case Instruction::Xor: |
| 2854 | case Instruction::Freeze: |
| 2855 | case Instruction::ExtractValue: |
| 2856 | case Instruction::ICmp: |
| 2857 | case Instruction::FCmp: |
| 2858 | case Instruction::Select: |
| 2859 | return getCostForRecipeWithOpcode(Opcode: getOpcode(), VF, Ctx); |
| 2860 | default: |
| 2861 | llvm_unreachable("Unsupported opcode for instruction" ); |
| 2862 | } |
| 2863 | } |
| 2864 | |
| 2865 | #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) |
| 2866 | void VPWidenRecipe::printRecipe(raw_ostream &O, const Twine &Indent, |
| 2867 | VPSlotTracker &SlotTracker) const { |
| 2868 | O << Indent << "WIDEN " ; |
| 2869 | printAsOperand(O, SlotTracker); |
| 2870 | O << " = " << Instruction::getOpcodeName(Opcode); |
| 2871 | printFlags(O); |
| 2872 | printOperands(O, SlotTracker); |
| 2873 | } |
| 2874 | #endif |
| 2875 | |
| 2876 | void VPWidenCastRecipe::execute(VPTransformState &State) { |
| 2877 | auto &Builder = State.Builder; |
| 2878 | /// Vectorize casts. |
| 2879 | assert(State.VF.isVector() && "Not vectorizing?" ); |
| 2880 | Type *DestTy = VectorType::get(ElementType: getScalarType(), EC: State.VF); |
| 2881 | VPValue *Op = getOperand(N: 0); |
| 2882 | Value *A = State.get(Def: Op); |
| 2883 | Value *Cast = Builder.CreateCast(Op: Instruction::CastOps(Opcode), V: A, DestTy); |
| 2884 | State.set(Def: this, V: Cast); |
| 2885 | if (auto *CastOp = dyn_cast<Instruction>(Val: Cast)) { |
| 2886 | applyFlags(I&: *CastOp); |
| 2887 | applyMetadata(I&: *CastOp); |
| 2888 | } |
| 2889 | } |
| 2890 | |
| 2891 | InstructionCost VPWidenCastRecipe::computeCost(ElementCount VF, |
| 2892 | VPCostContext &Ctx) const { |
| 2893 | return getCostForRecipeWithOpcode(Opcode: getOpcode(), VF, Ctx); |
| 2894 | } |
| 2895 | |
| 2896 | #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) |
| 2897 | void VPWidenCastRecipe::printRecipe(raw_ostream &O, const Twine &Indent, |
| 2898 | VPSlotTracker &SlotTracker) const { |
| 2899 | O << Indent << "WIDEN-CAST " ; |
| 2900 | printAsOperand(O, SlotTracker); |
| 2901 | O << " = " << Instruction::getOpcodeName(Opcode); |
| 2902 | printFlags(O); |
| 2903 | printOperands(O, SlotTracker); |
| 2904 | O << " to " << *getScalarType(); |
| 2905 | } |
| 2906 | #endif |
| 2907 | |
| 2908 | InstructionCost VPHeaderPHIRecipe::(ElementCount VF, |
| 2909 | VPCostContext &Ctx) const { |
| 2910 | return Ctx.TTI.getCFInstrCost(Opcode: Instruction::PHI, CostKind: Ctx.CostKind); |
| 2911 | } |
| 2912 | |
| 2913 | #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) |
| 2914 | void VPWidenIntOrFpInductionRecipe::printRecipe( |
| 2915 | raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const { |
| 2916 | O << Indent; |
| 2917 | printAsOperand(O, SlotTracker); |
| 2918 | O << " = WIDEN-INDUCTION" ; |
| 2919 | printFlags(O); |
| 2920 | printOperands(O, SlotTracker); |
| 2921 | |
| 2922 | if (auto *TI = getTruncInst()) |
| 2923 | O << " (truncated to " << *TI->getType() << ")" ; |
| 2924 | } |
| 2925 | #endif |
| 2926 | |
| 2927 | bool VPWidenIntOrFpInductionRecipe::isCanonical() const { |
| 2928 | // The step may be defined by a recipe in the preheader (e.g. if it requires |
| 2929 | // SCEV expansion), but for the canonical induction the step is required to be |
| 2930 | // 1, which is represented as live-in. |
| 2931 | return match(V: getStartValue(), P: m_ZeroInt()) && |
| 2932 | match(V: getStepValue(), P: m_One()) && |
| 2933 | getScalarType() == getRegion()->getCanonicalIVType(); |
| 2934 | } |
| 2935 | |
| 2936 | InstructionCost VPDerivedIVRecipe::computeCost(ElementCount VF, |
| 2937 | VPCostContext &Ctx) const { |
| 2938 | // The cost model for this is modelled on expandVPDerivedIV in |
| 2939 | // VPlanTransforms.cpp. In order to avoid overly pessimistic costs that can |
| 2940 | // negatively affect vectorization it takes into account any expected |
| 2941 | // simplifications that happen in simplifyRecipe. |
| 2942 | switch (getInductionKind()) { |
| 2943 | default: |
| 2944 | // TODO: Compute cost for remaining kinds. |
| 2945 | break; |
| 2946 | case InductionDescriptor::IK_IntInduction: { |
| 2947 | // There are currently no tests that expose a path where all lanes are |
| 2948 | // used, so it's better to bail out for now. |
| 2949 | if (!vputils::onlyFirstLaneUsed(Def: this)) |
| 2950 | break; |
| 2951 | |
| 2952 | // Start off by assuming we need both mul and add, then refine this. |
| 2953 | bool NeedsMul = true, NeedsAdd = true, NeedsShl = false; |
| 2954 | |
| 2955 | // If the start value is zero the add gets folded away. |
| 2956 | if (auto *VPV = dyn_cast<VPIRValue>(Val: getStartValue())) |
| 2957 | if (auto *StartC = dyn_cast<ConstantInt>(Val: VPV->getValue())) |
| 2958 | NeedsAdd = !StartC->isZero(); |
| 2959 | |
| 2960 | // For some values of step the arithmetic changes: |
| 2961 | // 1. A step of 1 requires no operation. |
| 2962 | // 2. A step of -1 requires a negate. |
| 2963 | // 3. A power-of-2 step will use a shl, instead of a mul. |
| 2964 | Type *StepTy = getStepValue()->getScalarType(); |
| 2965 | InstructionCost Cost(0); |
| 2966 | if (auto *VPV = dyn_cast<VPIRValue>(Val: getStepValue())) { |
| 2967 | if (auto *StepC = dyn_cast<ConstantInt>(Val: VPV->getValue())) { |
| 2968 | if (StepC->isOne()) |
| 2969 | NeedsMul = false; |
| 2970 | else if (StepC->isMinusOne()) { |
| 2971 | // This will most likely end up as a negate in simplifyRecipe, and |
| 2972 | // the negate will be combined with the add to make a sub. |
| 2973 | // NOTE: This is perhaps an invalid assumption that the cost of an |
| 2974 | // 'add' is the same as a 'sub'. |
| 2975 | NeedsMul = false; |
| 2976 | NeedsAdd = true; |
| 2977 | } else if (StepC->getValue().isPowerOf2()) { |
| 2978 | // This will most likely end up as a shift-left in simplifyRecipe |
| 2979 | NeedsMul = false; |
| 2980 | NeedsShl = true; |
| 2981 | } |
| 2982 | } |
| 2983 | } |
| 2984 | |
| 2985 | // Add the cost of the conversion from index to step type if the index |
| 2986 | // will be used. |
| 2987 | Type *IndexTy = getIndex()->getScalarType(); |
| 2988 | unsigned StepTySize = StepTy->getScalarSizeInBits(); |
| 2989 | unsigned IndexTySize = IndexTy->getScalarSizeInBits(); |
| 2990 | if ((NeedsAdd || NeedsMul || NeedsShl) && StepTySize != IndexTySize) { |
| 2991 | unsigned CastOpc = |
| 2992 | StepTySize < IndexTySize ? Instruction::Trunc : Instruction::SExt; |
| 2993 | Cost += Ctx.TTI.getCastInstrCost( |
| 2994 | Opcode: CastOpc, Dst: StepTy, Src: IndexTy, CCH: TTI::CastContextHint::None, CostKind: Ctx.CostKind); |
| 2995 | } |
| 2996 | |
| 2997 | if (NeedsMul) |
| 2998 | Cost += Ctx.TTI.getArithmeticInstrCost(Opcode: Instruction::Mul, Ty: StepTy, |
| 2999 | CostKind: Ctx.CostKind); |
| 3000 | if (NeedsShl) |
| 3001 | Cost += Ctx.TTI.getArithmeticInstrCost( |
| 3002 | Opcode: Instruction::Shl, Ty: StepTy, CostKind: Ctx.CostKind, |
| 3003 | Opd1Info: {.Kind: TargetTransformInfo::OK_AnyValue, .Properties: TargetTransformInfo::OP_None}, |
| 3004 | Opd2Info: {.Kind: TargetTransformInfo::OK_UniformConstantValue, |
| 3005 | .Properties: TargetTransformInfo::OP_None}); |
| 3006 | if (NeedsAdd) |
| 3007 | Cost += Ctx.TTI.getArithmeticInstrCost(Opcode: Instruction::Add, Ty: StepTy, |
| 3008 | CostKind: Ctx.CostKind); |
| 3009 | return Cost; |
| 3010 | } |
| 3011 | } |
| 3012 | |
| 3013 | return 0; |
| 3014 | } |
| 3015 | |
| 3016 | #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) |
| 3017 | void VPDerivedIVRecipe::printRecipe(raw_ostream &O, const Twine &Indent, |
| 3018 | VPSlotTracker &SlotTracker) const { |
| 3019 | O << Indent; |
| 3020 | printAsOperand(O, SlotTracker); |
| 3021 | O << " = DERIVED-IV " ; |
| 3022 | getStartValue()->printAsOperand(O, SlotTracker); |
| 3023 | O << " + " ; |
| 3024 | getOperand(1)->printAsOperand(O, SlotTracker); |
| 3025 | O << " * " ; |
| 3026 | getStepValue()->printAsOperand(O, SlotTracker); |
| 3027 | } |
| 3028 | #endif |
| 3029 | |
| 3030 | InstructionCost VPScalarIVStepsRecipe::computeCost(ElementCount VF, |
| 3031 | VPCostContext &Ctx) const { |
| 3032 | // TODO: Add costs for floating point. |
| 3033 | Type *BaseIVTy = getOperand(N: 0)->getScalarType(); |
| 3034 | if (!BaseIVTy->isIntegerTy()) |
| 3035 | return 0; |
| 3036 | |
| 3037 | // TODO: Add support for predicated regions. Requires scaling the cost by the |
| 3038 | // probability of entering the block. |
| 3039 | if (getRegion() && getRegion()->isReplicator()) |
| 3040 | return 0; |
| 3041 | |
| 3042 | // If only the first lane is used, then there won't be any code that remains |
| 3043 | // in the loop for the first unrolled part. |
| 3044 | if (vputils::onlyFirstLaneUsed(Def: this)) |
| 3045 | return 0; |
| 3046 | |
| 3047 | // Typically the operations are: |
| 3048 | // 1. Add the start index to each lane value. |
| 3049 | // 2. Multiply the start index by the step. |
| 3050 | // 3. Add the scaled start index to base IV. |
| 3051 | // Any code generated for 1 and 2 should be loop invariant and therefore |
| 3052 | // hoisted out of the loop. We only need to add on the cost of 3. |
| 3053 | |
| 3054 | // Given the users of VPScalarIVStepsRecipe tend to be scalarized GEPs, i.e. |
| 3055 | // %add1 = add i32 %iv, 0 |
| 3056 | // %add2 = add i32 %iv, 1 |
| 3057 | // %gep1 = getelementptr i8, ptr %p, i32 %add1 |
| 3058 | // %gep2 = getelementptr i8, ptr %p, i32 %add2 |
| 3059 | // it's very likely that these GEPs will all be rewritten to have a common |
| 3060 | // base such that what's left is just |
| 3061 | // %base_gep = getelementptr i8, ptr %p, i32 %iv |
| 3062 | // %gep1 = getelementptr i8, ptr %base_gep, i32 0 |
| 3063 | // %gep2 = getelementptr i8, ptr %base_gep, i32 1 |
| 3064 | // Therefore, in reality the cost is somewhere betwen 1*AddCost and |
| 3065 | // (NumLanes - 1) * AddCost. For now, assume the cost of a single add. |
| 3066 | return Ctx.TTI.getArithmeticInstrCost(Opcode: Instruction::Add, Ty: BaseIVTy, |
| 3067 | CostKind: Ctx.CostKind); |
| 3068 | } |
| 3069 | |
| 3070 | void VPScalarIVStepsRecipe::execute(VPTransformState &State) { |
| 3071 | // Fast-math-flags propagate from the original induction instruction. |
| 3072 | IRBuilder<>::FastMathFlagGuard FMFG(State.Builder); |
| 3073 | State.Builder.setFastMathFlags(getFastMathFlagsOrNone()); |
| 3074 | |
| 3075 | /// Compute scalar induction steps. \p ScalarIV is the scalar induction |
| 3076 | /// variable on which to base the steps, \p Step is the size of the step. |
| 3077 | |
| 3078 | Value *BaseIV = State.get(Def: getOperand(N: 0), Lane: VPLane(0)); |
| 3079 | Value *Step = State.get(Def: getStepValue(), Lane: VPLane(0)); |
| 3080 | IRBuilderBase &Builder = State.Builder; |
| 3081 | |
| 3082 | // Ensure step has the same type as that of scalar IV. |
| 3083 | Type *BaseIVTy = BaseIV->getType()->getScalarType(); |
| 3084 | assert(BaseIVTy == Step->getType() && "Types of BaseIV and Step must match!" ); |
| 3085 | |
| 3086 | // We build scalar steps for both integer and floating-point induction |
| 3087 | // variables. Here, we determine the kind of arithmetic we will perform. |
| 3088 | Instruction::BinaryOps AddOp; |
| 3089 | Instruction::BinaryOps MulOp; |
| 3090 | if (BaseIVTy->isIntegerTy()) { |
| 3091 | AddOp = Instruction::Add; |
| 3092 | MulOp = Instruction::Mul; |
| 3093 | } else { |
| 3094 | AddOp = InductionOpcode; |
| 3095 | MulOp = Instruction::FMul; |
| 3096 | } |
| 3097 | |
| 3098 | // Determine the number of scalars we need to generate. |
| 3099 | bool FirstLaneOnly = vputils::onlyFirstLaneUsed(Def: this); |
| 3100 | // Compute the scalar steps and save the results in State. |
| 3101 | |
| 3102 | unsigned EndLane = FirstLaneOnly ? 1 : State.VF.getKnownMinValue(); |
| 3103 | Value *StartIdx0 = getStartIndex() ? State.get(Def: getStartIndex(), IsScalar: true) |
| 3104 | : Constant::getNullValue(Ty: BaseIVTy); |
| 3105 | |
| 3106 | for (unsigned Lane = 0; Lane < EndLane; ++Lane) { |
| 3107 | // It is okay if the induction variable type cannot hold the lane number, |
| 3108 | // we expect truncation in this case. |
| 3109 | Constant *LaneValue = |
| 3110 | BaseIVTy->isIntegerTy() |
| 3111 | ? ConstantInt::get(Ty: BaseIVTy, V: Lane, /*IsSigned=*/false, |
| 3112 | /*ImplicitTrunc=*/true) |
| 3113 | : ConstantFP::get(Ty: BaseIVTy, V: Lane); |
| 3114 | Value *StartIdx = Builder.CreateBinOp(Opc: AddOp, LHS: StartIdx0, RHS: LaneValue); |
| 3115 | assert((State.VF.isScalable() || isa<Constant>(StartIdx)) && |
| 3116 | "Expected StartIdx to be folded to a constant when VF is not " |
| 3117 | "scalable" ); |
| 3118 | auto *Mul = Builder.CreateBinOp(Opc: MulOp, LHS: StartIdx, RHS: Step); |
| 3119 | auto *Add = Builder.CreateBinOp(Opc: AddOp, LHS: BaseIV, RHS: Mul); |
| 3120 | State.set(Def: this, V: Add, Lane: VPLane(Lane)); |
| 3121 | } |
| 3122 | } |
| 3123 | |
| 3124 | #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) |
| 3125 | void VPScalarIVStepsRecipe::printRecipe(raw_ostream &O, const Twine &Indent, |
| 3126 | VPSlotTracker &SlotTracker) const { |
| 3127 | O << Indent; |
| 3128 | printAsOperand(O, SlotTracker); |
| 3129 | O << " = SCALAR-STEPS " ; |
| 3130 | printOperands(O, SlotTracker); |
| 3131 | } |
| 3132 | #endif |
| 3133 | |
| 3134 | bool VPWidenGEPRecipe::usesFirstLaneOnly(const VPValue *Op) const { |
| 3135 | assert(is_contained(operands(), Op) && "Op must be an operand of the recipe" ); |
| 3136 | return vputils::isSingleScalar(VPV: Op); |
| 3137 | } |
| 3138 | |
| 3139 | void VPWidenGEPRecipe::execute(VPTransformState &State) { |
| 3140 | assert(State.VF.isVector() && "not widening" ); |
| 3141 | auto Ops = map_to_vector(C: operands(), F: [&](VPValue *Op) { |
| 3142 | return State.get(Def: Op, IsScalar: vputils::isSingleScalar(VPV: Op)); |
| 3143 | }); |
| 3144 | auto *GEP = |
| 3145 | State.Builder.CreateGEP(Ty: getSourceElementType(), Ptr: Ops.front(), |
| 3146 | IdxList: drop_begin(RangeOrContainer&: Ops), Name: "wide.gep" , NW: getGEPNoWrapFlags()); |
| 3147 | State.set(Def: this, V: GEP, IsScalar: vputils::isSingleScalar(VPV: this)); |
| 3148 | } |
| 3149 | |
| 3150 | #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) |
| 3151 | void VPWidenGEPRecipe::printRecipe(raw_ostream &O, const Twine &Indent, |
| 3152 | VPSlotTracker &SlotTracker) const { |
| 3153 | O << Indent << "WIDEN-GEP " ; |
| 3154 | printAsOperand(O, SlotTracker); |
| 3155 | O << " = getelementptr" ; |
| 3156 | printFlags(O); |
| 3157 | printOperands(O, SlotTracker); |
| 3158 | } |
| 3159 | #endif |
| 3160 | |
| 3161 | void VPVectorEndPointerRecipe::materializeOffset(unsigned Part) { |
| 3162 | assert(!getOffset() && "Unexpected offset operand" ); |
| 3163 | VPBuilder Builder(this); |
| 3164 | VPlan &Plan = *getParent()->getPlan(); |
| 3165 | VPValue *VFVal = getVFValue(); |
| 3166 | const DataLayout &DL = Plan.getDataLayout(); |
| 3167 | Type *IndexTy = DL.getIndexType(PtrTy: this->getScalarType()); |
| 3168 | VPValue *Stride = |
| 3169 | Plan.getConstantInt(Ty: IndexTy, Val: getStride(), /*IsSigned=*/true); |
| 3170 | Type *VFTy = VFVal->getScalarType(); |
| 3171 | VPValue *VF = Builder.createScalarZExtOrTrunc(Op: VFVal, ResultTy: IndexTy, SrcTy: VFTy, |
| 3172 | DL: DebugLoc::getUnknown()); |
| 3173 | |
| 3174 | // Offset for Part0 = Offset0 = Stride * (VF - 1). |
| 3175 | VPInstruction *VFMinusOne = |
| 3176 | Builder.createSub(LHS: VF, RHS: Plan.getConstantInt(Ty: IndexTy, Val: 1u), |
| 3177 | DL: DebugLoc::getUnknown(), Name: "" , WrapFlags: {true, true}); |
| 3178 | VPInstruction *Offset0 = |
| 3179 | Builder.createOverflowingOp(Opcode: Instruction::Mul, Operands: {VFMinusOne, Stride}); |
| 3180 | |
| 3181 | // Offset for PartN = Offset0 + Part * Stride * VF. |
| 3182 | VPValue *PartxStride = |
| 3183 | Plan.getConstantInt(Ty: IndexTy, Val: Part * getStride(), /*IsSigned=*/true); |
| 3184 | VPValue *Offset = Builder.createAdd( |
| 3185 | LHS: Offset0, |
| 3186 | RHS: Builder.createOverflowingOp(Opcode: Instruction::Mul, Operands: {PartxStride, VF})); |
| 3187 | addOffset(Offset); |
| 3188 | } |
| 3189 | |
| 3190 | void VPVectorEndPointerRecipe::execute(VPTransformState &State) { |
| 3191 | auto &Builder = State.Builder; |
| 3192 | assert(getOffset() && "Expected prior materialization of offset" ); |
| 3193 | Value *Ptr = State.get(Def: getPointer(), IsScalar: true); |
| 3194 | Value *Offset = State.get(Def: getOffset(), IsScalar: true); |
| 3195 | Value *ResultPtr = Builder.CreateGEP(Ty: getSourceElementType(), Ptr, IdxList: Offset, Name: "" , |
| 3196 | NW: getGEPNoWrapFlags()); |
| 3197 | State.set(Def: this, V: ResultPtr, /*IsScalar*/ true); |
| 3198 | } |
| 3199 | |
| 3200 | #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) |
| 3201 | void VPVectorEndPointerRecipe::printRecipe(raw_ostream &O, const Twine &Indent, |
| 3202 | VPSlotTracker &SlotTracker) const { |
| 3203 | O << Indent; |
| 3204 | printAsOperand(O, SlotTracker); |
| 3205 | O << " = vector-end-pointer" ; |
| 3206 | printFlags(O); |
| 3207 | printOperands(O, SlotTracker); |
| 3208 | } |
| 3209 | #endif |
| 3210 | |
| 3211 | void VPVectorPointerRecipe::execute(VPTransformState &State) { |
| 3212 | assert(getVFxPart() && |
| 3213 | "Expected prior simplification of recipe without VFxPart" ); |
| 3214 | |
| 3215 | auto &Builder = State.Builder; |
| 3216 | Value *Ptr = State.get(Def: getOperand(N: 0), Lane: VPLane(0)); |
| 3217 | Value *Offset = State.get(Def: getVFxPart(), IsScalar: true); |
| 3218 | // TODO: Expand to VPInstruction to support constant folding. |
| 3219 | if (!match(V: getStride(), P: m_One())) { |
| 3220 | Value *Stride = Builder.CreateZExtOrTrunc(V: State.get(Def: getStride(), IsScalar: true), |
| 3221 | DestTy: Offset->getType()); |
| 3222 | Offset = Builder.CreateMul(LHS: Offset, RHS: Stride); |
| 3223 | } |
| 3224 | Value *ResultPtr = Builder.CreateGEP(Ty: getSourceElementType(), Ptr, IdxList: Offset, Name: "" , |
| 3225 | NW: getGEPNoWrapFlags()); |
| 3226 | State.set(Def: this, V: ResultPtr, /*IsScalar*/ true); |
| 3227 | } |
| 3228 | |
| 3229 | #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) |
| 3230 | void VPVectorPointerRecipe::printRecipe(raw_ostream &O, const Twine &Indent, |
| 3231 | VPSlotTracker &SlotTracker) const { |
| 3232 | O << Indent; |
| 3233 | printAsOperand(O, SlotTracker); |
| 3234 | O << " = vector-pointer" ; |
| 3235 | printFlags(O); |
| 3236 | printOperands(O, SlotTracker); |
| 3237 | } |
| 3238 | #endif |
| 3239 | |
| 3240 | InstructionCost VPBlendRecipe::computeCost(ElementCount VF, |
| 3241 | VPCostContext &Ctx) const { |
| 3242 | // A blend will be expanded to a select VPInstruction, which will generate a |
| 3243 | // scalar select if only the first lane is used. |
| 3244 | if (vputils::onlyFirstLaneUsed(Def: this)) |
| 3245 | VF = ElementCount::getFixed(MinVal: 1); |
| 3246 | |
| 3247 | Type *ResultTy = toVectorTy(Scalar: this->getScalarType(), EC: VF); |
| 3248 | Type *CmpTy = toVectorTy(Scalar: Type::getInt1Ty(C&: Ctx.LLVMCtx), EC: VF); |
| 3249 | return (getNumIncomingValues() - 1) * |
| 3250 | Ctx.TTI.getCmpSelInstrCost(Opcode: Instruction::Select, ValTy: ResultTy, CondTy: CmpTy, |
| 3251 | VecPred: CmpInst::BAD_ICMP_PREDICATE, CostKind: Ctx.CostKind); |
| 3252 | } |
| 3253 | |
| 3254 | #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) |
| 3255 | void VPBlendRecipe::printRecipe(raw_ostream &O, const Twine &Indent, |
| 3256 | VPSlotTracker &SlotTracker) const { |
| 3257 | O << Indent << "BLEND " ; |
| 3258 | printAsOperand(O, SlotTracker); |
| 3259 | O << " =" ; |
| 3260 | printFlags(O); |
| 3261 | if (getNumIncomingValues() == 1) { |
| 3262 | // Not a User of any mask: not really blending, this is a |
| 3263 | // single-predecessor phi. |
| 3264 | getIncomingValue(0)->printAsOperand(O, SlotTracker); |
| 3265 | } else { |
| 3266 | for (unsigned I = 0, E = getNumIncomingValues(); I < E; ++I) { |
| 3267 | if (I != 0) |
| 3268 | O << " " ; |
| 3269 | getIncomingValue(I)->printAsOperand(O, SlotTracker); |
| 3270 | if (I == 0 && isNormalized()) |
| 3271 | continue; |
| 3272 | O << "/" ; |
| 3273 | getMask(I)->printAsOperand(O, SlotTracker); |
| 3274 | } |
| 3275 | } |
| 3276 | } |
| 3277 | #endif |
| 3278 | |
| 3279 | void VPReductionRecipe::execute(VPTransformState &State) { |
| 3280 | RecurKind Kind = getRecurrenceKind(); |
| 3281 | assert(!RecurrenceDescriptor::isAnyOfRecurrenceKind(Kind) && |
| 3282 | "In-loop AnyOf reductions aren't currently supported" ); |
| 3283 | // Propagate the fast-math flags carried by the underlying instruction. |
| 3284 | IRBuilderBase::FastMathFlagGuard FMFGuard(State.Builder); |
| 3285 | State.Builder.setFastMathFlags(getFastMathFlagsOrNone()); |
| 3286 | Value *NewVecOp = State.get(Def: getVecOp()); |
| 3287 | if (VPValue *Cond = getCondOp()) { |
| 3288 | Value *NewCond = State.get(Def: Cond, IsScalar: State.VF.isScalar()); |
| 3289 | VectorType *VecTy = dyn_cast<VectorType>(Val: NewVecOp->getType()); |
| 3290 | Type *ElementTy = VecTy ? VecTy->getElementType() : NewVecOp->getType(); |
| 3291 | |
| 3292 | Value *Start = |
| 3293 | getRecurrenceIdentity(K: Kind, Tp: ElementTy, FMF: getFastMathFlagsOrNone()); |
| 3294 | if (State.VF.isVector()) |
| 3295 | Start = State.Builder.CreateVectorSplat(EC: VecTy->getElementCount(), V: Start); |
| 3296 | |
| 3297 | Value *Select = State.Builder.CreateSelect(C: NewCond, True: NewVecOp, False: Start); |
| 3298 | NewVecOp = Select; |
| 3299 | } |
| 3300 | Value *NewRed; |
| 3301 | Value *NextInChain; |
| 3302 | if (isOrdered()) { |
| 3303 | Value *PrevInChain = State.get(Def: getChainOp(), /*IsScalar*/ true); |
| 3304 | if (State.VF.isVector()) |
| 3305 | NewRed = |
| 3306 | createOrderedReduction(B&: State.Builder, RdxKind: Kind, Src: NewVecOp, Start: PrevInChain); |
| 3307 | else |
| 3308 | NewRed = State.Builder.CreateBinOp( |
| 3309 | Opc: (Instruction::BinaryOps)RecurrenceDescriptor::getOpcode(Kind), |
| 3310 | LHS: PrevInChain, RHS: NewVecOp); |
| 3311 | PrevInChain = NewRed; |
| 3312 | NextInChain = NewRed; |
| 3313 | } else if (isPartialReduction()) { |
| 3314 | assert((Kind == RecurKind::Add || Kind == RecurKind::FAdd) && |
| 3315 | "Unexpected partial reduction kind" ); |
| 3316 | Value *PrevInChain = State.get(Def: getChainOp(), /*IsScalar*/ false); |
| 3317 | NewRed = State.Builder.CreateIntrinsic( |
| 3318 | RetTy: PrevInChain->getType(), |
| 3319 | ID: Kind == RecurKind::Add ? Intrinsic::vector_partial_reduce_add |
| 3320 | : Intrinsic::vector_partial_reduce_fadd, |
| 3321 | Args: {PrevInChain, NewVecOp}, FMFSource: State.Builder.getFastMathFlags(), |
| 3322 | Name: "partial.reduce" ); |
| 3323 | PrevInChain = NewRed; |
| 3324 | NextInChain = NewRed; |
| 3325 | } else { |
| 3326 | assert(isInLoop() && |
| 3327 | "The reduction must either be ordered, partial or in-loop" ); |
| 3328 | Value *PrevInChain = State.get(Def: getChainOp(), /*IsScalar*/ true); |
| 3329 | NewRed = createSimpleReduction(B&: State.Builder, Src: NewVecOp, RdxKind: Kind); |
| 3330 | if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) |
| 3331 | NextInChain = createMinMaxOp(Builder&: State.Builder, RK: Kind, Left: NewRed, Right: PrevInChain); |
| 3332 | else |
| 3333 | NextInChain = State.Builder.CreateBinOp( |
| 3334 | Opc: (Instruction::BinaryOps)RecurrenceDescriptor::getOpcode(Kind), |
| 3335 | LHS: PrevInChain, RHS: NewRed); |
| 3336 | } |
| 3337 | State.set(Def: this, V: NextInChain, /*IsScalar*/ !isPartialReduction()); |
| 3338 | } |
| 3339 | |
| 3340 | void VPReductionEVLRecipe::execute(VPTransformState &State) { |
| 3341 | |
| 3342 | auto &Builder = State.Builder; |
| 3343 | // Propagate the fast-math flags carried by the underlying instruction. |
| 3344 | IRBuilderBase::FastMathFlagGuard FMFGuard(Builder); |
| 3345 | Builder.setFastMathFlags(getFastMathFlagsOrNone()); |
| 3346 | |
| 3347 | RecurKind Kind = getRecurrenceKind(); |
| 3348 | Value *Prev = State.get(Def: getChainOp(), /*IsScalar*/ true); |
| 3349 | Value *VecOp = State.get(Def: getVecOp()); |
| 3350 | Value *EVL = State.get(Def: getEVL(), Lane: VPLane(0)); |
| 3351 | |
| 3352 | Value *Mask; |
| 3353 | if (VPValue *CondOp = getCondOp()) |
| 3354 | Mask = State.get(Def: CondOp); |
| 3355 | else |
| 3356 | Mask = Builder.CreateVectorSplat(EC: State.VF, V: Builder.getTrue()); |
| 3357 | |
| 3358 | Value *NewRed; |
| 3359 | if (isOrdered()) { |
| 3360 | NewRed = createOrderedReduction(B&: Builder, RdxKind: Kind, Src: VecOp, Start: Prev, Mask, EVL); |
| 3361 | } else { |
| 3362 | NewRed = createSimpleReduction(B&: Builder, Src: VecOp, RdxKind: Kind, Mask, EVL); |
| 3363 | if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) |
| 3364 | NewRed = createMinMaxOp(Builder, RK: Kind, Left: NewRed, Right: Prev); |
| 3365 | else |
| 3366 | NewRed = Builder.CreateBinOp( |
| 3367 | Opc: (Instruction::BinaryOps)RecurrenceDescriptor::getOpcode(Kind), LHS: NewRed, |
| 3368 | RHS: Prev); |
| 3369 | } |
| 3370 | State.set(Def: this, V: NewRed, /*IsScalar*/ true); |
| 3371 | } |
| 3372 | |
| 3373 | InstructionCost VPReductionRecipe::computeCost(ElementCount VF, |
| 3374 | VPCostContext &Ctx) const { |
| 3375 | RecurKind RdxKind = getRecurrenceKind(); |
| 3376 | Type *ElementTy = this->getScalarType(); |
| 3377 | auto *VectorTy = cast<VectorType>(Val: toVectorTy(Scalar: ElementTy, EC: VF)); |
| 3378 | unsigned Opcode = RecurrenceDescriptor::getOpcode(Kind: RdxKind); |
| 3379 | FastMathFlags FMFs = getFastMathFlagsOrNone(); |
| 3380 | std::optional<FastMathFlags> OptionalFMF = |
| 3381 | ElementTy->isFloatingPointTy() ? std::make_optional(t&: FMFs) : std::nullopt; |
| 3382 | |
| 3383 | if (isPartialReduction()) { |
| 3384 | InstructionCost CondCost = 0; |
| 3385 | if (isConditional()) { |
| 3386 | CmpInst::Predicate Pred = CmpInst::BAD_ICMP_PREDICATE; |
| 3387 | auto *CondTy = |
| 3388 | cast<VectorType>(Val: toVectorTy(Scalar: getCondOp()->getScalarType(), EC: VF)); |
| 3389 | CondCost = Ctx.TTI.getCmpSelInstrCost(Opcode: Instruction::Select, ValTy: VectorTy, |
| 3390 | CondTy, VecPred: Pred, CostKind: Ctx.CostKind); |
| 3391 | } |
| 3392 | return CondCost + Ctx.TTI.getPartialReductionCost( |
| 3393 | Opcode, InputTypeA: ElementTy, InputTypeB: ElementTy, AccumType: ElementTy, VF, |
| 3394 | OpAExtend: TTI::PR_None, OpBExtend: TTI::PR_None, BinOp: {}, CostKind: Ctx.CostKind, |
| 3395 | FMF: OptionalFMF); |
| 3396 | } |
| 3397 | |
| 3398 | // TODO: Support any-of reductions. |
| 3399 | assert( |
| 3400 | (!RecurrenceDescriptor::isAnyOfRecurrenceKind(RdxKind) || |
| 3401 | ForceTargetInstructionCost.getNumOccurrences() > 0) && |
| 3402 | "Any-of reduction not implemented in VPlan-based cost model currently." ); |
| 3403 | |
| 3404 | // Note that TTI should model the cost of moving result to the scalar register |
| 3405 | // and the BinOp cost in the getMinMaxReductionCost(). |
| 3406 | if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind: RdxKind)) { |
| 3407 | Intrinsic::ID Id = getMinMaxReductionIntrinsicOp(RK: RdxKind); |
| 3408 | return Ctx.TTI.getMinMaxReductionCost(IID: Id, Ty: VectorTy, FMF: FMFs, CostKind: Ctx.CostKind); |
| 3409 | } |
| 3410 | |
| 3411 | // Note that TTI should model the cost of moving result to the scalar register |
| 3412 | // and the BinOp cost in the getArithmeticReductionCost(). |
| 3413 | return Ctx.TTI.getArithmeticReductionCost(Opcode, Ty: VectorTy, FMF: OptionalFMF, |
| 3414 | CostKind: Ctx.CostKind); |
| 3415 | } |
| 3416 | |
| 3417 | VPExpressionRecipe::VPExpressionRecipe( |
| 3418 | ExpressionTypes ExpressionType, |
| 3419 | ArrayRef<VPSingleDefRecipe *> ExpressionRecipes) |
| 3420 | : VPSingleDefRecipe(VPRecipeBase::VPExpressionSC, {}, |
| 3421 | cast<VPReductionRecipe>(Val: ExpressionRecipes.back()) |
| 3422 | ->getChainOp() |
| 3423 | ->getScalarType()), |
| 3424 | ExpressionRecipes(ExpressionRecipes), ExpressionType(ExpressionType) { |
| 3425 | assert(!ExpressionRecipes.empty() && "Nothing to combine?" ); |
| 3426 | assert( |
| 3427 | none_of(ExpressionRecipes, |
| 3428 | [](VPSingleDefRecipe *R) { return R->mayHaveSideEffects(); }) && |
| 3429 | "expression cannot contain recipes with side-effects" ); |
| 3430 | |
| 3431 | // Maintain a copy of the expression recipes as a set of users. |
| 3432 | SmallPtrSet<VPUser *, 4> ExpressionRecipesAsSetOfUsers; |
| 3433 | for (auto *R : ExpressionRecipes) |
| 3434 | ExpressionRecipesAsSetOfUsers.insert(Ptr: R); |
| 3435 | |
| 3436 | // Recipes in the expression, except the last one, must only be used by |
| 3437 | // (other) recipes inside the expression. If there are other users, external |
| 3438 | // to the expression, use a clone of the recipe for external users. |
| 3439 | for (VPSingleDefRecipe *R : reverse(C&: ExpressionRecipes)) { |
| 3440 | if (R != ExpressionRecipes.back() && |
| 3441 | any_of(Range: R->users(), P: [&ExpressionRecipesAsSetOfUsers](VPUser *U) { |
| 3442 | return !ExpressionRecipesAsSetOfUsers.contains(Ptr: U); |
| 3443 | })) { |
| 3444 | // There are users outside of the expression. Clone the recipe and use the |
| 3445 | // clone those external users. |
| 3446 | VPSingleDefRecipe *CopyForExtUsers = R->clone(); |
| 3447 | R->replaceUsesWithIf(New: CopyForExtUsers, ShouldReplace: [&ExpressionRecipesAsSetOfUsers]( |
| 3448 | VPUser &U, unsigned) { |
| 3449 | return !ExpressionRecipesAsSetOfUsers.contains(Ptr: &U); |
| 3450 | }); |
| 3451 | CopyForExtUsers->insertBefore(InsertPos: R); |
| 3452 | } |
| 3453 | if (R->getParent()) |
| 3454 | R->removeFromParent(); |
| 3455 | } |
| 3456 | |
| 3457 | // Internalize all external operands to the expression recipes. To do so, |
| 3458 | // create new temporary VPValues for all operands defined by a recipe outside |
| 3459 | // the expression. The original operands are added as operands of the |
| 3460 | // VPExpressionRecipe itself. |
| 3461 | for (auto *R : ExpressionRecipes) { |
| 3462 | for (const auto &[Idx, Op] : enumerate(First: R->operands())) { |
| 3463 | auto *Def = Op->getDefiningRecipe(); |
| 3464 | if (Def && ExpressionRecipesAsSetOfUsers.contains(Ptr: Def)) |
| 3465 | continue; |
| 3466 | addOperand(Operand: Op); |
| 3467 | LiveInPlaceholders.push_back(Elt: new VPSymbolicValue(Op->getScalarType())); |
| 3468 | } |
| 3469 | } |
| 3470 | |
| 3471 | // Replace each external operand with the first one created for it in |
| 3472 | // LiveInPlaceholders. |
| 3473 | for (auto *R : ExpressionRecipes) |
| 3474 | for (auto const &[LiveIn, Tmp] : zip(t: operands(), u&: LiveInPlaceholders)) |
| 3475 | R->replaceUsesOfWith(From: LiveIn, To: Tmp); |
| 3476 | } |
| 3477 | |
| 3478 | void VPExpressionRecipe::decompose() { |
| 3479 | for (auto *R : ExpressionRecipes) |
| 3480 | // Since the list could contain duplicates, make sure the recipe hasn't |
| 3481 | // already been inserted. |
| 3482 | if (!R->getParent()) |
| 3483 | R->insertBefore(InsertPos: this); |
| 3484 | |
| 3485 | for (const auto &[Idx, Op] : enumerate(First: operands())) |
| 3486 | LiveInPlaceholders[Idx]->replaceAllUsesWith(New: Op); |
| 3487 | |
| 3488 | replaceAllUsesWith(New: ExpressionRecipes.back()); |
| 3489 | ExpressionRecipes.clear(); |
| 3490 | } |
| 3491 | |
| 3492 | InstructionCost VPExpressionRecipe::computeCost(ElementCount VF, |
| 3493 | VPCostContext &Ctx) const { |
| 3494 | Type *RedTy = this->getScalarType(); |
| 3495 | auto *SrcVecTy = |
| 3496 | cast<VectorType>(Val: toVectorTy(Scalar: getOperand(N: 0)->getScalarType(), EC: VF)); |
| 3497 | unsigned Opcode = RecurrenceDescriptor::getOpcode( |
| 3498 | Kind: cast<VPReductionRecipe>(Val: ExpressionRecipes.back())->getRecurrenceKind()); |
| 3499 | switch (ExpressionType) { |
| 3500 | case ExpressionTypes::NegatedExtendedReduction: |
| 3501 | assert((Opcode == Instruction::Add || Opcode == Instruction::FAdd) && |
| 3502 | "Unexpected opcode" ); |
| 3503 | Opcode = Opcode == Instruction::Add ? Instruction::Sub : Instruction::FSub; |
| 3504 | [[fallthrough]]; |
| 3505 | case ExpressionTypes::ExtendedReduction: { |
| 3506 | auto *RedR = cast<VPReductionRecipe>(Val: ExpressionRecipes.back()); |
| 3507 | auto *ExtR = cast<VPWidenCastRecipe>(Val: ExpressionRecipes[0]); |
| 3508 | |
| 3509 | if (RedR->isPartialReduction()) |
| 3510 | return Ctx.TTI.getPartialReductionCost( |
| 3511 | Opcode, InputTypeA: getOperand(N: 0)->getScalarType(), InputTypeB: nullptr, AccumType: RedTy, VF, |
| 3512 | OpAExtend: TargetTransformInfo::getPartialReductionExtendKind(CastOpc: ExtR->getOpcode()), |
| 3513 | OpBExtend: TargetTransformInfo::PR_None, BinOp: std::nullopt, CostKind: Ctx.CostKind, |
| 3514 | FMF: RedTy->isFloatingPointTy() |
| 3515 | ? std::optional{RedR->getFastMathFlagsOrNone()} |
| 3516 | : std::nullopt); |
| 3517 | else if (!RedTy->isFloatingPointTy()) |
| 3518 | // TTI::getExtendedReductionCost only supports integer types. |
| 3519 | return Ctx.TTI.getExtendedReductionCost( |
| 3520 | Opcode, IsUnsigned: ExtR->getOpcode() == Instruction::ZExt, ResTy: RedTy, Ty: SrcVecTy, |
| 3521 | FMF: std::nullopt, CostKind: Ctx.CostKind); |
| 3522 | else |
| 3523 | return InstructionCost::getInvalid(); |
| 3524 | } |
| 3525 | case ExpressionTypes::MulAccReduction: |
| 3526 | return Ctx.TTI.getMulAccReductionCost(IsUnsigned: false, RedOpcode: Opcode, ResTy: RedTy, Ty: SrcVecTy, |
| 3527 | CostKind: Ctx.CostKind); |
| 3528 | |
| 3529 | case ExpressionTypes::ExtNegatedMulAccReduction: |
| 3530 | switch (Opcode) { |
| 3531 | case Instruction::Add: |
| 3532 | Opcode = Instruction::Sub; |
| 3533 | break; |
| 3534 | case Instruction::FAdd: |
| 3535 | Opcode = Instruction::FSub; |
| 3536 | break; |
| 3537 | default: |
| 3538 | llvm_unreachable("Unsupported opcode for ExtNegatedMulAccReduction" ); |
| 3539 | } |
| 3540 | [[fallthrough]]; |
| 3541 | case ExpressionTypes::ExtMulAccReduction: { |
| 3542 | auto *RedR = cast<VPReductionRecipe>(Val: ExpressionRecipes.back()); |
| 3543 | if (RedR->isPartialReduction()) { |
| 3544 | auto *Ext0R = cast<VPWidenCastRecipe>(Val: ExpressionRecipes[0]); |
| 3545 | auto *Ext1R = cast<VPWidenCastRecipe>(Val: ExpressionRecipes[1]); |
| 3546 | auto *Mul = cast<VPWidenRecipe>(Val: ExpressionRecipes[2]); |
| 3547 | return Ctx.TTI.getPartialReductionCost( |
| 3548 | Opcode, InputTypeA: getOperand(N: 0)->getScalarType(), |
| 3549 | InputTypeB: getOperand(N: 1)->getScalarType(), AccumType: RedTy, VF, |
| 3550 | OpAExtend: TargetTransformInfo::getPartialReductionExtendKind( |
| 3551 | CastOpc: Ext0R->getOpcode()), |
| 3552 | OpBExtend: TargetTransformInfo::getPartialReductionExtendKind( |
| 3553 | CastOpc: Ext1R->getOpcode()), |
| 3554 | BinOp: Mul->getOpcode(), CostKind: Ctx.CostKind, |
| 3555 | FMF: RedTy->isFloatingPointTy() |
| 3556 | ? std::optional{RedR->getFastMathFlagsOrNone()} |
| 3557 | : std::nullopt); |
| 3558 | } |
| 3559 | assert(Opcode != Instruction::FSub && "Only integer types are supported" ); |
| 3560 | return Ctx.TTI.getMulAccReductionCost( |
| 3561 | IsUnsigned: cast<VPWidenCastRecipe>(Val: ExpressionRecipes.front())->getOpcode() == |
| 3562 | Instruction::ZExt, |
| 3563 | RedOpcode: Opcode, ResTy: RedTy, Ty: SrcVecTy, CostKind: Ctx.CostKind); |
| 3564 | } |
| 3565 | } |
| 3566 | llvm_unreachable("Unknown VPExpressionRecipe::ExpressionTypes enum" ); |
| 3567 | } |
| 3568 | |
| 3569 | bool VPExpressionRecipe::mayReadOrWriteMemory() const { |
| 3570 | return any_of(Range: ExpressionRecipes, P: [](VPSingleDefRecipe *R) { |
| 3571 | return R->mayReadFromMemory() || R->mayWriteToMemory(); |
| 3572 | }); |
| 3573 | } |
| 3574 | |
| 3575 | bool VPExpressionRecipe::mayHaveSideEffects() const { |
| 3576 | assert( |
| 3577 | none_of(ExpressionRecipes, |
| 3578 | [](VPSingleDefRecipe *R) { return R->mayHaveSideEffects(); }) && |
| 3579 | "expression cannot contain recipes with side-effects" ); |
| 3580 | return false; |
| 3581 | } |
| 3582 | |
| 3583 | bool VPExpressionRecipe::isVectorToScalar() const { |
| 3584 | auto *RR = dyn_cast<VPReductionRecipe>(Val: ExpressionRecipes.back()); |
| 3585 | return RR && !RR->isPartialReduction(); |
| 3586 | } |
| 3587 | |
| 3588 | #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) |
| 3589 | |
| 3590 | void VPExpressionRecipe::printRecipe(raw_ostream &O, const Twine &Indent, |
| 3591 | VPSlotTracker &SlotTracker) const { |
| 3592 | O << Indent << "EXPRESSION " ; |
| 3593 | printAsOperand(O, SlotTracker); |
| 3594 | O << " = " ; |
| 3595 | auto *Red = cast<VPReductionRecipe>(ExpressionRecipes.back()); |
| 3596 | unsigned Opcode = RecurrenceDescriptor::getOpcode(Red->getRecurrenceKind()); |
| 3597 | VPValue *RdxStart = |
| 3598 | getOperand(getNumOperands() - (Red->isConditional() ? 2 : 1)); |
| 3599 | |
| 3600 | switch (ExpressionType) { |
| 3601 | case ExpressionTypes::NegatedExtendedReduction: |
| 3602 | case ExpressionTypes::ExtendedReduction: { |
| 3603 | bool Negated = ExpressionType == ExpressionTypes::NegatedExtendedReduction; |
| 3604 | getOperand(getNumOperands() - 1)->printAsOperand(O, SlotTracker); |
| 3605 | O << " + " << (Red->isPartialReduction() ? "partial." : "" ) << "reduce." ; |
| 3606 | O << Instruction::getOpcodeName(Opcode) << " (" ; |
| 3607 | if (Negated) |
| 3608 | O << (Opcode == Instruction::Add ? "sub (0, " : "fneg(" ); |
| 3609 | getOperand(0)->printAsOperand(O, SlotTracker); |
| 3610 | if (Negated) |
| 3611 | O << ")" ; |
| 3612 | Red->printFlags(O); |
| 3613 | |
| 3614 | auto *Ext0 = cast<VPWidenCastRecipe>(ExpressionRecipes[0]); |
| 3615 | O << Instruction::getOpcodeName(Ext0->getOpcode()) << " to " |
| 3616 | << *Ext0->getScalarType(); |
| 3617 | if (Red->isConditional()) { |
| 3618 | O << ", " ; |
| 3619 | getOperand(getNumOperands() - 1)->printAsOperand(O, SlotTracker); |
| 3620 | } |
| 3621 | O << ")" ; |
| 3622 | break; |
| 3623 | } |
| 3624 | case ExpressionTypes::ExtNegatedMulAccReduction: { |
| 3625 | RdxStart->printAsOperand(O, SlotTracker); |
| 3626 | O << " + " << (Red->isPartialReduction() ? "partial." : "" ) << "reduce." ; |
| 3627 | O << Instruction::getOpcodeName( |
| 3628 | RecurrenceDescriptor::getOpcode(Red->getRecurrenceKind())) |
| 3629 | << " (sub (0, mul" ; |
| 3630 | auto *Mul = cast<VPWidenRecipe>(ExpressionRecipes[2]); |
| 3631 | Mul->printFlags(O); |
| 3632 | O << "(" ; |
| 3633 | getOperand(0)->printAsOperand(O, SlotTracker); |
| 3634 | auto *Ext0 = cast<VPWidenCastRecipe>(ExpressionRecipes[0]); |
| 3635 | O << " " << Instruction::getOpcodeName(Ext0->getOpcode()) << " to " |
| 3636 | << *Ext0->getScalarType() << "), (" ; |
| 3637 | getOperand(1)->printAsOperand(O, SlotTracker); |
| 3638 | auto *Ext1 = cast<VPWidenCastRecipe>(ExpressionRecipes[1]); |
| 3639 | O << " " << Instruction::getOpcodeName(Ext1->getOpcode()) << " to " |
| 3640 | << *Ext1->getScalarType() << ")" ; |
| 3641 | if (Red->isConditional()) { |
| 3642 | O << ", " ; |
| 3643 | getOperand(getNumOperands() - 1)->printAsOperand(O, SlotTracker); |
| 3644 | } |
| 3645 | O << "))" ; |
| 3646 | break; |
| 3647 | } |
| 3648 | case ExpressionTypes::MulAccReduction: |
| 3649 | case ExpressionTypes::ExtMulAccReduction: { |
| 3650 | RdxStart->printAsOperand(O, SlotTracker); |
| 3651 | O << " + " << (Red->isPartialReduction() ? "partial." : "" ) << "reduce." ; |
| 3652 | O << Instruction::getOpcodeName( |
| 3653 | RecurrenceDescriptor::getOpcode(Red->getRecurrenceKind())) |
| 3654 | << " (" ; |
| 3655 | O << "mul" ; |
| 3656 | bool IsExtended = ExpressionType == ExpressionTypes::ExtMulAccReduction; |
| 3657 | auto *Mul = cast<VPWidenRecipe>(IsExtended ? ExpressionRecipes[2] |
| 3658 | : ExpressionRecipes[0]); |
| 3659 | Mul->printFlags(O); |
| 3660 | if (IsExtended) |
| 3661 | O << "(" ; |
| 3662 | getOperand(0)->printAsOperand(O, SlotTracker); |
| 3663 | if (IsExtended) { |
| 3664 | auto *Ext0 = cast<VPWidenCastRecipe>(ExpressionRecipes[0]); |
| 3665 | O << " " << Instruction::getOpcodeName(Ext0->getOpcode()) << " to " |
| 3666 | << *Ext0->getScalarType() << "), (" ; |
| 3667 | } else { |
| 3668 | O << ", " ; |
| 3669 | } |
| 3670 | getOperand(1)->printAsOperand(O, SlotTracker); |
| 3671 | if (IsExtended) { |
| 3672 | auto *Ext1 = cast<VPWidenCastRecipe>(ExpressionRecipes[1]); |
| 3673 | O << " " << Instruction::getOpcodeName(Ext1->getOpcode()) << " to " |
| 3674 | << *Ext1->getScalarType() << ")" ; |
| 3675 | } |
| 3676 | if (Red->isConditional()) { |
| 3677 | O << ", " ; |
| 3678 | getOperand(getNumOperands() - 1)->printAsOperand(O, SlotTracker); |
| 3679 | } |
| 3680 | O << ")" ; |
| 3681 | break; |
| 3682 | } |
| 3683 | } |
| 3684 | } |
| 3685 | |
| 3686 | void VPReductionRecipe::printRecipe(raw_ostream &O, const Twine &Indent, |
| 3687 | VPSlotTracker &SlotTracker) const { |
| 3688 | if (isPartialReduction()) |
| 3689 | O << Indent << "PARTIAL-REDUCE " ; |
| 3690 | else |
| 3691 | O << Indent << "REDUCE " ; |
| 3692 | printAsOperand(O, SlotTracker); |
| 3693 | O << " = " ; |
| 3694 | getChainOp()->printAsOperand(O, SlotTracker); |
| 3695 | O << " +" ; |
| 3696 | printFlags(O); |
| 3697 | O << " reduce." ; |
| 3698 | printRecurrenceKind(O, getRecurrenceKind()); |
| 3699 | O << " (" ; |
| 3700 | getVecOp()->printAsOperand(O, SlotTracker); |
| 3701 | if (isConditional()) { |
| 3702 | O << ", " ; |
| 3703 | getCondOp()->printAsOperand(O, SlotTracker); |
| 3704 | } |
| 3705 | O << ")" ; |
| 3706 | } |
| 3707 | |
| 3708 | void VPReductionEVLRecipe::printRecipe(raw_ostream &O, const Twine &Indent, |
| 3709 | VPSlotTracker &SlotTracker) const { |
| 3710 | O << Indent << "REDUCE " ; |
| 3711 | printAsOperand(O, SlotTracker); |
| 3712 | O << " = " ; |
| 3713 | getChainOp()->printAsOperand(O, SlotTracker); |
| 3714 | O << " +" ; |
| 3715 | printFlags(O); |
| 3716 | O << " vp.reduce." |
| 3717 | << Instruction::getOpcodeName( |
| 3718 | RecurrenceDescriptor::getOpcode(getRecurrenceKind())) |
| 3719 | << " (" ; |
| 3720 | getVecOp()->printAsOperand(O, SlotTracker); |
| 3721 | O << ", " ; |
| 3722 | getEVL()->printAsOperand(O, SlotTracker); |
| 3723 | if (isConditional()) { |
| 3724 | O << ", " ; |
| 3725 | getCondOp()->printAsOperand(O, SlotTracker); |
| 3726 | } |
| 3727 | O << ")" ; |
| 3728 | } |
| 3729 | |
| 3730 | #endif |
| 3731 | |
| 3732 | void VPReplicateRecipe::execute(VPTransformState &State) { |
| 3733 | assert(IsSingleScalar && |
| 3734 | "VPReplicateRecipes must be unrolled before ::execute" ); |
| 3735 | auto *Instr = getUnderlyingInstr(); |
| 3736 | Instruction *Cloned = Instr->clone(); |
| 3737 | Type *ResultTy = getScalarType(); |
| 3738 | if (!ResultTy->isVoidTy()) { |
| 3739 | Cloned->setName(Instr->getName() + ".cloned" ); |
| 3740 | // The operands of the replicate recipe may have been narrowed, resulting in |
| 3741 | // a narrower result type. Update the type of the cloned instruction to the |
| 3742 | // correct type. |
| 3743 | if (ResultTy != Cloned->getType()) |
| 3744 | Cloned->mutateType(Ty: ResultTy); |
| 3745 | } |
| 3746 | |
| 3747 | applyFlags(I&: *Cloned); |
| 3748 | applyMetadata(I&: *Cloned); |
| 3749 | |
| 3750 | if (hasPredicate()) |
| 3751 | cast<CmpInst>(Val: Cloned)->setPredicate(getPredicate()); |
| 3752 | |
| 3753 | // Replace the operands of the cloned instructions with their scalar |
| 3754 | // equivalents in the new loop. |
| 3755 | for (const auto &[Idx, V] : enumerate(First: operands())) |
| 3756 | Cloned->setOperand(i: Idx, Val: State.get(Def: V, IsScalar: true)); |
| 3757 | |
| 3758 | // Place the cloned scalar in the new loop. |
| 3759 | State.Builder.Insert(I: Cloned); |
| 3760 | |
| 3761 | State.set(Def: this, V: Cloned, IsScalar: true); |
| 3762 | |
| 3763 | // If we just cloned a new assumption, add it the assumption cache. |
| 3764 | if (auto *II = dyn_cast<AssumeInst>(Val: Cloned)) |
| 3765 | State.AC->registerAssumption(CI: II); |
| 3766 | } |
| 3767 | |
| 3768 | /// Returns a SCEV expression for \p Ptr if it is a pointer computation for |
| 3769 | /// which the legacy cost model computes a SCEV expression when computing the |
| 3770 | /// address cost. Computing SCEVs for VPValues is incomplete and returns |
| 3771 | /// SCEVCouldNotCompute in cases the legacy cost model can compute SCEVs. In |
| 3772 | /// those cases we fall back to the legacy cost model. Otherwise return nullptr. |
| 3773 | static const SCEV *getAddressAccessSCEV(const VPValue *Ptr, |
| 3774 | PredicatedScalarEvolution &PSE, |
| 3775 | const Loop *L) { |
| 3776 | const SCEV *Addr = vputils::getSCEVExprForVPValue(V: Ptr, PSE, L); |
| 3777 | if (isa<SCEVCouldNotCompute>(Val: Addr)) |
| 3778 | return Addr; |
| 3779 | |
| 3780 | return vputils::isAddressSCEVForCost(Addr, SE&: *PSE.getSE(), L) ? Addr : nullptr; |
| 3781 | } |
| 3782 | |
| 3783 | InstructionCost VPReplicateRecipe::computeCost(ElementCount VF, |
| 3784 | VPCostContext &Ctx) const { |
| 3785 | Instruction *UI = cast<Instruction>(Val: getUnderlyingValue()); |
| 3786 | // VPReplicateRecipe may be cloned as part of an existing VPlan-to-VPlan |
| 3787 | // transform, avoid computing their cost multiple times for now. |
| 3788 | Ctx.SkipCostComputation.insert(Ptr: UI); |
| 3789 | |
| 3790 | if (VF.isScalable() && !isSingleScalar()) |
| 3791 | return InstructionCost::getInvalid(); |
| 3792 | |
| 3793 | switch (UI->getOpcode()) { |
| 3794 | case Instruction::Alloca: |
| 3795 | if (VF.isScalable()) |
| 3796 | return InstructionCost::getInvalid(); |
| 3797 | return Ctx.TTI.getArithmeticInstrCost(Opcode: Instruction::Mul, |
| 3798 | Ty: this->getScalarType(), CostKind: Ctx.CostKind); |
| 3799 | case Instruction::GetElementPtr: |
| 3800 | // We mark this instruction as zero-cost because the cost of GEPs in |
| 3801 | // vectorized code depends on whether the corresponding memory instruction |
| 3802 | // is scalarized or not. Therefore, we handle GEPs with the memory |
| 3803 | // instruction cost. |
| 3804 | return 0; |
| 3805 | case Instruction::Call: { |
| 3806 | auto *CalledFn = |
| 3807 | cast<Function>(Val: getOperand(N: getNumOperands() - 1)->getLiveInIRValue()); |
| 3808 | Type *ResultTy = this->getScalarType(); |
| 3809 | SmallVector<const VPValue *> ArgOps(drop_end(RangeOrContainer: operands())); |
| 3810 | return computeCallCost(CalledFn, ResultTy, ArgOps, IsSingleScalar: isSingleScalar(), VF, |
| 3811 | Ctx); |
| 3812 | } |
| 3813 | case Instruction::Add: |
| 3814 | case Instruction::Sub: |
| 3815 | case Instruction::FAdd: |
| 3816 | case Instruction::FSub: |
| 3817 | case Instruction::Mul: |
| 3818 | case Instruction::FMul: |
| 3819 | case Instruction::FDiv: |
| 3820 | case Instruction::FRem: |
| 3821 | case Instruction::Shl: |
| 3822 | case Instruction::LShr: |
| 3823 | case Instruction::AShr: |
| 3824 | case Instruction::And: |
| 3825 | case Instruction::Or: |
| 3826 | case Instruction::Xor: |
| 3827 | case Instruction::ICmp: |
| 3828 | case Instruction::FCmp: |
| 3829 | return getCostForRecipeWithOpcode(Opcode: getOpcode(), VF: ElementCount::getFixed(MinVal: 1), |
| 3830 | Ctx) * |
| 3831 | (isSingleScalar() ? 1 : VF.getFixedValue()); |
| 3832 | case Instruction::SDiv: |
| 3833 | case Instruction::UDiv: |
| 3834 | case Instruction::SRem: |
| 3835 | case Instruction::URem: { |
| 3836 | InstructionCost ScalarCost = |
| 3837 | getCostForRecipeWithOpcode(Opcode: getOpcode(), VF: ElementCount::getFixed(MinVal: 1), Ctx); |
| 3838 | if (isSingleScalar()) |
| 3839 | return ScalarCost; |
| 3840 | |
| 3841 | // If any of the operands is from a different replicate region and has its |
| 3842 | // cost skipped, it may have been forced to scalar. Fall back to legacy cost |
| 3843 | // model to avoid cost mis-match. |
| 3844 | if (any_of(Range: operands(), P: [&Ctx, VF](VPValue *Op) { |
| 3845 | auto *PredR = dyn_cast<VPPredInstPHIRecipe>(Val: Op); |
| 3846 | if (!PredR) |
| 3847 | return false; |
| 3848 | return Ctx.skipCostComputation( |
| 3849 | UI: dyn_cast_or_null<Instruction>( |
| 3850 | Val: PredR->getOperand(N: 0)->getUnderlyingValue()), |
| 3851 | IsVector: VF.isVector()); |
| 3852 | })) |
| 3853 | break; |
| 3854 | |
| 3855 | ScalarCost = ScalarCost * VF.getFixedValue() + |
| 3856 | Ctx.getScalarizationOverhead(ResultTy: this->getScalarType(), |
| 3857 | Operands: to_vector(Range: operands()), VF); |
| 3858 | // If the recipe is not predicated (i.e. not in a replicate region), return |
| 3859 | // the scalar cost. Otherwise handle predicated cost. |
| 3860 | if (!getRegion()->isReplicator()) |
| 3861 | return ScalarCost; |
| 3862 | |
| 3863 | // Account for the phi nodes that we will create. |
| 3864 | ScalarCost += VF.getFixedValue() * |
| 3865 | Ctx.TTI.getCFInstrCost(Opcode: Instruction::PHI, CostKind: Ctx.CostKind); |
| 3866 | // Scale the cost by the probability of executing the predicated blocks. |
| 3867 | // This assumes the predicated block for each vector lane is equally |
| 3868 | // likely. |
| 3869 | ScalarCost /= Ctx.getPredBlockCostDivisor(BB: UI->getParent()); |
| 3870 | return ScalarCost; |
| 3871 | } |
| 3872 | case Instruction::Load: |
| 3873 | case Instruction::Store: { |
| 3874 | bool IsLoad = UI->getOpcode() == Instruction::Load; |
| 3875 | const VPValue *PtrOp = getOperand(N: !IsLoad); |
| 3876 | const SCEV *PtrSCEV = getAddressAccessSCEV(Ptr: PtrOp, PSE&: Ctx.PSE, L: Ctx.L); |
| 3877 | if (isa_and_nonnull<SCEVCouldNotCompute>(Val: PtrSCEV)) |
| 3878 | break; |
| 3879 | |
| 3880 | Type *ValTy = (IsLoad ? this : getOperand(N: 0))->getScalarType(); |
| 3881 | Type *ScalarPtrTy = PtrOp->getScalarType(); |
| 3882 | const Align Alignment = getLoadStoreAlignment(I: UI); |
| 3883 | unsigned AS = cast<PointerType>(Val: ScalarPtrTy)->getAddressSpace(); |
| 3884 | TTI::OperandValueInfo OpInfo = TTI::getOperandInfo(V: UI->getOperand(i: 0)); |
| 3885 | bool PreferVectorizedAddressing = Ctx.TTI.prefersVectorizedAddressing(); |
| 3886 | bool UsedByLoadStoreAddress = |
| 3887 | !PreferVectorizedAddressing && vputils::isUsedByLoadStoreAddress(V: this); |
| 3888 | InstructionCost ScalarMemOpCost = Ctx.TTI.getMemoryOpCost( |
| 3889 | Opcode: UI->getOpcode(), Src: ValTy, Alignment, AddressSpace: AS, CostKind: Ctx.CostKind, OpdInfo: OpInfo, |
| 3890 | I: UsedByLoadStoreAddress ? UI : nullptr); |
| 3891 | |
| 3892 | Type *PtrTy = isSingleScalar() ? ScalarPtrTy : toVectorTy(Scalar: ScalarPtrTy, EC: VF); |
| 3893 | InstructionCost ScalarCost = |
| 3894 | ScalarMemOpCost + |
| 3895 | Ctx.TTI.getAddressComputationCost( |
| 3896 | PtrTy, SE: UsedByLoadStoreAddress ? nullptr : Ctx.PSE.getSE(), Ptr: PtrSCEV, |
| 3897 | CostKind: Ctx.CostKind); |
| 3898 | if (isSingleScalar()) |
| 3899 | return ScalarCost; |
| 3900 | |
| 3901 | SmallVector<const VPValue *> OpsToScalarize; |
| 3902 | Type *ResultTy = Type::getVoidTy(C&: PtrTy->getContext()); |
| 3903 | // Set ResultTy and OpsToScalarize, if scalarization is needed. Currently we |
| 3904 | // don't assign scalarization overhead in general, if the target prefers |
| 3905 | // vectorized addressing or the loaded value is used as part of an address |
| 3906 | // of another load or store. |
| 3907 | if (!UsedByLoadStoreAddress) { |
| 3908 | bool EfficientVectorLoadStore = |
| 3909 | Ctx.TTI.supportsEfficientVectorElementLoadStore(); |
| 3910 | if (!(IsLoad && !PreferVectorizedAddressing) && |
| 3911 | !(!IsLoad && EfficientVectorLoadStore)) |
| 3912 | append_range(C&: OpsToScalarize, R: operands()); |
| 3913 | |
| 3914 | if (!EfficientVectorLoadStore) |
| 3915 | ResultTy = this->getScalarType(); |
| 3916 | } |
| 3917 | |
| 3918 | TTI::VectorInstrContext VIC = |
| 3919 | IsLoad ? TTI::VectorInstrContext::Load : TTI::VectorInstrContext::Store; |
| 3920 | InstructionCost Cost = |
| 3921 | (ScalarCost * VF.getFixedValue()) + |
| 3922 | Ctx.getScalarizationOverhead(ResultTy, Operands: OpsToScalarize, VF, VIC, AlwaysIncludeReplicatingR: true); |
| 3923 | |
| 3924 | const VPRegionBlock *ParentRegion = getRegion(); |
| 3925 | if (ParentRegion && ParentRegion->isReplicator()) { |
| 3926 | if (!PtrSCEV) |
| 3927 | break; |
| 3928 | Cost /= Ctx.getPredBlockCostDivisor(BB: UI->getParent()); |
| 3929 | Cost += Ctx.TTI.getCFInstrCost(Opcode: Instruction::CondBr, CostKind: Ctx.CostKind); |
| 3930 | |
| 3931 | auto *VecI1Ty = VectorType::get( |
| 3932 | ElementType: IntegerType::getInt1Ty(C&: Ctx.L->getHeader()->getContext()), EC: VF); |
| 3933 | Cost += Ctx.TTI.getScalarizationOverhead( |
| 3934 | Ty: VecI1Ty, DemandedElts: APInt::getAllOnes(numBits: VF.getFixedValue()), |
| 3935 | /*Insert=*/false, /*Extract=*/true, CostKind: Ctx.CostKind); |
| 3936 | |
| 3937 | if (Ctx.useEmulatedMaskMemRefHack(R: this, VF)) { |
| 3938 | // Artificially setting to a high enough value to practically disable |
| 3939 | // vectorization with such operations. |
| 3940 | return 3000000; |
| 3941 | } |
| 3942 | } |
| 3943 | return Cost; |
| 3944 | } |
| 3945 | case Instruction::SExt: |
| 3946 | case Instruction::ZExt: |
| 3947 | case Instruction::FPToUI: |
| 3948 | case Instruction::FPToSI: |
| 3949 | case Instruction::FPExt: |
| 3950 | case Instruction::PtrToInt: |
| 3951 | case Instruction::PtrToAddr: |
| 3952 | case Instruction::IntToPtr: |
| 3953 | case Instruction::SIToFP: |
| 3954 | case Instruction::UIToFP: |
| 3955 | case Instruction::Trunc: |
| 3956 | case Instruction::FPTrunc: |
| 3957 | case Instruction::Select: |
| 3958 | case Instruction::AddrSpaceCast: { |
| 3959 | return getCostForRecipeWithOpcode(Opcode: getOpcode(), VF: ElementCount::getFixed(MinVal: 1), |
| 3960 | Ctx) * |
| 3961 | (isSingleScalar() ? 1 : VF.getFixedValue()); |
| 3962 | } |
| 3963 | case Instruction::ExtractValue: |
| 3964 | case Instruction::InsertValue: |
| 3965 | return Ctx.TTI.getInsertExtractValueCost(Opcode: getOpcode(), CostKind: Ctx.CostKind); |
| 3966 | } |
| 3967 | |
| 3968 | return Ctx.getLegacyCost(UI, VF); |
| 3969 | } |
| 3970 | |
| 3971 | InstructionCost VPReplicateRecipe::computeCallCost( |
| 3972 | Function *CalledFn, Type *ResultTy, ArrayRef<const VPValue *> ArgOps, |
| 3973 | bool IsSingleScalar, ElementCount VF, VPCostContext &Ctx) { |
| 3974 | SmallVector<Type *, 4> Tys = map_to_vector<4>( |
| 3975 | C&: ArgOps, F: [&](const VPValue *Op) { return Op->getScalarType(); }); |
| 3976 | |
| 3977 | Intrinsic::ID IntrinID = CalledFn->getIntrinsicID(); |
| 3978 | auto GetIntrinsicCost = [&] { |
| 3979 | if (!IntrinID) |
| 3980 | return InstructionCost::getInvalid(); |
| 3981 | return Ctx.TTI.getIntrinsicInstrCost( |
| 3982 | ICA: IntrinsicCostAttributes(IntrinID, ResultTy, Tys), CostKind: Ctx.CostKind); |
| 3983 | }; |
| 3984 | |
| 3985 | if (IntrinID && VPCostContext::isFreeScalarIntrinsic(ID: IntrinID)) { |
| 3986 | assert(GetIntrinsicCost() == 0 && "scalarizing intrinsic should be free" ); |
| 3987 | return 0; |
| 3988 | } |
| 3989 | |
| 3990 | InstructionCost ScalarCallCost = |
| 3991 | Ctx.TTI.getCallInstrCost(F: CalledFn, RetTy: ResultTy, Tys, CostKind: Ctx.CostKind); |
| 3992 | if (IsSingleScalar) { |
| 3993 | ScalarCallCost = std::min(a: ScalarCallCost, b: GetIntrinsicCost()); |
| 3994 | return ScalarCallCost; |
| 3995 | } |
| 3996 | |
| 3997 | // Scalarization overhead is undefined for scalable VFs. |
| 3998 | if (VF.isScalable()) |
| 3999 | return InstructionCost::getInvalid(); |
| 4000 | |
| 4001 | return ScalarCallCost * VF.getFixedValue() + |
| 4002 | Ctx.getScalarizationOverhead(ResultTy, Operands: ArgOps, VF); |
| 4003 | } |
| 4004 | |
| 4005 | #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) |
| 4006 | void VPReplicateRecipe::printRecipe(raw_ostream &O, const Twine &Indent, |
| 4007 | VPSlotTracker &SlotTracker) const { |
| 4008 | O << Indent << (IsSingleScalar ? "CLONE " : "REPLICATE " ); |
| 4009 | |
| 4010 | if (!getScalarType()->isVoidTy()) { |
| 4011 | printAsOperand(O, SlotTracker); |
| 4012 | O << " = " ; |
| 4013 | } |
| 4014 | if (auto *CB = dyn_cast<CallBase>(getUnderlyingInstr())) { |
| 4015 | O << "call" ; |
| 4016 | printFlags(O); |
| 4017 | O << "@" << CB->getCalledFunction()->getName() << "(" ; |
| 4018 | interleaveComma(make_range(op_begin(), op_begin() + (getNumOperands() - 1)), |
| 4019 | O, [&O, &SlotTracker](VPValue *Op) { |
| 4020 | Op->printAsOperand(O, SlotTracker); |
| 4021 | }); |
| 4022 | O << ")" ; |
| 4023 | } else { |
| 4024 | O << Instruction::getOpcodeName(getUnderlyingInstr()->getOpcode()); |
| 4025 | printFlags(O); |
| 4026 | printOperands(O, SlotTracker); |
| 4027 | } |
| 4028 | |
| 4029 | // Find if the recipe is used by a widened recipe via an intervening |
| 4030 | // VPPredInstPHIRecipe. In this case, also pack the scalar values in a vector. |
| 4031 | if (any_of(users(), [](const VPUser *U) { |
| 4032 | if (auto *PredR = dyn_cast<VPPredInstPHIRecipe>(U)) |
| 4033 | return !vputils::onlyScalarValuesUsed(PredR); |
| 4034 | return false; |
| 4035 | })) |
| 4036 | O << " (S->V)" ; |
| 4037 | } |
| 4038 | #endif |
| 4039 | |
| 4040 | void VPBranchOnMaskRecipe::execute(VPTransformState &State) { |
| 4041 | llvm_unreachable("recipe must be removed when dissolving replicate region" ); |
| 4042 | } |
| 4043 | |
| 4044 | InstructionCost VPBranchOnMaskRecipe::computeCost(ElementCount VF, |
| 4045 | VPCostContext &Ctx) const { |
| 4046 | // The legacy cost model doesn't assign costs to branches for individual |
| 4047 | // replicate regions. Match the current behavior in the VPlan cost model for |
| 4048 | // now. |
| 4049 | return 0; |
| 4050 | } |
| 4051 | |
| 4052 | void VPPredInstPHIRecipe::execute(VPTransformState &State) { |
| 4053 | llvm_unreachable("recipe must be removed when dissolving replicate region" ); |
| 4054 | } |
| 4055 | |
| 4056 | #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) |
| 4057 | void VPPredInstPHIRecipe::printRecipe(raw_ostream &O, const Twine &Indent, |
| 4058 | VPSlotTracker &SlotTracker) const { |
| 4059 | O << Indent << "PHI-PREDICATED-INSTRUCTION " ; |
| 4060 | printAsOperand(O, SlotTracker); |
| 4061 | O << " = " ; |
| 4062 | printOperands(O, SlotTracker); |
| 4063 | } |
| 4064 | #endif |
| 4065 | |
| 4066 | VPRecipeBase *VPWidenLoadRecipe::getAsRecipe() { return this; } |
| 4067 | const VPRecipeBase *VPWidenLoadRecipe::getAsRecipe() const { return this; } |
| 4068 | |
| 4069 | VPRecipeBase *VPWidenLoadEVLRecipe::getAsRecipe() { return this; } |
| 4070 | const VPRecipeBase *VPWidenLoadEVLRecipe::getAsRecipe() const { return this; } |
| 4071 | |
| 4072 | VPRecipeBase *VPWidenStoreRecipe::getAsRecipe() { return this; } |
| 4073 | const VPRecipeBase *VPWidenStoreRecipe::getAsRecipe() const { return this; } |
| 4074 | |
| 4075 | VPRecipeBase *VPWidenStoreEVLRecipe::getAsRecipe() { return this; } |
| 4076 | const VPRecipeBase *VPWidenStoreEVLRecipe::getAsRecipe() const { return this; } |
| 4077 | |
| 4078 | InstructionCost VPWidenMemoryRecipe::computeCost(ElementCount VF, |
| 4079 | VPCostContext &Ctx) const { |
| 4080 | const VPRecipeBase *R = getAsRecipe(); |
| 4081 | bool IsLoad = isa<VPWidenLoadRecipe, VPWidenLoadEVLRecipe>(Val: R); |
| 4082 | Type *ScalarTy = IsLoad ? cast<VPSingleDefRecipe>(Val: R)->getScalarType() |
| 4083 | : R->getOperand(N: 1)->getScalarType(); |
| 4084 | Type *Ty = toVectorTy(Scalar: ScalarTy, EC: VF); |
| 4085 | unsigned AS = |
| 4086 | cast<PointerType>(Val: getAddr()->getScalarType())->getAddressSpace(); |
| 4087 | unsigned Opcode = IsLoad ? Instruction::Load : Instruction::Store; |
| 4088 | |
| 4089 | if (!Consecutive) { |
| 4090 | // TODO: Using the original IR may not be accurate. |
| 4091 | // Currently, ARM will use the underlying IR to calculate gather/scatter |
| 4092 | // instruction cost. |
| 4093 | [[maybe_unused]] auto IsReverseMask = [this, R]() { |
| 4094 | VPValue *Mask = getMask(); |
| 4095 | if (!Mask) |
| 4096 | return false; |
| 4097 | |
| 4098 | if (isa<VPWidenLoadEVLRecipe, VPWidenStoreEVLRecipe>(Val: R)) |
| 4099 | return match(V: Mask, P: m_Intrinsic<Intrinsic::experimental_vp_reverse>()); |
| 4100 | |
| 4101 | return match(V: Mask, P: m_Reverse(Op0: m_VPValue())); |
| 4102 | }; |
| 4103 | assert(!IsReverseMask() && |
| 4104 | "Inconsecutive memory access should not have reverse order" ); |
| 4105 | Type *PtrTy = getAddr()->getScalarType(); |
| 4106 | const Value *Ptr = getAddr()->getUnderlyingValue(); |
| 4107 | |
| 4108 | // If the address value is uniform across all lanes, then the address can be |
| 4109 | // calculated with scalar type and broadcast. |
| 4110 | if (!vputils::isSingleScalar(VPV: getAddr())) |
| 4111 | PtrTy = toVectorTy(Scalar: PtrTy, EC: VF); |
| 4112 | |
| 4113 | unsigned IID = isa<VPWidenLoadRecipe>(Val: R) ? Intrinsic::masked_gather |
| 4114 | : isa<VPWidenStoreRecipe>(Val: R) ? Intrinsic::masked_scatter |
| 4115 | : isa<VPWidenLoadEVLRecipe>(Val: R) ? Intrinsic::vp_gather |
| 4116 | : Intrinsic::vp_scatter; |
| 4117 | return Ctx.TTI.getAddressComputationCost(PtrTy, SE: nullptr, Ptr: nullptr, |
| 4118 | CostKind: Ctx.CostKind) + |
| 4119 | Ctx.TTI.getMemIntrinsicInstrCost( |
| 4120 | MICA: MemIntrinsicCostAttributes(IID, Ty, Ptr, IsMasked, Alignment, |
| 4121 | &Ingredient), |
| 4122 | CostKind: Ctx.CostKind); |
| 4123 | } |
| 4124 | |
| 4125 | InstructionCost Cost = 0; |
| 4126 | if (IsMasked) { |
| 4127 | unsigned IID = isa<VPWidenLoadRecipe>(Val: R) ? Intrinsic::masked_load |
| 4128 | : Intrinsic::masked_store; |
| 4129 | Cost += Ctx.TTI.getMemIntrinsicInstrCost( |
| 4130 | MICA: MemIntrinsicCostAttributes(IID, Ty, Alignment, AS), CostKind: Ctx.CostKind); |
| 4131 | } else { |
| 4132 | TTI::OperandValueInfo OpInfo = Ctx.getOperandInfo( |
| 4133 | V: isa<VPWidenLoadRecipe, VPWidenLoadEVLRecipe>(Val: R) ? R->getOperand(N: 0) |
| 4134 | : R->getOperand(N: 1)); |
| 4135 | Cost += Ctx.TTI.getMemoryOpCost(Opcode, Src: Ty, Alignment, AddressSpace: AS, CostKind: Ctx.CostKind, |
| 4136 | OpdInfo: OpInfo, I: &Ingredient); |
| 4137 | } |
| 4138 | return Cost; |
| 4139 | } |
| 4140 | |
| 4141 | void VPWidenLoadRecipe::execute(VPTransformState &State) { |
| 4142 | Type *ScalarDataTy = getScalarType(); |
| 4143 | auto *DataTy = VectorType::get(ElementType: ScalarDataTy, EC: State.VF); |
| 4144 | bool CreateGather = !isConsecutive(); |
| 4145 | |
| 4146 | auto &Builder = State.Builder; |
| 4147 | Value *Mask = nullptr; |
| 4148 | if (auto *VPMask = getMask()) |
| 4149 | Mask = State.get(Def: VPMask); |
| 4150 | |
| 4151 | Value *Addr = State.get(Def: getAddr(), /*IsScalar*/ !CreateGather); |
| 4152 | Value *NewLI; |
| 4153 | if (CreateGather) { |
| 4154 | NewLI = Builder.CreateMaskedGather(Ty: DataTy, Ptrs: Addr, Alignment, Mask, PassThru: nullptr, |
| 4155 | Name: "wide.masked.gather" ); |
| 4156 | } else if (Mask) { |
| 4157 | NewLI = |
| 4158 | Builder.CreateMaskedLoad(Ty: DataTy, Ptr: Addr, Alignment, Mask, |
| 4159 | PassThru: PoisonValue::get(T: DataTy), Name: "wide.masked.load" ); |
| 4160 | } else { |
| 4161 | NewLI = Builder.CreateAlignedLoad(Ty: DataTy, Ptr: Addr, Align: Alignment, Name: "wide.load" ); |
| 4162 | } |
| 4163 | applyMetadata(I&: *cast<Instruction>(Val: NewLI)); |
| 4164 | State.set(Def: this, V: NewLI); |
| 4165 | } |
| 4166 | |
| 4167 | #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) |
| 4168 | void VPWidenLoadRecipe::printRecipe(raw_ostream &O, const Twine &Indent, |
| 4169 | VPSlotTracker &SlotTracker) const { |
| 4170 | O << Indent << "WIDEN " ; |
| 4171 | printAsOperand(O, SlotTracker); |
| 4172 | O << " = load " ; |
| 4173 | printOperands(O, SlotTracker); |
| 4174 | } |
| 4175 | #endif |
| 4176 | |
| 4177 | void VPWidenLoadEVLRecipe::execute(VPTransformState &State) { |
| 4178 | Type *ScalarDataTy = getScalarType(); |
| 4179 | auto *DataTy = VectorType::get(ElementType: ScalarDataTy, EC: State.VF); |
| 4180 | bool CreateGather = !isConsecutive(); |
| 4181 | |
| 4182 | auto &Builder = State.Builder; |
| 4183 | CallInst *NewLI; |
| 4184 | Value *EVL = State.get(Def: getEVL(), Lane: VPLane(0)); |
| 4185 | Value *Addr = State.get(Def: getAddr(), IsScalar: !CreateGather); |
| 4186 | Value *Mask = nullptr; |
| 4187 | if (VPValue *VPMask = getMask()) |
| 4188 | Mask = State.get(Def: VPMask); |
| 4189 | else |
| 4190 | Mask = Builder.CreateVectorSplat(EC: State.VF, V: Builder.getTrue()); |
| 4191 | |
| 4192 | if (CreateGather) { |
| 4193 | NewLI = Builder.CreateIntrinsicWithoutFolding(RetTy: DataTy, ID: Intrinsic::vp_gather, |
| 4194 | Args: {Addr, Mask, EVL}, FMFSource: nullptr, |
| 4195 | Name: "wide.masked.gather" ); |
| 4196 | } else { |
| 4197 | NewLI = Builder.CreateIntrinsicWithoutFolding( |
| 4198 | RetTy: DataTy, ID: Intrinsic::vp_load, Args: {Addr, Mask, EVL}, FMFSource: nullptr, Name: "vp.op.load" ); |
| 4199 | } |
| 4200 | NewLI->addParamAttr( |
| 4201 | ArgNo: 0, Attr: Attribute::getWithAlignment(Context&: NewLI->getContext(), Alignment)); |
| 4202 | applyMetadata(I&: *NewLI); |
| 4203 | State.set(Def: this, V: NewLI); |
| 4204 | } |
| 4205 | |
| 4206 | InstructionCost VPWidenLoadEVLRecipe::computeCost(ElementCount VF, |
| 4207 | VPCostContext &Ctx) const { |
| 4208 | if (!Consecutive || IsMasked) |
| 4209 | return VPWidenMemoryRecipe::computeCost(VF, Ctx); |
| 4210 | |
| 4211 | // We need to use the getMemIntrinsicInstrCost() instead of getMemoryOpCost() |
| 4212 | // here because the EVL recipes using EVL to replace the tail mask. But in the |
| 4213 | // legacy model, it will always calculate the cost of mask. |
| 4214 | // TODO: Using getMemoryOpCost() instead of getMemIntrinsicInstrCost when we |
| 4215 | // don't need to compare to the legacy cost model. |
| 4216 | Type *Ty = toVectorTy(Scalar: getScalarType(), EC: VF); |
| 4217 | unsigned AS = |
| 4218 | cast<PointerType>(Val: getAddr()->getScalarType())->getAddressSpace(); |
| 4219 | return Ctx.TTI.getMemIntrinsicInstrCost( |
| 4220 | MICA: MemIntrinsicCostAttributes(Intrinsic::vp_load, Ty, Alignment, AS), |
| 4221 | CostKind: Ctx.CostKind); |
| 4222 | } |
| 4223 | |
| 4224 | #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) |
| 4225 | void VPWidenLoadEVLRecipe::printRecipe(raw_ostream &O, const Twine &Indent, |
| 4226 | VPSlotTracker &SlotTracker) const { |
| 4227 | O << Indent << "WIDEN " ; |
| 4228 | printAsOperand(O, SlotTracker); |
| 4229 | O << " = vp.load " ; |
| 4230 | printOperands(O, SlotTracker); |
| 4231 | } |
| 4232 | #endif |
| 4233 | |
| 4234 | void VPWidenStoreRecipe::execute(VPTransformState &State) { |
| 4235 | VPValue *StoredVPValue = getStoredValue(); |
| 4236 | bool CreateScatter = !isConsecutive(); |
| 4237 | |
| 4238 | auto &Builder = State.Builder; |
| 4239 | |
| 4240 | Value *Mask = nullptr; |
| 4241 | if (auto *VPMask = getMask()) |
| 4242 | Mask = State.get(Def: VPMask); |
| 4243 | |
| 4244 | Value *StoredVal = State.get(Def: StoredVPValue); |
| 4245 | Value *Addr = State.get(Def: getAddr(), /*IsScalar*/ !CreateScatter); |
| 4246 | Instruction *NewSI = nullptr; |
| 4247 | if (CreateScatter) |
| 4248 | NewSI = Builder.CreateMaskedScatter(Val: StoredVal, Ptrs: Addr, Alignment, Mask); |
| 4249 | else if (Mask) |
| 4250 | NewSI = Builder.CreateMaskedStore(Val: StoredVal, Ptr: Addr, Alignment, Mask); |
| 4251 | else |
| 4252 | NewSI = Builder.CreateAlignedStore(Val: StoredVal, Ptr: Addr, Align: Alignment); |
| 4253 | applyMetadata(I&: *NewSI); |
| 4254 | } |
| 4255 | |
| 4256 | #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) |
| 4257 | void VPWidenStoreRecipe::printRecipe(raw_ostream &O, const Twine &Indent, |
| 4258 | VPSlotTracker &SlotTracker) const { |
| 4259 | O << Indent << "WIDEN store " ; |
| 4260 | printOperands(O, SlotTracker); |
| 4261 | } |
| 4262 | #endif |
| 4263 | |
| 4264 | void VPWidenStoreEVLRecipe::execute(VPTransformState &State) { |
| 4265 | VPValue *StoredValue = getStoredValue(); |
| 4266 | bool CreateScatter = !isConsecutive(); |
| 4267 | |
| 4268 | auto &Builder = State.Builder; |
| 4269 | |
| 4270 | CallInst *NewSI = nullptr; |
| 4271 | Value *StoredVal = State.get(Def: StoredValue); |
| 4272 | Value *EVL = State.get(Def: getEVL(), Lane: VPLane(0)); |
| 4273 | Value *Mask = nullptr; |
| 4274 | if (VPValue *VPMask = getMask()) |
| 4275 | Mask = State.get(Def: VPMask); |
| 4276 | else |
| 4277 | Mask = Builder.CreateVectorSplat(EC: State.VF, V: Builder.getTrue()); |
| 4278 | |
| 4279 | Value *Addr = State.get(Def: getAddr(), IsScalar: !CreateScatter); |
| 4280 | if (CreateScatter) { |
| 4281 | NewSI = Builder.CreateIntrinsicWithoutFolding( |
| 4282 | RetTy: Type::getVoidTy(C&: EVL->getContext()), ID: Intrinsic::vp_scatter, |
| 4283 | Args: {StoredVal, Addr, Mask, EVL}); |
| 4284 | } else { |
| 4285 | NewSI = Builder.CreateIntrinsicWithoutFolding( |
| 4286 | RetTy: Type::getVoidTy(C&: EVL->getContext()), ID: Intrinsic::vp_store, |
| 4287 | Args: {StoredVal, Addr, Mask, EVL}); |
| 4288 | } |
| 4289 | NewSI->addParamAttr( |
| 4290 | ArgNo: 1, Attr: Attribute::getWithAlignment(Context&: NewSI->getContext(), Alignment)); |
| 4291 | applyMetadata(I&: *NewSI); |
| 4292 | } |
| 4293 | |
| 4294 | InstructionCost VPWidenStoreEVLRecipe::computeCost(ElementCount VF, |
| 4295 | VPCostContext &Ctx) const { |
| 4296 | if (!Consecutive || IsMasked) |
| 4297 | return VPWidenMemoryRecipe::computeCost(VF, Ctx); |
| 4298 | |
| 4299 | // We need to use the getMemIntrinsicInstrCost() instead of getMemoryOpCost() |
| 4300 | // here because the EVL recipes using EVL to replace the tail mask. But in the |
| 4301 | // legacy model, it will always calculate the cost of mask. |
| 4302 | // TODO: Using getMemoryOpCost() instead of getMemIntrinsicInstrCost when we |
| 4303 | // don't need to compare to the legacy cost model. |
| 4304 | Type *Ty = toVectorTy(Scalar: getStoredValue()->getScalarType(), EC: VF); |
| 4305 | unsigned AS = |
| 4306 | cast<PointerType>(Val: getAddr()->getScalarType())->getAddressSpace(); |
| 4307 | return Ctx.TTI.getMemIntrinsicInstrCost( |
| 4308 | MICA: MemIntrinsicCostAttributes(Intrinsic::vp_store, Ty, Alignment, AS), |
| 4309 | CostKind: Ctx.CostKind); |
| 4310 | } |
| 4311 | |
| 4312 | #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) |
| 4313 | void VPWidenStoreEVLRecipe::printRecipe(raw_ostream &O, const Twine &Indent, |
| 4314 | VPSlotTracker &SlotTracker) const { |
| 4315 | O << Indent << "WIDEN vp.store " ; |
| 4316 | printOperands(O, SlotTracker); |
| 4317 | } |
| 4318 | #endif |
| 4319 | |
| 4320 | static Value *createBitOrPointerCast(IRBuilderBase &Builder, Value *V, |
| 4321 | VectorType *DstVTy, const DataLayout &DL) { |
| 4322 | // Verify that V is a vector type with same number of elements as DstVTy. |
| 4323 | auto VF = DstVTy->getElementCount(); |
| 4324 | auto *SrcVecTy = cast<VectorType>(Val: V->getType()); |
| 4325 | assert(VF == SrcVecTy->getElementCount() && "Vector dimensions do not match" ); |
| 4326 | Type *SrcElemTy = SrcVecTy->getElementType(); |
| 4327 | Type *DstElemTy = DstVTy->getElementType(); |
| 4328 | assert((DL.getTypeSizeInBits(SrcElemTy) == DL.getTypeSizeInBits(DstElemTy)) && |
| 4329 | "Vector elements must have same size" ); |
| 4330 | |
| 4331 | // Do a direct cast if element types are castable. |
| 4332 | if (CastInst::isBitOrNoopPointerCastable(SrcTy: SrcElemTy, DestTy: DstElemTy, DL)) { |
| 4333 | return Builder.CreateBitOrPointerCast(V, DestTy: DstVTy); |
| 4334 | } |
| 4335 | // V cannot be directly casted to desired vector type. |
| 4336 | // May happen when V is a floating point vector but DstVTy is a vector of |
| 4337 | // pointers or vice-versa. Handle this using a two-step bitcast using an |
| 4338 | // intermediate Integer type for the bitcast i.e. Ptr <-> Int <-> Float. |
| 4339 | assert((DstElemTy->isPointerTy() != SrcElemTy->isPointerTy()) && |
| 4340 | "Only one type should be a pointer type" ); |
| 4341 | assert((DstElemTy->isFloatingPointTy() != SrcElemTy->isFloatingPointTy()) && |
| 4342 | "Only one type should be a floating point type" ); |
| 4343 | Type *IntTy = |
| 4344 | IntegerType::getIntNTy(C&: V->getContext(), N: DL.getTypeSizeInBits(Ty: SrcElemTy)); |
| 4345 | auto *VecIntTy = VectorType::get(ElementType: IntTy, EC: VF); |
| 4346 | Value *CastVal = Builder.CreateBitOrPointerCast(V, DestTy: VecIntTy); |
| 4347 | return Builder.CreateBitOrPointerCast(V: CastVal, DestTy: DstVTy); |
| 4348 | } |
| 4349 | |
| 4350 | /// Return a vector containing interleaved elements from multiple |
| 4351 | /// smaller input vectors. |
| 4352 | static Value *interleaveVectors(IRBuilderBase &Builder, ArrayRef<Value *> Vals, |
| 4353 | const Twine &Name) { |
| 4354 | unsigned Factor = Vals.size(); |
| 4355 | assert(Factor > 1 && "Tried to interleave invalid number of vectors" ); |
| 4356 | |
| 4357 | VectorType *VecTy = cast<VectorType>(Val: Vals[0]->getType()); |
| 4358 | #ifndef NDEBUG |
| 4359 | for (Value *Val : Vals) |
| 4360 | assert(Val->getType() == VecTy && "Tried to interleave mismatched types" ); |
| 4361 | #endif |
| 4362 | |
| 4363 | // Scalable vectors cannot use arbitrary shufflevectors (only splats), so |
| 4364 | // must use intrinsics to interleave. |
| 4365 | if (VecTy->isScalableTy()) { |
| 4366 | assert(Factor <= 8 && "Unsupported interleave factor for scalable vectors" ); |
| 4367 | return Builder.CreateVectorInterleave(Ops: Vals, Name); |
| 4368 | } |
| 4369 | |
| 4370 | // Fixed length. Start by concatenating all vectors into a wide vector. |
| 4371 | Value *WideVec = concatenateVectors(Builder, Vecs: Vals); |
| 4372 | |
| 4373 | // Interleave the elements into the wide vector. |
| 4374 | const unsigned NumElts = VecTy->getElementCount().getFixedValue(); |
| 4375 | return Builder.CreateShuffleVector( |
| 4376 | V: WideVec, Mask: createInterleaveMask(VF: NumElts, NumVecs: Factor), Name); |
| 4377 | } |
| 4378 | |
| 4379 | // Try to vectorize the interleave group that \p Instr belongs to. |
| 4380 | // |
| 4381 | // E.g. Translate following interleaved load group (factor = 3): |
| 4382 | // for (i = 0; i < N; i+=3) { |
| 4383 | // R = Pic[i]; // Member of index 0 |
| 4384 | // G = Pic[i+1]; // Member of index 1 |
| 4385 | // B = Pic[i+2]; // Member of index 2 |
| 4386 | // ... // do something to R, G, B |
| 4387 | // } |
| 4388 | // To: |
| 4389 | // %wide.vec = load <12 x i32> ; Read 4 tuples of R,G,B |
| 4390 | // %R.vec = shuffle %wide.vec, poison, <0, 3, 6, 9> ; R elements |
| 4391 | // %G.vec = shuffle %wide.vec, poison, <1, 4, 7, 10> ; G elements |
| 4392 | // %B.vec = shuffle %wide.vec, poison, <2, 5, 8, 11> ; B elements |
| 4393 | // |
| 4394 | // Or translate following interleaved store group (factor = 3): |
| 4395 | // for (i = 0; i < N; i+=3) { |
| 4396 | // ... do something to R, G, B |
| 4397 | // Pic[i] = R; // Member of index 0 |
| 4398 | // Pic[i+1] = G; // Member of index 1 |
| 4399 | // Pic[i+2] = B; // Member of index 2 |
| 4400 | // } |
| 4401 | // To: |
| 4402 | // %R_G.vec = shuffle %R.vec, %G.vec, <0, 1, 2, ..., 7> |
| 4403 | // %B_U.vec = shuffle %B.vec, poison, <0, 1, 2, 3, u, u, u, u> |
| 4404 | // %interleaved.vec = shuffle %R_G.vec, %B_U.vec, |
| 4405 | // <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11> ; Interleave R,G,B elements |
| 4406 | // store <12 x i32> %interleaved.vec ; Write 4 tuples of R,G,B |
| 4407 | void VPInterleaveRecipe::execute(VPTransformState &State) { |
| 4408 | assert((!needsMaskForGaps() || !State.VF.isScalable()) && |
| 4409 | "Masking gaps for scalable vectors is not yet supported." ); |
| 4410 | const InterleaveGroup<Instruction> *Group = getInterleaveGroup(); |
| 4411 | Instruction *Instr = Group->getInsertPos(); |
| 4412 | |
| 4413 | // Prepare for the vector type of the interleaved load/store. |
| 4414 | Type *ScalarTy = getLoadStoreType(I: Instr); |
| 4415 | unsigned InterleaveFactor = Group->getFactor(); |
| 4416 | auto *VecTy = VectorType::get(ElementType: ScalarTy, EC: State.VF * InterleaveFactor); |
| 4417 | |
| 4418 | VPValue *BlockInMask = getMask(); |
| 4419 | VPValue *Addr = getAddr(); |
| 4420 | Value *ResAddr = State.get(Def: Addr, Lane: VPLane(0)); |
| 4421 | |
| 4422 | auto CreateGroupMask = [&BlockInMask, &State, |
| 4423 | &InterleaveFactor](Value *MaskForGaps) -> Value * { |
| 4424 | if (State.VF.isScalable()) { |
| 4425 | assert(!MaskForGaps && "Interleaved groups with gaps are not supported." ); |
| 4426 | assert(InterleaveFactor <= 8 && |
| 4427 | "Unsupported deinterleave factor for scalable vectors" ); |
| 4428 | auto *ResBlockInMask = State.get(Def: BlockInMask); |
| 4429 | SmallVector<Value *> Ops(InterleaveFactor, ResBlockInMask); |
| 4430 | return interleaveVectors(Builder&: State.Builder, Vals: Ops, Name: "interleaved.mask" ); |
| 4431 | } |
| 4432 | |
| 4433 | if (!BlockInMask) |
| 4434 | return MaskForGaps; |
| 4435 | |
| 4436 | Value *ResBlockInMask = State.get(Def: BlockInMask); |
| 4437 | Value *ShuffledMask = State.Builder.CreateShuffleVector( |
| 4438 | V: ResBlockInMask, |
| 4439 | Mask: createReplicatedMask(ReplicationFactor: InterleaveFactor, VF: State.VF.getFixedValue()), |
| 4440 | Name: "interleaved.mask" ); |
| 4441 | return MaskForGaps ? State.Builder.CreateBinOp(Opc: Instruction::And, |
| 4442 | LHS: ShuffledMask, RHS: MaskForGaps) |
| 4443 | : ShuffledMask; |
| 4444 | }; |
| 4445 | |
| 4446 | const DataLayout &DL = Instr->getDataLayout(); |
| 4447 | // Vectorize the interleaved load group. |
| 4448 | if (isa<LoadInst>(Val: Instr)) { |
| 4449 | Value *MaskForGaps = nullptr; |
| 4450 | if (needsMaskForGaps()) { |
| 4451 | MaskForGaps = |
| 4452 | createBitMaskForGaps(Builder&: State.Builder, VF: State.VF.getFixedValue(), Group: *Group); |
| 4453 | assert(MaskForGaps && "Mask for Gaps is required but it is null" ); |
| 4454 | } |
| 4455 | |
| 4456 | Instruction *NewLoad; |
| 4457 | if (BlockInMask || MaskForGaps) { |
| 4458 | Value *GroupMask = CreateGroupMask(MaskForGaps); |
| 4459 | Value *PoisonVec = PoisonValue::get(T: VecTy); |
| 4460 | NewLoad = State.Builder.CreateMaskedLoad(Ty: VecTy, Ptr: ResAddr, |
| 4461 | Alignment: Group->getAlign(), Mask: GroupMask, |
| 4462 | PassThru: PoisonVec, Name: "wide.masked.vec" ); |
| 4463 | } else |
| 4464 | NewLoad = State.Builder.CreateAlignedLoad(Ty: VecTy, Ptr: ResAddr, |
| 4465 | Align: Group->getAlign(), Name: "wide.vec" ); |
| 4466 | applyMetadata(I&: *NewLoad); |
| 4467 | // TODO: Also manage existing metadata using VPIRMetadata. |
| 4468 | Group->addMetadata(NewInst: NewLoad); |
| 4469 | |
| 4470 | ArrayRef<VPRecipeValue *> VPDefs = definedValues(); |
| 4471 | if (VecTy->isScalableTy()) { |
| 4472 | // Scalable vectors cannot use arbitrary shufflevectors (only splats), |
| 4473 | // so must use intrinsics to deinterleave. |
| 4474 | assert(InterleaveFactor <= 8 && |
| 4475 | "Unsupported deinterleave factor for scalable vectors" ); |
| 4476 | NewLoad = State.Builder.CreateIntrinsicWithoutFolding( |
| 4477 | ID: Intrinsic::getDeinterleaveIntrinsicID(Factor: InterleaveFactor), |
| 4478 | OverloadTypes: NewLoad->getType(), Args: NewLoad, |
| 4479 | /*FMFSource=*/nullptr, Name: "strided.vec" ); |
| 4480 | } |
| 4481 | |
| 4482 | auto CreateStridedVector = [&InterleaveFactor, &State, |
| 4483 | &NewLoad](unsigned Index) -> Value * { |
| 4484 | assert(Index < InterleaveFactor && "Illegal group index" ); |
| 4485 | if (State.VF.isScalable()) |
| 4486 | return State.Builder.CreateExtractValue(Agg: NewLoad, Idxs: Index); |
| 4487 | |
| 4488 | // For fixed length VF, use shuffle to extract the sub-vectors from the |
| 4489 | // wide load. |
| 4490 | auto StrideMask = |
| 4491 | createStrideMask(Start: Index, Stride: InterleaveFactor, VF: State.VF.getFixedValue()); |
| 4492 | return State.Builder.CreateShuffleVector(V: NewLoad, Mask: StrideMask, |
| 4493 | Name: "strided.vec" ); |
| 4494 | }; |
| 4495 | |
| 4496 | for (unsigned I = 0, J = 0; I < InterleaveFactor; ++I) { |
| 4497 | Instruction *Member = Group->getMember(Index: I); |
| 4498 | |
| 4499 | // Skip the gaps in the group. |
| 4500 | if (!Member) |
| 4501 | continue; |
| 4502 | |
| 4503 | Value *StridedVec = CreateStridedVector(I); |
| 4504 | |
| 4505 | // If this member has different type, cast the result type. |
| 4506 | if (Member->getType() != ScalarTy) { |
| 4507 | VectorType *OtherVTy = VectorType::get(ElementType: Member->getType(), EC: State.VF); |
| 4508 | StridedVec = |
| 4509 | createBitOrPointerCast(Builder&: State.Builder, V: StridedVec, DstVTy: OtherVTy, DL); |
| 4510 | } |
| 4511 | |
| 4512 | if (Group->isReverse()) |
| 4513 | StridedVec = State.Builder.CreateVectorReverse(V: StridedVec, Name: "reverse" ); |
| 4514 | |
| 4515 | State.set(Def: VPDefs[J], V: StridedVec); |
| 4516 | ++J; |
| 4517 | } |
| 4518 | return; |
| 4519 | } |
| 4520 | |
| 4521 | // The sub vector type for current instruction. |
| 4522 | auto *SubVT = VectorType::get(ElementType: ScalarTy, EC: State.VF); |
| 4523 | |
| 4524 | // Vectorize the interleaved store group. |
| 4525 | Value *MaskForGaps = |
| 4526 | createBitMaskForGaps(Builder&: State.Builder, VF: State.VF.getKnownMinValue(), Group: *Group); |
| 4527 | assert(((MaskForGaps != nullptr) == needsMaskForGaps()) && |
| 4528 | "Mismatch between NeedsMaskForGaps and MaskForGaps" ); |
| 4529 | ArrayRef<VPValue *> StoredValues = getStoredValues(); |
| 4530 | // Collect the stored vector from each member. |
| 4531 | SmallVector<Value *, 4> StoredVecs; |
| 4532 | unsigned StoredIdx = 0; |
| 4533 | for (unsigned i = 0; i < InterleaveFactor; i++) { |
| 4534 | assert((Group->getMember(i) || MaskForGaps) && |
| 4535 | "Fail to get a member from an interleaved store group" ); |
| 4536 | Instruction *Member = Group->getMember(Index: i); |
| 4537 | |
| 4538 | // Skip the gaps in the group. |
| 4539 | if (!Member) { |
| 4540 | Value *Undef = PoisonValue::get(T: SubVT); |
| 4541 | StoredVecs.push_back(Elt: Undef); |
| 4542 | continue; |
| 4543 | } |
| 4544 | |
| 4545 | Value *StoredVec = State.get(Def: StoredValues[StoredIdx]); |
| 4546 | ++StoredIdx; |
| 4547 | |
| 4548 | if (Group->isReverse()) |
| 4549 | StoredVec = State.Builder.CreateVectorReverse(V: StoredVec, Name: "reverse" ); |
| 4550 | |
| 4551 | // If this member has different type, cast it to a unified type. |
| 4552 | |
| 4553 | if (StoredVec->getType() != SubVT) |
| 4554 | StoredVec = createBitOrPointerCast(Builder&: State.Builder, V: StoredVec, DstVTy: SubVT, DL); |
| 4555 | |
| 4556 | StoredVecs.push_back(Elt: StoredVec); |
| 4557 | } |
| 4558 | |
| 4559 | // Interleave all the smaller vectors into one wider vector. |
| 4560 | Value *IVec = interleaveVectors(Builder&: State.Builder, Vals: StoredVecs, Name: "interleaved.vec" ); |
| 4561 | Instruction *NewStoreInstr; |
| 4562 | if (BlockInMask || MaskForGaps) { |
| 4563 | Value *GroupMask = CreateGroupMask(MaskForGaps); |
| 4564 | NewStoreInstr = State.Builder.CreateMaskedStore( |
| 4565 | Val: IVec, Ptr: ResAddr, Alignment: Group->getAlign(), Mask: GroupMask); |
| 4566 | } else |
| 4567 | NewStoreInstr = |
| 4568 | State.Builder.CreateAlignedStore(Val: IVec, Ptr: ResAddr, Align: Group->getAlign()); |
| 4569 | |
| 4570 | applyMetadata(I&: *NewStoreInstr); |
| 4571 | // TODO: Also manage existing metadata using VPIRMetadata. |
| 4572 | Group->addMetadata(NewInst: NewStoreInstr); |
| 4573 | } |
| 4574 | |
| 4575 | #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) |
| 4576 | void VPInterleaveRecipe::printRecipe(raw_ostream &O, const Twine &Indent, |
| 4577 | VPSlotTracker &SlotTracker) const { |
| 4578 | const InterleaveGroup<Instruction> *IG = getInterleaveGroup(); |
| 4579 | O << Indent << "INTERLEAVE-GROUP with factor " << IG->getFactor() << ", " ; |
| 4580 | getAddr()->printAsOperand(O, SlotTracker); |
| 4581 | VPValue *Mask = getMask(); |
| 4582 | if (Mask) { |
| 4583 | O << ", " ; |
| 4584 | Mask->printAsOperand(O, SlotTracker); |
| 4585 | } |
| 4586 | |
| 4587 | unsigned OpIdx = 0; |
| 4588 | for (unsigned i = 0; i < IG->getFactor(); ++i) { |
| 4589 | if (!IG->getMember(i)) |
| 4590 | continue; |
| 4591 | if (getNumStoreOperands() > 0) { |
| 4592 | O << "\n" << Indent << " store " ; |
| 4593 | getOperand(1 + OpIdx)->printAsOperand(O, SlotTracker); |
| 4594 | O << " to index " << i; |
| 4595 | } else { |
| 4596 | O << "\n" << Indent << " " ; |
| 4597 | getVPValue(OpIdx)->printAsOperand(O, SlotTracker); |
| 4598 | O << " = load from index " << i; |
| 4599 | } |
| 4600 | ++OpIdx; |
| 4601 | } |
| 4602 | } |
| 4603 | #endif |
| 4604 | |
| 4605 | void VPInterleaveEVLRecipe::execute(VPTransformState &State) { |
| 4606 | assert(State.VF.isScalable() && |
| 4607 | "Only support scalable VF for EVL tail-folding." ); |
| 4608 | assert(!needsMaskForGaps() && |
| 4609 | "Masking gaps for scalable vectors is not yet supported." ); |
| 4610 | const InterleaveGroup<Instruction> *Group = getInterleaveGroup(); |
| 4611 | Instruction *Instr = Group->getInsertPos(); |
| 4612 | |
| 4613 | // Prepare for the vector type of the interleaved load/store. |
| 4614 | Type *ScalarTy = getLoadStoreType(I: Instr); |
| 4615 | unsigned InterleaveFactor = Group->getFactor(); |
| 4616 | assert(InterleaveFactor <= 8 && |
| 4617 | "Unsupported deinterleave/interleave factor for scalable vectors" ); |
| 4618 | ElementCount WideVF = State.VF * InterleaveFactor; |
| 4619 | auto *VecTy = VectorType::get(ElementType: ScalarTy, EC: WideVF); |
| 4620 | |
| 4621 | VPValue *Addr = getAddr(); |
| 4622 | Value *ResAddr = State.get(Def: Addr, Lane: VPLane(0)); |
| 4623 | Value *EVL = State.get(Def: getEVL(), Lane: VPLane(0)); |
| 4624 | Value *InterleaveEVL = State.Builder.CreateMul( |
| 4625 | LHS: EVL, RHS: ConstantInt::get(Ty: EVL->getType(), V: InterleaveFactor), Name: "interleave.evl" , |
| 4626 | /* NUW= */ HasNUW: true, /* NSW= */ HasNSW: true); |
| 4627 | LLVMContext &Ctx = State.Builder.getContext(); |
| 4628 | |
| 4629 | Value *GroupMask = nullptr; |
| 4630 | if (VPValue *BlockInMask = getMask()) { |
| 4631 | SmallVector<Value *> Ops(InterleaveFactor, State.get(Def: BlockInMask)); |
| 4632 | GroupMask = interleaveVectors(Builder&: State.Builder, Vals: Ops, Name: "interleaved.mask" ); |
| 4633 | } else { |
| 4634 | GroupMask = |
| 4635 | State.Builder.CreateVectorSplat(EC: WideVF, V: State.Builder.getTrue()); |
| 4636 | } |
| 4637 | |
| 4638 | // Vectorize the interleaved load group. |
| 4639 | if (isa<LoadInst>(Val: Instr)) { |
| 4640 | CallInst *NewLoad = State.Builder.CreateIntrinsicWithoutFolding( |
| 4641 | RetTy: VecTy, ID: Intrinsic::vp_load, Args: {ResAddr, GroupMask, InterleaveEVL}, FMFSource: nullptr, |
| 4642 | Name: "wide.vp.load" ); |
| 4643 | NewLoad->addParamAttr(ArgNo: 0, |
| 4644 | Attr: Attribute::getWithAlignment(Context&: Ctx, Alignment: Group->getAlign())); |
| 4645 | |
| 4646 | applyMetadata(I&: *NewLoad); |
| 4647 | // TODO: Also manage existing metadata using VPIRMetadata. |
| 4648 | Group->addMetadata(NewInst: NewLoad); |
| 4649 | |
| 4650 | // Scalable vectors cannot use arbitrary shufflevectors (only splats), |
| 4651 | // so must use intrinsics to deinterleave. |
| 4652 | NewLoad = State.Builder.CreateIntrinsicWithoutFolding( |
| 4653 | ID: Intrinsic::getDeinterleaveIntrinsicID(Factor: InterleaveFactor), |
| 4654 | OverloadTypes: NewLoad->getType(), Args: NewLoad, |
| 4655 | /*FMFSource=*/nullptr, Name: "strided.vec" ); |
| 4656 | |
| 4657 | const DataLayout &DL = Instr->getDataLayout(); |
| 4658 | for (unsigned I = 0, J = 0; I < InterleaveFactor; ++I) { |
| 4659 | Instruction *Member = Group->getMember(Index: I); |
| 4660 | // Skip the gaps in the group. |
| 4661 | if (!Member) |
| 4662 | continue; |
| 4663 | |
| 4664 | Value *StridedVec = State.Builder.CreateExtractValue(Agg: NewLoad, Idxs: I); |
| 4665 | // If this member has different type, cast the result type. |
| 4666 | if (Member->getType() != ScalarTy) { |
| 4667 | VectorType *OtherVTy = VectorType::get(ElementType: Member->getType(), EC: State.VF); |
| 4668 | StridedVec = |
| 4669 | createBitOrPointerCast(Builder&: State.Builder, V: StridedVec, DstVTy: OtherVTy, DL); |
| 4670 | } |
| 4671 | |
| 4672 | State.set(Def: getVPValue(I: J), V: StridedVec); |
| 4673 | ++J; |
| 4674 | } |
| 4675 | return; |
| 4676 | } // End for interleaved load. |
| 4677 | |
| 4678 | // The sub vector type for current instruction. |
| 4679 | auto *SubVT = VectorType::get(ElementType: ScalarTy, EC: State.VF); |
| 4680 | // Vectorize the interleaved store group. |
| 4681 | ArrayRef<VPValue *> StoredValues = getStoredValues(); |
| 4682 | // Collect the stored vector from each member. |
| 4683 | SmallVector<Value *, 4> StoredVecs; |
| 4684 | const DataLayout &DL = Instr->getDataLayout(); |
| 4685 | for (unsigned I = 0, StoredIdx = 0; I < InterleaveFactor; I++) { |
| 4686 | Instruction *Member = Group->getMember(Index: I); |
| 4687 | // Skip the gaps in the group. |
| 4688 | if (!Member) { |
| 4689 | StoredVecs.push_back(Elt: PoisonValue::get(T: SubVT)); |
| 4690 | continue; |
| 4691 | } |
| 4692 | |
| 4693 | Value *StoredVec = State.get(Def: StoredValues[StoredIdx]); |
| 4694 | // If this member has different type, cast it to a unified type. |
| 4695 | if (StoredVec->getType() != SubVT) |
| 4696 | StoredVec = createBitOrPointerCast(Builder&: State.Builder, V: StoredVec, DstVTy: SubVT, DL); |
| 4697 | |
| 4698 | StoredVecs.push_back(Elt: StoredVec); |
| 4699 | ++StoredIdx; |
| 4700 | } |
| 4701 | |
| 4702 | // Interleave all the smaller vectors into one wider vector. |
| 4703 | Value *IVec = interleaveVectors(Builder&: State.Builder, Vals: StoredVecs, Name: "interleaved.vec" ); |
| 4704 | CallInst *NewStore = State.Builder.CreateIntrinsicWithoutFolding( |
| 4705 | RetTy: Type::getVoidTy(C&: Ctx), ID: Intrinsic::vp_store, |
| 4706 | Args: {IVec, ResAddr, GroupMask, InterleaveEVL}); |
| 4707 | |
| 4708 | NewStore->addParamAttr(ArgNo: 1, |
| 4709 | Attr: Attribute::getWithAlignment(Context&: Ctx, Alignment: Group->getAlign())); |
| 4710 | |
| 4711 | applyMetadata(I&: *NewStore); |
| 4712 | // TODO: Also manage existing metadata using VPIRMetadata. |
| 4713 | Group->addMetadata(NewInst: NewStore); |
| 4714 | } |
| 4715 | |
| 4716 | #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) |
| 4717 | void VPInterleaveEVLRecipe::printRecipe(raw_ostream &O, const Twine &Indent, |
| 4718 | VPSlotTracker &SlotTracker) const { |
| 4719 | const InterleaveGroup<Instruction> *IG = getInterleaveGroup(); |
| 4720 | O << Indent << "INTERLEAVE-GROUP with factor " << IG->getFactor() << ", " ; |
| 4721 | getAddr()->printAsOperand(O, SlotTracker); |
| 4722 | O << ", " ; |
| 4723 | getEVL()->printAsOperand(O, SlotTracker); |
| 4724 | if (VPValue *Mask = getMask()) { |
| 4725 | O << ", " ; |
| 4726 | Mask->printAsOperand(O, SlotTracker); |
| 4727 | } |
| 4728 | |
| 4729 | unsigned OpIdx = 0; |
| 4730 | for (unsigned i = 0; i < IG->getFactor(); ++i) { |
| 4731 | if (!IG->getMember(i)) |
| 4732 | continue; |
| 4733 | if (getNumStoreOperands() > 0) { |
| 4734 | O << "\n" << Indent << " vp.store " ; |
| 4735 | getOperand(2 + OpIdx)->printAsOperand(O, SlotTracker); |
| 4736 | O << " to index " << i; |
| 4737 | } else { |
| 4738 | O << "\n" << Indent << " " ; |
| 4739 | getVPValue(OpIdx)->printAsOperand(O, SlotTracker); |
| 4740 | O << " = vp.load from index " << i; |
| 4741 | } |
| 4742 | ++OpIdx; |
| 4743 | } |
| 4744 | } |
| 4745 | #endif |
| 4746 | |
| 4747 | InstructionCost VPInterleaveBase::computeCost(ElementCount VF, |
| 4748 | VPCostContext &Ctx) const { |
| 4749 | Instruction *InsertPos = getInsertPos(); |
| 4750 | // Find the VPValue index of the interleave group. We need to skip gaps. |
| 4751 | unsigned InsertPosIdx = 0; |
| 4752 | for (unsigned Idx = 0; IG->getFactor(); ++Idx) |
| 4753 | if (auto *Member = IG->getMember(Index: Idx)) { |
| 4754 | if (Member == InsertPos) |
| 4755 | break; |
| 4756 | InsertPosIdx++; |
| 4757 | } |
| 4758 | const VPValue *ValV = getNumDefinedValues() > 0 |
| 4759 | ? getVPValue(I: InsertPosIdx) |
| 4760 | : getStoredValues()[InsertPosIdx]; |
| 4761 | Type *ValTy = ValV->getScalarType(); |
| 4762 | auto *VectorTy = cast<VectorType>(Val: toVectorTy(Scalar: ValTy, EC: VF)); |
| 4763 | unsigned AS = |
| 4764 | cast<PointerType>(Val: getAddr()->getScalarType())->getAddressSpace(); |
| 4765 | |
| 4766 | unsigned InterleaveFactor = IG->getFactor(); |
| 4767 | auto *WideVecTy = VectorType::get(ElementType: ValTy, EC: VF * InterleaveFactor); |
| 4768 | |
| 4769 | // Holds the indices of existing members in the interleaved group. |
| 4770 | SmallVector<unsigned, 4> Indices; |
| 4771 | for (unsigned IF = 0; IF < InterleaveFactor; IF++) |
| 4772 | if (IG->getMember(Index: IF)) |
| 4773 | Indices.push_back(Elt: IF); |
| 4774 | |
| 4775 | // Calculate the cost of the whole interleaved group. |
| 4776 | InstructionCost Cost = Ctx.TTI.getInterleavedMemoryOpCost( |
| 4777 | Opcode: InsertPos->getOpcode(), VecTy: WideVecTy, Factor: IG->getFactor(), Indices, |
| 4778 | Alignment: IG->getAlign(), AddressSpace: AS, CostKind: Ctx.CostKind, UseMaskForCond: getMask(), UseMaskForGaps: NeedsMaskForGaps); |
| 4779 | |
| 4780 | if (!IG->isReverse()) |
| 4781 | return Cost; |
| 4782 | |
| 4783 | return Cost + IG->getNumMembers() * |
| 4784 | Ctx.TTI.getShuffleCost(Kind: TargetTransformInfo::SK_Reverse, |
| 4785 | DstTy: VectorTy, SrcTy: VectorTy, Mask: {}, CostKind: Ctx.CostKind, |
| 4786 | Index: 0); |
| 4787 | } |
| 4788 | |
| 4789 | bool VPWidenPointerInductionRecipe::onlyScalarsGenerated(bool IsScalable) { |
| 4790 | return vputils::onlyScalarValuesUsed(Def: this) && |
| 4791 | (!IsScalable || vputils::onlyFirstLaneUsed(Def: this)); |
| 4792 | } |
| 4793 | |
| 4794 | #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) |
| 4795 | void VPWidenPointerInductionRecipe::printRecipe( |
| 4796 | raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const { |
| 4797 | assert((getNumOperands() == 3 || getNumOperands() == 5) && |
| 4798 | "unexpected number of operands" ); |
| 4799 | O << Indent << "EMIT " ; |
| 4800 | printAsOperand(O, SlotTracker); |
| 4801 | O << " = WIDEN-POINTER-INDUCTION " ; |
| 4802 | getStartValue()->printAsOperand(O, SlotTracker); |
| 4803 | O << ", " ; |
| 4804 | getStepValue()->printAsOperand(O, SlotTracker); |
| 4805 | O << ", " ; |
| 4806 | getOperand(2)->printAsOperand(O, SlotTracker); |
| 4807 | if (getNumOperands() == 5) { |
| 4808 | O << ", " ; |
| 4809 | getOperand(3)->printAsOperand(O, SlotTracker); |
| 4810 | O << ", " ; |
| 4811 | getOperand(4)->printAsOperand(O, SlotTracker); |
| 4812 | } |
| 4813 | } |
| 4814 | |
| 4815 | void VPExpandSCEVRecipe::printRecipe(raw_ostream &O, const Twine &Indent, |
| 4816 | VPSlotTracker &SlotTracker) const { |
| 4817 | O << Indent << "EMIT " ; |
| 4818 | printAsOperand(O, SlotTracker); |
| 4819 | O << " = EXPAND SCEV " << *Expr; |
| 4820 | } |
| 4821 | #endif |
| 4822 | |
| 4823 | #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) |
| 4824 | void VPWidenCanonicalIVRecipe::printRecipe(raw_ostream &O, const Twine &Indent, |
| 4825 | VPSlotTracker &SlotTracker) const { |
| 4826 | O << Indent << "EMIT " ; |
| 4827 | printAsOperand(O, SlotTracker); |
| 4828 | O << " = WIDEN-CANONICAL-INDUCTION" ; |
| 4829 | printFlags(O); |
| 4830 | printOperands(O, SlotTracker); |
| 4831 | } |
| 4832 | #endif |
| 4833 | |
| 4834 | void VPFirstOrderRecurrencePHIRecipe::execute(VPTransformState &State) { |
| 4835 | auto &Builder = State.Builder; |
| 4836 | // Create a vector from the initial value. |
| 4837 | auto *VectorInit = getStartValue()->getLiveInIRValue(); |
| 4838 | |
| 4839 | Type *VecTy = State.VF.isScalar() |
| 4840 | ? VectorInit->getType() |
| 4841 | : VectorType::get(ElementType: VectorInit->getType(), EC: State.VF); |
| 4842 | |
| 4843 | BasicBlock *VectorPH = |
| 4844 | State.CFG.VPBB2IRBB.at(Val: getParent()->getCFGPredecessor(Idx: 0)); |
| 4845 | if (State.VF.isVector()) { |
| 4846 | auto *IdxTy = Builder.getInt32Ty(); |
| 4847 | auto *One = ConstantInt::get(Ty: IdxTy, V: 1); |
| 4848 | IRBuilder<>::InsertPointGuard Guard(Builder); |
| 4849 | Builder.SetInsertPoint(VectorPH->getTerminator()); |
| 4850 | auto *RuntimeVF = getRuntimeVF(B&: Builder, Ty: IdxTy, VF: State.VF); |
| 4851 | auto *LastIdx = Builder.CreateSub(LHS: RuntimeVF, RHS: One); |
| 4852 | VectorInit = Builder.CreateInsertElement( |
| 4853 | Vec: PoisonValue::get(T: VecTy), NewElt: VectorInit, Idx: LastIdx, Name: "vector.recur.init" ); |
| 4854 | } |
| 4855 | |
| 4856 | // Create a phi node for the new recurrence. |
| 4857 | PHINode *Phi = PHINode::Create(Ty: VecTy, NumReservedValues: 2, NameStr: "vector.recur" ); |
| 4858 | Phi->insertBefore(InsertPos: State.CFG.PrevBB->getFirstInsertionPt()); |
| 4859 | Phi->addIncoming(V: VectorInit, BB: VectorPH); |
| 4860 | State.set(Def: this, V: Phi); |
| 4861 | } |
| 4862 | |
| 4863 | InstructionCost |
| 4864 | VPFirstOrderRecurrencePHIRecipe::computeCost(ElementCount VF, |
| 4865 | VPCostContext &Ctx) const { |
| 4866 | if (VF.isScalar()) |
| 4867 | return Ctx.TTI.getCFInstrCost(Opcode: Instruction::PHI, CostKind: Ctx.CostKind); |
| 4868 | |
| 4869 | return 0; |
| 4870 | } |
| 4871 | |
| 4872 | #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) |
| 4873 | void VPFirstOrderRecurrencePHIRecipe::printRecipe( |
| 4874 | raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const { |
| 4875 | O << Indent << "FIRST-ORDER-RECURRENCE-PHI " ; |
| 4876 | printAsOperand(O, SlotTracker); |
| 4877 | O << " = phi " ; |
| 4878 | printOperands(O, SlotTracker); |
| 4879 | } |
| 4880 | #endif |
| 4881 | |
| 4882 | void VPReductionPHIRecipe::execute(VPTransformState &State) { |
| 4883 | // Reductions do not have to start at zero. They can start with |
| 4884 | // any loop invariant values. |
| 4885 | VPValue *StartVPV = getStartValue(); |
| 4886 | |
| 4887 | // In order to support recurrences we need to be able to vectorize Phi nodes. |
| 4888 | // Phi nodes have cycles, so we need to vectorize them in two stages. This is |
| 4889 | // stage #1: We create a new vector PHI node with no incoming edges. We'll use |
| 4890 | // this value when we vectorize all of the instructions that use the PHI. |
| 4891 | BasicBlock *VectorPH = |
| 4892 | State.CFG.VPBB2IRBB.at(Val: getParent()->getCFGPredecessor(Idx: 0)); |
| 4893 | bool ScalarPHI = State.VF.isScalar() || isInLoop(); |
| 4894 | Value *StartV = State.get(Def: StartVPV, IsScalar: ScalarPHI); |
| 4895 | Type *VecTy = StartV->getType(); |
| 4896 | |
| 4897 | BasicBlock * = State.CFG.PrevBB; |
| 4898 | assert(State.CurrentParentLoop->getHeader() == HeaderBB && |
| 4899 | "recipe must be in the vector loop header" ); |
| 4900 | auto *Phi = PHINode::Create(Ty: VecTy, NumReservedValues: 2, NameStr: "vec.phi" ); |
| 4901 | Phi->insertBefore(InsertPos: HeaderBB->getFirstInsertionPt()); |
| 4902 | State.set(Def: this, V: Phi, IsScalar: isInLoop()); |
| 4903 | |
| 4904 | Phi->addIncoming(V: StartV, BB: VectorPH); |
| 4905 | } |
| 4906 | |
| 4907 | #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) |
| 4908 | void VPReductionPHIRecipe::printRecipe(raw_ostream &O, const Twine &Indent, |
| 4909 | VPSlotTracker &SlotTracker) const { |
| 4910 | O << Indent << "WIDEN-REDUCTION-PHI " ; |
| 4911 | |
| 4912 | printAsOperand(O, SlotTracker); |
| 4913 | O << " = phi (" ; |
| 4914 | printRecurrenceKind(O, Kind); |
| 4915 | O << ")" ; |
| 4916 | printFlags(O); |
| 4917 | printOperands(O, SlotTracker); |
| 4918 | if (getVFScaleFactor() > 1) |
| 4919 | O << " (VF scaled by 1/" << getVFScaleFactor() << ")" ; |
| 4920 | } |
| 4921 | #endif |
| 4922 | |
| 4923 | bool VPBlendRecipe::usesFirstLaneOnly(const VPValue *Op) const { |
| 4924 | assert(is_contained(operands(), Op) && "Op must be an operand of the recipe" ); |
| 4925 | return vputils::onlyFirstLaneUsed(Def: this); |
| 4926 | } |
| 4927 | |
| 4928 | void VPWidenPHIRecipe::execute(VPTransformState &State) { |
| 4929 | executePhiRecipe(R: this, Phi&: *this, State, /*IsScalar=*/false, Name); |
| 4930 | } |
| 4931 | |
| 4932 | InstructionCost VPWidenPHIRecipe::computeCost(ElementCount VF, |
| 4933 | VPCostContext &Ctx) const { |
| 4934 | return Ctx.TTI.getCFInstrCost(Opcode: Instruction::PHI, CostKind: Ctx.CostKind); |
| 4935 | } |
| 4936 | |
| 4937 | #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) |
| 4938 | void VPWidenPHIRecipe::printRecipe(raw_ostream &O, const Twine &Indent, |
| 4939 | VPSlotTracker &SlotTracker) const { |
| 4940 | O << Indent << "WIDEN-PHI " ; |
| 4941 | |
| 4942 | printAsOperand(O, SlotTracker); |
| 4943 | O << " = phi " ; |
| 4944 | printPhiOperands(O, SlotTracker); |
| 4945 | } |
| 4946 | #endif |
| 4947 | |
| 4948 | void VPActiveLaneMaskPHIRecipe::execute(VPTransformState &State) { |
| 4949 | BasicBlock *VectorPH = |
| 4950 | State.CFG.VPBB2IRBB.at(Val: getParent()->getCFGPredecessor(Idx: 0)); |
| 4951 | Value *StartMask = State.get(Def: getOperand(N: 0)); |
| 4952 | PHINode *Phi = |
| 4953 | State.Builder.CreatePHI(Ty: StartMask->getType(), NumReservedValues: 2, Name: "active.lane.mask" ); |
| 4954 | Phi->addIncoming(V: StartMask, BB: VectorPH); |
| 4955 | State.set(Def: this, V: Phi); |
| 4956 | } |
| 4957 | |
| 4958 | #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) |
| 4959 | void VPActiveLaneMaskPHIRecipe::printRecipe(raw_ostream &O, const Twine &Indent, |
| 4960 | VPSlotTracker &SlotTracker) const { |
| 4961 | O << Indent << "ACTIVE-LANE-MASK-PHI " ; |
| 4962 | |
| 4963 | printAsOperand(O, SlotTracker); |
| 4964 | O << " = phi " ; |
| 4965 | printOperands(O, SlotTracker); |
| 4966 | } |
| 4967 | #endif |
| 4968 | |
| 4969 | #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) |
| 4970 | void VPCurrentIterationPHIRecipe::printRecipe( |
| 4971 | raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const { |
| 4972 | O << Indent << "CURRENT-ITERATION-PHI " ; |
| 4973 | |
| 4974 | printAsOperand(O, SlotTracker); |
| 4975 | O << " = phi " ; |
| 4976 | printOperands(O, SlotTracker); |
| 4977 | } |
| 4978 | #endif |
| 4979 | |