//===----- CodeGen/ExpandVectorPredication.cpp - Expand VP intrinsics -----===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file implements IR expansion for vector predication intrinsics, allowing
// targets to enable vector predication until just before codegen.
//
//===----------------------------------------------------------------------===//

#include "llvm/CodeGen/ExpandVectorPredication.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/Analysis/VectorUtils.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Compiler.h"
#include "llvm/Support/Debug.h"
#include "llvm/Transforms/Utils/LoopUtils.h"
#include <optional>

using namespace llvm;

using VPLegalization = TargetTransformInfo::VPLegalization;
using VPTransform = TargetTransformInfo::VPLegalization::VPTransform;

// Keep this in sync with TargetTransformInfo::VPLegalization.
#define VPINTERNAL_VPLEGAL_CASES                                               \
  VPINTERNAL_CASE(Legal)                                                       \
  VPINTERNAL_CASE(Discard)                                                     \
  VPINTERNAL_CASE(Convert)

#define VPINTERNAL_CASE(X) "|" #X

// Override options.
static cl::opt<std::string> EVLTransformOverride(
    "expandvp-override-evl-transform", cl::init(""), cl::Hidden,
    cl::desc("Options: <empty>" VPINTERNAL_VPLEGAL_CASES
             ". If non-empty, ignore "
             "TargetTransformInfo and "
             "always use this transformation for the %evl parameter (Used in "
             "testing)."));

static cl::opt<std::string> MaskTransformOverride(
    "expandvp-override-mask-transform", cl::init(""), cl::Hidden,
    cl::desc("Options: <empty>" VPINTERNAL_VPLEGAL_CASES
             ". If non-empty, ignore "
             "TargetTransformInfo and "
             "always use this transformation for the %mask parameter (Used in "
             "testing)."));

#undef VPINTERNAL_CASE
#define VPINTERNAL_CASE(X) .Case(#X, VPLegalization::X)

static VPTransform parseOverrideOption(const std::string &TextOpt) {
  return StringSwitch<VPTransform>(TextOpt) VPINTERNAL_VPLEGAL_CASES;
}

#undef VPINTERNAL_VPLEGAL_CASES

// Whether any override options are set.
static bool anyExpandVPOverridesSet() {
  return !EVLTransformOverride.empty() || !MaskTransformOverride.empty();
}

#define DEBUG_TYPE "expandvp"

STATISTIC(NumFoldedVL, "Number of folded vector length params");
STATISTIC(NumLoweredVPOps, "Number of lowered vector predication operations");

///// Helpers {

/// \returns Whether the vector mask \p MaskVal has all lane bits set.
static bool isAllTrueMask(Value *MaskVal) {
  if (Value *SplattedVal = getSplatValue(MaskVal))
    if (auto *ConstValue = dyn_cast<Constant>(SplattedVal))
      return ConstValue->isAllOnesValue();

  return false;
}

/// \returns A non-excepting divisor constant for this type.
static Constant *getSafeDivisor(Type *DivTy) {
  assert(DivTy->isIntOrIntVectorTy() && "Unsupported divisor type");
  return ConstantInt::get(DivTy, 1u, false);
}

/// Transfer operation properties from \p VPI to \p NewVal.
static void transferDecorations(Value &NewVal, VPIntrinsic &VPI) {
  auto *NewInst = dyn_cast<Instruction>(&NewVal);
  if (!NewInst || !isa<FPMathOperator>(NewVal))
    return;

  auto *OldFMOp = dyn_cast<FPMathOperator>(&VPI);
  if (!OldFMOp)
    return;

  NewInst->setFastMathFlags(OldFMOp->getFastMathFlags());
}

/// Transfer all properties from \p OldOp to \p NewOp and replace all uses.
/// \p OldOp gets erased.
static void replaceOperation(Value &NewOp, VPIntrinsic &OldOp) {
  transferDecorations(NewOp, OldOp);
  OldOp.replaceAllUsesWith(&NewOp);
  OldOp.eraseFromParent();
}

static bool maySpeculateLanes(VPIntrinsic &VPI) {
  // The result of VP reductions depends on the mask and evl.
  if (isa<VPReductionIntrinsic>(VPI))
    return false;
  // Fallback to whether the intrinsic is speculatable.
  if (auto IntrID = VPI.getFunctionalIntrinsicID())
    return Intrinsic::getFnAttributes(VPI.getContext(), *IntrID)
        .hasAttribute(Attribute::AttrKind::Speculatable);
  if (auto Opc = VPI.getFunctionalOpcode())
    return isSafeToSpeculativelyExecuteWithOpcode(*Opc, &VPI);
  return false;
}

//// } Helpers

namespace {

// Expansion pass state at function scope.
struct CachingVPExpander {
  const TargetTransformInfo &TTI;

  /// \returns A bitmask that is true where the lane position is less than
  /// \p EVLParam.
  ///
  /// \p Builder
  ///    Used for instruction creation.
  /// \p EVLParam
  ///    The explicit vector length parameter to test against the lane
  ///    positions.
  /// \p ElemCount
  ///    Static (potentially scalable) number of vector elements.
  Value *convertEVLToMask(IRBuilder<> &Builder, Value *EVLParam,
                          ElementCount ElemCount);

  /// If needed, folds the EVL in the mask operand and discards the EVL
  /// parameter. Returns a pair of the value of the intrinsic after the change
  /// (if any) and whether the mask was actually folded.
  std::pair<Value *, bool> foldEVLIntoMask(VPIntrinsic &VPI);

  /// "Remove" the %evl parameter of \p PI by setting it to the static vector
  /// length of the operation. Returns true if the %evl (if any) was effectively
  /// changed.
  bool discardEVLParameter(VPIntrinsic &PI);

  /// Lower this VP binary operator to an unpredicated binary operator.
  Value *expandPredicationInBinaryOperator(IRBuilder<> &Builder,
                                           VPIntrinsic &PI);

  /// Lower this VP int call to an unpredicated int call.
  Value *expandPredicationToIntCall(IRBuilder<> &Builder, VPIntrinsic &PI);

  /// Lower this VP fp call to an unpredicated fp call.
  Value *expandPredicationToFPCall(IRBuilder<> &Builder, VPIntrinsic &PI,
                                   unsigned UnpredicatedIntrinsicID);

  /// Lower this VP reduction to a call to an unpredicated reduction intrinsic.
  Value *expandPredicationInReduction(IRBuilder<> &Builder,
                                      VPReductionIntrinsic &PI);

  /// Lower this VP cast operation to a non-VP intrinsic.
  Value *expandPredicationToCastIntrinsic(IRBuilder<> &Builder,
                                          VPIntrinsic &VPI);

  /// Lower this VP memory operation to a non-VP intrinsic.
  Value *expandPredicationInMemoryIntrinsic(IRBuilder<> &Builder,
                                            VPIntrinsic &VPI);

  /// Lower this VP comparison to an unpredicated comparison.
  Value *expandPredicationInComparison(IRBuilder<> &Builder,
                                       VPCmpIntrinsic &PI);

  /// Query TTI and expand the vector predication in \p PI accordingly.
  Value *expandPredication(VPIntrinsic &PI);

  /// Determine how and whether the VPIntrinsic \p VPI shall be expanded. This
  /// overrides TTI with the cl::opts listed at the top of this file.
  VPLegalization getVPLegalizationStrategy(const VPIntrinsic &VPI) const;
  bool UsingTTIOverrides;

public:
  CachingVPExpander(const TargetTransformInfo &TTI)
      : TTI(TTI), UsingTTIOverrides(anyExpandVPOverridesSet()) {}

  /// Expand llvm.vp.* intrinsics as requested by \p TTI.
  /// Returns the details of the expansion.
  VPExpansionDetails expandVectorPredication(VPIntrinsic &VPI);
};

//// CachingVPExpander {

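// Rough illustration of what convertEVLToMask() emits (value names below are
// made up for this sketch, not taken from the actual output). For a fixed
// <4 x i32> operation with an %evl of 3, the fixed-width path builds:
//
//   %step  = stepvector              ; <4 x i32> <0, 1, 2, 3>
//   %splat = splat of %evl           ; <4 x i32> <3, 3, 3, 3>
//   %mask  = icmp ult %step, %splat  ; <4 x i1> <1, 1, 1, 0>
//
// The scalable path instead emits a single call to
// @llvm.get_active_lane_mask(0, %evl), which performs the same
// "lane index less than %evl" comparison implicitly.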
Value *CachingVPExpander::convertEVLToMask(IRBuilder<> &Builder,
                                           Value *EVLParam,
                                           ElementCount ElemCount) {
  // TODO add caching
  // Scalable vector %evl conversion.
  if (ElemCount.isScalable()) {
    Type *BoolVecTy = VectorType::get(Builder.getInt1Ty(), ElemCount);
    // `get_active_lane_mask` performs an implicit less-than comparison.
    Value *ConstZero = Builder.getInt32(0);
    return Builder.CreateIntrinsic(Intrinsic::get_active_lane_mask,
                                   {BoolVecTy, EVLParam->getType()},
                                   {ConstZero, EVLParam});
  }

  // Fixed vector %evl conversion.
  Type *LaneTy = EVLParam->getType();
  unsigned NumElems = ElemCount.getFixedValue();
  Value *VLSplat = Builder.CreateVectorSplat(NumElems, EVLParam);
  Value *IdxVec = Builder.CreateStepVector(VectorType::get(LaneTy, ElemCount));
  return Builder.CreateICmp(CmpInst::ICMP_ULT, IdxVec, VLSplat);
}

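// Rough sketch of the binary-operator expansion below. A masked division such
// as (operand names illustrative):
//
//   %r = call <4 x i32> @llvm.vp.udiv.v4i32(<4 x i32> %a, <4 x i32> %b,
//                                           <4 x i1> %m, i32 %evl)
//
// becomes an unpredicated udiv whose masked-off divisor lanes are replaced by
// the safe value 1 so the division cannot trap:
//
//   %safe_b = select <4 x i1> %m, <4 x i32> %b, <4 x i32> splat (i32 1)
//   %r      = udiv <4 x i32> %a, %safe_b
//
// Non-trapping operators (add, mul, ...) skip the select entirely.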
Value *
CachingVPExpander::expandPredicationInBinaryOperator(IRBuilder<> &Builder,
                                                     VPIntrinsic &VPI) {
  assert((maySpeculateLanes(VPI) || VPI.canIgnoreVectorLengthParam()) &&
         "Implicitly dropping %evl in non-speculatable operator!");

  auto OC = static_cast<Instruction::BinaryOps>(*VPI.getFunctionalOpcode());
  assert(Instruction::isBinaryOp(OC));

  Value *Op0 = VPI.getOperand(0);
  Value *Op1 = VPI.getOperand(1);
  Value *Mask = VPI.getMaskParam();

  // Blend in safe operands.
  if (Mask && !isAllTrueMask(Mask)) {
    switch (OC) {
    default:
      // Can safely ignore the predicate.
      break;

    // Division operators need a safe divisor on masked-off lanes (1).
    case Instruction::UDiv:
    case Instruction::SDiv:
    case Instruction::URem:
    case Instruction::SRem:
      // 2nd operand must not be zero.
      Value *SafeDivisor = getSafeDivisor(VPI.getType());
      Op1 = Builder.CreateSelect(Mask, Op1, SafeDivisor);
    }
  }

  Value *NewBinOp = Builder.CreateBinOp(OC, Op0, Op1, VPI.getName());

  replaceOperation(*NewBinOp, VPI);
  return NewBinOp;
}

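// Rough sketch of the intrinsic-call expansion below: every operand except the
// trailing %mask, %evl, and the callee operand of the call is forwarded to the
// matching non-VP intrinsic, e.g. (names illustrative):
//
//   %r = call <4 x i32> @llvm.vp.smax.v4i32(<4 x i32> %a, <4 x i32> %b,
//                                           <4 x i1> %m, i32 %evl)
//   -->
//   %r = call <4 x i32> @llvm.smax.v4i32(<4 x i32> %a, <4 x i32> %b)
//
// Dropping the predication is acceptable here because the lowered intrinsics
// do not trap and the callers only reach this path once %evl has been made
// ineffective.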
Value *CachingVPExpander::expandPredicationToIntCall(IRBuilder<> &Builder,
                                                     VPIntrinsic &VPI) {
  std::optional<unsigned> FID = VPI.getFunctionalIntrinsicID();
  if (!FID)
    return nullptr;
  SmallVector<Value *, 2> Argument;
  for (unsigned i = 0; i < VPI.getNumOperands() - 3; i++) {
    Argument.push_back(VPI.getOperand(i));
  }
  Value *NewOp = Builder.CreateIntrinsic(FID.value(), {VPI.getType()}, Argument,
                                         /*FMFSource=*/nullptr, VPI.getName());
  replaceOperation(*NewOp, VPI);
  return NewOp;
}

Value *CachingVPExpander::expandPredicationToFPCall(
    IRBuilder<> &Builder, VPIntrinsic &VPI, unsigned UnpredicatedIntrinsicID) {
  assert((maySpeculateLanes(VPI) || VPI.canIgnoreVectorLengthParam()) &&
         "Implicitly dropping %evl in non-speculatable operator!");

  switch (UnpredicatedIntrinsicID) {
  case Intrinsic::fabs:
  case Intrinsic::sqrt:
  case Intrinsic::maxnum:
  case Intrinsic::minnum: {
    SmallVector<Value *, 2> Argument;
    for (unsigned i = 0; i < VPI.getNumOperands() - 3; i++) {
      Argument.push_back(VPI.getOperand(i));
    }
    Value *NewOp = Builder.CreateIntrinsic(
        UnpredicatedIntrinsicID, {VPI.getType()}, Argument,
        /*FMFSource=*/nullptr, VPI.getName());
    replaceOperation(*NewOp, VPI);
    return NewOp;
  }
  case Intrinsic::fma:
  case Intrinsic::fmuladd:
  case Intrinsic::experimental_constrained_fma:
  case Intrinsic::experimental_constrained_fmuladd: {
    Value *Op0 = VPI.getOperand(0);
    Value *Op1 = VPI.getOperand(1);
    Value *Op2 = VPI.getOperand(2);
    Function *Fn = Intrinsic::getOrInsertDeclaration(
        VPI.getModule(), UnpredicatedIntrinsicID, {VPI.getType()});
    Value *NewOp;
    if (Intrinsic::isConstrainedFPIntrinsic(UnpredicatedIntrinsicID))
      NewOp =
          Builder.CreateConstrainedFPCall(Fn, {Op0, Op1, Op2}, VPI.getName());
    else
      NewOp = Builder.CreateCall(Fn, {Op0, Op1, Op2}, VPI.getName());
    replaceOperation(*NewOp, VPI);
    return NewOp;
  }
  }

  return nullptr;
}

static Value *getNeutralReductionElement(const VPReductionIntrinsic &VPI,
                                         Type *EltTy) {
  Intrinsic::ID RdxID = *VPI.getFunctionalIntrinsicID();
  FastMathFlags FMF;
  if (isa<FPMathOperator>(VPI))
    FMF = VPI.getFastMathFlags();
  return getReductionIdentity(RdxID, EltTy, FMF);
}

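// Rough sketch of the reduction expansion below, using vp.reduce.add as the
// example (names illustrative): masked-off lanes are first overwritten with
// the neutral element of the reduction (0 for add), the unpredicated reduction
// intrinsic is applied, and the start value is folded in afterwards:
//
//   %r = call i32 @llvm.vp.reduce.add.v4i32(i32 %start, <4 x i32> %v,
//                                           <4 x i1> %m, i32 %evl)
//   -->
//   %blend = select <4 x i1> %m, <4 x i32> %v, <4 x i32> zeroinitializer
//   %rdx   = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %blend)
//   %r     = add i32 %rdx, %start
//
// Min/max reductions combine the start value with a scalar smax/umin/... call
// instead, and fadd/fmul use the reduction intrinsics that already take a
// start value.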
Value *
CachingVPExpander::expandPredicationInReduction(IRBuilder<> &Builder,
                                                VPReductionIntrinsic &VPI) {
  assert((maySpeculateLanes(VPI) || VPI.canIgnoreVectorLengthParam()) &&
         "Implicitly dropping %evl in non-speculatable operator!");

  Value *Mask = VPI.getMaskParam();
  Value *RedOp = VPI.getOperand(VPI.getVectorParamPos());

  // Insert neutral element in masked-out positions
  if (Mask && !isAllTrueMask(Mask)) {
    auto *NeutralElt = getNeutralReductionElement(VPI, VPI.getType());
    auto *NeutralVector = Builder.CreateVectorSplat(
        cast<VectorType>(RedOp->getType())->getElementCount(), NeutralElt);
    RedOp = Builder.CreateSelect(Mask, RedOp, NeutralVector);
  }

  Value *Reduction;
  Value *Start = VPI.getOperand(VPI.getStartParamPos());

  switch (VPI.getIntrinsicID()) {
  default:
    llvm_unreachable("Impossible reduction kind");
  case Intrinsic::vp_reduce_add:
  case Intrinsic::vp_reduce_mul:
  case Intrinsic::vp_reduce_and:
  case Intrinsic::vp_reduce_or:
  case Intrinsic::vp_reduce_xor: {
    Intrinsic::ID RedID = *VPI.getFunctionalIntrinsicID();
    unsigned Opc = getArithmeticReductionInstruction(RedID);
    assert(Instruction::isBinaryOp(Opc));
    Reduction = Builder.CreateUnaryIntrinsic(RedID, RedOp);
    Reduction =
        Builder.CreateBinOp((Instruction::BinaryOps)Opc, Reduction, Start);
    break;
  }
  case Intrinsic::vp_reduce_smax:
  case Intrinsic::vp_reduce_smin:
  case Intrinsic::vp_reduce_umax:
  case Intrinsic::vp_reduce_umin:
  case Intrinsic::vp_reduce_fmax:
  case Intrinsic::vp_reduce_fmin:
  case Intrinsic::vp_reduce_fmaximum:
  case Intrinsic::vp_reduce_fminimum: {
    Intrinsic::ID RedID = *VPI.getFunctionalIntrinsicID();
    Intrinsic::ID ScalarID = getMinMaxReductionIntrinsicOp(RedID);
    Reduction = Builder.CreateUnaryIntrinsic(RedID, RedOp);
    transferDecorations(*Reduction, VPI);
    Reduction = Builder.CreateBinaryIntrinsic(ScalarID, Reduction, Start);
    break;
  }
  case Intrinsic::vp_reduce_fadd:
    Reduction = Builder.CreateFAddReduce(Start, RedOp);
    break;
  case Intrinsic::vp_reduce_fmul:
    Reduction = Builder.CreateFMulReduce(Start, RedOp);
    break;
  }

  replaceOperation(*Reduction, VPI);
  return Reduction;
}

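// Rough sketch of the cast expansion below (names illustrative): the VP cast
// simply becomes the corresponding cast instruction, dropping %mask and %evl:
//
//   %r = call <4 x i64> @llvm.vp.sext.v4i64.v4i32(<4 x i32> %a,
//                                                 <4 x i1> %m, i32 %evl)
//   -->
//   %r = sext <4 x i32> %a to <4 x i64>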
Value *CachingVPExpander::expandPredicationToCastIntrinsic(IRBuilder<> &Builder,
                                                           VPIntrinsic &VPI) {
  Intrinsic::ID VPID = VPI.getIntrinsicID();
  unsigned CastOpcode = VPIntrinsic::getFunctionalOpcodeForVP(VPID).value();
  assert(Instruction::isCast(CastOpcode));
  Value *CastOp =
      Builder.CreateCast(Instruction::CastOps(CastOpcode), VPI.getOperand(0),
                         VPI.getType(), VPI.getName());

  replaceOperation(*CastOp, VPI);
  return CastOp;
}

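// Rough sketch of the memory expansion below (names illustrative): vp.load and
// vp.store turn into plain or masked load/store depending on the mask, and
// vp.gather/vp.scatter turn into their masked counterparts, e.g.
//
//   call void @llvm.vp.store.v4i32.p0(<4 x i32> %v, ptr %p,
//                                     <4 x i1> %m, i32 %evl)
//   -->
//   call void @llvm.masked.store.v4i32.p0(<4 x i32> %v, ptr %p, i32 4,
//                                         <4 x i1> %m)
//
// With an all-true mask the masked variant is skipped and an ordinary
// load/store is emitted instead. Note that this expansion requires %evl to be
// ineffective already (see the assert below), since the masked intrinsics have
// no vector-length operand.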
Value *
CachingVPExpander::expandPredicationInMemoryIntrinsic(IRBuilder<> &Builder,
                                                      VPIntrinsic &VPI) {
  assert(VPI.canIgnoreVectorLengthParam());

  const auto &DL = VPI.getDataLayout();

  Value *MaskParam = VPI.getMaskParam();
  Value *PtrParam = VPI.getMemoryPointerParam();
  Value *DataParam = VPI.getMemoryDataParam();
  bool IsUnmasked = isAllTrueMask(MaskParam);

  MaybeAlign AlignOpt = VPI.getPointerAlignment();

  Value *NewMemoryInst = nullptr;
  switch (VPI.getIntrinsicID()) {
  default:
    llvm_unreachable("Not a VP memory intrinsic");
  case Intrinsic::vp_store:
    if (IsUnmasked) {
      StoreInst *NewStore =
          Builder.CreateStore(DataParam, PtrParam, /*IsVolatile*/ false);
      if (AlignOpt.has_value())
        NewStore->setAlignment(*AlignOpt);
      NewMemoryInst = NewStore;
    } else
      NewMemoryInst = Builder.CreateMaskedStore(
          DataParam, PtrParam, AlignOpt.valueOrOne(), MaskParam);

    break;
  case Intrinsic::vp_load:
    if (IsUnmasked) {
      LoadInst *NewLoad =
          Builder.CreateLoad(VPI.getType(), PtrParam, /*IsVolatile*/ false);
      if (AlignOpt.has_value())
        NewLoad->setAlignment(*AlignOpt);
      NewMemoryInst = NewLoad;
    } else
      NewMemoryInst = Builder.CreateMaskedLoad(
          VPI.getType(), PtrParam, AlignOpt.valueOrOne(), MaskParam);

    break;
  case Intrinsic::vp_scatter: {
    auto *ElementType =
        cast<VectorType>(DataParam->getType())->getElementType();
    NewMemoryInst = Builder.CreateMaskedScatter(
        DataParam, PtrParam,
        AlignOpt.value_or(DL.getPrefTypeAlign(ElementType)), MaskParam);
    break;
  }
  case Intrinsic::vp_gather: {
    auto *ElementType = cast<VectorType>(VPI.getType())->getElementType();
    NewMemoryInst = Builder.CreateMaskedGather(
        VPI.getType(), PtrParam,
        AlignOpt.value_or(DL.getPrefTypeAlign(ElementType)), MaskParam, nullptr,
        VPI.getName());
    break;
  }
  }

  assert(NewMemoryInst);
  replaceOperation(*NewMemoryInst, VPI);
  return NewMemoryInst;
}

Value *CachingVPExpander::expandPredicationInComparison(IRBuilder<> &Builder,
                                                        VPCmpIntrinsic &VPI) {
  assert((maySpeculateLanes(VPI) || VPI.canIgnoreVectorLengthParam()) &&
         "Implicitly dropping %evl in non-speculatable operator!");

  assert(*VPI.getFunctionalOpcode() == Instruction::ICmp ||
         *VPI.getFunctionalOpcode() == Instruction::FCmp);

  Value *Op0 = VPI.getOperand(0);
  Value *Op1 = VPI.getOperand(1);
  auto Pred = VPI.getPredicate();

  auto *NewCmp = Builder.CreateCmp(Pred, Op0, Op1);

  replaceOperation(*NewCmp, VPI);
  return NewCmp;
}

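// Rough sketch of discardEVLParameter() (names illustrative): the %evl operand
// is overwritten with the static vector length, which makes it trivially
// ineffective. For a fixed <4 x i32> operation %evl simply becomes the
// constant 4; for a scalable <vscale x 2 x i32> operation the code below
// materializes it as
//
//   %vscale        = call i32 @llvm.vscale.i32()
//   %scalable_size = mul nuw i32 %vscale, 2
//
// and uses %scalable_size as the new %evl.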
bool CachingVPExpander::discardEVLParameter(VPIntrinsic &VPI) {
  LLVM_DEBUG(dbgs() << "Discard EVL parameter in " << VPI << "\n");

  if (VPI.canIgnoreVectorLengthParam())
    return false;

  Value *EVLParam = VPI.getVectorLengthParam();
  if (!EVLParam)
    return false;

  ElementCount StaticElemCount = VPI.getStaticVectorLength();
  Value *MaxEVL = nullptr;
  Type *Int32Ty = Type::getInt32Ty(VPI.getContext());
  if (StaticElemCount.isScalable()) {
    // TODO add caching
    IRBuilder<> Builder(VPI.getParent(), VPI.getIterator());
    Value *FactorConst = Builder.getInt32(StaticElemCount.getKnownMinValue());
    Value *VScale = Builder.CreateVScale(Int32Ty, "vscale");
    MaxEVL = Builder.CreateMul(VScale, FactorConst, "scalable_size",
                               /*NUW*/ true, /*NSW*/ false);
  } else {
    MaxEVL = ConstantInt::get(Int32Ty, StaticElemCount.getFixedValue(), false);
  }
  VPI.setVectorLengthParam(MaxEVL);
  return true;
}

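// Rough sketch of foldEVLIntoMask() (names illustrative): the %evl is turned
// into a lane mask via convertEVLToMask() and ANDed into the existing %mask;
// the %evl itself is then discarded (here reset to the static length 4):
//
//   %r = call <4 x i32> @llvm.vp.add.v4i32(<4 x i32> %a, <4 x i32> %b,
//                                          <4 x i1> %m, i32 %evl)
//   -->
//   %evl_mask = ... lane index < %evl ...        ; convertEVLToMask()
//   %new_m    = and <4 x i1> %evl_mask, %m
//   %r = call <4 x i32> @llvm.vp.add.v4i32(<4 x i32> %a, <4 x i32> %b,
//                                          <4 x i1> %new_m, i32 4)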
std::pair<Value *, bool> CachingVPExpander::foldEVLIntoMask(VPIntrinsic &VPI) {
  LLVM_DEBUG(dbgs() << "Folding vlen for " << VPI << '\n');

  IRBuilder<> Builder(&VPI);

  // Ineffective %evl parameter and so nothing to do here.
  if (VPI.canIgnoreVectorLengthParam())
    return {&VPI, false};

  // Only VP intrinsics can have an %evl parameter.
  Value *OldMaskParam = VPI.getMaskParam();
  Value *OldEVLParam = VPI.getVectorLengthParam();
  assert(OldMaskParam && "no mask param to fold the vl param into");
  assert(OldEVLParam && "no EVL param to fold away");

  LLVM_DEBUG(dbgs() << "OLD evl: " << *OldEVLParam << '\n');
  LLVM_DEBUG(dbgs() << "OLD mask: " << *OldMaskParam << '\n');

  // Convert the %evl predication into vector mask predication.
  ElementCount ElemCount = VPI.getStaticVectorLength();
  Value *VLMask = convertEVLToMask(Builder, OldEVLParam, ElemCount);
  Value *NewMaskParam = Builder.CreateAnd(VLMask, OldMaskParam);
  VPI.setMaskParam(NewMaskParam);

  // Drop the %evl parameter.
  discardEVLParameter(VPI);
  assert(VPI.canIgnoreVectorLengthParam() &&
         "transformation did not render the evl param ineffective!");

  // Reassess the modified instruction.
  return {&VPI, true};
}

Value *CachingVPExpander::expandPredication(VPIntrinsic &VPI) {
  LLVM_DEBUG(dbgs() << "Lowering to unpredicated op: " << VPI << '\n');

  IRBuilder<> Builder(&VPI);

  // Try lowering to an LLVM instruction first.
  auto OC = VPI.getFunctionalOpcode();

  if (OC && Instruction::isBinaryOp(*OC))
    return expandPredicationInBinaryOperator(Builder, VPI);

  if (auto *VPRI = dyn_cast<VPReductionIntrinsic>(&VPI))
    return expandPredicationInReduction(Builder, *VPRI);

  if (auto *VPCmp = dyn_cast<VPCmpIntrinsic>(&VPI))
    return expandPredicationInComparison(Builder, *VPCmp);

  if (VPCastIntrinsic::isVPCast(VPI.getIntrinsicID())) {
    return expandPredicationToCastIntrinsic(Builder, VPI);
  }

  switch (VPI.getIntrinsicID()) {
  default:
    break;
  case Intrinsic::vp_fneg: {
    Value *NewNegOp = Builder.CreateFNeg(VPI.getOperand(0), VPI.getName());
    replaceOperation(*NewNegOp, VPI);
    return NewNegOp;
  }
  case Intrinsic::vp_abs:
  case Intrinsic::vp_smax:
  case Intrinsic::vp_smin:
  case Intrinsic::vp_umax:
  case Intrinsic::vp_umin:
  case Intrinsic::vp_bswap:
  case Intrinsic::vp_bitreverse:
  case Intrinsic::vp_ctpop:
  case Intrinsic::vp_ctlz:
  case Intrinsic::vp_cttz:
  case Intrinsic::vp_sadd_sat:
  case Intrinsic::vp_uadd_sat:
  case Intrinsic::vp_ssub_sat:
  case Intrinsic::vp_usub_sat:
  case Intrinsic::vp_fshl:
  case Intrinsic::vp_fshr:
    return expandPredicationToIntCall(Builder, VPI);
  case Intrinsic::vp_fabs:
  case Intrinsic::vp_sqrt:
  case Intrinsic::vp_maxnum:
  case Intrinsic::vp_minnum:
  case Intrinsic::vp_maximum:
  case Intrinsic::vp_minimum:
  case Intrinsic::vp_fma:
  case Intrinsic::vp_fmuladd:
    return expandPredicationToFPCall(Builder, VPI,
                                     VPI.getFunctionalIntrinsicID().value());
  case Intrinsic::vp_load:
  case Intrinsic::vp_store:
  case Intrinsic::vp_gather:
  case Intrinsic::vp_scatter:
    return expandPredicationInMemoryIntrinsic(Builder, VPI);
  }

  if (auto CID = VPI.getConstrainedIntrinsicID())
    if (Value *Call = expandPredicationToFPCall(Builder, VPI, *CID))
      return Call;

  return &VPI;
}

//// } CachingVPExpander

void sanitizeStrategy(VPIntrinsic &VPI, VPLegalization &LegalizeStrat) {
  // Operations with speculatable lanes do not strictly need predication.
  if (maySpeculateLanes(VPI)) {
    // Converting a speculatable VP intrinsic means dropping %mask and %evl.
    // No need to expand %evl into the %mask only to ignore that code.
    if (LegalizeStrat.OpStrategy == VPLegalization::Convert)
      LegalizeStrat.EVLParamStrategy = VPLegalization::Discard;
    return;
  }

  // We have to preserve the predicating effect of %evl for this
  // non-speculatable VP intrinsic.
  // 1) Never discard %evl.
  // 2) If this VP intrinsic will be expanded to non-VP code, make sure that
  //    %evl gets folded into %mask.
  if ((LegalizeStrat.EVLParamStrategy == VPLegalization::Discard) ||
      (LegalizeStrat.OpStrategy == VPLegalization::Convert)) {
    LegalizeStrat.EVLParamStrategy = VPLegalization::Convert;
  }
}

VPLegalization
CachingVPExpander::getVPLegalizationStrategy(const VPIntrinsic &VPI) const {
  auto VPStrat = TTI.getVPLegalizationStrategy(VPI);
  if (LLVM_LIKELY(!UsingTTIOverrides)) {
    // No overrides - we are in production.
    return VPStrat;
  }

  // Overrides set - we are in testing; the following does not need to be
  // efficient.
  VPStrat.EVLParamStrategy = parseOverrideOption(EVLTransformOverride);
  VPStrat.OpStrategy = parseOverrideOption(MaskTransformOverride);
  return VPStrat;
}

VPExpansionDetails
CachingVPExpander::expandVectorPredication(VPIntrinsic &VPI) {
  auto Strategy = getVPLegalizationStrategy(VPI);
  sanitizeStrategy(VPI, Strategy);

  VPExpansionDetails Changed = VPExpansionDetails::IntrinsicUnchanged;

  // Transform the EVL parameter.
  switch (Strategy.EVLParamStrategy) {
  case VPLegalization::Legal:
    break;
  case VPLegalization::Discard:
    if (discardEVLParameter(VPI))
      Changed = VPExpansionDetails::IntrinsicUpdated;
    break;
  case VPLegalization::Convert:
    if (auto [NewVPI, Folded] = foldEVLIntoMask(VPI); Folded) {
      (void)NewVPI;
      Changed = VPExpansionDetails::IntrinsicUpdated;
      ++NumFoldedVL;
    }
    break;
  }

  // Replace with a non-predicated operation.
  switch (Strategy.OpStrategy) {
  case VPLegalization::Legal:
    break;
  case VPLegalization::Discard:
    llvm_unreachable("Invalid strategy for operators.");
  case VPLegalization::Convert:
    if (Value *V = expandPredication(VPI); V != &VPI) {
      ++NumLoweredVPOps;
      Changed = VPExpansionDetails::IntrinsicReplaced;
    }
    break;
  }

  return Changed;
}
} // namespace

VPExpansionDetails
llvm::expandVectorPredicationIntrinsic(VPIntrinsic &VPI,
                                       const TargetTransformInfo &TTI) {
  return CachingVPExpander(TTI).expandVectorPredication(VPI);
}
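
// For reference, a minimal sketch of how a pass could drive the entry point
// above; the pass name and iteration scheme are illustrative only and not part
// of this file:
//
//   PreservedAnalyses ExpandVPExamplePass::run(Function &F,
//                                              FunctionAnalysisManager &AM) {
//     auto &TTI = AM.getResult<TargetIRAnalysis>(F);
//     bool Changed = false;
//     // Early-inc iteration, since a replaced VP intrinsic is erased.
//     for (Instruction &I : llvm::make_early_inc_range(instructions(F)))
//       if (auto *VPI = dyn_cast<VPIntrinsic>(&I))
//         Changed |= expandVectorPredicationIntrinsic(*VPI, TTI) !=
//                    VPExpansionDetails::IntrinsicUnchanged;
//     return Changed ? PreservedAnalyses::none() : PreservedAnalyses::all();
//   }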