//=== lib/CodeGen/GlobalISel/AMDGPUCombinerHelper.cpp ---------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#include "AMDGPUCombinerHelper.h"
#include "GCNSubtarget.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "llvm/CodeGen/GlobalISel/GenericMachineInstrs.h"
#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/Target/TargetMachine.h"

using namespace llvm;
using namespace MIPatternMatch;

AMDGPUCombinerHelper::AMDGPUCombinerHelper(
    GISelChangeObserver &Observer, MachineIRBuilder &B, bool IsPreLegalize,
    GISelValueTracking *VT, MachineDominatorTree *MDT, const LegalizerInfo *LI,
    const GCNSubtarget &STI)
    : CombinerHelper(Observer, B, IsPreLegalize, VT, MDT, LI), STI(STI),
      TII(*STI.getInstrInfo()) {}

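// Return true if an fneg of this instruction's result can be folded into the
// instruction itself, either by negating its operands or by switching to the
// inverse min/max opcode.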
LLVM_READNONE
static bool fnegFoldsIntoMI(const MachineInstr &MI) {
  switch (MI.getOpcode()) {
  case AMDGPU::G_FADD:
  case AMDGPU::G_FSUB:
  case AMDGPU::G_FMUL:
  case AMDGPU::G_FMA:
  case AMDGPU::G_FMAD:
  case AMDGPU::G_FMINNUM:
  case AMDGPU::G_FMAXNUM:
  case AMDGPU::G_FMINNUM_IEEE:
  case AMDGPU::G_FMAXNUM_IEEE:
  case AMDGPU::G_FMINIMUM:
  case AMDGPU::G_FMAXIMUM:
  case AMDGPU::G_FSIN:
  case AMDGPU::G_FPEXT:
  case AMDGPU::G_INTRINSIC_TRUNC:
  case AMDGPU::G_FPTRUNC:
  case AMDGPU::G_FRINT:
  case AMDGPU::G_FNEARBYINT:
  case AMDGPU::G_INTRINSIC_ROUND:
  case AMDGPU::G_INTRINSIC_ROUNDEVEN:
  case AMDGPU::G_FCANONICALIZE:
  case AMDGPU::G_AMDGPU_RCP_IFLAG:
  case AMDGPU::G_AMDGPU_FMIN_LEGACY:
  case AMDGPU::G_AMDGPU_FMAX_LEGACY:
    return true;
  case AMDGPU::G_INTRINSIC: {
    Intrinsic::ID IntrinsicID = cast<GIntrinsic>(MI).getIntrinsicID();
    switch (IntrinsicID) {
    case Intrinsic::amdgcn_rcp:
    case Intrinsic::amdgcn_rcp_legacy:
    case Intrinsic::amdgcn_sin:
    case Intrinsic::amdgcn_fmul_legacy:
    case Intrinsic::amdgcn_fmed3:
    case Intrinsic::amdgcn_fma_legacy:
      return true;
    default:
      return false;
    }
  }
  default:
    return false;
  }
}

/// \returns true if the operation will definitely need to use a 64-bit
/// encoding, and thus will use a VOP3 encoding regardless of the source
/// modifiers.
LLVM_READONLY
static bool opMustUseVOP3Encoding(const MachineInstr &MI,
                                  const MachineRegisterInfo &MRI) {
  return MI.getNumOperands() > (isa<GIntrinsic>(MI) ? 4u : 3u) ||
         MRI.getType(MI.getOperand(0).getReg()).getScalarSizeInBits() == 64;
}

// Most FP instructions support source modifiers.
LLVM_READONLY
static bool hasSourceMods(const MachineInstr &MI) {
  if (!MI.memoperands().empty())
    return false;

  switch (MI.getOpcode()) {
  case AMDGPU::COPY:
  case AMDGPU::G_SELECT:
  case AMDGPU::G_FDIV:
  case AMDGPU::G_FREM:
  case TargetOpcode::INLINEASM:
  case TargetOpcode::INLINEASM_BR:
  case AMDGPU::G_INTRINSIC_W_SIDE_EFFECTS:
  case AMDGPU::G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS:
  case AMDGPU::G_BITCAST:
  case AMDGPU::G_ANYEXT:
  case AMDGPU::G_BUILD_VECTOR:
  case AMDGPU::G_BUILD_VECTOR_TRUNC:
  case AMDGPU::G_PHI:
    return false;
  case AMDGPU::G_INTRINSIC:
  case AMDGPU::G_INTRINSIC_CONVERGENT: {
    Intrinsic::ID IntrinsicID = cast<GIntrinsic>(MI).getIntrinsicID();
    switch (IntrinsicID) {
    case Intrinsic::amdgcn_interp_p1:
    case Intrinsic::amdgcn_interp_p2:
    case Intrinsic::amdgcn_interp_mov:
    case Intrinsic::amdgcn_interp_p1_f16:
    case Intrinsic::amdgcn_interp_p2_f16:
    case Intrinsic::amdgcn_div_scale:
      return false;
    default:
      return true;
    }
  }
  default:
    return true;
  }
}

static bool allUsesHaveSourceMods(MachineInstr &MI, MachineRegisterInfo &MRI,
                                  unsigned CostThreshold = 4) {
  // Some users (such as 3-operand FMA/MAD) must use a VOP3 encoding, and for
  // them a source modifier is truly free. If there are multiple users and each
  // of them would have to switch to VOP3 to absorb the modifier, code size
  // will increase. Try to avoid increasing code size unless we know it will
  // save on the instruction count.
  unsigned NumMayIncreaseSize = 0;
  Register Dst = MI.getOperand(0).getReg();
  for (const MachineInstr &Use : MRI.use_nodbg_instructions(Dst)) {
    if (!hasSourceMods(Use))
      return false;

    if (!opMustUseVOP3Encoding(Use, MRI)) {
      if (++NumMayIncreaseSize > CostThreshold)
        return false;
    }
  }
  return true;
}

static bool mayIgnoreSignedZero(MachineInstr &MI) {
  const TargetOptions &Options = MI.getMF()->getTarget().Options;
  return Options.NoSignedZerosFPMath || MI.getFlag(MachineInstr::MIFlag::FmNsz);
}

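// Return true if \p APF is the bit pattern of 1.0 / (2.0 * pi) in half,
// single, or double precision.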
static bool isInv2Pi(const APFloat &APF) {
  static const APFloat KF16(APFloat::IEEEhalf(), APInt(16, 0x3118));
  static const APFloat KF32(APFloat::IEEEsingle(), APInt(32, 0x3e22f983));
  static const APFloat KF64(APFloat::IEEEdouble(),
                            APInt(64, 0x3fc45f306dc9c882));

  return APF.bitwiseIsEqual(KF16) || APF.bitwiseIsEqual(KF32) ||
         APF.bitwiseIsEqual(KF64);
}

// Negating +0 or 1.0 / (2.0 * pi) produces a value that is no longer an inline
// immediate, so there is an additional cost to negate these constants.
static bool isConstantCostlierToNegate(MachineInstr &MI, Register Reg,
                                       MachineRegisterInfo &MRI) {
  std::optional<FPValueAndVReg> FPValReg;
  if (mi_match(Reg, MRI, m_GFCstOrSplat(FPValReg))) {
    if (FPValReg->Value.isZero() && !FPValReg->Value.isNegative())
      return true;

    const GCNSubtarget &ST = MI.getMF()->getSubtarget<GCNSubtarget>();
    if (ST.hasInv2PiInlineImm() && isInv2Pi(FPValReg->Value))
      return true;
  }
  return false;
}

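// Return the opposite min/max opcode; used when negating a min/max, since
// -min(a, b) == max(-a, -b) and vice versa.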
static unsigned inverseMinMax(unsigned Opc) {
  switch (Opc) {
  case AMDGPU::G_FMAXNUM:
    return AMDGPU::G_FMINNUM;
  case AMDGPU::G_FMINNUM:
    return AMDGPU::G_FMAXNUM;
  case AMDGPU::G_FMAXNUM_IEEE:
    return AMDGPU::G_FMINNUM_IEEE;
  case AMDGPU::G_FMINNUM_IEEE:
    return AMDGPU::G_FMAXNUM_IEEE;
  case AMDGPU::G_FMAXIMUM:
    return AMDGPU::G_FMINIMUM;
  case AMDGPU::G_FMINIMUM:
    return AMDGPU::G_FMAXIMUM;
  case AMDGPU::G_AMDGPU_FMAX_LEGACY:
    return AMDGPU::G_AMDGPU_FMIN_LEGACY;
  case AMDGPU::G_AMDGPU_FMIN_LEGACY:
    return AMDGPU::G_AMDGPU_FMAX_LEGACY;
  default:
    llvm_unreachable("invalid min/max opcode");
  }
}

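// Match a G_FNEG whose source instruction can absorb the negation, either by
// negating its own operands or by switching to the inverse min/max opcode.
// \p MatchInfo is set to the instruction defining the fneg's source operand.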
bool AMDGPUCombinerHelper::matchFoldableFneg(MachineInstr &MI,
                                             MachineInstr *&MatchInfo) const {
  Register Src = MI.getOperand(1).getReg();
  MatchInfo = MRI.getVRegDef(Src);

  // If the input has multiple uses and we can either fold the negate down, or
  // the other uses cannot, give up. This both prevents unprofitable
  // transformations and infinite loops: we won't repeatedly try to fold around
  // a negate that has no 'good' form.
  if (MRI.hasOneNonDBGUse(Src)) {
    if (allUsesHaveSourceMods(MI, MRI, 0))
      return false;
  } else {
    if (fnegFoldsIntoMI(*MatchInfo) &&
        (allUsesHaveSourceMods(MI, MRI) ||
         !allUsesHaveSourceMods(*MatchInfo, MRI)))
      return false;
  }

  switch (MatchInfo->getOpcode()) {
  case AMDGPU::G_FMINNUM:
  case AMDGPU::G_FMAXNUM:
  case AMDGPU::G_FMINNUM_IEEE:
  case AMDGPU::G_FMAXNUM_IEEE:
  case AMDGPU::G_FMINIMUM:
  case AMDGPU::G_FMAXIMUM:
  case AMDGPU::G_AMDGPU_FMIN_LEGACY:
  case AMDGPU::G_AMDGPU_FMAX_LEGACY:
    // 0 doesn't have a negated inline immediate.
    return !isConstantCostlierToNegate(*MatchInfo,
                                       MatchInfo->getOperand(2).getReg(), MRI);
  case AMDGPU::G_FADD:
  case AMDGPU::G_FSUB:
  case AMDGPU::G_FMA:
  case AMDGPU::G_FMAD:
    return mayIgnoreSignedZero(*MatchInfo);
  case AMDGPU::G_FMUL:
  case AMDGPU::G_FPEXT:
  case AMDGPU::G_INTRINSIC_TRUNC:
  case AMDGPU::G_FPTRUNC:
  case AMDGPU::G_FRINT:
  case AMDGPU::G_FNEARBYINT:
  case AMDGPU::G_INTRINSIC_ROUND:
  case AMDGPU::G_INTRINSIC_ROUNDEVEN:
  case AMDGPU::G_FSIN:
  case AMDGPU::G_FCANONICALIZE:
  case AMDGPU::G_AMDGPU_RCP_IFLAG:
    return true;
  case AMDGPU::G_INTRINSIC:
  case AMDGPU::G_INTRINSIC_CONVERGENT: {
    Intrinsic::ID IntrinsicID = cast<GIntrinsic>(MatchInfo)->getIntrinsicID();
    switch (IntrinsicID) {
    case Intrinsic::amdgcn_rcp:
    case Intrinsic::amdgcn_rcp_legacy:
    case Intrinsic::amdgcn_sin:
    case Intrinsic::amdgcn_fmul_legacy:
    case Intrinsic::amdgcn_fmed3:
      return true;
    case Intrinsic::amdgcn_fma_legacy:
      return mayIgnoreSignedZero(*MatchInfo);
    default:
      return false;
    }
  }
  default:
    return false;
  }
}

void AMDGPUCombinerHelper::applyFoldableFneg(MachineInstr &MI,
                                             MachineInstr *&MatchInfo) const {
  // Transform:
  // %A = inst %Op1, ...
  // %B = fneg %A
  //
  // into:
  //
  // (if %A has one use, specifically fneg above)
  // %B = inst (maybe fneg %Op1), ...
  //
  // (if %A has multiple uses)
  // %B = inst (maybe fneg %Op1), ...
  // %A = fneg %B

  // Replace register in operand with a register holding negated value.
  auto NegateOperand = [&](MachineOperand &Op) {
    Register Reg = Op.getReg();
    if (!mi_match(Reg, MRI, m_GFNeg(m_Reg(Reg))))
      Reg = Builder.buildFNeg(MRI.getType(Reg), Reg).getReg(0);
    replaceRegOpWith(MRI, Op, Reg);
  };

  // Replace either register in operands with a register holding negated value.
  auto NegateEitherOperand = [&](MachineOperand &X, MachineOperand &Y) {
    Register XReg = X.getReg();
    Register YReg = Y.getReg();
    if (mi_match(XReg, MRI, m_GFNeg(m_Reg(XReg))))
      replaceRegOpWith(MRI, X, XReg);
    else if (mi_match(YReg, MRI, m_GFNeg(m_Reg(YReg))))
      replaceRegOpWith(MRI, Y, YReg);
    else {
      YReg = Builder.buildFNeg(MRI.getType(YReg), YReg).getReg(0);
      replaceRegOpWith(MRI, Y, YReg);
    }
  };

  Builder.setInstrAndDebugLoc(*MatchInfo);

  // Negate appropriate operands so that resulting value of MatchInfo is
  // negated.
  switch (MatchInfo->getOpcode()) {
  case AMDGPU::G_FADD:
  case AMDGPU::G_FSUB:
    NegateOperand(MatchInfo->getOperand(1));
    NegateOperand(MatchInfo->getOperand(2));
    break;
  case AMDGPU::G_FMUL:
    NegateEitherOperand(MatchInfo->getOperand(1), MatchInfo->getOperand(2));
    break;
  case AMDGPU::G_FMINNUM:
  case AMDGPU::G_FMAXNUM:
  case AMDGPU::G_FMINNUM_IEEE:
  case AMDGPU::G_FMAXNUM_IEEE:
  case AMDGPU::G_FMINIMUM:
  case AMDGPU::G_FMAXIMUM:
  case AMDGPU::G_AMDGPU_FMIN_LEGACY:
  case AMDGPU::G_AMDGPU_FMAX_LEGACY: {
    NegateOperand(MatchInfo->getOperand(1));
    NegateOperand(MatchInfo->getOperand(2));
    unsigned Opposite = inverseMinMax(MatchInfo->getOpcode());
    replaceOpcodeWith(*MatchInfo, Opposite);
    break;
  }
  case AMDGPU::G_FMA:
  case AMDGPU::G_FMAD:
    NegateEitherOperand(MatchInfo->getOperand(1), MatchInfo->getOperand(2));
    NegateOperand(MatchInfo->getOperand(3));
    break;
  case AMDGPU::G_FPEXT:
  case AMDGPU::G_INTRINSIC_TRUNC:
  case AMDGPU::G_FRINT:
  case AMDGPU::G_FNEARBYINT:
  case AMDGPU::G_INTRINSIC_ROUND:
  case AMDGPU::G_INTRINSIC_ROUNDEVEN:
  case AMDGPU::G_FSIN:
  case AMDGPU::G_FCANONICALIZE:
  case AMDGPU::G_AMDGPU_RCP_IFLAG:
  case AMDGPU::G_FPTRUNC:
    NegateOperand(MatchInfo->getOperand(1));
    break;
  case AMDGPU::G_INTRINSIC:
  case AMDGPU::G_INTRINSIC_CONVERGENT: {
    Intrinsic::ID IntrinsicID = cast<GIntrinsic>(MatchInfo)->getIntrinsicID();
    switch (IntrinsicID) {
    case Intrinsic::amdgcn_rcp:
    case Intrinsic::amdgcn_rcp_legacy:
    case Intrinsic::amdgcn_sin:
      NegateOperand(MatchInfo->getOperand(2));
      break;
    case Intrinsic::amdgcn_fmul_legacy:
      NegateEitherOperand(MatchInfo->getOperand(2), MatchInfo->getOperand(3));
      break;
    case Intrinsic::amdgcn_fmed3:
      NegateOperand(MatchInfo->getOperand(2));
      NegateOperand(MatchInfo->getOperand(3));
      NegateOperand(MatchInfo->getOperand(4));
      break;
    case Intrinsic::amdgcn_fma_legacy:
      NegateEitherOperand(MatchInfo->getOperand(2), MatchInfo->getOperand(3));
      NegateOperand(MatchInfo->getOperand(4));
      break;
    default:
      llvm_unreachable("folding fneg not supported for this intrinsic");
    }
    break;
  }
  default:
    llvm_unreachable("folding fneg not supported for this instruction");
  }

  Register Dst = MI.getOperand(0).getReg();
  Register MatchInfoDst = MatchInfo->getOperand(0).getReg();

  if (MRI.hasOneNonDBGUse(MatchInfoDst)) {
    // MatchInfo now produces the negated value, so use it instead of old Dst.
    replaceRegWith(MRI, Dst, MatchInfoDst);
  } else {
    // We want to swap all uses of Dst with uses of MatchInfoDst and vice
    // versa, but replaceRegWith would replace defs as well. It is easier to
    // replace one def with a new register.
    LLT Type = MRI.getType(Dst);
    Register NegatedMatchInfo = MRI.createGenericVirtualRegister(Type);
    replaceRegOpWith(MRI, MatchInfo->getOperand(0), NegatedMatchInfo);

    // MatchInfo now produces the negated value, so use it instead of old Dst.
    replaceRegWith(MRI, Dst, NegatedMatchInfo);

    // Recreate the non-negated value for the other uses of the old
    // MatchInfoDst.
    auto NextInst = ++MatchInfo->getIterator();
    Builder.setInstrAndDebugLoc(*NextInst);
    Builder.buildFNeg(MatchInfoDst, NegatedMatchInfo, MI.getFlags());
  }

  MI.eraseFromParent();
}

// TODO: Should return converted value / extension source and avoid introducing
// intermediate fptruncs in the apply function.
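// Return true if \p Reg is either a G_FPEXT from an f16 value or a constant
// that can be converted to f16 without losing information.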
static bool isFPExtFromF16OrConst(const MachineRegisterInfo &MRI,
                                  Register Reg) {
  const MachineInstr *Def = MRI.getVRegDef(Reg);
  if (Def->getOpcode() == TargetOpcode::G_FPEXT) {
    Register SrcReg = Def->getOperand(1).getReg();
    return MRI.getType(SrcReg) == LLT::scalar(16);
  }

  if (Def->getOpcode() == TargetOpcode::G_FCONSTANT) {
    APFloat Val = Def->getOperand(1).getFPImm()->getValueAPF();
    bool LosesInfo = true;
    Val.convert(APFloat::IEEEhalf(), APFloat::rmNearestTiesToEven, &LosesInfo);
    return !LosesInfo;
  }

  return false;
}

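// Match a G_FPTRUNC to f16 of an f32 med3 whose inputs were all promoted from
// f16 (or are constants exactly representable in f16), so the min/max sequence
// can be rebuilt directly in f16.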
bool AMDGPUCombinerHelper::matchExpandPromotedF16FMed3(MachineInstr &MI,
                                                       Register Src0,
                                                       Register Src1,
                                                       Register Src2) const {
  assert(MI.getOpcode() == TargetOpcode::G_FPTRUNC);
  Register SrcReg = MI.getOperand(1).getReg();
  if (!MRI.hasOneNonDBGUse(SrcReg) || MRI.getType(SrcReg) != LLT::scalar(32))
    return false;

  return isFPExtFromF16OrConst(MRI, Src0) && isFPExtFromF16OrConst(MRI, Src1) &&
         isFPExtFromF16OrConst(MRI, Src2);
}

void AMDGPUCombinerHelper::applyExpandPromotedF16FMed3(MachineInstr &MI,
                                                       Register Src0,
                                                       Register Src1,
                                                       Register Src2) const {
  // We expect the fptrunc (fpext x) pairs to fold out and the constant sources
  // to be constant folded.
  Src0 = Builder.buildFPTrunc(LLT::scalar(16), Src0).getReg(0);
  Src1 = Builder.buildFPTrunc(LLT::scalar(16), Src1).getReg(0);
  Src2 = Builder.buildFPTrunc(LLT::scalar(16), Src2).getReg(0);

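  // Median of three via min/max:
  //   med3(x, y, z) = min(max(x, y), max(min(x, y), z))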
  LLT Ty = MRI.getType(Src0);
  auto A1 = Builder.buildFMinNumIEEE(Ty, Src0, Src1);
  auto B1 = Builder.buildFMaxNumIEEE(Ty, Src0, Src1);
  auto C1 = Builder.buildFMaxNumIEEE(Ty, A1, Src2);
  Builder.buildFMinNumIEEE(MI.getOperand(0), B1, C1);
  MI.eraseFromParent();
}

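// Match (fmul x, (select cond, C0, C1)) where C0 and C1 are powers of two of
// the same sign, so the multiply can be rewritten as
// (fldexp x, (select cond, log2(|C0|), log2(|C1|))), negating x if the
// constants are negative.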
bool AMDGPUCombinerHelper::matchCombineFmulWithSelectToFldexp(
    MachineInstr &MI, MachineInstr &Sel,
    std::function<void(MachineIRBuilder &)> &MatchInfo) const {
  assert(MI.getOpcode() == TargetOpcode::G_FMUL);
  assert(Sel.getOpcode() == TargetOpcode::G_SELECT);
  assert(MI.getOperand(2).getReg() == Sel.getOperand(0).getReg());

  Register Dst = MI.getOperand(0).getReg();
  LLT DestTy = MRI.getType(Dst);
  LLT ScalarDestTy = DestTy.getScalarType();

  if ((ScalarDestTy != LLT::float64() && ScalarDestTy != LLT::float32() &&
       ScalarDestTy != LLT::float16()) ||
      !MRI.hasOneNonDBGUse(Sel.getOperand(0).getReg()))
    return false;

  Register SelectCondReg = Sel.getOperand(1).getReg();
  MachineInstr *SelectTrue = MRI.getVRegDef(Sel.getOperand(2).getReg());
  MachineInstr *SelectFalse = MRI.getVRegDef(Sel.getOperand(3).getReg());

  const auto SelectTrueVal =
      isConstantOrConstantSplatVectorFP(*SelectTrue, MRI);
  if (!SelectTrueVal)
    return false;
  const auto SelectFalseVal =
      isConstantOrConstantSplatVectorFP(*SelectFalse, MRI);
  if (!SelectFalseVal)
    return false;

  if (SelectTrueVal->isNegative() != SelectFalseVal->isNegative())
    return false;

  // For f32, only non-inline constants should be transformed.
  if (ScalarDestTy == LLT::float32() && TII.isInlineConstant(*SelectTrueVal) &&
      TII.isInlineConstant(*SelectFalseVal))
    return false;

  int SelectTrueLog2Val = SelectTrueVal->getExactLog2Abs();
  if (SelectTrueLog2Val == INT_MIN)
    return false;
  int SelectFalseLog2Val = SelectFalseVal->getExactLog2Abs();
  if (SelectFalseLog2Val == INT_MIN)
    return false;

  MatchInfo = [=, &MI](MachineIRBuilder &Builder) {
    LLT IntDestTy = DestTy.changeElementType(LLT::scalar(32));
    auto NewSel = Builder.buildSelect(
        IntDestTy, SelectCondReg,
        Builder.buildConstant(IntDestTy, SelectTrueLog2Val),
        Builder.buildConstant(IntDestTy, SelectFalseLog2Val));

    Register XReg = MI.getOperand(1).getReg();
    if (SelectTrueVal->isNegative()) {
      auto NegX =
          Builder.buildFNeg(DestTy, XReg, MRI.getVRegDef(XReg)->getFlags());
      Builder.buildFLdexp(Dst, NegX, NewSel, MI.getFlags());
    } else {
      Builder.buildFLdexp(Dst, XReg, NewSel, MI.getFlags());
    }
  };

  return true;
}