//=== lib/CodeGen/GlobalISel/AMDGPUCombinerHelper.cpp ---------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#include "AMDGPUCombinerHelper.h"
#include "GCNSubtarget.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "llvm/CodeGen/GlobalISel/GenericMachineInstrs.h"
#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/Target/TargetMachine.h"

using namespace llvm;
using namespace MIPatternMatch;

AMDGPUCombinerHelper::AMDGPUCombinerHelper(
    GISelChangeObserver &Observer, MachineIRBuilder &B, bool IsPreLegalize,
    GISelValueTracking *VT, MachineDominatorTree *MDT, const LegalizerInfo *LI,
    const GCNSubtarget &STI)
    : CombinerHelper(Observer, B, IsPreLegalize, VT, MDT, LI), STI(STI),
      TII(*STI.getInstrInfo()) {}

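/// \returns true if the negation of \p MI's result can be folded into the
/// instruction itself, by negating its sources or inverting a min/max.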
LLVM_READNONE
static bool fnegFoldsIntoMI(const MachineInstr &MI) {
  switch (MI.getOpcode()) {
  case AMDGPU::G_FADD:
  case AMDGPU::G_FSUB:
  case AMDGPU::G_FMUL:
  case AMDGPU::G_FMA:
  case AMDGPU::G_FMAD:
  case AMDGPU::G_FMINNUM:
  case AMDGPU::G_FMAXNUM:
  case AMDGPU::G_FMINNUM_IEEE:
  case AMDGPU::G_FMAXNUM_IEEE:
  case AMDGPU::G_FMINIMUM:
  case AMDGPU::G_FMAXIMUM:
  case AMDGPU::G_FSIN:
  case AMDGPU::G_FPEXT:
  case AMDGPU::G_INTRINSIC_TRUNC:
  case AMDGPU::G_FPTRUNC:
  case AMDGPU::G_FRINT:
  case AMDGPU::G_FNEARBYINT:
  case AMDGPU::G_INTRINSIC_ROUND:
  case AMDGPU::G_INTRINSIC_ROUNDEVEN:
  case AMDGPU::G_FCANONICALIZE:
  case AMDGPU::G_AMDGPU_RCP_IFLAG:
  case AMDGPU::G_AMDGPU_FMIN_LEGACY:
  case AMDGPU::G_AMDGPU_FMAX_LEGACY:
    return true;
  case AMDGPU::G_INTRINSIC: {
    Intrinsic::ID IntrinsicID = cast<GIntrinsic>(MI).getIntrinsicID();
    switch (IntrinsicID) {
    case Intrinsic::amdgcn_rcp:
    case Intrinsic::amdgcn_rcp_legacy:
    case Intrinsic::amdgcn_sin:
    case Intrinsic::amdgcn_fmul_legacy:
    case Intrinsic::amdgcn_fmed3:
    case Intrinsic::amdgcn_fma_legacy:
      return true;
    default:
      return false;
    }
  }
  default:
    return false;
  }
}

/// \returns true if the operation will definitely need to use a 64-bit
/// encoding, and thus will use a VOP3 encoding regardless of the source
/// modifiers.
LLVM_READONLY
static bool opMustUseVOP3Encoding(const MachineInstr &MI,
                                  const MachineRegisterInfo &MRI) {
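  // Intrinsics carry an extra intrinsic-ID operand, so the baseline operand
  // count (dst plus two sources) is 4 rather than 3.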
  return MI.getNumOperands() > (isa<GIntrinsic>(MI) ? 4u : 3u) ||
         MRI.getType(MI.getOperand(0).getReg()).getScalarSizeInBits() == 64;
}

// Most FP instructions support source modifiers.
LLVM_READONLY
static bool hasSourceMods(const MachineInstr &MI) {
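  // Instructions that touch memory do not accept source modifiers.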
  if (!MI.memoperands().empty())
    return false;

  switch (MI.getOpcode()) {
  case AMDGPU::COPY:
  case AMDGPU::G_SELECT:
  case AMDGPU::G_FDIV:
  case AMDGPU::G_FREM:
  case TargetOpcode::INLINEASM:
  case TargetOpcode::INLINEASM_BR:
  case AMDGPU::G_INTRINSIC_W_SIDE_EFFECTS:
  case AMDGPU::G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS:
  case AMDGPU::G_BITCAST:
  case AMDGPU::G_ANYEXT:
  case AMDGPU::G_BUILD_VECTOR:
  case AMDGPU::G_BUILD_VECTOR_TRUNC:
  case AMDGPU::G_PHI:
    return false;
  case AMDGPU::G_INTRINSIC:
  case AMDGPU::G_INTRINSIC_CONVERGENT: {
    Intrinsic::ID IntrinsicID = cast<GIntrinsic>(MI).getIntrinsicID();
    switch (IntrinsicID) {
    case Intrinsic::amdgcn_interp_p1:
    case Intrinsic::amdgcn_interp_p2:
    case Intrinsic::amdgcn_interp_mov:
    case Intrinsic::amdgcn_interp_p1_f16:
    case Intrinsic::amdgcn_interp_p2_f16:
    case Intrinsic::amdgcn_div_scale:
      return false;
    default:
      return true;
    }
  }
  default:
    return true;
  }
}

static bool allUsesHaveSourceMods(MachineInstr &MI, MachineRegisterInfo &MRI,
                                  unsigned CostThreshold = 4) {
  // Some users (such as 3-operand FMA/MAD) must use a VOP3 encoding, and thus
  // it is truly free to use a source modifier in all cases. If there are
  // multiple users, but each one would require promotion to VOP3, there will
  // be a code size increase. Try to avoid increasing code size unless we know
  // it will save on the instruction count.
  unsigned NumMayIncreaseSize = 0;
  Register Dst = MI.getOperand(0).getReg();
  for (const MachineInstr &Use : MRI.use_nodbg_instructions(Dst)) {
    if (!hasSourceMods(Use))
      return false;

    if (!opMustUseVOP3Encoding(Use, MRI)) {
      if (++NumMayIncreaseSize > CostThreshold)
        return false;
    }
  }
  return true;
}

static bool mayIgnoreSignedZero(MachineInstr &MI) {
  const TargetOptions &Options = MI.getMF()->getTarget().Options;
  return Options.NoSignedZerosFPMath || MI.getFlag(MachineInstr::MIFlag::FmNsz);
}

static bool isInv2Pi(const APFloat &APF) {
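  // Bit patterns of 1/(2*pi) in half, single, and double precision.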
  static const APFloat KF16(APFloat::IEEEhalf(), APInt(16, 0x3118));
  static const APFloat KF32(APFloat::IEEEsingle(), APInt(32, 0x3e22f983));
  static const APFloat KF64(APFloat::IEEEdouble(),
                            APInt(64, 0x3fc45f306dc9c882));

  return APF.bitwiseIsEqual(KF16) || APF.bitwiseIsEqual(KF32) ||
         APF.bitwiseIsEqual(KF64);
}

// The negations of +0 and 1/(2*pi) do not have inline immediates, so there is
// an additional cost to negate those constants.
static bool isConstantCostlierToNegate(MachineInstr &MI, Register Reg,
                                       MachineRegisterInfo &MRI) {
  std::optional<FPValueAndVReg> FPValReg;
  if (mi_match(Reg, MRI, m_GFCstOrSplat(FPValReg))) {
    if (FPValReg->Value.isZero() && !FPValReg->Value.isNegative())
      return true;

    const GCNSubtarget &ST = MI.getMF()->getSubtarget<GCNSubtarget>();
    if (ST.hasInv2PiInlineImm() && isInv2Pi(FPValReg->Value))
      return true;
  }
  return false;
}

static unsigned inverseMinMax(unsigned Opc) {
  switch (Opc) {
  case AMDGPU::G_FMAXNUM:
    return AMDGPU::G_FMINNUM;
  case AMDGPU::G_FMINNUM:
    return AMDGPU::G_FMAXNUM;
  case AMDGPU::G_FMAXNUM_IEEE:
    return AMDGPU::G_FMINNUM_IEEE;
  case AMDGPU::G_FMINNUM_IEEE:
    return AMDGPU::G_FMAXNUM_IEEE;
  case AMDGPU::G_FMAXIMUM:
    return AMDGPU::G_FMINIMUM;
  case AMDGPU::G_FMINIMUM:
    return AMDGPU::G_FMAXIMUM;
  case AMDGPU::G_AMDGPU_FMAX_LEGACY:
    return AMDGPU::G_AMDGPU_FMIN_LEGACY;
  case AMDGPU::G_AMDGPU_FMIN_LEGACY:
    return AMDGPU::G_AMDGPU_FMAX_LEGACY;
  default:
    llvm_unreachable("invalid min/max opcode");
  }
}

bool AMDGPUCombinerHelper::matchFoldableFneg(MachineInstr &MI,
                                             MachineInstr *&MatchInfo) const {
  Register Src = MI.getOperand(1).getReg();
  MatchInfo = MRI.getVRegDef(Src);

  // If the input has multiple uses and we can either fold the negate down, or
  // the other uses cannot, give up. This both prevents unprofitable
  // transformations and infinite loops: we won't repeatedly try to fold around
  // a negate that has no 'good' form.
  if (MRI.hasOneNonDBGUse(Src)) {
    if (allUsesHaveSourceMods(MI, MRI, 0))
      return false;
  } else {
    if (fnegFoldsIntoMI(*MatchInfo) &&
        (allUsesHaveSourceMods(MI, MRI) ||
         !allUsesHaveSourceMods(*MatchInfo, MRI)))
      return false;
  }

  switch (MatchInfo->getOpcode()) {
  case AMDGPU::G_FMINNUM:
  case AMDGPU::G_FMAXNUM:
  case AMDGPU::G_FMINNUM_IEEE:
  case AMDGPU::G_FMAXNUM_IEEE:
  case AMDGPU::G_FMINIMUM:
  case AMDGPU::G_FMAXIMUM:
  case AMDGPU::G_AMDGPU_FMIN_LEGACY:
  case AMDGPU::G_AMDGPU_FMAX_LEGACY:
    // 0 doesn't have a negated inline immediate.
    return !isConstantCostlierToNegate(*MatchInfo,
                                       MatchInfo->getOperand(2).getReg(), MRI);
  case AMDGPU::G_FADD:
  case AMDGPU::G_FSUB:
  case AMDGPU::G_FMA:
  case AMDGPU::G_FMAD:
    return mayIgnoreSignedZero(*MatchInfo);
  case AMDGPU::G_FMUL:
  case AMDGPU::G_FPEXT:
  case AMDGPU::G_INTRINSIC_TRUNC:
  case AMDGPU::G_FPTRUNC:
  case AMDGPU::G_FRINT:
  case AMDGPU::G_FNEARBYINT:
  case AMDGPU::G_INTRINSIC_ROUND:
  case AMDGPU::G_INTRINSIC_ROUNDEVEN:
  case AMDGPU::G_FSIN:
  case AMDGPU::G_FCANONICALIZE:
  case AMDGPU::G_AMDGPU_RCP_IFLAG:
    return true;
  case AMDGPU::G_INTRINSIC:
  case AMDGPU::G_INTRINSIC_CONVERGENT: {
    Intrinsic::ID IntrinsicID = cast<GIntrinsic>(MatchInfo)->getIntrinsicID();
    switch (IntrinsicID) {
    case Intrinsic::amdgcn_rcp:
    case Intrinsic::amdgcn_rcp_legacy:
    case Intrinsic::amdgcn_sin:
    case Intrinsic::amdgcn_fmul_legacy:
    case Intrinsic::amdgcn_fmed3:
      return true;
    case Intrinsic::amdgcn_fma_legacy:
      return mayIgnoreSignedZero(*MatchInfo);
    default:
      return false;
    }
  }
  default:
    return false;
  }
}

void AMDGPUCombinerHelper::applyFoldableFneg(MachineInstr &MI,
                                             MachineInstr *&MatchInfo) const {
  // Transform:
  // %A = inst %Op1, ...
  // %B = fneg %A
  //
  // into:
  //
  // (if %A has one use, specifically fneg above)
  // %B = inst (maybe fneg %Op1), ...
  //
  // (if %A has multiple uses)
  // %B = inst (maybe fneg %Op1), ...
  // %A = fneg %B

  // Replace the register in an operand with a register holding the negated
  // value.
  auto NegateOperand = [&](MachineOperand &Op) {
    Register Reg = Op.getReg();
    if (!mi_match(Reg, MRI, m_GFNeg(m_Reg(Reg))))
      Reg = Builder.buildFNeg(MRI.getType(Reg), Reg).getReg(0);
    replaceRegOpWith(MRI, Op, Reg);
  };

  // Replace one of the two operand registers with a register holding the
  // negated value, preferring an operand that is already defined by an fneg
  // (which is then simply stripped).
  auto NegateEitherOperand = [&](MachineOperand &X, MachineOperand &Y) {
    Register XReg = X.getReg();
    Register YReg = Y.getReg();
    if (mi_match(XReg, MRI, m_GFNeg(m_Reg(XReg))))
      replaceRegOpWith(MRI, X, XReg);
    else if (mi_match(YReg, MRI, m_GFNeg(m_Reg(YReg))))
      replaceRegOpWith(MRI, Y, YReg);
    else {
      YReg = Builder.buildFNeg(MRI.getType(YReg), YReg).getReg(0);
      replaceRegOpWith(MRI, Y, YReg);
    }
  };

  Builder.setInstrAndDebugLoc(*MatchInfo);

  // Negate the appropriate operands so that the resulting value of MatchInfo
  // is negated.
  switch (MatchInfo->getOpcode()) {
  case AMDGPU::G_FADD:
  case AMDGPU::G_FSUB:
    NegateOperand(MatchInfo->getOperand(1));
    NegateOperand(MatchInfo->getOperand(2));
    break;
  case AMDGPU::G_FMUL:
    NegateEitherOperand(MatchInfo->getOperand(1), MatchInfo->getOperand(2));
    break;
  case AMDGPU::G_FMINNUM:
  case AMDGPU::G_FMAXNUM:
  case AMDGPU::G_FMINNUM_IEEE:
  case AMDGPU::G_FMAXNUM_IEEE:
  case AMDGPU::G_FMINIMUM:
  case AMDGPU::G_FMAXIMUM:
  case AMDGPU::G_AMDGPU_FMIN_LEGACY:
  case AMDGPU::G_AMDGPU_FMAX_LEGACY: {
    NegateOperand(MatchInfo->getOperand(1));
    NegateOperand(MatchInfo->getOperand(2));
    unsigned Opposite = inverseMinMax(MatchInfo->getOpcode());
    replaceOpcodeWith(*MatchInfo, Opposite);
    break;
  }
  case AMDGPU::G_FMA:
  case AMDGPU::G_FMAD:
    NegateEitherOperand(MatchInfo->getOperand(1), MatchInfo->getOperand(2));
    NegateOperand(MatchInfo->getOperand(3));
    break;
  case AMDGPU::G_FPEXT:
  case AMDGPU::G_INTRINSIC_TRUNC:
  case AMDGPU::G_FRINT:
  case AMDGPU::G_FNEARBYINT:
  case AMDGPU::G_INTRINSIC_ROUND:
  case AMDGPU::G_INTRINSIC_ROUNDEVEN:
  case AMDGPU::G_FSIN:
  case AMDGPU::G_FCANONICALIZE:
  case AMDGPU::G_AMDGPU_RCP_IFLAG:
  case AMDGPU::G_FPTRUNC:
    NegateOperand(MatchInfo->getOperand(1));
    break;
  case AMDGPU::G_INTRINSIC:
  case AMDGPU::G_INTRINSIC_CONVERGENT: {
    Intrinsic::ID IntrinsicID = cast<GIntrinsic>(MatchInfo)->getIntrinsicID();
    switch (IntrinsicID) {
    case Intrinsic::amdgcn_rcp:
    case Intrinsic::amdgcn_rcp_legacy:
    case Intrinsic::amdgcn_sin:
      NegateOperand(MatchInfo->getOperand(2));
      break;
    case Intrinsic::amdgcn_fmul_legacy:
      NegateEitherOperand(MatchInfo->getOperand(2), MatchInfo->getOperand(3));
      break;
    case Intrinsic::amdgcn_fmed3:
      NegateOperand(MatchInfo->getOperand(2));
      NegateOperand(MatchInfo->getOperand(3));
      NegateOperand(MatchInfo->getOperand(4));
      break;
    case Intrinsic::amdgcn_fma_legacy:
      NegateEitherOperand(MatchInfo->getOperand(2), MatchInfo->getOperand(3));
      NegateOperand(MatchInfo->getOperand(4));
      break;
    default:
      llvm_unreachable("folding fneg not supported for this intrinsic");
    }
    break;
  }
  default:
    llvm_unreachable("folding fneg not supported for this instruction");
  }

  Register Dst = MI.getOperand(0).getReg();
  Register MatchInfoDst = MatchInfo->getOperand(0).getReg();

  if (MRI.hasOneNonDBGUse(MatchInfoDst)) {
    // MatchInfo now has the negated value, so use that instead of the old Dst.
    replaceRegWith(MRI, Dst, MatchInfoDst);
  } else {
    // We want to swap all uses of Dst with uses of MatchInfoDst and vice versa
    // but replaceRegWith will replace defs as well. It is easier to replace
    // one def with a new register.
    LLT Type = MRI.getType(Dst);
    Register NegatedMatchInfo = MRI.createGenericVirtualRegister(Type);
    replaceRegOpWith(MRI, MatchInfo->getOperand(0), NegatedMatchInfo);

    // MatchInfo now has the negated value, so use that instead of the old Dst.
    replaceRegWith(MRI, Dst, NegatedMatchInfo);

    // Recreate the non-negated value for the other uses of the old
    // MatchInfoDst.
    auto NextInst = ++MatchInfo->getIterator();
    Builder.setInstrAndDebugLoc(*NextInst);
    Builder.buildFNeg(MatchInfoDst, NegatedMatchInfo, MI.getFlags());
  }

  MI.eraseFromParent();
}

// TODO: Should return converted value / extension source and avoid introducing
// intermediate fptruncs in the apply function.
static bool isFPExtFromF16OrConst(const MachineRegisterInfo &MRI,
                                  Register Reg) {
  const MachineInstr *Def = MRI.getVRegDef(Reg);
  if (Def->getOpcode() == TargetOpcode::G_FPEXT) {
    Register SrcReg = Def->getOperand(1).getReg();
    return MRI.getType(SrcReg) == LLT::scalar(16);
  }

  if (Def->getOpcode() == TargetOpcode::G_FCONSTANT) {
    APFloat Val = Def->getOperand(1).getFPImm()->getValueAPF();
    bool LosesInfo = true;
    Val.convert(APFloat::IEEEhalf(), APFloat::rmNearestTiesToEven, &LosesInfo);
    return !LosesInfo;
  }

  return false;
}

bool AMDGPUCombinerHelper::matchExpandPromotedF16FMed3(MachineInstr &MI,
                                                       Register Src0,
                                                       Register Src1,
                                                       Register Src2) const {
  assert(MI.getOpcode() == TargetOpcode::G_FPTRUNC);
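  // The fmed3 was promoted to f32; it can be narrowed back to f16 only if
  // every source is either an fpext from f16 or a constant exactly
  // representable in f16.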
  Register SrcReg = MI.getOperand(1).getReg();
  if (!MRI.hasOneNonDBGUse(SrcReg) || MRI.getType(SrcReg) != LLT::scalar(32))
    return false;

  return isFPExtFromF16OrConst(MRI, Src0) && isFPExtFromF16OrConst(MRI, Src1) &&
         isFPExtFromF16OrConst(MRI, Src2);
}

void AMDGPUCombinerHelper::applyExpandPromotedF16FMed3(MachineInstr &MI,
                                                       Register Src0,
                                                       Register Src1,
                                                       Register Src2) const {
  // We expect fptrunc (fpext x) to fold out, and to constant fold any constant
  // sources.
  Src0 = Builder.buildFPTrunc(LLT::scalar(16), Src0).getReg(0);
  Src1 = Builder.buildFPTrunc(LLT::scalar(16), Src1).getReg(0);
  Src2 = Builder.buildFPTrunc(LLT::scalar(16), Src2).getReg(0);

  LLT Ty = MRI.getType(Src0);
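  // Expand using the median-of-three identity:
  //   med3(a, b, c) = min(max(a, b), max(min(a, b), c))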
  auto A1 = Builder.buildFMinNumIEEE(Ty, Src0, Src1);
  auto B1 = Builder.buildFMaxNumIEEE(Ty, Src0, Src1);
  auto C1 = Builder.buildFMaxNumIEEE(Ty, A1, Src2);
  Builder.buildFMinNumIEEE(MI.getOperand(0), B1, C1);
  MI.eraseFromParent();
}

bool AMDGPUCombinerHelper::matchCombineFmulWithSelectToFldexp(
    MachineInstr &MI, MachineInstr &Sel,
    std::function<void(MachineIRBuilder &)> &MatchInfo) const {
  assert(MI.getOpcode() == TargetOpcode::G_FMUL);
  assert(Sel.getOpcode() == TargetOpcode::G_SELECT);
  assert(MI.getOperand(2).getReg() == Sel.getOperand(0).getReg());
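
  // Fold fmul x, (select cond, C0, C1), where |C0| and |C1| are exact powers
  // of two, into ldexp x, (select cond, log2(|C0|), log2(|C1|)). When both
  // constants are negative, the sign is instead folded into x with an fneg.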
  Register Dst = MI.getOperand(0).getReg();
  LLT DestTy = MRI.getType(Dst);
  LLT ScalarDestTy = DestTy.getScalarType();

  if ((ScalarDestTy != LLT::float64() && ScalarDestTy != LLT::float32() &&
       ScalarDestTy != LLT::float16()) ||
      !MRI.hasOneNonDBGUse(Sel.getOperand(0).getReg()))
    return false;

  Register SelectCondReg = Sel.getOperand(1).getReg();
  MachineInstr *SelectTrue = MRI.getVRegDef(Sel.getOperand(2).getReg());
  MachineInstr *SelectFalse = MRI.getVRegDef(Sel.getOperand(3).getReg());

  const auto SelectTrueVal =
      isConstantOrConstantSplatVectorFP(*SelectTrue, MRI);
  if (!SelectTrueVal)
    return false;
  const auto SelectFalseVal =
      isConstantOrConstantSplatVectorFP(*SelectFalse, MRI);
  if (!SelectFalseVal)
    return false;

  if (SelectTrueVal->isNegative() != SelectFalseVal->isNegative())
    return false;

  // For f32, only non-inline constants should be transformed.
  if (ScalarDestTy == LLT::float32() && TII.isInlineConstant(*SelectTrueVal) &&
      TII.isInlineConstant(*SelectFalseVal))
    return false;

  int SelectTrueLog2Val = SelectTrueVal->getExactLog2Abs();
  if (SelectTrueLog2Val == INT_MIN)
    return false;
  int SelectFalseLog2Val = SelectFalseVal->getExactLog2Abs();
  if (SelectFalseLog2Val == INT_MIN)
    return false;

  MatchInfo = [=, &MI](MachineIRBuilder &Builder) {
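    // Build the exponent select over 32-bit integers (per element), the
    // exponent operand type used by buildFLdexp here.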
    LLT IntDestTy = DestTy.changeElementType(LLT::scalar(32));
    auto NewSel = Builder.buildSelect(
        IntDestTy, SelectCondReg,
        Builder.buildConstant(IntDestTy, SelectTrueLog2Val),
        Builder.buildConstant(IntDestTy, SelectFalseLog2Val));

    Register XReg = MI.getOperand(1).getReg();
    if (SelectTrueVal->isNegative()) {
      auto NegX =
          Builder.buildFNeg(DestTy, XReg, MRI.getVRegDef(XReg)->getFlags());
      Builder.buildFLdexp(Dst, NegX, NewSel, MI.getFlags());
    } else {
      Builder.buildFLdexp(Dst, XReg, NewSel, MI.getFlags());
    }
  };

  return true;
}