//=== lib/CodeGen/GlobalISel/AMDGPUCombinerHelper.cpp ---------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#include "AMDGPUCombinerHelper.h"
#include "GCNSubtarget.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "llvm/CodeGen/GlobalISel/GenericMachineInstrs.h"
#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/Target/TargetMachine.h"

using namespace llvm;
using namespace MIPatternMatch;

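// Returns true if an fneg of \p MI's result can be folded into \p MI itself,
// by negating its sources (and, for the min/max family, inverting the opcode).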
LLVM_READNONE
static bool fnegFoldsIntoMI(const MachineInstr &MI) {
  switch (MI.getOpcode()) {
  case AMDGPU::G_FADD:
  case AMDGPU::G_FSUB:
  case AMDGPU::G_FMUL:
  case AMDGPU::G_FMA:
  case AMDGPU::G_FMAD:
  case AMDGPU::G_FMINNUM:
  case AMDGPU::G_FMAXNUM:
  case AMDGPU::G_FMINNUM_IEEE:
  case AMDGPU::G_FMAXNUM_IEEE:
  case AMDGPU::G_FMINIMUM:
  case AMDGPU::G_FMAXIMUM:
  case AMDGPU::G_FSIN:
  case AMDGPU::G_FPEXT:
  case AMDGPU::G_INTRINSIC_TRUNC:
  case AMDGPU::G_FPTRUNC:
  case AMDGPU::G_FRINT:
  case AMDGPU::G_FNEARBYINT:
  case AMDGPU::G_INTRINSIC_ROUND:
  case AMDGPU::G_INTRINSIC_ROUNDEVEN:
  case AMDGPU::G_FCANONICALIZE:
  case AMDGPU::G_AMDGPU_RCP_IFLAG:
  case AMDGPU::G_AMDGPU_FMIN_LEGACY:
  case AMDGPU::G_AMDGPU_FMAX_LEGACY:
    return true;
  case AMDGPU::G_INTRINSIC: {
    Intrinsic::ID IntrinsicID = cast<GIntrinsic>(MI).getIntrinsicID();
    switch (IntrinsicID) {
    case Intrinsic::amdgcn_rcp:
    case Intrinsic::amdgcn_rcp_legacy:
    case Intrinsic::amdgcn_sin:
    case Intrinsic::amdgcn_fmul_legacy:
    case Intrinsic::amdgcn_fmed3:
    case Intrinsic::amdgcn_fma_legacy:
      return true;
    default:
      return false;
    }
  }
  default:
    return false;
  }
}

/// \returns true if the operation will definitely need to use a 64-bit
/// encoding, and thus will use a VOP3 encoding regardless of the source
/// modifiers.
LLVM_READONLY
static bool opMustUseVOP3Encoding(const MachineInstr &MI,
                                  const MachineRegisterInfo &MRI) {
  return MI.getNumOperands() > (isa<GIntrinsic>(MI) ? 4u : 3u) ||
         MRI.getType(MI.getOperand(0).getReg()).getScalarSizeInBits() == 64;
}

// Most FP instructions support source modifiers.
LLVM_READONLY
static bool hasSourceMods(const MachineInstr &MI) {
  if (!MI.memoperands().empty())
    return false;

  switch (MI.getOpcode()) {
  case AMDGPU::COPY:
  case AMDGPU::G_SELECT:
  case AMDGPU::G_FDIV:
  case AMDGPU::G_FREM:
  case TargetOpcode::INLINEASM:
  case TargetOpcode::INLINEASM_BR:
  case AMDGPU::G_INTRINSIC_W_SIDE_EFFECTS:
  case AMDGPU::G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS:
  case AMDGPU::G_BITCAST:
  case AMDGPU::G_ANYEXT:
  case AMDGPU::G_BUILD_VECTOR:
  case AMDGPU::G_BUILD_VECTOR_TRUNC:
  case AMDGPU::G_PHI:
    return false;
  case AMDGPU::G_INTRINSIC:
  case AMDGPU::G_INTRINSIC_CONVERGENT: {
    Intrinsic::ID IntrinsicID = cast<GIntrinsic>(MI).getIntrinsicID();
    switch (IntrinsicID) {
    case Intrinsic::amdgcn_interp_p1:
    case Intrinsic::amdgcn_interp_p2:
    case Intrinsic::amdgcn_interp_mov:
    case Intrinsic::amdgcn_interp_p1_f16:
    case Intrinsic::amdgcn_interp_p2_f16:
    case Intrinsic::amdgcn_div_scale:
      return false;
    default:
      return true;
    }
  }
  default:
    return true;
  }
}

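// Returns true if every non-debug use of \p MI's result can take a source
// modifier. A use that does not already require a VOP3 encoding would grow to
// the larger encoding if given a modifier, so give up once more than
// \p CostThreshold such uses are seen.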
static bool allUsesHaveSourceMods(MachineInstr &MI, MachineRegisterInfo &MRI,
                                  unsigned CostThreshold = 4) {
  // Some users (such as 3-operand FMA/MAD) must use a VOP3 encoding anyway, so
  // a source modifier is truly free for them. A user that does not already
  // require VOP3 is forced into the larger encoding by the modifier, which
  // increases code size. Try to avoid increasing code size unless we know it
  // will save on the instruction count.
  unsigned NumMayIncreaseSize = 0;
  Register Dst = MI.getOperand(0).getReg();
  for (const MachineInstr &Use : MRI.use_nodbg_instructions(Dst)) {
    if (!hasSourceMods(Use))
      return false;

    if (!opMustUseVOP3Encoding(Use, MRI)) {
      if (++NumMayIncreaseSize > CostThreshold)
        return false;
    }
  }
  return true;
}

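// Signed zeros may be ignored if the no-signed-zeros FP option is enabled
// globally or \p MI carries the nsz fast-math flag.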
static bool mayIgnoreSignedZero(MachineInstr &MI) {
  const TargetOptions &Options = MI.getMF()->getTarget().Options;
  return Options.NoSignedZerosFPMath || MI.getFlag(MachineInstr::MIFlag::FmNsz);
}

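// Matches the exact bit patterns of 1/(2*pi) in f16, f32, and f64.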
static bool isInv2Pi(const APFloat &APF) {
  static const APFloat KF16(APFloat::IEEEhalf(), APInt(16, 0x3118));
  static const APFloat KF32(APFloat::IEEEsingle(), APInt(32, 0x3e22f983));
  static const APFloat KF64(APFloat::IEEEdouble(),
                            APInt(64, 0x3fc45f306dc9c882));

  return APF.bitwiseIsEqual(KF16) || APF.bitwiseIsEqual(KF32) ||
         APF.bitwiseIsEqual(KF64);
}

// +0.0 and 1/(2*pi) have inline immediates, but their negations do not, so
// there is an additional cost to negate these constants.
static bool isConstantCostlierToNegate(MachineInstr &MI, Register Reg,
                                       MachineRegisterInfo &MRI) {
  std::optional<FPValueAndVReg> FPValReg;
  if (mi_match(Reg, MRI, m_GFCstOrSplat(FPValReg))) {
    if (FPValReg->Value.isZero() && !FPValReg->Value.isNegative())
      return true;

    const GCNSubtarget &ST = MI.getMF()->getSubtarget<GCNSubtarget>();
    if (ST.hasInv2PiInlineImm() && isInv2Pi(FPValReg->Value))
      return true;
  }
  return false;
}

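// Returns the opposite min/max opcode. When an fneg is folded through a
// min/max, both operands are negated and the opcode is inverted:
//   fneg (max x, y) --> min (fneg x), (fneg y)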
static unsigned inverseMinMax(unsigned Opc) {
  switch (Opc) {
  case AMDGPU::G_FMAXNUM:
    return AMDGPU::G_FMINNUM;
  case AMDGPU::G_FMINNUM:
    return AMDGPU::G_FMAXNUM;
  case AMDGPU::G_FMAXNUM_IEEE:
    return AMDGPU::G_FMINNUM_IEEE;
  case AMDGPU::G_FMINNUM_IEEE:
    return AMDGPU::G_FMAXNUM_IEEE;
  case AMDGPU::G_FMAXIMUM:
    return AMDGPU::G_FMINIMUM;
  case AMDGPU::G_FMINIMUM:
    return AMDGPU::G_FMAXIMUM;
  case AMDGPU::G_AMDGPU_FMAX_LEGACY:
    return AMDGPU::G_AMDGPU_FMIN_LEGACY;
  case AMDGPU::G_AMDGPU_FMIN_LEGACY:
    return AMDGPU::G_AMDGPU_FMAX_LEGACY;
  default:
    llvm_unreachable("invalid min/max opcode");
  }
}

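// Match an fneg \p MI whose source definition \p MatchInfo can absorb the
// negation, either through source modifiers on its operands or, for the
// min/max family, by also switching to the inverse opcode.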
bool AMDGPUCombinerHelper::matchFoldableFneg(MachineInstr &MI,
                                             MachineInstr *&MatchInfo) {
  Register Src = MI.getOperand(1).getReg();
  MatchInfo = MRI.getVRegDef(Src);

  // If the input has multiple uses and we can either fold the negate down, or
  // the other uses cannot, give up. This both prevents unprofitable
  // transformations and infinite loops: we won't repeatedly try to fold around
  // a negate that has no 'good' form.
  if (MRI.hasOneNonDBGUse(Src)) {
    if (allUsesHaveSourceMods(MI, MRI, 0))
      return false;
  } else {
    if (fnegFoldsIntoMI(*MatchInfo) &&
        (allUsesHaveSourceMods(MI, MRI) ||
         !allUsesHaveSourceMods(*MatchInfo, MRI)))
      return false;
  }

  switch (MatchInfo->getOpcode()) {
  case AMDGPU::G_FMINNUM:
  case AMDGPU::G_FMAXNUM:
  case AMDGPU::G_FMINNUM_IEEE:
  case AMDGPU::G_FMAXNUM_IEEE:
  case AMDGPU::G_FMINIMUM:
  case AMDGPU::G_FMAXIMUM:
  case AMDGPU::G_AMDGPU_FMIN_LEGACY:
  case AMDGPU::G_AMDGPU_FMAX_LEGACY:
    // 0 doesn't have a negated inline immediate.
    return !isConstantCostlierToNegate(*MatchInfo,
                                       MatchInfo->getOperand(2).getReg(), MRI);
  case AMDGPU::G_FADD:
  case AMDGPU::G_FSUB:
  case AMDGPU::G_FMA:
  case AMDGPU::G_FMAD:
    return mayIgnoreSignedZero(*MatchInfo);
  case AMDGPU::G_FMUL:
  case AMDGPU::G_FPEXT:
  case AMDGPU::G_INTRINSIC_TRUNC:
  case AMDGPU::G_FPTRUNC:
  case AMDGPU::G_FRINT:
  case AMDGPU::G_FNEARBYINT:
  case AMDGPU::G_INTRINSIC_ROUND:
  case AMDGPU::G_INTRINSIC_ROUNDEVEN:
  case AMDGPU::G_FSIN:
  case AMDGPU::G_FCANONICALIZE:
  case AMDGPU::G_AMDGPU_RCP_IFLAG:
    return true;
  case AMDGPU::G_INTRINSIC:
  case AMDGPU::G_INTRINSIC_CONVERGENT: {
    Intrinsic::ID IntrinsicID = cast<GIntrinsic>(MatchInfo)->getIntrinsicID();
    switch (IntrinsicID) {
    case Intrinsic::amdgcn_rcp:
    case Intrinsic::amdgcn_rcp_legacy:
    case Intrinsic::amdgcn_sin:
    case Intrinsic::amdgcn_fmul_legacy:
    case Intrinsic::amdgcn_fmed3:
      return true;
    case Intrinsic::amdgcn_fma_legacy:
      return mayIgnoreSignedZero(*MatchInfo);
    default:
      return false;
    }
  }
  default:
    return false;
  }
}

void AMDGPUCombinerHelper::applyFoldableFneg(MachineInstr &MI,
                                             MachineInstr *&MatchInfo) {
  // Transform:
  // %A = inst %Op1, ...
  // %B = fneg %A
  //
  // into:
  //
  // (if %A has one use, specifically fneg above)
  // %B = inst (maybe fneg %Op1), ...
  //
  // (if %A has multiple uses)
  // %B = inst (maybe fneg %Op1), ...
  // %A = fneg %B

  // Replace the register in \p Op with a register holding the negated value.
  auto NegateOperand = [&](MachineOperand &Op) {
    Register Reg = Op.getReg();
    if (!mi_match(Reg, MRI, m_GFNeg(m_Reg(Reg))))
      Reg = Builder.buildFNeg(MRI.getType(Reg), Reg).getReg(0);
    replaceRegOpWith(MRI, Op, Reg);
  };

  // Negate exactly one of the two operands: cancel an existing fneg feeding
  // either operand if there is one, otherwise build an fneg of Y.
  auto NegateEitherOperand = [&](MachineOperand &X, MachineOperand &Y) {
    Register XReg = X.getReg();
    Register YReg = Y.getReg();
    if (mi_match(XReg, MRI, m_GFNeg(m_Reg(XReg))))
      replaceRegOpWith(MRI, X, XReg);
    else if (mi_match(YReg, MRI, m_GFNeg(m_Reg(YReg))))
      replaceRegOpWith(MRI, Y, YReg);
    else {
      YReg = Builder.buildFNeg(MRI.getType(YReg), YReg).getReg(0);
      replaceRegOpWith(MRI, Y, YReg);
    }
  };

  Builder.setInstrAndDebugLoc(*MatchInfo);

  // Negate the appropriate operands so that the resulting value of MatchInfo
  // is negated.
  switch (MatchInfo->getOpcode()) {
  case AMDGPU::G_FADD:
  case AMDGPU::G_FSUB:
    NegateOperand(MatchInfo->getOperand(1));
    NegateOperand(MatchInfo->getOperand(2));
    break;
  case AMDGPU::G_FMUL:
    NegateEitherOperand(MatchInfo->getOperand(1), MatchInfo->getOperand(2));
    break;
  case AMDGPU::G_FMINNUM:
  case AMDGPU::G_FMAXNUM:
  case AMDGPU::G_FMINNUM_IEEE:
  case AMDGPU::G_FMAXNUM_IEEE:
  case AMDGPU::G_FMINIMUM:
  case AMDGPU::G_FMAXIMUM:
  case AMDGPU::G_AMDGPU_FMIN_LEGACY:
  case AMDGPU::G_AMDGPU_FMAX_LEGACY: {
    NegateOperand(MatchInfo->getOperand(1));
    NegateOperand(MatchInfo->getOperand(2));
    unsigned Opposite = inverseMinMax(MatchInfo->getOpcode());
    replaceOpcodeWith(*MatchInfo, Opposite);
    break;
  }
  case AMDGPU::G_FMA:
  case AMDGPU::G_FMAD:
    NegateEitherOperand(MatchInfo->getOperand(1), MatchInfo->getOperand(2));
    NegateOperand(MatchInfo->getOperand(3));
    break;
  case AMDGPU::G_FPEXT:
  case AMDGPU::G_INTRINSIC_TRUNC:
  case AMDGPU::G_FRINT:
  case AMDGPU::G_FNEARBYINT:
  case AMDGPU::G_INTRINSIC_ROUND:
  case AMDGPU::G_INTRINSIC_ROUNDEVEN:
  case AMDGPU::G_FSIN:
  case AMDGPU::G_FCANONICALIZE:
  case AMDGPU::G_AMDGPU_RCP_IFLAG:
  case AMDGPU::G_FPTRUNC:
    NegateOperand(MatchInfo->getOperand(1));
    break;
  case AMDGPU::G_INTRINSIC:
  case AMDGPU::G_INTRINSIC_CONVERGENT: {
    Intrinsic::ID IntrinsicID = cast<GIntrinsic>(MatchInfo)->getIntrinsicID();
    switch (IntrinsicID) {
    case Intrinsic::amdgcn_rcp:
    case Intrinsic::amdgcn_rcp_legacy:
    case Intrinsic::amdgcn_sin:
      NegateOperand(MatchInfo->getOperand(2));
      break;
    case Intrinsic::amdgcn_fmul_legacy:
      NegateEitherOperand(MatchInfo->getOperand(2), MatchInfo->getOperand(3));
      break;
    case Intrinsic::amdgcn_fmed3:
      NegateOperand(MatchInfo->getOperand(2));
      NegateOperand(MatchInfo->getOperand(3));
      NegateOperand(MatchInfo->getOperand(4));
      break;
    case Intrinsic::amdgcn_fma_legacy:
      NegateEitherOperand(MatchInfo->getOperand(2), MatchInfo->getOperand(3));
      NegateOperand(MatchInfo->getOperand(4));
      break;
    default:
      llvm_unreachable("folding fneg not supported for this intrinsic");
    }
    break;
  }
  default:
    llvm_unreachable("folding fneg not supported for this instruction");
  }

  Register Dst = MI.getOperand(0).getReg();
  Register MatchInfoDst = MatchInfo->getOperand(0).getReg();

  if (MRI.hasOneNonDBGUse(MatchInfoDst)) {
    // MatchInfo now has the negated value, so use that instead of the old Dst.
    replaceRegWith(MRI, Dst, MatchInfoDst);
  } else {
    // We want to swap all uses of Dst with uses of MatchInfoDst and vice versa
    // but replaceRegWith will replace defs as well. It is easier to replace one
    // def with a new register.
    LLT Type = MRI.getType(Dst);
    Register NegatedMatchInfo = MRI.createGenericVirtualRegister(Type);
    replaceRegOpWith(MRI, MatchInfo->getOperand(0), NegatedMatchInfo);

    // MatchInfo now has the negated value, so use that instead of the old Dst.
    replaceRegWith(MRI, Dst, NegatedMatchInfo);

    // Recreate the non-negated value for the other uses of the old
    // MatchInfoDst.
    auto NextInst = ++MatchInfo->getIterator();
    Builder.setInstrAndDebugLoc(*NextInst);
    Builder.buildFNeg(MatchInfoDst, NegatedMatchInfo, MI.getFlags());
  }

  MI.eraseFromParent();
}

// TODO: Should return converted value / extension source and avoid introducing
// intermediate fptruncs in the apply function.
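// Returns true if \p Reg is defined by a G_FPEXT from f16, or by a
// G_FCONSTANT whose value is exactly representable in f16.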
static bool isFPExtFromF16OrConst(const MachineRegisterInfo &MRI,
                                  Register Reg) {
  const MachineInstr *Def = MRI.getVRegDef(Reg);
  if (Def->getOpcode() == TargetOpcode::G_FPEXT) {
    Register SrcReg = Def->getOperand(1).getReg();
    return MRI.getType(SrcReg) == LLT::scalar(16);
  }

  if (Def->getOpcode() == TargetOpcode::G_FCONSTANT) {
    APFloat Val = Def->getOperand(1).getFPImm()->getValueAPF();
    bool LosesInfo = true;
    Val.convert(APFloat::IEEEhalf(), APFloat::rmNearestTiesToEven, &LosesInfo);
    return !LosesInfo;
  }

  return false;
}

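// \p MI is the G_FPTRUNC of a single-use f32 value (presumably the fmed3
// produced when the f16 operation was promoted); \p Src0, \p Src1, and
// \p Src2 are that operation's sources. The expansion is done only if every
// source is an fpext from f16 or a constant exactly representable in f16.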
bool AMDGPUCombinerHelper::matchExpandPromotedF16FMed3(MachineInstr &MI,
                                                       Register Src0,
                                                       Register Src1,
                                                       Register Src2) {
  assert(MI.getOpcode() == TargetOpcode::G_FPTRUNC);
  Register SrcReg = MI.getOperand(1).getReg();
  if (!MRI.hasOneNonDBGUse(SrcReg) || MRI.getType(SrcReg) != LLT::scalar(32))
    return false;

  return isFPExtFromF16OrConst(MRI, Src0) && isFPExtFromF16OrConst(MRI, Src1) &&
         isFPExtFromF16OrConst(MRI, Src2);
}

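// Rebuild the median of three in f16 using min/max operations:
//   med3(x, y, z) = min(max(x, y), max(min(x, y), z))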
void AMDGPUCombinerHelper::applyExpandPromotedF16FMed3(MachineInstr &MI,
                                                       Register Src0,
                                                       Register Src1,
                                                       Register Src2) {
  // We expect the fptrunc (fpext x) pairs to fold away, and any constant
  // sources to be constant-folded.
  Src0 = Builder.buildFPTrunc(LLT::scalar(16), Src0).getReg(0);
  Src1 = Builder.buildFPTrunc(LLT::scalar(16), Src1).getReg(0);
  Src2 = Builder.buildFPTrunc(LLT::scalar(16), Src2).getReg(0);

  LLT Ty = MRI.getType(Src0);
  auto A1 = Builder.buildFMinNumIEEE(Ty, Src0, Src1);
  auto B1 = Builder.buildFMaxNumIEEE(Ty, Src0, Src1);
  auto C1 = Builder.buildFMaxNumIEEE(Ty, A1, Src2);
  Builder.buildFMinNumIEEE(MI.getOperand(0), B1, C1);
  MI.eraseFromParent();
}