//=== lib/CodeGen/GlobalISel/AMDGPUPostLegalizerCombiner.cpp --------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This pass does combining of machine instructions at the generic MI level,
// after the legalizer.
//
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "AMDGPUCombinerHelper.h"
#include "AMDGPULegalizerInfo.h"
#include "GCNSubtarget.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "llvm/CodeGen/GlobalISel/Combiner.h"
#include "llvm/CodeGen/GlobalISel/CombinerHelper.h"
#include "llvm/CodeGen/GlobalISel/CombinerInfo.h"
#include "llvm/CodeGen/GlobalISel/GIMatchTableExecutorImpl.h"
#include "llvm/CodeGen/GlobalISel/GISelKnownBits.h"
#include "llvm/CodeGen/GlobalISel/GenericMachineInstrs.h"
#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
#include "llvm/CodeGen/MachineDominators.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/Target/TargetMachine.h"

#define GET_GICOMBINER_DEPS
32 | #include "AMDGPUGenPreLegalizeGICombiner.inc" |
#undef GET_GICOMBINER_DEPS

#define DEBUG_TYPE "amdgpu-postlegalizer-combiner"

using namespace llvm;
using namespace MIPatternMatch;

namespace {
#define GET_GICOMBINER_TYPES
#include "AMDGPUGenPostLegalizeGICombiner.inc"
#undef GET_GICOMBINER_TYPES

class AMDGPUPostLegalizerCombinerImpl : public Combiner {
protected:
  const AMDGPUPostLegalizerCombinerImplRuleConfig &RuleConfig;
  const GCNSubtarget &STI;
  const SIInstrInfo &TII;
  // TODO: Make CombinerHelper methods const.
  mutable AMDGPUCombinerHelper Helper;

public:
  AMDGPUPostLegalizerCombinerImpl(
      MachineFunction &MF, CombinerInfo &CInfo, const TargetPassConfig *TPC,
      GISelKnownBits &KB, GISelCSEInfo *CSEInfo,
      const AMDGPUPostLegalizerCombinerImplRuleConfig &RuleConfig,
      const GCNSubtarget &STI, MachineDominatorTree *MDT,
      const LegalizerInfo *LI);

  static const char *getName() { return "AMDGPUPostLegalizerCombinerImpl"; }

  bool tryCombineAllImpl(MachineInstr &I) const;
  bool tryCombineAll(MachineInstr &I) const override;

  struct FMinFMaxLegacyInfo {
    Register LHS;
    Register RHS;
    CmpInst::Predicate Pred;
  };

  // TODO: Make sure fmin_legacy/fmax_legacy don't canonicalize
  bool matchFMinFMaxLegacy(MachineInstr &MI, MachineInstr &FCmp,
                           FMinFMaxLegacyInfo &Info) const;
  void applySelectFCmpToFMinFMaxLegacy(MachineInstr &MI,
                                       const FMinFMaxLegacyInfo &Info) const;

  bool matchUCharToFloat(MachineInstr &MI) const;
  void applyUCharToFloat(MachineInstr &MI) const;

  bool
  matchRcpSqrtToRsq(MachineInstr &MI,
                    std::function<void(MachineIRBuilder &)> &MatchInfo) const;

  bool matchFDivSqrtToRsqF16(MachineInstr &MI) const;
  void applyFDivSqrtToRsqF16(MachineInstr &MI, const Register &X) const;

  // FIXME: Should be able to have 2 separate matchdatas rather than custom
  // struct boilerplate.
  struct CvtF32UByteMatchInfo {
    Register CvtVal;
    unsigned ShiftOffset;
  };

  bool matchCvtF32UByteN(MachineInstr &MI,
                         CvtF32UByteMatchInfo &MatchInfo) const;
  void applyCvtF32UByteN(MachineInstr &MI,
                         const CvtF32UByteMatchInfo &MatchInfo) const;

  bool matchRemoveFcanonicalize(MachineInstr &MI, Register &Reg) const;

  // Combine an unsigned buffer load and a sign-extension instruction to
  // generate a signed buffer load instruction.
  bool matchCombineSignExtendInReg(
      MachineInstr &MI, std::pair<MachineInstr *, unsigned> &MatchInfo) const;
  void applyCombineSignExtendInReg(
      MachineInstr &MI, std::pair<MachineInstr *, unsigned> &MatchInfo) const;

  // Find the s_mul_u64 instructions where the higher bits are either
  // zero-extended or sign-extended.
  // Replace the s_mul_u64 instruction with G_AMDGPU_S_MUL_I64_I32 if the
  // upper 33 bits are sign-extended and with G_AMDGPU_S_MUL_U64_U32 if the
  // upper 32 bits are zero-extended.
  bool matchCombine_s_mul_u64(MachineInstr &MI, unsigned &NewOpcode) const;

private:
#define GET_GICOMBINER_CLASS_MEMBERS
#define AMDGPUSubtarget GCNSubtarget
#include "AMDGPUGenPostLegalizeGICombiner.inc"
#undef GET_GICOMBINER_CLASS_MEMBERS
#undef AMDGPUSubtarget
};

#define GET_GICOMBINER_IMPL
#define AMDGPUSubtarget GCNSubtarget
#include "AMDGPUGenPostLegalizeGICombiner.inc"
#undef AMDGPUSubtarget
#undef GET_GICOMBINER_IMPL

AMDGPUPostLegalizerCombinerImpl::AMDGPUPostLegalizerCombinerImpl(
    MachineFunction &MF, CombinerInfo &CInfo, const TargetPassConfig *TPC,
    GISelKnownBits &KB, GISelCSEInfo *CSEInfo,
    const AMDGPUPostLegalizerCombinerImplRuleConfig &RuleConfig,
    const GCNSubtarget &STI, MachineDominatorTree *MDT, const LegalizerInfo *LI)
    : Combiner(MF, CInfo, TPC, &KB, CSEInfo), RuleConfig(RuleConfig), STI(STI),
      TII(*STI.getInstrInfo()),
      Helper(Observer, B, /*IsPreLegalize*/ false, &KB, MDT, LI),
#define GET_GICOMBINER_CONSTRUCTOR_INITS
#include "AMDGPUGenPostLegalizeGICombiner.inc"
#undef GET_GICOMBINER_CONSTRUCTOR_INITS
{
}

bool AMDGPUPostLegalizerCombinerImpl::tryCombineAll(MachineInstr &MI) const {
  if (tryCombineAllImpl(MI))
    return true;

  switch (MI.getOpcode()) {
  case TargetOpcode::G_SHL:
  case TargetOpcode::G_LSHR:
  case TargetOpcode::G_ASHR:
    // On some subtargets, 64-bit shift is a quarter rate instruction. In the
    // common case, splitting this into a move and a 32-bit shift is faster and
    // the same code size.
    return Helper.tryCombineShiftToUnmerge(MI, 32);
  }

  return false;
}

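// Match a G_SELECT whose condition is a single-use G_FCMP of the same two
// values being selected, e.g. (select (fcmp olt x, y), x, y), so that the
// apply below can turn it into G_AMDGPU_FMIN_LEGACY/G_AMDGPU_FMAX_LEGACY.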
bool AMDGPUPostLegalizerCombinerImpl::matchFMinFMaxLegacy(
    MachineInstr &MI, MachineInstr &FCmp, FMinFMaxLegacyInfo &Info) const {
  if (!MRI.hasOneNonDBGUse(FCmp.getOperand(0).getReg()))
    return false;

  Info.Pred =
      static_cast<CmpInst::Predicate>(FCmp.getOperand(1).getPredicate());
  Info.LHS = FCmp.getOperand(2).getReg();
  Info.RHS = FCmp.getOperand(3).getReg();
  Register True = MI.getOperand(2).getReg();
  Register False = MI.getOperand(3).getReg();

  // TODO: Handle the case where the selected value is an fneg and the compared
  // constant is the negation of the selected value.
  if ((Info.LHS != True || Info.RHS != False) &&
      (Info.LHS != False || Info.RHS != True))
    return false;

  // Invert the predicate if necessary so that the apply function can assume
  // that the select operands are the same as the fcmp operands.
  // (select (fcmp P, L, R), R, L) -> (select (fcmp !P, L, R), L, R)
  if (Info.LHS != True)
    Info.Pred = CmpInst::getInversePredicate(Info.Pred);

  // Only match </<=/>=/> not ==/!= etc.
  return Info.Pred != CmpInst::getSwappedPredicate(Info.Pred);
}


void AMDGPUPostLegalizerCombinerImpl::applySelectFCmpToFMinFMaxLegacy(
    MachineInstr &MI, const FMinFMaxLegacyInfo &Info) const {
  unsigned Opc = (Info.Pred & CmpInst::FCMP_OGT) ? AMDGPU::G_AMDGPU_FMAX_LEGACY
                                                 : AMDGPU::G_AMDGPU_FMIN_LEGACY;
  Register X = Info.LHS;
  Register Y = Info.RHS;
  if (Info.Pred == CmpInst::getUnorderedPredicate(Info.Pred)) {
    // We need to permute the operands to get the correct NaN behavior. The
    // selected operand is the second one based on the failing compare with
    // NaN, so permute it based on the compare type the hardware uses.
    std::swap(X, Y);
  }

  B.buildInstr(Opc, {MI.getOperand(0)}, {X, Y}, MI.getFlags());

  MI.eraseFromParent();
}

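// Match an integer-to-float conversion with an s16 or s32 result whose source
// has all bits above the low 8 known to be zero, so it can be lowered to
// G_AMDGPU_CVT_F32_UBYTE0 by the apply below.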
bool AMDGPUPostLegalizerCombinerImpl::matchUCharToFloat(
    MachineInstr &MI) const {
  Register DstReg = MI.getOperand(0).getReg();

  // TODO: We could try to match extracting the higher bytes, which would be
  // easier if i8 vectors weren't promoted to i32 vectors, particularly after
  // types are legalized. v4i8 -> v4f32 is probably the only case to worry
  // about in practice.
  LLT Ty = MRI.getType(DstReg);
  if (Ty == LLT::scalar(32) || Ty == LLT::scalar(16)) {
    Register SrcReg = MI.getOperand(1).getReg();
    unsigned SrcSize = MRI.getType(SrcReg).getSizeInBits();
    assert(SrcSize == 16 || SrcSize == 32 || SrcSize == 64);
    const APInt Mask = APInt::getHighBitsSet(SrcSize, SrcSize - 8);
    return Helper.getKnownBits()->maskedValueIsZero(SrcReg, Mask);
  }

  return false;
}

void AMDGPUPostLegalizerCombinerImpl::applyUCharToFloat(
    MachineInstr &MI) const {
  const LLT S32 = LLT::scalar(32);

  Register DstReg = MI.getOperand(0).getReg();
  Register SrcReg = MI.getOperand(1).getReg();
  LLT Ty = MRI.getType(DstReg);
  LLT SrcTy = MRI.getType(SrcReg);
  if (SrcTy != S32)
    SrcReg = B.buildAnyExtOrTrunc(S32, SrcReg).getReg(0);

  if (Ty == S32) {
    B.buildInstr(AMDGPU::G_AMDGPU_CVT_F32_UBYTE0, {DstReg}, {SrcReg},
                 MI.getFlags());
  } else {
    auto Cvt0 = B.buildInstr(AMDGPU::G_AMDGPU_CVT_F32_UBYTE0, {S32}, {SrcReg},
                             MI.getFlags());
    B.buildFPTrunc(DstReg, Cvt0, MI.getFlags());
  }

  MI.eraseFromParent();
}

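// Match rcp(sqrt(x)) or sqrt(rcp(x)) where both operations carry the contract
// flag, so the pair can be replaced with a single amdgcn_rsq intrinsic.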
bool AMDGPUPostLegalizerCombinerImpl::matchRcpSqrtToRsq(
    MachineInstr &MI,
    std::function<void(MachineIRBuilder &)> &MatchInfo) const {
  auto getRcpSrc = [=](const MachineInstr &MI) -> MachineInstr * {
    if (!MI.getFlag(MachineInstr::FmContract))
      return nullptr;

    if (auto *GI = dyn_cast<GIntrinsic>(&MI)) {
      if (GI->is(Intrinsic::amdgcn_rcp))
        return MRI.getVRegDef(MI.getOperand(2).getReg());
    }
    return nullptr;
  };

  auto getSqrtSrc = [=](const MachineInstr &MI) -> MachineInstr * {
    if (!MI.getFlag(MachineInstr::FmContract))
      return nullptr;
    MachineInstr *SqrtSrcMI = nullptr;
    auto Match =
        mi_match(MI.getOperand(0).getReg(), MRI, m_GFSqrt(m_MInstr(SqrtSrcMI)));
    (void)Match;
    return SqrtSrcMI;
  };

  MachineInstr *RcpSrcMI = nullptr, *SqrtSrcMI = nullptr;
  // rcp(sqrt(x))
  if ((RcpSrcMI = getRcpSrc(MI)) && (SqrtSrcMI = getSqrtSrc(*RcpSrcMI))) {
    MatchInfo = [SqrtSrcMI, &MI](MachineIRBuilder &B) {
      B.buildIntrinsic(Intrinsic::amdgcn_rsq, {MI.getOperand(0)})
          .addUse(SqrtSrcMI->getOperand(0).getReg())
          .setMIFlags(MI.getFlags());
    };
    return true;
  }

  // sqrt(rcp(x))
  if ((SqrtSrcMI = getSqrtSrc(MI)) && (RcpSrcMI = getRcpSrc(*SqrtSrcMI))) {
    MatchInfo = [RcpSrcMI, &MI](MachineIRBuilder &B) {
      B.buildIntrinsic(Intrinsic::amdgcn_rsq, {MI.getOperand(0)})
          .addUse(RcpSrcMI->getOperand(0).getReg())
          .setMIFlags(MI.getFlags());
    };
    return true;
  }
  return false;
}

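// For an f16 fdiv whose denominator is a sqrt, require the sqrt result to
// have a single use so the division can be rewritten as y * rsq(x) by the
// apply below.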
bool AMDGPUPostLegalizerCombinerImpl::matchFDivSqrtToRsqF16(
    MachineInstr &MI) const {
  Register Sqrt = MI.getOperand(2).getReg();
  return MRI.hasOneNonDBGUse(Sqrt);
}

void AMDGPUPostLegalizerCombinerImpl::applyFDivSqrtToRsqF16(
    MachineInstr &MI, const Register &X) const {
  Register Dst = MI.getOperand(0).getReg();
  Register Y = MI.getOperand(1).getReg();
  LLT DstTy = MRI.getType(Dst);
  uint32_t Flags = MI.getFlags();
  Register RSQ = B.buildIntrinsic(Intrinsic::amdgcn_rsq, {DstTy})
                     .addUse(X)
                     .setMIFlags(Flags)
                     .getReg(0);
  B.buildFMul(Dst, RSQ, Y, Flags);
  MI.eraseFromParent();
}

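// Fold a constant byte shift into the byte index of a G_AMDGPU_CVT_F32_UBYTEn
// instruction, e.g. cvt_f32_ubyte0 (lshr x, 8) -> cvt_f32_ubyte1 x.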
bool AMDGPUPostLegalizerCombinerImpl::matchCvtF32UByteN(
    MachineInstr &MI, CvtF32UByteMatchInfo &MatchInfo) const {
  Register SrcReg = MI.getOperand(1).getReg();

  // Look through G_ZEXT.
  bool IsShr = mi_match(SrcReg, MRI, m_GZExt(m_Reg(SrcReg)));

  Register Src0;
  int64_t ShiftAmt;
  IsShr = mi_match(SrcReg, MRI, m_GLShr(m_Reg(Src0), m_ICst(ShiftAmt)));
  if (IsShr || mi_match(SrcReg, MRI, m_GShl(m_Reg(Src0), m_ICst(ShiftAmt)))) {
    const unsigned Offset = MI.getOpcode() - AMDGPU::G_AMDGPU_CVT_F32_UBYTE0;

    unsigned ShiftOffset = 8 * Offset;
    if (IsShr)
      ShiftOffset += ShiftAmt;
    else
      ShiftOffset -= ShiftAmt;

    MatchInfo.CvtVal = Src0;
    MatchInfo.ShiftOffset = ShiftOffset;
    return ShiftOffset < 32 && ShiftOffset >= 8 && (ShiftOffset % 8) == 0;
  }

  // TODO: Simplify demanded bits.
  return false;
}

void AMDGPUPostLegalizerCombinerImpl::applyCvtF32UByteN(
    MachineInstr &MI, const CvtF32UByteMatchInfo &MatchInfo) const {
  unsigned NewOpc = AMDGPU::G_AMDGPU_CVT_F32_UBYTE0 + MatchInfo.ShiftOffset / 8;

  const LLT S32 = LLT::scalar(32);
  Register CvtSrc = MatchInfo.CvtVal;
  LLT SrcTy = MRI.getType(MatchInfo.CvtVal);
  if (SrcTy != S32) {
    assert(SrcTy.isScalar() && SrcTy.getSizeInBits() >= 8);
    CvtSrc = B.buildAnyExt(S32, CvtSrc).getReg(0);
  }

  assert(MI.getOpcode() != NewOpc);
  B.buildInstr(NewOpc, {MI.getOperand(0)}, {CvtSrc}, MI.getFlags());
  MI.eraseFromParent();
}

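// A G_FCANONICALIZE whose source is already canonical is a no-op; return the
// source register so the fcanonicalize can be removed.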
bool AMDGPUPostLegalizerCombinerImpl::matchRemoveFcanonicalize(
    MachineInstr &MI, Register &Reg) const {
  const SITargetLowering *TLI = static_cast<const SITargetLowering *>(
      MF.getSubtarget().getTargetLowering());
  Reg = MI.getOperand(1).getReg();
  return TLI->isCanonicalized(Reg, MF);
}

// The buffer_load_{i8, i16} intrinsics are initially lowered as
// buffer_load_{u8, u16} instructions. Here, the buffer_load_{u8, u16}
// instructions are combined with sign-extension instructions in order to
// generate buffer_load_{i8, i16} instructions.
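//
// For example, (G_SEXT_INREG (G_AMDGPU_BUFFER_LOAD_UBYTE ...), 8) is rewritten
// into a single G_AMDGPU_BUFFER_LOAD_SBYTE.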

// Identify buffer_load_{u8, u16}.
bool AMDGPUPostLegalizerCombinerImpl::matchCombineSignExtendInReg(
    MachineInstr &MI, std::pair<MachineInstr *, unsigned> &MatchData) const {
  Register LoadReg = MI.getOperand(1).getReg();
  if (!MRI.hasOneNonDBGUse(LoadReg))
    return false;

  // Check if the first operand of the sign extension is a subword buffer load
  // instruction.
  MachineInstr *LoadMI = MRI.getVRegDef(LoadReg);
  int64_t Width = MI.getOperand(2).getImm();
  switch (LoadMI->getOpcode()) {
  case AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE:
    MatchData = {LoadMI, AMDGPU::G_AMDGPU_BUFFER_LOAD_SBYTE};
    return Width == 8;
  case AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT:
    MatchData = {LoadMI, AMDGPU::G_AMDGPU_BUFFER_LOAD_SSHORT};
    return Width == 16;
  case AMDGPU::G_AMDGPU_S_BUFFER_LOAD_UBYTE:
    MatchData = {LoadMI, AMDGPU::G_AMDGPU_S_BUFFER_LOAD_SBYTE};
    return Width == 8;
  case AMDGPU::G_AMDGPU_S_BUFFER_LOAD_USHORT:
    MatchData = {LoadMI, AMDGPU::G_AMDGPU_S_BUFFER_LOAD_SSHORT};
    return Width == 16;
  }
  return false;
}

// Combine buffer_load_{u8, u16} and the sign extension instruction to generate
// buffer_load_{i8, i16}.
void AMDGPUPostLegalizerCombinerImpl::applyCombineSignExtendInReg(
    MachineInstr &MI, std::pair<MachineInstr *, unsigned> &MatchData) const {
  auto [LoadMI, NewOpcode] = MatchData;
  LoadMI->setDesc(TII.get(NewOpcode));
  // Update the destination register of the load with the destination register
  // of the sign extension.
  Register SignExtendInsnDst = MI.getOperand(0).getReg();
  LoadMI->getOperand(0).setReg(SignExtendInsnDst);
  // Remove the sign extension.
  MI.eraseFromParent();
}

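// Check whether both operands of a 64-bit multiply are known to be zero- or
// sign-extended from 32 bits and pick the corresponding 32x32->64 opcode; see
// the comment on the declaration above.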
bool AMDGPUPostLegalizerCombinerImpl::matchCombine_s_mul_u64(
    MachineInstr &MI, unsigned &NewOpcode) const {
  Register Src0 = MI.getOperand(1).getReg();
  Register Src1 = MI.getOperand(2).getReg();
  if (MRI.getType(Src0) != LLT::scalar(64))
    return false;

  if (KB->getKnownBits(Src1).countMinLeadingZeros() >= 32 &&
      KB->getKnownBits(Src0).countMinLeadingZeros() >= 32) {
    NewOpcode = AMDGPU::G_AMDGPU_S_MUL_U64_U32;
    return true;
  }

  if (KB->computeNumSignBits(Src1) >= 33 &&
      KB->computeNumSignBits(Src0) >= 33) {
    NewOpcode = AMDGPU::G_AMDGPU_S_MUL_I64_I32;
    return true;
  }
  return false;
}

// Pass boilerplate
// ================

class AMDGPUPostLegalizerCombiner : public MachineFunctionPass {
public:
  static char ID;

  AMDGPUPostLegalizerCombiner(bool IsOptNone = false);

  StringRef getPassName() const override {
    return "AMDGPUPostLegalizerCombiner";
  }

  bool runOnMachineFunction(MachineFunction &MF) override;

  void getAnalysisUsage(AnalysisUsage &AU) const override;

private:
  bool IsOptNone;
  AMDGPUPostLegalizerCombinerImplRuleConfig RuleConfig;
};
} // end anonymous namespace

void AMDGPUPostLegalizerCombiner::getAnalysisUsage(AnalysisUsage &AU) const {
  AU.addRequired<TargetPassConfig>();
  AU.setPreservesCFG();
  getSelectionDAGFallbackAnalysisUsage(AU);
  AU.addRequired<GISelKnownBitsAnalysis>();
  AU.addPreserved<GISelKnownBitsAnalysis>();
  if (!IsOptNone) {
    AU.addRequired<MachineDominatorTreeWrapperPass>();
    AU.addPreserved<MachineDominatorTreeWrapperPass>();
  }
  MachineFunctionPass::getAnalysisUsage(AU);
}

AMDGPUPostLegalizerCombiner::AMDGPUPostLegalizerCombiner(bool IsOptNone)
    : MachineFunctionPass(ID), IsOptNone(IsOptNone) {
  initializeAMDGPUPostLegalizerCombinerPass(*PassRegistry::getPassRegistry());

  if (!RuleConfig.parseCommandLineOption())
    report_fatal_error("Invalid rule identifier");
}

bool AMDGPUPostLegalizerCombiner::runOnMachineFunction(MachineFunction &MF) {
  if (MF.getProperties().hasProperty(
          MachineFunctionProperties::Property::FailedISel))
    return false;
  auto *TPC = &getAnalysis<TargetPassConfig>();
  const Function &F = MF.getFunction();
  bool EnableOpt =
      MF.getTarget().getOptLevel() != CodeGenOptLevel::None && !skipFunction(F);

  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const AMDGPULegalizerInfo *LI =
      static_cast<const AMDGPULegalizerInfo *>(ST.getLegalizerInfo());

  GISelKnownBits *KB = &getAnalysis<GISelKnownBitsAnalysis>().get(MF);
  MachineDominatorTree *MDT =
      IsOptNone ? nullptr
                : &getAnalysis<MachineDominatorTreeWrapperPass>().getDomTree();

  CombinerInfo CInfo(/*AllowIllegalOps*/ false, /*ShouldLegalizeIllegal*/ true,
                     LI, EnableOpt, F.hasOptSize(), F.hasMinSize());

  AMDGPUPostLegalizerCombinerImpl Impl(MF, CInfo, TPC, *KB, /*CSEInfo*/ nullptr,
                                       RuleConfig, ST, MDT, LI);
  return Impl.combineMachineInstrs();
}

char AMDGPUPostLegalizerCombiner::ID = 0;
INITIALIZE_PASS_BEGIN(AMDGPUPostLegalizerCombiner, DEBUG_TYPE,
                      "Combine AMDGPU machine instrs after legalization", false,
                      false)
INITIALIZE_PASS_DEPENDENCY(TargetPassConfig)
INITIALIZE_PASS_DEPENDENCY(GISelKnownBitsAnalysis)
INITIALIZE_PASS_END(AMDGPUPostLegalizerCombiner, DEBUG_TYPE,
                    "Combine AMDGPU machine instrs after legalization", false,
                    false)

namespace llvm {
FunctionPass *createAMDGPUPostLegalizeCombiner(bool IsOptNone) {
  return new AMDGPUPostLegalizerCombiner(IsOptNone);
}
} // end namespace llvm