//=== lib/CodeGen/GlobalISel/AMDGPUPostLegalizerCombiner.cpp --------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This pass does combining of machine instructions at the generic MI level,
// after the legalizer.
//
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "AMDGPUCombinerHelper.h"
#include "AMDGPULegalizerInfo.h"
#include "GCNSubtarget.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "llvm/CodeGen/GlobalISel/Combiner.h"
#include "llvm/CodeGen/GlobalISel/CombinerHelper.h"
#include "llvm/CodeGen/GlobalISel/CombinerInfo.h"
#include "llvm/CodeGen/GlobalISel/GIMatchTableExecutorImpl.h"
#include "llvm/CodeGen/GlobalISel/GISelValueTracking.h"
#include "llvm/CodeGen/GlobalISel/GenericMachineInstrs.h"
#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
#include "llvm/CodeGen/MachineDominators.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/Target/TargetMachine.h"

#define GET_GICOMBINER_DEPS
#include "AMDGPUGenPostLegalizeGICombiner.inc"
#undef GET_GICOMBINER_DEPS

#define DEBUG_TYPE "amdgpu-postlegalizer-combiner"

using namespace llvm;
using namespace MIPatternMatch;

namespace {
#define GET_GICOMBINER_TYPES
#include "AMDGPUGenPostLegalizeGICombiner.inc"
#undef GET_GICOMBINER_TYPES

class AMDGPUPostLegalizerCombinerImpl : public Combiner {
protected:
  const AMDGPUPostLegalizerCombinerImplRuleConfig &RuleConfig;
  const GCNSubtarget &STI;
  const SIInstrInfo &TII;
  // TODO: Make CombinerHelper methods const.
  mutable AMDGPUCombinerHelper Helper;

public:
  AMDGPUPostLegalizerCombinerImpl(
      MachineFunction &MF, CombinerInfo &CInfo, const TargetPassConfig *TPC,
      GISelValueTracking &VT, GISelCSEInfo *CSEInfo,
      const AMDGPUPostLegalizerCombinerImplRuleConfig &RuleConfig,
      const GCNSubtarget &STI, MachineDominatorTree *MDT,
      const LegalizerInfo *LI);

  static const char *getName() { return "AMDGPUPostLegalizerCombinerImpl"; }

  bool tryCombineAllImpl(MachineInstr &I) const;
  bool tryCombineAll(MachineInstr &I) const override;

  struct FMinFMaxLegacyInfo {
    Register LHS;
    Register RHS;
    CmpInst::Predicate Pred;
  };

  // TODO: Make sure fmin_legacy/fmax_legacy don't canonicalize
  bool matchFMinFMaxLegacy(MachineInstr &MI, MachineInstr &FCmp,
                           FMinFMaxLegacyInfo &Info) const;
  void applySelectFCmpToFMinFMaxLegacy(MachineInstr &MI,
                                       const FMinFMaxLegacyInfo &Info) const;

  bool matchUCharToFloat(MachineInstr &MI) const;
  void applyUCharToFloat(MachineInstr &MI) const;

  bool
  matchRcpSqrtToRsq(MachineInstr &MI,
                    std::function<void(MachineIRBuilder &)> &MatchInfo) const;

  bool matchFDivSqrtToRsqF16(MachineInstr &MI) const;
  void applyFDivSqrtToRsqF16(MachineInstr &MI, const Register &X) const;

  // FIXME: Should be able to have 2 separate matchdatas rather than custom
  // struct boilerplate.
  struct CvtF32UByteMatchInfo {
    Register CvtVal;
    unsigned ShiftOffset;
  };

  bool matchCvtF32UByteN(MachineInstr &MI,
                         CvtF32UByteMatchInfo &MatchInfo) const;
  void applyCvtF32UByteN(MachineInstr &MI,
                         const CvtF32UByteMatchInfo &MatchInfo) const;

  bool matchRemoveFcanonicalize(MachineInstr &MI, Register &Reg) const;

  // Combine unsigned buffer load and signed extension instructions to generate
  // signed buffer load instructions.
  bool matchCombineSignExtendInReg(
      MachineInstr &MI, std::pair<MachineInstr *, unsigned> &MatchInfo) const;
  void applyCombineSignExtendInReg(
      MachineInstr &MI, std::pair<MachineInstr *, unsigned> &MatchInfo) const;

  // Find the s_mul_u64 instructions where the higher bits are either
  // zero-extended or sign-extended.
  // Replace the s_mul_u64 instructions with S_MUL_I64_I32_PSEUDO if the higher
  // 33 bits are sign extended and with S_MUL_U64_U32_PSEUDO if the higher 32
  // bits are zero extended.
  bool matchCombine_s_mul_u64(MachineInstr &MI, unsigned &NewOpcode) const;

private:
#define GET_GICOMBINER_CLASS_MEMBERS
#define AMDGPUSubtarget GCNSubtarget
#include "AMDGPUGenPostLegalizeGICombiner.inc"
#undef GET_GICOMBINER_CLASS_MEMBERS
#undef AMDGPUSubtarget
};

#define GET_GICOMBINER_IMPL
#define AMDGPUSubtarget GCNSubtarget
#include "AMDGPUGenPostLegalizeGICombiner.inc"
#undef AMDGPUSubtarget
#undef GET_GICOMBINER_IMPL

AMDGPUPostLegalizerCombinerImpl::AMDGPUPostLegalizerCombinerImpl(
    MachineFunction &MF, CombinerInfo &CInfo, const TargetPassConfig *TPC,
    GISelValueTracking &VT, GISelCSEInfo *CSEInfo,
    const AMDGPUPostLegalizerCombinerImplRuleConfig &RuleConfig,
    const GCNSubtarget &STI, MachineDominatorTree *MDT, const LegalizerInfo *LI)
    : Combiner(MF, CInfo, TPC, &VT, CSEInfo), RuleConfig(RuleConfig), STI(STI),
      TII(*STI.getInstrInfo()),
      Helper(Observer, B, /*IsPreLegalize*/ false, &VT, MDT, LI, STI),
#define GET_GICOMBINER_CONSTRUCTOR_INITS
#include "AMDGPUGenPostLegalizeGICombiner.inc"
#undef GET_GICOMBINER_CONSTRUCTOR_INITS
{
}

bool AMDGPUPostLegalizerCombinerImpl::tryCombineAll(MachineInstr &MI) const {
  if (tryCombineAllImpl(MI))
    return true;

  switch (MI.getOpcode()) {
  case TargetOpcode::G_SHL:
  case TargetOpcode::G_LSHR:
  case TargetOpcode::G_ASHR:
    // On some subtargets, 64-bit shift is a quarter rate instruction. In the
    // common case, splitting this into a move and a 32-bit shift is faster and
    // the same code size.
    return Helper.tryCombineShiftToUnmerge(MI, 32);
  }

  return false;
}

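// Match a select whose condition is a single-use fcmp with an ordering
// predicate, where the select operands are the same two values being
// compared. The collected operands and (possibly inverted) predicate are
// later lowered to G_AMDGPU_FMIN_LEGACY / G_AMDGPU_FMAX_LEGACY.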
bool AMDGPUPostLegalizerCombinerImpl::matchFMinFMaxLegacy(
    MachineInstr &MI, MachineInstr &FCmp, FMinFMaxLegacyInfo &Info) const {
  if (!MRI.hasOneNonDBGUse(FCmp.getOperand(0).getReg()))
    return false;

  Info.Pred =
      static_cast<CmpInst::Predicate>(FCmp.getOperand(1).getPredicate());
  Info.LHS = FCmp.getOperand(2).getReg();
  Info.RHS = FCmp.getOperand(3).getReg();
  Register True = MI.getOperand(2).getReg();
  Register False = MI.getOperand(3).getReg();

  // TODO: Handle case where the selected value is an fneg and the compared
  // constant is the negation of the selected value.
  if ((Info.LHS != True || Info.RHS != False) &&
      (Info.LHS != False || Info.RHS != True))
    return false;

  // Invert the predicate if necessary so that the apply function can assume
  // that the select operands are the same as the fcmp operands.
  // (select (fcmp P, L, R), R, L) -> (select (fcmp !P, L, R), L, R)
  if (Info.LHS != True)
    Info.Pred = CmpInst::getInversePredicate(Info.Pred);

  // Only match </<=/>=/> not ==/!= etc.
  return Info.Pred != CmpInst::getSwappedPredicate(Info.Pred);
}

void AMDGPUPostLegalizerCombinerImpl::applySelectFCmpToFMinFMaxLegacy(
    MachineInstr &MI, const FMinFMaxLegacyInfo &Info) const {
  unsigned Opc = (Info.Pred & CmpInst::FCMP_OGT) ? AMDGPU::G_AMDGPU_FMAX_LEGACY
                                                 : AMDGPU::G_AMDGPU_FMIN_LEGACY;
  Register X = Info.LHS;
  Register Y = Info.RHS;
  if (Info.Pred == CmpInst::getUnorderedPredicate(Info.Pred)) {
    // We need to permute the operands to get the correct NaN behavior. The
    // selected operand is the second one based on the failing compare with
    // NaN, so permute it based on the compare type the hardware uses.
    std::swap(X, Y);
  }

  B.buildInstr(Opc, {MI.getOperand(0)}, {X, Y}, MI.getFlags());

  MI.eraseFromParent();
}

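// Match an integer-to-float conversion whose source has all bits above the
// low byte known to be zero; such a conversion can be implemented with the
// hardware byte conversion, G_AMDGPU_CVT_F32_UBYTE0.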
bool AMDGPUPostLegalizerCombinerImpl::matchUCharToFloat(
    MachineInstr &MI) const {
  Register DstReg = MI.getOperand(0).getReg();

  // TODO: We could try to match extracting the higher bytes, which would be
  // easier if i8 vectors weren't promoted to i32 vectors, particularly after
  // types are legalized. v4i8 -> v4f32 is probably the only case to worry
  // about in practice.
  LLT Ty = MRI.getType(DstReg);
  if (Ty == LLT::scalar(32) || Ty == LLT::scalar(16)) {
    Register SrcReg = MI.getOperand(1).getReg();
    unsigned SrcSize = MRI.getType(SrcReg).getSizeInBits();
    assert(SrcSize == 16 || SrcSize == 32 || SrcSize == 64);
    const APInt Mask = APInt::getHighBitsSet(SrcSize, SrcSize - 8);
    return Helper.getValueTracking()->maskedValueIsZero(SrcReg, Mask);
  }

  return false;
}

void AMDGPUPostLegalizerCombinerImpl::applyUCharToFloat(
    MachineInstr &MI) const {
  const LLT S32 = LLT::scalar(32);

  Register DstReg = MI.getOperand(0).getReg();
  Register SrcReg = MI.getOperand(1).getReg();
  LLT Ty = MRI.getType(DstReg);
  LLT SrcTy = MRI.getType(SrcReg);
  if (SrcTy != S32)
    SrcReg = B.buildAnyExtOrTrunc(S32, SrcReg).getReg(0);

  if (Ty == S32) {
    B.buildInstr(AMDGPU::G_AMDGPU_CVT_F32_UBYTE0, {DstReg}, {SrcReg},
                 MI.getFlags());
  } else {
    auto Cvt0 = B.buildInstr(AMDGPU::G_AMDGPU_CVT_F32_UBYTE0, {S32}, {SrcReg},
                             MI.getFlags());
    B.buildFPTrunc(DstReg, Cvt0, MI.getFlags());
  }

  MI.eraseFromParent();
}

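// Fold rcp(sqrt(x)) and sqrt(rcp(x)) into a single amdgcn_rsq intrinsic.
// Both instructions must carry the contract fast-math flag for the fold to
// apply.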
bool AMDGPUPostLegalizerCombinerImpl::matchRcpSqrtToRsq(
    MachineInstr &MI,
    std::function<void(MachineIRBuilder &)> &MatchInfo) const {
  auto getRcpSrc = [=](const MachineInstr &MI) -> MachineInstr * {
    if (!MI.getFlag(MachineInstr::FmContract))
      return nullptr;

    if (auto *GI = dyn_cast<GIntrinsic>(&MI)) {
      if (GI->is(Intrinsic::amdgcn_rcp))
        return MRI.getVRegDef(MI.getOperand(2).getReg());
    }
    return nullptr;
  };

  auto getSqrtSrc = [=](const MachineInstr &MI) -> MachineInstr * {
    if (!MI.getFlag(MachineInstr::FmContract))
      return nullptr;
    MachineInstr *SqrtSrcMI = nullptr;
    auto Match =
        mi_match(MI.getOperand(0).getReg(), MRI, m_GFSqrt(m_MInstr(SqrtSrcMI)));
    (void)Match;
    return SqrtSrcMI;
  };

  MachineInstr *RcpSrcMI = nullptr, *SqrtSrcMI = nullptr;
  // rcp(sqrt(x))
  if ((RcpSrcMI = getRcpSrc(MI)) && (SqrtSrcMI = getSqrtSrc(*RcpSrcMI))) {
    MatchInfo = [SqrtSrcMI, &MI](MachineIRBuilder &B) {
      B.buildIntrinsic(Intrinsic::amdgcn_rsq, {MI.getOperand(0)})
          .addUse(SqrtSrcMI->getOperand(0).getReg())
          .setMIFlags(MI.getFlags());
    };
    return true;
  }

  // sqrt(rcp(x))
  if ((SqrtSrcMI = getSqrtSrc(MI)) && (RcpSrcMI = getRcpSrc(*SqrtSrcMI))) {
    MatchInfo = [RcpSrcMI, &MI](MachineIRBuilder &B) {
      B.buildIntrinsic(Intrinsic::amdgcn_rsq, {MI.getOperand(0)})
          .addUse(RcpSrcMI->getOperand(0).getReg())
          .setMIFlags(MI.getFlags());
    };
    return true;
  }
  return false;
}

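// Turn an f16 (fdiv y, (sqrt x)) into (fmul y, (rsq x)). The sqrt value
// (operand 2 of the divide) must have a single non-debug use since it is
// replaced entirely; the rest of the pattern is matched by the generated
// combiner rule, which provides X.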
bool AMDGPUPostLegalizerCombinerImpl::matchFDivSqrtToRsqF16(
    MachineInstr &MI) const {
  Register Sqrt = MI.getOperand(2).getReg();
  return MRI.hasOneNonDBGUse(Sqrt);
}

void AMDGPUPostLegalizerCombinerImpl::applyFDivSqrtToRsqF16(
    MachineInstr &MI, const Register &X) const {
  Register Dst = MI.getOperand(0).getReg();
  Register Y = MI.getOperand(1).getReg();
  LLT DstTy = MRI.getType(Dst);
  uint32_t Flags = MI.getFlags();
  Register RSQ = B.buildIntrinsic(Intrinsic::amdgcn_rsq, {DstTy})
                     .addUse(X)
                     .setMIFlags(Flags)
                     .getReg(0);
  B.buildFMul(Dst, RSQ, Y, Flags);
  MI.eraseFromParent();
}

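// Fold a constant left/right shift of the source into the byte index of a
// G_AMDGPU_CVT_F32_UBYTEn instruction, so the shift itself becomes dead.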
bool AMDGPUPostLegalizerCombinerImpl::matchCvtF32UByteN(
    MachineInstr &MI, CvtF32UByteMatchInfo &MatchInfo) const {
  Register SrcReg = MI.getOperand(1).getReg();

  // Look through G_ZEXT.
  mi_match(SrcReg, MRI, m_GZExt(m_Reg(SrcReg)));

  Register Src0;
  int64_t ShiftAmt;
  bool IsShr = mi_match(SrcReg, MRI, m_GLShr(m_Reg(Src0), m_ICst(ShiftAmt)));
  if (IsShr || mi_match(SrcReg, MRI, m_GShl(m_Reg(Src0), m_ICst(ShiftAmt)))) {
    const unsigned Offset = MI.getOpcode() - AMDGPU::G_AMDGPU_CVT_F32_UBYTE0;

    unsigned ShiftOffset = 8 * Offset;
    if (IsShr)
      ShiftOffset += ShiftAmt;
    else
      ShiftOffset -= ShiftAmt;

    MatchInfo.CvtVal = Src0;
    MatchInfo.ShiftOffset = ShiftOffset;
    return ShiftOffset < 32 && ShiftOffset >= 8 && (ShiftOffset % 8) == 0;
  }

  // TODO: Simplify demanded bits.
  return false;
}

void AMDGPUPostLegalizerCombinerImpl::applyCvtF32UByteN(
    MachineInstr &MI, const CvtF32UByteMatchInfo &MatchInfo) const {
  unsigned NewOpc = AMDGPU::G_AMDGPU_CVT_F32_UBYTE0 + MatchInfo.ShiftOffset / 8;

  const LLT S32 = LLT::scalar(32);
  Register CvtSrc = MatchInfo.CvtVal;
  LLT SrcTy = MRI.getType(MatchInfo.CvtVal);
  if (SrcTy != S32) {
    assert(SrcTy.isScalar() && SrcTy.getSizeInBits() >= 8);
    CvtSrc = B.buildAnyExt(S32, CvtSrc).getReg(0);
  }

  assert(MI.getOpcode() != NewOpc);
  B.buildInstr(NewOpc, {MI.getOperand(0)}, {CvtSrc}, MI.getFlags());
  MI.eraseFromParent();
}

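// A G_FCANONICALIZE of a value that is already canonical is a no-op, so the
// canonicalize can be replaced with its input register.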
bool AMDGPUPostLegalizerCombinerImpl::matchRemoveFcanonicalize(
    MachineInstr &MI, Register &Reg) const {
  const SITargetLowering *TLI = static_cast<const SITargetLowering *>(
      MF.getSubtarget().getTargetLowering());
  Reg = MI.getOperand(1).getReg();
  return TLI->isCanonicalized(Reg, MF);
}

// The buffer_load_{i8, i16} intrinsics are initially lowered as
// buffer_load_{u8, u16} instructions. Here, the buffer_load_{u8, u16}
// instructions are combined with sign extension instructions in order to
// generate buffer_load_{i8, i16} instructions.

// Identify buffer_load_{u8, u16}.
bool AMDGPUPostLegalizerCombinerImpl::matchCombineSignExtendInReg(
    MachineInstr &MI, std::pair<MachineInstr *, unsigned> &MatchData) const {
  Register LoadReg = MI.getOperand(1).getReg();
  if (!MRI.hasOneNonDBGUse(LoadReg))
    return false;

  // Check if the first operand of the sign extension is a subword buffer load
  // instruction.
  MachineInstr *LoadMI = MRI.getVRegDef(LoadReg);
  int64_t Width = MI.getOperand(2).getImm();
  switch (LoadMI->getOpcode()) {
  case AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE:
    MatchData = {LoadMI, AMDGPU::G_AMDGPU_BUFFER_LOAD_SBYTE};
    return Width == 8;
  case AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT:
    MatchData = {LoadMI, AMDGPU::G_AMDGPU_BUFFER_LOAD_SSHORT};
    return Width == 16;
  case AMDGPU::G_AMDGPU_S_BUFFER_LOAD_UBYTE:
    MatchData = {LoadMI, AMDGPU::G_AMDGPU_S_BUFFER_LOAD_SBYTE};
    return Width == 8;
  case AMDGPU::G_AMDGPU_S_BUFFER_LOAD_USHORT:
    MatchData = {LoadMI, AMDGPU::G_AMDGPU_S_BUFFER_LOAD_SSHORT};
    return Width == 16;
  }
  return false;
}

// Combine buffer_load_{u8, u16} and the sign extension instruction to generate
// buffer_load_{i8, i16}.
void AMDGPUPostLegalizerCombinerImpl::applyCombineSignExtendInReg(
    MachineInstr &MI, std::pair<MachineInstr *, unsigned> &MatchData) const {
  auto [LoadMI, NewOpcode] = MatchData;
  LoadMI->setDesc(TII.get(NewOpcode));
  // Update the destination register of the load with the destination register
  // of the sign extension.
  Register SignExtendInsnDst = MI.getOperand(0).getReg();
  LoadMI->getOperand(0).setReg(SignExtendInsnDst);
  // Remove the sign extension.
  MI.eraseFromParent();
}

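// Select the narrower multiply pseudo based on what value tracking knows
// about the operands: unsigned 32x32 if both operands have at least 32
// leading zero bits, signed 32x32 if both have at least 33 sign bits.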
bool AMDGPUPostLegalizerCombinerImpl::matchCombine_s_mul_u64(
    MachineInstr &MI, unsigned &NewOpcode) const {
  Register Src0 = MI.getOperand(1).getReg();
  Register Src1 = MI.getOperand(2).getReg();
  if (MRI.getType(Src0) != LLT::scalar(64))
    return false;

  if (VT->getKnownBits(Src1).countMinLeadingZeros() >= 32 &&
      VT->getKnownBits(Src0).countMinLeadingZeros() >= 32) {
    NewOpcode = AMDGPU::G_AMDGPU_S_MUL_U64_U32;
    return true;
  }

  if (VT->computeNumSignBits(Src1) >= 33 &&
      VT->computeNumSignBits(Src0) >= 33) {
    NewOpcode = AMDGPU::G_AMDGPU_S_MUL_I64_I32;
    return true;
  }
  return false;
}

// Pass boilerplate
// ================

class AMDGPUPostLegalizerCombiner : public MachineFunctionPass {
public:
  static char ID;

  AMDGPUPostLegalizerCombiner(bool IsOptNone = false);

  StringRef getPassName() const override {
    return "AMDGPUPostLegalizerCombiner";
  }

  bool runOnMachineFunction(MachineFunction &MF) override;

  void getAnalysisUsage(AnalysisUsage &AU) const override;

private:
  bool IsOptNone;
  AMDGPUPostLegalizerCombinerImplRuleConfig RuleConfig;
};
} // end anonymous namespace

void AMDGPUPostLegalizerCombiner::getAnalysisUsage(AnalysisUsage &AU) const {
  AU.addRequired<TargetPassConfig>();
  AU.setPreservesCFG();
  getSelectionDAGFallbackAnalysisUsage(AU);
  AU.addRequired<GISelValueTrackingAnalysisLegacy>();
  AU.addPreserved<GISelValueTrackingAnalysisLegacy>();
  if (!IsOptNone) {
    AU.addRequired<MachineDominatorTreeWrapperPass>();
    AU.addPreserved<MachineDominatorTreeWrapperPass>();
  }
  MachineFunctionPass::getAnalysisUsage(AU);
}

AMDGPUPostLegalizerCombiner::AMDGPUPostLegalizerCombiner(bool IsOptNone)
    : MachineFunctionPass(ID), IsOptNone(IsOptNone) {
  if (!RuleConfig.parseCommandLineOption())
    report_fatal_error("Invalid rule identifier");
}

bool AMDGPUPostLegalizerCombiner::runOnMachineFunction(MachineFunction &MF) {
  if (MF.getProperties().hasFailedISel())
    return false;
  auto *TPC = &getAnalysis<TargetPassConfig>();
  const Function &F = MF.getFunction();
  bool EnableOpt =
      MF.getTarget().getOptLevel() != CodeGenOptLevel::None && !skipFunction(F);

  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const AMDGPULegalizerInfo *LI =
      static_cast<const AMDGPULegalizerInfo *>(ST.getLegalizerInfo());

  GISelValueTracking *VT =
      &getAnalysis<GISelValueTrackingAnalysisLegacy>().get(MF);
  MachineDominatorTree *MDT =
      IsOptNone ? nullptr
                : &getAnalysis<MachineDominatorTreeWrapperPass>().getDomTree();

  CombinerInfo CInfo(/*AllowIllegalOps*/ false, /*ShouldLegalizeIllegal*/ true,
                     LI, EnableOpt, F.hasOptSize(), F.hasMinSize());
  // Disable fixed-point iteration to reduce compile-time.
  CInfo.MaxIterations = 1;
  CInfo.ObserverLvl = CombinerInfo::ObserverLevel::SinglePass;
  // Legalizer performs DCE, so a full DCE pass is unnecessary.
  CInfo.EnableFullDCE = false;
  AMDGPUPostLegalizerCombinerImpl Impl(MF, CInfo, TPC, *VT, /*CSEInfo*/ nullptr,
                                       RuleConfig, ST, MDT, LI);
  return Impl.combineMachineInstrs();
}

char AMDGPUPostLegalizerCombiner::ID = 0;
INITIALIZE_PASS_BEGIN(AMDGPUPostLegalizerCombiner, DEBUG_TYPE,
                      "Combine AMDGPU machine instrs after legalization", false,
                      false)
INITIALIZE_PASS_DEPENDENCY(TargetPassConfig)
INITIALIZE_PASS_DEPENDENCY(GISelValueTrackingAnalysisLegacy)
INITIALIZE_PASS_END(AMDGPUPostLegalizerCombiner, DEBUG_TYPE,
                    "Combine AMDGPU machine instrs after legalization", false,
                    false)

FunctionPass *llvm::createAMDGPUPostLegalizeCombiner(bool IsOptNone) {
  return new AMDGPUPostLegalizerCombiner(IsOptNone);
}