| 1 | //=== lib/CodeGen/GlobalISel/AMDGPUPreLegalizerCombiner.cpp ---------------===// |
| 2 | // |
| 3 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
| 4 | // See https://llvm.org/LICENSE.txt for license information. |
| 5 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
| 6 | // |
| 7 | //===----------------------------------------------------------------------===// |
| 8 | // |
| 9 | // This pass does combining of machine instructions at the generic MI level, |
| 10 | // before the legalizer. |
| 11 | // |
| 12 | //===----------------------------------------------------------------------===// |
| 13 | |
| 14 | #include "AMDGPU.h" |
| 15 | #include "AMDGPUCombinerHelper.h" |
| 16 | #include "AMDGPULegalizerInfo.h" |
| 17 | #include "GCNSubtarget.h" |
| 18 | #include "MCTargetDesc/AMDGPUMCTargetDesc.h" |
| 19 | #include "llvm/CodeGen/GlobalISel/CSEInfo.h" |
| 20 | #include "llvm/CodeGen/GlobalISel/Combiner.h" |
| 21 | #include "llvm/CodeGen/GlobalISel/CombinerHelper.h" |
| 22 | #include "llvm/CodeGen/GlobalISel/CombinerInfo.h" |
| 23 | #include "llvm/CodeGen/GlobalISel/GIMatchTableExecutorImpl.h" |
| 24 | #include "llvm/CodeGen/GlobalISel/GISelValueTracking.h" |
| 25 | #include "llvm/CodeGen/GlobalISel/MIPatternMatch.h" |
| 26 | #include "llvm/CodeGen/MachineDominators.h" |
| 27 | #include "llvm/CodeGen/TargetPassConfig.h" |
| 28 | #include "llvm/Target/TargetMachine.h" |
| 29 | |
| 30 | #define GET_GICOMBINER_DEPS |
| 31 | #include "AMDGPUGenPreLegalizeGICombiner.inc" |
| 32 | #undef GET_GICOMBINER_DEPS |
| 33 | |
| 34 | #define DEBUG_TYPE "amdgpu-prelegalizer-combiner" |
| 35 | |
| 36 | using namespace llvm; |
| 37 | using namespace MIPatternMatch; |
| 38 | namespace { |
| 39 | |
| 40 | #define GET_GICOMBINER_TYPES |
| 41 | #include "AMDGPUGenPreLegalizeGICombiner.inc" |
| 42 | #undef GET_GICOMBINER_TYPES |
| 43 | |
| 44 | class AMDGPUPreLegalizerCombinerImpl : public Combiner { |
| 45 | protected: |
| 46 | const AMDGPUPreLegalizerCombinerImplRuleConfig &RuleConfig; |
| 47 | const GCNSubtarget &STI; |
| 48 | const AMDGPUCombinerHelper Helper; |
| 49 | |
| 50 | public: |
| 51 | AMDGPUPreLegalizerCombinerImpl( |
| 52 | MachineFunction &MF, CombinerInfo &CInfo, const TargetPassConfig *TPC, |
| 53 | GISelValueTracking &VT, GISelCSEInfo *CSEInfo, |
| 54 | const AMDGPUPreLegalizerCombinerImplRuleConfig &RuleConfig, |
| 55 | const GCNSubtarget &STI, MachineDominatorTree *MDT, |
| 56 | const LegalizerInfo *LI); |
| 57 | |
| 58 | static const char *getName() { return "AMDGPUPreLegalizerCombinerImpl" ; } |
| 59 | |
| 60 | bool tryCombineAllImpl(MachineInstr &MI) const; |
| 61 | bool tryCombineAll(MachineInstr &I) const override; |
| 62 | |
| 63 | struct ClampI64ToI16MatchInfo { |
| 64 | int64_t Cmp1 = 0; |
| 65 | int64_t Cmp2 = 0; |
| 66 | Register Origin; |
| 67 | }; |
| 68 | |
| 69 | bool matchClampI64ToI16(MachineInstr &MI, const MachineRegisterInfo &MRI, |
| 70 | const MachineFunction &MF, |
| 71 | ClampI64ToI16MatchInfo &MatchInfo) const; |
| 72 | |
| 73 | void applyClampI64ToI16(MachineInstr &MI, |
| 74 | const ClampI64ToI16MatchInfo &MatchInfo) const; |
| 75 | |
| 76 | private: |
| 77 | #define GET_GICOMBINER_CLASS_MEMBERS |
| 78 | #define AMDGPUSubtarget GCNSubtarget |
| 79 | #include "AMDGPUGenPreLegalizeGICombiner.inc" |
| 80 | #undef GET_GICOMBINER_CLASS_MEMBERS |
| 81 | #undef AMDGPUSubtarget |
| 82 | }; |
| 83 | |
| 84 | #define GET_GICOMBINER_IMPL |
| 85 | #define AMDGPUSubtarget GCNSubtarget |
| 86 | #include "AMDGPUGenPreLegalizeGICombiner.inc" |
| 87 | #undef AMDGPUSubtarget |
| 88 | #undef GET_GICOMBINER_IMPL |
| 89 | |
| 90 | AMDGPUPreLegalizerCombinerImpl::AMDGPUPreLegalizerCombinerImpl( |
| 91 | MachineFunction &MF, CombinerInfo &CInfo, const TargetPassConfig *TPC, |
| 92 | GISelValueTracking &VT, GISelCSEInfo *CSEInfo, |
| 93 | const AMDGPUPreLegalizerCombinerImplRuleConfig &RuleConfig, |
| 94 | const GCNSubtarget &STI, MachineDominatorTree *MDT, const LegalizerInfo *LI) |
| 95 | : Combiner(MF, CInfo, TPC, &VT, CSEInfo), RuleConfig(RuleConfig), STI(STI), |
| 96 | Helper(Observer, B, /*IsPreLegalize*/ true, &VT, MDT, LI, STI), |
| 97 | #define GET_GICOMBINER_CONSTRUCTOR_INITS |
| 98 | #include "AMDGPUGenPreLegalizeGICombiner.inc" |
| 99 | #undef GET_GICOMBINER_CONSTRUCTOR_INITS |
| 100 | { |
| 101 | } |
| 102 | |
| 103 | bool AMDGPUPreLegalizerCombinerImpl::tryCombineAll(MachineInstr &MI) const { |
| 104 | if (tryCombineAllImpl(I&: MI)) |
| 105 | return true; |
| 106 | |
| 107 | switch (MI.getOpcode()) { |
| 108 | case TargetOpcode::G_SHUFFLE_VECTOR: |
| 109 | return Helper.tryCombineShuffleVector(MI); |
| 110 | } |
| 111 | |
| 112 | return false; |
| 113 | } |
| 114 | |
| 115 | bool AMDGPUPreLegalizerCombinerImpl::matchClampI64ToI16( |
| 116 | MachineInstr &MI, const MachineRegisterInfo &MRI, const MachineFunction &MF, |
| 117 | ClampI64ToI16MatchInfo &MatchInfo) const { |
| 118 | assert(MI.getOpcode() == TargetOpcode::G_TRUNC && "Invalid instruction!" ); |
| 119 | |
| 120 | // Try to find a pattern where an i64 value should get clamped to short. |
| 121 | const LLT SrcType = MRI.getType(Reg: MI.getOperand(i: 1).getReg()); |
| 122 | if (SrcType != LLT::scalar(SizeInBits: 64)) |
| 123 | return false; |
| 124 | |
| 125 | const LLT DstType = MRI.getType(Reg: MI.getOperand(i: 0).getReg()); |
| 126 | if (DstType != LLT::scalar(SizeInBits: 16)) |
| 127 | return false; |
| 128 | |
| 129 | Register Base; |
| 130 | |
| 131 | auto IsApplicableForCombine = [&MatchInfo]() -> bool { |
| 132 | const auto Cmp1 = MatchInfo.Cmp1; |
| 133 | const auto Cmp2 = MatchInfo.Cmp2; |
| 134 | const auto Diff = std::abs(i: Cmp2 - Cmp1); |
| 135 | |
| 136 | // If the difference between both comparison values is 0 or 1, there is no |
| 137 | // need to clamp. |
| 138 | if (Diff == 0 || Diff == 1) |
| 139 | return false; |
| 140 | |
| 141 | const int64_t Min = std::numeric_limits<int16_t>::min(); |
| 142 | const int64_t Max = std::numeric_limits<int16_t>::max(); |
| 143 | |
| 144 | // Check if the comparison values are between SHORT_MIN and SHORT_MAX. |
| 145 | return ((Cmp2 >= Cmp1 && Cmp1 >= Min && Cmp2 <= Max) || |
| 146 | (Cmp1 >= Cmp2 && Cmp1 <= Max && Cmp2 >= Min)); |
| 147 | }; |
| 148 | |
| 149 | // Try to match a combination of min / max MIR opcodes. |
| 150 | if (mi_match(R: MI.getOperand(i: 1).getReg(), MRI, |
| 151 | P: m_GSMin(L: m_Reg(R&: Base), R: m_ICst(Cst&: MatchInfo.Cmp1)))) { |
| 152 | if (mi_match(R: Base, MRI, |
| 153 | P: m_GSMax(L: m_Reg(R&: MatchInfo.Origin), R: m_ICst(Cst&: MatchInfo.Cmp2)))) { |
| 154 | return IsApplicableForCombine(); |
| 155 | } |
| 156 | } |
| 157 | |
| 158 | if (mi_match(R: MI.getOperand(i: 1).getReg(), MRI, |
| 159 | P: m_GSMax(L: m_Reg(R&: Base), R: m_ICst(Cst&: MatchInfo.Cmp1)))) { |
| 160 | if (mi_match(R: Base, MRI, |
| 161 | P: m_GSMin(L: m_Reg(R&: MatchInfo.Origin), R: m_ICst(Cst&: MatchInfo.Cmp2)))) { |
| 162 | return IsApplicableForCombine(); |
| 163 | } |
| 164 | } |
| 165 | |
| 166 | return false; |
| 167 | } |
| 168 | |
| 169 | // We want to find a combination of instructions that |
| 170 | // gets generated when an i64 gets clamped to i16. |
| 171 | // The corresponding pattern is: |
| 172 | // G_MAX / G_MAX for i16 <= G_TRUNC i64. |
| 173 | // This can be efficiently written as following: |
| 174 | // v_cvt_pk_i16_i32 v0, v0, v1 |
| 175 | // v_med3_i32 v0, Clamp_Min, v0, Clamp_Max |
| 176 | void AMDGPUPreLegalizerCombinerImpl::applyClampI64ToI16( |
| 177 | MachineInstr &MI, const ClampI64ToI16MatchInfo &MatchInfo) const { |
| 178 | |
| 179 | Register Src = MatchInfo.Origin; |
| 180 | assert(MI.getParent()->getParent()->getRegInfo().getType(Src) == |
| 181 | LLT::scalar(64)); |
| 182 | const LLT S32 = LLT::scalar(SizeInBits: 32); |
| 183 | |
| 184 | auto Unmerge = B.buildUnmerge(Res: S32, Op: Src); |
| 185 | |
| 186 | assert(MI.getOpcode() != AMDGPU::G_AMDGPU_CVT_PK_I16_I32); |
| 187 | |
| 188 | const LLT V2S16 = LLT::fixed_vector(NumElements: 2, ScalarSizeInBits: 16); |
| 189 | auto CvtPk = |
| 190 | B.buildInstr(Opc: AMDGPU::G_AMDGPU_CVT_PK_I16_I32, DstOps: {V2S16}, |
| 191 | SrcOps: {Unmerge.getReg(Idx: 0), Unmerge.getReg(Idx: 1)}, Flags: MI.getFlags()); |
| 192 | |
| 193 | auto MinBoundary = std::min(a: MatchInfo.Cmp1, b: MatchInfo.Cmp2); |
| 194 | auto MaxBoundary = std::max(a: MatchInfo.Cmp1, b: MatchInfo.Cmp2); |
| 195 | auto MinBoundaryDst = B.buildConstant(Res: S32, Val: MinBoundary); |
| 196 | auto MaxBoundaryDst = B.buildConstant(Res: S32, Val: MaxBoundary); |
| 197 | |
| 198 | auto Bitcast = B.buildBitcast(Dst: {S32}, Src: CvtPk); |
| 199 | |
| 200 | auto Med3 = B.buildInstr( |
| 201 | Opc: AMDGPU::G_AMDGPU_SMED3, DstOps: {S32}, |
| 202 | SrcOps: {MinBoundaryDst.getReg(Idx: 0), Bitcast.getReg(Idx: 0), MaxBoundaryDst.getReg(Idx: 0)}, |
| 203 | Flags: MI.getFlags()); |
| 204 | |
| 205 | B.buildTrunc(Res: MI.getOperand(i: 0).getReg(), Op: Med3); |
| 206 | |
| 207 | MI.eraseFromParent(); |
| 208 | } |
| 209 | |
| 210 | // Pass boilerplate |
| 211 | // ================ |
| 212 | |
| 213 | class AMDGPUPreLegalizerCombiner : public MachineFunctionPass { |
| 214 | public: |
| 215 | static char ID; |
| 216 | |
| 217 | AMDGPUPreLegalizerCombiner(bool IsOptNone = false); |
| 218 | |
| 219 | StringRef getPassName() const override { |
| 220 | return "AMDGPUPreLegalizerCombiner" ; |
| 221 | } |
| 222 | |
| 223 | bool runOnMachineFunction(MachineFunction &MF) override; |
| 224 | |
| 225 | void getAnalysisUsage(AnalysisUsage &AU) const override; |
| 226 | |
| 227 | private: |
| 228 | bool IsOptNone; |
| 229 | AMDGPUPreLegalizerCombinerImplRuleConfig RuleConfig; |
| 230 | }; |
| 231 | } // end anonymous namespace |
| 232 | |
| 233 | void AMDGPUPreLegalizerCombiner::getAnalysisUsage(AnalysisUsage &AU) const { |
| 234 | AU.addRequired<TargetPassConfig>(); |
| 235 | AU.setPreservesCFG(); |
| 236 | getSelectionDAGFallbackAnalysisUsage(AU); |
| 237 | AU.addRequired<GISelValueTrackingAnalysisLegacy>(); |
| 238 | AU.addPreserved<GISelValueTrackingAnalysisLegacy>(); |
| 239 | if (!IsOptNone) { |
| 240 | AU.addRequired<MachineDominatorTreeWrapperPass>(); |
| 241 | AU.addPreserved<MachineDominatorTreeWrapperPass>(); |
| 242 | } |
| 243 | |
| 244 | AU.addRequired<GISelCSEAnalysisWrapperPass>(); |
| 245 | AU.addPreserved<GISelCSEAnalysisWrapperPass>(); |
| 246 | MachineFunctionPass::getAnalysisUsage(AU); |
| 247 | } |
| 248 | |
| 249 | AMDGPUPreLegalizerCombiner::AMDGPUPreLegalizerCombiner(bool IsOptNone) |
| 250 | : MachineFunctionPass(ID), IsOptNone(IsOptNone) { |
| 251 | if (!RuleConfig.parseCommandLineOption()) |
| 252 | report_fatal_error(reason: "Invalid rule identifier" ); |
| 253 | } |
| 254 | |
| 255 | bool AMDGPUPreLegalizerCombiner::runOnMachineFunction(MachineFunction &MF) { |
| 256 | if (MF.getProperties().hasFailedISel()) |
| 257 | return false; |
| 258 | auto *TPC = &getAnalysis<TargetPassConfig>(); |
| 259 | const Function &F = MF.getFunction(); |
| 260 | bool EnableOpt = |
| 261 | MF.getTarget().getOptLevel() != CodeGenOptLevel::None && !skipFunction(F); |
| 262 | GISelValueTracking *VT = |
| 263 | &getAnalysis<GISelValueTrackingAnalysisLegacy>().get(MF); |
| 264 | |
| 265 | // Enable CSE. |
| 266 | GISelCSEAnalysisWrapper &Wrapper = |
| 267 | getAnalysis<GISelCSEAnalysisWrapperPass>().getCSEWrapper(); |
| 268 | auto *CSEInfo = &Wrapper.get(CSEOpt: TPC->getCSEConfig()); |
| 269 | |
| 270 | const GCNSubtarget &STI = MF.getSubtarget<GCNSubtarget>(); |
| 271 | MachineDominatorTree *MDT = |
| 272 | IsOptNone ? nullptr |
| 273 | : &getAnalysis<MachineDominatorTreeWrapperPass>().getDomTree(); |
| 274 | CombinerInfo CInfo(/*AllowIllegalOps*/ true, /*ShouldLegalizeIllegal*/ false, |
| 275 | nullptr, EnableOpt, F.hasOptSize(), F.hasMinSize()); |
| 276 | // Disable fixed-point iteration to reduce compile-time |
| 277 | CInfo.MaxIterations = 1; |
| 278 | CInfo.ObserverLvl = CombinerInfo::ObserverLevel::SinglePass; |
| 279 | // This is the first Combiner, so the input IR might contain dead |
| 280 | // instructions. |
| 281 | CInfo.EnableFullDCE = true; |
| 282 | AMDGPUPreLegalizerCombinerImpl Impl(MF, CInfo, TPC, *VT, CSEInfo, RuleConfig, |
| 283 | STI, MDT, STI.getLegalizerInfo()); |
| 284 | return Impl.combineMachineInstrs(); |
| 285 | } |
| 286 | |
| 287 | char AMDGPUPreLegalizerCombiner::ID = 0; |
| 288 | INITIALIZE_PASS_BEGIN(AMDGPUPreLegalizerCombiner, DEBUG_TYPE, |
| 289 | "Combine AMDGPU machine instrs before legalization" , |
| 290 | false, false) |
| 291 | INITIALIZE_PASS_DEPENDENCY(TargetPassConfig) |
| 292 | INITIALIZE_PASS_DEPENDENCY(GISelValueTrackingAnalysisLegacy) |
| 293 | INITIALIZE_PASS_END(AMDGPUPreLegalizerCombiner, DEBUG_TYPE, |
| 294 | "Combine AMDGPU machine instrs before legalization" , false, |
| 295 | false) |
| 296 | |
| 297 | FunctionPass *llvm::createAMDGPUPreLegalizeCombiner(bool IsOptNone) { |
| 298 | return new AMDGPUPreLegalizerCombiner(IsOptNone); |
| 299 | } |
| 300 | |