//=== lib/CodeGen/GlobalISel/AMDGPUPreLegalizerCombiner.cpp ---------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This pass does combining of machine instructions at the generic MI level,
// before the legalizer.
//
//===----------------------------------------------------------------------===//
13 | |
14 | #include "AMDGPU.h" |
15 | #include "AMDGPUCombinerHelper.h" |
16 | #include "AMDGPULegalizerInfo.h" |
17 | #include "GCNSubtarget.h" |
18 | #include "MCTargetDesc/AMDGPUMCTargetDesc.h" |
19 | #include "llvm/CodeGen/GlobalISel/CSEInfo.h" |
20 | #include "llvm/CodeGen/GlobalISel/Combiner.h" |
21 | #include "llvm/CodeGen/GlobalISel/CombinerHelper.h" |
22 | #include "llvm/CodeGen/GlobalISel/CombinerInfo.h" |
23 | #include "llvm/CodeGen/GlobalISel/GIMatchTableExecutorImpl.h" |
24 | #include "llvm/CodeGen/GlobalISel/GISelKnownBits.h" |
25 | #include "llvm/CodeGen/GlobalISel/MIPatternMatch.h" |
26 | #include "llvm/CodeGen/MachineDominators.h" |
27 | #include "llvm/CodeGen/TargetPassConfig.h" |
28 | #include "llvm/Target/TargetMachine.h" |
29 | |
30 | #define GET_GICOMBINER_DEPS |
31 | #include "AMDGPUGenPreLegalizeGICombiner.inc" |
32 | #undef GET_GICOMBINER_DEPS |
33 | |
34 | #define DEBUG_TYPE "amdgpu-prelegalizer-combiner" |
35 | |
36 | using namespace llvm; |
37 | using namespace MIPatternMatch; |
38 | namespace { |
39 | |
40 | #define GET_GICOMBINER_TYPES |
41 | #include "AMDGPUGenPreLegalizeGICombiner.inc" |
42 | #undef GET_GICOMBINER_TYPES |
43 | |
44 | class AMDGPUPreLegalizerCombinerImpl : public Combiner { |
45 | protected: |
46 | const AMDGPUPreLegalizerCombinerImplRuleConfig &RuleConfig; |
47 | const GCNSubtarget &STI; |
48 | // TODO: Make CombinerHelper methods const. |
49 | mutable AMDGPUCombinerHelper Helper; |
50 | |
51 | public: |
52 | AMDGPUPreLegalizerCombinerImpl( |
53 | MachineFunction &MF, CombinerInfo &CInfo, const TargetPassConfig *TPC, |
54 | GISelKnownBits &KB, GISelCSEInfo *CSEInfo, |
55 | const AMDGPUPreLegalizerCombinerImplRuleConfig &RuleConfig, |
56 | const GCNSubtarget &STI, MachineDominatorTree *MDT, |
57 | const LegalizerInfo *LI); |
58 | |
59 | static const char *getName() { return "AMDGPUPreLegalizerCombinerImpl" ; } |
60 | |
61 | bool tryCombineAllImpl(MachineInstr &MI) const; |
62 | bool tryCombineAll(MachineInstr &I) const override; |
63 | |
64 | struct ClampI64ToI16MatchInfo { |
65 | int64_t Cmp1 = 0; |
66 | int64_t Cmp2 = 0; |
67 | Register Origin; |
68 | }; |
69 | |
70 | bool matchClampI64ToI16(MachineInstr &MI, const MachineRegisterInfo &MRI, |
71 | const MachineFunction &MF, |
72 | ClampI64ToI16MatchInfo &MatchInfo) const; |
73 | |
74 | void applyClampI64ToI16(MachineInstr &MI, |
75 | const ClampI64ToI16MatchInfo &MatchInfo) const; |
76 | |
77 | private: |
78 | #define GET_GICOMBINER_CLASS_MEMBERS |
79 | #define AMDGPUSubtarget GCNSubtarget |
80 | #include "AMDGPUGenPreLegalizeGICombiner.inc" |
81 | #undef GET_GICOMBINER_CLASS_MEMBERS |
82 | #undef AMDGPUSubtarget |
83 | }; |
84 | |
85 | #define GET_GICOMBINER_IMPL |
86 | #define AMDGPUSubtarget GCNSubtarget |
87 | #include "AMDGPUGenPreLegalizeGICombiner.inc" |
88 | #undef AMDGPUSubtarget |
89 | #undef GET_GICOMBINER_IMPL |
90 | |
91 | AMDGPUPreLegalizerCombinerImpl::AMDGPUPreLegalizerCombinerImpl( |
92 | MachineFunction &MF, CombinerInfo &CInfo, const TargetPassConfig *TPC, |
93 | GISelKnownBits &KB, GISelCSEInfo *CSEInfo, |
94 | const AMDGPUPreLegalizerCombinerImplRuleConfig &RuleConfig, |
95 | const GCNSubtarget &STI, MachineDominatorTree *MDT, const LegalizerInfo *LI) |
96 | : Combiner(MF, CInfo, TPC, &KB, CSEInfo), RuleConfig(RuleConfig), STI(STI), |
97 | Helper(Observer, B, /*IsPreLegalize*/ true, &KB, MDT, LI), |
98 | #define GET_GICOMBINER_CONSTRUCTOR_INITS |
99 | #include "AMDGPUGenPreLegalizeGICombiner.inc" |
100 | #undef GET_GICOMBINER_CONSTRUCTOR_INITS |
101 | { |
102 | } |
103 | |
104 | bool AMDGPUPreLegalizerCombinerImpl::tryCombineAll(MachineInstr &MI) const { |
105 | if (tryCombineAllImpl(I&: MI)) |
106 | return true; |
107 | |
108 | switch (MI.getOpcode()) { |
109 | case TargetOpcode::G_SHUFFLE_VECTOR: |
110 | return Helper.tryCombineShuffleVector(MI); |
111 | } |
112 | |
113 | return false; |
114 | } |
115 | |
116 | bool AMDGPUPreLegalizerCombinerImpl::matchClampI64ToI16( |
117 | MachineInstr &MI, const MachineRegisterInfo &MRI, const MachineFunction &MF, |
118 | ClampI64ToI16MatchInfo &MatchInfo) const { |
119 | assert(MI.getOpcode() == TargetOpcode::G_TRUNC && "Invalid instruction!" ); |
120 | |
121 | // Try to find a pattern where an i64 value should get clamped to short. |
122 | const LLT SrcType = MRI.getType(Reg: MI.getOperand(i: 1).getReg()); |
123 | if (SrcType != LLT::scalar(SizeInBits: 64)) |
124 | return false; |
125 | |
126 | const LLT DstType = MRI.getType(Reg: MI.getOperand(i: 0).getReg()); |
127 | if (DstType != LLT::scalar(SizeInBits: 16)) |
128 | return false; |
129 | |
130 | Register Base; |
131 | |
132 | auto IsApplicableForCombine = [&MatchInfo]() -> bool { |
133 | const auto Cmp1 = MatchInfo.Cmp1; |
134 | const auto Cmp2 = MatchInfo.Cmp2; |
135 | const auto Diff = std::abs(i: Cmp2 - Cmp1); |
136 | |
137 | // If the difference between both comparison values is 0 or 1, there is no |
138 | // need to clamp. |
139 | if (Diff == 0 || Diff == 1) |
140 | return false; |
141 | |
142 | const int64_t Min = std::numeric_limits<int16_t>::min(); |
143 | const int64_t Max = std::numeric_limits<int16_t>::max(); |
144 | |
145 | // Check if the comparison values are between SHORT_MIN and SHORT_MAX. |
146 | return ((Cmp2 >= Cmp1 && Cmp1 >= Min && Cmp2 <= Max) || |
147 | (Cmp1 >= Cmp2 && Cmp1 <= Max && Cmp2 >= Min)); |
148 | }; |
149 | |
150 | // Try to match a combination of min / max MIR opcodes. |
151 | if (mi_match(R: MI.getOperand(i: 1).getReg(), MRI, |
152 | P: m_GSMin(L: m_Reg(R&: Base), R: m_ICst(Cst&: MatchInfo.Cmp1)))) { |
153 | if (mi_match(R: Base, MRI, |
154 | P: m_GSMax(L: m_Reg(R&: MatchInfo.Origin), R: m_ICst(Cst&: MatchInfo.Cmp2)))) { |
155 | return IsApplicableForCombine(); |
156 | } |
157 | } |
158 | |
159 | if (mi_match(R: MI.getOperand(i: 1).getReg(), MRI, |
160 | P: m_GSMax(L: m_Reg(R&: Base), R: m_ICst(Cst&: MatchInfo.Cmp1)))) { |
161 | if (mi_match(R: Base, MRI, |
162 | P: m_GSMin(L: m_Reg(R&: MatchInfo.Origin), R: m_ICst(Cst&: MatchInfo.Cmp2)))) { |
163 | return IsApplicableForCombine(); |
164 | } |
165 | } |
166 | |
167 | return false; |
168 | } |
169 | |
170 | // We want to find a combination of instructions that |
171 | // gets generated when an i64 gets clamped to i16. |
172 | // The corresponding pattern is: |
173 | // G_MAX / G_MAX for i16 <= G_TRUNC i64. |
174 | // This can be efficiently written as following: |
175 | // v_cvt_pk_i16_i32 v0, v0, v1 |
176 | // v_med3_i32 v0, Clamp_Min, v0, Clamp_Max |
177 | void AMDGPUPreLegalizerCombinerImpl::applyClampI64ToI16( |
178 | MachineInstr &MI, const ClampI64ToI16MatchInfo &MatchInfo) const { |
179 | |
180 | Register Src = MatchInfo.Origin; |
181 | assert(MI.getParent()->getParent()->getRegInfo().getType(Src) == |
182 | LLT::scalar(64)); |
183 | const LLT S32 = LLT::scalar(SizeInBits: 32); |
184 | |
185 | auto Unmerge = B.buildUnmerge(Res: S32, Op: Src); |
186 | |
187 | assert(MI.getOpcode() != AMDGPU::G_AMDGPU_CVT_PK_I16_I32); |
188 | |
189 | const LLT V2S16 = LLT::fixed_vector(NumElements: 2, ScalarSizeInBits: 16); |
190 | auto CvtPk = |
191 | B.buildInstr(Opc: AMDGPU::G_AMDGPU_CVT_PK_I16_I32, DstOps: {V2S16}, |
192 | SrcOps: {Unmerge.getReg(Idx: 0), Unmerge.getReg(Idx: 1)}, Flags: MI.getFlags()); |
193 | |
194 | auto MinBoundary = std::min(a: MatchInfo.Cmp1, b: MatchInfo.Cmp2); |
195 | auto MaxBoundary = std::max(a: MatchInfo.Cmp1, b: MatchInfo.Cmp2); |
196 | auto MinBoundaryDst = B.buildConstant(Res: S32, Val: MinBoundary); |
197 | auto MaxBoundaryDst = B.buildConstant(Res: S32, Val: MaxBoundary); |
198 | |
199 | auto Bitcast = B.buildBitcast(Dst: {S32}, Src: CvtPk); |
200 | |
201 | auto Med3 = B.buildInstr( |
202 | Opc: AMDGPU::G_AMDGPU_SMED3, DstOps: {S32}, |
203 | SrcOps: {MinBoundaryDst.getReg(Idx: 0), Bitcast.getReg(Idx: 0), MaxBoundaryDst.getReg(Idx: 0)}, |
204 | Flags: MI.getFlags()); |
205 | |
206 | B.buildTrunc(Res: MI.getOperand(i: 0).getReg(), Op: Med3); |
207 | |
208 | MI.eraseFromParent(); |
209 | } |
210 | |
211 | // Pass boilerplate |
212 | // ================ |
213 | |
214 | class AMDGPUPreLegalizerCombiner : public MachineFunctionPass { |
215 | public: |
216 | static char ID; |
217 | |
218 | AMDGPUPreLegalizerCombiner(bool IsOptNone = false); |
219 | |
220 | StringRef getPassName() const override { |
221 | return "AMDGPUPreLegalizerCombiner" ; |
222 | } |
223 | |
224 | bool runOnMachineFunction(MachineFunction &MF) override; |
225 | |
226 | void getAnalysisUsage(AnalysisUsage &AU) const override; |
227 | |
228 | private: |
229 | bool IsOptNone; |
230 | AMDGPUPreLegalizerCombinerImplRuleConfig RuleConfig; |
231 | }; |
232 | } // end anonymous namespace |
233 | |
234 | void AMDGPUPreLegalizerCombiner::getAnalysisUsage(AnalysisUsage &AU) const { |
235 | AU.addRequired<TargetPassConfig>(); |
236 | AU.setPreservesCFG(); |
237 | getSelectionDAGFallbackAnalysisUsage(AU); |
238 | AU.addRequired<GISelKnownBitsAnalysis>(); |
239 | AU.addPreserved<GISelKnownBitsAnalysis>(); |
240 | if (!IsOptNone) { |
241 | AU.addRequired<MachineDominatorTreeWrapperPass>(); |
242 | AU.addPreserved<MachineDominatorTreeWrapperPass>(); |
243 | } |
244 | |
245 | AU.addRequired<GISelCSEAnalysisWrapperPass>(); |
246 | AU.addPreserved<GISelCSEAnalysisWrapperPass>(); |
247 | MachineFunctionPass::getAnalysisUsage(AU); |
248 | } |
249 | |
250 | AMDGPUPreLegalizerCombiner::AMDGPUPreLegalizerCombiner(bool IsOptNone) |
251 | : MachineFunctionPass(ID), IsOptNone(IsOptNone) { |
252 | initializeAMDGPUPreLegalizerCombinerPass(*PassRegistry::getPassRegistry()); |
253 | |
254 | if (!RuleConfig.parseCommandLineOption()) |
255 | report_fatal_error(reason: "Invalid rule identifier" ); |
256 | } |
257 | |
258 | bool AMDGPUPreLegalizerCombiner::runOnMachineFunction(MachineFunction &MF) { |
259 | if (MF.getProperties().hasProperty( |
260 | P: MachineFunctionProperties::Property::FailedISel)) |
261 | return false; |
262 | auto *TPC = &getAnalysis<TargetPassConfig>(); |
263 | const Function &F = MF.getFunction(); |
264 | bool EnableOpt = |
265 | MF.getTarget().getOptLevel() != CodeGenOptLevel::None && !skipFunction(F); |
266 | GISelKnownBits *KB = &getAnalysis<GISelKnownBitsAnalysis>().get(MF); |
267 | |
268 | // Enable CSE. |
269 | GISelCSEAnalysisWrapper &Wrapper = |
270 | getAnalysis<GISelCSEAnalysisWrapperPass>().getCSEWrapper(); |
271 | auto *CSEInfo = &Wrapper.get(CSEOpt: TPC->getCSEConfig()); |
272 | |
273 | const GCNSubtarget &STI = MF.getSubtarget<GCNSubtarget>(); |
274 | MachineDominatorTree *MDT = |
275 | IsOptNone ? nullptr |
276 | : &getAnalysis<MachineDominatorTreeWrapperPass>().getDomTree(); |
277 | CombinerInfo CInfo(/*AllowIllegalOps*/ true, /*ShouldLegalizeIllegal*/ false, |
278 | nullptr, EnableOpt, F.hasOptSize(), F.hasMinSize()); |
279 | AMDGPUPreLegalizerCombinerImpl Impl(MF, CInfo, TPC, *KB, CSEInfo, RuleConfig, |
280 | STI, MDT, STI.getLegalizerInfo()); |
281 | return Impl.combineMachineInstrs(); |
282 | } |
283 | |
284 | char AMDGPUPreLegalizerCombiner::ID = 0; |
285 | INITIALIZE_PASS_BEGIN(AMDGPUPreLegalizerCombiner, DEBUG_TYPE, |
286 | "Combine AMDGPU machine instrs before legalization" , |
287 | false, false) |
288 | INITIALIZE_PASS_DEPENDENCY(TargetPassConfig) |
289 | INITIALIZE_PASS_DEPENDENCY(GISelKnownBitsAnalysis) |
290 | INITIALIZE_PASS_END(AMDGPUPreLegalizerCombiner, DEBUG_TYPE, |
291 | "Combine AMDGPU machine instrs before legalization" , false, |
292 | false) |
293 | |
294 | namespace llvm { |
295 | FunctionPass *createAMDGPUPreLegalizeCombiner(bool IsOptNone) { |
296 | return new AMDGPUPreLegalizerCombiner(IsOptNone); |
297 | } |
298 | } // end namespace llvm |
299 | |