//=== lib/CodeGen/GlobalISel/AMDGPUPreLegalizerCombiner.cpp ---------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This pass does combining of machine instructions at the generic MI level,
// before the legalizer.
//
//===----------------------------------------------------------------------===//
13
14#include "AMDGPU.h"
15#include "AMDGPUCombinerHelper.h"
16#include "AMDGPULegalizerInfo.h"
17#include "GCNSubtarget.h"
18#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
19#include "llvm/CodeGen/GlobalISel/CSEInfo.h"
20#include "llvm/CodeGen/GlobalISel/Combiner.h"
21#include "llvm/CodeGen/GlobalISel/CombinerHelper.h"
22#include "llvm/CodeGen/GlobalISel/CombinerInfo.h"
23#include "llvm/CodeGen/GlobalISel/GIMatchTableExecutorImpl.h"
24#include "llvm/CodeGen/GlobalISel/GISelKnownBits.h"
25#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
26#include "llvm/CodeGen/MachineDominators.h"
27#include "llvm/CodeGen/TargetPassConfig.h"
28#include "llvm/Target/TargetMachine.h"
29
30#define GET_GICOMBINER_DEPS
31#include "AMDGPUGenPreLegalizeGICombiner.inc"
32#undef GET_GICOMBINER_DEPS
33
34#define DEBUG_TYPE "amdgpu-prelegalizer-combiner"
35
36using namespace llvm;
37using namespace MIPatternMatch;
38namespace {
39
40#define GET_GICOMBINER_TYPES
41#include "AMDGPUGenPreLegalizeGICombiner.inc"
42#undef GET_GICOMBINER_TYPES
43
44class AMDGPUPreLegalizerCombinerImpl : public Combiner {
45protected:
46 const AMDGPUPreLegalizerCombinerImplRuleConfig &RuleConfig;
47 const GCNSubtarget &STI;
48 // TODO: Make CombinerHelper methods const.
49 mutable AMDGPUCombinerHelper Helper;
50
51public:
52 AMDGPUPreLegalizerCombinerImpl(
53 MachineFunction &MF, CombinerInfo &CInfo, const TargetPassConfig *TPC,
54 GISelKnownBits &KB, GISelCSEInfo *CSEInfo,
55 const AMDGPUPreLegalizerCombinerImplRuleConfig &RuleConfig,
56 const GCNSubtarget &STI, MachineDominatorTree *MDT,
57 const LegalizerInfo *LI);
58
59 static const char *getName() { return "AMDGPUPreLegalizerCombinerImpl"; }
60
61 bool tryCombineAllImpl(MachineInstr &MI) const;
62 bool tryCombineAll(MachineInstr &I) const override;
63
64 struct ClampI64ToI16MatchInfo {
65 int64_t Cmp1 = 0;
66 int64_t Cmp2 = 0;
67 Register Origin;
68 };
69
70 bool matchClampI64ToI16(MachineInstr &MI, const MachineRegisterInfo &MRI,
71 const MachineFunction &MF,
72 ClampI64ToI16MatchInfo &MatchInfo) const;
73
74 void applyClampI64ToI16(MachineInstr &MI,
75 const ClampI64ToI16MatchInfo &MatchInfo) const;
76
77private:
78#define GET_GICOMBINER_CLASS_MEMBERS
79#define AMDGPUSubtarget GCNSubtarget
80#include "AMDGPUGenPreLegalizeGICombiner.inc"
81#undef GET_GICOMBINER_CLASS_MEMBERS
82#undef AMDGPUSubtarget
83};
84
85#define GET_GICOMBINER_IMPL
86#define AMDGPUSubtarget GCNSubtarget
87#include "AMDGPUGenPreLegalizeGICombiner.inc"
88#undef AMDGPUSubtarget
89#undef GET_GICOMBINER_IMPL
90
91AMDGPUPreLegalizerCombinerImpl::AMDGPUPreLegalizerCombinerImpl(
92 MachineFunction &MF, CombinerInfo &CInfo, const TargetPassConfig *TPC,
93 GISelKnownBits &KB, GISelCSEInfo *CSEInfo,
94 const AMDGPUPreLegalizerCombinerImplRuleConfig &RuleConfig,
95 const GCNSubtarget &STI, MachineDominatorTree *MDT, const LegalizerInfo *LI)
96 : Combiner(MF, CInfo, TPC, &KB, CSEInfo), RuleConfig(RuleConfig), STI(STI),
97 Helper(Observer, B, /*IsPreLegalize*/ true, &KB, MDT, LI),
98#define GET_GICOMBINER_CONSTRUCTOR_INITS
99#include "AMDGPUGenPreLegalizeGICombiner.inc"
100#undef GET_GICOMBINER_CONSTRUCTOR_INITS
101{
102}
103
104bool AMDGPUPreLegalizerCombinerImpl::tryCombineAll(MachineInstr &MI) const {
105 if (tryCombineAllImpl(I&: MI))
106 return true;
107
108 switch (MI.getOpcode()) {
109 case TargetOpcode::G_SHUFFLE_VECTOR:
110 return Helper.tryCombineShuffleVector(MI);
111 }
112
113 return false;
114}
115
116bool AMDGPUPreLegalizerCombinerImpl::matchClampI64ToI16(
117 MachineInstr &MI, const MachineRegisterInfo &MRI, const MachineFunction &MF,
118 ClampI64ToI16MatchInfo &MatchInfo) const {
119 assert(MI.getOpcode() == TargetOpcode::G_TRUNC && "Invalid instruction!");
120
121 // Try to find a pattern where an i64 value should get clamped to short.
122 const LLT SrcType = MRI.getType(Reg: MI.getOperand(i: 1).getReg());
123 if (SrcType != LLT::scalar(SizeInBits: 64))
124 return false;
125
126 const LLT DstType = MRI.getType(Reg: MI.getOperand(i: 0).getReg());
127 if (DstType != LLT::scalar(SizeInBits: 16))
128 return false;
129
130 Register Base;
131
132 auto IsApplicableForCombine = [&MatchInfo]() -> bool {
133 const auto Cmp1 = MatchInfo.Cmp1;
134 const auto Cmp2 = MatchInfo.Cmp2;
135 const auto Diff = std::abs(i: Cmp2 - Cmp1);
136
137 // If the difference between both comparison values is 0 or 1, there is no
138 // need to clamp.
139 if (Diff == 0 || Diff == 1)
140 return false;
141
142 const int64_t Min = std::numeric_limits<int16_t>::min();
143 const int64_t Max = std::numeric_limits<int16_t>::max();
144
145 // Check if the comparison values are between SHORT_MIN and SHORT_MAX.
146 return ((Cmp2 >= Cmp1 && Cmp1 >= Min && Cmp2 <= Max) ||
147 (Cmp1 >= Cmp2 && Cmp1 <= Max && Cmp2 >= Min));
148 };
149
150 // Try to match a combination of min / max MIR opcodes.
151 if (mi_match(R: MI.getOperand(i: 1).getReg(), MRI,
152 P: m_GSMin(L: m_Reg(R&: Base), R: m_ICst(Cst&: MatchInfo.Cmp1)))) {
153 if (mi_match(R: Base, MRI,
154 P: m_GSMax(L: m_Reg(R&: MatchInfo.Origin), R: m_ICst(Cst&: MatchInfo.Cmp2)))) {
155 return IsApplicableForCombine();
156 }
157 }
158
159 if (mi_match(R: MI.getOperand(i: 1).getReg(), MRI,
160 P: m_GSMax(L: m_Reg(R&: Base), R: m_ICst(Cst&: MatchInfo.Cmp1)))) {
161 if (mi_match(R: Base, MRI,
162 P: m_GSMin(L: m_Reg(R&: MatchInfo.Origin), R: m_ICst(Cst&: MatchInfo.Cmp2)))) {
163 return IsApplicableForCombine();
164 }
165 }
166
167 return false;
168}
169
170// We want to find a combination of instructions that
171// gets generated when an i64 gets clamped to i16.
172// The corresponding pattern is:
173// G_MAX / G_MAX for i16 <= G_TRUNC i64.
174// This can be efficiently written as following:
175// v_cvt_pk_i16_i32 v0, v0, v1
176// v_med3_i32 v0, Clamp_Min, v0, Clamp_Max
177void AMDGPUPreLegalizerCombinerImpl::applyClampI64ToI16(
178 MachineInstr &MI, const ClampI64ToI16MatchInfo &MatchInfo) const {
179
180 Register Src = MatchInfo.Origin;
181 assert(MI.getParent()->getParent()->getRegInfo().getType(Src) ==
182 LLT::scalar(64));
183 const LLT S32 = LLT::scalar(SizeInBits: 32);
184
185 auto Unmerge = B.buildUnmerge(Res: S32, Op: Src);
186
187 assert(MI.getOpcode() != AMDGPU::G_AMDGPU_CVT_PK_I16_I32);
188
189 const LLT V2S16 = LLT::fixed_vector(NumElements: 2, ScalarSizeInBits: 16);
190 auto CvtPk =
191 B.buildInstr(Opc: AMDGPU::G_AMDGPU_CVT_PK_I16_I32, DstOps: {V2S16},
192 SrcOps: {Unmerge.getReg(Idx: 0), Unmerge.getReg(Idx: 1)}, Flags: MI.getFlags());
193
194 auto MinBoundary = std::min(a: MatchInfo.Cmp1, b: MatchInfo.Cmp2);
195 auto MaxBoundary = std::max(a: MatchInfo.Cmp1, b: MatchInfo.Cmp2);
196 auto MinBoundaryDst = B.buildConstant(Res: S32, Val: MinBoundary);
197 auto MaxBoundaryDst = B.buildConstant(Res: S32, Val: MaxBoundary);
198
199 auto Bitcast = B.buildBitcast(Dst: {S32}, Src: CvtPk);
200
201 auto Med3 = B.buildInstr(
202 Opc: AMDGPU::G_AMDGPU_SMED3, DstOps: {S32},
203 SrcOps: {MinBoundaryDst.getReg(Idx: 0), Bitcast.getReg(Idx: 0), MaxBoundaryDst.getReg(Idx: 0)},
204 Flags: MI.getFlags());
205
206 B.buildTrunc(Res: MI.getOperand(i: 0).getReg(), Op: Med3);
207
208 MI.eraseFromParent();
209}
210
211// Pass boilerplate
212// ================
213
214class AMDGPUPreLegalizerCombiner : public MachineFunctionPass {
215public:
216 static char ID;
217
218 AMDGPUPreLegalizerCombiner(bool IsOptNone = false);
219
220 StringRef getPassName() const override {
221 return "AMDGPUPreLegalizerCombiner";
222 }
223
224 bool runOnMachineFunction(MachineFunction &MF) override;
225
226 void getAnalysisUsage(AnalysisUsage &AU) const override;
227
228private:
229 bool IsOptNone;
230 AMDGPUPreLegalizerCombinerImplRuleConfig RuleConfig;
231};
232} // end anonymous namespace
233
234void AMDGPUPreLegalizerCombiner::getAnalysisUsage(AnalysisUsage &AU) const {
235 AU.addRequired<TargetPassConfig>();
236 AU.setPreservesCFG();
237 getSelectionDAGFallbackAnalysisUsage(AU);
238 AU.addRequired<GISelKnownBitsAnalysis>();
239 AU.addPreserved<GISelKnownBitsAnalysis>();
240 if (!IsOptNone) {
241 AU.addRequired<MachineDominatorTreeWrapperPass>();
242 AU.addPreserved<MachineDominatorTreeWrapperPass>();
243 }
244
245 AU.addRequired<GISelCSEAnalysisWrapperPass>();
246 AU.addPreserved<GISelCSEAnalysisWrapperPass>();
247 MachineFunctionPass::getAnalysisUsage(AU);
248}
249
250AMDGPUPreLegalizerCombiner::AMDGPUPreLegalizerCombiner(bool IsOptNone)
251 : MachineFunctionPass(ID), IsOptNone(IsOptNone) {
252 initializeAMDGPUPreLegalizerCombinerPass(*PassRegistry::getPassRegistry());
253
254 if (!RuleConfig.parseCommandLineOption())
255 report_fatal_error(reason: "Invalid rule identifier");
256}
257
258bool AMDGPUPreLegalizerCombiner::runOnMachineFunction(MachineFunction &MF) {
259 if (MF.getProperties().hasProperty(
260 P: MachineFunctionProperties::Property::FailedISel))
261 return false;
262 auto *TPC = &getAnalysis<TargetPassConfig>();
263 const Function &F = MF.getFunction();
264 bool EnableOpt =
265 MF.getTarget().getOptLevel() != CodeGenOptLevel::None && !skipFunction(F);
266 GISelKnownBits *KB = &getAnalysis<GISelKnownBitsAnalysis>().get(MF);
267
268 // Enable CSE.
269 GISelCSEAnalysisWrapper &Wrapper =
270 getAnalysis<GISelCSEAnalysisWrapperPass>().getCSEWrapper();
271 auto *CSEInfo = &Wrapper.get(CSEOpt: TPC->getCSEConfig());
272
273 const GCNSubtarget &STI = MF.getSubtarget<GCNSubtarget>();
274 MachineDominatorTree *MDT =
275 IsOptNone ? nullptr
276 : &getAnalysis<MachineDominatorTreeWrapperPass>().getDomTree();
277 CombinerInfo CInfo(/*AllowIllegalOps*/ true, /*ShouldLegalizeIllegal*/ false,
278 nullptr, EnableOpt, F.hasOptSize(), F.hasMinSize());
279 AMDGPUPreLegalizerCombinerImpl Impl(MF, CInfo, TPC, *KB, CSEInfo, RuleConfig,
280 STI, MDT, STI.getLegalizerInfo());
281 return Impl.combineMachineInstrs();
282}
283
284char AMDGPUPreLegalizerCombiner::ID = 0;
285INITIALIZE_PASS_BEGIN(AMDGPUPreLegalizerCombiner, DEBUG_TYPE,
286 "Combine AMDGPU machine instrs before legalization",
287 false, false)
288INITIALIZE_PASS_DEPENDENCY(TargetPassConfig)
289INITIALIZE_PASS_DEPENDENCY(GISelKnownBitsAnalysis)
290INITIALIZE_PASS_END(AMDGPUPreLegalizerCombiner, DEBUG_TYPE,
291 "Combine AMDGPU machine instrs before legalization", false,
292 false)
293
294namespace llvm {
295FunctionPass *createAMDGPUPreLegalizeCombiner(bool IsOptNone) {
296 return new AMDGPUPreLegalizerCombiner(IsOptNone);
297}
298} // end namespace llvm
299