//===- lib/Target/AMDGPU/AMDGPURegBankCombiner.cpp ------------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This pass does combining of machine instructions at the generic MI level,
10// after register banks are known.
11//
12//===----------------------------------------------------------------------===//
13
14#include "AMDGPU.h"
15#include "AMDGPULegalizerInfo.h"
16#include "AMDGPURegisterBankInfo.h"
17#include "GCNSubtarget.h"
18#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
19#include "SIMachineFunctionInfo.h"
20#include "llvm/CodeGen/GlobalISel/Combiner.h"
21#include "llvm/CodeGen/GlobalISel/CombinerHelper.h"
22#include "llvm/CodeGen/GlobalISel/CombinerInfo.h"
23#include "llvm/CodeGen/GlobalISel/GIMatchTableExecutorImpl.h"
24#include "llvm/CodeGen/GlobalISel/GISelValueTracking.h"
25#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
26#include "llvm/CodeGen/MachineDominators.h"
27#include "llvm/CodeGen/TargetPassConfig.h"
28#include "llvm/Target/TargetMachine.h"
29
30#define GET_GICOMBINER_DEPS
#include "AMDGPUGenRegBankGICombiner.inc"
32#undef GET_GICOMBINER_DEPS
33
34#define DEBUG_TYPE "amdgpu-regbank-combiner"
35
36using namespace llvm;
37using namespace MIPatternMatch;
38
39namespace {
40#define GET_GICOMBINER_TYPES
41#include "AMDGPUGenRegBankGICombiner.inc"
42#undef GET_GICOMBINER_TYPES
43
// Combiner implementation that runs after register-bank selection. The rule
// table is generated from AMDGPUGenRegBankGICombiner.inc; the hand-written
// match/apply helpers below implement the med3/clamp and D16-load combines.
class AMDGPURegBankCombinerImpl : public Combiner {
protected:
  const AMDGPURegBankCombinerImplRuleConfig &RuleConfig;
  const GCNSubtarget &STI;
  const RegisterBankInfo &RBI;
  const TargetRegisterInfo &TRI;
  const SIInstrInfo &TII;
  const CombinerHelper Helper;

public:
  AMDGPURegBankCombinerImpl(
      MachineFunction &MF, CombinerInfo &CInfo, const TargetPassConfig *TPC,
      GISelValueTracking &VT, GISelCSEInfo *CSEInfo,
      const AMDGPURegBankCombinerImplRuleConfig &RuleConfig,
      const GCNSubtarget &STI, MachineDominatorTree *MDT,
      const LegalizerInfo *LI);

  static const char *getName() { return "AMDGPURegBankCombinerImpl"; }

  // Entry point called by the Combiner driver for every candidate instruction.
  bool tryCombineAll(MachineInstr &I) const override;

  // True if Reg was assigned to the VGPR register bank.
  bool isVgprRegBank(Register Reg) const;
  // Return Reg if it is already a vgpr, otherwise a (possibly reused) copy of
  // it on the VGPR bank.
  Register getAsVgpr(Register Reg) const;

  // A min/max opcode pair together with the corresponding med3 opcode.
  struct MinMaxMedOpc {
    unsigned Min, Max, Med;
  };

  // Result of a successful med3 match: build Opc with operands Val0..Val2.
  struct Med3MatchInfo {
    unsigned Opc;
    Register Val0, Val1, Val2;
  };

  MinMaxMedOpc getMinMaxPair(unsigned Opc) const;

  // Match min(max(Val, K0), K1) or max(min(Val, K1), K0) in any operand
  // commutation; m_Cst selects how the constants K0/K1 are matched.
  template <class m_Cst, typename CstTy>
  bool matchMed(MachineInstr &MI, MachineRegisterInfo &MRI, MinMaxMedOpc MMMOpc,
                Register &Val, CstTy &K0, CstTy &K1) const;

  bool matchIntMinMaxToMed3(MachineInstr &MI, Med3MatchInfo &MatchInfo) const;
  bool matchFPMinMaxToMed3(MachineInstr &MI, Med3MatchInfo &MatchInfo) const;
  bool matchFPMinMaxToClamp(MachineInstr &MI, Register &Reg) const;
  bool matchFPMed3ToClamp(MachineInstr &MI, Register &Reg) const;
  void applyMed3(MachineInstr &MI, Med3MatchInfo &MatchInfo) const;
  void applyClamp(MachineInstr &MI, Register &Reg) const;

  void applyCanonicalizeZextShiftAmt(MachineInstr &MI, MachineInstr &Ext) const;

  bool combineD16Load(MachineInstr &MI) const;
  bool applyD16Load(unsigned D16Opc, MachineInstr &DstMI,
                    MachineInstr *SmallLoad, Register ToOverwriteD16) const;

private:
  // Helpers for querying the function's float-mode register defaults.
  SIModeRegisterDefaults getMode() const;
  bool getIEEE() const;
  bool getDX10Clamp() const;
  bool isFminnumIeee(const MachineInstr &MI) const;
  bool isFCst(MachineInstr *MI) const;
  bool isClampZeroToOne(MachineInstr *K0, MachineInstr *K1) const;

#define GET_GICOMBINER_CLASS_MEMBERS
#define AMDGPUSubtarget GCNSubtarget
#include "AMDGPUGenRegBankGICombiner.inc"
#undef GET_GICOMBINER_CLASS_MEMBERS
#undef AMDGPUSubtarget
};
110
111#define GET_GICOMBINER_IMPL
112#define AMDGPUSubtarget GCNSubtarget
113#include "AMDGPUGenRegBankGICombiner.inc"
114#undef AMDGPUSubtarget
115#undef GET_GICOMBINER_IMPL
116
// Wire the combiner up with the subtarget's register-bank/instr info and the
// tablegen-generated rule-config initializers (spliced into the init list via
// GET_GICOMBINER_CONSTRUCTOR_INITS).
AMDGPURegBankCombinerImpl::AMDGPURegBankCombinerImpl(
    MachineFunction &MF, CombinerInfo &CInfo, const TargetPassConfig *TPC,
    GISelValueTracking &VT, GISelCSEInfo *CSEInfo,
    const AMDGPURegBankCombinerImplRuleConfig &RuleConfig,
    const GCNSubtarget &STI, MachineDominatorTree *MDT, const LegalizerInfo *LI)
    : Combiner(MF, CInfo, TPC, &VT, CSEInfo), RuleConfig(RuleConfig), STI(STI),
      RBI(*STI.getRegBankInfo()), TRI(*STI.getRegisterInfo()),
      TII(*STI.getInstrInfo()),
      Helper(Observer, B, /*IsPreLegalize*/ false, &VT, MDT, LI),
#define GET_GICOMBINER_CONSTRUCTOR_INITS
#include "AMDGPUGenRegBankGICombiner.inc"
#undef GET_GICOMBINER_CONSTRUCTOR_INITS
{
}
131
132bool AMDGPURegBankCombinerImpl::isVgprRegBank(Register Reg) const {
133 return RBI.getRegBank(Reg, MRI, TRI)->getID() == AMDGPU::VGPRRegBankID;
134}
135
136Register AMDGPURegBankCombinerImpl::getAsVgpr(Register Reg) const {
137 if (isVgprRegBank(Reg))
138 return Reg;
139
140 // Search for existing copy of Reg to vgpr.
141 for (MachineInstr &Use : MRI.use_instructions(Reg)) {
142 Register Def = Use.getOperand(i: 0).getReg();
143 if (Use.getOpcode() == AMDGPU::COPY && isVgprRegBank(Reg: Def))
144 return Def;
145 }
146
147 // Copy Reg to vgpr.
148 Register VgprReg = B.buildCopy(Res: MRI.getType(Reg), Op: Reg).getReg(Idx: 0);
149 MRI.setRegBank(Reg: VgprReg, RegBank: RBI.getRegBank(ID: AMDGPU::VGPRRegBankID));
150 return VgprReg;
151}
152
153AMDGPURegBankCombinerImpl::MinMaxMedOpc
154AMDGPURegBankCombinerImpl::getMinMaxPair(unsigned Opc) const {
155 switch (Opc) {
156 default:
157 llvm_unreachable("Unsupported opcode");
158 case AMDGPU::G_SMAX:
159 case AMDGPU::G_SMIN:
160 return {.Min: AMDGPU::G_SMIN, .Max: AMDGPU::G_SMAX, .Med: AMDGPU::G_AMDGPU_SMED3};
161 case AMDGPU::G_UMAX:
162 case AMDGPU::G_UMIN:
163 return {.Min: AMDGPU::G_UMIN, .Max: AMDGPU::G_UMAX, .Med: AMDGPU::G_AMDGPU_UMED3};
164 case AMDGPU::G_FMAXNUM:
165 case AMDGPU::G_FMINNUM:
166 return {.Min: AMDGPU::G_FMINNUM, .Max: AMDGPU::G_FMAXNUM, .Med: AMDGPU::G_AMDGPU_FMED3};
167 case AMDGPU::G_FMAXNUM_IEEE:
168 case AMDGPU::G_FMINNUM_IEEE:
169 return {.Min: AMDGPU::G_FMINNUM_IEEE, .Max: AMDGPU::G_FMAXNUM_IEEE,
170 .Med: AMDGPU::G_AMDGPU_FMED3};
171 }
172}
173
174template <class m_Cst, typename CstTy>
175bool AMDGPURegBankCombinerImpl::matchMed(MachineInstr &MI,
176 MachineRegisterInfo &MRI,
177 MinMaxMedOpc MMMOpc, Register &Val,
178 CstTy &K0, CstTy &K1) const {
179 // 4 operand commutes of: min(max(Val, K0), K1).
180 // Find K1 from outer instr: min(max(...), K1) or min(K1, max(...)).
181 // Find K0 and Val from inner instr: max(K0, Val) or max(Val, K0).
182 // 4 operand commutes of: max(min(Val, K1), K0).
183 // Find K0 from outer instr: max(min(...), K0) or max(K0, min(...)).
184 // Find K1 and Val from inner instr: min(K1, Val) or min(Val, K1).
185 return mi_match(
186 MI, MRI,
187 m_any_of(
188 m_CommutativeBinOp(
189 MMMOpc.Min, m_CommutativeBinOp(MMMOpc.Max, m_Reg(R&: Val), m_Cst(K0)),
190 m_Cst(K1)),
191 m_CommutativeBinOp(
192 MMMOpc.Max, m_CommutativeBinOp(MMMOpc.Min, m_Reg(R&: Val), m_Cst(K1)),
193 m_Cst(K0))));
194}
195
196bool AMDGPURegBankCombinerImpl::matchIntMinMaxToMed3(
197 MachineInstr &MI, Med3MatchInfo &MatchInfo) const {
198 Register Dst = MI.getOperand(i: 0).getReg();
199 if (!isVgprRegBank(Reg: Dst))
200 return false;
201
202 // med3 for i16 is only available on gfx9+, and not available for v2i16.
203 LLT Ty = MRI.getType(Reg: Dst);
204 if ((Ty != LLT::scalar(SizeInBits: 16) || !STI.hasMed3_16()) && Ty != LLT::scalar(SizeInBits: 32))
205 return false;
206
207 MinMaxMedOpc OpcodeTriple = getMinMaxPair(Opc: MI.getOpcode());
208 Register Val;
209 std::optional<ValueAndVReg> K0, K1;
210 // Match min(max(Val, K0), K1) or max(min(Val, K1), K0). Then see if K0 <= K1.
211 if (!matchMed<GCstAndRegMatch>(MI, MRI, MMMOpc: OpcodeTriple, Val, K0, K1))
212 return false;
213
214 if (OpcodeTriple.Med == AMDGPU::G_AMDGPU_SMED3 && K0->Value.sgt(RHS: K1->Value))
215 return false;
216 if (OpcodeTriple.Med == AMDGPU::G_AMDGPU_UMED3 && K0->Value.ugt(RHS: K1->Value))
217 return false;
218
219 MatchInfo = {.Opc: OpcodeTriple.Med, .Val0: Val, .Val1: K0->VReg, .Val2: K1->VReg};
220 return true;
221}
222
223// fmed3(NaN, K0, K1) = min(min(NaN, K0), K1)
224// ieee = true : min/max(SNaN, K) = QNaN, min/max(QNaN, K) = K
225// ieee = false : min/max(NaN, K) = K
226// clamp(NaN) = dx10_clamp ? 0.0 : NaN
227// Consider values of min(max(Val, K0), K1) and max(min(Val, K1), K0) as input.
228// Other operand commutes (see matchMed) give same result since min and max are
229// commutative.
230
// Try to replace fp min(max(Val, K0), K1) or max(min(Val, K1), K0), K0<=K1
232// with fmed3(Val, K0, K1) or clamp(Val). Clamp requires K0 = 0.0 and K1 = 1.0.
233// Val = SNaN only for ieee = true
234// fmed3(SNaN, K0, K1) = min(min(SNaN, K0), K1) = min(QNaN, K1) = K1
235// min(max(SNaN, K0), K1) = min(QNaN, K1) = K1
236// max(min(SNaN, K1), K0) = max(K1, K0) = K1
237// Val = NaN,ieee = false or Val = QNaN,ieee = true
238// fmed3(NaN, K0, K1) = min(min(NaN, K0), K1) = min(K0, K1) = K0
239// min(max(NaN, K0), K1) = min(K0, K1) = K0 (can clamp when dx10_clamp = true)
240// max(min(NaN, K1), K0) = max(K1, K0) = K1 != K0
241bool AMDGPURegBankCombinerImpl::matchFPMinMaxToMed3(
242 MachineInstr &MI, Med3MatchInfo &MatchInfo) const {
243 Register Dst = MI.getOperand(i: 0).getReg();
244 LLT Ty = MRI.getType(Reg: Dst);
245
246 // med3 for f16 is only available on gfx9+, and not available for v2f16.
247 if ((Ty != LLT::scalar(SizeInBits: 16) || !STI.hasMed3_16()) && Ty != LLT::scalar(SizeInBits: 32))
248 return false;
249
250 auto OpcodeTriple = getMinMaxPair(Opc: MI.getOpcode());
251
252 Register Val;
253 std::optional<FPValueAndVReg> K0, K1;
254 // Match min(max(Val, K0), K1) or max(min(Val, K1), K0). Then see if K0 <= K1.
255 if (!matchMed<GFCstAndRegMatch>(MI, MRI, MMMOpc: OpcodeTriple, Val, K0, K1))
256 return false;
257
258 if (K0->Value > K1->Value)
259 return false;
260
261 // For IEEE=false perform combine only when it's safe to assume that there are
262 // no NaN inputs. Most often MI is marked with nnan fast math flag.
263 // For IEEE=true consider NaN inputs. fmed3(NaN, K0, K1) is equivalent to
264 // min(min(NaN, K0), K1). Safe to fold for min(max(Val, K0), K1) since inner
265 // nodes(max/min) have same behavior when one input is NaN and other isn't.
266 // Don't consider max(min(SNaN, K1), K0) since there is no isKnownNeverQNaN,
267 // also post-legalizer inputs to min/max are fcanonicalized (never SNaN).
268 if ((getIEEE() && isFminnumIeee(MI)) || isKnownNeverNaN(Val: Dst, MRI)) {
269 // Don't fold single use constant that can't be inlined.
270 if ((!MRI.hasOneNonDBGUse(RegNo: K0->VReg) || TII.isInlineConstant(Imm: K0->Value)) &&
271 (!MRI.hasOneNonDBGUse(RegNo: K1->VReg) || TII.isInlineConstant(Imm: K1->Value))) {
272 MatchInfo = {.Opc: OpcodeTriple.Med, .Val0: Val, .Val1: K0->VReg, .Val2: K1->VReg};
273 return true;
274 }
275 }
276
277 return false;
278}
279
280bool AMDGPURegBankCombinerImpl::matchFPMinMaxToClamp(MachineInstr &MI,
281 Register &Reg) const {
282 // Clamp is available on all types after regbankselect (f16, f32, f64, v2f16).
283 auto OpcodeTriple = getMinMaxPair(Opc: MI.getOpcode());
284 Register Val;
285 std::optional<FPValueAndVReg> K0, K1;
286 // Match min(max(Val, K0), K1) or max(min(Val, K1), K0).
287 if (!matchMed<GFCstOrSplatGFCstMatch>(MI, MRI, MMMOpc: OpcodeTriple, Val, K0, K1))
288 return false;
289
290 if (!K0->Value.isExactlyValue(V: 0.0) || !K1->Value.isExactlyValue(V: 1.0))
291 return false;
292
293 // For IEEE=false perform combine only when it's safe to assume that there are
294 // no NaN inputs. Most often MI is marked with nnan fast math flag.
295 // For IEEE=true consider NaN inputs. Only min(max(QNaN, 0.0), 1.0) evaluates
296 // to 0.0 requires dx10_clamp = true.
297 if ((getIEEE() && getDX10Clamp() && isFminnumIeee(MI) &&
298 isKnownNeverSNaN(Val, MRI)) ||
299 isKnownNeverNaN(Val: MI.getOperand(i: 0).getReg(), MRI)) {
300 Reg = Val;
301 return true;
302 }
303
304 return false;
305}
306
307// Replacing fmed3(NaN, 0.0, 1.0) with clamp. Requires dx10_clamp = true.
308// Val = SNaN only for ieee = true. It is important which operand is NaN.
309// min(min(SNaN, 0.0), 1.0) = min(QNaN, 1.0) = 1.0
310// min(min(SNaN, 1.0), 0.0) = min(QNaN, 0.0) = 0.0
311// min(min(0.0, 1.0), SNaN) = min(0.0, SNaN) = QNaN
312// Val = NaN,ieee = false or Val = QNaN,ieee = true
313// min(min(NaN, 0.0), 1.0) = min(0.0, 1.0) = 0.0
314// min(min(NaN, 1.0), 0.0) = min(1.0, 0.0) = 0.0
315// min(min(0.0, 1.0), NaN) = min(0.0, NaN) = 0.0
316bool AMDGPURegBankCombinerImpl::matchFPMed3ToClamp(MachineInstr &MI,
317 Register &Reg) const {
318 // In llvm-ir, clamp is often represented as an intrinsic call to
319 // @llvm.amdgcn.fmed3.f32(%Val, 0.0, 1.0). Check for other operand orders.
320 MachineInstr *Src0 = getDefIgnoringCopies(Reg: MI.getOperand(i: 1).getReg(), MRI);
321 MachineInstr *Src1 = getDefIgnoringCopies(Reg: MI.getOperand(i: 2).getReg(), MRI);
322 MachineInstr *Src2 = getDefIgnoringCopies(Reg: MI.getOperand(i: 3).getReg(), MRI);
323
324 if (isFCst(MI: Src0) && !isFCst(MI: Src1))
325 std::swap(a&: Src0, b&: Src1);
326 if (isFCst(MI: Src1) && !isFCst(MI: Src2))
327 std::swap(a&: Src1, b&: Src2);
328 if (isFCst(MI: Src0) && !isFCst(MI: Src1))
329 std::swap(a&: Src0, b&: Src1);
330 if (!isClampZeroToOne(K0: Src1, K1: Src2))
331 return false;
332
333 Register Val = Src0->getOperand(i: 0).getReg();
334
335 auto isOp3Zero = [&]() {
336 MachineInstr *Op3 = getDefIgnoringCopies(Reg: MI.getOperand(i: 3).getReg(), MRI);
337 if (Op3->getOpcode() == TargetOpcode::G_FCONSTANT)
338 return Op3->getOperand(i: 1).getFPImm()->isExactlyValue(V: 0.0);
339 return false;
340 };
341 // For IEEE=false perform combine only when it's safe to assume that there are
342 // no NaN inputs. Most often MI is marked with nnan fast math flag.
343 // For IEEE=true consider NaN inputs. Requires dx10_clamp = true. Safe to fold
344 // when Val could be QNaN. If Val can also be SNaN third input should be 0.0.
345 if (isKnownNeverNaN(Val: MI.getOperand(i: 0).getReg(), MRI) ||
346 (getIEEE() && getDX10Clamp() &&
347 (isKnownNeverSNaN(Val, MRI) || isOp3Zero()))) {
348 Reg = Val;
349 return true;
350 }
351
352 return false;
353}
354
355void AMDGPURegBankCombinerImpl::applyClamp(MachineInstr &MI,
356 Register &Reg) const {
357 B.buildInstr(Opc: AMDGPU::G_AMDGPU_CLAMP, DstOps: {MI.getOperand(i: 0)}, SrcOps: {Reg},
358 Flags: MI.getFlags());
359 MI.eraseFromParent();
360}
361
362void AMDGPURegBankCombinerImpl::applyMed3(MachineInstr &MI,
363 Med3MatchInfo &MatchInfo) const {
364 B.buildInstr(Opc: MatchInfo.Opc, DstOps: {MI.getOperand(i: 0)},
365 SrcOps: {getAsVgpr(Reg: MatchInfo.Val0), getAsVgpr(Reg: MatchInfo.Val1),
366 getAsVgpr(Reg: MatchInfo.Val2)},
367 Flags: MI.getFlags());
368 MI.eraseFromParent();
369}
370
371void AMDGPURegBankCombinerImpl::applyCanonicalizeZextShiftAmt(
372 MachineInstr &MI, MachineInstr &Ext) const {
373 unsigned ShOpc = MI.getOpcode();
374 assert(ShOpc == AMDGPU::G_SHL || ShOpc == AMDGPU::G_LSHR ||
375 ShOpc == AMDGPU::G_ASHR);
376 assert(Ext.getOpcode() == AMDGPU::G_ZEXT);
377
378 Register AmtReg = Ext.getOperand(i: 1).getReg();
379 Register ShDst = MI.getOperand(i: 0).getReg();
380 Register ShSrc = MI.getOperand(i: 1).getReg();
381
382 LLT ExtAmtTy = MRI.getType(Reg: Ext.getOperand(i: 0).getReg());
383 LLT AmtTy = MRI.getType(Reg: AmtReg);
384
385 auto &RB = *MRI.getRegBank(Reg: AmtReg);
386
387 auto NewExt = B.buildAnyExt(Res: ExtAmtTy, Op: AmtReg);
388 auto Mask = B.buildConstant(
389 Res: ExtAmtTy, Val: maskTrailingOnes<uint64_t>(N: AmtTy.getScalarSizeInBits()));
390 auto And = B.buildAnd(Dst: ExtAmtTy, Src0: NewExt, Src1: Mask);
391 B.buildInstr(Opc: ShOpc, DstOps: {ShDst}, SrcOps: {ShSrc, And});
392
393 MRI.setRegBank(Reg: NewExt.getReg(Idx: 0), RegBank: RB);
394 MRI.setRegBank(Reg: Mask.getReg(Idx: 0), RegBank: RB);
395 MRI.setRegBank(Reg: And.getReg(Idx: 0), RegBank: RB);
396 MI.eraseFromParent();
397}
398
399bool AMDGPURegBankCombinerImpl::combineD16Load(MachineInstr &MI) const {
400 Register Dst;
401 MachineInstr *Load, *SextLoad;
402 const int64_t CleanLo16 = 0xFFFFFFFFFFFF0000;
403 const int64_t CleanHi16 = 0x000000000000FFFF;
404
405 // Load lo
406 if (mi_match(R: MI.getOperand(i: 1).getReg(), MRI,
407 P: m_GOr(L: m_GAnd(L: m_GBitcast(Src: m_Reg(R&: Dst)),
408 R: m_Copy(Src: m_SpecificICst(RequestedValue: CleanLo16))),
409 R: m_MInstr(MI&: Load)))) {
410
411 if (Load->getOpcode() == AMDGPU::G_ZEXTLOAD) {
412 const MachineMemOperand *MMO = *Load->memoperands_begin();
413 unsigned LoadSize = MMO->getSizeInBits().getValue();
414 if (LoadSize == 8)
415 return applyD16Load(D16Opc: AMDGPU::G_AMDGPU_LOAD_D16_LO_U8, DstMI&: MI, SmallLoad: Load, ToOverwriteD16: Dst);
416 if (LoadSize == 16)
417 return applyD16Load(D16Opc: AMDGPU::G_AMDGPU_LOAD_D16_LO, DstMI&: MI, SmallLoad: Load, ToOverwriteD16: Dst);
418 return false;
419 }
420
421 if (mi_match(
422 R: Load, MRI,
423 P: m_GAnd(L: m_MInstr(MI&: SextLoad), R: m_Copy(Src: m_SpecificICst(RequestedValue: CleanHi16))))) {
424 if (SextLoad->getOpcode() != AMDGPU::G_SEXTLOAD)
425 return false;
426
427 const MachineMemOperand *MMO = *SextLoad->memoperands_begin();
428 if (MMO->getSizeInBits().getValue() != 8)
429 return false;
430
431 return applyD16Load(D16Opc: AMDGPU::G_AMDGPU_LOAD_D16_LO_I8, DstMI&: MI, SmallLoad: SextLoad, ToOverwriteD16: Dst);
432 }
433
434 return false;
435 }
436
437 // Load hi
438 if (mi_match(R: MI.getOperand(i: 1).getReg(), MRI,
439 P: m_GOr(L: m_GAnd(L: m_GBitcast(Src: m_Reg(R&: Dst)),
440 R: m_Copy(Src: m_SpecificICst(RequestedValue: CleanHi16))),
441 R: m_GShl(L: m_MInstr(MI&: Load), R: m_Copy(Src: m_SpecificICst(RequestedValue: 16)))))) {
442
443 if (Load->getOpcode() == AMDGPU::G_ZEXTLOAD) {
444 const MachineMemOperand *MMO = *Load->memoperands_begin();
445 unsigned LoadSize = MMO->getSizeInBits().getValue();
446 if (LoadSize == 8)
447 return applyD16Load(D16Opc: AMDGPU::G_AMDGPU_LOAD_D16_HI_U8, DstMI&: MI, SmallLoad: Load, ToOverwriteD16: Dst);
448 if (LoadSize == 16)
449 return applyD16Load(D16Opc: AMDGPU::G_AMDGPU_LOAD_D16_HI, DstMI&: MI, SmallLoad: Load, ToOverwriteD16: Dst);
450 return false;
451 }
452
453 if (mi_match(
454 R: Load, MRI,
455 P: m_GAnd(L: m_MInstr(MI&: SextLoad), R: m_Copy(Src: m_SpecificICst(RequestedValue: CleanHi16))))) {
456 if (SextLoad->getOpcode() != AMDGPU::G_SEXTLOAD)
457 return false;
458 const MachineMemOperand *MMO = *SextLoad->memoperands_begin();
459 if (MMO->getSizeInBits().getValue() != 8)
460 return false;
461
462 return applyD16Load(D16Opc: AMDGPU::G_AMDGPU_LOAD_D16_HI_I8, DstMI&: MI, SmallLoad: SextLoad, ToOverwriteD16: Dst);
463 }
464
465 return false;
466 }
467
468 return false;
469}
470
471bool AMDGPURegBankCombinerImpl::applyD16Load(
472 unsigned D16Opc, MachineInstr &DstMI, MachineInstr *SmallLoad,
473 Register SrcReg32ToOverwriteD16) const {
474 B.buildInstr(Opc: D16Opc, DstOps: {DstMI.getOperand(i: 0).getReg()},
475 SrcOps: {SmallLoad->getOperand(i: 1).getReg(), SrcReg32ToOverwriteD16})
476 .setMemRefs(SmallLoad->memoperands());
477 DstMI.eraseFromParent();
478 return true;
479}
480
481SIModeRegisterDefaults AMDGPURegBankCombinerImpl::getMode() const {
482 return MF.getInfo<SIMachineFunctionInfo>()->getMode();
483}
484
485bool AMDGPURegBankCombinerImpl::getIEEE() const { return getMode().IEEE; }
486
487bool AMDGPURegBankCombinerImpl::getDX10Clamp() const {
488 return getMode().DX10Clamp;
489}
490
491bool AMDGPURegBankCombinerImpl::isFminnumIeee(const MachineInstr &MI) const {
492 return MI.getOpcode() == AMDGPU::G_FMINNUM_IEEE;
493}
494
495bool AMDGPURegBankCombinerImpl::isFCst(MachineInstr *MI) const {
496 return MI->getOpcode() == AMDGPU::G_FCONSTANT;
497}
498
499bool AMDGPURegBankCombinerImpl::isClampZeroToOne(MachineInstr *K0,
500 MachineInstr *K1) const {
501 if (isFCst(MI: K0) && isFCst(MI: K1)) {
502 const ConstantFP *KO_FPImm = K0->getOperand(i: 1).getFPImm();
503 const ConstantFP *K1_FPImm = K1->getOperand(i: 1).getFPImm();
504 return (KO_FPImm->isExactlyValue(V: 0.0) && K1_FPImm->isExactlyValue(V: 1.0)) ||
505 (KO_FPImm->isExactlyValue(V: 1.0) && K1_FPImm->isExactlyValue(V: 0.0));
506 }
507 return false;
508}
509
510// Pass boilerplate
511// ================
512
// Legacy-PM wrapper pass that drives AMDGPURegBankCombinerImpl over a
// machine function after register-bank selection.
class AMDGPURegBankCombiner : public MachineFunctionPass {
public:
  static char ID;

  AMDGPURegBankCombiner(bool IsOptNone = false);

  StringRef getPassName() const override { return "AMDGPURegBankCombiner"; }

  bool runOnMachineFunction(MachineFunction &MF) override;

  void getAnalysisUsage(AnalysisUsage &AU) const override;

private:
  // At -O0 the dominator tree is neither required nor preserved.
  bool IsOptNone;
  // Rule enable/disable configuration parsed from the command line.
  AMDGPURegBankCombinerImplRuleConfig RuleConfig;
};
529} // end anonymous namespace
530
// Declare the analyses this pass needs (TargetPassConfig, value tracking,
// and — outside of -O0 — the machine dominator tree) and what it preserves.
void AMDGPURegBankCombiner::getAnalysisUsage(AnalysisUsage &AU) const {
  AU.addRequired<TargetPassConfig>();
  AU.setPreservesCFG();
  getSelectionDAGFallbackAnalysisUsage(AU);
  AU.addRequired<GISelValueTrackingAnalysisLegacy>();
  AU.addPreserved<GISelValueTrackingAnalysisLegacy>();
  if (!IsOptNone) {
    AU.addRequired<MachineDominatorTreeWrapperPass>();
    AU.addPreserved<MachineDominatorTreeWrapperPass>();
  }
  MachineFunctionPass::getAnalysisUsage(AU);
}
543
544AMDGPURegBankCombiner::AMDGPURegBankCombiner(bool IsOptNone)
545 : MachineFunctionPass(ID), IsOptNone(IsOptNone) {
546 if (!RuleConfig.parseCommandLineOption())
547 report_fatal_error(reason: "Invalid rule identifier");
548}
549
550bool AMDGPURegBankCombiner::runOnMachineFunction(MachineFunction &MF) {
551 if (MF.getProperties().hasFailedISel())
552 return false;
553 auto *TPC = &getAnalysis<TargetPassConfig>();
554 const Function &F = MF.getFunction();
555 bool EnableOpt =
556 MF.getTarget().getOptLevel() != CodeGenOptLevel::None && !skipFunction(F);
557
558 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
559 GISelValueTracking *VT =
560 &getAnalysis<GISelValueTrackingAnalysisLegacy>().get(MF);
561
562 const auto *LI = ST.getLegalizerInfo();
563 MachineDominatorTree *MDT =
564 IsOptNone ? nullptr
565 : &getAnalysis<MachineDominatorTreeWrapperPass>().getDomTree();
566
567 CombinerInfo CInfo(/*AllowIllegalOps*/ false, /*ShouldLegalizeIllegal*/ true,
568 LI, EnableOpt, F.hasOptSize(), F.hasMinSize());
569 // Disable fixed-point iteration to reduce compile-time
570 CInfo.MaxIterations = 1;
571 CInfo.ObserverLvl = CombinerInfo::ObserverLevel::SinglePass;
572 // RegBankSelect seems not to leave dead instructions, so a full DCE pass is
573 // unnecessary.
574 CInfo.EnableFullDCE = false;
575 AMDGPURegBankCombinerImpl Impl(MF, CInfo, TPC, *VT, /*CSEInfo*/ nullptr,
576 RuleConfig, ST, MDT, LI);
577 return Impl.combineMachineInstrs();
578}
579
580char AMDGPURegBankCombiner::ID = 0;
581INITIALIZE_PASS_BEGIN(AMDGPURegBankCombiner, DEBUG_TYPE,
582 "Combine AMDGPU machine instrs after regbankselect",
583 false, false)
584INITIALIZE_PASS_DEPENDENCY(TargetPassConfig)
585INITIALIZE_PASS_DEPENDENCY(GISelValueTrackingAnalysisLegacy)
586INITIALIZE_PASS_END(AMDGPURegBankCombiner, DEBUG_TYPE,
587 "Combine AMDGPU machine instrs after regbankselect", false,
588 false)
589
590FunctionPass *llvm::createAMDGPURegBankCombiner(bool IsOptNone) {
591 return new AMDGPURegBankCombiner(IsOptNone);
592}
593