1//=== lib/CodeGen/GlobalISel/AMDGPURegBankCombiner.cpp ---------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This pass does combining of machine instructions at the generic MI level,
10// after register banks are known.
11//
12//===----------------------------------------------------------------------===//
13
14#include "AMDGPU.h"
15#include "AMDGPULegalizerInfo.h"
16#include "AMDGPURegisterBankInfo.h"
17#include "GCNSubtarget.h"
18#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
19#include "SIMachineFunctionInfo.h"
20#include "llvm/CodeGen/GlobalISel/Combiner.h"
21#include "llvm/CodeGen/GlobalISel/CombinerHelper.h"
22#include "llvm/CodeGen/GlobalISel/CombinerInfo.h"
23#include "llvm/CodeGen/GlobalISel/GIMatchTableExecutorImpl.h"
24#include "llvm/CodeGen/GlobalISel/GISelValueTracking.h"
25#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
26#include "llvm/CodeGen/MachineDominators.h"
27#include "llvm/CodeGen/TargetPassConfig.h"
28#include "llvm/Target/TargetMachine.h"
29
30#define GET_GICOMBINER_DEPS
31#include "AMDGPUGenPreLegalizeGICombiner.inc"
32#undef GET_GICOMBINER_DEPS
33
34#define DEBUG_TYPE "amdgpu-regbank-combiner"
35
36using namespace llvm;
37using namespace MIPatternMatch;
38
39namespace {
40#define GET_GICOMBINER_TYPES
41#include "AMDGPUGenRegBankGICombiner.inc"
42#undef GET_GICOMBINER_TYPES
43
44class AMDGPURegBankCombinerImpl : public Combiner {
45protected:
46 const AMDGPURegBankCombinerImplRuleConfig &RuleConfig;
47 const GCNSubtarget &STI;
48 const RegisterBankInfo &RBI;
49 const TargetRegisterInfo &TRI;
50 const SIInstrInfo &TII;
51 const CombinerHelper Helper;
52
53public:
54 AMDGPURegBankCombinerImpl(
55 MachineFunction &MF, CombinerInfo &CInfo, GISelValueTracking &VT,
56 GISelCSEInfo *CSEInfo,
57 const AMDGPURegBankCombinerImplRuleConfig &RuleConfig,
58 const GCNSubtarget &STI, MachineDominatorTree *MDT,
59 const LegalizerInfo *LI);
60
61 static const char *getName() { return "AMDGPURegBankCombinerImpl"; }
62
63 bool tryCombineAll(MachineInstr &I) const override;
64
65 bool isVgprRegBank(Register Reg) const;
66 Register getAsVgpr(Register Reg) const;
67
68 struct MinMaxMedOpc {
69 unsigned Min, Max, Med;
70 };
71
72 struct Med3MatchInfo {
73 unsigned Opc;
74 Register Val0, Val1, Val2;
75 };
76
77 struct MinMaxToMinMax3MatchInfo {
78 unsigned Opc;
79 Register Val0, Val1, Val2;
80 };
81
82 MinMaxMedOpc getMinMaxPair(unsigned Opc) const;
83
84 template <class m_Cst, typename CstTy>
85 bool matchMed(MachineInstr &MI, MachineRegisterInfo &MRI, MinMaxMedOpc MMMOpc,
86 Register &Val, CstTy &K0, CstTy &K1) const;
87
88 bool matchIntMinMaxToMed3(MachineInstr &MI, Med3MatchInfo &MatchInfo) const;
89 bool matchFPMinMaxToMed3(MachineInstr &MI, Med3MatchInfo &MatchInfo) const;
90 bool matchFPMinMaxToClamp(MachineInstr &MI, Register &Reg) const;
91 bool matchFPMed3ToClamp(MachineInstr &MI, Register &Reg) const;
92 void applyMed3(MachineInstr &MI, Med3MatchInfo &MatchInfo) const;
93 void applyClamp(MachineInstr &MI, Register &Reg) const;
94
95 void applyCanonicalizeZextShiftAmt(MachineInstr &MI, MachineInstr &Ext) const;
96
97 bool combineD16Load(MachineInstr &MI) const;
98 bool applyD16Load(unsigned D16Opc, MachineInstr &DstMI,
99 MachineInstr *SmallLoad, Register ToOverwriteD16) const;
100
101 bool matchMinMaxToMinMax3(MachineInstr &MI,
102 MinMaxToMinMax3MatchInfo &MatchInfo) const;
103 void applyMinMaxToMinMax3(MachineInstr &MI,
104 MinMaxToMinMax3MatchInfo &MatchInfo) const;
105
106private:
107 SIModeRegisterDefaults getMode() const;
108 bool getIEEE() const;
109 bool getDX10Clamp() const;
110 bool isFminnumIeee(const MachineInstr &MI) const;
111 bool isFCst(MachineInstr *MI) const;
112 bool isClampZeroToOne(MachineInstr *K0, MachineInstr *K1) const;
113
114#define GET_GICOMBINER_CLASS_MEMBERS
115#define AMDGPUSubtarget GCNSubtarget
116#include "AMDGPUGenRegBankGICombiner.inc"
117#undef GET_GICOMBINER_CLASS_MEMBERS
118#undef AMDGPUSubtarget
119};
120
121#define GET_GICOMBINER_IMPL
122#define AMDGPUSubtarget GCNSubtarget
123#include "AMDGPUGenRegBankGICombiner.inc"
124#undef AMDGPUSubtarget
125#undef GET_GICOMBINER_IMPL
126
127AMDGPURegBankCombinerImpl::AMDGPURegBankCombinerImpl(
128 MachineFunction &MF, CombinerInfo &CInfo, GISelValueTracking &VT,
129 GISelCSEInfo *CSEInfo,
130 const AMDGPURegBankCombinerImplRuleConfig &RuleConfig,
131 const GCNSubtarget &STI, MachineDominatorTree *MDT, const LegalizerInfo *LI)
132 : Combiner(MF, CInfo, &VT, CSEInfo), RuleConfig(RuleConfig), STI(STI),
133 RBI(*STI.getRegBankInfo()), TRI(*STI.getRegisterInfo()),
134 TII(*STI.getInstrInfo()),
135 Helper(Observer, B, /*IsPreLegalize*/ false, &VT, MDT, LI),
136#define GET_GICOMBINER_CONSTRUCTOR_INITS
137#include "AMDGPUGenRegBankGICombiner.inc"
138#undef GET_GICOMBINER_CONSTRUCTOR_INITS
139{
140}
141
142bool AMDGPURegBankCombinerImpl::isVgprRegBank(Register Reg) const {
143 return RBI.getRegBank(Reg, MRI, TRI)->getID() == AMDGPU::VGPRRegBankID;
144}
145
146Register AMDGPURegBankCombinerImpl::getAsVgpr(Register Reg) const {
147 if (isVgprRegBank(Reg))
148 return Reg;
149
150 // Search for existing copy of Reg to vgpr.
151 for (MachineInstr &Use : MRI.use_instructions(Reg)) {
152 Register Def = Use.getOperand(i: 0).getReg();
153 if (Use.getOpcode() == AMDGPU::COPY && isVgprRegBank(Reg: Def))
154 return Def;
155 }
156
157 // Copy Reg to vgpr.
158 Register VgprReg = B.buildCopy(Res: MRI.getType(Reg), Op: Reg).getReg(Idx: 0);
159 MRI.setRegBank(Reg: VgprReg, RegBank: RBI.getRegBank(ID: AMDGPU::VGPRRegBankID));
160 return VgprReg;
161}
162
163AMDGPURegBankCombinerImpl::MinMaxMedOpc
164AMDGPURegBankCombinerImpl::getMinMaxPair(unsigned Opc) const {
165 switch (Opc) {
166 default:
167 llvm_unreachable("Unsupported opcode");
168 case AMDGPU::G_SMAX:
169 case AMDGPU::G_SMIN:
170 return {.Min: AMDGPU::G_SMIN, .Max: AMDGPU::G_SMAX, .Med: AMDGPU::G_AMDGPU_SMED3};
171 case AMDGPU::G_UMAX:
172 case AMDGPU::G_UMIN:
173 return {.Min: AMDGPU::G_UMIN, .Max: AMDGPU::G_UMAX, .Med: AMDGPU::G_AMDGPU_UMED3};
174 case AMDGPU::G_FMAXNUM:
175 case AMDGPU::G_FMINNUM:
176 return {.Min: AMDGPU::G_FMINNUM, .Max: AMDGPU::G_FMAXNUM, .Med: AMDGPU::G_AMDGPU_FMED3};
177 case AMDGPU::G_FMAXNUM_IEEE:
178 case AMDGPU::G_FMINNUM_IEEE:
179 return {.Min: AMDGPU::G_FMINNUM_IEEE, .Max: AMDGPU::G_FMAXNUM_IEEE,
180 .Med: AMDGPU::G_AMDGPU_FMED3};
181 }
182}
183
184template <class m_Cst, typename CstTy>
185bool AMDGPURegBankCombinerImpl::matchMed(MachineInstr &MI,
186 MachineRegisterInfo &MRI,
187 MinMaxMedOpc MMMOpc, Register &Val,
188 CstTy &K0, CstTy &K1) const {
189 // 4 operand commutes of: min(max(Val, K0), K1).
190 // Find K1 from outer instr: min(max(...), K1) or min(K1, max(...)).
191 // Find K0 and Val from inner instr: max(K0, Val) or max(Val, K0).
192 // 4 operand commutes of: max(min(Val, K1), K0).
193 // Find K0 from outer instr: max(min(...), K0) or max(K0, min(...)).
194 // Find K1 and Val from inner instr: min(K1, Val) or min(Val, K1).
195 return mi_match(
196 MI, MRI,
197 m_any_of(
198 m_CommutativeBinOp(
199 MMMOpc.Min, m_CommutativeBinOp(MMMOpc.Max, m_Reg(R&: Val), m_Cst(K0)),
200 m_Cst(K1)),
201 m_CommutativeBinOp(
202 MMMOpc.Max, m_CommutativeBinOp(MMMOpc.Min, m_Reg(R&: Val), m_Cst(K1)),
203 m_Cst(K0))));
204}
205
206bool AMDGPURegBankCombinerImpl::matchIntMinMaxToMed3(
207 MachineInstr &MI, Med3MatchInfo &MatchInfo) const {
208 Register Dst = MI.getOperand(i: 0).getReg();
209 if (!isVgprRegBank(Reg: Dst))
210 return false;
211
212 // med3 for i16 is only available on gfx9+, and not available for v2i16.
213 LLT Ty = MRI.getType(Reg: Dst);
214 if ((Ty != LLT::scalar(SizeInBits: 16) || !STI.hasMed3_16()) && Ty != LLT::scalar(SizeInBits: 32))
215 return false;
216
217 MinMaxMedOpc OpcodeTriple = getMinMaxPair(Opc: MI.getOpcode());
218 Register Val;
219 std::optional<ValueAndVReg> K0, K1;
220 // Match min(max(Val, K0), K1) or max(min(Val, K1), K0). Then see if K0 <= K1.
221 if (!matchMed<GCstAndRegMatch>(MI, MRI, MMMOpc: OpcodeTriple, Val, K0, K1))
222 return false;
223
224 if (OpcodeTriple.Med == AMDGPU::G_AMDGPU_SMED3 && K0->Value.sgt(RHS: K1->Value))
225 return false;
226 if (OpcodeTriple.Med == AMDGPU::G_AMDGPU_UMED3 && K0->Value.ugt(RHS: K1->Value))
227 return false;
228
229 MatchInfo = {.Opc: OpcodeTriple.Med, .Val0: Val, .Val1: K0->VReg, .Val2: K1->VReg};
230 return true;
231}
232
233// fmed3(NaN, K0, K1) = min(min(NaN, K0), K1)
234// ieee = true : min/max(SNaN, K) = QNaN, min/max(QNaN, K) = K
235// ieee = false : min/max(NaN, K) = K
236// clamp(NaN) = dx10_clamp ? 0.0 : NaN
237// Consider values of min(max(Val, K0), K1) and max(min(Val, K1), K0) as input.
238// Other operand commutes (see matchMed) give same result since min and max are
239// commutative.
240
// Try to replace fp min(max(Val, K0), K1) or max(min(Val, K1), K0), K0 <= K1,
// with fmed3(Val, K0, K1) or clamp(Val). Clamp requires K0 = 0.0 and K1 = 1.0.
243// Val = SNaN only for ieee = true
244// fmed3(SNaN, K0, K1) = min(min(SNaN, K0), K1) = min(QNaN, K1) = K1
245// min(max(SNaN, K0), K1) = min(QNaN, K1) = K1
246// max(min(SNaN, K1), K0) = max(K1, K0) = K1
247// Val = NaN,ieee = false or Val = QNaN,ieee = true
248// fmed3(NaN, K0, K1) = min(min(NaN, K0), K1) = min(K0, K1) = K0
249// min(max(NaN, K0), K1) = min(K0, K1) = K0 (can clamp when dx10_clamp = true)
250// max(min(NaN, K1), K0) = max(K1, K0) = K1 != K0
251bool AMDGPURegBankCombinerImpl::matchFPMinMaxToMed3(
252 MachineInstr &MI, Med3MatchInfo &MatchInfo) const {
253 Register Dst = MI.getOperand(i: 0).getReg();
254 LLT Ty = MRI.getType(Reg: Dst);
255
256 // med3 for f16 is only available on gfx9+, and not available for v2f16.
257 if ((Ty != LLT::scalar(SizeInBits: 16) || !STI.hasMed3_16()) && Ty != LLT::scalar(SizeInBits: 32))
258 return false;
259
260 auto OpcodeTriple = getMinMaxPair(Opc: MI.getOpcode());
261
262 Register Val;
263 std::optional<FPValueAndVReg> K0, K1;
264 // Match min(max(Val, K0), K1) or max(min(Val, K1), K0). Then see if K0 <= K1.
265 if (!matchMed<GFCstAndRegMatch>(MI, MRI, MMMOpc: OpcodeTriple, Val, K0, K1))
266 return false;
267
268 if (K0->Value > K1->Value)
269 return false;
270
271 // For IEEE=false perform combine only when it's safe to assume that there are
272 // no NaN inputs. Most often MI is marked with nnan fast math flag.
273 // For IEEE=true consider NaN inputs. fmed3(NaN, K0, K1) is equivalent to
274 // min(min(NaN, K0), K1). Safe to fold for min(max(Val, K0), K1) since inner
275 // nodes(max/min) have same behavior when one input is NaN and other isn't.
276 // Don't consider max(min(SNaN, K1), K0) since there is no isKnownNeverQNaN,
277 // also post-legalizer inputs to min/max are fcanonicalized (never SNaN).
278 if ((getIEEE() && isFminnumIeee(MI)) || VT->isKnownNeverNaN(Val: Dst)) {
279 // Don't fold single use constant that can't be inlined.
280 if ((!MRI.hasOneNonDBGUse(RegNo: K0->VReg) || TII.isInlineConstant(Imm: K0->Value)) &&
281 (!MRI.hasOneNonDBGUse(RegNo: K1->VReg) || TII.isInlineConstant(Imm: K1->Value))) {
282 MatchInfo = {.Opc: OpcodeTriple.Med, .Val0: Val, .Val1: K0->VReg, .Val2: K1->VReg};
283 return true;
284 }
285 }
286
287 return false;
288}
289
290bool AMDGPURegBankCombinerImpl::matchFPMinMaxToClamp(MachineInstr &MI,
291 Register &Reg) const {
292 // Clamp is available on all types after regbankselect (f16, f32, f64, v2f16).
293 auto OpcodeTriple = getMinMaxPair(Opc: MI.getOpcode());
294 Register Val;
295 std::optional<FPValueAndVReg> K0, K1;
296 // Match min(max(Val, K0), K1) or max(min(Val, K1), K0).
297 if (!matchMed<GFCstOrSplatGFCstMatch>(MI, MRI, MMMOpc: OpcodeTriple, Val, K0, K1))
298 return false;
299
300 if (!K0->Value.isPosZero() || !K1->Value.isOne())
301 return false;
302
303 // For IEEE=false perform combine only when it's safe to assume that there are
304 // no NaN inputs. Most often MI is marked with nnan fast math flag.
305 // For IEEE=true consider NaN inputs. Only min(max(QNaN, 0.0), 1.0) evaluates
306 // to 0.0 requires dx10_clamp = true.
307 if ((getIEEE() && getDX10Clamp() && isFminnumIeee(MI) &&
308 VT->isKnownNeverSNaN(Val)) ||
309 VT->isKnownNeverNaN(Val: MI.getOperand(i: 0).getReg())) {
310 Reg = Val;
311 return true;
312 }
313
314 return false;
315}
316
317// Replacing fmed3(NaN, 0.0, 1.0) with clamp. Requires dx10_clamp = true.
318// Val = SNaN only for ieee = true. It is important which operand is NaN.
319// min(min(SNaN, 0.0), 1.0) = min(QNaN, 1.0) = 1.0
320// min(min(SNaN, 1.0), 0.0) = min(QNaN, 0.0) = 0.0
321// min(min(0.0, 1.0), SNaN) = min(0.0, SNaN) = QNaN
322// Val = NaN,ieee = false or Val = QNaN,ieee = true
323// min(min(NaN, 0.0), 1.0) = min(0.0, 1.0) = 0.0
324// min(min(NaN, 1.0), 0.0) = min(1.0, 0.0) = 0.0
325// min(min(0.0, 1.0), NaN) = min(0.0, NaN) = 0.0
326bool AMDGPURegBankCombinerImpl::matchFPMed3ToClamp(MachineInstr &MI,
327 Register &Reg) const {
328 // In llvm-ir, clamp is often represented as an intrinsic call to
329 // @llvm.amdgcn.fmed3.f32(%Val, 0.0, 1.0). Check for other operand orders.
330 MachineInstr *Src0 = getDefIgnoringCopies(Reg: MI.getOperand(i: 1).getReg(), MRI);
331 MachineInstr *Src1 = getDefIgnoringCopies(Reg: MI.getOperand(i: 2).getReg(), MRI);
332 MachineInstr *Src2 = getDefIgnoringCopies(Reg: MI.getOperand(i: 3).getReg(), MRI);
333
334 if (isFCst(MI: Src0) && !isFCst(MI: Src1))
335 std::swap(a&: Src0, b&: Src1);
336 if (isFCst(MI: Src1) && !isFCst(MI: Src2))
337 std::swap(a&: Src1, b&: Src2);
338 if (isFCst(MI: Src0) && !isFCst(MI: Src1))
339 std::swap(a&: Src0, b&: Src1);
340 if (!isClampZeroToOne(K0: Src1, K1: Src2))
341 return false;
342
343 Register Val = Src0->getOperand(i: 0).getReg();
344
345 auto isOp3Zero = [&]() {
346 MachineInstr *Op3 = getDefIgnoringCopies(Reg: MI.getOperand(i: 3).getReg(), MRI);
347 if (Op3->getOpcode() == TargetOpcode::G_FCONSTANT)
348 return Op3->getOperand(i: 1).getFPImm()->isPosZero();
349 return false;
350 };
351 // For IEEE=false perform combine only when it's safe to assume that there are
352 // no NaN inputs. Most often MI is marked with nnan fast math flag.
353 // For IEEE=true consider NaN inputs. Requires dx10_clamp = true. Safe to fold
354 // when Val could be QNaN. If Val can also be SNaN third input should be 0.0.
355 if (VT->isKnownNeverNaN(Val: MI.getOperand(i: 0).getReg()) ||
356 (getIEEE() && getDX10Clamp() &&
357 (VT->isKnownNeverSNaN(Val) || isOp3Zero()))) {
358 Reg = Val;
359 return true;
360 }
361
362 return false;
363}
364
365void AMDGPURegBankCombinerImpl::applyClamp(MachineInstr &MI,
366 Register &Reg) const {
367 B.buildInstr(Opc: AMDGPU::G_AMDGPU_CLAMP, DstOps: {MI.getOperand(i: 0)}, SrcOps: {Reg},
368 Flags: MI.getFlags());
369 MI.eraseFromParent();
370}
371
372void AMDGPURegBankCombinerImpl::applyMed3(MachineInstr &MI,
373 Med3MatchInfo &MatchInfo) const {
374 B.buildInstr(Opc: MatchInfo.Opc, DstOps: {MI.getOperand(i: 0)},
375 SrcOps: {getAsVgpr(Reg: MatchInfo.Val0), getAsVgpr(Reg: MatchInfo.Val1),
376 getAsVgpr(Reg: MatchInfo.Val2)},
377 Flags: MI.getFlags());
378 MI.eraseFromParent();
379}
380
381void AMDGPURegBankCombinerImpl::applyCanonicalizeZextShiftAmt(
382 MachineInstr &MI, MachineInstr &Ext) const {
383 unsigned ShOpc = MI.getOpcode();
384 assert(ShOpc == AMDGPU::G_SHL || ShOpc == AMDGPU::G_LSHR ||
385 ShOpc == AMDGPU::G_ASHR);
386 assert(Ext.getOpcode() == AMDGPU::G_ZEXT);
387
388 Register AmtReg = Ext.getOperand(i: 1).getReg();
389 Register ShDst = MI.getOperand(i: 0).getReg();
390 Register ShSrc = MI.getOperand(i: 1).getReg();
391
392 LLT ExtAmtTy = MRI.getType(Reg: Ext.getOperand(i: 0).getReg());
393 LLT AmtTy = MRI.getType(Reg: AmtReg);
394
395 auto &RB = *MRI.getRegBank(Reg: AmtReg);
396
397 auto NewExt = B.buildAnyExt(Res: ExtAmtTy, Op: AmtReg);
398 auto Mask = B.buildConstant(
399 Res: ExtAmtTy, Val: maskTrailingOnes<uint64_t>(N: AmtTy.getScalarSizeInBits()));
400 auto And = B.buildAnd(Dst: ExtAmtTy, Src0: NewExt, Src1: Mask);
401 B.buildInstr(Opc: ShOpc, DstOps: {ShDst}, SrcOps: {ShSrc, And});
402
403 MRI.setRegBank(Reg: NewExt.getReg(Idx: 0), RegBank: RB);
404 MRI.setRegBank(Reg: Mask.getReg(Idx: 0), RegBank: RB);
405 MRI.setRegBank(Reg: And.getReg(Idx: 0), RegBank: RB);
406 MI.eraseFromParent();
407}
408
409bool AMDGPURegBankCombinerImpl::combineD16Load(MachineInstr &MI) const {
410 Register Dst;
411 MachineInstr *Load, *SextLoad;
412 const int64_t CleanLo16 = 0xFFFFFFFFFFFF0000;
413 const int64_t CleanHi16 = 0x000000000000FFFF;
414
415 // Load lo
416 if (mi_match(R: MI.getOperand(i: 1).getReg(), MRI,
417 P: m_GOr(L: m_GAnd(L: m_GBitcast(Src: m_Reg(R&: Dst)),
418 R: m_Copy(Src: m_SpecificICst(RequestedValue: CleanLo16))),
419 R: m_MInstr(MI&: Load)))) {
420
421 if (Load->getOpcode() == AMDGPU::G_ZEXTLOAD) {
422 const MachineMemOperand *MMO = *Load->memoperands_begin();
423 unsigned LoadSize = MMO->getSizeInBits().getValue();
424 if (LoadSize == 8)
425 return applyD16Load(D16Opc: AMDGPU::G_AMDGPU_LOAD_D16_LO_U8, DstMI&: MI, SmallLoad: Load, ToOverwriteD16: Dst);
426 if (LoadSize == 16)
427 return applyD16Load(D16Opc: AMDGPU::G_AMDGPU_LOAD_D16_LO, DstMI&: MI, SmallLoad: Load, ToOverwriteD16: Dst);
428 return false;
429 }
430
431 // s32 Load_lo16 holds SextLoad i8, Load_hi16 is zero.
432 // fake16: and (sextload i8 -> s32), 0xFFFF
433 // true16: zext (sextload i8 -> s16) -> s32
434 if (mi_match(
435 R: Load, MRI,
436 P: m_GAnd(L: m_MInstr(MI&: SextLoad), R: m_Copy(Src: m_SpecificICst(RequestedValue: CleanHi16)))) ||
437 mi_match(R: Load, MRI,
438 P: m_GZExt(Src: m_all_of(preds: m_SpecificType(Ty: LLT::scalar(SizeInBits: 16)),
439 preds: m_MInstr(MI&: SextLoad))))) {
440 if (SextLoad->getOpcode() != AMDGPU::G_SEXTLOAD)
441 return false;
442
443 const MachineMemOperand *MMO = *SextLoad->memoperands_begin();
444 if (MMO->getSizeInBits().getValue() != 8)
445 return false;
446
447 return applyD16Load(D16Opc: AMDGPU::G_AMDGPU_LOAD_D16_LO_I8, DstMI&: MI, SmallLoad: SextLoad, ToOverwriteD16: Dst);
448 }
449
450 return false;
451 }
452
453 // Load hi
454 if (mi_match(R: MI.getOperand(i: 1).getReg(), MRI,
455 P: m_GOr(L: m_GAnd(L: m_GBitcast(Src: m_Reg(R&: Dst)),
456 R: m_Copy(Src: m_SpecificICst(RequestedValue: CleanHi16))),
457 R: m_GShl(L: m_MInstr(MI&: Load), R: m_Copy(Src: m_SpecificICst(RequestedValue: 16)))))) {
458
459 if (Load->getOpcode() == AMDGPU::G_ZEXTLOAD) {
460 const MachineMemOperand *MMO = *Load->memoperands_begin();
461 unsigned LoadSize = MMO->getSizeInBits().getValue();
462 if (LoadSize == 8)
463 return applyD16Load(D16Opc: AMDGPU::G_AMDGPU_LOAD_D16_HI_U8, DstMI&: MI, SmallLoad: Load, ToOverwriteD16: Dst);
464 if (LoadSize == 16)
465 return applyD16Load(D16Opc: AMDGPU::G_AMDGPU_LOAD_D16_HI, DstMI&: MI, SmallLoad: Load, ToOverwriteD16: Dst);
466 return false;
467 }
468
469 // s32 Load_lo16 holds SextLoad i8, Load_hi16 is zero.
470 // fake16: and (sextload i8 -> s32), 0xFFFF
471 // true16: zext (sextload i8 -> s16) -> s32
472 if (mi_match(
473 R: Load, MRI,
474 P: m_GAnd(L: m_MInstr(MI&: SextLoad), R: m_Copy(Src: m_SpecificICst(RequestedValue: CleanHi16)))) ||
475 mi_match(R: Load, MRI,
476 P: m_GZExt(Src: m_all_of(preds: m_SpecificType(Ty: LLT::scalar(SizeInBits: 16)),
477 preds: m_MInstr(MI&: SextLoad))))) {
478 if (SextLoad->getOpcode() != AMDGPU::G_SEXTLOAD)
479 return false;
480
481 const MachineMemOperand *MMO = *SextLoad->memoperands_begin();
482 if (MMO->getSizeInBits().getValue() != 8)
483 return false;
484
485 return applyD16Load(D16Opc: AMDGPU::G_AMDGPU_LOAD_D16_HI_I8, DstMI&: MI, SmallLoad: SextLoad, ToOverwriteD16: Dst);
486 }
487
488 return false;
489 }
490
491 return false;
492}
493
494void AMDGPURegBankCombinerImpl::applyMinMaxToMinMax3(
495 MachineInstr &MI, MinMaxToMinMax3MatchInfo &MatchInfo) const {
496 B.buildInstr(Opc: MatchInfo.Opc, DstOps: {MI.getOperand(i: 0)},
497 SrcOps: {MatchInfo.Val0, MatchInfo.Val1, MatchInfo.Val2}, Flags: MI.getFlags());
498 MI.eraseFromParent();
499 return;
500}
501
502// min(min(a, b), c) == min(a, min(b, c)) == min3(a, b, c)
503// supported scalar type: S32 S16 U32 U16 F32 F16
504bool AMDGPURegBankCombinerImpl::matchMinMaxToMinMax3(
505 MachineInstr &MI, MinMaxToMinMax3MatchInfo &MatchInfo) const {
506 Register Dst = MI.getOperand(i: 0).getReg();
507 Register Src1 = MI.getOperand(i: 1).getReg();
508 Register Src2 = MI.getOperand(i: 2).getReg();
509 // If the register is SGPR, don't optimize it.
510 if (!(isVgprRegBank(Reg: Dst) && isVgprRegBank(Reg: Src1) && isVgprRegBank(Reg: Src2))) {
511 return false;
512 }
513
514 LLT Ty = MRI.getType(Reg: Dst);
515 unsigned Opc = MI.getOpcode();
516 if (!(Ty == LLT::scalar(SizeInBits: 32) ||
517 (Ty == LLT::scalar(SizeInBits: 16) && STI.hasMin3Max3_16())))
518 return false;
519
520 Register R0, R1, R2;
521 if (!mi_match(MI, MRI,
522 P: m_CommutativeBinOp(
523 Opcode: Opc, L: m_OneNonDBGUse(SP: m_BinOp(Opcode: Opc, L: m_Reg(R&: R0), R: m_Reg(R&: R1))),
524 R: m_Reg(R&: R2)))) {
525 return false;
526 }
527
528 unsigned AMDGPUOpc = 0;
529 switch (Opc) {
530 case AMDGPU::G_SMAX:
531 AMDGPUOpc = AMDGPU::G_AMDGPU_SMAX3;
532 break;
533 case AMDGPU::G_SMIN:
534 AMDGPUOpc = AMDGPU::G_AMDGPU_SMIN3;
535 break;
536 case AMDGPU::G_UMAX:
537 AMDGPUOpc = AMDGPU::G_AMDGPU_UMAX3;
538 break;
539 case AMDGPU::G_UMIN:
540 AMDGPUOpc = AMDGPU::G_AMDGPU_UMIN3;
541 break;
542 case AMDGPU::G_FMAXNUM:
543 case AMDGPU::G_FMAXNUM_IEEE:
544 AMDGPUOpc = AMDGPU::G_AMDGPU_FMAX3;
545 break;
546 case AMDGPU::G_FMINNUM:
547 case AMDGPU::G_FMINNUM_IEEE:
548 AMDGPUOpc = AMDGPU::G_AMDGPU_FMIN3;
549 break;
550 case AMDGPU::G_FMAXIMUM:
551 case AMDGPU::G_FMAXIMUMNUM:
552 AMDGPUOpc = AMDGPU::G_AMDGPU_FMAXIMUM3;
553 break;
554 case AMDGPU::G_FMINIMUM:
555 case AMDGPU::G_FMINIMUMNUM:
556 AMDGPUOpc = AMDGPU::G_AMDGPU_FMINIMUM3;
557 break;
558 default:
559 return false;
560 }
561
562 MatchInfo = {.Opc: AMDGPUOpc, .Val0: R0, .Val1: R1, .Val2: R2};
563 return true;
564}
565
566bool AMDGPURegBankCombinerImpl::applyD16Load(
567 unsigned D16Opc, MachineInstr &DstMI, MachineInstr *SmallLoad,
568 Register SrcReg32ToOverwriteD16) const {
569 B.buildInstr(Opc: D16Opc, DstOps: {DstMI.getOperand(i: 0).getReg()},
570 SrcOps: {SmallLoad->getOperand(i: 1).getReg(), SrcReg32ToOverwriteD16})
571 .setMemRefs(SmallLoad->memoperands());
572 DstMI.eraseFromParent();
573 return true;
574}
575
576SIModeRegisterDefaults AMDGPURegBankCombinerImpl::getMode() const {
577 return MF.getInfo<SIMachineFunctionInfo>()->getMode();
578}
579
580bool AMDGPURegBankCombinerImpl::getIEEE() const { return getMode().IEEE; }
581
582bool AMDGPURegBankCombinerImpl::getDX10Clamp() const {
583 return getMode().DX10Clamp;
584}
585
586bool AMDGPURegBankCombinerImpl::isFminnumIeee(const MachineInstr &MI) const {
587 return MI.getOpcode() == AMDGPU::G_FMINNUM_IEEE;
588}
589
590bool AMDGPURegBankCombinerImpl::isFCst(MachineInstr *MI) const {
591 return MI->getOpcode() == AMDGPU::G_FCONSTANT;
592}
593
594bool AMDGPURegBankCombinerImpl::isClampZeroToOne(MachineInstr *K0,
595 MachineInstr *K1) const {
596 if (isFCst(MI: K0) && isFCst(MI: K1)) {
597 const ConstantFP *KO_FPImm = K0->getOperand(i: 1).getFPImm();
598 const ConstantFP *K1_FPImm = K1->getOperand(i: 1).getFPImm();
599 return (KO_FPImm->isPosZero() && K1_FPImm->isOne()) ||
600 (KO_FPImm->isOne() && K1_FPImm->isPosZero());
601 }
602 return false;
603}
604
605// Pass boilerplate
606// ================
607
608class AMDGPURegBankCombiner : public MachineFunctionPass {
609public:
610 static char ID;
611
612 AMDGPURegBankCombiner(bool IsOptNone = false);
613
614 StringRef getPassName() const override { return "AMDGPURegBankCombiner"; }
615
616 bool runOnMachineFunction(MachineFunction &MF) override;
617
618 void getAnalysisUsage(AnalysisUsage &AU) const override;
619
620private:
621 bool IsOptNone;
622 AMDGPURegBankCombinerImplRuleConfig RuleConfig;
623};
624} // end anonymous namespace
625
626void AMDGPURegBankCombiner::getAnalysisUsage(AnalysisUsage &AU) const {
627 AU.setPreservesCFG();
628 getSelectionDAGFallbackAnalysisUsage(AU);
629 AU.addRequired<GISelValueTrackingAnalysisLegacy>();
630 AU.addPreserved<GISelValueTrackingAnalysisLegacy>();
631 if (!IsOptNone) {
632 AU.addRequired<MachineDominatorTreeWrapperPass>();
633 AU.addPreserved<MachineDominatorTreeWrapperPass>();
634 }
635 MachineFunctionPass::getAnalysisUsage(AU);
636}
637
638AMDGPURegBankCombiner::AMDGPURegBankCombiner(bool IsOptNone)
639 : MachineFunctionPass(ID), IsOptNone(IsOptNone) {
640 if (!RuleConfig.parseCommandLineOption())
641 report_fatal_error(reason: "Invalid rule identifier");
642}
643
644bool AMDGPURegBankCombiner::runOnMachineFunction(MachineFunction &MF) {
645 if (MF.getProperties().hasFailedISel())
646 return false;
647 const Function &F = MF.getFunction();
648 bool EnableOpt =
649 MF.getTarget().getOptLevel() != CodeGenOptLevel::None && !skipFunction(F);
650
651 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
652 GISelValueTracking *VT =
653 &getAnalysis<GISelValueTrackingAnalysisLegacy>().get(MF);
654
655 const auto *LI = ST.getLegalizerInfo();
656 MachineDominatorTree *MDT =
657 IsOptNone ? nullptr
658 : &getAnalysis<MachineDominatorTreeWrapperPass>().getDomTree();
659
660 CombinerInfo CInfo(/*AllowIllegalOps*/ false, /*ShouldLegalizeIllegal*/ true,
661 LI, EnableOpt, F.hasOptSize(), F.hasMinSize());
662 // Disable fixed-point iteration to reduce compile-time
663 CInfo.MaxIterations = 1;
664 CInfo.ObserverLvl = CombinerInfo::ObserverLevel::SinglePass;
665 // RegBankSelect seems not to leave dead instructions, so a full DCE pass is
666 // unnecessary.
667 CInfo.EnableFullDCE = false;
668 AMDGPURegBankCombinerImpl Impl(MF, CInfo, *VT, /*CSEInfo*/ nullptr,
669 RuleConfig, ST, MDT, LI);
670 return Impl.combineMachineInstrs();
671}
672
673char AMDGPURegBankCombiner::ID = 0;
674INITIALIZE_PASS_BEGIN(AMDGPURegBankCombiner, DEBUG_TYPE,
675 "Combine AMDGPU machine instrs after regbankselect",
676 false, false)
677INITIALIZE_PASS_DEPENDENCY(GISelValueTrackingAnalysisLegacy)
678INITIALIZE_PASS_END(AMDGPURegBankCombiner, DEBUG_TYPE,
679 "Combine AMDGPU machine instrs after regbankselect", false,
680 false)
681
682FunctionPass *llvm::createAMDGPURegBankCombiner(bool IsOptNone) {
683 return new AMDGPURegBankCombiner(IsOptNone);
684}
685