1//===- AMDGPUInstructionSelector.cpp ----------------------------*- C++ -*-==//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8/// \file
9/// This file implements the targeting of the InstructionSelector class for
10/// AMDGPU.
11/// \todo This should be generated by TableGen.
12//===----------------------------------------------------------------------===//
13
14#include "AMDGPUInstructionSelector.h"
15#include "AMDGPU.h"
16#include "AMDGPUGlobalISelUtils.h"
17#include "AMDGPUInstrInfo.h"
18#include "AMDGPURegisterBankInfo.h"
19#include "AMDGPUTargetMachine.h"
20#include "SIMachineFunctionInfo.h"
21#include "Utils/AMDGPUBaseInfo.h"
22#include "llvm/CodeGen/GlobalISel/GIMatchTableExecutorImpl.h"
23#include "llvm/CodeGen/GlobalISel/GISelValueTracking.h"
24#include "llvm/CodeGen/GlobalISel/GenericMachineInstrs.h"
25#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
26#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
27#include "llvm/CodeGen/MachineFrameInfo.h"
28#include "llvm/IR/DiagnosticInfo.h"
29#include "llvm/IR/IntrinsicsAMDGPU.h"
30#include <optional>
31
32#define DEBUG_TYPE "amdgpu-isel"
33
34using namespace llvm;
35using namespace MIPatternMatch;
36
37#define GET_GLOBALISEL_IMPL
38#define AMDGPUSubtarget GCNSubtarget
39#include "AMDGPUGenGlobalISel.inc"
40#undef GET_GLOBALISEL_IMPL
41#undef AMDGPUSubtarget
42
43AMDGPUInstructionSelector::AMDGPUInstructionSelector(
44 const GCNSubtarget &STI, const AMDGPURegisterBankInfo &RBI,
45 const AMDGPUTargetMachine &TM)
46 : TII(*STI.getInstrInfo()), TRI(*STI.getRegisterInfo()), RBI(RBI), TM(TM),
47 STI(STI),
48#define GET_GLOBALISEL_PREDICATES_INIT
49#include "AMDGPUGenGlobalISel.inc"
50#undef GET_GLOBALISEL_PREDICATES_INIT
51#define GET_GLOBALISEL_TEMPORARIES_INIT
52#include "AMDGPUGenGlobalISel.inc"
53#undef GET_GLOBALISEL_TEMPORARIES_INIT
54{
55}
56
57const char *AMDGPUInstructionSelector::getName() { return DEBUG_TYPE; }
58
59void AMDGPUInstructionSelector::setupMF(MachineFunction &MF,
60 GISelValueTracking *VT,
61 CodeGenCoverage *CoverageInfo,
62 ProfileSummaryInfo *PSI,
63 BlockFrequencyInfo *BFI) {
64 MRI = &MF.getRegInfo();
65 Subtarget = &MF.getSubtarget<GCNSubtarget>();
66 Subtarget->checkSubtargetFeatures(F: MF.getFunction());
67 InstructionSelector::setupMF(mf&: MF, vt: VT, covinfo: CoverageInfo, psi: PSI, bfi: BFI);
68}
69
70// Return the wave level SGPR base address if this is a wave address.
71static Register getWaveAddress(const MachineInstr *Def) {
72 return Def->getOpcode() == AMDGPU::G_AMDGPU_WAVE_ADDRESS
73 ? Def->getOperand(i: 1).getReg()
74 : Register();
75}
76
77bool AMDGPUInstructionSelector::isVCC(Register Reg,
78 const MachineRegisterInfo &MRI) const {
79 // The verifier is oblivious to s1 being a valid value for wavesize registers.
80 if (Reg.isPhysical())
81 return false;
82
83 auto &RegClassOrBank = MRI.getRegClassOrRegBank(Reg);
84 const TargetRegisterClass *RC =
85 dyn_cast<const TargetRegisterClass *>(Val: RegClassOrBank);
86 if (RC) {
87 const LLT Ty = MRI.getType(Reg);
88 if (!Ty.isValid() || Ty.getSizeInBits() != 1)
89 return false;
90 // G_TRUNC s1 result is never vcc.
91 return MRI.getVRegDef(Reg)->getOpcode() != AMDGPU::G_TRUNC &&
92 RC->hasSuperClassEq(RC: TRI.getBoolRC());
93 }
94
95 const RegisterBank *RB = cast<const RegisterBank *>(Val: RegClassOrBank);
96 return RB->getID() == AMDGPU::VCCRegBankID;
97}
98
99bool AMDGPUInstructionSelector::constrainCopyLikeIntrin(MachineInstr &MI,
100 unsigned NewOpc) const {
101 MI.setDesc(TII.get(Opcode: NewOpc));
102 MI.removeOperand(OpNo: 1); // Remove intrinsic ID.
103 MI.addOperand(MF&: *MF, Op: MachineOperand::CreateReg(Reg: AMDGPU::EXEC, isDef: false, isImp: true));
104
105 MachineOperand &Dst = MI.getOperand(i: 0);
106 MachineOperand &Src = MI.getOperand(i: 1);
107
108 // TODO: This should be legalized to s32 if needed
109 if (MRI->getType(Reg: Dst.getReg()) == LLT::scalar(SizeInBits: 1))
110 return false;
111
112 const TargetRegisterClass *DstRC
113 = TRI.getConstrainedRegClassForOperand(MO: Dst, MRI: *MRI);
114 const TargetRegisterClass *SrcRC
115 = TRI.getConstrainedRegClassForOperand(MO: Src, MRI: *MRI);
116 if (!DstRC || DstRC != SrcRC)
117 return false;
118
119 if (!RBI.constrainGenericRegister(Reg: Dst.getReg(), RC: *DstRC, MRI&: *MRI) ||
120 !RBI.constrainGenericRegister(Reg: Src.getReg(), RC: *SrcRC, MRI&: *MRI))
121 return false;
122 const MCInstrDesc &MCID = MI.getDesc();
123 if (MCID.getOperandConstraint(OpNum: 0, Constraint: MCOI::EARLY_CLOBBER) != -1) {
124 MI.getOperand(i: 0).setIsEarlyClobber(true);
125 }
126 return true;
127}
128
129bool AMDGPUInstructionSelector::selectCOPY(MachineInstr &I) const {
130 const DebugLoc &DL = I.getDebugLoc();
131 MachineBasicBlock *BB = I.getParent();
132 I.setDesc(TII.get(Opcode: TargetOpcode::COPY));
133
134 const MachineOperand &Src = I.getOperand(i: 1);
135 MachineOperand &Dst = I.getOperand(i: 0);
136 Register DstReg = Dst.getReg();
137 Register SrcReg = Src.getReg();
138
139 if (isVCC(Reg: DstReg, MRI: *MRI)) {
140 if (SrcReg == AMDGPU::SCC) {
141 const TargetRegisterClass *RC
142 = TRI.getConstrainedRegClassForOperand(MO: Dst, MRI: *MRI);
143 if (!RC)
144 return true;
145 return RBI.constrainGenericRegister(Reg: DstReg, RC: *RC, MRI&: *MRI);
146 }
147
148 if (!isVCC(Reg: SrcReg, MRI: *MRI)) {
149 // TODO: Should probably leave the copy and let copyPhysReg expand it.
150 if (!RBI.constrainGenericRegister(Reg: DstReg, RC: *TRI.getBoolRC(), MRI&: *MRI))
151 return false;
152
153 const TargetRegisterClass *SrcRC
154 = TRI.getConstrainedRegClassForOperand(MO: Src, MRI: *MRI);
155
156 std::optional<ValueAndVReg> ConstVal =
157 getIConstantVRegValWithLookThrough(VReg: SrcReg, MRI: *MRI, LookThroughInstrs: true);
158 if (ConstVal) {
159 unsigned MovOpc =
160 STI.isWave64() ? AMDGPU::S_MOV_B64 : AMDGPU::S_MOV_B32;
161 BuildMI(BB&: *BB, I: &I, MIMD: DL, MCID: TII.get(Opcode: MovOpc), DestReg: DstReg)
162 .addImm(Val: ConstVal->Value.getBoolValue() ? -1 : 0);
163 } else {
164 Register MaskedReg = MRI->createVirtualRegister(RegClass: SrcRC);
165
166 // We can't trust the high bits at this point, so clear them.
167
168 // TODO: Skip masking high bits if def is known boolean.
169
170 if (AMDGPU::getRegBitWidth(RCID: SrcRC->getID()) == 16) {
171 assert(Subtarget->useRealTrue16Insts());
172 const int64_t NoMods = 0;
173 BuildMI(BB&: *BB, I: &I, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::V_AND_B16_t16_e64), DestReg: MaskedReg)
174 .addImm(Val: NoMods)
175 .addImm(Val: 1)
176 .addImm(Val: NoMods)
177 .addReg(RegNo: SrcReg)
178 .addImm(Val: NoMods);
179 BuildMI(BB&: *BB, I: &I, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::V_CMP_NE_U16_t16_e64), DestReg: DstReg)
180 .addImm(Val: NoMods)
181 .addImm(Val: 0)
182 .addImm(Val: NoMods)
183 .addReg(RegNo: MaskedReg)
184 .addImm(Val: NoMods);
185 } else {
186 bool IsSGPR = TRI.isSGPRClass(RC: SrcRC);
187 unsigned AndOpc = IsSGPR ? AMDGPU::S_AND_B32 : AMDGPU::V_AND_B32_e32;
188 auto And = BuildMI(BB&: *BB, I: &I, MIMD: DL, MCID: TII.get(Opcode: AndOpc), DestReg: MaskedReg)
189 .addImm(Val: 1)
190 .addReg(RegNo: SrcReg);
191 if (IsSGPR)
192 And.setOperandDead(3); // Dead scc
193
194 BuildMI(BB&: *BB, I: &I, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::V_CMP_NE_U32_e64), DestReg: DstReg)
195 .addImm(Val: 0)
196 .addReg(RegNo: MaskedReg);
197 }
198 }
199
200 if (!MRI->getRegClassOrNull(Reg: SrcReg))
201 MRI->setRegClass(Reg: SrcReg, RC: SrcRC);
202 I.eraseFromParent();
203 return true;
204 }
205
206 const TargetRegisterClass *RC =
207 TRI.getConstrainedRegClassForOperand(MO: Dst, MRI: *MRI);
208 if (RC && !RBI.constrainGenericRegister(Reg: DstReg, RC: *RC, MRI&: *MRI))
209 return false;
210
211 return true;
212 }
213
214 for (const MachineOperand &MO : I.operands()) {
215 if (MO.getReg().isPhysical())
216 continue;
217
218 const TargetRegisterClass *RC =
219 TRI.getConstrainedRegClassForOperand(MO, MRI: *MRI);
220 if (!RC)
221 continue;
222 RBI.constrainGenericRegister(Reg: MO.getReg(), RC: *RC, MRI&: *MRI);
223 }
224 return true;
225}
226
227bool AMDGPUInstructionSelector::selectCOPY_SCC_VCC(MachineInstr &I) const {
228 const DebugLoc &DL = I.getDebugLoc();
229 MachineBasicBlock *BB = I.getParent();
230 Register VCCReg = I.getOperand(i: 1).getReg();
231 MachineInstr *Cmp;
232
233 // Set SCC as a side effect with S_CMP or S_OR.
234 if (STI.hasScalarCompareEq64()) {
235 unsigned CmpOpc =
236 STI.isWave64() ? AMDGPU::S_CMP_LG_U64 : AMDGPU::S_CMP_LG_U32;
237 Cmp = BuildMI(BB&: *BB, I: &I, MIMD: DL, MCID: TII.get(Opcode: CmpOpc)).addReg(RegNo: VCCReg).addImm(Val: 0);
238 } else {
239 Register DeadDst = MRI->createVirtualRegister(RegClass: &AMDGPU::SReg_64RegClass);
240 Cmp = BuildMI(BB&: *BB, I: &I, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::S_OR_B64), DestReg: DeadDst)
241 .addReg(RegNo: VCCReg)
242 .addReg(RegNo: VCCReg);
243 }
244
245 if (!constrainSelectedInstRegOperands(I&: *Cmp, TII, TRI, RBI))
246 return false;
247
248 Register DstReg = I.getOperand(i: 0).getReg();
249 BuildMI(BB&: *BB, I: &I, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::COPY), DestReg: DstReg).addReg(RegNo: AMDGPU::SCC);
250
251 I.eraseFromParent();
252 return RBI.constrainGenericRegister(Reg: DstReg, RC: AMDGPU::SReg_32RegClass, MRI&: *MRI);
253}
254
255bool AMDGPUInstructionSelector::selectCOPY_VCC_SCC(MachineInstr &I) const {
256 const DebugLoc &DL = I.getDebugLoc();
257 MachineBasicBlock *BB = I.getParent();
258
259 Register DstReg = I.getOperand(i: 0).getReg();
260 Register SrcReg = I.getOperand(i: 1).getReg();
261 std::optional<ValueAndVReg> Arg =
262 getIConstantVRegValWithLookThrough(VReg: I.getOperand(i: 1).getReg(), MRI: *MRI);
263
264 if (Arg) {
265 const int64_t Value = Arg->Value.getZExtValue();
266 if (Value == 0) {
267 unsigned Opcode = STI.isWave64() ? AMDGPU::S_MOV_B64 : AMDGPU::S_MOV_B32;
268 BuildMI(BB&: *BB, I: &I, MIMD: DL, MCID: TII.get(Opcode), DestReg: DstReg).addImm(Val: 0);
269 } else {
270 assert(Value == 1);
271 BuildMI(BB&: *BB, I: &I, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::COPY), DestReg: DstReg).addReg(RegNo: TRI.getExec());
272 }
273 I.eraseFromParent();
274 return RBI.constrainGenericRegister(Reg: DstReg, RC: *TRI.getBoolRC(), MRI&: *MRI);
275 }
276
277 // RegBankLegalize ensures that SrcReg is bool in reg (high bits are 0).
278 BuildMI(BB&: *BB, I: &I, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::COPY), DestReg: AMDGPU::SCC).addReg(RegNo: SrcReg);
279
280 unsigned SelectOpcode =
281 STI.isWave64() ? AMDGPU::S_CSELECT_B64 : AMDGPU::S_CSELECT_B32;
282 MachineInstr *Select = BuildMI(BB&: *BB, I: &I, MIMD: DL, MCID: TII.get(Opcode: SelectOpcode), DestReg: DstReg)
283 .addReg(RegNo: TRI.getExec())
284 .addImm(Val: 0);
285
286 I.eraseFromParent();
287 return constrainSelectedInstRegOperands(I&: *Select, TII, TRI, RBI);
288}
289
290bool AMDGPUInstructionSelector::selectReadAnyLane(MachineInstr &I) const {
291 Register DstReg = I.getOperand(i: 0).getReg();
292 Register SrcReg = I.getOperand(i: 1).getReg();
293
294 const DebugLoc &DL = I.getDebugLoc();
295 MachineBasicBlock *BB = I.getParent();
296
297 auto RFL = BuildMI(BB&: *BB, I: &I, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::V_READFIRSTLANE_B32), DestReg: DstReg)
298 .addReg(RegNo: SrcReg);
299
300 I.eraseFromParent();
301 return constrainSelectedInstRegOperands(I&: *RFL, TII, TRI, RBI);
302}
303
304bool AMDGPUInstructionSelector::selectPHI(MachineInstr &I) const {
305 const Register DefReg = I.getOperand(i: 0).getReg();
306 const LLT DefTy = MRI->getType(Reg: DefReg);
307
308 // S1 G_PHIs should not be selected in instruction-select, instead:
309 // - divergent S1 G_PHI should go through lane mask merging algorithm
310 // and be fully inst-selected in AMDGPUGlobalISelDivergenceLowering
311 // - uniform S1 G_PHI should be lowered into S32 G_PHI in AMDGPURegBankSelect
312 if (DefTy == LLT::scalar(SizeInBits: 1))
313 return false;
314
315 // TODO: Verify this doesn't have insane operands (i.e. VGPR to SGPR copy)
316
317 const RegClassOrRegBank &RegClassOrBank =
318 MRI->getRegClassOrRegBank(Reg: DefReg);
319
320 const TargetRegisterClass *DefRC =
321 dyn_cast<const TargetRegisterClass *>(Val: RegClassOrBank);
322 if (!DefRC) {
323 if (!DefTy.isValid()) {
324 LLVM_DEBUG(dbgs() << "PHI operand has no type, not a gvreg?\n");
325 return false;
326 }
327
328 const RegisterBank &RB = *cast<const RegisterBank *>(Val: RegClassOrBank);
329 DefRC = TRI.getRegClassForTypeOnBank(Ty: DefTy, Bank: RB);
330 if (!DefRC) {
331 LLVM_DEBUG(dbgs() << "PHI operand has unexpected size/bank\n");
332 return false;
333 }
334 }
335
336 // If inputs have register bank, assign corresponding reg class.
337 // Note: registers don't need to have the same reg bank.
338 for (unsigned i = 1; i != I.getNumOperands(); i += 2) {
339 const Register SrcReg = I.getOperand(i).getReg();
340
341 const RegisterBank *RB = MRI->getRegBankOrNull(Reg: SrcReg);
342 if (RB) {
343 const LLT SrcTy = MRI->getType(Reg: SrcReg);
344 const TargetRegisterClass *SrcRC =
345 TRI.getRegClassForTypeOnBank(Ty: SrcTy, Bank: *RB);
346 if (!RBI.constrainGenericRegister(Reg: SrcReg, RC: *SrcRC, MRI&: *MRI))
347 return false;
348 }
349 }
350
351 I.setDesc(TII.get(Opcode: TargetOpcode::PHI));
352 return RBI.constrainGenericRegister(Reg: DefReg, RC: *DefRC, MRI&: *MRI);
353}
354
355MachineOperand
356AMDGPUInstructionSelector::getSubOperand64(MachineOperand &MO,
357 const TargetRegisterClass &SubRC,
358 unsigned SubIdx) const {
359
360 MachineInstr *MI = MO.getParent();
361 MachineBasicBlock *BB = MO.getParent()->getParent();
362 Register DstReg = MRI->createVirtualRegister(RegClass: &SubRC);
363
364 if (MO.isReg()) {
365 unsigned ComposedSubIdx = TRI.composeSubRegIndices(a: MO.getSubReg(), b: SubIdx);
366 Register Reg = MO.getReg();
367 BuildMI(BB&: *BB, I: MI, MIMD: MI->getDebugLoc(), MCID: TII.get(Opcode: AMDGPU::COPY), DestReg: DstReg)
368 .addReg(RegNo: Reg, Flags: {}, SubReg: ComposedSubIdx);
369
370 return MachineOperand::CreateReg(Reg: DstReg, isDef: MO.isDef(), isImp: MO.isImplicit(),
371 isKill: MO.isKill(), isDead: MO.isDead(), isUndef: MO.isUndef(),
372 isEarlyClobber: MO.isEarlyClobber(), SubReg: 0, isDebug: MO.isDebug(),
373 isInternalRead: MO.isInternalRead());
374 }
375
376 assert(MO.isImm());
377
378 APInt Imm(64, MO.getImm());
379
380 switch (SubIdx) {
381 default:
382 llvm_unreachable("do not know to split immediate with this sub index.");
383 case AMDGPU::sub0:
384 return MachineOperand::CreateImm(Val: Imm.getLoBits(numBits: 32).getSExtValue());
385 case AMDGPU::sub1:
386 return MachineOperand::CreateImm(Val: Imm.getHiBits(numBits: 32).getSExtValue());
387 }
388}
389
390static unsigned getLogicalBitOpcode(unsigned Opc, bool Is64) {
391 switch (Opc) {
392 case AMDGPU::G_AND:
393 return Is64 ? AMDGPU::S_AND_B64 : AMDGPU::S_AND_B32;
394 case AMDGPU::G_OR:
395 return Is64 ? AMDGPU::S_OR_B64 : AMDGPU::S_OR_B32;
396 case AMDGPU::G_XOR:
397 return Is64 ? AMDGPU::S_XOR_B64 : AMDGPU::S_XOR_B32;
398 default:
399 llvm_unreachable("not a bit op");
400 }
401}
402
403bool AMDGPUInstructionSelector::selectG_AND_OR_XOR(MachineInstr &I) const {
404 Register DstReg = I.getOperand(i: 0).getReg();
405 unsigned Size = RBI.getSizeInBits(Reg: DstReg, MRI: *MRI, TRI);
406
407 const RegisterBank *DstRB = RBI.getRegBank(Reg: DstReg, MRI: *MRI, TRI);
408 if (DstRB->getID() != AMDGPU::SGPRRegBankID &&
409 DstRB->getID() != AMDGPU::VCCRegBankID)
410 return false;
411
412 bool Is64 = Size > 32 || (DstRB->getID() == AMDGPU::VCCRegBankID &&
413 STI.isWave64());
414 I.setDesc(TII.get(Opcode: getLogicalBitOpcode(Opc: I.getOpcode(), Is64)));
415
416 // Dead implicit-def of scc
417 I.addOperand(Op: MachineOperand::CreateReg(Reg: AMDGPU::SCC, isDef: true, // isDef
418 isImp: true, // isImp
419 isKill: false, // isKill
420 isDead: true)); // isDead
421 return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
422}
423
424bool AMDGPUInstructionSelector::selectG_ADD_SUB(MachineInstr &I) const {
425 MachineBasicBlock *BB = I.getParent();
426 MachineFunction *MF = BB->getParent();
427 Register DstReg = I.getOperand(i: 0).getReg();
428 const DebugLoc &DL = I.getDebugLoc();
429 LLT Ty = MRI->getType(Reg: DstReg);
430 if (Ty.isVector())
431 return false;
432
433 unsigned Size = Ty.getSizeInBits();
434 const RegisterBank *DstRB = RBI.getRegBank(Reg: DstReg, MRI: *MRI, TRI);
435 const bool IsSALU = DstRB->getID() == AMDGPU::SGPRRegBankID;
436 const bool Sub = I.getOpcode() == TargetOpcode::G_SUB;
437
438 if (Size == 32) {
439 if (IsSALU) {
440 const unsigned Opc = Sub ? AMDGPU::S_SUB_U32 : AMDGPU::S_ADD_U32;
441 MachineInstr *Add =
442 BuildMI(BB&: *BB, I: &I, MIMD: DL, MCID: TII.get(Opcode: Opc), DestReg: DstReg)
443 .add(MO: I.getOperand(i: 1))
444 .add(MO: I.getOperand(i: 2))
445 .setOperandDead(3); // Dead scc
446 I.eraseFromParent();
447 return constrainSelectedInstRegOperands(I&: *Add, TII, TRI, RBI);
448 }
449
450 if (STI.hasAddNoCarryInsts()) {
451 const unsigned Opc = Sub ? AMDGPU::V_SUB_U32_e64 : AMDGPU::V_ADD_U32_e64;
452 I.setDesc(TII.get(Opcode: Opc));
453 I.addOperand(MF&: *MF, Op: MachineOperand::CreateImm(Val: 0));
454 I.addOperand(MF&: *MF, Op: MachineOperand::CreateReg(Reg: AMDGPU::EXEC, isDef: false, isImp: true));
455 return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
456 }
457
458 const unsigned Opc = Sub ? AMDGPU::V_SUB_CO_U32_e64 : AMDGPU::V_ADD_CO_U32_e64;
459
460 Register UnusedCarry = MRI->createVirtualRegister(RegClass: TRI.getWaveMaskRegClass());
461 MachineInstr *Add
462 = BuildMI(BB&: *BB, I: &I, MIMD: DL, MCID: TII.get(Opcode: Opc), DestReg: DstReg)
463 .addDef(RegNo: UnusedCarry, Flags: RegState::Dead)
464 .add(MO: I.getOperand(i: 1))
465 .add(MO: I.getOperand(i: 2))
466 .addImm(Val: 0);
467 I.eraseFromParent();
468 return constrainSelectedInstRegOperands(I&: *Add, TII, TRI, RBI);
469 }
470
471 assert(!Sub && "illegal sub should not reach here");
472
473 const TargetRegisterClass &RC
474 = IsSALU ? AMDGPU::SReg_64_XEXECRegClass : AMDGPU::VReg_64RegClass;
475 const TargetRegisterClass &HalfRC
476 = IsSALU ? AMDGPU::SReg_32RegClass : AMDGPU::VGPR_32RegClass;
477
478 MachineOperand Lo1(getSubOperand64(MO&: I.getOperand(i: 1), SubRC: HalfRC, SubIdx: AMDGPU::sub0));
479 MachineOperand Lo2(getSubOperand64(MO&: I.getOperand(i: 2), SubRC: HalfRC, SubIdx: AMDGPU::sub0));
480 MachineOperand Hi1(getSubOperand64(MO&: I.getOperand(i: 1), SubRC: HalfRC, SubIdx: AMDGPU::sub1));
481 MachineOperand Hi2(getSubOperand64(MO&: I.getOperand(i: 2), SubRC: HalfRC, SubIdx: AMDGPU::sub1));
482
483 Register DstLo = MRI->createVirtualRegister(RegClass: &HalfRC);
484 Register DstHi = MRI->createVirtualRegister(RegClass: &HalfRC);
485
486 if (IsSALU) {
487 BuildMI(BB&: *BB, I: &I, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::S_ADD_U32), DestReg: DstLo)
488 .add(MO: Lo1)
489 .add(MO: Lo2);
490 BuildMI(BB&: *BB, I: &I, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::S_ADDC_U32), DestReg: DstHi)
491 .add(MO: Hi1)
492 .add(MO: Hi2)
493 .setOperandDead(3); // Dead scc
494 } else {
495 const TargetRegisterClass *CarryRC = TRI.getWaveMaskRegClass();
496 Register CarryReg = MRI->createVirtualRegister(RegClass: CarryRC);
497 BuildMI(BB&: *BB, I: &I, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::V_ADD_CO_U32_e64), DestReg: DstLo)
498 .addDef(RegNo: CarryReg)
499 .add(MO: Lo1)
500 .add(MO: Lo2)
501 .addImm(Val: 0);
502 MachineInstr *Addc = BuildMI(BB&: *BB, I: &I, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::V_ADDC_U32_e64), DestReg: DstHi)
503 .addDef(RegNo: MRI->createVirtualRegister(RegClass: CarryRC), Flags: RegState::Dead)
504 .add(MO: Hi1)
505 .add(MO: Hi2)
506 .addReg(RegNo: CarryReg, Flags: RegState::Kill)
507 .addImm(Val: 0);
508
509 if (!constrainSelectedInstRegOperands(I&: *Addc, TII, TRI, RBI))
510 return false;
511 }
512
513 BuildMI(BB&: *BB, I: &I, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::REG_SEQUENCE), DestReg: DstReg)
514 .addReg(RegNo: DstLo)
515 .addImm(Val: AMDGPU::sub0)
516 .addReg(RegNo: DstHi)
517 .addImm(Val: AMDGPU::sub1);
518
519
520 if (!RBI.constrainGenericRegister(Reg: DstReg, RC, MRI&: *MRI))
521 return false;
522
523 I.eraseFromParent();
524 return true;
525}
526
527bool AMDGPUInstructionSelector::selectG_UADDO_USUBO_UADDE_USUBE(
528 MachineInstr &I) const {
529 MachineBasicBlock *BB = I.getParent();
530 MachineFunction *MF = BB->getParent();
531 const DebugLoc &DL = I.getDebugLoc();
532 Register Dst0Reg = I.getOperand(i: 0).getReg();
533 Register Dst1Reg = I.getOperand(i: 1).getReg();
534 const bool IsAdd = I.getOpcode() == AMDGPU::G_UADDO ||
535 I.getOpcode() == AMDGPU::G_UADDE;
536 const bool HasCarryIn = I.getOpcode() == AMDGPU::G_UADDE ||
537 I.getOpcode() == AMDGPU::G_USUBE;
538
539 if (isVCC(Reg: Dst1Reg, MRI: *MRI)) {
540 unsigned NoCarryOpc =
541 IsAdd ? AMDGPU::V_ADD_CO_U32_e64 : AMDGPU::V_SUB_CO_U32_e64;
542 unsigned CarryOpc = IsAdd ? AMDGPU::V_ADDC_U32_e64 : AMDGPU::V_SUBB_U32_e64;
543 I.setDesc(TII.get(Opcode: HasCarryIn ? CarryOpc : NoCarryOpc));
544 I.addOperand(MF&: *MF, Op: MachineOperand::CreateReg(Reg: AMDGPU::EXEC, isDef: false, isImp: true));
545 I.addOperand(MF&: *MF, Op: MachineOperand::CreateImm(Val: 0));
546 return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
547 }
548
549 Register Src0Reg = I.getOperand(i: 2).getReg();
550 Register Src1Reg = I.getOperand(i: 3).getReg();
551
552 if (HasCarryIn) {
553 BuildMI(BB&: *BB, I: &I, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::COPY), DestReg: AMDGPU::SCC)
554 .addReg(RegNo: I.getOperand(i: 4).getReg());
555 }
556
557 unsigned NoCarryOpc = IsAdd ? AMDGPU::S_ADD_U32 : AMDGPU::S_SUB_U32;
558 unsigned CarryOpc = IsAdd ? AMDGPU::S_ADDC_U32 : AMDGPU::S_SUBB_U32;
559
560 auto CarryInst = BuildMI(BB&: *BB, I: &I, MIMD: DL, MCID: TII.get(Opcode: HasCarryIn ? CarryOpc : NoCarryOpc), DestReg: Dst0Reg)
561 .add(MO: I.getOperand(i: 2))
562 .add(MO: I.getOperand(i: 3));
563
564 if (MRI->use_nodbg_empty(RegNo: Dst1Reg)) {
565 CarryInst.setOperandDead(3); // Dead scc
566 } else {
567 BuildMI(BB&: *BB, I: &I, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::COPY), DestReg: Dst1Reg)
568 .addReg(RegNo: AMDGPU::SCC);
569 if (!MRI->getRegClassOrNull(Reg: Dst1Reg))
570 MRI->setRegClass(Reg: Dst1Reg, RC: &AMDGPU::SReg_32RegClass);
571 }
572
573 if (!RBI.constrainGenericRegister(Reg: Dst0Reg, RC: AMDGPU::SReg_32RegClass, MRI&: *MRI) ||
574 !RBI.constrainGenericRegister(Reg: Src0Reg, RC: AMDGPU::SReg_32RegClass, MRI&: *MRI) ||
575 !RBI.constrainGenericRegister(Reg: Src1Reg, RC: AMDGPU::SReg_32RegClass, MRI&: *MRI))
576 return false;
577
578 if (HasCarryIn &&
579 !RBI.constrainGenericRegister(Reg: I.getOperand(i: 4).getReg(),
580 RC: AMDGPU::SReg_32RegClass, MRI&: *MRI))
581 return false;
582
583 I.eraseFromParent();
584 return true;
585}
586
587bool AMDGPUInstructionSelector::selectG_AMDGPU_MAD_64_32(
588 MachineInstr &I) const {
589 MachineBasicBlock *BB = I.getParent();
590 MachineFunction *MF = BB->getParent();
591 const bool IsUnsigned = I.getOpcode() == AMDGPU::G_AMDGPU_MAD_U64_U32;
592 bool UseNoCarry = Subtarget->hasMadU64U32NoCarry() &&
593 MRI->use_nodbg_empty(RegNo: I.getOperand(i: 1).getReg());
594
595 unsigned Opc;
596 if (Subtarget->hasMADIntraFwdBug())
597 Opc = IsUnsigned ? AMDGPU::V_MAD_U64_U32_gfx11_e64
598 : AMDGPU::V_MAD_I64_I32_gfx11_e64;
599 else if (UseNoCarry)
600 Opc = IsUnsigned ? AMDGPU::V_MAD_NC_U64_U32_e64
601 : AMDGPU::V_MAD_NC_I64_I32_e64;
602 else
603 Opc = IsUnsigned ? AMDGPU::V_MAD_U64_U32_e64 : AMDGPU::V_MAD_I64_I32_e64;
604
605 if (UseNoCarry)
606 I.removeOperand(OpNo: 1);
607
608 I.setDesc(TII.get(Opcode: Opc));
609 I.addOperand(MF&: *MF, Op: MachineOperand::CreateImm(Val: 0));
610 I.addImplicitDefUseOperands(MF&: *MF);
611 I.getOperand(i: 0).setIsEarlyClobber(true);
612 return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
613}
614
615// TODO: We should probably legalize these to only using 32-bit results.
616bool AMDGPUInstructionSelector::selectG_EXTRACT(MachineInstr &I) const {
617 MachineBasicBlock *BB = I.getParent();
618 Register DstReg = I.getOperand(i: 0).getReg();
619 Register SrcReg = I.getOperand(i: 1).getReg();
620 LLT DstTy = MRI->getType(Reg: DstReg);
621 LLT SrcTy = MRI->getType(Reg: SrcReg);
622 const unsigned SrcSize = SrcTy.getSizeInBits();
623 unsigned DstSize = DstTy.getSizeInBits();
624
625 // TODO: Should handle any multiple of 32 offset.
626 unsigned Offset = I.getOperand(i: 2).getImm();
627 if (Offset % 32 != 0 || DstSize > 128)
628 return false;
629
630 // 16-bit operations really use 32-bit registers.
631 // FIXME: Probably should not allow 16-bit G_EXTRACT results.
632 if (DstSize == 16)
633 DstSize = 32;
634
635 const TargetRegisterClass *DstRC =
636 TRI.getConstrainedRegClassForOperand(MO: I.getOperand(i: 0), MRI: *MRI);
637 if (!DstRC || !RBI.constrainGenericRegister(Reg: DstReg, RC: *DstRC, MRI&: *MRI))
638 return false;
639
640 const RegisterBank *SrcBank = RBI.getRegBank(Reg: SrcReg, MRI: *MRI, TRI);
641 const TargetRegisterClass *SrcRC =
642 TRI.getRegClassForSizeOnBank(Size: SrcSize, Bank: *SrcBank);
643 if (!SrcRC)
644 return false;
645 unsigned SubReg = SIRegisterInfo::getSubRegFromChannel(Channel: Offset / 32,
646 NumRegs: DstSize / 32);
647 SrcRC = TRI.getSubClassWithSubReg(SrcRC, SubReg);
648 if (!SrcRC)
649 return false;
650
651 SrcReg = constrainOperandRegClass(MF: *MF, TRI, MRI&: *MRI, TII, RBI, InsertPt&: I,
652 RegClass: *SrcRC, RegMO&: I.getOperand(i: 1));
653 const DebugLoc &DL = I.getDebugLoc();
654 BuildMI(BB&: *BB, I: &I, MIMD: DL, MCID: TII.get(Opcode: TargetOpcode::COPY), DestReg: DstReg)
655 .addReg(RegNo: SrcReg, Flags: {}, SubReg);
656
657 I.eraseFromParent();
658 return true;
659}
660
661bool AMDGPUInstructionSelector::selectG_MERGE_VALUES(MachineInstr &MI) const {
662 MachineBasicBlock *BB = MI.getParent();
663 Register DstReg = MI.getOperand(i: 0).getReg();
664 LLT DstTy = MRI->getType(Reg: DstReg);
665 LLT SrcTy = MRI->getType(Reg: MI.getOperand(i: 1).getReg());
666
667 const unsigned SrcSize = SrcTy.getSizeInBits();
668 if (SrcSize < 32)
669 return selectImpl(I&: MI, CoverageInfo&: *CoverageInfo);
670
671 const DebugLoc &DL = MI.getDebugLoc();
672 const RegisterBank *DstBank = RBI.getRegBank(Reg: DstReg, MRI: *MRI, TRI);
673 const unsigned DstSize = DstTy.getSizeInBits();
674 const TargetRegisterClass *DstRC =
675 TRI.getRegClassForSizeOnBank(Size: DstSize, Bank: *DstBank);
676 if (!DstRC)
677 return false;
678
679 ArrayRef<int16_t> SubRegs = TRI.getRegSplitParts(RC: DstRC, EltSize: SrcSize / 8);
680 MachineInstrBuilder MIB =
681 BuildMI(BB&: *BB, I: &MI, MIMD: DL, MCID: TII.get(Opcode: TargetOpcode::REG_SEQUENCE), DestReg: DstReg);
682 for (int I = 0, E = MI.getNumOperands() - 1; I != E; ++I) {
683 MachineOperand &Src = MI.getOperand(i: I + 1);
684 MIB.addReg(RegNo: Src.getReg(), Flags: getUndefRegState(B: Src.isUndef()));
685 MIB.addImm(Val: SubRegs[I]);
686
687 const TargetRegisterClass *SrcRC
688 = TRI.getConstrainedRegClassForOperand(MO: Src, MRI: *MRI);
689 if (SrcRC && !RBI.constrainGenericRegister(Reg: Src.getReg(), RC: *SrcRC, MRI&: *MRI))
690 return false;
691 }
692
693 if (!RBI.constrainGenericRegister(Reg: DstReg, RC: *DstRC, MRI&: *MRI))
694 return false;
695
696 MI.eraseFromParent();
697 return true;
698}
699
700bool AMDGPUInstructionSelector::selectG_UNMERGE_VALUES(MachineInstr &MI) const {
701 MachineBasicBlock *BB = MI.getParent();
702 const int NumDst = MI.getNumOperands() - 1;
703
704 MachineOperand &Src = MI.getOperand(i: NumDst);
705
706 Register SrcReg = Src.getReg();
707 Register DstReg0 = MI.getOperand(i: 0).getReg();
708 LLT DstTy = MRI->getType(Reg: DstReg0);
709 LLT SrcTy = MRI->getType(Reg: SrcReg);
710
711 const unsigned DstSize = DstTy.getSizeInBits();
712 const unsigned SrcSize = SrcTy.getSizeInBits();
713 const DebugLoc &DL = MI.getDebugLoc();
714 const RegisterBank *SrcBank = RBI.getRegBank(Reg: SrcReg, MRI: *MRI, TRI);
715
716 const TargetRegisterClass *SrcRC =
717 TRI.getRegClassForSizeOnBank(Size: SrcSize, Bank: *SrcBank);
718 if (!SrcRC || !RBI.constrainGenericRegister(Reg: SrcReg, RC: *SrcRC, MRI&: *MRI))
719 return false;
720
721 // Note we could have mixed SGPR and VGPR destination banks for an SGPR
722 // source, and this relies on the fact that the same subregister indices are
723 // used for both.
724 ArrayRef<int16_t> SubRegs = TRI.getRegSplitParts(RC: SrcRC, EltSize: DstSize / 8);
725 for (int I = 0, E = NumDst; I != E; ++I) {
726 MachineOperand &Dst = MI.getOperand(i: I);
727 BuildMI(BB&: *BB, I: &MI, MIMD: DL, MCID: TII.get(Opcode: TargetOpcode::COPY), DestReg: Dst.getReg())
728 .addReg(RegNo: SrcReg, Flags: {}, SubReg: SubRegs[I]);
729
730 // Make sure the subregister index is valid for the source register.
731 SrcRC = TRI.getSubClassWithSubReg(SrcRC, SubRegs[I]);
732 if (!SrcRC || !RBI.constrainGenericRegister(Reg: SrcReg, RC: *SrcRC, MRI&: *MRI))
733 return false;
734
735 const TargetRegisterClass *DstRC =
736 TRI.getConstrainedRegClassForOperand(MO: Dst, MRI: *MRI);
737 if (DstRC && !RBI.constrainGenericRegister(Reg: Dst.getReg(), RC: *DstRC, MRI&: *MRI))
738 return false;
739 }
740
741 MI.eraseFromParent();
742 return true;
743}
744
745bool AMDGPUInstructionSelector::selectG_BUILD_VECTOR(MachineInstr &MI) const {
746 assert(MI.getOpcode() == AMDGPU::G_BUILD_VECTOR_TRUNC ||
747 MI.getOpcode() == AMDGPU::G_BUILD_VECTOR);
748
749 Register Src0 = MI.getOperand(i: 1).getReg();
750 Register Src1 = MI.getOperand(i: 2).getReg();
751 LLT SrcTy = MRI->getType(Reg: Src0);
752 const unsigned SrcSize = SrcTy.getSizeInBits();
753
754 // BUILD_VECTOR with >=32 bits source is handled by MERGE_VALUE.
755 if (MI.getOpcode() == AMDGPU::G_BUILD_VECTOR && SrcSize >= 32) {
756 return selectG_MERGE_VALUES(MI);
757 }
758
759 // Selection logic below is for V2S16 only.
760 // For G_BUILD_VECTOR_TRUNC, additionally check that the operands are s32.
761 Register Dst = MI.getOperand(i: 0).getReg();
762 if (MRI->getType(Reg: Dst) != LLT::fixed_vector(NumElements: 2, ScalarSizeInBits: 16) ||
763 (MI.getOpcode() == AMDGPU::G_BUILD_VECTOR_TRUNC &&
764 SrcTy != LLT::scalar(SizeInBits: 32)))
765 return selectImpl(I&: MI, CoverageInfo&: *CoverageInfo);
766
767 const RegisterBank *DstBank = RBI.getRegBank(Reg: Dst, MRI: *MRI, TRI);
768 if (DstBank->getID() == AMDGPU::AGPRRegBankID)
769 return false;
770
771 assert(DstBank->getID() == AMDGPU::SGPRRegBankID ||
772 DstBank->getID() == AMDGPU::VGPRRegBankID);
773 const bool IsVector = DstBank->getID() == AMDGPU::VGPRRegBankID;
774
775 const DebugLoc &DL = MI.getDebugLoc();
776 MachineBasicBlock *BB = MI.getParent();
777
778 // First, before trying TableGen patterns, check if both sources are
779 // constants. In those cases, we can trivially compute the final constant
780 // and emit a simple move.
781 auto ConstSrc1 = getAnyConstantVRegValWithLookThrough(VReg: Src1, MRI: *MRI, LookThroughInstrs: true, LookThroughAnyExt: true);
782 if (ConstSrc1) {
783 auto ConstSrc0 =
784 getAnyConstantVRegValWithLookThrough(VReg: Src0, MRI: *MRI, LookThroughInstrs: true, LookThroughAnyExt: true);
785 if (ConstSrc0) {
786 const int64_t K0 = ConstSrc0->Value.getSExtValue();
787 const int64_t K1 = ConstSrc1->Value.getSExtValue();
788 uint32_t Lo16 = static_cast<uint32_t>(K0) & 0xffff;
789 uint32_t Hi16 = static_cast<uint32_t>(K1) & 0xffff;
790 uint32_t Imm = Lo16 | (Hi16 << 16);
791
792 // VALU
793 if (IsVector) {
794 BuildMI(BB&: *BB, I: &MI, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::V_MOV_B32_e32), DestReg: Dst).addImm(Val: Imm);
795 MI.eraseFromParent();
796 return RBI.constrainGenericRegister(Reg: Dst, RC: AMDGPU::VGPR_32RegClass, MRI&: *MRI);
797 }
798
799 // SALU
800 BuildMI(BB&: *BB, I: &MI, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::S_MOV_B32), DestReg: Dst).addImm(Val: Imm);
801 MI.eraseFromParent();
802 return RBI.constrainGenericRegister(Reg: Dst, RC: AMDGPU::SReg_32RegClass, MRI&: *MRI);
803 }
804 }
805
806 // Now try TableGen patterns.
807 if (selectImpl(I&: MI, CoverageInfo&: *CoverageInfo))
808 return true;
809
810 // TODO: This should probably be a combine somewhere
811 // (build_vector $src0, undef) -> copy $src0
812 MachineInstr *Src1Def = getDefIgnoringCopies(Reg: Src1, MRI: *MRI);
813 if (Src1Def->getOpcode() == AMDGPU::G_IMPLICIT_DEF) {
814 MI.setDesc(TII.get(Opcode: AMDGPU::COPY));
815 MI.removeOperand(OpNo: 2);
816 const auto &RC =
817 IsVector ? AMDGPU::VGPR_32RegClass : AMDGPU::SReg_32RegClass;
818 return RBI.constrainGenericRegister(Reg: Dst, RC, MRI&: *MRI) &&
819 RBI.constrainGenericRegister(Reg: Src0, RC, MRI&: *MRI);
820 }
821
822 // TODO: Can be improved?
823 if (IsVector) {
824 Register TmpReg = MRI->createVirtualRegister(RegClass: &AMDGPU::VGPR_32RegClass);
825 auto MIB = BuildMI(BB&: *BB, I&: MI, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::V_AND_B32_e32), DestReg: TmpReg)
826 .addImm(Val: 0xFFFF)
827 .addReg(RegNo: Src0);
828 if (!constrainSelectedInstRegOperands(I&: *MIB, TII, TRI, RBI))
829 return false;
830
831 MIB = BuildMI(BB&: *BB, I&: MI, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::V_LSHL_OR_B32_e64), DestReg: Dst)
832 .addReg(RegNo: Src1)
833 .addImm(Val: 16)
834 .addReg(RegNo: TmpReg);
835 if (!constrainSelectedInstRegOperands(I&: *MIB, TII, TRI, RBI))
836 return false;
837
838 MI.eraseFromParent();
839 return true;
840 }
841
842 Register ShiftSrc0;
843 Register ShiftSrc1;
844
845 // With multiple uses of the shift, this will duplicate the shift and
846 // increase register pressure.
847 //
848 // (build_vector (lshr_oneuse $src0, 16), (lshr_oneuse $src1, 16)
849 // => (S_PACK_HH_B32_B16 $src0, $src1)
850 // (build_vector (lshr_oneuse SReg_32:$src0, 16), $src1)
851 // => (S_PACK_HL_B32_B16 $src0, $src1)
852 // (build_vector $src0, (lshr_oneuse SReg_32:$src1, 16))
853 // => (S_PACK_LH_B32_B16 $src0, $src1)
854 // (build_vector $src0, $src1)
855 // => (S_PACK_LL_B32_B16 $src0, $src1)
856
857 bool Shift0 = mi_match(
858 R: Src0, MRI: *MRI, P: m_OneUse(SP: m_GLShr(L: m_Reg(R&: ShiftSrc0), R: m_SpecificICst(RequestedValue: 16))));
859
860 bool Shift1 = mi_match(
861 R: Src1, MRI: *MRI, P: m_OneUse(SP: m_GLShr(L: m_Reg(R&: ShiftSrc1), R: m_SpecificICst(RequestedValue: 16))));
862
863 unsigned Opc = AMDGPU::S_PACK_LL_B32_B16;
864 if (Shift0 && Shift1) {
865 Opc = AMDGPU::S_PACK_HH_B32_B16;
866 MI.getOperand(i: 1).setReg(ShiftSrc0);
867 MI.getOperand(i: 2).setReg(ShiftSrc1);
868 } else if (Shift1) {
869 Opc = AMDGPU::S_PACK_LH_B32_B16;
870 MI.getOperand(i: 2).setReg(ShiftSrc1);
871 } else if (Shift0) {
872 auto ConstSrc1 =
873 getAnyConstantVRegValWithLookThrough(VReg: Src1, MRI: *MRI, LookThroughInstrs: true, LookThroughAnyExt: true);
874 if (ConstSrc1 && ConstSrc1->Value == 0) {
875 // build_vector_trunc (lshr $src0, 16), 0 -> s_lshr_b32 $src0, 16
876 auto MIB = BuildMI(BB&: *BB, I: &MI, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::S_LSHR_B32), DestReg: Dst)
877 .addReg(RegNo: ShiftSrc0)
878 .addImm(Val: 16)
879 .setOperandDead(3); // Dead scc
880
881 MI.eraseFromParent();
882 return constrainSelectedInstRegOperands(I&: *MIB, TII, TRI, RBI);
883 }
884 if (STI.hasSPackHL()) {
885 Opc = AMDGPU::S_PACK_HL_B32_B16;
886 MI.getOperand(i: 1).setReg(ShiftSrc0);
887 }
888 }
889
890 MI.setDesc(TII.get(Opcode: Opc));
891 return constrainSelectedInstRegOperands(I&: MI, TII, TRI, RBI);
892}
893
894bool AMDGPUInstructionSelector::selectG_IMPLICIT_DEF(MachineInstr &I) const {
895 const MachineOperand &MO = I.getOperand(i: 0);
896
897 // FIXME: Interface for getConstrainedRegClassForOperand needs work. The
898 // regbank check here is to know why getConstrainedRegClassForOperand failed.
899 const TargetRegisterClass *RC = TRI.getConstrainedRegClassForOperand(MO, MRI: *MRI);
900 if ((!RC && !MRI->getRegBankOrNull(Reg: MO.getReg())) ||
901 (RC && RBI.constrainGenericRegister(Reg: MO.getReg(), RC: *RC, MRI&: *MRI))) {
902 I.setDesc(TII.get(Opcode: TargetOpcode::IMPLICIT_DEF));
903 return true;
904 }
905
906 return false;
907}
908
909bool AMDGPUInstructionSelector::selectG_INSERT(MachineInstr &I) const {
910 MachineBasicBlock *BB = I.getParent();
911
912 Register DstReg = I.getOperand(i: 0).getReg();
913 Register Src0Reg = I.getOperand(i: 1).getReg();
914 Register Src1Reg = I.getOperand(i: 2).getReg();
915 LLT Src1Ty = MRI->getType(Reg: Src1Reg);
916
917 unsigned DstSize = MRI->getType(Reg: DstReg).getSizeInBits();
918 unsigned InsSize = Src1Ty.getSizeInBits();
919
920 int64_t Offset = I.getOperand(i: 3).getImm();
921
922 // FIXME: These cases should have been illegal and unnecessary to check here.
923 if (Offset % 32 != 0 || InsSize % 32 != 0)
924 return false;
925
926 // Currently not handled by getSubRegFromChannel.
927 if (InsSize > 128)
928 return false;
929
930 unsigned SubReg = TRI.getSubRegFromChannel(Channel: Offset / 32, NumRegs: InsSize / 32);
931 if (SubReg == AMDGPU::NoSubRegister)
932 return false;
933
934 const RegisterBank *DstBank = RBI.getRegBank(Reg: DstReg, MRI: *MRI, TRI);
935 const TargetRegisterClass *DstRC =
936 TRI.getRegClassForSizeOnBank(Size: DstSize, Bank: *DstBank);
937 if (!DstRC)
938 return false;
939
940 const RegisterBank *Src0Bank = RBI.getRegBank(Reg: Src0Reg, MRI: *MRI, TRI);
941 const RegisterBank *Src1Bank = RBI.getRegBank(Reg: Src1Reg, MRI: *MRI, TRI);
942 const TargetRegisterClass *Src0RC =
943 TRI.getRegClassForSizeOnBank(Size: DstSize, Bank: *Src0Bank);
944 const TargetRegisterClass *Src1RC =
945 TRI.getRegClassForSizeOnBank(Size: InsSize, Bank: *Src1Bank);
946
947 // Deal with weird cases where the class only partially supports the subreg
948 // index.
949 Src0RC = TRI.getSubClassWithSubReg(Src0RC, SubReg);
950 if (!Src0RC || !Src1RC)
951 return false;
952
953 if (!RBI.constrainGenericRegister(Reg: DstReg, RC: *DstRC, MRI&: *MRI) ||
954 !RBI.constrainGenericRegister(Reg: Src0Reg, RC: *Src0RC, MRI&: *MRI) ||
955 !RBI.constrainGenericRegister(Reg: Src1Reg, RC: *Src1RC, MRI&: *MRI))
956 return false;
957
958 const DebugLoc &DL = I.getDebugLoc();
959 BuildMI(BB&: *BB, I: &I, MIMD: DL, MCID: TII.get(Opcode: TargetOpcode::INSERT_SUBREG), DestReg: DstReg)
960 .addReg(RegNo: Src0Reg)
961 .addReg(RegNo: Src1Reg)
962 .addImm(Val: SubReg);
963
964 I.eraseFromParent();
965 return true;
966}
967
968bool AMDGPUInstructionSelector::selectG_SBFX_UBFX(MachineInstr &MI) const {
969 Register DstReg = MI.getOperand(i: 0).getReg();
970 Register SrcReg = MI.getOperand(i: 1).getReg();
971 Register OffsetReg = MI.getOperand(i: 2).getReg();
972 Register WidthReg = MI.getOperand(i: 3).getReg();
973
974 assert(RBI.getRegBank(DstReg, *MRI, TRI)->getID() == AMDGPU::VGPRRegBankID &&
975 "scalar BFX instructions are expanded in regbankselect");
976 assert(MRI->getType(MI.getOperand(0).getReg()).getSizeInBits() == 32 &&
977 "64-bit vector BFX instructions are expanded in regbankselect");
978
979 const DebugLoc &DL = MI.getDebugLoc();
980 MachineBasicBlock *MBB = MI.getParent();
981
982 bool IsSigned = MI.getOpcode() == TargetOpcode::G_SBFX;
983 unsigned Opc = IsSigned ? AMDGPU::V_BFE_I32_e64 : AMDGPU::V_BFE_U32_e64;
984 auto MIB = BuildMI(BB&: *MBB, I: &MI, MIMD: DL, MCID: TII.get(Opcode: Opc), DestReg: DstReg)
985 .addReg(RegNo: SrcReg)
986 .addReg(RegNo: OffsetReg)
987 .addReg(RegNo: WidthReg);
988 MI.eraseFromParent();
989 return constrainSelectedInstRegOperands(I&: *MIB, TII, TRI, RBI);
990}
991
992bool AMDGPUInstructionSelector::selectInterpP1F16(MachineInstr &MI) const {
993 if (STI.getLDSBankCount() != 16)
994 return selectImpl(I&: MI, CoverageInfo&: *CoverageInfo);
995
996 Register Dst = MI.getOperand(i: 0).getReg();
997 Register Src0 = MI.getOperand(i: 2).getReg();
998 Register M0Val = MI.getOperand(i: 6).getReg();
999 if (!RBI.constrainGenericRegister(Reg: M0Val, RC: AMDGPU::SReg_32RegClass, MRI&: *MRI) ||
1000 !RBI.constrainGenericRegister(Reg: Dst, RC: AMDGPU::VGPR_32RegClass, MRI&: *MRI) ||
1001 !RBI.constrainGenericRegister(Reg: Src0, RC: AMDGPU::VGPR_32RegClass, MRI&: *MRI))
1002 return false;
1003
1004 // This requires 2 instructions. It is possible to write a pattern to support
1005 // this, but the generated isel emitter doesn't correctly deal with multiple
1006 // output instructions using the same physical register input. The copy to m0
1007 // is incorrectly placed before the second instruction.
1008 //
1009 // TODO: Match source modifiers.
1010
1011 Register InterpMov = MRI->createVirtualRegister(RegClass: &AMDGPU::VGPR_32RegClass);
1012 const DebugLoc &DL = MI.getDebugLoc();
1013 MachineBasicBlock *MBB = MI.getParent();
1014
1015 BuildMI(BB&: *MBB, I: &MI, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::COPY), DestReg: AMDGPU::M0)
1016 .addReg(RegNo: M0Val);
1017 BuildMI(BB&: *MBB, I: &MI, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::V_INTERP_MOV_F32), DestReg: InterpMov)
1018 .addImm(Val: 2)
1019 .addImm(Val: MI.getOperand(i: 4).getImm()) // $attr
1020 .addImm(Val: MI.getOperand(i: 3).getImm()); // $attrchan
1021
1022 BuildMI(BB&: *MBB, I: &MI, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::V_INTERP_P1LV_F16), DestReg: Dst)
1023 .addImm(Val: 0) // $src0_modifiers
1024 .addReg(RegNo: Src0) // $src0
1025 .addImm(Val: MI.getOperand(i: 4).getImm()) // $attr
1026 .addImm(Val: MI.getOperand(i: 3).getImm()) // $attrchan
1027 .addImm(Val: 0) // $src2_modifiers
1028 .addReg(RegNo: InterpMov) // $src2 - 2 f16 values selected by high
1029 .addImm(Val: MI.getOperand(i: 5).getImm()) // $high
1030 .addImm(Val: 0) // $clamp
1031 .addImm(Val: 0); // $omod
1032
1033 MI.eraseFromParent();
1034 return true;
1035}
1036
1037// Writelane is special in that it can use SGPR and M0 (which would normally
1038// count as using the constant bus twice - but in this case it is allowed since
1039// the lane selector doesn't count as a use of the constant bus). However, it is
1040// still required to abide by the 1 SGPR rule. Fix this up if we might have
1041// multiple SGPRs.
1042bool AMDGPUInstructionSelector::selectWritelane(MachineInstr &MI) const {
1043 // With a constant bus limit of at least 2, there's no issue.
1044 if (STI.getConstantBusLimit(Opcode: AMDGPU::V_WRITELANE_B32) > 1)
1045 return selectImpl(I&: MI, CoverageInfo&: *CoverageInfo);
1046
1047 MachineBasicBlock *MBB = MI.getParent();
1048 const DebugLoc &DL = MI.getDebugLoc();
1049 Register VDst = MI.getOperand(i: 0).getReg();
1050 Register Val = MI.getOperand(i: 2).getReg();
1051 Register LaneSelect = MI.getOperand(i: 3).getReg();
1052 Register VDstIn = MI.getOperand(i: 4).getReg();
1053
1054 auto MIB = BuildMI(BB&: *MBB, I: &MI, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::V_WRITELANE_B32), DestReg: VDst);
1055
1056 std::optional<ValueAndVReg> ConstSelect =
1057 getIConstantVRegValWithLookThrough(VReg: LaneSelect, MRI: *MRI);
1058 if (ConstSelect) {
1059 // The selector has to be an inline immediate, so we can use whatever for
1060 // the other operands.
1061 MIB.addReg(RegNo: Val);
1062 MIB.addImm(Val: ConstSelect->Value.getSExtValue() &
1063 maskTrailingOnes<uint64_t>(N: STI.getWavefrontSizeLog2()));
1064 } else {
1065 std::optional<ValueAndVReg> ConstVal =
1066 getIConstantVRegValWithLookThrough(VReg: Val, MRI: *MRI);
1067
1068 // If the value written is an inline immediate, we can get away without a
1069 // copy to m0.
1070 if (ConstVal && AMDGPU::isInlinableLiteral32(Literal: ConstVal->Value.getSExtValue(),
1071 HasInv2Pi: STI.hasInv2PiInlineImm())) {
1072 MIB.addImm(Val: ConstVal->Value.getSExtValue());
1073 MIB.addReg(RegNo: LaneSelect);
1074 } else {
1075 MIB.addReg(RegNo: Val);
1076
1077 // If the lane selector was originally in a VGPR and copied with
1078 // readfirstlane, there's a hazard to read the same SGPR from the
1079 // VALU. Constrain to a different SGPR to help avoid needing a nop later.
1080 RBI.constrainGenericRegister(Reg: LaneSelect, RC: AMDGPU::SReg_32_XM0RegClass, MRI&: *MRI);
1081
1082 BuildMI(BB&: *MBB, I&: *MIB, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::COPY), DestReg: AMDGPU::M0)
1083 .addReg(RegNo: LaneSelect);
1084 MIB.addReg(RegNo: AMDGPU::M0);
1085 }
1086 }
1087
1088 MIB.addReg(RegNo: VDstIn);
1089
1090 MI.eraseFromParent();
1091 return constrainSelectedInstRegOperands(I&: *MIB, TII, TRI, RBI);
1092}
1093
1094// We need to handle this here because tablegen doesn't support matching
1095// instructions with multiple outputs.
1096bool AMDGPUInstructionSelector::selectDivScale(MachineInstr &MI) const {
1097 Register Dst0 = MI.getOperand(i: 0).getReg();
1098 Register Dst1 = MI.getOperand(i: 1).getReg();
1099
1100 LLT Ty = MRI->getType(Reg: Dst0);
1101 unsigned Opc;
1102 if (Ty == LLT::scalar(SizeInBits: 32))
1103 Opc = AMDGPU::V_DIV_SCALE_F32_e64;
1104 else if (Ty == LLT::scalar(SizeInBits: 64))
1105 Opc = AMDGPU::V_DIV_SCALE_F64_e64;
1106 else
1107 return false;
1108
1109 // TODO: Match source modifiers.
1110
1111 const DebugLoc &DL = MI.getDebugLoc();
1112 MachineBasicBlock *MBB = MI.getParent();
1113
1114 Register Numer = MI.getOperand(i: 3).getReg();
1115 Register Denom = MI.getOperand(i: 4).getReg();
1116 unsigned ChooseDenom = MI.getOperand(i: 5).getImm();
1117
1118 Register Src0 = ChooseDenom != 0 ? Numer : Denom;
1119
1120 auto MIB = BuildMI(BB&: *MBB, I: &MI, MIMD: DL, MCID: TII.get(Opcode: Opc), DestReg: Dst0)
1121 .addDef(RegNo: Dst1)
1122 .addImm(Val: 0) // $src0_modifiers
1123 .addUse(RegNo: Src0) // $src0
1124 .addImm(Val: 0) // $src1_modifiers
1125 .addUse(RegNo: Denom) // $src1
1126 .addImm(Val: 0) // $src2_modifiers
1127 .addUse(RegNo: Numer) // $src2
1128 .addImm(Val: 0) // $clamp
1129 .addImm(Val: 0); // $omod
1130
1131 MI.eraseFromParent();
1132 return constrainSelectedInstRegOperands(I&: *MIB, TII, TRI, RBI);
1133}
1134
1135bool AMDGPUInstructionSelector::selectG_INTRINSIC(MachineInstr &I) const {
1136 Intrinsic::ID IntrinsicID = cast<GIntrinsic>(Val&: I).getIntrinsicID();
1137 switch (IntrinsicID) {
1138 case Intrinsic::amdgcn_if_break: {
1139 MachineBasicBlock *BB = I.getParent();
1140
1141 // FIXME: Manually selecting to avoid dealing with the SReg_1 trick
1142 // SelectionDAG uses for wave32 vs wave64.
1143 BuildMI(BB&: *BB, I: &I, MIMD: I.getDebugLoc(), MCID: TII.get(Opcode: AMDGPU::SI_IF_BREAK))
1144 .add(MO: I.getOperand(i: 0))
1145 .add(MO: I.getOperand(i: 2))
1146 .add(MO: I.getOperand(i: 3));
1147
1148 Register DstReg = I.getOperand(i: 0).getReg();
1149 Register Src0Reg = I.getOperand(i: 2).getReg();
1150 Register Src1Reg = I.getOperand(i: 3).getReg();
1151
1152 I.eraseFromParent();
1153
1154 for (Register Reg : { DstReg, Src0Reg, Src1Reg })
1155 MRI->setRegClass(Reg, RC: TRI.getWaveMaskRegClass());
1156
1157 return true;
1158 }
1159 case Intrinsic::amdgcn_interp_p1_f16:
1160 return selectInterpP1F16(MI&: I);
1161 case Intrinsic::amdgcn_wqm:
1162 return constrainCopyLikeIntrin(MI&: I, NewOpc: AMDGPU::WQM);
1163 case Intrinsic::amdgcn_softwqm:
1164 return constrainCopyLikeIntrin(MI&: I, NewOpc: AMDGPU::SOFT_WQM);
1165 case Intrinsic::amdgcn_strict_wwm:
1166 case Intrinsic::amdgcn_wwm:
1167 return constrainCopyLikeIntrin(MI&: I, NewOpc: AMDGPU::STRICT_WWM);
1168 case Intrinsic::amdgcn_strict_wqm:
1169 return constrainCopyLikeIntrin(MI&: I, NewOpc: AMDGPU::STRICT_WQM);
1170 case Intrinsic::amdgcn_writelane:
1171 return selectWritelane(MI&: I);
1172 case Intrinsic::amdgcn_div_scale:
1173 return selectDivScale(MI&: I);
1174 case Intrinsic::amdgcn_icmp:
1175 case Intrinsic::amdgcn_fcmp:
1176 if (selectImpl(I, CoverageInfo&: *CoverageInfo))
1177 return true;
1178 return selectIntrinsicCmp(MI&: I);
1179 case Intrinsic::amdgcn_ballot:
1180 return selectBallot(I);
1181 case Intrinsic::amdgcn_reloc_constant:
1182 return selectRelocConstant(I);
1183 case Intrinsic::amdgcn_groupstaticsize:
1184 return selectGroupStaticSize(I);
1185 case Intrinsic::returnaddress:
1186 return selectReturnAddress(I);
1187 case Intrinsic::amdgcn_smfmac_f32_16x16x32_f16:
1188 case Intrinsic::amdgcn_smfmac_f32_32x32x16_f16:
1189 case Intrinsic::amdgcn_smfmac_f32_16x16x32_bf16:
1190 case Intrinsic::amdgcn_smfmac_f32_32x32x16_bf16:
1191 case Intrinsic::amdgcn_smfmac_i32_16x16x64_i8:
1192 case Intrinsic::amdgcn_smfmac_i32_32x32x32_i8:
1193 case Intrinsic::amdgcn_smfmac_f32_16x16x64_bf8_bf8:
1194 case Intrinsic::amdgcn_smfmac_f32_16x16x64_bf8_fp8:
1195 case Intrinsic::amdgcn_smfmac_f32_16x16x64_fp8_bf8:
1196 case Intrinsic::amdgcn_smfmac_f32_16x16x64_fp8_fp8:
1197 case Intrinsic::amdgcn_smfmac_f32_32x32x32_bf8_bf8:
1198 case Intrinsic::amdgcn_smfmac_f32_32x32x32_bf8_fp8:
1199 case Intrinsic::amdgcn_smfmac_f32_32x32x32_fp8_bf8:
1200 case Intrinsic::amdgcn_smfmac_f32_32x32x32_fp8_fp8:
1201 case Intrinsic::amdgcn_smfmac_f32_16x16x64_f16:
1202 case Intrinsic::amdgcn_smfmac_f32_32x32x32_f16:
1203 case Intrinsic::amdgcn_smfmac_f32_16x16x64_bf16:
1204 case Intrinsic::amdgcn_smfmac_f32_32x32x32_bf16:
1205 case Intrinsic::amdgcn_smfmac_i32_16x16x128_i8:
1206 case Intrinsic::amdgcn_smfmac_i32_32x32x64_i8:
1207 case Intrinsic::amdgcn_smfmac_f32_16x16x128_bf8_bf8:
1208 case Intrinsic::amdgcn_smfmac_f32_16x16x128_bf8_fp8:
1209 case Intrinsic::amdgcn_smfmac_f32_16x16x128_fp8_bf8:
1210 case Intrinsic::amdgcn_smfmac_f32_16x16x128_fp8_fp8:
1211 case Intrinsic::amdgcn_smfmac_f32_32x32x64_bf8_bf8:
1212 case Intrinsic::amdgcn_smfmac_f32_32x32x64_bf8_fp8:
1213 case Intrinsic::amdgcn_smfmac_f32_32x32x64_fp8_bf8:
1214 case Intrinsic::amdgcn_smfmac_f32_32x32x64_fp8_fp8:
1215 return selectSMFMACIntrin(I);
1216 case Intrinsic::amdgcn_permlane16_swap:
1217 case Intrinsic::amdgcn_permlane32_swap:
1218 return selectPermlaneSwapIntrin(I, IntrID: IntrinsicID);
1219 case Intrinsic::amdgcn_wave_shuffle:
1220 return selectWaveShuffleIntrin(I);
1221 default:
1222 return selectImpl(I, CoverageInfo&: *CoverageInfo);
1223 }
1224}
1225
1226static int getV_CMPOpcode(CmpInst::Predicate P, unsigned Size,
1227 const GCNSubtarget &ST) {
1228 if (Size != 16 && Size != 32 && Size != 64)
1229 return -1;
1230
1231 if (Size == 16 && !ST.has16BitInsts())
1232 return -1;
1233
1234 const auto Select = [&](unsigned S16Opc, unsigned TrueS16Opc,
1235 unsigned FakeS16Opc, unsigned S32Opc,
1236 unsigned S64Opc) {
1237 if (Size == 16)
1238 return ST.hasTrue16BitInsts()
1239 ? ST.useRealTrue16Insts() ? TrueS16Opc : FakeS16Opc
1240 : S16Opc;
1241 if (Size == 32)
1242 return S32Opc;
1243 return S64Opc;
1244 };
1245
1246 switch (P) {
1247 default:
1248 llvm_unreachable("Unknown condition code!");
1249 case CmpInst::ICMP_NE:
1250 return Select(AMDGPU::V_CMP_NE_U16_e64, AMDGPU::V_CMP_NE_U16_t16_e64,
1251 AMDGPU::V_CMP_NE_U16_fake16_e64, AMDGPU::V_CMP_NE_U32_e64,
1252 AMDGPU::V_CMP_NE_U64_e64);
1253 case CmpInst::ICMP_EQ:
1254 return Select(AMDGPU::V_CMP_EQ_U16_e64, AMDGPU::V_CMP_EQ_U16_t16_e64,
1255 AMDGPU::V_CMP_EQ_U16_fake16_e64, AMDGPU::V_CMP_EQ_U32_e64,
1256 AMDGPU::V_CMP_EQ_U64_e64);
1257 case CmpInst::ICMP_SGT:
1258 return Select(AMDGPU::V_CMP_GT_I16_e64, AMDGPU::V_CMP_GT_I16_t16_e64,
1259 AMDGPU::V_CMP_GT_I16_fake16_e64, AMDGPU::V_CMP_GT_I32_e64,
1260 AMDGPU::V_CMP_GT_I64_e64);
1261 case CmpInst::ICMP_SGE:
1262 return Select(AMDGPU::V_CMP_GE_I16_e64, AMDGPU::V_CMP_GE_I16_t16_e64,
1263 AMDGPU::V_CMP_GE_I16_fake16_e64, AMDGPU::V_CMP_GE_I32_e64,
1264 AMDGPU::V_CMP_GE_I64_e64);
1265 case CmpInst::ICMP_SLT:
1266 return Select(AMDGPU::V_CMP_LT_I16_e64, AMDGPU::V_CMP_LT_I16_t16_e64,
1267 AMDGPU::V_CMP_LT_I16_fake16_e64, AMDGPU::V_CMP_LT_I32_e64,
1268 AMDGPU::V_CMP_LT_I64_e64);
1269 case CmpInst::ICMP_SLE:
1270 return Select(AMDGPU::V_CMP_LE_I16_e64, AMDGPU::V_CMP_LE_I16_t16_e64,
1271 AMDGPU::V_CMP_LE_I16_fake16_e64, AMDGPU::V_CMP_LE_I32_e64,
1272 AMDGPU::V_CMP_LE_I64_e64);
1273 case CmpInst::ICMP_UGT:
1274 return Select(AMDGPU::V_CMP_GT_U16_e64, AMDGPU::V_CMP_GT_U16_t16_e64,
1275 AMDGPU::V_CMP_GT_U16_fake16_e64, AMDGPU::V_CMP_GT_U32_e64,
1276 AMDGPU::V_CMP_GT_U64_e64);
1277 case CmpInst::ICMP_UGE:
1278 return Select(AMDGPU::V_CMP_GE_U16_e64, AMDGPU::V_CMP_GE_U16_t16_e64,
1279 AMDGPU::V_CMP_GE_U16_fake16_e64, AMDGPU::V_CMP_GE_U32_e64,
1280 AMDGPU::V_CMP_GE_U64_e64);
1281 case CmpInst::ICMP_ULT:
1282 return Select(AMDGPU::V_CMP_LT_U16_e64, AMDGPU::V_CMP_LT_U16_t16_e64,
1283 AMDGPU::V_CMP_LT_U16_fake16_e64, AMDGPU::V_CMP_LT_U32_e64,
1284 AMDGPU::V_CMP_LT_U64_e64);
1285 case CmpInst::ICMP_ULE:
1286 return Select(AMDGPU::V_CMP_LE_U16_e64, AMDGPU::V_CMP_LE_U16_t16_e64,
1287 AMDGPU::V_CMP_LE_U16_fake16_e64, AMDGPU::V_CMP_LE_U32_e64,
1288 AMDGPU::V_CMP_LE_U64_e64);
1289
1290 case CmpInst::FCMP_OEQ:
1291 return Select(AMDGPU::V_CMP_EQ_F16_e64, AMDGPU::V_CMP_EQ_F16_t16_e64,
1292 AMDGPU::V_CMP_EQ_F16_fake16_e64, AMDGPU::V_CMP_EQ_F32_e64,
1293 AMDGPU::V_CMP_EQ_F64_e64);
1294 case CmpInst::FCMP_OGT:
1295 return Select(AMDGPU::V_CMP_GT_F16_e64, AMDGPU::V_CMP_GT_F16_t16_e64,
1296 AMDGPU::V_CMP_GT_F16_fake16_e64, AMDGPU::V_CMP_GT_F32_e64,
1297 AMDGPU::V_CMP_GT_F64_e64);
1298 case CmpInst::FCMP_OGE:
1299 return Select(AMDGPU::V_CMP_GE_F16_e64, AMDGPU::V_CMP_GE_F16_t16_e64,
1300 AMDGPU::V_CMP_GE_F16_fake16_e64, AMDGPU::V_CMP_GE_F32_e64,
1301 AMDGPU::V_CMP_GE_F64_e64);
1302 case CmpInst::FCMP_OLT:
1303 return Select(AMDGPU::V_CMP_LT_F16_e64, AMDGPU::V_CMP_LT_F16_t16_e64,
1304 AMDGPU::V_CMP_LT_F16_fake16_e64, AMDGPU::V_CMP_LT_F32_e64,
1305 AMDGPU::V_CMP_LT_F64_e64);
1306 case CmpInst::FCMP_OLE:
1307 return Select(AMDGPU::V_CMP_LE_F16_e64, AMDGPU::V_CMP_LE_F16_t16_e64,
1308 AMDGPU::V_CMP_LE_F16_fake16_e64, AMDGPU::V_CMP_LE_F32_e64,
1309 AMDGPU::V_CMP_LE_F64_e64);
1310 case CmpInst::FCMP_ONE:
1311 return Select(AMDGPU::V_CMP_NEQ_F16_e64, AMDGPU::V_CMP_NEQ_F16_t16_e64,
1312 AMDGPU::V_CMP_NEQ_F16_fake16_e64, AMDGPU::V_CMP_NEQ_F32_e64,
1313 AMDGPU::V_CMP_NEQ_F64_e64);
1314 case CmpInst::FCMP_ORD:
1315 return Select(AMDGPU::V_CMP_O_F16_e64, AMDGPU::V_CMP_O_F16_t16_e64,
1316 AMDGPU::V_CMP_O_F16_fake16_e64, AMDGPU::V_CMP_O_F32_e64,
1317 AMDGPU::V_CMP_O_F64_e64);
1318 case CmpInst::FCMP_UNO:
1319 return Select(AMDGPU::V_CMP_U_F16_e64, AMDGPU::V_CMP_U_F16_t16_e64,
1320 AMDGPU::V_CMP_U_F16_fake16_e64, AMDGPU::V_CMP_U_F32_e64,
1321 AMDGPU::V_CMP_U_F64_e64);
1322 case CmpInst::FCMP_UEQ:
1323 return Select(AMDGPU::V_CMP_NLG_F16_e64, AMDGPU::V_CMP_NLG_F16_t16_e64,
1324 AMDGPU::V_CMP_NLG_F16_fake16_e64, AMDGPU::V_CMP_NLG_F32_e64,
1325 AMDGPU::V_CMP_NLG_F64_e64);
1326 case CmpInst::FCMP_UGT:
1327 return Select(AMDGPU::V_CMP_NLE_F16_e64, AMDGPU::V_CMP_NLE_F16_t16_e64,
1328 AMDGPU::V_CMP_NLE_F16_fake16_e64, AMDGPU::V_CMP_NLE_F32_e64,
1329 AMDGPU::V_CMP_NLE_F64_e64);
1330 case CmpInst::FCMP_UGE:
1331 return Select(AMDGPU::V_CMP_NLT_F16_e64, AMDGPU::V_CMP_NLT_F16_t16_e64,
1332 AMDGPU::V_CMP_NLT_F16_fake16_e64, AMDGPU::V_CMP_NLT_F32_e64,
1333 AMDGPU::V_CMP_NLT_F64_e64);
1334 case CmpInst::FCMP_ULT:
1335 return Select(AMDGPU::V_CMP_NGE_F16_e64, AMDGPU::V_CMP_NGE_F16_t16_e64,
1336 AMDGPU::V_CMP_NGE_F16_fake16_e64, AMDGPU::V_CMP_NGE_F32_e64,
1337 AMDGPU::V_CMP_NGE_F64_e64);
1338 case CmpInst::FCMP_ULE:
1339 return Select(AMDGPU::V_CMP_NGT_F16_e64, AMDGPU::V_CMP_NGT_F16_t16_e64,
1340 AMDGPU::V_CMP_NGT_F16_fake16_e64, AMDGPU::V_CMP_NGT_F32_e64,
1341 AMDGPU::V_CMP_NGT_F64_e64);
1342 case CmpInst::FCMP_UNE:
1343 return Select(AMDGPU::V_CMP_NEQ_F16_e64, AMDGPU::V_CMP_NEQ_F16_t16_e64,
1344 AMDGPU::V_CMP_NEQ_F16_fake16_e64, AMDGPU::V_CMP_NEQ_F32_e64,
1345 AMDGPU::V_CMP_NEQ_F64_e64);
1346 case CmpInst::FCMP_TRUE:
1347 return Select(AMDGPU::V_CMP_TRU_F16_e64, AMDGPU::V_CMP_TRU_F16_t16_e64,
1348 AMDGPU::V_CMP_TRU_F16_fake16_e64, AMDGPU::V_CMP_TRU_F32_e64,
1349 AMDGPU::V_CMP_TRU_F64_e64);
1350 case CmpInst::FCMP_FALSE:
1351 return Select(AMDGPU::V_CMP_F_F16_e64, AMDGPU::V_CMP_F_F16_t16_e64,
1352 AMDGPU::V_CMP_F_F16_fake16_e64, AMDGPU::V_CMP_F_F32_e64,
1353 AMDGPU::V_CMP_F_F64_e64);
1354 }
1355}
1356
1357int AMDGPUInstructionSelector::getS_CMPOpcode(CmpInst::Predicate P,
1358 unsigned Size) const {
1359 if (Size == 64) {
1360 if (!STI.hasScalarCompareEq64())
1361 return -1;
1362
1363 switch (P) {
1364 case CmpInst::ICMP_NE:
1365 return AMDGPU::S_CMP_LG_U64;
1366 case CmpInst::ICMP_EQ:
1367 return AMDGPU::S_CMP_EQ_U64;
1368 default:
1369 return -1;
1370 }
1371 }
1372
1373 if (Size == 32) {
1374 switch (P) {
1375 case CmpInst::ICMP_NE:
1376 return AMDGPU::S_CMP_LG_U32;
1377 case CmpInst::ICMP_EQ:
1378 return AMDGPU::S_CMP_EQ_U32;
1379 case CmpInst::ICMP_SGT:
1380 return AMDGPU::S_CMP_GT_I32;
1381 case CmpInst::ICMP_SGE:
1382 return AMDGPU::S_CMP_GE_I32;
1383 case CmpInst::ICMP_SLT:
1384 return AMDGPU::S_CMP_LT_I32;
1385 case CmpInst::ICMP_SLE:
1386 return AMDGPU::S_CMP_LE_I32;
1387 case CmpInst::ICMP_UGT:
1388 return AMDGPU::S_CMP_GT_U32;
1389 case CmpInst::ICMP_UGE:
1390 return AMDGPU::S_CMP_GE_U32;
1391 case CmpInst::ICMP_ULT:
1392 return AMDGPU::S_CMP_LT_U32;
1393 case CmpInst::ICMP_ULE:
1394 return AMDGPU::S_CMP_LE_U32;
1395 case CmpInst::FCMP_OEQ:
1396 return AMDGPU::S_CMP_EQ_F32;
1397 case CmpInst::FCMP_OGT:
1398 return AMDGPU::S_CMP_GT_F32;
1399 case CmpInst::FCMP_OGE:
1400 return AMDGPU::S_CMP_GE_F32;
1401 case CmpInst::FCMP_OLT:
1402 return AMDGPU::S_CMP_LT_F32;
1403 case CmpInst::FCMP_OLE:
1404 return AMDGPU::S_CMP_LE_F32;
1405 case CmpInst::FCMP_ONE:
1406 return AMDGPU::S_CMP_LG_F32;
1407 case CmpInst::FCMP_ORD:
1408 return AMDGPU::S_CMP_O_F32;
1409 case CmpInst::FCMP_UNO:
1410 return AMDGPU::S_CMP_U_F32;
1411 case CmpInst::FCMP_UEQ:
1412 return AMDGPU::S_CMP_NLG_F32;
1413 case CmpInst::FCMP_UGT:
1414 return AMDGPU::S_CMP_NLE_F32;
1415 case CmpInst::FCMP_UGE:
1416 return AMDGPU::S_CMP_NLT_F32;
1417 case CmpInst::FCMP_ULT:
1418 return AMDGPU::S_CMP_NGE_F32;
1419 case CmpInst::FCMP_ULE:
1420 return AMDGPU::S_CMP_NGT_F32;
1421 case CmpInst::FCMP_UNE:
1422 return AMDGPU::S_CMP_NEQ_F32;
1423 default:
1424 llvm_unreachable("Unknown condition code!");
1425 }
1426 }
1427
1428 if (Size == 16) {
1429 if (!STI.hasSALUFloatInsts())
1430 return -1;
1431
1432 switch (P) {
1433 case CmpInst::FCMP_OEQ:
1434 return AMDGPU::S_CMP_EQ_F16;
1435 case CmpInst::FCMP_OGT:
1436 return AMDGPU::S_CMP_GT_F16;
1437 case CmpInst::FCMP_OGE:
1438 return AMDGPU::S_CMP_GE_F16;
1439 case CmpInst::FCMP_OLT:
1440 return AMDGPU::S_CMP_LT_F16;
1441 case CmpInst::FCMP_OLE:
1442 return AMDGPU::S_CMP_LE_F16;
1443 case CmpInst::FCMP_ONE:
1444 return AMDGPU::S_CMP_LG_F16;
1445 case CmpInst::FCMP_ORD:
1446 return AMDGPU::S_CMP_O_F16;
1447 case CmpInst::FCMP_UNO:
1448 return AMDGPU::S_CMP_U_F16;
1449 case CmpInst::FCMP_UEQ:
1450 return AMDGPU::S_CMP_NLG_F16;
1451 case CmpInst::FCMP_UGT:
1452 return AMDGPU::S_CMP_NLE_F16;
1453 case CmpInst::FCMP_UGE:
1454 return AMDGPU::S_CMP_NLT_F16;
1455 case CmpInst::FCMP_ULT:
1456 return AMDGPU::S_CMP_NGE_F16;
1457 case CmpInst::FCMP_ULE:
1458 return AMDGPU::S_CMP_NGT_F16;
1459 case CmpInst::FCMP_UNE:
1460 return AMDGPU::S_CMP_NEQ_F16;
1461 default:
1462 llvm_unreachable("Unknown condition code!");
1463 }
1464 }
1465
1466 return -1;
1467}
1468
1469bool AMDGPUInstructionSelector::selectG_ICMP_or_FCMP(MachineInstr &I) const {
1470
1471 MachineBasicBlock *BB = I.getParent();
1472 const DebugLoc &DL = I.getDebugLoc();
1473
1474 Register SrcReg = I.getOperand(i: 2).getReg();
1475 unsigned Size = RBI.getSizeInBits(Reg: SrcReg, MRI: *MRI, TRI);
1476
1477 auto Pred = (CmpInst::Predicate)I.getOperand(i: 1).getPredicate();
1478
1479 Register CCReg = I.getOperand(i: 0).getReg();
1480 if (!isVCC(Reg: CCReg, MRI: *MRI)) {
1481 int Opcode = getS_CMPOpcode(P: Pred, Size);
1482 if (Opcode == -1)
1483 return false;
1484 MachineInstr *ICmp = BuildMI(BB&: *BB, I: &I, MIMD: DL, MCID: TII.get(Opcode))
1485 .add(MO: I.getOperand(i: 2))
1486 .add(MO: I.getOperand(i: 3));
1487 BuildMI(BB&: *BB, I: &I, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::COPY), DestReg: CCReg)
1488 .addReg(RegNo: AMDGPU::SCC);
1489 bool Ret =
1490 constrainSelectedInstRegOperands(I&: *ICmp, TII, TRI, RBI) &&
1491 RBI.constrainGenericRegister(Reg: CCReg, RC: AMDGPU::SReg_32RegClass, MRI&: *MRI);
1492 I.eraseFromParent();
1493 return Ret;
1494 }
1495
1496 if (I.getOpcode() == AMDGPU::G_FCMP)
1497 return false;
1498
1499 int Opcode = getV_CMPOpcode(P: Pred, Size, ST: *Subtarget);
1500 if (Opcode == -1)
1501 return false;
1502
1503 MachineInstrBuilder ICmp;
  // Encodings that carry explicit source-modifier operands (e.g. the t16
  // forms) also need zero src_modifiers and op_sel operands.
1505 if (AMDGPU::hasNamedOperand(Opcode, NamedIdx: AMDGPU::OpName::src0_modifiers)) {
1506 ICmp = BuildMI(BB&: *BB, I: &I, MIMD: DL, MCID: TII.get(Opcode), DestReg: I.getOperand(i: 0).getReg())
1507 .addImm(Val: 0)
1508 .add(MO: I.getOperand(i: 2))
1509 .addImm(Val: 0)
1510 .add(MO: I.getOperand(i: 3))
1511 .addImm(Val: 0); // op_sel
1512 } else {
1513 ICmp = BuildMI(BB&: *BB, I: &I, MIMD: DL, MCID: TII.get(Opcode), DestReg: I.getOperand(i: 0).getReg())
1514 .add(MO: I.getOperand(i: 2))
1515 .add(MO: I.getOperand(i: 3));
1516 }
1517
1518 RBI.constrainGenericRegister(Reg: ICmp->getOperand(i: 0).getReg(),
1519 RC: *TRI.getBoolRC(), MRI&: *MRI);
1520 bool Ret = constrainSelectedInstRegOperands(I&: *ICmp, TII, TRI, RBI);
1521 I.eraseFromParent();
1522 return Ret;
1523}
1524
1525bool AMDGPUInstructionSelector::selectIntrinsicCmp(MachineInstr &I) const {
1526 Register Dst = I.getOperand(i: 0).getReg();
1527 if (isVCC(Reg: Dst, MRI: *MRI))
1528 return false;
1529
1530 LLT DstTy = MRI->getType(Reg: Dst);
1531 if (DstTy.getSizeInBits() != STI.getWavefrontSize())
1532 return false;
1533
1534 MachineBasicBlock *BB = I.getParent();
1535 const DebugLoc &DL = I.getDebugLoc();
1536 Register SrcReg = I.getOperand(i: 2).getReg();
1537 unsigned Size = RBI.getSizeInBits(Reg: SrcReg, MRI: *MRI, TRI);
1538
1539 // i1 inputs are not supported in GlobalISel.
1540 if (Size == 1)
1541 return false;
1542
1543 auto Pred = static_cast<CmpInst::Predicate>(I.getOperand(i: 4).getImm());
1544 if (!CmpInst::isIntPredicate(P: Pred) && !CmpInst::isFPPredicate(P: Pred)) {
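    // A predicate that is neither an integer nor an FP predicate has no
    // meaningful comparison to select, so just produce an IMPLICIT_DEF for
    // the result.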
1545 BuildMI(BB&: *BB, I: &I, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::IMPLICIT_DEF), DestReg: Dst);
1546 I.eraseFromParent();
1547 return RBI.constrainGenericRegister(Reg: Dst, RC: *TRI.getBoolRC(), MRI&: *MRI);
1548 }
1549
1550 const int Opcode = getV_CMPOpcode(P: Pred, Size, ST: *Subtarget);
1551 if (Opcode == -1)
1552 return false;
1553
1554 MachineInstrBuilder SelectedMI;
1555 MachineOperand &LHS = I.getOperand(i: 2);
1556 MachineOperand &RHS = I.getOperand(i: 3);
1557 auto [Src0, Src0Mods] = selectVOP3ModsImpl(Src: LHS.getReg());
1558 auto [Src1, Src1Mods] = selectVOP3ModsImpl(Src: RHS.getReg());
1559 Register Src0Reg =
1560 copyToVGPRIfSrcFolded(Src: Src0, Mods: Src0Mods, Root: LHS, InsertPt: &I, /*ForceVGPR*/ true);
1561 Register Src1Reg =
1562 copyToVGPRIfSrcFolded(Src: Src1, Mods: Src1Mods, Root: RHS, InsertPt: &I, /*ForceVGPR*/ true);
1563 SelectedMI = BuildMI(BB&: *BB, I: &I, MIMD: DL, MCID: TII.get(Opcode), DestReg: Dst);
1564 if (AMDGPU::hasNamedOperand(Opcode, NamedIdx: AMDGPU::OpName::src0_modifiers))
1565 SelectedMI.addImm(Val: Src0Mods);
1566 SelectedMI.addReg(RegNo: Src0Reg);
1567 if (AMDGPU::hasNamedOperand(Opcode, NamedIdx: AMDGPU::OpName::src1_modifiers))
1568 SelectedMI.addImm(Val: Src1Mods);
1569 SelectedMI.addReg(RegNo: Src1Reg);
1570 if (AMDGPU::hasNamedOperand(Opcode, NamedIdx: AMDGPU::OpName::clamp))
1571 SelectedMI.addImm(Val: 0); // clamp
1572 if (AMDGPU::hasNamedOperand(Opcode, NamedIdx: AMDGPU::OpName::op_sel))
1573 SelectedMI.addImm(Val: 0); // op_sel
1574
1575 RBI.constrainGenericRegister(Reg: Dst, RC: *TRI.getBoolRC(), MRI&: *MRI);
1576 if (!constrainSelectedInstRegOperands(I&: *SelectedMI, TII, TRI, RBI))
1577 return false;
1578
1579 I.eraseFromParent();
1580 return true;
1581}
1582
// Ballot has to zero the bits in the input lane-mask that are zero in the
// current exec, which is done as an AND with exec. For inputs produced by an
// instruction that already implicitly uses the same exec, for example a
// compare in the same basic block or an SCC-to-VCC copy, a plain copy is used
// instead.
1587static bool isLaneMaskFromSameBlock(Register Reg, MachineRegisterInfo &MRI,
1588 MachineBasicBlock *MBB) {
1589 MachineInstr *MI = MRI.getVRegDef(Reg);
1590 if (MI->getParent() != MBB)
1591 return false;
1592
  // Lane mask generated by an SCC-to-VCC copy.
1594 if (MI->getOpcode() == AMDGPU::COPY) {
1595 auto DstRB = MRI.getRegBankOrNull(Reg: MI->getOperand(i: 0).getReg());
1596 auto SrcRB = MRI.getRegBankOrNull(Reg: MI->getOperand(i: 1).getReg());
1597 if (DstRB && SrcRB && DstRB->getID() == AMDGPU::VCCRegBankID &&
1598 SrcRB->getID() == AMDGPU::SGPRRegBankID)
1599 return true;
1600 }
1601
  // Lane mask generated by a compare that uses the same exec.
1603 if (isa<GAnyCmp>(Val: MI))
1604 return true;
1605
1606 Register LHS, RHS;
1607 // Look through AND.
1608 if (mi_match(R: Reg, MRI, P: m_GAnd(L: m_Reg(R&: LHS), R: m_Reg(R&: RHS))))
1609 return isLaneMaskFromSameBlock(Reg: LHS, MRI, MBB) ||
1610 isLaneMaskFromSameBlock(Reg: RHS, MRI, MBB);
1611
1612 return false;
1613}
1614
1615bool AMDGPUInstructionSelector::selectBallot(MachineInstr &I) const {
1616 MachineBasicBlock *BB = I.getParent();
1617 const DebugLoc &DL = I.getDebugLoc();
1618 Register DstReg = I.getOperand(i: 0).getReg();
1619 Register SrcReg = I.getOperand(i: 2).getReg();
1620 const unsigned BallotSize = MRI->getType(Reg: DstReg).getSizeInBits();
1621 const unsigned WaveSize = STI.getWavefrontSize();
1622
1623 // In the common case, the return type matches the wave size.
  // However, we also support emitting i64 ballots in wave32 mode.
1625 if (BallotSize != WaveSize && (BallotSize != 64 || WaveSize != 32))
1626 return false;
1627
1628 std::optional<ValueAndVReg> Arg =
1629 getIConstantVRegValWithLookThrough(VReg: SrcReg, MRI: *MRI);
1630
1631 Register Dst = DstReg;
  // For an i64 ballot on wave32, compute the ballot into a fresh wave-sized
  // (i32) register first.
1633 if (BallotSize != WaveSize) {
1634 Dst = MRI->createVirtualRegister(RegClass: TRI.getBoolRC());
1635 }
1636
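  // A constant ballot operand folds to 0 or a copy of exec; otherwise AND the
  // lane mask with exec, or use a plain copy when the producer already used
  // the same exec.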
1637 if (Arg) {
1638 const int64_t Value = Arg->Value.getZExtValue();
1639 if (Value == 0) {
1640 // Dst = S_MOV 0
1641 unsigned Opcode = WaveSize == 64 ? AMDGPU::S_MOV_B64 : AMDGPU::S_MOV_B32;
1642 BuildMI(BB&: *BB, I: &I, MIMD: DL, MCID: TII.get(Opcode), DestReg: Dst).addImm(Val: 0);
1643 } else {
1644 // Dst = COPY EXEC
1645 assert(Value == 1);
1646 BuildMI(BB&: *BB, I: &I, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::COPY), DestReg: Dst).addReg(RegNo: TRI.getExec());
1647 }
1648 if (!RBI.constrainGenericRegister(Reg: Dst, RC: *TRI.getBoolRC(), MRI&: *MRI))
1649 return false;
1650 } else {
1651 if (isLaneMaskFromSameBlock(Reg: SrcReg, MRI&: *MRI, MBB: BB)) {
1652 // Dst = COPY SrcReg
1653 BuildMI(BB&: *BB, I: &I, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::COPY), DestReg: Dst).addReg(RegNo: SrcReg);
1654 if (!RBI.constrainGenericRegister(Reg: Dst, RC: *TRI.getBoolRC(), MRI&: *MRI))
1655 return false;
1656 } else {
1657 // Dst = S_AND SrcReg, EXEC
1658 unsigned AndOpc = WaveSize == 64 ? AMDGPU::S_AND_B64 : AMDGPU::S_AND_B32;
1659 auto And = BuildMI(BB&: *BB, I: &I, MIMD: DL, MCID: TII.get(Opcode: AndOpc), DestReg: Dst)
1660 .addReg(RegNo: SrcReg)
1661 .addReg(RegNo: TRI.getExec())
1662 .setOperandDead(3); // Dead scc
1663 if (!constrainSelectedInstRegOperands(I&: *And, TII, TRI, RBI))
1664 return false;
1665 }
1666 }
1667
1668 // i64 ballot on Wave32: zero-extend i32 ballot to i64.
1669 if (BallotSize != WaveSize) {
1670 Register HiReg = MRI->createVirtualRegister(RegClass: &AMDGPU::SReg_32RegClass);
1671 BuildMI(BB&: *BB, I: &I, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::S_MOV_B32), DestReg: HiReg).addImm(Val: 0);
1672 BuildMI(BB&: *BB, I: &I, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::REG_SEQUENCE), DestReg: DstReg)
1673 .addReg(RegNo: Dst)
1674 .addImm(Val: AMDGPU::sub0)
1675 .addReg(RegNo: HiReg)
1676 .addImm(Val: AMDGPU::sub1);
1677 }
1678
1679 I.eraseFromParent();
1680 return true;
1681}
1682
1683bool AMDGPUInstructionSelector::selectRelocConstant(MachineInstr &I) const {
1684 Register DstReg = I.getOperand(i: 0).getReg();
1685 const RegisterBank *DstBank = RBI.getRegBank(Reg: DstReg, MRI: *MRI, TRI);
1686 const TargetRegisterClass *DstRC = TRI.getRegClassForSizeOnBank(Size: 32, Bank: *DstBank);
1687 if (!DstRC || !RBI.constrainGenericRegister(Reg: DstReg, RC: *DstRC, MRI&: *MRI))
1688 return false;
1689
1690 const bool IsVALU = DstBank->getID() == AMDGPU::VGPRRegBankID;
1691
1692 Module *M = MF->getFunction().getParent();
1693 const MDNode *Metadata = I.getOperand(i: 2).getMetadata();
1694 auto SymbolName = cast<MDString>(Val: Metadata->getOperand(I: 0))->getString();
1695 auto *RelocSymbol = cast<GlobalVariable>(
1696 Val: M->getOrInsertGlobal(Name: SymbolName, Ty: Type::getInt32Ty(C&: M->getContext())));
1697
1698 MachineBasicBlock *BB = I.getParent();
1699 BuildMI(BB&: *BB, I: &I, MIMD: I.getDebugLoc(),
1700 MCID: TII.get(Opcode: IsVALU ? AMDGPU::V_MOV_B32_e32 : AMDGPU::S_MOV_B32), DestReg: DstReg)
1701 .addGlobalAddress(GV: RelocSymbol, Offset: 0, TargetFlags: SIInstrInfo::MO_ABS32_LO);
1702
1703 I.eraseFromParent();
1704 return true;
1705}
1706
1707bool AMDGPUInstructionSelector::selectGroupStaticSize(MachineInstr &I) const {
1708 Triple::OSType OS = MF->getTarget().getTargetTriple().getOS();
1709
1710 Register DstReg = I.getOperand(i: 0).getReg();
1711 const RegisterBank *DstRB = RBI.getRegBank(Reg: DstReg, MRI: *MRI, TRI);
1712 unsigned Mov = DstRB->getID() == AMDGPU::SGPRRegBankID ?
1713 AMDGPU::S_MOV_B32 : AMDGPU::V_MOV_B32_e32;
1714
1715 MachineBasicBlock *MBB = I.getParent();
1716 const DebugLoc &DL = I.getDebugLoc();
1717
1718 auto MIB = BuildMI(BB&: *MBB, I: &I, MIMD: DL, MCID: TII.get(Opcode: Mov), DestReg: DstReg);
1719
1720 if (OS == Triple::AMDHSA || OS == Triple::AMDPAL) {
1721 const SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
1722 MIB.addImm(Val: MFI->getLDSSize());
1723 } else {
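    // The LDS size isn't known at selection time here, so emit an ABS32_LO
    // reference to the groupstaticsize symbol to be resolved later.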
1724 Module *M = MF->getFunction().getParent();
1725 const GlobalValue *GV =
1726 Intrinsic::getOrInsertDeclaration(M, id: Intrinsic::amdgcn_groupstaticsize);
1727 MIB.addGlobalAddress(GV, Offset: 0, TargetFlags: SIInstrInfo::MO_ABS32_LO);
1728 }
1729
1730 I.eraseFromParent();
1731 return constrainSelectedInstRegOperands(I&: *MIB, TII, TRI, RBI);
1732}
1733
1734bool AMDGPUInstructionSelector::selectReturnAddress(MachineInstr &I) const {
1735 MachineBasicBlock *MBB = I.getParent();
1736 MachineFunction &MF = *MBB->getParent();
1737 const DebugLoc &DL = I.getDebugLoc();
1738
1739 MachineOperand &Dst = I.getOperand(i: 0);
1740 Register DstReg = Dst.getReg();
1741 unsigned Depth = I.getOperand(i: 2).getImm();
1742
1743 const TargetRegisterClass *RC
1744 = TRI.getConstrainedRegClassForOperand(MO: Dst, MRI: *MRI);
1745 if (!RC->hasSubClassEq(RC: &AMDGPU::SGPR_64RegClass) ||
1746 !RBI.constrainGenericRegister(Reg: DstReg, RC: *RC, MRI&: *MRI))
1747 return false;
1748
  // Entry functions (kernels and shaders) have no return address, and nonzero
  // depths are not supported; return 0 in those cases.
1750 if (Depth != 0 ||
1751 MF.getInfo<SIMachineFunctionInfo>()->isEntryFunction()) {
1752 BuildMI(BB&: *MBB, I: &I, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::S_MOV_B64), DestReg: DstReg)
1753 .addImm(Val: 0);
1754 I.eraseFromParent();
1755 return true;
1756 }
1757
1758 MachineFrameInfo &MFI = MF.getFrameInfo();
1759 // There is a call to @llvm.returnaddress in this function
1760 MFI.setReturnAddressIsTaken(true);
1761
1762 // Get the return address reg and mark it as an implicit live-in
1763 Register ReturnAddrReg = TRI.getReturnAddressReg(MF);
1764 Register LiveIn = getFunctionLiveInPhysReg(MF, TII, PhysReg: ReturnAddrReg,
1765 RC: AMDGPU::SReg_64RegClass, DL);
1766 BuildMI(BB&: *MBB, I: &I, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::COPY), DestReg: DstReg)
1767 .addReg(RegNo: LiveIn);
1768 I.eraseFromParent();
1769 return true;
1770}
1771
1772bool AMDGPUInstructionSelector::selectEndCfIntrinsic(MachineInstr &MI) const {
1773 // FIXME: Manually selecting to avoid dealing with the SReg_1 trick
1774 // SelectionDAG uses for wave32 vs wave64.
1775 MachineBasicBlock *BB = MI.getParent();
1776 BuildMI(BB&: *BB, I: &MI, MIMD: MI.getDebugLoc(), MCID: TII.get(Opcode: AMDGPU::SI_END_CF))
1777 .add(MO: MI.getOperand(i: 1));
1778
1779 Register Reg = MI.getOperand(i: 1).getReg();
1780 MI.eraseFromParent();
1781
1782 if (!MRI->getRegClassOrNull(Reg))
1783 MRI->setRegClass(Reg, RC: TRI.getWaveMaskRegClass());
1784 return true;
1785}
1786
1787bool AMDGPUInstructionSelector::selectDSOrderedIntrinsic(
1788 MachineInstr &MI, Intrinsic::ID IntrID) const {
1789 MachineBasicBlock *MBB = MI.getParent();
1790 MachineFunction *MF = MBB->getParent();
1791 const DebugLoc &DL = MI.getDebugLoc();
1792
1793 unsigned IndexOperand = MI.getOperand(i: 7).getImm();
1794 bool WaveRelease = MI.getOperand(i: 8).getImm() != 0;
1795 bool WaveDone = MI.getOperand(i: 9).getImm() != 0;
1796
1797 if (WaveDone && !WaveRelease) {
1798 // TODO: Move this to IR verifier
1799 const Function &Fn = MF->getFunction();
1800 Fn.getContext().diagnose(DI: DiagnosticInfoUnsupported(
1801 Fn, "ds_ordered_count: wave_done requires wave_release", DL));
1802 }
1803
1804 unsigned OrderedCountIndex = IndexOperand & 0x3f;
1805 IndexOperand &= ~0x3f;
1806 unsigned CountDw = 0;
1807
1808 if (STI.getGeneration() >= AMDGPUSubtarget::GFX10) {
1809 CountDw = (IndexOperand >> 24) & 0xf;
1810 IndexOperand &= ~(0xf << 24);
1811
1812 if (CountDw < 1 || CountDw > 4) {
1813 const Function &Fn = MF->getFunction();
1814 Fn.getContext().diagnose(DI: DiagnosticInfoUnsupported(
1815 Fn, "ds_ordered_count: dword count must be between 1 and 4", DL));
1816 CountDw = 1;
1817 }
1818 }
1819
1820 if (IndexOperand) {
1821 const Function &Fn = MF->getFunction();
1822 Fn.getContext().diagnose(DI: DiagnosticInfoUnsupported(
1823 Fn, "ds_ordered_count: bad index operand", DL));
1824 }
1825
1826 unsigned Instruction = IntrID == Intrinsic::amdgcn_ds_ordered_add ? 0 : 1;
1827 unsigned ShaderType = SIInstrInfo::getDSShaderTypeValue(MF: *MF);
1828
1829 unsigned Offset0 = OrderedCountIndex << 2;
1830 unsigned Offset1 = WaveRelease | (WaveDone << 1) | (Instruction << 4);
1831
1832 if (STI.getGeneration() >= AMDGPUSubtarget::GFX10)
1833 Offset1 |= (CountDw - 1) << 6;
1834
1835 if (STI.getGeneration() < AMDGPUSubtarget::GFX11)
1836 Offset1 |= ShaderType << 2;
1837
1838 unsigned Offset = Offset0 | (Offset1 << 8);
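  // Field layout of the combined offset built above:
  //   [7:2]   ordered count index
  //   [8]     wave_release
  //   [9]     wave_done
  //   [11:10] shader type (pre-GFX11 only)
  //   [12]    instruction (0 = add, 1 = swap)
  //   [15:14] dword count - 1 (GFX10+)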
1839
1840 Register M0Val = MI.getOperand(i: 2).getReg();
1841 BuildMI(BB&: *MBB, I: &MI, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::COPY), DestReg: AMDGPU::M0)
1842 .addReg(RegNo: M0Val);
1843
1844 Register DstReg = MI.getOperand(i: 0).getReg();
1845 Register ValReg = MI.getOperand(i: 3).getReg();
1846 MachineInstrBuilder DS =
1847 BuildMI(BB&: *MBB, I: &MI, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::DS_ORDERED_COUNT), DestReg: DstReg)
1848 .addReg(RegNo: ValReg)
1849 .addImm(Val: Offset)
1850 .cloneMemRefs(OtherMI: MI);
1851
1852 if (!RBI.constrainGenericRegister(Reg: M0Val, RC: AMDGPU::SReg_32RegClass, MRI&: *MRI))
1853 return false;
1854
1855 bool Ret = constrainSelectedInstRegOperands(I&: *DS, TII, TRI, RBI);
1856 MI.eraseFromParent();
1857 return Ret;
1858}
1859
1860static unsigned gwsIntrinToOpcode(unsigned IntrID) {
1861 switch (IntrID) {
1862 case Intrinsic::amdgcn_ds_gws_init:
1863 return AMDGPU::DS_GWS_INIT;
1864 case Intrinsic::amdgcn_ds_gws_barrier:
1865 return AMDGPU::DS_GWS_BARRIER;
1866 case Intrinsic::amdgcn_ds_gws_sema_v:
1867 return AMDGPU::DS_GWS_SEMA_V;
1868 case Intrinsic::amdgcn_ds_gws_sema_br:
1869 return AMDGPU::DS_GWS_SEMA_BR;
1870 case Intrinsic::amdgcn_ds_gws_sema_p:
1871 return AMDGPU::DS_GWS_SEMA_P;
1872 case Intrinsic::amdgcn_ds_gws_sema_release_all:
1873 return AMDGPU::DS_GWS_SEMA_RELEASE_ALL;
1874 default:
1875 llvm_unreachable("not a gws intrinsic");
1876 }
1877}
1878
1879bool AMDGPUInstructionSelector::selectDSGWSIntrinsic(MachineInstr &MI,
1880 Intrinsic::ID IID) const {
1881 if (!STI.hasGWS() || (IID == Intrinsic::amdgcn_ds_gws_sema_release_all &&
1882 !STI.hasGWSSemaReleaseAll()))
1883 return false;
1884
1885 // intrinsic ID, vsrc, offset
1886 const bool HasVSrc = MI.getNumOperands() == 3;
1887 assert(HasVSrc || MI.getNumOperands() == 2);
1888
1889 Register BaseOffset = MI.getOperand(i: HasVSrc ? 2 : 1).getReg();
1890 const RegisterBank *OffsetRB = RBI.getRegBank(Reg: BaseOffset, MRI: *MRI, TRI);
1891 if (OffsetRB->getID() != AMDGPU::SGPRRegBankID)
1892 return false;
1893
1894 MachineInstr *OffsetDef = getDefIgnoringCopies(Reg: BaseOffset, MRI: *MRI);
1895 unsigned ImmOffset;
1896
1897 MachineBasicBlock *MBB = MI.getParent();
1898 const DebugLoc &DL = MI.getDebugLoc();
1899
1900 MachineInstr *Readfirstlane = nullptr;
1901
1902 // If we legalized the VGPR input, strip out the readfirstlane to analyze the
1903 // incoming offset, in case there's an add of a constant. We'll have to put it
1904 // back later.
1905 if (OffsetDef->getOpcode() == AMDGPU::V_READFIRSTLANE_B32) {
1906 Readfirstlane = OffsetDef;
1907 BaseOffset = OffsetDef->getOperand(i: 1).getReg();
1908 OffsetDef = getDefIgnoringCopies(Reg: BaseOffset, MRI: *MRI);
1909 }
1910
1911 if (OffsetDef->getOpcode() == AMDGPU::G_CONSTANT) {
1912 // If we have a constant offset, try to use the 0 in m0 as the base.
1913 // TODO: Look into changing the default m0 initialization value. If the
1914 // default -1 only set the low 16-bits, we could leave it as-is and add 1 to
1915 // the immediate offset.
1916
1917 ImmOffset = OffsetDef->getOperand(i: 1).getCImm()->getZExtValue();
1918 BuildMI(BB&: *MBB, I: &MI, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::S_MOV_B32), DestReg: AMDGPU::M0)
1919 .addImm(Val: 0);
1920 } else {
1921 std::tie(args&: BaseOffset, args&: ImmOffset) =
1922 AMDGPU::getBaseWithConstantOffset(MRI&: *MRI, Reg: BaseOffset, ValueTracking: VT);
1923
1924 if (Readfirstlane) {
1925 // We have the constant offset now, so put the readfirstlane back on the
1926 // variable component.
1927 if (!RBI.constrainGenericRegister(Reg: BaseOffset, RC: AMDGPU::VGPR_32RegClass, MRI&: *MRI))
1928 return false;
1929
1930 Readfirstlane->getOperand(i: 1).setReg(BaseOffset);
1931 BaseOffset = Readfirstlane->getOperand(i: 0).getReg();
1932 } else {
1933 if (!RBI.constrainGenericRegister(Reg: BaseOffset,
1934 RC: AMDGPU::SReg_32RegClass, MRI&: *MRI))
1935 return false;
1936 }
1937
1938 Register M0Base = MRI->createVirtualRegister(RegClass: &AMDGPU::SReg_32RegClass);
1939 BuildMI(BB&: *MBB, I: &MI, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::S_LSHL_B32), DestReg: M0Base)
1940 .addReg(RegNo: BaseOffset)
1941 .addImm(Val: 16)
1942 .setOperandDead(3); // Dead scc
1943
1944 BuildMI(BB&: *MBB, I: &MI, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::COPY), DestReg: AMDGPU::M0)
1945 .addReg(RegNo: M0Base);
1946 }
1947
1948 // The resource id offset is computed as (<isa opaque base> + M0[21:16] +
1949 // offset field) % 64. Some versions of the programming guide omit the m0
1950 // part, or claim it's from offset 0.
1951
1952 unsigned Opc = gwsIntrinToOpcode(IntrID: IID);
1953 const MCInstrDesc &InstrDesc = TII.get(Opcode: Opc);
1954
1955 if (HasVSrc) {
1956 Register VSrc = MI.getOperand(i: 1).getReg();
1957
1958 int Data0Idx = AMDGPU::getNamedOperandIdx(Opcode: Opc, Name: AMDGPU::OpName::data0);
1959 const TargetRegisterClass *DataRC = TII.getRegClass(MCID: InstrDesc, OpNum: Data0Idx);
1960 const TargetRegisterClass *SubRC =
1961 TRI.getSubRegisterClass(DataRC, AMDGPU::sub0);
1962
1963 if (!SubRC) {
1964 // 32-bit normal case.
1965 if (!RBI.constrainGenericRegister(Reg: VSrc, RC: *DataRC, MRI&: *MRI))
1966 return false;
1967
1968 BuildMI(BB&: *MBB, I: &MI, MIMD: DL, MCID: InstrDesc)
1969 .addReg(RegNo: VSrc)
1970 .addImm(Val: ImmOffset)
1971 .cloneMemRefs(OtherMI: MI);
1972 } else {
1973 // Requires even register alignment, so create 64-bit value and pad the
1974 // top half with undef.
1975 Register DataReg = MRI->createVirtualRegister(RegClass: DataRC);
1976 if (!RBI.constrainGenericRegister(Reg: VSrc, RC: *SubRC, MRI&: *MRI))
1977 return false;
1978
1979 Register UndefReg = MRI->createVirtualRegister(RegClass: SubRC);
1980 BuildMI(BB&: *MBB, I: &MI, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::IMPLICIT_DEF), DestReg: UndefReg);
1981 BuildMI(BB&: *MBB, I: &MI, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::REG_SEQUENCE), DestReg: DataReg)
1982 .addReg(RegNo: VSrc)
1983 .addImm(Val: AMDGPU::sub0)
1984 .addReg(RegNo: UndefReg)
1985 .addImm(Val: AMDGPU::sub1);
1986
1987 BuildMI(BB&: *MBB, I: &MI, MIMD: DL, MCID: InstrDesc)
1988 .addReg(RegNo: DataReg)
1989 .addImm(Val: ImmOffset)
1990 .cloneMemRefs(OtherMI: MI);
1991 }
1992 } else {
1993 BuildMI(BB&: *MBB, I: &MI, MIMD: DL, MCID: InstrDesc)
1994 .addImm(Val: ImmOffset)
1995 .cloneMemRefs(OtherMI: MI);
1996 }
1997
1998 MI.eraseFromParent();
1999 return true;
2000}
2001
2002bool AMDGPUInstructionSelector::selectDSAppendConsume(MachineInstr &MI,
2003 bool IsAppend) const {
2004 Register PtrBase = MI.getOperand(i: 2).getReg();
2005 LLT PtrTy = MRI->getType(Reg: PtrBase);
2006 bool IsGDS = PtrTy.getAddressSpace() == AMDGPUAS::REGION_ADDRESS;
2007
2008 unsigned Offset;
2009 std::tie(args&: PtrBase, args&: Offset) = selectDS1Addr1OffsetImpl(Root&: MI.getOperand(i: 2));
2010
2011 // TODO: Should this try to look through readfirstlane like GWS?
2012 if (!isDSOffsetLegal(Base: PtrBase, Offset)) {
2013 PtrBase = MI.getOperand(i: 2).getReg();
2014 Offset = 0;
2015 }
2016
2017 MachineBasicBlock *MBB = MI.getParent();
2018 const DebugLoc &DL = MI.getDebugLoc();
2019 const unsigned Opc = IsAppend ? AMDGPU::DS_APPEND : AMDGPU::DS_CONSUME;
2020
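  // DS_APPEND/DS_CONSUME take their base address in M0 plus an immediate
  // offset; the trailing operand selects GDS (-1) versus LDS (0).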
2021 BuildMI(BB&: *MBB, I: &MI, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::COPY), DestReg: AMDGPU::M0)
2022 .addReg(RegNo: PtrBase);
2023 if (!RBI.constrainGenericRegister(Reg: PtrBase, RC: AMDGPU::SReg_32RegClass, MRI&: *MRI))
2024 return false;
2025
2026 auto MIB = BuildMI(BB&: *MBB, I: &MI, MIMD: DL, MCID: TII.get(Opcode: Opc), DestReg: MI.getOperand(i: 0).getReg())
2027 .addImm(Val: Offset)
2028 .addImm(Val: IsGDS ? -1 : 0)
2029 .cloneMemRefs(OtherMI: MI);
2030 MI.eraseFromParent();
2031 return constrainSelectedInstRegOperands(I&: *MIB, TII, TRI, RBI);
2032}
2033
2034bool AMDGPUInstructionSelector::selectInitWholeWave(MachineInstr &MI) const {
2035 MachineFunction *MF = MI.getMF();
2036 SIMachineFunctionInfo *MFInfo = MF->getInfo<SIMachineFunctionInfo>();
2037
2038 MFInfo->setInitWholeWave();
2039 return selectImpl(I&: MI, CoverageInfo&: *CoverageInfo);
2040}
2041
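// Decode the texfailctrl immediate: bit 0 enables TFE, bit 1 enables LWE, and
// IsTexFail records whether any texfail handling was requested. Returns false
// if any other bits are set.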
2042static bool parseTexFail(uint64_t TexFailCtrl, bool &TFE, bool &LWE,
2043 bool &IsTexFail) {
2044 if (TexFailCtrl)
2045 IsTexFail = true;
2046
2047 TFE = TexFailCtrl & 0x1;
2048 TexFailCtrl &= ~(uint64_t)0x1;
2049 LWE = TexFailCtrl & 0x2;
2050 TexFailCtrl &= ~(uint64_t)0x2;
2051
2052 return TexFailCtrl == 0;
2053}
2054
2055bool AMDGPUInstructionSelector::selectImageIntrinsic(
2056 MachineInstr &MI, const AMDGPU::ImageDimIntrinsicInfo *Intr) const {
2057 MachineBasicBlock *MBB = MI.getParent();
2058 const DebugLoc &DL = MI.getDebugLoc();
2059 unsigned IntrOpcode = Intr->BaseOpcode;
2060
  // For image atomics, use the no-return opcode if the result is unused.
2062 if (Intr->AtomicNoRetBaseOpcode != Intr->BaseOpcode) {
2063 Register ResultDef = MI.getOperand(i: 0).getReg();
2064 if (MRI->use_nodbg_empty(RegNo: ResultDef))
2065 IntrOpcode = Intr->AtomicNoRetBaseOpcode;
2066 }
2067
2068 const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
2069 AMDGPU::getMIMGBaseOpcodeInfo(BaseOpcode: IntrOpcode);
2070
2071 const AMDGPU::MIMGDimInfo *DimInfo = AMDGPU::getMIMGDimInfo(DimEnum: Intr->Dim);
2072 const bool IsGFX10Plus = AMDGPU::isGFX10Plus(STI);
2073 const bool IsGFX11Plus = AMDGPU::isGFX11Plus(STI);
2074 const bool IsGFX12Plus = AMDGPU::isGFX12Plus(STI);
2075
2076 const unsigned ArgOffset = MI.getNumExplicitDefs() + 1;
2077
2078 Register VDataIn = AMDGPU::NoRegister;
2079 Register VDataOut = AMDGPU::NoRegister;
2080 LLT VDataTy;
2081 int NumVDataDwords = -1;
2082 bool IsD16 = MI.getOpcode() == AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD_D16 ||
2083 MI.getOpcode() == AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE_D16;
2084
2085 bool Unorm;
2086 if (!BaseOpcode->Sampler)
2087 Unorm = true;
2088 else
2089 Unorm = MI.getOperand(i: ArgOffset + Intr->UnormIndex).getImm() != 0;
2090
2091 bool TFE;
2092 bool LWE;
2093 bool IsTexFail = false;
2094 if (!parseTexFail(TexFailCtrl: MI.getOperand(i: ArgOffset + Intr->TexFailCtrlIndex).getImm(),
2095 TFE, LWE, IsTexFail))
2096 return false;
2097
2098 const int Flags = MI.getOperand(i: ArgOffset + Intr->NumArgs).getImm();
2099 const bool IsA16 = (Flags & 1) != 0;
2100 const bool IsG16 = (Flags & 2) != 0;
2101
  // A16 implies 16-bit gradients if the subtarget doesn't support G16.
2103 if (IsA16 && !STI.hasG16() && !IsG16)
2104 return false;
2105
2106 unsigned DMask = 0;
2107 unsigned DMaskLanes = 0;
2108
2109 if (BaseOpcode->Atomic) {
2110 if (!BaseOpcode->NoReturn)
2111 VDataOut = MI.getOperand(i: 0).getReg();
2112 VDataIn = MI.getOperand(i: 2).getReg();
2113 LLT Ty = MRI->getType(Reg: VDataIn);
2114
2115 // Be careful to allow atomic swap on 16-bit element vectors.
2116 const bool Is64Bit = BaseOpcode->AtomicX2 ?
2117 Ty.getSizeInBits() == 128 :
2118 Ty.getSizeInBits() == 64;
2119
2120 if (BaseOpcode->AtomicX2) {
2121 assert(MI.getOperand(3).getReg() == AMDGPU::NoRegister);
2122
2123 DMask = Is64Bit ? 0xf : 0x3;
2124 NumVDataDwords = Is64Bit ? 4 : 2;
2125 } else {
2126 DMask = Is64Bit ? 0x3 : 0x1;
2127 NumVDataDwords = Is64Bit ? 2 : 1;
2128 }
2129 } else {
2130 DMask = MI.getOperand(i: ArgOffset + Intr->DMaskIndex).getImm();
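    // Gather4 always returns four components, regardless of which single
    // channel the dmask selects.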
2131 DMaskLanes = BaseOpcode->Gather4 ? 4 : llvm::popcount(Value: DMask);
2132
2133 if (BaseOpcode->Store) {
2134 VDataIn = MI.getOperand(i: 1).getReg();
2135 VDataTy = MRI->getType(Reg: VDataIn);
2136 NumVDataDwords = (VDataTy.getSizeInBits() + 31) / 32;
2137 } else if (BaseOpcode->NoReturn) {
2138 NumVDataDwords = 0;
2139 } else {
2140 VDataOut = MI.getOperand(i: 0).getReg();
2141 VDataTy = MRI->getType(Reg: VDataOut);
2142 NumVDataDwords = DMaskLanes;
2143
2144 if (IsD16 && !STI.hasUnpackedD16VMem())
2145 NumVDataDwords = (DMaskLanes + 1) / 2;
2146 }
2147 }
2148
2149 // Set G16 opcode
2150 if (Subtarget->hasG16() && IsG16) {
2151 const AMDGPU::MIMGG16MappingInfo *G16MappingInfo =
2152 AMDGPU::getMIMGG16MappingInfo(G: Intr->BaseOpcode);
2153 assert(G16MappingInfo);
2154 IntrOpcode = G16MappingInfo->G16; // set opcode to variant with _g16
2155 }
2156
2157 // TODO: Check this in verifier.
2158 assert((!IsTexFail || DMaskLanes >= 1) && "should have legalized this");
2159
2160 unsigned CPol = MI.getOperand(i: ArgOffset + Intr->CachePolicyIndex).getImm();
2161 // Keep GLC only when the atomic's result is actually used.
2162 if (BaseOpcode->Atomic && !BaseOpcode->NoReturn)
2163 CPol |= AMDGPU::CPol::GLC;
2164 if (CPol & ~((IsGFX12Plus ? AMDGPU::CPol::ALL : AMDGPU::CPol::ALL_pregfx12) |
2165 AMDGPU::CPol::VOLATILE))
2166 return false;
2167
2168 int NumVAddrRegs = 0;
2169 int NumVAddrDwords = 0;
2170 for (unsigned I = Intr->VAddrStart; I < Intr->VAddrEnd; I++) {
2171 // Skip the $noregs and 0s inserted during legalization.
2172 MachineOperand &AddrOp = MI.getOperand(i: ArgOffset + I);
2173 if (!AddrOp.isReg())
2174 continue; // XXX - Break?
2175
2176 Register Addr = AddrOp.getReg();
2177 if (!Addr)
2178 break;
2179
2180 ++NumVAddrRegs;
2181 NumVAddrDwords += (MRI->getType(Reg: Addr).getSizeInBits() + 31) / 32;
2182 }
2183
  // The legalizer preprocessed the intrinsic arguments. If we aren't using
  // NSA, these should have been packed into a single value in the first
  // address register.
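  // With a partial NSA encoding only the final address operand may span
  // multiple dwords, so having at least as many dwords as registers is
  // sufficient; full NSA requires exactly one dword per address register.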
2187 const bool UseNSA =
2188 NumVAddrRegs != 1 &&
2189 (STI.hasPartialNSAEncoding() ? NumVAddrDwords >= NumVAddrRegs
2190 : NumVAddrDwords == NumVAddrRegs);
2191 if (UseNSA && !STI.hasFeature(Feature: AMDGPU::FeatureNSAEncoding)) {
2192 LLVM_DEBUG(dbgs() << "Trying to use NSA on non-NSA target\n");
2193 return false;
2194 }
2195
2196 if (IsTexFail)
2197 ++NumVDataDwords;
2198
2199 int Opcode = -1;
2200 if (IsGFX12Plus) {
2201 Opcode = AMDGPU::getMIMGOpcode(BaseOpcode: IntrOpcode, MIMGEncoding: AMDGPU::MIMGEncGfx12,
2202 VDataDwords: NumVDataDwords, VAddrDwords: NumVAddrDwords);
2203 } else if (IsGFX11Plus) {
2204 Opcode = AMDGPU::getMIMGOpcode(BaseOpcode: IntrOpcode,
2205 MIMGEncoding: UseNSA ? AMDGPU::MIMGEncGfx11NSA
2206 : AMDGPU::MIMGEncGfx11Default,
2207 VDataDwords: NumVDataDwords, VAddrDwords: NumVAddrDwords);
2208 } else if (IsGFX10Plus) {
2209 Opcode = AMDGPU::getMIMGOpcode(BaseOpcode: IntrOpcode,
2210 MIMGEncoding: UseNSA ? AMDGPU::MIMGEncGfx10NSA
2211 : AMDGPU::MIMGEncGfx10Default,
2212 VDataDwords: NumVDataDwords, VAddrDwords: NumVAddrDwords);
2213 } else {
2214 if (Subtarget->hasGFX90AInsts()) {
2215 Opcode = AMDGPU::getMIMGOpcode(BaseOpcode: IntrOpcode, MIMGEncoding: AMDGPU::MIMGEncGfx90a,
2216 VDataDwords: NumVDataDwords, VAddrDwords: NumVAddrDwords);
2217 if (Opcode == -1) {
2218 LLVM_DEBUG(
2219 dbgs()
2220 << "requested image instruction is not supported on this GPU\n");
2221 return false;
2222 }
2223 }
2224 if (Opcode == -1 &&
2225 STI.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
2226 Opcode = AMDGPU::getMIMGOpcode(BaseOpcode: IntrOpcode, MIMGEncoding: AMDGPU::MIMGEncGfx8,
2227 VDataDwords: NumVDataDwords, VAddrDwords: NumVAddrDwords);
2228 if (Opcode == -1)
2229 Opcode = AMDGPU::getMIMGOpcode(BaseOpcode: IntrOpcode, MIMGEncoding: AMDGPU::MIMGEncGfx6,
2230 VDataDwords: NumVDataDwords, VAddrDwords: NumVAddrDwords);
2231 }
2232 if (Opcode == -1)
2233 return false;
2234
2235 auto MIB = BuildMI(BB&: *MBB, I: &MI, MIMD: DL, MCID: TII.get(Opcode))
2236 .cloneMemRefs(OtherMI: MI);
2237
2238 if (VDataOut) {
2239 if (BaseOpcode->AtomicX2) {
2240 const bool Is64 = MRI->getType(Reg: VDataOut).getSizeInBits() == 64;
2241
2242 Register TmpReg = MRI->createVirtualRegister(
2243 RegClass: Is64 ? &AMDGPU::VReg_128RegClass : &AMDGPU::VReg_64RegClass);
2244 unsigned SubReg = Is64 ? AMDGPU::sub0_sub1 : AMDGPU::sub0;
2245
2246 MIB.addDef(RegNo: TmpReg);
2247 if (!MRI->use_empty(RegNo: VDataOut)) {
2248 BuildMI(BB&: *MBB, I: &MI, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::COPY), DestReg: VDataOut)
2249 .addReg(RegNo: TmpReg, Flags: RegState::Kill, SubReg);
2250 }
2251
2252 } else {
2253 MIB.addDef(RegNo: VDataOut); // vdata output
2254 }
2255 }
2256
2257 if (VDataIn)
2258 MIB.addReg(RegNo: VDataIn); // vdata input
2259
2260 for (int I = 0; I != NumVAddrRegs; ++I) {
2261 MachineOperand &SrcOp = MI.getOperand(i: ArgOffset + Intr->VAddrStart + I);
2262 if (SrcOp.isReg()) {
2263 assert(SrcOp.getReg() != 0);
2264 MIB.addReg(RegNo: SrcOp.getReg());
2265 }
2266 }
2267
2268 MIB.addReg(RegNo: MI.getOperand(i: ArgOffset + Intr->RsrcIndex).getReg());
2269 if (BaseOpcode->Sampler)
2270 MIB.addReg(RegNo: MI.getOperand(i: ArgOffset + Intr->SampIndex).getReg());
2271
2272 MIB.addImm(Val: DMask); // dmask
2273
2274 if (IsGFX10Plus)
2275 MIB.addImm(Val: DimInfo->Encoding);
2276 if (AMDGPU::hasNamedOperand(Opcode, NamedIdx: AMDGPU::OpName::unorm))
2277 MIB.addImm(Val: Unorm);
2278
2279 MIB.addImm(Val: CPol);
2280 MIB.addImm(Val: IsA16 && // a16 or r128
2281 STI.hasFeature(Feature: AMDGPU::FeatureR128A16) ? -1 : 0);
2282 if (IsGFX10Plus)
2283 MIB.addImm(Val: IsA16 ? -1 : 0);
2284
2285 if (!Subtarget->hasGFX90AInsts()) {
2286 MIB.addImm(Val: TFE); // tfe
2287 } else if (TFE) {
2288 LLVM_DEBUG(dbgs() << "TFE is not supported on this GPU\n");
2289 return false;
2290 }
2291
2292 if (AMDGPU::hasNamedOperand(Opcode, NamedIdx: AMDGPU::OpName::lwe))
2293 MIB.addImm(Val: LWE); // lwe
2294 if (!IsGFX10Plus)
2295 MIB.addImm(Val: DimInfo->DA ? -1 : 0);
2296 if (BaseOpcode->HasD16)
2297 MIB.addImm(Val: IsD16 ? -1 : 0);
2298
2299 MI.eraseFromParent();
2300 constrainSelectedInstRegOperands(I&: *MIB, TII, TRI, RBI);
2301 TII.enforceOperandRCAlignment(MI&: *MIB, OpName: AMDGPU::OpName::vaddr);
2302 return true;
2303}
2304
2305// We need to handle this here because tablegen doesn't support matching
2306// instructions with multiple outputs.
2307bool AMDGPUInstructionSelector::selectDSBvhStackIntrinsic(
2308 MachineInstr &MI) const {
2309 Register Dst0 = MI.getOperand(i: 0).getReg();
2310 Register Dst1 = MI.getOperand(i: 1).getReg();
2311
2312 const DebugLoc &DL = MI.getDebugLoc();
2313 MachineBasicBlock *MBB = MI.getParent();
2314
2315 Register Addr = MI.getOperand(i: 3).getReg();
2316 Register Data0 = MI.getOperand(i: 4).getReg();
2317 Register Data1 = MI.getOperand(i: 5).getReg();
2318 unsigned Offset = MI.getOperand(i: 6).getImm();
2319
2320 unsigned Opc;
2321 switch (cast<GIntrinsic>(Val&: MI).getIntrinsicID()) {
2322 case Intrinsic::amdgcn_ds_bvh_stack_rtn:
2323 case Intrinsic::amdgcn_ds_bvh_stack_push4_pop1_rtn:
2324 Opc = AMDGPU::DS_BVH_STACK_RTN_B32;
2325 break;
2326 case Intrinsic::amdgcn_ds_bvh_stack_push8_pop1_rtn:
2327 Opc = AMDGPU::DS_BVH_STACK_PUSH8_POP1_RTN_B32;
2328 break;
2329 case Intrinsic::amdgcn_ds_bvh_stack_push8_pop2_rtn:
2330 Opc = AMDGPU::DS_BVH_STACK_PUSH8_POP2_RTN_B64;
2331 break;
2332 }
2333
2334 auto MIB = BuildMI(BB&: *MBB, I: &MI, MIMD: DL, MCID: TII.get(Opcode: Opc), DestReg: Dst0)
2335 .addDef(RegNo: Dst1)
2336 .addUse(RegNo: Addr)
2337 .addUse(RegNo: Data0)
2338 .addUse(RegNo: Data1)
2339 .addImm(Val: Offset)
2340 .cloneMemRefs(OtherMI: MI);
2341
2342 MI.eraseFromParent();
2343 return constrainSelectedInstRegOperands(I&: *MIB, TII, TRI, RBI);
2344}
2345
2346bool AMDGPUInstructionSelector::selectG_INTRINSIC_W_SIDE_EFFECTS(
2347 MachineInstr &I) const {
2348 Intrinsic::ID IntrinsicID = cast<GIntrinsic>(Val&: I).getIntrinsicID();
2349 switch (IntrinsicID) {
2350 case Intrinsic::amdgcn_end_cf:
2351 return selectEndCfIntrinsic(MI&: I);
2352 case Intrinsic::amdgcn_ds_ordered_add:
2353 case Intrinsic::amdgcn_ds_ordered_swap:
2354 return selectDSOrderedIntrinsic(MI&: I, IntrID: IntrinsicID);
2355 case Intrinsic::amdgcn_ds_gws_init:
2356 case Intrinsic::amdgcn_ds_gws_barrier:
2357 case Intrinsic::amdgcn_ds_gws_sema_v:
2358 case Intrinsic::amdgcn_ds_gws_sema_br:
2359 case Intrinsic::amdgcn_ds_gws_sema_p:
2360 case Intrinsic::amdgcn_ds_gws_sema_release_all:
2361 return selectDSGWSIntrinsic(MI&: I, IID: IntrinsicID);
2362 case Intrinsic::amdgcn_ds_append:
2363 return selectDSAppendConsume(MI&: I, IsAppend: true);
2364 case Intrinsic::amdgcn_ds_consume:
2365 return selectDSAppendConsume(MI&: I, IsAppend: false);
2366 case Intrinsic::amdgcn_init_whole_wave:
2367 return selectInitWholeWave(MI&: I);
2368 case Intrinsic::amdgcn_raw_buffer_load_lds:
2369 case Intrinsic::amdgcn_raw_ptr_buffer_load_lds:
2370 case Intrinsic::amdgcn_struct_buffer_load_lds:
2371 case Intrinsic::amdgcn_struct_ptr_buffer_load_lds:
2372 return selectBufferLoadLds(MI&: I);
  // Until we can store both the address space of the global and the LDS
  // arguments by having two MachineMemOperands on an intrinsic, we just trust
  // that the argument is a global pointer (buffer pointers have been handled
  // by an LLVM IR-level lowering).
2377 case Intrinsic::amdgcn_load_to_lds:
2378 case Intrinsic::amdgcn_global_load_lds:
2379 return selectGlobalLoadLds(MI&: I);
2380 case Intrinsic::amdgcn_exp_compr:
2381 if (!STI.hasCompressedExport()) {
2382 Function &F = I.getMF()->getFunction();
2383 F.getContext().diagnose(
2384 DI: DiagnosticInfoUnsupported(F, "intrinsic not supported on subtarget",
2385 I.getDebugLoc(), DS_Error));
2386 return false;
2387 }
2388 break;
2389 case Intrinsic::amdgcn_ds_bvh_stack_rtn:
2390 case Intrinsic::amdgcn_ds_bvh_stack_push4_pop1_rtn:
2391 case Intrinsic::amdgcn_ds_bvh_stack_push8_pop1_rtn:
2392 case Intrinsic::amdgcn_ds_bvh_stack_push8_pop2_rtn:
2393 return selectDSBvhStackIntrinsic(MI&: I);
2394 case Intrinsic::amdgcn_s_barrier_init:
2395 case Intrinsic::amdgcn_s_barrier_signal_var:
2396 return selectNamedBarrierInit(I, IID: IntrinsicID);
2397 case Intrinsic::amdgcn_s_wakeup_barrier: {
2398 if (!STI.hasSWakeupBarrier()) {
2399 Function &F = I.getMF()->getFunction();
2400 F.getContext().diagnose(
2401 DI: DiagnosticInfoUnsupported(F, "intrinsic not supported on subtarget",
2402 I.getDebugLoc(), DS_Error));
2403 return false;
2404 }
2405 return selectNamedBarrierInst(I, IID: IntrinsicID);
2406 }
2407 case Intrinsic::amdgcn_s_barrier_join:
2408 case Intrinsic::amdgcn_s_get_named_barrier_state:
2409 return selectNamedBarrierInst(I, IID: IntrinsicID);
2410 case Intrinsic::amdgcn_s_get_barrier_state:
2411 return selectSGetBarrierState(I, IID: IntrinsicID);
2412 case Intrinsic::amdgcn_s_barrier_signal_isfirst:
2413 return selectSBarrierSignalIsfirst(I, IID: IntrinsicID);
2414 }
2415 return selectImpl(I, CoverageInfo&: *CoverageInfo);
2416}
2417
2418bool AMDGPUInstructionSelector::selectG_SELECT(MachineInstr &I) const {
2419 if (selectImpl(I, CoverageInfo&: *CoverageInfo))
2420 return true;
2421
2422 MachineBasicBlock *BB = I.getParent();
2423 const DebugLoc &DL = I.getDebugLoc();
2424
2425 Register DstReg = I.getOperand(i: 0).getReg();
2426 unsigned Size = RBI.getSizeInBits(Reg: DstReg, MRI: *MRI, TRI);
2427 assert(Size <= 32 || Size == 64);
2428 const MachineOperand &CCOp = I.getOperand(i: 1);
2429 Register CCReg = CCOp.getReg();
2430 if (!isVCC(Reg: CCReg, MRI: *MRI)) {
2431 unsigned SelectOpcode = Size == 64 ? AMDGPU::S_CSELECT_B64 :
2432 AMDGPU::S_CSELECT_B32;
2433 MachineInstr *CopySCC = BuildMI(BB&: *BB, I: &I, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::COPY), DestReg: AMDGPU::SCC)
2434 .addReg(RegNo: CCReg);
2435
    // The generic constrainSelectedInstRegOperands doesn't work for the scc
    // register bank, because it does not cover the register class we use to
    // represent it. So we need to manually set the register class here.
2439 if (!MRI->getRegClassOrNull(Reg: CCReg))
2440 MRI->setRegClass(Reg: CCReg, RC: TRI.getConstrainedRegClassForOperand(MO: CCOp, MRI: *MRI));
2441 MachineInstr *Select = BuildMI(BB&: *BB, I: &I, MIMD: DL, MCID: TII.get(Opcode: SelectOpcode), DestReg: DstReg)
2442 .add(MO: I.getOperand(i: 2))
2443 .add(MO: I.getOperand(i: 3));
2444
2445 bool Ret = false;
2446 Ret |= constrainSelectedInstRegOperands(I&: *Select, TII, TRI, RBI);
2447 Ret |= constrainSelectedInstRegOperands(I&: *CopySCC, TII, TRI, RBI);
2448 I.eraseFromParent();
2449 return Ret;
2450 }
2451
2452 // Wide VGPR select should have been split in RegBankSelect.
2453 if (Size > 32)
2454 return false;
2455
2456 MachineInstr *Select =
2457 BuildMI(BB&: *BB, I: &I, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::V_CNDMASK_B32_e64), DestReg: DstReg)
2458 .addImm(Val: 0)
2459 .add(MO: I.getOperand(i: 3))
2460 .addImm(Val: 0)
2461 .add(MO: I.getOperand(i: 2))
2462 .add(MO: I.getOperand(i: 1));
2463
2464 bool Ret = constrainSelectedInstRegOperands(I&: *Select, TII, TRI, RBI);
2465 I.eraseFromParent();
2466 return Ret;
2467}
2468
2469bool AMDGPUInstructionSelector::selectG_TRUNC(MachineInstr &I) const {
2470 Register DstReg = I.getOperand(i: 0).getReg();
2471 Register SrcReg = I.getOperand(i: 1).getReg();
2472 const LLT DstTy = MRI->getType(Reg: DstReg);
2473 const LLT SrcTy = MRI->getType(Reg: SrcReg);
2474 const LLT S1 = LLT::scalar(SizeInBits: 1);
2475
2476 const RegisterBank *SrcRB = RBI.getRegBank(Reg: SrcReg, MRI: *MRI, TRI);
2477 const RegisterBank *DstRB;
2478 if (DstTy == S1) {
2479 // This is a special case. We don't treat s1 for legalization artifacts as
2480 // vcc booleans.
2481 DstRB = SrcRB;
2482 } else {
2483 DstRB = RBI.getRegBank(Reg: DstReg, MRI: *MRI, TRI);
2484 if (SrcRB != DstRB)
2485 return false;
2486 }
2487
2488 const bool IsVALU = DstRB->getID() == AMDGPU::VGPRRegBankID;
2489
2490 unsigned DstSize = DstTy.getSizeInBits();
2491 unsigned SrcSize = SrcTy.getSizeInBits();
2492
2493 const TargetRegisterClass *SrcRC =
2494 TRI.getRegClassForSizeOnBank(Size: SrcSize, Bank: *SrcRB);
2495 const TargetRegisterClass *DstRC =
2496 TRI.getRegClassForSizeOnBank(Size: DstSize, Bank: *DstRB);
2497 if (!SrcRC || !DstRC)
2498 return false;
2499
2500 if (!RBI.constrainGenericRegister(Reg: SrcReg, RC: *SrcRC, MRI&: *MRI) ||
2501 !RBI.constrainGenericRegister(Reg: DstReg, RC: *DstRC, MRI&: *MRI)) {
2502 LLVM_DEBUG(dbgs() << "Failed to constrain G_TRUNC\n");
2503 return false;
2504 }
2505
2506 if (DstRC == &AMDGPU::VGPR_16RegClass && SrcSize == 32) {
2507 assert(STI.useRealTrue16Insts());
2508 const DebugLoc &DL = I.getDebugLoc();
2509 MachineBasicBlock *MBB = I.getParent();
2510 BuildMI(BB&: *MBB, I, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::COPY), DestReg: DstReg)
2511 .addReg(RegNo: SrcReg, Flags: {}, SubReg: AMDGPU::lo16);
2512 I.eraseFromParent();
2513 return true;
2514 }
2515
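  // Truncating <2 x s32> to <2 x s16>: pack the low 16 bits of each source
  // element into a single 32-bit register.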
2516 if (DstTy == LLT::fixed_vector(NumElements: 2, ScalarSizeInBits: 16) && SrcTy == LLT::fixed_vector(NumElements: 2, ScalarSizeInBits: 32)) {
2517 MachineBasicBlock *MBB = I.getParent();
2518 const DebugLoc &DL = I.getDebugLoc();
2519
2520 Register LoReg = MRI->createVirtualRegister(RegClass: DstRC);
2521 Register HiReg = MRI->createVirtualRegister(RegClass: DstRC);
2522 BuildMI(BB&: *MBB, I, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::COPY), DestReg: LoReg)
2523 .addReg(RegNo: SrcReg, Flags: {}, SubReg: AMDGPU::sub0);
2524 BuildMI(BB&: *MBB, I, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::COPY), DestReg: HiReg)
2525 .addReg(RegNo: SrcReg, Flags: {}, SubReg: AMDGPU::sub1);
2526
2527 if (IsVALU && STI.hasSDWA()) {
2528 // Write the low 16-bits of the high element into the high 16-bits of the
2529 // low element.
2530 MachineInstr *MovSDWA =
2531 BuildMI(BB&: *MBB, I, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::V_MOV_B32_sdwa), DestReg: DstReg)
2532 .addImm(Val: 0) // $src0_modifiers
2533 .addReg(RegNo: HiReg) // $src0
2534 .addImm(Val: 0) // $clamp
2535 .addImm(Val: AMDGPU::SDWA::WORD_1) // $dst_sel
2536 .addImm(Val: AMDGPU::SDWA::UNUSED_PRESERVE) // $dst_unused
2537 .addImm(Val: AMDGPU::SDWA::WORD_0) // $src0_sel
2538 .addReg(RegNo: LoReg, Flags: RegState::Implicit);
2539 MovSDWA->tieOperands(DefIdx: 0, UseIdx: MovSDWA->getNumOperands() - 1);
2540 } else {
2541 Register TmpReg0 = MRI->createVirtualRegister(RegClass: DstRC);
2542 Register TmpReg1 = MRI->createVirtualRegister(RegClass: DstRC);
2543 Register ImmReg = MRI->createVirtualRegister(RegClass: DstRC);
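      // No SDWA available: build Dst = (Hi << 16) | (Lo & 0xffff) from a
      // shift, an and, and an or.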
2544 if (IsVALU) {
2545 BuildMI(BB&: *MBB, I, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::V_LSHLREV_B32_e64), DestReg: TmpReg0)
2546 .addImm(Val: 16)
2547 .addReg(RegNo: HiReg);
2548 } else {
2549 BuildMI(BB&: *MBB, I, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::S_LSHL_B32), DestReg: TmpReg0)
2550 .addReg(RegNo: HiReg)
2551 .addImm(Val: 16)
2552 .setOperandDead(3); // Dead scc
2553 }
2554
2555 unsigned MovOpc = IsVALU ? AMDGPU::V_MOV_B32_e32 : AMDGPU::S_MOV_B32;
2556 unsigned AndOpc = IsVALU ? AMDGPU::V_AND_B32_e64 : AMDGPU::S_AND_B32;
2557 unsigned OrOpc = IsVALU ? AMDGPU::V_OR_B32_e64 : AMDGPU::S_OR_B32;
2558
2559 BuildMI(BB&: *MBB, I, MIMD: DL, MCID: TII.get(Opcode: MovOpc), DestReg: ImmReg)
2560 .addImm(Val: 0xffff);
2561 auto And = BuildMI(BB&: *MBB, I, MIMD: DL, MCID: TII.get(Opcode: AndOpc), DestReg: TmpReg1)
2562 .addReg(RegNo: LoReg)
2563 .addReg(RegNo: ImmReg);
2564 auto Or = BuildMI(BB&: *MBB, I, MIMD: DL, MCID: TII.get(Opcode: OrOpc), DestReg: DstReg)
2565 .addReg(RegNo: TmpReg0)
2566 .addReg(RegNo: TmpReg1);
2567
2568 if (!IsVALU) {
2569 And.setOperandDead(3); // Dead scc
2570 Or.setOperandDead(3); // Dead scc
2571 }
2572 }
2573
2574 I.eraseFromParent();
2575 return true;
2576 }
2577
2578 if (!DstTy.isScalar())
2579 return false;
2580
2581 if (SrcSize > 32) {
2582 unsigned SubRegIdx = DstSize < 32
2583 ? static_cast<unsigned>(AMDGPU::sub0)
2584 : TRI.getSubRegFromChannel(Channel: 0, NumRegs: DstSize / 32);
2585 if (SubRegIdx == AMDGPU::NoSubRegister)
2586 return false;
2587
2588 // Deal with weird cases where the class only partially supports the subreg
2589 // index.
2590 const TargetRegisterClass *SrcWithSubRC
2591 = TRI.getSubClassWithSubReg(SrcRC, SubRegIdx);
2592 if (!SrcWithSubRC)
2593 return false;
2594
2595 if (SrcWithSubRC != SrcRC) {
2596 if (!RBI.constrainGenericRegister(Reg: SrcReg, RC: *SrcWithSubRC, MRI&: *MRI))
2597 return false;
2598 }
2599
2600 I.getOperand(i: 1).setSubReg(SubRegIdx);
2601 }
2602
2603 I.setDesc(TII.get(Opcode: TargetOpcode::COPY));
2604 return true;
2605}
2606
2607/// \returns true if a bitmask for \p Size bits will be an inline immediate.
2608static bool shouldUseAndMask(unsigned Size, unsigned &Mask) {
2609 Mask = maskTrailingOnes<unsigned>(N: Size);
2610 int SignedMask = static_cast<int>(Mask);
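  // AMDGPU inline immediates cover the signed integer range [-16, 64].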
2611 return SignedMask >= -16 && SignedMask <= 64;
2612}
2613
2614// Like RegisterBankInfo::getRegBank, but don't assume vcc for s1.
2615const RegisterBank *AMDGPUInstructionSelector::getArtifactRegBank(
2616 Register Reg, const MachineRegisterInfo &MRI,
2617 const TargetRegisterInfo &TRI) const {
2618 const RegClassOrRegBank &RegClassOrBank = MRI.getRegClassOrRegBank(Reg);
2619 if (auto *RB = dyn_cast<const RegisterBank *>(Val: RegClassOrBank))
2620 return RB;
2621
2622 // Ignore the type, since we don't use vcc in artifacts.
2623 if (auto *RC = dyn_cast<const TargetRegisterClass *>(Val: RegClassOrBank))
2624 return &RBI.getRegBankFromRegClass(RC: *RC, LLT());
2625 return nullptr;
2626}
2627
2628bool AMDGPUInstructionSelector::selectG_SZA_EXT(MachineInstr &I) const {
2629 bool InReg = I.getOpcode() == AMDGPU::G_SEXT_INREG;
2630 bool Signed = I.getOpcode() == AMDGPU::G_SEXT || InReg;
2631 const DebugLoc &DL = I.getDebugLoc();
2632 MachineBasicBlock &MBB = *I.getParent();
2633 const Register DstReg = I.getOperand(i: 0).getReg();
2634 const Register SrcReg = I.getOperand(i: 1).getReg();
2635
2636 const LLT DstTy = MRI->getType(Reg: DstReg);
2637 const LLT SrcTy = MRI->getType(Reg: SrcReg);
2638 const unsigned SrcSize = I.getOpcode() == AMDGPU::G_SEXT_INREG ?
2639 I.getOperand(i: 2).getImm() : SrcTy.getSizeInBits();
2640 const unsigned DstSize = DstTy.getSizeInBits();
2641 if (!DstTy.isScalar())
2642 return false;
2643
2644 // Artifact casts should never use vcc.
2645 const RegisterBank *SrcBank = getArtifactRegBank(Reg: SrcReg, MRI: *MRI, TRI);
2646
2647 // FIXME: This should probably be illegal and split earlier.
2648 if (I.getOpcode() == AMDGPU::G_ANYEXT) {
2649 if (DstSize <= 32)
2650 return selectCOPY(I);
2651
2652 const TargetRegisterClass *SrcRC =
2653 TRI.getRegClassForTypeOnBank(Ty: SrcTy, Bank: *SrcBank);
2654 const RegisterBank *DstBank = RBI.getRegBank(Reg: DstReg, MRI: *MRI, TRI);
2655 const TargetRegisterClass *DstRC =
2656 TRI.getRegClassForSizeOnBank(Size: DstSize, Bank: *DstBank);
2657
2658 Register UndefReg = MRI->createVirtualRegister(RegClass: SrcRC);
2659 BuildMI(BB&: MBB, I, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::IMPLICIT_DEF), DestReg: UndefReg);
2660 BuildMI(BB&: MBB, I, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::REG_SEQUENCE), DestReg: DstReg)
2661 .addReg(RegNo: SrcReg)
2662 .addImm(Val: AMDGPU::sub0)
2663 .addReg(RegNo: UndefReg)
2664 .addImm(Val: AMDGPU::sub1);
2665 I.eraseFromParent();
2666
2667 return RBI.constrainGenericRegister(Reg: DstReg, RC: *DstRC, MRI&: *MRI) &&
2668 RBI.constrainGenericRegister(Reg: SrcReg, RC: *SrcRC, MRI&: *MRI);
2669 }
2670
2671 if (SrcBank->getID() == AMDGPU::VGPRRegBankID && DstSize <= 32) {
2672 // 64-bit should have been split up in RegBankSelect
2673
2674 // Try to use an and with a mask if it will save code size.
2675 unsigned Mask;
2676 if (!Signed && shouldUseAndMask(Size: SrcSize, Mask)) {
2677 MachineInstr *ExtI =
2678 BuildMI(BB&: MBB, I, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::V_AND_B32_e32), DestReg: DstReg)
2679 .addImm(Val: Mask)
2680 .addReg(RegNo: SrcReg);
2681 I.eraseFromParent();
2682 return constrainSelectedInstRegOperands(I&: *ExtI, TII, TRI, RBI);
2683 }
2684
2685 const unsigned BFE = Signed ? AMDGPU::V_BFE_I32_e64 : AMDGPU::V_BFE_U32_e64;
2686 MachineInstr *ExtI =
2687 BuildMI(BB&: MBB, I, MIMD: DL, MCID: TII.get(Opcode: BFE), DestReg: DstReg)
2688 .addReg(RegNo: SrcReg)
2689 .addImm(Val: 0) // Offset
2690 .addImm(Val: SrcSize); // Width
2691 I.eraseFromParent();
2692 return constrainSelectedInstRegOperands(I&: *ExtI, TII, TRI, RBI);
2693 }
2694
2695 if (SrcBank->getID() == AMDGPU::SGPRRegBankID && DstSize <= 64) {
2696 const TargetRegisterClass &SrcRC = InReg && DstSize > 32 ?
2697 AMDGPU::SReg_64RegClass : AMDGPU::SReg_32RegClass;
2698 if (!RBI.constrainGenericRegister(Reg: SrcReg, RC: SrcRC, MRI&: *MRI))
2699 return false;
2700
2701 if (Signed && DstSize == 32 && (SrcSize == 8 || SrcSize == 16)) {
2702 const unsigned SextOpc = SrcSize == 8 ?
2703 AMDGPU::S_SEXT_I32_I8 : AMDGPU::S_SEXT_I32_I16;
2704 BuildMI(BB&: MBB, I, MIMD: DL, MCID: TII.get(Opcode: SextOpc), DestReg: DstReg)
2705 .addReg(RegNo: SrcReg);
2706 I.eraseFromParent();
2707 return RBI.constrainGenericRegister(Reg: DstReg, RC: AMDGPU::SReg_32RegClass, MRI&: *MRI);
2708 }
2709
2710 // Using a single 32-bit SALU to calculate the high half is smaller than
2711 // S_BFE with a literal constant operand.
2712 if (DstSize > 32 && SrcSize == 32) {
2713 Register HiReg = MRI->createVirtualRegister(RegClass: &AMDGPU::SReg_32RegClass);
2714 unsigned SubReg = InReg ? AMDGPU::sub0 : AMDGPU::NoSubRegister;
2715 if (Signed) {
2716 BuildMI(BB&: MBB, I, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::S_ASHR_I32), DestReg: HiReg)
2717 .addReg(RegNo: SrcReg, Flags: {}, SubReg)
2718 .addImm(Val: 31)
2719 .setOperandDead(3); // Dead scc
2720 } else {
2721 BuildMI(BB&: MBB, I, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::S_MOV_B32), DestReg: HiReg)
2722 .addImm(Val: 0);
2723 }
2724 BuildMI(BB&: MBB, I, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::REG_SEQUENCE), DestReg: DstReg)
2725 .addReg(RegNo: SrcReg, Flags: {}, SubReg)
2726 .addImm(Val: AMDGPU::sub0)
2727 .addReg(RegNo: HiReg)
2728 .addImm(Val: AMDGPU::sub1);
2729 I.eraseFromParent();
2730 return RBI.constrainGenericRegister(Reg: DstReg, RC: AMDGPU::SReg_64RegClass,
2731 MRI&: *MRI);
2732 }
2733
2734 const unsigned BFE64 = Signed ? AMDGPU::S_BFE_I64 : AMDGPU::S_BFE_U64;
2735 const unsigned BFE32 = Signed ? AMDGPU::S_BFE_I32 : AMDGPU::S_BFE_U32;
2736
    // Scalar BFE is encoded as S1[5:0] = offset, S1[22:16] = width.
2738 if (DstSize > 32 && (SrcSize <= 32 || InReg)) {
2739 // We need a 64-bit register source, but the high bits don't matter.
2740 Register ExtReg = MRI->createVirtualRegister(RegClass: &AMDGPU::SReg_64RegClass);
2741 Register UndefReg = MRI->createVirtualRegister(RegClass: &AMDGPU::SReg_32RegClass);
2742 unsigned SubReg = InReg ? AMDGPU::sub0 : AMDGPU::NoSubRegister;
2743
2744 BuildMI(BB&: MBB, I, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::IMPLICIT_DEF), DestReg: UndefReg);
2745 BuildMI(BB&: MBB, I, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::REG_SEQUENCE), DestReg: ExtReg)
2746 .addReg(RegNo: SrcReg, Flags: {}, SubReg)
2747 .addImm(Val: AMDGPU::sub0)
2748 .addReg(RegNo: UndefReg)
2749 .addImm(Val: AMDGPU::sub1);
2750
2751 BuildMI(BB&: MBB, I, MIMD: DL, MCID: TII.get(Opcode: BFE64), DestReg: DstReg)
2752 .addReg(RegNo: ExtReg)
2753 .addImm(Val: SrcSize << 16);
2754
2755 I.eraseFromParent();
2756 return RBI.constrainGenericRegister(Reg: DstReg, RC: AMDGPU::SReg_64RegClass, MRI&: *MRI);
2757 }
2758
2759 unsigned Mask;
2760 if (!Signed && shouldUseAndMask(Size: SrcSize, Mask)) {
2761 BuildMI(BB&: MBB, I, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::S_AND_B32), DestReg: DstReg)
2762 .addReg(RegNo: SrcReg)
2763 .addImm(Val: Mask)
2764 .setOperandDead(3); // Dead scc
2765 } else {
2766 BuildMI(BB&: MBB, I, MIMD: DL, MCID: TII.get(Opcode: BFE32), DestReg: DstReg)
2767 .addReg(RegNo: SrcReg)
2768 .addImm(Val: SrcSize << 16);
2769 }
2770
2771 I.eraseFromParent();
2772 return RBI.constrainGenericRegister(Reg: DstReg, RC: AMDGPU::SReg_32RegClass, MRI&: *MRI);
2773 }
2774
2775 return false;
2776}
2777
2778static Register stripCopy(Register Reg, MachineRegisterInfo &MRI) {
2779 return getDefSrcRegIgnoringCopies(Reg, MRI)->Reg;
2780}
2781
2782static Register stripBitCast(Register Reg, MachineRegisterInfo &MRI) {
2783 Register BitcastSrc;
2784 if (mi_match(R: Reg, MRI, P: m_GBitcast(Src: m_Reg(R&: BitcastSrc))))
2785 Reg = BitcastSrc;
2786 return Reg;
2787}
2788
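// Match a register holding the high 16 bits of a 32-bit value: either
// G_TRUNC (G_LSHR x, 16), with the shifted source possibly hidden behind a
// bitcast, or a G_SHUFFLE_VECTOR of a <2 x s16> vector that selects element 1.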
2789static bool isExtractHiElt(MachineRegisterInfo &MRI, Register In,
2790 Register &Out) {
2791 Register Trunc;
2792 if (!mi_match(R: In, MRI, P: m_GTrunc(Src: m_Reg(R&: Trunc))))
2793 return false;
2794
2795 Register LShlSrc;
2796 Register Cst;
2797 if (mi_match(R: Trunc, MRI, P: m_GLShr(L: m_Reg(R&: LShlSrc), R: m_Reg(R&: Cst)))) {
2798 Cst = stripCopy(Reg: Cst, MRI);
2799 if (mi_match(R: Cst, MRI, P: m_SpecificICst(RequestedValue: 16))) {
2800 Out = stripBitCast(Reg: LShlSrc, MRI);
2801 return true;
2802 }
2803 }
2804
2805 MachineInstr *Shuffle = MRI.getVRegDef(Reg: Trunc);
2806 if (Shuffle->getOpcode() != AMDGPU::G_SHUFFLE_VECTOR)
2807 return false;
2808
2809 assert(MRI.getType(Shuffle->getOperand(0).getReg()) ==
2810 LLT::fixed_vector(2, 16));
2811
2812 ArrayRef<int> Mask = Shuffle->getOperand(i: 3).getShuffleMask();
2813 assert(Mask.size() == 2);
2814
2815 if (Mask[0] == 1 && Mask[1] <= 1) {
2816 Out = Shuffle->getOperand(i: 0).getReg();
2817 return true;
2818 }
2819
2820 return false;
2821}
2822
2823bool AMDGPUInstructionSelector::selectG_FPEXT(MachineInstr &I) const {
2824 if (!Subtarget->hasSALUFloatInsts())
2825 return false;
2826
2827 Register Dst = I.getOperand(i: 0).getReg();
2828 const RegisterBank *DstRB = RBI.getRegBank(Reg: Dst, MRI: *MRI, TRI);
2829 if (DstRB->getID() != AMDGPU::SGPRRegBankID)
2830 return false;
2831
2832 Register Src = I.getOperand(i: 1).getReg();
2833
2834 if (MRI->getType(Reg: Dst) == LLT::scalar(SizeInBits: 32) &&
2835 MRI->getType(Reg: Src) == LLT::scalar(SizeInBits: 16)) {
2836 if (isExtractHiElt(MRI&: *MRI, In: Src, Out&: Src)) {
2837 MachineBasicBlock *BB = I.getParent();
2838 BuildMI(BB&: *BB, I: &I, MIMD: I.getDebugLoc(), MCID: TII.get(Opcode: AMDGPU::S_CVT_HI_F32_F16), DestReg: Dst)
2839 .addUse(RegNo: Src);
2840 I.eraseFromParent();
2841 return RBI.constrainGenericRegister(Reg: Dst, RC: AMDGPU::SReg_32RegClass, MRI&: *MRI);
2842 }
2843 }
2844
2845 return false;
2846}
2847
2848bool AMDGPUInstructionSelector::selectG_FNEG(MachineInstr &MI) const {
2849 // Only manually handle the f64 SGPR case.
2850 //
2851 // FIXME: This is a workaround for 2.5 different tablegen problems. Because
2852 // the bit ops theoretically have a second result due to the implicit def of
2853 // SCC, the GlobalISelEmitter is overly conservative and rejects it. Fixing
2854 // that is easy by disabling the check. The result works, but uses a
2855 // nonsensical sreg32orlds_and_sreg_1 regclass.
2856 //
2857 // The DAG emitter is more problematic, and incorrectly adds both S_XOR_B32 to
2858 // the variadic REG_SEQUENCE operands.
2859
2860 Register Dst = MI.getOperand(i: 0).getReg();
2861 const RegisterBank *DstRB = RBI.getRegBank(Reg: Dst, MRI: *MRI, TRI);
2862 if (DstRB->getID() != AMDGPU::SGPRRegBankID ||
2863 MRI->getType(Reg: Dst) != LLT::scalar(SizeInBits: 64))
2864 return false;
2865
2866 Register Src = MI.getOperand(i: 1).getReg();
2867 MachineInstr *Fabs = getOpcodeDef(Opcode: TargetOpcode::G_FABS, Reg: Src, MRI: *MRI);
2868 if (Fabs)
2869 Src = Fabs->getOperand(i: 1).getReg();
2870
2871 if (!RBI.constrainGenericRegister(Reg: Src, RC: AMDGPU::SReg_64RegClass, MRI&: *MRI) ||
2872 !RBI.constrainGenericRegister(Reg: Dst, RC: AMDGPU::SReg_64RegClass, MRI&: *MRI))
2873 return false;
2874
2875 MachineBasicBlock *BB = MI.getParent();
2876 const DebugLoc &DL = MI.getDebugLoc();
2877 Register LoReg = MRI->createVirtualRegister(RegClass: &AMDGPU::SReg_32RegClass);
2878 Register HiReg = MRI->createVirtualRegister(RegClass: &AMDGPU::SReg_32RegClass);
2879 Register ConstReg = MRI->createVirtualRegister(RegClass: &AMDGPU::SReg_32RegClass);
2880 Register OpReg = MRI->createVirtualRegister(RegClass: &AMDGPU::SReg_32RegClass);
2881
2882 BuildMI(BB&: *BB, I: &MI, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::COPY), DestReg: LoReg)
2883 .addReg(RegNo: Src, Flags: {}, SubReg: AMDGPU::sub0);
2884 BuildMI(BB&: *BB, I: &MI, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::COPY), DestReg: HiReg)
2885 .addReg(RegNo: Src, Flags: {}, SubReg: AMDGPU::sub1);
2886 BuildMI(BB&: *BB, I: &MI, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::S_MOV_B32), DestReg: ConstReg)
2887 .addImm(Val: 0x80000000);
2888
2889 // Set or toggle sign bit.
2890 unsigned Opc = Fabs ? AMDGPU::S_OR_B32 : AMDGPU::S_XOR_B32;
2891 BuildMI(BB&: *BB, I: &MI, MIMD: DL, MCID: TII.get(Opcode: Opc), DestReg: OpReg)
2892 .addReg(RegNo: HiReg)
2893 .addReg(RegNo: ConstReg)
2894 .setOperandDead(3); // Dead scc
2895 BuildMI(BB&: *BB, I: &MI, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::REG_SEQUENCE), DestReg: Dst)
2896 .addReg(RegNo: LoReg)
2897 .addImm(Val: AMDGPU::sub0)
2898 .addReg(RegNo: OpReg)
2899 .addImm(Val: AMDGPU::sub1);
2900 MI.eraseFromParent();
2901 return true;
2902}
2903
2904// FIXME: This is a workaround for the same tablegen problems as G_FNEG
2905bool AMDGPUInstructionSelector::selectG_FABS(MachineInstr &MI) const {
2906 Register Dst = MI.getOperand(i: 0).getReg();
2907 const RegisterBank *DstRB = RBI.getRegBank(Reg: Dst, MRI: *MRI, TRI);
2908 if (DstRB->getID() != AMDGPU::SGPRRegBankID ||
2909 MRI->getType(Reg: Dst) != LLT::scalar(SizeInBits: 64))
2910 return false;
2911
2912 Register Src = MI.getOperand(i: 1).getReg();
2913 MachineBasicBlock *BB = MI.getParent();
2914 const DebugLoc &DL = MI.getDebugLoc();
2915 Register LoReg = MRI->createVirtualRegister(RegClass: &AMDGPU::SReg_32RegClass);
2916 Register HiReg = MRI->createVirtualRegister(RegClass: &AMDGPU::SReg_32RegClass);
2917 Register ConstReg = MRI->createVirtualRegister(RegClass: &AMDGPU::SReg_32RegClass);
2918 Register OpReg = MRI->createVirtualRegister(RegClass: &AMDGPU::SReg_32RegClass);
2919
2920 if (!RBI.constrainGenericRegister(Reg: Src, RC: AMDGPU::SReg_64RegClass, MRI&: *MRI) ||
2921 !RBI.constrainGenericRegister(Reg: Dst, RC: AMDGPU::SReg_64RegClass, MRI&: *MRI))
2922 return false;
2923
2924 BuildMI(BB&: *BB, I: &MI, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::COPY), DestReg: LoReg)
2925 .addReg(RegNo: Src, Flags: {}, SubReg: AMDGPU::sub0);
2926 BuildMI(BB&: *BB, I: &MI, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::COPY), DestReg: HiReg)
2927 .addReg(RegNo: Src, Flags: {}, SubReg: AMDGPU::sub1);
2928 BuildMI(BB&: *BB, I: &MI, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::S_MOV_B32), DestReg: ConstReg)
2929 .addImm(Val: 0x7fffffff);
2930
2931 // Clear sign bit.
2932 // TODO: Should this use S_BITSET0_*?
2933 BuildMI(BB&: *BB, I: &MI, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::S_AND_B32), DestReg: OpReg)
2934 .addReg(RegNo: HiReg)
2935 .addReg(RegNo: ConstReg)
2936 .setOperandDead(3); // Dead scc
2937 BuildMI(BB&: *BB, I: &MI, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::REG_SEQUENCE), DestReg: Dst)
2938 .addReg(RegNo: LoReg)
2939 .addImm(Val: AMDGPU::sub0)
2940 .addReg(RegNo: OpReg)
2941 .addImm(Val: AMDGPU::sub1);
2942
2943 MI.eraseFromParent();
2944 return true;
2945}
2946
2947static bool isConstant(const MachineInstr &MI) {
2948 return MI.getOpcode() == TargetOpcode::G_CONSTANT;
2949}
2950
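/// Walk the chain of G_PTR_ADDs feeding the pointer operand of \p Load and
/// record, for each step, the constant offset and which addends live in SGPRs
/// versus VGPRs.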
2951void AMDGPUInstructionSelector::getAddrModeInfo(const MachineInstr &Load,
2952 const MachineRegisterInfo &MRI, SmallVectorImpl<GEPInfo> &AddrInfo) const {
2953
2954 unsigned OpNo = Load.getOpcode() == AMDGPU::G_PREFETCH ? 0 : 1;
2955 const MachineInstr *PtrMI =
2956 MRI.getUniqueVRegDef(Reg: Load.getOperand(i: OpNo).getReg());
2957
2958 assert(PtrMI);
2959
2960 if (PtrMI->getOpcode() != TargetOpcode::G_PTR_ADD)
2961 return;
2962
2963 GEPInfo GEPInfo;
2964
2965 for (unsigned i = 1; i != 3; ++i) {
2966 const MachineOperand &GEPOp = PtrMI->getOperand(i);
2967 const MachineInstr *OpDef = MRI.getUniqueVRegDef(Reg: GEPOp.getReg());
2968 assert(OpDef);
2969 if (i == 2 && isConstant(MI: *OpDef)) {
2970 // TODO: Could handle constant base + variable offset, but a combine
2971 // probably should have commuted it.
2972 assert(GEPInfo.Imm == 0);
2973 GEPInfo.Imm = OpDef->getOperand(i: 1).getCImm()->getSExtValue();
2974 continue;
2975 }
2976 const RegisterBank *OpBank = RBI.getRegBank(Reg: GEPOp.getReg(), MRI, TRI);
2977 if (OpBank->getID() == AMDGPU::SGPRRegBankID)
2978 GEPInfo.SgprParts.push_back(Elt: GEPOp.getReg());
2979 else
2980 GEPInfo.VgprParts.push_back(Elt: GEPOp.getReg());
2981 }
2982
2983 AddrInfo.push_back(Elt: GEPInfo);
2984 getAddrModeInfo(Load: *PtrMI, MRI, AddrInfo);
2985}
2986
2987bool AMDGPUInstructionSelector::isSGPR(Register Reg) const {
2988 return RBI.getRegBank(Reg, MRI: *MRI, TRI)->getID() == AMDGPU::SGPRRegBankID;
2989}
2990
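/// Return true if the single memory operand of \p MI is known to be a uniform
/// access: an argument, constant, or global pointer, the 32-bit constant
/// address space, a prefetch with an SGPR pointer, or a pointer annotated with
/// !amdgpu.uniform.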
2991bool AMDGPUInstructionSelector::isInstrUniform(const MachineInstr &MI) const {
2992 if (!MI.hasOneMemOperand())
2993 return false;
2994
2995 const MachineMemOperand *MMO = *MI.memoperands_begin();
2996 const Value *Ptr = MMO->getValue();
2997
2998 // UndefValue means this is a load of a kernel input. These are uniform.
2999 // Sometimes LDS instructions have constant pointers.
3000 // If Ptr is null, then that means this mem operand contains a
3001 // PseudoSourceValue like GOT.
3002 if (!Ptr || isa<UndefValue, Argument, Constant, GlobalValue>(Val: Ptr))
3003 return true;
3004
3005 if (MMO->getAddrSpace() == AMDGPUAS::CONSTANT_ADDRESS_32BIT)
3006 return true;
3007
3008 if (MI.getOpcode() == AMDGPU::G_PREFETCH)
3009 return RBI.getRegBank(Reg: MI.getOperand(i: 0).getReg(), MRI: *MRI, TRI)->getID() ==
3010 AMDGPU::SGPRRegBankID;
3011
3012 const Instruction *I = dyn_cast<Instruction>(Val: Ptr);
3013 return I && I->getMetadata(Kind: "amdgpu.uniform");
3014}
3015
3016bool AMDGPUInstructionSelector::hasVgprParts(ArrayRef<GEPInfo> AddrInfo) const {
3017 for (const GEPInfo &GEPInfo : AddrInfo) {
3018 if (!GEPInfo.VgprParts.empty())
3019 return true;
3020 }
3021 return false;
3022}
3023
3024void AMDGPUInstructionSelector::initM0(MachineInstr &I) const {
3025 const LLT PtrTy = MRI->getType(Reg: I.getOperand(i: 1).getReg());
3026 unsigned AS = PtrTy.getAddressSpace();
3027 if ((AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) &&
3028 STI.ldsRequiresM0Init()) {
3029 MachineBasicBlock *BB = I.getParent();
3030
3031 // If DS instructions require M0 initialization, insert it before selecting.
3032 BuildMI(BB&: *BB, I: &I, MIMD: I.getDebugLoc(), MCID: TII.get(Opcode: AMDGPU::S_MOV_B32), DestReg: AMDGPU::M0)
3033 .addImm(Val: -1);
3034 }
3035}
3036
3037bool AMDGPUInstructionSelector::selectG_LOAD_STORE_ATOMICRMW(
3038 MachineInstr &I) const {
3039 initM0(I);
3040 return selectImpl(I, CoverageInfo&: *CoverageInfo);
3041}
3042
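/// Return true if \p Reg is, possibly through copies and bitwise combinations,
/// the result of a VALU comparison (or llvm.amdgcn.class), i.e. already a lane
/// mask that does not need an additional AND with exec.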
3043static bool isVCmpResult(Register Reg, MachineRegisterInfo &MRI) {
3044 if (Reg.isPhysical())
3045 return false;
3046
3047 MachineInstr &MI = *MRI.getUniqueVRegDef(Reg);
3048 const unsigned Opcode = MI.getOpcode();
3049
3050 if (Opcode == AMDGPU::COPY)
3051 return isVCmpResult(Reg: MI.getOperand(i: 1).getReg(), MRI);
3052
3053 if (Opcode == AMDGPU::G_AND || Opcode == AMDGPU::G_OR ||
3054 Opcode == AMDGPU::G_XOR)
3055 return isVCmpResult(Reg: MI.getOperand(i: 1).getReg(), MRI) &&
3056 isVCmpResult(Reg: MI.getOperand(i: 2).getReg(), MRI);
3057
3058 if (auto *GI = dyn_cast<GIntrinsic>(Val: &MI))
3059 return GI->is(ID: Intrinsic::amdgcn_class);
3060
3061 return Opcode == AMDGPU::G_ICMP || Opcode == AMDGPU::G_FCMP;
3062}
3063
3064bool AMDGPUInstructionSelector::selectG_BRCOND(MachineInstr &I) const {
3065 MachineBasicBlock *BB = I.getParent();
3066 MachineOperand &CondOp = I.getOperand(i: 0);
3067 Register CondReg = CondOp.getReg();
3068 const DebugLoc &DL = I.getDebugLoc();
3069
3070 unsigned BrOpcode;
3071 Register CondPhysReg;
3072 const TargetRegisterClass *ConstrainRC;
3073
3074 // In SelectionDAG, we inspect the IR block for uniformity metadata to decide
3075 // whether the branch is uniform when selecting the instruction. In
3076 // GlobalISel, we should push that decision into RegBankSelect. Assume for now
3077 // RegBankSelect knows what it's doing if the branch condition is scc, even
3078 // though it currently does not.
3079 if (!isVCC(Reg: CondReg, MRI: *MRI)) {
3080 if (MRI->getType(Reg: CondReg) != LLT::scalar(SizeInBits: 32))
3081 return false;
3082
3083 CondPhysReg = AMDGPU::SCC;
3084 BrOpcode = AMDGPU::S_CBRANCH_SCC1;
3085 ConstrainRC = &AMDGPU::SReg_32RegClass;
3086 } else {
3087 // FIXME: Should scc->vcc copies be ANDed with exec?
3088
3089 // Unless the value of CondReg is the result of a V_CMP* instruction, we need
3090 // to insert an AND with exec.
3091 if (!isVCmpResult(Reg: CondReg, MRI&: *MRI)) {
3092 const bool Is64 = STI.isWave64();
3093 const unsigned Opcode = Is64 ? AMDGPU::S_AND_B64 : AMDGPU::S_AND_B32;
3094 const Register Exec = Is64 ? AMDGPU::EXEC : AMDGPU::EXEC_LO;
3095
3096 Register TmpReg = MRI->createVirtualRegister(RegClass: TRI.getBoolRC());
3097 BuildMI(BB&: *BB, I: &I, MIMD: DL, MCID: TII.get(Opcode), DestReg: TmpReg)
3098 .addReg(RegNo: CondReg)
3099 .addReg(RegNo: Exec)
3100 .setOperandDead(3); // Dead scc
3101 CondReg = TmpReg;
3102 }
3103
3104 CondPhysReg = TRI.getVCC();
3105 BrOpcode = AMDGPU::S_CBRANCH_VCCNZ;
3106 ConstrainRC = TRI.getBoolRC();
3107 }
3108
3109 if (!MRI->getRegClassOrNull(Reg: CondReg))
3110 MRI->setRegClass(Reg: CondReg, RC: ConstrainRC);
3111
3112 BuildMI(BB&: *BB, I: &I, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::COPY), DestReg: CondPhysReg)
3113 .addReg(RegNo: CondReg);
3114 BuildMI(BB&: *BB, I: &I, MIMD: DL, MCID: TII.get(Opcode: BrOpcode))
3115 .addMBB(MBB: I.getOperand(i: 1).getMBB());
3116
3117 I.eraseFromParent();
3118 return true;
3119}
3120
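/// Select G_GLOBAL_VALUE as a plain 32-bit move (S_MOV_B32 or V_MOV_B32
/// depending on the destination bank), keeping the global-address operand in
/// place.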
3121bool AMDGPUInstructionSelector::selectG_GLOBAL_VALUE(
3122 MachineInstr &I) const {
3123 Register DstReg = I.getOperand(i: 0).getReg();
3124 const RegisterBank *DstRB = RBI.getRegBank(Reg: DstReg, MRI: *MRI, TRI);
3125 const bool IsVGPR = DstRB->getID() == AMDGPU::VGPRRegBankID;
3126 I.setDesc(TII.get(Opcode: IsVGPR ? AMDGPU::V_MOV_B32_e32 : AMDGPU::S_MOV_B32));
3127 if (IsVGPR)
3128 I.addOperand(MF&: *MF, Op: MachineOperand::CreateReg(Reg: AMDGPU::EXEC, isDef: false, isImp: true));
3129
3130 return RBI.constrainGenericRegister(
3131 Reg: DstReg, RC: IsVGPR ? AMDGPU::VGPR_32RegClass : AMDGPU::SReg_32RegClass, MRI&: *MRI);
3132}
3133
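/// Select G_PTRMASK as AND operations. For 64-bit pointers, the known bits of
/// the mask are used to emit a plain copy for any 32-bit half that is fully
/// preserved.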
3134bool AMDGPUInstructionSelector::selectG_PTRMASK(MachineInstr &I) const {
3135 Register DstReg = I.getOperand(i: 0).getReg();
3136 Register SrcReg = I.getOperand(i: 1).getReg();
3137 Register MaskReg = I.getOperand(i: 2).getReg();
3138 LLT Ty = MRI->getType(Reg: DstReg);
3139 LLT MaskTy = MRI->getType(Reg: MaskReg);
3140 MachineBasicBlock *BB = I.getParent();
3141 const DebugLoc &DL = I.getDebugLoc();
3142
3143 const RegisterBank *DstRB = RBI.getRegBank(Reg: DstReg, MRI: *MRI, TRI);
3144 const RegisterBank *SrcRB = RBI.getRegBank(Reg: SrcReg, MRI: *MRI, TRI);
3145 const RegisterBank *MaskRB = RBI.getRegBank(Reg: MaskReg, MRI: *MRI, TRI);
3146 const bool IsVGPR = DstRB->getID() == AMDGPU::VGPRRegBankID;
3147 if (DstRB != SrcRB) // Should only happen for hand written MIR.
3148 return false;
3149
3150 // Try to avoid emitting a bit operation when we only need to touch half of
3151 // the 64-bit pointer.
3152 APInt MaskOnes = VT->getKnownOnes(R: MaskReg).zext(width: 64);
3153 const APInt MaskHi32 = APInt::getHighBitsSet(numBits: 64, hiBitsSet: 32);
3154 const APInt MaskLo32 = APInt::getLowBitsSet(numBits: 64, loBitsSet: 32);
3155
3156 const bool CanCopyLow32 = (MaskOnes & MaskLo32) == MaskLo32;
3157 const bool CanCopyHi32 = (MaskOnes & MaskHi32) == MaskHi32;
3158
3159 if (!IsVGPR && Ty.getSizeInBits() == 64 &&
3160 !CanCopyLow32 && !CanCopyHi32) {
3161 auto MIB = BuildMI(BB&: *BB, I: &I, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::S_AND_B64), DestReg: DstReg)
3162 .addReg(RegNo: SrcReg)
3163 .addReg(RegNo: MaskReg)
3164 .setOperandDead(3); // Dead scc
3165 I.eraseFromParent();
3166 return constrainSelectedInstRegOperands(I&: *MIB, TII, TRI, RBI);
3167 }
3168
3169 unsigned NewOpc = IsVGPR ? AMDGPU::V_AND_B32_e64 : AMDGPU::S_AND_B32;
3170 const TargetRegisterClass &RegRC
3171 = IsVGPR ? AMDGPU::VGPR_32RegClass : AMDGPU::SReg_32RegClass;
3172
3173 const TargetRegisterClass *DstRC = TRI.getRegClassForTypeOnBank(Ty, Bank: *DstRB);
3174 const TargetRegisterClass *SrcRC = TRI.getRegClassForTypeOnBank(Ty, Bank: *SrcRB);
3175 const TargetRegisterClass *MaskRC =
3176 TRI.getRegClassForTypeOnBank(Ty: MaskTy, Bank: *MaskRB);
3177
3178 if (!RBI.constrainGenericRegister(Reg: DstReg, RC: *DstRC, MRI&: *MRI) ||
3179 !RBI.constrainGenericRegister(Reg: SrcReg, RC: *SrcRC, MRI&: *MRI) ||
3180 !RBI.constrainGenericRegister(Reg: MaskReg, RC: *MaskRC, MRI&: *MRI))
3181 return false;
3182
3183 if (Ty.getSizeInBits() == 32) {
3184 assert(MaskTy.getSizeInBits() == 32 &&
3185 "ptrmask should have been narrowed during legalize");
3186
3187 auto NewOp = BuildMI(BB&: *BB, I: &I, MIMD: DL, MCID: TII.get(Opcode: NewOpc), DestReg: DstReg)
3188 .addReg(RegNo: SrcReg)
3189 .addReg(RegNo: MaskReg);
3190
3191 if (!IsVGPR)
3192 NewOp.setOperandDead(3); // Dead scc
3193 I.eraseFromParent();
3194 return true;
3195 }
3196
3197 Register HiReg = MRI->createVirtualRegister(RegClass: &RegRC);
3198 Register LoReg = MRI->createVirtualRegister(RegClass: &RegRC);
3199
3200 // Extract the subregisters from the source pointer.
3201 BuildMI(BB&: *BB, I: &I, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::COPY), DestReg: LoReg)
3202 .addReg(RegNo: SrcReg, Flags: {}, SubReg: AMDGPU::sub0);
3203 BuildMI(BB&: *BB, I: &I, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::COPY), DestReg: HiReg)
3204 .addReg(RegNo: SrcReg, Flags: {}, SubReg: AMDGPU::sub1);
3205
3206 Register MaskedLo, MaskedHi;
3207
3208 if (CanCopyLow32) {
3209 // If all the bits in the low half are 1, we only need a copy for it.
3210 MaskedLo = LoReg;
3211 } else {
3212 // Extract the mask subregister and apply the and.
3213 Register MaskLo = MRI->createVirtualRegister(RegClass: &RegRC);
3214 MaskedLo = MRI->createVirtualRegister(RegClass: &RegRC);
3215
3216 BuildMI(BB&: *BB, I: &I, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::COPY), DestReg: MaskLo)
3217 .addReg(RegNo: MaskReg, Flags: {}, SubReg: AMDGPU::sub0);
3218 BuildMI(BB&: *BB, I: &I, MIMD: DL, MCID: TII.get(Opcode: NewOpc), DestReg: MaskedLo)
3219 .addReg(RegNo: LoReg)
3220 .addReg(RegNo: MaskLo);
3221 }
3222
3223 if (CanCopyHi32) {
3224 // If all the bits in the high half are 1, we only need a copy for it.
3225 MaskedHi = HiReg;
3226 } else {
3227 Register MaskHi = MRI->createVirtualRegister(RegClass: &RegRC);
3228 MaskedHi = MRI->createVirtualRegister(RegClass: &RegRC);
3229
3230 BuildMI(BB&: *BB, I: &I, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::COPY), DestReg: MaskHi)
3231 .addReg(RegNo: MaskReg, Flags: {}, SubReg: AMDGPU::sub1);
3232 BuildMI(BB&: *BB, I: &I, MIMD: DL, MCID: TII.get(Opcode: NewOpc), DestReg: MaskedHi)
3233 .addReg(RegNo: HiReg)
3234 .addReg(RegNo: MaskHi);
3235 }
3236
3237 BuildMI(BB&: *BB, I: &I, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::REG_SEQUENCE), DestReg: DstReg)
3238 .addReg(RegNo: MaskedLo)
3239 .addImm(Val: AMDGPU::sub0)
3240 .addReg(RegNo: MaskedHi)
3241 .addImm(Val: AMDGPU::sub1);
3242 I.eraseFromParent();
3243 return true;
3244}
3245
3246/// Return the register to use for the index value, and the subregister to use
3247/// for the indirectly accessed register.
3248static std::pair<Register, unsigned>
3249computeIndirectRegIndex(MachineRegisterInfo &MRI, const SIRegisterInfo &TRI,
3250 const TargetRegisterClass *SuperRC, Register IdxReg,
3251 unsigned EltSize, GISelValueTracking &ValueTracking) {
3252 Register IdxBaseReg;
3253 int Offset;
3254
3255 std::tie(args&: IdxBaseReg, args&: Offset) =
3256 AMDGPU::getBaseWithConstantOffset(MRI, Reg: IdxReg, ValueTracking: &ValueTracking);
3257 if (IdxBaseReg == AMDGPU::NoRegister) {
3258 // This will happen if the index is a known constant. This should ordinarily
3259 // be legalized out, but handle it as a register just in case.
3260 assert(Offset == 0);
3261 IdxBaseReg = IdxReg;
3262 }
3263
3264 ArrayRef<int16_t> SubRegs = TRI.getRegSplitParts(RC: SuperRC, EltSize);
3265
3266 // Skip out of bounds offsets, or else we would end up using an undefined
3267 // register.
3268 if (static_cast<unsigned>(Offset) >= SubRegs.size())
3269 return std::pair(IdxReg, SubRegs[0]);
3270 return std::pair(IdxBaseReg, SubRegs[Offset]);
3271}
3272
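/// Select a dynamically indexed extract. SGPR vectors use M0 with S_MOVRELS;
/// VGPR vectors use M0 with V_MOVRELS or, when available, the GPR index mode
/// pseudos.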
3273bool AMDGPUInstructionSelector::selectG_EXTRACT_VECTOR_ELT(
3274 MachineInstr &MI) const {
3275 Register DstReg = MI.getOperand(i: 0).getReg();
3276 Register SrcReg = MI.getOperand(i: 1).getReg();
3277 Register IdxReg = MI.getOperand(i: 2).getReg();
3278
3279 LLT DstTy = MRI->getType(Reg: DstReg);
3280 LLT SrcTy = MRI->getType(Reg: SrcReg);
3281
3282 const RegisterBank *DstRB = RBI.getRegBank(Reg: DstReg, MRI: *MRI, TRI);
3283 const RegisterBank *SrcRB = RBI.getRegBank(Reg: SrcReg, MRI: *MRI, TRI);
3284 const RegisterBank *IdxRB = RBI.getRegBank(Reg: IdxReg, MRI: *MRI, TRI);
3285
3286 // The index must be scalar. If it wasn't, RegBankSelect should have moved this
3287 // into a waterfall loop.
3288 if (IdxRB->getID() != AMDGPU::SGPRRegBankID)
3289 return false;
3290
3291 const TargetRegisterClass *SrcRC =
3292 TRI.getRegClassForTypeOnBank(Ty: SrcTy, Bank: *SrcRB);
3293 const TargetRegisterClass *DstRC =
3294 TRI.getRegClassForTypeOnBank(Ty: DstTy, Bank: *DstRB);
3295 if (!SrcRC || !DstRC)
3296 return false;
3297 if (!RBI.constrainGenericRegister(Reg: SrcReg, RC: *SrcRC, MRI&: *MRI) ||
3298 !RBI.constrainGenericRegister(Reg: DstReg, RC: *DstRC, MRI&: *MRI) ||
3299 !RBI.constrainGenericRegister(Reg: IdxReg, RC: AMDGPU::SReg_32RegClass, MRI&: *MRI))
3300 return false;
3301
3302 MachineBasicBlock *BB = MI.getParent();
3303 const DebugLoc &DL = MI.getDebugLoc();
3304 const bool Is64 = DstTy.getSizeInBits() == 64;
3305
3306 unsigned SubReg;
3307 std::tie(args&: IdxReg, args&: SubReg) = computeIndirectRegIndex(
3308 MRI&: *MRI, TRI, SuperRC: SrcRC, IdxReg, EltSize: DstTy.getSizeInBits() / 8, ValueTracking&: *VT);
3309
3310 if (SrcRB->getID() == AMDGPU::SGPRRegBankID) {
3311 if (DstTy.getSizeInBits() != 32 && !Is64)
3312 return false;
3313
3314 BuildMI(BB&: *BB, I: &MI, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::COPY), DestReg: AMDGPU::M0)
3315 .addReg(RegNo: IdxReg);
3316
3317 unsigned Opc = Is64 ? AMDGPU::S_MOVRELS_B64 : AMDGPU::S_MOVRELS_B32;
3318 BuildMI(BB&: *BB, I: &MI, MIMD: DL, MCID: TII.get(Opcode: Opc), DestReg: DstReg)
3319 .addReg(RegNo: SrcReg, Flags: {}, SubReg)
3320 .addReg(RegNo: SrcReg, Flags: RegState::Implicit);
3321 MI.eraseFromParent();
3322 return true;
3323 }
3324
3325 if (SrcRB->getID() != AMDGPU::VGPRRegBankID || DstTy.getSizeInBits() != 32)
3326 return false;
3327
3328 if (!STI.useVGPRIndexMode()) {
3329 BuildMI(BB&: *BB, I: &MI, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::COPY), DestReg: AMDGPU::M0)
3330 .addReg(RegNo: IdxReg);
3331 BuildMI(BB&: *BB, I: &MI, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::V_MOVRELS_B32_e32), DestReg: DstReg)
3332 .addReg(RegNo: SrcReg, Flags: {}, SubReg)
3333 .addReg(RegNo: SrcReg, Flags: RegState::Implicit);
3334 MI.eraseFromParent();
3335 return true;
3336 }
3337
3338 const MCInstrDesc &GPRIDXDesc =
3339 TII.getIndirectGPRIDXPseudo(VecSize: TRI.getRegSizeInBits(RC: *SrcRC), IsIndirectSrc: true);
3340 BuildMI(BB&: *BB, I&: MI, MIMD: DL, MCID: GPRIDXDesc, DestReg: DstReg)
3341 .addReg(RegNo: SrcReg)
3342 .addReg(RegNo: IdxReg)
3343 .addImm(Val: SubReg);
3344
3345 MI.eraseFromParent();
3346 return true;
3347}
3348
3349// TODO: Fold insert_vector_elt (extract_vector_elt) into movrelsd
3350bool AMDGPUInstructionSelector::selectG_INSERT_VECTOR_ELT(
3351 MachineInstr &MI) const {
3352 Register DstReg = MI.getOperand(i: 0).getReg();
3353 Register VecReg = MI.getOperand(i: 1).getReg();
3354 Register ValReg = MI.getOperand(i: 2).getReg();
3355 Register IdxReg = MI.getOperand(i: 3).getReg();
3356
3357 LLT VecTy = MRI->getType(Reg: DstReg);
3358 LLT ValTy = MRI->getType(Reg: ValReg);
3359 unsigned VecSize = VecTy.getSizeInBits();
3360 unsigned ValSize = ValTy.getSizeInBits();
3361
3362 const RegisterBank *VecRB = RBI.getRegBank(Reg: VecReg, MRI: *MRI, TRI);
3363 const RegisterBank *ValRB = RBI.getRegBank(Reg: ValReg, MRI: *MRI, TRI);
3364 const RegisterBank *IdxRB = RBI.getRegBank(Reg: IdxReg, MRI: *MRI, TRI);
3365
3366 assert(VecTy.getElementType() == ValTy);
3367
3368 // The index must be scalar. If it wasn't, RegBankSelect should have moved this
3369 // into a waterfall loop.
3370 if (IdxRB->getID() != AMDGPU::SGPRRegBankID)
3371 return false;
3372
3373 const TargetRegisterClass *VecRC =
3374 TRI.getRegClassForTypeOnBank(Ty: VecTy, Bank: *VecRB);
3375 const TargetRegisterClass *ValRC =
3376 TRI.getRegClassForTypeOnBank(Ty: ValTy, Bank: *ValRB);
3377
3378 if (!RBI.constrainGenericRegister(Reg: VecReg, RC: *VecRC, MRI&: *MRI) ||
3379 !RBI.constrainGenericRegister(Reg: DstReg, RC: *VecRC, MRI&: *MRI) ||
3380 !RBI.constrainGenericRegister(Reg: ValReg, RC: *ValRC, MRI&: *MRI) ||
3381 !RBI.constrainGenericRegister(Reg: IdxReg, RC: AMDGPU::SReg_32RegClass, MRI&: *MRI))
3382 return false;
3383
3384 if (VecRB->getID() == AMDGPU::VGPRRegBankID && ValSize != 32)
3385 return false;
3386
3387 unsigned SubReg;
3388 std::tie(args&: IdxReg, args&: SubReg) =
3389 computeIndirectRegIndex(MRI&: *MRI, TRI, SuperRC: VecRC, IdxReg, EltSize: ValSize / 8, ValueTracking&: *VT);
3390
3391 const bool IndexMode = VecRB->getID() == AMDGPU::VGPRRegBankID &&
3392 STI.useVGPRIndexMode();
3393
3394 MachineBasicBlock *BB = MI.getParent();
3395 const DebugLoc &DL = MI.getDebugLoc();
3396
3397 if (!IndexMode) {
3398 BuildMI(BB&: *BB, I: &MI, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::COPY), DestReg: AMDGPU::M0)
3399 .addReg(RegNo: IdxReg);
3400
3401 const MCInstrDesc &RegWriteOp = TII.getIndirectRegWriteMovRelPseudo(
3402 VecSize, EltSize: ValSize, IsSGPR: VecRB->getID() == AMDGPU::SGPRRegBankID);
3403 BuildMI(BB&: *BB, I&: MI, MIMD: DL, MCID: RegWriteOp, DestReg: DstReg)
3404 .addReg(RegNo: VecReg)
3405 .addReg(RegNo: ValReg)
3406 .addImm(Val: SubReg);
3407 MI.eraseFromParent();
3408 return true;
3409 }
3410
3411 const MCInstrDesc &GPRIDXDesc =
3412 TII.getIndirectGPRIDXPseudo(VecSize: TRI.getRegSizeInBits(RC: *VecRC), IsIndirectSrc: false);
3413 BuildMI(BB&: *BB, I&: MI, MIMD: DL, MCID: GPRIDXDesc, DestReg: DstReg)
3414 .addReg(RegNo: VecReg)
3415 .addReg(RegNo: ValReg)
3416 .addReg(RegNo: IdxReg)
3417 .addImm(Val: SubReg);
3418
3419 MI.eraseFromParent();
3420 return true;
3421}
3422
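/// Select the raw/struct buffer-load-to-LDS intrinsics. M0 carries the LDS
/// destination address, and the BUFFER_LOAD_*_LDS opcode variant is chosen
/// from the load size and from whether a vindex and/or a non-zero voffset is
/// present.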
3423bool AMDGPUInstructionSelector::selectBufferLoadLds(MachineInstr &MI) const {
3424 if (!Subtarget->hasVMemToLDSLoad())
3425 return false;
3426 unsigned Opc;
3427 unsigned Size = MI.getOperand(i: 3).getImm();
3428
3429 // The struct intrinsic variants add one additional operand over raw.
3430 const bool HasVIndex = MI.getNumOperands() == 9;
3431 Register VIndex;
3432 int OpOffset = 0;
3433 if (HasVIndex) {
3434 VIndex = MI.getOperand(i: 4).getReg();
3435 OpOffset = 1;
3436 }
3437
3438 Register VOffset = MI.getOperand(i: 4 + OpOffset).getReg();
3439 std::optional<ValueAndVReg> MaybeVOffset =
3440 getIConstantVRegValWithLookThrough(VReg: VOffset, MRI: *MRI);
3441 const bool HasVOffset = !MaybeVOffset || MaybeVOffset->Value.getZExtValue();
3442
3443 switch (Size) {
3444 default:
3445 return false;
3446 case 1:
3447 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_UBYTE_LDS_BOTHEN
3448 : AMDGPU::BUFFER_LOAD_UBYTE_LDS_IDXEN
3449 : HasVOffset ? AMDGPU::BUFFER_LOAD_UBYTE_LDS_OFFEN
3450 : AMDGPU::BUFFER_LOAD_UBYTE_LDS_OFFSET;
3451 break;
3452 case 2:
3453 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_USHORT_LDS_BOTHEN
3454 : AMDGPU::BUFFER_LOAD_USHORT_LDS_IDXEN
3455 : HasVOffset ? AMDGPU::BUFFER_LOAD_USHORT_LDS_OFFEN
3456 : AMDGPU::BUFFER_LOAD_USHORT_LDS_OFFSET;
3457 break;
3458 case 4:
3459 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_DWORD_LDS_BOTHEN
3460 : AMDGPU::BUFFER_LOAD_DWORD_LDS_IDXEN
3461 : HasVOffset ? AMDGPU::BUFFER_LOAD_DWORD_LDS_OFFEN
3462 : AMDGPU::BUFFER_LOAD_DWORD_LDS_OFFSET;
3463 break;
3464 case 12:
3465 if (!Subtarget->hasLDSLoadB96_B128())
3466 return false;
3467
3468 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX3_LDS_BOTHEN
3469 : AMDGPU::BUFFER_LOAD_DWORDX3_LDS_IDXEN
3470 : HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX3_LDS_OFFEN
3471 : AMDGPU::BUFFER_LOAD_DWORDX3_LDS_OFFSET;
3472 break;
3473 case 16:
3474 if (!Subtarget->hasLDSLoadB96_B128())
3475 return false;
3476
3477 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX4_LDS_BOTHEN
3478 : AMDGPU::BUFFER_LOAD_DWORDX4_LDS_IDXEN
3479 : HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX4_LDS_OFFEN
3480 : AMDGPU::BUFFER_LOAD_DWORDX4_LDS_OFFSET;
3481 break;
3482 }
3483
3484 MachineBasicBlock *MBB = MI.getParent();
3485 const DebugLoc &DL = MI.getDebugLoc();
3486 BuildMI(BB&: *MBB, I: &MI, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::COPY), DestReg: AMDGPU::M0)
3487 .add(MO: MI.getOperand(i: 2));
3488
3489 auto MIB = BuildMI(BB&: *MBB, I: &MI, MIMD: DL, MCID: TII.get(Opcode: Opc));
3490
3491 if (HasVIndex && HasVOffset) {
3492 Register IdxReg = MRI->createVirtualRegister(RegClass: TRI.getVGPR64Class());
3493 BuildMI(BB&: *MBB, I: &*MIB, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::REG_SEQUENCE), DestReg: IdxReg)
3494 .addReg(RegNo: VIndex)
3495 .addImm(Val: AMDGPU::sub0)
3496 .addReg(RegNo: VOffset)
3497 .addImm(Val: AMDGPU::sub1);
3498
3499 MIB.addReg(RegNo: IdxReg);
3500 } else if (HasVIndex) {
3501 MIB.addReg(RegNo: VIndex);
3502 } else if (HasVOffset) {
3503 MIB.addReg(RegNo: VOffset);
3504 }
3505
3506 MIB.add(MO: MI.getOperand(i: 1)); // rsrc
3507 MIB.add(MO: MI.getOperand(i: 5 + OpOffset)); // soffset
3508 MIB.add(MO: MI.getOperand(i: 6 + OpOffset)); // imm offset
3509 bool IsGFX12Plus = AMDGPU::isGFX12Plus(STI);
3510 unsigned Aux = MI.getOperand(i: 7 + OpOffset).getImm();
3511 MIB.addImm(Val: Aux & (IsGFX12Plus ? AMDGPU::CPol::ALL
3512 : AMDGPU::CPol::ALL_pregfx12)); // cpol
3513 MIB.addImm(
3514 Val: Aux & (IsGFX12Plus ? AMDGPU::CPol::SWZ : AMDGPU::CPol::SWZ_pregfx12)
3515 ? 1
3516 : 0); // swz
3517
3518 MachineMemOperand *LoadMMO = *MI.memoperands_begin();
3519 // Don't set the offset value here because the pointer points to the base of
3520 // the buffer.
3521 MachinePointerInfo LoadPtrI = LoadMMO->getPointerInfo();
3522
3523 MachinePointerInfo StorePtrI = LoadPtrI;
3524 LoadPtrI.V = PoisonValue::get(T: PointerType::get(C&: MF->getFunction().getContext(),
3525 AddressSpace: AMDGPUAS::BUFFER_RESOURCE));
3526 LoadPtrI.AddrSpace = AMDGPUAS::BUFFER_RESOURCE;
3527 StorePtrI.AddrSpace = AMDGPUAS::LOCAL_ADDRESS;
3528
3529 auto F = LoadMMO->getFlags() &
3530 ~(MachineMemOperand::MOStore | MachineMemOperand::MOLoad);
3531 LoadMMO = MF->getMachineMemOperand(PtrInfo: LoadPtrI, F: F | MachineMemOperand::MOLoad,
3532 Size, BaseAlignment: LoadMMO->getBaseAlign());
3533
3534 MachineMemOperand *StoreMMO =
3535 MF->getMachineMemOperand(PtrInfo: StorePtrI, F: F | MachineMemOperand::MOStore,
3536 Size: sizeof(int32_t), BaseAlignment: LoadMMO->getBaseAlign());
3537
3538 MIB.setMemRefs({LoadMMO, StoreMMO});
3539
3540 MI.eraseFromParent();
3541 return constrainSelectedInstRegOperands(I&: *MIB, TII, TRI, RBI);
3542}
3543
3544/// Match a zero extend from a 32-bit value to 64 bits.
3545Register AMDGPUInstructionSelector::matchZeroExtendFromS32(Register Reg) const {
3546 Register ZExtSrc;
3547 if (mi_match(R: Reg, MRI: *MRI, P: m_GZExt(Src: m_Reg(R&: ZExtSrc))))
3548 return MRI->getType(Reg: ZExtSrc) == LLT::scalar(SizeInBits: 32) ? ZExtSrc : Register();
3549
3550 // Match legalized form %zext = G_MERGE_VALUES (s32 %x), (s32 0)
3551 const MachineInstr *Def = getDefIgnoringCopies(Reg, MRI: *MRI);
3552 if (Def->getOpcode() != AMDGPU::G_MERGE_VALUES)
3553 return Register();
3554
3555 assert(Def->getNumOperands() == 3 &&
3556 MRI->getType(Def->getOperand(0).getReg()) == LLT::scalar(64));
3557 if (mi_match(R: Def->getOperand(i: 2).getReg(), MRI: *MRI, P: m_ZeroInt())) {
3558 return Def->getOperand(i: 1).getReg();
3559 }
3560
3561 return Register();
3562}
3563
3564/// Match a sign extend from a 32-bit value to 64 bits.
3565Register AMDGPUInstructionSelector::matchSignExtendFromS32(Register Reg) const {
3566 Register SExtSrc;
3567 if (mi_match(R: Reg, MRI: *MRI, P: m_GSExt(Src: m_Reg(R&: SExtSrc))))
3568 return MRI->getType(Reg: SExtSrc) == LLT::scalar(SizeInBits: 32) ? SExtSrc : Register();
3569
3570 // Match legalized form %sext = G_MERGE_VALUES (s32 %x), (s32 G_ASHR %x, 31)
3571 const MachineInstr *Def = getDefIgnoringCopies(Reg, MRI: *MRI);
3572 if (Def->getOpcode() != AMDGPU::G_MERGE_VALUES)
3573 return Register();
3574
3575 assert(Def->getNumOperands() == 3 &&
3576 MRI->getType(Def->getOperand(0).getReg()) == LLT::scalar(64));
3577 if (mi_match(R: Def->getOperand(i: 2).getReg(), MRI: *MRI,
3578 P: m_GAShr(L: m_SpecificReg(RequestedReg: Def->getOperand(i: 1).getReg()),
3579 R: m_SpecificICst(RequestedValue: 31))))
3580 return Def->getOperand(i: 1).getReg();
3581
3582 if (VT->signBitIsZero(Op: Reg))
3583 return matchZeroExtendFromS32(Reg);
3584
3585 return Register();
3586}
3587
3588/// Match a zero extend from a 32-bit value to 64 bits, or \p Reg itself if it
3589/// is 32-bit.
3590Register
3591AMDGPUInstructionSelector::matchZeroExtendFromS32OrS32(Register Reg) const {
3592 return MRI->getType(Reg) == LLT::scalar(SizeInBits: 32) ? Reg
3593 : matchZeroExtendFromS32(Reg);
3594}
3595
3596/// Match a sign extend from a 32-bit value to 64 bits, or \p Reg itself if it
3597/// is 32-bit.
3598Register
3599AMDGPUInstructionSelector::matchSignExtendFromS32OrS32(Register Reg) const {
3600 return MRI->getType(Reg) == LLT::scalar(SizeInBits: 32) ? Reg
3601 : matchSignExtendFromS32(Reg);
3602}
3603
3604Register
3605AMDGPUInstructionSelector::matchExtendFromS32OrS32(Register Reg,
3606 bool IsSigned) const {
3607 if (IsSigned)
3608 return matchSignExtendFromS32OrS32(Reg);
3609
3610 return matchZeroExtendFromS32OrS32(Reg);
3611}
3612
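/// Match an any-extend from a 32-bit value to 64 bits.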
3613Register AMDGPUInstructionSelector::matchAnyExtendFromS32(Register Reg) const {
3614 Register AnyExtSrc;
3615 if (mi_match(R: Reg, MRI: *MRI, P: m_GAnyExt(Src: m_Reg(R&: AnyExtSrc))))
3616 return MRI->getType(Reg: AnyExtSrc) == LLT::scalar(SizeInBits: 32) ? AnyExtSrc : Register();
3617
3618 // Match legalized form %zext = G_MERGE_VALUES (s32 %x), (s32 G_IMPLICIT_DEF)
3619 const MachineInstr *Def = getDefIgnoringCopies(Reg, MRI: *MRI);
3620 if (Def->getOpcode() != AMDGPU::G_MERGE_VALUES)
3621 return Register();
3622
3623 assert(Def->getNumOperands() == 3 &&
3624 MRI->getType(Def->getOperand(0).getReg()) == LLT::scalar(64));
3625
3626 if (mi_match(R: Def->getOperand(i: 2).getReg(), MRI: *MRI, P: m_GImplicitDef()))
3627 return Def->getOperand(i: 1).getReg();
3628
3629 return Register();
3630}
3631
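/// Select the global-load-to-LDS intrinsics. M0 carries the LDS destination
/// address; when an SGPR base can be split out of the address, the SADDR form
/// is used with the remaining (zero-extended) offset in a VGPR.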
3632bool AMDGPUInstructionSelector::selectGlobalLoadLds(MachineInstr &MI) const{
3633 if (!Subtarget->hasVMemToLDSLoad())
3634 return false;
3635
3636 unsigned Opc;
3637 unsigned Size = MI.getOperand(i: 3).getImm();
3638
3639 switch (Size) {
3640 default:
3641 return false;
3642 case 1:
3643 Opc = AMDGPU::GLOBAL_LOAD_LDS_UBYTE;
3644 break;
3645 case 2:
3646 Opc = AMDGPU::GLOBAL_LOAD_LDS_USHORT;
3647 break;
3648 case 4:
3649 Opc = AMDGPU::GLOBAL_LOAD_LDS_DWORD;
3650 break;
3651 case 12:
3652 if (!Subtarget->hasLDSLoadB96_B128())
3653 return false;
3654 Opc = AMDGPU::GLOBAL_LOAD_LDS_DWORDX3;
3655 break;
3656 case 16:
3657 if (!Subtarget->hasLDSLoadB96_B128())
3658 return false;
3659 Opc = AMDGPU::GLOBAL_LOAD_LDS_DWORDX4;
3660 break;
3661 }
3662
3663 MachineBasicBlock *MBB = MI.getParent();
3664 const DebugLoc &DL = MI.getDebugLoc();
3665 BuildMI(BB&: *MBB, I: &MI, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::COPY), DestReg: AMDGPU::M0)
3666 .add(MO: MI.getOperand(i: 2));
3667
3668 Register Addr = MI.getOperand(i: 1).getReg();
3669 Register VOffset;
3670 // Try to split SAddr and VOffset. Global and LDS pointers share the same
3671 // immediate offset, so we cannot use a regular SelectGlobalSAddr().
3672 if (!isSGPR(Reg: Addr)) {
3673 auto AddrDef = getDefSrcRegIgnoringCopies(Reg: Addr, MRI: *MRI);
3674 if (isSGPR(Reg: AddrDef->Reg)) {
3675 Addr = AddrDef->Reg;
3676 } else if (AddrDef->MI->getOpcode() == AMDGPU::G_PTR_ADD) {
3677 Register SAddr =
3678 getSrcRegIgnoringCopies(Reg: AddrDef->MI->getOperand(i: 1).getReg(), MRI: *MRI);
3679 if (isSGPR(Reg: SAddr)) {
3680 Register PtrBaseOffset = AddrDef->MI->getOperand(i: 2).getReg();
3681 if (Register Off = matchZeroExtendFromS32(Reg: PtrBaseOffset)) {
3682 Addr = SAddr;
3683 VOffset = Off;
3684 }
3685 }
3686 }
3687 }
3688
3689 if (isSGPR(Reg: Addr)) {
3690 Opc = AMDGPU::getGlobalSaddrOp(Opcode: Opc);
3691 if (!VOffset) {
3692 VOffset = MRI->createVirtualRegister(RegClass: &AMDGPU::VGPR_32RegClass);
3693 BuildMI(BB&: *MBB, I: &MI, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::V_MOV_B32_e32), DestReg: VOffset)
3694 .addImm(Val: 0);
3695 }
3696 }
3697
3698 auto MIB = BuildMI(BB&: *MBB, I: &MI, MIMD: DL, MCID: TII.get(Opcode: Opc))
3699 .addReg(RegNo: Addr);
3700
3701 if (isSGPR(Reg: Addr))
3702 MIB.addReg(RegNo: VOffset);
3703
3704 MIB.add(MO: MI.getOperand(i: 4)); // offset
3705
3706 unsigned Aux = MI.getOperand(i: 5).getImm();
3707 MIB.addImm(Val: Aux & ~AMDGPU::CPol::VIRTUAL_BITS); // cpol
3708
3709 MachineMemOperand *LoadMMO = *MI.memoperands_begin();
3710 MachinePointerInfo LoadPtrI = LoadMMO->getPointerInfo();
3711 LoadPtrI.Offset = MI.getOperand(i: 4).getImm();
3712 MachinePointerInfo StorePtrI = LoadPtrI;
3713 LoadPtrI.V = PoisonValue::get(T: PointerType::get(C&: MF->getFunction().getContext(),
3714 AddressSpace: AMDGPUAS::GLOBAL_ADDRESS));
3715 LoadPtrI.AddrSpace = AMDGPUAS::GLOBAL_ADDRESS;
3716 StorePtrI.AddrSpace = AMDGPUAS::LOCAL_ADDRESS;
3717 auto F = LoadMMO->getFlags() &
3718 ~(MachineMemOperand::MOStore | MachineMemOperand::MOLoad);
3719 LoadMMO = MF->getMachineMemOperand(PtrInfo: LoadPtrI, F: F | MachineMemOperand::MOLoad,
3720 Size, BaseAlignment: LoadMMO->getBaseAlign());
3721 MachineMemOperand *StoreMMO =
3722 MF->getMachineMemOperand(PtrInfo: StorePtrI, F: F | MachineMemOperand::MOStore,
3723 Size: sizeof(int32_t), BaseAlignment: Align(4));
3724
3725 MIB.setMemRefs({LoadMMO, StoreMMO});
3726
3727 MI.eraseFromParent();
3728 return constrainSelectedInstRegOperands(I&: *MIB, TII, TRI, RBI);
3729}
3730
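/// The BVH intersect-ray pseudos carry the final target opcode as an immediate
/// operand; rewrite the instruction to that opcode and drop the operand.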
3731bool AMDGPUInstructionSelector::selectBVHIntersectRayIntrinsic(
3732 MachineInstr &MI) const {
3733 unsigned OpcodeOpIdx =
3734 MI.getOpcode() == AMDGPU::G_AMDGPU_BVH_INTERSECT_RAY ? 1 : 3;
3735 MI.setDesc(TII.get(Opcode: MI.getOperand(i: OpcodeOpIdx).getImm()));
3736 MI.removeOperand(OpNo: OpcodeOpIdx);
3737 MI.addImplicitDefUseOperands(MF&: *MI.getMF());
3738 return constrainSelectedInstRegOperands(I&: MI, TII, TRI, RBI);
3739}
3740
3741// FIXME: This should be removed so the patterns can select instead. We just
3742// need the AGPR/VGPR combination versions.
3743bool AMDGPUInstructionSelector::selectSMFMACIntrin(MachineInstr &MI) const {
3744 unsigned Opc;
3745 switch (cast<GIntrinsic>(Val&: MI).getIntrinsicID()) {
3746 case Intrinsic::amdgcn_smfmac_f32_16x16x32_f16:
3747 Opc = AMDGPU::V_SMFMAC_F32_16X16X32_F16_e64;
3748 break;
3749 case Intrinsic::amdgcn_smfmac_f32_32x32x16_f16:
3750 Opc = AMDGPU::V_SMFMAC_F32_32X32X16_F16_e64;
3751 break;
3752 case Intrinsic::amdgcn_smfmac_f32_16x16x32_bf16:
3753 Opc = AMDGPU::V_SMFMAC_F32_16X16X32_BF16_e64;
3754 break;
3755 case Intrinsic::amdgcn_smfmac_f32_32x32x16_bf16:
3756 Opc = AMDGPU::V_SMFMAC_F32_32X32X16_BF16_e64;
3757 break;
3758 case Intrinsic::amdgcn_smfmac_i32_16x16x64_i8:
3759 Opc = AMDGPU::V_SMFMAC_I32_16X16X64_I8_e64;
3760 break;
3761 case Intrinsic::amdgcn_smfmac_i32_32x32x32_i8:
3762 Opc = AMDGPU::V_SMFMAC_I32_32X32X32_I8_e64;
3763 break;
3764 case Intrinsic::amdgcn_smfmac_f32_16x16x64_bf8_bf8:
3765 Opc = AMDGPU::V_SMFMAC_F32_16X16X64_BF8_BF8_e64;
3766 break;
3767 case Intrinsic::amdgcn_smfmac_f32_16x16x64_bf8_fp8:
3768 Opc = AMDGPU::V_SMFMAC_F32_16X16X64_BF8_FP8_e64;
3769 break;
3770 case Intrinsic::amdgcn_smfmac_f32_16x16x64_fp8_bf8:
3771 Opc = AMDGPU::V_SMFMAC_F32_16X16X64_FP8_BF8_e64;
3772 break;
3773 case Intrinsic::amdgcn_smfmac_f32_16x16x64_fp8_fp8:
3774 Opc = AMDGPU::V_SMFMAC_F32_16X16X64_FP8_FP8_e64;
3775 break;
3776 case Intrinsic::amdgcn_smfmac_f32_32x32x32_bf8_bf8:
3777 Opc = AMDGPU::V_SMFMAC_F32_32X32X32_BF8_BF8_e64;
3778 break;
3779 case Intrinsic::amdgcn_smfmac_f32_32x32x32_bf8_fp8:
3780 Opc = AMDGPU::V_SMFMAC_F32_32X32X32_BF8_FP8_e64;
3781 break;
3782 case Intrinsic::amdgcn_smfmac_f32_32x32x32_fp8_bf8:
3783 Opc = AMDGPU::V_SMFMAC_F32_32X32X32_FP8_BF8_e64;
3784 break;
3785 case Intrinsic::amdgcn_smfmac_f32_32x32x32_fp8_fp8:
3786 Opc = AMDGPU::V_SMFMAC_F32_32X32X32_FP8_FP8_e64;
3787 break;
3788 case Intrinsic::amdgcn_smfmac_f32_16x16x64_f16:
3789 Opc = AMDGPU::V_SMFMAC_F32_16X16X64_F16_e64;
3790 break;
3791 case Intrinsic::amdgcn_smfmac_f32_32x32x32_f16:
3792 Opc = AMDGPU::V_SMFMAC_F32_32X32X32_F16_e64;
3793 break;
3794 case Intrinsic::amdgcn_smfmac_f32_16x16x64_bf16:
3795 Opc = AMDGPU::V_SMFMAC_F32_16X16X64_BF16_e64;
3796 break;
3797 case Intrinsic::amdgcn_smfmac_f32_32x32x32_bf16:
3798 Opc = AMDGPU::V_SMFMAC_F32_32X32X32_BF16_e64;
3799 break;
3800 case Intrinsic::amdgcn_smfmac_i32_16x16x128_i8:
3801 Opc = AMDGPU::V_SMFMAC_I32_16X16X128_I8_e64;
3802 break;
3803 case Intrinsic::amdgcn_smfmac_i32_32x32x64_i8:
3804 Opc = AMDGPU::V_SMFMAC_I32_32X32X64_I8_e64;
3805 break;
3806 case Intrinsic::amdgcn_smfmac_f32_16x16x128_bf8_bf8:
3807 Opc = AMDGPU::V_SMFMAC_F32_16X16X128_BF8_BF8_e64;
3808 break;
3809 case Intrinsic::amdgcn_smfmac_f32_16x16x128_bf8_fp8:
3810 Opc = AMDGPU::V_SMFMAC_F32_16X16X128_BF8_FP8_e64;
3811 break;
3812 case Intrinsic::amdgcn_smfmac_f32_16x16x128_fp8_bf8:
3813 Opc = AMDGPU::V_SMFMAC_F32_16X16X128_FP8_BF8_e64;
3814 break;
3815 case Intrinsic::amdgcn_smfmac_f32_16x16x128_fp8_fp8:
3816 Opc = AMDGPU::V_SMFMAC_F32_16X16X128_FP8_FP8_e64;
3817 break;
3818 case Intrinsic::amdgcn_smfmac_f32_32x32x64_bf8_bf8:
3819 Opc = AMDGPU::V_SMFMAC_F32_32X32X64_BF8_BF8_e64;
3820 break;
3821 case Intrinsic::amdgcn_smfmac_f32_32x32x64_bf8_fp8:
3822 Opc = AMDGPU::V_SMFMAC_F32_32X32X64_BF8_FP8_e64;
3823 break;
3824 case Intrinsic::amdgcn_smfmac_f32_32x32x64_fp8_bf8:
3825 Opc = AMDGPU::V_SMFMAC_F32_32X32X64_FP8_BF8_e64;
3826 break;
3827 case Intrinsic::amdgcn_smfmac_f32_32x32x64_fp8_fp8:
3828 Opc = AMDGPU::V_SMFMAC_F32_32X32X64_FP8_FP8_e64;
3829 break;
3830 default:
3831 llvm_unreachable("unhandled smfmac intrinsic");
3832 }
3833
3834 auto VDst_In = MI.getOperand(i: 4);
3835
3836 MI.setDesc(TII.get(Opcode: Opc));
3837 MI.removeOperand(OpNo: 4); // VDst_In
3838 MI.removeOperand(OpNo: 1); // Intrinsic ID
3839 MI.addOperand(Op: VDst_In); // Readd VDst_In to the end
3840 MI.addImplicitDefUseOperands(MF&: *MI.getMF());
3841 const MCInstrDesc &MCID = MI.getDesc();
3842 if (MCID.getOperandConstraint(OpNum: 0, Constraint: MCOI::EARLY_CLOBBER) != -1) {
3843 MI.getOperand(i: 0).setIsEarlyClobber(true);
3844 }
3845 return true;
3846}
3847
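/// Select the permlane16/permlane32 swap intrinsics: drop the intrinsic ID,
/// add the implicit exec use, and translate the boolean fi operand into the
/// DPP_FI_0/DPP_FI_1 encoding.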
3848bool AMDGPUInstructionSelector::selectPermlaneSwapIntrin(
3849 MachineInstr &MI, Intrinsic::ID IntrID) const {
3850 if (IntrID == Intrinsic::amdgcn_permlane16_swap &&
3851 !Subtarget->hasPermlane16Swap())
3852 return false;
3853 if (IntrID == Intrinsic::amdgcn_permlane32_swap &&
3854 !Subtarget->hasPermlane32Swap())
3855 return false;
3856
3857 unsigned Opcode = IntrID == Intrinsic::amdgcn_permlane16_swap
3858 ? AMDGPU::V_PERMLANE16_SWAP_B32_e64
3859 : AMDGPU::V_PERMLANE32_SWAP_B32_e64;
3860
3861 MI.removeOperand(OpNo: 2);
3862 MI.setDesc(TII.get(Opcode));
3863 MI.addOperand(MF&: *MF, Op: MachineOperand::CreateReg(Reg: AMDGPU::EXEC, isDef: false, isImp: true));
3864
3865 MachineOperand &FI = MI.getOperand(i: 4);
3866 FI.setImm(FI.getImm() ? AMDGPU::DPP::DPP_FI_1 : AMDGPU::DPP::DPP_FI_0);
3867
3868 return constrainSelectedInstRegOperands(I&: MI, TII, TRI, RBI);
3869}
3870
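/// Select a wave address by shifting the source right by the log2 of the
/// wavefront size, using V_LSHRREV_B32 or S_LSHR_B32 depending on the
/// destination bank.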
3871bool AMDGPUInstructionSelector::selectWaveAddress(MachineInstr &MI) const {
3872 Register DstReg = MI.getOperand(i: 0).getReg();
3873 Register SrcReg = MI.getOperand(i: 1).getReg();
3874 const RegisterBank *DstRB = RBI.getRegBank(Reg: DstReg, MRI: *MRI, TRI);
3875 const bool IsVALU = DstRB->getID() == AMDGPU::VGPRRegBankID;
3876 MachineBasicBlock *MBB = MI.getParent();
3877 const DebugLoc &DL = MI.getDebugLoc();
3878
3879 if (IsVALU) {
3880 BuildMI(BB&: *MBB, I&: MI, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::V_LSHRREV_B32_e64), DestReg: DstReg)
3881 .addImm(Val: Subtarget->getWavefrontSizeLog2())
3882 .addReg(RegNo: SrcReg);
3883 } else {
3884 BuildMI(BB&: *MBB, I&: MI, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::S_LSHR_B32), DestReg: DstReg)
3885 .addReg(RegNo: SrcReg)
3886 .addImm(Val: Subtarget->getWavefrontSizeLog2())
3887 .setOperandDead(3); // Dead scc
3888 }
3889
3890 const TargetRegisterClass &RC =
3891 IsVALU ? AMDGPU::VGPR_32RegClass : AMDGPU::SReg_32RegClass;
3892 if (!RBI.constrainGenericRegister(Reg: DstReg, RC, MRI&: *MRI))
3893 return false;
3894
3895 MI.eraseFromParent();
3896 return true;
3897}
3898
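/// Select a wave-wide shuffle via ds_bpermute. When a single bpermute cannot
/// cover the whole wave (wave64 without wave-wide bpermute support), both the
/// same-half and the swapped-half permutes are computed under whole wave mode
/// and the correct one is chosen per lane with V_CNDMASK.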
3899bool AMDGPUInstructionSelector::selectWaveShuffleIntrin(
3900 MachineInstr &MI) const {
3901 assert(MI.getNumOperands() == 4);
3902 MachineBasicBlock *MBB = MI.getParent();
3903 const DebugLoc &DL = MI.getDebugLoc();
3904
3905 Register DstReg = MI.getOperand(i: 0).getReg();
3906 Register ValReg = MI.getOperand(i: 2).getReg();
3907 Register IdxReg = MI.getOperand(i: 3).getReg();
3908
3909 const LLT DstTy = MRI->getType(Reg: DstReg);
3910 unsigned DstSize = DstTy.getSizeInBits();
3911 const RegisterBank *DstRB = RBI.getRegBank(Reg: DstReg, MRI: *MRI, TRI);
3912 const TargetRegisterClass *DstRC =
3913 TRI.getRegClassForSizeOnBank(Size: DstSize, Bank: *DstRB);
3914
3915 if (DstTy != LLT::scalar(SizeInBits: 32))
3916 return false;
3917
3918 if (!Subtarget->supportsBPermute())
3919 return false;
3920
3921 // If we can bpermute across the whole wave, then just do that
3922 if (Subtarget->supportsWaveWideBPermute()) {
3923 Register ShiftIdxReg = MRI->createVirtualRegister(RegClass: DstRC);
3924 BuildMI(BB&: *MBB, I&: MI, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::V_LSHLREV_B32_e64), DestReg: ShiftIdxReg)
3925 .addImm(Val: 2)
3926 .addReg(RegNo: IdxReg);
3927
3928 BuildMI(BB&: *MBB, I&: MI, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::DS_BPERMUTE_B32), DestReg: DstReg)
3929 .addReg(RegNo: ShiftIdxReg)
3930 .addReg(RegNo: ValReg)
3931 .addImm(Val: 0);
3932 } else {
3933 // Otherwise, we need to make use of whole wave mode
3934 assert(Subtarget->isWave64());
3935
3936 // Set inactive lanes to poison
3937 Register UndefValReg =
3938 MRI->createVirtualRegister(RegClass: TRI.getRegClass(i: AMDGPU::SReg_32RegClassID));
3939 BuildMI(BB&: *MBB, I&: MI, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::IMPLICIT_DEF), DestReg: UndefValReg);
3940
3941 Register UndefExecReg = MRI->createVirtualRegister(
3942 RegClass: TRI.getRegClass(i: AMDGPU::SReg_64_XEXECRegClassID));
3943 BuildMI(BB&: *MBB, I&: MI, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::IMPLICIT_DEF), DestReg: UndefExecReg);
3944
3945 Register PoisonValReg = MRI->createVirtualRegister(RegClass: DstRC);
3946 BuildMI(BB&: *MBB, I&: MI, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::V_SET_INACTIVE_B32), DestReg: PoisonValReg)
3947 .addImm(Val: 0)
3948 .addReg(RegNo: ValReg)
3949 .addImm(Val: 0)
3950 .addReg(RegNo: UndefValReg)
3951 .addReg(RegNo: UndefExecReg);
3952
3953 // ds_bpermute requires index to be multiplied by 4
3954 Register ShiftIdxReg = MRI->createVirtualRegister(RegClass: DstRC);
3955 BuildMI(BB&: *MBB, I&: MI, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::V_LSHLREV_B32_e64), DestReg: ShiftIdxReg)
3956 .addImm(Val: 2)
3957 .addReg(RegNo: IdxReg);
3958
3959 Register PoisonIdxReg = MRI->createVirtualRegister(RegClass: DstRC);
3960 BuildMI(BB&: *MBB, I&: MI, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::V_SET_INACTIVE_B32), DestReg: PoisonIdxReg)
3961 .addImm(Val: 0)
3962 .addReg(RegNo: ShiftIdxReg)
3963 .addImm(Val: 0)
3964 .addReg(RegNo: UndefValReg)
3965 .addReg(RegNo: UndefExecReg);
3966
3967 // Get permutation of each half, then we'll select which one to use
3968 Register SameSidePermReg = MRI->createVirtualRegister(RegClass: DstRC);
3969 BuildMI(BB&: *MBB, I&: MI, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::DS_BPERMUTE_B32), DestReg: SameSidePermReg)
3970 .addReg(RegNo: PoisonIdxReg)
3971 .addReg(RegNo: PoisonValReg)
3972 .addImm(Val: 0);
3973
3974 Register SwappedValReg = MRI->createVirtualRegister(RegClass: DstRC);
3975 BuildMI(BB&: *MBB, I&: MI, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::V_PERMLANE64_B32), DestReg: SwappedValReg)
3976 .addReg(RegNo: PoisonValReg);
3977
3978 Register OppSidePermReg = MRI->createVirtualRegister(RegClass: DstRC);
3979 BuildMI(BB&: *MBB, I&: MI, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::DS_BPERMUTE_B32), DestReg: OppSidePermReg)
3980 .addReg(RegNo: PoisonIdxReg)
3981 .addReg(RegNo: SwappedValReg)
3982 .addImm(Val: 0);
3983
3984 Register WWMSwapPermReg = MRI->createVirtualRegister(RegClass: DstRC);
3985 BuildMI(BB&: *MBB, I&: MI, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::STRICT_WWM), DestReg: WWMSwapPermReg)
3986 .addReg(RegNo: OppSidePermReg);
3987
3988 // Select which side to take the permute from
3989 // We can get away with only using mbcnt_lo here since we're only
3990 // trying to detect which side of 32 each lane is on, and mbcnt_lo
3991 // returns 32 for lanes 32-63.
3992 Register ThreadIDReg = MRI->createVirtualRegister(RegClass: DstRC);
3993 BuildMI(BB&: *MBB, I&: MI, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::V_MBCNT_LO_U32_B32_e64), DestReg: ThreadIDReg)
3994 .addImm(Val: -1)
3995 .addImm(Val: 0);
3996
3997 Register XORReg = MRI->createVirtualRegister(RegClass: DstRC);
3998 BuildMI(BB&: *MBB, I&: MI, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::V_XOR_B32_e64), DestReg: XORReg)
3999 .addReg(RegNo: ThreadIDReg)
4000 .addReg(RegNo: PoisonIdxReg);
4001
4002 Register ANDReg = MRI->createVirtualRegister(RegClass: DstRC);
4003 BuildMI(BB&: *MBB, I&: MI, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::V_AND_B32_e64), DestReg: ANDReg)
4004 .addReg(RegNo: XORReg)
4005 .addImm(Val: 32);
4006
4007 Register CompareReg = MRI->createVirtualRegister(
4008 RegClass: TRI.getRegClass(i: AMDGPU::SReg_64_XEXECRegClassID));
4009 BuildMI(BB&: *MBB, I&: MI, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::V_CMP_EQ_U32_e64), DestReg: CompareReg)
4010 .addReg(RegNo: ANDReg)
4011 .addImm(Val: 0);
4012
4013 // Finally do the selection
4014 BuildMI(BB&: *MBB, I&: MI, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::V_CNDMASK_B32_e64), DestReg: DstReg)
4015 .addImm(Val: 0)
4016 .addReg(RegNo: WWMSwapPermReg)
4017 .addImm(Val: 0)
4018 .addReg(RegNo: SameSidePermReg)
4019 .addReg(RegNo: CompareReg);
4020 }
4021
4022 MI.eraseFromParent();
4023 return true;
4024}
4025
4026// Match a BITOP3 operation and return the number of matched instructions plus
4027// the truth table.
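//
// For example, for ((a & b) ^ c) the matcher should end up with
// Src = {a, c, b} (the slot initially holding the inner AND is reused for
// 'a'), giving the AND the partial table 0xf0 & 0xaa = 0xa0 and a final truth
// table of 0xa0 ^ 0xcc = 0x6c, with two matched instructions.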
4028static std::pair<unsigned, uint8_t> BitOp3_Op(Register R,
4029 SmallVectorImpl<Register> &Src,
4030 const MachineRegisterInfo &MRI) {
4031 unsigned NumOpcodes = 0;
4032 uint8_t LHSBits, RHSBits;
4033
4034 auto getOperandBits = [&Src, R, &MRI](Register Op, uint8_t &Bits) -> bool {
4035 // Define truth table given Src0, Src1, Src2 bits permutations:
4036 // 0 0 0
4037 // 0 0 1
4038 // 0 1 0
4039 // 0 1 1
4040 // 1 0 0
4041 // 1 0 1
4042 // 1 1 0
4043 // 1 1 1
4044 const uint8_t SrcBits[3] = { 0xf0, 0xcc, 0xaa };
4045
4046 if (mi_match(R: Op, MRI, P: m_AllOnesInt())) {
4047 Bits = 0xff;
4048 return true;
4049 }
4050 if (mi_match(R: Op, MRI, P: m_ZeroInt())) {
4051 Bits = 0;
4052 return true;
4053 }
4054
4055 for (unsigned I = 0; I < Src.size(); ++I) {
4056 // Try to find existing reused operand
4057 if (Src[I] == Op) {
4058 Bits = SrcBits[I];
4059 return true;
4060 }
4061 // Try to replace parent operator
4062 if (Src[I] == R) {
4063 Bits = SrcBits[I];
4064 Src[I] = Op;
4065 return true;
4066 }
4067 }
4068
4069 if (Src.size() == 3) {
4070 // No room left for operands. Try one last time; there can be a 'not' of
4071 // one of our source operands. In this case we can compute the bits
4072 // without growing the Src vector.
4073 Register LHS;
4074 if (mi_match(R: Op, MRI, P: m_Not(Src: m_Reg(R&: LHS)))) {
4075 LHS = getSrcRegIgnoringCopies(Reg: LHS, MRI);
4076 for (unsigned I = 0; I < Src.size(); ++I) {
4077 if (Src[I] == LHS) {
4078 Bits = ~SrcBits[I];
4079 return true;
4080 }
4081 }
4082 }
4083
4084 return false;
4085 }
4086
4087 Bits = SrcBits[Src.size()];
4088 Src.push_back(Elt: Op);
4089 return true;
4090 };
4091
4092 MachineInstr *MI = MRI.getVRegDef(Reg: R);
4093 switch (MI->getOpcode()) {
4094 case TargetOpcode::G_AND:
4095 case TargetOpcode::G_OR:
4096 case TargetOpcode::G_XOR: {
4097 Register LHS = getSrcRegIgnoringCopies(Reg: MI->getOperand(i: 1).getReg(), MRI);
4098 Register RHS = getSrcRegIgnoringCopies(Reg: MI->getOperand(i: 2).getReg(), MRI);
4099
4100 SmallVector<Register, 3> Backup(Src.begin(), Src.end());
4101 if (!getOperandBits(LHS, LHSBits) ||
4102 !getOperandBits(RHS, RHSBits)) {
4103 Src = std::move(Backup);
4104 return std::make_pair(x: 0, y: 0);
4105 }
4106
4107 // Recursion is naturally limited by the size of the operand vector.
4108 auto Op = BitOp3_Op(R: LHS, Src, MRI);
4109 if (Op.first) {
4110 NumOpcodes += Op.first;
4111 LHSBits = Op.second;
4112 }
4113
4114 Op = BitOp3_Op(R: RHS, Src, MRI);
4115 if (Op.first) {
4116 NumOpcodes += Op.first;
4117 RHSBits = Op.second;
4118 }
4119 break;
4120 }
4121 default:
4122 return std::make_pair(x: 0, y: 0);
4123 }
4124
4125 uint8_t TTbl;
4126 switch (MI->getOpcode()) {
4127 case TargetOpcode::G_AND:
4128 TTbl = LHSBits & RHSBits;
4129 break;
4130 case TargetOpcode::G_OR:
4131 TTbl = LHSBits | RHSBits;
4132 break;
4133 case TargetOpcode::G_XOR:
4134 TTbl = LHSBits ^ RHSBits;
4135 break;
4136 default:
4137 break;
4138 }
4139
4140 return std::make_pair(x: NumOpcodes + 1, y&: TTbl);
4141}
4142
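// Try to fold a tree of G_AND/G_OR/G_XOR into a single V_BITOP3_B32/B16 using
// the truth table computed by BitOp3_Op, subject to the profitability checks
// below.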
4143bool AMDGPUInstructionSelector::selectBITOP3(MachineInstr &MI) const {
4144 if (!Subtarget->hasBitOp3Insts())
4145 return false;
4146
4147 Register DstReg = MI.getOperand(i: 0).getReg();
4148 const RegisterBank *DstRB = RBI.getRegBank(Reg: DstReg, MRI: *MRI, TRI);
4149 const bool IsVALU = DstRB->getID() == AMDGPU::VGPRRegBankID;
4150 if (!IsVALU)
4151 return false;
4152
4153 SmallVector<Register, 3> Src;
4154 uint8_t TTbl;
4155 unsigned NumOpcodes;
4156
4157 std::tie(args&: NumOpcodes, args&: TTbl) = BitOp3_Op(R: DstReg, Src, MRI: *MRI);
4158
4159 // The Src.empty() case can happen if all operands are constant zeros or ones.
4160 // Normally that is optimized out before reaching this point.
4161 if (NumOpcodes < 2 || Src.empty())
4162 return false;
4163
4164 const bool IsB32 = MRI->getType(Reg: DstReg) == LLT::scalar(SizeInBits: 32);
4165 if (NumOpcodes == 2 && IsB32) {
4166 // Avoid using BITOP3 for OR3, XOR3, and AND_OR. This is not faster but makes
4167 // the asm more readable. This cannot be modeled with AddedComplexity because
4168 // the selector does not know how many operations we matched.
4169 if (mi_match(MI, MRI: *MRI, P: m_GXor(L: m_GXor(L: m_Reg(), R: m_Reg()), R: m_Reg())) ||
4170 mi_match(MI, MRI: *MRI, P: m_GOr(L: m_GOr(L: m_Reg(), R: m_Reg()), R: m_Reg())) ||
4171 mi_match(MI, MRI: *MRI, P: m_GOr(L: m_GAnd(L: m_Reg(), R: m_Reg()), R: m_Reg())))
4172 return false;
4173 } else if (NumOpcodes < 4) {
4174 // For a uniform case the threshold should be higher to account for moves
4175 // between VGPRs and SGPRs. It needs one operand in a VGPR; the remaining two
4176 // can be in SGPRs, with a readfirstlane afterwards.
4177 return false;
4178 }
4179
4180 unsigned Opc = IsB32 ? AMDGPU::V_BITOP3_B32_e64 : AMDGPU::V_BITOP3_B16_e64;
4181 if (!IsB32 && STI.hasTrue16BitInsts())
4182 Opc = STI.useRealTrue16Insts() ? AMDGPU::V_BITOP3_B16_gfx1250_t16_e64
4183 : AMDGPU::V_BITOP3_B16_gfx1250_fake16_e64;
4184 unsigned CBL = STI.getConstantBusLimit(Opcode: Opc);
4185 MachineBasicBlock *MBB = MI.getParent();
4186 const DebugLoc &DL = MI.getDebugLoc();
4187
4188 for (unsigned I = 0; I < Src.size(); ++I) {
4189 const RegisterBank *RB = RBI.getRegBank(Reg: Src[I], MRI: *MRI, TRI);
4190 if (RB->getID() != AMDGPU::SGPRRegBankID)
4191 continue;
4192 if (CBL > 0) {
4193 --CBL;
4194 continue;
4195 }
4196 Register NewReg = MRI->createVirtualRegister(RegClass: &AMDGPU::VGPR_32RegClass);
4197 BuildMI(BB&: *MBB, I&: MI, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::COPY), DestReg: NewReg)
4198 .addReg(RegNo: Src[I]);
4199 Src[I] = NewReg;
4200 }
4201
4202 // The last operand can be ignored, turning a ternary operation into a binary
4203 // one. For example: (~a & b & c) | (~a & b & ~c) -> (~a & b). We can replace
4204 // 'c' with 'a' here without changing the answer. In some pathological
4205 // cases it should even be possible to get an operation with a single operand
4206 // if the optimizer did not catch it.
4207 while (Src.size() < 3)
4208 Src.push_back(Elt: Src[0]);
4209
4210 auto MIB = BuildMI(BB&: *MBB, I&: MI, MIMD: DL, MCID: TII.get(Opcode: Opc), DestReg: DstReg);
4211 if (!IsB32)
4212 MIB.addImm(Val: 0); // src_mod0
4213 MIB.addReg(RegNo: Src[0]);
4214 if (!IsB32)
4215 MIB.addImm(Val: 0); // src_mod1
4216 MIB.addReg(RegNo: Src[1]);
4217 if (!IsB32)
4218 MIB.addImm(Val: 0); // src_mod2
4219 MIB.addReg(RegNo: Src[2])
4220 .addImm(Val: TTbl);
4221 if (!IsB32)
4222 MIB.addImm(Val: 0); // op_sel
4223
4224 constrainSelectedInstRegOperands(I&: *MIB, TII, TRI, RBI);
4225 MI.eraseFromParent();
4226
4227 return true;
4228}
4229
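/// Select a stack restore: if the incoming value is not already a wave
/// address, scale it down by the wave size first, then copy it into the stack
/// pointer register.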
4230bool AMDGPUInstructionSelector::selectStackRestore(MachineInstr &MI) const {
4231 Register SrcReg = MI.getOperand(i: 0).getReg();
4232 if (!RBI.constrainGenericRegister(Reg: SrcReg, RC: AMDGPU::SReg_32RegClass, MRI&: *MRI))
4233 return false;
4234
4235 MachineInstr *DefMI = MRI->getVRegDef(Reg: SrcReg);
4236 Register SP =
4237 Subtarget->getTargetLowering()->getStackPointerRegisterToSaveRestore();
4238 Register WaveAddr = getWaveAddress(Def: DefMI);
4239 MachineBasicBlock *MBB = MI.getParent();
4240 const DebugLoc &DL = MI.getDebugLoc();
4241
4242 if (!WaveAddr) {
4243 WaveAddr = MRI->createVirtualRegister(RegClass: &AMDGPU::SReg_32RegClass);
4244 BuildMI(BB&: *MBB, I&: MI, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::S_LSHR_B32), DestReg: WaveAddr)
4245 .addReg(RegNo: SrcReg)
4246 .addImm(Val: Subtarget->getWavefrontSizeLog2())
4247 .setOperandDead(3); // Dead scc
4248 }
4249
4250 BuildMI(BB&: *MBB, I: &MI, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::COPY), DestReg: SP)
4251 .addReg(RegNo: WaveAddr);
4252
4253 MI.eraseFromParent();
4254 return true;
4255}
4256
4257bool AMDGPUInstructionSelector::select(MachineInstr &I) {
4258
4259 if (!I.isPreISelOpcode()) {
4260 if (I.isCopy())
4261 return selectCOPY(I);
4262 return true;
4263 }
4264
4265 switch (I.getOpcode()) {
4266 case TargetOpcode::G_AND:
4267 case TargetOpcode::G_OR:
4268 case TargetOpcode::G_XOR:
4269 if (selectBITOP3(MI&: I))
4270 return true;
4271 if (selectImpl(I, CoverageInfo&: *CoverageInfo))
4272 return true;
4273 return selectG_AND_OR_XOR(I);
4274 case TargetOpcode::G_ADD:
4275 case TargetOpcode::G_SUB:
4276 case TargetOpcode::G_PTR_ADD:
4277 if (selectImpl(I, CoverageInfo&: *CoverageInfo))
4278 return true;
4279 return selectG_ADD_SUB(I);
4280 case TargetOpcode::G_UADDO:
4281 case TargetOpcode::G_USUBO:
4282 case TargetOpcode::G_UADDE:
4283 case TargetOpcode::G_USUBE:
4284 return selectG_UADDO_USUBO_UADDE_USUBE(I);
4285 case AMDGPU::G_AMDGPU_MAD_U64_U32:
4286 case AMDGPU::G_AMDGPU_MAD_I64_I32:
4287 return selectG_AMDGPU_MAD_64_32(I);
4288 case TargetOpcode::G_INTTOPTR:
4289 case TargetOpcode::G_BITCAST:
4290 case TargetOpcode::G_PTRTOINT:
4291 case TargetOpcode::G_FREEZE:
4292 return selectCOPY(I);
4293 case TargetOpcode::G_FNEG:
4294 if (selectImpl(I, CoverageInfo&: *CoverageInfo))
4295 return true;
4296 return selectG_FNEG(MI&: I);
4297 case TargetOpcode::G_FABS:
4298 if (selectImpl(I, CoverageInfo&: *CoverageInfo))
4299 return true;
4300 return selectG_FABS(MI&: I);
4301 case TargetOpcode::G_EXTRACT:
4302 return selectG_EXTRACT(I);
4303 case TargetOpcode::G_MERGE_VALUES:
4304 case TargetOpcode::G_CONCAT_VECTORS:
4305 return selectG_MERGE_VALUES(MI&: I);
4306 case TargetOpcode::G_UNMERGE_VALUES:
4307 return selectG_UNMERGE_VALUES(MI&: I);
4308 case TargetOpcode::G_BUILD_VECTOR:
4309 case TargetOpcode::G_BUILD_VECTOR_TRUNC:
4310 return selectG_BUILD_VECTOR(MI&: I);
4311 case TargetOpcode::G_IMPLICIT_DEF:
4312 return selectG_IMPLICIT_DEF(I);
4313 case TargetOpcode::G_INSERT:
4314 return selectG_INSERT(I);
4315 case TargetOpcode::G_INTRINSIC:
4316 case TargetOpcode::G_INTRINSIC_CONVERGENT:
4317 return selectG_INTRINSIC(I);
4318 case TargetOpcode::G_INTRINSIC_W_SIDE_EFFECTS:
4319 case TargetOpcode::G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS:
4320 return selectG_INTRINSIC_W_SIDE_EFFECTS(I);
4321 case TargetOpcode::G_ICMP:
4322 case TargetOpcode::G_FCMP:
4323 if (selectG_ICMP_or_FCMP(I))
4324 return true;
4325 return selectImpl(I, CoverageInfo&: *CoverageInfo);
4326 case TargetOpcode::G_LOAD:
4327 case TargetOpcode::G_ZEXTLOAD:
4328 case TargetOpcode::G_SEXTLOAD:
4329 case TargetOpcode::G_STORE:
4330 case TargetOpcode::G_ATOMIC_CMPXCHG:
4331 case TargetOpcode::G_ATOMICRMW_XCHG:
4332 case TargetOpcode::G_ATOMICRMW_ADD:
4333 case TargetOpcode::G_ATOMICRMW_SUB:
4334 case TargetOpcode::G_ATOMICRMW_AND:
4335 case TargetOpcode::G_ATOMICRMW_OR:
4336 case TargetOpcode::G_ATOMICRMW_XOR:
4337 case TargetOpcode::G_ATOMICRMW_MIN:
4338 case TargetOpcode::G_ATOMICRMW_MAX:
4339 case TargetOpcode::G_ATOMICRMW_UMIN:
4340 case TargetOpcode::G_ATOMICRMW_UMAX:
4341 case TargetOpcode::G_ATOMICRMW_UINC_WRAP:
4342 case TargetOpcode::G_ATOMICRMW_UDEC_WRAP:
4343 case TargetOpcode::G_ATOMICRMW_USUB_COND:
4344 case TargetOpcode::G_ATOMICRMW_USUB_SAT:
4345 case TargetOpcode::G_ATOMICRMW_FADD:
4346 case TargetOpcode::G_ATOMICRMW_FMIN:
4347 case TargetOpcode::G_ATOMICRMW_FMAX:
4348 return selectG_LOAD_STORE_ATOMICRMW(I);
4349 case TargetOpcode::G_SELECT:
4350 return selectG_SELECT(I);
4351 case TargetOpcode::G_TRUNC:
4352 return selectG_TRUNC(I);
4353 case TargetOpcode::G_SEXT:
4354 case TargetOpcode::G_ZEXT:
4355 case TargetOpcode::G_ANYEXT:
4356 case TargetOpcode::G_SEXT_INREG:
4357 // This is a workaround. For an extension from type i1, `selectImpl()` uses
4358 // patterns from the TD file and generates an illegal VGPR-to-SGPR COPY, as
4359 // type i1 can only be held in an SGPR class.
4360 if (MRI->getType(Reg: I.getOperand(i: 1).getReg()) != LLT::scalar(SizeInBits: 1) &&
4361 selectImpl(I, CoverageInfo&: *CoverageInfo))
4362 return true;
4363 return selectG_SZA_EXT(I);
4364 case TargetOpcode::G_FPEXT:
4365 if (selectG_FPEXT(I))
4366 return true;
4367 return selectImpl(I, CoverageInfo&: *CoverageInfo);
4368 case TargetOpcode::G_BRCOND:
4369 return selectG_BRCOND(I);
4370 case TargetOpcode::G_GLOBAL_VALUE:
4371 return selectG_GLOBAL_VALUE(I);
4372 case TargetOpcode::G_PTRMASK:
4373 return selectG_PTRMASK(I);
4374 case TargetOpcode::G_EXTRACT_VECTOR_ELT:
4375 return selectG_EXTRACT_VECTOR_ELT(MI&: I);
4376 case TargetOpcode::G_INSERT_VECTOR_ELT:
4377 return selectG_INSERT_VECTOR_ELT(MI&: I);
4378 case AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD:
4379 case AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD_D16:
4380 case AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD_NORET:
4381 case AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE:
4382 case AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE_D16: {
4383 const AMDGPU::ImageDimIntrinsicInfo *Intr =
4384 AMDGPU::getImageDimIntrinsicInfo(Intr: AMDGPU::getIntrinsicID(I));
4385 assert(Intr && "not an image intrinsic with image pseudo");
4386 return selectImageIntrinsic(MI&: I, Intr);
4387 }
4388 case AMDGPU::G_AMDGPU_BVH_DUAL_INTERSECT_RAY:
4389 case AMDGPU::G_AMDGPU_BVH_INTERSECT_RAY:
4390 case AMDGPU::G_AMDGPU_BVH8_INTERSECT_RAY:
4391 return selectBVHIntersectRayIntrinsic(MI&: I);
4392 case AMDGPU::G_SBFX:
4393 case AMDGPU::G_UBFX:
4394 return selectG_SBFX_UBFX(MI&: I);
4395 case AMDGPU::G_SI_CALL:
4396 I.setDesc(TII.get(Opcode: AMDGPU::SI_CALL));
4397 return true;
4398 case AMDGPU::G_AMDGPU_WAVE_ADDRESS:
4399 return selectWaveAddress(MI&: I);
4400 case AMDGPU::G_AMDGPU_WHOLE_WAVE_FUNC_RETURN: {
4401 I.setDesc(TII.get(Opcode: AMDGPU::SI_WHOLE_WAVE_FUNC_RETURN));
4402 return true;
4403 }
4404 case AMDGPU::G_STACKRESTORE:
4405 return selectStackRestore(MI&: I);
4406 case AMDGPU::G_PHI:
4407 return selectPHI(I);
4408 case AMDGPU::G_AMDGPU_COPY_SCC_VCC:
4409 return selectCOPY_SCC_VCC(I);
4410 case AMDGPU::G_AMDGPU_COPY_VCC_SCC:
4411 return selectCOPY_VCC_SCC(I);
4412 case AMDGPU::G_AMDGPU_READANYLANE:
4413 return selectReadAnyLane(I);
4414 case TargetOpcode::G_CONSTANT:
4415 case TargetOpcode::G_FCONSTANT:
4416 default:
4417 return selectImpl(I, CoverageInfo&: *CoverageInfo);
4418 }
4419 return false;
4420}
4421
4422InstructionSelector::ComplexRendererFns
4423AMDGPUInstructionSelector::selectVCSRC(MachineOperand &Root) const {
4424 return {{
4425 [=](MachineInstrBuilder &MIB) { MIB.add(MO: Root); }
4426 }};
4427
4428}
4429
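/// Fold fneg/fabs (and, when canonicalizing, an fsub from [+-]0) on the source
/// into VOP3 source modifiers. Illustrative example (assumed MIR, not taken
/// from an existing test):
///   %a:vgpr(s32) = G_FABS %x
///   %b:vgpr(s32) = G_FNEG %a
/// Using %b as a VOP3 source with AllowAbs set returns
/// {%x, SISrcMods::NEG | SISrcMods::ABS}.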
4430std::pair<Register, unsigned> AMDGPUInstructionSelector::selectVOP3ModsImpl(
4431 Register Src, bool IsCanonicalizing, bool AllowAbs, bool OpSel) const {
4432 unsigned Mods = 0;
4433 MachineInstr *MI = getDefIgnoringCopies(Reg: Src, MRI: *MRI);
4434
4435 if (MI->getOpcode() == AMDGPU::G_FNEG) {
4436 Src = MI->getOperand(i: 1).getReg();
4437 Mods |= SISrcMods::NEG;
4438 MI = getDefIgnoringCopies(Reg: Src, MRI: *MRI);
4439 } else if (MI->getOpcode() == AMDGPU::G_FSUB && IsCanonicalizing) {
4440 // Fold fsub [+-]0 into fneg. This may not have folded depending on the
4441 // denormal mode, but we're implicitly canonicalizing in a source operand.
4442 const ConstantFP *LHS =
4443 getConstantFPVRegVal(VReg: MI->getOperand(i: 1).getReg(), MRI: *MRI);
4444 if (LHS && LHS->isZero()) {
4445 Mods |= SISrcMods::NEG;
4446 Src = MI->getOperand(i: 2).getReg();
4447 }
4448 }
4449
4450 if (AllowAbs && MI->getOpcode() == AMDGPU::G_FABS) {
4451 Src = MI->getOperand(i: 1).getReg();
4452 Mods |= SISrcMods::ABS;
4453 }
4454
4455 if (OpSel)
4456 Mods |= SISrcMods::OP_SEL_0;
4457
4458 return std::pair(Src, Mods);
4459}
4460
4461Register AMDGPUInstructionSelector::copyToVGPRIfSrcFolded(
4462 Register Src, unsigned Mods, MachineOperand Root, MachineInstr *InsertPt,
4463 bool ForceVGPR) const {
4464 if ((Mods != 0 || ForceVGPR) &&
4465 RBI.getRegBank(Reg: Src, MRI: *MRI, TRI)->getID() != AMDGPU::VGPRRegBankID) {
4466
4467 // If we looked through copies to find source modifiers on an SGPR operand,
4468 // we now have an SGPR register source. To avoid potentially violating the
4469 // constant bus restriction, we need to insert a copy to a VGPR.
4470 Register VGPRSrc = MRI->cloneVirtualRegister(VReg: Root.getReg());
4471 BuildMI(BB&: *InsertPt->getParent(), I: InsertPt, MIMD: InsertPt->getDebugLoc(),
4472 MCID: TII.get(Opcode: AMDGPU::COPY), DestReg: VGPRSrc)
4473 .addReg(RegNo: Src);
4474 Src = VGPRSrc;
4475 }
4476
4477 return Src;
4478}
4479
4480///
4481/// This will select either an SGPR or VGPR operand and will save us from
4482/// having to write an extra tablegen pattern.
4483InstructionSelector::ComplexRendererFns
4484AMDGPUInstructionSelector::selectVSRC0(MachineOperand &Root) const {
4485 return {{
4486 [=](MachineInstrBuilder &MIB) { MIB.add(MO: Root); }
4487 }};
4488}
4489
4490InstructionSelector::ComplexRendererFns
4491AMDGPUInstructionSelector::selectVOP3Mods0(MachineOperand &Root) const {
4492 Register Src;
4493 unsigned Mods;
4494 std::tie(args&: Src, args&: Mods) = selectVOP3ModsImpl(Src: Root.getReg());
4495
4496 return {{
4497 [=](MachineInstrBuilder &MIB) {
4498 MIB.addReg(RegNo: copyToVGPRIfSrcFolded(Src, Mods, Root, InsertPt: MIB));
4499 },
4500 [=](MachineInstrBuilder &MIB) { MIB.addImm(Val: Mods); }, // src0_mods
4501 [=](MachineInstrBuilder &MIB) { MIB.addImm(Val: 0); }, // clamp
4502 [=](MachineInstrBuilder &MIB) { MIB.addImm(Val: 0); } // omod
4503 }};
4504}
4505
4506InstructionSelector::ComplexRendererFns
4507AMDGPUInstructionSelector::selectVOP3BMods0(MachineOperand &Root) const {
4508 Register Src;
4509 unsigned Mods;
4510 std::tie(args&: Src, args&: Mods) = selectVOP3ModsImpl(Src: Root.getReg(),
4511 /*IsCanonicalizing=*/true,
4512 /*AllowAbs=*/false);
4513
4514 return {{
4515 [=](MachineInstrBuilder &MIB) {
4516 MIB.addReg(RegNo: copyToVGPRIfSrcFolded(Src, Mods, Root, InsertPt: MIB));
4517 },
4518 [=](MachineInstrBuilder &MIB) { MIB.addImm(Val: Mods); }, // src0_mods
4519 [=](MachineInstrBuilder &MIB) { MIB.addImm(Val: 0); }, // clamp
4520 [=](MachineInstrBuilder &MIB) { MIB.addImm(Val: 0); } // omod
4521 }};
4522}
4523
4524InstructionSelector::ComplexRendererFns
4525AMDGPUInstructionSelector::selectVOP3OMods(MachineOperand &Root) const {
4526 return {{
4527 [=](MachineInstrBuilder &MIB) { MIB.add(MO: Root); },
4528 [=](MachineInstrBuilder &MIB) { MIB.addImm(Val: 0); }, // clamp
4529 [=](MachineInstrBuilder &MIB) { MIB.addImm(Val: 0); } // omod
4530 }};
4531}
4532
4533InstructionSelector::ComplexRendererFns
4534AMDGPUInstructionSelector::selectVOP3Mods(MachineOperand &Root) const {
4535 Register Src;
4536 unsigned Mods;
4537 std::tie(args&: Src, args&: Mods) = selectVOP3ModsImpl(Src: Root.getReg());
4538
4539 return {{
4540 [=](MachineInstrBuilder &MIB) {
4541 MIB.addReg(RegNo: copyToVGPRIfSrcFolded(Src, Mods, Root, InsertPt: MIB));
4542 },
4543 [=](MachineInstrBuilder &MIB) { MIB.addImm(Val: Mods); } // src_mods
4544 }};
4545}
4546
4547InstructionSelector::ComplexRendererFns
4548AMDGPUInstructionSelector::selectVOP3ModsNonCanonicalizing(
4549 MachineOperand &Root) const {
4550 Register Src;
4551 unsigned Mods;
4552 std::tie(args&: Src, args&: Mods) =
4553 selectVOP3ModsImpl(Src: Root.getReg(), /*IsCanonicalizing=*/false);
4554
4555 return {{
4556 [=](MachineInstrBuilder &MIB) {
4557 MIB.addReg(RegNo: copyToVGPRIfSrcFolded(Src, Mods, Root, InsertPt: MIB));
4558 },
4559 [=](MachineInstrBuilder &MIB) { MIB.addImm(Val: Mods); } // src_mods
4560 }};
4561}
4562
4563InstructionSelector::ComplexRendererFns
4564AMDGPUInstructionSelector::selectVOP3BMods(MachineOperand &Root) const {
4565 Register Src;
4566 unsigned Mods;
4567 std::tie(args&: Src, args&: Mods) =
4568 selectVOP3ModsImpl(Src: Root.getReg(), /*IsCanonicalizing=*/true,
4569 /*AllowAbs=*/false);
4570
4571 return {{
4572 [=](MachineInstrBuilder &MIB) {
4573 MIB.addReg(RegNo: copyToVGPRIfSrcFolded(Src, Mods, Root, InsertPt: MIB));
4574 },
4575 [=](MachineInstrBuilder &MIB) { MIB.addImm(Val: Mods); } // src_mods
4576 }};
4577}
4578
4579InstructionSelector::ComplexRendererFns
4580AMDGPUInstructionSelector::selectVOP3NoMods(MachineOperand &Root) const {
4581 Register Reg = Root.getReg();
4582 const MachineInstr *Def = getDefIgnoringCopies(Reg, MRI: *MRI);
4583 if (Def->getOpcode() == AMDGPU::G_FNEG || Def->getOpcode() == AMDGPU::G_FABS)
4584 return {};
4585 return {{
4586 [=](MachineInstrBuilder &MIB) { MIB.addReg(RegNo: Reg); },
4587 }};
4588}
4589
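// Describes how a candidate source relates to the original operand while
// looking through fneg/trunc/shift/unmerge chains: the *_HALF states select
// one half of a wider value (typically a 16-bit half of a 32-bit register,
// for op_sel), and the *_NEG states record pending negations of the high
// and/or low half. NEG_START/NEG_END and HALF_START/HALF_END are the
// inclusive ranges tested by SearchOptions::checkOptions() below.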
4590enum class SrcStatus {
4591 IS_SAME,
4592 IS_UPPER_HALF,
4593 IS_LOWER_HALF,
4594 IS_UPPER_HALF_NEG,
4595 // IS_LOWER_HALF_NEG: current op = [op_upper, op_lower] and src = -op_lower.
4596 IS_LOWER_HALF_NEG,
4597 IS_HI_NEG,
4598 // IS_LO_NEG: current op = [op_upper, op_lower] and src = [op_upper,
4599 // -op_lower].
4600 IS_LO_NEG,
4601 IS_BOTH_NEG,
4602 INVALID,
4603 NEG_START = IS_UPPER_HALF_NEG,
4604 NEG_END = IS_BOTH_NEG,
4605 HALF_START = IS_UPPER_HALF,
4606 HALF_END = IS_LOWER_HALF_NEG
4607};
4608/// Test if the MI is truncating to half, such as `%reg0:n = G_TRUNC %reg1:2n`
4609static bool isTruncHalf(const MachineInstr *MI,
4610 const MachineRegisterInfo &MRI) {
4611 if (MI->getOpcode() != AMDGPU::G_TRUNC)
4612 return false;
4613
4614 unsigned DstSize = MRI.getType(Reg: MI->getOperand(i: 0).getReg()).getSizeInBits();
4615 unsigned SrcSize = MRI.getType(Reg: MI->getOperand(i: 1).getReg()).getSizeInBits();
4616 return DstSize * 2 == SrcSize;
4617}
4618
4619/// Test if the MI is a logical shift right by half the bit width,
4620/// such as `%reg0:2n = G_LSHR %reg1:2n, CONST(n)`
4621static bool isLshrHalf(const MachineInstr *MI, const MachineRegisterInfo &MRI) {
4622 if (MI->getOpcode() != AMDGPU::G_LSHR)
4623 return false;
4624
4625 Register ShiftSrc;
4626 std::optional<ValueAndVReg> ShiftAmt;
4627 if (mi_match(R: MI->getOperand(i: 0).getReg(), MRI,
4628 P: m_GLShr(L: m_Reg(R&: ShiftSrc), R: m_GCst(ValReg&: ShiftAmt)))) {
4629 unsigned SrcSize = MRI.getType(Reg: MI->getOperand(i: 1).getReg()).getSizeInBits();
4630 unsigned Shift = ShiftAmt->Value.getZExtValue();
4631 return Shift * 2 == SrcSize;
4632 }
4633 return false;
4634}
4635
4636/// Test if the MI is a shift left by half the bit width,
4637/// such as `%reg0:2n = G_SHL %reg1:2n, CONST(n)`
4638static bool isShlHalf(const MachineInstr *MI, const MachineRegisterInfo &MRI) {
4639 if (MI->getOpcode() != AMDGPU::G_SHL)
4640 return false;
4641
4642 Register ShiftSrc;
4643 std::optional<ValueAndVReg> ShiftAmt;
4644 if (mi_match(R: MI->getOperand(i: 0).getReg(), MRI,
4645 P: m_GShl(L: m_Reg(R&: ShiftSrc), R: m_GCst(ValReg&: ShiftAmt)))) {
4646 unsigned SrcSize = MRI.getType(Reg: MI->getOperand(i: 1).getReg()).getSizeInBits();
4647 unsigned Shift = ShiftAmt->Value.getZExtValue();
4648 return Shift * 2 == SrcSize;
4649 }
4650 return false;
4651}
4652
4653/// Test if the MI is `%reg0:n, %reg1:n = G_UNMERGE_VALUES %reg2:2n`.
4654static bool isUnmergeHalf(const MachineInstr *MI,
4655 const MachineRegisterInfo &MRI) {
4656 if (MI->getOpcode() != AMDGPU::G_UNMERGE_VALUES)
4657 return false;
4658 return MI->getNumOperands() == 3 && MI->getOperand(i: 0).isDef() &&
4659 MI->getOperand(i: 1).isDef() && !MI->getOperand(i: 2).isDef();
4660}
4661
4662enum class TypeClass { VECTOR_OF_TWO, SCALAR, NONE_OF_LISTED };
4663
4664static TypeClass isVectorOfTwoOrScalar(Register Reg,
4665 const MachineRegisterInfo &MRI) {
4666 LLT OpTy = MRI.getType(Reg);
4667 if (OpTy.isScalar())
4668 return TypeClass::SCALAR;
4669 if (OpTy.isVector() && OpTy.getNumElements() == 2)
4670 return TypeClass::VECTOR_OF_TWO;
4671 return TypeClass::NONE_OF_LISTED;
4672}
4673
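/// Compute the status of \p Reg relative to the operand of a G_FNEG that is
/// being looked through. An fneg of a <2 x Type> vector negates both halves,
/// while an fneg of a scalar only flips the sign bit, i.e. only the high
/// half, which is why the two cases are handled separately below.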
4674static SrcStatus getNegStatus(Register Reg, SrcStatus S,
4675 const MachineRegisterInfo &MRI) {
4676 TypeClass NegType = isVectorOfTwoOrScalar(Reg, MRI);
4677 if (NegType != TypeClass::VECTOR_OF_TWO && NegType != TypeClass::SCALAR)
4678 return SrcStatus::INVALID;
4679
4680 switch (S) {
4681 case SrcStatus::IS_SAME:
4682 if (NegType == TypeClass::VECTOR_OF_TWO) {
4683 // Vector of 2:
4684 // [SrcHi, SrcLo] = [CurrHi, CurrLo]
4685 // [CurrHi, CurrLo] = neg [OpHi, OpLo](2 x Type)
4686 // [CurrHi, CurrLo] = [-OpHi, -OpLo](2 x Type)
4687 // [SrcHi, SrcLo] = [-OpHi, -OpLo]
4688 return SrcStatus::IS_BOTH_NEG;
4689 }
4690 if (NegType == TypeClass::SCALAR) {
4691 // Scalar:
4692 // [SrcHi, SrcLo] = [CurrHi, CurrLo]
4693 // [CurrHi, CurrLo] = neg [OpHi, OpLo](Type)
4694 // [CurrHi, CurrLo] = [-OpHi, OpLo](Type)
4695 // [SrcHi, SrcLo] = [-OpHi, OpLo]
4696 return SrcStatus::IS_HI_NEG;
4697 }
4698 break;
4699 case SrcStatus::IS_HI_NEG:
4700 if (NegType == TypeClass::VECTOR_OF_TWO) {
4701 // Vector of 2:
4702 // [SrcHi, SrcLo] = [-CurrHi, CurrLo]
4703 // [CurrHi, CurrLo] = neg [OpHi, OpLo](2 x Type)
4704 // [CurrHi, CurrLo] = [-OpHi, -OpLo](2 x Type)
4705 // [SrcHi, SrcLo] = [-(-OpHi), -OpLo] = [OpHi, -OpLo]
4706 return SrcStatus::IS_LO_NEG;
4707 }
4708 if (NegType == TypeClass::SCALAR) {
4709 // Scalar:
4710 // [SrcHi, SrcLo] = [-CurrHi, CurrLo]
4711 // [CurrHi, CurrLo] = neg [OpHi, OpLo](Type)
4712 // [CurrHi, CurrLo] = [-OpHi, OpLo](Type)
4713 // [SrcHi, SrcLo] = [-(-OpHi), OpLo] = [OpHi, OpLo]
4714 return SrcStatus::IS_SAME;
4715 }
4716 break;
4717 case SrcStatus::IS_LO_NEG:
4718 if (NegType == TypeClass::VECTOR_OF_TWO) {
4719 // Vector of 2:
4720 // [SrcHi, SrcLo] = [CurrHi, -CurrLo]
4721 // [CurrHi, CurrLo] = fneg [OpHi, OpLo](2 x Type)
4722 // [CurrHi, CurrLo] = [-OpHi, -OpLo](2 x Type)
4723 // [SrcHi, SrcLo] = [-OpHi, -(-OpLo)] = [-OpHi, OpLo]
4724 return SrcStatus::IS_HI_NEG;
4725 }
4726 if (NegType == TypeClass::SCALAR) {
4727 // Scalar:
4728 // [SrcHi, SrcLo] = [CurrHi, -CurrLo]
4729 // [CurrHi, CurrLo] = fneg [OpHi, OpLo](Type)
4730 // [CurrHi, CurrLo] = [-OpHi, OpLo](Type)
4731 // [SrcHi, SrcLo] = [-OpHi, -OpLo]
4732 return SrcStatus::IS_BOTH_NEG;
4733 }
4734 break;
4735 case SrcStatus::IS_BOTH_NEG:
4736 if (NegType == TypeClass::VECTOR_OF_TWO) {
4737 // Vector of 2:
4738 // [SrcHi, SrcLo] = [-CurrHi, -CurrLo]
4739 // [CurrHi, CurrLo] = fneg [OpHi, OpLo](2 x Type)
4740 // [CurrHi, CurrLo] = [-OpHi, -OpLo](2 x Type)
4741 // [SrcHi, SrcLo] = [OpHi, OpLo]
4742 return SrcStatus::IS_SAME;
4743 }
4744 if (NegType == TypeClass::SCALAR) {
4745 // Scalar:
4746 // [SrcHi, SrcLo] = [-CurrHi, -CurrLo]
4747 // [CurrHi, CurrLo] = fneg [OpHi, OpLo](Type)
4748 // [CurrHi, CurrLo] = [-OpHi, OpLo](Type)
4749 // [SrcHi, SrcLo] = [OpHi, -OpLo]
4750 return SrcStatus::IS_LO_NEG;
4751 }
4752 break;
4753 case SrcStatus::IS_UPPER_HALF:
4754 // Vector of 2:
4755 // Src = CurrUpper
4756 // Curr = [CurrUpper, CurrLower]
4757 // [CurrUpper, CurrLower] = fneg [OpUpper, OpLower](2 x Type)
4758 // [CurrUpper, CurrLower] = [-OpUpper, -OpLower](2 x Type)
4759 // Src = -OpUpper
4760 //
4761 // Scalar:
4762 // Src = CurrUpper
4763 // Curr = [CurrUpper, CurrLower]
4764 // [CurrUpper, CurrLower] = fneg [OpUpper, OpLower](Type)
4765 // [CurrUpper, CurrLower] = [-OpUpper, OpLower](Type)
4766 // Src = -OpUpper
4767 return SrcStatus::IS_UPPER_HALF_NEG;
4768 case SrcStatus::IS_LOWER_HALF:
4769 if (NegType == TypeClass::VECTOR_OF_TWO) {
4770 // Vector of 2:
4771 // Src = CurrLower
4772 // Curr = [CurrUpper, CurrLower]
4773 // [CurrUpper, CurrLower] = fneg [OpUpper, OpLower](2 x Type)
4774 // [CurrUpper, CurrLower] = [-OpUpper, -OpLower](2 x Type)
4775 // Src = -OpLower
4776 return SrcStatus::IS_LOWER_HALF_NEG;
4777 }
4778 if (NegType == TypeClass::SCALAR) {
4779 // Scalar:
4780 // Src = CurrLower
4781 // Curr = [CurrUpper, CurrLower]
4782 // [CurrUpper, CurrLower] = fneg [OpUpper, OpLower](Type)
4783 // [CurrUpper, CurrLower] = [-OpUpper, OpLower](Type)
4784 // Src = OpLower
4785 return SrcStatus::IS_LOWER_HALF;
4786 }
4787 break;
4788 case SrcStatus::IS_UPPER_HALF_NEG:
4789 // Vector of 2:
4790 // Src = -CurrUpper
4791 // Curr = [CurrUpper, CurrLower]
4792 // [CurrUpper, CurrLower] = fneg [OpUpper, OpLower](2 x Type)
4793 // [CurrUpper, CurrLower] = [-OpUpper, -OpLower](2 x Type)
4794 // Src = -(-OpUpper) = OpUpper
4795 //
4796 // Scalar:
4797 // Src = -CurrUpper
4798 // Curr = [CurrUpper, CurrLower]
4799 // [CurrUpper, CurrLower] = fneg [OpUpper, OpLower](Type)
4800 // [CurrUpper, CurrLower] = [-OpUpper, OpLower](Type)
4801 // Src = -(-OpUpper) = OpUpper
4802 return SrcStatus::IS_UPPER_HALF;
4803 case SrcStatus::IS_LOWER_HALF_NEG:
4804 if (NegType == TypeClass::VECTOR_OF_TWO) {
4805 // Vector of 2:
4806 // Src = -CurrLower
4807 // Curr = [CurrUpper, CurrLower]
4808 // [CurrUpper, CurrLower] = fneg [OpUpper, OpLower](2 x Type)
4809 // [CurrUpper, CurrLower] = [-OpUpper, -OpLower](2 x Type)
4810 // Src = -(-OpLower) = OpLower
4811 return SrcStatus::IS_LOWER_HALF;
4812 }
4813 if (NegType == TypeClass::SCALAR) {
4814 // Scalar:
4815 // Src = -CurrLower
4816 // Curr = [CurrUpper, CurrLower]
4817 // [CurrUpper, CurrLower] = fneg [OpUpper, OpLower](Type)
4818 // [CurrUpper, CurrLower] = [-OpUpper, OpLower](Type)
4819 // Src = -OpLower
4820 return SrcStatus::IS_LOWER_HALF_NEG;
4821 }
4822 break;
4823 default:
4824 break;
4825 }
4826 llvm_unreachable("unexpected SrcStatus & NegType combination");
4827}
4828
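/// Advance one step along the use-def chain, returning the next
/// (register, status) pair or std::nullopt when the walk cannot continue.
/// Illustrative example (assumed MIR): starting from {%r, IS_SAME} with
///   %r:_(s16) = G_TRUNC %x:_(s32)
/// the next state is {%x, IS_LOWER_HALF}.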
4829static std::optional<std::pair<Register, SrcStatus>>
4830calcNextStatus(std::pair<Register, SrcStatus> Curr,
4831 const MachineRegisterInfo &MRI) {
4832 const MachineInstr *MI = MRI.getVRegDef(Reg: Curr.first);
4833
4834 unsigned Opc = MI->getOpcode();
4835
4836 // Handle general Opc cases.
4837 switch (Opc) {
4838 case AMDGPU::G_BITCAST:
4839 return std::optional<std::pair<Register, SrcStatus>>(
4840 {MI->getOperand(i: 1).getReg(), Curr.second});
4841 case AMDGPU::COPY:
4842 if (MI->getOperand(i: 1).getReg().isPhysical())
4843 return std::nullopt;
4844 return std::optional<std::pair<Register, SrcStatus>>(
4845 {MI->getOperand(i: 1).getReg(), Curr.second});
4846 case AMDGPU::G_FNEG: {
4847 SrcStatus Stat = getNegStatus(Reg: Curr.first, S: Curr.second, MRI);
4848 if (Stat == SrcStatus::INVALID)
4849 return std::nullopt;
4850 return std::optional<std::pair<Register, SrcStatus>>(
4851 {MI->getOperand(i: 1).getReg(), Stat});
4852 }
4853 default:
4854 break;
4855 }
4856
4857 // Calc next Stat from current Stat.
4858 switch (Curr.second) {
4859 case SrcStatus::IS_SAME:
4860 if (isTruncHalf(MI, MRI))
4861 return std::optional<std::pair<Register, SrcStatus>>(
4862 {MI->getOperand(i: 1).getReg(), SrcStatus::IS_LOWER_HALF});
4863 else if (isUnmergeHalf(MI, MRI)) {
4864 if (Curr.first == MI->getOperand(i: 0).getReg())
4865 return std::optional<std::pair<Register, SrcStatus>>(
4866 {MI->getOperand(i: 2).getReg(), SrcStatus::IS_LOWER_HALF});
4867 return std::optional<std::pair<Register, SrcStatus>>(
4868 {MI->getOperand(i: 2).getReg(), SrcStatus::IS_UPPER_HALF});
4869 }
4870 break;
4871 case SrcStatus::IS_HI_NEG:
4872 if (isTruncHalf(MI, MRI)) {
4873 // [SrcHi, SrcLo] = [-CurrHi, CurrLo]
4874 // [CurrHi, CurrLo] = trunc [OpUpper, OpLower] = OpLower
4875 // = [OpLowerHi, OpLowerLo]
4876 // Src = [SrcHi, SrcLo] = [-CurrHi, CurrLo]
4877 // = [-OpLowerHi, OpLowerLo]
4878 // = -OpLower
4879 return std::optional<std::pair<Register, SrcStatus>>(
4880 {MI->getOperand(i: 1).getReg(), SrcStatus::IS_LOWER_HALF_NEG});
4881 }
4882 if (isUnmergeHalf(MI, MRI)) {
4883 if (Curr.first == MI->getOperand(i: 0).getReg())
4884 return std::optional<std::pair<Register, SrcStatus>>(
4885 {MI->getOperand(i: 2).getReg(), SrcStatus::IS_LOWER_HALF_NEG});
4886 return std::optional<std::pair<Register, SrcStatus>>(
4887 {MI->getOperand(i: 2).getReg(), SrcStatus::IS_UPPER_HALF_NEG});
4888 }
4889 break;
4890 case SrcStatus::IS_UPPER_HALF:
4891 if (isShlHalf(MI, MRI))
4892 return std::optional<std::pair<Register, SrcStatus>>(
4893 {MI->getOperand(i: 1).getReg(), SrcStatus::IS_LOWER_HALF});
4894 break;
4895 case SrcStatus::IS_LOWER_HALF:
4896 if (isLshrHalf(MI, MRI))
4897 return std::optional<std::pair<Register, SrcStatus>>(
4898 {MI->getOperand(i: 1).getReg(), SrcStatus::IS_UPPER_HALF});
4899 break;
4900 case SrcStatus::IS_UPPER_HALF_NEG:
4901 if (isShlHalf(MI, MRI))
4902 return std::optional<std::pair<Register, SrcStatus>>(
4903 {MI->getOperand(i: 1).getReg(), SrcStatus::IS_LOWER_HALF_NEG});
4904 break;
4905 case SrcStatus::IS_LOWER_HALF_NEG:
4906 if (isLshrHalf(MI, MRI))
4907 return std::optional<std::pair<Register, SrcStatus>>(
4908 {MI->getOperand(i: 1).getReg(), SrcStatus::IS_UPPER_HALF_NEG});
4909 break;
4910 default:
4911 break;
4912 }
4913 return std::nullopt;
4914}
4915
4916/// This is used to control which source-modifier statuses the current MI
4917/// supports. For example, a non-floating-point intrinsic such as
4918/// @llvm.amdgcn.sdot2 does not support the NEG bits on VOP3P.
4919/// The class can be further extended to recognize support for the SEL, NEG and
4920/// ABS bits for different MIs on different architectures.
4921class SearchOptions {
4922private:
4923 bool HasNeg = false;
4924 // Assume all VOP3P complex patterns have op_sel.
4925 bool HasOpsel = true;
4926
4927public:
4928 SearchOptions(Register Reg, const MachineRegisterInfo &MRI) {
4929 const MachineInstr *MI = MRI.getVRegDef(Reg);
4930 unsigned Opc = MI->getOpcode();
4931
4932 if (Opc < TargetOpcode::GENERIC_OP_END) {
4933 // Generic opcodes are assumed to support the NEG bits.
4934 HasNeg = true;
4935 } else if (Opc == TargetOpcode::G_INTRINSIC) {
4936 Intrinsic::ID IntrinsicID = cast<GIntrinsic>(Val: *MI).getIntrinsicID();
4937 // Only floating-point intrinsics have the neg & neg_hi bits.
4938 if (IntrinsicID == Intrinsic::amdgcn_fdot2)
4939 HasNeg = true;
4940 }
4941 }
4942 bool checkOptions(SrcStatus Stat) const {
4943 if (!HasNeg &&
4944 (Stat >= SrcStatus::NEG_START && Stat <= SrcStatus::NEG_END)) {
4945 return false;
4946 }
4947 if (!HasOpsel &&
4948 (Stat >= SrcStatus::HALF_START && Stat <= SrcStatus::HALF_END)) {
4949 return false;
4950 }
4951 return true;
4952 }
4953};
4954
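/// Walk up to \p MaxDepth steps along the use-def chain starting at \p Reg and
/// collect every (register, status) pair that passes \p SO.checkOptions().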
4955static SmallVector<std::pair<Register, SrcStatus>>
4956getSrcStats(Register Reg, const MachineRegisterInfo &MRI, SearchOptions SO,
4957 int MaxDepth = 3) {
4958 int Depth = 0;
4959 auto Curr = calcNextStatus(Curr: {Reg, SrcStatus::IS_SAME}, MRI);
4960 SmallVector<std::pair<Register, SrcStatus>> Statlist;
4961
4962 while (Depth <= MaxDepth && Curr.has_value()) {
4963 Depth++;
4964 if (SO.checkOptions(Stat: Curr.value().second))
4965 Statlist.push_back(Elt: Curr.value());
4966 Curr = calcNextStatus(Curr: Curr.value(), MRI);
4967 }
4968
4969 return Statlist;
4970}
4971
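/// Like getSrcStats(), but only track the deepest (register, status) pair
/// whose status still describes the whole value (IS_SAME, IS_HI_NEG,
/// IS_LO_NEG or IS_BOTH_NEG), defaulting to {Reg, IS_SAME}.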
4972static std::pair<Register, SrcStatus>
4973getLastSameOrNeg(Register Reg, const MachineRegisterInfo &MRI, SearchOptions SO,
4974 int MaxDepth = 3) {
4975 int Depth = 0;
4976 std::pair<Register, SrcStatus> LastSameOrNeg = {Reg, SrcStatus::IS_SAME};
4977 auto Curr = calcNextStatus(Curr: LastSameOrNeg, MRI);
4978
4979 while (Depth <= MaxDepth && Curr.has_value()) {
4980 Depth++;
4981 SrcStatus Stat = Curr.value().second;
4982 if (SO.checkOptions(Stat)) {
4983 if (Stat == SrcStatus::IS_SAME || Stat == SrcStatus::IS_HI_NEG ||
4984 Stat == SrcStatus::IS_LO_NEG || Stat == SrcStatus::IS_BOTH_NEG)
4985 LastSameOrNeg = Curr.value();
4986 }
4987 Curr = calcNextStatus(Curr: Curr.value(), MRI);
4988 }
4989
4990 return LastSameOrNeg;
4991}
4992
4993static bool isSameBitWidth(Register Reg1, Register Reg2,
4994 const MachineRegisterInfo &MRI) {
4995 unsigned Width1 = MRI.getType(Reg: Reg1).getSizeInBits();
4996 unsigned Width2 = MRI.getType(Reg: Reg2).getSizeInBits();
4997 return Width1 == Width2;
4998}
4999
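// Translate the (HiStat, LoStat) pair describing where the two packed halves
// come from into VOP3P op_sel / neg modifier bits. For example, HiStat ==
// IS_UPPER_HALF with LoStat == IS_LOWER_HALF only adds OP_SEL_1, i.e. the
// standard packed layout.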
5000static unsigned updateMods(SrcStatus HiStat, SrcStatus LoStat, unsigned Mods) {
5001 // SrcStatus::IS_LOWER_HALF contributes no modifier bits.
5002 if (HiStat == SrcStatus::IS_UPPER_HALF_NEG) {
5003 Mods ^= SISrcMods::NEG_HI;
5004 Mods |= SISrcMods::OP_SEL_1;
5005 } else if (HiStat == SrcStatus::IS_UPPER_HALF)
5006 Mods |= SISrcMods::OP_SEL_1;
5007 else if (HiStat == SrcStatus::IS_LOWER_HALF_NEG)
5008 Mods ^= SISrcMods::NEG_HI;
5009 else if (HiStat == SrcStatus::IS_HI_NEG)
5010 Mods ^= SISrcMods::NEG_HI;
5011
5012 if (LoStat == SrcStatus::IS_UPPER_HALF_NEG) {
5013 Mods ^= SISrcMods::NEG;
5014 Mods |= SISrcMods::OP_SEL_0;
5015 } else if (LoStat == SrcStatus::IS_UPPER_HALF)
5016 Mods |= SISrcMods::OP_SEL_0;
5017 else if (LoStat == SrcStatus::IS_LOWER_HALF_NEG)
5018 Mods |= SISrcMods::NEG;
5019 else if (LoStat == SrcStatus::IS_HI_NEG)
5020 Mods ^= SISrcMods::NEG;
5021
5022 return Mods;
5023}
5024
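// A (HiStat, LoStat) pair can be folded into a single packed source only if
// both statuses refer to a half of the candidate register and the candidate
// has the same bit width as the root operand.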
5025static bool isValidToPack(SrcStatus HiStat, SrcStatus LoStat, Register NewReg,
5026 Register RootReg, const SIInstrInfo &TII,
5027 const MachineRegisterInfo &MRI) {
5028 auto IsHalfState = [](SrcStatus S) {
5029 return S == SrcStatus::IS_UPPER_HALF || S == SrcStatus::IS_UPPER_HALF_NEG ||
5030 S == SrcStatus::IS_LOWER_HALF || S == SrcStatus::IS_LOWER_HALF_NEG;
5031 };
5032 return isSameBitWidth(Reg1: NewReg, Reg2: RootReg, MRI) && IsHalfState(LoStat) &&
5033 IsHalfState(HiStat);
5034}
5035
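// Illustrative example (assumed MIR): for a <2 x s16> root built as
//   %c16:_(s32) = G_CONSTANT i32 16
//   %s:_(s32) = G_LSHR %x:_(s32), %c16
//   %hi:_(s16) = G_TRUNC %s
//   %lo:_(s16) = G_TRUNC %x
//   %v:_(<2 x s16>) = G_BUILD_VECTOR %lo, %hi
// the search below finds %x for both halves, so %v can be selected as a
// direct use of %x with the default OP_SEL_1 modifiers instead of
// materializing the pack.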
5036std::pair<Register, unsigned> AMDGPUInstructionSelector::selectVOP3PModsImpl(
5037 Register RootReg, const MachineRegisterInfo &MRI, bool IsDOT) const {
5038 unsigned Mods = 0;
5039 // No modifiers if the Root type is not of the form <2 x Type>.
5040 if (isVectorOfTwoOrScalar(Reg: RootReg, MRI) != TypeClass::VECTOR_OF_TWO) {
5041 Mods |= SISrcMods::OP_SEL_1;
5042 return {RootReg, Mods};
5043 }
5044
5045 SearchOptions SO(RootReg, MRI);
5046
5047 std::pair<Register, SrcStatus> Stat = getLastSameOrNeg(Reg: RootReg, MRI, SO);
5048
5049 if (Stat.second == SrcStatus::IS_BOTH_NEG)
5050 Mods ^= (SISrcMods::NEG | SISrcMods::NEG_HI);
5051 else if (Stat.second == SrcStatus::IS_HI_NEG)
5052 Mods ^= SISrcMods::NEG_HI;
5053 else if (Stat.second == SrcStatus::IS_LO_NEG)
5054 Mods ^= SISrcMods::NEG;
5055
5056 MachineInstr *MI = MRI.getVRegDef(Reg: Stat.first);
5057
5058 if (MI->getOpcode() != AMDGPU::G_BUILD_VECTOR || MI->getNumOperands() != 3 ||
5059 (IsDOT && Subtarget->hasDOTOpSelHazard())) {
5060 Mods |= SISrcMods::OP_SEL_1;
5061 return {Stat.first, Mods};
5062 }
5063
5064 SmallVector<std::pair<Register, SrcStatus>> StatlistHi =
5065 getSrcStats(Reg: MI->getOperand(i: 2).getReg(), MRI, SO);
5066
5067 if (StatlistHi.empty()) {
5068 Mods |= SISrcMods::OP_SEL_1;
5069 return {Stat.first, Mods};
5070 }
5071
5072 SmallVector<std::pair<Register, SrcStatus>> StatlistLo =
5073 getSrcStats(Reg: MI->getOperand(i: 1).getReg(), MRI, SO);
5074
5075 if (StatlistLo.empty()) {
5076 Mods |= SISrcMods::OP_SEL_1;
5077 return {Stat.first, Mods};
5078 }
5079
5080 for (int I = StatlistHi.size() - 1; I >= 0; I--) {
5081 for (int J = StatlistLo.size() - 1; J >= 0; J--) {
5082 if (StatlistHi[I].first == StatlistLo[J].first &&
5083 isValidToPack(HiStat: StatlistHi[I].second, LoStat: StatlistLo[J].second,
5084 NewReg: StatlistHi[I].first, RootReg, TII, MRI))
5085 return {StatlistHi[I].first,
5086 updateMods(HiStat: StatlistHi[I].second, LoStat: StatlistLo[J].second, Mods)};
5087 }
5088 }
5089 // Packed instructions do not have abs modifiers.
5090 Mods |= SISrcMods::OP_SEL_1;
5091
5092 return {Stat.first, Mods};
5093}
5094
5097static bool checkRB(Register Reg, unsigned int RBNo,
5098 const AMDGPURegisterBankInfo &RBI,
5099 const MachineRegisterInfo &MRI,
5100 const TargetRegisterInfo &TRI) {
5101 const RegisterBank *RB = RBI.getRegBank(Reg, MRI, TRI);
5102 return RB->getID() == RBNo;
5103}
5104
5105// This function is used to get the correct register bank for the returned reg.
5106// Assume:
5107// 1. VOP3P is always legal for VGPR.
5108// 2. RootOp's regbank is legal.
5109// Thus
5110// 1. If RootOp is SGPR, then NewOp can be SGPR or VGPR.
5111// 2. If RootOp is VGPR, then NewOp must be VGPR.
5112static Register getLegalRegBank(Register NewReg, Register RootReg,
5113 const AMDGPURegisterBankInfo &RBI,
5114 MachineRegisterInfo &MRI,
5115 const TargetRegisterInfo &TRI,
5116 const SIInstrInfo &TII) {
5117 // RootOp can only be VGPR or SGPR (some hand-written cases such as
5118 // inst-select-ashr.v2s16.mir::ashr_v2s16_vs).
5119 if (checkRB(Reg: RootReg, RBNo: AMDGPU::SGPRRegBankID, RBI, MRI, TRI) ||
5120 checkRB(Reg: NewReg, RBNo: AMDGPU::VGPRRegBankID, RBI, MRI, TRI))
5121 return NewReg;
5122
5123 MachineInstr *MI = MRI.getVRegDef(Reg: RootReg);
5124 if (MI->getOpcode() == AMDGPU::COPY && NewReg == MI->getOperand(i: 1).getReg()) {
5125 // RootOp is VGPR, NewOp is not VGPR, but RootOp = COPY NewOp.
5126 return RootReg;
5127 }
5128
5129 MachineBasicBlock *BB = MI->getParent();
5130 Register DstReg = MRI.cloneVirtualRegister(VReg: RootReg);
5131
5132 MachineInstrBuilder MIB =
5133 BuildMI(BB&: *BB, I: MI, MIMD: MI->getDebugLoc(), MCID: TII.get(Opcode: AMDGPU::COPY), DestReg: DstReg)
5134 .addReg(RegNo: NewReg);
5135
5136 // Only accept VGPR.
5137 return MIB->getOperand(i: 0).getReg();
5138}
5139
5140InstructionSelector::ComplexRendererFns
5141AMDGPUInstructionSelector::selectVOP3PRetHelper(MachineOperand &Root,
5142 bool IsDOT) const {
5143 MachineRegisterInfo &MRI = Root.getParent()->getMF()->getRegInfo();
5144 Register Reg;
5145 unsigned Mods;
5146 std::tie(args&: Reg, args&: Mods) = selectVOP3PModsImpl(RootReg: Root.getReg(), MRI, IsDOT);
5147
5148 Reg = getLegalRegBank(NewReg: Reg, RootReg: Root.getReg(), RBI, MRI, TRI, TII);
5149 return {{
5150 [=](MachineInstrBuilder &MIB) { MIB.addReg(RegNo: Reg); },
5151 [=](MachineInstrBuilder &MIB) { MIB.addImm(Val: Mods); } // src_mods
5152 }};
5153}
5154
5155InstructionSelector::ComplexRendererFns
5156AMDGPUInstructionSelector::selectVOP3PMods(MachineOperand &Root) const {
5157
5158 return selectVOP3PRetHelper(Root);
5159}
5160
5161InstructionSelector::ComplexRendererFns
5162AMDGPUInstructionSelector::selectVOP3PModsDOT(MachineOperand &Root) const {
5163
5164 return selectVOP3PRetHelper(Root, IsDOT: true);
5165}
5166
5167InstructionSelector::ComplexRendererFns
5168AMDGPUInstructionSelector::selectWMMAOpSelVOP3PMods(
5169 MachineOperand &Root) const {
5170 assert((Root.isImm() && (Root.getImm() == -1 || Root.getImm() == 0)) &&
5171 "expected i1 value");
5172 unsigned Mods = SISrcMods::OP_SEL_1;
5173 if (Root.getImm() != 0)
5174 Mods |= SISrcMods::OP_SEL_0;
5175
5176 return {{
5177 [=](MachineInstrBuilder &MIB) { MIB.addImm(Val: Mods); } // src_mods
5178 }};
5179}
5180
5181static Register buildRegSequence(SmallVectorImpl<Register> &Elts,
5182 MachineInstr *InsertPt,
5183 MachineRegisterInfo &MRI) {
5184 const TargetRegisterClass *DstRegClass;
5185 switch (Elts.size()) {
5186 case 8:
5187 DstRegClass = &AMDGPU::VReg_256RegClass;
5188 break;
5189 case 4:
5190 DstRegClass = &AMDGPU::VReg_128RegClass;
5191 break;
5192 case 2:
5193 DstRegClass = &AMDGPU::VReg_64RegClass;
5194 break;
5195 default:
5196 llvm_unreachable("unhandled Reg sequence size");
5197 }
5198
5199 MachineIRBuilder B(*InsertPt);
5200 auto MIB = B.buildInstr(Opcode: AMDGPU::REG_SEQUENCE)
5201 .addDef(RegNo: MRI.createVirtualRegister(RegClass: DstRegClass));
5202 for (unsigned i = 0; i < Elts.size(); ++i) {
5203 MIB.addReg(RegNo: Elts[i]);
5204 MIB.addImm(Val: SIRegisterInfo::getSubRegFromChannel(Channel: i));
5205 }
5206 return MIB->getOperand(i: 0).getReg();
5207}
5208
5209static void selectWMMAModsNegAbs(unsigned ModOpcode, unsigned &Mods,
5210 SmallVectorImpl<Register> &Elts, Register &Src,
5211 MachineInstr *InsertPt,
5212 MachineRegisterInfo &MRI) {
5213 if (ModOpcode == TargetOpcode::G_FNEG) {
5214 Mods |= SISrcMods::NEG;
5215 // Check if all elements also have abs modifier
5216 SmallVector<Register, 8> NegAbsElts;
5217 for (auto El : Elts) {
5218 Register FabsSrc;
5219 if (!mi_match(R: El, MRI, P: m_GFabs(Src: m_Reg(R&: FabsSrc))))
5220 break;
5221 NegAbsElts.push_back(Elt: FabsSrc);
5222 }
5223 if (Elts.size() != NegAbsElts.size()) {
5224 // Neg
5225 Src = buildRegSequence(Elts, InsertPt, MRI);
5226 } else {
5227 // Neg and Abs
5228 Mods |= SISrcMods::NEG_HI;
5229 Src = buildRegSequence(Elts&: NegAbsElts, InsertPt, MRI);
5230 }
5231 } else {
5232 assert(ModOpcode == TargetOpcode::G_FABS);
5233 // Abs
5234 Mods |= SISrcMods::NEG_HI;
5235 Src = buildRegSequence(Elts, InsertPt, MRI);
5236 }
5237}
5238
5239InstructionSelector::ComplexRendererFns
5240AMDGPUInstructionSelector::selectWMMAModsF32NegAbs(MachineOperand &Root) const {
5241 Register Src = Root.getReg();
5242 unsigned Mods = SISrcMods::OP_SEL_1;
5243 SmallVector<Register, 8> EltsF32;
5244
5245 if (GBuildVector *BV = dyn_cast<GBuildVector>(Val: MRI->getVRegDef(Reg: Src))) {
5246 assert(BV->getNumSources() > 0);
5247 // Based on first element decide which mod we match, neg or abs
5248 MachineInstr *ElF32 = MRI->getVRegDef(Reg: BV->getSourceReg(I: 0));
5249 unsigned ModOpcode = (ElF32->getOpcode() == AMDGPU::G_FNEG)
5250 ? AMDGPU::G_FNEG
5251 : AMDGPU::G_FABS;
5252 for (unsigned i = 0; i < BV->getNumSources(); ++i) {
5253 ElF32 = MRI->getVRegDef(Reg: BV->getSourceReg(I: i));
5254 if (ElF32->getOpcode() != ModOpcode)
5255 break;
5256 EltsF32.push_back(Elt: ElF32->getOperand(i: 1).getReg());
5257 }
5258
5259 // All elements had ModOpcode modifier
5260 if (BV->getNumSources() == EltsF32.size()) {
5261 selectWMMAModsNegAbs(ModOpcode, Mods, Elts&: EltsF32, Src, InsertPt: Root.getParent(),
5262 MRI&: *MRI);
5263 }
5264 }
5265
5266 return {{[=](MachineInstrBuilder &MIB) { MIB.addReg(RegNo: Src); },
5267 [=](MachineInstrBuilder &MIB) { MIB.addImm(Val: Mods); }}};
5268}
5269
5270InstructionSelector::ComplexRendererFns
5271AMDGPUInstructionSelector::selectWMMAModsF16Neg(MachineOperand &Root) const {
5272 Register Src = Root.getReg();
5273 unsigned Mods = SISrcMods::OP_SEL_1;
5274 SmallVector<Register, 8> EltsV2F16;
5275
5276 if (GConcatVectors *CV = dyn_cast<GConcatVectors>(Val: MRI->getVRegDef(Reg: Src))) {
5277 for (unsigned i = 0; i < CV->getNumSources(); ++i) {
5278 Register FNegSrc;
5279 if (!mi_match(R: CV->getSourceReg(I: i), MRI: *MRI, P: m_GFNeg(Src: m_Reg(R&: FNegSrc))))
5280 break;
5281 EltsV2F16.push_back(Elt: FNegSrc);
5282 }
5283
5284 // All elements had ModOpcode modifier
5285 if (CV->getNumSources() == EltsV2F16.size()) {
5286 Mods |= SISrcMods::NEG;
5287 Mods |= SISrcMods::NEG_HI;
5288 Src = buildRegSequence(Elts&: EltsV2F16, InsertPt: Root.getParent(), MRI&: *MRI);
5289 }
5290 }
5291
5292 return {{[=](MachineInstrBuilder &MIB) { MIB.addReg(RegNo: Src); },
5293 [=](MachineInstrBuilder &MIB) { MIB.addImm(Val: Mods); }}};
5294}
5295
5296InstructionSelector::ComplexRendererFns
5297AMDGPUInstructionSelector::selectWMMAModsF16NegAbs(MachineOperand &Root) const {
5298 Register Src = Root.getReg();
5299 unsigned Mods = SISrcMods::OP_SEL_1;
5300 SmallVector<Register, 8> EltsV2F16;
5301
5302 if (GConcatVectors *CV = dyn_cast<GConcatVectors>(Val: MRI->getVRegDef(Reg: Src))) {
5303 assert(CV->getNumSources() > 0);
5304 MachineInstr *ElV2F16 = MRI->getVRegDef(Reg: CV->getSourceReg(I: 0));
5305 // Based on first element decide which mod we match, neg or abs
5306 unsigned ModOpcode = (ElV2F16->getOpcode() == AMDGPU::G_FNEG)
5307 ? AMDGPU::G_FNEG
5308 : AMDGPU::G_FABS;
5309
5310 for (unsigned i = 0; i < CV->getNumSources(); ++i) {
5311 ElV2F16 = MRI->getVRegDef(Reg: CV->getSourceReg(I: i));
5312 if (ElV2F16->getOpcode() != ModOpcode)
5313 break;
5314 EltsV2F16.push_back(Elt: ElV2F16->getOperand(i: 1).getReg());
5315 }
5316
5317 // All elements had ModOpcode modifier
5318 if (CV->getNumSources() == EltsV2F16.size()) {
5320 selectWMMAModsNegAbs(ModOpcode, Mods, Elts&: EltsV2F16, Src, InsertPt: Root.getParent(),
5321 MRI&: *MRI);
5322 }
5323 }
5324
5325 return {{[=](MachineInstrBuilder &MIB) { MIB.addReg(RegNo: Src); },
5326 [=](MachineInstrBuilder &MIB) { MIB.addImm(Val: Mods); }}};
5327}
5328
5329InstructionSelector::ComplexRendererFns
5330AMDGPUInstructionSelector::selectWMMAVISrc(MachineOperand &Root) const {
5331 std::optional<FPValueAndVReg> FPValReg;
5332 if (mi_match(R: Root.getReg(), MRI: *MRI, P: m_GFCstOrSplat(FPValReg))) {
5333 if (TII.isInlineConstant(Imm: FPValReg->Value)) {
5334 return {{[=](MachineInstrBuilder &MIB) {
5335 MIB.addImm(Val: FPValReg->Value.bitcastToAPInt().getSExtValue());
5336 }}};
5337 }
5338 // Non-inlineable splat floats should not fall through to the integer
5339 // immediate checks.
5340 return {};
5341 }
5342
5343 APInt ICst;
5344 if (mi_match(R: Root.getReg(), MRI: *MRI, P: m_ICstOrSplat(Cst&: ICst))) {
5345 if (TII.isInlineConstant(Imm: ICst)) {
5346 return {
5347 {[=](MachineInstrBuilder &MIB) { MIB.addImm(Val: ICst.getSExtValue()); }}};
5348 }
5349 }
5350
5351 return {};
5352}
5353
5354InstructionSelector::ComplexRendererFns
5355AMDGPUInstructionSelector::selectSWMMACIndex8(MachineOperand &Root) const {
5356 Register Src =
5357 getDefIgnoringCopies(Reg: Root.getReg(), MRI: *MRI)->getOperand(i: 0).getReg();
5358 unsigned Key = 0;
5359
5360 Register ShiftSrc;
5361 std::optional<ValueAndVReg> ShiftAmt;
5362 if (mi_match(R: Src, MRI: *MRI, P: m_GLShr(L: m_Reg(R&: ShiftSrc), R: m_GCst(ValReg&: ShiftAmt))) &&
5363 MRI->getType(Reg: ShiftSrc).getSizeInBits() == 32 &&
5364 ShiftAmt->Value.getZExtValue() % 8 == 0) {
5365 Key = ShiftAmt->Value.getZExtValue() / 8;
5366 Src = ShiftSrc;
5367 }
5368
5369 return {{
5370 [=](MachineInstrBuilder &MIB) { MIB.addReg(RegNo: Src); },
5371 [=](MachineInstrBuilder &MIB) { MIB.addImm(Val: Key); } // index_key
5372 }};
5373}
5374
5375InstructionSelector::ComplexRendererFns
5376AMDGPUInstructionSelector::selectSWMMACIndex16(MachineOperand &Root) const {
5377
5378 Register Src =
5379 getDefIgnoringCopies(Reg: Root.getReg(), MRI: *MRI)->getOperand(i: 0).getReg();
5380 unsigned Key = 0;
5381
5382 Register ShiftSrc;
5383 std::optional<ValueAndVReg> ShiftAmt;
5384 if (mi_match(R: Src, MRI: *MRI, P: m_GLShr(L: m_Reg(R&: ShiftSrc), R: m_GCst(ValReg&: ShiftAmt))) &&
5385 MRI->getType(Reg: ShiftSrc).getSizeInBits() == 32 &&
5386 ShiftAmt->Value.getZExtValue() == 16) {
5387 Src = ShiftSrc;
5388 Key = 1;
5389 }
5390
5391 return {{
5392 [=](MachineInstrBuilder &MIB) { MIB.addReg(RegNo: Src); },
5393 [=](MachineInstrBuilder &MIB) { MIB.addImm(Val: Key); } // index_key
5394 }};
5395}
5396
5397InstructionSelector::ComplexRendererFns
5398AMDGPUInstructionSelector::selectSWMMACIndex32(MachineOperand &Root) const {
5399 Register Src =
5400 getDefIgnoringCopies(Reg: Root.getReg(), MRI: *MRI)->getOperand(i: 0).getReg();
5401 unsigned Key = 0;
5402
5403 Register S32 = matchZeroExtendFromS32(Reg: Src);
5404 if (!S32)
5405 S32 = matchAnyExtendFromS32(Reg: Src);
5406
5407 if (S32) {
5408 const MachineInstr *Def = getDefIgnoringCopies(Reg: S32, MRI: *MRI);
5409 if (Def->getOpcode() == TargetOpcode::G_UNMERGE_VALUES) {
5410 assert(Def->getNumOperands() == 3);
5411 Register DstReg1 = Def->getOperand(i: 1).getReg();
5412 if (mi_match(R: S32, MRI: *MRI,
5413 P: m_any_of(preds: m_SpecificReg(RequestedReg: DstReg1), preds: m_Copy(Src: m_Reg(R&: DstReg1))))) {
5414 Src = Def->getOperand(i: 2).getReg();
5415 Key = 1;
5416 }
5417 }
5418 }
5419
5420 return {{
5421 [=](MachineInstrBuilder &MIB) { MIB.addReg(RegNo: Src); },
5422 [=](MachineInstrBuilder &MIB) { MIB.addImm(Val: Key); } // index_key
5423 }};
5424}
5425
5426InstructionSelector::ComplexRendererFns
5427AMDGPUInstructionSelector::selectVOP3OpSelMods(MachineOperand &Root) const {
5428 Register Src;
5429 unsigned Mods;
5430 std::tie(args&: Src, args&: Mods) = selectVOP3ModsImpl(Src: Root.getReg());
5431
5432 // FIXME: Handle op_sel
5433 return {{
5434 [=](MachineInstrBuilder &MIB) { MIB.addReg(RegNo: Src); },
5435 [=](MachineInstrBuilder &MIB) { MIB.addImm(Val: Mods); } // src_mods
5436 }};
5437}
5438
5439// FIXME-TRUE16 remove when fake16 is removed
5440InstructionSelector::ComplexRendererFns
5441AMDGPUInstructionSelector::selectVINTERPMods(MachineOperand &Root) const {
5442 Register Src;
5443 unsigned Mods;
5444 std::tie(args&: Src, args&: Mods) = selectVOP3ModsImpl(Src: Root.getReg(),
5445 /*IsCanonicalizing=*/true,
5446 /*AllowAbs=*/false,
5447 /*OpSel=*/false);
5448
5449 return {{
5450 [=](MachineInstrBuilder &MIB) {
5451 MIB.addReg(
5452 RegNo: copyToVGPRIfSrcFolded(Src, Mods, Root, InsertPt: MIB, /* ForceVGPR */ true));
5453 },
5454 [=](MachineInstrBuilder &MIB) { MIB.addImm(Val: Mods); }, // src0_mods
5455 }};
5456}
5457
5458InstructionSelector::ComplexRendererFns
5459AMDGPUInstructionSelector::selectVINTERPModsHi(MachineOperand &Root) const {
5460 Register Src;
5461 unsigned Mods;
5462 std::tie(args&: Src, args&: Mods) = selectVOP3ModsImpl(Src: Root.getReg(),
5463 /*IsCanonicalizing=*/true,
5464 /*AllowAbs=*/false,
5465 /*OpSel=*/true);
5466
5467 return {{
5468 [=](MachineInstrBuilder &MIB) {
5469 MIB.addReg(
5470 RegNo: copyToVGPRIfSrcFolded(Src, Mods, Root, InsertPt: MIB, /* ForceVGPR */ true));
5471 },
5472 [=](MachineInstrBuilder &MIB) { MIB.addImm(Val: Mods); }, // src0_mods
5473 }};
5474}
5475
5476// Given \p Offset and the load specified by the \p Root operand, check if
5477// \p Offset is a multiple of the load byte size. If it is, update \p Offset
5478// to a pre-scaled value and return true.
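// For example (illustrative), a 4-byte load whose offset is (%idx << 2) or
// (%idx * 4) is rewritten to use %idx directly; the caller then sets the SCAL
// cpol bit so the scaling is applied by the instruction itself.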
5479bool AMDGPUInstructionSelector::selectScaleOffset(MachineOperand &Root,
5480 Register &Offset,
5481 bool IsSigned) const {
5482 if (!Subtarget->hasScaleOffset())
5483 return false;
5484
5485 const MachineInstr &MI = *Root.getParent();
5486 MachineMemOperand *MMO = *MI.memoperands_begin();
5487
5488 if (!MMO->getSize().hasValue())
5489 return false;
5490
5491 uint64_t Size = MMO->getSize().getValue();
5492
5493 Register OffsetReg = matchExtendFromS32OrS32(Reg: Offset, IsSigned);
5494 if (!OffsetReg)
5495 OffsetReg = Offset;
5496
5497 if (auto Def = getDefSrcRegIgnoringCopies(Reg: OffsetReg, MRI: *MRI))
5498 OffsetReg = Def->Reg;
5499
5500 Register Op0;
5501 MachineInstr *Mul;
5502 bool ScaleOffset =
5503 (isPowerOf2_64(Value: Size) &&
5504 mi_match(R: OffsetReg, MRI: *MRI,
5505 P: m_GShl(L: m_Reg(R&: Op0),
5506 R: m_any_of(preds: m_SpecificICst(RequestedValue: Log2_64(Value: Size)),
5507 preds: m_Copy(Src: m_SpecificICst(RequestedValue: Log2_64(Value: Size))))))) ||
5508 mi_match(R: OffsetReg, MRI: *MRI,
5509 P: m_GMul(L: m_Reg(R&: Op0), R: m_any_of(preds: m_SpecificICst(RequestedValue: Size),
5510 preds: m_Copy(Src: m_SpecificICst(RequestedValue: Size))))) ||
5511 mi_match(
5512 R: OffsetReg, MRI: *MRI,
5513 P: m_BinOp(Opcode: IsSigned ? AMDGPU::S_MUL_I64_I32_PSEUDO : AMDGPU::S_MUL_U64,
5514 L: m_Reg(R&: Op0), R: m_SpecificICst(RequestedValue: Size))) ||
5515 // Match G_AMDGPU_MAD_U64_U32 offset, c, 0
5516 (mi_match(R: OffsetReg, MRI: *MRI, P: m_MInstr(MI&: Mul)) &&
5517 (Mul->getOpcode() == (IsSigned ? AMDGPU::G_AMDGPU_MAD_I64_I32
5518 : AMDGPU::G_AMDGPU_MAD_U64_U32) ||
5519 (IsSigned && Mul->getOpcode() == AMDGPU::G_AMDGPU_MAD_U64_U32 &&
5520 VT->signBitIsZero(Op: Mul->getOperand(i: 2).getReg()))) &&
5521 mi_match(R: Mul->getOperand(i: 4).getReg(), MRI: *MRI, P: m_ZeroInt()) &&
5522 mi_match(R: Mul->getOperand(i: 3).getReg(), MRI: *MRI,
5523 P: m_GTrunc(Src: m_any_of(preds: m_SpecificICst(RequestedValue: Size),
5524 preds: m_Copy(Src: m_SpecificICst(RequestedValue: Size))))) &&
5525 mi_match(R: Mul->getOperand(i: 2).getReg(), MRI: *MRI, P: m_Reg(R&: Op0)));
5526
5527 if (ScaleOffset)
5528 Offset = Op0;
5529
5530 return ScaleOffset;
5531}
5532
5533bool AMDGPUInstructionSelector::selectSmrdOffset(MachineOperand &Root,
5534 Register &Base,
5535 Register *SOffset,
5536 int64_t *Offset,
5537 bool *ScaleOffset) const {
5538 MachineInstr *MI = Root.getParent();
5539 MachineBasicBlock *MBB = MI->getParent();
5540
5541 // FIXME: We should shrink the GEP if the offset is known to be <= 32-bits,
5542 // then we can select all ptr + 32-bit offsets.
5543 SmallVector<GEPInfo, 4> AddrInfo;
5544 getAddrModeInfo(Load: *MI, MRI: *MRI, AddrInfo);
5545
5546 if (AddrInfo.empty())
5547 return false;
5548
5549 const GEPInfo &GEPI = AddrInfo[0];
5550 std::optional<int64_t> EncodedImm;
5551
5552 if (ScaleOffset)
5553 *ScaleOffset = false;
5554
5555 if (SOffset && Offset) {
5556 EncodedImm = AMDGPU::getSMRDEncodedOffset(ST: STI, ByteOffset: GEPI.Imm, /*IsBuffer=*/false,
5557 /*HasSOffset=*/true);
5558 if (GEPI.SgprParts.size() == 1 && GEPI.Imm != 0 && EncodedImm &&
5559 AddrInfo.size() > 1) {
5560 const GEPInfo &GEPI2 = AddrInfo[1];
5561 if (GEPI2.SgprParts.size() == 2 && GEPI2.Imm == 0) {
5562 Register OffsetReg = GEPI2.SgprParts[1];
5563 if (ScaleOffset)
5564 *ScaleOffset =
5565 selectScaleOffset(Root, Offset&: OffsetReg, IsSigned: false /* IsSigned */);
5566 OffsetReg = matchZeroExtendFromS32OrS32(Reg: OffsetReg);
5567 if (OffsetReg) {
5568 Base = GEPI2.SgprParts[0];
5569 *SOffset = OffsetReg;
5570 *Offset = *EncodedImm;
5571 if (*Offset >= 0 || !AMDGPU::hasSMRDSignedImmOffset(ST: STI))
5572 return true;
5573
5574 // For unbuffered smem loads, it is illegal for the Immediate Offset
5575 // to be negative if the resulting (Offset + (M0 or SOffset or zero))
5576 // is negative. Handle the case where the Immediate Offset + SOffset
5577 // is negative.
5578 auto SKnown = VT->getKnownBits(R: *SOffset);
5579 if (*Offset + SKnown.getMinValue().getSExtValue() < 0)
5580 return false;
5581
5582 return true;
5583 }
5584 }
5585 }
5586 return false;
5587 }
5588
5589 EncodedImm = AMDGPU::getSMRDEncodedOffset(ST: STI, ByteOffset: GEPI.Imm, /*IsBuffer=*/false,
5590 /*HasSOffset=*/false);
5591 if (Offset && GEPI.SgprParts.size() == 1 && EncodedImm) {
5592 Base = GEPI.SgprParts[0];
5593 *Offset = *EncodedImm;
5594 return true;
5595 }
5596
5597 // SGPR offset is unsigned.
5598 if (SOffset && GEPI.SgprParts.size() == 1 && isUInt<32>(x: GEPI.Imm) &&
5599 GEPI.Imm != 0) {
5600 // If we make it this far we have a load with a 32-bit immediate offset.
5601 // It is OK to select this using an SGPR offset, because we have already
5602 // failed trying to select this load into one of the _IMM variants since
5603 // the _IMM Patterns are considered before the _SGPR patterns.
5604 Base = GEPI.SgprParts[0];
5605 *SOffset = MRI->createVirtualRegister(RegClass: &AMDGPU::SReg_32RegClass);
5606 BuildMI(BB&: *MBB, I: MI, MIMD: MI->getDebugLoc(), MCID: TII.get(Opcode: AMDGPU::S_MOV_B32), DestReg: *SOffset)
5607 .addImm(Val: GEPI.Imm);
5608 return true;
5609 }
5610
5611 if (SOffset && GEPI.SgprParts.size() && GEPI.Imm == 0) {
5612 Register OffsetReg = GEPI.SgprParts[1];
5613 if (ScaleOffset)
5614 *ScaleOffset = selectScaleOffset(Root, Offset&: OffsetReg, IsSigned: false /* IsSigned */);
5615 OffsetReg = matchZeroExtendFromS32OrS32(Reg: OffsetReg);
5616 if (OffsetReg) {
5617 Base = GEPI.SgprParts[0];
5618 *SOffset = OffsetReg;
5619 return true;
5620 }
5621 }
5622
5623 return false;
5624}
5625
5626InstructionSelector::ComplexRendererFns
5627AMDGPUInstructionSelector::selectSmrdImm(MachineOperand &Root) const {
5628 Register Base;
5629 int64_t Offset;
5630 if (!selectSmrdOffset(Root, Base, /* SOffset= */ nullptr, Offset: &Offset,
5631 /* ScaleOffset */ nullptr))
5632 return std::nullopt;
5633
5634 return {{[=](MachineInstrBuilder &MIB) { MIB.addReg(RegNo: Base); },
5635 [=](MachineInstrBuilder &MIB) { MIB.addImm(Val: Offset); }}};
5636}
5637
5638InstructionSelector::ComplexRendererFns
5639AMDGPUInstructionSelector::selectSmrdImm32(MachineOperand &Root) const {
5640 SmallVector<GEPInfo, 4> AddrInfo;
5641 getAddrModeInfo(Load: *Root.getParent(), MRI: *MRI, AddrInfo);
5642
5643 if (AddrInfo.empty() || AddrInfo[0].SgprParts.size() != 1)
5644 return std::nullopt;
5645
5646 const GEPInfo &GEPInfo = AddrInfo[0];
5647 Register PtrReg = GEPInfo.SgprParts[0];
5648 std::optional<int64_t> EncodedImm =
5649 AMDGPU::getSMRDEncodedLiteralOffset32(ST: STI, ByteOffset: GEPInfo.Imm);
5650 if (!EncodedImm)
5651 return std::nullopt;
5652
5653 return {{
5654 [=](MachineInstrBuilder &MIB) { MIB.addReg(RegNo: PtrReg); },
5655 [=](MachineInstrBuilder &MIB) { MIB.addImm(Val: *EncodedImm); }
5656 }};
5657}
5658
5659InstructionSelector::ComplexRendererFns
5660AMDGPUInstructionSelector::selectSmrdSgpr(MachineOperand &Root) const {
5661 Register Base, SOffset;
5662 bool ScaleOffset;
5663 if (!selectSmrdOffset(Root, Base, SOffset: &SOffset, /* Offset= */ nullptr,
5664 ScaleOffset: &ScaleOffset))
5665 return std::nullopt;
5666
5667 unsigned CPol = ScaleOffset ? AMDGPU::CPol::SCAL : 0;
5668 return {{[=](MachineInstrBuilder &MIB) { MIB.addReg(RegNo: Base); },
5669 [=](MachineInstrBuilder &MIB) { MIB.addReg(RegNo: SOffset); },
5670 [=](MachineInstrBuilder &MIB) { MIB.addImm(Val: CPol); }}};
5671}
5672
5673InstructionSelector::ComplexRendererFns
5674AMDGPUInstructionSelector::selectSmrdSgprImm(MachineOperand &Root) const {
5675 Register Base, SOffset;
5676 int64_t Offset;
5677 bool ScaleOffset;
5678 if (!selectSmrdOffset(Root, Base, SOffset: &SOffset, Offset: &Offset, ScaleOffset: &ScaleOffset))
5679 return std::nullopt;
5680
5681 unsigned CPol = ScaleOffset ? AMDGPU::CPol::SCAL : 0;
5682 return {{[=](MachineInstrBuilder &MIB) { MIB.addReg(RegNo: Base); },
5683 [=](MachineInstrBuilder &MIB) { MIB.addReg(RegNo: SOffset); },
5684 [=](MachineInstrBuilder &MIB) { MIB.addImm(Val: Offset); },
5685 [=](MachineInstrBuilder &MIB) { MIB.addImm(Val: CPol); }}};
5686}
5687
5688std::pair<Register, int>
5689AMDGPUInstructionSelector::selectFlatOffsetImpl(MachineOperand &Root,
5690 uint64_t FlatVariant) const {
5691 MachineInstr *MI = Root.getParent();
5692
5693 auto Default = std::pair(Root.getReg(), 0);
5694
5695 if (!STI.hasFlatInstOffsets())
5696 return Default;
5697
5698 Register PtrBase;
5699 int64_t ConstOffset;
5700 bool IsInBounds;
5701 std::tie(args&: PtrBase, args&: ConstOffset, args&: IsInBounds) =
5702 getPtrBaseWithConstantOffset(Root: Root.getReg(), MRI: *MRI);
5703
5704 // Adding the offset to the base address with an immediate in a FLAT
5705 // instruction must not change the memory aperture in which the address falls.
5706 // Therefore we can only fold offsets from inbounds GEPs into FLAT
5707 // instructions.
5708 if (ConstOffset == 0 ||
5709 (FlatVariant == SIInstrFlags::FlatScratch &&
5710 !isFlatScratchBaseLegal(Addr: Root.getReg())) ||
5711 (FlatVariant == SIInstrFlags::FLAT && !IsInBounds))
5712 return Default;
5713
5714 unsigned AddrSpace = (*MI->memoperands_begin())->getAddrSpace();
5715 if (!TII.isLegalFLATOffset(Offset: ConstOffset, AddrSpace, FlatVariant))
5716 return Default;
5717
5718 return std::pair(PtrBase, ConstOffset);
5719}
5720
5721InstructionSelector::ComplexRendererFns
5722AMDGPUInstructionSelector::selectFlatOffset(MachineOperand &Root) const {
5723 auto PtrWithOffset = selectFlatOffsetImpl(Root, FlatVariant: SIInstrFlags::FLAT);
5724
5725 return {{
5726 [=](MachineInstrBuilder &MIB) { MIB.addReg(RegNo: PtrWithOffset.first); },
5727 [=](MachineInstrBuilder &MIB) { MIB.addImm(Val: PtrWithOffset.second); },
5728 }};
5729}
5730
5731InstructionSelector::ComplexRendererFns
5732AMDGPUInstructionSelector::selectGlobalOffset(MachineOperand &Root) const {
5733 auto PtrWithOffset = selectFlatOffsetImpl(Root, FlatVariant: SIInstrFlags::FlatGlobal);
5734
5735 return {{
5736 [=](MachineInstrBuilder &MIB) { MIB.addReg(RegNo: PtrWithOffset.first); },
5737 [=](MachineInstrBuilder &MIB) { MIB.addImm(Val: PtrWithOffset.second); },
5738 }};
5739}
5740
5741InstructionSelector::ComplexRendererFns
5742AMDGPUInstructionSelector::selectScratchOffset(MachineOperand &Root) const {
5743 auto PtrWithOffset = selectFlatOffsetImpl(Root, FlatVariant: SIInstrFlags::FlatScratch);
5744
5745 return {{
5746 [=](MachineInstrBuilder &MIB) { MIB.addReg(RegNo: PtrWithOffset.first); },
5747 [=](MachineInstrBuilder &MIB) { MIB.addImm(Val: PtrWithOffset.second); },
5748 }};
5749}
5750
5751// Match (64-bit SGPR base) + (zext vgpr offset) + sext(imm offset)
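// Illustrative result (assumed assembly): such an address selects the SADDR
// form, e.g.
//   global_load_dword v0, v1, s[2:3] offset:16
// where s[2:3] holds the 64-bit base and v1 the 32-bit offset.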
5752InstructionSelector::ComplexRendererFns
5753AMDGPUInstructionSelector::selectGlobalSAddr(MachineOperand &Root,
5754 unsigned CPolBits,
5755 bool NeedIOffset) const {
5756 Register Addr = Root.getReg();
5757 Register PtrBase;
5758 int64_t ConstOffset;
5759 int64_t ImmOffset = 0;
5760
5761 // Match the immediate offset first, which canonically is moved as low as
5762 // possible.
5763 std::tie(args&: PtrBase, args&: ConstOffset, args: std::ignore) =
5764 getPtrBaseWithConstantOffset(Root: Addr, MRI: *MRI);
5765
5766 if (ConstOffset != 0) {
5767 if (NeedIOffset &&
5768 TII.isLegalFLATOffset(Offset: ConstOffset, AddrSpace: AMDGPUAS::GLOBAL_ADDRESS,
5769 FlatVariant: SIInstrFlags::FlatGlobal)) {
5770 Addr = PtrBase;
5771 ImmOffset = ConstOffset;
5772 } else {
5773 auto PtrBaseDef = getDefSrcRegIgnoringCopies(Reg: PtrBase, MRI: *MRI);
5774 if (isSGPR(Reg: PtrBaseDef->Reg)) {
5775 if (ConstOffset > 0) {
5776 // Offset is too large.
5777 //
5778 // saddr + large_offset -> saddr +
5779 // (voffset = large_offset & ~MaxOffset) +
5780 // (large_offset & MaxOffset);
5781 int64_t SplitImmOffset = 0, RemainderOffset = ConstOffset;
5782 if (NeedIOffset) {
5783 std::tie(args&: SplitImmOffset, args&: RemainderOffset) =
5784 TII.splitFlatOffset(COffsetVal: ConstOffset, AddrSpace: AMDGPUAS::GLOBAL_ADDRESS,
5785 FlatVariant: SIInstrFlags::FlatGlobal);
5786 }
5787
5788 if (Subtarget->hasSignedGVSOffset() ? isInt<32>(x: RemainderOffset)
5789 : isUInt<32>(x: RemainderOffset)) {
5790 MachineInstr *MI = Root.getParent();
5791 MachineBasicBlock *MBB = MI->getParent();
5792 Register HighBits =
5793 MRI->createVirtualRegister(RegClass: &AMDGPU::VGPR_32RegClass);
5794
5795 BuildMI(BB&: *MBB, I: MI, MIMD: MI->getDebugLoc(), MCID: TII.get(Opcode: AMDGPU::V_MOV_B32_e32),
5796 DestReg: HighBits)
5797 .addImm(Val: RemainderOffset);
5798
5799 if (NeedIOffset)
5800 return {{
5801 [=](MachineInstrBuilder &MIB) {
5802 MIB.addReg(RegNo: PtrBase);
5803 }, // saddr
5804 [=](MachineInstrBuilder &MIB) {
5805 MIB.addReg(RegNo: HighBits);
5806 }, // voffset
5807 [=](MachineInstrBuilder &MIB) { MIB.addImm(Val: SplitImmOffset); },
5808 [=](MachineInstrBuilder &MIB) { MIB.addImm(Val: CPolBits); },
5809 }};
5810 return {{
5811 [=](MachineInstrBuilder &MIB) { MIB.addReg(RegNo: PtrBase); }, // saddr
5812 [=](MachineInstrBuilder &MIB) {
5813 MIB.addReg(RegNo: HighBits);
5814 }, // voffset
5815 [=](MachineInstrBuilder &MIB) { MIB.addImm(Val: CPolBits); },
5816 }};
5817 }
5818 }
5819
5820 // We are adding a 64-bit SGPR and a constant. If the constant bus limit
5821 // is 1 we would need to perform 1 or 2 extra moves for each half of
5822 // the constant and it is better to do a scalar add and then issue a
5823 // single VALU instruction to materialize zero. Otherwise it takes fewer
5824 // instructions to perform VALU adds with immediates or inline literals.
5825 unsigned NumLiterals =
5826 !TII.isInlineConstant(Imm: APInt(32, Lo_32(Value: ConstOffset))) +
5827 !TII.isInlineConstant(Imm: APInt(32, Hi_32(Value: ConstOffset)));
5828 if (STI.getConstantBusLimit(Opcode: AMDGPU::V_ADD_U32_e64) > NumLiterals)
5829 return std::nullopt;
5830 }
5831 }
5832 }
5833
5834 // Match the variable offset.
5835 auto AddrDef = getDefSrcRegIgnoringCopies(Reg: Addr, MRI: *MRI);
5836 if (AddrDef->MI->getOpcode() == AMDGPU::G_PTR_ADD) {
5837 // Look through the SGPR->VGPR copy.
5838 Register SAddr =
5839 getSrcRegIgnoringCopies(Reg: AddrDef->MI->getOperand(i: 1).getReg(), MRI: *MRI);
5840
5841 if (isSGPR(Reg: SAddr)) {
5842 Register PtrBaseOffset = AddrDef->MI->getOperand(i: 2).getReg();
5843
5844 // It's possible voffset is an SGPR here, but the copy to VGPR will be
5845 // inserted later.
5846 bool ScaleOffset = selectScaleOffset(Root, Offset&: PtrBaseOffset,
5847 IsSigned: Subtarget->hasSignedGVSOffset());
5848 if (Register VOffset = matchExtendFromS32OrS32(
5849 Reg: PtrBaseOffset, IsSigned: Subtarget->hasSignedGVSOffset())) {
5850 if (NeedIOffset)
5851 return {{[=](MachineInstrBuilder &MIB) { // saddr
5852 MIB.addReg(RegNo: SAddr);
5853 },
5854 [=](MachineInstrBuilder &MIB) { // voffset
5855 MIB.addReg(RegNo: VOffset);
5856 },
5857 [=](MachineInstrBuilder &MIB) { // offset
5858 MIB.addImm(Val: ImmOffset);
5859 },
5860 [=](MachineInstrBuilder &MIB) { // cpol
5861 MIB.addImm(Val: CPolBits |
5862 (ScaleOffset ? AMDGPU::CPol::SCAL : 0));
5863 }}};
5864 return {{[=](MachineInstrBuilder &MIB) { // saddr
5865 MIB.addReg(RegNo: SAddr);
5866 },
5867 [=](MachineInstrBuilder &MIB) { // voffset
5868 MIB.addReg(RegNo: VOffset);
5869 },
5870 [=](MachineInstrBuilder &MIB) { // cpol
5871 MIB.addImm(Val: CPolBits |
5872 (ScaleOffset ? AMDGPU::CPol::SCAL : 0));
5873 }}};
5874 }
5875 }
5876 }
5877
5878 // FIXME: We should probably have folded COPY (G_IMPLICIT_DEF) earlier, and
5879 // drop this.
5880 if (AddrDef->MI->getOpcode() == AMDGPU::G_IMPLICIT_DEF ||
5881 AddrDef->MI->getOpcode() == AMDGPU::G_CONSTANT || !isSGPR(Reg: AddrDef->Reg))
5882 return std::nullopt;
5883
5884 // It's cheaper to materialize a single 32-bit zero for vaddr than the two
5885 // moves required to copy a 64-bit SGPR to VGPR.
5886 MachineInstr *MI = Root.getParent();
5887 MachineBasicBlock *MBB = MI->getParent();
5888 Register VOffset = MRI->createVirtualRegister(RegClass: &AMDGPU::VGPR_32RegClass);
5889
5890 BuildMI(BB&: *MBB, I: MI, MIMD: MI->getDebugLoc(), MCID: TII.get(Opcode: AMDGPU::V_MOV_B32_e32), DestReg: VOffset)
5891 .addImm(Val: 0);
5892
5893 if (NeedIOffset)
5894 return {{
5895 [=](MachineInstrBuilder &MIB) { MIB.addReg(RegNo: AddrDef->Reg); }, // saddr
5896 [=](MachineInstrBuilder &MIB) { MIB.addReg(RegNo: VOffset); }, // voffset
5897 [=](MachineInstrBuilder &MIB) { MIB.addImm(Val: ImmOffset); }, // offset
5898 [=](MachineInstrBuilder &MIB) { MIB.addImm(Val: CPolBits); } // cpol
5899 }};
5900 return {{
5901 [=](MachineInstrBuilder &MIB) { MIB.addReg(RegNo: AddrDef->Reg); }, // saddr
5902 [=](MachineInstrBuilder &MIB) { MIB.addReg(RegNo: VOffset); }, // voffset
5903 [=](MachineInstrBuilder &MIB) { MIB.addImm(Val: CPolBits); } // cpol
5904 }};
5905}
5906
5907InstructionSelector::ComplexRendererFns
5908AMDGPUInstructionSelector::selectGlobalSAddr(MachineOperand &Root) const {
5909 return selectGlobalSAddr(Root, CPolBits: 0);
5910}
5911
5912InstructionSelector::ComplexRendererFns
5913AMDGPUInstructionSelector::selectGlobalSAddrCPol(MachineOperand &Root) const {
5914 const MachineInstr &I = *Root.getParent();
5915
5916 // We are assuming CPol is always the last operand of the intrinsic.
5917 auto PassedCPol =
5918 I.getOperand(i: I.getNumOperands() - 1).getImm() & ~AMDGPU::CPol::SCAL;
5919 return selectGlobalSAddr(Root, CPolBits: PassedCPol);
5920}
5921
5922InstructionSelector::ComplexRendererFns
5923AMDGPUInstructionSelector::selectGlobalSAddrCPolM0(MachineOperand &Root) const {
5924 const MachineInstr &I = *Root.getParent();
5925
5926  // We are assuming CPol is the second-to-last operand of the intrinsic.
5927 auto PassedCPol =
5928 I.getOperand(i: I.getNumOperands() - 2).getImm() & ~AMDGPU::CPol::SCAL;
5929 return selectGlobalSAddr(Root, CPolBits: PassedCPol);
5930}
5931
5932InstructionSelector::ComplexRendererFns
5933AMDGPUInstructionSelector::selectGlobalSAddrGLC(MachineOperand &Root) const {
5934 return selectGlobalSAddr(Root, CPolBits: AMDGPU::CPol::GLC);
5935}
5936
5937InstructionSelector::ComplexRendererFns
5938AMDGPUInstructionSelector::selectGlobalSAddrNoIOffset(
5939 MachineOperand &Root) const {
5940 const MachineInstr &I = *Root.getParent();
5941
5942 // We are assuming CPol is always the last operand of the intrinsic.
5943 auto PassedCPol =
5944 I.getOperand(i: I.getNumOperands() - 1).getImm() & ~AMDGPU::CPol::SCAL;
5945 return selectGlobalSAddr(Root, CPolBits: PassedCPol, NeedIOffset: false);
5946}
5947
5948InstructionSelector::ComplexRendererFns
5949AMDGPUInstructionSelector::selectGlobalSAddrNoIOffsetM0(
5950 MachineOperand &Root) const {
5951 const MachineInstr &I = *Root.getParent();
5952
5953  // We are assuming CPol is the second-to-last operand of the intrinsic.
5954 auto PassedCPol =
5955 I.getOperand(i: I.getNumOperands() - 2).getImm() & ~AMDGPU::CPol::SCAL;
5956 return selectGlobalSAddr(Root, CPolBits: PassedCPol, NeedIOffset: false);
5957}
5958
5959InstructionSelector::ComplexRendererFns
5960AMDGPUInstructionSelector::selectScratchSAddr(MachineOperand &Root) const {
5961 Register Addr = Root.getReg();
5962 Register PtrBase;
5963 int64_t ConstOffset;
5964 int64_t ImmOffset = 0;
5965
5966 // Match the immediate offset first, which canonically is moved as low as
5967 // possible.
5968 std::tie(args&: PtrBase, args&: ConstOffset, args: std::ignore) =
5969 getPtrBaseWithConstantOffset(Root: Addr, MRI: *MRI);
5970
5971 if (ConstOffset != 0 && isFlatScratchBaseLegal(Addr) &&
5972 TII.isLegalFLATOffset(Offset: ConstOffset, AddrSpace: AMDGPUAS::PRIVATE_ADDRESS,
5973 FlatVariant: SIInstrFlags::FlatScratch)) {
5974 Addr = PtrBase;
5975 ImmOffset = ConstOffset;
5976 }
5977
5978 auto AddrDef = getDefSrcRegIgnoringCopies(Reg: Addr, MRI: *MRI);
5979 if (AddrDef->MI->getOpcode() == AMDGPU::G_FRAME_INDEX) {
5980 int FI = AddrDef->MI->getOperand(i: 1).getIndex();
5981 return {{
5982 [=](MachineInstrBuilder &MIB) { MIB.addFrameIndex(Idx: FI); }, // saddr
5983 [=](MachineInstrBuilder &MIB) { MIB.addImm(Val: ImmOffset); } // offset
5984 }};
5985 }
5986
5987 Register SAddr = AddrDef->Reg;
5988
5989 if (AddrDef->MI->getOpcode() == AMDGPU::G_PTR_ADD) {
5990 Register LHS = AddrDef->MI->getOperand(i: 1).getReg();
5991 Register RHS = AddrDef->MI->getOperand(i: 2).getReg();
5992 auto LHSDef = getDefSrcRegIgnoringCopies(Reg: LHS, MRI: *MRI);
5993 auto RHSDef = getDefSrcRegIgnoringCopies(Reg: RHS, MRI: *MRI);
5994
5995 if (LHSDef->MI->getOpcode() == AMDGPU::G_FRAME_INDEX &&
5996 isSGPR(Reg: RHSDef->Reg)) {
5997 int FI = LHSDef->MI->getOperand(i: 1).getIndex();
5998 MachineInstr &I = *Root.getParent();
5999 MachineBasicBlock *BB = I.getParent();
6000 const DebugLoc &DL = I.getDebugLoc();
6001 SAddr = MRI->createVirtualRegister(RegClass: &AMDGPU::SReg_32RegClass);
6002
6003 BuildMI(BB&: *BB, I: &I, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::S_ADD_I32), DestReg: SAddr)
6004 .addFrameIndex(Idx: FI)
6005 .addReg(RegNo: RHSDef->Reg)
6006 .setOperandDead(3); // Dead scc
6007 }
6008 }
6009
6010 if (!isSGPR(Reg: SAddr))
6011 return std::nullopt;
6012
6013 return {{
6014 [=](MachineInstrBuilder &MIB) { MIB.addReg(RegNo: SAddr); }, // saddr
6015 [=](MachineInstrBuilder &MIB) { MIB.addImm(Val: ImmOffset); } // offset
6016 }};
6017}
6018
6019// Check whether the flat scratch SVS swizzle bug affects this access.
6020bool AMDGPUInstructionSelector::checkFlatScratchSVSSwizzleBug(
6021 Register VAddr, Register SAddr, uint64_t ImmOffset) const {
6022 if (!Subtarget->hasFlatScratchSVSSwizzleBug())
6023 return false;
6024
6025 // The bug affects the swizzling of SVS accesses if there is any carry out
6026 // from the two low order bits (i.e. from bit 1 into bit 2) when adding
6027 // voffset to (soffset + inst_offset).
6028 auto VKnown = VT->getKnownBits(R: VAddr);
6029 auto SKnown = KnownBits::add(LHS: VT->getKnownBits(R: SAddr),
6030 RHS: KnownBits::makeConstant(C: APInt(32, ImmOffset)));
6031 uint64_t VMax = VKnown.getMaxValue().getZExtValue();
6032 uint64_t SMax = SKnown.getMaxValue().getZExtValue();
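  // Conservatively report the bug as possible whenever the low two bits of the
  // maximum possible values could sum past 3, i.e. whenever a carry out of
  // bit 1 cannot be ruled out.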
6033 return (VMax & 3) + (SMax & 3) >= 4;
6034}
6035
6036InstructionSelector::ComplexRendererFns
6037AMDGPUInstructionSelector::selectScratchSVAddr(MachineOperand &Root) const {
6038 Register Addr = Root.getReg();
6039 Register PtrBase;
6040 int64_t ConstOffset;
6041 int64_t ImmOffset = 0;
6042
6043 // Match the immediate offset first, which canonically is moved as low as
6044 // possible.
6045 std::tie(args&: PtrBase, args&: ConstOffset, args: std::ignore) =
6046 getPtrBaseWithConstantOffset(Root: Addr, MRI: *MRI);
6047
6048 Register OrigAddr = Addr;
6049 if (ConstOffset != 0 &&
6050 TII.isLegalFLATOffset(Offset: ConstOffset, AddrSpace: AMDGPUAS::PRIVATE_ADDRESS,
6051 FlatVariant: SIInstrFlags::FlatScratch)) {
6052 Addr = PtrBase;
6053 ImmOffset = ConstOffset;
6054 }
6055
6056 auto AddrDef = getDefSrcRegIgnoringCopies(Reg: Addr, MRI: *MRI);
6057 if (AddrDef->MI->getOpcode() != AMDGPU::G_PTR_ADD)
6058 return std::nullopt;
6059
6060 Register RHS = AddrDef->MI->getOperand(i: 2).getReg();
6061 if (RBI.getRegBank(Reg: RHS, MRI: *MRI, TRI)->getID() != AMDGPU::VGPRRegBankID)
6062 return std::nullopt;
6063
6064 Register LHS = AddrDef->MI->getOperand(i: 1).getReg();
6065 auto LHSDef = getDefSrcRegIgnoringCopies(Reg: LHS, MRI: *MRI);
6066
6067 if (OrigAddr != Addr) {
6068 if (!isFlatScratchBaseLegalSVImm(Addr: OrigAddr))
6069 return std::nullopt;
6070 } else {
6071 if (!isFlatScratchBaseLegalSV(Addr: OrigAddr))
6072 return std::nullopt;
6073 }
6074
6075 if (checkFlatScratchSVSSwizzleBug(VAddr: RHS, SAddr: LHS, ImmOffset))
6076 return std::nullopt;
6077
6078 unsigned CPol = selectScaleOffset(Root, Offset&: RHS, IsSigned: true /* IsSigned */)
6079 ? AMDGPU::CPol::SCAL
6080 : 0;
6081
6082 if (LHSDef->MI->getOpcode() == AMDGPU::G_FRAME_INDEX) {
6083 int FI = LHSDef->MI->getOperand(i: 1).getIndex();
6084 return {{
6085 [=](MachineInstrBuilder &MIB) { MIB.addReg(RegNo: RHS); }, // vaddr
6086 [=](MachineInstrBuilder &MIB) { MIB.addFrameIndex(Idx: FI); }, // saddr
6087 [=](MachineInstrBuilder &MIB) { MIB.addImm(Val: ImmOffset); }, // offset
6088 [=](MachineInstrBuilder &MIB) { MIB.addImm(Val: CPol); } // cpol
6089 }};
6090 }
6091
6092 if (!isSGPR(Reg: LHS))
6093 if (auto Def = getDefSrcRegIgnoringCopies(Reg: LHS, MRI: *MRI))
6094 LHS = Def->Reg;
6095
6096 if (!isSGPR(Reg: LHS))
6097 return std::nullopt;
6098
6099 return {{
6100 [=](MachineInstrBuilder &MIB) { MIB.addReg(RegNo: RHS); }, // vaddr
6101 [=](MachineInstrBuilder &MIB) { MIB.addReg(RegNo: LHS); }, // saddr
6102 [=](MachineInstrBuilder &MIB) { MIB.addImm(Val: ImmOffset); }, // offset
6103 [=](MachineInstrBuilder &MIB) { MIB.addImm(Val: CPol); } // cpol
6104 }};
6105}
6106
6107InstructionSelector::ComplexRendererFns
6108AMDGPUInstructionSelector::selectMUBUFScratchOffen(MachineOperand &Root) const {
6109 MachineInstr *MI = Root.getParent();
6110 MachineBasicBlock *MBB = MI->getParent();
6111 MachineFunction *MF = MBB->getParent();
6112 const SIMachineFunctionInfo *Info = MF->getInfo<SIMachineFunctionInfo>();
6113
6114 int64_t Offset = 0;
6115 if (mi_match(R: Root.getReg(), MRI: *MRI, P: m_ICst(Cst&: Offset)) &&
6116 Offset != TM.getNullPointerValue(AddrSpace: AMDGPUAS::PRIVATE_ADDRESS)) {
6117 Register HighBits = MRI->createVirtualRegister(RegClass: &AMDGPU::VGPR_32RegClass);
6118
6119 // TODO: Should this be inside the render function? The iterator seems to
6120 // move.
6121 const uint32_t MaxOffset = SIInstrInfo::getMaxMUBUFImmOffset(ST: *Subtarget);
6122 BuildMI(BB&: *MBB, I: MI, MIMD: MI->getDebugLoc(), MCID: TII.get(Opcode: AMDGPU::V_MOV_B32_e32),
6123 DestReg: HighBits)
6124 .addImm(Val: Offset & ~MaxOffset);
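    // Only the low MaxOffset bits remain in the immediate offset rendered
    // below; the rest is carried by the HighBits VGPR built above.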
6125
6126 return {{[=](MachineInstrBuilder &MIB) { // rsrc
6127 MIB.addReg(RegNo: Info->getScratchRSrcReg());
6128 },
6129 [=](MachineInstrBuilder &MIB) { // vaddr
6130 MIB.addReg(RegNo: HighBits);
6131 },
6132 [=](MachineInstrBuilder &MIB) { // soffset
6133 // Use constant zero for soffset and rely on eliminateFrameIndex
6134 // to choose the appropriate frame register if need be.
6135 MIB.addImm(Val: 0);
6136 },
6137 [=](MachineInstrBuilder &MIB) { // offset
6138 MIB.addImm(Val: Offset & MaxOffset);
6139 }}};
6140 }
6141
6142 assert(Offset == 0 || Offset == -1);
6143
6144 // Try to fold a frame index directly into the MUBUF vaddr field, and any
6145 // offsets.
6146 std::optional<int> FI;
6147 Register VAddr = Root.getReg();
6148
6149 const MachineInstr *RootDef = MRI->getVRegDef(Reg: Root.getReg());
6150 Register PtrBase;
6151 int64_t ConstOffset;
6152 std::tie(args&: PtrBase, args&: ConstOffset, args: std::ignore) =
6153 getPtrBaseWithConstantOffset(Root: VAddr, MRI: *MRI);
6154 if (ConstOffset != 0) {
6155 if (TII.isLegalMUBUFImmOffset(Imm: ConstOffset) &&
6156 (!STI.privateMemoryResourceIsRangeChecked() ||
6157 VT->signBitIsZero(Op: PtrBase))) {
6158 const MachineInstr *PtrBaseDef = MRI->getVRegDef(Reg: PtrBase);
6159 if (PtrBaseDef->getOpcode() == AMDGPU::G_FRAME_INDEX)
6160 FI = PtrBaseDef->getOperand(i: 1).getIndex();
6161 else
6162 VAddr = PtrBase;
6163 Offset = ConstOffset;
6164 }
6165 } else if (RootDef->getOpcode() == AMDGPU::G_FRAME_INDEX) {
6166 FI = RootDef->getOperand(i: 1).getIndex();
6167 }
6168
6169 return {{[=](MachineInstrBuilder &MIB) { // rsrc
6170 MIB.addReg(RegNo: Info->getScratchRSrcReg());
6171 },
6172 [=](MachineInstrBuilder &MIB) { // vaddr
6173 if (FI)
6174 MIB.addFrameIndex(Idx: *FI);
6175 else
6176 MIB.addReg(RegNo: VAddr);
6177 },
6178 [=](MachineInstrBuilder &MIB) { // soffset
6179 // Use constant zero for soffset and rely on eliminateFrameIndex
6180 // to choose the appropriate frame register if need be.
6181 MIB.addImm(Val: 0);
6182 },
6183 [=](MachineInstrBuilder &MIB) { // offset
6184 MIB.addImm(Val: Offset);
6185 }}};
6186}
6187
6188bool AMDGPUInstructionSelector::isDSOffsetLegal(Register Base,
6189 int64_t Offset) const {
6190 if (!isUInt<16>(x: Offset))
6191 return false;
6192
6193 if (STI.hasUsableDSOffset() || STI.unsafeDSOffsetFoldingEnabled())
6194 return true;
6195
6196  // On Southern Islands, instructions with a negative base value and an
6197  // offset don't seem to work.
6198 return VT->signBitIsZero(Op: Base);
6199}
6200
6201bool AMDGPUInstructionSelector::isDSOffset2Legal(Register Base, int64_t Offset0,
6202 int64_t Offset1,
6203 unsigned Size) const {
6204 if (Offset0 % Size != 0 || Offset1 % Size != 0)
6205 return false;
6206 if (!isUInt<8>(x: Offset0 / Size) || !isUInt<8>(x: Offset1 / Size))
6207 return false;
6208
6209 if (STI.hasUsableDSOffset() || STI.unsafeDSOffsetFoldingEnabled())
6210 return true;
6211
6212  // On Southern Islands, instructions with a negative base value and an
6213  // offset don't seem to work.
6214 return VT->signBitIsZero(Op: Base);
6215}
6216
6217// Return whether the operation has the NoUnsignedWrap property.
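// A G_OR is assumed to behave like an add of operands with no bits in common,
// so it cannot produce an unsigned carry.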
6218static bool isNoUnsignedWrap(MachineInstr *Addr) {
6219 return Addr->getOpcode() == TargetOpcode::G_OR ||
6220 (Addr->getOpcode() == TargetOpcode::G_PTR_ADD &&
6221 Addr->getFlag(Flag: MachineInstr::NoUWrap));
6222}
6223
6224// Check that the base address of a flat scratch load/store in the form of
6225// `base + offset` is legal to be put in an SGPR/VGPR (i.e. unsigned per the
6226// hardware requirement). We always treat the first operand as the base address here.
6227bool AMDGPUInstructionSelector::isFlatScratchBaseLegal(Register Addr) const {
6228 MachineInstr *AddrMI = getDefIgnoringCopies(Reg: Addr, MRI: *MRI);
6229
6230 if (isNoUnsignedWrap(Addr: AddrMI))
6231 return true;
6232
6233 // Starting with GFX12, VADDR and SADDR fields in VSCRATCH can use negative
6234 // values.
6235 if (STI.hasSignedScratchOffsets())
6236 return true;
6237
6238 Register LHS = AddrMI->getOperand(i: 1).getReg();
6239 Register RHS = AddrMI->getOperand(i: 2).getReg();
6240
6241 if (AddrMI->getOpcode() == TargetOpcode::G_PTR_ADD) {
6242 std::optional<ValueAndVReg> RhsValReg =
6243 getIConstantVRegValWithLookThrough(VReg: RHS, MRI: *MRI);
6244    // If the immediate offset is negative and within a certain range, the base
6245 // address cannot also be negative. If the base is also negative, the sum
6246 // would be either negative or much larger than the valid range of scratch
6247 // memory a thread can access.
6248 if (RhsValReg && RhsValReg->Value.getSExtValue() < 0 &&
6249 RhsValReg->Value.getSExtValue() > -0x40000000)
6250 return true;
6251 }
6252
6253 return VT->signBitIsZero(Op: LHS);
6254}
6255
6256// Check that the address values in the SGPR/VGPR are legal for flat scratch
6257// in the form: SGPR + VGPR.
6258bool AMDGPUInstructionSelector::isFlatScratchBaseLegalSV(Register Addr) const {
6259 MachineInstr *AddrMI = getDefIgnoringCopies(Reg: Addr, MRI: *MRI);
6260
6261 if (isNoUnsignedWrap(Addr: AddrMI))
6262 return true;
6263
6264 // Starting with GFX12, VADDR and SADDR fields in VSCRATCH can use negative
6265 // values.
6266 if (STI.hasSignedScratchOffsets())
6267 return true;
6268
6269 Register LHS = AddrMI->getOperand(i: 1).getReg();
6270 Register RHS = AddrMI->getOperand(i: 2).getReg();
6271 return VT->signBitIsZero(Op: RHS) && VT->signBitIsZero(Op: LHS);
6272}
6273
6274// Check that the address values in the SGPR/VGPR are legal for flat scratch
6275// in the form: SGPR + VGPR + Imm.
6276bool AMDGPUInstructionSelector::isFlatScratchBaseLegalSVImm(
6277 Register Addr) const {
6278 // Starting with GFX12, VADDR and SADDR fields in VSCRATCH can use negative
6279 // values.
6280 if (STI.hasSignedScratchOffsets())
6281 return true;
6282
6283 MachineInstr *AddrMI = getDefIgnoringCopies(Reg: Addr, MRI: *MRI);
6284 Register Base = AddrMI->getOperand(i: 1).getReg();
6285 std::optional<DefinitionAndSourceRegister> BaseDef =
6286 getDefSrcRegIgnoringCopies(Reg: Base, MRI: *MRI);
6287 std::optional<ValueAndVReg> RHSOffset =
6288 getIConstantVRegValWithLookThrough(VReg: AddrMI->getOperand(i: 2).getReg(), MRI: *MRI);
6289 assert(RHSOffset);
6290
6291  // If the immediate offset is negative and within a certain range, the base
6292 // address cannot also be negative. If the base is also negative, the sum
6293 // would be either negative or much larger than the valid range of scratch
6294 // memory a thread can access.
6295 if (isNoUnsignedWrap(Addr: BaseDef->MI) &&
6296 (isNoUnsignedWrap(Addr: AddrMI) ||
6297 (RHSOffset->Value.getSExtValue() < 0 &&
6298 RHSOffset->Value.getSExtValue() > -0x40000000)))
6299 return true;
6300
6301 Register LHS = BaseDef->MI->getOperand(i: 1).getReg();
6302 Register RHS = BaseDef->MI->getOperand(i: 2).getReg();
6303 return VT->signBitIsZero(Op: RHS) && VT->signBitIsZero(Op: LHS);
6304}
6305
6306bool AMDGPUInstructionSelector::isUnneededShiftMask(const MachineInstr &MI,
6307 unsigned ShAmtBits) const {
6308 assert(MI.getOpcode() == TargetOpcode::G_AND);
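  // The mask is unneeded if it preserves at least the low ShAmtBits bits the
  // shift actually reads, either because the mask has that many trailing ones
  // or because every bit it would clear in that range is already known zero.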
6309
6310 std::optional<APInt> RHS =
6311 getIConstantVRegVal(VReg: MI.getOperand(i: 2).getReg(), MRI: *MRI);
6312 if (!RHS)
6313 return false;
6314
6315 if (RHS->countr_one() >= ShAmtBits)
6316 return true;
6317
6318 const APInt &LHSKnownZeros = VT->getKnownZeroes(R: MI.getOperand(i: 1).getReg());
6319 return (LHSKnownZeros | *RHS).countr_one() >= ShAmtBits;
6320}
6321
6322InstructionSelector::ComplexRendererFns
6323AMDGPUInstructionSelector::selectMUBUFScratchOffset(
6324 MachineOperand &Root) const {
6325 Register Reg = Root.getReg();
6326 const SIMachineFunctionInfo *Info = MF->getInfo<SIMachineFunctionInfo>();
6327
6328 std::optional<DefinitionAndSourceRegister> Def =
6329 getDefSrcRegIgnoringCopies(Reg, MRI: *MRI);
6330 assert(Def && "this shouldn't be an optional result");
6331 Reg = Def->Reg;
6332
6333 if (Register WaveBase = getWaveAddress(Def: Def->MI)) {
6334 return {{
6335 [=](MachineInstrBuilder &MIB) { // rsrc
6336 MIB.addReg(RegNo: Info->getScratchRSrcReg());
6337 },
6338 [=](MachineInstrBuilder &MIB) { // soffset
6339 MIB.addReg(RegNo: WaveBase);
6340 },
6341 [=](MachineInstrBuilder &MIB) { MIB.addImm(Val: 0); } // offset
6342 }};
6343 }
6344
6345 int64_t Offset = 0;
6346
6347 // FIXME: Copy check is a hack
6348 Register BasePtr;
6349 if (mi_match(R: Reg, MRI: *MRI,
6350 P: m_GPtrAdd(L: m_Reg(R&: BasePtr),
6351 R: m_any_of(preds: m_ICst(Cst&: Offset), preds: m_Copy(Src: m_ICst(Cst&: Offset)))))) {
6352 if (!TII.isLegalMUBUFImmOffset(Imm: Offset))
6353 return {};
6354 MachineInstr *BasePtrDef = getDefIgnoringCopies(Reg: BasePtr, MRI: *MRI);
6355 Register WaveBase = getWaveAddress(Def: BasePtrDef);
6356 if (!WaveBase)
6357 return {};
6358
6359 return {{
6360 [=](MachineInstrBuilder &MIB) { // rsrc
6361 MIB.addReg(RegNo: Info->getScratchRSrcReg());
6362 },
6363 [=](MachineInstrBuilder &MIB) { // soffset
6364 MIB.addReg(RegNo: WaveBase);
6365 },
6366 [=](MachineInstrBuilder &MIB) { MIB.addImm(Val: Offset); } // offset
6367 }};
6368 }
6369
6370 if (!mi_match(R: Root.getReg(), MRI: *MRI, P: m_ICst(Cst&: Offset)) ||
6371 !TII.isLegalMUBUFImmOffset(Imm: Offset))
6372 return {};
6373
6374 return {{
6375 [=](MachineInstrBuilder &MIB) { // rsrc
6376 MIB.addReg(RegNo: Info->getScratchRSrcReg());
6377 },
6378 [=](MachineInstrBuilder &MIB) { // soffset
6379 MIB.addImm(Val: 0);
6380 },
6381 [=](MachineInstrBuilder &MIB) { MIB.addImm(Val: Offset); } // offset
6382 }};
6383}
6384
6385std::pair<Register, unsigned>
6386AMDGPUInstructionSelector::selectDS1Addr1OffsetImpl(MachineOperand &Root) const {
6387 const MachineInstr *RootDef = MRI->getVRegDef(Reg: Root.getReg());
6388 int64_t ConstAddr = 0;
6389
6390 Register PtrBase;
6391 int64_t Offset;
6392 std::tie(args&: PtrBase, args&: Offset, args: std::ignore) =
6393 getPtrBaseWithConstantOffset(Root: Root.getReg(), MRI: *MRI);
6394
6395 if (Offset) {
6396 if (isDSOffsetLegal(Base: PtrBase, Offset)) {
6397 // (add n0, c0)
6398 return std::pair(PtrBase, Offset);
6399 }
6400 } else if (RootDef->getOpcode() == AMDGPU::G_SUB) {
6401 // TODO
6402
6403
6404 } else if (mi_match(R: Root.getReg(), MRI: *MRI, P: m_ICst(Cst&: ConstAddr))) {
6405 // TODO
6406
6407 }
6408
6409 return std::pair(Root.getReg(), 0);
6410}
6411
6412InstructionSelector::ComplexRendererFns
6413AMDGPUInstructionSelector::selectDS1Addr1Offset(MachineOperand &Root) const {
6414 Register Reg;
6415 unsigned Offset;
6416 std::tie(args&: Reg, args&: Offset) = selectDS1Addr1OffsetImpl(Root);
6417 return {{
6418 [=](MachineInstrBuilder &MIB) { MIB.addReg(RegNo: Reg); },
6419 [=](MachineInstrBuilder &MIB) { MIB.addImm(Val: Offset); }
6420 }};
6421}
6422
6423InstructionSelector::ComplexRendererFns
6424AMDGPUInstructionSelector::selectDS64Bit4ByteAligned(MachineOperand &Root) const {
6425 return selectDSReadWrite2(Root, size: 4);
6426}
6427
6428InstructionSelector::ComplexRendererFns
6429AMDGPUInstructionSelector::selectDS128Bit8ByteAligned(MachineOperand &Root) const {
6430 return selectDSReadWrite2(Root, size: 8);
6431}
6432
6433InstructionSelector::ComplexRendererFns
6434AMDGPUInstructionSelector::selectDSReadWrite2(MachineOperand &Root,
6435 unsigned Size) const {
6436 Register Reg;
6437 unsigned Offset;
6438 std::tie(args&: Reg, args&: Offset) = selectDSReadWrite2Impl(Root, size: Size);
6439 return {{
6440 [=](MachineInstrBuilder &MIB) { MIB.addReg(RegNo: Reg); },
6441 [=](MachineInstrBuilder &MIB) { MIB.addImm(Val: Offset); },
6442 [=](MachineInstrBuilder &MIB) { MIB.addImm(Val: Offset+1); }
6443 }};
6444}
6445
6446std::pair<Register, unsigned>
6447AMDGPUInstructionSelector::selectDSReadWrite2Impl(MachineOperand &Root,
6448 unsigned Size) const {
6449 const MachineInstr *RootDef = MRI->getVRegDef(Reg: Root.getReg());
6450 int64_t ConstAddr = 0;
6451
6452 Register PtrBase;
6453 int64_t Offset;
6454 std::tie(args&: PtrBase, args&: Offset, args: std::ignore) =
6455 getPtrBaseWithConstantOffset(Root: Root.getReg(), MRI: *MRI);
6456
6457 if (Offset) {
6458 int64_t OffsetValue0 = Offset;
6459 int64_t OffsetValue1 = Offset + Size;
6460 if (isDSOffset2Legal(Base: PtrBase, Offset0: OffsetValue0, Offset1: OffsetValue1, Size)) {
6461 // (add n0, c0)
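      // The returned offset is in units of the access size, matching how the
      // ds_read2/ds_write2 offset0/offset1 fields are encoded.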
6462 return std::pair(PtrBase, OffsetValue0 / Size);
6463 }
6464 } else if (RootDef->getOpcode() == AMDGPU::G_SUB) {
6465 // TODO
6466
6467 } else if (mi_match(R: Root.getReg(), MRI: *MRI, P: m_ICst(Cst&: ConstAddr))) {
6468 // TODO
6469
6470 }
6471
6472 return std::pair(Root.getReg(), 0);
6473}
6474
6475/// If \p Root is a G_PTR_ADD with a G_CONSTANT on the right-hand side, return
6476/// the base value with the constant offset, and whether the offset computation
6477/// is known to be inbounds. There may be intervening copies between \p Root
6478/// and the identified constant. Returns {\p Root, 0, false} if this does not
6479/// match the pattern.
6480std::tuple<Register, int64_t, bool>
6481AMDGPUInstructionSelector::getPtrBaseWithConstantOffset(
6482 Register Root, const MachineRegisterInfo &MRI) const {
6483 MachineInstr *RootI = getDefIgnoringCopies(Reg: Root, MRI);
6484 if (RootI->getOpcode() != TargetOpcode::G_PTR_ADD)
6485 return {Root, 0, false};
6486
6487 MachineOperand &RHS = RootI->getOperand(i: 2);
6488 std::optional<ValueAndVReg> MaybeOffset =
6489 getIConstantVRegValWithLookThrough(VReg: RHS.getReg(), MRI);
6490 if (!MaybeOffset)
6491 return {Root, 0, false};
6492 bool IsInBounds = RootI->getFlag(Flag: MachineInstr::MIFlag::InBounds);
6493 return {RootI->getOperand(i: 1).getReg(), MaybeOffset->Value.getSExtValue(),
6494 IsInBounds};
6495}
6496
6497static void addZeroImm(MachineInstrBuilder &MIB) {
6498 MIB.addImm(Val: 0);
6499}
6500
6501/// Return a resource descriptor for use with an arbitrary 64-bit pointer. If \p
6502/// BasePtr is not valid, a null base pointer will be used.
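/// The resulting descriptor layout is: dwords 0-1 = \p BasePtr (or zero),
/// dword 2 = \p FormatLo, dword 3 = \p FormatHi.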
6503static Register buildRSRC(MachineIRBuilder &B, MachineRegisterInfo &MRI,
6504 uint32_t FormatLo, uint32_t FormatHi,
6505 Register BasePtr) {
6506 Register RSrc2 = MRI.createVirtualRegister(RegClass: &AMDGPU::SReg_32RegClass);
6507 Register RSrc3 = MRI.createVirtualRegister(RegClass: &AMDGPU::SReg_32RegClass);
6508 Register RSrcHi = MRI.createVirtualRegister(RegClass: &AMDGPU::SReg_64RegClass);
6509 Register RSrc = MRI.createVirtualRegister(RegClass: &AMDGPU::SGPR_128RegClass);
6510
6511 B.buildInstr(Opcode: AMDGPU::S_MOV_B32)
6512 .addDef(RegNo: RSrc2)
6513 .addImm(Val: FormatLo);
6514 B.buildInstr(Opcode: AMDGPU::S_MOV_B32)
6515 .addDef(RegNo: RSrc3)
6516 .addImm(Val: FormatHi);
6517
6518 // Build the half of the subregister with the constants before building the
6519 // full 128-bit register. If we are building multiple resource descriptors,
6520 // this will allow CSEing of the 2-component register.
6521 B.buildInstr(Opcode: AMDGPU::REG_SEQUENCE)
6522 .addDef(RegNo: RSrcHi)
6523 .addReg(RegNo: RSrc2)
6524 .addImm(Val: AMDGPU::sub0)
6525 .addReg(RegNo: RSrc3)
6526 .addImm(Val: AMDGPU::sub1);
6527
6528 Register RSrcLo = BasePtr;
6529 if (!BasePtr) {
6530 RSrcLo = MRI.createVirtualRegister(RegClass: &AMDGPU::SReg_64RegClass);
6531 B.buildInstr(Opcode: AMDGPU::S_MOV_B64)
6532 .addDef(RegNo: RSrcLo)
6533 .addImm(Val: 0);
6534 }
6535
6536 B.buildInstr(Opcode: AMDGPU::REG_SEQUENCE)
6537 .addDef(RegNo: RSrc)
6538 .addReg(RegNo: RSrcLo)
6539 .addImm(Val: AMDGPU::sub0_sub1)
6540 .addReg(RegNo: RSrcHi)
6541 .addImm(Val: AMDGPU::sub2_sub3);
6542
6543 return RSrc;
6544}
6545
6546static Register buildAddr64RSrc(MachineIRBuilder &B, MachineRegisterInfo &MRI,
6547 const SIInstrInfo &TII, Register BasePtr) {
6548 uint64_t DefaultFormat = TII.getDefaultRsrcDataFormat();
6549
6550 // FIXME: Why are half the "default" bits ignored based on the addressing
6551 // mode?
6552 return buildRSRC(B, MRI, FormatLo: 0, FormatHi: Hi_32(Value: DefaultFormat), BasePtr);
6553}
6554
6555static Register buildOffsetSrc(MachineIRBuilder &B, MachineRegisterInfo &MRI,
6556 const SIInstrInfo &TII, Register BasePtr) {
6557 uint64_t DefaultFormat = TII.getDefaultRsrcDataFormat();
6558
6559 // FIXME: Why are half the "default" bits ignored based on the addressing
6560 // mode?
6561 return buildRSRC(B, MRI, FormatLo: -1, FormatHi: Hi_32(Value: DefaultFormat), BasePtr);
6562}
6563
6564AMDGPUInstructionSelector::MUBUFAddressData
6565AMDGPUInstructionSelector::parseMUBUFAddress(Register Src) const {
6566 MUBUFAddressData Data;
6567 Data.N0 = Src;
6568
6569 Register PtrBase;
6570 int64_t Offset;
6571
6572 std::tie(args&: PtrBase, args&: Offset, args: std::ignore) =
6573 getPtrBaseWithConstantOffset(Root: Src, MRI: *MRI);
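  // Only a constant offset that fits in an unsigned 32-bit value is folded out
  // here; anything else stays as part of N0.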
6574 if (isUInt<32>(x: Offset)) {
6575 Data.N0 = PtrBase;
6576 Data.Offset = Offset;
6577 }
6578
6579 if (MachineInstr *InputAdd
6580 = getOpcodeDef(Opcode: TargetOpcode::G_PTR_ADD, Reg: Data.N0, MRI: *MRI)) {
6581 Data.N2 = InputAdd->getOperand(i: 1).getReg();
6582 Data.N3 = InputAdd->getOperand(i: 2).getReg();
6583
6584    // FIXME: Need to fix the extra SGPR->VGPR copies that get inserted
6585    // FIXME: We don't actually know that this value was defined by operand 0
6586 //
6587 // TODO: Remove this when we have copy folding optimizations after
6588 // RegBankSelect.
6589 Data.N2 = getDefIgnoringCopies(Reg: Data.N2, MRI: *MRI)->getOperand(i: 0).getReg();
6590 Data.N3 = getDefIgnoringCopies(Reg: Data.N3, MRI: *MRI)->getOperand(i: 0).getReg();
6591 }
6592
6593 return Data;
6594}
6595
6596/// Return whether the addr64 mubuf mode should be used for the given address.
6597bool AMDGPUInstructionSelector::shouldUseAddr64(MUBUFAddressData Addr) const {
6598 // (ptr_add N2, N3) -> addr64, or
6599 // (ptr_add (ptr_add N2, N3), C1) -> addr64
6600 if (Addr.N2)
6601 return true;
6602
6603 const RegisterBank *N0Bank = RBI.getRegBank(Reg: Addr.N0, MRI: *MRI, TRI);
6604 return N0Bank->getID() == AMDGPU::VGPRRegBankID;
6605}
6606
6607/// If the immediate offset \p ImmOffset does not fit in the MUBUF immediate
6608/// field, materialize it into \p SOffset as the variable component and clear
6609/// \p ImmOffset.
6610void AMDGPUInstructionSelector::splitIllegalMUBUFOffset(
6611 MachineIRBuilder &B, Register &SOffset, int64_t &ImmOffset) const {
6612 if (TII.isLegalMUBUFImmOffset(Imm: ImmOffset))
6613 return;
6614
6615 // Illegal offset, store it in soffset.
6616 SOffset = MRI->createVirtualRegister(RegClass: &AMDGPU::SReg_32RegClass);
6617 B.buildInstr(Opcode: AMDGPU::S_MOV_B32)
6618 .addDef(RegNo: SOffset)
6619 .addImm(Val: ImmOffset);
6620 ImmOffset = 0;
6621}
6622
6623bool AMDGPUInstructionSelector::selectMUBUFAddr64Impl(
6624 MachineOperand &Root, Register &VAddr, Register &RSrcReg,
6625 Register &SOffset, int64_t &Offset) const {
6626 // FIXME: Predicates should stop this from reaching here.
6627 // addr64 bit was removed for volcanic islands.
6628 if (!STI.hasAddr64() || STI.useFlatForGlobal())
6629 return false;
6630
6631 MUBUFAddressData AddrData = parseMUBUFAddress(Src: Root.getReg());
6632 if (!shouldUseAddr64(Addr: AddrData))
6633 return false;
6634
6635 Register N0 = AddrData.N0;
6636 Register N2 = AddrData.N2;
6637 Register N3 = AddrData.N3;
6638 Offset = AddrData.Offset;
6639
6640 // Base pointer for the SRD.
6641 Register SRDPtr;
6642
6643 if (N2) {
6644 if (RBI.getRegBank(Reg: N2, MRI: *MRI, TRI)->getID() == AMDGPU::VGPRRegBankID) {
6645 assert(N3);
6646 if (RBI.getRegBank(Reg: N3, MRI: *MRI, TRI)->getID() == AMDGPU::VGPRRegBankID) {
6647 // Both N2 and N3 are divergent. Use N0 (the result of the add) as the
6648 // addr64, and construct the default resource from a 0 address.
6649 VAddr = N0;
6650 } else {
6651 SRDPtr = N3;
6652 VAddr = N2;
6653 }
6654 } else {
6655 // N2 is not divergent.
6656 SRDPtr = N2;
6657 VAddr = N3;
6658 }
6659 } else if (RBI.getRegBank(Reg: N0, MRI: *MRI, TRI)->getID() == AMDGPU::VGPRRegBankID) {
6660 // Use the default null pointer in the resource
6661 VAddr = N0;
6662 } else {
6663 // N0 -> offset, or
6664 // (N0 + C1) -> offset
6665 SRDPtr = N0;
6666 }
6667
6668 MachineIRBuilder B(*Root.getParent());
6669 RSrcReg = buildAddr64RSrc(B, MRI&: *MRI, TII, BasePtr: SRDPtr);
6670 splitIllegalMUBUFOffset(B, SOffset, ImmOffset&: Offset);
6671 return true;
6672}
6673
6674bool AMDGPUInstructionSelector::selectMUBUFOffsetImpl(
6675 MachineOperand &Root, Register &RSrcReg, Register &SOffset,
6676 int64_t &Offset) const {
6677
6678 // FIXME: Pattern should not reach here.
6679 if (STI.useFlatForGlobal())
6680 return false;
6681
6682 MUBUFAddressData AddrData = parseMUBUFAddress(Src: Root.getReg());
6683 if (shouldUseAddr64(Addr: AddrData))
6684 return false;
6685
6686 // N0 -> offset, or
6687 // (N0 + C1) -> offset
6688 Register SRDPtr = AddrData.N0;
6689 Offset = AddrData.Offset;
6690
6691 // TODO: Look through extensions for 32-bit soffset.
6692 MachineIRBuilder B(*Root.getParent());
6693
6694 RSrcReg = buildOffsetSrc(B, MRI&: *MRI, TII, BasePtr: SRDPtr);
6695 splitIllegalMUBUFOffset(B, SOffset, ImmOffset&: Offset);
6696 return true;
6697}
6698
6699InstructionSelector::ComplexRendererFns
6700AMDGPUInstructionSelector::selectMUBUFAddr64(MachineOperand &Root) const {
6701 Register VAddr;
6702 Register RSrcReg;
6703 Register SOffset;
6704 int64_t Offset = 0;
6705
6706 if (!selectMUBUFAddr64Impl(Root, VAddr, RSrcReg, SOffset, Offset))
6707 return {};
6708
6709 // FIXME: Use defaulted operands for trailing 0s and remove from the complex
6710 // pattern.
6711 return {{
6712 [=](MachineInstrBuilder &MIB) { // rsrc
6713 MIB.addReg(RegNo: RSrcReg);
6714 },
6715 [=](MachineInstrBuilder &MIB) { // vaddr
6716 MIB.addReg(RegNo: VAddr);
6717 },
6718 [=](MachineInstrBuilder &MIB) { // soffset
6719 if (SOffset)
6720 MIB.addReg(RegNo: SOffset);
6721 else if (STI.hasRestrictedSOffset())
6722 MIB.addReg(RegNo: AMDGPU::SGPR_NULL);
6723 else
6724 MIB.addImm(Val: 0);
6725 },
6726 [=](MachineInstrBuilder &MIB) { // offset
6727 MIB.addImm(Val: Offset);
6728 },
6729 addZeroImm, // cpol
6730 addZeroImm, // tfe
6731 addZeroImm // swz
6732 }};
6733}
6734
6735InstructionSelector::ComplexRendererFns
6736AMDGPUInstructionSelector::selectMUBUFOffset(MachineOperand &Root) const {
6737 Register RSrcReg;
6738 Register SOffset;
6739 int64_t Offset = 0;
6740
6741 if (!selectMUBUFOffsetImpl(Root, RSrcReg, SOffset, Offset))
6742 return {};
6743
6744 return {{
6745 [=](MachineInstrBuilder &MIB) { // rsrc
6746 MIB.addReg(RegNo: RSrcReg);
6747 },
6748 [=](MachineInstrBuilder &MIB) { // soffset
6749 if (SOffset)
6750 MIB.addReg(RegNo: SOffset);
6751 else if (STI.hasRestrictedSOffset())
6752 MIB.addReg(RegNo: AMDGPU::SGPR_NULL);
6753 else
6754 MIB.addImm(Val: 0);
6755 },
6756 [=](MachineInstrBuilder &MIB) { MIB.addImm(Val: Offset); }, // offset
6757 addZeroImm, // cpol
6758 addZeroImm, // tfe
6759 addZeroImm, // swz
6760 }};
6761}
6762
6763InstructionSelector::ComplexRendererFns
6764AMDGPUInstructionSelector::selectBUFSOffset(MachineOperand &Root) const {
6765
6766 Register SOffset = Root.getReg();
6767
6768 if (STI.hasRestrictedSOffset() && mi_match(R: SOffset, MRI: *MRI, P: m_ZeroInt()))
6769 SOffset = AMDGPU::SGPR_NULL;
6770
6771 return {{[=](MachineInstrBuilder &MIB) { MIB.addReg(RegNo: SOffset); }}};
6772}
6773
6774/// Get an immediate that must be 32-bits, and treated as zero extended.
6775static std::optional<uint64_t>
6776getConstantZext32Val(Register Reg, const MachineRegisterInfo &MRI) {
6777 // getIConstantVRegVal sexts any values, so see if that matters.
6778 std::optional<int64_t> OffsetVal = getIConstantVRegSExtVal(VReg: Reg, MRI);
6779 if (!OffsetVal || !isInt<32>(x: *OffsetVal))
6780 return std::nullopt;
6781 return Lo_32(Value: *OffsetVal);
6782}
6783
6784InstructionSelector::ComplexRendererFns
6785AMDGPUInstructionSelector::selectSMRDBufferImm(MachineOperand &Root) const {
6786 std::optional<uint64_t> OffsetVal =
6787 Root.isImm() ? Root.getImm() : getConstantZext32Val(Reg: Root.getReg(), MRI: *MRI);
6788 if (!OffsetVal)
6789 return {};
6790
6791 std::optional<int64_t> EncodedImm =
6792 AMDGPU::getSMRDEncodedOffset(ST: STI, ByteOffset: *OffsetVal, IsBuffer: true);
6793 if (!EncodedImm)
6794 return {};
6795
6796 return {{ [=](MachineInstrBuilder &MIB) { MIB.addImm(Val: *EncodedImm); } }};
6797}
6798
6799InstructionSelector::ComplexRendererFns
6800AMDGPUInstructionSelector::selectSMRDBufferImm32(MachineOperand &Root) const {
6801 assert(STI.getGeneration() == AMDGPUSubtarget::SEA_ISLANDS);
6802
6803 std::optional<uint64_t> OffsetVal = getConstantZext32Val(Reg: Root.getReg(), MRI: *MRI);
6804 if (!OffsetVal)
6805 return {};
6806
6807 std::optional<int64_t> EncodedImm =
6808 AMDGPU::getSMRDEncodedLiteralOffset32(ST: STI, ByteOffset: *OffsetVal);
6809 if (!EncodedImm)
6810 return {};
6811
6812 return {{ [=](MachineInstrBuilder &MIB) { MIB.addImm(Val: *EncodedImm); } }};
6813}
6814
6815InstructionSelector::ComplexRendererFns
6816AMDGPUInstructionSelector::selectSMRDBufferSgprImm(MachineOperand &Root) const {
6817 // Match the (soffset + offset) pair as a 32-bit register base and
6818 // an immediate offset.
6819 Register SOffset;
6820 unsigned Offset;
6821 std::tie(args&: SOffset, args&: Offset) = AMDGPU::getBaseWithConstantOffset(
6822 MRI&: *MRI, Reg: Root.getReg(), ValueTracking: VT, /*CheckNUW*/ true);
6823 if (!SOffset)
6824 return std::nullopt;
6825
6826 std::optional<int64_t> EncodedOffset =
6827 AMDGPU::getSMRDEncodedOffset(ST: STI, ByteOffset: Offset, /* IsBuffer */ true);
6828 if (!EncodedOffset)
6829 return std::nullopt;
6830
6831 assert(MRI->getType(SOffset) == LLT::scalar(32));
6832 return {{[=](MachineInstrBuilder &MIB) { MIB.addReg(RegNo: SOffset); },
6833 [=](MachineInstrBuilder &MIB) { MIB.addImm(Val: *EncodedOffset); }}};
6834}
6835
6836std::pair<Register, unsigned>
6837AMDGPUInstructionSelector::selectVOP3PMadMixModsImpl(MachineOperand &Root,
6838 bool &Matched) const {
6839 Matched = false;
6840
6841 Register Src;
6842 unsigned Mods;
6843 std::tie(args&: Src, args&: Mods) = selectVOP3ModsImpl(Src: Root.getReg());
6844
6845 if (mi_match(R: Src, MRI: *MRI, P: m_GFPExt(Src: m_Reg(R&: Src)))) {
6846 assert(MRI->getType(Src) == LLT::scalar(16));
6847
6848    // Only change Src if a source modifier could be gained. In such cases the
6849    // new Src could be an SGPR, but this does not violate the constant bus
6850    // restriction for the instruction that is being selected.
6851 Src = stripBitCast(Reg: Src, MRI&: *MRI);
6852
6853 const auto CheckAbsNeg = [&]() {
6854 // Be careful about folding modifiers if we already have an abs. fneg is
6855 // applied last, so we don't want to apply an earlier fneg.
6856 if ((Mods & SISrcMods::ABS) == 0) {
6857 unsigned ModsTmp;
6858 std::tie(args&: Src, args&: ModsTmp) = selectVOP3ModsImpl(Src);
6859
6860 if ((ModsTmp & SISrcMods::NEG) != 0)
6861 Mods ^= SISrcMods::NEG;
6862
6863 if ((ModsTmp & SISrcMods::ABS) != 0)
6864 Mods |= SISrcMods::ABS;
6865 }
6866 };
6867
6868 CheckAbsNeg();
6869
6870 // op_sel/op_sel_hi decide the source type and source.
6871    // If the source's op_sel_hi is set, it indicates that a conversion from
6872    // fp16 should be done. If the source's op_sel is set, it picks the high
6873    // half of the source register.
6874
6875 Mods |= SISrcMods::OP_SEL_1;
6876
6877 if (isExtractHiElt(MRI&: *MRI, In: Src, Out&: Src)) {
6878 Mods |= SISrcMods::OP_SEL_0;
6879 CheckAbsNeg();
6880 }
6881
6882 Matched = true;
6883 }
6884
6885 return {Src, Mods};
6886}
6887
6888InstructionSelector::ComplexRendererFns
6889AMDGPUInstructionSelector::selectVOP3PMadMixModsExt(
6890 MachineOperand &Root) const {
6891 Register Src;
6892 unsigned Mods;
6893 bool Matched;
6894 std::tie(args&: Src, args&: Mods) = selectVOP3PMadMixModsImpl(Root, Matched);
6895 if (!Matched)
6896 return {};
6897
6898 return {{
6899 [=](MachineInstrBuilder &MIB) { MIB.addReg(RegNo: Src); },
6900 [=](MachineInstrBuilder &MIB) { MIB.addImm(Val: Mods); } // src_mods
6901 }};
6902}
6903
6904InstructionSelector::ComplexRendererFns
6905AMDGPUInstructionSelector::selectVOP3PMadMixMods(MachineOperand &Root) const {
6906 Register Src;
6907 unsigned Mods;
6908 bool Matched;
6909 std::tie(args&: Src, args&: Mods) = selectVOP3PMadMixModsImpl(Root, Matched);
6910
6911 return {{
6912 [=](MachineInstrBuilder &MIB) { MIB.addReg(RegNo: Src); },
6913 [=](MachineInstrBuilder &MIB) { MIB.addImm(Val: Mods); } // src_mods
6914 }};
6915}
6916
6917bool AMDGPUInstructionSelector::selectSBarrierSignalIsfirst(
6918 MachineInstr &I, Intrinsic::ID IntrID) const {
6919 MachineBasicBlock *MBB = I.getParent();
6920 const DebugLoc &DL = I.getDebugLoc();
6921 Register CCReg = I.getOperand(i: 0).getReg();
6922
6923 // Set SCC to true, in case the barrier instruction gets converted to a NOP.
6924 BuildMI(BB&: *MBB, I: &I, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::S_CMP_EQ_U32)).addImm(Val: 0).addImm(Val: 0);
6925
6926 BuildMI(BB&: *MBB, I: &I, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::S_BARRIER_SIGNAL_ISFIRST_IMM))
6927 .addImm(Val: I.getOperand(i: 2).getImm());
6928
6929 BuildMI(BB&: *MBB, I: &I, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::COPY), DestReg: CCReg).addReg(RegNo: AMDGPU::SCC);
6930
6931 I.eraseFromParent();
6932 return RBI.constrainGenericRegister(Reg: CCReg, RC: AMDGPU::SReg_32_XM0_XEXECRegClass,
6933 MRI&: *MRI);
6934}
6935
6936bool AMDGPUInstructionSelector::selectSGetBarrierState(
6937 MachineInstr &I, Intrinsic::ID IntrID) const {
6938 MachineBasicBlock *MBB = I.getParent();
6939 const DebugLoc &DL = I.getDebugLoc();
6940 const MachineOperand &BarOp = I.getOperand(i: 2);
6941 std::optional<int64_t> BarValImm =
6942 getIConstantVRegSExtVal(VReg: BarOp.getReg(), MRI: *MRI);
6943
6944 if (!BarValImm) {
6945 auto CopyMIB = BuildMI(BB&: *MBB, I: &I, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::COPY), DestReg: AMDGPU::M0)
6946 .addReg(RegNo: BarOp.getReg());
6947 constrainSelectedInstRegOperands(I&: *CopyMIB, TII, TRI, RBI);
6948 }
6949 MachineInstrBuilder MIB;
6950 unsigned Opc = BarValImm ? AMDGPU::S_GET_BARRIER_STATE_IMM
6951 : AMDGPU::S_GET_BARRIER_STATE_M0;
6952 MIB = BuildMI(BB&: *MBB, I: &I, MIMD: DL, MCID: TII.get(Opcode: Opc));
6953
6954 auto DstReg = I.getOperand(i: 0).getReg();
6955 const TargetRegisterClass *DstRC =
6956 TRI.getConstrainedRegClassForOperand(MO: I.getOperand(i: 0), MRI: *MRI);
6957 if (!DstRC || !RBI.constrainGenericRegister(Reg: DstReg, RC: *DstRC, MRI&: *MRI))
6958 return false;
6959 MIB.addDef(RegNo: DstReg);
6960 if (BarValImm) {
6961 MIB.addImm(Val: *BarValImm);
6962 }
6963 I.eraseFromParent();
6964 return true;
6965}
6966
6967unsigned getNamedBarrierOp(bool HasInlineConst, Intrinsic::ID IntrID) {
6968 if (HasInlineConst) {
6969 switch (IntrID) {
6970 default:
6971 llvm_unreachable("not a named barrier op");
6972 case Intrinsic::amdgcn_s_barrier_join:
6973 return AMDGPU::S_BARRIER_JOIN_IMM;
6974 case Intrinsic::amdgcn_s_wakeup_barrier:
6975 return AMDGPU::S_WAKEUP_BARRIER_IMM;
6976 case Intrinsic::amdgcn_s_get_named_barrier_state:
6977 return AMDGPU::S_GET_BARRIER_STATE_IMM;
6978 };
6979 } else {
6980 switch (IntrID) {
6981 default:
6982 llvm_unreachable("not a named barrier op");
6983 case Intrinsic::amdgcn_s_barrier_join:
6984 return AMDGPU::S_BARRIER_JOIN_M0;
6985 case Intrinsic::amdgcn_s_wakeup_barrier:
6986 return AMDGPU::S_WAKEUP_BARRIER_M0;
6987 case Intrinsic::amdgcn_s_get_named_barrier_state:
6988 return AMDGPU::S_GET_BARRIER_STATE_M0;
6989 };
6990 }
6991}
6992
6993bool AMDGPUInstructionSelector::selectNamedBarrierInit(
6994 MachineInstr &I, Intrinsic::ID IntrID) const {
6995 MachineBasicBlock *MBB = I.getParent();
6996 const DebugLoc &DL = I.getDebugLoc();
6997 const MachineOperand &BarOp = I.getOperand(i: 1);
6998 const MachineOperand &CntOp = I.getOperand(i: 2);
6999
7000 // BarID = (BarOp >> 4) & 0x3F
7001 Register TmpReg0 = MRI->createVirtualRegister(RegClass: &AMDGPU::SReg_32RegClass);
7002 BuildMI(BB&: *MBB, I: &I, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::S_LSHR_B32), DestReg: TmpReg0)
7003 .add(MO: BarOp)
7004 .addImm(Val: 4u)
7005 .setOperandDead(3); // Dead scc
7006
7007 Register TmpReg1 = MRI->createVirtualRegister(RegClass: &AMDGPU::SReg_32RegClass);
7008 BuildMI(BB&: *MBB, I: &I, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::S_AND_B32), DestReg: TmpReg1)
7009 .addReg(RegNo: TmpReg0)
7010 .addImm(Val: 0x3F)
7011 .setOperandDead(3); // Dead scc
7012
7013  // M0 = ((CntOp & 0x3F) << ShAmt) | BarID
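  // i.e. the barrier ID ends up in bits [5:0] and the masked count in bits
  // [21:16] of the value copied into M0.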
7014 Register TmpReg2 = MRI->createVirtualRegister(RegClass: &AMDGPU::SReg_32RegClass);
7015 BuildMI(BB&: *MBB, I: &I, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::S_AND_B32), DestReg: TmpReg2)
7016 .add(MO: CntOp)
7017 .addImm(Val: 0x3F)
7018 .setOperandDead(3); // Dead scc
7019
7020 Register TmpReg3 = MRI->createVirtualRegister(RegClass: &AMDGPU::SReg_32RegClass);
7021 constexpr unsigned ShAmt = 16;
7022 BuildMI(BB&: *MBB, I: &I, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::S_LSHL_B32), DestReg: TmpReg3)
7023 .addReg(RegNo: TmpReg2)
7024 .addImm(Val: ShAmt)
7025 .setOperandDead(3); // Dead scc
7026
7027 Register TmpReg4 = MRI->createVirtualRegister(RegClass: &AMDGPU::SReg_32RegClass);
7028 BuildMI(BB&: *MBB, I: &I, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::S_OR_B32), DestReg: TmpReg4)
7029 .addReg(RegNo: TmpReg1)
7030 .addReg(RegNo: TmpReg3)
7031      .setOperandDead(3); // Dead scc
7032
7033 auto CopyMIB =
7034 BuildMI(BB&: *MBB, I: &I, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::COPY), DestReg: AMDGPU::M0).addReg(RegNo: TmpReg4);
7035 constrainSelectedInstRegOperands(I&: *CopyMIB, TII, TRI, RBI);
7036
7037 unsigned Opc = IntrID == Intrinsic::amdgcn_s_barrier_init
7038 ? AMDGPU::S_BARRIER_INIT_M0
7039 : AMDGPU::S_BARRIER_SIGNAL_M0;
7040 MachineInstrBuilder MIB;
7041 MIB = BuildMI(BB&: *MBB, I: &I, MIMD: DL, MCID: TII.get(Opcode: Opc));
7042
7043 I.eraseFromParent();
7044 return true;
7045}
7046
7047bool AMDGPUInstructionSelector::selectNamedBarrierInst(
7048 MachineInstr &I, Intrinsic::ID IntrID) const {
7049 MachineBasicBlock *MBB = I.getParent();
7050 const DebugLoc &DL = I.getDebugLoc();
7051 MachineOperand BarOp = IntrID == Intrinsic::amdgcn_s_get_named_barrier_state
7052 ? I.getOperand(i: 2)
7053 : I.getOperand(i: 1);
7054 std::optional<int64_t> BarValImm =
7055 getIConstantVRegSExtVal(VReg: BarOp.getReg(), MRI: *MRI);
7056
7057 if (!BarValImm) {
7058 // BarID = (BarOp >> 4) & 0x3F
7059 Register TmpReg0 = MRI->createVirtualRegister(RegClass: &AMDGPU::SReg_32RegClass);
7060 BuildMI(BB&: *MBB, I: &I, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::S_LSHR_B32), DestReg: TmpReg0)
7061 .addReg(RegNo: BarOp.getReg())
7062 .addImm(Val: 4u)
7063        .setOperandDead(3); // Dead scc
7064
7065 Register TmpReg1 = MRI->createVirtualRegister(RegClass: &AMDGPU::SReg_32RegClass);
7066 BuildMI(BB&: *MBB, I: &I, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::S_AND_B32), DestReg: TmpReg1)
7067 .addReg(RegNo: TmpReg0)
7068 .addImm(Val: 0x3F)
7069        .setOperandDead(3); // Dead scc
7070
7071 auto CopyMIB = BuildMI(BB&: *MBB, I: &I, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::COPY), DestReg: AMDGPU::M0)
7072 .addReg(RegNo: TmpReg1);
7073 constrainSelectedInstRegOperands(I&: *CopyMIB, TII, TRI, RBI);
7074 }
7075
7076 MachineInstrBuilder MIB;
7077 unsigned Opc = getNamedBarrierOp(HasInlineConst: BarValImm.has_value(), IntrID);
7078 MIB = BuildMI(BB&: *MBB, I: &I, MIMD: DL, MCID: TII.get(Opcode: Opc));
7079
7080 if (IntrID == Intrinsic::amdgcn_s_get_named_barrier_state) {
7081 auto DstReg = I.getOperand(i: 0).getReg();
7082 const TargetRegisterClass *DstRC =
7083 TRI.getConstrainedRegClassForOperand(MO: I.getOperand(i: 0), MRI: *MRI);
7084 if (!DstRC || !RBI.constrainGenericRegister(Reg: DstReg, RC: *DstRC, MRI&: *MRI))
7085 return false;
7086 MIB.addDef(RegNo: DstReg);
7087 }
7088
7089 if (BarValImm) {
7090 auto BarId = ((*BarValImm) >> 4) & 0x3F;
7091 MIB.addImm(Val: BarId);
7092 }
7093
7094 I.eraseFromParent();
7095 return true;
7096}
7097
7098void AMDGPUInstructionSelector::renderTruncImm32(MachineInstrBuilder &MIB,
7099 const MachineInstr &MI,
7100 int OpIdx) const {
7101 assert(MI.getOpcode() == TargetOpcode::G_CONSTANT && OpIdx == -1 &&
7102 "Expected G_CONSTANT");
7103 MIB.addImm(Val: MI.getOperand(i: 1).getCImm()->getSExtValue());
7104}
7105
7106void AMDGPUInstructionSelector::renderNegateImm(MachineInstrBuilder &MIB,
7107 const MachineInstr &MI,
7108 int OpIdx) const {
7109 assert(MI.getOpcode() == TargetOpcode::G_CONSTANT && OpIdx == -1 &&
7110 "Expected G_CONSTANT");
7111 MIB.addImm(Val: -MI.getOperand(i: 1).getCImm()->getSExtValue());
7112}
7113
7114void AMDGPUInstructionSelector::renderBitcastFPImm(MachineInstrBuilder &MIB,
7115 const MachineInstr &MI,
7116 int OpIdx) const {
7117 const MachineOperand &Op = MI.getOperand(i: 1);
7118 assert(MI.getOpcode() == TargetOpcode::G_FCONSTANT && OpIdx == -1);
7119 MIB.addImm(Val: Op.getFPImm()->getValueAPF().bitcastToAPInt().getZExtValue());
7120}
7121
7122void AMDGPUInstructionSelector::renderPopcntImm(MachineInstrBuilder &MIB,
7123 const MachineInstr &MI,
7124 int OpIdx) const {
7125 assert(MI.getOpcode() == TargetOpcode::G_CONSTANT && OpIdx == -1 &&
7126 "Expected G_CONSTANT");
7127 MIB.addImm(Val: MI.getOperand(i: 1).getCImm()->getValue().popcount());
7128}
7129
7130/// This only really exists to satisfy the DAG type-checking machinery, so it
7131/// is a no-op here.
7132void AMDGPUInstructionSelector::renderTruncTImm(MachineInstrBuilder &MIB,
7133 const MachineInstr &MI,
7134 int OpIdx) const {
7135 const MachineOperand &Op = MI.getOperand(i: OpIdx);
7136 int64_t Imm;
7137 if (Op.isReg() && mi_match(R: Op.getReg(), MRI: *MRI, P: m_ICst(Cst&: Imm)))
7138 MIB.addImm(Val: Imm);
7139 else
7140 MIB.addImm(Val: Op.getImm());
7141}
7142
7143void AMDGPUInstructionSelector::renderZextBoolTImm(MachineInstrBuilder &MIB,
7144 const MachineInstr &MI,
7145 int OpIdx) const {
7146 MIB.addImm(Val: MI.getOperand(i: OpIdx).getImm() != 0);
7147}
7148
7149void AMDGPUInstructionSelector::renderOpSelTImm(MachineInstrBuilder &MIB,
7150 const MachineInstr &MI,
7151 int OpIdx) const {
7152 assert(OpIdx >= 0 && "expected to match an immediate operand");
7153 MIB.addImm(Val: MI.getOperand(i: OpIdx).getImm() ? (int64_t)SISrcMods::OP_SEL_0 : 0);
7154}
7155
7156void AMDGPUInstructionSelector::renderSrcAndDstSelToOpSelXForm_0_0(
7157 MachineInstrBuilder &MIB, const MachineInstr &MI, int OpIdx) const {
7158 assert(OpIdx >= 0 && "expected to match an immediate operand");
7159 MIB.addImm(
7160 Val: (MI.getOperand(i: OpIdx).getImm() & 0x1) ? (int64_t)SISrcMods::OP_SEL_0 : 0);
7161}
7162
7163void AMDGPUInstructionSelector::renderSrcAndDstSelToOpSelXForm_0_1(
7164 MachineInstrBuilder &MIB, const MachineInstr &MI, int OpIdx) const {
7165 assert(OpIdx >= 0 && "expected to match an immediate operand");
7166 MIB.addImm(Val: (MI.getOperand(i: OpIdx).getImm() & 0x1)
7167 ? (int64_t)(SISrcMods::OP_SEL_0 | SISrcMods::DST_OP_SEL)
7168 : (int64_t)SISrcMods::DST_OP_SEL);
7169}
7170
7171void AMDGPUInstructionSelector::renderSrcAndDstSelToOpSelXForm_1_0(
7172 MachineInstrBuilder &MIB, const MachineInstr &MI, int OpIdx) const {
7173 assert(OpIdx >= 0 && "expected to match an immediate operand");
7174 MIB.addImm(
7175 Val: (MI.getOperand(i: OpIdx).getImm() & 0x2) ? (int64_t)SISrcMods::OP_SEL_0 : 0);
7176}
7177
7178void AMDGPUInstructionSelector::renderSrcAndDstSelToOpSelXForm_1_1(
7179 MachineInstrBuilder &MIB, const MachineInstr &MI, int OpIdx) const {
7180 assert(OpIdx >= 0 && "expected to match an immediate operand");
7181 MIB.addImm(Val: (MI.getOperand(i: OpIdx).getImm() & 0x2)
7182 ? (int64_t)(SISrcMods::OP_SEL_0)
7183 : 0);
7184}
7185
7186void AMDGPUInstructionSelector::renderDstSelToOpSelXForm(
7187 MachineInstrBuilder &MIB, const MachineInstr &MI, int OpIdx) const {
7188 assert(OpIdx >= 0 && "expected to match an immediate operand");
7189 MIB.addImm(Val: MI.getOperand(i: OpIdx).getImm() ? (int64_t)(SISrcMods::DST_OP_SEL)
7190 : 0);
7191}
7192
7193void AMDGPUInstructionSelector::renderSrcSelToOpSelXForm(
7194 MachineInstrBuilder &MIB, const MachineInstr &MI, int OpIdx) const {
7195 assert(OpIdx >= 0 && "expected to match an immediate operand");
7196 MIB.addImm(Val: MI.getOperand(i: OpIdx).getImm() ? (int64_t)(SISrcMods::OP_SEL_0)
7197 : 0);
7198}
7199
7200void AMDGPUInstructionSelector::renderSrcAndDstSelToOpSelXForm_2_0(
7201 MachineInstrBuilder &MIB, const MachineInstr &MI, int OpIdx) const {
7202 assert(OpIdx >= 0 && "expected to match an immediate operand");
7203 MIB.addImm(
7204 Val: (MI.getOperand(i: OpIdx).getImm() & 0x1) ? (int64_t)SISrcMods::OP_SEL_0 : 0);
7205}
7206
7207void AMDGPUInstructionSelector::renderDstSelToOpSel3XFormXForm(
7208 MachineInstrBuilder &MIB, const MachineInstr &MI, int OpIdx) const {
7209 assert(OpIdx >= 0 && "expected to match an immediate operand");
7210 MIB.addImm(Val: (MI.getOperand(i: OpIdx).getImm() & 0x2)
7211 ? (int64_t)SISrcMods::DST_OP_SEL
7212 : 0);
7213}
7214
7215void AMDGPUInstructionSelector::renderExtractCPol(MachineInstrBuilder &MIB,
7216 const MachineInstr &MI,
7217 int OpIdx) const {
7218 assert(OpIdx >= 0 && "expected to match an immediate operand");
7219 MIB.addImm(Val: MI.getOperand(i: OpIdx).getImm() &
7220 (AMDGPU::isGFX12Plus(STI) ? AMDGPU::CPol::ALL
7221 : AMDGPU::CPol::ALL_pregfx12));
7222}
7223
7224void AMDGPUInstructionSelector::renderExtractSWZ(MachineInstrBuilder &MIB,
7225 const MachineInstr &MI,
7226 int OpIdx) const {
7227 assert(OpIdx >= 0 && "expected to match an immediate operand");
7228 const bool Swizzle = MI.getOperand(i: OpIdx).getImm() &
7229 (AMDGPU::isGFX12Plus(STI) ? AMDGPU::CPol::SWZ
7230 : AMDGPU::CPol::SWZ_pregfx12);
7231 MIB.addImm(Val: Swizzle);
7232}
7233
7234void AMDGPUInstructionSelector::renderExtractCpolSetGLC(
7235 MachineInstrBuilder &MIB, const MachineInstr &MI, int OpIdx) const {
7236 assert(OpIdx >= 0 && "expected to match an immediate operand");
7237 const uint32_t Cpol = MI.getOperand(i: OpIdx).getImm() &
7238 (AMDGPU::isGFX12Plus(STI) ? AMDGPU::CPol::ALL
7239 : AMDGPU::CPol::ALL_pregfx12);
7240 MIB.addImm(Val: Cpol | AMDGPU::CPol::GLC);
7241}
7242
7243void AMDGPUInstructionSelector::renderFrameIndex(MachineInstrBuilder &MIB,
7244 const MachineInstr &MI,
7245 int OpIdx) const {
7246 MIB.addFrameIndex(Idx: MI.getOperand(i: 1).getIndex());
7247}
7248
7249void AMDGPUInstructionSelector::renderFPPow2ToExponent(MachineInstrBuilder &MIB,
7250 const MachineInstr &MI,
7251 int OpIdx) const {
7252 const APFloat &APF = MI.getOperand(i: 1).getFPImm()->getValueAPF();
7253 int ExpVal = APF.getExactLog2Abs();
7254 assert(ExpVal != INT_MIN);
7255 MIB.addImm(Val: ExpVal);
7256}
7257
7258void AMDGPUInstructionSelector::renderRoundMode(MachineInstrBuilder &MIB,
7259 const MachineInstr &MI,
7260 int OpIdx) const {
7261 // "round.towardzero" -> TowardZero 0 -> FP_ROUND_ROUND_TO_ZERO 3
7262 // "round.tonearest" -> NearestTiesToEven 1 -> FP_ROUND_ROUND_TO_NEAREST 0
7263 // "round.upward" -> TowardPositive 2 -> FP_ROUND_ROUND_TO_INF 1
7264  // "round.downward" -> TowardNegative 3 -> FP_ROUND_ROUND_TO_NEGINF 2
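  // i.e. each source mode maps to the mode value minus one (mod 4), which is
  // what the (Imm + 3) % 4 below computes.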
7265 MIB.addImm(Val: (MI.getOperand(i: OpIdx).getImm() + 3) % 4);
7266}
7267
7268void AMDGPUInstructionSelector::renderVOP3PModsNeg(MachineInstrBuilder &MIB,
7269 const MachineInstr &MI,
7270 int OpIdx) const {
7271 unsigned Mods = SISrcMods::OP_SEL_1;
7272 if (MI.getOperand(i: OpIdx).getImm())
7273 Mods ^= SISrcMods::NEG;
7274 MIB.addImm(Val: (int64_t)Mods);
7275}
7276
7277void AMDGPUInstructionSelector::renderVOP3PModsNegs(MachineInstrBuilder &MIB,
7278 const MachineInstr &MI,
7279 int OpIdx) const {
7280 unsigned Mods = SISrcMods::OP_SEL_1;
7281 if (MI.getOperand(i: OpIdx).getImm())
7282 Mods ^= (SISrcMods::NEG | SISrcMods::NEG_HI);
7283 MIB.addImm(Val: (int64_t)Mods);
7284}
7285
7286void AMDGPUInstructionSelector::renderVOP3PModsNegAbs(MachineInstrBuilder &MIB,
7287 const MachineInstr &MI,
7288 int OpIdx) const {
7289 unsigned Val = MI.getOperand(i: OpIdx).getImm();
7290 unsigned Mods = SISrcMods::OP_SEL_1; // default: none
7291 if (Val == 1) // neg
7292 Mods ^= SISrcMods::NEG;
7293 if (Val == 2) // abs
7294 Mods ^= SISrcMods::ABS;
7295 if (Val == 3) // neg and abs
7296 Mods ^= (SISrcMods::NEG | SISrcMods::ABS);
7297 MIB.addImm(Val: (int64_t)Mods);
7298}
7299
7300void AMDGPUInstructionSelector::renderPrefetchLoc(MachineInstrBuilder &MIB,
7301 const MachineInstr &MI,
7302 int OpIdx) const {
7303 uint32_t V = MI.getOperand(i: 2).getImm();
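  // The locality hint (0-3) is inverted and shifted into the SCOPE field of
  // the cache policy, so a higher locality selects a closer cache scope.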
7304 V = (AMDGPU::CPol::SCOPE_MASK - (V & AMDGPU::CPol::SCOPE_MASK))
7305 << AMDGPU::CPol::SCOPE_SHIFT;
7306 if (!Subtarget->hasSafeCUPrefetch())
7307 V = std::max(a: V, b: (uint32_t)AMDGPU::CPol::SCOPE_SE); // CU scope is unsafe
7308 MIB.addImm(Val: V);
7309}
7310
7311/// Convert a 2-bit value to the enum values used for op_sel* source modifiers.
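/// For example, a value of 3 sets both OP_SEL_0 and OP_SEL_1.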
7312void AMDGPUInstructionSelector::renderScaledMAIIntrinsicOperand(
7313 MachineInstrBuilder &MIB, const MachineInstr &MI, int OpIdx) const {
7314 unsigned Val = MI.getOperand(i: OpIdx).getImm();
7315 unsigned New = 0;
7316 if (Val & 0x1)
7317 New |= SISrcMods::OP_SEL_0;
7318 if (Val & 0x2)
7319 New |= SISrcMods::OP_SEL_1;
7320 MIB.addImm(Val: New);
7321}
7322
7323bool AMDGPUInstructionSelector::isInlineImmediate(const APInt &Imm) const {
7324 return TII.isInlineConstant(Imm);
7325}
7326
7327bool AMDGPUInstructionSelector::isInlineImmediate(const APFloat &Imm) const {
7328 return TII.isInlineConstant(Imm);
7329}
7330