1//===- AMDGPUInstructionSelector.cpp ----------------------------*- C++ -*-==//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8/// \file
9/// This file implements the targeting of the InstructionSelector class for
10/// AMDGPU.
11/// \todo This should be generated by TableGen.
12//===----------------------------------------------------------------------===//
13
14#include "AMDGPUInstructionSelector.h"
15#include "AMDGPU.h"
16#include "AMDGPUGlobalISelUtils.h"
17#include "AMDGPUInstrInfo.h"
18#include "AMDGPURegisterBankInfo.h"
19#include "AMDGPUTargetMachine.h"
20#include "SIMachineFunctionInfo.h"
21#include "Utils/AMDGPUBaseInfo.h"
22#include "llvm/CodeGen/GlobalISel/GIMatchTableExecutorImpl.h"
23#include "llvm/CodeGen/GlobalISel/GISelValueTracking.h"
24#include "llvm/CodeGen/GlobalISel/GenericMachineInstrs.h"
25#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
26#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
27#include "llvm/CodeGen/MachineFrameInfo.h"
28#include "llvm/IR/DiagnosticInfo.h"
29#include "llvm/IR/IntrinsicsAMDGPU.h"
30#include <optional>
31
32#define DEBUG_TYPE "amdgpu-isel"
33
34using namespace llvm;
35using namespace MIPatternMatch;
36
37#define GET_GLOBALISEL_IMPL
38#define AMDGPUSubtarget GCNSubtarget
39#include "AMDGPUGenGlobalISel.inc"
40#undef GET_GLOBALISEL_IMPL
41#undef AMDGPUSubtarget
42
43AMDGPUInstructionSelector::AMDGPUInstructionSelector(
44 const GCNSubtarget &STI, const AMDGPURegisterBankInfo &RBI,
45 const AMDGPUTargetMachine &TM)
46 : TII(*STI.getInstrInfo()), TRI(*STI.getRegisterInfo()), RBI(RBI), TM(TM),
47 STI(STI),
48#define GET_GLOBALISEL_PREDICATES_INIT
49#include "AMDGPUGenGlobalISel.inc"
50#undef GET_GLOBALISEL_PREDICATES_INIT
51#define GET_GLOBALISEL_TEMPORARIES_INIT
52#include "AMDGPUGenGlobalISel.inc"
53#undef GET_GLOBALISEL_TEMPORARIES_INIT
54{
55}
56
57const char *AMDGPUInstructionSelector::getName() { return DEBUG_TYPE; }
58
59void AMDGPUInstructionSelector::setupMF(MachineFunction &MF,
60 GISelValueTracking *VT,
61 CodeGenCoverage *CoverageInfo,
62 ProfileSummaryInfo *PSI,
63 BlockFrequencyInfo *BFI) {
64 MRI = &MF.getRegInfo();
65 Subtarget = &MF.getSubtarget<GCNSubtarget>();
66 Subtarget->checkSubtargetFeatures(F: MF.getFunction());
67 InstructionSelector::setupMF(mf&: MF, vt: VT, covinfo: CoverageInfo, psi: PSI, bfi: BFI);
68}
69
70// Return the wave level SGPR base address if this is a wave address.
71static Register getWaveAddress(const MachineInstr *Def) {
72 return Def->getOpcode() == AMDGPU::G_AMDGPU_WAVE_ADDRESS
73 ? Def->getOperand(i: 1).getReg()
74 : Register();
75}
76
77bool AMDGPUInstructionSelector::isVCC(Register Reg,
78 const MachineRegisterInfo &MRI) const {
79 // The verifier is oblivious to s1 being a valid value for wavesize registers.
80 if (Reg.isPhysical())
81 return false;
82
83 auto &RegClassOrBank = MRI.getRegClassOrRegBank(Reg);
84 const TargetRegisterClass *RC =
85 dyn_cast<const TargetRegisterClass *>(Val: RegClassOrBank);
86 if (RC) {
87 const LLT Ty = MRI.getType(Reg);
88 if (!Ty.isValid() || Ty.getSizeInBits() != 1)
89 return false;
90 // G_TRUNC s1 result is never vcc.
91 return MRI.getVRegDef(Reg)->getOpcode() != AMDGPU::G_TRUNC &&
92 RC->hasSuperClassEq(RC: TRI.getBoolRC());
93 }
94
95 const RegisterBank *RB = cast<const RegisterBank *>(Val: RegClassOrBank);
96 return RB->getID() == AMDGPU::VCCRegBankID;
97}
98
99bool AMDGPUInstructionSelector::constrainCopyLikeIntrin(MachineInstr &MI,
100 unsigned NewOpc) const {
101 MI.setDesc(TII.get(Opcode: NewOpc));
102 MI.removeOperand(OpNo: 1); // Remove intrinsic ID.
103 MI.addOperand(MF&: *MF, Op: MachineOperand::CreateReg(Reg: AMDGPU::EXEC, isDef: false, isImp: true));
104
105 MachineOperand &Dst = MI.getOperand(i: 0);
106 MachineOperand &Src = MI.getOperand(i: 1);
107
108 // TODO: This should be legalized to s32 if needed
109 if (MRI->getType(Reg: Dst.getReg()) == LLT::scalar(SizeInBits: 1))
110 return false;
111
112 const TargetRegisterClass *DstRC
113 = TRI.getConstrainedRegClassForOperand(MO: Dst, MRI: *MRI);
114 const TargetRegisterClass *SrcRC
115 = TRI.getConstrainedRegClassForOperand(MO: Src, MRI: *MRI);
116 if (!DstRC || DstRC != SrcRC)
117 return false;
118
119 if (!RBI.constrainGenericRegister(Reg: Dst.getReg(), RC: *DstRC, MRI&: *MRI) ||
120 !RBI.constrainGenericRegister(Reg: Src.getReg(), RC: *SrcRC, MRI&: *MRI))
121 return false;
122 const MCInstrDesc &MCID = MI.getDesc();
123 if (MCID.getOperandConstraint(OpNum: 0, Constraint: MCOI::EARLY_CLOBBER) != -1) {
124 MI.getOperand(i: 0).setIsEarlyClobber(true);
125 }
126 return true;
127}
128
129bool AMDGPUInstructionSelector::selectCOPY(MachineInstr &I) const {
130 const DebugLoc &DL = I.getDebugLoc();
131 MachineBasicBlock *BB = I.getParent();
132 I.setDesc(TII.get(Opcode: TargetOpcode::COPY));
133
134 const MachineOperand &Src = I.getOperand(i: 1);
135 MachineOperand &Dst = I.getOperand(i: 0);
136 Register DstReg = Dst.getReg();
137 Register SrcReg = Src.getReg();
138
139 if (isVCC(Reg: DstReg, MRI: *MRI)) {
140 if (SrcReg == AMDGPU::SCC) {
141 const TargetRegisterClass *RC
142 = TRI.getConstrainedRegClassForOperand(MO: Dst, MRI: *MRI);
143 if (!RC)
144 return true;
145 return RBI.constrainGenericRegister(Reg: DstReg, RC: *RC, MRI&: *MRI);
146 }
147
148 if (!isVCC(Reg: SrcReg, MRI: *MRI)) {
149 // TODO: Should probably leave the copy and let copyPhysReg expand it.
150 if (!RBI.constrainGenericRegister(Reg: DstReg, RC: *TRI.getBoolRC(), MRI&: *MRI))
151 return false;
152
153 const TargetRegisterClass *SrcRC
154 = TRI.getConstrainedRegClassForOperand(MO: Src, MRI: *MRI);
155
156 std::optional<ValueAndVReg> ConstVal =
157 getIConstantVRegValWithLookThrough(VReg: SrcReg, MRI: *MRI, LookThroughInstrs: true);
158 if (ConstVal) {
159 unsigned MovOpc =
160 STI.isWave64() ? AMDGPU::S_MOV_B64 : AMDGPU::S_MOV_B32;
161 BuildMI(BB&: *BB, I: &I, MIMD: DL, MCID: TII.get(Opcode: MovOpc), DestReg: DstReg)
162 .addImm(Val: ConstVal->Value.getBoolValue() ? -1 : 0);
163 } else {
164 Register MaskedReg = MRI->createVirtualRegister(RegClass: SrcRC);
165
166 // We can't trust the high bits at this point, so clear them.
167
168 // TODO: Skip masking high bits if def is known boolean.
169
170 if (AMDGPU::getRegBitWidth(RCID: SrcRC->getID()) == 16) {
171 assert(Subtarget->useRealTrue16Insts());
172 const int64_t NoMods = 0;
173 BuildMI(BB&: *BB, I: &I, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::V_AND_B16_t16_e64), DestReg: MaskedReg)
174 .addImm(Val: NoMods)
175 .addImm(Val: 1)
176 .addImm(Val: NoMods)
177 .addReg(RegNo: SrcReg)
178 .addImm(Val: NoMods);
179 BuildMI(BB&: *BB, I: &I, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::V_CMP_NE_U16_t16_e64), DestReg: DstReg)
180 .addImm(Val: NoMods)
181 .addImm(Val: 0)
182 .addImm(Val: NoMods)
183 .addReg(RegNo: MaskedReg)
184 .addImm(Val: NoMods);
185 } else {
186 bool IsSGPR = TRI.isSGPRClass(RC: SrcRC);
187 unsigned AndOpc = IsSGPR ? AMDGPU::S_AND_B32 : AMDGPU::V_AND_B32_e32;
188 auto And = BuildMI(BB&: *BB, I: &I, MIMD: DL, MCID: TII.get(Opcode: AndOpc), DestReg: MaskedReg)
189 .addImm(Val: 1)
190 .addReg(RegNo: SrcReg);
191 if (IsSGPR)
192 And.setOperandDead(3); // Dead scc
193
194 BuildMI(BB&: *BB, I: &I, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::V_CMP_NE_U32_e64), DestReg: DstReg)
195 .addImm(Val: 0)
196 .addReg(RegNo: MaskedReg);
197 }
198 }
199
200 if (!MRI->getRegClassOrNull(Reg: SrcReg))
201 MRI->setRegClass(Reg: SrcReg, RC: SrcRC);
202 I.eraseFromParent();
203 return true;
204 }
205
206 const TargetRegisterClass *RC =
207 TRI.getConstrainedRegClassForOperand(MO: Dst, MRI: *MRI);
208 if (RC && !RBI.constrainGenericRegister(Reg: DstReg, RC: *RC, MRI&: *MRI))
209 return false;
210
211 return true;
212 }
213
214 for (const MachineOperand &MO : I.operands()) {
215 if (MO.getReg().isPhysical())
216 continue;
217
218 const TargetRegisterClass *RC =
219 TRI.getConstrainedRegClassForOperand(MO, MRI: *MRI);
220 if (!RC)
221 continue;
222 RBI.constrainGenericRegister(Reg: MO.getReg(), RC: *RC, MRI&: *MRI);
223 }
224 return true;
225}
226
227bool AMDGPUInstructionSelector::selectCOPY_SCC_VCC(MachineInstr &I) const {
228 const DebugLoc &DL = I.getDebugLoc();
229 MachineBasicBlock *BB = I.getParent();
230 Register VCCReg = I.getOperand(i: 1).getReg();
231 MachineInstr *Cmp;
232
233 // Set SCC as a side effect with S_CMP or S_OR.
234 if (STI.hasScalarCompareEq64()) {
235 unsigned CmpOpc =
236 STI.isWave64() ? AMDGPU::S_CMP_LG_U64 : AMDGPU::S_CMP_LG_U32;
237 Cmp = BuildMI(BB&: *BB, I: &I, MIMD: DL, MCID: TII.get(Opcode: CmpOpc)).addReg(RegNo: VCCReg).addImm(Val: 0);
238 } else {
239 Register DeadDst = MRI->createVirtualRegister(RegClass: &AMDGPU::SReg_64RegClass);
240 Cmp = BuildMI(BB&: *BB, I: &I, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::S_OR_B64), DestReg: DeadDst)
241 .addReg(RegNo: VCCReg)
242 .addReg(RegNo: VCCReg);
243 }
244
245 constrainSelectedInstRegOperands(I&: *Cmp, TII, TRI, RBI);
246
247 Register DstReg = I.getOperand(i: 0).getReg();
248 BuildMI(BB&: *BB, I: &I, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::COPY), DestReg: DstReg).addReg(RegNo: AMDGPU::SCC);
249
250 I.eraseFromParent();
251 return RBI.constrainGenericRegister(Reg: DstReg, RC: AMDGPU::SReg_32RegClass, MRI&: *MRI);
252}
253
254bool AMDGPUInstructionSelector::selectCOPY_VCC_SCC(MachineInstr &I) const {
255 const DebugLoc &DL = I.getDebugLoc();
256 MachineBasicBlock *BB = I.getParent();
257
258 Register DstReg = I.getOperand(i: 0).getReg();
259 Register SrcReg = I.getOperand(i: 1).getReg();
260 std::optional<ValueAndVReg> Arg =
261 getIConstantVRegValWithLookThrough(VReg: I.getOperand(i: 1).getReg(), MRI: *MRI);
262
263 if (Arg) {
264 const int64_t Value = Arg->Value.getZExtValue();
265 if (Value == 0) {
266 unsigned Opcode = STI.isWave64() ? AMDGPU::S_MOV_B64 : AMDGPU::S_MOV_B32;
267 BuildMI(BB&: *BB, I: &I, MIMD: DL, MCID: TII.get(Opcode), DestReg: DstReg).addImm(Val: 0);
268 } else {
269 assert(Value == 1);
270 BuildMI(BB&: *BB, I: &I, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::COPY), DestReg: DstReg).addReg(RegNo: TRI.getExec());
271 }
272 I.eraseFromParent();
273 return RBI.constrainGenericRegister(Reg: DstReg, RC: *TRI.getBoolRC(), MRI&: *MRI);
274 }
275
276 // RegBankLegalize ensures that SrcReg is bool in reg (high bits are 0).
277 BuildMI(BB&: *BB, I: &I, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::COPY), DestReg: AMDGPU::SCC).addReg(RegNo: SrcReg);
278
279 unsigned SelectOpcode =
280 STI.isWave64() ? AMDGPU::S_CSELECT_B64 : AMDGPU::S_CSELECT_B32;
281 MachineInstr *Select = BuildMI(BB&: *BB, I: &I, MIMD: DL, MCID: TII.get(Opcode: SelectOpcode), DestReg: DstReg)
282 .addReg(RegNo: TRI.getExec())
283 .addImm(Val: 0);
284
285 I.eraseFromParent();
286 constrainSelectedInstRegOperands(I&: *Select, TII, TRI, RBI);
287 return true;
288}
289
290bool AMDGPUInstructionSelector::selectReadAnyLane(MachineInstr &I) const {
291 Register DstReg = I.getOperand(i: 0).getReg();
292 Register SrcReg = I.getOperand(i: 1).getReg();
293
294 const DebugLoc &DL = I.getDebugLoc();
295 MachineBasicBlock *BB = I.getParent();
296
297 auto RFL = BuildMI(BB&: *BB, I: &I, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::V_READFIRSTLANE_B32), DestReg: DstReg)
298 .addReg(RegNo: SrcReg);
299
300 I.eraseFromParent();
301 constrainSelectedInstRegOperands(I&: *RFL, TII, TRI, RBI);
302 return true;
303}
304
305bool AMDGPUInstructionSelector::selectPHI(MachineInstr &I) const {
306 const Register DefReg = I.getOperand(i: 0).getReg();
307 const LLT DefTy = MRI->getType(Reg: DefReg);
308
309 // S1 G_PHIs should not be selected in instruction-select, instead:
310 // - divergent S1 G_PHI should go through lane mask merging algorithm
311 // and be fully inst-selected in AMDGPUGlobalISelDivergenceLowering
312 // - uniform S1 G_PHI should be lowered into S32 G_PHI in AMDGPURegBankSelect
313 if (DefTy == LLT::scalar(SizeInBits: 1))
314 return false;
315
316 // TODO: Verify this doesn't have insane operands (i.e. VGPR to SGPR copy)
317
318 const RegClassOrRegBank &RegClassOrBank =
319 MRI->getRegClassOrRegBank(Reg: DefReg);
320
321 const TargetRegisterClass *DefRC =
322 dyn_cast<const TargetRegisterClass *>(Val: RegClassOrBank);
323 if (!DefRC) {
324 if (!DefTy.isValid()) {
325 LLVM_DEBUG(dbgs() << "PHI operand has no type, not a gvreg?\n");
326 return false;
327 }
328
329 const RegisterBank &RB = *cast<const RegisterBank *>(Val: RegClassOrBank);
330 DefRC = TRI.getRegClassForTypeOnBank(Ty: DefTy, Bank: RB);
331 if (!DefRC) {
332 LLVM_DEBUG(dbgs() << "PHI operand has unexpected size/bank\n");
333 return false;
334 }
335 }
336
337 // If inputs have register bank, assign corresponding reg class.
338 // Note: registers don't need to have the same reg bank.
339 for (unsigned i = 1; i != I.getNumOperands(); i += 2) {
340 const Register SrcReg = I.getOperand(i).getReg();
341
342 const RegisterBank *RB = MRI->getRegBankOrNull(Reg: SrcReg);
343 if (RB) {
344 const LLT SrcTy = MRI->getType(Reg: SrcReg);
345 const TargetRegisterClass *SrcRC =
346 TRI.getRegClassForTypeOnBank(Ty: SrcTy, Bank: *RB);
347 if (!RBI.constrainGenericRegister(Reg: SrcReg, RC: *SrcRC, MRI&: *MRI))
348 return false;
349 }
350 }
351
352 I.setDesc(TII.get(Opcode: TargetOpcode::PHI));
353 return RBI.constrainGenericRegister(Reg: DefReg, RC: *DefRC, MRI&: *MRI);
354}
355
356MachineOperand
357AMDGPUInstructionSelector::getSubOperand64(MachineOperand &MO,
358 const TargetRegisterClass &SubRC,
359 unsigned SubIdx) const {
360
361 MachineInstr *MI = MO.getParent();
362 MachineBasicBlock *BB = MO.getParent()->getParent();
363 Register DstReg = MRI->createVirtualRegister(RegClass: &SubRC);
364
365 if (MO.isReg()) {
366 unsigned ComposedSubIdx = TRI.composeSubRegIndices(a: MO.getSubReg(), b: SubIdx);
367 Register Reg = MO.getReg();
368 BuildMI(BB&: *BB, I: MI, MIMD: MI->getDebugLoc(), MCID: TII.get(Opcode: AMDGPU::COPY), DestReg: DstReg)
369 .addReg(RegNo: Reg, Flags: {}, SubReg: ComposedSubIdx);
370
371 return MachineOperand::CreateReg(Reg: DstReg, isDef: MO.isDef(), isImp: MO.isImplicit(),
372 isKill: MO.isKill(), isDead: MO.isDead(), isUndef: MO.isUndef(),
373 isEarlyClobber: MO.isEarlyClobber(), SubReg: 0, isDebug: MO.isDebug(),
374 isInternalRead: MO.isInternalRead());
375 }
376
377 assert(MO.isImm());
378
379 APInt Imm(64, MO.getImm());
380
381 switch (SubIdx) {
382 default:
383 llvm_unreachable("do not know to split immediate with this sub index.");
384 case AMDGPU::sub0:
385 return MachineOperand::CreateImm(Val: Imm.getLoBits(numBits: 32).getSExtValue());
386 case AMDGPU::sub1:
387 return MachineOperand::CreateImm(Val: Imm.getHiBits(numBits: 32).getSExtValue());
388 }
389}
390
391static unsigned getLogicalBitOpcode(unsigned Opc, bool Is64) {
392 switch (Opc) {
393 case AMDGPU::G_AND:
394 return Is64 ? AMDGPU::S_AND_B64 : AMDGPU::S_AND_B32;
395 case AMDGPU::G_OR:
396 return Is64 ? AMDGPU::S_OR_B64 : AMDGPU::S_OR_B32;
397 case AMDGPU::G_XOR:
398 return Is64 ? AMDGPU::S_XOR_B64 : AMDGPU::S_XOR_B32;
399 default:
400 llvm_unreachable("not a bit op");
401 }
402}
403
404bool AMDGPUInstructionSelector::selectG_AND_OR_XOR(MachineInstr &I) const {
405 Register DstReg = I.getOperand(i: 0).getReg();
406 unsigned Size = RBI.getSizeInBits(Reg: DstReg, MRI: *MRI, TRI);
407
408 const RegisterBank *DstRB = RBI.getRegBank(Reg: DstReg, MRI: *MRI, TRI);
409 if (DstRB->getID() != AMDGPU::SGPRRegBankID &&
410 DstRB->getID() != AMDGPU::VCCRegBankID)
411 return false;
412
413 bool Is64 = Size > 32 || (DstRB->getID() == AMDGPU::VCCRegBankID &&
414 STI.isWave64());
415 I.setDesc(TII.get(Opcode: getLogicalBitOpcode(Opc: I.getOpcode(), Is64)));
416
417 // Dead implicit-def of scc
418 I.addOperand(Op: MachineOperand::CreateReg(Reg: AMDGPU::SCC, isDef: true, // isDef
419 isImp: true, // isImp
420 isKill: false, // isKill
421 isDead: true)); // isDead
422 constrainSelectedInstRegOperands(I, TII, TRI, RBI);
423 return true;
424}
425
426bool AMDGPUInstructionSelector::selectG_ADD_SUB(MachineInstr &I) const {
427 MachineBasicBlock *BB = I.getParent();
428 MachineFunction *MF = BB->getParent();
429 Register DstReg = I.getOperand(i: 0).getReg();
430 const DebugLoc &DL = I.getDebugLoc();
431 LLT Ty = MRI->getType(Reg: DstReg);
432 if (Ty.isVector())
433 return false;
434
435 unsigned Size = Ty.getSizeInBits();
436 const RegisterBank *DstRB = RBI.getRegBank(Reg: DstReg, MRI: *MRI, TRI);
437 const bool IsSALU = DstRB->getID() == AMDGPU::SGPRRegBankID;
438 const bool Sub = I.getOpcode() == TargetOpcode::G_SUB;
439
440 if (Size == 32) {
441 if (IsSALU) {
442 const unsigned Opc = Sub ? AMDGPU::S_SUB_U32 : AMDGPU::S_ADD_U32;
443 MachineInstr *Add =
444 BuildMI(BB&: *BB, I: &I, MIMD: DL, MCID: TII.get(Opcode: Opc), DestReg: DstReg)
445 .add(MO: I.getOperand(i: 1))
446 .add(MO: I.getOperand(i: 2))
447 .setOperandDead(3); // Dead scc
448 I.eraseFromParent();
449 constrainSelectedInstRegOperands(I&: *Add, TII, TRI, RBI);
450 return true;
451 }
452
453 if (STI.hasAddNoCarryInsts()) {
454 const unsigned Opc = Sub ? AMDGPU::V_SUB_U32_e64 : AMDGPU::V_ADD_U32_e64;
455 I.setDesc(TII.get(Opcode: Opc));
456 I.addOperand(MF&: *MF, Op: MachineOperand::CreateImm(Val: 0));
457 I.addOperand(MF&: *MF, Op: MachineOperand::CreateReg(Reg: AMDGPU::EXEC, isDef: false, isImp: true));
458 constrainSelectedInstRegOperands(I, TII, TRI, RBI);
459 return true;
460 }
461
462 const unsigned Opc = Sub ? AMDGPU::V_SUB_CO_U32_e64 : AMDGPU::V_ADD_CO_U32_e64;
463
464 Register UnusedCarry = MRI->createVirtualRegister(RegClass: TRI.getWaveMaskRegClass());
465 MachineInstr *Add
466 = BuildMI(BB&: *BB, I: &I, MIMD: DL, MCID: TII.get(Opcode: Opc), DestReg: DstReg)
467 .addDef(RegNo: UnusedCarry, Flags: RegState::Dead)
468 .add(MO: I.getOperand(i: 1))
469 .add(MO: I.getOperand(i: 2))
470 .addImm(Val: 0);
471 I.eraseFromParent();
472 constrainSelectedInstRegOperands(I&: *Add, TII, TRI, RBI);
473 return true;
474 }
475
476 assert(!Sub && "illegal sub should not reach here");
477
478 const TargetRegisterClass &RC
479 = IsSALU ? AMDGPU::SReg_64_XEXECRegClass : AMDGPU::VReg_64RegClass;
480 const TargetRegisterClass &HalfRC
481 = IsSALU ? AMDGPU::SReg_32RegClass : AMDGPU::VGPR_32RegClass;
482
483 MachineOperand Lo1(getSubOperand64(MO&: I.getOperand(i: 1), SubRC: HalfRC, SubIdx: AMDGPU::sub0));
484 MachineOperand Lo2(getSubOperand64(MO&: I.getOperand(i: 2), SubRC: HalfRC, SubIdx: AMDGPU::sub0));
485 MachineOperand Hi1(getSubOperand64(MO&: I.getOperand(i: 1), SubRC: HalfRC, SubIdx: AMDGPU::sub1));
486 MachineOperand Hi2(getSubOperand64(MO&: I.getOperand(i: 2), SubRC: HalfRC, SubIdx: AMDGPU::sub1));
487
488 Register DstLo = MRI->createVirtualRegister(RegClass: &HalfRC);
489 Register DstHi = MRI->createVirtualRegister(RegClass: &HalfRC);
490
491 if (IsSALU) {
492 BuildMI(BB&: *BB, I: &I, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::S_ADD_U32), DestReg: DstLo)
493 .add(MO: Lo1)
494 .add(MO: Lo2);
495 BuildMI(BB&: *BB, I: &I, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::S_ADDC_U32), DestReg: DstHi)
496 .add(MO: Hi1)
497 .add(MO: Hi2)
498 .setOperandDead(3); // Dead scc
499 } else {
500 const TargetRegisterClass *CarryRC = TRI.getWaveMaskRegClass();
501 Register CarryReg = MRI->createVirtualRegister(RegClass: CarryRC);
502 BuildMI(BB&: *BB, I: &I, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::V_ADD_CO_U32_e64), DestReg: DstLo)
503 .addDef(RegNo: CarryReg)
504 .add(MO: Lo1)
505 .add(MO: Lo2)
506 .addImm(Val: 0);
507 MachineInstr *Addc = BuildMI(BB&: *BB, I: &I, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::V_ADDC_U32_e64), DestReg: DstHi)
508 .addDef(RegNo: MRI->createVirtualRegister(RegClass: CarryRC), Flags: RegState::Dead)
509 .add(MO: Hi1)
510 .add(MO: Hi2)
511 .addReg(RegNo: CarryReg, Flags: RegState::Kill)
512 .addImm(Val: 0);
513
514 constrainSelectedInstRegOperands(I&: *Addc, TII, TRI, RBI);
515 }
516
517 BuildMI(BB&: *BB, I: &I, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::REG_SEQUENCE), DestReg: DstReg)
518 .addReg(RegNo: DstLo)
519 .addImm(Val: AMDGPU::sub0)
520 .addReg(RegNo: DstHi)
521 .addImm(Val: AMDGPU::sub1);
522
523
524 if (!RBI.constrainGenericRegister(Reg: DstReg, RC, MRI&: *MRI))
525 return false;
526
527 I.eraseFromParent();
528 return true;
529}
530
531bool AMDGPUInstructionSelector::selectG_UADDO_USUBO_UADDE_USUBE(
532 MachineInstr &I) const {
533 MachineBasicBlock *BB = I.getParent();
534 MachineFunction *MF = BB->getParent();
535 const DebugLoc &DL = I.getDebugLoc();
536 Register Dst0Reg = I.getOperand(i: 0).getReg();
537 Register Dst1Reg = I.getOperand(i: 1).getReg();
538 const bool IsAdd = I.getOpcode() == AMDGPU::G_UADDO ||
539 I.getOpcode() == AMDGPU::G_UADDE;
540 const bool HasCarryIn = I.getOpcode() == AMDGPU::G_UADDE ||
541 I.getOpcode() == AMDGPU::G_USUBE;
542
543 if (isVCC(Reg: Dst1Reg, MRI: *MRI)) {
544 unsigned NoCarryOpc =
545 IsAdd ? AMDGPU::V_ADD_CO_U32_e64 : AMDGPU::V_SUB_CO_U32_e64;
546 unsigned CarryOpc = IsAdd ? AMDGPU::V_ADDC_U32_e64 : AMDGPU::V_SUBB_U32_e64;
547 I.setDesc(TII.get(Opcode: HasCarryIn ? CarryOpc : NoCarryOpc));
548 I.addOperand(MF&: *MF, Op: MachineOperand::CreateReg(Reg: AMDGPU::EXEC, isDef: false, isImp: true));
549 I.addOperand(MF&: *MF, Op: MachineOperand::CreateImm(Val: 0));
550 constrainSelectedInstRegOperands(I, TII, TRI, RBI);
551 return true;
552 }
553
554 Register Src0Reg = I.getOperand(i: 2).getReg();
555 Register Src1Reg = I.getOperand(i: 3).getReg();
556
557 if (HasCarryIn) {
558 BuildMI(BB&: *BB, I: &I, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::COPY), DestReg: AMDGPU::SCC)
559 .addReg(RegNo: I.getOperand(i: 4).getReg());
560 }
561
562 unsigned NoCarryOpc = IsAdd ? AMDGPU::S_ADD_U32 : AMDGPU::S_SUB_U32;
563 unsigned CarryOpc = IsAdd ? AMDGPU::S_ADDC_U32 : AMDGPU::S_SUBB_U32;
564
565 auto CarryInst = BuildMI(BB&: *BB, I: &I, MIMD: DL, MCID: TII.get(Opcode: HasCarryIn ? CarryOpc : NoCarryOpc), DestReg: Dst0Reg)
566 .add(MO: I.getOperand(i: 2))
567 .add(MO: I.getOperand(i: 3));
568
569 if (MRI->use_nodbg_empty(RegNo: Dst1Reg)) {
570 CarryInst.setOperandDead(3); // Dead scc
571 } else {
572 BuildMI(BB&: *BB, I: &I, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::COPY), DestReg: Dst1Reg)
573 .addReg(RegNo: AMDGPU::SCC);
574 if (!MRI->getRegClassOrNull(Reg: Dst1Reg))
575 MRI->setRegClass(Reg: Dst1Reg, RC: &AMDGPU::SReg_32RegClass);
576 }
577
578 if (!RBI.constrainGenericRegister(Reg: Dst0Reg, RC: AMDGPU::SReg_32RegClass, MRI&: *MRI) ||
579 !RBI.constrainGenericRegister(Reg: Src0Reg, RC: AMDGPU::SReg_32RegClass, MRI&: *MRI) ||
580 !RBI.constrainGenericRegister(Reg: Src1Reg, RC: AMDGPU::SReg_32RegClass, MRI&: *MRI))
581 return false;
582
583 if (HasCarryIn &&
584 !RBI.constrainGenericRegister(Reg: I.getOperand(i: 4).getReg(),
585 RC: AMDGPU::SReg_32RegClass, MRI&: *MRI))
586 return false;
587
588 I.eraseFromParent();
589 return true;
590}
591
592bool AMDGPUInstructionSelector::selectG_AMDGPU_MAD_64_32(
593 MachineInstr &I) const {
594 MachineBasicBlock *BB = I.getParent();
595 MachineFunction *MF = BB->getParent();
596 const bool IsUnsigned = I.getOpcode() == AMDGPU::G_AMDGPU_MAD_U64_U32;
597 bool UseNoCarry = Subtarget->hasMadU64U32NoCarry() &&
598 MRI->use_nodbg_empty(RegNo: I.getOperand(i: 1).getReg());
599
600 unsigned Opc;
601 if (Subtarget->hasMADIntraFwdBug())
602 Opc = IsUnsigned ? AMDGPU::V_MAD_U64_U32_gfx11_e64
603 : AMDGPU::V_MAD_I64_I32_gfx11_e64;
604 else if (UseNoCarry)
605 Opc = IsUnsigned ? AMDGPU::V_MAD_NC_U64_U32_e64
606 : AMDGPU::V_MAD_NC_I64_I32_e64;
607 else
608 Opc = IsUnsigned ? AMDGPU::V_MAD_U64_U32_e64 : AMDGPU::V_MAD_I64_I32_e64;
609
610 if (UseNoCarry)
611 I.removeOperand(OpNo: 1);
612
613 I.setDesc(TII.get(Opcode: Opc));
614 I.addOperand(MF&: *MF, Op: MachineOperand::CreateImm(Val: 0));
615 I.addImplicitDefUseOperands(MF&: *MF);
616 I.getOperand(i: 0).setIsEarlyClobber(true);
617 constrainSelectedInstRegOperands(I, TII, TRI, RBI);
618 return true;
619}
620
621// TODO: We should probably legalize these to only using 32-bit results.
622bool AMDGPUInstructionSelector::selectG_EXTRACT(MachineInstr &I) const {
623 MachineBasicBlock *BB = I.getParent();
624 Register DstReg = I.getOperand(i: 0).getReg();
625 Register SrcReg = I.getOperand(i: 1).getReg();
626 LLT DstTy = MRI->getType(Reg: DstReg);
627 LLT SrcTy = MRI->getType(Reg: SrcReg);
628 const unsigned SrcSize = SrcTy.getSizeInBits();
629 unsigned DstSize = DstTy.getSizeInBits();
630
631 // TODO: Should handle any multiple of 32 offset.
632 unsigned Offset = I.getOperand(i: 2).getImm();
633 if (Offset % 32 != 0 || DstSize > 128)
634 return false;
635
636 // 16-bit operations really use 32-bit registers.
637 // FIXME: Probably should not allow 16-bit G_EXTRACT results.
638 if (DstSize == 16)
639 DstSize = 32;
640
641 const TargetRegisterClass *DstRC =
642 TRI.getConstrainedRegClassForOperand(MO: I.getOperand(i: 0), MRI: *MRI);
643 if (!DstRC || !RBI.constrainGenericRegister(Reg: DstReg, RC: *DstRC, MRI&: *MRI))
644 return false;
645
646 const RegisterBank *SrcBank = RBI.getRegBank(Reg: SrcReg, MRI: *MRI, TRI);
647 const TargetRegisterClass *SrcRC =
648 TRI.getRegClassForSizeOnBank(Size: SrcSize, Bank: *SrcBank);
649 if (!SrcRC)
650 return false;
651 unsigned SubReg = SIRegisterInfo::getSubRegFromChannel(Channel: Offset / 32,
652 NumRegs: DstSize / 32);
653 SrcRC = TRI.getSubClassWithSubReg(SrcRC, SubReg);
654 if (!SrcRC)
655 return false;
656
657 SrcReg = constrainOperandRegClass(MF: *MF, TRI, MRI&: *MRI, TII, RBI, InsertPt&: I,
658 RegClass: *SrcRC, RegMO&: I.getOperand(i: 1));
659 const DebugLoc &DL = I.getDebugLoc();
660 BuildMI(BB&: *BB, I: &I, MIMD: DL, MCID: TII.get(Opcode: TargetOpcode::COPY), DestReg: DstReg)
661 .addReg(RegNo: SrcReg, Flags: {}, SubReg);
662
663 I.eraseFromParent();
664 return true;
665}
666
667bool AMDGPUInstructionSelector::selectG_MERGE_VALUES(MachineInstr &MI) const {
668 MachineBasicBlock *BB = MI.getParent();
669 Register DstReg = MI.getOperand(i: 0).getReg();
670 LLT DstTy = MRI->getType(Reg: DstReg);
671 LLT SrcTy = MRI->getType(Reg: MI.getOperand(i: 1).getReg());
672
673 const unsigned SrcSize = SrcTy.getSizeInBits();
674 if (SrcSize < 32)
675 return selectImpl(I&: MI, CoverageInfo&: *CoverageInfo);
676
677 const DebugLoc &DL = MI.getDebugLoc();
678 const RegisterBank *DstBank = RBI.getRegBank(Reg: DstReg, MRI: *MRI, TRI);
679 const unsigned DstSize = DstTy.getSizeInBits();
680 const TargetRegisterClass *DstRC =
681 TRI.getRegClassForSizeOnBank(Size: DstSize, Bank: *DstBank);
682 if (!DstRC)
683 return false;
684
685 ArrayRef<int16_t> SubRegs = TRI.getRegSplitParts(RC: DstRC, EltSize: SrcSize / 8);
686 MachineInstrBuilder MIB =
687 BuildMI(BB&: *BB, I: &MI, MIMD: DL, MCID: TII.get(Opcode: TargetOpcode::REG_SEQUENCE), DestReg: DstReg);
688 for (int I = 0, E = MI.getNumOperands() - 1; I != E; ++I) {
689 MachineOperand &Src = MI.getOperand(i: I + 1);
690 MIB.addReg(RegNo: Src.getReg(), Flags: getUndefRegState(B: Src.isUndef()));
691 MIB.addImm(Val: SubRegs[I]);
692
693 const TargetRegisterClass *SrcRC
694 = TRI.getConstrainedRegClassForOperand(MO: Src, MRI: *MRI);
695 if (SrcRC && !RBI.constrainGenericRegister(Reg: Src.getReg(), RC: *SrcRC, MRI&: *MRI))
696 return false;
697 }
698
699 if (!RBI.constrainGenericRegister(Reg: DstReg, RC: *DstRC, MRI&: *MRI))
700 return false;
701
702 MI.eraseFromParent();
703 return true;
704}
705
706bool AMDGPUInstructionSelector::selectG_UNMERGE_VALUES(MachineInstr &MI) const {
707 MachineBasicBlock *BB = MI.getParent();
708 const int NumDst = MI.getNumOperands() - 1;
709
710 MachineOperand &Src = MI.getOperand(i: NumDst);
711
712 Register SrcReg = Src.getReg();
713 Register DstReg0 = MI.getOperand(i: 0).getReg();
714 LLT DstTy = MRI->getType(Reg: DstReg0);
715 LLT SrcTy = MRI->getType(Reg: SrcReg);
716
717 const unsigned DstSize = DstTy.getSizeInBits();
718 const unsigned SrcSize = SrcTy.getSizeInBits();
719 const DebugLoc &DL = MI.getDebugLoc();
720 const RegisterBank *SrcBank = RBI.getRegBank(Reg: SrcReg, MRI: *MRI, TRI);
721
722 const TargetRegisterClass *SrcRC =
723 TRI.getRegClassForSizeOnBank(Size: SrcSize, Bank: *SrcBank);
724 if (!SrcRC || !RBI.constrainGenericRegister(Reg: SrcReg, RC: *SrcRC, MRI&: *MRI))
725 return false;
726
727 // Note we could have mixed SGPR and VGPR destination banks for an SGPR
728 // source, and this relies on the fact that the same subregister indices are
729 // used for both.
730 ArrayRef<int16_t> SubRegs = TRI.getRegSplitParts(RC: SrcRC, EltSize: DstSize / 8);
731 for (int I = 0, E = NumDst; I != E; ++I) {
732 MachineOperand &Dst = MI.getOperand(i: I);
733 BuildMI(BB&: *BB, I: &MI, MIMD: DL, MCID: TII.get(Opcode: TargetOpcode::COPY), DestReg: Dst.getReg())
734 .addReg(RegNo: SrcReg, Flags: {}, SubReg: SubRegs[I]);
735
736 // Make sure the subregister index is valid for the source register.
737 SrcRC = TRI.getSubClassWithSubReg(SrcRC, SubRegs[I]);
738 if (!SrcRC || !RBI.constrainGenericRegister(Reg: SrcReg, RC: *SrcRC, MRI&: *MRI))
739 return false;
740
741 const TargetRegisterClass *DstRC =
742 TRI.getConstrainedRegClassForOperand(MO: Dst, MRI: *MRI);
743 if (DstRC && !RBI.constrainGenericRegister(Reg: Dst.getReg(), RC: *DstRC, MRI&: *MRI))
744 return false;
745 }
746
747 MI.eraseFromParent();
748 return true;
749}
750
751bool AMDGPUInstructionSelector::selectG_BUILD_VECTOR(MachineInstr &MI) const {
752 assert(MI.getOpcode() == AMDGPU::G_BUILD_VECTOR_TRUNC ||
753 MI.getOpcode() == AMDGPU::G_BUILD_VECTOR);
754
755 Register Src0 = MI.getOperand(i: 1).getReg();
756 Register Src1 = MI.getOperand(i: 2).getReg();
757 LLT SrcTy = MRI->getType(Reg: Src0);
758 const unsigned SrcSize = SrcTy.getSizeInBits();
759
760 // BUILD_VECTOR with >=32 bits source is handled by MERGE_VALUE.
761 if (MI.getOpcode() == AMDGPU::G_BUILD_VECTOR && SrcSize >= 32) {
762 return selectG_MERGE_VALUES(MI);
763 }
764
765 // Selection logic below is for V2S16 only.
766 // For G_BUILD_VECTOR_TRUNC, additionally check that the operands are s32.
767 Register Dst = MI.getOperand(i: 0).getReg();
768 if (MRI->getType(Reg: Dst) != LLT::fixed_vector(NumElements: 2, ScalarSizeInBits: 16) ||
769 (MI.getOpcode() == AMDGPU::G_BUILD_VECTOR_TRUNC &&
770 SrcTy != LLT::scalar(SizeInBits: 32)))
771 return selectImpl(I&: MI, CoverageInfo&: *CoverageInfo);
772
773 const RegisterBank *DstBank = RBI.getRegBank(Reg: Dst, MRI: *MRI, TRI);
774 if (DstBank->getID() == AMDGPU::AGPRRegBankID)
775 return false;
776
777 assert(DstBank->getID() == AMDGPU::SGPRRegBankID ||
778 DstBank->getID() == AMDGPU::VGPRRegBankID);
779 const bool IsVector = DstBank->getID() == AMDGPU::VGPRRegBankID;
780
781 const DebugLoc &DL = MI.getDebugLoc();
782 MachineBasicBlock *BB = MI.getParent();
783
784 // First, before trying TableGen patterns, check if both sources are
785 // constants. In those cases, we can trivially compute the final constant
786 // and emit a simple move.
787 auto ConstSrc1 = getAnyConstantVRegValWithLookThrough(VReg: Src1, MRI: *MRI, LookThroughInstrs: true, LookThroughAnyExt: true);
788 if (ConstSrc1) {
789 auto ConstSrc0 =
790 getAnyConstantVRegValWithLookThrough(VReg: Src0, MRI: *MRI, LookThroughInstrs: true, LookThroughAnyExt: true);
791 if (ConstSrc0) {
792 const int64_t K0 = ConstSrc0->Value.getSExtValue();
793 const int64_t K1 = ConstSrc1->Value.getSExtValue();
794 uint32_t Lo16 = static_cast<uint32_t>(K0) & 0xffff;
795 uint32_t Hi16 = static_cast<uint32_t>(K1) & 0xffff;
796 uint32_t Imm = Lo16 | (Hi16 << 16);
797
798 // VALU
799 if (IsVector) {
800 BuildMI(BB&: *BB, I: &MI, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::V_MOV_B32_e32), DestReg: Dst).addImm(Val: Imm);
801 MI.eraseFromParent();
802 return RBI.constrainGenericRegister(Reg: Dst, RC: AMDGPU::VGPR_32RegClass, MRI&: *MRI);
803 }
804
805 // SALU
806 BuildMI(BB&: *BB, I: &MI, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::S_MOV_B32), DestReg: Dst).addImm(Val: Imm);
807 MI.eraseFromParent();
808 return RBI.constrainGenericRegister(Reg: Dst, RC: AMDGPU::SReg_32RegClass, MRI&: *MRI);
809 }
810 }
811
812 // Now try TableGen patterns.
813 if (selectImpl(I&: MI, CoverageInfo&: *CoverageInfo))
814 return true;
815
816 // TODO: This should probably be a combine somewhere
817 // (build_vector $src0, undef) -> copy $src0
818 MachineInstr *Src1Def = getDefIgnoringCopies(Reg: Src1, MRI: *MRI);
819 if (Src1Def->getOpcode() == AMDGPU::G_IMPLICIT_DEF) {
820 MI.setDesc(TII.get(Opcode: AMDGPU::COPY));
821 MI.removeOperand(OpNo: 2);
822 const auto &RC =
823 IsVector ? AMDGPU::VGPR_32RegClass : AMDGPU::SReg_32RegClass;
824 return RBI.constrainGenericRegister(Reg: Dst, RC, MRI&: *MRI) &&
825 RBI.constrainGenericRegister(Reg: Src0, RC, MRI&: *MRI);
826 }
827
828 // TODO: Can be improved?
829 if (IsVector) {
830 Register TmpReg = MRI->createVirtualRegister(RegClass: &AMDGPU::VGPR_32RegClass);
831 auto MIB = BuildMI(BB&: *BB, I&: MI, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::V_AND_B32_e32), DestReg: TmpReg)
832 .addImm(Val: 0xFFFF)
833 .addReg(RegNo: Src0);
834 constrainSelectedInstRegOperands(I&: *MIB, TII, TRI, RBI);
835
836 MIB = BuildMI(BB&: *BB, I&: MI, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::V_LSHL_OR_B32_e64), DestReg: Dst)
837 .addReg(RegNo: Src1)
838 .addImm(Val: 16)
839 .addReg(RegNo: TmpReg);
840 constrainSelectedInstRegOperands(I&: *MIB, TII, TRI, RBI);
841
842 MI.eraseFromParent();
843 return true;
844 }
845
846 Register ShiftSrc0;
847 Register ShiftSrc1;
848
849 // With multiple uses of the shift, this will duplicate the shift and
850 // increase register pressure.
851 //
852 // (build_vector (lshr_oneuse $src0, 16), (lshr_oneuse $src1, 16)
853 // => (S_PACK_HH_B32_B16 $src0, $src1)
854 // (build_vector (lshr_oneuse SReg_32:$src0, 16), $src1)
855 // => (S_PACK_HL_B32_B16 $src0, $src1)
856 // (build_vector $src0, (lshr_oneuse SReg_32:$src1, 16))
857 // => (S_PACK_LH_B32_B16 $src0, $src1)
858 // (build_vector $src0, $src1)
859 // => (S_PACK_LL_B32_B16 $src0, $src1)
860
861 bool Shift0 = mi_match(
862 R: Src0, MRI: *MRI, P: m_OneUse(SP: m_GLShr(L: m_Reg(R&: ShiftSrc0), R: m_SpecificICst(RequestedValue: 16))));
863
864 bool Shift1 = mi_match(
865 R: Src1, MRI: *MRI, P: m_OneUse(SP: m_GLShr(L: m_Reg(R&: ShiftSrc1), R: m_SpecificICst(RequestedValue: 16))));
866
867 unsigned Opc = AMDGPU::S_PACK_LL_B32_B16;
868 if (Shift0 && Shift1) {
869 Opc = AMDGPU::S_PACK_HH_B32_B16;
870 MI.getOperand(i: 1).setReg(ShiftSrc0);
871 MI.getOperand(i: 2).setReg(ShiftSrc1);
872 } else if (Shift1) {
873 Opc = AMDGPU::S_PACK_LH_B32_B16;
874 MI.getOperand(i: 2).setReg(ShiftSrc1);
875 } else if (Shift0) {
876 auto ConstSrc1 =
877 getAnyConstantVRegValWithLookThrough(VReg: Src1, MRI: *MRI, LookThroughInstrs: true, LookThroughAnyExt: true);
878 if (ConstSrc1 && ConstSrc1->Value == 0) {
879 // build_vector_trunc (lshr $src0, 16), 0 -> s_lshr_b32 $src0, 16
880 auto MIB = BuildMI(BB&: *BB, I: &MI, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::S_LSHR_B32), DestReg: Dst)
881 .addReg(RegNo: ShiftSrc0)
882 .addImm(Val: 16)
883 .setOperandDead(3); // Dead scc
884
885 MI.eraseFromParent();
886 constrainSelectedInstRegOperands(I&: *MIB, TII, TRI, RBI);
887 return true;
888 }
889 if (STI.hasSPackHL()) {
890 Opc = AMDGPU::S_PACK_HL_B32_B16;
891 MI.getOperand(i: 1).setReg(ShiftSrc0);
892 }
893 }
894
895 MI.setDesc(TII.get(Opcode: Opc));
896 constrainSelectedInstRegOperands(I&: MI, TII, TRI, RBI);
897 return true;
898}
899
900bool AMDGPUInstructionSelector::selectG_IMPLICIT_DEF(MachineInstr &I) const {
901 const MachineOperand &MO = I.getOperand(i: 0);
902
903 // FIXME: Interface for getConstrainedRegClassForOperand needs work. The
904 // regbank check here is to know why getConstrainedRegClassForOperand failed.
905 const TargetRegisterClass *RC = TRI.getConstrainedRegClassForOperand(MO, MRI: *MRI);
906 if ((!RC && !MRI->getRegBankOrNull(Reg: MO.getReg())) ||
907 (RC && RBI.constrainGenericRegister(Reg: MO.getReg(), RC: *RC, MRI&: *MRI))) {
908 I.setDesc(TII.get(Opcode: TargetOpcode::IMPLICIT_DEF));
909 return true;
910 }
911
912 return false;
913}
914
915bool AMDGPUInstructionSelector::selectG_INSERT(MachineInstr &I) const {
916 MachineBasicBlock *BB = I.getParent();
917
918 Register DstReg = I.getOperand(i: 0).getReg();
919 Register Src0Reg = I.getOperand(i: 1).getReg();
920 Register Src1Reg = I.getOperand(i: 2).getReg();
921 LLT Src1Ty = MRI->getType(Reg: Src1Reg);
922
923 unsigned DstSize = MRI->getType(Reg: DstReg).getSizeInBits();
924 unsigned InsSize = Src1Ty.getSizeInBits();
925
926 int64_t Offset = I.getOperand(i: 3).getImm();
927
928 // FIXME: These cases should have been illegal and unnecessary to check here.
929 if (Offset % 32 != 0 || InsSize % 32 != 0)
930 return false;
931
932 // Currently not handled by getSubRegFromChannel.
933 if (InsSize > 128)
934 return false;
935
936 unsigned SubReg = TRI.getSubRegFromChannel(Channel: Offset / 32, NumRegs: InsSize / 32);
937 if (SubReg == AMDGPU::NoSubRegister)
938 return false;
939
940 const RegisterBank *DstBank = RBI.getRegBank(Reg: DstReg, MRI: *MRI, TRI);
941 const TargetRegisterClass *DstRC =
942 TRI.getRegClassForSizeOnBank(Size: DstSize, Bank: *DstBank);
943 if (!DstRC)
944 return false;
945
946 const RegisterBank *Src0Bank = RBI.getRegBank(Reg: Src0Reg, MRI: *MRI, TRI);
947 const RegisterBank *Src1Bank = RBI.getRegBank(Reg: Src1Reg, MRI: *MRI, TRI);
948 const TargetRegisterClass *Src0RC =
949 TRI.getRegClassForSizeOnBank(Size: DstSize, Bank: *Src0Bank);
950 const TargetRegisterClass *Src1RC =
951 TRI.getRegClassForSizeOnBank(Size: InsSize, Bank: *Src1Bank);
952
953 // Deal with weird cases where the class only partially supports the subreg
954 // index.
955 Src0RC = TRI.getSubClassWithSubReg(Src0RC, SubReg);
956 if (!Src0RC || !Src1RC)
957 return false;
958
959 if (!RBI.constrainGenericRegister(Reg: DstReg, RC: *DstRC, MRI&: *MRI) ||
960 !RBI.constrainGenericRegister(Reg: Src0Reg, RC: *Src0RC, MRI&: *MRI) ||
961 !RBI.constrainGenericRegister(Reg: Src1Reg, RC: *Src1RC, MRI&: *MRI))
962 return false;
963
964 const DebugLoc &DL = I.getDebugLoc();
965 BuildMI(BB&: *BB, I: &I, MIMD: DL, MCID: TII.get(Opcode: TargetOpcode::INSERT_SUBREG), DestReg: DstReg)
966 .addReg(RegNo: Src0Reg)
967 .addReg(RegNo: Src1Reg)
968 .addImm(Val: SubReg);
969
970 I.eraseFromParent();
971 return true;
972}
973
974bool AMDGPUInstructionSelector::selectG_SBFX_UBFX(MachineInstr &MI) const {
975 Register DstReg = MI.getOperand(i: 0).getReg();
976 Register SrcReg = MI.getOperand(i: 1).getReg();
977 Register OffsetReg = MI.getOperand(i: 2).getReg();
978 Register WidthReg = MI.getOperand(i: 3).getReg();
979
980 assert(RBI.getRegBank(DstReg, *MRI, TRI)->getID() == AMDGPU::VGPRRegBankID &&
981 "scalar BFX instructions are expanded in regbankselect");
982 assert(MRI->getType(MI.getOperand(0).getReg()).getSizeInBits() == 32 &&
983 "64-bit vector BFX instructions are expanded in regbankselect");
984
985 const DebugLoc &DL = MI.getDebugLoc();
986 MachineBasicBlock *MBB = MI.getParent();
987
988 bool IsSigned = MI.getOpcode() == TargetOpcode::G_SBFX;
989 unsigned Opc = IsSigned ? AMDGPU::V_BFE_I32_e64 : AMDGPU::V_BFE_U32_e64;
990 auto MIB = BuildMI(BB&: *MBB, I: &MI, MIMD: DL, MCID: TII.get(Opcode: Opc), DestReg: DstReg)
991 .addReg(RegNo: SrcReg)
992 .addReg(RegNo: OffsetReg)
993 .addReg(RegNo: WidthReg);
994 MI.eraseFromParent();
995 constrainSelectedInstRegOperands(I&: *MIB, TII, TRI, RBI);
996 return true;
997}
998
999bool AMDGPUInstructionSelector::selectInterpP1F16(MachineInstr &MI) const {
1000 if (STI.getLDSBankCount() != 16)
1001 return selectImpl(I&: MI, CoverageInfo&: *CoverageInfo);
1002
1003 Register Dst = MI.getOperand(i: 0).getReg();
1004 Register Src0 = MI.getOperand(i: 2).getReg();
1005 Register M0Val = MI.getOperand(i: 6).getReg();
1006 if (!RBI.constrainGenericRegister(Reg: M0Val, RC: AMDGPU::SReg_32RegClass, MRI&: *MRI) ||
1007 !RBI.constrainGenericRegister(Reg: Dst, RC: AMDGPU::VGPR_32RegClass, MRI&: *MRI) ||
1008 !RBI.constrainGenericRegister(Reg: Src0, RC: AMDGPU::VGPR_32RegClass, MRI&: *MRI))
1009 return false;
1010
1011 // This requires 2 instructions. It is possible to write a pattern to support
1012 // this, but the generated isel emitter doesn't correctly deal with multiple
1013 // output instructions using the same physical register input. The copy to m0
1014 // is incorrectly placed before the second instruction.
1015 //
1016 // TODO: Match source modifiers.
1017
1018 Register InterpMov = MRI->createVirtualRegister(RegClass: &AMDGPU::VGPR_32RegClass);
1019 const DebugLoc &DL = MI.getDebugLoc();
1020 MachineBasicBlock *MBB = MI.getParent();
1021
1022 BuildMI(BB&: *MBB, I: &MI, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::COPY), DestReg: AMDGPU::M0)
1023 .addReg(RegNo: M0Val);
1024 BuildMI(BB&: *MBB, I: &MI, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::V_INTERP_MOV_F32), DestReg: InterpMov)
1025 .addImm(Val: 2)
1026 .addImm(Val: MI.getOperand(i: 4).getImm()) // $attr
1027 .addImm(Val: MI.getOperand(i: 3).getImm()); // $attrchan
1028
1029 BuildMI(BB&: *MBB, I: &MI, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::V_INTERP_P1LV_F16), DestReg: Dst)
1030 .addImm(Val: 0) // $src0_modifiers
1031 .addReg(RegNo: Src0) // $src0
1032 .addImm(Val: MI.getOperand(i: 4).getImm()) // $attr
1033 .addImm(Val: MI.getOperand(i: 3).getImm()) // $attrchan
1034 .addImm(Val: 0) // $src2_modifiers
1035 .addReg(RegNo: InterpMov) // $src2 - 2 f16 values selected by high
1036 .addImm(Val: MI.getOperand(i: 5).getImm()) // $high
1037 .addImm(Val: 0) // $clamp
1038 .addImm(Val: 0); // $omod
1039
1040 MI.eraseFromParent();
1041 return true;
1042}
1043
1044// Writelane is special in that it can use SGPR and M0 (which would normally
1045// count as using the constant bus twice - but in this case it is allowed since
1046// the lane selector doesn't count as a use of the constant bus). However, it is
1047// still required to abide by the 1 SGPR rule. Fix this up if we might have
1048// multiple SGPRs.
1049bool AMDGPUInstructionSelector::selectWritelane(MachineInstr &MI) const {
1050 // With a constant bus limit of at least 2, there's no issue.
1051 if (STI.getConstantBusLimit(Opcode: AMDGPU::V_WRITELANE_B32) > 1)
1052 return selectImpl(I&: MI, CoverageInfo&: *CoverageInfo);
1053
1054 MachineBasicBlock *MBB = MI.getParent();
1055 const DebugLoc &DL = MI.getDebugLoc();
1056 Register VDst = MI.getOperand(i: 0).getReg();
1057 Register Val = MI.getOperand(i: 2).getReg();
1058 Register LaneSelect = MI.getOperand(i: 3).getReg();
1059 Register VDstIn = MI.getOperand(i: 4).getReg();
1060
1061 auto MIB = BuildMI(BB&: *MBB, I: &MI, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::V_WRITELANE_B32), DestReg: VDst);
1062
1063 std::optional<ValueAndVReg> ConstSelect =
1064 getIConstantVRegValWithLookThrough(VReg: LaneSelect, MRI: *MRI);
1065 if (ConstSelect) {
1066 // The selector has to be an inline immediate, so we can use whatever for
1067 // the other operands.
1068 MIB.addReg(RegNo: Val);
1069 MIB.addImm(Val: ConstSelect->Value.getSExtValue() &
1070 maskTrailingOnes<uint64_t>(N: STI.getWavefrontSizeLog2()));
1071 } else {
1072 std::optional<ValueAndVReg> ConstVal =
1073 getIConstantVRegValWithLookThrough(VReg: Val, MRI: *MRI);
1074
1075 // If the value written is an inline immediate, we can get away without a
1076 // copy to m0.
1077 if (ConstVal && AMDGPU::isInlinableLiteral32(Literal: ConstVal->Value.getSExtValue(),
1078 HasInv2Pi: STI.hasInv2PiInlineImm())) {
1079 MIB.addImm(Val: ConstVal->Value.getSExtValue());
1080 MIB.addReg(RegNo: LaneSelect);
1081 } else {
1082 MIB.addReg(RegNo: Val);
1083
1084 // If the lane selector was originally in a VGPR and copied with
1085 // readfirstlane, there's a hazard to read the same SGPR from the
1086 // VALU. Constrain to a different SGPR to help avoid needing a nop later.
1087 RBI.constrainGenericRegister(Reg: LaneSelect, RC: AMDGPU::SReg_32_XM0RegClass, MRI&: *MRI);
1088
1089 BuildMI(BB&: *MBB, I&: *MIB, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::COPY), DestReg: AMDGPU::M0)
1090 .addReg(RegNo: LaneSelect);
1091 MIB.addReg(RegNo: AMDGPU::M0);
1092 }
1093 }
1094
1095 MIB.addReg(RegNo: VDstIn);
1096
1097 MI.eraseFromParent();
1098 constrainSelectedInstRegOperands(I&: *MIB, TII, TRI, RBI);
1099 return true;
1100}
1101
1102// We need to handle this here because tablegen doesn't support matching
1103// instructions with multiple outputs.
1104bool AMDGPUInstructionSelector::selectDivScale(MachineInstr &MI) const {
1105 Register Dst0 = MI.getOperand(i: 0).getReg();
1106 Register Dst1 = MI.getOperand(i: 1).getReg();
1107
1108 LLT Ty = MRI->getType(Reg: Dst0);
1109 unsigned Opc;
1110 if (Ty == LLT::scalar(SizeInBits: 32))
1111 Opc = AMDGPU::V_DIV_SCALE_F32_e64;
1112 else if (Ty == LLT::scalar(SizeInBits: 64))
1113 Opc = AMDGPU::V_DIV_SCALE_F64_e64;
1114 else
1115 return false;
1116
1117 // TODO: Match source modifiers.
1118
1119 const DebugLoc &DL = MI.getDebugLoc();
1120 MachineBasicBlock *MBB = MI.getParent();
1121
1122 Register Numer = MI.getOperand(i: 3).getReg();
1123 Register Denom = MI.getOperand(i: 4).getReg();
1124 unsigned ChooseDenom = MI.getOperand(i: 5).getImm();
1125
1126 Register Src0 = ChooseDenom != 0 ? Numer : Denom;
1127
1128 auto MIB = BuildMI(BB&: *MBB, I: &MI, MIMD: DL, MCID: TII.get(Opcode: Opc), DestReg: Dst0)
1129 .addDef(RegNo: Dst1)
1130 .addImm(Val: 0) // $src0_modifiers
1131 .addUse(RegNo: Src0) // $src0
1132 .addImm(Val: 0) // $src1_modifiers
1133 .addUse(RegNo: Denom) // $src1
1134 .addImm(Val: 0) // $src2_modifiers
1135 .addUse(RegNo: Numer) // $src2
1136 .addImm(Val: 0) // $clamp
1137 .addImm(Val: 0); // $omod
1138
1139 MI.eraseFromParent();
1140 constrainSelectedInstRegOperands(I&: *MIB, TII, TRI, RBI);
1141 return true;
1142}
1143
1144bool AMDGPUInstructionSelector::selectG_INTRINSIC(MachineInstr &I) const {
1145 Intrinsic::ID IntrinsicID = cast<GIntrinsic>(Val&: I).getIntrinsicID();
1146 switch (IntrinsicID) {
1147 case Intrinsic::amdgcn_if_break: {
1148 MachineBasicBlock *BB = I.getParent();
1149
1150 // FIXME: Manually selecting to avoid dealing with the SReg_1 trick
1151 // SelectionDAG uses for wave32 vs wave64.
1152 BuildMI(BB&: *BB, I: &I, MIMD: I.getDebugLoc(), MCID: TII.get(Opcode: AMDGPU::SI_IF_BREAK))
1153 .add(MO: I.getOperand(i: 0))
1154 .add(MO: I.getOperand(i: 2))
1155 .add(MO: I.getOperand(i: 3));
1156
1157 Register DstReg = I.getOperand(i: 0).getReg();
1158 Register Src0Reg = I.getOperand(i: 2).getReg();
1159 Register Src1Reg = I.getOperand(i: 3).getReg();
1160
1161 I.eraseFromParent();
1162
1163 for (Register Reg : { DstReg, Src0Reg, Src1Reg })
1164 MRI->setRegClass(Reg, RC: TRI.getWaveMaskRegClass());
1165
1166 return true;
1167 }
1168 case Intrinsic::amdgcn_interp_p1_f16:
1169 return selectInterpP1F16(MI&: I);
1170 case Intrinsic::amdgcn_wqm:
1171 return constrainCopyLikeIntrin(MI&: I, NewOpc: AMDGPU::WQM);
1172 case Intrinsic::amdgcn_softwqm:
1173 return constrainCopyLikeIntrin(MI&: I, NewOpc: AMDGPU::SOFT_WQM);
1174 case Intrinsic::amdgcn_strict_wwm:
1175 case Intrinsic::amdgcn_wwm:
1176 return constrainCopyLikeIntrin(MI&: I, NewOpc: AMDGPU::STRICT_WWM);
1177 case Intrinsic::amdgcn_strict_wqm:
1178 return constrainCopyLikeIntrin(MI&: I, NewOpc: AMDGPU::STRICT_WQM);
1179 case Intrinsic::amdgcn_writelane:
1180 return selectWritelane(MI&: I);
1181 case Intrinsic::amdgcn_div_scale:
1182 return selectDivScale(MI&: I);
1183 case Intrinsic::amdgcn_icmp:
1184 case Intrinsic::amdgcn_fcmp:
1185 if (selectImpl(I, CoverageInfo&: *CoverageInfo))
1186 return true;
1187 return selectIntrinsicCmp(MI&: I);
1188 case Intrinsic::amdgcn_ballot:
1189 return selectBallot(I);
1190 case Intrinsic::amdgcn_reloc_constant:
1191 return selectRelocConstant(I);
1192 case Intrinsic::amdgcn_groupstaticsize:
1193 return selectGroupStaticSize(I);
1194 case Intrinsic::returnaddress:
1195 return selectReturnAddress(I);
1196 case Intrinsic::amdgcn_smfmac_f32_16x16x32_f16:
1197 case Intrinsic::amdgcn_smfmac_f32_32x32x16_f16:
1198 case Intrinsic::amdgcn_smfmac_f32_16x16x32_bf16:
1199 case Intrinsic::amdgcn_smfmac_f32_32x32x16_bf16:
1200 case Intrinsic::amdgcn_smfmac_i32_16x16x64_i8:
1201 case Intrinsic::amdgcn_smfmac_i32_32x32x32_i8:
1202 case Intrinsic::amdgcn_smfmac_f32_16x16x64_bf8_bf8:
1203 case Intrinsic::amdgcn_smfmac_f32_16x16x64_bf8_fp8:
1204 case Intrinsic::amdgcn_smfmac_f32_16x16x64_fp8_bf8:
1205 case Intrinsic::amdgcn_smfmac_f32_16x16x64_fp8_fp8:
1206 case Intrinsic::amdgcn_smfmac_f32_32x32x32_bf8_bf8:
1207 case Intrinsic::amdgcn_smfmac_f32_32x32x32_bf8_fp8:
1208 case Intrinsic::amdgcn_smfmac_f32_32x32x32_fp8_bf8:
1209 case Intrinsic::amdgcn_smfmac_f32_32x32x32_fp8_fp8:
1210 case Intrinsic::amdgcn_smfmac_f32_16x16x64_f16:
1211 case Intrinsic::amdgcn_smfmac_f32_32x32x32_f16:
1212 case Intrinsic::amdgcn_smfmac_f32_16x16x64_bf16:
1213 case Intrinsic::amdgcn_smfmac_f32_32x32x32_bf16:
1214 case Intrinsic::amdgcn_smfmac_i32_16x16x128_i8:
1215 case Intrinsic::amdgcn_smfmac_i32_32x32x64_i8:
1216 case Intrinsic::amdgcn_smfmac_f32_16x16x128_bf8_bf8:
1217 case Intrinsic::amdgcn_smfmac_f32_16x16x128_bf8_fp8:
1218 case Intrinsic::amdgcn_smfmac_f32_16x16x128_fp8_bf8:
1219 case Intrinsic::amdgcn_smfmac_f32_16x16x128_fp8_fp8:
1220 case Intrinsic::amdgcn_smfmac_f32_32x32x64_bf8_bf8:
1221 case Intrinsic::amdgcn_smfmac_f32_32x32x64_bf8_fp8:
1222 case Intrinsic::amdgcn_smfmac_f32_32x32x64_fp8_bf8:
1223 case Intrinsic::amdgcn_smfmac_f32_32x32x64_fp8_fp8:
1224 return selectSMFMACIntrin(I);
1225 case Intrinsic::amdgcn_permlane16_swap:
1226 case Intrinsic::amdgcn_permlane32_swap:
1227 return selectPermlaneSwapIntrin(I, IntrID: IntrinsicID);
1228 case Intrinsic::amdgcn_wave_shuffle:
1229 return selectWaveShuffleIntrin(I);
1230 default:
1231 return selectImpl(I, CoverageInfo&: *CoverageInfo);
1232 }
1233}
1234
1235static int getV_CMPOpcode(CmpInst::Predicate P, unsigned Size,
1236 const GCNSubtarget &ST) {
1237 if (Size != 16 && Size != 32 && Size != 64)
1238 return -1;
1239
1240 if (Size == 16 && !ST.has16BitInsts())
1241 return -1;
1242
1243 const auto Select = [&](unsigned S16Opc, unsigned TrueS16Opc,
1244 unsigned FakeS16Opc, unsigned S32Opc,
1245 unsigned S64Opc) {
1246 if (Size == 16)
1247 return ST.hasTrue16BitInsts()
1248 ? ST.useRealTrue16Insts() ? TrueS16Opc : FakeS16Opc
1249 : S16Opc;
1250 if (Size == 32)
1251 return S32Opc;
1252 return S64Opc;
1253 };
1254
1255 switch (P) {
1256 default:
1257 llvm_unreachable("Unknown condition code!");
1258 case CmpInst::ICMP_NE:
1259 return Select(AMDGPU::V_CMP_NE_U16_e64, AMDGPU::V_CMP_NE_U16_t16_e64,
1260 AMDGPU::V_CMP_NE_U16_fake16_e64, AMDGPU::V_CMP_NE_U32_e64,
1261 AMDGPU::V_CMP_NE_U64_e64);
1262 case CmpInst::ICMP_EQ:
1263 return Select(AMDGPU::V_CMP_EQ_U16_e64, AMDGPU::V_CMP_EQ_U16_t16_e64,
1264 AMDGPU::V_CMP_EQ_U16_fake16_e64, AMDGPU::V_CMP_EQ_U32_e64,
1265 AMDGPU::V_CMP_EQ_U64_e64);
1266 case CmpInst::ICMP_SGT:
1267 return Select(AMDGPU::V_CMP_GT_I16_e64, AMDGPU::V_CMP_GT_I16_t16_e64,
1268 AMDGPU::V_CMP_GT_I16_fake16_e64, AMDGPU::V_CMP_GT_I32_e64,
1269 AMDGPU::V_CMP_GT_I64_e64);
1270 case CmpInst::ICMP_SGE:
1271 return Select(AMDGPU::V_CMP_GE_I16_e64, AMDGPU::V_CMP_GE_I16_t16_e64,
1272 AMDGPU::V_CMP_GE_I16_fake16_e64, AMDGPU::V_CMP_GE_I32_e64,
1273 AMDGPU::V_CMP_GE_I64_e64);
1274 case CmpInst::ICMP_SLT:
1275 return Select(AMDGPU::V_CMP_LT_I16_e64, AMDGPU::V_CMP_LT_I16_t16_e64,
1276 AMDGPU::V_CMP_LT_I16_fake16_e64, AMDGPU::V_CMP_LT_I32_e64,
1277 AMDGPU::V_CMP_LT_I64_e64);
1278 case CmpInst::ICMP_SLE:
1279 return Select(AMDGPU::V_CMP_LE_I16_e64, AMDGPU::V_CMP_LE_I16_t16_e64,
1280 AMDGPU::V_CMP_LE_I16_fake16_e64, AMDGPU::V_CMP_LE_I32_e64,
1281 AMDGPU::V_CMP_LE_I64_e64);
1282 case CmpInst::ICMP_UGT:
1283 return Select(AMDGPU::V_CMP_GT_U16_e64, AMDGPU::V_CMP_GT_U16_t16_e64,
1284 AMDGPU::V_CMP_GT_U16_fake16_e64, AMDGPU::V_CMP_GT_U32_e64,
1285 AMDGPU::V_CMP_GT_U64_e64);
1286 case CmpInst::ICMP_UGE:
1287 return Select(AMDGPU::V_CMP_GE_U16_e64, AMDGPU::V_CMP_GE_U16_t16_e64,
1288 AMDGPU::V_CMP_GE_U16_fake16_e64, AMDGPU::V_CMP_GE_U32_e64,
1289 AMDGPU::V_CMP_GE_U64_e64);
1290 case CmpInst::ICMP_ULT:
1291 return Select(AMDGPU::V_CMP_LT_U16_e64, AMDGPU::V_CMP_LT_U16_t16_e64,
1292 AMDGPU::V_CMP_LT_U16_fake16_e64, AMDGPU::V_CMP_LT_U32_e64,
1293 AMDGPU::V_CMP_LT_U64_e64);
1294 case CmpInst::ICMP_ULE:
1295 return Select(AMDGPU::V_CMP_LE_U16_e64, AMDGPU::V_CMP_LE_U16_t16_e64,
1296 AMDGPU::V_CMP_LE_U16_fake16_e64, AMDGPU::V_CMP_LE_U32_e64,
1297 AMDGPU::V_CMP_LE_U64_e64);
1298
1299 case CmpInst::FCMP_OEQ:
1300 return Select(AMDGPU::V_CMP_EQ_F16_e64, AMDGPU::V_CMP_EQ_F16_t16_e64,
1301 AMDGPU::V_CMP_EQ_F16_fake16_e64, AMDGPU::V_CMP_EQ_F32_e64,
1302 AMDGPU::V_CMP_EQ_F64_e64);
1303 case CmpInst::FCMP_OGT:
1304 return Select(AMDGPU::V_CMP_GT_F16_e64, AMDGPU::V_CMP_GT_F16_t16_e64,
1305 AMDGPU::V_CMP_GT_F16_fake16_e64, AMDGPU::V_CMP_GT_F32_e64,
1306 AMDGPU::V_CMP_GT_F64_e64);
1307 case CmpInst::FCMP_OGE:
1308 return Select(AMDGPU::V_CMP_GE_F16_e64, AMDGPU::V_CMP_GE_F16_t16_e64,
1309 AMDGPU::V_CMP_GE_F16_fake16_e64, AMDGPU::V_CMP_GE_F32_e64,
1310 AMDGPU::V_CMP_GE_F64_e64);
1311 case CmpInst::FCMP_OLT:
1312 return Select(AMDGPU::V_CMP_LT_F16_e64, AMDGPU::V_CMP_LT_F16_t16_e64,
1313 AMDGPU::V_CMP_LT_F16_fake16_e64, AMDGPU::V_CMP_LT_F32_e64,
1314 AMDGPU::V_CMP_LT_F64_e64);
1315 case CmpInst::FCMP_OLE:
1316 return Select(AMDGPU::V_CMP_LE_F16_e64, AMDGPU::V_CMP_LE_F16_t16_e64,
1317 AMDGPU::V_CMP_LE_F16_fake16_e64, AMDGPU::V_CMP_LE_F32_e64,
1318 AMDGPU::V_CMP_LE_F64_e64);
1319 case CmpInst::FCMP_ONE:
1320 return Select(AMDGPU::V_CMP_NEQ_F16_e64, AMDGPU::V_CMP_NEQ_F16_t16_e64,
1321 AMDGPU::V_CMP_NEQ_F16_fake16_e64, AMDGPU::V_CMP_NEQ_F32_e64,
1322 AMDGPU::V_CMP_NEQ_F64_e64);
1323 case CmpInst::FCMP_ORD:
1324 return Select(AMDGPU::V_CMP_O_F16_e64, AMDGPU::V_CMP_O_F16_t16_e64,
1325 AMDGPU::V_CMP_O_F16_fake16_e64, AMDGPU::V_CMP_O_F32_e64,
1326 AMDGPU::V_CMP_O_F64_e64);
1327 case CmpInst::FCMP_UNO:
1328 return Select(AMDGPU::V_CMP_U_F16_e64, AMDGPU::V_CMP_U_F16_t16_e64,
1329 AMDGPU::V_CMP_U_F16_fake16_e64, AMDGPU::V_CMP_U_F32_e64,
1330 AMDGPU::V_CMP_U_F64_e64);
1331 case CmpInst::FCMP_UEQ:
1332 return Select(AMDGPU::V_CMP_NLG_F16_e64, AMDGPU::V_CMP_NLG_F16_t16_e64,
1333 AMDGPU::V_CMP_NLG_F16_fake16_e64, AMDGPU::V_CMP_NLG_F32_e64,
1334 AMDGPU::V_CMP_NLG_F64_e64);
1335 case CmpInst::FCMP_UGT:
1336 return Select(AMDGPU::V_CMP_NLE_F16_e64, AMDGPU::V_CMP_NLE_F16_t16_e64,
1337 AMDGPU::V_CMP_NLE_F16_fake16_e64, AMDGPU::V_CMP_NLE_F32_e64,
1338 AMDGPU::V_CMP_NLE_F64_e64);
1339 case CmpInst::FCMP_UGE:
1340 return Select(AMDGPU::V_CMP_NLT_F16_e64, AMDGPU::V_CMP_NLT_F16_t16_e64,
1341 AMDGPU::V_CMP_NLT_F16_fake16_e64, AMDGPU::V_CMP_NLT_F32_e64,
1342 AMDGPU::V_CMP_NLT_F64_e64);
1343 case CmpInst::FCMP_ULT:
1344 return Select(AMDGPU::V_CMP_NGE_F16_e64, AMDGPU::V_CMP_NGE_F16_t16_e64,
1345 AMDGPU::V_CMP_NGE_F16_fake16_e64, AMDGPU::V_CMP_NGE_F32_e64,
1346 AMDGPU::V_CMP_NGE_F64_e64);
1347 case CmpInst::FCMP_ULE:
1348 return Select(AMDGPU::V_CMP_NGT_F16_e64, AMDGPU::V_CMP_NGT_F16_t16_e64,
1349 AMDGPU::V_CMP_NGT_F16_fake16_e64, AMDGPU::V_CMP_NGT_F32_e64,
1350 AMDGPU::V_CMP_NGT_F64_e64);
1351 case CmpInst::FCMP_UNE:
1352 return Select(AMDGPU::V_CMP_NEQ_F16_e64, AMDGPU::V_CMP_NEQ_F16_t16_e64,
1353 AMDGPU::V_CMP_NEQ_F16_fake16_e64, AMDGPU::V_CMP_NEQ_F32_e64,
1354 AMDGPU::V_CMP_NEQ_F64_e64);
1355 case CmpInst::FCMP_TRUE:
1356 return Select(AMDGPU::V_CMP_TRU_F16_e64, AMDGPU::V_CMP_TRU_F16_t16_e64,
1357 AMDGPU::V_CMP_TRU_F16_fake16_e64, AMDGPU::V_CMP_TRU_F32_e64,
1358 AMDGPU::V_CMP_TRU_F64_e64);
1359 case CmpInst::FCMP_FALSE:
1360 return Select(AMDGPU::V_CMP_F_F16_e64, AMDGPU::V_CMP_F_F16_t16_e64,
1361 AMDGPU::V_CMP_F_F16_fake16_e64, AMDGPU::V_CMP_F_F32_e64,
1362 AMDGPU::V_CMP_F_F64_e64);
1363 }
1364}
1365
1366int AMDGPUInstructionSelector::getS_CMPOpcode(CmpInst::Predicate P,
1367 unsigned Size) const {
1368 if (Size == 64) {
1369 if (!STI.hasScalarCompareEq64())
1370 return -1;
1371
1372 switch (P) {
1373 case CmpInst::ICMP_NE:
1374 return AMDGPU::S_CMP_LG_U64;
1375 case CmpInst::ICMP_EQ:
1376 return AMDGPU::S_CMP_EQ_U64;
1377 default:
1378 return -1;
1379 }
1380 }
1381
1382 if (Size == 32) {
1383 switch (P) {
1384 case CmpInst::ICMP_NE:
1385 return AMDGPU::S_CMP_LG_U32;
1386 case CmpInst::ICMP_EQ:
1387 return AMDGPU::S_CMP_EQ_U32;
1388 case CmpInst::ICMP_SGT:
1389 return AMDGPU::S_CMP_GT_I32;
1390 case CmpInst::ICMP_SGE:
1391 return AMDGPU::S_CMP_GE_I32;
1392 case CmpInst::ICMP_SLT:
1393 return AMDGPU::S_CMP_LT_I32;
1394 case CmpInst::ICMP_SLE:
1395 return AMDGPU::S_CMP_LE_I32;
1396 case CmpInst::ICMP_UGT:
1397 return AMDGPU::S_CMP_GT_U32;
1398 case CmpInst::ICMP_UGE:
1399 return AMDGPU::S_CMP_GE_U32;
1400 case CmpInst::ICMP_ULT:
1401 return AMDGPU::S_CMP_LT_U32;
1402 case CmpInst::ICMP_ULE:
1403 return AMDGPU::S_CMP_LE_U32;
1404 case CmpInst::FCMP_OEQ:
1405 return AMDGPU::S_CMP_EQ_F32;
1406 case CmpInst::FCMP_OGT:
1407 return AMDGPU::S_CMP_GT_F32;
1408 case CmpInst::FCMP_OGE:
1409 return AMDGPU::S_CMP_GE_F32;
1410 case CmpInst::FCMP_OLT:
1411 return AMDGPU::S_CMP_LT_F32;
1412 case CmpInst::FCMP_OLE:
1413 return AMDGPU::S_CMP_LE_F32;
1414 case CmpInst::FCMP_ONE:
1415 return AMDGPU::S_CMP_LG_F32;
1416 case CmpInst::FCMP_ORD:
1417 return AMDGPU::S_CMP_O_F32;
1418 case CmpInst::FCMP_UNO:
1419 return AMDGPU::S_CMP_U_F32;
1420 case CmpInst::FCMP_UEQ:
1421 return AMDGPU::S_CMP_NLG_F32;
1422 case CmpInst::FCMP_UGT:
1423 return AMDGPU::S_CMP_NLE_F32;
1424 case CmpInst::FCMP_UGE:
1425 return AMDGPU::S_CMP_NLT_F32;
1426 case CmpInst::FCMP_ULT:
1427 return AMDGPU::S_CMP_NGE_F32;
1428 case CmpInst::FCMP_ULE:
1429 return AMDGPU::S_CMP_NGT_F32;
1430 case CmpInst::FCMP_UNE:
1431 return AMDGPU::S_CMP_NEQ_F32;
1432 default:
1433 llvm_unreachable("Unknown condition code!");
1434 }
1435 }
1436
1437 if (Size == 16) {
1438 if (!STI.hasSALUFloatInsts())
1439 return -1;
1440
1441 switch (P) {
1442 case CmpInst::FCMP_OEQ:
1443 return AMDGPU::S_CMP_EQ_F16;
1444 case CmpInst::FCMP_OGT:
1445 return AMDGPU::S_CMP_GT_F16;
1446 case CmpInst::FCMP_OGE:
1447 return AMDGPU::S_CMP_GE_F16;
1448 case CmpInst::FCMP_OLT:
1449 return AMDGPU::S_CMP_LT_F16;
1450 case CmpInst::FCMP_OLE:
1451 return AMDGPU::S_CMP_LE_F16;
1452 case CmpInst::FCMP_ONE:
1453 return AMDGPU::S_CMP_LG_F16;
1454 case CmpInst::FCMP_ORD:
1455 return AMDGPU::S_CMP_O_F16;
1456 case CmpInst::FCMP_UNO:
1457 return AMDGPU::S_CMP_U_F16;
1458 case CmpInst::FCMP_UEQ:
1459 return AMDGPU::S_CMP_NLG_F16;
1460 case CmpInst::FCMP_UGT:
1461 return AMDGPU::S_CMP_NLE_F16;
1462 case CmpInst::FCMP_UGE:
1463 return AMDGPU::S_CMP_NLT_F16;
1464 case CmpInst::FCMP_ULT:
1465 return AMDGPU::S_CMP_NGE_F16;
1466 case CmpInst::FCMP_ULE:
1467 return AMDGPU::S_CMP_NGT_F16;
1468 case CmpInst::FCMP_UNE:
1469 return AMDGPU::S_CMP_NEQ_F16;
1470 default:
1471 llvm_unreachable("Unknown condition code!");
1472 }
1473 }
1474
1475 return -1;
1476}
1477
1478bool AMDGPUInstructionSelector::selectG_ICMP_or_FCMP(MachineInstr &I) const {
1479
1480 MachineBasicBlock *BB = I.getParent();
1481 const DebugLoc &DL = I.getDebugLoc();
1482
1483 Register SrcReg = I.getOperand(i: 2).getReg();
1484 unsigned Size = RBI.getSizeInBits(Reg: SrcReg, MRI: *MRI, TRI);
1485
1486 auto Pred = (CmpInst::Predicate)I.getOperand(i: 1).getPredicate();
1487
1488 Register CCReg = I.getOperand(i: 0).getReg();
1489 if (!isVCC(Reg: CCReg, MRI: *MRI)) {
1490 int Opcode = getS_CMPOpcode(P: Pred, Size);
1491 if (Opcode == -1)
1492 return false;
1493 MachineInstr *ICmp = BuildMI(BB&: *BB, I: &I, MIMD: DL, MCID: TII.get(Opcode))
1494 .add(MO: I.getOperand(i: 2))
1495 .add(MO: I.getOperand(i: 3));
1496 BuildMI(BB&: *BB, I: &I, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::COPY), DestReg: CCReg)
1497 .addReg(RegNo: AMDGPU::SCC);
1498 constrainSelectedInstRegOperands(I&: *ICmp, TII, TRI, RBI);
1499 bool Ret =
1500 RBI.constrainGenericRegister(Reg: CCReg, RC: AMDGPU::SReg_32RegClass, MRI&: *MRI);
1501 I.eraseFromParent();
1502 return Ret;
1503 }
1504
1505 if (I.getOpcode() == AMDGPU::G_FCMP)
1506 return false;
1507
1508 int Opcode = getV_CMPOpcode(P: Pred, Size, ST: *Subtarget);
1509 if (Opcode == -1)
1510 return false;
1511
1512 MachineInstrBuilder ICmp;
1513 // t16 instructions
1514 if (AMDGPU::hasNamedOperand(Opcode, NamedIdx: AMDGPU::OpName::src0_modifiers)) {
1515 ICmp = BuildMI(BB&: *BB, I: &I, MIMD: DL, MCID: TII.get(Opcode), DestReg: I.getOperand(i: 0).getReg())
1516 .addImm(Val: 0)
1517 .add(MO: I.getOperand(i: 2))
1518 .addImm(Val: 0)
1519 .add(MO: I.getOperand(i: 3))
1520 .addImm(Val: 0); // op_sel
1521 } else {
1522 ICmp = BuildMI(BB&: *BB, I: &I, MIMD: DL, MCID: TII.get(Opcode), DestReg: I.getOperand(i: 0).getReg())
1523 .add(MO: I.getOperand(i: 2))
1524 .add(MO: I.getOperand(i: 3));
1525 }
1526
1527 RBI.constrainGenericRegister(Reg: ICmp->getOperand(i: 0).getReg(),
1528 RC: *TRI.getBoolRC(), MRI&: *MRI);
1529 constrainSelectedInstRegOperands(I&: *ICmp, TII, TRI, RBI);
1530 I.eraseFromParent();
1531 return true;
1532}
1533
1534bool AMDGPUInstructionSelector::selectIntrinsicCmp(MachineInstr &I) const {
1535 Register Dst = I.getOperand(i: 0).getReg();
1536 if (isVCC(Reg: Dst, MRI: *MRI))
1537 return false;
1538
1539 LLT DstTy = MRI->getType(Reg: Dst);
1540 if (DstTy.getSizeInBits() != STI.getWavefrontSize())
1541 return false;
1542
1543 MachineBasicBlock *BB = I.getParent();
1544 const DebugLoc &DL = I.getDebugLoc();
1545 Register SrcReg = I.getOperand(i: 2).getReg();
1546 unsigned Size = RBI.getSizeInBits(Reg: SrcReg, MRI: *MRI, TRI);
1547
1548 // i1 inputs are not supported in GlobalISel.
1549 if (Size == 1)
1550 return false;
1551
1552 auto Pred = static_cast<CmpInst::Predicate>(I.getOperand(i: 4).getImm());
1553 if (!CmpInst::isIntPredicate(P: Pred) && !CmpInst::isFPPredicate(P: Pred)) {
1554 BuildMI(BB&: *BB, I: &I, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::IMPLICIT_DEF), DestReg: Dst);
1555 I.eraseFromParent();
1556 return RBI.constrainGenericRegister(Reg: Dst, RC: *TRI.getBoolRC(), MRI&: *MRI);
1557 }
1558
1559 const int Opcode = getV_CMPOpcode(P: Pred, Size, ST: *Subtarget);
1560 if (Opcode == -1)
1561 return false;
1562
1563 MachineInstrBuilder SelectedMI;
1564 MachineOperand &LHS = I.getOperand(i: 2);
1565 MachineOperand &RHS = I.getOperand(i: 3);
1566 auto [Src0, Src0Mods] = selectVOP3ModsImpl(Src: LHS.getReg());
1567 auto [Src1, Src1Mods] = selectVOP3ModsImpl(Src: RHS.getReg());
1568 Register Src0Reg =
1569 copyToVGPRIfSrcFolded(Src: Src0, Mods: Src0Mods, Root: LHS, InsertPt: &I, /*ForceVGPR*/ true);
1570 Register Src1Reg =
1571 copyToVGPRIfSrcFolded(Src: Src1, Mods: Src1Mods, Root: RHS, InsertPt: &I, /*ForceVGPR*/ true);
1572 SelectedMI = BuildMI(BB&: *BB, I: &I, MIMD: DL, MCID: TII.get(Opcode), DestReg: Dst);
1573 if (AMDGPU::hasNamedOperand(Opcode, NamedIdx: AMDGPU::OpName::src0_modifiers))
1574 SelectedMI.addImm(Val: Src0Mods);
1575 SelectedMI.addReg(RegNo: Src0Reg);
1576 if (AMDGPU::hasNamedOperand(Opcode, NamedIdx: AMDGPU::OpName::src1_modifiers))
1577 SelectedMI.addImm(Val: Src1Mods);
1578 SelectedMI.addReg(RegNo: Src1Reg);
1579 if (AMDGPU::hasNamedOperand(Opcode, NamedIdx: AMDGPU::OpName::clamp))
1580 SelectedMI.addImm(Val: 0); // clamp
1581 if (AMDGPU::hasNamedOperand(Opcode, NamedIdx: AMDGPU::OpName::op_sel))
1582 SelectedMI.addImm(Val: 0); // op_sel
1583
1584 RBI.constrainGenericRegister(Reg: Dst, RC: *TRI.getBoolRC(), MRI&: *MRI);
1585 constrainSelectedInstRegOperands(I&: *SelectedMI, TII, TRI, RBI);
1586
1587 I.eraseFromParent();
1588 return true;
1589}
1590
1591// Ballot has to zero bits in input lane-mask that are zero in current exec,
1592// Done as AND with exec. For inputs that are results of instruction that
1593// implicitly use same exec, for example compares in same basic block or SCC to
1594// VCC copy, use copy.
1595static bool isLaneMaskFromSameBlock(Register Reg, MachineRegisterInfo &MRI,
1596 MachineBasicBlock *MBB) {
1597 MachineInstr *MI = MRI.getVRegDef(Reg);
1598 if (MI->getParent() != MBB)
1599 return false;
1600
1601 // Lane mask generated by SCC to VCC copy.
1602 if (MI->getOpcode() == AMDGPU::COPY) {
1603 auto DstRB = MRI.getRegBankOrNull(Reg: MI->getOperand(i: 0).getReg());
1604 auto SrcRB = MRI.getRegBankOrNull(Reg: MI->getOperand(i: 1).getReg());
1605 if (DstRB && SrcRB && DstRB->getID() == AMDGPU::VCCRegBankID &&
1606 SrcRB->getID() == AMDGPU::SGPRRegBankID)
1607 return true;
1608 }
1609
1610 // Lane mask generated using compare with same exec.
1611 if (isa<GAnyCmp>(Val: MI))
1612 return true;
1613
1614 Register LHS, RHS;
1615 // Look through AND.
1616 if (mi_match(R: Reg, MRI, P: m_GAnd(L: m_Reg(R&: LHS), R: m_Reg(R&: RHS))))
1617 return isLaneMaskFromSameBlock(Reg: LHS, MRI, MBB) ||
1618 isLaneMaskFromSameBlock(Reg: RHS, MRI, MBB);
1619
1620 return false;
1621}
1622
1623bool AMDGPUInstructionSelector::selectBallot(MachineInstr &I) const {
1624 MachineBasicBlock *BB = I.getParent();
1625 const DebugLoc &DL = I.getDebugLoc();
1626 Register DstReg = I.getOperand(i: 0).getReg();
1627 Register SrcReg = I.getOperand(i: 2).getReg();
1628 const unsigned BallotSize = MRI->getType(Reg: DstReg).getSizeInBits();
1629 const unsigned WaveSize = STI.getWavefrontSize();
1630
1631 // In the common case, the return type matches the wave size.
1632 // However we also support emitting i64 ballots in wave32 mode.
1633 if (BallotSize != WaveSize && (BallotSize != 64 || WaveSize != 32))
1634 return false;
1635
1636 std::optional<ValueAndVReg> Arg =
1637 getIConstantVRegValWithLookThrough(VReg: SrcReg, MRI: *MRI);
1638
1639 Register Dst = DstReg;
1640 // i64 ballot on Wave32: new Dst(i32) for WaveSize ballot.
1641 if (BallotSize != WaveSize) {
1642 Dst = MRI->createVirtualRegister(RegClass: TRI.getBoolRC());
1643 }
1644
1645 if (Arg) {
1646 const int64_t Value = Arg->Value.getZExtValue();
1647 if (Value == 0) {
1648 // Dst = S_MOV 0
1649 unsigned Opcode = WaveSize == 64 ? AMDGPU::S_MOV_B64 : AMDGPU::S_MOV_B32;
1650 BuildMI(BB&: *BB, I: &I, MIMD: DL, MCID: TII.get(Opcode), DestReg: Dst).addImm(Val: 0);
1651 } else {
1652 // Dst = COPY EXEC
1653 assert(Value == 1);
1654 BuildMI(BB&: *BB, I: &I, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::COPY), DestReg: Dst).addReg(RegNo: TRI.getExec());
1655 }
1656 if (!RBI.constrainGenericRegister(Reg: Dst, RC: *TRI.getBoolRC(), MRI&: *MRI))
1657 return false;
1658 } else {
1659 if (isLaneMaskFromSameBlock(Reg: SrcReg, MRI&: *MRI, MBB: BB)) {
1660 // Dst = COPY SrcReg
1661 BuildMI(BB&: *BB, I: &I, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::COPY), DestReg: Dst).addReg(RegNo: SrcReg);
1662 if (!RBI.constrainGenericRegister(Reg: Dst, RC: *TRI.getBoolRC(), MRI&: *MRI))
1663 return false;
1664 } else {
1665 // Dst = S_AND SrcReg, EXEC
1666 unsigned AndOpc = WaveSize == 64 ? AMDGPU::S_AND_B64 : AMDGPU::S_AND_B32;
1667 auto And = BuildMI(BB&: *BB, I: &I, MIMD: DL, MCID: TII.get(Opcode: AndOpc), DestReg: Dst)
1668 .addReg(RegNo: SrcReg)
1669 .addReg(RegNo: TRI.getExec())
1670 .setOperandDead(3); // Dead scc
1671 constrainSelectedInstRegOperands(I&: *And, TII, TRI, RBI);
1672 }
1673 }
1674
1675 // i64 ballot on Wave32: zero-extend i32 ballot to i64.
1676 if (BallotSize != WaveSize) {
1677 Register HiReg = MRI->createVirtualRegister(RegClass: &AMDGPU::SReg_32RegClass);
1678 BuildMI(BB&: *BB, I: &I, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::S_MOV_B32), DestReg: HiReg).addImm(Val: 0);
1679 BuildMI(BB&: *BB, I: &I, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::REG_SEQUENCE), DestReg: DstReg)
1680 .addReg(RegNo: Dst)
1681 .addImm(Val: AMDGPU::sub0)
1682 .addReg(RegNo: HiReg)
1683 .addImm(Val: AMDGPU::sub1);
1684 }
1685
1686 I.eraseFromParent();
1687 return true;
1688}
1689
1690bool AMDGPUInstructionSelector::selectRelocConstant(MachineInstr &I) const {
1691 Register DstReg = I.getOperand(i: 0).getReg();
1692 const RegisterBank *DstBank = RBI.getRegBank(Reg: DstReg, MRI: *MRI, TRI);
1693 const TargetRegisterClass *DstRC = TRI.getRegClassForSizeOnBank(Size: 32, Bank: *DstBank);
1694 if (!DstRC || !RBI.constrainGenericRegister(Reg: DstReg, RC: *DstRC, MRI&: *MRI))
1695 return false;
1696
1697 const bool IsVALU = DstBank->getID() == AMDGPU::VGPRRegBankID;
1698
1699 Module *M = MF->getFunction().getParent();
1700 const MDNode *Metadata = I.getOperand(i: 2).getMetadata();
1701 auto SymbolName = cast<MDString>(Val: Metadata->getOperand(I: 0))->getString();
1702 auto *RelocSymbol = cast<GlobalVariable>(
1703 Val: M->getOrInsertGlobal(Name: SymbolName, Ty: Type::getInt32Ty(C&: M->getContext())));
1704
1705 MachineBasicBlock *BB = I.getParent();
1706 BuildMI(BB&: *BB, I: &I, MIMD: I.getDebugLoc(),
1707 MCID: TII.get(Opcode: IsVALU ? AMDGPU::V_MOV_B32_e32 : AMDGPU::S_MOV_B32), DestReg: DstReg)
1708 .addGlobalAddress(GV: RelocSymbol, Offset: 0, TargetFlags: SIInstrInfo::MO_ABS32_LO);
1709
1710 I.eraseFromParent();
1711 return true;
1712}
1713
1714bool AMDGPUInstructionSelector::selectGroupStaticSize(MachineInstr &I) const {
1715 Triple::OSType OS = MF->getTarget().getTargetTriple().getOS();
1716
1717 Register DstReg = I.getOperand(i: 0).getReg();
1718 const RegisterBank *DstRB = RBI.getRegBank(Reg: DstReg, MRI: *MRI, TRI);
1719 unsigned Mov = DstRB->getID() == AMDGPU::SGPRRegBankID ?
1720 AMDGPU::S_MOV_B32 : AMDGPU::V_MOV_B32_e32;
1721
1722 MachineBasicBlock *MBB = I.getParent();
1723 const DebugLoc &DL = I.getDebugLoc();
1724
1725 auto MIB = BuildMI(BB&: *MBB, I: &I, MIMD: DL, MCID: TII.get(Opcode: Mov), DestReg: DstReg);
1726
1727 if (OS == Triple::AMDHSA || OS == Triple::AMDPAL) {
1728 const SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
1729 MIB.addImm(Val: MFI->getLDSSize());
1730 } else {
1731 Module *M = MF->getFunction().getParent();
1732 const GlobalValue *GV =
1733 Intrinsic::getOrInsertDeclaration(M, id: Intrinsic::amdgcn_groupstaticsize);
1734 MIB.addGlobalAddress(GV, Offset: 0, TargetFlags: SIInstrInfo::MO_ABS32_LO);
1735 }
1736
1737 I.eraseFromParent();
1738 constrainSelectedInstRegOperands(I&: *MIB, TII, TRI, RBI);
1739 return true;
1740}
1741
1742bool AMDGPUInstructionSelector::selectReturnAddress(MachineInstr &I) const {
1743 MachineBasicBlock *MBB = I.getParent();
1744 MachineFunction &MF = *MBB->getParent();
1745 const DebugLoc &DL = I.getDebugLoc();
1746
1747 MachineOperand &Dst = I.getOperand(i: 0);
1748 Register DstReg = Dst.getReg();
1749 unsigned Depth = I.getOperand(i: 2).getImm();
1750
1751 const TargetRegisterClass *RC
1752 = TRI.getConstrainedRegClassForOperand(MO: Dst, MRI: *MRI);
1753 if (!RC->hasSubClassEq(RC: &AMDGPU::SGPR_64RegClass) ||
1754 !RBI.constrainGenericRegister(Reg: DstReg, RC: *RC, MRI&: *MRI))
1755 return false;
1756
1757 // Check for kernel and shader functions
1758 if (Depth != 0 ||
1759 MF.getInfo<SIMachineFunctionInfo>()->isEntryFunction()) {
1760 BuildMI(BB&: *MBB, I: &I, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::S_MOV_B64), DestReg: DstReg)
1761 .addImm(Val: 0);
1762 I.eraseFromParent();
1763 return true;
1764 }
1765
1766 MachineFrameInfo &MFI = MF.getFrameInfo();
1767 // There is a call to @llvm.returnaddress in this function
1768 MFI.setReturnAddressIsTaken(true);
1769
1770 // Get the return address reg and mark it as an implicit live-in
1771 Register ReturnAddrReg = TRI.getReturnAddressReg(MF);
1772 Register LiveIn = getFunctionLiveInPhysReg(MF, TII, PhysReg: ReturnAddrReg,
1773 RC: AMDGPU::SReg_64RegClass, DL);
1774 BuildMI(BB&: *MBB, I: &I, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::COPY), DestReg: DstReg)
1775 .addReg(RegNo: LiveIn);
1776 I.eraseFromParent();
1777 return true;
1778}
1779
1780bool AMDGPUInstructionSelector::selectEndCfIntrinsic(MachineInstr &MI) const {
1781 // FIXME: Manually selecting to avoid dealing with the SReg_1 trick
1782 // SelectionDAG uses for wave32 vs wave64.
1783 MachineBasicBlock *BB = MI.getParent();
1784 BuildMI(BB&: *BB, I: &MI, MIMD: MI.getDebugLoc(), MCID: TII.get(Opcode: AMDGPU::SI_END_CF))
1785 .add(MO: MI.getOperand(i: 1));
1786
1787 Register Reg = MI.getOperand(i: 1).getReg();
1788 MI.eraseFromParent();
1789
1790 if (!MRI->getRegClassOrNull(Reg))
1791 MRI->setRegClass(Reg, RC: TRI.getWaveMaskRegClass());
1792 return true;
1793}
1794
1795bool AMDGPUInstructionSelector::selectDSOrderedIntrinsic(
1796 MachineInstr &MI, Intrinsic::ID IntrID) const {
1797 MachineBasicBlock *MBB = MI.getParent();
1798 MachineFunction *MF = MBB->getParent();
1799 const DebugLoc &DL = MI.getDebugLoc();
1800
1801 unsigned IndexOperand = MI.getOperand(i: 7).getImm();
1802 bool WaveRelease = MI.getOperand(i: 8).getImm() != 0;
1803 bool WaveDone = MI.getOperand(i: 9).getImm() != 0;
1804
1805 if (WaveDone && !WaveRelease) {
1806 // TODO: Move this to IR verifier
1807 const Function &Fn = MF->getFunction();
1808 Fn.getContext().diagnose(DI: DiagnosticInfoUnsupported(
1809 Fn, "ds_ordered_count: wave_done requires wave_release", DL));
1810 }
1811
1812 unsigned OrderedCountIndex = IndexOperand & 0x3f;
1813 IndexOperand &= ~0x3f;
1814 unsigned CountDw = 0;
1815
1816 if (STI.getGeneration() >= AMDGPUSubtarget::GFX10) {
1817 CountDw = (IndexOperand >> 24) & 0xf;
1818 IndexOperand &= ~(0xf << 24);
1819
1820 if (CountDw < 1 || CountDw > 4) {
1821 const Function &Fn = MF->getFunction();
1822 Fn.getContext().diagnose(DI: DiagnosticInfoUnsupported(
1823 Fn, "ds_ordered_count: dword count must be between 1 and 4", DL));
1824 CountDw = 1;
1825 }
1826 }
1827
1828 if (IndexOperand) {
1829 const Function &Fn = MF->getFunction();
1830 Fn.getContext().diagnose(DI: DiagnosticInfoUnsupported(
1831 Fn, "ds_ordered_count: bad index operand", DL));
1832 }
1833
1834 unsigned Instruction = IntrID == Intrinsic::amdgcn_ds_ordered_add ? 0 : 1;
1835 unsigned ShaderType = SIInstrInfo::getDSShaderTypeValue(MF: *MF);
1836
1837 unsigned Offset0 = OrderedCountIndex << 2;
1838 unsigned Offset1 = WaveRelease | (WaveDone << 1) | (Instruction << 4);
1839
1840 if (STI.getGeneration() >= AMDGPUSubtarget::GFX10)
1841 Offset1 |= (CountDw - 1) << 6;
1842
1843 if (STI.getGeneration() < AMDGPUSubtarget::GFX11)
1844 Offset1 |= ShaderType << 2;
1845
1846 unsigned Offset = Offset0 | (Offset1 << 8);
1847
1848 Register M0Val = MI.getOperand(i: 2).getReg();
1849 BuildMI(BB&: *MBB, I: &MI, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::COPY), DestReg: AMDGPU::M0)
1850 .addReg(RegNo: M0Val);
1851
1852 Register DstReg = MI.getOperand(i: 0).getReg();
1853 Register ValReg = MI.getOperand(i: 3).getReg();
1854 MachineInstrBuilder DS =
1855 BuildMI(BB&: *MBB, I: &MI, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::DS_ORDERED_COUNT), DestReg: DstReg)
1856 .addReg(RegNo: ValReg)
1857 .addImm(Val: Offset)
1858 .cloneMemRefs(OtherMI: MI);
1859
1860 if (!RBI.constrainGenericRegister(Reg: M0Val, RC: AMDGPU::SReg_32RegClass, MRI&: *MRI))
1861 return false;
1862
1863 constrainSelectedInstRegOperands(I&: *DS, TII, TRI, RBI);
1864 MI.eraseFromParent();
1865 return true;
1866}
1867
1868static unsigned gwsIntrinToOpcode(unsigned IntrID) {
1869 switch (IntrID) {
1870 case Intrinsic::amdgcn_ds_gws_init:
1871 return AMDGPU::DS_GWS_INIT;
1872 case Intrinsic::amdgcn_ds_gws_barrier:
1873 return AMDGPU::DS_GWS_BARRIER;
1874 case Intrinsic::amdgcn_ds_gws_sema_v:
1875 return AMDGPU::DS_GWS_SEMA_V;
1876 case Intrinsic::amdgcn_ds_gws_sema_br:
1877 return AMDGPU::DS_GWS_SEMA_BR;
1878 case Intrinsic::amdgcn_ds_gws_sema_p:
1879 return AMDGPU::DS_GWS_SEMA_P;
1880 case Intrinsic::amdgcn_ds_gws_sema_release_all:
1881 return AMDGPU::DS_GWS_SEMA_RELEASE_ALL;
1882 default:
1883 llvm_unreachable("not a gws intrinsic");
1884 }
1885}
1886
1887bool AMDGPUInstructionSelector::selectDSGWSIntrinsic(MachineInstr &MI,
1888 Intrinsic::ID IID) const {
1889 if (!STI.hasGWS() || (IID == Intrinsic::amdgcn_ds_gws_sema_release_all &&
1890 !STI.hasGWSSemaReleaseAll()))
1891 return false;
1892
1893 // intrinsic ID, vsrc, offset
1894 const bool HasVSrc = MI.getNumOperands() == 3;
1895 assert(HasVSrc || MI.getNumOperands() == 2);
1896
1897 Register BaseOffset = MI.getOperand(i: HasVSrc ? 2 : 1).getReg();
1898 const RegisterBank *OffsetRB = RBI.getRegBank(Reg: BaseOffset, MRI: *MRI, TRI);
1899 if (OffsetRB->getID() != AMDGPU::SGPRRegBankID)
1900 return false;
1901
1902 MachineInstr *OffsetDef = getDefIgnoringCopies(Reg: BaseOffset, MRI: *MRI);
1903 unsigned ImmOffset;
1904
1905 MachineBasicBlock *MBB = MI.getParent();
1906 const DebugLoc &DL = MI.getDebugLoc();
1907
1908 MachineInstr *Readfirstlane = nullptr;
1909
1910 // If we legalized the VGPR input, strip out the readfirstlane to analyze the
1911 // incoming offset, in case there's an add of a constant. We'll have to put it
1912 // back later.
1913 if (OffsetDef->getOpcode() == AMDGPU::V_READFIRSTLANE_B32) {
1914 Readfirstlane = OffsetDef;
1915 BaseOffset = OffsetDef->getOperand(i: 1).getReg();
1916 OffsetDef = getDefIgnoringCopies(Reg: BaseOffset, MRI: *MRI);
1917 }
1918
1919 if (OffsetDef->getOpcode() == AMDGPU::G_CONSTANT) {
1920 // If we have a constant offset, try to use the 0 in m0 as the base.
1921 // TODO: Look into changing the default m0 initialization value. If the
1922 // default -1 only set the low 16-bits, we could leave it as-is and add 1 to
1923 // the immediate offset.
1924
1925 ImmOffset = OffsetDef->getOperand(i: 1).getCImm()->getZExtValue();
1926 BuildMI(BB&: *MBB, I: &MI, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::S_MOV_B32), DestReg: AMDGPU::M0)
1927 .addImm(Val: 0);
1928 } else {
1929 std::tie(args&: BaseOffset, args&: ImmOffset) =
1930 AMDGPU::getBaseWithConstantOffset(MRI&: *MRI, Reg: BaseOffset, ValueTracking: VT);
1931
1932 if (Readfirstlane) {
1933 // We have the constant offset now, so put the readfirstlane back on the
1934 // variable component.
1935 if (!RBI.constrainGenericRegister(Reg: BaseOffset, RC: AMDGPU::VGPR_32RegClass, MRI&: *MRI))
1936 return false;
1937
1938 Readfirstlane->getOperand(i: 1).setReg(BaseOffset);
1939 BaseOffset = Readfirstlane->getOperand(i: 0).getReg();
1940 } else {
1941 if (!RBI.constrainGenericRegister(Reg: BaseOffset,
1942 RC: AMDGPU::SReg_32RegClass, MRI&: *MRI))
1943 return false;
1944 }
1945
1946 Register M0Base = MRI->createVirtualRegister(RegClass: &AMDGPU::SReg_32RegClass);
1947 BuildMI(BB&: *MBB, I: &MI, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::S_LSHL_B32), DestReg: M0Base)
1948 .addReg(RegNo: BaseOffset)
1949 .addImm(Val: 16)
1950 .setOperandDead(3); // Dead scc
1951
1952 BuildMI(BB&: *MBB, I: &MI, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::COPY), DestReg: AMDGPU::M0)
1953 .addReg(RegNo: M0Base);
1954 }
1955
1956 // The resource id offset is computed as (<isa opaque base> + M0[21:16] +
1957 // offset field) % 64. Some versions of the programming guide omit the m0
1958 // part, or claim it's from offset 0.
1959
1960 unsigned Opc = gwsIntrinToOpcode(IntrID: IID);
1961 const MCInstrDesc &InstrDesc = TII.get(Opcode: Opc);
1962
1963 if (HasVSrc) {
1964 Register VSrc = MI.getOperand(i: 1).getReg();
1965
1966 int Data0Idx = AMDGPU::getNamedOperandIdx(Opcode: Opc, Name: AMDGPU::OpName::data0);
1967 const TargetRegisterClass *DataRC = TII.getRegClass(MCID: InstrDesc, OpNum: Data0Idx);
1968 const TargetRegisterClass *SubRC =
1969 TRI.getSubRegisterClass(DataRC, AMDGPU::sub0);
1970
1971 if (!SubRC) {
1972 // 32-bit normal case.
1973 if (!RBI.constrainGenericRegister(Reg: VSrc, RC: *DataRC, MRI&: *MRI))
1974 return false;
1975
1976 BuildMI(BB&: *MBB, I: &MI, MIMD: DL, MCID: InstrDesc)
1977 .addReg(RegNo: VSrc)
1978 .addImm(Val: ImmOffset)
1979 .cloneMemRefs(OtherMI: MI);
1980 } else {
1981 // Requires even register alignment, so create 64-bit value and pad the
1982 // top half with undef.
1983 Register DataReg = MRI->createVirtualRegister(RegClass: DataRC);
1984 if (!RBI.constrainGenericRegister(Reg: VSrc, RC: *SubRC, MRI&: *MRI))
1985 return false;
1986
1987 Register UndefReg = MRI->createVirtualRegister(RegClass: SubRC);
1988 BuildMI(BB&: *MBB, I: &MI, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::IMPLICIT_DEF), DestReg: UndefReg);
1989 BuildMI(BB&: *MBB, I: &MI, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::REG_SEQUENCE), DestReg: DataReg)
1990 .addReg(RegNo: VSrc)
1991 .addImm(Val: AMDGPU::sub0)
1992 .addReg(RegNo: UndefReg)
1993 .addImm(Val: AMDGPU::sub1);
1994
1995 BuildMI(BB&: *MBB, I: &MI, MIMD: DL, MCID: InstrDesc)
1996 .addReg(RegNo: DataReg)
1997 .addImm(Val: ImmOffset)
1998 .cloneMemRefs(OtherMI: MI);
1999 }
2000 } else {
2001 BuildMI(BB&: *MBB, I: &MI, MIMD: DL, MCID: InstrDesc)
2002 .addImm(Val: ImmOffset)
2003 .cloneMemRefs(OtherMI: MI);
2004 }
2005
2006 MI.eraseFromParent();
2007 return true;
2008}
2009
2010bool AMDGPUInstructionSelector::selectDSAppendConsume(MachineInstr &MI,
2011 bool IsAppend) const {
2012 Register PtrBase = MI.getOperand(i: 2).getReg();
2013 LLT PtrTy = MRI->getType(Reg: PtrBase);
2014 bool IsGDS = PtrTy.getAddressSpace() == AMDGPUAS::REGION_ADDRESS;
2015
2016 unsigned Offset;
2017 std::tie(args&: PtrBase, args&: Offset) = selectDS1Addr1OffsetImpl(Root&: MI.getOperand(i: 2));
2018
2019 // TODO: Should this try to look through readfirstlane like GWS?
2020 if (!isDSOffsetLegal(Base: PtrBase, Offset)) {
2021 PtrBase = MI.getOperand(i: 2).getReg();
2022 Offset = 0;
2023 }
2024
2025 MachineBasicBlock *MBB = MI.getParent();
2026 const DebugLoc &DL = MI.getDebugLoc();
2027 const unsigned Opc = IsAppend ? AMDGPU::DS_APPEND : AMDGPU::DS_CONSUME;
2028
2029 BuildMI(BB&: *MBB, I: &MI, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::COPY), DestReg: AMDGPU::M0)
2030 .addReg(RegNo: PtrBase);
2031 if (!RBI.constrainGenericRegister(Reg: PtrBase, RC: AMDGPU::SReg_32RegClass, MRI&: *MRI))
2032 return false;
2033
2034 auto MIB = BuildMI(BB&: *MBB, I: &MI, MIMD: DL, MCID: TII.get(Opcode: Opc), DestReg: MI.getOperand(i: 0).getReg())
2035 .addImm(Val: Offset)
2036 .addImm(Val: IsGDS ? -1 : 0)
2037 .cloneMemRefs(OtherMI: MI);
2038 MI.eraseFromParent();
2039 constrainSelectedInstRegOperands(I&: *MIB, TII, TRI, RBI);
2040 return true;
2041}
2042
2043bool AMDGPUInstructionSelector::selectInitWholeWave(MachineInstr &MI) const {
2044 MachineFunction *MF = MI.getMF();
2045 SIMachineFunctionInfo *MFInfo = MF->getInfo<SIMachineFunctionInfo>();
2046
2047 MFInfo->setInitWholeWave();
2048 return selectImpl(I&: MI, CoverageInfo&: *CoverageInfo);
2049}
2050
2051static bool parseTexFail(uint64_t TexFailCtrl, bool &TFE, bool &LWE,
2052 bool &IsTexFail) {
2053 if (TexFailCtrl)
2054 IsTexFail = true;
2055
2056 TFE = TexFailCtrl & 0x1;
2057 TexFailCtrl &= ~(uint64_t)0x1;
2058 LWE = TexFailCtrl & 0x2;
2059 TexFailCtrl &= ~(uint64_t)0x2;
2060
2061 return TexFailCtrl == 0;
2062}
2063
2064bool AMDGPUInstructionSelector::selectImageIntrinsic(
2065 MachineInstr &MI, const AMDGPU::ImageDimIntrinsicInfo *Intr) const {
2066 MachineBasicBlock *MBB = MI.getParent();
2067 const DebugLoc &DL = MI.getDebugLoc();
2068 unsigned IntrOpcode = Intr->BaseOpcode;
2069
2070 // For image atomic: use no-return opcode if result is unused.
2071 if (Intr->AtomicNoRetBaseOpcode != Intr->BaseOpcode) {
2072 Register ResultDef = MI.getOperand(i: 0).getReg();
2073 if (MRI->use_nodbg_empty(RegNo: ResultDef))
2074 IntrOpcode = Intr->AtomicNoRetBaseOpcode;
2075 }
2076
2077 const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
2078 AMDGPU::getMIMGBaseOpcodeInfo(BaseOpcode: IntrOpcode);
2079
2080 const AMDGPU::MIMGDimInfo *DimInfo = AMDGPU::getMIMGDimInfo(DimEnum: Intr->Dim);
2081 const bool IsGFX10Plus = AMDGPU::isGFX10Plus(STI);
2082 const bool IsGFX11Plus = AMDGPU::isGFX11Plus(STI);
2083 const bool IsGFX12Plus = AMDGPU::isGFX12Plus(STI);
2084
2085 const unsigned ArgOffset = MI.getNumExplicitDefs() + 1;
2086
2087 Register VDataIn = AMDGPU::NoRegister;
2088 Register VDataOut = AMDGPU::NoRegister;
2089 LLT VDataTy;
2090 int NumVDataDwords = -1;
2091 bool IsD16 = MI.getOpcode() == AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD_D16 ||
2092 MI.getOpcode() == AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE_D16;
2093
2094 bool Unorm;
2095 if (!BaseOpcode->Sampler)
2096 Unorm = true;
2097 else
2098 Unorm = MI.getOperand(i: ArgOffset + Intr->UnormIndex).getImm() != 0;
2099
2100 bool TFE;
2101 bool LWE;
2102 bool IsTexFail = false;
2103 if (!parseTexFail(TexFailCtrl: MI.getOperand(i: ArgOffset + Intr->TexFailCtrlIndex).getImm(),
2104 TFE, LWE, IsTexFail))
2105 return false;
2106
2107 const int Flags = MI.getOperand(i: ArgOffset + Intr->NumArgs).getImm();
2108 const bool IsA16 = (Flags & 1) != 0;
2109 const bool IsG16 = (Flags & 2) != 0;
2110
2111 // A16 implies 16 bit gradients if subtarget doesn't support G16
2112 if (IsA16 && !STI.hasG16() && !IsG16)
2113 return false;
2114
2115 unsigned DMask = 0;
2116 unsigned DMaskLanes = 0;
2117
2118 if (BaseOpcode->Atomic) {
2119 if (!BaseOpcode->NoReturn)
2120 VDataOut = MI.getOperand(i: 0).getReg();
2121 VDataIn = MI.getOperand(i: 2).getReg();
2122 LLT Ty = MRI->getType(Reg: VDataIn);
2123
2124 // Be careful to allow atomic swap on 16-bit element vectors.
2125 const bool Is64Bit = BaseOpcode->AtomicX2 ?
2126 Ty.getSizeInBits() == 128 :
2127 Ty.getSizeInBits() == 64;
2128
2129 if (BaseOpcode->AtomicX2) {
2130 assert(MI.getOperand(3).getReg() == AMDGPU::NoRegister);
2131
2132 DMask = Is64Bit ? 0xf : 0x3;
2133 NumVDataDwords = Is64Bit ? 4 : 2;
2134 } else {
2135 DMask = Is64Bit ? 0x3 : 0x1;
2136 NumVDataDwords = Is64Bit ? 2 : 1;
2137 }
2138 } else {
2139 DMask = MI.getOperand(i: ArgOffset + Intr->DMaskIndex).getImm();
2140 DMaskLanes = BaseOpcode->Gather4 ? 4 : llvm::popcount(Value: DMask);
2141
2142 if (BaseOpcode->Store) {
2143 VDataIn = MI.getOperand(i: 1).getReg();
2144 VDataTy = MRI->getType(Reg: VDataIn);
2145 NumVDataDwords = (VDataTy.getSizeInBits() + 31) / 32;
2146 } else if (BaseOpcode->NoReturn) {
2147 NumVDataDwords = 0;
2148 } else {
2149 VDataOut = MI.getOperand(i: 0).getReg();
2150 VDataTy = MRI->getType(Reg: VDataOut);
2151 NumVDataDwords = DMaskLanes;
2152
2153 if (IsD16 && !STI.hasUnpackedD16VMem())
2154 NumVDataDwords = (DMaskLanes + 1) / 2;
2155 }
2156 }
2157
2158 // Set G16 opcode
2159 if (Subtarget->hasG16() && IsG16) {
2160 const AMDGPU::MIMGG16MappingInfo *G16MappingInfo =
2161 AMDGPU::getMIMGG16MappingInfo(G: Intr->BaseOpcode);
2162 assert(G16MappingInfo);
2163 IntrOpcode = G16MappingInfo->G16; // set opcode to variant with _g16
2164 }
2165
2166 // TODO: Check this in verifier.
2167 assert((!IsTexFail || DMaskLanes >= 1) && "should have legalized this");
2168
2169 unsigned CPol = MI.getOperand(i: ArgOffset + Intr->CachePolicyIndex).getImm();
2170 // Keep GLC only when the atomic's result is actually used.
2171 if (BaseOpcode->Atomic && !BaseOpcode->NoReturn)
2172 CPol |= AMDGPU::CPol::GLC;
2173 if (CPol & ~((IsGFX12Plus ? AMDGPU::CPol::ALL : AMDGPU::CPol::ALL_pregfx12) |
2174 AMDGPU::CPol::VOLATILE))
2175 return false;
2176
2177 int NumVAddrRegs = 0;
2178 int NumVAddrDwords = 0;
2179 for (unsigned I = Intr->VAddrStart; I < Intr->VAddrEnd; I++) {
2180 // Skip the $noregs and 0s inserted during legalization.
2181 MachineOperand &AddrOp = MI.getOperand(i: ArgOffset + I);
2182 if (!AddrOp.isReg())
2183 continue; // XXX - Break?
2184
2185 Register Addr = AddrOp.getReg();
2186 if (!Addr)
2187 break;
2188
2189 ++NumVAddrRegs;
2190 NumVAddrDwords += (MRI->getType(Reg: Addr).getSizeInBits() + 31) / 32;
2191 }
2192
2193 // The legalizer preprocessed the intrinsic arguments. If we aren't using
2194 // NSA, these should have been packed into a single value in the first
2195 // address register
2196 const bool UseNSA =
2197 NumVAddrRegs != 1 &&
2198 (STI.hasPartialNSAEncoding() ? NumVAddrDwords >= NumVAddrRegs
2199 : NumVAddrDwords == NumVAddrRegs);
2200 if (UseNSA && !STI.hasFeature(Feature: AMDGPU::FeatureNSAEncoding)) {
2201 LLVM_DEBUG(dbgs() << "Trying to use NSA on non-NSA target\n");
2202 return false;
2203 }
2204
2205 if (IsTexFail)
2206 ++NumVDataDwords;
2207
2208 int Opcode = -1;
2209 if (IsGFX12Plus) {
2210 Opcode = AMDGPU::getMIMGOpcode(BaseOpcode: IntrOpcode, MIMGEncoding: AMDGPU::MIMGEncGfx12,
2211 VDataDwords: NumVDataDwords, VAddrDwords: NumVAddrDwords);
2212 } else if (IsGFX11Plus) {
2213 Opcode = AMDGPU::getMIMGOpcode(BaseOpcode: IntrOpcode,
2214 MIMGEncoding: UseNSA ? AMDGPU::MIMGEncGfx11NSA
2215 : AMDGPU::MIMGEncGfx11Default,
2216 VDataDwords: NumVDataDwords, VAddrDwords: NumVAddrDwords);
2217 } else if (IsGFX10Plus) {
2218 Opcode = AMDGPU::getMIMGOpcode(BaseOpcode: IntrOpcode,
2219 MIMGEncoding: UseNSA ? AMDGPU::MIMGEncGfx10NSA
2220 : AMDGPU::MIMGEncGfx10Default,
2221 VDataDwords: NumVDataDwords, VAddrDwords: NumVAddrDwords);
2222 } else {
2223 if (Subtarget->hasGFX90AInsts()) {
2224 Opcode = AMDGPU::getMIMGOpcode(BaseOpcode: IntrOpcode, MIMGEncoding: AMDGPU::MIMGEncGfx90a,
2225 VDataDwords: NumVDataDwords, VAddrDwords: NumVAddrDwords);
2226 if (Opcode == -1) {
2227 LLVM_DEBUG(
2228 dbgs()
2229 << "requested image instruction is not supported on this GPU\n");
2230 return false;
2231 }
2232 }
2233 if (Opcode == -1 &&
2234 STI.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
2235 Opcode = AMDGPU::getMIMGOpcode(BaseOpcode: IntrOpcode, MIMGEncoding: AMDGPU::MIMGEncGfx8,
2236 VDataDwords: NumVDataDwords, VAddrDwords: NumVAddrDwords);
2237 if (Opcode == -1)
2238 Opcode = AMDGPU::getMIMGOpcode(BaseOpcode: IntrOpcode, MIMGEncoding: AMDGPU::MIMGEncGfx6,
2239 VDataDwords: NumVDataDwords, VAddrDwords: NumVAddrDwords);
2240 }
2241 if (Opcode == -1)
2242 return false;
2243
2244 auto MIB = BuildMI(BB&: *MBB, I: &MI, MIMD: DL, MCID: TII.get(Opcode))
2245 .cloneMemRefs(OtherMI: MI);
2246
2247 if (VDataOut) {
2248 if (BaseOpcode->AtomicX2) {
2249 const bool Is64 = MRI->getType(Reg: VDataOut).getSizeInBits() == 64;
2250
2251 Register TmpReg = MRI->createVirtualRegister(
2252 RegClass: Is64 ? &AMDGPU::VReg_128RegClass : &AMDGPU::VReg_64RegClass);
2253 unsigned SubReg = Is64 ? AMDGPU::sub0_sub1 : AMDGPU::sub0;
2254
2255 MIB.addDef(RegNo: TmpReg);
2256 if (!MRI->use_empty(RegNo: VDataOut)) {
2257 BuildMI(BB&: *MBB, I: &MI, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::COPY), DestReg: VDataOut)
2258 .addReg(RegNo: TmpReg, Flags: RegState::Kill, SubReg);
2259 }
2260
2261 } else {
2262 MIB.addDef(RegNo: VDataOut); // vdata output
2263 }
2264 }
2265
2266 if (VDataIn)
2267 MIB.addReg(RegNo: VDataIn); // vdata input
2268
2269 for (int I = 0; I != NumVAddrRegs; ++I) {
2270 MachineOperand &SrcOp = MI.getOperand(i: ArgOffset + Intr->VAddrStart + I);
2271 if (SrcOp.isReg()) {
2272 assert(SrcOp.getReg() != 0);
2273 MIB.addReg(RegNo: SrcOp.getReg());
2274 }
2275 }
2276
2277 MIB.addReg(RegNo: MI.getOperand(i: ArgOffset + Intr->RsrcIndex).getReg());
2278 if (BaseOpcode->Sampler)
2279 MIB.addReg(RegNo: MI.getOperand(i: ArgOffset + Intr->SampIndex).getReg());
2280
2281 MIB.addImm(Val: DMask); // dmask
2282
2283 if (IsGFX10Plus)
2284 MIB.addImm(Val: DimInfo->Encoding);
2285 if (AMDGPU::hasNamedOperand(Opcode, NamedIdx: AMDGPU::OpName::unorm))
2286 MIB.addImm(Val: Unorm);
2287
2288 MIB.addImm(Val: CPol);
2289 MIB.addImm(Val: IsA16 && // a16 or r128
2290 STI.hasFeature(Feature: AMDGPU::FeatureR128A16) ? -1 : 0);
2291 if (IsGFX10Plus)
2292 MIB.addImm(Val: IsA16 ? -1 : 0);
2293
2294 if (!Subtarget->hasGFX90AInsts()) {
2295 MIB.addImm(Val: TFE); // tfe
2296 } else if (TFE) {
2297 LLVM_DEBUG(dbgs() << "TFE is not supported on this GPU\n");
2298 return false;
2299 }
2300
2301 if (AMDGPU::hasNamedOperand(Opcode, NamedIdx: AMDGPU::OpName::lwe))
2302 MIB.addImm(Val: LWE); // lwe
2303 if (!IsGFX10Plus)
2304 MIB.addImm(Val: DimInfo->DA ? -1 : 0);
2305 if (BaseOpcode->HasD16)
2306 MIB.addImm(Val: IsD16 ? -1 : 0);
2307
2308 MI.eraseFromParent();
2309 constrainSelectedInstRegOperands(I&: *MIB, TII, TRI, RBI);
2310 TII.enforceOperandRCAlignment(MI&: *MIB, OpName: AMDGPU::OpName::vaddr);
2311 return true;
2312}
2313
2314// We need to handle this here because tablegen doesn't support matching
2315// instructions with multiple outputs.
2316bool AMDGPUInstructionSelector::selectDSBvhStackIntrinsic(
2317 MachineInstr &MI) const {
2318 Register Dst0 = MI.getOperand(i: 0).getReg();
2319 Register Dst1 = MI.getOperand(i: 1).getReg();
2320
2321 const DebugLoc &DL = MI.getDebugLoc();
2322 MachineBasicBlock *MBB = MI.getParent();
2323
2324 Register Addr = MI.getOperand(i: 3).getReg();
2325 Register Data0 = MI.getOperand(i: 4).getReg();
2326 Register Data1 = MI.getOperand(i: 5).getReg();
2327 unsigned Offset = MI.getOperand(i: 6).getImm();
2328
2329 unsigned Opc;
2330 switch (cast<GIntrinsic>(Val&: MI).getIntrinsicID()) {
2331 case Intrinsic::amdgcn_ds_bvh_stack_rtn:
2332 case Intrinsic::amdgcn_ds_bvh_stack_push4_pop1_rtn:
2333 Opc = AMDGPU::DS_BVH_STACK_RTN_B32;
2334 break;
2335 case Intrinsic::amdgcn_ds_bvh_stack_push8_pop1_rtn:
2336 Opc = AMDGPU::DS_BVH_STACK_PUSH8_POP1_RTN_B32;
2337 break;
2338 case Intrinsic::amdgcn_ds_bvh_stack_push8_pop2_rtn:
2339 Opc = AMDGPU::DS_BVH_STACK_PUSH8_POP2_RTN_B64;
2340 break;
2341 }
2342
2343 auto MIB = BuildMI(BB&: *MBB, I: &MI, MIMD: DL, MCID: TII.get(Opcode: Opc), DestReg: Dst0)
2344 .addDef(RegNo: Dst1)
2345 .addUse(RegNo: Addr)
2346 .addUse(RegNo: Data0)
2347 .addUse(RegNo: Data1)
2348 .addImm(Val: Offset)
2349 .cloneMemRefs(OtherMI: MI);
2350
2351 MI.eraseFromParent();
2352 constrainSelectedInstRegOperands(I&: *MIB, TII, TRI, RBI);
2353 return true;
2354}
2355
2356bool AMDGPUInstructionSelector::selectG_INTRINSIC_W_SIDE_EFFECTS(
2357 MachineInstr &I) const {
2358 Intrinsic::ID IntrinsicID = cast<GIntrinsic>(Val&: I).getIntrinsicID();
2359 switch (IntrinsicID) {
2360 case Intrinsic::amdgcn_end_cf:
2361 return selectEndCfIntrinsic(MI&: I);
2362 case Intrinsic::amdgcn_ds_ordered_add:
2363 case Intrinsic::amdgcn_ds_ordered_swap:
2364 return selectDSOrderedIntrinsic(MI&: I, IntrID: IntrinsicID);
2365 case Intrinsic::amdgcn_ds_gws_init:
2366 case Intrinsic::amdgcn_ds_gws_barrier:
2367 case Intrinsic::amdgcn_ds_gws_sema_v:
2368 case Intrinsic::amdgcn_ds_gws_sema_br:
2369 case Intrinsic::amdgcn_ds_gws_sema_p:
2370 case Intrinsic::amdgcn_ds_gws_sema_release_all:
2371 return selectDSGWSIntrinsic(MI&: I, IID: IntrinsicID);
2372 case Intrinsic::amdgcn_ds_append:
2373 return selectDSAppendConsume(MI&: I, IsAppend: true);
2374 case Intrinsic::amdgcn_ds_consume:
2375 return selectDSAppendConsume(MI&: I, IsAppend: false);
2376 case Intrinsic::amdgcn_init_whole_wave:
2377 return selectInitWholeWave(MI&: I);
2378 case Intrinsic::amdgcn_raw_buffer_load_lds:
2379 case Intrinsic::amdgcn_raw_buffer_load_async_lds:
2380 case Intrinsic::amdgcn_raw_ptr_buffer_load_lds:
2381 case Intrinsic::amdgcn_raw_ptr_buffer_load_async_lds:
2382 case Intrinsic::amdgcn_struct_buffer_load_lds:
2383 case Intrinsic::amdgcn_struct_buffer_load_async_lds:
2384 case Intrinsic::amdgcn_struct_ptr_buffer_load_lds:
2385 case Intrinsic::amdgcn_struct_ptr_buffer_load_async_lds:
2386 return selectBufferLoadLds(MI&: I);
2387 // Until we can store both the address space of the global and the LDS
2388 // arguments by having tto MachineMemOperands on an intrinsic, we just trust
2389 // that the argument is a global pointer (buffer pointers have been handled by
2390 // a LLVM IR-level lowering).
2391 case Intrinsic::amdgcn_load_to_lds:
2392 case Intrinsic::amdgcn_load_async_to_lds:
2393 case Intrinsic::amdgcn_global_load_lds:
2394 case Intrinsic::amdgcn_global_load_async_lds:
2395 return selectGlobalLoadLds(MI&: I);
2396 case Intrinsic::amdgcn_tensor_load_to_lds:
2397 case Intrinsic::amdgcn_tensor_store_from_lds:
2398 return selectTensorLoadStore(MI&: I, IID: IntrinsicID);
2399 case Intrinsic::amdgcn_asyncmark:
2400 case Intrinsic::amdgcn_wait_asyncmark:
2401 // FIXME: Not supported on GFX12 yet. Will need a new feature when we do.
2402 if (!Subtarget->hasVMemToLDSLoad())
2403 return false;
2404 break;
2405 case Intrinsic::amdgcn_exp_compr:
2406 if (!STI.hasCompressedExport()) {
2407 Function &F = I.getMF()->getFunction();
2408 F.getContext().diagnose(
2409 DI: DiagnosticInfoUnsupported(F, "intrinsic not supported on subtarget",
2410 I.getDebugLoc(), DS_Error));
2411 return false;
2412 }
2413 break;
2414 case Intrinsic::amdgcn_ds_bvh_stack_rtn:
2415 case Intrinsic::amdgcn_ds_bvh_stack_push4_pop1_rtn:
2416 case Intrinsic::amdgcn_ds_bvh_stack_push8_pop1_rtn:
2417 case Intrinsic::amdgcn_ds_bvh_stack_push8_pop2_rtn:
2418 return selectDSBvhStackIntrinsic(MI&: I);
2419 case Intrinsic::amdgcn_s_alloc_vgpr: {
2420 // S_ALLOC_VGPR doesn't have a destination register, it just implicitly sets
2421 // SCC. We then need to COPY it into the result vreg.
2422 MachineBasicBlock *MBB = I.getParent();
2423 const DebugLoc &DL = I.getDebugLoc();
2424
2425 Register ResReg = I.getOperand(i: 0).getReg();
2426
2427 MachineInstr *AllocMI = BuildMI(BB&: *MBB, I: &I, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::S_ALLOC_VGPR))
2428 .add(MO: I.getOperand(i: 2));
2429 (void)BuildMI(BB&: *MBB, I: &I, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::COPY), DestReg: ResReg)
2430 .addReg(RegNo: AMDGPU::SCC);
2431 I.eraseFromParent();
2432 constrainSelectedInstRegOperands(I&: *AllocMI, TII, TRI, RBI);
2433 return RBI.constrainGenericRegister(Reg: ResReg, RC: AMDGPU::SReg_32RegClass, MRI&: *MRI);
2434 }
2435 case Intrinsic::amdgcn_s_barrier_init:
2436 case Intrinsic::amdgcn_s_barrier_signal_var:
2437 return selectNamedBarrierInit(I, IID: IntrinsicID);
2438 case Intrinsic::amdgcn_s_wakeup_barrier: {
2439 if (!STI.hasSWakeupBarrier()) {
2440 Function &F = I.getMF()->getFunction();
2441 F.getContext().diagnose(
2442 DI: DiagnosticInfoUnsupported(F, "intrinsic not supported on subtarget",
2443 I.getDebugLoc(), DS_Error));
2444 return false;
2445 }
2446 return selectNamedBarrierInst(I, IID: IntrinsicID);
2447 }
2448 case Intrinsic::amdgcn_s_barrier_join:
2449 case Intrinsic::amdgcn_s_get_named_barrier_state:
2450 return selectNamedBarrierInst(I, IID: IntrinsicID);
2451 case Intrinsic::amdgcn_s_get_barrier_state:
2452 return selectSGetBarrierState(I, IID: IntrinsicID);
2453 case Intrinsic::amdgcn_s_barrier_signal_isfirst:
2454 return selectSBarrierSignalIsfirst(I, IID: IntrinsicID);
2455 }
2456 return selectImpl(I, CoverageInfo&: *CoverageInfo);
2457}
2458
2459bool AMDGPUInstructionSelector::selectG_SELECT(MachineInstr &I) const {
2460 if (selectImpl(I, CoverageInfo&: *CoverageInfo))
2461 return true;
2462
2463 MachineBasicBlock *BB = I.getParent();
2464 const DebugLoc &DL = I.getDebugLoc();
2465
2466 Register DstReg = I.getOperand(i: 0).getReg();
2467 unsigned Size = RBI.getSizeInBits(Reg: DstReg, MRI: *MRI, TRI);
2468 assert(Size <= 32 || Size == 64);
2469 const MachineOperand &CCOp = I.getOperand(i: 1);
2470 Register CCReg = CCOp.getReg();
2471 if (!isVCC(Reg: CCReg, MRI: *MRI)) {
2472 unsigned SelectOpcode = Size == 64 ? AMDGPU::S_CSELECT_B64 :
2473 AMDGPU::S_CSELECT_B32;
2474 MachineInstr *CopySCC = BuildMI(BB&: *BB, I: &I, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::COPY), DestReg: AMDGPU::SCC)
2475 .addReg(RegNo: CCReg);
2476
2477 // The generic constrainSelectedInstRegOperands doesn't work for the scc register
2478 // bank, because it does not cover the register class that we used to represent
2479 // for it. So we need to manually set the register class here.
2480 if (!MRI->getRegClassOrNull(Reg: CCReg))
2481 MRI->setRegClass(Reg: CCReg, RC: TRI.getConstrainedRegClassForOperand(MO: CCOp, MRI: *MRI));
2482 MachineInstr *Select = BuildMI(BB&: *BB, I: &I, MIMD: DL, MCID: TII.get(Opcode: SelectOpcode), DestReg: DstReg)
2483 .add(MO: I.getOperand(i: 2))
2484 .add(MO: I.getOperand(i: 3));
2485
2486 constrainSelectedInstRegOperands(I&: *Select, TII, TRI, RBI);
2487 constrainSelectedInstRegOperands(I&: *CopySCC, TII, TRI, RBI);
2488 I.eraseFromParent();
2489 return true;
2490 }
2491
2492 // Wide VGPR select should have been split in RegBankSelect.
2493 if (Size > 32)
2494 return false;
2495
2496 MachineInstr *Select =
2497 BuildMI(BB&: *BB, I: &I, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::V_CNDMASK_B32_e64), DestReg: DstReg)
2498 .addImm(Val: 0)
2499 .add(MO: I.getOperand(i: 3))
2500 .addImm(Val: 0)
2501 .add(MO: I.getOperand(i: 2))
2502 .add(MO: I.getOperand(i: 1));
2503
2504 constrainSelectedInstRegOperands(I&: *Select, TII, TRI, RBI);
2505 I.eraseFromParent();
2506 return true;
2507}
2508
2509bool AMDGPUInstructionSelector::selectG_TRUNC(MachineInstr &I) const {
2510 Register DstReg = I.getOperand(i: 0).getReg();
2511 Register SrcReg = I.getOperand(i: 1).getReg();
2512 const LLT DstTy = MRI->getType(Reg: DstReg);
2513 const LLT SrcTy = MRI->getType(Reg: SrcReg);
2514 const LLT S1 = LLT::scalar(SizeInBits: 1);
2515
2516 const RegisterBank *SrcRB = RBI.getRegBank(Reg: SrcReg, MRI: *MRI, TRI);
2517 const RegisterBank *DstRB;
2518 if (DstTy == S1) {
2519 // This is a special case. We don't treat s1 for legalization artifacts as
2520 // vcc booleans.
2521 DstRB = SrcRB;
2522 } else {
2523 DstRB = RBI.getRegBank(Reg: DstReg, MRI: *MRI, TRI);
2524 if (SrcRB != DstRB)
2525 return false;
2526 }
2527
2528 const bool IsVALU = DstRB->getID() == AMDGPU::VGPRRegBankID;
2529
2530 unsigned DstSize = DstTy.getSizeInBits();
2531 unsigned SrcSize = SrcTy.getSizeInBits();
2532
2533 const TargetRegisterClass *SrcRC =
2534 TRI.getRegClassForSizeOnBank(Size: SrcSize, Bank: *SrcRB);
2535 const TargetRegisterClass *DstRC =
2536 TRI.getRegClassForSizeOnBank(Size: DstSize, Bank: *DstRB);
2537 if (!SrcRC || !DstRC)
2538 return false;
2539
2540 if (!RBI.constrainGenericRegister(Reg: SrcReg, RC: *SrcRC, MRI&: *MRI) ||
2541 !RBI.constrainGenericRegister(Reg: DstReg, RC: *DstRC, MRI&: *MRI)) {
2542 LLVM_DEBUG(dbgs() << "Failed to constrain G_TRUNC\n");
2543 return false;
2544 }
2545
2546 if (DstRC == &AMDGPU::VGPR_16RegClass && SrcSize == 32) {
2547 assert(STI.useRealTrue16Insts());
2548 const DebugLoc &DL = I.getDebugLoc();
2549 MachineBasicBlock *MBB = I.getParent();
2550 BuildMI(BB&: *MBB, I, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::COPY), DestReg: DstReg)
2551 .addReg(RegNo: SrcReg, Flags: {}, SubReg: AMDGPU::lo16);
2552 I.eraseFromParent();
2553 return true;
2554 }
2555
2556 if (DstTy == LLT::fixed_vector(NumElements: 2, ScalarSizeInBits: 16) && SrcTy == LLT::fixed_vector(NumElements: 2, ScalarSizeInBits: 32)) {
2557 MachineBasicBlock *MBB = I.getParent();
2558 const DebugLoc &DL = I.getDebugLoc();
2559
2560 Register LoReg = MRI->createVirtualRegister(RegClass: DstRC);
2561 Register HiReg = MRI->createVirtualRegister(RegClass: DstRC);
2562 BuildMI(BB&: *MBB, I, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::COPY), DestReg: LoReg)
2563 .addReg(RegNo: SrcReg, Flags: {}, SubReg: AMDGPU::sub0);
2564 BuildMI(BB&: *MBB, I, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::COPY), DestReg: HiReg)
2565 .addReg(RegNo: SrcReg, Flags: {}, SubReg: AMDGPU::sub1);
2566
2567 if (IsVALU && STI.hasSDWA()) {
2568 // Write the low 16-bits of the high element into the high 16-bits of the
2569 // low element.
2570 MachineInstr *MovSDWA =
2571 BuildMI(BB&: *MBB, I, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::V_MOV_B32_sdwa), DestReg: DstReg)
2572 .addImm(Val: 0) // $src0_modifiers
2573 .addReg(RegNo: HiReg) // $src0
2574 .addImm(Val: 0) // $clamp
2575 .addImm(Val: AMDGPU::SDWA::WORD_1) // $dst_sel
2576 .addImm(Val: AMDGPU::SDWA::UNUSED_PRESERVE) // $dst_unused
2577 .addImm(Val: AMDGPU::SDWA::WORD_0) // $src0_sel
2578 .addReg(RegNo: LoReg, Flags: RegState::Implicit);
2579 MovSDWA->tieOperands(DefIdx: 0, UseIdx: MovSDWA->getNumOperands() - 1);
2580 } else {
2581 Register TmpReg0 = MRI->createVirtualRegister(RegClass: DstRC);
2582 Register TmpReg1 = MRI->createVirtualRegister(RegClass: DstRC);
2583 Register ImmReg = MRI->createVirtualRegister(RegClass: DstRC);
2584 if (IsVALU) {
2585 BuildMI(BB&: *MBB, I, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::V_LSHLREV_B32_e64), DestReg: TmpReg0)
2586 .addImm(Val: 16)
2587 .addReg(RegNo: HiReg);
2588 } else {
2589 BuildMI(BB&: *MBB, I, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::S_LSHL_B32), DestReg: TmpReg0)
2590 .addReg(RegNo: HiReg)
2591 .addImm(Val: 16)
2592 .setOperandDead(3); // Dead scc
2593 }
2594
2595 unsigned MovOpc = IsVALU ? AMDGPU::V_MOV_B32_e32 : AMDGPU::S_MOV_B32;
2596 unsigned AndOpc = IsVALU ? AMDGPU::V_AND_B32_e64 : AMDGPU::S_AND_B32;
2597 unsigned OrOpc = IsVALU ? AMDGPU::V_OR_B32_e64 : AMDGPU::S_OR_B32;
2598
2599 BuildMI(BB&: *MBB, I, MIMD: DL, MCID: TII.get(Opcode: MovOpc), DestReg: ImmReg)
2600 .addImm(Val: 0xffff);
2601 auto And = BuildMI(BB&: *MBB, I, MIMD: DL, MCID: TII.get(Opcode: AndOpc), DestReg: TmpReg1)
2602 .addReg(RegNo: LoReg)
2603 .addReg(RegNo: ImmReg);
2604 auto Or = BuildMI(BB&: *MBB, I, MIMD: DL, MCID: TII.get(Opcode: OrOpc), DestReg: DstReg)
2605 .addReg(RegNo: TmpReg0)
2606 .addReg(RegNo: TmpReg1);
2607
2608 if (!IsVALU) {
2609 And.setOperandDead(3); // Dead scc
2610 Or.setOperandDead(3); // Dead scc
2611 }
2612 }
2613
2614 I.eraseFromParent();
2615 return true;
2616 }
2617
2618 if (!DstTy.isScalar())
2619 return false;
2620
2621 if (SrcSize > 32) {
2622 unsigned SubRegIdx = DstSize < 32
2623 ? static_cast<unsigned>(AMDGPU::sub0)
2624 : TRI.getSubRegFromChannel(Channel: 0, NumRegs: DstSize / 32);
2625 if (SubRegIdx == AMDGPU::NoSubRegister)
2626 return false;
2627
2628 // Deal with weird cases where the class only partially supports the subreg
2629 // index.
2630 const TargetRegisterClass *SrcWithSubRC
2631 = TRI.getSubClassWithSubReg(SrcRC, SubRegIdx);
2632 if (!SrcWithSubRC)
2633 return false;
2634
2635 if (SrcWithSubRC != SrcRC) {
2636 if (!RBI.constrainGenericRegister(Reg: SrcReg, RC: *SrcWithSubRC, MRI&: *MRI))
2637 return false;
2638 }
2639
2640 I.getOperand(i: 1).setSubReg(SubRegIdx);
2641 }
2642
2643 I.setDesc(TII.get(Opcode: TargetOpcode::COPY));
2644 return true;
2645}
2646
2647/// \returns true if a bitmask for \p Size bits will be an inline immediate.
2648static bool shouldUseAndMask(unsigned Size, unsigned &Mask) {
2649 Mask = maskTrailingOnes<unsigned>(N: Size);
2650 int SignedMask = static_cast<int>(Mask);
2651 return SignedMask >= -16 && SignedMask <= 64;
2652}
2653
2654// Like RegisterBankInfo::getRegBank, but don't assume vcc for s1.
2655const RegisterBank *AMDGPUInstructionSelector::getArtifactRegBank(
2656 Register Reg, const MachineRegisterInfo &MRI,
2657 const TargetRegisterInfo &TRI) const {
2658 const RegClassOrRegBank &RegClassOrBank = MRI.getRegClassOrRegBank(Reg);
2659 if (auto *RB = dyn_cast<const RegisterBank *>(Val: RegClassOrBank))
2660 return RB;
2661
2662 // Ignore the type, since we don't use vcc in artifacts.
2663 if (auto *RC = dyn_cast<const TargetRegisterClass *>(Val: RegClassOrBank))
2664 return &RBI.getRegBankFromRegClass(RC: *RC, LLT());
2665 return nullptr;
2666}
2667
2668bool AMDGPUInstructionSelector::selectG_SZA_EXT(MachineInstr &I) const {
2669 bool InReg = I.getOpcode() == AMDGPU::G_SEXT_INREG;
2670 bool Signed = I.getOpcode() == AMDGPU::G_SEXT || InReg;
2671 const DebugLoc &DL = I.getDebugLoc();
2672 MachineBasicBlock &MBB = *I.getParent();
2673 const Register DstReg = I.getOperand(i: 0).getReg();
2674 const Register SrcReg = I.getOperand(i: 1).getReg();
2675
2676 const LLT DstTy = MRI->getType(Reg: DstReg);
2677 const LLT SrcTy = MRI->getType(Reg: SrcReg);
2678 const unsigned SrcSize = I.getOpcode() == AMDGPU::G_SEXT_INREG ?
2679 I.getOperand(i: 2).getImm() : SrcTy.getSizeInBits();
2680 const unsigned DstSize = DstTy.getSizeInBits();
2681 if (!DstTy.isScalar())
2682 return false;
2683
2684 // Artifact casts should never use vcc.
2685 const RegisterBank *SrcBank = getArtifactRegBank(Reg: SrcReg, MRI: *MRI, TRI);
2686
2687 // FIXME: This should probably be illegal and split earlier.
2688 if (I.getOpcode() == AMDGPU::G_ANYEXT) {
2689 if (DstSize <= 32)
2690 return selectCOPY(I);
2691
2692 const TargetRegisterClass *SrcRC =
2693 TRI.getRegClassForTypeOnBank(Ty: SrcTy, Bank: *SrcBank);
2694 const RegisterBank *DstBank = RBI.getRegBank(Reg: DstReg, MRI: *MRI, TRI);
2695 const TargetRegisterClass *DstRC =
2696 TRI.getRegClassForSizeOnBank(Size: DstSize, Bank: *DstBank);
2697
2698 Register UndefReg = MRI->createVirtualRegister(RegClass: SrcRC);
2699 BuildMI(BB&: MBB, I, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::IMPLICIT_DEF), DestReg: UndefReg);
2700 BuildMI(BB&: MBB, I, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::REG_SEQUENCE), DestReg: DstReg)
2701 .addReg(RegNo: SrcReg)
2702 .addImm(Val: AMDGPU::sub0)
2703 .addReg(RegNo: UndefReg)
2704 .addImm(Val: AMDGPU::sub1);
2705 I.eraseFromParent();
2706
2707 return RBI.constrainGenericRegister(Reg: DstReg, RC: *DstRC, MRI&: *MRI) &&
2708 RBI.constrainGenericRegister(Reg: SrcReg, RC: *SrcRC, MRI&: *MRI);
2709 }
2710
2711 if (SrcBank->getID() == AMDGPU::VGPRRegBankID && DstSize <= 32) {
2712 // 64-bit should have been split up in RegBankSelect
2713
2714 // Try to use an and with a mask if it will save code size.
2715 unsigned Mask;
2716 if (!Signed && shouldUseAndMask(Size: SrcSize, Mask)) {
2717 MachineInstr *ExtI =
2718 BuildMI(BB&: MBB, I, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::V_AND_B32_e32), DestReg: DstReg)
2719 .addImm(Val: Mask)
2720 .addReg(RegNo: SrcReg);
2721 I.eraseFromParent();
2722 constrainSelectedInstRegOperands(I&: *ExtI, TII, TRI, RBI);
2723 return true;
2724 }
2725
2726 const unsigned BFE = Signed ? AMDGPU::V_BFE_I32_e64 : AMDGPU::V_BFE_U32_e64;
2727 MachineInstr *ExtI =
2728 BuildMI(BB&: MBB, I, MIMD: DL, MCID: TII.get(Opcode: BFE), DestReg: DstReg)
2729 .addReg(RegNo: SrcReg)
2730 .addImm(Val: 0) // Offset
2731 .addImm(Val: SrcSize); // Width
2732 I.eraseFromParent();
2733 constrainSelectedInstRegOperands(I&: *ExtI, TII, TRI, RBI);
2734 return true;
2735 }
2736
2737 if (SrcBank->getID() == AMDGPU::SGPRRegBankID && DstSize <= 64) {
2738 const TargetRegisterClass &SrcRC = InReg && DstSize > 32 ?
2739 AMDGPU::SReg_64RegClass : AMDGPU::SReg_32RegClass;
2740 if (!RBI.constrainGenericRegister(Reg: SrcReg, RC: SrcRC, MRI&: *MRI))
2741 return false;
2742
2743 if (Signed && DstSize == 32 && (SrcSize == 8 || SrcSize == 16)) {
2744 const unsigned SextOpc = SrcSize == 8 ?
2745 AMDGPU::S_SEXT_I32_I8 : AMDGPU::S_SEXT_I32_I16;
2746 BuildMI(BB&: MBB, I, MIMD: DL, MCID: TII.get(Opcode: SextOpc), DestReg: DstReg)
2747 .addReg(RegNo: SrcReg);
2748 I.eraseFromParent();
2749 return RBI.constrainGenericRegister(Reg: DstReg, RC: AMDGPU::SReg_32RegClass, MRI&: *MRI);
2750 }
2751
2752 // Using a single 32-bit SALU to calculate the high half is smaller than
2753 // S_BFE with a literal constant operand.
2754 if (DstSize > 32 && SrcSize == 32) {
2755 Register HiReg = MRI->createVirtualRegister(RegClass: &AMDGPU::SReg_32RegClass);
2756 unsigned SubReg = InReg ? AMDGPU::sub0 : AMDGPU::NoSubRegister;
2757 if (Signed) {
2758 BuildMI(BB&: MBB, I, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::S_ASHR_I32), DestReg: HiReg)
2759 .addReg(RegNo: SrcReg, Flags: {}, SubReg)
2760 .addImm(Val: 31)
2761 .setOperandDead(3); // Dead scc
2762 } else {
2763 BuildMI(BB&: MBB, I, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::S_MOV_B32), DestReg: HiReg)
2764 .addImm(Val: 0);
2765 }
2766 BuildMI(BB&: MBB, I, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::REG_SEQUENCE), DestReg: DstReg)
2767 .addReg(RegNo: SrcReg, Flags: {}, SubReg)
2768 .addImm(Val: AMDGPU::sub0)
2769 .addReg(RegNo: HiReg)
2770 .addImm(Val: AMDGPU::sub1);
2771 I.eraseFromParent();
2772 return RBI.constrainGenericRegister(Reg: DstReg, RC: AMDGPU::SReg_64RegClass,
2773 MRI&: *MRI);
2774 }
2775
2776 const unsigned BFE64 = Signed ? AMDGPU::S_BFE_I64 : AMDGPU::S_BFE_U64;
2777 const unsigned BFE32 = Signed ? AMDGPU::S_BFE_I32 : AMDGPU::S_BFE_U32;
2778
2779 // Scalar BFE is encoded as S1[5:0] = offset, S1[22:16]= width.
2780 if (DstSize > 32 && (SrcSize <= 32 || InReg)) {
2781 // We need a 64-bit register source, but the high bits don't matter.
2782 Register ExtReg = MRI->createVirtualRegister(RegClass: &AMDGPU::SReg_64RegClass);
2783 Register UndefReg = MRI->createVirtualRegister(RegClass: &AMDGPU::SReg_32RegClass);
2784 unsigned SubReg = InReg ? AMDGPU::sub0 : AMDGPU::NoSubRegister;
2785
2786 BuildMI(BB&: MBB, I, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::IMPLICIT_DEF), DestReg: UndefReg);
2787 BuildMI(BB&: MBB, I, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::REG_SEQUENCE), DestReg: ExtReg)
2788 .addReg(RegNo: SrcReg, Flags: {}, SubReg)
2789 .addImm(Val: AMDGPU::sub0)
2790 .addReg(RegNo: UndefReg)
2791 .addImm(Val: AMDGPU::sub1);
2792
2793 BuildMI(BB&: MBB, I, MIMD: DL, MCID: TII.get(Opcode: BFE64), DestReg: DstReg)
2794 .addReg(RegNo: ExtReg)
2795 .addImm(Val: SrcSize << 16);
2796
2797 I.eraseFromParent();
2798 return RBI.constrainGenericRegister(Reg: DstReg, RC: AMDGPU::SReg_64RegClass, MRI&: *MRI);
2799 }
2800
2801 unsigned Mask;
2802 if (!Signed && shouldUseAndMask(Size: SrcSize, Mask)) {
2803 BuildMI(BB&: MBB, I, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::S_AND_B32), DestReg: DstReg)
2804 .addReg(RegNo: SrcReg)
2805 .addImm(Val: Mask)
2806 .setOperandDead(3); // Dead scc
2807 } else {
2808 BuildMI(BB&: MBB, I, MIMD: DL, MCID: TII.get(Opcode: BFE32), DestReg: DstReg)
2809 .addReg(RegNo: SrcReg)
2810 .addImm(Val: SrcSize << 16);
2811 }
2812
2813 I.eraseFromParent();
2814 return RBI.constrainGenericRegister(Reg: DstReg, RC: AMDGPU::SReg_32RegClass, MRI&: *MRI);
2815 }
2816
2817 return false;
2818}
2819
2820static Register stripCopy(Register Reg, MachineRegisterInfo &MRI) {
2821 return getDefSrcRegIgnoringCopies(Reg, MRI)->Reg;
2822}
2823
2824static Register stripBitCast(Register Reg, MachineRegisterInfo &MRI) {
2825 Register BitcastSrc;
2826 if (mi_match(R: Reg, MRI, P: m_GBitcast(Src: m_Reg(R&: BitcastSrc))))
2827 Reg = BitcastSrc;
2828 return Reg;
2829}
2830
2831static bool isExtractHiElt(MachineRegisterInfo &MRI, Register In,
2832 Register &Out) {
2833 Register Trunc;
2834 if (!mi_match(R: In, MRI, P: m_GTrunc(Src: m_Reg(R&: Trunc))))
2835 return false;
2836
2837 Register LShlSrc;
2838 Register Cst;
2839 if (mi_match(R: Trunc, MRI, P: m_GLShr(L: m_Reg(R&: LShlSrc), R: m_Reg(R&: Cst)))) {
2840 Cst = stripCopy(Reg: Cst, MRI);
2841 if (mi_match(R: Cst, MRI, P: m_SpecificICst(RequestedValue: 16))) {
2842 Out = stripBitCast(Reg: LShlSrc, MRI);
2843 return true;
2844 }
2845 }
2846
2847 MachineInstr *Shuffle = MRI.getVRegDef(Reg: Trunc);
2848 if (Shuffle->getOpcode() != AMDGPU::G_SHUFFLE_VECTOR)
2849 return false;
2850
2851 assert(MRI.getType(Shuffle->getOperand(0).getReg()) ==
2852 LLT::fixed_vector(2, 16));
2853
2854 ArrayRef<int> Mask = Shuffle->getOperand(i: 3).getShuffleMask();
2855 assert(Mask.size() == 2);
2856
2857 if (Mask[0] == 1 && Mask[1] <= 1) {
2858 Out = Shuffle->getOperand(i: 0).getReg();
2859 return true;
2860 }
2861
2862 return false;
2863}
2864
2865bool AMDGPUInstructionSelector::selectG_FPEXT(MachineInstr &I) const {
2866 if (!Subtarget->hasSALUFloatInsts())
2867 return false;
2868
2869 Register Dst = I.getOperand(i: 0).getReg();
2870 const RegisterBank *DstRB = RBI.getRegBank(Reg: Dst, MRI: *MRI, TRI);
2871 if (DstRB->getID() != AMDGPU::SGPRRegBankID)
2872 return false;
2873
2874 Register Src = I.getOperand(i: 1).getReg();
2875
2876 if (MRI->getType(Reg: Dst) == LLT::scalar(SizeInBits: 32) &&
2877 MRI->getType(Reg: Src) == LLT::scalar(SizeInBits: 16)) {
2878 if (isExtractHiElt(MRI&: *MRI, In: Src, Out&: Src)) {
2879 MachineBasicBlock *BB = I.getParent();
2880 BuildMI(BB&: *BB, I: &I, MIMD: I.getDebugLoc(), MCID: TII.get(Opcode: AMDGPU::S_CVT_HI_F32_F16), DestReg: Dst)
2881 .addUse(RegNo: Src);
2882 I.eraseFromParent();
2883 return RBI.constrainGenericRegister(Reg: Dst, RC: AMDGPU::SReg_32RegClass, MRI&: *MRI);
2884 }
2885 }
2886
2887 return false;
2888}
2889
2890bool AMDGPUInstructionSelector::selectG_FNEG(MachineInstr &MI) const {
2891 // Only manually handle the f64 SGPR case.
2892 //
2893 // FIXME: This is a workaround for 2.5 different tablegen problems. Because
2894 // the bit ops theoretically have a second result due to the implicit def of
2895 // SCC, the GlobalISelEmitter is overly conservative and rejects it. Fixing
2896 // that is easy by disabling the check. The result works, but uses a
2897 // nonsensical sreg32orlds_and_sreg_1 regclass.
2898 //
2899 // The DAG emitter is more problematic, and incorrectly adds both S_XOR_B32 to
2900 // the variadic REG_SEQUENCE operands.
2901
2902 Register Dst = MI.getOperand(i: 0).getReg();
2903 const RegisterBank *DstRB = RBI.getRegBank(Reg: Dst, MRI: *MRI, TRI);
2904 if (DstRB->getID() != AMDGPU::SGPRRegBankID ||
2905 MRI->getType(Reg: Dst) != LLT::scalar(SizeInBits: 64))
2906 return false;
2907
2908 Register Src = MI.getOperand(i: 1).getReg();
2909 MachineInstr *Fabs = getOpcodeDef(Opcode: TargetOpcode::G_FABS, Reg: Src, MRI: *MRI);
2910 if (Fabs)
2911 Src = Fabs->getOperand(i: 1).getReg();
2912
2913 if (!RBI.constrainGenericRegister(Reg: Src, RC: AMDGPU::SReg_64RegClass, MRI&: *MRI) ||
2914 !RBI.constrainGenericRegister(Reg: Dst, RC: AMDGPU::SReg_64RegClass, MRI&: *MRI))
2915 return false;
2916
2917 MachineBasicBlock *BB = MI.getParent();
2918 const DebugLoc &DL = MI.getDebugLoc();
2919 Register LoReg = MRI->createVirtualRegister(RegClass: &AMDGPU::SReg_32RegClass);
2920 Register HiReg = MRI->createVirtualRegister(RegClass: &AMDGPU::SReg_32RegClass);
2921 Register ConstReg = MRI->createVirtualRegister(RegClass: &AMDGPU::SReg_32RegClass);
2922 Register OpReg = MRI->createVirtualRegister(RegClass: &AMDGPU::SReg_32RegClass);
2923
2924 BuildMI(BB&: *BB, I: &MI, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::COPY), DestReg: LoReg)
2925 .addReg(RegNo: Src, Flags: {}, SubReg: AMDGPU::sub0);
2926 BuildMI(BB&: *BB, I: &MI, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::COPY), DestReg: HiReg)
2927 .addReg(RegNo: Src, Flags: {}, SubReg: AMDGPU::sub1);
2928 BuildMI(BB&: *BB, I: &MI, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::S_MOV_B32), DestReg: ConstReg)
2929 .addImm(Val: 0x80000000);
2930
2931 // Set or toggle sign bit.
2932 unsigned Opc = Fabs ? AMDGPU::S_OR_B32 : AMDGPU::S_XOR_B32;
2933 BuildMI(BB&: *BB, I: &MI, MIMD: DL, MCID: TII.get(Opcode: Opc), DestReg: OpReg)
2934 .addReg(RegNo: HiReg)
2935 .addReg(RegNo: ConstReg)
2936 .setOperandDead(3); // Dead scc
2937 BuildMI(BB&: *BB, I: &MI, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::REG_SEQUENCE), DestReg: Dst)
2938 .addReg(RegNo: LoReg)
2939 .addImm(Val: AMDGPU::sub0)
2940 .addReg(RegNo: OpReg)
2941 .addImm(Val: AMDGPU::sub1);
2942 MI.eraseFromParent();
2943 return true;
2944}
2945
2946// FIXME: This is a workaround for the same tablegen problems as G_FNEG
2947bool AMDGPUInstructionSelector::selectG_FABS(MachineInstr &MI) const {
2948 Register Dst = MI.getOperand(i: 0).getReg();
2949 const RegisterBank *DstRB = RBI.getRegBank(Reg: Dst, MRI: *MRI, TRI);
2950 if (DstRB->getID() != AMDGPU::SGPRRegBankID ||
2951 MRI->getType(Reg: Dst) != LLT::scalar(SizeInBits: 64))
2952 return false;
2953
2954 Register Src = MI.getOperand(i: 1).getReg();
2955 MachineBasicBlock *BB = MI.getParent();
2956 const DebugLoc &DL = MI.getDebugLoc();
2957 Register LoReg = MRI->createVirtualRegister(RegClass: &AMDGPU::SReg_32RegClass);
2958 Register HiReg = MRI->createVirtualRegister(RegClass: &AMDGPU::SReg_32RegClass);
2959 Register ConstReg = MRI->createVirtualRegister(RegClass: &AMDGPU::SReg_32RegClass);
2960 Register OpReg = MRI->createVirtualRegister(RegClass: &AMDGPU::SReg_32RegClass);
2961
2962 if (!RBI.constrainGenericRegister(Reg: Src, RC: AMDGPU::SReg_64RegClass, MRI&: *MRI) ||
2963 !RBI.constrainGenericRegister(Reg: Dst, RC: AMDGPU::SReg_64RegClass, MRI&: *MRI))
2964 return false;
2965
2966 BuildMI(BB&: *BB, I: &MI, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::COPY), DestReg: LoReg)
2967 .addReg(RegNo: Src, Flags: {}, SubReg: AMDGPU::sub0);
2968 BuildMI(BB&: *BB, I: &MI, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::COPY), DestReg: HiReg)
2969 .addReg(RegNo: Src, Flags: {}, SubReg: AMDGPU::sub1);
2970 BuildMI(BB&: *BB, I: &MI, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::S_MOV_B32), DestReg: ConstReg)
2971 .addImm(Val: 0x7fffffff);
2972
2973 // Clear sign bit.
2974 // TODO: Should this used S_BITSET0_*?
2975 BuildMI(BB&: *BB, I: &MI, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::S_AND_B32), DestReg: OpReg)
2976 .addReg(RegNo: HiReg)
2977 .addReg(RegNo: ConstReg)
2978 .setOperandDead(3); // Dead scc
2979 BuildMI(BB&: *BB, I: &MI, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::REG_SEQUENCE), DestReg: Dst)
2980 .addReg(RegNo: LoReg)
2981 .addImm(Val: AMDGPU::sub0)
2982 .addReg(RegNo: OpReg)
2983 .addImm(Val: AMDGPU::sub1);
2984
2985 MI.eraseFromParent();
2986 return true;
2987}
2988
2989static bool isConstant(const MachineInstr &MI) {
2990 return MI.getOpcode() == TargetOpcode::G_CONSTANT;
2991}
2992
2993void AMDGPUInstructionSelector::getAddrModeInfo(const MachineInstr &Load,
2994 const MachineRegisterInfo &MRI, SmallVectorImpl<GEPInfo> &AddrInfo) const {
2995
2996 unsigned OpNo = Load.getOpcode() == AMDGPU::G_PREFETCH ? 0 : 1;
2997 const MachineInstr *PtrMI =
2998 MRI.getUniqueVRegDef(Reg: Load.getOperand(i: OpNo).getReg());
2999
3000 assert(PtrMI);
3001
3002 if (PtrMI->getOpcode() != TargetOpcode::G_PTR_ADD)
3003 return;
3004
3005 GEPInfo GEPInfo;
3006
3007 for (unsigned i = 1; i != 3; ++i) {
3008 const MachineOperand &GEPOp = PtrMI->getOperand(i);
3009 const MachineInstr *OpDef = MRI.getUniqueVRegDef(Reg: GEPOp.getReg());
3010 assert(OpDef);
3011 if (i == 2 && isConstant(MI: *OpDef)) {
3012 // TODO: Could handle constant base + variable offset, but a combine
3013 // probably should have commuted it.
3014 assert(GEPInfo.Imm == 0);
3015 GEPInfo.Imm = OpDef->getOperand(i: 1).getCImm()->getSExtValue();
3016 continue;
3017 }
3018 const RegisterBank *OpBank = RBI.getRegBank(Reg: GEPOp.getReg(), MRI, TRI);
3019 if (OpBank->getID() == AMDGPU::SGPRRegBankID)
3020 GEPInfo.SgprParts.push_back(Elt: GEPOp.getReg());
3021 else
3022 GEPInfo.VgprParts.push_back(Elt: GEPOp.getReg());
3023 }
3024
3025 AddrInfo.push_back(Elt: GEPInfo);
3026 getAddrModeInfo(Load: *PtrMI, MRI, AddrInfo);
3027}
3028
3029bool AMDGPUInstructionSelector::isSGPR(Register Reg) const {
3030 return RBI.getRegBank(Reg, MRI: *MRI, TRI)->getID() == AMDGPU::SGPRRegBankID;
3031}
3032
3033bool AMDGPUInstructionSelector::isInstrUniform(const MachineInstr &MI) const {
3034 if (!MI.hasOneMemOperand())
3035 return false;
3036
3037 const MachineMemOperand *MMO = *MI.memoperands_begin();
3038 const Value *Ptr = MMO->getValue();
3039
3040 // UndefValue means this is a load of a kernel input. These are uniform.
3041 // Sometimes LDS instructions have constant pointers.
3042 // If Ptr is null, then that means this mem operand contains a
3043 // PseudoSourceValue like GOT.
3044 if (!Ptr || isa<UndefValue, Argument, Constant, GlobalValue>(Val: Ptr))
3045 return true;
3046
3047 if (MMO->getAddrSpace() == AMDGPUAS::CONSTANT_ADDRESS_32BIT)
3048 return true;
3049
3050 if (MI.getOpcode() == AMDGPU::G_PREFETCH)
3051 return RBI.getRegBank(Reg: MI.getOperand(i: 0).getReg(), MRI: *MRI, TRI)->getID() ==
3052 AMDGPU::SGPRRegBankID;
3053
3054 const Instruction *I = dyn_cast<Instruction>(Val: Ptr);
3055 return I && I->getMetadata(Kind: "amdgpu.uniform");
3056}
3057
3058bool AMDGPUInstructionSelector::hasVgprParts(ArrayRef<GEPInfo> AddrInfo) const {
3059 for (const GEPInfo &GEPInfo : AddrInfo) {
3060 if (!GEPInfo.VgprParts.empty())
3061 return true;
3062 }
3063 return false;
3064}
3065
3066void AMDGPUInstructionSelector::initM0(MachineInstr &I) const {
3067 const LLT PtrTy = MRI->getType(Reg: I.getOperand(i: 1).getReg());
3068 unsigned AS = PtrTy.getAddressSpace();
3069 if ((AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) &&
3070 STI.ldsRequiresM0Init()) {
3071 MachineBasicBlock *BB = I.getParent();
3072
3073 // If DS instructions require M0 initialization, insert it before selecting.
3074 BuildMI(BB&: *BB, I: &I, MIMD: I.getDebugLoc(), MCID: TII.get(Opcode: AMDGPU::S_MOV_B32), DestReg: AMDGPU::M0)
3075 .addImm(Val: -1);
3076 }
3077}
3078
3079bool AMDGPUInstructionSelector::selectG_LOAD_STORE_ATOMICRMW(
3080 MachineInstr &I) const {
3081 initM0(I);
3082 return selectImpl(I, CoverageInfo&: *CoverageInfo);
3083}
3084
3085static bool isVCmpResult(Register Reg, MachineRegisterInfo &MRI) {
3086 if (Reg.isPhysical())
3087 return false;
3088
3089 MachineInstr &MI = *MRI.getUniqueVRegDef(Reg);
3090 const unsigned Opcode = MI.getOpcode();
3091
3092 if (Opcode == AMDGPU::COPY)
3093 return isVCmpResult(Reg: MI.getOperand(i: 1).getReg(), MRI);
3094
3095 if (Opcode == AMDGPU::G_AND || Opcode == AMDGPU::G_OR ||
3096 Opcode == AMDGPU::G_XOR)
3097 return isVCmpResult(Reg: MI.getOperand(i: 1).getReg(), MRI) &&
3098 isVCmpResult(Reg: MI.getOperand(i: 2).getReg(), MRI);
3099
3100 if (auto *GI = dyn_cast<GIntrinsic>(Val: &MI))
3101 return GI->is(ID: Intrinsic::amdgcn_class);
3102
3103 return Opcode == AMDGPU::G_ICMP || Opcode == AMDGPU::G_FCMP;
3104}
3105
3106bool AMDGPUInstructionSelector::selectG_BRCOND(MachineInstr &I) const {
3107 MachineBasicBlock *BB = I.getParent();
3108 MachineOperand &CondOp = I.getOperand(i: 0);
3109 Register CondReg = CondOp.getReg();
3110 const DebugLoc &DL = I.getDebugLoc();
3111
3112 unsigned BrOpcode;
3113 Register CondPhysReg;
3114 const TargetRegisterClass *ConstrainRC;
3115
3116 // In SelectionDAG, we inspect the IR block for uniformity metadata to decide
3117 // whether the branch is uniform when selecting the instruction. In
3118 // GlobalISel, we should push that decision into RegBankSelect. Assume for now
3119 // RegBankSelect knows what it's doing if the branch condition is scc, even
3120 // though it currently does not.
3121 if (!isVCC(Reg: CondReg, MRI: *MRI)) {
3122 if (MRI->getType(Reg: CondReg) != LLT::scalar(SizeInBits: 32))
3123 return false;
3124
3125 CondPhysReg = AMDGPU::SCC;
3126 BrOpcode = AMDGPU::S_CBRANCH_SCC1;
3127 ConstrainRC = &AMDGPU::SReg_32RegClass;
3128 } else {
3129 // FIXME: Should scc->vcc copies and with exec?
3130
3131 // Unless the value of CondReg is a result of a V_CMP* instruction then we
3132 // need to insert an and with exec.
3133 if (!isVCmpResult(Reg: CondReg, MRI&: *MRI)) {
3134 const bool Is64 = STI.isWave64();
3135 const unsigned Opcode = Is64 ? AMDGPU::S_AND_B64 : AMDGPU::S_AND_B32;
3136 const Register Exec = Is64 ? AMDGPU::EXEC : AMDGPU::EXEC_LO;
3137
3138 Register TmpReg = MRI->createVirtualRegister(RegClass: TRI.getBoolRC());
3139 BuildMI(BB&: *BB, I: &I, MIMD: DL, MCID: TII.get(Opcode), DestReg: TmpReg)
3140 .addReg(RegNo: CondReg)
3141 .addReg(RegNo: Exec)
3142 .setOperandDead(3); // Dead scc
3143 CondReg = TmpReg;
3144 }
3145
3146 CondPhysReg = TRI.getVCC();
3147 BrOpcode = AMDGPU::S_CBRANCH_VCCNZ;
3148 ConstrainRC = TRI.getBoolRC();
3149 }
3150
3151 if (!MRI->getRegClassOrNull(Reg: CondReg))
3152 MRI->setRegClass(Reg: CondReg, RC: ConstrainRC);
3153
3154 BuildMI(BB&: *BB, I: &I, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::COPY), DestReg: CondPhysReg)
3155 .addReg(RegNo: CondReg);
3156 BuildMI(BB&: *BB, I: &I, MIMD: DL, MCID: TII.get(Opcode: BrOpcode))
3157 .addMBB(MBB: I.getOperand(i: 1).getMBB());
3158
3159 I.eraseFromParent();
3160 return true;
3161}
3162
3163bool AMDGPUInstructionSelector::selectG_GLOBAL_VALUE(
3164 MachineInstr &I) const {
3165 Register DstReg = I.getOperand(i: 0).getReg();
3166 const RegisterBank *DstRB = RBI.getRegBank(Reg: DstReg, MRI: *MRI, TRI);
3167 const bool IsVGPR = DstRB->getID() == AMDGPU::VGPRRegBankID;
3168 I.setDesc(TII.get(Opcode: IsVGPR ? AMDGPU::V_MOV_B32_e32 : AMDGPU::S_MOV_B32));
3169 if (IsVGPR)
3170 I.addOperand(MF&: *MF, Op: MachineOperand::CreateReg(Reg: AMDGPU::EXEC, isDef: false, isImp: true));
3171
3172 return RBI.constrainGenericRegister(
3173 Reg: DstReg, RC: IsVGPR ? AMDGPU::VGPR_32RegClass : AMDGPU::SReg_32RegClass, MRI&: *MRI);
3174}
3175
3176bool AMDGPUInstructionSelector::selectG_PTRMASK(MachineInstr &I) const {
3177 Register DstReg = I.getOperand(i: 0).getReg();
3178 Register SrcReg = I.getOperand(i: 1).getReg();
3179 Register MaskReg = I.getOperand(i: 2).getReg();
3180 LLT Ty = MRI->getType(Reg: DstReg);
3181 LLT MaskTy = MRI->getType(Reg: MaskReg);
3182 MachineBasicBlock *BB = I.getParent();
3183 const DebugLoc &DL = I.getDebugLoc();
3184
3185 const RegisterBank *DstRB = RBI.getRegBank(Reg: DstReg, MRI: *MRI, TRI);
3186 const RegisterBank *SrcRB = RBI.getRegBank(Reg: SrcReg, MRI: *MRI, TRI);
3187 const RegisterBank *MaskRB = RBI.getRegBank(Reg: MaskReg, MRI: *MRI, TRI);
3188 const bool IsVGPR = DstRB->getID() == AMDGPU::VGPRRegBankID;
3189 if (DstRB != SrcRB) // Should only happen for hand written MIR.
3190 return false;
3191
3192 // Try to avoid emitting a bit operation when we only need to touch half of
3193 // the 64-bit pointer.
3194 APInt MaskOnes = VT->getKnownOnes(R: MaskReg).zext(width: 64);
3195 const APInt MaskHi32 = APInt::getHighBitsSet(numBits: 64, hiBitsSet: 32);
3196 const APInt MaskLo32 = APInt::getLowBitsSet(numBits: 64, loBitsSet: 32);
3197
3198 const bool CanCopyLow32 = (MaskOnes & MaskLo32) == MaskLo32;
3199 const bool CanCopyHi32 = (MaskOnes & MaskHi32) == MaskHi32;
3200
3201 if (!IsVGPR && Ty.getSizeInBits() == 64 &&
3202 !CanCopyLow32 && !CanCopyHi32) {
3203 auto MIB = BuildMI(BB&: *BB, I: &I, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::S_AND_B64), DestReg: DstReg)
3204 .addReg(RegNo: SrcReg)
3205 .addReg(RegNo: MaskReg)
3206 .setOperandDead(3); // Dead scc
3207 I.eraseFromParent();
3208 constrainSelectedInstRegOperands(I&: *MIB, TII, TRI, RBI);
3209 return true;
3210 }
3211
3212 unsigned NewOpc = IsVGPR ? AMDGPU::V_AND_B32_e64 : AMDGPU::S_AND_B32;
3213 const TargetRegisterClass &RegRC
3214 = IsVGPR ? AMDGPU::VGPR_32RegClass : AMDGPU::SReg_32RegClass;
3215
3216 const TargetRegisterClass *DstRC = TRI.getRegClassForTypeOnBank(Ty, Bank: *DstRB);
3217 const TargetRegisterClass *SrcRC = TRI.getRegClassForTypeOnBank(Ty, Bank: *SrcRB);
3218 const TargetRegisterClass *MaskRC =
3219 TRI.getRegClassForTypeOnBank(Ty: MaskTy, Bank: *MaskRB);
3220
3221 if (!RBI.constrainGenericRegister(Reg: DstReg, RC: *DstRC, MRI&: *MRI) ||
3222 !RBI.constrainGenericRegister(Reg: SrcReg, RC: *SrcRC, MRI&: *MRI) ||
3223 !RBI.constrainGenericRegister(Reg: MaskReg, RC: *MaskRC, MRI&: *MRI))
3224 return false;
3225
3226 if (Ty.getSizeInBits() == 32) {
3227 assert(MaskTy.getSizeInBits() == 32 &&
3228 "ptrmask should have been narrowed during legalize");
3229
3230 auto NewOp = BuildMI(BB&: *BB, I: &I, MIMD: DL, MCID: TII.get(Opcode: NewOpc), DestReg: DstReg)
3231 .addReg(RegNo: SrcReg)
3232 .addReg(RegNo: MaskReg);
3233
3234 if (!IsVGPR)
3235 NewOp.setOperandDead(3); // Dead scc
3236 I.eraseFromParent();
3237 return true;
3238 }
3239
3240 Register HiReg = MRI->createVirtualRegister(RegClass: &RegRC);
3241 Register LoReg = MRI->createVirtualRegister(RegClass: &RegRC);
3242
3243 // Extract the subregisters from the source pointer.
3244 BuildMI(BB&: *BB, I: &I, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::COPY), DestReg: LoReg)
3245 .addReg(RegNo: SrcReg, Flags: {}, SubReg: AMDGPU::sub0);
3246 BuildMI(BB&: *BB, I: &I, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::COPY), DestReg: HiReg)
3247 .addReg(RegNo: SrcReg, Flags: {}, SubReg: AMDGPU::sub1);
3248
3249 Register MaskedLo, MaskedHi;
3250
3251 if (CanCopyLow32) {
3252 // If all the bits in the low half are 1, we only need a copy for it.
3253 MaskedLo = LoReg;
3254 } else {
3255 // Extract the mask subregister and apply the and.
3256 Register MaskLo = MRI->createVirtualRegister(RegClass: &RegRC);
3257 MaskedLo = MRI->createVirtualRegister(RegClass: &RegRC);
3258
3259 BuildMI(BB&: *BB, I: &I, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::COPY), DestReg: MaskLo)
3260 .addReg(RegNo: MaskReg, Flags: {}, SubReg: AMDGPU::sub0);
3261 BuildMI(BB&: *BB, I: &I, MIMD: DL, MCID: TII.get(Opcode: NewOpc), DestReg: MaskedLo)
3262 .addReg(RegNo: LoReg)
3263 .addReg(RegNo: MaskLo);
3264 }
3265
3266 if (CanCopyHi32) {
3267 // If all the bits in the high half are 1, we only need a copy for it.
3268 MaskedHi = HiReg;
3269 } else {
3270 Register MaskHi = MRI->createVirtualRegister(RegClass: &RegRC);
3271 MaskedHi = MRI->createVirtualRegister(RegClass: &RegRC);
3272
3273 BuildMI(BB&: *BB, I: &I, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::COPY), DestReg: MaskHi)
3274 .addReg(RegNo: MaskReg, Flags: {}, SubReg: AMDGPU::sub1);
3275 BuildMI(BB&: *BB, I: &I, MIMD: DL, MCID: TII.get(Opcode: NewOpc), DestReg: MaskedHi)
3276 .addReg(RegNo: HiReg)
3277 .addReg(RegNo: MaskHi);
3278 }
3279
3280 BuildMI(BB&: *BB, I: &I, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::REG_SEQUENCE), DestReg: DstReg)
3281 .addReg(RegNo: MaskedLo)
3282 .addImm(Val: AMDGPU::sub0)
3283 .addReg(RegNo: MaskedHi)
3284 .addImm(Val: AMDGPU::sub1);
3285 I.eraseFromParent();
3286 return true;
3287}
3288
3289/// Return the register to use for the index value, and the subregister to use
3290/// for the indirectly accessed register.
3291static std::pair<Register, unsigned>
3292computeIndirectRegIndex(MachineRegisterInfo &MRI, const SIRegisterInfo &TRI,
3293 const TargetRegisterClass *SuperRC, Register IdxReg,
3294 unsigned EltSize, GISelValueTracking &ValueTracking) {
3295 Register IdxBaseReg;
3296 int Offset;
3297
3298 std::tie(args&: IdxBaseReg, args&: Offset) =
3299 AMDGPU::getBaseWithConstantOffset(MRI, Reg: IdxReg, ValueTracking: &ValueTracking);
3300 if (IdxBaseReg == AMDGPU::NoRegister) {
3301 // This will happen if the index is a known constant. This should ordinarily
3302 // be legalized out, but handle it as a register just in case.
3303 assert(Offset == 0);
3304 IdxBaseReg = IdxReg;
3305 }
3306
3307 ArrayRef<int16_t> SubRegs = TRI.getRegSplitParts(RC: SuperRC, EltSize);
3308
3309 // Skip out of bounds offsets, or else we would end up using an undefined
3310 // register.
3311 if (static_cast<unsigned>(Offset) >= SubRegs.size())
3312 return std::pair(IdxReg, SubRegs[0]);
3313 return std::pair(IdxBaseReg, SubRegs[Offset]);
3314}
3315
3316bool AMDGPUInstructionSelector::selectG_EXTRACT_VECTOR_ELT(
3317 MachineInstr &MI) const {
3318 Register DstReg = MI.getOperand(i: 0).getReg();
3319 Register SrcReg = MI.getOperand(i: 1).getReg();
3320 Register IdxReg = MI.getOperand(i: 2).getReg();
3321
3322 LLT DstTy = MRI->getType(Reg: DstReg);
3323 LLT SrcTy = MRI->getType(Reg: SrcReg);
3324
3325 const RegisterBank *DstRB = RBI.getRegBank(Reg: DstReg, MRI: *MRI, TRI);
3326 const RegisterBank *SrcRB = RBI.getRegBank(Reg: SrcReg, MRI: *MRI, TRI);
3327 const RegisterBank *IdxRB = RBI.getRegBank(Reg: IdxReg, MRI: *MRI, TRI);
3328
3329 // The index must be scalar. If it wasn't RegBankSelect should have moved this
3330 // into a waterfall loop.
3331 if (IdxRB->getID() != AMDGPU::SGPRRegBankID)
3332 return false;
3333
3334 const TargetRegisterClass *SrcRC =
3335 TRI.getRegClassForTypeOnBank(Ty: SrcTy, Bank: *SrcRB);
3336 const TargetRegisterClass *DstRC =
3337 TRI.getRegClassForTypeOnBank(Ty: DstTy, Bank: *DstRB);
3338 if (!SrcRC || !DstRC)
3339 return false;
3340 if (!RBI.constrainGenericRegister(Reg: SrcReg, RC: *SrcRC, MRI&: *MRI) ||
3341 !RBI.constrainGenericRegister(Reg: DstReg, RC: *DstRC, MRI&: *MRI) ||
3342 !RBI.constrainGenericRegister(Reg: IdxReg, RC: AMDGPU::SReg_32RegClass, MRI&: *MRI))
3343 return false;
3344
3345 MachineBasicBlock *BB = MI.getParent();
3346 const DebugLoc &DL = MI.getDebugLoc();
3347 const bool Is64 = DstTy.getSizeInBits() == 64;
3348
3349 unsigned SubReg;
3350 std::tie(args&: IdxReg, args&: SubReg) = computeIndirectRegIndex(
3351 MRI&: *MRI, TRI, SuperRC: SrcRC, IdxReg, EltSize: DstTy.getSizeInBits() / 8, ValueTracking&: *VT);
3352
3353 if (SrcRB->getID() == AMDGPU::SGPRRegBankID) {
3354 if (DstTy.getSizeInBits() != 32 && !Is64)
3355 return false;
3356
3357 BuildMI(BB&: *BB, I: &MI, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::COPY), DestReg: AMDGPU::M0)
3358 .addReg(RegNo: IdxReg);
3359
3360 unsigned Opc = Is64 ? AMDGPU::S_MOVRELS_B64 : AMDGPU::S_MOVRELS_B32;
3361 BuildMI(BB&: *BB, I: &MI, MIMD: DL, MCID: TII.get(Opcode: Opc), DestReg: DstReg)
3362 .addReg(RegNo: SrcReg, Flags: {}, SubReg)
3363 .addReg(RegNo: SrcReg, Flags: RegState::Implicit);
3364 MI.eraseFromParent();
3365 return true;
3366 }
3367
3368 if (SrcRB->getID() != AMDGPU::VGPRRegBankID || DstTy.getSizeInBits() != 32)
3369 return false;
3370
3371 if (!STI.useVGPRIndexMode()) {
3372 BuildMI(BB&: *BB, I: &MI, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::COPY), DestReg: AMDGPU::M0)
3373 .addReg(RegNo: IdxReg);
3374 BuildMI(BB&: *BB, I: &MI, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::V_MOVRELS_B32_e32), DestReg: DstReg)
3375 .addReg(RegNo: SrcReg, Flags: {}, SubReg)
3376 .addReg(RegNo: SrcReg, Flags: RegState::Implicit);
3377 MI.eraseFromParent();
3378 return true;
3379 }
3380
3381 const MCInstrDesc &GPRIDXDesc =
3382 TII.getIndirectGPRIDXPseudo(VecSize: TRI.getRegSizeInBits(RC: *SrcRC), IsIndirectSrc: true);
3383 BuildMI(BB&: *BB, I&: MI, MIMD: DL, MCID: GPRIDXDesc, DestReg: DstReg)
3384 .addReg(RegNo: SrcReg)
3385 .addReg(RegNo: IdxReg)
3386 .addImm(Val: SubReg);
3387
3388 MI.eraseFromParent();
3389 return true;
3390}
3391
3392// TODO: Fold insert_vector_elt (extract_vector_elt) into movrelsd
3393bool AMDGPUInstructionSelector::selectG_INSERT_VECTOR_ELT(
3394 MachineInstr &MI) const {
3395 Register DstReg = MI.getOperand(i: 0).getReg();
3396 Register VecReg = MI.getOperand(i: 1).getReg();
3397 Register ValReg = MI.getOperand(i: 2).getReg();
3398 Register IdxReg = MI.getOperand(i: 3).getReg();
3399
3400 LLT VecTy = MRI->getType(Reg: DstReg);
3401 LLT ValTy = MRI->getType(Reg: ValReg);
3402 unsigned VecSize = VecTy.getSizeInBits();
3403 unsigned ValSize = ValTy.getSizeInBits();
3404
3405 const RegisterBank *VecRB = RBI.getRegBank(Reg: VecReg, MRI: *MRI, TRI);
3406 const RegisterBank *ValRB = RBI.getRegBank(Reg: ValReg, MRI: *MRI, TRI);
3407 const RegisterBank *IdxRB = RBI.getRegBank(Reg: IdxReg, MRI: *MRI, TRI);
3408
3409 assert(VecTy.getElementType() == ValTy);
3410
3411 // The index must be scalar. If it wasn't RegBankSelect should have moved this
3412 // into a waterfall loop.
3413 if (IdxRB->getID() != AMDGPU::SGPRRegBankID)
3414 return false;
3415
3416 const TargetRegisterClass *VecRC =
3417 TRI.getRegClassForTypeOnBank(Ty: VecTy, Bank: *VecRB);
3418 const TargetRegisterClass *ValRC =
3419 TRI.getRegClassForTypeOnBank(Ty: ValTy, Bank: *ValRB);
3420
3421 if (!RBI.constrainGenericRegister(Reg: VecReg, RC: *VecRC, MRI&: *MRI) ||
3422 !RBI.constrainGenericRegister(Reg: DstReg, RC: *VecRC, MRI&: *MRI) ||
3423 !RBI.constrainGenericRegister(Reg: ValReg, RC: *ValRC, MRI&: *MRI) ||
3424 !RBI.constrainGenericRegister(Reg: IdxReg, RC: AMDGPU::SReg_32RegClass, MRI&: *MRI))
3425 return false;
3426
3427 if (VecRB->getID() == AMDGPU::VGPRRegBankID && ValSize != 32)
3428 return false;
3429
3430 unsigned SubReg;
3431 std::tie(args&: IdxReg, args&: SubReg) =
3432 computeIndirectRegIndex(MRI&: *MRI, TRI, SuperRC: VecRC, IdxReg, EltSize: ValSize / 8, ValueTracking&: *VT);
3433
3434 const bool IndexMode = VecRB->getID() == AMDGPU::VGPRRegBankID &&
3435 STI.useVGPRIndexMode();
3436
3437 MachineBasicBlock *BB = MI.getParent();
3438 const DebugLoc &DL = MI.getDebugLoc();
3439
3440 if (!IndexMode) {
3441 BuildMI(BB&: *BB, I: &MI, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::COPY), DestReg: AMDGPU::M0)
3442 .addReg(RegNo: IdxReg);
3443
3444 const MCInstrDesc &RegWriteOp = TII.getIndirectRegWriteMovRelPseudo(
3445 VecSize, EltSize: ValSize, IsSGPR: VecRB->getID() == AMDGPU::SGPRRegBankID);
3446 BuildMI(BB&: *BB, I&: MI, MIMD: DL, MCID: RegWriteOp, DestReg: DstReg)
3447 .addReg(RegNo: VecReg)
3448 .addReg(RegNo: ValReg)
3449 .addImm(Val: SubReg);
3450 MI.eraseFromParent();
3451 return true;
3452 }
3453
3454 const MCInstrDesc &GPRIDXDesc =
3455 TII.getIndirectGPRIDXPseudo(VecSize: TRI.getRegSizeInBits(RC: *VecRC), IsIndirectSrc: false);
3456 BuildMI(BB&: *BB, I&: MI, MIMD: DL, MCID: GPRIDXDesc, DestReg: DstReg)
3457 .addReg(RegNo: VecReg)
3458 .addReg(RegNo: ValReg)
3459 .addReg(RegNo: IdxReg)
3460 .addImm(Val: SubReg);
3461
3462 MI.eraseFromParent();
3463 return true;
3464}
3465
3466static bool isAsyncLDSDMA(Intrinsic::ID Intr) {
3467 switch (Intr) {
3468 case Intrinsic::amdgcn_raw_buffer_load_async_lds:
3469 case Intrinsic::amdgcn_raw_ptr_buffer_load_async_lds:
3470 case Intrinsic::amdgcn_struct_buffer_load_async_lds:
3471 case Intrinsic::amdgcn_struct_ptr_buffer_load_async_lds:
3472 case Intrinsic::amdgcn_load_async_to_lds:
3473 case Intrinsic::amdgcn_global_load_async_lds:
3474 return true;
3475 }
3476 return false;
3477}
3478
3479bool AMDGPUInstructionSelector::selectBufferLoadLds(MachineInstr &MI) const {
3480 if (!Subtarget->hasVMemToLDSLoad())
3481 return false;
3482 unsigned Opc;
3483 unsigned Size = MI.getOperand(i: 3).getImm();
3484 Intrinsic::ID IntrinsicID = cast<GIntrinsic>(Val&: MI).getIntrinsicID();
3485
3486 // The struct intrinsic variants add one additional operand over raw.
3487 const bool HasVIndex = MI.getNumOperands() == 9;
3488 Register VIndex;
3489 int OpOffset = 0;
3490 if (HasVIndex) {
3491 VIndex = MI.getOperand(i: 4).getReg();
3492 OpOffset = 1;
3493 }
3494
3495 Register VOffset = MI.getOperand(i: 4 + OpOffset).getReg();
3496 std::optional<ValueAndVReg> MaybeVOffset =
3497 getIConstantVRegValWithLookThrough(VReg: VOffset, MRI: *MRI);
3498 const bool HasVOffset = !MaybeVOffset || MaybeVOffset->Value.getZExtValue();
3499
3500 switch (Size) {
3501 default:
3502 return false;
3503 case 1:
3504 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_UBYTE_LDS_BOTHEN
3505 : AMDGPU::BUFFER_LOAD_UBYTE_LDS_IDXEN
3506 : HasVOffset ? AMDGPU::BUFFER_LOAD_UBYTE_LDS_OFFEN
3507 : AMDGPU::BUFFER_LOAD_UBYTE_LDS_OFFSET;
3508 break;
3509 case 2:
3510 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_USHORT_LDS_BOTHEN
3511 : AMDGPU::BUFFER_LOAD_USHORT_LDS_IDXEN
3512 : HasVOffset ? AMDGPU::BUFFER_LOAD_USHORT_LDS_OFFEN
3513 : AMDGPU::BUFFER_LOAD_USHORT_LDS_OFFSET;
3514 break;
3515 case 4:
3516 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_DWORD_LDS_BOTHEN
3517 : AMDGPU::BUFFER_LOAD_DWORD_LDS_IDXEN
3518 : HasVOffset ? AMDGPU::BUFFER_LOAD_DWORD_LDS_OFFEN
3519 : AMDGPU::BUFFER_LOAD_DWORD_LDS_OFFSET;
3520 break;
3521 case 12:
3522 if (!Subtarget->hasLDSLoadB96_B128())
3523 return false;
3524
3525 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX3_LDS_BOTHEN
3526 : AMDGPU::BUFFER_LOAD_DWORDX3_LDS_IDXEN
3527 : HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX3_LDS_OFFEN
3528 : AMDGPU::BUFFER_LOAD_DWORDX3_LDS_OFFSET;
3529 break;
3530 case 16:
3531 if (!Subtarget->hasLDSLoadB96_B128())
3532 return false;
3533
3534 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX4_LDS_BOTHEN
3535 : AMDGPU::BUFFER_LOAD_DWORDX4_LDS_IDXEN
3536 : HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX4_LDS_OFFEN
3537 : AMDGPU::BUFFER_LOAD_DWORDX4_LDS_OFFSET;
3538 break;
3539 }
3540
3541 MachineBasicBlock *MBB = MI.getParent();
3542 const DebugLoc &DL = MI.getDebugLoc();
3543 BuildMI(BB&: *MBB, I: &MI, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::COPY), DestReg: AMDGPU::M0)
3544 .add(MO: MI.getOperand(i: 2));
3545
3546 auto MIB = BuildMI(BB&: *MBB, I: &MI, MIMD: DL, MCID: TII.get(Opcode: Opc));
3547
3548 if (HasVIndex && HasVOffset) {
3549 Register IdxReg = MRI->createVirtualRegister(RegClass: TRI.getVGPR64Class());
3550 BuildMI(BB&: *MBB, I: &*MIB, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::REG_SEQUENCE), DestReg: IdxReg)
3551 .addReg(RegNo: VIndex)
3552 .addImm(Val: AMDGPU::sub0)
3553 .addReg(RegNo: VOffset)
3554 .addImm(Val: AMDGPU::sub1);
3555
3556 MIB.addReg(RegNo: IdxReg);
3557 } else if (HasVIndex) {
3558 MIB.addReg(RegNo: VIndex);
3559 } else if (HasVOffset) {
3560 MIB.addReg(RegNo: VOffset);
3561 }
3562
3563 MIB.add(MO: MI.getOperand(i: 1)); // rsrc
3564 MIB.add(MO: MI.getOperand(i: 5 + OpOffset)); // soffset
3565 MIB.add(MO: MI.getOperand(i: 6 + OpOffset)); // imm offset
3566 bool IsGFX12Plus = AMDGPU::isGFX12Plus(STI);
3567 unsigned Aux = MI.getOperand(i: 7 + OpOffset).getImm();
3568 MIB.addImm(Val: Aux & (IsGFX12Plus ? AMDGPU::CPol::ALL
3569 : AMDGPU::CPol::ALL_pregfx12)); // cpol
3570 MIB.addImm(
3571 Val: Aux & (IsGFX12Plus ? AMDGPU::CPol::SWZ : AMDGPU::CPol::SWZ_pregfx12)
3572 ? 1
3573 : 0); // swz
3574 MIB.addImm(Val: isAsyncLDSDMA(Intr: IntrinsicID));
3575
3576 MachineMemOperand *LoadMMO = *MI.memoperands_begin();
3577 // Don't set the offset value here because the pointer points to the base of
3578 // the buffer.
3579 MachinePointerInfo LoadPtrI = LoadMMO->getPointerInfo();
3580
3581 MachinePointerInfo StorePtrI = LoadPtrI;
3582 LoadPtrI.V = PoisonValue::get(T: PointerType::get(C&: MF->getFunction().getContext(),
3583 AddressSpace: AMDGPUAS::BUFFER_RESOURCE));
3584 LoadPtrI.AddrSpace = AMDGPUAS::BUFFER_RESOURCE;
3585 StorePtrI.AddrSpace = AMDGPUAS::LOCAL_ADDRESS;
3586
3587 auto F = LoadMMO->getFlags() &
3588 ~(MachineMemOperand::MOStore | MachineMemOperand::MOLoad);
3589 LoadMMO = MF->getMachineMemOperand(PtrInfo: LoadPtrI, F: F | MachineMemOperand::MOLoad,
3590 Size, BaseAlignment: LoadMMO->getBaseAlign());
3591
3592 MachineMemOperand *StoreMMO =
3593 MF->getMachineMemOperand(PtrInfo: StorePtrI, F: F | MachineMemOperand::MOStore,
3594 Size: sizeof(int32_t), BaseAlignment: LoadMMO->getBaseAlign());
3595
3596 MIB.setMemRefs({LoadMMO, StoreMMO});
3597
3598 MI.eraseFromParent();
3599 constrainSelectedInstRegOperands(I&: *MIB, TII, TRI, RBI);
3600 return true;
3601}
3602
3603/// Match a zero extend from a 32-bit value to 64-bits.
3604Register AMDGPUInstructionSelector::matchZeroExtendFromS32(Register Reg) const {
3605 Register ZExtSrc;
3606 if (mi_match(R: Reg, MRI: *MRI, P: m_GZExt(Src: m_Reg(R&: ZExtSrc))))
3607 return MRI->getType(Reg: ZExtSrc) == LLT::scalar(SizeInBits: 32) ? ZExtSrc : Register();
3608
3609 // Match legalized form %zext = G_MERGE_VALUES (s32 %x), (s32 0)
3610 const MachineInstr *Def = getDefIgnoringCopies(Reg, MRI: *MRI);
3611 if (Def->getOpcode() != AMDGPU::G_MERGE_VALUES)
3612 return Register();
3613
3614 assert(Def->getNumOperands() == 3 &&
3615 MRI->getType(Def->getOperand(0).getReg()) == LLT::scalar(64));
3616 if (mi_match(R: Def->getOperand(i: 2).getReg(), MRI: *MRI, P: m_ZeroInt())) {
3617 return Def->getOperand(i: 1).getReg();
3618 }
3619
3620 return Register();
3621}
3622
3623/// Match a sign extend from a 32-bit value to 64-bits.
3624Register AMDGPUInstructionSelector::matchSignExtendFromS32(Register Reg) const {
3625 Register SExtSrc;
3626 if (mi_match(R: Reg, MRI: *MRI, P: m_GSExt(Src: m_Reg(R&: SExtSrc))))
3627 return MRI->getType(Reg: SExtSrc) == LLT::scalar(SizeInBits: 32) ? SExtSrc : Register();
3628
3629 // Match legalized form %sext = G_MERGE_VALUES (s32 %x), G_ASHR((S32 %x, 31))
3630 const MachineInstr *Def = getDefIgnoringCopies(Reg, MRI: *MRI);
3631 if (Def->getOpcode() != AMDGPU::G_MERGE_VALUES)
3632 return Register();
3633
3634 assert(Def->getNumOperands() == 3 &&
3635 MRI->getType(Def->getOperand(0).getReg()) == LLT::scalar(64));
3636 if (mi_match(R: Def->getOperand(i: 2).getReg(), MRI: *MRI,
3637 P: m_GAShr(L: m_SpecificReg(RequestedReg: Def->getOperand(i: 1).getReg()),
3638 R: m_SpecificICst(RequestedValue: 31))))
3639 return Def->getOperand(i: 1).getReg();
3640
3641 if (VT->signBitIsZero(Op: Reg))
3642 return matchZeroExtendFromS32(Reg);
3643
3644 return Register();
3645}
3646
3647/// Match a zero extend from a 32-bit value to 64-bits, or \p Reg itself if it
3648/// is 32-bit.
3649Register
3650AMDGPUInstructionSelector::matchZeroExtendFromS32OrS32(Register Reg) const {
3651 return MRI->getType(Reg) == LLT::scalar(SizeInBits: 32) ? Reg
3652 : matchZeroExtendFromS32(Reg);
3653}
3654
3655/// Match a sign extend from a 32-bit value to 64-bits, or \p Reg itself if it
3656/// is 32-bit.
3657Register
3658AMDGPUInstructionSelector::matchSignExtendFromS32OrS32(Register Reg) const {
3659 return MRI->getType(Reg) == LLT::scalar(SizeInBits: 32) ? Reg
3660 : matchSignExtendFromS32(Reg);
3661}
3662
3663Register
3664AMDGPUInstructionSelector::matchExtendFromS32OrS32(Register Reg,
3665 bool IsSigned) const {
3666 if (IsSigned)
3667 return matchSignExtendFromS32OrS32(Reg);
3668
3669 return matchZeroExtendFromS32OrS32(Reg);
3670}
3671
3672Register AMDGPUInstructionSelector::matchAnyExtendFromS32(Register Reg) const {
3673 Register AnyExtSrc;
3674 if (mi_match(R: Reg, MRI: *MRI, P: m_GAnyExt(Src: m_Reg(R&: AnyExtSrc))))
3675 return MRI->getType(Reg: AnyExtSrc) == LLT::scalar(SizeInBits: 32) ? AnyExtSrc : Register();
3676
3677 // Match legalized form %zext = G_MERGE_VALUES (s32 %x), (s32 G_IMPLICIT_DEF)
3678 const MachineInstr *Def = getDefIgnoringCopies(Reg, MRI: *MRI);
3679 if (Def->getOpcode() != AMDGPU::G_MERGE_VALUES)
3680 return Register();
3681
3682 assert(Def->getNumOperands() == 3 &&
3683 MRI->getType(Def->getOperand(0).getReg()) == LLT::scalar(64));
3684
3685 if (mi_match(R: Def->getOperand(i: 2).getReg(), MRI: *MRI, P: m_GImplicitDef()))
3686 return Def->getOperand(i: 1).getReg();
3687
3688 return Register();
3689}
3690
3691bool AMDGPUInstructionSelector::selectGlobalLoadLds(MachineInstr &MI) const{
3692 if (!Subtarget->hasVMemToLDSLoad())
3693 return false;
3694
3695 unsigned Opc;
3696 unsigned Size = MI.getOperand(i: 3).getImm();
3697 Intrinsic::ID IntrinsicID = cast<GIntrinsic>(Val&: MI).getIntrinsicID();
3698
3699 switch (Size) {
3700 default:
3701 return false;
3702 case 1:
3703 Opc = AMDGPU::GLOBAL_LOAD_LDS_UBYTE;
3704 break;
3705 case 2:
3706 Opc = AMDGPU::GLOBAL_LOAD_LDS_USHORT;
3707 break;
3708 case 4:
3709 Opc = AMDGPU::GLOBAL_LOAD_LDS_DWORD;
3710 break;
3711 case 12:
3712 if (!Subtarget->hasLDSLoadB96_B128())
3713 return false;
3714 Opc = AMDGPU::GLOBAL_LOAD_LDS_DWORDX3;
3715 break;
3716 case 16:
3717 if (!Subtarget->hasLDSLoadB96_B128())
3718 return false;
3719 Opc = AMDGPU::GLOBAL_LOAD_LDS_DWORDX4;
3720 break;
3721 }
3722
3723 MachineBasicBlock *MBB = MI.getParent();
3724 const DebugLoc &DL = MI.getDebugLoc();
3725 BuildMI(BB&: *MBB, I: &MI, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::COPY), DestReg: AMDGPU::M0)
3726 .add(MO: MI.getOperand(i: 2));
3727
3728 Register Addr = MI.getOperand(i: 1).getReg();
3729 Register VOffset;
3730 // Try to split SAddr and VOffset. Global and LDS pointers share the same
3731 // immediate offset, so we cannot use a regular SelectGlobalSAddr().
3732 if (!isSGPR(Reg: Addr)) {
3733 auto AddrDef = getDefSrcRegIgnoringCopies(Reg: Addr, MRI: *MRI);
3734 if (isSGPR(Reg: AddrDef->Reg)) {
3735 Addr = AddrDef->Reg;
3736 } else if (AddrDef->MI->getOpcode() == AMDGPU::G_PTR_ADD) {
3737 Register SAddr =
3738 getSrcRegIgnoringCopies(Reg: AddrDef->MI->getOperand(i: 1).getReg(), MRI: *MRI);
3739 if (isSGPR(Reg: SAddr)) {
3740 Register PtrBaseOffset = AddrDef->MI->getOperand(i: 2).getReg();
3741 if (Register Off = matchZeroExtendFromS32(Reg: PtrBaseOffset)) {
3742 Addr = SAddr;
3743 VOffset = Off;
3744 }
3745 }
3746 }
3747 }
3748
3749 if (isSGPR(Reg: Addr)) {
3750 Opc = AMDGPU::getGlobalSaddrOp(Opcode: Opc);
3751 if (!VOffset) {
3752 VOffset = MRI->createVirtualRegister(RegClass: &AMDGPU::VGPR_32RegClass);
3753 BuildMI(BB&: *MBB, I: &MI, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::V_MOV_B32_e32), DestReg: VOffset)
3754 .addImm(Val: 0);
3755 }
3756 }
3757
3758 auto MIB = BuildMI(BB&: *MBB, I: &MI, MIMD: DL, MCID: TII.get(Opcode: Opc))
3759 .addReg(RegNo: Addr);
3760
3761 if (isSGPR(Reg: Addr))
3762 MIB.addReg(RegNo: VOffset);
3763
3764 MIB.add(MO: MI.getOperand(i: 4)); // offset
3765
3766 unsigned Aux = MI.getOperand(i: 5).getImm();
3767 MIB.addImm(Val: Aux & ~AMDGPU::CPol::VIRTUAL_BITS); // cpol
3768 MIB.addImm(Val: isAsyncLDSDMA(Intr: IntrinsicID));
3769
3770 MachineMemOperand *LoadMMO = *MI.memoperands_begin();
3771 MachinePointerInfo LoadPtrI = LoadMMO->getPointerInfo();
3772 LoadPtrI.Offset = MI.getOperand(i: 4).getImm();
3773 MachinePointerInfo StorePtrI = LoadPtrI;
3774 LoadPtrI.V = PoisonValue::get(T: PointerType::get(C&: MF->getFunction().getContext(),
3775 AddressSpace: AMDGPUAS::GLOBAL_ADDRESS));
3776 LoadPtrI.AddrSpace = AMDGPUAS::GLOBAL_ADDRESS;
3777 StorePtrI.AddrSpace = AMDGPUAS::LOCAL_ADDRESS;
3778 auto F = LoadMMO->getFlags() &
3779 ~(MachineMemOperand::MOStore | MachineMemOperand::MOLoad);
3780 LoadMMO = MF->getMachineMemOperand(PtrInfo: LoadPtrI, F: F | MachineMemOperand::MOLoad,
3781 Size, BaseAlignment: LoadMMO->getBaseAlign());
3782 MachineMemOperand *StoreMMO =
3783 MF->getMachineMemOperand(PtrInfo: StorePtrI, F: F | MachineMemOperand::MOStore,
3784 Size: sizeof(int32_t), BaseAlignment: Align(4));
3785
3786 MIB.setMemRefs({LoadMMO, StoreMMO});
3787
3788 MI.eraseFromParent();
3789 constrainSelectedInstRegOperands(I&: *MIB, TII, TRI, RBI);
3790 return true;
3791}
3792
3793bool AMDGPUInstructionSelector::selectTensorLoadStore(MachineInstr &MI,
3794 Intrinsic::ID IID) const {
3795 bool IsLoad = IID == Intrinsic::amdgcn_tensor_load_to_lds;
3796 unsigned Opc =
3797 IsLoad ? AMDGPU::TENSOR_LOAD_TO_LDS : AMDGPU::TENSOR_STORE_FROM_LDS;
3798 int NumGroups = 4;
3799
3800 // A lamda function to check whether an operand is a vector of all 0s.
3801 const auto isAllZeros = [&](MachineOperand &Opnd) {
3802 const MachineInstr *DefMI = MRI->getVRegDef(Reg: Opnd.getReg());
3803 if (!DefMI)
3804 return false;
3805 return llvm::isBuildVectorAllZeros(MI: *DefMI, MRI: *MRI, AllowUndef: true);
3806 };
3807
3808 // Use _D2 version if both group 2 and 3 are zero-initialized.
3809 if (isAllZeros(MI.getOperand(i: 3)) && isAllZeros(MI.getOperand(i: 4))) {
3810 NumGroups = 2;
3811 Opc = IsLoad ? AMDGPU::TENSOR_LOAD_TO_LDS_D2
3812 : AMDGPU::TENSOR_STORE_FROM_LDS_D2;
3813 }
3814
3815 // TODO: Handle the fifth group: MI.getOpetand(5), which is silently ignored
3816 // for now because all existing targets only support up to 4 groups.
3817 MachineBasicBlock *MBB = MI.getParent();
3818 auto MIB = BuildMI(BB&: *MBB, I: &MI, MIMD: MI.getDebugLoc(), MCID: TII.get(Opcode: Opc))
3819 .add(MO: MI.getOperand(i: 1)) // D# group 0
3820 .add(MO: MI.getOperand(i: 2)); // D# group 1
3821
3822 if (NumGroups >= 4) { // Has at least 4 groups
3823 MIB.add(MO: MI.getOperand(i: 3)) // D# group 2
3824 .add(MO: MI.getOperand(i: 4)); // D# group 3
3825 }
3826
3827 MIB.addImm(Val: 0) // r128
3828 .add(MO: MI.getOperand(i: 6)); // cpol
3829
3830 MI.eraseFromParent();
3831 return true;
3832}
3833
3834bool AMDGPUInstructionSelector::selectBVHIntersectRayIntrinsic(
3835 MachineInstr &MI) const {
3836 unsigned OpcodeOpIdx =
3837 MI.getOpcode() == AMDGPU::G_AMDGPU_BVH_INTERSECT_RAY ? 1 : 3;
3838 MI.setDesc(TII.get(Opcode: MI.getOperand(i: OpcodeOpIdx).getImm()));
3839 MI.removeOperand(OpNo: OpcodeOpIdx);
3840 MI.addImplicitDefUseOperands(MF&: *MI.getMF());
3841 constrainSelectedInstRegOperands(I&: MI, TII, TRI, RBI);
3842 return true;
3843}
3844
3845// FIXME: This should be removed and let the patterns select. We just need the
3846// AGPR/VGPR combination versions.
3847bool AMDGPUInstructionSelector::selectSMFMACIntrin(MachineInstr &MI) const {
3848 unsigned Opc;
3849 switch (cast<GIntrinsic>(Val&: MI).getIntrinsicID()) {
3850 case Intrinsic::amdgcn_smfmac_f32_16x16x32_f16:
3851 Opc = AMDGPU::V_SMFMAC_F32_16X16X32_F16_e64;
3852 break;
3853 case Intrinsic::amdgcn_smfmac_f32_32x32x16_f16:
3854 Opc = AMDGPU::V_SMFMAC_F32_32X32X16_F16_e64;
3855 break;
3856 case Intrinsic::amdgcn_smfmac_f32_16x16x32_bf16:
3857 Opc = AMDGPU::V_SMFMAC_F32_16X16X32_BF16_e64;
3858 break;
3859 case Intrinsic::amdgcn_smfmac_f32_32x32x16_bf16:
3860 Opc = AMDGPU::V_SMFMAC_F32_32X32X16_BF16_e64;
3861 break;
3862 case Intrinsic::amdgcn_smfmac_i32_16x16x64_i8:
3863 Opc = AMDGPU::V_SMFMAC_I32_16X16X64_I8_e64;
3864 break;
3865 case Intrinsic::amdgcn_smfmac_i32_32x32x32_i8:
3866 Opc = AMDGPU::V_SMFMAC_I32_32X32X32_I8_e64;
3867 break;
3868 case Intrinsic::amdgcn_smfmac_f32_16x16x64_bf8_bf8:
3869 Opc = AMDGPU::V_SMFMAC_F32_16X16X64_BF8_BF8_e64;
3870 break;
3871 case Intrinsic::amdgcn_smfmac_f32_16x16x64_bf8_fp8:
3872 Opc = AMDGPU::V_SMFMAC_F32_16X16X64_BF8_FP8_e64;
3873 break;
3874 case Intrinsic::amdgcn_smfmac_f32_16x16x64_fp8_bf8:
3875 Opc = AMDGPU::V_SMFMAC_F32_16X16X64_FP8_BF8_e64;
3876 break;
3877 case Intrinsic::amdgcn_smfmac_f32_16x16x64_fp8_fp8:
3878 Opc = AMDGPU::V_SMFMAC_F32_16X16X64_FP8_FP8_e64;
3879 break;
3880 case Intrinsic::amdgcn_smfmac_f32_32x32x32_bf8_bf8:
3881 Opc = AMDGPU::V_SMFMAC_F32_32X32X32_BF8_BF8_e64;
3882 break;
3883 case Intrinsic::amdgcn_smfmac_f32_32x32x32_bf8_fp8:
3884 Opc = AMDGPU::V_SMFMAC_F32_32X32X32_BF8_FP8_e64;
3885 break;
3886 case Intrinsic::amdgcn_smfmac_f32_32x32x32_fp8_bf8:
3887 Opc = AMDGPU::V_SMFMAC_F32_32X32X32_FP8_BF8_e64;
3888 break;
3889 case Intrinsic::amdgcn_smfmac_f32_32x32x32_fp8_fp8:
3890 Opc = AMDGPU::V_SMFMAC_F32_32X32X32_FP8_FP8_e64;
3891 break;
3892 case Intrinsic::amdgcn_smfmac_f32_16x16x64_f16:
3893 Opc = AMDGPU::V_SMFMAC_F32_16X16X64_F16_e64;
3894 break;
3895 case Intrinsic::amdgcn_smfmac_f32_32x32x32_f16:
3896 Opc = AMDGPU::V_SMFMAC_F32_32X32X32_F16_e64;
3897 break;
3898 case Intrinsic::amdgcn_smfmac_f32_16x16x64_bf16:
3899 Opc = AMDGPU::V_SMFMAC_F32_16X16X64_BF16_e64;
3900 break;
3901 case Intrinsic::amdgcn_smfmac_f32_32x32x32_bf16:
3902 Opc = AMDGPU::V_SMFMAC_F32_32X32X32_BF16_e64;
3903 break;
3904 case Intrinsic::amdgcn_smfmac_i32_16x16x128_i8:
3905 Opc = AMDGPU::V_SMFMAC_I32_16X16X128_I8_e64;
3906 break;
3907 case Intrinsic::amdgcn_smfmac_i32_32x32x64_i8:
3908 Opc = AMDGPU::V_SMFMAC_I32_32X32X64_I8_e64;
3909 break;
3910 case Intrinsic::amdgcn_smfmac_f32_16x16x128_bf8_bf8:
3911 Opc = AMDGPU::V_SMFMAC_F32_16X16X128_BF8_BF8_e64;
3912 break;
3913 case Intrinsic::amdgcn_smfmac_f32_16x16x128_bf8_fp8:
3914 Opc = AMDGPU::V_SMFMAC_F32_16X16X128_BF8_FP8_e64;
3915 break;
3916 case Intrinsic::amdgcn_smfmac_f32_16x16x128_fp8_bf8:
3917 Opc = AMDGPU::V_SMFMAC_F32_16X16X128_FP8_BF8_e64;
3918 break;
3919 case Intrinsic::amdgcn_smfmac_f32_16x16x128_fp8_fp8:
3920 Opc = AMDGPU::V_SMFMAC_F32_16X16X128_FP8_FP8_e64;
3921 break;
3922 case Intrinsic::amdgcn_smfmac_f32_32x32x64_bf8_bf8:
3923 Opc = AMDGPU::V_SMFMAC_F32_32X32X64_BF8_BF8_e64;
3924 break;
3925 case Intrinsic::amdgcn_smfmac_f32_32x32x64_bf8_fp8:
3926 Opc = AMDGPU::V_SMFMAC_F32_32X32X64_BF8_FP8_e64;
3927 break;
3928 case Intrinsic::amdgcn_smfmac_f32_32x32x64_fp8_bf8:
3929 Opc = AMDGPU::V_SMFMAC_F32_32X32X64_FP8_BF8_e64;
3930 break;
3931 case Intrinsic::amdgcn_smfmac_f32_32x32x64_fp8_fp8:
3932 Opc = AMDGPU::V_SMFMAC_F32_32X32X64_FP8_FP8_e64;
3933 break;
3934 default:
3935 llvm_unreachable("unhandled smfmac intrinsic");
3936 }
3937
3938 auto VDst_In = MI.getOperand(i: 4);
3939
3940 MI.setDesc(TII.get(Opcode: Opc));
3941 MI.removeOperand(OpNo: 4); // VDst_In
3942 MI.removeOperand(OpNo: 1); // Intrinsic ID
3943 MI.addOperand(Op: VDst_In); // Readd VDst_In to the end
3944 MI.addImplicitDefUseOperands(MF&: *MI.getMF());
3945 const MCInstrDesc &MCID = MI.getDesc();
3946 if (MCID.getOperandConstraint(OpNum: 0, Constraint: MCOI::EARLY_CLOBBER) != -1) {
3947 MI.getOperand(i: 0).setIsEarlyClobber(true);
3948 }
3949 return true;
3950}
3951
3952bool AMDGPUInstructionSelector::selectPermlaneSwapIntrin(
3953 MachineInstr &MI, Intrinsic::ID IntrID) const {
3954 if (IntrID == Intrinsic::amdgcn_permlane16_swap &&
3955 !Subtarget->hasPermlane16Swap())
3956 return false;
3957 if (IntrID == Intrinsic::amdgcn_permlane32_swap &&
3958 !Subtarget->hasPermlane32Swap())
3959 return false;
3960
3961 unsigned Opcode = IntrID == Intrinsic::amdgcn_permlane16_swap
3962 ? AMDGPU::V_PERMLANE16_SWAP_B32_e64
3963 : AMDGPU::V_PERMLANE32_SWAP_B32_e64;
3964
3965 MI.removeOperand(OpNo: 2);
3966 MI.setDesc(TII.get(Opcode));
3967 MI.addOperand(MF&: *MF, Op: MachineOperand::CreateReg(Reg: AMDGPU::EXEC, isDef: false, isImp: true));
3968
3969 MachineOperand &FI = MI.getOperand(i: 4);
3970 FI.setImm(FI.getImm() ? AMDGPU::DPP::DPP_FI_1 : AMDGPU::DPP::DPP_FI_0);
3971
3972 constrainSelectedInstRegOperands(I&: MI, TII, TRI, RBI);
3973 return true;
3974}
3975
3976bool AMDGPUInstructionSelector::selectWaveAddress(MachineInstr &MI) const {
3977 Register DstReg = MI.getOperand(i: 0).getReg();
3978 Register SrcReg = MI.getOperand(i: 1).getReg();
3979 const RegisterBank *DstRB = RBI.getRegBank(Reg: DstReg, MRI: *MRI, TRI);
3980 const bool IsVALU = DstRB->getID() == AMDGPU::VGPRRegBankID;
3981 MachineBasicBlock *MBB = MI.getParent();
3982 const DebugLoc &DL = MI.getDebugLoc();
3983
3984 if (IsVALU) {
3985 BuildMI(BB&: *MBB, I&: MI, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::V_LSHRREV_B32_e64), DestReg: DstReg)
3986 .addImm(Val: Subtarget->getWavefrontSizeLog2())
3987 .addReg(RegNo: SrcReg);
3988 } else {
3989 BuildMI(BB&: *MBB, I&: MI, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::S_LSHR_B32), DestReg: DstReg)
3990 .addReg(RegNo: SrcReg)
3991 .addImm(Val: Subtarget->getWavefrontSizeLog2())
3992 .setOperandDead(3); // Dead scc
3993 }
3994
3995 const TargetRegisterClass &RC =
3996 IsVALU ? AMDGPU::VGPR_32RegClass : AMDGPU::SReg_32RegClass;
3997 if (!RBI.constrainGenericRegister(Reg: DstReg, RC, MRI&: *MRI))
3998 return false;
3999
4000 MI.eraseFromParent();
4001 return true;
4002}
4003
4004bool AMDGPUInstructionSelector::selectWaveShuffleIntrin(
4005 MachineInstr &MI) const {
4006 assert(MI.getNumOperands() == 4);
4007 MachineBasicBlock *MBB = MI.getParent();
4008 const DebugLoc &DL = MI.getDebugLoc();
4009
4010 Register DstReg = MI.getOperand(i: 0).getReg();
4011 Register ValReg = MI.getOperand(i: 2).getReg();
4012 Register IdxReg = MI.getOperand(i: 3).getReg();
4013
4014 const LLT DstTy = MRI->getType(Reg: DstReg);
4015 unsigned DstSize = DstTy.getSizeInBits();
4016 const RegisterBank *DstRB = RBI.getRegBank(Reg: DstReg, MRI: *MRI, TRI);
4017 const TargetRegisterClass *DstRC =
4018 TRI.getRegClassForSizeOnBank(Size: DstSize, Bank: *DstRB);
4019
4020 if (DstTy != LLT::scalar(SizeInBits: 32))
4021 return false;
4022
4023 if (!Subtarget->supportsBPermute())
4024 return false;
4025
4026 // If we can bpermute across the whole wave, then just do that
4027 if (Subtarget->supportsWaveWideBPermute()) {
4028 Register ShiftIdxReg = MRI->createVirtualRegister(RegClass: DstRC);
4029 BuildMI(BB&: *MBB, I&: MI, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::V_LSHLREV_B32_e64), DestReg: ShiftIdxReg)
4030 .addImm(Val: 2)
4031 .addReg(RegNo: IdxReg);
4032
4033 BuildMI(BB&: *MBB, I&: MI, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::DS_BPERMUTE_B32), DestReg: DstReg)
4034 .addReg(RegNo: ShiftIdxReg)
4035 .addReg(RegNo: ValReg)
4036 .addImm(Val: 0);
4037 } else {
4038 // Otherwise, we need to make use of whole wave mode
4039 assert(Subtarget->isWave64());
4040
4041 // Set inactive lanes to poison
4042 Register UndefValReg =
4043 MRI->createVirtualRegister(RegClass: TRI.getRegClass(i: AMDGPU::SReg_32RegClassID));
4044 BuildMI(BB&: *MBB, I&: MI, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::IMPLICIT_DEF), DestReg: UndefValReg);
4045
4046 Register UndefExecReg = MRI->createVirtualRegister(
4047 RegClass: TRI.getRegClass(i: AMDGPU::SReg_64_XEXECRegClassID));
4048 BuildMI(BB&: *MBB, I&: MI, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::IMPLICIT_DEF), DestReg: UndefExecReg);
4049
4050 Register PoisonValReg = MRI->createVirtualRegister(RegClass: DstRC);
4051 BuildMI(BB&: *MBB, I&: MI, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::V_SET_INACTIVE_B32), DestReg: PoisonValReg)
4052 .addImm(Val: 0)
4053 .addReg(RegNo: ValReg)
4054 .addImm(Val: 0)
4055 .addReg(RegNo: UndefValReg)
4056 .addReg(RegNo: UndefExecReg);
4057
4058 // ds_bpermute requires index to be multiplied by 4
4059 Register ShiftIdxReg = MRI->createVirtualRegister(RegClass: DstRC);
4060 BuildMI(BB&: *MBB, I&: MI, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::V_LSHLREV_B32_e64), DestReg: ShiftIdxReg)
4061 .addImm(Val: 2)
4062 .addReg(RegNo: IdxReg);
4063
4064 Register PoisonIdxReg = MRI->createVirtualRegister(RegClass: DstRC);
4065 BuildMI(BB&: *MBB, I&: MI, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::V_SET_INACTIVE_B32), DestReg: PoisonIdxReg)
4066 .addImm(Val: 0)
4067 .addReg(RegNo: ShiftIdxReg)
4068 .addImm(Val: 0)
4069 .addReg(RegNo: UndefValReg)
4070 .addReg(RegNo: UndefExecReg);
4071
4072 // Get permutation of each half, then we'll select which one to use
4073 Register SameSidePermReg = MRI->createVirtualRegister(RegClass: DstRC);
4074 BuildMI(BB&: *MBB, I&: MI, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::DS_BPERMUTE_B32), DestReg: SameSidePermReg)
4075 .addReg(RegNo: PoisonIdxReg)
4076 .addReg(RegNo: PoisonValReg)
4077 .addImm(Val: 0);
4078
4079 Register SwappedValReg = MRI->createVirtualRegister(RegClass: DstRC);
4080 BuildMI(BB&: *MBB, I&: MI, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::V_PERMLANE64_B32), DestReg: SwappedValReg)
4081 .addReg(RegNo: PoisonValReg);
4082
4083 Register OppSidePermReg = MRI->createVirtualRegister(RegClass: DstRC);
4084 BuildMI(BB&: *MBB, I&: MI, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::DS_BPERMUTE_B32), DestReg: OppSidePermReg)
4085 .addReg(RegNo: PoisonIdxReg)
4086 .addReg(RegNo: SwappedValReg)
4087 .addImm(Val: 0);
4088
4089 Register WWMSwapPermReg = MRI->createVirtualRegister(RegClass: DstRC);
4090 BuildMI(BB&: *MBB, I&: MI, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::STRICT_WWM), DestReg: WWMSwapPermReg)
4091 .addReg(RegNo: OppSidePermReg);
4092
4093 // Select which side to take the permute from
4094 // We can get away with only using mbcnt_lo here since we're only
4095 // trying to detect which side of 32 each lane is on, and mbcnt_lo
4096 // returns 32 for lanes 32-63.
4097 Register ThreadIDReg = MRI->createVirtualRegister(RegClass: DstRC);
4098 BuildMI(BB&: *MBB, I&: MI, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::V_MBCNT_LO_U32_B32_e64), DestReg: ThreadIDReg)
4099 .addImm(Val: -1)
4100 .addImm(Val: 0);
4101
4102 Register XORReg = MRI->createVirtualRegister(RegClass: DstRC);
4103 BuildMI(BB&: *MBB, I&: MI, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::V_XOR_B32_e64), DestReg: XORReg)
4104 .addReg(RegNo: ThreadIDReg)
4105 .addReg(RegNo: PoisonIdxReg);
4106
4107 Register ANDReg = MRI->createVirtualRegister(RegClass: DstRC);
4108 BuildMI(BB&: *MBB, I&: MI, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::V_AND_B32_e64), DestReg: ANDReg)
4109 .addReg(RegNo: XORReg)
4110 .addImm(Val: 32);
4111
4112 Register CompareReg = MRI->createVirtualRegister(
4113 RegClass: TRI.getRegClass(i: AMDGPU::SReg_64_XEXECRegClassID));
4114 BuildMI(BB&: *MBB, I&: MI, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::V_CMP_EQ_U32_e64), DestReg: CompareReg)
4115 .addReg(RegNo: ANDReg)
4116 .addImm(Val: 0);
4117
4118 // Finally do the selection
4119 BuildMI(BB&: *MBB, I&: MI, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::V_CNDMASK_B32_e64), DestReg: DstReg)
4120 .addImm(Val: 0)
4121 .addReg(RegNo: WWMSwapPermReg)
4122 .addImm(Val: 0)
4123 .addReg(RegNo: SameSidePermReg)
4124 .addReg(RegNo: CompareReg);
4125 }
4126
4127 MI.eraseFromParent();
4128 return true;
4129}
4130
4131// Match BITOP3 operation and return a number of matched instructions plus
4132// truth table.
4133static std::pair<unsigned, uint8_t> BitOp3_Op(Register R,
4134 SmallVectorImpl<Register> &Src,
4135 const MachineRegisterInfo &MRI) {
4136 unsigned NumOpcodes = 0;
4137 uint8_t LHSBits, RHSBits;
4138
4139 auto getOperandBits = [&Src, R, &MRI](Register Op, uint8_t &Bits) -> bool {
4140 // Define truth table given Src0, Src1, Src2 bits permutations:
4141 // 0 0 0
4142 // 0 0 1
4143 // 0 1 0
4144 // 0 1 1
4145 // 1 0 0
4146 // 1 0 1
4147 // 1 1 0
4148 // 1 1 1
4149 const uint8_t SrcBits[3] = { 0xf0, 0xcc, 0xaa };
4150
4151 if (mi_match(R: Op, MRI, P: m_AllOnesInt())) {
4152 Bits = 0xff;
4153 return true;
4154 }
4155 if (mi_match(R: Op, MRI, P: m_ZeroInt())) {
4156 Bits = 0;
4157 return true;
4158 }
4159
4160 for (unsigned I = 0; I < Src.size(); ++I) {
4161 // Try to find existing reused operand
4162 if (Src[I] == Op) {
4163 Bits = SrcBits[I];
4164 return true;
4165 }
4166 // Try to replace parent operator
4167 if (Src[I] == R) {
4168 Bits = SrcBits[I];
4169 Src[I] = Op;
4170 return true;
4171 }
4172 }
4173
4174 if (Src.size() == 3) {
4175 // No room left for operands. Try one last time, there can be a 'not' of
4176 // one of our source operands. In this case we can compute the bits
4177 // without growing Src vector.
4178 Register LHS;
4179 if (mi_match(R: Op, MRI, P: m_Not(Src: m_Reg(R&: LHS)))) {
4180 LHS = getSrcRegIgnoringCopies(Reg: LHS, MRI);
4181 for (unsigned I = 0; I < Src.size(); ++I) {
4182 if (Src[I] == LHS) {
4183 Bits = ~SrcBits[I];
4184 return true;
4185 }
4186 }
4187 }
4188
4189 return false;
4190 }
4191
4192 Bits = SrcBits[Src.size()];
4193 Src.push_back(Elt: Op);
4194 return true;
4195 };
4196
4197 MachineInstr *MI = MRI.getVRegDef(Reg: R);
4198 switch (MI->getOpcode()) {
4199 case TargetOpcode::G_AND:
4200 case TargetOpcode::G_OR:
4201 case TargetOpcode::G_XOR: {
4202 Register LHS = getSrcRegIgnoringCopies(Reg: MI->getOperand(i: 1).getReg(), MRI);
4203 Register RHS = getSrcRegIgnoringCopies(Reg: MI->getOperand(i: 2).getReg(), MRI);
4204
4205 SmallVector<Register, 3> Backup(Src.begin(), Src.end());
4206 if (!getOperandBits(LHS, LHSBits) ||
4207 !getOperandBits(RHS, RHSBits)) {
4208 Src = std::move(Backup);
4209 return std::make_pair(x: 0, y: 0);
4210 }
4211
4212 // Recursion is naturally limited by the size of the operand vector.
4213 auto Op = BitOp3_Op(R: LHS, Src, MRI);
4214 if (Op.first) {
4215 NumOpcodes += Op.first;
4216 LHSBits = Op.second;
4217 }
4218
4219 Op = BitOp3_Op(R: RHS, Src, MRI);
4220 if (Op.first) {
4221 NumOpcodes += Op.first;
4222 RHSBits = Op.second;
4223 }
4224 break;
4225 }
4226 default:
4227 return std::make_pair(x: 0, y: 0);
4228 }
4229
4230 uint8_t TTbl;
4231 switch (MI->getOpcode()) {
4232 case TargetOpcode::G_AND:
4233 TTbl = LHSBits & RHSBits;
4234 break;
4235 case TargetOpcode::G_OR:
4236 TTbl = LHSBits | RHSBits;
4237 break;
4238 case TargetOpcode::G_XOR:
4239 TTbl = LHSBits ^ RHSBits;
4240 break;
4241 default:
4242 break;
4243 }
4244
4245 return std::make_pair(x: NumOpcodes + 1, y&: TTbl);
4246}
4247
4248bool AMDGPUInstructionSelector::selectBITOP3(MachineInstr &MI) const {
4249 if (!Subtarget->hasBitOp3Insts())
4250 return false;
4251
4252 Register DstReg = MI.getOperand(i: 0).getReg();
4253 const RegisterBank *DstRB = RBI.getRegBank(Reg: DstReg, MRI: *MRI, TRI);
4254 const bool IsVALU = DstRB->getID() == AMDGPU::VGPRRegBankID;
4255 if (!IsVALU)
4256 return false;
4257
4258 SmallVector<Register, 3> Src;
4259 uint8_t TTbl;
4260 unsigned NumOpcodes;
4261
4262 std::tie(args&: NumOpcodes, args&: TTbl) = BitOp3_Op(R: DstReg, Src, MRI: *MRI);
4263
4264 // Src.empty() case can happen if all operands are all zero or all ones.
4265 // Normally it shall be optimized out before reaching this.
4266 if (NumOpcodes < 2 || Src.empty())
4267 return false;
4268
4269 const bool IsB32 = MRI->getType(Reg: DstReg) == LLT::scalar(SizeInBits: 32);
4270 if (NumOpcodes == 2 && IsB32) {
4271 // Avoid using BITOP3 for OR3, XOR3, AND_OR. This is not faster but makes
4272 // asm more readable. This cannot be modeled with AddedComplexity because
4273 // selector does not know how many operations did we match.
4274 if (mi_match(MI, MRI: *MRI, P: m_GXor(L: m_GXor(L: m_Reg(), R: m_Reg()), R: m_Reg())) ||
4275 mi_match(MI, MRI: *MRI, P: m_GOr(L: m_GOr(L: m_Reg(), R: m_Reg()), R: m_Reg())) ||
4276 mi_match(MI, MRI: *MRI, P: m_GOr(L: m_GAnd(L: m_Reg(), R: m_Reg()), R: m_Reg())))
4277 return false;
4278 } else if (NumOpcodes < 4) {
4279 // For a uniform case threshold should be higher to account for moves
4280 // between VGPRs and SGPRs. It needs one operand in a VGPR, rest two can be
4281 // in SGPRs and a readtfirstlane after.
4282 return false;
4283 }
4284
4285 unsigned Opc = IsB32 ? AMDGPU::V_BITOP3_B32_e64 : AMDGPU::V_BITOP3_B16_e64;
4286 if (!IsB32 && STI.hasTrue16BitInsts())
4287 Opc = STI.useRealTrue16Insts() ? AMDGPU::V_BITOP3_B16_gfx1250_t16_e64
4288 : AMDGPU::V_BITOP3_B16_gfx1250_fake16_e64;
4289 unsigned CBL = STI.getConstantBusLimit(Opcode: Opc);
4290 MachineBasicBlock *MBB = MI.getParent();
4291 const DebugLoc &DL = MI.getDebugLoc();
4292
4293 for (unsigned I = 0; I < Src.size(); ++I) {
4294 const RegisterBank *RB = RBI.getRegBank(Reg: Src[I], MRI: *MRI, TRI);
4295 if (RB->getID() != AMDGPU::SGPRRegBankID)
4296 continue;
4297 if (CBL > 0) {
4298 --CBL;
4299 continue;
4300 }
4301 Register NewReg = MRI->createVirtualRegister(RegClass: &AMDGPU::VGPR_32RegClass);
4302 BuildMI(BB&: *MBB, I&: MI, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::COPY), DestReg: NewReg)
4303 .addReg(RegNo: Src[I]);
4304 Src[I] = NewReg;
4305 }
4306
4307 // Last operand can be ignored, turning a ternary operation into a binary.
4308 // For example: (~a & b & c) | (~a & b & ~c) -> (~a & b). We can replace
4309 // 'c' with 'a' here without changing the answer. In some pathological
4310 // cases it should be possible to get an operation with a single operand
4311 // too if optimizer would not catch it.
4312 while (Src.size() < 3)
4313 Src.push_back(Elt: Src[0]);
4314
4315 auto MIB = BuildMI(BB&: *MBB, I&: MI, MIMD: DL, MCID: TII.get(Opcode: Opc), DestReg: DstReg);
4316 if (!IsB32)
4317 MIB.addImm(Val: 0); // src_mod0
4318 MIB.addReg(RegNo: Src[0]);
4319 if (!IsB32)
4320 MIB.addImm(Val: 0); // src_mod1
4321 MIB.addReg(RegNo: Src[1]);
4322 if (!IsB32)
4323 MIB.addImm(Val: 0); // src_mod2
4324 MIB.addReg(RegNo: Src[2])
4325 .addImm(Val: TTbl);
4326 if (!IsB32)
4327 MIB.addImm(Val: 0); // op_sel
4328
4329 constrainSelectedInstRegOperands(I&: *MIB, TII, TRI, RBI);
4330 MI.eraseFromParent();
4331
4332 return true;
4333}
4334
4335bool AMDGPUInstructionSelector::selectStackRestore(MachineInstr &MI) const {
4336 Register SrcReg = MI.getOperand(i: 0).getReg();
4337 if (!RBI.constrainGenericRegister(Reg: SrcReg, RC: AMDGPU::SReg_32RegClass, MRI&: *MRI))
4338 return false;
4339
4340 MachineInstr *DefMI = MRI->getVRegDef(Reg: SrcReg);
4341 Register SP =
4342 Subtarget->getTargetLowering()->getStackPointerRegisterToSaveRestore();
4343 Register WaveAddr = getWaveAddress(Def: DefMI);
4344 MachineBasicBlock *MBB = MI.getParent();
4345 const DebugLoc &DL = MI.getDebugLoc();
4346
4347 if (!WaveAddr) {
4348 WaveAddr = MRI->createVirtualRegister(RegClass: &AMDGPU::SReg_32RegClass);
4349 BuildMI(BB&: *MBB, I&: MI, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::S_LSHR_B32), DestReg: WaveAddr)
4350 .addReg(RegNo: SrcReg)
4351 .addImm(Val: Subtarget->getWavefrontSizeLog2())
4352 .setOperandDead(3); // Dead scc
4353 }
4354
4355 BuildMI(BB&: *MBB, I: &MI, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::COPY), DestReg: SP)
4356 .addReg(RegNo: WaveAddr);
4357
4358 MI.eraseFromParent();
4359 return true;
4360}
4361
4362bool AMDGPUInstructionSelector::select(MachineInstr &I) {
4363
4364 if (!I.isPreISelOpcode()) {
4365 if (I.isCopy())
4366 return selectCOPY(I);
4367 return true;
4368 }
4369
4370 switch (I.getOpcode()) {
4371 case TargetOpcode::G_AND:
4372 case TargetOpcode::G_OR:
4373 case TargetOpcode::G_XOR:
4374 if (selectBITOP3(MI&: I))
4375 return true;
4376 if (selectImpl(I, CoverageInfo&: *CoverageInfo))
4377 return true;
4378 return selectG_AND_OR_XOR(I);
4379 case TargetOpcode::G_ADD:
4380 case TargetOpcode::G_SUB:
4381 case TargetOpcode::G_PTR_ADD:
4382 if (selectImpl(I, CoverageInfo&: *CoverageInfo))
4383 return true;
4384 return selectG_ADD_SUB(I);
4385 case TargetOpcode::G_UADDO:
4386 case TargetOpcode::G_USUBO:
4387 case TargetOpcode::G_UADDE:
4388 case TargetOpcode::G_USUBE:
4389 return selectG_UADDO_USUBO_UADDE_USUBE(I);
4390 case AMDGPU::G_AMDGPU_MAD_U64_U32:
4391 case AMDGPU::G_AMDGPU_MAD_I64_I32:
4392 return selectG_AMDGPU_MAD_64_32(I);
4393 case TargetOpcode::G_INTTOPTR:
4394 case TargetOpcode::G_BITCAST:
4395 case TargetOpcode::G_PTRTOINT:
4396 case TargetOpcode::G_FREEZE:
4397 return selectCOPY(I);
4398 case TargetOpcode::G_FNEG:
4399 if (selectImpl(I, CoverageInfo&: *CoverageInfo))
4400 return true;
4401 return selectG_FNEG(MI&: I);
4402 case TargetOpcode::G_FABS:
4403 if (selectImpl(I, CoverageInfo&: *CoverageInfo))
4404 return true;
4405 return selectG_FABS(MI&: I);
4406 case TargetOpcode::G_EXTRACT:
4407 return selectG_EXTRACT(I);
4408 case TargetOpcode::G_MERGE_VALUES:
4409 case TargetOpcode::G_CONCAT_VECTORS:
4410 return selectG_MERGE_VALUES(MI&: I);
4411 case TargetOpcode::G_UNMERGE_VALUES:
4412 return selectG_UNMERGE_VALUES(MI&: I);
4413 case TargetOpcode::G_BUILD_VECTOR:
4414 case TargetOpcode::G_BUILD_VECTOR_TRUNC:
4415 return selectG_BUILD_VECTOR(MI&: I);
4416 case TargetOpcode::G_IMPLICIT_DEF:
4417 return selectG_IMPLICIT_DEF(I);
4418 case TargetOpcode::G_INSERT:
4419 return selectG_INSERT(I);
4420 case TargetOpcode::G_INTRINSIC:
4421 case TargetOpcode::G_INTRINSIC_CONVERGENT:
4422 return selectG_INTRINSIC(I);
4423 case TargetOpcode::G_INTRINSIC_W_SIDE_EFFECTS:
4424 case TargetOpcode::G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS:
4425 return selectG_INTRINSIC_W_SIDE_EFFECTS(I);
4426 case TargetOpcode::G_ICMP:
4427 case TargetOpcode::G_FCMP:
4428 if (selectG_ICMP_or_FCMP(I))
4429 return true;
4430 return selectImpl(I, CoverageInfo&: *CoverageInfo);
4431 case TargetOpcode::G_LOAD:
4432 case TargetOpcode::G_ZEXTLOAD:
4433 case TargetOpcode::G_SEXTLOAD:
4434 case TargetOpcode::G_STORE:
4435 case TargetOpcode::G_ATOMIC_CMPXCHG:
4436 case TargetOpcode::G_ATOMICRMW_XCHG:
4437 case TargetOpcode::G_ATOMICRMW_ADD:
4438 case TargetOpcode::G_ATOMICRMW_SUB:
4439 case TargetOpcode::G_ATOMICRMW_AND:
4440 case TargetOpcode::G_ATOMICRMW_OR:
4441 case TargetOpcode::G_ATOMICRMW_XOR:
4442 case TargetOpcode::G_ATOMICRMW_MIN:
4443 case TargetOpcode::G_ATOMICRMW_MAX:
4444 case TargetOpcode::G_ATOMICRMW_UMIN:
4445 case TargetOpcode::G_ATOMICRMW_UMAX:
4446 case TargetOpcode::G_ATOMICRMW_UINC_WRAP:
4447 case TargetOpcode::G_ATOMICRMW_UDEC_WRAP:
4448 case TargetOpcode::G_ATOMICRMW_USUB_COND:
4449 case TargetOpcode::G_ATOMICRMW_USUB_SAT:
4450 case TargetOpcode::G_ATOMICRMW_FADD:
4451 case TargetOpcode::G_ATOMICRMW_FMIN:
4452 case TargetOpcode::G_ATOMICRMW_FMAX:
4453 return selectG_LOAD_STORE_ATOMICRMW(I);
4454 case TargetOpcode::G_SELECT:
4455 return selectG_SELECT(I);
4456 case TargetOpcode::G_TRUNC:
4457 return selectG_TRUNC(I);
4458 case TargetOpcode::G_SEXT:
4459 case TargetOpcode::G_ZEXT:
4460 case TargetOpcode::G_ANYEXT:
4461 case TargetOpcode::G_SEXT_INREG:
4462 // This is a workaround. For extension from type i1, `selectImpl()` uses
4463 // patterns from TD file and generates an illegal VGPR to SGPR COPY as type
4464 // i1 can only be hold in a SGPR class.
4465 if (MRI->getType(Reg: I.getOperand(i: 1).getReg()) != LLT::scalar(SizeInBits: 1) &&
4466 selectImpl(I, CoverageInfo&: *CoverageInfo))
4467 return true;
4468 return selectG_SZA_EXT(I);
4469 case TargetOpcode::G_FPEXT:
4470 if (selectG_FPEXT(I))
4471 return true;
4472 return selectImpl(I, CoverageInfo&: *CoverageInfo);
4473 case TargetOpcode::G_BRCOND:
4474 return selectG_BRCOND(I);
4475 case TargetOpcode::G_GLOBAL_VALUE:
4476 return selectG_GLOBAL_VALUE(I);
4477 case TargetOpcode::G_PTRMASK:
4478 return selectG_PTRMASK(I);
4479 case TargetOpcode::G_EXTRACT_VECTOR_ELT:
4480 return selectG_EXTRACT_VECTOR_ELT(MI&: I);
4481 case TargetOpcode::G_INSERT_VECTOR_ELT:
4482 return selectG_INSERT_VECTOR_ELT(MI&: I);
4483 case AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD:
4484 case AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD_D16:
4485 case AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD_NORET:
4486 case AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE:
4487 case AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE_D16: {
4488 const AMDGPU::ImageDimIntrinsicInfo *Intr =
4489 AMDGPU::getImageDimIntrinsicInfo(Intr: AMDGPU::getIntrinsicID(I));
4490 assert(Intr && "not an image intrinsic with image pseudo");
4491 return selectImageIntrinsic(MI&: I, Intr);
4492 }
4493 case AMDGPU::G_AMDGPU_BVH_DUAL_INTERSECT_RAY:
4494 case AMDGPU::G_AMDGPU_BVH_INTERSECT_RAY:
4495 case AMDGPU::G_AMDGPU_BVH8_INTERSECT_RAY:
4496 return selectBVHIntersectRayIntrinsic(MI&: I);
4497 case AMDGPU::G_SBFX:
4498 case AMDGPU::G_UBFX:
4499 return selectG_SBFX_UBFX(MI&: I);
4500 case AMDGPU::G_SI_CALL:
4501 I.setDesc(TII.get(Opcode: AMDGPU::SI_CALL));
4502 return true;
4503 case AMDGPU::G_AMDGPU_WAVE_ADDRESS:
4504 return selectWaveAddress(MI&: I);
4505 case AMDGPU::G_AMDGPU_WHOLE_WAVE_FUNC_RETURN: {
4506 I.setDesc(TII.get(Opcode: AMDGPU::SI_WHOLE_WAVE_FUNC_RETURN));
4507 return true;
4508 }
4509 case AMDGPU::G_STACKRESTORE:
4510 return selectStackRestore(MI&: I);
4511 case AMDGPU::G_PHI:
4512 return selectPHI(I);
4513 case AMDGPU::G_AMDGPU_COPY_SCC_VCC:
4514 return selectCOPY_SCC_VCC(I);
4515 case AMDGPU::G_AMDGPU_COPY_VCC_SCC:
4516 return selectCOPY_VCC_SCC(I);
4517 case AMDGPU::G_AMDGPU_READANYLANE:
4518 return selectReadAnyLane(I);
4519 case TargetOpcode::G_CONSTANT:
4520 case TargetOpcode::G_FCONSTANT:
4521 default:
4522 return selectImpl(I, CoverageInfo&: *CoverageInfo);
4523 }
4524 return false;
4525}
4526
4527InstructionSelector::ComplexRendererFns
4528AMDGPUInstructionSelector::selectVCSRC(MachineOperand &Root) const {
4529 return {{
4530 [=](MachineInstrBuilder &MIB) { MIB.add(MO: Root); }
4531 }};
4532
4533}
4534
4535std::pair<Register, unsigned> AMDGPUInstructionSelector::selectVOP3ModsImpl(
4536 Register Src, bool IsCanonicalizing, bool AllowAbs, bool OpSel) const {
4537 unsigned Mods = 0;
4538 MachineInstr *MI = getDefIgnoringCopies(Reg: Src, MRI: *MRI);
4539
4540 if (MI->getOpcode() == AMDGPU::G_FNEG) {
4541 Src = MI->getOperand(i: 1).getReg();
4542 Mods |= SISrcMods::NEG;
4543 MI = getDefIgnoringCopies(Reg: Src, MRI: *MRI);
4544 } else if (MI->getOpcode() == AMDGPU::G_FSUB && IsCanonicalizing) {
4545 // Fold fsub [+-]0 into fneg. This may not have folded depending on the
4546 // denormal mode, but we're implicitly canonicalizing in a source operand.
4547 const ConstantFP *LHS =
4548 getConstantFPVRegVal(VReg: MI->getOperand(i: 1).getReg(), MRI: *MRI);
4549 if (LHS && LHS->isZero()) {
4550 Mods |= SISrcMods::NEG;
4551 Src = MI->getOperand(i: 2).getReg();
4552 }
4553 }
4554
4555 if (AllowAbs && MI->getOpcode() == AMDGPU::G_FABS) {
4556 Src = MI->getOperand(i: 1).getReg();
4557 Mods |= SISrcMods::ABS;
4558 }
4559
4560 if (OpSel)
4561 Mods |= SISrcMods::OP_SEL_0;
4562
4563 return std::pair(Src, Mods);
4564}
4565
4566Register AMDGPUInstructionSelector::copyToVGPRIfSrcFolded(
4567 Register Src, unsigned Mods, MachineOperand Root, MachineInstr *InsertPt,
4568 bool ForceVGPR) const {
4569 if ((Mods != 0 || ForceVGPR) &&
4570 RBI.getRegBank(Reg: Src, MRI: *MRI, TRI)->getID() != AMDGPU::VGPRRegBankID) {
4571
4572 // If we looked through copies to find source modifiers on an SGPR operand,
4573 // we now have an SGPR register source. To avoid potentially violating the
4574 // constant bus restriction, we need to insert a copy to a VGPR.
4575 Register VGPRSrc = MRI->cloneVirtualRegister(VReg: Root.getReg());
4576 BuildMI(BB&: *InsertPt->getParent(), I: InsertPt, MIMD: InsertPt->getDebugLoc(),
4577 MCID: TII.get(Opcode: AMDGPU::COPY), DestReg: VGPRSrc)
4578 .addReg(RegNo: Src);
4579 Src = VGPRSrc;
4580 }
4581
4582 return Src;
4583}
4584
4585///
4586/// This will select either an SGPR or VGPR operand and will save us from
4587/// having to write an extra tablegen pattern.
4588InstructionSelector::ComplexRendererFns
4589AMDGPUInstructionSelector::selectVSRC0(MachineOperand &Root) const {
4590 return {{
4591 [=](MachineInstrBuilder &MIB) { MIB.add(MO: Root); }
4592 }};
4593}
4594
4595InstructionSelector::ComplexRendererFns
4596AMDGPUInstructionSelector::selectVOP3Mods0(MachineOperand &Root) const {
4597 Register Src;
4598 unsigned Mods;
4599 std::tie(args&: Src, args&: Mods) = selectVOP3ModsImpl(Src: Root.getReg());
4600
4601 return {{
4602 [=](MachineInstrBuilder &MIB) {
4603 MIB.addReg(RegNo: copyToVGPRIfSrcFolded(Src, Mods, Root, InsertPt: MIB));
4604 },
4605 [=](MachineInstrBuilder &MIB) { MIB.addImm(Val: Mods); }, // src0_mods
4606 [=](MachineInstrBuilder &MIB) { MIB.addImm(Val: 0); }, // clamp
4607 [=](MachineInstrBuilder &MIB) { MIB.addImm(Val: 0); } // omod
4608 }};
4609}
4610
4611InstructionSelector::ComplexRendererFns
4612AMDGPUInstructionSelector::selectVOP3BMods0(MachineOperand &Root) const {
4613 Register Src;
4614 unsigned Mods;
4615 std::tie(args&: Src, args&: Mods) = selectVOP3ModsImpl(Src: Root.getReg(),
4616 /*IsCanonicalizing=*/true,
4617 /*AllowAbs=*/false);
4618
4619 return {{
4620 [=](MachineInstrBuilder &MIB) {
4621 MIB.addReg(RegNo: copyToVGPRIfSrcFolded(Src, Mods, Root, InsertPt: MIB));
4622 },
4623 [=](MachineInstrBuilder &MIB) { MIB.addImm(Val: Mods); }, // src0_mods
4624 [=](MachineInstrBuilder &MIB) { MIB.addImm(Val: 0); }, // clamp
4625 [=](MachineInstrBuilder &MIB) { MIB.addImm(Val: 0); } // omod
4626 }};
4627}
4628
4629InstructionSelector::ComplexRendererFns
4630AMDGPUInstructionSelector::selectVOP3OMods(MachineOperand &Root) const {
4631 return {{
4632 [=](MachineInstrBuilder &MIB) { MIB.add(MO: Root); },
4633 [=](MachineInstrBuilder &MIB) { MIB.addImm(Val: 0); }, // clamp
4634 [=](MachineInstrBuilder &MIB) { MIB.addImm(Val: 0); } // omod
4635 }};
4636}
4637
4638InstructionSelector::ComplexRendererFns
4639AMDGPUInstructionSelector::selectVOP3Mods(MachineOperand &Root) const {
4640 Register Src;
4641 unsigned Mods;
4642 std::tie(args&: Src, args&: Mods) = selectVOP3ModsImpl(Src: Root.getReg());
4643
4644 return {{
4645 [=](MachineInstrBuilder &MIB) {
4646 MIB.addReg(RegNo: copyToVGPRIfSrcFolded(Src, Mods, Root, InsertPt: MIB));
4647 },
4648 [=](MachineInstrBuilder &MIB) { MIB.addImm(Val: Mods); } // src_mods
4649 }};
4650}
4651
4652InstructionSelector::ComplexRendererFns
4653AMDGPUInstructionSelector::selectVOP3ModsNonCanonicalizing(
4654 MachineOperand &Root) const {
4655 Register Src;
4656 unsigned Mods;
4657 std::tie(args&: Src, args&: Mods) =
4658 selectVOP3ModsImpl(Src: Root.getReg(), /*IsCanonicalizing=*/false);
4659
4660 return {{
4661 [=](MachineInstrBuilder &MIB) {
4662 MIB.addReg(RegNo: copyToVGPRIfSrcFolded(Src, Mods, Root, InsertPt: MIB));
4663 },
4664 [=](MachineInstrBuilder &MIB) { MIB.addImm(Val: Mods); } // src_mods
4665 }};
4666}
4667
4668InstructionSelector::ComplexRendererFns
4669AMDGPUInstructionSelector::selectVOP3BMods(MachineOperand &Root) const {
4670 Register Src;
4671 unsigned Mods;
4672 std::tie(args&: Src, args&: Mods) =
4673 selectVOP3ModsImpl(Src: Root.getReg(), /*IsCanonicalizing=*/true,
4674 /*AllowAbs=*/false);
4675
4676 return {{
4677 [=](MachineInstrBuilder &MIB) {
4678 MIB.addReg(RegNo: copyToVGPRIfSrcFolded(Src, Mods, Root, InsertPt: MIB));
4679 },
4680 [=](MachineInstrBuilder &MIB) { MIB.addImm(Val: Mods); } // src_mods
4681 }};
4682}
4683
4684InstructionSelector::ComplexRendererFns
4685AMDGPUInstructionSelector::selectVOP3NoMods(MachineOperand &Root) const {
4686 Register Reg = Root.getReg();
4687 const MachineInstr *Def = getDefIgnoringCopies(Reg, MRI: *MRI);
4688 if (Def->getOpcode() == AMDGPU::G_FNEG || Def->getOpcode() == AMDGPU::G_FABS)
4689 return {};
4690 return {{
4691 [=](MachineInstrBuilder &MIB) { MIB.addReg(RegNo: Reg); },
4692 }};
4693}
4694
4695enum class SrcStatus {
4696 IS_SAME,
4697 IS_UPPER_HALF,
4698 IS_LOWER_HALF,
4699 IS_UPPER_HALF_NEG,
4700 // This means current op = [op_upper, op_lower] and src = -op_lower.
4701 IS_LOWER_HALF_NEG,
4702 IS_HI_NEG,
4703 // This means current op = [op_upper, op_lower] and src = [op_upper,
4704 // -op_lower].
4705 IS_LO_NEG,
4706 IS_BOTH_NEG,
4707 INVALID,
4708 NEG_START = IS_UPPER_HALF_NEG,
4709 NEG_END = IS_BOTH_NEG,
4710 HALF_START = IS_UPPER_HALF,
4711 HALF_END = IS_LOWER_HALF_NEG
4712};
4713/// Test if the MI is truncating to half, such as `%reg0:n = G_TRUNC %reg1:2n`
4714static bool isTruncHalf(const MachineInstr *MI,
4715 const MachineRegisterInfo &MRI) {
4716 if (MI->getOpcode() != AMDGPU::G_TRUNC)
4717 return false;
4718
4719 unsigned DstSize = MRI.getType(Reg: MI->getOperand(i: 0).getReg()).getSizeInBits();
4720 unsigned SrcSize = MRI.getType(Reg: MI->getOperand(i: 1).getReg()).getSizeInBits();
4721 return DstSize * 2 == SrcSize;
4722}
4723
4724/// Test if the MI is logic shift right with half bits,
4725/// such as `%reg0:2n =G_LSHR %reg1:2n, CONST(n)`
4726static bool isLshrHalf(const MachineInstr *MI, const MachineRegisterInfo &MRI) {
4727 if (MI->getOpcode() != AMDGPU::G_LSHR)
4728 return false;
4729
4730 Register ShiftSrc;
4731 std::optional<ValueAndVReg> ShiftAmt;
4732 if (mi_match(R: MI->getOperand(i: 0).getReg(), MRI,
4733 P: m_GLShr(L: m_Reg(R&: ShiftSrc), R: m_GCst(ValReg&: ShiftAmt)))) {
4734 unsigned SrcSize = MRI.getType(Reg: MI->getOperand(i: 1).getReg()).getSizeInBits();
4735 unsigned Shift = ShiftAmt->Value.getZExtValue();
4736 return Shift * 2 == SrcSize;
4737 }
4738 return false;
4739}
4740
4741/// Test if the MI is shift left with half bits,
4742/// such as `%reg0:2n =G_SHL %reg1:2n, CONST(n)`
4743static bool isShlHalf(const MachineInstr *MI, const MachineRegisterInfo &MRI) {
4744 if (MI->getOpcode() != AMDGPU::G_SHL)
4745 return false;
4746
4747 Register ShiftSrc;
4748 std::optional<ValueAndVReg> ShiftAmt;
4749 if (mi_match(R: MI->getOperand(i: 0).getReg(), MRI,
4750 P: m_GShl(L: m_Reg(R&: ShiftSrc), R: m_GCst(ValReg&: ShiftAmt)))) {
4751 unsigned SrcSize = MRI.getType(Reg: MI->getOperand(i: 1).getReg()).getSizeInBits();
4752 unsigned Shift = ShiftAmt->Value.getZExtValue();
4753 return Shift * 2 == SrcSize;
4754 }
4755 return false;
4756}
4757
4758/// Test function, if the MI is `%reg0:n, %reg1:n = G_UNMERGE_VALUES %reg2:2n`
4759static bool isUnmergeHalf(const MachineInstr *MI,
4760 const MachineRegisterInfo &MRI) {
4761 if (MI->getOpcode() != AMDGPU::G_UNMERGE_VALUES)
4762 return false;
4763 return MI->getNumOperands() == 3 && MI->getOperand(i: 0).isDef() &&
4764 MI->getOperand(i: 1).isDef() && !MI->getOperand(i: 2).isDef();
4765}
4766
4767enum class TypeClass { VECTOR_OF_TWO, SCALAR, NONE_OF_LISTED };
4768
4769static TypeClass isVectorOfTwoOrScalar(Register Reg,
4770 const MachineRegisterInfo &MRI) {
4771 LLT OpTy = MRI.getType(Reg);
4772 if (OpTy.isScalar())
4773 return TypeClass::SCALAR;
4774 if (OpTy.isVector() && OpTy.getNumElements() == 2)
4775 return TypeClass::VECTOR_OF_TWO;
4776 return TypeClass::NONE_OF_LISTED;
4777}
4778
4779static SrcStatus getNegStatus(Register Reg, SrcStatus S,
4780 const MachineRegisterInfo &MRI) {
4781 TypeClass NegType = isVectorOfTwoOrScalar(Reg, MRI);
4782 if (NegType != TypeClass::VECTOR_OF_TWO && NegType != TypeClass::SCALAR)
4783 return SrcStatus::INVALID;
4784
4785 switch (S) {
4786 case SrcStatus::IS_SAME:
4787 if (NegType == TypeClass::VECTOR_OF_TWO) {
4788 // Vector of 2:
4789 // [SrcHi, SrcLo] = [CurrHi, CurrLo]
4790 // [CurrHi, CurrLo] = neg [OpHi, OpLo](2 x Type)
4791 // [CurrHi, CurrLo] = [-OpHi, -OpLo](2 x Type)
4792 // [SrcHi, SrcLo] = [-OpHi, -OpLo]
4793 return SrcStatus::IS_BOTH_NEG;
4794 }
4795 if (NegType == TypeClass::SCALAR) {
4796 // Scalar:
4797 // [SrcHi, SrcLo] = [CurrHi, CurrLo]
4798 // [CurrHi, CurrLo] = neg [OpHi, OpLo](Type)
4799 // [CurrHi, CurrLo] = [-OpHi, OpLo](Type)
4800 // [SrcHi, SrcLo] = [-OpHi, OpLo]
4801 return SrcStatus::IS_HI_NEG;
4802 }
4803 break;
4804 case SrcStatus::IS_HI_NEG:
4805 if (NegType == TypeClass::VECTOR_OF_TWO) {
4806 // Vector of 2:
4807 // [SrcHi, SrcLo] = [-CurrHi, CurrLo]
4808 // [CurrHi, CurrLo] = neg [OpHi, OpLo](2 x Type)
4809 // [CurrHi, CurrLo] = [-OpHi, -OpLo](2 x Type)
4810 // [SrcHi, SrcLo] = [-(-OpHi), -OpLo] = [OpHi, -OpLo]
4811 return SrcStatus::IS_LO_NEG;
4812 }
4813 if (NegType == TypeClass::SCALAR) {
4814 // Scalar:
4815 // [SrcHi, SrcLo] = [-CurrHi, CurrLo]
4816 // [CurrHi, CurrLo] = neg [OpHi, OpLo](Type)
4817 // [CurrHi, CurrLo] = [-OpHi, OpLo](Type)
4818 // [SrcHi, SrcLo] = [-(-OpHi), OpLo] = [OpHi, OpLo]
4819 return SrcStatus::IS_SAME;
4820 }
4821 break;
4822 case SrcStatus::IS_LO_NEG:
4823 if (NegType == TypeClass::VECTOR_OF_TWO) {
4824 // Vector of 2:
4825 // [SrcHi, SrcLo] = [CurrHi, -CurrLo]
4826 // [CurrHi, CurrLo] = fneg [OpHi, OpLo](2 x Type)
4827 // [CurrHi, CurrLo] = [-OpHi, -OpLo](2 x Type)
4828 // [SrcHi, SrcLo] = [-OpHi, -(-OpLo)] = [-OpHi, OpLo]
4829 return SrcStatus::IS_HI_NEG;
4830 }
4831 if (NegType == TypeClass::SCALAR) {
4832 // Scalar:
4833 // [SrcHi, SrcLo] = [CurrHi, -CurrLo]
4834 // [CurrHi, CurrLo] = fneg [OpHi, OpLo](Type)
4835 // [CurrHi, CurrLo] = [-OpHi, OpLo](Type)
4836 // [SrcHi, SrcLo] = [-OpHi, -OpLo]
4837 return SrcStatus::IS_BOTH_NEG;
4838 }
4839 break;
4840 case SrcStatus::IS_BOTH_NEG:
4841 if (NegType == TypeClass::VECTOR_OF_TWO) {
4842 // Vector of 2:
4843 // [SrcHi, SrcLo] = [-CurrHi, -CurrLo]
4844 // [CurrHi, CurrLo] = fneg [OpHi, OpLo](2 x Type)
4845 // [CurrHi, CurrLo] = [-OpHi, -OpLo](2 x Type)
4846 // [SrcHi, SrcLo] = [OpHi, OpLo]
4847 return SrcStatus::IS_SAME;
4848 }
4849 if (NegType == TypeClass::SCALAR) {
4850 // Scalar:
4851 // [SrcHi, SrcLo] = [-CurrHi, -CurrLo]
4852 // [CurrHi, CurrLo] = fneg [OpHi, OpLo](Type)
4853 // [CurrHi, CurrLo] = [-OpHi, OpLo](Type)
4854 // [SrcHi, SrcLo] = [OpHi, -OpLo]
4855 return SrcStatus::IS_LO_NEG;
4856 }
4857 break;
4858 case SrcStatus::IS_UPPER_HALF:
4859 // Vector of 2:
4860 // Src = CurrUpper
4861 // Curr = [CurrUpper, CurrLower]
4862 // [CurrUpper, CurrLower] = fneg [OpUpper, OpLower](2 x Type)
4863 // [CurrUpper, CurrLower] = [-OpUpper, -OpLower](2 x Type)
4864 // Src = -OpUpper
4865 //
4866 // Scalar:
4867 // Src = CurrUpper
4868 // Curr = [CurrUpper, CurrLower]
4869 // [CurrUpper, CurrLower] = fneg [OpUpper, OpLower](Type)
4870 // [CurrUpper, CurrLower] = [-OpUpper, OpLower](Type)
4871 // Src = -OpUpper
4872 return SrcStatus::IS_UPPER_HALF_NEG;
4873 case SrcStatus::IS_LOWER_HALF:
4874 if (NegType == TypeClass::VECTOR_OF_TWO) {
4875 // Vector of 2:
4876 // Src = CurrLower
4877 // Curr = [CurrUpper, CurrLower]
4878 // [CurrUpper, CurrLower] = fneg [OpUpper, OpLower](2 x Type)
4879 // [CurrUpper, CurrLower] = [-OpUpper, -OpLower](2 x Type)
4880 // Src = -OpLower
4881 return SrcStatus::IS_LOWER_HALF_NEG;
4882 }
4883 if (NegType == TypeClass::SCALAR) {
4884 // Scalar:
4885 // Src = CurrLower
4886 // Curr = [CurrUpper, CurrLower]
4887 // [CurrUpper, CurrLower] = fneg [OpUpper, OpLower](Type)
4888 // [CurrUpper, CurrLower] = [-OpUpper, OpLower](Type)
4889 // Src = OpLower
4890 return SrcStatus::IS_LOWER_HALF;
4891 }
4892 break;
4893 case SrcStatus::IS_UPPER_HALF_NEG:
4894 // Vector of 2:
4895 // Src = -CurrUpper
4896 // Curr = [CurrUpper, CurrLower]
4897 // [CurrUpper, CurrLower] = fneg [OpUpper, OpLower](2 x Type)
4898 // [CurrUpper, CurrLower] = [-OpUpper, -OpLower](2 x Type)
4899 // Src = -(-OpUpper) = OpUpper
4900 //
4901 // Scalar:
4902 // Src = -CurrUpper
4903 // Curr = [CurrUpper, CurrLower]
4904 // [CurrUpper, CurrLower] = fneg [OpUpper, OpLower](Type)
4905 // [CurrUpper, CurrLower] = [-OpUpper, OpLower](Type)
4906 // Src = -(-OpUpper) = OpUpper
4907 return SrcStatus::IS_UPPER_HALF;
4908 case SrcStatus::IS_LOWER_HALF_NEG:
4909 if (NegType == TypeClass::VECTOR_OF_TWO) {
4910 // Vector of 2:
4911 // Src = -CurrLower
4912 // Curr = [CurrUpper, CurrLower]
4913 // [CurrUpper, CurrLower] = fneg [OpUpper, OpLower](2 x Type)
4914 // [CurrUpper, CurrLower] = [-OpUpper, -OpLower](2 x Type)
4915 // Src = -(-OpLower) = OpLower
4916 return SrcStatus::IS_LOWER_HALF;
4917 }
4918 if (NegType == TypeClass::SCALAR) {
4919 // Scalar:
4920 // Src = -CurrLower
4921 // Curr = [CurrUpper, CurrLower]
4922 // [CurrUpper, CurrLower] = fneg [OpUpper, OpLower](Type)
4923 // [CurrUpper, CurrLower] = [-OpUpper, OpLower](Type)
4924 // Src = -OpLower
4925 return SrcStatus::IS_LOWER_HALF_NEG;
4926 }
4927 break;
4928 default:
4929 break;
4930 }
4931 llvm_unreachable("unexpected SrcStatus & NegType combination");
4932}
4933
4934static std::optional<std::pair<Register, SrcStatus>>
4935calcNextStatus(std::pair<Register, SrcStatus> Curr,
4936 const MachineRegisterInfo &MRI) {
4937 const MachineInstr *MI = MRI.getVRegDef(Reg: Curr.first);
4938
4939 unsigned Opc = MI->getOpcode();
4940
4941 // Handle general Opc cases.
4942 switch (Opc) {
4943 case AMDGPU::G_BITCAST:
4944 return std::optional<std::pair<Register, SrcStatus>>(
4945 {MI->getOperand(i: 1).getReg(), Curr.second});
4946 case AMDGPU::COPY:
4947 if (MI->getOperand(i: 1).getReg().isPhysical())
4948 return std::nullopt;
4949 return std::optional<std::pair<Register, SrcStatus>>(
4950 {MI->getOperand(i: 1).getReg(), Curr.second});
4951 case AMDGPU::G_FNEG: {
4952 SrcStatus Stat = getNegStatus(Reg: Curr.first, S: Curr.second, MRI);
4953 if (Stat == SrcStatus::INVALID)
4954 return std::nullopt;
4955 return std::optional<std::pair<Register, SrcStatus>>(
4956 {MI->getOperand(i: 1).getReg(), Stat});
4957 }
4958 default:
4959 break;
4960 }
4961
4962 // Calc next Stat from current Stat.
4963 switch (Curr.second) {
4964 case SrcStatus::IS_SAME:
4965 if (isTruncHalf(MI, MRI))
4966 return std::optional<std::pair<Register, SrcStatus>>(
4967 {MI->getOperand(i: 1).getReg(), SrcStatus::IS_LOWER_HALF});
4968 else if (isUnmergeHalf(MI, MRI)) {
4969 if (Curr.first == MI->getOperand(i: 0).getReg())
4970 return std::optional<std::pair<Register, SrcStatus>>(
4971 {MI->getOperand(i: 2).getReg(), SrcStatus::IS_LOWER_HALF});
4972 return std::optional<std::pair<Register, SrcStatus>>(
4973 {MI->getOperand(i: 2).getReg(), SrcStatus::IS_UPPER_HALF});
4974 }
4975 break;
4976 case SrcStatus::IS_HI_NEG:
4977 if (isTruncHalf(MI, MRI)) {
4978 // [SrcHi, SrcLo] = [-CurrHi, CurrLo]
4979 // [CurrHi, CurrLo] = trunc [OpUpper, OpLower] = OpLower
4980 // = [OpLowerHi, OpLowerLo]
4981 // Src = [SrcHi, SrcLo] = [-CurrHi, CurrLo]
4982 // = [-OpLowerHi, OpLowerLo]
4983 // = -OpLower
4984 return std::optional<std::pair<Register, SrcStatus>>(
4985 {MI->getOperand(i: 1).getReg(), SrcStatus::IS_LOWER_HALF_NEG});
4986 }
4987 if (isUnmergeHalf(MI, MRI)) {
4988 if (Curr.first == MI->getOperand(i: 0).getReg())
4989 return std::optional<std::pair<Register, SrcStatus>>(
4990 {MI->getOperand(i: 2).getReg(), SrcStatus::IS_LOWER_HALF_NEG});
4991 return std::optional<std::pair<Register, SrcStatus>>(
4992 {MI->getOperand(i: 2).getReg(), SrcStatus::IS_UPPER_HALF_NEG});
4993 }
4994 break;
4995 case SrcStatus::IS_UPPER_HALF:
4996 if (isShlHalf(MI, MRI))
4997 return std::optional<std::pair<Register, SrcStatus>>(
4998 {MI->getOperand(i: 1).getReg(), SrcStatus::IS_LOWER_HALF});
4999 break;
5000 case SrcStatus::IS_LOWER_HALF:
5001 if (isLshrHalf(MI, MRI))
5002 return std::optional<std::pair<Register, SrcStatus>>(
5003 {MI->getOperand(i: 1).getReg(), SrcStatus::IS_UPPER_HALF});
5004 break;
5005 case SrcStatus::IS_UPPER_HALF_NEG:
5006 if (isShlHalf(MI, MRI))
5007 return std::optional<std::pair<Register, SrcStatus>>(
5008 {MI->getOperand(i: 1).getReg(), SrcStatus::IS_LOWER_HALF_NEG});
5009 break;
5010 case SrcStatus::IS_LOWER_HALF_NEG:
5011 if (isLshrHalf(MI, MRI))
5012 return std::optional<std::pair<Register, SrcStatus>>(
5013 {MI->getOperand(i: 1).getReg(), SrcStatus::IS_UPPER_HALF_NEG});
5014 break;
5015 default:
5016 break;
5017 }
5018 return std::nullopt;
5019}
5020
5021/// This is used to control valid status that current MI supports. For example,
5022/// non floating point intrinsic such as @llvm.amdgcn.sdot2 does not support NEG
5023/// bit on VOP3P.
5024/// The class can be further extended to recognize support on SEL, NEG, ABS bit
5025/// for different MI on different arch
5026class SearchOptions {
5027private:
5028 bool HasNeg = false;
5029 // Assume all complex pattern of VOP3P have opsel.
5030 bool HasOpsel = true;
5031
5032public:
5033 SearchOptions(Register Reg, const MachineRegisterInfo &MRI) {
5034 const MachineInstr *MI = MRI.getVRegDef(Reg);
5035 unsigned Opc = MI->getOpcode();
5036
5037 if (Opc == TargetOpcode::G_INTRINSIC) {
5038 Intrinsic::ID IntrinsicID = cast<GIntrinsic>(Val: *MI).getIntrinsicID();
5039 // Only float point intrinsic has neg & neg_hi bits.
5040 if (IntrinsicID == Intrinsic::amdgcn_fdot2)
5041 HasNeg = true;
5042 } else if (TargetInstrInfo::isGenericOpcode(Opc)) {
5043 // Keep same for generic op.
5044 HasNeg = true;
5045 }
5046 }
5047 bool checkOptions(SrcStatus Stat) const {
5048 if (!HasNeg &&
5049 (Stat >= SrcStatus::NEG_START && Stat <= SrcStatus::NEG_END)) {
5050 return false;
5051 }
5052 if (!HasOpsel &&
5053 (Stat >= SrcStatus::HALF_START && Stat <= SrcStatus::HALF_END)) {
5054 return false;
5055 }
5056 return true;
5057 }
5058};
5059
5060static SmallVector<std::pair<Register, SrcStatus>>
5061getSrcStats(Register Reg, const MachineRegisterInfo &MRI, SearchOptions SO,
5062 int MaxDepth = 3) {
5063 int Depth = 0;
5064 auto Curr = calcNextStatus(Curr: {Reg, SrcStatus::IS_SAME}, MRI);
5065 SmallVector<std::pair<Register, SrcStatus>> Statlist;
5066
5067 while (Depth <= MaxDepth && Curr.has_value()) {
5068 Depth++;
5069 if (SO.checkOptions(Stat: Curr.value().second))
5070 Statlist.push_back(Elt: Curr.value());
5071 Curr = calcNextStatus(Curr: Curr.value(), MRI);
5072 }
5073
5074 return Statlist;
5075}
5076
5077static std::pair<Register, SrcStatus>
5078getLastSameOrNeg(Register Reg, const MachineRegisterInfo &MRI, SearchOptions SO,
5079 int MaxDepth = 3) {
5080 int Depth = 0;
5081 std::pair<Register, SrcStatus> LastSameOrNeg = {Reg, SrcStatus::IS_SAME};
5082 auto Curr = calcNextStatus(Curr: LastSameOrNeg, MRI);
5083
5084 while (Depth <= MaxDepth && Curr.has_value()) {
5085 Depth++;
5086 SrcStatus Stat = Curr.value().second;
5087 if (SO.checkOptions(Stat)) {
5088 if (Stat == SrcStatus::IS_SAME || Stat == SrcStatus::IS_HI_NEG ||
5089 Stat == SrcStatus::IS_LO_NEG || Stat == SrcStatus::IS_BOTH_NEG)
5090 LastSameOrNeg = Curr.value();
5091 }
5092 Curr = calcNextStatus(Curr: Curr.value(), MRI);
5093 }
5094
5095 return LastSameOrNeg;
5096}
5097
5098static bool isSameBitWidth(Register Reg1, Register Reg2,
5099 const MachineRegisterInfo &MRI) {
5100 unsigned Width1 = MRI.getType(Reg: Reg1).getSizeInBits();
5101 unsigned Width2 = MRI.getType(Reg: Reg2).getSizeInBits();
5102 return Width1 == Width2;
5103}
5104
5105static unsigned updateMods(SrcStatus HiStat, SrcStatus LoStat, unsigned Mods) {
5106 // SrcStatus::IS_LOWER_HALF remain 0.
5107 if (HiStat == SrcStatus::IS_UPPER_HALF_NEG) {
5108 Mods ^= SISrcMods::NEG_HI;
5109 Mods |= SISrcMods::OP_SEL_1;
5110 } else if (HiStat == SrcStatus::IS_UPPER_HALF)
5111 Mods |= SISrcMods::OP_SEL_1;
5112 else if (HiStat == SrcStatus::IS_LOWER_HALF_NEG)
5113 Mods ^= SISrcMods::NEG_HI;
5114 else if (HiStat == SrcStatus::IS_HI_NEG)
5115 Mods ^= SISrcMods::NEG_HI;
5116
5117 if (LoStat == SrcStatus::IS_UPPER_HALF_NEG) {
5118 Mods ^= SISrcMods::NEG;
5119 Mods |= SISrcMods::OP_SEL_0;
5120 } else if (LoStat == SrcStatus::IS_UPPER_HALF)
5121 Mods |= SISrcMods::OP_SEL_0;
5122 else if (LoStat == SrcStatus::IS_LOWER_HALF_NEG)
5123 Mods |= SISrcMods::NEG;
5124 else if (LoStat == SrcStatus::IS_HI_NEG)
5125 Mods ^= SISrcMods::NEG;
5126
5127 return Mods;
5128}
5129
5130static bool isValidToPack(SrcStatus HiStat, SrcStatus LoStat, Register NewReg,
5131 Register RootReg, const SIInstrInfo &TII,
5132 const MachineRegisterInfo &MRI) {
5133 auto IsHalfState = [](SrcStatus S) {
5134 return S == SrcStatus::IS_UPPER_HALF || S == SrcStatus::IS_UPPER_HALF_NEG ||
5135 S == SrcStatus::IS_LOWER_HALF || S == SrcStatus::IS_LOWER_HALF_NEG;
5136 };
5137 return isSameBitWidth(Reg1: NewReg, Reg2: RootReg, MRI) && IsHalfState(LoStat) &&
5138 IsHalfState(HiStat);
5139}
5140
5141std::pair<Register, unsigned> AMDGPUInstructionSelector::selectVOP3PModsImpl(
5142 Register RootReg, const MachineRegisterInfo &MRI, bool IsDOT) const {
5143 unsigned Mods = 0;
5144 // No modification if Root type is not form of <2 x Type>.
5145 if (isVectorOfTwoOrScalar(Reg: RootReg, MRI) != TypeClass::VECTOR_OF_TWO) {
5146 Mods |= SISrcMods::OP_SEL_1;
5147 return {RootReg, Mods};
5148 }
5149
5150 SearchOptions SO(RootReg, MRI);
5151
5152 std::pair<Register, SrcStatus> Stat = getLastSameOrNeg(Reg: RootReg, MRI, SO);
5153
5154 if (Stat.second == SrcStatus::IS_BOTH_NEG)
5155 Mods ^= (SISrcMods::NEG | SISrcMods::NEG_HI);
5156 else if (Stat.second == SrcStatus::IS_HI_NEG)
5157 Mods ^= SISrcMods::NEG_HI;
5158 else if (Stat.second == SrcStatus::IS_LO_NEG)
5159 Mods ^= SISrcMods::NEG;
5160
5161 MachineInstr *MI = MRI.getVRegDef(Reg: Stat.first);
5162
5163 if (MI->getOpcode() != AMDGPU::G_BUILD_VECTOR || MI->getNumOperands() != 3 ||
5164 (IsDOT && Subtarget->hasDOTOpSelHazard())) {
5165 Mods |= SISrcMods::OP_SEL_1;
5166 return {Stat.first, Mods};
5167 }
5168
5169 SmallVector<std::pair<Register, SrcStatus>> StatlistHi =
5170 getSrcStats(Reg: MI->getOperand(i: 2).getReg(), MRI, SO);
5171
5172 if (StatlistHi.empty()) {
5173 Mods |= SISrcMods::OP_SEL_1;
5174 return {Stat.first, Mods};
5175 }
5176
5177 SmallVector<std::pair<Register, SrcStatus>> StatlistLo =
5178 getSrcStats(Reg: MI->getOperand(i: 1).getReg(), MRI, SO);
5179
5180 if (StatlistLo.empty()) {
5181 Mods |= SISrcMods::OP_SEL_1;
5182 return {Stat.first, Mods};
5183 }
5184
5185 for (int I = StatlistHi.size() - 1; I >= 0; I--) {
5186 for (int J = StatlistLo.size() - 1; J >= 0; J--) {
5187 if (StatlistHi[I].first == StatlistLo[J].first &&
5188 isValidToPack(HiStat: StatlistHi[I].second, LoStat: StatlistLo[J].second,
5189 NewReg: StatlistHi[I].first, RootReg, TII, MRI))
5190 return {StatlistHi[I].first,
5191 updateMods(HiStat: StatlistHi[I].second, LoStat: StatlistLo[J].second, Mods)};
5192 }
5193 }
5194 // Packed instructions do not have abs modifiers.
5195 Mods |= SISrcMods::OP_SEL_1;
5196
5197 return {Stat.first, Mods};
5198}
5199
5200// Removed unused function `getAllKindImm` to eliminate dead code.
5201
5202static bool checkRB(Register Reg, unsigned int RBNo,
5203 const AMDGPURegisterBankInfo &RBI,
5204 const MachineRegisterInfo &MRI,
5205 const TargetRegisterInfo &TRI) {
5206 const RegisterBank *RB = RBI.getRegBank(Reg, MRI, TRI);
5207 return RB->getID() == RBNo;
5208}
5209
5210// This function is used to get the correct register bank for returned reg.
5211// Assume:
5212// 1. VOP3P is always legal for VGPR.
5213// 2. RootOp's regbank is legal.
5214// Thus
5215// 1. If RootOp is SGPR, then NewOp can be SGPR or VGPR.
5216// 2. If RootOp is VGPR, then NewOp must be VGPR.
5217static Register getLegalRegBank(Register NewReg, Register RootReg,
5218 const AMDGPURegisterBankInfo &RBI,
5219 MachineRegisterInfo &MRI,
5220 const TargetRegisterInfo &TRI,
5221 const SIInstrInfo &TII) {
5222 // RootOp can only be VGPR or SGPR (some hand written cases such as.
5223 // inst-select-ashr.v2s16.mir::ashr_v2s16_vs).
5224 if (checkRB(Reg: RootReg, RBNo: AMDGPU::SGPRRegBankID, RBI, MRI, TRI) ||
5225 checkRB(Reg: NewReg, RBNo: AMDGPU::VGPRRegBankID, RBI, MRI, TRI))
5226 return NewReg;
5227
5228 MachineInstr *MI = MRI.getVRegDef(Reg: RootReg);
5229 if (MI->getOpcode() == AMDGPU::COPY && NewReg == MI->getOperand(i: 1).getReg()) {
5230 // RootOp is VGPR, NewOp is not VGPR, but RootOp = COPY NewOp.
5231 return RootReg;
5232 }
5233
5234 MachineBasicBlock *BB = MI->getParent();
5235 Register DstReg = MRI.cloneVirtualRegister(VReg: RootReg);
5236
5237 MachineInstrBuilder MIB =
5238 BuildMI(BB&: *BB, I: MI, MIMD: MI->getDebugLoc(), MCID: TII.get(Opcode: AMDGPU::COPY), DestReg: DstReg)
5239 .addReg(RegNo: NewReg);
5240
5241 // Only accept VGPR.
5242 return MIB->getOperand(i: 0).getReg();
5243}
5244
5245InstructionSelector::ComplexRendererFns
5246AMDGPUInstructionSelector::selectVOP3PRetHelper(MachineOperand &Root,
5247 bool IsDOT) const {
5248 MachineRegisterInfo &MRI = Root.getParent()->getMF()->getRegInfo();
5249 Register Reg;
5250 unsigned Mods;
5251 std::tie(args&: Reg, args&: Mods) = selectVOP3PModsImpl(RootReg: Root.getReg(), MRI, IsDOT);
5252
5253 Reg = getLegalRegBank(NewReg: Reg, RootReg: Root.getReg(), RBI, MRI, TRI, TII);
5254 return {{
5255 [=](MachineInstrBuilder &MIB) { MIB.addReg(RegNo: Reg); },
5256 [=](MachineInstrBuilder &MIB) { MIB.addImm(Val: Mods); } // src_mods
5257 }};
5258}
5259
5260InstructionSelector::ComplexRendererFns
5261AMDGPUInstructionSelector::selectVOP3PMods(MachineOperand &Root) const {
5262
5263 return selectVOP3PRetHelper(Root);
5264}
5265
5266InstructionSelector::ComplexRendererFns
5267AMDGPUInstructionSelector::selectVOP3PModsDOT(MachineOperand &Root) const {
5268
5269 return selectVOP3PRetHelper(Root, IsDOT: true);
5270}
5271
5272InstructionSelector::ComplexRendererFns
5273AMDGPUInstructionSelector::selectWMMAOpSelVOP3PMods(
5274 MachineOperand &Root) const {
5275 assert((Root.isImm() && (Root.getImm() == -1 || Root.getImm() == 0)) &&
5276 "expected i1 value");
5277 unsigned Mods = SISrcMods::OP_SEL_1;
5278 if (Root.getImm() != 0)
5279 Mods |= SISrcMods::OP_SEL_0;
5280
5281 return {{
5282 [=](MachineInstrBuilder &MIB) { MIB.addImm(Val: Mods); } // src_mods
5283 }};
5284}
5285
5286static Register buildRegSequence(SmallVectorImpl<Register> &Elts,
5287 MachineInstr *InsertPt,
5288 MachineRegisterInfo &MRI) {
5289 const TargetRegisterClass *DstRegClass;
5290 switch (Elts.size()) {
5291 case 8:
5292 DstRegClass = &AMDGPU::VReg_256RegClass;
5293 break;
5294 case 4:
5295 DstRegClass = &AMDGPU::VReg_128RegClass;
5296 break;
5297 case 2:
5298 DstRegClass = &AMDGPU::VReg_64RegClass;
5299 break;
5300 default:
5301 llvm_unreachable("unhandled Reg sequence size");
5302 }
5303
5304 MachineIRBuilder B(*InsertPt);
5305 auto MIB = B.buildInstr(Opcode: AMDGPU::REG_SEQUENCE)
5306 .addDef(RegNo: MRI.createVirtualRegister(RegClass: DstRegClass));
5307 for (unsigned i = 0; i < Elts.size(); ++i) {
5308 MIB.addReg(RegNo: Elts[i]);
5309 MIB.addImm(Val: SIRegisterInfo::getSubRegFromChannel(Channel: i));
5310 }
5311 return MIB->getOperand(i: 0).getReg();
5312}
5313
5314static void selectWMMAModsNegAbs(unsigned ModOpcode, unsigned &Mods,
5315 SmallVectorImpl<Register> &Elts, Register &Src,
5316 MachineInstr *InsertPt,
5317 MachineRegisterInfo &MRI) {
5318 if (ModOpcode == TargetOpcode::G_FNEG) {
5319 Mods |= SISrcMods::NEG;
5320 // Check if all elements also have abs modifier
5321 SmallVector<Register, 8> NegAbsElts;
5322 for (auto El : Elts) {
5323 Register FabsSrc;
5324 if (!mi_match(R: El, MRI, P: m_GFabs(Src: m_Reg(R&: FabsSrc))))
5325 break;
5326 NegAbsElts.push_back(Elt: FabsSrc);
5327 }
5328 if (Elts.size() != NegAbsElts.size()) {
5329 // Neg
5330 Src = buildRegSequence(Elts, InsertPt, MRI);
5331 } else {
5332 // Neg and Abs
5333 Mods |= SISrcMods::NEG_HI;
5334 Src = buildRegSequence(Elts&: NegAbsElts, InsertPt, MRI);
5335 }
5336 } else {
5337 assert(ModOpcode == TargetOpcode::G_FABS);
5338 // Abs
5339 Mods |= SISrcMods::NEG_HI;
5340 Src = buildRegSequence(Elts, InsertPt, MRI);
5341 }
5342}
5343
5344InstructionSelector::ComplexRendererFns
5345AMDGPUInstructionSelector::selectWMMAModsF32NegAbs(MachineOperand &Root) const {
5346 Register Src = Root.getReg();
5347 unsigned Mods = SISrcMods::OP_SEL_1;
5348 SmallVector<Register, 8> EltsF32;
5349
5350 if (GBuildVector *BV = dyn_cast<GBuildVector>(Val: MRI->getVRegDef(Reg: Src))) {
5351 assert(BV->getNumSources() > 0);
5352 // Based on first element decide which mod we match, neg or abs
5353 MachineInstr *ElF32 = MRI->getVRegDef(Reg: BV->getSourceReg(I: 0));
5354 unsigned ModOpcode = (ElF32->getOpcode() == AMDGPU::G_FNEG)
5355 ? AMDGPU::G_FNEG
5356 : AMDGPU::G_FABS;
5357 for (unsigned i = 0; i < BV->getNumSources(); ++i) {
5358 ElF32 = MRI->getVRegDef(Reg: BV->getSourceReg(I: i));
5359 if (ElF32->getOpcode() != ModOpcode)
5360 break;
5361 EltsF32.push_back(Elt: ElF32->getOperand(i: 1).getReg());
5362 }
5363
5364 // All elements had ModOpcode modifier
5365 if (BV->getNumSources() == EltsF32.size()) {
5366 selectWMMAModsNegAbs(ModOpcode, Mods, Elts&: EltsF32, Src, InsertPt: Root.getParent(),
5367 MRI&: *MRI);
5368 }
5369 }
5370
5371 return {{[=](MachineInstrBuilder &MIB) { MIB.addReg(RegNo: Src); },
5372 [=](MachineInstrBuilder &MIB) { MIB.addImm(Val: Mods); }}};
5373}
5374
5375InstructionSelector::ComplexRendererFns
5376AMDGPUInstructionSelector::selectWMMAModsF16Neg(MachineOperand &Root) const {
5377 Register Src = Root.getReg();
5378 unsigned Mods = SISrcMods::OP_SEL_1;
5379 SmallVector<Register, 8> EltsV2F16;
5380
5381 if (GConcatVectors *CV = dyn_cast<GConcatVectors>(Val: MRI->getVRegDef(Reg: Src))) {
5382 for (unsigned i = 0; i < CV->getNumSources(); ++i) {
5383 Register FNegSrc;
5384 if (!mi_match(R: CV->getSourceReg(I: i), MRI: *MRI, P: m_GFNeg(Src: m_Reg(R&: FNegSrc))))
5385 break;
5386 EltsV2F16.push_back(Elt: FNegSrc);
5387 }
5388
5389 // All elements had ModOpcode modifier
5390 if (CV->getNumSources() == EltsV2F16.size()) {
5391 Mods |= SISrcMods::NEG;
5392 Mods |= SISrcMods::NEG_HI;
5393 Src = buildRegSequence(Elts&: EltsV2F16, InsertPt: Root.getParent(), MRI&: *MRI);
5394 }
5395 }
5396
5397 return {{[=](MachineInstrBuilder &MIB) { MIB.addReg(RegNo: Src); },
5398 [=](MachineInstrBuilder &MIB) { MIB.addImm(Val: Mods); }}};
5399}
5400
5401InstructionSelector::ComplexRendererFns
5402AMDGPUInstructionSelector::selectWMMAModsF16NegAbs(MachineOperand &Root) const {
5403 Register Src = Root.getReg();
5404 unsigned Mods = SISrcMods::OP_SEL_1;
5405 SmallVector<Register, 8> EltsV2F16;
5406
5407 if (GConcatVectors *CV = dyn_cast<GConcatVectors>(Val: MRI->getVRegDef(Reg: Src))) {
5408 assert(CV->getNumSources() > 0);
5409 MachineInstr *ElV2F16 = MRI->getVRegDef(Reg: CV->getSourceReg(I: 0));
5410 // Based on first element decide which mod we match, neg or abs
5411 unsigned ModOpcode = (ElV2F16->getOpcode() == AMDGPU::G_FNEG)
5412 ? AMDGPU::G_FNEG
5413 : AMDGPU::G_FABS;
5414
5415 for (unsigned i = 0; i < CV->getNumSources(); ++i) {
5416 ElV2F16 = MRI->getVRegDef(Reg: CV->getSourceReg(I: i));
5417 if (ElV2F16->getOpcode() != ModOpcode)
5418 break;
5419 EltsV2F16.push_back(Elt: ElV2F16->getOperand(i: 1).getReg());
5420 }
5421
5422 // All elements had ModOpcode modifier
5423 if (CV->getNumSources() == EltsV2F16.size()) {
5424 MachineIRBuilder B(*Root.getParent());
5425 selectWMMAModsNegAbs(ModOpcode, Mods, Elts&: EltsV2F16, Src, InsertPt: Root.getParent(),
5426 MRI&: *MRI);
5427 }
5428 }
5429
5430 return {{[=](MachineInstrBuilder &MIB) { MIB.addReg(RegNo: Src); },
5431 [=](MachineInstrBuilder &MIB) { MIB.addImm(Val: Mods); }}};
5432}
5433
5434InstructionSelector::ComplexRendererFns
5435AMDGPUInstructionSelector::selectWMMAVISrc(MachineOperand &Root) const {
5436 std::optional<FPValueAndVReg> FPValReg;
5437 if (mi_match(R: Root.getReg(), MRI: *MRI, P: m_GFCstOrSplat(FPValReg))) {
5438 if (TII.isInlineConstant(Imm: FPValReg->Value)) {
5439 return {{[=](MachineInstrBuilder &MIB) {
5440 MIB.addImm(Val: FPValReg->Value.bitcastToAPInt().getSExtValue());
5441 }}};
5442 }
5443 // Non-inlineable splat floats should not fall-through for integer immediate
5444 // checks.
5445 return {};
5446 }
5447
5448 APInt ICst;
5449 if (mi_match(R: Root.getReg(), MRI: *MRI, P: m_ICstOrSplat(Cst&: ICst))) {
5450 if (TII.isInlineConstant(Imm: ICst)) {
5451 return {
5452 {[=](MachineInstrBuilder &MIB) { MIB.addImm(Val: ICst.getSExtValue()); }}};
5453 }
5454 }
5455
5456 return {};
5457}
5458
5459InstructionSelector::ComplexRendererFns
5460AMDGPUInstructionSelector::selectSWMMACIndex8(MachineOperand &Root) const {
5461 Register Src =
5462 getDefIgnoringCopies(Reg: Root.getReg(), MRI: *MRI)->getOperand(i: 0).getReg();
5463 unsigned Key = 0;
5464
5465 Register ShiftSrc;
5466 std::optional<ValueAndVReg> ShiftAmt;
5467 if (mi_match(R: Src, MRI: *MRI, P: m_GLShr(L: m_Reg(R&: ShiftSrc), R: m_GCst(ValReg&: ShiftAmt))) &&
5468 MRI->getType(Reg: ShiftSrc).getSizeInBits() == 32 &&
5469 ShiftAmt->Value.getZExtValue() % 8 == 0) {
5470 Key = ShiftAmt->Value.getZExtValue() / 8;
5471 Src = ShiftSrc;
5472 }
5473
5474 return {{
5475 [=](MachineInstrBuilder &MIB) { MIB.addReg(RegNo: Src); },
5476 [=](MachineInstrBuilder &MIB) { MIB.addImm(Val: Key); } // index_key
5477 }};
5478}
5479
5480InstructionSelector::ComplexRendererFns
5481AMDGPUInstructionSelector::selectSWMMACIndex16(MachineOperand &Root) const {
5482
5483 Register Src =
5484 getDefIgnoringCopies(Reg: Root.getReg(), MRI: *MRI)->getOperand(i: 0).getReg();
5485 unsigned Key = 0;
5486
5487 Register ShiftSrc;
5488 std::optional<ValueAndVReg> ShiftAmt;
5489 if (mi_match(R: Src, MRI: *MRI, P: m_GLShr(L: m_Reg(R&: ShiftSrc), R: m_GCst(ValReg&: ShiftAmt))) &&
5490 MRI->getType(Reg: ShiftSrc).getSizeInBits() == 32 &&
5491 ShiftAmt->Value.getZExtValue() == 16) {
5492 Src = ShiftSrc;
5493 Key = 1;
5494 }
5495
5496 return {{
5497 [=](MachineInstrBuilder &MIB) { MIB.addReg(RegNo: Src); },
5498 [=](MachineInstrBuilder &MIB) { MIB.addImm(Val: Key); } // index_key
5499 }};
5500}
5501
5502InstructionSelector::ComplexRendererFns
5503AMDGPUInstructionSelector::selectSWMMACIndex32(MachineOperand &Root) const {
5504 Register Src =
5505 getDefIgnoringCopies(Reg: Root.getReg(), MRI: *MRI)->getOperand(i: 0).getReg();
5506 unsigned Key = 0;
5507
5508 Register S32 = matchZeroExtendFromS32(Reg: Src);
5509 if (!S32)
5510 S32 = matchAnyExtendFromS32(Reg: Src);
5511
5512 if (S32) {
5513 const MachineInstr *Def = getDefIgnoringCopies(Reg: S32, MRI: *MRI);
5514 if (Def->getOpcode() == TargetOpcode::G_UNMERGE_VALUES) {
5515 assert(Def->getNumOperands() == 3);
5516 Register DstReg1 = Def->getOperand(i: 1).getReg();
5517 if (mi_match(R: S32, MRI: *MRI,
5518 P: m_any_of(preds: m_SpecificReg(RequestedReg: DstReg1), preds: m_Copy(Src: m_Reg(R&: DstReg1))))) {
5519 Src = Def->getOperand(i: 2).getReg();
5520 Key = 1;
5521 }
5522 }
5523 }
5524
5525 return {{
5526 [=](MachineInstrBuilder &MIB) { MIB.addReg(RegNo: Src); },
5527 [=](MachineInstrBuilder &MIB) { MIB.addImm(Val: Key); } // index_key
5528 }};
5529}
5530
5531InstructionSelector::ComplexRendererFns
5532AMDGPUInstructionSelector::selectVOP3OpSelMods(MachineOperand &Root) const {
5533 Register Src;
5534 unsigned Mods;
5535 std::tie(args&: Src, args&: Mods) = selectVOP3ModsImpl(Src: Root.getReg());
5536
5537 // FIXME: Handle op_sel
5538 return {{
5539 [=](MachineInstrBuilder &MIB) { MIB.addReg(RegNo: Src); },
5540 [=](MachineInstrBuilder &MIB) { MIB.addImm(Val: Mods); } // src_mods
5541 }};
5542}
5543
5544// FIXME-TRUE16 remove when fake16 is removed
5545InstructionSelector::ComplexRendererFns
5546AMDGPUInstructionSelector::selectVINTERPMods(MachineOperand &Root) const {
5547 Register Src;
5548 unsigned Mods;
5549 std::tie(args&: Src, args&: Mods) = selectVOP3ModsImpl(Src: Root.getReg(),
5550 /*IsCanonicalizing=*/true,
5551 /*AllowAbs=*/false,
5552 /*OpSel=*/false);
5553
5554 return {{
5555 [=](MachineInstrBuilder &MIB) {
5556 MIB.addReg(
5557 RegNo: copyToVGPRIfSrcFolded(Src, Mods, Root, InsertPt: MIB, /* ForceVGPR */ true));
5558 },
5559 [=](MachineInstrBuilder &MIB) { MIB.addImm(Val: Mods); }, // src0_mods
5560 }};
5561}
5562
5563InstructionSelector::ComplexRendererFns
5564AMDGPUInstructionSelector::selectVINTERPModsHi(MachineOperand &Root) const {
5565 Register Src;
5566 unsigned Mods;
5567 std::tie(args&: Src, args&: Mods) = selectVOP3ModsImpl(Src: Root.getReg(),
5568 /*IsCanonicalizing=*/true,
5569 /*AllowAbs=*/false,
5570 /*OpSel=*/true);
5571
5572 return {{
5573 [=](MachineInstrBuilder &MIB) {
5574 MIB.addReg(
5575 RegNo: copyToVGPRIfSrcFolded(Src, Mods, Root, InsertPt: MIB, /* ForceVGPR */ true));
5576 },
5577 [=](MachineInstrBuilder &MIB) { MIB.addImm(Val: Mods); }, // src0_mods
5578 }};
5579}
5580
5581// Given \p Offset and load specified by the \p Root operand check if \p Offset
5582// is a multiple of the load byte size. If it is update \p Offset to a
5583// pre-scaled value and return true.
5584bool AMDGPUInstructionSelector::selectScaleOffset(MachineOperand &Root,
5585 Register &Offset,
5586 bool IsSigned) const {
5587 if (!Subtarget->hasScaleOffset())
5588 return false;
5589
5590 const MachineInstr &MI = *Root.getParent();
5591 MachineMemOperand *MMO = *MI.memoperands_begin();
5592
5593 if (!MMO->getSize().hasValue())
5594 return false;
5595
5596 uint64_t Size = MMO->getSize().getValue();
5597
5598 Register OffsetReg = matchExtendFromS32OrS32(Reg: Offset, IsSigned);
5599 if (!OffsetReg)
5600 OffsetReg = Offset;
5601
5602 if (auto Def = getDefSrcRegIgnoringCopies(Reg: OffsetReg, MRI: *MRI))
5603 OffsetReg = Def->Reg;
5604
5605 Register Op0;
5606 MachineInstr *Mul;
5607 bool ScaleOffset =
5608 (isPowerOf2_64(Value: Size) &&
5609 mi_match(R: OffsetReg, MRI: *MRI,
5610 P: m_GShl(L: m_Reg(R&: Op0),
5611 R: m_any_of(preds: m_SpecificICst(RequestedValue: Log2_64(Value: Size)),
5612 preds: m_Copy(Src: m_SpecificICst(RequestedValue: Log2_64(Value: Size))))))) ||
5613 mi_match(R: OffsetReg, MRI: *MRI,
5614 P: m_GMul(L: m_Reg(R&: Op0), R: m_any_of(preds: m_SpecificICst(RequestedValue: Size),
5615 preds: m_Copy(Src: m_SpecificICst(RequestedValue: Size))))) ||
5616 mi_match(
5617 R: OffsetReg, MRI: *MRI,
5618 P: m_BinOp(Opcode: IsSigned ? AMDGPU::S_MUL_I64_I32_PSEUDO : AMDGPU::S_MUL_U64,
5619 L: m_Reg(R&: Op0), R: m_SpecificICst(RequestedValue: Size))) ||
5620 // Match G_AMDGPU_MAD_U64_U32 offset, c, 0
5621 (mi_match(R: OffsetReg, MRI: *MRI, P: m_MInstr(MI&: Mul)) &&
5622 (Mul->getOpcode() == (IsSigned ? AMDGPU::G_AMDGPU_MAD_I64_I32
5623 : AMDGPU::G_AMDGPU_MAD_U64_U32) ||
5624 (IsSigned && Mul->getOpcode() == AMDGPU::G_AMDGPU_MAD_U64_U32 &&
5625 VT->signBitIsZero(Op: Mul->getOperand(i: 2).getReg()))) &&
5626 mi_match(R: Mul->getOperand(i: 4).getReg(), MRI: *MRI, P: m_ZeroInt()) &&
5627 mi_match(R: Mul->getOperand(i: 3).getReg(), MRI: *MRI,
5628 P: m_GTrunc(Src: m_any_of(preds: m_SpecificICst(RequestedValue: Size),
5629 preds: m_Copy(Src: m_SpecificICst(RequestedValue: Size))))) &&
5630 mi_match(R: Mul->getOperand(i: 2).getReg(), MRI: *MRI, P: m_Reg(R&: Op0)));
5631
5632 if (ScaleOffset)
5633 Offset = Op0;
5634
5635 return ScaleOffset;
5636}
5637
5638bool AMDGPUInstructionSelector::selectSmrdOffset(MachineOperand &Root,
5639 Register &Base,
5640 Register *SOffset,
5641 int64_t *Offset,
5642 bool *ScaleOffset) const {
5643 MachineInstr *MI = Root.getParent();
5644 MachineBasicBlock *MBB = MI->getParent();
5645
5646 // FIXME: We should shrink the GEP if the offset is known to be <= 32-bits,
5647 // then we can select all ptr + 32-bit offsets.
5648 SmallVector<GEPInfo, 4> AddrInfo;
5649 getAddrModeInfo(Load: *MI, MRI: *MRI, AddrInfo);
5650
5651 if (AddrInfo.empty())
5652 return false;
5653
5654 const GEPInfo &GEPI = AddrInfo[0];
5655 std::optional<int64_t> EncodedImm;
5656
5657 if (ScaleOffset)
5658 *ScaleOffset = false;
5659
5660 if (SOffset && Offset) {
5661 EncodedImm = AMDGPU::getSMRDEncodedOffset(ST: STI, ByteOffset: GEPI.Imm, /*IsBuffer=*/false,
5662 /*HasSOffset=*/true);
5663 if (GEPI.SgprParts.size() == 1 && GEPI.Imm != 0 && EncodedImm &&
5664 AddrInfo.size() > 1) {
5665 const GEPInfo &GEPI2 = AddrInfo[1];
5666 if (GEPI2.SgprParts.size() == 2 && GEPI2.Imm == 0) {
5667 Register OffsetReg = GEPI2.SgprParts[1];
5668 if (ScaleOffset)
5669 *ScaleOffset =
5670 selectScaleOffset(Root, Offset&: OffsetReg, IsSigned: false /* IsSigned */);
5671 OffsetReg = matchZeroExtendFromS32OrS32(Reg: OffsetReg);
5672 if (OffsetReg) {
5673 Base = GEPI2.SgprParts[0];
5674 *SOffset = OffsetReg;
5675 *Offset = *EncodedImm;
5676 if (*Offset >= 0 || !AMDGPU::hasSMRDSignedImmOffset(ST: STI))
5677 return true;
5678
5679 // For unbuffered smem loads, it is illegal for the Immediate Offset
5680 // to be negative if the resulting (Offset + (M0 or SOffset or zero)
5681 // is negative. Handle the case where the Immediate Offset + SOffset
5682 // is negative.
5683 auto SKnown = VT->getKnownBits(R: *SOffset);
5684 if (*Offset + SKnown.getMinValue().getSExtValue() < 0)
5685 return false;
5686
5687 return true;
5688 }
5689 }
5690 }
5691 return false;
5692 }
5693
5694 EncodedImm = AMDGPU::getSMRDEncodedOffset(ST: STI, ByteOffset: GEPI.Imm, /*IsBuffer=*/false,
5695 /*HasSOffset=*/false);
5696 if (Offset && GEPI.SgprParts.size() == 1 && EncodedImm) {
5697 Base = GEPI.SgprParts[0];
5698 *Offset = *EncodedImm;
5699 return true;
5700 }
5701
5702 // SGPR offset is unsigned.
5703 if (SOffset && GEPI.SgprParts.size() == 1 && isUInt<32>(x: GEPI.Imm) &&
5704 GEPI.Imm != 0) {
5705 // If we make it this far we have a load with an 32-bit immediate offset.
5706 // It is OK to select this using a sgpr offset, because we have already
5707 // failed trying to select this load into one of the _IMM variants since
5708 // the _IMM Patterns are considered before the _SGPR patterns.
5709 Base = GEPI.SgprParts[0];
5710 *SOffset = MRI->createVirtualRegister(RegClass: &AMDGPU::SReg_32RegClass);
5711 BuildMI(BB&: *MBB, I: MI, MIMD: MI->getDebugLoc(), MCID: TII.get(Opcode: AMDGPU::S_MOV_B32), DestReg: *SOffset)
5712 .addImm(Val: GEPI.Imm);
5713 return true;
5714 }
5715
5716 if (SOffset && GEPI.SgprParts.size() && GEPI.Imm == 0) {
5717 Register OffsetReg = GEPI.SgprParts[1];
5718 if (ScaleOffset)
5719 *ScaleOffset = selectScaleOffset(Root, Offset&: OffsetReg, IsSigned: false /* IsSigned */);
5720 OffsetReg = matchZeroExtendFromS32OrS32(Reg: OffsetReg);
5721 if (OffsetReg) {
5722 Base = GEPI.SgprParts[0];
5723 *SOffset = OffsetReg;
5724 return true;
5725 }
5726 }
5727
5728 return false;
5729}
5730
5731InstructionSelector::ComplexRendererFns
5732AMDGPUInstructionSelector::selectSmrdImm(MachineOperand &Root) const {
5733 Register Base;
5734 int64_t Offset;
5735 if (!selectSmrdOffset(Root, Base, /* SOffset= */ nullptr, Offset: &Offset,
5736 /* ScaleOffset */ nullptr))
5737 return std::nullopt;
5738
5739 return {{[=](MachineInstrBuilder &MIB) { MIB.addReg(RegNo: Base); },
5740 [=](MachineInstrBuilder &MIB) { MIB.addImm(Val: Offset); }}};
5741}
5742
5743InstructionSelector::ComplexRendererFns
5744AMDGPUInstructionSelector::selectSmrdImm32(MachineOperand &Root) const {
5745 SmallVector<GEPInfo, 4> AddrInfo;
5746 getAddrModeInfo(Load: *Root.getParent(), MRI: *MRI, AddrInfo);
5747
5748 if (AddrInfo.empty() || AddrInfo[0].SgprParts.size() != 1)
5749 return std::nullopt;
5750
5751 const GEPInfo &GEPInfo = AddrInfo[0];
5752 Register PtrReg = GEPInfo.SgprParts[0];
5753 std::optional<int64_t> EncodedImm =
5754 AMDGPU::getSMRDEncodedLiteralOffset32(ST: STI, ByteOffset: GEPInfo.Imm);
5755 if (!EncodedImm)
5756 return std::nullopt;
5757
5758 return {{
5759 [=](MachineInstrBuilder &MIB) { MIB.addReg(RegNo: PtrReg); },
5760 [=](MachineInstrBuilder &MIB) { MIB.addImm(Val: *EncodedImm); }
5761 }};
5762}
5763
5764InstructionSelector::ComplexRendererFns
5765AMDGPUInstructionSelector::selectSmrdSgpr(MachineOperand &Root) const {
5766 Register Base, SOffset;
5767 bool ScaleOffset;
5768 if (!selectSmrdOffset(Root, Base, SOffset: &SOffset, /* Offset= */ nullptr,
5769 ScaleOffset: &ScaleOffset))
5770 return std::nullopt;
5771
5772 unsigned CPol = ScaleOffset ? AMDGPU::CPol::SCAL : 0;
5773 return {{[=](MachineInstrBuilder &MIB) { MIB.addReg(RegNo: Base); },
5774 [=](MachineInstrBuilder &MIB) { MIB.addReg(RegNo: SOffset); },
5775 [=](MachineInstrBuilder &MIB) { MIB.addImm(Val: CPol); }}};
5776}
5777
5778InstructionSelector::ComplexRendererFns
5779AMDGPUInstructionSelector::selectSmrdSgprImm(MachineOperand &Root) const {
5780 Register Base, SOffset;
5781 int64_t Offset;
5782 bool ScaleOffset;
5783 if (!selectSmrdOffset(Root, Base, SOffset: &SOffset, Offset: &Offset, ScaleOffset: &ScaleOffset))
5784 return std::nullopt;
5785
5786 unsigned CPol = ScaleOffset ? AMDGPU::CPol::SCAL : 0;
5787 return {{[=](MachineInstrBuilder &MIB) { MIB.addReg(RegNo: Base); },
5788 [=](MachineInstrBuilder &MIB) { MIB.addReg(RegNo: SOffset); },
5789 [=](MachineInstrBuilder &MIB) { MIB.addImm(Val: Offset); },
5790 [=](MachineInstrBuilder &MIB) { MIB.addImm(Val: CPol); }}};
5791}
5792
5793std::pair<Register, int>
5794AMDGPUInstructionSelector::selectFlatOffsetImpl(MachineOperand &Root,
5795 uint64_t FlatVariant) const {
5796 MachineInstr *MI = Root.getParent();
5797
5798 auto Default = std::pair(Root.getReg(), 0);
5799
5800 if (!STI.hasFlatInstOffsets())
5801 return Default;
5802
5803 Register PtrBase;
5804 int64_t ConstOffset;
5805 bool IsInBounds;
5806 std::tie(args&: PtrBase, args&: ConstOffset, args&: IsInBounds) =
5807 getPtrBaseWithConstantOffset(Root: Root.getReg(), MRI: *MRI);
5808
5809 // Adding the offset to the base address with an immediate in a FLAT
5810 // instruction must not change the memory aperture in which the address falls.
5811 // Therefore we can only fold offsets from inbounds GEPs into FLAT
5812 // instructions.
5813 if (ConstOffset == 0 ||
5814 (FlatVariant == SIInstrFlags::FlatScratch &&
5815 !isFlatScratchBaseLegal(Addr: Root.getReg())) ||
5816 (FlatVariant == SIInstrFlags::FLAT && !IsInBounds))
5817 return Default;
5818
5819 unsigned AddrSpace = (*MI->memoperands_begin())->getAddrSpace();
5820 if (!TII.isLegalFLATOffset(Offset: ConstOffset, AddrSpace, FlatVariant))
5821 return Default;
5822
5823 return std::pair(PtrBase, ConstOffset);
5824}
5825
5826InstructionSelector::ComplexRendererFns
5827AMDGPUInstructionSelector::selectFlatOffset(MachineOperand &Root) const {
5828 auto PtrWithOffset = selectFlatOffsetImpl(Root, FlatVariant: SIInstrFlags::FLAT);
5829
5830 return {{
5831 [=](MachineInstrBuilder &MIB) { MIB.addReg(RegNo: PtrWithOffset.first); },
5832 [=](MachineInstrBuilder &MIB) { MIB.addImm(Val: PtrWithOffset.second); },
5833 }};
5834}
5835
5836InstructionSelector::ComplexRendererFns
5837AMDGPUInstructionSelector::selectGlobalOffset(MachineOperand &Root) const {
5838 auto PtrWithOffset = selectFlatOffsetImpl(Root, FlatVariant: SIInstrFlags::FlatGlobal);
5839
5840 return {{
5841 [=](MachineInstrBuilder &MIB) { MIB.addReg(RegNo: PtrWithOffset.first); },
5842 [=](MachineInstrBuilder &MIB) { MIB.addImm(Val: PtrWithOffset.second); },
5843 }};
5844}
5845
5846InstructionSelector::ComplexRendererFns
5847AMDGPUInstructionSelector::selectScratchOffset(MachineOperand &Root) const {
5848 auto PtrWithOffset = selectFlatOffsetImpl(Root, FlatVariant: SIInstrFlags::FlatScratch);
5849
5850 return {{
5851 [=](MachineInstrBuilder &MIB) { MIB.addReg(RegNo: PtrWithOffset.first); },
5852 [=](MachineInstrBuilder &MIB) { MIB.addImm(Val: PtrWithOffset.second); },
5853 }};
5854}
5855
5856// Match (64-bit SGPR base) + (zext vgpr offset) + sext(imm offset)
5857InstructionSelector::ComplexRendererFns
5858AMDGPUInstructionSelector::selectGlobalSAddr(MachineOperand &Root,
5859 unsigned CPolBits,
5860 bool NeedIOffset) const {
5861 Register Addr = Root.getReg();
5862 Register PtrBase;
5863 int64_t ConstOffset;
5864 int64_t ImmOffset = 0;
5865
5866 // Match the immediate offset first, which canonically is moved as low as
5867 // possible.
5868 std::tie(args&: PtrBase, args&: ConstOffset, args: std::ignore) =
5869 getPtrBaseWithConstantOffset(Root: Addr, MRI: *MRI);
5870
5871 if (ConstOffset != 0) {
5872 if (NeedIOffset &&
5873 TII.isLegalFLATOffset(Offset: ConstOffset, AddrSpace: AMDGPUAS::GLOBAL_ADDRESS,
5874 FlatVariant: SIInstrFlags::FlatGlobal)) {
5875 Addr = PtrBase;
5876 ImmOffset = ConstOffset;
5877 } else {
5878 auto PtrBaseDef = getDefSrcRegIgnoringCopies(Reg: PtrBase, MRI: *MRI);
5879 if (isSGPR(Reg: PtrBaseDef->Reg)) {
5880 if (ConstOffset > 0) {
5881 // Offset is too large.
5882 //
5883 // saddr + large_offset -> saddr +
5884 // (voffset = large_offset & ~MaxOffset) +
5885 // (large_offset & MaxOffset);
5886 int64_t SplitImmOffset = 0, RemainderOffset = ConstOffset;
5887 if (NeedIOffset) {
5888 std::tie(args&: SplitImmOffset, args&: RemainderOffset) =
5889 TII.splitFlatOffset(COffsetVal: ConstOffset, AddrSpace: AMDGPUAS::GLOBAL_ADDRESS,
5890 FlatVariant: SIInstrFlags::FlatGlobal);
5891 }
5892
5893 if (Subtarget->hasSignedGVSOffset() ? isInt<32>(x: RemainderOffset)
5894 : isUInt<32>(x: RemainderOffset)) {
5895 MachineInstr *MI = Root.getParent();
5896 MachineBasicBlock *MBB = MI->getParent();
5897 Register HighBits =
5898 MRI->createVirtualRegister(RegClass: &AMDGPU::VGPR_32RegClass);
5899
5900 BuildMI(BB&: *MBB, I: MI, MIMD: MI->getDebugLoc(), MCID: TII.get(Opcode: AMDGPU::V_MOV_B32_e32),
5901 DestReg: HighBits)
5902 .addImm(Val: RemainderOffset);
5903
5904 if (NeedIOffset)
5905 return {{
5906 [=](MachineInstrBuilder &MIB) {
5907 MIB.addReg(RegNo: PtrBase);
5908 }, // saddr
5909 [=](MachineInstrBuilder &MIB) {
5910 MIB.addReg(RegNo: HighBits);
5911 }, // voffset
5912 [=](MachineInstrBuilder &MIB) { MIB.addImm(Val: SplitImmOffset); },
5913 [=](MachineInstrBuilder &MIB) { MIB.addImm(Val: CPolBits); },
5914 }};
5915 return {{
5916 [=](MachineInstrBuilder &MIB) { MIB.addReg(RegNo: PtrBase); }, // saddr
5917 [=](MachineInstrBuilder &MIB) {
5918 MIB.addReg(RegNo: HighBits);
5919 }, // voffset
5920 [=](MachineInstrBuilder &MIB) { MIB.addImm(Val: CPolBits); },
5921 }};
5922 }
5923 }
5924
5925 // We are adding a 64 bit SGPR and a constant. If constant bus limit
5926 // is 1 we would need to perform 1 or 2 extra moves for each half of
5927 // the constant and it is better to do a scalar add and then issue a
5928 // single VALU instruction to materialize zero. Otherwise it is less
5929 // instructions to perform VALU adds with immediates or inline literals.
5930 unsigned NumLiterals =
5931 !TII.isInlineConstant(Imm: APInt(32, Lo_32(Value: ConstOffset))) +
5932 !TII.isInlineConstant(Imm: APInt(32, Hi_32(Value: ConstOffset)));
5933 if (STI.getConstantBusLimit(Opcode: AMDGPU::V_ADD_U32_e64) > NumLiterals)
5934 return std::nullopt;
5935 }
5936 }
5937 }
5938
5939 // Match the variable offset.
5940 auto AddrDef = getDefSrcRegIgnoringCopies(Reg: Addr, MRI: *MRI);
5941 if (AddrDef->MI->getOpcode() == AMDGPU::G_PTR_ADD) {
5942 // Look through the SGPR->VGPR copy.
5943 Register SAddr =
5944 getSrcRegIgnoringCopies(Reg: AddrDef->MI->getOperand(i: 1).getReg(), MRI: *MRI);
5945
5946 if (isSGPR(Reg: SAddr)) {
5947 Register PtrBaseOffset = AddrDef->MI->getOperand(i: 2).getReg();
5948
5949 // It's possible voffset is an SGPR here, but the copy to VGPR will be
5950 // inserted later.
5951 bool ScaleOffset = selectScaleOffset(Root, Offset&: PtrBaseOffset,
5952 IsSigned: Subtarget->hasSignedGVSOffset());
5953 if (Register VOffset = matchExtendFromS32OrS32(
5954 Reg: PtrBaseOffset, IsSigned: Subtarget->hasSignedGVSOffset())) {
5955 if (NeedIOffset)
5956 return {{[=](MachineInstrBuilder &MIB) { // saddr
5957 MIB.addReg(RegNo: SAddr);
5958 },
5959 [=](MachineInstrBuilder &MIB) { // voffset
5960 MIB.addReg(RegNo: VOffset);
5961 },
5962 [=](MachineInstrBuilder &MIB) { // offset
5963 MIB.addImm(Val: ImmOffset);
5964 },
5965 [=](MachineInstrBuilder &MIB) { // cpol
5966 MIB.addImm(Val: CPolBits |
5967 (ScaleOffset ? AMDGPU::CPol::SCAL : 0));
5968 }}};
5969 return {{[=](MachineInstrBuilder &MIB) { // saddr
5970 MIB.addReg(RegNo: SAddr);
5971 },
5972 [=](MachineInstrBuilder &MIB) { // voffset
5973 MIB.addReg(RegNo: VOffset);
5974 },
5975 [=](MachineInstrBuilder &MIB) { // cpol
5976 MIB.addImm(Val: CPolBits |
5977 (ScaleOffset ? AMDGPU::CPol::SCAL : 0));
5978 }}};
5979 }
5980 }
5981 }
5982
5983 // FIXME: We should probably have folded COPY (G_IMPLICIT_DEF) earlier, and
5984 // drop this.
5985 if (AddrDef->MI->getOpcode() == AMDGPU::G_IMPLICIT_DEF ||
5986 AddrDef->MI->getOpcode() == AMDGPU::G_CONSTANT || !isSGPR(Reg: AddrDef->Reg))
5987 return std::nullopt;
5988
5989 // It's cheaper to materialize a single 32-bit zero for vaddr than the two
5990 // moves required to copy a 64-bit SGPR to VGPR.
5991 MachineInstr *MI = Root.getParent();
5992 MachineBasicBlock *MBB = MI->getParent();
5993 Register VOffset = MRI->createVirtualRegister(RegClass: &AMDGPU::VGPR_32RegClass);
5994
5995 BuildMI(BB&: *MBB, I: MI, MIMD: MI->getDebugLoc(), MCID: TII.get(Opcode: AMDGPU::V_MOV_B32_e32), DestReg: VOffset)
5996 .addImm(Val: 0);
5997
5998 if (NeedIOffset)
5999 return {{
6000 [=](MachineInstrBuilder &MIB) { MIB.addReg(RegNo: AddrDef->Reg); }, // saddr
6001 [=](MachineInstrBuilder &MIB) { MIB.addReg(RegNo: VOffset); }, // voffset
6002 [=](MachineInstrBuilder &MIB) { MIB.addImm(Val: ImmOffset); }, // offset
6003 [=](MachineInstrBuilder &MIB) { MIB.addImm(Val: CPolBits); } // cpol
6004 }};
6005 return {{
6006 [=](MachineInstrBuilder &MIB) { MIB.addReg(RegNo: AddrDef->Reg); }, // saddr
6007 [=](MachineInstrBuilder &MIB) { MIB.addReg(RegNo: VOffset); }, // voffset
6008 [=](MachineInstrBuilder &MIB) { MIB.addImm(Val: CPolBits); } // cpol
6009 }};
6010}
6011
6012InstructionSelector::ComplexRendererFns
6013AMDGPUInstructionSelector::selectGlobalSAddr(MachineOperand &Root) const {
6014 return selectGlobalSAddr(Root, CPolBits: 0);
6015}
6016
6017InstructionSelector::ComplexRendererFns
6018AMDGPUInstructionSelector::selectGlobalSAddrCPol(MachineOperand &Root) const {
6019 const MachineInstr &I = *Root.getParent();
6020
6021 // We are assuming CPol is always the last operand of the intrinsic.
6022 auto PassedCPol =
6023 I.getOperand(i: I.getNumOperands() - 1).getImm() & ~AMDGPU::CPol::SCAL;
6024 return selectGlobalSAddr(Root, CPolBits: PassedCPol);
6025}
6026
6027InstructionSelector::ComplexRendererFns
6028AMDGPUInstructionSelector::selectGlobalSAddrCPolM0(MachineOperand &Root) const {
6029 const MachineInstr &I = *Root.getParent();
6030
6031 // We are assuming CPol is second from last operand of the intrinsic.
6032 auto PassedCPol =
6033 I.getOperand(i: I.getNumOperands() - 2).getImm() & ~AMDGPU::CPol::SCAL;
6034 return selectGlobalSAddr(Root, CPolBits: PassedCPol);
6035}
6036
6037InstructionSelector::ComplexRendererFns
6038AMDGPUInstructionSelector::selectGlobalSAddrGLC(MachineOperand &Root) const {
6039 return selectGlobalSAddr(Root, CPolBits: AMDGPU::CPol::GLC);
6040}
6041
6042InstructionSelector::ComplexRendererFns
6043AMDGPUInstructionSelector::selectGlobalSAddrNoIOffset(
6044 MachineOperand &Root) const {
6045 const MachineInstr &I = *Root.getParent();
6046
6047 // We are assuming CPol is always the last operand of the intrinsic.
6048 auto PassedCPol =
6049 I.getOperand(i: I.getNumOperands() - 1).getImm() & ~AMDGPU::CPol::SCAL;
6050 return selectGlobalSAddr(Root, CPolBits: PassedCPol, NeedIOffset: false);
6051}
6052
6053InstructionSelector::ComplexRendererFns
6054AMDGPUInstructionSelector::selectGlobalSAddrNoIOffsetM0(
6055 MachineOperand &Root) const {
6056 const MachineInstr &I = *Root.getParent();
6057
6058 // We are assuming CPol is second from last operand of the intrinsic.
6059 auto PassedCPol =
6060 I.getOperand(i: I.getNumOperands() - 2).getImm() & ~AMDGPU::CPol::SCAL;
6061 return selectGlobalSAddr(Root, CPolBits: PassedCPol, NeedIOffset: false);
6062}
6063
6064InstructionSelector::ComplexRendererFns
6065AMDGPUInstructionSelector::selectScratchSAddr(MachineOperand &Root) const {
6066 Register Addr = Root.getReg();
6067 Register PtrBase;
6068 int64_t ConstOffset;
6069 int64_t ImmOffset = 0;
6070
6071 // Match the immediate offset first, which canonically is moved as low as
6072 // possible.
6073 std::tie(args&: PtrBase, args&: ConstOffset, args: std::ignore) =
6074 getPtrBaseWithConstantOffset(Root: Addr, MRI: *MRI);
6075
6076 if (ConstOffset != 0 && isFlatScratchBaseLegal(Addr) &&
6077 TII.isLegalFLATOffset(Offset: ConstOffset, AddrSpace: AMDGPUAS::PRIVATE_ADDRESS,
6078 FlatVariant: SIInstrFlags::FlatScratch)) {
6079 Addr = PtrBase;
6080 ImmOffset = ConstOffset;
6081 }
6082
6083 auto AddrDef = getDefSrcRegIgnoringCopies(Reg: Addr, MRI: *MRI);
6084 if (AddrDef->MI->getOpcode() == AMDGPU::G_FRAME_INDEX) {
6085 int FI = AddrDef->MI->getOperand(i: 1).getIndex();
6086 return {{
6087 [=](MachineInstrBuilder &MIB) { MIB.addFrameIndex(Idx: FI); }, // saddr
6088 [=](MachineInstrBuilder &MIB) { MIB.addImm(Val: ImmOffset); } // offset
6089 }};
6090 }
6091
6092 Register SAddr = AddrDef->Reg;
6093
6094 if (AddrDef->MI->getOpcode() == AMDGPU::G_PTR_ADD) {
6095 Register LHS = AddrDef->MI->getOperand(i: 1).getReg();
6096 Register RHS = AddrDef->MI->getOperand(i: 2).getReg();
6097 auto LHSDef = getDefSrcRegIgnoringCopies(Reg: LHS, MRI: *MRI);
6098 auto RHSDef = getDefSrcRegIgnoringCopies(Reg: RHS, MRI: *MRI);
6099
6100 if (LHSDef->MI->getOpcode() == AMDGPU::G_FRAME_INDEX &&
6101 isSGPR(Reg: RHSDef->Reg)) {
6102 int FI = LHSDef->MI->getOperand(i: 1).getIndex();
6103 MachineInstr &I = *Root.getParent();
6104 MachineBasicBlock *BB = I.getParent();
6105 const DebugLoc &DL = I.getDebugLoc();
6106 SAddr = MRI->createVirtualRegister(RegClass: &AMDGPU::SReg_32RegClass);
6107
6108 BuildMI(BB&: *BB, I: &I, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::S_ADD_I32), DestReg: SAddr)
6109 .addFrameIndex(Idx: FI)
6110 .addReg(RegNo: RHSDef->Reg)
6111 .setOperandDead(3); // Dead scc
6112 }
6113 }
6114
6115 if (!isSGPR(Reg: SAddr))
6116 return std::nullopt;
6117
6118 return {{
6119 [=](MachineInstrBuilder &MIB) { MIB.addReg(RegNo: SAddr); }, // saddr
6120 [=](MachineInstrBuilder &MIB) { MIB.addImm(Val: ImmOffset); } // offset
6121 }};
6122}
6123
6124// Check whether the flat scratch SVS swizzle bug affects this access.
6125bool AMDGPUInstructionSelector::checkFlatScratchSVSSwizzleBug(
6126 Register VAddr, Register SAddr, uint64_t ImmOffset) const {
6127 if (!Subtarget->hasFlatScratchSVSSwizzleBug())
6128 return false;
6129
6130 // The bug affects the swizzling of SVS accesses if there is any carry out
6131 // from the two low order bits (i.e. from bit 1 into bit 2) when adding
6132 // voffset to (soffset + inst_offset).
6133 auto VKnown = VT->getKnownBits(R: VAddr);
6134 auto SKnown = KnownBits::add(LHS: VT->getKnownBits(R: SAddr),
6135 RHS: KnownBits::makeConstant(C: APInt(32, ImmOffset)));
6136 uint64_t VMax = VKnown.getMaxValue().getZExtValue();
6137 uint64_t SMax = SKnown.getMaxValue().getZExtValue();
6138 return (VMax & 3) + (SMax & 3) >= 4;
6139}
6140
6141InstructionSelector::ComplexRendererFns
6142AMDGPUInstructionSelector::selectScratchSVAddr(MachineOperand &Root) const {
6143 Register Addr = Root.getReg();
6144 Register PtrBase;
6145 int64_t ConstOffset;
6146 int64_t ImmOffset = 0;
6147
6148 // Match the immediate offset first, which canonically is moved as low as
6149 // possible.
6150 std::tie(args&: PtrBase, args&: ConstOffset, args: std::ignore) =
6151 getPtrBaseWithConstantOffset(Root: Addr, MRI: *MRI);
6152
6153 Register OrigAddr = Addr;
6154 if (ConstOffset != 0 &&
6155 TII.isLegalFLATOffset(Offset: ConstOffset, AddrSpace: AMDGPUAS::PRIVATE_ADDRESS,
6156 FlatVariant: SIInstrFlags::FlatScratch)) {
6157 Addr = PtrBase;
6158 ImmOffset = ConstOffset;
6159 }
6160
6161 auto AddrDef = getDefSrcRegIgnoringCopies(Reg: Addr, MRI: *MRI);
6162 if (AddrDef->MI->getOpcode() != AMDGPU::G_PTR_ADD)
6163 return std::nullopt;
6164
6165 Register RHS = AddrDef->MI->getOperand(i: 2).getReg();
6166 if (RBI.getRegBank(Reg: RHS, MRI: *MRI, TRI)->getID() != AMDGPU::VGPRRegBankID)
6167 return std::nullopt;
6168
6169 Register LHS = AddrDef->MI->getOperand(i: 1).getReg();
6170 auto LHSDef = getDefSrcRegIgnoringCopies(Reg: LHS, MRI: *MRI);
6171
6172 if (OrigAddr != Addr) {
6173 if (!isFlatScratchBaseLegalSVImm(Addr: OrigAddr))
6174 return std::nullopt;
6175 } else {
6176 if (!isFlatScratchBaseLegalSV(Addr: OrigAddr))
6177 return std::nullopt;
6178 }
6179
6180 if (checkFlatScratchSVSSwizzleBug(VAddr: RHS, SAddr: LHS, ImmOffset))
6181 return std::nullopt;
6182
6183 unsigned CPol = selectScaleOffset(Root, Offset&: RHS, IsSigned: true /* IsSigned */)
6184 ? AMDGPU::CPol::SCAL
6185 : 0;
6186
6187 if (LHSDef->MI->getOpcode() == AMDGPU::G_FRAME_INDEX) {
6188 int FI = LHSDef->MI->getOperand(i: 1).getIndex();
6189 return {{
6190 [=](MachineInstrBuilder &MIB) { MIB.addReg(RegNo: RHS); }, // vaddr
6191 [=](MachineInstrBuilder &MIB) { MIB.addFrameIndex(Idx: FI); }, // saddr
6192 [=](MachineInstrBuilder &MIB) { MIB.addImm(Val: ImmOffset); }, // offset
6193 [=](MachineInstrBuilder &MIB) { MIB.addImm(Val: CPol); } // cpol
6194 }};
6195 }
6196
6197 if (!isSGPR(Reg: LHS))
6198 if (auto Def = getDefSrcRegIgnoringCopies(Reg: LHS, MRI: *MRI))
6199 LHS = Def->Reg;
6200
6201 if (!isSGPR(Reg: LHS))
6202 return std::nullopt;
6203
6204 return {{
6205 [=](MachineInstrBuilder &MIB) { MIB.addReg(RegNo: RHS); }, // vaddr
6206 [=](MachineInstrBuilder &MIB) { MIB.addReg(RegNo: LHS); }, // saddr
6207 [=](MachineInstrBuilder &MIB) { MIB.addImm(Val: ImmOffset); }, // offset
6208 [=](MachineInstrBuilder &MIB) { MIB.addImm(Val: CPol); } // cpol
6209 }};
6210}
6211
6212InstructionSelector::ComplexRendererFns
6213AMDGPUInstructionSelector::selectMUBUFScratchOffen(MachineOperand &Root) const {
6214 MachineInstr *MI = Root.getParent();
6215 MachineBasicBlock *MBB = MI->getParent();
6216 MachineFunction *MF = MBB->getParent();
6217 const SIMachineFunctionInfo *Info = MF->getInfo<SIMachineFunctionInfo>();
6218
6219 int64_t Offset = 0;
6220 if (mi_match(R: Root.getReg(), MRI: *MRI, P: m_ICst(Cst&: Offset)) &&
6221 Offset != TM.getNullPointerValue(AddrSpace: AMDGPUAS::PRIVATE_ADDRESS)) {
6222 Register HighBits = MRI->createVirtualRegister(RegClass: &AMDGPU::VGPR_32RegClass);
6223
6224 // TODO: Should this be inside the render function? The iterator seems to
6225 // move.
6226 const uint32_t MaxOffset = SIInstrInfo::getMaxMUBUFImmOffset(ST: *Subtarget);
6227 BuildMI(BB&: *MBB, I: MI, MIMD: MI->getDebugLoc(), MCID: TII.get(Opcode: AMDGPU::V_MOV_B32_e32),
6228 DestReg: HighBits)
6229 .addImm(Val: Offset & ~MaxOffset);
6230
6231 return {{[=](MachineInstrBuilder &MIB) { // rsrc
6232 MIB.addReg(RegNo: Info->getScratchRSrcReg());
6233 },
6234 [=](MachineInstrBuilder &MIB) { // vaddr
6235 MIB.addReg(RegNo: HighBits);
6236 },
6237 [=](MachineInstrBuilder &MIB) { // soffset
6238 // Use constant zero for soffset and rely on eliminateFrameIndex
6239 // to choose the appropriate frame register if need be.
6240 MIB.addImm(Val: 0);
6241 },
6242 [=](MachineInstrBuilder &MIB) { // offset
6243 MIB.addImm(Val: Offset & MaxOffset);
6244 }}};
6245 }
6246
6247 assert(Offset == 0 || Offset == -1);
6248
6249 // Try to fold a frame index directly into the MUBUF vaddr field, and any
6250 // offsets.
6251 std::optional<int> FI;
6252 Register VAddr = Root.getReg();
6253
6254 const MachineInstr *RootDef = MRI->getVRegDef(Reg: Root.getReg());
6255 Register PtrBase;
6256 int64_t ConstOffset;
6257 std::tie(args&: PtrBase, args&: ConstOffset, args: std::ignore) =
6258 getPtrBaseWithConstantOffset(Root: VAddr, MRI: *MRI);
6259 if (ConstOffset != 0) {
6260 if (TII.isLegalMUBUFImmOffset(Imm: ConstOffset) &&
6261 (!STI.privateMemoryResourceIsRangeChecked() ||
6262 VT->signBitIsZero(Op: PtrBase))) {
6263 const MachineInstr *PtrBaseDef = MRI->getVRegDef(Reg: PtrBase);
6264 if (PtrBaseDef->getOpcode() == AMDGPU::G_FRAME_INDEX)
6265 FI = PtrBaseDef->getOperand(i: 1).getIndex();
6266 else
6267 VAddr = PtrBase;
6268 Offset = ConstOffset;
6269 }
6270 } else if (RootDef->getOpcode() == AMDGPU::G_FRAME_INDEX) {
6271 FI = RootDef->getOperand(i: 1).getIndex();
6272 }
6273
6274 return {{[=](MachineInstrBuilder &MIB) { // rsrc
6275 MIB.addReg(RegNo: Info->getScratchRSrcReg());
6276 },
6277 [=](MachineInstrBuilder &MIB) { // vaddr
6278 if (FI)
6279 MIB.addFrameIndex(Idx: *FI);
6280 else
6281 MIB.addReg(RegNo: VAddr);
6282 },
6283 [=](MachineInstrBuilder &MIB) { // soffset
6284 // Use constant zero for soffset and rely on eliminateFrameIndex
6285 // to choose the appropriate frame register if need be.
6286 MIB.addImm(Val: 0);
6287 },
6288 [=](MachineInstrBuilder &MIB) { // offset
6289 MIB.addImm(Val: Offset);
6290 }}};
6291}
6292
6293bool AMDGPUInstructionSelector::isDSOffsetLegal(Register Base,
6294 int64_t Offset) const {
6295 if (!isUInt<16>(x: Offset))
6296 return false;
6297
6298 if (STI.hasUsableDSOffset() || STI.unsafeDSOffsetFoldingEnabled())
6299 return true;
6300
6301 // On Southern Islands instruction with a negative base value and an offset
6302 // don't seem to work.
6303 return VT->signBitIsZero(Op: Base);
6304}
6305
6306bool AMDGPUInstructionSelector::isDSOffset2Legal(Register Base, int64_t Offset0,
6307 int64_t Offset1,
6308 unsigned Size) const {
6309 if (Offset0 % Size != 0 || Offset1 % Size != 0)
6310 return false;
6311 if (!isUInt<8>(x: Offset0 / Size) || !isUInt<8>(x: Offset1 / Size))
6312 return false;
6313
6314 if (STI.hasUsableDSOffset() || STI.unsafeDSOffsetFoldingEnabled())
6315 return true;
6316
6317 // On Southern Islands instruction with a negative base value and an offset
6318 // don't seem to work.
6319 return VT->signBitIsZero(Op: Base);
6320}
6321
6322// Return whether the operation has NoUnsignedWrap property.
6323static bool isNoUnsignedWrap(MachineInstr *Addr) {
6324 return Addr->getOpcode() == TargetOpcode::G_OR ||
6325 (Addr->getOpcode() == TargetOpcode::G_PTR_ADD &&
6326 Addr->getFlag(Flag: MachineInstr::NoUWrap));
6327}
6328
6329// Check that the base address of flat scratch load/store in the form of `base +
6330// offset` is legal to be put in SGPR/VGPR (i.e. unsigned per hardware
6331// requirement). We always treat the first operand as the base address here.
6332bool AMDGPUInstructionSelector::isFlatScratchBaseLegal(Register Addr) const {
6333 MachineInstr *AddrMI = getDefIgnoringCopies(Reg: Addr, MRI: *MRI);
6334
6335 if (isNoUnsignedWrap(Addr: AddrMI))
6336 return true;
6337
6338 // Starting with GFX12, VADDR and SADDR fields in VSCRATCH can use negative
6339 // values.
6340 if (STI.hasSignedScratchOffsets())
6341 return true;
6342
6343 Register LHS = AddrMI->getOperand(i: 1).getReg();
6344 Register RHS = AddrMI->getOperand(i: 2).getReg();
6345
6346 if (AddrMI->getOpcode() == TargetOpcode::G_PTR_ADD) {
6347 std::optional<ValueAndVReg> RhsValReg =
6348 getIConstantVRegValWithLookThrough(VReg: RHS, MRI: *MRI);
6349 // If the immediate offset is negative and within certain range, the base
6350 // address cannot also be negative. If the base is also negative, the sum
6351 // would be either negative or much larger than the valid range of scratch
6352 // memory a thread can access.
6353 if (RhsValReg && RhsValReg->Value.getSExtValue() < 0 &&
6354 RhsValReg->Value.getSExtValue() > -0x40000000)
6355 return true;
6356 }
6357
6358 return VT->signBitIsZero(Op: LHS);
6359}
6360
6361// Check address value in SGPR/VGPR are legal for flat scratch in the form
6362// of: SGPR + VGPR.
6363bool AMDGPUInstructionSelector::isFlatScratchBaseLegalSV(Register Addr) const {
6364 MachineInstr *AddrMI = getDefIgnoringCopies(Reg: Addr, MRI: *MRI);
6365
6366 if (isNoUnsignedWrap(Addr: AddrMI))
6367 return true;
6368
6369 // Starting with GFX12, VADDR and SADDR fields in VSCRATCH can use negative
6370 // values.
6371 if (STI.hasSignedScratchOffsets())
6372 return true;
6373
6374 Register LHS = AddrMI->getOperand(i: 1).getReg();
6375 Register RHS = AddrMI->getOperand(i: 2).getReg();
6376 return VT->signBitIsZero(Op: RHS) && VT->signBitIsZero(Op: LHS);
6377}
6378
6379// Check address value in SGPR/VGPR are legal for flat scratch in the form
6380// of: SGPR + VGPR + Imm.
6381bool AMDGPUInstructionSelector::isFlatScratchBaseLegalSVImm(
6382 Register Addr) const {
6383 // Starting with GFX12, VADDR and SADDR fields in VSCRATCH can use negative
6384 // values.
6385 if (STI.hasSignedScratchOffsets())
6386 return true;
6387
6388 MachineInstr *AddrMI = getDefIgnoringCopies(Reg: Addr, MRI: *MRI);
6389 Register Base = AddrMI->getOperand(i: 1).getReg();
6390 std::optional<DefinitionAndSourceRegister> BaseDef =
6391 getDefSrcRegIgnoringCopies(Reg: Base, MRI: *MRI);
6392 std::optional<ValueAndVReg> RHSOffset =
6393 getIConstantVRegValWithLookThrough(VReg: AddrMI->getOperand(i: 2).getReg(), MRI: *MRI);
6394 assert(RHSOffset);
6395
6396 // If the immediate offset is negative and within certain range, the base
6397 // address cannot also be negative. If the base is also negative, the sum
6398 // would be either negative or much larger than the valid range of scratch
6399 // memory a thread can access.
6400 if (isNoUnsignedWrap(Addr: BaseDef->MI) &&
6401 (isNoUnsignedWrap(Addr: AddrMI) ||
6402 (RHSOffset->Value.getSExtValue() < 0 &&
6403 RHSOffset->Value.getSExtValue() > -0x40000000)))
6404 return true;
6405
6406 Register LHS = BaseDef->MI->getOperand(i: 1).getReg();
6407 Register RHS = BaseDef->MI->getOperand(i: 2).getReg();
6408 return VT->signBitIsZero(Op: RHS) && VT->signBitIsZero(Op: LHS);
6409}
6410
6411bool AMDGPUInstructionSelector::isUnneededShiftMask(const MachineInstr &MI,
6412 unsigned ShAmtBits) const {
6413 assert(MI.getOpcode() == TargetOpcode::G_AND);
6414
6415 std::optional<APInt> RHS =
6416 getIConstantVRegVal(VReg: MI.getOperand(i: 2).getReg(), MRI: *MRI);
6417 if (!RHS)
6418 return false;
6419
6420 if (RHS->countr_one() >= ShAmtBits)
6421 return true;
6422
6423 const APInt &LHSKnownZeros = VT->getKnownZeroes(R: MI.getOperand(i: 1).getReg());
6424 return (LHSKnownZeros | *RHS).countr_one() >= ShAmtBits;
6425}
6426
6427InstructionSelector::ComplexRendererFns
6428AMDGPUInstructionSelector::selectMUBUFScratchOffset(
6429 MachineOperand &Root) const {
6430 Register Reg = Root.getReg();
6431 const SIMachineFunctionInfo *Info = MF->getInfo<SIMachineFunctionInfo>();
6432
6433 std::optional<DefinitionAndSourceRegister> Def =
6434 getDefSrcRegIgnoringCopies(Reg, MRI: *MRI);
6435 assert(Def && "this shouldn't be an optional result");
6436 Reg = Def->Reg;
6437
6438 if (Register WaveBase = getWaveAddress(Def: Def->MI)) {
6439 return {{
6440 [=](MachineInstrBuilder &MIB) { // rsrc
6441 MIB.addReg(RegNo: Info->getScratchRSrcReg());
6442 },
6443 [=](MachineInstrBuilder &MIB) { // soffset
6444 MIB.addReg(RegNo: WaveBase);
6445 },
6446 [=](MachineInstrBuilder &MIB) { MIB.addImm(Val: 0); } // offset
6447 }};
6448 }
6449
6450 int64_t Offset = 0;
6451
6452 // FIXME: Copy check is a hack
6453 Register BasePtr;
6454 if (mi_match(R: Reg, MRI: *MRI,
6455 P: m_GPtrAdd(L: m_Reg(R&: BasePtr),
6456 R: m_any_of(preds: m_ICst(Cst&: Offset), preds: m_Copy(Src: m_ICst(Cst&: Offset)))))) {
6457 if (!TII.isLegalMUBUFImmOffset(Imm: Offset))
6458 return {};
6459 MachineInstr *BasePtrDef = getDefIgnoringCopies(Reg: BasePtr, MRI: *MRI);
6460 Register WaveBase = getWaveAddress(Def: BasePtrDef);
6461 if (!WaveBase)
6462 return {};
6463
6464 return {{
6465 [=](MachineInstrBuilder &MIB) { // rsrc
6466 MIB.addReg(RegNo: Info->getScratchRSrcReg());
6467 },
6468 [=](MachineInstrBuilder &MIB) { // soffset
6469 MIB.addReg(RegNo: WaveBase);
6470 },
6471 [=](MachineInstrBuilder &MIB) { MIB.addImm(Val: Offset); } // offset
6472 }};
6473 }
6474
6475 if (!mi_match(R: Root.getReg(), MRI: *MRI, P: m_ICst(Cst&: Offset)) ||
6476 !TII.isLegalMUBUFImmOffset(Imm: Offset))
6477 return {};
6478
6479 return {{
6480 [=](MachineInstrBuilder &MIB) { // rsrc
6481 MIB.addReg(RegNo: Info->getScratchRSrcReg());
6482 },
6483 [=](MachineInstrBuilder &MIB) { // soffset
6484 MIB.addImm(Val: 0);
6485 },
6486 [=](MachineInstrBuilder &MIB) { MIB.addImm(Val: Offset); } // offset
6487 }};
6488}
6489
6490std::pair<Register, unsigned>
6491AMDGPUInstructionSelector::selectDS1Addr1OffsetImpl(MachineOperand &Root) const {
6492 const MachineInstr *RootDef = MRI->getVRegDef(Reg: Root.getReg());
6493 int64_t ConstAddr = 0;
6494
6495 Register PtrBase;
6496 int64_t Offset;
6497 std::tie(args&: PtrBase, args&: Offset, args: std::ignore) =
6498 getPtrBaseWithConstantOffset(Root: Root.getReg(), MRI: *MRI);
6499
6500 if (Offset) {
6501 if (isDSOffsetLegal(Base: PtrBase, Offset)) {
6502 // (add n0, c0)
6503 return std::pair(PtrBase, Offset);
6504 }
6505 } else if (RootDef->getOpcode() == AMDGPU::G_SUB) {
6506 // TODO
6507
6508
6509 } else if (mi_match(R: Root.getReg(), MRI: *MRI, P: m_ICst(Cst&: ConstAddr))) {
6510 // TODO
6511
6512 }
6513
6514 return std::pair(Root.getReg(), 0);
6515}
6516
6517InstructionSelector::ComplexRendererFns
6518AMDGPUInstructionSelector::selectDS1Addr1Offset(MachineOperand &Root) const {
6519 Register Reg;
6520 unsigned Offset;
6521 std::tie(args&: Reg, args&: Offset) = selectDS1Addr1OffsetImpl(Root);
6522 return {{
6523 [=](MachineInstrBuilder &MIB) { MIB.addReg(RegNo: Reg); },
6524 [=](MachineInstrBuilder &MIB) { MIB.addImm(Val: Offset); }
6525 }};
6526}
6527
6528InstructionSelector::ComplexRendererFns
6529AMDGPUInstructionSelector::selectDS64Bit4ByteAligned(MachineOperand &Root) const {
6530 return selectDSReadWrite2(Root, size: 4);
6531}
6532
6533InstructionSelector::ComplexRendererFns
6534AMDGPUInstructionSelector::selectDS128Bit8ByteAligned(MachineOperand &Root) const {
6535 return selectDSReadWrite2(Root, size: 8);
6536}
6537
6538InstructionSelector::ComplexRendererFns
6539AMDGPUInstructionSelector::selectDSReadWrite2(MachineOperand &Root,
6540 unsigned Size) const {
6541 Register Reg;
6542 unsigned Offset;
6543 std::tie(args&: Reg, args&: Offset) = selectDSReadWrite2Impl(Root, size: Size);
6544 return {{
6545 [=](MachineInstrBuilder &MIB) { MIB.addReg(RegNo: Reg); },
6546 [=](MachineInstrBuilder &MIB) { MIB.addImm(Val: Offset); },
6547 [=](MachineInstrBuilder &MIB) { MIB.addImm(Val: Offset+1); }
6548 }};
6549}
6550
6551std::pair<Register, unsigned>
6552AMDGPUInstructionSelector::selectDSReadWrite2Impl(MachineOperand &Root,
6553 unsigned Size) const {
6554 const MachineInstr *RootDef = MRI->getVRegDef(Reg: Root.getReg());
6555 int64_t ConstAddr = 0;
6556
6557 Register PtrBase;
6558 int64_t Offset;
6559 std::tie(args&: PtrBase, args&: Offset, args: std::ignore) =
6560 getPtrBaseWithConstantOffset(Root: Root.getReg(), MRI: *MRI);
6561
6562 if (Offset) {
6563 int64_t OffsetValue0 = Offset;
6564 int64_t OffsetValue1 = Offset + Size;
6565 if (isDSOffset2Legal(Base: PtrBase, Offset0: OffsetValue0, Offset1: OffsetValue1, Size)) {
6566 // (add n0, c0)
6567 return std::pair(PtrBase, OffsetValue0 / Size);
6568 }
6569 } else if (RootDef->getOpcode() == AMDGPU::G_SUB) {
6570 // TODO
6571
6572 } else if (mi_match(R: Root.getReg(), MRI: *MRI, P: m_ICst(Cst&: ConstAddr))) {
6573 // TODO
6574
6575 }
6576
6577 return std::pair(Root.getReg(), 0);
6578}
6579
6580/// If \p Root is a G_PTR_ADD with a G_CONSTANT on the right hand side, return
6581/// the base value with the constant offset, and if the offset computation is
6582/// known to be inbounds. There may be intervening copies between \p Root and
6583/// the identified constant. Returns \p Root, 0, false if this does not match
6584/// the pattern.
6585std::tuple<Register, int64_t, bool>
6586AMDGPUInstructionSelector::getPtrBaseWithConstantOffset(
6587 Register Root, const MachineRegisterInfo &MRI) const {
6588 MachineInstr *RootI = getDefIgnoringCopies(Reg: Root, MRI);
6589 if (RootI->getOpcode() != TargetOpcode::G_PTR_ADD)
6590 return {Root, 0, false};
6591
6592 MachineOperand &RHS = RootI->getOperand(i: 2);
6593 std::optional<ValueAndVReg> MaybeOffset =
6594 getIConstantVRegValWithLookThrough(VReg: RHS.getReg(), MRI);
6595 if (!MaybeOffset)
6596 return {Root, 0, false};
6597 bool IsInBounds = RootI->getFlag(Flag: MachineInstr::MIFlag::InBounds);
6598 return {RootI->getOperand(i: 1).getReg(), MaybeOffset->Value.getSExtValue(),
6599 IsInBounds};
6600}
6601
6602static void addZeroImm(MachineInstrBuilder &MIB) {
6603 MIB.addImm(Val: 0);
6604}
6605
6606/// Return a resource descriptor for use with an arbitrary 64-bit pointer. If \p
6607/// BasePtr is not valid, a null base pointer will be used.
6608static Register buildRSRC(MachineIRBuilder &B, MachineRegisterInfo &MRI,
6609 uint32_t FormatLo, uint32_t FormatHi,
6610 Register BasePtr) {
6611 Register RSrc2 = MRI.createVirtualRegister(RegClass: &AMDGPU::SReg_32RegClass);
6612 Register RSrc3 = MRI.createVirtualRegister(RegClass: &AMDGPU::SReg_32RegClass);
6613 Register RSrcHi = MRI.createVirtualRegister(RegClass: &AMDGPU::SReg_64RegClass);
6614 Register RSrc = MRI.createVirtualRegister(RegClass: &AMDGPU::SGPR_128RegClass);
6615
6616 B.buildInstr(Opcode: AMDGPU::S_MOV_B32)
6617 .addDef(RegNo: RSrc2)
6618 .addImm(Val: FormatLo);
6619 B.buildInstr(Opcode: AMDGPU::S_MOV_B32)
6620 .addDef(RegNo: RSrc3)
6621 .addImm(Val: FormatHi);
6622
6623 // Build the half of the subregister with the constants before building the
6624 // full 128-bit register. If we are building multiple resource descriptors,
6625 // this will allow CSEing of the 2-component register.
6626 B.buildInstr(Opcode: AMDGPU::REG_SEQUENCE)
6627 .addDef(RegNo: RSrcHi)
6628 .addReg(RegNo: RSrc2)
6629 .addImm(Val: AMDGPU::sub0)
6630 .addReg(RegNo: RSrc3)
6631 .addImm(Val: AMDGPU::sub1);
6632
6633 Register RSrcLo = BasePtr;
6634 if (!BasePtr) {
6635 RSrcLo = MRI.createVirtualRegister(RegClass: &AMDGPU::SReg_64RegClass);
6636 B.buildInstr(Opcode: AMDGPU::S_MOV_B64)
6637 .addDef(RegNo: RSrcLo)
6638 .addImm(Val: 0);
6639 }
6640
6641 B.buildInstr(Opcode: AMDGPU::REG_SEQUENCE)
6642 .addDef(RegNo: RSrc)
6643 .addReg(RegNo: RSrcLo)
6644 .addImm(Val: AMDGPU::sub0_sub1)
6645 .addReg(RegNo: RSrcHi)
6646 .addImm(Val: AMDGPU::sub2_sub3);
6647
6648 return RSrc;
6649}
6650
6651static Register buildAddr64RSrc(MachineIRBuilder &B, MachineRegisterInfo &MRI,
6652 const SIInstrInfo &TII, Register BasePtr) {
6653 uint64_t DefaultFormat = TII.getDefaultRsrcDataFormat();
6654
6655 // FIXME: Why are half the "default" bits ignored based on the addressing
6656 // mode?
6657 return buildRSRC(B, MRI, FormatLo: 0, FormatHi: Hi_32(Value: DefaultFormat), BasePtr);
6658}
6659
6660static Register buildOffsetSrc(MachineIRBuilder &B, MachineRegisterInfo &MRI,
6661 const SIInstrInfo &TII, Register BasePtr) {
6662 uint64_t DefaultFormat = TII.getDefaultRsrcDataFormat();
6663
6664 // FIXME: Why are half the "default" bits ignored based on the addressing
6665 // mode?
6666 return buildRSRC(B, MRI, FormatLo: -1, FormatHi: Hi_32(Value: DefaultFormat), BasePtr);
6667}
6668
6669AMDGPUInstructionSelector::MUBUFAddressData
6670AMDGPUInstructionSelector::parseMUBUFAddress(Register Src) const {
6671 MUBUFAddressData Data;
6672 Data.N0 = Src;
6673
6674 Register PtrBase;
6675 int64_t Offset;
6676
6677 std::tie(args&: PtrBase, args&: Offset, args: std::ignore) =
6678 getPtrBaseWithConstantOffset(Root: Src, MRI: *MRI);
6679 if (isUInt<32>(x: Offset)) {
6680 Data.N0 = PtrBase;
6681 Data.Offset = Offset;
6682 }
6683
6684 if (MachineInstr *InputAdd
6685 = getOpcodeDef(Opcode: TargetOpcode::G_PTR_ADD, Reg: Data.N0, MRI: *MRI)) {
6686 Data.N2 = InputAdd->getOperand(i: 1).getReg();
6687 Data.N3 = InputAdd->getOperand(i: 2).getReg();
6688
6689 // FIXME: Need to fix extra SGPR->VGPRcopies inserted
6690 // FIXME: Don't know this was defined by operand 0
6691 //
6692 // TODO: Remove this when we have copy folding optimizations after
6693 // RegBankSelect.
6694 Data.N2 = getDefIgnoringCopies(Reg: Data.N2, MRI: *MRI)->getOperand(i: 0).getReg();
6695 Data.N3 = getDefIgnoringCopies(Reg: Data.N3, MRI: *MRI)->getOperand(i: 0).getReg();
6696 }
6697
6698 return Data;
6699}
6700
6701/// Return if the addr64 mubuf mode should be used for the given address.
6702bool AMDGPUInstructionSelector::shouldUseAddr64(MUBUFAddressData Addr) const {
6703 // (ptr_add N2, N3) -> addr64, or
6704 // (ptr_add (ptr_add N2, N3), C1) -> addr64
6705 if (Addr.N2)
6706 return true;
6707
6708 const RegisterBank *N0Bank = RBI.getRegBank(Reg: Addr.N0, MRI: *MRI, TRI);
6709 return N0Bank->getID() == AMDGPU::VGPRRegBankID;
6710}
6711
6712/// Split an immediate offset \p ImmOffset depending on whether it fits in the
6713/// immediate field. Modifies \p ImmOffset and sets \p SOffset to the variable
6714/// component.
6715void AMDGPUInstructionSelector::splitIllegalMUBUFOffset(
6716 MachineIRBuilder &B, Register &SOffset, int64_t &ImmOffset) const {
6717 if (TII.isLegalMUBUFImmOffset(Imm: ImmOffset))
6718 return;
6719
6720 // Illegal offset, store it in soffset.
6721 SOffset = MRI->createVirtualRegister(RegClass: &AMDGPU::SReg_32RegClass);
6722 B.buildInstr(Opcode: AMDGPU::S_MOV_B32)
6723 .addDef(RegNo: SOffset)
6724 .addImm(Val: ImmOffset);
6725 ImmOffset = 0;
6726}
6727
6728bool AMDGPUInstructionSelector::selectMUBUFAddr64Impl(
6729 MachineOperand &Root, Register &VAddr, Register &RSrcReg,
6730 Register &SOffset, int64_t &Offset) const {
6731 // FIXME: Predicates should stop this from reaching here.
6732 // addr64 bit was removed for volcanic islands.
6733 if (!STI.hasAddr64() || STI.useFlatForGlobal())
6734 return false;
6735
6736 MUBUFAddressData AddrData = parseMUBUFAddress(Src: Root.getReg());
6737 if (!shouldUseAddr64(Addr: AddrData))
6738 return false;
6739
6740 Register N0 = AddrData.N0;
6741 Register N2 = AddrData.N2;
6742 Register N3 = AddrData.N3;
6743 Offset = AddrData.Offset;
6744
6745 // Base pointer for the SRD.
6746 Register SRDPtr;
6747
6748 if (N2) {
6749 if (RBI.getRegBank(Reg: N2, MRI: *MRI, TRI)->getID() == AMDGPU::VGPRRegBankID) {
6750 assert(N3);
6751 if (RBI.getRegBank(Reg: N3, MRI: *MRI, TRI)->getID() == AMDGPU::VGPRRegBankID) {
6752 // Both N2 and N3 are divergent. Use N0 (the result of the add) as the
6753 // addr64, and construct the default resource from a 0 address.
6754 VAddr = N0;
6755 } else {
6756 SRDPtr = N3;
6757 VAddr = N2;
6758 }
6759 } else {
6760 // N2 is not divergent.
6761 SRDPtr = N2;
6762 VAddr = N3;
6763 }
6764 } else if (RBI.getRegBank(Reg: N0, MRI: *MRI, TRI)->getID() == AMDGPU::VGPRRegBankID) {
6765 // Use the default null pointer in the resource
6766 VAddr = N0;
6767 } else {
6768 // N0 -> offset, or
6769 // (N0 + C1) -> offset
6770 SRDPtr = N0;
6771 }
6772
6773 MachineIRBuilder B(*Root.getParent());
6774 RSrcReg = buildAddr64RSrc(B, MRI&: *MRI, TII, BasePtr: SRDPtr);
6775 splitIllegalMUBUFOffset(B, SOffset, ImmOffset&: Offset);
6776 return true;
6777}
6778
6779bool AMDGPUInstructionSelector::selectMUBUFOffsetImpl(
6780 MachineOperand &Root, Register &RSrcReg, Register &SOffset,
6781 int64_t &Offset) const {
6782
6783 // FIXME: Pattern should not reach here.
6784 if (STI.useFlatForGlobal())
6785 return false;
6786
6787 MUBUFAddressData AddrData = parseMUBUFAddress(Src: Root.getReg());
6788 if (shouldUseAddr64(Addr: AddrData))
6789 return false;
6790
6791 // N0 -> offset, or
6792 // (N0 + C1) -> offset
6793 Register SRDPtr = AddrData.N0;
6794 Offset = AddrData.Offset;
6795
6796 // TODO: Look through extensions for 32-bit soffset.
6797 MachineIRBuilder B(*Root.getParent());
6798
6799 RSrcReg = buildOffsetSrc(B, MRI&: *MRI, TII, BasePtr: SRDPtr);
6800 splitIllegalMUBUFOffset(B, SOffset, ImmOffset&: Offset);
6801 return true;
6802}
6803
6804InstructionSelector::ComplexRendererFns
6805AMDGPUInstructionSelector::selectMUBUFAddr64(MachineOperand &Root) const {
6806 Register VAddr;
6807 Register RSrcReg;
6808 Register SOffset;
6809 int64_t Offset = 0;
6810
6811 if (!selectMUBUFAddr64Impl(Root, VAddr, RSrcReg, SOffset, Offset))
6812 return {};
6813
6814 // FIXME: Use defaulted operands for trailing 0s and remove from the complex
6815 // pattern.
6816 return {{
6817 [=](MachineInstrBuilder &MIB) { // rsrc
6818 MIB.addReg(RegNo: RSrcReg);
6819 },
6820 [=](MachineInstrBuilder &MIB) { // vaddr
6821 MIB.addReg(RegNo: VAddr);
6822 },
6823 [=](MachineInstrBuilder &MIB) { // soffset
6824 if (SOffset)
6825 MIB.addReg(RegNo: SOffset);
6826 else if (STI.hasRestrictedSOffset())
6827 MIB.addReg(RegNo: AMDGPU::SGPR_NULL);
6828 else
6829 MIB.addImm(Val: 0);
6830 },
6831 [=](MachineInstrBuilder &MIB) { // offset
6832 MIB.addImm(Val: Offset);
6833 },
6834 addZeroImm, // cpol
6835 addZeroImm, // tfe
6836 addZeroImm // swz
6837 }};
6838}
6839
6840InstructionSelector::ComplexRendererFns
6841AMDGPUInstructionSelector::selectMUBUFOffset(MachineOperand &Root) const {
6842 Register RSrcReg;
6843 Register SOffset;
6844 int64_t Offset = 0;
6845
6846 if (!selectMUBUFOffsetImpl(Root, RSrcReg, SOffset, Offset))
6847 return {};
6848
6849 return {{
6850 [=](MachineInstrBuilder &MIB) { // rsrc
6851 MIB.addReg(RegNo: RSrcReg);
6852 },
6853 [=](MachineInstrBuilder &MIB) { // soffset
6854 if (SOffset)
6855 MIB.addReg(RegNo: SOffset);
6856 else if (STI.hasRestrictedSOffset())
6857 MIB.addReg(RegNo: AMDGPU::SGPR_NULL);
6858 else
6859 MIB.addImm(Val: 0);
6860 },
6861 [=](MachineInstrBuilder &MIB) { MIB.addImm(Val: Offset); }, // offset
6862 addZeroImm, // cpol
6863 addZeroImm, // tfe
6864 addZeroImm, // swz
6865 }};
6866}
6867
6868InstructionSelector::ComplexRendererFns
6869AMDGPUInstructionSelector::selectBUFSOffset(MachineOperand &Root) const {
6870
6871 Register SOffset = Root.getReg();
6872
6873 if (STI.hasRestrictedSOffset() && mi_match(R: SOffset, MRI: *MRI, P: m_ZeroInt()))
6874 SOffset = AMDGPU::SGPR_NULL;
6875
6876 return {{[=](MachineInstrBuilder &MIB) { MIB.addReg(RegNo: SOffset); }}};
6877}
6878
6879/// Get an immediate that must be 32-bits, and treated as zero extended.
6880static std::optional<uint64_t>
6881getConstantZext32Val(Register Reg, const MachineRegisterInfo &MRI) {
6882 // getIConstantVRegVal sexts any values, so see if that matters.
6883 std::optional<int64_t> OffsetVal = getIConstantVRegSExtVal(VReg: Reg, MRI);
6884 if (!OffsetVal || !isInt<32>(x: *OffsetVal))
6885 return std::nullopt;
6886 return Lo_32(Value: *OffsetVal);
6887}
6888
6889InstructionSelector::ComplexRendererFns
6890AMDGPUInstructionSelector::selectSMRDBufferImm(MachineOperand &Root) const {
6891 std::optional<uint64_t> OffsetVal =
6892 Root.isImm() ? Root.getImm() : getConstantZext32Val(Reg: Root.getReg(), MRI: *MRI);
6893 if (!OffsetVal)
6894 return {};
6895
6896 std::optional<int64_t> EncodedImm =
6897 AMDGPU::getSMRDEncodedOffset(ST: STI, ByteOffset: *OffsetVal, IsBuffer: true);
6898 if (!EncodedImm)
6899 return {};
6900
6901 return {{ [=](MachineInstrBuilder &MIB) { MIB.addImm(Val: *EncodedImm); } }};
6902}
6903
6904InstructionSelector::ComplexRendererFns
6905AMDGPUInstructionSelector::selectSMRDBufferImm32(MachineOperand &Root) const {
6906 assert(STI.getGeneration() == AMDGPUSubtarget::SEA_ISLANDS);
6907
6908 std::optional<uint64_t> OffsetVal = getConstantZext32Val(Reg: Root.getReg(), MRI: *MRI);
6909 if (!OffsetVal)
6910 return {};
6911
6912 std::optional<int64_t> EncodedImm =
6913 AMDGPU::getSMRDEncodedLiteralOffset32(ST: STI, ByteOffset: *OffsetVal);
6914 if (!EncodedImm)
6915 return {};
6916
6917 return {{ [=](MachineInstrBuilder &MIB) { MIB.addImm(Val: *EncodedImm); } }};
6918}
6919
6920InstructionSelector::ComplexRendererFns
6921AMDGPUInstructionSelector::selectSMRDBufferSgprImm(MachineOperand &Root) const {
6922 // Match the (soffset + offset) pair as a 32-bit register base and
6923 // an immediate offset.
6924 Register SOffset;
6925 unsigned Offset;
6926 std::tie(args&: SOffset, args&: Offset) = AMDGPU::getBaseWithConstantOffset(
6927 MRI&: *MRI, Reg: Root.getReg(), ValueTracking: VT, /*CheckNUW*/ true);
6928 if (!SOffset)
6929 return std::nullopt;
6930
6931 std::optional<int64_t> EncodedOffset =
6932 AMDGPU::getSMRDEncodedOffset(ST: STI, ByteOffset: Offset, /* IsBuffer */ true);
6933 if (!EncodedOffset)
6934 return std::nullopt;
6935
6936 assert(MRI->getType(SOffset) == LLT::scalar(32));
6937 return {{[=](MachineInstrBuilder &MIB) { MIB.addReg(RegNo: SOffset); },
6938 [=](MachineInstrBuilder &MIB) { MIB.addImm(Val: *EncodedOffset); }}};
6939}
6940
6941std::pair<Register, unsigned>
6942AMDGPUInstructionSelector::selectVOP3PMadMixModsImpl(MachineOperand &Root,
6943 bool &Matched) const {
6944 Matched = false;
6945
6946 Register Src;
6947 unsigned Mods;
6948 std::tie(args&: Src, args&: Mods) = selectVOP3ModsImpl(Src: Root.getReg());
6949
6950 if (mi_match(R: Src, MRI: *MRI, P: m_GFPExt(Src: m_Reg(R&: Src)))) {
6951 assert(MRI->getType(Src) == LLT::scalar(16));
6952
6953 // Only change Src if src modifier could be gained. In such cases new Src
6954 // could be sgpr but this does not violate constant bus restriction for
6955 // instruction that is being selected.
6956 Src = stripBitCast(Reg: Src, MRI&: *MRI);
6957
6958 const auto CheckAbsNeg = [&]() {
6959 // Be careful about folding modifiers if we already have an abs. fneg is
6960 // applied last, so we don't want to apply an earlier fneg.
6961 if ((Mods & SISrcMods::ABS) == 0) {
6962 unsigned ModsTmp;
6963 std::tie(args&: Src, args&: ModsTmp) = selectVOP3ModsImpl(Src);
6964
6965 if ((ModsTmp & SISrcMods::NEG) != 0)
6966 Mods ^= SISrcMods::NEG;
6967
6968 if ((ModsTmp & SISrcMods::ABS) != 0)
6969 Mods |= SISrcMods::ABS;
6970 }
6971 };
6972
6973 CheckAbsNeg();
6974
6975 // op_sel/op_sel_hi decide the source type and source.
6976 // If the source's op_sel_hi is set, it indicates to do a conversion from
6977 // fp16. If the sources's op_sel is set, it picks the high half of the
6978 // source register.
6979
6980 Mods |= SISrcMods::OP_SEL_1;
6981
6982 if (isExtractHiElt(MRI&: *MRI, In: Src, Out&: Src)) {
6983 Mods |= SISrcMods::OP_SEL_0;
6984 CheckAbsNeg();
6985 }
6986
6987 Matched = true;
6988 }
6989
6990 return {Src, Mods};
6991}
6992
6993InstructionSelector::ComplexRendererFns
6994AMDGPUInstructionSelector::selectVOP3PMadMixModsExt(
6995 MachineOperand &Root) const {
6996 Register Src;
6997 unsigned Mods;
6998 bool Matched;
6999 std::tie(args&: Src, args&: Mods) = selectVOP3PMadMixModsImpl(Root, Matched);
7000 if (!Matched)
7001 return {};
7002
7003 return {{
7004 [=](MachineInstrBuilder &MIB) { MIB.addReg(RegNo: Src); },
7005 [=](MachineInstrBuilder &MIB) { MIB.addImm(Val: Mods); } // src_mods
7006 }};
7007}
7008
7009InstructionSelector::ComplexRendererFns
7010AMDGPUInstructionSelector::selectVOP3PMadMixMods(MachineOperand &Root) const {
7011 Register Src;
7012 unsigned Mods;
7013 bool Matched;
7014 std::tie(args&: Src, args&: Mods) = selectVOP3PMadMixModsImpl(Root, Matched);
7015
7016 return {{
7017 [=](MachineInstrBuilder &MIB) { MIB.addReg(RegNo: Src); },
7018 [=](MachineInstrBuilder &MIB) { MIB.addImm(Val: Mods); } // src_mods
7019 }};
7020}
7021
7022bool AMDGPUInstructionSelector::selectSBarrierSignalIsfirst(
7023 MachineInstr &I, Intrinsic::ID IntrID) const {
7024 MachineBasicBlock *MBB = I.getParent();
7025 const DebugLoc &DL = I.getDebugLoc();
7026 Register CCReg = I.getOperand(i: 0).getReg();
7027
7028 // Set SCC to true, in case the barrier instruction gets converted to a NOP.
7029 BuildMI(BB&: *MBB, I: &I, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::S_CMP_EQ_U32)).addImm(Val: 0).addImm(Val: 0);
7030
7031 BuildMI(BB&: *MBB, I: &I, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::S_BARRIER_SIGNAL_ISFIRST_IMM))
7032 .addImm(Val: I.getOperand(i: 2).getImm());
7033
7034 BuildMI(BB&: *MBB, I: &I, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::COPY), DestReg: CCReg).addReg(RegNo: AMDGPU::SCC);
7035
7036 I.eraseFromParent();
7037 return RBI.constrainGenericRegister(Reg: CCReg, RC: AMDGPU::SReg_32_XM0_XEXECRegClass,
7038 MRI&: *MRI);
7039}
7040
7041bool AMDGPUInstructionSelector::selectSGetBarrierState(
7042 MachineInstr &I, Intrinsic::ID IntrID) const {
7043 MachineBasicBlock *MBB = I.getParent();
7044 const DebugLoc &DL = I.getDebugLoc();
7045 const MachineOperand &BarOp = I.getOperand(i: 2);
7046 std::optional<int64_t> BarValImm =
7047 getIConstantVRegSExtVal(VReg: BarOp.getReg(), MRI: *MRI);
7048
7049 if (!BarValImm) {
7050 auto CopyMIB = BuildMI(BB&: *MBB, I: &I, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::COPY), DestReg: AMDGPU::M0)
7051 .addReg(RegNo: BarOp.getReg());
7052 constrainSelectedInstRegOperands(I&: *CopyMIB, TII, TRI, RBI);
7053 }
7054 MachineInstrBuilder MIB;
7055 unsigned Opc = BarValImm ? AMDGPU::S_GET_BARRIER_STATE_IMM
7056 : AMDGPU::S_GET_BARRIER_STATE_M0;
7057 MIB = BuildMI(BB&: *MBB, I: &I, MIMD: DL, MCID: TII.get(Opcode: Opc));
7058
7059 auto DstReg = I.getOperand(i: 0).getReg();
7060 const TargetRegisterClass *DstRC =
7061 TRI.getConstrainedRegClassForOperand(MO: I.getOperand(i: 0), MRI: *MRI);
7062 if (!DstRC || !RBI.constrainGenericRegister(Reg: DstReg, RC: *DstRC, MRI&: *MRI))
7063 return false;
7064 MIB.addDef(RegNo: DstReg);
7065 if (BarValImm) {
7066 MIB.addImm(Val: *BarValImm);
7067 }
7068 I.eraseFromParent();
7069 return true;
7070}
7071
7072unsigned getNamedBarrierOp(bool HasInlineConst, Intrinsic::ID IntrID) {
7073 if (HasInlineConst) {
7074 switch (IntrID) {
7075 default:
7076 llvm_unreachable("not a named barrier op");
7077 case Intrinsic::amdgcn_s_barrier_join:
7078 return AMDGPU::S_BARRIER_JOIN_IMM;
7079 case Intrinsic::amdgcn_s_wakeup_barrier:
7080 return AMDGPU::S_WAKEUP_BARRIER_IMM;
7081 case Intrinsic::amdgcn_s_get_named_barrier_state:
7082 return AMDGPU::S_GET_BARRIER_STATE_IMM;
7083 };
7084 } else {
7085 switch (IntrID) {
7086 default:
7087 llvm_unreachable("not a named barrier op");
7088 case Intrinsic::amdgcn_s_barrier_join:
7089 return AMDGPU::S_BARRIER_JOIN_M0;
7090 case Intrinsic::amdgcn_s_wakeup_barrier:
7091 return AMDGPU::S_WAKEUP_BARRIER_M0;
7092 case Intrinsic::amdgcn_s_get_named_barrier_state:
7093 return AMDGPU::S_GET_BARRIER_STATE_M0;
7094 };
7095 }
7096}
7097
7098bool AMDGPUInstructionSelector::selectNamedBarrierInit(
7099 MachineInstr &I, Intrinsic::ID IntrID) const {
7100 MachineBasicBlock *MBB = I.getParent();
7101 const DebugLoc &DL = I.getDebugLoc();
7102 const MachineOperand &BarOp = I.getOperand(i: 1);
7103 const MachineOperand &CntOp = I.getOperand(i: 2);
7104
7105 // BarID = (BarOp >> 4) & 0x3F
7106 Register TmpReg0 = MRI->createVirtualRegister(RegClass: &AMDGPU::SReg_32RegClass);
7107 BuildMI(BB&: *MBB, I: &I, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::S_LSHR_B32), DestReg: TmpReg0)
7108 .add(MO: BarOp)
7109 .addImm(Val: 4u)
7110 .setOperandDead(3); // Dead scc
7111
7112 Register TmpReg1 = MRI->createVirtualRegister(RegClass: &AMDGPU::SReg_32RegClass);
7113 BuildMI(BB&: *MBB, I: &I, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::S_AND_B32), DestReg: TmpReg1)
7114 .addReg(RegNo: TmpReg0)
7115 .addImm(Val: 0x3F)
7116 .setOperandDead(3); // Dead scc
7117
7118 // MO = ((CntOp & 0x3F) << shAmt) | BarID
7119 Register TmpReg2 = MRI->createVirtualRegister(RegClass: &AMDGPU::SReg_32RegClass);
7120 BuildMI(BB&: *MBB, I: &I, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::S_AND_B32), DestReg: TmpReg2)
7121 .add(MO: CntOp)
7122 .addImm(Val: 0x3F)
7123 .setOperandDead(3); // Dead scc
7124
7125 Register TmpReg3 = MRI->createVirtualRegister(RegClass: &AMDGPU::SReg_32RegClass);
7126 constexpr unsigned ShAmt = 16;
7127 BuildMI(BB&: *MBB, I: &I, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::S_LSHL_B32), DestReg: TmpReg3)
7128 .addReg(RegNo: TmpReg2)
7129 .addImm(Val: ShAmt)
7130 .setOperandDead(3); // Dead scc
7131
7132 Register TmpReg4 = MRI->createVirtualRegister(RegClass: &AMDGPU::SReg_32RegClass);
7133 BuildMI(BB&: *MBB, I: &I, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::S_OR_B32), DestReg: TmpReg4)
7134 .addReg(RegNo: TmpReg1)
7135 .addReg(RegNo: TmpReg3)
7136 .setOperandDead(3); // Dead scc;
7137
7138 auto CopyMIB =
7139 BuildMI(BB&: *MBB, I: &I, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::COPY), DestReg: AMDGPU::M0).addReg(RegNo: TmpReg4);
7140 constrainSelectedInstRegOperands(I&: *CopyMIB, TII, TRI, RBI);
7141
7142 unsigned Opc = IntrID == Intrinsic::amdgcn_s_barrier_init
7143 ? AMDGPU::S_BARRIER_INIT_M0
7144 : AMDGPU::S_BARRIER_SIGNAL_M0;
7145 MachineInstrBuilder MIB;
7146 MIB = BuildMI(BB&: *MBB, I: &I, MIMD: DL, MCID: TII.get(Opcode: Opc));
7147
7148 I.eraseFromParent();
7149 return true;
7150}
7151
7152bool AMDGPUInstructionSelector::selectNamedBarrierInst(
7153 MachineInstr &I, Intrinsic::ID IntrID) const {
7154 MachineBasicBlock *MBB = I.getParent();
7155 const DebugLoc &DL = I.getDebugLoc();
7156 MachineOperand BarOp = IntrID == Intrinsic::amdgcn_s_get_named_barrier_state
7157 ? I.getOperand(i: 2)
7158 : I.getOperand(i: 1);
7159 std::optional<int64_t> BarValImm =
7160 getIConstantVRegSExtVal(VReg: BarOp.getReg(), MRI: *MRI);
7161
7162 if (!BarValImm) {
7163 // BarID = (BarOp >> 4) & 0x3F
7164 Register TmpReg0 = MRI->createVirtualRegister(RegClass: &AMDGPU::SReg_32RegClass);
7165 BuildMI(BB&: *MBB, I: &I, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::S_LSHR_B32), DestReg: TmpReg0)
7166 .addReg(RegNo: BarOp.getReg())
7167 .addImm(Val: 4u)
7168 .setOperandDead(3); // Dead scc;
7169
7170 Register TmpReg1 = MRI->createVirtualRegister(RegClass: &AMDGPU::SReg_32RegClass);
7171 BuildMI(BB&: *MBB, I: &I, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::S_AND_B32), DestReg: TmpReg1)
7172 .addReg(RegNo: TmpReg0)
7173 .addImm(Val: 0x3F)
7174 .setOperandDead(3); // Dead scc;
7175
7176 auto CopyMIB = BuildMI(BB&: *MBB, I: &I, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::COPY), DestReg: AMDGPU::M0)
7177 .addReg(RegNo: TmpReg1);
7178 constrainSelectedInstRegOperands(I&: *CopyMIB, TII, TRI, RBI);
7179 }
7180
7181 MachineInstrBuilder MIB;
7182 unsigned Opc = getNamedBarrierOp(HasInlineConst: BarValImm.has_value(), IntrID);
7183 MIB = BuildMI(BB&: *MBB, I: &I, MIMD: DL, MCID: TII.get(Opcode: Opc));
7184
7185 if (IntrID == Intrinsic::amdgcn_s_get_named_barrier_state) {
7186 auto DstReg = I.getOperand(i: 0).getReg();
7187 const TargetRegisterClass *DstRC =
7188 TRI.getConstrainedRegClassForOperand(MO: I.getOperand(i: 0), MRI: *MRI);
7189 if (!DstRC || !RBI.constrainGenericRegister(Reg: DstReg, RC: *DstRC, MRI&: *MRI))
7190 return false;
7191 MIB.addDef(RegNo: DstReg);
7192 }
7193
7194 if (BarValImm) {
7195 auto BarId = ((*BarValImm) >> 4) & 0x3F;
7196 MIB.addImm(Val: BarId);
7197 }
7198
7199 I.eraseFromParent();
7200 return true;
7201}
7202
7203void AMDGPUInstructionSelector::renderTruncImm32(MachineInstrBuilder &MIB,
7204 const MachineInstr &MI,
7205 int OpIdx) const {
7206 assert(MI.getOpcode() == TargetOpcode::G_CONSTANT && OpIdx == -1 &&
7207 "Expected G_CONSTANT");
7208 MIB.addImm(Val: MI.getOperand(i: 1).getCImm()->getSExtValue());
7209}
7210
7211void AMDGPUInstructionSelector::renderNegateImm(MachineInstrBuilder &MIB,
7212 const MachineInstr &MI,
7213 int OpIdx) const {
7214 assert(MI.getOpcode() == TargetOpcode::G_CONSTANT && OpIdx == -1 &&
7215 "Expected G_CONSTANT");
7216 MIB.addImm(Val: -MI.getOperand(i: 1).getCImm()->getSExtValue());
7217}
7218
7219void AMDGPUInstructionSelector::renderBitcastFPImm(MachineInstrBuilder &MIB,
7220 const MachineInstr &MI,
7221 int OpIdx) const {
7222 const MachineOperand &Op = MI.getOperand(i: 1);
7223 assert(MI.getOpcode() == TargetOpcode::G_FCONSTANT && OpIdx == -1);
7224 MIB.addImm(Val: Op.getFPImm()->getValueAPF().bitcastToAPInt().getZExtValue());
7225}
7226
7227void AMDGPUInstructionSelector::renderPopcntImm(MachineInstrBuilder &MIB,
7228 const MachineInstr &MI,
7229 int OpIdx) const {
7230 assert(MI.getOpcode() == TargetOpcode::G_CONSTANT && OpIdx == -1 &&
7231 "Expected G_CONSTANT");
7232 MIB.addImm(Val: MI.getOperand(i: 1).getCImm()->getValue().popcount());
7233}
7234
7235/// This only really exists to satisfy DAG type checking machinery, so is a
7236/// no-op here.
7237void AMDGPUInstructionSelector::renderTruncTImm(MachineInstrBuilder &MIB,
7238 const MachineInstr &MI,
7239 int OpIdx) const {
7240 const MachineOperand &Op = MI.getOperand(i: OpIdx);
7241 int64_t Imm;
7242 if (Op.isReg() && mi_match(R: Op.getReg(), MRI: *MRI, P: m_ICst(Cst&: Imm)))
7243 MIB.addImm(Val: Imm);
7244 else
7245 MIB.addImm(Val: Op.getImm());
7246}
7247
7248void AMDGPUInstructionSelector::renderZextBoolTImm(MachineInstrBuilder &MIB,
7249 const MachineInstr &MI,
7250 int OpIdx) const {
7251 MIB.addImm(Val: MI.getOperand(i: OpIdx).getImm() != 0);
7252}
7253
7254void AMDGPUInstructionSelector::renderOpSelTImm(MachineInstrBuilder &MIB,
7255 const MachineInstr &MI,
7256 int OpIdx) const {
7257 assert(OpIdx >= 0 && "expected to match an immediate operand");
7258 MIB.addImm(Val: MI.getOperand(i: OpIdx).getImm() ? (int64_t)SISrcMods::OP_SEL_0 : 0);
7259}
7260
7261void AMDGPUInstructionSelector::renderSrcAndDstSelToOpSelXForm_0_0(
7262 MachineInstrBuilder &MIB, const MachineInstr &MI, int OpIdx) const {
7263 assert(OpIdx >= 0 && "expected to match an immediate operand");
7264 MIB.addImm(
7265 Val: (MI.getOperand(i: OpIdx).getImm() & 0x1) ? (int64_t)SISrcMods::OP_SEL_0 : 0);
7266}
7267
7268void AMDGPUInstructionSelector::renderSrcAndDstSelToOpSelXForm_0_1(
7269 MachineInstrBuilder &MIB, const MachineInstr &MI, int OpIdx) const {
7270 assert(OpIdx >= 0 && "expected to match an immediate operand");
7271 MIB.addImm(Val: (MI.getOperand(i: OpIdx).getImm() & 0x1)
7272 ? (int64_t)(SISrcMods::OP_SEL_0 | SISrcMods::DST_OP_SEL)
7273 : (int64_t)SISrcMods::DST_OP_SEL);
7274}
7275
7276void AMDGPUInstructionSelector::renderSrcAndDstSelToOpSelXForm_1_0(
7277 MachineInstrBuilder &MIB, const MachineInstr &MI, int OpIdx) const {
7278 assert(OpIdx >= 0 && "expected to match an immediate operand");
7279 MIB.addImm(
7280 Val: (MI.getOperand(i: OpIdx).getImm() & 0x2) ? (int64_t)SISrcMods::OP_SEL_0 : 0);
7281}
7282
7283void AMDGPUInstructionSelector::renderSrcAndDstSelToOpSelXForm_1_1(
7284 MachineInstrBuilder &MIB, const MachineInstr &MI, int OpIdx) const {
7285 assert(OpIdx >= 0 && "expected to match an immediate operand");
7286 MIB.addImm(Val: (MI.getOperand(i: OpIdx).getImm() & 0x2)
7287 ? (int64_t)(SISrcMods::OP_SEL_0)
7288 : 0);
7289}
7290
7291void AMDGPUInstructionSelector::renderDstSelToOpSelXForm(
7292 MachineInstrBuilder &MIB, const MachineInstr &MI, int OpIdx) const {
7293 assert(OpIdx >= 0 && "expected to match an immediate operand");
7294 MIB.addImm(Val: MI.getOperand(i: OpIdx).getImm() ? (int64_t)(SISrcMods::DST_OP_SEL)
7295 : 0);
7296}
7297
7298void AMDGPUInstructionSelector::renderSrcSelToOpSelXForm(
7299 MachineInstrBuilder &MIB, const MachineInstr &MI, int OpIdx) const {
7300 assert(OpIdx >= 0 && "expected to match an immediate operand");
7301 MIB.addImm(Val: MI.getOperand(i: OpIdx).getImm() ? (int64_t)(SISrcMods::OP_SEL_0)
7302 : 0);
7303}
7304
7305void AMDGPUInstructionSelector::renderSrcAndDstSelToOpSelXForm_2_0(
7306 MachineInstrBuilder &MIB, const MachineInstr &MI, int OpIdx) const {
7307 assert(OpIdx >= 0 && "expected to match an immediate operand");
7308 MIB.addImm(
7309 Val: (MI.getOperand(i: OpIdx).getImm() & 0x1) ? (int64_t)SISrcMods::OP_SEL_0 : 0);
7310}
7311
7312void AMDGPUInstructionSelector::renderDstSelToOpSel3XFormXForm(
7313 MachineInstrBuilder &MIB, const MachineInstr &MI, int OpIdx) const {
7314 assert(OpIdx >= 0 && "expected to match an immediate operand");
7315 MIB.addImm(Val: (MI.getOperand(i: OpIdx).getImm() & 0x2)
7316 ? (int64_t)SISrcMods::DST_OP_SEL
7317 : 0);
7318}
7319
7320void AMDGPUInstructionSelector::renderExtractCPol(MachineInstrBuilder &MIB,
7321 const MachineInstr &MI,
7322 int OpIdx) const {
7323 assert(OpIdx >= 0 && "expected to match an immediate operand");
7324 MIB.addImm(Val: MI.getOperand(i: OpIdx).getImm() &
7325 (AMDGPU::isGFX12Plus(STI) ? AMDGPU::CPol::ALL
7326 : AMDGPU::CPol::ALL_pregfx12));
7327}
7328
7329void AMDGPUInstructionSelector::renderExtractSWZ(MachineInstrBuilder &MIB,
7330 const MachineInstr &MI,
7331 int OpIdx) const {
7332 assert(OpIdx >= 0 && "expected to match an immediate operand");
7333 const bool Swizzle = MI.getOperand(i: OpIdx).getImm() &
7334 (AMDGPU::isGFX12Plus(STI) ? AMDGPU::CPol::SWZ
7335 : AMDGPU::CPol::SWZ_pregfx12);
7336 MIB.addImm(Val: Swizzle);
7337}
7338
7339void AMDGPUInstructionSelector::renderExtractCpolSetGLC(
7340 MachineInstrBuilder &MIB, const MachineInstr &MI, int OpIdx) const {
7341 assert(OpIdx >= 0 && "expected to match an immediate operand");
7342 const uint32_t Cpol = MI.getOperand(i: OpIdx).getImm() &
7343 (AMDGPU::isGFX12Plus(STI) ? AMDGPU::CPol::ALL
7344 : AMDGPU::CPol::ALL_pregfx12);
7345 MIB.addImm(Val: Cpol | AMDGPU::CPol::GLC);
7346}
7347
7348void AMDGPUInstructionSelector::renderFrameIndex(MachineInstrBuilder &MIB,
7349 const MachineInstr &MI,
7350 int OpIdx) const {
7351 MIB.addFrameIndex(Idx: MI.getOperand(i: 1).getIndex());
7352}
7353
7354void AMDGPUInstructionSelector::renderFPPow2ToExponent(MachineInstrBuilder &MIB,
7355 const MachineInstr &MI,
7356 int OpIdx) const {
7357 const APFloat &APF = MI.getOperand(i: 1).getFPImm()->getValueAPF();
7358 int ExpVal = APF.getExactLog2Abs();
7359 assert(ExpVal != INT_MIN);
7360 MIB.addImm(Val: ExpVal);
7361}
7362
7363void AMDGPUInstructionSelector::renderRoundMode(MachineInstrBuilder &MIB,
7364 const MachineInstr &MI,
7365 int OpIdx) const {
7366 // "round.towardzero" -> TowardZero 0 -> FP_ROUND_ROUND_TO_ZERO 3
7367 // "round.tonearest" -> NearestTiesToEven 1 -> FP_ROUND_ROUND_TO_NEAREST 0
7368 // "round.upward" -> TowardPositive 2 -> FP_ROUND_ROUND_TO_INF 1
7369 // "round.downward -> TowardNegative 3 -> FP_ROUND_ROUND_TO_NEGINF 2
7370 MIB.addImm(Val: (MI.getOperand(i: OpIdx).getImm() + 3) % 4);
7371}
7372
7373void AMDGPUInstructionSelector::renderVOP3PModsNeg(MachineInstrBuilder &MIB,
7374 const MachineInstr &MI,
7375 int OpIdx) const {
7376 unsigned Mods = SISrcMods::OP_SEL_1;
7377 if (MI.getOperand(i: OpIdx).getImm())
7378 Mods ^= SISrcMods::NEG;
7379 MIB.addImm(Val: (int64_t)Mods);
7380}
7381
7382void AMDGPUInstructionSelector::renderVOP3PModsNegs(MachineInstrBuilder &MIB,
7383 const MachineInstr &MI,
7384 int OpIdx) const {
7385 unsigned Mods = SISrcMods::OP_SEL_1;
7386 if (MI.getOperand(i: OpIdx).getImm())
7387 Mods ^= (SISrcMods::NEG | SISrcMods::NEG_HI);
7388 MIB.addImm(Val: (int64_t)Mods);
7389}
7390
7391void AMDGPUInstructionSelector::renderVOP3PModsNegAbs(MachineInstrBuilder &MIB,
7392 const MachineInstr &MI,
7393 int OpIdx) const {
7394 unsigned Val = MI.getOperand(i: OpIdx).getImm();
7395 unsigned Mods = SISrcMods::OP_SEL_1; // default: none
7396 if (Val == 1) // neg
7397 Mods ^= SISrcMods::NEG;
7398 if (Val == 2) // abs
7399 Mods ^= SISrcMods::ABS;
7400 if (Val == 3) // neg and abs
7401 Mods ^= (SISrcMods::NEG | SISrcMods::ABS);
7402 MIB.addImm(Val: (int64_t)Mods);
7403}
7404
7405void AMDGPUInstructionSelector::renderPrefetchLoc(MachineInstrBuilder &MIB,
7406 const MachineInstr &MI,
7407 int OpIdx) const {
7408 uint32_t V = MI.getOperand(i: 2).getImm();
7409 V = (AMDGPU::CPol::SCOPE_MASK - (V & AMDGPU::CPol::SCOPE_MASK))
7410 << AMDGPU::CPol::SCOPE_SHIFT;
7411 if (!Subtarget->hasSafeCUPrefetch())
7412 V = std::max(a: V, b: (uint32_t)AMDGPU::CPol::SCOPE_SE); // CU scope is unsafe
7413 MIB.addImm(Val: V);
7414}
7415
7416/// Convert from 2-bit value to enum values used for op_sel* source modifiers.
7417void AMDGPUInstructionSelector::renderScaledMAIIntrinsicOperand(
7418 MachineInstrBuilder &MIB, const MachineInstr &MI, int OpIdx) const {
7419 unsigned Val = MI.getOperand(i: OpIdx).getImm();
7420 unsigned New = 0;
7421 if (Val & 0x1)
7422 New |= SISrcMods::OP_SEL_0;
7423 if (Val & 0x2)
7424 New |= SISrcMods::OP_SEL_1;
7425 MIB.addImm(Val: New);
7426}
7427
7428bool AMDGPUInstructionSelector::isInlineImmediate(const APInt &Imm) const {
7429 return TII.isInlineConstant(Imm);
7430}
7431
7432bool AMDGPUInstructionSelector::isInlineImmediate(const APFloat &Imm) const {
7433 return TII.isInlineConstant(Imm);
7434}
7435