1//===- AMDGPUInstructionSelector.cpp ----------------------------*- C++ -*-==//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8/// \file
9/// This file implements the targeting of the InstructionSelector class for
10/// AMDGPU.
11/// \todo This should be generated by TableGen.
12//===----------------------------------------------------------------------===//
13
14#include "AMDGPUInstructionSelector.h"
15#include "AMDGPU.h"
16#include "AMDGPUGlobalISelUtils.h"
17#include "AMDGPUInstrInfo.h"
18#include "AMDGPURegisterBankInfo.h"
19#include "AMDGPUTargetMachine.h"
20#include "SIMachineFunctionInfo.h"
21#include "Utils/AMDGPUBaseInfo.h"
22#include "llvm/CodeGen/GlobalISel/GIMatchTableExecutorImpl.h"
23#include "llvm/CodeGen/GlobalISel/GISelValueTracking.h"
24#include "llvm/CodeGen/GlobalISel/GenericMachineInstrs.h"
25#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
26#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
27#include "llvm/CodeGen/MachineFrameInfo.h"
28#include "llvm/IR/DiagnosticInfo.h"
29#include "llvm/IR/IntrinsicsAMDGPU.h"
30#include <optional>
31
32#define DEBUG_TYPE "amdgpu-isel"
33
34using namespace llvm;
35using namespace MIPatternMatch;
36
37#define GET_GLOBALISEL_IMPL
38#define AMDGPUSubtarget GCNSubtarget
39#include "AMDGPUGenGlobalISel.inc"
40#undef GET_GLOBALISEL_IMPL
41#undef AMDGPUSubtarget
42
43AMDGPUInstructionSelector::AMDGPUInstructionSelector(
44 const GCNSubtarget &STI, const AMDGPURegisterBankInfo &RBI)
45 : TII(*STI.getInstrInfo()), TRI(*STI.getRegisterInfo()), RBI(RBI), STI(STI),
46#define GET_GLOBALISEL_PREDICATES_INIT
47#include "AMDGPUGenGlobalISel.inc"
48#undef GET_GLOBALISEL_PREDICATES_INIT
49#define GET_GLOBALISEL_TEMPORARIES_INIT
50#include "AMDGPUGenGlobalISel.inc"
51#undef GET_GLOBALISEL_TEMPORARIES_INIT
52{
53}
54
55const char *AMDGPUInstructionSelector::getName() { return DEBUG_TYPE; }
56
57void AMDGPUInstructionSelector::setupMF(MachineFunction &MF,
58 GISelValueTracking *VT,
59 CodeGenCoverage *CoverageInfo,
60 ProfileSummaryInfo *PSI,
61 BlockFrequencyInfo *BFI) {
62 MRI = &MF.getRegInfo();
63 Subtarget = &MF.getSubtarget<GCNSubtarget>();
64 Subtarget->checkSubtargetFeatures(F: MF.getFunction());
65 InstructionSelector::setupMF(mf&: MF, vt: VT, covinfo: CoverageInfo, psi: PSI, bfi: BFI);
66}
67
68// Return the wave level SGPR base address if this is a wave address.
69static Register getWaveAddress(const MachineInstr *Def) {
70 return Def->getOpcode() == AMDGPU::G_AMDGPU_WAVE_ADDRESS
71 ? Def->getOperand(i: 1).getReg()
72 : Register();
73}
74
75static void diagnoseUnsupportedIntrinsic(const MachineInstr &I) {
76 const Function &F = I.getMF()->getFunction();
77 F.getContext().diagnose(DI: DiagnosticInfoUnsupported(
78 F, "intrinsic not supported on subtarget", I.getDebugLoc(), DS_Error));
79}
80
81bool AMDGPUInstructionSelector::isVCC(Register Reg,
82 const MachineRegisterInfo &MRI) const {
83 // The verifier is oblivious to s1 being a valid value for wavesize registers.
84 if (Reg.isPhysical())
85 return false;
86
87 auto &RegClassOrBank = MRI.getRegClassOrRegBank(Reg);
88 const TargetRegisterClass *RC =
89 dyn_cast<const TargetRegisterClass *>(Val: RegClassOrBank);
90 if (RC) {
91 const LLT Ty = MRI.getType(Reg);
92 if (!Ty.isValid() || Ty.getSizeInBits() != 1)
93 return false;
94 // G_TRUNC s1 result is never vcc.
95 return MRI.getVRegDef(Reg)->getOpcode() != AMDGPU::G_TRUNC &&
96 RC->hasSuperClassEq(RC: TRI.getBoolRC());
97 }
98
99 const RegisterBank *RB = cast<const RegisterBank *>(Val: RegClassOrBank);
100 return RB->getID() == AMDGPU::VCCRegBankID;
101}
102
103bool AMDGPUInstructionSelector::constrainCopyLikeIntrin(MachineInstr &MI,
104 unsigned NewOpc) const {
105 MI.setDesc(TII.get(Opcode: NewOpc));
106 MI.removeOperand(OpNo: 1); // Remove intrinsic ID.
107 MI.addOperand(MF&: *MF, Op: MachineOperand::CreateReg(Reg: AMDGPU::EXEC, isDef: false, isImp: true));
108
109 MachineOperand &Dst = MI.getOperand(i: 0);
110 MachineOperand &Src = MI.getOperand(i: 1);
111
112 // TODO: This should be legalized to s32 if needed
113 if (MRI->getType(Reg: Dst.getReg()) == LLT::scalar(SizeInBits: 1))
114 return false;
115
116 const TargetRegisterClass *DstRC
117 = TRI.getConstrainedRegClassForOperand(MO: Dst, MRI: *MRI);
118 const TargetRegisterClass *SrcRC
119 = TRI.getConstrainedRegClassForOperand(MO: Src, MRI: *MRI);
120 if (!DstRC || DstRC != SrcRC)
121 return false;
122
123 if (!RBI.constrainGenericRegister(Reg: Dst.getReg(), RC: *DstRC, MRI&: *MRI) ||
124 !RBI.constrainGenericRegister(Reg: Src.getReg(), RC: *SrcRC, MRI&: *MRI))
125 return false;
126 const MCInstrDesc &MCID = MI.getDesc();
127 if (MCID.getOperandConstraint(OpNum: 0, Constraint: MCOI::EARLY_CLOBBER) != -1) {
128 MI.getOperand(i: 0).setIsEarlyClobber(true);
129 }
130 return true;
131}
132
133bool AMDGPUInstructionSelector::selectCOPY(MachineInstr &I) const {
134 const DebugLoc &DL = I.getDebugLoc();
135 MachineBasicBlock *BB = I.getParent();
136 I.setDesc(TII.get(Opcode: TargetOpcode::COPY));
137
138 const MachineOperand &Src = I.getOperand(i: 1);
139 MachineOperand &Dst = I.getOperand(i: 0);
140 Register DstReg = Dst.getReg();
141 Register SrcReg = Src.getReg();
142
143 if (isVCC(Reg: DstReg, MRI: *MRI)) {
144 if (SrcReg == AMDGPU::SCC) {
145 const TargetRegisterClass *RC
146 = TRI.getConstrainedRegClassForOperand(MO: Dst, MRI: *MRI);
147 if (!RC)
148 return true;
149 return RBI.constrainGenericRegister(Reg: DstReg, RC: *RC, MRI&: *MRI);
150 }
151
152 if (!isVCC(Reg: SrcReg, MRI: *MRI)) {
153 // TODO: Should probably leave the copy and let copyPhysReg expand it.
154 if (!RBI.constrainGenericRegister(Reg: DstReg, RC: *TRI.getBoolRC(), MRI&: *MRI))
155 return false;
156
157 const TargetRegisterClass *SrcRC
158 = TRI.getConstrainedRegClassForOperand(MO: Src, MRI: *MRI);
159
160 std::optional<ValueAndVReg> ConstVal =
161 getIConstantVRegValWithLookThrough(VReg: SrcReg, MRI: *MRI, LookThroughInstrs: true);
162 if (ConstVal) {
163 unsigned MovOpc =
164 STI.isWave64() ? AMDGPU::S_MOV_B64 : AMDGPU::S_MOV_B32;
165 BuildMI(BB&: *BB, I: &I, MIMD: DL, MCID: TII.get(Opcode: MovOpc), DestReg: DstReg)
166 .addImm(Val: ConstVal->Value.getBoolValue() ? -1 : 0);
167 } else {
168 Register MaskedReg = MRI->createVirtualRegister(RegClass: SrcRC);
169
170 // We can't trust the high bits at this point, so clear them.
171
172 // TODO: Skip masking high bits if def is known boolean.
173
174 if (AMDGPU::getRegBitWidth(RCID: SrcRC->getID()) == 16) {
175 assert(Subtarget->useRealTrue16Insts());
176 const int64_t NoMods = 0;
177 BuildMI(BB&: *BB, I: &I, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::V_AND_B16_t16_e64), DestReg: MaskedReg)
178 .addImm(Val: NoMods)
179 .addImm(Val: 1)
180 .addImm(Val: NoMods)
181 .addReg(RegNo: SrcReg)
182 .addImm(Val: NoMods);
183 BuildMI(BB&: *BB, I: &I, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::V_CMP_NE_U16_t16_e64), DestReg: DstReg)
184 .addImm(Val: NoMods)
185 .addImm(Val: 0)
186 .addImm(Val: NoMods)
187 .addReg(RegNo: MaskedReg)
188 .addImm(Val: NoMods);
189 } else {
190 bool IsSGPR = TRI.isSGPRClass(RC: SrcRC);
191 unsigned AndOpc = IsSGPR ? AMDGPU::S_AND_B32 : AMDGPU::V_AND_B32_e32;
192 auto And = BuildMI(BB&: *BB, I: &I, MIMD: DL, MCID: TII.get(Opcode: AndOpc), DestReg: MaskedReg)
193 .addImm(Val: 1)
194 .addReg(RegNo: SrcReg);
195 if (IsSGPR)
196 And.setOperandDead(3); // Dead scc
197
198 BuildMI(BB&: *BB, I: &I, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::V_CMP_NE_U32_e64), DestReg: DstReg)
199 .addImm(Val: 0)
200 .addReg(RegNo: MaskedReg);
201 }
202 }
203
204 if (!MRI->getRegClassOrNull(Reg: SrcReg))
205 MRI->setRegClass(Reg: SrcReg, RC: SrcRC);
206 I.eraseFromParent();
207 return true;
208 }
209
210 const TargetRegisterClass *RC =
211 TRI.getConstrainedRegClassForOperand(MO: Dst, MRI: *MRI);
212 if (RC && !RBI.constrainGenericRegister(Reg: DstReg, RC: *RC, MRI&: *MRI))
213 return false;
214
215 return true;
216 }
217
218 for (const MachineOperand &MO : I.operands()) {
219 if (MO.getReg().isPhysical())
220 continue;
221
222 const TargetRegisterClass *RC =
223 TRI.getConstrainedRegClassForOperand(MO, MRI: *MRI);
224 if (!RC)
225 continue;
226 RBI.constrainGenericRegister(Reg: MO.getReg(), RC: *RC, MRI&: *MRI);
227 }
228 return true;
229}
230
231bool AMDGPUInstructionSelector::selectCOPY_SCC_VCC(MachineInstr &I) const {
232 const DebugLoc &DL = I.getDebugLoc();
233 MachineBasicBlock *BB = I.getParent();
234 Register VCCReg = I.getOperand(i: 1).getReg();
235 MachineInstr *Cmp;
236
237 // Set SCC as a side effect with S_CMP or S_OR.
238 if (STI.hasScalarCompareEq64()) {
239 unsigned CmpOpc =
240 STI.isWave64() ? AMDGPU::S_CMP_LG_U64 : AMDGPU::S_CMP_LG_U32;
241 Cmp = BuildMI(BB&: *BB, I: &I, MIMD: DL, MCID: TII.get(Opcode: CmpOpc)).addReg(RegNo: VCCReg).addImm(Val: 0);
242 } else {
243 Register DeadDst = MRI->createVirtualRegister(RegClass: &AMDGPU::SReg_64RegClass);
244 Cmp = BuildMI(BB&: *BB, I: &I, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::S_OR_B64), DestReg: DeadDst)
245 .addReg(RegNo: VCCReg)
246 .addReg(RegNo: VCCReg);
247 }
248
249 constrainSelectedInstRegOperands(I&: *Cmp, TII, TRI, RBI);
250
251 Register DstReg = I.getOperand(i: 0).getReg();
252 BuildMI(BB&: *BB, I: &I, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::COPY), DestReg: DstReg).addReg(RegNo: AMDGPU::SCC);
253
254 I.eraseFromParent();
255 return RBI.constrainGenericRegister(Reg: DstReg, RC: AMDGPU::SReg_32RegClass, MRI&: *MRI);
256}
257
258bool AMDGPUInstructionSelector::selectCOPY_VCC_SCC(MachineInstr &I) const {
259 const DebugLoc &DL = I.getDebugLoc();
260 MachineBasicBlock *BB = I.getParent();
261
262 Register DstReg = I.getOperand(i: 0).getReg();
263 Register SrcReg = I.getOperand(i: 1).getReg();
264 std::optional<ValueAndVReg> Arg =
265 getIConstantVRegValWithLookThrough(VReg: I.getOperand(i: 1).getReg(), MRI: *MRI);
266
267 if (Arg) {
268 const int64_t Value = Arg->Value.getZExtValue();
269 if (Value == 0) {
270 unsigned Opcode = STI.isWave64() ? AMDGPU::S_MOV_B64 : AMDGPU::S_MOV_B32;
271 BuildMI(BB&: *BB, I: &I, MIMD: DL, MCID: TII.get(Opcode), DestReg: DstReg).addImm(Val: 0);
272 } else {
273 assert(Value == 1);
274 BuildMI(BB&: *BB, I: &I, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::COPY), DestReg: DstReg).addReg(RegNo: TRI.getExec());
275 }
276 I.eraseFromParent();
277 return RBI.constrainGenericRegister(Reg: DstReg, RC: *TRI.getBoolRC(), MRI&: *MRI);
278 }
279
280 // RegBankLegalize ensures that SrcReg is bool in reg (high bits are 0).
281 BuildMI(BB&: *BB, I: &I, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::COPY), DestReg: AMDGPU::SCC).addReg(RegNo: SrcReg);
282
283 unsigned SelectOpcode =
284 STI.isWave64() ? AMDGPU::S_CSELECT_B64 : AMDGPU::S_CSELECT_B32;
285 MachineInstr *Select = BuildMI(BB&: *BB, I: &I, MIMD: DL, MCID: TII.get(Opcode: SelectOpcode), DestReg: DstReg)
286 .addReg(RegNo: TRI.getExec())
287 .addImm(Val: 0);
288
289 I.eraseFromParent();
290 constrainSelectedInstRegOperands(I&: *Select, TII, TRI, RBI);
291 return true;
292}
293
294bool AMDGPUInstructionSelector::selectReadAnyLane(MachineInstr &I) const {
295 Register DstReg = I.getOperand(i: 0).getReg();
296 Register SrcReg = I.getOperand(i: 1).getReg();
297
298 const DebugLoc &DL = I.getDebugLoc();
299 MachineBasicBlock *BB = I.getParent();
300
301 auto RFL = BuildMI(BB&: *BB, I: &I, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::V_READFIRSTLANE_B32), DestReg: DstReg)
302 .addReg(RegNo: SrcReg);
303
304 I.eraseFromParent();
305 constrainSelectedInstRegOperands(I&: *RFL, TII, TRI, RBI);
306 return true;
307}
308
309bool AMDGPUInstructionSelector::selectPHI(MachineInstr &I) const {
310 const Register DefReg = I.getOperand(i: 0).getReg();
311 const LLT DefTy = MRI->getType(Reg: DefReg);
312
313 // S1 G_PHIs should not be selected in instruction-select, instead:
314 // - divergent S1 G_PHI should go through lane mask merging algorithm
315 // and be fully inst-selected in AMDGPUGlobalISelDivergenceLowering
316 // - uniform S1 G_PHI should be lowered into S32 G_PHI in AMDGPURegBankSelect
317 if (DefTy == LLT::scalar(SizeInBits: 1))
318 return false;
319
320 // TODO: Verify this doesn't have insane operands (i.e. VGPR to SGPR copy)
321
322 const RegClassOrRegBank &RegClassOrBank =
323 MRI->getRegClassOrRegBank(Reg: DefReg);
324
325 const TargetRegisterClass *DefRC =
326 dyn_cast<const TargetRegisterClass *>(Val: RegClassOrBank);
327 if (!DefRC) {
328 if (!DefTy.isValid()) {
329 LLVM_DEBUG(dbgs() << "PHI operand has no type, not a gvreg?\n");
330 return false;
331 }
332
333 const RegisterBank &RB = *cast<const RegisterBank *>(Val: RegClassOrBank);
334 DefRC = TRI.getRegClassForTypeOnBank(Ty: DefTy, Bank: RB);
335 if (!DefRC) {
336 LLVM_DEBUG(dbgs() << "PHI operand has unexpected size/bank\n");
337 return false;
338 }
339 }
340
341 // If inputs have register bank, assign corresponding reg class.
342 // Note: registers don't need to have the same reg bank.
343 for (unsigned i = 1; i != I.getNumOperands(); i += 2) {
344 const Register SrcReg = I.getOperand(i).getReg();
345
346 const RegisterBank *RB = MRI->getRegBankOrNull(Reg: SrcReg);
347 if (RB) {
348 const LLT SrcTy = MRI->getType(Reg: SrcReg);
349 const TargetRegisterClass *SrcRC =
350 TRI.getRegClassForTypeOnBank(Ty: SrcTy, Bank: *RB);
351 if (!RBI.constrainGenericRegister(Reg: SrcReg, RC: *SrcRC, MRI&: *MRI))
352 return false;
353 }
354 }
355
356 I.setDesc(TII.get(Opcode: TargetOpcode::PHI));
357 return RBI.constrainGenericRegister(Reg: DefReg, RC: *DefRC, MRI&: *MRI);
358}
359
360MachineOperand
361AMDGPUInstructionSelector::getSubOperand64(MachineOperand &MO,
362 const TargetRegisterClass &SubRC,
363 unsigned SubIdx) const {
364
365 MachineInstr *MI = MO.getParent();
366 MachineBasicBlock *BB = MO.getParent()->getParent();
367 Register DstReg = MRI->createVirtualRegister(RegClass: &SubRC);
368
369 if (MO.isReg()) {
370 unsigned ComposedSubIdx = TRI.composeSubRegIndices(a: MO.getSubReg(), b: SubIdx);
371 Register Reg = MO.getReg();
372 BuildMI(BB&: *BB, I: MI, MIMD: MI->getDebugLoc(), MCID: TII.get(Opcode: AMDGPU::COPY), DestReg: DstReg)
373 .addReg(RegNo: Reg, Flags: {}, SubReg: ComposedSubIdx);
374
375 return MachineOperand::CreateReg(Reg: DstReg, isDef: MO.isDef(), isImp: MO.isImplicit(),
376 isKill: MO.isKill(), isDead: MO.isDead(), isUndef: MO.isUndef(),
377 isEarlyClobber: MO.isEarlyClobber(), SubReg: 0, isDebug: MO.isDebug(),
378 isInternalRead: MO.isInternalRead());
379 }
380
381 assert(MO.isImm());
382
383 APInt Imm(64, MO.getImm());
384
385 switch (SubIdx) {
386 default:
387 llvm_unreachable("do not know to split immediate with this sub index.");
388 case AMDGPU::sub0:
389 return MachineOperand::CreateImm(Val: Imm.getLoBits(numBits: 32).getSExtValue());
390 case AMDGPU::sub1:
391 return MachineOperand::CreateImm(Val: Imm.getHiBits(numBits: 32).getSExtValue());
392 }
393}
394
395static unsigned getLogicalBitOpcode(unsigned Opc, bool Is64) {
396 switch (Opc) {
397 case AMDGPU::G_AND:
398 return Is64 ? AMDGPU::S_AND_B64 : AMDGPU::S_AND_B32;
399 case AMDGPU::G_OR:
400 return Is64 ? AMDGPU::S_OR_B64 : AMDGPU::S_OR_B32;
401 case AMDGPU::G_XOR:
402 return Is64 ? AMDGPU::S_XOR_B64 : AMDGPU::S_XOR_B32;
403 default:
404 llvm_unreachable("not a bit op");
405 }
406}
407
408bool AMDGPUInstructionSelector::selectG_AND_OR_XOR(MachineInstr &I) const {
409 Register DstReg = I.getOperand(i: 0).getReg();
410 unsigned Size = RBI.getSizeInBits(Reg: DstReg, MRI: *MRI, TRI);
411
412 const RegisterBank *DstRB = RBI.getRegBank(Reg: DstReg, MRI: *MRI, TRI);
413 if (DstRB->getID() != AMDGPU::SGPRRegBankID &&
414 DstRB->getID() != AMDGPU::VCCRegBankID)
415 return false;
416
417 bool Is64 = Size > 32 || (DstRB->getID() == AMDGPU::VCCRegBankID &&
418 STI.isWave64());
419 I.setDesc(TII.get(Opcode: getLogicalBitOpcode(Opc: I.getOpcode(), Is64)));
420
421 // Dead implicit-def of scc
422 I.addOperand(Op: MachineOperand::CreateReg(Reg: AMDGPU::SCC, isDef: true, // isDef
423 isImp: true, // isImp
424 isKill: false, // isKill
425 isDead: true)); // isDead
426 constrainSelectedInstRegOperands(I, TII, TRI, RBI);
427 return true;
428}
429
430bool AMDGPUInstructionSelector::selectG_ADD_SUB(MachineInstr &I) const {
431 MachineBasicBlock *BB = I.getParent();
432 MachineFunction *MF = BB->getParent();
433 Register DstReg = I.getOperand(i: 0).getReg();
434 const DebugLoc &DL = I.getDebugLoc();
435 LLT Ty = MRI->getType(Reg: DstReg);
436 if (Ty.isVector())
437 return false;
438
439 unsigned Size = Ty.getSizeInBits();
440 const RegisterBank *DstRB = RBI.getRegBank(Reg: DstReg, MRI: *MRI, TRI);
441 const bool IsSALU = DstRB->getID() == AMDGPU::SGPRRegBankID;
442 const bool Sub = I.getOpcode() == TargetOpcode::G_SUB;
443
444 if (Size == 32) {
445 if (IsSALU) {
446 const unsigned Opc = Sub ? AMDGPU::S_SUB_U32 : AMDGPU::S_ADD_U32;
447 MachineInstr *Add =
448 BuildMI(BB&: *BB, I: &I, MIMD: DL, MCID: TII.get(Opcode: Opc), DestReg: DstReg)
449 .add(MO: I.getOperand(i: 1))
450 .add(MO: I.getOperand(i: 2))
451 .setOperandDead(3); // Dead scc
452 I.eraseFromParent();
453 constrainSelectedInstRegOperands(I&: *Add, TII, TRI, RBI);
454 return true;
455 }
456
457 if (STI.hasAddNoCarryInsts()) {
458 const unsigned Opc = Sub ? AMDGPU::V_SUB_U32_e64 : AMDGPU::V_ADD_U32_e64;
459 I.setDesc(TII.get(Opcode: Opc));
460 I.addOperand(MF&: *MF, Op: MachineOperand::CreateImm(Val: 0));
461 I.addOperand(MF&: *MF, Op: MachineOperand::CreateReg(Reg: AMDGPU::EXEC, isDef: false, isImp: true));
462 constrainSelectedInstRegOperands(I, TII, TRI, RBI);
463 return true;
464 }
465
466 const unsigned Opc = Sub ? AMDGPU::V_SUB_CO_U32_e64 : AMDGPU::V_ADD_CO_U32_e64;
467
468 Register UnusedCarry = MRI->createVirtualRegister(RegClass: TRI.getWaveMaskRegClass());
469 MachineInstr *Add
470 = BuildMI(BB&: *BB, I: &I, MIMD: DL, MCID: TII.get(Opcode: Opc), DestReg: DstReg)
471 .addDef(RegNo: UnusedCarry, Flags: RegState::Dead)
472 .add(MO: I.getOperand(i: 1))
473 .add(MO: I.getOperand(i: 2))
474 .addImm(Val: 0);
475 I.eraseFromParent();
476 constrainSelectedInstRegOperands(I&: *Add, TII, TRI, RBI);
477 return true;
478 }
479
480 assert(!Sub && "illegal sub should not reach here");
481
482 const TargetRegisterClass &RC
483 = IsSALU ? AMDGPU::SReg_64_XEXECRegClass : AMDGPU::VReg_64RegClass;
484 const TargetRegisterClass &HalfRC
485 = IsSALU ? AMDGPU::SReg_32RegClass : AMDGPU::VGPR_32RegClass;
486
487 MachineOperand Lo1(getSubOperand64(MO&: I.getOperand(i: 1), SubRC: HalfRC, SubIdx: AMDGPU::sub0));
488 MachineOperand Lo2(getSubOperand64(MO&: I.getOperand(i: 2), SubRC: HalfRC, SubIdx: AMDGPU::sub0));
489 MachineOperand Hi1(getSubOperand64(MO&: I.getOperand(i: 1), SubRC: HalfRC, SubIdx: AMDGPU::sub1));
490 MachineOperand Hi2(getSubOperand64(MO&: I.getOperand(i: 2), SubRC: HalfRC, SubIdx: AMDGPU::sub1));
491
492 Register DstLo = MRI->createVirtualRegister(RegClass: &HalfRC);
493 Register DstHi = MRI->createVirtualRegister(RegClass: &HalfRC);
494
495 if (IsSALU) {
496 BuildMI(BB&: *BB, I: &I, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::S_ADD_U32), DestReg: DstLo)
497 .add(MO: Lo1)
498 .add(MO: Lo2);
499 BuildMI(BB&: *BB, I: &I, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::S_ADDC_U32), DestReg: DstHi)
500 .add(MO: Hi1)
501 .add(MO: Hi2)
502 .setOperandDead(3); // Dead scc
503 } else {
504 const TargetRegisterClass *CarryRC = TRI.getWaveMaskRegClass();
505 Register CarryReg = MRI->createVirtualRegister(RegClass: CarryRC);
506 BuildMI(BB&: *BB, I: &I, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::V_ADD_CO_U32_e64), DestReg: DstLo)
507 .addDef(RegNo: CarryReg)
508 .add(MO: Lo1)
509 .add(MO: Lo2)
510 .addImm(Val: 0);
511 MachineInstr *Addc = BuildMI(BB&: *BB, I: &I, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::V_ADDC_U32_e64), DestReg: DstHi)
512 .addDef(RegNo: MRI->createVirtualRegister(RegClass: CarryRC), Flags: RegState::Dead)
513 .add(MO: Hi1)
514 .add(MO: Hi2)
515 .addReg(RegNo: CarryReg, Flags: RegState::Kill)
516 .addImm(Val: 0);
517
518 constrainSelectedInstRegOperands(I&: *Addc, TII, TRI, RBI);
519 }
520
521 BuildMI(BB&: *BB, I: &I, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::REG_SEQUENCE), DestReg: DstReg)
522 .addReg(RegNo: DstLo)
523 .addImm(Val: AMDGPU::sub0)
524 .addReg(RegNo: DstHi)
525 .addImm(Val: AMDGPU::sub1);
526
527
528 if (!RBI.constrainGenericRegister(Reg: DstReg, RC, MRI&: *MRI))
529 return false;
530
531 I.eraseFromParent();
532 return true;
533}
534
535bool AMDGPUInstructionSelector::selectG_UADDO_USUBO_UADDE_USUBE(
536 MachineInstr &I) const {
537 MachineBasicBlock *BB = I.getParent();
538 MachineFunction *MF = BB->getParent();
539 const DebugLoc &DL = I.getDebugLoc();
540 Register Dst0Reg = I.getOperand(i: 0).getReg();
541 Register Dst1Reg = I.getOperand(i: 1).getReg();
542 const bool IsAdd = I.getOpcode() == AMDGPU::G_UADDO ||
543 I.getOpcode() == AMDGPU::G_UADDE;
544 const bool HasCarryIn = I.getOpcode() == AMDGPU::G_UADDE ||
545 I.getOpcode() == AMDGPU::G_USUBE;
546
547 if (isVCC(Reg: Dst1Reg, MRI: *MRI)) {
548 unsigned NoCarryOpc =
549 IsAdd ? AMDGPU::V_ADD_CO_U32_e64 : AMDGPU::V_SUB_CO_U32_e64;
550 unsigned CarryOpc = IsAdd ? AMDGPU::V_ADDC_U32_e64 : AMDGPU::V_SUBB_U32_e64;
551 I.setDesc(TII.get(Opcode: HasCarryIn ? CarryOpc : NoCarryOpc));
552 I.addOperand(MF&: *MF, Op: MachineOperand::CreateReg(Reg: AMDGPU::EXEC, isDef: false, isImp: true));
553 I.addOperand(MF&: *MF, Op: MachineOperand::CreateImm(Val: 0));
554 constrainSelectedInstRegOperands(I, TII, TRI, RBI);
555 return true;
556 }
557
558 Register Src0Reg = I.getOperand(i: 2).getReg();
559 Register Src1Reg = I.getOperand(i: 3).getReg();
560
561 if (HasCarryIn) {
562 BuildMI(BB&: *BB, I: &I, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::COPY), DestReg: AMDGPU::SCC)
563 .addReg(RegNo: I.getOperand(i: 4).getReg());
564 }
565
566 unsigned NoCarryOpc = IsAdd ? AMDGPU::S_ADD_U32 : AMDGPU::S_SUB_U32;
567 unsigned CarryOpc = IsAdd ? AMDGPU::S_ADDC_U32 : AMDGPU::S_SUBB_U32;
568
569 auto CarryInst = BuildMI(BB&: *BB, I: &I, MIMD: DL, MCID: TII.get(Opcode: HasCarryIn ? CarryOpc : NoCarryOpc), DestReg: Dst0Reg)
570 .add(MO: I.getOperand(i: 2))
571 .add(MO: I.getOperand(i: 3));
572
573 if (MRI->use_nodbg_empty(RegNo: Dst1Reg)) {
574 CarryInst.setOperandDead(3); // Dead scc
575 } else {
576 BuildMI(BB&: *BB, I: &I, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::COPY), DestReg: Dst1Reg)
577 .addReg(RegNo: AMDGPU::SCC);
578 if (!MRI->getRegClassOrNull(Reg: Dst1Reg))
579 MRI->setRegClass(Reg: Dst1Reg, RC: &AMDGPU::SReg_32RegClass);
580 }
581
582 if (!RBI.constrainGenericRegister(Reg: Dst0Reg, RC: AMDGPU::SReg_32RegClass, MRI&: *MRI) ||
583 !RBI.constrainGenericRegister(Reg: Src0Reg, RC: AMDGPU::SReg_32RegClass, MRI&: *MRI) ||
584 !RBI.constrainGenericRegister(Reg: Src1Reg, RC: AMDGPU::SReg_32RegClass, MRI&: *MRI))
585 return false;
586
587 if (HasCarryIn &&
588 !RBI.constrainGenericRegister(Reg: I.getOperand(i: 4).getReg(),
589 RC: AMDGPU::SReg_32RegClass, MRI&: *MRI))
590 return false;
591
592 I.eraseFromParent();
593 return true;
594}
595
596bool AMDGPUInstructionSelector::selectG_AMDGPU_MAD_64_32(
597 MachineInstr &I) const {
598 MachineBasicBlock *BB = I.getParent();
599 MachineFunction *MF = BB->getParent();
600 const bool IsUnsigned = I.getOpcode() == AMDGPU::G_AMDGPU_MAD_U64_U32;
601 bool UseNoCarry = Subtarget->hasMadNC64_32Insts() &&
602 MRI->use_nodbg_empty(RegNo: I.getOperand(i: 1).getReg());
603
604 unsigned Opc;
605 if (Subtarget->hasMADIntraFwdBug())
606 Opc = IsUnsigned ? AMDGPU::V_MAD_U64_U32_gfx11_e64
607 : AMDGPU::V_MAD_I64_I32_gfx11_e64;
608 else if (UseNoCarry)
609 Opc = IsUnsigned ? AMDGPU::V_MAD_NC_U64_U32_e64
610 : AMDGPU::V_MAD_NC_I64_I32_e64;
611 else
612 Opc = IsUnsigned ? AMDGPU::V_MAD_U64_U32_e64 : AMDGPU::V_MAD_I64_I32_e64;
613
614 if (UseNoCarry)
615 I.removeOperand(OpNo: 1);
616
617 I.setDesc(TII.get(Opcode: Opc));
618 I.addOperand(MF&: *MF, Op: MachineOperand::CreateImm(Val: 0));
619 I.addImplicitDefUseOperands(MF&: *MF);
620 I.getOperand(i: 0).setIsEarlyClobber(true);
621 constrainSelectedInstRegOperands(I, TII, TRI, RBI);
622 return true;
623}
624
625// TODO: We should probably legalize these to only using 32-bit results.
626bool AMDGPUInstructionSelector::selectG_EXTRACT(MachineInstr &I) const {
627 MachineBasicBlock *BB = I.getParent();
628 Register DstReg = I.getOperand(i: 0).getReg();
629 Register SrcReg = I.getOperand(i: 1).getReg();
630 LLT DstTy = MRI->getType(Reg: DstReg);
631 LLT SrcTy = MRI->getType(Reg: SrcReg);
632 const unsigned SrcSize = SrcTy.getSizeInBits();
633 unsigned DstSize = DstTy.getSizeInBits();
634
635 // TODO: Should handle any multiple of 32 offset.
636 unsigned Offset = I.getOperand(i: 2).getImm();
637 if (Offset % 32 != 0 || DstSize > 128)
638 return false;
639
640 // 16-bit operations really use 32-bit registers.
641 // FIXME: Probably should not allow 16-bit G_EXTRACT results.
642 if (DstSize == 16)
643 DstSize = 32;
644
645 const TargetRegisterClass *DstRC =
646 TRI.getConstrainedRegClassForOperand(MO: I.getOperand(i: 0), MRI: *MRI);
647 if (!DstRC || !RBI.constrainGenericRegister(Reg: DstReg, RC: *DstRC, MRI&: *MRI))
648 return false;
649
650 const RegisterBank *SrcBank = RBI.getRegBank(Reg: SrcReg, MRI: *MRI, TRI);
651 const TargetRegisterClass *SrcRC =
652 TRI.getRegClassForSizeOnBank(Size: SrcSize, Bank: *SrcBank);
653 if (!SrcRC)
654 return false;
655 unsigned SubReg = SIRegisterInfo::getSubRegFromChannel(Channel: Offset / 32,
656 NumRegs: DstSize / 32);
657 SrcRC = TRI.getSubClassWithSubReg(SrcRC, SubReg);
658 if (!SrcRC)
659 return false;
660
661 SrcReg = constrainOperandRegClass(MF: *MF, TRI, MRI&: *MRI, TII, RBI, InsertPt&: I,
662 RegClass: *SrcRC, RegMO&: I.getOperand(i: 1));
663 const DebugLoc &DL = I.getDebugLoc();
664 BuildMI(BB&: *BB, I: &I, MIMD: DL, MCID: TII.get(Opcode: TargetOpcode::COPY), DestReg: DstReg)
665 .addReg(RegNo: SrcReg, Flags: {}, SubReg);
666
667 I.eraseFromParent();
668 return true;
669}
670
671bool AMDGPUInstructionSelector::selectS16MergeToS32(MachineInstr &MI) const {
672 Register Dst = MI.getOperand(i: 0).getReg();
673 Register Src0 = MI.getOperand(i: 1).getReg();
674 Register Src1 = MI.getOperand(i: 2).getReg();
675
676 LLT Src0Ty = MRI->getType(Reg: Src0);
677 LLT Src1Ty = MRI->getType(Reg: Src1);
678
679 const RegisterBank *DstBank = RBI.getRegBank(Reg: Dst, MRI: *MRI, TRI);
680 const RegisterBank *Src0Bank = RBI.getRegBank(Reg: Src0, MRI: *MRI, TRI);
681 const RegisterBank *Src1Bank = RBI.getRegBank(Reg: Src1, MRI: *MRI, TRI);
682 const bool IsVector = DstBank->getID() == AMDGPU::VGPRRegBankID;
683
684 Register ShiftSrc0;
685 Register ShiftSrc1;
686
687 const DebugLoc &DL = MI.getDebugLoc();
688 MachineBasicBlock *BB = MI.getParent();
689
690 // VGPR case
691 if (IsVector) {
692 // If source are both VGPR16, use REG_SEQUENCE with lo16/hi16 subregisters
693 if (Src0Bank->getID() == AMDGPU::VGPRRegBankID &&
694 Src1Bank->getID() == AMDGPU::VGPRRegBankID &&
695 Src0Ty == LLT::scalar(SizeInBits: 16) && Src1Ty == LLT::scalar(SizeInBits: 16)) {
696 BuildMI(BB&: *BB, I&: MI, MIMD: DL, MCID: TII.get(Opcode: TargetOpcode::REG_SEQUENCE), DestReg: Dst)
697 .addReg(RegNo: Src0)
698 .addImm(Val: AMDGPU::lo16)
699 .addReg(RegNo: Src1)
700 .addImm(Val: AMDGPU::hi16);
701
702 if (!RBI.constrainGenericRegister(Reg: Dst, RC: AMDGPU::VGPR_32RegClass, MRI&: *MRI))
703 return false;
704
705 MI.eraseFromParent();
706 return true;
707 }
708
709 // Otherwise, use V_LSHL_OR_B32_e64
710 Register TmpReg = MRI->createVirtualRegister(RegClass: &AMDGPU::VGPR_32RegClass);
711 auto MIB = BuildMI(BB&: *BB, I&: MI, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::V_AND_B32_e32), DestReg: TmpReg)
712 .addImm(Val: 0xFFFF)
713 .addReg(RegNo: Src0);
714 constrainSelectedInstRegOperands(I&: *MIB, TII, TRI, RBI);
715
716 MIB = BuildMI(BB&: *BB, I&: MI, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::V_LSHL_OR_B32_e64), DestReg: Dst)
717 .addReg(RegNo: Src1)
718 .addImm(Val: 16)
719 .addReg(RegNo: TmpReg);
720 constrainSelectedInstRegOperands(I&: *MIB, TII, TRI, RBI);
721
722 MI.eraseFromParent();
723 return true;
724 }
725
726 // SGPR case -> S_PACK_*_B32_B16
727 // With multiple uses of the shift, this will duplicate the shift and
728 // increase register pressure.
729 //
730 // (merge (lshr_oneuse $src0, 16), (lshr_oneuse $src1, 16)
731 // => (S_PACK_HH_B32_B16 $src0, $src1)
732 // (merge (lshr_oneuse SReg_32:$src0, 16), $src1)
733 // => (S_PACK_HL_B32_B16 $src0, $src1)
734 // (merge $src0, (lshr_oneuse SReg_32:$src1, 16))
735 // => (S_PACK_LH_B32_B16 $src0, $src1)
736 // (merge $src0, $src1)
737 // => (S_PACK_LL_B32_B16 $src0, $src1)
738
739 bool Shift0 = mi_match(
740 R: Src0, MRI: *MRI, P: m_OneUse(SP: m_GLShr(L: m_Reg(R&: ShiftSrc0), R: m_SpecificICst(RequestedValue: 16))));
741
742 bool Shift1 = mi_match(
743 R: Src1, MRI: *MRI, P: m_OneUse(SP: m_GLShr(L: m_Reg(R&: ShiftSrc1), R: m_SpecificICst(RequestedValue: 16))));
744
745 unsigned Opc = AMDGPU::S_PACK_LL_B32_B16;
746 if (Shift0 && Shift1) {
747 Opc = AMDGPU::S_PACK_HH_B32_B16;
748 MI.getOperand(i: 1).setReg(ShiftSrc0);
749 MI.getOperand(i: 2).setReg(ShiftSrc1);
750 } else if (Shift1) {
751 Opc = AMDGPU::S_PACK_LH_B32_B16;
752 MI.getOperand(i: 2).setReg(ShiftSrc1);
753 } else if (Shift0) {
754 auto ConstSrc1 =
755 getAnyConstantVRegValWithLookThrough(VReg: Src1, MRI: *MRI, LookThroughInstrs: true, LookThroughAnyExt: true);
756 if (ConstSrc1 && ConstSrc1->Value == 0) {
757 // build_vector_trunc (lshr $src0, 16), 0 -> s_lshr_b32 $src0, 16
758 auto MIB = BuildMI(BB&: *BB, I: &MI, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::S_LSHR_B32), DestReg: Dst)
759 .addReg(RegNo: ShiftSrc0)
760 .addImm(Val: 16)
761 .setOperandDead(3); // Dead scc
762
763 MI.eraseFromParent();
764 constrainSelectedInstRegOperands(I&: *MIB, TII, TRI, RBI);
765 return true;
766 }
767 if (STI.hasSPackHL()) {
768 Opc = AMDGPU::S_PACK_HL_B32_B16;
769 MI.getOperand(i: 1).setReg(ShiftSrc0);
770 }
771 }
772
773 MI.setDesc(TII.get(Opcode: Opc));
774 constrainSelectedInstRegOperands(I&: MI, TII, TRI, RBI);
775 return true;
776}
777
778bool AMDGPUInstructionSelector::selectG_MERGE_VALUES(MachineInstr &MI) const {
779 MachineBasicBlock *BB = MI.getParent();
780 Register DstReg = MI.getOperand(i: 0).getReg();
781 LLT DstTy = MRI->getType(Reg: DstReg);
782 LLT SrcTy = MRI->getType(Reg: MI.getOperand(i: 1).getReg());
783
784 const unsigned SrcSize = SrcTy.getSizeInBits();
785 if (SrcSize < 32) {
786 // Handle s32 <- G_MERGE_VALUES s16, s16
787 if (SrcSize == 16 && DstTy.getSizeInBits() == 32 &&
788 MI.getNumOperands() == 3) {
789 return selectS16MergeToS32(MI);
790 }
791 return selectImpl(I&: MI, CoverageInfo&: *CoverageInfo);
792 }
793
794 const DebugLoc &DL = MI.getDebugLoc();
795 const RegisterBank *DstBank = RBI.getRegBank(Reg: DstReg, MRI: *MRI, TRI);
796 const unsigned DstSize = DstTy.getSizeInBits();
797 const TargetRegisterClass *DstRC =
798 TRI.getRegClassForSizeOnBank(Size: DstSize, Bank: *DstBank);
799 if (!DstRC)
800 return false;
801
802 ArrayRef<int16_t> SubRegs = TRI.getRegSplitParts(RC: DstRC, EltSize: SrcSize / 8);
803 MachineInstrBuilder MIB =
804 BuildMI(BB&: *BB, I: &MI, MIMD: DL, MCID: TII.get(Opcode: TargetOpcode::REG_SEQUENCE), DestReg: DstReg);
805 for (int I = 0, E = MI.getNumOperands() - 1; I != E; ++I) {
806 MachineOperand &Src = MI.getOperand(i: I + 1);
807 MIB.addReg(RegNo: Src.getReg(), Flags: getUndefRegState(B: Src.isUndef()));
808 MIB.addImm(Val: SubRegs[I]);
809
810 const TargetRegisterClass *SrcRC
811 = TRI.getConstrainedRegClassForOperand(MO: Src, MRI: *MRI);
812 if (SrcRC && !RBI.constrainGenericRegister(Reg: Src.getReg(), RC: *SrcRC, MRI&: *MRI))
813 return false;
814 }
815
816 if (!RBI.constrainGenericRegister(Reg: DstReg, RC: *DstRC, MRI&: *MRI))
817 return false;
818
819 MI.eraseFromParent();
820 return true;
821}
822
823bool AMDGPUInstructionSelector::selectG_UNMERGE_VALUES(MachineInstr &MI) const {
824 MachineBasicBlock *BB = MI.getParent();
825 const int NumDst = MI.getNumOperands() - 1;
826
827 MachineOperand &Src = MI.getOperand(i: NumDst);
828
829 Register SrcReg = Src.getReg();
830 Register DstReg0 = MI.getOperand(i: 0).getReg();
831 LLT DstTy = MRI->getType(Reg: DstReg0);
832 LLT SrcTy = MRI->getType(Reg: SrcReg);
833
834 const unsigned DstSize = DstTy.getSizeInBits();
835 const unsigned SrcSize = SrcTy.getSizeInBits();
836 const DebugLoc &DL = MI.getDebugLoc();
837 const RegisterBank *SrcBank = RBI.getRegBank(Reg: SrcReg, MRI: *MRI, TRI);
838
839 const TargetRegisterClass *SrcRC =
840 TRI.getRegClassForSizeOnBank(Size: SrcSize, Bank: *SrcBank);
841 if (!SrcRC || !RBI.constrainGenericRegister(Reg: SrcReg, RC: *SrcRC, MRI&: *MRI))
842 return false;
843
844 // Note we could have mixed SGPR and VGPR destination banks for an SGPR
845 // source, and this relies on the fact that the same subregister indices are
846 // used for both.
847 ArrayRef<int16_t> SubRegs = TRI.getRegSplitParts(RC: SrcRC, EltSize: DstSize / 8);
848 for (int I = 0, E = NumDst; I != E; ++I) {
849 MachineOperand &Dst = MI.getOperand(i: I);
850 // hi16:sreg_32 is not allowed so explicitly shift upper 16-bits.
851 if (SrcBank->getID() == AMDGPU::SGPRRegBankID &&
852 SubRegs[I] == AMDGPU::hi16) {
853 BuildMI(BB&: *BB, I: &MI, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::S_LSHR_B32), DestReg: Dst.getReg())
854 .addReg(RegNo: SrcReg)
855 .addImm(Val: 16);
856 } else {
857 BuildMI(BB&: *BB, I: &MI, MIMD: DL, MCID: TII.get(Opcode: TargetOpcode::COPY), DestReg: Dst.getReg())
858 .addReg(RegNo: SrcReg, Flags: {}, SubReg: SubRegs[I]);
859 }
860
861 // Make sure the subregister index is valid for the source register.
862 SrcRC = TRI.getSubClassWithSubReg(SrcRC, SubRegs[I]);
863 if (!SrcRC || !RBI.constrainGenericRegister(Reg: SrcReg, RC: *SrcRC, MRI&: *MRI))
864 return false;
865
866 const TargetRegisterClass *DstRC =
867 TRI.getConstrainedRegClassForOperand(MO: Dst, MRI: *MRI);
868 if (DstRC && !RBI.constrainGenericRegister(Reg: Dst.getReg(), RC: *DstRC, MRI&: *MRI))
869 return false;
870 }
871
872 MI.eraseFromParent();
873 return true;
874}
875
876bool AMDGPUInstructionSelector::selectG_BUILD_VECTOR(MachineInstr &MI) const {
877 assert(MI.getOpcode() == AMDGPU::G_BUILD_VECTOR_TRUNC ||
878 MI.getOpcode() == AMDGPU::G_BUILD_VECTOR);
879
880 Register Src0 = MI.getOperand(i: 1).getReg();
881 Register Src1 = MI.getOperand(i: 2).getReg();
882 LLT SrcTy = MRI->getType(Reg: Src0);
883 const unsigned SrcSize = SrcTy.getSizeInBits();
884
885 // BUILD_VECTOR with >=32 bits source is handled by MERGE_VALUE.
886 if (MI.getOpcode() == AMDGPU::G_BUILD_VECTOR && SrcSize >= 32) {
887 return selectG_MERGE_VALUES(MI);
888 }
889
890 // Selection logic below is for V2S16 only.
891 // For G_BUILD_VECTOR_TRUNC, additionally check that the operands are s32.
892 Register Dst = MI.getOperand(i: 0).getReg();
893 if (MRI->getType(Reg: Dst) != LLT::fixed_vector(NumElements: 2, ScalarSizeInBits: 16) ||
894 (MI.getOpcode() == AMDGPU::G_BUILD_VECTOR_TRUNC &&
895 SrcTy != LLT::scalar(SizeInBits: 32)))
896 return selectImpl(I&: MI, CoverageInfo&: *CoverageInfo);
897
898 const RegisterBank *DstBank = RBI.getRegBank(Reg: Dst, MRI: *MRI, TRI);
899 if (DstBank->getID() == AMDGPU::AGPRRegBankID)
900 return false;
901
902 assert(DstBank->getID() == AMDGPU::SGPRRegBankID ||
903 DstBank->getID() == AMDGPU::VGPRRegBankID);
904 const bool IsVector = DstBank->getID() == AMDGPU::VGPRRegBankID;
905
906 const DebugLoc &DL = MI.getDebugLoc();
907 MachineBasicBlock *BB = MI.getParent();
908
909 // First, before trying TableGen patterns, check if both sources are
910 // constants. In those cases, we can trivially compute the final constant
911 // and emit a simple move.
912 auto ConstSrc1 = getAnyConstantVRegValWithLookThrough(VReg: Src1, MRI: *MRI, LookThroughInstrs: true, LookThroughAnyExt: true);
913 if (ConstSrc1) {
914 auto ConstSrc0 =
915 getAnyConstantVRegValWithLookThrough(VReg: Src0, MRI: *MRI, LookThroughInstrs: true, LookThroughAnyExt: true);
916 if (ConstSrc0) {
917 const int64_t K0 = ConstSrc0->Value.getSExtValue();
918 const int64_t K1 = ConstSrc1->Value.getSExtValue();
919 uint32_t Lo16 = static_cast<uint32_t>(K0) & 0xffff;
920 uint32_t Hi16 = static_cast<uint32_t>(K1) & 0xffff;
921 uint32_t Imm = Lo16 | (Hi16 << 16);
922
923 // VALU
924 if (IsVector) {
925 BuildMI(BB&: *BB, I: &MI, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::V_MOV_B32_e32), DestReg: Dst).addImm(Val: Imm);
926 MI.eraseFromParent();
927 return RBI.constrainGenericRegister(Reg: Dst, RC: AMDGPU::VGPR_32RegClass, MRI&: *MRI);
928 }
929
930 // SALU
931 BuildMI(BB&: *BB, I: &MI, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::S_MOV_B32), DestReg: Dst).addImm(Val: Imm);
932 MI.eraseFromParent();
933 return RBI.constrainGenericRegister(Reg: Dst, RC: AMDGPU::SReg_32RegClass, MRI&: *MRI);
934 }
935 }
936
937 // Now try TableGen patterns.
938 if (selectImpl(I&: MI, CoverageInfo&: *CoverageInfo))
939 return true;
940
941 // TODO: This should probably be a combine somewhere
942 // (build_vector $src0, undef) -> copy $src0
943 MachineInstr *Src1Def = getDefIgnoringCopies(Reg: Src1, MRI: *MRI);
944 if (Src1Def->getOpcode() == AMDGPU::G_IMPLICIT_DEF) {
945 MI.setDesc(TII.get(Opcode: AMDGPU::COPY));
946 MI.removeOperand(OpNo: 2);
947 const auto &RC =
948 IsVector ? AMDGPU::VGPR_32RegClass : AMDGPU::SReg_32RegClass;
949 return RBI.constrainGenericRegister(Reg: Dst, RC, MRI&: *MRI) &&
950 RBI.constrainGenericRegister(Reg: Src0, RC, MRI&: *MRI);
951 }
952
953 return selectS16MergeToS32(MI);
954}
955
956bool AMDGPUInstructionSelector::selectG_IMPLICIT_DEF(MachineInstr &I) const {
957 const MachineOperand &MO = I.getOperand(i: 0);
958
959 // FIXME: Interface for getConstrainedRegClassForOperand needs work. The
960 // regbank check here is to know why getConstrainedRegClassForOperand failed.
961 const TargetRegisterClass *RC = TRI.getConstrainedRegClassForOperand(MO, MRI: *MRI);
962 if ((!RC && !MRI->getRegBankOrNull(Reg: MO.getReg())) ||
963 (RC && RBI.constrainGenericRegister(Reg: MO.getReg(), RC: *RC, MRI&: *MRI))) {
964 I.setDesc(TII.get(Opcode: TargetOpcode::IMPLICIT_DEF));
965 return true;
966 }
967
968 return false;
969}
970
971bool AMDGPUInstructionSelector::selectG_INSERT(MachineInstr &I) const {
972 MachineBasicBlock *BB = I.getParent();
973
974 Register DstReg = I.getOperand(i: 0).getReg();
975 Register Src0Reg = I.getOperand(i: 1).getReg();
976 Register Src1Reg = I.getOperand(i: 2).getReg();
977 LLT Src1Ty = MRI->getType(Reg: Src1Reg);
978
979 unsigned DstSize = MRI->getType(Reg: DstReg).getSizeInBits();
980 unsigned InsSize = Src1Ty.getSizeInBits();
981
982 int64_t Offset = I.getOperand(i: 3).getImm();
983
984 // FIXME: These cases should have been illegal and unnecessary to check here.
985 if (Offset % 32 != 0 || InsSize % 32 != 0)
986 return false;
987
988 // Currently not handled by getSubRegFromChannel.
989 if (InsSize > 128)
990 return false;
991
992 unsigned SubReg = TRI.getSubRegFromChannel(Channel: Offset / 32, NumRegs: InsSize / 32);
993 if (SubReg == AMDGPU::NoSubRegister)
994 return false;
995
996 const RegisterBank *DstBank = RBI.getRegBank(Reg: DstReg, MRI: *MRI, TRI);
997 const TargetRegisterClass *DstRC =
998 TRI.getRegClassForSizeOnBank(Size: DstSize, Bank: *DstBank);
999 if (!DstRC)
1000 return false;
1001
1002 const RegisterBank *Src0Bank = RBI.getRegBank(Reg: Src0Reg, MRI: *MRI, TRI);
1003 const RegisterBank *Src1Bank = RBI.getRegBank(Reg: Src1Reg, MRI: *MRI, TRI);
1004 const TargetRegisterClass *Src0RC =
1005 TRI.getRegClassForSizeOnBank(Size: DstSize, Bank: *Src0Bank);
1006 const TargetRegisterClass *Src1RC =
1007 TRI.getRegClassForSizeOnBank(Size: InsSize, Bank: *Src1Bank);
1008
1009 // Deal with weird cases where the class only partially supports the subreg
1010 // index.
1011 Src0RC = TRI.getSubClassWithSubReg(Src0RC, SubReg);
1012 if (!Src0RC || !Src1RC)
1013 return false;
1014
1015 if (!RBI.constrainGenericRegister(Reg: DstReg, RC: *DstRC, MRI&: *MRI) ||
1016 !RBI.constrainGenericRegister(Reg: Src0Reg, RC: *Src0RC, MRI&: *MRI) ||
1017 !RBI.constrainGenericRegister(Reg: Src1Reg, RC: *Src1RC, MRI&: *MRI))
1018 return false;
1019
1020 const DebugLoc &DL = I.getDebugLoc();
1021 BuildMI(BB&: *BB, I: &I, MIMD: DL, MCID: TII.get(Opcode: TargetOpcode::INSERT_SUBREG), DestReg: DstReg)
1022 .addReg(RegNo: Src0Reg)
1023 .addReg(RegNo: Src1Reg)
1024 .addImm(Val: SubReg);
1025
1026 I.eraseFromParent();
1027 return true;
1028}
1029
1030bool AMDGPUInstructionSelector::selectG_SBFX_UBFX(MachineInstr &MI) const {
1031 Register DstReg = MI.getOperand(i: 0).getReg();
1032 Register SrcReg = MI.getOperand(i: 1).getReg();
1033 Register OffsetReg = MI.getOperand(i: 2).getReg();
1034 Register WidthReg = MI.getOperand(i: 3).getReg();
1035
1036 assert(RBI.getRegBank(DstReg, *MRI, TRI)->getID() == AMDGPU::VGPRRegBankID &&
1037 "scalar BFX instructions are expanded in regbankselect");
1038 assert(MRI->getType(MI.getOperand(0).getReg()).getSizeInBits() == 32 &&
1039 "64-bit vector BFX instructions are expanded in regbankselect");
1040
1041 const DebugLoc &DL = MI.getDebugLoc();
1042 MachineBasicBlock *MBB = MI.getParent();
1043
1044 bool IsSigned = MI.getOpcode() == TargetOpcode::G_SBFX;
1045 unsigned Opc = IsSigned ? AMDGPU::V_BFE_I32_e64 : AMDGPU::V_BFE_U32_e64;
1046 auto MIB = BuildMI(BB&: *MBB, I: &MI, MIMD: DL, MCID: TII.get(Opcode: Opc), DestReg: DstReg)
1047 .addReg(RegNo: SrcReg)
1048 .addReg(RegNo: OffsetReg)
1049 .addReg(RegNo: WidthReg);
1050 MI.eraseFromParent();
1051 constrainSelectedInstRegOperands(I&: *MIB, TII, TRI, RBI);
1052 return true;
1053}
1054
1055bool AMDGPUInstructionSelector::selectInterpP1F16(MachineInstr &MI) const {
1056 if (STI.getLDSBankCount() != 16)
1057 return selectImpl(I&: MI, CoverageInfo&: *CoverageInfo);
1058
1059 Register Dst = MI.getOperand(i: 0).getReg();
1060 Register Src0 = MI.getOperand(i: 2).getReg();
1061 Register M0Val = MI.getOperand(i: 6).getReg();
1062 if (!RBI.constrainGenericRegister(Reg: M0Val, RC: AMDGPU::SReg_32RegClass, MRI&: *MRI) ||
1063 !RBI.constrainGenericRegister(Reg: Dst, RC: AMDGPU::VGPR_32RegClass, MRI&: *MRI) ||
1064 !RBI.constrainGenericRegister(Reg: Src0, RC: AMDGPU::VGPR_32RegClass, MRI&: *MRI))
1065 return false;
1066
1067 // This requires 2 instructions. It is possible to write a pattern to support
1068 // this, but the generated isel emitter doesn't correctly deal with multiple
1069 // output instructions using the same physical register input. The copy to m0
1070 // is incorrectly placed before the second instruction.
1071 //
1072 // TODO: Match source modifiers.
1073
1074 Register InterpMov = MRI->createVirtualRegister(RegClass: &AMDGPU::VGPR_32RegClass);
1075 const DebugLoc &DL = MI.getDebugLoc();
1076 MachineBasicBlock *MBB = MI.getParent();
1077
1078 BuildMI(BB&: *MBB, I: &MI, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::COPY), DestReg: AMDGPU::M0)
1079 .addReg(RegNo: M0Val);
1080 BuildMI(BB&: *MBB, I: &MI, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::V_INTERP_MOV_F32), DestReg: InterpMov)
1081 .addImm(Val: 2)
1082 .addImm(Val: MI.getOperand(i: 4).getImm()) // $attr
1083 .addImm(Val: MI.getOperand(i: 3).getImm()); // $attrchan
1084
1085 BuildMI(BB&: *MBB, I: &MI, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::V_INTERP_P1LV_F16), DestReg: Dst)
1086 .addImm(Val: 0) // $src0_modifiers
1087 .addReg(RegNo: Src0) // $src0
1088 .addImm(Val: MI.getOperand(i: 4).getImm()) // $attr
1089 .addImm(Val: MI.getOperand(i: 3).getImm()) // $attrchan
1090 .addImm(Val: 0) // $src2_modifiers
1091 .addReg(RegNo: InterpMov) // $src2 - 2 f16 values selected by high
1092 .addImm(Val: MI.getOperand(i: 5).getImm()) // $high
1093 .addImm(Val: 0) // $clamp
1094 .addImm(Val: 0); // $omod
1095
1096 MI.eraseFromParent();
1097 return true;
1098}
1099
1100// Writelane is special in that it can use SGPR and M0 (which would normally
1101// count as using the constant bus twice - but in this case it is allowed since
1102// the lane selector doesn't count as a use of the constant bus). However, it is
1103// still required to abide by the 1 SGPR rule. Fix this up if we might have
1104// multiple SGPRs.
1105bool AMDGPUInstructionSelector::selectWritelane(MachineInstr &MI) const {
1106 // With a constant bus limit of at least 2, there's no issue.
1107 if (STI.getConstantBusLimit(Opcode: AMDGPU::V_WRITELANE_B32) > 1)
1108 return selectImpl(I&: MI, CoverageInfo&: *CoverageInfo);
1109
1110 MachineBasicBlock *MBB = MI.getParent();
1111 const DebugLoc &DL = MI.getDebugLoc();
1112 Register VDst = MI.getOperand(i: 0).getReg();
1113 Register Val = MI.getOperand(i: 2).getReg();
1114 Register LaneSelect = MI.getOperand(i: 3).getReg();
1115 Register VDstIn = MI.getOperand(i: 4).getReg();
1116
1117 auto MIB = BuildMI(BB&: *MBB, I: &MI, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::V_WRITELANE_B32), DestReg: VDst);
1118
1119 std::optional<ValueAndVReg> ConstSelect =
1120 getIConstantVRegValWithLookThrough(VReg: LaneSelect, MRI: *MRI);
1121 if (ConstSelect) {
1122 // The selector has to be an inline immediate, so we can use whatever for
1123 // the other operands.
1124 MIB.addReg(RegNo: Val);
1125 MIB.addImm(Val: ConstSelect->Value.getSExtValue() &
1126 maskTrailingOnes<uint64_t>(N: STI.getWavefrontSizeLog2()));
1127 } else {
1128 std::optional<ValueAndVReg> ConstVal =
1129 getIConstantVRegValWithLookThrough(VReg: Val, MRI: *MRI);
1130
1131 // If the value written is an inline immediate, we can get away without a
1132 // copy to m0.
1133 if (ConstVal && AMDGPU::isInlinableLiteral32(Literal: ConstVal->Value.getSExtValue(),
1134 HasInv2Pi: STI.hasInv2PiInlineImm())) {
1135 MIB.addImm(Val: ConstVal->Value.getSExtValue());
1136 MIB.addReg(RegNo: LaneSelect);
1137 } else {
1138 MIB.addReg(RegNo: Val);
1139
1140 // If the lane selector was originally in a VGPR and copied with
1141 // readfirstlane, there's a hazard to read the same SGPR from the
1142 // VALU. Constrain to a different SGPR to help avoid needing a nop later.
1143 RBI.constrainGenericRegister(Reg: LaneSelect, RC: AMDGPU::SReg_32_XM0RegClass, MRI&: *MRI);
1144
1145 BuildMI(BB&: *MBB, I&: *MIB, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::COPY), DestReg: AMDGPU::M0)
1146 .addReg(RegNo: LaneSelect);
1147 MIB.addReg(RegNo: AMDGPU::M0);
1148 }
1149 }
1150
1151 MIB.addReg(RegNo: VDstIn);
1152
1153 MI.eraseFromParent();
1154 constrainSelectedInstRegOperands(I&: *MIB, TII, TRI, RBI);
1155 return true;
1156}
1157
1158// We need to handle this here because tablegen doesn't support matching
1159// instructions with multiple outputs.
1160bool AMDGPUInstructionSelector::selectDivScale(MachineInstr &MI) const {
1161 Register Dst0 = MI.getOperand(i: 0).getReg();
1162 Register Dst1 = MI.getOperand(i: 1).getReg();
1163
1164 LLT Ty = MRI->getType(Reg: Dst0);
1165 unsigned Opc;
1166 if (Ty == LLT::scalar(SizeInBits: 32))
1167 Opc = AMDGPU::V_DIV_SCALE_F32_e64;
1168 else if (Ty == LLT::scalar(SizeInBits: 64))
1169 Opc = AMDGPU::V_DIV_SCALE_F64_e64;
1170 else
1171 return false;
1172
1173 // TODO: Match source modifiers.
1174
1175 const DebugLoc &DL = MI.getDebugLoc();
1176 MachineBasicBlock *MBB = MI.getParent();
1177
1178 Register Numer = MI.getOperand(i: 3).getReg();
1179 Register Denom = MI.getOperand(i: 4).getReg();
1180 unsigned ChooseDenom = MI.getOperand(i: 5).getImm();
1181
1182 Register Src0 = ChooseDenom != 0 ? Numer : Denom;
1183
1184 auto MIB = BuildMI(BB&: *MBB, I: &MI, MIMD: DL, MCID: TII.get(Opcode: Opc), DestReg: Dst0)
1185 .addDef(RegNo: Dst1)
1186 .addImm(Val: 0) // $src0_modifiers
1187 .addUse(RegNo: Src0) // $src0
1188 .addImm(Val: 0) // $src1_modifiers
1189 .addUse(RegNo: Denom) // $src1
1190 .addImm(Val: 0) // $src2_modifiers
1191 .addUse(RegNo: Numer) // $src2
1192 .addImm(Val: 0) // $clamp
1193 .addImm(Val: 0); // $omod
1194
1195 MI.eraseFromParent();
1196 constrainSelectedInstRegOperands(I&: *MIB, TII, TRI, RBI);
1197 return true;
1198}
1199
1200bool AMDGPUInstructionSelector::selectG_INTRINSIC(MachineInstr &I) const {
1201 Intrinsic::ID IntrinsicID = cast<GIntrinsic>(Val&: I).getIntrinsicID();
1202 switch (IntrinsicID) {
1203 case Intrinsic::amdgcn_if_break: {
1204 MachineBasicBlock *BB = I.getParent();
1205
1206 // FIXME: Manually selecting to avoid dealing with the SReg_1 trick
1207 // SelectionDAG uses for wave32 vs wave64.
1208 BuildMI(BB&: *BB, I: &I, MIMD: I.getDebugLoc(), MCID: TII.get(Opcode: AMDGPU::SI_IF_BREAK))
1209 .add(MO: I.getOperand(i: 0))
1210 .add(MO: I.getOperand(i: 2))
1211 .add(MO: I.getOperand(i: 3));
1212
1213 Register DstReg = I.getOperand(i: 0).getReg();
1214 Register Src0Reg = I.getOperand(i: 2).getReg();
1215 Register Src1Reg = I.getOperand(i: 3).getReg();
1216
1217 I.eraseFromParent();
1218
1219 for (Register Reg : { DstReg, Src0Reg, Src1Reg })
1220 MRI->setRegClass(Reg, RC: TRI.getWaveMaskRegClass());
1221
1222 return true;
1223 }
1224 case Intrinsic::amdgcn_interp_p1_f16:
1225 return selectInterpP1F16(MI&: I);
1226 case Intrinsic::amdgcn_wqm:
1227 return constrainCopyLikeIntrin(MI&: I, NewOpc: AMDGPU::WQM);
1228 case Intrinsic::amdgcn_softwqm:
1229 return constrainCopyLikeIntrin(MI&: I, NewOpc: AMDGPU::SOFT_WQM);
1230 case Intrinsic::amdgcn_strict_wwm:
1231 case Intrinsic::amdgcn_wwm:
1232 return constrainCopyLikeIntrin(MI&: I, NewOpc: AMDGPU::STRICT_WWM);
1233 case Intrinsic::amdgcn_strict_wqm:
1234 return constrainCopyLikeIntrin(MI&: I, NewOpc: AMDGPU::STRICT_WQM);
1235 case Intrinsic::amdgcn_writelane:
1236 return selectWritelane(MI&: I);
1237 case Intrinsic::amdgcn_div_scale:
1238 return selectDivScale(MI&: I);
1239 case Intrinsic::amdgcn_icmp:
1240 case Intrinsic::amdgcn_fcmp:
1241 if (selectImpl(I, CoverageInfo&: *CoverageInfo))
1242 return true;
1243 return selectIntrinsicCmp(MI&: I);
1244 case Intrinsic::amdgcn_ballot:
1245 return selectBallot(I);
1246 case Intrinsic::amdgcn_reloc_constant:
1247 return selectRelocConstant(I);
1248 case Intrinsic::amdgcn_groupstaticsize:
1249 return selectGroupStaticSize(I);
1250 case Intrinsic::returnaddress:
1251 return selectReturnAddress(I);
1252 case Intrinsic::amdgcn_smfmac_f32_16x16x32_f16:
1253 case Intrinsic::amdgcn_smfmac_f32_32x32x16_f16:
1254 case Intrinsic::amdgcn_smfmac_f32_16x16x32_bf16:
1255 case Intrinsic::amdgcn_smfmac_f32_32x32x16_bf16:
1256 case Intrinsic::amdgcn_smfmac_i32_16x16x64_i8:
1257 case Intrinsic::amdgcn_smfmac_i32_32x32x32_i8:
1258 case Intrinsic::amdgcn_smfmac_f32_16x16x64_bf8_bf8:
1259 case Intrinsic::amdgcn_smfmac_f32_16x16x64_bf8_fp8:
1260 case Intrinsic::amdgcn_smfmac_f32_16x16x64_fp8_bf8:
1261 case Intrinsic::amdgcn_smfmac_f32_16x16x64_fp8_fp8:
1262 case Intrinsic::amdgcn_smfmac_f32_32x32x32_bf8_bf8:
1263 case Intrinsic::amdgcn_smfmac_f32_32x32x32_bf8_fp8:
1264 case Intrinsic::amdgcn_smfmac_f32_32x32x32_fp8_bf8:
1265 case Intrinsic::amdgcn_smfmac_f32_32x32x32_fp8_fp8:
1266 case Intrinsic::amdgcn_smfmac_f32_16x16x64_f16:
1267 case Intrinsic::amdgcn_smfmac_f32_32x32x32_f16:
1268 case Intrinsic::amdgcn_smfmac_f32_16x16x64_bf16:
1269 case Intrinsic::amdgcn_smfmac_f32_32x32x32_bf16:
1270 case Intrinsic::amdgcn_smfmac_i32_16x16x128_i8:
1271 case Intrinsic::amdgcn_smfmac_i32_32x32x64_i8:
1272 case Intrinsic::amdgcn_smfmac_f32_16x16x128_bf8_bf8:
1273 case Intrinsic::amdgcn_smfmac_f32_16x16x128_bf8_fp8:
1274 case Intrinsic::amdgcn_smfmac_f32_16x16x128_fp8_bf8:
1275 case Intrinsic::amdgcn_smfmac_f32_16x16x128_fp8_fp8:
1276 case Intrinsic::amdgcn_smfmac_f32_32x32x64_bf8_bf8:
1277 case Intrinsic::amdgcn_smfmac_f32_32x32x64_bf8_fp8:
1278 case Intrinsic::amdgcn_smfmac_f32_32x32x64_fp8_bf8:
1279 case Intrinsic::amdgcn_smfmac_f32_32x32x64_fp8_fp8:
1280 return selectSMFMACIntrin(I);
1281 case Intrinsic::amdgcn_permlane16_swap:
1282 case Intrinsic::amdgcn_permlane32_swap:
1283 return selectPermlaneSwapIntrin(I, IntrID: IntrinsicID);
1284 case Intrinsic::amdgcn_wave_shuffle:
1285 return selectWaveShuffleIntrin(I);
1286 case Intrinsic::amdgcn_fma_legacy:
1287 if (!STI.hasFmaLegacy32Insts()) {
1288 diagnoseUnsupportedIntrinsic(I);
1289 return false;
1290 }
1291 return selectImpl(I, CoverageInfo&: *CoverageInfo);
1292 case Intrinsic::amdgcn_sudot4:
1293 case Intrinsic::amdgcn_sudot8:
1294 if (!STI.hasDot8Insts()) {
1295 diagnoseUnsupportedIntrinsic(I);
1296 return false;
1297 }
1298 return selectImpl(I, CoverageInfo&: *CoverageInfo);
1299 case Intrinsic::amdgcn_permlane16:
1300 case Intrinsic::amdgcn_permlanex16:
1301 if (!STI.hasPermlane16Insts()) {
1302 diagnoseUnsupportedIntrinsic(I);
1303 return false;
1304 }
1305 return selectImpl(I, CoverageInfo&: *CoverageInfo);
1306 case Intrinsic::amdgcn_mov_dpp8:
1307 if (!STI.hasDPP8()) {
1308 diagnoseUnsupportedIntrinsic(I);
1309 return false;
1310 }
1311 return selectImpl(I, CoverageInfo&: *CoverageInfo);
1312 case Intrinsic::amdgcn_tanh:
1313 if (!STI.hasTanhInsts()) {
1314 diagnoseUnsupportedIntrinsic(I);
1315 return false;
1316 }
1317 return selectImpl(I, CoverageInfo&: *CoverageInfo);
1318 default:
1319 return selectImpl(I, CoverageInfo&: *CoverageInfo);
1320 }
1321}
1322
1323static int getV_CMPOpcode(CmpInst::Predicate P, unsigned Size,
1324 const GCNSubtarget &ST) {
1325 if (Size != 16 && Size != 32 && Size != 64)
1326 return -1;
1327
1328 if (Size == 16 && !ST.has16BitInsts())
1329 return -1;
1330
1331 const auto Select = [&](unsigned S16Opc, unsigned TrueS16Opc,
1332 unsigned FakeS16Opc, unsigned S32Opc,
1333 unsigned S64Opc) {
1334 if (Size == 16)
1335 return ST.hasTrue16BitInsts()
1336 ? ST.useRealTrue16Insts() ? TrueS16Opc : FakeS16Opc
1337 : S16Opc;
1338 if (Size == 32)
1339 return S32Opc;
1340 return S64Opc;
1341 };
1342
1343 switch (P) {
1344 default:
1345 llvm_unreachable("Unknown condition code!");
1346 case CmpInst::ICMP_NE:
1347 return Select(AMDGPU::V_CMP_NE_U16_e64, AMDGPU::V_CMP_NE_U16_t16_e64,
1348 AMDGPU::V_CMP_NE_U16_fake16_e64, AMDGPU::V_CMP_NE_U32_e64,
1349 AMDGPU::V_CMP_NE_U64_e64);
1350 case CmpInst::ICMP_EQ:
1351 return Select(AMDGPU::V_CMP_EQ_U16_e64, AMDGPU::V_CMP_EQ_U16_t16_e64,
1352 AMDGPU::V_CMP_EQ_U16_fake16_e64, AMDGPU::V_CMP_EQ_U32_e64,
1353 AMDGPU::V_CMP_EQ_U64_e64);
1354 case CmpInst::ICMP_SGT:
1355 return Select(AMDGPU::V_CMP_GT_I16_e64, AMDGPU::V_CMP_GT_I16_t16_e64,
1356 AMDGPU::V_CMP_GT_I16_fake16_e64, AMDGPU::V_CMP_GT_I32_e64,
1357 AMDGPU::V_CMP_GT_I64_e64);
1358 case CmpInst::ICMP_SGE:
1359 return Select(AMDGPU::V_CMP_GE_I16_e64, AMDGPU::V_CMP_GE_I16_t16_e64,
1360 AMDGPU::V_CMP_GE_I16_fake16_e64, AMDGPU::V_CMP_GE_I32_e64,
1361 AMDGPU::V_CMP_GE_I64_e64);
1362 case CmpInst::ICMP_SLT:
1363 return Select(AMDGPU::V_CMP_LT_I16_e64, AMDGPU::V_CMP_LT_I16_t16_e64,
1364 AMDGPU::V_CMP_LT_I16_fake16_e64, AMDGPU::V_CMP_LT_I32_e64,
1365 AMDGPU::V_CMP_LT_I64_e64);
1366 case CmpInst::ICMP_SLE:
1367 return Select(AMDGPU::V_CMP_LE_I16_e64, AMDGPU::V_CMP_LE_I16_t16_e64,
1368 AMDGPU::V_CMP_LE_I16_fake16_e64, AMDGPU::V_CMP_LE_I32_e64,
1369 AMDGPU::V_CMP_LE_I64_e64);
1370 case CmpInst::ICMP_UGT:
1371 return Select(AMDGPU::V_CMP_GT_U16_e64, AMDGPU::V_CMP_GT_U16_t16_e64,
1372 AMDGPU::V_CMP_GT_U16_fake16_e64, AMDGPU::V_CMP_GT_U32_e64,
1373 AMDGPU::V_CMP_GT_U64_e64);
1374 case CmpInst::ICMP_UGE:
1375 return Select(AMDGPU::V_CMP_GE_U16_e64, AMDGPU::V_CMP_GE_U16_t16_e64,
1376 AMDGPU::V_CMP_GE_U16_fake16_e64, AMDGPU::V_CMP_GE_U32_e64,
1377 AMDGPU::V_CMP_GE_U64_e64);
1378 case CmpInst::ICMP_ULT:
1379 return Select(AMDGPU::V_CMP_LT_U16_e64, AMDGPU::V_CMP_LT_U16_t16_e64,
1380 AMDGPU::V_CMP_LT_U16_fake16_e64, AMDGPU::V_CMP_LT_U32_e64,
1381 AMDGPU::V_CMP_LT_U64_e64);
1382 case CmpInst::ICMP_ULE:
1383 return Select(AMDGPU::V_CMP_LE_U16_e64, AMDGPU::V_CMP_LE_U16_t16_e64,
1384 AMDGPU::V_CMP_LE_U16_fake16_e64, AMDGPU::V_CMP_LE_U32_e64,
1385 AMDGPU::V_CMP_LE_U64_e64);
1386
1387 case CmpInst::FCMP_OEQ:
1388 return Select(AMDGPU::V_CMP_EQ_F16_e64, AMDGPU::V_CMP_EQ_F16_t16_e64,
1389 AMDGPU::V_CMP_EQ_F16_fake16_e64, AMDGPU::V_CMP_EQ_F32_e64,
1390 AMDGPU::V_CMP_EQ_F64_e64);
1391 case CmpInst::FCMP_OGT:
1392 return Select(AMDGPU::V_CMP_GT_F16_e64, AMDGPU::V_CMP_GT_F16_t16_e64,
1393 AMDGPU::V_CMP_GT_F16_fake16_e64, AMDGPU::V_CMP_GT_F32_e64,
1394 AMDGPU::V_CMP_GT_F64_e64);
1395 case CmpInst::FCMP_OGE:
1396 return Select(AMDGPU::V_CMP_GE_F16_e64, AMDGPU::V_CMP_GE_F16_t16_e64,
1397 AMDGPU::V_CMP_GE_F16_fake16_e64, AMDGPU::V_CMP_GE_F32_e64,
1398 AMDGPU::V_CMP_GE_F64_e64);
1399 case CmpInst::FCMP_OLT:
1400 return Select(AMDGPU::V_CMP_LT_F16_e64, AMDGPU::V_CMP_LT_F16_t16_e64,
1401 AMDGPU::V_CMP_LT_F16_fake16_e64, AMDGPU::V_CMP_LT_F32_e64,
1402 AMDGPU::V_CMP_LT_F64_e64);
1403 case CmpInst::FCMP_OLE:
1404 return Select(AMDGPU::V_CMP_LE_F16_e64, AMDGPU::V_CMP_LE_F16_t16_e64,
1405 AMDGPU::V_CMP_LE_F16_fake16_e64, AMDGPU::V_CMP_LE_F32_e64,
1406 AMDGPU::V_CMP_LE_F64_e64);
1407 case CmpInst::FCMP_ONE:
1408 return Select(AMDGPU::V_CMP_NEQ_F16_e64, AMDGPU::V_CMP_NEQ_F16_t16_e64,
1409 AMDGPU::V_CMP_NEQ_F16_fake16_e64, AMDGPU::V_CMP_NEQ_F32_e64,
1410 AMDGPU::V_CMP_NEQ_F64_e64);
1411 case CmpInst::FCMP_ORD:
1412 return Select(AMDGPU::V_CMP_O_F16_e64, AMDGPU::V_CMP_O_F16_t16_e64,
1413 AMDGPU::V_CMP_O_F16_fake16_e64, AMDGPU::V_CMP_O_F32_e64,
1414 AMDGPU::V_CMP_O_F64_e64);
1415 case CmpInst::FCMP_UNO:
1416 return Select(AMDGPU::V_CMP_U_F16_e64, AMDGPU::V_CMP_U_F16_t16_e64,
1417 AMDGPU::V_CMP_U_F16_fake16_e64, AMDGPU::V_CMP_U_F32_e64,
1418 AMDGPU::V_CMP_U_F64_e64);
1419 case CmpInst::FCMP_UEQ:
1420 return Select(AMDGPU::V_CMP_NLG_F16_e64, AMDGPU::V_CMP_NLG_F16_t16_e64,
1421 AMDGPU::V_CMP_NLG_F16_fake16_e64, AMDGPU::V_CMP_NLG_F32_e64,
1422 AMDGPU::V_CMP_NLG_F64_e64);
1423 case CmpInst::FCMP_UGT:
1424 return Select(AMDGPU::V_CMP_NLE_F16_e64, AMDGPU::V_CMP_NLE_F16_t16_e64,
1425 AMDGPU::V_CMP_NLE_F16_fake16_e64, AMDGPU::V_CMP_NLE_F32_e64,
1426 AMDGPU::V_CMP_NLE_F64_e64);
1427 case CmpInst::FCMP_UGE:
1428 return Select(AMDGPU::V_CMP_NLT_F16_e64, AMDGPU::V_CMP_NLT_F16_t16_e64,
1429 AMDGPU::V_CMP_NLT_F16_fake16_e64, AMDGPU::V_CMP_NLT_F32_e64,
1430 AMDGPU::V_CMP_NLT_F64_e64);
1431 case CmpInst::FCMP_ULT:
1432 return Select(AMDGPU::V_CMP_NGE_F16_e64, AMDGPU::V_CMP_NGE_F16_t16_e64,
1433 AMDGPU::V_CMP_NGE_F16_fake16_e64, AMDGPU::V_CMP_NGE_F32_e64,
1434 AMDGPU::V_CMP_NGE_F64_e64);
1435 case CmpInst::FCMP_ULE:
1436 return Select(AMDGPU::V_CMP_NGT_F16_e64, AMDGPU::V_CMP_NGT_F16_t16_e64,
1437 AMDGPU::V_CMP_NGT_F16_fake16_e64, AMDGPU::V_CMP_NGT_F32_e64,
1438 AMDGPU::V_CMP_NGT_F64_e64);
1439 case CmpInst::FCMP_UNE:
1440 return Select(AMDGPU::V_CMP_NEQ_F16_e64, AMDGPU::V_CMP_NEQ_F16_t16_e64,
1441 AMDGPU::V_CMP_NEQ_F16_fake16_e64, AMDGPU::V_CMP_NEQ_F32_e64,
1442 AMDGPU::V_CMP_NEQ_F64_e64);
1443 case CmpInst::FCMP_TRUE:
1444 return Select(AMDGPU::V_CMP_TRU_F16_e64, AMDGPU::V_CMP_TRU_F16_t16_e64,
1445 AMDGPU::V_CMP_TRU_F16_fake16_e64, AMDGPU::V_CMP_TRU_F32_e64,
1446 AMDGPU::V_CMP_TRU_F64_e64);
1447 case CmpInst::FCMP_FALSE:
1448 return Select(AMDGPU::V_CMP_F_F16_e64, AMDGPU::V_CMP_F_F16_t16_e64,
1449 AMDGPU::V_CMP_F_F16_fake16_e64, AMDGPU::V_CMP_F_F32_e64,
1450 AMDGPU::V_CMP_F_F64_e64);
1451 }
1452}
1453
1454int AMDGPUInstructionSelector::getS_CMPOpcode(CmpInst::Predicate P,
1455 unsigned Size) const {
1456 if (Size == 64) {
1457 if (!STI.hasScalarCompareEq64())
1458 return -1;
1459
1460 switch (P) {
1461 case CmpInst::ICMP_NE:
1462 return AMDGPU::S_CMP_LG_U64;
1463 case CmpInst::ICMP_EQ:
1464 return AMDGPU::S_CMP_EQ_U64;
1465 default:
1466 return -1;
1467 }
1468 }
1469
1470 if (Size == 32) {
1471 switch (P) {
1472 case CmpInst::ICMP_NE:
1473 return AMDGPU::S_CMP_LG_U32;
1474 case CmpInst::ICMP_EQ:
1475 return AMDGPU::S_CMP_EQ_U32;
1476 case CmpInst::ICMP_SGT:
1477 return AMDGPU::S_CMP_GT_I32;
1478 case CmpInst::ICMP_SGE:
1479 return AMDGPU::S_CMP_GE_I32;
1480 case CmpInst::ICMP_SLT:
1481 return AMDGPU::S_CMP_LT_I32;
1482 case CmpInst::ICMP_SLE:
1483 return AMDGPU::S_CMP_LE_I32;
1484 case CmpInst::ICMP_UGT:
1485 return AMDGPU::S_CMP_GT_U32;
1486 case CmpInst::ICMP_UGE:
1487 return AMDGPU::S_CMP_GE_U32;
1488 case CmpInst::ICMP_ULT:
1489 return AMDGPU::S_CMP_LT_U32;
1490 case CmpInst::ICMP_ULE:
1491 return AMDGPU::S_CMP_LE_U32;
1492 case CmpInst::FCMP_OEQ:
1493 return AMDGPU::S_CMP_EQ_F32;
1494 case CmpInst::FCMP_OGT:
1495 return AMDGPU::S_CMP_GT_F32;
1496 case CmpInst::FCMP_OGE:
1497 return AMDGPU::S_CMP_GE_F32;
1498 case CmpInst::FCMP_OLT:
1499 return AMDGPU::S_CMP_LT_F32;
1500 case CmpInst::FCMP_OLE:
1501 return AMDGPU::S_CMP_LE_F32;
1502 case CmpInst::FCMP_ONE:
1503 return AMDGPU::S_CMP_LG_F32;
1504 case CmpInst::FCMP_ORD:
1505 return AMDGPU::S_CMP_O_F32;
1506 case CmpInst::FCMP_UNO:
1507 return AMDGPU::S_CMP_U_F32;
1508 case CmpInst::FCMP_UEQ:
1509 return AMDGPU::S_CMP_NLG_F32;
1510 case CmpInst::FCMP_UGT:
1511 return AMDGPU::S_CMP_NLE_F32;
1512 case CmpInst::FCMP_UGE:
1513 return AMDGPU::S_CMP_NLT_F32;
1514 case CmpInst::FCMP_ULT:
1515 return AMDGPU::S_CMP_NGE_F32;
1516 case CmpInst::FCMP_ULE:
1517 return AMDGPU::S_CMP_NGT_F32;
1518 case CmpInst::FCMP_UNE:
1519 return AMDGPU::S_CMP_NEQ_F32;
1520 default:
1521 llvm_unreachable("Unknown condition code!");
1522 }
1523 }
1524
1525 if (Size == 16) {
1526 if (!STI.hasSALUFloatInsts())
1527 return -1;
1528
1529 switch (P) {
1530 case CmpInst::FCMP_OEQ:
1531 return AMDGPU::S_CMP_EQ_F16;
1532 case CmpInst::FCMP_OGT:
1533 return AMDGPU::S_CMP_GT_F16;
1534 case CmpInst::FCMP_OGE:
1535 return AMDGPU::S_CMP_GE_F16;
1536 case CmpInst::FCMP_OLT:
1537 return AMDGPU::S_CMP_LT_F16;
1538 case CmpInst::FCMP_OLE:
1539 return AMDGPU::S_CMP_LE_F16;
1540 case CmpInst::FCMP_ONE:
1541 return AMDGPU::S_CMP_LG_F16;
1542 case CmpInst::FCMP_ORD:
1543 return AMDGPU::S_CMP_O_F16;
1544 case CmpInst::FCMP_UNO:
1545 return AMDGPU::S_CMP_U_F16;
1546 case CmpInst::FCMP_UEQ:
1547 return AMDGPU::S_CMP_NLG_F16;
1548 case CmpInst::FCMP_UGT:
1549 return AMDGPU::S_CMP_NLE_F16;
1550 case CmpInst::FCMP_UGE:
1551 return AMDGPU::S_CMP_NLT_F16;
1552 case CmpInst::FCMP_ULT:
1553 return AMDGPU::S_CMP_NGE_F16;
1554 case CmpInst::FCMP_ULE:
1555 return AMDGPU::S_CMP_NGT_F16;
1556 case CmpInst::FCMP_UNE:
1557 return AMDGPU::S_CMP_NEQ_F16;
1558 default:
1559 llvm_unreachable("Unknown condition code!");
1560 }
1561 }
1562
1563 return -1;
1564}
1565
1566bool AMDGPUInstructionSelector::selectG_ICMP_or_FCMP(MachineInstr &I) const {
1567
1568 MachineBasicBlock *BB = I.getParent();
1569 const DebugLoc &DL = I.getDebugLoc();
1570
1571 Register SrcReg = I.getOperand(i: 2).getReg();
1572 unsigned Size = RBI.getSizeInBits(Reg: SrcReg, MRI: *MRI, TRI);
1573
1574 auto Pred = (CmpInst::Predicate)I.getOperand(i: 1).getPredicate();
1575
1576 Register CCReg = I.getOperand(i: 0).getReg();
1577 if (!isVCC(Reg: CCReg, MRI: *MRI)) {
1578 int Opcode = getS_CMPOpcode(P: Pred, Size);
1579 if (Opcode == -1)
1580 return false;
1581 MachineInstr *ICmp = BuildMI(BB&: *BB, I: &I, MIMD: DL, MCID: TII.get(Opcode))
1582 .add(MO: I.getOperand(i: 2))
1583 .add(MO: I.getOperand(i: 3));
1584 BuildMI(BB&: *BB, I: &I, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::COPY), DestReg: CCReg)
1585 .addReg(RegNo: AMDGPU::SCC);
1586 constrainSelectedInstRegOperands(I&: *ICmp, TII, TRI, RBI);
1587 bool Ret =
1588 RBI.constrainGenericRegister(Reg: CCReg, RC: AMDGPU::SReg_32RegClass, MRI&: *MRI);
1589 I.eraseFromParent();
1590 return Ret;
1591 }
1592
1593 if (I.getOpcode() == AMDGPU::G_FCMP)
1594 return false;
1595
1596 int Opcode = getV_CMPOpcode(P: Pred, Size, ST: *Subtarget);
1597 if (Opcode == -1)
1598 return false;
1599
1600 MachineInstrBuilder ICmp;
1601 // t16 instructions
1602 if (AMDGPU::hasNamedOperand(Opcode, NamedIdx: AMDGPU::OpName::src0_modifiers)) {
1603 ICmp = BuildMI(BB&: *BB, I: &I, MIMD: DL, MCID: TII.get(Opcode), DestReg: I.getOperand(i: 0).getReg())
1604 .addImm(Val: 0)
1605 .add(MO: I.getOperand(i: 2))
1606 .addImm(Val: 0)
1607 .add(MO: I.getOperand(i: 3))
1608 .addImm(Val: 0); // op_sel
1609 } else {
1610 ICmp = BuildMI(BB&: *BB, I: &I, MIMD: DL, MCID: TII.get(Opcode), DestReg: I.getOperand(i: 0).getReg())
1611 .add(MO: I.getOperand(i: 2))
1612 .add(MO: I.getOperand(i: 3));
1613 }
1614
1615 RBI.constrainGenericRegister(Reg: ICmp->getOperand(i: 0).getReg(),
1616 RC: *TRI.getBoolRC(), MRI&: *MRI);
1617 constrainSelectedInstRegOperands(I&: *ICmp, TII, TRI, RBI);
1618 I.eraseFromParent();
1619 return true;
1620}
1621
1622bool AMDGPUInstructionSelector::selectIntrinsicCmp(MachineInstr &I) const {
1623 Register Dst = I.getOperand(i: 0).getReg();
1624 if (isVCC(Reg: Dst, MRI: *MRI))
1625 return false;
1626
1627 LLT DstTy = MRI->getType(Reg: Dst);
1628 if (DstTy.getSizeInBits() != STI.getWavefrontSize())
1629 return false;
1630
1631 MachineBasicBlock *BB = I.getParent();
1632 const DebugLoc &DL = I.getDebugLoc();
1633 Register SrcReg = I.getOperand(i: 2).getReg();
1634 unsigned Size = RBI.getSizeInBits(Reg: SrcReg, MRI: *MRI, TRI);
1635
1636 // i1 inputs are not supported in GlobalISel.
1637 if (Size == 1)
1638 return false;
1639
1640 auto Pred = static_cast<CmpInst::Predicate>(I.getOperand(i: 4).getImm());
1641 if (!CmpInst::isIntPredicate(P: Pred) && !CmpInst::isFPPredicate(P: Pred)) {
1642 BuildMI(BB&: *BB, I: &I, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::IMPLICIT_DEF), DestReg: Dst);
1643 I.eraseFromParent();
1644 return RBI.constrainGenericRegister(Reg: Dst, RC: *TRI.getBoolRC(), MRI&: *MRI);
1645 }
1646
1647 const int Opcode = getV_CMPOpcode(P: Pred, Size, ST: *Subtarget);
1648 if (Opcode == -1)
1649 return false;
1650
1651 MachineInstrBuilder SelectedMI;
1652 MachineOperand &LHS = I.getOperand(i: 2);
1653 MachineOperand &RHS = I.getOperand(i: 3);
1654 auto [Src0, Src0Mods] = selectVOP3ModsImpl(Src: LHS.getReg());
1655 auto [Src1, Src1Mods] = selectVOP3ModsImpl(Src: RHS.getReg());
1656 Register Src0Reg =
1657 copyToVGPRIfSrcFolded(Src: Src0, Mods: Src0Mods, Root: LHS, InsertPt: &I, /*ForceVGPR*/ true);
1658 Register Src1Reg =
1659 copyToVGPRIfSrcFolded(Src: Src1, Mods: Src1Mods, Root: RHS, InsertPt: &I, /*ForceVGPR*/ true);
1660 SelectedMI = BuildMI(BB&: *BB, I: &I, MIMD: DL, MCID: TII.get(Opcode), DestReg: Dst);
1661 if (AMDGPU::hasNamedOperand(Opcode, NamedIdx: AMDGPU::OpName::src0_modifiers))
1662 SelectedMI.addImm(Val: Src0Mods);
1663 SelectedMI.addReg(RegNo: Src0Reg);
1664 if (AMDGPU::hasNamedOperand(Opcode, NamedIdx: AMDGPU::OpName::src1_modifiers))
1665 SelectedMI.addImm(Val: Src1Mods);
1666 SelectedMI.addReg(RegNo: Src1Reg);
1667 if (AMDGPU::hasNamedOperand(Opcode, NamedIdx: AMDGPU::OpName::clamp))
1668 SelectedMI.addImm(Val: 0); // clamp
1669 if (AMDGPU::hasNamedOperand(Opcode, NamedIdx: AMDGPU::OpName::op_sel))
1670 SelectedMI.addImm(Val: 0); // op_sel
1671
1672 RBI.constrainGenericRegister(Reg: Dst, RC: *TRI.getBoolRC(), MRI&: *MRI);
1673 constrainSelectedInstRegOperands(I&: *SelectedMI, TII, TRI, RBI);
1674
1675 I.eraseFromParent();
1676 return true;
1677}
1678
1679// Ballot has to zero bits in input lane-mask that are zero in current exec,
1680// Done as AND with exec. For inputs that are results of instruction that
1681// implicitly use same exec, for example compares in same basic block or SCC to
1682// VCC copy, use copy.
1683static bool isLaneMaskFromSameBlock(Register Reg, MachineRegisterInfo &MRI,
1684 MachineBasicBlock *MBB) {
1685 MachineInstr *MI = MRI.getVRegDef(Reg);
1686 if (MI->getParent() != MBB)
1687 return false;
1688
1689 // Lane mask generated by SCC to VCC copy.
1690 if (MI->getOpcode() == AMDGPU::COPY) {
1691 auto DstRB = MRI.getRegBankOrNull(Reg: MI->getOperand(i: 0).getReg());
1692 auto SrcRB = MRI.getRegBankOrNull(Reg: MI->getOperand(i: 1).getReg());
1693 if (DstRB && SrcRB && DstRB->getID() == AMDGPU::VCCRegBankID &&
1694 SrcRB->getID() == AMDGPU::SGPRRegBankID)
1695 return true;
1696 }
1697
1698 // Lane mask generated by SCC to VCC copy
1699 if (MI->getOpcode() == AMDGPU::G_AMDGPU_COPY_VCC_SCC)
1700 return true;
1701
1702 // Lane mask generated using compare with same exec.
1703 if (isa<GAnyCmp>(Val: MI))
1704 return true;
1705
1706 Register LHS, RHS;
1707 // Look through AND.
1708 if (mi_match(R: Reg, MRI, P: m_GAnd(L: m_Reg(R&: LHS), R: m_Reg(R&: RHS))))
1709 return isLaneMaskFromSameBlock(Reg: LHS, MRI, MBB) ||
1710 isLaneMaskFromSameBlock(Reg: RHS, MRI, MBB);
1711
1712 return false;
1713}
1714
1715bool AMDGPUInstructionSelector::selectBallot(MachineInstr &I) const {
1716 MachineBasicBlock *BB = I.getParent();
1717 const DebugLoc &DL = I.getDebugLoc();
1718 Register DstReg = I.getOperand(i: 0).getReg();
1719 Register SrcReg = I.getOperand(i: 2).getReg();
1720 const unsigned BallotSize = MRI->getType(Reg: DstReg).getSizeInBits();
1721 const unsigned WaveSize = STI.getWavefrontSize();
1722
1723 // In the common case, the return type matches the wave size.
1724 // However we also support emitting i64 ballots in wave32 mode.
1725 if (BallotSize != WaveSize && (BallotSize != 64 || WaveSize != 32))
1726 return false;
1727
1728 std::optional<ValueAndVReg> Arg =
1729 getIConstantVRegValWithLookThrough(VReg: SrcReg, MRI: *MRI);
1730
1731 Register Dst = DstReg;
1732 // i64 ballot on Wave32: new Dst(i32) for WaveSize ballot.
1733 if (BallotSize != WaveSize) {
1734 Dst = MRI->createVirtualRegister(RegClass: TRI.getBoolRC());
1735 }
1736
1737 if (Arg) {
1738 const int64_t Value = Arg->Value.getZExtValue();
1739 if (Value == 0) {
1740 // Dst = S_MOV 0
1741 unsigned Opcode = WaveSize == 64 ? AMDGPU::S_MOV_B64 : AMDGPU::S_MOV_B32;
1742 BuildMI(BB&: *BB, I: &I, MIMD: DL, MCID: TII.get(Opcode), DestReg: Dst).addImm(Val: 0);
1743 } else {
1744 // Dst = COPY EXEC
1745 assert(Value == 1);
1746 BuildMI(BB&: *BB, I: &I, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::COPY), DestReg: Dst).addReg(RegNo: TRI.getExec());
1747 }
1748 if (!RBI.constrainGenericRegister(Reg: Dst, RC: *TRI.getBoolRC(), MRI&: *MRI))
1749 return false;
1750 } else {
1751 if (isLaneMaskFromSameBlock(Reg: SrcReg, MRI&: *MRI, MBB: BB)) {
1752 // Dst = COPY SrcReg
1753 BuildMI(BB&: *BB, I: &I, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::COPY), DestReg: Dst).addReg(RegNo: SrcReg);
1754 if (!RBI.constrainGenericRegister(Reg: Dst, RC: *TRI.getBoolRC(), MRI&: *MRI))
1755 return false;
1756 } else {
1757 // Dst = S_AND SrcReg, EXEC
1758 unsigned AndOpc = WaveSize == 64 ? AMDGPU::S_AND_B64 : AMDGPU::S_AND_B32;
1759 auto And = BuildMI(BB&: *BB, I: &I, MIMD: DL, MCID: TII.get(Opcode: AndOpc), DestReg: Dst)
1760 .addReg(RegNo: SrcReg)
1761 .addReg(RegNo: TRI.getExec())
1762 .setOperandDead(3); // Dead scc
1763 constrainSelectedInstRegOperands(I&: *And, TII, TRI, RBI);
1764 }
1765 }
1766
1767 // i64 ballot on Wave32: zero-extend i32 ballot to i64.
1768 if (BallotSize != WaveSize) {
1769 Register HiReg = MRI->createVirtualRegister(RegClass: &AMDGPU::SReg_32RegClass);
1770 BuildMI(BB&: *BB, I: &I, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::S_MOV_B32), DestReg: HiReg).addImm(Val: 0);
1771 BuildMI(BB&: *BB, I: &I, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::REG_SEQUENCE), DestReg: DstReg)
1772 .addReg(RegNo: Dst)
1773 .addImm(Val: AMDGPU::sub0)
1774 .addReg(RegNo: HiReg)
1775 .addImm(Val: AMDGPU::sub1);
1776 }
1777
1778 I.eraseFromParent();
1779 return true;
1780}
1781
1782bool AMDGPUInstructionSelector::selectRelocConstant(MachineInstr &I) const {
1783 Register DstReg = I.getOperand(i: 0).getReg();
1784 const RegisterBank *DstBank = RBI.getRegBank(Reg: DstReg, MRI: *MRI, TRI);
1785 const TargetRegisterClass *DstRC = TRI.getRegClassForSizeOnBank(Size: 32, Bank: *DstBank);
1786 if (!DstRC || !RBI.constrainGenericRegister(Reg: DstReg, RC: *DstRC, MRI&: *MRI))
1787 return false;
1788
1789 const bool IsVALU = DstBank->getID() == AMDGPU::VGPRRegBankID;
1790
1791 Module *M = MF->getFunction().getParent();
1792 const MDNode *Metadata = I.getOperand(i: 2).getMetadata();
1793 auto SymbolName = cast<MDString>(Val: Metadata->getOperand(I: 0))->getString();
1794 auto *RelocSymbol = cast<GlobalVariable>(
1795 Val: M->getOrInsertGlobal(Name: SymbolName, Ty: Type::getInt32Ty(C&: M->getContext())));
1796
1797 MachineBasicBlock *BB = I.getParent();
1798 BuildMI(BB&: *BB, I: &I, MIMD: I.getDebugLoc(),
1799 MCID: TII.get(Opcode: IsVALU ? AMDGPU::V_MOV_B32_e32 : AMDGPU::S_MOV_B32), DestReg: DstReg)
1800 .addGlobalAddress(GV: RelocSymbol, Offset: 0, TargetFlags: SIInstrInfo::MO_ABS32_LO);
1801
1802 I.eraseFromParent();
1803 return true;
1804}
1805
1806bool AMDGPUInstructionSelector::selectGroupStaticSize(MachineInstr &I) const {
1807 Triple::OSType OS = MF->getTarget().getTargetTriple().getOS();
1808
1809 Register DstReg = I.getOperand(i: 0).getReg();
1810 const RegisterBank *DstRB = RBI.getRegBank(Reg: DstReg, MRI: *MRI, TRI);
1811 unsigned Mov = DstRB->getID() == AMDGPU::SGPRRegBankID ?
1812 AMDGPU::S_MOV_B32 : AMDGPU::V_MOV_B32_e32;
1813
1814 MachineBasicBlock *MBB = I.getParent();
1815 const DebugLoc &DL = I.getDebugLoc();
1816
1817 auto MIB = BuildMI(BB&: *MBB, I: &I, MIMD: DL, MCID: TII.get(Opcode: Mov), DestReg: DstReg);
1818
1819 if (OS == Triple::AMDHSA || OS == Triple::AMDPAL) {
1820 const SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
1821 MIB.addImm(Val: MFI->getLDSSize());
1822 } else {
1823 Module *M = MF->getFunction().getParent();
1824 const GlobalValue *GV =
1825 Intrinsic::getOrInsertDeclaration(M, id: Intrinsic::amdgcn_groupstaticsize);
1826 MIB.addGlobalAddress(GV, Offset: 0, TargetFlags: SIInstrInfo::MO_ABS32_LO);
1827 }
1828
1829 I.eraseFromParent();
1830 constrainSelectedInstRegOperands(I&: *MIB, TII, TRI, RBI);
1831 return true;
1832}
1833
1834bool AMDGPUInstructionSelector::selectReturnAddress(MachineInstr &I) const {
1835 MachineBasicBlock *MBB = I.getParent();
1836 MachineFunction &MF = *MBB->getParent();
1837 const DebugLoc &DL = I.getDebugLoc();
1838
1839 MachineOperand &Dst = I.getOperand(i: 0);
1840 Register DstReg = Dst.getReg();
1841 unsigned Depth = I.getOperand(i: 2).getImm();
1842
1843 const TargetRegisterClass *RC
1844 = TRI.getConstrainedRegClassForOperand(MO: Dst, MRI: *MRI);
1845 if (!RC->hasSubClassEq(RC: &AMDGPU::SGPR_64RegClass) ||
1846 !RBI.constrainGenericRegister(Reg: DstReg, RC: *RC, MRI&: *MRI))
1847 return false;
1848
1849 // Check for kernel and shader functions
1850 if (Depth != 0 ||
1851 MF.getInfo<SIMachineFunctionInfo>()->isEntryFunction()) {
1852 BuildMI(BB&: *MBB, I: &I, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::S_MOV_B64), DestReg: DstReg)
1853 .addImm(Val: 0);
1854 I.eraseFromParent();
1855 return true;
1856 }
1857
1858 MachineFrameInfo &MFI = MF.getFrameInfo();
1859 // There is a call to @llvm.returnaddress in this function
1860 MFI.setReturnAddressIsTaken(true);
1861
1862 // Get the return address reg and mark it as an implicit live-in
1863 Register ReturnAddrReg = TRI.getReturnAddressReg(MF);
1864 Register LiveIn = getFunctionLiveInPhysReg(MF, TII, PhysReg: ReturnAddrReg,
1865 RC: AMDGPU::SReg_64RegClass, DL);
1866 BuildMI(BB&: *MBB, I: &I, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::COPY), DestReg: DstReg)
1867 .addReg(RegNo: LiveIn);
1868 I.eraseFromParent();
1869 return true;
1870}
1871
1872bool AMDGPUInstructionSelector::selectEndCfIntrinsic(MachineInstr &MI) const {
1873 // FIXME: Manually selecting to avoid dealing with the SReg_1 trick
1874 // SelectionDAG uses for wave32 vs wave64.
1875 MachineBasicBlock *BB = MI.getParent();
1876 BuildMI(BB&: *BB, I: &MI, MIMD: MI.getDebugLoc(), MCID: TII.get(Opcode: AMDGPU::SI_END_CF))
1877 .add(MO: MI.getOperand(i: 1));
1878
1879 Register Reg = MI.getOperand(i: 1).getReg();
1880 MI.eraseFromParent();
1881
1882 if (!MRI->getRegClassOrNull(Reg))
1883 MRI->setRegClass(Reg, RC: TRI.getWaveMaskRegClass());
1884 return true;
1885}
1886
1887bool AMDGPUInstructionSelector::selectDSOrderedIntrinsic(
1888 MachineInstr &MI, Intrinsic::ID IntrID) const {
1889 MachineBasicBlock *MBB = MI.getParent();
1890 MachineFunction *MF = MBB->getParent();
1891 const DebugLoc &DL = MI.getDebugLoc();
1892
1893 unsigned IndexOperand = MI.getOperand(i: 7).getImm();
1894 bool WaveRelease = MI.getOperand(i: 8).getImm() != 0;
1895 bool WaveDone = MI.getOperand(i: 9).getImm() != 0;
1896
1897 if (WaveDone && !WaveRelease) {
1898 // TODO: Move this to IR verifier
1899 const Function &Fn = MF->getFunction();
1900 Fn.getContext().diagnose(DI: DiagnosticInfoUnsupported(
1901 Fn, "ds_ordered_count: wave_done requires wave_release", DL));
1902 }
1903
1904 unsigned OrderedCountIndex = IndexOperand & 0x3f;
1905 IndexOperand &= ~0x3f;
1906 unsigned CountDw = 0;
1907
1908 if (STI.getGeneration() >= AMDGPUSubtarget::GFX10) {
1909 CountDw = (IndexOperand >> 24) & 0xf;
1910 IndexOperand &= ~(0xf << 24);
1911
1912 if (CountDw < 1 || CountDw > 4) {
1913 const Function &Fn = MF->getFunction();
1914 Fn.getContext().diagnose(DI: DiagnosticInfoUnsupported(
1915 Fn, "ds_ordered_count: dword count must be between 1 and 4", DL));
1916 CountDw = 1;
1917 }
1918 }
1919
1920 if (IndexOperand) {
1921 const Function &Fn = MF->getFunction();
1922 Fn.getContext().diagnose(DI: DiagnosticInfoUnsupported(
1923 Fn, "ds_ordered_count: bad index operand", DL));
1924 }
1925
1926 unsigned Instruction = IntrID == Intrinsic::amdgcn_ds_ordered_add ? 0 : 1;
1927 unsigned ShaderType = SIInstrInfo::getDSShaderTypeValue(MF: *MF);
1928
1929 unsigned Offset0 = OrderedCountIndex << 2;
1930 unsigned Offset1 = WaveRelease | (WaveDone << 1) | (Instruction << 4);
1931
1932 if (STI.getGeneration() >= AMDGPUSubtarget::GFX10)
1933 Offset1 |= (CountDw - 1) << 6;
1934
1935 if (STI.getGeneration() < AMDGPUSubtarget::GFX11)
1936 Offset1 |= ShaderType << 2;
1937
1938 unsigned Offset = Offset0 | (Offset1 << 8);
1939
1940 Register M0Val = MI.getOperand(i: 2).getReg();
1941 BuildMI(BB&: *MBB, I: &MI, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::COPY), DestReg: AMDGPU::M0)
1942 .addReg(RegNo: M0Val);
1943
1944 Register DstReg = MI.getOperand(i: 0).getReg();
1945 Register ValReg = MI.getOperand(i: 3).getReg();
1946 MachineInstrBuilder DS =
1947 BuildMI(BB&: *MBB, I: &MI, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::DS_ORDERED_COUNT), DestReg: DstReg)
1948 .addReg(RegNo: ValReg)
1949 .addImm(Val: Offset)
1950 .cloneMemRefs(OtherMI: MI);
1951
1952 if (!RBI.constrainGenericRegister(Reg: M0Val, RC: AMDGPU::SReg_32RegClass, MRI&: *MRI))
1953 return false;
1954
1955 constrainSelectedInstRegOperands(I&: *DS, TII, TRI, RBI);
1956 MI.eraseFromParent();
1957 return true;
1958}
1959
1960static unsigned gwsIntrinToOpcode(unsigned IntrID) {
1961 switch (IntrID) {
1962 case Intrinsic::amdgcn_ds_gws_init:
1963 return AMDGPU::DS_GWS_INIT;
1964 case Intrinsic::amdgcn_ds_gws_barrier:
1965 return AMDGPU::DS_GWS_BARRIER;
1966 case Intrinsic::amdgcn_ds_gws_sema_v:
1967 return AMDGPU::DS_GWS_SEMA_V;
1968 case Intrinsic::amdgcn_ds_gws_sema_br:
1969 return AMDGPU::DS_GWS_SEMA_BR;
1970 case Intrinsic::amdgcn_ds_gws_sema_p:
1971 return AMDGPU::DS_GWS_SEMA_P;
1972 case Intrinsic::amdgcn_ds_gws_sema_release_all:
1973 return AMDGPU::DS_GWS_SEMA_RELEASE_ALL;
1974 default:
1975 llvm_unreachable("not a gws intrinsic");
1976 }
1977}
1978
1979bool AMDGPUInstructionSelector::selectDSGWSIntrinsic(MachineInstr &MI,
1980 Intrinsic::ID IID) const {
1981 if (!STI.hasGWS() || (IID == Intrinsic::amdgcn_ds_gws_sema_release_all &&
1982 !STI.hasGWSSemaReleaseAll()))
1983 return false;
1984
1985 // intrinsic ID, vsrc, offset
1986 const bool HasVSrc = MI.getNumOperands() == 3;
1987 assert(HasVSrc || MI.getNumOperands() == 2);
1988
1989 Register BaseOffset = MI.getOperand(i: HasVSrc ? 2 : 1).getReg();
1990 const RegisterBank *OffsetRB = RBI.getRegBank(Reg: BaseOffset, MRI: *MRI, TRI);
1991 if (OffsetRB->getID() != AMDGPU::SGPRRegBankID)
1992 return false;
1993
1994 MachineInstr *OffsetDef = getDefIgnoringCopies(Reg: BaseOffset, MRI: *MRI);
1995 unsigned ImmOffset;
1996
1997 MachineBasicBlock *MBB = MI.getParent();
1998 const DebugLoc &DL = MI.getDebugLoc();
1999
2000 MachineInstr *Readfirstlane = nullptr;
2001
2002 // If we legalized the VGPR input, strip out the readfirstlane to analyze the
2003 // incoming offset, in case there's an add of a constant. We'll have to put it
2004 // back later.
2005 if (OffsetDef->getOpcode() == AMDGPU::V_READFIRSTLANE_B32) {
2006 Readfirstlane = OffsetDef;
2007 BaseOffset = OffsetDef->getOperand(i: 1).getReg();
2008 OffsetDef = getDefIgnoringCopies(Reg: BaseOffset, MRI: *MRI);
2009 }
2010
2011 if (OffsetDef->getOpcode() == AMDGPU::G_CONSTANT) {
2012 // If we have a constant offset, try to use the 0 in m0 as the base.
2013 // TODO: Look into changing the default m0 initialization value. If the
2014 // default -1 only set the low 16-bits, we could leave it as-is and add 1 to
2015 // the immediate offset.
2016
2017 ImmOffset = OffsetDef->getOperand(i: 1).getCImm()->getZExtValue();
2018 BuildMI(BB&: *MBB, I: &MI, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::S_MOV_B32), DestReg: AMDGPU::M0)
2019 .addImm(Val: 0);
2020 } else {
2021 std::tie(args&: BaseOffset, args&: ImmOffset) =
2022 AMDGPU::getBaseWithConstantOffset(MRI&: *MRI, Reg: BaseOffset, ValueTracking: VT);
2023
2024 if (Readfirstlane) {
2025 // We have the constant offset now, so put the readfirstlane back on the
2026 // variable component.
2027 if (!RBI.constrainGenericRegister(Reg: BaseOffset, RC: AMDGPU::VGPR_32RegClass, MRI&: *MRI))
2028 return false;
2029
2030 Readfirstlane->getOperand(i: 1).setReg(BaseOffset);
2031 BaseOffset = Readfirstlane->getOperand(i: 0).getReg();
2032 } else {
2033 if (!RBI.constrainGenericRegister(Reg: BaseOffset,
2034 RC: AMDGPU::SReg_32RegClass, MRI&: *MRI))
2035 return false;
2036 }
2037
2038 Register M0Base = MRI->createVirtualRegister(RegClass: &AMDGPU::SReg_32RegClass);
2039 BuildMI(BB&: *MBB, I: &MI, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::S_LSHL_B32), DestReg: M0Base)
2040 .addReg(RegNo: BaseOffset)
2041 .addImm(Val: 16)
2042 .setOperandDead(3); // Dead scc
2043
2044 BuildMI(BB&: *MBB, I: &MI, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::COPY), DestReg: AMDGPU::M0)
2045 .addReg(RegNo: M0Base);
2046 }
2047
2048 // The resource id offset is computed as (<isa opaque base> + M0[21:16] +
2049 // offset field) % 64. Some versions of the programming guide omit the m0
2050 // part, or claim it's from offset 0.
2051
2052 unsigned Opc = gwsIntrinToOpcode(IntrID: IID);
2053 const MCInstrDesc &InstrDesc = TII.get(Opcode: Opc);
2054
2055 if (HasVSrc) {
2056 Register VSrc = MI.getOperand(i: 1).getReg();
2057
2058 int Data0Idx = AMDGPU::getNamedOperandIdx(Opcode: Opc, Name: AMDGPU::OpName::data0);
2059 const TargetRegisterClass *DataRC = TII.getRegClass(MCID: InstrDesc, OpNum: Data0Idx);
2060 const TargetRegisterClass *SubRC =
2061 TRI.getSubRegisterClass(DataRC, AMDGPU::sub0);
2062
2063 if (!SubRC) {
2064 // 32-bit normal case.
2065 if (!RBI.constrainGenericRegister(Reg: VSrc, RC: *DataRC, MRI&: *MRI))
2066 return false;
2067
2068 BuildMI(BB&: *MBB, I: &MI, MIMD: DL, MCID: InstrDesc)
2069 .addReg(RegNo: VSrc)
2070 .addImm(Val: ImmOffset)
2071 .cloneMemRefs(OtherMI: MI);
2072 } else {
2073 // Requires even register alignment, so create 64-bit value and pad the
2074 // top half with undef.
2075 Register DataReg = MRI->createVirtualRegister(RegClass: DataRC);
2076 if (!RBI.constrainGenericRegister(Reg: VSrc, RC: *SubRC, MRI&: *MRI))
2077 return false;
2078
2079 Register UndefReg = MRI->createVirtualRegister(RegClass: SubRC);
2080 BuildMI(BB&: *MBB, I: &MI, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::IMPLICIT_DEF), DestReg: UndefReg);
2081 BuildMI(BB&: *MBB, I: &MI, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::REG_SEQUENCE), DestReg: DataReg)
2082 .addReg(RegNo: VSrc)
2083 .addImm(Val: AMDGPU::sub0)
2084 .addReg(RegNo: UndefReg)
2085 .addImm(Val: AMDGPU::sub1);
2086
2087 BuildMI(BB&: *MBB, I: &MI, MIMD: DL, MCID: InstrDesc)
2088 .addReg(RegNo: DataReg)
2089 .addImm(Val: ImmOffset)
2090 .cloneMemRefs(OtherMI: MI);
2091 }
2092 } else {
2093 BuildMI(BB&: *MBB, I: &MI, MIMD: DL, MCID: InstrDesc)
2094 .addImm(Val: ImmOffset)
2095 .cloneMemRefs(OtherMI: MI);
2096 }
2097
2098 MI.eraseFromParent();
2099 return true;
2100}
2101
2102bool AMDGPUInstructionSelector::selectDSAppendConsume(MachineInstr &MI,
2103 bool IsAppend) const {
2104 Register PtrBase = MI.getOperand(i: 2).getReg();
2105 LLT PtrTy = MRI->getType(Reg: PtrBase);
2106 bool IsGDS = PtrTy.getAddressSpace() == AMDGPUAS::REGION_ADDRESS;
2107
2108 unsigned Offset;
2109 std::tie(args&: PtrBase, args&: Offset) = selectDS1Addr1OffsetImpl(Root&: MI.getOperand(i: 2));
2110
2111 // TODO: Should this try to look through readfirstlane like GWS?
2112 if (!isDSOffsetLegal(Base: PtrBase, Offset)) {
2113 PtrBase = MI.getOperand(i: 2).getReg();
2114 Offset = 0;
2115 }
2116
2117 MachineBasicBlock *MBB = MI.getParent();
2118 const DebugLoc &DL = MI.getDebugLoc();
2119 const unsigned Opc = IsAppend ? AMDGPU::DS_APPEND : AMDGPU::DS_CONSUME;
2120
2121 BuildMI(BB&: *MBB, I: &MI, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::COPY), DestReg: AMDGPU::M0)
2122 .addReg(RegNo: PtrBase);
2123 if (!RBI.constrainGenericRegister(Reg: PtrBase, RC: AMDGPU::SReg_32RegClass, MRI&: *MRI))
2124 return false;
2125
2126 auto MIB = BuildMI(BB&: *MBB, I: &MI, MIMD: DL, MCID: TII.get(Opcode: Opc), DestReg: MI.getOperand(i: 0).getReg())
2127 .addImm(Val: Offset)
2128 .addImm(Val: IsGDS ? -1 : 0)
2129 .cloneMemRefs(OtherMI: MI);
2130 MI.eraseFromParent();
2131 constrainSelectedInstRegOperands(I&: *MIB, TII, TRI, RBI);
2132 return true;
2133}
2134
2135bool AMDGPUInstructionSelector::selectInitWholeWave(MachineInstr &MI) const {
2136 MachineFunction *MF = MI.getMF();
2137 SIMachineFunctionInfo *MFInfo = MF->getInfo<SIMachineFunctionInfo>();
2138
2139 MFInfo->setInitWholeWave();
2140 return selectImpl(I&: MI, CoverageInfo&: *CoverageInfo);
2141}
2142
/// Decode a texfailctrl immediate.
///
/// \param TexFailCtrl the raw control value.
/// \param [out] TFE set from bit 0.
/// \param [out] LWE set from bit 1.
/// \param [in,out] IsTexFail set to true when \p TexFailCtrl is non-zero and
///        left untouched otherwise.
/// \returns true when no bit other than bits 0 and 1 is set.
static bool parseTexFail(uint64_t TexFailCtrl, bool &TFE, bool &LWE,
                         bool &IsTexFail) {
  constexpr uint64_t TFEBit = 0x1;
  constexpr uint64_t LWEBit = 0x2;

  if (TexFailCtrl != 0)
    IsTexFail = true;

  TFE = (TexFailCtrl & TFEBit) != 0;
  LWE = (TexFailCtrl & LWEBit) != 0;

  // Any remaining bit makes the control value invalid.
  return (TexFailCtrl & ~(TFEBit | LWEBit)) == 0;
}
2155
2156bool AMDGPUInstructionSelector::selectImageIntrinsic(
2157 MachineInstr &MI, const AMDGPU::ImageDimIntrinsicInfo *Intr) const {
2158 MachineBasicBlock *MBB = MI.getParent();
2159 const DebugLoc &DL = MI.getDebugLoc();
2160 unsigned IntrOpcode = Intr->BaseOpcode;
2161
2162 // For image atomic: use no-return opcode if result is unused.
2163 if (Intr->AtomicNoRetBaseOpcode != Intr->BaseOpcode) {
2164 Register ResultDef = MI.getOperand(i: 0).getReg();
2165 if (MRI->use_nodbg_empty(RegNo: ResultDef))
2166 IntrOpcode = Intr->AtomicNoRetBaseOpcode;
2167 }
2168
2169 const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
2170 AMDGPU::getMIMGBaseOpcodeInfo(BaseOpcode: IntrOpcode);
2171
2172 const AMDGPU::MIMGDimInfo *DimInfo = AMDGPU::getMIMGDimInfo(DimEnum: Intr->Dim);
2173 const bool IsGFX10Plus = AMDGPU::isGFX10Plus(STI);
2174 const bool IsGFX11Plus = AMDGPU::isGFX11Plus(STI);
2175 const bool IsGFX12Plus = AMDGPU::isGFX12Plus(STI);
2176 const bool IsGFX13Plus = AMDGPU::isGFX13Plus(STI);
2177
2178 const unsigned ArgOffset = MI.getNumExplicitDefs() + 1;
2179
2180 Register VDataIn = AMDGPU::NoRegister;
2181 Register VDataOut = AMDGPU::NoRegister;
2182 LLT VDataTy;
2183 int NumVDataDwords = -1;
2184 bool IsD16 = MI.getOpcode() == AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD_D16 ||
2185 MI.getOpcode() == AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE_D16;
2186
2187 bool Unorm;
2188 if (!BaseOpcode->Sampler)
2189 Unorm = true;
2190 else
2191 Unorm = MI.getOperand(i: ArgOffset + Intr->UnormIndex).getImm() != 0;
2192
2193 bool TFE;
2194 bool LWE;
2195 bool IsTexFail = false;
2196 if (!parseTexFail(TexFailCtrl: MI.getOperand(i: ArgOffset + Intr->TexFailCtrlIndex).getImm(),
2197 TFE, LWE, IsTexFail))
2198 return false;
2199
2200 const int Flags = MI.getOperand(i: ArgOffset + Intr->NumArgs).getImm();
2201 const bool IsA16 = (Flags & 1) != 0;
2202 const bool IsG16 = (Flags & 2) != 0;
2203
2204 // A16 implies 16 bit gradients if subtarget doesn't support G16
2205 if (IsA16 && !STI.hasG16() && !IsG16)
2206 return false;
2207
2208 unsigned DMask = 0;
2209 unsigned DMaskLanes = 0;
2210
2211 if (BaseOpcode->Atomic) {
2212 if (!BaseOpcode->NoReturn)
2213 VDataOut = MI.getOperand(i: 0).getReg();
2214 VDataIn = MI.getOperand(i: 2).getReg();
2215 LLT Ty = MRI->getType(Reg: VDataIn);
2216
2217 // Be careful to allow atomic swap on 16-bit element vectors.
2218 const bool Is64Bit = BaseOpcode->AtomicX2 ?
2219 Ty.getSizeInBits() == 128 :
2220 Ty.getSizeInBits() == 64;
2221
2222 if (BaseOpcode->AtomicX2) {
2223 assert(MI.getOperand(3).getReg() == AMDGPU::NoRegister);
2224
2225 DMask = Is64Bit ? 0xf : 0x3;
2226 NumVDataDwords = Is64Bit ? 4 : 2;
2227 } else {
2228 DMask = Is64Bit ? 0x3 : 0x1;
2229 NumVDataDwords = Is64Bit ? 2 : 1;
2230 }
2231 } else {
2232 DMask = MI.getOperand(i: ArgOffset + Intr->DMaskIndex).getImm();
2233 DMaskLanes = BaseOpcode->Gather4 ? 4 : llvm::popcount(Value: DMask);
2234
2235 if (BaseOpcode->Store) {
2236 VDataIn = MI.getOperand(i: 1).getReg();
2237 VDataTy = MRI->getType(Reg: VDataIn);
2238 NumVDataDwords = (VDataTy.getSizeInBits() + 31) / 32;
2239 } else if (BaseOpcode->NoReturn) {
2240 NumVDataDwords = 0;
2241 } else {
2242 VDataOut = MI.getOperand(i: 0).getReg();
2243 VDataTy = MRI->getType(Reg: VDataOut);
2244 NumVDataDwords = DMaskLanes;
2245
2246 if (IsD16 && !STI.hasUnpackedD16VMem())
2247 NumVDataDwords = (DMaskLanes + 1) / 2;
2248 }
2249 }
2250
2251 // Set G16 opcode
2252 if (Subtarget->hasG16() && IsG16) {
2253 const AMDGPU::MIMGG16MappingInfo *G16MappingInfo =
2254 AMDGPU::getMIMGG16MappingInfo(G: Intr->BaseOpcode);
2255 assert(G16MappingInfo);
2256 IntrOpcode = G16MappingInfo->G16; // set opcode to variant with _g16
2257 }
2258
2259 // TODO: Check this in verifier.
2260 assert((!IsTexFail || DMaskLanes >= 1) && "should have legalized this");
2261
2262 unsigned CPol = MI.getOperand(i: ArgOffset + Intr->CachePolicyIndex).getImm();
2263 // Keep GLC only when the atomic's result is actually used.
2264 if (BaseOpcode->Atomic && !BaseOpcode->NoReturn)
2265 CPol |= AMDGPU::CPol::GLC;
2266 if (CPol & ~((IsGFX12Plus ? AMDGPU::CPol::ALL : AMDGPU::CPol::ALL_pregfx12) |
2267 AMDGPU::CPol::VOLATILE))
2268 return false;
2269
2270 int NumVAddrRegs = 0;
2271 int NumVAddrDwords = 0;
2272 for (unsigned I = Intr->VAddrStart; I < Intr->VAddrEnd; I++) {
2273 // Skip the $noregs and 0s inserted during legalization.
2274 MachineOperand &AddrOp = MI.getOperand(i: ArgOffset + I);
2275 if (!AddrOp.isReg())
2276 continue; // XXX - Break?
2277
2278 Register Addr = AddrOp.getReg();
2279 if (!Addr)
2280 break;
2281
2282 ++NumVAddrRegs;
2283 NumVAddrDwords += (MRI->getType(Reg: Addr).getSizeInBits() + 31) / 32;
2284 }
2285
2286 // The legalizer preprocessed the intrinsic arguments. If we aren't using
2287 // NSA, these should have been packed into a single value in the first
2288 // address register
2289 const bool UseNSA =
2290 NumVAddrRegs != 1 &&
2291 (STI.hasPartialNSAEncoding() ? NumVAddrDwords >= NumVAddrRegs
2292 : NumVAddrDwords == NumVAddrRegs);
2293 if (UseNSA && !STI.hasFeature(Feature: AMDGPU::FeatureNSAEncoding)) {
2294 LLVM_DEBUG(dbgs() << "Trying to use NSA on non-NSA target\n");
2295 return false;
2296 }
2297
2298 if (IsTexFail)
2299 ++NumVDataDwords;
2300
2301 int Opcode = -1;
2302 if (IsGFX13Plus) {
2303 Opcode = AMDGPU::getMIMGOpcode(BaseOpcode: IntrOpcode, MIMGEncoding: AMDGPU::MIMGEncGfx13,
2304 VDataDwords: NumVDataDwords, VAddrDwords: NumVAddrDwords);
2305 } else if (IsGFX12Plus) {
2306 Opcode = AMDGPU::getMIMGOpcode(BaseOpcode: IntrOpcode, MIMGEncoding: AMDGPU::MIMGEncGfx12,
2307 VDataDwords: NumVDataDwords, VAddrDwords: NumVAddrDwords);
2308 } else if (IsGFX11Plus) {
2309 Opcode = AMDGPU::getMIMGOpcode(BaseOpcode: IntrOpcode,
2310 MIMGEncoding: UseNSA ? AMDGPU::MIMGEncGfx11NSA
2311 : AMDGPU::MIMGEncGfx11Default,
2312 VDataDwords: NumVDataDwords, VAddrDwords: NumVAddrDwords);
2313 } else if (IsGFX10Plus) {
2314 Opcode = AMDGPU::getMIMGOpcode(BaseOpcode: IntrOpcode,
2315 MIMGEncoding: UseNSA ? AMDGPU::MIMGEncGfx10NSA
2316 : AMDGPU::MIMGEncGfx10Default,
2317 VDataDwords: NumVDataDwords, VAddrDwords: NumVAddrDwords);
2318 } else {
2319 if (Subtarget->hasGFX90AInsts()) {
2320 Opcode = AMDGPU::getMIMGOpcode(BaseOpcode: IntrOpcode, MIMGEncoding: AMDGPU::MIMGEncGfx90a,
2321 VDataDwords: NumVDataDwords, VAddrDwords: NumVAddrDwords);
2322 if (Opcode == -1) {
2323 LLVM_DEBUG(
2324 dbgs()
2325 << "requested image instruction is not supported on this GPU\n");
2326 return false;
2327 }
2328 }
2329 if (Opcode == -1 &&
2330 STI.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
2331 Opcode = AMDGPU::getMIMGOpcode(BaseOpcode: IntrOpcode, MIMGEncoding: AMDGPU::MIMGEncGfx8,
2332 VDataDwords: NumVDataDwords, VAddrDwords: NumVAddrDwords);
2333 if (Opcode == -1)
2334 Opcode = AMDGPU::getMIMGOpcode(BaseOpcode: IntrOpcode, MIMGEncoding: AMDGPU::MIMGEncGfx6,
2335 VDataDwords: NumVDataDwords, VAddrDwords: NumVAddrDwords);
2336 }
2337 if (Opcode == -1)
2338 return false;
2339
2340 auto MIB = BuildMI(BB&: *MBB, I: &MI, MIMD: DL, MCID: TII.get(Opcode))
2341 .cloneMemRefs(OtherMI: MI);
2342
2343 if (VDataOut) {
2344 if (BaseOpcode->AtomicX2) {
2345 const bool Is64 = MRI->getType(Reg: VDataOut).getSizeInBits() == 64;
2346
2347 Register TmpReg = MRI->createVirtualRegister(
2348 RegClass: Is64 ? &AMDGPU::VReg_128RegClass : &AMDGPU::VReg_64RegClass);
2349 unsigned SubReg = Is64 ? AMDGPU::sub0_sub1 : AMDGPU::sub0;
2350
2351 MIB.addDef(RegNo: TmpReg);
2352 if (!MRI->use_empty(RegNo: VDataOut)) {
2353 BuildMI(BB&: *MBB, I: &MI, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::COPY), DestReg: VDataOut)
2354 .addReg(RegNo: TmpReg, Flags: RegState::Kill, SubReg);
2355 }
2356
2357 } else {
2358 MIB.addDef(RegNo: VDataOut); // vdata output
2359 }
2360 }
2361
2362 if (VDataIn)
2363 MIB.addReg(RegNo: VDataIn); // vdata input
2364
2365 for (int I = 0; I != NumVAddrRegs; ++I) {
2366 MachineOperand &SrcOp = MI.getOperand(i: ArgOffset + Intr->VAddrStart + I);
2367 if (SrcOp.isReg()) {
2368 assert(SrcOp.getReg() != 0);
2369 MIB.addReg(RegNo: SrcOp.getReg());
2370 }
2371 }
2372
2373 MIB.addReg(RegNo: MI.getOperand(i: ArgOffset + Intr->RsrcIndex).getReg());
2374 if (BaseOpcode->Sampler)
2375 MIB.addReg(RegNo: MI.getOperand(i: ArgOffset + Intr->SampIndex).getReg());
2376
2377 MIB.addImm(Val: DMask); // dmask
2378
2379 if (IsGFX10Plus)
2380 MIB.addImm(Val: DimInfo->Encoding);
2381 if (AMDGPU::hasNamedOperand(Opcode, NamedIdx: AMDGPU::OpName::unorm))
2382 MIB.addImm(Val: Unorm);
2383
2384 MIB.addImm(Val: CPol);
2385 MIB.addImm(Val: IsA16 && // a16 or r128
2386 STI.hasFeature(Feature: AMDGPU::FeatureR128A16) ? -1 : 0);
2387 if (IsGFX10Plus)
2388 MIB.addImm(Val: IsA16 ? -1 : 0);
2389
2390 if (!Subtarget->hasGFX90AInsts()) {
2391 MIB.addImm(Val: TFE); // tfe
2392 } else if (TFE) {
2393 LLVM_DEBUG(dbgs() << "TFE is not supported on this GPU\n");
2394 return false;
2395 }
2396
2397 if (AMDGPU::hasNamedOperand(Opcode, NamedIdx: AMDGPU::OpName::lwe))
2398 MIB.addImm(Val: LWE); // lwe
2399 if (!IsGFX10Plus)
2400 MIB.addImm(Val: DimInfo->DA ? -1 : 0);
2401 if (BaseOpcode->HasD16)
2402 MIB.addImm(Val: IsD16 ? -1 : 0);
2403
2404 MI.eraseFromParent();
2405 constrainSelectedInstRegOperands(I&: *MIB, TII, TRI, RBI);
2406 TII.enforceOperandRCAlignment(MI&: *MIB, OpName: AMDGPU::OpName::vaddr);
2407 return true;
2408}
2409
2410// We need to handle this here because tablegen doesn't support matching
2411// instructions with multiple outputs.
2412bool AMDGPUInstructionSelector::selectDSBvhStackIntrinsic(
2413 MachineInstr &MI) const {
2414 Register Dst0 = MI.getOperand(i: 0).getReg();
2415 Register Dst1 = MI.getOperand(i: 1).getReg();
2416
2417 const DebugLoc &DL = MI.getDebugLoc();
2418 MachineBasicBlock *MBB = MI.getParent();
2419
2420 Register Addr = MI.getOperand(i: 3).getReg();
2421 Register Data0 = MI.getOperand(i: 4).getReg();
2422 Register Data1 = MI.getOperand(i: 5).getReg();
2423 unsigned Offset = MI.getOperand(i: 6).getImm();
2424
2425 unsigned Opc;
2426 switch (cast<GIntrinsic>(Val&: MI).getIntrinsicID()) {
2427 case Intrinsic::amdgcn_ds_bvh_stack_rtn:
2428 case Intrinsic::amdgcn_ds_bvh_stack_push4_pop1_rtn:
2429 Opc = AMDGPU::DS_BVH_STACK_RTN_B32;
2430 break;
2431 case Intrinsic::amdgcn_ds_bvh_stack_push8_pop1_rtn:
2432 Opc = AMDGPU::DS_BVH_STACK_PUSH8_POP1_RTN_B32;
2433 break;
2434 case Intrinsic::amdgcn_ds_bvh_stack_push8_pop2_rtn:
2435 Opc = AMDGPU::DS_BVH_STACK_PUSH8_POP2_RTN_B64;
2436 break;
2437 }
2438
2439 auto MIB = BuildMI(BB&: *MBB, I: &MI, MIMD: DL, MCID: TII.get(Opcode: Opc), DestReg: Dst0)
2440 .addDef(RegNo: Dst1)
2441 .addUse(RegNo: Addr)
2442 .addUse(RegNo: Data0)
2443 .addUse(RegNo: Data1)
2444 .addImm(Val: Offset)
2445 .cloneMemRefs(OtherMI: MI);
2446
2447 MI.eraseFromParent();
2448 constrainSelectedInstRegOperands(I&: *MIB, TII, TRI, RBI);
2449 return true;
2450}
2451
2452bool AMDGPUInstructionSelector::selectG_INTRINSIC_W_SIDE_EFFECTS(
2453 MachineInstr &I) const {
2454 Intrinsic::ID IntrinsicID = cast<GIntrinsic>(Val&: I).getIntrinsicID();
2455 switch (IntrinsicID) {
2456 case Intrinsic::amdgcn_end_cf:
2457 return selectEndCfIntrinsic(MI&: I);
2458 case Intrinsic::amdgcn_ds_ordered_add:
2459 case Intrinsic::amdgcn_ds_ordered_swap:
2460 return selectDSOrderedIntrinsic(MI&: I, IntrID: IntrinsicID);
2461 case Intrinsic::amdgcn_ds_gws_init:
2462 case Intrinsic::amdgcn_ds_gws_barrier:
2463 case Intrinsic::amdgcn_ds_gws_sema_v:
2464 case Intrinsic::amdgcn_ds_gws_sema_br:
2465 case Intrinsic::amdgcn_ds_gws_sema_p:
2466 case Intrinsic::amdgcn_ds_gws_sema_release_all:
2467 return selectDSGWSIntrinsic(MI&: I, IID: IntrinsicID);
2468 case Intrinsic::amdgcn_ds_append:
2469 return selectDSAppendConsume(MI&: I, IsAppend: true);
2470 case Intrinsic::amdgcn_ds_consume:
2471 return selectDSAppendConsume(MI&: I, IsAppend: false);
2472 case Intrinsic::amdgcn_init_whole_wave:
2473 return selectInitWholeWave(MI&: I);
2474 case Intrinsic::amdgcn_raw_buffer_load_lds:
2475 case Intrinsic::amdgcn_raw_buffer_load_async_lds:
2476 case Intrinsic::amdgcn_raw_ptr_buffer_load_lds:
2477 case Intrinsic::amdgcn_raw_ptr_buffer_load_async_lds:
2478 case Intrinsic::amdgcn_struct_buffer_load_lds:
2479 case Intrinsic::amdgcn_struct_buffer_load_async_lds:
2480 case Intrinsic::amdgcn_struct_ptr_buffer_load_lds:
2481 case Intrinsic::amdgcn_struct_ptr_buffer_load_async_lds:
2482 return selectBufferLoadLds(MI&: I);
2483 // Until we can store both the address space of the global and the LDS
2484 // arguments by having tto MachineMemOperands on an intrinsic, we just trust
2485 // that the argument is a global pointer (buffer pointers have been handled by
2486 // a LLVM IR-level lowering).
2487 case Intrinsic::amdgcn_load_to_lds:
2488 case Intrinsic::amdgcn_load_async_to_lds:
2489 case Intrinsic::amdgcn_global_load_lds:
2490 case Intrinsic::amdgcn_global_load_async_lds:
2491 return selectGlobalLoadLds(MI&: I);
2492 case Intrinsic::amdgcn_tensor_load_to_lds:
2493 case Intrinsic::amdgcn_tensor_store_from_lds:
2494 return selectTensorLoadStore(MI&: I, IID: IntrinsicID);
2495 case Intrinsic::amdgcn_asyncmark:
2496 case Intrinsic::amdgcn_wait_asyncmark:
2497 if (!Subtarget->hasAsyncMark())
2498 return false;
2499 break;
2500 case Intrinsic::amdgcn_exp_compr:
2501 if (!STI.hasCompressedExport()) {
2502 diagnoseUnsupportedIntrinsic(I);
2503 return false;
2504 }
2505 break;
2506 case Intrinsic::amdgcn_ds_bvh_stack_rtn:
2507 case Intrinsic::amdgcn_ds_bvh_stack_push4_pop1_rtn:
2508 case Intrinsic::amdgcn_ds_bvh_stack_push8_pop1_rtn:
2509 case Intrinsic::amdgcn_ds_bvh_stack_push8_pop2_rtn:
2510 return selectDSBvhStackIntrinsic(MI&: I);
2511 case Intrinsic::amdgcn_s_alloc_vgpr: {
2512 // S_ALLOC_VGPR doesn't have a destination register, it just implicitly sets
2513 // SCC. We then need to COPY it into the result vreg.
2514 MachineBasicBlock *MBB = I.getParent();
2515 const DebugLoc &DL = I.getDebugLoc();
2516
2517 Register ResReg = I.getOperand(i: 0).getReg();
2518
2519 MachineInstr *AllocMI = BuildMI(BB&: *MBB, I: &I, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::S_ALLOC_VGPR))
2520 .add(MO: I.getOperand(i: 2));
2521 (void)BuildMI(BB&: *MBB, I: &I, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::COPY), DestReg: ResReg)
2522 .addReg(RegNo: AMDGPU::SCC);
2523 I.eraseFromParent();
2524 constrainSelectedInstRegOperands(I&: *AllocMI, TII, TRI, RBI);
2525 return RBI.constrainGenericRegister(Reg: ResReg, RC: AMDGPU::SReg_32RegClass, MRI&: *MRI);
2526 }
2527 case Intrinsic::amdgcn_s_barrier_init:
2528 case Intrinsic::amdgcn_s_barrier_signal_var:
2529 return selectNamedBarrierInit(I, IID: IntrinsicID);
2530 case Intrinsic::amdgcn_s_wakeup_barrier: {
2531 if (!STI.hasSWakeupBarrier()) {
2532 diagnoseUnsupportedIntrinsic(I);
2533 return false;
2534 }
2535 return selectNamedBarrierInst(I, IID: IntrinsicID);
2536 }
2537 case Intrinsic::amdgcn_s_barrier_join:
2538 case Intrinsic::amdgcn_s_get_named_barrier_state:
2539 return selectNamedBarrierInst(I, IID: IntrinsicID);
2540 case Intrinsic::amdgcn_s_get_barrier_state:
2541 return selectSGetBarrierState(I, IID: IntrinsicID);
2542 case Intrinsic::amdgcn_s_barrier_signal_isfirst:
2543 return selectSBarrierSignalIsfirst(I, IID: IntrinsicID);
2544 }
2545 return selectImpl(I, CoverageInfo&: *CoverageInfo);
2546}
2547
2548bool AMDGPUInstructionSelector::selectG_SELECT(MachineInstr &I) const {
2549 if (selectImpl(I, CoverageInfo&: *CoverageInfo))
2550 return true;
2551
2552 MachineBasicBlock *BB = I.getParent();
2553 const DebugLoc &DL = I.getDebugLoc();
2554
2555 Register DstReg = I.getOperand(i: 0).getReg();
2556 unsigned Size = RBI.getSizeInBits(Reg: DstReg, MRI: *MRI, TRI);
2557 assert(Size <= 32 || Size == 64);
2558 const MachineOperand &CCOp = I.getOperand(i: 1);
2559 Register CCReg = CCOp.getReg();
2560 if (!isVCC(Reg: CCReg, MRI: *MRI)) {
2561 unsigned SelectOpcode = Size == 64 ? AMDGPU::S_CSELECT_B64 :
2562 AMDGPU::S_CSELECT_B32;
2563 MachineInstr *CopySCC = BuildMI(BB&: *BB, I: &I, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::COPY), DestReg: AMDGPU::SCC)
2564 .addReg(RegNo: CCReg);
2565
2566 // The generic constrainSelectedInstRegOperands doesn't work for the scc register
2567 // bank, because it does not cover the register class that we used to represent
2568 // for it. So we need to manually set the register class here.
2569 if (!MRI->getRegClassOrNull(Reg: CCReg))
2570 MRI->setRegClass(Reg: CCReg, RC: TRI.getConstrainedRegClassForOperand(MO: CCOp, MRI: *MRI));
2571 MachineInstr *Select = BuildMI(BB&: *BB, I: &I, MIMD: DL, MCID: TII.get(Opcode: SelectOpcode), DestReg: DstReg)
2572 .add(MO: I.getOperand(i: 2))
2573 .add(MO: I.getOperand(i: 3));
2574
2575 constrainSelectedInstRegOperands(I&: *Select, TII, TRI, RBI);
2576 constrainSelectedInstRegOperands(I&: *CopySCC, TII, TRI, RBI);
2577 I.eraseFromParent();
2578 return true;
2579 }
2580
2581 // Wide VGPR select should have been split in RegBankSelect.
2582 if (Size > 32)
2583 return false;
2584
2585 MachineInstr *Select =
2586 BuildMI(BB&: *BB, I: &I, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::V_CNDMASK_B32_e64), DestReg: DstReg)
2587 .addImm(Val: 0)
2588 .add(MO: I.getOperand(i: 3))
2589 .addImm(Val: 0)
2590 .add(MO: I.getOperand(i: 2))
2591 .add(MO: I.getOperand(i: 1));
2592
2593 constrainSelectedInstRegOperands(I&: *Select, TII, TRI, RBI);
2594 I.eraseFromParent();
2595 return true;
2596}
2597
2598bool AMDGPUInstructionSelector::selectG_TRUNC(MachineInstr &I) const {
2599 Register DstReg = I.getOperand(i: 0).getReg();
2600 Register SrcReg = I.getOperand(i: 1).getReg();
2601 const LLT DstTy = MRI->getType(Reg: DstReg);
2602 const LLT SrcTy = MRI->getType(Reg: SrcReg);
2603 const LLT S1 = LLT::scalar(SizeInBits: 1);
2604
2605 const RegisterBank *SrcRB = RBI.getRegBank(Reg: SrcReg, MRI: *MRI, TRI);
2606 const RegisterBank *DstRB;
2607 if (DstTy == S1) {
2608 // This is a special case. We don't treat s1 for legalization artifacts as
2609 // vcc booleans.
2610 DstRB = SrcRB;
2611 } else {
2612 DstRB = RBI.getRegBank(Reg: DstReg, MRI: *MRI, TRI);
2613 if (SrcRB != DstRB)
2614 return false;
2615 }
2616
2617 const bool IsVALU = DstRB->getID() == AMDGPU::VGPRRegBankID;
2618
2619 unsigned DstSize = DstTy.getSizeInBits();
2620 unsigned SrcSize = SrcTy.getSizeInBits();
2621
2622 const TargetRegisterClass *SrcRC =
2623 TRI.getRegClassForSizeOnBank(Size: SrcSize, Bank: *SrcRB);
2624 const TargetRegisterClass *DstRC =
2625 TRI.getRegClassForSizeOnBank(Size: DstSize, Bank: *DstRB);
2626 if (!SrcRC || !DstRC)
2627 return false;
2628
2629 if (!RBI.constrainGenericRegister(Reg: SrcReg, RC: *SrcRC, MRI&: *MRI) ||
2630 !RBI.constrainGenericRegister(Reg: DstReg, RC: *DstRC, MRI&: *MRI)) {
2631 LLVM_DEBUG(dbgs() << "Failed to constrain G_TRUNC\n");
2632 return false;
2633 }
2634
2635 if (DstRC == &AMDGPU::VGPR_16RegClass && SrcSize == 32) {
2636 assert(STI.useRealTrue16Insts());
2637 const DebugLoc &DL = I.getDebugLoc();
2638 MachineBasicBlock *MBB = I.getParent();
2639 BuildMI(BB&: *MBB, I, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::COPY), DestReg: DstReg)
2640 .addReg(RegNo: SrcReg, Flags: {}, SubReg: AMDGPU::lo16);
2641 I.eraseFromParent();
2642 return true;
2643 }
2644
2645 if (DstTy == LLT::fixed_vector(NumElements: 2, ScalarSizeInBits: 16) && SrcTy == LLT::fixed_vector(NumElements: 2, ScalarSizeInBits: 32)) {
2646 MachineBasicBlock *MBB = I.getParent();
2647 const DebugLoc &DL = I.getDebugLoc();
2648
2649 Register LoReg = MRI->createVirtualRegister(RegClass: DstRC);
2650 Register HiReg = MRI->createVirtualRegister(RegClass: DstRC);
2651 BuildMI(BB&: *MBB, I, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::COPY), DestReg: LoReg)
2652 .addReg(RegNo: SrcReg, Flags: {}, SubReg: AMDGPU::sub0);
2653 BuildMI(BB&: *MBB, I, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::COPY), DestReg: HiReg)
2654 .addReg(RegNo: SrcReg, Flags: {}, SubReg: AMDGPU::sub1);
2655
2656 if (IsVALU && STI.hasSDWA()) {
2657 // Write the low 16-bits of the high element into the high 16-bits of the
2658 // low element.
2659 MachineInstr *MovSDWA =
2660 BuildMI(BB&: *MBB, I, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::V_MOV_B32_sdwa), DestReg: DstReg)
2661 .addImm(Val: 0) // $src0_modifiers
2662 .addReg(RegNo: HiReg) // $src0
2663 .addImm(Val: 0) // $clamp
2664 .addImm(Val: AMDGPU::SDWA::WORD_1) // $dst_sel
2665 .addImm(Val: AMDGPU::SDWA::UNUSED_PRESERVE) // $dst_unused
2666 .addImm(Val: AMDGPU::SDWA::WORD_0) // $src0_sel
2667 .addReg(RegNo: LoReg, Flags: RegState::Implicit);
2668 MovSDWA->tieOperands(DefIdx: 0, UseIdx: MovSDWA->getNumOperands() - 1);
2669 } else {
2670 Register TmpReg0 = MRI->createVirtualRegister(RegClass: DstRC);
2671 Register TmpReg1 = MRI->createVirtualRegister(RegClass: DstRC);
2672 Register ImmReg = MRI->createVirtualRegister(RegClass: DstRC);
2673 if (IsVALU) {
2674 BuildMI(BB&: *MBB, I, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::V_LSHLREV_B32_e64), DestReg: TmpReg0)
2675 .addImm(Val: 16)
2676 .addReg(RegNo: HiReg);
2677 } else {
2678 BuildMI(BB&: *MBB, I, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::S_LSHL_B32), DestReg: TmpReg0)
2679 .addReg(RegNo: HiReg)
2680 .addImm(Val: 16)
2681 .setOperandDead(3); // Dead scc
2682 }
2683
2684 unsigned MovOpc = IsVALU ? AMDGPU::V_MOV_B32_e32 : AMDGPU::S_MOV_B32;
2685 unsigned AndOpc = IsVALU ? AMDGPU::V_AND_B32_e64 : AMDGPU::S_AND_B32;
2686 unsigned OrOpc = IsVALU ? AMDGPU::V_OR_B32_e64 : AMDGPU::S_OR_B32;
2687
2688 BuildMI(BB&: *MBB, I, MIMD: DL, MCID: TII.get(Opcode: MovOpc), DestReg: ImmReg)
2689 .addImm(Val: 0xffff);
2690 auto And = BuildMI(BB&: *MBB, I, MIMD: DL, MCID: TII.get(Opcode: AndOpc), DestReg: TmpReg1)
2691 .addReg(RegNo: LoReg)
2692 .addReg(RegNo: ImmReg);
2693 auto Or = BuildMI(BB&: *MBB, I, MIMD: DL, MCID: TII.get(Opcode: OrOpc), DestReg: DstReg)
2694 .addReg(RegNo: TmpReg0)
2695 .addReg(RegNo: TmpReg1);
2696
2697 if (!IsVALU) {
2698 And.setOperandDead(3); // Dead scc
2699 Or.setOperandDead(3); // Dead scc
2700 }
2701 }
2702
2703 I.eraseFromParent();
2704 return true;
2705 }
2706
2707 if (!DstTy.isScalar())
2708 return false;
2709
2710 if (SrcSize > 32) {
2711 unsigned SubRegIdx = DstSize < 32
2712 ? static_cast<unsigned>(AMDGPU::sub0)
2713 : TRI.getSubRegFromChannel(Channel: 0, NumRegs: DstSize / 32);
2714 if (SubRegIdx == AMDGPU::NoSubRegister)
2715 return false;
2716
2717 // Deal with weird cases where the class only partially supports the subreg
2718 // index.
2719 const TargetRegisterClass *SrcWithSubRC
2720 = TRI.getSubClassWithSubReg(SrcRC, SubRegIdx);
2721 if (!SrcWithSubRC)
2722 return false;
2723
2724 if (SrcWithSubRC != SrcRC) {
2725 if (!RBI.constrainGenericRegister(Reg: SrcReg, RC: *SrcWithSubRC, MRI&: *MRI))
2726 return false;
2727 }
2728
2729 I.getOperand(i: 1).setSubReg(SubRegIdx);
2730 }
2731
2732 I.setDesc(TII.get(Opcode: TargetOpcode::COPY));
2733 return true;
2734}
2735
2736/// \returns true if a bitmask for \p Size bits will be an inline immediate.
2737static bool shouldUseAndMask(unsigned Size, unsigned &Mask) {
2738 Mask = maskTrailingOnes<unsigned>(N: Size);
2739 int SignedMask = static_cast<int>(Mask);
2740 return SignedMask >= -16 && SignedMask <= 64;
2741}
2742
2743// Like RegisterBankInfo::getRegBank, but don't assume vcc for s1.
2744const RegisterBank *AMDGPUInstructionSelector::getArtifactRegBank(
2745 Register Reg, const MachineRegisterInfo &MRI,
2746 const TargetRegisterInfo &TRI) const {
2747 const RegClassOrRegBank &RegClassOrBank = MRI.getRegClassOrRegBank(Reg);
2748 if (auto *RB = dyn_cast<const RegisterBank *>(Val: RegClassOrBank))
2749 return RB;
2750
2751 // Ignore the type, since we don't use vcc in artifacts.
2752 if (auto *RC = dyn_cast<const TargetRegisterClass *>(Val: RegClassOrBank))
2753 return &RBI.getRegBankFromRegClass(RC: *RC, LLT());
2754 return nullptr;
2755}
2756
2757bool AMDGPUInstructionSelector::selectG_SZA_EXT(MachineInstr &I) const {
2758 bool InReg = I.getOpcode() == AMDGPU::G_SEXT_INREG;
2759 bool Signed = I.getOpcode() == AMDGPU::G_SEXT || InReg;
2760 const DebugLoc &DL = I.getDebugLoc();
2761 MachineBasicBlock &MBB = *I.getParent();
2762 const Register DstReg = I.getOperand(i: 0).getReg();
2763 const Register SrcReg = I.getOperand(i: 1).getReg();
2764
2765 const LLT DstTy = MRI->getType(Reg: DstReg);
2766 const LLT SrcTy = MRI->getType(Reg: SrcReg);
2767 const unsigned SrcSize = I.getOpcode() == AMDGPU::G_SEXT_INREG ?
2768 I.getOperand(i: 2).getImm() : SrcTy.getSizeInBits();
2769 const unsigned DstSize = DstTy.getSizeInBits();
2770 if (!DstTy.isScalar())
2771 return false;
2772
2773 // Artifact casts should never use vcc.
2774 const RegisterBank *SrcBank = getArtifactRegBank(Reg: SrcReg, MRI: *MRI, TRI);
2775
2776 // FIXME: This should probably be illegal and split earlier.
2777 if (I.getOpcode() == AMDGPU::G_ANYEXT) {
2778 if (DstSize <= 32)
2779 return selectCOPY(I);
2780
2781 const TargetRegisterClass *SrcRC =
2782 TRI.getRegClassForTypeOnBank(Ty: SrcTy, Bank: *SrcBank);
2783 const RegisterBank *DstBank = RBI.getRegBank(Reg: DstReg, MRI: *MRI, TRI);
2784 const TargetRegisterClass *DstRC =
2785 TRI.getRegClassForSizeOnBank(Size: DstSize, Bank: *DstBank);
2786
2787 Register UndefReg = MRI->createVirtualRegister(RegClass: SrcRC);
2788 BuildMI(BB&: MBB, I, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::IMPLICIT_DEF), DestReg: UndefReg);
2789 BuildMI(BB&: MBB, I, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::REG_SEQUENCE), DestReg: DstReg)
2790 .addReg(RegNo: SrcReg)
2791 .addImm(Val: AMDGPU::sub0)
2792 .addReg(RegNo: UndefReg)
2793 .addImm(Val: AMDGPU::sub1);
2794 I.eraseFromParent();
2795
2796 return RBI.constrainGenericRegister(Reg: DstReg, RC: *DstRC, MRI&: *MRI) &&
2797 RBI.constrainGenericRegister(Reg: SrcReg, RC: *SrcRC, MRI&: *MRI);
2798 }
2799
2800 if (SrcBank->getID() == AMDGPU::VGPRRegBankID && DstSize <= 32) {
2801 // 64-bit should have been split up in RegBankSelect
2802
2803 // Try to use an and with a mask if it will save code size.
2804 unsigned Mask;
2805 if (!Signed && shouldUseAndMask(Size: SrcSize, Mask)) {
2806 MachineInstr *ExtI =
2807 BuildMI(BB&: MBB, I, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::V_AND_B32_e32), DestReg: DstReg)
2808 .addImm(Val: Mask)
2809 .addReg(RegNo: SrcReg);
2810 I.eraseFromParent();
2811 constrainSelectedInstRegOperands(I&: *ExtI, TII, TRI, RBI);
2812 return true;
2813 }
2814
2815 const unsigned BFE = Signed ? AMDGPU::V_BFE_I32_e64 : AMDGPU::V_BFE_U32_e64;
2816 MachineInstr *ExtI =
2817 BuildMI(BB&: MBB, I, MIMD: DL, MCID: TII.get(Opcode: BFE), DestReg: DstReg)
2818 .addReg(RegNo: SrcReg)
2819 .addImm(Val: 0) // Offset
2820 .addImm(Val: SrcSize); // Width
2821 I.eraseFromParent();
2822 constrainSelectedInstRegOperands(I&: *ExtI, TII, TRI, RBI);
2823 return true;
2824 }
2825
2826 if (SrcBank->getID() == AMDGPU::SGPRRegBankID && DstSize <= 64) {
2827 const TargetRegisterClass &SrcRC = InReg && DstSize > 32 ?
2828 AMDGPU::SReg_64RegClass : AMDGPU::SReg_32RegClass;
2829 if (!RBI.constrainGenericRegister(Reg: SrcReg, RC: SrcRC, MRI&: *MRI))
2830 return false;
2831
2832 if (Signed && DstSize == 32 && (SrcSize == 8 || SrcSize == 16)) {
2833 const unsigned SextOpc = SrcSize == 8 ?
2834 AMDGPU::S_SEXT_I32_I8 : AMDGPU::S_SEXT_I32_I16;
2835 BuildMI(BB&: MBB, I, MIMD: DL, MCID: TII.get(Opcode: SextOpc), DestReg: DstReg)
2836 .addReg(RegNo: SrcReg);
2837 I.eraseFromParent();
2838 return RBI.constrainGenericRegister(Reg: DstReg, RC: AMDGPU::SReg_32RegClass, MRI&: *MRI);
2839 }
2840
2841 // Using a single 32-bit SALU to calculate the high half is smaller than
2842 // S_BFE with a literal constant operand.
2843 if (DstSize > 32 && SrcSize == 32) {
2844 Register HiReg = MRI->createVirtualRegister(RegClass: &AMDGPU::SReg_32RegClass);
2845 unsigned SubReg = InReg ? AMDGPU::sub0 : AMDGPU::NoSubRegister;
2846 if (Signed) {
2847 BuildMI(BB&: MBB, I, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::S_ASHR_I32), DestReg: HiReg)
2848 .addReg(RegNo: SrcReg, Flags: {}, SubReg)
2849 .addImm(Val: 31)
2850 .setOperandDead(3); // Dead scc
2851 } else {
2852 BuildMI(BB&: MBB, I, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::S_MOV_B32), DestReg: HiReg)
2853 .addImm(Val: 0);
2854 }
2855 BuildMI(BB&: MBB, I, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::REG_SEQUENCE), DestReg: DstReg)
2856 .addReg(RegNo: SrcReg, Flags: {}, SubReg)
2857 .addImm(Val: AMDGPU::sub0)
2858 .addReg(RegNo: HiReg)
2859 .addImm(Val: AMDGPU::sub1);
2860 I.eraseFromParent();
2861 return RBI.constrainGenericRegister(Reg: DstReg, RC: AMDGPU::SReg_64RegClass,
2862 MRI&: *MRI);
2863 }
2864
2865 const unsigned BFE64 = Signed ? AMDGPU::S_BFE_I64 : AMDGPU::S_BFE_U64;
2866 const unsigned BFE32 = Signed ? AMDGPU::S_BFE_I32 : AMDGPU::S_BFE_U32;
2867
2868 // Scalar BFE is encoded as S1[5:0] = offset, S1[22:16]= width.
2869 if (DstSize > 32 && (SrcSize <= 32 || InReg)) {
2870 // We need a 64-bit register source, but the high bits don't matter.
2871 Register ExtReg = MRI->createVirtualRegister(RegClass: &AMDGPU::SReg_64RegClass);
2872 Register UndefReg = MRI->createVirtualRegister(RegClass: &AMDGPU::SReg_32RegClass);
2873 unsigned SubReg = InReg ? AMDGPU::sub0 : AMDGPU::NoSubRegister;
2874
2875 BuildMI(BB&: MBB, I, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::IMPLICIT_DEF), DestReg: UndefReg);
2876 BuildMI(BB&: MBB, I, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::REG_SEQUENCE), DestReg: ExtReg)
2877 .addReg(RegNo: SrcReg, Flags: {}, SubReg)
2878 .addImm(Val: AMDGPU::sub0)
2879 .addReg(RegNo: UndefReg)
2880 .addImm(Val: AMDGPU::sub1);
2881
2882 BuildMI(BB&: MBB, I, MIMD: DL, MCID: TII.get(Opcode: BFE64), DestReg: DstReg)
2883 .addReg(RegNo: ExtReg)
2884 .addImm(Val: SrcSize << 16);
2885
2886 I.eraseFromParent();
2887 return RBI.constrainGenericRegister(Reg: DstReg, RC: AMDGPU::SReg_64RegClass, MRI&: *MRI);
2888 }
2889
2890 unsigned Mask;
2891 if (!Signed && shouldUseAndMask(Size: SrcSize, Mask)) {
2892 BuildMI(BB&: MBB, I, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::S_AND_B32), DestReg: DstReg)
2893 .addReg(RegNo: SrcReg)
2894 .addImm(Val: Mask)
2895 .setOperandDead(3); // Dead scc
2896 } else {
2897 BuildMI(BB&: MBB, I, MIMD: DL, MCID: TII.get(Opcode: BFE32), DestReg: DstReg)
2898 .addReg(RegNo: SrcReg)
2899 .addImm(Val: SrcSize << 16);
2900 }
2901
2902 I.eraseFromParent();
2903 return RBI.constrainGenericRegister(Reg: DstReg, RC: AMDGPU::SReg_32RegClass, MRI&: *MRI);
2904 }
2905
2906 return false;
2907}
2908
2909static Register stripCopy(Register Reg, MachineRegisterInfo &MRI) {
2910 return getDefSrcRegIgnoringCopies(Reg, MRI)->Reg;
2911}
2912
2913static Register stripBitCast(Register Reg, MachineRegisterInfo &MRI) {
2914 Register BitcastSrc;
2915 if (mi_match(R: Reg, MRI, P: m_GBitcast(Src: m_Reg(R&: BitcastSrc))))
2916 Reg = BitcastSrc;
2917 return Reg;
2918}
2919
2920static bool isExtractHiElt(MachineRegisterInfo &MRI, Register In,
2921 Register &Out) {
2922 // When unmerging a register that is composed of 2 x 16-bit values allow to
2923 // use an extract hi instruction for the upper 16 bits. We only need to check
2924 // the size of `In` as all defs are guaranteed to be the same type for
2925 // GUnmerge.
2926 if (auto *Unmerge = dyn_cast<GUnmerge>(Val: MRI.getVRegDef(Reg: In))) {
2927 if (Unmerge->getNumDefs() == 2 && Unmerge->getOperand(i: 1).getReg() == In &&
2928 MRI.getType(Reg: In).getSizeInBits() == 16) {
2929 Out = Unmerge->getSourceReg();
2930 return true;
2931 }
2932 }
2933
2934 Register Trunc;
2935 if (!mi_match(R: In, MRI, P: m_GTrunc(Src: m_Reg(R&: Trunc))))
2936 return false;
2937
2938 Register LShlSrc;
2939 Register Cst;
2940 if (mi_match(R: Trunc, MRI, P: m_GLShr(L: m_Reg(R&: LShlSrc), R: m_Reg(R&: Cst)))) {
2941 Cst = stripCopy(Reg: Cst, MRI);
2942 if (mi_match(R: Cst, MRI, P: m_SpecificICst(RequestedValue: 16))) {
2943 Out = stripBitCast(Reg: LShlSrc, MRI);
2944 return true;
2945 }
2946 }
2947
2948 MachineInstr *Shuffle = MRI.getVRegDef(Reg: Trunc);
2949 if (Shuffle->getOpcode() != AMDGPU::G_SHUFFLE_VECTOR)
2950 return false;
2951
2952 assert(MRI.getType(Shuffle->getOperand(0).getReg()) ==
2953 LLT::fixed_vector(2, 16));
2954
2955 ArrayRef<int> Mask = Shuffle->getOperand(i: 3).getShuffleMask();
2956 assert(Mask.size() == 2);
2957
2958 if (Mask[0] == 1 && Mask[1] <= 1) {
2959 Out = Shuffle->getOperand(i: 0).getReg();
2960 return true;
2961 }
2962
2963 return false;
2964}
2965
2966bool AMDGPUInstructionSelector::selectG_FPEXT(MachineInstr &I) const {
2967 if (!Subtarget->hasSALUFloatInsts())
2968 return false;
2969
2970 Register Dst = I.getOperand(i: 0).getReg();
2971 const RegisterBank *DstRB = RBI.getRegBank(Reg: Dst, MRI: *MRI, TRI);
2972 if (DstRB->getID() != AMDGPU::SGPRRegBankID)
2973 return false;
2974
2975 Register Src = I.getOperand(i: 1).getReg();
2976
2977 if (MRI->getType(Reg: Dst) == LLT::scalar(SizeInBits: 32) &&
2978 MRI->getType(Reg: Src) == LLT::scalar(SizeInBits: 16)) {
2979 if (isExtractHiElt(MRI&: *MRI, In: Src, Out&: Src)) {
2980 MachineBasicBlock *BB = I.getParent();
2981 BuildMI(BB&: *BB, I: &I, MIMD: I.getDebugLoc(), MCID: TII.get(Opcode: AMDGPU::S_CVT_HI_F32_F16), DestReg: Dst)
2982 .addUse(RegNo: Src);
2983 I.eraseFromParent();
2984 return RBI.constrainGenericRegister(Reg: Dst, RC: AMDGPU::SReg_32RegClass, MRI&: *MRI);
2985 }
2986 }
2987
2988 return false;
2989}
2990
2991bool AMDGPUInstructionSelector::selectG_FNEG(MachineInstr &MI) const {
2992 // Only manually handle the f64 SGPR case.
2993 //
2994 // FIXME: This is a workaround for 2.5 different tablegen problems. Because
2995 // the bit ops theoretically have a second result due to the implicit def of
2996 // SCC, the GlobalISelEmitter is overly conservative and rejects it. Fixing
2997 // that is easy by disabling the check. The result works, but uses a
2998 // nonsensical sreg32orlds_and_sreg_1 regclass.
2999 //
3000 // The DAG emitter is more problematic, and incorrectly adds both S_XOR_B32 to
3001 // the variadic REG_SEQUENCE operands.
3002
3003 Register Dst = MI.getOperand(i: 0).getReg();
3004 const RegisterBank *DstRB = RBI.getRegBank(Reg: Dst, MRI: *MRI, TRI);
3005 if (DstRB->getID() != AMDGPU::SGPRRegBankID ||
3006 MRI->getType(Reg: Dst) != LLT::scalar(SizeInBits: 64))
3007 return false;
3008
3009 Register Src = MI.getOperand(i: 1).getReg();
3010 MachineInstr *Fabs = getOpcodeDef(Opcode: TargetOpcode::G_FABS, Reg: Src, MRI: *MRI);
3011 if (Fabs)
3012 Src = Fabs->getOperand(i: 1).getReg();
3013
3014 if (!RBI.constrainGenericRegister(Reg: Src, RC: AMDGPU::SReg_64RegClass, MRI&: *MRI) ||
3015 !RBI.constrainGenericRegister(Reg: Dst, RC: AMDGPU::SReg_64RegClass, MRI&: *MRI))
3016 return false;
3017
3018 MachineBasicBlock *BB = MI.getParent();
3019 const DebugLoc &DL = MI.getDebugLoc();
3020 Register LoReg = MRI->createVirtualRegister(RegClass: &AMDGPU::SReg_32RegClass);
3021 Register HiReg = MRI->createVirtualRegister(RegClass: &AMDGPU::SReg_32RegClass);
3022 Register ConstReg = MRI->createVirtualRegister(RegClass: &AMDGPU::SReg_32RegClass);
3023 Register OpReg = MRI->createVirtualRegister(RegClass: &AMDGPU::SReg_32RegClass);
3024
3025 BuildMI(BB&: *BB, I: &MI, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::COPY), DestReg: LoReg)
3026 .addReg(RegNo: Src, Flags: {}, SubReg: AMDGPU::sub0);
3027 BuildMI(BB&: *BB, I: &MI, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::COPY), DestReg: HiReg)
3028 .addReg(RegNo: Src, Flags: {}, SubReg: AMDGPU::sub1);
3029 BuildMI(BB&: *BB, I: &MI, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::S_MOV_B32), DestReg: ConstReg)
3030 .addImm(Val: 0x80000000);
3031
3032 // Set or toggle sign bit.
3033 unsigned Opc = Fabs ? AMDGPU::S_OR_B32 : AMDGPU::S_XOR_B32;
3034 BuildMI(BB&: *BB, I: &MI, MIMD: DL, MCID: TII.get(Opcode: Opc), DestReg: OpReg)
3035 .addReg(RegNo: HiReg)
3036 .addReg(RegNo: ConstReg)
3037 .setOperandDead(3); // Dead scc
3038 BuildMI(BB&: *BB, I: &MI, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::REG_SEQUENCE), DestReg: Dst)
3039 .addReg(RegNo: LoReg)
3040 .addImm(Val: AMDGPU::sub0)
3041 .addReg(RegNo: OpReg)
3042 .addImm(Val: AMDGPU::sub1);
3043 MI.eraseFromParent();
3044 return true;
3045}
3046
3047// FIXME: This is a workaround for the same tablegen problems as G_FNEG
3048bool AMDGPUInstructionSelector::selectG_FABS(MachineInstr &MI) const {
3049 Register Dst = MI.getOperand(i: 0).getReg();
3050 const RegisterBank *DstRB = RBI.getRegBank(Reg: Dst, MRI: *MRI, TRI);
3051 if (DstRB->getID() != AMDGPU::SGPRRegBankID ||
3052 MRI->getType(Reg: Dst) != LLT::scalar(SizeInBits: 64))
3053 return false;
3054
3055 Register Src = MI.getOperand(i: 1).getReg();
3056 MachineBasicBlock *BB = MI.getParent();
3057 const DebugLoc &DL = MI.getDebugLoc();
3058 Register LoReg = MRI->createVirtualRegister(RegClass: &AMDGPU::SReg_32RegClass);
3059 Register HiReg = MRI->createVirtualRegister(RegClass: &AMDGPU::SReg_32RegClass);
3060 Register ConstReg = MRI->createVirtualRegister(RegClass: &AMDGPU::SReg_32RegClass);
3061 Register OpReg = MRI->createVirtualRegister(RegClass: &AMDGPU::SReg_32RegClass);
3062
3063 if (!RBI.constrainGenericRegister(Reg: Src, RC: AMDGPU::SReg_64RegClass, MRI&: *MRI) ||
3064 !RBI.constrainGenericRegister(Reg: Dst, RC: AMDGPU::SReg_64RegClass, MRI&: *MRI))
3065 return false;
3066
3067 BuildMI(BB&: *BB, I: &MI, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::COPY), DestReg: LoReg)
3068 .addReg(RegNo: Src, Flags: {}, SubReg: AMDGPU::sub0);
3069 BuildMI(BB&: *BB, I: &MI, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::COPY), DestReg: HiReg)
3070 .addReg(RegNo: Src, Flags: {}, SubReg: AMDGPU::sub1);
3071 BuildMI(BB&: *BB, I: &MI, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::S_MOV_B32), DestReg: ConstReg)
3072 .addImm(Val: 0x7fffffff);
3073
3074 // Clear sign bit.
3075 // TODO: Should this used S_BITSET0_*?
3076 BuildMI(BB&: *BB, I: &MI, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::S_AND_B32), DestReg: OpReg)
3077 .addReg(RegNo: HiReg)
3078 .addReg(RegNo: ConstReg)
3079 .setOperandDead(3); // Dead scc
3080 BuildMI(BB&: *BB, I: &MI, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::REG_SEQUENCE), DestReg: Dst)
3081 .addReg(RegNo: LoReg)
3082 .addImm(Val: AMDGPU::sub0)
3083 .addReg(RegNo: OpReg)
3084 .addImm(Val: AMDGPU::sub1);
3085
3086 MI.eraseFromParent();
3087 return true;
3088}
3089
3090static bool isConstant(const MachineInstr &MI) {
3091 return MI.getOpcode() == TargetOpcode::G_CONSTANT;
3092}
3093
3094void AMDGPUInstructionSelector::getAddrModeInfo(const MachineInstr &Load,
3095 const MachineRegisterInfo &MRI, SmallVectorImpl<GEPInfo> &AddrInfo) const {
3096
3097 unsigned OpNo = Load.getOpcode() == AMDGPU::G_PREFETCH ? 0 : 1;
3098 const MachineInstr *PtrMI =
3099 MRI.getUniqueVRegDef(Reg: Load.getOperand(i: OpNo).getReg());
3100
3101 assert(PtrMI);
3102
3103 if (PtrMI->getOpcode() != TargetOpcode::G_PTR_ADD)
3104 return;
3105
3106 GEPInfo GEPInfo;
3107
3108 for (unsigned i = 1; i != 3; ++i) {
3109 const MachineOperand &GEPOp = PtrMI->getOperand(i);
3110 const MachineInstr *OpDef = MRI.getUniqueVRegDef(Reg: GEPOp.getReg());
3111 assert(OpDef);
3112 if (i == 2 && isConstant(MI: *OpDef)) {
3113 // TODO: Could handle constant base + variable offset, but a combine
3114 // probably should have commuted it.
3115 assert(GEPInfo.Imm == 0);
3116 GEPInfo.Imm = OpDef->getOperand(i: 1).getCImm()->getSExtValue();
3117 continue;
3118 }
3119 const RegisterBank *OpBank = RBI.getRegBank(Reg: GEPOp.getReg(), MRI, TRI);
3120 if (OpBank->getID() == AMDGPU::SGPRRegBankID)
3121 GEPInfo.SgprParts.push_back(Elt: GEPOp.getReg());
3122 else
3123 GEPInfo.VgprParts.push_back(Elt: GEPOp.getReg());
3124 }
3125
3126 AddrInfo.push_back(Elt: GEPInfo);
3127 getAddrModeInfo(Load: *PtrMI, MRI, AddrInfo);
3128}
3129
3130bool AMDGPUInstructionSelector::isSGPR(Register Reg) const {
3131 return RBI.getRegBank(Reg, MRI: *MRI, TRI)->getID() == AMDGPU::SGPRRegBankID;
3132}
3133
3134bool AMDGPUInstructionSelector::isInstrUniform(const MachineInstr &MI) const {
3135 if (!MI.hasOneMemOperand())
3136 return false;
3137
3138 const MachineMemOperand *MMO = *MI.memoperands_begin();
3139 const Value *Ptr = MMO->getValue();
3140
3141 // UndefValue means this is a load of a kernel input. These are uniform.
3142 // Sometimes LDS instructions have constant pointers.
3143 // If Ptr is null, then that means this mem operand contains a
3144 // PseudoSourceValue like GOT.
3145 if (!Ptr || isa<UndefValue, Argument, Constant, GlobalValue>(Val: Ptr))
3146 return true;
3147
3148 if (MMO->getAddrSpace() == AMDGPUAS::CONSTANT_ADDRESS_32BIT)
3149 return true;
3150
3151 if (MI.getOpcode() == AMDGPU::G_PREFETCH)
3152 return RBI.getRegBank(Reg: MI.getOperand(i: 0).getReg(), MRI: *MRI, TRI)->getID() ==
3153 AMDGPU::SGPRRegBankID;
3154
3155 const Instruction *I = dyn_cast<Instruction>(Val: Ptr);
3156 return I && I->getMetadata(Kind: "amdgpu.uniform");
3157}
3158
3159bool AMDGPUInstructionSelector::hasVgprParts(ArrayRef<GEPInfo> AddrInfo) const {
3160 for (const GEPInfo &GEPInfo : AddrInfo) {
3161 if (!GEPInfo.VgprParts.empty())
3162 return true;
3163 }
3164 return false;
3165}
3166
3167void AMDGPUInstructionSelector::initM0(MachineInstr &I) const {
3168 const LLT PtrTy = MRI->getType(Reg: I.getOperand(i: 1).getReg());
3169 unsigned AS = PtrTy.getAddressSpace();
3170 if ((AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) &&
3171 STI.ldsRequiresM0Init()) {
3172 MachineBasicBlock *BB = I.getParent();
3173
3174 // If DS instructions require M0 initialization, insert it before selecting.
3175 BuildMI(BB&: *BB, I: &I, MIMD: I.getDebugLoc(), MCID: TII.get(Opcode: AMDGPU::S_MOV_B32), DestReg: AMDGPU::M0)
3176 .addImm(Val: -1);
3177 }
3178}
3179
3180bool AMDGPUInstructionSelector::selectG_LOAD_STORE_ATOMICRMW(
3181 MachineInstr &I) const {
3182 initM0(I);
3183 return selectImpl(I, CoverageInfo&: *CoverageInfo);
3184}
3185
3186static bool isVCmpResult(Register Reg, MachineRegisterInfo &MRI) {
3187 if (Reg.isPhysical())
3188 return false;
3189
3190 MachineInstr &MI = *MRI.getUniqueVRegDef(Reg);
3191 const unsigned Opcode = MI.getOpcode();
3192
3193 if (Opcode == AMDGPU::COPY)
3194 return isVCmpResult(Reg: MI.getOperand(i: 1).getReg(), MRI);
3195
3196 if (Opcode == AMDGPU::G_AND || Opcode == AMDGPU::G_OR ||
3197 Opcode == AMDGPU::G_XOR)
3198 return isVCmpResult(Reg: MI.getOperand(i: 1).getReg(), MRI) &&
3199 isVCmpResult(Reg: MI.getOperand(i: 2).getReg(), MRI);
3200
3201 if (auto *GI = dyn_cast<GIntrinsic>(Val: &MI))
3202 return GI->is(ID: Intrinsic::amdgcn_class);
3203
3204 return Opcode == AMDGPU::G_ICMP || Opcode == AMDGPU::G_FCMP;
3205}
3206
3207bool AMDGPUInstructionSelector::selectG_BRCOND(MachineInstr &I) const {
3208 MachineBasicBlock *BB = I.getParent();
3209 MachineOperand &CondOp = I.getOperand(i: 0);
3210 Register CondReg = CondOp.getReg();
3211 const DebugLoc &DL = I.getDebugLoc();
3212
3213 unsigned BrOpcode;
3214 Register CondPhysReg;
3215 const TargetRegisterClass *ConstrainRC;
3216
3217 // In SelectionDAG, we inspect the IR block for uniformity metadata to decide
3218 // whether the branch is uniform when selecting the instruction. In
3219 // GlobalISel, we should push that decision into RegBankSelect. Assume for now
3220 // RegBankSelect knows what it's doing if the branch condition is scc, even
3221 // though it currently does not.
3222 if (!isVCC(Reg: CondReg, MRI: *MRI)) {
3223 if (MRI->getType(Reg: CondReg) != LLT::scalar(SizeInBits: 32))
3224 return false;
3225
3226 CondPhysReg = AMDGPU::SCC;
3227 BrOpcode = AMDGPU::S_CBRANCH_SCC1;
3228 ConstrainRC = &AMDGPU::SReg_32RegClass;
3229 } else {
3230 // FIXME: Should scc->vcc copies and with exec?
3231
3232 // Unless the value of CondReg is a result of a V_CMP* instruction then we
3233 // need to insert an and with exec.
3234 if (!isVCmpResult(Reg: CondReg, MRI&: *MRI)) {
3235 const bool Is64 = STI.isWave64();
3236 const unsigned Opcode = Is64 ? AMDGPU::S_AND_B64 : AMDGPU::S_AND_B32;
3237 const Register Exec = Is64 ? AMDGPU::EXEC : AMDGPU::EXEC_LO;
3238
3239 Register TmpReg = MRI->createVirtualRegister(RegClass: TRI.getBoolRC());
3240 BuildMI(BB&: *BB, I: &I, MIMD: DL, MCID: TII.get(Opcode), DestReg: TmpReg)
3241 .addReg(RegNo: CondReg)
3242 .addReg(RegNo: Exec)
3243 .setOperandDead(3); // Dead scc
3244 CondReg = TmpReg;
3245 }
3246
3247 CondPhysReg = TRI.getVCC();
3248 BrOpcode = AMDGPU::S_CBRANCH_VCCNZ;
3249 ConstrainRC = TRI.getBoolRC();
3250 }
3251
3252 if (!MRI->getRegClassOrNull(Reg: CondReg))
3253 MRI->setRegClass(Reg: CondReg, RC: ConstrainRC);
3254
3255 BuildMI(BB&: *BB, I: &I, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::COPY), DestReg: CondPhysReg)
3256 .addReg(RegNo: CondReg);
3257 BuildMI(BB&: *BB, I: &I, MIMD: DL, MCID: TII.get(Opcode: BrOpcode))
3258 .addMBB(MBB: I.getOperand(i: 1).getMBB());
3259
3260 I.eraseFromParent();
3261 return true;
3262}
3263
3264bool AMDGPUInstructionSelector::selectG_GLOBAL_VALUE(
3265 MachineInstr &I) const {
3266 Register DstReg = I.getOperand(i: 0).getReg();
3267 const RegisterBank *DstRB = RBI.getRegBank(Reg: DstReg, MRI: *MRI, TRI);
3268 const bool IsVGPR = DstRB->getID() == AMDGPU::VGPRRegBankID;
3269 I.setDesc(TII.get(Opcode: IsVGPR ? AMDGPU::V_MOV_B32_e32 : AMDGPU::S_MOV_B32));
3270 if (IsVGPR)
3271 I.addOperand(MF&: *MF, Op: MachineOperand::CreateReg(Reg: AMDGPU::EXEC, isDef: false, isImp: true));
3272
3273 return RBI.constrainGenericRegister(
3274 Reg: DstReg, RC: IsVGPR ? AMDGPU::VGPR_32RegClass : AMDGPU::SReg_32RegClass, MRI&: *MRI);
3275}
3276
3277bool AMDGPUInstructionSelector::selectG_PTRMASK(MachineInstr &I) const {
3278 Register DstReg = I.getOperand(i: 0).getReg();
3279 Register SrcReg = I.getOperand(i: 1).getReg();
3280 Register MaskReg = I.getOperand(i: 2).getReg();
3281 LLT Ty = MRI->getType(Reg: DstReg);
3282 LLT MaskTy = MRI->getType(Reg: MaskReg);
3283 MachineBasicBlock *BB = I.getParent();
3284 const DebugLoc &DL = I.getDebugLoc();
3285
3286 const RegisterBank *DstRB = RBI.getRegBank(Reg: DstReg, MRI: *MRI, TRI);
3287 const RegisterBank *SrcRB = RBI.getRegBank(Reg: SrcReg, MRI: *MRI, TRI);
3288 const RegisterBank *MaskRB = RBI.getRegBank(Reg: MaskReg, MRI: *MRI, TRI);
3289 const bool IsVGPR = DstRB->getID() == AMDGPU::VGPRRegBankID;
3290 if (DstRB != SrcRB) // Should only happen for hand written MIR.
3291 return false;
3292
3293 // Try to avoid emitting a bit operation when we only need to touch half of
3294 // the 64-bit pointer.
3295 APInt MaskOnes = VT->getKnownOnes(R: MaskReg).zext(width: 64);
3296 const APInt MaskHi32 = APInt::getHighBitsSet(numBits: 64, hiBitsSet: 32);
3297 const APInt MaskLo32 = APInt::getLowBitsSet(numBits: 64, loBitsSet: 32);
3298
3299 const bool CanCopyLow32 = (MaskOnes & MaskLo32) == MaskLo32;
3300 const bool CanCopyHi32 = (MaskOnes & MaskHi32) == MaskHi32;
3301
3302 if (!IsVGPR && Ty.getSizeInBits() == 64 &&
3303 !CanCopyLow32 && !CanCopyHi32) {
3304 auto MIB = BuildMI(BB&: *BB, I: &I, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::S_AND_B64), DestReg: DstReg)
3305 .addReg(RegNo: SrcReg)
3306 .addReg(RegNo: MaskReg)
3307 .setOperandDead(3); // Dead scc
3308 I.eraseFromParent();
3309 constrainSelectedInstRegOperands(I&: *MIB, TII, TRI, RBI);
3310 return true;
3311 }
3312
3313 unsigned NewOpc = IsVGPR ? AMDGPU::V_AND_B32_e64 : AMDGPU::S_AND_B32;
3314 const TargetRegisterClass &RegRC
3315 = IsVGPR ? AMDGPU::VGPR_32RegClass : AMDGPU::SReg_32RegClass;
3316
3317 const TargetRegisterClass *DstRC = TRI.getRegClassForTypeOnBank(Ty, Bank: *DstRB);
3318 const TargetRegisterClass *SrcRC = TRI.getRegClassForTypeOnBank(Ty, Bank: *SrcRB);
3319 const TargetRegisterClass *MaskRC =
3320 TRI.getRegClassForTypeOnBank(Ty: MaskTy, Bank: *MaskRB);
3321
3322 if (!RBI.constrainGenericRegister(Reg: DstReg, RC: *DstRC, MRI&: *MRI) ||
3323 !RBI.constrainGenericRegister(Reg: SrcReg, RC: *SrcRC, MRI&: *MRI) ||
3324 !RBI.constrainGenericRegister(Reg: MaskReg, RC: *MaskRC, MRI&: *MRI))
3325 return false;
3326
3327 if (Ty.getSizeInBits() == 32) {
3328 assert(MaskTy.getSizeInBits() == 32 &&
3329 "ptrmask should have been narrowed during legalize");
3330
3331 auto NewOp = BuildMI(BB&: *BB, I: &I, MIMD: DL, MCID: TII.get(Opcode: NewOpc), DestReg: DstReg)
3332 .addReg(RegNo: SrcReg)
3333 .addReg(RegNo: MaskReg);
3334
3335 if (!IsVGPR)
3336 NewOp.setOperandDead(3); // Dead scc
3337 I.eraseFromParent();
3338 return true;
3339 }
3340
3341 Register HiReg = MRI->createVirtualRegister(RegClass: &RegRC);
3342 Register LoReg = MRI->createVirtualRegister(RegClass: &RegRC);
3343
3344 // Extract the subregisters from the source pointer.
3345 BuildMI(BB&: *BB, I: &I, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::COPY), DestReg: LoReg)
3346 .addReg(RegNo: SrcReg, Flags: {}, SubReg: AMDGPU::sub0);
3347 BuildMI(BB&: *BB, I: &I, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::COPY), DestReg: HiReg)
3348 .addReg(RegNo: SrcReg, Flags: {}, SubReg: AMDGPU::sub1);
3349
3350 Register MaskedLo, MaskedHi;
3351
3352 if (CanCopyLow32) {
3353 // If all the bits in the low half are 1, we only need a copy for it.
3354 MaskedLo = LoReg;
3355 } else {
3356 // Extract the mask subregister and apply the and.
3357 Register MaskLo = MRI->createVirtualRegister(RegClass: &RegRC);
3358 MaskedLo = MRI->createVirtualRegister(RegClass: &RegRC);
3359
3360 BuildMI(BB&: *BB, I: &I, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::COPY), DestReg: MaskLo)
3361 .addReg(RegNo: MaskReg, Flags: {}, SubReg: AMDGPU::sub0);
3362 BuildMI(BB&: *BB, I: &I, MIMD: DL, MCID: TII.get(Opcode: NewOpc), DestReg: MaskedLo)
3363 .addReg(RegNo: LoReg)
3364 .addReg(RegNo: MaskLo);
3365 }
3366
3367 if (CanCopyHi32) {
3368 // If all the bits in the high half are 1, we only need a copy for it.
3369 MaskedHi = HiReg;
3370 } else {
3371 Register MaskHi = MRI->createVirtualRegister(RegClass: &RegRC);
3372 MaskedHi = MRI->createVirtualRegister(RegClass: &RegRC);
3373
3374 BuildMI(BB&: *BB, I: &I, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::COPY), DestReg: MaskHi)
3375 .addReg(RegNo: MaskReg, Flags: {}, SubReg: AMDGPU::sub1);
3376 BuildMI(BB&: *BB, I: &I, MIMD: DL, MCID: TII.get(Opcode: NewOpc), DestReg: MaskedHi)
3377 .addReg(RegNo: HiReg)
3378 .addReg(RegNo: MaskHi);
3379 }
3380
3381 BuildMI(BB&: *BB, I: &I, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::REG_SEQUENCE), DestReg: DstReg)
3382 .addReg(RegNo: MaskedLo)
3383 .addImm(Val: AMDGPU::sub0)
3384 .addReg(RegNo: MaskedHi)
3385 .addImm(Val: AMDGPU::sub1);
3386 I.eraseFromParent();
3387 return true;
3388}
3389
3390/// Return the register to use for the index value, and the subregister to use
3391/// for the indirectly accessed register.
3392static std::pair<Register, unsigned>
3393computeIndirectRegIndex(MachineRegisterInfo &MRI, const SIRegisterInfo &TRI,
3394 const TargetRegisterClass *SuperRC, Register IdxReg,
3395 unsigned EltSize, GISelValueTracking &ValueTracking) {
3396 Register IdxBaseReg;
3397 int Offset;
3398
3399 std::tie(args&: IdxBaseReg, args&: Offset) =
3400 AMDGPU::getBaseWithConstantOffset(MRI, Reg: IdxReg, ValueTracking: &ValueTracking);
3401 if (IdxBaseReg == AMDGPU::NoRegister) {
3402 // This will happen if the index is a known constant. This should ordinarily
3403 // be legalized out, but handle it as a register just in case.
3404 assert(Offset == 0);
3405 IdxBaseReg = IdxReg;
3406 }
3407
3408 ArrayRef<int16_t> SubRegs = TRI.getRegSplitParts(RC: SuperRC, EltSize);
3409
3410 // Skip out of bounds offsets, or else we would end up using an undefined
3411 // register.
3412 if (static_cast<unsigned>(Offset) >= SubRegs.size())
3413 return std::pair(IdxReg, SubRegs[0]);
3414 return std::pair(IdxBaseReg, SubRegs[Offset]);
3415}
3416
3417bool AMDGPUInstructionSelector::selectG_EXTRACT_VECTOR_ELT(
3418 MachineInstr &MI) const {
3419 Register DstReg = MI.getOperand(i: 0).getReg();
3420 Register SrcReg = MI.getOperand(i: 1).getReg();
3421 Register IdxReg = MI.getOperand(i: 2).getReg();
3422
3423 LLT DstTy = MRI->getType(Reg: DstReg);
3424 LLT SrcTy = MRI->getType(Reg: SrcReg);
3425
3426 const RegisterBank *DstRB = RBI.getRegBank(Reg: DstReg, MRI: *MRI, TRI);
3427 const RegisterBank *SrcRB = RBI.getRegBank(Reg: SrcReg, MRI: *MRI, TRI);
3428 const RegisterBank *IdxRB = RBI.getRegBank(Reg: IdxReg, MRI: *MRI, TRI);
3429
3430 // The index must be scalar. If it wasn't RegBankSelect should have moved this
3431 // into a waterfall loop.
3432 if (IdxRB->getID() != AMDGPU::SGPRRegBankID)
3433 return false;
3434
3435 const TargetRegisterClass *SrcRC =
3436 TRI.getRegClassForTypeOnBank(Ty: SrcTy, Bank: *SrcRB);
3437 const TargetRegisterClass *DstRC =
3438 TRI.getRegClassForTypeOnBank(Ty: DstTy, Bank: *DstRB);
3439 if (!SrcRC || !DstRC)
3440 return false;
3441 if (!RBI.constrainGenericRegister(Reg: SrcReg, RC: *SrcRC, MRI&: *MRI) ||
3442 !RBI.constrainGenericRegister(Reg: DstReg, RC: *DstRC, MRI&: *MRI) ||
3443 !RBI.constrainGenericRegister(Reg: IdxReg, RC: AMDGPU::SReg_32RegClass, MRI&: *MRI))
3444 return false;
3445
3446 MachineBasicBlock *BB = MI.getParent();
3447 const DebugLoc &DL = MI.getDebugLoc();
3448 const bool Is64 = DstTy.getSizeInBits() == 64;
3449
3450 unsigned SubReg;
3451 std::tie(args&: IdxReg, args&: SubReg) = computeIndirectRegIndex(
3452 MRI&: *MRI, TRI, SuperRC: SrcRC, IdxReg, EltSize: DstTy.getSizeInBits() / 8, ValueTracking&: *VT);
3453
3454 if (SrcRB->getID() == AMDGPU::SGPRRegBankID) {
3455 if (DstTy.getSizeInBits() != 32 && !Is64)
3456 return false;
3457
3458 BuildMI(BB&: *BB, I: &MI, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::COPY), DestReg: AMDGPU::M0)
3459 .addReg(RegNo: IdxReg);
3460
3461 unsigned Opc = Is64 ? AMDGPU::S_MOVRELS_B64 : AMDGPU::S_MOVRELS_B32;
3462 BuildMI(BB&: *BB, I: &MI, MIMD: DL, MCID: TII.get(Opcode: Opc), DestReg: DstReg)
3463 .addReg(RegNo: SrcReg, Flags: {}, SubReg)
3464 .addReg(RegNo: SrcReg, Flags: RegState::Implicit);
3465 MI.eraseFromParent();
3466 return true;
3467 }
3468
3469 if (SrcRB->getID() != AMDGPU::VGPRRegBankID || DstTy.getSizeInBits() != 32)
3470 return false;
3471
3472 if (!STI.useVGPRIndexMode()) {
3473 BuildMI(BB&: *BB, I: &MI, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::COPY), DestReg: AMDGPU::M0)
3474 .addReg(RegNo: IdxReg);
3475 BuildMI(BB&: *BB, I: &MI, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::V_MOVRELS_B32_e32), DestReg: DstReg)
3476 .addReg(RegNo: SrcReg, Flags: {}, SubReg)
3477 .addReg(RegNo: SrcReg, Flags: RegState::Implicit);
3478 MI.eraseFromParent();
3479 return true;
3480 }
3481
3482 const MCInstrDesc &GPRIDXDesc =
3483 TII.getIndirectGPRIDXPseudo(VecSize: TRI.getRegSizeInBits(RC: *SrcRC), IsIndirectSrc: true);
3484 BuildMI(BB&: *BB, I&: MI, MIMD: DL, MCID: GPRIDXDesc, DestReg: DstReg)
3485 .addReg(RegNo: SrcReg)
3486 .addReg(RegNo: IdxReg)
3487 .addImm(Val: SubReg);
3488
3489 MI.eraseFromParent();
3490 return true;
3491}
3492
3493// TODO: Fold insert_vector_elt (extract_vector_elt) into movrelsd
3494bool AMDGPUInstructionSelector::selectG_INSERT_VECTOR_ELT(
3495 MachineInstr &MI) const {
3496 Register DstReg = MI.getOperand(i: 0).getReg();
3497 Register VecReg = MI.getOperand(i: 1).getReg();
3498 Register ValReg = MI.getOperand(i: 2).getReg();
3499 Register IdxReg = MI.getOperand(i: 3).getReg();
3500
3501 LLT VecTy = MRI->getType(Reg: DstReg);
3502 LLT ValTy = MRI->getType(Reg: ValReg);
3503 unsigned VecSize = VecTy.getSizeInBits();
3504 unsigned ValSize = ValTy.getSizeInBits();
3505
3506 const RegisterBank *VecRB = RBI.getRegBank(Reg: VecReg, MRI: *MRI, TRI);
3507 const RegisterBank *ValRB = RBI.getRegBank(Reg: ValReg, MRI: *MRI, TRI);
3508 const RegisterBank *IdxRB = RBI.getRegBank(Reg: IdxReg, MRI: *MRI, TRI);
3509
3510 assert(VecTy.getElementType() == ValTy);
3511
3512 // The index must be scalar. If it wasn't RegBankSelect should have moved this
3513 // into a waterfall loop.
3514 if (IdxRB->getID() != AMDGPU::SGPRRegBankID)
3515 return false;
3516
3517 const TargetRegisterClass *VecRC =
3518 TRI.getRegClassForTypeOnBank(Ty: VecTy, Bank: *VecRB);
3519 const TargetRegisterClass *ValRC =
3520 TRI.getRegClassForTypeOnBank(Ty: ValTy, Bank: *ValRB);
3521
3522 if (!RBI.constrainGenericRegister(Reg: VecReg, RC: *VecRC, MRI&: *MRI) ||
3523 !RBI.constrainGenericRegister(Reg: DstReg, RC: *VecRC, MRI&: *MRI) ||
3524 !RBI.constrainGenericRegister(Reg: ValReg, RC: *ValRC, MRI&: *MRI) ||
3525 !RBI.constrainGenericRegister(Reg: IdxReg, RC: AMDGPU::SReg_32RegClass, MRI&: *MRI))
3526 return false;
3527
3528 if (VecRB->getID() == AMDGPU::VGPRRegBankID && ValSize != 32)
3529 return false;
3530
3531 unsigned SubReg;
3532 std::tie(args&: IdxReg, args&: SubReg) =
3533 computeIndirectRegIndex(MRI&: *MRI, TRI, SuperRC: VecRC, IdxReg, EltSize: ValSize / 8, ValueTracking&: *VT);
3534
3535 const bool IndexMode = VecRB->getID() == AMDGPU::VGPRRegBankID &&
3536 STI.useVGPRIndexMode();
3537
3538 MachineBasicBlock *BB = MI.getParent();
3539 const DebugLoc &DL = MI.getDebugLoc();
3540
3541 if (!IndexMode) {
3542 BuildMI(BB&: *BB, I: &MI, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::COPY), DestReg: AMDGPU::M0)
3543 .addReg(RegNo: IdxReg);
3544
3545 const MCInstrDesc &RegWriteOp = TII.getIndirectRegWriteMovRelPseudo(
3546 VecSize, EltSize: ValSize, IsSGPR: VecRB->getID() == AMDGPU::SGPRRegBankID);
3547 BuildMI(BB&: *BB, I&: MI, MIMD: DL, MCID: RegWriteOp, DestReg: DstReg)
3548 .addReg(RegNo: VecReg)
3549 .addReg(RegNo: ValReg)
3550 .addImm(Val: SubReg);
3551 MI.eraseFromParent();
3552 return true;
3553 }
3554
3555 const MCInstrDesc &GPRIDXDesc =
3556 TII.getIndirectGPRIDXPseudo(VecSize: TRI.getRegSizeInBits(RC: *VecRC), IsIndirectSrc: false);
3557 BuildMI(BB&: *BB, I&: MI, MIMD: DL, MCID: GPRIDXDesc, DestReg: DstReg)
3558 .addReg(RegNo: VecReg)
3559 .addReg(RegNo: ValReg)
3560 .addReg(RegNo: IdxReg)
3561 .addImm(Val: SubReg);
3562
3563 MI.eraseFromParent();
3564 return true;
3565}
3566
3567static bool isAsyncLDSDMA(Intrinsic::ID Intr) {
3568 switch (Intr) {
3569 case Intrinsic::amdgcn_raw_buffer_load_async_lds:
3570 case Intrinsic::amdgcn_raw_ptr_buffer_load_async_lds:
3571 case Intrinsic::amdgcn_struct_buffer_load_async_lds:
3572 case Intrinsic::amdgcn_struct_ptr_buffer_load_async_lds:
3573 case Intrinsic::amdgcn_load_async_to_lds:
3574 case Intrinsic::amdgcn_global_load_async_lds:
3575 return true;
3576 }
3577 return false;
3578}
3579
3580bool AMDGPUInstructionSelector::selectBufferLoadLds(MachineInstr &MI) const {
3581 if (!Subtarget->hasVMemToLDSLoad())
3582 return false;
3583 unsigned Opc;
3584 unsigned Size = MI.getOperand(i: 3).getImm();
3585 Intrinsic::ID IntrinsicID = cast<GIntrinsic>(Val&: MI).getIntrinsicID();
3586
3587 // The struct intrinsic variants add one additional operand over raw.
3588 const bool HasVIndex = MI.getNumOperands() == 9;
3589 Register VIndex;
3590 int OpOffset = 0;
3591 if (HasVIndex) {
3592 VIndex = MI.getOperand(i: 4).getReg();
3593 OpOffset = 1;
3594 }
3595
3596 Register VOffset = MI.getOperand(i: 4 + OpOffset).getReg();
3597 std::optional<ValueAndVReg> MaybeVOffset =
3598 getIConstantVRegValWithLookThrough(VReg: VOffset, MRI: *MRI);
3599 const bool HasVOffset = !MaybeVOffset || MaybeVOffset->Value.getZExtValue();
3600
3601 switch (Size) {
3602 default:
3603 return false;
3604 case 1:
3605 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_UBYTE_LDS_BOTHEN
3606 : AMDGPU::BUFFER_LOAD_UBYTE_LDS_IDXEN
3607 : HasVOffset ? AMDGPU::BUFFER_LOAD_UBYTE_LDS_OFFEN
3608 : AMDGPU::BUFFER_LOAD_UBYTE_LDS_OFFSET;
3609 break;
3610 case 2:
3611 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_USHORT_LDS_BOTHEN
3612 : AMDGPU::BUFFER_LOAD_USHORT_LDS_IDXEN
3613 : HasVOffset ? AMDGPU::BUFFER_LOAD_USHORT_LDS_OFFEN
3614 : AMDGPU::BUFFER_LOAD_USHORT_LDS_OFFSET;
3615 break;
3616 case 4:
3617 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_DWORD_LDS_BOTHEN
3618 : AMDGPU::BUFFER_LOAD_DWORD_LDS_IDXEN
3619 : HasVOffset ? AMDGPU::BUFFER_LOAD_DWORD_LDS_OFFEN
3620 : AMDGPU::BUFFER_LOAD_DWORD_LDS_OFFSET;
3621 break;
3622 case 12:
3623 if (!Subtarget->hasLDSLoadB96_B128())
3624 return false;
3625
3626 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX3_LDS_BOTHEN
3627 : AMDGPU::BUFFER_LOAD_DWORDX3_LDS_IDXEN
3628 : HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX3_LDS_OFFEN
3629 : AMDGPU::BUFFER_LOAD_DWORDX3_LDS_OFFSET;
3630 break;
3631 case 16:
3632 if (!Subtarget->hasLDSLoadB96_B128())
3633 return false;
3634
3635 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX4_LDS_BOTHEN
3636 : AMDGPU::BUFFER_LOAD_DWORDX4_LDS_IDXEN
3637 : HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX4_LDS_OFFEN
3638 : AMDGPU::BUFFER_LOAD_DWORDX4_LDS_OFFSET;
3639 break;
3640 }
3641
3642 MachineBasicBlock *MBB = MI.getParent();
3643 const DebugLoc &DL = MI.getDebugLoc();
3644 BuildMI(BB&: *MBB, I: &MI, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::COPY), DestReg: AMDGPU::M0)
3645 .add(MO: MI.getOperand(i: 2));
3646
3647 auto MIB = BuildMI(BB&: *MBB, I: &MI, MIMD: DL, MCID: TII.get(Opcode: Opc));
3648
3649 if (HasVIndex && HasVOffset) {
3650 Register IdxReg = MRI->createVirtualRegister(RegClass: TRI.getVGPR64Class());
3651 BuildMI(BB&: *MBB, I: &*MIB, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::REG_SEQUENCE), DestReg: IdxReg)
3652 .addReg(RegNo: VIndex)
3653 .addImm(Val: AMDGPU::sub0)
3654 .addReg(RegNo: VOffset)
3655 .addImm(Val: AMDGPU::sub1);
3656
3657 MIB.addReg(RegNo: IdxReg);
3658 } else if (HasVIndex) {
3659 MIB.addReg(RegNo: VIndex);
3660 } else if (HasVOffset) {
3661 MIB.addReg(RegNo: VOffset);
3662 }
3663
3664 MIB.add(MO: MI.getOperand(i: 1)); // rsrc
3665 MIB.add(MO: MI.getOperand(i: 5 + OpOffset)); // soffset
3666 MIB.add(MO: MI.getOperand(i: 6 + OpOffset)); // imm offset
3667 bool IsGFX12Plus = AMDGPU::isGFX12Plus(STI);
3668 unsigned Aux = MI.getOperand(i: 7 + OpOffset).getImm();
3669 MIB.addImm(Val: Aux & (IsGFX12Plus ? AMDGPU::CPol::ALL
3670 : AMDGPU::CPol::ALL_pregfx12)); // cpol
3671 MIB.addImm(
3672 Val: Aux & (IsGFX12Plus ? AMDGPU::CPol::SWZ : AMDGPU::CPol::SWZ_pregfx12)
3673 ? 1
3674 : 0); // swz
3675 MIB.addImm(Val: isAsyncLDSDMA(Intr: IntrinsicID));
3676
3677 MachineMemOperand *LoadMMO = *MI.memoperands_begin();
3678 // Don't set the offset value here because the pointer points to the base of
3679 // the buffer.
3680 MachinePointerInfo LoadPtrI = LoadMMO->getPointerInfo();
3681
3682 MachinePointerInfo StorePtrI = LoadPtrI;
3683 LoadPtrI.V = PoisonValue::get(T: PointerType::get(C&: MF->getFunction().getContext(),
3684 AddressSpace: AMDGPUAS::BUFFER_RESOURCE));
3685 LoadPtrI.AddrSpace = AMDGPUAS::BUFFER_RESOURCE;
3686 StorePtrI.AddrSpace = AMDGPUAS::LOCAL_ADDRESS;
3687
3688 auto F = LoadMMO->getFlags() &
3689 ~(MachineMemOperand::MOStore | MachineMemOperand::MOLoad);
3690 LoadMMO = MF->getMachineMemOperand(PtrInfo: LoadPtrI, F: F | MachineMemOperand::MOLoad,
3691 Size, BaseAlignment: LoadMMO->getBaseAlign());
3692
3693 MachineMemOperand *StoreMMO =
3694 MF->getMachineMemOperand(PtrInfo: StorePtrI, F: F | MachineMemOperand::MOStore,
3695 Size: sizeof(int32_t), BaseAlignment: LoadMMO->getBaseAlign());
3696
3697 MIB.setMemRefs({LoadMMO, StoreMMO});
3698
3699 MI.eraseFromParent();
3700 constrainSelectedInstRegOperands(I&: *MIB, TII, TRI, RBI);
3701 return true;
3702}
3703
3704/// Match a zero extend from a 32-bit value to 64-bits.
3705Register AMDGPUInstructionSelector::matchZeroExtendFromS32(Register Reg) const {
3706 Register ZExtSrc;
3707 if (mi_match(R: Reg, MRI: *MRI, P: m_GZExt(Src: m_Reg(R&: ZExtSrc))))
3708 return MRI->getType(Reg: ZExtSrc) == LLT::scalar(SizeInBits: 32) ? ZExtSrc : Register();
3709
3710 // Match legalized form %zext = G_MERGE_VALUES (s32 %x), (s32 0)
3711 const MachineInstr *Def = getDefIgnoringCopies(Reg, MRI: *MRI);
3712 if (Def->getOpcode() != AMDGPU::G_MERGE_VALUES)
3713 return Register();
3714
3715 assert(Def->getNumOperands() == 3 &&
3716 MRI->getType(Def->getOperand(0).getReg()) == LLT::scalar(64));
3717 if (mi_match(R: Def->getOperand(i: 2).getReg(), MRI: *MRI, P: m_ZeroInt())) {
3718 return Def->getOperand(i: 1).getReg();
3719 }
3720
3721 return Register();
3722}
3723
3724/// Match a sign extend from a 32-bit value to 64-bits.
3725Register AMDGPUInstructionSelector::matchSignExtendFromS32(Register Reg) const {
3726 Register SExtSrc;
3727 if (mi_match(R: Reg, MRI: *MRI, P: m_GSExt(Src: m_Reg(R&: SExtSrc))))
3728 return MRI->getType(Reg: SExtSrc) == LLT::scalar(SizeInBits: 32) ? SExtSrc : Register();
3729
3730 // Match legalized form %sext = G_MERGE_VALUES (s32 %x), G_ASHR((S32 %x, 31))
3731 const MachineInstr *Def = getDefIgnoringCopies(Reg, MRI: *MRI);
3732 if (Def->getOpcode() != AMDGPU::G_MERGE_VALUES)
3733 return Register();
3734
3735 assert(Def->getNumOperands() == 3 &&
3736 MRI->getType(Def->getOperand(0).getReg()) == LLT::scalar(64));
3737 if (mi_match(R: Def->getOperand(i: 2).getReg(), MRI: *MRI,
3738 P: m_GAShr(L: m_SpecificReg(RequestedReg: Def->getOperand(i: 1).getReg()),
3739 R: m_SpecificICst(RequestedValue: 31))))
3740 return Def->getOperand(i: 1).getReg();
3741
3742 if (VT->signBitIsZero(Op: Reg))
3743 return matchZeroExtendFromS32(Reg);
3744
3745 return Register();
3746}
3747
3748/// Match a zero extend from a 32-bit value to 64-bits, or \p Reg itself if it
3749/// is 32-bit.
3750Register
3751AMDGPUInstructionSelector::matchZeroExtendFromS32OrS32(Register Reg) const {
3752 return MRI->getType(Reg) == LLT::scalar(SizeInBits: 32) ? Reg
3753 : matchZeroExtendFromS32(Reg);
3754}
3755
3756/// Match a sign extend from a 32-bit value to 64-bits, or \p Reg itself if it
3757/// is 32-bit.
3758Register
3759AMDGPUInstructionSelector::matchSignExtendFromS32OrS32(Register Reg) const {
3760 return MRI->getType(Reg) == LLT::scalar(SizeInBits: 32) ? Reg
3761 : matchSignExtendFromS32(Reg);
3762}
3763
3764Register
3765AMDGPUInstructionSelector::matchExtendFromS32OrS32(Register Reg,
3766 bool IsSigned) const {
3767 if (IsSigned)
3768 return matchSignExtendFromS32OrS32(Reg);
3769
3770 return matchZeroExtendFromS32OrS32(Reg);
3771}
3772
3773Register AMDGPUInstructionSelector::matchAnyExtendFromS32(Register Reg) const {
3774 Register AnyExtSrc;
3775 if (mi_match(R: Reg, MRI: *MRI, P: m_GAnyExt(Src: m_Reg(R&: AnyExtSrc))))
3776 return MRI->getType(Reg: AnyExtSrc) == LLT::scalar(SizeInBits: 32) ? AnyExtSrc : Register();
3777
3778 // Match legalized form %zext = G_MERGE_VALUES (s32 %x), (s32 G_IMPLICIT_DEF)
3779 const MachineInstr *Def = getDefIgnoringCopies(Reg, MRI: *MRI);
3780 if (Def->getOpcode() != AMDGPU::G_MERGE_VALUES)
3781 return Register();
3782
3783 assert(Def->getNumOperands() == 3 &&
3784 MRI->getType(Def->getOperand(0).getReg()) == LLT::scalar(64));
3785
3786 if (mi_match(R: Def->getOperand(i: 2).getReg(), MRI: *MRI, P: m_GImplicitDef()))
3787 return Def->getOperand(i: 1).getReg();
3788
3789 return Register();
3790}
3791
3792bool AMDGPUInstructionSelector::selectGlobalLoadLds(MachineInstr &MI) const{
3793 if (!Subtarget->hasVMemToLDSLoad())
3794 return false;
3795
3796 unsigned Opc;
3797 unsigned Size = MI.getOperand(i: 3).getImm();
3798 Intrinsic::ID IntrinsicID = cast<GIntrinsic>(Val&: MI).getIntrinsicID();
3799
3800 switch (Size) {
3801 default:
3802 return false;
3803 case 1:
3804 Opc = AMDGPU::GLOBAL_LOAD_LDS_UBYTE;
3805 break;
3806 case 2:
3807 Opc = AMDGPU::GLOBAL_LOAD_LDS_USHORT;
3808 break;
3809 case 4:
3810 Opc = AMDGPU::GLOBAL_LOAD_LDS_DWORD;
3811 break;
3812 case 12:
3813 if (!Subtarget->hasLDSLoadB96_B128())
3814 return false;
3815 Opc = AMDGPU::GLOBAL_LOAD_LDS_DWORDX3;
3816 break;
3817 case 16:
3818 if (!Subtarget->hasLDSLoadB96_B128())
3819 return false;
3820 Opc = AMDGPU::GLOBAL_LOAD_LDS_DWORDX4;
3821 break;
3822 }
3823
3824 MachineBasicBlock *MBB = MI.getParent();
3825 const DebugLoc &DL = MI.getDebugLoc();
3826 BuildMI(BB&: *MBB, I: &MI, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::COPY), DestReg: AMDGPU::M0)
3827 .add(MO: MI.getOperand(i: 2));
3828
3829 Register Addr = MI.getOperand(i: 1).getReg();
3830 Register VOffset;
3831 // Try to split SAddr and VOffset. Global and LDS pointers share the same
3832 // immediate offset, so we cannot use a regular SelectGlobalSAddr().
3833 if (!isSGPR(Reg: Addr)) {
3834 auto AddrDef = getDefSrcRegIgnoringCopies(Reg: Addr, MRI: *MRI);
3835 if (isSGPR(Reg: AddrDef->Reg)) {
3836 Addr = AddrDef->Reg;
3837 } else if (AddrDef->MI->getOpcode() == AMDGPU::G_PTR_ADD) {
3838 Register SAddr =
3839 getSrcRegIgnoringCopies(Reg: AddrDef->MI->getOperand(i: 1).getReg(), MRI: *MRI);
3840 if (isSGPR(Reg: SAddr)) {
3841 Register PtrBaseOffset = AddrDef->MI->getOperand(i: 2).getReg();
3842 if (Register Off = matchZeroExtendFromS32(Reg: PtrBaseOffset)) {
3843 Addr = SAddr;
3844 VOffset = Off;
3845 }
3846 }
3847 }
3848 }
3849
3850 if (isSGPR(Reg: Addr)) {
3851 Opc = AMDGPU::getGlobalSaddrOp(Opcode: Opc);
3852 if (!VOffset) {
3853 VOffset = MRI->createVirtualRegister(RegClass: &AMDGPU::VGPR_32RegClass);
3854 BuildMI(BB&: *MBB, I: &MI, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::V_MOV_B32_e32), DestReg: VOffset)
3855 .addImm(Val: 0);
3856 }
3857 }
3858
3859 auto MIB = BuildMI(BB&: *MBB, I: &MI, MIMD: DL, MCID: TII.get(Opcode: Opc))
3860 .addReg(RegNo: Addr);
3861
3862 if (isSGPR(Reg: Addr))
3863 MIB.addReg(RegNo: VOffset);
3864
3865 MIB.add(MO: MI.getOperand(i: 4)); // offset
3866
3867 unsigned Aux = MI.getOperand(i: 5).getImm();
3868 MIB.addImm(Val: Aux & ~AMDGPU::CPol::VIRTUAL_BITS); // cpol
3869 MIB.addImm(Val: isAsyncLDSDMA(Intr: IntrinsicID));
3870
3871 MachineMemOperand *LoadMMO = *MI.memoperands_begin();
3872 MachinePointerInfo LoadPtrI = LoadMMO->getPointerInfo();
3873 LoadPtrI.Offset = MI.getOperand(i: 4).getImm();
3874 MachinePointerInfo StorePtrI = LoadPtrI;
3875 LoadPtrI.V = PoisonValue::get(T: PointerType::get(C&: MF->getFunction().getContext(),
3876 AddressSpace: AMDGPUAS::GLOBAL_ADDRESS));
3877 LoadPtrI.AddrSpace = AMDGPUAS::GLOBAL_ADDRESS;
3878 StorePtrI.AddrSpace = AMDGPUAS::LOCAL_ADDRESS;
3879 auto F = LoadMMO->getFlags() &
3880 ~(MachineMemOperand::MOStore | MachineMemOperand::MOLoad);
3881 LoadMMO = MF->getMachineMemOperand(PtrInfo: LoadPtrI, F: F | MachineMemOperand::MOLoad,
3882 Size, BaseAlignment: LoadMMO->getBaseAlign());
3883 MachineMemOperand *StoreMMO =
3884 MF->getMachineMemOperand(PtrInfo: StorePtrI, F: F | MachineMemOperand::MOStore,
3885 Size: sizeof(int32_t), BaseAlignment: Align(4));
3886
3887 MIB.setMemRefs({LoadMMO, StoreMMO});
3888
3889 MI.eraseFromParent();
3890 constrainSelectedInstRegOperands(I&: *MIB, TII, TRI, RBI);
3891 return true;
3892}
3893
3894bool AMDGPUInstructionSelector::selectTensorLoadStore(MachineInstr &MI,
3895 Intrinsic::ID IID) const {
3896 bool IsLoad = IID == Intrinsic::amdgcn_tensor_load_to_lds;
3897 unsigned Opc =
3898 IsLoad ? AMDGPU::TENSOR_LOAD_TO_LDS_d4 : AMDGPU::TENSOR_STORE_FROM_LDS_d4;
3899 int NumGroups = 4;
3900
3901 // A lamda function to check whether an operand is a vector of all 0s.
3902 const auto isAllZeros = [&](MachineOperand &Opnd) {
3903 const MachineInstr *DefMI = MRI->getVRegDef(Reg: Opnd.getReg());
3904 if (!DefMI)
3905 return false;
3906 return llvm::isBuildVectorAllZeros(MI: *DefMI, MRI: *MRI, AllowUndef: true);
3907 };
3908
3909 // Use _D2 version if both group 2 and 3 are zero-initialized.
3910 if (isAllZeros(MI.getOperand(i: 3)) && isAllZeros(MI.getOperand(i: 4))) {
3911 NumGroups = 2;
3912 Opc = IsLoad ? AMDGPU::TENSOR_LOAD_TO_LDS_d2
3913 : AMDGPU::TENSOR_STORE_FROM_LDS_d2;
3914 }
3915
3916 // TODO: Handle the fifth group: MI.getOpetand(5), which is silently ignored
3917 // for now because all existing targets only support up to 4 groups.
3918 MachineBasicBlock *MBB = MI.getParent();
3919 auto MIB = BuildMI(BB&: *MBB, I: &MI, MIMD: MI.getDebugLoc(), MCID: TII.get(Opcode: Opc))
3920 .add(MO: MI.getOperand(i: 1)) // D# group 0
3921 .add(MO: MI.getOperand(i: 2)); // D# group 1
3922
3923 if (NumGroups >= 4) { // Has at least 4 groups
3924 MIB.add(MO: MI.getOperand(i: 3)) // D# group 2
3925 .add(MO: MI.getOperand(i: 4)); // D# group 3
3926 }
3927
3928 MIB.addImm(Val: 0) // r128
3929 .add(MO: MI.getOperand(i: 6)); // cpol
3930
3931 MI.eraseFromParent();
3932 return true;
3933}
3934
3935bool AMDGPUInstructionSelector::selectBVHIntersectRayIntrinsic(
3936 MachineInstr &MI) const {
3937 unsigned OpcodeOpIdx =
3938 MI.getOpcode() == AMDGPU::G_AMDGPU_BVH_INTERSECT_RAY ? 1 : 3;
3939 MI.setDesc(TII.get(Opcode: MI.getOperand(i: OpcodeOpIdx).getImm()));
3940 MI.removeOperand(OpNo: OpcodeOpIdx);
3941 MI.addImplicitDefUseOperands(MF&: *MI.getMF());
3942 constrainSelectedInstRegOperands(I&: MI, TII, TRI, RBI);
3943 return true;
3944}
3945
3946// FIXME: This should be removed and let the patterns select. We just need the
3947// AGPR/VGPR combination versions.
3948bool AMDGPUInstructionSelector::selectSMFMACIntrin(MachineInstr &MI) const {
3949 unsigned Opc;
3950 switch (cast<GIntrinsic>(Val&: MI).getIntrinsicID()) {
3951 case Intrinsic::amdgcn_smfmac_f32_16x16x32_f16:
3952 Opc = AMDGPU::V_SMFMAC_F32_16X16X32_F16_e64;
3953 break;
3954 case Intrinsic::amdgcn_smfmac_f32_32x32x16_f16:
3955 Opc = AMDGPU::V_SMFMAC_F32_32X32X16_F16_e64;
3956 break;
3957 case Intrinsic::amdgcn_smfmac_f32_16x16x32_bf16:
3958 Opc = AMDGPU::V_SMFMAC_F32_16X16X32_BF16_e64;
3959 break;
3960 case Intrinsic::amdgcn_smfmac_f32_32x32x16_bf16:
3961 Opc = AMDGPU::V_SMFMAC_F32_32X32X16_BF16_e64;
3962 break;
3963 case Intrinsic::amdgcn_smfmac_i32_16x16x64_i8:
3964 Opc = AMDGPU::V_SMFMAC_I32_16X16X64_I8_e64;
3965 break;
3966 case Intrinsic::amdgcn_smfmac_i32_32x32x32_i8:
3967 Opc = AMDGPU::V_SMFMAC_I32_32X32X32_I8_e64;
3968 break;
3969 case Intrinsic::amdgcn_smfmac_f32_16x16x64_bf8_bf8:
3970 Opc = AMDGPU::V_SMFMAC_F32_16X16X64_BF8_BF8_e64;
3971 break;
3972 case Intrinsic::amdgcn_smfmac_f32_16x16x64_bf8_fp8:
3973 Opc = AMDGPU::V_SMFMAC_F32_16X16X64_BF8_FP8_e64;
3974 break;
3975 case Intrinsic::amdgcn_smfmac_f32_16x16x64_fp8_bf8:
3976 Opc = AMDGPU::V_SMFMAC_F32_16X16X64_FP8_BF8_e64;
3977 break;
3978 case Intrinsic::amdgcn_smfmac_f32_16x16x64_fp8_fp8:
3979 Opc = AMDGPU::V_SMFMAC_F32_16X16X64_FP8_FP8_e64;
3980 break;
3981 case Intrinsic::amdgcn_smfmac_f32_32x32x32_bf8_bf8:
3982 Opc = AMDGPU::V_SMFMAC_F32_32X32X32_BF8_BF8_e64;
3983 break;
3984 case Intrinsic::amdgcn_smfmac_f32_32x32x32_bf8_fp8:
3985 Opc = AMDGPU::V_SMFMAC_F32_32X32X32_BF8_FP8_e64;
3986 break;
3987 case Intrinsic::amdgcn_smfmac_f32_32x32x32_fp8_bf8:
3988 Opc = AMDGPU::V_SMFMAC_F32_32X32X32_FP8_BF8_e64;
3989 break;
3990 case Intrinsic::amdgcn_smfmac_f32_32x32x32_fp8_fp8:
3991 Opc = AMDGPU::V_SMFMAC_F32_32X32X32_FP8_FP8_e64;
3992 break;
3993 case Intrinsic::amdgcn_smfmac_f32_16x16x64_f16:
3994 Opc = AMDGPU::V_SMFMAC_F32_16X16X64_F16_e64;
3995 break;
3996 case Intrinsic::amdgcn_smfmac_f32_32x32x32_f16:
3997 Opc = AMDGPU::V_SMFMAC_F32_32X32X32_F16_e64;
3998 break;
3999 case Intrinsic::amdgcn_smfmac_f32_16x16x64_bf16:
4000 Opc = AMDGPU::V_SMFMAC_F32_16X16X64_BF16_e64;
4001 break;
4002 case Intrinsic::amdgcn_smfmac_f32_32x32x32_bf16:
4003 Opc = AMDGPU::V_SMFMAC_F32_32X32X32_BF16_e64;
4004 break;
4005 case Intrinsic::amdgcn_smfmac_i32_16x16x128_i8:
4006 Opc = AMDGPU::V_SMFMAC_I32_16X16X128_I8_e64;
4007 break;
4008 case Intrinsic::amdgcn_smfmac_i32_32x32x64_i8:
4009 Opc = AMDGPU::V_SMFMAC_I32_32X32X64_I8_e64;
4010 break;
4011 case Intrinsic::amdgcn_smfmac_f32_16x16x128_bf8_bf8:
4012 Opc = AMDGPU::V_SMFMAC_F32_16X16X128_BF8_BF8_e64;
4013 break;
4014 case Intrinsic::amdgcn_smfmac_f32_16x16x128_bf8_fp8:
4015 Opc = AMDGPU::V_SMFMAC_F32_16X16X128_BF8_FP8_e64;
4016 break;
4017 case Intrinsic::amdgcn_smfmac_f32_16x16x128_fp8_bf8:
4018 Opc = AMDGPU::V_SMFMAC_F32_16X16X128_FP8_BF8_e64;
4019 break;
4020 case Intrinsic::amdgcn_smfmac_f32_16x16x128_fp8_fp8:
4021 Opc = AMDGPU::V_SMFMAC_F32_16X16X128_FP8_FP8_e64;
4022 break;
4023 case Intrinsic::amdgcn_smfmac_f32_32x32x64_bf8_bf8:
4024 Opc = AMDGPU::V_SMFMAC_F32_32X32X64_BF8_BF8_e64;
4025 break;
4026 case Intrinsic::amdgcn_smfmac_f32_32x32x64_bf8_fp8:
4027 Opc = AMDGPU::V_SMFMAC_F32_32X32X64_BF8_FP8_e64;
4028 break;
4029 case Intrinsic::amdgcn_smfmac_f32_32x32x64_fp8_bf8:
4030 Opc = AMDGPU::V_SMFMAC_F32_32X32X64_FP8_BF8_e64;
4031 break;
4032 case Intrinsic::amdgcn_smfmac_f32_32x32x64_fp8_fp8:
4033 Opc = AMDGPU::V_SMFMAC_F32_32X32X64_FP8_FP8_e64;
4034 break;
4035 default:
4036 llvm_unreachable("unhandled smfmac intrinsic");
4037 }
4038
4039 auto VDst_In = MI.getOperand(i: 4);
4040
4041 MI.setDesc(TII.get(Opcode: Opc));
4042 MI.removeOperand(OpNo: 4); // VDst_In
4043 MI.removeOperand(OpNo: 1); // Intrinsic ID
4044 MI.addOperand(Op: VDst_In); // Readd VDst_In to the end
4045 MI.addImplicitDefUseOperands(MF&: *MI.getMF());
4046 const MCInstrDesc &MCID = MI.getDesc();
4047 if (MCID.getOperandConstraint(OpNum: 0, Constraint: MCOI::EARLY_CLOBBER) != -1) {
4048 MI.getOperand(i: 0).setIsEarlyClobber(true);
4049 }
4050 return true;
4051}
4052
4053bool AMDGPUInstructionSelector::selectPermlaneSwapIntrin(
4054 MachineInstr &MI, Intrinsic::ID IntrID) const {
4055 if (IntrID == Intrinsic::amdgcn_permlane16_swap &&
4056 !Subtarget->hasPermlane16Swap())
4057 return false;
4058 if (IntrID == Intrinsic::amdgcn_permlane32_swap &&
4059 !Subtarget->hasPermlane32Swap())
4060 return false;
4061
4062 unsigned Opcode = IntrID == Intrinsic::amdgcn_permlane16_swap
4063 ? AMDGPU::V_PERMLANE16_SWAP_B32_e64
4064 : AMDGPU::V_PERMLANE32_SWAP_B32_e64;
4065
4066 MI.removeOperand(OpNo: 2);
4067 MI.setDesc(TII.get(Opcode));
4068 MI.addOperand(MF&: *MF, Op: MachineOperand::CreateReg(Reg: AMDGPU::EXEC, isDef: false, isImp: true));
4069
4070 MachineOperand &FI = MI.getOperand(i: 4);
4071 FI.setImm(FI.getImm() ? AMDGPU::DPP::DPP_FI_1 : AMDGPU::DPP::DPP_FI_0);
4072
4073 constrainSelectedInstRegOperands(I&: MI, TII, TRI, RBI);
4074 return true;
4075}
4076
4077bool AMDGPUInstructionSelector::selectWaveAddress(MachineInstr &MI) const {
4078 Register DstReg = MI.getOperand(i: 0).getReg();
4079 Register SrcReg = MI.getOperand(i: 1).getReg();
4080 const RegisterBank *DstRB = RBI.getRegBank(Reg: DstReg, MRI: *MRI, TRI);
4081 const bool IsVALU = DstRB->getID() == AMDGPU::VGPRRegBankID;
4082 MachineBasicBlock *MBB = MI.getParent();
4083 const DebugLoc &DL = MI.getDebugLoc();
4084
4085 if (IsVALU) {
4086 BuildMI(BB&: *MBB, I&: MI, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::V_LSHRREV_B32_e64), DestReg: DstReg)
4087 .addImm(Val: Subtarget->getWavefrontSizeLog2())
4088 .addReg(RegNo: SrcReg);
4089 } else {
4090 BuildMI(BB&: *MBB, I&: MI, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::S_LSHR_B32), DestReg: DstReg)
4091 .addReg(RegNo: SrcReg)
4092 .addImm(Val: Subtarget->getWavefrontSizeLog2())
4093 .setOperandDead(3); // Dead scc
4094 }
4095
4096 const TargetRegisterClass &RC =
4097 IsVALU ? AMDGPU::VGPR_32RegClass : AMDGPU::SReg_32RegClass;
4098 if (!RBI.constrainGenericRegister(Reg: DstReg, RC, MRI&: *MRI))
4099 return false;
4100
4101 MI.eraseFromParent();
4102 return true;
4103}
4104
4105bool AMDGPUInstructionSelector::selectWaveShuffleIntrin(
4106 MachineInstr &MI) const {
4107 assert(MI.getNumOperands() == 4);
4108 MachineBasicBlock *MBB = MI.getParent();
4109 const DebugLoc &DL = MI.getDebugLoc();
4110
4111 Register DstReg = MI.getOperand(i: 0).getReg();
4112 Register ValReg = MI.getOperand(i: 2).getReg();
4113 Register IdxReg = MI.getOperand(i: 3).getReg();
4114
4115 const LLT DstTy = MRI->getType(Reg: DstReg);
4116 unsigned DstSize = DstTy.getSizeInBits();
4117 const RegisterBank *DstRB = RBI.getRegBank(Reg: DstReg, MRI: *MRI, TRI);
4118 const TargetRegisterClass *DstRC =
4119 TRI.getRegClassForSizeOnBank(Size: DstSize, Bank: *DstRB);
4120
4121 if (DstTy != LLT::scalar(SizeInBits: 32))
4122 return false;
4123
4124 if (!Subtarget->supportsBPermute())
4125 return false;
4126
4127 // If we can bpermute across the whole wave, then just do that
4128 if (Subtarget->supportsWaveWideBPermute()) {
4129 Register ShiftIdxReg = MRI->createVirtualRegister(RegClass: DstRC);
4130 BuildMI(BB&: *MBB, I&: MI, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::V_LSHLREV_B32_e64), DestReg: ShiftIdxReg)
4131 .addImm(Val: 2)
4132 .addReg(RegNo: IdxReg);
4133
4134 BuildMI(BB&: *MBB, I&: MI, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::DS_BPERMUTE_B32), DestReg: DstReg)
4135 .addReg(RegNo: ShiftIdxReg)
4136 .addReg(RegNo: ValReg)
4137 .addImm(Val: 0);
4138 } else {
4139 // Otherwise, we need to make use of whole wave mode
4140 assert(Subtarget->isWave64());
4141
4142 // Set inactive lanes to poison
4143 Register UndefValReg =
4144 MRI->createVirtualRegister(RegClass: TRI.getRegClass(i: AMDGPU::SReg_32RegClassID));
4145 BuildMI(BB&: *MBB, I&: MI, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::IMPLICIT_DEF), DestReg: UndefValReg);
4146
4147 Register UndefExecReg = MRI->createVirtualRegister(
4148 RegClass: TRI.getRegClass(i: AMDGPU::SReg_64_XEXECRegClassID));
4149 BuildMI(BB&: *MBB, I&: MI, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::IMPLICIT_DEF), DestReg: UndefExecReg);
4150
4151 Register PoisonValReg = MRI->createVirtualRegister(RegClass: DstRC);
4152 BuildMI(BB&: *MBB, I&: MI, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::V_SET_INACTIVE_B32), DestReg: PoisonValReg)
4153 .addImm(Val: 0)
4154 .addReg(RegNo: ValReg)
4155 .addImm(Val: 0)
4156 .addReg(RegNo: UndefValReg)
4157 .addReg(RegNo: UndefExecReg);
4158
4159 // ds_bpermute requires index to be multiplied by 4
4160 Register ShiftIdxReg = MRI->createVirtualRegister(RegClass: DstRC);
4161 BuildMI(BB&: *MBB, I&: MI, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::V_LSHLREV_B32_e64), DestReg: ShiftIdxReg)
4162 .addImm(Val: 2)
4163 .addReg(RegNo: IdxReg);
4164
4165 Register PoisonIdxReg = MRI->createVirtualRegister(RegClass: DstRC);
4166 BuildMI(BB&: *MBB, I&: MI, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::V_SET_INACTIVE_B32), DestReg: PoisonIdxReg)
4167 .addImm(Val: 0)
4168 .addReg(RegNo: ShiftIdxReg)
4169 .addImm(Val: 0)
4170 .addReg(RegNo: UndefValReg)
4171 .addReg(RegNo: UndefExecReg);
4172
4173 // Get permutation of each half, then we'll select which one to use
4174 Register SameSidePermReg = MRI->createVirtualRegister(RegClass: DstRC);
4175 BuildMI(BB&: *MBB, I&: MI, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::DS_BPERMUTE_B32), DestReg: SameSidePermReg)
4176 .addReg(RegNo: PoisonIdxReg)
4177 .addReg(RegNo: PoisonValReg)
4178 .addImm(Val: 0);
4179
4180 Register SwappedValReg = MRI->createVirtualRegister(RegClass: DstRC);
4181 BuildMI(BB&: *MBB, I&: MI, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::V_PERMLANE64_B32), DestReg: SwappedValReg)
4182 .addReg(RegNo: PoisonValReg);
4183
4184 Register OppSidePermReg = MRI->createVirtualRegister(RegClass: DstRC);
4185 BuildMI(BB&: *MBB, I&: MI, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::DS_BPERMUTE_B32), DestReg: OppSidePermReg)
4186 .addReg(RegNo: PoisonIdxReg)
4187 .addReg(RegNo: SwappedValReg)
4188 .addImm(Val: 0);
4189
4190 Register WWMSwapPermReg = MRI->createVirtualRegister(RegClass: DstRC);
4191 BuildMI(BB&: *MBB, I&: MI, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::STRICT_WWM), DestReg: WWMSwapPermReg)
4192 .addReg(RegNo: OppSidePermReg);
4193
4194 // Select which side to take the permute from
4195 // We can get away with only using mbcnt_lo here since we're only
4196 // trying to detect which side of 32 each lane is on, and mbcnt_lo
4197 // returns 32 for lanes 32-63.
4198 Register ThreadIDReg = MRI->createVirtualRegister(RegClass: DstRC);
4199 BuildMI(BB&: *MBB, I&: MI, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::V_MBCNT_LO_U32_B32_e64), DestReg: ThreadIDReg)
4200 .addImm(Val: -1)
4201 .addImm(Val: 0);
4202
4203 Register XORReg = MRI->createVirtualRegister(RegClass: DstRC);
4204 BuildMI(BB&: *MBB, I&: MI, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::V_XOR_B32_e64), DestReg: XORReg)
4205 .addReg(RegNo: ThreadIDReg)
4206 .addReg(RegNo: PoisonIdxReg);
4207
4208 Register ANDReg = MRI->createVirtualRegister(RegClass: DstRC);
4209 BuildMI(BB&: *MBB, I&: MI, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::V_AND_B32_e64), DestReg: ANDReg)
4210 .addReg(RegNo: XORReg)
4211 .addImm(Val: 32);
4212
4213 Register CompareReg = MRI->createVirtualRegister(
4214 RegClass: TRI.getRegClass(i: AMDGPU::SReg_64_XEXECRegClassID));
4215 BuildMI(BB&: *MBB, I&: MI, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::V_CMP_EQ_U32_e64), DestReg: CompareReg)
4216 .addReg(RegNo: ANDReg)
4217 .addImm(Val: 0);
4218
4219 // Finally do the selection
4220 BuildMI(BB&: *MBB, I&: MI, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::V_CNDMASK_B32_e64), DestReg: DstReg)
4221 .addImm(Val: 0)
4222 .addReg(RegNo: WWMSwapPermReg)
4223 .addImm(Val: 0)
4224 .addReg(RegNo: SameSidePermReg)
4225 .addReg(RegNo: CompareReg);
4226 }
4227
4228 MI.eraseFromParent();
4229 return true;
4230}
4231
4232// Match BITOP3 operation and return a number of matched instructions plus
4233// truth table.
4234static std::pair<unsigned, uint8_t> BitOp3_Op(Register R,
4235 SmallVectorImpl<Register> &Src,
4236 const MachineRegisterInfo &MRI) {
4237 unsigned NumOpcodes = 0;
4238 uint8_t LHSBits, RHSBits;
4239
4240 auto getOperandBits = [&Src, R, &MRI](Register Op, uint8_t &Bits) -> bool {
4241 // Define truth table given Src0, Src1, Src2 bits permutations:
4242 // 0 0 0
4243 // 0 0 1
4244 // 0 1 0
4245 // 0 1 1
4246 // 1 0 0
4247 // 1 0 1
4248 // 1 1 0
4249 // 1 1 1
4250 const uint8_t SrcBits[3] = { 0xf0, 0xcc, 0xaa };
4251
4252 if (mi_match(R: Op, MRI, P: m_AllOnesInt())) {
4253 Bits = 0xff;
4254 return true;
4255 }
4256 if (mi_match(R: Op, MRI, P: m_ZeroInt())) {
4257 Bits = 0;
4258 return true;
4259 }
4260
4261 for (unsigned I = 0; I < Src.size(); ++I) {
4262 // Try to find existing reused operand
4263 if (Src[I] == Op) {
4264 Bits = SrcBits[I];
4265 return true;
4266 }
4267 // Try to replace parent operator
4268 if (Src[I] == R) {
4269 Bits = SrcBits[I];
4270 Src[I] = Op;
4271 return true;
4272 }
4273 }
4274
4275 if (Src.size() == 3) {
4276 // No room left for operands. Try one last time, there can be a 'not' of
4277 // one of our source operands. In this case we can compute the bits
4278 // without growing Src vector.
4279 Register LHS;
4280 if (mi_match(R: Op, MRI, P: m_Not(Src: m_Reg(R&: LHS)))) {
4281 LHS = getSrcRegIgnoringCopies(Reg: LHS, MRI);
4282 for (unsigned I = 0; I < Src.size(); ++I) {
4283 if (Src[I] == LHS) {
4284 Bits = ~SrcBits[I];
4285 return true;
4286 }
4287 }
4288 }
4289
4290 return false;
4291 }
4292
4293 Bits = SrcBits[Src.size()];
4294 Src.push_back(Elt: Op);
4295 return true;
4296 };
4297
4298 MachineInstr *MI = MRI.getVRegDef(Reg: R);
4299 switch (MI->getOpcode()) {
4300 case TargetOpcode::G_AND:
4301 case TargetOpcode::G_OR:
4302 case TargetOpcode::G_XOR: {
4303 Register LHS = getSrcRegIgnoringCopies(Reg: MI->getOperand(i: 1).getReg(), MRI);
4304 Register RHS = getSrcRegIgnoringCopies(Reg: MI->getOperand(i: 2).getReg(), MRI);
4305
4306 SmallVector<Register, 3> Backup(Src.begin(), Src.end());
4307 if (!getOperandBits(LHS, LHSBits) ||
4308 !getOperandBits(RHS, RHSBits)) {
4309 Src = std::move(Backup);
4310 return std::make_pair(x: 0, y: 0);
4311 }
4312
4313 // Recursion is naturally limited by the size of the operand vector.
4314 //
4315 // When LHS and RHS share a common sub-expression, one side's recursion
4316 // may decompose that sub-expression and replace the Src slot the other
4317 // side occupies with sub-operands via the "replace parent" path in
4318 // getOperandBits. The other side's cached bit-pattern then refers to a
4319 // slot whose contents changed, producing a wrong truth table.
4320 //
4321 // We detect this in three ways:
4322 // (A) If LHS recursed, its truth table is valid against the Src state
4323 // when LHS recursion completed (SrcAfterLHS). If RHS recursion
4324 // then mutates a Src slot that LHSBits depends on, LHSBits is
4325 // stale.
4326 // (B) If RHS did not recurse, RHSBits came from getOperandBits and
4327 // refers to a specific Src slot. If that slot's contents changed
4328 // (by either recursion), RHSBits is stale.
4329 // (C) Symmetrically for LHS if it did not recurse.
4330 SmallVector<Register, 3> SrcBeforeRecurse(Src.begin(), Src.end());
4331 uint8_t LHSBitsOrig = LHSBits;
4332 uint8_t RHSBitsOrig = RHSBits;
4333
4334 auto LHSOp = BitOp3_Op(R: LHS, Src, MRI);
4335 if (LHSOp.first) {
4336 NumOpcodes += LHSOp.first;
4337 LHSBits = LHSOp.second;
4338 }
4339
4340 SmallVector<Register, 3> SrcAfterLHS(Src.begin(), Src.end());
4341
4342 auto RHSOp = BitOp3_Op(R: RHS, Src, MRI);
4343 if (RHSOp.first) {
4344 NumOpcodes += RHSOp.first;
4345 RHSBits = RHSOp.second;
4346 }
4347
4348 // dependsOnSlot: true iff the truth table TT varies with slot Slot.
4349 auto dependsOnSlot = [](uint8_t TT, int Slot) -> bool {
4350 if (Slot < 0 || Slot > 2)
4351 return false;
4352 const uint8_t Masks[3] = {0x0f, 0x33, 0x55};
4353 const int Shifts[3] = {4, 2, 1};
4354 return ((TT ^ (TT >> Shifts[Slot])) & Masks[Slot]) != 0;
4355 };
4356
4357 // findSlot: locate the Src slot a getOperandBits result depends on,
4358 // including negated (NOT) patterns that getOperandBits resolves via
4359 // the ~SrcBits[I] shortcut.
4360 const uint8_t SrcBitsConst[3] = {0xf0, 0xcc, 0xaa};
4361 auto findSlot = [&](uint8_t Bits, Register Op,
4362 const SmallVectorImpl<Register> &S) -> int {
4363 Register NegatedInner;
4364 bool IsNegationOp = mi_match(R: Op, MRI, P: m_Not(Src: m_Reg(R&: NegatedInner)));
4365 if (IsNegationOp)
4366 NegatedInner = getSrcRegIgnoringCopies(Reg: NegatedInner, MRI);
4367 for (int I = 0; I < (int)S.size(); I++) {
4368 if (Bits == SrcBitsConst[I] && S[I] == Op)
4369 return I;
4370 if (IsNegationOp && Bits == (uint8_t)~SrcBitsConst[I] &&
4371 S[I] == NegatedInner)
4372 return I;
4373 }
4374 return -1;
4375 };
4376
4377 bool Stale = false;
4378
4379 // (A) LHS recursed: its truth table is against SrcAfterLHS.
4380 // Check if RHS recursion mutated a slot that LHSBits uses.
4381 if (LHSOp.first) {
4382 for (int I = 0; I < (int)SrcAfterLHS.size() && I < 3; I++) {
4383 if (I < (int)Src.size() && Src[I] != SrcAfterLHS[I] &&
4384 dependsOnSlot(LHSBits, I)) {
4385 Stale = true;
4386 break;
4387 }
4388 }
4389 }
4390
4391 // (B) RHS did not recurse: RHSBits from getOperandBits is against
4392 // SrcBeforeRecurse. Check if that slot was mutated since then.
4393 if (!Stale && !RHSOp.first) {
4394 int Slot = findSlot(RHSBitsOrig, RHS, SrcBeforeRecurse);
4395 if (Slot >= 0 &&
4396 (Slot >= (int)Src.size() || Src[Slot] != SrcBeforeRecurse[Slot]))
4397 Stale = true;
4398 }
4399
4400 // (C) LHS did not recurse: LHSBits from getOperandBits is against
4401 // SrcBeforeRecurse. Check if that slot was mutated since then.
4402 if (!Stale && !LHSOp.first) {
4403 int Slot = findSlot(LHSBitsOrig, LHS, SrcBeforeRecurse);
4404 if (Slot >= 0 &&
4405 (Slot >= (int)Src.size() || Src[Slot] != SrcBeforeRecurse[Slot]))
4406 Stale = true;
4407 }
4408
4409 if (Stale) {
4410 Src = std::move(SrcBeforeRecurse);
4411 LHSBits = LHSBitsOrig;
4412 RHSBits = RHSBitsOrig;
4413 NumOpcodes = 0;
4414 }
4415 break;
4416 }
4417 default:
4418 return std::make_pair(x: 0, y: 0);
4419 }
4420
4421 uint8_t TTbl;
4422 switch (MI->getOpcode()) {
4423 case TargetOpcode::G_AND:
4424 TTbl = LHSBits & RHSBits;
4425 break;
4426 case TargetOpcode::G_OR:
4427 TTbl = LHSBits | RHSBits;
4428 break;
4429 case TargetOpcode::G_XOR:
4430 TTbl = LHSBits ^ RHSBits;
4431 break;
4432 default:
4433 break;
4434 }
4435
4436 return std::make_pair(x: NumOpcodes + 1, y&: TTbl);
4437}
4438
4439bool AMDGPUInstructionSelector::selectBITOP3(MachineInstr &MI) const {
4440 if (!Subtarget->hasBitOp3Insts())
4441 return false;
4442
4443 Register DstReg = MI.getOperand(i: 0).getReg();
4444 const RegisterBank *DstRB = RBI.getRegBank(Reg: DstReg, MRI: *MRI, TRI);
4445 const bool IsVALU = DstRB->getID() == AMDGPU::VGPRRegBankID;
4446 if (!IsVALU)
4447 return false;
4448
4449 SmallVector<Register, 3> Src;
4450 uint8_t TTbl;
4451 unsigned NumOpcodes;
4452
4453 std::tie(args&: NumOpcodes, args&: TTbl) = BitOp3_Op(R: DstReg, Src, MRI: *MRI);
4454
4455 // Src.empty() case can happen if all operands are all zero or all ones.
4456 // Normally it shall be optimized out before reaching this.
4457 if (NumOpcodes < 2 || Src.empty())
4458 return false;
4459
4460 const bool IsB32 = MRI->getType(Reg: DstReg) == LLT::scalar(SizeInBits: 32);
4461 if (NumOpcodes == 2 && IsB32) {
4462 // Avoid using BITOP3 for OR3, XOR3, AND_OR. This is not faster but makes
4463 // asm more readable. This cannot be modeled with AddedComplexity because
4464 // selector does not know how many operations did we match.
4465 if (mi_match(MI, MRI: *MRI, P: m_GXor(L: m_GXor(L: m_Reg(), R: m_Reg()), R: m_Reg())) ||
4466 mi_match(MI, MRI: *MRI, P: m_GOr(L: m_GOr(L: m_Reg(), R: m_Reg()), R: m_Reg())) ||
4467 mi_match(MI, MRI: *MRI, P: m_GOr(L: m_GAnd(L: m_Reg(), R: m_Reg()), R: m_Reg())))
4468 return false;
4469 } else if (NumOpcodes < 4) {
4470 // For a uniform case threshold should be higher to account for moves
4471 // between VGPRs and SGPRs. It needs one operand in a VGPR, rest two can be
4472 // in SGPRs and a readtfirstlane after.
4473 return false;
4474 }
4475
4476 unsigned Opc = IsB32 ? AMDGPU::V_BITOP3_B32_e64 : AMDGPU::V_BITOP3_B16_e64;
4477 if (!IsB32 && STI.hasTrue16BitInsts())
4478 Opc = STI.useRealTrue16Insts() ? AMDGPU::V_BITOP3_B16_gfx1250_t16_e64
4479 : AMDGPU::V_BITOP3_B16_gfx1250_fake16_e64;
4480 unsigned CBL = STI.getConstantBusLimit(Opcode: Opc);
4481 MachineBasicBlock *MBB = MI.getParent();
4482 const DebugLoc &DL = MI.getDebugLoc();
4483
4484 for (unsigned I = 0; I < Src.size(); ++I) {
4485 const RegisterBank *RB = RBI.getRegBank(Reg: Src[I], MRI: *MRI, TRI);
4486 if (RB->getID() != AMDGPU::SGPRRegBankID)
4487 continue;
4488 if (CBL > 0) {
4489 --CBL;
4490 continue;
4491 }
4492 Register NewReg = MRI->createVirtualRegister(RegClass: &AMDGPU::VGPR_32RegClass);
4493 BuildMI(BB&: *MBB, I&: MI, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::COPY), DestReg: NewReg)
4494 .addReg(RegNo: Src[I]);
4495 Src[I] = NewReg;
4496 }
4497
4498 // Last operand can be ignored, turning a ternary operation into a binary.
4499 // For example: (~a & b & c) | (~a & b & ~c) -> (~a & b). We can replace
4500 // 'c' with 'a' here without changing the answer. In some pathological
4501 // cases it should be possible to get an operation with a single operand
4502 // too if optimizer would not catch it.
4503 while (Src.size() < 3)
4504 Src.push_back(Elt: Src[0]);
4505
4506 auto MIB = BuildMI(BB&: *MBB, I&: MI, MIMD: DL, MCID: TII.get(Opcode: Opc), DestReg: DstReg);
4507 if (!IsB32)
4508 MIB.addImm(Val: 0); // src_mod0
4509 MIB.addReg(RegNo: Src[0]);
4510 if (!IsB32)
4511 MIB.addImm(Val: 0); // src_mod1
4512 MIB.addReg(RegNo: Src[1]);
4513 if (!IsB32)
4514 MIB.addImm(Val: 0); // src_mod2
4515 MIB.addReg(RegNo: Src[2])
4516 .addImm(Val: TTbl);
4517 if (!IsB32)
4518 MIB.addImm(Val: 0); // op_sel
4519
4520 constrainSelectedInstRegOperands(I&: *MIB, TII, TRI, RBI);
4521 MI.eraseFromParent();
4522
4523 return true;
4524}
4525
4526bool AMDGPUInstructionSelector::selectStackRestore(MachineInstr &MI) const {
4527 Register SrcReg = MI.getOperand(i: 0).getReg();
4528 if (!RBI.constrainGenericRegister(Reg: SrcReg, RC: AMDGPU::SReg_32RegClass, MRI&: *MRI))
4529 return false;
4530
4531 MachineInstr *DefMI = MRI->getVRegDef(Reg: SrcReg);
4532 Register SP =
4533 Subtarget->getTargetLowering()->getStackPointerRegisterToSaveRestore();
4534 Register WaveAddr = getWaveAddress(Def: DefMI);
4535 MachineBasicBlock *MBB = MI.getParent();
4536 const DebugLoc &DL = MI.getDebugLoc();
4537
4538 if (!WaveAddr) {
4539 WaveAddr = MRI->createVirtualRegister(RegClass: &AMDGPU::SReg_32RegClass);
4540 BuildMI(BB&: *MBB, I&: MI, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::S_LSHR_B32), DestReg: WaveAddr)
4541 .addReg(RegNo: SrcReg)
4542 .addImm(Val: Subtarget->getWavefrontSizeLog2())
4543 .setOperandDead(3); // Dead scc
4544 }
4545
4546 BuildMI(BB&: *MBB, I: &MI, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::COPY), DestReg: SP)
4547 .addReg(RegNo: WaveAddr);
4548
4549 MI.eraseFromParent();
4550 return true;
4551}
4552
4553bool AMDGPUInstructionSelector::select(MachineInstr &I) {
4554
4555 if (!I.isPreISelOpcode()) {
4556 if (I.isCopy())
4557 return selectCOPY(I);
4558 return true;
4559 }
4560
4561 switch (I.getOpcode()) {
4562 case TargetOpcode::G_AND:
4563 case TargetOpcode::G_OR:
4564 case TargetOpcode::G_XOR:
4565 if (selectBITOP3(MI&: I))
4566 return true;
4567 if (selectImpl(I, CoverageInfo&: *CoverageInfo))
4568 return true;
4569 return selectG_AND_OR_XOR(I);
4570 case TargetOpcode::G_ADD:
4571 case TargetOpcode::G_SUB:
4572 case TargetOpcode::G_PTR_ADD:
4573 if (selectImpl(I, CoverageInfo&: *CoverageInfo))
4574 return true;
4575 return selectG_ADD_SUB(I);
4576 case TargetOpcode::G_UADDO:
4577 case TargetOpcode::G_USUBO:
4578 case TargetOpcode::G_UADDE:
4579 case TargetOpcode::G_USUBE:
4580 return selectG_UADDO_USUBO_UADDE_USUBE(I);
4581 case AMDGPU::G_AMDGPU_MAD_U64_U32:
4582 case AMDGPU::G_AMDGPU_MAD_I64_I32:
4583 return selectG_AMDGPU_MAD_64_32(I);
4584 case TargetOpcode::G_INTTOPTR:
4585 case TargetOpcode::G_BITCAST:
4586 case TargetOpcode::G_PTRTOINT:
4587 case TargetOpcode::G_FREEZE:
4588 return selectCOPY(I);
4589 case TargetOpcode::G_FNEG:
4590 if (selectImpl(I, CoverageInfo&: *CoverageInfo))
4591 return true;
4592 return selectG_FNEG(MI&: I);
4593 case TargetOpcode::G_FABS:
4594 if (selectImpl(I, CoverageInfo&: *CoverageInfo))
4595 return true;
4596 return selectG_FABS(MI&: I);
4597 case TargetOpcode::G_EXTRACT:
4598 return selectG_EXTRACT(I);
4599 case TargetOpcode::G_MERGE_VALUES:
4600 case TargetOpcode::G_CONCAT_VECTORS:
4601 return selectG_MERGE_VALUES(MI&: I);
4602 case TargetOpcode::G_UNMERGE_VALUES:
4603 return selectG_UNMERGE_VALUES(MI&: I);
4604 case TargetOpcode::G_BUILD_VECTOR:
4605 case TargetOpcode::G_BUILD_VECTOR_TRUNC:
4606 return selectG_BUILD_VECTOR(MI&: I);
4607 case TargetOpcode::G_IMPLICIT_DEF:
4608 return selectG_IMPLICIT_DEF(I);
4609 case TargetOpcode::G_INSERT:
4610 return selectG_INSERT(I);
4611 case TargetOpcode::G_INTRINSIC:
4612 case TargetOpcode::G_INTRINSIC_CONVERGENT:
4613 return selectG_INTRINSIC(I);
4614 case TargetOpcode::G_INTRINSIC_W_SIDE_EFFECTS:
4615 case TargetOpcode::G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS:
4616 return selectG_INTRINSIC_W_SIDE_EFFECTS(I);
4617 case TargetOpcode::G_ICMP:
4618 case TargetOpcode::G_FCMP:
4619 if (selectG_ICMP_or_FCMP(I))
4620 return true;
4621 return selectImpl(I, CoverageInfo&: *CoverageInfo);
4622 case TargetOpcode::G_LOAD:
4623 case TargetOpcode::G_ZEXTLOAD:
4624 case TargetOpcode::G_SEXTLOAD:
4625 case TargetOpcode::G_STORE:
4626 case TargetOpcode::G_ATOMIC_CMPXCHG:
4627 case TargetOpcode::G_ATOMICRMW_XCHG:
4628 case TargetOpcode::G_ATOMICRMW_ADD:
4629 case TargetOpcode::G_ATOMICRMW_SUB:
4630 case TargetOpcode::G_ATOMICRMW_AND:
4631 case TargetOpcode::G_ATOMICRMW_OR:
4632 case TargetOpcode::G_ATOMICRMW_XOR:
4633 case TargetOpcode::G_ATOMICRMW_MIN:
4634 case TargetOpcode::G_ATOMICRMW_MAX:
4635 case TargetOpcode::G_ATOMICRMW_UMIN:
4636 case TargetOpcode::G_ATOMICRMW_UMAX:
4637 case TargetOpcode::G_ATOMICRMW_UINC_WRAP:
4638 case TargetOpcode::G_ATOMICRMW_UDEC_WRAP:
4639 case TargetOpcode::G_ATOMICRMW_USUB_COND:
4640 case TargetOpcode::G_ATOMICRMW_USUB_SAT:
4641 case TargetOpcode::G_ATOMICRMW_FADD:
4642 case TargetOpcode::G_ATOMICRMW_FMIN:
4643 case TargetOpcode::G_ATOMICRMW_FMAX:
4644 return selectG_LOAD_STORE_ATOMICRMW(I);
4645 case TargetOpcode::G_SELECT:
4646 return selectG_SELECT(I);
4647 case TargetOpcode::G_TRUNC:
4648 return selectG_TRUNC(I);
4649 case TargetOpcode::G_SEXT:
4650 case TargetOpcode::G_ZEXT:
4651 case TargetOpcode::G_ANYEXT:
4652 case TargetOpcode::G_SEXT_INREG:
4653 // This is a workaround. For extension from type i1, `selectImpl()` uses
4654 // patterns from TD file and generates an illegal VGPR to SGPR COPY as type
4655 // i1 can only be hold in a SGPR class.
4656 if (MRI->getType(Reg: I.getOperand(i: 1).getReg()) != LLT::scalar(SizeInBits: 1) &&
4657 selectImpl(I, CoverageInfo&: *CoverageInfo))
4658 return true;
4659 return selectG_SZA_EXT(I);
4660 case TargetOpcode::G_FPEXT:
4661 if (selectG_FPEXT(I))
4662 return true;
4663 return selectImpl(I, CoverageInfo&: *CoverageInfo);
4664 case TargetOpcode::G_BRCOND:
4665 return selectG_BRCOND(I);
4666 case TargetOpcode::G_GLOBAL_VALUE:
4667 return selectG_GLOBAL_VALUE(I);
4668 case TargetOpcode::G_PTRMASK:
4669 return selectG_PTRMASK(I);
4670 case TargetOpcode::G_EXTRACT_VECTOR_ELT:
4671 return selectG_EXTRACT_VECTOR_ELT(MI&: I);
4672 case TargetOpcode::G_INSERT_VECTOR_ELT:
4673 return selectG_INSERT_VECTOR_ELT(MI&: I);
4674 case AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD:
4675 case AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD_D16:
4676 case AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD_NORET:
4677 case AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE:
4678 case AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE_D16: {
4679 const AMDGPU::ImageDimIntrinsicInfo *Intr =
4680 AMDGPU::getImageDimIntrinsicInfo(Intr: AMDGPU::getIntrinsicID(I));
4681 assert(Intr && "not an image intrinsic with image pseudo");
4682 return selectImageIntrinsic(MI&: I, Intr);
4683 }
4684 case AMDGPU::G_AMDGPU_BVH_DUAL_INTERSECT_RAY:
4685 case AMDGPU::G_AMDGPU_BVH_INTERSECT_RAY:
4686 case AMDGPU::G_AMDGPU_BVH8_INTERSECT_RAY:
4687 return selectBVHIntersectRayIntrinsic(MI&: I);
4688 case AMDGPU::G_SBFX:
4689 case AMDGPU::G_UBFX:
4690 return selectG_SBFX_UBFX(MI&: I);
4691 case AMDGPU::G_SI_CALL:
4692 I.setDesc(TII.get(Opcode: AMDGPU::SI_CALL));
4693 return true;
4694 case AMDGPU::G_AMDGPU_WAVE_ADDRESS:
4695 return selectWaveAddress(MI&: I);
4696 case AMDGPU::G_AMDGPU_WHOLE_WAVE_FUNC_RETURN: {
4697 I.setDesc(TII.get(Opcode: AMDGPU::SI_WHOLE_WAVE_FUNC_RETURN));
4698 return true;
4699 }
4700 case AMDGPU::G_STACKRESTORE:
4701 return selectStackRestore(MI&: I);
4702 case AMDGPU::G_PHI:
4703 return selectPHI(I);
4704 case AMDGPU::G_AMDGPU_COPY_SCC_VCC:
4705 return selectCOPY_SCC_VCC(I);
4706 case AMDGPU::G_AMDGPU_COPY_VCC_SCC:
4707 return selectCOPY_VCC_SCC(I);
4708 case AMDGPU::G_AMDGPU_READANYLANE:
4709 return selectReadAnyLane(I);
4710 case TargetOpcode::G_CONSTANT:
4711 case TargetOpcode::G_FCONSTANT:
4712 default:
4713 return selectImpl(I, CoverageInfo&: *CoverageInfo);
4714 }
4715 return false;
4716}
4717
4718InstructionSelector::ComplexRendererFns
4719AMDGPUInstructionSelector::selectVCSRC(MachineOperand &Root) const {
4720 return {{
4721 [=](MachineInstrBuilder &MIB) { MIB.add(MO: Root); }
4722 }};
4723
4724}
4725
4726std::pair<Register, unsigned> AMDGPUInstructionSelector::selectVOP3ModsImpl(
4727 Register Src, bool IsCanonicalizing, bool AllowAbs, bool OpSel) const {
4728 unsigned Mods = 0;
4729 MachineInstr *MI = getDefIgnoringCopies(Reg: Src, MRI: *MRI);
4730
4731 if (MI->getOpcode() == AMDGPU::G_FNEG) {
4732 Src = MI->getOperand(i: 1).getReg();
4733 Mods |= SISrcMods::NEG;
4734 MI = getDefIgnoringCopies(Reg: Src, MRI: *MRI);
4735 } else if (MI->getOpcode() == AMDGPU::G_FSUB && IsCanonicalizing) {
4736 // Fold fsub [+-]0 into fneg. This may not have folded depending on the
4737 // denormal mode, but we're implicitly canonicalizing in a source operand.
4738 const ConstantFP *LHS =
4739 getConstantFPVRegVal(VReg: MI->getOperand(i: 1).getReg(), MRI: *MRI);
4740 if (LHS && LHS->isZero()) {
4741 Mods |= SISrcMods::NEG;
4742 Src = MI->getOperand(i: 2).getReg();
4743 }
4744 }
4745
4746 if (AllowAbs && MI->getOpcode() == AMDGPU::G_FABS) {
4747 Src = MI->getOperand(i: 1).getReg();
4748 Mods |= SISrcMods::ABS;
4749 }
4750
4751 if (OpSel)
4752 Mods |= SISrcMods::OP_SEL_0;
4753
4754 return std::pair(Src, Mods);
4755}
4756
4757std::pair<Register, unsigned>
4758AMDGPUInstructionSelector::selectVOP3PModsF32Impl(Register Src) const {
4759 unsigned Mods;
4760 std::tie(args&: Src, args&: Mods) = selectVOP3ModsImpl(Src);
4761 Mods |= SISrcMods::OP_SEL_1;
4762 return std::pair(Src, Mods);
4763}
4764
4765Register AMDGPUInstructionSelector::copyToVGPRIfSrcFolded(
4766 Register Src, unsigned Mods, MachineOperand Root, MachineInstr *InsertPt,
4767 bool ForceVGPR) const {
4768 if ((Mods != 0 || ForceVGPR) &&
4769 RBI.getRegBank(Reg: Src, MRI: *MRI, TRI)->getID() != AMDGPU::VGPRRegBankID) {
4770
4771 // If we looked through copies to find source modifiers on an SGPR operand,
4772 // we now have an SGPR register source. To avoid potentially violating the
4773 // constant bus restriction, we need to insert a copy to a VGPR.
4774 Register VGPRSrc = MRI->cloneVirtualRegister(VReg: Root.getReg());
4775 BuildMI(BB&: *InsertPt->getParent(), I: InsertPt, MIMD: InsertPt->getDebugLoc(),
4776 MCID: TII.get(Opcode: AMDGPU::COPY), DestReg: VGPRSrc)
4777 .addReg(RegNo: Src);
4778 Src = VGPRSrc;
4779 }
4780
4781 return Src;
4782}
4783
4784///
4785/// This will select either an SGPR or VGPR operand and will save us from
4786/// having to write an extra tablegen pattern.
4787InstructionSelector::ComplexRendererFns
4788AMDGPUInstructionSelector::selectVSRC0(MachineOperand &Root) const {
4789 return {{
4790 [=](MachineInstrBuilder &MIB) { MIB.add(MO: Root); }
4791 }};
4792}
4793
4794InstructionSelector::ComplexRendererFns
4795AMDGPUInstructionSelector::selectVOP3Mods0(MachineOperand &Root) const {
4796 Register Src;
4797 unsigned Mods;
4798 std::tie(args&: Src, args&: Mods) = selectVOP3ModsImpl(Src: Root.getReg());
4799
4800 return {{
4801 [=](MachineInstrBuilder &MIB) {
4802 MIB.addReg(RegNo: copyToVGPRIfSrcFolded(Src, Mods, Root, InsertPt: MIB));
4803 },
4804 [=](MachineInstrBuilder &MIB) { MIB.addImm(Val: Mods); }, // src0_mods
4805 [=](MachineInstrBuilder &MIB) { MIB.addImm(Val: 0); }, // clamp
4806 [=](MachineInstrBuilder &MIB) { MIB.addImm(Val: 0); } // omod
4807 }};
4808}
4809
4810InstructionSelector::ComplexRendererFns
4811AMDGPUInstructionSelector::selectVOP3BMods0(MachineOperand &Root) const {
4812 Register Src;
4813 unsigned Mods;
4814 std::tie(args&: Src, args&: Mods) = selectVOP3ModsImpl(Src: Root.getReg(),
4815 /*IsCanonicalizing=*/true,
4816 /*AllowAbs=*/false);
4817
4818 return {{
4819 [=](MachineInstrBuilder &MIB) {
4820 MIB.addReg(RegNo: copyToVGPRIfSrcFolded(Src, Mods, Root, InsertPt: MIB));
4821 },
4822 [=](MachineInstrBuilder &MIB) { MIB.addImm(Val: Mods); }, // src0_mods
4823 [=](MachineInstrBuilder &MIB) { MIB.addImm(Val: 0); }, // clamp
4824 [=](MachineInstrBuilder &MIB) { MIB.addImm(Val: 0); } // omod
4825 }};
4826}
4827
4828InstructionSelector::ComplexRendererFns
4829AMDGPUInstructionSelector::selectVOP3OMods(MachineOperand &Root) const {
4830 return {{
4831 [=](MachineInstrBuilder &MIB) { MIB.add(MO: Root); },
4832 [=](MachineInstrBuilder &MIB) { MIB.addImm(Val: 0); }, // clamp
4833 [=](MachineInstrBuilder &MIB) { MIB.addImm(Val: 0); } // omod
4834 }};
4835}
4836
4837InstructionSelector::ComplexRendererFns
4838AMDGPUInstructionSelector::selectVOP3Mods(MachineOperand &Root) const {
4839 Register Src;
4840 unsigned Mods;
4841 std::tie(args&: Src, args&: Mods) = selectVOP3ModsImpl(Src: Root.getReg());
4842
4843 return {{
4844 [=](MachineInstrBuilder &MIB) {
4845 MIB.addReg(RegNo: copyToVGPRIfSrcFolded(Src, Mods, Root, InsertPt: MIB));
4846 },
4847 [=](MachineInstrBuilder &MIB) { MIB.addImm(Val: Mods); } // src_mods
4848 }};
4849}
4850
4851InstructionSelector::ComplexRendererFns
4852AMDGPUInstructionSelector::selectVOP3ModsNonCanonicalizing(
4853 MachineOperand &Root) const {
4854 Register Src;
4855 unsigned Mods;
4856 std::tie(args&: Src, args&: Mods) =
4857 selectVOP3ModsImpl(Src: Root.getReg(), /*IsCanonicalizing=*/false);
4858
4859 return {{
4860 [=](MachineInstrBuilder &MIB) {
4861 MIB.addReg(RegNo: copyToVGPRIfSrcFolded(Src, Mods, Root, InsertPt: MIB));
4862 },
4863 [=](MachineInstrBuilder &MIB) { MIB.addImm(Val: Mods); } // src_mods
4864 }};
4865}
4866
4867InstructionSelector::ComplexRendererFns
4868AMDGPUInstructionSelector::selectVOP3BMods(MachineOperand &Root) const {
4869 Register Src;
4870 unsigned Mods;
4871 std::tie(args&: Src, args&: Mods) =
4872 selectVOP3ModsImpl(Src: Root.getReg(), /*IsCanonicalizing=*/true,
4873 /*AllowAbs=*/false);
4874
4875 return {{
4876 [=](MachineInstrBuilder &MIB) {
4877 MIB.addReg(RegNo: copyToVGPRIfSrcFolded(Src, Mods, Root, InsertPt: MIB));
4878 },
4879 [=](MachineInstrBuilder &MIB) { MIB.addImm(Val: Mods); } // src_mods
4880 }};
4881}
4882
4883InstructionSelector::ComplexRendererFns
4884AMDGPUInstructionSelector::selectVOP3NoMods(MachineOperand &Root) const {
4885 Register Reg = Root.getReg();
4886 const MachineInstr *Def = getDefIgnoringCopies(Reg, MRI: *MRI);
4887 if (Def->getOpcode() == AMDGPU::G_FNEG || Def->getOpcode() == AMDGPU::G_FABS)
4888 return {};
4889 return {{
4890 [=](MachineInstrBuilder &MIB) { MIB.addReg(RegNo: Reg); },
4891 }};
4892}
4893
/// Relation between the value being searched for ("src") and the register
/// currently reached while walking its def chain ("op").
///
/// The enumerator order is significant: SearchOptions::checkOptions() tests
/// membership with range comparisons against the *_START / *_END aliases.
enum class SrcStatus {
  // src is op itself.
  IS_SAME,
  // src is the upper half of op.
  IS_UPPER_HALF,
  // src is the lower half of op.
  IS_LOWER_HALF,
  // src is the negated upper half of op.
  IS_UPPER_HALF_NEG,
  // This means current op = [op_upper, op_lower] and src = -op_lower.
  IS_LOWER_HALF_NEG,
  // op = [op_upper, op_lower] and src = [-op_upper, op_lower].
  IS_HI_NEG,
  // This means current op = [op_upper, op_lower] and src = [op_upper,
  // -op_lower].
  IS_LO_NEG,
  // op = [op_upper, op_lower] and src = [-op_upper, -op_lower].
  IS_BOTH_NEG,
  // No supported relation.
  INVALID,

  // Inclusive range of the statuses that involve a negation.
  NEG_START = IS_UPPER_HALF_NEG,
  NEG_END = IS_BOTH_NEG,
  // Inclusive range of the statuses that select one half of op.
  HALF_START = IS_UPPER_HALF,
  HALF_END = IS_LOWER_HALF_NEG
};
4912/// Test if the MI is truncating to half, such as `%reg0:n = G_TRUNC %reg1:2n`
4913static bool isTruncHalf(const MachineInstr *MI,
4914 const MachineRegisterInfo &MRI) {
4915 if (MI->getOpcode() != AMDGPU::G_TRUNC)
4916 return false;
4917
4918 unsigned DstSize = MRI.getType(Reg: MI->getOperand(i: 0).getReg()).getSizeInBits();
4919 unsigned SrcSize = MRI.getType(Reg: MI->getOperand(i: 1).getReg()).getSizeInBits();
4920 return DstSize * 2 == SrcSize;
4921}
4922
4923/// Test if the MI is logic shift right with half bits,
4924/// such as `%reg0:2n =G_LSHR %reg1:2n, CONST(n)`
4925static bool isLshrHalf(const MachineInstr *MI, const MachineRegisterInfo &MRI) {
4926 if (MI->getOpcode() != AMDGPU::G_LSHR)
4927 return false;
4928
4929 Register ShiftSrc;
4930 std::optional<ValueAndVReg> ShiftAmt;
4931 if (mi_match(R: MI->getOperand(i: 0).getReg(), MRI,
4932 P: m_GLShr(L: m_Reg(R&: ShiftSrc), R: m_GCst(ValReg&: ShiftAmt)))) {
4933 unsigned SrcSize = MRI.getType(Reg: MI->getOperand(i: 1).getReg()).getSizeInBits();
4934 unsigned Shift = ShiftAmt->Value.getZExtValue();
4935 return Shift * 2 == SrcSize;
4936 }
4937 return false;
4938}
4939
4940/// Test if the MI is shift left with half bits,
4941/// such as `%reg0:2n =G_SHL %reg1:2n, CONST(n)`
4942static bool isShlHalf(const MachineInstr *MI, const MachineRegisterInfo &MRI) {
4943 if (MI->getOpcode() != AMDGPU::G_SHL)
4944 return false;
4945
4946 Register ShiftSrc;
4947 std::optional<ValueAndVReg> ShiftAmt;
4948 if (mi_match(R: MI->getOperand(i: 0).getReg(), MRI,
4949 P: m_GShl(L: m_Reg(R&: ShiftSrc), R: m_GCst(ValReg&: ShiftAmt)))) {
4950 unsigned SrcSize = MRI.getType(Reg: MI->getOperand(i: 1).getReg()).getSizeInBits();
4951 unsigned Shift = ShiftAmt->Value.getZExtValue();
4952 return Shift * 2 == SrcSize;
4953 }
4954 return false;
4955}
4956
4957/// Test function, if the MI is `%reg0:n, %reg1:n = G_UNMERGE_VALUES %reg2:2n`
4958static bool isUnmergeHalf(const MachineInstr *MI,
4959 const MachineRegisterInfo &MRI) {
4960 if (MI->getOpcode() != AMDGPU::G_UNMERGE_VALUES)
4961 return false;
4962 return MI->getNumOperands() == 3 && MI->getOperand(i: 0).isDef() &&
4963 MI->getOperand(i: 1).isDef() && !MI->getOperand(i: 2).isDef();
4964}
4965
4966enum class TypeClass { VECTOR_OF_TWO, SCALAR, NONE_OF_LISTED };
4967
4968static TypeClass isVectorOfTwoOrScalar(Register Reg,
4969 const MachineRegisterInfo &MRI) {
4970 LLT OpTy = MRI.getType(Reg);
4971 if (OpTy.isScalar())
4972 return TypeClass::SCALAR;
4973 if (OpTy.isVector() && OpTy.getNumElements() == 2)
4974 return TypeClass::VECTOR_OF_TWO;
4975 return TypeClass::NONE_OF_LISTED;
4976}
4977
4978static SrcStatus getNegStatus(Register Reg, SrcStatus S,
4979 const MachineRegisterInfo &MRI) {
4980 TypeClass NegType = isVectorOfTwoOrScalar(Reg, MRI);
4981 if (NegType != TypeClass::VECTOR_OF_TWO && NegType != TypeClass::SCALAR)
4982 return SrcStatus::INVALID;
4983
4984 switch (S) {
4985 case SrcStatus::IS_SAME:
4986 if (NegType == TypeClass::VECTOR_OF_TWO) {
4987 // Vector of 2:
4988 // [SrcHi, SrcLo] = [CurrHi, CurrLo]
4989 // [CurrHi, CurrLo] = neg [OpHi, OpLo](2 x Type)
4990 // [CurrHi, CurrLo] = [-OpHi, -OpLo](2 x Type)
4991 // [SrcHi, SrcLo] = [-OpHi, -OpLo]
4992 return SrcStatus::IS_BOTH_NEG;
4993 }
4994 if (NegType == TypeClass::SCALAR) {
4995 // Scalar:
4996 // [SrcHi, SrcLo] = [CurrHi, CurrLo]
4997 // [CurrHi, CurrLo] = neg [OpHi, OpLo](Type)
4998 // [CurrHi, CurrLo] = [-OpHi, OpLo](Type)
4999 // [SrcHi, SrcLo] = [-OpHi, OpLo]
5000 return SrcStatus::IS_HI_NEG;
5001 }
5002 break;
5003 case SrcStatus::IS_HI_NEG:
5004 if (NegType == TypeClass::VECTOR_OF_TWO) {
5005 // Vector of 2:
5006 // [SrcHi, SrcLo] = [-CurrHi, CurrLo]
5007 // [CurrHi, CurrLo] = neg [OpHi, OpLo](2 x Type)
5008 // [CurrHi, CurrLo] = [-OpHi, -OpLo](2 x Type)
5009 // [SrcHi, SrcLo] = [-(-OpHi), -OpLo] = [OpHi, -OpLo]
5010 return SrcStatus::IS_LO_NEG;
5011 }
5012 if (NegType == TypeClass::SCALAR) {
5013 // Scalar:
5014 // [SrcHi, SrcLo] = [-CurrHi, CurrLo]
5015 // [CurrHi, CurrLo] = neg [OpHi, OpLo](Type)
5016 // [CurrHi, CurrLo] = [-OpHi, OpLo](Type)
5017 // [SrcHi, SrcLo] = [-(-OpHi), OpLo] = [OpHi, OpLo]
5018 return SrcStatus::IS_SAME;
5019 }
5020 break;
5021 case SrcStatus::IS_LO_NEG:
5022 if (NegType == TypeClass::VECTOR_OF_TWO) {
5023 // Vector of 2:
5024 // [SrcHi, SrcLo] = [CurrHi, -CurrLo]
5025 // [CurrHi, CurrLo] = fneg [OpHi, OpLo](2 x Type)
5026 // [CurrHi, CurrLo] = [-OpHi, -OpLo](2 x Type)
5027 // [SrcHi, SrcLo] = [-OpHi, -(-OpLo)] = [-OpHi, OpLo]
5028 return SrcStatus::IS_HI_NEG;
5029 }
5030 if (NegType == TypeClass::SCALAR) {
5031 // Scalar:
5032 // [SrcHi, SrcLo] = [CurrHi, -CurrLo]
5033 // [CurrHi, CurrLo] = fneg [OpHi, OpLo](Type)
5034 // [CurrHi, CurrLo] = [-OpHi, OpLo](Type)
5035 // [SrcHi, SrcLo] = [-OpHi, -OpLo]
5036 return SrcStatus::IS_BOTH_NEG;
5037 }
5038 break;
5039 case SrcStatus::IS_BOTH_NEG:
5040 if (NegType == TypeClass::VECTOR_OF_TWO) {
5041 // Vector of 2:
5042 // [SrcHi, SrcLo] = [-CurrHi, -CurrLo]
5043 // [CurrHi, CurrLo] = fneg [OpHi, OpLo](2 x Type)
5044 // [CurrHi, CurrLo] = [-OpHi, -OpLo](2 x Type)
5045 // [SrcHi, SrcLo] = [OpHi, OpLo]
5046 return SrcStatus::IS_SAME;
5047 }
5048 if (NegType == TypeClass::SCALAR) {
5049 // Scalar:
5050 // [SrcHi, SrcLo] = [-CurrHi, -CurrLo]
5051 // [CurrHi, CurrLo] = fneg [OpHi, OpLo](Type)
5052 // [CurrHi, CurrLo] = [-OpHi, OpLo](Type)
5053 // [SrcHi, SrcLo] = [OpHi, -OpLo]
5054 return SrcStatus::IS_LO_NEG;
5055 }
5056 break;
5057 case SrcStatus::IS_UPPER_HALF:
5058 // Vector of 2:
5059 // Src = CurrUpper
5060 // Curr = [CurrUpper, CurrLower]
5061 // [CurrUpper, CurrLower] = fneg [OpUpper, OpLower](2 x Type)
5062 // [CurrUpper, CurrLower] = [-OpUpper, -OpLower](2 x Type)
5063 // Src = -OpUpper
5064 //
5065 // Scalar:
5066 // Src = CurrUpper
5067 // Curr = [CurrUpper, CurrLower]
5068 // [CurrUpper, CurrLower] = fneg [OpUpper, OpLower](Type)
5069 // [CurrUpper, CurrLower] = [-OpUpper, OpLower](Type)
5070 // Src = -OpUpper
5071 return SrcStatus::IS_UPPER_HALF_NEG;
5072 case SrcStatus::IS_LOWER_HALF:
5073 if (NegType == TypeClass::VECTOR_OF_TWO) {
5074 // Vector of 2:
5075 // Src = CurrLower
5076 // Curr = [CurrUpper, CurrLower]
5077 // [CurrUpper, CurrLower] = fneg [OpUpper, OpLower](2 x Type)
5078 // [CurrUpper, CurrLower] = [-OpUpper, -OpLower](2 x Type)
5079 // Src = -OpLower
5080 return SrcStatus::IS_LOWER_HALF_NEG;
5081 }
5082 if (NegType == TypeClass::SCALAR) {
5083 // Scalar:
5084 // Src = CurrLower
5085 // Curr = [CurrUpper, CurrLower]
5086 // [CurrUpper, CurrLower] = fneg [OpUpper, OpLower](Type)
5087 // [CurrUpper, CurrLower] = [-OpUpper, OpLower](Type)
5088 // Src = OpLower
5089 return SrcStatus::IS_LOWER_HALF;
5090 }
5091 break;
5092 case SrcStatus::IS_UPPER_HALF_NEG:
5093 // Vector of 2:
5094 // Src = -CurrUpper
5095 // Curr = [CurrUpper, CurrLower]
5096 // [CurrUpper, CurrLower] = fneg [OpUpper, OpLower](2 x Type)
5097 // [CurrUpper, CurrLower] = [-OpUpper, -OpLower](2 x Type)
5098 // Src = -(-OpUpper) = OpUpper
5099 //
5100 // Scalar:
5101 // Src = -CurrUpper
5102 // Curr = [CurrUpper, CurrLower]
5103 // [CurrUpper, CurrLower] = fneg [OpUpper, OpLower](Type)
5104 // [CurrUpper, CurrLower] = [-OpUpper, OpLower](Type)
5105 // Src = -(-OpUpper) = OpUpper
5106 return SrcStatus::IS_UPPER_HALF;
5107 case SrcStatus::IS_LOWER_HALF_NEG:
5108 if (NegType == TypeClass::VECTOR_OF_TWO) {
5109 // Vector of 2:
5110 // Src = -CurrLower
5111 // Curr = [CurrUpper, CurrLower]
5112 // [CurrUpper, CurrLower] = fneg [OpUpper, OpLower](2 x Type)
5113 // [CurrUpper, CurrLower] = [-OpUpper, -OpLower](2 x Type)
5114 // Src = -(-OpLower) = OpLower
5115 return SrcStatus::IS_LOWER_HALF;
5116 }
5117 if (NegType == TypeClass::SCALAR) {
5118 // Scalar:
5119 // Src = -CurrLower
5120 // Curr = [CurrUpper, CurrLower]
5121 // [CurrUpper, CurrLower] = fneg [OpUpper, OpLower](Type)
5122 // [CurrUpper, CurrLower] = [-OpUpper, OpLower](Type)
5123 // Src = -OpLower
5124 return SrcStatus::IS_LOWER_HALF_NEG;
5125 }
5126 break;
5127 default:
5128 break;
5129 }
5130 llvm_unreachable("unexpected SrcStatus & NegType combination");
5131}
5132
5133static std::optional<std::pair<Register, SrcStatus>>
5134calcNextStatus(std::pair<Register, SrcStatus> Curr,
5135 const MachineRegisterInfo &MRI) {
5136 const MachineInstr *MI = MRI.getVRegDef(Reg: Curr.first);
5137
5138 unsigned Opc = MI->getOpcode();
5139
5140 // Handle general Opc cases.
5141 switch (Opc) {
5142 case AMDGPU::G_BITCAST:
5143 return std::optional<std::pair<Register, SrcStatus>>(
5144 {MI->getOperand(i: 1).getReg(), Curr.second});
5145 case AMDGPU::COPY:
5146 if (MI->getOperand(i: 1).getReg().isPhysical())
5147 return std::nullopt;
5148 return std::optional<std::pair<Register, SrcStatus>>(
5149 {MI->getOperand(i: 1).getReg(), Curr.second});
5150 case AMDGPU::G_FNEG: {
5151 SrcStatus Stat = getNegStatus(Reg: Curr.first, S: Curr.second, MRI);
5152 if (Stat == SrcStatus::INVALID)
5153 return std::nullopt;
5154 return std::optional<std::pair<Register, SrcStatus>>(
5155 {MI->getOperand(i: 1).getReg(), Stat});
5156 }
5157 default:
5158 break;
5159 }
5160
5161 // Calc next Stat from current Stat.
5162 switch (Curr.second) {
5163 case SrcStatus::IS_SAME:
5164 if (isTruncHalf(MI, MRI))
5165 return std::optional<std::pair<Register, SrcStatus>>(
5166 {MI->getOperand(i: 1).getReg(), SrcStatus::IS_LOWER_HALF});
5167 else if (isUnmergeHalf(MI, MRI)) {
5168 if (Curr.first == MI->getOperand(i: 0).getReg())
5169 return std::optional<std::pair<Register, SrcStatus>>(
5170 {MI->getOperand(i: 2).getReg(), SrcStatus::IS_LOWER_HALF});
5171 return std::optional<std::pair<Register, SrcStatus>>(
5172 {MI->getOperand(i: 2).getReg(), SrcStatus::IS_UPPER_HALF});
5173 }
5174 break;
5175 case SrcStatus::IS_HI_NEG:
5176 if (isTruncHalf(MI, MRI)) {
5177 // [SrcHi, SrcLo] = [-CurrHi, CurrLo]
5178 // [CurrHi, CurrLo] = trunc [OpUpper, OpLower] = OpLower
5179 // = [OpLowerHi, OpLowerLo]
5180 // Src = [SrcHi, SrcLo] = [-CurrHi, CurrLo]
5181 // = [-OpLowerHi, OpLowerLo]
5182 // = -OpLower
5183 return std::optional<std::pair<Register, SrcStatus>>(
5184 {MI->getOperand(i: 1).getReg(), SrcStatus::IS_LOWER_HALF_NEG});
5185 }
5186 if (isUnmergeHalf(MI, MRI)) {
5187 if (Curr.first == MI->getOperand(i: 0).getReg())
5188 return std::optional<std::pair<Register, SrcStatus>>(
5189 {MI->getOperand(i: 2).getReg(), SrcStatus::IS_LOWER_HALF_NEG});
5190 return std::optional<std::pair<Register, SrcStatus>>(
5191 {MI->getOperand(i: 2).getReg(), SrcStatus::IS_UPPER_HALF_NEG});
5192 }
5193 break;
5194 case SrcStatus::IS_UPPER_HALF:
5195 if (isShlHalf(MI, MRI))
5196 return std::optional<std::pair<Register, SrcStatus>>(
5197 {MI->getOperand(i: 1).getReg(), SrcStatus::IS_LOWER_HALF});
5198 break;
5199 case SrcStatus::IS_LOWER_HALF:
5200 if (isLshrHalf(MI, MRI))
5201 return std::optional<std::pair<Register, SrcStatus>>(
5202 {MI->getOperand(i: 1).getReg(), SrcStatus::IS_UPPER_HALF});
5203 break;
5204 case SrcStatus::IS_UPPER_HALF_NEG:
5205 if (isShlHalf(MI, MRI))
5206 return std::optional<std::pair<Register, SrcStatus>>(
5207 {MI->getOperand(i: 1).getReg(), SrcStatus::IS_LOWER_HALF_NEG});
5208 break;
5209 case SrcStatus::IS_LOWER_HALF_NEG:
5210 if (isLshrHalf(MI, MRI))
5211 return std::optional<std::pair<Register, SrcStatus>>(
5212 {MI->getOperand(i: 1).getReg(), SrcStatus::IS_UPPER_HALF_NEG});
5213 break;
5214 default:
5215 break;
5216 }
5217 return std::nullopt;
5218}
5219
5220/// This is used to control valid status that current MI supports. For example,
5221/// non floating point intrinsic such as @llvm.amdgcn.sdot2 does not support NEG
5222/// bit on VOP3P.
5223/// The class can be further extended to recognize support on SEL, NEG, ABS bit
5224/// for different MI on different arch
5225class SearchOptions {
5226private:
5227 bool HasNeg = false;
5228 // Assume all complex pattern of VOP3P have opsel.
5229 bool HasOpsel = true;
5230
5231public:
5232 SearchOptions(Register Reg, const MachineRegisterInfo &MRI) {
5233 const MachineInstr *MI = MRI.getVRegDef(Reg);
5234 unsigned Opc = MI->getOpcode();
5235
5236 if (Opc == TargetOpcode::G_INTRINSIC) {
5237 Intrinsic::ID IntrinsicID = cast<GIntrinsic>(Val: *MI).getIntrinsicID();
5238 // Only float point intrinsic has neg & neg_hi bits.
5239 if (IntrinsicID == Intrinsic::amdgcn_fdot2)
5240 HasNeg = true;
5241 } else if (TargetInstrInfo::isGenericOpcode(Opc)) {
5242 // Keep same for generic op.
5243 HasNeg = true;
5244 }
5245 }
5246 bool checkOptions(SrcStatus Stat) const {
5247 if (!HasNeg &&
5248 (Stat >= SrcStatus::NEG_START && Stat <= SrcStatus::NEG_END)) {
5249 return false;
5250 }
5251 if (!HasOpsel &&
5252 (Stat >= SrcStatus::HALF_START && Stat <= SrcStatus::HALF_END)) {
5253 return false;
5254 }
5255 return true;
5256 }
5257};
5258
5259static SmallVector<std::pair<Register, SrcStatus>>
5260getSrcStats(Register Reg, const MachineRegisterInfo &MRI, SearchOptions SO,
5261 int MaxDepth = 3) {
5262 int Depth = 0;
5263 auto Curr = calcNextStatus(Curr: {Reg, SrcStatus::IS_SAME}, MRI);
5264 SmallVector<std::pair<Register, SrcStatus>> Statlist;
5265
5266 while (Depth <= MaxDepth && Curr.has_value()) {
5267 Depth++;
5268 if (SO.checkOptions(Stat: Curr.value().second))
5269 Statlist.push_back(Elt: Curr.value());
5270 Curr = calcNextStatus(Curr: Curr.value(), MRI);
5271 }
5272
5273 return Statlist;
5274}
5275
5276static std::pair<Register, SrcStatus>
5277getLastSameOrNeg(Register Reg, const MachineRegisterInfo &MRI, SearchOptions SO,
5278 int MaxDepth = 3) {
5279 int Depth = 0;
5280 std::pair<Register, SrcStatus> LastSameOrNeg = {Reg, SrcStatus::IS_SAME};
5281 auto Curr = calcNextStatus(Curr: LastSameOrNeg, MRI);
5282
5283 while (Depth <= MaxDepth && Curr.has_value()) {
5284 Depth++;
5285 SrcStatus Stat = Curr.value().second;
5286 if (SO.checkOptions(Stat)) {
5287 if (Stat == SrcStatus::IS_SAME || Stat == SrcStatus::IS_HI_NEG ||
5288 Stat == SrcStatus::IS_LO_NEG || Stat == SrcStatus::IS_BOTH_NEG)
5289 LastSameOrNeg = Curr.value();
5290 }
5291 Curr = calcNextStatus(Curr: Curr.value(), MRI);
5292 }
5293
5294 return LastSameOrNeg;
5295}
5296
5297static bool isSameBitWidth(Register Reg1, Register Reg2,
5298 const MachineRegisterInfo &MRI) {
5299 unsigned Width1 = MRI.getType(Reg: Reg1).getSizeInBits();
5300 unsigned Width2 = MRI.getType(Reg: Reg2).getSizeInBits();
5301 return Width1 == Width2;
5302}
5303
5304static unsigned updateMods(SrcStatus HiStat, SrcStatus LoStat, unsigned Mods) {
5305 // SrcStatus::IS_LOWER_HALF remain 0.
5306 if (HiStat == SrcStatus::IS_UPPER_HALF_NEG) {
5307 Mods ^= SISrcMods::NEG_HI;
5308 Mods |= SISrcMods::OP_SEL_1;
5309 } else if (HiStat == SrcStatus::IS_UPPER_HALF)
5310 Mods |= SISrcMods::OP_SEL_1;
5311 else if (HiStat == SrcStatus::IS_LOWER_HALF_NEG)
5312 Mods ^= SISrcMods::NEG_HI;
5313 else if (HiStat == SrcStatus::IS_HI_NEG)
5314 Mods ^= SISrcMods::NEG_HI;
5315
5316 if (LoStat == SrcStatus::IS_UPPER_HALF_NEG) {
5317 Mods ^= SISrcMods::NEG;
5318 Mods |= SISrcMods::OP_SEL_0;
5319 } else if (LoStat == SrcStatus::IS_UPPER_HALF)
5320 Mods |= SISrcMods::OP_SEL_0;
5321 else if (LoStat == SrcStatus::IS_LOWER_HALF_NEG)
5322 Mods |= SISrcMods::NEG;
5323 else if (LoStat == SrcStatus::IS_HI_NEG)
5324 Mods ^= SISrcMods::NEG;
5325
5326 return Mods;
5327}
5328
5329static bool isValidToPack(SrcStatus HiStat, SrcStatus LoStat, Register NewReg,
5330 Register RootReg, const SIInstrInfo &TII,
5331 const MachineRegisterInfo &MRI) {
5332 auto IsHalfState = [](SrcStatus S) {
5333 return S == SrcStatus::IS_UPPER_HALF || S == SrcStatus::IS_UPPER_HALF_NEG ||
5334 S == SrcStatus::IS_LOWER_HALF || S == SrcStatus::IS_LOWER_HALF_NEG;
5335 };
5336 return isSameBitWidth(Reg1: NewReg, Reg2: RootReg, MRI) && IsHalfState(LoStat) &&
5337 IsHalfState(HiStat);
5338}
5339
5340std::pair<Register, unsigned> AMDGPUInstructionSelector::selectVOP3PModsImpl(
5341 Register RootReg, const MachineRegisterInfo &MRI, bool IsDOT) const {
5342 unsigned Mods = 0;
5343 // No modification if Root type is not form of <2 x Type>.
5344 if (isVectorOfTwoOrScalar(Reg: RootReg, MRI) != TypeClass::VECTOR_OF_TWO) {
5345 Mods |= SISrcMods::OP_SEL_1;
5346 return {RootReg, Mods};
5347 }
5348
5349 SearchOptions SO(RootReg, MRI);
5350
5351 std::pair<Register, SrcStatus> Stat = getLastSameOrNeg(Reg: RootReg, MRI, SO);
5352
5353 if (Stat.second == SrcStatus::IS_BOTH_NEG)
5354 Mods ^= (SISrcMods::NEG | SISrcMods::NEG_HI);
5355 else if (Stat.second == SrcStatus::IS_HI_NEG)
5356 Mods ^= SISrcMods::NEG_HI;
5357 else if (Stat.second == SrcStatus::IS_LO_NEG)
5358 Mods ^= SISrcMods::NEG;
5359
5360 // 64-bit VOP3P instructions do not have OPSEL or ABS. Bail on v2f64 or v2i64.
5361 // TODO: Select NEG_LO and NEG_HI modifiers from BUILD_VECTOR.
5362 if (MRI.getType(Reg: RootReg).getSizeInBits() == 128) {
5363 Mods |= SISrcMods::OP_SEL_1; // Just the default, OPSEL unsupported.
5364 return {Stat.first, Mods};
5365 }
5366
5367 MachineInstr *MI = MRI.getVRegDef(Reg: Stat.first);
5368
5369 if (MI->getOpcode() != AMDGPU::G_BUILD_VECTOR || MI->getNumOperands() != 3 ||
5370 (IsDOT && Subtarget->hasDOTOpSelHazard())) {
5371 Mods |= SISrcMods::OP_SEL_1;
5372 return {Stat.first, Mods};
5373 }
5374
5375 SmallVector<std::pair<Register, SrcStatus>> StatlistHi =
5376 getSrcStats(Reg: MI->getOperand(i: 2).getReg(), MRI, SO);
5377
5378 if (StatlistHi.empty()) {
5379 Mods |= SISrcMods::OP_SEL_1;
5380 return {Stat.first, Mods};
5381 }
5382
5383 SmallVector<std::pair<Register, SrcStatus>> StatlistLo =
5384 getSrcStats(Reg: MI->getOperand(i: 1).getReg(), MRI, SO);
5385
5386 if (StatlistLo.empty()) {
5387 Mods |= SISrcMods::OP_SEL_1;
5388 return {Stat.first, Mods};
5389 }
5390
5391 for (int I = StatlistHi.size() - 1; I >= 0; I--) {
5392 for (int J = StatlistLo.size() - 1; J >= 0; J--) {
5393 if (StatlistHi[I].first == StatlistLo[J].first &&
5394 isValidToPack(HiStat: StatlistHi[I].second, LoStat: StatlistLo[J].second,
5395 NewReg: StatlistHi[I].first, RootReg, TII, MRI))
5396 return {StatlistHi[I].first,
5397 updateMods(HiStat: StatlistHi[I].second, LoStat: StatlistLo[J].second, Mods)};
5398 }
5399 }
5400 // Packed instructions do not have abs modifiers.
5401 Mods |= SISrcMods::OP_SEL_1;
5402
5403 return {Stat.first, Mods};
5404}
5405
5406// Removed unused function `getAllKindImm` to eliminate dead code.
5407
5408static bool checkRB(Register Reg, unsigned int RBNo,
5409 const AMDGPURegisterBankInfo &RBI,
5410 const MachineRegisterInfo &MRI,
5411 const TargetRegisterInfo &TRI) {
5412 const RegisterBank *RB = RBI.getRegBank(Reg, MRI, TRI);
5413 return RB->getID() == RBNo;
5414}
5415
5416// This function is used to get the correct register bank for returned reg.
5417// Assume:
5418// 1. VOP3P is always legal for VGPR.
5419// 2. RootOp's regbank is legal.
5420// Thus
5421// 1. If RootOp is SGPR, then NewOp can be SGPR or VGPR.
5422// 2. If RootOp is VGPR, then NewOp must be VGPR.
5423static Register getLegalRegBank(Register NewReg, Register RootReg,
5424 const AMDGPURegisterBankInfo &RBI,
5425 MachineRegisterInfo &MRI,
5426 const TargetRegisterInfo &TRI,
5427 const SIInstrInfo &TII) {
5428 // RootOp can only be VGPR or SGPR (some hand written cases such as.
5429 // inst-select-ashr.v2s16.mir::ashr_v2s16_vs).
5430 if (checkRB(Reg: RootReg, RBNo: AMDGPU::SGPRRegBankID, RBI, MRI, TRI) ||
5431 checkRB(Reg: NewReg, RBNo: AMDGPU::VGPRRegBankID, RBI, MRI, TRI))
5432 return NewReg;
5433
5434 MachineInstr *MI = MRI.getVRegDef(Reg: RootReg);
5435 if (MI->getOpcode() == AMDGPU::COPY && NewReg == MI->getOperand(i: 1).getReg()) {
5436 // RootOp is VGPR, NewOp is not VGPR, but RootOp = COPY NewOp.
5437 return RootReg;
5438 }
5439
5440 MachineBasicBlock *BB = MI->getParent();
5441 Register DstReg = MRI.cloneVirtualRegister(VReg: RootReg);
5442
5443 MachineInstrBuilder MIB =
5444 BuildMI(BB&: *BB, I: MI, MIMD: MI->getDebugLoc(), MCID: TII.get(Opcode: AMDGPU::COPY), DestReg: DstReg)
5445 .addReg(RegNo: NewReg);
5446
5447 // Only accept VGPR.
5448 return MIB->getOperand(i: 0).getReg();
5449}
5450
5451InstructionSelector::ComplexRendererFns
5452AMDGPUInstructionSelector::selectVOP3PRetHelper(MachineOperand &Root,
5453 bool IsDOT) const {
5454 MachineRegisterInfo &MRI = Root.getParent()->getMF()->getRegInfo();
5455 Register Reg;
5456 unsigned Mods;
5457 std::tie(args&: Reg, args&: Mods) = selectVOP3PModsImpl(RootReg: Root.getReg(), MRI, IsDOT);
5458
5459 Reg = getLegalRegBank(NewReg: Reg, RootReg: Root.getReg(), RBI, MRI, TRI, TII);
5460 return {{
5461 [=](MachineInstrBuilder &MIB) { MIB.addReg(RegNo: Reg); },
5462 [=](MachineInstrBuilder &MIB) { MIB.addImm(Val: Mods); } // src_mods
5463 }};
5464}
5465
5466InstructionSelector::ComplexRendererFns
5467AMDGPUInstructionSelector::selectVOP3PMods(MachineOperand &Root) const {
5468
5469 return selectVOP3PRetHelper(Root);
5470}
5471
5472InstructionSelector::ComplexRendererFns
5473AMDGPUInstructionSelector::selectVOP3PModsDOT(MachineOperand &Root) const {
5474
5475 return selectVOP3PRetHelper(Root, IsDOT: true);
5476}
5477
5478InstructionSelector::ComplexRendererFns
5479AMDGPUInstructionSelector::selectVOP3PNoModsDOT(MachineOperand &Root) const {
5480 MachineRegisterInfo &MRI = Root.getParent()->getMF()->getRegInfo();
5481 Register Src;
5482 unsigned Mods;
5483 std::tie(args&: Src, args&: Mods) = selectVOP3PModsImpl(RootReg: Root.getReg(), MRI, IsDOT: true /*IsDOT*/);
5484 if (Mods != SISrcMods::OP_SEL_1)
5485 return {};
5486
5487 return {{[=](MachineInstrBuilder &MIB) { MIB.addReg(RegNo: Src); }}};
5488}
5489
5490InstructionSelector::ComplexRendererFns
5491AMDGPUInstructionSelector::selectVOP3PModsF32(MachineOperand &Root) const {
5492 Register Src;
5493 unsigned Mods;
5494 std::tie(args&: Src, args&: Mods) = selectVOP3PModsF32Impl(Src: Root.getReg());
5495
5496 return {{
5497 [=](MachineInstrBuilder &MIB) { MIB.addReg(RegNo: Src); },
5498 [=](MachineInstrBuilder &MIB) { MIB.addImm(Val: Mods); } // src_mods
5499 }};
5500}
5501
5502InstructionSelector::ComplexRendererFns
5503AMDGPUInstructionSelector::selectVOP3PNoModsF32(MachineOperand &Root) const {
5504 Register Src;
5505 unsigned Mods;
5506 std::tie(args&: Src, args&: Mods) = selectVOP3PModsF32Impl(Src: Root.getReg());
5507 if (Mods != SISrcMods::OP_SEL_1)
5508 return {};
5509
5510 return {{[=](MachineInstrBuilder &MIB) { MIB.addReg(RegNo: Src); }}};
5511}
5512
5513InstructionSelector::ComplexRendererFns
5514AMDGPUInstructionSelector::selectWMMAOpSelVOP3PMods(
5515 MachineOperand &Root) const {
5516 assert((Root.isImm() && (Root.getImm() == -1 || Root.getImm() == 0)) &&
5517 "expected i1 value");
5518 unsigned Mods = SISrcMods::OP_SEL_1;
5519 if (Root.getImm() != 0)
5520 Mods |= SISrcMods::OP_SEL_0;
5521
5522 return {{
5523 [=](MachineInstrBuilder &MIB) { MIB.addImm(Val: Mods); } // src_mods
5524 }};
5525}
5526
5527static Register buildRegSequence(SmallVectorImpl<Register> &Elts,
5528 MachineInstr *InsertPt,
5529 MachineRegisterInfo &MRI) {
5530 const TargetRegisterClass *DstRegClass;
5531 switch (Elts.size()) {
5532 case 8:
5533 DstRegClass = &AMDGPU::VReg_256RegClass;
5534 break;
5535 case 4:
5536 DstRegClass = &AMDGPU::VReg_128RegClass;
5537 break;
5538 case 2:
5539 DstRegClass = &AMDGPU::VReg_64RegClass;
5540 break;
5541 default:
5542 llvm_unreachable("unhandled Reg sequence size");
5543 }
5544
5545 MachineIRBuilder B(*InsertPt);
5546 auto MIB = B.buildInstr(Opcode: AMDGPU::REG_SEQUENCE)
5547 .addDef(RegNo: MRI.createVirtualRegister(RegClass: DstRegClass));
5548 for (unsigned i = 0; i < Elts.size(); ++i) {
5549 MIB.addReg(RegNo: Elts[i]);
5550 MIB.addImm(Val: SIRegisterInfo::getSubRegFromChannel(Channel: i));
5551 }
5552 return MIB->getOperand(i: 0).getReg();
5553}
5554
5555static void selectWMMAModsNegAbs(unsigned ModOpcode, unsigned &Mods,
5556 SmallVectorImpl<Register> &Elts, Register &Src,
5557 MachineInstr *InsertPt,
5558 MachineRegisterInfo &MRI) {
5559 if (ModOpcode == TargetOpcode::G_FNEG) {
5560 Mods |= SISrcMods::NEG;
5561 // Check if all elements also have abs modifier
5562 SmallVector<Register, 8> NegAbsElts;
5563 for (auto El : Elts) {
5564 Register FabsSrc;
5565 if (!mi_match(R: El, MRI, P: m_GFabs(Src: m_Reg(R&: FabsSrc))))
5566 break;
5567 NegAbsElts.push_back(Elt: FabsSrc);
5568 }
5569 if (Elts.size() != NegAbsElts.size()) {
5570 // Neg
5571 Src = buildRegSequence(Elts, InsertPt, MRI);
5572 } else {
5573 // Neg and Abs
5574 Mods |= SISrcMods::NEG_HI;
5575 Src = buildRegSequence(Elts&: NegAbsElts, InsertPt, MRI);
5576 }
5577 } else {
5578 assert(ModOpcode == TargetOpcode::G_FABS);
5579 // Abs
5580 Mods |= SISrcMods::NEG_HI;
5581 Src = buildRegSequence(Elts, InsertPt, MRI);
5582 }
5583}
5584
5585InstructionSelector::ComplexRendererFns
5586AMDGPUInstructionSelector::selectWMMAModsF32NegAbs(MachineOperand &Root) const {
5587 Register Src = Root.getReg();
5588 unsigned Mods = SISrcMods::OP_SEL_1;
5589 SmallVector<Register, 8> EltsF32;
5590
5591 if (GBuildVector *BV = dyn_cast<GBuildVector>(Val: MRI->getVRegDef(Reg: Src))) {
5592 assert(BV->getNumSources() > 0);
5593 // Based on first element decide which mod we match, neg or abs
5594 MachineInstr *ElF32 = MRI->getVRegDef(Reg: BV->getSourceReg(I: 0));
5595 unsigned ModOpcode = (ElF32->getOpcode() == AMDGPU::G_FNEG)
5596 ? AMDGPU::G_FNEG
5597 : AMDGPU::G_FABS;
5598 for (unsigned i = 0; i < BV->getNumSources(); ++i) {
5599 ElF32 = MRI->getVRegDef(Reg: BV->getSourceReg(I: i));
5600 if (ElF32->getOpcode() != ModOpcode)
5601 break;
5602 EltsF32.push_back(Elt: ElF32->getOperand(i: 1).getReg());
5603 }
5604
5605 // All elements had ModOpcode modifier
5606 if (BV->getNumSources() == EltsF32.size()) {
5607 selectWMMAModsNegAbs(ModOpcode, Mods, Elts&: EltsF32, Src, InsertPt: Root.getParent(),
5608 MRI&: *MRI);
5609 }
5610 }
5611
5612 return {{[=](MachineInstrBuilder &MIB) { MIB.addReg(RegNo: Src); },
5613 [=](MachineInstrBuilder &MIB) { MIB.addImm(Val: Mods); }}};
5614}
5615
5616InstructionSelector::ComplexRendererFns
5617AMDGPUInstructionSelector::selectWMMAModsF16Neg(MachineOperand &Root) const {
5618 Register Src = Root.getReg();
5619 unsigned Mods = SISrcMods::OP_SEL_1;
5620 SmallVector<Register, 8> EltsV2F16;
5621
5622 if (GConcatVectors *CV = dyn_cast<GConcatVectors>(Val: MRI->getVRegDef(Reg: Src))) {
5623 for (unsigned i = 0; i < CV->getNumSources(); ++i) {
5624 Register FNegSrc;
5625 if (!mi_match(R: CV->getSourceReg(I: i), MRI: *MRI, P: m_GFNeg(Src: m_Reg(R&: FNegSrc))))
5626 break;
5627 EltsV2F16.push_back(Elt: FNegSrc);
5628 }
5629
5630 // All elements had ModOpcode modifier
5631 if (CV->getNumSources() == EltsV2F16.size()) {
5632 Mods |= SISrcMods::NEG;
5633 Mods |= SISrcMods::NEG_HI;
5634 Src = buildRegSequence(Elts&: EltsV2F16, InsertPt: Root.getParent(), MRI&: *MRI);
5635 }
5636 }
5637
5638 return {{[=](MachineInstrBuilder &MIB) { MIB.addReg(RegNo: Src); },
5639 [=](MachineInstrBuilder &MIB) { MIB.addImm(Val: Mods); }}};
5640}
5641
5642InstructionSelector::ComplexRendererFns
5643AMDGPUInstructionSelector::selectWMMAModsF16NegAbs(MachineOperand &Root) const {
5644 Register Src = Root.getReg();
5645 unsigned Mods = SISrcMods::OP_SEL_1;
5646 SmallVector<Register, 8> EltsV2F16;
5647
5648 if (GConcatVectors *CV = dyn_cast<GConcatVectors>(Val: MRI->getVRegDef(Reg: Src))) {
5649 assert(CV->getNumSources() > 0);
5650 MachineInstr *ElV2F16 = MRI->getVRegDef(Reg: CV->getSourceReg(I: 0));
5651 // Based on first element decide which mod we match, neg or abs
5652 unsigned ModOpcode = (ElV2F16->getOpcode() == AMDGPU::G_FNEG)
5653 ? AMDGPU::G_FNEG
5654 : AMDGPU::G_FABS;
5655
5656 for (unsigned i = 0; i < CV->getNumSources(); ++i) {
5657 ElV2F16 = MRI->getVRegDef(Reg: CV->getSourceReg(I: i));
5658 if (ElV2F16->getOpcode() != ModOpcode)
5659 break;
5660 EltsV2F16.push_back(Elt: ElV2F16->getOperand(i: 1).getReg());
5661 }
5662
5663 // All elements had ModOpcode modifier
5664 if (CV->getNumSources() == EltsV2F16.size()) {
5665 MachineIRBuilder B(*Root.getParent());
5666 selectWMMAModsNegAbs(ModOpcode, Mods, Elts&: EltsV2F16, Src, InsertPt: Root.getParent(),
5667 MRI&: *MRI);
5668 }
5669 }
5670
5671 return {{[=](MachineInstrBuilder &MIB) { MIB.addReg(RegNo: Src); },
5672 [=](MachineInstrBuilder &MIB) { MIB.addImm(Val: Mods); }}};
5673}
5674
5675InstructionSelector::ComplexRendererFns
5676AMDGPUInstructionSelector::selectWMMAVISrc(MachineOperand &Root) const {
5677 std::optional<FPValueAndVReg> FPValReg;
5678 if (mi_match(R: Root.getReg(), MRI: *MRI, P: m_GFCstOrSplat(FPValReg))) {
5679 if (TII.isInlineConstant(Imm: FPValReg->Value)) {
5680 return {{[=](MachineInstrBuilder &MIB) {
5681 MIB.addImm(Val: FPValReg->Value.bitcastToAPInt().getSExtValue());
5682 }}};
5683 }
5684 // Non-inlineable splat floats should not fall-through for integer immediate
5685 // checks.
5686 return {};
5687 }
5688
5689 APInt ICst;
5690 if (mi_match(R: Root.getReg(), MRI: *MRI, P: m_ICstOrSplat(Cst&: ICst))) {
5691 if (TII.isInlineConstant(Imm: ICst)) {
5692 return {
5693 {[=](MachineInstrBuilder &MIB) { MIB.addImm(Val: ICst.getSExtValue()); }}};
5694 }
5695 }
5696
5697 return {};
5698}
5699
5700InstructionSelector::ComplexRendererFns
5701AMDGPUInstructionSelector::selectSWMMACIndex8(MachineOperand &Root) const {
5702 Register Src =
5703 getDefIgnoringCopies(Reg: Root.getReg(), MRI: *MRI)->getOperand(i: 0).getReg();
5704 unsigned Key = 0;
5705
5706 Register ShiftSrc;
5707 std::optional<ValueAndVReg> ShiftAmt;
5708 if (mi_match(R: Src, MRI: *MRI, P: m_GLShr(L: m_Reg(R&: ShiftSrc), R: m_GCst(ValReg&: ShiftAmt))) &&
5709 MRI->getType(Reg: ShiftSrc).getSizeInBits() == 32 &&
5710 ShiftAmt->Value.getZExtValue() % 8 == 0) {
5711 Key = ShiftAmt->Value.getZExtValue() / 8;
5712 Src = ShiftSrc;
5713 }
5714
5715 return {{
5716 [=](MachineInstrBuilder &MIB) { MIB.addReg(RegNo: Src); },
5717 [=](MachineInstrBuilder &MIB) { MIB.addImm(Val: Key); } // index_key
5718 }};
5719}
5720
5721InstructionSelector::ComplexRendererFns
5722AMDGPUInstructionSelector::selectSWMMACIndex16(MachineOperand &Root) const {
5723
5724 Register Src =
5725 getDefIgnoringCopies(Reg: Root.getReg(), MRI: *MRI)->getOperand(i: 0).getReg();
5726 unsigned Key = 0;
5727
5728 Register ShiftSrc;
5729 std::optional<ValueAndVReg> ShiftAmt;
5730 if (mi_match(R: Src, MRI: *MRI, P: m_GLShr(L: m_Reg(R&: ShiftSrc), R: m_GCst(ValReg&: ShiftAmt))) &&
5731 MRI->getType(Reg: ShiftSrc).getSizeInBits() == 32 &&
5732 ShiftAmt->Value.getZExtValue() == 16) {
5733 Src = ShiftSrc;
5734 Key = 1;
5735 }
5736
5737 return {{
5738 [=](MachineInstrBuilder &MIB) { MIB.addReg(RegNo: Src); },
5739 [=](MachineInstrBuilder &MIB) { MIB.addImm(Val: Key); } // index_key
5740 }};
5741}
5742
5743InstructionSelector::ComplexRendererFns
5744AMDGPUInstructionSelector::selectSWMMACIndex32(MachineOperand &Root) const {
5745 Register Src =
5746 getDefIgnoringCopies(Reg: Root.getReg(), MRI: *MRI)->getOperand(i: 0).getReg();
5747 unsigned Key = 0;
5748
5749 Register S32 = matchZeroExtendFromS32(Reg: Src);
5750 if (!S32)
5751 S32 = matchAnyExtendFromS32(Reg: Src);
5752
5753 if (S32) {
5754 const MachineInstr *Def = getDefIgnoringCopies(Reg: S32, MRI: *MRI);
5755 if (Def->getOpcode() == TargetOpcode::G_UNMERGE_VALUES) {
5756 assert(Def->getNumOperands() == 3);
5757 Register DstReg1 = Def->getOperand(i: 1).getReg();
5758 if (mi_match(R: S32, MRI: *MRI,
5759 P: m_any_of(preds: m_SpecificReg(RequestedReg: DstReg1), preds: m_Copy(Src: m_Reg(R&: DstReg1))))) {
5760 Src = Def->getOperand(i: 2).getReg();
5761 Key = 1;
5762 }
5763 }
5764 }
5765
5766 return {{
5767 [=](MachineInstrBuilder &MIB) { MIB.addReg(RegNo: Src); },
5768 [=](MachineInstrBuilder &MIB) { MIB.addImm(Val: Key); } // index_key
5769 }};
5770}
5771
5772InstructionSelector::ComplexRendererFns
5773AMDGPUInstructionSelector::selectVOP3OpSelMods(MachineOperand &Root) const {
5774 Register Src;
5775 unsigned Mods;
5776 std::tie(args&: Src, args&: Mods) = selectVOP3ModsImpl(Src: Root.getReg());
5777
5778 // FIXME: Handle op_sel
5779 return {{
5780 [=](MachineInstrBuilder &MIB) { MIB.addReg(RegNo: Src); },
5781 [=](MachineInstrBuilder &MIB) { MIB.addImm(Val: Mods); } // src_mods
5782 }};
5783}
5784
5785// FIXME-TRUE16 remove when fake16 is removed
5786InstructionSelector::ComplexRendererFns
5787AMDGPUInstructionSelector::selectVINTERPMods(MachineOperand &Root) const {
5788 Register Src;
5789 unsigned Mods;
5790 std::tie(args&: Src, args&: Mods) = selectVOP3ModsImpl(Src: Root.getReg(),
5791 /*IsCanonicalizing=*/true,
5792 /*AllowAbs=*/false,
5793 /*OpSel=*/false);
5794
5795 return {{
5796 [=](MachineInstrBuilder &MIB) {
5797 MIB.addReg(
5798 RegNo: copyToVGPRIfSrcFolded(Src, Mods, Root, InsertPt: MIB, /* ForceVGPR */ true));
5799 },
5800 [=](MachineInstrBuilder &MIB) { MIB.addImm(Val: Mods); }, // src0_mods
5801 }};
5802}
5803
5804InstructionSelector::ComplexRendererFns
5805AMDGPUInstructionSelector::selectVINTERPModsHi(MachineOperand &Root) const {
5806 Register Src;
5807 unsigned Mods;
5808 std::tie(args&: Src, args&: Mods) = selectVOP3ModsImpl(Src: Root.getReg(),
5809 /*IsCanonicalizing=*/true,
5810 /*AllowAbs=*/false,
5811 /*OpSel=*/true);
5812
5813 return {{
5814 [=](MachineInstrBuilder &MIB) {
5815 MIB.addReg(
5816 RegNo: copyToVGPRIfSrcFolded(Src, Mods, Root, InsertPt: MIB, /* ForceVGPR */ true));
5817 },
5818 [=](MachineInstrBuilder &MIB) { MIB.addImm(Val: Mods); }, // src0_mods
5819 }};
5820}
5821
5822// Given \p Offset and load specified by the \p Root operand check if \p Offset
5823// is a multiple of the load byte size. If it is update \p Offset to a
5824// pre-scaled value and return true.
5825bool AMDGPUInstructionSelector::selectScaleOffset(MachineOperand &Root,
5826 Register &Offset,
5827 bool IsSigned) const {
5828 if (!Subtarget->hasScaleOffset())
5829 return false;
5830
5831 const MachineInstr &MI = *Root.getParent();
5832 MachineMemOperand *MMO = *MI.memoperands_begin();
5833
5834 if (!MMO->getSize().hasValue())
5835 return false;
5836
5837 uint64_t Size = MMO->getSize().getValue();
5838
5839 Register OffsetReg = matchExtendFromS32OrS32(Reg: Offset, IsSigned);
5840 if (!OffsetReg)
5841 OffsetReg = Offset;
5842
5843 if (auto Def = getDefSrcRegIgnoringCopies(Reg: OffsetReg, MRI: *MRI))
5844 OffsetReg = Def->Reg;
5845
5846 Register Op0;
5847 MachineInstr *Mul;
5848 bool ScaleOffset =
5849 (isPowerOf2_64(Value: Size) &&
5850 mi_match(R: OffsetReg, MRI: *MRI,
5851 P: m_GShl(L: m_Reg(R&: Op0),
5852 R: m_any_of(preds: m_SpecificICst(RequestedValue: Log2_64(Value: Size)),
5853 preds: m_Copy(Src: m_SpecificICst(RequestedValue: Log2_64(Value: Size))))))) ||
5854 mi_match(R: OffsetReg, MRI: *MRI,
5855 P: m_GMul(L: m_Reg(R&: Op0), R: m_any_of(preds: m_SpecificICst(RequestedValue: Size),
5856 preds: m_Copy(Src: m_SpecificICst(RequestedValue: Size))))) ||
5857 mi_match(
5858 R: OffsetReg, MRI: *MRI,
5859 P: m_BinOp(Opcode: IsSigned ? AMDGPU::S_MUL_I64_I32_PSEUDO : AMDGPU::S_MUL_U64,
5860 L: m_Reg(R&: Op0), R: m_SpecificICst(RequestedValue: Size))) ||
5861 // Match G_AMDGPU_MAD_U64_U32 offset, c, 0
5862 (mi_match(R: OffsetReg, MRI: *MRI, P: m_MInstr(MI&: Mul)) &&
5863 (Mul->getOpcode() == (IsSigned ? AMDGPU::G_AMDGPU_MAD_I64_I32
5864 : AMDGPU::G_AMDGPU_MAD_U64_U32) ||
5865 (IsSigned && Mul->getOpcode() == AMDGPU::G_AMDGPU_MAD_U64_U32 &&
5866 VT->signBitIsZero(Op: Mul->getOperand(i: 2).getReg()))) &&
5867 mi_match(R: Mul->getOperand(i: 4).getReg(), MRI: *MRI, P: m_ZeroInt()) &&
5868 mi_match(R: Mul->getOperand(i: 3).getReg(), MRI: *MRI,
5869 P: m_GTrunc(Src: m_any_of(preds: m_SpecificICst(RequestedValue: Size),
5870 preds: m_Copy(Src: m_SpecificICst(RequestedValue: Size))))) &&
5871 mi_match(R: Mul->getOperand(i: 2).getReg(), MRI: *MRI, P: m_Reg(R&: Op0)));
5872
5873 if (ScaleOffset)
5874 Offset = Op0;
5875
5876 return ScaleOffset;
5877}
5878
5879bool AMDGPUInstructionSelector::selectSmrdOffset(MachineOperand &Root,
5880 Register &Base,
5881 Register *SOffset,
5882 int64_t *Offset,
5883 bool *ScaleOffset) const {
5884 MachineInstr *MI = Root.getParent();
5885 MachineBasicBlock *MBB = MI->getParent();
5886
5887 // FIXME: We should shrink the GEP if the offset is known to be <= 32-bits,
5888 // then we can select all ptr + 32-bit offsets.
5889 SmallVector<GEPInfo, 4> AddrInfo;
5890 getAddrModeInfo(Load: *MI, MRI: *MRI, AddrInfo);
5891
5892 if (AddrInfo.empty())
5893 return false;
5894
5895 const GEPInfo &GEPI = AddrInfo[0];
5896 std::optional<int64_t> EncodedImm;
5897
5898 if (ScaleOffset)
5899 *ScaleOffset = false;
5900
5901 if (SOffset && Offset) {
5902 EncodedImm = AMDGPU::getSMRDEncodedOffset(ST: STI, ByteOffset: GEPI.Imm, /*IsBuffer=*/false,
5903 /*HasSOffset=*/true);
5904 if (GEPI.SgprParts.size() == 1 && GEPI.Imm != 0 && EncodedImm &&
5905 AddrInfo.size() > 1) {
5906 const GEPInfo &GEPI2 = AddrInfo[1];
5907 if (GEPI2.SgprParts.size() == 2 && GEPI2.Imm == 0) {
5908 Register OffsetReg = GEPI2.SgprParts[1];
5909 if (ScaleOffset)
5910 *ScaleOffset =
5911 selectScaleOffset(Root, Offset&: OffsetReg, IsSigned: false /* IsSigned */);
5912 OffsetReg = matchZeroExtendFromS32OrS32(Reg: OffsetReg);
5913 if (OffsetReg) {
5914 Base = GEPI2.SgprParts[0];
5915 *SOffset = OffsetReg;
5916 *Offset = *EncodedImm;
5917 if (*Offset >= 0 || !AMDGPU::hasSMRDSignedImmOffset(ST: STI))
5918 return true;
5919
5920 // For unbuffered smem loads, it is illegal for the Immediate Offset
5921 // to be negative if the resulting (Offset + (M0 or SOffset or zero)
5922 // is negative. Handle the case where the Immediate Offset + SOffset
5923 // is negative.
5924 auto SKnown = VT->getKnownBits(R: *SOffset);
5925 if (*Offset + SKnown.getMinValue().getSExtValue() < 0)
5926 return false;
5927
5928 return true;
5929 }
5930 }
5931 }
5932 return false;
5933 }
5934
5935 EncodedImm = AMDGPU::getSMRDEncodedOffset(ST: STI, ByteOffset: GEPI.Imm, /*IsBuffer=*/false,
5936 /*HasSOffset=*/false);
5937 if (Offset && GEPI.SgprParts.size() == 1 && EncodedImm) {
5938 Base = GEPI.SgprParts[0];
5939 *Offset = *EncodedImm;
5940 return true;
5941 }
5942
5943 // SGPR offset is unsigned.
5944 if (SOffset && GEPI.SgprParts.size() == 1 && isUInt<32>(x: GEPI.Imm) &&
5945 GEPI.Imm != 0) {
5946 // If we make it this far we have a load with an 32-bit immediate offset.
5947 // It is OK to select this using a sgpr offset, because we have already
5948 // failed trying to select this load into one of the _IMM variants since
5949 // the _IMM Patterns are considered before the _SGPR patterns.
5950 Base = GEPI.SgprParts[0];
5951 *SOffset = MRI->createVirtualRegister(RegClass: &AMDGPU::SReg_32RegClass);
5952 BuildMI(BB&: *MBB, I: MI, MIMD: MI->getDebugLoc(), MCID: TII.get(Opcode: AMDGPU::S_MOV_B32), DestReg: *SOffset)
5953 .addImm(Val: GEPI.Imm);
5954 return true;
5955 }
5956
5957 if (SOffset && GEPI.SgprParts.size() && GEPI.Imm == 0) {
5958 Register OffsetReg = GEPI.SgprParts[1];
5959 if (ScaleOffset)
5960 *ScaleOffset = selectScaleOffset(Root, Offset&: OffsetReg, IsSigned: false /* IsSigned */);
5961 OffsetReg = matchZeroExtendFromS32OrS32(Reg: OffsetReg);
5962 if (OffsetReg) {
5963 Base = GEPI.SgprParts[0];
5964 *SOffset = OffsetReg;
5965 return true;
5966 }
5967 }
5968
5969 return false;
5970}
5971
5972InstructionSelector::ComplexRendererFns
5973AMDGPUInstructionSelector::selectSmrdImm(MachineOperand &Root) const {
5974 Register Base;
5975 int64_t Offset;
5976 if (!selectSmrdOffset(Root, Base, /* SOffset= */ nullptr, Offset: &Offset,
5977 /* ScaleOffset */ nullptr))
5978 return std::nullopt;
5979
5980 return {{[=](MachineInstrBuilder &MIB) { MIB.addReg(RegNo: Base); },
5981 [=](MachineInstrBuilder &MIB) { MIB.addImm(Val: Offset); }}};
5982}
5983
5984InstructionSelector::ComplexRendererFns
5985AMDGPUInstructionSelector::selectSmrdImm32(MachineOperand &Root) const {
5986 SmallVector<GEPInfo, 4> AddrInfo;
5987 getAddrModeInfo(Load: *Root.getParent(), MRI: *MRI, AddrInfo);
5988
5989 if (AddrInfo.empty() || AddrInfo[0].SgprParts.size() != 1)
5990 return std::nullopt;
5991
5992 const GEPInfo &GEPInfo = AddrInfo[0];
5993 Register PtrReg = GEPInfo.SgprParts[0];
5994 std::optional<int64_t> EncodedImm =
5995 AMDGPU::getSMRDEncodedLiteralOffset32(ST: STI, ByteOffset: GEPInfo.Imm);
5996 if (!EncodedImm)
5997 return std::nullopt;
5998
5999 return {{
6000 [=](MachineInstrBuilder &MIB) { MIB.addReg(RegNo: PtrReg); },
6001 [=](MachineInstrBuilder &MIB) { MIB.addImm(Val: *EncodedImm); }
6002 }};
6003}
6004
6005InstructionSelector::ComplexRendererFns
6006AMDGPUInstructionSelector::selectSmrdSgpr(MachineOperand &Root) const {
6007 Register Base, SOffset;
6008 bool ScaleOffset;
6009 if (!selectSmrdOffset(Root, Base, SOffset: &SOffset, /* Offset= */ nullptr,
6010 ScaleOffset: &ScaleOffset))
6011 return std::nullopt;
6012
6013 unsigned CPol = ScaleOffset ? AMDGPU::CPol::SCAL : 0;
6014 return {{[=](MachineInstrBuilder &MIB) { MIB.addReg(RegNo: Base); },
6015 [=](MachineInstrBuilder &MIB) { MIB.addReg(RegNo: SOffset); },
6016 [=](MachineInstrBuilder &MIB) { MIB.addImm(Val: CPol); }}};
6017}
6018
6019InstructionSelector::ComplexRendererFns
6020AMDGPUInstructionSelector::selectSmrdSgprImm(MachineOperand &Root) const {
6021 Register Base, SOffset;
6022 int64_t Offset;
6023 bool ScaleOffset;
6024 if (!selectSmrdOffset(Root, Base, SOffset: &SOffset, Offset: &Offset, ScaleOffset: &ScaleOffset))
6025 return std::nullopt;
6026
6027 unsigned CPol = ScaleOffset ? AMDGPU::CPol::SCAL : 0;
6028 return {{[=](MachineInstrBuilder &MIB) { MIB.addReg(RegNo: Base); },
6029 [=](MachineInstrBuilder &MIB) { MIB.addReg(RegNo: SOffset); },
6030 [=](MachineInstrBuilder &MIB) { MIB.addImm(Val: Offset); },
6031 [=](MachineInstrBuilder &MIB) { MIB.addImm(Val: CPol); }}};
6032}
6033
6034std::pair<Register, int> AMDGPUInstructionSelector::selectFlatOffsetImpl(
6035 MachineOperand &Root, AMDGPU::FlatAddrSpace FlatVariant) const {
6036 MachineInstr *MI = Root.getParent();
6037
6038 auto Default = std::pair(Root.getReg(), 0);
6039
6040 if (!STI.hasFlatInstOffsets())
6041 return Default;
6042
6043 Register PtrBase;
6044 int64_t ConstOffset;
6045 bool IsInBounds;
6046 std::tie(args&: PtrBase, args&: ConstOffset, args&: IsInBounds) =
6047 getPtrBaseWithConstantOffset(Root: Root.getReg(), MRI: *MRI);
6048
6049 // Adding the offset to the base address with an immediate in a FLAT
6050 // instruction must not change the memory aperture in which the address falls.
6051 // Therefore we can only fold offsets from inbounds GEPs into FLAT
6052 // instructions.
6053 if (ConstOffset == 0 ||
6054 (FlatVariant == AMDGPU::FlatAddrSpace::FlatScratch &&
6055 !isFlatScratchBaseLegal(Addr: Root.getReg())) ||
6056 (FlatVariant == AMDGPU::FlatAddrSpace::FLAT && !IsInBounds))
6057 return Default;
6058
6059 unsigned AddrSpace = (*MI->memoperands_begin())->getAddrSpace();
6060 if (!TII.isLegalFLATOffset(Offset: ConstOffset, AddrSpace, FlatVariant))
6061 return Default;
6062
6063 return std::pair(PtrBase, ConstOffset);
6064}
6065
6066InstructionSelector::ComplexRendererFns
6067AMDGPUInstructionSelector::selectFlatOffset(MachineOperand &Root) const {
6068 auto PtrWithOffset = selectFlatOffsetImpl(Root, FlatVariant: AMDGPU::FlatAddrSpace::FLAT);
6069
6070 return {{
6071 [=](MachineInstrBuilder &MIB) { MIB.addReg(RegNo: PtrWithOffset.first); },
6072 [=](MachineInstrBuilder &MIB) { MIB.addImm(Val: PtrWithOffset.second); },
6073 }};
6074}
6075
6076InstructionSelector::ComplexRendererFns
6077AMDGPUInstructionSelector::selectGlobalOffset(MachineOperand &Root) const {
6078 auto PtrWithOffset =
6079 selectFlatOffsetImpl(Root, FlatVariant: AMDGPU::FlatAddrSpace::FlatGlobal);
6080
6081 return {{
6082 [=](MachineInstrBuilder &MIB) { MIB.addReg(RegNo: PtrWithOffset.first); },
6083 [=](MachineInstrBuilder &MIB) { MIB.addImm(Val: PtrWithOffset.second); },
6084 }};
6085}
6086
6087InstructionSelector::ComplexRendererFns
6088AMDGPUInstructionSelector::selectScratchOffset(MachineOperand &Root) const {
6089 auto PtrWithOffset =
6090 selectFlatOffsetImpl(Root, FlatVariant: AMDGPU::FlatAddrSpace::FlatScratch);
6091
6092 return {{
6093 [=](MachineInstrBuilder &MIB) { MIB.addReg(RegNo: PtrWithOffset.first); },
6094 [=](MachineInstrBuilder &MIB) { MIB.addImm(Val: PtrWithOffset.second); },
6095 }};
6096}
6097
6098// Match (64-bit SGPR base) + (zext vgpr offset) + sext(imm offset)
6099InstructionSelector::ComplexRendererFns
6100AMDGPUInstructionSelector::selectGlobalSAddr(MachineOperand &Root,
6101 unsigned CPolBits,
6102 bool NeedIOffset) const {
6103 Register Addr = Root.getReg();
6104 Register PtrBase;
6105 int64_t ConstOffset;
6106 int64_t ImmOffset = 0;
6107
6108 // Match the immediate offset first, which canonically is moved as low as
6109 // possible.
6110 std::tie(args&: PtrBase, args&: ConstOffset, args: std::ignore) =
6111 getPtrBaseWithConstantOffset(Root: Addr, MRI: *MRI);
6112
6113 if (ConstOffset != 0) {
6114 if (NeedIOffset &&
6115 TII.isLegalFLATOffset(Offset: ConstOffset, AddrSpace: AMDGPUAS::GLOBAL_ADDRESS,
6116 FlatVariant: AMDGPU::FlatAddrSpace::FlatGlobal)) {
6117 Addr = PtrBase;
6118 ImmOffset = ConstOffset;
6119 } else {
6120 auto PtrBaseDef = getDefSrcRegIgnoringCopies(Reg: PtrBase, MRI: *MRI);
6121 if (isSGPR(Reg: PtrBaseDef->Reg)) {
6122 if (ConstOffset > 0) {
6123 // Offset is too large.
6124 //
6125 // saddr + large_offset -> saddr +
6126 // (voffset = large_offset & ~MaxOffset) +
6127 // (large_offset & MaxOffset);
6128 int64_t SplitImmOffset = 0, RemainderOffset = ConstOffset;
6129 if (NeedIOffset) {
6130 std::tie(args&: SplitImmOffset, args&: RemainderOffset) =
6131 TII.splitFlatOffset(COffsetVal: ConstOffset, AddrSpace: AMDGPUAS::GLOBAL_ADDRESS,
6132 FlatVariant: AMDGPU::FlatAddrSpace::FlatGlobal);
6133 }
6134
6135 if (Subtarget->hasSignedGVSOffset() ? isInt<32>(x: RemainderOffset)
6136 : isUInt<32>(x: RemainderOffset)) {
6137 MachineInstr *MI = Root.getParent();
6138 MachineBasicBlock *MBB = MI->getParent();
6139 Register HighBits =
6140 MRI->createVirtualRegister(RegClass: &AMDGPU::VGPR_32RegClass);
6141
6142 BuildMI(BB&: *MBB, I: MI, MIMD: MI->getDebugLoc(), MCID: TII.get(Opcode: AMDGPU::V_MOV_B32_e32),
6143 DestReg: HighBits)
6144 .addImm(Val: RemainderOffset);
6145
6146 if (NeedIOffset)
6147 return {{
6148 [=](MachineInstrBuilder &MIB) {
6149 MIB.addReg(RegNo: PtrBase);
6150 }, // saddr
6151 [=](MachineInstrBuilder &MIB) {
6152 MIB.addReg(RegNo: HighBits);
6153 }, // voffset
6154 [=](MachineInstrBuilder &MIB) { MIB.addImm(Val: SplitImmOffset); },
6155 [=](MachineInstrBuilder &MIB) { MIB.addImm(Val: CPolBits); },
6156 }};
6157 return {{
6158 [=](MachineInstrBuilder &MIB) { MIB.addReg(RegNo: PtrBase); }, // saddr
6159 [=](MachineInstrBuilder &MIB) {
6160 MIB.addReg(RegNo: HighBits);
6161 }, // voffset
6162 [=](MachineInstrBuilder &MIB) { MIB.addImm(Val: CPolBits); },
6163 }};
6164 }
6165 }
6166
6167 // We are adding a 64 bit SGPR and a constant. If constant bus limit
6168 // is 1 we would need to perform 1 or 2 extra moves for each half of
6169 // the constant and it is better to do a scalar add and then issue a
6170 // single VALU instruction to materialize zero. Otherwise it is less
6171 // instructions to perform VALU adds with immediates or inline literals.
6172 unsigned NumLiterals =
6173 !TII.isInlineConstant(Imm: APInt(32, Lo_32(Value: ConstOffset))) +
6174 !TII.isInlineConstant(Imm: APInt(32, Hi_32(Value: ConstOffset)));
6175 if (STI.getConstantBusLimit(Opcode: AMDGPU::V_ADD_U32_e64) > NumLiterals)
6176 return std::nullopt;
6177 }
6178 }
6179 }
6180
6181 // Match the variable offset.
6182 auto AddrDef = getDefSrcRegIgnoringCopies(Reg: Addr, MRI: *MRI);
6183 if (AddrDef->MI->getOpcode() == AMDGPU::G_PTR_ADD) {
6184 // Look through the SGPR->VGPR copy.
6185 Register SAddr =
6186 getSrcRegIgnoringCopies(Reg: AddrDef->MI->getOperand(i: 1).getReg(), MRI: *MRI);
6187
6188 if (isSGPR(Reg: SAddr)) {
6189 Register PtrBaseOffset = AddrDef->MI->getOperand(i: 2).getReg();
6190
6191 // It's possible voffset is an SGPR here, but the copy to VGPR will be
6192 // inserted later.
6193 bool ScaleOffset = selectScaleOffset(Root, Offset&: PtrBaseOffset,
6194 IsSigned: Subtarget->hasSignedGVSOffset());
6195 if (Register VOffset = matchExtendFromS32OrS32(
6196 Reg: PtrBaseOffset, IsSigned: Subtarget->hasSignedGVSOffset())) {
6197 if (NeedIOffset)
6198 return {{[=](MachineInstrBuilder &MIB) { // saddr
6199 MIB.addReg(RegNo: SAddr);
6200 },
6201 [=](MachineInstrBuilder &MIB) { // voffset
6202 MIB.addReg(RegNo: VOffset);
6203 },
6204 [=](MachineInstrBuilder &MIB) { // offset
6205 MIB.addImm(Val: ImmOffset);
6206 },
6207 [=](MachineInstrBuilder &MIB) { // cpol
6208 MIB.addImm(Val: CPolBits |
6209 (ScaleOffset ? AMDGPU::CPol::SCAL : 0));
6210 }}};
6211 return {{[=](MachineInstrBuilder &MIB) { // saddr
6212 MIB.addReg(RegNo: SAddr);
6213 },
6214 [=](MachineInstrBuilder &MIB) { // voffset
6215 MIB.addReg(RegNo: VOffset);
6216 },
6217 [=](MachineInstrBuilder &MIB) { // cpol
6218 MIB.addImm(Val: CPolBits |
6219 (ScaleOffset ? AMDGPU::CPol::SCAL : 0));
6220 }}};
6221 }
6222 }
6223 }
6224
6225 // FIXME: We should probably have folded COPY (G_IMPLICIT_DEF) earlier, and
6226 // drop this.
6227 if (AddrDef->MI->getOpcode() == AMDGPU::G_IMPLICIT_DEF ||
6228 AddrDef->MI->getOpcode() == AMDGPU::G_CONSTANT || !isSGPR(Reg: AddrDef->Reg))
6229 return std::nullopt;
6230
6231 // It's cheaper to materialize a single 32-bit zero for vaddr than the two
6232 // moves required to copy a 64-bit SGPR to VGPR.
6233 MachineInstr *MI = Root.getParent();
6234 MachineBasicBlock *MBB = MI->getParent();
6235 Register VOffset = MRI->createVirtualRegister(RegClass: &AMDGPU::VGPR_32RegClass);
6236
6237 BuildMI(BB&: *MBB, I: MI, MIMD: MI->getDebugLoc(), MCID: TII.get(Opcode: AMDGPU::V_MOV_B32_e32), DestReg: VOffset)
6238 .addImm(Val: 0);
6239
6240 if (NeedIOffset)
6241 return {{
6242 [=](MachineInstrBuilder &MIB) { MIB.addReg(RegNo: AddrDef->Reg); }, // saddr
6243 [=](MachineInstrBuilder &MIB) { MIB.addReg(RegNo: VOffset); }, // voffset
6244 [=](MachineInstrBuilder &MIB) { MIB.addImm(Val: ImmOffset); }, // offset
6245 [=](MachineInstrBuilder &MIB) { MIB.addImm(Val: CPolBits); } // cpol
6246 }};
6247 return {{
6248 [=](MachineInstrBuilder &MIB) { MIB.addReg(RegNo: AddrDef->Reg); }, // saddr
6249 [=](MachineInstrBuilder &MIB) { MIB.addReg(RegNo: VOffset); }, // voffset
6250 [=](MachineInstrBuilder &MIB) { MIB.addImm(Val: CPolBits); } // cpol
6251 }};
6252}
6253
6254InstructionSelector::ComplexRendererFns
6255AMDGPUInstructionSelector::selectGlobalSAddr(MachineOperand &Root) const {
6256 return selectGlobalSAddr(Root, CPolBits: 0);
6257}
6258
6259InstructionSelector::ComplexRendererFns
6260AMDGPUInstructionSelector::selectGlobalSAddrCPol(MachineOperand &Root) const {
6261 const MachineInstr &I = *Root.getParent();
6262
6263 // We are assuming CPol is always the last operand of the intrinsic.
6264 auto PassedCPol =
6265 I.getOperand(i: I.getNumOperands() - 1).getImm() & ~AMDGPU::CPol::SCAL;
6266 return selectGlobalSAddr(Root, CPolBits: PassedCPol);
6267}
6268
6269InstructionSelector::ComplexRendererFns
6270AMDGPUInstructionSelector::selectGlobalSAddrCPolM0(MachineOperand &Root) const {
6271 const MachineInstr &I = *Root.getParent();
6272
6273 // We are assuming CPol is second from last operand of the intrinsic.
6274 auto PassedCPol =
6275 I.getOperand(i: I.getNumOperands() - 2).getImm() & ~AMDGPU::CPol::SCAL;
6276 return selectGlobalSAddr(Root, CPolBits: PassedCPol);
6277}
6278
6279InstructionSelector::ComplexRendererFns
6280AMDGPUInstructionSelector::selectGlobalSAddrGLC(MachineOperand &Root) const {
6281 return selectGlobalSAddr(Root, CPolBits: AMDGPU::CPol::GLC);
6282}
6283
6284InstructionSelector::ComplexRendererFns
6285AMDGPUInstructionSelector::selectGlobalSAddrNoIOffset(
6286 MachineOperand &Root) const {
6287 const MachineInstr &I = *Root.getParent();
6288
6289 // We are assuming CPol is always the last operand of the intrinsic.
6290 auto PassedCPol =
6291 I.getOperand(i: I.getNumOperands() - 1).getImm() & ~AMDGPU::CPol::SCAL;
6292 return selectGlobalSAddr(Root, CPolBits: PassedCPol, NeedIOffset: false);
6293}
6294
6295InstructionSelector::ComplexRendererFns
6296AMDGPUInstructionSelector::selectGlobalSAddrNoIOffsetM0(
6297 MachineOperand &Root) const {
6298 const MachineInstr &I = *Root.getParent();
6299
6300 // We are assuming CPol is second from last operand of the intrinsic.
6301 auto PassedCPol =
6302 I.getOperand(i: I.getNumOperands() - 2).getImm() & ~AMDGPU::CPol::SCAL;
6303 return selectGlobalSAddr(Root, CPolBits: PassedCPol, NeedIOffset: false);
6304}
6305
6306InstructionSelector::ComplexRendererFns
6307AMDGPUInstructionSelector::selectScratchSAddr(MachineOperand &Root) const {
6308 Register Addr = Root.getReg();
6309 Register PtrBase;
6310 int64_t ConstOffset;
6311 int64_t ImmOffset = 0;
6312
6313 // Match the immediate offset first, which canonically is moved as low as
6314 // possible.
6315 std::tie(args&: PtrBase, args&: ConstOffset, args: std::ignore) =
6316 getPtrBaseWithConstantOffset(Root: Addr, MRI: *MRI);
6317
6318 if (ConstOffset != 0 && isFlatScratchBaseLegal(Addr) &&
6319 TII.isLegalFLATOffset(Offset: ConstOffset, AddrSpace: AMDGPUAS::PRIVATE_ADDRESS,
6320 FlatVariant: AMDGPU::FlatAddrSpace::FlatScratch)) {
6321 Addr = PtrBase;
6322 ImmOffset = ConstOffset;
6323 }
6324
6325 auto AddrDef = getDefSrcRegIgnoringCopies(Reg: Addr, MRI: *MRI);
6326 if (AddrDef->MI->getOpcode() == AMDGPU::G_FRAME_INDEX) {
6327 int FI = AddrDef->MI->getOperand(i: 1).getIndex();
6328 return {{
6329 [=](MachineInstrBuilder &MIB) { MIB.addFrameIndex(Idx: FI); }, // saddr
6330 [=](MachineInstrBuilder &MIB) { MIB.addImm(Val: ImmOffset); } // offset
6331 }};
6332 }
6333
6334 Register SAddr = AddrDef->Reg;
6335
6336 if (AddrDef->MI->getOpcode() == AMDGPU::G_PTR_ADD) {
6337 Register LHS = AddrDef->MI->getOperand(i: 1).getReg();
6338 Register RHS = AddrDef->MI->getOperand(i: 2).getReg();
6339 auto LHSDef = getDefSrcRegIgnoringCopies(Reg: LHS, MRI: *MRI);
6340 auto RHSDef = getDefSrcRegIgnoringCopies(Reg: RHS, MRI: *MRI);
6341
6342 if (LHSDef->MI->getOpcode() == AMDGPU::G_FRAME_INDEX &&
6343 isSGPR(Reg: RHSDef->Reg)) {
6344 int FI = LHSDef->MI->getOperand(i: 1).getIndex();
6345 MachineInstr &I = *Root.getParent();
6346 MachineBasicBlock *BB = I.getParent();
6347 const DebugLoc &DL = I.getDebugLoc();
6348 SAddr = MRI->createVirtualRegister(RegClass: &AMDGPU::SReg_32RegClass);
6349
6350 BuildMI(BB&: *BB, I: &I, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::S_ADD_I32), DestReg: SAddr)
6351 .addFrameIndex(Idx: FI)
6352 .addReg(RegNo: RHSDef->Reg)
6353 .setOperandDead(3); // Dead scc
6354 }
6355 }
6356
6357 if (!isSGPR(Reg: SAddr))
6358 return std::nullopt;
6359
6360 return {{
6361 [=](MachineInstrBuilder &MIB) { MIB.addReg(RegNo: SAddr); }, // saddr
6362 [=](MachineInstrBuilder &MIB) { MIB.addImm(Val: ImmOffset); } // offset
6363 }};
6364}
6365
6366// Check whether the flat scratch SVS swizzle bug affects this access.
6367bool AMDGPUInstructionSelector::checkFlatScratchSVSSwizzleBug(
6368 Register VAddr, Register SAddr, uint64_t ImmOffset) const {
6369 if (!Subtarget->hasFlatScratchSVSSwizzleBug())
6370 return false;
6371
6372 // The bug affects the swizzling of SVS accesses if there is any carry out
6373 // from the two low order bits (i.e. from bit 1 into bit 2) when adding
6374 // voffset to (soffset + inst_offset).
6375 auto VKnown = VT->getKnownBits(R: VAddr);
6376 auto SKnown = KnownBits::add(LHS: VT->getKnownBits(R: SAddr),
6377 RHS: KnownBits::makeConstant(C: APInt(32, ImmOffset)));
6378 uint64_t VMax = VKnown.getMaxValue().getZExtValue();
6379 uint64_t SMax = SKnown.getMaxValue().getZExtValue();
6380 return (VMax & 3) + (SMax & 3) >= 4;
6381}
6382
6383InstructionSelector::ComplexRendererFns
6384AMDGPUInstructionSelector::selectScratchSVAddr(MachineOperand &Root) const {
6385 Register Addr = Root.getReg();
6386 Register PtrBase;
6387 int64_t ConstOffset;
6388 int64_t ImmOffset = 0;
6389
6390 // Match the immediate offset first, which canonically is moved as low as
6391 // possible.
6392 std::tie(args&: PtrBase, args&: ConstOffset, args: std::ignore) =
6393 getPtrBaseWithConstantOffset(Root: Addr, MRI: *MRI);
6394
6395 Register OrigAddr = Addr;
6396 if (ConstOffset != 0 &&
6397 TII.isLegalFLATOffset(Offset: ConstOffset, AddrSpace: AMDGPUAS::PRIVATE_ADDRESS,
6398 FlatVariant: AMDGPU::FlatAddrSpace::FlatScratch)) {
6399 Addr = PtrBase;
6400 ImmOffset = ConstOffset;
6401 }
6402
6403 auto AddrDef = getDefSrcRegIgnoringCopies(Reg: Addr, MRI: *MRI);
6404 if (AddrDef->MI->getOpcode() != AMDGPU::G_PTR_ADD)
6405 return std::nullopt;
6406
6407 Register RHS = AddrDef->MI->getOperand(i: 2).getReg();
6408 if (RBI.getRegBank(Reg: RHS, MRI: *MRI, TRI)->getID() != AMDGPU::VGPRRegBankID)
6409 return std::nullopt;
6410
6411 Register LHS = AddrDef->MI->getOperand(i: 1).getReg();
6412 auto LHSDef = getDefSrcRegIgnoringCopies(Reg: LHS, MRI: *MRI);
6413
6414 if (OrigAddr != Addr) {
6415 if (!isFlatScratchBaseLegalSVImm(Addr: OrigAddr))
6416 return std::nullopt;
6417 } else {
6418 if (!isFlatScratchBaseLegalSV(Addr: OrigAddr))
6419 return std::nullopt;
6420 }
6421
6422 if (checkFlatScratchSVSSwizzleBug(VAddr: RHS, SAddr: LHS, ImmOffset))
6423 return std::nullopt;
6424
6425 unsigned CPol = selectScaleOffset(Root, Offset&: RHS, IsSigned: true /* IsSigned */)
6426 ? AMDGPU::CPol::SCAL
6427 : 0;
6428
6429 if (LHSDef->MI->getOpcode() == AMDGPU::G_FRAME_INDEX) {
6430 int FI = LHSDef->MI->getOperand(i: 1).getIndex();
6431 return {{
6432 [=](MachineInstrBuilder &MIB) { MIB.addReg(RegNo: RHS); }, // vaddr
6433 [=](MachineInstrBuilder &MIB) { MIB.addFrameIndex(Idx: FI); }, // saddr
6434 [=](MachineInstrBuilder &MIB) { MIB.addImm(Val: ImmOffset); }, // offset
6435 [=](MachineInstrBuilder &MIB) { MIB.addImm(Val: CPol); } // cpol
6436 }};
6437 }
6438
6439 if (!isSGPR(Reg: LHS))
6440 if (auto Def = getDefSrcRegIgnoringCopies(Reg: LHS, MRI: *MRI))
6441 LHS = Def->Reg;
6442
6443 if (!isSGPR(Reg: LHS))
6444 return std::nullopt;
6445
6446 return {{
6447 [=](MachineInstrBuilder &MIB) { MIB.addReg(RegNo: RHS); }, // vaddr
6448 [=](MachineInstrBuilder &MIB) { MIB.addReg(RegNo: LHS); }, // saddr
6449 [=](MachineInstrBuilder &MIB) { MIB.addImm(Val: ImmOffset); }, // offset
6450 [=](MachineInstrBuilder &MIB) { MIB.addImm(Val: CPol); } // cpol
6451 }};
6452}
6453
6454InstructionSelector::ComplexRendererFns
6455AMDGPUInstructionSelector::selectMUBUFScratchOffen(MachineOperand &Root) const {
6456 MachineInstr *MI = Root.getParent();
6457 MachineBasicBlock *MBB = MI->getParent();
6458 MachineFunction *MF = MBB->getParent();
6459 const SIMachineFunctionInfo *Info = MF->getInfo<SIMachineFunctionInfo>();
6460
6461 int64_t Offset = 0;
6462 if (mi_match(R: Root.getReg(), MRI: *MRI, P: m_ICst(Cst&: Offset)) &&
6463 Offset != AMDGPU::getNullPointerValue(AS: AMDGPUAS::PRIVATE_ADDRESS)) {
6464 Register HighBits = MRI->createVirtualRegister(RegClass: &AMDGPU::VGPR_32RegClass);
6465
6466 // TODO: Should this be inside the render function? The iterator seems to
6467 // move.
6468 const uint32_t MaxOffset = SIInstrInfo::getMaxMUBUFImmOffset(ST: *Subtarget);
6469 BuildMI(BB&: *MBB, I: MI, MIMD: MI->getDebugLoc(), MCID: TII.get(Opcode: AMDGPU::V_MOV_B32_e32),
6470 DestReg: HighBits)
6471 .addImm(Val: Offset & ~MaxOffset);
6472
6473 return {{[=](MachineInstrBuilder &MIB) { // rsrc
6474 MIB.addReg(RegNo: Info->getScratchRSrcReg());
6475 },
6476 [=](MachineInstrBuilder &MIB) { // vaddr
6477 MIB.addReg(RegNo: HighBits);
6478 },
6479 [=](MachineInstrBuilder &MIB) { // soffset
6480 // Use constant zero for soffset and rely on eliminateFrameIndex
6481 // to choose the appropriate frame register if need be.
6482 MIB.addImm(Val: 0);
6483 },
6484 [=](MachineInstrBuilder &MIB) { // offset
6485 MIB.addImm(Val: Offset & MaxOffset);
6486 }}};
6487 }
6488
6489 assert(Offset == 0 || Offset == -1);
6490
6491 // Try to fold a frame index directly into the MUBUF vaddr field, and any
6492 // offsets.
6493 std::optional<int> FI;
6494 Register VAddr = Root.getReg();
6495
6496 const MachineInstr *RootDef = MRI->getVRegDef(Reg: Root.getReg());
6497 Register PtrBase;
6498 int64_t ConstOffset;
6499 std::tie(args&: PtrBase, args&: ConstOffset, args: std::ignore) =
6500 getPtrBaseWithConstantOffset(Root: VAddr, MRI: *MRI);
6501 if (ConstOffset != 0) {
6502 if (TII.isLegalMUBUFImmOffset(Imm: ConstOffset) &&
6503 (!STI.privateMemoryResourceIsRangeChecked() ||
6504 VT->signBitIsZero(Op: PtrBase))) {
6505 const MachineInstr *PtrBaseDef = MRI->getVRegDef(Reg: PtrBase);
6506 if (PtrBaseDef->getOpcode() == AMDGPU::G_FRAME_INDEX)
6507 FI = PtrBaseDef->getOperand(i: 1).getIndex();
6508 else
6509 VAddr = PtrBase;
6510 Offset = ConstOffset;
6511 }
6512 } else if (RootDef->getOpcode() == AMDGPU::G_FRAME_INDEX) {
6513 FI = RootDef->getOperand(i: 1).getIndex();
6514 }
6515
6516 return {{[=](MachineInstrBuilder &MIB) { // rsrc
6517 MIB.addReg(RegNo: Info->getScratchRSrcReg());
6518 },
6519 [=](MachineInstrBuilder &MIB) { // vaddr
6520 if (FI)
6521 MIB.addFrameIndex(Idx: *FI);
6522 else
6523 MIB.addReg(RegNo: VAddr);
6524 },
6525 [=](MachineInstrBuilder &MIB) { // soffset
6526 // Use constant zero for soffset and rely on eliminateFrameIndex
6527 // to choose the appropriate frame register if need be.
6528 MIB.addImm(Val: 0);
6529 },
6530 [=](MachineInstrBuilder &MIB) { // offset
6531 MIB.addImm(Val: Offset);
6532 }}};
6533}
6534
6535bool AMDGPUInstructionSelector::isDSOffsetLegal(Register Base,
6536 int64_t Offset) const {
6537 if (!isUInt<16>(x: Offset))
6538 return false;
6539
6540 if (STI.hasUsableDSOffset() || STI.unsafeDSOffsetFoldingEnabled())
6541 return true;
6542
6543 // On Southern Islands instruction with a negative base value and an offset
6544 // don't seem to work.
6545 return VT->signBitIsZero(Op: Base);
6546}
6547
6548bool AMDGPUInstructionSelector::isDSOffset2Legal(Register Base, int64_t Offset0,
6549 int64_t Offset1,
6550 unsigned Size) const {
6551 if (Offset0 % Size != 0 || Offset1 % Size != 0)
6552 return false;
6553 if (!isUInt<8>(x: Offset0 / Size) || !isUInt<8>(x: Offset1 / Size))
6554 return false;
6555
6556 if (STI.hasUsableDSOffset() || STI.unsafeDSOffsetFoldingEnabled())
6557 return true;
6558
6559 // On Southern Islands instruction with a negative base value and an offset
6560 // don't seem to work.
6561 return VT->signBitIsZero(Op: Base);
6562}
6563
6564// Return whether the operation has NoUnsignedWrap property.
6565static bool isNoUnsignedWrap(MachineInstr *Addr) {
6566 return Addr->getOpcode() == TargetOpcode::G_OR ||
6567 (Addr->getOpcode() == TargetOpcode::G_PTR_ADD &&
6568 Addr->getFlag(Flag: MachineInstr::NoUWrap));
6569}
6570
6571// Check that the base address of flat scratch load/store in the form of `base +
6572// offset` is legal to be put in SGPR/VGPR (i.e. unsigned per hardware
6573// requirement). We always treat the first operand as the base address here.
6574bool AMDGPUInstructionSelector::isFlatScratchBaseLegal(Register Addr) const {
6575 MachineInstr *AddrMI = getDefIgnoringCopies(Reg: Addr, MRI: *MRI);
6576
6577 if (isNoUnsignedWrap(Addr: AddrMI))
6578 return true;
6579
6580 // Starting with GFX12, VADDR and SADDR fields in VSCRATCH can use negative
6581 // values.
6582 if (STI.hasSignedScratchOffsets())
6583 return true;
6584
6585 Register LHS = AddrMI->getOperand(i: 1).getReg();
6586 Register RHS = AddrMI->getOperand(i: 2).getReg();
6587
6588 if (AddrMI->getOpcode() == TargetOpcode::G_PTR_ADD) {
6589 std::optional<ValueAndVReg> RhsValReg =
6590 getIConstantVRegValWithLookThrough(VReg: RHS, MRI: *MRI);
6591 // If the immediate offset is negative and within certain range, the base
6592 // address cannot also be negative. If the base is also negative, the sum
6593 // would be either negative or much larger than the valid range of scratch
6594 // memory a thread can access.
6595 if (RhsValReg && RhsValReg->Value.getSExtValue() < 0 &&
6596 RhsValReg->Value.getSExtValue() > -0x40000000)
6597 return true;
6598 }
6599
6600 return VT->signBitIsZero(Op: LHS);
6601}
6602
6603// Check address value in SGPR/VGPR are legal for flat scratch in the form
6604// of: SGPR + VGPR.
6605bool AMDGPUInstructionSelector::isFlatScratchBaseLegalSV(Register Addr) const {
6606 MachineInstr *AddrMI = getDefIgnoringCopies(Reg: Addr, MRI: *MRI);
6607
6608 if (isNoUnsignedWrap(Addr: AddrMI))
6609 return true;
6610
6611 // Starting with GFX12, VADDR and SADDR fields in VSCRATCH can use negative
6612 // values.
6613 if (STI.hasSignedScratchOffsets())
6614 return true;
6615
6616 Register LHS = AddrMI->getOperand(i: 1).getReg();
6617 Register RHS = AddrMI->getOperand(i: 2).getReg();
6618 return VT->signBitIsZero(Op: RHS) && VT->signBitIsZero(Op: LHS);
6619}
6620
6621// Check address value in SGPR/VGPR are legal for flat scratch in the form
6622// of: SGPR + VGPR + Imm.
6623bool AMDGPUInstructionSelector::isFlatScratchBaseLegalSVImm(
6624 Register Addr) const {
6625 // Starting with GFX12, VADDR and SADDR fields in VSCRATCH can use negative
6626 // values.
6627 if (STI.hasSignedScratchOffsets())
6628 return true;
6629
6630 MachineInstr *AddrMI = getDefIgnoringCopies(Reg: Addr, MRI: *MRI);
6631 Register Base = AddrMI->getOperand(i: 1).getReg();
6632 std::optional<DefinitionAndSourceRegister> BaseDef =
6633 getDefSrcRegIgnoringCopies(Reg: Base, MRI: *MRI);
6634 std::optional<ValueAndVReg> RHSOffset =
6635 getIConstantVRegValWithLookThrough(VReg: AddrMI->getOperand(i: 2).getReg(), MRI: *MRI);
6636 assert(RHSOffset);
6637
6638 // If the immediate offset is negative and within certain range, the base
6639 // address cannot also be negative. If the base is also negative, the sum
6640 // would be either negative or much larger than the valid range of scratch
6641 // memory a thread can access.
6642 if (isNoUnsignedWrap(Addr: BaseDef->MI) &&
6643 (isNoUnsignedWrap(Addr: AddrMI) ||
6644 (RHSOffset->Value.getSExtValue() < 0 &&
6645 RHSOffset->Value.getSExtValue() > -0x40000000)))
6646 return true;
6647
6648 Register LHS = BaseDef->MI->getOperand(i: 1).getReg();
6649 Register RHS = BaseDef->MI->getOperand(i: 2).getReg();
6650 return VT->signBitIsZero(Op: RHS) && VT->signBitIsZero(Op: LHS);
6651}
6652
6653bool AMDGPUInstructionSelector::isUnneededShiftMask(const MachineInstr &MI,
6654 unsigned ShAmtBits) const {
6655 assert(MI.getOpcode() == TargetOpcode::G_AND);
6656
6657 std::optional<APInt> RHS =
6658 getIConstantVRegVal(VReg: MI.getOperand(i: 2).getReg(), MRI: *MRI);
6659 if (!RHS)
6660 return false;
6661
6662 if (RHS->countr_one() >= ShAmtBits)
6663 return true;
6664
6665 const APInt &LHSKnownZeros = VT->getKnownZeroes(R: MI.getOperand(i: 1).getReg());
6666 return (LHSKnownZeros | *RHS).countr_one() >= ShAmtBits;
6667}
6668
6669InstructionSelector::ComplexRendererFns
6670AMDGPUInstructionSelector::selectMUBUFScratchOffset(
6671 MachineOperand &Root) const {
6672 Register Reg = Root.getReg();
6673 const SIMachineFunctionInfo *Info = MF->getInfo<SIMachineFunctionInfo>();
6674
6675 std::optional<DefinitionAndSourceRegister> Def =
6676 getDefSrcRegIgnoringCopies(Reg, MRI: *MRI);
6677 assert(Def && "this shouldn't be an optional result");
6678 Reg = Def->Reg;
6679
6680 if (Register WaveBase = getWaveAddress(Def: Def->MI)) {
6681 return {{
6682 [=](MachineInstrBuilder &MIB) { // rsrc
6683 MIB.addReg(RegNo: Info->getScratchRSrcReg());
6684 },
6685 [=](MachineInstrBuilder &MIB) { // soffset
6686 MIB.addReg(RegNo: WaveBase);
6687 },
6688 [=](MachineInstrBuilder &MIB) { MIB.addImm(Val: 0); } // offset
6689 }};
6690 }
6691
6692 int64_t Offset = 0;
6693
6694 // FIXME: Copy check is a hack
6695 Register BasePtr;
6696 if (mi_match(R: Reg, MRI: *MRI,
6697 P: m_GPtrAdd(L: m_Reg(R&: BasePtr),
6698 R: m_any_of(preds: m_ICst(Cst&: Offset), preds: m_Copy(Src: m_ICst(Cst&: Offset)))))) {
6699 if (!TII.isLegalMUBUFImmOffset(Imm: Offset))
6700 return {};
6701 MachineInstr *BasePtrDef = getDefIgnoringCopies(Reg: BasePtr, MRI: *MRI);
6702 Register WaveBase = getWaveAddress(Def: BasePtrDef);
6703 if (!WaveBase)
6704 return {};
6705
6706 return {{
6707 [=](MachineInstrBuilder &MIB) { // rsrc
6708 MIB.addReg(RegNo: Info->getScratchRSrcReg());
6709 },
6710 [=](MachineInstrBuilder &MIB) { // soffset
6711 MIB.addReg(RegNo: WaveBase);
6712 },
6713 [=](MachineInstrBuilder &MIB) { MIB.addImm(Val: Offset); } // offset
6714 }};
6715 }
6716
6717 if (!mi_match(R: Root.getReg(), MRI: *MRI, P: m_ICst(Cst&: Offset)) ||
6718 !TII.isLegalMUBUFImmOffset(Imm: Offset))
6719 return {};
6720
6721 return {{
6722 [=](MachineInstrBuilder &MIB) { // rsrc
6723 MIB.addReg(RegNo: Info->getScratchRSrcReg());
6724 },
6725 [=](MachineInstrBuilder &MIB) { // soffset
6726 MIB.addImm(Val: 0);
6727 },
6728 [=](MachineInstrBuilder &MIB) { MIB.addImm(Val: Offset); } // offset
6729 }};
6730}
6731
6732std::pair<Register, unsigned>
6733AMDGPUInstructionSelector::selectDS1Addr1OffsetImpl(MachineOperand &Root) const {
6734 const MachineInstr *RootDef = MRI->getVRegDef(Reg: Root.getReg());
6735 int64_t ConstAddr = 0;
6736
6737 Register PtrBase;
6738 int64_t Offset;
6739 std::tie(args&: PtrBase, args&: Offset, args: std::ignore) =
6740 getPtrBaseWithConstantOffset(Root: Root.getReg(), MRI: *MRI);
6741
6742 if (Offset) {
6743 if (isDSOffsetLegal(Base: PtrBase, Offset)) {
6744 // (add n0, c0)
6745 return std::pair(PtrBase, Offset);
6746 }
6747 } else if (RootDef->getOpcode() == AMDGPU::G_SUB) {
6748 // TODO
6749
6750
6751 } else if (mi_match(R: Root.getReg(), MRI: *MRI, P: m_ICst(Cst&: ConstAddr))) {
6752 // TODO
6753
6754 }
6755
6756 return std::pair(Root.getReg(), 0);
6757}
6758
6759InstructionSelector::ComplexRendererFns
6760AMDGPUInstructionSelector::selectDS1Addr1Offset(MachineOperand &Root) const {
6761 Register Reg;
6762 unsigned Offset;
6763 std::tie(args&: Reg, args&: Offset) = selectDS1Addr1OffsetImpl(Root);
6764 return {{
6765 [=](MachineInstrBuilder &MIB) { MIB.addReg(RegNo: Reg); },
6766 [=](MachineInstrBuilder &MIB) { MIB.addImm(Val: Offset); }
6767 }};
6768}
6769
6770InstructionSelector::ComplexRendererFns
6771AMDGPUInstructionSelector::selectDS64Bit4ByteAligned(MachineOperand &Root) const {
6772 return selectDSReadWrite2(Root, size: 4);
6773}
6774
6775InstructionSelector::ComplexRendererFns
6776AMDGPUInstructionSelector::selectDS128Bit8ByteAligned(MachineOperand &Root) const {
6777 return selectDSReadWrite2(Root, size: 8);
6778}
6779
6780InstructionSelector::ComplexRendererFns
6781AMDGPUInstructionSelector::selectDSReadWrite2(MachineOperand &Root,
6782 unsigned Size) const {
6783 Register Reg;
6784 unsigned Offset;
6785 std::tie(args&: Reg, args&: Offset) = selectDSReadWrite2Impl(Root, size: Size);
6786 return {{
6787 [=](MachineInstrBuilder &MIB) { MIB.addReg(RegNo: Reg); },
6788 [=](MachineInstrBuilder &MIB) { MIB.addImm(Val: Offset); },
6789 [=](MachineInstrBuilder &MIB) { MIB.addImm(Val: Offset+1); }
6790 }};
6791}
6792
6793std::pair<Register, unsigned>
6794AMDGPUInstructionSelector::selectDSReadWrite2Impl(MachineOperand &Root,
6795 unsigned Size) const {
6796 const MachineInstr *RootDef = MRI->getVRegDef(Reg: Root.getReg());
6797 int64_t ConstAddr = 0;
6798
6799 Register PtrBase;
6800 int64_t Offset;
6801 std::tie(args&: PtrBase, args&: Offset, args: std::ignore) =
6802 getPtrBaseWithConstantOffset(Root: Root.getReg(), MRI: *MRI);
6803
6804 if (Offset) {
6805 int64_t OffsetValue0 = Offset;
6806 int64_t OffsetValue1 = Offset + Size;
6807 if (isDSOffset2Legal(Base: PtrBase, Offset0: OffsetValue0, Offset1: OffsetValue1, Size)) {
6808 // (add n0, c0)
6809 return std::pair(PtrBase, OffsetValue0 / Size);
6810 }
6811 } else if (RootDef->getOpcode() == AMDGPU::G_SUB) {
6812 // TODO
6813
6814 } else if (mi_match(R: Root.getReg(), MRI: *MRI, P: m_ICst(Cst&: ConstAddr))) {
6815 // TODO
6816
6817 }
6818
6819 return std::pair(Root.getReg(), 0);
6820}
6821
6822/// If \p Root is a G_PTR_ADD with a G_CONSTANT on the right hand side, return
6823/// the base value with the constant offset, and if the offset computation is
6824/// known to be inbounds. There may be intervening copies between \p Root and
6825/// the identified constant. Returns \p Root, 0, false if this does not match
6826/// the pattern.
6827std::tuple<Register, int64_t, bool>
6828AMDGPUInstructionSelector::getPtrBaseWithConstantOffset(
6829 Register Root, const MachineRegisterInfo &MRI) const {
6830 MachineInstr *RootI = getDefIgnoringCopies(Reg: Root, MRI);
6831 if (RootI->getOpcode() != TargetOpcode::G_PTR_ADD)
6832 return {Root, 0, false};
6833
6834 MachineOperand &RHS = RootI->getOperand(i: 2);
6835 std::optional<ValueAndVReg> MaybeOffset =
6836 getIConstantVRegValWithLookThrough(VReg: RHS.getReg(), MRI);
6837 if (!MaybeOffset)
6838 return {Root, 0, false};
6839 bool IsInBounds = RootI->getFlag(Flag: MachineInstr::MIFlag::InBounds);
6840 return {RootI->getOperand(i: 1).getReg(), MaybeOffset->Value.getSExtValue(),
6841 IsInBounds};
6842}
6843
6844static void addZeroImm(MachineInstrBuilder &MIB) {
6845 MIB.addImm(Val: 0);
6846}
6847
6848/// Return a resource descriptor for use with an arbitrary 64-bit pointer. If \p
6849/// BasePtr is not valid, a null base pointer will be used.
6850static Register buildRSRC(MachineIRBuilder &B, MachineRegisterInfo &MRI,
6851 uint32_t FormatLo, uint32_t FormatHi,
6852 Register BasePtr) {
6853 Register RSrc2 = MRI.createVirtualRegister(RegClass: &AMDGPU::SReg_32RegClass);
6854 Register RSrc3 = MRI.createVirtualRegister(RegClass: &AMDGPU::SReg_32RegClass);
6855 Register RSrcHi = MRI.createVirtualRegister(RegClass: &AMDGPU::SReg_64RegClass);
6856 Register RSrc = MRI.createVirtualRegister(RegClass: &AMDGPU::SGPR_128RegClass);
6857
6858 B.buildInstr(Opcode: AMDGPU::S_MOV_B32)
6859 .addDef(RegNo: RSrc2)
6860 .addImm(Val: FormatLo);
6861 B.buildInstr(Opcode: AMDGPU::S_MOV_B32)
6862 .addDef(RegNo: RSrc3)
6863 .addImm(Val: FormatHi);
6864
6865 // Build the half of the subregister with the constants before building the
6866 // full 128-bit register. If we are building multiple resource descriptors,
6867 // this will allow CSEing of the 2-component register.
6868 B.buildInstr(Opcode: AMDGPU::REG_SEQUENCE)
6869 .addDef(RegNo: RSrcHi)
6870 .addReg(RegNo: RSrc2)
6871 .addImm(Val: AMDGPU::sub0)
6872 .addReg(RegNo: RSrc3)
6873 .addImm(Val: AMDGPU::sub1);
6874
6875 Register RSrcLo = BasePtr;
6876 if (!BasePtr) {
6877 RSrcLo = MRI.createVirtualRegister(RegClass: &AMDGPU::SReg_64RegClass);
6878 B.buildInstr(Opcode: AMDGPU::S_MOV_B64)
6879 .addDef(RegNo: RSrcLo)
6880 .addImm(Val: 0);
6881 }
6882
6883 B.buildInstr(Opcode: AMDGPU::REG_SEQUENCE)
6884 .addDef(RegNo: RSrc)
6885 .addReg(RegNo: RSrcLo)
6886 .addImm(Val: AMDGPU::sub0_sub1)
6887 .addReg(RegNo: RSrcHi)
6888 .addImm(Val: AMDGPU::sub2_sub3);
6889
6890 return RSrc;
6891}
6892
6893static Register buildAddr64RSrc(MachineIRBuilder &B, MachineRegisterInfo &MRI,
6894 const SIInstrInfo &TII, Register BasePtr) {
6895 uint64_t DefaultFormat = TII.getDefaultRsrcDataFormat();
6896
6897 // FIXME: Why are half the "default" bits ignored based on the addressing
6898 // mode?
6899 return buildRSRC(B, MRI, FormatLo: 0, FormatHi: Hi_32(Value: DefaultFormat), BasePtr);
6900}
6901
6902static Register buildOffsetSrc(MachineIRBuilder &B, MachineRegisterInfo &MRI,
6903 const SIInstrInfo &TII, Register BasePtr) {
6904 uint64_t DefaultFormat = TII.getDefaultRsrcDataFormat();
6905
6906 // FIXME: Why are half the "default" bits ignored based on the addressing
6907 // mode?
6908 return buildRSRC(B, MRI, FormatLo: -1, FormatHi: Hi_32(Value: DefaultFormat), BasePtr);
6909}
6910
6911AMDGPUInstructionSelector::MUBUFAddressData
6912AMDGPUInstructionSelector::parseMUBUFAddress(Register Src) const {
6913 MUBUFAddressData Data;
6914 Data.N0 = Src;
6915
6916 Register PtrBase;
6917 int64_t Offset;
6918
6919 std::tie(args&: PtrBase, args&: Offset, args: std::ignore) =
6920 getPtrBaseWithConstantOffset(Root: Src, MRI: *MRI);
6921 if (isUInt<32>(x: Offset)) {
6922 Data.N0 = PtrBase;
6923 Data.Offset = Offset;
6924 }
6925
6926 if (MachineInstr *InputAdd
6927 = getOpcodeDef(Opcode: TargetOpcode::G_PTR_ADD, Reg: Data.N0, MRI: *MRI)) {
6928 Data.N2 = InputAdd->getOperand(i: 1).getReg();
6929 Data.N3 = InputAdd->getOperand(i: 2).getReg();
6930
6931 // FIXME: Need to fix extra SGPR->VGPRcopies inserted
6932 // FIXME: Don't know this was defined by operand 0
6933 //
6934 // TODO: Remove this when we have copy folding optimizations after
6935 // RegBankSelect.
6936 Data.N2 = getDefIgnoringCopies(Reg: Data.N2, MRI: *MRI)->getOperand(i: 0).getReg();
6937 Data.N3 = getDefIgnoringCopies(Reg: Data.N3, MRI: *MRI)->getOperand(i: 0).getReg();
6938 }
6939
6940 return Data;
6941}
6942
6943/// Return if the addr64 mubuf mode should be used for the given address.
6944bool AMDGPUInstructionSelector::shouldUseAddr64(MUBUFAddressData Addr) const {
6945 // (ptr_add N2, N3) -> addr64, or
6946 // (ptr_add (ptr_add N2, N3), C1) -> addr64
6947 if (Addr.N2)
6948 return true;
6949
6950 const RegisterBank *N0Bank = RBI.getRegBank(Reg: Addr.N0, MRI: *MRI, TRI);
6951 return N0Bank->getID() == AMDGPU::VGPRRegBankID;
6952}
6953
6954/// Split an immediate offset \p ImmOffset depending on whether it fits in the
6955/// immediate field. Modifies \p ImmOffset and sets \p SOffset to the variable
6956/// component.
6957void AMDGPUInstructionSelector::splitIllegalMUBUFOffset(
6958 MachineIRBuilder &B, Register &SOffset, int64_t &ImmOffset) const {
6959 if (TII.isLegalMUBUFImmOffset(Imm: ImmOffset))
6960 return;
6961
6962 // Illegal offset, store it in soffset.
6963 SOffset = MRI->createVirtualRegister(RegClass: &AMDGPU::SReg_32RegClass);
6964 B.buildInstr(Opcode: AMDGPU::S_MOV_B32)
6965 .addDef(RegNo: SOffset)
6966 .addImm(Val: ImmOffset);
6967 ImmOffset = 0;
6968}
6969
6970bool AMDGPUInstructionSelector::selectMUBUFAddr64Impl(
6971 MachineOperand &Root, Register &VAddr, Register &RSrcReg,
6972 Register &SOffset, int64_t &Offset) const {
6973 // FIXME: Predicates should stop this from reaching here.
6974 // addr64 bit was removed for volcanic islands.
6975 if (!STI.hasAddr64() || STI.useFlatForGlobal())
6976 return false;
6977
6978 MUBUFAddressData AddrData = parseMUBUFAddress(Src: Root.getReg());
6979 if (!shouldUseAddr64(Addr: AddrData))
6980 return false;
6981
6982 Register N0 = AddrData.N0;
6983 Register N2 = AddrData.N2;
6984 Register N3 = AddrData.N3;
6985 Offset = AddrData.Offset;
6986
6987 // Base pointer for the SRD.
6988 Register SRDPtr;
6989
6990 if (N2) {
6991 if (RBI.getRegBank(Reg: N2, MRI: *MRI, TRI)->getID() == AMDGPU::VGPRRegBankID) {
6992 assert(N3);
6993 if (RBI.getRegBank(Reg: N3, MRI: *MRI, TRI)->getID() == AMDGPU::VGPRRegBankID) {
6994 // Both N2 and N3 are divergent. Use N0 (the result of the add) as the
6995 // addr64, and construct the default resource from a 0 address.
6996 VAddr = N0;
6997 } else {
6998 SRDPtr = N3;
6999 VAddr = N2;
7000 }
7001 } else {
7002 // N2 is not divergent.
7003 SRDPtr = N2;
7004 VAddr = N3;
7005 }
7006 } else if (RBI.getRegBank(Reg: N0, MRI: *MRI, TRI)->getID() == AMDGPU::VGPRRegBankID) {
7007 // Use the default null pointer in the resource
7008 VAddr = N0;
7009 } else {
7010 // N0 -> offset, or
7011 // (N0 + C1) -> offset
7012 SRDPtr = N0;
7013 }
7014
7015 MachineIRBuilder B(*Root.getParent());
7016 RSrcReg = buildAddr64RSrc(B, MRI&: *MRI, TII, BasePtr: SRDPtr);
7017 splitIllegalMUBUFOffset(B, SOffset, ImmOffset&: Offset);
7018 return true;
7019}
7020
7021bool AMDGPUInstructionSelector::selectMUBUFOffsetImpl(
7022 MachineOperand &Root, Register &RSrcReg, Register &SOffset,
7023 int64_t &Offset) const {
7024
7025 // FIXME: Pattern should not reach here.
7026 if (STI.useFlatForGlobal())
7027 return false;
7028
7029 MUBUFAddressData AddrData = parseMUBUFAddress(Src: Root.getReg());
7030 if (shouldUseAddr64(Addr: AddrData))
7031 return false;
7032
7033 // N0 -> offset, or
7034 // (N0 + C1) -> offset
7035 Register SRDPtr = AddrData.N0;
7036 Offset = AddrData.Offset;
7037
7038 // TODO: Look through extensions for 32-bit soffset.
7039 MachineIRBuilder B(*Root.getParent());
7040
7041 RSrcReg = buildOffsetSrc(B, MRI&: *MRI, TII, BasePtr: SRDPtr);
7042 splitIllegalMUBUFOffset(B, SOffset, ImmOffset&: Offset);
7043 return true;
7044}
7045
7046InstructionSelector::ComplexRendererFns
7047AMDGPUInstructionSelector::selectMUBUFAddr64(MachineOperand &Root) const {
7048 Register VAddr;
7049 Register RSrcReg;
7050 Register SOffset;
7051 int64_t Offset = 0;
7052
7053 if (!selectMUBUFAddr64Impl(Root, VAddr, RSrcReg, SOffset, Offset))
7054 return {};
7055
7056 // FIXME: Use defaulted operands for trailing 0s and remove from the complex
7057 // pattern.
7058 return {{
7059 [=](MachineInstrBuilder &MIB) { // rsrc
7060 MIB.addReg(RegNo: RSrcReg);
7061 },
7062 [=](MachineInstrBuilder &MIB) { // vaddr
7063 MIB.addReg(RegNo: VAddr);
7064 },
7065 [=](MachineInstrBuilder &MIB) { // soffset
7066 if (SOffset)
7067 MIB.addReg(RegNo: SOffset);
7068 else if (STI.hasRestrictedSOffset())
7069 MIB.addReg(RegNo: AMDGPU::SGPR_NULL);
7070 else
7071 MIB.addImm(Val: 0);
7072 },
7073 [=](MachineInstrBuilder &MIB) { // offset
7074 MIB.addImm(Val: Offset);
7075 },
7076 addZeroImm, // cpol
7077 addZeroImm, // tfe
7078 addZeroImm // swz
7079 }};
7080}
7081
7082InstructionSelector::ComplexRendererFns
7083AMDGPUInstructionSelector::selectMUBUFOffset(MachineOperand &Root) const {
7084 Register RSrcReg;
7085 Register SOffset;
7086 int64_t Offset = 0;
7087
7088 if (!selectMUBUFOffsetImpl(Root, RSrcReg, SOffset, Offset))
7089 return {};
7090
7091 return {{
7092 [=](MachineInstrBuilder &MIB) { // rsrc
7093 MIB.addReg(RegNo: RSrcReg);
7094 },
7095 [=](MachineInstrBuilder &MIB) { // soffset
7096 if (SOffset)
7097 MIB.addReg(RegNo: SOffset);
7098 else if (STI.hasRestrictedSOffset())
7099 MIB.addReg(RegNo: AMDGPU::SGPR_NULL);
7100 else
7101 MIB.addImm(Val: 0);
7102 },
7103 [=](MachineInstrBuilder &MIB) { MIB.addImm(Val: Offset); }, // offset
7104 addZeroImm, // cpol
7105 addZeroImm, // tfe
7106 addZeroImm, // swz
7107 }};
7108}
7109
7110InstructionSelector::ComplexRendererFns
7111AMDGPUInstructionSelector::selectBUFSOffset(MachineOperand &Root) const {
7112
7113 Register SOffset = Root.getReg();
7114
7115 if (STI.hasRestrictedSOffset() && mi_match(R: SOffset, MRI: *MRI, P: m_ZeroInt()))
7116 SOffset = AMDGPU::SGPR_NULL;
7117
7118 return {{[=](MachineInstrBuilder &MIB) { MIB.addReg(RegNo: SOffset); }}};
7119}
7120
7121/// Get an immediate that must be 32-bits, and treated as zero extended.
7122static std::optional<uint64_t>
7123getConstantZext32Val(Register Reg, const MachineRegisterInfo &MRI) {
7124 // getIConstantVRegVal sexts any values, so see if that matters.
7125 std::optional<int64_t> OffsetVal = getIConstantVRegSExtVal(VReg: Reg, MRI);
7126 if (!OffsetVal || !isInt<32>(x: *OffsetVal))
7127 return std::nullopt;
7128 return Lo_32(Value: *OffsetVal);
7129}
7130
7131InstructionSelector::ComplexRendererFns
7132AMDGPUInstructionSelector::selectSMRDBufferImm(MachineOperand &Root) const {
7133 std::optional<uint64_t> OffsetVal =
7134 Root.isImm() ? Root.getImm() : getConstantZext32Val(Reg: Root.getReg(), MRI: *MRI);
7135 if (!OffsetVal)
7136 return {};
7137
7138 std::optional<int64_t> EncodedImm =
7139 AMDGPU::getSMRDEncodedOffset(ST: STI, ByteOffset: *OffsetVal, IsBuffer: true);
7140 if (!EncodedImm)
7141 return {};
7142
7143 return {{ [=](MachineInstrBuilder &MIB) { MIB.addImm(Val: *EncodedImm); } }};
7144}
7145
7146InstructionSelector::ComplexRendererFns
7147AMDGPUInstructionSelector::selectSMRDBufferImm32(MachineOperand &Root) const {
7148 assert(STI.getGeneration() == AMDGPUSubtarget::SEA_ISLANDS);
7149
7150 std::optional<uint64_t> OffsetVal = getConstantZext32Val(Reg: Root.getReg(), MRI: *MRI);
7151 if (!OffsetVal)
7152 return {};
7153
7154 std::optional<int64_t> EncodedImm =
7155 AMDGPU::getSMRDEncodedLiteralOffset32(ST: STI, ByteOffset: *OffsetVal);
7156 if (!EncodedImm)
7157 return {};
7158
7159 return {{ [=](MachineInstrBuilder &MIB) { MIB.addImm(Val: *EncodedImm); } }};
7160}
7161
7162InstructionSelector::ComplexRendererFns
7163AMDGPUInstructionSelector::selectSMRDBufferSgprImm(MachineOperand &Root) const {
7164 // Match the (soffset + offset) pair as a 32-bit register base and
7165 // an immediate offset.
7166 Register SOffset;
7167 unsigned Offset;
7168 std::tie(args&: SOffset, args&: Offset) = AMDGPU::getBaseWithConstantOffset(
7169 MRI&: *MRI, Reg: Root.getReg(), ValueTracking: VT, /*CheckNUW*/ true);
7170 if (!SOffset)
7171 return std::nullopt;
7172
7173 std::optional<int64_t> EncodedOffset =
7174 AMDGPU::getSMRDEncodedOffset(ST: STI, ByteOffset: Offset, /* IsBuffer */ true);
7175 if (!EncodedOffset)
7176 return std::nullopt;
7177
7178 assert(MRI->getType(SOffset) == LLT::scalar(32));
7179 return {{[=](MachineInstrBuilder &MIB) { MIB.addReg(RegNo: SOffset); },
7180 [=](MachineInstrBuilder &MIB) { MIB.addImm(Val: *EncodedOffset); }}};
7181}
7182
7183std::pair<Register, unsigned>
7184AMDGPUInstructionSelector::selectVOP3PMadMixModsImpl(MachineOperand &Root,
7185 bool &Matched) const {
7186 Matched = false;
7187
7188 Register Src;
7189 unsigned Mods;
7190 std::tie(args&: Src, args&: Mods) = selectVOP3ModsImpl(Src: Root.getReg());
7191
7192 if (mi_match(R: Src, MRI: *MRI, P: m_GFPExt(Src: m_Reg(R&: Src)))) {
7193 assert(MRI->getType(Src) == LLT::scalar(16));
7194
7195 // Only change Src if src modifier could be gained. In such cases new Src
7196 // could be sgpr but this does not violate constant bus restriction for
7197 // instruction that is being selected.
7198 Src = stripBitCast(Reg: Src, MRI&: *MRI);
7199
7200 const auto CheckAbsNeg = [&]() {
7201 // Be careful about folding modifiers if we already have an abs. fneg is
7202 // applied last, so we don't want to apply an earlier fneg.
7203 if ((Mods & SISrcMods::ABS) == 0) {
7204 unsigned ModsTmp;
7205 std::tie(args&: Src, args&: ModsTmp) = selectVOP3ModsImpl(Src);
7206
7207 if ((ModsTmp & SISrcMods::NEG) != 0)
7208 Mods ^= SISrcMods::NEG;
7209
7210 if ((ModsTmp & SISrcMods::ABS) != 0)
7211 Mods |= SISrcMods::ABS;
7212 }
7213 };
7214
7215 CheckAbsNeg();
7216
7217 // op_sel/op_sel_hi decide the source type and source.
7218 // If the source's op_sel_hi is set, it indicates to do a conversion from
7219 // fp16. If the sources's op_sel is set, it picks the high half of the
7220 // source register.
7221
7222 Mods |= SISrcMods::OP_SEL_1;
7223
7224 if (isExtractHiElt(MRI&: *MRI, In: Src, Out&: Src)) {
7225 Mods |= SISrcMods::OP_SEL_0;
7226 CheckAbsNeg();
7227 }
7228
7229 Matched = true;
7230 }
7231
7232 return {Src, Mods};
7233}
7234
7235InstructionSelector::ComplexRendererFns
7236AMDGPUInstructionSelector::selectVOP3PMadMixModsExt(
7237 MachineOperand &Root) const {
7238 Register Src;
7239 unsigned Mods;
7240 bool Matched;
7241 std::tie(args&: Src, args&: Mods) = selectVOP3PMadMixModsImpl(Root, Matched);
7242 if (!Matched)
7243 return {};
7244
7245 return {{
7246 [=](MachineInstrBuilder &MIB) { MIB.addReg(RegNo: Src); },
7247 [=](MachineInstrBuilder &MIB) { MIB.addImm(Val: Mods); } // src_mods
7248 }};
7249}
7250
7251InstructionSelector::ComplexRendererFns
7252AMDGPUInstructionSelector::selectVOP3PMadMixMods(MachineOperand &Root) const {
7253 Register Src;
7254 unsigned Mods;
7255 bool Matched;
7256 std::tie(args&: Src, args&: Mods) = selectVOP3PMadMixModsImpl(Root, Matched);
7257
7258 return {{
7259 [=](MachineInstrBuilder &MIB) { MIB.addReg(RegNo: Src); },
7260 [=](MachineInstrBuilder &MIB) { MIB.addImm(Val: Mods); } // src_mods
7261 }};
7262}
7263
7264bool AMDGPUInstructionSelector::selectSBarrierSignalIsfirst(
7265 MachineInstr &I, Intrinsic::ID IntrID) const {
7266 MachineBasicBlock *MBB = I.getParent();
7267 const DebugLoc &DL = I.getDebugLoc();
7268 Register CCReg = I.getOperand(i: 0).getReg();
7269
7270 // Set SCC to true, in case the barrier instruction gets converted to a NOP.
7271 BuildMI(BB&: *MBB, I: &I, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::S_CMP_EQ_U32)).addImm(Val: 0).addImm(Val: 0);
7272
7273 BuildMI(BB&: *MBB, I: &I, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::S_BARRIER_SIGNAL_ISFIRST_IMM))
7274 .addImm(Val: I.getOperand(i: 2).getImm());
7275
7276 BuildMI(BB&: *MBB, I: &I, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::COPY), DestReg: CCReg).addReg(RegNo: AMDGPU::SCC);
7277
7278 I.eraseFromParent();
7279 return RBI.constrainGenericRegister(Reg: CCReg, RC: AMDGPU::SReg_32_XM0_XEXECRegClass,
7280 MRI&: *MRI);
7281}
7282
7283bool AMDGPUInstructionSelector::selectSGetBarrierState(
7284 MachineInstr &I, Intrinsic::ID IntrID) const {
7285 MachineBasicBlock *MBB = I.getParent();
7286 const DebugLoc &DL = I.getDebugLoc();
7287 const MachineOperand &BarOp = I.getOperand(i: 2);
7288 std::optional<int64_t> BarValImm =
7289 getIConstantVRegSExtVal(VReg: BarOp.getReg(), MRI: *MRI);
7290
7291 if (!BarValImm) {
7292 auto CopyMIB = BuildMI(BB&: *MBB, I: &I, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::COPY), DestReg: AMDGPU::M0)
7293 .addReg(RegNo: BarOp.getReg());
7294 constrainSelectedInstRegOperands(I&: *CopyMIB, TII, TRI, RBI);
7295 }
7296 MachineInstrBuilder MIB;
7297 unsigned Opc = BarValImm ? AMDGPU::S_GET_BARRIER_STATE_IMM
7298 : AMDGPU::S_GET_BARRIER_STATE_M0;
7299 MIB = BuildMI(BB&: *MBB, I: &I, MIMD: DL, MCID: TII.get(Opcode: Opc));
7300
7301 auto DstReg = I.getOperand(i: 0).getReg();
7302 const TargetRegisterClass *DstRC =
7303 TRI.getConstrainedRegClassForOperand(MO: I.getOperand(i: 0), MRI: *MRI);
7304 if (!DstRC || !RBI.constrainGenericRegister(Reg: DstReg, RC: *DstRC, MRI&: *MRI))
7305 return false;
7306 MIB.addDef(RegNo: DstReg);
7307 if (BarValImm) {
7308 MIB.addImm(Val: *BarValImm);
7309 }
7310 I.eraseFromParent();
7311 return true;
7312}
7313
7314unsigned getNamedBarrierOp(bool HasInlineConst, Intrinsic::ID IntrID) {
7315 if (HasInlineConst) {
7316 switch (IntrID) {
7317 default:
7318 llvm_unreachable("not a named barrier op");
7319 case Intrinsic::amdgcn_s_barrier_join:
7320 return AMDGPU::S_BARRIER_JOIN_IMM;
7321 case Intrinsic::amdgcn_s_wakeup_barrier:
7322 return AMDGPU::S_WAKEUP_BARRIER_IMM;
7323 case Intrinsic::amdgcn_s_get_named_barrier_state:
7324 return AMDGPU::S_GET_BARRIER_STATE_IMM;
7325 };
7326 } else {
7327 switch (IntrID) {
7328 default:
7329 llvm_unreachable("not a named barrier op");
7330 case Intrinsic::amdgcn_s_barrier_join:
7331 return AMDGPU::S_BARRIER_JOIN_M0;
7332 case Intrinsic::amdgcn_s_wakeup_barrier:
7333 return AMDGPU::S_WAKEUP_BARRIER_M0;
7334 case Intrinsic::amdgcn_s_get_named_barrier_state:
7335 return AMDGPU::S_GET_BARRIER_STATE_M0;
7336 };
7337 }
7338}
7339
7340bool AMDGPUInstructionSelector::selectNamedBarrierInit(
7341 MachineInstr &I, Intrinsic::ID IntrID) const {
7342 MachineBasicBlock *MBB = I.getParent();
7343 const DebugLoc &DL = I.getDebugLoc();
7344 const MachineOperand &BarOp = I.getOperand(i: 1);
7345 const MachineOperand &CntOp = I.getOperand(i: 2);
7346
7347 // A member count of 0 means "keep existing member count". That plus a known
7348 // constant value for the barrier ID lets us use the immarg form.
7349 if (IntrID == Intrinsic::amdgcn_s_barrier_signal_var) {
7350 std::optional<int64_t> CntImm =
7351 getIConstantVRegSExtVal(VReg: CntOp.getReg(), MRI: *MRI);
7352 if (CntImm && *CntImm == 0) {
7353 std::optional<int64_t> BarValImm =
7354 getIConstantVRegSExtVal(VReg: BarOp.getReg(), MRI: *MRI);
7355 if (BarValImm) {
7356 auto BarID = ((*BarValImm) >> 4) & 0x3F;
7357 BuildMI(BB&: *MBB, I: &I, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::S_BARRIER_SIGNAL_IMM))
7358 .addImm(Val: BarID);
7359 I.eraseFromParent();
7360 return true;
7361 }
7362 }
7363 }
7364
7365 // BarID = (BarOp >> 4) & 0x3F
7366 Register TmpReg0 = MRI->createVirtualRegister(RegClass: &AMDGPU::SReg_32RegClass);
7367 BuildMI(BB&: *MBB, I: &I, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::S_LSHR_B32), DestReg: TmpReg0)
7368 .add(MO: BarOp)
7369 .addImm(Val: 4u)
7370 .setOperandDead(3); // Dead scc
7371
7372 Register TmpReg1 = MRI->createVirtualRegister(RegClass: &AMDGPU::SReg_32RegClass);
7373 BuildMI(BB&: *MBB, I: &I, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::S_AND_B32), DestReg: TmpReg1)
7374 .addReg(RegNo: TmpReg0)
7375 .addImm(Val: 0x3F)
7376 .setOperandDead(3); // Dead scc
7377
7378 // MO = ((CntOp & 0x3F) << shAmt) | BarID
7379 Register TmpReg2 = MRI->createVirtualRegister(RegClass: &AMDGPU::SReg_32RegClass);
7380 BuildMI(BB&: *MBB, I: &I, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::S_AND_B32), DestReg: TmpReg2)
7381 .add(MO: CntOp)
7382 .addImm(Val: 0x3F)
7383 .setOperandDead(3); // Dead scc
7384
7385 Register TmpReg3 = MRI->createVirtualRegister(RegClass: &AMDGPU::SReg_32RegClass);
7386 constexpr unsigned ShAmt = 16;
7387 BuildMI(BB&: *MBB, I: &I, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::S_LSHL_B32), DestReg: TmpReg3)
7388 .addReg(RegNo: TmpReg2)
7389 .addImm(Val: ShAmt)
7390 .setOperandDead(3); // Dead scc
7391
7392 Register TmpReg4 = MRI->createVirtualRegister(RegClass: &AMDGPU::SReg_32RegClass);
7393 BuildMI(BB&: *MBB, I: &I, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::S_OR_B32), DestReg: TmpReg4)
7394 .addReg(RegNo: TmpReg1)
7395 .addReg(RegNo: TmpReg3)
7396 .setOperandDead(3); // Dead scc;
7397
7398 auto CopyMIB =
7399 BuildMI(BB&: *MBB, I: &I, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::COPY), DestReg: AMDGPU::M0).addReg(RegNo: TmpReg4);
7400 constrainSelectedInstRegOperands(I&: *CopyMIB, TII, TRI, RBI);
7401
7402 unsigned Opc = IntrID == Intrinsic::amdgcn_s_barrier_init
7403 ? AMDGPU::S_BARRIER_INIT_M0
7404 : AMDGPU::S_BARRIER_SIGNAL_M0;
7405 MachineInstrBuilder MIB;
7406 MIB = BuildMI(BB&: *MBB, I: &I, MIMD: DL, MCID: TII.get(Opcode: Opc));
7407
7408 I.eraseFromParent();
7409 return true;
7410}
7411
7412bool AMDGPUInstructionSelector::selectNamedBarrierInst(
7413 MachineInstr &I, Intrinsic::ID IntrID) const {
7414 MachineBasicBlock *MBB = I.getParent();
7415 const DebugLoc &DL = I.getDebugLoc();
7416 MachineOperand BarOp = IntrID == Intrinsic::amdgcn_s_get_named_barrier_state
7417 ? I.getOperand(i: 2)
7418 : I.getOperand(i: 1);
7419 std::optional<int64_t> BarValImm =
7420 getIConstantVRegSExtVal(VReg: BarOp.getReg(), MRI: *MRI);
7421
7422 if (!BarValImm) {
7423 // BarID = (BarOp >> 4) & 0x3F
7424 Register TmpReg0 = MRI->createVirtualRegister(RegClass: &AMDGPU::SReg_32RegClass);
7425 BuildMI(BB&: *MBB, I: &I, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::S_LSHR_B32), DestReg: TmpReg0)
7426 .addReg(RegNo: BarOp.getReg())
7427 .addImm(Val: 4u)
7428 .setOperandDead(3); // Dead scc;
7429
7430 Register TmpReg1 = MRI->createVirtualRegister(RegClass: &AMDGPU::SReg_32RegClass);
7431 BuildMI(BB&: *MBB, I: &I, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::S_AND_B32), DestReg: TmpReg1)
7432 .addReg(RegNo: TmpReg0)
7433 .addImm(Val: 0x3F)
7434 .setOperandDead(3); // Dead scc;
7435
7436 auto CopyMIB = BuildMI(BB&: *MBB, I: &I, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::COPY), DestReg: AMDGPU::M0)
7437 .addReg(RegNo: TmpReg1);
7438 constrainSelectedInstRegOperands(I&: *CopyMIB, TII, TRI, RBI);
7439 }
7440
7441 MachineInstrBuilder MIB;
7442 unsigned Opc = getNamedBarrierOp(HasInlineConst: BarValImm.has_value(), IntrID);
7443 MIB = BuildMI(BB&: *MBB, I: &I, MIMD: DL, MCID: TII.get(Opcode: Opc));
7444
7445 if (IntrID == Intrinsic::amdgcn_s_get_named_barrier_state) {
7446 auto DstReg = I.getOperand(i: 0).getReg();
7447 const TargetRegisterClass *DstRC =
7448 TRI.getConstrainedRegClassForOperand(MO: I.getOperand(i: 0), MRI: *MRI);
7449 if (!DstRC || !RBI.constrainGenericRegister(Reg: DstReg, RC: *DstRC, MRI&: *MRI))
7450 return false;
7451 MIB.addDef(RegNo: DstReg);
7452 }
7453
7454 if (BarValImm) {
7455 auto BarId = ((*BarValImm) >> 4) & 0x3F;
7456 MIB.addImm(Val: BarId);
7457 }
7458
7459 I.eraseFromParent();
7460 return true;
7461}
7462
7463void AMDGPUInstructionSelector::renderTruncImm32(MachineInstrBuilder &MIB,
7464 const MachineInstr &MI,
7465 int OpIdx) const {
7466 assert(MI.getOpcode() == TargetOpcode::G_CONSTANT && OpIdx == -1 &&
7467 "Expected G_CONSTANT");
7468 MIB.addImm(Val: MI.getOperand(i: 1).getCImm()->getSExtValue());
7469}
7470
7471void AMDGPUInstructionSelector::renderNegateImm(MachineInstrBuilder &MIB,
7472 const MachineInstr &MI,
7473 int OpIdx) const {
7474 assert(MI.getOpcode() == TargetOpcode::G_CONSTANT && OpIdx == -1 &&
7475 "Expected G_CONSTANT");
7476 MIB.addImm(Val: -MI.getOperand(i: 1).getCImm()->getSExtValue());
7477}
7478
7479void AMDGPUInstructionSelector::renderBitcastFPImm(MachineInstrBuilder &MIB,
7480 const MachineInstr &MI,
7481 int OpIdx) const {
7482 const MachineOperand &Op = MI.getOperand(i: 1);
7483 assert(MI.getOpcode() == TargetOpcode::G_FCONSTANT && OpIdx == -1);
7484 MIB.addImm(Val: Op.getFPImm()->getValueAPF().bitcastToAPInt().getZExtValue());
7485}
7486
7487void AMDGPUInstructionSelector::renderCountTrailingOnesImm(
7488 MachineInstrBuilder &MIB, const MachineInstr &MI, int OpIdx) const {
7489 assert(MI.getOpcode() == TargetOpcode::G_CONSTANT && OpIdx == -1 &&
7490 "Expected G_CONSTANT");
7491 MIB.addImm(Val: MI.getOperand(i: 1).getCImm()->getValue().countTrailingOnes());
7492}
7493
7494/// This only really exists to satisfy DAG type checking machinery, so is a
7495/// no-op here.
7496void AMDGPUInstructionSelector::renderTruncTImm(MachineInstrBuilder &MIB,
7497 const MachineInstr &MI,
7498 int OpIdx) const {
7499 const MachineOperand &Op = MI.getOperand(i: OpIdx);
7500 int64_t Imm;
7501 if (Op.isReg() && mi_match(R: Op.getReg(), MRI: *MRI, P: m_ICst(Cst&: Imm)))
7502 MIB.addImm(Val: Imm);
7503 else
7504 MIB.addImm(Val: Op.getImm());
7505}
7506
7507void AMDGPUInstructionSelector::renderZextBoolTImm(MachineInstrBuilder &MIB,
7508 const MachineInstr &MI,
7509 int OpIdx) const {
7510 MIB.addImm(Val: MI.getOperand(i: OpIdx).getImm() != 0);
7511}
7512
7513void AMDGPUInstructionSelector::renderOpSelTImm(MachineInstrBuilder &MIB,
7514 const MachineInstr &MI,
7515 int OpIdx) const {
7516 assert(OpIdx >= 0 && "expected to match an immediate operand");
7517 MIB.addImm(Val: MI.getOperand(i: OpIdx).getImm() ? (int64_t)SISrcMods::OP_SEL_0 : 0);
7518}
7519
7520void AMDGPUInstructionSelector::renderSrcAndDstSelToOpSelXForm_0_0(
7521 MachineInstrBuilder &MIB, const MachineInstr &MI, int OpIdx) const {
7522 assert(OpIdx >= 0 && "expected to match an immediate operand");
7523 MIB.addImm(
7524 Val: (MI.getOperand(i: OpIdx).getImm() & 0x1) ? (int64_t)SISrcMods::OP_SEL_0 : 0);
7525}
7526
7527void AMDGPUInstructionSelector::renderSrcAndDstSelToOpSelXForm_0_1(
7528 MachineInstrBuilder &MIB, const MachineInstr &MI, int OpIdx) const {
7529 assert(OpIdx >= 0 && "expected to match an immediate operand");
7530 MIB.addImm(Val: (MI.getOperand(i: OpIdx).getImm() & 0x1)
7531 ? (int64_t)(SISrcMods::OP_SEL_0 | SISrcMods::DST_OP_SEL)
7532 : (int64_t)SISrcMods::DST_OP_SEL);
7533}
7534
7535void AMDGPUInstructionSelector::renderSrcAndDstSelToOpSelXForm_1_0(
7536 MachineInstrBuilder &MIB, const MachineInstr &MI, int OpIdx) const {
7537 assert(OpIdx >= 0 && "expected to match an immediate operand");
7538 MIB.addImm(
7539 Val: (MI.getOperand(i: OpIdx).getImm() & 0x2) ? (int64_t)SISrcMods::OP_SEL_0 : 0);
7540}
7541
7542void AMDGPUInstructionSelector::renderSrcAndDstSelToOpSelXForm_1_1(
7543 MachineInstrBuilder &MIB, const MachineInstr &MI, int OpIdx) const {
7544 assert(OpIdx >= 0 && "expected to match an immediate operand");
7545 MIB.addImm(Val: (MI.getOperand(i: OpIdx).getImm() & 0x2)
7546 ? (int64_t)(SISrcMods::OP_SEL_0)
7547 : 0);
7548}
7549
7550void AMDGPUInstructionSelector::renderDstSelToOpSelXForm(
7551 MachineInstrBuilder &MIB, const MachineInstr &MI, int OpIdx) const {
7552 assert(OpIdx >= 0 && "expected to match an immediate operand");
7553 MIB.addImm(Val: MI.getOperand(i: OpIdx).getImm() ? (int64_t)(SISrcMods::DST_OP_SEL)
7554 : 0);
7555}
7556
7557void AMDGPUInstructionSelector::renderSrcSelToOpSelXForm(
7558 MachineInstrBuilder &MIB, const MachineInstr &MI, int OpIdx) const {
7559 assert(OpIdx >= 0 && "expected to match an immediate operand");
7560 MIB.addImm(Val: MI.getOperand(i: OpIdx).getImm() ? (int64_t)(SISrcMods::OP_SEL_0)
7561 : 0);
7562}
7563
7564void AMDGPUInstructionSelector::renderSrcAndDstSelToOpSelXForm_2_0(
7565 MachineInstrBuilder &MIB, const MachineInstr &MI, int OpIdx) const {
7566 assert(OpIdx >= 0 && "expected to match an immediate operand");
7567 MIB.addImm(
7568 Val: (MI.getOperand(i: OpIdx).getImm() & 0x1) ? (int64_t)SISrcMods::OP_SEL_0 : 0);
7569}
7570
7571void AMDGPUInstructionSelector::renderDstSelToOpSel3XFormXForm(
7572 MachineInstrBuilder &MIB, const MachineInstr &MI, int OpIdx) const {
7573 assert(OpIdx >= 0 && "expected to match an immediate operand");
7574 MIB.addImm(Val: (MI.getOperand(i: OpIdx).getImm() & 0x2)
7575 ? (int64_t)SISrcMods::DST_OP_SEL
7576 : 0);
7577}
7578
7579void AMDGPUInstructionSelector::renderExtractCPol(MachineInstrBuilder &MIB,
7580 const MachineInstr &MI,
7581 int OpIdx) const {
7582 assert(OpIdx >= 0 && "expected to match an immediate operand");
7583 MIB.addImm(Val: MI.getOperand(i: OpIdx).getImm() &
7584 (AMDGPU::isGFX12Plus(STI) ? AMDGPU::CPol::ALL
7585 : AMDGPU::CPol::ALL_pregfx12));
7586}
7587
7588void AMDGPUInstructionSelector::renderExtractSWZ(MachineInstrBuilder &MIB,
7589 const MachineInstr &MI,
7590 int OpIdx) const {
7591 assert(OpIdx >= 0 && "expected to match an immediate operand");
7592 const bool Swizzle = MI.getOperand(i: OpIdx).getImm() &
7593 (AMDGPU::isGFX12Plus(STI) ? AMDGPU::CPol::SWZ
7594 : AMDGPU::CPol::SWZ_pregfx12);
7595 MIB.addImm(Val: Swizzle);
7596}
7597
7598void AMDGPUInstructionSelector::renderExtractCpolSetGLC(
7599 MachineInstrBuilder &MIB, const MachineInstr &MI, int OpIdx) const {
7600 assert(OpIdx >= 0 && "expected to match an immediate operand");
7601 const uint32_t Cpol = MI.getOperand(i: OpIdx).getImm() &
7602 (AMDGPU::isGFX12Plus(STI) ? AMDGPU::CPol::ALL
7603 : AMDGPU::CPol::ALL_pregfx12);
7604 MIB.addImm(Val: Cpol | AMDGPU::CPol::GLC);
7605}
7606
7607void AMDGPUInstructionSelector::renderFrameIndex(MachineInstrBuilder &MIB,
7608 const MachineInstr &MI,
7609 int OpIdx) const {
7610 MIB.addFrameIndex(Idx: MI.getOperand(i: 1).getIndex());
7611}
7612
7613void AMDGPUInstructionSelector::renderFPPow2ToExponent(MachineInstrBuilder &MIB,
7614 const MachineInstr &MI,
7615 int OpIdx) const {
7616 const APFloat &APF = MI.getOperand(i: 1).getFPImm()->getValueAPF();
7617 int ExpVal = APF.getExactLog2Abs();
7618 assert(ExpVal != INT_MIN);
7619 MIB.addImm(Val: ExpVal);
7620}
7621
7622void AMDGPUInstructionSelector::renderRoundMode(MachineInstrBuilder &MIB,
7623 const MachineInstr &MI,
7624 int OpIdx) const {
7625 // "round.towardzero" -> TowardZero 0 -> FP_ROUND_ROUND_TO_ZERO 3
7626 // "round.tonearest" -> NearestTiesToEven 1 -> FP_ROUND_ROUND_TO_NEAREST 0
7627 // "round.upward" -> TowardPositive 2 -> FP_ROUND_ROUND_TO_INF 1
7628 // "round.downward -> TowardNegative 3 -> FP_ROUND_ROUND_TO_NEGINF 2
7629 MIB.addImm(Val: (MI.getOperand(i: OpIdx).getImm() + 3) % 4);
7630}
7631
7632void AMDGPUInstructionSelector::renderVOP3PModsNeg(MachineInstrBuilder &MIB,
7633 const MachineInstr &MI,
7634 int OpIdx) const {
7635 unsigned Mods = SISrcMods::OP_SEL_1;
7636 if (MI.getOperand(i: OpIdx).getImm())
7637 Mods ^= SISrcMods::NEG;
7638 MIB.addImm(Val: (int64_t)Mods);
7639}
7640
7641void AMDGPUInstructionSelector::renderVOP3PModsNegs(MachineInstrBuilder &MIB,
7642 const MachineInstr &MI,
7643 int OpIdx) const {
7644 unsigned Mods = SISrcMods::OP_SEL_1;
7645 if (MI.getOperand(i: OpIdx).getImm())
7646 Mods ^= (SISrcMods::NEG | SISrcMods::NEG_HI);
7647 MIB.addImm(Val: (int64_t)Mods);
7648}
7649
7650void AMDGPUInstructionSelector::renderVOP3PModsNegAbs(MachineInstrBuilder &MIB,
7651 const MachineInstr &MI,
7652 int OpIdx) const {
7653 unsigned Val = MI.getOperand(i: OpIdx).getImm();
7654 unsigned Mods = SISrcMods::OP_SEL_1; // default: none
7655 if (Val == 1) // neg
7656 Mods ^= SISrcMods::NEG;
7657 if (Val == 2) // abs
7658 Mods ^= SISrcMods::ABS;
7659 if (Val == 3) // neg and abs
7660 Mods ^= (SISrcMods::NEG | SISrcMods::ABS);
7661 MIB.addImm(Val: (int64_t)Mods);
7662}
7663
7664void AMDGPUInstructionSelector::renderPrefetchLoc(MachineInstrBuilder &MIB,
7665 const MachineInstr &MI,
7666 int OpIdx) const {
7667 uint32_t V = MI.getOperand(i: 2).getImm();
7668 V = (AMDGPU::CPol::SCOPE_MASK - (V & AMDGPU::CPol::SCOPE_MASK))
7669 << AMDGPU::CPol::SCOPE_SHIFT;
7670 if (!Subtarget->hasSafeCUPrefetch())
7671 V = std::max(a: V, b: (uint32_t)AMDGPU::CPol::SCOPE_SE); // CU scope is unsafe
7672 MIB.addImm(Val: V);
7673}
7674
7675/// Convert from 2-bit value to enum values used for op_sel* source modifiers.
7676void AMDGPUInstructionSelector::renderScaledMAIIntrinsicOperand(
7677 MachineInstrBuilder &MIB, const MachineInstr &MI, int OpIdx) const {
7678 unsigned Val = MI.getOperand(i: OpIdx).getImm();
7679 unsigned New = 0;
7680 if (Val & 0x1)
7681 New |= SISrcMods::OP_SEL_0;
7682 if (Val & 0x2)
7683 New |= SISrcMods::OP_SEL_1;
7684 MIB.addImm(Val: New);
7685}
7686
7687bool AMDGPUInstructionSelector::isInlineImmediate(const APInt &Imm) const {
7688 return TII.isInlineConstant(Imm);
7689}
7690
7691bool AMDGPUInstructionSelector::isInlineImmediate(const APFloat &Imm) const {
7692 return TII.isInlineConstant(Imm);
7693}
7694