//===- AMDGPUInstructionSelector.cpp ----------------------------*- C++ -*-==//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
/// \file
/// This file implements the targeting of the InstructionSelector class for
/// AMDGPU.
/// \todo This should be generated by TableGen.
//===----------------------------------------------------------------------===//

#include "AMDGPUInstructionSelector.h"
#include "AMDGPU.h"
#include "AMDGPUGlobalISelUtils.h"
#include "AMDGPUInstrInfo.h"
#include "AMDGPURegisterBankInfo.h"
#include "AMDGPUTargetMachine.h"
#include "SIMachineFunctionInfo.h"
#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/CodeGen/GlobalISel/GIMatchTableExecutorImpl.h"
#include "llvm/CodeGen/GlobalISel/GISelValueTracking.h"
#include "llvm/CodeGen/GlobalISel/GenericMachineInstrs.h"
#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
#include <optional>

#define DEBUG_TYPE "amdgpu-isel"

using namespace llvm;
using namespace MIPatternMatch;

#define GET_GLOBALISEL_IMPL
#define AMDGPUSubtarget GCNSubtarget
#include "AMDGPUGenGlobalISel.inc"
#undef GET_GLOBALISEL_IMPL
#undef AMDGPUSubtarget

AMDGPUInstructionSelector::AMDGPUInstructionSelector(
    const GCNSubtarget &STI, const AMDGPURegisterBankInfo &RBI,
    const AMDGPUTargetMachine &TM)
    : TII(*STI.getInstrInfo()), TRI(*STI.getRegisterInfo()), RBI(RBI), TM(TM),
      STI(STI),
#define GET_GLOBALISEL_PREDICATES_INIT
#include "AMDGPUGenGlobalISel.inc"
#undef GET_GLOBALISEL_PREDICATES_INIT
#define GET_GLOBALISEL_TEMPORARIES_INIT
#include "AMDGPUGenGlobalISel.inc"
#undef GET_GLOBALISEL_TEMPORARIES_INIT
{
}

const char *AMDGPUInstructionSelector::getName() { return DEBUG_TYPE; }

void AMDGPUInstructionSelector::setupMF(MachineFunction &MF,
                                        GISelValueTracking *VT,
                                        CodeGenCoverage *CoverageInfo,
                                        ProfileSummaryInfo *PSI,
                                        BlockFrequencyInfo *BFI) {
  MRI = &MF.getRegInfo();
  Subtarget = &MF.getSubtarget<GCNSubtarget>();
  Subtarget->checkSubtargetFeatures(MF.getFunction());
  InstructionSelector::setupMF(MF, VT, CoverageInfo, PSI, BFI);
}

// Return the wave level SGPR base address if this is a wave address.
static Register getWaveAddress(const MachineInstr *Def) {
  return Def->getOpcode() == AMDGPU::G_AMDGPU_WAVE_ADDRESS
             ? Def->getOperand(1).getReg()
             : Register();
}

bool AMDGPUInstructionSelector::isVCC(Register Reg,
                                      const MachineRegisterInfo &MRI) const {
  // The verifier is oblivious to s1 being a valid value for wavesize registers.
  if (Reg.isPhysical())
    return false;

  auto &RegClassOrBank = MRI.getRegClassOrRegBank(Reg);
  const TargetRegisterClass *RC =
      dyn_cast<const TargetRegisterClass *>(RegClassOrBank);
  if (RC) {
    const LLT Ty = MRI.getType(Reg);
    if (!Ty.isValid() || Ty.getSizeInBits() != 1)
      return false;
    // G_TRUNC s1 result is never vcc.
    return MRI.getVRegDef(Reg)->getOpcode() != AMDGPU::G_TRUNC &&
           RC->hasSuperClassEq(TRI.getBoolRC());
  }

  const RegisterBank *RB = cast<const RegisterBank *>(RegClassOrBank);
  return RB->getID() == AMDGPU::VCCRegBankID;
}

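// Rewrite a copy-like intrinsic (wqm, softwqm, strict wwm/wqm) into its target
// pseudo: drop the intrinsic ID, add an implicit EXEC use, and constrain both
// operands to a common register class.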
bool AMDGPUInstructionSelector::constrainCopyLikeIntrin(MachineInstr &MI,
                                                        unsigned NewOpc) const {
  MI.setDesc(TII.get(NewOpc));
  MI.removeOperand(1); // Remove intrinsic ID.
  MI.addOperand(*MF, MachineOperand::CreateReg(AMDGPU::EXEC, false, true));

  MachineOperand &Dst = MI.getOperand(0);
  MachineOperand &Src = MI.getOperand(1);

  // TODO: This should be legalized to s32 if needed
  if (MRI->getType(Dst.getReg()) == LLT::scalar(1))
    return false;

  const TargetRegisterClass *DstRC
    = TRI.getConstrainedRegClassForOperand(Dst, *MRI);
  const TargetRegisterClass *SrcRC
    = TRI.getConstrainedRegClassForOperand(Src, *MRI);
  if (!DstRC || DstRC != SrcRC)
    return false;

  return RBI.constrainGenericRegister(Dst.getReg(), *DstRC, *MRI) &&
         RBI.constrainGenericRegister(Src.getReg(), *SrcRC, *MRI);
}

bool AMDGPUInstructionSelector::selectCOPY(MachineInstr &I) const {
  const DebugLoc &DL = I.getDebugLoc();
  MachineBasicBlock *BB = I.getParent();
  I.setDesc(TII.get(TargetOpcode::COPY));

  const MachineOperand &Src = I.getOperand(1);
  MachineOperand &Dst = I.getOperand(0);
  Register DstReg = Dst.getReg();
  Register SrcReg = Src.getReg();

  if (isVCC(DstReg, *MRI)) {
    if (SrcReg == AMDGPU::SCC) {
      const TargetRegisterClass *RC
        = TRI.getConstrainedRegClassForOperand(Dst, *MRI);
      if (!RC)
        return true;
      return RBI.constrainGenericRegister(DstReg, *RC, *MRI);
    }

    if (!isVCC(SrcReg, *MRI)) {
      // TODO: Should probably leave the copy and let copyPhysReg expand it.
      if (!RBI.constrainGenericRegister(DstReg, *TRI.getBoolRC(), *MRI))
        return false;

      const TargetRegisterClass *SrcRC
        = TRI.getConstrainedRegClassForOperand(Src, *MRI);

      std::optional<ValueAndVReg> ConstVal =
          getIConstantVRegValWithLookThrough(SrcReg, *MRI, true);
      if (ConstVal) {
        unsigned MovOpc =
            STI.isWave64() ? AMDGPU::S_MOV_B64 : AMDGPU::S_MOV_B32;
        BuildMI(*BB, &I, DL, TII.get(MovOpc), DstReg)
            .addImm(ConstVal->Value.getBoolValue() ? -1 : 0);
      } else {
        Register MaskedReg = MRI->createVirtualRegister(SrcRC);

        // We can't trust the high bits at this point, so clear them.

        // TODO: Skip masking high bits if def is known boolean.

        if (AMDGPU::getRegBitWidth(SrcRC->getID()) == 16) {
          assert(Subtarget->useRealTrue16Insts());
          const int64_t NoMods = 0;
          BuildMI(*BB, &I, DL, TII.get(AMDGPU::V_AND_B16_t16_e64), MaskedReg)
              .addImm(NoMods)
              .addImm(1)
              .addImm(NoMods)
              .addReg(SrcReg)
              .addImm(NoMods);
          BuildMI(*BB, &I, DL, TII.get(AMDGPU::V_CMP_NE_U16_t16_e64), DstReg)
              .addImm(NoMods)
              .addImm(0)
              .addImm(NoMods)
              .addReg(MaskedReg)
              .addImm(NoMods);
        } else {
          bool IsSGPR = TRI.isSGPRClass(SrcRC);
          unsigned AndOpc = IsSGPR ? AMDGPU::S_AND_B32 : AMDGPU::V_AND_B32_e32;
          auto And = BuildMI(*BB, &I, DL, TII.get(AndOpc), MaskedReg)
              .addImm(1)
              .addReg(SrcReg);
          if (IsSGPR)
            And.setOperandDead(3); // Dead scc

          BuildMI(*BB, &I, DL, TII.get(AMDGPU::V_CMP_NE_U32_e64), DstReg)
              .addImm(0)
              .addReg(MaskedReg);
        }
      }

      if (!MRI->getRegClassOrNull(SrcReg))
        MRI->setRegClass(SrcReg, SrcRC);
      I.eraseFromParent();
      return true;
    }

    const TargetRegisterClass *RC =
        TRI.getConstrainedRegClassForOperand(Dst, *MRI);
    if (RC && !RBI.constrainGenericRegister(DstReg, *RC, *MRI))
      return false;

    return true;
  }

  for (const MachineOperand &MO : I.operands()) {
    if (MO.getReg().isPhysical())
      continue;

    const TargetRegisterClass *RC =
        TRI.getConstrainedRegClassForOperand(MO, *MRI);
    if (!RC)
      continue;
    RBI.constrainGenericRegister(MO.getReg(), *RC, *MRI);
  }
  return true;
}

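// Lower a copy of a VCC-bank lane mask into an SCC-style SGPR bool by
// comparing the wave mask against zero and copying SCC into the result.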
bool AMDGPUInstructionSelector::selectCOPY_SCC_VCC(MachineInstr &I) const {
  const DebugLoc &DL = I.getDebugLoc();
  MachineBasicBlock *BB = I.getParent();

  unsigned CmpOpc =
      STI.isWave64() ? AMDGPU::S_CMP_LG_U64 : AMDGPU::S_CMP_LG_U32;
  MachineInstr *Cmp = BuildMI(*BB, &I, DL, TII.get(CmpOpc))
                          .addReg(I.getOperand(1).getReg())
                          .addImm(0);
  if (!constrainSelectedInstRegOperands(*Cmp, TII, TRI, RBI))
    return false;

  Register DstReg = I.getOperand(0).getReg();
  BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), DstReg).addReg(AMDGPU::SCC);

  I.eraseFromParent();
  return RBI.constrainGenericRegister(DstReg, AMDGPU::SReg_32RegClass, *MRI);
}

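// Lower a copy of an SCC-style SGPR bool into a VCC-bank lane mask. Constant
// inputs become 0 or EXEC directly; otherwise the bool is copied into SCC and
// the result selected between EXEC and 0.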
bool AMDGPUInstructionSelector::selectCOPY_VCC_SCC(MachineInstr &I) const {
  const DebugLoc &DL = I.getDebugLoc();
  MachineBasicBlock *BB = I.getParent();

  Register DstReg = I.getOperand(0).getReg();
  Register SrcReg = I.getOperand(1).getReg();
  std::optional<ValueAndVReg> Arg =
      getIConstantVRegValWithLookThrough(I.getOperand(1).getReg(), *MRI);

  if (Arg) {
    const int64_t Value = Arg->Value.getZExtValue();
    if (Value == 0) {
      unsigned Opcode = STI.isWave64() ? AMDGPU::S_MOV_B64 : AMDGPU::S_MOV_B32;
      BuildMI(*BB, &I, DL, TII.get(Opcode), DstReg).addImm(0);
    } else {
      assert(Value == 1);
      BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), DstReg).addReg(TRI.getExec());
    }
    I.eraseFromParent();
    return RBI.constrainGenericRegister(DstReg, *TRI.getBoolRC(), *MRI);
  }

  // RegBankLegalize ensures that SrcReg is bool in reg (high bits are 0).
  BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), AMDGPU::SCC).addReg(SrcReg);

  unsigned SelectOpcode =
      STI.isWave64() ? AMDGPU::S_CSELECT_B64 : AMDGPU::S_CSELECT_B32;
  MachineInstr *Select = BuildMI(*BB, &I, DL, TII.get(SelectOpcode), DstReg)
                             .addReg(TRI.getExec())
                             .addImm(0);

  I.eraseFromParent();
  return constrainSelectedInstRegOperands(*Select, TII, TRI, RBI);
}

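// Emit V_READFIRSTLANE_B32 to move a uniform value held in a VGPR into an
// SGPR destination.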
bool AMDGPUInstructionSelector::selectReadAnyLane(MachineInstr &I) const {
  Register DstReg = I.getOperand(0).getReg();
  Register SrcReg = I.getOperand(1).getReg();

  const DebugLoc &DL = I.getDebugLoc();
  MachineBasicBlock *BB = I.getParent();

  auto RFL = BuildMI(*BB, &I, DL, TII.get(AMDGPU::V_READFIRSTLANE_B32), DstReg)
                 .addReg(SrcReg);

  I.eraseFromParent();
  return constrainSelectedInstRegOperands(*RFL, TII, TRI, RBI);
}

bool AMDGPUInstructionSelector::selectPHI(MachineInstr &I) const {
  const Register DefReg = I.getOperand(0).getReg();
  const LLT DefTy = MRI->getType(DefReg);

  // S1 G_PHIs should not be selected in instruction-select, instead:
  // - divergent S1 G_PHI should go through lane mask merging algorithm
  //   and be fully inst-selected in AMDGPUGlobalISelDivergenceLowering
  // - uniform S1 G_PHI should be lowered into S32 G_PHI in AMDGPURegBankSelect
  if (DefTy == LLT::scalar(1))
    return false;

  // TODO: Verify this doesn't have insane operands (i.e. VGPR to SGPR copy)

  const RegClassOrRegBank &RegClassOrBank =
      MRI->getRegClassOrRegBank(DefReg);

  const TargetRegisterClass *DefRC =
      dyn_cast<const TargetRegisterClass *>(RegClassOrBank);
  if (!DefRC) {
    if (!DefTy.isValid()) {
      LLVM_DEBUG(dbgs() << "PHI operand has no type, not a gvreg?\n");
      return false;
    }

    const RegisterBank &RB = *cast<const RegisterBank *>(RegClassOrBank);
    DefRC = TRI.getRegClassForTypeOnBank(DefTy, RB);
    if (!DefRC) {
      LLVM_DEBUG(dbgs() << "PHI operand has unexpected size/bank\n");
      return false;
    }
  }

  // If inputs have register bank, assign corresponding reg class.
  // Note: registers don't need to have the same reg bank.
  for (unsigned i = 1; i != I.getNumOperands(); i += 2) {
    const Register SrcReg = I.getOperand(i).getReg();

    const RegisterBank *RB = MRI->getRegBankOrNull(SrcReg);
    if (RB) {
      const LLT SrcTy = MRI->getType(SrcReg);
      const TargetRegisterClass *SrcRC =
          TRI.getRegClassForTypeOnBank(SrcTy, *RB);
      if (!RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI))
        return false;
    }
  }

  I.setDesc(TII.get(TargetOpcode::PHI));
  return RBI.constrainGenericRegister(DefReg, *DefRC, *MRI);
}

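// Return a 32-bit half (sub0 or sub1) of a 64-bit operand: register operands
// are copied through the composed subregister index, immediates are split
// into their low or high 32 bits.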
MachineOperand
AMDGPUInstructionSelector::getSubOperand64(MachineOperand &MO,
                                           const TargetRegisterClass &SubRC,
                                           unsigned SubIdx) const {

  MachineInstr *MI = MO.getParent();
  MachineBasicBlock *BB = MO.getParent()->getParent();
  Register DstReg = MRI->createVirtualRegister(&SubRC);

  if (MO.isReg()) {
    unsigned ComposedSubIdx = TRI.composeSubRegIndices(MO.getSubReg(), SubIdx);
    Register Reg = MO.getReg();
    BuildMI(*BB, MI, MI->getDebugLoc(), TII.get(AMDGPU::COPY), DstReg)
        .addReg(Reg, 0, ComposedSubIdx);

    return MachineOperand::CreateReg(DstReg, MO.isDef(), MO.isImplicit(),
                                     MO.isKill(), MO.isDead(), MO.isUndef(),
                                     MO.isEarlyClobber(), 0, MO.isDebug(),
                                     MO.isInternalRead());
  }

  assert(MO.isImm());

  APInt Imm(64, MO.getImm());

  switch (SubIdx) {
  default:
    llvm_unreachable("do not know to split immediate with this sub index.");
  case AMDGPU::sub0:
    return MachineOperand::CreateImm(Imm.getLoBits(32).getSExtValue());
  case AMDGPU::sub1:
    return MachineOperand::CreateImm(Imm.getHiBits(32).getSExtValue());
  }
}

static unsigned getLogicalBitOpcode(unsigned Opc, bool Is64) {
  switch (Opc) {
  case AMDGPU::G_AND:
    return Is64 ? AMDGPU::S_AND_B64 : AMDGPU::S_AND_B32;
  case AMDGPU::G_OR:
    return Is64 ? AMDGPU::S_OR_B64 : AMDGPU::S_OR_B32;
  case AMDGPU::G_XOR:
    return Is64 ? AMDGPU::S_XOR_B64 : AMDGPU::S_XOR_B32;
  default:
    llvm_unreachable("not a bit op");
  }
}

bool AMDGPUInstructionSelector::selectG_AND_OR_XOR(MachineInstr &I) const {
  Register DstReg = I.getOperand(0).getReg();
  unsigned Size = RBI.getSizeInBits(DstReg, *MRI, TRI);

  const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
  if (DstRB->getID() != AMDGPU::SGPRRegBankID &&
      DstRB->getID() != AMDGPU::VCCRegBankID)
    return false;

  bool Is64 = Size > 32 || (DstRB->getID() == AMDGPU::VCCRegBankID &&
                            STI.isWave64());
  I.setDesc(TII.get(getLogicalBitOpcode(I.getOpcode(), Is64)));

  // Dead implicit-def of scc
  I.addOperand(MachineOperand::CreateReg(AMDGPU::SCC, true, // isDef
                                         true,  // isImp
                                         false, // isKill
                                         true)); // isDead
  return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
}

bool AMDGPUInstructionSelector::selectG_ADD_SUB(MachineInstr &I) const {
  MachineBasicBlock *BB = I.getParent();
  MachineFunction *MF = BB->getParent();
  Register DstReg = I.getOperand(0).getReg();
  const DebugLoc &DL = I.getDebugLoc();
  LLT Ty = MRI->getType(DstReg);
  if (Ty.isVector())
    return false;

  unsigned Size = Ty.getSizeInBits();
  const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
  const bool IsSALU = DstRB->getID() == AMDGPU::SGPRRegBankID;
  const bool Sub = I.getOpcode() == TargetOpcode::G_SUB;

  if (Size == 32) {
    if (IsSALU) {
      const unsigned Opc = Sub ? AMDGPU::S_SUB_U32 : AMDGPU::S_ADD_U32;
      MachineInstr *Add =
          BuildMI(*BB, &I, DL, TII.get(Opc), DstReg)
              .add(I.getOperand(1))
              .add(I.getOperand(2))
              .setOperandDead(3); // Dead scc
      I.eraseFromParent();
      return constrainSelectedInstRegOperands(*Add, TII, TRI, RBI);
    }

    if (STI.hasAddNoCarry()) {
      const unsigned Opc = Sub ? AMDGPU::V_SUB_U32_e64 : AMDGPU::V_ADD_U32_e64;
      I.setDesc(TII.get(Opc));
      I.addOperand(*MF, MachineOperand::CreateImm(0));
      I.addOperand(*MF, MachineOperand::CreateReg(AMDGPU::EXEC, false, true));
      return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
    }

    const unsigned Opc = Sub ? AMDGPU::V_SUB_CO_U32_e64 : AMDGPU::V_ADD_CO_U32_e64;

    Register UnusedCarry = MRI->createVirtualRegister(TRI.getWaveMaskRegClass());
    MachineInstr *Add
      = BuildMI(*BB, &I, DL, TII.get(Opc), DstReg)
            .addDef(UnusedCarry, RegState::Dead)
            .add(I.getOperand(1))
            .add(I.getOperand(2))
            .addImm(0);
    I.eraseFromParent();
    return constrainSelectedInstRegOperands(*Add, TII, TRI, RBI);
  }

  assert(!Sub && "illegal sub should not reach here");

  const TargetRegisterClass &RC
    = IsSALU ? AMDGPU::SReg_64_XEXECRegClass : AMDGPU::VReg_64RegClass;
  const TargetRegisterClass &HalfRC
    = IsSALU ? AMDGPU::SReg_32RegClass : AMDGPU::VGPR_32RegClass;

  MachineOperand Lo1(getSubOperand64(I.getOperand(1), HalfRC, AMDGPU::sub0));
  MachineOperand Lo2(getSubOperand64(I.getOperand(2), HalfRC, AMDGPU::sub0));
  MachineOperand Hi1(getSubOperand64(I.getOperand(1), HalfRC, AMDGPU::sub1));
  MachineOperand Hi2(getSubOperand64(I.getOperand(2), HalfRC, AMDGPU::sub1));

  Register DstLo = MRI->createVirtualRegister(&HalfRC);
  Register DstHi = MRI->createVirtualRegister(&HalfRC);

  if (IsSALU) {
    BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_ADD_U32), DstLo)
        .add(Lo1)
        .add(Lo2);
    BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_ADDC_U32), DstHi)
        .add(Hi1)
        .add(Hi2)
        .setOperandDead(3); // Dead scc
  } else {
    const TargetRegisterClass *CarryRC = TRI.getWaveMaskRegClass();
    Register CarryReg = MRI->createVirtualRegister(CarryRC);
    BuildMI(*BB, &I, DL, TII.get(AMDGPU::V_ADD_CO_U32_e64), DstLo)
        .addDef(CarryReg)
        .add(Lo1)
        .add(Lo2)
        .addImm(0);
    MachineInstr *Addc = BuildMI(*BB, &I, DL, TII.get(AMDGPU::V_ADDC_U32_e64), DstHi)
        .addDef(MRI->createVirtualRegister(CarryRC), RegState::Dead)
        .add(Hi1)
        .add(Hi2)
        .addReg(CarryReg, RegState::Kill)
        .addImm(0);

    if (!constrainSelectedInstRegOperands(*Addc, TII, TRI, RBI))
      return false;
  }

  BuildMI(*BB, &I, DL, TII.get(AMDGPU::REG_SEQUENCE), DstReg)
      .addReg(DstLo)
      .addImm(AMDGPU::sub0)
      .addReg(DstHi)
      .addImm(AMDGPU::sub1);

  if (!RBI.constrainGenericRegister(DstReg, RC, *MRI))
    return false;

  I.eraseFromParent();
  return true;
}

bool AMDGPUInstructionSelector::selectG_UADDO_USUBO_UADDE_USUBE(
    MachineInstr &I) const {
  MachineBasicBlock *BB = I.getParent();
  MachineFunction *MF = BB->getParent();
  const DebugLoc &DL = I.getDebugLoc();
  Register Dst0Reg = I.getOperand(0).getReg();
  Register Dst1Reg = I.getOperand(1).getReg();
  const bool IsAdd = I.getOpcode() == AMDGPU::G_UADDO ||
                     I.getOpcode() == AMDGPU::G_UADDE;
  const bool HasCarryIn = I.getOpcode() == AMDGPU::G_UADDE ||
                          I.getOpcode() == AMDGPU::G_USUBE;

  if (isVCC(Dst1Reg, *MRI)) {
    unsigned NoCarryOpc =
        IsAdd ? AMDGPU::V_ADD_CO_U32_e64 : AMDGPU::V_SUB_CO_U32_e64;
    unsigned CarryOpc = IsAdd ? AMDGPU::V_ADDC_U32_e64 : AMDGPU::V_SUBB_U32_e64;
    I.setDesc(TII.get(HasCarryIn ? CarryOpc : NoCarryOpc));
    I.addOperand(*MF, MachineOperand::CreateReg(AMDGPU::EXEC, false, true));
    I.addOperand(*MF, MachineOperand::CreateImm(0));
    return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
  }

  Register Src0Reg = I.getOperand(2).getReg();
  Register Src1Reg = I.getOperand(3).getReg();

  if (HasCarryIn) {
    BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), AMDGPU::SCC)
        .addReg(I.getOperand(4).getReg());
  }

  unsigned NoCarryOpc = IsAdd ? AMDGPU::S_ADD_U32 : AMDGPU::S_SUB_U32;
  unsigned CarryOpc = IsAdd ? AMDGPU::S_ADDC_U32 : AMDGPU::S_SUBB_U32;

  auto CarryInst = BuildMI(*BB, &I, DL, TII.get(HasCarryIn ? CarryOpc : NoCarryOpc), Dst0Reg)
                       .add(I.getOperand(2))
                       .add(I.getOperand(3));

  if (MRI->use_nodbg_empty(Dst1Reg)) {
    CarryInst.setOperandDead(3); // Dead scc
  } else {
    BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), Dst1Reg)
        .addReg(AMDGPU::SCC);
    if (!MRI->getRegClassOrNull(Dst1Reg))
      MRI->setRegClass(Dst1Reg, &AMDGPU::SReg_32RegClass);
  }

  if (!RBI.constrainGenericRegister(Dst0Reg, AMDGPU::SReg_32RegClass, *MRI) ||
      !RBI.constrainGenericRegister(Src0Reg, AMDGPU::SReg_32RegClass, *MRI) ||
      !RBI.constrainGenericRegister(Src1Reg, AMDGPU::SReg_32RegClass, *MRI))
    return false;

  if (HasCarryIn &&
      !RBI.constrainGenericRegister(I.getOperand(4).getReg(),
                                    AMDGPU::SReg_32RegClass, *MRI))
    return false;

  I.eraseFromParent();
  return true;
}

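// Select the 64-bit MAD pseudos (G_AMDGPU_MAD_U64_U32 / G_AMDGPU_MAD_I64_I32),
// picking the gfx11 opcodes when the subtarget has the MAD intra-forwarding
// bug.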
bool AMDGPUInstructionSelector::selectG_AMDGPU_MAD_64_32(
    MachineInstr &I) const {
  MachineBasicBlock *BB = I.getParent();
  MachineFunction *MF = BB->getParent();
  const bool IsUnsigned = I.getOpcode() == AMDGPU::G_AMDGPU_MAD_U64_U32;

  unsigned Opc;
  if (Subtarget->hasMADIntraFwdBug())
    Opc = IsUnsigned ? AMDGPU::V_MAD_U64_U32_gfx11_e64
                     : AMDGPU::V_MAD_I64_I32_gfx11_e64;
  else
    Opc = IsUnsigned ? AMDGPU::V_MAD_U64_U32_e64 : AMDGPU::V_MAD_I64_I32_e64;
  I.setDesc(TII.get(Opc));
  I.addOperand(*MF, MachineOperand::CreateImm(0));
  I.addImplicitDefUseOperands(*MF);
  return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
}

// TODO: We should probably legalize these to only using 32-bit results.
bool AMDGPUInstructionSelector::selectG_EXTRACT(MachineInstr &I) const {
  MachineBasicBlock *BB = I.getParent();
  Register DstReg = I.getOperand(0).getReg();
  Register SrcReg = I.getOperand(1).getReg();
  LLT DstTy = MRI->getType(DstReg);
  LLT SrcTy = MRI->getType(SrcReg);
  const unsigned SrcSize = SrcTy.getSizeInBits();
  unsigned DstSize = DstTy.getSizeInBits();

  // TODO: Should handle any multiple of 32 offset.
  unsigned Offset = I.getOperand(2).getImm();
  if (Offset % 32 != 0 || DstSize > 128)
    return false;

  // 16-bit operations really use 32-bit registers.
  // FIXME: Probably should not allow 16-bit G_EXTRACT results.
  if (DstSize == 16)
    DstSize = 32;

  const TargetRegisterClass *DstRC =
      TRI.getConstrainedRegClassForOperand(I.getOperand(0), *MRI);
  if (!DstRC || !RBI.constrainGenericRegister(DstReg, *DstRC, *MRI))
    return false;

  const RegisterBank *SrcBank = RBI.getRegBank(SrcReg, *MRI, TRI);
  const TargetRegisterClass *SrcRC =
      TRI.getRegClassForSizeOnBank(SrcSize, *SrcBank);
  if (!SrcRC)
    return false;
  unsigned SubReg = SIRegisterInfo::getSubRegFromChannel(Offset / 32,
                                                         DstSize / 32);
  SrcRC = TRI.getSubClassWithSubReg(SrcRC, SubReg);
  if (!SrcRC)
    return false;

  SrcReg = constrainOperandRegClass(*MF, TRI, *MRI, TII, RBI, I,
                                    *SrcRC, I.getOperand(1));
  const DebugLoc &DL = I.getDebugLoc();
  BuildMI(*BB, &I, DL, TII.get(TargetOpcode::COPY), DstReg)
      .addReg(SrcReg, 0, SubReg);

  I.eraseFromParent();
  return true;
}

bool AMDGPUInstructionSelector::selectG_MERGE_VALUES(MachineInstr &MI) const {
  MachineBasicBlock *BB = MI.getParent();
  Register DstReg = MI.getOperand(0).getReg();
  LLT DstTy = MRI->getType(DstReg);
  LLT SrcTy = MRI->getType(MI.getOperand(1).getReg());

  const unsigned SrcSize = SrcTy.getSizeInBits();
  if (SrcSize < 32)
    return selectImpl(MI, *CoverageInfo);

  const DebugLoc &DL = MI.getDebugLoc();
  const RegisterBank *DstBank = RBI.getRegBank(DstReg, *MRI, TRI);
  const unsigned DstSize = DstTy.getSizeInBits();
  const TargetRegisterClass *DstRC =
      TRI.getRegClassForSizeOnBank(DstSize, *DstBank);
  if (!DstRC)
    return false;

  ArrayRef<int16_t> SubRegs = TRI.getRegSplitParts(DstRC, SrcSize / 8);
  MachineInstrBuilder MIB =
      BuildMI(*BB, &MI, DL, TII.get(TargetOpcode::REG_SEQUENCE), DstReg);
  for (int I = 0, E = MI.getNumOperands() - 1; I != E; ++I) {
    MachineOperand &Src = MI.getOperand(I + 1);
    MIB.addReg(Src.getReg(), getUndefRegState(Src.isUndef()));
    MIB.addImm(SubRegs[I]);

    const TargetRegisterClass *SrcRC
      = TRI.getConstrainedRegClassForOperand(Src, *MRI);
    if (SrcRC && !RBI.constrainGenericRegister(Src.getReg(), *SrcRC, *MRI))
      return false;
  }

  if (!RBI.constrainGenericRegister(DstReg, *DstRC, *MRI))
    return false;

  MI.eraseFromParent();
  return true;
}

bool AMDGPUInstructionSelector::selectG_UNMERGE_VALUES(MachineInstr &MI) const {
  MachineBasicBlock *BB = MI.getParent();
  const int NumDst = MI.getNumOperands() - 1;

  MachineOperand &Src = MI.getOperand(NumDst);

  Register SrcReg = Src.getReg();
  Register DstReg0 = MI.getOperand(0).getReg();
  LLT DstTy = MRI->getType(DstReg0);
  LLT SrcTy = MRI->getType(SrcReg);

  const unsigned DstSize = DstTy.getSizeInBits();
  const unsigned SrcSize = SrcTy.getSizeInBits();
  const DebugLoc &DL = MI.getDebugLoc();
  const RegisterBank *SrcBank = RBI.getRegBank(SrcReg, *MRI, TRI);

  const TargetRegisterClass *SrcRC =
      TRI.getRegClassForSizeOnBank(SrcSize, *SrcBank);
  if (!SrcRC || !RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI))
    return false;

  // Note we could have mixed SGPR and VGPR destination banks for an SGPR
  // source, and this relies on the fact that the same subregister indices are
  // used for both.
  ArrayRef<int16_t> SubRegs = TRI.getRegSplitParts(SrcRC, DstSize / 8);
  for (int I = 0, E = NumDst; I != E; ++I) {
    MachineOperand &Dst = MI.getOperand(I);
    BuildMI(*BB, &MI, DL, TII.get(TargetOpcode::COPY), Dst.getReg())
        .addReg(SrcReg, 0, SubRegs[I]);

    // Make sure the subregister index is valid for the source register.
    SrcRC = TRI.getSubClassWithSubReg(SrcRC, SubRegs[I]);
    if (!SrcRC || !RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI))
      return false;

    const TargetRegisterClass *DstRC =
        TRI.getConstrainedRegClassForOperand(Dst, *MRI);
    if (DstRC && !RBI.constrainGenericRegister(Dst.getReg(), *DstRC, *MRI))
      return false;
  }

  MI.eraseFromParent();
  return true;
}

bool AMDGPUInstructionSelector::selectG_BUILD_VECTOR(MachineInstr &MI) const {
  assert(MI.getOpcode() == AMDGPU::G_BUILD_VECTOR_TRUNC ||
         MI.getOpcode() == AMDGPU::G_BUILD_VECTOR);

  Register Src0 = MI.getOperand(1).getReg();
  Register Src1 = MI.getOperand(2).getReg();
  LLT SrcTy = MRI->getType(Src0);
  const unsigned SrcSize = SrcTy.getSizeInBits();

  // BUILD_VECTOR with >=32 bits source is handled by MERGE_VALUE.
  if (MI.getOpcode() == AMDGPU::G_BUILD_VECTOR && SrcSize >= 32) {
    return selectG_MERGE_VALUES(MI);
  }

  // Selection logic below is for V2S16 only.
  // For G_BUILD_VECTOR_TRUNC, additionally check that the operands are s32.
  Register Dst = MI.getOperand(0).getReg();
  if (MRI->getType(Dst) != LLT::fixed_vector(2, 16) ||
      (MI.getOpcode() == AMDGPU::G_BUILD_VECTOR_TRUNC &&
       SrcTy != LLT::scalar(32)))
    return selectImpl(MI, *CoverageInfo);

  const RegisterBank *DstBank = RBI.getRegBank(Dst, *MRI, TRI);
  if (DstBank->getID() == AMDGPU::AGPRRegBankID)
    return false;

  assert(DstBank->getID() == AMDGPU::SGPRRegBankID ||
         DstBank->getID() == AMDGPU::VGPRRegBankID);
  const bool IsVector = DstBank->getID() == AMDGPU::VGPRRegBankID;

  const DebugLoc &DL = MI.getDebugLoc();
  MachineBasicBlock *BB = MI.getParent();

  // First, before trying TableGen patterns, check if both sources are
  // constants. In those cases, we can trivially compute the final constant
  // and emit a simple move.
  auto ConstSrc1 = getAnyConstantVRegValWithLookThrough(Src1, *MRI, true, true);
  if (ConstSrc1) {
    auto ConstSrc0 =
        getAnyConstantVRegValWithLookThrough(Src0, *MRI, true, true);
    if (ConstSrc0) {
      const int64_t K0 = ConstSrc0->Value.getSExtValue();
      const int64_t K1 = ConstSrc1->Value.getSExtValue();
      uint32_t Lo16 = static_cast<uint32_t>(K0) & 0xffff;
      uint32_t Hi16 = static_cast<uint32_t>(K1) & 0xffff;
      uint32_t Imm = Lo16 | (Hi16 << 16);

      // VALU
      if (IsVector) {
        BuildMI(*BB, &MI, DL, TII.get(AMDGPU::V_MOV_B32_e32), Dst).addImm(Imm);
        MI.eraseFromParent();
        return RBI.constrainGenericRegister(Dst, AMDGPU::VGPR_32RegClass, *MRI);
      }

      // SALU
      BuildMI(*BB, &MI, DL, TII.get(AMDGPU::S_MOV_B32), Dst).addImm(Imm);
      MI.eraseFromParent();
      return RBI.constrainGenericRegister(Dst, AMDGPU::SReg_32RegClass, *MRI);
    }
  }

  // Now try TableGen patterns.
  if (selectImpl(MI, *CoverageInfo))
    return true;

  // TODO: This should probably be a combine somewhere
  // (build_vector $src0, undef) -> copy $src0
  MachineInstr *Src1Def = getDefIgnoringCopies(Src1, *MRI);
  if (Src1Def->getOpcode() == AMDGPU::G_IMPLICIT_DEF) {
    MI.setDesc(TII.get(AMDGPU::COPY));
    MI.removeOperand(2);
    const auto &RC =
        IsVector ? AMDGPU::VGPR_32RegClass : AMDGPU::SReg_32RegClass;
    return RBI.constrainGenericRegister(Dst, RC, *MRI) &&
           RBI.constrainGenericRegister(Src0, RC, *MRI);
  }

  // TODO: Can be improved?
  if (IsVector) {
    Register TmpReg = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
    auto MIB = BuildMI(*BB, MI, DL, TII.get(AMDGPU::V_AND_B32_e32), TmpReg)
                   .addImm(0xFFFF)
                   .addReg(Src0);
    if (!constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI))
      return false;

    MIB = BuildMI(*BB, MI, DL, TII.get(AMDGPU::V_LSHL_OR_B32_e64), Dst)
              .addReg(Src1)
              .addImm(16)
              .addReg(TmpReg);
    if (!constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI))
      return false;

    MI.eraseFromParent();
    return true;
  }

  Register ShiftSrc0;
  Register ShiftSrc1;

  // With multiple uses of the shift, this will duplicate the shift and
  // increase register pressure.
  //
  // (build_vector (lshr_oneuse $src0, 16), (lshr_oneuse $src1, 16)
  //  => (S_PACK_HH_B32_B16 $src0, $src1)
  // (build_vector (lshr_oneuse SReg_32:$src0, 16), $src1)
  //  => (S_PACK_HL_B32_B16 $src0, $src1)
  // (build_vector $src0, (lshr_oneuse SReg_32:$src1, 16))
  //  => (S_PACK_LH_B32_B16 $src0, $src1)
  // (build_vector $src0, $src1)
  //  => (S_PACK_LL_B32_B16 $src0, $src1)

  bool Shift0 = mi_match(
      Src0, *MRI, m_OneUse(m_GLShr(m_Reg(ShiftSrc0), m_SpecificICst(16))));

  bool Shift1 = mi_match(
      Src1, *MRI, m_OneUse(m_GLShr(m_Reg(ShiftSrc1), m_SpecificICst(16))));

  unsigned Opc = AMDGPU::S_PACK_LL_B32_B16;
  if (Shift0 && Shift1) {
    Opc = AMDGPU::S_PACK_HH_B32_B16;
    MI.getOperand(1).setReg(ShiftSrc0);
    MI.getOperand(2).setReg(ShiftSrc1);
  } else if (Shift1) {
    Opc = AMDGPU::S_PACK_LH_B32_B16;
    MI.getOperand(2).setReg(ShiftSrc1);
  } else if (Shift0) {
    auto ConstSrc1 =
        getAnyConstantVRegValWithLookThrough(Src1, *MRI, true, true);
    if (ConstSrc1 && ConstSrc1->Value == 0) {
      // build_vector_trunc (lshr $src0, 16), 0 -> s_lshr_b32 $src0, 16
      auto MIB = BuildMI(*BB, &MI, DL, TII.get(AMDGPU::S_LSHR_B32), Dst)
                     .addReg(ShiftSrc0)
                     .addImm(16)
                     .setOperandDead(3); // Dead scc

      MI.eraseFromParent();
      return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
    }
    if (STI.hasSPackHL()) {
      Opc = AMDGPU::S_PACK_HL_B32_B16;
      MI.getOperand(1).setReg(ShiftSrc0);
    }
  }

  MI.setDesc(TII.get(Opc));
  return constrainSelectedInstRegOperands(MI, TII, TRI, RBI);
}

bool AMDGPUInstructionSelector::selectG_IMPLICIT_DEF(MachineInstr &I) const {
  const MachineOperand &MO = I.getOperand(0);

  // FIXME: Interface for getConstrainedRegClassForOperand needs work. The
  // regbank check here is to know why getConstrainedRegClassForOperand failed.
  const TargetRegisterClass *RC = TRI.getConstrainedRegClassForOperand(MO, *MRI);
  if ((!RC && !MRI->getRegBankOrNull(MO.getReg())) ||
      (RC && RBI.constrainGenericRegister(MO.getReg(), *RC, *MRI))) {
    I.setDesc(TII.get(TargetOpcode::IMPLICIT_DEF));
    return true;
  }

  return false;
}

bool AMDGPUInstructionSelector::selectG_INSERT(MachineInstr &I) const {
  MachineBasicBlock *BB = I.getParent();

  Register DstReg = I.getOperand(0).getReg();
  Register Src0Reg = I.getOperand(1).getReg();
  Register Src1Reg = I.getOperand(2).getReg();
  LLT Src1Ty = MRI->getType(Src1Reg);

  unsigned DstSize = MRI->getType(DstReg).getSizeInBits();
  unsigned InsSize = Src1Ty.getSizeInBits();

  int64_t Offset = I.getOperand(3).getImm();

  // FIXME: These cases should have been illegal and unnecessary to check here.
  if (Offset % 32 != 0 || InsSize % 32 != 0)
    return false;

  // Currently not handled by getSubRegFromChannel.
  if (InsSize > 128)
    return false;

  unsigned SubReg = TRI.getSubRegFromChannel(Offset / 32, InsSize / 32);
  if (SubReg == AMDGPU::NoSubRegister)
    return false;

  const RegisterBank *DstBank = RBI.getRegBank(DstReg, *MRI, TRI);
  const TargetRegisterClass *DstRC =
      TRI.getRegClassForSizeOnBank(DstSize, *DstBank);
  if (!DstRC)
    return false;

  const RegisterBank *Src0Bank = RBI.getRegBank(Src0Reg, *MRI, TRI);
  const RegisterBank *Src1Bank = RBI.getRegBank(Src1Reg, *MRI, TRI);
  const TargetRegisterClass *Src0RC =
      TRI.getRegClassForSizeOnBank(DstSize, *Src0Bank);
  const TargetRegisterClass *Src1RC =
      TRI.getRegClassForSizeOnBank(InsSize, *Src1Bank);

  // Deal with weird cases where the class only partially supports the subreg
  // index.
  Src0RC = TRI.getSubClassWithSubReg(Src0RC, SubReg);
  if (!Src0RC || !Src1RC)
    return false;

  if (!RBI.constrainGenericRegister(DstReg, *DstRC, *MRI) ||
      !RBI.constrainGenericRegister(Src0Reg, *Src0RC, *MRI) ||
      !RBI.constrainGenericRegister(Src1Reg, *Src1RC, *MRI))
    return false;

  const DebugLoc &DL = I.getDebugLoc();
  BuildMI(*BB, &I, DL, TII.get(TargetOpcode::INSERT_SUBREG), DstReg)
      .addReg(Src0Reg)
      .addReg(Src1Reg)
      .addImm(SubReg);

  I.eraseFromParent();
  return true;
}

bool AMDGPUInstructionSelector::selectG_SBFX_UBFX(MachineInstr &MI) const {
  Register DstReg = MI.getOperand(0).getReg();
  Register SrcReg = MI.getOperand(1).getReg();
  Register OffsetReg = MI.getOperand(2).getReg();
  Register WidthReg = MI.getOperand(3).getReg();

  assert(RBI.getRegBank(DstReg, *MRI, TRI)->getID() == AMDGPU::VGPRRegBankID &&
         "scalar BFX instructions are expanded in regbankselect");
  assert(MRI->getType(MI.getOperand(0).getReg()).getSizeInBits() == 32 &&
         "64-bit vector BFX instructions are expanded in regbankselect");

  const DebugLoc &DL = MI.getDebugLoc();
  MachineBasicBlock *MBB = MI.getParent();

  bool IsSigned = MI.getOpcode() == TargetOpcode::G_SBFX;
  unsigned Opc = IsSigned ? AMDGPU::V_BFE_I32_e64 : AMDGPU::V_BFE_U32_e64;
  auto MIB = BuildMI(*MBB, &MI, DL, TII.get(Opc), DstReg)
                 .addReg(SrcReg)
                 .addReg(OffsetReg)
                 .addReg(WidthReg);
  MI.eraseFromParent();
  return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
}

bool AMDGPUInstructionSelector::selectInterpP1F16(MachineInstr &MI) const {
  if (STI.getLDSBankCount() != 16)
    return selectImpl(MI, *CoverageInfo);

  Register Dst = MI.getOperand(0).getReg();
  Register Src0 = MI.getOperand(2).getReg();
  Register M0Val = MI.getOperand(6).getReg();
  if (!RBI.constrainGenericRegister(M0Val, AMDGPU::SReg_32RegClass, *MRI) ||
      !RBI.constrainGenericRegister(Dst, AMDGPU::VGPR_32RegClass, *MRI) ||
      !RBI.constrainGenericRegister(Src0, AMDGPU::VGPR_32RegClass, *MRI))
    return false;

  // This requires 2 instructions. It is possible to write a pattern to support
  // this, but the generated isel emitter doesn't correctly deal with multiple
  // output instructions using the same physical register input. The copy to m0
  // is incorrectly placed before the second instruction.
  //
  // TODO: Match source modifiers.

  Register InterpMov = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
  const DebugLoc &DL = MI.getDebugLoc();
  MachineBasicBlock *MBB = MI.getParent();

  BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
      .addReg(M0Val);
  BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::V_INTERP_MOV_F32), InterpMov)
      .addImm(2)
      .addImm(MI.getOperand(4).getImm())  // $attr
      .addImm(MI.getOperand(3).getImm()); // $attrchan

  BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::V_INTERP_P1LV_F16), Dst)
      .addImm(0)                          // $src0_modifiers
      .addReg(Src0)                       // $src0
      .addImm(MI.getOperand(4).getImm())  // $attr
      .addImm(MI.getOperand(3).getImm())  // $attrchan
      .addImm(0)                          // $src2_modifiers
      .addReg(InterpMov)                  // $src2 - 2 f16 values selected by high
      .addImm(MI.getOperand(5).getImm())  // $high
      .addImm(0)                          // $clamp
      .addImm(0);                         // $omod

  MI.eraseFromParent();
  return true;
}

// Writelane is special in that it can use SGPR and M0 (which would normally
// count as using the constant bus twice - but in this case it is allowed since
// the lane selector doesn't count as a use of the constant bus). However, it
// is still required to abide by the 1 SGPR rule. Fix this up if we might have
// multiple SGPRs.
bool AMDGPUInstructionSelector::selectWritelane(MachineInstr &MI) const {
  // With a constant bus limit of at least 2, there's no issue.
  if (STI.getConstantBusLimit(AMDGPU::V_WRITELANE_B32) > 1)
    return selectImpl(MI, *CoverageInfo);

  MachineBasicBlock *MBB = MI.getParent();
  const DebugLoc &DL = MI.getDebugLoc();
  Register VDst = MI.getOperand(0).getReg();
  Register Val = MI.getOperand(2).getReg();
  Register LaneSelect = MI.getOperand(3).getReg();
  Register VDstIn = MI.getOperand(4).getReg();

  auto MIB = BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::V_WRITELANE_B32), VDst);

  std::optional<ValueAndVReg> ConstSelect =
      getIConstantVRegValWithLookThrough(LaneSelect, *MRI);
  if (ConstSelect) {
    // The selector has to be an inline immediate, so we can use whatever for
    // the other operands.
    MIB.addReg(Val);
    MIB.addImm(ConstSelect->Value.getSExtValue() &
               maskTrailingOnes<uint64_t>(STI.getWavefrontSizeLog2()));
  } else {
    std::optional<ValueAndVReg> ConstVal =
        getIConstantVRegValWithLookThrough(Val, *MRI);

    // If the value written is an inline immediate, we can get away without a
    // copy to m0.
    if (ConstVal && AMDGPU::isInlinableLiteral32(ConstVal->Value.getSExtValue(),
                                                 STI.hasInv2PiInlineImm())) {
      MIB.addImm(ConstVal->Value.getSExtValue());
      MIB.addReg(LaneSelect);
    } else {
      MIB.addReg(Val);

      // If the lane selector was originally in a VGPR and copied with
      // readfirstlane, there's a hazard to read the same SGPR from the
      // VALU. Constrain to a different SGPR to help avoid needing a nop later.
      RBI.constrainGenericRegister(LaneSelect, AMDGPU::SReg_32_XM0RegClass, *MRI);

      BuildMI(*MBB, *MIB, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
          .addReg(LaneSelect);
      MIB.addReg(AMDGPU::M0);
    }
  }

  MIB.addReg(VDstIn);

  MI.eraseFromParent();
  return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
}

// We need to handle this here because tablegen doesn't support matching
// instructions with multiple outputs.
bool AMDGPUInstructionSelector::selectDivScale(MachineInstr &MI) const {
  Register Dst0 = MI.getOperand(0).getReg();
  Register Dst1 = MI.getOperand(1).getReg();

  LLT Ty = MRI->getType(Dst0);
  unsigned Opc;
  if (Ty == LLT::scalar(32))
    Opc = AMDGPU::V_DIV_SCALE_F32_e64;
  else if (Ty == LLT::scalar(64))
    Opc = AMDGPU::V_DIV_SCALE_F64_e64;
  else
    return false;

  // TODO: Match source modifiers.

  const DebugLoc &DL = MI.getDebugLoc();
  MachineBasicBlock *MBB = MI.getParent();

  Register Numer = MI.getOperand(3).getReg();
  Register Denom = MI.getOperand(4).getReg();
  unsigned ChooseDenom = MI.getOperand(5).getImm();

  Register Src0 = ChooseDenom != 0 ? Numer : Denom;

  auto MIB = BuildMI(*MBB, &MI, DL, TII.get(Opc), Dst0)
                 .addDef(Dst1)
                 .addImm(0)     // $src0_modifiers
                 .addUse(Src0)  // $src0
                 .addImm(0)     // $src1_modifiers
                 .addUse(Denom) // $src1
                 .addImm(0)     // $src2_modifiers
                 .addUse(Numer) // $src2
                 .addImm(0)     // $clamp
                 .addImm(0);    // $omod

  MI.eraseFromParent();
  return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
}

bool AMDGPUInstructionSelector::selectG_INTRINSIC(MachineInstr &I) const {
  Intrinsic::ID IntrinsicID = cast<GIntrinsic>(I).getIntrinsicID();
  switch (IntrinsicID) {
  case Intrinsic::amdgcn_if_break: {
    MachineBasicBlock *BB = I.getParent();

    // FIXME: Manually selecting to avoid dealing with the SReg_1 trick
    // SelectionDAG uses for wave32 vs wave64.
    BuildMI(*BB, &I, I.getDebugLoc(), TII.get(AMDGPU::SI_IF_BREAK))
        .add(I.getOperand(0))
        .add(I.getOperand(2))
        .add(I.getOperand(3));

    Register DstReg = I.getOperand(0).getReg();
    Register Src0Reg = I.getOperand(2).getReg();
    Register Src1Reg = I.getOperand(3).getReg();

    I.eraseFromParent();

    for (Register Reg : { DstReg, Src0Reg, Src1Reg })
      MRI->setRegClass(Reg, TRI.getWaveMaskRegClass());

    return true;
  }
  case Intrinsic::amdgcn_interp_p1_f16:
    return selectInterpP1F16(I);
  case Intrinsic::amdgcn_wqm:
    return constrainCopyLikeIntrin(I, AMDGPU::WQM);
  case Intrinsic::amdgcn_softwqm:
    return constrainCopyLikeIntrin(I, AMDGPU::SOFT_WQM);
  case Intrinsic::amdgcn_strict_wwm:
  case Intrinsic::amdgcn_wwm:
    return constrainCopyLikeIntrin(I, AMDGPU::STRICT_WWM);
  case Intrinsic::amdgcn_strict_wqm:
    return constrainCopyLikeIntrin(I, AMDGPU::STRICT_WQM);
  case Intrinsic::amdgcn_writelane:
    return selectWritelane(I);
  case Intrinsic::amdgcn_div_scale:
    return selectDivScale(I);
  case Intrinsic::amdgcn_icmp:
  case Intrinsic::amdgcn_fcmp:
    if (selectImpl(I, *CoverageInfo))
      return true;
    return selectIntrinsicCmp(I);
  case Intrinsic::amdgcn_ballot:
    return selectBallot(I);
  case Intrinsic::amdgcn_reloc_constant:
    return selectRelocConstant(I);
  case Intrinsic::amdgcn_groupstaticsize:
    return selectGroupStaticSize(I);
  case Intrinsic::returnaddress:
    return selectReturnAddress(I);
  case Intrinsic::amdgcn_smfmac_f32_16x16x32_f16:
  case Intrinsic::amdgcn_smfmac_f32_32x32x16_f16:
  case Intrinsic::amdgcn_smfmac_f32_16x16x32_bf16:
  case Intrinsic::amdgcn_smfmac_f32_32x32x16_bf16:
  case Intrinsic::amdgcn_smfmac_i32_16x16x64_i8:
  case Intrinsic::amdgcn_smfmac_i32_32x32x32_i8:
  case Intrinsic::amdgcn_smfmac_f32_16x16x64_bf8_bf8:
  case Intrinsic::amdgcn_smfmac_f32_16x16x64_bf8_fp8:
  case Intrinsic::amdgcn_smfmac_f32_16x16x64_fp8_bf8:
  case Intrinsic::amdgcn_smfmac_f32_16x16x64_fp8_fp8:
  case Intrinsic::amdgcn_smfmac_f32_32x32x32_bf8_bf8:
  case Intrinsic::amdgcn_smfmac_f32_32x32x32_bf8_fp8:
  case Intrinsic::amdgcn_smfmac_f32_32x32x32_fp8_bf8:
  case Intrinsic::amdgcn_smfmac_f32_32x32x32_fp8_fp8:
  case Intrinsic::amdgcn_smfmac_f32_16x16x64_f16:
  case Intrinsic::amdgcn_smfmac_f32_32x32x32_f16:
  case Intrinsic::amdgcn_smfmac_f32_16x16x64_bf16:
  case Intrinsic::amdgcn_smfmac_f32_32x32x32_bf16:
  case Intrinsic::amdgcn_smfmac_i32_16x16x128_i8:
  case Intrinsic::amdgcn_smfmac_i32_32x32x64_i8:
  case Intrinsic::amdgcn_smfmac_f32_16x16x128_bf8_bf8:
  case Intrinsic::amdgcn_smfmac_f32_16x16x128_bf8_fp8:
  case Intrinsic::amdgcn_smfmac_f32_16x16x128_fp8_bf8:
  case Intrinsic::amdgcn_smfmac_f32_16x16x128_fp8_fp8:
  case Intrinsic::amdgcn_smfmac_f32_32x32x64_bf8_bf8:
  case Intrinsic::amdgcn_smfmac_f32_32x32x64_bf8_fp8:
  case Intrinsic::amdgcn_smfmac_f32_32x32x64_fp8_bf8:
  case Intrinsic::amdgcn_smfmac_f32_32x32x64_fp8_fp8:
    return selectSMFMACIntrin(I);
  case Intrinsic::amdgcn_permlane16_swap:
  case Intrinsic::amdgcn_permlane32_swap:
    return selectPermlaneSwapIntrin(I, IntrinsicID);
  default:
    return selectImpl(I, *CoverageInfo);
  }
}

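// Map a comparison predicate and operand size to the corresponding VALU V_CMP
// opcode, choosing between the true16, fake16 and legacy encodings for 16-bit
// compares. Returns -1 if the size is unsupported.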
static int getV_CMPOpcode(CmpInst::Predicate P, unsigned Size,
                          const GCNSubtarget &ST) {
  if (Size != 16 && Size != 32 && Size != 64)
    return -1;

  if (Size == 16 && !ST.has16BitInsts())
    return -1;

  const auto Select = [&](unsigned S16Opc, unsigned TrueS16Opc,
                          unsigned FakeS16Opc, unsigned S32Opc,
                          unsigned S64Opc) {
    if (Size == 16)
      return ST.hasTrue16BitInsts()
                 ? ST.useRealTrue16Insts() ? TrueS16Opc : FakeS16Opc
                 : S16Opc;
    if (Size == 32)
      return S32Opc;
    return S64Opc;
  };

  switch (P) {
  default:
    llvm_unreachable("Unknown condition code!");
  case CmpInst::ICMP_NE:
    return Select(AMDGPU::V_CMP_NE_U16_e64, AMDGPU::V_CMP_NE_U16_t16_e64,
                  AMDGPU::V_CMP_NE_U16_fake16_e64, AMDGPU::V_CMP_NE_U32_e64,
                  AMDGPU::V_CMP_NE_U64_e64);
  case CmpInst::ICMP_EQ:
    return Select(AMDGPU::V_CMP_EQ_U16_e64, AMDGPU::V_CMP_EQ_U16_t16_e64,
                  AMDGPU::V_CMP_EQ_U16_fake16_e64, AMDGPU::V_CMP_EQ_U32_e64,
                  AMDGPU::V_CMP_EQ_U64_e64);
  case CmpInst::ICMP_SGT:
    return Select(AMDGPU::V_CMP_GT_I16_e64, AMDGPU::V_CMP_GT_I16_t16_e64,
                  AMDGPU::V_CMP_GT_I16_fake16_e64, AMDGPU::V_CMP_GT_I32_e64,
                  AMDGPU::V_CMP_GT_I64_e64);
  case CmpInst::ICMP_SGE:
    return Select(AMDGPU::V_CMP_GE_I16_e64, AMDGPU::V_CMP_GE_I16_t16_e64,
                  AMDGPU::V_CMP_GE_I16_fake16_e64, AMDGPU::V_CMP_GE_I32_e64,
                  AMDGPU::V_CMP_GE_I64_e64);
  case CmpInst::ICMP_SLT:
    return Select(AMDGPU::V_CMP_LT_I16_e64, AMDGPU::V_CMP_LT_I16_t16_e64,
                  AMDGPU::V_CMP_LT_I16_fake16_e64, AMDGPU::V_CMP_LT_I32_e64,
                  AMDGPU::V_CMP_LT_I64_e64);
  case CmpInst::ICMP_SLE:
    return Select(AMDGPU::V_CMP_LE_I16_e64, AMDGPU::V_CMP_LE_I16_t16_e64,
                  AMDGPU::V_CMP_LE_I16_fake16_e64, AMDGPU::V_CMP_LE_I32_e64,
                  AMDGPU::V_CMP_LE_I64_e64);
  case CmpInst::ICMP_UGT:
    return Select(AMDGPU::V_CMP_GT_U16_e64, AMDGPU::V_CMP_GT_U16_t16_e64,
                  AMDGPU::V_CMP_GT_U16_fake16_e64, AMDGPU::V_CMP_GT_U32_e64,
                  AMDGPU::V_CMP_GT_U64_e64);
  case CmpInst::ICMP_UGE:
    return Select(AMDGPU::V_CMP_GE_U16_e64, AMDGPU::V_CMP_GE_U16_t16_e64,
                  AMDGPU::V_CMP_GE_U16_fake16_e64, AMDGPU::V_CMP_GE_U32_e64,
                  AMDGPU::V_CMP_GE_U64_e64);
  case CmpInst::ICMP_ULT:
    return Select(AMDGPU::V_CMP_LT_U16_e64, AMDGPU::V_CMP_LT_U16_t16_e64,
                  AMDGPU::V_CMP_LT_U16_fake16_e64, AMDGPU::V_CMP_LT_U32_e64,
                  AMDGPU::V_CMP_LT_U64_e64);
  case CmpInst::ICMP_ULE:
    return Select(AMDGPU::V_CMP_LE_U16_e64, AMDGPU::V_CMP_LE_U16_t16_e64,
                  AMDGPU::V_CMP_LE_U16_fake16_e64, AMDGPU::V_CMP_LE_U32_e64,
                  AMDGPU::V_CMP_LE_U64_e64);

  case CmpInst::FCMP_OEQ:
    return Select(AMDGPU::V_CMP_EQ_F16_e64, AMDGPU::V_CMP_EQ_F16_t16_e64,
                  AMDGPU::V_CMP_EQ_F16_fake16_e64, AMDGPU::V_CMP_EQ_F32_e64,
                  AMDGPU::V_CMP_EQ_F64_e64);
  case CmpInst::FCMP_OGT:
    return Select(AMDGPU::V_CMP_GT_F16_e64, AMDGPU::V_CMP_GT_F16_t16_e64,
                  AMDGPU::V_CMP_GT_F16_fake16_e64, AMDGPU::V_CMP_GT_F32_e64,
                  AMDGPU::V_CMP_GT_F64_e64);
  case CmpInst::FCMP_OGE:
    return Select(AMDGPU::V_CMP_GE_F16_e64, AMDGPU::V_CMP_GE_F16_t16_e64,
                  AMDGPU::V_CMP_GE_F16_fake16_e64, AMDGPU::V_CMP_GE_F32_e64,
                  AMDGPU::V_CMP_GE_F64_e64);
  case CmpInst::FCMP_OLT:
    return Select(AMDGPU::V_CMP_LT_F16_e64, AMDGPU::V_CMP_LT_F16_t16_e64,
                  AMDGPU::V_CMP_LT_F16_fake16_e64, AMDGPU::V_CMP_LT_F32_e64,
                  AMDGPU::V_CMP_LT_F64_e64);
  case CmpInst::FCMP_OLE:
    return Select(AMDGPU::V_CMP_LE_F16_e64, AMDGPU::V_CMP_LE_F16_t16_e64,
                  AMDGPU::V_CMP_LE_F16_fake16_e64, AMDGPU::V_CMP_LE_F32_e64,
                  AMDGPU::V_CMP_LE_F64_e64);
  case CmpInst::FCMP_ONE:
    return Select(AMDGPU::V_CMP_NEQ_F16_e64, AMDGPU::V_CMP_NEQ_F16_t16_e64,
                  AMDGPU::V_CMP_NEQ_F16_fake16_e64, AMDGPU::V_CMP_NEQ_F32_e64,
                  AMDGPU::V_CMP_NEQ_F64_e64);
  case CmpInst::FCMP_ORD:
    return Select(AMDGPU::V_CMP_O_F16_e64, AMDGPU::V_CMP_O_F16_t16_e64,
                  AMDGPU::V_CMP_O_F16_fake16_e64, AMDGPU::V_CMP_O_F32_e64,
                  AMDGPU::V_CMP_O_F64_e64);
  case CmpInst::FCMP_UNO:
    return Select(AMDGPU::V_CMP_U_F16_e64, AMDGPU::V_CMP_U_F16_t16_e64,
                  AMDGPU::V_CMP_U_F16_fake16_e64, AMDGPU::V_CMP_U_F32_e64,
                  AMDGPU::V_CMP_U_F64_e64);
  case CmpInst::FCMP_UEQ:
    return Select(AMDGPU::V_CMP_NLG_F16_e64, AMDGPU::V_CMP_NLG_F16_t16_e64,
                  AMDGPU::V_CMP_NLG_F16_fake16_e64, AMDGPU::V_CMP_NLG_F32_e64,
                  AMDGPU::V_CMP_NLG_F64_e64);
  case CmpInst::FCMP_UGT:
    return Select(AMDGPU::V_CMP_NLE_F16_e64, AMDGPU::V_CMP_NLE_F16_t16_e64,
                  AMDGPU::V_CMP_NLE_F16_fake16_e64, AMDGPU::V_CMP_NLE_F32_e64,
                  AMDGPU::V_CMP_NLE_F64_e64);
  case CmpInst::FCMP_UGE:
    return Select(AMDGPU::V_CMP_NLT_F16_e64, AMDGPU::V_CMP_NLT_F16_t16_e64,
                  AMDGPU::V_CMP_NLT_F16_fake16_e64, AMDGPU::V_CMP_NLT_F32_e64,
                  AMDGPU::V_CMP_NLT_F64_e64);
  case CmpInst::FCMP_ULT:
    return Select(AMDGPU::V_CMP_NGE_F16_e64, AMDGPU::V_CMP_NGE_F16_t16_e64,
                  AMDGPU::V_CMP_NGE_F16_fake16_e64, AMDGPU::V_CMP_NGE_F32_e64,
                  AMDGPU::V_CMP_NGE_F64_e64);
  case CmpInst::FCMP_ULE:
    return Select(AMDGPU::V_CMP_NGT_F16_e64, AMDGPU::V_CMP_NGT_F16_t16_e64,
                  AMDGPU::V_CMP_NGT_F16_fake16_e64, AMDGPU::V_CMP_NGT_F32_e64,
                  AMDGPU::V_CMP_NGT_F64_e64);
  case CmpInst::FCMP_UNE:
    return Select(AMDGPU::V_CMP_NEQ_F16_e64, AMDGPU::V_CMP_NEQ_F16_t16_e64,
                  AMDGPU::V_CMP_NEQ_F16_fake16_e64, AMDGPU::V_CMP_NEQ_F32_e64,
                  AMDGPU::V_CMP_NEQ_F64_e64);
  case CmpInst::FCMP_TRUE:
    return Select(AMDGPU::V_CMP_TRU_F16_e64, AMDGPU::V_CMP_TRU_F16_t16_e64,
                  AMDGPU::V_CMP_TRU_F16_fake16_e64, AMDGPU::V_CMP_TRU_F32_e64,
                  AMDGPU::V_CMP_TRU_F64_e64);
  case CmpInst::FCMP_FALSE:
    return Select(AMDGPU::V_CMP_F_F16_e64, AMDGPU::V_CMP_F_F16_t16_e64,
                  AMDGPU::V_CMP_F_F16_fake16_e64, AMDGPU::V_CMP_F_F32_e64,
                  AMDGPU::V_CMP_F_F64_e64);
  }
}

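// Map a comparison predicate and operand size to a scalar S_CMP opcode, or
// return -1 if the subtarget has no SALU form for this predicate/size.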
int AMDGPUInstructionSelector::getS_CMPOpcode(CmpInst::Predicate P,
                                              unsigned Size) const {
  if (Size == 64) {
    if (!STI.hasScalarCompareEq64())
      return -1;

    switch (P) {
    case CmpInst::ICMP_NE:
      return AMDGPU::S_CMP_LG_U64;
    case CmpInst::ICMP_EQ:
      return AMDGPU::S_CMP_EQ_U64;
    default:
      return -1;
    }
  }

  if (Size == 32) {
    switch (P) {
    case CmpInst::ICMP_NE:
      return AMDGPU::S_CMP_LG_U32;
    case CmpInst::ICMP_EQ:
      return AMDGPU::S_CMP_EQ_U32;
    case CmpInst::ICMP_SGT:
      return AMDGPU::S_CMP_GT_I32;
    case CmpInst::ICMP_SGE:
      return AMDGPU::S_CMP_GE_I32;
    case CmpInst::ICMP_SLT:
      return AMDGPU::S_CMP_LT_I32;
    case CmpInst::ICMP_SLE:
      return AMDGPU::S_CMP_LE_I32;
    case CmpInst::ICMP_UGT:
      return AMDGPU::S_CMP_GT_U32;
    case CmpInst::ICMP_UGE:
      return AMDGPU::S_CMP_GE_U32;
    case CmpInst::ICMP_ULT:
      return AMDGPU::S_CMP_LT_U32;
    case CmpInst::ICMP_ULE:
      return AMDGPU::S_CMP_LE_U32;
    case CmpInst::FCMP_OEQ:
      return AMDGPU::S_CMP_EQ_F32;
    case CmpInst::FCMP_OGT:
      return AMDGPU::S_CMP_GT_F32;
    case CmpInst::FCMP_OGE:
      return AMDGPU::S_CMP_GE_F32;
    case CmpInst::FCMP_OLT:
      return AMDGPU::S_CMP_LT_F32;
    case CmpInst::FCMP_OLE:
      return AMDGPU::S_CMP_LE_F32;
    case CmpInst::FCMP_ONE:
      return AMDGPU::S_CMP_LG_F32;
    case CmpInst::FCMP_ORD:
      return AMDGPU::S_CMP_O_F32;
    case CmpInst::FCMP_UNO:
      return AMDGPU::S_CMP_U_F32;
    case CmpInst::FCMP_UEQ:
      return AMDGPU::S_CMP_NLG_F32;
    case CmpInst::FCMP_UGT:
      return AMDGPU::S_CMP_NLE_F32;
    case CmpInst::FCMP_UGE:
      return AMDGPU::S_CMP_NLT_F32;
    case CmpInst::FCMP_ULT:
      return AMDGPU::S_CMP_NGE_F32;
    case CmpInst::FCMP_ULE:
      return AMDGPU::S_CMP_NGT_F32;
    case CmpInst::FCMP_UNE:
      return AMDGPU::S_CMP_NEQ_F32;
    default:
      llvm_unreachable("Unknown condition code!");
    }
  }

  if (Size == 16) {
    if (!STI.hasSALUFloatInsts())
      return -1;

    switch (P) {
    case CmpInst::FCMP_OEQ:
      return AMDGPU::S_CMP_EQ_F16;
    case CmpInst::FCMP_OGT:
      return AMDGPU::S_CMP_GT_F16;
    case CmpInst::FCMP_OGE:
      return AMDGPU::S_CMP_GE_F16;
    case CmpInst::FCMP_OLT:
      return AMDGPU::S_CMP_LT_F16;
    case CmpInst::FCMP_OLE:
      return AMDGPU::S_CMP_LE_F16;
    case CmpInst::FCMP_ONE:
      return AMDGPU::S_CMP_LG_F16;
    case CmpInst::FCMP_ORD:
      return AMDGPU::S_CMP_O_F16;
    case CmpInst::FCMP_UNO:
      return AMDGPU::S_CMP_U_F16;
    case CmpInst::FCMP_UEQ:
      return AMDGPU::S_CMP_NLG_F16;
    case CmpInst::FCMP_UGT:
      return AMDGPU::S_CMP_NLE_F16;
    case CmpInst::FCMP_UGE:
      return AMDGPU::S_CMP_NLT_F16;
    case CmpInst::FCMP_ULT:
      return AMDGPU::S_CMP_NGE_F16;
    case CmpInst::FCMP_ULE:
      return AMDGPU::S_CMP_NGT_F16;
    case CmpInst::FCMP_UNE:
      return AMDGPU::S_CMP_NEQ_F16;
    default:
      llvm_unreachable("Unknown condition code!");
    }
  }

  return -1;
}

1442bool AMDGPUInstructionSelector::selectG_ICMP_or_FCMP(MachineInstr &I) const {
1443
1444 MachineBasicBlock *BB = I.getParent();
1445 const DebugLoc &DL = I.getDebugLoc();
1446
1447 Register SrcReg = I.getOperand(i: 2).getReg();
1448 unsigned Size = RBI.getSizeInBits(Reg: SrcReg, MRI: *MRI, TRI);
1449
1450 auto Pred = (CmpInst::Predicate)I.getOperand(i: 1).getPredicate();
1451
1452 Register CCReg = I.getOperand(i: 0).getReg();
1453 if (!isVCC(Reg: CCReg, MRI: *MRI)) {
1454 int Opcode = getS_CMPOpcode(P: Pred, Size);
1455 if (Opcode == -1)
1456 return false;
1457 MachineInstr *ICmp = BuildMI(BB&: *BB, I: &I, MIMD: DL, MCID: TII.get(Opcode))
1458 .add(MO: I.getOperand(i: 2))
1459 .add(MO: I.getOperand(i: 3));
1460 BuildMI(BB&: *BB, I: &I, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::COPY), DestReg: CCReg)
1461 .addReg(RegNo: AMDGPU::SCC);
1462 bool Ret =
1463 constrainSelectedInstRegOperands(I&: *ICmp, TII, TRI, RBI) &&
1464 RBI.constrainGenericRegister(Reg: CCReg, RC: AMDGPU::SReg_32RegClass, MRI&: *MRI);
1465 I.eraseFromParent();
1466 return Ret;
1467 }
1468
1469 if (I.getOpcode() == AMDGPU::G_FCMP)
1470 return false;
1471
1472 int Opcode = getV_CMPOpcode(P: Pred, Size, ST: *Subtarget);
1473 if (Opcode == -1)
1474 return false;
1475
1476 MachineInstrBuilder ICmp;
1477 // t16 instructions
1478 if (AMDGPU::hasNamedOperand(Opcode, NamedIdx: AMDGPU::OpName::src0_modifiers)) {
1479 ICmp = BuildMI(BB&: *BB, I: &I, MIMD: DL, MCID: TII.get(Opcode), DestReg: I.getOperand(i: 0).getReg())
1480 .addImm(Val: 0)
1481 .add(MO: I.getOperand(i: 2))
1482 .addImm(Val: 0)
1483 .add(MO: I.getOperand(i: 3))
1484 .addImm(Val: 0); // op_sel
1485 } else {
1486 ICmp = BuildMI(BB&: *BB, I: &I, MIMD: DL, MCID: TII.get(Opcode), DestReg: I.getOperand(i: 0).getReg())
1487 .add(MO: I.getOperand(i: 2))
1488 .add(MO: I.getOperand(i: 3));
1489 }
1490
1491 RBI.constrainGenericRegister(Reg: ICmp->getOperand(i: 0).getReg(),
1492 RC: *TRI.getBoolRC(), MRI&: *MRI);
1493 bool Ret = constrainSelectedInstRegOperands(I&: *ICmp, TII, TRI, RBI);
1494 I.eraseFromParent();
1495 return Ret;
1496}
1497
1498bool AMDGPUInstructionSelector::selectIntrinsicCmp(MachineInstr &I) const {
1499 Register Dst = I.getOperand(i: 0).getReg();
1500 if (isVCC(Reg: Dst, MRI: *MRI))
1501 return false;
1502
1503 LLT DstTy = MRI->getType(Reg: Dst);
1504 if (DstTy.getSizeInBits() != STI.getWavefrontSize())
1505 return false;
1506
1507 MachineBasicBlock *BB = I.getParent();
1508 const DebugLoc &DL = I.getDebugLoc();
1509 Register SrcReg = I.getOperand(i: 2).getReg();
1510 unsigned Size = RBI.getSizeInBits(Reg: SrcReg, MRI: *MRI, TRI);
1511
1512 // i1 inputs are not supported in GlobalISel.
1513 if (Size == 1)
1514 return false;
1515
1516 auto Pred = static_cast<CmpInst::Predicate>(I.getOperand(i: 4).getImm());
1517 if (!CmpInst::isIntPredicate(P: Pred) && !CmpInst::isFPPredicate(P: Pred)) {
1518 BuildMI(BB&: *BB, I: &I, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::IMPLICIT_DEF), DestReg: Dst);
1519 I.eraseFromParent();
1520 return RBI.constrainGenericRegister(Reg: Dst, RC: *TRI.getBoolRC(), MRI&: *MRI);
1521 }
1522
1523 const int Opcode = getV_CMPOpcode(P: Pred, Size, ST: *Subtarget);
1524 if (Opcode == -1)
1525 return false;
1526
1527 MachineInstrBuilder SelectedMI;
1528 MachineOperand &LHS = I.getOperand(i: 2);
1529 MachineOperand &RHS = I.getOperand(i: 3);
1530 auto [Src0, Src0Mods] = selectVOP3ModsImpl(Src: LHS.getReg());
1531 auto [Src1, Src1Mods] = selectVOP3ModsImpl(Src: RHS.getReg());
1532 Register Src0Reg =
1533 copyToVGPRIfSrcFolded(Src: Src0, Mods: Src0Mods, Root: LHS, InsertPt: &I, /*ForceVGPR*/ true);
1534 Register Src1Reg =
1535 copyToVGPRIfSrcFolded(Src: Src1, Mods: Src1Mods, Root: RHS, InsertPt: &I, /*ForceVGPR*/ true);
1536 SelectedMI = BuildMI(BB&: *BB, I: &I, MIMD: DL, MCID: TII.get(Opcode), DestReg: Dst);
1537 if (AMDGPU::hasNamedOperand(Opcode, NamedIdx: AMDGPU::OpName::src0_modifiers))
1538 SelectedMI.addImm(Val: Src0Mods);
1539 SelectedMI.addReg(RegNo: Src0Reg);
1540 if (AMDGPU::hasNamedOperand(Opcode, NamedIdx: AMDGPU::OpName::src1_modifiers))
1541 SelectedMI.addImm(Val: Src1Mods);
1542 SelectedMI.addReg(RegNo: Src1Reg);
1543 if (AMDGPU::hasNamedOperand(Opcode, NamedIdx: AMDGPU::OpName::clamp))
1544 SelectedMI.addImm(Val: 0); // clamp
1545 if (AMDGPU::hasNamedOperand(Opcode, NamedIdx: AMDGPU::OpName::op_sel))
1546 SelectedMI.addImm(Val: 0); // op_sel
1547
1548 RBI.constrainGenericRegister(Reg: Dst, RC: *TRI.getBoolRC(), MRI&: *MRI);
1549 if (!constrainSelectedInstRegOperands(I&: *SelectedMI, TII, TRI, RBI))
1550 return false;
1551
1552 I.eraseFromParent();
1553 return true;
1554}
1555
// Ballot has to zero out the bits of the input lane-mask that are zero in the
// current exec; this is done as an AND with exec. For inputs produced by
// instructions that already implicitly use the same exec, for example compares
// in the same basic block or an SCC-to-VCC copy, a plain copy is enough.
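// For example, on wave32 a ballot of a constant true is selected below as a
// plain COPY of EXEC_LO, a constant false becomes S_MOV_B32 0, and a generic
// i1 input becomes S_AND_B32 with exec unless it is a lane mask produced in
// the same block, in which case the AND would be redundant and a COPY is used.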
1560static bool isLaneMaskFromSameBlock(Register Reg, MachineRegisterInfo &MRI,
1561 MachineBasicBlock *MBB) {
1562 MachineInstr *MI = MRI.getVRegDef(Reg);
1563 if (MI->getParent() != MBB)
1564 return false;
1565
1566 // Lane mask generated by SCC to VCC copy.
1567 if (MI->getOpcode() == AMDGPU::COPY) {
1568 auto DstRB = MRI.getRegBankOrNull(Reg: MI->getOperand(i: 0).getReg());
1569 auto SrcRB = MRI.getRegBankOrNull(Reg: MI->getOperand(i: 1).getReg());
1570 if (DstRB && SrcRB && DstRB->getID() == AMDGPU::VCCRegBankID &&
1571 SrcRB->getID() == AMDGPU::SGPRRegBankID)
1572 return true;
1573 }
1574
1575 // Lane mask generated using compare with same exec.
1576 if (isa<GAnyCmp>(Val: MI))
1577 return true;
1578
1579 Register LHS, RHS;
1580 // Look through AND.
1581 if (mi_match(R: Reg, MRI, P: m_GAnd(L: m_Reg(R&: LHS), R: m_Reg(R&: RHS))))
1582 return isLaneMaskFromSameBlock(Reg: LHS, MRI, MBB) ||
1583 isLaneMaskFromSameBlock(Reg: RHS, MRI, MBB);
1584
1585 return false;
1586}
1587
1588bool AMDGPUInstructionSelector::selectBallot(MachineInstr &I) const {
1589 MachineBasicBlock *BB = I.getParent();
1590 const DebugLoc &DL = I.getDebugLoc();
1591 Register DstReg = I.getOperand(i: 0).getReg();
1592 Register SrcReg = I.getOperand(i: 2).getReg();
1593 const unsigned BallotSize = MRI->getType(Reg: DstReg).getSizeInBits();
1594 const unsigned WaveSize = STI.getWavefrontSize();
1595
1596 // In the common case, the return type matches the wave size.
1597 // However we also support emitting i64 ballots in wave32 mode.
1598 if (BallotSize != WaveSize && (BallotSize != 64 || WaveSize != 32))
1599 return false;
1600
1601 std::optional<ValueAndVReg> Arg =
1602 getIConstantVRegValWithLookThrough(VReg: SrcReg, MRI: *MRI);
1603
1604 Register Dst = DstReg;
1605 // i64 ballot on Wave32: new Dst(i32) for WaveSize ballot.
1606 if (BallotSize != WaveSize) {
1607 Dst = MRI->createVirtualRegister(RegClass: TRI.getBoolRC());
1608 }
1609
1610 if (Arg) {
1611 const int64_t Value = Arg->Value.getZExtValue();
1612 if (Value == 0) {
1613 // Dst = S_MOV 0
1614 unsigned Opcode = WaveSize == 64 ? AMDGPU::S_MOV_B64 : AMDGPU::S_MOV_B32;
1615 BuildMI(BB&: *BB, I: &I, MIMD: DL, MCID: TII.get(Opcode), DestReg: Dst).addImm(Val: 0);
1616 } else {
1617 // Dst = COPY EXEC
1618 assert(Value == 1);
1619 BuildMI(BB&: *BB, I: &I, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::COPY), DestReg: Dst).addReg(RegNo: TRI.getExec());
1620 }
1621 if (!RBI.constrainGenericRegister(Reg: Dst, RC: *TRI.getBoolRC(), MRI&: *MRI))
1622 return false;
1623 } else {
1624 if (isLaneMaskFromSameBlock(Reg: SrcReg, MRI&: *MRI, MBB: BB)) {
1625 // Dst = COPY SrcReg
1626 BuildMI(BB&: *BB, I: &I, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::COPY), DestReg: Dst).addReg(RegNo: SrcReg);
1627 if (!RBI.constrainGenericRegister(Reg: Dst, RC: *TRI.getBoolRC(), MRI&: *MRI))
1628 return false;
1629 } else {
1630 // Dst = S_AND SrcReg, EXEC
1631 unsigned AndOpc = WaveSize == 64 ? AMDGPU::S_AND_B64 : AMDGPU::S_AND_B32;
1632 auto And = BuildMI(BB&: *BB, I: &I, MIMD: DL, MCID: TII.get(Opcode: AndOpc), DestReg: Dst)
1633 .addReg(RegNo: SrcReg)
1634 .addReg(RegNo: TRI.getExec())
1635 .setOperandDead(3); // Dead scc
1636 if (!constrainSelectedInstRegOperands(I&: *And, TII, TRI, RBI))
1637 return false;
1638 }
1639 }
1640
1641 // i64 ballot on Wave32: zero-extend i32 ballot to i64.
1642 if (BallotSize != WaveSize) {
1643 Register HiReg = MRI->createVirtualRegister(RegClass: &AMDGPU::SReg_32RegClass);
1644 BuildMI(BB&: *BB, I: &I, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::S_MOV_B32), DestReg: HiReg).addImm(Val: 0);
1645 BuildMI(BB&: *BB, I: &I, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::REG_SEQUENCE), DestReg: DstReg)
1646 .addReg(RegNo: Dst)
1647 .addImm(Val: AMDGPU::sub0)
1648 .addReg(RegNo: HiReg)
1649 .addImm(Val: AMDGPU::sub1);
1650 }
1651
1652 I.eraseFromParent();
1653 return true;
1654}
1655
1656bool AMDGPUInstructionSelector::selectRelocConstant(MachineInstr &I) const {
1657 Register DstReg = I.getOperand(i: 0).getReg();
1658 const RegisterBank *DstBank = RBI.getRegBank(Reg: DstReg, MRI: *MRI, TRI);
1659 const TargetRegisterClass *DstRC = TRI.getRegClassForSizeOnBank(Size: 32, Bank: *DstBank);
1660 if (!DstRC || !RBI.constrainGenericRegister(Reg: DstReg, RC: *DstRC, MRI&: *MRI))
1661 return false;
1662
1663 const bool IsVALU = DstBank->getID() == AMDGPU::VGPRRegBankID;
1664
1665 Module *M = MF->getFunction().getParent();
1666 const MDNode *Metadata = I.getOperand(i: 2).getMetadata();
1667 auto SymbolName = cast<MDString>(Val: Metadata->getOperand(I: 0))->getString();
1668 auto *RelocSymbol = cast<GlobalVariable>(
1669 Val: M->getOrInsertGlobal(Name: SymbolName, Ty: Type::getInt32Ty(C&: M->getContext())));
1670
1671 MachineBasicBlock *BB = I.getParent();
1672 BuildMI(BB&: *BB, I: &I, MIMD: I.getDebugLoc(),
1673 MCID: TII.get(Opcode: IsVALU ? AMDGPU::V_MOV_B32_e32 : AMDGPU::S_MOV_B32), DestReg: DstReg)
1674 .addGlobalAddress(GV: RelocSymbol, Offset: 0, TargetFlags: SIInstrInfo::MO_ABS32_LO);
1675
1676 I.eraseFromParent();
1677 return true;
1678}
1679
1680bool AMDGPUInstructionSelector::selectGroupStaticSize(MachineInstr &I) const {
1681 Triple::OSType OS = MF->getTarget().getTargetTriple().getOS();
1682
1683 Register DstReg = I.getOperand(i: 0).getReg();
1684 const RegisterBank *DstRB = RBI.getRegBank(Reg: DstReg, MRI: *MRI, TRI);
1685 unsigned Mov = DstRB->getID() == AMDGPU::SGPRRegBankID ?
1686 AMDGPU::S_MOV_B32 : AMDGPU::V_MOV_B32_e32;
1687
1688 MachineBasicBlock *MBB = I.getParent();
1689 const DebugLoc &DL = I.getDebugLoc();
1690
1691 auto MIB = BuildMI(BB&: *MBB, I: &I, MIMD: DL, MCID: TII.get(Opcode: Mov), DestReg: DstReg);
1692
1693 if (OS == Triple::AMDHSA || OS == Triple::AMDPAL) {
1694 const SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
1695 MIB.addImm(Val: MFI->getLDSSize());
1696 } else {
1697 Module *M = MF->getFunction().getParent();
1698 const GlobalValue *GV =
1699 Intrinsic::getOrInsertDeclaration(M, id: Intrinsic::amdgcn_groupstaticsize);
1700 MIB.addGlobalAddress(GV, Offset: 0, TargetFlags: SIInstrInfo::MO_ABS32_LO);
1701 }
1702
1703 I.eraseFromParent();
1704 return constrainSelectedInstRegOperands(I&: *MIB, TII, TRI, RBI);
1705}
1706
1707bool AMDGPUInstructionSelector::selectReturnAddress(MachineInstr &I) const {
1708 MachineBasicBlock *MBB = I.getParent();
1709 MachineFunction &MF = *MBB->getParent();
1710 const DebugLoc &DL = I.getDebugLoc();
1711
1712 MachineOperand &Dst = I.getOperand(i: 0);
1713 Register DstReg = Dst.getReg();
1714 unsigned Depth = I.getOperand(i: 2).getImm();
1715
1716 const TargetRegisterClass *RC
1717 = TRI.getConstrainedRegClassForOperand(MO: Dst, MRI: *MRI);
1718 if (!RC->hasSubClassEq(RC: &AMDGPU::SGPR_64RegClass) ||
1719 !RBI.constrainGenericRegister(Reg: DstReg, RC: *RC, MRI&: *MRI))
1720 return false;
1721
1722 // Check for kernel and shader functions
1723 if (Depth != 0 ||
1724 MF.getInfo<SIMachineFunctionInfo>()->isEntryFunction()) {
1725 BuildMI(BB&: *MBB, I: &I, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::S_MOV_B64), DestReg: DstReg)
1726 .addImm(Val: 0);
1727 I.eraseFromParent();
1728 return true;
1729 }
1730
1731 MachineFrameInfo &MFI = MF.getFrameInfo();
1732 // There is a call to @llvm.returnaddress in this function
1733 MFI.setReturnAddressIsTaken(true);
1734
1735 // Get the return address reg and mark it as an implicit live-in
1736 Register ReturnAddrReg = TRI.getReturnAddressReg(MF);
1737 Register LiveIn = getFunctionLiveInPhysReg(MF, TII, PhysReg: ReturnAddrReg,
1738 RC: AMDGPU::SReg_64RegClass, DL);
1739 BuildMI(BB&: *MBB, I: &I, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::COPY), DestReg: DstReg)
1740 .addReg(RegNo: LiveIn);
1741 I.eraseFromParent();
1742 return true;
1743}
1744
1745bool AMDGPUInstructionSelector::selectEndCfIntrinsic(MachineInstr &MI) const {
1746 // FIXME: Manually selecting to avoid dealing with the SReg_1 trick
1747 // SelectionDAG uses for wave32 vs wave64.
1748 MachineBasicBlock *BB = MI.getParent();
1749 BuildMI(BB&: *BB, I: &MI, MIMD: MI.getDebugLoc(), MCID: TII.get(Opcode: AMDGPU::SI_END_CF))
1750 .add(MO: MI.getOperand(i: 1));
1751
1752 Register Reg = MI.getOperand(i: 1).getReg();
1753 MI.eraseFromParent();
1754
1755 if (!MRI->getRegClassOrNull(Reg))
1756 MRI->setRegClass(Reg, RC: TRI.getWaveMaskRegClass());
1757 return true;
1758}
1759
1760bool AMDGPUInstructionSelector::selectDSOrderedIntrinsic(
1761 MachineInstr &MI, Intrinsic::ID IntrID) const {
1762 MachineBasicBlock *MBB = MI.getParent();
1763 MachineFunction *MF = MBB->getParent();
1764 const DebugLoc &DL = MI.getDebugLoc();
1765
1766 unsigned IndexOperand = MI.getOperand(i: 7).getImm();
1767 bool WaveRelease = MI.getOperand(i: 8).getImm() != 0;
1768 bool WaveDone = MI.getOperand(i: 9).getImm() != 0;
1769
1770 if (WaveDone && !WaveRelease) {
1771 // TODO: Move this to IR verifier
1772 const Function &Fn = MF->getFunction();
1773 Fn.getContext().diagnose(DI: DiagnosticInfoUnsupported(
1774 Fn, "ds_ordered_count: wave_done requires wave_release", DL));
1775 }
1776
1777 unsigned OrderedCountIndex = IndexOperand & 0x3f;
1778 IndexOperand &= ~0x3f;
1779 unsigned CountDw = 0;
1780
1781 if (STI.getGeneration() >= AMDGPUSubtarget::GFX10) {
1782 CountDw = (IndexOperand >> 24) & 0xf;
1783 IndexOperand &= ~(0xf << 24);
1784
1785 if (CountDw < 1 || CountDw > 4) {
1786 const Function &Fn = MF->getFunction();
1787 Fn.getContext().diagnose(DI: DiagnosticInfoUnsupported(
1788 Fn, "ds_ordered_count: dword count must be between 1 and 4", DL));
1789 CountDw = 1;
1790 }
1791 }
1792
1793 if (IndexOperand) {
1794 const Function &Fn = MF->getFunction();
1795 Fn.getContext().diagnose(DI: DiagnosticInfoUnsupported(
1796 Fn, "ds_ordered_count: bad index operand", DL));
1797 }
1798
1799 unsigned Instruction = IntrID == Intrinsic::amdgcn_ds_ordered_add ? 0 : 1;
1800 unsigned ShaderType = SIInstrInfo::getDSShaderTypeValue(MF: *MF);
1801
1802 unsigned Offset0 = OrderedCountIndex << 2;
1803 unsigned Offset1 = WaveRelease | (WaveDone << 1) | (Instruction << 4);
1804
1805 if (STI.getGeneration() >= AMDGPUSubtarget::GFX10)
1806 Offset1 |= (CountDw - 1) << 6;
1807
1808 if (STI.getGeneration() < AMDGPUSubtarget::GFX11)
1809 Offset1 |= ShaderType << 2;
1810
1811 unsigned Offset = Offset0 | (Offset1 << 8);
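  // Illustrative packing: for amdgcn_ds_ordered_add (Instruction = 0) with
  // OrderedCountIndex = 0, WaveRelease = 1, WaveDone = 0 and CountDw = 1 on a
  // GFX10 target whose shader-type value is S, Offset0 = 0 and
  // Offset1 = 1 | (S << 2), so Offset = (1 | (S << 2)) << 8.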
1812
1813 Register M0Val = MI.getOperand(i: 2).getReg();
1814 BuildMI(BB&: *MBB, I: &MI, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::COPY), DestReg: AMDGPU::M0)
1815 .addReg(RegNo: M0Val);
1816
1817 Register DstReg = MI.getOperand(i: 0).getReg();
1818 Register ValReg = MI.getOperand(i: 3).getReg();
1819 MachineInstrBuilder DS =
1820 BuildMI(BB&: *MBB, I: &MI, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::DS_ORDERED_COUNT), DestReg: DstReg)
1821 .addReg(RegNo: ValReg)
1822 .addImm(Val: Offset)
1823 .cloneMemRefs(OtherMI: MI);
1824
1825 if (!RBI.constrainGenericRegister(Reg: M0Val, RC: AMDGPU::SReg_32RegClass, MRI&: *MRI))
1826 return false;
1827
1828 bool Ret = constrainSelectedInstRegOperands(I&: *DS, TII, TRI, RBI);
1829 MI.eraseFromParent();
1830 return Ret;
1831}
1832
1833static unsigned gwsIntrinToOpcode(unsigned IntrID) {
1834 switch (IntrID) {
1835 case Intrinsic::amdgcn_ds_gws_init:
1836 return AMDGPU::DS_GWS_INIT;
1837 case Intrinsic::amdgcn_ds_gws_barrier:
1838 return AMDGPU::DS_GWS_BARRIER;
1839 case Intrinsic::amdgcn_ds_gws_sema_v:
1840 return AMDGPU::DS_GWS_SEMA_V;
1841 case Intrinsic::amdgcn_ds_gws_sema_br:
1842 return AMDGPU::DS_GWS_SEMA_BR;
1843 case Intrinsic::amdgcn_ds_gws_sema_p:
1844 return AMDGPU::DS_GWS_SEMA_P;
1845 case Intrinsic::amdgcn_ds_gws_sema_release_all:
1846 return AMDGPU::DS_GWS_SEMA_RELEASE_ALL;
1847 default:
1848 llvm_unreachable("not a gws intrinsic");
1849 }
1850}
1851
1852bool AMDGPUInstructionSelector::selectDSGWSIntrinsic(MachineInstr &MI,
1853 Intrinsic::ID IID) const {
1854 if (!STI.hasGWS() || (IID == Intrinsic::amdgcn_ds_gws_sema_release_all &&
1855 !STI.hasGWSSemaReleaseAll()))
1856 return false;
1857
1858 // intrinsic ID, vsrc, offset
1859 const bool HasVSrc = MI.getNumOperands() == 3;
1860 assert(HasVSrc || MI.getNumOperands() == 2);
1861
1862 Register BaseOffset = MI.getOperand(i: HasVSrc ? 2 : 1).getReg();
1863 const RegisterBank *OffsetRB = RBI.getRegBank(Reg: BaseOffset, MRI: *MRI, TRI);
1864 if (OffsetRB->getID() != AMDGPU::SGPRRegBankID)
1865 return false;
1866
1867 MachineInstr *OffsetDef = getDefIgnoringCopies(Reg: BaseOffset, MRI: *MRI);
1868 unsigned ImmOffset;
1869
1870 MachineBasicBlock *MBB = MI.getParent();
1871 const DebugLoc &DL = MI.getDebugLoc();
1872
1873 MachineInstr *Readfirstlane = nullptr;
1874
1875 // If we legalized the VGPR input, strip out the readfirstlane to analyze the
1876 // incoming offset, in case there's an add of a constant. We'll have to put it
1877 // back later.
1878 if (OffsetDef->getOpcode() == AMDGPU::V_READFIRSTLANE_B32) {
1879 Readfirstlane = OffsetDef;
1880 BaseOffset = OffsetDef->getOperand(i: 1).getReg();
1881 OffsetDef = getDefIgnoringCopies(Reg: BaseOffset, MRI: *MRI);
1882 }
1883
1884 if (OffsetDef->getOpcode() == AMDGPU::G_CONSTANT) {
1885 // If we have a constant offset, try to use the 0 in m0 as the base.
1886 // TODO: Look into changing the default m0 initialization value. If the
1887 // default -1 only set the low 16-bits, we could leave it as-is and add 1 to
1888 // the immediate offset.
1889
1890 ImmOffset = OffsetDef->getOperand(i: 1).getCImm()->getZExtValue();
1891 BuildMI(BB&: *MBB, I: &MI, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::S_MOV_B32), DestReg: AMDGPU::M0)
1892 .addImm(Val: 0);
1893 } else {
1894 std::tie(args&: BaseOffset, args&: ImmOffset) =
1895 AMDGPU::getBaseWithConstantOffset(MRI&: *MRI, Reg: BaseOffset, ValueTracking: VT);
1896
1897 if (Readfirstlane) {
1898 // We have the constant offset now, so put the readfirstlane back on the
1899 // variable component.
1900 if (!RBI.constrainGenericRegister(Reg: BaseOffset, RC: AMDGPU::VGPR_32RegClass, MRI&: *MRI))
1901 return false;
1902
1903 Readfirstlane->getOperand(i: 1).setReg(BaseOffset);
1904 BaseOffset = Readfirstlane->getOperand(i: 0).getReg();
1905 } else {
1906 if (!RBI.constrainGenericRegister(Reg: BaseOffset,
1907 RC: AMDGPU::SReg_32RegClass, MRI&: *MRI))
1908 return false;
1909 }
1910
1911 Register M0Base = MRI->createVirtualRegister(RegClass: &AMDGPU::SReg_32RegClass);
1912 BuildMI(BB&: *MBB, I: &MI, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::S_LSHL_B32), DestReg: M0Base)
1913 .addReg(RegNo: BaseOffset)
1914 .addImm(Val: 16)
1915 .setOperandDead(3); // Dead scc
1916
1917 BuildMI(BB&: *MBB, I: &MI, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::COPY), DestReg: AMDGPU::M0)
1918 .addReg(RegNo: M0Base);
1919 }
1920
1921 // The resource id offset is computed as (<isa opaque base> + M0[21:16] +
1922 // offset field) % 64. Some versions of the programming guide omit the m0
1923 // part, or claim it's from offset 0.
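  // In other words, the variable part of the offset was shifted into M0[21:16]
  // by the S_LSHL_B32 above (or M0 was simply zeroed for a pure constant
  // offset), and the constant part goes into the instruction's immediate
  // offset field below.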
1924 auto MIB = BuildMI(BB&: *MBB, I: &MI, MIMD: DL, MCID: TII.get(Opcode: gwsIntrinToOpcode(IntrID: IID)));
1925
1926 if (HasVSrc) {
1927 Register VSrc = MI.getOperand(i: 1).getReg();
1928 MIB.addReg(RegNo: VSrc);
1929
1930 if (!RBI.constrainGenericRegister(Reg: VSrc, RC: AMDGPU::VGPR_32RegClass, MRI&: *MRI))
1931 return false;
1932 }
1933
1934 MIB.addImm(Val: ImmOffset)
1935 .cloneMemRefs(OtherMI: MI);
1936
1937 TII.enforceOperandRCAlignment(MI&: *MIB, OpName: AMDGPU::OpName::data0);
1938
1939 MI.eraseFromParent();
1940 return true;
1941}
1942
1943bool AMDGPUInstructionSelector::selectDSAppendConsume(MachineInstr &MI,
1944 bool IsAppend) const {
1945 Register PtrBase = MI.getOperand(i: 2).getReg();
1946 LLT PtrTy = MRI->getType(Reg: PtrBase);
1947 bool IsGDS = PtrTy.getAddressSpace() == AMDGPUAS::REGION_ADDRESS;
1948
1949 unsigned Offset;
1950 std::tie(args&: PtrBase, args&: Offset) = selectDS1Addr1OffsetImpl(Root&: MI.getOperand(i: 2));
1951
1952 // TODO: Should this try to look through readfirstlane like GWS?
1953 if (!isDSOffsetLegal(Base: PtrBase, Offset)) {
1954 PtrBase = MI.getOperand(i: 2).getReg();
1955 Offset = 0;
1956 }
1957
1958 MachineBasicBlock *MBB = MI.getParent();
1959 const DebugLoc &DL = MI.getDebugLoc();
1960 const unsigned Opc = IsAppend ? AMDGPU::DS_APPEND : AMDGPU::DS_CONSUME;
1961
1962 BuildMI(BB&: *MBB, I: &MI, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::COPY), DestReg: AMDGPU::M0)
1963 .addReg(RegNo: PtrBase);
1964 if (!RBI.constrainGenericRegister(Reg: PtrBase, RC: AMDGPU::SReg_32RegClass, MRI&: *MRI))
1965 return false;
1966
1967 auto MIB = BuildMI(BB&: *MBB, I: &MI, MIMD: DL, MCID: TII.get(Opcode: Opc), DestReg: MI.getOperand(i: 0).getReg())
1968 .addImm(Val: Offset)
1969 .addImm(Val: IsGDS ? -1 : 0)
1970 .cloneMemRefs(OtherMI: MI);
1971 MI.eraseFromParent();
1972 return constrainSelectedInstRegOperands(I&: *MIB, TII, TRI, RBI);
1973}
1974
1975bool AMDGPUInstructionSelector::selectInitWholeWave(MachineInstr &MI) const {
1976 MachineFunction *MF = MI.getParent()->getParent();
1977 SIMachineFunctionInfo *MFInfo = MF->getInfo<SIMachineFunctionInfo>();
1978
1979 MFInfo->setInitWholeWave();
1980 return selectImpl(I&: MI, CoverageInfo&: *CoverageInfo);
1981}
1982
1983bool AMDGPUInstructionSelector::selectSBarrier(MachineInstr &MI) const {
1984 Intrinsic::ID IntrinsicID = cast<GIntrinsic>(Val&: MI).getIntrinsicID();
1985 if (TM.getOptLevel() > CodeGenOptLevel::None) {
1986 unsigned WGSize = STI.getFlatWorkGroupSizes(F: MF->getFunction()).second;
1987 if (WGSize <= STI.getWavefrontSize()) {
1988 // If the workgroup fits in a wave, remove s_barrier_signal and lower
1989 // s_barrier/s_barrier_wait to wave_barrier.
1990 if (IntrinsicID == Intrinsic::amdgcn_s_barrier ||
1991 IntrinsicID == Intrinsic::amdgcn_s_barrier_wait) {
1992 MachineBasicBlock *MBB = MI.getParent();
1993 const DebugLoc &DL = MI.getDebugLoc();
1994 BuildMI(BB&: *MBB, I: &MI, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::WAVE_BARRIER));
1995 }
1996 MI.eraseFromParent();
1997 return true;
1998 }
1999 }
2000
2001 if (STI.hasSplitBarriers() && IntrinsicID == Intrinsic::amdgcn_s_barrier) {
2002 // On GFX12 lower s_barrier into s_barrier_signal_imm and s_barrier_wait
2003 MachineBasicBlock *MBB = MI.getParent();
2004 const DebugLoc &DL = MI.getDebugLoc();
2005 BuildMI(BB&: *MBB, I: &MI, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::S_BARRIER_SIGNAL_IMM))
2006 .addImm(Val: AMDGPU::Barrier::WORKGROUP);
2007 BuildMI(BB&: *MBB, I: &MI, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::S_BARRIER_WAIT))
2008 .addImm(Val: AMDGPU::Barrier::WORKGROUP);
2009 MI.eraseFromParent();
2010 return true;
2011 }
2012
2013 return selectImpl(I&: MI, CoverageInfo&: *CoverageInfo);
2014}
2015
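// Decode the texfailctrl immediate: bit 0 is TFE, bit 1 is LWE, and any other
// set bit makes the control invalid. For example, TexFailCtrl = 3 sets both
// TFE and LWE and is accepted, while TexFailCtrl = 4 is rejected.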
2016static bool parseTexFail(uint64_t TexFailCtrl, bool &TFE, bool &LWE,
2017 bool &IsTexFail) {
2018 if (TexFailCtrl)
2019 IsTexFail = true;
2020
2021 TFE = TexFailCtrl & 0x1;
2022 TexFailCtrl &= ~(uint64_t)0x1;
2023 LWE = TexFailCtrl & 0x2;
2024 TexFailCtrl &= ~(uint64_t)0x2;
2025
2026 return TexFailCtrl == 0;
2027}
2028
2029bool AMDGPUInstructionSelector::selectImageIntrinsic(
2030 MachineInstr &MI, const AMDGPU::ImageDimIntrinsicInfo *Intr) const {
2031 MachineBasicBlock *MBB = MI.getParent();
2032 const DebugLoc &DL = MI.getDebugLoc();
2033
2034 const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
2035 AMDGPU::getMIMGBaseOpcodeInfo(BaseOpcode: Intr->BaseOpcode);
2036
2037 const AMDGPU::MIMGDimInfo *DimInfo = AMDGPU::getMIMGDimInfo(DimEnum: Intr->Dim);
2038 unsigned IntrOpcode = Intr->BaseOpcode;
2039 const bool IsGFX10Plus = AMDGPU::isGFX10Plus(STI);
2040 const bool IsGFX11Plus = AMDGPU::isGFX11Plus(STI);
2041 const bool IsGFX12Plus = AMDGPU::isGFX12Plus(STI);
2042
2043 const unsigned ArgOffset = MI.getNumExplicitDefs() + 1;
2044
2045 Register VDataIn, VDataOut;
2046 LLT VDataTy;
2047 int NumVDataDwords = -1;
2048 bool IsD16 = MI.getOpcode() == AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD_D16 ||
2049 MI.getOpcode() == AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE_D16;
2050
2051 bool Unorm;
2052 if (!BaseOpcode->Sampler)
2053 Unorm = true;
2054 else
2055 Unorm = MI.getOperand(i: ArgOffset + Intr->UnormIndex).getImm() != 0;
2056
2057 bool TFE;
2058 bool LWE;
2059 bool IsTexFail = false;
2060 if (!parseTexFail(TexFailCtrl: MI.getOperand(i: ArgOffset + Intr->TexFailCtrlIndex).getImm(),
2061 TFE, LWE, IsTexFail))
2062 return false;
2063
2064 const int Flags = MI.getOperand(i: ArgOffset + Intr->NumArgs).getImm();
2065 const bool IsA16 = (Flags & 1) != 0;
2066 const bool IsG16 = (Flags & 2) != 0;
2067
2068 // A16 implies 16 bit gradients if subtarget doesn't support G16
2069 if (IsA16 && !STI.hasG16() && !IsG16)
2070 return false;
2071
2072 unsigned DMask = 0;
2073 unsigned DMaskLanes = 0;
2074
2075 if (BaseOpcode->Atomic) {
2076 VDataOut = MI.getOperand(i: 0).getReg();
2077 VDataIn = MI.getOperand(i: 2).getReg();
2078 LLT Ty = MRI->getType(Reg: VDataIn);
2079
2080 // Be careful to allow atomic swap on 16-bit element vectors.
2081 const bool Is64Bit = BaseOpcode->AtomicX2 ?
2082 Ty.getSizeInBits() == 128 :
2083 Ty.getSizeInBits() == 64;
2084
2085 if (BaseOpcode->AtomicX2) {
2086 assert(MI.getOperand(3).getReg() == AMDGPU::NoRegister);
2087
2088 DMask = Is64Bit ? 0xf : 0x3;
2089 NumVDataDwords = Is64Bit ? 4 : 2;
2090 } else {
2091 DMask = Is64Bit ? 0x3 : 0x1;
2092 NumVDataDwords = Is64Bit ? 2 : 1;
2093 }
2094 } else {
2095 DMask = MI.getOperand(i: ArgOffset + Intr->DMaskIndex).getImm();
2096 DMaskLanes = BaseOpcode->Gather4 ? 4 : llvm::popcount(Value: DMask);
2097
2098 if (BaseOpcode->Store) {
2099 VDataIn = MI.getOperand(i: 1).getReg();
2100 VDataTy = MRI->getType(Reg: VDataIn);
2101 NumVDataDwords = (VDataTy.getSizeInBits() + 31) / 32;
2102 } else if (BaseOpcode->NoReturn) {
2103 NumVDataDwords = 0;
2104 } else {
2105 VDataOut = MI.getOperand(i: 0).getReg();
2106 VDataTy = MRI->getType(Reg: VDataOut);
2107 NumVDataDwords = DMaskLanes;
2108
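      // With packed D16, two 16-bit components share one dword of vdata, so
      // e.g. a dmask of 0b0111 (three lanes) only needs two dwords.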
2109 if (IsD16 && !STI.hasUnpackedD16VMem())
2110 NumVDataDwords = (DMaskLanes + 1) / 2;
2111 }
2112 }
2113
2114 // Set G16 opcode
2115 if (Subtarget->hasG16() && IsG16) {
2116 const AMDGPU::MIMGG16MappingInfo *G16MappingInfo =
2117 AMDGPU::getMIMGG16MappingInfo(G: Intr->BaseOpcode);
2118 assert(G16MappingInfo);
2119 IntrOpcode = G16MappingInfo->G16; // set opcode to variant with _g16
2120 }
2121
2122 // TODO: Check this in verifier.
2123 assert((!IsTexFail || DMaskLanes >= 1) && "should have legalized this");
2124
2125 unsigned CPol = MI.getOperand(i: ArgOffset + Intr->CachePolicyIndex).getImm();
2126 if (BaseOpcode->Atomic)
2127 CPol |= AMDGPU::CPol::GLC; // TODO no-return optimization
2128 if (CPol & ~((IsGFX12Plus ? AMDGPU::CPol::ALL : AMDGPU::CPol::ALL_pregfx12) |
2129 AMDGPU::CPol::VOLATILE))
2130 return false;
2131
2132 int NumVAddrRegs = 0;
2133 int NumVAddrDwords = 0;
2134 for (unsigned I = Intr->VAddrStart; I < Intr->VAddrEnd; I++) {
2135 // Skip the $noregs and 0s inserted during legalization.
2136 MachineOperand &AddrOp = MI.getOperand(i: ArgOffset + I);
2137 if (!AddrOp.isReg())
2138 continue; // XXX - Break?
2139
2140 Register Addr = AddrOp.getReg();
2141 if (!Addr)
2142 break;
2143
2144 ++NumVAddrRegs;
2145 NumVAddrDwords += (MRI->getType(Reg: Addr).getSizeInBits() + 31) / 32;
2146 }
2147
2148 // The legalizer preprocessed the intrinsic arguments. If we aren't using
2149 // NSA, these should have been packed into a single value in the first
2150 // address register
2151 const bool UseNSA =
2152 NumVAddrRegs != 1 &&
2153 (STI.hasPartialNSAEncoding() ? NumVAddrDwords >= NumVAddrRegs
2154 : NumVAddrDwords == NumVAddrRegs);
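  // For instance, three separate 32-bit address registers can use an NSA
  // encoding, whereas addresses that were merged into a single wide register
  // (NumVAddrRegs == 1) always use the packed form.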
2155 if (UseNSA && !STI.hasFeature(Feature: AMDGPU::FeatureNSAEncoding)) {
2156 LLVM_DEBUG(dbgs() << "Trying to use NSA on non-NSA target\n");
2157 return false;
2158 }
2159
2160 if (IsTexFail)
2161 ++NumVDataDwords;
2162
2163 int Opcode = -1;
2164 if (IsGFX12Plus) {
2165 Opcode = AMDGPU::getMIMGOpcode(BaseOpcode: IntrOpcode, MIMGEncoding: AMDGPU::MIMGEncGfx12,
2166 VDataDwords: NumVDataDwords, VAddrDwords: NumVAddrDwords);
2167 } else if (IsGFX11Plus) {
2168 Opcode = AMDGPU::getMIMGOpcode(BaseOpcode: IntrOpcode,
2169 MIMGEncoding: UseNSA ? AMDGPU::MIMGEncGfx11NSA
2170 : AMDGPU::MIMGEncGfx11Default,
2171 VDataDwords: NumVDataDwords, VAddrDwords: NumVAddrDwords);
2172 } else if (IsGFX10Plus) {
2173 Opcode = AMDGPU::getMIMGOpcode(BaseOpcode: IntrOpcode,
2174 MIMGEncoding: UseNSA ? AMDGPU::MIMGEncGfx10NSA
2175 : AMDGPU::MIMGEncGfx10Default,
2176 VDataDwords: NumVDataDwords, VAddrDwords: NumVAddrDwords);
2177 } else {
2178 if (Subtarget->hasGFX90AInsts()) {
2179 Opcode = AMDGPU::getMIMGOpcode(BaseOpcode: IntrOpcode, MIMGEncoding: AMDGPU::MIMGEncGfx90a,
2180 VDataDwords: NumVDataDwords, VAddrDwords: NumVAddrDwords);
2181 if (Opcode == -1) {
2182 LLVM_DEBUG(
2183 dbgs()
2184 << "requested image instruction is not supported on this GPU\n");
2185 return false;
2186 }
2187 }
2188 if (Opcode == -1 &&
2189 STI.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
2190 Opcode = AMDGPU::getMIMGOpcode(BaseOpcode: IntrOpcode, MIMGEncoding: AMDGPU::MIMGEncGfx8,
2191 VDataDwords: NumVDataDwords, VAddrDwords: NumVAddrDwords);
2192 if (Opcode == -1)
2193 Opcode = AMDGPU::getMIMGOpcode(BaseOpcode: IntrOpcode, MIMGEncoding: AMDGPU::MIMGEncGfx6,
2194 VDataDwords: NumVDataDwords, VAddrDwords: NumVAddrDwords);
2195 }
2196 if (Opcode == -1)
2197 return false;
2198
2199 auto MIB = BuildMI(BB&: *MBB, I: &MI, MIMD: DL, MCID: TII.get(Opcode))
2200 .cloneMemRefs(OtherMI: MI);
2201
2202 if (VDataOut) {
2203 if (BaseOpcode->AtomicX2) {
2204 const bool Is64 = MRI->getType(Reg: VDataOut).getSizeInBits() == 64;
2205
2206 Register TmpReg = MRI->createVirtualRegister(
2207 RegClass: Is64 ? &AMDGPU::VReg_128RegClass : &AMDGPU::VReg_64RegClass);
2208 unsigned SubReg = Is64 ? AMDGPU::sub0_sub1 : AMDGPU::sub0;
2209
2210 MIB.addDef(RegNo: TmpReg);
2211 if (!MRI->use_empty(RegNo: VDataOut)) {
2212 BuildMI(BB&: *MBB, I: &MI, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::COPY), DestReg: VDataOut)
2213 .addReg(RegNo: TmpReg, flags: RegState::Kill, SubReg);
2214 }
2215
2216 } else {
2217 MIB.addDef(RegNo: VDataOut); // vdata output
2218 }
2219 }
2220
2221 if (VDataIn)
2222 MIB.addReg(RegNo: VDataIn); // vdata input
2223
2224 for (int I = 0; I != NumVAddrRegs; ++I) {
2225 MachineOperand &SrcOp = MI.getOperand(i: ArgOffset + Intr->VAddrStart + I);
2226 if (SrcOp.isReg()) {
2227 assert(SrcOp.getReg() != 0);
2228 MIB.addReg(RegNo: SrcOp.getReg());
2229 }
2230 }
2231
2232 MIB.addReg(RegNo: MI.getOperand(i: ArgOffset + Intr->RsrcIndex).getReg());
2233 if (BaseOpcode->Sampler)
2234 MIB.addReg(RegNo: MI.getOperand(i: ArgOffset + Intr->SampIndex).getReg());
2235
2236 MIB.addImm(Val: DMask); // dmask
2237
2238 if (IsGFX10Plus)
2239 MIB.addImm(Val: DimInfo->Encoding);
2240 if (AMDGPU::hasNamedOperand(Opcode, NamedIdx: AMDGPU::OpName::unorm))
2241 MIB.addImm(Val: Unorm);
2242
2243 MIB.addImm(Val: CPol);
2244 MIB.addImm(Val: IsA16 && // a16 or r128
2245 STI.hasFeature(Feature: AMDGPU::FeatureR128A16) ? -1 : 0);
2246 if (IsGFX10Plus)
2247 MIB.addImm(Val: IsA16 ? -1 : 0);
2248
2249 if (!Subtarget->hasGFX90AInsts()) {
2250 MIB.addImm(Val: TFE); // tfe
2251 } else if (TFE) {
2252 LLVM_DEBUG(dbgs() << "TFE is not supported on this GPU\n");
2253 return false;
2254 }
2255
2256 if (AMDGPU::hasNamedOperand(Opcode, NamedIdx: AMDGPU::OpName::lwe))
2257 MIB.addImm(Val: LWE); // lwe
2258 if (!IsGFX10Plus)
2259 MIB.addImm(Val: DimInfo->DA ? -1 : 0);
2260 if (BaseOpcode->HasD16)
2261 MIB.addImm(Val: IsD16 ? -1 : 0);
2262
2263 MI.eraseFromParent();
2264 constrainSelectedInstRegOperands(I&: *MIB, TII, TRI, RBI);
2265 TII.enforceOperandRCAlignment(MI&: *MIB, OpName: AMDGPU::OpName::vaddr);
2266 return true;
2267}
2268
2269// We need to handle this here because tablegen doesn't support matching
2270// instructions with multiple outputs.
2271bool AMDGPUInstructionSelector::selectDSBvhStackIntrinsic(
2272 MachineInstr &MI) const {
2273 Register Dst0 = MI.getOperand(i: 0).getReg();
2274 Register Dst1 = MI.getOperand(i: 1).getReg();
2275
2276 const DebugLoc &DL = MI.getDebugLoc();
2277 MachineBasicBlock *MBB = MI.getParent();
2278
2279 Register Addr = MI.getOperand(i: 3).getReg();
2280 Register Data0 = MI.getOperand(i: 4).getReg();
2281 Register Data1 = MI.getOperand(i: 5).getReg();
2282 unsigned Offset = MI.getOperand(i: 6).getImm();
2283
2284 unsigned Opc;
2285 switch (cast<GIntrinsic>(Val&: MI).getIntrinsicID()) {
2286 case Intrinsic::amdgcn_ds_bvh_stack_rtn:
2287 case Intrinsic::amdgcn_ds_bvh_stack_push4_pop1_rtn:
2288 Opc = AMDGPU::DS_BVH_STACK_RTN_B32;
2289 break;
2290 case Intrinsic::amdgcn_ds_bvh_stack_push8_pop1_rtn:
2291 Opc = AMDGPU::DS_BVH_STACK_PUSH8_POP1_RTN_B32;
2292 break;
2293 case Intrinsic::amdgcn_ds_bvh_stack_push8_pop2_rtn:
2294 Opc = AMDGPU::DS_BVH_STACK_PUSH8_POP2_RTN_B64;
2295 break;
2296 }
2297
2298 auto MIB = BuildMI(BB&: *MBB, I: &MI, MIMD: DL, MCID: TII.get(Opcode: Opc), DestReg: Dst0)
2299 .addDef(RegNo: Dst1)
2300 .addUse(RegNo: Addr)
2301 .addUse(RegNo: Data0)
2302 .addUse(RegNo: Data1)
2303 .addImm(Val: Offset)
2304 .cloneMemRefs(OtherMI: MI);
2305
2306 MI.eraseFromParent();
2307 return constrainSelectedInstRegOperands(I&: *MIB, TII, TRI, RBI);
2308}
2309
2310bool AMDGPUInstructionSelector::selectG_INTRINSIC_W_SIDE_EFFECTS(
2311 MachineInstr &I) const {
2312 Intrinsic::ID IntrinsicID = cast<GIntrinsic>(Val&: I).getIntrinsicID();
2313 switch (IntrinsicID) {
2314 case Intrinsic::amdgcn_end_cf:
2315 return selectEndCfIntrinsic(MI&: I);
2316 case Intrinsic::amdgcn_ds_ordered_add:
2317 case Intrinsic::amdgcn_ds_ordered_swap:
2318 return selectDSOrderedIntrinsic(MI&: I, IntrID: IntrinsicID);
2319 case Intrinsic::amdgcn_ds_gws_init:
2320 case Intrinsic::amdgcn_ds_gws_barrier:
2321 case Intrinsic::amdgcn_ds_gws_sema_v:
2322 case Intrinsic::amdgcn_ds_gws_sema_br:
2323 case Intrinsic::amdgcn_ds_gws_sema_p:
2324 case Intrinsic::amdgcn_ds_gws_sema_release_all:
2325 return selectDSGWSIntrinsic(MI&: I, IID: IntrinsicID);
2326 case Intrinsic::amdgcn_ds_append:
2327 return selectDSAppendConsume(MI&: I, IsAppend: true);
2328 case Intrinsic::amdgcn_ds_consume:
2329 return selectDSAppendConsume(MI&: I, IsAppend: false);
2330 case Intrinsic::amdgcn_init_whole_wave:
2331 return selectInitWholeWave(MI&: I);
2332 case Intrinsic::amdgcn_s_barrier:
2333 case Intrinsic::amdgcn_s_barrier_signal:
2334 case Intrinsic::amdgcn_s_barrier_wait:
2335 return selectSBarrier(MI&: I);
2336 case Intrinsic::amdgcn_raw_buffer_load_lds:
2337 case Intrinsic::amdgcn_raw_ptr_buffer_load_lds:
2338 case Intrinsic::amdgcn_struct_buffer_load_lds:
2339 case Intrinsic::amdgcn_struct_ptr_buffer_load_lds:
2340 return selectBufferLoadLds(MI&: I);
  // Until we can store both the address space of the global and the LDS
  // arguments by having two MachineMemOperands on an intrinsic, we just trust
  // that the argument is a global pointer (buffer pointers have been handled
  // by an LLVM IR-level lowering).
2345 case Intrinsic::amdgcn_load_to_lds:
2346 case Intrinsic::amdgcn_global_load_lds:
2347 return selectGlobalLoadLds(MI&: I);
2348 case Intrinsic::amdgcn_exp_compr:
2349 if (!STI.hasCompressedExport()) {
2350 Function &F = I.getMF()->getFunction();
2351 F.getContext().diagnose(
2352 DI: DiagnosticInfoUnsupported(F, "intrinsic not supported on subtarget",
2353 I.getDebugLoc(), DS_Error));
2354 return false;
2355 }
2356 break;
2357 case Intrinsic::amdgcn_ds_bvh_stack_rtn:
2358 case Intrinsic::amdgcn_ds_bvh_stack_push4_pop1_rtn:
2359 case Intrinsic::amdgcn_ds_bvh_stack_push8_pop1_rtn:
2360 case Intrinsic::amdgcn_ds_bvh_stack_push8_pop2_rtn:
2361 return selectDSBvhStackIntrinsic(MI&: I);
2362 case Intrinsic::amdgcn_s_barrier_signal_var:
2363 return selectNamedBarrierInit(I, IID: IntrinsicID);
2364 case Intrinsic::amdgcn_s_get_named_barrier_state:
2365 return selectNamedBarrierInst(I, IID: IntrinsicID);
2366 case Intrinsic::amdgcn_s_get_barrier_state:
2367 return selectSGetBarrierState(I, IID: IntrinsicID);
2368 case Intrinsic::amdgcn_s_barrier_signal_isfirst:
2369 return selectSBarrierSignalIsfirst(I, IID: IntrinsicID);
2370 }
2371 return selectImpl(I, CoverageInfo&: *CoverageInfo);
2372}
2373
2374bool AMDGPUInstructionSelector::selectG_SELECT(MachineInstr &I) const {
2375 if (selectImpl(I, CoverageInfo&: *CoverageInfo))
2376 return true;
2377
2378 MachineBasicBlock *BB = I.getParent();
2379 const DebugLoc &DL = I.getDebugLoc();
2380
2381 Register DstReg = I.getOperand(i: 0).getReg();
2382 unsigned Size = RBI.getSizeInBits(Reg: DstReg, MRI: *MRI, TRI);
2383 assert(Size <= 32 || Size == 64);
2384 const MachineOperand &CCOp = I.getOperand(i: 1);
2385 Register CCReg = CCOp.getReg();
2386 if (!isVCC(Reg: CCReg, MRI: *MRI)) {
2387 unsigned SelectOpcode = Size == 64 ? AMDGPU::S_CSELECT_B64 :
2388 AMDGPU::S_CSELECT_B32;
2389 MachineInstr *CopySCC = BuildMI(BB&: *BB, I: &I, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::COPY), DestReg: AMDGPU::SCC)
2390 .addReg(RegNo: CCReg);
2391
    // The generic constrainSelectedInstRegOperands doesn't work for the SCC
    // register bank, because it does not cover the register class used to
    // represent it. So we need to manually set the register class here.
2395 if (!MRI->getRegClassOrNull(Reg: CCReg))
2396 MRI->setRegClass(Reg: CCReg, RC: TRI.getConstrainedRegClassForOperand(MO: CCOp, MRI: *MRI));
2397 MachineInstr *Select = BuildMI(BB&: *BB, I: &I, MIMD: DL, MCID: TII.get(Opcode: SelectOpcode), DestReg: DstReg)
2398 .add(MO: I.getOperand(i: 2))
2399 .add(MO: I.getOperand(i: 3));
2400
2401 bool Ret = false;
2402 Ret |= constrainSelectedInstRegOperands(I&: *Select, TII, TRI, RBI);
2403 Ret |= constrainSelectedInstRegOperands(I&: *CopySCC, TII, TRI, RBI);
2404 I.eraseFromParent();
2405 return Ret;
2406 }
2407
2408 // Wide VGPR select should have been split in RegBankSelect.
2409 if (Size > 32)
2410 return false;
2411
2412 MachineInstr *Select =
2413 BuildMI(BB&: *BB, I: &I, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::V_CNDMASK_B32_e64), DestReg: DstReg)
2414 .addImm(Val: 0)
2415 .add(MO: I.getOperand(i: 3))
2416 .addImm(Val: 0)
2417 .add(MO: I.getOperand(i: 2))
2418 .add(MO: I.getOperand(i: 1));
2419
2420 bool Ret = constrainSelectedInstRegOperands(I&: *Select, TII, TRI, RBI);
2421 I.eraseFromParent();
2422 return Ret;
2423}
2424
2425bool AMDGPUInstructionSelector::selectG_TRUNC(MachineInstr &I) const {
2426 Register DstReg = I.getOperand(i: 0).getReg();
2427 Register SrcReg = I.getOperand(i: 1).getReg();
2428 const LLT DstTy = MRI->getType(Reg: DstReg);
2429 const LLT SrcTy = MRI->getType(Reg: SrcReg);
2430 const LLT S1 = LLT::scalar(SizeInBits: 1);
2431
2432 const RegisterBank *SrcRB = RBI.getRegBank(Reg: SrcReg, MRI: *MRI, TRI);
2433 const RegisterBank *DstRB;
2434 if (DstTy == S1) {
2435 // This is a special case. We don't treat s1 for legalization artifacts as
2436 // vcc booleans.
2437 DstRB = SrcRB;
2438 } else {
2439 DstRB = RBI.getRegBank(Reg: DstReg, MRI: *MRI, TRI);
2440 if (SrcRB != DstRB)
2441 return false;
2442 }
2443
2444 const bool IsVALU = DstRB->getID() == AMDGPU::VGPRRegBankID;
2445
2446 unsigned DstSize = DstTy.getSizeInBits();
2447 unsigned SrcSize = SrcTy.getSizeInBits();
2448
2449 const TargetRegisterClass *SrcRC =
2450 TRI.getRegClassForSizeOnBank(Size: SrcSize, Bank: *SrcRB);
2451 const TargetRegisterClass *DstRC =
2452 TRI.getRegClassForSizeOnBank(Size: DstSize, Bank: *DstRB);
2453 if (!SrcRC || !DstRC)
2454 return false;
2455
2456 if (!RBI.constrainGenericRegister(Reg: SrcReg, RC: *SrcRC, MRI&: *MRI) ||
2457 !RBI.constrainGenericRegister(Reg: DstReg, RC: *DstRC, MRI&: *MRI)) {
2458 LLVM_DEBUG(dbgs() << "Failed to constrain G_TRUNC\n");
2459 return false;
2460 }
2461
2462 if (DstRC == &AMDGPU::VGPR_16RegClass && SrcSize == 32) {
2463 assert(STI.useRealTrue16Insts());
2464 const DebugLoc &DL = I.getDebugLoc();
2465 MachineBasicBlock *MBB = I.getParent();
2466 BuildMI(BB&: *MBB, I, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::COPY), DestReg: DstReg)
2467 .addReg(RegNo: SrcReg, flags: 0, SubReg: AMDGPU::lo16);
2468 I.eraseFromParent();
2469 return true;
2470 }
2471
2472 if (DstTy == LLT::fixed_vector(NumElements: 2, ScalarSizeInBits: 16) && SrcTy == LLT::fixed_vector(NumElements: 2, ScalarSizeInBits: 32)) {
2473 MachineBasicBlock *MBB = I.getParent();
2474 const DebugLoc &DL = I.getDebugLoc();
2475
2476 Register LoReg = MRI->createVirtualRegister(RegClass: DstRC);
2477 Register HiReg = MRI->createVirtualRegister(RegClass: DstRC);
2478 BuildMI(BB&: *MBB, I, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::COPY), DestReg: LoReg)
2479 .addReg(RegNo: SrcReg, flags: 0, SubReg: AMDGPU::sub0);
2480 BuildMI(BB&: *MBB, I, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::COPY), DestReg: HiReg)
2481 .addReg(RegNo: SrcReg, flags: 0, SubReg: AMDGPU::sub1);
2482
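    // Both paths below produce Dst = (Lo & 0xffff) | (Hi << 16): SDWA does it
    // with a single mov, the fallback with an explicit shift/and/or sequence.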
2483 if (IsVALU && STI.hasSDWA()) {
2484 // Write the low 16-bits of the high element into the high 16-bits of the
2485 // low element.
2486 MachineInstr *MovSDWA =
2487 BuildMI(BB&: *MBB, I, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::V_MOV_B32_sdwa), DestReg: DstReg)
2488 .addImm(Val: 0) // $src0_modifiers
2489 .addReg(RegNo: HiReg) // $src0
2490 .addImm(Val: 0) // $clamp
2491 .addImm(Val: AMDGPU::SDWA::WORD_1) // $dst_sel
2492 .addImm(Val: AMDGPU::SDWA::UNUSED_PRESERVE) // $dst_unused
2493 .addImm(Val: AMDGPU::SDWA::WORD_0) // $src0_sel
2494 .addReg(RegNo: LoReg, flags: RegState::Implicit);
2495 MovSDWA->tieOperands(DefIdx: 0, UseIdx: MovSDWA->getNumOperands() - 1);
2496 } else {
2497 Register TmpReg0 = MRI->createVirtualRegister(RegClass: DstRC);
2498 Register TmpReg1 = MRI->createVirtualRegister(RegClass: DstRC);
2499 Register ImmReg = MRI->createVirtualRegister(RegClass: DstRC);
2500 if (IsVALU) {
2501 BuildMI(BB&: *MBB, I, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::V_LSHLREV_B32_e64), DestReg: TmpReg0)
2502 .addImm(Val: 16)
2503 .addReg(RegNo: HiReg);
2504 } else {
2505 BuildMI(BB&: *MBB, I, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::S_LSHL_B32), DestReg: TmpReg0)
2506 .addReg(RegNo: HiReg)
2507 .addImm(Val: 16)
2508 .setOperandDead(3); // Dead scc
2509 }
2510
2511 unsigned MovOpc = IsVALU ? AMDGPU::V_MOV_B32_e32 : AMDGPU::S_MOV_B32;
2512 unsigned AndOpc = IsVALU ? AMDGPU::V_AND_B32_e64 : AMDGPU::S_AND_B32;
2513 unsigned OrOpc = IsVALU ? AMDGPU::V_OR_B32_e64 : AMDGPU::S_OR_B32;
2514
2515 BuildMI(BB&: *MBB, I, MIMD: DL, MCID: TII.get(Opcode: MovOpc), DestReg: ImmReg)
2516 .addImm(Val: 0xffff);
2517 auto And = BuildMI(BB&: *MBB, I, MIMD: DL, MCID: TII.get(Opcode: AndOpc), DestReg: TmpReg1)
2518 .addReg(RegNo: LoReg)
2519 .addReg(RegNo: ImmReg);
2520 auto Or = BuildMI(BB&: *MBB, I, MIMD: DL, MCID: TII.get(Opcode: OrOpc), DestReg: DstReg)
2521 .addReg(RegNo: TmpReg0)
2522 .addReg(RegNo: TmpReg1);
2523
2524 if (!IsVALU) {
2525 And.setOperandDead(3); // Dead scc
2526 Or.setOperandDead(3); // Dead scc
2527 }
2528 }
2529
2530 I.eraseFromParent();
2531 return true;
2532 }
2533
2534 if (!DstTy.isScalar())
2535 return false;
2536
2537 if (SrcSize > 32) {
2538 unsigned SubRegIdx = DstSize < 32
2539 ? static_cast<unsigned>(AMDGPU::sub0)
2540 : TRI.getSubRegFromChannel(Channel: 0, NumRegs: DstSize / 32);
2541 if (SubRegIdx == AMDGPU::NoSubRegister)
2542 return false;
2543
2544 // Deal with weird cases where the class only partially supports the subreg
2545 // index.
2546 const TargetRegisterClass *SrcWithSubRC
2547 = TRI.getSubClassWithSubReg(SrcRC, SubRegIdx);
2548 if (!SrcWithSubRC)
2549 return false;
2550
2551 if (SrcWithSubRC != SrcRC) {
2552 if (!RBI.constrainGenericRegister(Reg: SrcReg, RC: *SrcWithSubRC, MRI&: *MRI))
2553 return false;
2554 }
2555
2556 I.getOperand(i: 1).setSubReg(SubRegIdx);
2557 }
2558
2559 I.setDesc(TII.get(Opcode: TargetOpcode::COPY));
2560 return true;
2561}
2562
2563/// \returns true if a bitmask for \p Size bits will be an inline immediate.
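/// For example, Size = 6 gives Mask = 0x3f (63, inline) and Size = 32 gives
/// Mask = 0xffffffff (-1 as a signed value, also inline), while Size = 16
/// gives 0xffff (65535), which is not an inline immediate.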
2564static bool shouldUseAndMask(unsigned Size, unsigned &Mask) {
2565 Mask = maskTrailingOnes<unsigned>(N: Size);
2566 int SignedMask = static_cast<int>(Mask);
2567 return SignedMask >= -16 && SignedMask <= 64;
2568}
2569
2570// Like RegisterBankInfo::getRegBank, but don't assume vcc for s1.
2571const RegisterBank *AMDGPUInstructionSelector::getArtifactRegBank(
2572 Register Reg, const MachineRegisterInfo &MRI,
2573 const TargetRegisterInfo &TRI) const {
2574 const RegClassOrRegBank &RegClassOrBank = MRI.getRegClassOrRegBank(Reg);
2575 if (auto *RB = dyn_cast<const RegisterBank *>(Val: RegClassOrBank))
2576 return RB;
2577
2578 // Ignore the type, since we don't use vcc in artifacts.
2579 if (auto *RC = dyn_cast<const TargetRegisterClass *>(Val: RegClassOrBank))
2580 return &RBI.getRegBankFromRegClass(RC: *RC, LLT());
2581 return nullptr;
2582}
2583
2584bool AMDGPUInstructionSelector::selectG_SZA_EXT(MachineInstr &I) const {
2585 bool InReg = I.getOpcode() == AMDGPU::G_SEXT_INREG;
2586 bool Signed = I.getOpcode() == AMDGPU::G_SEXT || InReg;
2587 const DebugLoc &DL = I.getDebugLoc();
2588 MachineBasicBlock &MBB = *I.getParent();
2589 const Register DstReg = I.getOperand(i: 0).getReg();
2590 const Register SrcReg = I.getOperand(i: 1).getReg();
2591
2592 const LLT DstTy = MRI->getType(Reg: DstReg);
2593 const LLT SrcTy = MRI->getType(Reg: SrcReg);
2594 const unsigned SrcSize = I.getOpcode() == AMDGPU::G_SEXT_INREG ?
2595 I.getOperand(i: 2).getImm() : SrcTy.getSizeInBits();
2596 const unsigned DstSize = DstTy.getSizeInBits();
2597 if (!DstTy.isScalar())
2598 return false;
2599
2600 // Artifact casts should never use vcc.
2601 const RegisterBank *SrcBank = getArtifactRegBank(Reg: SrcReg, MRI: *MRI, TRI);
2602
2603 // FIXME: This should probably be illegal and split earlier.
2604 if (I.getOpcode() == AMDGPU::G_ANYEXT) {
2605 if (DstSize <= 32)
2606 return selectCOPY(I);
2607
2608 const TargetRegisterClass *SrcRC =
2609 TRI.getRegClassForTypeOnBank(Ty: SrcTy, Bank: *SrcBank);
2610 const RegisterBank *DstBank = RBI.getRegBank(Reg: DstReg, MRI: *MRI, TRI);
2611 const TargetRegisterClass *DstRC =
2612 TRI.getRegClassForSizeOnBank(Size: DstSize, Bank: *DstBank);
2613
2614 Register UndefReg = MRI->createVirtualRegister(RegClass: SrcRC);
2615 BuildMI(BB&: MBB, I, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::IMPLICIT_DEF), DestReg: UndefReg);
2616 BuildMI(BB&: MBB, I, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::REG_SEQUENCE), DestReg: DstReg)
2617 .addReg(RegNo: SrcReg)
2618 .addImm(Val: AMDGPU::sub0)
2619 .addReg(RegNo: UndefReg)
2620 .addImm(Val: AMDGPU::sub1);
2621 I.eraseFromParent();
2622
2623 return RBI.constrainGenericRegister(Reg: DstReg, RC: *DstRC, MRI&: *MRI) &&
2624 RBI.constrainGenericRegister(Reg: SrcReg, RC: *SrcRC, MRI&: *MRI);
2625 }
2626
2627 if (SrcBank->getID() == AMDGPU::VGPRRegBankID && DstSize <= 32) {
2628 // 64-bit should have been split up in RegBankSelect
2629
2630 // Try to use an and with a mask if it will save code size.
2631 unsigned Mask;
2632 if (!Signed && shouldUseAndMask(Size: SrcSize, Mask)) {
2633 MachineInstr *ExtI =
2634 BuildMI(BB&: MBB, I, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::V_AND_B32_e32), DestReg: DstReg)
2635 .addImm(Val: Mask)
2636 .addReg(RegNo: SrcReg);
2637 I.eraseFromParent();
2638 return constrainSelectedInstRegOperands(I&: *ExtI, TII, TRI, RBI);
2639 }
2640
2641 const unsigned BFE = Signed ? AMDGPU::V_BFE_I32_e64 : AMDGPU::V_BFE_U32_e64;
2642 MachineInstr *ExtI =
2643 BuildMI(BB&: MBB, I, MIMD: DL, MCID: TII.get(Opcode: BFE), DestReg: DstReg)
2644 .addReg(RegNo: SrcReg)
2645 .addImm(Val: 0) // Offset
2646 .addImm(Val: SrcSize); // Width
2647 I.eraseFromParent();
2648 return constrainSelectedInstRegOperands(I&: *ExtI, TII, TRI, RBI);
2649 }
2650
2651 if (SrcBank->getID() == AMDGPU::SGPRRegBankID && DstSize <= 64) {
2652 const TargetRegisterClass &SrcRC = InReg && DstSize > 32 ?
2653 AMDGPU::SReg_64RegClass : AMDGPU::SReg_32RegClass;
2654 if (!RBI.constrainGenericRegister(Reg: SrcReg, RC: SrcRC, MRI&: *MRI))
2655 return false;
2656
2657 if (Signed && DstSize == 32 && (SrcSize == 8 || SrcSize == 16)) {
2658 const unsigned SextOpc = SrcSize == 8 ?
2659 AMDGPU::S_SEXT_I32_I8 : AMDGPU::S_SEXT_I32_I16;
2660 BuildMI(BB&: MBB, I, MIMD: DL, MCID: TII.get(Opcode: SextOpc), DestReg: DstReg)
2661 .addReg(RegNo: SrcReg);
2662 I.eraseFromParent();
2663 return RBI.constrainGenericRegister(Reg: DstReg, RC: AMDGPU::SReg_32RegClass, MRI&: *MRI);
2664 }
2665
2666 // Using a single 32-bit SALU to calculate the high half is smaller than
2667 // S_BFE with a literal constant operand.
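    // That is, the high half is S_ASHR_I32(Src, 31) for a signed extend and
    // simply 0 for an unsigned one.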
2668 if (DstSize > 32 && SrcSize == 32) {
2669 Register HiReg = MRI->createVirtualRegister(RegClass: &AMDGPU::SReg_32RegClass);
2670 unsigned SubReg = InReg ? AMDGPU::sub0 : AMDGPU::NoSubRegister;
2671 if (Signed) {
2672 BuildMI(BB&: MBB, I, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::S_ASHR_I32), DestReg: HiReg)
2673 .addReg(RegNo: SrcReg, flags: 0, SubReg)
2674 .addImm(Val: 31)
2675 .setOperandDead(3); // Dead scc
2676 } else {
2677 BuildMI(BB&: MBB, I, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::S_MOV_B32), DestReg: HiReg)
2678 .addImm(Val: 0);
2679 }
2680 BuildMI(BB&: MBB, I, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::REG_SEQUENCE), DestReg: DstReg)
2681 .addReg(RegNo: SrcReg, flags: 0, SubReg)
2682 .addImm(Val: AMDGPU::sub0)
2683 .addReg(RegNo: HiReg)
2684 .addImm(Val: AMDGPU::sub1);
2685 I.eraseFromParent();
2686 return RBI.constrainGenericRegister(Reg: DstReg, RC: AMDGPU::SReg_64RegClass,
2687 MRI&: *MRI);
2688 }
2689
2690 const unsigned BFE64 = Signed ? AMDGPU::S_BFE_I64 : AMDGPU::S_BFE_U64;
2691 const unsigned BFE32 = Signed ? AMDGPU::S_BFE_I32 : AMDGPU::S_BFE_U32;
2692
    // Scalar BFE is encoded as S1[5:0] = offset, S1[22:16] = width.
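    // For example, a 16-bit source extend encodes SrcSize << 16 == 0x100000,
    // i.e. offset 0 and width 16.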
2694 if (DstSize > 32 && (SrcSize <= 32 || InReg)) {
2695 // We need a 64-bit register source, but the high bits don't matter.
2696 Register ExtReg = MRI->createVirtualRegister(RegClass: &AMDGPU::SReg_64RegClass);
2697 Register UndefReg = MRI->createVirtualRegister(RegClass: &AMDGPU::SReg_32RegClass);
2698 unsigned SubReg = InReg ? AMDGPU::sub0 : AMDGPU::NoSubRegister;
2699
2700 BuildMI(BB&: MBB, I, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::IMPLICIT_DEF), DestReg: UndefReg);
2701 BuildMI(BB&: MBB, I, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::REG_SEQUENCE), DestReg: ExtReg)
2702 .addReg(RegNo: SrcReg, flags: 0, SubReg)
2703 .addImm(Val: AMDGPU::sub0)
2704 .addReg(RegNo: UndefReg)
2705 .addImm(Val: AMDGPU::sub1);
2706
2707 BuildMI(BB&: MBB, I, MIMD: DL, MCID: TII.get(Opcode: BFE64), DestReg: DstReg)
2708 .addReg(RegNo: ExtReg)
2709 .addImm(Val: SrcSize << 16);
2710
2711 I.eraseFromParent();
2712 return RBI.constrainGenericRegister(Reg: DstReg, RC: AMDGPU::SReg_64RegClass, MRI&: *MRI);
2713 }
2714
2715 unsigned Mask;
2716 if (!Signed && shouldUseAndMask(Size: SrcSize, Mask)) {
2717 BuildMI(BB&: MBB, I, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::S_AND_B32), DestReg: DstReg)
2718 .addReg(RegNo: SrcReg)
2719 .addImm(Val: Mask)
2720 .setOperandDead(3); // Dead scc
2721 } else {
2722 BuildMI(BB&: MBB, I, MIMD: DL, MCID: TII.get(Opcode: BFE32), DestReg: DstReg)
2723 .addReg(RegNo: SrcReg)
2724 .addImm(Val: SrcSize << 16);
2725 }
2726
2727 I.eraseFromParent();
2728 return RBI.constrainGenericRegister(Reg: DstReg, RC: AMDGPU::SReg_32RegClass, MRI&: *MRI);
2729 }
2730
2731 return false;
2732}
2733
2734static Register stripCopy(Register Reg, MachineRegisterInfo &MRI) {
2735 return getDefSrcRegIgnoringCopies(Reg, MRI)->Reg;
2736}
2737
2738static Register stripBitCast(Register Reg, MachineRegisterInfo &MRI) {
2739 Register BitcastSrc;
2740 if (mi_match(R: Reg, MRI, P: m_GBitcast(Src: m_Reg(R&: BitcastSrc))))
2741 Reg = BitcastSrc;
2742 return Reg;
2743}
2744
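// Match the patterns that extract the high 16-bit half of a 32-bit value:
// a G_TRUNC of (lshr x, 16), looking through copies and bitcasts, or a
// <2 x s16> shuffle whose first lane selects element 1. On success, Out is
// set to the 32-bit source register holding the packed halves.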
2745static bool isExtractHiElt(MachineRegisterInfo &MRI, Register In,
2746 Register &Out) {
2747 Register Trunc;
2748 if (!mi_match(R: In, MRI, P: m_GTrunc(Src: m_Reg(R&: Trunc))))
2749 return false;
2750
2751 Register LShlSrc;
2752 Register Cst;
2753 if (mi_match(R: Trunc, MRI, P: m_GLShr(L: m_Reg(R&: LShlSrc), R: m_Reg(R&: Cst)))) {
2754 Cst = stripCopy(Reg: Cst, MRI);
2755 if (mi_match(R: Cst, MRI, P: m_SpecificICst(RequestedValue: 16))) {
2756 Out = stripBitCast(Reg: LShlSrc, MRI);
2757 return true;
2758 }
2759 }
2760
2761 MachineInstr *Shuffle = MRI.getVRegDef(Reg: Trunc);
2762 if (Shuffle->getOpcode() != AMDGPU::G_SHUFFLE_VECTOR)
2763 return false;
2764
2765 assert(MRI.getType(Shuffle->getOperand(0).getReg()) ==
2766 LLT::fixed_vector(2, 16));
2767
2768 ArrayRef<int> Mask = Shuffle->getOperand(i: 3).getShuffleMask();
2769 assert(Mask.size() == 2);
2770
2771 if (Mask[0] == 1 && Mask[1] <= 1) {
2772 Out = Shuffle->getOperand(i: 0).getReg();
2773 return true;
2774 }
2775
2776 return false;
2777}
2778
2779bool AMDGPUInstructionSelector::selectG_FPEXT(MachineInstr &I) const {
2780 if (!Subtarget->hasSALUFloatInsts())
2781 return false;
2782
2783 Register Dst = I.getOperand(i: 0).getReg();
2784 const RegisterBank *DstRB = RBI.getRegBank(Reg: Dst, MRI: *MRI, TRI);
2785 if (DstRB->getID() != AMDGPU::SGPRRegBankID)
2786 return false;
2787
2788 Register Src = I.getOperand(i: 1).getReg();
2789
2790 if (MRI->getType(Reg: Dst) == LLT::scalar(SizeInBits: 32) &&
2791 MRI->getType(Reg: Src) == LLT::scalar(SizeInBits: 16)) {
2792 if (isExtractHiElt(MRI&: *MRI, In: Src, Out&: Src)) {
2793 MachineBasicBlock *BB = I.getParent();
2794 BuildMI(BB&: *BB, I: &I, MIMD: I.getDebugLoc(), MCID: TII.get(Opcode: AMDGPU::S_CVT_HI_F32_F16), DestReg: Dst)
2795 .addUse(RegNo: Src);
2796 I.eraseFromParent();
2797 return RBI.constrainGenericRegister(Reg: Dst, RC: AMDGPU::SReg_32RegClass, MRI&: *MRI);
2798 }
2799 }
2800
2801 return false;
2802}
2803
2804bool AMDGPUInstructionSelector::selectG_FNEG(MachineInstr &MI) const {
2805 // Only manually handle the f64 SGPR case.
2806 //
2807 // FIXME: This is a workaround for 2.5 different tablegen problems. Because
2808 // the bit ops theoretically have a second result due to the implicit def of
2809 // SCC, the GlobalISelEmitter is overly conservative and rejects it. Fixing
2810 // that is easy by disabling the check. The result works, but uses a
2811 // nonsensical sreg32orlds_and_sreg_1 regclass.
2812 //
2813 // The DAG emitter is more problematic, and incorrectly adds both S_XOR_B32 to
2814 // the variadic REG_SEQUENCE operands.
2815
2816 Register Dst = MI.getOperand(i: 0).getReg();
2817 const RegisterBank *DstRB = RBI.getRegBank(Reg: Dst, MRI: *MRI, TRI);
2818 if (DstRB->getID() != AMDGPU::SGPRRegBankID ||
2819 MRI->getType(Reg: Dst) != LLT::scalar(SizeInBits: 64))
2820 return false;
2821
2822 Register Src = MI.getOperand(i: 1).getReg();
2823 MachineInstr *Fabs = getOpcodeDef(Opcode: TargetOpcode::G_FABS, Reg: Src, MRI: *MRI);
2824 if (Fabs)
2825 Src = Fabs->getOperand(i: 1).getReg();
2826
2827 if (!RBI.constrainGenericRegister(Reg: Src, RC: AMDGPU::SReg_64RegClass, MRI&: *MRI) ||
2828 !RBI.constrainGenericRegister(Reg: Dst, RC: AMDGPU::SReg_64RegClass, MRI&: *MRI))
2829 return false;
2830
2831 MachineBasicBlock *BB = MI.getParent();
2832 const DebugLoc &DL = MI.getDebugLoc();
2833 Register LoReg = MRI->createVirtualRegister(RegClass: &AMDGPU::SReg_32RegClass);
2834 Register HiReg = MRI->createVirtualRegister(RegClass: &AMDGPU::SReg_32RegClass);
2835 Register ConstReg = MRI->createVirtualRegister(RegClass: &AMDGPU::SReg_32RegClass);
2836 Register OpReg = MRI->createVirtualRegister(RegClass: &AMDGPU::SReg_32RegClass);
2837
2838 BuildMI(BB&: *BB, I: &MI, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::COPY), DestReg: LoReg)
2839 .addReg(RegNo: Src, flags: 0, SubReg: AMDGPU::sub0);
2840 BuildMI(BB&: *BB, I: &MI, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::COPY), DestReg: HiReg)
2841 .addReg(RegNo: Src, flags: 0, SubReg: AMDGPU::sub1);
2842 BuildMI(BB&: *BB, I: &MI, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::S_MOV_B32), DestReg: ConstReg)
2843 .addImm(Val: 0x80000000);
2844
2845 // Set or toggle sign bit.
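  // fneg flips bit 63 (S_XOR_B32 of the high half with 0x80000000), while a
  // folded fneg(fabs(x)) must force it to 1, hence S_OR_B32.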
2846 unsigned Opc = Fabs ? AMDGPU::S_OR_B32 : AMDGPU::S_XOR_B32;
2847 BuildMI(BB&: *BB, I: &MI, MIMD: DL, MCID: TII.get(Opcode: Opc), DestReg: OpReg)
2848 .addReg(RegNo: HiReg)
2849 .addReg(RegNo: ConstReg)
2850 .setOperandDead(3); // Dead scc
2851 BuildMI(BB&: *BB, I: &MI, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::REG_SEQUENCE), DestReg: Dst)
2852 .addReg(RegNo: LoReg)
2853 .addImm(Val: AMDGPU::sub0)
2854 .addReg(RegNo: OpReg)
2855 .addImm(Val: AMDGPU::sub1);
2856 MI.eraseFromParent();
2857 return true;
2858}
2859
2860// FIXME: This is a workaround for the same tablegen problems as G_FNEG
2861bool AMDGPUInstructionSelector::selectG_FABS(MachineInstr &MI) const {
2862 Register Dst = MI.getOperand(i: 0).getReg();
2863 const RegisterBank *DstRB = RBI.getRegBank(Reg: Dst, MRI: *MRI, TRI);
2864 if (DstRB->getID() != AMDGPU::SGPRRegBankID ||
2865 MRI->getType(Reg: Dst) != LLT::scalar(SizeInBits: 64))
2866 return false;
2867
2868 Register Src = MI.getOperand(i: 1).getReg();
2869 MachineBasicBlock *BB = MI.getParent();
2870 const DebugLoc &DL = MI.getDebugLoc();
2871 Register LoReg = MRI->createVirtualRegister(RegClass: &AMDGPU::SReg_32RegClass);
2872 Register HiReg = MRI->createVirtualRegister(RegClass: &AMDGPU::SReg_32RegClass);
2873 Register ConstReg = MRI->createVirtualRegister(RegClass: &AMDGPU::SReg_32RegClass);
2874 Register OpReg = MRI->createVirtualRegister(RegClass: &AMDGPU::SReg_32RegClass);
2875
2876 if (!RBI.constrainGenericRegister(Reg: Src, RC: AMDGPU::SReg_64RegClass, MRI&: *MRI) ||
2877 !RBI.constrainGenericRegister(Reg: Dst, RC: AMDGPU::SReg_64RegClass, MRI&: *MRI))
2878 return false;
2879
2880 BuildMI(BB&: *BB, I: &MI, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::COPY), DestReg: LoReg)
2881 .addReg(RegNo: Src, flags: 0, SubReg: AMDGPU::sub0);
2882 BuildMI(BB&: *BB, I: &MI, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::COPY), DestReg: HiReg)
2883 .addReg(RegNo: Src, flags: 0, SubReg: AMDGPU::sub1);
2884 BuildMI(BB&: *BB, I: &MI, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::S_MOV_B32), DestReg: ConstReg)
2885 .addImm(Val: 0x7fffffff);
2886
2887 // Clear sign bit.
2888   // TODO: Should this use S_BITSET0_*?
2889 BuildMI(BB&: *BB, I: &MI, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::S_AND_B32), DestReg: OpReg)
2890 .addReg(RegNo: HiReg)
2891 .addReg(RegNo: ConstReg)
2892 .setOperandDead(3); // Dead scc
2893 BuildMI(BB&: *BB, I: &MI, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::REG_SEQUENCE), DestReg: Dst)
2894 .addReg(RegNo: LoReg)
2895 .addImm(Val: AMDGPU::sub0)
2896 .addReg(RegNo: OpReg)
2897 .addImm(Val: AMDGPU::sub1);
2898
2899 MI.eraseFromParent();
2900 return true;
2901}
2902
2903static bool isConstant(const MachineInstr &MI) {
2904 return MI.getOpcode() == TargetOpcode::G_CONSTANT;
2905}
2906
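// Collect addressing-mode information by walking the chain of G_PTR_ADDs
// feeding the load's pointer operand. Each step records the constant immediate
// offset (when the RHS is a G_CONSTANT) and partitions the remaining address
// operands into SGPR and VGPR parts.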
2907void AMDGPUInstructionSelector::getAddrModeInfo(const MachineInstr &Load,
2908 const MachineRegisterInfo &MRI, SmallVectorImpl<GEPInfo> &AddrInfo) const {
2909
2910 unsigned OpNo = Load.getOpcode() == AMDGPU::G_PREFETCH ? 0 : 1;
2911 const MachineInstr *PtrMI =
2912 MRI.getUniqueVRegDef(Reg: Load.getOperand(i: OpNo).getReg());
2913
2914 assert(PtrMI);
2915
2916 if (PtrMI->getOpcode() != TargetOpcode::G_PTR_ADD)
2917 return;
2918
2919 GEPInfo GEPInfo;
2920
2921 for (unsigned i = 1; i != 3; ++i) {
2922 const MachineOperand &GEPOp = PtrMI->getOperand(i);
2923 const MachineInstr *OpDef = MRI.getUniqueVRegDef(Reg: GEPOp.getReg());
2924 assert(OpDef);
2925 if (i == 2 && isConstant(MI: *OpDef)) {
2926 // TODO: Could handle constant base + variable offset, but a combine
2927 // probably should have commuted it.
2928 assert(GEPInfo.Imm == 0);
2929 GEPInfo.Imm = OpDef->getOperand(i: 1).getCImm()->getSExtValue();
2930 continue;
2931 }
2932 const RegisterBank *OpBank = RBI.getRegBank(Reg: GEPOp.getReg(), MRI, TRI);
2933 if (OpBank->getID() == AMDGPU::SGPRRegBankID)
2934 GEPInfo.SgprParts.push_back(Elt: GEPOp.getReg());
2935 else
2936 GEPInfo.VgprParts.push_back(Elt: GEPOp.getReg());
2937 }
2938
2939 AddrInfo.push_back(Elt: GEPInfo);
2940 getAddrModeInfo(Load: *PtrMI, MRI, AddrInfo);
2941}
2942
2943bool AMDGPUInstructionSelector::isSGPR(Register Reg) const {
2944 return RBI.getRegBank(Reg, MRI: *MRI, TRI)->getID() == AMDGPU::SGPRRegBankID;
2945}
2946
2947bool AMDGPUInstructionSelector::isInstrUniform(const MachineInstr &MI) const {
2948 if (!MI.hasOneMemOperand())
2949 return false;
2950
2951 const MachineMemOperand *MMO = *MI.memoperands_begin();
2952 const Value *Ptr = MMO->getValue();
2953
2954 // UndefValue means this is a load of a kernel input. These are uniform.
2955 // Sometimes LDS instructions have constant pointers.
2956 // If Ptr is null, then that means this mem operand contains a
2957 // PseudoSourceValue like GOT.
2958 if (!Ptr || isa<UndefValue, Argument, Constant, GlobalValue>(Val: Ptr))
2959 return true;
2960
2961 if (MMO->getAddrSpace() == AMDGPUAS::CONSTANT_ADDRESS_32BIT)
2962 return true;
2963
2964 if (MI.getOpcode() == AMDGPU::G_PREFETCH)
2965 return RBI.getRegBank(Reg: MI.getOperand(i: 0).getReg(), MRI: *MRI, TRI)->getID() ==
2966 AMDGPU::SGPRRegBankID;
2967
2968 const Instruction *I = dyn_cast<Instruction>(Val: Ptr);
2969 return I && I->getMetadata(Kind: "amdgpu.uniform");
2970}
2971
2972bool AMDGPUInstructionSelector::hasVgprParts(ArrayRef<GEPInfo> AddrInfo) const {
2973 for (const GEPInfo &GEPInfo : AddrInfo) {
2974 if (!GEPInfo.VgprParts.empty())
2975 return true;
2976 }
2977 return false;
2978}
2979
2980void AMDGPUInstructionSelector::initM0(MachineInstr &I) const {
2981 const LLT PtrTy = MRI->getType(Reg: I.getOperand(i: 1).getReg());
2982 unsigned AS = PtrTy.getAddressSpace();
2983 if ((AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) &&
2984 STI.ldsRequiresM0Init()) {
2985 MachineBasicBlock *BB = I.getParent();
2986
2987 // If DS instructions require M0 initialization, insert it before selecting.
2988 BuildMI(BB&: *BB, I: &I, MIMD: I.getDebugLoc(), MCID: TII.get(Opcode: AMDGPU::S_MOV_B32), DestReg: AMDGPU::M0)
2989 .addImm(Val: -1);
2990 }
2991}
2992
2993bool AMDGPUInstructionSelector::selectG_LOAD_STORE_ATOMICRMW(
2994 MachineInstr &I) const {
2995 initM0(I);
2996 return selectImpl(I, CoverageInfo&: *CoverageInfo);
2997}
2998
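// Return true if the given register is known to hold the result of a VALU
// comparison (G_ICMP/G_FCMP or an amdgcn.class intrinsic), possibly behind
// copies or bitwise combinations of such results. selectG_BRCOND uses this to
// decide whether the branch condition still needs an AND with exec.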
2999static bool isVCmpResult(Register Reg, MachineRegisterInfo &MRI) {
3000 if (Reg.isPhysical())
3001 return false;
3002
3003 MachineInstr &MI = *MRI.getUniqueVRegDef(Reg);
3004 const unsigned Opcode = MI.getOpcode();
3005
3006 if (Opcode == AMDGPU::COPY)
3007 return isVCmpResult(Reg: MI.getOperand(i: 1).getReg(), MRI);
3008
3009 if (Opcode == AMDGPU::G_AND || Opcode == AMDGPU::G_OR ||
3010 Opcode == AMDGPU::G_XOR)
3011 return isVCmpResult(Reg: MI.getOperand(i: 1).getReg(), MRI) &&
3012 isVCmpResult(Reg: MI.getOperand(i: 2).getReg(), MRI);
3013
3014 if (auto *GI = dyn_cast<GIntrinsic>(Val: &MI))
3015 return GI->is(ID: Intrinsic::amdgcn_class);
3016
3017 return Opcode == AMDGPU::G_ICMP || Opcode == AMDGPU::G_FCMP;
3018}
3019
3020bool AMDGPUInstructionSelector::selectG_BRCOND(MachineInstr &I) const {
3021 MachineBasicBlock *BB = I.getParent();
3022 MachineOperand &CondOp = I.getOperand(i: 0);
3023 Register CondReg = CondOp.getReg();
3024 const DebugLoc &DL = I.getDebugLoc();
3025
3026 unsigned BrOpcode;
3027 Register CondPhysReg;
3028 const TargetRegisterClass *ConstrainRC;
3029
3030 // In SelectionDAG, we inspect the IR block for uniformity metadata to decide
3031 // whether the branch is uniform when selecting the instruction. In
3032 // GlobalISel, we should push that decision into RegBankSelect. Assume for now
3033 // RegBankSelect knows what it's doing if the branch condition is scc, even
3034 // though it currently does not.
3035 if (!isVCC(Reg: CondReg, MRI: *MRI)) {
3036 if (MRI->getType(Reg: CondReg) != LLT::scalar(SizeInBits: 32))
3037 return false;
3038
3039 CondPhysReg = AMDGPU::SCC;
3040 BrOpcode = AMDGPU::S_CBRANCH_SCC1;
3041 ConstrainRC = &AMDGPU::SReg_32RegClass;
3042 } else {
3043     // FIXME: Should scc->vcc copies be ANDed with exec?
3044
3045     // Unless the value of CondReg is a result of a V_CMP* instruction, we need
3046     // to insert an AND with exec.
3047 if (!isVCmpResult(Reg: CondReg, MRI&: *MRI)) {
3048 const bool Is64 = STI.isWave64();
3049 const unsigned Opcode = Is64 ? AMDGPU::S_AND_B64 : AMDGPU::S_AND_B32;
3050 const Register Exec = Is64 ? AMDGPU::EXEC : AMDGPU::EXEC_LO;
3051
3052 Register TmpReg = MRI->createVirtualRegister(RegClass: TRI.getBoolRC());
3053 BuildMI(BB&: *BB, I: &I, MIMD: DL, MCID: TII.get(Opcode), DestReg: TmpReg)
3054 .addReg(RegNo: CondReg)
3055 .addReg(RegNo: Exec)
3056 .setOperandDead(3); // Dead scc
3057 CondReg = TmpReg;
3058 }
3059
3060 CondPhysReg = TRI.getVCC();
3061 BrOpcode = AMDGPU::S_CBRANCH_VCCNZ;
3062 ConstrainRC = TRI.getBoolRC();
3063 }
3064
3065 if (!MRI->getRegClassOrNull(Reg: CondReg))
3066 MRI->setRegClass(Reg: CondReg, RC: ConstrainRC);
3067
3068 BuildMI(BB&: *BB, I: &I, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::COPY), DestReg: CondPhysReg)
3069 .addReg(RegNo: CondReg);
3070 BuildMI(BB&: *BB, I: &I, MIMD: DL, MCID: TII.get(Opcode: BrOpcode))
3071 .addMBB(MBB: I.getOperand(i: 1).getMBB());
3072
3073 I.eraseFromParent();
3074 return true;
3075}
3076
3077bool AMDGPUInstructionSelector::selectG_GLOBAL_VALUE(
3078 MachineInstr &I) const {
3079 Register DstReg = I.getOperand(i: 0).getReg();
3080 const RegisterBank *DstRB = RBI.getRegBank(Reg: DstReg, MRI: *MRI, TRI);
3081 const bool IsVGPR = DstRB->getID() == AMDGPU::VGPRRegBankID;
3082 I.setDesc(TII.get(Opcode: IsVGPR ? AMDGPU::V_MOV_B32_e32 : AMDGPU::S_MOV_B32));
3083 if (IsVGPR)
3084 I.addOperand(MF&: *MF, Op: MachineOperand::CreateReg(Reg: AMDGPU::EXEC, isDef: false, isImp: true));
3085
3086 return RBI.constrainGenericRegister(
3087 Reg: DstReg, RC: IsVGPR ? AMDGPU::VGPR_32RegClass : AMDGPU::SReg_32RegClass, MRI&: *MRI);
3088}
3089
3090bool AMDGPUInstructionSelector::selectG_PTRMASK(MachineInstr &I) const {
3091 Register DstReg = I.getOperand(i: 0).getReg();
3092 Register SrcReg = I.getOperand(i: 1).getReg();
3093 Register MaskReg = I.getOperand(i: 2).getReg();
3094 LLT Ty = MRI->getType(Reg: DstReg);
3095 LLT MaskTy = MRI->getType(Reg: MaskReg);
3096 MachineBasicBlock *BB = I.getParent();
3097 const DebugLoc &DL = I.getDebugLoc();
3098
3099 const RegisterBank *DstRB = RBI.getRegBank(Reg: DstReg, MRI: *MRI, TRI);
3100 const RegisterBank *SrcRB = RBI.getRegBank(Reg: SrcReg, MRI: *MRI, TRI);
3101 const RegisterBank *MaskRB = RBI.getRegBank(Reg: MaskReg, MRI: *MRI, TRI);
3102 const bool IsVGPR = DstRB->getID() == AMDGPU::VGPRRegBankID;
3103   if (DstRB != SrcRB) // Should only happen for hand-written MIR.
3104 return false;
3105
3106 // Try to avoid emitting a bit operation when we only need to touch half of
3107 // the 64-bit pointer.
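  // For example, an alignment mask such as 0xFFFFFFFFFFFFF000 has all ones in
  // its high half, so the high half of the result is a plain copy and only the
  // low half needs an AND.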
3108 APInt MaskOnes = VT->getKnownOnes(R: MaskReg).zext(width: 64);
3109 const APInt MaskHi32 = APInt::getHighBitsSet(numBits: 64, hiBitsSet: 32);
3110 const APInt MaskLo32 = APInt::getLowBitsSet(numBits: 64, loBitsSet: 32);
3111
3112 const bool CanCopyLow32 = (MaskOnes & MaskLo32) == MaskLo32;
3113 const bool CanCopyHi32 = (MaskOnes & MaskHi32) == MaskHi32;
3114
3115 if (!IsVGPR && Ty.getSizeInBits() == 64 &&
3116 !CanCopyLow32 && !CanCopyHi32) {
3117 auto MIB = BuildMI(BB&: *BB, I: &I, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::S_AND_B64), DestReg: DstReg)
3118 .addReg(RegNo: SrcReg)
3119 .addReg(RegNo: MaskReg)
3120 .setOperandDead(3); // Dead scc
3121 I.eraseFromParent();
3122 return constrainSelectedInstRegOperands(I&: *MIB, TII, TRI, RBI);
3123 }
3124
3125 unsigned NewOpc = IsVGPR ? AMDGPU::V_AND_B32_e64 : AMDGPU::S_AND_B32;
3126 const TargetRegisterClass &RegRC
3127 = IsVGPR ? AMDGPU::VGPR_32RegClass : AMDGPU::SReg_32RegClass;
3128
3129 const TargetRegisterClass *DstRC = TRI.getRegClassForTypeOnBank(Ty, Bank: *DstRB);
3130 const TargetRegisterClass *SrcRC = TRI.getRegClassForTypeOnBank(Ty, Bank: *SrcRB);
3131 const TargetRegisterClass *MaskRC =
3132 TRI.getRegClassForTypeOnBank(Ty: MaskTy, Bank: *MaskRB);
3133
3134 if (!RBI.constrainGenericRegister(Reg: DstReg, RC: *DstRC, MRI&: *MRI) ||
3135 !RBI.constrainGenericRegister(Reg: SrcReg, RC: *SrcRC, MRI&: *MRI) ||
3136 !RBI.constrainGenericRegister(Reg: MaskReg, RC: *MaskRC, MRI&: *MRI))
3137 return false;
3138
3139 if (Ty.getSizeInBits() == 32) {
3140 assert(MaskTy.getSizeInBits() == 32 &&
3141 "ptrmask should have been narrowed during legalize");
3142
3143 auto NewOp = BuildMI(BB&: *BB, I: &I, MIMD: DL, MCID: TII.get(Opcode: NewOpc), DestReg: DstReg)
3144 .addReg(RegNo: SrcReg)
3145 .addReg(RegNo: MaskReg);
3146
3147 if (!IsVGPR)
3148 NewOp.setOperandDead(3); // Dead scc
3149 I.eraseFromParent();
3150 return true;
3151 }
3152
3153 Register HiReg = MRI->createVirtualRegister(RegClass: &RegRC);
3154 Register LoReg = MRI->createVirtualRegister(RegClass: &RegRC);
3155
3156 // Extract the subregisters from the source pointer.
3157 BuildMI(BB&: *BB, I: &I, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::COPY), DestReg: LoReg)
3158 .addReg(RegNo: SrcReg, flags: 0, SubReg: AMDGPU::sub0);
3159 BuildMI(BB&: *BB, I: &I, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::COPY), DestReg: HiReg)
3160 .addReg(RegNo: SrcReg, flags: 0, SubReg: AMDGPU::sub1);
3161
3162 Register MaskedLo, MaskedHi;
3163
3164 if (CanCopyLow32) {
3165 // If all the bits in the low half are 1, we only need a copy for it.
3166 MaskedLo = LoReg;
3167 } else {
3168 // Extract the mask subregister and apply the and.
3169 Register MaskLo = MRI->createVirtualRegister(RegClass: &RegRC);
3170 MaskedLo = MRI->createVirtualRegister(RegClass: &RegRC);
3171
3172 BuildMI(BB&: *BB, I: &I, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::COPY), DestReg: MaskLo)
3173 .addReg(RegNo: MaskReg, flags: 0, SubReg: AMDGPU::sub0);
3174 BuildMI(BB&: *BB, I: &I, MIMD: DL, MCID: TII.get(Opcode: NewOpc), DestReg: MaskedLo)
3175 .addReg(RegNo: LoReg)
3176 .addReg(RegNo: MaskLo);
3177 }
3178
3179 if (CanCopyHi32) {
3180 // If all the bits in the high half are 1, we only need a copy for it.
3181 MaskedHi = HiReg;
3182 } else {
3183 Register MaskHi = MRI->createVirtualRegister(RegClass: &RegRC);
3184 MaskedHi = MRI->createVirtualRegister(RegClass: &RegRC);
3185
3186 BuildMI(BB&: *BB, I: &I, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::COPY), DestReg: MaskHi)
3187 .addReg(RegNo: MaskReg, flags: 0, SubReg: AMDGPU::sub1);
3188 BuildMI(BB&: *BB, I: &I, MIMD: DL, MCID: TII.get(Opcode: NewOpc), DestReg: MaskedHi)
3189 .addReg(RegNo: HiReg)
3190 .addReg(RegNo: MaskHi);
3191 }
3192
3193 BuildMI(BB&: *BB, I: &I, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::REG_SEQUENCE), DestReg: DstReg)
3194 .addReg(RegNo: MaskedLo)
3195 .addImm(Val: AMDGPU::sub0)
3196 .addReg(RegNo: MaskedHi)
3197 .addImm(Val: AMDGPU::sub1);
3198 I.eraseFromParent();
3199 return true;
3200}
3201
3202/// Return the register to use for the index value, and the subregister to use
3203/// for the indirectly accessed register.
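/// For example, with 32-bit elements and IdxReg = %base + 2 this returns
/// {%base, sub2}; a constant offset past the end of the register falls back to
/// {IdxReg, sub0} so we never name an out-of-range subregister.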
3204static std::pair<Register, unsigned>
3205computeIndirectRegIndex(MachineRegisterInfo &MRI, const SIRegisterInfo &TRI,
3206 const TargetRegisterClass *SuperRC, Register IdxReg,
3207 unsigned EltSize, GISelValueTracking &ValueTracking) {
3208 Register IdxBaseReg;
3209 int Offset;
3210
3211 std::tie(args&: IdxBaseReg, args&: Offset) =
3212 AMDGPU::getBaseWithConstantOffset(MRI, Reg: IdxReg, ValueTracking: &ValueTracking);
3213 if (IdxBaseReg == AMDGPU::NoRegister) {
3214 // This will happen if the index is a known constant. This should ordinarily
3215 // be legalized out, but handle it as a register just in case.
3216 assert(Offset == 0);
3217 IdxBaseReg = IdxReg;
3218 }
3219
3220 ArrayRef<int16_t> SubRegs = TRI.getRegSplitParts(RC: SuperRC, EltSize);
3221
3222 // Skip out of bounds offsets, or else we would end up using an undefined
3223 // register.
3224 if (static_cast<unsigned>(Offset) >= SubRegs.size())
3225 return std::pair(IdxReg, SubRegs[0]);
3226 return std::pair(IdxBaseReg, SubRegs[Offset]);
3227}
3228
3229bool AMDGPUInstructionSelector::selectG_EXTRACT_VECTOR_ELT(
3230 MachineInstr &MI) const {
3231 Register DstReg = MI.getOperand(i: 0).getReg();
3232 Register SrcReg = MI.getOperand(i: 1).getReg();
3233 Register IdxReg = MI.getOperand(i: 2).getReg();
3234
3235 LLT DstTy = MRI->getType(Reg: DstReg);
3236 LLT SrcTy = MRI->getType(Reg: SrcReg);
3237
3238 const RegisterBank *DstRB = RBI.getRegBank(Reg: DstReg, MRI: *MRI, TRI);
3239 const RegisterBank *SrcRB = RBI.getRegBank(Reg: SrcReg, MRI: *MRI, TRI);
3240 const RegisterBank *IdxRB = RBI.getRegBank(Reg: IdxReg, MRI: *MRI, TRI);
3241
3242   // The index must be scalar. If it wasn't, RegBankSelect should have moved
3243   // this into a waterfall loop.
3244 if (IdxRB->getID() != AMDGPU::SGPRRegBankID)
3245 return false;
3246
3247 const TargetRegisterClass *SrcRC =
3248 TRI.getRegClassForTypeOnBank(Ty: SrcTy, Bank: *SrcRB);
3249 const TargetRegisterClass *DstRC =
3250 TRI.getRegClassForTypeOnBank(Ty: DstTy, Bank: *DstRB);
3251 if (!SrcRC || !DstRC)
3252 return false;
3253 if (!RBI.constrainGenericRegister(Reg: SrcReg, RC: *SrcRC, MRI&: *MRI) ||
3254 !RBI.constrainGenericRegister(Reg: DstReg, RC: *DstRC, MRI&: *MRI) ||
3255 !RBI.constrainGenericRegister(Reg: IdxReg, RC: AMDGPU::SReg_32RegClass, MRI&: *MRI))
3256 return false;
3257
3258 MachineBasicBlock *BB = MI.getParent();
3259 const DebugLoc &DL = MI.getDebugLoc();
3260 const bool Is64 = DstTy.getSizeInBits() == 64;
3261
3262 unsigned SubReg;
3263 std::tie(args&: IdxReg, args&: SubReg) = computeIndirectRegIndex(
3264 MRI&: *MRI, TRI, SuperRC: SrcRC, IdxReg, EltSize: DstTy.getSizeInBits() / 8, ValueTracking&: *VT);
3265
3266 if (SrcRB->getID() == AMDGPU::SGPRRegBankID) {
3267 if (DstTy.getSizeInBits() != 32 && !Is64)
3268 return false;
3269
3270 BuildMI(BB&: *BB, I: &MI, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::COPY), DestReg: AMDGPU::M0)
3271 .addReg(RegNo: IdxReg);
3272
3273 unsigned Opc = Is64 ? AMDGPU::S_MOVRELS_B64 : AMDGPU::S_MOVRELS_B32;
3274 BuildMI(BB&: *BB, I: &MI, MIMD: DL, MCID: TII.get(Opcode: Opc), DestReg: DstReg)
3275 .addReg(RegNo: SrcReg, flags: 0, SubReg)
3276 .addReg(RegNo: SrcReg, flags: RegState::Implicit);
3277 MI.eraseFromParent();
3278 return true;
3279 }
3280
3281 if (SrcRB->getID() != AMDGPU::VGPRRegBankID || DstTy.getSizeInBits() != 32)
3282 return false;
3283
3284 if (!STI.useVGPRIndexMode()) {
3285 BuildMI(BB&: *BB, I: &MI, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::COPY), DestReg: AMDGPU::M0)
3286 .addReg(RegNo: IdxReg);
3287 BuildMI(BB&: *BB, I: &MI, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::V_MOVRELS_B32_e32), DestReg: DstReg)
3288 .addReg(RegNo: SrcReg, flags: 0, SubReg)
3289 .addReg(RegNo: SrcReg, flags: RegState::Implicit);
3290 MI.eraseFromParent();
3291 return true;
3292 }
3293
3294 const MCInstrDesc &GPRIDXDesc =
3295 TII.getIndirectGPRIDXPseudo(VecSize: TRI.getRegSizeInBits(RC: *SrcRC), IsIndirectSrc: true);
3296 BuildMI(BB&: *BB, I&: MI, MIMD: DL, MCID: GPRIDXDesc, DestReg: DstReg)
3297 .addReg(RegNo: SrcReg)
3298 .addReg(RegNo: IdxReg)
3299 .addImm(Val: SubReg);
3300
3301 MI.eraseFromParent();
3302 return true;
3303}
3304
3305// TODO: Fold insert_vector_elt (extract_vector_elt) into movrelsd
3306bool AMDGPUInstructionSelector::selectG_INSERT_VECTOR_ELT(
3307 MachineInstr &MI) const {
3308 Register DstReg = MI.getOperand(i: 0).getReg();
3309 Register VecReg = MI.getOperand(i: 1).getReg();
3310 Register ValReg = MI.getOperand(i: 2).getReg();
3311 Register IdxReg = MI.getOperand(i: 3).getReg();
3312
3313 LLT VecTy = MRI->getType(Reg: DstReg);
3314 LLT ValTy = MRI->getType(Reg: ValReg);
3315 unsigned VecSize = VecTy.getSizeInBits();
3316 unsigned ValSize = ValTy.getSizeInBits();
3317
3318 const RegisterBank *VecRB = RBI.getRegBank(Reg: VecReg, MRI: *MRI, TRI);
3319 const RegisterBank *ValRB = RBI.getRegBank(Reg: ValReg, MRI: *MRI, TRI);
3320 const RegisterBank *IdxRB = RBI.getRegBank(Reg: IdxReg, MRI: *MRI, TRI);
3321
3322 assert(VecTy.getElementType() == ValTy);
3323
3324   // The index must be scalar. If it wasn't, RegBankSelect should have moved
3325   // this into a waterfall loop.
3326 if (IdxRB->getID() != AMDGPU::SGPRRegBankID)
3327 return false;
3328
3329 const TargetRegisterClass *VecRC =
3330 TRI.getRegClassForTypeOnBank(Ty: VecTy, Bank: *VecRB);
3331 const TargetRegisterClass *ValRC =
3332 TRI.getRegClassForTypeOnBank(Ty: ValTy, Bank: *ValRB);
3333
3334 if (!RBI.constrainGenericRegister(Reg: VecReg, RC: *VecRC, MRI&: *MRI) ||
3335 !RBI.constrainGenericRegister(Reg: DstReg, RC: *VecRC, MRI&: *MRI) ||
3336 !RBI.constrainGenericRegister(Reg: ValReg, RC: *ValRC, MRI&: *MRI) ||
3337 !RBI.constrainGenericRegister(Reg: IdxReg, RC: AMDGPU::SReg_32RegClass, MRI&: *MRI))
3338 return false;
3339
3340 if (VecRB->getID() == AMDGPU::VGPRRegBankID && ValSize != 32)
3341 return false;
3342
3343 unsigned SubReg;
3344 std::tie(args&: IdxReg, args&: SubReg) =
3345 computeIndirectRegIndex(MRI&: *MRI, TRI, SuperRC: VecRC, IdxReg, EltSize: ValSize / 8, ValueTracking&: *VT);
3346
3347 const bool IndexMode = VecRB->getID() == AMDGPU::VGPRRegBankID &&
3348 STI.useVGPRIndexMode();
3349
3350 MachineBasicBlock *BB = MI.getParent();
3351 const DebugLoc &DL = MI.getDebugLoc();
3352
3353 if (!IndexMode) {
3354 BuildMI(BB&: *BB, I: &MI, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::COPY), DestReg: AMDGPU::M0)
3355 .addReg(RegNo: IdxReg);
3356
3357 const MCInstrDesc &RegWriteOp = TII.getIndirectRegWriteMovRelPseudo(
3358 VecSize, EltSize: ValSize, IsSGPR: VecRB->getID() == AMDGPU::SGPRRegBankID);
3359 BuildMI(BB&: *BB, I&: MI, MIMD: DL, MCID: RegWriteOp, DestReg: DstReg)
3360 .addReg(RegNo: VecReg)
3361 .addReg(RegNo: ValReg)
3362 .addImm(Val: SubReg);
3363 MI.eraseFromParent();
3364 return true;
3365 }
3366
3367 const MCInstrDesc &GPRIDXDesc =
3368 TII.getIndirectGPRIDXPseudo(VecSize: TRI.getRegSizeInBits(RC: *VecRC), IsIndirectSrc: false);
3369 BuildMI(BB&: *BB, I&: MI, MIMD: DL, MCID: GPRIDXDesc, DestReg: DstReg)
3370 .addReg(RegNo: VecReg)
3371 .addReg(RegNo: ValReg)
3372 .addReg(RegNo: IdxReg)
3373 .addImm(Val: SubReg);
3374
3375 MI.eraseFromParent();
3376 return true;
3377}
3378
3379bool AMDGPUInstructionSelector::selectBufferLoadLds(MachineInstr &MI) const {
3380 if (!Subtarget->hasVMemToLDSLoad())
3381 return false;
3382 unsigned Opc;
3383 unsigned Size = MI.getOperand(i: 3).getImm();
3384
3385 // The struct intrinsic variants add one additional operand over raw.
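  // The operands are consumed below in this order (raw form): rsrc at index 1,
  // the LDS base pointer (copied into M0) at 2, the transfer size at 3, then
  // voffset, soffset, the immediate offset and the aux/cpol bits; the struct
  // variants insert vindex at index 4 and shift the rest by one.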
3386 const bool HasVIndex = MI.getNumOperands() == 9;
3387 Register VIndex;
3388 int OpOffset = 0;
3389 if (HasVIndex) {
3390 VIndex = MI.getOperand(i: 4).getReg();
3391 OpOffset = 1;
3392 }
3393
3394 Register VOffset = MI.getOperand(i: 4 + OpOffset).getReg();
3395 std::optional<ValueAndVReg> MaybeVOffset =
3396 getIConstantVRegValWithLookThrough(VReg: VOffset, MRI: *MRI);
3397 const bool HasVOffset = !MaybeVOffset || MaybeVOffset->Value.getZExtValue();
3398
3399 switch (Size) {
3400 default:
3401 return false;
3402 case 1:
3403 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_UBYTE_LDS_BOTHEN
3404 : AMDGPU::BUFFER_LOAD_UBYTE_LDS_IDXEN
3405 : HasVOffset ? AMDGPU::BUFFER_LOAD_UBYTE_LDS_OFFEN
3406 : AMDGPU::BUFFER_LOAD_UBYTE_LDS_OFFSET;
3407 break;
3408 case 2:
3409 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_USHORT_LDS_BOTHEN
3410 : AMDGPU::BUFFER_LOAD_USHORT_LDS_IDXEN
3411 : HasVOffset ? AMDGPU::BUFFER_LOAD_USHORT_LDS_OFFEN
3412 : AMDGPU::BUFFER_LOAD_USHORT_LDS_OFFSET;
3413 break;
3414 case 4:
3415 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_DWORD_LDS_BOTHEN
3416 : AMDGPU::BUFFER_LOAD_DWORD_LDS_IDXEN
3417 : HasVOffset ? AMDGPU::BUFFER_LOAD_DWORD_LDS_OFFEN
3418 : AMDGPU::BUFFER_LOAD_DWORD_LDS_OFFSET;
3419 break;
3420 case 12:
3421 if (!Subtarget->hasLDSLoadB96_B128())
3422 return false;
3423
3424 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX3_LDS_BOTHEN
3425 : AMDGPU::BUFFER_LOAD_DWORDX3_LDS_IDXEN
3426 : HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX3_LDS_OFFEN
3427 : AMDGPU::BUFFER_LOAD_DWORDX3_LDS_OFFSET;
3428 break;
3429 case 16:
3430 if (!Subtarget->hasLDSLoadB96_B128())
3431 return false;
3432
3433 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX4_LDS_BOTHEN
3434 : AMDGPU::BUFFER_LOAD_DWORDX4_LDS_IDXEN
3435 : HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX4_LDS_OFFEN
3436 : AMDGPU::BUFFER_LOAD_DWORDX4_LDS_OFFSET;
3437 break;
3438 }
3439
3440 MachineBasicBlock *MBB = MI.getParent();
3441 const DebugLoc &DL = MI.getDebugLoc();
3442 BuildMI(BB&: *MBB, I: &MI, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::COPY), DestReg: AMDGPU::M0)
3443 .add(MO: MI.getOperand(i: 2));
3444
3445 auto MIB = BuildMI(BB&: *MBB, I: &MI, MIMD: DL, MCID: TII.get(Opcode: Opc));
3446
3447 if (HasVIndex && HasVOffset) {
3448 Register IdxReg = MRI->createVirtualRegister(RegClass: TRI.getVGPR64Class());
3449 BuildMI(BB&: *MBB, I: &*MIB, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::REG_SEQUENCE), DestReg: IdxReg)
3450 .addReg(RegNo: VIndex)
3451 .addImm(Val: AMDGPU::sub0)
3452 .addReg(RegNo: VOffset)
3453 .addImm(Val: AMDGPU::sub1);
3454
3455 MIB.addReg(RegNo: IdxReg);
3456 } else if (HasVIndex) {
3457 MIB.addReg(RegNo: VIndex);
3458 } else if (HasVOffset) {
3459 MIB.addReg(RegNo: VOffset);
3460 }
3461
3462 MIB.add(MO: MI.getOperand(i: 1)); // rsrc
3463 MIB.add(MO: MI.getOperand(i: 5 + OpOffset)); // soffset
3464 MIB.add(MO: MI.getOperand(i: 6 + OpOffset)); // imm offset
3465 bool IsGFX12Plus = AMDGPU::isGFX12Plus(STI);
3466 unsigned Aux = MI.getOperand(i: 7 + OpOffset).getImm();
3467 MIB.addImm(Val: Aux & (IsGFX12Plus ? AMDGPU::CPol::ALL
3468 : AMDGPU::CPol::ALL_pregfx12)); // cpol
3469 MIB.addImm(
3470 Val: Aux & (IsGFX12Plus ? AMDGPU::CPol::SWZ : AMDGPU::CPol::SWZ_pregfx12)
3471 ? 1
3472 : 0); // swz
3473
3474 MachineMemOperand *LoadMMO = *MI.memoperands_begin();
3475 MachinePointerInfo LoadPtrI = LoadMMO->getPointerInfo();
3476 LoadPtrI.Offset = MI.getOperand(i: 6 + OpOffset).getImm();
3477 MachinePointerInfo StorePtrI = LoadPtrI;
3478 StorePtrI.V = nullptr;
3479 StorePtrI.AddrSpace = AMDGPUAS::LOCAL_ADDRESS;
3480
3481 auto F = LoadMMO->getFlags() &
3482 ~(MachineMemOperand::MOStore | MachineMemOperand::MOLoad);
3483 LoadMMO = MF->getMachineMemOperand(PtrInfo: LoadPtrI, F: F | MachineMemOperand::MOLoad,
3484 Size, BaseAlignment: LoadMMO->getBaseAlign());
3485
3486 MachineMemOperand *StoreMMO =
3487 MF->getMachineMemOperand(PtrInfo: StorePtrI, F: F | MachineMemOperand::MOStore,
3488 Size: sizeof(int32_t), BaseAlignment: LoadMMO->getBaseAlign());
3489
3490 MIB.setMemRefs({LoadMMO, StoreMMO});
3491
3492 MI.eraseFromParent();
3493 return constrainSelectedInstRegOperands(I&: *MIB, TII, TRI, RBI);
3494}
3495
3496 /// Match a zero extend from a 32-bit value to 64 bits.
3497static Register matchZeroExtendFromS32(MachineRegisterInfo &MRI, Register Reg) {
3498 Register ZExtSrc;
3499 if (mi_match(R: Reg, MRI, P: m_GZExt(Src: m_Reg(R&: ZExtSrc))))
3500 return MRI.getType(Reg: ZExtSrc) == LLT::scalar(SizeInBits: 32) ? ZExtSrc : Register();
3501
3502 // Match legalized form %zext = G_MERGE_VALUES (s32 %x), (s32 0)
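  //   e.g. %zext:_(s64) = G_MERGE_VALUES %x:_(s32), %zero:_(s32)
  //        where %zero = G_CONSTANT i32 0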
3503 const MachineInstr *Def = getDefIgnoringCopies(Reg, MRI);
3504 if (Def->getOpcode() != AMDGPU::G_MERGE_VALUES)
3505 return Register();
3506
3507 assert(Def->getNumOperands() == 3 &&
3508 MRI.getType(Def->getOperand(0).getReg()) == LLT::scalar(64));
3509 if (mi_match(R: Def->getOperand(i: 2).getReg(), MRI, P: m_ZeroInt())) {
3510 return Def->getOperand(i: 1).getReg();
3511 }
3512
3513 return Register();
3514}
3515
3516 bool AMDGPUInstructionSelector::selectGlobalLoadLds(MachineInstr &MI) const {
3517 if (!Subtarget->hasVMemToLDSLoad())
3518 return false;
3519
3520 unsigned Opc;
3521 unsigned Size = MI.getOperand(i: 3).getImm();
3522
3523 switch (Size) {
3524 default:
3525 return false;
3526 case 1:
3527 Opc = AMDGPU::GLOBAL_LOAD_LDS_UBYTE;
3528 break;
3529 case 2:
3530 Opc = AMDGPU::GLOBAL_LOAD_LDS_USHORT;
3531 break;
3532 case 4:
3533 Opc = AMDGPU::GLOBAL_LOAD_LDS_DWORD;
3534 break;
3535 case 12:
3536 if (!Subtarget->hasLDSLoadB96_B128())
3537 return false;
3538 Opc = AMDGPU::GLOBAL_LOAD_LDS_DWORDX3;
3539 break;
3540 case 16:
3541 if (!Subtarget->hasLDSLoadB96_B128())
3542 return false;
3543 Opc = AMDGPU::GLOBAL_LOAD_LDS_DWORDX4;
3544 break;
3545 }
3546
3547 MachineBasicBlock *MBB = MI.getParent();
3548 const DebugLoc &DL = MI.getDebugLoc();
3549 BuildMI(BB&: *MBB, I: &MI, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::COPY), DestReg: AMDGPU::M0)
3550 .add(MO: MI.getOperand(i: 2));
3551
3552 Register Addr = MI.getOperand(i: 1).getReg();
3553 Register VOffset;
3554 // Try to split SAddr and VOffset. Global and LDS pointers share the same
3555 // immediate offset, so we cannot use a regular SelectGlobalSAddr().
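  // That is, match (G_PTR_ADD sgpr_base, (zext vgpr_offset)) and, if it is
  // found, select the SADDR form with sgpr_base as saddr and the 32-bit value
  // as voffset.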
3556 if (!isSGPR(Reg: Addr)) {
3557 auto AddrDef = getDefSrcRegIgnoringCopies(Reg: Addr, MRI: *MRI);
3558 if (isSGPR(Reg: AddrDef->Reg)) {
3559 Addr = AddrDef->Reg;
3560 } else if (AddrDef->MI->getOpcode() == AMDGPU::G_PTR_ADD) {
3561 Register SAddr =
3562 getSrcRegIgnoringCopies(Reg: AddrDef->MI->getOperand(i: 1).getReg(), MRI: *MRI);
3563 if (isSGPR(Reg: SAddr)) {
3564 Register PtrBaseOffset = AddrDef->MI->getOperand(i: 2).getReg();
3565 if (Register Off = matchZeroExtendFromS32(MRI&: *MRI, Reg: PtrBaseOffset)) {
3566 Addr = SAddr;
3567 VOffset = Off;
3568 }
3569 }
3570 }
3571 }
3572
3573 if (isSGPR(Reg: Addr)) {
3574 Opc = AMDGPU::getGlobalSaddrOp(Opcode: Opc);
3575 if (!VOffset) {
3576 VOffset = MRI->createVirtualRegister(RegClass: &AMDGPU::VGPR_32RegClass);
3577 BuildMI(BB&: *MBB, I: &MI, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::V_MOV_B32_e32), DestReg: VOffset)
3578 .addImm(Val: 0);
3579 }
3580 }
3581
3582 auto MIB = BuildMI(BB&: *MBB, I: &MI, MIMD: DL, MCID: TII.get(Opcode: Opc))
3583 .addReg(RegNo: Addr);
3584
3585 if (isSGPR(Reg: Addr))
3586 MIB.addReg(RegNo: VOffset);
3587
3588 MIB.add(MO: MI.getOperand(i: 4)) // offset
3589 .add(MO: MI.getOperand(i: 5)); // cpol
3590
3591 MachineMemOperand *LoadMMO = *MI.memoperands_begin();
3592 MachinePointerInfo LoadPtrI = LoadMMO->getPointerInfo();
3593 LoadPtrI.Offset = MI.getOperand(i: 4).getImm();
3594 MachinePointerInfo StorePtrI = LoadPtrI;
3595 LoadPtrI.AddrSpace = AMDGPUAS::GLOBAL_ADDRESS;
3596 StorePtrI.AddrSpace = AMDGPUAS::LOCAL_ADDRESS;
3597 auto F = LoadMMO->getFlags() &
3598 ~(MachineMemOperand::MOStore | MachineMemOperand::MOLoad);
3599 LoadMMO = MF->getMachineMemOperand(PtrInfo: LoadPtrI, F: F | MachineMemOperand::MOLoad,
3600 Size, BaseAlignment: LoadMMO->getBaseAlign());
3601 MachineMemOperand *StoreMMO =
3602 MF->getMachineMemOperand(PtrInfo: StorePtrI, F: F | MachineMemOperand::MOStore,
3603 Size: sizeof(int32_t), BaseAlignment: Align(4));
3604
3605 MIB.setMemRefs({LoadMMO, StoreMMO});
3606
3607 MI.eraseFromParent();
3608 return constrainSelectedInstRegOperands(I&: *MIB, TII, TRI, RBI);
3609}
3610
3611bool AMDGPUInstructionSelector::selectBVHIntersectRayIntrinsic(
3612 MachineInstr &MI) const {
3613 unsigned OpcodeOpIdx =
3614 MI.getOpcode() == AMDGPU::G_AMDGPU_BVH_INTERSECT_RAY ? 1 : 3;
3615 MI.setDesc(TII.get(Opcode: MI.getOperand(i: OpcodeOpIdx).getImm()));
3616 MI.removeOperand(OpNo: OpcodeOpIdx);
3617 MI.addImplicitDefUseOperands(MF&: *MI.getParent()->getParent());
3618 return constrainSelectedInstRegOperands(I&: MI, TII, TRI, RBI);
3619}
3620
3621 // FIXME: This should be removed so that the patterns can select. We just need
3622 // the AGPR/VGPR combination versions.
3623bool AMDGPUInstructionSelector::selectSMFMACIntrin(MachineInstr &MI) const {
3624 unsigned Opc;
3625 switch (cast<GIntrinsic>(Val&: MI).getIntrinsicID()) {
3626 case Intrinsic::amdgcn_smfmac_f32_16x16x32_f16:
3627 Opc = AMDGPU::V_SMFMAC_F32_16X16X32_F16_e64;
3628 break;
3629 case Intrinsic::amdgcn_smfmac_f32_32x32x16_f16:
3630 Opc = AMDGPU::V_SMFMAC_F32_32X32X16_F16_e64;
3631 break;
3632 case Intrinsic::amdgcn_smfmac_f32_16x16x32_bf16:
3633 Opc = AMDGPU::V_SMFMAC_F32_16X16X32_BF16_e64;
3634 break;
3635 case Intrinsic::amdgcn_smfmac_f32_32x32x16_bf16:
3636 Opc = AMDGPU::V_SMFMAC_F32_32X32X16_BF16_e64;
3637 break;
3638 case Intrinsic::amdgcn_smfmac_i32_16x16x64_i8:
3639 Opc = AMDGPU::V_SMFMAC_I32_16X16X64_I8_e64;
3640 break;
3641 case Intrinsic::amdgcn_smfmac_i32_32x32x32_i8:
3642 Opc = AMDGPU::V_SMFMAC_I32_32X32X32_I8_e64;
3643 break;
3644 case Intrinsic::amdgcn_smfmac_f32_16x16x64_bf8_bf8:
3645 Opc = AMDGPU::V_SMFMAC_F32_16X16X64_BF8_BF8_e64;
3646 break;
3647 case Intrinsic::amdgcn_smfmac_f32_16x16x64_bf8_fp8:
3648 Opc = AMDGPU::V_SMFMAC_F32_16X16X64_BF8_FP8_e64;
3649 break;
3650 case Intrinsic::amdgcn_smfmac_f32_16x16x64_fp8_bf8:
3651 Opc = AMDGPU::V_SMFMAC_F32_16X16X64_FP8_BF8_e64;
3652 break;
3653 case Intrinsic::amdgcn_smfmac_f32_16x16x64_fp8_fp8:
3654 Opc = AMDGPU::V_SMFMAC_F32_16X16X64_FP8_FP8_e64;
3655 break;
3656 case Intrinsic::amdgcn_smfmac_f32_32x32x32_bf8_bf8:
3657 Opc = AMDGPU::V_SMFMAC_F32_32X32X32_BF8_BF8_e64;
3658 break;
3659 case Intrinsic::amdgcn_smfmac_f32_32x32x32_bf8_fp8:
3660 Opc = AMDGPU::V_SMFMAC_F32_32X32X32_BF8_FP8_e64;
3661 break;
3662 case Intrinsic::amdgcn_smfmac_f32_32x32x32_fp8_bf8:
3663 Opc = AMDGPU::V_SMFMAC_F32_32X32X32_FP8_BF8_e64;
3664 break;
3665 case Intrinsic::amdgcn_smfmac_f32_32x32x32_fp8_fp8:
3666 Opc = AMDGPU::V_SMFMAC_F32_32X32X32_FP8_FP8_e64;
3667 break;
3668 case Intrinsic::amdgcn_smfmac_f32_16x16x64_f16:
3669 Opc = AMDGPU::V_SMFMAC_F32_16X16X64_F16_e64;
3670 break;
3671 case Intrinsic::amdgcn_smfmac_f32_32x32x32_f16:
3672 Opc = AMDGPU::V_SMFMAC_F32_32X32X32_F16_e64;
3673 break;
3674 case Intrinsic::amdgcn_smfmac_f32_16x16x64_bf16:
3675 Opc = AMDGPU::V_SMFMAC_F32_16X16X64_BF16_e64;
3676 break;
3677 case Intrinsic::amdgcn_smfmac_f32_32x32x32_bf16:
3678 Opc = AMDGPU::V_SMFMAC_F32_32X32X32_BF16_e64;
3679 break;
3680 case Intrinsic::amdgcn_smfmac_i32_16x16x128_i8:
3681 Opc = AMDGPU::V_SMFMAC_I32_16X16X128_I8_e64;
3682 break;
3683 case Intrinsic::amdgcn_smfmac_i32_32x32x64_i8:
3684 Opc = AMDGPU::V_SMFMAC_I32_32X32X64_I8_e64;
3685 break;
3686 case Intrinsic::amdgcn_smfmac_f32_16x16x128_bf8_bf8:
3687 Opc = AMDGPU::V_SMFMAC_F32_16X16X128_BF8_BF8_e64;
3688 break;
3689 case Intrinsic::amdgcn_smfmac_f32_16x16x128_bf8_fp8:
3690 Opc = AMDGPU::V_SMFMAC_F32_16X16X128_BF8_FP8_e64;
3691 break;
3692 case Intrinsic::amdgcn_smfmac_f32_16x16x128_fp8_bf8:
3693 Opc = AMDGPU::V_SMFMAC_F32_16X16X128_FP8_BF8_e64;
3694 break;
3695 case Intrinsic::amdgcn_smfmac_f32_16x16x128_fp8_fp8:
3696 Opc = AMDGPU::V_SMFMAC_F32_16X16X128_FP8_FP8_e64;
3697 break;
3698 case Intrinsic::amdgcn_smfmac_f32_32x32x64_bf8_bf8:
3699 Opc = AMDGPU::V_SMFMAC_F32_32X32X64_BF8_BF8_e64;
3700 break;
3701 case Intrinsic::amdgcn_smfmac_f32_32x32x64_bf8_fp8:
3702 Opc = AMDGPU::V_SMFMAC_F32_32X32X64_BF8_FP8_e64;
3703 break;
3704 case Intrinsic::amdgcn_smfmac_f32_32x32x64_fp8_bf8:
3705 Opc = AMDGPU::V_SMFMAC_F32_32X32X64_FP8_BF8_e64;
3706 break;
3707 case Intrinsic::amdgcn_smfmac_f32_32x32x64_fp8_fp8:
3708 Opc = AMDGPU::V_SMFMAC_F32_32X32X64_FP8_FP8_e64;
3709 break;
3710 default:
3711 llvm_unreachable("unhandled smfmac intrinsic");
3712 }
3713
3714 auto VDst_In = MI.getOperand(i: 4);
3715
3716 MI.setDesc(TII.get(Opcode: Opc));
3717 MI.removeOperand(OpNo: 4); // VDst_In
3718 MI.removeOperand(OpNo: 1); // Intrinsic ID
3719 MI.addOperand(Op: VDst_In); // Readd VDst_In to the end
3720 MI.addImplicitDefUseOperands(MF&: *MI.getParent()->getParent());
3721 return true;
3722}
3723
3724bool AMDGPUInstructionSelector::selectPermlaneSwapIntrin(
3725 MachineInstr &MI, Intrinsic::ID IntrID) const {
3726 if (IntrID == Intrinsic::amdgcn_permlane16_swap &&
3727 !Subtarget->hasPermlane16Swap())
3728 return false;
3729 if (IntrID == Intrinsic::amdgcn_permlane32_swap &&
3730 !Subtarget->hasPermlane32Swap())
3731 return false;
3732
3733 unsigned Opcode = IntrID == Intrinsic::amdgcn_permlane16_swap
3734 ? AMDGPU::V_PERMLANE16_SWAP_B32_e64
3735 : AMDGPU::V_PERMLANE32_SWAP_B32_e64;
3736
3737 MI.removeOperand(OpNo: 2);
3738 MI.setDesc(TII.get(Opcode));
3739 MI.addOperand(MF&: *MF, Op: MachineOperand::CreateReg(Reg: AMDGPU::EXEC, isDef: false, isImp: true));
3740
3741 MachineOperand &FI = MI.getOperand(i: 4);
3742 FI.setImm(FI.getImm() ? AMDGPU::DPP::DPP_FI_1 : AMDGPU::DPP::DPP_FI_0);
3743
3744 return constrainSelectedInstRegOperands(I&: MI, TII, TRI, RBI);
3745}
3746
3747bool AMDGPUInstructionSelector::selectWaveAddress(MachineInstr &MI) const {
3748 Register DstReg = MI.getOperand(i: 0).getReg();
3749 Register SrcReg = MI.getOperand(i: 1).getReg();
3750 const RegisterBank *DstRB = RBI.getRegBank(Reg: DstReg, MRI: *MRI, TRI);
3751 const bool IsVALU = DstRB->getID() == AMDGPU::VGPRRegBankID;
3752 MachineBasicBlock *MBB = MI.getParent();
3753 const DebugLoc &DL = MI.getDebugLoc();
3754
3755 if (IsVALU) {
3756 BuildMI(BB&: *MBB, I&: MI, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::V_LSHRREV_B32_e64), DestReg: DstReg)
3757 .addImm(Val: Subtarget->getWavefrontSizeLog2())
3758 .addReg(RegNo: SrcReg);
3759 } else {
3760 BuildMI(BB&: *MBB, I&: MI, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::S_LSHR_B32), DestReg: DstReg)
3761 .addReg(RegNo: SrcReg)
3762 .addImm(Val: Subtarget->getWavefrontSizeLog2())
3763 .setOperandDead(3); // Dead scc
3764 }
3765
3766 const TargetRegisterClass &RC =
3767 IsVALU ? AMDGPU::VGPR_32RegClass : AMDGPU::SReg_32RegClass;
3768 if (!RBI.constrainGenericRegister(Reg: DstReg, RC, MRI&: *MRI))
3769 return false;
3770
3771 MI.eraseFromParent();
3772 return true;
3773}
3774
3775 // Match a BITOP3 operation and return the number of matched instructions plus
3776 // the truth table.
3777static std::pair<unsigned, uint8_t> BitOp3_Op(Register R,
3778 SmallVectorImpl<Register> &Src,
3779 const MachineRegisterInfo &MRI) {
3780 unsigned NumOpcodes = 0;
3781 uint8_t LHSBits, RHSBits;
3782
3783 auto getOperandBits = [&Src, R, &MRI](Register Op, uint8_t &Bits) -> bool {
3784 // Define truth table given Src0, Src1, Src2 bits permutations:
3785 // 0 0 0
3786 // 0 0 1
3787 // 0 1 0
3788 // 0 1 1
3789 // 1 0 0
3790 // 1 0 1
3791 // 1 1 0
3792 // 1 1 1
3793 const uint8_t SrcBits[3] = { 0xf0, 0xcc, 0xaa };
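    // For example, if the final operands end up as Src = {a, b, c}, then the
    // expression (a & b) | c corresponds to the table (0xf0 & 0xcc) | 0xaa =
    // 0xea.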
3794
3795 if (mi_match(R: Op, MRI, P: m_AllOnesInt())) {
3796 Bits = 0xff;
3797 return true;
3798 }
3799 if (mi_match(R: Op, MRI, P: m_ZeroInt())) {
3800 Bits = 0;
3801 return true;
3802 }
3803
3804 for (unsigned I = 0; I < Src.size(); ++I) {
3805 // Try to find existing reused operand
3806 if (Src[I] == Op) {
3807 Bits = SrcBits[I];
3808 return true;
3809 }
3810 // Try to replace parent operator
3811 if (Src[I] == R) {
3812 Bits = SrcBits[I];
3813 Src[I] = Op;
3814 return true;
3815 }
3816 }
3817
3818 if (Src.size() == 3) {
3819       // No room left for operands. Try one last time; there can be a 'not' of
3820       // one of our source operands. In this case we can compute the bits
3821       // without growing the Src vector.
3822 Register LHS;
3823 if (mi_match(R: Op, MRI, P: m_Not(Src: m_Reg(R&: LHS)))) {
3824 LHS = getSrcRegIgnoringCopies(Reg: LHS, MRI);
3825 for (unsigned I = 0; I < Src.size(); ++I) {
3826 if (Src[I] == LHS) {
3827 Bits = ~SrcBits[I];
3828 return true;
3829 }
3830 }
3831 }
3832
3833 return false;
3834 }
3835
3836 Bits = SrcBits[Src.size()];
3837 Src.push_back(Elt: Op);
3838 return true;
3839 };
3840
3841 MachineInstr *MI = MRI.getVRegDef(Reg: R);
3842 switch (MI->getOpcode()) {
3843 case TargetOpcode::G_AND:
3844 case TargetOpcode::G_OR:
3845 case TargetOpcode::G_XOR: {
3846 Register LHS = getSrcRegIgnoringCopies(Reg: MI->getOperand(i: 1).getReg(), MRI);
3847 Register RHS = getSrcRegIgnoringCopies(Reg: MI->getOperand(i: 2).getReg(), MRI);
3848
3849 SmallVector<Register, 3> Backup(Src.begin(), Src.end());
3850 if (!getOperandBits(LHS, LHSBits) ||
3851 !getOperandBits(RHS, RHSBits)) {
3852 Src = Backup;
3853 return std::make_pair(x: 0, y: 0);
3854 }
3855
3856 // Recursion is naturally limited by the size of the operand vector.
3857 auto Op = BitOp3_Op(R: LHS, Src, MRI);
3858 if (Op.first) {
3859 NumOpcodes += Op.first;
3860 LHSBits = Op.second;
3861 }
3862
3863 Op = BitOp3_Op(R: RHS, Src, MRI);
3864 if (Op.first) {
3865 NumOpcodes += Op.first;
3866 RHSBits = Op.second;
3867 }
3868 break;
3869 }
3870 default:
3871 return std::make_pair(x: 0, y: 0);
3872 }
3873
3874 uint8_t TTbl;
3875 switch (MI->getOpcode()) {
3876 case TargetOpcode::G_AND:
3877 TTbl = LHSBits & RHSBits;
3878 break;
3879 case TargetOpcode::G_OR:
3880 TTbl = LHSBits | RHSBits;
3881 break;
3882 case TargetOpcode::G_XOR:
3883 TTbl = LHSBits ^ RHSBits;
3884 break;
3885 default:
3886 break;
3887 }
3888
3889 return std::make_pair(x: NumOpcodes + 1, y&: TTbl);
3890}
3891
3892bool AMDGPUInstructionSelector::selectBITOP3(MachineInstr &MI) const {
3893 if (!Subtarget->hasBitOp3Insts())
3894 return false;
3895
3896 Register DstReg = MI.getOperand(i: 0).getReg();
3897 const RegisterBank *DstRB = RBI.getRegBank(Reg: DstReg, MRI: *MRI, TRI);
3898 const bool IsVALU = DstRB->getID() == AMDGPU::VGPRRegBankID;
3899 if (!IsVALU)
3900 return false;
3901
3902 SmallVector<Register, 3> Src;
3903 uint8_t TTbl;
3904 unsigned NumOpcodes;
3905
3906 std::tie(args&: NumOpcodes, args&: TTbl) = BitOp3_Op(R: DstReg, Src, MRI: *MRI);
3907
3908   // The Src.empty() case can happen if the operands are all zeros or all ones.
3909   // Normally that should have been optimized out before reaching this point.
3910 if (NumOpcodes < 2 || Src.empty())
3911 return false;
3912
3913 const bool IsB32 = MRI->getType(Reg: DstReg) == LLT::scalar(SizeInBits: 32);
3914 if (NumOpcodes == 2 && IsB32) {
3915     // Avoid using BITOP3 for OR3, XOR3, AND_OR. This is not faster but makes
3916     // the asm more readable. This cannot be modeled with AddedComplexity
3917     // because the selector does not know how many operations we matched.
3918 if (mi_match(MI, MRI: *MRI, P: m_GXor(L: m_GXor(L: m_Reg(), R: m_Reg()), R: m_Reg())) ||
3919 mi_match(MI, MRI: *MRI, P: m_GOr(L: m_GOr(L: m_Reg(), R: m_Reg()), R: m_Reg())) ||
3920 mi_match(MI, MRI: *MRI, P: m_GOr(L: m_GAnd(L: m_Reg(), R: m_Reg()), R: m_Reg())))
3921 return false;
3922 } else if (NumOpcodes < 4) {
3923     // For the uniform case the threshold should be higher to account for moves
3924     // between VGPRs and SGPRs. It needs one operand in a VGPR; the other two
3925     // can be in SGPRs, with a readfirstlane afterwards.
3926 return false;
3927 }
3928
3929 unsigned Opc = IsB32 ? AMDGPU::V_BITOP3_B32_e64 : AMDGPU::V_BITOP3_B16_e64;
3930 unsigned CBL = STI.getConstantBusLimit(Opcode: Opc);
3931 MachineBasicBlock *MBB = MI.getParent();
3932 const DebugLoc &DL = MI.getDebugLoc();
3933
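  // BITOP3 is a VALU instruction, so only a constant-bus-limit worth of its
  // sources may stay in SGPRs; copy any SGPR sources beyond that limit into
  // VGPRs first.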
3934 for (unsigned I = 0; I < Src.size(); ++I) {
3935 const RegisterBank *RB = RBI.getRegBank(Reg: Src[I], MRI: *MRI, TRI);
3936 if (RB->getID() != AMDGPU::SGPRRegBankID)
3937 continue;
3938 if (CBL > 0) {
3939 --CBL;
3940 continue;
3941 }
3942 Register NewReg = MRI->createVirtualRegister(RegClass: &AMDGPU::VGPR_32RegClass);
3943 BuildMI(BB&: *MBB, I&: MI, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::COPY), DestReg: NewReg)
3944 .addReg(RegNo: Src[I]);
3945 Src[I] = NewReg;
3946 }
3947
3948   // The last operand can be ignored, turning a ternary operation into a binary
3949   // one. For example: (~a & b & c) | (~a & b & ~c) -> (~a & b). We can replace
3950   // 'c' with 'a' here without changing the answer. In some pathological cases
3951   // it should even be possible to get an operation with a single operand, if
3952   // the optimizer did not catch it.
3953 while (Src.size() < 3)
3954 Src.push_back(Elt: Src[0]);
3955
3956 auto MIB = BuildMI(BB&: *MBB, I&: MI, MIMD: DL, MCID: TII.get(Opcode: Opc), DestReg: DstReg);
3957 if (!IsB32)
3958 MIB.addImm(Val: 0); // src_mod0
3959 MIB.addReg(RegNo: Src[0]);
3960 if (!IsB32)
3961 MIB.addImm(Val: 0); // src_mod1
3962 MIB.addReg(RegNo: Src[1]);
3963 if (!IsB32)
3964 MIB.addImm(Val: 0); // src_mod2
3965 MIB.addReg(RegNo: Src[2])
3966 .addImm(Val: TTbl);
3967 if (!IsB32)
3968 MIB.addImm(Val: 0); // op_sel
3969
3970 constrainSelectedInstRegOperands(I&: *MIB, TII, TRI, RBI);
3971 MI.eraseFromParent();
3972
3973 return true;
3974}
3975
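// Select G_STACKRESTORE: recover the wave-level SGPR address (either the
// source of a G_AMDGPU_WAVE_ADDRESS, or the incoming value shifted right by
// the wavefront size log2) and copy it into the stack pointer register.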
3976bool AMDGPUInstructionSelector::selectStackRestore(MachineInstr &MI) const {
3977 Register SrcReg = MI.getOperand(i: 0).getReg();
3978 if (!RBI.constrainGenericRegister(Reg: SrcReg, RC: AMDGPU::SReg_32RegClass, MRI&: *MRI))
3979 return false;
3980
3981 MachineInstr *DefMI = MRI->getVRegDef(Reg: SrcReg);
3982 Register SP =
3983 Subtarget->getTargetLowering()->getStackPointerRegisterToSaveRestore();
3984 Register WaveAddr = getWaveAddress(Def: DefMI);
3985 MachineBasicBlock *MBB = MI.getParent();
3986 const DebugLoc &DL = MI.getDebugLoc();
3987
3988 if (!WaveAddr) {
3989 WaveAddr = MRI->createVirtualRegister(RegClass: &AMDGPU::SReg_32RegClass);
3990 BuildMI(BB&: *MBB, I&: MI, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::S_LSHR_B32), DestReg: WaveAddr)
3991 .addReg(RegNo: SrcReg)
3992 .addImm(Val: Subtarget->getWavefrontSizeLog2())
3993 .setOperandDead(3); // Dead scc
3994 }
3995
3996 BuildMI(BB&: *MBB, I: &MI, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::COPY), DestReg: SP)
3997 .addReg(RegNo: WaveAddr);
3998
3999 MI.eraseFromParent();
4000 return true;
4001}
4002
4003bool AMDGPUInstructionSelector::select(MachineInstr &I) {
4004
4005 if (!I.isPreISelOpcode()) {
4006 if (I.isCopy())
4007 return selectCOPY(I);
4008 return true;
4009 }
4010
4011 switch (I.getOpcode()) {
4012 case TargetOpcode::G_AND:
4013 case TargetOpcode::G_OR:
4014 case TargetOpcode::G_XOR:
4015 if (selectBITOP3(MI&: I))
4016 return true;
4017 if (selectImpl(I, CoverageInfo&: *CoverageInfo))
4018 return true;
4019 return selectG_AND_OR_XOR(I);
4020 case TargetOpcode::G_ADD:
4021 case TargetOpcode::G_SUB:
4022 case TargetOpcode::G_PTR_ADD:
4023 if (selectImpl(I, CoverageInfo&: *CoverageInfo))
4024 return true;
4025 return selectG_ADD_SUB(I);
4026 case TargetOpcode::G_UADDO:
4027 case TargetOpcode::G_USUBO:
4028 case TargetOpcode::G_UADDE:
4029 case TargetOpcode::G_USUBE:
4030 return selectG_UADDO_USUBO_UADDE_USUBE(I);
4031 case AMDGPU::G_AMDGPU_MAD_U64_U32:
4032 case AMDGPU::G_AMDGPU_MAD_I64_I32:
4033 return selectG_AMDGPU_MAD_64_32(I);
4034 case TargetOpcode::G_INTTOPTR:
4035 case TargetOpcode::G_BITCAST:
4036 case TargetOpcode::G_PTRTOINT:
4037 case TargetOpcode::G_FREEZE:
4038 return selectCOPY(I);
4039 case TargetOpcode::G_FNEG:
4040 if (selectImpl(I, CoverageInfo&: *CoverageInfo))
4041 return true;
4042 return selectG_FNEG(MI&: I);
4043 case TargetOpcode::G_FABS:
4044 if (selectImpl(I, CoverageInfo&: *CoverageInfo))
4045 return true;
4046 return selectG_FABS(MI&: I);
4047 case TargetOpcode::G_EXTRACT:
4048 return selectG_EXTRACT(I);
4049 case TargetOpcode::G_MERGE_VALUES:
4050 case TargetOpcode::G_CONCAT_VECTORS:
4051 return selectG_MERGE_VALUES(MI&: I);
4052 case TargetOpcode::G_UNMERGE_VALUES:
4053 return selectG_UNMERGE_VALUES(MI&: I);
4054 case TargetOpcode::G_BUILD_VECTOR:
4055 case TargetOpcode::G_BUILD_VECTOR_TRUNC:
4056 return selectG_BUILD_VECTOR(MI&: I);
4057 case TargetOpcode::G_IMPLICIT_DEF:
4058 return selectG_IMPLICIT_DEF(I);
4059 case TargetOpcode::G_INSERT:
4060 return selectG_INSERT(I);
4061 case TargetOpcode::G_INTRINSIC:
4062 case TargetOpcode::G_INTRINSIC_CONVERGENT:
4063 return selectG_INTRINSIC(I);
4064 case TargetOpcode::G_INTRINSIC_W_SIDE_EFFECTS:
4065 case TargetOpcode::G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS:
4066 return selectG_INTRINSIC_W_SIDE_EFFECTS(I);
4067 case TargetOpcode::G_ICMP:
4068 case TargetOpcode::G_FCMP:
4069 if (selectG_ICMP_or_FCMP(I))
4070 return true;
4071 return selectImpl(I, CoverageInfo&: *CoverageInfo);
4072 case TargetOpcode::G_LOAD:
4073 case TargetOpcode::G_ZEXTLOAD:
4074 case TargetOpcode::G_SEXTLOAD:
4075 case TargetOpcode::G_STORE:
4076 case TargetOpcode::G_ATOMIC_CMPXCHG:
4077 case TargetOpcode::G_ATOMICRMW_XCHG:
4078 case TargetOpcode::G_ATOMICRMW_ADD:
4079 case TargetOpcode::G_ATOMICRMW_SUB:
4080 case TargetOpcode::G_ATOMICRMW_AND:
4081 case TargetOpcode::G_ATOMICRMW_OR:
4082 case TargetOpcode::G_ATOMICRMW_XOR:
4083 case TargetOpcode::G_ATOMICRMW_MIN:
4084 case TargetOpcode::G_ATOMICRMW_MAX:
4085 case TargetOpcode::G_ATOMICRMW_UMIN:
4086 case TargetOpcode::G_ATOMICRMW_UMAX:
4087 case TargetOpcode::G_ATOMICRMW_UINC_WRAP:
4088 case TargetOpcode::G_ATOMICRMW_UDEC_WRAP:
4089 case TargetOpcode::G_ATOMICRMW_FADD:
4090 case TargetOpcode::G_ATOMICRMW_FMIN:
4091 case TargetOpcode::G_ATOMICRMW_FMAX:
4092 return selectG_LOAD_STORE_ATOMICRMW(I);
4093 case TargetOpcode::G_SELECT:
4094 return selectG_SELECT(I);
4095 case TargetOpcode::G_TRUNC:
4096 return selectG_TRUNC(I);
4097 case TargetOpcode::G_SEXT:
4098 case TargetOpcode::G_ZEXT:
4099 case TargetOpcode::G_ANYEXT:
4100 case TargetOpcode::G_SEXT_INREG:
4101     // This is a workaround. For extension from type i1, `selectImpl()` uses
4102     // patterns from the TD file and generates an illegal VGPR-to-SGPR COPY,
4103     // since type i1 can only be held in an SGPR class.
4104 if (MRI->getType(Reg: I.getOperand(i: 1).getReg()) != LLT::scalar(SizeInBits: 1) &&
4105 selectImpl(I, CoverageInfo&: *CoverageInfo))
4106 return true;
4107 return selectG_SZA_EXT(I);
4108 case TargetOpcode::G_FPEXT:
4109 if (selectG_FPEXT(I))
4110 return true;
4111 return selectImpl(I, CoverageInfo&: *CoverageInfo);
4112 case TargetOpcode::G_BRCOND:
4113 return selectG_BRCOND(I);
4114 case TargetOpcode::G_GLOBAL_VALUE:
4115 return selectG_GLOBAL_VALUE(I);
4116 case TargetOpcode::G_PTRMASK:
4117 return selectG_PTRMASK(I);
4118 case TargetOpcode::G_EXTRACT_VECTOR_ELT:
4119 return selectG_EXTRACT_VECTOR_ELT(MI&: I);
4120 case TargetOpcode::G_INSERT_VECTOR_ELT:
4121 return selectG_INSERT_VECTOR_ELT(MI&: I);
4122 case AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD:
4123 case AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD_D16:
4124 case AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD_NORET:
4125 case AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE:
4126 case AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE_D16: {
4127 const AMDGPU::ImageDimIntrinsicInfo *Intr =
4128 AMDGPU::getImageDimIntrinsicInfo(Intr: AMDGPU::getIntrinsicID(I));
4129 assert(Intr && "not an image intrinsic with image pseudo");
4130 return selectImageIntrinsic(MI&: I, Intr);
4131 }
4132 case AMDGPU::G_AMDGPU_BVH_DUAL_INTERSECT_RAY:
4133 case AMDGPU::G_AMDGPU_BVH_INTERSECT_RAY:
4134 case AMDGPU::G_AMDGPU_BVH8_INTERSECT_RAY:
4135 return selectBVHIntersectRayIntrinsic(MI&: I);
4136 case AMDGPU::G_SBFX:
4137 case AMDGPU::G_UBFX:
4138 return selectG_SBFX_UBFX(MI&: I);
4139 case AMDGPU::G_SI_CALL:
4140 I.setDesc(TII.get(Opcode: AMDGPU::SI_CALL));
4141 return true;
4142 case AMDGPU::G_AMDGPU_WAVE_ADDRESS:
4143 return selectWaveAddress(MI&: I);
4144 case AMDGPU::G_STACKRESTORE:
4145 return selectStackRestore(MI&: I);
4146 case AMDGPU::G_PHI:
4147 return selectPHI(I);
4148 case AMDGPU::G_AMDGPU_COPY_SCC_VCC:
4149 return selectCOPY_SCC_VCC(I);
4150 case AMDGPU::G_AMDGPU_COPY_VCC_SCC:
4151 return selectCOPY_VCC_SCC(I);
4152 case AMDGPU::G_AMDGPU_READANYLANE:
4153 return selectReadAnyLane(I);
4154 case TargetOpcode::G_CONSTANT:
4155 case TargetOpcode::G_FCONSTANT:
4156 default:
4157 return selectImpl(I, CoverageInfo&: *CoverageInfo);
4158 }
4159 return false;
4160}
4161
4162InstructionSelector::ComplexRendererFns
4163AMDGPUInstructionSelector::selectVCSRC(MachineOperand &Root) const {
4164 return {{
4165 [=](MachineInstrBuilder &MIB) { MIB.add(MO: Root); }
4166 }};
4167
4168}
4169
4170std::pair<Register, unsigned> AMDGPUInstructionSelector::selectVOP3ModsImpl(
4171 Register Src, bool IsCanonicalizing, bool AllowAbs, bool OpSel) const {
4172 unsigned Mods = 0;
4173 MachineInstr *MI = getDefIgnoringCopies(Reg: Src, MRI: *MRI);
4174
4175 if (MI->getOpcode() == AMDGPU::G_FNEG) {
4176 Src = MI->getOperand(i: 1).getReg();
4177 Mods |= SISrcMods::NEG;
4178 MI = getDefIgnoringCopies(Reg: Src, MRI: *MRI);
4179 } else if (MI->getOpcode() == AMDGPU::G_FSUB && IsCanonicalizing) {
4180 // Fold fsub [+-]0 into fneg. This may not have folded depending on the
4181 // denormal mode, but we're implicitly canonicalizing in a source operand.
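    // e.g. (G_FSUB -0.0, x) is selected as x with the NEG source modifier set.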
4182 const ConstantFP *LHS =
4183 getConstantFPVRegVal(VReg: MI->getOperand(i: 1).getReg(), MRI: *MRI);
4184 if (LHS && LHS->isZero()) {
4185 Mods |= SISrcMods::NEG;
4186 Src = MI->getOperand(i: 2).getReg();
4187 }
4188 }
4189
4190 if (AllowAbs && MI->getOpcode() == AMDGPU::G_FABS) {
4191 Src = MI->getOperand(i: 1).getReg();
4192 Mods |= SISrcMods::ABS;
4193 }
4194
4195 if (OpSel)
4196 Mods |= SISrcMods::OP_SEL_0;
4197
4198 return std::pair(Src, Mods);
4199}
4200
4201Register AMDGPUInstructionSelector::copyToVGPRIfSrcFolded(
4202 Register Src, unsigned Mods, MachineOperand Root, MachineInstr *InsertPt,
4203 bool ForceVGPR) const {
4204 if ((Mods != 0 || ForceVGPR) &&
4205 RBI.getRegBank(Reg: Src, MRI: *MRI, TRI)->getID() != AMDGPU::VGPRRegBankID) {
4206
4207 // If we looked through copies to find source modifiers on an SGPR operand,
4208 // we now have an SGPR register source. To avoid potentially violating the
4209 // constant bus restriction, we need to insert a copy to a VGPR.
4210 Register VGPRSrc = MRI->cloneVirtualRegister(VReg: Root.getReg());
4211 BuildMI(BB&: *InsertPt->getParent(), I: InsertPt, MIMD: InsertPt->getDebugLoc(),
4212 MCID: TII.get(Opcode: AMDGPU::COPY), DestReg: VGPRSrc)
4213 .addReg(RegNo: Src);
4214 Src = VGPRSrc;
4215 }
4216
4217 return Src;
4218}
4219
4220///
4221/// This will select either an SGPR or VGPR operand and will save us from
4222/// having to write an extra tablegen pattern.
4223InstructionSelector::ComplexRendererFns
4224AMDGPUInstructionSelector::selectVSRC0(MachineOperand &Root) const {
4225 return {{
4226 [=](MachineInstrBuilder &MIB) { MIB.add(MO: Root); }
4227 }};
4228}
4229
4230InstructionSelector::ComplexRendererFns
4231AMDGPUInstructionSelector::selectVOP3Mods0(MachineOperand &Root) const {
4232 Register Src;
4233 unsigned Mods;
4234 std::tie(args&: Src, args&: Mods) = selectVOP3ModsImpl(Src: Root.getReg());
4235
4236 return {{
4237 [=](MachineInstrBuilder &MIB) {
4238 MIB.addReg(RegNo: copyToVGPRIfSrcFolded(Src, Mods, Root, InsertPt: MIB));
4239 },
4240 [=](MachineInstrBuilder &MIB) { MIB.addImm(Val: Mods); }, // src0_mods
4241 [=](MachineInstrBuilder &MIB) { MIB.addImm(Val: 0); }, // clamp
4242 [=](MachineInstrBuilder &MIB) { MIB.addImm(Val: 0); } // omod
4243 }};
4244}
4245
4246InstructionSelector::ComplexRendererFns
4247AMDGPUInstructionSelector::selectVOP3BMods0(MachineOperand &Root) const {
4248 Register Src;
4249 unsigned Mods;
4250 std::tie(args&: Src, args&: Mods) = selectVOP3ModsImpl(Src: Root.getReg(),
4251 /*IsCanonicalizing=*/true,
4252 /*AllowAbs=*/false);
4253
4254 return {{
4255 [=](MachineInstrBuilder &MIB) {
4256 MIB.addReg(RegNo: copyToVGPRIfSrcFolded(Src, Mods, Root, InsertPt: MIB));
4257 },
4258 [=](MachineInstrBuilder &MIB) { MIB.addImm(Val: Mods); }, // src0_mods
4259 [=](MachineInstrBuilder &MIB) { MIB.addImm(Val: 0); }, // clamp
4260 [=](MachineInstrBuilder &MIB) { MIB.addImm(Val: 0); } // omod
4261 }};
4262}
4263
4264InstructionSelector::ComplexRendererFns
4265AMDGPUInstructionSelector::selectVOP3OMods(MachineOperand &Root) const {
4266 return {{
4267 [=](MachineInstrBuilder &MIB) { MIB.add(MO: Root); },
4268 [=](MachineInstrBuilder &MIB) { MIB.addImm(Val: 0); }, // clamp
4269 [=](MachineInstrBuilder &MIB) { MIB.addImm(Val: 0); } // omod
4270 }};
4271}
4272
4273InstructionSelector::ComplexRendererFns
4274AMDGPUInstructionSelector::selectVOP3Mods(MachineOperand &Root) const {
4275 Register Src;
4276 unsigned Mods;
4277 std::tie(args&: Src, args&: Mods) = selectVOP3ModsImpl(Src: Root.getReg());
4278
4279 return {{
4280 [=](MachineInstrBuilder &MIB) {
4281 MIB.addReg(RegNo: copyToVGPRIfSrcFolded(Src, Mods, Root, InsertPt: MIB));
4282 },
4283 [=](MachineInstrBuilder &MIB) { MIB.addImm(Val: Mods); } // src_mods
4284 }};
4285}
4286
4287InstructionSelector::ComplexRendererFns
4288AMDGPUInstructionSelector::selectVOP3ModsNonCanonicalizing(
4289 MachineOperand &Root) const {
4290 Register Src;
4291 unsigned Mods;
4292 std::tie(args&: Src, args&: Mods) =
4293 selectVOP3ModsImpl(Src: Root.getReg(), /*IsCanonicalizing=*/false);
4294
4295 return {{
4296 [=](MachineInstrBuilder &MIB) {
4297 MIB.addReg(RegNo: copyToVGPRIfSrcFolded(Src, Mods, Root, InsertPt: MIB));
4298 },
4299 [=](MachineInstrBuilder &MIB) { MIB.addImm(Val: Mods); } // src_mods
4300 }};
4301}
4302
4303InstructionSelector::ComplexRendererFns
4304AMDGPUInstructionSelector::selectVOP3BMods(MachineOperand &Root) const {
4305 Register Src;
4306 unsigned Mods;
4307 std::tie(args&: Src, args&: Mods) =
4308 selectVOP3ModsImpl(Src: Root.getReg(), /*IsCanonicalizing=*/true,
4309 /*AllowAbs=*/false);
4310
4311 return {{
4312 [=](MachineInstrBuilder &MIB) {
4313 MIB.addReg(RegNo: copyToVGPRIfSrcFolded(Src, Mods, Root, InsertPt: MIB));
4314 },
4315 [=](MachineInstrBuilder &MIB) { MIB.addImm(Val: Mods); } // src_mods
4316 }};
4317}
4318
4319InstructionSelector::ComplexRendererFns
4320AMDGPUInstructionSelector::selectVOP3NoMods(MachineOperand &Root) const {
4321 Register Reg = Root.getReg();
4322 const MachineInstr *Def = getDefIgnoringCopies(Reg, MRI: *MRI);
4323 if (Def->getOpcode() == AMDGPU::G_FNEG || Def->getOpcode() == AMDGPU::G_FABS)
4324 return {};
4325 return {{
4326 [=](MachineInstrBuilder &MIB) { MIB.addReg(RegNo: Reg); },
4327 }};
4328}
4329
4330std::pair<Register, unsigned>
4331AMDGPUInstructionSelector::selectVOP3PModsImpl(
4332 Register Src, const MachineRegisterInfo &MRI, bool IsDOT) const {
4333 unsigned Mods = 0;
4334 MachineInstr *MI = MRI.getVRegDef(Reg: Src);
4335
4336 if (MI->getOpcode() == AMDGPU::G_FNEG &&
4337 // It's possible to see an f32 fneg here, but unlikely.
4338       // TODO: Treat f32 fneg as modifying only the high bit.
4339 MRI.getType(Reg: Src) == LLT::fixed_vector(NumElements: 2, ScalarSizeInBits: 16)) {
4340 Mods ^= (SISrcMods::NEG | SISrcMods::NEG_HI);
4341 Src = MI->getOperand(i: 1).getReg();
4342 MI = MRI.getVRegDef(Reg: Src);
4343 }
4344
4345 // TODO: Handle G_FSUB 0 as fneg
4346
4347 // TODO: Match op_sel through g_build_vector_trunc and g_shuffle_vector.
4348 (void)IsDOT; // DOTs do not use OPSEL on gfx942+, check ST.hasDOTOpSelHazard()
4349
4350 // Packed instructions do not have abs modifiers.
4351 Mods |= SISrcMods::OP_SEL_1;
4352
4353 return std::pair(Src, Mods);
4354}
4355
4356InstructionSelector::ComplexRendererFns
4357AMDGPUInstructionSelector::selectVOP3PMods(MachineOperand &Root) const {
4358 MachineRegisterInfo &MRI
4359 = Root.getParent()->getParent()->getParent()->getRegInfo();
4360
4361 Register Src;
4362 unsigned Mods;
4363 std::tie(args&: Src, args&: Mods) = selectVOP3PModsImpl(Src: Root.getReg(), MRI);
4364
4365 return {{
4366 [=](MachineInstrBuilder &MIB) { MIB.addReg(RegNo: Src); },
4367 [=](MachineInstrBuilder &MIB) { MIB.addImm(Val: Mods); } // src_mods
4368 }};
4369}
4370
4371InstructionSelector::ComplexRendererFns
4372AMDGPUInstructionSelector::selectVOP3PModsDOT(MachineOperand &Root) const {
4373 MachineRegisterInfo &MRI
4374 = Root.getParent()->getParent()->getParent()->getRegInfo();
4375
4376 Register Src;
4377 unsigned Mods;
4378 std::tie(args&: Src, args&: Mods) = selectVOP3PModsImpl(Src: Root.getReg(), MRI, IsDOT: true);
4379
4380 return {{
4381 [=](MachineInstrBuilder &MIB) { MIB.addReg(RegNo: Src); },
4382 [=](MachineInstrBuilder &MIB) { MIB.addImm(Val: Mods); } // src_mods
4383 }};
4384}
4385
4386InstructionSelector::ComplexRendererFns
4387AMDGPUInstructionSelector::selectVOP3PModsNeg(MachineOperand &Root) const {
4388  // A literal i1 value set in the intrinsic represents SrcMods for the next operand.
4389  // The value is in the Imm operand as an i1 sign-extended to int64_t.
4390  // 1 (i.e. -1) promotes packed values to signed, 0 treats them as unsigned.
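  // For example, an immediate of -1 selects OP_SEL_1 | NEG below, while an
  // immediate of 0 selects just OP_SEL_1.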
4391 assert((Root.isImm() && (Root.getImm() == -1 || Root.getImm() == 0)) &&
4392 "expected i1 value");
4393 unsigned Mods = SISrcMods::OP_SEL_1;
4394 if (Root.getImm() == -1)
4395 Mods ^= SISrcMods::NEG;
4396 return {{
4397 [=](MachineInstrBuilder &MIB) { MIB.addImm(Val: Mods); } // src_mods
4398 }};
4399}
4400
4401InstructionSelector::ComplexRendererFns
4402AMDGPUInstructionSelector::selectWMMAOpSelVOP3PMods(
4403 MachineOperand &Root) const {
4404 assert((Root.isImm() && (Root.getImm() == -1 || Root.getImm() == 0)) &&
4405 "expected i1 value");
4406 unsigned Mods = SISrcMods::OP_SEL_1;
4407 if (Root.getImm() != 0)
4408 Mods |= SISrcMods::OP_SEL_0;
4409
4410 return {{
4411 [=](MachineInstrBuilder &MIB) { MIB.addImm(Val: Mods); } // src_mods
4412 }};
4413}
4414
4415static Register buildRegSequence(SmallVectorImpl<Register> &Elts,
4416 MachineInstr *InsertPt,
4417 MachineRegisterInfo &MRI) {
4418 const TargetRegisterClass *DstRegClass;
4419 switch (Elts.size()) {
4420 case 8:
4421 DstRegClass = &AMDGPU::VReg_256RegClass;
4422 break;
4423 case 4:
4424 DstRegClass = &AMDGPU::VReg_128RegClass;
4425 break;
4426 case 2:
4427 DstRegClass = &AMDGPU::VReg_64RegClass;
4428 break;
4429 default:
4430 llvm_unreachable("unhandled Reg sequence size");
4431 }
4432
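  // The emitted sequence looks roughly like (two-element case):
  //   %dst:vreg_64 = REG_SEQUENCE %elts[0], %subreg.sub0, %elts[1], %subreg.sub1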
4433 MachineIRBuilder B(*InsertPt);
4434 auto MIB = B.buildInstr(Opcode: AMDGPU::REG_SEQUENCE)
4435 .addDef(RegNo: MRI.createVirtualRegister(RegClass: DstRegClass));
4436 for (unsigned i = 0; i < Elts.size(); ++i) {
4437 MIB.addReg(RegNo: Elts[i]);
4438 MIB.addImm(Val: SIRegisterInfo::getSubRegFromChannel(Channel: i));
4439 }
4440 return MIB->getOperand(i: 0).getReg();
4441}
4442
4443static void selectWMMAModsNegAbs(unsigned ModOpcode, unsigned &Mods,
4444 SmallVectorImpl<Register> &Elts, Register &Src,
4445 MachineInstr *InsertPt,
4446 MachineRegisterInfo &MRI) {
4447 if (ModOpcode == TargetOpcode::G_FNEG) {
4448 Mods |= SISrcMods::NEG;
4449 // Check if all elements also have abs modifier
4450 SmallVector<Register, 8> NegAbsElts;
4451 for (auto El : Elts) {
4452 Register FabsSrc;
4453 if (!mi_match(R: El, MRI, P: m_GFabs(Src: m_Reg(R&: FabsSrc))))
4454 break;
4455 NegAbsElts.push_back(Elt: FabsSrc);
4456 }
4457 if (Elts.size() != NegAbsElts.size()) {
4458 // Neg
4459 Src = buildRegSequence(Elts, InsertPt, MRI);
4460 } else {
4461 // Neg and Abs
4462 Mods |= SISrcMods::NEG_HI;
4463 Src = buildRegSequence(Elts&: NegAbsElts, InsertPt, MRI);
4464 }
4465 } else {
4466 assert(ModOpcode == TargetOpcode::G_FABS);
4467 // Abs
4468 Mods |= SISrcMods::NEG_HI;
4469 Src = buildRegSequence(Elts, InsertPt, MRI);
4470 }
4471}
4472
4473InstructionSelector::ComplexRendererFns
4474AMDGPUInstructionSelector::selectWMMAModsF32NegAbs(MachineOperand &Root) const {
4475 Register Src = Root.getReg();
4476 unsigned Mods = SISrcMods::OP_SEL_1;
4477 SmallVector<Register, 8> EltsF32;
4478
4479 if (GBuildVector *BV = dyn_cast<GBuildVector>(Val: MRI->getVRegDef(Reg: Src))) {
4480 assert(BV->getNumSources() > 0);
4481    // Based on the first element, decide which modifier we match: neg or abs.
4482 MachineInstr *ElF32 = MRI->getVRegDef(Reg: BV->getSourceReg(I: 0));
4483 unsigned ModOpcode = (ElF32->getOpcode() == AMDGPU::G_FNEG)
4484 ? AMDGPU::G_FNEG
4485 : AMDGPU::G_FABS;
4486 for (unsigned i = 0; i < BV->getNumSources(); ++i) {
4487 ElF32 = MRI->getVRegDef(Reg: BV->getSourceReg(I: i));
4488 if (ElF32->getOpcode() != ModOpcode)
4489 break;
4490 EltsF32.push_back(Elt: ElF32->getOperand(i: 1).getReg());
4491 }
4492
4493 // All elements had ModOpcode modifier
4494 if (BV->getNumSources() == EltsF32.size()) {
4495 selectWMMAModsNegAbs(ModOpcode, Mods, Elts&: EltsF32, Src, InsertPt: Root.getParent(),
4496 MRI&: *MRI);
4497 }
4498 }
4499
4500 return {{[=](MachineInstrBuilder &MIB) { MIB.addReg(RegNo: Src); },
4501 [=](MachineInstrBuilder &MIB) { MIB.addImm(Val: Mods); }}};
4502}
4503
4504InstructionSelector::ComplexRendererFns
4505AMDGPUInstructionSelector::selectWMMAModsF16Neg(MachineOperand &Root) const {
4506 Register Src = Root.getReg();
4507 unsigned Mods = SISrcMods::OP_SEL_1;
4508 SmallVector<Register, 8> EltsV2F16;
4509
4510 if (GConcatVectors *CV = dyn_cast<GConcatVectors>(Val: MRI->getVRegDef(Reg: Src))) {
4511 for (unsigned i = 0; i < CV->getNumSources(); ++i) {
4512 Register FNegSrc;
4513 if (!mi_match(R: CV->getSourceReg(I: i), MRI: *MRI, P: m_GFNeg(Src: m_Reg(R&: FNegSrc))))
4514 break;
4515 EltsV2F16.push_back(Elt: FNegSrc);
4516 }
4517
4518    // All elements had the fneg modifier
4519 if (CV->getNumSources() == EltsV2F16.size()) {
4520 Mods |= SISrcMods::NEG;
4521 Mods |= SISrcMods::NEG_HI;
4522 Src = buildRegSequence(Elts&: EltsV2F16, InsertPt: Root.getParent(), MRI&: *MRI);
4523 }
4524 }
4525
4526 return {{[=](MachineInstrBuilder &MIB) { MIB.addReg(RegNo: Src); },
4527 [=](MachineInstrBuilder &MIB) { MIB.addImm(Val: Mods); }}};
4528}
4529
4530InstructionSelector::ComplexRendererFns
4531AMDGPUInstructionSelector::selectWMMAModsF16NegAbs(MachineOperand &Root) const {
4532 Register Src = Root.getReg();
4533 unsigned Mods = SISrcMods::OP_SEL_1;
4534 SmallVector<Register, 8> EltsV2F16;
4535
4536 if (GConcatVectors *CV = dyn_cast<GConcatVectors>(Val: MRI->getVRegDef(Reg: Src))) {
4537 assert(CV->getNumSources() > 0);
4538 MachineInstr *ElV2F16 = MRI->getVRegDef(Reg: CV->getSourceReg(I: 0));
4539    // Based on the first element, decide which modifier we match: neg or abs.
4540 unsigned ModOpcode = (ElV2F16->getOpcode() == AMDGPU::G_FNEG)
4541 ? AMDGPU::G_FNEG
4542 : AMDGPU::G_FABS;
4543
4544 for (unsigned i = 0; i < CV->getNumSources(); ++i) {
4545 ElV2F16 = MRI->getVRegDef(Reg: CV->getSourceReg(I: i));
4546 if (ElV2F16->getOpcode() != ModOpcode)
4547 break;
4548 EltsV2F16.push_back(Elt: ElV2F16->getOperand(i: 1).getReg());
4549 }
4550
4551 // All elements had ModOpcode modifier
4552 if (CV->getNumSources() == EltsV2F16.size()) {
4554 selectWMMAModsNegAbs(ModOpcode, Mods, Elts&: EltsV2F16, Src, InsertPt: Root.getParent(),
4555 MRI&: *MRI);
4556 }
4557 }
4558
4559 return {{[=](MachineInstrBuilder &MIB) { MIB.addReg(RegNo: Src); },
4560 [=](MachineInstrBuilder &MIB) { MIB.addImm(Val: Mods); }}};
4561}
4562
4563InstructionSelector::ComplexRendererFns
4564AMDGPUInstructionSelector::selectWMMAVISrc(MachineOperand &Root) const {
4565 std::optional<FPValueAndVReg> FPValReg;
4566 if (mi_match(R: Root.getReg(), MRI: *MRI, P: m_GFCstOrSplat(FPValReg))) {
4567 if (TII.isInlineConstant(Imm: FPValReg->Value)) {
4568 return {{[=](MachineInstrBuilder &MIB) {
4569 MIB.addImm(Val: FPValReg->Value.bitcastToAPInt().getSExtValue());
4570 }}};
4571 }
4572    // Non-inlineable splat floats should not fall through to the integer
4573    // immediate checks.
4574 return {};
4575 }
4576
4577 APInt ICst;
4578 if (mi_match(R: Root.getReg(), MRI: *MRI, P: m_ICstOrSplat(Cst&: ICst))) {
4579 if (TII.isInlineConstant(Imm: ICst)) {
4580 return {
4581 {[=](MachineInstrBuilder &MIB) { MIB.addImm(Val: ICst.getSExtValue()); }}};
4582 }
4583 }
4584
4585 return {};
4586}
4587
4588InstructionSelector::ComplexRendererFns
4589AMDGPUInstructionSelector::selectSWMMACIndex8(MachineOperand &Root) const {
4590 Register Src =
4591 getDefIgnoringCopies(Reg: Root.getReg(), MRI: *MRI)->getOperand(i: 0).getReg();
4592 unsigned Key = 0;
4593
4594 Register ShiftSrc;
4595 std::optional<ValueAndVReg> ShiftAmt;
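  // For example, Src = (G_LSHR %x, 16) with a 32-bit %x selects %x with
  // index_key = 2 (16 / 8).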
4596 if (mi_match(R: Src, MRI: *MRI, P: m_GLShr(L: m_Reg(R&: ShiftSrc), R: m_GCst(ValReg&: ShiftAmt))) &&
4597 MRI->getType(Reg: ShiftSrc).getSizeInBits() == 32 &&
4598 ShiftAmt->Value.getZExtValue() % 8 == 0) {
4599 Key = ShiftAmt->Value.getZExtValue() / 8;
4600 Src = ShiftSrc;
4601 }
4602
4603 return {{
4604 [=](MachineInstrBuilder &MIB) { MIB.addReg(RegNo: Src); },
4605 [=](MachineInstrBuilder &MIB) { MIB.addImm(Val: Key); } // index_key
4606 }};
4607}
4608
4609InstructionSelector::ComplexRendererFns
4610AMDGPUInstructionSelector::selectSWMMACIndex16(MachineOperand &Root) const {
4611
4612 Register Src =
4613 getDefIgnoringCopies(Reg: Root.getReg(), MRI: *MRI)->getOperand(i: 0).getReg();
4614 unsigned Key = 0;
4615
4616 Register ShiftSrc;
4617 std::optional<ValueAndVReg> ShiftAmt;
4618 if (mi_match(R: Src, MRI: *MRI, P: m_GLShr(L: m_Reg(R&: ShiftSrc), R: m_GCst(ValReg&: ShiftAmt))) &&
4619 MRI->getType(Reg: ShiftSrc).getSizeInBits() == 32 &&
4620 ShiftAmt->Value.getZExtValue() == 16) {
4621 Src = ShiftSrc;
4622 Key = 1;
4623 }
4624
4625 return {{
4626 [=](MachineInstrBuilder &MIB) { MIB.addReg(RegNo: Src); },
4627 [=](MachineInstrBuilder &MIB) { MIB.addImm(Val: Key); } // index_key
4628 }};
4629}
4630
4631InstructionSelector::ComplexRendererFns
4632AMDGPUInstructionSelector::selectVOP3OpSelMods(MachineOperand &Root) const {
4633 Register Src;
4634 unsigned Mods;
4635 std::tie(args&: Src, args&: Mods) = selectVOP3ModsImpl(Src: Root.getReg());
4636
4637 // FIXME: Handle op_sel
4638 return {{
4639 [=](MachineInstrBuilder &MIB) { MIB.addReg(RegNo: Src); },
4640 [=](MachineInstrBuilder &MIB) { MIB.addImm(Val: Mods); } // src_mods
4641 }};
4642}
4643
4644// FIXME-TRUE16 remove when fake16 is removed
4645InstructionSelector::ComplexRendererFns
4646AMDGPUInstructionSelector::selectVINTERPMods(MachineOperand &Root) const {
4647 Register Src;
4648 unsigned Mods;
4649 std::tie(args&: Src, args&: Mods) = selectVOP3ModsImpl(Src: Root.getReg(),
4650 /*IsCanonicalizing=*/true,
4651 /*AllowAbs=*/false,
4652 /*OpSel=*/false);
4653
4654 return {{
4655 [=](MachineInstrBuilder &MIB) {
4656 MIB.addReg(
4657 RegNo: copyToVGPRIfSrcFolded(Src, Mods, Root, InsertPt: MIB, /* ForceVGPR */ true));
4658 },
4659 [=](MachineInstrBuilder &MIB) { MIB.addImm(Val: Mods); }, // src0_mods
4660 }};
4661}
4662
4663InstructionSelector::ComplexRendererFns
4664AMDGPUInstructionSelector::selectVINTERPModsHi(MachineOperand &Root) const {
4665 Register Src;
4666 unsigned Mods;
4667 std::tie(args&: Src, args&: Mods) = selectVOP3ModsImpl(Src: Root.getReg(),
4668 /*IsCanonicalizing=*/true,
4669 /*AllowAbs=*/false,
4670 /*OpSel=*/true);
4671
4672 return {{
4673 [=](MachineInstrBuilder &MIB) {
4674 MIB.addReg(
4675 RegNo: copyToVGPRIfSrcFolded(Src, Mods, Root, InsertPt: MIB, /* ForceVGPR */ true));
4676 },
4677 [=](MachineInstrBuilder &MIB) { MIB.addImm(Val: Mods); }, // src0_mods
4678 }};
4679}
4680
4681bool AMDGPUInstructionSelector::selectSmrdOffset(MachineOperand &Root,
4682 Register &Base,
4683 Register *SOffset,
4684 int64_t *Offset) const {
4685 MachineInstr *MI = Root.getParent();
4686 MachineBasicBlock *MBB = MI->getParent();
4687
4688 // FIXME: We should shrink the GEP if the offset is known to be <= 32-bits,
4689 // then we can select all ptr + 32-bit offsets.
4690 SmallVector<GEPInfo, 4> AddrInfo;
4691 getAddrModeInfo(Load: *MI, MRI: *MRI, AddrInfo);
4692
4693 if (AddrInfo.empty())
4694 return false;
4695
4696 const GEPInfo &GEPI = AddrInfo[0];
4697 std::optional<int64_t> EncodedImm;
4698
4699 if (SOffset && Offset) {
4700 EncodedImm = AMDGPU::getSMRDEncodedOffset(ST: STI, ByteOffset: GEPI.Imm, /*IsBuffer=*/false,
4701 /*HasSOffset=*/true);
4702 if (GEPI.SgprParts.size() == 1 && GEPI.Imm != 0 && EncodedImm &&
4703 AddrInfo.size() > 1) {
4704 const GEPInfo &GEPI2 = AddrInfo[1];
4705 if (GEPI2.SgprParts.size() == 2 && GEPI2.Imm == 0) {
4706 if (Register OffsetReg =
4707 matchZeroExtendFromS32(MRI&: *MRI, Reg: GEPI2.SgprParts[1])) {
4708 Base = GEPI2.SgprParts[0];
4709 *SOffset = OffsetReg;
4710 *Offset = *EncodedImm;
4711 if (*Offset >= 0 || !AMDGPU::hasSMRDSignedImmOffset(ST: STI))
4712 return true;
4713
4714          // For unbuffered smem loads, it is illegal for the immediate offset
4715          // to be negative if the resulting (Offset + (M0 or SOffset or zero))
4716          // is negative. Handle the case where the immediate offset + SOffset
4717          // is negative.
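          // For example, with *Offset = -8 and a known minimum SOffset of 4, the
          // effective offset could be negative, so the fold is rejected below.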
4718 auto SKnown = VT->getKnownBits(R: *SOffset);
4719 if (*Offset + SKnown.getMinValue().getSExtValue() < 0)
4720 return false;
4721
4722 return true;
4723 }
4724 }
4725 }
4726 return false;
4727 }
4728
4729 EncodedImm = AMDGPU::getSMRDEncodedOffset(ST: STI, ByteOffset: GEPI.Imm, /*IsBuffer=*/false,
4730 /*HasSOffset=*/false);
4731 if (Offset && GEPI.SgprParts.size() == 1 && EncodedImm) {
4732 Base = GEPI.SgprParts[0];
4733 *Offset = *EncodedImm;
4734 return true;
4735 }
4736
4737 // SGPR offset is unsigned.
4738 if (SOffset && GEPI.SgprParts.size() == 1 && isUInt<32>(x: GEPI.Imm) &&
4739 GEPI.Imm != 0) {
4740    // If we make it this far we have a load with a 32-bit immediate offset.
4741    // It is OK to select this using an SGPR offset, because we have already
4742    // failed trying to select this load into one of the _IMM variants since
4743    // the _IMM patterns are considered before the _SGPR patterns.
4744 Base = GEPI.SgprParts[0];
4745 *SOffset = MRI->createVirtualRegister(RegClass: &AMDGPU::SReg_32RegClass);
4746 BuildMI(BB&: *MBB, I: MI, MIMD: MI->getDebugLoc(), MCID: TII.get(Opcode: AMDGPU::S_MOV_B32), DestReg: *SOffset)
4747 .addImm(Val: GEPI.Imm);
4748 return true;
4749 }
4750
4751   if (SOffset && GEPI.SgprParts.size() == 2 && GEPI.Imm == 0) {
4752 if (Register OffsetReg = matchZeroExtendFromS32(MRI&: *MRI, Reg: GEPI.SgprParts[1])) {
4753 Base = GEPI.SgprParts[0];
4754 *SOffset = OffsetReg;
4755 return true;
4756 }
4757 }
4758
4759 return false;
4760}
4761
4762InstructionSelector::ComplexRendererFns
4763AMDGPUInstructionSelector::selectSmrdImm(MachineOperand &Root) const {
4764 Register Base;
4765 int64_t Offset;
4766 if (!selectSmrdOffset(Root, Base, /* SOffset= */ nullptr, Offset: &Offset))
4767 return std::nullopt;
4768
4769 return {{[=](MachineInstrBuilder &MIB) { MIB.addReg(RegNo: Base); },
4770 [=](MachineInstrBuilder &MIB) { MIB.addImm(Val: Offset); }}};
4771}
4772
4773InstructionSelector::ComplexRendererFns
4774AMDGPUInstructionSelector::selectSmrdImm32(MachineOperand &Root) const {
4775 SmallVector<GEPInfo, 4> AddrInfo;
4776 getAddrModeInfo(Load: *Root.getParent(), MRI: *MRI, AddrInfo);
4777
4778 if (AddrInfo.empty() || AddrInfo[0].SgprParts.size() != 1)
4779 return std::nullopt;
4780
4781 const GEPInfo &GEPInfo = AddrInfo[0];
4782 Register PtrReg = GEPInfo.SgprParts[0];
4783 std::optional<int64_t> EncodedImm =
4784 AMDGPU::getSMRDEncodedLiteralOffset32(ST: STI, ByteOffset: GEPInfo.Imm);
4785 if (!EncodedImm)
4786 return std::nullopt;
4787
4788 return {{
4789 [=](MachineInstrBuilder &MIB) { MIB.addReg(RegNo: PtrReg); },
4790 [=](MachineInstrBuilder &MIB) { MIB.addImm(Val: *EncodedImm); }
4791 }};
4792}
4793
4794InstructionSelector::ComplexRendererFns
4795AMDGPUInstructionSelector::selectSmrdSgpr(MachineOperand &Root) const {
4796 Register Base, SOffset;
4797 if (!selectSmrdOffset(Root, Base, SOffset: &SOffset, /* Offset= */ nullptr))
4798 return std::nullopt;
4799
4800 return {{[=](MachineInstrBuilder &MIB) { MIB.addReg(RegNo: Base); },
4801 [=](MachineInstrBuilder &MIB) { MIB.addReg(RegNo: SOffset); }}};
4802}
4803
4804InstructionSelector::ComplexRendererFns
4805AMDGPUInstructionSelector::selectSmrdSgprImm(MachineOperand &Root) const {
4806 Register Base, SOffset;
4807 int64_t Offset;
4808 if (!selectSmrdOffset(Root, Base, SOffset: &SOffset, Offset: &Offset))
4809 return std::nullopt;
4810
4811 return {{[=](MachineInstrBuilder &MIB) { MIB.addReg(RegNo: Base); },
4812 [=](MachineInstrBuilder &MIB) { MIB.addReg(RegNo: SOffset); },
4813 [=](MachineInstrBuilder &MIB) { MIB.addImm(Val: Offset); }}};
4814}
4815
4816std::pair<Register, int>
4817AMDGPUInstructionSelector::selectFlatOffsetImpl(MachineOperand &Root,
4818 uint64_t FlatVariant) const {
4819 MachineInstr *MI = Root.getParent();
4820
4821 auto Default = std::pair(Root.getReg(), 0);
4822
4823 if (!STI.hasFlatInstOffsets())
4824 return Default;
4825
4826 Register PtrBase;
4827 int64_t ConstOffset;
4828 std::tie(args&: PtrBase, args&: ConstOffset) =
4829 getPtrBaseWithConstantOffset(Root: Root.getReg(), MRI: *MRI);
4830
4831 if (ConstOffset == 0 || (FlatVariant == SIInstrFlags::FlatScratch &&
4832 !isFlatScratchBaseLegal(Addr: Root.getReg())))
4833 return Default;
4834
4835 unsigned AddrSpace = (*MI->memoperands_begin())->getAddrSpace();
4836 if (!TII.isLegalFLATOffset(Offset: ConstOffset, AddrSpace, FlatVariant))
4837 return Default;
4838
4839 return std::pair(PtrBase, ConstOffset);
4840}
4841
4842InstructionSelector::ComplexRendererFns
4843AMDGPUInstructionSelector::selectFlatOffset(MachineOperand &Root) const {
4844 auto PtrWithOffset = selectFlatOffsetImpl(Root, FlatVariant: SIInstrFlags::FLAT);
4845
4846 return {{
4847 [=](MachineInstrBuilder &MIB) { MIB.addReg(RegNo: PtrWithOffset.first); },
4848 [=](MachineInstrBuilder &MIB) { MIB.addImm(Val: PtrWithOffset.second); },
4849 }};
4850}
4851
4852InstructionSelector::ComplexRendererFns
4853AMDGPUInstructionSelector::selectGlobalOffset(MachineOperand &Root) const {
4854 auto PtrWithOffset = selectFlatOffsetImpl(Root, FlatVariant: SIInstrFlags::FlatGlobal);
4855
4856 return {{
4857 [=](MachineInstrBuilder &MIB) { MIB.addReg(RegNo: PtrWithOffset.first); },
4858 [=](MachineInstrBuilder &MIB) { MIB.addImm(Val: PtrWithOffset.second); },
4859 }};
4860}
4861
4862InstructionSelector::ComplexRendererFns
4863AMDGPUInstructionSelector::selectScratchOffset(MachineOperand &Root) const {
4864 auto PtrWithOffset = selectFlatOffsetImpl(Root, FlatVariant: SIInstrFlags::FlatScratch);
4865
4866 return {{
4867 [=](MachineInstrBuilder &MIB) { MIB.addReg(RegNo: PtrWithOffset.first); },
4868 [=](MachineInstrBuilder &MIB) { MIB.addImm(Val: PtrWithOffset.second); },
4869 }};
4870}
4871
4872// Match (64-bit SGPR base) + (zext vgpr offset) + sext(imm offset)
4873InstructionSelector::ComplexRendererFns
4874AMDGPUInstructionSelector::selectGlobalSAddr(MachineOperand &Root) const {
4875 Register Addr = Root.getReg();
4876 Register PtrBase;
4877 int64_t ConstOffset;
4878 int64_t ImmOffset = 0;
4879
4880 // Match the immediate offset first, which canonically is moved as low as
4881 // possible.
4882 std::tie(args&: PtrBase, args&: ConstOffset) = getPtrBaseWithConstantOffset(Root: Addr, MRI: *MRI);
4883
4884 if (ConstOffset != 0) {
4885 if (TII.isLegalFLATOffset(Offset: ConstOffset, AddrSpace: AMDGPUAS::GLOBAL_ADDRESS,
4886 FlatVariant: SIInstrFlags::FlatGlobal)) {
4887 Addr = PtrBase;
4888 ImmOffset = ConstOffset;
4889 } else {
4890 auto PtrBaseDef = getDefSrcRegIgnoringCopies(Reg: PtrBase, MRI: *MRI);
4891 if (isSGPR(Reg: PtrBaseDef->Reg)) {
4892 if (ConstOffset > 0) {
4893 // Offset is too large.
4894 //
4895 // saddr + large_offset -> saddr +
4896 // (voffset = large_offset & ~MaxOffset) +
4897 // (large_offset & MaxOffset);
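              // For example, assuming MaxOffset = 0xFFF purely for illustration, a
              // ConstOffset of 0x12345 becomes voffset = 0x12000 plus an immediate
              // offset of 0x345.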
4898 int64_t SplitImmOffset, RemainderOffset;
4899 std::tie(args&: SplitImmOffset, args&: RemainderOffset) = TII.splitFlatOffset(
4900 COffsetVal: ConstOffset, AddrSpace: AMDGPUAS::GLOBAL_ADDRESS, FlatVariant: SIInstrFlags::FlatGlobal);
4901
4902 if (isUInt<32>(x: RemainderOffset)) {
4903 MachineInstr *MI = Root.getParent();
4904 MachineBasicBlock *MBB = MI->getParent();
4905 Register HighBits =
4906 MRI->createVirtualRegister(RegClass: &AMDGPU::VGPR_32RegClass);
4907
4908 BuildMI(BB&: *MBB, I: MI, MIMD: MI->getDebugLoc(), MCID: TII.get(Opcode: AMDGPU::V_MOV_B32_e32),
4909 DestReg: HighBits)
4910 .addImm(Val: RemainderOffset);
4911
4912 return {{
4913 [=](MachineInstrBuilder &MIB) { MIB.addReg(RegNo: PtrBase); }, // saddr
4914 [=](MachineInstrBuilder &MIB) {
4915 MIB.addReg(RegNo: HighBits);
4916 }, // voffset
4917 [=](MachineInstrBuilder &MIB) { MIB.addImm(Val: SplitImmOffset); },
4918 }};
4919 }
4920 }
4921
4922        // We are adding a 64-bit SGPR and a constant. If the constant bus limit
4923        // is 1 we would need to perform 1 or 2 extra moves for each half of
4924        // the constant, and it is better to do a scalar add and then issue a
4925        // single VALU instruction to materialize zero. Otherwise it takes fewer
4926        // instructions to perform VALU adds with immediates or inline literals.
4927 unsigned NumLiterals =
4928 !TII.isInlineConstant(Imm: APInt(32, Lo_32(Value: ConstOffset))) +
4929 !TII.isInlineConstant(Imm: APInt(32, Hi_32(Value: ConstOffset)));
4930 if (STI.getConstantBusLimit(Opcode: AMDGPU::V_ADD_U32_e64) > NumLiterals)
4931 return std::nullopt;
4932 }
4933 }
4934 }
4935
4936 // Match the variable offset.
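  // A sketch of the shape matched here:
  //   %voff64:(s64) = G_ZEXT %voff32:(s32)
  //   %addr:(p1) = G_PTR_ADD %saddr, %voff64
  // which selects saddr = %saddr, voffset = %voff32, offset = ImmOffset.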
4937 auto AddrDef = getDefSrcRegIgnoringCopies(Reg: Addr, MRI: *MRI);
4938 if (AddrDef->MI->getOpcode() == AMDGPU::G_PTR_ADD) {
4939 // Look through the SGPR->VGPR copy.
4940 Register SAddr =
4941 getSrcRegIgnoringCopies(Reg: AddrDef->MI->getOperand(i: 1).getReg(), MRI: *MRI);
4942
4943 if (isSGPR(Reg: SAddr)) {
4944 Register PtrBaseOffset = AddrDef->MI->getOperand(i: 2).getReg();
4945
4946 // It's possible voffset is an SGPR here, but the copy to VGPR will be
4947 // inserted later.
4948 if (Register VOffset = matchZeroExtendFromS32(MRI&: *MRI, Reg: PtrBaseOffset)) {
4949 return {{[=](MachineInstrBuilder &MIB) { // saddr
4950 MIB.addReg(RegNo: SAddr);
4951 },
4952 [=](MachineInstrBuilder &MIB) { // voffset
4953 MIB.addReg(RegNo: VOffset);
4954 },
4955 [=](MachineInstrBuilder &MIB) { // offset
4956 MIB.addImm(Val: ImmOffset);
4957 }}};
4958 }
4959 }
4960 }
4961
4962 // FIXME: We should probably have folded COPY (G_IMPLICIT_DEF) earlier, and
4963 // drop this.
4964 if (AddrDef->MI->getOpcode() == AMDGPU::G_IMPLICIT_DEF ||
4965 AddrDef->MI->getOpcode() == AMDGPU::G_CONSTANT || !isSGPR(Reg: AddrDef->Reg))
4966 return std::nullopt;
4967
4968 // It's cheaper to materialize a single 32-bit zero for vaddr than the two
4969 // moves required to copy a 64-bit SGPR to VGPR.
4970 MachineInstr *MI = Root.getParent();
4971 MachineBasicBlock *MBB = MI->getParent();
4972 Register VOffset = MRI->createVirtualRegister(RegClass: &AMDGPU::VGPR_32RegClass);
4973
4974 BuildMI(BB&: *MBB, I: MI, MIMD: MI->getDebugLoc(), MCID: TII.get(Opcode: AMDGPU::V_MOV_B32_e32), DestReg: VOffset)
4975 .addImm(Val: 0);
4976
4977 return {{
4978 [=](MachineInstrBuilder &MIB) { MIB.addReg(RegNo: AddrDef->Reg); }, // saddr
4979 [=](MachineInstrBuilder &MIB) { MIB.addReg(RegNo: VOffset); }, // voffset
4980 [=](MachineInstrBuilder &MIB) { MIB.addImm(Val: ImmOffset); } // offset
4981 }};
4982}
4983
4984InstructionSelector::ComplexRendererFns
4985AMDGPUInstructionSelector::selectScratchSAddr(MachineOperand &Root) const {
4986 Register Addr = Root.getReg();
4987 Register PtrBase;
4988 int64_t ConstOffset;
4989 int64_t ImmOffset = 0;
4990
4991 // Match the immediate offset first, which canonically is moved as low as
4992 // possible.
4993 std::tie(args&: PtrBase, args&: ConstOffset) = getPtrBaseWithConstantOffset(Root: Addr, MRI: *MRI);
4994
4995 if (ConstOffset != 0 && isFlatScratchBaseLegal(Addr) &&
4996 TII.isLegalFLATOffset(Offset: ConstOffset, AddrSpace: AMDGPUAS::PRIVATE_ADDRESS,
4997 FlatVariant: SIInstrFlags::FlatScratch)) {
4998 Addr = PtrBase;
4999 ImmOffset = ConstOffset;
5000 }
5001
5002 auto AddrDef = getDefSrcRegIgnoringCopies(Reg: Addr, MRI: *MRI);
5003 if (AddrDef->MI->getOpcode() == AMDGPU::G_FRAME_INDEX) {
5004 int FI = AddrDef->MI->getOperand(i: 1).getIndex();
5005 return {{
5006 [=](MachineInstrBuilder &MIB) { MIB.addFrameIndex(Idx: FI); }, // saddr
5007 [=](MachineInstrBuilder &MIB) { MIB.addImm(Val: ImmOffset); } // offset
5008 }};
5009 }
5010
5011 Register SAddr = AddrDef->Reg;
5012
5013 if (AddrDef->MI->getOpcode() == AMDGPU::G_PTR_ADD) {
5014 Register LHS = AddrDef->MI->getOperand(i: 1).getReg();
5015 Register RHS = AddrDef->MI->getOperand(i: 2).getReg();
5016 auto LHSDef = getDefSrcRegIgnoringCopies(Reg: LHS, MRI: *MRI);
5017 auto RHSDef = getDefSrcRegIgnoringCopies(Reg: RHS, MRI: *MRI);
5018
5019 if (LHSDef->MI->getOpcode() == AMDGPU::G_FRAME_INDEX &&
5020 isSGPR(Reg: RHSDef->Reg)) {
5021 int FI = LHSDef->MI->getOperand(i: 1).getIndex();
5022 MachineInstr &I = *Root.getParent();
5023 MachineBasicBlock *BB = I.getParent();
5024 const DebugLoc &DL = I.getDebugLoc();
5025 SAddr = MRI->createVirtualRegister(RegClass: &AMDGPU::SReg_32RegClass);
5026
5027 BuildMI(BB&: *BB, I: &I, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::S_ADD_I32), DestReg: SAddr)
5028 .addFrameIndex(Idx: FI)
5029 .addReg(RegNo: RHSDef->Reg)
5030 .setOperandDead(3); // Dead scc
5031 }
5032 }
5033
5034 if (!isSGPR(Reg: SAddr))
5035 return std::nullopt;
5036
5037 return {{
5038 [=](MachineInstrBuilder &MIB) { MIB.addReg(RegNo: SAddr); }, // saddr
5039 [=](MachineInstrBuilder &MIB) { MIB.addImm(Val: ImmOffset); } // offset
5040 }};
5041}
5042
5043// Check whether the flat scratch SVS swizzle bug affects this access.
5044bool AMDGPUInstructionSelector::checkFlatScratchSVSSwizzleBug(
5045 Register VAddr, Register SAddr, uint64_t ImmOffset) const {
5046 if (!Subtarget->hasFlatScratchSVSSwizzleBug())
5047 return false;
5048
5049 // The bug affects the swizzling of SVS accesses if there is any carry out
5050 // from the two low order bits (i.e. from bit 1 into bit 2) when adding
5051 // voffset to (soffset + inst_offset).
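  // For example, if VAddr's known maximum ends in 0b11 and the known maximum of
  // (SAddr + ImmOffset) ends in 0b01, then 3 + 1 >= 4, so a carry out of bit 1
  // cannot be ruled out and the bug is conservatively assumed to apply.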
5052 auto VKnown = VT->getKnownBits(R: VAddr);
5053 auto SKnown = KnownBits::add(LHS: VT->getKnownBits(R: SAddr),
5054 RHS: KnownBits::makeConstant(C: APInt(32, ImmOffset)));
5055 uint64_t VMax = VKnown.getMaxValue().getZExtValue();
5056 uint64_t SMax = SKnown.getMaxValue().getZExtValue();
5057 return (VMax & 3) + (SMax & 3) >= 4;
5058}
5059
5060InstructionSelector::ComplexRendererFns
5061AMDGPUInstructionSelector::selectScratchSVAddr(MachineOperand &Root) const {
5062 Register Addr = Root.getReg();
5063 Register PtrBase;
5064 int64_t ConstOffset;
5065 int64_t ImmOffset = 0;
5066
5067 // Match the immediate offset first, which canonically is moved as low as
5068 // possible.
5069 std::tie(args&: PtrBase, args&: ConstOffset) = getPtrBaseWithConstantOffset(Root: Addr, MRI: *MRI);
5070
5071 Register OrigAddr = Addr;
5072 if (ConstOffset != 0 &&
5073       TII.isLegalFLATOffset(Offset: ConstOffset, AddrSpace: AMDGPUAS::PRIVATE_ADDRESS, FlatVariant: SIInstrFlags::FlatScratch)) {
5074 Addr = PtrBase;
5075 ImmOffset = ConstOffset;
5076 }
5077
5078 auto AddrDef = getDefSrcRegIgnoringCopies(Reg: Addr, MRI: *MRI);
5079 if (AddrDef->MI->getOpcode() != AMDGPU::G_PTR_ADD)
5080 return std::nullopt;
5081
5082 Register RHS = AddrDef->MI->getOperand(i: 2).getReg();
5083 if (RBI.getRegBank(Reg: RHS, MRI: *MRI, TRI)->getID() != AMDGPU::VGPRRegBankID)
5084 return std::nullopt;
5085
5086 Register LHS = AddrDef->MI->getOperand(i: 1).getReg();
5087 auto LHSDef = getDefSrcRegIgnoringCopies(Reg: LHS, MRI: *MRI);
5088
5089 if (OrigAddr != Addr) {
5090 if (!isFlatScratchBaseLegalSVImm(Addr: OrigAddr))
5091 return std::nullopt;
5092 } else {
5093 if (!isFlatScratchBaseLegalSV(Addr: OrigAddr))
5094 return std::nullopt;
5095 }
5096
5097 if (checkFlatScratchSVSSwizzleBug(VAddr: RHS, SAddr: LHS, ImmOffset))
5098 return std::nullopt;
5099
5100 if (LHSDef->MI->getOpcode() == AMDGPU::G_FRAME_INDEX) {
5101 int FI = LHSDef->MI->getOperand(i: 1).getIndex();
5102 return {{
5103 [=](MachineInstrBuilder &MIB) { MIB.addReg(RegNo: RHS); }, // vaddr
5104 [=](MachineInstrBuilder &MIB) { MIB.addFrameIndex(Idx: FI); }, // saddr
5105 [=](MachineInstrBuilder &MIB) { MIB.addImm(Val: ImmOffset); } // offset
5106 }};
5107 }
5108
5109 if (!isSGPR(Reg: LHS))
5110 return std::nullopt;
5111
5112 return {{
5113 [=](MachineInstrBuilder &MIB) { MIB.addReg(RegNo: RHS); }, // vaddr
5114 [=](MachineInstrBuilder &MIB) { MIB.addReg(RegNo: LHS); }, // saddr
5115 [=](MachineInstrBuilder &MIB) { MIB.addImm(Val: ImmOffset); } // offset
5116 }};
5117}
5118
5119InstructionSelector::ComplexRendererFns
5120AMDGPUInstructionSelector::selectMUBUFScratchOffen(MachineOperand &Root) const {
5121 MachineInstr *MI = Root.getParent();
5122 MachineBasicBlock *MBB = MI->getParent();
5123 MachineFunction *MF = MBB->getParent();
5124 const SIMachineFunctionInfo *Info = MF->getInfo<SIMachineFunctionInfo>();
5125
5126 int64_t Offset = 0;
5127 if (mi_match(R: Root.getReg(), MRI: *MRI, P: m_ICst(Cst&: Offset)) &&
5128 Offset != TM.getNullPointerValue(AddrSpace: AMDGPUAS::PRIVATE_ADDRESS)) {
5129 Register HighBits = MRI->createVirtualRegister(RegClass: &AMDGPU::VGPR_32RegClass);
5130
5131 // TODO: Should this be inside the render function? The iterator seems to
5132 // move.
5133 const uint32_t MaxOffset = SIInstrInfo::getMaxMUBUFImmOffset(ST: *Subtarget);
5134 BuildMI(BB&: *MBB, I: MI, MIMD: MI->getDebugLoc(), MCID: TII.get(Opcode: AMDGPU::V_MOV_B32_e32),
5135 DestReg: HighBits)
5136 .addImm(Val: Offset & ~MaxOffset);
5137
5138 return {{[=](MachineInstrBuilder &MIB) { // rsrc
5139 MIB.addReg(RegNo: Info->getScratchRSrcReg());
5140 },
5141 [=](MachineInstrBuilder &MIB) { // vaddr
5142 MIB.addReg(RegNo: HighBits);
5143 },
5144 [=](MachineInstrBuilder &MIB) { // soffset
5145 // Use constant zero for soffset and rely on eliminateFrameIndex
5146 // to choose the appropriate frame register if need be.
5147 MIB.addImm(Val: 0);
5148 },
5149 [=](MachineInstrBuilder &MIB) { // offset
5150 MIB.addImm(Val: Offset & MaxOffset);
5151 }}};
5152 }
5153
5154 assert(Offset == 0 || Offset == -1);
5155
5156   // Try to fold a frame index directly into the MUBUF vaddr field, along with
5157   // any offset.
5158 std::optional<int> FI;
5159 Register VAddr = Root.getReg();
5160
5161 const MachineInstr *RootDef = MRI->getVRegDef(Reg: Root.getReg());
5162 Register PtrBase;
5163 int64_t ConstOffset;
5164 std::tie(args&: PtrBase, args&: ConstOffset) = getPtrBaseWithConstantOffset(Root: VAddr, MRI: *MRI);
5165 if (ConstOffset != 0) {
5166 if (TII.isLegalMUBUFImmOffset(Imm: ConstOffset) &&
5167 (!STI.privateMemoryResourceIsRangeChecked() ||
5168 VT->signBitIsZero(Op: PtrBase))) {
5169 const MachineInstr *PtrBaseDef = MRI->getVRegDef(Reg: PtrBase);
5170 if (PtrBaseDef->getOpcode() == AMDGPU::G_FRAME_INDEX)
5171 FI = PtrBaseDef->getOperand(i: 1).getIndex();
5172 else
5173 VAddr = PtrBase;
5174 Offset = ConstOffset;
5175 }
5176 } else if (RootDef->getOpcode() == AMDGPU::G_FRAME_INDEX) {
5177 FI = RootDef->getOperand(i: 1).getIndex();
5178 }
5179
5180 return {{[=](MachineInstrBuilder &MIB) { // rsrc
5181 MIB.addReg(RegNo: Info->getScratchRSrcReg());
5182 },
5183 [=](MachineInstrBuilder &MIB) { // vaddr
5184 if (FI)
5185 MIB.addFrameIndex(Idx: *FI);
5186 else
5187 MIB.addReg(RegNo: VAddr);
5188 },
5189 [=](MachineInstrBuilder &MIB) { // soffset
5190 // Use constant zero for soffset and rely on eliminateFrameIndex
5191 // to choose the appropriate frame register if need be.
5192 MIB.addImm(Val: 0);
5193 },
5194 [=](MachineInstrBuilder &MIB) { // offset
5195 MIB.addImm(Val: Offset);
5196 }}};
5197}
5198
5199bool AMDGPUInstructionSelector::isDSOffsetLegal(Register Base,
5200 int64_t Offset) const {
5201 if (!isUInt<16>(x: Offset))
5202 return false;
5203
5204 if (STI.hasUsableDSOffset() || STI.unsafeDSOffsetFoldingEnabled())
5205 return true;
5206
5207   // On Southern Islands, instructions with a negative base value and an offset
5208   // don't seem to work.
5209 return VT->signBitIsZero(Op: Base);
5210}
5211
5212bool AMDGPUInstructionSelector::isDSOffset2Legal(Register Base, int64_t Offset0,
5213 int64_t Offset1,
5214 unsigned Size) const {
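  // For example, with Size = 4, byte offsets 0 and 1020 encode as offset0 = 0
  // and offset1 = 255 and pass the checks below; 1024 / 4 = 256 would not fit
  // the 8-bit fields.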
5215 if (Offset0 % Size != 0 || Offset1 % Size != 0)
5216 return false;
5217 if (!isUInt<8>(x: Offset0 / Size) || !isUInt<8>(x: Offset1 / Size))
5218 return false;
5219
5220 if (STI.hasUsableDSOffset() || STI.unsafeDSOffsetFoldingEnabled())
5221 return true;
5222
5223   // On Southern Islands, instructions with a negative base value and an offset
5224   // don't seem to work.
5225 return VT->signBitIsZero(Op: Base);
5226}
5227
5228 // Return whether the operation has the NoUnsignedWrap property.
5229static bool isNoUnsignedWrap(MachineInstr *Addr) {
5230 return Addr->getOpcode() == TargetOpcode::G_OR ||
5231 (Addr->getOpcode() == TargetOpcode::G_PTR_ADD &&
5232 Addr->getFlag(Flag: MachineInstr::NoUWrap));
5233}
5234
5235 // Check that the base address of a flat scratch load/store in the form of
5236 // `base + offset` is legal to be put in an SGPR/VGPR (i.e. unsigned per the
5237 // hardware requirement). We always treat the first operand as the base address here.
5238bool AMDGPUInstructionSelector::isFlatScratchBaseLegal(Register Addr) const {
5239 MachineInstr *AddrMI = getDefIgnoringCopies(Reg: Addr, MRI: *MRI);
5240
5241 if (isNoUnsignedWrap(Addr: AddrMI))
5242 return true;
5243
5244 // Starting with GFX12, VADDR and SADDR fields in VSCRATCH can use negative
5245 // values.
5246 if (STI.hasSignedScratchOffsets())
5247 return true;
5248
5249 Register LHS = AddrMI->getOperand(i: 1).getReg();
5250 Register RHS = AddrMI->getOperand(i: 2).getReg();
5251
5252 if (AddrMI->getOpcode() == TargetOpcode::G_PTR_ADD) {
5253 std::optional<ValueAndVReg> RhsValReg =
5254 getIConstantVRegValWithLookThrough(VReg: RHS, MRI: *MRI);
5255     // If the immediate offset is negative and within a certain range, the base
5256     // address cannot also be negative. If the base is also negative, the sum
5257     // would be either negative or much larger than the valid range of scratch
5258     // memory a thread can access.
5259 if (RhsValReg && RhsValReg->Value.getSExtValue() < 0 &&
5260 RhsValReg->Value.getSExtValue() > -0x40000000)
5261 return true;
5262 }
5263
5264 return VT->signBitIsZero(Op: LHS);
5265}
5266
5267 // Check that the address values in the SGPR/VGPR are legal for flat scratch in
5268 // the form: SGPR + VGPR.
5269bool AMDGPUInstructionSelector::isFlatScratchBaseLegalSV(Register Addr) const {
5270 MachineInstr *AddrMI = getDefIgnoringCopies(Reg: Addr, MRI: *MRI);
5271
5272 if (isNoUnsignedWrap(Addr: AddrMI))
5273 return true;
5274
5275 // Starting with GFX12, VADDR and SADDR fields in VSCRATCH can use negative
5276 // values.
5277 if (STI.hasSignedScratchOffsets())
5278 return true;
5279
5280 Register LHS = AddrMI->getOperand(i: 1).getReg();
5281 Register RHS = AddrMI->getOperand(i: 2).getReg();
5282 return VT->signBitIsZero(Op: RHS) && VT->signBitIsZero(Op: LHS);
5283}
5284
5285 // Check that the address values in the SGPR/VGPR are legal for flat scratch in
5286 // the form: SGPR + VGPR + Imm.
5287bool AMDGPUInstructionSelector::isFlatScratchBaseLegalSVImm(
5288 Register Addr) const {
5289 // Starting with GFX12, VADDR and SADDR fields in VSCRATCH can use negative
5290 // values.
5291 if (STI.hasSignedScratchOffsets())
5292 return true;
5293
5294 MachineInstr *AddrMI = getDefIgnoringCopies(Reg: Addr, MRI: *MRI);
5295 Register Base = AddrMI->getOperand(i: 1).getReg();
5296 std::optional<DefinitionAndSourceRegister> BaseDef =
5297 getDefSrcRegIgnoringCopies(Reg: Base, MRI: *MRI);
5298 std::optional<ValueAndVReg> RHSOffset =
5299 getIConstantVRegValWithLookThrough(VReg: AddrMI->getOperand(i: 2).getReg(), MRI: *MRI);
5300 assert(RHSOffset);
5301
5302   // If the immediate offset is negative and within a certain range, the base
5303   // address cannot also be negative. If the base is also negative, the sum
5304   // would be either negative or much larger than the valid range of scratch
5305   // memory a thread can access.
5306 if (isNoUnsignedWrap(Addr: BaseDef->MI) &&
5307 (isNoUnsignedWrap(Addr: AddrMI) ||
5308 (RHSOffset->Value.getSExtValue() < 0 &&
5309 RHSOffset->Value.getSExtValue() > -0x40000000)))
5310 return true;
5311
5312 Register LHS = BaseDef->MI->getOperand(i: 1).getReg();
5313 Register RHS = BaseDef->MI->getOperand(i: 2).getReg();
5314 return VT->signBitIsZero(Op: RHS) && VT->signBitIsZero(Op: LHS);
5315}
5316
5317bool AMDGPUInstructionSelector::isUnneededShiftMask(const MachineInstr &MI,
5318 unsigned ShAmtBits) const {
5319 assert(MI.getOpcode() == TargetOpcode::G_AND);
5320
5321 std::optional<APInt> RHS =
5322 getIConstantVRegVal(VReg: MI.getOperand(i: 2).getReg(), MRI: *MRI);
5323 if (!RHS)
5324 return false;
5325
5326 if (RHS->countr_one() >= ShAmtBits)
5327 return true;
5328
5329 const APInt &LHSKnownZeros = VT->getKnownZeroes(R: MI.getOperand(i: 1).getReg());
5330 return (LHSKnownZeros | *RHS).countr_one() >= ShAmtBits;
5331}
5332
5333InstructionSelector::ComplexRendererFns
5334AMDGPUInstructionSelector::selectMUBUFScratchOffset(
5335 MachineOperand &Root) const {
5336 Register Reg = Root.getReg();
5337 const SIMachineFunctionInfo *Info = MF->getInfo<SIMachineFunctionInfo>();
5338
5339 std::optional<DefinitionAndSourceRegister> Def =
5340 getDefSrcRegIgnoringCopies(Reg, MRI: *MRI);
5341 assert(Def && "this shouldn't be an optional result");
5342 Reg = Def->Reg;
5343
5344 if (Register WaveBase = getWaveAddress(Def: Def->MI)) {
5345 return {{
5346 [=](MachineInstrBuilder &MIB) { // rsrc
5347 MIB.addReg(RegNo: Info->getScratchRSrcReg());
5348 },
5349 [=](MachineInstrBuilder &MIB) { // soffset
5350 MIB.addReg(RegNo: WaveBase);
5351 },
5352 [=](MachineInstrBuilder &MIB) { MIB.addImm(Val: 0); } // offset
5353 }};
5354 }
5355
5356 int64_t Offset = 0;
5357
5358 // FIXME: Copy check is a hack
5359 Register BasePtr;
5360 if (mi_match(R: Reg, MRI: *MRI,
5361 P: m_GPtrAdd(L: m_Reg(R&: BasePtr),
5362 R: m_any_of(preds: m_ICst(Cst&: Offset), preds: m_Copy(Src: m_ICst(Cst&: Offset)))))) {
5363 if (!TII.isLegalMUBUFImmOffset(Imm: Offset))
5364 return {};
5365 MachineInstr *BasePtrDef = getDefIgnoringCopies(Reg: BasePtr, MRI: *MRI);
5366 Register WaveBase = getWaveAddress(Def: BasePtrDef);
5367 if (!WaveBase)
5368 return {};
5369
5370 return {{
5371 [=](MachineInstrBuilder &MIB) { // rsrc
5372 MIB.addReg(RegNo: Info->getScratchRSrcReg());
5373 },
5374 [=](MachineInstrBuilder &MIB) { // soffset
5375 MIB.addReg(RegNo: WaveBase);
5376 },
5377 [=](MachineInstrBuilder &MIB) { MIB.addImm(Val: Offset); } // offset
5378 }};
5379 }
5380
5381 if (!mi_match(R: Root.getReg(), MRI: *MRI, P: m_ICst(Cst&: Offset)) ||
5382 !TII.isLegalMUBUFImmOffset(Imm: Offset))
5383 return {};
5384
5385 return {{
5386 [=](MachineInstrBuilder &MIB) { // rsrc
5387 MIB.addReg(RegNo: Info->getScratchRSrcReg());
5388 },
5389 [=](MachineInstrBuilder &MIB) { // soffset
5390 MIB.addImm(Val: 0);
5391 },
5392 [=](MachineInstrBuilder &MIB) { MIB.addImm(Val: Offset); } // offset
5393 }};
5394}
5395
5396std::pair<Register, unsigned>
5397AMDGPUInstructionSelector::selectDS1Addr1OffsetImpl(MachineOperand &Root) const {
5398 const MachineInstr *RootDef = MRI->getVRegDef(Reg: Root.getReg());
5399 int64_t ConstAddr = 0;
5400
5401 Register PtrBase;
5402 int64_t Offset;
5403 std::tie(args&: PtrBase, args&: Offset) =
5404 getPtrBaseWithConstantOffset(Root: Root.getReg(), MRI: *MRI);
5405
5406 if (Offset) {
5407 if (isDSOffsetLegal(Base: PtrBase, Offset)) {
5408 // (add n0, c0)
5409 return std::pair(PtrBase, Offset);
5410 }
5411 } else if (RootDef->getOpcode() == AMDGPU::G_SUB) {
5412 // TODO
5413
5414
5415 } else if (mi_match(R: Root.getReg(), MRI: *MRI, P: m_ICst(Cst&: ConstAddr))) {
5416 // TODO
5417
5418 }
5419
5420 return std::pair(Root.getReg(), 0);
5421}
5422
5423InstructionSelector::ComplexRendererFns
5424AMDGPUInstructionSelector::selectDS1Addr1Offset(MachineOperand &Root) const {
5425 Register Reg;
5426 unsigned Offset;
5427 std::tie(args&: Reg, args&: Offset) = selectDS1Addr1OffsetImpl(Root);
5428 return {{
5429 [=](MachineInstrBuilder &MIB) { MIB.addReg(RegNo: Reg); },
5430 [=](MachineInstrBuilder &MIB) { MIB.addImm(Val: Offset); }
5431 }};
5432}
5433
5434InstructionSelector::ComplexRendererFns
5435AMDGPUInstructionSelector::selectDS64Bit4ByteAligned(MachineOperand &Root) const {
5436 return selectDSReadWrite2(Root, size: 4);
5437}
5438
5439InstructionSelector::ComplexRendererFns
5440AMDGPUInstructionSelector::selectDS128Bit8ByteAligned(MachineOperand &Root) const {
5441 return selectDSReadWrite2(Root, size: 8);
5442}
5443
5444InstructionSelector::ComplexRendererFns
5445AMDGPUInstructionSelector::selectDSReadWrite2(MachineOperand &Root,
5446 unsigned Size) const {
5447 Register Reg;
5448 unsigned Offset;
5449 std::tie(args&: Reg, args&: Offset) = selectDSReadWrite2Impl(Root, size: Size);
5450 return {{
5451 [=](MachineInstrBuilder &MIB) { MIB.addReg(RegNo: Reg); },
5452 [=](MachineInstrBuilder &MIB) { MIB.addImm(Val: Offset); },
5453 [=](MachineInstrBuilder &MIB) { MIB.addImm(Val: Offset+1); }
5454 }};
5455}
5456
5457std::pair<Register, unsigned>
5458AMDGPUInstructionSelector::selectDSReadWrite2Impl(MachineOperand &Root,
5459 unsigned Size) const {
5460 const MachineInstr *RootDef = MRI->getVRegDef(Reg: Root.getReg());
5461 int64_t ConstAddr = 0;
5462
5463 Register PtrBase;
5464 int64_t Offset;
5465 std::tie(args&: PtrBase, args&: Offset) =
5466 getPtrBaseWithConstantOffset(Root: Root.getReg(), MRI: *MRI);
5467
5468 if (Offset) {
5469 int64_t OffsetValue0 = Offset;
5470 int64_t OffsetValue1 = Offset + Size;
5471 if (isDSOffset2Legal(Base: PtrBase, Offset0: OffsetValue0, Offset1: OffsetValue1, Size)) {
5472 // (add n0, c0)
5473 return std::pair(PtrBase, OffsetValue0 / Size);
5474 }
5475 } else if (RootDef->getOpcode() == AMDGPU::G_SUB) {
5476 // TODO
5477
5478 } else if (mi_match(R: Root.getReg(), MRI: *MRI, P: m_ICst(Cst&: ConstAddr))) {
5479 // TODO
5480
5481 }
5482
5483 return std::pair(Root.getReg(), 0);
5484}
5485
5486/// If \p Root is a G_PTR_ADD with a G_CONSTANT on the right hand side, return
5487/// the base value with the constant offset. There may be intervening copies
5488 /// between \p Root and the identified constant. Returns {\p Root, 0} if this
5489 /// does not match the pattern.
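/// For example, if \p Root is defined (possibly through copies) by
/// %p = G_PTR_ADD %base, %c with %c = G_CONSTANT i64 16, this returns
/// {%base, 16}.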
5490std::pair<Register, int64_t>
5491AMDGPUInstructionSelector::getPtrBaseWithConstantOffset(
5492 Register Root, const MachineRegisterInfo &MRI) const {
5493 MachineInstr *RootI = getDefIgnoringCopies(Reg: Root, MRI);
5494 if (RootI->getOpcode() != TargetOpcode::G_PTR_ADD)
5495 return {Root, 0};
5496
5497 MachineOperand &RHS = RootI->getOperand(i: 2);
5498 std::optional<ValueAndVReg> MaybeOffset =
5499 getIConstantVRegValWithLookThrough(VReg: RHS.getReg(), MRI);
5500 if (!MaybeOffset)
5501 return {Root, 0};
5502 return {RootI->getOperand(i: 1).getReg(), MaybeOffset->Value.getSExtValue()};
5503}
5504
5505static void addZeroImm(MachineInstrBuilder &MIB) {
5506 MIB.addImm(Val: 0);
5507}
5508
5509/// Return a resource descriptor for use with an arbitrary 64-bit pointer. If \p
5510/// BasePtr is not valid, a null base pointer will be used.
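/// The descriptor is assembled as: sub0_sub1 = \p BasePtr (or 0), sub2 =
/// \p FormatLo, sub3 = \p FormatHi.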
5511static Register buildRSRC(MachineIRBuilder &B, MachineRegisterInfo &MRI,
5512 uint32_t FormatLo, uint32_t FormatHi,
5513 Register BasePtr) {
5514 Register RSrc2 = MRI.createVirtualRegister(RegClass: &AMDGPU::SReg_32RegClass);
5515 Register RSrc3 = MRI.createVirtualRegister(RegClass: &AMDGPU::SReg_32RegClass);
5516 Register RSrcHi = MRI.createVirtualRegister(RegClass: &AMDGPU::SReg_64RegClass);
5517 Register RSrc = MRI.createVirtualRegister(RegClass: &AMDGPU::SGPR_128RegClass);
5518
5519 B.buildInstr(Opcode: AMDGPU::S_MOV_B32)
5520 .addDef(RegNo: RSrc2)
5521 .addImm(Val: FormatLo);
5522 B.buildInstr(Opcode: AMDGPU::S_MOV_B32)
5523 .addDef(RegNo: RSrc3)
5524 .addImm(Val: FormatHi);
5525
5526   // Build the subregister half holding the constants before building the full
5527   // 128-bit register. If we are building multiple resource descriptors, this
5528   // allows CSE of the 2-component register.
5529 B.buildInstr(Opcode: AMDGPU::REG_SEQUENCE)
5530 .addDef(RegNo: RSrcHi)
5531 .addReg(RegNo: RSrc2)
5532 .addImm(Val: AMDGPU::sub0)
5533 .addReg(RegNo: RSrc3)
5534 .addImm(Val: AMDGPU::sub1);
5535
5536 Register RSrcLo = BasePtr;
5537 if (!BasePtr) {
5538 RSrcLo = MRI.createVirtualRegister(RegClass: &AMDGPU::SReg_64RegClass);
5539 B.buildInstr(Opcode: AMDGPU::S_MOV_B64)
5540 .addDef(RegNo: RSrcLo)
5541 .addImm(Val: 0);
5542 }
5543
5544 B.buildInstr(Opcode: AMDGPU::REG_SEQUENCE)
5545 .addDef(RegNo: RSrc)
5546 .addReg(RegNo: RSrcLo)
5547 .addImm(Val: AMDGPU::sub0_sub1)
5548 .addReg(RegNo: RSrcHi)
5549 .addImm(Val: AMDGPU::sub2_sub3);
5550
5551 return RSrc;
5552}
5553
5554static Register buildAddr64RSrc(MachineIRBuilder &B, MachineRegisterInfo &MRI,
5555 const SIInstrInfo &TII, Register BasePtr) {
5556 uint64_t DefaultFormat = TII.getDefaultRsrcDataFormat();
5557
5558 // FIXME: Why are half the "default" bits ignored based on the addressing
5559 // mode?
5560 return buildRSRC(B, MRI, FormatLo: 0, FormatHi: Hi_32(Value: DefaultFormat), BasePtr);
5561}
5562
5563static Register buildOffsetSrc(MachineIRBuilder &B, MachineRegisterInfo &MRI,
5564 const SIInstrInfo &TII, Register BasePtr) {
5565 uint64_t DefaultFormat = TII.getDefaultRsrcDataFormat();
5566
5567 // FIXME: Why are half the "default" bits ignored based on the addressing
5568 // mode?
5569 return buildRSRC(B, MRI, FormatLo: -1, FormatHi: Hi_32(Value: DefaultFormat), BasePtr);
5570}
5571
5572AMDGPUInstructionSelector::MUBUFAddressData
5573AMDGPUInstructionSelector::parseMUBUFAddress(Register Src) const {
5574 MUBUFAddressData Data;
5575 Data.N0 = Src;
5576
5577 Register PtrBase;
5578 int64_t Offset;
5579
5580 std::tie(args&: PtrBase, args&: Offset) = getPtrBaseWithConstantOffset(Root: Src, MRI: *MRI);
5581 if (isUInt<32>(x: Offset)) {
5582 Data.N0 = PtrBase;
5583 Data.Offset = Offset;
5584 }
5585
5586 if (MachineInstr *InputAdd
5587 = getOpcodeDef(Opcode: TargetOpcode::G_PTR_ADD, Reg: Data.N0, MRI: *MRI)) {
5588 Data.N2 = InputAdd->getOperand(i: 1).getReg();
5589 Data.N3 = InputAdd->getOperand(i: 2).getReg();
5590
5591     // FIXME: Need to fix extra SGPR->VGPR copies inserted
5592     // FIXME: Don't know that this was defined by operand 0
5593 //
5594 // TODO: Remove this when we have copy folding optimizations after
5595 // RegBankSelect.
5596 Data.N2 = getDefIgnoringCopies(Reg: Data.N2, MRI: *MRI)->getOperand(i: 0).getReg();
5597 Data.N3 = getDefIgnoringCopies(Reg: Data.N3, MRI: *MRI)->getOperand(i: 0).getReg();
5598 }
5599
5600 return Data;
5601}
5602
5603 /// Return whether the addr64 MUBUF mode should be used for the given address.
5604bool AMDGPUInstructionSelector::shouldUseAddr64(MUBUFAddressData Addr) const {
5605 // (ptr_add N2, N3) -> addr64, or
5606 // (ptr_add (ptr_add N2, N3), C1) -> addr64
5607 if (Addr.N2)
5608 return true;
5609
5610 const RegisterBank *N0Bank = RBI.getRegBank(Reg: Addr.N0, MRI: *MRI, TRI);
5611 return N0Bank->getID() == AMDGPU::VGPRRegBankID;
5612}
5613
5614/// Split an immediate offset \p ImmOffset depending on whether it fits in the
5615/// immediate field. Modifies \p ImmOffset and sets \p SOffset to the variable
5616/// component.
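/// For example, an offset that does not fit the MUBUF immediate field is
/// materialized into \p SOffset with an S_MOV_B32 and \p ImmOffset is reset to 0.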
5617void AMDGPUInstructionSelector::splitIllegalMUBUFOffset(
5618 MachineIRBuilder &B, Register &SOffset, int64_t &ImmOffset) const {
5619 if (TII.isLegalMUBUFImmOffset(Imm: ImmOffset))
5620 return;
5621
5622 // Illegal offset, store it in soffset.
5623 SOffset = MRI->createVirtualRegister(RegClass: &AMDGPU::SReg_32RegClass);
5624 B.buildInstr(Opcode: AMDGPU::S_MOV_B32)
5625 .addDef(RegNo: SOffset)
5626 .addImm(Val: ImmOffset);
5627 ImmOffset = 0;
5628}
5629
5630bool AMDGPUInstructionSelector::selectMUBUFAddr64Impl(
5631 MachineOperand &Root, Register &VAddr, Register &RSrcReg,
5632 Register &SOffset, int64_t &Offset) const {
5633 // FIXME: Predicates should stop this from reaching here.
5634   // The addr64 bit was removed for Volcanic Islands.
5635 if (!STI.hasAddr64() || STI.useFlatForGlobal())
5636 return false;
5637
5638 MUBUFAddressData AddrData = parseMUBUFAddress(Src: Root.getReg());
5639 if (!shouldUseAddr64(Addr: AddrData))
5640 return false;
5641
5642 Register N0 = AddrData.N0;
5643 Register N2 = AddrData.N2;
5644 Register N3 = AddrData.N3;
5645 Offset = AddrData.Offset;
5646
5647 // Base pointer for the SRD.
5648 Register SRDPtr;
5649
5650 if (N2) {
5651 if (RBI.getRegBank(Reg: N2, MRI: *MRI, TRI)->getID() == AMDGPU::VGPRRegBankID) {
5652 assert(N3);
5653 if (RBI.getRegBank(Reg: N3, MRI: *MRI, TRI)->getID() == AMDGPU::VGPRRegBankID) {
5654 // Both N2 and N3 are divergent. Use N0 (the result of the add) as the
5655 // addr64, and construct the default resource from a 0 address.
5656 VAddr = N0;
5657 } else {
5658 SRDPtr = N3;
5659 VAddr = N2;
5660 }
5661 } else {
5662 // N2 is not divergent.
5663 SRDPtr = N2;
5664 VAddr = N3;
5665 }
5666 } else if (RBI.getRegBank(Reg: N0, MRI: *MRI, TRI)->getID() == AMDGPU::VGPRRegBankID) {
5667 // Use the default null pointer in the resource
5668 VAddr = N0;
5669 } else {
5670 // N0 -> offset, or
5671 // (N0 + C1) -> offset
5672 SRDPtr = N0;
5673 }
5674
5675 MachineIRBuilder B(*Root.getParent());
5676 RSrcReg = buildAddr64RSrc(B, MRI&: *MRI, TII, BasePtr: SRDPtr);
5677 splitIllegalMUBUFOffset(B, SOffset, ImmOffset&: Offset);
5678 return true;
5679}
5680
5681bool AMDGPUInstructionSelector::selectMUBUFOffsetImpl(
5682 MachineOperand &Root, Register &RSrcReg, Register &SOffset,
5683 int64_t &Offset) const {
5684
5685 // FIXME: Pattern should not reach here.
5686 if (STI.useFlatForGlobal())
5687 return false;
5688
5689 MUBUFAddressData AddrData = parseMUBUFAddress(Src: Root.getReg());
5690 if (shouldUseAddr64(Addr: AddrData))
5691 return false;
5692
5693 // N0 -> offset, or
5694 // (N0 + C1) -> offset
5695 Register SRDPtr = AddrData.N0;
5696 Offset = AddrData.Offset;
5697
5698 // TODO: Look through extensions for 32-bit soffset.
5699 MachineIRBuilder B(*Root.getParent());
5700
5701 RSrcReg = buildOffsetSrc(B, MRI&: *MRI, TII, BasePtr: SRDPtr);
5702 splitIllegalMUBUFOffset(B, SOffset, ImmOffset&: Offset);
5703 return true;
5704}
5705
5706InstructionSelector::ComplexRendererFns
5707AMDGPUInstructionSelector::selectMUBUFAddr64(MachineOperand &Root) const {
5708 Register VAddr;
5709 Register RSrcReg;
5710 Register SOffset;
5711 int64_t Offset = 0;
5712
5713 if (!selectMUBUFAddr64Impl(Root, VAddr, RSrcReg, SOffset, Offset))
5714 return {};
5715
5716 // FIXME: Use defaulted operands for trailing 0s and remove from the complex
5717 // pattern.
5718 return {{
5719 [=](MachineInstrBuilder &MIB) { // rsrc
5720 MIB.addReg(RegNo: RSrcReg);
5721 },
5722 [=](MachineInstrBuilder &MIB) { // vaddr
5723 MIB.addReg(RegNo: VAddr);
5724 },
5725 [=](MachineInstrBuilder &MIB) { // soffset
5726 if (SOffset)
5727 MIB.addReg(RegNo: SOffset);
5728 else if (STI.hasRestrictedSOffset())
5729 MIB.addReg(RegNo: AMDGPU::SGPR_NULL);
5730 else
5731 MIB.addImm(Val: 0);
5732 },
5733 [=](MachineInstrBuilder &MIB) { // offset
5734 MIB.addImm(Val: Offset);
5735 },
5736 addZeroImm, // cpol
5737 addZeroImm, // tfe
5738 addZeroImm // swz
5739 }};
5740}
5741
5742InstructionSelector::ComplexRendererFns
5743AMDGPUInstructionSelector::selectMUBUFOffset(MachineOperand &Root) const {
5744 Register RSrcReg;
5745 Register SOffset;
5746 int64_t Offset = 0;
5747
5748 if (!selectMUBUFOffsetImpl(Root, RSrcReg, SOffset, Offset))
5749 return {};
5750
5751 return {{
5752 [=](MachineInstrBuilder &MIB) { // rsrc
5753 MIB.addReg(RegNo: RSrcReg);
5754 },
5755 [=](MachineInstrBuilder &MIB) { // soffset
5756 if (SOffset)
5757 MIB.addReg(RegNo: SOffset);
5758 else if (STI.hasRestrictedSOffset())
5759 MIB.addReg(RegNo: AMDGPU::SGPR_NULL);
5760 else
5761 MIB.addImm(Val: 0);
5762 },
5763 [=](MachineInstrBuilder &MIB) { MIB.addImm(Val: Offset); }, // offset
5764 addZeroImm, // cpol
5765 addZeroImm, // tfe
5766 addZeroImm, // swz
5767 }};
5768}
5769
5770InstructionSelector::ComplexRendererFns
5771AMDGPUInstructionSelector::selectBUFSOffset(MachineOperand &Root) const {
5772
5773 Register SOffset = Root.getReg();
5774
5775 if (STI.hasRestrictedSOffset() && mi_match(R: SOffset, MRI: *MRI, P: m_ZeroInt()))
5776 SOffset = AMDGPU::SGPR_NULL;
5777
5778 return {{[=](MachineInstrBuilder &MIB) { MIB.addReg(RegNo: SOffset); }}};
5779}
5780
5781 /// Get an immediate that must be 32 bits, and treated as zero-extended.
5782static std::optional<uint64_t>
5783getConstantZext32Val(Register Reg, const MachineRegisterInfo &MRI) {
5784 // getIConstantVRegVal sexts any values, so see if that matters.
5785 std::optional<int64_t> OffsetVal = getIConstantVRegSExtVal(VReg: Reg, MRI);
5786 if (!OffsetVal || !isInt<32>(x: *OffsetVal))
5787 return std::nullopt;
5788 return Lo_32(Value: *OffsetVal);
5789}
5790
5791InstructionSelector::ComplexRendererFns
5792AMDGPUInstructionSelector::selectSMRDBufferImm(MachineOperand &Root) const {
5793 std::optional<uint64_t> OffsetVal =
5794 Root.isImm() ? Root.getImm() : getConstantZext32Val(Reg: Root.getReg(), MRI: *MRI);
5795 if (!OffsetVal)
5796 return {};
5797
5798 std::optional<int64_t> EncodedImm =
5799 AMDGPU::getSMRDEncodedOffset(ST: STI, ByteOffset: *OffsetVal, IsBuffer: true);
5800 if (!EncodedImm)
5801 return {};
5802
5803 return {{ [=](MachineInstrBuilder &MIB) { MIB.addImm(Val: *EncodedImm); } }};
5804}
5805
5806InstructionSelector::ComplexRendererFns
5807AMDGPUInstructionSelector::selectSMRDBufferImm32(MachineOperand &Root) const {
5808 assert(STI.getGeneration() == AMDGPUSubtarget::SEA_ISLANDS);
5809
5810 std::optional<uint64_t> OffsetVal = getConstantZext32Val(Reg: Root.getReg(), MRI: *MRI);
5811 if (!OffsetVal)
5812 return {};
5813
5814 std::optional<int64_t> EncodedImm =
5815 AMDGPU::getSMRDEncodedLiteralOffset32(ST: STI, ByteOffset: *OffsetVal);
5816 if (!EncodedImm)
5817 return {};
5818
5819 return {{ [=](MachineInstrBuilder &MIB) { MIB.addImm(Val: *EncodedImm); } }};
5820}
5821
5822InstructionSelector::ComplexRendererFns
5823AMDGPUInstructionSelector::selectSMRDBufferSgprImm(MachineOperand &Root) const {
5824 // Match the (soffset + offset) pair as a 32-bit register base and
5825 // an immediate offset.
5826 Register SOffset;
5827 unsigned Offset;
5828 std::tie(args&: SOffset, args&: Offset) = AMDGPU::getBaseWithConstantOffset(
5829 MRI&: *MRI, Reg: Root.getReg(), ValueTracking: VT, /*CheckNUW*/ true);
5830 if (!SOffset)
5831 return std::nullopt;
5832
5833 std::optional<int64_t> EncodedOffset =
5834 AMDGPU::getSMRDEncodedOffset(ST: STI, ByteOffset: Offset, /* IsBuffer */ true);
5835 if (!EncodedOffset)
5836 return std::nullopt;
5837
5838 assert(MRI->getType(SOffset) == LLT::scalar(32));
5839 return {{[=](MachineInstrBuilder &MIB) { MIB.addReg(RegNo: SOffset); },
5840 [=](MachineInstrBuilder &MIB) { MIB.addImm(Val: *EncodedOffset); }}};
5841}
5842
5843std::pair<Register, unsigned>
5844AMDGPUInstructionSelector::selectVOP3PMadMixModsImpl(MachineOperand &Root,
5845 bool &Matched) const {
5846 Matched = false;
5847
5848 Register Src;
5849 unsigned Mods;
5850 std::tie(args&: Src, args&: Mods) = selectVOP3ModsImpl(Src: Root.getReg());
5851
5852 if (mi_match(R: Src, MRI: *MRI, P: m_GFPExt(Src: m_Reg(R&: Src)))) {
5853 assert(MRI->getType(Src) == LLT::scalar(16));
5854
5855     // Only change Src if a source modifier could be gained. In such cases the
5856     // new Src could be an SGPR, but this does not violate the constant bus
5857     // restriction for the instruction being selected.
5858 Src = stripBitCast(Reg: Src, MRI&: *MRI);
5859
5860 const auto CheckAbsNeg = [&]() {
5861 // Be careful about folding modifiers if we already have an abs. fneg is
5862 // applied last, so we don't want to apply an earlier fneg.
5863 if ((Mods & SISrcMods::ABS) == 0) {
5864 unsigned ModsTmp;
5865 std::tie(args&: Src, args&: ModsTmp) = selectVOP3ModsImpl(Src);
5866
5867 if ((ModsTmp & SISrcMods::NEG) != 0)
5868 Mods ^= SISrcMods::NEG;
5869
5870 if ((ModsTmp & SISrcMods::ABS) != 0)
5871 Mods |= SISrcMods::ABS;
5872 }
5873 };
5874
5875 CheckAbsNeg();
5876
5877     // op_sel/op_sel_hi decide the source type and source.
5878     // If the source's op_sel_hi is set, it indicates a conversion from fp16.
5879     // If the source's op_sel is set, it picks the high half of the source
5880     // register.
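    // For example, a source that is a G_FPEXT of the high half of a <2 x s16>
    // register selects that register with both OP_SEL_0 and OP_SEL_1 set (read
    // the high f16 and convert from fp16).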
5881
5882 Mods |= SISrcMods::OP_SEL_1;
5883
5884 if (isExtractHiElt(MRI&: *MRI, In: Src, Out&: Src)) {
5885 Mods |= SISrcMods::OP_SEL_0;
5886 CheckAbsNeg();
5887 }
5888
5889 Matched = true;
5890 }
5891
5892 return {Src, Mods};
5893}
5894
InstructionSelector::ComplexRendererFns
AMDGPUInstructionSelector::selectVOP3PMadMixModsExt(
    MachineOperand &Root) const {
  Register Src;
  unsigned Mods;
  bool Matched;
  std::tie(Src, Mods) = selectVOP3PMadMixModsImpl(Root, Matched);
  if (!Matched)
    return {};

  return {{
      [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
      [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods
  }};
}

InstructionSelector::ComplexRendererFns
AMDGPUInstructionSelector::selectVOP3PMadMixMods(MachineOperand &Root) const {
  Register Src;
  unsigned Mods;
  bool Matched;
  std::tie(Src, Mods) = selectVOP3PMadMixModsImpl(Root, Matched);

  return {{
      [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
      [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods
  }};
}

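// Select the "barrier signal isfirst" intrinsic: SCC is primed to 1 so a
// correct value is still observed if the barrier is later relaxed to a NOP,
// then S_BARRIER_SIGNAL_ISFIRST_IMM is emitted and SCC is copied into the
// result register.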
bool AMDGPUInstructionSelector::selectSBarrierSignalIsfirst(
    MachineInstr &I, Intrinsic::ID IntrID) const {
  MachineBasicBlock *MBB = I.getParent();
  const DebugLoc &DL = I.getDebugLoc();
  Register CCReg = I.getOperand(0).getReg();

  // Set SCC to true, in case the barrier instruction gets converted to a NOP.
  BuildMI(*MBB, &I, DL, TII.get(AMDGPU::S_CMP_EQ_U32)).addImm(0).addImm(0);

  BuildMI(*MBB, &I, DL, TII.get(AMDGPU::S_BARRIER_SIGNAL_ISFIRST_IMM))
      .addImm(I.getOperand(2).getImm());

  BuildMI(*MBB, &I, DL, TII.get(AMDGPU::COPY), CCReg).addReg(AMDGPU::SCC);

  I.eraseFromParent();
  return RBI.constrainGenericRegister(CCReg, AMDGPU::SReg_32_XM0_XEXECRegClass,
                                      *MRI);
}

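// Select s_get_barrier_state: use the _IMM form when the barrier operand is a
// known constant, otherwise copy the operand into M0 and use the _M0 form.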
bool AMDGPUInstructionSelector::selectSGetBarrierState(
    MachineInstr &I, Intrinsic::ID IntrID) const {
  MachineBasicBlock *MBB = I.getParent();
  const DebugLoc &DL = I.getDebugLoc();
  MachineOperand BarOp = I.getOperand(2);
  std::optional<int64_t> BarValImm =
      getIConstantVRegSExtVal(BarOp.getReg(), *MRI);

  if (!BarValImm) {
    auto CopyMIB = BuildMI(*MBB, &I, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
                       .addReg(BarOp.getReg());
    constrainSelectedInstRegOperands(*CopyMIB, TII, TRI, RBI);
  }

  unsigned Opc = BarValImm ? AMDGPU::S_GET_BARRIER_STATE_IMM
                           : AMDGPU::S_GET_BARRIER_STATE_M0;
  MachineInstrBuilder MIB = BuildMI(*MBB, &I, DL, TII.get(Opc));

  auto DstReg = I.getOperand(0).getReg();
  const TargetRegisterClass *DstRC =
      TRI.getConstrainedRegClassForOperand(I.getOperand(0), *MRI);
  if (!DstRC || !RBI.constrainGenericRegister(DstReg, *DstRC, *MRI))
    return false;
  MIB.addDef(DstReg);
  if (BarValImm)
    MIB.addImm(*BarValImm);

  I.eraseFromParent();
  return true;
}

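// Pick the named-barrier opcode variant: the _IMM form when the barrier ID is
// an inline constant, the _M0 form when it has to be read from M0.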
unsigned getNamedBarrierOp(bool HasInlineConst, Intrinsic::ID IntrID) {
  if (HasInlineConst) {
    switch (IntrID) {
    default:
      llvm_unreachable("not a named barrier op");
    case Intrinsic::amdgcn_s_get_named_barrier_state:
      return AMDGPU::S_GET_BARRIER_STATE_IMM;
    }
  } else {
    switch (IntrID) {
    default:
      llvm_unreachable("not a named barrier op");
    case Intrinsic::amdgcn_s_get_named_barrier_state:
      return AMDGPU::S_GET_BARRIER_STATE_M0;
    }
  }
}

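// Lower a named-barrier init: M0 is assembled as
//   M0 = ((Cnt & 0x3F) << 16) | ((BarOp >> 4) & 0x3F)
// before issuing S_BARRIER_SIGNAL_M0. E.g. (illustrative values only)
// BarOp = 0x1A0 and Cnt = 8 give M0 = 0x0008001A.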
bool AMDGPUInstructionSelector::selectNamedBarrierInit(
    MachineInstr &I, Intrinsic::ID IntrID) const {
  MachineBasicBlock *MBB = I.getParent();
  const DebugLoc &DL = I.getDebugLoc();
  MachineOperand BarOp = I.getOperand(1);
  MachineOperand CntOp = I.getOperand(2);

  // BarID = (BarOp >> 4) & 0x3F
  Register TmpReg0 = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
  BuildMI(*MBB, &I, DL, TII.get(AMDGPU::S_LSHR_B32), TmpReg0)
      .add(BarOp)
      .addImm(4u)
      .setOperandDead(3); // Dead scc

  Register TmpReg1 = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
  BuildMI(*MBB, &I, DL, TII.get(AMDGPU::S_AND_B32), TmpReg1)
      .addReg(TmpReg0)
      .addImm(0x3F)
      .setOperandDead(3); // Dead scc

  // M0 = ((CntOp & 0x3F) << ShAmt) | BarID
  Register TmpReg2 = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
  BuildMI(*MBB, &I, DL, TII.get(AMDGPU::S_AND_B32), TmpReg2)
      .add(CntOp)
      .addImm(0x3F)
      .setOperandDead(3); // Dead scc

  Register TmpReg3 = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
  constexpr unsigned ShAmt = 16;
  BuildMI(*MBB, &I, DL, TII.get(AMDGPU::S_LSHL_B32), TmpReg3)
      .addReg(TmpReg2)
      .addImm(ShAmt)
      .setOperandDead(3); // Dead scc

  Register TmpReg4 = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
  BuildMI(*MBB, &I, DL, TII.get(AMDGPU::S_OR_B32), TmpReg4)
      .addReg(TmpReg1)
      .addReg(TmpReg3)
      .setOperandDead(3); // Dead scc

  auto CopyMIB =
      BuildMI(*MBB, &I, DL, TII.get(AMDGPU::COPY), AMDGPU::M0).addReg(TmpReg4);
  constrainSelectedInstRegOperands(*CopyMIB, TII, TRI, RBI);

  BuildMI(*MBB, &I, DL, TII.get(AMDGPU::S_BARRIER_SIGNAL_M0));

  I.eraseFromParent();
  return true;
}

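// Select the remaining named-barrier intrinsics. For a non-constant barrier
// operand the barrier ID ((BarOp >> 4) & 0x3F) is materialized into M0; for a
// constant operand the decoded ID is emitted directly as an immediate.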
bool AMDGPUInstructionSelector::selectNamedBarrierInst(
    MachineInstr &I, Intrinsic::ID IntrID) const {
  MachineBasicBlock *MBB = I.getParent();
  const DebugLoc &DL = I.getDebugLoc();
  MachineOperand BarOp = IntrID == Intrinsic::amdgcn_s_get_named_barrier_state
                             ? I.getOperand(2)
                             : I.getOperand(1);
  std::optional<int64_t> BarValImm =
      getIConstantVRegSExtVal(BarOp.getReg(), *MRI);

  if (!BarValImm) {
    // BarID = (BarOp >> 4) & 0x3F
    Register TmpReg0 = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
    BuildMI(*MBB, &I, DL, TII.get(AMDGPU::S_LSHR_B32), TmpReg0)
        .addReg(BarOp.getReg())
        .addImm(4u)
        .setOperandDead(3); // Dead scc

    Register TmpReg1 = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
    BuildMI(*MBB, &I, DL, TII.get(AMDGPU::S_AND_B32), TmpReg1)
        .addReg(TmpReg0)
        .addImm(0x3F)
        .setOperandDead(3); // Dead scc

    auto CopyMIB = BuildMI(*MBB, &I, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
                       .addReg(TmpReg1);
    constrainSelectedInstRegOperands(*CopyMIB, TII, TRI, RBI);
  }

  unsigned Opc = getNamedBarrierOp(BarValImm.has_value(), IntrID);
  MachineInstrBuilder MIB = BuildMI(*MBB, &I, DL, TII.get(Opc));

  if (IntrID == Intrinsic::amdgcn_s_get_named_barrier_state) {
    auto DstReg = I.getOperand(0).getReg();
    const TargetRegisterClass *DstRC =
        TRI.getConstrainedRegClassForOperand(I.getOperand(0), *MRI);
    if (!DstRC || !RBI.constrainGenericRegister(DstReg, *DstRC, *MRI))
      return false;
    MIB.addDef(DstReg);
  }

  if (BarValImm) {
    auto BarId = ((*BarValImm) >> 4) & 0x3F;
    MIB.addImm(BarId);
  }

  I.eraseFromParent();
  return true;
}

void AMDGPUInstructionSelector::renderTruncImm32(MachineInstrBuilder &MIB,
                                                 const MachineInstr &MI,
                                                 int OpIdx) const {
  assert(MI.getOpcode() == TargetOpcode::G_CONSTANT && OpIdx == -1 &&
         "Expected G_CONSTANT");
  MIB.addImm(MI.getOperand(1).getCImm()->getSExtValue());
}

void AMDGPUInstructionSelector::renderNegateImm(MachineInstrBuilder &MIB,
                                                const MachineInstr &MI,
                                                int OpIdx) const {
  assert(MI.getOpcode() == TargetOpcode::G_CONSTANT && OpIdx == -1 &&
         "Expected G_CONSTANT");
  MIB.addImm(-MI.getOperand(1).getCImm()->getSExtValue());
}

void AMDGPUInstructionSelector::renderBitcastFPImm(MachineInstrBuilder &MIB,
                                                   const MachineInstr &MI,
                                                   int OpIdx) const {
  const MachineOperand &Op = MI.getOperand(1);
  assert(MI.getOpcode() == TargetOpcode::G_FCONSTANT && OpIdx == -1);
  MIB.addImm(Op.getFPImm()->getValueAPF().bitcastToAPInt().getZExtValue());
}

void AMDGPUInstructionSelector::renderPopcntImm(MachineInstrBuilder &MIB,
                                                const MachineInstr &MI,
                                                int OpIdx) const {
  assert(MI.getOpcode() == TargetOpcode::G_CONSTANT && OpIdx == -1 &&
         "Expected G_CONSTANT");
  MIB.addImm(MI.getOperand(1).getCImm()->getValue().popcount());
}

/// This only really exists to satisfy the DAG type checking machinery, so it
/// is a no-op here.
void AMDGPUInstructionSelector::renderTruncTImm(MachineInstrBuilder &MIB,
                                                const MachineInstr &MI,
                                                int OpIdx) const {
  const MachineOperand &Op = MI.getOperand(OpIdx);
  int64_t Imm;
  if (Op.isReg() && mi_match(Op.getReg(), *MRI, m_ICst(Imm)))
    MIB.addImm(Imm);
  else
    MIB.addImm(Op.getImm());
}

void AMDGPUInstructionSelector::renderZextBoolTImm(MachineInstrBuilder &MIB,
                                                   const MachineInstr &MI,
                                                   int OpIdx) const {
  MIB.addImm(MI.getOperand(OpIdx).getImm() != 0);
}

void AMDGPUInstructionSelector::renderOpSelTImm(MachineInstrBuilder &MIB,
                                                const MachineInstr &MI,
                                                int OpIdx) const {
  assert(OpIdx >= 0 && "expected to match an immediate operand");
  MIB.addImm(MI.getOperand(OpIdx).getImm() ? (int64_t)SISrcMods::OP_SEL_0 : 0);
}

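// The render*SelToOpSel*XForm helpers below translate the src_sel/dst_sel bits
// of a matched immediate into the SISrcMods op_sel encoding (OP_SEL_0 /
// DST_OP_SEL) expected by the selected instruction; the variants differ only
// in which bit of the immediate is inspected and in whether DST_OP_SEL is
// forced on.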
void AMDGPUInstructionSelector::renderSrcAndDstSelToOpSelXForm_0_0(
    MachineInstrBuilder &MIB, const MachineInstr &MI, int OpIdx) const {
  assert(OpIdx >= 0 && "expected to match an immediate operand");
  MIB.addImm(
      (MI.getOperand(OpIdx).getImm() & 0x2) ? (int64_t)SISrcMods::OP_SEL_0 : 0);
}

void AMDGPUInstructionSelector::renderSrcAndDstSelToOpSelXForm_0_1(
    MachineInstrBuilder &MIB, const MachineInstr &MI, int OpIdx) const {
  assert(OpIdx >= 0 && "expected to match an immediate operand");
  MIB.addImm((MI.getOperand(OpIdx).getImm() & 0x2)
                 ? (int64_t)(SISrcMods::OP_SEL_0 | SISrcMods::DST_OP_SEL)
                 : (int64_t)SISrcMods::DST_OP_SEL);
}

void AMDGPUInstructionSelector::renderSrcAndDstSelToOpSelXForm_1_0(
    MachineInstrBuilder &MIB, const MachineInstr &MI, int OpIdx) const {
  assert(OpIdx >= 0 && "expected to match an immediate operand");
  MIB.addImm(
      (MI.getOperand(OpIdx).getImm() & 0x1) ? (int64_t)SISrcMods::OP_SEL_0 : 0);
}

void AMDGPUInstructionSelector::renderSrcAndDstSelToOpSelXForm_1_1(
    MachineInstrBuilder &MIB, const MachineInstr &MI, int OpIdx) const {
  assert(OpIdx >= 0 && "expected to match an immediate operand");
  MIB.addImm((MI.getOperand(OpIdx).getImm() & 0x1)
                 ? (int64_t)SISrcMods::OP_SEL_0
                 : 0);
}

void AMDGPUInstructionSelector::renderDstSelToOpSelXForm(
    MachineInstrBuilder &MIB, const MachineInstr &MI, int OpIdx) const {
  assert(OpIdx >= 0 && "expected to match an immediate operand");
  MIB.addImm(MI.getOperand(OpIdx).getImm() ? (int64_t)SISrcMods::DST_OP_SEL
                                           : 0);
}

void AMDGPUInstructionSelector::renderSrcSelToOpSelXForm(
    MachineInstrBuilder &MIB, const MachineInstr &MI, int OpIdx) const {
  assert(OpIdx >= 0 && "expected to match an immediate operand");
  MIB.addImm(MI.getOperand(OpIdx).getImm() ? (int64_t)SISrcMods::OP_SEL_0
                                           : 0);
}

void AMDGPUInstructionSelector::renderSrcAndDstSelToOpSelXForm_2_0(
    MachineInstrBuilder &MIB, const MachineInstr &MI, int OpIdx) const {
  assert(OpIdx >= 0 && "expected to match an immediate operand");
  MIB.addImm(
      (MI.getOperand(OpIdx).getImm() & 0x1) ? (int64_t)SISrcMods::OP_SEL_0 : 0);
}

void AMDGPUInstructionSelector::renderDstSelToOpSel3XFormXForm(
    MachineInstrBuilder &MIB, const MachineInstr &MI, int OpIdx) const {
  assert(OpIdx >= 0 && "expected to match an immediate operand");
  MIB.addImm(
      (MI.getOperand(OpIdx).getImm() & 0x2) ? (int64_t)SISrcMods::DST_OP_SEL
                                            : 0);
}

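// The CPol renderers below extract cache-policy bits from an intrinsic
// immediate, masking them to the set of CPol bits that are valid for the
// selected generation (GFX12+ vs. earlier).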
void AMDGPUInstructionSelector::renderExtractCPol(MachineInstrBuilder &MIB,
                                                  const MachineInstr &MI,
                                                  int OpIdx) const {
  assert(OpIdx >= 0 && "expected to match an immediate operand");
  MIB.addImm(MI.getOperand(OpIdx).getImm() &
             (AMDGPU::isGFX12Plus(STI) ? AMDGPU::CPol::ALL
                                       : AMDGPU::CPol::ALL_pregfx12));
}

void AMDGPUInstructionSelector::renderExtractSWZ(MachineInstrBuilder &MIB,
                                                 const MachineInstr &MI,
                                                 int OpIdx) const {
  assert(OpIdx >= 0 && "expected to match an immediate operand");
  const bool Swizzle = MI.getOperand(OpIdx).getImm() &
                       (AMDGPU::isGFX12Plus(STI) ? AMDGPU::CPol::SWZ
                                                 : AMDGPU::CPol::SWZ_pregfx12);
  MIB.addImm(Swizzle);
}

void AMDGPUInstructionSelector::renderExtractCpolSetGLC(
    MachineInstrBuilder &MIB, const MachineInstr &MI, int OpIdx) const {
  assert(OpIdx >= 0 && "expected to match an immediate operand");
  const uint32_t Cpol = MI.getOperand(OpIdx).getImm() &
                        (AMDGPU::isGFX12Plus(STI) ? AMDGPU::CPol::ALL
                                                  : AMDGPU::CPol::ALL_pregfx12);
  MIB.addImm(Cpol | AMDGPU::CPol::GLC);
}

void AMDGPUInstructionSelector::renderFrameIndex(MachineInstrBuilder &MIB,
                                                 const MachineInstr &MI,
                                                 int OpIdx) const {
  MIB.addFrameIndex(MI.getOperand(1).getIndex());
}

void AMDGPUInstructionSelector::renderFPPow2ToExponent(MachineInstrBuilder &MIB,
                                                       const MachineInstr &MI,
                                                       int OpIdx) const {
  const APFloat &APF = MI.getOperand(1).getFPImm()->getValueAPF();
  int ExpVal = APF.getExactLog2Abs();
  assert(ExpVal != INT_MIN);
  MIB.addImm(ExpVal);
}

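// Map the rounding-mode operand onto the hardware FP_ROUND encoding with a
// rotation, (Val + 3) % 4: e.g. TowardZero (0) -> 3 and
// NearestTiesToEven (1) -> 0, per the table in the function body.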
void AMDGPUInstructionSelector::renderRoundMode(MachineInstrBuilder &MIB,
                                                const MachineInstr &MI,
                                                int OpIdx) const {
  // "round.towardzero" -> TowardZero 0        -> FP_ROUND_ROUND_TO_ZERO 3
  // "round.tonearest"  -> NearestTiesToEven 1 -> FP_ROUND_ROUND_TO_NEAREST 0
  // "round.upward"     -> TowardPositive 2    -> FP_ROUND_ROUND_TO_INF 1
  // "round.downward"   -> TowardNegative 3    -> FP_ROUND_ROUND_TO_NEGINF 2
  MIB.addImm((MI.getOperand(OpIdx).getImm() + 3) % 4);
}

/// Convert from a 2-bit value to the enum values used for op_sel* source
/// modifiers.
void AMDGPUInstructionSelector::renderScaledMAIIntrinsicOperand(
    MachineInstrBuilder &MIB, const MachineInstr &MI, int OpIdx) const {
  unsigned Val = MI.getOperand(OpIdx).getImm();
  unsigned New = 0;
  if (Val & 0x1)
    New |= SISrcMods::OP_SEL_0;
  if (Val & 0x2)
    New |= SISrcMods::OP_SEL_1;
  MIB.addImm(New);
}

bool AMDGPUInstructionSelector::isInlineImmediate(const APInt &Imm) const {
  return TII.isInlineConstant(Imm);
}

bool AMDGPUInstructionSelector::isInlineImmediate(const APFloat &Imm) const {
  return TII.isInlineConstant(Imm);
}
