//===- AMDGPUInstructionSelector.cpp ----------------------------*- C++ -*-==//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
/// \file
/// This file implements the targeting of the InstructionSelector class for
/// AMDGPU.
/// \todo This should be generated by TableGen.
//===----------------------------------------------------------------------===//

#include "AMDGPUInstructionSelector.h"
#include "AMDGPU.h"
#include "AMDGPUGlobalISelUtils.h"
#include "AMDGPUInstrInfo.h"
#include "AMDGPURegisterBankInfo.h"
#include "AMDGPUTargetMachine.h"
#include "SIMachineFunctionInfo.h"
#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/CodeGen/GlobalISel/GIMatchTableExecutorImpl.h"
#include "llvm/CodeGen/GlobalISel/GISelKnownBits.h"
#include "llvm/CodeGen/GlobalISel/GenericMachineInstrs.h"
#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
#include <optional>

#define DEBUG_TYPE "amdgpu-isel"

using namespace llvm;
using namespace MIPatternMatch;

#define GET_GLOBALISEL_IMPL
#define AMDGPUSubtarget GCNSubtarget
#include "AMDGPUGenGlobalISel.inc"
#undef GET_GLOBALISEL_IMPL
#undef AMDGPUSubtarget

AMDGPUInstructionSelector::AMDGPUInstructionSelector(
    const GCNSubtarget &STI, const AMDGPURegisterBankInfo &RBI,
    const AMDGPUTargetMachine &TM)
    : TII(*STI.getInstrInfo()), TRI(*STI.getRegisterInfo()), RBI(RBI), TM(TM),
      STI(STI),
      EnableLateStructurizeCFG(AMDGPUTargetMachine::EnableLateStructurizeCFG),
#define GET_GLOBALISEL_PREDICATES_INIT
#include "AMDGPUGenGlobalISel.inc"
#undef GET_GLOBALISEL_PREDICATES_INIT
#define GET_GLOBALISEL_TEMPORARIES_INIT
#include "AMDGPUGenGlobalISel.inc"
#undef GET_GLOBALISEL_TEMPORARIES_INIT
{
}

const char *AMDGPUInstructionSelector::getName() { return DEBUG_TYPE; }

void AMDGPUInstructionSelector::setupMF(MachineFunction &MF, GISelKnownBits *KB,
                                        CodeGenCoverage *CoverageInfo,
                                        ProfileSummaryInfo *PSI,
                                        BlockFrequencyInfo *BFI) {
  MRI = &MF.getRegInfo();
  Subtarget = &MF.getSubtarget<GCNSubtarget>();
  Subtarget->checkSubtargetFeatures(MF.getFunction());
  InstructionSelector::setupMF(MF, KB, CoverageInfo, PSI, BFI);
}

// Return the wave level SGPR base address if this is a wave address.
static Register getWaveAddress(const MachineInstr *Def) {
  return Def->getOpcode() == AMDGPU::G_AMDGPU_WAVE_ADDRESS
             ? Def->getOperand(1).getReg()
             : Register();
}

bool AMDGPUInstructionSelector::isVCC(Register Reg,
                                      const MachineRegisterInfo &MRI) const {
  // The verifier is oblivious to s1 being a valid value for wavesize registers.
  if (Reg.isPhysical())
    return false;

  auto &RegClassOrBank = MRI.getRegClassOrRegBank(Reg);
  const TargetRegisterClass *RC =
      RegClassOrBank.dyn_cast<const TargetRegisterClass*>();
  if (RC) {
    const LLT Ty = MRI.getType(Reg);
    if (!Ty.isValid() || Ty.getSizeInBits() != 1)
      return false;
    // G_TRUNC s1 result is never vcc.
    return MRI.getVRegDef(Reg)->getOpcode() != AMDGPU::G_TRUNC &&
           RC->hasSuperClassEq(TRI.getBoolRC());
  }

  const RegisterBank *RB = RegClassOrBank.get<const RegisterBank *>();
  return RB->getID() == AMDGPU::VCCRegBankID;
}

bool AMDGPUInstructionSelector::constrainCopyLikeIntrin(MachineInstr &MI,
                                                        unsigned NewOpc) const {
  MI.setDesc(TII.get(NewOpc));
  MI.removeOperand(1); // Remove intrinsic ID.
  MI.addOperand(*MF, MachineOperand::CreateReg(AMDGPU::EXEC, false, true));

  MachineOperand &Dst = MI.getOperand(0);
  MachineOperand &Src = MI.getOperand(1);

  // TODO: This should be legalized to s32 if needed
  if (MRI->getType(Dst.getReg()) == LLT::scalar(1))
    return false;

  const TargetRegisterClass *DstRC
    = TRI.getConstrainedRegClassForOperand(Dst, *MRI);
  const TargetRegisterClass *SrcRC
    = TRI.getConstrainedRegClassForOperand(Src, *MRI);
  if (!DstRC || DstRC != SrcRC)
    return false;

  return RBI.constrainGenericRegister(Dst.getReg(), *DstRC, *MRI) &&
         RBI.constrainGenericRegister(Src.getReg(), *SrcRC, *MRI);
}

bool AMDGPUInstructionSelector::selectCOPY(MachineInstr &I) const {
  const DebugLoc &DL = I.getDebugLoc();
  MachineBasicBlock *BB = I.getParent();
  I.setDesc(TII.get(TargetOpcode::COPY));

  const MachineOperand &Src = I.getOperand(1);
  MachineOperand &Dst = I.getOperand(0);
  Register DstReg = Dst.getReg();
  Register SrcReg = Src.getReg();

  if (isVCC(DstReg, *MRI)) {
    if (SrcReg == AMDGPU::SCC) {
      const TargetRegisterClass *RC
        = TRI.getConstrainedRegClassForOperand(Dst, *MRI);
      if (!RC)
        return true;
      return RBI.constrainGenericRegister(DstReg, *RC, *MRI);
    }

    if (!isVCC(SrcReg, *MRI)) {
      // TODO: Should probably leave the copy and let copyPhysReg expand it.
      if (!RBI.constrainGenericRegister(DstReg, *TRI.getBoolRC(), *MRI))
        return false;

      const TargetRegisterClass *SrcRC
        = TRI.getConstrainedRegClassForOperand(Src, *MRI);

      std::optional<ValueAndVReg> ConstVal =
          getIConstantVRegValWithLookThrough(SrcReg, *MRI, true);
      if (ConstVal) {
        unsigned MovOpc =
            STI.isWave64() ? AMDGPU::S_MOV_B64 : AMDGPU::S_MOV_B32;
        BuildMI(*BB, &I, DL, TII.get(MovOpc), DstReg)
            .addImm(ConstVal->Value.getBoolValue() ? -1 : 0);
      } else {
        Register MaskedReg = MRI->createVirtualRegister(SrcRC);

        // We can't trust the high bits at this point, so clear them.

        // TODO: Skip masking high bits if def is known boolean.

        bool IsSGPR = TRI.isSGPRClass(SrcRC);
        unsigned AndOpc =
            IsSGPR ? AMDGPU::S_AND_B32 : AMDGPU::V_AND_B32_e32;
        auto And = BuildMI(*BB, &I, DL, TII.get(AndOpc), MaskedReg)
            .addImm(1)
            .addReg(SrcReg);
        if (IsSGPR)
          And.setOperandDead(3); // Dead scc

        BuildMI(*BB, &I, DL, TII.get(AMDGPU::V_CMP_NE_U32_e64), DstReg)
            .addImm(0)
            .addReg(MaskedReg);
      }

      if (!MRI->getRegClassOrNull(SrcReg))
        MRI->setRegClass(SrcReg, SrcRC);
      I.eraseFromParent();
      return true;
    }

    const TargetRegisterClass *RC =
        TRI.getConstrainedRegClassForOperand(Dst, *MRI);
    if (RC && !RBI.constrainGenericRegister(DstReg, *RC, *MRI))
      return false;

    return true;
  }

  for (const MachineOperand &MO : I.operands()) {
    if (MO.getReg().isPhysical())
      continue;

    const TargetRegisterClass *RC =
        TRI.getConstrainedRegClassForOperand(MO, *MRI);
    if (!RC)
      continue;
    RBI.constrainGenericRegister(MO.getReg(), *RC, *MRI);
  }
  return true;
}

bool AMDGPUInstructionSelector::selectPHI(MachineInstr &I) const {
  const Register DefReg = I.getOperand(0).getReg();
  const LLT DefTy = MRI->getType(DefReg);

  // S1 G_PHIs should not be selected in instruction-select, instead:
  // - divergent S1 G_PHI should go through lane mask merging algorithm
  //   and be fully inst-selected in AMDGPUGlobalISelDivergenceLowering
  // - uniform S1 G_PHI should be lowered into S32 G_PHI in AMDGPURegBankSelect
  if (DefTy == LLT::scalar(1))
    return false;

  // TODO: Verify this doesn't have insane operands (i.e. VGPR to SGPR copy)

  const RegClassOrRegBank &RegClassOrBank =
    MRI->getRegClassOrRegBank(DefReg);

  const TargetRegisterClass *DefRC
    = RegClassOrBank.dyn_cast<const TargetRegisterClass *>();
  if (!DefRC) {
    if (!DefTy.isValid()) {
      LLVM_DEBUG(dbgs() << "PHI operand has no type, not a gvreg?\n");
      return false;
    }

    const RegisterBank &RB = *RegClassOrBank.get<const RegisterBank *>();
    DefRC = TRI.getRegClassForTypeOnBank(DefTy, RB);
    if (!DefRC) {
      LLVM_DEBUG(dbgs() << "PHI operand has unexpected size/bank\n");
      return false;
    }
  }

  // TODO: Verify that all registers have the same bank
  I.setDesc(TII.get(TargetOpcode::PHI));
  return RBI.constrainGenericRegister(DefReg, *DefRC, *MRI);
}

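// Return a MachineOperand for the 32-bit half (selected by SubIdx) of a
// 64-bit register or immediate operand. Register halves are copied into a
// fresh virtual register of SubRC.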
MachineOperand
AMDGPUInstructionSelector::getSubOperand64(MachineOperand &MO,
                                           const TargetRegisterClass &SubRC,
                                           unsigned SubIdx) const {

  MachineInstr *MI = MO.getParent();
  MachineBasicBlock *BB = MO.getParent()->getParent();
  Register DstReg = MRI->createVirtualRegister(&SubRC);

  if (MO.isReg()) {
    unsigned ComposedSubIdx = TRI.composeSubRegIndices(MO.getSubReg(), SubIdx);
    Register Reg = MO.getReg();
    BuildMI(*BB, MI, MI->getDebugLoc(), TII.get(AMDGPU::COPY), DstReg)
      .addReg(Reg, 0, ComposedSubIdx);

    return MachineOperand::CreateReg(DstReg, MO.isDef(), MO.isImplicit(),
                                     MO.isKill(), MO.isDead(), MO.isUndef(),
                                     MO.isEarlyClobber(), 0, MO.isDebug(),
                                     MO.isInternalRead());
  }

  assert(MO.isImm());

  APInt Imm(64, MO.getImm());

  switch (SubIdx) {
  default:
    llvm_unreachable("do not know to split immediate with this sub index.");
  case AMDGPU::sub0:
    return MachineOperand::CreateImm(Imm.getLoBits(32).getSExtValue());
  case AMDGPU::sub1:
    return MachineOperand::CreateImm(Imm.getHiBits(32).getSExtValue());
  }
}

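// Map a generic G_AND/G_OR/G_XOR opcode to the corresponding 32-bit or 64-bit
// SALU opcode.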
static unsigned getLogicalBitOpcode(unsigned Opc, bool Is64) {
  switch (Opc) {
  case AMDGPU::G_AND:
    return Is64 ? AMDGPU::S_AND_B64 : AMDGPU::S_AND_B32;
  case AMDGPU::G_OR:
    return Is64 ? AMDGPU::S_OR_B64 : AMDGPU::S_OR_B32;
  case AMDGPU::G_XOR:
    return Is64 ? AMDGPU::S_XOR_B64 : AMDGPU::S_XOR_B32;
  default:
    llvm_unreachable("not a bit op");
  }
}

bool AMDGPUInstructionSelector::selectG_AND_OR_XOR(MachineInstr &I) const {
  Register DstReg = I.getOperand(0).getReg();
  unsigned Size = RBI.getSizeInBits(DstReg, *MRI, TRI);

  const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
  if (DstRB->getID() != AMDGPU::SGPRRegBankID &&
      DstRB->getID() != AMDGPU::VCCRegBankID)
    return false;

  bool Is64 = Size > 32 || (DstRB->getID() == AMDGPU::VCCRegBankID &&
                            STI.isWave64());
  I.setDesc(TII.get(getLogicalBitOpcode(I.getOpcode(), Is64)));

  // Dead implicit-def of scc
  I.addOperand(MachineOperand::CreateReg(AMDGPU::SCC, true, // isDef
                                         true, // isImp
                                         false, // isKill
                                         true)); // isDead
  return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
}

bool AMDGPUInstructionSelector::selectG_ADD_SUB(MachineInstr &I) const {
  MachineBasicBlock *BB = I.getParent();
  MachineFunction *MF = BB->getParent();
  Register DstReg = I.getOperand(0).getReg();
  const DebugLoc &DL = I.getDebugLoc();
  LLT Ty = MRI->getType(DstReg);
  if (Ty.isVector())
    return false;

  unsigned Size = Ty.getSizeInBits();
  const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
  const bool IsSALU = DstRB->getID() == AMDGPU::SGPRRegBankID;
  const bool Sub = I.getOpcode() == TargetOpcode::G_SUB;

  if (Size == 32) {
    if (IsSALU) {
      const unsigned Opc = Sub ? AMDGPU::S_SUB_U32 : AMDGPU::S_ADD_U32;
      MachineInstr *Add =
        BuildMI(*BB, &I, DL, TII.get(Opc), DstReg)
          .add(I.getOperand(1))
          .add(I.getOperand(2))
          .setOperandDead(3); // Dead scc
      I.eraseFromParent();
      return constrainSelectedInstRegOperands(*Add, TII, TRI, RBI);
    }

    if (STI.hasAddNoCarry()) {
      const unsigned Opc = Sub ? AMDGPU::V_SUB_U32_e64 : AMDGPU::V_ADD_U32_e64;
      I.setDesc(TII.get(Opc));
      I.addOperand(*MF, MachineOperand::CreateImm(0));
      I.addOperand(*MF, MachineOperand::CreateReg(AMDGPU::EXEC, false, true));
      return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
    }

    const unsigned Opc = Sub ? AMDGPU::V_SUB_CO_U32_e64 : AMDGPU::V_ADD_CO_U32_e64;

    Register UnusedCarry = MRI->createVirtualRegister(TRI.getWaveMaskRegClass());
    MachineInstr *Add
      = BuildMI(*BB, &I, DL, TII.get(Opc), DstReg)
          .addDef(UnusedCarry, RegState::Dead)
          .add(I.getOperand(1))
          .add(I.getOperand(2))
          .addImm(0);
    I.eraseFromParent();
    return constrainSelectedInstRegOperands(*Add, TII, TRI, RBI);
  }

  assert(!Sub && "illegal sub should not reach here");

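  // 64-bit case: split both operands into 32-bit halves, add the low halves,
  // add the high halves with the carry from the low add, and recombine the
  // results with a REG_SEQUENCE.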
  const TargetRegisterClass &RC
    = IsSALU ? AMDGPU::SReg_64_XEXECRegClass : AMDGPU::VReg_64RegClass;
  const TargetRegisterClass &HalfRC
    = IsSALU ? AMDGPU::SReg_32RegClass : AMDGPU::VGPR_32RegClass;

  MachineOperand Lo1(getSubOperand64(I.getOperand(1), HalfRC, AMDGPU::sub0));
  MachineOperand Lo2(getSubOperand64(I.getOperand(2), HalfRC, AMDGPU::sub0));
  MachineOperand Hi1(getSubOperand64(I.getOperand(1), HalfRC, AMDGPU::sub1));
  MachineOperand Hi2(getSubOperand64(I.getOperand(2), HalfRC, AMDGPU::sub1));

  Register DstLo = MRI->createVirtualRegister(&HalfRC);
  Register DstHi = MRI->createVirtualRegister(&HalfRC);

  if (IsSALU) {
    BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_ADD_U32), DstLo)
      .add(Lo1)
      .add(Lo2);
    BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_ADDC_U32), DstHi)
      .add(Hi1)
      .add(Hi2)
      .setOperandDead(3); // Dead scc
  } else {
    const TargetRegisterClass *CarryRC = TRI.getWaveMaskRegClass();
    Register CarryReg = MRI->createVirtualRegister(CarryRC);
    BuildMI(*BB, &I, DL, TII.get(AMDGPU::V_ADD_CO_U32_e64), DstLo)
      .addDef(CarryReg)
      .add(Lo1)
      .add(Lo2)
      .addImm(0);
    MachineInstr *Addc = BuildMI(*BB, &I, DL, TII.get(AMDGPU::V_ADDC_U32_e64), DstHi)
      .addDef(MRI->createVirtualRegister(CarryRC), RegState::Dead)
      .add(Hi1)
      .add(Hi2)
      .addReg(CarryReg, RegState::Kill)
      .addImm(0);

    if (!constrainSelectedInstRegOperands(*Addc, TII, TRI, RBI))
      return false;
  }

  BuildMI(*BB, &I, DL, TII.get(AMDGPU::REG_SEQUENCE), DstReg)
    .addReg(DstLo)
    .addImm(AMDGPU::sub0)
    .addReg(DstHi)
    .addImm(AMDGPU::sub1);

  if (!RBI.constrainGenericRegister(DstReg, RC, *MRI))
    return false;

  I.eraseFromParent();
  return true;
}

bool AMDGPUInstructionSelector::selectG_UADDO_USUBO_UADDE_USUBE(
    MachineInstr &I) const {
  MachineBasicBlock *BB = I.getParent();
  MachineFunction *MF = BB->getParent();
  const DebugLoc &DL = I.getDebugLoc();
  Register Dst0Reg = I.getOperand(0).getReg();
  Register Dst1Reg = I.getOperand(1).getReg();
  const bool IsAdd = I.getOpcode() == AMDGPU::G_UADDO ||
                     I.getOpcode() == AMDGPU::G_UADDE;
  const bool HasCarryIn = I.getOpcode() == AMDGPU::G_UADDE ||
                          I.getOpcode() == AMDGPU::G_USUBE;

  if (isVCC(Dst1Reg, *MRI)) {
    unsigned NoCarryOpc =
        IsAdd ? AMDGPU::V_ADD_CO_U32_e64 : AMDGPU::V_SUB_CO_U32_e64;
    unsigned CarryOpc = IsAdd ? AMDGPU::V_ADDC_U32_e64 : AMDGPU::V_SUBB_U32_e64;
    I.setDesc(TII.get(HasCarryIn ? CarryOpc : NoCarryOpc));
    I.addOperand(*MF, MachineOperand::CreateReg(AMDGPU::EXEC, false, true));
    I.addOperand(*MF, MachineOperand::CreateImm(0));
    return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
  }

  Register Src0Reg = I.getOperand(2).getReg();
  Register Src1Reg = I.getOperand(3).getReg();

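  // Scalar case: materialize the carry-in (if any) in SCC before the
  // S_ADDC/S_SUBB, and copy the carry-out back out of SCC below.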
  if (HasCarryIn) {
    BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), AMDGPU::SCC)
      .addReg(I.getOperand(4).getReg());
  }

  unsigned NoCarryOpc = IsAdd ? AMDGPU::S_ADD_U32 : AMDGPU::S_SUB_U32;
  unsigned CarryOpc = IsAdd ? AMDGPU::S_ADDC_U32 : AMDGPU::S_SUBB_U32;

  auto CarryInst = BuildMI(*BB, &I, DL, TII.get(HasCarryIn ? CarryOpc : NoCarryOpc), Dst0Reg)
    .add(I.getOperand(2))
    .add(I.getOperand(3));

  if (MRI->use_nodbg_empty(Dst1Reg)) {
    CarryInst.setOperandDead(3); // Dead scc
  } else {
    BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), Dst1Reg)
      .addReg(AMDGPU::SCC);
    if (!MRI->getRegClassOrNull(Dst1Reg))
      MRI->setRegClass(Dst1Reg, &AMDGPU::SReg_32RegClass);
  }

  if (!RBI.constrainGenericRegister(Dst0Reg, AMDGPU::SReg_32RegClass, *MRI) ||
      !RBI.constrainGenericRegister(Src0Reg, AMDGPU::SReg_32RegClass, *MRI) ||
      !RBI.constrainGenericRegister(Src1Reg, AMDGPU::SReg_32RegClass, *MRI))
    return false;

  if (HasCarryIn &&
      !RBI.constrainGenericRegister(I.getOperand(4).getReg(),
                                    AMDGPU::SReg_32RegClass, *MRI))
    return false;

  I.eraseFromParent();
  return true;
}

bool AMDGPUInstructionSelector::selectG_AMDGPU_MAD_64_32(
    MachineInstr &I) const {
  MachineBasicBlock *BB = I.getParent();
  MachineFunction *MF = BB->getParent();
  const bool IsUnsigned = I.getOpcode() == AMDGPU::G_AMDGPU_MAD_U64_U32;

  unsigned Opc;
  if (Subtarget->hasMADIntraFwdBug())
    Opc = IsUnsigned ? AMDGPU::V_MAD_U64_U32_gfx11_e64
                     : AMDGPU::V_MAD_I64_I32_gfx11_e64;
  else
    Opc = IsUnsigned ? AMDGPU::V_MAD_U64_U32_e64 : AMDGPU::V_MAD_I64_I32_e64;
  I.setDesc(TII.get(Opc));
  I.addOperand(*MF, MachineOperand::CreateImm(0));
  I.addImplicitDefUseOperands(*MF);
  return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
}

// TODO: We should probably legalize these to only using 32-bit results.
bool AMDGPUInstructionSelector::selectG_EXTRACT(MachineInstr &I) const {
  MachineBasicBlock *BB = I.getParent();
  Register DstReg = I.getOperand(0).getReg();
  Register SrcReg = I.getOperand(1).getReg();
  LLT DstTy = MRI->getType(DstReg);
  LLT SrcTy = MRI->getType(SrcReg);
  const unsigned SrcSize = SrcTy.getSizeInBits();
  unsigned DstSize = DstTy.getSizeInBits();

  // TODO: Should handle any multiple of 32 offset.
  unsigned Offset = I.getOperand(2).getImm();
  if (Offset % 32 != 0 || DstSize > 128)
    return false;

  // 16-bit operations really use 32-bit registers.
  // FIXME: Probably should not allow 16-bit G_EXTRACT results.
  if (DstSize == 16)
    DstSize = 32;

  const TargetRegisterClass *DstRC =
      TRI.getConstrainedRegClassForOperand(I.getOperand(0), *MRI);
  if (!DstRC || !RBI.constrainGenericRegister(DstReg, *DstRC, *MRI))
    return false;

  const RegisterBank *SrcBank = RBI.getRegBank(SrcReg, *MRI, TRI);
  const TargetRegisterClass *SrcRC =
      TRI.getRegClassForSizeOnBank(SrcSize, *SrcBank);
  if (!SrcRC)
    return false;
  unsigned SubReg = SIRegisterInfo::getSubRegFromChannel(Offset / 32,
                                                         DstSize / 32);
  SrcRC = TRI.getSubClassWithSubReg(SrcRC, SubReg);
  if (!SrcRC)
    return false;

  SrcReg = constrainOperandRegClass(*MF, TRI, *MRI, TII, RBI, I,
                                    *SrcRC, I.getOperand(1));
  const DebugLoc &DL = I.getDebugLoc();
  BuildMI(*BB, &I, DL, TII.get(TargetOpcode::COPY), DstReg)
    .addReg(SrcReg, 0, SubReg);

  I.eraseFromParent();
  return true;
}

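// Merge 32-bit or wider sources into a single register with a REG_SEQUENCE;
// narrower sources are left to the imported TableGen patterns.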
bool AMDGPUInstructionSelector::selectG_MERGE_VALUES(MachineInstr &MI) const {
  MachineBasicBlock *BB = MI.getParent();
  Register DstReg = MI.getOperand(0).getReg();
  LLT DstTy = MRI->getType(DstReg);
  LLT SrcTy = MRI->getType(MI.getOperand(1).getReg());

  const unsigned SrcSize = SrcTy.getSizeInBits();
  if (SrcSize < 32)
    return selectImpl(MI, *CoverageInfo);

  const DebugLoc &DL = MI.getDebugLoc();
  const RegisterBank *DstBank = RBI.getRegBank(DstReg, *MRI, TRI);
  const unsigned DstSize = DstTy.getSizeInBits();
  const TargetRegisterClass *DstRC =
      TRI.getRegClassForSizeOnBank(DstSize, *DstBank);
  if (!DstRC)
    return false;

  ArrayRef<int16_t> SubRegs = TRI.getRegSplitParts(DstRC, SrcSize / 8);
  MachineInstrBuilder MIB =
      BuildMI(*BB, &MI, DL, TII.get(TargetOpcode::REG_SEQUENCE), DstReg);
  for (int I = 0, E = MI.getNumOperands() - 1; I != E; ++I) {
    MachineOperand &Src = MI.getOperand(I + 1);
    MIB.addReg(Src.getReg(), getUndefRegState(Src.isUndef()));
    MIB.addImm(SubRegs[I]);

    const TargetRegisterClass *SrcRC
      = TRI.getConstrainedRegClassForOperand(Src, *MRI);
    if (SrcRC && !RBI.constrainGenericRegister(Src.getReg(), *SrcRC, *MRI))
      return false;
  }

  if (!RBI.constrainGenericRegister(DstReg, *DstRC, *MRI))
    return false;

  MI.eraseFromParent();
  return true;
}

bool AMDGPUInstructionSelector::selectG_UNMERGE_VALUES(MachineInstr &MI) const {
  MachineBasicBlock *BB = MI.getParent();
  const int NumDst = MI.getNumOperands() - 1;

  MachineOperand &Src = MI.getOperand(NumDst);

  Register SrcReg = Src.getReg();
  Register DstReg0 = MI.getOperand(0).getReg();
  LLT DstTy = MRI->getType(DstReg0);
  LLT SrcTy = MRI->getType(SrcReg);

  const unsigned DstSize = DstTy.getSizeInBits();
  const unsigned SrcSize = SrcTy.getSizeInBits();
  const DebugLoc &DL = MI.getDebugLoc();
  const RegisterBank *SrcBank = RBI.getRegBank(SrcReg, *MRI, TRI);

  const TargetRegisterClass *SrcRC =
      TRI.getRegClassForSizeOnBank(SrcSize, *SrcBank);
  if (!SrcRC || !RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI))
    return false;

  // Note we could have mixed SGPR and VGPR destination banks for an SGPR
  // source, and this relies on the fact that the same subregister indices are
  // used for both.
  ArrayRef<int16_t> SubRegs = TRI.getRegSplitParts(SrcRC, DstSize / 8);
  for (int I = 0, E = NumDst; I != E; ++I) {
    MachineOperand &Dst = MI.getOperand(I);
    BuildMI(*BB, &MI, DL, TII.get(TargetOpcode::COPY), Dst.getReg())
      .addReg(SrcReg, 0, SubRegs[I]);

    // Make sure the subregister index is valid for the source register.
    SrcRC = TRI.getSubClassWithSubReg(SrcRC, SubRegs[I]);
    if (!SrcRC || !RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI))
      return false;

    const TargetRegisterClass *DstRC =
        TRI.getConstrainedRegClassForOperand(Dst, *MRI);
    if (DstRC && !RBI.constrainGenericRegister(Dst.getReg(), *DstRC, *MRI))
      return false;
  }

  MI.eraseFromParent();
  return true;
}

bool AMDGPUInstructionSelector::selectG_BUILD_VECTOR(MachineInstr &MI) const {
  assert(MI.getOpcode() == AMDGPU::G_BUILD_VECTOR_TRUNC ||
         MI.getOpcode() == AMDGPU::G_BUILD_VECTOR);

  Register Src0 = MI.getOperand(1).getReg();
  Register Src1 = MI.getOperand(2).getReg();
  LLT SrcTy = MRI->getType(Src0);
  const unsigned SrcSize = SrcTy.getSizeInBits();

  // BUILD_VECTOR with >=32 bits source is handled by MERGE_VALUE.
  if (MI.getOpcode() == AMDGPU::G_BUILD_VECTOR && SrcSize >= 32) {
    return selectG_MERGE_VALUES(MI);
  }

  // Selection logic below is for V2S16 only.
  // For G_BUILD_VECTOR_TRUNC, additionally check that the operands are s32.
  Register Dst = MI.getOperand(0).getReg();
  if (MRI->getType(Dst) != LLT::fixed_vector(2, 16) ||
      (MI.getOpcode() == AMDGPU::G_BUILD_VECTOR_TRUNC &&
       SrcTy != LLT::scalar(32)))
    return selectImpl(MI, *CoverageInfo);

  const RegisterBank *DstBank = RBI.getRegBank(Dst, *MRI, TRI);
  if (DstBank->getID() == AMDGPU::AGPRRegBankID)
    return false;

  assert(DstBank->getID() == AMDGPU::SGPRRegBankID ||
         DstBank->getID() == AMDGPU::VGPRRegBankID);
  const bool IsVector = DstBank->getID() == AMDGPU::VGPRRegBankID;

  const DebugLoc &DL = MI.getDebugLoc();
  MachineBasicBlock *BB = MI.getParent();

  // First, before trying TableGen patterns, check if both sources are
  // constants. In those cases, we can trivially compute the final constant
  // and emit a simple move.
  auto ConstSrc1 = getAnyConstantVRegValWithLookThrough(Src1, *MRI, true, true);
  if (ConstSrc1) {
    auto ConstSrc0 =
        getAnyConstantVRegValWithLookThrough(Src0, *MRI, true, true);
    if (ConstSrc0) {
      const int64_t K0 = ConstSrc0->Value.getSExtValue();
      const int64_t K1 = ConstSrc1->Value.getSExtValue();
      uint32_t Lo16 = static_cast<uint32_t>(K0) & 0xffff;
      uint32_t Hi16 = static_cast<uint32_t>(K1) & 0xffff;
      uint32_t Imm = Lo16 | (Hi16 << 16);

      // VALU
      if (IsVector) {
        BuildMI(*BB, &MI, DL, TII.get(AMDGPU::V_MOV_B32_e32), Dst).addImm(Imm);
        MI.eraseFromParent();
        return RBI.constrainGenericRegister(Dst, AMDGPU::VGPR_32RegClass, *MRI);
      }

      // SALU
      BuildMI(*BB, &MI, DL, TII.get(AMDGPU::S_MOV_B32), Dst).addImm(Imm);
      MI.eraseFromParent();
      return RBI.constrainGenericRegister(Dst, AMDGPU::SReg_32RegClass, *MRI);
    }
  }

  // Now try TableGen patterns.
  if (selectImpl(MI, *CoverageInfo))
    return true;

  // TODO: This should probably be a combine somewhere
  // (build_vector $src0, undef)  -> copy $src0
  MachineInstr *Src1Def = getDefIgnoringCopies(Src1, *MRI);
  if (Src1Def->getOpcode() == AMDGPU::G_IMPLICIT_DEF) {
    MI.setDesc(TII.get(AMDGPU::COPY));
    MI.removeOperand(2);
    const auto &RC =
        IsVector ? AMDGPU::VGPR_32RegClass : AMDGPU::SReg_32RegClass;
    return RBI.constrainGenericRegister(Dst, RC, *MRI) &&
           RBI.constrainGenericRegister(Src0, RC, *MRI);
  }

  // TODO: Can be improved?
  if (IsVector) {
    Register TmpReg = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
    auto MIB = BuildMI(*BB, MI, DL, TII.get(AMDGPU::V_AND_B32_e32), TmpReg)
                   .addImm(0xFFFF)
                   .addReg(Src0);
    if (!constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI))
      return false;

    MIB = BuildMI(*BB, MI, DL, TII.get(AMDGPU::V_LSHL_OR_B32_e64), Dst)
              .addReg(Src1)
              .addImm(16)
              .addReg(TmpReg);
    if (!constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI))
      return false;

    MI.eraseFromParent();
    return true;
  }

  Register ShiftSrc0;
  Register ShiftSrc1;

  // With multiple uses of the shift, this will duplicate the shift and
  // increase register pressure.
  //
  // (build_vector (lshr_oneuse $src0, 16), (lshr_oneuse $src1, 16)
  //  => (S_PACK_HH_B32_B16 $src0, $src1)
  // (build_vector (lshr_oneuse SReg_32:$src0, 16), $src1)
  //  => (S_PACK_HL_B32_B16 $src0, $src1)
  // (build_vector $src0, (lshr_oneuse SReg_32:$src1, 16))
  //  => (S_PACK_LH_B32_B16 $src0, $src1)
  // (build_vector $src0, $src1)
  //  => (S_PACK_LL_B32_B16 $src0, $src1)
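  //
  // For example, with two s32 inputs %a and %b that are each used only by a
  // 16-bit logical shift right, (build_vector_trunc (lshr %a, 16),
  // (lshr %b, 16)) selects to a single S_PACK_HH_B32_B16 %a, %b.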

  bool Shift0 = mi_match(
      Src0, *MRI, m_OneUse(m_GLShr(m_Reg(ShiftSrc0), m_SpecificICst(16))));

  bool Shift1 = mi_match(
      Src1, *MRI, m_OneUse(m_GLShr(m_Reg(ShiftSrc1), m_SpecificICst(16))));

  unsigned Opc = AMDGPU::S_PACK_LL_B32_B16;
  if (Shift0 && Shift1) {
    Opc = AMDGPU::S_PACK_HH_B32_B16;
    MI.getOperand(1).setReg(ShiftSrc0);
    MI.getOperand(2).setReg(ShiftSrc1);
  } else if (Shift1) {
    Opc = AMDGPU::S_PACK_LH_B32_B16;
    MI.getOperand(2).setReg(ShiftSrc1);
  } else if (Shift0) {
    auto ConstSrc1 =
        getAnyConstantVRegValWithLookThrough(Src1, *MRI, true, true);
    if (ConstSrc1 && ConstSrc1->Value == 0) {
      // build_vector_trunc (lshr $src0, 16), 0 -> s_lshr_b32 $src0, 16
      auto MIB = BuildMI(*BB, &MI, DL, TII.get(AMDGPU::S_LSHR_B32), Dst)
                     .addReg(ShiftSrc0)
                     .addImm(16)
                     .setOperandDead(3); // Dead scc

      MI.eraseFromParent();
      return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
    }
    if (STI.hasSPackHL()) {
      Opc = AMDGPU::S_PACK_HL_B32_B16;
      MI.getOperand(1).setReg(ShiftSrc0);
    }
  }

  MI.setDesc(TII.get(Opc));
  return constrainSelectedInstRegOperands(MI, TII, TRI, RBI);
}

bool AMDGPUInstructionSelector::selectG_IMPLICIT_DEF(MachineInstr &I) const {
  const MachineOperand &MO = I.getOperand(0);

  // FIXME: Interface for getConstrainedRegClassForOperand needs work. The
  // regbank check here is to know why getConstrainedRegClassForOperand failed.
  const TargetRegisterClass *RC = TRI.getConstrainedRegClassForOperand(MO, *MRI);
  if ((!RC && !MRI->getRegBankOrNull(MO.getReg())) ||
      (RC && RBI.constrainGenericRegister(MO.getReg(), *RC, *MRI))) {
    I.setDesc(TII.get(TargetOpcode::IMPLICIT_DEF));
    return true;
  }

  return false;
}

bool AMDGPUInstructionSelector::selectG_INSERT(MachineInstr &I) const {
  MachineBasicBlock *BB = I.getParent();

  Register DstReg = I.getOperand(0).getReg();
  Register Src0Reg = I.getOperand(1).getReg();
  Register Src1Reg = I.getOperand(2).getReg();
  LLT Src1Ty = MRI->getType(Src1Reg);

  unsigned DstSize = MRI->getType(DstReg).getSizeInBits();
  unsigned InsSize = Src1Ty.getSizeInBits();

  int64_t Offset = I.getOperand(3).getImm();

  // FIXME: These cases should have been illegal and unnecessary to check here.
  if (Offset % 32 != 0 || InsSize % 32 != 0)
    return false;

  // Currently not handled by getSubRegFromChannel.
  if (InsSize > 128)
    return false;

  unsigned SubReg = TRI.getSubRegFromChannel(Offset / 32, InsSize / 32);
  if (SubReg == AMDGPU::NoSubRegister)
    return false;

  const RegisterBank *DstBank = RBI.getRegBank(DstReg, *MRI, TRI);
  const TargetRegisterClass *DstRC =
      TRI.getRegClassForSizeOnBank(DstSize, *DstBank);
  if (!DstRC)
    return false;

  const RegisterBank *Src0Bank = RBI.getRegBank(Src0Reg, *MRI, TRI);
  const RegisterBank *Src1Bank = RBI.getRegBank(Src1Reg, *MRI, TRI);
  const TargetRegisterClass *Src0RC =
      TRI.getRegClassForSizeOnBank(DstSize, *Src0Bank);
  const TargetRegisterClass *Src1RC =
      TRI.getRegClassForSizeOnBank(InsSize, *Src1Bank);

  // Deal with weird cases where the class only partially supports the subreg
  // index.
  Src0RC = TRI.getSubClassWithSubReg(Src0RC, SubReg);
  if (!Src0RC || !Src1RC)
    return false;

  if (!RBI.constrainGenericRegister(DstReg, *DstRC, *MRI) ||
      !RBI.constrainGenericRegister(Src0Reg, *Src0RC, *MRI) ||
      !RBI.constrainGenericRegister(Src1Reg, *Src1RC, *MRI))
    return false;

  const DebugLoc &DL = I.getDebugLoc();
  BuildMI(*BB, &I, DL, TII.get(TargetOpcode::INSERT_SUBREG), DstReg)
    .addReg(Src0Reg)
    .addReg(Src1Reg)
    .addImm(SubReg);

  I.eraseFromParent();
  return true;
}

bool AMDGPUInstructionSelector::selectG_SBFX_UBFX(MachineInstr &MI) const {
  Register DstReg = MI.getOperand(0).getReg();
  Register SrcReg = MI.getOperand(1).getReg();
  Register OffsetReg = MI.getOperand(2).getReg();
  Register WidthReg = MI.getOperand(3).getReg();

  assert(RBI.getRegBank(DstReg, *MRI, TRI)->getID() == AMDGPU::VGPRRegBankID &&
         "scalar BFX instructions are expanded in regbankselect");
  assert(MRI->getType(MI.getOperand(0).getReg()).getSizeInBits() == 32 &&
         "64-bit vector BFX instructions are expanded in regbankselect");

  const DebugLoc &DL = MI.getDebugLoc();
  MachineBasicBlock *MBB = MI.getParent();

  bool IsSigned = MI.getOpcode() == TargetOpcode::G_SBFX;
  unsigned Opc = IsSigned ? AMDGPU::V_BFE_I32_e64 : AMDGPU::V_BFE_U32_e64;
  auto MIB = BuildMI(*MBB, &MI, DL, TII.get(Opc), DstReg)
                 .addReg(SrcReg)
                 .addReg(OffsetReg)
                 .addReg(WidthReg);
  MI.eraseFromParent();
  return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
}

bool AMDGPUInstructionSelector::selectInterpP1F16(MachineInstr &MI) const {
  if (STI.getLDSBankCount() != 16)
    return selectImpl(MI, *CoverageInfo);

  Register Dst = MI.getOperand(0).getReg();
  Register Src0 = MI.getOperand(2).getReg();
  Register M0Val = MI.getOperand(6).getReg();
  if (!RBI.constrainGenericRegister(M0Val, AMDGPU::SReg_32RegClass, *MRI) ||
      !RBI.constrainGenericRegister(Dst, AMDGPU::VGPR_32RegClass, *MRI) ||
      !RBI.constrainGenericRegister(Src0, AMDGPU::VGPR_32RegClass, *MRI))
    return false;

  // This requires 2 instructions. It is possible to write a pattern to support
  // this, but the generated isel emitter doesn't correctly deal with multiple
  // output instructions using the same physical register input. The copy to m0
  // is incorrectly placed before the second instruction.
  //
  // TODO: Match source modifiers.

  Register InterpMov = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
  const DebugLoc &DL = MI.getDebugLoc();
  MachineBasicBlock *MBB = MI.getParent();

  BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
    .addReg(M0Val);
  BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::V_INTERP_MOV_F32), InterpMov)
    .addImm(2)
    .addImm(MI.getOperand(4).getImm())  // $attr
    .addImm(MI.getOperand(3).getImm()); // $attrchan

  BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::V_INTERP_P1LV_F16), Dst)
    .addImm(0)                          // $src0_modifiers
    .addReg(Src0)                       // $src0
    .addImm(MI.getOperand(4).getImm())  // $attr
    .addImm(MI.getOperand(3).getImm())  // $attrchan
    .addImm(0)                          // $src2_modifiers
    .addReg(InterpMov)                  // $src2 - 2 f16 values selected by high
    .addImm(MI.getOperand(5).getImm())  // $high
    .addImm(0)                          // $clamp
    .addImm(0);                         // $omod

  MI.eraseFromParent();
  return true;
}

// Writelane is special in that it can use SGPR and M0 (which would normally
// count as using the constant bus twice - but in this case it is allowed since
// the lane selector doesn't count as a use of the constant bus). However, it is
// still required to abide by the 1 SGPR rule. Fix this up if we might have
// multiple SGPRs.
bool AMDGPUInstructionSelector::selectWritelane(MachineInstr &MI) const {
  // With a constant bus limit of at least 2, there's no issue.
  if (STI.getConstantBusLimit(AMDGPU::V_WRITELANE_B32) > 1)
    return selectImpl(MI, *CoverageInfo);

  MachineBasicBlock *MBB = MI.getParent();
  const DebugLoc &DL = MI.getDebugLoc();
  Register VDst = MI.getOperand(0).getReg();
  Register Val = MI.getOperand(2).getReg();
  Register LaneSelect = MI.getOperand(3).getReg();
  Register VDstIn = MI.getOperand(4).getReg();

  auto MIB = BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::V_WRITELANE_B32), VDst);

  std::optional<ValueAndVReg> ConstSelect =
      getIConstantVRegValWithLookThrough(LaneSelect, *MRI);
  if (ConstSelect) {
    // The selector has to be an inline immediate, so we can use whatever for
    // the other operands.
    MIB.addReg(Val);
    MIB.addImm(ConstSelect->Value.getSExtValue() &
               maskTrailingOnes<uint64_t>(STI.getWavefrontSizeLog2()));
  } else {
    std::optional<ValueAndVReg> ConstVal =
        getIConstantVRegValWithLookThrough(Val, *MRI);

    // If the value written is an inline immediate, we can get away without a
    // copy to m0.
    if (ConstVal && AMDGPU::isInlinableLiteral32(ConstVal->Value.getSExtValue(),
                                                 STI.hasInv2PiInlineImm())) {
      MIB.addImm(ConstVal->Value.getSExtValue());
      MIB.addReg(LaneSelect);
    } else {
      MIB.addReg(Val);

      // If the lane selector was originally in a VGPR and copied with
      // readfirstlane, there's a hazard to read the same SGPR from the
      // VALU. Constrain to a different SGPR to help avoid needing a nop later.
      RBI.constrainGenericRegister(LaneSelect, AMDGPU::SReg_32_XM0RegClass, *MRI);

      BuildMI(*MBB, *MIB, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
        .addReg(LaneSelect);
      MIB.addReg(AMDGPU::M0);
    }
  }

  MIB.addReg(VDstIn);

  MI.eraseFromParent();
  return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
}

// We need to handle this here because tablegen doesn't support matching
// instructions with multiple outputs.
bool AMDGPUInstructionSelector::selectDivScale(MachineInstr &MI) const {
  Register Dst0 = MI.getOperand(0).getReg();
  Register Dst1 = MI.getOperand(1).getReg();

  LLT Ty = MRI->getType(Dst0);
  unsigned Opc;
  if (Ty == LLT::scalar(32))
    Opc = AMDGPU::V_DIV_SCALE_F32_e64;
  else if (Ty == LLT::scalar(64))
    Opc = AMDGPU::V_DIV_SCALE_F64_e64;
  else
    return false;

  // TODO: Match source modifiers.

  const DebugLoc &DL = MI.getDebugLoc();
  MachineBasicBlock *MBB = MI.getParent();

  Register Numer = MI.getOperand(3).getReg();
  Register Denom = MI.getOperand(4).getReg();
  unsigned ChooseDenom = MI.getOperand(5).getImm();

  Register Src0 = ChooseDenom != 0 ? Numer : Denom;

  auto MIB = BuildMI(*MBB, &MI, DL, TII.get(Opc), Dst0)
                 .addDef(Dst1)
                 .addImm(0)     // $src0_modifiers
                 .addUse(Src0)  // $src0
                 .addImm(0)     // $src1_modifiers
                 .addUse(Denom) // $src1
                 .addImm(0)     // $src2_modifiers
                 .addUse(Numer) // $src2
                 .addImm(0)     // $clamp
                 .addImm(0);    // $omod

  MI.eraseFromParent();
  return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
}

bool AMDGPUInstructionSelector::selectG_INTRINSIC(MachineInstr &I) const {
  Intrinsic::ID IntrinsicID = cast<GIntrinsic>(I).getIntrinsicID();
  switch (IntrinsicID) {
  case Intrinsic::amdgcn_if_break: {
    MachineBasicBlock *BB = I.getParent();

    // FIXME: Manually selecting to avoid dealing with the SReg_1 trick
    // SelectionDAG uses for wave32 vs wave64.
    BuildMI(*BB, &I, I.getDebugLoc(), TII.get(AMDGPU::SI_IF_BREAK))
      .add(I.getOperand(0))
      .add(I.getOperand(2))
      .add(I.getOperand(3));

    Register DstReg = I.getOperand(0).getReg();
    Register Src0Reg = I.getOperand(2).getReg();
    Register Src1Reg = I.getOperand(3).getReg();

    I.eraseFromParent();

    for (Register Reg : { DstReg, Src0Reg, Src1Reg })
      MRI->setRegClass(Reg, TRI.getWaveMaskRegClass());

    return true;
  }
  case Intrinsic::amdgcn_interp_p1_f16:
    return selectInterpP1F16(I);
  case Intrinsic::amdgcn_wqm:
    return constrainCopyLikeIntrin(I, AMDGPU::WQM);
  case Intrinsic::amdgcn_softwqm:
    return constrainCopyLikeIntrin(I, AMDGPU::SOFT_WQM);
  case Intrinsic::amdgcn_strict_wwm:
  case Intrinsic::amdgcn_wwm:
    return constrainCopyLikeIntrin(I, AMDGPU::STRICT_WWM);
  case Intrinsic::amdgcn_strict_wqm:
    return constrainCopyLikeIntrin(I, AMDGPU::STRICT_WQM);
  case Intrinsic::amdgcn_writelane:
    return selectWritelane(I);
  case Intrinsic::amdgcn_div_scale:
    return selectDivScale(I);
  case Intrinsic::amdgcn_icmp:
  case Intrinsic::amdgcn_fcmp:
    if (selectImpl(I, *CoverageInfo))
      return true;
    return selectIntrinsicCmp(I);
  case Intrinsic::amdgcn_ballot:
    return selectBallot(I);
  case Intrinsic::amdgcn_reloc_constant:
    return selectRelocConstant(I);
  case Intrinsic::amdgcn_groupstaticsize:
    return selectGroupStaticSize(I);
  case Intrinsic::returnaddress:
    return selectReturnAddress(I);
  case Intrinsic::amdgcn_smfmac_f32_16x16x32_f16:
  case Intrinsic::amdgcn_smfmac_f32_32x32x16_f16:
  case Intrinsic::amdgcn_smfmac_f32_16x16x32_bf16:
  case Intrinsic::amdgcn_smfmac_f32_32x32x16_bf16:
  case Intrinsic::amdgcn_smfmac_i32_16x16x64_i8:
  case Intrinsic::amdgcn_smfmac_i32_32x32x32_i8:
  case Intrinsic::amdgcn_smfmac_f32_16x16x64_bf8_bf8:
  case Intrinsic::amdgcn_smfmac_f32_16x16x64_bf8_fp8:
  case Intrinsic::amdgcn_smfmac_f32_16x16x64_fp8_bf8:
  case Intrinsic::amdgcn_smfmac_f32_16x16x64_fp8_fp8:
  case Intrinsic::amdgcn_smfmac_f32_32x32x32_bf8_bf8:
  case Intrinsic::amdgcn_smfmac_f32_32x32x32_bf8_fp8:
  case Intrinsic::amdgcn_smfmac_f32_32x32x32_fp8_bf8:
  case Intrinsic::amdgcn_smfmac_f32_32x32x32_fp8_fp8:
    return selectSMFMACIntrin(I);
  default:
    return selectImpl(I, *CoverageInfo);
  }
}

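// Return the VALU compare opcode for the given predicate and operand size, or
// -1 if there is none. 16-bit compares use the t16 variants when the
// subtarget has true16 instructions.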
static int getV_CMPOpcode(CmpInst::Predicate P, unsigned Size,
                          const GCNSubtarget &ST) {
  if (Size != 16 && Size != 32 && Size != 64)
    return -1;

  if (Size == 16 && !ST.has16BitInsts())
    return -1;

  const auto Select = [&](unsigned S16Opc, unsigned TrueS16Opc, unsigned S32Opc,
                          unsigned S64Opc) {
    if (Size == 16)
      return ST.hasTrue16BitInsts() ? TrueS16Opc : S16Opc;
    if (Size == 32)
      return S32Opc;
    return S64Opc;
  };

  switch (P) {
  default:
    llvm_unreachable("Unknown condition code!");
  case CmpInst::ICMP_NE:
    return Select(AMDGPU::V_CMP_NE_U16_e64, AMDGPU::V_CMP_NE_U16_t16_e64,
                  AMDGPU::V_CMP_NE_U32_e64, AMDGPU::V_CMP_NE_U64_e64);
  case CmpInst::ICMP_EQ:
    return Select(AMDGPU::V_CMP_EQ_U16_e64, AMDGPU::V_CMP_EQ_U16_t16_e64,
                  AMDGPU::V_CMP_EQ_U32_e64, AMDGPU::V_CMP_EQ_U64_e64);
  case CmpInst::ICMP_SGT:
    return Select(AMDGPU::V_CMP_GT_I16_e64, AMDGPU::V_CMP_GT_I16_t16_e64,
                  AMDGPU::V_CMP_GT_I32_e64, AMDGPU::V_CMP_GT_I64_e64);
  case CmpInst::ICMP_SGE:
    return Select(AMDGPU::V_CMP_GE_I16_e64, AMDGPU::V_CMP_GE_I16_t16_e64,
                  AMDGPU::V_CMP_GE_I32_e64, AMDGPU::V_CMP_GE_I64_e64);
  case CmpInst::ICMP_SLT:
    return Select(AMDGPU::V_CMP_LT_I16_e64, AMDGPU::V_CMP_LT_I16_t16_e64,
                  AMDGPU::V_CMP_LT_I32_e64, AMDGPU::V_CMP_LT_I64_e64);
  case CmpInst::ICMP_SLE:
    return Select(AMDGPU::V_CMP_LE_I16_e64, AMDGPU::V_CMP_LE_I16_t16_e64,
                  AMDGPU::V_CMP_LE_I32_e64, AMDGPU::V_CMP_LE_I64_e64);
  case CmpInst::ICMP_UGT:
    return Select(AMDGPU::V_CMP_GT_U16_e64, AMDGPU::V_CMP_GT_U16_t16_e64,
                  AMDGPU::V_CMP_GT_U32_e64, AMDGPU::V_CMP_GT_U64_e64);
  case CmpInst::ICMP_UGE:
    return Select(AMDGPU::V_CMP_GE_U16_e64, AMDGPU::V_CMP_GE_U16_t16_e64,
                  AMDGPU::V_CMP_GE_U32_e64, AMDGPU::V_CMP_GE_U64_e64);
  case CmpInst::ICMP_ULT:
    return Select(AMDGPU::V_CMP_LT_U16_e64, AMDGPU::V_CMP_LT_U16_t16_e64,
                  AMDGPU::V_CMP_LT_U32_e64, AMDGPU::V_CMP_LT_U64_e64);
  case CmpInst::ICMP_ULE:
    return Select(AMDGPU::V_CMP_LE_U16_e64, AMDGPU::V_CMP_LE_U16_t16_e64,
                  AMDGPU::V_CMP_LE_U32_e64, AMDGPU::V_CMP_LE_U64_e64);

  case CmpInst::FCMP_OEQ:
    return Select(AMDGPU::V_CMP_EQ_F16_e64, AMDGPU::V_CMP_EQ_F16_t16_e64,
                  AMDGPU::V_CMP_EQ_F32_e64, AMDGPU::V_CMP_EQ_F64_e64);
  case CmpInst::FCMP_OGT:
    return Select(AMDGPU::V_CMP_GT_F16_e64, AMDGPU::V_CMP_GT_F16_t16_e64,
                  AMDGPU::V_CMP_GT_F32_e64, AMDGPU::V_CMP_GT_F64_e64);
  case CmpInst::FCMP_OGE:
    return Select(AMDGPU::V_CMP_GE_F16_e64, AMDGPU::V_CMP_GE_F16_t16_e64,
                  AMDGPU::V_CMP_GE_F32_e64, AMDGPU::V_CMP_GE_F64_e64);
  case CmpInst::FCMP_OLT:
    return Select(AMDGPU::V_CMP_LT_F16_e64, AMDGPU::V_CMP_LT_F16_t16_e64,
                  AMDGPU::V_CMP_LT_F32_e64, AMDGPU::V_CMP_LT_F64_e64);
  case CmpInst::FCMP_OLE:
    return Select(AMDGPU::V_CMP_LE_F16_e64, AMDGPU::V_CMP_LE_F16_t16_e64,
                  AMDGPU::V_CMP_LE_F32_e64, AMDGPU::V_CMP_LE_F64_e64);
  case CmpInst::FCMP_ONE:
    return Select(AMDGPU::V_CMP_NEQ_F16_e64, AMDGPU::V_CMP_NEQ_F16_t16_e64,
                  AMDGPU::V_CMP_NEQ_F32_e64, AMDGPU::V_CMP_NEQ_F64_e64);
  case CmpInst::FCMP_ORD:
    return Select(AMDGPU::V_CMP_O_F16_e64, AMDGPU::V_CMP_O_F16_t16_e64,
                  AMDGPU::V_CMP_O_F32_e64, AMDGPU::V_CMP_O_F64_e64);
  case CmpInst::FCMP_UNO:
    return Select(AMDGPU::V_CMP_U_F16_e64, AMDGPU::V_CMP_U_F16_t16_e64,
                  AMDGPU::V_CMP_U_F32_e64, AMDGPU::V_CMP_U_F64_e64);
  case CmpInst::FCMP_UEQ:
    return Select(AMDGPU::V_CMP_NLG_F16_e64, AMDGPU::V_CMP_NLG_F16_t16_e64,
                  AMDGPU::V_CMP_NLG_F32_e64, AMDGPU::V_CMP_NLG_F64_e64);
  case CmpInst::FCMP_UGT:
    return Select(AMDGPU::V_CMP_NLE_F16_e64, AMDGPU::V_CMP_NLE_F16_t16_e64,
                  AMDGPU::V_CMP_NLE_F32_e64, AMDGPU::V_CMP_NLE_F64_e64);
  case CmpInst::FCMP_UGE:
    return Select(AMDGPU::V_CMP_NLT_F16_e64, AMDGPU::V_CMP_NLT_F16_t16_e64,
                  AMDGPU::V_CMP_NLT_F32_e64, AMDGPU::V_CMP_NLT_F64_e64);
  case CmpInst::FCMP_ULT:
    return Select(AMDGPU::V_CMP_NGE_F16_e64, AMDGPU::V_CMP_NGE_F16_t16_e64,
                  AMDGPU::V_CMP_NGE_F32_e64, AMDGPU::V_CMP_NGE_F64_e64);
  case CmpInst::FCMP_ULE:
    return Select(AMDGPU::V_CMP_NGT_F16_e64, AMDGPU::V_CMP_NGT_F16_t16_e64,
                  AMDGPU::V_CMP_NGT_F32_e64, AMDGPU::V_CMP_NGT_F64_e64);
  case CmpInst::FCMP_UNE:
    return Select(AMDGPU::V_CMP_NEQ_F16_e64, AMDGPU::V_CMP_NEQ_F16_t16_e64,
                  AMDGPU::V_CMP_NEQ_F32_e64, AMDGPU::V_CMP_NEQ_F64_e64);
  case CmpInst::FCMP_TRUE:
    return Select(AMDGPU::V_CMP_TRU_F16_e64, AMDGPU::V_CMP_TRU_F16_t16_e64,
                  AMDGPU::V_CMP_TRU_F32_e64, AMDGPU::V_CMP_TRU_F64_e64);
  case CmpInst::FCMP_FALSE:
    return Select(AMDGPU::V_CMP_F_F16_e64, AMDGPU::V_CMP_F_F16_t16_e64,
                  AMDGPU::V_CMP_F_F32_e64, AMDGPU::V_CMP_F_F64_e64);
  }
}

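// Return the scalar (SALU) compare opcode for the given predicate and size,
// or -1 if the comparison cannot be done with S_CMP on this subtarget.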
int AMDGPUInstructionSelector::getS_CMPOpcode(CmpInst::Predicate P,
                                              unsigned Size) const {
  if (Size == 64) {
    if (!STI.hasScalarCompareEq64())
      return -1;

    switch (P) {
    case CmpInst::ICMP_NE:
      return AMDGPU::S_CMP_LG_U64;
    case CmpInst::ICMP_EQ:
      return AMDGPU::S_CMP_EQ_U64;
    default:
      return -1;
    }
  }

  if (Size == 32) {
    switch (P) {
    case CmpInst::ICMP_NE:
      return AMDGPU::S_CMP_LG_U32;
    case CmpInst::ICMP_EQ:
      return AMDGPU::S_CMP_EQ_U32;
    case CmpInst::ICMP_SGT:
      return AMDGPU::S_CMP_GT_I32;
    case CmpInst::ICMP_SGE:
      return AMDGPU::S_CMP_GE_I32;
    case CmpInst::ICMP_SLT:
      return AMDGPU::S_CMP_LT_I32;
    case CmpInst::ICMP_SLE:
      return AMDGPU::S_CMP_LE_I32;
    case CmpInst::ICMP_UGT:
      return AMDGPU::S_CMP_GT_U32;
    case CmpInst::ICMP_UGE:
      return AMDGPU::S_CMP_GE_U32;
    case CmpInst::ICMP_ULT:
      return AMDGPU::S_CMP_LT_U32;
    case CmpInst::ICMP_ULE:
      return AMDGPU::S_CMP_LE_U32;
    case CmpInst::FCMP_OEQ:
      return AMDGPU::S_CMP_EQ_F32;
    case CmpInst::FCMP_OGT:
      return AMDGPU::S_CMP_GT_F32;
    case CmpInst::FCMP_OGE:
      return AMDGPU::S_CMP_GE_F32;
    case CmpInst::FCMP_OLT:
      return AMDGPU::S_CMP_LT_F32;
    case CmpInst::FCMP_OLE:
      return AMDGPU::S_CMP_LE_F32;
    case CmpInst::FCMP_ONE:
      return AMDGPU::S_CMP_LG_F32;
    case CmpInst::FCMP_ORD:
      return AMDGPU::S_CMP_O_F32;
    case CmpInst::FCMP_UNO:
      return AMDGPU::S_CMP_U_F32;
    case CmpInst::FCMP_UEQ:
      return AMDGPU::S_CMP_NLG_F32;
    case CmpInst::FCMP_UGT:
      return AMDGPU::S_CMP_NLE_F32;
    case CmpInst::FCMP_UGE:
      return AMDGPU::S_CMP_NLT_F32;
    case CmpInst::FCMP_ULT:
      return AMDGPU::S_CMP_NGE_F32;
    case CmpInst::FCMP_ULE:
      return AMDGPU::S_CMP_NGT_F32;
    case CmpInst::FCMP_UNE:
      return AMDGPU::S_CMP_NEQ_F32;
    default:
      llvm_unreachable("Unknown condition code!");
    }
  }

  if (Size == 16) {
    if (!STI.hasSALUFloatInsts())
      return -1;

    switch (P) {
    case CmpInst::FCMP_OEQ:
      return AMDGPU::S_CMP_EQ_F16;
    case CmpInst::FCMP_OGT:
      return AMDGPU::S_CMP_GT_F16;
    case CmpInst::FCMP_OGE:
      return AMDGPU::S_CMP_GE_F16;
    case CmpInst::FCMP_OLT:
      return AMDGPU::S_CMP_LT_F16;
    case CmpInst::FCMP_OLE:
      return AMDGPU::S_CMP_LE_F16;
    case CmpInst::FCMP_ONE:
      return AMDGPU::S_CMP_LG_F16;
    case CmpInst::FCMP_ORD:
      return AMDGPU::S_CMP_O_F16;
    case CmpInst::FCMP_UNO:
      return AMDGPU::S_CMP_U_F16;
    case CmpInst::FCMP_UEQ:
      return AMDGPU::S_CMP_NLG_F16;
    case CmpInst::FCMP_UGT:
      return AMDGPU::S_CMP_NLE_F16;
    case CmpInst::FCMP_UGE:
      return AMDGPU::S_CMP_NLT_F16;
    case CmpInst::FCMP_ULT:
      return AMDGPU::S_CMP_NGE_F16;
    case CmpInst::FCMP_ULE:
      return AMDGPU::S_CMP_NGT_F16;
    case CmpInst::FCMP_UNE:
      return AMDGPU::S_CMP_NEQ_F16;
    default:
      llvm_unreachable("Unknown condition code!");
    }
  }

  return -1;
}

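// G_ICMP/G_FCMP with a non-vcc (scalar) result selects to an S_CMP writing
// SCC plus a copy out of SCC; otherwise a VALU V_CMP producing a wave mask is
// used.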
1298bool AMDGPUInstructionSelector::selectG_ICMP_or_FCMP(MachineInstr &I) const {
1299
1300 MachineBasicBlock *BB = I.getParent();
1301 const DebugLoc &DL = I.getDebugLoc();
1302
1303 Register SrcReg = I.getOperand(i: 2).getReg();
1304 unsigned Size = RBI.getSizeInBits(Reg: SrcReg, MRI: *MRI, TRI);
1305
1306 auto Pred = (CmpInst::Predicate)I.getOperand(i: 1).getPredicate();
1307
1308 Register CCReg = I.getOperand(i: 0).getReg();
1309 if (!isVCC(Reg: CCReg, MRI: *MRI)) {
1310 int Opcode = getS_CMPOpcode(P: Pred, Size);
1311 if (Opcode == -1)
1312 return false;
1313 MachineInstr *ICmp = BuildMI(BB&: *BB, I: &I, MIMD: DL, MCID: TII.get(Opcode))
1314 .add(MO: I.getOperand(i: 2))
1315 .add(MO: I.getOperand(i: 3));
1316 BuildMI(BB&: *BB, I: &I, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::COPY), DestReg: CCReg)
1317 .addReg(RegNo: AMDGPU::SCC);
1318 bool Ret =
1319 constrainSelectedInstRegOperands(I&: *ICmp, TII, TRI, RBI) &&
1320 RBI.constrainGenericRegister(Reg: CCReg, RC: AMDGPU::SReg_32RegClass, MRI&: *MRI);
1321 I.eraseFromParent();
1322 return Ret;
1323 }
1324
1325 if (I.getOpcode() == AMDGPU::G_FCMP)
1326 return false;
1327
1328 int Opcode = getV_CMPOpcode(P: Pred, Size, ST: *Subtarget);
1329 if (Opcode == -1)
1330 return false;
1331
1332 MachineInstr *ICmp = BuildMI(BB&: *BB, I: &I, MIMD: DL, MCID: TII.get(Opcode),
1333 DestReg: I.getOperand(i: 0).getReg())
1334 .add(MO: I.getOperand(i: 2))
1335 .add(MO: I.getOperand(i: 3));
1336 RBI.constrainGenericRegister(Reg: ICmp->getOperand(i: 0).getReg(),
1337 RC: *TRI.getBoolRC(), MRI&: *MRI);
1338 bool Ret = constrainSelectedInstRegOperands(I&: *ICmp, TII, TRI, RBI);
1339 I.eraseFromParent();
1340 return Ret;
1341}
1342
1343bool AMDGPUInstructionSelector::selectIntrinsicCmp(MachineInstr &I) const {
1344 Register Dst = I.getOperand(i: 0).getReg();
1345 if (isVCC(Reg: Dst, MRI: *MRI))
1346 return false;
1347
1348 LLT DstTy = MRI->getType(Reg: Dst);
1349 if (DstTy.getSizeInBits() != STI.getWavefrontSize())
1350 return false;
1351
1352 MachineBasicBlock *BB = I.getParent();
1353 const DebugLoc &DL = I.getDebugLoc();
1354 Register SrcReg = I.getOperand(i: 2).getReg();
1355 unsigned Size = RBI.getSizeInBits(Reg: SrcReg, MRI: *MRI, TRI);
1356
1357 // i1 inputs are not supported in GlobalISel.
1358 if (Size == 1)
1359 return false;
1360
1361 auto Pred = static_cast<CmpInst::Predicate>(I.getOperand(i: 4).getImm());
1362 if (!CmpInst::isIntPredicate(P: Pred) && !CmpInst::isFPPredicate(P: Pred)) {
1363 BuildMI(BB&: *BB, I: &I, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::IMPLICIT_DEF), DestReg: Dst);
1364 I.eraseFromParent();
1365 return RBI.constrainGenericRegister(Reg: Dst, RC: *TRI.getBoolRC(), MRI&: *MRI);
1366 }
1367
1368 const int Opcode = getV_CMPOpcode(P: Pred, Size, ST: *Subtarget);
1369 if (Opcode == -1)
1370 return false;
1371
1372 MachineInstrBuilder SelectedMI;
1373 MachineOperand &LHS = I.getOperand(i: 2);
1374 MachineOperand &RHS = I.getOperand(i: 3);
1375 auto [Src0, Src0Mods] = selectVOP3ModsImpl(Root&: LHS);
1376 auto [Src1, Src1Mods] = selectVOP3ModsImpl(Root&: RHS);
1377 Register Src0Reg =
1378 copyToVGPRIfSrcFolded(Src: Src0, Mods: Src0Mods, Root: LHS, InsertPt: &I, /*ForceVGPR*/ true);
1379 Register Src1Reg =
1380 copyToVGPRIfSrcFolded(Src: Src1, Mods: Src1Mods, Root: RHS, InsertPt: &I, /*ForceVGPR*/ true);
1381 SelectedMI = BuildMI(BB&: *BB, I: &I, MIMD: DL, MCID: TII.get(Opcode), DestReg: Dst);
1382 if (AMDGPU::hasNamedOperand(Opcode, NamedIdx: AMDGPU::OpName::src0_modifiers))
1383 SelectedMI.addImm(Val: Src0Mods);
1384 SelectedMI.addReg(RegNo: Src0Reg);
1385 if (AMDGPU::hasNamedOperand(Opcode, NamedIdx: AMDGPU::OpName::src1_modifiers))
1386 SelectedMI.addImm(Val: Src1Mods);
1387 SelectedMI.addReg(RegNo: Src1Reg);
1388 if (AMDGPU::hasNamedOperand(Opcode, NamedIdx: AMDGPU::OpName::clamp))
1389 SelectedMI.addImm(Val: 0); // clamp
1390 if (AMDGPU::hasNamedOperand(Opcode, NamedIdx: AMDGPU::OpName::op_sel))
1391 SelectedMI.addImm(Val: 0); // op_sel
1392
1393 RBI.constrainGenericRegister(Reg: Dst, RC: *TRI.getBoolRC(), MRI&: *MRI);
1394 if (!constrainSelectedInstRegOperands(I&: *SelectedMI, TII, TRI, RBI))
1395 return false;
1396
1397 I.eraseFromParent();
1398 return true;
1399}
1400
1401bool AMDGPUInstructionSelector::selectBallot(MachineInstr &I) const {
1402 MachineBasicBlock *BB = I.getParent();
1403 const DebugLoc &DL = I.getDebugLoc();
1404 Register DstReg = I.getOperand(i: 0).getReg();
1405 const unsigned Size = MRI->getType(Reg: DstReg).getSizeInBits();
1406 const bool Is64 = Size == 64;
1407 const bool IsWave32 = (STI.getWavefrontSize() == 32);
1408
1409 // In the common case, the return type matches the wave size.
1410 // However we also support emitting i64 ballots in wave32 mode.
1411 if (Size != STI.getWavefrontSize() && (!Is64 || !IsWave32))
1412 return false;
1413
1414 std::optional<ValueAndVReg> Arg =
1415 getIConstantVRegValWithLookThrough(VReg: I.getOperand(i: 2).getReg(), MRI: *MRI);
1416
1417 const auto BuildCopy = [&](Register SrcReg) {
1418 if (Size == STI.getWavefrontSize()) {
1419 BuildMI(BB&: *BB, I: &I, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::COPY), DestReg: DstReg)
1420 .addReg(RegNo: SrcReg);
1421 return;
1422 }
1423
1424 // If emitting a i64 ballot in wave32, fill the upper bits with zeroes.
1425 Register HiReg = MRI->createVirtualRegister(RegClass: &AMDGPU::SReg_32RegClass);
1426 BuildMI(BB&: *BB, I: &I, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::S_MOV_B32), DestReg: HiReg).addImm(Val: 0);
1427 BuildMI(BB&: *BB, I: &I, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::REG_SEQUENCE), DestReg: DstReg)
1428 .addReg(RegNo: SrcReg)
1429 .addImm(Val: AMDGPU::sub0)
1430 .addReg(RegNo: HiReg)
1431 .addImm(Val: AMDGPU::sub1);
1432 };
1433
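  // Fold a constant ballot argument: ballot of zero becomes a zero mask, and
  // ballot of an all-ones (true) value becomes a copy of the exec mask.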
1434 if (Arg) {
1435 const int64_t Value = Arg->Value.getSExtValue();
1436 if (Value == 0) {
1437 unsigned Opcode = Is64 ? AMDGPU::S_MOV_B64 : AMDGPU::S_MOV_B32;
1438 BuildMI(BB&: *BB, I: &I, MIMD: DL, MCID: TII.get(Opcode), DestReg: DstReg).addImm(Val: 0);
1439 } else if (Value == -1) // all ones
1440 BuildCopy(IsWave32 ? AMDGPU::EXEC_LO : AMDGPU::EXEC);
1441 else
1442 return false;
1443 } else
1444 BuildCopy(I.getOperand(i: 2).getReg());
1445
1446 I.eraseFromParent();
1447 return true;
1448}
1449
1450bool AMDGPUInstructionSelector::selectRelocConstant(MachineInstr &I) const {
1451 Register DstReg = I.getOperand(i: 0).getReg();
1452 const RegisterBank *DstBank = RBI.getRegBank(Reg: DstReg, MRI: *MRI, TRI);
1453 const TargetRegisterClass *DstRC = TRI.getRegClassForSizeOnBank(Size: 32, Bank: *DstBank);
1454 if (!DstRC || !RBI.constrainGenericRegister(Reg: DstReg, RC: *DstRC, MRI&: *MRI))
1455 return false;
1456
1457 const bool IsVALU = DstBank->getID() == AMDGPU::VGPRRegBankID;
1458
1459 Module *M = MF->getFunction().getParent();
1460 const MDNode *Metadata = I.getOperand(i: 2).getMetadata();
1461 auto SymbolName = cast<MDString>(Val: Metadata->getOperand(I: 0))->getString();
1462 auto RelocSymbol = cast<GlobalVariable>(
1463 Val: M->getOrInsertGlobal(Name: SymbolName, Ty: Type::getInt32Ty(C&: M->getContext())));
1464
1465 MachineBasicBlock *BB = I.getParent();
1466 BuildMI(BB&: *BB, I: &I, MIMD: I.getDebugLoc(),
1467 MCID: TII.get(Opcode: IsVALU ? AMDGPU::V_MOV_B32_e32 : AMDGPU::S_MOV_B32), DestReg: DstReg)
1468 .addGlobalAddress(GV: RelocSymbol, Offset: 0, TargetFlags: SIInstrInfo::MO_ABS32_LO);
1469
1470 I.eraseFromParent();
1471 return true;
1472}
1473
1474bool AMDGPUInstructionSelector::selectGroupStaticSize(MachineInstr &I) const {
1475 Triple::OSType OS = MF->getTarget().getTargetTriple().getOS();
1476
1477 Register DstReg = I.getOperand(i: 0).getReg();
1478 const RegisterBank *DstRB = RBI.getRegBank(Reg: DstReg, MRI: *MRI, TRI);
1479 unsigned Mov = DstRB->getID() == AMDGPU::SGPRRegBankID ?
1480 AMDGPU::S_MOV_B32 : AMDGPU::V_MOV_B32_e32;
1481
1482 MachineBasicBlock *MBB = I.getParent();
1483 const DebugLoc &DL = I.getDebugLoc();
1484
1485 auto MIB = BuildMI(BB&: *MBB, I: &I, MIMD: DL, MCID: TII.get(Opcode: Mov), DestReg: DstReg);
1486
1487 if (OS == Triple::AMDHSA || OS == Triple::AMDPAL) {
1488 const SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
1489 MIB.addImm(Val: MFI->getLDSSize());
1490 } else {
1491 Module *M = MF->getFunction().getParent();
1492 const GlobalValue *GV
1493 = Intrinsic::getDeclaration(M, id: Intrinsic::amdgcn_groupstaticsize);
1494 MIB.addGlobalAddress(GV, Offset: 0, TargetFlags: SIInstrInfo::MO_ABS32_LO);
1495 }
1496
1497 I.eraseFromParent();
1498 return constrainSelectedInstRegOperands(I&: *MIB, TII, TRI, RBI);
1499}
1500
1501bool AMDGPUInstructionSelector::selectReturnAddress(MachineInstr &I) const {
1502 MachineBasicBlock *MBB = I.getParent();
1503 MachineFunction &MF = *MBB->getParent();
1504 const DebugLoc &DL = I.getDebugLoc();
1505
1506 MachineOperand &Dst = I.getOperand(i: 0);
1507 Register DstReg = Dst.getReg();
1508 unsigned Depth = I.getOperand(i: 2).getImm();
1509
1510 const TargetRegisterClass *RC
1511 = TRI.getConstrainedRegClassForOperand(MO: Dst, MRI: *MRI);
1512 if (!RC->hasSubClassEq(RC: &AMDGPU::SGPR_64RegClass) ||
1513 !RBI.constrainGenericRegister(Reg: DstReg, RC: *RC, MRI&: *MRI))
1514 return false;
1515
1516 // Check for kernel and shader functions
1517 if (Depth != 0 ||
1518 MF.getInfo<SIMachineFunctionInfo>()->isEntryFunction()) {
1519 BuildMI(BB&: *MBB, I: &I, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::S_MOV_B64), DestReg: DstReg)
1520 .addImm(Val: 0);
1521 I.eraseFromParent();
1522 return true;
1523 }
1524
1525 MachineFrameInfo &MFI = MF.getFrameInfo();
1526 // There is a call to @llvm.returnaddress in this function
1527 MFI.setReturnAddressIsTaken(true);
1528
1529 // Get the return address reg and mark it as an implicit live-in
1530 Register ReturnAddrReg = TRI.getReturnAddressReg(MF);
1531 Register LiveIn = getFunctionLiveInPhysReg(MF, TII, PhysReg: ReturnAddrReg,
1532 RC: AMDGPU::SReg_64RegClass, DL);
1533 BuildMI(BB&: *MBB, I: &I, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::COPY), DestReg: DstReg)
1534 .addReg(RegNo: LiveIn);
1535 I.eraseFromParent();
1536 return true;
1537}
1538
1539bool AMDGPUInstructionSelector::selectEndCfIntrinsic(MachineInstr &MI) const {
1540 // FIXME: Manually selecting to avoid dealing with the SReg_1 trick
1541 // SelectionDAG uses for wave32 vs wave64.
1542 MachineBasicBlock *BB = MI.getParent();
1543 BuildMI(BB&: *BB, I: &MI, MIMD: MI.getDebugLoc(), MCID: TII.get(Opcode: AMDGPU::SI_END_CF))
1544 .add(MO: MI.getOperand(i: 1));
1545
1546 Register Reg = MI.getOperand(i: 1).getReg();
1547 MI.eraseFromParent();
1548
1549 if (!MRI->getRegClassOrNull(Reg))
1550 MRI->setRegClass(Reg, RC: TRI.getWaveMaskRegClass());
1551 return true;
1552}
1553
1554bool AMDGPUInstructionSelector::selectDSOrderedIntrinsic(
1555 MachineInstr &MI, Intrinsic::ID IntrID) const {
1556 MachineBasicBlock *MBB = MI.getParent();
1557 MachineFunction *MF = MBB->getParent();
1558 const DebugLoc &DL = MI.getDebugLoc();
1559
1560 unsigned IndexOperand = MI.getOperand(i: 7).getImm();
1561 bool WaveRelease = MI.getOperand(i: 8).getImm() != 0;
1562 bool WaveDone = MI.getOperand(i: 9).getImm() != 0;
1563
1564 if (WaveDone && !WaveRelease)
1565 report_fatal_error(reason: "ds_ordered_count: wave_done requires wave_release");
1566
1567 unsigned OrderedCountIndex = IndexOperand & 0x3f;
1568 IndexOperand &= ~0x3f;
1569 unsigned CountDw = 0;
1570
1571 if (STI.getGeneration() >= AMDGPUSubtarget::GFX10) {
1572 CountDw = (IndexOperand >> 24) & 0xf;
1573 IndexOperand &= ~(0xf << 24);
1574
1575 if (CountDw < 1 || CountDw > 4) {
1576 report_fatal_error(
1577 reason: "ds_ordered_count: dword count must be between 1 and 4");
1578 }
1579 }
1580
1581 if (IndexOperand)
1582 report_fatal_error(reason: "ds_ordered_count: bad index operand");
1583
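  // Pack the DS_ORDERED_COUNT offset field: offset0 carries the ordered count
  // index, offset1 carries wave_release/wave_done, the operation kind, the
  // shader type (pre-GFX11), and the dword count minus one (GFX10+).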
1584 unsigned Instruction = IntrID == Intrinsic::amdgcn_ds_ordered_add ? 0 : 1;
1585 unsigned ShaderType = SIInstrInfo::getDSShaderTypeValue(MF: *MF);
1586
1587 unsigned Offset0 = OrderedCountIndex << 2;
1588 unsigned Offset1 = WaveRelease | (WaveDone << 1) | (Instruction << 4);
1589
1590 if (STI.getGeneration() >= AMDGPUSubtarget::GFX10)
1591 Offset1 |= (CountDw - 1) << 6;
1592
1593 if (STI.getGeneration() < AMDGPUSubtarget::GFX11)
1594 Offset1 |= ShaderType << 2;
1595
1596 unsigned Offset = Offset0 | (Offset1 << 8);
1597
1598 Register M0Val = MI.getOperand(i: 2).getReg();
1599 BuildMI(BB&: *MBB, I: &MI, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::COPY), DestReg: AMDGPU::M0)
1600 .addReg(RegNo: M0Val);
1601
1602 Register DstReg = MI.getOperand(i: 0).getReg();
1603 Register ValReg = MI.getOperand(i: 3).getReg();
1604 MachineInstrBuilder DS =
1605 BuildMI(BB&: *MBB, I: &MI, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::DS_ORDERED_COUNT), DestReg: DstReg)
1606 .addReg(RegNo: ValReg)
1607 .addImm(Val: Offset)
1608 .cloneMemRefs(OtherMI: MI);
1609
1610 if (!RBI.constrainGenericRegister(Reg: M0Val, RC: AMDGPU::SReg_32RegClass, MRI&: *MRI))
1611 return false;
1612
1613 bool Ret = constrainSelectedInstRegOperands(I&: *DS, TII, TRI, RBI);
1614 MI.eraseFromParent();
1615 return Ret;
1616}
1617
1618static unsigned gwsIntrinToOpcode(unsigned IntrID) {
1619 switch (IntrID) {
1620 case Intrinsic::amdgcn_ds_gws_init:
1621 return AMDGPU::DS_GWS_INIT;
1622 case Intrinsic::amdgcn_ds_gws_barrier:
1623 return AMDGPU::DS_GWS_BARRIER;
1624 case Intrinsic::amdgcn_ds_gws_sema_v:
1625 return AMDGPU::DS_GWS_SEMA_V;
1626 case Intrinsic::amdgcn_ds_gws_sema_br:
1627 return AMDGPU::DS_GWS_SEMA_BR;
1628 case Intrinsic::amdgcn_ds_gws_sema_p:
1629 return AMDGPU::DS_GWS_SEMA_P;
1630 case Intrinsic::amdgcn_ds_gws_sema_release_all:
1631 return AMDGPU::DS_GWS_SEMA_RELEASE_ALL;
1632 default:
1633 llvm_unreachable("not a gws intrinsic");
1634 }
1635}
1636
1637bool AMDGPUInstructionSelector::selectDSGWSIntrinsic(MachineInstr &MI,
1638 Intrinsic::ID IID) const {
1639 if (!STI.hasGWS() || (IID == Intrinsic::amdgcn_ds_gws_sema_release_all &&
1640 !STI.hasGWSSemaReleaseAll()))
1641 return false;
1642
1643 // intrinsic ID, vsrc, offset
1644 const bool HasVSrc = MI.getNumOperands() == 3;
1645 assert(HasVSrc || MI.getNumOperands() == 2);
1646
1647 Register BaseOffset = MI.getOperand(i: HasVSrc ? 2 : 1).getReg();
1648 const RegisterBank *OffsetRB = RBI.getRegBank(Reg: BaseOffset, MRI: *MRI, TRI);
1649 if (OffsetRB->getID() != AMDGPU::SGPRRegBankID)
1650 return false;
1651
1652 MachineInstr *OffsetDef = getDefIgnoringCopies(Reg: BaseOffset, MRI: *MRI);
1653 unsigned ImmOffset;
1654
1655 MachineBasicBlock *MBB = MI.getParent();
1656 const DebugLoc &DL = MI.getDebugLoc();
1657
1658 MachineInstr *Readfirstlane = nullptr;
1659
1660 // If we legalized the VGPR input, strip out the readfirstlane to analyze the
1661 // incoming offset, in case there's an add of a constant. We'll have to put it
1662 // back later.
1663 if (OffsetDef->getOpcode() == AMDGPU::V_READFIRSTLANE_B32) {
1664 Readfirstlane = OffsetDef;
1665 BaseOffset = OffsetDef->getOperand(i: 1).getReg();
1666 OffsetDef = getDefIgnoringCopies(Reg: BaseOffset, MRI: *MRI);
1667 }
1668
1669 if (OffsetDef->getOpcode() == AMDGPU::G_CONSTANT) {
1670 // If we have a constant offset, try to use the 0 in m0 as the base.
1671 // TODO: Look into changing the default m0 initialization value. If the
1672 // default -1 only sets the low 16 bits, we could leave it as-is and add 1 to
1673 // the immediate offset.
1674
1675 ImmOffset = OffsetDef->getOperand(i: 1).getCImm()->getZExtValue();
1676 BuildMI(BB&: *MBB, I: &MI, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::S_MOV_B32), DestReg: AMDGPU::M0)
1677 .addImm(Val: 0);
1678 } else {
1679 std::tie(args&: BaseOffset, args&: ImmOffset) =
1680 AMDGPU::getBaseWithConstantOffset(MRI&: *MRI, Reg: BaseOffset, KnownBits: KB);
1681
1682 if (Readfirstlane) {
1683 // We have the constant offset now, so put the readfirstlane back on the
1684 // variable component.
1685 if (!RBI.constrainGenericRegister(Reg: BaseOffset, RC: AMDGPU::VGPR_32RegClass, MRI&: *MRI))
1686 return false;
1687
1688 Readfirstlane->getOperand(i: 1).setReg(BaseOffset);
1689 BaseOffset = Readfirstlane->getOperand(i: 0).getReg();
1690 } else {
1691 if (!RBI.constrainGenericRegister(Reg: BaseOffset,
1692 RC: AMDGPU::SReg_32RegClass, MRI&: *MRI))
1693 return false;
1694 }
1695
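  // Shift the variable offset into the high half of M0; per the comment below,
  // the resource id is taken from M0[21:16].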
1696 Register M0Base = MRI->createVirtualRegister(RegClass: &AMDGPU::SReg_32RegClass);
1697 BuildMI(BB&: *MBB, I: &MI, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::S_LSHL_B32), DestReg: M0Base)
1698 .addReg(RegNo: BaseOffset)
1699 .addImm(Val: 16)
1700 .setOperandDead(3); // Dead scc
1701
1702 BuildMI(BB&: *MBB, I: &MI, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::COPY), DestReg: AMDGPU::M0)
1703 .addReg(RegNo: M0Base);
1704 }
1705
1706 // The resource id offset is computed as (<isa opaque base> + M0[21:16] +
1707 // offset field) % 64. Some versions of the programming guide omit the m0
1708 // part, or claim it's from offset 0.
1709 auto MIB = BuildMI(BB&: *MBB, I: &MI, MIMD: DL, MCID: TII.get(Opcode: gwsIntrinToOpcode(IntrID: IID)));
1710
1711 if (HasVSrc) {
1712 Register VSrc = MI.getOperand(i: 1).getReg();
1713 MIB.addReg(RegNo: VSrc);
1714
1715 if (!RBI.constrainGenericRegister(Reg: VSrc, RC: AMDGPU::VGPR_32RegClass, MRI&: *MRI))
1716 return false;
1717 }
1718
1719 MIB.addImm(Val: ImmOffset)
1720 .cloneMemRefs(OtherMI: MI);
1721
1722 TII.enforceOperandRCAlignment(MI&: *MIB, OpName: AMDGPU::OpName::data0);
1723
1724 MI.eraseFromParent();
1725 return true;
1726}
1727
1728bool AMDGPUInstructionSelector::selectDSAppendConsume(MachineInstr &MI,
1729 bool IsAppend) const {
1730 Register PtrBase = MI.getOperand(i: 2).getReg();
1731 LLT PtrTy = MRI->getType(Reg: PtrBase);
1732 bool IsGDS = PtrTy.getAddressSpace() == AMDGPUAS::REGION_ADDRESS;
1733
1734 unsigned Offset;
1735 std::tie(args&: PtrBase, args&: Offset) = selectDS1Addr1OffsetImpl(Root&: MI.getOperand(i: 2));
1736
1737 // TODO: Should this try to look through readfirstlane like GWS?
1738 if (!isDSOffsetLegal(Base: PtrBase, Offset)) {
1739 PtrBase = MI.getOperand(i: 2).getReg();
1740 Offset = 0;
1741 }
1742
1743 MachineBasicBlock *MBB = MI.getParent();
1744 const DebugLoc &DL = MI.getDebugLoc();
1745 const unsigned Opc = IsAppend ? AMDGPU::DS_APPEND : AMDGPU::DS_CONSUME;
1746
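  // DS_APPEND/DS_CONSUME take the base address in M0, with any remaining
  // constant offset going into the immediate offset field.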
1747 BuildMI(BB&: *MBB, I: &MI, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::COPY), DestReg: AMDGPU::M0)
1748 .addReg(RegNo: PtrBase);
1749 if (!RBI.constrainGenericRegister(Reg: PtrBase, RC: AMDGPU::SReg_32RegClass, MRI&: *MRI))
1750 return false;
1751
1752 auto MIB = BuildMI(BB&: *MBB, I: &MI, MIMD: DL, MCID: TII.get(Opcode: Opc), DestReg: MI.getOperand(i: 0).getReg())
1753 .addImm(Val: Offset)
1754 .addImm(Val: IsGDS ? -1 : 0)
1755 .cloneMemRefs(OtherMI: MI);
1756 MI.eraseFromParent();
1757 return constrainSelectedInstRegOperands(I&: *MIB, TII, TRI, RBI);
1758}
1759
1760bool AMDGPUInstructionSelector::selectSBarrier(MachineInstr &MI) const {
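  // If the whole workgroup fits in a single wave, the barrier is effectively a
  // no-op; emit only a WAVE_BARRIER pseudo so code motion across it is still
  // restricted.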
1761 if (TM.getOptLevel() > CodeGenOptLevel::None) {
1762 unsigned WGSize = STI.getFlatWorkGroupSizes(F: MF->getFunction()).second;
1763 if (WGSize <= STI.getWavefrontSize()) {
1764 MachineBasicBlock *MBB = MI.getParent();
1765 const DebugLoc &DL = MI.getDebugLoc();
1766 BuildMI(BB&: *MBB, I: &MI, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::WAVE_BARRIER));
1767 MI.eraseFromParent();
1768 return true;
1769 }
1770 }
1771
1772 // On GFX12, lower s_barrier into s_barrier_signal_imm and s_barrier_wait.
1773 if (STI.hasSplitBarriers()) {
1774 MachineBasicBlock *MBB = MI.getParent();
1775 const DebugLoc &DL = MI.getDebugLoc();
1776 BuildMI(BB&: *MBB, I: &MI, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::S_BARRIER_SIGNAL_IMM))
1777 .addImm(Val: AMDGPU::Barrier::WORKGROUP);
1778 BuildMI(BB&: *MBB, I: &MI, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::S_BARRIER_WAIT))
1779 .addImm(Val: AMDGPU::Barrier::WORKGROUP);
1780 MI.eraseFromParent();
1781 return true;
1782 }
1783
1784 return selectImpl(I&: MI, CoverageInfo&: *CoverageInfo);
1785}
1786
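// Decode the texfailctrl immediate: any nonzero value marks a tex-fail
// variant; bit 0 enables TFE and bit 1 enables LWE. Returns false if any
// unknown bits are set.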
1787static bool parseTexFail(uint64_t TexFailCtrl, bool &TFE, bool &LWE,
1788 bool &IsTexFail) {
1789 if (TexFailCtrl)
1790 IsTexFail = true;
1791
1792 TFE = (TexFailCtrl & 0x1) != 0;
1793 TexFailCtrl &= ~(uint64_t)0x1;
1794 LWE = (TexFailCtrl & 0x2) != 0;
1795 TexFailCtrl &= ~(uint64_t)0x2;
1796
1797 return TexFailCtrl == 0;
1798}
1799
1800bool AMDGPUInstructionSelector::selectImageIntrinsic(
1801 MachineInstr &MI, const AMDGPU::ImageDimIntrinsicInfo *Intr) const {
1802 MachineBasicBlock *MBB = MI.getParent();
1803 const DebugLoc &DL = MI.getDebugLoc();
1804
1805 const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
1806 AMDGPU::getMIMGBaseOpcodeInfo(BaseOpcode: Intr->BaseOpcode);
1807
1808 const AMDGPU::MIMGDimInfo *DimInfo = AMDGPU::getMIMGDimInfo(DimEnum: Intr->Dim);
1809 unsigned IntrOpcode = Intr->BaseOpcode;
1810 const bool IsGFX10Plus = AMDGPU::isGFX10Plus(STI);
1811 const bool IsGFX11Plus = AMDGPU::isGFX11Plus(STI);
1812 const bool IsGFX12Plus = AMDGPU::isGFX12Plus(STI);
1813
1814 const unsigned ArgOffset = MI.getNumExplicitDefs() + 1;
1815
1816 Register VDataIn, VDataOut;
1817 LLT VDataTy;
1818 int NumVDataDwords = -1;
1819 bool IsD16 = MI.getOpcode() == AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD_D16 ||
1820 MI.getOpcode() == AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE_D16;
1821
1822 bool Unorm;
1823 if (!BaseOpcode->Sampler)
1824 Unorm = true;
1825 else
1826 Unorm = MI.getOperand(i: ArgOffset + Intr->UnormIndex).getImm() != 0;
1827
1828 bool TFE;
1829 bool LWE;
1830 bool IsTexFail = false;
1831 if (!parseTexFail(TexFailCtrl: MI.getOperand(i: ArgOffset + Intr->TexFailCtrlIndex).getImm(),
1832 TFE, LWE, IsTexFail))
1833 return false;
1834
1835 const int Flags = MI.getOperand(i: ArgOffset + Intr->NumArgs).getImm();
1836 const bool IsA16 = (Flags & 1) != 0;
1837 const bool IsG16 = (Flags & 2) != 0;
1838
1839 // A16 implies 16-bit gradients if the subtarget doesn't support G16.
1840 if (IsA16 && !STI.hasG16() && !IsG16)
1841 return false;
1842
1843 unsigned DMask = 0;
1844 unsigned DMaskLanes = 0;
1845
1846 if (BaseOpcode->Atomic) {
1847 VDataOut = MI.getOperand(i: 0).getReg();
1848 VDataIn = MI.getOperand(i: 2).getReg();
1849 LLT Ty = MRI->getType(Reg: VDataIn);
1850
1851 // Be careful to allow atomic swap on 16-bit element vectors.
1852 const bool Is64Bit = BaseOpcode->AtomicX2 ?
1853 Ty.getSizeInBits() == 128 :
1854 Ty.getSizeInBits() == 64;
1855
1856 if (BaseOpcode->AtomicX2) {
1857 assert(MI.getOperand(3).getReg() == AMDGPU::NoRegister);
1858
1859 DMask = Is64Bit ? 0xf : 0x3;
1860 NumVDataDwords = Is64Bit ? 4 : 2;
1861 } else {
1862 DMask = Is64Bit ? 0x3 : 0x1;
1863 NumVDataDwords = Is64Bit ? 2 : 1;
1864 }
1865 } else {
1866 DMask = MI.getOperand(i: ArgOffset + Intr->DMaskIndex).getImm();
1867 DMaskLanes = BaseOpcode->Gather4 ? 4 : llvm::popcount(Value: DMask);
1868
1869 if (BaseOpcode->Store) {
1870 VDataIn = MI.getOperand(i: 1).getReg();
1871 VDataTy = MRI->getType(Reg: VDataIn);
1872 NumVDataDwords = (VDataTy.getSizeInBits() + 31) / 32;
1873 } else if (BaseOpcode->NoReturn) {
1874 NumVDataDwords = 0;
1875 } else {
1876 VDataOut = MI.getOperand(i: 0).getReg();
1877 VDataTy = MRI->getType(Reg: VDataOut);
1878 NumVDataDwords = DMaskLanes;
1879
1880 if (IsD16 && !STI.hasUnpackedD16VMem())
1881 NumVDataDwords = (DMaskLanes + 1) / 2;
1882 }
1883 }
1884
1885 // Set G16 opcode
1886 if (Subtarget->hasG16() && IsG16) {
1887 const AMDGPU::MIMGG16MappingInfo *G16MappingInfo =
1888 AMDGPU::getMIMGG16MappingInfo(G: Intr->BaseOpcode);
1889 assert(G16MappingInfo);
1890 IntrOpcode = G16MappingInfo->G16; // set opcode to variant with _g16
1891 }
1892
1893 // TODO: Check this in verifier.
1894 assert((!IsTexFail || DMaskLanes >= 1) && "should have legalized this");
1895
1896 unsigned CPol = MI.getOperand(i: ArgOffset + Intr->CachePolicyIndex).getImm();
1897 if (BaseOpcode->Atomic)
1898 CPol |= AMDGPU::CPol::GLC; // TODO no-return optimization
1899 if (CPol & ~((IsGFX12Plus ? AMDGPU::CPol::ALL : AMDGPU::CPol::ALL_pregfx12) |
1900 AMDGPU::CPol::VOLATILE))
1901 return false;
1902
1903 int NumVAddrRegs = 0;
1904 int NumVAddrDwords = 0;
1905 for (unsigned I = Intr->VAddrStart; I < Intr->VAddrEnd; I++) {
1906 // Skip the $noregs and 0s inserted during legalization.
1907 MachineOperand &AddrOp = MI.getOperand(i: ArgOffset + I);
1908 if (!AddrOp.isReg())
1909 continue; // XXX - Break?
1910
1911 Register Addr = AddrOp.getReg();
1912 if (!Addr)
1913 break;
1914
1915 ++NumVAddrRegs;
1916 NumVAddrDwords += (MRI->getType(Reg: Addr).getSizeInBits() + 31) / 32;
1917 }
1918
1919 // The legalizer preprocessed the intrinsic arguments. If we aren't using
1920 // NSA, these should have been packed into a single value in the first
1921 // address register
1922 const bool UseNSA =
1923 NumVAddrRegs != 1 &&
1924 (STI.hasPartialNSAEncoding() ? NumVAddrDwords >= NumVAddrRegs
1925 : NumVAddrDwords == NumVAddrRegs);
1926 if (UseNSA && !STI.hasFeature(Feature: AMDGPU::FeatureNSAEncoding)) {
1927 LLVM_DEBUG(dbgs() << "Trying to use NSA on non-NSA target\n");
1928 return false;
1929 }
1930
1931 if (IsTexFail)
1932 ++NumVDataDwords;
1933
1934 int Opcode = -1;
1935 if (IsGFX12Plus) {
1936 Opcode = AMDGPU::getMIMGOpcode(BaseOpcode: IntrOpcode, MIMGEncoding: AMDGPU::MIMGEncGfx12,
1937 VDataDwords: NumVDataDwords, VAddrDwords: NumVAddrDwords);
1938 } else if (IsGFX11Plus) {
1939 Opcode = AMDGPU::getMIMGOpcode(BaseOpcode: IntrOpcode,
1940 MIMGEncoding: UseNSA ? AMDGPU::MIMGEncGfx11NSA
1941 : AMDGPU::MIMGEncGfx11Default,
1942 VDataDwords: NumVDataDwords, VAddrDwords: NumVAddrDwords);
1943 } else if (IsGFX10Plus) {
1944 Opcode = AMDGPU::getMIMGOpcode(BaseOpcode: IntrOpcode,
1945 MIMGEncoding: UseNSA ? AMDGPU::MIMGEncGfx10NSA
1946 : AMDGPU::MIMGEncGfx10Default,
1947 VDataDwords: NumVDataDwords, VAddrDwords: NumVAddrDwords);
1948 } else {
1949 if (Subtarget->hasGFX90AInsts()) {
1950 Opcode = AMDGPU::getMIMGOpcode(BaseOpcode: IntrOpcode, MIMGEncoding: AMDGPU::MIMGEncGfx90a,
1951 VDataDwords: NumVDataDwords, VAddrDwords: NumVAddrDwords);
1952 if (Opcode == -1) {
1953 LLVM_DEBUG(
1954 dbgs()
1955 << "requested image instruction is not supported on this GPU\n");
1956 return false;
1957 }
1958 }
1959 if (Opcode == -1 &&
1960 STI.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
1961 Opcode = AMDGPU::getMIMGOpcode(BaseOpcode: IntrOpcode, MIMGEncoding: AMDGPU::MIMGEncGfx8,
1962 VDataDwords: NumVDataDwords, VAddrDwords: NumVAddrDwords);
1963 if (Opcode == -1)
1964 Opcode = AMDGPU::getMIMGOpcode(BaseOpcode: IntrOpcode, MIMGEncoding: AMDGPU::MIMGEncGfx6,
1965 VDataDwords: NumVDataDwords, VAddrDwords: NumVAddrDwords);
1966 }
1967 if (Opcode == -1)
1968 return false;
1969
1970 auto MIB = BuildMI(BB&: *MBB, I: &MI, MIMD: DL, MCID: TII.get(Opcode))
1971 .cloneMemRefs(OtherMI: MI);
1972
1973 if (VDataOut) {
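    // X2 atomics return a wider temporary; only the low half of the temporary
    // is copied back into the original result register (if it has any uses).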
1974 if (BaseOpcode->AtomicX2) {
1975 const bool Is64 = MRI->getType(Reg: VDataOut).getSizeInBits() == 64;
1976
1977 Register TmpReg = MRI->createVirtualRegister(
1978 RegClass: Is64 ? &AMDGPU::VReg_128RegClass : &AMDGPU::VReg_64RegClass);
1979 unsigned SubReg = Is64 ? AMDGPU::sub0_sub1 : AMDGPU::sub0;
1980
1981 MIB.addDef(RegNo: TmpReg);
1982 if (!MRI->use_empty(RegNo: VDataOut)) {
1983 BuildMI(BB&: *MBB, I: &MI, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::COPY), DestReg: VDataOut)
1984 .addReg(RegNo: TmpReg, flags: RegState::Kill, SubReg);
1985 }
1986
1987 } else {
1988 MIB.addDef(RegNo: VDataOut); // vdata output
1989 }
1990 }
1991
1992 if (VDataIn)
1993 MIB.addReg(RegNo: VDataIn); // vdata input
1994
1995 for (int I = 0; I != NumVAddrRegs; ++I) {
1996 MachineOperand &SrcOp = MI.getOperand(i: ArgOffset + Intr->VAddrStart + I);
1997 if (SrcOp.isReg()) {
1998 assert(SrcOp.getReg() != 0);
1999 MIB.addReg(RegNo: SrcOp.getReg());
2000 }
2001 }
2002
2003 MIB.addReg(RegNo: MI.getOperand(i: ArgOffset + Intr->RsrcIndex).getReg());
2004 if (BaseOpcode->Sampler)
2005 MIB.addReg(RegNo: MI.getOperand(i: ArgOffset + Intr->SampIndex).getReg());
2006
2007 MIB.addImm(Val: DMask); // dmask
2008
2009 if (IsGFX10Plus)
2010 MIB.addImm(Val: DimInfo->Encoding);
2011 if (AMDGPU::hasNamedOperand(Opcode, NamedIdx: AMDGPU::OpName::unorm))
2012 MIB.addImm(Val: Unorm);
2013
2014 MIB.addImm(Val: CPol);
2015 MIB.addImm(Val: IsA16 && // a16 or r128
2016 STI.hasFeature(Feature: AMDGPU::FeatureR128A16) ? -1 : 0);
2017 if (IsGFX10Plus)
2018 MIB.addImm(Val: IsA16 ? -1 : 0);
2019
2020 if (!Subtarget->hasGFX90AInsts()) {
2021 MIB.addImm(Val: TFE); // tfe
2022 } else if (TFE) {
2023 LLVM_DEBUG(dbgs() << "TFE is not supported on this GPU\n");
2024 return false;
2025 }
2026
2027 if (AMDGPU::hasNamedOperand(Opcode, NamedIdx: AMDGPU::OpName::lwe))
2028 MIB.addImm(Val: LWE); // lwe
2029 if (!IsGFX10Plus)
2030 MIB.addImm(Val: DimInfo->DA ? -1 : 0);
2031 if (BaseOpcode->HasD16)
2032 MIB.addImm(Val: IsD16 ? -1 : 0);
2033
2034 MI.eraseFromParent();
2035 constrainSelectedInstRegOperands(I&: *MIB, TII, TRI, RBI);
2036 TII.enforceOperandRCAlignment(MI&: *MIB, OpName: AMDGPU::OpName::vaddr);
2037 return true;
2038}
2039
2040// We need to handle this here because tablegen doesn't support matching
2041// instructions with multiple outputs.
2042bool AMDGPUInstructionSelector::selectDSBvhStackIntrinsic(
2043 MachineInstr &MI) const {
2044 Register Dst0 = MI.getOperand(i: 0).getReg();
2045 Register Dst1 = MI.getOperand(i: 1).getReg();
2046
2047 const DebugLoc &DL = MI.getDebugLoc();
2048 MachineBasicBlock *MBB = MI.getParent();
2049
2050 Register Addr = MI.getOperand(i: 3).getReg();
2051 Register Data0 = MI.getOperand(i: 4).getReg();
2052 Register Data1 = MI.getOperand(i: 5).getReg();
2053 unsigned Offset = MI.getOperand(i: 6).getImm();
2054
2055 auto MIB = BuildMI(BB&: *MBB, I: &MI, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::DS_BVH_STACK_RTN_B32), DestReg: Dst0)
2056 .addDef(RegNo: Dst1)
2057 .addUse(RegNo: Addr)
2058 .addUse(RegNo: Data0)
2059 .addUse(RegNo: Data1)
2060 .addImm(Val: Offset)
2061 .cloneMemRefs(OtherMI: MI);
2062
2063 MI.eraseFromParent();
2064 return constrainSelectedInstRegOperands(I&: *MIB, TII, TRI, RBI);
2065}
2066
2067bool AMDGPUInstructionSelector::selectG_INTRINSIC_W_SIDE_EFFECTS(
2068 MachineInstr &I) const {
2069 Intrinsic::ID IntrinsicID = cast<GIntrinsic>(Val&: I).getIntrinsicID();
2070 switch (IntrinsicID) {
2071 case Intrinsic::amdgcn_end_cf:
2072 return selectEndCfIntrinsic(MI&: I);
2073 case Intrinsic::amdgcn_ds_ordered_add:
2074 case Intrinsic::amdgcn_ds_ordered_swap:
2075 return selectDSOrderedIntrinsic(MI&: I, IntrID: IntrinsicID);
2076 case Intrinsic::amdgcn_ds_gws_init:
2077 case Intrinsic::amdgcn_ds_gws_barrier:
2078 case Intrinsic::amdgcn_ds_gws_sema_v:
2079 case Intrinsic::amdgcn_ds_gws_sema_br:
2080 case Intrinsic::amdgcn_ds_gws_sema_p:
2081 case Intrinsic::amdgcn_ds_gws_sema_release_all:
2082 return selectDSGWSIntrinsic(MI&: I, IID: IntrinsicID);
2083 case Intrinsic::amdgcn_ds_append:
2084 return selectDSAppendConsume(MI&: I, IsAppend: true);
2085 case Intrinsic::amdgcn_ds_consume:
2086 return selectDSAppendConsume(MI&: I, IsAppend: false);
2087 case Intrinsic::amdgcn_s_barrier:
2088 return selectSBarrier(MI&: I);
2089 case Intrinsic::amdgcn_raw_buffer_load_lds:
2090 case Intrinsic::amdgcn_raw_ptr_buffer_load_lds:
2091 case Intrinsic::amdgcn_struct_buffer_load_lds:
2092 case Intrinsic::amdgcn_struct_ptr_buffer_load_lds:
2093 return selectBufferLoadLds(MI&: I);
2094 case Intrinsic::amdgcn_global_load_lds:
2095 return selectGlobalLoadLds(MI&: I);
2096 case Intrinsic::amdgcn_exp_compr:
2097 if (!STI.hasCompressedExport()) {
2098 Function &F = I.getMF()->getFunction();
2099 DiagnosticInfoUnsupported NoFpRet(
2100 F, "intrinsic not supported on subtarget", I.getDebugLoc(), DS_Error);
2101 F.getContext().diagnose(DI: NoFpRet);
2102 return false;
2103 }
2104 break;
2105 case Intrinsic::amdgcn_ds_bvh_stack_rtn:
2106 return selectDSBvhStackIntrinsic(MI&: I);
2107 case Intrinsic::amdgcn_s_barrier_init:
2108 case Intrinsic::amdgcn_s_barrier_join:
2109 case Intrinsic::amdgcn_s_wakeup_barrier:
2110 case Intrinsic::amdgcn_s_get_barrier_state:
2111 return selectNamedBarrierInst(I, IID: IntrinsicID);
2112 case Intrinsic::amdgcn_s_barrier_signal_isfirst:
2113 case Intrinsic::amdgcn_s_barrier_signal_isfirst_var:
2114 return selectSBarrierSignalIsfirst(I, IID: IntrinsicID);
2115 case Intrinsic::amdgcn_s_barrier_leave:
2116 return selectSBarrierLeave(I);
2117 }
2118 return selectImpl(I, CoverageInfo&: *CoverageInfo);
2119}
2120
2121bool AMDGPUInstructionSelector::selectG_SELECT(MachineInstr &I) const {
2122 if (selectImpl(I, CoverageInfo&: *CoverageInfo))
2123 return true;
2124
2125 MachineBasicBlock *BB = I.getParent();
2126 const DebugLoc &DL = I.getDebugLoc();
2127
2128 Register DstReg = I.getOperand(i: 0).getReg();
2129 unsigned Size = RBI.getSizeInBits(Reg: DstReg, MRI: *MRI, TRI);
2130 assert(Size <= 32 || Size == 64);
2131 const MachineOperand &CCOp = I.getOperand(i: 1);
2132 Register CCReg = CCOp.getReg();
2133 if (!isVCC(Reg: CCReg, MRI: *MRI)) {
2134 unsigned SelectOpcode = Size == 64 ? AMDGPU::S_CSELECT_B64 :
2135 AMDGPU::S_CSELECT_B32;
2136 MachineInstr *CopySCC = BuildMI(BB&: *BB, I: &I, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::COPY), DestReg: AMDGPU::SCC)
2137 .addReg(RegNo: CCReg);
2138
2139 // The generic constrainSelectedInstRegOperands doesn't work for the scc
2140 // register bank, because it does not cover the register class that we use to
2141 // represent it. So we need to manually set the register class here.
2142 if (!MRI->getRegClassOrNull(Reg: CCReg))
2143 MRI->setRegClass(Reg: CCReg, RC: TRI.getConstrainedRegClassForOperand(MO: CCOp, MRI: *MRI));
2144 MachineInstr *Select = BuildMI(BB&: *BB, I: &I, MIMD: DL, MCID: TII.get(Opcode: SelectOpcode), DestReg: DstReg)
2145 .add(MO: I.getOperand(i: 2))
2146 .add(MO: I.getOperand(i: 3));
2147
2148 bool Ret = false;
2149 Ret |= constrainSelectedInstRegOperands(I&: *Select, TII, TRI, RBI);
2150 Ret |= constrainSelectedInstRegOperands(I&: *CopySCC, TII, TRI, RBI);
2151 I.eraseFromParent();
2152 return Ret;
2153 }
2154
2155 // Wide VGPR select should have been split in RegBankSelect.
2156 if (Size > 32)
2157 return false;
2158
2159 MachineInstr *Select =
2160 BuildMI(BB&: *BB, I: &I, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::V_CNDMASK_B32_e64), DestReg: DstReg)
2161 .addImm(Val: 0)
2162 .add(MO: I.getOperand(i: 3))
2163 .addImm(Val: 0)
2164 .add(MO: I.getOperand(i: 2))
2165 .add(MO: I.getOperand(i: 1));
2166
2167 bool Ret = constrainSelectedInstRegOperands(I&: *Select, TII, TRI, RBI);
2168 I.eraseFromParent();
2169 return Ret;
2170}
2171
2172static int sizeToSubRegIndex(unsigned Size) {
2173 switch (Size) {
2174 case 32:
2175 return AMDGPU::sub0;
2176 case 64:
2177 return AMDGPU::sub0_sub1;
2178 case 96:
2179 return AMDGPU::sub0_sub1_sub2;
2180 case 128:
2181 return AMDGPU::sub0_sub1_sub2_sub3;
2182 case 256:
2183 return AMDGPU::sub0_sub1_sub2_sub3_sub4_sub5_sub6_sub7;
2184 default:
2185 if (Size < 32)
2186 return AMDGPU::sub0;
2187 if (Size > 256)
2188 return -1;
2189 return sizeToSubRegIndex(Size: llvm::bit_ceil(Value: Size));
2190 }
2191}
2192
2193bool AMDGPUInstructionSelector::selectG_TRUNC(MachineInstr &I) const {
2194 Register DstReg = I.getOperand(i: 0).getReg();
2195 Register SrcReg = I.getOperand(i: 1).getReg();
2196 const LLT DstTy = MRI->getType(Reg: DstReg);
2197 const LLT SrcTy = MRI->getType(Reg: SrcReg);
2198 const LLT S1 = LLT::scalar(SizeInBits: 1);
2199
2200 const RegisterBank *SrcRB = RBI.getRegBank(Reg: SrcReg, MRI: *MRI, TRI);
2201 const RegisterBank *DstRB;
2202 if (DstTy == S1) {
2203 // This is a special case. We don't treat s1 for legalization artifacts as
2204 // vcc booleans.
2205 DstRB = SrcRB;
2206 } else {
2207 DstRB = RBI.getRegBank(Reg: DstReg, MRI: *MRI, TRI);
2208 if (SrcRB != DstRB)
2209 return false;
2210 }
2211
2212 const bool IsVALU = DstRB->getID() == AMDGPU::VGPRRegBankID;
2213
2214 unsigned DstSize = DstTy.getSizeInBits();
2215 unsigned SrcSize = SrcTy.getSizeInBits();
2216
2217 const TargetRegisterClass *SrcRC =
2218 TRI.getRegClassForSizeOnBank(Size: SrcSize, Bank: *SrcRB);
2219 const TargetRegisterClass *DstRC =
2220 TRI.getRegClassForSizeOnBank(Size: DstSize, Bank: *DstRB);
2221 if (!SrcRC || !DstRC)
2222 return false;
2223
2224 if (!RBI.constrainGenericRegister(Reg: SrcReg, RC: *SrcRC, MRI&: *MRI) ||
2225 !RBI.constrainGenericRegister(Reg: DstReg, RC: *DstRC, MRI&: *MRI)) {
2226 LLVM_DEBUG(dbgs() << "Failed to constrain G_TRUNC\n");
2227 return false;
2228 }
2229
2230 if (DstTy == LLT::fixed_vector(NumElements: 2, ScalarSizeInBits: 16) && SrcTy == LLT::fixed_vector(NumElements: 2, ScalarSizeInBits: 32)) {
2231 MachineBasicBlock *MBB = I.getParent();
2232 const DebugLoc &DL = I.getDebugLoc();
2233
2234 Register LoReg = MRI->createVirtualRegister(RegClass: DstRC);
2235 Register HiReg = MRI->createVirtualRegister(RegClass: DstRC);
2236 BuildMI(BB&: *MBB, I, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::COPY), DestReg: LoReg)
2237 .addReg(RegNo: SrcReg, flags: 0, SubReg: AMDGPU::sub0);
2238 BuildMI(BB&: *MBB, I, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::COPY), DestReg: HiReg)
2239 .addReg(RegNo: SrcReg, flags: 0, SubReg: AMDGPU::sub1);
2240
2241 if (IsVALU && STI.hasSDWA()) {
2242 // Write the low 16-bits of the high element into the high 16-bits of the
2243 // low element.
2244 MachineInstr *MovSDWA =
2245 BuildMI(BB&: *MBB, I, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::V_MOV_B32_sdwa), DestReg: DstReg)
2246 .addImm(Val: 0) // $src0_modifiers
2247 .addReg(RegNo: HiReg) // $src0
2248 .addImm(Val: 0) // $clamp
2249 .addImm(Val: AMDGPU::SDWA::WORD_1) // $dst_sel
2250 .addImm(Val: AMDGPU::SDWA::UNUSED_PRESERVE) // $dst_unused
2251 .addImm(Val: AMDGPU::SDWA::WORD_0) // $src0_sel
2252 .addReg(RegNo: LoReg, flags: RegState::Implicit);
2253 MovSDWA->tieOperands(DefIdx: 0, UseIdx: MovSDWA->getNumOperands() - 1);
2254 } else {
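      // Without SDWA, build the packed value manually: shift the high element
      // into the upper 16 bits and OR it with the masked low element.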
2255 Register TmpReg0 = MRI->createVirtualRegister(RegClass: DstRC);
2256 Register TmpReg1 = MRI->createVirtualRegister(RegClass: DstRC);
2257 Register ImmReg = MRI->createVirtualRegister(RegClass: DstRC);
2258 if (IsVALU) {
2259 BuildMI(BB&: *MBB, I, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::V_LSHLREV_B32_e64), DestReg: TmpReg0)
2260 .addImm(Val: 16)
2261 .addReg(RegNo: HiReg);
2262 } else {
2263 BuildMI(BB&: *MBB, I, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::S_LSHL_B32), DestReg: TmpReg0)
2264 .addReg(RegNo: HiReg)
2265 .addImm(Val: 16)
2266 .setOperandDead(3); // Dead scc
2267 }
2268
2269 unsigned MovOpc = IsVALU ? AMDGPU::V_MOV_B32_e32 : AMDGPU::S_MOV_B32;
2270 unsigned AndOpc = IsVALU ? AMDGPU::V_AND_B32_e64 : AMDGPU::S_AND_B32;
2271 unsigned OrOpc = IsVALU ? AMDGPU::V_OR_B32_e64 : AMDGPU::S_OR_B32;
2272
2273 BuildMI(BB&: *MBB, I, MIMD: DL, MCID: TII.get(Opcode: MovOpc), DestReg: ImmReg)
2274 .addImm(Val: 0xffff);
2275 auto And = BuildMI(BB&: *MBB, I, MIMD: DL, MCID: TII.get(Opcode: AndOpc), DestReg: TmpReg1)
2276 .addReg(RegNo: LoReg)
2277 .addReg(RegNo: ImmReg);
2278 auto Or = BuildMI(BB&: *MBB, I, MIMD: DL, MCID: TII.get(Opcode: OrOpc), DestReg: DstReg)
2279 .addReg(RegNo: TmpReg0)
2280 .addReg(RegNo: TmpReg1);
2281
2282 if (!IsVALU) {
2283 And.setOperandDead(3); // Dead scc
2284 Or.setOperandDead(3); // Dead scc
2285 }
2286 }
2287
2288 I.eraseFromParent();
2289 return true;
2290 }
2291
2292 if (!DstTy.isScalar())
2293 return false;
2294
2295 if (SrcSize > 32) {
2296 int SubRegIdx = sizeToSubRegIndex(Size: DstSize);
2297 if (SubRegIdx == -1)
2298 return false;
2299
2300 // Deal with weird cases where the class only partially supports the subreg
2301 // index.
2302 const TargetRegisterClass *SrcWithSubRC
2303 = TRI.getSubClassWithSubReg(SrcRC, SubRegIdx);
2304 if (!SrcWithSubRC)
2305 return false;
2306
2307 if (SrcWithSubRC != SrcRC) {
2308 if (!RBI.constrainGenericRegister(Reg: SrcReg, RC: *SrcWithSubRC, MRI&: *MRI))
2309 return false;
2310 }
2311
2312 I.getOperand(i: 1).setSubReg(SubRegIdx);
2313 }
2314
2315 I.setDesc(TII.get(Opcode: TargetOpcode::COPY));
2316 return true;
2317}
2318
2319/// \returns true if a bitmask for \p Size bits will be an inline immediate.
2320static bool shouldUseAndMask(unsigned Size, unsigned &Mask) {
2321 Mask = maskTrailingOnes<unsigned>(N: Size);
2322 int SignedMask = static_cast<int>(Mask);
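  // Inline integer immediates cover the signed range [-16, 64].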
2323 return SignedMask >= -16 && SignedMask <= 64;
2324}
2325
2326// Like RegisterBankInfo::getRegBank, but don't assume vcc for s1.
2327const RegisterBank *AMDGPUInstructionSelector::getArtifactRegBank(
2328 Register Reg, const MachineRegisterInfo &MRI,
2329 const TargetRegisterInfo &TRI) const {
2330 const RegClassOrRegBank &RegClassOrBank = MRI.getRegClassOrRegBank(Reg);
2331 if (auto *RB = RegClassOrBank.dyn_cast<const RegisterBank *>())
2332 return RB;
2333
2334 // Ignore the type, since we don't use vcc in artifacts.
2335 if (auto *RC = RegClassOrBank.dyn_cast<const TargetRegisterClass *>())
2336 return &RBI.getRegBankFromRegClass(RC: *RC, LLT());
2337 return nullptr;
2338}
2339
2340bool AMDGPUInstructionSelector::selectG_SZA_EXT(MachineInstr &I) const {
2341 bool InReg = I.getOpcode() == AMDGPU::G_SEXT_INREG;
2342 bool Signed = I.getOpcode() == AMDGPU::G_SEXT || InReg;
2343 const DebugLoc &DL = I.getDebugLoc();
2344 MachineBasicBlock &MBB = *I.getParent();
2345 const Register DstReg = I.getOperand(i: 0).getReg();
2346 const Register SrcReg = I.getOperand(i: 1).getReg();
2347
2348 const LLT DstTy = MRI->getType(Reg: DstReg);
2349 const LLT SrcTy = MRI->getType(Reg: SrcReg);
2350 const unsigned SrcSize = I.getOpcode() == AMDGPU::G_SEXT_INREG ?
2351 I.getOperand(i: 2).getImm() : SrcTy.getSizeInBits();
2352 const unsigned DstSize = DstTy.getSizeInBits();
2353 if (!DstTy.isScalar())
2354 return false;
2355
2356 // Artifact casts should never use vcc.
2357 const RegisterBank *SrcBank = getArtifactRegBank(Reg: SrcReg, MRI: *MRI, TRI);
2358
2359 // FIXME: This should probably be illegal and split earlier.
2360 if (I.getOpcode() == AMDGPU::G_ANYEXT) {
2361 if (DstSize <= 32)
2362 return selectCOPY(I);
2363
2364 const TargetRegisterClass *SrcRC =
2365 TRI.getRegClassForTypeOnBank(Ty: SrcTy, Bank: *SrcBank);
2366 const RegisterBank *DstBank = RBI.getRegBank(Reg: DstReg, MRI: *MRI, TRI);
2367 const TargetRegisterClass *DstRC =
2368 TRI.getRegClassForSizeOnBank(Size: DstSize, Bank: *DstBank);
2369
2370 Register UndefReg = MRI->createVirtualRegister(RegClass: SrcRC);
2371 BuildMI(BB&: MBB, I, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::IMPLICIT_DEF), DestReg: UndefReg);
2372 BuildMI(BB&: MBB, I, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::REG_SEQUENCE), DestReg: DstReg)
2373 .addReg(RegNo: SrcReg)
2374 .addImm(Val: AMDGPU::sub0)
2375 .addReg(RegNo: UndefReg)
2376 .addImm(Val: AMDGPU::sub1);
2377 I.eraseFromParent();
2378
2379 return RBI.constrainGenericRegister(Reg: DstReg, RC: *DstRC, MRI&: *MRI) &&
2380 RBI.constrainGenericRegister(Reg: SrcReg, RC: *SrcRC, MRI&: *MRI);
2381 }
2382
2383 if (SrcBank->getID() == AMDGPU::VGPRRegBankID && DstSize <= 32) {
2384 // 64-bit should have been split up in RegBankSelect
2385
2386 // Try to use an and with a mask if it will save code size.
2387 unsigned Mask;
2388 if (!Signed && shouldUseAndMask(Size: SrcSize, Mask)) {
2389 MachineInstr *ExtI =
2390 BuildMI(BB&: MBB, I, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::V_AND_B32_e32), DestReg: DstReg)
2391 .addImm(Val: Mask)
2392 .addReg(RegNo: SrcReg);
2393 I.eraseFromParent();
2394 return constrainSelectedInstRegOperands(I&: *ExtI, TII, TRI, RBI);
2395 }
2396
2397 const unsigned BFE = Signed ? AMDGPU::V_BFE_I32_e64 : AMDGPU::V_BFE_U32_e64;
2398 MachineInstr *ExtI =
2399 BuildMI(BB&: MBB, I, MIMD: DL, MCID: TII.get(Opcode: BFE), DestReg: DstReg)
2400 .addReg(RegNo: SrcReg)
2401 .addImm(Val: 0) // Offset
2402 .addImm(Val: SrcSize); // Width
2403 I.eraseFromParent();
2404 return constrainSelectedInstRegOperands(I&: *ExtI, TII, TRI, RBI);
2405 }
2406
2407 if (SrcBank->getID() == AMDGPU::SGPRRegBankID && DstSize <= 64) {
2408 const TargetRegisterClass &SrcRC = InReg && DstSize > 32 ?
2409 AMDGPU::SReg_64RegClass : AMDGPU::SReg_32RegClass;
2410 if (!RBI.constrainGenericRegister(Reg: SrcReg, RC: SrcRC, MRI&: *MRI))
2411 return false;
2412
2413 if (Signed && DstSize == 32 && (SrcSize == 8 || SrcSize == 16)) {
2414 const unsigned SextOpc = SrcSize == 8 ?
2415 AMDGPU::S_SEXT_I32_I8 : AMDGPU::S_SEXT_I32_I16;
2416 BuildMI(BB&: MBB, I, MIMD: DL, MCID: TII.get(Opcode: SextOpc), DestReg: DstReg)
2417 .addReg(RegNo: SrcReg);
2418 I.eraseFromParent();
2419 return RBI.constrainGenericRegister(Reg: DstReg, RC: AMDGPU::SReg_32RegClass, MRI&: *MRI);
2420 }
2421
2422 // Using a single 32-bit SALU to calculate the high half is smaller than
2423 // S_BFE with a literal constant operand.
2424 if (DstSize > 32 && SrcSize == 32) {
2425 Register HiReg = MRI->createVirtualRegister(RegClass: &AMDGPU::SReg_32RegClass);
2426 unsigned SubReg = InReg ? AMDGPU::sub0 : AMDGPU::NoSubRegister;
2427 if (Signed) {
2428 BuildMI(BB&: MBB, I, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::S_ASHR_I32), DestReg: HiReg)
2429 .addReg(RegNo: SrcReg, flags: 0, SubReg)
2430 .addImm(Val: 31)
2431 .setOperandDead(3); // Dead scc
2432 } else {
2433 BuildMI(BB&: MBB, I, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::S_MOV_B32), DestReg: HiReg)
2434 .addImm(Val: 0);
2435 }
2436 BuildMI(BB&: MBB, I, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::REG_SEQUENCE), DestReg: DstReg)
2437 .addReg(RegNo: SrcReg, flags: 0, SubReg)
2438 .addImm(Val: AMDGPU::sub0)
2439 .addReg(RegNo: HiReg)
2440 .addImm(Val: AMDGPU::sub1);
2441 I.eraseFromParent();
2442 return RBI.constrainGenericRegister(Reg: DstReg, RC: AMDGPU::SReg_64RegClass,
2443 MRI&: *MRI);
2444 }
2445
2446 const unsigned BFE64 = Signed ? AMDGPU::S_BFE_I64 : AMDGPU::S_BFE_U64;
2447 const unsigned BFE32 = Signed ? AMDGPU::S_BFE_I32 : AMDGPU::S_BFE_U32;
2448
2449 // Scalar BFE is encoded as S1[5:0] = offset, S1[22:16] = width.
2450 if (DstSize > 32 && (SrcSize <= 32 || InReg)) {
2451 // We need a 64-bit register source, but the high bits don't matter.
2452 Register ExtReg = MRI->createVirtualRegister(RegClass: &AMDGPU::SReg_64RegClass);
2453 Register UndefReg = MRI->createVirtualRegister(RegClass: &AMDGPU::SReg_32RegClass);
2454 unsigned SubReg = InReg ? AMDGPU::sub0 : AMDGPU::NoSubRegister;
2455
2456 BuildMI(BB&: MBB, I, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::IMPLICIT_DEF), DestReg: UndefReg);
2457 BuildMI(BB&: MBB, I, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::REG_SEQUENCE), DestReg: ExtReg)
2458 .addReg(RegNo: SrcReg, flags: 0, SubReg)
2459 .addImm(Val: AMDGPU::sub0)
2460 .addReg(RegNo: UndefReg)
2461 .addImm(Val: AMDGPU::sub1);
2462
2463 BuildMI(BB&: MBB, I, MIMD: DL, MCID: TII.get(Opcode: BFE64), DestReg: DstReg)
2464 .addReg(RegNo: ExtReg)
2465 .addImm(Val: SrcSize << 16);
2466
2467 I.eraseFromParent();
2468 return RBI.constrainGenericRegister(Reg: DstReg, RC: AMDGPU::SReg_64RegClass, MRI&: *MRI);
2469 }
2470
2471 unsigned Mask;
2472 if (!Signed && shouldUseAndMask(Size: SrcSize, Mask)) {
2473 BuildMI(BB&: MBB, I, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::S_AND_B32), DestReg: DstReg)
2474 .addReg(RegNo: SrcReg)
2475 .addImm(Val: Mask)
2476 .setOperandDead(3); // Dead scc
2477 } else {
2478 BuildMI(BB&: MBB, I, MIMD: DL, MCID: TII.get(Opcode: BFE32), DestReg: DstReg)
2479 .addReg(RegNo: SrcReg)
2480 .addImm(Val: SrcSize << 16);
2481 }
2482
2483 I.eraseFromParent();
2484 return RBI.constrainGenericRegister(Reg: DstReg, RC: AMDGPU::SReg_32RegClass, MRI&: *MRI);
2485 }
2486
2487 return false;
2488}
2489
2490static bool isExtractHiElt(MachineRegisterInfo &MRI, Register In,
2491 Register &Out) {
2492 Register LShlSrc;
2493 if (mi_match(R: In, MRI,
2494 P: m_GTrunc(Src: m_GLShr(L: m_Reg(R&: LShlSrc), R: m_SpecificICst(RequestedValue: 16))))) {
2495 Out = LShlSrc;
2496 return true;
2497 }
2498 return false;
2499}
2500
2501bool AMDGPUInstructionSelector::selectG_FPEXT(MachineInstr &I) const {
2502 if (!Subtarget->hasSALUFloatInsts())
2503 return false;
2504
2505 Register Dst = I.getOperand(i: 0).getReg();
2506 const RegisterBank *DstRB = RBI.getRegBank(Reg: Dst, MRI: *MRI, TRI);
2507 if (DstRB->getID() != AMDGPU::SGPRRegBankID)
2508 return false;
2509
2510 Register Src = I.getOperand(i: 1).getReg();
2511
2512 if (MRI->getType(Reg: Dst) == LLT::scalar(SizeInBits: 32) &&
2513 MRI->getType(Reg: Src) == LLT::scalar(SizeInBits: 16)) {
2514 if (isExtractHiElt(MRI&: *MRI, In: Src, Out&: Src)) {
2515 MachineBasicBlock *BB = I.getParent();
2516 BuildMI(BB&: *BB, I: &I, MIMD: I.getDebugLoc(), MCID: TII.get(Opcode: AMDGPU::S_CVT_HI_F32_F16), DestReg: Dst)
2517 .addUse(RegNo: Src);
2518 I.eraseFromParent();
2519 return RBI.constrainGenericRegister(Reg: Dst, RC: AMDGPU::SReg_32RegClass, MRI&: *MRI);
2520 }
2521 }
2522
2523 return false;
2524}
2525
2526bool AMDGPUInstructionSelector::selectG_CONSTANT(MachineInstr &I) const {
2527 MachineBasicBlock *BB = I.getParent();
2528 MachineOperand &ImmOp = I.getOperand(i: 1);
2529 Register DstReg = I.getOperand(i: 0).getReg();
2530 unsigned Size = MRI->getType(Reg: DstReg).getSizeInBits();
2531 bool IsFP = false;
2532
2533 // The AMDGPU backend only supports Imm operands and not CImm or FPImm.
2534 if (ImmOp.isFPImm()) {
2535 const APInt &Imm = ImmOp.getFPImm()->getValueAPF().bitcastToAPInt();
2536 ImmOp.ChangeToImmediate(ImmVal: Imm.getZExtValue());
2537 IsFP = true;
2538 } else if (ImmOp.isCImm()) {
2539 ImmOp.ChangeToImmediate(ImmVal: ImmOp.getCImm()->getSExtValue());
2540 } else {
2541 llvm_unreachable("Not supported by g_constants");
2542 }
2543
2544 const RegisterBank *DstRB = RBI.getRegBank(Reg: DstReg, MRI: *MRI, TRI);
2545 const bool IsSgpr = DstRB->getID() == AMDGPU::SGPRRegBankID;
2546
2547 unsigned Opcode;
2548 if (DstRB->getID() == AMDGPU::VCCRegBankID) {
2549 Opcode = STI.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
2550 } else if (Size == 64 &&
2551 AMDGPU::isValid32BitLiteral(Val: I.getOperand(i: 1).getImm(), IsFP64: IsFP)) {
2552 Opcode = IsSgpr ? AMDGPU::S_MOV_B64_IMM_PSEUDO : AMDGPU::V_MOV_B64_PSEUDO;
2553 I.setDesc(TII.get(Opcode));
2554 I.addImplicitDefUseOperands(MF&: *MF);
2555 return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
2556 } else {
2557 Opcode = IsSgpr ? AMDGPU::S_MOV_B32 : AMDGPU::V_MOV_B32_e32;
2558
2559 // We should never produce s1 values on banks other than VCC. If the user of
2560 // this already constrained the register, we may incorrectly think it's VCC
2561 // if it wasn't originally.
2562 if (Size == 1)
2563 return false;
2564 }
2565
2566 if (Size != 64) {
2567 I.setDesc(TII.get(Opcode));
2568 I.addImplicitDefUseOperands(MF&: *MF);
2569 return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
2570 }
2571
2572 const DebugLoc &DL = I.getDebugLoc();
2573
2574 APInt Imm(Size, I.getOperand(i: 1).getImm());
2575
2576 MachineInstr *ResInst;
2577 if (IsSgpr && TII.isInlineConstant(Imm)) {
2578 ResInst = BuildMI(BB&: *BB, I: &I, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::S_MOV_B64), DestReg: DstReg)
2579 .addImm(Val: I.getOperand(i: 1).getImm());
2580 } else {
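    // Materialize the 64-bit immediate as two 32-bit moves combined with a
    // REG_SEQUENCE.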
2581 const TargetRegisterClass *RC = IsSgpr ?
2582 &AMDGPU::SReg_32RegClass : &AMDGPU::VGPR_32RegClass;
2583 Register LoReg = MRI->createVirtualRegister(RegClass: RC);
2584 Register HiReg = MRI->createVirtualRegister(RegClass: RC);
2585
2586 BuildMI(BB&: *BB, I: &I, MIMD: DL, MCID: TII.get(Opcode), DestReg: LoReg)
2587 .addImm(Val: Imm.trunc(width: 32).getZExtValue());
2588
2589 BuildMI(BB&: *BB, I: &I, MIMD: DL, MCID: TII.get(Opcode), DestReg: HiReg)
2590 .addImm(Val: Imm.ashr(ShiftAmt: 32).getZExtValue());
2591
2592 ResInst = BuildMI(BB&: *BB, I: &I, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::REG_SEQUENCE), DestReg: DstReg)
2593 .addReg(RegNo: LoReg)
2594 .addImm(Val: AMDGPU::sub0)
2595 .addReg(RegNo: HiReg)
2596 .addImm(Val: AMDGPU::sub1);
2597 }
2598
2599 // We can't call constrainSelectedInstRegOperands here, because it doesn't
2600 // work for target-independent opcodes.
2601 I.eraseFromParent();
2602 const TargetRegisterClass *DstRC =
2603 TRI.getConstrainedRegClassForOperand(MO: ResInst->getOperand(i: 0), MRI: *MRI);
2604 if (!DstRC)
2605 return true;
2606 return RBI.constrainGenericRegister(Reg: DstReg, RC: *DstRC, MRI&: *MRI);
2607}
2608
2609bool AMDGPUInstructionSelector::selectG_FNEG(MachineInstr &MI) const {
2610 // Only manually handle the f64 SGPR case.
2611 //
2612 // FIXME: This is a workaround for 2.5 different tablegen problems. Because
2613 // the bit ops theoretically have a second result due to the implicit def of
2614 // SCC, the GlobalISelEmitter is overly conservative and rejects it. Fixing
2615 // that is easy by disabling the check. The result works, but uses a
2616 // nonsensical sreg32orlds_and_sreg_1 regclass.
2617 //
2618 // The DAG emitter is more problematic, and incorrectly adds both results of
2619 // the S_XOR_B32 to the variadic REG_SEQUENCE operands.
2620
2621 Register Dst = MI.getOperand(i: 0).getReg();
2622 const RegisterBank *DstRB = RBI.getRegBank(Reg: Dst, MRI: *MRI, TRI);
2623 if (DstRB->getID() != AMDGPU::SGPRRegBankID ||
2624 MRI->getType(Reg: Dst) != LLT::scalar(SizeInBits: 64))
2625 return false;
2626
2627 Register Src = MI.getOperand(i: 1).getReg();
2628 MachineInstr *Fabs = getOpcodeDef(Opcode: TargetOpcode::G_FABS, Reg: Src, MRI: *MRI);
2629 if (Fabs)
2630 Src = Fabs->getOperand(i: 1).getReg();
2631
2632 if (!RBI.constrainGenericRegister(Reg: Src, RC: AMDGPU::SReg_64RegClass, MRI&: *MRI) ||
2633 !RBI.constrainGenericRegister(Reg: Dst, RC: AMDGPU::SReg_64RegClass, MRI&: *MRI))
2634 return false;
2635
2636 MachineBasicBlock *BB = MI.getParent();
2637 const DebugLoc &DL = MI.getDebugLoc();
2638 Register LoReg = MRI->createVirtualRegister(RegClass: &AMDGPU::SReg_32RegClass);
2639 Register HiReg = MRI->createVirtualRegister(RegClass: &AMDGPU::SReg_32RegClass);
2640 Register ConstReg = MRI->createVirtualRegister(RegClass: &AMDGPU::SReg_32RegClass);
2641 Register OpReg = MRI->createVirtualRegister(RegClass: &AMDGPU::SReg_32RegClass);
2642
2643 BuildMI(BB&: *BB, I: &MI, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::COPY), DestReg: LoReg)
2644 .addReg(RegNo: Src, flags: 0, SubReg: AMDGPU::sub0);
2645 BuildMI(BB&: *BB, I: &MI, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::COPY), DestReg: HiReg)
2646 .addReg(RegNo: Src, flags: 0, SubReg: AMDGPU::sub1);
2647 BuildMI(BB&: *BB, I: &MI, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::S_MOV_B32), DestReg: ConstReg)
2648 .addImm(Val: 0x80000000);
2649
2650 // Set or toggle sign bit.
2651 unsigned Opc = Fabs ? AMDGPU::S_OR_B32 : AMDGPU::S_XOR_B32;
2652 BuildMI(BB&: *BB, I: &MI, MIMD: DL, MCID: TII.get(Opcode: Opc), DestReg: OpReg)
2653 .addReg(RegNo: HiReg)
2654 .addReg(RegNo: ConstReg)
2655 .setOperandDead(3); // Dead scc
2656 BuildMI(BB&: *BB, I: &MI, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::REG_SEQUENCE), DestReg: Dst)
2657 .addReg(RegNo: LoReg)
2658 .addImm(Val: AMDGPU::sub0)
2659 .addReg(RegNo: OpReg)
2660 .addImm(Val: AMDGPU::sub1);
2661 MI.eraseFromParent();
2662 return true;
2663}
2664
2665// FIXME: This is a workaround for the same tablegen problems as G_FNEG
2666bool AMDGPUInstructionSelector::selectG_FABS(MachineInstr &MI) const {
2667 Register Dst = MI.getOperand(i: 0).getReg();
2668 const RegisterBank *DstRB = RBI.getRegBank(Reg: Dst, MRI: *MRI, TRI);
2669 if (DstRB->getID() != AMDGPU::SGPRRegBankID ||
2670 MRI->getType(Reg: Dst) != LLT::scalar(SizeInBits: 64))
2671 return false;
2672
2673 Register Src = MI.getOperand(i: 1).getReg();
2674 MachineBasicBlock *BB = MI.getParent();
2675 const DebugLoc &DL = MI.getDebugLoc();
2676 Register LoReg = MRI->createVirtualRegister(RegClass: &AMDGPU::SReg_32RegClass);
2677 Register HiReg = MRI->createVirtualRegister(RegClass: &AMDGPU::SReg_32RegClass);
2678 Register ConstReg = MRI->createVirtualRegister(RegClass: &AMDGPU::SReg_32RegClass);
2679 Register OpReg = MRI->createVirtualRegister(RegClass: &AMDGPU::SReg_32RegClass);
2680
2681 if (!RBI.constrainGenericRegister(Reg: Src, RC: AMDGPU::SReg_64RegClass, MRI&: *MRI) ||
2682 !RBI.constrainGenericRegister(Reg: Dst, RC: AMDGPU::SReg_64RegClass, MRI&: *MRI))
2683 return false;
2684
2685 BuildMI(BB&: *BB, I: &MI, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::COPY), DestReg: LoReg)
2686 .addReg(RegNo: Src, flags: 0, SubReg: AMDGPU::sub0);
2687 BuildMI(BB&: *BB, I: &MI, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::COPY), DestReg: HiReg)
2688 .addReg(RegNo: Src, flags: 0, SubReg: AMDGPU::sub1);
2689 BuildMI(BB&: *BB, I: &MI, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::S_MOV_B32), DestReg: ConstReg)
2690 .addImm(Val: 0x7fffffff);
2691
2692 // Clear sign bit.
2693 // TODO: Should this use S_BITSET0_*?
2694 BuildMI(BB&: *BB, I: &MI, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::S_AND_B32), DestReg: OpReg)
2695 .addReg(RegNo: HiReg)
2696 .addReg(RegNo: ConstReg)
2697 .setOperandDead(3); // Dead scc
2698 BuildMI(BB&: *BB, I: &MI, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::REG_SEQUENCE), DestReg: Dst)
2699 .addReg(RegNo: LoReg)
2700 .addImm(Val: AMDGPU::sub0)
2701 .addReg(RegNo: OpReg)
2702 .addImm(Val: AMDGPU::sub1);
2703
2704 MI.eraseFromParent();
2705 return true;
2706}
2707
2708static bool isConstant(const MachineInstr &MI) {
2709 return MI.getOpcode() == TargetOpcode::G_CONSTANT;
2710}
2711
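// Walk the chain of G_PTR_ADDs feeding this load and record, for each step,
// the constant offset and which address pieces live in SGPRs versus VGPRs.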
2712void AMDGPUInstructionSelector::getAddrModeInfo(const MachineInstr &Load,
2713 const MachineRegisterInfo &MRI, SmallVectorImpl<GEPInfo> &AddrInfo) const {
2714
2715 unsigned OpNo = Load.getOpcode() == AMDGPU::G_PREFETCH ? 0 : 1;
2716 const MachineInstr *PtrMI =
2717 MRI.getUniqueVRegDef(Reg: Load.getOperand(i: OpNo).getReg());
2718
2719 assert(PtrMI);
2720
2721 if (PtrMI->getOpcode() != TargetOpcode::G_PTR_ADD)
2722 return;
2723
2724 GEPInfo GEPInfo;
2725
2726 for (unsigned i = 1; i != 3; ++i) {
2727 const MachineOperand &GEPOp = PtrMI->getOperand(i);
2728 const MachineInstr *OpDef = MRI.getUniqueVRegDef(Reg: GEPOp.getReg());
2729 assert(OpDef);
2730 if (i == 2 && isConstant(MI: *OpDef)) {
2731 // TODO: Could handle constant base + variable offset, but a combine
2732 // probably should have commuted it.
2733 assert(GEPInfo.Imm == 0);
2734 GEPInfo.Imm = OpDef->getOperand(i: 1).getCImm()->getSExtValue();
2735 continue;
2736 }
2737 const RegisterBank *OpBank = RBI.getRegBank(Reg: GEPOp.getReg(), MRI, TRI);
2738 if (OpBank->getID() == AMDGPU::SGPRRegBankID)
2739 GEPInfo.SgprParts.push_back(Elt: GEPOp.getReg());
2740 else
2741 GEPInfo.VgprParts.push_back(Elt: GEPOp.getReg());
2742 }
2743
2744 AddrInfo.push_back(Elt: GEPInfo);
2745 getAddrModeInfo(Load: *PtrMI, MRI, AddrInfo);
2746}
2747
2748bool AMDGPUInstructionSelector::isSGPR(Register Reg) const {
2749 return RBI.getRegBank(Reg, MRI: *MRI, TRI)->getID() == AMDGPU::SGPRRegBankID;
2750}
2751
2752bool AMDGPUInstructionSelector::isInstrUniform(const MachineInstr &MI) const {
2753 if (!MI.hasOneMemOperand())
2754 return false;
2755
2756 const MachineMemOperand *MMO = *MI.memoperands_begin();
2757 const Value *Ptr = MMO->getValue();
2758
2759 // UndefValue means this is a load of a kernel input. These are uniform.
2760 // Sometimes LDS instructions have constant pointers.
2761 // If Ptr is null, then that means this mem operand contains a
2762 // PseudoSourceValue like GOT.
2763 if (!Ptr || isa<UndefValue>(Val: Ptr) || isa<Argument>(Val: Ptr) ||
2764 isa<Constant>(Val: Ptr) || isa<GlobalValue>(Val: Ptr))
2765 return true;
2766
2767 if (MMO->getAddrSpace() == AMDGPUAS::CONSTANT_ADDRESS_32BIT)
2768 return true;
2769
2770 if (MI.getOpcode() == AMDGPU::G_PREFETCH)
2771 return RBI.getRegBank(Reg: MI.getOperand(i: 0).getReg(), MRI: *MRI, TRI)->getID() ==
2772 AMDGPU::SGPRRegBankID;
2773
2774 const Instruction *I = dyn_cast<Instruction>(Val: Ptr);
2775 return I && I->getMetadata(Kind: "amdgpu.uniform");
2776}
2777
2778bool AMDGPUInstructionSelector::hasVgprParts(ArrayRef<GEPInfo> AddrInfo) const {
2779 for (const GEPInfo &GEPInfo : AddrInfo) {
2780 if (!GEPInfo.VgprParts.empty())
2781 return true;
2782 }
2783 return false;
2784}
2785
2786void AMDGPUInstructionSelector::initM0(MachineInstr &I) const {
2787 const LLT PtrTy = MRI->getType(Reg: I.getOperand(i: 1).getReg());
2788 unsigned AS = PtrTy.getAddressSpace();
2789 if ((AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) &&
2790 STI.ldsRequiresM0Init()) {
2791 MachineBasicBlock *BB = I.getParent();
2792
2793 // If DS instructions require M0 initialization, insert it before selecting.
2794 BuildMI(BB&: *BB, I: &I, MIMD: I.getDebugLoc(), MCID: TII.get(Opcode: AMDGPU::S_MOV_B32), DestReg: AMDGPU::M0)
2795 .addImm(Val: -1);
2796 }
2797}
2798
2799bool AMDGPUInstructionSelector::selectG_LOAD_STORE_ATOMICRMW(
2800 MachineInstr &I) const {
2801 initM0(I);
2802 return selectImpl(I, CoverageInfo&: *CoverageInfo);
2803}
2804
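// Return true if \p Reg is (transitively) defined by VALU comparisons or
// amdgcn.class, possibly combined through bitwise ops. selectG_BRCOND uses
// this to skip the AND with exec that is otherwise required before branching.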
2805static bool isVCmpResult(Register Reg, MachineRegisterInfo &MRI) {
2806 if (Reg.isPhysical())
2807 return false;
2808
2809 MachineInstr &MI = *MRI.getUniqueVRegDef(Reg);
2810 const unsigned Opcode = MI.getOpcode();
2811
2812 if (Opcode == AMDGPU::COPY)
2813 return isVCmpResult(Reg: MI.getOperand(i: 1).getReg(), MRI);
2814
2815 if (Opcode == AMDGPU::G_AND || Opcode == AMDGPU::G_OR ||
2816 Opcode == AMDGPU::G_XOR)
2817 return isVCmpResult(Reg: MI.getOperand(i: 1).getReg(), MRI) &&
2818 isVCmpResult(Reg: MI.getOperand(i: 2).getReg(), MRI);
2819
2820 if (auto *GI = dyn_cast<GIntrinsic>(Val: &MI))
2821 return GI->is(ID: Intrinsic::amdgcn_class);
2822
2823 return Opcode == AMDGPU::G_ICMP || Opcode == AMDGPU::G_FCMP;
2824}
2825
2826bool AMDGPUInstructionSelector::selectG_BRCOND(MachineInstr &I) const {
2827 MachineBasicBlock *BB = I.getParent();
2828 MachineOperand &CondOp = I.getOperand(i: 0);
2829 Register CondReg = CondOp.getReg();
2830 const DebugLoc &DL = I.getDebugLoc();
2831
2832 unsigned BrOpcode;
2833 Register CondPhysReg;
2834 const TargetRegisterClass *ConstrainRC;
2835
2836 // In SelectionDAG, we inspect the IR block for uniformity metadata to decide
2837 // whether the branch is uniform when selecting the instruction. In
2838 // GlobalISel, we should push that decision into RegBankSelect. Assume for now
2839 // RegBankSelect knows what it's doing if the branch condition is scc, even
2840 // though it currently does not.
2841 if (!isVCC(Reg: CondReg, MRI: *MRI)) {
2842 if (MRI->getType(Reg: CondReg) != LLT::scalar(SizeInBits: 32))
2843 return false;
2844
2845 CondPhysReg = AMDGPU::SCC;
2846 BrOpcode = AMDGPU::S_CBRANCH_SCC1;
2847 ConstrainRC = &AMDGPU::SReg_32RegClass;
2848 } else {
2849 // FIXME: Should scc->vcc copies be ANDed with exec?
2850
2851 // Unless the value of CondReg is a result of a V_CMP* instruction, we need
2852 // to insert an AND with exec.
2853 if (!isVCmpResult(Reg: CondReg, MRI&: *MRI)) {
2854 const bool Is64 = STI.isWave64();
2855 const unsigned Opcode = Is64 ? AMDGPU::S_AND_B64 : AMDGPU::S_AND_B32;
2856 const Register Exec = Is64 ? AMDGPU::EXEC : AMDGPU::EXEC_LO;
2857
2858 Register TmpReg = MRI->createVirtualRegister(RegClass: TRI.getBoolRC());
2859 BuildMI(BB&: *BB, I: &I, MIMD: DL, MCID: TII.get(Opcode), DestReg: TmpReg)
2860 .addReg(RegNo: CondReg)
2861 .addReg(RegNo: Exec)
2862 .setOperandDead(3); // Dead scc
2863 CondReg = TmpReg;
2864 }
2865
2866 CondPhysReg = TRI.getVCC();
2867 BrOpcode = AMDGPU::S_CBRANCH_VCCNZ;
2868 ConstrainRC = TRI.getBoolRC();
2869 }
2870
2871 if (!MRI->getRegClassOrNull(Reg: CondReg))
2872 MRI->setRegClass(Reg: CondReg, RC: ConstrainRC);
2873
2874 BuildMI(BB&: *BB, I: &I, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::COPY), DestReg: CondPhysReg)
2875 .addReg(RegNo: CondReg);
2876 BuildMI(BB&: *BB, I: &I, MIMD: DL, MCID: TII.get(Opcode: BrOpcode))
2877 .addMBB(MBB: I.getOperand(i: 1).getMBB());
2878
2879 I.eraseFromParent();
2880 return true;
2881}
2882
2883bool AMDGPUInstructionSelector::selectG_GLOBAL_VALUE(
2884 MachineInstr &I) const {
2885 Register DstReg = I.getOperand(i: 0).getReg();
2886 const RegisterBank *DstRB = RBI.getRegBank(Reg: DstReg, MRI: *MRI, TRI);
2887 const bool IsVGPR = DstRB->getID() == AMDGPU::VGPRRegBankID;
2888 I.setDesc(TII.get(Opcode: IsVGPR ? AMDGPU::V_MOV_B32_e32 : AMDGPU::S_MOV_B32));
2889 if (IsVGPR)
2890 I.addOperand(MF&: *MF, Op: MachineOperand::CreateReg(Reg: AMDGPU::EXEC, isDef: false, isImp: true));
2891
2892 return RBI.constrainGenericRegister(
2893 Reg: DstReg, RC: IsVGPR ? AMDGPU::VGPR_32RegClass : AMDGPU::SReg_32RegClass, MRI&: *MRI);
2894}
2895
2896bool AMDGPUInstructionSelector::selectG_PTRMASK(MachineInstr &I) const {
2897 Register DstReg = I.getOperand(i: 0).getReg();
2898 Register SrcReg = I.getOperand(i: 1).getReg();
2899 Register MaskReg = I.getOperand(i: 2).getReg();
2900 LLT Ty = MRI->getType(Reg: DstReg);
2901 LLT MaskTy = MRI->getType(Reg: MaskReg);
2902 MachineBasicBlock *BB = I.getParent();
2903 const DebugLoc &DL = I.getDebugLoc();
2904
2905 const RegisterBank *DstRB = RBI.getRegBank(Reg: DstReg, MRI: *MRI, TRI);
2906 const RegisterBank *SrcRB = RBI.getRegBank(Reg: SrcReg, MRI: *MRI, TRI);
2907 const RegisterBank *MaskRB = RBI.getRegBank(Reg: MaskReg, MRI: *MRI, TRI);
2908 const bool IsVGPR = DstRB->getID() == AMDGPU::VGPRRegBankID;
2909  if (DstRB != SrcRB) // Should only happen for hand-written MIR.
2910 return false;
2911
2912 // Try to avoid emitting a bit operation when we only need to touch half of
2913 // the 64-bit pointer.
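  // For example, a mask such as 0xFFFFFFFF'00000000 leaves the high half
  // untouched, so only the low half needs an AND (and vice versa).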
2914 APInt MaskOnes = KB->getKnownOnes(R: MaskReg).zext(width: 64);
2915 const APInt MaskHi32 = APInt::getHighBitsSet(numBits: 64, hiBitsSet: 32);
2916 const APInt MaskLo32 = APInt::getLowBitsSet(numBits: 64, loBitsSet: 32);
2917
2918 const bool CanCopyLow32 = (MaskOnes & MaskLo32) == MaskLo32;
2919 const bool CanCopyHi32 = (MaskOnes & MaskHi32) == MaskHi32;
2920
2921 if (!IsVGPR && Ty.getSizeInBits() == 64 &&
2922 !CanCopyLow32 && !CanCopyHi32) {
2923 auto MIB = BuildMI(BB&: *BB, I: &I, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::S_AND_B64), DestReg: DstReg)
2924 .addReg(RegNo: SrcReg)
2925 .addReg(RegNo: MaskReg)
2926 .setOperandDead(3); // Dead scc
2927 I.eraseFromParent();
2928 return constrainSelectedInstRegOperands(I&: *MIB, TII, TRI, RBI);
2929 }
2930
2931 unsigned NewOpc = IsVGPR ? AMDGPU::V_AND_B32_e64 : AMDGPU::S_AND_B32;
2932 const TargetRegisterClass &RegRC
2933 = IsVGPR ? AMDGPU::VGPR_32RegClass : AMDGPU::SReg_32RegClass;
2934
2935 const TargetRegisterClass *DstRC = TRI.getRegClassForTypeOnBank(Ty, Bank: *DstRB);
2936 const TargetRegisterClass *SrcRC = TRI.getRegClassForTypeOnBank(Ty, Bank: *SrcRB);
2937 const TargetRegisterClass *MaskRC =
2938 TRI.getRegClassForTypeOnBank(Ty: MaskTy, Bank: *MaskRB);
2939
2940 if (!RBI.constrainGenericRegister(Reg: DstReg, RC: *DstRC, MRI&: *MRI) ||
2941 !RBI.constrainGenericRegister(Reg: SrcReg, RC: *SrcRC, MRI&: *MRI) ||
2942 !RBI.constrainGenericRegister(Reg: MaskReg, RC: *MaskRC, MRI&: *MRI))
2943 return false;
2944
2945 if (Ty.getSizeInBits() == 32) {
2946 assert(MaskTy.getSizeInBits() == 32 &&
2947 "ptrmask should have been narrowed during legalize");
2948
2949 auto NewOp = BuildMI(BB&: *BB, I: &I, MIMD: DL, MCID: TII.get(Opcode: NewOpc), DestReg: DstReg)
2950 .addReg(RegNo: SrcReg)
2951 .addReg(RegNo: MaskReg);
2952
2953 if (!IsVGPR)
2954 NewOp.setOperandDead(3); // Dead scc
2955 I.eraseFromParent();
2956 return true;
2957 }
2958
2959 Register HiReg = MRI->createVirtualRegister(RegClass: &RegRC);
2960 Register LoReg = MRI->createVirtualRegister(RegClass: &RegRC);
2961
2962 // Extract the subregisters from the source pointer.
2963 BuildMI(BB&: *BB, I: &I, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::COPY), DestReg: LoReg)
2964 .addReg(RegNo: SrcReg, flags: 0, SubReg: AMDGPU::sub0);
2965 BuildMI(BB&: *BB, I: &I, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::COPY), DestReg: HiReg)
2966 .addReg(RegNo: SrcReg, flags: 0, SubReg: AMDGPU::sub1);
2967
2968 Register MaskedLo, MaskedHi;
2969
2970 if (CanCopyLow32) {
2971 // If all the bits in the low half are 1, we only need a copy for it.
2972 MaskedLo = LoReg;
2973 } else {
2974 // Extract the mask subregister and apply the and.
2975 Register MaskLo = MRI->createVirtualRegister(RegClass: &RegRC);
2976 MaskedLo = MRI->createVirtualRegister(RegClass: &RegRC);
2977
2978 BuildMI(BB&: *BB, I: &I, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::COPY), DestReg: MaskLo)
2979 .addReg(RegNo: MaskReg, flags: 0, SubReg: AMDGPU::sub0);
2980 BuildMI(BB&: *BB, I: &I, MIMD: DL, MCID: TII.get(Opcode: NewOpc), DestReg: MaskedLo)
2981 .addReg(RegNo: LoReg)
2982 .addReg(RegNo: MaskLo);
2983 }
2984
2985 if (CanCopyHi32) {
2986 // If all the bits in the high half are 1, we only need a copy for it.
2987 MaskedHi = HiReg;
2988 } else {
2989 Register MaskHi = MRI->createVirtualRegister(RegClass: &RegRC);
2990 MaskedHi = MRI->createVirtualRegister(RegClass: &RegRC);
2991
2992 BuildMI(BB&: *BB, I: &I, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::COPY), DestReg: MaskHi)
2993 .addReg(RegNo: MaskReg, flags: 0, SubReg: AMDGPU::sub1);
2994 BuildMI(BB&: *BB, I: &I, MIMD: DL, MCID: TII.get(Opcode: NewOpc), DestReg: MaskedHi)
2995 .addReg(RegNo: HiReg)
2996 .addReg(RegNo: MaskHi);
2997 }
2998
2999 BuildMI(BB&: *BB, I: &I, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::REG_SEQUENCE), DestReg: DstReg)
3000 .addReg(RegNo: MaskedLo)
3001 .addImm(Val: AMDGPU::sub0)
3002 .addReg(RegNo: MaskedHi)
3003 .addImm(Val: AMDGPU::sub1);
3004 I.eraseFromParent();
3005 return true;
3006}
3007
3008/// Return the register to use for the index value, and the subregister to use
3009/// for the indirectly accessed register.
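/// For example (values purely illustrative): with a 128-bit super-register,
/// EltSize = 4 and IdxReg defined as %base + 2, this returns (%base, sub2);
/// an out-of-range constant offset falls back to (IdxReg, sub0).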
3010static std::pair<Register, unsigned>
3011computeIndirectRegIndex(MachineRegisterInfo &MRI, const SIRegisterInfo &TRI,
3012 const TargetRegisterClass *SuperRC, Register IdxReg,
3013 unsigned EltSize, GISelKnownBits &KnownBits) {
3014 Register IdxBaseReg;
3015 int Offset;
3016
3017 std::tie(args&: IdxBaseReg, args&: Offset) =
3018 AMDGPU::getBaseWithConstantOffset(MRI, Reg: IdxReg, KnownBits: &KnownBits);
3019 if (IdxBaseReg == AMDGPU::NoRegister) {
3020 // This will happen if the index is a known constant. This should ordinarily
3021 // be legalized out, but handle it as a register just in case.
3022 assert(Offset == 0);
3023 IdxBaseReg = IdxReg;
3024 }
3025
3026 ArrayRef<int16_t> SubRegs = TRI.getRegSplitParts(RC: SuperRC, EltSize);
3027
3028 // Skip out of bounds offsets, or else we would end up using an undefined
3029 // register.
3030 if (static_cast<unsigned>(Offset) >= SubRegs.size())
3031 return std::pair(IdxReg, SubRegs[0]);
3032 return std::pair(IdxBaseReg, SubRegs[Offset]);
3033}
3034
3035bool AMDGPUInstructionSelector::selectG_EXTRACT_VECTOR_ELT(
3036 MachineInstr &MI) const {
3037 Register DstReg = MI.getOperand(i: 0).getReg();
3038 Register SrcReg = MI.getOperand(i: 1).getReg();
3039 Register IdxReg = MI.getOperand(i: 2).getReg();
3040
3041 LLT DstTy = MRI->getType(Reg: DstReg);
3042 LLT SrcTy = MRI->getType(Reg: SrcReg);
3043
3044 const RegisterBank *DstRB = RBI.getRegBank(Reg: DstReg, MRI: *MRI, TRI);
3045 const RegisterBank *SrcRB = RBI.getRegBank(Reg: SrcReg, MRI: *MRI, TRI);
3046 const RegisterBank *IdxRB = RBI.getRegBank(Reg: IdxReg, MRI: *MRI, TRI);
3047
3048  // The index must be scalar. If it wasn't, RegBankSelect should have moved
3049  // this into a waterfall loop.
3050 if (IdxRB->getID() != AMDGPU::SGPRRegBankID)
3051 return false;
3052
3053 const TargetRegisterClass *SrcRC =
3054 TRI.getRegClassForTypeOnBank(Ty: SrcTy, Bank: *SrcRB);
3055 const TargetRegisterClass *DstRC =
3056 TRI.getRegClassForTypeOnBank(Ty: DstTy, Bank: *DstRB);
3057 if (!SrcRC || !DstRC)
3058 return false;
3059 if (!RBI.constrainGenericRegister(Reg: SrcReg, RC: *SrcRC, MRI&: *MRI) ||
3060 !RBI.constrainGenericRegister(Reg: DstReg, RC: *DstRC, MRI&: *MRI) ||
3061 !RBI.constrainGenericRegister(Reg: IdxReg, RC: AMDGPU::SReg_32RegClass, MRI&: *MRI))
3062 return false;
3063
3064 MachineBasicBlock *BB = MI.getParent();
3065 const DebugLoc &DL = MI.getDebugLoc();
3066 const bool Is64 = DstTy.getSizeInBits() == 64;
3067
3068 unsigned SubReg;
3069 std::tie(args&: IdxReg, args&: SubReg) = computeIndirectRegIndex(
3070 MRI&: *MRI, TRI, SuperRC: SrcRC, IdxReg, EltSize: DstTy.getSizeInBits() / 8, KnownBits&: *KB);
3071
3072 if (SrcRB->getID() == AMDGPU::SGPRRegBankID) {
3073 if (DstTy.getSizeInBits() != 32 && !Is64)
3074 return false;
3075
3076 BuildMI(BB&: *BB, I: &MI, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::COPY), DestReg: AMDGPU::M0)
3077 .addReg(RegNo: IdxReg);
3078
3079 unsigned Opc = Is64 ? AMDGPU::S_MOVRELS_B64 : AMDGPU::S_MOVRELS_B32;
3080 BuildMI(BB&: *BB, I: &MI, MIMD: DL, MCID: TII.get(Opcode: Opc), DestReg: DstReg)
3081 .addReg(RegNo: SrcReg, flags: 0, SubReg)
3082 .addReg(RegNo: SrcReg, flags: RegState::Implicit);
3083 MI.eraseFromParent();
3084 return true;
3085 }
3086
3087 if (SrcRB->getID() != AMDGPU::VGPRRegBankID || DstTy.getSizeInBits() != 32)
3088 return false;
3089
3090 if (!STI.useVGPRIndexMode()) {
3091 BuildMI(BB&: *BB, I: &MI, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::COPY), DestReg: AMDGPU::M0)
3092 .addReg(RegNo: IdxReg);
3093 BuildMI(BB&: *BB, I: &MI, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::V_MOVRELS_B32_e32), DestReg: DstReg)
3094 .addReg(RegNo: SrcReg, flags: 0, SubReg)
3095 .addReg(RegNo: SrcReg, flags: RegState::Implicit);
3096 MI.eraseFromParent();
3097 return true;
3098 }
3099
3100 const MCInstrDesc &GPRIDXDesc =
3101 TII.getIndirectGPRIDXPseudo(VecSize: TRI.getRegSizeInBits(RC: *SrcRC), IsIndirectSrc: true);
3102 BuildMI(BB&: *BB, I&: MI, MIMD: DL, MCID: GPRIDXDesc, DestReg: DstReg)
3103 .addReg(RegNo: SrcReg)
3104 .addReg(RegNo: IdxReg)
3105 .addImm(Val: SubReg);
3106
3107 MI.eraseFromParent();
3108 return true;
3109}
3110
3111// TODO: Fold insert_vector_elt (extract_vector_elt) into movrelsd
3112bool AMDGPUInstructionSelector::selectG_INSERT_VECTOR_ELT(
3113 MachineInstr &MI) const {
3114 Register DstReg = MI.getOperand(i: 0).getReg();
3115 Register VecReg = MI.getOperand(i: 1).getReg();
3116 Register ValReg = MI.getOperand(i: 2).getReg();
3117 Register IdxReg = MI.getOperand(i: 3).getReg();
3118
3119 LLT VecTy = MRI->getType(Reg: DstReg);
3120 LLT ValTy = MRI->getType(Reg: ValReg);
3121 unsigned VecSize = VecTy.getSizeInBits();
3122 unsigned ValSize = ValTy.getSizeInBits();
3123
3124 const RegisterBank *VecRB = RBI.getRegBank(Reg: VecReg, MRI: *MRI, TRI);
3125 const RegisterBank *ValRB = RBI.getRegBank(Reg: ValReg, MRI: *MRI, TRI);
3126 const RegisterBank *IdxRB = RBI.getRegBank(Reg: IdxReg, MRI: *MRI, TRI);
3127
3128 assert(VecTy.getElementType() == ValTy);
3129
3130  // The index must be scalar. If it wasn't, RegBankSelect should have moved
3131  // this into a waterfall loop.
3132 if (IdxRB->getID() != AMDGPU::SGPRRegBankID)
3133 return false;
3134
3135 const TargetRegisterClass *VecRC =
3136 TRI.getRegClassForTypeOnBank(Ty: VecTy, Bank: *VecRB);
3137 const TargetRegisterClass *ValRC =
3138 TRI.getRegClassForTypeOnBank(Ty: ValTy, Bank: *ValRB);
3139
3140 if (!RBI.constrainGenericRegister(Reg: VecReg, RC: *VecRC, MRI&: *MRI) ||
3141 !RBI.constrainGenericRegister(Reg: DstReg, RC: *VecRC, MRI&: *MRI) ||
3142 !RBI.constrainGenericRegister(Reg: ValReg, RC: *ValRC, MRI&: *MRI) ||
3143 !RBI.constrainGenericRegister(Reg: IdxReg, RC: AMDGPU::SReg_32RegClass, MRI&: *MRI))
3144 return false;
3145
3146 if (VecRB->getID() == AMDGPU::VGPRRegBankID && ValSize != 32)
3147 return false;
3148
3149 unsigned SubReg;
3150 std::tie(args&: IdxReg, args&: SubReg) =
3151 computeIndirectRegIndex(MRI&: *MRI, TRI, SuperRC: VecRC, IdxReg, EltSize: ValSize / 8, KnownBits&: *KB);
3152
3153 const bool IndexMode = VecRB->getID() == AMDGPU::VGPRRegBankID &&
3154 STI.useVGPRIndexMode();
3155
3156 MachineBasicBlock *BB = MI.getParent();
3157 const DebugLoc &DL = MI.getDebugLoc();
3158
3159 if (!IndexMode) {
3160 BuildMI(BB&: *BB, I: &MI, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::COPY), DestReg: AMDGPU::M0)
3161 .addReg(RegNo: IdxReg);
3162
3163 const MCInstrDesc &RegWriteOp = TII.getIndirectRegWriteMovRelPseudo(
3164 VecSize, EltSize: ValSize, IsSGPR: VecRB->getID() == AMDGPU::SGPRRegBankID);
3165 BuildMI(BB&: *BB, I&: MI, MIMD: DL, MCID: RegWriteOp, DestReg: DstReg)
3166 .addReg(RegNo: VecReg)
3167 .addReg(RegNo: ValReg)
3168 .addImm(Val: SubReg);
3169 MI.eraseFromParent();
3170 return true;
3171 }
3172
3173 const MCInstrDesc &GPRIDXDesc =
3174 TII.getIndirectGPRIDXPseudo(VecSize: TRI.getRegSizeInBits(RC: *VecRC), IsIndirectSrc: false);
3175 BuildMI(BB&: *BB, I&: MI, MIMD: DL, MCID: GPRIDXDesc, DestReg: DstReg)
3176 .addReg(RegNo: VecReg)
3177 .addReg(RegNo: ValReg)
3178 .addReg(RegNo: IdxReg)
3179 .addImm(Val: SubReg);
3180
3181 MI.eraseFromParent();
3182 return true;
3183}
3184
3185bool AMDGPUInstructionSelector::selectBufferLoadLds(MachineInstr &MI) const {
3186 assert(!AMDGPU::isGFX12Plus(STI));
3187 unsigned Opc;
3188 unsigned Size = MI.getOperand(i: 3).getImm();
3189
3190 // The struct intrinsic variants add one additional operand over raw.
3191 const bool HasVIndex = MI.getNumOperands() == 9;
3192 Register VIndex;
3193 int OpOffset = 0;
3194 if (HasVIndex) {
3195 VIndex = MI.getOperand(i: 4).getReg();
3196 OpOffset = 1;
3197 }
3198
3199 Register VOffset = MI.getOperand(i: 4 + OpOffset).getReg();
3200 std::optional<ValueAndVReg> MaybeVOffset =
3201 getIConstantVRegValWithLookThrough(VReg: VOffset, MRI: *MRI);
3202 const bool HasVOffset = !MaybeVOffset || MaybeVOffset->Value.getZExtValue();
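  // A provably zero voffset lets us drop the VGPR offset operand and use the
  // OFFSET/IDXEN forms; otherwise keep it and use the OFFEN/BOTHEN forms.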
3203
3204 switch (Size) {
3205 default:
3206 return false;
3207 case 1:
3208 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_UBYTE_LDS_BOTHEN
3209 : AMDGPU::BUFFER_LOAD_UBYTE_LDS_IDXEN
3210 : HasVOffset ? AMDGPU::BUFFER_LOAD_UBYTE_LDS_OFFEN
3211 : AMDGPU::BUFFER_LOAD_UBYTE_LDS_OFFSET;
3212 break;
3213 case 2:
3214 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_USHORT_LDS_BOTHEN
3215 : AMDGPU::BUFFER_LOAD_USHORT_LDS_IDXEN
3216 : HasVOffset ? AMDGPU::BUFFER_LOAD_USHORT_LDS_OFFEN
3217 : AMDGPU::BUFFER_LOAD_USHORT_LDS_OFFSET;
3218 break;
3219 case 4:
3220 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_DWORD_LDS_BOTHEN
3221 : AMDGPU::BUFFER_LOAD_DWORD_LDS_IDXEN
3222 : HasVOffset ? AMDGPU::BUFFER_LOAD_DWORD_LDS_OFFEN
3223 : AMDGPU::BUFFER_LOAD_DWORD_LDS_OFFSET;
3224 break;
3225 }
3226
3227 MachineBasicBlock *MBB = MI.getParent();
3228 const DebugLoc &DL = MI.getDebugLoc();
3229 BuildMI(BB&: *MBB, I: &MI, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::COPY), DestReg: AMDGPU::M0)
3230 .add(MO: MI.getOperand(i: 2));
3231
3232 auto MIB = BuildMI(BB&: *MBB, I: &MI, MIMD: DL, MCID: TII.get(Opcode: Opc));
3233
3234 if (HasVIndex && HasVOffset) {
3235 Register IdxReg = MRI->createVirtualRegister(RegClass: TRI.getVGPR64Class());
3236 BuildMI(BB&: *MBB, I: &*MIB, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::REG_SEQUENCE), DestReg: IdxReg)
3237 .addReg(RegNo: VIndex)
3238 .addImm(Val: AMDGPU::sub0)
3239 .addReg(RegNo: VOffset)
3240 .addImm(Val: AMDGPU::sub1);
3241
3242 MIB.addReg(RegNo: IdxReg);
3243 } else if (HasVIndex) {
3244 MIB.addReg(RegNo: VIndex);
3245 } else if (HasVOffset) {
3246 MIB.addReg(RegNo: VOffset);
3247 }
3248
3249 MIB.add(MO: MI.getOperand(i: 1)); // rsrc
3250 MIB.add(MO: MI.getOperand(i: 5 + OpOffset)); // soffset
3251 MIB.add(MO: MI.getOperand(i: 6 + OpOffset)); // imm offset
3252 unsigned Aux = MI.getOperand(i: 7 + OpOffset).getImm();
3253 MIB.addImm(Val: Aux & AMDGPU::CPol::ALL); // cpol
3254 MIB.addImm(Val: Aux & AMDGPU::CPol::SWZ_pregfx12 ? 1 : 0); // swz
3255
3256 MachineMemOperand *LoadMMO = *MI.memoperands_begin();
3257 MachinePointerInfo LoadPtrI = LoadMMO->getPointerInfo();
3258 LoadPtrI.Offset = MI.getOperand(i: 6 + OpOffset).getImm();
3259 MachinePointerInfo StorePtrI = LoadPtrI;
3260 StorePtrI.V = nullptr;
3261 StorePtrI.AddrSpace = AMDGPUAS::LOCAL_ADDRESS;
3262
3263 auto F = LoadMMO->getFlags() &
3264 ~(MachineMemOperand::MOStore | MachineMemOperand::MOLoad);
3265 LoadMMO = MF->getMachineMemOperand(PtrInfo: LoadPtrI, F: F | MachineMemOperand::MOLoad,
3266 Size, BaseAlignment: LoadMMO->getBaseAlign());
3267
3268 MachineMemOperand *StoreMMO =
3269 MF->getMachineMemOperand(PtrInfo: StorePtrI, F: F | MachineMemOperand::MOStore,
3270 Size: sizeof(int32_t), BaseAlignment: LoadMMO->getBaseAlign());
3271
3272 MIB.setMemRefs({LoadMMO, StoreMMO});
3273
3274 MI.eraseFromParent();
3275 return constrainSelectedInstRegOperands(I&: *MIB, TII, TRI, RBI);
3276}
3277
3278/// Match a zero extend from a 32-bit value to 64 bits.
3279static Register matchZeroExtendFromS32(MachineRegisterInfo &MRI, Register Reg) {
3280 Register ZExtSrc;
3281 if (mi_match(R: Reg, MRI, P: m_GZExt(Src: m_Reg(R&: ZExtSrc))))
3282 return MRI.getType(Reg: ZExtSrc) == LLT::scalar(SizeInBits: 32) ? ZExtSrc : Register();
3283
3284 // Match legalized form %zext = G_MERGE_VALUES (s32 %x), (s32 0)
3285 const MachineInstr *Def = getDefIgnoringCopies(Reg, MRI);
3286 if (Def->getOpcode() != AMDGPU::G_MERGE_VALUES)
3287 return Register();
3288
3289 assert(Def->getNumOperands() == 3 &&
3290 MRI.getType(Def->getOperand(0).getReg()) == LLT::scalar(64));
3291 if (mi_match(R: Def->getOperand(i: 2).getReg(), MRI, P: m_ZeroInt())) {
3292 return Def->getOperand(i: 1).getReg();
3293 }
3294
3295 return Register();
3296}
3297
3298bool AMDGPUInstructionSelector::selectGlobalLoadLds(MachineInstr &MI) const {
3299 unsigned Opc;
3300 unsigned Size = MI.getOperand(i: 3).getImm();
3301
3302 switch (Size) {
3303 default:
3304 return false;
3305 case 1:
3306 Opc = AMDGPU::GLOBAL_LOAD_LDS_UBYTE;
3307 break;
3308 case 2:
3309 Opc = AMDGPU::GLOBAL_LOAD_LDS_USHORT;
3310 break;
3311 case 4:
3312 Opc = AMDGPU::GLOBAL_LOAD_LDS_DWORD;
3313 break;
3314 }
3315
3316 MachineBasicBlock *MBB = MI.getParent();
3317 const DebugLoc &DL = MI.getDebugLoc();
3318 BuildMI(BB&: *MBB, I: &MI, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::COPY), DestReg: AMDGPU::M0)
3319 .add(MO: MI.getOperand(i: 2));
3320
3321 Register Addr = MI.getOperand(i: 1).getReg();
3322 Register VOffset;
3323 // Try to split SAddr and VOffset. Global and LDS pointers share the same
3324 // immediate offset, so we cannot use a regular SelectGlobalSAddr().
3325 if (!isSGPR(Reg: Addr)) {
3326 auto AddrDef = getDefSrcRegIgnoringCopies(Reg: Addr, MRI: *MRI);
3327 if (isSGPR(Reg: AddrDef->Reg)) {
3328 Addr = AddrDef->Reg;
3329 } else if (AddrDef->MI->getOpcode() == AMDGPU::G_PTR_ADD) {
3330 Register SAddr =
3331 getSrcRegIgnoringCopies(Reg: AddrDef->MI->getOperand(i: 1).getReg(), MRI: *MRI);
3332 if (isSGPR(Reg: SAddr)) {
3333 Register PtrBaseOffset = AddrDef->MI->getOperand(i: 2).getReg();
3334 if (Register Off = matchZeroExtendFromS32(MRI&: *MRI, Reg: PtrBaseOffset)) {
3335 Addr = SAddr;
3336 VOffset = Off;
3337 }
3338 }
3339 }
3340 }
3341
3342 if (isSGPR(Reg: Addr)) {
3343 Opc = AMDGPU::getGlobalSaddrOp(Opcode: Opc);
3344 if (!VOffset) {
3345 VOffset = MRI->createVirtualRegister(RegClass: &AMDGPU::VGPR_32RegClass);
3346 BuildMI(BB&: *MBB, I: &MI, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::V_MOV_B32_e32), DestReg: VOffset)
3347 .addImm(Val: 0);
3348 }
3349 }
3350
3351 auto MIB = BuildMI(BB&: *MBB, I: &MI, MIMD: DL, MCID: TII.get(Opcode: Opc))
3352 .addReg(RegNo: Addr);
3353
3354 if (isSGPR(Reg: Addr))
3355 MIB.addReg(RegNo: VOffset);
3356
3357 MIB.add(MO: MI.getOperand(i: 4)) // offset
3358 .add(MO: MI.getOperand(i: 5)); // cpol
3359
3360 MachineMemOperand *LoadMMO = *MI.memoperands_begin();
3361 MachinePointerInfo LoadPtrI = LoadMMO->getPointerInfo();
3362 LoadPtrI.Offset = MI.getOperand(i: 4).getImm();
3363 MachinePointerInfo StorePtrI = LoadPtrI;
3364 LoadPtrI.AddrSpace = AMDGPUAS::GLOBAL_ADDRESS;
3365 StorePtrI.AddrSpace = AMDGPUAS::LOCAL_ADDRESS;
3366 auto F = LoadMMO->getFlags() &
3367 ~(MachineMemOperand::MOStore | MachineMemOperand::MOLoad);
3368 LoadMMO = MF->getMachineMemOperand(PtrInfo: LoadPtrI, F: F | MachineMemOperand::MOLoad,
3369 Size, BaseAlignment: LoadMMO->getBaseAlign());
3370 MachineMemOperand *StoreMMO =
3371 MF->getMachineMemOperand(PtrInfo: StorePtrI, F: F | MachineMemOperand::MOStore,
3372 Size: sizeof(int32_t), BaseAlignment: Align(4));
3373
3374 MIB.setMemRefs({LoadMMO, StoreMMO});
3375
3376 MI.eraseFromParent();
3377 return constrainSelectedInstRegOperands(I&: *MIB, TII, TRI, RBI);
3378}
3379
3380bool AMDGPUInstructionSelector::selectBVHIntrinsic(MachineInstr &MI) const {
3381 MI.setDesc(TII.get(Opcode: MI.getOperand(i: 1).getImm()));
3382 MI.removeOperand(OpNo: 1);
3383 MI.addImplicitDefUseOperands(MF&: *MI.getParent()->getParent());
3384 return true;
3385}
3386
3387bool AMDGPUInstructionSelector::selectSMFMACIntrin(MachineInstr &MI) const {
3388 unsigned Opc;
3389 switch (cast<GIntrinsic>(Val&: MI).getIntrinsicID()) {
3390 case Intrinsic::amdgcn_smfmac_f32_16x16x32_f16:
3391 Opc = AMDGPU::V_SMFMAC_F32_16X16X32_F16_e64;
3392 break;
3393 case Intrinsic::amdgcn_smfmac_f32_32x32x16_f16:
3394 Opc = AMDGPU::V_SMFMAC_F32_32X32X16_F16_e64;
3395 break;
3396 case Intrinsic::amdgcn_smfmac_f32_16x16x32_bf16:
3397 Opc = AMDGPU::V_SMFMAC_F32_16X16X32_BF16_e64;
3398 break;
3399 case Intrinsic::amdgcn_smfmac_f32_32x32x16_bf16:
3400 Opc = AMDGPU::V_SMFMAC_F32_32X32X16_BF16_e64;
3401 break;
3402 case Intrinsic::amdgcn_smfmac_i32_16x16x64_i8:
3403 Opc = AMDGPU::V_SMFMAC_I32_16X16X64_I8_e64;
3404 break;
3405 case Intrinsic::amdgcn_smfmac_i32_32x32x32_i8:
3406 Opc = AMDGPU::V_SMFMAC_I32_32X32X32_I8_e64;
3407 break;
3408 case Intrinsic::amdgcn_smfmac_f32_16x16x64_bf8_bf8:
3409 Opc = AMDGPU::V_SMFMAC_F32_16X16X64_BF8_BF8_e64;
3410 break;
3411 case Intrinsic::amdgcn_smfmac_f32_16x16x64_bf8_fp8:
3412 Opc = AMDGPU::V_SMFMAC_F32_16X16X64_BF8_FP8_e64;
3413 break;
3414 case Intrinsic::amdgcn_smfmac_f32_16x16x64_fp8_bf8:
3415 Opc = AMDGPU::V_SMFMAC_F32_16X16X64_FP8_BF8_e64;
3416 break;
3417 case Intrinsic::amdgcn_smfmac_f32_16x16x64_fp8_fp8:
3418 Opc = AMDGPU::V_SMFMAC_F32_16X16X64_FP8_FP8_e64;
3419 break;
3420 case Intrinsic::amdgcn_smfmac_f32_32x32x32_bf8_bf8:
3421 Opc = AMDGPU::V_SMFMAC_F32_32X32X32_BF8_BF8_e64;
3422 break;
3423 case Intrinsic::amdgcn_smfmac_f32_32x32x32_bf8_fp8:
3424 Opc = AMDGPU::V_SMFMAC_F32_32X32X32_BF8_FP8_e64;
3425 break;
3426 case Intrinsic::amdgcn_smfmac_f32_32x32x32_fp8_bf8:
3427 Opc = AMDGPU::V_SMFMAC_F32_32X32X32_FP8_BF8_e64;
3428 break;
3429 case Intrinsic::amdgcn_smfmac_f32_32x32x32_fp8_fp8:
3430 Opc = AMDGPU::V_SMFMAC_F32_32X32X32_FP8_FP8_e64;
3431 break;
3432 default:
3433 llvm_unreachable("unhandled smfmac intrinsic");
3434 }
3435
3436 auto VDst_In = MI.getOperand(i: 4);
3437
3438 MI.setDesc(TII.get(Opcode: Opc));
3439 MI.removeOperand(OpNo: 4); // VDst_In
3440 MI.removeOperand(OpNo: 1); // Intrinsic ID
3441 MI.addOperand(Op: VDst_In); // Readd VDst_In to the end
3442 MI.addImplicitDefUseOperands(MF&: *MI.getParent()->getParent());
3443 return true;
3444}
3445
3446bool AMDGPUInstructionSelector::selectWaveAddress(MachineInstr &MI) const {
3447 Register DstReg = MI.getOperand(i: 0).getReg();
3448 Register SrcReg = MI.getOperand(i: 1).getReg();
3449 const RegisterBank *DstRB = RBI.getRegBank(Reg: DstReg, MRI: *MRI, TRI);
3450 const bool IsVALU = DstRB->getID() == AMDGPU::VGPRRegBankID;
3451 MachineBasicBlock *MBB = MI.getParent();
3452 const DebugLoc &DL = MI.getDebugLoc();
3453
3454 if (IsVALU) {
3455 BuildMI(BB&: *MBB, I&: MI, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::V_LSHRREV_B32_e64), DestReg: DstReg)
3456 .addImm(Val: Subtarget->getWavefrontSizeLog2())
3457 .addReg(RegNo: SrcReg);
3458 } else {
3459 BuildMI(BB&: *MBB, I&: MI, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::S_LSHR_B32), DestReg: DstReg)
3460 .addReg(RegNo: SrcReg)
3461 .addImm(Val: Subtarget->getWavefrontSizeLog2())
3462 .setOperandDead(3); // Dead scc
3463 }
3464
3465 const TargetRegisterClass &RC =
3466 IsVALU ? AMDGPU::VGPR_32RegClass : AMDGPU::SReg_32RegClass;
3467 if (!RBI.constrainGenericRegister(Reg: DstReg, RC, MRI&: *MRI))
3468 return false;
3469
3470 MI.eraseFromParent();
3471 return true;
3472}
3473
3474bool AMDGPUInstructionSelector::selectStackRestore(MachineInstr &MI) const {
3475 Register SrcReg = MI.getOperand(i: 0).getReg();
3476 if (!RBI.constrainGenericRegister(Reg: SrcReg, RC: AMDGPU::SReg_32RegClass, MRI&: *MRI))
3477 return false;
3478
3479 MachineInstr *DefMI = MRI->getVRegDef(Reg: SrcReg);
3480 Register SP =
3481 Subtarget->getTargetLowering()->getStackPointerRegisterToSaveRestore();
3482 Register WaveAddr = getWaveAddress(Def: DefMI);
3483 MachineBasicBlock *MBB = MI.getParent();
3484 const DebugLoc &DL = MI.getDebugLoc();
3485
3486 if (!WaveAddr) {
3487 WaveAddr = MRI->createVirtualRegister(RegClass: &AMDGPU::SReg_32RegClass);
3488 BuildMI(BB&: *MBB, I&: MI, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::S_LSHR_B32), DestReg: WaveAddr)
3489 .addReg(RegNo: SrcReg)
3490 .addImm(Val: Subtarget->getWavefrontSizeLog2())
3491 .setOperandDead(3); // Dead scc
3492 }
3493
3494 BuildMI(BB&: *MBB, I: &MI, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::COPY), DestReg: SP)
3495 .addReg(RegNo: WaveAddr);
3496
3497 MI.eraseFromParent();
3498 return true;
3499}
3500
3501bool AMDGPUInstructionSelector::select(MachineInstr &I) {
3502
3503 if (!I.isPreISelOpcode()) {
3504 if (I.isCopy())
3505 return selectCOPY(I);
3506 return true;
3507 }
3508
3509 switch (I.getOpcode()) {
3510 case TargetOpcode::G_AND:
3511 case TargetOpcode::G_OR:
3512 case TargetOpcode::G_XOR:
3513 if (selectImpl(I, CoverageInfo&: *CoverageInfo))
3514 return true;
3515 return selectG_AND_OR_XOR(I);
3516 case TargetOpcode::G_ADD:
3517 case TargetOpcode::G_SUB:
3518 case TargetOpcode::G_PTR_ADD:
3519 if (selectImpl(I, CoverageInfo&: *CoverageInfo))
3520 return true;
3521 return selectG_ADD_SUB(I);
3522 case TargetOpcode::G_UADDO:
3523 case TargetOpcode::G_USUBO:
3524 case TargetOpcode::G_UADDE:
3525 case TargetOpcode::G_USUBE:
3526 return selectG_UADDO_USUBO_UADDE_USUBE(I);
3527 case AMDGPU::G_AMDGPU_MAD_U64_U32:
3528 case AMDGPU::G_AMDGPU_MAD_I64_I32:
3529 return selectG_AMDGPU_MAD_64_32(I);
3530 case TargetOpcode::G_INTTOPTR:
3531 case TargetOpcode::G_BITCAST:
3532 case TargetOpcode::G_PTRTOINT:
3533 case TargetOpcode::G_FREEZE:
3534 return selectCOPY(I);
3535 case TargetOpcode::G_CONSTANT:
3536 case TargetOpcode::G_FCONSTANT:
3537 return selectG_CONSTANT(I);
3538 case TargetOpcode::G_FNEG:
3539 if (selectImpl(I, CoverageInfo&: *CoverageInfo))
3540 return true;
3541 return selectG_FNEG(MI&: I);
3542 case TargetOpcode::G_FABS:
3543 if (selectImpl(I, CoverageInfo&: *CoverageInfo))
3544 return true;
3545 return selectG_FABS(MI&: I);
3546 case TargetOpcode::G_EXTRACT:
3547 return selectG_EXTRACT(I);
3548 case TargetOpcode::G_MERGE_VALUES:
3549 case TargetOpcode::G_CONCAT_VECTORS:
3550 return selectG_MERGE_VALUES(MI&: I);
3551 case TargetOpcode::G_UNMERGE_VALUES:
3552 return selectG_UNMERGE_VALUES(MI&: I);
3553 case TargetOpcode::G_BUILD_VECTOR:
3554 case TargetOpcode::G_BUILD_VECTOR_TRUNC:
3555 return selectG_BUILD_VECTOR(MI&: I);
3556 case TargetOpcode::G_IMPLICIT_DEF:
3557 return selectG_IMPLICIT_DEF(I);
3558 case TargetOpcode::G_INSERT:
3559 return selectG_INSERT(I);
3560 case TargetOpcode::G_INTRINSIC:
3561 case TargetOpcode::G_INTRINSIC_CONVERGENT:
3562 return selectG_INTRINSIC(I);
3563 case TargetOpcode::G_INTRINSIC_W_SIDE_EFFECTS:
3564 case TargetOpcode::G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS:
3565 return selectG_INTRINSIC_W_SIDE_EFFECTS(I);
3566 case TargetOpcode::G_ICMP:
3567 case TargetOpcode::G_FCMP:
3568 if (selectG_ICMP_or_FCMP(I))
3569 return true;
3570 return selectImpl(I, CoverageInfo&: *CoverageInfo);
3571 case TargetOpcode::G_LOAD:
3572 case TargetOpcode::G_STORE:
3573 case TargetOpcode::G_ATOMIC_CMPXCHG:
3574 case TargetOpcode::G_ATOMICRMW_XCHG:
3575 case TargetOpcode::G_ATOMICRMW_ADD:
3576 case TargetOpcode::G_ATOMICRMW_SUB:
3577 case TargetOpcode::G_ATOMICRMW_AND:
3578 case TargetOpcode::G_ATOMICRMW_OR:
3579 case TargetOpcode::G_ATOMICRMW_XOR:
3580 case TargetOpcode::G_ATOMICRMW_MIN:
3581 case TargetOpcode::G_ATOMICRMW_MAX:
3582 case TargetOpcode::G_ATOMICRMW_UMIN:
3583 case TargetOpcode::G_ATOMICRMW_UMAX:
3584 case TargetOpcode::G_ATOMICRMW_UINC_WRAP:
3585 case TargetOpcode::G_ATOMICRMW_UDEC_WRAP:
3586 case TargetOpcode::G_ATOMICRMW_FADD:
3587 case TargetOpcode::G_ATOMICRMW_FMIN:
3588 case TargetOpcode::G_ATOMICRMW_FMAX:
3589 return selectG_LOAD_STORE_ATOMICRMW(I);
3590 case TargetOpcode::G_SELECT:
3591 return selectG_SELECT(I);
3592 case TargetOpcode::G_TRUNC:
3593 return selectG_TRUNC(I);
3594 case TargetOpcode::G_SEXT:
3595 case TargetOpcode::G_ZEXT:
3596 case TargetOpcode::G_ANYEXT:
3597 case TargetOpcode::G_SEXT_INREG:
3598    // This is a workaround. For an extension from type i1, `selectImpl()` uses
3599    // patterns from the TD file and generates an illegal VGPR-to-SGPR COPY,
3600    // since type i1 can only be held in an SGPR class.
3601 if (MRI->getType(Reg: I.getOperand(i: 1).getReg()) != LLT::scalar(SizeInBits: 1) &&
3602 selectImpl(I, CoverageInfo&: *CoverageInfo))
3603 return true;
3604 return selectG_SZA_EXT(I);
3605 case TargetOpcode::G_FPEXT:
3606 if (selectG_FPEXT(I))
3607 return true;
3608 return selectImpl(I, CoverageInfo&: *CoverageInfo);
3609 case TargetOpcode::G_BRCOND:
3610 return selectG_BRCOND(I);
3611 case TargetOpcode::G_GLOBAL_VALUE:
3612 return selectG_GLOBAL_VALUE(I);
3613 case TargetOpcode::G_PTRMASK:
3614 return selectG_PTRMASK(I);
3615 case TargetOpcode::G_EXTRACT_VECTOR_ELT:
3616 return selectG_EXTRACT_VECTOR_ELT(MI&: I);
3617 case TargetOpcode::G_INSERT_VECTOR_ELT:
3618 return selectG_INSERT_VECTOR_ELT(MI&: I);
3619 case AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD:
3620 case AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD_D16:
3621 case AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD_NORET:
3622 case AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE:
3623 case AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE_D16: {
3624 const AMDGPU::ImageDimIntrinsicInfo *Intr =
3625 AMDGPU::getImageDimIntrinsicInfo(Intr: AMDGPU::getIntrinsicID(I));
3626 assert(Intr && "not an image intrinsic with image pseudo");
3627 return selectImageIntrinsic(MI&: I, Intr);
3628 }
3629 case AMDGPU::G_AMDGPU_INTRIN_BVH_INTERSECT_RAY:
3630 return selectBVHIntrinsic(MI&: I);
3631 case AMDGPU::G_SBFX:
3632 case AMDGPU::G_UBFX:
3633 return selectG_SBFX_UBFX(MI&: I);
3634 case AMDGPU::G_SI_CALL:
3635 I.setDesc(TII.get(Opcode: AMDGPU::SI_CALL));
3636 return true;
3637 case AMDGPU::G_AMDGPU_WAVE_ADDRESS:
3638 return selectWaveAddress(MI&: I);
3639 case AMDGPU::G_STACKRESTORE:
3640 return selectStackRestore(MI&: I);
3641 case AMDGPU::G_PHI:
3642 return selectPHI(I);
3643 default:
3644 return selectImpl(I, CoverageInfo&: *CoverageInfo);
3645 }
3646 return false;
3647}
3648
3649InstructionSelector::ComplexRendererFns
3650AMDGPUInstructionSelector::selectVCSRC(MachineOperand &Root) const {
3651 return {{
3652 [=](MachineInstrBuilder &MIB) { MIB.add(MO: Root); }
3653 }};
3654
3655}
3656
3657std::pair<Register, unsigned>
3658AMDGPUInstructionSelector::selectVOP3ModsImpl(MachineOperand &Root,
3659 bool IsCanonicalizing,
3660 bool AllowAbs, bool OpSel) const {
3661 Register Src = Root.getReg();
3662 unsigned Mods = 0;
3663 MachineInstr *MI = getDefIgnoringCopies(Reg: Src, MRI: *MRI);
3664
3665 if (MI->getOpcode() == AMDGPU::G_FNEG) {
3666 Src = MI->getOperand(i: 1).getReg();
3667 Mods |= SISrcMods::NEG;
3668 MI = getDefIgnoringCopies(Reg: Src, MRI: *MRI);
3669 } else if (MI->getOpcode() == AMDGPU::G_FSUB && IsCanonicalizing) {
3670 // Fold fsub [+-]0 into fneg. This may not have folded depending on the
3671 // denormal mode, but we're implicitly canonicalizing in a source operand.
3672 const ConstantFP *LHS =
3673 getConstantFPVRegVal(VReg: MI->getOperand(i: 1).getReg(), MRI: *MRI);
3674 if (LHS && LHS->isZero()) {
3675 Mods |= SISrcMods::NEG;
3676 Src = MI->getOperand(i: 2).getReg();
3677 }
3678 }
3679
3680 if (AllowAbs && MI->getOpcode() == AMDGPU::G_FABS) {
3681 Src = MI->getOperand(i: 1).getReg();
3682 Mods |= SISrcMods::ABS;
3683 }
3684
3685 if (OpSel)
3686 Mods |= SISrcMods::OP_SEL_0;
3687
3688 return std::pair(Src, Mods);
3689}
3690
3691Register AMDGPUInstructionSelector::copyToVGPRIfSrcFolded(
3692 Register Src, unsigned Mods, MachineOperand Root, MachineInstr *InsertPt,
3693 bool ForceVGPR) const {
3694 if ((Mods != 0 || ForceVGPR) &&
3695 RBI.getRegBank(Reg: Src, MRI: *MRI, TRI)->getID() != AMDGPU::VGPRRegBankID) {
3696
3697 // If we looked through copies to find source modifiers on an SGPR operand,
3698 // we now have an SGPR register source. To avoid potentially violating the
3699 // constant bus restriction, we need to insert a copy to a VGPR.
3700 Register VGPRSrc = MRI->cloneVirtualRegister(VReg: Root.getReg());
3701 BuildMI(BB&: *InsertPt->getParent(), I: InsertPt, MIMD: InsertPt->getDebugLoc(),
3702 MCID: TII.get(Opcode: AMDGPU::COPY), DestReg: VGPRSrc)
3703 .addReg(RegNo: Src);
3704 Src = VGPRSrc;
3705 }
3706
3707 return Src;
3708}
3709
3710///
3711/// This will select either an SGPR or VGPR operand and will save us from
3712/// having to write an extra tablegen pattern.
3713InstructionSelector::ComplexRendererFns
3714AMDGPUInstructionSelector::selectVSRC0(MachineOperand &Root) const {
3715 return {{
3716 [=](MachineInstrBuilder &MIB) { MIB.add(MO: Root); }
3717 }};
3718}
3719
3720InstructionSelector::ComplexRendererFns
3721AMDGPUInstructionSelector::selectVOP3Mods0(MachineOperand &Root) const {
3722 Register Src;
3723 unsigned Mods;
3724 std::tie(args&: Src, args&: Mods) = selectVOP3ModsImpl(Root);
3725
3726 return {{
3727 [=](MachineInstrBuilder &MIB) {
3728 MIB.addReg(RegNo: copyToVGPRIfSrcFolded(Src, Mods, Root, InsertPt: MIB));
3729 },
3730 [=](MachineInstrBuilder &MIB) { MIB.addImm(Val: Mods); }, // src0_mods
3731 [=](MachineInstrBuilder &MIB) { MIB.addImm(Val: 0); }, // clamp
3732 [=](MachineInstrBuilder &MIB) { MIB.addImm(Val: 0); } // omod
3733 }};
3734}
3735
3736InstructionSelector::ComplexRendererFns
3737AMDGPUInstructionSelector::selectVOP3BMods0(MachineOperand &Root) const {
3738 Register Src;
3739 unsigned Mods;
3740 std::tie(args&: Src, args&: Mods) = selectVOP3ModsImpl(Root,
3741 /*IsCanonicalizing=*/true,
3742 /*AllowAbs=*/false);
3743
3744 return {{
3745 [=](MachineInstrBuilder &MIB) {
3746 MIB.addReg(RegNo: copyToVGPRIfSrcFolded(Src, Mods, Root, InsertPt: MIB));
3747 },
3748 [=](MachineInstrBuilder &MIB) { MIB.addImm(Val: Mods); }, // src0_mods
3749 [=](MachineInstrBuilder &MIB) { MIB.addImm(Val: 0); }, // clamp
3750 [=](MachineInstrBuilder &MIB) { MIB.addImm(Val: 0); } // omod
3751 }};
3752}
3753
3754InstructionSelector::ComplexRendererFns
3755AMDGPUInstructionSelector::selectVOP3OMods(MachineOperand &Root) const {
3756 return {{
3757 [=](MachineInstrBuilder &MIB) { MIB.add(MO: Root); },
3758 [=](MachineInstrBuilder &MIB) { MIB.addImm(Val: 0); }, // clamp
3759 [=](MachineInstrBuilder &MIB) { MIB.addImm(Val: 0); } // omod
3760 }};
3761}
3762
3763InstructionSelector::ComplexRendererFns
3764AMDGPUInstructionSelector::selectVOP3Mods(MachineOperand &Root) const {
3765 Register Src;
3766 unsigned Mods;
3767 std::tie(args&: Src, args&: Mods) = selectVOP3ModsImpl(Root);
3768
3769 return {{
3770 [=](MachineInstrBuilder &MIB) {
3771 MIB.addReg(RegNo: copyToVGPRIfSrcFolded(Src, Mods, Root, InsertPt: MIB));
3772 },
3773 [=](MachineInstrBuilder &MIB) { MIB.addImm(Val: Mods); } // src_mods
3774 }};
3775}
3776
3777InstructionSelector::ComplexRendererFns
3778AMDGPUInstructionSelector::selectVOP3ModsNonCanonicalizing(
3779 MachineOperand &Root) const {
3780 Register Src;
3781 unsigned Mods;
3782 std::tie(args&: Src, args&: Mods) = selectVOP3ModsImpl(Root, /*IsCanonicalizing=*/false);
3783
3784 return {{
3785 [=](MachineInstrBuilder &MIB) {
3786 MIB.addReg(RegNo: copyToVGPRIfSrcFolded(Src, Mods, Root, InsertPt: MIB));
3787 },
3788 [=](MachineInstrBuilder &MIB) { MIB.addImm(Val: Mods); } // src_mods
3789 }};
3790}
3791
3792InstructionSelector::ComplexRendererFns
3793AMDGPUInstructionSelector::selectVOP3BMods(MachineOperand &Root) const {
3794 Register Src;
3795 unsigned Mods;
3796 std::tie(args&: Src, args&: Mods) = selectVOP3ModsImpl(Root, /*IsCanonicalizing=*/true,
3797 /*AllowAbs=*/false);
3798
3799 return {{
3800 [=](MachineInstrBuilder &MIB) {
3801 MIB.addReg(RegNo: copyToVGPRIfSrcFolded(Src, Mods, Root, InsertPt: MIB));
3802 },
3803 [=](MachineInstrBuilder &MIB) { MIB.addImm(Val: Mods); } // src_mods
3804 }};
3805}
3806
3807InstructionSelector::ComplexRendererFns
3808AMDGPUInstructionSelector::selectVOP3NoMods(MachineOperand &Root) const {
3809 Register Reg = Root.getReg();
3810 const MachineInstr *Def = getDefIgnoringCopies(Reg, MRI: *MRI);
3811 if (Def->getOpcode() == AMDGPU::G_FNEG || Def->getOpcode() == AMDGPU::G_FABS)
3812 return {};
3813 return {{
3814 [=](MachineInstrBuilder &MIB) { MIB.addReg(RegNo: Reg); },
3815 }};
3816}
3817
3818std::pair<Register, unsigned>
3819AMDGPUInstructionSelector::selectVOP3PModsImpl(
3820 Register Src, const MachineRegisterInfo &MRI, bool IsDOT) const {
3821 unsigned Mods = 0;
3822 MachineInstr *MI = MRI.getVRegDef(Reg: Src);
3823
3824 if (MI && MI->getOpcode() == AMDGPU::G_FNEG &&
3825 // It's possible to see an f32 fneg here, but unlikely.
3826 // TODO: Treat f32 fneg as only high bit.
3827 MRI.getType(Reg: Src) == LLT::fixed_vector(NumElements: 2, ScalarSizeInBits: 16)) {
3828 Mods ^= (SISrcMods::NEG | SISrcMods::NEG_HI);
3829 Src = MI->getOperand(i: 1).getReg();
3830 MI = MRI.getVRegDef(Reg: Src);
3831 }
3832
3833 // TODO: Handle G_FSUB 0 as fneg
3834
3835 // TODO: Match op_sel through g_build_vector_trunc and g_shuffle_vector.
3836 (void)IsDOT; // DOTs do not use OPSEL on gfx940+, check ST.hasDOTOpSelHazard()
3837
3838 // Packed instructions do not have abs modifiers.
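  // OP_SEL_1 here is the op_sel_hi bit; setting it keeps each operand's high
  // half feeding the high result, which appears to be the neutral default for
  // packed operations.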
3839 Mods |= SISrcMods::OP_SEL_1;
3840
3841 return std::pair(Src, Mods);
3842}
3843
3844InstructionSelector::ComplexRendererFns
3845AMDGPUInstructionSelector::selectVOP3PMods(MachineOperand &Root) const {
3846 MachineRegisterInfo &MRI
3847 = Root.getParent()->getParent()->getParent()->getRegInfo();
3848
3849 Register Src;
3850 unsigned Mods;
3851 std::tie(args&: Src, args&: Mods) = selectVOP3PModsImpl(Src: Root.getReg(), MRI);
3852
3853 return {{
3854 [=](MachineInstrBuilder &MIB) { MIB.addReg(RegNo: Src); },
3855 [=](MachineInstrBuilder &MIB) { MIB.addImm(Val: Mods); } // src_mods
3856 }};
3857}
3858
3859InstructionSelector::ComplexRendererFns
3860AMDGPUInstructionSelector::selectVOP3PModsDOT(MachineOperand &Root) const {
3861 MachineRegisterInfo &MRI
3862 = Root.getParent()->getParent()->getParent()->getRegInfo();
3863
3864 Register Src;
3865 unsigned Mods;
3866 std::tie(args&: Src, args&: Mods) = selectVOP3PModsImpl(Src: Root.getReg(), MRI, IsDOT: true);
3867
3868 return {{
3869 [=](MachineInstrBuilder &MIB) { MIB.addReg(RegNo: Src); },
3870 [=](MachineInstrBuilder &MIB) { MIB.addImm(Val: Mods); } // src_mods
3871 }};
3872}
3873
3874InstructionSelector::ComplexRendererFns
3875AMDGPUInstructionSelector::selectVOP3PModsNeg(MachineOperand &Root) const {
3876  // A literal i1 value set in the intrinsic represents SrcMods for the next
3877  // operand; it is in the Imm operand as an i1 sign-extended to int64_t.
3878  // 1 (i.e. -1) promotes packed values to signed, 0 treats them as unsigned.
3879 assert((Root.isImm() && (Root.getImm() == -1 || Root.getImm() == 0)) &&
3880 "expected i1 value");
3881 unsigned Mods = SISrcMods::OP_SEL_1;
3882 if (Root.getImm() == -1)
3883 Mods ^= SISrcMods::NEG;
3884 return {{
3885 [=](MachineInstrBuilder &MIB) { MIB.addImm(Val: Mods); } // src_mods
3886 }};
3887}
3888
3889InstructionSelector::ComplexRendererFns
3890AMDGPUInstructionSelector::selectWMMAOpSelVOP3PMods(
3891 MachineOperand &Root) const {
3892 assert((Root.isImm() && (Root.getImm() == -1 || Root.getImm() == 0)) &&
3893 "expected i1 value");
3894 unsigned Mods = SISrcMods::OP_SEL_1;
3895 if (Root.getImm() != 0)
3896 Mods |= SISrcMods::OP_SEL_0;
3897
3898 return {{
3899 [=](MachineInstrBuilder &MIB) { MIB.addImm(Val: Mods); } // src_mods
3900 }};
3901}
3902
3903static Register buildRegSequence(SmallVectorImpl<Register> &Elts,
3904 MachineInstr *InsertPt,
3905 MachineRegisterInfo &MRI) {
3906 const TargetRegisterClass *DstRegClass;
3907 switch (Elts.size()) {
3908 case 8:
3909 DstRegClass = &AMDGPU::VReg_256RegClass;
3910 break;
3911 case 4:
3912 DstRegClass = &AMDGPU::VReg_128RegClass;
3913 break;
3914 case 2:
3915 DstRegClass = &AMDGPU::VReg_64RegClass;
3916 break;
3917 default:
3918 llvm_unreachable("unhandled Reg sequence size");
3919 }
3920
3921 MachineIRBuilder B(*InsertPt);
3922 auto MIB = B.buildInstr(Opcode: AMDGPU::REG_SEQUENCE)
3923 .addDef(RegNo: MRI.createVirtualRegister(RegClass: DstRegClass));
3924 for (unsigned i = 0; i < Elts.size(); ++i) {
3925 MIB.addReg(RegNo: Elts[i]);
3926 MIB.addImm(Val: SIRegisterInfo::getSubRegFromChannel(Channel: i));
3927 }
3928 return MIB->getOperand(i: 0).getReg();
3929}
3930
3931static void selectWMMAModsNegAbs(unsigned ModOpcode, unsigned &Mods,
3932 SmallVectorImpl<Register> &Elts, Register &Src,
3933 MachineInstr *InsertPt,
3934 MachineRegisterInfo &MRI) {
3935 if (ModOpcode == TargetOpcode::G_FNEG) {
3936 Mods |= SISrcMods::NEG;
3937 // Check if all elements also have abs modifier
3938 SmallVector<Register, 8> NegAbsElts;
3939 for (auto El : Elts) {
3940 Register FabsSrc;
3941 if (!mi_match(R: El, MRI, P: m_GFabs(Src: m_Reg(R&: FabsSrc))))
3942 break;
3943 NegAbsElts.push_back(Elt: FabsSrc);
3944 }
3945 if (Elts.size() != NegAbsElts.size()) {
3946 // Neg
3947 Src = buildRegSequence(Elts, InsertPt, MRI);
3948 } else {
3949 // Neg and Abs
3950 Mods |= SISrcMods::NEG_HI;
3951 Src = buildRegSequence(Elts&: NegAbsElts, InsertPt, MRI);
3952 }
3953 } else {
3954 assert(ModOpcode == TargetOpcode::G_FABS);
3955 // Abs
3956 Mods |= SISrcMods::NEG_HI;
3957 Src = buildRegSequence(Elts, InsertPt, MRI);
3958 }
3959}
3960
3961InstructionSelector::ComplexRendererFns
3962AMDGPUInstructionSelector::selectWMMAModsF32NegAbs(MachineOperand &Root) const {
3963 Register Src = Root.getReg();
3964 unsigned Mods = SISrcMods::OP_SEL_1;
3965 SmallVector<Register, 8> EltsF32;
3966
3967 if (GBuildVector *BV = dyn_cast<GBuildVector>(Val: MRI->getVRegDef(Reg: Src))) {
3968 assert(BV->getNumSources() > 0);
3969    // Based on the first element, decide which modifier we match: neg or abs.
3970 MachineInstr *ElF32 = MRI->getVRegDef(Reg: BV->getSourceReg(I: 0));
3971 unsigned ModOpcode = (ElF32->getOpcode() == AMDGPU::G_FNEG)
3972 ? AMDGPU::G_FNEG
3973 : AMDGPU::G_FABS;
3974 for (unsigned i = 0; i < BV->getNumSources(); ++i) {
3975 ElF32 = MRI->getVRegDef(Reg: BV->getSourceReg(I: i));
3976 if (ElF32->getOpcode() != ModOpcode)
3977 break;
3978 EltsF32.push_back(Elt: ElF32->getOperand(i: 1).getReg());
3979 }
3980
3981 // All elements had ModOpcode modifier
3982 if (BV->getNumSources() == EltsF32.size()) {
3983 selectWMMAModsNegAbs(ModOpcode, Mods, Elts&: EltsF32, Src, InsertPt: Root.getParent(),
3984 MRI&: *MRI);
3985 }
3986 }
3987
3988 return {{[=](MachineInstrBuilder &MIB) { MIB.addReg(RegNo: Src); },
3989 [=](MachineInstrBuilder &MIB) { MIB.addImm(Val: Mods); }}};
3990}
3991
3992InstructionSelector::ComplexRendererFns
3993AMDGPUInstructionSelector::selectWMMAModsF16Neg(MachineOperand &Root) const {
3994 Register Src = Root.getReg();
3995 unsigned Mods = SISrcMods::OP_SEL_1;
3996 SmallVector<Register, 8> EltsV2F16;
3997
3998 if (GConcatVectors *CV = dyn_cast<GConcatVectors>(Val: MRI->getVRegDef(Reg: Src))) {
3999 for (unsigned i = 0; i < CV->getNumSources(); ++i) {
4000 Register FNegSrc;
4001 if (!mi_match(R: CV->getSourceReg(I: i), MRI: *MRI, P: m_GFNeg(Src: m_Reg(R&: FNegSrc))))
4002 break;
4003 EltsV2F16.push_back(Elt: FNegSrc);
4004 }
4005
4006    // All elements had the fneg modifier.
4007 if (CV->getNumSources() == EltsV2F16.size()) {
4008 Mods |= SISrcMods::NEG;
4009 Mods |= SISrcMods::NEG_HI;
4010 Src = buildRegSequence(Elts&: EltsV2F16, InsertPt: Root.getParent(), MRI&: *MRI);
4011 }
4012 }
4013
4014 return {{[=](MachineInstrBuilder &MIB) { MIB.addReg(RegNo: Src); },
4015 [=](MachineInstrBuilder &MIB) { MIB.addImm(Val: Mods); }}};
4016}
4017
4018InstructionSelector::ComplexRendererFns
4019AMDGPUInstructionSelector::selectWMMAModsF16NegAbs(MachineOperand &Root) const {
4020 Register Src = Root.getReg();
4021 unsigned Mods = SISrcMods::OP_SEL_1;
4022 SmallVector<Register, 8> EltsV2F16;
4023
4024 if (GConcatVectors *CV = dyn_cast<GConcatVectors>(Val: MRI->getVRegDef(Reg: Src))) {
4025 assert(CV->getNumSources() > 0);
4026 MachineInstr *ElV2F16 = MRI->getVRegDef(Reg: CV->getSourceReg(I: 0));
4027    // Based on the first element, decide which modifier we match: neg or abs.
4028 unsigned ModOpcode = (ElV2F16->getOpcode() == AMDGPU::G_FNEG)
4029 ? AMDGPU::G_FNEG
4030 : AMDGPU::G_FABS;
4031
4032 for (unsigned i = 0; i < CV->getNumSources(); ++i) {
4033 ElV2F16 = MRI->getVRegDef(Reg: CV->getSourceReg(I: i));
4034 if (ElV2F16->getOpcode() != ModOpcode)
4035 break;
4036 EltsV2F16.push_back(Elt: ElV2F16->getOperand(i: 1).getReg());
4037 }
4038
4039 // All elements had ModOpcode modifier
4040 if (CV->getNumSources() == EltsV2F16.size()) {
4041 MachineIRBuilder B(*Root.getParent());
4042 selectWMMAModsNegAbs(ModOpcode, Mods, Elts&: EltsV2F16, Src, InsertPt: Root.getParent(),
4043 MRI&: *MRI);
4044 }
4045 }
4046
4047 return {{[=](MachineInstrBuilder &MIB) { MIB.addReg(RegNo: Src); },
4048 [=](MachineInstrBuilder &MIB) { MIB.addImm(Val: Mods); }}};
4049}
4050
4051InstructionSelector::ComplexRendererFns
4052AMDGPUInstructionSelector::selectWMMAVISrc(MachineOperand &Root) const {
4053 std::optional<FPValueAndVReg> FPValReg;
4054 if (mi_match(R: Root.getReg(), MRI: *MRI, P: m_GFCstOrSplat(FPValReg))) {
4055 if (TII.isInlineConstant(Imm: FPValReg->Value)) {
4056 return {{[=](MachineInstrBuilder &MIB) {
4057 MIB.addImm(Val: FPValReg->Value.bitcastToAPInt().getSExtValue());
4058 }}};
4059 }
4060    // Non-inlineable splat floats should not fall through to the integer
4061    // immediate checks.
4062 return {};
4063 }
4064
4065 APInt ICst;
4066 if (mi_match(R: Root.getReg(), MRI: *MRI, P: m_ICstOrSplat(Cst&: ICst))) {
4067 if (TII.isInlineConstant(Imm: ICst)) {
4068 return {
4069 {[=](MachineInstrBuilder &MIB) { MIB.addImm(Val: ICst.getSExtValue()); }}};
4070 }
4071 }
4072
4073 return {};
4074}
4075
4076InstructionSelector::ComplexRendererFns
4077AMDGPUInstructionSelector::selectSWMMACIndex8(MachineOperand &Root) const {
4078 Register Src =
4079 getDefIgnoringCopies(Reg: Root.getReg(), MRI: *MRI)->getOperand(i: 0).getReg();
4080 unsigned Key = 0;
4081
4082 Register ShiftSrc;
4083 std::optional<ValueAndVReg> ShiftAmt;
4084 if (mi_match(R: Src, MRI: *MRI, P: m_GLShr(L: m_Reg(R&: ShiftSrc), R: m_GCst(ValReg&: ShiftAmt))) &&
4085 MRI->getType(Reg: ShiftSrc).getSizeInBits() == 32 &&
4086 ShiftAmt->Value.getZExtValue() % 8 == 0) {
4087 Key = ShiftAmt->Value.getZExtValue() / 8;
4088 Src = ShiftSrc;
4089 }
4090
4091 return {{
4092 [=](MachineInstrBuilder &MIB) { MIB.addReg(RegNo: Src); },
4093 [=](MachineInstrBuilder &MIB) { MIB.addImm(Val: Key); } // index_key
4094 }};
4095}
4096
4097InstructionSelector::ComplexRendererFns
4098AMDGPUInstructionSelector::selectSWMMACIndex16(MachineOperand &Root) const {
4099
4100 Register Src =
4101 getDefIgnoringCopies(Reg: Root.getReg(), MRI: *MRI)->getOperand(i: 0).getReg();
4102 unsigned Key = 0;
4103
4104 Register ShiftSrc;
4105 std::optional<ValueAndVReg> ShiftAmt;
4106 if (mi_match(R: Src, MRI: *MRI, P: m_GLShr(L: m_Reg(R&: ShiftSrc), R: m_GCst(ValReg&: ShiftAmt))) &&
4107 MRI->getType(Reg: ShiftSrc).getSizeInBits() == 32 &&
4108 ShiftAmt->Value.getZExtValue() == 16) {
4109 Src = ShiftSrc;
4110 Key = 1;
4111 }
4112
4113 return {{
4114 [=](MachineInstrBuilder &MIB) { MIB.addReg(RegNo: Src); },
4115 [=](MachineInstrBuilder &MIB) { MIB.addImm(Val: Key); } // index_key
4116 }};
4117}
4118
4119InstructionSelector::ComplexRendererFns
4120AMDGPUInstructionSelector::selectVOP3OpSelMods(MachineOperand &Root) const {
4121 Register Src;
4122 unsigned Mods;
4123 std::tie(args&: Src, args&: Mods) = selectVOP3ModsImpl(Root);
4124
4125 // FIXME: Handle op_sel
4126 return {{
4127 [=](MachineInstrBuilder &MIB) { MIB.addReg(RegNo: Src); },
4128 [=](MachineInstrBuilder &MIB) { MIB.addImm(Val: Mods); } // src_mods
4129 }};
4130}
4131
4132InstructionSelector::ComplexRendererFns
4133AMDGPUInstructionSelector::selectVINTERPMods(MachineOperand &Root) const {
4134 Register Src;
4135 unsigned Mods;
4136 std::tie(args&: Src, args&: Mods) = selectVOP3ModsImpl(Root,
4137 /*IsCanonicalizing=*/true,
4138 /*AllowAbs=*/false,
4139 /*OpSel=*/false);
4140
4141 return {{
4142 [=](MachineInstrBuilder &MIB) {
4143 MIB.addReg(
4144 RegNo: copyToVGPRIfSrcFolded(Src, Mods, Root, InsertPt: MIB, /* ForceVGPR */ true));
4145 },
4146 [=](MachineInstrBuilder &MIB) { MIB.addImm(Val: Mods); }, // src0_mods
4147 }};
4148}
4149
4150InstructionSelector::ComplexRendererFns
4151AMDGPUInstructionSelector::selectVINTERPModsHi(MachineOperand &Root) const {
4152 Register Src;
4153 unsigned Mods;
4154 std::tie(args&: Src, args&: Mods) = selectVOP3ModsImpl(Root,
4155 /*IsCanonicalizing=*/true,
4156 /*AllowAbs=*/false,
4157 /*OpSel=*/true);
4158
4159 return {{
4160 [=](MachineInstrBuilder &MIB) {
4161 MIB.addReg(
4162 RegNo: copyToVGPRIfSrcFolded(Src, Mods, Root, InsertPt: MIB, /* ForceVGPR */ true));
4163 },
4164 [=](MachineInstrBuilder &MIB) { MIB.addImm(Val: Mods); }, // src0_mods
4165 }};
4166}
4167
4168bool AMDGPUInstructionSelector::selectSmrdOffset(MachineOperand &Root,
4169 Register &Base,
4170 Register *SOffset,
4171 int64_t *Offset) const {
4172 MachineInstr *MI = Root.getParent();
4173 MachineBasicBlock *MBB = MI->getParent();
4174
4175  // FIXME: We should shrink the GEP if the offset is known to be <= 32 bits;
4176  // then we can select all ptr + 32-bit offsets.
4177 SmallVector<GEPInfo, 4> AddrInfo;
4178 getAddrModeInfo(Load: *MI, MRI: *MRI, AddrInfo);
4179
4180 if (AddrInfo.empty())
4181 return false;
4182
4183 const GEPInfo &GEPI = AddrInfo[0];
4184 std::optional<int64_t> EncodedImm;
4185
4186 if (SOffset && Offset) {
4187 EncodedImm = AMDGPU::getSMRDEncodedOffset(ST: STI, ByteOffset: GEPI.Imm, /*IsBuffer=*/false,
4188 /*HasSOffset=*/true);
4189 if (GEPI.SgprParts.size() == 1 && GEPI.Imm != 0 && EncodedImm &&
4190 AddrInfo.size() > 1) {
4191 const GEPInfo &GEPI2 = AddrInfo[1];
4192 if (GEPI2.SgprParts.size() == 2 && GEPI2.Imm == 0) {
4193 if (Register OffsetReg =
4194 matchZeroExtendFromS32(MRI&: *MRI, Reg: GEPI2.SgprParts[1])) {
4195 Base = GEPI2.SgprParts[0];
4196 *SOffset = OffsetReg;
4197 *Offset = *EncodedImm;
4198 if (*Offset >= 0 || !AMDGPU::hasSMRDSignedImmOffset(ST: STI))
4199 return true;
4200
4201          // For unbuffered smem loads, it is illegal for the Immediate Offset
4202          // to be negative if the resulting (Offset + (M0 or SOffset or zero))
4203          // is negative. Handle the case where the Immediate Offset + SOffset
4204          // is negative.
4205 auto SKnown = KB->getKnownBits(R: *SOffset);
4206 if (*Offset + SKnown.getMinValue().getSExtValue() < 0)
4207 return false;
4208
4209 return true;
4210 }
4211 }
4212 }
4213 return false;
4214 }
4215
4216 EncodedImm = AMDGPU::getSMRDEncodedOffset(ST: STI, ByteOffset: GEPI.Imm, /*IsBuffer=*/false,
4217 /*HasSOffset=*/false);
4218 if (Offset && GEPI.SgprParts.size() == 1 && EncodedImm) {
4219 Base = GEPI.SgprParts[0];
4220 *Offset = *EncodedImm;
4221 return true;
4222 }
4223
4224 // SGPR offset is unsigned.
4225 if (SOffset && GEPI.SgprParts.size() == 1 && isUInt<32>(x: GEPI.Imm) &&
4226 GEPI.Imm != 0) {
4227    // If we make it this far, we have a load with a 32-bit immediate offset.
4228    // It is OK to select this using an SGPR offset, because we have already
4229    // failed trying to select this load into one of the _IMM variants, since
4230    // the _IMM patterns are considered before the _SGPR patterns.
4231 Base = GEPI.SgprParts[0];
4232 *SOffset = MRI->createVirtualRegister(RegClass: &AMDGPU::SReg_32RegClass);
4233 BuildMI(BB&: *MBB, I: MI, MIMD: MI->getDebugLoc(), MCID: TII.get(Opcode: AMDGPU::S_MOV_B32), DestReg: *SOffset)
4234 .addImm(Val: GEPI.Imm);
4235 return true;
4236 }
4237
4238 if (SOffset && GEPI.SgprParts.size() && GEPI.Imm == 0) {
4239 if (Register OffsetReg = matchZeroExtendFromS32(MRI&: *MRI, Reg: GEPI.SgprParts[1])) {
4240 Base = GEPI.SgprParts[0];
4241 *SOffset = OffsetReg;
4242 return true;
4243 }
4244 }
4245
4246 return false;
4247}
4248
4249InstructionSelector::ComplexRendererFns
4250AMDGPUInstructionSelector::selectSmrdImm(MachineOperand &Root) const {
4251 Register Base;
4252 int64_t Offset;
4253 if (!selectSmrdOffset(Root, Base, /* SOffset= */ nullptr, Offset: &Offset))
4254 return std::nullopt;
4255
4256 return {{[=](MachineInstrBuilder &MIB) { MIB.addReg(RegNo: Base); },
4257 [=](MachineInstrBuilder &MIB) { MIB.addImm(Val: Offset); }}};
4258}
4259
4260InstructionSelector::ComplexRendererFns
4261AMDGPUInstructionSelector::selectSmrdImm32(MachineOperand &Root) const {
4262 SmallVector<GEPInfo, 4> AddrInfo;
4263 getAddrModeInfo(Load: *Root.getParent(), MRI: *MRI, AddrInfo);
4264
4265 if (AddrInfo.empty() || AddrInfo[0].SgprParts.size() != 1)
4266 return std::nullopt;
4267
4268 const GEPInfo &GEPInfo = AddrInfo[0];
4269 Register PtrReg = GEPInfo.SgprParts[0];
4270 std::optional<int64_t> EncodedImm =
4271 AMDGPU::getSMRDEncodedLiteralOffset32(ST: STI, ByteOffset: GEPInfo.Imm);
4272 if (!EncodedImm)
4273 return std::nullopt;
4274
4275 return {{
4276 [=](MachineInstrBuilder &MIB) { MIB.addReg(RegNo: PtrReg); },
4277 [=](MachineInstrBuilder &MIB) { MIB.addImm(Val: *EncodedImm); }
4278 }};
4279}
4280
4281InstructionSelector::ComplexRendererFns
4282AMDGPUInstructionSelector::selectSmrdSgpr(MachineOperand &Root) const {
4283 Register Base, SOffset;
4284 if (!selectSmrdOffset(Root, Base, SOffset: &SOffset, /* Offset= */ nullptr))
4285 return std::nullopt;
4286
4287 return {{[=](MachineInstrBuilder &MIB) { MIB.addReg(RegNo: Base); },
4288 [=](MachineInstrBuilder &MIB) { MIB.addReg(RegNo: SOffset); }}};
4289}
4290
4291InstructionSelector::ComplexRendererFns
4292AMDGPUInstructionSelector::selectSmrdSgprImm(MachineOperand &Root) const {
4293 Register Base, SOffset;
4294 int64_t Offset;
4295 if (!selectSmrdOffset(Root, Base, SOffset: &SOffset, Offset: &Offset))
4296 return std::nullopt;
4297
4298 return {{[=](MachineInstrBuilder &MIB) { MIB.addReg(RegNo: Base); },
4299 [=](MachineInstrBuilder &MIB) { MIB.addReg(RegNo: SOffset); },
4300 [=](MachineInstrBuilder &MIB) { MIB.addImm(Val: Offset); }}};
4301}
4302
4303std::pair<Register, int>
4304AMDGPUInstructionSelector::selectFlatOffsetImpl(MachineOperand &Root,
4305 uint64_t FlatVariant) const {
4306 MachineInstr *MI = Root.getParent();
4307
4308 auto Default = std::pair(Root.getReg(), 0);
4309
4310 if (!STI.hasFlatInstOffsets())
4311 return Default;
4312
4313 Register PtrBase;
4314 int64_t ConstOffset;
4315 std::tie(args&: PtrBase, args&: ConstOffset) =
4316 getPtrBaseWithConstantOffset(Root: Root.getReg(), MRI: *MRI);
4317
4318 if (ConstOffset == 0 || (FlatVariant == SIInstrFlags::FlatScratch &&
4319 !isFlatScratchBaseLegal(Addr: Root.getReg())))
4320 return Default;
4321
4322 unsigned AddrSpace = (*MI->memoperands_begin())->getAddrSpace();
4323 if (!TII.isLegalFLATOffset(Offset: ConstOffset, AddrSpace, FlatVariant))
4324 return Default;
4325
4326 return std::pair(PtrBase, ConstOffset);
4327}
4328
4329InstructionSelector::ComplexRendererFns
4330AMDGPUInstructionSelector::selectFlatOffset(MachineOperand &Root) const {
4331 auto PtrWithOffset = selectFlatOffsetImpl(Root, FlatVariant: SIInstrFlags::FLAT);
4332
4333 return {{
4334 [=](MachineInstrBuilder &MIB) { MIB.addReg(RegNo: PtrWithOffset.first); },
4335 [=](MachineInstrBuilder &MIB) { MIB.addImm(Val: PtrWithOffset.second); },
4336 }};
4337}
4338
4339InstructionSelector::ComplexRendererFns
4340AMDGPUInstructionSelector::selectGlobalOffset(MachineOperand &Root) const {
4341 auto PtrWithOffset = selectFlatOffsetImpl(Root, FlatVariant: SIInstrFlags::FlatGlobal);
4342
4343 return {{
4344 [=](MachineInstrBuilder &MIB) { MIB.addReg(RegNo: PtrWithOffset.first); },
4345 [=](MachineInstrBuilder &MIB) { MIB.addImm(Val: PtrWithOffset.second); },
4346 }};
4347}
4348
4349InstructionSelector::ComplexRendererFns
4350AMDGPUInstructionSelector::selectScratchOffset(MachineOperand &Root) const {
4351 auto PtrWithOffset = selectFlatOffsetImpl(Root, FlatVariant: SIInstrFlags::FlatScratch);
4352
4353 return {{
4354 [=](MachineInstrBuilder &MIB) { MIB.addReg(RegNo: PtrWithOffset.first); },
4355 [=](MachineInstrBuilder &MIB) { MIB.addImm(Val: PtrWithOffset.second); },
4356 }};
4357}
4358
4359// Match (64-bit SGPR base) + (zext vgpr offset) + sext(imm offset)
4360InstructionSelector::ComplexRendererFns
4361AMDGPUInstructionSelector::selectGlobalSAddr(MachineOperand &Root) const {
4362 Register Addr = Root.getReg();
4363 Register PtrBase;
4364 int64_t ConstOffset;
4365 int64_t ImmOffset = 0;
4366
4367 // Match the immediate offset first, which canonically is moved as low as
4368 // possible.
4369 std::tie(args&: PtrBase, args&: ConstOffset) = getPtrBaseWithConstantOffset(Root: Addr, MRI: *MRI);
4370
4371 if (ConstOffset != 0) {
4372 if (TII.isLegalFLATOffset(Offset: ConstOffset, AddrSpace: AMDGPUAS::GLOBAL_ADDRESS,
4373 FlatVariant: SIInstrFlags::FlatGlobal)) {
4374 Addr = PtrBase;
4375 ImmOffset = ConstOffset;
4376 } else {
4377 auto PtrBaseDef = getDefSrcRegIgnoringCopies(Reg: PtrBase, MRI: *MRI);
4378 if (isSGPR(Reg: PtrBaseDef->Reg)) {
4379 if (ConstOffset > 0) {
4380 // Offset is too large.
4381 //
4382 // saddr + large_offset -> saddr +
4383 // (voffset = large_offset & ~MaxOffset) +
4384 // (large_offset & MaxOffset);
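//
// Illustrative example (values assumed, not taken from a specific target):
// with a hypothetical MaxOffset of 0xFFF, a constant offset of 0x1234 would
// be split into voffset = 0x1000 (materialized into a VGPR below) and an
// immediate offset of 0x234.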
4385 int64_t SplitImmOffset, RemainderOffset;
4386 std::tie(args&: SplitImmOffset, args&: RemainderOffset) = TII.splitFlatOffset(
4387 COffsetVal: ConstOffset, AddrSpace: AMDGPUAS::GLOBAL_ADDRESS, FlatVariant: SIInstrFlags::FlatGlobal);
4388
4389 if (isUInt<32>(x: RemainderOffset)) {
4390 MachineInstr *MI = Root.getParent();
4391 MachineBasicBlock *MBB = MI->getParent();
4392 Register HighBits =
4393 MRI->createVirtualRegister(RegClass: &AMDGPU::VGPR_32RegClass);
4394
4395 BuildMI(BB&: *MBB, I: MI, MIMD: MI->getDebugLoc(), MCID: TII.get(Opcode: AMDGPU::V_MOV_B32_e32),
4396 DestReg: HighBits)
4397 .addImm(Val: RemainderOffset);
4398
4399 return {{
4400 [=](MachineInstrBuilder &MIB) { MIB.addReg(RegNo: PtrBase); }, // saddr
4401 [=](MachineInstrBuilder &MIB) {
4402 MIB.addReg(RegNo: HighBits);
4403 }, // voffset
4404 [=](MachineInstrBuilder &MIB) { MIB.addImm(Val: SplitImmOffset); },
4405 }};
4406 }
4407 }
4408
4409 // We are adding a 64-bit SGPR and a constant. If the constant bus limit
4410 // is 1, we would need 1 or 2 extra moves for each half of the constant, so
4411 // it is better to do a scalar add and then issue a single VALU instruction
4412 // to materialize zero. Otherwise it takes fewer instructions to perform
4413 // VALU adds with immediates or inline literals.
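//
// Illustrative example (assumed subtarget behavior): with a constant bus
// limit of 2 and a constant whose two 32-bit halves are both non-inline
// literals, NumLiterals is 2 and 2 > 2 is false, so we keep the scalar-add
// path below; if the limit exceeds NumLiterals we bail out here so the
// offset can instead be folded by a VALU-add based pattern.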
4414 unsigned NumLiterals =
4415 !TII.isInlineConstant(Imm: APInt(32, ConstOffset & 0xffffffff)) +
4416 !TII.isInlineConstant(Imm: APInt(32, ConstOffset >> 32));
4417 if (STI.getConstantBusLimit(Opcode: AMDGPU::V_ADD_U32_e64) > NumLiterals)
4418 return std::nullopt;
4419 }
4420 }
4421 }
4422
4423 // Match the variable offset.
4424 auto AddrDef = getDefSrcRegIgnoringCopies(Reg: Addr, MRI: *MRI);
4425 if (AddrDef->MI->getOpcode() == AMDGPU::G_PTR_ADD) {
4426 // Look through the SGPR->VGPR copy.
4427 Register SAddr =
4428 getSrcRegIgnoringCopies(Reg: AddrDef->MI->getOperand(i: 1).getReg(), MRI: *MRI);
4429
4430 if (isSGPR(Reg: SAddr)) {
4431 Register PtrBaseOffset = AddrDef->MI->getOperand(i: 2).getReg();
4432
4433 // It's possible voffset is an SGPR here, but the copy to VGPR will be
4434 // inserted later.
4435 if (Register VOffset = matchZeroExtendFromS32(MRI&: *MRI, Reg: PtrBaseOffset)) {
4436 return {{[=](MachineInstrBuilder &MIB) { // saddr
4437 MIB.addReg(RegNo: SAddr);
4438 },
4439 [=](MachineInstrBuilder &MIB) { // voffset
4440 MIB.addReg(RegNo: VOffset);
4441 },
4442 [=](MachineInstrBuilder &MIB) { // offset
4443 MIB.addImm(Val: ImmOffset);
4444 }}};
4445 }
4446 }
4447 }
4448
4449 // FIXME: We should probably have folded COPY (G_IMPLICIT_DEF) earlier, and
4450 // drop this.
4451 if (AddrDef->MI->getOpcode() == AMDGPU::G_IMPLICIT_DEF ||
4452 AddrDef->MI->getOpcode() == AMDGPU::G_CONSTANT || !isSGPR(Reg: AddrDef->Reg))
4453 return std::nullopt;
4454
4455 // It's cheaper to materialize a single 32-bit zero for vaddr than the two
4456 // moves required to copy a 64-bit SGPR to VGPR.
4457 MachineInstr *MI = Root.getParent();
4458 MachineBasicBlock *MBB = MI->getParent();
4459 Register VOffset = MRI->createVirtualRegister(RegClass: &AMDGPU::VGPR_32RegClass);
4460
4461 BuildMI(BB&: *MBB, I: MI, MIMD: MI->getDebugLoc(), MCID: TII.get(Opcode: AMDGPU::V_MOV_B32_e32), DestReg: VOffset)
4462 .addImm(Val: 0);
4463
4464 return {{
4465 [=](MachineInstrBuilder &MIB) { MIB.addReg(RegNo: AddrDef->Reg); }, // saddr
4466 [=](MachineInstrBuilder &MIB) { MIB.addReg(RegNo: VOffset); }, // voffset
4467 [=](MachineInstrBuilder &MIB) { MIB.addImm(Val: ImmOffset); } // offset
4468 }};
4469}
4470
4471InstructionSelector::ComplexRendererFns
4472AMDGPUInstructionSelector::selectScratchSAddr(MachineOperand &Root) const {
4473 Register Addr = Root.getReg();
4474 Register PtrBase;
4475 int64_t ConstOffset;
4476 int64_t ImmOffset = 0;
4477
4478 // Match the immediate offset first, which canonically is moved as low as
4479 // possible.
4480 std::tie(args&: PtrBase, args&: ConstOffset) = getPtrBaseWithConstantOffset(Root: Addr, MRI: *MRI);
4481
4482 if (ConstOffset != 0 && isFlatScratchBaseLegal(Addr) &&
4483 TII.isLegalFLATOffset(Offset: ConstOffset, AddrSpace: AMDGPUAS::PRIVATE_ADDRESS,
4484 FlatVariant: SIInstrFlags::FlatScratch)) {
4485 Addr = PtrBase;
4486 ImmOffset = ConstOffset;
4487 }
4488
4489 auto AddrDef = getDefSrcRegIgnoringCopies(Reg: Addr, MRI: *MRI);
4490 if (AddrDef->MI->getOpcode() == AMDGPU::G_FRAME_INDEX) {
4491 int FI = AddrDef->MI->getOperand(i: 1).getIndex();
4492 return {{
4493 [=](MachineInstrBuilder &MIB) { MIB.addFrameIndex(Idx: FI); }, // saddr
4494 [=](MachineInstrBuilder &MIB) { MIB.addImm(Val: ImmOffset); } // offset
4495 }};
4496 }
4497
4498 Register SAddr = AddrDef->Reg;
4499
4500 if (AddrDef->MI->getOpcode() == AMDGPU::G_PTR_ADD) {
4501 Register LHS = AddrDef->MI->getOperand(i: 1).getReg();
4502 Register RHS = AddrDef->MI->getOperand(i: 2).getReg();
4503 auto LHSDef = getDefSrcRegIgnoringCopies(Reg: LHS, MRI: *MRI);
4504 auto RHSDef = getDefSrcRegIgnoringCopies(Reg: RHS, MRI: *MRI);
4505
4506 if (LHSDef->MI->getOpcode() == AMDGPU::G_FRAME_INDEX &&
4507 isSGPR(Reg: RHSDef->Reg)) {
4508 int FI = LHSDef->MI->getOperand(i: 1).getIndex();
4509 MachineInstr &I = *Root.getParent();
4510 MachineBasicBlock *BB = I.getParent();
4511 const DebugLoc &DL = I.getDebugLoc();
4512 SAddr = MRI->createVirtualRegister(RegClass: &AMDGPU::SReg_32RegClass);
4513
4514 BuildMI(BB&: *BB, I: &I, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::S_ADD_I32), DestReg: SAddr)
4515 .addFrameIndex(Idx: FI)
4516 .addReg(RegNo: RHSDef->Reg)
4517 .setOperandDead(3); // Dead scc
4518 }
4519 }
4520
4521 if (!isSGPR(Reg: SAddr))
4522 return std::nullopt;
4523
4524 return {{
4525 [=](MachineInstrBuilder &MIB) { MIB.addReg(RegNo: SAddr); }, // saddr
4526 [=](MachineInstrBuilder &MIB) { MIB.addImm(Val: ImmOffset); } // offset
4527 }};
4528}
4529
4530// Check whether the flat scratch SVS swizzle bug affects this access.
4531bool AMDGPUInstructionSelector::checkFlatScratchSVSSwizzleBug(
4532 Register VAddr, Register SAddr, uint64_t ImmOffset) const {
4533 if (!Subtarget->hasFlatScratchSVSSwizzleBug())
4534 return false;
4535
4536 // The bug affects the swizzling of SVS accesses if there is any carry out
4537 // from the two low-order bits (i.e. from bit 1 into bit 2) when adding
4538 // voffset to (soffset + inst_offset).
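//
// Worked example (illustrative values only): if VAddr's known maximum ends
// in 0b11 and (SAddr + ImmOffset)'s known maximum ends in 0b01, then
// 3 + 1 == 4 carries from bit 1 into bit 2, so the conservative check below
// reports that the bug may be triggered.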
4539 auto VKnown = KB->getKnownBits(R: VAddr);
4540 auto SKnown = KnownBits::computeForAddSub(
4541 /*Add=*/true, /*NSW=*/false, /*NUW=*/false, LHS: KB->getKnownBits(R: SAddr),
4542 RHS: KnownBits::makeConstant(C: APInt(32, ImmOffset)));
4543 uint64_t VMax = VKnown.getMaxValue().getZExtValue();
4544 uint64_t SMax = SKnown.getMaxValue().getZExtValue();
4545 return (VMax & 3) + (SMax & 3) >= 4;
4546}
4547
4548InstructionSelector::ComplexRendererFns
4549AMDGPUInstructionSelector::selectScratchSVAddr(MachineOperand &Root) const {
4550 Register Addr = Root.getReg();
4551 Register PtrBase;
4552 int64_t ConstOffset;
4553 int64_t ImmOffset = 0;
4554
4555 // Match the immediate offset first, which canonically is moved as low as
4556 // possible.
4557 std::tie(args&: PtrBase, args&: ConstOffset) = getPtrBaseWithConstantOffset(Root: Addr, MRI: *MRI);
4558
4559 Register OrigAddr = Addr;
4560 if (ConstOffset != 0 &&
4561 TII.isLegalFLATOffset(Offset: ConstOffset, AddrSpace: AMDGPUAS::PRIVATE_ADDRESS, FlatVariant: SIInstrFlags::FlatScratch)) {
4562 Addr = PtrBase;
4563 ImmOffset = ConstOffset;
4564 }
4565
4566 auto AddrDef = getDefSrcRegIgnoringCopies(Reg: Addr, MRI: *MRI);
4567 if (AddrDef->MI->getOpcode() != AMDGPU::G_PTR_ADD)
4568 return std::nullopt;
4569
4570 Register RHS = AddrDef->MI->getOperand(i: 2).getReg();
4571 if (RBI.getRegBank(Reg: RHS, MRI: *MRI, TRI)->getID() != AMDGPU::VGPRRegBankID)
4572 return std::nullopt;
4573
4574 Register LHS = AddrDef->MI->getOperand(i: 1).getReg();
4575 auto LHSDef = getDefSrcRegIgnoringCopies(Reg: LHS, MRI: *MRI);
4576
4577 if (OrigAddr != Addr) {
4578 if (!isFlatScratchBaseLegalSVImm(Addr: OrigAddr))
4579 return std::nullopt;
4580 } else {
4581 if (!isFlatScratchBaseLegalSV(Addr: OrigAddr))
4582 return std::nullopt;
4583 }
4584
4585 if (checkFlatScratchSVSSwizzleBug(VAddr: RHS, SAddr: LHS, ImmOffset))
4586 return std::nullopt;
4587
4588 if (LHSDef->MI->getOpcode() == AMDGPU::G_FRAME_INDEX) {
4589 int FI = LHSDef->MI->getOperand(i: 1).getIndex();
4590 return {{
4591 [=](MachineInstrBuilder &MIB) { MIB.addReg(RegNo: RHS); }, // vaddr
4592 [=](MachineInstrBuilder &MIB) { MIB.addFrameIndex(Idx: FI); }, // saddr
4593 [=](MachineInstrBuilder &MIB) { MIB.addImm(Val: ImmOffset); } // offset
4594 }};
4595 }
4596
4597 if (!isSGPR(Reg: LHS))
4598 return std::nullopt;
4599
4600 return {{
4601 [=](MachineInstrBuilder &MIB) { MIB.addReg(RegNo: RHS); }, // vaddr
4602 [=](MachineInstrBuilder &MIB) { MIB.addReg(RegNo: LHS); }, // saddr
4603 [=](MachineInstrBuilder &MIB) { MIB.addImm(Val: ImmOffset); } // offset
4604 }};
4605}
4606
4607InstructionSelector::ComplexRendererFns
4608AMDGPUInstructionSelector::selectMUBUFScratchOffen(MachineOperand &Root) const {
4609 MachineInstr *MI = Root.getParent();
4610 MachineBasicBlock *MBB = MI->getParent();
4611 MachineFunction *MF = MBB->getParent();
4612 const SIMachineFunctionInfo *Info = MF->getInfo<SIMachineFunctionInfo>();
4613
4614 int64_t Offset = 0;
4615 if (mi_match(R: Root.getReg(), MRI: *MRI, P: m_ICst(Cst&: Offset)) &&
4616 Offset != TM.getNullPointerValue(AddrSpace: AMDGPUAS::PRIVATE_ADDRESS)) {
4617 Register HighBits = MRI->createVirtualRegister(RegClass: &AMDGPU::VGPR_32RegClass);
4618
4619 // TODO: Should this be inside the render function? The iterator seems to
4620 // move.
4621 const uint32_t MaxOffset = SIInstrInfo::getMaxMUBUFImmOffset(ST: *Subtarget);
4622 BuildMI(BB&: *MBB, I: MI, MIMD: MI->getDebugLoc(), MCID: TII.get(Opcode: AMDGPU::V_MOV_B32_e32),
4623 DestReg: HighBits)
4624 .addImm(Val: Offset & ~MaxOffset);
4625
4626 return {{[=](MachineInstrBuilder &MIB) { // rsrc
4627 MIB.addReg(RegNo: Info->getScratchRSrcReg());
4628 },
4629 [=](MachineInstrBuilder &MIB) { // vaddr
4630 MIB.addReg(RegNo: HighBits);
4631 },
4632 [=](MachineInstrBuilder &MIB) { // soffset
4633 // Use constant zero for soffset and rely on eliminateFrameIndex
4634 // to choose the appropriate frame register if need be.
4635 MIB.addImm(Val: 0);
4636 },
4637 [=](MachineInstrBuilder &MIB) { // offset
4638 MIB.addImm(Val: Offset & MaxOffset);
4639 }}};
4640 }
4641
4642 assert(Offset == 0 || Offset == -1);
4643
4644 // Try to fold a frame index directly into the MUBUF vaddr field, along with
4645 // any constant offset.
4646 std::optional<int> FI;
4647 Register VAddr = Root.getReg();
4648 if (const MachineInstr *RootDef = MRI->getVRegDef(Reg: Root.getReg())) {
4649 Register PtrBase;
4650 int64_t ConstOffset;
4651 std::tie(args&: PtrBase, args&: ConstOffset) = getPtrBaseWithConstantOffset(Root: VAddr, MRI: *MRI);
4652 if (ConstOffset != 0) {
4653 if (TII.isLegalMUBUFImmOffset(Imm: ConstOffset) &&
4654 (!STI.privateMemoryResourceIsRangeChecked() ||
4655 KB->signBitIsZero(Op: PtrBase))) {
4656 const MachineInstr *PtrBaseDef = MRI->getVRegDef(Reg: PtrBase);
4657 if (PtrBaseDef->getOpcode() == AMDGPU::G_FRAME_INDEX)
4658 FI = PtrBaseDef->getOperand(i: 1).getIndex();
4659 else
4660 VAddr = PtrBase;
4661 Offset = ConstOffset;
4662 }
4663 } else if (RootDef->getOpcode() == AMDGPU::G_FRAME_INDEX) {
4664 FI = RootDef->getOperand(i: 1).getIndex();
4665 }
4666 }
4667
4668 return {{[=](MachineInstrBuilder &MIB) { // rsrc
4669 MIB.addReg(RegNo: Info->getScratchRSrcReg());
4670 },
4671 [=](MachineInstrBuilder &MIB) { // vaddr
4672 if (FI)
4673 MIB.addFrameIndex(Idx: *FI);
4674 else
4675 MIB.addReg(RegNo: VAddr);
4676 },
4677 [=](MachineInstrBuilder &MIB) { // soffset
4678 // Use constant zero for soffset and rely on eliminateFrameIndex
4679 // to choose the appropriate frame register if need be.
4680 MIB.addImm(Val: 0);
4681 },
4682 [=](MachineInstrBuilder &MIB) { // offset
4683 MIB.addImm(Val: Offset);
4684 }}};
4685}
4686
4687bool AMDGPUInstructionSelector::isDSOffsetLegal(Register Base,
4688 int64_t Offset) const {
4689 if (!isUInt<16>(x: Offset))
4690 return false;
4691
4692 if (STI.hasUsableDSOffset() || STI.unsafeDSOffsetFoldingEnabled())
4693 return true;
4694
4695 // On Southern Islands, instructions with a negative base value and an offset
4696 // don't seem to work.
4697 return KB->signBitIsZero(Op: Base);
4698}
4699
4700bool AMDGPUInstructionSelector::isDSOffset2Legal(Register Base, int64_t Offset0,
4701 int64_t Offset1,
4702 unsigned Size) const {
4703 if (Offset0 % Size != 0 || Offset1 % Size != 0)
4704 return false;
4705 if (!isUInt<8>(x: Offset0 / Size) || !isUInt<8>(x: Offset1 / Size))
4706 return false;
4707
4708 if (STI.hasUsableDSOffset() || STI.unsafeDSOffsetFoldingEnabled())
4709 return true;
4710
4711 // On Southern Islands, instructions with a negative base value and an offset
4712 // don't seem to work.
4713 return KB->signBitIsZero(Op: Base);
4714}
4715
4716 // Return whether the operation has the NoUnsignedWrap property.
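// Note: G_OR is treated as never wrapping on the assumption that, in the
// address patterns handled here, the OR is only used when its operands have
// no bits in common and therefore behaves like an add that cannot carry.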
4717static bool isNoUnsignedWrap(MachineInstr *Addr) {
4718 return Addr->getOpcode() == TargetOpcode::G_OR ||
4719 (Addr->getOpcode() == TargetOpcode::G_PTR_ADD &&
4720 Addr->getFlag(Flag: MachineInstr::NoUWrap));
4721}
4722
4723 // Check that the base address of a flat scratch load/store in the form of
4724 // `base + offset` is legal to be put in an SGPR/VGPR (i.e. unsigned, per the
4725 // hardware requirement). We always treat the first operand as the base address here.
4726bool AMDGPUInstructionSelector::isFlatScratchBaseLegal(Register Addr) const {
4727 MachineInstr *AddrMI = getDefIgnoringCopies(Reg: Addr, MRI: *MRI);
4728
4729 if (isNoUnsignedWrap(Addr: AddrMI))
4730 return true;
4731
4732 // Starting with GFX12, VADDR and SADDR fields in VSCRATCH can use negative
4733 // values.
4734 if (STI.hasSignedScratchOffsets())
4735 return true;
4736
4737 Register LHS = AddrMI->getOperand(i: 1).getReg();
4738 Register RHS = AddrMI->getOperand(i: 2).getReg();
4739
4740 if (AddrMI->getOpcode() == TargetOpcode::G_PTR_ADD) {
4741 std::optional<ValueAndVReg> RhsValReg =
4742 getIConstantVRegValWithLookThrough(VReg: RHS, MRI: *MRI);
4743 // If the immediate offset is negative and within a certain range, the base
4744 // address cannot also be negative. If the base were also negative, the sum
4745 // would be either negative or much larger than the valid range of scratch
4746 // memory a thread can access.
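// For example (illustrative): for Addr = Base + (-16), a negative Base would
// keep the sum negative (or, viewed as unsigned, far beyond any realistic
// scratch allocation), which can never be a valid scratch address, so a
// well-formed access implies Base >= 0.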
4747 if (RhsValReg && RhsValReg->Value.getSExtValue() < 0 &&
4748 RhsValReg->Value.getSExtValue() > -0x40000000)
4749 return true;
4750 }
4751
4752 return KB->signBitIsZero(Op: LHS);
4753}
4754
4755 // Check that the address values in the SGPR/VGPR are legal for flat scratch
4756 // in the form: SGPR + VGPR.
4757bool AMDGPUInstructionSelector::isFlatScratchBaseLegalSV(Register Addr) const {
4758 MachineInstr *AddrMI = getDefIgnoringCopies(Reg: Addr, MRI: *MRI);
4759
4760 if (isNoUnsignedWrap(Addr: AddrMI))
4761 return true;
4762
4763 // Starting with GFX12, VADDR and SADDR fields in VSCRATCH can use negative
4764 // values.
4765 if (STI.hasSignedScratchOffsets())
4766 return true;
4767
4768 Register LHS = AddrMI->getOperand(i: 1).getReg();
4769 Register RHS = AddrMI->getOperand(i: 2).getReg();
4770 return KB->signBitIsZero(Op: RHS) && KB->signBitIsZero(Op: LHS);
4771}
4772
4773 // Check that the address values in the SGPR/VGPR are legal for flat scratch
4774 // in the form: SGPR + VGPR + Imm.
4775bool AMDGPUInstructionSelector::isFlatScratchBaseLegalSVImm(
4776 Register Addr) const {
4777 // Starting with GFX12, VADDR and SADDR fields in VSCRATCH can use negative
4778 // values.
4779 if (STI.hasSignedScratchOffsets())
4780 return true;
4781
4782 MachineInstr *AddrMI = getDefIgnoringCopies(Reg: Addr, MRI: *MRI);
4783 Register Base = AddrMI->getOperand(i: 1).getReg();
4784 std::optional<DefinitionAndSourceRegister> BaseDef =
4785 getDefSrcRegIgnoringCopies(Reg: Base, MRI: *MRI);
4786 std::optional<ValueAndVReg> RHSOffset =
4787 getIConstantVRegValWithLookThrough(VReg: AddrMI->getOperand(i: 2).getReg(), MRI: *MRI);
4788 assert(RHSOffset);
4789
4790 // If the immediate offset is negative and within a certain range, the base
4791 // address cannot also be negative. If the base were also negative, the sum
4792 // would be either negative or much larger than the valid range of scratch
4793 // memory a thread can access.
4794 if (isNoUnsignedWrap(Addr: BaseDef->MI) &&
4795 (isNoUnsignedWrap(Addr: AddrMI) ||
4796 (RHSOffset->Value.getSExtValue() < 0 &&
4797 RHSOffset->Value.getSExtValue() > -0x40000000)))
4798 return true;
4799
4800 Register LHS = BaseDef->MI->getOperand(i: 1).getReg();
4801 Register RHS = BaseDef->MI->getOperand(i: 2).getReg();
4802 return KB->signBitIsZero(Op: RHS) && KB->signBitIsZero(Op: LHS);
4803}
4804
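// Return true when the G_AND only masks off shift-amount bits that the shift
// ignores anyway. Illustrative example: for a 32-bit shift ShAmtBits is 5, so
// a mask of 0x1f (or any mask whose low 5 bits are all ones once the
// known-zero bits of the value being masked are ORed in) makes the G_AND
// redundant.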
4805bool AMDGPUInstructionSelector::isUnneededShiftMask(const MachineInstr &MI,
4806 unsigned ShAmtBits) const {
4807 assert(MI.getOpcode() == TargetOpcode::G_AND);
4808
4809 std::optional<APInt> RHS =
4810 getIConstantVRegVal(VReg: MI.getOperand(i: 2).getReg(), MRI: *MRI);
4811 if (!RHS)
4812 return false;
4813
4814 if (RHS->countr_one() >= ShAmtBits)
4815 return true;
4816
4817 const APInt &LHSKnownZeros = KB->getKnownZeroes(R: MI.getOperand(i: 1).getReg());
4818 return (LHSKnownZeros | *RHS).countr_one() >= ShAmtBits;
4819}
4820
4821InstructionSelector::ComplexRendererFns
4822AMDGPUInstructionSelector::selectMUBUFScratchOffset(
4823 MachineOperand &Root) const {
4824 Register Reg = Root.getReg();
4825 const SIMachineFunctionInfo *Info = MF->getInfo<SIMachineFunctionInfo>();
4826
4827 std::optional<DefinitionAndSourceRegister> Def =
4828 getDefSrcRegIgnoringCopies(Reg, MRI: *MRI);
4829 assert(Def && "this shouldn't be an optional result");
4830 Reg = Def->Reg;
4831
4832 if (Register WaveBase = getWaveAddress(Def: Def->MI)) {
4833 return {{
4834 [=](MachineInstrBuilder &MIB) { // rsrc
4835 MIB.addReg(RegNo: Info->getScratchRSrcReg());
4836 },
4837 [=](MachineInstrBuilder &MIB) { // soffset
4838 MIB.addReg(RegNo: WaveBase);
4839 },
4840 [=](MachineInstrBuilder &MIB) { MIB.addImm(Val: 0); } // offset
4841 }};
4842 }
4843
4844 int64_t Offset = 0;
4845
4846 // FIXME: Copy check is a hack
4847 Register BasePtr;
4848 if (mi_match(R: Reg, MRI: *MRI,
4849 P: m_GPtrAdd(L: m_Reg(R&: BasePtr),
4850 R: m_any_of(preds: m_ICst(Cst&: Offset), preds: m_Copy(Src: m_ICst(Cst&: Offset)))))) {
4851 if (!TII.isLegalMUBUFImmOffset(Imm: Offset))
4852 return {};
4853 MachineInstr *BasePtrDef = getDefIgnoringCopies(Reg: BasePtr, MRI: *MRI);
4854 Register WaveBase = getWaveAddress(Def: BasePtrDef);
4855 if (!WaveBase)
4856 return {};
4857
4858 return {{
4859 [=](MachineInstrBuilder &MIB) { // rsrc
4860 MIB.addReg(RegNo: Info->getScratchRSrcReg());
4861 },
4862 [=](MachineInstrBuilder &MIB) { // soffset
4863 MIB.addReg(RegNo: WaveBase);
4864 },
4865 [=](MachineInstrBuilder &MIB) { MIB.addImm(Val: Offset); } // offset
4866 }};
4867 }
4868
4869 if (!mi_match(R: Root.getReg(), MRI: *MRI, P: m_ICst(Cst&: Offset)) ||
4870 !TII.isLegalMUBUFImmOffset(Imm: Offset))
4871 return {};
4872
4873 return {{
4874 [=](MachineInstrBuilder &MIB) { // rsrc
4875 MIB.addReg(RegNo: Info->getScratchRSrcReg());
4876 },
4877 [=](MachineInstrBuilder &MIB) { // soffset
4878 MIB.addImm(Val: 0);
4879 },
4880 [=](MachineInstrBuilder &MIB) { MIB.addImm(Val: Offset); } // offset
4881 }};
4882}
4883
4884std::pair<Register, unsigned>
4885AMDGPUInstructionSelector::selectDS1Addr1OffsetImpl(MachineOperand &Root) const {
4886 const MachineInstr *RootDef = MRI->getVRegDef(Reg: Root.getReg());
4887 if (!RootDef)
4888 return std::pair(Root.getReg(), 0);
4889
4890 int64_t ConstAddr = 0;
4891
4892 Register PtrBase;
4893 int64_t Offset;
4894 std::tie(args&: PtrBase, args&: Offset) =
4895 getPtrBaseWithConstantOffset(Root: Root.getReg(), MRI: *MRI);
4896
4897 if (Offset) {
4898 if (isDSOffsetLegal(Base: PtrBase, Offset)) {
4899 // (add n0, c0)
4900 return std::pair(PtrBase, Offset);
4901 }
4902 } else if (RootDef->getOpcode() == AMDGPU::G_SUB) {
4903 // TODO
4904
4905
4906 } else if (mi_match(R: Root.getReg(), MRI: *MRI, P: m_ICst(Cst&: ConstAddr))) {
4907 // TODO
4908
4909 }
4910
4911 return std::pair(Root.getReg(), 0);
4912}
4913
4914InstructionSelector::ComplexRendererFns
4915AMDGPUInstructionSelector::selectDS1Addr1Offset(MachineOperand &Root) const {
4916 Register Reg;
4917 unsigned Offset;
4918 std::tie(args&: Reg, args&: Offset) = selectDS1Addr1OffsetImpl(Root);
4919 return {{
4920 [=](MachineInstrBuilder &MIB) { MIB.addReg(RegNo: Reg); },
4921 [=](MachineInstrBuilder &MIB) { MIB.addImm(Val: Offset); }
4922 }};
4923}
4924
4925InstructionSelector::ComplexRendererFns
4926AMDGPUInstructionSelector::selectDS64Bit4ByteAligned(MachineOperand &Root) const {
4927 return selectDSReadWrite2(Root, size: 4);
4928}
4929
4930InstructionSelector::ComplexRendererFns
4931AMDGPUInstructionSelector::selectDS128Bit8ByteAligned(MachineOperand &Root) const {
4932 return selectDSReadWrite2(Root, size: 8);
4933}
4934
4935InstructionSelector::ComplexRendererFns
4936AMDGPUInstructionSelector::selectDSReadWrite2(MachineOperand &Root,
4937 unsigned Size) const {
4938 Register Reg;
4939 unsigned Offset;
4940 std::tie(args&: Reg, args&: Offset) = selectDSReadWrite2Impl(Root, size: Size);
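// selectDSReadWrite2Impl returns the base register plus the first element
// offset in units of Size; DS read2/write2 address two consecutive elements,
// so the second field is Offset + 1 (illustrative: a byte offset of 8 with
// Size == 4 becomes offset0 = 2, offset1 = 3).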
4941 return {{
4942 [=](MachineInstrBuilder &MIB) { MIB.addReg(RegNo: Reg); },
4943 [=](MachineInstrBuilder &MIB) { MIB.addImm(Val: Offset); },
4944 [=](MachineInstrBuilder &MIB) { MIB.addImm(Val: Offset+1); }
4945 }};
4946}
4947
4948std::pair<Register, unsigned>
4949AMDGPUInstructionSelector::selectDSReadWrite2Impl(MachineOperand &Root,
4950 unsigned Size) const {
4951 const MachineInstr *RootDef = MRI->getVRegDef(Reg: Root.getReg());
4952 if (!RootDef)
4953 return std::pair(Root.getReg(), 0);
4954
4955 int64_t ConstAddr = 0;
4956
4957 Register PtrBase;
4958 int64_t Offset;
4959 std::tie(args&: PtrBase, args&: Offset) =
4960 getPtrBaseWithConstantOffset(Root: Root.getReg(), MRI: *MRI);
4961
4962 if (Offset) {
4963 int64_t OffsetValue0 = Offset;
4964 int64_t OffsetValue1 = Offset + Size;
4965 if (isDSOffset2Legal(Base: PtrBase, Offset0: OffsetValue0, Offset1: OffsetValue1, Size)) {
4966 // (add n0, c0)
4967 return std::pair(PtrBase, OffsetValue0 / Size);
4968 }
4969 } else if (RootDef->getOpcode() == AMDGPU::G_SUB) {
4970 // TODO
4971
4972 } else if (mi_match(R: Root.getReg(), MRI: *MRI, P: m_ICst(Cst&: ConstAddr))) {
4973 // TODO
4974
4975 }
4976
4977 return std::pair(Root.getReg(), 0);
4978}
4979
4980 /// If \p Root is a G_PTR_ADD with a G_CONSTANT on the right-hand side, return
4981 /// the base value with the constant offset. There may be intervening copies
4982 /// between \p Root and the identified constant. Returns {\p Root, 0} if this
4983 /// does not match the pattern.
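/// For example (illustrative MIR): given %p:_(p1) = G_PTR_ADD %base, %c with
/// %c:_(s64) = G_CONSTANT i64 16, this returns {%base, 16}; a register that is
/// not fed by such a G_PTR_ADD simply comes back as {Root, 0}.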
4984std::pair<Register, int64_t>
4985AMDGPUInstructionSelector::getPtrBaseWithConstantOffset(
4986 Register Root, const MachineRegisterInfo &MRI) const {
4987 MachineInstr *RootI = getDefIgnoringCopies(Reg: Root, MRI);
4988 if (RootI->getOpcode() != TargetOpcode::G_PTR_ADD)
4989 return {Root, 0};
4990
4991 MachineOperand &RHS = RootI->getOperand(i: 2);
4992 std::optional<ValueAndVReg> MaybeOffset =
4993 getIConstantVRegValWithLookThrough(VReg: RHS.getReg(), MRI);
4994 if (!MaybeOffset)
4995 return {Root, 0};
4996 return {RootI->getOperand(i: 1).getReg(), MaybeOffset->Value.getSExtValue()};
4997}
4998
4999static void addZeroImm(MachineInstrBuilder &MIB) {
5000 MIB.addImm(Val: 0);
5001}
5002
5003/// Return a resource descriptor for use with an arbitrary 64-bit pointer. If \p
5004/// BasePtr is not valid, a null base pointer will be used.
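/// A minimal sketch of the resulting layout (derived from the REG_SEQUENCE
/// built below): bits [63:0] hold \p BasePtr (or zero), bits [95:64] hold
/// \p FormatLo, and bits [127:96] hold \p FormatHi.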
5005static Register buildRSRC(MachineIRBuilder &B, MachineRegisterInfo &MRI,
5006 uint32_t FormatLo, uint32_t FormatHi,
5007 Register BasePtr) {
5008 Register RSrc2 = MRI.createVirtualRegister(RegClass: &AMDGPU::SReg_32RegClass);
5009 Register RSrc3 = MRI.createVirtualRegister(RegClass: &AMDGPU::SReg_32RegClass);
5010 Register RSrcHi = MRI.createVirtualRegister(RegClass: &AMDGPU::SReg_64RegClass);
5011 Register RSrc = MRI.createVirtualRegister(RegClass: &AMDGPU::SGPR_128RegClass);
5012
5013 B.buildInstr(Opcode: AMDGPU::S_MOV_B32)
5014 .addDef(RegNo: RSrc2)
5015 .addImm(Val: FormatLo);
5016 B.buildInstr(Opcode: AMDGPU::S_MOV_B32)
5017 .addDef(RegNo: RSrc3)
5018 .addImm(Val: FormatHi);
5019
5020 // Build the half of the subregister with the constants before building the
5021 // full 128-bit register. If we are building multiple resource descriptors,
5022 // this will allow CSEing of the 2-component register.
5023 B.buildInstr(Opcode: AMDGPU::REG_SEQUENCE)
5024 .addDef(RegNo: RSrcHi)
5025 .addReg(RegNo: RSrc2)
5026 .addImm(Val: AMDGPU::sub0)
5027 .addReg(RegNo: RSrc3)
5028 .addImm(Val: AMDGPU::sub1);
5029
5030 Register RSrcLo = BasePtr;
5031 if (!BasePtr) {
5032 RSrcLo = MRI.createVirtualRegister(RegClass: &AMDGPU::SReg_64RegClass);
5033 B.buildInstr(Opcode: AMDGPU::S_MOV_B64)
5034 .addDef(RegNo: RSrcLo)
5035 .addImm(Val: 0);
5036 }
5037
5038 B.buildInstr(Opcode: AMDGPU::REG_SEQUENCE)
5039 .addDef(RegNo: RSrc)
5040 .addReg(RegNo: RSrcLo)
5041 .addImm(Val: AMDGPU::sub0_sub1)
5042 .addReg(RegNo: RSrcHi)
5043 .addImm(Val: AMDGPU::sub2_sub3);
5044
5045 return RSrc;
5046}
5047
5048static Register buildAddr64RSrc(MachineIRBuilder &B, MachineRegisterInfo &MRI,
5049 const SIInstrInfo &TII, Register BasePtr) {
5050 uint64_t DefaultFormat = TII.getDefaultRsrcDataFormat();
5051
5052 // FIXME: Why are half the "default" bits ignored based on the addressing
5053 // mode?
5054 return buildRSRC(B, MRI, FormatLo: 0, FormatHi: Hi_32(Value: DefaultFormat), BasePtr);
5055}
5056
5057static Register buildOffsetSrc(MachineIRBuilder &B, MachineRegisterInfo &MRI,
5058 const SIInstrInfo &TII, Register BasePtr) {
5059 uint64_t DefaultFormat = TII.getDefaultRsrcDataFormat();
5060
5061 // FIXME: Why are half the "default" bits ignored based on the addressing
5062 // mode?
5063 return buildRSRC(B, MRI, FormatLo: -1, FormatHi: Hi_32(Value: DefaultFormat), BasePtr);
5064}
5065
5066AMDGPUInstructionSelector::MUBUFAddressData
5067AMDGPUInstructionSelector::parseMUBUFAddress(Register Src) const {
5068 MUBUFAddressData Data;
5069 Data.N0 = Src;
5070
5071 Register PtrBase;
5072 int64_t Offset;
5073
5074 std::tie(args&: PtrBase, args&: Offset) = getPtrBaseWithConstantOffset(Root: Src, MRI: *MRI);
5075 if (isUInt<32>(x: Offset)) {
5076 Data.N0 = PtrBase;
5077 Data.Offset = Offset;
5078 }
5079
5080 if (MachineInstr *InputAdd
5081 = getOpcodeDef(Opcode: TargetOpcode::G_PTR_ADD, Reg: Data.N0, MRI: *MRI)) {
5082 Data.N2 = InputAdd->getOperand(i: 1).getReg();
5083 Data.N3 = InputAdd->getOperand(i: 2).getReg();
5084
5085 // FIXME: Need to fix the extra SGPR->VGPR copies being inserted
5086 // FIXME: Don't know that this was defined by operand 0
5087 //
5088 // TODO: Remove this when we have copy folding optimizations after
5089 // RegBankSelect.
5090 Data.N2 = getDefIgnoringCopies(Reg: Data.N2, MRI: *MRI)->getOperand(i: 0).getReg();
5091 Data.N3 = getDefIgnoringCopies(Reg: Data.N3, MRI: *MRI)->getOperand(i: 0).getReg();
5092 }
5093
5094 return Data;
5095}
5096
5097 /// Return whether the addr64 MUBUF addressing mode should be used for the given address.
5098bool AMDGPUInstructionSelector::shouldUseAddr64(MUBUFAddressData Addr) const {
5099 // (ptr_add N2, N3) -> addr64, or
5100 // (ptr_add (ptr_add N2, N3), C1) -> addr64
5101 if (Addr.N2)
5102 return true;
5103
5104 const RegisterBank *N0Bank = RBI.getRegBank(Reg: Addr.N0, MRI: *MRI, TRI);
5105 return N0Bank->getID() == AMDGPU::VGPRRegBankID;
5106}
5107
5108/// Split an immediate offset \p ImmOffset depending on whether it fits in the
5109/// immediate field. Modifies \p ImmOffset and sets \p SOffset to the variable
5110/// component.
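/// For example (with an illustrative immediate-field range of [0, 4095]): an
/// ImmOffset of 8192 does not fit, so the whole value is moved into a freshly
/// materialized SOffset register and ImmOffset is reset to 0.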
5111void AMDGPUInstructionSelector::splitIllegalMUBUFOffset(
5112 MachineIRBuilder &B, Register &SOffset, int64_t &ImmOffset) const {
5113 if (TII.isLegalMUBUFImmOffset(Imm: ImmOffset))
5114 return;
5115
5116 // Illegal offset, store it in soffset.
5117 SOffset = MRI->createVirtualRegister(RegClass: &AMDGPU::SReg_32RegClass);
5118 B.buildInstr(Opcode: AMDGPU::S_MOV_B32)
5119 .addDef(RegNo: SOffset)
5120 .addImm(Val: ImmOffset);
5121 ImmOffset = 0;
5122}
5123
5124bool AMDGPUInstructionSelector::selectMUBUFAddr64Impl(
5125 MachineOperand &Root, Register &VAddr, Register &RSrcReg,
5126 Register &SOffset, int64_t &Offset) const {
5127 // FIXME: Predicates should stop this from reaching here.
5128 // The addr64 bit was removed for Volcanic Islands.
5129 if (!STI.hasAddr64() || STI.useFlatForGlobal())
5130 return false;
5131
5132 MUBUFAddressData AddrData = parseMUBUFAddress(Src: Root.getReg());
5133 if (!shouldUseAddr64(Addr: AddrData))
5134 return false;
5135
5136 Register N0 = AddrData.N0;
5137 Register N2 = AddrData.N2;
5138 Register N3 = AddrData.N3;
5139 Offset = AddrData.Offset;
5140
5141 // Base pointer for the SRD.
5142 Register SRDPtr;
5143
5144 if (N2) {
5145 if (RBI.getRegBank(Reg: N2, MRI: *MRI, TRI)->getID() == AMDGPU::VGPRRegBankID) {
5146 assert(N3);
5147 if (RBI.getRegBank(Reg: N3, MRI: *MRI, TRI)->getID() == AMDGPU::VGPRRegBankID) {
5148 // Both N2 and N3 are divergent. Use N0 (the result of the add) as the
5149 // addr64, and construct the default resource from a 0 address.
5150 VAddr = N0;
5151 } else {
5152 SRDPtr = N3;
5153 VAddr = N2;
5154 }
5155 } else {
5156 // N2 is not divergent.
5157 SRDPtr = N2;
5158 VAddr = N3;
5159 }
5160 } else if (RBI.getRegBank(Reg: N0, MRI: *MRI, TRI)->getID() == AMDGPU::VGPRRegBankID) {
5161 // Use the default null pointer in the resource
5162 VAddr = N0;
5163 } else {
5164 // N0 -> offset, or
5165 // (N0 + C1) -> offset
5166 SRDPtr = N0;
5167 }
5168
5169 MachineIRBuilder B(*Root.getParent());
5170 RSrcReg = buildAddr64RSrc(B, MRI&: *MRI, TII, BasePtr: SRDPtr);
5171 splitIllegalMUBUFOffset(B, SOffset, ImmOffset&: Offset);
5172 return true;
5173}
5174
5175bool AMDGPUInstructionSelector::selectMUBUFOffsetImpl(
5176 MachineOperand &Root, Register &RSrcReg, Register &SOffset,
5177 int64_t &Offset) const {
5178
5179 // FIXME: Pattern should not reach here.
5180 if (STI.useFlatForGlobal())
5181 return false;
5182
5183 MUBUFAddressData AddrData = parseMUBUFAddress(Src: Root.getReg());
5184 if (shouldUseAddr64(Addr: AddrData))
5185 return false;
5186
5187 // N0 -> offset, or
5188 // (N0 + C1) -> offset
5189 Register SRDPtr = AddrData.N0;
5190 Offset = AddrData.Offset;
5191
5192 // TODO: Look through extensions for 32-bit soffset.
5193 MachineIRBuilder B(*Root.getParent());
5194
5195 RSrcReg = buildOffsetSrc(B, MRI&: *MRI, TII, BasePtr: SRDPtr);
5196 splitIllegalMUBUFOffset(B, SOffset, ImmOffset&: Offset);
5197 return true;
5198}
5199
5200InstructionSelector::ComplexRendererFns
5201AMDGPUInstructionSelector::selectMUBUFAddr64(MachineOperand &Root) const {
5202 Register VAddr;
5203 Register RSrcReg;
5204 Register SOffset;
5205 int64_t Offset = 0;
5206
5207 if (!selectMUBUFAddr64Impl(Root, VAddr, RSrcReg, SOffset, Offset))
5208 return {};
5209
5210 // FIXME: Use defaulted operands for trailing 0s and remove from the complex
5211 // pattern.
5212 return {{
5213 [=](MachineInstrBuilder &MIB) { // rsrc
5214 MIB.addReg(RegNo: RSrcReg);
5215 },
5216 [=](MachineInstrBuilder &MIB) { // vaddr
5217 MIB.addReg(RegNo: VAddr);
5218 },
5219 [=](MachineInstrBuilder &MIB) { // soffset
5220 if (SOffset)
5221 MIB.addReg(RegNo: SOffset);
5222 else if (STI.hasRestrictedSOffset())
5223 MIB.addReg(RegNo: AMDGPU::SGPR_NULL);
5224 else
5225 MIB.addImm(Val: 0);
5226 },
5227 [=](MachineInstrBuilder &MIB) { // offset
5228 MIB.addImm(Val: Offset);
5229 },
5230 addZeroImm, // cpol
5231 addZeroImm, // tfe
5232 addZeroImm // swz
5233 }};
5234}
5235
5236InstructionSelector::ComplexRendererFns
5237AMDGPUInstructionSelector::selectMUBUFOffset(MachineOperand &Root) const {
5238 Register RSrcReg;
5239 Register SOffset;
5240 int64_t Offset = 0;
5241
5242 if (!selectMUBUFOffsetImpl(Root, RSrcReg, SOffset, Offset))
5243 return {};
5244
5245 return {{
5246 [=](MachineInstrBuilder &MIB) { // rsrc
5247 MIB.addReg(RegNo: RSrcReg);
5248 },
5249 [=](MachineInstrBuilder &MIB) { // soffset
5250 if (SOffset)
5251 MIB.addReg(RegNo: SOffset);
5252 else if (STI.hasRestrictedSOffset())
5253 MIB.addReg(RegNo: AMDGPU::SGPR_NULL);
5254 else
5255 MIB.addImm(Val: 0);
5256 },
5257 [=](MachineInstrBuilder &MIB) { MIB.addImm(Val: Offset); }, // offset
5258 addZeroImm, // cpol
5259 addZeroImm, // tfe
5260 addZeroImm, // swz
5261 }};
5262}
5263
5264InstructionSelector::ComplexRendererFns
5265AMDGPUInstructionSelector::selectBUFSOffset(MachineOperand &Root) const {
5266
5267 Register SOffset = Root.getReg();
5268
5269 if (STI.hasRestrictedSOffset() && mi_match(R: SOffset, MRI: *MRI, P: m_ZeroInt()))
5270 SOffset = AMDGPU::SGPR_NULL;
5271
5272 return {{[=](MachineInstrBuilder &MIB) { MIB.addReg(RegNo: SOffset); }}};
5273}
5274
5275 /// Get an immediate that must be 32 bits and is treated as zero-extended.
5276static std::optional<uint64_t>
5277getConstantZext32Val(Register Reg, const MachineRegisterInfo &MRI) {
5278 // getIConstantVRegVal sexts any values, so see if that matters.
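// e.g. (illustrative): a G_CONSTANT i32 -1 comes back sign-extended as -1;
// isInt<32> still accepts it and Lo_32 recovers 0xFFFFFFFF, which the callers
// below treat as a 32-bit zero-extended value.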
5279 std::optional<int64_t> OffsetVal = getIConstantVRegSExtVal(VReg: Reg, MRI);
5280 if (!OffsetVal || !isInt<32>(x: *OffsetVal))
5281 return std::nullopt;
5282 return Lo_32(Value: *OffsetVal);
5283}
5284
5285InstructionSelector::ComplexRendererFns
5286AMDGPUInstructionSelector::selectSMRDBufferImm(MachineOperand &Root) const {
5287 std::optional<uint64_t> OffsetVal = getConstantZext32Val(Reg: Root.getReg(), MRI: *MRI);
5288 if (!OffsetVal)
5289 return {};
5290
5291 std::optional<int64_t> EncodedImm =
5292 AMDGPU::getSMRDEncodedOffset(ST: STI, ByteOffset: *OffsetVal, IsBuffer: true);
5293 if (!EncodedImm)
5294 return {};
5295
5296 return {{ [=](MachineInstrBuilder &MIB) { MIB.addImm(Val: *EncodedImm); } }};
5297}
5298
5299InstructionSelector::ComplexRendererFns
5300AMDGPUInstructionSelector::selectSMRDBufferImm32(MachineOperand &Root) const {
5301 assert(STI.getGeneration() == AMDGPUSubtarget::SEA_ISLANDS);
5302
5303 std::optional<uint64_t> OffsetVal = getConstantZext32Val(Reg: Root.getReg(), MRI: *MRI);
5304 if (!OffsetVal)
5305 return {};
5306
5307 std::optional<int64_t> EncodedImm =
5308 AMDGPU::getSMRDEncodedLiteralOffset32(ST: STI, ByteOffset: *OffsetVal);
5309 if (!EncodedImm)
5310 return {};
5311
5312 return {{ [=](MachineInstrBuilder &MIB) { MIB.addImm(Val: *EncodedImm); } }};
5313}
5314
5315InstructionSelector::ComplexRendererFns
5316AMDGPUInstructionSelector::selectSMRDBufferSgprImm(MachineOperand &Root) const {
5317 // Match the (soffset + offset) pair as a 32-bit register base and
5318 // an immediate offset.
5319 Register SOffset;
5320 unsigned Offset;
5321 std::tie(args&: SOffset, args&: Offset) = AMDGPU::getBaseWithConstantOffset(
5322 MRI&: *MRI, Reg: Root.getReg(), KnownBits: KB, /*CheckNUW*/ true);
5323 if (!SOffset)
5324 return std::nullopt;
5325
5326 std::optional<int64_t> EncodedOffset =
5327 AMDGPU::getSMRDEncodedOffset(ST: STI, ByteOffset: Offset, /* IsBuffer */ true);
5328 if (!EncodedOffset)
5329 return std::nullopt;
5330
5331 assert(MRI->getType(SOffset) == LLT::scalar(32));
5332 return {{[=](MachineInstrBuilder &MIB) { MIB.addReg(RegNo: SOffset); },
5333 [=](MachineInstrBuilder &MIB) { MIB.addImm(Val: *EncodedOffset); }}};
5334}
5335
5336// Variant of stripBitCast that returns the instruction instead of a
5337// MachineOperand.
5338static MachineInstr *stripBitCast(MachineInstr *MI, MachineRegisterInfo &MRI) {
5339 if (MI->getOpcode() == AMDGPU::G_BITCAST)
5340 return getDefIgnoringCopies(Reg: MI->getOperand(i: 1).getReg(), MRI);
5341 return MI;
5342}
5343
5344 // Figure out whether this is really an extract of the high 16 bits of a dword;
5345 // returns nullptr if it isn't.
5346static MachineInstr *isExtractHiElt(MachineInstr *Inst,
5347 MachineRegisterInfo &MRI) {
5348 Inst = stripBitCast(MI: Inst, MRI);
5349
5350 if (Inst->getOpcode() != AMDGPU::G_TRUNC)
5351 return nullptr;
5352
5353 MachineInstr *TruncOp =
5354 getDefIgnoringCopies(Reg: Inst->getOperand(i: 1).getReg(), MRI);
5355 TruncOp = stripBitCast(MI: TruncOp, MRI);
5356
5357 // G_LSHR x, (G_CONSTANT i32 16)
5358 if (TruncOp->getOpcode() == AMDGPU::G_LSHR) {
5359 auto SrlAmount = getIConstantVRegValWithLookThrough(
5360 VReg: TruncOp->getOperand(i: 2).getReg(), MRI);
5361 if (SrlAmount && SrlAmount->Value.getZExtValue() == 16) {
5362 MachineInstr *SrlOp =
5363 getDefIgnoringCopies(Reg: TruncOp->getOperand(i: 1).getReg(), MRI);
5364 return stripBitCast(MI: SrlOp, MRI);
5365 }
5366 }
5367
5368 // G_SHUFFLE_VECTOR x, y, shufflemask(1, 1|0)
5369 // 1, 0 swaps the low/high 16 bits.
5370 // 1, 1 sets the high 16 bits to be the same as the low 16.
5371 // In either case, it selects the high elements.
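// e.g. (illustrative MIR): %s:_(<2 x s16>) = G_SHUFFLE_VECTOR %v, %w,
// shufflemask(1, 1) duplicates the high half of %v, so the surrounding
// G_TRUNC really reads %v's high 16 bits.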
5372 if (TruncOp->getOpcode() == AMDGPU::G_SHUFFLE_VECTOR) {
5373 assert(MRI.getType(TruncOp->getOperand(0).getReg()) ==
5374 LLT::fixed_vector(2, 16));
5375
5376 ArrayRef<int> Mask = TruncOp->getOperand(i: 3).getShuffleMask();
5377 assert(Mask.size() == 2);
5378
5379 if (Mask[0] == 1 && Mask[1] <= 1) {
5380 MachineInstr *LHS =
5381 getDefIgnoringCopies(Reg: TruncOp->getOperand(i: 1).getReg(), MRI);
5382 return stripBitCast(MI: LHS, MRI);
5383 }
5384 }
5385
5386 return nullptr;
5387}
5388
5389std::pair<Register, unsigned>
5390AMDGPUInstructionSelector::selectVOP3PMadMixModsImpl(MachineOperand &Root,
5391 bool &Matched) const {
5392 Matched = false;
5393
5394 Register Src;
5395 unsigned Mods;
5396 std::tie(args&: Src, args&: Mods) = selectVOP3ModsImpl(Root);
5397
5398 MachineInstr *MI = getDefIgnoringCopies(Reg: Src, MRI: *MRI);
5399 if (MI->getOpcode() == AMDGPU::G_FPEXT) {
5400 MachineOperand *MO = &MI->getOperand(i: 1);
5401 Src = MO->getReg();
5402 MI = getDefIgnoringCopies(Reg: Src, MRI: *MRI);
5403
5404 assert(MRI->getType(Src) == LLT::scalar(16));
5405
5406 // See through bitcasts.
5407 // FIXME: Would be nice to use stripBitCast here.
5408 if (MI->getOpcode() == AMDGPU::G_BITCAST) {
5409 MO = &MI->getOperand(i: 1);
5410 Src = MO->getReg();
5411 MI = getDefIgnoringCopies(Reg: Src, MRI: *MRI);
5412 }
5413
5414 const auto CheckAbsNeg = [&]() {
5415 // Be careful about folding modifiers if we already have an abs. fneg is
5416 // applied last, so we don't want to apply an earlier fneg.
5417 if ((Mods & SISrcMods::ABS) == 0) {
5418 unsigned ModsTmp;
5419 std::tie(args&: Src, args&: ModsTmp) = selectVOP3ModsImpl(Root&: *MO);
5420 MI = getDefIgnoringCopies(Reg: Src, MRI: *MRI);
5421
5422 if ((ModsTmp & SISrcMods::NEG) != 0)
5423 Mods ^= SISrcMods::NEG;
5424
5425 if ((ModsTmp & SISrcMods::ABS) != 0)
5426 Mods |= SISrcMods::ABS;
5427 }
5428 };
5429
5430 CheckAbsNeg();
5431
5432 // op_sel/op_sel_hi decide the source type and the source.
5433 // If the source's op_sel_hi is set, it indicates a conversion from fp16.
5434 // If the source's op_sel is set, it picks the high half of the source
5435 // register.
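//
// Illustrative mapping (assumed mad/fma_mix semantics): OP_SEL_1 alone means
// "convert the f16 value in the low half of the 32-bit source", while
// OP_SEL_0 | OP_SEL_1 selects the f16 value in the high half instead.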
5436
5437 Mods |= SISrcMods::OP_SEL_1;
5438
5439 if (MachineInstr *ExtractHiEltMI = isExtractHiElt(Inst: MI, MRI&: *MRI)) {
5440 Mods |= SISrcMods::OP_SEL_0;
5441 MI = ExtractHiEltMI;
5442 MO = &MI->getOperand(i: 0);
5443 Src = MO->getReg();
5444
5445 CheckAbsNeg();
5446 }
5447
5448 Matched = true;
5449 }
5450
5451 return {Src, Mods};
5452}
5453
5454InstructionSelector::ComplexRendererFns
5455AMDGPUInstructionSelector::selectVOP3PMadMixModsExt(
5456 MachineOperand &Root) const {
5457 Register Src;
5458 unsigned Mods;
5459 bool Matched;
5460 std::tie(args&: Src, args&: Mods) = selectVOP3PMadMixModsImpl(Root, Matched);
5461 if (!Matched)
5462 return {};
5463
5464 return {{
5465 [=](MachineInstrBuilder &MIB) { MIB.addReg(RegNo: Src); },
5466 [=](MachineInstrBuilder &MIB) { MIB.addImm(Val: Mods); } // src_mods
5467 }};
5468}
5469
5470InstructionSelector::ComplexRendererFns
5471AMDGPUInstructionSelector::selectVOP3PMadMixMods(MachineOperand &Root) const {
5472 Register Src;
5473 unsigned Mods;
5474 bool Matched;
5475 std::tie(args&: Src, args&: Mods) = selectVOP3PMadMixModsImpl(Root, Matched);
5476
5477 return {{
5478 [=](MachineInstrBuilder &MIB) { MIB.addReg(RegNo: Src); },
5479 [=](MachineInstrBuilder &MIB) { MIB.addImm(Val: Mods); } // src_mods
5480 }};
5481}
5482
5483bool AMDGPUInstructionSelector::selectSBarrierSignalIsfirst(
5484 MachineInstr &I, Intrinsic::ID IntrID) const {
5485 MachineBasicBlock *MBB = I.getParent();
5486 const DebugLoc &DL = I.getDebugLoc();
5487 Register CCReg = I.getOperand(i: 0).getReg();
5488
5489 bool HasM0 = IntrID == Intrinsic::amdgcn_s_barrier_signal_isfirst_var;
5490
5491 if (HasM0) {
5492 auto CopyMIB = BuildMI(BB&: *MBB, I: &I, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::COPY), DestReg: AMDGPU::M0)
5493 .addReg(RegNo: I.getOperand(i: 2).getReg());
5494 BuildMI(BB&: *MBB, I: &I, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::S_BARRIER_SIGNAL_ISFIRST_M0));
5495 if (!constrainSelectedInstRegOperands(I&: *CopyMIB, TII, TRI, RBI))
5496 return false;
5497 } else {
5498 BuildMI(BB&: *MBB, I: &I, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::S_BARRIER_SIGNAL_ISFIRST_IMM))
5499 .addImm(Val: I.getOperand(i: 2).getImm());
5500 }
5501
5502 BuildMI(BB&: *MBB, I: &I, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::COPY), DestReg: CCReg).addReg(RegNo: AMDGPU::SCC);
5503
5504 I.eraseFromParent();
5505 return RBI.constrainGenericRegister(Reg: CCReg, RC: AMDGPU::SReg_32_XM0_XEXECRegClass,
5506 MRI&: *MRI);
5507}
5508
5509unsigned getNamedBarrierOp(bool HasInlineConst, Intrinsic::ID IntrID) {
5510 if (HasInlineConst) {
5511 switch (IntrID) {
5512 default:
5513 llvm_unreachable("not a named barrier op");
5514 case Intrinsic::amdgcn_s_barrier_init:
5515 return AMDGPU::S_BARRIER_INIT_IMM;
5516 case Intrinsic::amdgcn_s_barrier_join:
5517 return AMDGPU::S_BARRIER_JOIN_IMM;
5518 case Intrinsic::amdgcn_s_wakeup_barrier:
5519 return AMDGPU::S_WAKEUP_BARRIER_IMM;
5520 case Intrinsic::amdgcn_s_get_barrier_state:
5521 return AMDGPU::S_GET_BARRIER_STATE_IMM;
5522 };
5523 } else {
5524 switch (IntrID) {
5525 default:
5526 llvm_unreachable("not a named barrier op");
5527 case Intrinsic::amdgcn_s_barrier_init:
5528 return AMDGPU::S_BARRIER_INIT_M0;
5529 case Intrinsic::amdgcn_s_barrier_join:
5530 return AMDGPU::S_BARRIER_JOIN_M0;
5531 case Intrinsic::amdgcn_s_wakeup_barrier:
5532 return AMDGPU::S_WAKEUP_BARRIER_M0;
5533 case Intrinsic::amdgcn_s_get_barrier_state:
5534 return AMDGPU::S_GET_BARRIER_STATE_M0;
5535 };
5536 }
5537}
5538
5539bool AMDGPUInstructionSelector::selectNamedBarrierInst(
5540 MachineInstr &I, Intrinsic::ID IntrID) const {
5541 MachineBasicBlock *MBB = I.getParent();
5542 const DebugLoc &DL = I.getDebugLoc();
5543 MachineOperand BarOp = IntrID == Intrinsic::amdgcn_s_get_barrier_state
5544 ? I.getOperand(i: 2)
5545 : I.getOperand(i: 1);
5546 std::optional<int64_t> BarValImm =
5547 getIConstantVRegSExtVal(VReg: BarOp.getReg(), MRI: *MRI);
5548 Register M0Val;
5549 Register TmpReg0;
5550
5551 // For S_BARRIER_INIT, the member count is always read from M0[16:22].
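// (Illustrative layout, matching the shift/OR built below:
//  M0 = (MemberCount << 16) | BarrierID, with the barrier id occupying
//  M0[4:0] when it is not an inline constant.)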
5552 if (IntrID == Intrinsic::amdgcn_s_barrier_init) {
5553 Register MemberCount = I.getOperand(i: 2).getReg();
5554 TmpReg0 = MRI->createVirtualRegister(RegClass: &AMDGPU::SReg_32RegClass);
5555 // TODO: This should be expanded during legalization so that the S_LSHL
5556 // and S_OR can be constant-folded.
5557 BuildMI(BB&: *MBB, I: &I, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::S_LSHL_B32), DestReg: TmpReg0)
5558 .addImm(Val: 16)
5559 .addReg(RegNo: MemberCount);
5560 M0Val = TmpReg0;
5561 }
5562
5563 // If not inlinable, get a reference to the barrier depending on the instruction.
5564 if (!BarValImm) {
5565 if (IntrID == Intrinsic::amdgcn_s_barrier_init) {
5566 // If the reference to the barrier id is not an inlinable constant, then it
5567 // must be referenced via M0[4:0]. Perform an OR with the member count to
5568 // include it in M0 for S_BARRIER_INIT.
5569 Register TmpReg1 = MRI->createVirtualRegister(RegClass: &AMDGPU::SReg_32RegClass);
5570 BuildMI(BB&: *MBB, I: &I, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::S_OR_B32), DestReg: TmpReg1)
5571 .addReg(RegNo: BarOp.getReg())
5572 .addReg(RegNo: TmpReg0);
5573 M0Val = TmpReg1;
5574 } else {
5575 M0Val = BarOp.getReg();
5576 }
5577 }
5578
5579 // Build copy to M0 if needed. For S_BARRIER_INIT, M0 is always required.
5580 if (M0Val) {
5581 auto CopyMIB =
5582 BuildMI(BB&: *MBB, I: &I, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::COPY), DestReg: AMDGPU::M0).addReg(RegNo: M0Val);
5583 constrainSelectedInstRegOperands(I&: *CopyMIB, TII, TRI, RBI);
5584 }
5585
5586 MachineInstrBuilder MIB;
5587 unsigned Opc = getNamedBarrierOp(HasInlineConst: BarValImm.has_value(), IntrID);
5588 MIB = BuildMI(BB&: *MBB, I: &I, MIMD: DL, MCID: TII.get(Opcode: Opc));
5589
5590 if (IntrID == Intrinsic::amdgcn_s_get_barrier_state)
5591 MIB.addDef(RegNo: I.getOperand(i: 0).getReg());
5592
5593 if (BarValImm)
5594 MIB.addImm(Val: *BarValImm);
5595
5596 I.eraseFromParent();
5597 return true;
5598}
5599
5600bool AMDGPUInstructionSelector::selectSBarrierLeave(MachineInstr &I) const {
5601 MachineBasicBlock *BB = I.getParent();
5602 const DebugLoc &DL = I.getDebugLoc();
5603 Register CCReg = I.getOperand(i: 0).getReg();
5604
5605 BuildMI(BB&: *BB, I: &I, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::S_BARRIER_LEAVE));
5606 BuildMI(BB&: *BB, I: &I, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::COPY), DestReg: CCReg).addReg(RegNo: AMDGPU::SCC);
5607
5608 I.eraseFromParent();
5609 return RBI.constrainGenericRegister(Reg: CCReg, RC: AMDGPU::SReg_32_XM0_XEXECRegClass,
5610 MRI&: *MRI);
5611}
5612
5613void AMDGPUInstructionSelector::renderTruncImm32(MachineInstrBuilder &MIB,
5614 const MachineInstr &MI,
5615 int OpIdx) const {
5616 assert(MI.getOpcode() == TargetOpcode::G_CONSTANT && OpIdx == -1 &&
5617 "Expected G_CONSTANT");
5618 MIB.addImm(Val: MI.getOperand(i: 1).getCImm()->getSExtValue());
5619}
5620
5621void AMDGPUInstructionSelector::renderNegateImm(MachineInstrBuilder &MIB,
5622 const MachineInstr &MI,
5623 int OpIdx) const {
5624 assert(MI.getOpcode() == TargetOpcode::G_CONSTANT && OpIdx == -1 &&
5625 "Expected G_CONSTANT");
5626 MIB.addImm(Val: -MI.getOperand(i: 1).getCImm()->getSExtValue());
5627}
5628
5629void AMDGPUInstructionSelector::renderBitcastImm(MachineInstrBuilder &MIB,
5630 const MachineInstr &MI,
5631 int OpIdx) const {
5632 assert(OpIdx == -1);
5633
5634 const MachineOperand &Op = MI.getOperand(i: 1);
5635 if (MI.getOpcode() == TargetOpcode::G_FCONSTANT)
5636 MIB.addImm(Val: Op.getFPImm()->getValueAPF().bitcastToAPInt().getZExtValue());
5637 else {
5638 assert(MI.getOpcode() == TargetOpcode::G_CONSTANT && "Expected G_CONSTANT");
5639 MIB.addImm(Val: Op.getCImm()->getSExtValue());
5640 }
5641}
5642
5643void AMDGPUInstructionSelector::renderPopcntImm(MachineInstrBuilder &MIB,
5644 const MachineInstr &MI,
5645 int OpIdx) const {
5646 assert(MI.getOpcode() == TargetOpcode::G_CONSTANT && OpIdx == -1 &&
5647 "Expected G_CONSTANT");
5648 MIB.addImm(Val: MI.getOperand(i: 1).getCImm()->getValue().popcount());
5649}
5650
5651 /// This only really exists to satisfy the DAG type-checking machinery, so it
5652 /// is a no-op here.
5653void AMDGPUInstructionSelector::renderTruncTImm(MachineInstrBuilder &MIB,
5654 const MachineInstr &MI,
5655 int OpIdx) const {
5656 MIB.addImm(Val: MI.getOperand(i: OpIdx).getImm());
5657}
5658
5659void AMDGPUInstructionSelector::renderOpSelTImm(MachineInstrBuilder &MIB,
5660 const MachineInstr &MI,
5661 int OpIdx) const {
5662 assert(OpIdx >= 0 && "expected to match an immediate operand");
5663 MIB.addImm(Val: MI.getOperand(i: OpIdx).getImm() ? (int64_t)SISrcMods::OP_SEL_0 : 0);
5664}
5665
5666void AMDGPUInstructionSelector::renderExtractCPol(MachineInstrBuilder &MIB,
5667 const MachineInstr &MI,
5668 int OpIdx) const {
5669 assert(OpIdx >= 0 && "expected to match an immediate operand");
5670 MIB.addImm(Val: MI.getOperand(i: OpIdx).getImm() &
5671 (AMDGPU::isGFX12Plus(STI) ? AMDGPU::CPol::ALL
5672 : AMDGPU::CPol::ALL_pregfx12));
5673}
5674
5675void AMDGPUInstructionSelector::renderExtractSWZ(MachineInstrBuilder &MIB,
5676 const MachineInstr &MI,
5677 int OpIdx) const {
5678 assert(OpIdx >= 0 && "expected to match an immediate operand");
5679 const bool Swizzle = MI.getOperand(i: OpIdx).getImm() &
5680 (AMDGPU::isGFX12Plus(STI) ? AMDGPU::CPol::SWZ
5681 : AMDGPU::CPol::SWZ_pregfx12);
5682 MIB.addImm(Val: Swizzle);
5683}
5684
5685void AMDGPUInstructionSelector::renderExtractCpolSetGLC(
5686 MachineInstrBuilder &MIB, const MachineInstr &MI, int OpIdx) const {
5687 assert(OpIdx >= 0 && "expected to match an immediate operand");
5688 const uint32_t Cpol = MI.getOperand(i: OpIdx).getImm() &
5689 (AMDGPU::isGFX12Plus(STI) ? AMDGPU::CPol::ALL
5690 : AMDGPU::CPol::ALL_pregfx12);
5691 MIB.addImm(Val: Cpol | AMDGPU::CPol::GLC);
5692}
5693
5694void AMDGPUInstructionSelector::renderFrameIndex(MachineInstrBuilder &MIB,
5695 const MachineInstr &MI,
5696 int OpIdx) const {
5697 MIB.addFrameIndex(Idx: MI.getOperand(i: 1).getIndex());
5698}
5699
5700void AMDGPUInstructionSelector::renderFPPow2ToExponent(MachineInstrBuilder &MIB,
5701 const MachineInstr &MI,
5702 int OpIdx) const {
5703 const APFloat &APF = MI.getOperand(i: 1).getFPImm()->getValueAPF();
5704 int ExpVal = APF.getExactLog2Abs();
5705 assert(ExpVal != INT_MIN);
5706 MIB.addImm(Val: ExpVal);
5707}
5708
5709bool AMDGPUInstructionSelector::isInlineImmediate(const APInt &Imm) const {
5710 return TII.isInlineConstant(Imm);
5711}
5712
5713bool AMDGPUInstructionSelector::isInlineImmediate(const APFloat &Imm) const {
5714 return TII.isInlineConstant(Imm);
5715}
5716