1//===- AMDGPURegisterBankInfo.cpp -------------------------------*- C++ -*-==//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8/// \file
9/// This file implements the targeting of the RegisterBankInfo class for
10/// AMDGPU.
11///
12/// \par
13///
14/// AMDGPU has unique register bank constraints that require special high level
15/// strategies to deal with. There are two main true physical register banks
16/// VGPR (vector), and SGPR (scalar). Additionally the VCC register bank is a
17/// sort of pseudo-register bank needed to represent SGPRs used in a vector
18/// boolean context. There is also the AGPR bank, which is a special purpose
19/// physical register bank present on some subtargets.
20///
21/// Copying from VGPR to SGPR is generally illegal, unless the value is known to
22/// be uniform. It is generally not valid to legalize operands by inserting
23/// copies as on other targets. Operations which require uniform, SGPR operands
24/// generally require scalarization by repeatedly executing the instruction,
25/// activating each set of lanes using a unique set of input values. This is
26/// referred to as a waterfall loop.
27///
28/// \par Booleans
29///
/// Booleans (s1 values) require special consideration. A vector compare result
31/// is naturally a bitmask with one bit per lane, in a 32 or 64-bit
32/// register. These are represented with the VCC bank. During selection, we need
33/// to be able to unambiguously go back from a register class to a register
34/// bank. To distinguish whether an SGPR should use the SGPR or VCC register
35/// bank, we need to know the use context type. An SGPR s1 value always means a
36/// VCC bank value, otherwise it will be the SGPR bank. A scalar compare sets
37/// SCC, which is a 1-bit unaddressable register. This will need to be copied to
38/// a 32-bit virtual register. Taken together, this means we need to adjust the
39/// type of boolean operations to be regbank legal. All SALU booleans need to be
40/// widened to 32-bits, and all VALU booleans need to be s1 values.
41///
42/// A noteworthy exception to the s1-means-vcc rule is for legalization artifact
43/// casts. G_TRUNC s1 results, and G_SEXT/G_ZEXT/G_ANYEXT sources are never vcc
44/// bank. A non-boolean source (such as a truncate from a 1-bit load from
45/// memory) will require a copy to the VCC bank which will require clearing the
46/// high bits and inserting a compare.
47///
48/// \par Constant bus restriction
49///
50/// VALU instructions have a limitation known as the constant bus
51/// restriction. Most VALU instructions can use SGPR operands, but may read at
/// most 1 SGPR or constant literal value (this was raised to 2 in gfx10 for most
53/// instructions). This is one unique SGPR, so the same SGPR may be used for
54/// multiple operands. From a register bank perspective, any combination of
55/// operands should be legal as an SGPR, but this is contextually dependent on
/// the SGPR operands all being the same register. It is therefore optimal to
57/// choose the SGPR with the most uses to minimize the number of copies.
58///
59/// We avoid trying to solve this problem in RegBankSelect. Any VALU G_*
60/// operation should have its source operands all mapped to VGPRs (except for
/// VCC), inserting copies from any SGPR operands. This is the most trivial legal
62/// mapping. Anything beyond the simplest 1:1 instruction selection would be too
63/// complicated to solve here. Every optimization pattern or instruction
64/// selected to multiple outputs would have to enforce this rule, and there
65/// would be additional complexity in tracking this rule for every G_*
66/// operation. By forcing all inputs to VGPRs, it also simplifies the task of
67/// picking the optimal operand combination from a post-isel optimization pass.
68///
69//===----------------------------------------------------------------------===//
70
71#include "AMDGPURegisterBankInfo.h"
72
73#include "AMDGPU.h"
74#include "AMDGPUGlobalISelUtils.h"
75#include "AMDGPUInstrInfo.h"
76#include "AMDGPULaneMaskUtils.h"
77#include "GCNSubtarget.h"
78#include "SIMachineFunctionInfo.h"
79#include "SIRegisterInfo.h"
80#include "llvm/CodeGen/GlobalISel/GenericMachineInstrs.h"
81#include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
82#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
83#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
84#include "llvm/CodeGen/RegisterBank.h"
85#include "llvm/IR/IntrinsicsAMDGPU.h"
86
87#define GET_TARGET_REGBANK_IMPL
88#include "AMDGPUGenRegisterBank.inc"
89
90// This file will be TableGen'ed at some point.
91#include "AMDGPUGenRegisterBankInfo.def"
92
93using namespace llvm;
94using namespace MIPatternMatch;
95
96namespace {
97
98// Observer to apply a register bank to new registers created by LegalizerHelper.
99class ApplyRegBankMapping final : public GISelChangeObserver {
100private:
101 MachineIRBuilder &B;
102 const AMDGPURegisterBankInfo &RBI;
103 MachineRegisterInfo &MRI;
104 const RegisterBank *NewBank;
105 SmallVector<MachineInstr *, 4> NewInsts;
106
107public:
108 ApplyRegBankMapping(MachineIRBuilder &B, const AMDGPURegisterBankInfo &RBI_,
109 MachineRegisterInfo &MRI_, const RegisterBank *RB)
110 : B(B), RBI(RBI_), MRI(MRI_), NewBank(RB) {
111 assert(!B.isObservingChanges());
112 B.setChangeObserver(*this);
113 }
114
115 ~ApplyRegBankMapping() override {
116 for (MachineInstr *MI : NewInsts)
117 applyBank(MI&: *MI);
118
119 B.stopObservingChanges();
120 }
121
122 /// Set any registers that don't have a set register class or bank to SALU.
123 void applyBank(MachineInstr &MI) {
124 const unsigned Opc = MI.getOpcode();
125 if (Opc == AMDGPU::G_ANYEXT || Opc == AMDGPU::G_ZEXT ||
126 Opc == AMDGPU::G_SEXT) {
127 // LegalizerHelper wants to use the basic legalization artifacts when
128 // widening etc. We don't handle selection with vcc in artifact sources,
129 // so we need to use a select instead to handle these properly.
130 Register DstReg = MI.getOperand(i: 0).getReg();
131 Register SrcReg = MI.getOperand(i: 1).getReg();
132 const RegisterBank *SrcBank = RBI.getRegBank(Reg: SrcReg, MRI, TRI: *RBI.TRI);
133 if (SrcBank == &AMDGPU::VCCRegBank) {
134 const LLT S32 = LLT::scalar(SizeInBits: 32);
135 assert(MRI.getType(SrcReg) == LLT::scalar(1));
136 assert(MRI.getType(DstReg) == S32);
137 assert(NewBank == &AMDGPU::VGPRRegBank);
138
139 // Replace the extension with a select, which really uses the boolean
140 // source.
141 B.setInsertPt(MBB&: *MI.getParent(), II: MI);
142
143 auto True = B.buildConstant(Res: S32, Val: Opc == AMDGPU::G_SEXT ? -1 : 1);
144 auto False = B.buildConstant(Res: S32, Val: 0);
145 B.buildSelect(Res: DstReg, Tst: SrcReg, Op0: True, Op1: False);
146 MRI.setRegBank(Reg: True.getReg(Idx: 0), RegBank: *NewBank);
147 MRI.setRegBank(Reg: False.getReg(Idx: 0), RegBank: *NewBank);
148 MI.eraseFromParent();
149 }
150
151 assert(!MRI.getRegClassOrRegBank(DstReg));
152 MRI.setRegBank(Reg: DstReg, RegBank: *NewBank);
153 return;
154 }
155
156#ifndef NDEBUG
157 if (Opc == AMDGPU::G_TRUNC) {
158 Register DstReg = MI.getOperand(0).getReg();
159 const RegisterBank *DstBank = RBI.getRegBank(DstReg, MRI, *RBI.TRI);
160 assert(DstBank != &AMDGPU::VCCRegBank);
161 }
162#endif
163
164 for (MachineOperand &Op : MI.operands()) {
165 if (!Op.isReg())
166 continue;
167
168 // We may see physical registers if building a real MI
169 Register Reg = Op.getReg();
170 if (Reg.isPhysical() || MRI.getRegClassOrRegBank(Reg))
171 continue;
172
173 const RegisterBank *RB = NewBank;
174 if (MRI.getType(Reg) == LLT::scalar(SizeInBits: 1)) {
175 assert(NewBank == &AMDGPU::VGPRRegBank &&
176 "s1 operands should only be used for vector bools");
177 assert((MI.getOpcode() != AMDGPU::G_TRUNC &&
178 MI.getOpcode() != AMDGPU::G_ANYEXT) &&
179 "not expecting legalization artifacts here");
180 RB = &AMDGPU::VCCRegBank;
181 }
182
183 MRI.setRegBank(Reg, RegBank: *RB);
184 }
185 }
186
187 void erasingInstr(MachineInstr &MI) override {}
188
189 void createdInstr(MachineInstr &MI) override {
190 // At this point, the instruction was just inserted and has no operands.
191 NewInsts.push_back(Elt: &MI);
192 }
193
194 void changingInstr(MachineInstr &MI) override {}
195 void changedInstr(MachineInstr &MI) override {
196 // FIXME: In principle we should probably add the instruction to NewInsts,
197 // but the way the LegalizerHelper uses the observer, we will always see the
198 // registers we need to set the regbank on also referenced in a new
199 // instruction.
200 }
201};
202
203} // anonymous namespace
204
205AMDGPURegisterBankInfo::AMDGPURegisterBankInfo(const GCNSubtarget &ST)
206 : Subtarget(ST), TRI(Subtarget.getRegisterInfo()),
207 TII(Subtarget.getInstrInfo()) {
208
209 // HACK: Until this is fully tablegen'd.
210 static llvm::once_flag InitializeRegisterBankFlag;
211
212 static auto InitializeRegisterBankOnce = [this]() {
213 assert(&getRegBank(AMDGPU::SGPRRegBankID) == &AMDGPU::SGPRRegBank &&
214 &getRegBank(AMDGPU::VGPRRegBankID) == &AMDGPU::VGPRRegBank &&
215 &getRegBank(AMDGPU::AGPRRegBankID) == &AMDGPU::AGPRRegBank);
216 (void)this;
217 };
218
219 llvm::call_once(flag&: InitializeRegisterBankFlag, F&: InitializeRegisterBankOnce);
220}
221
222static bool isVectorRegisterBank(const RegisterBank &Bank) {
223 unsigned BankID = Bank.getID();
224 return BankID == AMDGPU::VGPRRegBankID || BankID == AMDGPU::AGPRRegBankID;
225}
226
227bool AMDGPURegisterBankInfo::isDivergentRegBank(const RegisterBank *RB) const {
228 return RB != &AMDGPU::SGPRRegBank;
229}
230
231unsigned AMDGPURegisterBankInfo::copyCost(const RegisterBank &Dst,
232 const RegisterBank &Src,
233 TypeSize Size) const {
234 // TODO: Should there be a UniformVGPRRegBank which can use readfirstlane?
235 if (Dst.getID() == AMDGPU::SGPRRegBankID &&
236 (isVectorRegisterBank(Bank: Src) || Src.getID() == AMDGPU::VCCRegBankID)) {
237 return std::numeric_limits<unsigned>::max();
238 }
239
240 // Bool values are tricky, because the meaning is based on context. The SCC
241 // and VCC banks are for the natural scalar and vector conditions produced by
242 // a compare.
243 //
244 // Legalization doesn't know about the necessary context, so an s1 use may
245 // have been a truncate from an arbitrary value, in which case a copy (lowered
246 // as a compare with 0) needs to be inserted.
247 if (Size == 1 &&
248 (Dst.getID() == AMDGPU::SGPRRegBankID) &&
249 (isVectorRegisterBank(Bank: Src) ||
250 Src.getID() == AMDGPU::SGPRRegBankID ||
251 Src.getID() == AMDGPU::VCCRegBankID))
252 return std::numeric_limits<unsigned>::max();
253
254 // There is no direct copy between AGPRs.
255 if (Dst.getID() == AMDGPU::AGPRRegBankID &&
256 Src.getID() == AMDGPU::AGPRRegBankID)
257 return 4;
258
259 return RegisterBankInfo::copyCost(A: Dst, B: Src, Size);
260}
261
262unsigned AMDGPURegisterBankInfo::getBreakDownCost(
263 const ValueMapping &ValMapping,
264 const RegisterBank *CurBank) const {
265 // Check if this is a breakdown for G_LOAD to move the pointer from SGPR to
266 // VGPR.
267 // FIXME: Is there a better way to do this?
268 if (ValMapping.NumBreakDowns >= 2 || ValMapping.BreakDown[0].Length >= 64)
269 return 10; // This is expensive.
270
271 assert(ValMapping.NumBreakDowns == 2 &&
272 ValMapping.BreakDown[0].Length == 32 &&
273 ValMapping.BreakDown[0].StartIdx == 0 &&
274 ValMapping.BreakDown[1].Length == 32 &&
275 ValMapping.BreakDown[1].StartIdx == 32 &&
276 ValMapping.BreakDown[0].RegBank == ValMapping.BreakDown[1].RegBank);
277
278 // 32-bit extract of a 64-bit value is just access of a subregister, so free.
279 // TODO: Cost of 0 hits assert, though it's not clear it's what we really
280 // want.
281
282 // TODO: 32-bit insert to a 64-bit SGPR may incur a non-free copy due to SGPR
283 // alignment restrictions, but this probably isn't important.
284 return 1;
285}
286
287const RegisterBank &
288AMDGPURegisterBankInfo::getRegBankFromRegClass(const TargetRegisterClass &RC,
289 LLT Ty) const {
290 // We promote real scalar booleans to SReg_32. Any SGPR using s1 is really a
291 // VCC-like use.
292 if (TRI->isSGPRClass(RC: &RC)) {
293 // FIXME: This probably came from a copy from a physical register, which
294 // should be inferable from the copied to-type. We don't have many boolean
295 // physical register constraints so just assume a normal SGPR for now.
296 if (!Ty.isValid())
297 return AMDGPU::SGPRRegBank;
298
299 return Ty == LLT::scalar(SizeInBits: 1) ? AMDGPU::VCCRegBank : AMDGPU::SGPRRegBank;
300 }
301
302 return TRI->isAGPRClass(RC: &RC) ? AMDGPU::AGPRRegBank : AMDGPU::VGPRRegBank;
303}
304
305template <unsigned NumOps>
306RegisterBankInfo::InstructionMappings
307AMDGPURegisterBankInfo::addMappingFromTable(
308 const MachineInstr &MI, const MachineRegisterInfo &MRI,
309 const std::array<unsigned, NumOps> RegSrcOpIdx,
310 ArrayRef<OpRegBankEntry<NumOps>> Table) const {
311
312 InstructionMappings AltMappings;
313
314 SmallVector<const ValueMapping *, 10> Operands(MI.getNumOperands());
315
316 unsigned Sizes[NumOps];
317 for (unsigned I = 0; I < NumOps; ++I) {
318 Register Reg = MI.getOperand(RegSrcOpIdx[I]).getReg();
319 Sizes[I] = getSizeInBits(Reg, MRI, TRI: *TRI);
320 }
321
322 for (unsigned I = 0, E = MI.getNumExplicitDefs(); I != E; ++I) {
323 unsigned SizeI = getSizeInBits(Reg: MI.getOperand(i: I).getReg(), MRI, TRI: *TRI);
324 Operands[I] = AMDGPU::getValueMapping(BankID: AMDGPU::VGPRRegBankID, Size: SizeI);
325 }
326
327 // getInstrMapping's default mapping uses ID 1, so start at 2.
328 unsigned MappingID = 2;
329 for (const auto &Entry : Table) {
330 for (unsigned I = 0; I < NumOps; ++I) {
331 int OpIdx = RegSrcOpIdx[I];
332 Operands[OpIdx] = AMDGPU::getValueMapping(BankID: Entry.RegBanks[I], Size: Sizes[I]);
333 }
334
335 AltMappings.push_back(Elt: &getInstructionMapping(ID: MappingID++, Cost: Entry.Cost,
336 OperandsMapping: getOperandsMapping(OpdsMapping: Operands),
337 NumOperands: Operands.size()));
338 }
339
340 return AltMappings;
341}
342
343RegisterBankInfo::InstructionMappings
344AMDGPURegisterBankInfo::getInstrAlternativeMappingsIntrinsic(
345 const MachineInstr &MI, const MachineRegisterInfo &MRI) const {
346 switch (cast<GIntrinsic>(Val: MI).getIntrinsicID()) {
347 case Intrinsic::amdgcn_readlane: {
348 static const OpRegBankEntry<3> Table[2] = {
349 // Perfectly legal.
350 { .RegBanks: { AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID }, .Cost: 1 },
351
352 // Need a readfirstlane for the index.
353 { .RegBanks: { AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID }, .Cost: 2 }
354 };
355
356 const std::array<unsigned, 3> RegSrcOpIdx = { ._M_elems: { 0, 2, 3 } };
357 return addMappingFromTable<3>(MI, MRI, RegSrcOpIdx, Table);
358 }
359 case Intrinsic::amdgcn_writelane: {
360 static const OpRegBankEntry<4> Table[4] = {
361 // Perfectly legal.
362 { .RegBanks: { AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID, AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID }, .Cost: 1 },
363
364 // Need readfirstlane of first op
365 { .RegBanks: { AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID }, .Cost: 2 },
366
367 // Need readfirstlane of second op
368 { .RegBanks: { AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID }, .Cost: 2 },
369
370 // Need readfirstlane of both ops
371 { .RegBanks: { AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID }, .Cost: 3 }
372 };
373
374 // rsrc, voffset, offset
375 const std::array<unsigned, 4> RegSrcOpIdx = { ._M_elems: { 0, 2, 3, 4 } };
376 return addMappingFromTable<4>(MI, MRI, RegSrcOpIdx, Table);
377 }
378 default:
379 return RegisterBankInfo::getInstrAlternativeMappings(MI);
380 }
381}
382
383RegisterBankInfo::InstructionMappings
384AMDGPURegisterBankInfo::getInstrAlternativeMappingsIntrinsicWSideEffects(
385 const MachineInstr &MI, const MachineRegisterInfo &MRI) const {
386
387 switch (cast<GIntrinsic>(Val: MI).getIntrinsicID()) {
388 case Intrinsic::amdgcn_s_buffer_load: {
389 static const OpRegBankEntry<2> Table[4] = {
390 // Perfectly legal.
391 { .RegBanks: { AMDGPU::SGPRRegBankID, AMDGPU::SGPRRegBankID }, .Cost: 1 },
392
393 // Only need 1 register in loop
394 { .RegBanks: { AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID }, .Cost: 300 },
395
396 // Have to waterfall the resource.
397 { .RegBanks: { AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID }, .Cost: 1000 },
398
399 // Have to waterfall the resource, and the offset.
400 { .RegBanks: { AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID }, .Cost: 1500 }
401 };
402
403 // rsrc, offset
404 const std::array<unsigned, 2> RegSrcOpIdx = { ._M_elems: { 2, 3 } };
405 return addMappingFromTable<2>(MI, MRI, RegSrcOpIdx, Table);
406 }
407 case Intrinsic::amdgcn_ds_ordered_add:
408 case Intrinsic::amdgcn_ds_ordered_swap: {
409 // VGPR = M0, VGPR
410 static const OpRegBankEntry<3> Table[2] = {
411 // Perfectly legal.
412 { .RegBanks: { AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID }, .Cost: 1 },
413
414 // Need a readfirstlane for m0
415 { .RegBanks: { AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID }, .Cost: 2 }
416 };
417
418 const std::array<unsigned, 3> RegSrcOpIdx = { ._M_elems: { 0, 2, 3 } };
419 return addMappingFromTable<3>(MI, MRI, RegSrcOpIdx, Table);
420 }
421 case Intrinsic::amdgcn_s_sendmsg:
422 case Intrinsic::amdgcn_s_sendmsghalt: {
423 // FIXME: Should have no register for immediate
424 static const OpRegBankEntry<1> Table[2] = {
425 // Perfectly legal.
426 { .RegBanks: { AMDGPU::SGPRRegBankID }, .Cost: 1 },
427
428 // Need readlane
429 { .RegBanks: { AMDGPU::VGPRRegBankID }, .Cost: 3 }
430 };
431
432 const std::array<unsigned, 1> RegSrcOpIdx = { ._M_elems: { 2 } };
433 return addMappingFromTable<1>(MI, MRI, RegSrcOpIdx, Table);
434 }
435 default:
436 return RegisterBankInfo::getInstrAlternativeMappings(MI);
437 }
438}
439
440// FIXME: Returns uniform if there's no source value information. This is
441// probably wrong.
442bool AMDGPURegisterBankInfo::isScalarLoadLegal(const MachineInstr &MI) const {
443 if (!MI.hasOneMemOperand())
444 return false;
445
446 const MachineMemOperand *MMO = *MI.memoperands_begin();
447 const unsigned AS = MMO->getAddrSpace();
448 const bool IsConst = AS == AMDGPUAS::CONSTANT_ADDRESS ||
449 AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT;
450 const unsigned MemSize = 8 * MMO->getSize().getValue();
451
452 // Require 4-byte alignment.
453 return (MMO->getAlign() >= Align(4) ||
454 (Subtarget.hasScalarSubwordLoads() &&
455 ((MemSize == 16 && MMO->getAlign() >= Align(2)) ||
456 (MemSize == 8 && MMO->getAlign() >= Align(1))))) &&
457 // Can't do a scalar atomic load.
458 !MMO->isAtomic() &&
459 // Don't use scalar loads for volatile accesses to non-constant address
460 // spaces.
461 (IsConst || !MMO->isVolatile()) &&
462 // Memory must be known constant, or not written before this load.
463 (IsConst || MMO->isInvariant() || (MMO->getFlags() & MONoClobber)) &&
464 AMDGPU::isUniformMMO(MMO);
465}
466
467RegisterBankInfo::InstructionMappings
468AMDGPURegisterBankInfo::getInstrAlternativeMappings(
469 const MachineInstr &MI) const {
470
471 const MachineFunction &MF = *MI.getMF();
472 const MachineRegisterInfo &MRI = MF.getRegInfo();
473
474
475 InstructionMappings AltMappings;
476 switch (MI.getOpcode()) {
477 case TargetOpcode::G_CONSTANT:
478 case TargetOpcode::G_IMPLICIT_DEF: {
479 unsigned Size = getSizeInBits(Reg: MI.getOperand(i: 0).getReg(), MRI, TRI: *TRI);
480 if (Size == 1) {
481 static const OpRegBankEntry<1> Table[3] = {
482 { .RegBanks: { AMDGPU::VGPRRegBankID }, .Cost: 1 },
483 { .RegBanks: { AMDGPU::SGPRRegBankID }, .Cost: 1 },
484 { .RegBanks: { AMDGPU::VCCRegBankID }, .Cost: 1 }
485 };
486
487 return addMappingFromTable<1>(MI, MRI, RegSrcOpIdx: {._M_elems: { 0 }}, Table);
488 }
489
490 [[fallthrough]];
491 }
492 case TargetOpcode::G_FCONSTANT:
493 case TargetOpcode::G_FRAME_INDEX:
494 case TargetOpcode::G_GLOBAL_VALUE: {
495 static const OpRegBankEntry<1> Table[2] = {
496 { .RegBanks: { AMDGPU::VGPRRegBankID }, .Cost: 1 },
497 { .RegBanks: { AMDGPU::SGPRRegBankID }, .Cost: 1 }
498 };
499
500 return addMappingFromTable<1>(MI, MRI, RegSrcOpIdx: {._M_elems: { 0 }}, Table);
501 }
502 case TargetOpcode::G_AND:
503 case TargetOpcode::G_OR:
504 case TargetOpcode::G_XOR: {
505 unsigned Size = getSizeInBits(Reg: MI.getOperand(i: 0).getReg(), MRI, TRI: *TRI);
506
507 if (Size == 1) {
508 // s_{and|or|xor}_b32 set scc when the result of the 32-bit op is not 0.
509 const InstructionMapping &SCCMapping = getInstructionMapping(
510 ID: 1, Cost: 1, OperandsMapping: getOperandsMapping(
511 OpdsMapping: {AMDGPU::getValueMapping(BankID: AMDGPU::SGPRRegBankID, Size: 32),
512 AMDGPU::getValueMapping(BankID: AMDGPU::SGPRRegBankID, Size: 32),
513 AMDGPU::getValueMapping(BankID: AMDGPU::SGPRRegBankID, Size: 32)}),
514 NumOperands: 3); // Num Operands
515 AltMappings.push_back(Elt: &SCCMapping);
516
517 const InstructionMapping &VCCMapping0 = getInstructionMapping(
518 ID: 2, Cost: 1, OperandsMapping: getOperandsMapping(
519 OpdsMapping: {AMDGPU::getValueMapping(BankID: AMDGPU::VCCRegBankID, Size),
520 AMDGPU::getValueMapping(BankID: AMDGPU::VCCRegBankID, Size),
521 AMDGPU::getValueMapping(BankID: AMDGPU::VCCRegBankID, Size)}),
522 NumOperands: 3); // Num Operands
523 AltMappings.push_back(Elt: &VCCMapping0);
524 return AltMappings;
525 }
526
527 if (Size != 64)
528 break;
529
530 const InstructionMapping &SSMapping = getInstructionMapping(
531 ID: 1, Cost: 1, OperandsMapping: getOperandsMapping(
532 OpdsMapping: {AMDGPU::getValueMapping(BankID: AMDGPU::SGPRRegBankID, Size),
533 AMDGPU::getValueMapping(BankID: AMDGPU::SGPRRegBankID, Size),
534 AMDGPU::getValueMapping(BankID: AMDGPU::SGPRRegBankID, Size)}),
535 NumOperands: 3); // Num Operands
536 AltMappings.push_back(Elt: &SSMapping);
537
538 const InstructionMapping &VVMapping = getInstructionMapping(
539 ID: 2, Cost: 2, OperandsMapping: getOperandsMapping(
540 OpdsMapping: {AMDGPU::getValueMappingSGPR64Only(BankID: AMDGPU::VGPRRegBankID, Size),
541 AMDGPU::getValueMappingSGPR64Only(BankID: AMDGPU::VGPRRegBankID, Size),
542 AMDGPU::getValueMappingSGPR64Only(BankID: AMDGPU::VGPRRegBankID, Size)}),
543 NumOperands: 3); // Num Operands
544 AltMappings.push_back(Elt: &VVMapping);
545 break;
546 }
547 case TargetOpcode::G_LOAD:
548 case TargetOpcode::G_ZEXTLOAD:
549 case TargetOpcode::G_SEXTLOAD: {
550 unsigned Size = getSizeInBits(Reg: MI.getOperand(i: 0).getReg(), MRI, TRI: *TRI);
551 LLT PtrTy = MRI.getType(Reg: MI.getOperand(i: 1).getReg());
552 unsigned PtrSize = PtrTy.getSizeInBits();
553 unsigned AS = PtrTy.getAddressSpace();
554
555 if ((AS != AMDGPUAS::LOCAL_ADDRESS && AS != AMDGPUAS::REGION_ADDRESS &&
556 AS != AMDGPUAS::PRIVATE_ADDRESS) &&
557 isScalarLoadLegal(MI)) {
558 const InstructionMapping &SSMapping = getInstructionMapping(
559 ID: 1, Cost: 1, OperandsMapping: getOperandsMapping(
560 OpdsMapping: {AMDGPU::getValueMapping(BankID: AMDGPU::SGPRRegBankID, Size),
561 AMDGPU::getValueMapping(BankID: AMDGPU::SGPRRegBankID, Size: PtrSize)}),
562 NumOperands: 2); // Num Operands
563 AltMappings.push_back(Elt: &SSMapping);
564 }
565
566 const InstructionMapping &VVMapping = getInstructionMapping(
567 ID: 2, Cost: 1,
568 OperandsMapping: getOperandsMapping(
569 OpdsMapping: {AMDGPU::getValueMapping(BankID: AMDGPU::VGPRRegBankID, Size),
570 AMDGPU::getValueMapping(BankID: AMDGPU::VGPRRegBankID, Size: PtrSize)}),
571 NumOperands: 2); // Num Operands
572 AltMappings.push_back(Elt: &VVMapping);
573
574 // It may be possible to have a vgpr = load sgpr mapping here, because
575 // the mubuf instructions support this kind of load, but probably for only
576 // gfx7 and older. However, the addressing mode matching in the instruction
577 // selector should be able to do a better job of detecting and selecting
578 // these kinds of loads from the vgpr = load vgpr mapping.
579
580 return AltMappings;
581
582 }
583 case TargetOpcode::G_SELECT: {
584 unsigned Size = getSizeInBits(Reg: MI.getOperand(i: 0).getReg(), MRI, TRI: *TRI);
585 const InstructionMapping &SSMapping = getInstructionMapping(ID: 1, Cost: 1,
586 OperandsMapping: getOperandsMapping(OpdsMapping: {AMDGPU::getValueMapping(BankID: AMDGPU::SGPRRegBankID, Size),
587 AMDGPU::getValueMapping(BankID: AMDGPU::SGPRRegBankID, Size: 1),
588 AMDGPU::getValueMapping(BankID: AMDGPU::SGPRRegBankID, Size),
589 AMDGPU::getValueMapping(BankID: AMDGPU::SGPRRegBankID, Size)}),
590 NumOperands: 4); // Num Operands
591 AltMappings.push_back(Elt: &SSMapping);
592
593 const InstructionMapping &VVMapping = getInstructionMapping(ID: 2, Cost: 1,
594 OperandsMapping: getOperandsMapping(OpdsMapping: {AMDGPU::getValueMappingSGPR64Only(BankID: AMDGPU::VGPRRegBankID, Size),
595 AMDGPU::getValueMapping(BankID: AMDGPU::VCCRegBankID, Size: 1),
596 AMDGPU::getValueMappingSGPR64Only(BankID: AMDGPU::VGPRRegBankID, Size),
597 AMDGPU::getValueMappingSGPR64Only(BankID: AMDGPU::VGPRRegBankID, Size)}),
598 NumOperands: 4); // Num Operands
599 AltMappings.push_back(Elt: &VVMapping);
600
601 return AltMappings;
602 }
603 case TargetOpcode::G_UADDE:
604 case TargetOpcode::G_USUBE:
605 case TargetOpcode::G_SADDE:
606 case TargetOpcode::G_SSUBE: {
607 unsigned Size = getSizeInBits(Reg: MI.getOperand(i: 0).getReg(), MRI, TRI: *TRI);
608 const InstructionMapping &SSMapping = getInstructionMapping(ID: 1, Cost: 1,
609 OperandsMapping: getOperandsMapping(
610 OpdsMapping: {AMDGPU::getValueMapping(BankID: AMDGPU::SGPRRegBankID, Size),
611 AMDGPU::getValueMapping(BankID: AMDGPU::SGPRRegBankID, Size: 1),
612 AMDGPU::getValueMapping(BankID: AMDGPU::SGPRRegBankID, Size),
613 AMDGPU::getValueMapping(BankID: AMDGPU::SGPRRegBankID, Size),
614 AMDGPU::getValueMapping(BankID: AMDGPU::SGPRRegBankID, Size: 1)}),
615 NumOperands: 5); // Num Operands
616 AltMappings.push_back(Elt: &SSMapping);
617
618 const InstructionMapping &VVMapping = getInstructionMapping(ID: 2, Cost: 1,
619 OperandsMapping: getOperandsMapping(OpdsMapping: {AMDGPU::getValueMapping(BankID: AMDGPU::VGPRRegBankID, Size),
620 AMDGPU::getValueMapping(BankID: AMDGPU::VCCRegBankID, Size: 1),
621 AMDGPU::getValueMapping(BankID: AMDGPU::VGPRRegBankID, Size),
622 AMDGPU::getValueMapping(BankID: AMDGPU::VGPRRegBankID, Size),
623 AMDGPU::getValueMapping(BankID: AMDGPU::VCCRegBankID, Size: 1)}),
624 NumOperands: 5); // Num Operands
625 AltMappings.push_back(Elt: &VVMapping);
626 return AltMappings;
627 }
628 case AMDGPU::G_BRCOND: {
629 assert(MRI.getType(MI.getOperand(0).getReg()).getSizeInBits() == 1);
630
631 // TODO: Change type to 32 for scalar
632 const InstructionMapping &SMapping = getInstructionMapping(
633 ID: 1, Cost: 1, OperandsMapping: getOperandsMapping(
634 OpdsMapping: {AMDGPU::getValueMapping(BankID: AMDGPU::SGPRRegBankID, Size: 1), nullptr}),
635 NumOperands: 2); // Num Operands
636 AltMappings.push_back(Elt: &SMapping);
637
638 const InstructionMapping &VMapping = getInstructionMapping(
639 ID: 1, Cost: 1, OperandsMapping: getOperandsMapping(
640 OpdsMapping: {AMDGPU::getValueMapping(BankID: AMDGPU::VCCRegBankID, Size: 1), nullptr }),
641 NumOperands: 2); // Num Operands
642 AltMappings.push_back(Elt: &VMapping);
643 return AltMappings;
644 }
645 case AMDGPU::G_INTRINSIC:
646 case AMDGPU::G_INTRINSIC_CONVERGENT:
647 return getInstrAlternativeMappingsIntrinsic(MI, MRI);
648 case AMDGPU::G_INTRINSIC_W_SIDE_EFFECTS:
649 case AMDGPU::G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS:
650 return getInstrAlternativeMappingsIntrinsicWSideEffects(MI, MRI);
651 default:
652 break;
653 }
654 return RegisterBankInfo::getInstrAlternativeMappings(MI);
655}
656
657void AMDGPURegisterBankInfo::split64BitValueForMapping(
658 MachineIRBuilder &B,
659 SmallVector<Register, 2> &Regs,
660 LLT HalfTy,
661 Register Reg) const {
662 assert(HalfTy.getSizeInBits() == 32);
663 MachineRegisterInfo *MRI = B.getMRI();
664 Register LoLHS = MRI->createGenericVirtualRegister(Ty: HalfTy);
665 Register HiLHS = MRI->createGenericVirtualRegister(Ty: HalfTy);
666 const RegisterBank *Bank = getRegBank(Reg, MRI: *MRI, TRI: *TRI);
667 MRI->setRegBank(Reg: LoLHS, RegBank: *Bank);
668 MRI->setRegBank(Reg: HiLHS, RegBank: *Bank);
669
670 Regs.push_back(Elt: LoLHS);
671 Regs.push_back(Elt: HiLHS);
672
673 B.buildInstr(Opcode: AMDGPU::G_UNMERGE_VALUES)
674 .addDef(RegNo: LoLHS)
675 .addDef(RegNo: HiLHS)
676 .addUse(RegNo: Reg);
677}
678
679/// Replace the current type each register in \p Regs has with \p NewTy
680static void setRegsToType(MachineRegisterInfo &MRI, ArrayRef<Register> Regs,
681 LLT NewTy) {
682 for (Register Reg : Regs) {
683 assert(MRI.getType(Reg).getSizeInBits() == NewTy.getSizeInBits());
684 MRI.setType(VReg: Reg, Ty: NewTy);
685 }
686}
687
688static LLT getHalfSizedType(LLT Ty) {
689 if (Ty.isVector()) {
690 assert(Ty.getElementCount().isKnownMultipleOf(2));
691 return LLT::scalarOrVector(EC: Ty.getElementCount().divideCoefficientBy(RHS: 2),
692 ScalarTy: Ty.getElementType());
693 }
694
695 assert(Ty.getScalarSizeInBits() % 2 == 0);
696 return LLT::scalar(SizeInBits: Ty.getScalarSizeInBits() / 2);
697}
698
699// Build one or more V_READFIRSTLANE_B32 instructions to move the given vector
700// source value into a scalar register.
701Register AMDGPURegisterBankInfo::buildReadFirstLane(MachineIRBuilder &B,
702 MachineRegisterInfo &MRI,
703 Register Src) const {
704 LLT Ty = MRI.getType(Reg: Src);
705 const RegisterBank *Bank = getRegBank(Reg: Src, MRI, TRI: *TRI);
706
707 if (Bank == &AMDGPU::SGPRRegBank)
708 return Src;
709
710 unsigned Bits = Ty.getSizeInBits();
711 assert(Bits % 32 == 0);
712
713 if (Bank != &AMDGPU::VGPRRegBank) {
714 // We need to copy from AGPR to VGPR
715 Src = B.buildCopy(Res: Ty, Op: Src).getReg(Idx: 0);
716 MRI.setRegBank(Reg: Src, RegBank: AMDGPU::VGPRRegBank);
717 }
718
719 LLT S32 = LLT::scalar(SizeInBits: 32);
720 unsigned NumParts = Bits / 32;
721 SmallVector<Register, 8> SrcParts;
722 SmallVector<Register, 8> DstParts;
723
724 if (Bits == 32) {
725 SrcParts.push_back(Elt: Src);
726 } else {
727 auto Unmerge = B.buildUnmerge(Res: S32, Op: Src);
728 for (unsigned i = 0; i < NumParts; ++i)
729 SrcParts.push_back(Elt: Unmerge.getReg(Idx: i));
730 }
731
732 for (unsigned i = 0; i < NumParts; ++i) {
733 Register SrcPart = SrcParts[i];
734 Register DstPart = MRI.createVirtualRegister(RegClass: &AMDGPU::SReg_32_XM0RegClass);
735 MRI.setType(VReg: DstPart, Ty: NumParts == 1 ? Ty : S32);
736
737 const TargetRegisterClass *Constrained =
738 constrainGenericRegister(Reg: SrcPart, RC: AMDGPU::VGPR_32RegClass, MRI);
739 (void)Constrained;
740 assert(Constrained && "Failed to constrain readfirstlane src reg");
741
742 B.buildInstr(Opc: AMDGPU::V_READFIRSTLANE_B32, DstOps: {DstPart}, SrcOps: {SrcPart});
743
744 DstParts.push_back(Elt: DstPart);
745 }
746
747 if (Bits == 32)
748 return DstParts[0];
749
750 Register Dst = B.buildMergeLikeInstr(Res: Ty, Ops: DstParts).getReg(Idx: 0);
751 MRI.setRegBank(Reg: Dst, RegBank: AMDGPU::SGPRRegBank);
752 return Dst;
753}
754
755/// Legalize instruction \p MI where operands in \p OpIndices must be SGPRs. If
756/// any of the required SGPR operands are VGPRs, perform a waterfall loop to
757/// execute the instruction for each unique combination of values in all lanes
758/// in the wave. The block will be split such that rest of the instructions are
759/// moved to a new block.
760///
761/// Essentially performs this loop:
762//
763/// Save Execution Mask
764/// For (Lane : Wavefront) {
765/// Enable Lane, Disable all other lanes
766/// SGPR = read SGPR value for current lane from VGPR
767/// VGPRResult[Lane] = use_op SGPR
768/// }
769/// Restore Execution Mask
770///
771/// There is additional complexity to try for compare values to identify the
772/// unique values used.
bool AMDGPURegisterBankInfo::executeInWaterfallLoop(
    MachineIRBuilder &B, iterator_range<MachineBasicBlock::iterator> Range,
    SmallSet<Register, 4> &SGPROperandRegs) const {
  // Track use registers which have already been expanded with a readfirstlane
  // sequence. This may have multiple uses if moving a sequence.
  DenseMap<Register, Register> WaterfalledRegMap;

  MachineBasicBlock &MBB = B.getMBB();
  MachineFunction *MF = &B.getMF();

  // Wave-size dependent lane-mask register class and the exec-manipulation
  // opcodes/registers for this subtarget.
  const TargetRegisterClass *WaveRC = TRI->getWaveMaskRegClass();
  const AMDGPU::LaneMaskConstants &LMC =
      AMDGPU::LaneMaskConstants::get(ST: Subtarget);

#ifndef NDEBUG
  const int OrigRangeSize = std::distance(Range.begin(), Range.end());
#endif

  MachineRegisterInfo &MRI = *B.getMRI();
  // SaveExecReg holds the exec mask from before the loop so it can be
  // restored once every lane's value has been processed.
  Register SaveExecReg = MRI.createVirtualRegister(RegClass: WaveRC);
  Register InitSaveExecReg = MRI.createVirtualRegister(RegClass: WaveRC);

  // Don't bother using generic instructions/registers for the exec mask.
  B.buildInstr(Opcode: TargetOpcode::IMPLICIT_DEF)
    .addDef(RegNo: InitSaveExecReg);

  Register PhiExec = MRI.createVirtualRegister(RegClass: WaveRC);
  Register NewExec = MRI.createVirtualRegister(RegClass: WaveRC);

  // To insert the loop we need to split the block. Move everything before this
  // point to a new block, and insert a new empty block before this instruction.
  // Resulting CFG:
  //   MBB -> LoopBB -> BodyBB -> (LoopBB backedge | RestoreExecBB) -> RemainderBB
  MachineBasicBlock *LoopBB = MF->CreateMachineBasicBlock();
  MachineBasicBlock *BodyBB = MF->CreateMachineBasicBlock();
  MachineBasicBlock *RemainderBB = MF->CreateMachineBasicBlock();
  MachineBasicBlock *RestoreExecBB = MF->CreateMachineBasicBlock();
  MachineFunction::iterator MBBI(MBB);
  ++MBBI;
  MF->insert(MBBI, MBB: LoopBB);
  MF->insert(MBBI, MBB: BodyBB);
  MF->insert(MBBI, MBB: RestoreExecBB);
  MF->insert(MBBI, MBB: RemainderBB);

  LoopBB->addSuccessor(Succ: BodyBB);
  BodyBB->addSuccessor(Succ: RestoreExecBB);
  BodyBB->addSuccessor(Succ: LoopBB);

  // Move the rest of the block into a new block.
  RemainderBB->transferSuccessorsAndUpdatePHIs(FromMBB: &MBB);
  RemainderBB->splice(Where: RemainderBB->begin(), Other: &MBB, From: Range.end(), To: MBB.end());

  MBB.addSuccessor(Succ: LoopBB);
  RestoreExecBB->addSuccessor(Succ: RemainderBB);

  B.setInsertPt(MBB&: *LoopBB, II: LoopBB->end());

  // PhiExec carries the still-to-do lane mask: the initial (undef) value on
  // entry from MBB, and the remaining mask on each backedge from BodyBB.
  B.buildInstr(Opcode: TargetOpcode::PHI)
      .addDef(RegNo: PhiExec)
      .addReg(RegNo: InitSaveExecReg)
      .addMBB(MBB: &MBB)
      .addReg(RegNo: NewExec)
      .addMBB(MBB: BodyBB);

  const DebugLoc &DL = B.getDL();

  MachineInstr &FirstInst = *Range.begin();

  // Move the instruction into the loop body. Note we moved everything after
  // Range.end() already into a new block, so Range.end() is no longer valid.
  BodyBB->splice(Where: BodyBB->end(), Other: &MBB, From: Range.begin(), To: MBB.end());

  // Figure out the iterator range after splicing the instructions.
  MachineBasicBlock::iterator NewBegin = FirstInst.getIterator();
  auto NewEnd = BodyBB->end();

  B.setMBB(*LoopBB);

  LLT S1 = LLT::scalar(SizeInBits: 1);
  Register CondReg;

  assert(std::distance(NewBegin, NewEnd) == OrigRangeSize);

  // For each required-SGPR operand that is actually divergent, read the value
  // of the first active lane and compare it against the operand value in all
  // lanes. All comparisons are ANDed into CondReg, selecting the set of lanes
  // that share the first lane's values for this iteration.
  for (MachineInstr &MI : make_range(x: NewBegin, y: NewEnd)) {
    for (MachineOperand &Op : MI.all_uses()) {
      Register OldReg = Op.getReg();
      if (!SGPROperandRegs.count(V: OldReg))
        continue;

      // See if we already processed this register in another instruction in the
      // sequence.
      auto OldVal = WaterfalledRegMap.find(Val: OldReg);
      if (OldVal != WaterfalledRegMap.end()) {
        Op.setReg(OldVal->second);
        continue;
      }

      Register OpReg = Op.getReg();
      LLT OpTy = MRI.getType(Reg: OpReg);

      const RegisterBank *OpBank = getRegBank(Reg: OpReg, MRI, TRI: *TRI);
      if (OpBank != &AMDGPU::VGPRRegBank) {
        // Insert copy from AGPR to VGPR before the loop.
        B.setMBB(MBB);
        OpReg = B.buildCopy(Res: OpTy, Op: OpReg).getReg(Idx: 0);
        MRI.setRegBank(Reg: OpReg, RegBank: AMDGPU::VGPRRegBank);
        B.setMBB(*LoopBB);
      }

      Register CurrentLaneReg = buildReadFirstLane(B, MRI, Src: OpReg);

      // Build the comparison(s), splitting wide values into 64-bit (or 32-bit)
      // pieces since there is no arbitrarily wide compare.
      unsigned OpSize = OpTy.getSizeInBits();
      bool Is64 = OpSize % 64 == 0;
      unsigned PartSize = Is64 ? 64 : 32;
      LLT PartTy = LLT::scalar(SizeInBits: PartSize);
      unsigned NumParts = OpSize / PartSize;
      SmallVector<Register, 8> OpParts;
      SmallVector<Register, 8> CurrentLaneParts;

      if (NumParts == 1) {
        OpParts.push_back(Elt: OpReg);
        CurrentLaneParts.push_back(Elt: CurrentLaneReg);
      } else {
        auto UnmergeOp = B.buildUnmerge(Res: PartTy, Op: OpReg);
        auto UnmergeCurrentLane = B.buildUnmerge(Res: PartTy, Op: CurrentLaneReg);
        for (unsigned i = 0; i < NumParts; ++i) {
          OpParts.push_back(Elt: UnmergeOp.getReg(Idx: i));
          CurrentLaneParts.push_back(Elt: UnmergeCurrentLane.getReg(Idx: i));
          MRI.setRegBank(Reg: OpParts[i], RegBank: AMDGPU::VGPRRegBank);
          MRI.setRegBank(Reg: CurrentLaneParts[i], RegBank: AMDGPU::SGPRRegBank);
        }
      }

      for (unsigned i = 0; i < NumParts; ++i) {
        auto CmpReg = B.buildICmp(Pred: CmpInst::ICMP_EQ, Res: S1, Op0: CurrentLaneParts[i],
                                  Op1: OpParts[i]).getReg(Idx: 0);
        MRI.setRegBank(Reg: CmpReg, RegBank: AMDGPU::VCCRegBank);

        if (!CondReg) {
          CondReg = CmpReg;
        } else {
          CondReg = B.buildAnd(Dst: S1, Src0: CondReg, Src1: CmpReg).getReg(Idx: 0);
          MRI.setRegBank(Reg: CondReg, RegBank: AMDGPU::VCCRegBank);
        }
      }

      // Inside the loop body the operand reads the uniform per-iteration
      // value instead of the original divergent register.
      Op.setReg(CurrentLaneReg);

      // Make sure we don't re-process this register again.
      WaterfalledRegMap.insert(KV: std::pair(OldReg, Op.getReg()));
    }
  }

  // The ballot becomes a no-op during instruction selection.
  CondReg = B.buildIntrinsic(ID: Intrinsic::amdgcn_ballot,
                             Res: {LLT::scalar(SizeInBits: Subtarget.isWave32() ? 32 : 64)})
                .addReg(RegNo: CondReg)
                .getReg(Idx: 0);
  MRI.setRegClass(Reg: CondReg, RC: WaveRC);

  // Update EXEC, save the original EXEC value to VCC.
  B.buildInstr(Opcode: LMC.AndSaveExecOpc)
      .addDef(RegNo: NewExec)
      .addReg(RegNo: CondReg, Flags: RegState::Kill);

  MRI.setSimpleHint(VReg: NewExec, PrefReg: CondReg);

  B.setInsertPt(MBB&: *BodyBB, II: BodyBB->end());

  // Update EXEC, switch all done bits to 0 and all todo bits to 1.
  B.buildInstr(Opcode: LMC.XorTermOpc)
      .addDef(RegNo: LMC.ExecReg)
      .addReg(RegNo: LMC.ExecReg)
      .addReg(RegNo: NewExec);

  // XXX - s_xor_b64 sets scc to 1 if the result is nonzero, so can we use
  // s_cbranch_scc0?

  // Loop back to V_READFIRSTLANE_B32 if there are still variants to cover.
  B.buildInstr(Opcode: AMDGPU::SI_WATERFALL_LOOP).addMBB(MBB: LoopBB);

  // Save the EXEC mask before the loop.
  BuildMI(BB&: MBB, I: MBB.end(), MIMD: DL, MCID: TII->get(Opcode: LMC.MovOpc), DestReg: SaveExecReg)
      .addReg(RegNo: LMC.ExecReg);

  // Restore the EXEC mask after the loop.
  B.setMBB(*RestoreExecBB);
  B.buildInstr(Opcode: LMC.MovTermOpc).addDef(RegNo: LMC.ExecReg).addReg(RegNo: SaveExecReg);

  // Set the insert point after the original instruction, so any new
  // instructions will be in the remainder.
  B.setInsertPt(MBB&: *RemainderBB, II: RemainderBB->begin());

  return true;
}
967
968// Return any unique registers used by \p MI at \p OpIndices that need to be
969// handled in a waterfall loop. Returns these registers in \p
970// SGPROperandRegs. Returns true if there are any operands to handle and a
971// waterfall loop is necessary.
972bool AMDGPURegisterBankInfo::collectWaterfallOperands(
973 SmallSet<Register, 4> &SGPROperandRegs, MachineInstr &MI,
974 MachineRegisterInfo &MRI, ArrayRef<unsigned> OpIndices) const {
975 for (unsigned Op : OpIndices) {
976 assert(MI.getOperand(Op).isUse());
977 Register Reg = MI.getOperand(i: Op).getReg();
978 const RegisterBank *OpBank = getRegBank(Reg, MRI, TRI: *TRI);
979 if (OpBank->getID() != AMDGPU::SGPRRegBankID)
980 SGPROperandRegs.insert(V: Reg);
981 }
982
983 // No operands need to be replaced, so no need to loop.
984 return !SGPROperandRegs.empty();
985}
986
987bool AMDGPURegisterBankInfo::executeInWaterfallLoop(
988 MachineIRBuilder &B, MachineInstr &MI, ArrayRef<unsigned> OpIndices) const {
989 // Use a set to avoid extra readfirstlanes in the case where multiple operands
990 // are the same register.
991 SmallSet<Register, 4> SGPROperandRegs;
992
993 if (!collectWaterfallOperands(SGPROperandRegs, MI, MRI&: *B.getMRI(), OpIndices))
994 return false;
995
996 MachineBasicBlock::iterator I = MI.getIterator();
997 return executeInWaterfallLoop(B, Range: make_range(x: I, y: std::next(x: I)),
998 SGPROperandRegs);
999}
1000
1001// Legalize an operand that must be an SGPR by inserting a readfirstlane.
1002void AMDGPURegisterBankInfo::constrainOpWithReadfirstlane(
1003 MachineIRBuilder &B, MachineInstr &MI, unsigned OpIdx) const {
1004 Register Reg = MI.getOperand(i: OpIdx).getReg();
1005 MachineRegisterInfo &MRI = *B.getMRI();
1006 const RegisterBank *Bank = getRegBank(Reg, MRI, TRI: *TRI);
1007 if (Bank == &AMDGPU::SGPRRegBank)
1008 return;
1009
1010 Reg = buildReadFirstLane(B, MRI, Src: Reg);
1011 MI.getOperand(i: OpIdx).setReg(Reg);
1012}
1013
1014/// Split \p Ty into 2 pieces. The first will have \p FirstSize bits, and the
1015/// rest will be in the remainder.
1016static std::pair<LLT, LLT> splitUnequalType(LLT Ty, unsigned FirstSize) {
1017 unsigned TotalSize = Ty.getSizeInBits();
1018 if (!Ty.isVector())
1019 return {LLT::scalar(SizeInBits: FirstSize), LLT::scalar(SizeInBits: TotalSize - FirstSize)};
1020
1021 LLT EltTy = Ty.getElementType();
1022 unsigned EltSize = EltTy.getSizeInBits();
1023 assert(FirstSize % EltSize == 0);
1024
1025 unsigned FirstPartNumElts = FirstSize / EltSize;
1026 unsigned RemainderElts = (TotalSize - FirstSize) / EltSize;
1027
1028 return {LLT::scalarOrVector(EC: ElementCount::getFixed(MinVal: FirstPartNumElts), ScalarTy: EltTy),
1029 LLT::scalarOrVector(EC: ElementCount::getFixed(MinVal: RemainderElts), ScalarTy: EltTy)};
1030}
1031
1032static LLT widen96To128(LLT Ty) {
1033 if (!Ty.isVector())
1034 return LLT::scalar(SizeInBits: 128);
1035
1036 LLT EltTy = Ty.getElementType();
1037 assert(128 % EltTy.getSizeInBits() == 0);
1038 return LLT::fixed_vector(NumElements: 128 / EltTy.getSizeInBits(), ScalarTy: EltTy);
1039}
1040
// Lower a load whose mapped register banks cannot be selected directly.
// For SGPR destinations: widen sub-dword scalar loads and split (or widen)
// 96-bit loads. For VGPR destinations: split loads wider than 128 bits into
// legal-width pieces. Returns false when the load needs no special handling.
bool AMDGPURegisterBankInfo::applyMappingLoad(
    MachineIRBuilder &B,
    const AMDGPURegisterBankInfo::OperandsMapper &OpdMapper,
    MachineInstr &MI) const {
  MachineRegisterInfo &MRI = *B.getMRI();
  Register DstReg = MI.getOperand(i: 0).getReg();
  const LLT LoadTy = MRI.getType(Reg: DstReg);
  unsigned LoadSize = LoadTy.getSizeInBits();
  MachineMemOperand *MMO = *MI.memoperands_begin();
  // Widest single load when this must execute as a non-scalar (VMEM) load.
  const unsigned MaxNonSmrdLoadSize = 128;

  const RegisterBank *DstBank =
      OpdMapper.getInstrMapping().getOperandMapping(i: 0).BreakDown[0].RegBank;
  if (DstBank == &AMDGPU::SGPRRegBank) {
    // There are some special cases that we need to look at for 32 bit and 96
    // bit SGPR loads otherwise we have nothing to do.
    if (LoadSize != 32 && (LoadSize != 96 || Subtarget.hasScalarDwordx3Loads()))
      return false;

    const unsigned MemSize = 8 * MMO->getSize().getValue();
    // Scalar loads of size 8 or 16 bit with proper alignment may be widened to
    // 32 bit. Check to see if we need to widen the memory access, 8 or 16 bit
    // scalar loads should have a load size of 32 but memory access size of less
    // than 32.
    if (LoadSize == 32 &&
        (MemSize == 32 || LoadTy.isVector() || !isScalarLoadLegal(MI)))
      return false;

    // NOTE(review): this leaves naturally-aligned sub-dword scalar loads
    // untouched on GFX12+, presumably because the hardware handles them
    // directly — confirm against the ISA documentation.
    if (LoadSize == 32 &&
        ((MemSize == 8 && MMO->getAlign() >= Align(1)) ||
         (MemSize == 16 && MMO->getAlign() >= Align(2))) &&
        isScalarLoadLegal(MI) &&
        Subtarget.getGeneration() >= AMDGPUSubtarget::GFX12)
      return false;

    Register PtrReg = MI.getOperand(i: 1).getReg();

    ApplyRegBankMapping ApplyBank(B, *this, MRI, DstBank);

    if (LoadSize == 32) {
      // This is an extending load from a sub-dword size. Widen the memory
      // access size to 4 bytes and clear the extra high bits appropriately
      const LLT S32 = LLT::scalar(SizeInBits: 32);
      if (MI.getOpcode() == AMDGPU::G_SEXTLOAD) {
        // Must extend the sign bit into higher bits for a G_SEXTLOAD
        auto WideLoad = B.buildLoadFromOffset(Dst: S32, BasePtr: PtrReg, BaseMMO&: *MMO, Offset: 0);
        B.buildSExtInReg(Res: MI.getOperand(i: 0), Op: WideLoad, ImmOp: MemSize);
      } else if (MI.getOpcode() == AMDGPU::G_ZEXTLOAD) {
        // Must extend zero into higher bits with an AND for a G_ZEXTLOAD
        auto WideLoad = B.buildLoadFromOffset(Dst: S32, BasePtr: PtrReg, BaseMMO&: *MMO, Offset: 0);
        B.buildZExtInReg(Res: MI.getOperand(i: 0), Op: WideLoad, ImmOp: MemSize);
      } else
        // We do not need to touch the higher bits for regular loads.
        B.buildLoadFromOffset(Dst: MI.getOperand(i: 0), BasePtr: PtrReg, BaseMMO&: *MMO, Offset: 0);
    } else {
      // 96-bit loads are only available for vector loads. We need to split this
      // into a 64-bit part, and 32 (unless we can widen to a 128-bit load).
      if (MMO->getAlign() < Align(16)) {
        // Under-aligned: narrow to a 64-bit load; the legalizer emits the
        // remaining 32-bit piece.
        LegalizerHelper Helper(B.getMF(), ApplyBank, B);
        LLT Part64, Part32;
        std::tie(args&: Part64, args&: Part32) = splitUnequalType(Ty: LoadTy, FirstSize: 64);
        if (Helper.reduceLoadStoreWidth(MI&: cast<GAnyLoad>(Val&: MI), TypeIdx: 0, NarrowTy: Part64) !=
            LegalizerHelper::Legalized)
          return false;
        return true;
      }
      // Sufficiently aligned: over-read as a single 128-bit load and drop the
      // extra bits / trailing element.
      LLT WiderTy = widen96To128(Ty: LoadTy);
      auto WideLoad = B.buildLoadFromOffset(Dst: WiderTy, BasePtr: PtrReg, BaseMMO&: *MMO, Offset: 0);
      if (WiderTy.isScalar()) {
        B.buildTrunc(Res: MI.getOperand(i: 0), Op: WideLoad);
      } else {
        B.buildDeleteTrailingVectorElements(Res: MI.getOperand(i: 0).getReg(),
                                            Op0: WideLoad);
      }
    }

    MI.eraseFromParent();
    return true;
  }

  // 128-bit loads are supported for all instruction types.
  if (LoadSize <= MaxNonSmrdLoadSize)
    return false;

  SmallVector<Register, 1> SrcRegs(OpdMapper.getVRegs(OpIdx: 1));

  if (SrcRegs.empty())
    SrcRegs.push_back(Elt: MI.getOperand(i: 1).getReg());

  // RegBankSelect only emits scalar types, so we need to reset the pointer
  // operand to a pointer type.
  Register BasePtrReg = SrcRegs[0];
  LLT PtrTy = MRI.getType(Reg: MI.getOperand(i: 1).getReg());
  MRI.setType(VReg: BasePtrReg, Ty: PtrTy);

  // The following are the loads not splitted enough during legalization
  // because it was not clear they are smem-load or vmem-load
  if (AMDGPU::isExtendedGlobalAddrSpace(AS: MMO->getAddrSpace()) ||
      MMO->getAddrSpace() == AMDGPUAS::BUFFER_RESOURCE) {
    assert(LoadSize % MaxNonSmrdLoadSize == 0);
    unsigned NumSplitParts = LoadTy.getSizeInBits() / MaxNonSmrdLoadSize;
    const LLT LoadSplitTy = LoadTy.divide(Factor: NumSplitParts);
    ApplyRegBankMapping O(B, *this, MRI, &AMDGPU::VGPRRegBank);
    LegalizerHelper Helper(B.getMF(), O, B);
    // Vectors split by element count; scalars split by narrowing the type.
    if (LoadTy.isVector()) {
      if (Helper.fewerElementsVector(MI, TypeIdx: 0, NarrowTy: LoadSplitTy) !=
          LegalizerHelper::Legalized)
        return false;
    } else {
      if (Helper.narrowScalar(MI, TypeIdx: 0, NarrowTy: LoadSplitTy) != LegalizerHelper::Legalized)
        return false;
    }
  }

  MRI.setRegBank(Reg: DstReg, RegBank: AMDGPU::VGPRRegBank);
  return true;
}
1158
// Lower G_DYN_STACKALLOC. The stack pointer is a scalar register, so a
// divergent allocation size is first reduced to a wave-wide unsigned maximum
// before bumping SP uniformly.
bool AMDGPURegisterBankInfo::applyMappingDynStackAlloc(
    MachineIRBuilder &B,
    const AMDGPURegisterBankInfo::OperandsMapper &OpdMapper,
    MachineInstr &MI) const {
  MachineRegisterInfo &MRI = *B.getMRI();
  const MachineFunction &MF = B.getMF();
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const auto &TFI = *ST.getFrameLowering();

  // Guard in case the stack growth direction ever changes with scratch
  // instructions.
  assert(TFI.getStackGrowthDirection() == TargetFrameLowering::StackGrowsUp &&
         "Stack grows upwards for AMDGPU");

  Register Dst = MI.getOperand(i: 0).getReg();
  Register AllocSize = MI.getOperand(i: 1).getReg();
  Align Alignment = assumeAligned(Value: MI.getOperand(i: 2).getImm());

  const RegisterBank *SizeBank = getRegBank(Reg: AllocSize, MRI, TRI: *TRI);

  if (SizeBank != &AMDGPU::SGPRRegBank) {
    // Divergent size: take the maximum over all lanes so a single uniform SP
    // bump covers every lane's request.
    auto WaveReduction =
        B.buildIntrinsic(ID: Intrinsic::amdgcn_wave_reduce_umax, Res: {LLT::scalar(SizeInBits: 32)})
            .addUse(RegNo: AllocSize)
            .addImm(Val: 0);
    AllocSize = WaveReduction.getReg(Idx: 0);
  }

  LLT PtrTy = MRI.getType(Reg: Dst);
  LLT IntPtrTy = LLT::scalar(SizeInBits: PtrTy.getSizeInBits());

  const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
  Register SPReg = Info->getStackPtrOffsetReg();
  ApplyRegBankMapping ApplyBank(B, *this, MRI, &AMDGPU::SGPRRegBank);

  // The SP value is scaled by the wave size, hence the shift of the per-lane
  // size by log2(wavesize) before the pointer arithmetic.
  auto WaveSize = B.buildConstant(Res: LLT::scalar(SizeInBits: 32), Val: ST.getWavefrontSizeLog2());
  auto ScaledSize = B.buildShl(Dst: IntPtrTy, Src0: AllocSize, Src1: WaveSize);

  auto OldSP = B.buildCopy(Res: PtrTy, Op: SPReg);
  if (Alignment > TFI.getStackAlign()) {
    // Over-aligned allocation: bump SP past the worst-case padding and clear
    // the low bits. The mask accounts for the wave-size scaling as well.
    auto StackAlignMask = (Alignment.value() << ST.getWavefrontSizeLog2()) - 1;
    auto Tmp1 = B.buildPtrAdd(Res: PtrTy, Op0: OldSP,
                              Op1: B.buildConstant(Res: LLT::scalar(SizeInBits: 32), Val: StackAlignMask));
    B.buildMaskLowPtrBits(Res: Dst, Op0: Tmp1,
                          NumBits: Log2(A: Alignment) + ST.getWavefrontSizeLog2());
  } else {
    // Default alignment: the allocation starts at the current SP.
    B.buildCopy(Res: Dst, Op: OldSP);
  }
  // Advance SP past the allocation.
  auto PtrAdd = B.buildPtrAdd(Res: PtrTy, Op0: Dst, Op1: ScaledSize);
  B.buildCopy(Res: SPReg, Op: PtrAdd);
  MI.eraseFromParent();
  return true;
}
1212
1213bool AMDGPURegisterBankInfo::applyMappingImage(
1214 MachineIRBuilder &B, MachineInstr &MI,
1215 const AMDGPURegisterBankInfo::OperandsMapper &OpdMapper,
1216 int RsrcIdx) const {
1217 const int NumDefs = MI.getNumExplicitDefs();
1218
1219 // The reported argument index is relative to the IR intrinsic call arguments,
1220 // so we need to shift by the number of defs and the intrinsic ID.
1221 RsrcIdx += NumDefs + 1;
1222
1223 // Insert copies to VGPR arguments.
1224 applyDefaultMapping(OpdMapper);
1225
1226 // Fixup any SGPR arguments.
1227 SmallVector<unsigned, 4> SGPRIndexes;
1228 for (int I = NumDefs, NumOps = MI.getNumOperands(); I != NumOps; ++I) {
1229 if (!MI.getOperand(i: I).isReg())
1230 continue;
1231
1232 // If this intrinsic has a sampler, it immediately follows rsrc.
1233 if (I == RsrcIdx || I == RsrcIdx + 1)
1234 SGPRIndexes.push_back(Elt: I);
1235 }
1236
1237 executeInWaterfallLoop(B, MI, OpIndices: SGPRIndexes);
1238 return true;
1239}
1240
// Analyze a combined offset from an llvm.amdgcn.s.buffer intrinsic and store
// the three offsets (voffset, soffset and instoffset). Returns the constant
// combined offset when it could be fully folded into soffset + instoffset,
// and 0 otherwise.
unsigned AMDGPURegisterBankInfo::setBufferOffsets(
    MachineIRBuilder &B, Register CombinedOffset, Register &VOffsetReg,
    Register &SOffsetReg, int64_t &InstOffsetVal, Align Alignment) const {
  const LLT S32 = LLT::scalar(SizeInBits: 32);
  MachineRegisterInfo *MRI = B.getMRI();

  // Case 1: the whole offset is a known constant that splits into a legal
  // soffset/imm pair; voffset becomes 0.
  if (std::optional<int64_t> Imm =
          getIConstantVRegSExtVal(VReg: CombinedOffset, MRI: *MRI)) {
    uint32_t SOffset, ImmOffset;
    if (TII->splitMUBUFOffset(Imm: *Imm, SOffset, ImmOffset, Alignment)) {
      VOffsetReg = B.buildConstant(Res: S32, Val: 0).getReg(Idx: 0);
      SOffsetReg = B.buildConstant(Res: S32, Val: SOffset).getReg(Idx: 0);
      InstOffsetVal = ImmOffset;

      B.getMRI()->setRegBank(Reg: VOffsetReg, RegBank: AMDGPU::VGPRRegBank);
      B.getMRI()->setRegBank(Reg: SOffsetReg, RegBank: AMDGPU::SGPRRegBank);
      return SOffset + ImmOffset;
    }
  }

  // NOTE(review): on subtargets with GFX1250 instructions the decomposition
  // below is only performed when the add is known not to wrap unsigned —
  // confirm the motivation against the target documentation.
  const bool CheckNUW = Subtarget.hasGFX1250Insts();
  Register Base;
  unsigned Offset;

  // Case 2: base + constant offset, where the constant splits into a legal
  // soffset/imm pair.
  std::tie(args&: Base, args&: Offset) =
      AMDGPU::getBaseWithConstantOffset(MRI&: *MRI, Reg: CombinedOffset,
                                        /*KnownBits=*/ValueTracking: nullptr,
                                        /*CheckNUW=*/CheckNUW);

  uint32_t SOffset, ImmOffset;
  if ((int)Offset > 0 &&
      TII->splitMUBUFOffset(Imm: Offset, SOffset, ImmOffset, Alignment)) {
    if (getRegBank(Reg: Base, MRI: *MRI, TRI: *TRI) == &AMDGPU::VGPRRegBank) {
      // Divergent base goes in voffset; the constant part goes in soffset/imm.
      VOffsetReg = Base;
      SOffsetReg = B.buildConstant(Res: S32, Val: SOffset).getReg(Idx: 0);
      B.getMRI()->setRegBank(Reg: SOffsetReg, RegBank: AMDGPU::SGPRRegBank);
      InstOffsetVal = ImmOffset;
      return 0; // XXX - Why is this 0?
    }

    // If we have SGPR base, we can use it for soffset.
    if (SOffset == 0) {
      VOffsetReg = B.buildConstant(Res: S32, Val: 0).getReg(Idx: 0);
      B.getMRI()->setRegBank(Reg: VOffsetReg, RegBank: AMDGPU::VGPRRegBank);
      SOffsetReg = Base;
      InstOffsetVal = ImmOffset;
      return 0; // XXX - Why is this 0?
    }
  }

  // Handle the variable sgpr + vgpr case.
  MachineInstr *Add = getOpcodeDef(Opcode: AMDGPU::G_ADD, Reg: CombinedOffset, MRI: *MRI);
  if (Add && (int)Offset >= 0 &&
      (!CheckNUW || Add->getFlag(Flag: MachineInstr::NoUWrap))) {
    Register Src0 = getSrcRegIgnoringCopies(Reg: Add->getOperand(i: 1).getReg(), MRI: *MRI);
    Register Src1 = getSrcRegIgnoringCopies(Reg: Add->getOperand(i: 2).getReg(), MRI: *MRI);

    const RegisterBank *Src0Bank = getRegBank(Reg: Src0, MRI: *MRI, TRI: *TRI);
    const RegisterBank *Src1Bank = getRegBank(Reg: Src1, MRI: *MRI, TRI: *TRI);

    // Route the divergent addend to voffset and the uniform one to soffset.
    if (Src0Bank == &AMDGPU::VGPRRegBank && Src1Bank == &AMDGPU::SGPRRegBank) {
      VOffsetReg = Src0;
      SOffsetReg = Src1;
      return 0;
    }

    if (Src0Bank == &AMDGPU::SGPRRegBank && Src1Bank == &AMDGPU::VGPRRegBank) {
      VOffsetReg = Src1;
      SOffsetReg = Src0;
      return 0;
    }
  }

  // Fallback: put the entire combined offset in voffset (copying into a VGPR
  // if necessary) with soffset = 0.
  // Ensure we have a VGPR for the combined offset. This could be an issue if we
  // have an SGPR offset and a VGPR resource.
  if (getRegBank(Reg: CombinedOffset, MRI: *MRI, TRI: *TRI) == &AMDGPU::VGPRRegBank) {
    VOffsetReg = CombinedOffset;
  } else {
    VOffsetReg = B.buildCopy(Res: S32, Op: CombinedOffset).getReg(Idx: 0);
    B.getMRI()->setRegBank(Reg: VOffsetReg, RegBank: AMDGPU::VGPRRegBank);
  }

  SOffsetReg = B.buildConstant(Res: S32, Val: 0).getReg(Idx: 0);
  B.getMRI()->setRegBank(Reg: SOffsetReg, RegBank: AMDGPU::SGPRRegBank);
  return 0;
}
1329
1330static unsigned getSBufferLoadCorrespondingBufferLoadOpcode(unsigned Opc) {
1331 switch (Opc) {
1332 case AMDGPU::G_AMDGPU_S_BUFFER_LOAD:
1333 return AMDGPU::G_AMDGPU_BUFFER_LOAD;
1334 case AMDGPU::G_AMDGPU_S_BUFFER_LOAD_UBYTE:
1335 return AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE;
1336 case AMDGPU::G_AMDGPU_S_BUFFER_LOAD_SBYTE:
1337 return AMDGPU::G_AMDGPU_BUFFER_LOAD_SBYTE;
1338 case AMDGPU::G_AMDGPU_S_BUFFER_LOAD_USHORT:
1339 return AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT;
1340 case AMDGPU::G_AMDGPU_S_BUFFER_LOAD_SSHORT:
1341 return AMDGPU::G_AMDGPU_BUFFER_LOAD_SSHORT;
1342 default:
1343 break;
1344 }
1345 llvm_unreachable("Unexpected s_buffer_load opcode");
1346}
1347
// Lower G_AMDGPU_S_BUFFER_LOAD* whose operands are not all uniform: replace
// it with the equivalent VMEM buffer load(s), splitting 256/512-bit results
// into 128-bit pieces, and wrap the loads in a waterfall loop when the
// resource descriptor itself is divergent.
bool AMDGPURegisterBankInfo::applyMappingSBufferLoad(
    MachineIRBuilder &B, const OperandsMapper &OpdMapper) const {
  MachineInstr &MI = OpdMapper.getMI();
  MachineRegisterInfo &MRI = OpdMapper.getMRI();

  const LLT S32 = LLT::scalar(SizeInBits: 32);
  Register Dst = MI.getOperand(i: 0).getReg();
  LLT Ty = MRI.getType(Reg: Dst);

  const RegisterBank *RSrcBank =
      OpdMapper.getInstrMapping().getOperandMapping(i: 1).BreakDown[0].RegBank;
  const RegisterBank *OffsetBank =
      OpdMapper.getInstrMapping().getOperandMapping(i: 2).BreakDown[0].RegBank;
  if (RSrcBank == &AMDGPU::SGPRRegBank &&
      OffsetBank == &AMDGPU::SGPRRegBank)
    return true; // Legal mapping

  // FIXME: 96-bit case was widened during legalize. We need to narrow it back
  // here but don't have an MMO.

  // Wide results are emitted as multiple 128-bit loads and re-merged below.
  unsigned LoadSize = Ty.getSizeInBits();
  int NumLoads = 1;
  if (LoadSize == 256 || LoadSize == 512) {
    NumLoads = LoadSize / 128;
    Ty = Ty.divide(Factor: NumLoads);
  }

  // Use the alignment to ensure that the required offsets will fit into the
  // immediate offsets.
  const Align Alignment = NumLoads > 1 ? Align(16 * NumLoads) : Align(1);

  MachineFunction &MF = B.getMF();

  Register SOffset;
  Register VOffset;
  int64_t ImmOffset = 0;

  // Decompose the combined offset operand into voffset/soffset/imm pieces.
  unsigned MMOOffset = setBufferOffsets(B, CombinedOffset: MI.getOperand(i: 2).getReg(), VOffsetReg&: VOffset,
                                        SOffsetReg&: SOffset, InstOffsetVal&: ImmOffset, Alignment);

  // TODO: 96-bit loads were widened to 128-bit results. Shrink the result if we
  // can, but we need to track an MMO for that.
  const unsigned MemSize = (Ty.getSizeInBits() + 7) / 8;
  const Align MemAlign(4); // FIXME: ABI type alignment?
  // The scalar form carried no MMO; synthesize an invariant, dereferenceable
  // one for the VMEM load.
  MachineMemOperand *BaseMMO = MF.getMachineMemOperand(
      PtrInfo: MachinePointerInfo(),
      F: MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
          MachineMemOperand::MOInvariant,
      Size: MemSize, BaseAlignment: MemAlign);
  if (MMOOffset != 0)
    BaseMMO = MF.getMachineMemOperand(MMO: BaseMMO, Offset: MMOOffset, Size: MemSize);

  // If only the offset is divergent, emit a MUBUF buffer load instead. We can
  // assume that the buffer is unswizzled.

  Register RSrc = MI.getOperand(i: 1).getReg();
  Register VIndex = B.buildConstant(Res: S32, Val: 0).getReg(Idx: 0);
  B.getMRI()->setRegBank(Reg: VIndex, RegBank: AMDGPU::VGPRRegBank);

  SmallVector<Register, 4> LoadParts(NumLoads);

  // Remember the span of inserted instructions so a waterfall loop can be
  // wrapped around all of them later.
  MachineBasicBlock::iterator MII = MI.getIterator();
  MachineInstrSpan Span(MII, &B.getMBB());

  for (int i = 0; i < NumLoads; ++i) {
    if (NumLoads == 1) {
      LoadParts[i] = Dst;
    } else {
      LoadParts[i] = MRI.createGenericVirtualRegister(Ty);
      MRI.setRegBank(Reg: LoadParts[i], RegBank: AMDGPU::VGPRRegBank);
    }

    MachineMemOperand *MMO = BaseMMO;
    if (i != 0)
      BaseMMO = MF.getMachineMemOperand(MMO: BaseMMO, Offset: MMOOffset + 16 * i, Size: MemSize);

    B.buildInstr(Opcode: getSBufferLoadCorrespondingBufferLoadOpcode(Opc: MI.getOpcode()))
        .addDef(RegNo: LoadParts[i])       // vdata
        .addUse(RegNo: RSrc)               // rsrc
        .addUse(RegNo: VIndex)             // vindex
        .addUse(RegNo: VOffset)            // voffset
        .addUse(RegNo: SOffset)            // soffset
        .addImm(Val: ImmOffset + 16 * i) // offset(imm)
        .addImm(Val: 0)                  // cachepolicy, swizzled buffer(imm)
        .addImm(Val: 0)                  // idxen(imm)
        .addMemOperand(MMO);
  }

  // TODO: If only the resource is a VGPR, it may be better to execute the
  // scalar load in the waterfall loop if the resource is expected to frequently
  // be dynamically uniform.
  if (RSrcBank != &AMDGPU::SGPRRegBank) {
    // Remove the original instruction to avoid potentially confusing the
    // waterfall loop logic.
    B.setInstr(*Span.begin());
    MI.eraseFromParent();

    SmallSet<Register, 4> OpsToWaterfall;

    OpsToWaterfall.insert(V: RSrc);
    executeInWaterfallLoop(B, Range: make_range(x: Span.begin(), y: Span.end()),
                           SGPROperandRegs&: OpsToWaterfall);
  }

  // Recombine the split pieces into the original wide result.
  if (NumLoads != 1) {
    if (Ty.isVector())
      B.buildConcatVectors(Res: Dst, Ops: LoadParts);
    else
      B.buildMergeLikeInstr(Res: Dst, Ops: LoadParts);
  }

  // We removed the instruction earlier with a waterfall loop.
  if (RSrcBank == &AMDGPU::SGPRRegBank)
    MI.eraseFromParent();

  return true;
}
1465
// Lower a bitfield-extract according to the mapped register banks. 32-bit
// VGPR extracts are directly legal; 64-bit VGPR extracts expand to 32-bit
// extract/shift sequences; SGPR extracts become S_BFE_* with the packed
// offset/width operand.
bool AMDGPURegisterBankInfo::applyMappingBFE(MachineIRBuilder &B,
                                             const OperandsMapper &OpdMapper,
                                             bool Signed) const {
  MachineInstr &MI = OpdMapper.getMI();
  MachineRegisterInfo &MRI = OpdMapper.getMRI();

  // Insert basic copies
  applyDefaultMapping(OpdMapper);

  Register DstReg = MI.getOperand(i: 0).getReg();
  LLT Ty = MRI.getType(Reg: DstReg);

  const LLT S32 = LLT::scalar(SizeInBits: 32);

  // The intrinsic form carries the intrinsic ID as operand 1, shifting the
  // value operands by one.
  unsigned FirstOpnd = isa<GIntrinsic>(Val: MI) ? 2 : 1;
  Register SrcReg = MI.getOperand(i: FirstOpnd).getReg();
  Register OffsetReg = MI.getOperand(i: FirstOpnd + 1).getReg();
  Register WidthReg = MI.getOperand(i: FirstOpnd + 2).getReg();

  const RegisterBank *DstBank =
      OpdMapper.getInstrMapping().getOperandMapping(i: 0).BreakDown[0].RegBank;
  if (DstBank == &AMDGPU::VGPRRegBank) {
    if (Ty == S32)
      return true;

    // There is no 64-bit vgpr bitfield extract instructions so the operation
    // is expanded to a sequence of instructions that implement the operation.
    ApplyRegBankMapping ApplyBank(B, *this, MRI, &AMDGPU::VGPRRegBank);

    const LLT S64 = LLT::scalar(SizeInBits: 64);
    // Shift the source operand so that extracted bits start at bit 0.
    auto ShiftOffset = Signed ? B.buildAShr(Dst: S64, Src0: SrcReg, Src1: OffsetReg)
                              : B.buildLShr(Dst: S64, Src0: SrcReg, Src1: OffsetReg);
    auto UnmergeSOffset = B.buildUnmerge(Res: {S32, S32}, Op: ShiftOffset);

    // A 64-bit bitfield extract uses the 32-bit bitfield extract instructions
    // if the width is a constant.
    if (auto ConstWidth = getIConstantVRegValWithLookThrough(VReg: WidthReg, MRI)) {
      // Use the 32-bit bitfield extract instruction if the width is a constant.
      // Depending on the width size, use either the low or high 32-bits.
      auto Zero = B.buildConstant(Res: S32, Val: 0);
      auto WidthImm = ConstWidth->Value.getZExtValue();
      if (WidthImm <= 32) {
        // Use bitfield extract on the lower 32-bit source, and then sign-extend
        // or clear the upper 32-bits.
        auto Extract =
            Signed ? B.buildSbfx(Dst: S32, Src: UnmergeSOffset.getReg(Idx: 0), LSB: Zero, Width: WidthReg)
                   : B.buildUbfx(Dst: S32, Src: UnmergeSOffset.getReg(Idx: 0), LSB: Zero, Width: WidthReg);
        auto Extend =
            Signed ? B.buildAShr(Dst: S32, Src0: Extract, Src1: B.buildConstant(Res: S32, Val: 31)) : Zero;
        B.buildMergeLikeInstr(Res: DstReg, Ops: {Extract, Extend});
      } else {
        // Use bitfield extract on upper 32-bit source, and combine with lower
        // 32-bit source.
        auto UpperWidth = B.buildConstant(Res: S32, Val: WidthImm - 32);
        auto Extract =
            Signed
                ? B.buildSbfx(Dst: S32, Src: UnmergeSOffset.getReg(Idx: 1), LSB: Zero, Width: UpperWidth)
                : B.buildUbfx(Dst: S32, Src: UnmergeSOffset.getReg(Idx: 1), LSB: Zero, Width: UpperWidth);
        B.buildMergeLikeInstr(Res: DstReg, Ops: {UnmergeSOffset.getReg(Idx: 0), Extract});
      }
      MI.eraseFromParent();
      return true;
    }

    // Expand to Src >> Offset << (64 - Width) >> (64 - Width) using 64-bit
    // operations.
    auto ExtShift = B.buildSub(Dst: S32, Src0: B.buildConstant(Res: S32, Val: 64), Src1: WidthReg);
    auto SignBit = B.buildShl(Dst: S64, Src0: ShiftOffset, Src1: ExtShift);
    if (Signed)
      B.buildAShr(Dst: S64, Src0: SignBit, Src1: ExtShift);
    else
      B.buildLShr(Dst: S64, Src0: SignBit, Src1: ExtShift);
    MI.eraseFromParent();
    return true;
  }

  // The scalar form packs the offset and width in a single operand.

  ApplyRegBankMapping ApplyBank(B, *this, MRI, &AMDGPU::SGPRRegBank);

  // Ensure the high bits are clear to insert the offset.
  auto OffsetMask = B.buildConstant(Res: S32, Val: maskTrailingOnes<unsigned>(N: 6));
  auto ClampOffset = B.buildAnd(Dst: S32, Src0: OffsetReg, Src1: OffsetMask);

  // Zeros out the low bits, so don't bother clamping the input value.
  auto ShiftWidth = B.buildShl(Dst: S32, Src0: WidthReg, Src1: B.buildConstant(Res: S32, Val: 16));

  // Transformation function, pack the offset and width of a BFE into
  // the format expected by the S_BFE_I32 / S_BFE_U32. In the second
  // source, bits [5:0] contain the offset and bits [22:16] the width.
  auto MergedInputs = B.buildOr(Dst: S32, Src0: ClampOffset, Src1: ShiftWidth);

  // TODO: It might be worth using a pseudo here to avoid scc clobber and
  // register class constraints.
  unsigned Opc = Ty == S32 ? (Signed ? AMDGPU::S_BFE_I32 : AMDGPU::S_BFE_U32) :
                             (Signed ? AMDGPU::S_BFE_I64 : AMDGPU::S_BFE_U64);

  auto MIB = B.buildInstr(Opc, DstOps: {DstReg}, SrcOps: {SrcReg, MergedInputs});
  constrainSelectedInstRegOperands(I&: *MIB, TII: *TII, TRI: *TRI, RBI: *this);

  MI.eraseFromParent();
  return true;
}
1570
/// Lower G_AMDGPU_MAD_U64_U32 / G_AMDGPU_MAD_S64_S32 when the multiply can be
/// kept on the SALU. The 64-bit product is rebuilt from 32-bit mul / mulh
/// pieces, and the accumulate + carry-out are emitted on whichever bank the
/// s64 addend (Src2) was assigned to.
bool AMDGPURegisterBankInfo::applyMappingMAD_64_32(
    MachineIRBuilder &B, const OperandsMapper &OpdMapper) const {
  MachineInstr &MI = OpdMapper.getMI();
  MachineRegisterInfo &MRI = OpdMapper.getMRI();

  // Insert basic copies.
  applyDefaultMapping(OpdMapper);

  Register Dst0 = MI.getOperand(i: 0).getReg();  // 64-bit result
  Register Dst1 = MI.getOperand(i: 1).getReg();  // carry-out
  Register Src0 = MI.getOperand(i: 2).getReg();  // 32-bit multiplicand
  Register Src1 = MI.getOperand(i: 3).getReg();  // 32-bit multiplicand
  Register Src2 = MI.getOperand(i: 4).getReg();  // 64-bit addend

  // If the multiply operands are already on the VALU, the default mapping is
  // sufficient; nothing to expand here.
  if (MRI.getRegBankOrNull(Reg: Src0) == &AMDGPU::VGPRRegBank)
    return true;

  bool IsUnsigned = MI.getOpcode() == AMDGPU::G_AMDGPU_MAD_U64_U32;
  LLT S1 = LLT::scalar(SizeInBits: 1);
  LLT S32 = LLT::scalar(SizeInBits: 32);

  // The addend's bank decides where the add (and carry) must happen.
  bool DstOnValu = MRI.getRegBankOrNull(Reg: Src2) == &AMDGPU::VGPRRegBank;
  bool Accumulate = true;

  // A known-zero SGPR addend lets us skip the accumulate entirely.
  if (!DstOnValu) {
    if (mi_match(R: Src2, MRI, P: m_ZeroInt()))
      Accumulate = false;
  }

  // Keep the multiplication on the SALU.
  Register DstHi;
  Register DstLo = B.buildMul(Dst: S32, Src0, Src1).getReg(Idx: 0);
  bool MulHiInVgpr = false;

  MRI.setRegBank(Reg: DstLo, RegBank: AMDGPU::SGPRRegBank);

  if (Subtarget.hasSMulHi()) {
    // Scalar mulh is available: whole product stays on the SALU.
    DstHi = IsUnsigned ? B.buildUMulH(Dst: S32, Src0, Src1).getReg(Idx: 0)
                       : B.buildSMulH(Dst: S32, Src0, Src1).getReg(Idx: 0);
    MRI.setRegBank(Reg: DstHi, RegBank: AMDGPU::SGPRRegBank);
  } else {
    // No scalar mulh: compute the high half on the VALU via VGPR copies.
    Register VSrc0 = B.buildCopy(Res: S32, Op: Src0).getReg(Idx: 0);
    Register VSrc1 = B.buildCopy(Res: S32, Op: Src1).getReg(Idx: 0);

    MRI.setRegBank(Reg: VSrc0, RegBank: AMDGPU::VGPRRegBank);
    MRI.setRegBank(Reg: VSrc1, RegBank: AMDGPU::VGPRRegBank);

    DstHi = IsUnsigned ? B.buildUMulH(Dst: S32, Src0: VSrc0, Src1: VSrc1).getReg(Idx: 0)
                       : B.buildSMulH(Dst: S32, Src0: VSrc0, Src1: VSrc1).getReg(Idx: 0);
    MRI.setRegBank(Reg: DstHi, RegBank: AMDGPU::VGPRRegBank);

    if (!DstOnValu) {
      // Result must end up scalar: read the (uniform) value back.
      DstHi = buildReadFirstLane(B, MRI, Src: DstHi);
    } else {
      MulHiInVgpr = true;
    }
  }

  // Accumulate and produce the "carry-out" bit.
  //
  // The "carry-out" is defined as bit 64 of the result when computed as a
  // big integer. For unsigned multiply-add, this matches the usual definition
  // of carry-out. For signed multiply-add, bit 64 is the sign bit of the
  // result, which is determined as:
  //   sign(Src0 * Src1) + sign(Src2) + carry-out from unsigned 64-bit add
  LLT CarryType = DstOnValu ? S1 : S32;
  const RegisterBank &CarryBank =
      DstOnValu ? AMDGPU::VCCRegBank : AMDGPU::SGPRRegBank;
  const RegisterBank &DstBank =
      DstOnValu ? AMDGPU::VGPRRegBank : AMDGPU::SGPRRegBank;
  Register Carry;
  Register Zero;

  if (!IsUnsigned) {
    // Signed: seed the carry with the sign of the product (DstHi < 0).
    Zero = B.buildConstant(Res: S32, Val: 0).getReg(Idx: 0);
    MRI.setRegBank(Reg: Zero,
                   RegBank: MulHiInVgpr ? AMDGPU::VGPRRegBank : AMDGPU::SGPRRegBank);

    Carry = B.buildICmp(Pred: CmpInst::ICMP_SLT, Res: MulHiInVgpr ? S1 : S32, Op0: DstHi, Op1: Zero)
                .getReg(Idx: 0);
    MRI.setRegBank(Reg: Carry, RegBank: MulHiInVgpr ? AMDGPU::VCCRegBank
                                            : AMDGPU::SGPRRegBank);

    if (DstOnValu && !MulHiInVgpr) {
      // Scalar s32 bool must become a VCC s1 before it can feed VALU logic.
      Carry = B.buildTrunc(Res: S1, Op: Carry).getReg(Idx: 0);
      MRI.setRegBank(Reg: Carry, RegBank: AMDGPU::VCCRegBank);
    }
  }

  if (Accumulate) {
    if (DstOnValu) {
      // Move the scalar product halves to VGPRs for the VALU add.
      DstLo = B.buildCopy(Res: S32, Op: DstLo).getReg(Idx: 0);
      DstHi = B.buildCopy(Res: S32, Op: DstHi).getReg(Idx: 0);
      MRI.setRegBank(Reg: DstLo, RegBank: AMDGPU::VGPRRegBank);
      MRI.setRegBank(Reg: DstHi, RegBank: AMDGPU::VGPRRegBank);
    }

    auto Unmerge = B.buildUnmerge(Res: S32, Op: Src2);
    Register Src2Lo = Unmerge.getReg(Idx: 0);
    Register Src2Hi = Unmerge.getReg(Idx: 1);
    MRI.setRegBank(Reg: Src2Lo, RegBank: DstBank);
    MRI.setRegBank(Reg: Src2Hi, RegBank: DstBank);

    if (!IsUnsigned) {
      // Fold in the sign of the addend: carry ^= sign(Src2).
      auto Src2Sign = B.buildICmp(Pred: CmpInst::ICMP_SLT, Res: CarryType, Op0: Src2Hi, Op1: Zero);
      MRI.setRegBank(Reg: Src2Sign.getReg(Idx: 0), RegBank: CarryBank);

      Carry = B.buildXor(Dst: CarryType, Src0: Carry, Src1: Src2Sign).getReg(Idx: 0);
      MRI.setRegBank(Reg: Carry, RegBank: CarryBank);
    }

    // 64-bit add as add-with-carry over the 32-bit halves.
    auto AddLo = B.buildUAddo(Res: S32, CarryOut: CarryType, Op0: DstLo, Op1: Src2Lo);
    DstLo = AddLo.getReg(Idx: 0);
    Register CarryLo = AddLo.getReg(Idx: 1);
    MRI.setRegBank(Reg: DstLo, RegBank: DstBank);
    MRI.setRegBank(Reg: CarryLo, RegBank: CarryBank);

    auto AddHi = B.buildUAdde(Res: S32, CarryOut: CarryType, Op0: DstHi, Op1: Src2Hi, CarryIn: CarryLo);
    DstHi = AddHi.getReg(Idx: 0);
    MRI.setRegBank(Reg: DstHi, RegBank: DstBank);

    Register CarryHi = AddHi.getReg(Idx: 1);
    MRI.setRegBank(Reg: CarryHi, RegBank: CarryBank);

    if (IsUnsigned) {
      Carry = CarryHi;
    } else {
      // Signed: final carry also folds in the unsigned add's carry-out.
      Carry = B.buildXor(Dst: CarryType, Src0: Carry, Src1: CarryHi).getReg(Idx: 0);
      MRI.setRegBank(Reg: Carry, RegBank: CarryBank);
    }
  } else {
    if (IsUnsigned) {
      // No accumulate and unsigned: carry-out is trivially zero.
      Carry = B.buildConstant(Res: CarryType, Val: 0).getReg(Idx: 0);
      MRI.setRegBank(Reg: Carry, RegBank: CarryBank);
    }
  }

  B.buildMergeLikeInstr(Res: Dst0, Ops: {DstLo, DstHi});

  if (DstOnValu) {
    B.buildCopy(Res: Dst1, Op: Carry);
  } else {
    // Scalar path kept the carry as s32; narrow it back to the s1 def.
    B.buildTrunc(Res: Dst1, Op: Carry);
  }

  MI.eraseFromParent();
  return true;
}
1719
1720// Return a suitable opcode for extending the operands of Opc when widening.
1721static unsigned getExtendOp(unsigned Opc) {
1722 switch (Opc) {
1723 case TargetOpcode::G_ASHR:
1724 case TargetOpcode::G_SMIN:
1725 case TargetOpcode::G_SMAX:
1726 return TargetOpcode::G_SEXT;
1727 case TargetOpcode::G_LSHR:
1728 case TargetOpcode::G_UMIN:
1729 case TargetOpcode::G_UMAX:
1730 return TargetOpcode::G_ZEXT;
1731 default:
1732 return TargetOpcode::G_ANYEXT;
1733 }
1734}
1735
1736// Emit a legalized extension from <2 x s16> to 2 32-bit components, avoiding
1737// any illegal vector extend or unmerge operations.
1738static std::pair<Register, Register>
1739unpackV2S16ToS32(MachineIRBuilder &B, Register Src, unsigned ExtOpcode) {
1740 const LLT S32 = LLT::scalar(SizeInBits: 32);
1741 auto Bitcast = B.buildBitcast(Dst: S32, Src);
1742
1743 if (ExtOpcode == TargetOpcode::G_SEXT) {
1744 auto ExtLo = B.buildSExtInReg(Res: S32, Op: Bitcast, ImmOp: 16);
1745 auto ShiftHi = B.buildAShr(Dst: S32, Src0: Bitcast, Src1: B.buildConstant(Res: S32, Val: 16));
1746 return std::pair(ExtLo.getReg(Idx: 0), ShiftHi.getReg(Idx: 0));
1747 }
1748
1749 auto ShiftHi = B.buildLShr(Dst: S32, Src0: Bitcast, Src1: B.buildConstant(Res: S32, Val: 16));
1750 if (ExtOpcode == TargetOpcode::G_ZEXT) {
1751 auto ExtLo = B.buildAnd(Dst: S32, Src0: Bitcast, Src1: B.buildConstant(Res: S32, Val: 0xffff));
1752 return std::pair(ExtLo.getReg(Idx: 0), ShiftHi.getReg(Idx: 0));
1753 }
1754
1755 assert(ExtOpcode == TargetOpcode::G_ANYEXT);
1756 return std::pair(Bitcast.getReg(Idx: 0), ShiftHi.getReg(Idx: 0));
1757}
1758
1759// For cases where only a single copy is inserted for matching register banks.
1760// Replace the register in the instruction operand
1761static bool substituteSimpleCopyRegs(
1762 const AMDGPURegisterBankInfo::OperandsMapper &OpdMapper, unsigned OpIdx) {
1763 SmallVector<unsigned, 1> SrcReg(OpdMapper.getVRegs(OpIdx));
1764 if (!SrcReg.empty()) {
1765 assert(SrcReg.size() == 1);
1766 OpdMapper.getMI().getOperand(i: OpIdx).setReg(SrcReg[0]);
1767 return true;
1768 }
1769
1770 return false;
1771}
1772
1773/// Handle register layout difference for f16 images for some subtargets.
1774Register AMDGPURegisterBankInfo::handleD16VData(MachineIRBuilder &B,
1775 MachineRegisterInfo &MRI,
1776 Register Reg) const {
1777 if (!Subtarget.hasUnpackedD16VMem())
1778 return Reg;
1779
1780 const LLT S16 = LLT::scalar(SizeInBits: 16);
1781 LLT StoreVT = MRI.getType(Reg);
1782 if (!StoreVT.isVector() || StoreVT.getElementType() != S16)
1783 return Reg;
1784
1785 auto Unmerge = B.buildUnmerge(Res: S16, Op: Reg);
1786
1787
1788 SmallVector<Register, 4> WideRegs;
1789 for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
1790 WideRegs.push_back(Elt: Unmerge.getReg(Idx: I));
1791
1792 const LLT S32 = LLT::scalar(SizeInBits: 32);
1793 int NumElts = StoreVT.getNumElements();
1794
1795 return B.buildMergeLikeInstr(Res: LLT::fixed_vector(NumElements: NumElts, ScalarTy: S32), Ops: WideRegs)
1796 .getReg(Idx: 0);
1797}
1798
1799static std::pair<Register, unsigned>
1800getBaseWithConstantOffset(MachineRegisterInfo &MRI, Register Reg) {
1801 int64_t Const;
1802 if (mi_match(R: Reg, MRI, P: m_ICst(Cst&: Const)))
1803 return std::pair(Register(), Const);
1804
1805 Register Base;
1806 if (mi_match(R: Reg, MRI, P: m_GAdd(L: m_Reg(R&: Base), R: m_ICst(Cst&: Const))))
1807 return std::pair(Base, Const);
1808
1809 // TODO: Handle G_OR used for add case
1810 return std::pair(Reg, 0);
1811}
1812
/// Split a buffer offset into a voffset base register and an immediate offset
/// that fits in the MUBUF immoffset field. Returns {BaseReg, ImmOffset}; a
/// base register is always materialized (a zero constant if none existed).
std::pair<Register, unsigned>
AMDGPURegisterBankInfo::splitBufferOffsets(MachineIRBuilder &B,
                                           Register OrigOffset) const {
  const unsigned MaxImm = SIInstrInfo::getMaxMUBUFImmOffset(ST: Subtarget);
  Register BaseReg;
  unsigned ImmOffset;
  const LLT S32 = LLT::scalar(SizeInBits: 32);

  // TODO: Use AMDGPU::getBaseWithConstantOffset() instead.
  std::tie(args&: BaseReg, args&: ImmOffset) = getBaseWithConstantOffset(MRI&: *B.getMRI(),
                                                     Reg: OrigOffset);

  unsigned C1 = 0;
  if (ImmOffset != 0) {
    // If the immediate value is too big for the immoffset field, put only bits
    // that would normally fit in the immoffset field. The remaining value that
    // is copied/added for the voffset field is a large power of 2, and it
    // stands more chance of being CSEd with the copy/add for another similar
    // load/store.
    // However, do not do that rounding down if that is a negative
    // number, as it appears to be illegal to have a negative offset in the
    // vgpr, even if adding the immediate offset makes it positive.
    unsigned Overflow = ImmOffset & ~MaxImm;
    ImmOffset -= Overflow;
    if ((int32_t)Overflow < 0) {
      // Negative overflow: push the whole immediate into the voffset side.
      Overflow += ImmOffset;
      ImmOffset = 0;
    }

    C1 = ImmOffset;
    if (Overflow != 0) {
      // Fold the overflow back into the base register.
      if (!BaseReg)
        BaseReg = B.buildConstant(Res: S32, Val: Overflow).getReg(Idx: 0);
      else {
        auto OverflowVal = B.buildConstant(Res: S32, Val: Overflow);
        BaseReg = B.buildAdd(Dst: S32, Src0: BaseReg, Src1: OverflowVal).getReg(Idx: 0);
      }
    }
  }

  // Callers expect a base register even when the offset was pure-immediate.
  if (!BaseReg)
    BaseReg = B.buildConstant(Res: S32, Val: 0).getReg(Idx: 0);

  return {BaseReg, C1};
}
1858
/// Emit an explicit SGPR-to-VGPR copy for a 32- or 64-bit value using
/// V_MOV_B32 (two movs plus a REG_SEQUENCE for the 64-bit case), and
/// constrain both registers to the matching register classes. Returns false
/// if either register could not be constrained.
bool AMDGPURegisterBankInfo::buildVCopy(MachineIRBuilder &B, Register DstReg,
                                        Register SrcReg) const {
  MachineRegisterInfo &MRI = *B.getMRI();
  LLT SrcTy = MRI.getType(Reg: SrcReg);
  if (SrcTy.getSizeInBits() == 32) {
    // Use a v_mov_b32 here to make the exec dependency explicit.
    B.buildInstr(Opcode: AMDGPU::V_MOV_B32_e32)
      .addDef(RegNo: DstReg)
      .addUse(RegNo: SrcReg);
    return constrainGenericRegister(Reg: DstReg, RC: AMDGPU::VGPR_32RegClass, MRI) &&
           constrainGenericRegister(Reg: SrcReg, RC: AMDGPU::SReg_32RegClass, MRI);
  }

  // 64-bit case: move each 32-bit half separately, then glue them back
  // together with a REG_SEQUENCE.
  Register TmpReg0 = MRI.createVirtualRegister(RegClass: &AMDGPU::VGPR_32RegClass);
  Register TmpReg1 = MRI.createVirtualRegister(RegClass: &AMDGPU::VGPR_32RegClass);

  B.buildInstr(Opcode: AMDGPU::V_MOV_B32_e32)
    .addDef(RegNo: TmpReg0)
    .addUse(RegNo: SrcReg, Flags: {}, SubReg: AMDGPU::sub0);
  B.buildInstr(Opcode: AMDGPU::V_MOV_B32_e32)
    .addDef(RegNo: TmpReg1)
    .addUse(RegNo: SrcReg, Flags: {}, SubReg: AMDGPU::sub1);
  B.buildInstr(Opcode: AMDGPU::REG_SEQUENCE)
    .addDef(RegNo: DstReg)
    .addUse(RegNo: TmpReg0)
    .addImm(Val: AMDGPU::sub0)
    .addUse(RegNo: TmpReg1)
    .addImm(Val: AMDGPU::sub1);

  return constrainGenericRegister(Reg: SrcReg, RC: AMDGPU::SReg_64RegClass, MRI) &&
         constrainGenericRegister(Reg: DstReg, RC: AMDGPU::VReg_64RegClass, MRI);
}
1891
1892/// Utility function for pushing dynamic vector indexes with a constant offset
1893/// into waterfall loops.
1894static void reinsertVectorIndexAdd(MachineIRBuilder &B,
1895 MachineInstr &IdxUseInstr,
1896 unsigned OpIdx,
1897 unsigned ConstOffset) {
1898 MachineRegisterInfo &MRI = *B.getMRI();
1899 const LLT S32 = LLT::scalar(SizeInBits: 32);
1900 Register WaterfallIdx = IdxUseInstr.getOperand(i: OpIdx).getReg();
1901 B.setInsertPt(MBB&: *IdxUseInstr.getParent(), II: IdxUseInstr.getIterator());
1902
1903 auto MaterializedOffset = B.buildConstant(Res: S32, Val: ConstOffset);
1904
1905 auto Add = B.buildAdd(Dst: S32, Src0: WaterfallIdx, Src1: MaterializedOffset);
1906 MRI.setRegBank(Reg: MaterializedOffset.getReg(Idx: 0), RegBank: AMDGPU::SGPRRegBank);
1907 MRI.setRegBank(Reg: Add.getReg(Idx: 0), RegBank: AMDGPU::SGPRRegBank);
1908 IdxUseInstr.getOperand(i: OpIdx).setReg(Add.getReg(Idx: 0));
1909}
1910
1911/// Implement extending a 32-bit value to a 64-bit value. \p Lo32Reg is the
1912/// original 32-bit source value (to be inserted in the low part of the combined
1913/// 64-bit result), and \p Hi32Reg is the high half of the combined 64-bit
1914/// value.
1915static void extendLow32IntoHigh32(MachineIRBuilder &B,
1916 Register Hi32Reg, Register Lo32Reg,
1917 unsigned ExtOpc,
1918 const RegisterBank &RegBank,
1919 bool IsBooleanSrc = false) {
1920 if (ExtOpc == AMDGPU::G_ZEXT) {
1921 B.buildConstant(Res: Hi32Reg, Val: 0);
1922 } else if (ExtOpc == AMDGPU::G_SEXT) {
1923 if (IsBooleanSrc) {
1924 // If we know the original source was an s1, the high half is the same as
1925 // the low.
1926 B.buildCopy(Res: Hi32Reg, Op: Lo32Reg);
1927 } else {
1928 // Replicate sign bit from 32-bit extended part.
1929 auto ShiftAmt = B.buildConstant(Res: LLT::scalar(SizeInBits: 32), Val: 31);
1930 B.getMRI()->setRegBank(Reg: ShiftAmt.getReg(Idx: 0), RegBank);
1931 B.buildAShr(Dst: Hi32Reg, Src0: Lo32Reg, Src1: ShiftAmt);
1932 }
1933 } else {
1934 assert(ExtOpc == AMDGPU::G_ANYEXT && "not an integer extension");
1935 B.buildUndef(Res: Hi32Reg);
1936 }
1937}
1938
/// Expand a dynamic G_EXTRACT_VECTOR_ELT into a chain of compare/selects over
/// all vector elements, avoiding a waterfall loop for a divergent index.
/// Returns false when SITargetLowering decides the expansion is not
/// profitable for this element size / count / index divergence.
bool AMDGPURegisterBankInfo::foldExtractEltToCmpSelect(
    MachineIRBuilder &B, MachineInstr &MI,
    const OperandsMapper &OpdMapper) const {
  MachineRegisterInfo &MRI = *B.getMRI();

  Register VecReg = MI.getOperand(i: 1).getReg();
  Register Idx = MI.getOperand(i: 2).getReg();

  const RegisterBank &IdxBank =
    *OpdMapper.getInstrMapping().getOperandMapping(i: 2).BreakDown[0].RegBank;

  bool IsDivergentIdx = IdxBank != AMDGPU::SGPRRegBank;

  LLT VecTy = MRI.getType(Reg: VecReg);
  unsigned EltSize = VecTy.getScalarSizeInBits();
  unsigned NumElem = VecTy.getNumElements();

  if (!SITargetLowering::shouldExpandVectorDynExt(EltSize, NumElem,
                                                  IsDivergentIdx, Subtarget: &Subtarget))
    return false;

  LLT S32 = LLT::scalar(SizeInBits: 32);

  const RegisterBank &DstBank =
    *OpdMapper.getInstrMapping().getOperandMapping(i: 0).BreakDown[0].RegBank;
  const RegisterBank &SrcBank =
    *OpdMapper.getInstrMapping().getOperandMapping(i: 1).BreakDown[0].RegBank;

  // The compare condition can stay scalar only if every involved operand is
  // scalar; otherwise it must be a VCC (per-lane) boolean.
  const RegisterBank &CCBank =
    (DstBank == AMDGPU::SGPRRegBank &&
     SrcBank == AMDGPU::SGPRRegBank &&
     IdxBank == AMDGPU::SGPRRegBank) ? AMDGPU::SGPRRegBank
                                     : AMDGPU::VCCRegBank;
  LLT CCTy = (CCBank == AMDGPU::SGPRRegBank) ? S32 : LLT::scalar(SizeInBits: 1);

  // A scalar index feeding a VCC compare needs a VGPR copy first.
  if (CCBank == AMDGPU::VCCRegBank && IdxBank == AMDGPU::SGPRRegBank) {
    Idx = B.buildCopy(Res: S32, Op: Idx)->getOperand(i: 0).getReg();
    MRI.setRegBank(Reg: Idx, RegBank: AMDGPU::VGPRRegBank);
  }

  // If the destination was split into per-half vregs (e.g. a 64-bit element
  // on the VALU), process each 32-bit "lane" of the element separately.
  LLT EltTy = VecTy.getScalarType();
  SmallVector<Register, 2> DstRegs(OpdMapper.getVRegs(OpIdx: 0));
  unsigned NumLanes = DstRegs.size();
  if (!NumLanes)
    NumLanes = 1;
  else
    EltTy = MRI.getType(Reg: DstRegs[0]);

  auto UnmergeToEltTy = B.buildUnmerge(Res: EltTy, Op: VecReg);
  SmallVector<Register, 2> Res(NumLanes);
  // Seed the select chain with element 0.
  for (unsigned L = 0; L < NumLanes; ++L)
    Res[L] = UnmergeToEltTy.getReg(Idx: L);

  // For each remaining element: Res = (Idx == I) ? Elt[I] : Res.
  for (unsigned I = 1; I < NumElem; ++I) {
    auto IC = B.buildConstant(Res: S32, Val: I);
    MRI.setRegBank(Reg: IC->getOperand(i: 0).getReg(), RegBank: AMDGPU::SGPRRegBank);
    auto Cmp = B.buildICmp(Pred: CmpInst::ICMP_EQ, Res: CCTy, Op0: Idx, Op1: IC);
    MRI.setRegBank(Reg: Cmp->getOperand(i: 0).getReg(), RegBank: CCBank);

    for (unsigned L = 0; L < NumLanes; ++L) {
      auto S = B.buildSelect(Res: EltTy, Tst: Cmp,
                             Op0: UnmergeToEltTy.getReg(Idx: I * NumLanes + L), Op1: Res[L]);

      // Operand 1 is the condition; it keeps CCBank. Dst and both values get
      // the destination bank.
      for (unsigned N : { 0, 2, 3 })
        MRI.setRegBank(Reg: S->getOperand(i: N).getReg(), RegBank: DstBank);

      Res[L] = S->getOperand(i: 0).getReg();
    }
  }

  // Copy the surviving chain results into the original destination vregs.
  for (unsigned L = 0; L < NumLanes; ++L) {
    Register DstReg = (NumLanes == 1) ? MI.getOperand(i: 0).getReg() : DstRegs[L];
    B.buildCopy(Res: DstReg, Op: Res[L]);
    MRI.setRegBank(Reg: DstReg, RegBank: DstBank);
  }

  MRI.setRegBank(Reg: MI.getOperand(i: 0).getReg(), RegBank: DstBank);
  MI.eraseFromParent();

  return true;
}
2020
2021// Insert a cross regbank copy for a register if it already has a bank that
2022// differs from the one we want to set.
2023static Register constrainRegToBank(MachineRegisterInfo &MRI,
2024 MachineIRBuilder &B, Register &Reg,
2025 const RegisterBank &Bank) {
2026 const RegisterBank *CurrBank = MRI.getRegBankOrNull(Reg);
2027 if (CurrBank && *CurrBank != Bank) {
2028 Register Copy = B.buildCopy(Res: MRI.getType(Reg), Op: Reg).getReg(Idx: 0);
2029 MRI.setRegBank(Reg: Copy, RegBank: Bank);
2030 return Copy;
2031 }
2032
2033 MRI.setRegBank(Reg, RegBank: Bank);
2034 return Reg;
2035}
2036
/// Expand a dynamic G_INSERT_VECTOR_ELT into per-element compare/selects,
/// rebuilding the whole vector, to avoid a waterfall loop for a divergent
/// index. Returns false when SITargetLowering decides the expansion is not
/// profitable for this element size / count / index divergence.
bool AMDGPURegisterBankInfo::foldInsertEltToCmpSelect(
    MachineIRBuilder &B, MachineInstr &MI,
    const OperandsMapper &OpdMapper) const {

  MachineRegisterInfo &MRI = *B.getMRI();
  Register VecReg = MI.getOperand(i: 1).getReg();
  Register Idx = MI.getOperand(i: 3).getReg();

  const RegisterBank &IdxBank =
    *OpdMapper.getInstrMapping().getOperandMapping(i: 3).BreakDown[0].RegBank;

  bool IsDivergentIdx = IdxBank != AMDGPU::SGPRRegBank;

  LLT VecTy = MRI.getType(Reg: VecReg);
  unsigned EltSize = VecTy.getScalarSizeInBits();
  unsigned NumElem = VecTy.getNumElements();

  if (!SITargetLowering::shouldExpandVectorDynExt(EltSize, NumElem,
                                                  IsDivergentIdx, Subtarget: &Subtarget))
    return false;

  LLT S32 = LLT::scalar(SizeInBits: 32);

  const RegisterBank &DstBank =
    *OpdMapper.getInstrMapping().getOperandMapping(i: 0).BreakDown[0].RegBank;
  const RegisterBank &SrcBank =
    *OpdMapper.getInstrMapping().getOperandMapping(i: 1).BreakDown[0].RegBank;
  const RegisterBank &InsBank =
    *OpdMapper.getInstrMapping().getOperandMapping(i: 2).BreakDown[0].RegBank;

  // The compare condition can stay scalar only if every involved operand
  // (dst, vector, inserted value, index) is scalar; otherwise it must be a
  // VCC (per-lane) boolean.
  const RegisterBank &CCBank =
    (DstBank == AMDGPU::SGPRRegBank &&
     SrcBank == AMDGPU::SGPRRegBank &&
     InsBank == AMDGPU::SGPRRegBank &&
     IdxBank == AMDGPU::SGPRRegBank) ? AMDGPU::SGPRRegBank
                                     : AMDGPU::VCCRegBank;
  LLT CCTy = (CCBank == AMDGPU::SGPRRegBank) ? S32 : LLT::scalar(SizeInBits: 1);

  // A scalar index feeding a VCC compare needs a VGPR copy first.
  if (CCBank == AMDGPU::VCCRegBank && IdxBank == AMDGPU::SGPRRegBank) {
    Idx = B.buildCopy(Res: S32, Op: Idx)->getOperand(i: 0).getReg();
    MRI.setRegBank(Reg: Idx, RegBank: AMDGPU::VGPRRegBank);
  }

  // If the inserted value was split into per-half vregs (e.g. a 64-bit
  // element on the VALU), process each 32-bit "lane" separately.
  LLT EltTy = VecTy.getScalarType();
  SmallVector<Register, 2> InsRegs(OpdMapper.getVRegs(OpIdx: 2));
  unsigned NumLanes = InsRegs.size();
  if (!NumLanes) {
    NumLanes = 1;
    InsRegs.push_back(Elt: MI.getOperand(i: 2).getReg());
  } else {
    EltTy = MRI.getType(Reg: InsRegs[0]);
  }

  auto UnmergeToEltTy = B.buildUnmerge(Res: EltTy, Op: VecReg);
  SmallVector<Register, 16> Ops(NumElem * NumLanes);

  // For every element: Ops[I] = (Idx == I) ? InsVal : Vec[I].
  for (unsigned I = 0; I < NumElem; ++I) {
    auto IC = B.buildConstant(Res: S32, Val: I);
    MRI.setRegBank(Reg: IC->getOperand(i: 0).getReg(), RegBank: AMDGPU::SGPRRegBank);
    auto Cmp = B.buildICmp(Pred: CmpInst::ICMP_EQ, Res: CCTy, Op0: Idx, Op1: IC);
    MRI.setRegBank(Reg: Cmp->getOperand(i: 0).getReg(), RegBank: CCBank);

    for (unsigned L = 0; L < NumLanes; ++L) {
      // Both select inputs must live on the destination bank.
      Register Op0 = constrainRegToBank(MRI, B, Reg&: InsRegs[L], Bank: DstBank);
      Register Op1 = UnmergeToEltTy.getReg(Idx: I * NumLanes + L);
      Op1 = constrainRegToBank(MRI, B, Reg&: Op1, Bank: DstBank);

      Register Select = B.buildSelect(Res: EltTy, Tst: Cmp, Op0, Op1).getReg(Idx: 0);
      MRI.setRegBank(Reg: Select, RegBank: DstBank);

      Ops[I * NumLanes + L] = Select;
    }
  }

  // Reassemble the vector; bitcast if lane splitting changed the type.
  LLT MergeTy = LLT::fixed_vector(NumElements: Ops.size(), ScalarTy: EltTy);
  if (MergeTy == MRI.getType(Reg: MI.getOperand(i: 0).getReg())) {
    B.buildBuildVector(Res: MI.getOperand(i: 0), Ops);
  } else {
    auto Vec = B.buildBuildVector(Res: MergeTy, Ops);
    MRI.setRegBank(Reg: Vec->getOperand(i: 0).getReg(), RegBank: DstBank);
    B.buildBitcast(Dst: MI.getOperand(i: 0).getReg(), Src: Vec);
  }

  MRI.setRegBank(Reg: MI.getOperand(i: 0).getReg(), RegBank: DstBank);
  MI.eraseFromParent();

  return true;
}
2125
2126// Break s_mul_u64 into 32-bit vector operations.
2127void AMDGPURegisterBankInfo::applyMappingSMULU64(
2128 MachineIRBuilder &B, const OperandsMapper &OpdMapper) const {
2129 SmallVector<Register, 2> DefRegs(OpdMapper.getVRegs(OpIdx: 0));
2130 SmallVector<Register, 2> Src0Regs(OpdMapper.getVRegs(OpIdx: 1));
2131 SmallVector<Register, 2> Src1Regs(OpdMapper.getVRegs(OpIdx: 2));
2132
2133 // All inputs are SGPRs, nothing special to do.
2134 if (DefRegs.empty()) {
2135 assert(Src0Regs.empty() && Src1Regs.empty());
2136 applyDefaultMapping(OpdMapper);
2137 return;
2138 }
2139
2140 assert(DefRegs.size() == 2);
2141 assert(Src0Regs.size() == Src1Regs.size() &&
2142 (Src0Regs.empty() || Src0Regs.size() == 2));
2143
2144 MachineRegisterInfo &MRI = OpdMapper.getMRI();
2145 MachineInstr &MI = OpdMapper.getMI();
2146 Register DstReg = MI.getOperand(i: 0).getReg();
2147 LLT HalfTy = LLT::scalar(SizeInBits: 32);
2148
2149 // Depending on where the source registers came from, the generic code may
2150 // have decided to split the inputs already or not. If not, we still need to
2151 // extract the values.
2152
2153 if (Src0Regs.empty())
2154 split64BitValueForMapping(B, Regs&: Src0Regs, HalfTy, Reg: MI.getOperand(i: 1).getReg());
2155 else
2156 setRegsToType(MRI, Regs: Src0Regs, NewTy: HalfTy);
2157
2158 if (Src1Regs.empty())
2159 split64BitValueForMapping(B, Regs&: Src1Regs, HalfTy, Reg: MI.getOperand(i: 2).getReg());
2160 else
2161 setRegsToType(MRI, Regs: Src1Regs, NewTy: HalfTy);
2162
2163 setRegsToType(MRI, Regs: DefRegs, NewTy: HalfTy);
2164
2165 // The multiplication is done as follows:
2166 //
2167 // Op1H Op1L
2168 // * Op0H Op0L
2169 // --------------------
2170 // Op1H*Op0L Op1L*Op0L
2171 // + Op1H*Op0H Op1L*Op0H
2172 // -----------------------------------------
2173 // (Op1H*Op0L + Op1L*Op0H + carry) Op1L*Op0L
2174 //
2175 // We drop Op1H*Op0H because the result of the multiplication is a 64-bit
2176 // value and that would overflow.
2177 // The low 32-bit value is Op1L*Op0L.
2178 // The high 32-bit value is Op1H*Op0L + Op1L*Op0H + carry (from
2179 // Op1L*Op0L).
2180
2181 ApplyRegBankMapping ApplyBank(B, *this, MRI, &AMDGPU::VGPRRegBank);
2182
2183 Register Hi = B.buildUMulH(Dst: HalfTy, Src0: Src0Regs[0], Src1: Src1Regs[0]).getReg(Idx: 0);
2184 Register MulLoHi = B.buildMul(Dst: HalfTy, Src0: Src0Regs[0], Src1: Src1Regs[1]).getReg(Idx: 0);
2185 Register Add = B.buildAdd(Dst: HalfTy, Src0: Hi, Src1: MulLoHi).getReg(Idx: 0);
2186 Register MulHiLo = B.buildMul(Dst: HalfTy, Src0: Src0Regs[1], Src1: Src1Regs[0]).getReg(Idx: 0);
2187 B.buildAdd(Dst: DefRegs[1], Src0: Add, Src1: MulHiLo);
2188 B.buildMul(Dst: DefRegs[0], Src0: Src0Regs[0], Src1: Src1Regs[0]);
2189
2190 MRI.setRegBank(Reg: DstReg, RegBank: AMDGPU::VGPRRegBank);
2191 MI.eraseFromParent();
2192}
2193
2194void AMDGPURegisterBankInfo::applyMappingImpl(
2195 MachineIRBuilder &B, const OperandsMapper &OpdMapper) const {
2196 MachineInstr &MI = OpdMapper.getMI();
2197 B.setInstrAndDebugLoc(MI);
2198 unsigned Opc = MI.getOpcode();
2199 MachineRegisterInfo &MRI = OpdMapper.getMRI();
2200 switch (Opc) {
2201 case AMDGPU::G_CONSTANT:
2202 case AMDGPU::G_IMPLICIT_DEF: {
2203 Register DstReg = MI.getOperand(i: 0).getReg();
2204 LLT DstTy = MRI.getType(Reg: DstReg);
2205 if (DstTy != LLT::scalar(SizeInBits: 1))
2206 break;
2207
2208 const RegisterBank *DstBank =
2209 OpdMapper.getInstrMapping().getOperandMapping(i: 0).BreakDown[0].RegBank;
2210 if (DstBank == &AMDGPU::VCCRegBank)
2211 break;
2212 SmallVector<Register, 1> DefRegs(OpdMapper.getVRegs(OpIdx: 0));
2213 if (DefRegs.empty())
2214 DefRegs.push_back(Elt: DstReg);
2215
2216 B.setInsertPt(MBB&: *MI.getParent(), II: ++MI.getIterator());
2217
2218 Register NewDstReg = MRI.createGenericVirtualRegister(Ty: LLT::scalar(SizeInBits: 32));
2219 LLVMContext &Ctx = B.getMF().getFunction().getContext();
2220
2221 MI.getOperand(i: 0).setReg(NewDstReg);
2222 if (Opc != AMDGPU::G_IMPLICIT_DEF) {
2223 uint64_t ConstVal = MI.getOperand(i: 1).getCImm()->getZExtValue();
2224 MI.getOperand(i: 1).setCImm(
2225 ConstantInt::get(Ty: IntegerType::getInt32Ty(C&: Ctx), V: ConstVal));
2226 }
2227
2228 MRI.setRegBank(Reg: NewDstReg, RegBank: *DstBank);
2229 B.buildTrunc(Res: DefRegs[0], Op: NewDstReg);
2230 return;
2231 }
2232 case AMDGPU::G_PHI: {
2233 Register DstReg = MI.getOperand(i: 0).getReg();
2234 LLT DstTy = MRI.getType(Reg: DstReg);
2235 if (DstTy != LLT::scalar(SizeInBits: 1))
2236 break;
2237
2238 const LLT S32 = LLT::scalar(SizeInBits: 32);
2239 const RegisterBank *DstBank =
2240 OpdMapper.getInstrMapping().getOperandMapping(i: 0).BreakDown[0].RegBank;
2241 if (DstBank == &AMDGPU::VCCRegBank) {
2242 applyDefaultMapping(OpdMapper);
2243 // The standard handling only considers the result register bank for
2244 // phis. For VCC, blindly inserting a copy when the phi is lowered will
2245 // produce an invalid copy. We can only copy with some kind of compare to
2246 // get a vector boolean result. Insert a register bank copy that will be
2247 // correctly lowered to a compare.
2248 for (unsigned I = 1, E = MI.getNumOperands(); I != E; I += 2) {
2249 Register SrcReg = MI.getOperand(i: I).getReg();
2250 const RegisterBank *SrcBank = getRegBank(Reg: SrcReg, MRI, TRI: *TRI);
2251
2252 if (SrcBank != &AMDGPU::VCCRegBank) {
2253 MachineBasicBlock *SrcMBB = MI.getOperand(i: I + 1).getMBB();
2254 B.setInsertPt(MBB&: *SrcMBB, II: SrcMBB->getFirstTerminator());
2255
2256 auto Copy = B.buildCopy(Res: LLT::scalar(SizeInBits: 1), Op: SrcReg);
2257 MRI.setRegBank(Reg: Copy.getReg(Idx: 0), RegBank: AMDGPU::VCCRegBank);
2258 MI.getOperand(i: I).setReg(Copy.getReg(Idx: 0));
2259 }
2260 }
2261
2262 return;
2263 }
2264
2265 // Phi handling is strange and only considers the bank of the destination.
2266 substituteSimpleCopyRegs(OpdMapper, OpIdx: 0);
2267
2268 // Promote SGPR/VGPR booleans to s32
2269 ApplyRegBankMapping ApplyBank(B, *this, MRI, DstBank);
2270 B.setInsertPt(MBB&: B.getMBB(), II: MI);
2271 LegalizerHelper Helper(B.getMF(), ApplyBank, B);
2272
2273 if (Helper.widenScalar(MI, TypeIdx: 0, WideTy: S32) != LegalizerHelper::Legalized)
2274 llvm_unreachable("widen scalar should have succeeded");
2275
2276 return;
2277 }
2278 case AMDGPU::G_FCMP:
2279 if (!Subtarget.hasSALUFloatInsts())
2280 break;
2281 [[fallthrough]];
2282 case AMDGPU::G_ICMP:
2283 case AMDGPU::G_UADDO:
2284 case AMDGPU::G_USUBO:
2285 case AMDGPU::G_UADDE:
2286 case AMDGPU::G_SADDE:
2287 case AMDGPU::G_USUBE:
2288 case AMDGPU::G_SSUBE: {
2289 unsigned BoolDstOp =
2290 (Opc == AMDGPU::G_ICMP || Opc == AMDGPU::G_FCMP) ? 0 : 1;
2291 Register DstReg = MI.getOperand(i: BoolDstOp).getReg();
2292
2293 const RegisterBank *DstBank =
2294 OpdMapper.getInstrMapping().getOperandMapping(i: 0).BreakDown[0].RegBank;
2295 if (DstBank != &AMDGPU::SGPRRegBank)
2296 break;
2297
2298 const bool HasCarryIn = MI.getNumOperands() == 5;
2299
2300 // If this is a scalar compare, promote the result to s32, as the selection
2301 // will end up using a copy to a 32-bit vreg.
2302 const LLT S32 = LLT::scalar(SizeInBits: 32);
2303 Register NewDstReg = MRI.createGenericVirtualRegister(Ty: S32);
2304 MRI.setRegBank(Reg: NewDstReg, RegBank: AMDGPU::SGPRRegBank);
2305 MI.getOperand(i: BoolDstOp).setReg(NewDstReg);
2306
2307 if (HasCarryIn) {
2308 Register NewSrcReg = MRI.createGenericVirtualRegister(Ty: S32);
2309 MRI.setRegBank(Reg: NewSrcReg, RegBank: AMDGPU::SGPRRegBank);
2310 B.buildZExt(Res: NewSrcReg, Op: MI.getOperand(i: 4).getReg());
2311 MI.getOperand(i: 4).setReg(NewSrcReg);
2312 }
2313
2314 MachineBasicBlock *MBB = MI.getParent();
2315 B.setInsertPt(MBB&: *MBB, II: std::next(x: MI.getIterator()));
2316
2317 // If we had a constrained VCC result register, a copy was inserted to VCC
2318 // from SGPR.
2319 SmallVector<Register, 1> DefRegs(OpdMapper.getVRegs(OpIdx: 0));
2320 if (DefRegs.empty())
2321 DefRegs.push_back(Elt: DstReg);
2322 B.buildTrunc(Res: DefRegs[0], Op: NewDstReg);
2323 return;
2324 }
  case AMDGPU::G_SELECT: {
    Register DstReg = MI.getOperand(i: 0).getReg();
    LLT DstTy = MRI.getType(Reg: DstReg);

    // The mapping may or may not have already split the condition operand; if
    // not, fall back to the original condition register.
    SmallVector<Register, 1> CondRegs(OpdMapper.getVRegs(OpIdx: 1));
    if (CondRegs.empty())
      CondRegs.push_back(Elt: MI.getOperand(i: 1).getReg());
    else {
      assert(CondRegs.size() == 1);
    }

    // An s1 condition on the SGPR bank cannot be used directly; widen it to
    // s32 with a zext so a scalar select can consume it.
    const RegisterBank *CondBank = getRegBank(Reg: CondRegs[0], MRI, TRI: *TRI);
    if (CondBank == &AMDGPU::SGPRRegBank) {
      const LLT S32 = LLT::scalar(SizeInBits: 32);
      Register NewCondReg = MRI.createGenericVirtualRegister(Ty: S32);
      MRI.setRegBank(Reg: NewCondReg, RegBank: AMDGPU::SGPRRegBank);

      MI.getOperand(i: 1).setReg(NewCondReg);
      B.buildZExt(Res: NewCondReg, Op: CondRegs[0]);
    }

    // Only 64-bit selects need further repair (split into two 32-bit selects
    // below); anything else keeps the (possibly condition-fixed) instruction.
    if (DstTy.getSizeInBits() != 64)
      break;

    LLT HalfTy = getHalfSizedType(Ty: DstTy);

    SmallVector<Register, 2> DefRegs(OpdMapper.getVRegs(OpIdx: 0));
    SmallVector<Register, 2> Src1Regs(OpdMapper.getVRegs(OpIdx: 2));
    SmallVector<Register, 2> Src2Regs(OpdMapper.getVRegs(OpIdx: 3));

    // All inputs are SGPRs, nothing special to do.
    if (DefRegs.empty()) {
      assert(Src1Regs.empty() && Src2Regs.empty());
      break;
    }

    // Split any 64-bit source value the generic mapping code did not already
    // split; otherwise just retype the existing halves.
    if (Src1Regs.empty())
      split64BitValueForMapping(B, Regs&: Src1Regs, HalfTy, Reg: MI.getOperand(i: 2).getReg());
    else {
      setRegsToType(MRI, Regs: Src1Regs, NewTy: HalfTy);
    }

    if (Src2Regs.empty())
      split64BitValueForMapping(B, Regs&: Src2Regs, HalfTy, Reg: MI.getOperand(i: 3).getReg());
    else
      setRegsToType(MRI, Regs: Src2Regs, NewTy: HalfTy);

    setRegsToType(MRI, Regs: DefRegs, NewTy: HalfTy);

    // Emit one 32-bit select per half, sharing the same condition.
    auto Flags = MI.getFlags();
    B.buildSelect(Res: DefRegs[0], Tst: CondRegs[0], Op0: Src1Regs[0], Op1: Src2Regs[0], Flags);
    B.buildSelect(Res: DefRegs[1], Tst: CondRegs[0], Op0: Src1Regs[1], Op1: Src2Regs[1], Flags);

    MRI.setRegBank(Reg: DstReg, RegBank: AMDGPU::VGPRRegBank);
    MI.eraseFromParent();
    return;
  }
  case AMDGPU::G_BRCOND: {
    Register CondReg = MI.getOperand(i: 0).getReg();
    // FIXME: Should use legalizer helper, but should change bool ext type.
    const RegisterBank *CondBank =
      OpdMapper.getInstrMapping().getOperandMapping(i: 0).BreakDown[0].RegBank;

    // A uniform (SGPR) s1 condition is zero-extended to s32 so the scalar
    // branch can test it; a VCC condition needs no repair.
    if (CondBank == &AMDGPU::SGPRRegBank) {
      const LLT S32 = LLT::scalar(SizeInBits: 32);
      Register NewCondReg = MRI.createGenericVirtualRegister(Ty: S32);
      MRI.setRegBank(Reg: NewCondReg, RegBank: AMDGPU::SGPRRegBank);

      MI.getOperand(i: 0).setReg(NewCondReg);
      B.buildZExt(Res: NewCondReg, Op: CondReg);
      return;
    }

    break;
  }
  case AMDGPU::G_AND:
  case AMDGPU::G_OR:
  case AMDGPU::G_XOR: {
    // 64-bit and is only available on the SALU, so split into 2 32-bit ops if
    // there is a VGPR input.
    Register DstReg = MI.getOperand(i: 0).getReg();
    LLT DstTy = MRI.getType(Reg: DstReg);

    const RegisterBank *DstBank =
      OpdMapper.getInstrMapping().getOperandMapping(i: 0).BreakDown[0].RegBank;

    // s1 logic ops on VCC are natural lane-mask operations and need no
    // repair; other s1 results are widened to s32 on their mapped bank.
    if (DstTy.getSizeInBits() == 1) {
      if (DstBank == &AMDGPU::VCCRegBank)
        break;

      MachineFunction *MF = MI.getMF();
      ApplyRegBankMapping ApplyBank(B, *this, MRI, DstBank);
      LegalizerHelper Helper(*MF, ApplyBank, B);

      if (Helper.widenScalar(MI, TypeIdx: 0, WideTy: LLT::scalar(SizeInBits: 32)) !=
          LegalizerHelper::Legalized)
        llvm_unreachable("widen scalar should have succeeded");
      return;
    }

    // 16-bit logic ops have no SALU form; promote scalar cases to 32-bit.
    if (DstTy.getSizeInBits() == 16 && DstBank == &AMDGPU::SGPRRegBank) {
      const LLT S32 = LLT::scalar(SizeInBits: 32);
      MachineBasicBlock *MBB = MI.getParent();
      MachineFunction *MF = MBB->getParent();
      ApplyRegBankMapping ApplySALU(B, *this, MRI, &AMDGPU::SGPRRegBank);
      LegalizerHelper Helper(*MF, ApplySALU, B);
      // Widen to S32, but handle `G_XOR x, -1` differently. Legalizer widening
      // will use a G_ANYEXT to extend the -1 which prevents matching G_XOR -1
      // as "not".
      if (MI.getOpcode() == AMDGPU::G_XOR &&
          mi_match(R: MI.getOperand(i: 2).getReg(), MRI, P: m_SpecificICstOrSplat(RequestedValue: -1))) {
        Helper.widenScalarSrc(MI, WideTy: S32, OpIdx: 1, ExtOpcode: AMDGPU::G_ANYEXT);
        Helper.widenScalarSrc(MI, WideTy: S32, OpIdx: 2, ExtOpcode: AMDGPU::G_SEXT);
        Helper.widenScalarDst(MI, WideTy: S32);
      } else {
        if (Helper.widenScalar(MI, TypeIdx: 0, WideTy: S32) != LegalizerHelper::Legalized)
          llvm_unreachable("widen scalar should have succeeded");
      }
      return;
    }

    if (DstTy.getSizeInBits() != 64)
      break;

    LLT HalfTy = getHalfSizedType(Ty: DstTy);
    SmallVector<Register, 2> DefRegs(OpdMapper.getVRegs(OpIdx: 0));
    SmallVector<Register, 2> Src0Regs(OpdMapper.getVRegs(OpIdx: 1));
    SmallVector<Register, 2> Src1Regs(OpdMapper.getVRegs(OpIdx: 2));

    // All inputs are SGPRs, nothing special to do.
    if (DefRegs.empty()) {
      assert(Src0Regs.empty() && Src1Regs.empty());
      break;
    }

    assert(DefRegs.size() == 2);
    assert(Src0Regs.size() == Src1Regs.size() &&
           (Src0Regs.empty() || Src0Regs.size() == 2));

    // Depending on where the source registers came from, the generic code may
    // have decided to split the inputs already or not. If not, we still need to
    // extract the values.

    if (Src0Regs.empty())
      split64BitValueForMapping(B, Regs&: Src0Regs, HalfTy, Reg: MI.getOperand(i: 1).getReg());
    else
      setRegsToType(MRI, Regs: Src0Regs, NewTy: HalfTy);

    if (Src1Regs.empty())
      split64BitValueForMapping(B, Regs&: Src1Regs, HalfTy, Reg: MI.getOperand(i: 2).getReg());
    else
      setRegsToType(MRI, Regs: Src1Regs, NewTy: HalfTy);

    setRegsToType(MRI, Regs: DefRegs, NewTy: HalfTy);

    // Perform the 32-bit op independently on the low and high halves.
    auto Flags = MI.getFlags();
    B.buildInstr(Opc, DstOps: {DefRegs[0]}, SrcOps: {Src0Regs[0], Src1Regs[0]}, Flags);
    B.buildInstr(Opc, DstOps: {DefRegs[1]}, SrcOps: {Src0Regs[1], Src1Regs[1]}, Flags);

    MRI.setRegBank(Reg: DstReg, RegBank: AMDGPU::VGPRRegBank);
    MI.eraseFromParent();
    return;
  }
  case AMDGPU::G_ABS: {
    Register SrcReg = MI.getOperand(i: 1).getReg();
    const RegisterBank *SrcBank = MRI.getRegBankOrNull(Reg: SrcReg);

    // There is no VALU abs instruction so we need to replace it with a sub and
    // max combination.
    if (SrcBank && SrcBank == &AMDGPU::VGPRRegBank) {
      MachineFunction *MF = MI.getMF();
      ApplyRegBankMapping Apply(B, *this, MRI, &AMDGPU::VGPRRegBank);
      LegalizerHelper Helper(*MF, Apply, B);

      if (Helper.lowerAbsToMaxNeg(MI) != LegalizerHelper::Legalized)
        llvm_unreachable("lowerAbsToMaxNeg should have succeeded");
      return;
    }
    // Non-VGPR sources share the 16-bit promotion handling of the arithmetic
    // ops below.
    [[fallthrough]];
  }
  case AMDGPU::G_ADD:
  case AMDGPU::G_SUB:
  case AMDGPU::G_MUL:
  case AMDGPU::G_SHL:
  case AMDGPU::G_LSHR:
  case AMDGPU::G_ASHR:
  case AMDGPU::G_SMIN:
  case AMDGPU::G_SMAX:
  case AMDGPU::G_UMIN:
  case AMDGPU::G_UMAX: {
    Register DstReg = MI.getOperand(i: 0).getReg();
    LLT DstTy = MRI.getType(Reg: DstReg);

    // Special case for s_mul_u64. There is not a vector equivalent of
    // s_mul_u64. Hence, we have to break down s_mul_u64 into 32-bit vector
    // multiplications.
    if (!Subtarget.hasVectorMulU64() && Opc == AMDGPU::G_MUL &&
        DstTy.getSizeInBits() == 64) {
      applyMappingSMULU64(B, OpdMapper);
      return;
    }

    // 16-bit operations are VALU only, but can be promoted to 32-bit SALU.
    // Packed 16-bit operations need to be scalarized and promoted.
    if (DstTy != LLT::scalar(SizeInBits: 16) && DstTy != LLT::fixed_vector(NumElements: 2, ScalarSizeInBits: 16))
      break;

    const RegisterBank *DstBank =
      OpdMapper.getInstrMapping().getOperandMapping(i: 0).BreakDown[0].RegBank;
    if (DstBank == &AMDGPU::VGPRRegBank)
      break;

    const LLT S32 = LLT::scalar(SizeInBits: 32);
    MachineBasicBlock *MBB = MI.getParent();
    MachineFunction *MF = MBB->getParent();
    ApplyRegBankMapping ApplySALU(B, *this, MRI, &AMDGPU::SGPRRegBank);

    // v2s16 G_ABS (reached via fallthrough from the G_ABS case above):
    // sign-extend each half to s32, take per-half abs, repack.
    if (DstTy.isVector() && Opc == AMDGPU::G_ABS) {
      Register WideSrcLo, WideSrcHi;

      std::tie(args&: WideSrcLo, args&: WideSrcHi) =
          unpackV2S16ToS32(B, Src: MI.getOperand(i: 1).getReg(), ExtOpcode: TargetOpcode::G_SEXT);
      auto Lo = B.buildInstr(Opc: AMDGPU::G_ABS, DstOps: {S32}, SrcOps: {WideSrcLo});
      auto Hi = B.buildInstr(Opc: AMDGPU::G_ABS, DstOps: {S32}, SrcOps: {WideSrcHi});
      B.buildBuildVectorTrunc(Res: DstReg, Ops: {Lo.getReg(Idx: 0), Hi.getReg(Idx: 0)});
      MI.eraseFromParent();
      return;
    }

    // v2s16 binary ops: scalarize into two s32 ops with the extension that
    // matches the operation's signedness, then repack the halves.
    if (DstTy.isVector()) {
      Register WideSrc0Lo, WideSrc0Hi;
      Register WideSrc1Lo, WideSrc1Hi;

      unsigned ExtendOp = getExtendOp(Opc: MI.getOpcode());
      std::tie(args&: WideSrc0Lo, args&: WideSrc0Hi)
        = unpackV2S16ToS32(B, Src: MI.getOperand(i: 1).getReg(), ExtOpcode: ExtendOp);
      std::tie(args&: WideSrc1Lo, args&: WideSrc1Hi)
        = unpackV2S16ToS32(B, Src: MI.getOperand(i: 2).getReg(), ExtOpcode: ExtendOp);
      auto Lo = B.buildInstr(Opc: MI.getOpcode(), DstOps: {S32}, SrcOps: {WideSrc0Lo, WideSrc1Lo});
      auto Hi = B.buildInstr(Opc: MI.getOpcode(), DstOps: {S32}, SrcOps: {WideSrc0Hi, WideSrc1Hi});
      B.buildBuildVectorTrunc(Res: DstReg, Ops: {Lo.getReg(Idx: 0), Hi.getReg(Idx: 0)});
      MI.eraseFromParent();
    } else {
      LegalizerHelper Helper(*MF, ApplySALU, B);

      if (Helper.widenScalar(MI, TypeIdx: 0, WideTy: S32) != LegalizerHelper::Legalized)
        llvm_unreachable("widen scalar should have succeeded");

      // FIXME: s16 shift amounts should be legal.
      if (Opc == AMDGPU::G_SHL || Opc == AMDGPU::G_LSHR ||
          Opc == AMDGPU::G_ASHR) {
        B.setInsertPt(MBB&: *MBB, II: MI.getIterator());
        if (Helper.widenScalar(MI, TypeIdx: 1, WideTy: S32) != LegalizerHelper::Legalized)
          llvm_unreachable("widen scalar should have succeeded");
      }
    }

    return;
  }
  case AMDGPU::G_AMDGPU_S_MUL_I64_I32:
  case AMDGPU::G_AMDGPU_S_MUL_U64_U32: {
    // This is a special case for s_mul_u64. We use
    // G_AMDGPU_S_MUL_I64_I32 opcode to represent an s_mul_u64 operation
    // where the 33 higher bits are sign-extended and
    // G_AMDGPU_S_MUL_U64_U32 opcode to represent an s_mul_u64 operation
    // where the 32 higher bits are zero-extended. In case scalar registers are
    // selected, both opcodes are lowered as s_mul_u64. If the vector registers
    // are selected, then G_AMDGPU_S_MUL_I64_I32 and
    // G_AMDGPU_S_MUL_U64_U32 are lowered with a vector mad instruction.

    // Insert basic copies.
    applyDefaultMapping(OpdMapper);

    Register DstReg = MI.getOperand(i: 0).getReg();
    Register SrcReg0 = MI.getOperand(i: 1).getReg();
    Register SrcReg1 = MI.getOperand(i: 2).getReg();
    const LLT S32 = LLT::scalar(SizeInBits: 32);
    const LLT S64 = LLT::scalar(SizeInBits: 64);
    assert(MRI.getType(DstReg) == S64 && "This is a special case for s_mul_u64 "
                                         "that handles only 64-bit operands.");
    const RegisterBank *DstBank =
      OpdMapper.getInstrMapping().getOperandMapping(i: 0).BreakDown[0].RegBank;

    // Replace G_AMDGPU_S_MUL_I64_I32 and G_AMDGPU_S_MUL_U64_U32
    // with s_mul_u64 operation.
    if (DstBank == &AMDGPU::SGPRRegBank) {
      MI.setDesc(TII->get(Opcode: AMDGPU::S_MUL_U64));
      MRI.setRegClass(Reg: DstReg, RC: &AMDGPU::SGPR_64RegClass);
      MRI.setRegClass(Reg: SrcReg0, RC: &AMDGPU::SGPR_64RegClass);
      MRI.setRegClass(Reg: SrcReg1, RC: &AMDGPU::SGPR_64RegClass);
      return;
    }

    // Replace G_AMDGPU_S_MUL_I64_I32 and G_AMDGPU_S_MUL_U64_U32
    // with a vector mad.
    assert(MRI.getRegBankOrNull(DstReg) == &AMDGPU::VGPRRegBank &&
           "The destination operand should be in vector registers.");

    // Extract the lower subregister from the first operand.
    Register Op0L = MRI.createVirtualRegister(RegClass: &AMDGPU::VGPR_32RegClass);
    MRI.setRegClass(Reg: Op0L, RC: &AMDGPU::VGPR_32RegClass);
    MRI.setType(VReg: Op0L, Ty: S32);
    B.buildTrunc(Res: Op0L, Op: SrcReg0);

    // Extract the lower subregister from the second operand.
    Register Op1L = MRI.createVirtualRegister(RegClass: &AMDGPU::VGPR_32RegClass);
    MRI.setRegClass(Reg: Op1L, RC: &AMDGPU::VGPR_32RegClass);
    MRI.setType(VReg: Op1L, Ty: S32);
    B.buildTrunc(Res: Op1L, Op: SrcReg1);

    unsigned NewOpc = Opc == AMDGPU::G_AMDGPU_S_MUL_U64_U32
                          ? AMDGPU::G_AMDGPU_MAD_U64_U32
                          : AMDGPU::G_AMDGPU_MAD_I64_I32;

    // NOTE(review): this local builder shadows the `B` parameter used above;
    // presumably intentional to insert at MI, but worth confirming it is not
    // an accident.
    MachineIRBuilder B(MI);
    // mad(lo0, lo1, 0): multiplies the low 32-bit halves and adds a 64-bit
    // zero; the carry-out def is required by the mad opcode but unused.
    Register Zero64 = B.buildConstant(Res: S64, Val: 0).getReg(Idx: 0);
    MRI.setRegClass(Reg: Zero64, RC: &AMDGPU::VReg_64RegClass);
    Register CarryOut = MRI.createVirtualRegister(RegClass: &AMDGPU::VReg_64RegClass);
    MRI.setRegClass(Reg: CarryOut, RC: &AMDGPU::VReg_64RegClass);
    B.buildInstr(Opc: NewOpc, DstOps: {DstReg, CarryOut}, SrcOps: {Op0L, Op1L, Zero64});
    MI.eraseFromParent();
    return;
  }
  case AMDGPU::G_SEXT_INREG: {
    SmallVector<Register, 2> SrcRegs(OpdMapper.getVRegs(OpIdx: 1));
    if (SrcRegs.empty())
      break; // Nothing to repair

    const LLT S32 = LLT::scalar(SizeInBits: 32);
    ApplyRegBankMapping O(B, *this, MRI, &AMDGPU::VGPRRegBank);

    // Don't use LegalizerHelper's narrowScalar. It produces unwanted G_SEXTs
    // we would need to further expand, and doesn't let us directly set the
    // result registers.
    SmallVector<Register, 2> DstRegs(OpdMapper.getVRegs(OpIdx: 0));

    int Amt = MI.getOperand(i: 2).getImm();
    if (Amt <= 32) {
      // The sign bit lives in the low half, so the high half is a pure
      // sign-propagation of the low result.
      // Downstream users have expectations for the high bit behavior, so freeze
      // incoming undefined bits.
      if (Amt == 32) {
        // The low bits are unchanged.
        B.buildFreeze(Dst: DstRegs[0], Src: SrcRegs[0]);
      } else {
        auto Freeze = B.buildFreeze(Dst: S32, Src: SrcRegs[0]);
        // Extend in the low bits and propagate the sign bit to the high half.
        B.buildSExtInReg(Res: DstRegs[0], Op: Freeze, ImmOp: Amt);
      }

      B.buildAShr(Dst: DstRegs[1], Src0: DstRegs[0], Src1: B.buildConstant(Res: S32, Val: 31));
    } else {
      // The low bits are unchanged, and extend in the high bits.
      // No freeze required
      B.buildCopy(Res: DstRegs[0], Op: SrcRegs[0]);
      B.buildSExtInReg(Res: DstRegs[1], Op: DstRegs[0], ImmOp: Amt - 32);
    }

    Register DstReg = MI.getOperand(i: 0).getReg();
    MRI.setRegBank(Reg: DstReg, RegBank: AMDGPU::VGPRRegBank);
    MI.eraseFromParent();
    return;
  }
  case AMDGPU::G_CTPOP:
  case AMDGPU::G_BITREVERSE: {
    // Scalar forms exist for these, so SGPR results need no repair.
    const RegisterBank *DstBank =
      OpdMapper.getInstrMapping().getOperandMapping(i: 0).BreakDown[0].RegBank;
    if (DstBank == &AMDGPU::SGPRRegBank)
      break;

    Register SrcReg = MI.getOperand(i: 1).getReg();
    const LLT S32 = LLT::scalar(SizeInBits: 32);
    LLT Ty = MRI.getType(Reg: SrcReg);
    if (Ty == S32)
      break;

    // Wider-than-32-bit VALU cases are narrowed to s32 pieces.
    ApplyRegBankMapping ApplyVALU(B, *this, MRI, &AMDGPU::VGPRRegBank);

    MachineFunction &MF = B.getMF();
    LegalizerHelper Helper(MF, ApplyVALU, B);

    if (Helper.narrowScalar(MI, TypeIdx: 1, NarrowTy: S32) != LegalizerHelper::Legalized)
      llvm_unreachable("narrowScalar should have succeeded");
    return;
  }
  case AMDGPU::G_AMDGPU_FFBH_U32:
  case AMDGPU::G_AMDGPU_FFBL_B32:
  case AMDGPU::G_CTLZ_ZERO_UNDEF:
  case AMDGPU::G_CTTZ_ZERO_UNDEF: {
    const RegisterBank *DstBank =
      OpdMapper.getInstrMapping().getOperandMapping(i: 0).BreakDown[0].RegBank;
    if (DstBank == &AMDGPU::SGPRRegBank)
      break;

    Register SrcReg = MI.getOperand(i: 1).getReg();
    const LLT S32 = LLT::scalar(SizeInBits: 32);
    LLT Ty = MRI.getType(Reg: SrcReg);
    if (Ty == S32)
      break;

    // We can narrow this more efficiently than Helper can by using ffbh/ffbl
    // which return -1 when the input is zero:
    // (ctlz_zero_undef hi:lo) -> (umin (ffbh hi), (add (ffbh lo), 32))
    // (cttz_zero_undef hi:lo) -> (umin (add (ffbl hi), 32), (ffbl lo))
    // (ffbh hi:lo) -> (umin (ffbh hi), (uaddsat (ffbh lo), 32))
    // (ffbl hi:lo) -> (umin (uaddsat (ffbh hi), 32), (ffbh lo))
    ApplyRegBankMapping ApplyVALU(B, *this, MRI, &AMDGPU::VGPRRegBank);
    SmallVector<Register, 2> SrcRegs(OpdMapper.getVRegs(OpIdx: 1));
    // The *_ZERO_UNDEF forms are rewritten to the ffbh/ffbl pseudos, which
    // have defined (-1) behavior on zero inputs.
    unsigned NewOpc = Opc == AMDGPU::G_CTLZ_ZERO_UNDEF
                          ? (unsigned)AMDGPU::G_AMDGPU_FFBH_U32
                          : Opc == AMDGPU::G_CTTZ_ZERO_UNDEF
                                ? (unsigned)AMDGPU::G_AMDGPU_FFBL_B32
                                : Opc;
    // ctlz scans from the high half first; cttz/ffbl from the low half.
    unsigned Idx = NewOpc == AMDGPU::G_AMDGPU_FFBH_U32;
    auto X = B.buildInstr(Opc: NewOpc, DstOps: {S32}, SrcOps: {SrcRegs[Idx]});
    auto Y = B.buildInstr(Opc: NewOpc, DstOps: {S32}, SrcOps: {SrcRegs[Idx ^ 1]});
    unsigned AddOpc =
        Opc == AMDGPU::G_CTLZ_ZERO_UNDEF || Opc == AMDGPU::G_CTTZ_ZERO_UNDEF
            ? AMDGPU::G_ADD
            : AMDGPU::G_UADDSAT;
    Y = B.buildInstr(Opc: AddOpc, DstOps: {S32}, SrcOps: {Y, B.buildConstant(Res: S32, Val: 32)});
    Register DstReg = MI.getOperand(i: 0).getReg();
    B.buildUMin(Dst: DstReg, Src0: X, Src1: Y);
    MI.eraseFromParent();
    return;
  }
  case AMDGPU::G_SEXT:
  case AMDGPU::G_ZEXT:
  case AMDGPU::G_ANYEXT: {
    Register SrcReg = MI.getOperand(i: 1).getReg();
    LLT SrcTy = MRI.getType(Reg: SrcReg);
    const bool Signed = Opc == AMDGPU::G_SEXT;

    assert(OpdMapper.getVRegs(1).empty());

    const RegisterBank *SrcBank =
      OpdMapper.getInstrMapping().getOperandMapping(i: 1).BreakDown[0].RegBank;

    Register DstReg = MI.getOperand(i: 0).getReg();
    LLT DstTy = MRI.getType(Reg: DstReg);
    // Extensions to 64-bit VGPR results are split: extend to 32 bits in the
    // low half, then fill the high half from the low half.
    if (DstTy.isScalar() &&
        SrcBank != &AMDGPU::SGPRRegBank &&
        SrcBank != &AMDGPU::VCCRegBank &&
        // FIXME: Should handle any type that round to s64 when irregular
        // breakdowns supported.
        DstTy.getSizeInBits() == 64 &&
        SrcTy.getSizeInBits() <= 32) {
      SmallVector<Register, 2> DefRegs(OpdMapper.getVRegs(OpIdx: 0));

      // Extend to 32-bit, and then extend the low half.
      if (Signed) {
        // TODO: Should really be buildSExtOrCopy
        B.buildSExtOrTrunc(Res: DefRegs[0], Op: SrcReg);
      } else if (Opc == AMDGPU::G_ZEXT) {
        B.buildZExtOrTrunc(Res: DefRegs[0], Op: SrcReg);
      } else {
        B.buildAnyExtOrTrunc(Res: DefRegs[0], Op: SrcReg);
      }

      extendLow32IntoHigh32(B, Hi32Reg: DefRegs[1], Lo32Reg: DefRegs[0], ExtOpc: Opc, RegBank: *SrcBank);
      MRI.setRegBank(Reg: DstReg, RegBank: *SrcBank);
      MI.eraseFromParent();
      return;
    }

    if (SrcTy != LLT::scalar(SizeInBits: 1))
      return;

    // It is not legal to have a legalization artifact with a VCC source. Rather
    // than introducing a copy, insert the select we would have to select the
    // copy to.
    if (SrcBank == &AMDGPU::VCCRegBank) {
      SmallVector<Register, 2> DefRegs(OpdMapper.getVRegs(OpIdx: 0));

      const RegisterBank *DstBank = &AMDGPU::VGPRRegBank;

      unsigned DstSize = DstTy.getSizeInBits();
      // 64-bit select is SGPR only
      // NOTE(review): SrcBank is VCCRegBank inside this branch, so this
      // condition looks like it is always false — confirm whether the SGPR
      // check is intentional or a leftover.
      const bool UseSel64 = DstSize > 32 &&
        SrcBank->getID() == AMDGPU::SGPRRegBankID;

      // TODO: Should s16 select be legal?
      LLT SelType = UseSel64 ? LLT::scalar(SizeInBits: 64) : LLT::scalar(SizeInBits: 32);
      // sext selects -1 (all ones); zext/anyext select 1.
      auto True = B.buildConstant(Res: SelType, Val: Signed ? -1 : 1);
      auto False = B.buildConstant(Res: SelType, Val: 0);

      MRI.setRegBank(Reg: True.getReg(Idx: 0), RegBank: *DstBank);
      MRI.setRegBank(Reg: False.getReg(Idx: 0), RegBank: *DstBank);
      MRI.setRegBank(Reg: DstReg, RegBank: *DstBank);

      if (DstSize > 32) {
        B.buildSelect(Res: DefRegs[0], Tst: SrcReg, Op0: True, Op1: False);
        extendLow32IntoHigh32(B, Hi32Reg: DefRegs[1], Lo32Reg: DefRegs[0], ExtOpc: Opc, RegBank: *SrcBank, IsBooleanSrc: true);
      } else if (DstSize < 32) {
        auto Sel = B.buildSelect(Res: SelType, Tst: SrcReg, Op0: True, Op1: False);
        MRI.setRegBank(Reg: Sel.getReg(Idx: 0), RegBank: *DstBank);
        B.buildTrunc(Res: DstReg, Op: Sel);
      } else {
        B.buildSelect(Res: DstReg, Tst: SrcReg, Op0: True, Op1: False);
      }

      MI.eraseFromParent();
      return;
    }

    break;
  }
  case AMDGPU::G_EXTRACT_VECTOR_ELT: {
    SmallVector<Register, 2> DstRegs(OpdMapper.getVRegs(OpIdx: 0));

    assert(OpdMapper.getVRegs(1).empty() && OpdMapper.getVRegs(2).empty());

    Register DstReg = MI.getOperand(i: 0).getReg();
    Register SrcReg = MI.getOperand(i: 1).getReg();

    const LLT S32 = LLT::scalar(SizeInBits: 32);
    LLT DstTy = MRI.getType(Reg: DstReg);
    LLT SrcTy = MRI.getType(Reg: SrcReg);

    // Small vectors with a divergent index may be cheaper as a compare/select
    // chain than a waterfall loop.
    if (foldExtractEltToCmpSelect(B, MI, OpdMapper))
      return;

    const ValueMapping &DstMapping
      = OpdMapper.getInstrMapping().getOperandMapping(i: 0);
    const RegisterBank *DstBank = DstMapping.BreakDown[0].RegBank;
    const RegisterBank *SrcBank =
      OpdMapper.getInstrMapping().getOperandMapping(i: 1).BreakDown[0].RegBank;
    const RegisterBank *IdxBank =
        OpdMapper.getInstrMapping().getOperandMapping(i: 2).BreakDown[0].RegBank;

    Register BaseIdxReg;
    unsigned ConstOffset;
    std::tie(args&: BaseIdxReg, args&: ConstOffset) =
        AMDGPU::getBaseWithConstantOffset(MRI, Reg: MI.getOperand(i: 2).getReg());

    // See if the index is an add of a constant which will be foldable by moving
    // the base register of the index later if this is going to be executed in a
    // waterfall loop. This is essentially to reassociate the add of a constant
    // with the readfirstlane.
    bool ShouldMoveIndexIntoLoop = IdxBank != &AMDGPU::SGPRRegBank &&
                                   ConstOffset > 0 &&
                                   ConstOffset < SrcTy.getNumElements();

    // Move the base register. We'll re-insert the add later.
    if (ShouldMoveIndexIntoLoop)
      MI.getOperand(i: 2).setReg(BaseIdxReg);

    // If this is a VGPR result only because the index was a VGPR result, the
    // actual indexing will be done on the SGPR source vector, which will
    // produce a scalar result. We need to copy to the VGPR result inside the
    // waterfall loop.
    const bool NeedCopyToVGPR = DstBank == &AMDGPU::VGPRRegBank &&
                                SrcBank == &AMDGPU::SGPRRegBank;
    // No 64-bit result split requested: run the original extract inside a
    // waterfall loop over the index.
    if (DstRegs.empty()) {
      applyDefaultMapping(OpdMapper);

      executeInWaterfallLoop(B, MI, OpIndices: {2});

      if (NeedCopyToVGPR) {
        // We don't want a phi for this temporary reg.
        Register TmpReg = MRI.createGenericVirtualRegister(Ty: DstTy);
        MRI.setRegBank(Reg: TmpReg, RegBank: AMDGPU::SGPRRegBank);
        MI.getOperand(i: 0).setReg(TmpReg);
        B.setInsertPt(MBB&: *MI.getParent(), II: ++MI.getIterator());

        // Use a v_mov_b32 here to make the exec dependency explicit.
        buildVCopy(B, DstReg, SrcReg: TmpReg);
      }

      // Re-insert the constant offset add inside the waterfall loop.
      if (ShouldMoveIndexIntoLoop)
        reinsertVectorIndexAdd(B, IdxUseInstr&: MI, OpIdx: 2, ConstOffset);

      return;
    }

    // 64-bit element: bitcast the source to a vector of s32 and extract the
    // two 32-bit halves separately.
    assert(DstTy.getSizeInBits() == 64);

    LLT Vec32 = LLT::fixed_vector(NumElements: 2 * SrcTy.getNumElements(), ScalarSizeInBits: 32);

    auto CastSrc = B.buildBitcast(Dst: Vec32, Src: SrcReg);
    auto One = B.buildConstant(Res: S32, Val: 1);

    MachineBasicBlock::iterator MII = MI.getIterator();

    // Split the vector index into 32-bit pieces. Prepare to move all of the
    // new instructions into a waterfall loop if necessary.
    //
    // Don't put the bitcast or constant in the loop.
    MachineInstrSpan Span(MII, &B.getMBB());

    // Compute 32-bit element indices, (2 * OrigIdx, 2 * OrigIdx + 1).
    auto IdxLo = B.buildShl(Dst: S32, Src0: BaseIdxReg, Src1: One);
    auto IdxHi = B.buildAdd(Dst: S32, Src0: IdxLo, Src1: One);

    auto Extract0 = B.buildExtractVectorElement(Res: DstRegs[0], Val: CastSrc, Idx: IdxLo);
    auto Extract1 = B.buildExtractVectorElement(Res: DstRegs[1], Val: CastSrc, Idx: IdxHi);

    MRI.setRegBank(Reg: DstReg, RegBank: *DstBank);
    MRI.setRegBank(Reg: CastSrc.getReg(Idx: 0), RegBank: *SrcBank);
    MRI.setRegBank(Reg: One.getReg(Idx: 0), RegBank: AMDGPU::SGPRRegBank);
    MRI.setRegBank(Reg: IdxLo.getReg(Idx: 0), RegBank: AMDGPU::SGPRRegBank);
    MRI.setRegBank(Reg: IdxHi.getReg(Idx: 0), RegBank: AMDGPU::SGPRRegBank);

    // If the index turns out to be uniform, no waterfall loop is needed.
    SmallSet<Register, 4> OpsToWaterfall;
    if (!collectWaterfallOperands(SGPROperandRegs&: OpsToWaterfall, MI, MRI, OpIndices: { 2 })) {
      MI.eraseFromParent();
      return;
    }

    // Remove the original instruction to avoid potentially confusing the
    // waterfall loop logic.
    B.setInstr(*Span.begin());
    MI.eraseFromParent();
    executeInWaterfallLoop(B, Range: make_range(x: Span.begin(), y: Span.end()),
                           SGPROperandRegs&: OpsToWaterfall);

    if (NeedCopyToVGPR) {
      // Redirect both extracts to SGPR temporaries and v_mov the results out,
      // keeping the exec dependency explicit.
      MachineBasicBlock *LoopBB = Extract1->getParent();
      Register TmpReg0 = MRI.createGenericVirtualRegister(Ty: S32);
      Register TmpReg1 = MRI.createGenericVirtualRegister(Ty: S32);
      MRI.setRegBank(Reg: TmpReg0, RegBank: AMDGPU::SGPRRegBank);
      MRI.setRegBank(Reg: TmpReg1, RegBank: AMDGPU::SGPRRegBank);

      Extract0->getOperand(i: 0).setReg(TmpReg0);
      Extract1->getOperand(i: 0).setReg(TmpReg1);

      B.setInsertPt(MBB&: *LoopBB, II: ++Extract1->getIterator());

      buildVCopy(B, DstReg: DstRegs[0], SrcReg: TmpReg0);
      buildVCopy(B, DstReg: DstRegs[1], SrcReg: TmpReg1);
    }

    if (ShouldMoveIndexIntoLoop)
      reinsertVectorIndexAdd(B, IdxUseInstr&: *IdxLo, OpIdx: 1, ConstOffset);

    return;
  }
  case AMDGPU::G_INSERT_VECTOR_ELT: {
    SmallVector<Register, 2> InsRegs(OpdMapper.getVRegs(OpIdx: 2));

    Register DstReg = MI.getOperand(i: 0).getReg();
    LLT VecTy = MRI.getType(Reg: DstReg);

    assert(OpdMapper.getVRegs(0).empty());
    assert(OpdMapper.getVRegs(3).empty());

    if (substituteSimpleCopyRegs(OpdMapper, OpIdx: 1))
      MRI.setType(VReg: MI.getOperand(i: 1).getReg(), Ty: VecTy);

    // Small vectors with a divergent index may be cheaper as a compare/select
    // chain than a waterfall loop.
    if (foldInsertEltToCmpSelect(B, MI, OpdMapper))
      return;

    const RegisterBank *IdxBank =
      OpdMapper.getInstrMapping().getOperandMapping(i: 3).BreakDown[0].RegBank;

    Register SrcReg = MI.getOperand(i: 1).getReg();
    Register InsReg = MI.getOperand(i: 2).getReg();
    LLT InsTy = MRI.getType(Reg: InsReg);
    (void)InsTy;

    Register BaseIdxReg;
    unsigned ConstOffset;
    std::tie(args&: BaseIdxReg, args&: ConstOffset) =
        AMDGPU::getBaseWithConstantOffset(MRI, Reg: MI.getOperand(i: 3).getReg());

    // See if the index is an add of a constant which will be foldable by moving
    // the base register of the index later if this is going to be executed in a
    // waterfall loop. This is essentially to reassociate the add of a constant
    // with the readfirstlane.
    bool ShouldMoveIndexIntoLoop = IdxBank != &AMDGPU::SGPRRegBank &&
                                   ConstOffset > 0 &&
                                   ConstOffset < VecTy.getNumElements();

    // Move the base register. We'll re-insert the add later.
    if (ShouldMoveIndexIntoLoop)
      MI.getOperand(i: 3).setReg(BaseIdxReg);


    // No 64-bit split of the inserted value requested: run the original
    // insert inside a waterfall loop over the index.
    if (InsRegs.empty()) {
      executeInWaterfallLoop(B, MI, OpIndices: {3});

      // Re-insert the constant offset add inside the waterfall loop.
      if (ShouldMoveIndexIntoLoop) {
        reinsertVectorIndexAdd(B, IdxUseInstr&: MI, OpIdx: 3, ConstOffset);
      }

      return;
    }

    // 64-bit element: bitcast to a vector of s32 and insert the two halves at
    // (2 * idx, 2 * idx + 1).
    assert(InsTy.getSizeInBits() == 64);

    const LLT S32 = LLT::scalar(SizeInBits: 32);
    LLT Vec32 = LLT::fixed_vector(NumElements: 2 * VecTy.getNumElements(), ScalarSizeInBits: 32);

    auto CastSrc = B.buildBitcast(Dst: Vec32, Src: SrcReg);
    auto One = B.buildConstant(Res: S32, Val: 1);

    // Split the vector index into 32-bit pieces. Prepare to move all of the
    // new instructions into a waterfall loop if necessary.
    //
    // Don't put the bitcast or constant in the loop.
    MachineInstrSpan Span(MachineBasicBlock::iterator(&MI), &B.getMBB());

    // Compute 32-bit element indices, (2 * OrigIdx, 2 * OrigIdx + 1).
    auto IdxLo = B.buildShl(Dst: S32, Src0: BaseIdxReg, Src1: One);
    auto IdxHi = B.buildAdd(Dst: S32, Src0: IdxLo, Src1: One);

    auto InsLo = B.buildInsertVectorElement(Res: Vec32, Val: CastSrc, Elt: InsRegs[0], Idx: IdxLo);
    auto InsHi = B.buildInsertVectorElement(Res: Vec32, Val: InsLo, Elt: InsRegs[1], Idx: IdxHi);

    const RegisterBank *DstBank =
      OpdMapper.getInstrMapping().getOperandMapping(i: 0).BreakDown[0].RegBank;
    const RegisterBank *SrcBank =
      OpdMapper.getInstrMapping().getOperandMapping(i: 1).BreakDown[0].RegBank;
    const RegisterBank *InsSrcBank =
      OpdMapper.getInstrMapping().getOperandMapping(i: 2).BreakDown[0].RegBank;

    MRI.setRegBank(Reg: InsReg, RegBank: *InsSrcBank);
    MRI.setRegBank(Reg: CastSrc.getReg(Idx: 0), RegBank: *SrcBank);
    MRI.setRegBank(Reg: InsLo.getReg(Idx: 0), RegBank: *DstBank);
    MRI.setRegBank(Reg: InsHi.getReg(Idx: 0), RegBank: *DstBank);
    MRI.setRegBank(Reg: One.getReg(Idx: 0), RegBank: AMDGPU::SGPRRegBank);
    MRI.setRegBank(Reg: IdxLo.getReg(Idx: 0), RegBank: AMDGPU::SGPRRegBank);
    MRI.setRegBank(Reg: IdxHi.getReg(Idx: 0), RegBank: AMDGPU::SGPRRegBank);


    // If the index turns out to be uniform, no waterfall loop is needed; just
    // cast the result back to the original vector type.
    SmallSet<Register, 4> OpsToWaterfall;
    if (!collectWaterfallOperands(SGPROperandRegs&: OpsToWaterfall, MI, MRI, OpIndices: { 3 })) {
      B.setInsertPt(MBB&: B.getMBB(), II: MI);
      B.buildBitcast(Dst: DstReg, Src: InsHi);
      MI.eraseFromParent();
      return;
    }

    B.setInstr(*Span.begin());
    MI.eraseFromParent();

    // Figure out the point after the waterfall loop before mangling the control
    // flow.
    executeInWaterfallLoop(B, Range: make_range(x: Span.begin(), y: Span.end()),
                           SGPROperandRegs&: OpsToWaterfall);

    // The insertion point is now right after the original instruction.
    //
    // Keep the bitcast to the original vector type out of the loop. Doing this
    // saved an extra phi we don't need inside the loop.
    B.buildBitcast(Dst: DstReg, Src: InsHi);

    // Re-insert the constant offset add inside the waterfall loop.
    if (ShouldMoveIndexIntoLoop)
      reinsertVectorIndexAdd(B, IdxUseInstr&: *IdxLo, OpIdx: 1, ConstOffset);

    return;
  }
  case AMDGPU::G_AMDGPU_BUFFER_LOAD:
  case AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT:
  case AMDGPU::G_AMDGPU_BUFFER_LOAD_SSHORT:
  case AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE:
  case AMDGPU::G_AMDGPU_BUFFER_LOAD_SBYTE:
  case AMDGPU::G_AMDGPU_BUFFER_LOAD_TFE:
  case AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT_TFE:
  case AMDGPU::G_AMDGPU_BUFFER_LOAD_SSHORT_TFE:
  case AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE_TFE:
  case AMDGPU::G_AMDGPU_BUFFER_LOAD_SBYTE_TFE:
  case AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT:
  case AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_TFE:
  case AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_D16:
  case AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT:
  case AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT_D16:
  case AMDGPU::G_AMDGPU_BUFFER_STORE:
  case AMDGPU::G_AMDGPU_BUFFER_STORE_BYTE:
  case AMDGPU::G_AMDGPU_BUFFER_STORE_SHORT:
  case AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT:
  case AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT_D16:
  case AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT:
  case AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT_D16: {
    applyDefaultMapping(OpdMapper);
    // Operands 1 (rsrc) and 4 (soffset) must be uniform; waterfall over them
    // if they are divergent.
    executeInWaterfallLoop(B, MI, OpIndices: {1, 4});
    return;
  }
  case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SWAP:
  case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_ADD:
  case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB:
  case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMIN:
  case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMIN:
  case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMAX:
  case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMAX:
  case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_AND:
  case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_OR:
  case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_XOR:
  case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC:
  case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC:
  case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB_CLAMP_U32:
  case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_COND_SUB_U32:
  case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD:
  case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMIN:
  case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMAX: {
    applyDefaultMapping(OpdMapper);
    // Atomics carry a vdata operand first, shifting rsrc/soffset to 2 and 5.
    executeInWaterfallLoop(B, MI, OpIndices: {2, 5});
    return;
  }
  case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_CMPSWAP: {
    applyDefaultMapping(OpdMapper);
    // cmpswap also carries a compare operand, shifting rsrc/soffset to 3 and 6.
    executeInWaterfallLoop(B, MI, OpIndices: {3, 6});
    return;
  }
  case AMDGPU::G_AMDGPU_S_BUFFER_LOAD:
  case AMDGPU::G_AMDGPU_S_BUFFER_LOAD_UBYTE:
  case AMDGPU::G_AMDGPU_S_BUFFER_LOAD_SBYTE:
  case AMDGPU::G_AMDGPU_S_BUFFER_LOAD_USHORT:
  case AMDGPU::G_AMDGPU_S_BUFFER_LOAD_SSHORT: {
    // Scalar buffer loads have their own handling (possible conversion to a
    // VGPR buffer load when operands are divergent).
    applyMappingSBufferLoad(B, OpdMapper);
    return;
  }
  case AMDGPU::G_AMDGPU_S_BUFFER_PREFETCH:
    // Both the resource (0) and offset (2) must be uniform scalars.
    constrainOpWithReadfirstlane(B, MI, OpIdx: 0);
    constrainOpWithReadfirstlane(B, MI, OpIdx: 2);
    return;
  case AMDGPU::G_INTRINSIC:
  case AMDGPU::G_INTRINSIC_CONVERGENT: {
    // Per-intrinsic repair; most cases only need uniform operands forced into
    // SGPRs with readfirstlane.
    switch (cast<GIntrinsic>(Val&: MI).getIntrinsicID()) {
    case Intrinsic::amdgcn_readlane: {
      substituteSimpleCopyRegs(OpdMapper, OpIdx: 2);

      assert(OpdMapper.getVRegs(0).empty());
      assert(OpdMapper.getVRegs(3).empty());

      // Make sure the index is an SGPR. It doesn't make sense to run this in a
      // waterfall loop, so assume it's a uniform value.
      constrainOpWithReadfirstlane(B, MI, OpIdx: 3); // Index
      return;
    }
    case Intrinsic::amdgcn_writelane: {
      assert(OpdMapper.getVRegs(0).empty());
      assert(OpdMapper.getVRegs(2).empty());
      assert(OpdMapper.getVRegs(3).empty());

      substituteSimpleCopyRegs(OpdMapper, OpIdx: 4); // VGPR input val
      constrainOpWithReadfirstlane(B, MI, OpIdx: 2); // Source value
      constrainOpWithReadfirstlane(B, MI, OpIdx: 3); // Index
      return;
    }
    case Intrinsic::amdgcn_interp_p1:
    case Intrinsic::amdgcn_interp_p2:
    case Intrinsic::amdgcn_interp_mov:
    case Intrinsic::amdgcn_interp_p1_f16:
    case Intrinsic::amdgcn_interp_p2_f16:
    case Intrinsic::amdgcn_lds_param_load: {
      applyDefaultMapping(OpdMapper);

      // Readlane for m0 value, which is always the last operand.
      // FIXME: Should this be a waterfall loop instead?
      constrainOpWithReadfirstlane(B, MI, OpIdx: MI.getNumOperands() - 1); // Index
      return;
    }
    case Intrinsic::amdgcn_interp_inreg_p10:
    case Intrinsic::amdgcn_interp_inreg_p2:
    case Intrinsic::amdgcn_interp_inreg_p10_f16:
    case Intrinsic::amdgcn_interp_inreg_p2_f16:
    case Intrinsic::amdgcn_interp_p10_rtz_f16:
    case Intrinsic::amdgcn_interp_p2_rtz_f16:
    case Intrinsic::amdgcn_permlane16_swap:
    case Intrinsic::amdgcn_permlane32_swap:
      applyDefaultMapping(OpdMapper);
      return;
    case Intrinsic::amdgcn_permlane16:
    case Intrinsic::amdgcn_permlanex16: {
      // Doing a waterfall loop over these wouldn't make any sense.
      substituteSimpleCopyRegs(OpdMapper, OpIdx: 2);
      substituteSimpleCopyRegs(OpdMapper, OpIdx: 3);
      constrainOpWithReadfirstlane(B, MI, OpIdx: 4);
      constrainOpWithReadfirstlane(B, MI, OpIdx: 5);
      return;
    }
3199 case Intrinsic::amdgcn_permlane_bcast:
3200 case Intrinsic::amdgcn_permlane_up:
3201 case Intrinsic::amdgcn_permlane_down:
3202 case Intrinsic::amdgcn_permlane_xor:
3203 // Doing a waterfall loop over these wouldn't make any sense.
3204 constrainOpWithReadfirstlane(B, MI, OpIdx: 3);
3205 constrainOpWithReadfirstlane(B, MI, OpIdx: 4);
3206 return;
3207 case Intrinsic::amdgcn_permlane_idx_gen: {
3208 constrainOpWithReadfirstlane(B, MI, OpIdx: 3);
3209 return;
3210 }
3211 case Intrinsic::amdgcn_sbfe:
3212 applyMappingBFE(B, OpdMapper, Signed: true);
3213 return;
3214 case Intrinsic::amdgcn_ubfe:
3215 applyMappingBFE(B, OpdMapper, Signed: false);
3216 return;
3217 case Intrinsic::amdgcn_inverse_ballot:
3218 case Intrinsic::amdgcn_s_bitreplicate:
3219 case Intrinsic::amdgcn_s_quadmask:
3220 case Intrinsic::amdgcn_s_wqm:
3221 applyDefaultMapping(OpdMapper);
3222 constrainOpWithReadfirstlane(B, MI, OpIdx: 2); // Mask
3223 return;
3224 case Intrinsic::amdgcn_ballot:
3225 // Use default handling and insert copy to vcc source.
3226 break;
3227 }
3228 break;
3229 }
3230 case AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD:
3231 case AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD_D16:
3232 case AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD_NORET:
3233 case AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE:
3234 case AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE_D16: {
3235 const AMDGPU::RsrcIntrinsic *RSrcIntrin =
3236 AMDGPU::lookupRsrcIntrinsic(Intr: AMDGPU::getIntrinsicID(I: MI));
3237 assert(RSrcIntrin && RSrcIntrin->IsImage);
3238 // Non-images can have complications from operands that allow both SGPR
3239 // and VGPR. For now it's too complicated to figure out the final opcode
3240 // to derive the register bank from the MCInstrDesc.
3241 applyMappingImage(B, MI, OpdMapper, RsrcIdx: RSrcIntrin->RsrcArg);
3242 return;
3243 }
3244 case AMDGPU::G_AMDGPU_BVH_INTERSECT_RAY:
3245 case AMDGPU::G_AMDGPU_BVH8_INTERSECT_RAY:
3246 case AMDGPU::G_AMDGPU_BVH_DUAL_INTERSECT_RAY: {
3247 bool IsDualOrBVH8 =
3248 MI.getOpcode() == AMDGPU::G_AMDGPU_BVH_DUAL_INTERSECT_RAY ||
3249 MI.getOpcode() == AMDGPU::G_AMDGPU_BVH8_INTERSECT_RAY;
3250 unsigned NumMods = IsDualOrBVH8 ? 0 : 1; // Has A16 modifier
3251 unsigned LastRegOpIdx = MI.getNumExplicitOperands() - 1 - NumMods;
3252 applyDefaultMapping(OpdMapper);
3253 executeInWaterfallLoop(B, MI, OpIndices: {LastRegOpIdx});
3254 return;
3255 }
3256 case AMDGPU::G_INTRINSIC_W_SIDE_EFFECTS:
3257 case AMDGPU::G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS: {
3258 auto IntrID = cast<GIntrinsic>(Val&: MI).getIntrinsicID();
3259 switch (IntrID) {
3260 case Intrinsic::amdgcn_ds_ordered_add:
3261 case Intrinsic::amdgcn_ds_ordered_swap: {
3262 // This is only allowed to execute with 1 lane, so readfirstlane is safe.
3263 assert(OpdMapper.getVRegs(0).empty());
3264 substituteSimpleCopyRegs(OpdMapper, OpIdx: 3);
3265 constrainOpWithReadfirstlane(B, MI, OpIdx: 2); // M0
3266 return;
3267 }
3268 case Intrinsic::amdgcn_ds_gws_init:
3269 case Intrinsic::amdgcn_ds_gws_barrier:
3270 case Intrinsic::amdgcn_ds_gws_sema_br: {
3271 // Only the first lane is executes, so readfirstlane is safe.
3272 substituteSimpleCopyRegs(OpdMapper, OpIdx: 1);
3273 constrainOpWithReadfirstlane(B, MI, OpIdx: 2); // M0
3274 return;
3275 }
3276 case Intrinsic::amdgcn_ds_gws_sema_v:
3277 case Intrinsic::amdgcn_ds_gws_sema_p:
3278 case Intrinsic::amdgcn_ds_gws_sema_release_all: {
3279 // Only the first lane is executes, so readfirstlane is safe.
3280 constrainOpWithReadfirstlane(B, MI, OpIdx: 1); // M0
3281 return;
3282 }
3283 case Intrinsic::amdgcn_ds_append:
3284 case Intrinsic::amdgcn_ds_consume: {
3285 constrainOpWithReadfirstlane(B, MI, OpIdx: 2); // M0
3286 return;
3287 }
3288 case Intrinsic::amdgcn_s_alloc_vgpr:
3289 constrainOpWithReadfirstlane(B, MI, OpIdx: 2);
3290 return;
3291 case Intrinsic::amdgcn_s_sendmsg:
3292 case Intrinsic::amdgcn_s_sendmsghalt: {
3293 // FIXME: Should this use a waterfall loop?
3294 constrainOpWithReadfirstlane(B, MI, OpIdx: 2); // M0
3295 return;
3296 }
3297 case Intrinsic::amdgcn_s_setreg: {
3298 constrainOpWithReadfirstlane(B, MI, OpIdx: 2);
3299 return;
3300 }
3301 case Intrinsic::amdgcn_s_ttracedata:
3302 constrainOpWithReadfirstlane(B, MI, OpIdx: 1); // M0
3303 return;
3304 case Intrinsic::amdgcn_raw_buffer_load_lds:
3305 case Intrinsic::amdgcn_raw_buffer_load_async_lds:
3306 case Intrinsic::amdgcn_raw_ptr_buffer_load_lds:
3307 case Intrinsic::amdgcn_raw_ptr_buffer_load_async_lds: {
3308 applyDefaultMapping(OpdMapper);
3309 constrainOpWithReadfirstlane(B, MI, OpIdx: 1); // rsrc
3310 constrainOpWithReadfirstlane(B, MI, OpIdx: 2); // M0
3311 constrainOpWithReadfirstlane(B, MI, OpIdx: 5); // soffset
3312 return;
3313 }
3314 case Intrinsic::amdgcn_struct_buffer_load_lds:
3315 case Intrinsic::amdgcn_struct_buffer_load_async_lds:
3316 case Intrinsic::amdgcn_struct_ptr_buffer_load_lds:
3317 case Intrinsic::amdgcn_struct_ptr_buffer_load_async_lds: {
3318 applyDefaultMapping(OpdMapper);
3319 constrainOpWithReadfirstlane(B, MI, OpIdx: 1); // rsrc
3320 constrainOpWithReadfirstlane(B, MI, OpIdx: 2); // M0
3321 constrainOpWithReadfirstlane(B, MI, OpIdx: 6); // soffset
3322 return;
3323 }
3324 case Intrinsic::amdgcn_cluster_load_async_to_lds_b8:
3325 case Intrinsic::amdgcn_cluster_load_async_to_lds_b32:
3326 case Intrinsic::amdgcn_cluster_load_async_to_lds_b64:
3327 case Intrinsic::amdgcn_cluster_load_async_to_lds_b128: {
3328 applyDefaultMapping(OpdMapper);
3329 constrainOpWithReadfirstlane(B, MI, OpIdx: 5);
3330 return;
3331 }
3332 case Intrinsic::amdgcn_load_to_lds:
3333 case Intrinsic::amdgcn_load_async_to_lds:
3334 case Intrinsic::amdgcn_global_load_lds:
3335 case Intrinsic::amdgcn_global_load_async_lds: {
3336 applyDefaultMapping(OpdMapper);
3337 constrainOpWithReadfirstlane(B, MI, OpIdx: 2);
3338 return;
3339 }
3340 case Intrinsic::amdgcn_lds_direct_load: {
3341 applyDefaultMapping(OpdMapper);
3342 // Readlane for m0 value, which is always the last operand.
3343 constrainOpWithReadfirstlane(B, MI, OpIdx: MI.getNumOperands() - 1); // Index
3344 return;
3345 }
3346 case Intrinsic::amdgcn_exp_row:
3347 applyDefaultMapping(OpdMapper);
3348 constrainOpWithReadfirstlane(B, MI, OpIdx: 8); // M0
3349 return;
3350 case Intrinsic::amdgcn_cluster_load_b32:
3351 case Intrinsic::amdgcn_cluster_load_b64:
3352 case Intrinsic::amdgcn_cluster_load_b128: {
3353 applyDefaultMapping(OpdMapper);
3354 constrainOpWithReadfirstlane(B, MI, OpIdx: 4); // M0
3355 return;
3356 }
3357 case Intrinsic::amdgcn_s_sleep_var:
3358 assert(OpdMapper.getVRegs(1).empty());
3359 constrainOpWithReadfirstlane(B, MI, OpIdx: 1);
3360 return;
3361 case Intrinsic::amdgcn_s_barrier_join:
3362 case Intrinsic::amdgcn_s_wakeup_barrier:
3363 constrainOpWithReadfirstlane(B, MI, OpIdx: 1);
3364 return;
3365 case Intrinsic::amdgcn_s_barrier_init:
3366 case Intrinsic::amdgcn_s_barrier_signal_var:
3367 constrainOpWithReadfirstlane(B, MI, OpIdx: 1);
3368 constrainOpWithReadfirstlane(B, MI, OpIdx: 2);
3369 return;
3370 case Intrinsic::amdgcn_s_get_barrier_state:
3371 case Intrinsic::amdgcn_s_get_named_barrier_state: {
3372 constrainOpWithReadfirstlane(B, MI, OpIdx: 2);
3373 return;
3374 }
3375 case Intrinsic::amdgcn_s_prefetch_data: {
3376 Register PtrReg = MI.getOperand(i: 1).getReg();
3377 unsigned AS = MRI.getType(Reg: PtrReg).getAddressSpace();
3378 if (AMDGPU::isFlatGlobalAddrSpace(AS)) {
3379 constrainOpWithReadfirstlane(B, MI, OpIdx: 1);
3380 constrainOpWithReadfirstlane(B, MI, OpIdx: 2);
3381 } else
3382 MI.eraseFromParent();
3383 return;
3384 }
3385 case Intrinsic::amdgcn_tensor_load_to_lds:
3386 case Intrinsic::amdgcn_tensor_store_from_lds: {
3387 constrainOpWithReadfirstlane(B, MI, OpIdx: 1);
3388 constrainOpWithReadfirstlane(B, MI, OpIdx: 2);
3389 constrainOpWithReadfirstlane(B, MI, OpIdx: 3);
3390 constrainOpWithReadfirstlane(B, MI, OpIdx: 4);
3391 constrainOpWithReadfirstlane(B, MI, OpIdx: 5);
3392 return;
3393 }
3394 default: {
3395 if (const AMDGPU::RsrcIntrinsic *RSrcIntrin =
3396 AMDGPU::lookupRsrcIntrinsic(Intr: IntrID)) {
3397 // Non-images can have complications from operands that allow both SGPR
3398 // and VGPR. For now it's too complicated to figure out the final opcode
3399 // to derive the register bank from the MCInstrDesc.
3400 if (RSrcIntrin->IsImage) {
3401 applyMappingImage(B, MI, OpdMapper, RsrcIdx: RSrcIntrin->RsrcArg);
3402 return;
3403 }
3404 }
3405
3406 break;
3407 }
3408 }
3409 break;
3410 }
3411 case AMDGPU::G_SI_CALL: {
3412 // Use a set to avoid extra readfirstlanes in the case where multiple
3413 // operands are the same register.
3414 SmallSet<Register, 4> SGPROperandRegs;
3415
3416 if (!collectWaterfallOperands(SGPROperandRegs, MI, MRI, OpIndices: {1}))
3417 break;
3418
3419 // Move all copies to physical SGPRs that are used by the call instruction
3420 // into the loop block. Start searching for these copies until the
3421 // ADJCALLSTACKUP.
3422 unsigned FrameSetupOpcode = AMDGPU::ADJCALLSTACKUP;
3423 unsigned FrameDestroyOpcode = AMDGPU::ADJCALLSTACKDOWN;
3424
3425 // Move all non-copies before the copies, so that a complete range can be
3426 // moved into the waterfall loop.
3427 SmallVector<MachineInstr *, 4> NonCopyInstrs;
3428 // Count of NonCopyInstrs found until the current LastCopy.
3429 unsigned NonCopyInstrsLen = 0;
3430 MachineBasicBlock::iterator Start(&MI);
3431 MachineBasicBlock::iterator LastCopy = Start;
3432 MachineBasicBlock *MBB = MI.getParent();
3433 const SIMachineFunctionInfo *Info =
3434 MBB->getParent()->getInfo<SIMachineFunctionInfo>();
3435 while (Start->getOpcode() != FrameSetupOpcode) {
3436 --Start;
3437 bool IsCopy = false;
3438 if (Start->getOpcode() == AMDGPU::COPY) {
3439 auto &Dst = Start->getOperand(i: 0);
3440 if (Dst.isReg()) {
3441 Register Reg = Dst.getReg();
3442 if (Reg.isPhysical() && MI.readsRegister(Reg, TRI)) {
3443 IsCopy = true;
3444 } else {
3445 // Also move the copy from the scratch rsrc descriptor into the loop
3446 // to allow it to be optimized away.
3447 auto &Src = Start->getOperand(i: 1);
3448 if (Src.isReg()) {
3449 Reg = Src.getReg();
3450 IsCopy = Info->getScratchRSrcReg() == Reg;
3451 }
3452 }
3453 }
3454 }
3455
3456 if (IsCopy) {
3457 LastCopy = Start;
3458 NonCopyInstrsLen = NonCopyInstrs.size();
3459 } else {
3460 NonCopyInstrs.push_back(Elt: &*Start);
3461 }
3462 }
3463 NonCopyInstrs.resize(N: NonCopyInstrsLen);
3464
3465 for (auto *NonCopy : reverse(C&: NonCopyInstrs)) {
3466 MBB->splice(Where: LastCopy, Other: MBB, From: NonCopy->getIterator());
3467 }
3468 Start = LastCopy;
3469
3470 // Do the same for copies after the loop
3471 NonCopyInstrs.clear();
3472 NonCopyInstrsLen = 0;
3473 MachineBasicBlock::iterator End(&MI);
3474 LastCopy = End;
3475 while (End->getOpcode() != FrameDestroyOpcode) {
3476 ++End;
3477 bool IsCopy = false;
3478 if (End->getOpcode() == AMDGPU::COPY) {
3479 auto &Src = End->getOperand(i: 1);
3480 if (Src.isReg()) {
3481 Register Reg = Src.getReg();
3482 IsCopy = Reg.isPhysical() && MI.modifiesRegister(Reg, TRI);
3483 }
3484 }
3485
3486 if (IsCopy) {
3487 LastCopy = End;
3488 NonCopyInstrsLen = NonCopyInstrs.size();
3489 } else {
3490 NonCopyInstrs.push_back(Elt: &*End);
3491 }
3492 }
3493 NonCopyInstrs.resize(N: NonCopyInstrsLen);
3494
3495 End = LastCopy;
3496 ++LastCopy;
3497 for (auto *NonCopy : reverse(C&: NonCopyInstrs)) {
3498 MBB->splice(Where: LastCopy, Other: MBB, From: NonCopy->getIterator());
3499 }
3500
3501 ++End;
3502 B.setInsertPt(MBB&: B.getMBB(), II: Start);
3503 executeInWaterfallLoop(B, Range: make_range(x: Start, y: End), SGPROperandRegs);
3504 break;
3505 }
3506 case AMDGPU::G_AMDGPU_FLAT_LOAD_MONITOR:
3507 case AMDGPU::G_AMDGPU_GLOBAL_LOAD_MONITOR:
3508 case AMDGPU::G_LOAD:
3509 case AMDGPU::G_ZEXTLOAD:
3510 case AMDGPU::G_SEXTLOAD: {
3511 if (applyMappingLoad(B, OpdMapper, MI))
3512 return;
3513 break;
3514 }
3515 case AMDGPU::G_DYN_STACKALLOC:
3516 applyMappingDynStackAlloc(B, OpdMapper, MI);
3517 return;
3518 case AMDGPU::G_STACKRESTORE: {
3519 applyDefaultMapping(OpdMapper);
3520 constrainOpWithReadfirstlane(B, MI, OpIdx: 0);
3521 return;
3522 }
3523 case AMDGPU::G_SBFX:
3524 applyMappingBFE(B, OpdMapper, /*Signed*/ true);
3525 return;
3526 case AMDGPU::G_UBFX:
3527 applyMappingBFE(B, OpdMapper, /*Signed*/ false);
3528 return;
3529 case AMDGPU::G_AMDGPU_MAD_U64_U32:
3530 case AMDGPU::G_AMDGPU_MAD_I64_I32:
3531 applyMappingMAD_64_32(B, OpdMapper);
3532 return;
3533 case AMDGPU::G_PREFETCH: {
3534 if (!Subtarget.hasSafeSmemPrefetch() && !Subtarget.hasVmemPrefInsts()) {
3535 MI.eraseFromParent();
3536 return;
3537 }
3538 Register PtrReg = MI.getOperand(i: 0).getReg();
3539 unsigned PtrBank = getRegBankID(Reg: PtrReg, MRI, Default: AMDGPU::SGPRRegBankID);
3540 if (PtrBank == AMDGPU::VGPRRegBankID &&
3541 (!Subtarget.hasVmemPrefInsts() || !MI.getOperand(i: 3).getImm())) {
3542 // Cannot do I$ prefetch with divergent pointer.
3543 MI.eraseFromParent();
3544 return;
3545 }
3546 unsigned AS = MRI.getType(Reg: PtrReg).getAddressSpace();
3547 if ((!AMDGPU::isFlatGlobalAddrSpace(AS) &&
3548 AS != AMDGPUAS::CONSTANT_ADDRESS_32BIT) ||
3549 (!Subtarget.hasSafeSmemPrefetch() &&
3550 (AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT ||
3551 !MI.getOperand(i: 3).getImm() /* I$ prefetch */))) {
3552 MI.eraseFromParent();
3553 return;
3554 }
3555 applyDefaultMapping(OpdMapper);
3556 return;
3557 }
3558 default:
3559 break;
3560 }
3561
3562 return applyDefaultMapping(OpdMapper);
3563}
3564
3565// vgpr, sgpr -> vgpr
3566// vgpr, agpr -> vgpr
3567// agpr, agpr -> agpr
3568// agpr, sgpr -> vgpr
3569static unsigned regBankUnion(unsigned RB0, unsigned RB1) {
3570 if (RB0 == AMDGPU::InvalidRegBankID)
3571 return RB1;
3572 if (RB1 == AMDGPU::InvalidRegBankID)
3573 return RB0;
3574
3575 if (RB0 == AMDGPU::SGPRRegBankID && RB1 == AMDGPU::SGPRRegBankID)
3576 return AMDGPU::SGPRRegBankID;
3577
3578 if (RB0 == AMDGPU::AGPRRegBankID && RB1 == AMDGPU::AGPRRegBankID)
3579 return AMDGPU::AGPRRegBankID;
3580
3581 return AMDGPU::VGPRRegBankID;
3582}
3583
3584static unsigned regBankBoolUnion(unsigned RB0, unsigned RB1) {
3585 if (RB0 == AMDGPU::InvalidRegBankID)
3586 return RB1;
3587 if (RB1 == AMDGPU::InvalidRegBankID)
3588 return RB0;
3589
3590 // vcc, vcc -> vcc
3591 // vcc, sgpr -> vcc
3592 // vcc, vgpr -> vcc
3593 if (RB0 == AMDGPU::VCCRegBankID || RB1 == AMDGPU::VCCRegBankID)
3594 return AMDGPU::VCCRegBankID;
3595
3596 // vcc, vgpr -> vgpr
3597 return regBankUnion(RB0, RB1);
3598}
3599
3600unsigned AMDGPURegisterBankInfo::getMappingType(const MachineRegisterInfo &MRI,
3601 const MachineInstr &MI) const {
3602 unsigned RegBank = AMDGPU::InvalidRegBankID;
3603
3604 for (const MachineOperand &MO : MI.operands()) {
3605 if (!MO.isReg())
3606 continue;
3607 Register Reg = MO.getReg();
3608 if (const RegisterBank *Bank = getRegBank(Reg, MRI, TRI: *TRI)) {
3609 RegBank = regBankUnion(RB0: RegBank, RB1: Bank->getID());
3610 if (RegBank == AMDGPU::VGPRRegBankID)
3611 break;
3612 }
3613 }
3614
3615 return RegBank;
3616}
3617
3618bool AMDGPURegisterBankInfo::isSALUMapping(const MachineInstr &MI) const {
3619 const MachineFunction &MF = *MI.getMF();
3620 const MachineRegisterInfo &MRI = MF.getRegInfo();
3621 for (const MachineOperand &MO : MI.operands()) {
3622 if (!MO.isReg())
3623 continue;
3624 Register Reg = MO.getReg();
3625 if (const RegisterBank *Bank = getRegBank(Reg, MRI, TRI: *TRI)) {
3626 if (Bank->getID() != AMDGPU::SGPRRegBankID)
3627 return false;
3628 }
3629 }
3630 return true;
3631}
3632
3633const RegisterBankInfo::InstructionMapping &
3634AMDGPURegisterBankInfo::getDefaultMappingSOP(const MachineInstr &MI) const {
3635 const MachineFunction &MF = *MI.getMF();
3636 const MachineRegisterInfo &MRI = MF.getRegInfo();
3637 SmallVector<const ValueMapping*, 8> OpdsMapping(MI.getNumOperands());
3638
3639 for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
3640 const MachineOperand &SrcOp = MI.getOperand(i);
3641 if (!SrcOp.isReg())
3642 continue;
3643
3644 unsigned Size = getSizeInBits(Reg: SrcOp.getReg(), MRI, TRI: *TRI);
3645 OpdsMapping[i] = AMDGPU::getValueMapping(BankID: AMDGPU::SGPRRegBankID, Size);
3646 }
3647 return getInstructionMapping(ID: 1, Cost: 1, OperandsMapping: getOperandsMapping(OpdsMapping),
3648 NumOperands: MI.getNumOperands());
3649}
3650
3651const RegisterBankInfo::InstructionMapping &
3652AMDGPURegisterBankInfo::getDefaultMappingVOP(const MachineInstr &MI) const {
3653 const MachineFunction &MF = *MI.getMF();
3654 const MachineRegisterInfo &MRI = MF.getRegInfo();
3655 SmallVector<const ValueMapping*, 8> OpdsMapping(MI.getNumOperands());
3656
3657 // Even though we technically could use SGPRs, this would require knowledge of
3658 // the constant bus restriction. Force all sources to VGPR (except for VCC).
3659 //
3660 // TODO: Unary ops are trivially OK, so accept SGPRs?
3661 for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
3662 const MachineOperand &Src = MI.getOperand(i);
3663 if (!Src.isReg())
3664 continue;
3665
3666 unsigned Size = getSizeInBits(Reg: Src.getReg(), MRI, TRI: *TRI);
3667 unsigned BankID = Size == 1 ? AMDGPU::VCCRegBankID : AMDGPU::VGPRRegBankID;
3668 OpdsMapping[i] = AMDGPU::getValueMapping(BankID, Size);
3669 }
3670
3671 return getInstructionMapping(ID: 1, Cost: 1, OperandsMapping: getOperandsMapping(OpdsMapping),
3672 NumOperands: MI.getNumOperands());
3673}
3674
3675const RegisterBankInfo::InstructionMapping &
3676AMDGPURegisterBankInfo::getDefaultMappingAllVGPR(const MachineInstr &MI) const {
3677 const MachineFunction &MF = *MI.getMF();
3678 const MachineRegisterInfo &MRI = MF.getRegInfo();
3679 SmallVector<const ValueMapping*, 8> OpdsMapping(MI.getNumOperands());
3680
3681 for (unsigned I = 0, E = MI.getNumOperands(); I != E; ++I) {
3682 const MachineOperand &Op = MI.getOperand(i: I);
3683 if (!Op.isReg())
3684 continue;
3685
3686 unsigned Size = getSizeInBits(Reg: Op.getReg(), MRI, TRI: *TRI);
3687 OpdsMapping[I] = AMDGPU::getValueMapping(BankID: AMDGPU::VGPRRegBankID, Size);
3688 }
3689
3690 return getInstructionMapping(ID: 1, Cost: 1, OperandsMapping: getOperandsMapping(OpdsMapping),
3691 NumOperands: MI.getNumOperands());
3692}
3693
3694const RegisterBankInfo::InstructionMapping &
3695AMDGPURegisterBankInfo::getImageMapping(const MachineRegisterInfo &MRI,
3696 const MachineInstr &MI,
3697 int RsrcIdx) const {
3698 // The reported argument index is relative to the IR intrinsic call arguments,
3699 // so we need to shift by the number of defs and the intrinsic ID.
3700 RsrcIdx += MI.getNumExplicitDefs() + 1;
3701
3702 const int NumOps = MI.getNumOperands();
3703 SmallVector<const ValueMapping *, 8> OpdsMapping(NumOps);
3704
3705 // TODO: Should packed/unpacked D16 difference be reported here as part of
3706 // the value mapping?
3707 for (int I = 0; I != NumOps; ++I) {
3708 if (!MI.getOperand(i: I).isReg())
3709 continue;
3710
3711 Register OpReg = MI.getOperand(i: I).getReg();
3712 // We replace some dead address operands with $noreg
3713 if (!OpReg)
3714 continue;
3715
3716 unsigned Size = getSizeInBits(Reg: OpReg, MRI, TRI: *TRI);
3717
3718 // FIXME: Probably need a new intrinsic register bank searchable table to
3719 // handle arbitrary intrinsics easily.
3720 //
3721 // If this has a sampler, it immediately follows rsrc.
3722 const bool MustBeSGPR = I == RsrcIdx || I == RsrcIdx + 1;
3723
3724 if (MustBeSGPR) {
3725 // If this must be an SGPR, so we must report whatever it is as legal.
3726 unsigned NewBank = getRegBankID(Reg: OpReg, MRI, Default: AMDGPU::SGPRRegBankID);
3727 OpdsMapping[I] = AMDGPU::getValueMapping(BankID: NewBank, Size);
3728 } else {
3729 // Some operands must be VGPR, and these are easy to copy to.
3730 OpdsMapping[I] = AMDGPU::getValueMapping(BankID: AMDGPU::VGPRRegBankID, Size);
3731 }
3732 }
3733
3734 return getInstructionMapping(ID: 1, Cost: 1, OperandsMapping: getOperandsMapping(OpdsMapping), NumOperands: NumOps);
3735}
3736
3737/// Return the mapping for a pointer argument.
3738const RegisterBankInfo::ValueMapping *
3739AMDGPURegisterBankInfo::getValueMappingForPtr(const MachineRegisterInfo &MRI,
3740 Register PtrReg) const {
3741 LLT PtrTy = MRI.getType(Reg: PtrReg);
3742 unsigned Size = PtrTy.getSizeInBits();
3743 if (Subtarget.useFlatForGlobal() ||
3744 !AMDGPU::isFlatGlobalAddrSpace(AS: PtrTy.getAddressSpace()))
3745 return AMDGPU::getValueMapping(BankID: AMDGPU::VGPRRegBankID, Size);
3746
3747 // If we're using MUBUF instructions for global memory, an SGPR base register
3748 // is possible. Otherwise this needs to be a VGPR.
3749 const RegisterBank *PtrBank = getRegBank(Reg: PtrReg, MRI, TRI: *TRI);
3750 return AMDGPU::getValueMapping(BankID: PtrBank->getID(), Size);
3751}
3752
3753const RegisterBankInfo::InstructionMapping &
3754AMDGPURegisterBankInfo::getInstrMappingForLoad(const MachineInstr &MI) const {
3755
3756 const MachineFunction &MF = *MI.getMF();
3757 const MachineRegisterInfo &MRI = MF.getRegInfo();
3758 SmallVector<const ValueMapping*, 2> OpdsMapping(2);
3759 unsigned Size = getSizeInBits(Reg: MI.getOperand(i: 0).getReg(), MRI, TRI: *TRI);
3760 Register PtrReg = MI.getOperand(i: 1).getReg();
3761 LLT PtrTy = MRI.getType(Reg: PtrReg);
3762 unsigned AS = PtrTy.getAddressSpace();
3763 unsigned PtrSize = PtrTy.getSizeInBits();
3764
3765 const ValueMapping *ValMapping;
3766 const ValueMapping *PtrMapping;
3767
3768 const RegisterBank *PtrBank = getRegBank(Reg: PtrReg, MRI, TRI: *TRI);
3769
3770 if (PtrBank == &AMDGPU::SGPRRegBank && AMDGPU::isFlatGlobalAddrSpace(AS)) {
3771 if (isScalarLoadLegal(MI)) {
3772 // We have a uniform instruction so we want to use an SMRD load
3773 ValMapping = AMDGPU::getValueMapping(BankID: AMDGPU::SGPRRegBankID, Size);
3774 PtrMapping = AMDGPU::getValueMapping(BankID: AMDGPU::SGPRRegBankID, Size: PtrSize);
3775 } else {
3776 ValMapping = AMDGPU::getValueMapping(BankID: AMDGPU::VGPRRegBankID, Size);
3777
3778 // If we're using MUBUF instructions for global memory, an SGPR base
3779 // register is possible. Otherwise this needs to be a VGPR.
3780 unsigned PtrBankID = Subtarget.useFlatForGlobal() ?
3781 AMDGPU::VGPRRegBankID : AMDGPU::SGPRRegBankID;
3782
3783 PtrMapping = AMDGPU::getValueMapping(BankID: PtrBankID, Size: PtrSize);
3784 }
3785 } else {
3786 ValMapping = AMDGPU::getValueMapping(BankID: AMDGPU::VGPRRegBankID, Size);
3787 PtrMapping = AMDGPU::getValueMapping(BankID: AMDGPU::VGPRRegBankID, Size: PtrSize);
3788 }
3789
3790 OpdsMapping[0] = ValMapping;
3791 OpdsMapping[1] = PtrMapping;
3792 const RegisterBankInfo::InstructionMapping &Mapping = getInstructionMapping(
3793 ID: 1, Cost: 1, OperandsMapping: getOperandsMapping(OpdsMapping), NumOperands: MI.getNumOperands());
3794 return Mapping;
3795
3796 // FIXME: Do we want to add a mapping for FLAT load, or should we just
3797 // handle that during instruction selection?
3798}
3799
3800unsigned
3801AMDGPURegisterBankInfo::getRegBankID(Register Reg,
3802 const MachineRegisterInfo &MRI,
3803 unsigned Default) const {
3804 const RegisterBank *Bank = getRegBank(Reg, MRI, TRI: *TRI);
3805 return Bank ? Bank->getID() : Default;
3806}
3807
3808const RegisterBankInfo::ValueMapping *
3809AMDGPURegisterBankInfo::getSGPROpMapping(Register Reg,
3810 const MachineRegisterInfo &MRI,
3811 const TargetRegisterInfo &TRI) const {
3812 // Lie and claim anything is legal, even though this needs to be an SGPR
3813 // applyMapping will have to deal with it as a waterfall loop.
3814 unsigned Bank = getRegBankID(Reg, MRI, Default: AMDGPU::SGPRRegBankID);
3815 unsigned Size = getSizeInBits(Reg, MRI, TRI);
3816 return AMDGPU::getValueMapping(BankID: Bank, Size);
3817}
3818
3819const RegisterBankInfo::ValueMapping *
3820AMDGPURegisterBankInfo::getVGPROpMapping(Register Reg,
3821 const MachineRegisterInfo &MRI,
3822 const TargetRegisterInfo &TRI) const {
3823 unsigned Size = getSizeInBits(Reg, MRI, TRI);
3824 return AMDGPU::getValueMapping(BankID: AMDGPU::VGPRRegBankID, Size);
3825}
3826
3827const RegisterBankInfo::ValueMapping *
3828AMDGPURegisterBankInfo::getAGPROpMapping(Register Reg,
3829 const MachineRegisterInfo &MRI,
3830 const TargetRegisterInfo &TRI) const {
3831 unsigned Size = getSizeInBits(Reg, MRI, TRI);
3832 return AMDGPU::getValueMapping(BankID: AMDGPU::AGPRRegBankID, Size);
3833}
3834
3835///
3836/// This function must return a legal mapping, because
3837/// AMDGPURegisterBankInfo::getInstrAlternativeMappings() is not called
3838/// in RegBankSelect::Mode::Fast. Any mapping that would cause a
3839/// VGPR to SGPR generated is illegal.
3840///
3841// Operands that must be SGPRs must accept potentially divergent VGPRs as
3842// legal. These will be dealt with in applyMappingImpl.
3843//
3844const RegisterBankInfo::InstructionMapping &
3845AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
3846 const MachineFunction &MF = *MI.getMF();
3847 const MachineRegisterInfo &MRI = MF.getRegInfo();
3848
3849 if (MI.isCopy() || MI.getOpcode() == AMDGPU::G_FREEZE) {
3850 Register DstReg = MI.getOperand(i: 0).getReg();
3851 Register SrcReg = MI.getOperand(i: 1).getReg();
3852
3853 // The default logic bothers to analyze impossible alternative mappings. We
3854 // want the most straightforward mapping, so just directly handle this.
3855 const RegisterBank *DstBank = getRegBank(Reg: DstReg, MRI, TRI: *TRI);
3856 const RegisterBank *SrcBank = getRegBank(Reg: SrcReg, MRI, TRI: *TRI);
3857
3858 // For COPY between a physical reg and an s1, there is no type associated so
3859 // we need to take the virtual register's type as a hint on how to interpret
3860 // s1 values.
3861 unsigned Size;
3862 if (!SrcReg.isVirtual() && !DstBank &&
3863 MRI.getType(Reg: DstReg) == LLT::scalar(SizeInBits: 1)) {
3864 DstBank = &AMDGPU::VCCRegBank;
3865 Size = 1;
3866 } else if (!DstReg.isVirtual() && MRI.getType(Reg: SrcReg) == LLT::scalar(SizeInBits: 1)) {
3867 DstBank = &AMDGPU::VCCRegBank;
3868 Size = 1;
3869 } else {
3870 Size = getSizeInBits(Reg: DstReg, MRI, TRI: *TRI);
3871 }
3872
3873 if (!DstBank)
3874 DstBank = SrcBank;
3875 else if (!SrcBank)
3876 SrcBank = DstBank;
3877
3878 if (MI.getOpcode() != AMDGPU::G_FREEZE &&
3879 cannotCopy(Dst: *DstBank, Src: *SrcBank, Size: TypeSize::getFixed(ExactSize: Size)))
3880 return getInvalidInstructionMapping();
3881
3882 const ValueMapping &ValMap = getValueMapping(StartIdx: 0, Length: Size, RegBank: *DstBank);
3883 unsigned OpdsMappingSize = MI.isCopy() ? 1 : 2;
3884 SmallVector<const ValueMapping *, 1> OpdsMapping(OpdsMappingSize);
3885 OpdsMapping[0] = &ValMap;
3886 if (MI.getOpcode() == AMDGPU::G_FREEZE)
3887 OpdsMapping[1] = &ValMap;
3888
3889 return getInstructionMapping(
3890 ID: 1, /*Cost*/ 1,
3891 /*OperandsMapping*/ getOperandsMapping(OpdsMapping), NumOperands: OpdsMappingSize);
3892 }
3893
3894 if (MI.isRegSequence()) {
3895 // If any input is a VGPR, the result must be a VGPR. The default handling
3896 // assumes any copy between banks is legal.
3897 unsigned BankID = AMDGPU::SGPRRegBankID;
3898
3899 for (unsigned I = 1, E = MI.getNumOperands(); I != E; I += 2) {
3900 auto OpBank = getRegBankID(Reg: MI.getOperand(i: I).getReg(), MRI);
3901 // It doesn't make sense to use vcc or scc banks here, so just ignore
3902 // them.
3903 if (OpBank != AMDGPU::SGPRRegBankID) {
3904 BankID = AMDGPU::VGPRRegBankID;
3905 break;
3906 }
3907 }
3908 unsigned Size = getSizeInBits(Reg: MI.getOperand(i: 0).getReg(), MRI, TRI: *TRI);
3909
3910 const ValueMapping &ValMap = getValueMapping(StartIdx: 0, Length: Size, RegBank: getRegBank(ID: BankID));
3911 return getInstructionMapping(
3912 ID: 1, /*Cost*/ 1,
3913 /*OperandsMapping*/ getOperandsMapping(OpdsMapping: {&ValMap}), NumOperands: 1);
3914 }
3915
3916 // The default handling is broken and doesn't handle illegal SGPR->VGPR copies
3917 // properly.
3918 //
3919 // TODO: There are additional exec masking dependencies to analyze.
3920 if (auto *PHI = dyn_cast<GPhi>(Val: &MI)) {
3921 unsigned ResultBank = AMDGPU::InvalidRegBankID;
3922 Register DstReg = PHI->getReg(Idx: 0);
3923
3924 // Sometimes the result may have already been assigned a bank.
3925 if (const RegisterBank *DstBank = getRegBank(Reg: DstReg, MRI, TRI: *TRI))
3926 ResultBank = DstBank->getID();
3927
3928 for (unsigned I = 0; I < PHI->getNumIncomingValues(); ++I) {
3929 Register Reg = PHI->getIncomingValue(I);
3930 const RegisterBank *Bank = getRegBank(Reg, MRI, TRI: *TRI);
3931
3932 // FIXME: Assuming VGPR for any undetermined inputs.
3933 if (!Bank || Bank->getID() == AMDGPU::VGPRRegBankID) {
3934 ResultBank = AMDGPU::VGPRRegBankID;
3935 break;
3936 }
3937
3938 // FIXME: Need to promote SGPR case to s32
3939 unsigned OpBank = Bank->getID();
3940 ResultBank = regBankBoolUnion(RB0: ResultBank, RB1: OpBank);
3941 }
3942
3943 assert(ResultBank != AMDGPU::InvalidRegBankID);
3944
3945 unsigned Size = MRI.getType(Reg: DstReg).getSizeInBits();
3946
3947 const ValueMapping &ValMap =
3948 getValueMapping(StartIdx: 0, Length: Size, RegBank: getRegBank(ID: ResultBank));
3949 return getInstructionMapping(
3950 ID: 1, /*Cost*/ 1,
3951 /*OperandsMapping*/ getOperandsMapping(OpdsMapping: {&ValMap}), NumOperands: 1);
3952 }
3953
3954 const RegisterBankInfo::InstructionMapping &Mapping = getInstrMappingImpl(MI);
3955 if (Mapping.isValid())
3956 return Mapping;
3957
3958 SmallVector<const ValueMapping*, 8> OpdsMapping(MI.getNumOperands());
3959
3960 switch (MI.getOpcode()) {
3961 default:
3962 return getInvalidInstructionMapping();
3963
3964 case AMDGPU::G_AND:
3965 case AMDGPU::G_OR:
3966 case AMDGPU::G_XOR:
3967 case AMDGPU::G_MUL: {
3968 unsigned Size = MRI.getType(Reg: MI.getOperand(i: 0).getReg()).getSizeInBits();
3969 if (Size == 1) {
3970 const RegisterBank *DstBank
3971 = getRegBank(Reg: MI.getOperand(i: 0).getReg(), MRI, TRI: *TRI);
3972
3973 unsigned TargetBankID = AMDGPU::InvalidRegBankID;
3974 unsigned BankLHS = AMDGPU::InvalidRegBankID;
3975 unsigned BankRHS = AMDGPU::InvalidRegBankID;
3976 if (DstBank) {
3977 TargetBankID = DstBank->getID();
3978 if (DstBank == &AMDGPU::VCCRegBank) {
3979 TargetBankID = AMDGPU::VCCRegBankID;
3980 BankLHS = AMDGPU::VCCRegBankID;
3981 BankRHS = AMDGPU::VCCRegBankID;
3982 } else {
3983 BankLHS = getRegBankID(Reg: MI.getOperand(i: 1).getReg(), MRI,
3984 Default: AMDGPU::SGPRRegBankID);
3985 BankRHS = getRegBankID(Reg: MI.getOperand(i: 2).getReg(), MRI,
3986 Default: AMDGPU::SGPRRegBankID);
3987 }
3988 } else {
3989 BankLHS = getRegBankID(Reg: MI.getOperand(i: 1).getReg(), MRI,
3990 Default: AMDGPU::VCCRegBankID);
3991 BankRHS = getRegBankID(Reg: MI.getOperand(i: 2).getReg(), MRI,
3992 Default: AMDGPU::VCCRegBankID);
3993
3994 // Both inputs should be true booleans to produce a boolean result.
3995 if (BankLHS == AMDGPU::VGPRRegBankID || BankRHS == AMDGPU::VGPRRegBankID) {
3996 TargetBankID = AMDGPU::VGPRRegBankID;
3997 } else if (BankLHS == AMDGPU::VCCRegBankID || BankRHS == AMDGPU::VCCRegBankID) {
3998 TargetBankID = AMDGPU::VCCRegBankID;
3999 BankLHS = AMDGPU::VCCRegBankID;
4000 BankRHS = AMDGPU::VCCRegBankID;
4001 } else if (BankLHS == AMDGPU::SGPRRegBankID && BankRHS == AMDGPU::SGPRRegBankID) {
4002 TargetBankID = AMDGPU::SGPRRegBankID;
4003 }
4004 }
4005
4006 OpdsMapping[0] = AMDGPU::getValueMapping(BankID: TargetBankID, Size);
4007 OpdsMapping[1] = AMDGPU::getValueMapping(BankID: BankLHS, Size);
4008 OpdsMapping[2] = AMDGPU::getValueMapping(BankID: BankRHS, Size);
4009 break;
4010 }
4011
4012 if (Size == 64) {
4013
4014 if (isSALUMapping(MI)) {
4015 OpdsMapping[0] = getValueMappingSGPR64Only(BankID: AMDGPU::SGPRRegBankID, Size);
4016 OpdsMapping[1] = OpdsMapping[2] = OpdsMapping[0];
4017 } else {
4018 if (MI.getOpcode() == AMDGPU::G_MUL && Subtarget.hasVectorMulU64())
4019 OpdsMapping[0] = AMDGPU::getValueMapping(BankID: AMDGPU::VGPRRegBankID, Size);
4020 else
4021 OpdsMapping[0] =
4022 getValueMappingSGPR64Only(BankID: AMDGPU::VGPRRegBankID, Size);
4023 unsigned Bank1 = getRegBankID(Reg: MI.getOperand(i: 1).getReg(), MRI /*, DefaultBankID*/);
4024 OpdsMapping[1] = AMDGPU::getValueMapping(BankID: Bank1, Size);
4025
4026 unsigned Bank2 = getRegBankID(Reg: MI.getOperand(i: 2).getReg(), MRI /*, DefaultBankID*/);
4027 OpdsMapping[2] = AMDGPU::getValueMapping(BankID: Bank2, Size);
4028 }
4029
4030 break;
4031 }
4032
4033 [[fallthrough]];
4034 }
4035 case AMDGPU::G_PTR_ADD:
4036 case AMDGPU::G_PTRMASK:
4037 case AMDGPU::G_ADD:
4038 case AMDGPU::G_SUB:
4039 case AMDGPU::G_SHL:
4040 case AMDGPU::G_LSHR:
4041 case AMDGPU::G_ASHR:
4042 case AMDGPU::G_UADDO:
4043 case AMDGPU::G_USUBO:
4044 case AMDGPU::G_UADDE:
4045 case AMDGPU::G_SADDE:
4046 case AMDGPU::G_USUBE:
4047 case AMDGPU::G_SSUBE:
4048 case AMDGPU::G_ABS:
4049 case AMDGPU::G_SHUFFLE_VECTOR:
4050 case AMDGPU::G_SBFX:
4051 case AMDGPU::G_UBFX:
4052 case AMDGPU::G_AMDGPU_S_MUL_I64_I32:
4053 case AMDGPU::G_AMDGPU_S_MUL_U64_U32:
4054 if (isSALUMapping(MI))
4055 return getDefaultMappingSOP(MI);
4056 return getDefaultMappingVOP(MI);
4057 case AMDGPU::G_SMIN:
4058 case AMDGPU::G_SMAX:
4059 case AMDGPU::G_UMIN:
4060 case AMDGPU::G_UMAX:
4061 if (isSALUMapping(MI)) {
4062 // There are no scalar 64-bit min and max, use vector instruction instead.
4063 if (MRI.getType(Reg: MI.getOperand(i: 0).getReg()).getSizeInBits() == 64 &&
4064 Subtarget.hasIntMinMax64())
4065 return getDefaultMappingVOP(MI);
4066 return getDefaultMappingSOP(MI);
4067 }
4068 return getDefaultMappingVOP(MI);
4069 case AMDGPU::G_FADD:
4070 case AMDGPU::G_FSUB:
4071 case AMDGPU::G_FMUL:
4072 case AMDGPU::G_FMA:
4073 case AMDGPU::G_FFLOOR:
4074 case AMDGPU::G_FCEIL:
4075 case AMDGPU::G_INTRINSIC_ROUNDEVEN:
4076 case AMDGPU::G_FMINNUM:
4077 case AMDGPU::G_FMAXNUM:
4078 case AMDGPU::G_FMINIMUM:
4079 case AMDGPU::G_FMAXIMUM:
4080 case AMDGPU::G_FMINIMUMNUM:
4081 case AMDGPU::G_FMAXIMUMNUM:
4082 case AMDGPU::G_INTRINSIC_TRUNC:
4083 case AMDGPU::G_STRICT_FADD:
4084 case AMDGPU::G_STRICT_FSUB:
4085 case AMDGPU::G_STRICT_FMUL:
4086 case AMDGPU::G_STRICT_FMA: {
4087 LLT Ty = MRI.getType(Reg: MI.getOperand(i: 0).getReg());
4088 unsigned Size = Ty.getSizeInBits();
4089 if (Subtarget.hasSALUFloatInsts() && Ty.isScalar() &&
4090 (Size == 32 || Size == 16) && isSALUMapping(MI))
4091 return getDefaultMappingSOP(MI);
4092 return getDefaultMappingVOP(MI);
4093 }
4094 case AMDGPU::G_FPTOSI:
4095 case AMDGPU::G_FPTOUI:
4096 case AMDGPU::G_FPTOSI_SAT:
4097 case AMDGPU::G_FPTOUI_SAT:
4098 case AMDGPU::G_SITOFP:
4099 case AMDGPU::G_UITOFP: {
4100 unsigned SizeDst = MRI.getType(Reg: MI.getOperand(i: 0).getReg()).getSizeInBits();
4101 unsigned SizeSrc = MRI.getType(Reg: MI.getOperand(i: 1).getReg()).getSizeInBits();
4102 if (Subtarget.hasSALUFloatInsts() && SizeDst == 32 && SizeSrc == 32 &&
4103 isSALUMapping(MI))
4104 return getDefaultMappingSOP(MI);
4105 return getDefaultMappingVOP(MI);
4106 }
4107 case AMDGPU::G_FPTRUNC:
4108 case AMDGPU::G_FPEXT: {
4109 unsigned SizeDst = MRI.getType(Reg: MI.getOperand(i: 0).getReg()).getSizeInBits();
4110 unsigned SizeSrc = MRI.getType(Reg: MI.getOperand(i: 1).getReg()).getSizeInBits();
4111 if (Subtarget.hasSALUFloatInsts() && SizeDst != 64 && SizeSrc != 64 &&
4112 isSALUMapping(MI))
4113 return getDefaultMappingSOP(MI);
4114 return getDefaultMappingVOP(MI);
4115 }
4116 case AMDGPU::G_FSQRT:
4117 case AMDGPU::G_FEXP2:
4118 case AMDGPU::G_FLOG2: {
4119 unsigned Size = MRI.getType(Reg: MI.getOperand(i: 0).getReg()).getSizeInBits();
4120 if (Subtarget.hasPseudoScalarTrans() && (Size == 16 || Size == 32) &&
4121 isSALUMapping(MI))
4122 return getDefaultMappingSOP(MI);
4123 return getDefaultMappingVOP(MI);
4124 }
4125 case AMDGPU::G_SADDSAT: // FIXME: Could lower sat ops for SALU
4126 case AMDGPU::G_SSUBSAT:
4127 case AMDGPU::G_UADDSAT:
4128 case AMDGPU::G_USUBSAT:
4129 case AMDGPU::G_FMAD:
4130 case AMDGPU::G_FLDEXP:
4131 case AMDGPU::G_FMINNUM_IEEE:
4132 case AMDGPU::G_FMAXNUM_IEEE:
4133 case AMDGPU::G_FCANONICALIZE:
4134 case AMDGPU::G_STRICT_FLDEXP:
4135 case AMDGPU::G_BSWAP: // TODO: Somehow expand for scalar?
4136 case AMDGPU::G_FSHR: // TODO: Expand for scalar
4137 case AMDGPU::G_AMDGPU_FMIN_LEGACY:
4138 case AMDGPU::G_AMDGPU_FMAX_LEGACY:
4139 case AMDGPU::G_AMDGPU_RCP_IFLAG:
4140 case AMDGPU::G_AMDGPU_CVT_F32_UBYTE0:
4141 case AMDGPU::G_AMDGPU_CVT_F32_UBYTE1:
4142 case AMDGPU::G_AMDGPU_CVT_F32_UBYTE2:
4143 case AMDGPU::G_AMDGPU_CVT_F32_UBYTE3:
4144 case AMDGPU::G_AMDGPU_CVT_PK_I16_I32:
4145 case AMDGPU::G_AMDGPU_SMED3:
4146 case AMDGPU::G_AMDGPU_FMED3:
4147 return getDefaultMappingVOP(MI);
4148 case AMDGPU::G_UMULH:
4149 case AMDGPU::G_SMULH: {
4150 if (Subtarget.hasScalarMulHiInsts() && isSALUMapping(MI))
4151 return getDefaultMappingSOP(MI);
4152 return getDefaultMappingVOP(MI);
4153 }
4154 case AMDGPU::G_AMDGPU_MAD_U64_U32:
4155 case AMDGPU::G_AMDGPU_MAD_I64_I32: {
4156 // Three possible mappings:
4157 //
4158 // - Default SOP
4159 // - Default VOP
4160 // - Scalar multiply: src0 and src1 are SGPRs, the rest is VOP.
4161 //
4162 // This allows instruction selection to keep the multiplication part of the
4163 // instruction on the SALU.
4164 bool AllSalu = true;
4165 bool MulSalu = true;
4166 for (unsigned i = 0; i < 5; ++i) {
4167 Register Reg = MI.getOperand(i).getReg();
4168 if (const RegisterBank *Bank = getRegBank(Reg, MRI, TRI: *TRI)) {
4169 if (Bank->getID() != AMDGPU::SGPRRegBankID) {
4170 AllSalu = false;
4171 if (i == 2 || i == 3) {
4172 MulSalu = false;
4173 break;
4174 }
4175 }
4176 }
4177 }
4178
4179 if (AllSalu)
4180 return getDefaultMappingSOP(MI);
4181
4182 // If the multiply-add is full-rate in VALU, use that even if the
4183 // multiplication part is scalar. Accumulating separately on the VALU would
4184 // take two instructions.
4185 if (!MulSalu || Subtarget.hasFullRate64Ops())
4186 return getDefaultMappingVOP(MI);
4187
4188 // Keep the multiplication on the SALU, then accumulate on the VALU.
4189 OpdsMapping[0] = AMDGPU::getValueMapping(BankID: AMDGPU::VGPRRegBankID, Size: 64);
4190 OpdsMapping[1] = AMDGPU::getValueMapping(BankID: AMDGPU::VCCRegBankID, Size: 1);
4191 OpdsMapping[2] = AMDGPU::getValueMapping(BankID: AMDGPU::SGPRRegBankID, Size: 32);
4192 OpdsMapping[3] = AMDGPU::getValueMapping(BankID: AMDGPU::SGPRRegBankID, Size: 32);
4193 OpdsMapping[4] = AMDGPU::getValueMapping(BankID: AMDGPU::VGPRRegBankID, Size: 64);
4194 break;
4195 }
4196 case AMDGPU::G_IMPLICIT_DEF: {
4197 unsigned Size = MRI.getType(Reg: MI.getOperand(i: 0).getReg()).getSizeInBits();
4198 OpdsMapping[0] = AMDGPU::getValueMapping(BankID: AMDGPU::SGPRRegBankID, Size);
4199 break;
4200 }
4201 case AMDGPU::G_FCONSTANT:
4202 case AMDGPU::G_CONSTANT:
4203 case AMDGPU::G_GLOBAL_VALUE:
4204 case AMDGPU::G_FRAME_INDEX:
4205 case AMDGPU::G_BLOCK_ADDR:
4206 case AMDGPU::G_READSTEADYCOUNTER:
4207 case AMDGPU::G_READCYCLECOUNTER: {
4208 unsigned Size = MRI.getType(Reg: MI.getOperand(i: 0).getReg()).getSizeInBits();
4209 OpdsMapping[0] = AMDGPU::getValueMapping(BankID: AMDGPU::SGPRRegBankID, Size);
4210 break;
4211 }
4212 case AMDGPU::G_DYN_STACKALLOC: {
4213 // Result is always uniform, and a wave reduction is needed for the source.
4214 OpdsMapping[0] = AMDGPU::getValueMapping(BankID: AMDGPU::SGPRRegBankID, Size: 32);
4215 unsigned SrcBankID = getRegBankID(Reg: MI.getOperand(i: 1).getReg(), MRI);
4216 OpdsMapping[1] = AMDGPU::getValueMapping(BankID: SrcBankID, Size: 32);
4217 break;
4218 }
4219 case AMDGPU::G_AMDGPU_WAVE_ADDRESS: {
4220 // This case is weird because we expect a physical register in the source,
4221 // but need to set a bank anyway.
4222 //
4223 // TODO: We could select the result to SGPR or VGPR
4224 OpdsMapping[0] = AMDGPU::getValueMapping(BankID: AMDGPU::SGPRRegBankID, Size: 32);
4225 OpdsMapping[1] = AMDGPU::getValueMapping(BankID: AMDGPU::SGPRRegBankID, Size: 32);
4226 break;
4227 }
4228 case AMDGPU::G_INSERT: {
4229 unsigned BankID = getMappingType(MRI, MI);
4230 unsigned DstSize = getSizeInBits(Reg: MI.getOperand(i: 0).getReg(), MRI, TRI: *TRI);
4231 unsigned SrcSize = getSizeInBits(Reg: MI.getOperand(i: 1).getReg(), MRI, TRI: *TRI);
4232 unsigned EltSize = getSizeInBits(Reg: MI.getOperand(i: 2).getReg(), MRI, TRI: *TRI);
4233 OpdsMapping[0] = AMDGPU::getValueMapping(BankID, Size: DstSize);
4234 OpdsMapping[1] = AMDGPU::getValueMapping(BankID, Size: SrcSize);
4235 OpdsMapping[2] = AMDGPU::getValueMapping(BankID, Size: EltSize);
4236 OpdsMapping[3] = nullptr;
4237 break;
4238 }
4239 case AMDGPU::G_EXTRACT: {
4240 unsigned BankID = getRegBankID(Reg: MI.getOperand(i: 1).getReg(), MRI);
4241 unsigned DstSize = getSizeInBits(Reg: MI.getOperand(i: 0).getReg(), MRI, TRI: *TRI);
4242 unsigned SrcSize = getSizeInBits(Reg: MI.getOperand(i: 1).getReg(), MRI, TRI: *TRI);
4243 OpdsMapping[0] = AMDGPU::getValueMapping(BankID, Size: DstSize);
4244 OpdsMapping[1] = AMDGPU::getValueMapping(BankID, Size: SrcSize);
4245 OpdsMapping[2] = nullptr;
4246 break;
4247 }
4248 case AMDGPU::G_BUILD_VECTOR:
4249 case AMDGPU::G_BUILD_VECTOR_TRUNC: {
4250 LLT DstTy = MRI.getType(Reg: MI.getOperand(i: 0).getReg());
4251 if (DstTy == LLT::fixed_vector(NumElements: 2, ScalarSizeInBits: 16)) {
4252 unsigned DstSize = DstTy.getSizeInBits();
4253 unsigned SrcSize = MRI.getType(Reg: MI.getOperand(i: 1).getReg()).getSizeInBits();
4254 unsigned Src0BankID = getRegBankID(Reg: MI.getOperand(i: 1).getReg(), MRI);
4255 unsigned Src1BankID = getRegBankID(Reg: MI.getOperand(i: 2).getReg(), MRI);
4256 unsigned DstBankID = regBankUnion(RB0: Src0BankID, RB1: Src1BankID);
4257
4258 OpdsMapping[0] = AMDGPU::getValueMapping(BankID: DstBankID, Size: DstSize);
4259 OpdsMapping[1] = AMDGPU::getValueMapping(BankID: Src0BankID, Size: SrcSize);
4260 OpdsMapping[2] = AMDGPU::getValueMapping(BankID: Src1BankID, Size: SrcSize);
4261 break;
4262 }
4263
4264 [[fallthrough]];
4265 }
4266 case AMDGPU::G_MERGE_VALUES:
4267 case AMDGPU::G_CONCAT_VECTORS: {
4268 unsigned Bank = getMappingType(MRI, MI);
4269 unsigned DstSize = MRI.getType(Reg: MI.getOperand(i: 0).getReg()).getSizeInBits();
4270 unsigned SrcSize = MRI.getType(Reg: MI.getOperand(i: 1).getReg()).getSizeInBits();
4271
4272 OpdsMapping[0] = AMDGPU::getValueMapping(BankID: Bank, Size: DstSize);
4273 // Op1 and Dst should use the same register bank.
4274 for (unsigned i = 1, e = MI.getNumOperands(); i != e; ++i)
4275 OpdsMapping[i] = AMDGPU::getValueMapping(BankID: Bank, Size: SrcSize);
4276 break;
4277 }
4278 case AMDGPU::G_BITREVERSE:
4279 case AMDGPU::G_BITCAST:
4280 case AMDGPU::G_INTTOPTR:
4281 case AMDGPU::G_PTRTOINT:
4282 case AMDGPU::G_FABS:
4283 case AMDGPU::G_FNEG: {
4284 unsigned Size = MRI.getType(Reg: MI.getOperand(i: 0).getReg()).getSizeInBits();
4285 unsigned BankID = getRegBankID(Reg: MI.getOperand(i: 1).getReg(), MRI);
4286 OpdsMapping[0] = OpdsMapping[1] = AMDGPU::getValueMapping(BankID, Size);
4287 break;
4288 }
4289 case AMDGPU::G_AMDGPU_FFBH_U32:
4290 case AMDGPU::G_AMDGPU_FFBL_B32:
4291 case AMDGPU::G_CTLZ_ZERO_UNDEF:
4292 case AMDGPU::G_CTTZ_ZERO_UNDEF: {
4293 unsigned Size = MRI.getType(Reg: MI.getOperand(i: 1).getReg()).getSizeInBits();
4294 unsigned BankID = getRegBankID(Reg: MI.getOperand(i: 1).getReg(), MRI);
4295 OpdsMapping[0] = AMDGPU::getValueMapping(BankID, Size: 32);
4296 OpdsMapping[1] = AMDGPU::getValueMappingSGPR64Only(BankID, Size);
4297 break;
4298 }
4299 case AMDGPU::G_CTPOP: {
4300 unsigned Size = MRI.getType(Reg: MI.getOperand(i: 1).getReg()).getSizeInBits();
4301 unsigned BankID = getRegBankID(Reg: MI.getOperand(i: 1).getReg(), MRI);
4302 OpdsMapping[0] = AMDGPU::getValueMapping(BankID, Size: 32);
4303
4304 // This should really be getValueMappingSGPR64Only, but allowing the generic
4305 // code to handle the register split just makes using LegalizerHelper more
4306 // difficult.
4307 OpdsMapping[1] = AMDGPU::getValueMapping(BankID, Size);
4308 break;
4309 }
4310 case AMDGPU::G_TRUNC: {
4311 Register Dst = MI.getOperand(i: 0).getReg();
4312 Register Src = MI.getOperand(i: 1).getReg();
4313 unsigned Bank = getRegBankID(Reg: Src, MRI);
4314 unsigned DstSize = getSizeInBits(Reg: Dst, MRI, TRI: *TRI);
4315 unsigned SrcSize = getSizeInBits(Reg: Src, MRI, TRI: *TRI);
4316 OpdsMapping[0] = AMDGPU::getValueMapping(BankID: Bank, Size: DstSize);
4317 OpdsMapping[1] = AMDGPU::getValueMapping(BankID: Bank, Size: SrcSize);
4318 break;
4319 }
4320 case AMDGPU::G_ZEXT:
4321 case AMDGPU::G_SEXT:
4322 case AMDGPU::G_ANYEXT:
4323 case AMDGPU::G_SEXT_INREG: {
4324 Register Dst = MI.getOperand(i: 0).getReg();
4325 Register Src = MI.getOperand(i: 1).getReg();
4326 unsigned DstSize = getSizeInBits(Reg: Dst, MRI, TRI: *TRI);
4327 unsigned SrcSize = getSizeInBits(Reg: Src, MRI, TRI: *TRI);
4328
4329 unsigned DstBank;
4330 const RegisterBank *SrcBank = getRegBank(Reg: Src, MRI, TRI: *TRI);
4331 assert(SrcBank);
4332 switch (SrcBank->getID()) {
4333 case AMDGPU::SGPRRegBankID:
4334 DstBank = AMDGPU::SGPRRegBankID;
4335 break;
4336 default:
4337 DstBank = AMDGPU::VGPRRegBankID;
4338 break;
4339 }
4340
4341 // Scalar extend can use 64-bit BFE, but VGPRs require extending to
4342 // 32-bits, and then to 64.
4343 OpdsMapping[0] = AMDGPU::getValueMappingSGPR64Only(BankID: DstBank, Size: DstSize);
4344 OpdsMapping[1] = AMDGPU::getValueMappingSGPR64Only(BankID: SrcBank->getID(),
4345 Size: SrcSize);
4346 break;
4347 }
4348 case AMDGPU::G_IS_FPCLASS: {
4349 Register SrcReg = MI.getOperand(i: 1).getReg();
4350 unsigned SrcSize = MRI.getType(Reg: SrcReg).getSizeInBits();
4351 unsigned DstSize = MRI.getType(Reg: MI.getOperand(i: 0).getReg()).getSizeInBits();
4352 OpdsMapping[0] = AMDGPU::getValueMapping(BankID: AMDGPU::VCCRegBankID, Size: DstSize);
4353 OpdsMapping[1] = AMDGPU::getValueMapping(BankID: AMDGPU::VGPRRegBankID, Size: SrcSize);
4354 break;
4355 }
4356 case AMDGPU::G_STORE: {
4357 assert(MI.getOperand(0).isReg());
4358 unsigned Size = MRI.getType(Reg: MI.getOperand(i: 0).getReg()).getSizeInBits();
4359
4360 // FIXME: We need to specify a different reg bank once scalar stores are
4361 // supported.
4362 const ValueMapping *ValMapping =
4363 AMDGPU::getValueMapping(BankID: AMDGPU::VGPRRegBankID, Size);
4364 OpdsMapping[0] = ValMapping;
4365 OpdsMapping[1] = getValueMappingForPtr(MRI, PtrReg: MI.getOperand(i: 1).getReg());
4366 break;
4367 }
4368 case AMDGPU::G_ICMP:
4369 case AMDGPU::G_FCMP: {
4370 unsigned Size = MRI.getType(Reg: MI.getOperand(i: 2).getReg()).getSizeInBits();
4371
4372 // See if the result register has already been constrained to vcc, which may
4373 // happen due to control flow intrinsic lowering.
4374 unsigned DstBank = getRegBankID(Reg: MI.getOperand(i: 0).getReg(), MRI,
4375 Default: AMDGPU::SGPRRegBankID);
4376 unsigned Op2Bank = getRegBankID(Reg: MI.getOperand(i: 2).getReg(), MRI);
4377 unsigned Op3Bank = getRegBankID(Reg: MI.getOperand(i: 3).getReg(), MRI);
4378
4379 auto canUseSCCICMP = [&]() {
4380 auto Pred =
4381 static_cast<CmpInst::Predicate>(MI.getOperand(i: 1).getPredicate());
4382 return Size == 32 ||
4383 (Size == 64 &&
4384 (Pred == CmpInst::ICMP_EQ || Pred == CmpInst::ICMP_NE) &&
4385 Subtarget.hasScalarCompareEq64());
4386 };
4387 auto canUseSCCFCMP = [&]() {
4388 return Subtarget.hasSALUFloatInsts() && (Size == 32 || Size == 16);
4389 };
4390
4391 bool isICMP = MI.getOpcode() == AMDGPU::G_ICMP;
4392 bool CanUseSCC = DstBank == AMDGPU::SGPRRegBankID &&
4393 Op2Bank == AMDGPU::SGPRRegBankID &&
4394 Op3Bank == AMDGPU::SGPRRegBankID &&
4395 (isICMP ? canUseSCCICMP() : canUseSCCFCMP());
4396
4397 DstBank = CanUseSCC ? AMDGPU::SGPRRegBankID : AMDGPU::VCCRegBankID;
4398 unsigned SrcBank = CanUseSCC ? AMDGPU::SGPRRegBankID : AMDGPU::VGPRRegBankID;
4399
4400 // TODO: Use 32-bit for scalar output size.
4401 // SCC results will need to be copied to a 32-bit SGPR virtual register.
4402 const unsigned ResultSize = 1;
4403
4404 OpdsMapping[0] = AMDGPU::getValueMapping(BankID: DstBank, Size: ResultSize);
4405 OpdsMapping[1] = nullptr; // Predicate Operand.
4406 OpdsMapping[2] = AMDGPU::getValueMapping(BankID: SrcBank, Size);
4407 OpdsMapping[3] = AMDGPU::getValueMapping(BankID: SrcBank, Size);
4408 break;
4409 }
4410 case AMDGPU::G_EXTRACT_VECTOR_ELT: {
4411 // VGPR index can be used for waterfall when indexing a SGPR vector.
4412 unsigned SrcBankID = getRegBankID(Reg: MI.getOperand(i: 1).getReg(), MRI);
4413 unsigned DstSize = MRI.getType(Reg: MI.getOperand(i: 0).getReg()).getSizeInBits();
4414 unsigned SrcSize = MRI.getType(Reg: MI.getOperand(i: 1).getReg()).getSizeInBits();
4415 unsigned IdxSize = MRI.getType(Reg: MI.getOperand(i: 2).getReg()).getSizeInBits();
4416 unsigned IdxBank = getRegBankID(Reg: MI.getOperand(i: 2).getReg(), MRI);
4417 unsigned OutputBankID = regBankUnion(RB0: SrcBankID, RB1: IdxBank);
4418
4419 OpdsMapping[0] = AMDGPU::getValueMappingSGPR64Only(BankID: OutputBankID, Size: DstSize);
4420 OpdsMapping[1] = AMDGPU::getValueMapping(BankID: SrcBankID, Size: SrcSize);
4421
4422 // The index can be in either bank if the source vector is in a VGPR.
4423 OpdsMapping[2] = AMDGPU::getValueMapping(BankID: IdxBank, Size: IdxSize);
4424 break;
4425 }
4426 case AMDGPU::G_INSERT_VECTOR_ELT: {
4427 unsigned OutputBankID = isSALUMapping(MI) ?
4428 AMDGPU::SGPRRegBankID : AMDGPU::VGPRRegBankID;
4429
4430 unsigned VecSize = MRI.getType(Reg: MI.getOperand(i: 0).getReg()).getSizeInBits();
4431 unsigned InsertSize = MRI.getType(Reg: MI.getOperand(i: 2).getReg()).getSizeInBits();
4432 unsigned IdxSize = MRI.getType(Reg: MI.getOperand(i: 3).getReg()).getSizeInBits();
4433 unsigned InsertEltBankID = getRegBankID(Reg: MI.getOperand(i: 2).getReg(), MRI);
4434 unsigned IdxBankID = getRegBankID(Reg: MI.getOperand(i: 3).getReg(), MRI);
4435
4436 OpdsMapping[0] = AMDGPU::getValueMapping(BankID: OutputBankID, Size: VecSize);
4437 OpdsMapping[1] = AMDGPU::getValueMapping(BankID: OutputBankID, Size: VecSize);
4438
4439 // This is a weird case, because we need to break down the mapping based on
4440 // the register bank of a different operand.
4441 if (InsertSize == 64 && OutputBankID == AMDGPU::VGPRRegBankID) {
4442 OpdsMapping[2] = AMDGPU::getValueMappingSplit64(BankID: InsertEltBankID,
4443 Size: InsertSize);
4444 } else {
4445 assert(InsertSize == 32 || InsertSize == 64);
4446 OpdsMapping[2] = AMDGPU::getValueMapping(BankID: InsertEltBankID, Size: InsertSize);
4447 }
4448
4449 // The index can be in either bank if the source vector is in a VGPR.
4450 OpdsMapping[3] = AMDGPU::getValueMapping(BankID: IdxBankID, Size: IdxSize);
4451 break;
4452 }
4453 case AMDGPU::G_UNMERGE_VALUES: {
4454 unsigned Bank = getMappingType(MRI, MI);
4455
4456 // Op1 and Dst should use the same register bank.
4457 // FIXME: Shouldn't this be the default? Why do we need to handle this?
4458 for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
4459 unsigned Size = getSizeInBits(Reg: MI.getOperand(i).getReg(), MRI, TRI: *TRI);
4460 OpdsMapping[i] = AMDGPU::getValueMapping(BankID: Bank, Size);
4461 }
4462 break;
4463 }
4464 case AMDGPU::G_AMDGPU_BUFFER_LOAD:
4465 case AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE:
4466 case AMDGPU::G_AMDGPU_BUFFER_LOAD_SBYTE:
4467 case AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT:
4468 case AMDGPU::G_AMDGPU_BUFFER_LOAD_SSHORT:
4469 case AMDGPU::G_AMDGPU_BUFFER_LOAD_TFE:
4470 case AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE_TFE:
4471 case AMDGPU::G_AMDGPU_BUFFER_LOAD_SBYTE_TFE:
4472 case AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT_TFE:
4473 case AMDGPU::G_AMDGPU_BUFFER_LOAD_SSHORT_TFE:
4474 case AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT:
4475 case AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_TFE:
4476 case AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_D16:
4477 case AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT:
4478 case AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT_D16:
4479 case AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT:
4480 case AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT_D16:
4481 case AMDGPU::G_AMDGPU_BUFFER_STORE:
4482 case AMDGPU::G_AMDGPU_BUFFER_STORE_BYTE:
4483 case AMDGPU::G_AMDGPU_BUFFER_STORE_SHORT:
4484 case AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT:
4485 case AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT_D16: {
4486 OpdsMapping[0] = getVGPROpMapping(Reg: MI.getOperand(i: 0).getReg(), MRI, TRI: *TRI);
4487
4488 // rsrc
4489 OpdsMapping[1] = getSGPROpMapping(Reg: MI.getOperand(i: 1).getReg(), MRI, TRI: *TRI);
4490
4491 // vindex
4492 OpdsMapping[2] = getVGPROpMapping(Reg: MI.getOperand(i: 2).getReg(), MRI, TRI: *TRI);
4493
4494 // voffset
4495 OpdsMapping[3] = getVGPROpMapping(Reg: MI.getOperand(i: 3).getReg(), MRI, TRI: *TRI);
4496
4497 // soffset
4498 OpdsMapping[4] = getSGPROpMapping(Reg: MI.getOperand(i: 4).getReg(), MRI, TRI: *TRI);
4499
4500 // Any remaining operands are immediates and were correctly null
4501 // initialized.
4502 break;
4503 }
4504 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SWAP:
4505 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_ADD:
4506 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB:
4507 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMIN:
4508 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMIN:
4509 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMAX:
4510 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMAX:
4511 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_AND:
4512 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_OR:
4513 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_XOR:
4514 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC:
4515 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC:
4516 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB_CLAMP_U32:
4517 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_COND_SUB_U32:
4518 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD:
4519 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMIN:
4520 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMAX: {
4521 // vdata_out
4522 OpdsMapping[0] = getVGPROpMapping(Reg: MI.getOperand(i: 0).getReg(), MRI, TRI: *TRI);
4523
4524 // vdata_in
4525 OpdsMapping[1] = getVGPROpMapping(Reg: MI.getOperand(i: 1).getReg(), MRI, TRI: *TRI);
4526
4527 // rsrc
4528 OpdsMapping[2] = getSGPROpMapping(Reg: MI.getOperand(i: 2).getReg(), MRI, TRI: *TRI);
4529
4530 // vindex
4531 OpdsMapping[3] = getVGPROpMapping(Reg: MI.getOperand(i: 3).getReg(), MRI, TRI: *TRI);
4532
4533 // voffset
4534 OpdsMapping[4] = getVGPROpMapping(Reg: MI.getOperand(i: 4).getReg(), MRI, TRI: *TRI);
4535
4536 // soffset
4537 OpdsMapping[5] = getSGPROpMapping(Reg: MI.getOperand(i: 5).getReg(), MRI, TRI: *TRI);
4538
4539 // Any remaining operands are immediates and were correctly null
4540 // initialized.
4541 break;
4542 }
4543 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_CMPSWAP: {
4544 // vdata_out
4545 OpdsMapping[0] = getVGPROpMapping(Reg: MI.getOperand(i: 0).getReg(), MRI, TRI: *TRI);
4546
4547 // vdata_in
4548 OpdsMapping[1] = getVGPROpMapping(Reg: MI.getOperand(i: 1).getReg(), MRI, TRI: *TRI);
4549
4550 // cmp
4551 OpdsMapping[2] = getVGPROpMapping(Reg: MI.getOperand(i: 2).getReg(), MRI, TRI: *TRI);
4552
4553 // rsrc
4554 OpdsMapping[3] = getSGPROpMapping(Reg: MI.getOperand(i: 3).getReg(), MRI, TRI: *TRI);
4555
4556 // vindex
4557 OpdsMapping[4] = getVGPROpMapping(Reg: MI.getOperand(i: 4).getReg(), MRI, TRI: *TRI);
4558
4559 // voffset
4560 OpdsMapping[5] = getVGPROpMapping(Reg: MI.getOperand(i: 5).getReg(), MRI, TRI: *TRI);
4561
4562 // soffset
4563 OpdsMapping[6] = getSGPROpMapping(Reg: MI.getOperand(i: 6).getReg(), MRI, TRI: *TRI);
4564
4565 // Any remaining operands are immediates and were correctly null
4566 // initialized.
4567 break;
4568 }
4569 case AMDGPU::G_AMDGPU_S_BUFFER_LOAD:
4570 case AMDGPU::G_AMDGPU_S_BUFFER_LOAD_UBYTE:
4571 case AMDGPU::G_AMDGPU_S_BUFFER_LOAD_SBYTE:
4572 case AMDGPU::G_AMDGPU_S_BUFFER_LOAD_USHORT:
4573 case AMDGPU::G_AMDGPU_S_BUFFER_LOAD_SSHORT: {
4574 // Lie and claim everything is legal, even though some need to be
4575 // SGPRs. applyMapping will have to deal with it as a waterfall loop.
4576 OpdsMapping[1] = getSGPROpMapping(Reg: MI.getOperand(i: 1).getReg(), MRI, TRI: *TRI);
4577 OpdsMapping[2] = getSGPROpMapping(Reg: MI.getOperand(i: 2).getReg(), MRI, TRI: *TRI);
4578
4579 // We need to convert this to a MUBUF if either the resource or the offset
4580 // is a VGPR.
4581 unsigned RSrcBank = OpdsMapping[1]->BreakDown[0].RegBank->getID();
4582 unsigned OffsetBank = OpdsMapping[2]->BreakDown[0].RegBank->getID();
4583 unsigned ResultBank = regBankUnion(RB0: RSrcBank, RB1: OffsetBank);
4584
4585 unsigned Size0 = MRI.getType(Reg: MI.getOperand(i: 0).getReg()).getSizeInBits();
4586 OpdsMapping[0] = AMDGPU::getValueMapping(BankID: ResultBank, Size: Size0);
4587 break;
4588 }
4589 case AMDGPU::G_AMDGPU_S_BUFFER_PREFETCH:
4590 OpdsMapping[0] = getSGPROpMapping(Reg: MI.getOperand(i: 0).getReg(), MRI, TRI: *TRI);
4591 OpdsMapping[2] = getSGPROpMapping(Reg: MI.getOperand(i: 2).getReg(), MRI, TRI: *TRI);
4592 break;
4593 case AMDGPU::G_AMDGPU_SPONENTRY: {
4594 unsigned Size = MRI.getType(Reg: MI.getOperand(i: 0).getReg()).getSizeInBits();
4595 OpdsMapping[0] = AMDGPU::getValueMapping(BankID: AMDGPU::SGPRRegBankID, Size);
4596 break;
4597 }
4598 case AMDGPU::G_INTRINSIC:
4599 case AMDGPU::G_INTRINSIC_CONVERGENT: {
4600 switch (cast<GIntrinsic>(Val: MI).getIntrinsicID()) {
4601 default:
4602 return getInvalidInstructionMapping();
4603 case Intrinsic::amdgcn_div_fmas:
4604 case Intrinsic::amdgcn_div_fixup:
4605 case Intrinsic::amdgcn_trig_preop:
4606 case Intrinsic::amdgcn_sin:
4607 case Intrinsic::amdgcn_cos:
4608 case Intrinsic::amdgcn_log_clamp:
4609 case Intrinsic::amdgcn_rcp_legacy:
4610 case Intrinsic::amdgcn_rsq_legacy:
4611 case Intrinsic::amdgcn_rsq_clamp:
4612 case Intrinsic::amdgcn_tanh:
4613 case Intrinsic::amdgcn_fmul_legacy:
4614 case Intrinsic::amdgcn_fma_legacy:
4615 case Intrinsic::amdgcn_frexp_mant:
4616 case Intrinsic::amdgcn_frexp_exp:
4617 case Intrinsic::amdgcn_fract:
4618 case Intrinsic::amdgcn_cvt_pknorm_i16:
4619 case Intrinsic::amdgcn_cvt_pknorm_u16:
4620 case Intrinsic::amdgcn_cvt_pk_i16:
4621 case Intrinsic::amdgcn_cvt_pk_u16:
4622 case Intrinsic::amdgcn_cvt_sr_pk_f16_f32:
4623 case Intrinsic::amdgcn_cvt_sr_pk_bf16_f32:
4624 case Intrinsic::amdgcn_cvt_pk_f16_fp8:
4625 case Intrinsic::amdgcn_cvt_pk_f16_bf8:
4626 case Intrinsic::amdgcn_cvt_pk_fp8_f16:
4627 case Intrinsic::amdgcn_cvt_pk_bf8_f16:
4628 case Intrinsic::amdgcn_cvt_sr_fp8_f16:
4629 case Intrinsic::amdgcn_cvt_sr_bf8_f16:
4630 case Intrinsic::amdgcn_cvt_scale_pk8_f16_fp8:
4631 case Intrinsic::amdgcn_cvt_scale_pk8_bf16_fp8:
4632 case Intrinsic::amdgcn_cvt_scale_pk8_f16_bf8:
4633 case Intrinsic::amdgcn_cvt_scale_pk8_bf16_bf8:
4634 case Intrinsic::amdgcn_cvt_scale_pk8_f16_fp4:
4635 case Intrinsic::amdgcn_cvt_scale_pk8_bf16_fp4:
4636 case Intrinsic::amdgcn_cvt_scale_pk8_f32_fp8:
4637 case Intrinsic::amdgcn_cvt_scale_pk8_f32_bf8:
4638 case Intrinsic::amdgcn_cvt_scale_pk8_f32_fp4:
4639 case Intrinsic::amdgcn_cvt_scale_pk16_f16_fp6:
4640 case Intrinsic::amdgcn_cvt_scale_pk16_bf16_fp6:
4641 case Intrinsic::amdgcn_cvt_scale_pk16_f16_bf6:
4642 case Intrinsic::amdgcn_cvt_scale_pk16_bf16_bf6:
4643 case Intrinsic::amdgcn_cvt_scale_pk16_f32_fp6:
4644 case Intrinsic::amdgcn_cvt_scale_pk16_f32_bf6:
4645 case Intrinsic::amdgcn_cvt_scalef32_pk8_fp8_bf16:
4646 case Intrinsic::amdgcn_cvt_scalef32_pk8_bf8_bf16:
4647 case Intrinsic::amdgcn_cvt_scalef32_pk8_fp8_f16:
4648 case Intrinsic::amdgcn_cvt_scalef32_pk8_bf8_f16:
4649 case Intrinsic::amdgcn_cvt_scalef32_pk8_fp8_f32:
4650 case Intrinsic::amdgcn_cvt_scalef32_pk8_bf8_f32:
4651 case Intrinsic::amdgcn_cvt_scalef32_pk8_fp4_f32:
4652 case Intrinsic::amdgcn_cvt_scalef32_pk8_fp4_f16:
4653 case Intrinsic::amdgcn_cvt_scalef32_pk8_fp4_bf16:
4654 case Intrinsic::amdgcn_cvt_scalef32_pk16_fp6_f32:
4655 case Intrinsic::amdgcn_cvt_scalef32_pk16_bf6_f32:
4656 case Intrinsic::amdgcn_cvt_scalef32_pk16_fp6_f16:
4657 case Intrinsic::amdgcn_cvt_scalef32_pk16_bf6_f16:
4658 case Intrinsic::amdgcn_cvt_scalef32_pk16_fp6_bf16:
4659 case Intrinsic::amdgcn_cvt_scalef32_pk16_bf6_bf16:
4660 case Intrinsic::amdgcn_cvt_scalef32_sr_pk8_fp8_bf16:
4661 case Intrinsic::amdgcn_cvt_scalef32_sr_pk8_bf8_bf16:
4662 case Intrinsic::amdgcn_cvt_scalef32_sr_pk8_fp8_f16:
4663 case Intrinsic::amdgcn_cvt_scalef32_sr_pk8_bf8_f16:
4664 case Intrinsic::amdgcn_cvt_scalef32_sr_pk8_fp8_f32:
4665 case Intrinsic::amdgcn_cvt_scalef32_sr_pk8_bf8_f32:
4666 case Intrinsic::amdgcn_cvt_scalef32_sr_pk8_fp4_f32:
4667 case Intrinsic::amdgcn_cvt_scalef32_sr_pk8_fp4_f16:
4668 case Intrinsic::amdgcn_cvt_scalef32_sr_pk8_fp4_bf16:
4669 case Intrinsic::amdgcn_cvt_scalef32_sr_pk16_fp6_f32:
4670 case Intrinsic::amdgcn_cvt_scalef32_sr_pk16_bf6_f32:
4671 case Intrinsic::amdgcn_cvt_scalef32_sr_pk16_fp6_f16:
4672 case Intrinsic::amdgcn_cvt_scalef32_sr_pk16_bf6_f16:
4673 case Intrinsic::amdgcn_cvt_scalef32_sr_pk16_fp6_bf16:
4674 case Intrinsic::amdgcn_cvt_scalef32_sr_pk16_bf6_bf16:
4675 case Intrinsic::amdgcn_sat_pk4_i4_i8:
4676 case Intrinsic::amdgcn_sat_pk4_u4_u8:
4677 case Intrinsic::amdgcn_fmed3:
4678 case Intrinsic::amdgcn_cubeid:
4679 case Intrinsic::amdgcn_cubema:
4680 case Intrinsic::amdgcn_cubesc:
4681 case Intrinsic::amdgcn_cubetc:
4682 case Intrinsic::amdgcn_sffbh:
4683 case Intrinsic::amdgcn_fmad_ftz:
4684 case Intrinsic::amdgcn_mbcnt_lo:
4685 case Intrinsic::amdgcn_mbcnt_hi:
4686 case Intrinsic::amdgcn_mul_u24:
4687 case Intrinsic::amdgcn_mul_i24:
4688 case Intrinsic::amdgcn_mulhi_u24:
4689 case Intrinsic::amdgcn_mulhi_i24:
4690 case Intrinsic::amdgcn_lerp:
4691 case Intrinsic::amdgcn_sad_u8:
4692 case Intrinsic::amdgcn_msad_u8:
4693 case Intrinsic::amdgcn_sad_hi_u8:
4694 case Intrinsic::amdgcn_sad_u16:
4695 case Intrinsic::amdgcn_qsad_pk_u16_u8:
4696 case Intrinsic::amdgcn_mqsad_pk_u16_u8:
4697 case Intrinsic::amdgcn_mqsad_u32_u8:
4698 case Intrinsic::amdgcn_cvt_pk_u8_f32:
4699 case Intrinsic::amdgcn_alignbyte:
4700 case Intrinsic::amdgcn_perm:
4701 case Intrinsic::amdgcn_prng_b32:
4702 case Intrinsic::amdgcn_fdot2:
4703 case Intrinsic::amdgcn_sdot2:
4704 case Intrinsic::amdgcn_udot2:
4705 case Intrinsic::amdgcn_sdot4:
4706 case Intrinsic::amdgcn_udot4:
4707 case Intrinsic::amdgcn_sdot8:
4708 case Intrinsic::amdgcn_udot8:
4709 case Intrinsic::amdgcn_fdot2_bf16_bf16:
4710 case Intrinsic::amdgcn_fdot2_f16_f16:
4711 case Intrinsic::amdgcn_fdot2_f32_bf16:
4712 case Intrinsic::amdgcn_fdot2c_f32_bf16:
4713 case Intrinsic::amdgcn_sudot4:
4714 case Intrinsic::amdgcn_sudot8:
4715 case Intrinsic::amdgcn_dot4_f32_fp8_bf8:
4716 case Intrinsic::amdgcn_dot4_f32_bf8_fp8:
4717 case Intrinsic::amdgcn_dot4_f32_fp8_fp8:
4718 case Intrinsic::amdgcn_dot4_f32_bf8_bf8:
4719 case Intrinsic::amdgcn_cvt_f32_fp8:
4720 case Intrinsic::amdgcn_cvt_f32_fp8_e5m3:
4721 case Intrinsic::amdgcn_cvt_f32_bf8:
4722 case Intrinsic::amdgcn_cvt_off_f32_i4:
4723 case Intrinsic::amdgcn_cvt_pk_f32_fp8:
4724 case Intrinsic::amdgcn_cvt_pk_f32_bf8:
4725 case Intrinsic::amdgcn_cvt_pk_fp8_f32:
4726 case Intrinsic::amdgcn_cvt_pk_fp8_f32_e5m3:
4727 case Intrinsic::amdgcn_cvt_pk_bf8_f32:
4728 case Intrinsic::amdgcn_cvt_sr_fp8_f32:
4729 case Intrinsic::amdgcn_cvt_sr_fp8_f32_e5m3:
4730 case Intrinsic::amdgcn_cvt_sr_bf8_f32:
4731 case Intrinsic::amdgcn_cvt_sr_bf16_f32:
4732 case Intrinsic::amdgcn_cvt_sr_f16_f32:
4733 case Intrinsic::amdgcn_cvt_f16_fp8:
4734 case Intrinsic::amdgcn_cvt_f16_bf8:
4735 case Intrinsic::amdgcn_cvt_scalef32_pk32_fp6_f16:
4736 case Intrinsic::amdgcn_cvt_scalef32_pk32_bf6_f16:
4737 case Intrinsic::amdgcn_cvt_scalef32_pk32_fp6_bf16:
4738 case Intrinsic::amdgcn_cvt_scalef32_pk32_bf6_bf16:
4739 case Intrinsic::amdgcn_cvt_scalef32_f16_fp8:
4740 case Intrinsic::amdgcn_cvt_scalef32_f16_bf8:
4741 case Intrinsic::amdgcn_cvt_scalef32_f32_fp8:
4742 case Intrinsic::amdgcn_cvt_scalef32_f32_bf8:
4743 case Intrinsic::amdgcn_cvt_scalef32_pk_fp8_f32:
4744 case Intrinsic::amdgcn_cvt_scalef32_pk_bf8_f32:
4745 case Intrinsic::amdgcn_cvt_scalef32_pk_f32_fp8:
4746 case Intrinsic::amdgcn_cvt_scalef32_pk_f32_bf8:
4747 case Intrinsic::amdgcn_cvt_scalef32_pk_fp8_f16:
4748 case Intrinsic::amdgcn_cvt_scalef32_pk_fp8_bf16:
4749 case Intrinsic::amdgcn_cvt_scalef32_pk_bf8_f16:
4750 case Intrinsic::amdgcn_cvt_scalef32_pk_bf8_bf16:
4751 case Intrinsic::amdgcn_cvt_scalef32_pk_f32_fp4:
4752 case Intrinsic::amdgcn_cvt_scalef32_pk_fp4_f32:
4753 case Intrinsic::amdgcn_cvt_scalef32_pk_f16_fp4:
4754 case Intrinsic::amdgcn_cvt_scalef32_pk_bf16_fp4:
4755 case Intrinsic::amdgcn_cvt_scalef32_pk32_f32_fp6:
4756 case Intrinsic::amdgcn_cvt_scalef32_pk32_f32_bf6:
4757 case Intrinsic::amdgcn_cvt_scalef32_pk32_f16_bf6:
4758 case Intrinsic::amdgcn_cvt_scalef32_pk32_bf16_bf6:
4759 case Intrinsic::amdgcn_cvt_scalef32_pk32_f16_fp6:
4760 case Intrinsic::amdgcn_cvt_scalef32_pk32_bf16_fp6:
4761 case Intrinsic::amdgcn_cvt_scalef32_pk_f16_bf8:
4762 case Intrinsic::amdgcn_cvt_scalef32_pk_bf16_bf8:
4763 case Intrinsic::amdgcn_cvt_scalef32_pk_f16_fp8:
4764 case Intrinsic::amdgcn_cvt_scalef32_pk_bf16_fp8:
4765 case Intrinsic::amdgcn_cvt_scalef32_pk_fp4_f16:
4766 case Intrinsic::amdgcn_cvt_scalef32_pk_fp4_bf16:
4767 case Intrinsic::amdgcn_cvt_scalef32_sr_pk_fp4_f16:
4768 case Intrinsic::amdgcn_cvt_scalef32_sr_pk_fp4_bf16:
4769 case Intrinsic::amdgcn_cvt_scalef32_sr_pk_fp4_f32:
4770 case Intrinsic::amdgcn_cvt_scalef32_sr_pk32_bf6_bf16:
4771 case Intrinsic::amdgcn_cvt_scalef32_sr_pk32_bf6_f16:
4772 case Intrinsic::amdgcn_cvt_scalef32_sr_pk32_bf6_f32:
4773 case Intrinsic::amdgcn_cvt_scalef32_sr_pk32_fp6_bf16:
4774 case Intrinsic::amdgcn_cvt_scalef32_sr_pk32_fp6_f16:
4775 case Intrinsic::amdgcn_cvt_scalef32_sr_pk32_fp6_f32:
4776 case Intrinsic::amdgcn_cvt_scalef32_sr_bf8_bf16:
4777 case Intrinsic::amdgcn_cvt_scalef32_sr_bf8_f16:
4778 case Intrinsic::amdgcn_cvt_scalef32_sr_bf8_f32:
4779 case Intrinsic::amdgcn_cvt_scalef32_sr_fp8_bf16:
4780 case Intrinsic::amdgcn_cvt_scalef32_sr_fp8_f16:
4781 case Intrinsic::amdgcn_cvt_scalef32_sr_fp8_f32:
4782 case Intrinsic::amdgcn_ashr_pk_i8_i32:
4783 case Intrinsic::amdgcn_ashr_pk_u8_i32:
4784 case Intrinsic::amdgcn_cvt_scalef32_2xpk16_fp6_f32:
4785 case Intrinsic::amdgcn_cvt_scalef32_2xpk16_bf6_f32:
4786 case Intrinsic::amdgcn_wmma_bf16_16x16x16_bf16:
4787 case Intrinsic::amdgcn_wmma_f16_16x16x16_f16:
4788 case Intrinsic::amdgcn_wmma_bf16_16x16x16_bf16_tied:
4789 case Intrinsic::amdgcn_wmma_f16_16x16x16_f16_tied:
4790 case Intrinsic::amdgcn_wmma_f32_16x16x16_bf16:
4791 case Intrinsic::amdgcn_wmma_f32_16x16x16_f16:
4792 case Intrinsic::amdgcn_wmma_i32_16x16x16_iu4:
4793 case Intrinsic::amdgcn_wmma_i32_16x16x16_iu8:
4794 case Intrinsic::amdgcn_wmma_f32_16x16x16_fp8_fp8:
4795 case Intrinsic::amdgcn_wmma_f32_16x16x16_fp8_bf8:
4796 case Intrinsic::amdgcn_wmma_f32_16x16x16_bf8_fp8:
4797 case Intrinsic::amdgcn_wmma_f32_16x16x16_bf8_bf8:
4798 case Intrinsic::amdgcn_wmma_i32_16x16x32_iu4:
4799 case Intrinsic::amdgcn_swmmac_f32_16x16x32_f16:
4800 case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf16:
4801 case Intrinsic::amdgcn_swmmac_f16_16x16x32_f16:
4802 case Intrinsic::amdgcn_swmmac_bf16_16x16x32_bf16:
4803 case Intrinsic::amdgcn_swmmac_i32_16x16x32_iu8:
4804 case Intrinsic::amdgcn_swmmac_i32_16x16x32_iu4:
4805 case Intrinsic::amdgcn_swmmac_i32_16x16x64_iu4:
4806 case Intrinsic::amdgcn_swmmac_f32_16x16x32_fp8_fp8:
4807 case Intrinsic::amdgcn_swmmac_f32_16x16x32_fp8_bf8:
4808 case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_fp8:
4809 case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_bf8:
4810 case Intrinsic::amdgcn_wmma_f32_16x16x4_f32:
4811 case Intrinsic::amdgcn_wmma_f32_16x16x32_bf16:
4812 case Intrinsic::amdgcn_wmma_f32_16x16x32_f16:
4813 case Intrinsic::amdgcn_wmma_f16_16x16x32_f16:
4814 case Intrinsic::amdgcn_wmma_bf16_16x16x32_bf16:
4815 case Intrinsic::amdgcn_wmma_bf16f32_16x16x32_bf16:
4816 case Intrinsic::amdgcn_wmma_f32_16x16x64_fp8_fp8:
4817 case Intrinsic::amdgcn_wmma_f32_16x16x64_fp8_bf8:
4818 case Intrinsic::amdgcn_wmma_f32_16x16x64_bf8_fp8:
4819 case Intrinsic::amdgcn_wmma_f32_16x16x64_bf8_bf8:
4820 case Intrinsic::amdgcn_wmma_f16_16x16x64_fp8_fp8:
4821 case Intrinsic::amdgcn_wmma_f16_16x16x64_fp8_bf8:
4822 case Intrinsic::amdgcn_wmma_f16_16x16x64_bf8_fp8:
4823 case Intrinsic::amdgcn_wmma_f16_16x16x64_bf8_bf8:
4824 case Intrinsic::amdgcn_wmma_f16_16x16x128_fp8_fp8:
4825 case Intrinsic::amdgcn_wmma_f16_16x16x128_fp8_bf8:
4826 case Intrinsic::amdgcn_wmma_f16_16x16x128_bf8_fp8:
4827 case Intrinsic::amdgcn_wmma_f16_16x16x128_bf8_bf8:
4828 case Intrinsic::amdgcn_wmma_f32_16x16x128_fp8_fp8:
4829 case Intrinsic::amdgcn_wmma_f32_16x16x128_fp8_bf8:
4830 case Intrinsic::amdgcn_wmma_f32_16x16x128_bf8_fp8:
4831 case Intrinsic::amdgcn_wmma_f32_16x16x128_bf8_bf8:
4832 case Intrinsic::amdgcn_wmma_i32_16x16x64_iu8:
4833 case Intrinsic::amdgcn_wmma_f32_16x16x128_f8f6f4:
4834 case Intrinsic::amdgcn_wmma_scale_f32_16x16x128_f8f6f4:
4835 case Intrinsic::amdgcn_wmma_scale16_f32_16x16x128_f8f6f4:
4836 case Intrinsic::amdgcn_wmma_f32_32x16x128_f4:
4837 case Intrinsic::amdgcn_wmma_scale_f32_32x16x128_f4:
4838 case Intrinsic::amdgcn_wmma_scale16_f32_32x16x128_f4:
4839 case Intrinsic::amdgcn_swmmac_f16_16x16x64_f16:
4840 case Intrinsic::amdgcn_swmmac_bf16_16x16x64_bf16:
4841 case Intrinsic::amdgcn_swmmac_f32_16x16x64_bf16:
4842 case Intrinsic::amdgcn_swmmac_bf16f32_16x16x64_bf16:
4843 case Intrinsic::amdgcn_swmmac_f32_16x16x64_f16:
4844 case Intrinsic::amdgcn_swmmac_f32_16x16x128_fp8_fp8:
4845 case Intrinsic::amdgcn_swmmac_f32_16x16x128_fp8_bf8:
4846 case Intrinsic::amdgcn_swmmac_f32_16x16x128_bf8_fp8:
4847 case Intrinsic::amdgcn_swmmac_f32_16x16x128_bf8_bf8:
4848 case Intrinsic::amdgcn_swmmac_f16_16x16x128_fp8_fp8:
4849 case Intrinsic::amdgcn_swmmac_f16_16x16x128_fp8_bf8:
4850 case Intrinsic::amdgcn_swmmac_f16_16x16x128_bf8_fp8:
4851 case Intrinsic::amdgcn_swmmac_f16_16x16x128_bf8_bf8:
4852 case Intrinsic::amdgcn_swmmac_i32_16x16x128_iu8:
4853 case Intrinsic::amdgcn_perm_pk16_b4_u4:
4854 case Intrinsic::amdgcn_perm_pk16_b6_u4:
4855 case Intrinsic::amdgcn_perm_pk16_b8_u4:
4856 case Intrinsic::amdgcn_add_max_i32:
4857 case Intrinsic::amdgcn_add_max_u32:
4858 case Intrinsic::amdgcn_add_min_i32:
4859 case Intrinsic::amdgcn_add_min_u32:
4860 case Intrinsic::amdgcn_pk_add_max_i16:
4861 case Intrinsic::amdgcn_pk_add_max_u16:
4862 case Intrinsic::amdgcn_pk_add_min_i16:
4863 case Intrinsic::amdgcn_pk_add_min_u16:
4864 return getDefaultMappingVOP(MI);
4865 case Intrinsic::amdgcn_log:
4866 case Intrinsic::amdgcn_exp2:
4867 case Intrinsic::amdgcn_rcp:
4868 case Intrinsic::amdgcn_rsq:
4869 case Intrinsic::amdgcn_sqrt: {
4870 unsigned Size = MRI.getType(Reg: MI.getOperand(i: 0).getReg()).getSizeInBits();
4871 if (Subtarget.hasPseudoScalarTrans() && (Size == 16 || Size == 32) &&
4872 isSALUMapping(MI))
4873 return getDefaultMappingSOP(MI);
4874 return getDefaultMappingVOP(MI);
4875 }
4876 case Intrinsic::amdgcn_sbfe:
4877 case Intrinsic::amdgcn_ubfe:
4878 if (isSALUMapping(MI))
4879 return getDefaultMappingSOP(MI);
4880 return getDefaultMappingVOP(MI);
4881 case Intrinsic::amdgcn_ds_swizzle:
4882 case Intrinsic::amdgcn_ds_permute:
4883 case Intrinsic::amdgcn_ds_bpermute:
4884 case Intrinsic::amdgcn_update_dpp:
4885 case Intrinsic::amdgcn_mov_dpp8:
4886 case Intrinsic::amdgcn_mov_dpp:
4887 case Intrinsic::amdgcn_strict_wwm:
4888 case Intrinsic::amdgcn_wwm:
4889 case Intrinsic::amdgcn_strict_wqm:
4890 case Intrinsic::amdgcn_wqm:
4891 case Intrinsic::amdgcn_softwqm:
4892 case Intrinsic::amdgcn_set_inactive:
4893 case Intrinsic::amdgcn_set_inactive_chain_arg:
4894 case Intrinsic::amdgcn_permlane64:
4895 case Intrinsic::amdgcn_ds_bpermute_fi_b32:
4896 return getDefaultMappingAllVGPR(MI);
4897 case Intrinsic::amdgcn_cvt_pkrtz:
4898 if (Subtarget.hasSALUFloatInsts() && isSALUMapping(MI))
4899 return getDefaultMappingSOP(MI);
4900 return getDefaultMappingVOP(MI);
4901 case Intrinsic::amdgcn_kernarg_segment_ptr:
4902 case Intrinsic::amdgcn_s_getpc:
4903 case Intrinsic::amdgcn_groupstaticsize:
4904 case Intrinsic::amdgcn_reloc_constant:
4905 case Intrinsic::returnaddress: {
4906 unsigned Size = MRI.getType(Reg: MI.getOperand(i: 0).getReg()).getSizeInBits();
4907 OpdsMapping[0] = AMDGPU::getValueMapping(BankID: AMDGPU::SGPRRegBankID, Size);
4908 break;
4909 }
4910 case Intrinsic::amdgcn_wqm_vote: {
4911 unsigned Size = MRI.getType(Reg: MI.getOperand(i: 0).getReg()).getSizeInBits();
4912 OpdsMapping[0] = OpdsMapping[2]
4913 = AMDGPU::getValueMapping(BankID: AMDGPU::VCCRegBankID, Size);
4914 break;
4915 }
4916 case Intrinsic::amdgcn_ps_live: {
4917 OpdsMapping[0] = AMDGPU::getValueMapping(BankID: AMDGPU::VCCRegBankID, Size: 1);
4918 break;
4919 }
4920 case Intrinsic::amdgcn_div_scale: {
4921 unsigned Dst0Size = MRI.getType(Reg: MI.getOperand(i: 0).getReg()).getSizeInBits();
4922 unsigned Dst1Size = MRI.getType(Reg: MI.getOperand(i: 1).getReg()).getSizeInBits();
4923 OpdsMapping[0] = AMDGPU::getValueMapping(BankID: AMDGPU::VGPRRegBankID, Size: Dst0Size);
4924 OpdsMapping[1] = AMDGPU::getValueMapping(BankID: AMDGPU::VCCRegBankID, Size: Dst1Size);
4925
4926 unsigned SrcSize = MRI.getType(Reg: MI.getOperand(i: 3).getReg()).getSizeInBits();
4927 OpdsMapping[3] = AMDGPU::getValueMapping(BankID: AMDGPU::VGPRRegBankID, Size: SrcSize);
4928 OpdsMapping[4] = AMDGPU::getValueMapping(BankID: AMDGPU::VGPRRegBankID, Size: SrcSize);
4929 break;
4930 }
4931 case Intrinsic::amdgcn_class: {
4932 Register Src0Reg = MI.getOperand(i: 2).getReg();
4933 Register Src1Reg = MI.getOperand(i: 3).getReg();
4934 unsigned Src0Size = MRI.getType(Reg: Src0Reg).getSizeInBits();
4935 unsigned Src1Size = MRI.getType(Reg: Src1Reg).getSizeInBits();
4936 unsigned DstSize = MRI.getType(Reg: MI.getOperand(i: 0).getReg()).getSizeInBits();
4937 OpdsMapping[0] = AMDGPU::getValueMapping(BankID: AMDGPU::VCCRegBankID, Size: DstSize);
4938 OpdsMapping[2] = AMDGPU::getValueMapping(BankID: AMDGPU::VGPRRegBankID, Size: Src0Size);
4939 OpdsMapping[3] = AMDGPU::getValueMapping(BankID: AMDGPU::VGPRRegBankID, Size: Src1Size);
4940 break;
4941 }
4942 case Intrinsic::amdgcn_icmp:
4943 case Intrinsic::amdgcn_fcmp: {
4944 unsigned DstSize = MRI.getType(Reg: MI.getOperand(i: 0).getReg()).getSizeInBits();
4945 // This is not VCCRegBank because this is not used in boolean contexts.
4946 OpdsMapping[0] = AMDGPU::getValueMapping(BankID: AMDGPU::SGPRRegBankID, Size: DstSize);
4947 unsigned OpSize = MRI.getType(Reg: MI.getOperand(i: 2).getReg()).getSizeInBits();
4948 OpdsMapping[2] = AMDGPU::getValueMapping(BankID: AMDGPU::VGPRRegBankID, Size: OpSize);
4949 OpdsMapping[3] = AMDGPU::getValueMapping(BankID: AMDGPU::VGPRRegBankID, Size: OpSize);
4950 break;
4951 }
4952 case Intrinsic::amdgcn_readlane: {
4953 // This must be an SGPR, but accept a VGPR.
4954 Register IdxReg = MI.getOperand(i: 3).getReg();
4955 unsigned IdxSize = MRI.getType(Reg: IdxReg).getSizeInBits();
4956 unsigned IdxBank = getRegBankID(Reg: IdxReg, MRI, Default: AMDGPU::SGPRRegBankID);
4957 OpdsMapping[3] = AMDGPU::getValueMapping(BankID: IdxBank, Size: IdxSize);
4958 [[fallthrough]];
4959 }
4960 case Intrinsic::amdgcn_readfirstlane: {
4961 unsigned DstSize = MRI.getType(Reg: MI.getOperand(i: 0).getReg()).getSizeInBits();
4962 unsigned SrcSize = MRI.getType(Reg: MI.getOperand(i: 2).getReg()).getSizeInBits();
4963 OpdsMapping[0] = AMDGPU::getValueMapping(BankID: AMDGPU::SGPRRegBankID, Size: DstSize);
4964 OpdsMapping[2] = AMDGPU::getValueMapping(BankID: AMDGPU::VGPRRegBankID, Size: SrcSize);
4965 break;
4966 }
4967 case Intrinsic::amdgcn_writelane: {
4968 unsigned DstSize = MRI.getType(Reg: MI.getOperand(i: 0).getReg()).getSizeInBits();
4969 Register SrcReg = MI.getOperand(i: 2).getReg();
4970 unsigned SrcSize = MRI.getType(Reg: SrcReg).getSizeInBits();
4971 unsigned SrcBank = getRegBankID(Reg: SrcReg, MRI, Default: AMDGPU::SGPRRegBankID);
4972 Register IdxReg = MI.getOperand(i: 3).getReg();
4973 unsigned IdxSize = MRI.getType(Reg: IdxReg).getSizeInBits();
4974 unsigned IdxBank = getRegBankID(Reg: IdxReg, MRI, Default: AMDGPU::SGPRRegBankID);
4975 OpdsMapping[0] = AMDGPU::getValueMapping(BankID: AMDGPU::VGPRRegBankID, Size: DstSize);
4976
4977 // These 2 must be SGPRs, but accept VGPRs. Readfirstlane will be inserted
4978 // to legalize.
4979 OpdsMapping[2] = AMDGPU::getValueMapping(BankID: SrcBank, Size: SrcSize);
4980 OpdsMapping[3] = AMDGPU::getValueMapping(BankID: IdxBank, Size: IdxSize);
4981 OpdsMapping[4] = AMDGPU::getValueMapping(BankID: AMDGPU::VGPRRegBankID, Size: SrcSize);
4982 break;
4983 }
4984 case Intrinsic::amdgcn_if_break: {
4985 unsigned Size = getSizeInBits(Reg: MI.getOperand(i: 0).getReg(), MRI, TRI: *TRI);
4986 OpdsMapping[0] = AMDGPU::getValueMapping(BankID: AMDGPU::SGPRRegBankID, Size);
4987 OpdsMapping[2] = AMDGPU::getValueMapping(BankID: AMDGPU::VCCRegBankID, Size: 1);
4988 OpdsMapping[3] = AMDGPU::getValueMapping(BankID: AMDGPU::SGPRRegBankID, Size);
4989 break;
4990 }
4991 case Intrinsic::amdgcn_permlane16:
4992 case Intrinsic::amdgcn_permlanex16: {
4993 unsigned Size = getSizeInBits(Reg: MI.getOperand(i: 0).getReg(), MRI, TRI: *TRI);
4994 OpdsMapping[0] = AMDGPU::getValueMapping(BankID: AMDGPU::VGPRRegBankID, Size);
4995 OpdsMapping[2] = AMDGPU::getValueMapping(BankID: AMDGPU::VGPRRegBankID, Size);
4996 OpdsMapping[3] = AMDGPU::getValueMapping(BankID: AMDGPU::VGPRRegBankID, Size);
4997 OpdsMapping[4] = getSGPROpMapping(Reg: MI.getOperand(i: 3).getReg(), MRI, TRI: *TRI);
4998 OpdsMapping[5] = getSGPROpMapping(Reg: MI.getOperand(i: 4).getReg(), MRI, TRI: *TRI);
4999 break;
5000 }
5001 case Intrinsic::amdgcn_permlane_bcast:
5002 case Intrinsic::amdgcn_permlane_up:
5003 case Intrinsic::amdgcn_permlane_down:
5004 case Intrinsic::amdgcn_permlane_xor: {
5005 unsigned Size = getSizeInBits(Reg: MI.getOperand(i: 0).getReg(), MRI, TRI: *TRI);
5006 OpdsMapping[0] = AMDGPU::getValueMapping(BankID: AMDGPU::VGPRRegBankID, Size);
5007 OpdsMapping[2] = AMDGPU::getValueMapping(BankID: AMDGPU::VGPRRegBankID, Size);
5008 OpdsMapping[3] = getSGPROpMapping(Reg: MI.getOperand(i: 2).getReg(), MRI, TRI: *TRI);
5009 OpdsMapping[4] = getSGPROpMapping(Reg: MI.getOperand(i: 3).getReg(), MRI, TRI: *TRI);
5010 break;
5011 }
5012 case Intrinsic::amdgcn_permlane_idx_gen: {
5013 unsigned Size = getSizeInBits(Reg: MI.getOperand(i: 0).getReg(), MRI, TRI: *TRI);
5014 OpdsMapping[0] = AMDGPU::getValueMapping(BankID: AMDGPU::VGPRRegBankID, Size);
5015 OpdsMapping[2] = AMDGPU::getValueMapping(BankID: AMDGPU::VGPRRegBankID, Size);
5016 OpdsMapping[3] = getSGPROpMapping(Reg: MI.getOperand(i: 2).getReg(), MRI, TRI: *TRI);
5017 break;
5018 }
5019 case Intrinsic::amdgcn_permlane16_var:
5020 case Intrinsic::amdgcn_permlanex16_var: {
5021 unsigned Size = getSizeInBits(Reg: MI.getOperand(i: 0).getReg(), MRI, TRI: *TRI);
5022 OpdsMapping[0] = AMDGPU::getValueMapping(BankID: AMDGPU::VGPRRegBankID, Size);
5023 OpdsMapping[2] = AMDGPU::getValueMapping(BankID: AMDGPU::VGPRRegBankID, Size);
5024 OpdsMapping[3] = AMDGPU::getValueMapping(BankID: AMDGPU::VGPRRegBankID, Size);
5025 OpdsMapping[4] = AMDGPU::getValueMapping(BankID: AMDGPU::VGPRRegBankID, Size);
5026 break;
5027 }
5028 case Intrinsic::amdgcn_mfma_f32_4x4x1f32:
5029 case Intrinsic::amdgcn_mfma_f32_4x4x4f16:
5030 case Intrinsic::amdgcn_mfma_i32_4x4x4i8:
5031 case Intrinsic::amdgcn_mfma_f32_4x4x2bf16:
5032 case Intrinsic::amdgcn_mfma_f32_16x16x1f32:
5033 case Intrinsic::amdgcn_mfma_f32_16x16x4f32:
5034 case Intrinsic::amdgcn_mfma_f32_16x16x4f16:
5035 case Intrinsic::amdgcn_mfma_f32_16x16x16f16:
5036 case Intrinsic::amdgcn_mfma_i32_16x16x4i8:
5037 case Intrinsic::amdgcn_mfma_i32_16x16x16i8:
5038 case Intrinsic::amdgcn_mfma_f32_16x16x2bf16:
5039 case Intrinsic::amdgcn_mfma_f32_16x16x8bf16:
5040 case Intrinsic::amdgcn_mfma_f32_32x32x1f32:
5041 case Intrinsic::amdgcn_mfma_f32_32x32x2f32:
5042 case Intrinsic::amdgcn_mfma_f32_32x32x4f16:
5043 case Intrinsic::amdgcn_mfma_f32_32x32x8f16:
5044 case Intrinsic::amdgcn_mfma_i32_32x32x4i8:
5045 case Intrinsic::amdgcn_mfma_i32_32x32x8i8:
5046 case Intrinsic::amdgcn_mfma_f32_32x32x2bf16:
5047 case Intrinsic::amdgcn_mfma_f32_32x32x4bf16:
5048 case Intrinsic::amdgcn_mfma_f32_32x32x4bf16_1k:
5049 case Intrinsic::amdgcn_mfma_f32_16x16x4bf16_1k:
5050 case Intrinsic::amdgcn_mfma_f32_4x4x4bf16_1k:
5051 case Intrinsic::amdgcn_mfma_f32_32x32x8bf16_1k:
5052 case Intrinsic::amdgcn_mfma_f32_16x16x16bf16_1k:
5053 case Intrinsic::amdgcn_mfma_f64_16x16x4f64:
5054 case Intrinsic::amdgcn_mfma_f64_4x4x4f64:
5055 case Intrinsic::amdgcn_mfma_i32_16x16x32_i8:
5056 case Intrinsic::amdgcn_mfma_i32_32x32x16_i8:
5057 case Intrinsic::amdgcn_mfma_f32_16x16x8_xf32:
5058 case Intrinsic::amdgcn_mfma_f32_32x32x4_xf32:
5059 case Intrinsic::amdgcn_mfma_f32_16x16x32_bf8_bf8:
5060 case Intrinsic::amdgcn_mfma_f32_16x16x32_bf8_fp8:
5061 case Intrinsic::amdgcn_mfma_f32_16x16x32_fp8_bf8:
5062 case Intrinsic::amdgcn_mfma_f32_16x16x32_fp8_fp8:
5063 case Intrinsic::amdgcn_mfma_f32_32x32x16_bf8_bf8:
5064 case Intrinsic::amdgcn_mfma_f32_32x32x16_bf8_fp8:
5065 case Intrinsic::amdgcn_mfma_f32_32x32x16_fp8_bf8:
5066 case Intrinsic::amdgcn_mfma_f32_32x32x16_fp8_fp8:
5067 case Intrinsic::amdgcn_mfma_f32_16x16x32_f16:
5068 case Intrinsic::amdgcn_mfma_f32_32x32x16_f16:
5069 case Intrinsic::amdgcn_mfma_i32_16x16x64_i8:
5070 case Intrinsic::amdgcn_mfma_i32_32x32x32_i8:
5071 case Intrinsic::amdgcn_mfma_f32_16x16x32_bf16: {
5072 unsigned DstSize = MRI.getType(Reg: MI.getOperand(i: 0).getReg()).getSizeInBits();
5073 unsigned MinNumRegsRequired = DstSize / 32;
5074
5075 // Default for MAI intrinsics.
5076 // srcC can also be an immediate which can be folded later.
5077 // FIXME: Should we eventually add an alternative mapping with AGPR src
5078 // for srcA/srcB?
5079 //
5080 // vdst, srcA, srcB, srcC
5081 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
5082
5083 bool UseAGPRForm = !Subtarget.hasGFX90AInsts() ||
5084 Info->selectAGPRFormMFMA(NumRegs: MinNumRegsRequired);
5085
5086 OpdsMapping[0] =
5087 UseAGPRForm ? getAGPROpMapping(Reg: MI.getOperand(i: 0).getReg(), MRI, TRI: *TRI)
5088 : getVGPROpMapping(Reg: MI.getOperand(i: 0).getReg(), MRI, TRI: *TRI);
5089 OpdsMapping[2] = getVGPROpMapping(Reg: MI.getOperand(i: 2).getReg(), MRI, TRI: *TRI);
5090 OpdsMapping[3] = getVGPROpMapping(Reg: MI.getOperand(i: 3).getReg(), MRI, TRI: *TRI);
5091 OpdsMapping[4] =
5092 UseAGPRForm ? getAGPROpMapping(Reg: MI.getOperand(i: 4).getReg(), MRI, TRI: *TRI)
5093 : getVGPROpMapping(Reg: MI.getOperand(i: 4).getReg(), MRI, TRI: *TRI);
5094 break;
5095 }
5096 case Intrinsic::amdgcn_mfma_scale_f32_16x16x128_f8f6f4:
5097 case Intrinsic::amdgcn_mfma_scale_f32_32x32x64_f8f6f4: {
5098 unsigned DstSize = MRI.getType(Reg: MI.getOperand(i: 0).getReg()).getSizeInBits();
5099 unsigned MinNumRegsRequired = DstSize / 32;
5100
5101 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
5102 bool UseAGPRForm = Info->selectAGPRFormMFMA(NumRegs: MinNumRegsRequired);
5103
5104 OpdsMapping[0] =
5105 UseAGPRForm ? getAGPROpMapping(Reg: MI.getOperand(i: 0).getReg(), MRI, TRI: *TRI)
5106 : getVGPROpMapping(Reg: MI.getOperand(i: 0).getReg(), MRI, TRI: *TRI);
5107
5108 OpdsMapping[2] = getVGPROpMapping(Reg: MI.getOperand(i: 2).getReg(), MRI, TRI: *TRI);
5109 OpdsMapping[3] = getVGPROpMapping(Reg: MI.getOperand(i: 3).getReg(), MRI, TRI: *TRI);
5110 OpdsMapping[4] =
5111 UseAGPRForm ? getAGPROpMapping(Reg: MI.getOperand(i: 4).getReg(), MRI, TRI: *TRI)
5112 : getVGPROpMapping(Reg: MI.getOperand(i: 4).getReg(), MRI, TRI: *TRI);
5113
5114 OpdsMapping[8] = getVGPROpMapping(Reg: MI.getOperand(i: 8).getReg(), MRI, TRI: *TRI);
5115 OpdsMapping[10] = getVGPROpMapping(Reg: MI.getOperand(i: 10).getReg(), MRI, TRI: *TRI);
5116 break;
5117 }
5118 case Intrinsic::amdgcn_smfmac_f32_16x16x32_f16:
5119 case Intrinsic::amdgcn_smfmac_f32_32x32x16_f16:
5120 case Intrinsic::amdgcn_smfmac_f32_16x16x32_bf16:
5121 case Intrinsic::amdgcn_smfmac_f32_32x32x16_bf16:
5122 case Intrinsic::amdgcn_smfmac_i32_16x16x64_i8:
5123 case Intrinsic::amdgcn_smfmac_i32_32x32x32_i8:
5124 case Intrinsic::amdgcn_smfmac_f32_16x16x64_bf8_bf8:
5125 case Intrinsic::amdgcn_smfmac_f32_16x16x64_bf8_fp8:
5126 case Intrinsic::amdgcn_smfmac_f32_16x16x64_fp8_bf8:
5127 case Intrinsic::amdgcn_smfmac_f32_16x16x64_fp8_fp8:
5128 case Intrinsic::amdgcn_smfmac_f32_32x32x32_bf8_bf8:
5129 case Intrinsic::amdgcn_smfmac_f32_32x32x32_bf8_fp8:
5130 case Intrinsic::amdgcn_smfmac_f32_32x32x32_fp8_bf8:
5131 case Intrinsic::amdgcn_smfmac_f32_32x32x32_fp8_fp8:
5132 case Intrinsic::amdgcn_smfmac_f32_16x16x64_f16:
5133 case Intrinsic::amdgcn_smfmac_f32_32x32x32_f16:
5134 case Intrinsic::amdgcn_smfmac_f32_16x16x64_bf16:
5135 case Intrinsic::amdgcn_smfmac_f32_32x32x32_bf16:
5136 case Intrinsic::amdgcn_smfmac_i32_16x16x128_i8:
5137 case Intrinsic::amdgcn_smfmac_i32_32x32x64_i8:
5138 case Intrinsic::amdgcn_smfmac_f32_16x16x128_bf8_bf8:
5139 case Intrinsic::amdgcn_smfmac_f32_16x16x128_bf8_fp8:
5140 case Intrinsic::amdgcn_smfmac_f32_16x16x128_fp8_bf8:
5141 case Intrinsic::amdgcn_smfmac_f32_16x16x128_fp8_fp8:
5142 case Intrinsic::amdgcn_smfmac_f32_32x32x64_bf8_bf8:
5143 case Intrinsic::amdgcn_smfmac_f32_32x32x64_bf8_fp8:
5144 case Intrinsic::amdgcn_smfmac_f32_32x32x64_fp8_bf8:
5145 case Intrinsic::amdgcn_smfmac_f32_32x32x64_fp8_fp8: {
5146 Register DstReg = MI.getOperand(i: 0).getReg();
5147 unsigned DstSize = MRI.getType(Reg: DstReg).getSizeInBits();
5148 unsigned MinNumRegsRequired = DstSize / 32;
5149 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
5150 bool UseAGPRForm = Info->selectAGPRFormMFMA(NumRegs: MinNumRegsRequired);
5151
5152 // vdst, srcA, srcB, srcC, idx
5153 OpdsMapping[0] = UseAGPRForm ? getAGPROpMapping(Reg: DstReg, MRI, TRI: *TRI)
5154 : getVGPROpMapping(Reg: DstReg, MRI, TRI: *TRI);
5155
5156 OpdsMapping[2] = getVGPROpMapping(Reg: MI.getOperand(i: 2).getReg(), MRI, TRI: *TRI);
5157 OpdsMapping[3] = getVGPROpMapping(Reg: MI.getOperand(i: 3).getReg(), MRI, TRI: *TRI);
5158 OpdsMapping[4] =
5159 UseAGPRForm ? getAGPROpMapping(Reg: MI.getOperand(i: 4).getReg(), MRI, TRI: *TRI)
5160 : getVGPROpMapping(Reg: MI.getOperand(i: 4).getReg(), MRI, TRI: *TRI);
5161 OpdsMapping[5] = getVGPROpMapping(Reg: MI.getOperand(i: 5).getReg(), MRI, TRI: *TRI);
5162 break;
5163 }
5164 case Intrinsic::amdgcn_interp_p1:
5165 case Intrinsic::amdgcn_interp_p2:
5166 case Intrinsic::amdgcn_interp_mov:
5167 case Intrinsic::amdgcn_interp_p1_f16:
5168 case Intrinsic::amdgcn_interp_p2_f16:
5169 case Intrinsic::amdgcn_lds_param_load: {
5170 const int M0Idx = MI.getNumOperands() - 1;
5171 Register M0Reg = MI.getOperand(i: M0Idx).getReg();
5172 unsigned M0Bank = getRegBankID(Reg: M0Reg, MRI, Default: AMDGPU::SGPRRegBankID);
5173 unsigned DstSize = MRI.getType(Reg: MI.getOperand(i: 0).getReg()).getSizeInBits();
5174
5175 OpdsMapping[0] = AMDGPU::getValueMapping(BankID: AMDGPU::VGPRRegBankID, Size: DstSize);
5176 for (int I = 2; I != M0Idx && MI.getOperand(i: I).isReg(); ++I)
5177 OpdsMapping[I] = AMDGPU::getValueMapping(BankID: AMDGPU::VGPRRegBankID, Size: 32);
5178
5179 // Must be SGPR, but we must take whatever the original bank is and fix it
5180 // later.
5181 OpdsMapping[M0Idx] = AMDGPU::getValueMapping(BankID: M0Bank, Size: 32);
5182 break;
5183 }
5184 case Intrinsic::amdgcn_interp_inreg_p10:
5185 case Intrinsic::amdgcn_interp_inreg_p2:
5186 case Intrinsic::amdgcn_interp_inreg_p10_f16:
5187 case Intrinsic::amdgcn_interp_inreg_p2_f16:
5188 case Intrinsic::amdgcn_interp_p10_rtz_f16:
5189 case Intrinsic::amdgcn_interp_p2_rtz_f16: {
5190 unsigned DstSize = MRI.getType(Reg: MI.getOperand(i: 0).getReg()).getSizeInBits();
5191 OpdsMapping[0] = AMDGPU::getValueMapping(BankID: AMDGPU::VGPRRegBankID, Size: DstSize);
5192 OpdsMapping[2] = AMDGPU::getValueMapping(BankID: AMDGPU::VGPRRegBankID, Size: 32);
5193 OpdsMapping[3] = AMDGPU::getValueMapping(BankID: AMDGPU::VGPRRegBankID, Size: 32);
5194 OpdsMapping[4] = AMDGPU::getValueMapping(BankID: AMDGPU::VGPRRegBankID, Size: 32);
5195 break;
5196 }
5197 case Intrinsic::amdgcn_permlane16_swap:
5198 case Intrinsic::amdgcn_permlane32_swap: {
5199 unsigned DstSize = MRI.getType(Reg: MI.getOperand(i: 0).getReg()).getSizeInBits();
5200 OpdsMapping[0] = OpdsMapping[1] = OpdsMapping[3] = OpdsMapping[4] =
5201 AMDGPU::getValueMapping(BankID: AMDGPU::VGPRRegBankID, Size: DstSize);
5202 break;
5203 }
5204 case Intrinsic::amdgcn_ballot: {
5205 unsigned DstSize = MRI.getType(Reg: MI.getOperand(i: 0).getReg()).getSizeInBits();
5206 unsigned SrcSize = MRI.getType(Reg: MI.getOperand(i: 2).getReg()).getSizeInBits();
5207 OpdsMapping[0] = AMDGPU::getValueMapping(BankID: AMDGPU::SGPRRegBankID, Size: DstSize);
5208 OpdsMapping[2] = AMDGPU::getValueMapping(BankID: AMDGPU::VCCRegBankID, Size: SrcSize);
5209 break;
5210 }
5211 case Intrinsic::amdgcn_inverse_ballot: {
5212 // This must be an SGPR, but accept a VGPR.
5213 Register MaskReg = MI.getOperand(i: 2).getReg();
5214 unsigned MaskSize = MRI.getType(Reg: MaskReg).getSizeInBits();
5215 unsigned MaskBank = getRegBankID(Reg: MaskReg, MRI, Default: AMDGPU::SGPRRegBankID);
5216 OpdsMapping[0] = AMDGPU::getValueMapping(BankID: AMDGPU::VCCRegBankID, Size: 1);
5217 OpdsMapping[2] = AMDGPU::getValueMapping(BankID: MaskBank, Size: MaskSize);
5218 break;
5219 }
5220 case Intrinsic::amdgcn_bitop3: {
5221 unsigned Size = getSizeInBits(Reg: MI.getOperand(i: 0).getReg(), MRI, TRI: *TRI);
5222 OpdsMapping[0] = AMDGPU::getValueMapping(BankID: AMDGPU::VGPRRegBankID, Size);
5223 OpdsMapping[2] = AMDGPU::getValueMapping(BankID: AMDGPU::VGPRRegBankID, Size);
5224 OpdsMapping[3] = AMDGPU::getValueMapping(BankID: AMDGPU::VGPRRegBankID, Size);
5225 OpdsMapping[4] = AMDGPU::getValueMapping(BankID: AMDGPU::VGPRRegBankID, Size);
5226 break;
5227 }
5228 case Intrinsic::amdgcn_s_quadmask:
5229 case Intrinsic::amdgcn_s_wqm: {
5230 Register MaskReg = MI.getOperand(i: 2).getReg();
5231 unsigned MaskSize = MRI.getType(Reg: MaskReg).getSizeInBits();
5232 unsigned MaskBank = getRegBankID(Reg: MaskReg, MRI, Default: AMDGPU::SGPRRegBankID);
5233 OpdsMapping[0] = AMDGPU::getValueMapping(BankID: AMDGPU::SGPRRegBankID, Size: MaskSize);
5234 OpdsMapping[2] = AMDGPU::getValueMapping(BankID: MaskBank, Size: MaskSize);
5235 break;
5236 }
5237 case Intrinsic::amdgcn_wave_reduce_add:
5238 case Intrinsic::amdgcn_wave_reduce_fadd:
5239 case Intrinsic::amdgcn_wave_reduce_sub:
5240 case Intrinsic::amdgcn_wave_reduce_fsub:
5241 case Intrinsic::amdgcn_wave_reduce_min:
5242 case Intrinsic::amdgcn_wave_reduce_umin:
5243 case Intrinsic::amdgcn_wave_reduce_fmin:
5244 case Intrinsic::amdgcn_wave_reduce_max:
5245 case Intrinsic::amdgcn_wave_reduce_umax:
5246 case Intrinsic::amdgcn_wave_reduce_fmax:
5247 case Intrinsic::amdgcn_wave_reduce_and:
5248 case Intrinsic::amdgcn_wave_reduce_or:
5249 case Intrinsic::amdgcn_wave_reduce_xor: {
5250 unsigned DstSize = MRI.getType(Reg: MI.getOperand(i: 0).getReg()).getSizeInBits();
5251 OpdsMapping[0] = AMDGPU::getValueMapping(BankID: AMDGPU::SGPRRegBankID, Size: DstSize);
5252 unsigned OpSize = MRI.getType(Reg: MI.getOperand(i: 2).getReg()).getSizeInBits();
5253 auto regBankID =
5254 isSALUMapping(MI) ? AMDGPU::SGPRRegBankID : AMDGPU::VGPRRegBankID;
5255 OpdsMapping[2] = AMDGPU::getValueMapping(BankID: regBankID, Size: OpSize);
5256 break;
5257 }
5258 case Intrinsic::amdgcn_s_bitreplicate: {
5259 Register MaskReg = MI.getOperand(i: 2).getReg();
5260 unsigned MaskBank = getRegBankID(Reg: MaskReg, MRI, Default: AMDGPU::SGPRRegBankID);
5261 OpdsMapping[0] = AMDGPU::getValueMapping(BankID: AMDGPU::SGPRRegBankID, Size: 64);
5262 OpdsMapping[2] = AMDGPU::getValueMapping(BankID: MaskBank, Size: 32);
5263 break;
5264 }
5265 case Intrinsic::amdgcn_wave_shuffle: {
5266 unsigned OpSize = MRI.getType(Reg: MI.getOperand(i: 0).getReg()).getSizeInBits();
5267 OpdsMapping[0] = AMDGPU::getValueMapping(BankID: AMDGPU::VGPRRegBankID, Size: OpSize);
5268 OpdsMapping[2] = AMDGPU::getValueMapping(BankID: AMDGPU::VGPRRegBankID, Size: OpSize);
5269 OpdsMapping[3] = AMDGPU::getValueMapping(BankID: AMDGPU::VGPRRegBankID, Size: OpSize);
5270 break;
5271 }
5272 }
5273 break;
5274 }
5275 case AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD:
5276 case AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD_D16:
5277 case AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD_NORET:
5278 case AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE:
5279 case AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE_D16: {
5280 auto IntrID = AMDGPU::getIntrinsicID(I: MI);
5281 const AMDGPU::RsrcIntrinsic *RSrcIntrin = AMDGPU::lookupRsrcIntrinsic(Intr: IntrID);
5282 assert(RSrcIntrin && "missing RsrcIntrinsic for image intrinsic");
5283 // Non-images can have complications from operands that allow both SGPR
5284 // and VGPR. For now it's too complicated to figure out the final opcode
5285 // to derive the register bank from the MCInstrDesc.
5286 assert(RSrcIntrin->IsImage);
5287 return getImageMapping(MRI, MI, RsrcIdx: RSrcIntrin->RsrcArg);
5288 }
5289 case AMDGPU::G_AMDGPU_BVH_INTERSECT_RAY:
5290 case AMDGPU::G_AMDGPU_BVH8_INTERSECT_RAY:
5291 case AMDGPU::G_AMDGPU_BVH_DUAL_INTERSECT_RAY: {
5292 bool IsDualOrBVH8 =
5293 MI.getOpcode() == AMDGPU::G_AMDGPU_BVH_DUAL_INTERSECT_RAY ||
5294 MI.getOpcode() == AMDGPU::G_AMDGPU_BVH8_INTERSECT_RAY;
5295 unsigned NumMods = IsDualOrBVH8 ? 0 : 1; // Has A16 modifier
5296 unsigned LastRegOpIdx = MI.getNumExplicitOperands() - 1 - NumMods;
5297 unsigned DstSize = MRI.getType(Reg: MI.getOperand(i: 0).getReg()).getSizeInBits();
5298 OpdsMapping[0] = AMDGPU::getValueMapping(BankID: AMDGPU::VGPRRegBankID, Size: DstSize);
5299 if (IsDualOrBVH8) {
5300 OpdsMapping[1] = AMDGPU::getValueMapping(
5301 BankID: AMDGPU::VGPRRegBankID,
5302 Size: MRI.getType(Reg: MI.getOperand(i: 1).getReg()).getSizeInBits());
5303 OpdsMapping[2] = AMDGPU::getValueMapping(
5304 BankID: AMDGPU::VGPRRegBankID,
5305 Size: MRI.getType(Reg: MI.getOperand(i: 2).getReg()).getSizeInBits());
5306 }
5307 OpdsMapping[LastRegOpIdx] =
5308 getSGPROpMapping(Reg: MI.getOperand(i: LastRegOpIdx).getReg(), MRI, TRI: *TRI);
5309 if (LastRegOpIdx == 3) {
5310 // Sequential form: all operands combined into VGPR256/VGPR512
5311 unsigned Size = MRI.getType(Reg: MI.getOperand(i: 2).getReg()).getSizeInBits();
5312 if (Size > 256)
5313 Size = 512;
5314 OpdsMapping[2] = AMDGPU::getValueMapping(BankID: AMDGPU::VGPRRegBankID, Size);
5315 } else {
5316 // NSA form
5317 unsigned FirstSrcOpIdx = IsDualOrBVH8 ? 4 : 2;
5318 for (unsigned I = FirstSrcOpIdx; I < LastRegOpIdx; ++I) {
5319 unsigned Size = MRI.getType(Reg: MI.getOperand(i: I).getReg()).getSizeInBits();
5320 OpdsMapping[I] = AMDGPU::getValueMapping(BankID: AMDGPU::VGPRRegBankID, Size);
5321 }
5322 }
5323 break;
5324 }
// Side-effecting intrinsics: pick operand register banks per intrinsic ID.
// The pattern throughout: scalar (s_*) intrinsics and descriptors/offsets use
// SGPR mappings, divergent data/addresses use VGPR mappings, and operands that
// feed M0 "must be SGPR but accept a VGPR" so applyMapping can repair them.
5325 case AMDGPU::G_INTRINSIC_W_SIDE_EFFECTS:
5326 case AMDGPU::G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS: {
5327 auto IntrID = cast<GIntrinsic>(Val: MI).getIntrinsicID();
5328 switch (IntrID) {
// Scalar reads: results are uniform by construction, so map dst to SGPR.
5329 case Intrinsic::amdgcn_s_getreg:
5330 case Intrinsic::amdgcn_s_memtime:
5331 case Intrinsic::amdgcn_s_memrealtime:
5332 case Intrinsic::amdgcn_s_get_waveid_in_workgroup:
5333 case Intrinsic::amdgcn_s_sendmsg_rtn: {
5334 unsigned Size = MRI.getType(Reg: MI.getOperand(i: 0).getReg()).getSizeInBits();
5335 OpdsMapping[0] = AMDGPU::getValueMapping(BankID: AMDGPU::SGPRRegBankID, Size);
5336 break;
5337 }
// Flat/global atomics and the DS/global transpose-load family take all
// register operands in VGPRs.
5338 case Intrinsic::amdgcn_global_atomic_fmin_num:
5339 case Intrinsic::amdgcn_global_atomic_fmax_num:
5340 case Intrinsic::amdgcn_flat_atomic_fmin_num:
5341 case Intrinsic::amdgcn_flat_atomic_fmax_num:
5342 case Intrinsic::amdgcn_global_atomic_ordered_add_b64:
5343 case Intrinsic::amdgcn_global_load_tr_b64:
5344 case Intrinsic::amdgcn_global_load_tr_b128:
5345 case Intrinsic::amdgcn_global_load_tr4_b64:
5346 case Intrinsic::amdgcn_global_load_tr6_b96:
5347 case Intrinsic::amdgcn_ds_load_tr8_b64:
5348 case Intrinsic::amdgcn_ds_load_tr16_b128:
5349 case Intrinsic::amdgcn_ds_load_tr4_b64:
5350 case Intrinsic::amdgcn_ds_load_tr6_b96:
5351 case Intrinsic::amdgcn_ds_read_tr4_b64:
5352 case Intrinsic::amdgcn_ds_read_tr6_b96:
5353 case Intrinsic::amdgcn_ds_read_tr8_b64:
5354 case Intrinsic::amdgcn_ds_read_tr16_b64:
5355 case Intrinsic::amdgcn_ds_atomic_async_barrier_arrive_b64:
5356 case Intrinsic::amdgcn_ds_atomic_barrier_arrive_rtn_b64:
5357 return getDefaultMappingAllVGPR(MI);
5358 case Intrinsic::amdgcn_ds_ordered_add:
5359 case Intrinsic::amdgcn_ds_ordered_swap: {
// dst (0) and data (3) are VGPRs; operand 2 defaults to SGPR but keeps a
// VGPR bank if it already has one (fixed up later).
5360 unsigned DstSize = MRI.getType(Reg: MI.getOperand(i: 0).getReg()).getSizeInBits();
5361 OpdsMapping[0] = AMDGPU::getValueMapping(BankID: AMDGPU::VGPRRegBankID, Size: DstSize);
5362 unsigned M0Bank = getRegBankID(Reg: MI.getOperand(i: 2).getReg(), MRI,
5363 Default: AMDGPU::SGPRRegBankID);
5364 OpdsMapping[2] = AMDGPU::getValueMapping(BankID: M0Bank, Size: 32);
5365 OpdsMapping[3] = AMDGPU::getValueMapping(BankID: AMDGPU::VGPRRegBankID, Size: 32);
5366 break;
5367 }
5368 case Intrinsic::amdgcn_ds_append:
5369 case Intrinsic::amdgcn_ds_consume: {
5370 unsigned DstSize = MRI.getType(Reg: MI.getOperand(i: 0).getReg()).getSizeInBits();
5371 OpdsMapping[0] = AMDGPU::getValueMapping(BankID: AMDGPU::VGPRRegBankID, Size: DstSize);
5372 OpdsMapping[2] = getSGPROpMapping(Reg: MI.getOperand(i: 2).getReg(), MRI, TRI: *TRI);
5373 break;
5374 }
// Export intrinsics: all data operands are 32-bit VGPRs.
5375 case Intrinsic::amdgcn_exp_compr:
5376 OpdsMapping[3] = AMDGPU::getValueMapping(BankID: AMDGPU::VGPRRegBankID, Size: 32);
5377 OpdsMapping[4] = AMDGPU::getValueMapping(BankID: AMDGPU::VGPRRegBankID, Size: 32);
5378 break;
5379 case Intrinsic::amdgcn_exp:
5380 // FIXME: Could we support packed types here?
5381 OpdsMapping[3] = AMDGPU::getValueMapping(BankID: AMDGPU::VGPRRegBankID, Size: 32);
5382 OpdsMapping[4] = AMDGPU::getValueMapping(BankID: AMDGPU::VGPRRegBankID, Size: 32);
5383 OpdsMapping[5] = AMDGPU::getValueMapping(BankID: AMDGPU::VGPRRegBankID, Size: 32);
5384 OpdsMapping[6] = AMDGPU::getValueMapping(BankID: AMDGPU::VGPRRegBankID, Size: 32);
5385 break;
5386 case Intrinsic::amdgcn_exp_row:
5387 OpdsMapping[3] = AMDGPU::getValueMapping(BankID: AMDGPU::VGPRRegBankID, Size: 32);
5388 OpdsMapping[4] = AMDGPU::getValueMapping(BankID: AMDGPU::VGPRRegBankID, Size: 32);
5389 OpdsMapping[5] = AMDGPU::getValueMapping(BankID: AMDGPU::VGPRRegBankID, Size: 32);
5390 OpdsMapping[6] = AMDGPU::getValueMapping(BankID: AMDGPU::VGPRRegBankID, Size: 32);
// Row index (operand 8) must be uniform.
5391 OpdsMapping[8] = getSGPROpMapping(Reg: MI.getOperand(i: 8).getReg(), MRI, TRI: *TRI);
5392 break;
5393 case Intrinsic::amdgcn_s_alloc_vgpr:
// 1-bit success flag and 32-bit count, both scalar.
5394 OpdsMapping[0] = AMDGPU::getValueMapping(BankID: AMDGPU::SGPRRegBankID, Size: 1);
5395 OpdsMapping[2] = AMDGPU::getValueMapping(BankID: AMDGPU::SGPRRegBankID, Size: 32);
5396 break;
5397 case Intrinsic::amdgcn_s_sendmsg:
5398 case Intrinsic::amdgcn_s_sendmsghalt: {
5399 // This must be an SGPR, but accept a VGPR.
5400 unsigned Bank = getRegBankID(Reg: MI.getOperand(i: 2).getReg(), MRI,
5401 Default: AMDGPU::SGPRRegBankID);
5402 OpdsMapping[2] = AMDGPU::getValueMapping(BankID: Bank, Size: 32);
5403 break;
5404 }
5405 case Intrinsic::amdgcn_s_setreg: {
5406 // This must be an SGPR, but accept a VGPR.
5407 unsigned Bank = getRegBankID(Reg: MI.getOperand(i: 2).getReg(), MRI,
5408 Default: AMDGPU::SGPRRegBankID);
5409 OpdsMapping[2] = AMDGPU::getValueMapping(BankID: Bank, Size: 32);
5410 break;
5411 }
5412 case Intrinsic::amdgcn_s_ttracedata: {
5413 // This must be an SGPR, but accept a VGPR.
5414 unsigned Bank =
5415 getRegBankID(Reg: MI.getOperand(i: 1).getReg(), MRI, Default: AMDGPU::SGPRRegBankID);
5416 OpdsMapping[1] = AMDGPU::getValueMapping(BankID: Bank, Size: 32);
5417 break;
5418 }
// Control-flow intrinsics: exec-mask values are wave-sized scalars.
5419 case Intrinsic::amdgcn_end_cf: {
5420 unsigned Size = getSizeInBits(Reg: MI.getOperand(i: 1).getReg(), MRI, TRI: *TRI);
5421 OpdsMapping[1] = AMDGPU::getValueMapping(BankID: AMDGPU::SGPRRegBankID, Size);
5422 break;
5423 }
5424 case Intrinsic::amdgcn_else: {
// Result 0 is the lane condition (VCC); results/operands 1 and 3 are the
// wave-sized exec masks.
5425 unsigned WaveSize = getSizeInBits(Reg: MI.getOperand(i: 1).getReg(), MRI, TRI: *TRI);
5426 OpdsMapping[0] = AMDGPU::getValueMapping(BankID: AMDGPU::VCCRegBankID, Size: 1);
5427 OpdsMapping[1] = AMDGPU::getValueMapping(BankID: AMDGPU::SGPRRegBankID, Size: WaveSize);
5428 OpdsMapping[3] = AMDGPU::getValueMapping(BankID: AMDGPU::SGPRRegBankID, Size: WaveSize);
5429 break;
5430 }
5431 case Intrinsic::amdgcn_init_whole_wave:
5432 case Intrinsic::amdgcn_live_mask: {
5433 OpdsMapping[0] = AMDGPU::getValueMapping(BankID: AMDGPU::VCCRegBankID, Size: 1);
5434 break;
5435 }
5436 case Intrinsic::amdgcn_wqm_demote:
5437 case Intrinsic::amdgcn_kill: {
5438 OpdsMapping[1] = AMDGPU::getValueMapping(BankID: AMDGPU::VCCRegBankID, Size: 1);
5439 break;
5440 }
// Raw buffer loads: dst/voffset VGPR, rsrc/soffset SGPR.
5441 case Intrinsic::amdgcn_raw_buffer_load:
5442 case Intrinsic::amdgcn_raw_ptr_buffer_load:
5443 case Intrinsic::amdgcn_raw_atomic_buffer_load:
5444 case Intrinsic::amdgcn_raw_ptr_atomic_buffer_load:
5445 case Intrinsic::amdgcn_raw_tbuffer_load:
5446 case Intrinsic::amdgcn_raw_ptr_tbuffer_load: {
5447 // FIXME: Should make intrinsic ID the last operand of the instruction,
5448 // then this would be the same as store
5449 OpdsMapping[0] = getVGPROpMapping(Reg: MI.getOperand(i: 0).getReg(), MRI, TRI: *TRI);
5450 OpdsMapping[2] = getSGPROpMapping(Reg: MI.getOperand(i: 2).getReg(), MRI, TRI: *TRI);
5451 OpdsMapping[3] = getVGPROpMapping(Reg: MI.getOperand(i: 3).getReg(), MRI, TRI: *TRI);
5452 OpdsMapping[4] = getSGPROpMapping(Reg: MI.getOperand(i: 4).getReg(), MRI, TRI: *TRI);
5453 break;
5454 }
5455 case Intrinsic::amdgcn_raw_buffer_load_lds:
5456 case Intrinsic::amdgcn_raw_buffer_load_async_lds:
5457 case Intrinsic::amdgcn_raw_ptr_buffer_load_lds:
5458 case Intrinsic::amdgcn_raw_ptr_buffer_load_async_lds: {
5459 OpdsMapping[1] = getSGPROpMapping(Reg: MI.getOperand(i: 1).getReg(), MRI, TRI: *TRI);
5460 OpdsMapping[2] = getSGPROpMapping(Reg: MI.getOperand(i: 2).getReg(), MRI, TRI: *TRI);
5461 OpdsMapping[4] = getVGPROpMapping(Reg: MI.getOperand(i: 4).getReg(), MRI, TRI: *TRI);
5462 OpdsMapping[5] = getSGPROpMapping(Reg: MI.getOperand(i: 5).getReg(), MRI, TRI: *TRI);
5463 break;
5464 }
// Raw buffer stores: data/voffset VGPR, rsrc/soffset SGPR.
5465 case Intrinsic::amdgcn_raw_buffer_store:
5466 case Intrinsic::amdgcn_raw_ptr_buffer_store:
5467 case Intrinsic::amdgcn_raw_buffer_store_format:
5468 case Intrinsic::amdgcn_raw_ptr_buffer_store_format:
5469 case Intrinsic::amdgcn_raw_tbuffer_store:
5470 case Intrinsic::amdgcn_raw_ptr_tbuffer_store: {
5471 OpdsMapping[1] = getVGPROpMapping(Reg: MI.getOperand(i: 1).getReg(), MRI, TRI: *TRI);
5472 OpdsMapping[2] = getSGPROpMapping(Reg: MI.getOperand(i: 2).getReg(), MRI, TRI: *TRI);
5473 OpdsMapping[3] = getVGPROpMapping(Reg: MI.getOperand(i: 3).getReg(), MRI, TRI: *TRI);
5474 OpdsMapping[4] = getSGPROpMapping(Reg: MI.getOperand(i: 4).getReg(), MRI, TRI: *TRI);
5475 break;
5476 }
// Struct buffer loads add a VGPR vindex operand relative to the raw forms.
5477 case Intrinsic::amdgcn_struct_buffer_load:
5478 case Intrinsic::amdgcn_struct_ptr_buffer_load:
5479 case Intrinsic::amdgcn_struct_tbuffer_load:
5480 case Intrinsic::amdgcn_struct_ptr_tbuffer_load:
5481 case Intrinsic::amdgcn_struct_atomic_buffer_load:
5482 case Intrinsic::amdgcn_struct_ptr_atomic_buffer_load: {
5483 OpdsMapping[0] = getVGPROpMapping(Reg: MI.getOperand(i: 0).getReg(), MRI, TRI: *TRI);
5484 OpdsMapping[2] = getSGPROpMapping(Reg: MI.getOperand(i: 2).getReg(), MRI, TRI: *TRI);
5485 OpdsMapping[3] = getVGPROpMapping(Reg: MI.getOperand(i: 3).getReg(), MRI, TRI: *TRI);
5486 OpdsMapping[4] = getVGPROpMapping(Reg: MI.getOperand(i: 4).getReg(), MRI, TRI: *TRI);
5487 OpdsMapping[5] = getSGPROpMapping(Reg: MI.getOperand(i: 5).getReg(), MRI, TRI: *TRI);
5488 break;
5489 }
5490 case Intrinsic::amdgcn_struct_buffer_load_lds:
5491 case Intrinsic::amdgcn_struct_buffer_load_async_lds:
5492 case Intrinsic::amdgcn_struct_ptr_buffer_load_lds:
5493 case Intrinsic::amdgcn_struct_ptr_buffer_load_async_lds: {
5494 OpdsMapping[1] = getSGPROpMapping(Reg: MI.getOperand(i: 1).getReg(), MRI, TRI: *TRI);
5495 OpdsMapping[2] = getSGPROpMapping(Reg: MI.getOperand(i: 2).getReg(), MRI, TRI: *TRI);
5496 OpdsMapping[4] = getVGPROpMapping(Reg: MI.getOperand(i: 4).getReg(), MRI, TRI: *TRI);
5497 OpdsMapping[5] = getVGPROpMapping(Reg: MI.getOperand(i: 5).getReg(), MRI, TRI: *TRI);
5498 OpdsMapping[6] = getSGPROpMapping(Reg: MI.getOperand(i: 6).getReg(), MRI, TRI: *TRI);
5499 break;
5500 }
5501 case Intrinsic::amdgcn_struct_buffer_store:
5502 case Intrinsic::amdgcn_struct_ptr_buffer_store:
5503 case Intrinsic::amdgcn_struct_tbuffer_store:
5504 case Intrinsic::amdgcn_struct_ptr_tbuffer_store: {
5505 OpdsMapping[1] = getVGPROpMapping(Reg: MI.getOperand(i: 1).getReg(), MRI, TRI: *TRI);
5506 OpdsMapping[2] = getSGPROpMapping(Reg: MI.getOperand(i: 2).getReg(), MRI, TRI: *TRI);
5507 OpdsMapping[3] = getVGPROpMapping(Reg: MI.getOperand(i: 3).getReg(), MRI, TRI: *TRI);
5508 OpdsMapping[4] = getVGPROpMapping(Reg: MI.getOperand(i: 4).getReg(), MRI, TRI: *TRI);
5509 OpdsMapping[5] = getSGPROpMapping(Reg: MI.getOperand(i: 5).getReg(), MRI, TRI: *TRI);
5510 break;
5511 }
5512 case Intrinsic::amdgcn_init_exec_from_input: {
5513 unsigned Size = getSizeInBits(Reg: MI.getOperand(i: 1).getReg(), MRI, TRI: *TRI);
5514 OpdsMapping[1] = AMDGPU::getValueMapping(BankID: AMDGPU::SGPRRegBankID, Size);
5515 break;
5516 }
5517 case Intrinsic::amdgcn_ds_gws_init:
5518 case Intrinsic::amdgcn_ds_gws_barrier:
5519 case Intrinsic::amdgcn_ds_gws_sema_br: {
5520 OpdsMapping[1] = AMDGPU::getValueMapping(BankID: AMDGPU::VGPRRegBankID, Size: 32);
5521
5522 // This must be an SGPR, but accept a VGPR.
5523 unsigned Bank = getRegBankID(Reg: MI.getOperand(i: 2).getReg(), MRI,
5524 Default: AMDGPU::SGPRRegBankID);
5525 OpdsMapping[2] = AMDGPU::getValueMapping(BankID: Bank, Size: 32);
5526 break;
5527 }
5528 case Intrinsic::amdgcn_ds_gws_sema_v:
5529 case Intrinsic::amdgcn_ds_gws_sema_p:
5530 case Intrinsic::amdgcn_ds_gws_sema_release_all: {
5531 // This must be an SGPR, but accept a VGPR.
5532 unsigned Bank = getRegBankID(Reg: MI.getOperand(i: 1).getReg(), MRI,
5533 Default: AMDGPU::SGPRRegBankID);
5534 OpdsMapping[1] = AMDGPU::getValueMapping(BankID: Bank, Size: 32);
5535 break;
5536 }
5537 case Intrinsic::amdgcn_cluster_load_b32:
5538 case Intrinsic::amdgcn_cluster_load_b64:
5539 case Intrinsic::amdgcn_cluster_load_b128: {
// dst/addr VGPR; operand 4 defaults to SGPR but tolerates VGPR.
5540 OpdsMapping[0] = getVGPROpMapping(Reg: MI.getOperand(i: 0).getReg(), MRI, TRI: *TRI);
5541 OpdsMapping[2] = getVGPROpMapping(Reg: MI.getOperand(i: 2).getReg(), MRI, TRI: *TRI);
5542 unsigned M0Bank =
5543 getRegBankID(Reg: MI.getOperand(i: 4).getReg(), MRI, Default: AMDGPU::SGPRRegBankID);
5544 OpdsMapping[4] = AMDGPU::getValueMapping(BankID: M0Bank, Size: 32);
5545 break;
5546 }
5547 case Intrinsic::amdgcn_cluster_load_async_to_lds_b8:
5548 case Intrinsic::amdgcn_cluster_load_async_to_lds_b32:
5549 case Intrinsic::amdgcn_cluster_load_async_to_lds_b64:
5550 case Intrinsic::amdgcn_cluster_load_async_to_lds_b128: {
5551 OpdsMapping[1] = getVGPROpMapping(Reg: MI.getOperand(i: 1).getReg(), MRI, TRI: *TRI);
5552 OpdsMapping[2] = getSGPROpMapping(Reg: MI.getOperand(i: 2).getReg(), MRI, TRI: *TRI);
5553 unsigned M0Bank =
5554 getRegBankID(Reg: MI.getOperand(i: 5).getReg(), MRI, Default: AMDGPU::SGPRRegBankID);
5555 OpdsMapping[5] = AMDGPU::getValueMapping(BankID: M0Bank, Size: 32);
5556 break;
5557 }
// Global<->LDS async copies: address in VGPR (operand 1), LDS pointer in
// SGPR (operand 2).
5558 case Intrinsic::amdgcn_global_store_async_from_lds_b8:
5559 case Intrinsic::amdgcn_global_store_async_from_lds_b32:
5560 case Intrinsic::amdgcn_global_store_async_from_lds_b64:
5561 case Intrinsic::amdgcn_global_store_async_from_lds_b128:
5562 case Intrinsic::amdgcn_global_load_async_to_lds_b8:
5563 case Intrinsic::amdgcn_global_load_async_to_lds_b32:
5564 case Intrinsic::amdgcn_global_load_async_to_lds_b64:
5565 case Intrinsic::amdgcn_global_load_async_to_lds_b128:
5566 case Intrinsic::amdgcn_load_to_lds:
5567 case Intrinsic::amdgcn_global_load_lds: {
5568 OpdsMapping[1] = getVGPROpMapping(Reg: MI.getOperand(i: 1).getReg(), MRI, TRI: *TRI);
5569 OpdsMapping[2] = getSGPROpMapping(Reg: MI.getOperand(i: 2).getReg(), MRI, TRI: *TRI);
5570 break;
5571 }
5572 case Intrinsic::amdgcn_lds_direct_load: {
// The M0 operand is the last operand; all other register sources are
// 32-bit VGPRs.
5573 const int M0Idx = MI.getNumOperands() - 1;
5574 Register M0Reg = MI.getOperand(i: M0Idx).getReg();
5575 unsigned M0Bank = getRegBankID(Reg: M0Reg, MRI, Default: AMDGPU::SGPRRegBankID);
5576 unsigned DstSize = MRI.getType(Reg: MI.getOperand(i: 0).getReg()).getSizeInBits();
5577
5578 OpdsMapping[0] = AMDGPU::getValueMapping(BankID: AMDGPU::VGPRRegBankID, Size: DstSize);
5579 for (int I = 2; I != M0Idx && MI.getOperand(i: I).isReg(); ++I)
5580 OpdsMapping[I] = AMDGPU::getValueMapping(BankID: AMDGPU::VGPRRegBankID, Size: 32);
5581
5582 // Must be SGPR, but we must take whatever the original bank is and fix it
5583 // later.
5584 OpdsMapping[M0Idx] = AMDGPU::getValueMapping(BankID: M0Bank, Size: 32);
5585 break;
5586 }
5587 case Intrinsic::amdgcn_ds_add_gs_reg_rtn:
5588 case Intrinsic::amdgcn_ds_sub_gs_reg_rtn:
5589 OpdsMapping[0] = getVGPROpMapping(Reg: MI.getOperand(i: 0).getReg(), MRI, TRI: *TRI);
5590 OpdsMapping[2] = getVGPROpMapping(Reg: MI.getOperand(i: 2).getReg(), MRI, TRI: *TRI);
5591 break;
5592 case Intrinsic::amdgcn_ds_bvh_stack_rtn:
5593 case Intrinsic::amdgcn_ds_bvh_stack_push4_pop1_rtn:
5594 case Intrinsic::amdgcn_ds_bvh_stack_push8_pop1_rtn:
5595 case Intrinsic::amdgcn_ds_bvh_stack_push8_pop2_rtn: {
5596 OpdsMapping[0] =
5597 getVGPROpMapping(Reg: MI.getOperand(i: 0).getReg(), MRI, TRI: *TRI); // %vdst
5598 OpdsMapping[1] =
5599 getVGPROpMapping(Reg: MI.getOperand(i: 1).getReg(), MRI, TRI: *TRI); // %addr
5600 OpdsMapping[3] =
5601 getVGPROpMapping(Reg: MI.getOperand(i: 3).getReg(), MRI, TRI: *TRI); // %addr
5602 OpdsMapping[4] =
5603 getVGPROpMapping(Reg: MI.getOperand(i: 4).getReg(), MRI, TRI: *TRI); // %data0
5604 OpdsMapping[5] =
5605 getVGPROpMapping(Reg: MI.getOperand(i: 5).getReg(), MRI, TRI: *TRI); // %data1
5606 break;
5607 }
5608 case Intrinsic::amdgcn_s_sleep_var:
5609 OpdsMapping[1] = getSGPROpMapping(Reg: MI.getOperand(i: 1).getReg(), MRI, TRI: *TRI);
5610 break;
// Named-barrier management: barrier IDs/counts are scalar operands.
5611 case Intrinsic::amdgcn_s_barrier_join:
5612 case Intrinsic::amdgcn_s_wakeup_barrier:
5613 OpdsMapping[1] = getSGPROpMapping(Reg: MI.getOperand(i: 1).getReg(), MRI, TRI: *TRI);
5614 break;
5615 case Intrinsic::amdgcn_s_barrier_init:
5616 case Intrinsic::amdgcn_s_barrier_signal_var:
5617 OpdsMapping[1] = getSGPROpMapping(Reg: MI.getOperand(i: 1).getReg(), MRI, TRI: *TRI);
5618 OpdsMapping[2] = getSGPROpMapping(Reg: MI.getOperand(i: 2).getReg(), MRI, TRI: *TRI);
5619 break;
5620 case Intrinsic::amdgcn_s_barrier_signal_isfirst: {
// 1-bit uniform "is first waiter" result.
5621 const unsigned ResultSize = 1;
5622 OpdsMapping[0] =
5623 AMDGPU::getValueMapping(BankID: AMDGPU::SGPRRegBankID, Size: ResultSize);
5624 break;
5625 }
5626 case Intrinsic::amdgcn_s_get_barrier_state:
5627 case Intrinsic::amdgcn_s_get_named_barrier_state: {
5628 OpdsMapping[0] = getSGPROpMapping(Reg: MI.getOperand(i: 0).getReg(), MRI, TRI: *TRI);
5629 OpdsMapping[2] = getSGPROpMapping(Reg: MI.getOperand(i: 2).getReg(), MRI, TRI: *TRI);
5630 break;
5631 }
5632 case Intrinsic::amdgcn_pops_exiting_wave_id:
5633 return getDefaultMappingSOP(MI);
5634 case Intrinsic::amdgcn_tensor_load_to_lds:
5635 case Intrinsic::amdgcn_tensor_store_from_lds: {
5636 // Lie and claim everything is legal, even all operands need to be
5637 // SGPRs. applyMapping will have to deal with it with readfirstlane.
5638 for (unsigned I = 1; I < MI.getNumOperands(); ++I) {
5639 if (MI.getOperand(i: I).isReg()) {
5640 Register Reg = MI.getOperand(i: I).getReg();
5641 auto OpBank = getRegBankID(Reg, MRI);
5642 unsigned Size = getSizeInBits(Reg, MRI, TRI: *TRI);
5643 OpdsMapping[I] = AMDGPU::getValueMapping(BankID: OpBank, Size);
5644 }
5645 }
5646 break;
5647 }
5648 case Intrinsic::amdgcn_s_prefetch_data: {
5649 OpdsMapping[1] = getSGPROpMapping(Reg: MI.getOperand(i: 1).getReg(), MRI, TRI: *TRI);
5650 OpdsMapping[2] = getSGPROpMapping(Reg: MI.getOperand(i: 2).getReg(), MRI, TRI: *TRI);
5651 break;
5652 }
5653 case Intrinsic::amdgcn_flat_prefetch:
5654 case Intrinsic::amdgcn_global_prefetch:
5655 return getDefaultMappingVOP(MI);
// Unknown side-effecting intrinsic: refuse to map rather than guess.
5656 default:
5657 return getInvalidInstructionMapping();
5658 }
5659 break;
5660 }
// G_SELECT: a scalar (SGPR) select is only possible when both value inputs
// are SGPRs AND the condition is scalar; otherwise the select must be a
// VGPR cndmask with a VCC condition.
5661 case AMDGPU::G_SELECT: {
5662 unsigned Size = MRI.getType(Reg: MI.getOperand(i: 0).getReg()).getSizeInBits();
5663 unsigned Op2Bank = getRegBankID(Reg: MI.getOperand(i: 2).getReg(), MRI,
5664 Default: AMDGPU::SGPRRegBankID);
5665 unsigned Op3Bank = getRegBankID(Reg: MI.getOperand(i: 3).getReg(), MRI,
5666 Default: AMDGPU::SGPRRegBankID);
5667 bool SGPRSrcs = Op2Bank == AMDGPU::SGPRRegBankID &&
5668 Op3Bank == AMDGPU::SGPRRegBankID;
5669
// Pick the condition bank: an unassigned condition defaults to SGPR only
// when both sources are scalar; any VGPR condition is normalized to VCC.
5670 unsigned CondBankDefault = SGPRSrcs ?
5671 AMDGPU::SGPRRegBankID : AMDGPU::VCCRegBankID;
5672 unsigned CondBank = getRegBankID(Reg: MI.getOperand(i: 1).getReg(), MRI,
5673 Default: CondBankDefault);
5674 if (CondBank == AMDGPU::SGPRRegBankID)
5675 CondBank = SGPRSrcs ? AMDGPU::SGPRRegBankID : AMDGPU::VCCRegBankID;
5676 else if (CondBank == AMDGPU::VGPRRegBankID)
5677 CondBank = AMDGPU::VCCRegBankID;
5678
// Result/value bank follows the condition: SGPR only in the all-scalar case.
5679 unsigned Bank = SGPRSrcs && CondBank == AMDGPU::SGPRRegBankID ?
5680 AMDGPU::SGPRRegBankID : AMDGPU::VGPRRegBankID;
5681
5682 assert(CondBank == AMDGPU::VCCRegBankID || CondBank == AMDGPU::SGPRRegBankID);
5683
5684 // TODO: Should report 32-bit for scalar condition type.
// 64-bit selects use the SGPR64-only mapping (the VGPR form is split).
5685 if (Size == 64) {
5686 OpdsMapping[0] = AMDGPU::getValueMappingSGPR64Only(BankID: Bank, Size);
5687 OpdsMapping[1] = AMDGPU::getValueMapping(BankID: CondBank, Size: 1);
5688 OpdsMapping[2] = AMDGPU::getValueMappingSGPR64Only(BankID: Bank, Size);
5689 OpdsMapping[3] = AMDGPU::getValueMappingSGPR64Only(BankID: Bank, Size);
5690 } else {
5691 OpdsMapping[0] = AMDGPU::getValueMapping(BankID: Bank, Size);
5692 OpdsMapping[1] = AMDGPU::getValueMapping(BankID: CondBank, Size: 1);
5693 OpdsMapping[2] = AMDGPU::getValueMapping(BankID: Bank, Size);
5694 OpdsMapping[3] = AMDGPU::getValueMapping(BankID: Bank, Size);
5695 }
5696
5697 break;
5698 }
5699
// G_SI_CALL: the return address (def 0) is a 64-bit SGPR pair; the callee
// pointer (operand 1) must be uniform.
5700 case AMDGPU::G_SI_CALL: {
5701 OpdsMapping[0] = AMDGPU::getValueMapping(BankID: AMDGPU::SGPRRegBankID, Size: 64);
5702 // Lie and claim everything is legal, even though some need to be
5703 // SGPRs. applyMapping will have to deal with it as a waterfall loop.
5704 OpdsMapping[1] = getSGPROpMapping(Reg: MI.getOperand(i: 1).getReg(), MRI, TRI: *TRI);
5705
5706 // Allow anything for implicit arguments
// Implicit argument registers (from operand 4 on) keep whatever bank they
// already have.
5707 for (unsigned I = 4; I < MI.getNumOperands(); ++I) {
5708 if (MI.getOperand(i: I).isReg()) {
5709 Register Reg = MI.getOperand(i: I).getReg();
5710 auto OpBank = getRegBankID(Reg, MRI);
5711 unsigned Size = getSizeInBits(Reg, MRI, TRI: *TRI);
5712 OpdsMapping[I] = AMDGPU::getValueMapping(BankID: OpBank, Size);
5713 }
5714 }
5715 break;
5716 }
// Loads have their own dedicated mapping logic.
5717 case AMDGPU::G_LOAD:
5718 case AMDGPU::G_ZEXTLOAD:
5719 case AMDGPU::G_SEXTLOAD:
5720 return getInstrMappingForLoad(MI);
5721
// Atomic RMW operations: result and data are VGPRs; the pointer's bank is
// derived from its address space / current assignment by
// getValueMappingForPtr.
5722 case AMDGPU::G_ATOMICRMW_XCHG:
5723 case AMDGPU::G_ATOMICRMW_ADD:
5724 case AMDGPU::G_ATOMICRMW_SUB:
5725 case AMDGPU::G_ATOMICRMW_AND:
5726 case AMDGPU::G_ATOMICRMW_OR:
5727 case AMDGPU::G_ATOMICRMW_XOR:
5728 case AMDGPU::G_ATOMICRMW_MAX:
5729 case AMDGPU::G_ATOMICRMW_MIN:
5730 case AMDGPU::G_ATOMICRMW_UMAX:
5731 case AMDGPU::G_ATOMICRMW_UMIN:
5732 case AMDGPU::G_ATOMICRMW_FADD:
5733 case AMDGPU::G_ATOMICRMW_FMIN:
5734 case AMDGPU::G_ATOMICRMW_FMAX:
5735 case AMDGPU::G_ATOMICRMW_UINC_WRAP:
5736 case AMDGPU::G_ATOMICRMW_UDEC_WRAP:
5737 case AMDGPU::G_ATOMICRMW_USUB_COND:
5738 case AMDGPU::G_ATOMICRMW_USUB_SAT:
5739 case AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG: {
5740 OpdsMapping[0] = getVGPROpMapping(Reg: MI.getOperand(i: 0).getReg(), MRI, TRI: *TRI);
5741 OpdsMapping[1] = getValueMappingForPtr(MRI, PtrReg: MI.getOperand(i: 1).getReg());
5742 OpdsMapping[2] = getVGPROpMapping(Reg: MI.getOperand(i: 2).getReg(), MRI, TRI: *TRI);
5743 break;
5744 }
// Generic cmpxchg additionally carries the new value in operand 3.
5745 case AMDGPU::G_ATOMIC_CMPXCHG: {
5746 OpdsMapping[0] = getVGPROpMapping(Reg: MI.getOperand(i: 0).getReg(), MRI, TRI: *TRI);
5747 OpdsMapping[1] = getValueMappingForPtr(MRI, PtrReg: MI.getOperand(i: 1).getReg());
5748 OpdsMapping[2] = getVGPROpMapping(Reg: MI.getOperand(i: 2).getReg(), MRI, TRI: *TRI);
5749 OpdsMapping[3] = getVGPROpMapping(Reg: MI.getOperand(i: 3).getReg(), MRI, TRI: *TRI);
5750 break;
5751 }
// G_BRCOND: an s1 condition is either a uniform SGPR bool or a per-lane
// mask; anything not already SGPR is forced to the VCC bank.
5752 case AMDGPU::G_BRCOND: {
5753 unsigned Bank = getRegBankID(Reg: MI.getOperand(i: 0).getReg(), MRI,
5754 Default: AMDGPU::SGPRRegBankID);
5755 assert(MRI.getType(MI.getOperand(0).getReg()).getSizeInBits() == 1);
5756 if (Bank != AMDGPU::SGPRRegBankID)
5757 Bank = AMDGPU::VCCRegBankID;
5758
5759 OpdsMapping[0] = AMDGPU::getValueMapping(BankID: Bank, Size: 1);
5760 break;
5761 }
5762 case AMDGPU::G_INTRINSIC_FPTRUNC_ROUND:
5763 return getDefaultMappingVOP(MI);
// Prefetched addresses must be uniform (scalar prefetch instruction).
5764 case AMDGPU::G_PREFETCH:
5765 OpdsMapping[0] = getSGPROpMapping(Reg: MI.getOperand(i: 0).getReg(), MRI, TRI: *TRI);
5766 break;
5767 case AMDGPU::G_AMDGPU_WHOLE_WAVE_FUNC_SETUP:
5768 case AMDGPU::G_AMDGPU_WHOLE_WAVE_FUNC_RETURN:
5769 OpdsMapping[0] = AMDGPU::getValueMapping(BankID: AMDGPU::VCCRegBankID, Size: 1);
5770 break;
// Monitor loads: divergent result and pointer, both VGPR.
5771 case AMDGPU::G_AMDGPU_FLAT_LOAD_MONITOR:
5772 case AMDGPU::G_AMDGPU_GLOBAL_LOAD_MONITOR: {
5773 unsigned Size = getSizeInBits(Reg: MI.getOperand(i: 0).getReg(), MRI, TRI: *TRI);
5774 unsigned PtrSize = getSizeInBits(Reg: MI.getOperand(i: 1).getReg(), MRI, TRI: *TRI);
5775 OpdsMapping[0] = AMDGPU::getValueMapping(BankID: AMDGPU::VGPRRegBankID, Size);
5776 OpdsMapping[1] = AMDGPU::getValueMapping(BankID: AMDGPU::VGPRRegBankID, Size: PtrSize);
5777 break;
5778 }
5779 }
5780
// Wrap the accumulated per-operand mappings into the final instruction
// mapping (single mapping ID, unit cost).
5781 return getInstructionMapping(/*ID*/1, /*Cost*/1,
5782 OperandsMapping: getOperandsMapping(OpdsMapping),
5783 NumOperands: MI.getNumOperands());
5784}
5785