| 1 | //===- AMDGPURegisterBankInfo.cpp -------------------------------*- C++ -*-==// |
| 2 | // |
| 3 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
| 4 | // See https://llvm.org/LICENSE.txt for license information. |
| 5 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
| 6 | // |
| 7 | //===----------------------------------------------------------------------===// |
| 8 | /// \file |
| 9 | /// This file implements the targeting of the RegisterBankInfo class for |
| 10 | /// AMDGPU. |
| 11 | /// |
| 12 | /// \par |
| 13 | /// |
| 14 | /// AMDGPU has unique register bank constraints that require special high level |
/// strategies to deal with. There are two main true physical register banks:
/// VGPR (vector) and SGPR (scalar). Additionally, the VCC register bank is a
| 17 | /// sort of pseudo-register bank needed to represent SGPRs used in a vector |
| 18 | /// boolean context. There is also the AGPR bank, which is a special purpose |
| 19 | /// physical register bank present on some subtargets. |
| 20 | /// |
| 21 | /// Copying from VGPR to SGPR is generally illegal, unless the value is known to |
| 22 | /// be uniform. It is generally not valid to legalize operands by inserting |
| 23 | /// copies as on other targets. Operations which require uniform, SGPR operands |
| 24 | /// generally require scalarization by repeatedly executing the instruction, |
| 25 | /// activating each set of lanes using a unique set of input values. This is |
| 26 | /// referred to as a waterfall loop. |
| 27 | /// |
| 28 | /// \par Booleans |
| 29 | /// |
/// Booleans (s1 values) require special consideration. A vector compare result
| 31 | /// is naturally a bitmask with one bit per lane, in a 32 or 64-bit |
| 32 | /// register. These are represented with the VCC bank. During selection, we need |
| 33 | /// to be able to unambiguously go back from a register class to a register |
| 34 | /// bank. To distinguish whether an SGPR should use the SGPR or VCC register |
| 35 | /// bank, we need to know the use context type. An SGPR s1 value always means a |
| 36 | /// VCC bank value, otherwise it will be the SGPR bank. A scalar compare sets |
| 37 | /// SCC, which is a 1-bit unaddressable register. This will need to be copied to |
| 38 | /// a 32-bit virtual register. Taken together, this means we need to adjust the |
| 39 | /// type of boolean operations to be regbank legal. All SALU booleans need to be |
| 40 | /// widened to 32-bits, and all VALU booleans need to be s1 values. |
| 41 | /// |
| 42 | /// A noteworthy exception to the s1-means-vcc rule is for legalization artifact |
| 43 | /// casts. G_TRUNC s1 results, and G_SEXT/G_ZEXT/G_ANYEXT sources are never vcc |
| 44 | /// bank. A non-boolean source (such as a truncate from a 1-bit load from |
| 45 | /// memory) will require a copy to the VCC bank which will require clearing the |
| 46 | /// high bits and inserting a compare. |
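///
/// As a rough, illustrative sketch (assumed register names and shapes, not
/// verbatim output from any test): a divergent compare keeps its s1 result in
/// the vcc bank, while the equivalent uniform code is widened to a 32-bit
/// value in the sgpr bank:
///
///   %vcond:vcc(s1) = G_ICMP intpred(eq), %va(s32), %vb(s32)
///   %vres:vgpr(s32) = G_SELECT %vcond(s1), %x(s32), %y(s32)
///
///   %scond:sgpr(s32) = G_ICMP intpred(eq), %sa(s32), %sb(s32)
///   %sres:sgpr(s32) = G_SELECT %scond(s32), %x(s32), %y(s32)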
| 47 | /// |
| 48 | /// \par Constant bus restriction |
| 49 | /// |
| 50 | /// VALU instructions have a limitation known as the constant bus |
| 51 | /// restriction. Most VALU instructions can use SGPR operands, but may read at |
/// most 1 SGPR or constant literal value (this is relaxed to 2 in gfx10 for
/// most instructions). The limit is on unique SGPRs, so the same SGPR may be
/// used for multiple operands. From a register bank perspective, any
/// combination of operands should be legal as an SGPR, but this is contextually
/// dependent on the SGPR operands all being the same register. It is therefore
/// optimal to choose the SGPR with the most uses to minimize the number of
/// copies.
| 58 | /// |
| 59 | /// We avoid trying to solve this problem in RegBankSelect. Any VALU G_* |
| 60 | /// operation should have its source operands all mapped to VGPRs (except for |
/// VCC), inserting copies from any SGPR operands. This is the most trivial legal
| 62 | /// mapping. Anything beyond the simplest 1:1 instruction selection would be too |
| 63 | /// complicated to solve here. Every optimization pattern or instruction |
| 64 | /// selected to multiple outputs would have to enforce this rule, and there |
| 65 | /// would be additional complexity in tracking this rule for every G_* |
| 66 | /// operation. By forcing all inputs to VGPRs, it also simplifies the task of |
| 67 | /// picking the optimal operand combination from a post-isel optimization pass. |
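///
/// As a schematic example (assumed register names, not verbatim output): a
/// VALU G_ADD with two uniform inputs simply gets VGPR copies inserted for its
/// SGPR operands rather than being constrained here:
///
///   %a_v:vgpr(s32) = COPY %a(s32)
///   %b_v:vgpr(s32) = COPY %b(s32)
///   %sum:vgpr(s32) = G_ADD %a_v, %b_v
///
/// Folding one of those copies back into an SGPR operand, to make use of the
/// constant bus slot, is left to a post-isel optimization pass.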
| 68 | /// |
| 69 | //===----------------------------------------------------------------------===// |
| 70 | |
| 71 | #include "AMDGPURegisterBankInfo.h" |
| 72 | |
| 73 | #include "AMDGPU.h" |
| 74 | #include "AMDGPUGlobalISelUtils.h" |
| 75 | #include "AMDGPUInstrInfo.h" |
| 76 | #include "AMDGPULaneMaskUtils.h" |
| 77 | #include "GCNSubtarget.h" |
| 78 | #include "SIMachineFunctionInfo.h" |
| 79 | #include "SIRegisterInfo.h" |
| 80 | #include "llvm/CodeGen/GlobalISel/GenericMachineInstrs.h" |
| 81 | #include "llvm/CodeGen/GlobalISel/LegalizerHelper.h" |
| 82 | #include "llvm/CodeGen/GlobalISel/MIPatternMatch.h" |
| 83 | #include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h" |
| 84 | #include "llvm/CodeGen/RegisterBank.h" |
| 85 | #include "llvm/IR/IntrinsicsAMDGPU.h" |
| 86 | |
| 87 | #define GET_TARGET_REGBANK_IMPL |
| 88 | #include "AMDGPUGenRegisterBank.inc" |
| 89 | |
| 90 | // This file will be TableGen'ed at some point. |
| 91 | #include "AMDGPUGenRegisterBankInfo.def" |
| 92 | |
| 93 | using namespace llvm; |
| 94 | using namespace MIPatternMatch; |
| 95 | |
| 96 | namespace { |
| 97 | |
| 98 | // Observer to apply a register bank to new registers created by LegalizerHelper. |
| 99 | class ApplyRegBankMapping final : public GISelChangeObserver { |
| 100 | private: |
| 101 | MachineIRBuilder &B; |
| 102 | const AMDGPURegisterBankInfo &RBI; |
| 103 | MachineRegisterInfo &MRI; |
| 104 | const RegisterBank *NewBank; |
| 105 | SmallVector<MachineInstr *, 4> NewInsts; |
| 106 | |
| 107 | public: |
| 108 | ApplyRegBankMapping(MachineIRBuilder &B, const AMDGPURegisterBankInfo &RBI_, |
| 109 | MachineRegisterInfo &MRI_, const RegisterBank *RB) |
| 110 | : B(B), RBI(RBI_), MRI(MRI_), NewBank(RB) { |
| 111 | assert(!B.isObservingChanges()); |
| 112 | B.setChangeObserver(*this); |
| 113 | } |
| 114 | |
| 115 | ~ApplyRegBankMapping() override { |
| 116 | for (MachineInstr *MI : NewInsts) |
      applyBank(*MI);
| 118 | |
| 119 | B.stopObservingChanges(); |
| 120 | } |
| 121 | |
| 122 | /// Set any registers that don't have a set register class or bank to SALU. |
| 123 | void applyBank(MachineInstr &MI) { |
| 124 | const unsigned Opc = MI.getOpcode(); |
| 125 | if (Opc == AMDGPU::G_ANYEXT || Opc == AMDGPU::G_ZEXT || |
| 126 | Opc == AMDGPU::G_SEXT) { |
| 127 | // LegalizerHelper wants to use the basic legalization artifacts when |
| 128 | // widening etc. We don't handle selection with vcc in artifact sources, |
| 129 | // so we need to use a select instead to handle these properly. |
      Register DstReg = MI.getOperand(0).getReg();
      Register SrcReg = MI.getOperand(1).getReg();
      const RegisterBank *SrcBank = RBI.getRegBank(SrcReg, MRI, *RBI.TRI);
      if (SrcBank == &AMDGPU::VCCRegBank) {
        const LLT S32 = LLT::scalar(32);
| 135 | assert(MRI.getType(SrcReg) == LLT::scalar(1)); |
| 136 | assert(MRI.getType(DstReg) == S32); |
| 137 | assert(NewBank == &AMDGPU::VGPRRegBank); |
| 138 | |
| 139 | // Replace the extension with a select, which really uses the boolean |
| 140 | // source. |
        B.setInsertPt(*MI.getParent(), MI);

        auto True = B.buildConstant(S32, Opc == AMDGPU::G_SEXT ? -1 : 1);
        auto False = B.buildConstant(S32, 0);
        B.buildSelect(DstReg, SrcReg, True, False);
        MRI.setRegBank(True.getReg(0), *NewBank);
        MRI.setRegBank(False.getReg(0), *NewBank);
| 148 | MI.eraseFromParent(); |
| 149 | } |
| 150 | |
| 151 | assert(!MRI.getRegClassOrRegBank(DstReg)); |
      MRI.setRegBank(DstReg, *NewBank);
| 153 | return; |
| 154 | } |
| 155 | |
| 156 | #ifndef NDEBUG |
| 157 | if (Opc == AMDGPU::G_TRUNC) { |
| 158 | Register DstReg = MI.getOperand(0).getReg(); |
| 159 | const RegisterBank *DstBank = RBI.getRegBank(DstReg, MRI, *RBI.TRI); |
| 160 | assert(DstBank != &AMDGPU::VCCRegBank); |
| 161 | } |
| 162 | #endif |
| 163 | |
| 164 | for (MachineOperand &Op : MI.operands()) { |
| 165 | if (!Op.isReg()) |
| 166 | continue; |
| 167 | |
| 168 | // We may see physical registers if building a real MI |
| 169 | Register Reg = Op.getReg(); |
| 170 | if (Reg.isPhysical() || MRI.getRegClassOrRegBank(Reg)) |
| 171 | continue; |
| 172 | |
| 173 | const RegisterBank *RB = NewBank; |
      if (MRI.getType(Reg) == LLT::scalar(1)) {
        assert(NewBank == &AMDGPU::VGPRRegBank &&
               "s1 operands should only be used for vector bools");
        assert((MI.getOpcode() != AMDGPU::G_TRUNC &&
                MI.getOpcode() != AMDGPU::G_ANYEXT) &&
               "not expecting legalization artifacts here");
        RB = &AMDGPU::VCCRegBank;
      }

      MRI.setRegBank(Reg, *RB);
| 184 | } |
| 185 | } |
| 186 | |
| 187 | void erasingInstr(MachineInstr &MI) override {} |
| 188 | |
| 189 | void createdInstr(MachineInstr &MI) override { |
| 190 | // At this point, the instruction was just inserted and has no operands. |
    NewInsts.push_back(&MI);
| 192 | } |
| 193 | |
| 194 | void changingInstr(MachineInstr &MI) override {} |
| 195 | void changedInstr(MachineInstr &MI) override { |
| 196 | // FIXME: In principle we should probably add the instruction to NewInsts, |
| 197 | // but the way the LegalizerHelper uses the observer, we will always see the |
| 198 | // registers we need to set the regbank on also referenced in a new |
| 199 | // instruction. |
| 200 | } |
| 201 | }; |
| 202 | |
| 203 | } // anonymous namespace |
| 204 | |
| 205 | AMDGPURegisterBankInfo::AMDGPURegisterBankInfo(const GCNSubtarget &ST) |
| 206 | : Subtarget(ST), TRI(Subtarget.getRegisterInfo()), |
| 207 | TII(Subtarget.getInstrInfo()) { |
| 208 | |
| 209 | // HACK: Until this is fully tablegen'd. |
| 210 | static llvm::once_flag InitializeRegisterBankFlag; |
| 211 | |
| 212 | static auto InitializeRegisterBankOnce = [this]() { |
| 213 | assert(&getRegBank(AMDGPU::SGPRRegBankID) == &AMDGPU::SGPRRegBank && |
| 214 | &getRegBank(AMDGPU::VGPRRegBankID) == &AMDGPU::VGPRRegBank && |
| 215 | &getRegBank(AMDGPU::AGPRRegBankID) == &AMDGPU::AGPRRegBank); |
| 216 | (void)this; |
| 217 | }; |
| 218 | |
  llvm::call_once(InitializeRegisterBankFlag, InitializeRegisterBankOnce);
| 220 | } |
| 221 | |
| 222 | static bool isVectorRegisterBank(const RegisterBank &Bank) { |
| 223 | unsigned BankID = Bank.getID(); |
| 224 | return BankID == AMDGPU::VGPRRegBankID || BankID == AMDGPU::AGPRRegBankID; |
| 225 | } |
| 226 | |
| 227 | bool AMDGPURegisterBankInfo::isDivergentRegBank(const RegisterBank *RB) const { |
| 228 | return RB != &AMDGPU::SGPRRegBank; |
| 229 | } |
| 230 | |
| 231 | unsigned AMDGPURegisterBankInfo::copyCost(const RegisterBank &Dst, |
| 232 | const RegisterBank &Src, |
| 233 | TypeSize Size) const { |
| 234 | // TODO: Should there be a UniformVGPRRegBank which can use readfirstlane? |
| 235 | if (Dst.getID() == AMDGPU::SGPRRegBankID && |
      (isVectorRegisterBank(Src) || Src.getID() == AMDGPU::VCCRegBankID)) {
| 237 | return std::numeric_limits<unsigned>::max(); |
| 238 | } |
| 239 | |
| 240 | // Bool values are tricky, because the meaning is based on context. The SCC |
| 241 | // and VCC banks are for the natural scalar and vector conditions produced by |
| 242 | // a compare. |
| 243 | // |
| 244 | // Legalization doesn't know about the necessary context, so an s1 use may |
| 245 | // have been a truncate from an arbitrary value, in which case a copy (lowered |
| 246 | // as a compare with 0) needs to be inserted. |
| 247 | if (Size == 1 && |
| 248 | (Dst.getID() == AMDGPU::SGPRRegBankID) && |
      (isVectorRegisterBank(Src) ||
| 250 | Src.getID() == AMDGPU::SGPRRegBankID || |
| 251 | Src.getID() == AMDGPU::VCCRegBankID)) |
| 252 | return std::numeric_limits<unsigned>::max(); |
| 253 | |
| 254 | // There is no direct copy between AGPRs. |
| 255 | if (Dst.getID() == AMDGPU::AGPRRegBankID && |
| 256 | Src.getID() == AMDGPU::AGPRRegBankID) |
| 257 | return 4; |
| 258 | |
  return RegisterBankInfo::copyCost(Dst, Src, Size);
| 260 | } |
| 261 | |
| 262 | unsigned AMDGPURegisterBankInfo::getBreakDownCost( |
| 263 | const ValueMapping &ValMapping, |
| 264 | const RegisterBank *CurBank) const { |
| 265 | // Check if this is a breakdown for G_LOAD to move the pointer from SGPR to |
| 266 | // VGPR. |
| 267 | // FIXME: Is there a better way to do this? |
| 268 | if (ValMapping.NumBreakDowns >= 2 || ValMapping.BreakDown[0].Length >= 64) |
| 269 | return 10; // This is expensive. |
| 270 | |
| 271 | assert(ValMapping.NumBreakDowns == 2 && |
| 272 | ValMapping.BreakDown[0].Length == 32 && |
| 273 | ValMapping.BreakDown[0].StartIdx == 0 && |
| 274 | ValMapping.BreakDown[1].Length == 32 && |
| 275 | ValMapping.BreakDown[1].StartIdx == 32 && |
| 276 | ValMapping.BreakDown[0].RegBank == ValMapping.BreakDown[1].RegBank); |
| 277 | |
| 278 | // 32-bit extract of a 64-bit value is just access of a subregister, so free. |
| 279 | // TODO: Cost of 0 hits assert, though it's not clear it's what we really |
| 280 | // want. |
| 281 | |
| 282 | // TODO: 32-bit insert to a 64-bit SGPR may incur a non-free copy due to SGPR |
| 283 | // alignment restrictions, but this probably isn't important. |
| 284 | return 1; |
| 285 | } |
| 286 | |
| 287 | const RegisterBank & |
| 288 | AMDGPURegisterBankInfo::getRegBankFromRegClass(const TargetRegisterClass &RC, |
| 289 | LLT Ty) const { |
| 290 | // We promote real scalar booleans to SReg_32. Any SGPR using s1 is really a |
| 291 | // VCC-like use. |
  if (TRI->isSGPRClass(&RC)) {
| 293 | // FIXME: This probably came from a copy from a physical register, which |
| 294 | // should be inferable from the copied to-type. We don't have many boolean |
| 295 | // physical register constraints so just assume a normal SGPR for now. |
| 296 | if (!Ty.isValid()) |
| 297 | return AMDGPU::SGPRRegBank; |
| 298 | |
    return Ty == LLT::scalar(1) ? AMDGPU::VCCRegBank : AMDGPU::SGPRRegBank;
| 300 | } |
| 301 | |
  return TRI->isAGPRClass(&RC) ? AMDGPU::AGPRRegBank : AMDGPU::VGPRRegBank;
| 303 | } |
| 304 | |
| 305 | template <unsigned NumOps> |
| 306 | RegisterBankInfo::InstructionMappings |
| 307 | AMDGPURegisterBankInfo::addMappingFromTable( |
| 308 | const MachineInstr &MI, const MachineRegisterInfo &MRI, |
| 309 | const std::array<unsigned, NumOps> RegSrcOpIdx, |
| 310 | ArrayRef<OpRegBankEntry<NumOps>> Table) const { |
| 311 | |
| 312 | InstructionMappings AltMappings; |
| 313 | |
| 314 | SmallVector<const ValueMapping *, 10> Operands(MI.getNumOperands()); |
| 315 | |
| 316 | unsigned Sizes[NumOps]; |
| 317 | for (unsigned I = 0; I < NumOps; ++I) { |
| 318 | Register Reg = MI.getOperand(RegSrcOpIdx[I]).getReg(); |
    Sizes[I] = getSizeInBits(Reg, MRI, *TRI);
| 320 | } |
| 321 | |
| 322 | for (unsigned I = 0, E = MI.getNumExplicitDefs(); I != E; ++I) { |
    unsigned SizeI = getSizeInBits(MI.getOperand(I).getReg(), MRI, *TRI);
    Operands[I] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, SizeI);
| 325 | } |
| 326 | |
| 327 | // getInstrMapping's default mapping uses ID 1, so start at 2. |
| 328 | unsigned MappingID = 2; |
| 329 | for (const auto &Entry : Table) { |
| 330 | for (unsigned I = 0; I < NumOps; ++I) { |
| 331 | int OpIdx = RegSrcOpIdx[I]; |
      Operands[OpIdx] = AMDGPU::getValueMapping(Entry.RegBanks[I], Sizes[I]);
| 333 | } |
| 334 | |
    AltMappings.push_back(&getInstructionMapping(MappingID++, Entry.Cost,
                                                 getOperandsMapping(Operands),
                                                 Operands.size()));
| 338 | } |
| 339 | |
| 340 | return AltMappings; |
| 341 | } |
| 342 | |
| 343 | RegisterBankInfo::InstructionMappings |
| 344 | AMDGPURegisterBankInfo::getInstrAlternativeMappingsIntrinsic( |
| 345 | const MachineInstr &MI, const MachineRegisterInfo &MRI) const { |
  switch (cast<GIntrinsic>(MI).getIntrinsicID()) {
| 347 | case Intrinsic::amdgcn_readlane: { |
    static const OpRegBankEntry<3> Table[2] = {
      // Perfectly legal.
      { { AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID }, 1 },

      // Need a readfirstlane for the index.
      { { AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID }, 2 }
    };

    const std::array<unsigned, 3> RegSrcOpIdx = { { 0, 2, 3 } };
| 357 | return addMappingFromTable<3>(MI, MRI, RegSrcOpIdx, Table); |
| 358 | } |
| 359 | case Intrinsic::amdgcn_writelane: { |
    static const OpRegBankEntry<4> Table[4] = {
      // Perfectly legal.
      { { AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID, AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID }, 1 },

      // Need readfirstlane of first op
      { { AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID }, 2 },

      // Need readfirstlane of second op
      { { AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID }, 2 },

      // Need readfirstlane of both ops
      { { AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID }, 3 }
    };

    // rsrc, voffset, offset
    const std::array<unsigned, 4> RegSrcOpIdx = { { 0, 2, 3, 4 } };
| 376 | return addMappingFromTable<4>(MI, MRI, RegSrcOpIdx, Table); |
| 377 | } |
| 378 | default: |
| 379 | return RegisterBankInfo::getInstrAlternativeMappings(MI); |
| 380 | } |
| 381 | } |
| 382 | |
| 383 | RegisterBankInfo::InstructionMappings |
| 384 | AMDGPURegisterBankInfo::getInstrAlternativeMappingsIntrinsicWSideEffects( |
| 385 | const MachineInstr &MI, const MachineRegisterInfo &MRI) const { |
| 386 | |
  switch (cast<GIntrinsic>(MI).getIntrinsicID()) {
| 388 | case Intrinsic::amdgcn_s_buffer_load: { |
    static const OpRegBankEntry<2> Table[4] = {
      // Perfectly legal.
      { { AMDGPU::SGPRRegBankID, AMDGPU::SGPRRegBankID }, 1 },

      // Only need 1 register in loop
      { { AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID }, 300 },

      // Have to waterfall the resource.
      { { AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID }, 1000 },

      // Have to waterfall the resource, and the offset.
      { { AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID }, 1500 }
    };

    // rsrc, offset
    const std::array<unsigned, 2> RegSrcOpIdx = { { 2, 3 } };
| 405 | return addMappingFromTable<2>(MI, MRI, RegSrcOpIdx, Table); |
| 406 | } |
| 407 | case Intrinsic::amdgcn_ds_ordered_add: |
| 408 | case Intrinsic::amdgcn_ds_ordered_swap: { |
| 409 | // VGPR = M0, VGPR |
    static const OpRegBankEntry<3> Table[2] = {
      // Perfectly legal.
      { { AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID }, 1 },

      // Need a readfirstlane for m0
      { { AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID }, 2 }
    };

    const std::array<unsigned, 3> RegSrcOpIdx = { { 0, 2, 3 } };
| 419 | return addMappingFromTable<3>(MI, MRI, RegSrcOpIdx, Table); |
| 420 | } |
| 421 | case Intrinsic::amdgcn_s_sendmsg: |
| 422 | case Intrinsic::amdgcn_s_sendmsghalt: { |
| 423 | // FIXME: Should have no register for immediate |
    static const OpRegBankEntry<1> Table[2] = {
      // Perfectly legal.
      { { AMDGPU::SGPRRegBankID }, 1 },

      // Need readlane
      { { AMDGPU::VGPRRegBankID }, 3 }
    };

    const std::array<unsigned, 1> RegSrcOpIdx = { { 2 } };
| 433 | return addMappingFromTable<1>(MI, MRI, RegSrcOpIdx, Table); |
| 434 | } |
| 435 | default: |
| 436 | return RegisterBankInfo::getInstrAlternativeMappings(MI); |
| 437 | } |
| 438 | } |
| 439 | |
| 440 | // FIXME: Returns uniform if there's no source value information. This is |
| 441 | // probably wrong. |
| 442 | bool AMDGPURegisterBankInfo::isScalarLoadLegal(const MachineInstr &MI) const { |
| 443 | if (!MI.hasOneMemOperand()) |
| 444 | return false; |
| 445 | |
| 446 | const MachineMemOperand *MMO = *MI.memoperands_begin(); |
| 447 | const unsigned AS = MMO->getAddrSpace(); |
| 448 | const bool IsConst = AS == AMDGPUAS::CONSTANT_ADDRESS || |
| 449 | AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT; |
| 450 | const unsigned MemSize = 8 * MMO->getSize().getValue(); |
| 451 | |
| 452 | // Require 4-byte alignment. |
| 453 | return (MMO->getAlign() >= Align(4) || |
| 454 | (Subtarget.hasScalarSubwordLoads() && |
| 455 | ((MemSize == 16 && MMO->getAlign() >= Align(2)) || |
| 456 | (MemSize == 8 && MMO->getAlign() >= Align(1))))) && |
| 457 | // Can't do a scalar atomic load. |
| 458 | !MMO->isAtomic() && |
| 459 | // Don't use scalar loads for volatile accesses to non-constant address |
| 460 | // spaces. |
| 461 | (IsConst || !MMO->isVolatile()) && |
| 462 | // Memory must be known constant, or not written before this load. |
| 463 | (IsConst || MMO->isInvariant() || (MMO->getFlags() & MONoClobber)) && |
| 464 | AMDGPU::isUniformMMO(MMO); |
| 465 | } |
| 466 | |
| 467 | RegisterBankInfo::InstructionMappings |
| 468 | AMDGPURegisterBankInfo::getInstrAlternativeMappings( |
| 469 | const MachineInstr &MI) const { |
| 470 | |
| 471 | const MachineFunction &MF = *MI.getMF(); |
| 472 | const MachineRegisterInfo &MRI = MF.getRegInfo(); |
| 473 | |
| 474 | |
| 475 | InstructionMappings AltMappings; |
| 476 | switch (MI.getOpcode()) { |
| 477 | case TargetOpcode::G_CONSTANT: |
| 478 | case TargetOpcode::G_IMPLICIT_DEF: { |
    unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
| 480 | if (Size == 1) { |
      static const OpRegBankEntry<1> Table[3] = {
        { { AMDGPU::VGPRRegBankID }, 1 },
        { { AMDGPU::SGPRRegBankID }, 1 },
        { { AMDGPU::VCCRegBankID }, 1 }
      };

      return addMappingFromTable<1>(MI, MRI, {{ 0 }}, Table);
| 488 | } |
| 489 | |
| 490 | [[fallthrough]]; |
| 491 | } |
| 492 | case TargetOpcode::G_FCONSTANT: |
| 493 | case TargetOpcode::G_FRAME_INDEX: |
| 494 | case TargetOpcode::G_GLOBAL_VALUE: { |
    static const OpRegBankEntry<1> Table[2] = {
      { { AMDGPU::VGPRRegBankID }, 1 },
      { { AMDGPU::SGPRRegBankID }, 1 }
    };

    return addMappingFromTable<1>(MI, MRI, {{ 0 }}, Table);
| 501 | } |
| 502 | case TargetOpcode::G_AND: |
| 503 | case TargetOpcode::G_OR: |
| 504 | case TargetOpcode::G_XOR: { |
    unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
| 506 | |
| 507 | if (Size == 1) { |
| 508 | // s_{and|or|xor}_b32 set scc when the result of the 32-bit op is not 0. |
      const InstructionMapping &SCCMapping = getInstructionMapping(
        1, 1, getOperandsMapping(
          {AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 32),
           AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 32),
           AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 32)}),
        3); // Num Operands
      AltMappings.push_back(&SCCMapping);

      const InstructionMapping &VCCMapping0 = getInstructionMapping(
        2, 1, getOperandsMapping(
          {AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, Size),
           AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, Size),
           AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, Size)}),
        3); // Num Operands
      AltMappings.push_back(&VCCMapping0);
| 524 | return AltMappings; |
| 525 | } |
| 526 | |
| 527 | if (Size != 64) |
| 528 | break; |
| 529 | |
    const InstructionMapping &SSMapping = getInstructionMapping(
      1, 1, getOperandsMapping(
        {AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
         AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
         AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size)}),
      3); // Num Operands
    AltMappings.push_back(&SSMapping);

    const InstructionMapping &VVMapping = getInstructionMapping(
      2, 2, getOperandsMapping(
        {AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size),
         AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size),
         AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size)}),
      3); // Num Operands
    AltMappings.push_back(&VVMapping);
| 545 | break; |
| 546 | } |
| 547 | case TargetOpcode::G_LOAD: |
| 548 | case TargetOpcode::G_ZEXTLOAD: |
| 549 | case TargetOpcode::G_SEXTLOAD: { |
    unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
    LLT PtrTy = MRI.getType(MI.getOperand(1).getReg());
| 552 | unsigned PtrSize = PtrTy.getSizeInBits(); |
| 553 | unsigned AS = PtrTy.getAddressSpace(); |
| 554 | |
| 555 | if ((AS != AMDGPUAS::LOCAL_ADDRESS && AS != AMDGPUAS::REGION_ADDRESS && |
| 556 | AS != AMDGPUAS::PRIVATE_ADDRESS) && |
| 557 | isScalarLoadLegal(MI)) { |
      const InstructionMapping &SSMapping = getInstructionMapping(
        1, 1, getOperandsMapping(
          {AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
           AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, PtrSize)}),
        2); // Num Operands
      AltMappings.push_back(&SSMapping);
| 564 | } |
| 565 | |
    const InstructionMapping &VVMapping = getInstructionMapping(
      2, 1,
      getOperandsMapping(
        {AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size),
         AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, PtrSize)}),
      2); // Num Operands
    AltMappings.push_back(&VVMapping);
| 573 | |
| 574 | // It may be possible to have a vgpr = load sgpr mapping here, because |
| 575 | // the mubuf instructions support this kind of load, but probably for only |
| 576 | // gfx7 and older. However, the addressing mode matching in the instruction |
| 577 | // selector should be able to do a better job of detecting and selecting |
| 578 | // these kinds of loads from the vgpr = load vgpr mapping. |
| 579 | |
| 580 | return AltMappings; |
| 581 | |
| 582 | } |
| 583 | case TargetOpcode::G_SELECT: { |
    unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
    const InstructionMapping &SSMapping = getInstructionMapping(1, 1,
      getOperandsMapping({AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
                          AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 1),
                          AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
                          AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size)}),
      4); // Num Operands
    AltMappings.push_back(&SSMapping);

    const InstructionMapping &VVMapping = getInstructionMapping(2, 1,
      getOperandsMapping({AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size),
                          AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1),
                          AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size),
                          AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size)}),
      4); // Num Operands
    AltMappings.push_back(&VVMapping);
| 600 | |
| 601 | return AltMappings; |
| 602 | } |
| 603 | case TargetOpcode::G_UADDE: |
| 604 | case TargetOpcode::G_USUBE: |
| 605 | case TargetOpcode::G_SADDE: |
| 606 | case TargetOpcode::G_SSUBE: { |
    unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
    const InstructionMapping &SSMapping = getInstructionMapping(1, 1,
      getOperandsMapping(
        {AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
         AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 1),
         AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
         AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
         AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 1)}),
      5); // Num Operands
    AltMappings.push_back(&SSMapping);

    const InstructionMapping &VVMapping = getInstructionMapping(2, 1,
      getOperandsMapping({AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size),
                          AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1),
                          AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size),
                          AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size),
                          AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1)}),
      5); // Num Operands
    AltMappings.push_back(&VVMapping);
| 626 | return AltMappings; |
| 627 | } |
| 628 | case AMDGPU::G_BRCOND: { |
| 629 | assert(MRI.getType(MI.getOperand(0).getReg()).getSizeInBits() == 1); |
| 630 | |
| 631 | // TODO: Change type to 32 for scalar |
    const InstructionMapping &SMapping = getInstructionMapping(
      1, 1, getOperandsMapping(
        {AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 1), nullptr}),
      2); // Num Operands
    AltMappings.push_back(&SMapping);

    const InstructionMapping &VMapping = getInstructionMapping(
      1, 1, getOperandsMapping(
        {AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1), nullptr}),
      2); // Num Operands
    AltMappings.push_back(&VMapping);
| 643 | return AltMappings; |
| 644 | } |
| 645 | case AMDGPU::G_INTRINSIC: |
| 646 | case AMDGPU::G_INTRINSIC_CONVERGENT: |
| 647 | return getInstrAlternativeMappingsIntrinsic(MI, MRI); |
| 648 | case AMDGPU::G_INTRINSIC_W_SIDE_EFFECTS: |
| 649 | case AMDGPU::G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS: |
| 650 | return getInstrAlternativeMappingsIntrinsicWSideEffects(MI, MRI); |
| 651 | default: |
| 652 | break; |
| 653 | } |
| 654 | return RegisterBankInfo::getInstrAlternativeMappings(MI); |
| 655 | } |
| 656 | |
| 657 | void AMDGPURegisterBankInfo::split64BitValueForMapping( |
| 658 | MachineIRBuilder &B, |
| 659 | SmallVector<Register, 2> &Regs, |
| 660 | LLT HalfTy, |
| 661 | Register Reg) const { |
| 662 | assert(HalfTy.getSizeInBits() == 32); |
| 663 | MachineRegisterInfo *MRI = B.getMRI(); |
  Register LoLHS = MRI->createGenericVirtualRegister(HalfTy);
  Register HiLHS = MRI->createGenericVirtualRegister(HalfTy);
  const RegisterBank *Bank = getRegBank(Reg, *MRI, *TRI);
  MRI->setRegBank(LoLHS, *Bank);
  MRI->setRegBank(HiLHS, *Bank);

  Regs.push_back(LoLHS);
  Regs.push_back(HiLHS);

  B.buildInstr(AMDGPU::G_UNMERGE_VALUES)
    .addDef(LoLHS)
    .addDef(HiLHS)
    .addUse(Reg);
| 677 | } |
| 678 | |
| 679 | /// Replace the current type each register in \p Regs has with \p NewTy |
| 680 | static void setRegsToType(MachineRegisterInfo &MRI, ArrayRef<Register> Regs, |
| 681 | LLT NewTy) { |
| 682 | for (Register Reg : Regs) { |
| 683 | assert(MRI.getType(Reg).getSizeInBits() == NewTy.getSizeInBits()); |
    MRI.setType(Reg, NewTy);
| 685 | } |
| 686 | } |
| 687 | |
| 688 | static LLT getHalfSizedType(LLT Ty) { |
| 689 | if (Ty.isVector()) { |
| 690 | assert(Ty.getElementCount().isKnownMultipleOf(2)); |
    return LLT::scalarOrVector(Ty.getElementCount().divideCoefficientBy(2),
                               Ty.getElementType());
| 693 | } |
| 694 | |
| 695 | assert(Ty.getScalarSizeInBits() % 2 == 0); |
  return LLT::scalar(Ty.getScalarSizeInBits() / 2);
| 697 | } |
| 698 | |
| 699 | // Build one or more V_READFIRSTLANE_B32 instructions to move the given vector |
| 700 | // source value into a scalar register. |
| 701 | Register AMDGPURegisterBankInfo::buildReadFirstLane(MachineIRBuilder &B, |
| 702 | MachineRegisterInfo &MRI, |
| 703 | Register Src) const { |
  LLT Ty = MRI.getType(Src);
  const RegisterBank *Bank = getRegBank(Src, MRI, *TRI);
| 706 | |
| 707 | if (Bank == &AMDGPU::SGPRRegBank) |
| 708 | return Src; |
| 709 | |
| 710 | unsigned Bits = Ty.getSizeInBits(); |
| 711 | assert(Bits % 32 == 0); |
| 712 | |
| 713 | if (Bank != &AMDGPU::VGPRRegBank) { |
| 714 | // We need to copy from AGPR to VGPR |
    Src = B.buildCopy(Ty, Src).getReg(0);
    MRI.setRegBank(Src, AMDGPU::VGPRRegBank);
| 717 | } |
| 718 | |
  LLT S32 = LLT::scalar(32);
| 720 | unsigned NumParts = Bits / 32; |
| 721 | SmallVector<Register, 8> SrcParts; |
| 722 | SmallVector<Register, 8> DstParts; |
| 723 | |
| 724 | if (Bits == 32) { |
    SrcParts.push_back(Src);
  } else {
    auto Unmerge = B.buildUnmerge(S32, Src);
    for (unsigned i = 0; i < NumParts; ++i)
      SrcParts.push_back(Unmerge.getReg(i));
| 730 | } |
| 731 | |
| 732 | for (unsigned i = 0; i < NumParts; ++i) { |
| 733 | Register SrcPart = SrcParts[i]; |
    Register DstPart = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
    MRI.setType(DstPart, NumParts == 1 ? Ty : S32);

    const TargetRegisterClass *Constrained =
        constrainGenericRegister(SrcPart, AMDGPU::VGPR_32RegClass, MRI);
    (void)Constrained;
    assert(Constrained && "Failed to constrain readfirstlane src reg");

    B.buildInstr(AMDGPU::V_READFIRSTLANE_B32, {DstPart}, {SrcPart});

    DstParts.push_back(DstPart);
| 745 | } |
| 746 | |
| 747 | if (Bits == 32) |
| 748 | return DstParts[0]; |
| 749 | |
  Register Dst = B.buildMergeLikeInstr(Ty, DstParts).getReg(0);
  MRI.setRegBank(Dst, AMDGPU::SGPRRegBank);
| 752 | return Dst; |
| 753 | } |
| 754 | |
| 755 | /// Legalize instruction \p MI where operands in \p OpIndices must be SGPRs. If |
| 756 | /// any of the required SGPR operands are VGPRs, perform a waterfall loop to |
| 757 | /// execute the instruction for each unique combination of values in all lanes |
| 758 | /// in the wave. The block will be split such that rest of the instructions are |
| 759 | /// moved to a new block. |
| 760 | /// |
| 761 | /// Essentially performs this loop: |
| 762 | // |
| 763 | /// Save Execution Mask |
| 764 | /// For (Lane : Wavefront) { |
| 765 | /// Enable Lane, Disable all other lanes |
| 766 | /// SGPR = read SGPR value for current lane from VGPR |
| 767 | /// VGPRResult[Lane] = use_op SGPR |
| 768 | /// } |
| 769 | /// Restore Execution Mask |
| 770 | /// |
/// There is additional complexity from comparing the operand values to
/// identify the unique values actually used.
| 773 | bool AMDGPURegisterBankInfo::executeInWaterfallLoop( |
| 774 | MachineIRBuilder &B, iterator_range<MachineBasicBlock::iterator> Range, |
| 775 | SmallSet<Register, 4> &SGPROperandRegs) const { |
| 776 | // Track use registers which have already been expanded with a readfirstlane |
| 777 | // sequence. This may have multiple uses if moving a sequence. |
| 778 | DenseMap<Register, Register> WaterfalledRegMap; |
| 779 | |
| 780 | MachineBasicBlock &MBB = B.getMBB(); |
| 781 | MachineFunction *MF = &B.getMF(); |
| 782 | |
| 783 | const TargetRegisterClass *WaveRC = TRI->getWaveMaskRegClass(); |
  const AMDGPU::LaneMaskConstants &LMC =
      AMDGPU::LaneMaskConstants::get(Subtarget);
| 786 | |
| 787 | #ifndef NDEBUG |
| 788 | const int OrigRangeSize = std::distance(Range.begin(), Range.end()); |
| 789 | #endif |
| 790 | |
| 791 | MachineRegisterInfo &MRI = *B.getMRI(); |
  Register SaveExecReg = MRI.createVirtualRegister(WaveRC);
  Register InitSaveExecReg = MRI.createVirtualRegister(WaveRC);

  // Don't bother using generic instructions/registers for the exec mask.
  B.buildInstr(TargetOpcode::IMPLICIT_DEF)
    .addDef(InitSaveExecReg);
| 798 | |
  Register PhiExec = MRI.createVirtualRegister(WaveRC);
  Register NewExec = MRI.createVirtualRegister(WaveRC);
| 801 | |
| 802 | // To insert the loop we need to split the block. Move everything before this |
| 803 | // point to a new block, and insert a new empty block before this instruction. |
| 804 | MachineBasicBlock *LoopBB = MF->CreateMachineBasicBlock(); |
| 805 | MachineBasicBlock *BodyBB = MF->CreateMachineBasicBlock(); |
| 806 | MachineBasicBlock *RemainderBB = MF->CreateMachineBasicBlock(); |
| 807 | MachineBasicBlock *RestoreExecBB = MF->CreateMachineBasicBlock(); |
| 808 | MachineFunction::iterator MBBI(MBB); |
| 809 | ++MBBI; |
  MF->insert(MBBI, LoopBB);
  MF->insert(MBBI, BodyBB);
  MF->insert(MBBI, RestoreExecBB);
  MF->insert(MBBI, RemainderBB);

  LoopBB->addSuccessor(BodyBB);
  BodyBB->addSuccessor(RestoreExecBB);
  BodyBB->addSuccessor(LoopBB);

  // Move the rest of the block into a new block.
  RemainderBB->transferSuccessorsAndUpdatePHIs(&MBB);
  RemainderBB->splice(RemainderBB->begin(), &MBB, Range.end(), MBB.end());

  MBB.addSuccessor(LoopBB);
  RestoreExecBB->addSuccessor(RemainderBB);

  B.setInsertPt(*LoopBB, LoopBB->end());
| 827 | |
  B.buildInstr(TargetOpcode::PHI)
    .addDef(PhiExec)
    .addReg(InitSaveExecReg)
    .addMBB(&MBB)
    .addReg(NewExec)
    .addMBB(BodyBB);
| 834 | |
| 835 | const DebugLoc &DL = B.getDL(); |
| 836 | |
| 837 | MachineInstr &FirstInst = *Range.begin(); |
| 838 | |
| 839 | // Move the instruction into the loop body. Note we moved everything after |
| 840 | // Range.end() already into a new block, so Range.end() is no longer valid. |
  BodyBB->splice(BodyBB->end(), &MBB, Range.begin(), MBB.end());
| 842 | |
| 843 | // Figure out the iterator range after splicing the instructions. |
| 844 | MachineBasicBlock::iterator NewBegin = FirstInst.getIterator(); |
| 845 | auto NewEnd = BodyBB->end(); |
| 846 | |
| 847 | B.setMBB(*LoopBB); |
| 848 | |
  LLT S1 = LLT::scalar(1);
| 850 | Register CondReg; |
| 851 | |
| 852 | assert(std::distance(NewBegin, NewEnd) == OrigRangeSize); |
| 853 | |
  for (MachineInstr &MI : make_range(NewBegin, NewEnd)) {
    for (MachineOperand &Op : MI.all_uses()) {
      Register OldReg = Op.getReg();
      if (!SGPROperandRegs.count(OldReg))
| 858 | continue; |
| 859 | |
| 860 | // See if we already processed this register in another instruction in the |
| 861 | // sequence. |
      auto OldVal = WaterfalledRegMap.find(OldReg);
| 863 | if (OldVal != WaterfalledRegMap.end()) { |
| 864 | Op.setReg(OldVal->second); |
| 865 | continue; |
| 866 | } |
| 867 | |
| 868 | Register OpReg = Op.getReg(); |
      LLT OpTy = MRI.getType(OpReg);

      const RegisterBank *OpBank = getRegBank(OpReg, MRI, *TRI);
| 872 | if (OpBank != &AMDGPU::VGPRRegBank) { |
| 873 | // Insert copy from AGPR to VGPR before the loop. |
| 874 | B.setMBB(MBB); |
        OpReg = B.buildCopy(OpTy, OpReg).getReg(0);
        MRI.setRegBank(OpReg, AMDGPU::VGPRRegBank);
| 877 | B.setMBB(*LoopBB); |
| 878 | } |
| 879 | |
      Register CurrentLaneReg = buildReadFirstLane(B, MRI, OpReg);
| 881 | |
| 882 | // Build the comparison(s). |
| 883 | unsigned OpSize = OpTy.getSizeInBits(); |
| 884 | bool Is64 = OpSize % 64 == 0; |
| 885 | unsigned PartSize = Is64 ? 64 : 32; |
      LLT PartTy = LLT::scalar(PartSize);
| 887 | unsigned NumParts = OpSize / PartSize; |
| 888 | SmallVector<Register, 8> OpParts; |
| 889 | SmallVector<Register, 8> CurrentLaneParts; |
| 890 | |
| 891 | if (NumParts == 1) { |
        OpParts.push_back(OpReg);
        CurrentLaneParts.push_back(CurrentLaneReg);
      } else {
        auto UnmergeOp = B.buildUnmerge(PartTy, OpReg);
        auto UnmergeCurrentLane = B.buildUnmerge(PartTy, CurrentLaneReg);
        for (unsigned i = 0; i < NumParts; ++i) {
          OpParts.push_back(UnmergeOp.getReg(i));
          CurrentLaneParts.push_back(UnmergeCurrentLane.getReg(i));
          MRI.setRegBank(OpParts[i], AMDGPU::VGPRRegBank);
          MRI.setRegBank(CurrentLaneParts[i], AMDGPU::SGPRRegBank);
| 902 | } |
| 903 | } |
| 904 | |
| 905 | for (unsigned i = 0; i < NumParts; ++i) { |
        auto CmpReg = B.buildICmp(CmpInst::ICMP_EQ, S1, CurrentLaneParts[i],
                                  OpParts[i]).getReg(0);
        MRI.setRegBank(CmpReg, AMDGPU::VCCRegBank);

        if (!CondReg) {
          CondReg = CmpReg;
        } else {
          CondReg = B.buildAnd(S1, CondReg, CmpReg).getReg(0);
          MRI.setRegBank(CondReg, AMDGPU::VCCRegBank);
| 915 | } |
| 916 | } |
| 917 | |
| 918 | Op.setReg(CurrentLaneReg); |
| 919 | |
| 920 | // Make sure we don't re-process this register again. |
      WaterfalledRegMap.insert(std::pair(OldReg, Op.getReg()));
| 922 | } |
| 923 | } |
| 924 | |
| 925 | // The ballot becomes a no-op during instruction selection. |
  CondReg = B.buildIntrinsic(Intrinsic::amdgcn_ballot,
                             {LLT::scalar(Subtarget.isWave32() ? 32 : 64)})
                .addReg(CondReg)
                .getReg(0);
  MRI.setRegClass(CondReg, WaveRC);
| 931 | |
| 932 | // Update EXEC, save the original EXEC value to VCC. |
  B.buildInstr(LMC.AndSaveExecOpc)
    .addDef(NewExec)
    .addReg(CondReg, RegState::Kill);
| 936 | |
  MRI.setSimpleHint(NewExec, CondReg);
| 938 | |
  B.setInsertPt(*BodyBB, BodyBB->end());
| 940 | |
| 941 | // Update EXEC, switch all done bits to 0 and all todo bits to 1. |
  B.buildInstr(LMC.XorTermOpc)
    .addDef(LMC.ExecReg)
    .addReg(LMC.ExecReg)
    .addReg(NewExec);
| 946 | |
| 947 | // XXX - s_xor_b64 sets scc to 1 if the result is nonzero, so can we use |
| 948 | // s_cbranch_scc0? |
| 949 | |
| 950 | // Loop back to V_READFIRSTLANE_B32 if there are still variants to cover. |
  B.buildInstr(AMDGPU::SI_WATERFALL_LOOP).addMBB(LoopBB);
| 952 | |
| 953 | // Save the EXEC mask before the loop. |
  BuildMI(MBB, MBB.end(), DL, TII->get(LMC.MovOpc), SaveExecReg)
      .addReg(LMC.ExecReg);
| 956 | |
| 957 | // Restore the EXEC mask after the loop. |
| 958 | B.setMBB(*RestoreExecBB); |
  B.buildInstr(LMC.MovTermOpc).addDef(LMC.ExecReg).addReg(SaveExecReg);
| 960 | |
| 961 | // Set the insert point after the original instruction, so any new |
| 962 | // instructions will be in the remainder. |
  B.setInsertPt(*RemainderBB, RemainderBB->begin());
| 964 | |
| 965 | return true; |
| 966 | } |
| 967 | |
| 968 | // Return any unique registers used by \p MI at \p OpIndices that need to be |
| 969 | // handled in a waterfall loop. Returns these registers in \p |
| 970 | // SGPROperandRegs. Returns true if there are any operands to handle and a |
| 971 | // waterfall loop is necessary. |
| 972 | bool AMDGPURegisterBankInfo::collectWaterfallOperands( |
| 973 | SmallSet<Register, 4> &SGPROperandRegs, MachineInstr &MI, |
| 974 | MachineRegisterInfo &MRI, ArrayRef<unsigned> OpIndices) const { |
| 975 | for (unsigned Op : OpIndices) { |
| 976 | assert(MI.getOperand(Op).isUse()); |
    Register Reg = MI.getOperand(Op).getReg();
    const RegisterBank *OpBank = getRegBank(Reg, MRI, *TRI);
    if (OpBank->getID() != AMDGPU::SGPRRegBankID)
      SGPROperandRegs.insert(Reg);
| 981 | } |
| 982 | |
| 983 | // No operands need to be replaced, so no need to loop. |
| 984 | return !SGPROperandRegs.empty(); |
| 985 | } |
| 986 | |
| 987 | bool AMDGPURegisterBankInfo::executeInWaterfallLoop( |
| 988 | MachineIRBuilder &B, MachineInstr &MI, ArrayRef<unsigned> OpIndices) const { |
| 989 | // Use a set to avoid extra readfirstlanes in the case where multiple operands |
| 990 | // are the same register. |
| 991 | SmallSet<Register, 4> SGPROperandRegs; |
| 992 | |
  if (!collectWaterfallOperands(SGPROperandRegs, MI, *B.getMRI(), OpIndices))
| 994 | return false; |
| 995 | |
| 996 | MachineBasicBlock::iterator I = MI.getIterator(); |
  return executeInWaterfallLoop(B, make_range(I, std::next(I)),
                                SGPROperandRegs);
| 999 | } |
| 1000 | |
| 1001 | // Legalize an operand that must be an SGPR by inserting a readfirstlane. |
| 1002 | void AMDGPURegisterBankInfo::constrainOpWithReadfirstlane( |
| 1003 | MachineIRBuilder &B, MachineInstr &MI, unsigned OpIdx) const { |
  Register Reg = MI.getOperand(OpIdx).getReg();
  MachineRegisterInfo &MRI = *B.getMRI();
  const RegisterBank *Bank = getRegBank(Reg, MRI, *TRI);
  if (Bank == &AMDGPU::SGPRRegBank)
    return;

  Reg = buildReadFirstLane(B, MRI, Reg);
  MI.getOperand(OpIdx).setReg(Reg);
| 1012 | } |
| 1013 | |
| 1014 | /// Split \p Ty into 2 pieces. The first will have \p FirstSize bits, and the |
| 1015 | /// rest will be in the remainder. |
| 1016 | static std::pair<LLT, LLT> splitUnequalType(LLT Ty, unsigned FirstSize) { |
| 1017 | unsigned TotalSize = Ty.getSizeInBits(); |
| 1018 | if (!Ty.isVector()) |
    return {LLT::scalar(FirstSize), LLT::scalar(TotalSize - FirstSize)};
| 1020 | |
| 1021 | LLT EltTy = Ty.getElementType(); |
| 1022 | unsigned EltSize = EltTy.getSizeInBits(); |
| 1023 | assert(FirstSize % EltSize == 0); |
| 1024 | |
| 1025 | unsigned FirstPartNumElts = FirstSize / EltSize; |
| 1026 | unsigned RemainderElts = (TotalSize - FirstSize) / EltSize; |
| 1027 | |
  return {LLT::scalarOrVector(ElementCount::getFixed(FirstPartNumElts), EltTy),
          LLT::scalarOrVector(ElementCount::getFixed(RemainderElts), EltTy)};
| 1030 | } |
| 1031 | |
| 1032 | static LLT widen96To128(LLT Ty) { |
| 1033 | if (!Ty.isVector()) |
    return LLT::scalar(128);
| 1035 | |
| 1036 | LLT EltTy = Ty.getElementType(); |
| 1037 | assert(128 % EltTy.getSizeInBits() == 0); |
  return LLT::fixed_vector(128 / EltTy.getSizeInBits(), EltTy);
| 1039 | } |
| 1040 | |
| 1041 | bool AMDGPURegisterBankInfo::applyMappingLoad( |
| 1042 | MachineIRBuilder &B, |
| 1043 | const AMDGPURegisterBankInfo::OperandsMapper &OpdMapper, |
| 1044 | MachineInstr &MI) const { |
| 1045 | MachineRegisterInfo &MRI = *B.getMRI(); |
  Register DstReg = MI.getOperand(0).getReg();
  const LLT LoadTy = MRI.getType(DstReg);
| 1048 | unsigned LoadSize = LoadTy.getSizeInBits(); |
| 1049 | MachineMemOperand *MMO = *MI.memoperands_begin(); |
| 1050 | const unsigned MaxNonSmrdLoadSize = 128; |
| 1051 | |
| 1052 | const RegisterBank *DstBank = |
      OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
| 1054 | if (DstBank == &AMDGPU::SGPRRegBank) { |
    // There are some special cases that we need to look at for 32 bit and 96
    // bit SGPR loads; otherwise we have nothing to do.
| 1057 | if (LoadSize != 32 && (LoadSize != 96 || Subtarget.hasScalarDwordx3Loads())) |
| 1058 | return false; |
| 1059 | |
| 1060 | const unsigned MemSize = 8 * MMO->getSize().getValue(); |
| 1061 | // Scalar loads of size 8 or 16 bit with proper alignment may be widened to |
    // 32 bit. Check to see if we need to widen the memory access: 8 or 16 bit
    // scalar loads should have a load size of 32 but a memory access size of
    // less than 32.
| 1065 | if (LoadSize == 32 && |
| 1066 | (MemSize == 32 || LoadTy.isVector() || !isScalarLoadLegal(MI))) |
| 1067 | return false; |
| 1068 | |
| 1069 | if (LoadSize == 32 && |
| 1070 | ((MemSize == 8 && MMO->getAlign() >= Align(1)) || |
| 1071 | (MemSize == 16 && MMO->getAlign() >= Align(2))) && |
| 1072 | isScalarLoadLegal(MI) && |
| 1073 | Subtarget.getGeneration() >= AMDGPUSubtarget::GFX12) |
| 1074 | return false; |
| 1075 | |
    Register PtrReg = MI.getOperand(1).getReg();
| 1077 | |
| 1078 | ApplyRegBankMapping ApplyBank(B, *this, MRI, DstBank); |
| 1079 | |
| 1080 | if (LoadSize == 32) { |
| 1081 | // This is an extending load from a sub-dword size. Widen the memory |
| 1082 | // access size to 4 bytes and clear the extra high bits appropriately |
      const LLT S32 = LLT::scalar(32);
      if (MI.getOpcode() == AMDGPU::G_SEXTLOAD) {
        // Must extend the sign bit into higher bits for a G_SEXTLOAD
        auto WideLoad = B.buildLoadFromOffset(S32, PtrReg, *MMO, 0);
        B.buildSExtInReg(MI.getOperand(0), WideLoad, MemSize);
      } else if (MI.getOpcode() == AMDGPU::G_ZEXTLOAD) {
        // Must extend zero into higher bits with an AND for a G_ZEXTLOAD
        auto WideLoad = B.buildLoadFromOffset(S32, PtrReg, *MMO, 0);
        B.buildZExtInReg(MI.getOperand(0), WideLoad, MemSize);
      } else
        // We do not need to touch the higher bits for regular loads.
        B.buildLoadFromOffset(MI.getOperand(0), PtrReg, *MMO, 0);
| 1095 | } else { |
| 1096 | // 96-bit loads are only available for vector loads. We need to split this |
| 1097 | // into a 64-bit part, and 32 (unless we can widen to a 128-bit load). |
| 1098 | if (MMO->getAlign() < Align(16)) { |
| 1099 | LegalizerHelper Helper(B.getMF(), ApplyBank, B); |
| 1100 | LLT Part64, Part32; |
        std::tie(Part64, Part32) = splitUnequalType(LoadTy, 64);
        if (Helper.reduceLoadStoreWidth(cast<GAnyLoad>(MI), 0, Part64) !=
| 1103 | LegalizerHelper::Legalized) |
| 1104 | return false; |
| 1105 | return true; |
| 1106 | } |
      LLT WiderTy = widen96To128(LoadTy);
      auto WideLoad = B.buildLoadFromOffset(WiderTy, PtrReg, *MMO, 0);
      if (WiderTy.isScalar()) {
        B.buildTrunc(MI.getOperand(0), WideLoad);
      } else {
        B.buildDeleteTrailingVectorElements(MI.getOperand(0).getReg(),
                                            WideLoad);
| 1114 | } |
| 1115 | } |
| 1116 | |
| 1117 | MI.eraseFromParent(); |
| 1118 | return true; |
| 1119 | } |
| 1120 | |
| 1121 | // 128-bit loads are supported for all instruction types. |
| 1122 | if (LoadSize <= MaxNonSmrdLoadSize) |
| 1123 | return false; |
| 1124 | |
  SmallVector<Register, 1> SrcRegs(OpdMapper.getVRegs(1));

  if (SrcRegs.empty())
    SrcRegs.push_back(MI.getOperand(1).getReg());
| 1129 | |
| 1130 | // RegBankSelect only emits scalar types, so we need to reset the pointer |
| 1131 | // operand to a pointer type. |
| 1132 | Register BasePtrReg = SrcRegs[0]; |
  LLT PtrTy = MRI.getType(MI.getOperand(1).getReg());
  MRI.setType(BasePtrReg, PtrTy);
| 1135 | |
  // The following are loads that were not split enough during legalization
  // because it was not clear whether they would be smem or vmem loads.
  if (AMDGPU::isExtendedGlobalAddrSpace(MMO->getAddrSpace()) ||
| 1139 | MMO->getAddrSpace() == AMDGPUAS::BUFFER_RESOURCE) { |
| 1140 | assert(LoadSize % MaxNonSmrdLoadSize == 0); |
| 1141 | unsigned NumSplitParts = LoadTy.getSizeInBits() / MaxNonSmrdLoadSize; |
    const LLT LoadSplitTy = LoadTy.divide(NumSplitParts);
| 1143 | ApplyRegBankMapping O(B, *this, MRI, &AMDGPU::VGPRRegBank); |
| 1144 | LegalizerHelper Helper(B.getMF(), O, B); |
| 1145 | if (LoadTy.isVector()) { |
      if (Helper.fewerElementsVector(MI, 0, LoadSplitTy) !=
| 1147 | LegalizerHelper::Legalized) |
| 1148 | return false; |
| 1149 | } else { |
      if (Helper.narrowScalar(MI, 0, LoadSplitTy) != LegalizerHelper::Legalized)
| 1151 | return false; |
| 1152 | } |
| 1153 | } |
| 1154 | |
  MRI.setRegBank(DstReg, AMDGPU::VGPRRegBank);
| 1156 | return true; |
| 1157 | } |
| 1158 | |
| 1159 | bool AMDGPURegisterBankInfo::applyMappingDynStackAlloc( |
| 1160 | MachineIRBuilder &B, |
| 1161 | const AMDGPURegisterBankInfo::OperandsMapper &OpdMapper, |
| 1162 | MachineInstr &MI) const { |
| 1163 | MachineRegisterInfo &MRI = *B.getMRI(); |
| 1164 | const MachineFunction &MF = B.getMF(); |
| 1165 | const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); |
| 1166 | const auto &TFI = *ST.getFrameLowering(); |
| 1167 | |
| 1168 | // Guard in case the stack growth direction ever changes with scratch |
| 1169 | // instructions. |
  assert(TFI.getStackGrowthDirection() == TargetFrameLowering::StackGrowsUp &&
         "Stack grows upwards for AMDGPU");
| 1172 | |
  Register Dst = MI.getOperand(0).getReg();
  Register AllocSize = MI.getOperand(1).getReg();
  Align Alignment = assumeAligned(MI.getOperand(2).getImm());
| 1176 | |
  const RegisterBank *SizeBank = getRegBank(AllocSize, MRI, *TRI);
| 1178 | |
| 1179 | if (SizeBank != &AMDGPU::SGPRRegBank) { |
| 1180 | auto WaveReduction = |
        B.buildIntrinsic(Intrinsic::amdgcn_wave_reduce_umax, {LLT::scalar(32)})
            .addUse(AllocSize)
            .addImm(0);
    AllocSize = WaveReduction.getReg(0);
| 1185 | } |
| 1186 | |
  LLT PtrTy = MRI.getType(Dst);
  LLT IntPtrTy = LLT::scalar(PtrTy.getSizeInBits());
| 1189 | |
| 1190 | const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>(); |
| 1191 | Register SPReg = Info->getStackPtrOffsetReg(); |
| 1192 | ApplyRegBankMapping ApplyBank(B, *this, MRI, &AMDGPU::SGPRRegBank); |
| 1193 | |
  auto WaveSize = B.buildConstant(LLT::scalar(32), ST.getWavefrontSizeLog2());
  auto ScaledSize = B.buildShl(IntPtrTy, AllocSize, WaveSize);
| 1196 | |
  auto OldSP = B.buildCopy(PtrTy, SPReg);
| 1198 | if (Alignment > TFI.getStackAlign()) { |
| 1199 | auto StackAlignMask = (Alignment.value() << ST.getWavefrontSizeLog2()) - 1; |
| 1200 | auto Tmp1 = B.buildPtrAdd(Res: PtrTy, Op0: OldSP, |
| 1201 | Op1: B.buildConstant(Res: LLT::scalar(SizeInBits: 32), Val: StackAlignMask)); |
| 1202 | B.buildMaskLowPtrBits(Res: Dst, Op0: Tmp1, |
| 1203 | NumBits: Log2(A: Alignment) + ST.getWavefrontSizeLog2()); |
| 1204 | } else { |
| 1205 | B.buildCopy(Res: Dst, Op: OldSP); |
| 1206 | } |
| 1207 | auto PtrAdd = B.buildPtrAdd(Res: PtrTy, Op0: Dst, Op1: ScaledSize); |
| 1208 | B.buildCopy(Res: SPReg, Op: PtrAdd); |
| 1209 | MI.eraseFromParent(); |
| 1210 | return true; |
| 1211 | } |
| 1212 | |
| 1213 | bool AMDGPURegisterBankInfo::applyMappingImage( |
| 1214 | MachineIRBuilder &B, MachineInstr &MI, |
| 1215 | const AMDGPURegisterBankInfo::OperandsMapper &OpdMapper, |
| 1216 | int RsrcIdx) const { |
| 1217 | const int NumDefs = MI.getNumExplicitDefs(); |
| 1218 | |
| 1219 | // The reported argument index is relative to the IR intrinsic call arguments, |
| 1220 | // so we need to shift by the number of defs and the intrinsic ID. |
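| | // For example, with one def, IR argument index 2 becomes machine operand |
| | // index 2 + 1 (def) + 1 (intrinsic ID) = 4. |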
| 1221 | RsrcIdx += NumDefs + 1; |
| 1222 | |
| 1223 | // Insert copies to VGPR arguments. |
| 1224 | applyDefaultMapping(OpdMapper); |
| 1225 | |
| 1226 | // Fixup any SGPR arguments. |
| 1227 | SmallVector<unsigned, 4> SGPRIndexes; |
| 1228 | for (int I = NumDefs, NumOps = MI.getNumOperands(); I != NumOps; ++I) { |
| 1229 | if (!MI.getOperand(i: I).isReg()) |
| 1230 | continue; |
| 1231 | |
| 1232 | // If this intrinsic has a sampler, it immediately follows rsrc. |
| 1233 | if (I == RsrcIdx || I == RsrcIdx + 1) |
| 1234 | SGPRIndexes.push_back(Elt: I); |
| 1235 | } |
| 1236 | |
| 1237 | executeInWaterfallLoop(B, MI, OpIndices: SGPRIndexes); |
| 1238 | return true; |
| 1239 | } |
| 1240 | |
| 1241 | // Analyze a combined offset from an llvm.amdgcn.s.buffer intrinsic and store |
| 1242 | // the three offsets (voffset, soffset and instoffset) |
| 1243 | unsigned AMDGPURegisterBankInfo::setBufferOffsets( |
| 1244 | MachineIRBuilder &B, Register CombinedOffset, Register &VOffsetReg, |
| 1245 | Register &SOffsetReg, int64_t &InstOffsetVal, Align Alignment) const { |
| 1246 | const LLT S32 = LLT::scalar(SizeInBits: 32); |
| 1247 | MachineRegisterInfo *MRI = B.getMRI(); |
| 1248 | |
| 1249 | if (std::optional<int64_t> Imm = |
| 1250 | getIConstantVRegSExtVal(VReg: CombinedOffset, MRI: *MRI)) { |
| 1251 | uint32_t SOffset, ImmOffset; |
| 1252 | if (TII->splitMUBUFOffset(Imm: *Imm, SOffset, ImmOffset, Alignment)) { |
| 1253 | VOffsetReg = B.buildConstant(Res: S32, Val: 0).getReg(Idx: 0); |
| 1254 | SOffsetReg = B.buildConstant(Res: S32, Val: SOffset).getReg(Idx: 0); |
| 1255 | InstOffsetVal = ImmOffset; |
| 1256 | |
| 1257 | B.getMRI()->setRegBank(Reg: VOffsetReg, RegBank: AMDGPU::VGPRRegBank); |
| 1258 | B.getMRI()->setRegBank(Reg: SOffsetReg, RegBank: AMDGPU::SGPRRegBank); |
| 1259 | return SOffset + ImmOffset; |
| 1260 | } |
| 1261 | } |
| 1262 | |
| 1263 | const bool CheckNUW = Subtarget.hasGFX1250Insts(); |
| 1264 | Register Base; |
| 1265 | unsigned Offset; |
| 1266 | |
| 1267 | std::tie(args&: Base, args&: Offset) = |
| 1268 | AMDGPU::getBaseWithConstantOffset(MRI&: *MRI, Reg: CombinedOffset, |
| 1269 | /*KnownBits=*/ValueTracking: nullptr, |
| 1270 | /*CheckNUW=*/CheckNUW); |
| 1271 | |
| 1272 | uint32_t SOffset, ImmOffset; |
| 1273 | if ((int)Offset > 0 && |
| 1274 | TII->splitMUBUFOffset(Imm: Offset, SOffset, ImmOffset, Alignment)) { |
| 1275 | if (getRegBank(Reg: Base, MRI: *MRI, TRI: *TRI) == &AMDGPU::VGPRRegBank) { |
| 1276 | VOffsetReg = Base; |
| 1277 | SOffsetReg = B.buildConstant(Res: S32, Val: SOffset).getReg(Idx: 0); |
| 1278 | B.getMRI()->setRegBank(Reg: SOffsetReg, RegBank: AMDGPU::SGPRRegBank); |
| 1279 | InstOffsetVal = ImmOffset; |
| 1280 | return 0; // XXX - Why is this 0? |
| 1281 | } |
| 1282 | |
| 1283 | // If we have SGPR base, we can use it for soffset. |
| 1284 | if (SOffset == 0) { |
| 1285 | VOffsetReg = B.buildConstant(Res: S32, Val: 0).getReg(Idx: 0); |
| 1286 | B.getMRI()->setRegBank(Reg: VOffsetReg, RegBank: AMDGPU::VGPRRegBank); |
| 1287 | SOffsetReg = Base; |
| 1288 | InstOffsetVal = ImmOffset; |
| 1289 | return 0; // XXX - Why is this 0? |
| 1290 | } |
| 1291 | } |
| 1292 | |
| 1293 | // Handle the variable sgpr + vgpr case. |
| 1294 | MachineInstr *Add = getOpcodeDef(Opcode: AMDGPU::G_ADD, Reg: CombinedOffset, MRI: *MRI); |
| 1295 | if (Add && (int)Offset >= 0 && |
| 1296 | (!CheckNUW || Add->getFlag(Flag: MachineInstr::NoUWrap))) { |
| 1297 | Register Src0 = getSrcRegIgnoringCopies(Reg: Add->getOperand(i: 1).getReg(), MRI: *MRI); |
| 1298 | Register Src1 = getSrcRegIgnoringCopies(Reg: Add->getOperand(i: 2).getReg(), MRI: *MRI); |
| 1299 | |
| 1300 | const RegisterBank *Src0Bank = getRegBank(Reg: Src0, MRI: *MRI, TRI: *TRI); |
| 1301 | const RegisterBank *Src1Bank = getRegBank(Reg: Src1, MRI: *MRI, TRI: *TRI); |
| 1302 | |
| 1303 | if (Src0Bank == &AMDGPU::VGPRRegBank && Src1Bank == &AMDGPU::SGPRRegBank) { |
| 1304 | VOffsetReg = Src0; |
| 1305 | SOffsetReg = Src1; |
| 1306 | return 0; |
| 1307 | } |
| 1308 | |
| 1309 | if (Src0Bank == &AMDGPU::SGPRRegBank && Src1Bank == &AMDGPU::VGPRRegBank) { |
| 1310 | VOffsetReg = Src1; |
| 1311 | SOffsetReg = Src0; |
| 1312 | return 0; |
| 1313 | } |
| 1314 | } |
| 1315 | |
| 1316 | // Ensure we have a VGPR for the combined offset. This could be an issue if we |
| 1317 | // have an SGPR offset and a VGPR resource. |
| 1318 | if (getRegBank(Reg: CombinedOffset, MRI: *MRI, TRI: *TRI) == &AMDGPU::VGPRRegBank) { |
| 1319 | VOffsetReg = CombinedOffset; |
| 1320 | } else { |
| 1321 | VOffsetReg = B.buildCopy(Res: S32, Op: CombinedOffset).getReg(Idx: 0); |
| 1322 | B.getMRI()->setRegBank(Reg: VOffsetReg, RegBank: AMDGPU::VGPRRegBank); |
| 1323 | } |
| 1324 | |
| 1325 | SOffsetReg = B.buildConstant(Res: S32, Val: 0).getReg(Idx: 0); |
| 1326 | B.getMRI()->setRegBank(Reg: SOffsetReg, RegBank: AMDGPU::SGPRRegBank); |
| 1327 | return 0; |
| 1328 | } |
| 1329 | |
| 1330 | static unsigned getSBufferLoadCorrespondingBufferLoadOpcode(unsigned Opc) { |
| 1331 | switch (Opc) { |
| 1332 | case AMDGPU::G_AMDGPU_S_BUFFER_LOAD: |
| 1333 | return AMDGPU::G_AMDGPU_BUFFER_LOAD; |
| 1334 | case AMDGPU::G_AMDGPU_S_BUFFER_LOAD_UBYTE: |
| 1335 | return AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE; |
| 1336 | case AMDGPU::G_AMDGPU_S_BUFFER_LOAD_SBYTE: |
| 1337 | return AMDGPU::G_AMDGPU_BUFFER_LOAD_SBYTE; |
| 1338 | case AMDGPU::G_AMDGPU_S_BUFFER_LOAD_USHORT: |
| 1339 | return AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT; |
| 1340 | case AMDGPU::G_AMDGPU_S_BUFFER_LOAD_SSHORT: |
| 1341 | return AMDGPU::G_AMDGPU_BUFFER_LOAD_SSHORT; |
| 1342 | default: |
| 1343 | break; |
| 1344 | } |
| 1345 | llvm_unreachable("Unexpected s_buffer_load opcode" ); |
| 1346 | } |
| 1347 | |
| 1348 | bool AMDGPURegisterBankInfo::applyMappingSBufferLoad( |
| 1349 | MachineIRBuilder &B, const OperandsMapper &OpdMapper) const { |
| 1350 | MachineInstr &MI = OpdMapper.getMI(); |
| 1351 | MachineRegisterInfo &MRI = OpdMapper.getMRI(); |
| 1352 | |
| 1353 | const LLT S32 = LLT::scalar(SizeInBits: 32); |
| 1354 | Register Dst = MI.getOperand(i: 0).getReg(); |
| 1355 | LLT Ty = MRI.getType(Reg: Dst); |
| 1356 | |
| 1357 | const RegisterBank *RSrcBank = |
| 1358 | OpdMapper.getInstrMapping().getOperandMapping(i: 1).BreakDown[0].RegBank; |
| 1359 | const RegisterBank *OffsetBank = |
| 1360 | OpdMapper.getInstrMapping().getOperandMapping(i: 2).BreakDown[0].RegBank; |
| 1361 | if (RSrcBank == &AMDGPU::SGPRRegBank && |
| 1362 | OffsetBank == &AMDGPU::SGPRRegBank) |
| 1363 | return true; // Legal mapping |
| 1364 | |
| 1365 | // FIXME: 96-bit case was widened during legalize. We need to narrow it back |
| 1366 | // here but don't have an MMO. |
| 1367 | |
| 1368 | unsigned LoadSize = Ty.getSizeInBits(); |
| 1369 | int NumLoads = 1; |
| 1370 | if (LoadSize == 256 || LoadSize == 512) { |
| 1371 | NumLoads = LoadSize / 128; |
| 1372 | Ty = Ty.divide(Factor: NumLoads); |
| 1373 | } |
| 1374 | |
| 1375 | // Use the alignment to ensure that the required offsets will fit into the |
| 1376 | // immediate offsets. |
| 1377 | const Align Alignment = NumLoads > 1 ? Align(16 * NumLoads) : Align(1); |
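| | // For instance, a 512-bit result becomes four 128-bit pieces loaded at |
| | // ImmOffset + 0, +16, +32 and +48 below, so the offset split is computed |
| | // with 64-byte alignment to keep each piece's immediate offset valid. |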
| 1378 | |
| 1379 | MachineFunction &MF = B.getMF(); |
| 1380 | |
| 1381 | Register SOffset; |
| 1382 | Register VOffset; |
| 1383 | int64_t ImmOffset = 0; |
| 1384 | |
| 1385 | unsigned MMOOffset = setBufferOffsets(B, CombinedOffset: MI.getOperand(i: 2).getReg(), VOffsetReg&: VOffset, |
| 1386 | SOffsetReg&: SOffset, InstOffsetVal&: ImmOffset, Alignment); |
| 1387 | |
| 1388 | // TODO: 96-bit loads were widened to 128-bit results. Shrink the result if we |
| 1389 | // can, but we need to track an MMO for that. |
| 1390 | const unsigned MemSize = (Ty.getSizeInBits() + 7) / 8; |
| 1391 | const Align MemAlign(4); // FIXME: ABI type alignment? |
| 1392 | MachineMemOperand *BaseMMO = MF.getMachineMemOperand( |
| 1393 | PtrInfo: MachinePointerInfo(), |
| 1394 | F: MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable | |
| 1395 | MachineMemOperand::MOInvariant, |
| 1396 | Size: MemSize, BaseAlignment: MemAlign); |
| 1397 | if (MMOOffset != 0) |
| 1398 | BaseMMO = MF.getMachineMemOperand(MMO: BaseMMO, Offset: MMOOffset, Size: MemSize); |
| 1399 | |
| 1400 | // If only the offset is divergent, emit a MUBUF buffer load instead. We can |
| 1401 | // assume that the buffer is unswizzled. |
| 1402 | |
| 1403 | Register RSrc = MI.getOperand(i: 1).getReg(); |
| 1404 | Register VIndex = B.buildConstant(Res: S32, Val: 0).getReg(Idx: 0); |
| 1405 | B.getMRI()->setRegBank(Reg: VIndex, RegBank: AMDGPU::VGPRRegBank); |
| 1406 | |
| 1407 | SmallVector<Register, 4> LoadParts(NumLoads); |
| 1408 | |
| 1409 | MachineBasicBlock::iterator MII = MI.getIterator(); |
| 1410 | MachineInstrSpan Span(MII, &B.getMBB()); |
| 1411 | |
| 1412 | for (int i = 0; i < NumLoads; ++i) { |
| 1413 | if (NumLoads == 1) { |
| 1414 | LoadParts[i] = Dst; |
| 1415 | } else { |
| 1416 | LoadParts[i] = MRI.createGenericVirtualRegister(Ty); |
| 1417 | MRI.setRegBank(Reg: LoadParts[i], RegBank: AMDGPU::VGPRRegBank); |
| 1418 | } |
| 1419 | |
| 1420 | MachineMemOperand *MMO = BaseMMO; |
| 1421 | if (i != 0) |
| 1422 | BaseMMO = MF.getMachineMemOperand(MMO: BaseMMO, Offset: MMOOffset + 16 * i, Size: MemSize); |
| 1423 | |
| 1424 | B.buildInstr(Opcode: getSBufferLoadCorrespondingBufferLoadOpcode(Opc: MI.getOpcode())) |
| 1425 | .addDef(RegNo: LoadParts[i]) // vdata |
| 1426 | .addUse(RegNo: RSrc) // rsrc |
| 1427 | .addUse(RegNo: VIndex) // vindex |
| 1428 | .addUse(RegNo: VOffset) // voffset |
| 1429 | .addUse(RegNo: SOffset) // soffset |
| 1430 | .addImm(Val: ImmOffset + 16 * i) // offset(imm) |
| 1431 | .addImm(Val: 0) // cachepolicy, swizzled buffer(imm) |
| 1432 | .addImm(Val: 0) // idxen(imm) |
| 1433 | .addMemOperand(MMO); |
| 1434 | } |
| 1435 | |
| 1436 | // TODO: If only the resource is a VGPR, it may be better to execute the |
| 1437 | // scalar load in the waterfall loop if the resource is expected to frequently |
| 1438 | // be dynamically uniform. |
| 1439 | if (RSrcBank != &AMDGPU::SGPRRegBank) { |
| 1440 | // Remove the original instruction to avoid potentially confusing the |
| 1441 | // waterfall loop logic. |
| 1442 | B.setInstr(*Span.begin()); |
| 1443 | MI.eraseFromParent(); |
| 1444 | |
| 1445 | SmallSet<Register, 4> OpsToWaterfall; |
| 1446 | |
| 1447 | OpsToWaterfall.insert(V: RSrc); |
| 1448 | executeInWaterfallLoop(B, Range: make_range(x: Span.begin(), y: Span.end()), |
| 1449 | SGPROperandRegs&: OpsToWaterfall); |
| 1450 | } |
| 1451 | |
| 1452 | if (NumLoads != 1) { |
| 1453 | if (Ty.isVector()) |
| 1454 | B.buildConcatVectors(Res: Dst, Ops: LoadParts); |
| 1455 | else |
| 1456 | B.buildMergeLikeInstr(Res: Dst, Ops: LoadParts); |
| 1457 | } |
| 1458 | |
| 1459 | // The original instruction was already erased above in the waterfall loop case. |
| 1460 | if (RSrcBank == &AMDGPU::SGPRRegBank) |
| 1461 | MI.eraseFromParent(); |
| 1462 | |
| 1463 | return true; |
| 1464 | } |
| 1465 | |
| 1466 | bool AMDGPURegisterBankInfo::applyMappingBFE(MachineIRBuilder &B, |
| 1467 | const OperandsMapper &OpdMapper, |
| 1468 | bool Signed) const { |
| 1469 | MachineInstr &MI = OpdMapper.getMI(); |
| 1470 | MachineRegisterInfo &MRI = OpdMapper.getMRI(); |
| 1471 | |
| 1472 | // Insert basic copies |
| 1473 | applyDefaultMapping(OpdMapper); |
| 1474 | |
| 1475 | Register DstReg = MI.getOperand(i: 0).getReg(); |
| 1476 | LLT Ty = MRI.getType(Reg: DstReg); |
| 1477 | |
| 1478 | const LLT S32 = LLT::scalar(SizeInBits: 32); |
| 1479 | |
| 1480 | unsigned FirstOpnd = isa<GIntrinsic>(Val: MI) ? 2 : 1; |
| 1481 | Register SrcReg = MI.getOperand(i: FirstOpnd).getReg(); |
| 1482 | Register OffsetReg = MI.getOperand(i: FirstOpnd + 1).getReg(); |
| 1483 | Register WidthReg = MI.getOperand(i: FirstOpnd + 2).getReg(); |
| 1484 | |
| 1485 | const RegisterBank *DstBank = |
| 1486 | OpdMapper.getInstrMapping().getOperandMapping(i: 0).BreakDown[0].RegBank; |
| 1487 | if (DstBank == &AMDGPU::VGPRRegBank) { |
| 1488 | if (Ty == S32) |
| 1489 | return true; |
| 1490 | |
| 1491 | // There are no 64-bit VGPR bitfield extract instructions, so the operation |
| 1492 | // is expanded into a sequence of instructions that implements it. |
| 1493 | ApplyRegBankMapping ApplyBank(B, *this, MRI, &AMDGPU::VGPRRegBank); |
| 1494 | |
| 1495 | const LLT S64 = LLT::scalar(SizeInBits: 64); |
| 1496 | // Shift the source operand so that extracted bits start at bit 0. |
| 1497 | auto ShiftOffset = Signed ? B.buildAShr(Dst: S64, Src0: SrcReg, Src1: OffsetReg) |
| 1498 | : B.buildLShr(Dst: S64, Src0: SrcReg, Src1: OffsetReg); |
| 1499 | auto UnmergeSOffset = B.buildUnmerge(Res: {S32, S32}, Op: ShiftOffset); |
| 1500 | |
| 1501 | // A 64-bit bitfield extract uses the 32-bit bitfield extract instructions |
| 1502 | // if the width is a constant. |
| 1503 | if (auto ConstWidth = getIConstantVRegValWithLookThrough(VReg: WidthReg, MRI)) { |
| 1504 | // Use the 32-bit bitfield extract instruction if the width is a constant. |
| 1505 | // Depending on the width size, use either the low or high 32-bits. |
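| | // For example, a constant width of 40 keeps the low half of the shifted |
| | // source as-is and extracts the remaining 40 - 32 = 8 bits from the high half. |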
| 1506 | auto Zero = B.buildConstant(Res: S32, Val: 0); |
| 1507 | auto WidthImm = ConstWidth->Value.getZExtValue(); |
| 1508 | if (WidthImm <= 32) { |
| 1509 | // Use bitfield extract on the lower 32-bit source, and then sign-extend |
| 1510 | // or clear the upper 32-bits. |
| 1511 | auto Extract = |
| 1512 | Signed ? B.buildSbfx(Dst: S32, Src: UnmergeSOffset.getReg(Idx: 0), LSB: Zero, Width: WidthReg) |
| 1513 | : B.buildUbfx(Dst: S32, Src: UnmergeSOffset.getReg(Idx: 0), LSB: Zero, Width: WidthReg); |
| 1514 | auto Extend = |
| 1515 | Signed ? B.buildAShr(Dst: S32, Src0: Extract, Src1: B.buildConstant(Res: S32, Val: 31)) : Zero; |
| 1516 | B.buildMergeLikeInstr(Res: DstReg, Ops: {Extract, Extend}); |
| 1517 | } else { |
| 1518 | // Use bitfield extract on upper 32-bit source, and combine with lower |
| 1519 | // 32-bit source. |
| 1520 | auto UpperWidth = B.buildConstant(Res: S32, Val: WidthImm - 32); |
| 1521 | auto Extract = |
| 1522 | Signed |
| 1523 | ? B.buildSbfx(Dst: S32, Src: UnmergeSOffset.getReg(Idx: 1), LSB: Zero, Width: UpperWidth) |
| 1524 | : B.buildUbfx(Dst: S32, Src: UnmergeSOffset.getReg(Idx: 1), LSB: Zero, Width: UpperWidth); |
| 1525 | B.buildMergeLikeInstr(Res: DstReg, Ops: {UnmergeSOffset.getReg(Idx: 0), Extract}); |
| 1526 | } |
| 1527 | MI.eraseFromParent(); |
| 1528 | return true; |
| 1529 | } |
| 1530 | |
| 1531 | // Expand to Src >> Offset << (64 - Width) >> (64 - Width) using 64-bit |
| 1532 | // operations. |
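| | // For example, with Offset = 4 and Width = 20: shift right by 4, left by |
| | // 64 - 20 = 44, then right by 44 again (arithmetic if signed, logical |
| | // otherwise), leaving the 20 extracted bits sign- or zero-extended. |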
| 1533 | auto ExtShift = B.buildSub(Dst: S32, Src0: B.buildConstant(Res: S32, Val: 64), Src1: WidthReg); |
| 1534 | auto SignBit = B.buildShl(Dst: S64, Src0: ShiftOffset, Src1: ExtShift); |
| 1535 | if (Signed) |
| 1536 | B.buildAShr(Dst: S64, Src0: SignBit, Src1: ExtShift); |
| 1537 | else |
| 1538 | B.buildLShr(Dst: S64, Src0: SignBit, Src1: ExtShift); |
| 1539 | MI.eraseFromParent(); |
| 1540 | return true; |
| 1541 | } |
| 1542 | |
| 1543 | // The scalar form packs the offset and width in a single operand. |
| 1544 | |
| 1545 | ApplyRegBankMapping ApplyBank(B, *this, MRI, &AMDGPU::SGPRRegBank); |
| 1546 | |
| 1547 | // Ensure the high bits are clear to insert the offset. |
| 1548 | auto OffsetMask = B.buildConstant(Res: S32, Val: maskTrailingOnes<unsigned>(N: 6)); |
| 1549 | auto ClampOffset = B.buildAnd(Dst: S32, Src0: OffsetReg, Src1: OffsetMask); |
| 1550 | |
| 1551 | // The shift zeroes out the low bits, so don't bother clamping the width input. |
| 1552 | auto ShiftWidth = B.buildShl(Dst: S32, Src0: WidthReg, Src1: B.buildConstant(Res: S32, Val: 16)); |
| 1553 | |
| 1554 | // Pack the offset and width of the BFE into the format expected by |
| 1555 | // S_BFE_I32 / S_BFE_U32: in the second source operand, bits [5:0] hold the |
| 1556 | // offset and bits [22:16] the width. |
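| | // For instance, an offset of 8 and a width of 16 pack to |
| | // (16 << 16) | 8 = 0x00100008. |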
| 1557 | auto MergedInputs = B.buildOr(Dst: S32, Src0: ClampOffset, Src1: ShiftWidth); |
| 1558 | |
| 1559 | // TODO: It might be worth using a pseudo here to avoid scc clobber and |
| 1560 | // register class constraints. |
| 1561 | unsigned Opc = Ty == S32 ? (Signed ? AMDGPU::S_BFE_I32 : AMDGPU::S_BFE_U32) : |
| 1562 | (Signed ? AMDGPU::S_BFE_I64 : AMDGPU::S_BFE_U64); |
| 1563 | |
| 1564 | auto MIB = B.buildInstr(Opc, DstOps: {DstReg}, SrcOps: {SrcReg, MergedInputs}); |
| 1565 | if (!constrainSelectedInstRegOperands(I&: *MIB, TII: *TII, TRI: *TRI, RBI: *this)) |
| 1566 | llvm_unreachable("failed to constrain BFE" ); |
| 1567 | |
| 1568 | MI.eraseFromParent(); |
| 1569 | return true; |
| 1570 | } |
| 1571 | |
| 1572 | bool AMDGPURegisterBankInfo::applyMappingMAD_64_32( |
| 1573 | MachineIRBuilder &B, const OperandsMapper &OpdMapper) const { |
| 1574 | MachineInstr &MI = OpdMapper.getMI(); |
| 1575 | MachineRegisterInfo &MRI = OpdMapper.getMRI(); |
| 1576 | |
| 1577 | // Insert basic copies. |
| 1578 | applyDefaultMapping(OpdMapper); |
| 1579 | |
| 1580 | Register Dst0 = MI.getOperand(i: 0).getReg(); |
| 1581 | Register Dst1 = MI.getOperand(i: 1).getReg(); |
| 1582 | Register Src0 = MI.getOperand(i: 2).getReg(); |
| 1583 | Register Src1 = MI.getOperand(i: 3).getReg(); |
| 1584 | Register Src2 = MI.getOperand(i: 4).getReg(); |
| 1585 | |
| 1586 | if (MRI.getRegBankOrNull(Reg: Src0) == &AMDGPU::VGPRRegBank) |
| 1587 | return true; |
| 1588 | |
| 1589 | bool IsUnsigned = MI.getOpcode() == AMDGPU::G_AMDGPU_MAD_U64_U32; |
| 1590 | LLT S1 = LLT::scalar(SizeInBits: 1); |
| 1591 | LLT S32 = LLT::scalar(SizeInBits: 32); |
| 1592 | |
| 1593 | bool DstOnValu = MRI.getRegBankOrNull(Reg: Src2) == &AMDGPU::VGPRRegBank; |
| 1594 | bool Accumulate = true; |
| 1595 | |
| 1596 | if (!DstOnValu) { |
| 1597 | if (mi_match(R: Src2, MRI, P: m_ZeroInt())) |
| 1598 | Accumulate = false; |
| 1599 | } |
| 1600 | |
| 1601 | // Keep the multiplication on the SALU. |
| 1602 | Register DstHi; |
| 1603 | Register DstLo = B.buildMul(Dst: S32, Src0, Src1).getReg(Idx: 0); |
| 1604 | bool MulHiInVgpr = false; |
| 1605 | |
| 1606 | MRI.setRegBank(Reg: DstLo, RegBank: AMDGPU::SGPRRegBank); |
| 1607 | |
| 1608 | if (Subtarget.hasSMulHi()) { |
| 1609 | DstHi = IsUnsigned ? B.buildUMulH(Dst: S32, Src0, Src1).getReg(Idx: 0) |
| 1610 | : B.buildSMulH(Dst: S32, Src0, Src1).getReg(Idx: 0); |
| 1611 | MRI.setRegBank(Reg: DstHi, RegBank: AMDGPU::SGPRRegBank); |
| 1612 | } else { |
| 1613 | Register VSrc0 = B.buildCopy(Res: S32, Op: Src0).getReg(Idx: 0); |
| 1614 | Register VSrc1 = B.buildCopy(Res: S32, Op: Src1).getReg(Idx: 0); |
| 1615 | |
| 1616 | MRI.setRegBank(Reg: VSrc0, RegBank: AMDGPU::VGPRRegBank); |
| 1617 | MRI.setRegBank(Reg: VSrc1, RegBank: AMDGPU::VGPRRegBank); |
| 1618 | |
| 1619 | DstHi = IsUnsigned ? B.buildUMulH(Dst: S32, Src0: VSrc0, Src1: VSrc1).getReg(Idx: 0) |
| 1620 | : B.buildSMulH(Dst: S32, Src0: VSrc0, Src1: VSrc1).getReg(Idx: 0); |
| 1621 | MRI.setRegBank(Reg: DstHi, RegBank: AMDGPU::VGPRRegBank); |
| 1622 | |
| 1623 | if (!DstOnValu) { |
| 1624 | DstHi = buildReadFirstLane(B, MRI, Src: DstHi); |
| 1625 | } else { |
| 1626 | MulHiInVgpr = true; |
| 1627 | } |
| 1628 | } |
| 1629 | |
| 1630 | // Accumulate and produce the "carry-out" bit. |
| 1631 | // |
| 1632 | // The "carry-out" is defined as bit 64 of the result when computed as a |
| 1633 | // big integer. For unsigned multiply-add, this matches the usual definition |
| 1634 | // of carry-out. For signed multiply-add, bit 64 is the sign bit of the |
| 1635 | // result, which is determined as: |
| 1636 | // sign(Src0 * Src1) + sign(Src2) + carry-out from unsigned 64-bit add |
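| | // Since bit 64 is a single bit, the sum of those three terms modulo 2 is |
| | // simply their XOR, which is how the signed case combines them below. |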
| 1637 | LLT CarryType = DstOnValu ? S1 : S32; |
| 1638 | const RegisterBank &CarryBank = |
| 1639 | DstOnValu ? AMDGPU::VCCRegBank : AMDGPU::SGPRRegBank; |
| 1640 | const RegisterBank &DstBank = |
| 1641 | DstOnValu ? AMDGPU::VGPRRegBank : AMDGPU::SGPRRegBank; |
| 1642 | Register Carry; |
| 1643 | Register Zero; |
| 1644 | |
| 1645 | if (!IsUnsigned) { |
| 1646 | Zero = B.buildConstant(Res: S32, Val: 0).getReg(Idx: 0); |
| 1647 | MRI.setRegBank(Reg: Zero, |
| 1648 | RegBank: MulHiInVgpr ? AMDGPU::VGPRRegBank : AMDGPU::SGPRRegBank); |
| 1649 | |
| 1650 | Carry = B.buildICmp(Pred: CmpInst::ICMP_SLT, Res: MulHiInVgpr ? S1 : S32, Op0: DstHi, Op1: Zero) |
| 1651 | .getReg(Idx: 0); |
| 1652 | MRI.setRegBank(Reg: Carry, RegBank: MulHiInVgpr ? AMDGPU::VCCRegBank |
| 1653 | : AMDGPU::SGPRRegBank); |
| 1654 | |
| 1655 | if (DstOnValu && !MulHiInVgpr) { |
| 1656 | Carry = B.buildTrunc(Res: S1, Op: Carry).getReg(Idx: 0); |
| 1657 | MRI.setRegBank(Reg: Carry, RegBank: AMDGPU::VCCRegBank); |
| 1658 | } |
| 1659 | } |
| 1660 | |
| 1661 | if (Accumulate) { |
| 1662 | if (DstOnValu) { |
| 1663 | DstLo = B.buildCopy(Res: S32, Op: DstLo).getReg(Idx: 0); |
| 1664 | DstHi = B.buildCopy(Res: S32, Op: DstHi).getReg(Idx: 0); |
| 1665 | MRI.setRegBank(Reg: DstLo, RegBank: AMDGPU::VGPRRegBank); |
| 1666 | MRI.setRegBank(Reg: DstHi, RegBank: AMDGPU::VGPRRegBank); |
| 1667 | } |
| 1668 | |
| 1669 | auto Unmerge = B.buildUnmerge(Res: S32, Op: Src2); |
| 1670 | Register Src2Lo = Unmerge.getReg(Idx: 0); |
| 1671 | Register Src2Hi = Unmerge.getReg(Idx: 1); |
| 1672 | MRI.setRegBank(Reg: Src2Lo, RegBank: DstBank); |
| 1673 | MRI.setRegBank(Reg: Src2Hi, RegBank: DstBank); |
| 1674 | |
| 1675 | if (!IsUnsigned) { |
| 1676 | auto Src2Sign = B.buildICmp(Pred: CmpInst::ICMP_SLT, Res: CarryType, Op0: Src2Hi, Op1: Zero); |
| 1677 | MRI.setRegBank(Reg: Src2Sign.getReg(Idx: 0), RegBank: CarryBank); |
| 1678 | |
| 1679 | Carry = B.buildXor(Dst: CarryType, Src0: Carry, Src1: Src2Sign).getReg(Idx: 0); |
| 1680 | MRI.setRegBank(Reg: Carry, RegBank: CarryBank); |
| 1681 | } |
| 1682 | |
| 1683 | auto AddLo = B.buildUAddo(Res: S32, CarryOut: CarryType, Op0: DstLo, Op1: Src2Lo); |
| 1684 | DstLo = AddLo.getReg(Idx: 0); |
| 1685 | Register CarryLo = AddLo.getReg(Idx: 1); |
| 1686 | MRI.setRegBank(Reg: DstLo, RegBank: DstBank); |
| 1687 | MRI.setRegBank(Reg: CarryLo, RegBank: CarryBank); |
| 1688 | |
| 1689 | auto AddHi = B.buildUAdde(Res: S32, CarryOut: CarryType, Op0: DstHi, Op1: Src2Hi, CarryIn: CarryLo); |
| 1690 | DstHi = AddHi.getReg(Idx: 0); |
| 1691 | MRI.setRegBank(Reg: DstHi, RegBank: DstBank); |
| 1692 | |
| 1693 | Register CarryHi = AddHi.getReg(Idx: 1); |
| 1694 | MRI.setRegBank(Reg: CarryHi, RegBank: CarryBank); |
| 1695 | |
| 1696 | if (IsUnsigned) { |
| 1697 | Carry = CarryHi; |
| 1698 | } else { |
| 1699 | Carry = B.buildXor(Dst: CarryType, Src0: Carry, Src1: CarryHi).getReg(Idx: 0); |
| 1700 | MRI.setRegBank(Reg: Carry, RegBank: CarryBank); |
| 1701 | } |
| 1702 | } else { |
| 1703 | if (IsUnsigned) { |
| 1704 | Carry = B.buildConstant(Res: CarryType, Val: 0).getReg(Idx: 0); |
| 1705 | MRI.setRegBank(Reg: Carry, RegBank: CarryBank); |
| 1706 | } |
| 1707 | } |
| 1708 | |
| 1709 | B.buildMergeLikeInstr(Res: Dst0, Ops: {DstLo, DstHi}); |
| 1710 | |
| 1711 | if (DstOnValu) { |
| 1712 | B.buildCopy(Res: Dst1, Op: Carry); |
| 1713 | } else { |
| 1714 | B.buildTrunc(Res: Dst1, Op: Carry); |
| 1715 | } |
| 1716 | |
| 1717 | MI.eraseFromParent(); |
| 1718 | return true; |
| 1719 | } |
| 1720 | |
| 1721 | // Return a suitable opcode for extending the operands of Opc when widening. |
| 1722 | static unsigned getExtendOp(unsigned Opc) { |
| 1723 | switch (Opc) { |
| 1724 | case TargetOpcode::G_ASHR: |
| 1725 | case TargetOpcode::G_SMIN: |
| 1726 | case TargetOpcode::G_SMAX: |
| 1727 | return TargetOpcode::G_SEXT; |
| 1728 | case TargetOpcode::G_LSHR: |
| 1729 | case TargetOpcode::G_UMIN: |
| 1730 | case TargetOpcode::G_UMAX: |
| 1731 | return TargetOpcode::G_ZEXT; |
| 1732 | default: |
| 1733 | return TargetOpcode::G_ANYEXT; |
| 1734 | } |
| 1735 | } |
| 1736 | |
| 1737 | // Emit a legalized extension from <2 x s16> to 2 32-bit components, avoiding |
| 1738 | // any illegal vector extend or unmerge operations. |
| 1739 | static std::pair<Register, Register> |
| 1740 | unpackV2S16ToS32(MachineIRBuilder &B, Register Src, unsigned ExtOpcode) { |
| 1741 | const LLT S32 = LLT::scalar(SizeInBits: 32); |
| 1742 | auto Bitcast = B.buildBitcast(Dst: S32, Src); |
| 1743 | |
| 1744 | if (ExtOpcode == TargetOpcode::G_SEXT) { |
| 1745 | auto ExtLo = B.buildSExtInReg(Res: S32, Op: Bitcast, ImmOp: 16); |
| 1746 | auto ShiftHi = B.buildAShr(Dst: S32, Src0: Bitcast, Src1: B.buildConstant(Res: S32, Val: 16)); |
| 1747 | return std::pair(ExtLo.getReg(Idx: 0), ShiftHi.getReg(Idx: 0)); |
| 1748 | } |
| 1749 | |
| 1750 | auto ShiftHi = B.buildLShr(Dst: S32, Src0: Bitcast, Src1: B.buildConstant(Res: S32, Val: 16)); |
| 1751 | if (ExtOpcode == TargetOpcode::G_ZEXT) { |
| 1752 | auto ExtLo = B.buildAnd(Dst: S32, Src0: Bitcast, Src1: B.buildConstant(Res: S32, Val: 0xffff)); |
| 1753 | return std::pair(ExtLo.getReg(Idx: 0), ShiftHi.getReg(Idx: 0)); |
| 1754 | } |
| 1755 | |
| 1756 | assert(ExtOpcode == TargetOpcode::G_ANYEXT); |
| 1757 | return std::pair(Bitcast.getReg(Idx: 0), ShiftHi.getReg(Idx: 0)); |
| 1758 | } |
| 1759 | |
| 1760 | // For cases where only a single copy is inserted for matching register banks, |
| 1761 | // replace the register in the instruction operand. |
| 1762 | static bool substituteSimpleCopyRegs( |
| 1763 | const AMDGPURegisterBankInfo::OperandsMapper &OpdMapper, unsigned OpIdx) { |
| 1764 | SmallVector<unsigned, 1> SrcReg(OpdMapper.getVRegs(OpIdx)); |
| 1765 | if (!SrcReg.empty()) { |
| 1766 | assert(SrcReg.size() == 1); |
| 1767 | OpdMapper.getMI().getOperand(i: OpIdx).setReg(SrcReg[0]); |
| 1768 | return true; |
| 1769 | } |
| 1770 | |
| 1771 | return false; |
| 1772 | } |
| 1773 | |
| 1774 | /// Handle register layout difference for f16 images for some subtargets. |
| 1775 | Register AMDGPURegisterBankInfo::handleD16VData(MachineIRBuilder &B, |
| 1776 | MachineRegisterInfo &MRI, |
| 1777 | Register Reg) const { |
| 1778 | if (!Subtarget.hasUnpackedD16VMem()) |
| 1779 | return Reg; |
| 1780 | |
| 1781 | const LLT S16 = LLT::scalar(SizeInBits: 16); |
| 1782 | LLT StoreVT = MRI.getType(Reg); |
| 1783 | if (!StoreVT.isVector() || StoreVT.getElementType() != S16) |
| 1784 | return Reg; |
| 1785 | |
| 1786 | auto Unmerge = B.buildUnmerge(Res: S16, Op: Reg); |
| 1787 | |
| 1789 | SmallVector<Register, 4> WideRegs; |
| 1790 | for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I) |
| 1791 | WideRegs.push_back(Elt: Unmerge.getReg(Idx: I)); |
| 1792 | |
| 1793 | const LLT S32 = LLT::scalar(SizeInBits: 32); |
| 1794 | int NumElts = StoreVT.getNumElements(); |
| 1795 | |
| 1796 | return B.buildMergeLikeInstr(Res: LLT::fixed_vector(NumElements: NumElts, ScalarTy: S32), Ops: WideRegs) |
| 1797 | .getReg(Idx: 0); |
| 1798 | } |
| 1799 | |
| 1800 | static std::pair<Register, unsigned> |
| 1801 | getBaseWithConstantOffset(MachineRegisterInfo &MRI, Register Reg) { |
| 1802 | int64_t Const; |
| 1803 | if (mi_match(R: Reg, MRI, P: m_ICst(Cst&: Const))) |
| 1804 | return std::pair(Register(), Const); |
| 1805 | |
| 1806 | Register Base; |
| 1807 | if (mi_match(R: Reg, MRI, P: m_GAdd(L: m_Reg(R&: Base), R: m_ICst(Cst&: Const)))) |
| 1808 | return std::pair(Base, Const); |
| 1809 | |
| 1810 | // TODO: Handle G_OR used for add case |
| 1811 | return std::pair(Reg, 0); |
| 1812 | } |
| 1813 | |
| 1814 | std::pair<Register, unsigned> |
| 1815 | AMDGPURegisterBankInfo::splitBufferOffsets(MachineIRBuilder &B, |
| 1816 | Register OrigOffset) const { |
| 1817 | const unsigned MaxImm = SIInstrInfo::getMaxMUBUFImmOffset(ST: Subtarget); |
| 1818 | Register BaseReg; |
| 1819 | unsigned ImmOffset; |
| 1820 | const LLT S32 = LLT::scalar(SizeInBits: 32); |
| 1821 | |
| 1822 | // TODO: Use AMDGPU::getBaseWithConstantOffset() instead. |
| 1823 | std::tie(args&: BaseReg, args&: ImmOffset) = getBaseWithConstantOffset(MRI&: *B.getMRI(), |
| 1824 | Reg: OrigOffset); |
| 1825 | |
| 1826 | unsigned C1 = 0; |
| 1827 | if (ImmOffset != 0) { |
| 1828 | // If the immediate value is too big for the immoffset field, put only bits |
| 1829 | // that would normally fit in the immoffset field. The remaining value that |
| 1830 | // is copied/added for the voffset field is a large power of 2, and it |
| 1831 | // stands more chance of being CSEd with the copy/add for another similar |
| 1832 | // load/store. |
| 1833 | // However, do not do that rounding down if that is a negative |
| 1834 | // number, as it appears to be illegal to have a negative offset in the |
| 1835 | // vgpr, even if adding the immediate offset makes it positive. |
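| | // Illustration (assuming a 4095 maximum immediate, which is subtarget |
| | // dependent): an offset of 4100 keeps ImmOffset = 4 and folds the remaining |
| | // 4096 into the base register. |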
| 1836 | unsigned Overflow = ImmOffset & ~MaxImm; |
| 1837 | ImmOffset -= Overflow; |
| 1838 | if ((int32_t)Overflow < 0) { |
| 1839 | Overflow += ImmOffset; |
| 1840 | ImmOffset = 0; |
| 1841 | } |
| 1842 | |
| 1843 | C1 = ImmOffset; |
| 1844 | if (Overflow != 0) { |
| 1845 | if (!BaseReg) |
| 1846 | BaseReg = B.buildConstant(Res: S32, Val: Overflow).getReg(Idx: 0); |
| 1847 | else { |
| 1848 | auto OverflowVal = B.buildConstant(Res: S32, Val: Overflow); |
| 1849 | BaseReg = B.buildAdd(Dst: S32, Src0: BaseReg, Src1: OverflowVal).getReg(Idx: 0); |
| 1850 | } |
| 1851 | } |
| 1852 | } |
| 1853 | |
| 1854 | if (!BaseReg) |
| 1855 | BaseReg = B.buildConstant(Res: S32, Val: 0).getReg(Idx: 0); |
| 1856 | |
| 1857 | return {BaseReg, C1}; |
| 1858 | } |
| 1859 | |
| 1860 | bool AMDGPURegisterBankInfo::buildVCopy(MachineIRBuilder &B, Register DstReg, |
| 1861 | Register SrcReg) const { |
| 1862 | MachineRegisterInfo &MRI = *B.getMRI(); |
| 1863 | LLT SrcTy = MRI.getType(Reg: SrcReg); |
| 1864 | if (SrcTy.getSizeInBits() == 32) { |
| 1865 | // Use a v_mov_b32 here to make the exec dependency explicit. |
| 1866 | B.buildInstr(Opcode: AMDGPU::V_MOV_B32_e32) |
| 1867 | .addDef(RegNo: DstReg) |
| 1868 | .addUse(RegNo: SrcReg); |
| 1869 | return constrainGenericRegister(Reg: DstReg, RC: AMDGPU::VGPR_32RegClass, MRI) && |
| 1870 | constrainGenericRegister(Reg: SrcReg, RC: AMDGPU::SReg_32RegClass, MRI); |
| 1871 | } |
| 1872 | |
| 1873 | Register TmpReg0 = MRI.createVirtualRegister(RegClass: &AMDGPU::VGPR_32RegClass); |
| 1874 | Register TmpReg1 = MRI.createVirtualRegister(RegClass: &AMDGPU::VGPR_32RegClass); |
| 1875 | |
| 1876 | B.buildInstr(Opcode: AMDGPU::V_MOV_B32_e32) |
| 1877 | .addDef(RegNo: TmpReg0) |
| 1878 | .addUse(RegNo: SrcReg, Flags: {}, SubReg: AMDGPU::sub0); |
| 1879 | B.buildInstr(Opcode: AMDGPU::V_MOV_B32_e32) |
| 1880 | .addDef(RegNo: TmpReg1) |
| 1881 | .addUse(RegNo: SrcReg, Flags: {}, SubReg: AMDGPU::sub1); |
| 1882 | B.buildInstr(Opcode: AMDGPU::REG_SEQUENCE) |
| 1883 | .addDef(RegNo: DstReg) |
| 1884 | .addUse(RegNo: TmpReg0) |
| 1885 | .addImm(Val: AMDGPU::sub0) |
| 1886 | .addUse(RegNo: TmpReg1) |
| 1887 | .addImm(Val: AMDGPU::sub1); |
| 1888 | |
| 1889 | return constrainGenericRegister(Reg: SrcReg, RC: AMDGPU::SReg_64RegClass, MRI) && |
| 1890 | constrainGenericRegister(Reg: DstReg, RC: AMDGPU::VReg_64RegClass, MRI); |
| 1891 | } |
| 1892 | |
| 1893 | /// Utility function for pushing dynamic vector indexes with a constant offset |
| 1894 | /// into waterfall loops. |
| 1895 | static void reinsertVectorIndexAdd(MachineIRBuilder &B, |
| 1896 | MachineInstr &IdxUseInstr, |
| 1897 | unsigned OpIdx, |
| 1898 | unsigned ConstOffset) { |
| 1899 | MachineRegisterInfo &MRI = *B.getMRI(); |
| 1900 | const LLT S32 = LLT::scalar(SizeInBits: 32); |
| 1901 | Register WaterfallIdx = IdxUseInstr.getOperand(i: OpIdx).getReg(); |
| 1902 | B.setInsertPt(MBB&: *IdxUseInstr.getParent(), II: IdxUseInstr.getIterator()); |
| 1903 | |
| 1904 | auto MaterializedOffset = B.buildConstant(Res: S32, Val: ConstOffset); |
| 1905 | |
| 1906 | auto Add = B.buildAdd(Dst: S32, Src0: WaterfallIdx, Src1: MaterializedOffset); |
| 1907 | MRI.setRegBank(Reg: MaterializedOffset.getReg(Idx: 0), RegBank: AMDGPU::SGPRRegBank); |
| 1908 | MRI.setRegBank(Reg: Add.getReg(Idx: 0), RegBank: AMDGPU::SGPRRegBank); |
| 1909 | IdxUseInstr.getOperand(i: OpIdx).setReg(Add.getReg(Idx: 0)); |
| 1910 | } |
| 1911 | |
| 1912 | /// Implement extending a 32-bit value to a 64-bit value. \p Lo32Reg is the |
| 1913 | /// original 32-bit source value (to be inserted in the low part of the combined |
| 1914 | /// 64-bit result), and \p Hi32Reg is the high half of the combined 64-bit |
| 1915 | /// value. |
| 1916 | static void extendLow32IntoHigh32(MachineIRBuilder &B, |
| 1917 | Register Hi32Reg, Register Lo32Reg, |
| 1918 | unsigned ExtOpc, |
| 1919 | const RegisterBank &RegBank, |
| 1920 | bool IsBooleanSrc = false) { |
| 1921 | if (ExtOpc == AMDGPU::G_ZEXT) { |
| 1922 | B.buildConstant(Res: Hi32Reg, Val: 0); |
| 1923 | } else if (ExtOpc == AMDGPU::G_SEXT) { |
| 1924 | if (IsBooleanSrc) { |
| 1925 | // If we know the original source was an s1, the high half is the same as |
| 1926 | // the low. |
| 1927 | B.buildCopy(Res: Hi32Reg, Op: Lo32Reg); |
| 1928 | } else { |
| 1929 | // Replicate sign bit from 32-bit extended part. |
| 1930 | auto ShiftAmt = B.buildConstant(Res: LLT::scalar(SizeInBits: 32), Val: 31); |
| 1931 | B.getMRI()->setRegBank(Reg: ShiftAmt.getReg(Idx: 0), RegBank); |
| 1932 | B.buildAShr(Dst: Hi32Reg, Src0: Lo32Reg, Src1: ShiftAmt); |
| 1933 | } |
| 1934 | } else { |
| 1935 | assert(ExtOpc == AMDGPU::G_ANYEXT && "not an integer extension"); |
| 1936 | B.buildUndef(Res: Hi32Reg); |
| 1937 | } |
| 1938 | } |
| 1939 | |
| 1940 | bool AMDGPURegisterBankInfo::foldExtractEltToCmpSelect( |
| 1941 | MachineIRBuilder &B, MachineInstr &MI, |
| 1942 | const OperandsMapper &OpdMapper) const { |
| 1943 | MachineRegisterInfo &MRI = *B.getMRI(); |
| 1944 | |
| 1945 | Register VecReg = MI.getOperand(i: 1).getReg(); |
| 1946 | Register Idx = MI.getOperand(i: 2).getReg(); |
| 1947 | |
| 1948 | const RegisterBank &IdxBank = |
| 1949 | *OpdMapper.getInstrMapping().getOperandMapping(i: 2).BreakDown[0].RegBank; |
| 1950 | |
| 1951 | bool IsDivergentIdx = IdxBank != AMDGPU::SGPRRegBank; |
| 1952 | |
| 1953 | LLT VecTy = MRI.getType(Reg: VecReg); |
| 1954 | unsigned EltSize = VecTy.getScalarSizeInBits(); |
| 1955 | unsigned NumElem = VecTy.getNumElements(); |
| 1956 | |
| 1957 | if (!SITargetLowering::shouldExpandVectorDynExt(EltSize, NumElem, |
| 1958 | IsDivergentIdx, Subtarget: &Subtarget)) |
| 1959 | return false; |
| 1960 | |
| 1961 | LLT S32 = LLT::scalar(SizeInBits: 32); |
| 1962 | |
| 1963 | const RegisterBank &DstBank = |
| 1964 | *OpdMapper.getInstrMapping().getOperandMapping(i: 0).BreakDown[0].RegBank; |
| 1965 | const RegisterBank &SrcBank = |
| 1966 | *OpdMapper.getInstrMapping().getOperandMapping(i: 1).BreakDown[0].RegBank; |
| 1967 | |
| 1968 | const RegisterBank &CCBank = |
| 1969 | (DstBank == AMDGPU::SGPRRegBank && |
| 1970 | SrcBank == AMDGPU::SGPRRegBank && |
| 1971 | IdxBank == AMDGPU::SGPRRegBank) ? AMDGPU::SGPRRegBank |
| 1972 | : AMDGPU::VCCRegBank; |
| 1973 | LLT CCTy = (CCBank == AMDGPU::SGPRRegBank) ? S32 : LLT::scalar(SizeInBits: 1); |
| 1974 | |
| 1975 | if (CCBank == AMDGPU::VCCRegBank && IdxBank == AMDGPU::SGPRRegBank) { |
| 1976 | Idx = B.buildCopy(Res: S32, Op: Idx)->getOperand(i: 0).getReg(); |
| 1977 | MRI.setRegBank(Reg: Idx, RegBank: AMDGPU::VGPRRegBank); |
| 1978 | } |
| 1979 | |
| 1980 | LLT EltTy = VecTy.getScalarType(); |
| 1981 | SmallVector<Register, 2> DstRegs(OpdMapper.getVRegs(OpIdx: 0)); |
| 1982 | unsigned NumLanes = DstRegs.size(); |
| 1983 | if (!NumLanes) |
| 1984 | NumLanes = 1; |
| 1985 | else |
| 1986 | EltTy = MRI.getType(Reg: DstRegs[0]); |
| 1987 | |
| 1988 | auto UnmergeToEltTy = B.buildUnmerge(Res: EltTy, Op: VecReg); |
| 1989 | SmallVector<Register, 2> Res(NumLanes); |
| 1990 | for (unsigned L = 0; L < NumLanes; ++L) |
| 1991 | Res[L] = UnmergeToEltTy.getReg(Idx: L); |
| 1992 | |
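| | // The dynamic extract becomes a chain of compares and selects: start from |
| | // element 0, then for each remaining element I select it when Idx == I. |
| | // A 4-element vector therefore needs three compare/select steps. |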
| 1993 | for (unsigned I = 1; I < NumElem; ++I) { |
| 1994 | auto IC = B.buildConstant(Res: S32, Val: I); |
| 1995 | MRI.setRegBank(Reg: IC->getOperand(i: 0).getReg(), RegBank: AMDGPU::SGPRRegBank); |
| 1996 | auto Cmp = B.buildICmp(Pred: CmpInst::ICMP_EQ, Res: CCTy, Op0: Idx, Op1: IC); |
| 1997 | MRI.setRegBank(Reg: Cmp->getOperand(i: 0).getReg(), RegBank: CCBank); |
| 1998 | |
| 1999 | for (unsigned L = 0; L < NumLanes; ++L) { |
| 2000 | auto S = B.buildSelect(Res: EltTy, Tst: Cmp, |
| 2001 | Op0: UnmergeToEltTy.getReg(Idx: I * NumLanes + L), Op1: Res[L]); |
| 2002 | |
| 2003 | for (unsigned N : { 0, 2, 3 }) |
| 2004 | MRI.setRegBank(Reg: S->getOperand(i: N).getReg(), RegBank: DstBank); |
| 2005 | |
| 2006 | Res[L] = S->getOperand(i: 0).getReg(); |
| 2007 | } |
| 2008 | } |
| 2009 | |
| 2010 | for (unsigned L = 0; L < NumLanes; ++L) { |
| 2011 | Register DstReg = (NumLanes == 1) ? MI.getOperand(i: 0).getReg() : DstRegs[L]; |
| 2012 | B.buildCopy(Res: DstReg, Op: Res[L]); |
| 2013 | MRI.setRegBank(Reg: DstReg, RegBank: DstBank); |
| 2014 | } |
| 2015 | |
| 2016 | MRI.setRegBank(Reg: MI.getOperand(i: 0).getReg(), RegBank: DstBank); |
| 2017 | MI.eraseFromParent(); |
| 2018 | |
| 2019 | return true; |
| 2020 | } |
| 2021 | |
| 2022 | // Insert a cross regbank copy for a register if it already has a bank that |
| 2023 | // differs from the one we want to set. |
| 2024 | static Register constrainRegToBank(MachineRegisterInfo &MRI, |
| 2025 | MachineIRBuilder &B, Register &Reg, |
| 2026 | const RegisterBank &Bank) { |
| 2027 | const RegisterBank *CurrBank = MRI.getRegBankOrNull(Reg); |
| 2028 | if (CurrBank && *CurrBank != Bank) { |
| 2029 | Register Copy = B.buildCopy(Res: MRI.getType(Reg), Op: Reg).getReg(Idx: 0); |
| 2030 | MRI.setRegBank(Reg: Copy, RegBank: Bank); |
| 2031 | return Copy; |
| 2032 | } |
| 2033 | |
| 2034 | MRI.setRegBank(Reg, RegBank: Bank); |
| 2035 | return Reg; |
| 2036 | } |
| 2037 | |
| 2038 | bool AMDGPURegisterBankInfo::foldInsertEltToCmpSelect( |
| 2039 | MachineIRBuilder &B, MachineInstr &MI, |
| 2040 | const OperandsMapper &OpdMapper) const { |
| 2041 | |
| 2042 | MachineRegisterInfo &MRI = *B.getMRI(); |
| 2043 | Register VecReg = MI.getOperand(i: 1).getReg(); |
| 2044 | Register Idx = MI.getOperand(i: 3).getReg(); |
| 2045 | |
| 2046 | const RegisterBank &IdxBank = |
| 2047 | *OpdMapper.getInstrMapping().getOperandMapping(i: 3).BreakDown[0].RegBank; |
| 2048 | |
| 2049 | bool IsDivergentIdx = IdxBank != AMDGPU::SGPRRegBank; |
| 2050 | |
| 2051 | LLT VecTy = MRI.getType(Reg: VecReg); |
| 2052 | unsigned EltSize = VecTy.getScalarSizeInBits(); |
| 2053 | unsigned NumElem = VecTy.getNumElements(); |
| 2054 | |
| 2055 | if (!SITargetLowering::shouldExpandVectorDynExt(EltSize, NumElem, |
| 2056 | IsDivergentIdx, Subtarget: &Subtarget)) |
| 2057 | return false; |
| 2058 | |
| 2059 | LLT S32 = LLT::scalar(SizeInBits: 32); |
| 2060 | |
| 2061 | const RegisterBank &DstBank = |
| 2062 | *OpdMapper.getInstrMapping().getOperandMapping(i: 0).BreakDown[0].RegBank; |
| 2063 | const RegisterBank &SrcBank = |
| 2064 | *OpdMapper.getInstrMapping().getOperandMapping(i: 1).BreakDown[0].RegBank; |
| 2065 | const RegisterBank &InsBank = |
| 2066 | *OpdMapper.getInstrMapping().getOperandMapping(i: 2).BreakDown[0].RegBank; |
| 2067 | |
| 2068 | const RegisterBank &CCBank = |
| 2069 | (DstBank == AMDGPU::SGPRRegBank && |
| 2070 | SrcBank == AMDGPU::SGPRRegBank && |
| 2071 | InsBank == AMDGPU::SGPRRegBank && |
| 2072 | IdxBank == AMDGPU::SGPRRegBank) ? AMDGPU::SGPRRegBank |
| 2073 | : AMDGPU::VCCRegBank; |
| 2074 | LLT CCTy = (CCBank == AMDGPU::SGPRRegBank) ? S32 : LLT::scalar(SizeInBits: 1); |
| 2075 | |
| 2076 | if (CCBank == AMDGPU::VCCRegBank && IdxBank == AMDGPU::SGPRRegBank) { |
| 2077 | Idx = B.buildCopy(Res: S32, Op: Idx)->getOperand(i: 0).getReg(); |
| 2078 | MRI.setRegBank(Reg: Idx, RegBank: AMDGPU::VGPRRegBank); |
| 2079 | } |
| 2080 | |
| 2081 | LLT EltTy = VecTy.getScalarType(); |
| 2082 | SmallVector<Register, 2> InsRegs(OpdMapper.getVRegs(OpIdx: 2)); |
| 2083 | unsigned NumLanes = InsRegs.size(); |
| 2084 | if (!NumLanes) { |
| 2085 | NumLanes = 1; |
| 2086 | InsRegs.push_back(Elt: MI.getOperand(i: 2).getReg()); |
| 2087 | } else { |
| 2088 | EltTy = MRI.getType(Reg: InsRegs[0]); |
| 2089 | } |
| 2090 | |
| 2091 | auto UnmergeToEltTy = B.buildUnmerge(Res: EltTy, Op: VecReg); |
| 2092 | SmallVector<Register, 16> Ops(NumElem * NumLanes); |
| 2093 | |
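| | // Each result element I is rebuilt as (Idx == I) ? InsVal : Vec[I]; a |
| | // divergent insert into a 4-element vector thus emits four compare/select |
| | // steps before the vector is reassembled below. |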
| 2094 | for (unsigned I = 0; I < NumElem; ++I) { |
| 2095 | auto IC = B.buildConstant(Res: S32, Val: I); |
| 2096 | MRI.setRegBank(Reg: IC->getOperand(i: 0).getReg(), RegBank: AMDGPU::SGPRRegBank); |
| 2097 | auto Cmp = B.buildICmp(Pred: CmpInst::ICMP_EQ, Res: CCTy, Op0: Idx, Op1: IC); |
| 2098 | MRI.setRegBank(Reg: Cmp->getOperand(i: 0).getReg(), RegBank: CCBank); |
| 2099 | |
| 2100 | for (unsigned L = 0; L < NumLanes; ++L) { |
| 2101 | Register Op0 = constrainRegToBank(MRI, B, Reg&: InsRegs[L], Bank: DstBank); |
| 2102 | Register Op1 = UnmergeToEltTy.getReg(Idx: I * NumLanes + L); |
| 2103 | Op1 = constrainRegToBank(MRI, B, Reg&: Op1, Bank: DstBank); |
| 2104 | |
| 2105 | Register Select = B.buildSelect(Res: EltTy, Tst: Cmp, Op0, Op1).getReg(Idx: 0); |
| 2106 | MRI.setRegBank(Reg: Select, RegBank: DstBank); |
| 2107 | |
| 2108 | Ops[I * NumLanes + L] = Select; |
| 2109 | } |
| 2110 | } |
| 2111 | |
| 2112 | LLT MergeTy = LLT::fixed_vector(NumElements: Ops.size(), ScalarTy: EltTy); |
| 2113 | if (MergeTy == MRI.getType(Reg: MI.getOperand(i: 0).getReg())) { |
| 2114 | B.buildBuildVector(Res: MI.getOperand(i: 0), Ops); |
| 2115 | } else { |
| 2116 | auto Vec = B.buildBuildVector(Res: MergeTy, Ops); |
| 2117 | MRI.setRegBank(Reg: Vec->getOperand(i: 0).getReg(), RegBank: DstBank); |
| 2118 | B.buildBitcast(Dst: MI.getOperand(i: 0).getReg(), Src: Vec); |
| 2119 | } |
| 2120 | |
| 2121 | MRI.setRegBank(Reg: MI.getOperand(i: 0).getReg(), RegBank: DstBank); |
| 2122 | MI.eraseFromParent(); |
| 2123 | |
| 2124 | return true; |
| 2125 | } |
| 2126 | |
| 2127 | // Break s_mul_u64 into 32-bit vector operations. |
| 2128 | void AMDGPURegisterBankInfo::applyMappingSMULU64( |
| 2129 | MachineIRBuilder &B, const OperandsMapper &OpdMapper) const { |
| 2130 | SmallVector<Register, 2> DefRegs(OpdMapper.getVRegs(OpIdx: 0)); |
| 2131 | SmallVector<Register, 2> Src0Regs(OpdMapper.getVRegs(OpIdx: 1)); |
| 2132 | SmallVector<Register, 2> Src1Regs(OpdMapper.getVRegs(OpIdx: 2)); |
| 2133 | |
| 2134 | // All inputs are SGPRs, nothing special to do. |
| 2135 | if (DefRegs.empty()) { |
| 2136 | assert(Src0Regs.empty() && Src1Regs.empty()); |
| 2137 | applyDefaultMapping(OpdMapper); |
| 2138 | return; |
| 2139 | } |
| 2140 | |
| 2141 | assert(DefRegs.size() == 2); |
| 2142 | assert(Src0Regs.size() == Src1Regs.size() && |
| 2143 | (Src0Regs.empty() || Src0Regs.size() == 2)); |
| 2144 | |
| 2145 | MachineRegisterInfo &MRI = OpdMapper.getMRI(); |
| 2146 | MachineInstr &MI = OpdMapper.getMI(); |
| 2147 | Register DstReg = MI.getOperand(i: 0).getReg(); |
| 2148 | LLT HalfTy = LLT::scalar(SizeInBits: 32); |
| 2149 | |
| 2150 | // Depending on where the source registers came from, the generic code may |
| 2151 | // have decided to split the inputs already or not. If not, we still need to |
| 2152 | // extract the values. |
| 2153 | |
| 2154 | if (Src0Regs.empty()) |
| 2155 | split64BitValueForMapping(B, Regs&: Src0Regs, HalfTy, Reg: MI.getOperand(i: 1).getReg()); |
| 2156 | else |
| 2157 | setRegsToType(MRI, Regs: Src0Regs, NewTy: HalfTy); |
| 2158 | |
| 2159 | if (Src1Regs.empty()) |
| 2160 | split64BitValueForMapping(B, Regs&: Src1Regs, HalfTy, Reg: MI.getOperand(i: 2).getReg()); |
| 2161 | else |
| 2162 | setRegsToType(MRI, Regs: Src1Regs, NewTy: HalfTy); |
| 2163 | |
| 2164 | setRegsToType(MRI, Regs: DefRegs, NewTy: HalfTy); |
| 2165 | |
| 2166 | // The multiplication is done as follows: |
| 2167 | // |
| 2168 | // Op1H Op1L |
| 2169 | // * Op0H Op0L |
| 2170 | // -------------------- |
| 2171 | // Op1H*Op0L Op1L*Op0L |
| 2172 | // + Op1H*Op0H Op1L*Op0H |
| 2173 | // ----------------------------------------- |
| 2174 | // (Op1H*Op0L + Op1L*Op0H + carry) Op1L*Op0L |
| 2175 | // |
| 2176 | // We drop Op1H*Op0H because the result of the multiplication is a 64-bit |
| 2177 | // value and that would overflow. |
| 2178 | // The low 32-bit value is Op1L*Op0L. |
| 2179 | // The high 32-bit value is Op1H*Op0L + Op1L*Op0H + carry (from |
| 2180 | // Op1L*Op0L). |
| 2181 | |
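| | // Note the carry out of Op1L*Op0L is taken directly as the high half of |
| | // that product via G_UMULH below, rather than through an add-with-carry. |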
| 2182 | ApplyRegBankMapping ApplyBank(B, *this, MRI, &AMDGPU::VGPRRegBank); |
| 2183 | |
| 2184 | Register Hi = B.buildUMulH(Dst: HalfTy, Src0: Src0Regs[0], Src1: Src1Regs[0]).getReg(Idx: 0); |
| 2185 | Register MulLoHi = B.buildMul(Dst: HalfTy, Src0: Src0Regs[0], Src1: Src1Regs[1]).getReg(Idx: 0); |
| 2186 | Register Add = B.buildAdd(Dst: HalfTy, Src0: Hi, Src1: MulLoHi).getReg(Idx: 0); |
| 2187 | Register MulHiLo = B.buildMul(Dst: HalfTy, Src0: Src0Regs[1], Src1: Src1Regs[0]).getReg(Idx: 0); |
| 2188 | B.buildAdd(Dst: DefRegs[1], Src0: Add, Src1: MulHiLo); |
| 2189 | B.buildMul(Dst: DefRegs[0], Src0: Src0Regs[0], Src1: Src1Regs[0]); |
| 2190 | |
| 2191 | MRI.setRegBank(Reg: DstReg, RegBank: AMDGPU::VGPRRegBank); |
| 2192 | MI.eraseFromParent(); |
| 2193 | } |
| 2194 | |
| 2195 | void AMDGPURegisterBankInfo::applyMappingImpl( |
| 2196 | MachineIRBuilder &B, const OperandsMapper &OpdMapper) const { |
| 2197 | MachineInstr &MI = OpdMapper.getMI(); |
| 2198 | B.setInstrAndDebugLoc(MI); |
| 2199 | unsigned Opc = MI.getOpcode(); |
| 2200 | MachineRegisterInfo &MRI = OpdMapper.getMRI(); |
| 2201 | switch (Opc) { |
| 2202 | case AMDGPU::G_CONSTANT: |
| 2203 | case AMDGPU::G_IMPLICIT_DEF: { |
| 2204 | Register DstReg = MI.getOperand(i: 0).getReg(); |
| 2205 | LLT DstTy = MRI.getType(Reg: DstReg); |
| 2206 | if (DstTy != LLT::scalar(SizeInBits: 1)) |
| 2207 | break; |
| 2208 | |
| 2209 | const RegisterBank *DstBank = |
| 2210 | OpdMapper.getInstrMapping().getOperandMapping(i: 0).BreakDown[0].RegBank; |
| 2211 | if (DstBank == &AMDGPU::VCCRegBank) |
| 2212 | break; |
| 2213 | SmallVector<Register, 1> DefRegs(OpdMapper.getVRegs(OpIdx: 0)); |
| 2214 | if (DefRegs.empty()) |
| 2215 | DefRegs.push_back(Elt: DstReg); |
| 2216 | |
| 2217 | B.setInsertPt(MBB&: *MI.getParent(), II: ++MI.getIterator()); |
| 2218 | |
| 2219 | Register NewDstReg = MRI.createGenericVirtualRegister(Ty: LLT::scalar(SizeInBits: 32)); |
| 2220 | LLVMContext &Ctx = B.getMF().getFunction().getContext(); |
| 2221 | |
| 2222 | MI.getOperand(i: 0).setReg(NewDstReg); |
| 2223 | if (Opc != AMDGPU::G_IMPLICIT_DEF) { |
| 2224 | uint64_t ConstVal = MI.getOperand(i: 1).getCImm()->getZExtValue(); |
| 2225 | MI.getOperand(i: 1).setCImm( |
| 2226 | ConstantInt::get(Ty: IntegerType::getInt32Ty(C&: Ctx), V: ConstVal)); |
| 2227 | } |
| 2228 | |
| 2229 | MRI.setRegBank(Reg: NewDstReg, RegBank: *DstBank); |
| 2230 | B.buildTrunc(Res: DefRegs[0], Op: NewDstReg); |
| 2231 | return; |
| 2232 | } |
| 2233 | case AMDGPU::G_PHI: { |
| 2234 | Register DstReg = MI.getOperand(i: 0).getReg(); |
| 2235 | LLT DstTy = MRI.getType(Reg: DstReg); |
| 2236 | if (DstTy != LLT::scalar(SizeInBits: 1)) |
| 2237 | break; |
| 2238 | |
| 2239 | const LLT S32 = LLT::scalar(SizeInBits: 32); |
| 2240 | const RegisterBank *DstBank = |
| 2241 | OpdMapper.getInstrMapping().getOperandMapping(i: 0).BreakDown[0].RegBank; |
| 2242 | if (DstBank == &AMDGPU::VCCRegBank) { |
| 2243 | applyDefaultMapping(OpdMapper); |
| 2244 | // The standard handling only considers the result register bank for |
| 2245 | // phis. For VCC, blindly inserting a copy when the phi is lowered will |
| 2246 | // produce an invalid copy. We can only copy with some kind of compare to |
| 2247 | // get a vector boolean result. Insert a register bank copy that will be |
| 2248 | // correctly lowered to a compare. |
| 2249 | for (unsigned I = 1, E = MI.getNumOperands(); I != E; I += 2) { |
| 2250 | Register SrcReg = MI.getOperand(i: I).getReg(); |
| 2251 | const RegisterBank *SrcBank = getRegBank(Reg: SrcReg, MRI, TRI: *TRI); |
| 2252 | |
| 2253 | if (SrcBank != &AMDGPU::VCCRegBank) { |
| 2254 | MachineBasicBlock *SrcMBB = MI.getOperand(i: I + 1).getMBB(); |
| 2255 | B.setInsertPt(MBB&: *SrcMBB, II: SrcMBB->getFirstTerminator()); |
| 2256 | |
| 2257 | auto Copy = B.buildCopy(Res: LLT::scalar(SizeInBits: 1), Op: SrcReg); |
| 2258 | MRI.setRegBank(Reg: Copy.getReg(Idx: 0), RegBank: AMDGPU::VCCRegBank); |
| 2259 | MI.getOperand(i: I).setReg(Copy.getReg(Idx: 0)); |
| 2260 | } |
| 2261 | } |
| 2262 | |
| 2263 | return; |
| 2264 | } |
| 2265 | |
| 2266 | // Phi handling is strange and only considers the bank of the destination. |
| 2267 | substituteSimpleCopyRegs(OpdMapper, OpIdx: 0); |
| 2268 | |
| 2269 | // Promote SGPR/VGPR booleans to s32 |
| 2270 | ApplyRegBankMapping ApplyBank(B, *this, MRI, DstBank); |
| 2271 | B.setInsertPt(MBB&: B.getMBB(), II: MI); |
| 2272 | LegalizerHelper Helper(B.getMF(), ApplyBank, B); |
| 2273 | |
| 2274 | if (Helper.widenScalar(MI, TypeIdx: 0, WideTy: S32) != LegalizerHelper::Legalized) |
| 2275 | llvm_unreachable("widen scalar should have succeeded" ); |
| 2276 | |
| 2277 | return; |
| 2278 | } |
| 2279 | case AMDGPU::G_FCMP: |
| 2280 | if (!Subtarget.hasSALUFloatInsts()) |
| 2281 | break; |
| 2282 | [[fallthrough]]; |
| 2283 | case AMDGPU::G_ICMP: |
| 2284 | case AMDGPU::G_UADDO: |
| 2285 | case AMDGPU::G_USUBO: |
| 2286 | case AMDGPU::G_UADDE: |
| 2287 | case AMDGPU::G_SADDE: |
| 2288 | case AMDGPU::G_USUBE: |
| 2289 | case AMDGPU::G_SSUBE: { |
| 2290 | unsigned BoolDstOp = |
| 2291 | (Opc == AMDGPU::G_ICMP || Opc == AMDGPU::G_FCMP) ? 0 : 1; |
| 2292 | Register DstReg = MI.getOperand(i: BoolDstOp).getReg(); |
| 2293 | |
| 2294 | const RegisterBank *DstBank = |
| 2295 | OpdMapper.getInstrMapping().getOperandMapping(i: 0).BreakDown[0].RegBank; |
| 2296 | if (DstBank != &AMDGPU::SGPRRegBank) |
| 2297 | break; |
| 2298 | |
| 2299 | const bool HasCarryIn = MI.getNumOperands() == 5; |
| 2300 | |
| 2301 | // If this is a scalar compare, promote the result to s32, as the selection |
| 2302 | // will end up using a copy to a 32-bit vreg. |
| 2303 | const LLT S32 = LLT::scalar(SizeInBits: 32); |
| 2304 | Register NewDstReg = MRI.createGenericVirtualRegister(Ty: S32); |
| 2305 | MRI.setRegBank(Reg: NewDstReg, RegBank: AMDGPU::SGPRRegBank); |
| 2306 | MI.getOperand(i: BoolDstOp).setReg(NewDstReg); |
| 2307 | |
| 2308 | if (HasCarryIn) { |
| 2309 | Register NewSrcReg = MRI.createGenericVirtualRegister(Ty: S32); |
| 2310 | MRI.setRegBank(Reg: NewSrcReg, RegBank: AMDGPU::SGPRRegBank); |
| 2311 | B.buildZExt(Res: NewSrcReg, Op: MI.getOperand(i: 4).getReg()); |
| 2312 | MI.getOperand(i: 4).setReg(NewSrcReg); |
| 2313 | } |
| 2314 | |
| 2315 | MachineBasicBlock *MBB = MI.getParent(); |
| 2316 | B.setInsertPt(MBB&: *MBB, II: std::next(x: MI.getIterator())); |
| 2317 | |
| 2318 | // If we had a constrained VCC result register, a copy was inserted to VCC |
| 2319 | // from SGPR. |
| 2320 | SmallVector<Register, 1> DefRegs(OpdMapper.getVRegs(OpIdx: 0)); |
| 2321 | if (DefRegs.empty()) |
| 2322 | DefRegs.push_back(Elt: DstReg); |
| 2323 | B.buildTrunc(Res: DefRegs[0], Op: NewDstReg); |
| 2324 | return; |
| 2325 | } |
| 2326 | case AMDGPU::G_SELECT: { |
| 2327 | Register DstReg = MI.getOperand(i: 0).getReg(); |
| 2328 | LLT DstTy = MRI.getType(Reg: DstReg); |
| 2329 | |
| 2330 | SmallVector<Register, 1> CondRegs(OpdMapper.getVRegs(OpIdx: 1)); |
| 2331 | if (CondRegs.empty()) |
| 2332 | CondRegs.push_back(Elt: MI.getOperand(i: 1).getReg()); |
| 2333 | else { |
| 2334 | assert(CondRegs.size() == 1); |
| 2335 | } |
| 2336 | |
| 2337 | const RegisterBank *CondBank = getRegBank(Reg: CondRegs[0], MRI, TRI: *TRI); |
| 2338 | if (CondBank == &AMDGPU::SGPRRegBank) { |
| 2339 | const LLT S32 = LLT::scalar(SizeInBits: 32); |
| 2340 | Register NewCondReg = MRI.createGenericVirtualRegister(Ty: S32); |
| 2341 | MRI.setRegBank(Reg: NewCondReg, RegBank: AMDGPU::SGPRRegBank); |
| 2342 | |
| 2343 | MI.getOperand(i: 1).setReg(NewCondReg); |
| 2344 | B.buildZExt(Res: NewCondReg, Op: CondRegs[0]); |
| 2345 | } |
| 2346 | |
| 2347 | if (DstTy.getSizeInBits() != 64) |
| 2348 | break; |
| 2349 | |
| 2350 | LLT HalfTy = getHalfSizedType(Ty: DstTy); |
| 2351 | |
| 2352 | SmallVector<Register, 2> DefRegs(OpdMapper.getVRegs(OpIdx: 0)); |
| 2353 | SmallVector<Register, 2> Src1Regs(OpdMapper.getVRegs(OpIdx: 2)); |
| 2354 | SmallVector<Register, 2> Src2Regs(OpdMapper.getVRegs(OpIdx: 3)); |
| 2355 | |
| 2356 | // All inputs are SGPRs, nothing special to do. |
| 2357 | if (DefRegs.empty()) { |
| 2358 | assert(Src1Regs.empty() && Src2Regs.empty()); |
| 2359 | break; |
| 2360 | } |
| 2361 | |
| 2362 | if (Src1Regs.empty()) |
| 2363 | split64BitValueForMapping(B, Regs&: Src1Regs, HalfTy, Reg: MI.getOperand(i: 2).getReg()); |
| 2364 | else { |
| 2365 | setRegsToType(MRI, Regs: Src1Regs, NewTy: HalfTy); |
| 2366 | } |
| 2367 | |
| 2368 | if (Src2Regs.empty()) |
| 2369 | split64BitValueForMapping(B, Regs&: Src2Regs, HalfTy, Reg: MI.getOperand(i: 3).getReg()); |
| 2370 | else |
| 2371 | setRegsToType(MRI, Regs: Src2Regs, NewTy: HalfTy); |
| 2372 | |
| 2373 | setRegsToType(MRI, Regs: DefRegs, NewTy: HalfTy); |
| 2374 | |
| 2375 | auto Flags = MI.getFlags(); |
| 2376 | B.buildSelect(Res: DefRegs[0], Tst: CondRegs[0], Op0: Src1Regs[0], Op1: Src2Regs[0], Flags); |
| 2377 | B.buildSelect(Res: DefRegs[1], Tst: CondRegs[0], Op0: Src1Regs[1], Op1: Src2Regs[1], Flags); |
| 2378 | |
| 2379 | MRI.setRegBank(Reg: DstReg, RegBank: AMDGPU::VGPRRegBank); |
| 2380 | MI.eraseFromParent(); |
| 2381 | return; |
| 2382 | } |
| 2383 | case AMDGPU::G_BRCOND: { |
| 2384 | Register CondReg = MI.getOperand(i: 0).getReg(); |
| 2385 | // FIXME: Should use legalizer helper, but should change bool ext type. |
| 2386 | const RegisterBank *CondBank = |
| 2387 | OpdMapper.getInstrMapping().getOperandMapping(i: 0).BreakDown[0].RegBank; |
| 2388 | |
| 2389 | if (CondBank == &AMDGPU::SGPRRegBank) { |
| 2390 | const LLT S32 = LLT::scalar(SizeInBits: 32); |
| 2391 | Register NewCondReg = MRI.createGenericVirtualRegister(Ty: S32); |
| 2392 | MRI.setRegBank(Reg: NewCondReg, RegBank: AMDGPU::SGPRRegBank); |
| 2393 | |
| 2394 | MI.getOperand(i: 0).setReg(NewCondReg); |
| 2395 | B.buildZExt(Res: NewCondReg, Op: CondReg); |
| 2396 | return; |
| 2397 | } |
| 2398 | |
| 2399 | break; |
| 2400 | } |
| 2401 | case AMDGPU::G_AND: |
| 2402 | case AMDGPU::G_OR: |
| 2403 | case AMDGPU::G_XOR: { |
| 2404 | // 64-bit G_AND/G_OR/G_XOR are only available on the SALU, so split into 2 |
| 2405 | // 32-bit ops if there is a VGPR input. |
| 2406 | Register DstReg = MI.getOperand(i: 0).getReg(); |
| 2407 | LLT DstTy = MRI.getType(Reg: DstReg); |
| 2408 | |
| 2409 | const RegisterBank *DstBank = |
| 2410 | OpdMapper.getInstrMapping().getOperandMapping(i: 0).BreakDown[0].RegBank; |
| 2411 | |
| 2412 | if (DstTy.getSizeInBits() == 1) { |
| 2413 | if (DstBank == &AMDGPU::VCCRegBank) |
| 2414 | break; |
| 2415 | |
| 2416 | MachineFunction *MF = MI.getMF(); |
| 2417 | ApplyRegBankMapping ApplyBank(B, *this, MRI, DstBank); |
| 2418 | LegalizerHelper Helper(*MF, ApplyBank, B); |
| 2419 | |
| 2420 | if (Helper.widenScalar(MI, TypeIdx: 0, WideTy: LLT::scalar(SizeInBits: 32)) != |
| 2421 | LegalizerHelper::Legalized) |
| 2422 | llvm_unreachable("widen scalar should have succeeded" ); |
| 2423 | return; |
| 2424 | } |
| 2425 | |
| 2426 | if (DstTy.getSizeInBits() == 16 && DstBank == &AMDGPU::SGPRRegBank) { |
| 2427 | const LLT S32 = LLT::scalar(SizeInBits: 32); |
| 2428 | MachineBasicBlock *MBB = MI.getParent(); |
| 2429 | MachineFunction *MF = MBB->getParent(); |
| 2430 | ApplyRegBankMapping ApplySALU(B, *this, MRI, &AMDGPU::SGPRRegBank); |
| 2431 | LegalizerHelper Helper(*MF, ApplySALU, B); |
| 2432 | // Widen to S32, but handle `G_XOR x, -1` differently. Legalizer widening |
| 2433 | // will use a G_ANYEXT to extend the -1 which prevents matching G_XOR -1 |
| 2434 | // as "not". |
| 2435 | if (MI.getOpcode() == AMDGPU::G_XOR && |
| 2436 | mi_match(R: MI.getOperand(i: 2).getReg(), MRI, P: m_SpecificICstOrSplat(RequestedValue: -1))) { |
| 2437 | Helper.widenScalarSrc(MI, WideTy: S32, OpIdx: 1, ExtOpcode: AMDGPU::G_ANYEXT); |
| 2438 | Helper.widenScalarSrc(MI, WideTy: S32, OpIdx: 2, ExtOpcode: AMDGPU::G_SEXT); |
| 2439 | Helper.widenScalarDst(MI, WideTy: S32); |
| 2440 | } else { |
| 2441 | if (Helper.widenScalar(MI, TypeIdx: 0, WideTy: S32) != LegalizerHelper::Legalized) |
| 2442 | llvm_unreachable("widen scalar should have succeeded" ); |
| 2443 | } |
| 2444 | return; |
| 2445 | } |
| 2446 | |
| 2447 | if (DstTy.getSizeInBits() != 64) |
| 2448 | break; |
| 2449 | |
| 2450 | LLT HalfTy = getHalfSizedType(Ty: DstTy); |
| 2451 | SmallVector<Register, 2> DefRegs(OpdMapper.getVRegs(OpIdx: 0)); |
| 2452 | SmallVector<Register, 2> Src0Regs(OpdMapper.getVRegs(OpIdx: 1)); |
| 2453 | SmallVector<Register, 2> Src1Regs(OpdMapper.getVRegs(OpIdx: 2)); |
| 2454 | |
| 2455 | // All inputs are SGPRs, nothing special to do. |
| 2456 | if (DefRegs.empty()) { |
| 2457 | assert(Src0Regs.empty() && Src1Regs.empty()); |
| 2458 | break; |
| 2459 | } |
| 2460 | |
| 2461 | assert(DefRegs.size() == 2); |
| 2462 | assert(Src0Regs.size() == Src1Regs.size() && |
| 2463 | (Src0Regs.empty() || Src0Regs.size() == 2)); |
| 2464 | |
| 2465 | // Depending on where the source registers came from, the generic code may |
| 2466 | // have decided to split the inputs already or not. If not, we still need to |
| 2467 | // extract the values. |
| 2468 | |
| 2469 | if (Src0Regs.empty()) |
| 2470 | split64BitValueForMapping(B, Regs&: Src0Regs, HalfTy, Reg: MI.getOperand(i: 1).getReg()); |
| 2471 | else |
| 2472 | setRegsToType(MRI, Regs: Src0Regs, NewTy: HalfTy); |
| 2473 | |
| 2474 | if (Src1Regs.empty()) |
| 2475 | split64BitValueForMapping(B, Regs&: Src1Regs, HalfTy, Reg: MI.getOperand(i: 2).getReg()); |
| 2476 | else |
| 2477 | setRegsToType(MRI, Regs: Src1Regs, NewTy: HalfTy); |
| 2478 | |
| 2479 | setRegsToType(MRI, Regs: DefRegs, NewTy: HalfTy); |
| 2480 | |
| 2481 | auto Flags = MI.getFlags(); |
| 2482 | B.buildInstr(Opc, DstOps: {DefRegs[0]}, SrcOps: {Src0Regs[0], Src1Regs[0]}, Flags); |
| 2483 | B.buildInstr(Opc, DstOps: {DefRegs[1]}, SrcOps: {Src0Regs[1], Src1Regs[1]}, Flags); |
| 2484 | |
| 2485 | MRI.setRegBank(Reg: DstReg, RegBank: AMDGPU::VGPRRegBank); |
| 2486 | MI.eraseFromParent(); |
| 2487 | return; |
| 2488 | } |
| 2489 | case AMDGPU::G_ABS: { |
| 2490 | Register SrcReg = MI.getOperand(i: 1).getReg(); |
| 2491 | const RegisterBank *SrcBank = MRI.getRegBankOrNull(Reg: SrcReg); |
| 2492 | |
| 2493 | // There is no VALU abs instruction so we need to replace it with a sub and |
| 2494 | // max combination. |
| 2495 | if (SrcBank && SrcBank == &AMDGPU::VGPRRegBank) { |
| 2496 | MachineFunction *MF = MI.getMF(); |
| 2497 | ApplyRegBankMapping Apply(B, *this, MRI, &AMDGPU::VGPRRegBank); |
| 2498 | LegalizerHelper Helper(*MF, Apply, B); |
| 2499 | |
| 2500 | if (Helper.lowerAbsToMaxNeg(MI) != LegalizerHelper::Legalized) |
| 2501 | llvm_unreachable("lowerAbsToMaxNeg should have succeeded");
| 2502 | return; |
| 2503 | } |
| 2504 | [[fallthrough]]; |
| 2505 | } |
| 2506 | case AMDGPU::G_ADD: |
| 2507 | case AMDGPU::G_SUB: |
| 2508 | case AMDGPU::G_MUL: |
| 2509 | case AMDGPU::G_SHL: |
| 2510 | case AMDGPU::G_LSHR: |
| 2511 | case AMDGPU::G_ASHR: |
| 2512 | case AMDGPU::G_SMIN: |
| 2513 | case AMDGPU::G_SMAX: |
| 2514 | case AMDGPU::G_UMIN: |
| 2515 | case AMDGPU::G_UMAX: { |
| 2516 | Register DstReg = MI.getOperand(i: 0).getReg(); |
| 2517 | LLT DstTy = MRI.getType(Reg: DstReg); |
| 2518 | |
| 2519 | // Special case for s_mul_u64. There is no vector equivalent of
| 2520 | // s_mul_u64, so we have to break it down into 32-bit vector
| 2521 | // multiplications.
| 2522 | if (!Subtarget.hasVectorMulU64() && Opc == AMDGPU::G_MUL && |
| 2523 | DstTy.getSizeInBits() == 64) { |
| 2524 | applyMappingSMULU64(B, OpdMapper); |
| 2525 | return; |
| 2526 | } |
| 2527 | |
| 2528 | // 16-bit operations are VALU only, but can be promoted to 32-bit SALU. |
| 2529 | // Packed 16-bit operations need to be scalarized and promoted. |
| 2530 | if (DstTy != LLT::scalar(SizeInBits: 16) && DstTy != LLT::fixed_vector(NumElements: 2, ScalarSizeInBits: 16)) |
| 2531 | break; |
| 2532 | |
| 2533 | const RegisterBank *DstBank = |
| 2534 | OpdMapper.getInstrMapping().getOperandMapping(i: 0).BreakDown[0].RegBank; |
| 2535 | if (DstBank == &AMDGPU::VGPRRegBank) |
| 2536 | break; |
| 2537 | |
| 2538 | const LLT S32 = LLT::scalar(SizeInBits: 32); |
| 2539 | MachineBasicBlock *MBB = MI.getParent(); |
| 2540 | MachineFunction *MF = MBB->getParent(); |
| 2541 | ApplyRegBankMapping ApplySALU(B, *this, MRI, &AMDGPU::SGPRRegBank); |
| 2542 | |
| 2543 | if (DstTy.isVector() && Opc == AMDGPU::G_ABS) { |
| 2544 | Register WideSrcLo, WideSrcHi; |
| 2545 | |
| 2546 | std::tie(args&: WideSrcLo, args&: WideSrcHi) = |
| 2547 | unpackV2S16ToS32(B, Src: MI.getOperand(i: 1).getReg(), ExtOpcode: TargetOpcode::G_SEXT); |
| 2548 | auto Lo = B.buildInstr(Opc: AMDGPU::G_ABS, DstOps: {S32}, SrcOps: {WideSrcLo}); |
| 2549 | auto Hi = B.buildInstr(Opc: AMDGPU::G_ABS, DstOps: {S32}, SrcOps: {WideSrcHi}); |
| 2550 | B.buildBuildVectorTrunc(Res: DstReg, Ops: {Lo.getReg(Idx: 0), Hi.getReg(Idx: 0)}); |
| 2551 | MI.eraseFromParent(); |
| 2552 | return; |
| 2553 | } |
| 2554 | |
| 2555 | if (DstTy.isVector()) { |
| 2556 | Register WideSrc0Lo, WideSrc0Hi; |
| 2557 | Register WideSrc1Lo, WideSrc1Hi; |
| 2558 | |
| 2559 | unsigned ExtendOp = getExtendOp(Opc: MI.getOpcode()); |
| 2560 | std::tie(args&: WideSrc0Lo, args&: WideSrc0Hi) |
| 2561 | = unpackV2S16ToS32(B, Src: MI.getOperand(i: 1).getReg(), ExtOpcode: ExtendOp); |
| 2562 | std::tie(args&: WideSrc1Lo, args&: WideSrc1Hi) |
| 2563 | = unpackV2S16ToS32(B, Src: MI.getOperand(i: 2).getReg(), ExtOpcode: ExtendOp); |
| 2564 | auto Lo = B.buildInstr(Opc: MI.getOpcode(), DstOps: {S32}, SrcOps: {WideSrc0Lo, WideSrc1Lo}); |
| 2565 | auto Hi = B.buildInstr(Opc: MI.getOpcode(), DstOps: {S32}, SrcOps: {WideSrc0Hi, WideSrc1Hi}); |
| 2566 | B.buildBuildVectorTrunc(Res: DstReg, Ops: {Lo.getReg(Idx: 0), Hi.getReg(Idx: 0)}); |
| 2567 | MI.eraseFromParent(); |
| 2568 | } else { |
| 2569 | LegalizerHelper Helper(*MF, ApplySALU, B); |
| 2570 | |
| 2571 | if (Helper.widenScalar(MI, TypeIdx: 0, WideTy: S32) != LegalizerHelper::Legalized) |
| 2572 | llvm_unreachable("widen scalar should have succeeded");
| 2573 | |
| 2574 | // FIXME: s16 shift amounts should be legal. |
| 2575 | if (Opc == AMDGPU::G_SHL || Opc == AMDGPU::G_LSHR || |
| 2576 | Opc == AMDGPU::G_ASHR) { |
| 2577 | B.setInsertPt(MBB&: *MBB, II: MI.getIterator()); |
| 2578 | if (Helper.widenScalar(MI, TypeIdx: 1, WideTy: S32) != LegalizerHelper::Legalized) |
| 2579 | llvm_unreachable("widen scalar should have succeeded");
| 2580 | } |
| 2581 | } |
| 2582 | |
| 2583 | return; |
| 2584 | } |
| 2585 | case AMDGPU::G_AMDGPU_S_MUL_I64_I32: |
| 2586 | case AMDGPU::G_AMDGPU_S_MUL_U64_U32: { |
| 2587 | // This is a special case for s_mul_u64. We use |
| 2588 | // G_AMDGPU_S_MUL_I64_I32 opcode to represent an s_mul_u64 operation |
| 2589 | // where the 33 higher bits are sign-extended and |
| 2590 | // G_AMDGPU_S_MUL_U64_U32 opcode to represent an s_mul_u64 operation |
| 2591 | // where the 32 higher bits are zero-extended. In case scalar registers are |
| 2592 | // selected, both opcodes are lowered as s_mul_u64. If the vector registers |
| 2593 | // are selected, then G_AMDGPU_S_MUL_I64_I32 and |
| 2594 | // G_AMDGPU_S_MUL_U64_U32 are lowered with a vector mad instruction. |
| 2595 | |
| 2596 | // Insert basic copies. |
| 2597 | applyDefaultMapping(OpdMapper); |
| 2598 | |
| 2599 | Register DstReg = MI.getOperand(i: 0).getReg(); |
| 2600 | Register SrcReg0 = MI.getOperand(i: 1).getReg(); |
| 2601 | Register SrcReg1 = MI.getOperand(i: 2).getReg(); |
| 2602 | const LLT S32 = LLT::scalar(SizeInBits: 32); |
| 2603 | const LLT S64 = LLT::scalar(SizeInBits: 64); |
| 2604 | assert(MRI.getType(DstReg) == S64 && "This is a special case for s_mul_u64 " |
| 2605 | "that handles only 64-bit operands." ); |
| 2606 | const RegisterBank *DstBank = |
| 2607 | OpdMapper.getInstrMapping().getOperandMapping(i: 0).BreakDown[0].RegBank; |
| 2608 | |
| 2609 | // Replace G_AMDGPU_S_MUL_I64_I32 and G_AMDGPU_S_MUL_U64_U32 |
| 2610 | // with s_mul_u64 operation. |
| 2611 | if (DstBank == &AMDGPU::SGPRRegBank) { |
| 2612 | MI.setDesc(TII->get(Opcode: AMDGPU::S_MUL_U64)); |
| 2613 | MRI.setRegClass(Reg: DstReg, RC: &AMDGPU::SGPR_64RegClass); |
| 2614 | MRI.setRegClass(Reg: SrcReg0, RC: &AMDGPU::SGPR_64RegClass); |
| 2615 | MRI.setRegClass(Reg: SrcReg1, RC: &AMDGPU::SGPR_64RegClass); |
| 2616 | return; |
| 2617 | } |
| 2618 | |
| 2619 | // Replace G_AMDGPU_S_MUL_I64_I32 and G_AMDGPU_S_MUL_U64_U32 |
| 2620 | // with a vector mad. |
| 2621 | assert(MRI.getRegBankOrNull(DstReg) == &AMDGPU::VGPRRegBank && |
| 2622 | "The destination operand should be in vector registers." ); |
| 2623 | |
| 2624 | // Extract the lower subregister from the first operand. |
| 2625 | Register Op0L = MRI.createVirtualRegister(RegClass: &AMDGPU::VGPR_32RegClass); |
| 2626 | MRI.setRegClass(Reg: Op0L, RC: &AMDGPU::VGPR_32RegClass); |
| 2627 | MRI.setType(VReg: Op0L, Ty: S32); |
| 2628 | B.buildTrunc(Res: Op0L, Op: SrcReg0); |
| 2629 | |
| 2630 | // Extract the lower subregister from the second operand. |
| 2631 | Register Op1L = MRI.createVirtualRegister(RegClass: &AMDGPU::VGPR_32RegClass); |
| 2632 | MRI.setRegClass(Reg: Op1L, RC: &AMDGPU::VGPR_32RegClass); |
| 2633 | MRI.setType(VReg: Op1L, Ty: S32); |
| 2634 | B.buildTrunc(Res: Op1L, Op: SrcReg1); |
| 2635 | |
| 2636 | unsigned NewOpc = Opc == AMDGPU::G_AMDGPU_S_MUL_U64_U32 |
| 2637 | ? AMDGPU::G_AMDGPU_MAD_U64_U32 |
| 2638 | : AMDGPU::G_AMDGPU_MAD_I64_I32; |
| 2639 | |
| 2640 | MachineIRBuilder B(MI); |
| 2641 | Register Zero64 = B.buildConstant(Res: S64, Val: 0).getReg(Idx: 0); |
| 2642 | MRI.setRegClass(Reg: Zero64, RC: &AMDGPU::VReg_64RegClass); |
| 2643 | Register CarryOut = MRI.createVirtualRegister(RegClass: &AMDGPU::VReg_64RegClass); |
| 2644 | MRI.setRegClass(Reg: CarryOut, RC: &AMDGPU::VReg_64RegClass); |
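|      | // The mad computes the full 64-bit product of the two 32-bit low halves
|      | // plus a 64-bit addend (zero here); the carry-out def is unused.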
| 2645 | B.buildInstr(Opc: NewOpc, DstOps: {DstReg, CarryOut}, SrcOps: {Op0L, Op1L, Zero64}); |
| 2646 | MI.eraseFromParent(); |
| 2647 | return; |
| 2648 | } |
| 2649 | case AMDGPU::G_SEXT_INREG: { |
| 2650 | SmallVector<Register, 2> SrcRegs(OpdMapper.getVRegs(OpIdx: 1)); |
| 2651 | if (SrcRegs.empty()) |
| 2652 | break; // Nothing to repair |
| 2653 | |
| 2654 | const LLT S32 = LLT::scalar(SizeInBits: 32); |
| 2655 | ApplyRegBankMapping O(B, *this, MRI, &AMDGPU::VGPRRegBank); |
| 2656 | |
| 2657 | // Don't use LegalizerHelper's narrowScalar. It produces unwanted G_SEXTs |
| 2658 | // we would need to further expand, and doesn't let us directly set the |
| 2659 | // result registers. |
| 2660 | SmallVector<Register, 2> DstRegs(OpdMapper.getVRegs(OpIdx: 0)); |
| 2661 | |
| 2662 | int Amt = MI.getOperand(i: 2).getImm(); |
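|      | // Amt is relative to the full 64-bit value, so the split depends on
|      | // whether the sign bit lands in the low or the high half.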
| 2663 | if (Amt <= 32) { |
| 2664 | // Downstream users have expectations for the high bit behavior, so freeze |
| 2665 | // incoming undefined bits. |
| 2666 | if (Amt == 32) { |
| 2667 | // The low bits are unchanged. |
| 2668 | B.buildFreeze(Dst: DstRegs[0], Src: SrcRegs[0]); |
| 2669 | } else { |
| 2670 | auto Freeze = B.buildFreeze(Dst: S32, Src: SrcRegs[0]); |
| 2671 | // Extend in the low bits and propagate the sign bit to the high half. |
| 2672 | B.buildSExtInReg(Res: DstRegs[0], Op: Freeze, ImmOp: Amt); |
| 2673 | } |
| 2674 | |
| 2675 | B.buildAShr(Dst: DstRegs[1], Src0: DstRegs[0], Src1: B.buildConstant(Res: S32, Val: 31)); |
| 2676 | } else { |
| 2677 | // The low bits are unchanged; only the high half needs the sign extend.
| 2678 | // No freeze required.
| 2679 | B.buildCopy(Res: DstRegs[0], Op: SrcRegs[0]); |
| 2680 | B.buildSExtInReg(Res: DstRegs[1], Op: DstRegs[0], ImmOp: Amt - 32); |
| 2681 | } |
| 2682 | |
| 2683 | Register DstReg = MI.getOperand(i: 0).getReg(); |
| 2684 | MRI.setRegBank(Reg: DstReg, RegBank: AMDGPU::VGPRRegBank); |
| 2685 | MI.eraseFromParent(); |
| 2686 | return; |
| 2687 | } |
| 2688 | case AMDGPU::G_CTPOP: |
| 2689 | case AMDGPU::G_BITREVERSE: { |
| 2690 | const RegisterBank *DstBank = |
| 2691 | OpdMapper.getInstrMapping().getOperandMapping(i: 0).BreakDown[0].RegBank; |
| 2692 | if (DstBank == &AMDGPU::SGPRRegBank) |
| 2693 | break; |
| 2694 | |
| 2695 | Register SrcReg = MI.getOperand(i: 1).getReg(); |
| 2696 | const LLT S32 = LLT::scalar(SizeInBits: 32); |
| 2697 | LLT Ty = MRI.getType(Reg: SrcReg); |
| 2698 | if (Ty == S32) |
| 2699 | break; |
| 2700 | |
| 2701 | ApplyRegBankMapping ApplyVALU(B, *this, MRI, &AMDGPU::VGPRRegBank); |
| 2702 | |
| 2703 | MachineFunction &MF = B.getMF(); |
| 2704 | LegalizerHelper Helper(MF, ApplyVALU, B); |
| 2705 | |
| 2706 | if (Helper.narrowScalar(MI, TypeIdx: 1, NarrowTy: S32) != LegalizerHelper::Legalized) |
| 2707 | llvm_unreachable("narrowScalar should have succeeded");
| 2708 | return; |
| 2709 | } |
| 2710 | case AMDGPU::G_AMDGPU_FFBH_U32: |
| 2711 | case AMDGPU::G_AMDGPU_FFBL_B32: |
| 2712 | case AMDGPU::G_CTLZ_ZERO_UNDEF: |
| 2713 | case AMDGPU::G_CTTZ_ZERO_UNDEF: { |
| 2714 | const RegisterBank *DstBank = |
| 2715 | OpdMapper.getInstrMapping().getOperandMapping(i: 0).BreakDown[0].RegBank; |
| 2716 | if (DstBank == &AMDGPU::SGPRRegBank) |
| 2717 | break; |
| 2718 | |
| 2719 | Register SrcReg = MI.getOperand(i: 1).getReg(); |
| 2720 | const LLT S32 = LLT::scalar(SizeInBits: 32); |
| 2721 | LLT Ty = MRI.getType(Reg: SrcReg); |
| 2722 | if (Ty == S32) |
| 2723 | break; |
| 2724 | |
| 2725 | // We can narrow this more efficiently than Helper can by using ffbh/ffbl |
| 2726 | // which return -1 when the input is zero: |
| 2727 | // (ctlz_zero_undef hi:lo) -> (umin (ffbh hi), (add (ffbh lo), 32)) |
| 2728 | // (cttz_zero_undef hi:lo) -> (umin (add (ffbl hi), 32), (ffbl lo)) |
| 2729 | // (ffbh hi:lo) -> (umin (ffbh hi), (uaddsat (ffbh lo), 32)) |
| 2730 | // (ffbl hi:lo) -> (umin (uaddsat (ffbl hi), 32), (ffbl lo))
| 2731 | ApplyRegBankMapping ApplyVALU(B, *this, MRI, &AMDGPU::VGPRRegBank); |
| 2732 | SmallVector<Register, 2> SrcRegs(OpdMapper.getVRegs(OpIdx: 1)); |
| 2733 | unsigned NewOpc = Opc == AMDGPU::G_CTLZ_ZERO_UNDEF |
| 2734 | ? (unsigned)AMDGPU::G_AMDGPU_FFBH_U32 |
| 2735 | : Opc == AMDGPU::G_CTTZ_ZERO_UNDEF |
| 2736 | ? (unsigned)AMDGPU::G_AMDGPU_FFBL_B32 |
| 2737 | : Opc; |
| 2738 | unsigned Idx = NewOpc == AMDGPU::G_AMDGPU_FFBH_U32; |
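|      | // ffbh scans from the top, so it gets the high half directly and the
|      | // low-half result is biased by 32; ffbl scans from the bottom, so the
|      | // roles of the two halves are swapped.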
| 2739 | auto X = B.buildInstr(Opc: NewOpc, DstOps: {S32}, SrcOps: {SrcRegs[Idx]}); |
| 2740 | auto Y = B.buildInstr(Opc: NewOpc, DstOps: {S32}, SrcOps: {SrcRegs[Idx ^ 1]}); |
| 2741 | unsigned AddOpc = |
| 2742 | Opc == AMDGPU::G_CTLZ_ZERO_UNDEF || Opc == AMDGPU::G_CTTZ_ZERO_UNDEF |
| 2743 | ? AMDGPU::G_ADD |
| 2744 | : AMDGPU::G_UADDSAT; |
| 2745 | Y = B.buildInstr(Opc: AddOpc, DstOps: {S32}, SrcOps: {Y, B.buildConstant(Res: S32, Val: 32)}); |
| 2746 | Register DstReg = MI.getOperand(i: 0).getReg(); |
| 2747 | B.buildUMin(Dst: DstReg, Src0: X, Src1: Y); |
| 2748 | MI.eraseFromParent(); |
| 2749 | return; |
| 2750 | } |
| 2751 | case AMDGPU::G_SEXT: |
| 2752 | case AMDGPU::G_ZEXT: |
| 2753 | case AMDGPU::G_ANYEXT: { |
| 2754 | Register SrcReg = MI.getOperand(i: 1).getReg(); |
| 2755 | LLT SrcTy = MRI.getType(Reg: SrcReg); |
| 2756 | const bool Signed = Opc == AMDGPU::G_SEXT; |
| 2757 | |
| 2758 | assert(OpdMapper.getVRegs(1).empty()); |
| 2759 | |
| 2760 | const RegisterBank *SrcBank = |
| 2761 | OpdMapper.getInstrMapping().getOperandMapping(i: 1).BreakDown[0].RegBank; |
| 2762 | |
| 2763 | Register DstReg = MI.getOperand(i: 0).getReg(); |
| 2764 | LLT DstTy = MRI.getType(Reg: DstReg); |
| 2765 | if (DstTy.isScalar() && |
| 2766 | SrcBank != &AMDGPU::SGPRRegBank && |
| 2767 | SrcBank != &AMDGPU::VCCRegBank && |
| 2768 | // FIXME: Should handle any type that round to s64 when irregular |
| 2769 | // breakdowns supported. |
| 2770 | DstTy.getSizeInBits() == 64 && |
| 2771 | SrcTy.getSizeInBits() <= 32) { |
| 2772 | SmallVector<Register, 2> DefRegs(OpdMapper.getVRegs(OpIdx: 0)); |
| 2773 | |
| 2774 | // Extend to 32-bit, and then extend the low half. |
| 2775 | if (Signed) { |
| 2776 | // TODO: Should really be buildSExtOrCopy |
| 2777 | B.buildSExtOrTrunc(Res: DefRegs[0], Op: SrcReg); |
| 2778 | } else if (Opc == AMDGPU::G_ZEXT) { |
| 2779 | B.buildZExtOrTrunc(Res: DefRegs[0], Op: SrcReg); |
| 2780 | } else { |
| 2781 | B.buildAnyExtOrTrunc(Res: DefRegs[0], Op: SrcReg); |
| 2782 | } |
| 2783 | |
| 2784 | extendLow32IntoHigh32(B, Hi32Reg: DefRegs[1], Lo32Reg: DefRegs[0], ExtOpc: Opc, RegBank: *SrcBank); |
| 2785 | MRI.setRegBank(Reg: DstReg, RegBank: *SrcBank); |
| 2786 | MI.eraseFromParent(); |
| 2787 | return; |
| 2788 | } |
| 2789 | |
| 2790 | if (SrcTy != LLT::scalar(SizeInBits: 1)) |
| 2791 | return; |
| 2792 | |
| 2793 | // It is not legal to have a legalization artifact with a VCC source. Rather |
| 2794 | // than introducing a copy, insert the select we would have to select the |
| 2795 | // copy to. |
| 2796 | if (SrcBank == &AMDGPU::VCCRegBank) { |
| 2797 | SmallVector<Register, 2> DefRegs(OpdMapper.getVRegs(OpIdx: 0)); |
| 2798 | |
| 2799 | const RegisterBank *DstBank = &AMDGPU::VGPRRegBank; |
| 2800 | |
| 2801 | unsigned DstSize = DstTy.getSizeInBits(); |
| 2802 | // 64-bit select is SGPR only |
| 2803 | const bool UseSel64 = DstSize > 32 && |
| 2804 | SrcBank->getID() == AMDGPU::SGPRRegBankID; |
| 2805 | |
| 2806 | // TODO: Should s16 select be legal? |
| 2807 | LLT SelType = UseSel64 ? LLT::scalar(SizeInBits: 64) : LLT::scalar(SizeInBits: 32); |
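|      | // A signed extend of a true boolean yields all ones; a zero or any
|      | // extend yields 1.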
| 2808 | auto True = B.buildConstant(Res: SelType, Val: Signed ? -1 : 1); |
| 2809 | auto False = B.buildConstant(Res: SelType, Val: 0); |
| 2810 | |
| 2811 | MRI.setRegBank(Reg: True.getReg(Idx: 0), RegBank: *DstBank); |
| 2812 | MRI.setRegBank(Reg: False.getReg(Idx: 0), RegBank: *DstBank); |
| 2813 | MRI.setRegBank(Reg: DstReg, RegBank: *DstBank); |
| 2814 | |
| 2815 | if (DstSize > 32) { |
| 2816 | B.buildSelect(Res: DefRegs[0], Tst: SrcReg, Op0: True, Op1: False); |
| 2817 | extendLow32IntoHigh32(B, Hi32Reg: DefRegs[1], Lo32Reg: DefRegs[0], ExtOpc: Opc, RegBank: *SrcBank, IsBooleanSrc: true); |
| 2818 | } else if (DstSize < 32) { |
| 2819 | auto Sel = B.buildSelect(Res: SelType, Tst: SrcReg, Op0: True, Op1: False); |
| 2820 | MRI.setRegBank(Reg: Sel.getReg(Idx: 0), RegBank: *DstBank); |
| 2821 | B.buildTrunc(Res: DstReg, Op: Sel); |
| 2822 | } else { |
| 2823 | B.buildSelect(Res: DstReg, Tst: SrcReg, Op0: True, Op1: False); |
| 2824 | } |
| 2825 | |
| 2826 | MI.eraseFromParent(); |
| 2827 | return; |
| 2828 | } |
| 2829 | |
| 2830 | break; |
| 2831 | } |
| 2832 | case AMDGPU::G_EXTRACT_VECTOR_ELT: { |
| 2833 | SmallVector<Register, 2> DstRegs(OpdMapper.getVRegs(OpIdx: 0)); |
| 2834 | |
| 2835 | assert(OpdMapper.getVRegs(1).empty() && OpdMapper.getVRegs(2).empty()); |
| 2836 | |
| 2837 | Register DstReg = MI.getOperand(i: 0).getReg(); |
| 2838 | Register SrcReg = MI.getOperand(i: 1).getReg(); |
| 2839 | |
| 2840 | const LLT S32 = LLT::scalar(SizeInBits: 32); |
| 2841 | LLT DstTy = MRI.getType(Reg: DstReg); |
| 2842 | LLT SrcTy = MRI.getType(Reg: SrcReg); |
| 2843 | |
| 2844 | if (foldExtractEltToCmpSelect(B, MI, OpdMapper)) |
| 2845 | return; |
| 2846 | |
| 2847 | const ValueMapping &DstMapping |
| 2848 | = OpdMapper.getInstrMapping().getOperandMapping(i: 0); |
| 2849 | const RegisterBank *DstBank = DstMapping.BreakDown[0].RegBank; |
| 2850 | const RegisterBank *SrcBank = |
| 2851 | OpdMapper.getInstrMapping().getOperandMapping(i: 1).BreakDown[0].RegBank; |
| 2852 | const RegisterBank *IdxBank = |
| 2853 | OpdMapper.getInstrMapping().getOperandMapping(i: 2).BreakDown[0].RegBank; |
| 2854 | |
| 2855 | Register BaseIdxReg; |
| 2856 | unsigned ConstOffset; |
| 2857 | std::tie(args&: BaseIdxReg, args&: ConstOffset) = |
| 2858 | AMDGPU::getBaseWithConstantOffset(MRI, Reg: MI.getOperand(i: 2).getReg()); |
| 2859 | |
| 2860 | // See if the index is an add of a constant which will be foldable by moving |
| 2861 | // the base register of the index later if this is going to be executed in a |
| 2862 | // waterfall loop. This is essentially to reassociate the add of a constant |
| 2863 | // with the readfirstlane. |
| 2864 | bool ShouldMoveIndexIntoLoop = IdxBank != &AMDGPU::SGPRRegBank && |
| 2865 | ConstOffset > 0 && |
| 2866 | ConstOffset < SrcTy.getNumElements(); |
| 2867 | |
| 2868 | // Move the base register. We'll re-insert the add later. |
| 2869 | if (ShouldMoveIndexIntoLoop) |
| 2870 | MI.getOperand(i: 2).setReg(BaseIdxReg); |
| 2871 | |
| 2872 | // If this is a VGPR result only because the index was a VGPR result, the |
| 2873 | // actual indexing will be done on the SGPR source vector, which will |
| 2874 | // produce a scalar result. We need to copy to the VGPR result inside the |
| 2875 | // waterfall loop. |
| 2876 | const bool NeedCopyToVGPR = DstBank == &AMDGPU::VGPRRegBank && |
| 2877 | SrcBank == &AMDGPU::SGPRRegBank; |
| 2878 | if (DstRegs.empty()) { |
| 2879 | applyDefaultMapping(OpdMapper); |
| 2880 | |
| 2881 | executeInWaterfallLoop(B, MI, OpIndices: {2}); |
| 2882 | |
| 2883 | if (NeedCopyToVGPR) { |
| 2884 | // We don't want a phi for this temporary reg. |
| 2885 | Register TmpReg = MRI.createGenericVirtualRegister(Ty: DstTy); |
| 2886 | MRI.setRegBank(Reg: TmpReg, RegBank: AMDGPU::SGPRRegBank); |
| 2887 | MI.getOperand(i: 0).setReg(TmpReg); |
| 2888 | B.setInsertPt(MBB&: *MI.getParent(), II: ++MI.getIterator()); |
| 2889 | |
| 2890 | // Use a v_mov_b32 here to make the exec dependency explicit. |
| 2891 | buildVCopy(B, DstReg, SrcReg: TmpReg); |
| 2892 | } |
| 2893 | |
| 2894 | // Re-insert the constant offset add inside the waterfall loop. |
| 2895 | if (ShouldMoveIndexIntoLoop) |
| 2896 | reinsertVectorIndexAdd(B, IdxUseInstr&: MI, OpIdx: 2, ConstOffset); |
| 2897 | |
| 2898 | return; |
| 2899 | } |
| 2900 | |
| 2901 | assert(DstTy.getSizeInBits() == 64); |
| 2902 | |
| 2903 | LLT Vec32 = LLT::fixed_vector(NumElements: 2 * SrcTy.getNumElements(), ScalarSizeInBits: 32); |
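|      | // The mapping split the 64-bit result into two 32-bit halves, so view the
|      | // source as twice as many 32-bit elements and extract each half separately.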
| 2904 | |
| 2905 | auto CastSrc = B.buildBitcast(Dst: Vec32, Src: SrcReg); |
| 2906 | auto One = B.buildConstant(Res: S32, Val: 1); |
| 2907 | |
| 2908 | MachineBasicBlock::iterator MII = MI.getIterator(); |
| 2909 | |
| 2910 | // Split the vector index into 32-bit pieces. Prepare to move all of the |
| 2911 | // new instructions into a waterfall loop if necessary. |
| 2912 | // |
| 2913 | // Don't put the bitcast or constant in the loop. |
| 2914 | MachineInstrSpan Span(MII, &B.getMBB()); |
| 2915 | |
| 2916 | // Compute 32-bit element indices, (2 * OrigIdx, 2 * OrigIdx + 1). |
| 2917 | auto IdxLo = B.buildShl(Dst: S32, Src0: BaseIdxReg, Src1: One); |
| 2918 | auto IdxHi = B.buildAdd(Dst: S32, Src0: IdxLo, Src1: One); |
| 2919 | |
| 2920 | auto Extract0 = B.buildExtractVectorElement(Res: DstRegs[0], Val: CastSrc, Idx: IdxLo);
| 2921 | auto Extract1 = B.buildExtractVectorElement(Res: DstRegs[1], Val: CastSrc, Idx: IdxHi);
| 2922 | |
| 2923 | MRI.setRegBank(Reg: DstReg, RegBank: *DstBank); |
| 2924 | MRI.setRegBank(Reg: CastSrc.getReg(Idx: 0), RegBank: *SrcBank); |
| 2925 | MRI.setRegBank(Reg: One.getReg(Idx: 0), RegBank: AMDGPU::SGPRRegBank); |
| 2926 | MRI.setRegBank(Reg: IdxLo.getReg(Idx: 0), RegBank: AMDGPU::SGPRRegBank); |
| 2927 | MRI.setRegBank(Reg: IdxHi.getReg(Idx: 0), RegBank: AMDGPU::SGPRRegBank); |
| 2928 | |
| 2929 | SmallSet<Register, 4> OpsToWaterfall; |
| 2930 | if (!collectWaterfallOperands(SGPROperandRegs&: OpsToWaterfall, MI, MRI, OpIndices: { 2 })) { |
| 2931 | MI.eraseFromParent(); |
| 2932 | return; |
| 2933 | } |
| 2934 | |
| 2935 | // Remove the original instruction to avoid potentially confusing the |
| 2936 | // waterfall loop logic. |
| 2937 | B.setInstr(*Span.begin()); |
| 2938 | MI.eraseFromParent(); |
| 2939 | executeInWaterfallLoop(B, Range: make_range(x: Span.begin(), y: Span.end()), |
| 2940 | SGPROperandRegs&: OpsToWaterfall); |
| 2941 | |
| 2942 | if (NeedCopyToVGPR) { |
| 2943 | MachineBasicBlock *LoopBB = Extract1->getParent(); |
| 2944 | Register TmpReg0 = MRI.createGenericVirtualRegister(Ty: S32); |
| 2945 | Register TmpReg1 = MRI.createGenericVirtualRegister(Ty: S32); |
| 2946 | MRI.setRegBank(Reg: TmpReg0, RegBank: AMDGPU::SGPRRegBank); |
| 2947 | MRI.setRegBank(Reg: TmpReg1, RegBank: AMDGPU::SGPRRegBank); |
| 2948 | |
| 2949 | Extract0->getOperand(i: 0).setReg(TmpReg0); |
| 2950 | Extract1->getOperand(i: 0).setReg(TmpReg1); |
| 2951 | |
| 2952 | B.setInsertPt(MBB&: *LoopBB, II: ++Extract1->getIterator()); |
| 2953 | |
| 2954 | buildVCopy(B, DstReg: DstRegs[0], SrcReg: TmpReg0); |
| 2955 | buildVCopy(B, DstReg: DstRegs[1], SrcReg: TmpReg1); |
| 2956 | } |
| 2957 | |
| 2958 | if (ShouldMoveIndexIntoLoop) |
| 2959 | reinsertVectorIndexAdd(B, IdxUseInstr&: *IdxLo, OpIdx: 1, ConstOffset); |
| 2960 | |
| 2961 | return; |
| 2962 | } |
| 2963 | case AMDGPU::G_INSERT_VECTOR_ELT: { |
| 2964 | SmallVector<Register, 2> InsRegs(OpdMapper.getVRegs(OpIdx: 2)); |
| 2965 | |
| 2966 | Register DstReg = MI.getOperand(i: 0).getReg(); |
| 2967 | LLT VecTy = MRI.getType(Reg: DstReg); |
| 2968 | |
| 2969 | assert(OpdMapper.getVRegs(0).empty()); |
| 2970 | assert(OpdMapper.getVRegs(3).empty()); |
| 2971 | |
| 2972 | if (substituteSimpleCopyRegs(OpdMapper, OpIdx: 1)) |
| 2973 | MRI.setType(VReg: MI.getOperand(i: 1).getReg(), Ty: VecTy); |
| 2974 | |
| 2975 | if (foldInsertEltToCmpSelect(B, MI, OpdMapper)) |
| 2976 | return; |
| 2977 | |
| 2978 | const RegisterBank *IdxBank = |
| 2979 | OpdMapper.getInstrMapping().getOperandMapping(i: 3).BreakDown[0].RegBank; |
| 2980 | |
| 2981 | Register SrcReg = MI.getOperand(i: 1).getReg(); |
| 2982 | Register InsReg = MI.getOperand(i: 2).getReg(); |
| 2983 | LLT InsTy = MRI.getType(Reg: InsReg); |
| 2984 | (void)InsTy; |
| 2985 | |
| 2986 | Register BaseIdxReg; |
| 2987 | unsigned ConstOffset; |
| 2988 | std::tie(args&: BaseIdxReg, args&: ConstOffset) = |
| 2989 | AMDGPU::getBaseWithConstantOffset(MRI, Reg: MI.getOperand(i: 3).getReg()); |
| 2990 | |
| 2991 | // See if the index is an add of a constant which will be foldable by moving |
| 2992 | // the base register of the index later if this is going to be executed in a |
| 2993 | // waterfall loop. This is essentially to reassociate the add of a constant |
| 2994 | // with the readfirstlane. |
| 2995 | bool ShouldMoveIndexIntoLoop = IdxBank != &AMDGPU::SGPRRegBank && |
| 2996 | ConstOffset > 0 && |
| 2997 | ConstOffset < VecTy.getNumElements(); |
| 2998 | |
| 2999 | // Move the base register. We'll re-insert the add later. |
| 3000 | if (ShouldMoveIndexIntoLoop) |
| 3001 | MI.getOperand(i: 3).setReg(BaseIdxReg); |
| 3002 | |
| 3003 | |
| 3004 | if (InsRegs.empty()) { |
| 3005 | executeInWaterfallLoop(B, MI, OpIndices: {3}); |
| 3006 | |
| 3007 | // Re-insert the constant offset add inside the waterfall loop. |
| 3008 | if (ShouldMoveIndexIntoLoop) { |
| 3009 | reinsertVectorIndexAdd(B, IdxUseInstr&: MI, OpIdx: 3, ConstOffset); |
| 3010 | } |
| 3011 | |
| 3012 | return; |
| 3013 | } |
| 3014 | |
| 3015 | assert(InsTy.getSizeInBits() == 64); |
| 3016 | |
| 3017 | const LLT S32 = LLT::scalar(SizeInBits: 32); |
| 3018 | LLT Vec32 = LLT::fixed_vector(NumElements: 2 * VecTy.getNumElements(), ScalarSizeInBits: 32); |
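|      | // As in the extract case, view the vector as 32-bit elements and insert
|      | // the 64-bit value as two 32-bit halves.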
| 3019 | |
| 3020 | auto CastSrc = B.buildBitcast(Dst: Vec32, Src: SrcReg); |
| 3021 | auto One = B.buildConstant(Res: S32, Val: 1); |
| 3022 | |
| 3023 | // Split the vector index into 32-bit pieces. Prepare to move all of the |
| 3024 | // new instructions into a waterfall loop if necessary. |
| 3025 | // |
| 3026 | // Don't put the bitcast or constant in the loop. |
| 3027 | MachineInstrSpan Span(MachineBasicBlock::iterator(&MI), &B.getMBB()); |
| 3028 | |
| 3029 | // Compute 32-bit element indices, (2 * OrigIdx, 2 * OrigIdx + 1). |
| 3030 | auto IdxLo = B.buildShl(Dst: S32, Src0: BaseIdxReg, Src1: One); |
| 3031 | auto IdxHi = B.buildAdd(Dst: S32, Src0: IdxLo, Src1: One); |
| 3032 | |
| 3033 | auto InsLo = B.buildInsertVectorElement(Res: Vec32, Val: CastSrc, Elt: InsRegs[0], Idx: IdxLo); |
| 3034 | auto InsHi = B.buildInsertVectorElement(Res: Vec32, Val: InsLo, Elt: InsRegs[1], Idx: IdxHi); |
| 3035 | |
| 3036 | const RegisterBank *DstBank = |
| 3037 | OpdMapper.getInstrMapping().getOperandMapping(i: 0).BreakDown[0].RegBank; |
| 3038 | const RegisterBank *SrcBank = |
| 3039 | OpdMapper.getInstrMapping().getOperandMapping(i: 1).BreakDown[0].RegBank; |
| 3040 | const RegisterBank *InsSrcBank = |
| 3041 | OpdMapper.getInstrMapping().getOperandMapping(i: 2).BreakDown[0].RegBank; |
| 3042 | |
| 3043 | MRI.setRegBank(Reg: InsReg, RegBank: *InsSrcBank); |
| 3044 | MRI.setRegBank(Reg: CastSrc.getReg(Idx: 0), RegBank: *SrcBank); |
| 3045 | MRI.setRegBank(Reg: InsLo.getReg(Idx: 0), RegBank: *DstBank); |
| 3046 | MRI.setRegBank(Reg: InsHi.getReg(Idx: 0), RegBank: *DstBank); |
| 3047 | MRI.setRegBank(Reg: One.getReg(Idx: 0), RegBank: AMDGPU::SGPRRegBank); |
| 3048 | MRI.setRegBank(Reg: IdxLo.getReg(Idx: 0), RegBank: AMDGPU::SGPRRegBank); |
| 3049 | MRI.setRegBank(Reg: IdxHi.getReg(Idx: 0), RegBank: AMDGPU::SGPRRegBank); |
| 3050 | |
| 3051 | |
| 3052 | SmallSet<Register, 4> OpsToWaterfall; |
| 3053 | if (!collectWaterfallOperands(SGPROperandRegs&: OpsToWaterfall, MI, MRI, OpIndices: { 3 })) { |
| 3054 | B.setInsertPt(MBB&: B.getMBB(), II: MI); |
| 3055 | B.buildBitcast(Dst: DstReg, Src: InsHi); |
| 3056 | MI.eraseFromParent(); |
| 3057 | return; |
| 3058 | } |
| 3059 | |
| 3060 | B.setInstr(*Span.begin()); |
| 3061 | MI.eraseFromParent(); |
| 3062 | |
| 3063 | // Figure out the point after the waterfall loop before mangling the control |
| 3064 | // flow. |
| 3065 | executeInWaterfallLoop(B, Range: make_range(x: Span.begin(), y: Span.end()), |
| 3066 | SGPROperandRegs&: OpsToWaterfall); |
| 3067 | |
| 3068 | // The insertion point is now right after the original instruction. |
| 3069 | // |
| 3070 | // Keep the bitcast to the original vector type out of the loop. Doing this
| 3071 | // saves an extra phi we don't need inside the loop.
| 3072 | B.buildBitcast(Dst: DstReg, Src: InsHi); |
| 3073 | |
| 3074 | // Re-insert the constant offset add inside the waterfall loop. |
| 3075 | if (ShouldMoveIndexIntoLoop) |
| 3076 | reinsertVectorIndexAdd(B, IdxUseInstr&: *IdxLo, OpIdx: 1, ConstOffset); |
| 3077 | |
| 3078 | return; |
| 3079 | } |
| 3080 | case AMDGPU::G_AMDGPU_BUFFER_LOAD: |
| 3081 | case AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT: |
| 3082 | case AMDGPU::G_AMDGPU_BUFFER_LOAD_SSHORT: |
| 3083 | case AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE: |
| 3084 | case AMDGPU::G_AMDGPU_BUFFER_LOAD_SBYTE: |
| 3085 | case AMDGPU::G_AMDGPU_BUFFER_LOAD_TFE: |
| 3086 | case AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT_TFE: |
| 3087 | case AMDGPU::G_AMDGPU_BUFFER_LOAD_SSHORT_TFE: |
| 3088 | case AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE_TFE: |
| 3089 | case AMDGPU::G_AMDGPU_BUFFER_LOAD_SBYTE_TFE: |
| 3090 | case AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT: |
| 3091 | case AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_TFE: |
| 3092 | case AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_D16: |
| 3093 | case AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT: |
| 3094 | case AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT_D16: |
| 3095 | case AMDGPU::G_AMDGPU_BUFFER_STORE: |
| 3096 | case AMDGPU::G_AMDGPU_BUFFER_STORE_BYTE: |
| 3097 | case AMDGPU::G_AMDGPU_BUFFER_STORE_SHORT: |
| 3098 | case AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT: |
| 3099 | case AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT_D16: |
| 3100 | case AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT: |
| 3101 | case AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT_D16: { |
| 3102 | applyDefaultMapping(OpdMapper); |
| 3103 | executeInWaterfallLoop(B, MI, OpIndices: {1, 4}); |
| 3104 | return; |
| 3105 | } |
| 3106 | case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SWAP: |
| 3107 | case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_ADD: |
| 3108 | case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB: |
| 3109 | case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMIN: |
| 3110 | case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMIN: |
| 3111 | case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMAX: |
| 3112 | case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMAX: |
| 3113 | case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_AND: |
| 3114 | case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_OR: |
| 3115 | case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_XOR: |
| 3116 | case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC: |
| 3117 | case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC: |
| 3118 | case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB_CLAMP_U32: |
| 3119 | case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_COND_SUB_U32: |
| 3120 | case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD: |
| 3121 | case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMIN: |
| 3122 | case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMAX: { |
| 3123 | applyDefaultMapping(OpdMapper); |
| 3124 | executeInWaterfallLoop(B, MI, OpIndices: {2, 5}); |
| 3125 | return; |
| 3126 | } |
| 3127 | case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_CMPSWAP: { |
| 3128 | applyDefaultMapping(OpdMapper); |
| 3129 | executeInWaterfallLoop(B, MI, OpIndices: {3, 6}); |
| 3130 | return; |
| 3131 | } |
| 3132 | case AMDGPU::G_AMDGPU_S_BUFFER_LOAD: |
| 3133 | case AMDGPU::G_AMDGPU_S_BUFFER_LOAD_UBYTE: |
| 3134 | case AMDGPU::G_AMDGPU_S_BUFFER_LOAD_SBYTE: |
| 3135 | case AMDGPU::G_AMDGPU_S_BUFFER_LOAD_USHORT: |
| 3136 | case AMDGPU::G_AMDGPU_S_BUFFER_LOAD_SSHORT: { |
| 3137 | applyMappingSBufferLoad(B, OpdMapper); |
| 3138 | return; |
| 3139 | } |
| 3140 | case AMDGPU::G_AMDGPU_S_BUFFER_PREFETCH: |
| 3141 | constrainOpWithReadfirstlane(B, MI, OpIdx: 0); |
| 3142 | constrainOpWithReadfirstlane(B, MI, OpIdx: 2); |
| 3143 | return; |
| 3144 | case AMDGPU::G_INTRINSIC: |
| 3145 | case AMDGPU::G_INTRINSIC_CONVERGENT: { |
| 3146 | switch (cast<GIntrinsic>(Val&: MI).getIntrinsicID()) { |
| 3147 | case Intrinsic::amdgcn_readlane: { |
| 3148 | substituteSimpleCopyRegs(OpdMapper, OpIdx: 2); |
| 3149 | |
| 3150 | assert(OpdMapper.getVRegs(0).empty()); |
| 3151 | assert(OpdMapper.getVRegs(3).empty()); |
| 3152 | |
| 3153 | // Make sure the index is an SGPR. It doesn't make sense to run this in a |
| 3154 | // waterfall loop, so assume it's a uniform value. |
| 3155 | constrainOpWithReadfirstlane(B, MI, OpIdx: 3); // Index |
| 3156 | return; |
| 3157 | } |
| 3158 | case Intrinsic::amdgcn_writelane: { |
| 3159 | assert(OpdMapper.getVRegs(0).empty()); |
| 3160 | assert(OpdMapper.getVRegs(2).empty()); |
| 3161 | assert(OpdMapper.getVRegs(3).empty()); |
| 3162 | |
| 3163 | substituteSimpleCopyRegs(OpdMapper, OpIdx: 4); // VGPR input val |
| 3164 | constrainOpWithReadfirstlane(B, MI, OpIdx: 2); // Source value |
| 3165 | constrainOpWithReadfirstlane(B, MI, OpIdx: 3); // Index |
| 3166 | return; |
| 3167 | } |
| 3168 | case Intrinsic::amdgcn_interp_p1: |
| 3169 | case Intrinsic::amdgcn_interp_p2: |
| 3170 | case Intrinsic::amdgcn_interp_mov: |
| 3171 | case Intrinsic::amdgcn_interp_p1_f16: |
| 3172 | case Intrinsic::amdgcn_interp_p2_f16: |
| 3173 | case Intrinsic::amdgcn_lds_param_load: { |
| 3174 | applyDefaultMapping(OpdMapper); |
| 3175 | |
| 3176 | // Readfirstlane for the m0 value, which is always the last operand.
| 3177 | // FIXME: Should this be a waterfall loop instead? |
| 3178 | constrainOpWithReadfirstlane(B, MI, OpIdx: MI.getNumOperands() - 1); // Index |
| 3179 | return; |
| 3180 | } |
| 3181 | case Intrinsic::amdgcn_interp_inreg_p10: |
| 3182 | case Intrinsic::amdgcn_interp_inreg_p2: |
| 3183 | case Intrinsic::amdgcn_interp_inreg_p10_f16: |
| 3184 | case Intrinsic::amdgcn_interp_inreg_p2_f16: |
| 3185 | case Intrinsic::amdgcn_interp_p10_rtz_f16: |
| 3186 | case Intrinsic::amdgcn_interp_p2_rtz_f16: |
| 3187 | case Intrinsic::amdgcn_permlane16_swap: |
| 3188 | case Intrinsic::amdgcn_permlane32_swap: |
| 3189 | applyDefaultMapping(OpdMapper); |
| 3190 | return; |
| 3191 | case Intrinsic::amdgcn_permlane16: |
| 3192 | case Intrinsic::amdgcn_permlanex16: { |
| 3193 | // Doing a waterfall loop over these wouldn't make any sense. |
| 3194 | substituteSimpleCopyRegs(OpdMapper, OpIdx: 2); |
| 3195 | substituteSimpleCopyRegs(OpdMapper, OpIdx: 3); |
| 3196 | constrainOpWithReadfirstlane(B, MI, OpIdx: 4); |
| 3197 | constrainOpWithReadfirstlane(B, MI, OpIdx: 5); |
| 3198 | return; |
| 3199 | } |
| 3200 | case Intrinsic::amdgcn_permlane_bcast: |
| 3201 | case Intrinsic::amdgcn_permlane_up: |
| 3202 | case Intrinsic::amdgcn_permlane_down: |
| 3203 | case Intrinsic::amdgcn_permlane_xor: |
| 3204 | // Doing a waterfall loop over these wouldn't make any sense. |
| 3205 | constrainOpWithReadfirstlane(B, MI, OpIdx: 3); |
| 3206 | constrainOpWithReadfirstlane(B, MI, OpIdx: 4); |
| 3207 | return; |
| 3208 | case Intrinsic::amdgcn_permlane_idx_gen: { |
| 3209 | constrainOpWithReadfirstlane(B, MI, OpIdx: 3); |
| 3210 | return; |
| 3211 | } |
| 3212 | case Intrinsic::amdgcn_sbfe: |
| 3213 | applyMappingBFE(B, OpdMapper, Signed: true); |
| 3214 | return; |
| 3215 | case Intrinsic::amdgcn_ubfe: |
| 3216 | applyMappingBFE(B, OpdMapper, Signed: false); |
| 3217 | return; |
| 3218 | case Intrinsic::amdgcn_inverse_ballot: |
| 3219 | case Intrinsic::amdgcn_s_bitreplicate: |
| 3220 | case Intrinsic::amdgcn_s_quadmask: |
| 3221 | case Intrinsic::amdgcn_s_wqm: |
| 3222 | applyDefaultMapping(OpdMapper); |
| 3223 | constrainOpWithReadfirstlane(B, MI, OpIdx: 2); // Mask |
| 3224 | return; |
| 3225 | case Intrinsic::amdgcn_ballot: |
| 3226 | // Use default handling and insert copy to vcc source. |
| 3227 | break; |
| 3228 | } |
| 3229 | break; |
| 3230 | } |
| 3231 | case AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD: |
| 3232 | case AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD_D16: |
| 3233 | case AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD_NORET: |
| 3234 | case AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE: |
| 3235 | case AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE_D16: { |
| 3236 | const AMDGPU::RsrcIntrinsic *RSrcIntrin = |
| 3237 | AMDGPU::lookupRsrcIntrinsic(Intr: AMDGPU::getIntrinsicID(I: MI)); |
| 3238 | assert(RSrcIntrin && RSrcIntrin->IsImage); |
| 3239 | // Non-images can have complications from operands that allow both SGPR |
| 3240 | // and VGPR. For now it's too complicated to figure out the final opcode |
| 3241 | // to derive the register bank from the MCInstrDesc. |
| 3242 | applyMappingImage(B, MI, OpdMapper, RsrcIdx: RSrcIntrin->RsrcArg); |
| 3243 | return; |
| 3244 | } |
| 3245 | case AMDGPU::G_AMDGPU_BVH_INTERSECT_RAY: |
| 3246 | case AMDGPU::G_AMDGPU_BVH8_INTERSECT_RAY: |
| 3247 | case AMDGPU::G_AMDGPU_BVH_DUAL_INTERSECT_RAY: { |
| 3248 | bool IsDualOrBVH8 = |
| 3249 | MI.getOpcode() == AMDGPU::G_AMDGPU_BVH_DUAL_INTERSECT_RAY || |
| 3250 | MI.getOpcode() == AMDGPU::G_AMDGPU_BVH8_INTERSECT_RAY; |
| 3251 | unsigned NumMods = IsDualOrBVH8 ? 0 : 1; // Has A16 modifier |
| 3252 | unsigned LastRegOpIdx = MI.getNumExplicitOperands() - 1 - NumMods; |
| 3253 | applyDefaultMapping(OpdMapper); |
| 3254 | executeInWaterfallLoop(B, MI, OpIndices: {LastRegOpIdx}); |
| 3255 | return; |
| 3256 | } |
| 3257 | case AMDGPU::G_INTRINSIC_W_SIDE_EFFECTS: |
| 3258 | case AMDGPU::G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS: { |
| 3259 | auto IntrID = cast<GIntrinsic>(Val&: MI).getIntrinsicID(); |
| 3260 | switch (IntrID) { |
| 3261 | case Intrinsic::amdgcn_ds_ordered_add: |
| 3262 | case Intrinsic::amdgcn_ds_ordered_swap: { |
| 3263 | // This is only allowed to execute with 1 lane, so readfirstlane is safe. |
| 3264 | assert(OpdMapper.getVRegs(0).empty()); |
| 3265 | substituteSimpleCopyRegs(OpdMapper, OpIdx: 3); |
| 3266 | constrainOpWithReadfirstlane(B, MI, OpIdx: 2); // M0 |
| 3267 | return; |
| 3268 | } |
| 3269 | case Intrinsic::amdgcn_ds_gws_init: |
| 3270 | case Intrinsic::amdgcn_ds_gws_barrier: |
| 3271 | case Intrinsic::amdgcn_ds_gws_sema_br: { |
| 3272 | // Only the first lane executes, so readfirstlane is safe.
| 3273 | substituteSimpleCopyRegs(OpdMapper, OpIdx: 1); |
| 3274 | constrainOpWithReadfirstlane(B, MI, OpIdx: 2); // M0 |
| 3275 | return; |
| 3276 | } |
| 3277 | case Intrinsic::amdgcn_ds_gws_sema_v: |
| 3278 | case Intrinsic::amdgcn_ds_gws_sema_p: |
| 3279 | case Intrinsic::amdgcn_ds_gws_sema_release_all: { |
| 3280 | // Only the first lane executes, so readfirstlane is safe.
| 3281 | constrainOpWithReadfirstlane(B, MI, OpIdx: 1); // M0 |
| 3282 | return; |
| 3283 | } |
| 3284 | case Intrinsic::amdgcn_ds_append: |
| 3285 | case Intrinsic::amdgcn_ds_consume: { |
| 3286 | constrainOpWithReadfirstlane(B, MI, OpIdx: 2); // M0 |
| 3287 | return; |
| 3288 | } |
| 3289 | case Intrinsic::amdgcn_s_sendmsg: |
| 3290 | case Intrinsic::amdgcn_s_sendmsghalt: { |
| 3291 | // FIXME: Should this use a waterfall loop? |
| 3292 | constrainOpWithReadfirstlane(B, MI, OpIdx: 2); // M0 |
| 3293 | return; |
| 3294 | } |
| 3295 | case Intrinsic::amdgcn_s_setreg: { |
| 3296 | constrainOpWithReadfirstlane(B, MI, OpIdx: 2); |
| 3297 | return; |
| 3298 | } |
| 3299 | case Intrinsic::amdgcn_s_ttracedata: |
| 3300 | constrainOpWithReadfirstlane(B, MI, OpIdx: 1); // M0 |
| 3301 | return; |
| 3302 | case Intrinsic::amdgcn_raw_buffer_load_lds: |
| 3303 | case Intrinsic::amdgcn_raw_ptr_buffer_load_lds: { |
| 3304 | applyDefaultMapping(OpdMapper); |
| 3305 | constrainOpWithReadfirstlane(B, MI, OpIdx: 1); // rsrc |
| 3306 | constrainOpWithReadfirstlane(B, MI, OpIdx: 2); // M0 |
| 3307 | constrainOpWithReadfirstlane(B, MI, OpIdx: 5); // soffset |
| 3308 | return; |
| 3309 | } |
| 3310 | case Intrinsic::amdgcn_struct_buffer_load_lds: |
| 3311 | case Intrinsic::amdgcn_struct_ptr_buffer_load_lds: { |
| 3312 | applyDefaultMapping(OpdMapper); |
| 3313 | constrainOpWithReadfirstlane(B, MI, OpIdx: 1); // rsrc |
| 3314 | constrainOpWithReadfirstlane(B, MI, OpIdx: 2); // M0 |
| 3315 | constrainOpWithReadfirstlane(B, MI, OpIdx: 6); // soffset |
| 3316 | return; |
| 3317 | } |
| 3318 | case Intrinsic::amdgcn_cluster_load_async_to_lds_b8: |
| 3319 | case Intrinsic::amdgcn_cluster_load_async_to_lds_b32: |
| 3320 | case Intrinsic::amdgcn_cluster_load_async_to_lds_b64: |
| 3321 | case Intrinsic::amdgcn_cluster_load_async_to_lds_b128: { |
| 3322 | applyDefaultMapping(OpdMapper); |
| 3323 | constrainOpWithReadfirstlane(B, MI, OpIdx: 5); |
| 3324 | return; |
| 3325 | } |
| 3326 | case Intrinsic::amdgcn_load_to_lds: |
| 3327 | case Intrinsic::amdgcn_global_load_lds: { |
| 3328 | applyDefaultMapping(OpdMapper); |
| 3329 | constrainOpWithReadfirstlane(B, MI, OpIdx: 2); |
| 3330 | return; |
| 3331 | } |
| 3332 | case Intrinsic::amdgcn_lds_direct_load: { |
| 3333 | applyDefaultMapping(OpdMapper); |
| 3334 | // Readfirstlane for the m0 value, which is always the last operand.
| 3335 | constrainOpWithReadfirstlane(B, MI, OpIdx: MI.getNumOperands() - 1); // Index |
| 3336 | return; |
| 3337 | } |
| 3338 | case Intrinsic::amdgcn_exp_row: |
| 3339 | applyDefaultMapping(OpdMapper); |
| 3340 | constrainOpWithReadfirstlane(B, MI, OpIdx: 8); // M0 |
| 3341 | return; |
| 3342 | case Intrinsic::amdgcn_cluster_load_b32: |
| 3343 | case Intrinsic::amdgcn_cluster_load_b64: |
| 3344 | case Intrinsic::amdgcn_cluster_load_b128: { |
| 3345 | applyDefaultMapping(OpdMapper); |
| 3346 | constrainOpWithReadfirstlane(B, MI, OpIdx: 4); // M0 |
| 3347 | return; |
| 3348 | } |
| 3349 | case Intrinsic::amdgcn_s_sleep_var: |
| 3350 | assert(OpdMapper.getVRegs(1).empty()); |
| 3351 | constrainOpWithReadfirstlane(B, MI, OpIdx: 1); |
| 3352 | return; |
| 3353 | case Intrinsic::amdgcn_s_barrier_join: |
| 3354 | case Intrinsic::amdgcn_s_wakeup_barrier: |
| 3355 | constrainOpWithReadfirstlane(B, MI, OpIdx: 1); |
| 3356 | return; |
| 3357 | case Intrinsic::amdgcn_s_barrier_init: |
| 3358 | case Intrinsic::amdgcn_s_barrier_signal_var: |
| 3359 | constrainOpWithReadfirstlane(B, MI, OpIdx: 1); |
| 3360 | constrainOpWithReadfirstlane(B, MI, OpIdx: 2); |
| 3361 | return; |
| 3362 | case Intrinsic::amdgcn_s_get_barrier_state: |
| 3363 | case Intrinsic::amdgcn_s_get_named_barrier_state: { |
| 3364 | constrainOpWithReadfirstlane(B, MI, OpIdx: 2); |
| 3365 | return; |
| 3366 | } |
| 3367 | case Intrinsic::amdgcn_s_prefetch_data: { |
| 3368 | Register PtrReg = MI.getOperand(i: 1).getReg(); |
| 3369 | unsigned AS = MRI.getType(Reg: PtrReg).getAddressSpace(); |
| 3370 | if (AMDGPU::isFlatGlobalAddrSpace(AS)) { |
| 3371 | constrainOpWithReadfirstlane(B, MI, OpIdx: 1); |
| 3372 | constrainOpWithReadfirstlane(B, MI, OpIdx: 2); |
| 3373 | } else |
| 3374 | MI.eraseFromParent(); |
| 3375 | return; |
| 3376 | } |
| 3377 | case Intrinsic::amdgcn_tensor_load_to_lds: |
| 3378 | case Intrinsic::amdgcn_tensor_store_from_lds: { |
| 3379 | constrainOpWithReadfirstlane(B, MI, OpIdx: 1); |
| 3380 | constrainOpWithReadfirstlane(B, MI, OpIdx: 2); |
| 3381 | constrainOpWithReadfirstlane(B, MI, OpIdx: 3); |
| 3382 | constrainOpWithReadfirstlane(B, MI, OpIdx: 4); |
| 3383 | return; |
| 3384 | } |
| 3385 | case Intrinsic::amdgcn_tensor_load_to_lds_d2: |
| 3386 | case Intrinsic::amdgcn_tensor_store_from_lds_d2: { |
| 3387 | constrainOpWithReadfirstlane(B, MI, OpIdx: 1); |
| 3388 | constrainOpWithReadfirstlane(B, MI, OpIdx: 2); |
| 3389 | return; |
| 3390 | } |
| 3391 | default: { |
| 3392 | if (const AMDGPU::RsrcIntrinsic *RSrcIntrin = |
| 3393 | AMDGPU::lookupRsrcIntrinsic(Intr: IntrID)) { |
| 3394 | // Non-images can have complications from operands that allow both SGPR |
| 3395 | // and VGPR. For now it's too complicated to figure out the final opcode |
| 3396 | // to derive the register bank from the MCInstrDesc. |
| 3397 | if (RSrcIntrin->IsImage) { |
| 3398 | applyMappingImage(B, MI, OpdMapper, RsrcIdx: RSrcIntrin->RsrcArg); |
| 3399 | return; |
| 3400 | } |
| 3401 | } |
| 3402 | |
| 3403 | break; |
| 3404 | } |
| 3405 | } |
| 3406 | break; |
| 3407 | } |
| 3408 | case AMDGPU::G_SI_CALL: { |
| 3409 | // Use a set to avoid extra readfirstlanes in the case where multiple |
| 3410 | // operands are the same register. |
| 3411 | SmallSet<Register, 4> SGPROperandRegs; |
| 3412 | |
| 3413 | if (!collectWaterfallOperands(SGPROperandRegs, MI, MRI, OpIndices: {1})) |
| 3414 | break; |
| 3415 | |
| 3416 | // Move all copies to physical SGPRs that are used by the call instruction
| 3417 | // into the loop block. Search backwards from the call for these copies,
| 3418 | // stopping at the ADJCALLSTACKUP.
| 3419 | unsigned FrameSetupOpcode = AMDGPU::ADJCALLSTACKUP; |
| 3420 | unsigned FrameDestroyOpcode = AMDGPU::ADJCALLSTACKDOWN; |
| 3421 | |
| 3422 | // Move all non-copies before the copies, so that a complete range can be |
| 3423 | // moved into the waterfall loop. |
| 3424 | SmallVector<MachineInstr *, 4> NonCopyInstrs; |
| 3425 | // Count of NonCopyInstrs found until the current LastCopy. |
| 3426 | unsigned NonCopyInstrsLen = 0; |
| 3427 | MachineBasicBlock::iterator Start(&MI); |
| 3428 | MachineBasicBlock::iterator LastCopy = Start; |
| 3429 | MachineBasicBlock *MBB = MI.getParent(); |
| 3430 | const SIMachineFunctionInfo *Info = |
| 3431 | MBB->getParent()->getInfo<SIMachineFunctionInfo>(); |
| 3432 | while (Start->getOpcode() != FrameSetupOpcode) { |
| 3433 | --Start; |
| 3434 | bool IsCopy = false; |
| 3435 | if (Start->getOpcode() == AMDGPU::COPY) { |
| 3436 | auto &Dst = Start->getOperand(i: 0); |
| 3437 | if (Dst.isReg()) { |
| 3438 | Register Reg = Dst.getReg(); |
| 3439 | if (Reg.isPhysical() && MI.readsRegister(Reg, TRI)) { |
| 3440 | IsCopy = true; |
| 3441 | } else { |
| 3442 | // Also move the copy from the scratch rsrc descriptor into the loop |
| 3443 | // to allow it to be optimized away. |
| 3444 | auto &Src = Start->getOperand(i: 1); |
| 3445 | if (Src.isReg()) { |
| 3446 | Reg = Src.getReg(); |
| 3447 | IsCopy = Info->getScratchRSrcReg() == Reg; |
| 3448 | } |
| 3449 | } |
| 3450 | } |
| 3451 | } |
| 3452 | |
| 3453 | if (IsCopy) { |
| 3454 | LastCopy = Start; |
| 3455 | NonCopyInstrsLen = NonCopyInstrs.size(); |
| 3456 | } else { |
| 3457 | NonCopyInstrs.push_back(Elt: &*Start); |
| 3458 | } |
| 3459 | } |
| 3460 | NonCopyInstrs.resize(N: NonCopyInstrsLen); |
| 3461 | |
| 3462 | for (auto *NonCopy : reverse(C&: NonCopyInstrs)) { |
| 3463 | MBB->splice(Where: LastCopy, Other: MBB, From: NonCopy->getIterator()); |
| 3464 | } |
| 3465 | Start = LastCopy; |
| 3466 | |
| 3467 | // Do the same for copies after the loop |
| 3468 | NonCopyInstrs.clear(); |
| 3469 | NonCopyInstrsLen = 0; |
| 3470 | MachineBasicBlock::iterator End(&MI); |
| 3471 | LastCopy = End; |
| 3472 | while (End->getOpcode() != FrameDestroyOpcode) { |
| 3473 | ++End; |
| 3474 | bool IsCopy = false; |
| 3475 | if (End->getOpcode() == AMDGPU::COPY) { |
| 3476 | auto &Src = End->getOperand(i: 1); |
| 3477 | if (Src.isReg()) { |
| 3478 | Register Reg = Src.getReg(); |
| 3479 | IsCopy = Reg.isPhysical() && MI.modifiesRegister(Reg, TRI); |
| 3480 | } |
| 3481 | } |
| 3482 | |
| 3483 | if (IsCopy) { |
| 3484 | LastCopy = End; |
| 3485 | NonCopyInstrsLen = NonCopyInstrs.size(); |
| 3486 | } else { |
| 3487 | NonCopyInstrs.push_back(Elt: &*End); |
| 3488 | } |
| 3489 | } |
| 3490 | NonCopyInstrs.resize(N: NonCopyInstrsLen); |
| 3491 | |
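|      | // Splice the collected non-copies to just after the last trailing copy so
|      | // the waterfall range runs contiguously from the first leading copy
|      | // through the last trailing copy.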
| 3492 | End = LastCopy; |
| 3493 | ++LastCopy; |
| 3494 | for (auto *NonCopy : reverse(C&: NonCopyInstrs)) { |
| 3495 | MBB->splice(Where: LastCopy, Other: MBB, From: NonCopy->getIterator()); |
| 3496 | } |
| 3497 | |
| 3498 | ++End; |
| 3499 | B.setInsertPt(MBB&: B.getMBB(), II: Start); |
| 3500 | executeInWaterfallLoop(B, Range: make_range(x: Start, y: End), SGPROperandRegs); |
| 3501 | break; |
| 3502 | } |
| 3503 | case AMDGPU::G_LOAD: |
| 3504 | case AMDGPU::G_ZEXTLOAD: |
| 3505 | case AMDGPU::G_SEXTLOAD: { |
| 3506 | if (applyMappingLoad(B, OpdMapper, MI)) |
| 3507 | return; |
| 3508 | break; |
| 3509 | } |
| 3510 | case AMDGPU::G_DYN_STACKALLOC: |
| 3511 | applyMappingDynStackAlloc(B, OpdMapper, MI); |
| 3512 | return; |
| 3513 | case AMDGPU::G_STACKRESTORE: { |
| 3514 | applyDefaultMapping(OpdMapper); |
| 3515 | constrainOpWithReadfirstlane(B, MI, OpIdx: 0); |
| 3516 | return; |
| 3517 | } |
| 3518 | case AMDGPU::G_SBFX: |
| 3519 | applyMappingBFE(B, OpdMapper, /*Signed*/ true); |
| 3520 | return; |
| 3521 | case AMDGPU::G_UBFX: |
| 3522 | applyMappingBFE(B, OpdMapper, /*Signed*/ false); |
| 3523 | return; |
| 3524 | case AMDGPU::G_AMDGPU_MAD_U64_U32: |
| 3525 | case AMDGPU::G_AMDGPU_MAD_I64_I32: |
| 3526 | applyMappingMAD_64_32(B, OpdMapper); |
| 3527 | return; |
| 3528 | case AMDGPU::G_PREFETCH: { |
| 3529 | if (!Subtarget.hasSafeSmemPrefetch() && !Subtarget.hasVmemPrefInsts()) { |
| 3530 | MI.eraseFromParent(); |
| 3531 | return; |
| 3532 | } |
| 3533 | Register PtrReg = MI.getOperand(i: 0).getReg(); |
| 3534 | unsigned PtrBank = getRegBankID(Reg: PtrReg, MRI, Default: AMDGPU::SGPRRegBankID); |
| 3535 | if (PtrBank == AMDGPU::VGPRRegBankID && |
| 3536 | (!Subtarget.hasVmemPrefInsts() || !MI.getOperand(i: 3).getImm())) { |
| 3537 | // Cannot do I$ prefetch with divergent pointer. |
| 3538 | MI.eraseFromParent(); |
| 3539 | return; |
| 3540 | } |
| 3541 | unsigned AS = MRI.getType(Reg: PtrReg).getAddressSpace(); |
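|      | // Drop prefetches the target cannot express: unsupported address spaces,
|      | // and prefetches that would have to use SMEM (I$ prefetch or a 32-bit
|      | // constant address) when safe SMEM prefetch is unavailable.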
| 3542 | if ((!AMDGPU::isFlatGlobalAddrSpace(AS) && |
| 3543 | AS != AMDGPUAS::CONSTANT_ADDRESS_32BIT) || |
| 3544 | (!Subtarget.hasSafeSmemPrefetch() && |
| 3545 | (AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT || |
| 3546 | !MI.getOperand(i: 3).getImm() /* I$ prefetch */))) { |
| 3547 | MI.eraseFromParent(); |
| 3548 | return; |
| 3549 | } |
| 3550 | applyDefaultMapping(OpdMapper); |
| 3551 | return; |
| 3552 | } |
| 3553 | default: |
| 3554 | break; |
| 3555 | } |
| 3556 | |
| 3557 | return applyDefaultMapping(OpdMapper); |
| 3558 | } |
| 3559 | |
| 3560 | // vgpr, sgpr -> vgpr |
| 3561 | // vgpr, agpr -> vgpr |
| 3562 | // agpr, agpr -> agpr |
| 3563 | // agpr, sgpr -> vgpr |
| 3564 | static unsigned regBankUnion(unsigned RB0, unsigned RB1) { |
| 3565 | if (RB0 == AMDGPU::InvalidRegBankID) |
| 3566 | return RB1; |
| 3567 | if (RB1 == AMDGPU::InvalidRegBankID) |
| 3568 | return RB0; |
| 3569 | |
| 3570 | if (RB0 == AMDGPU::SGPRRegBankID && RB1 == AMDGPU::SGPRRegBankID) |
| 3571 | return AMDGPU::SGPRRegBankID; |
| 3572 | |
| 3573 | if (RB0 == AMDGPU::AGPRRegBankID && RB1 == AMDGPU::AGPRRegBankID) |
| 3574 | return AMDGPU::AGPRRegBankID; |
| 3575 | |
| 3576 | return AMDGPU::VGPRRegBankID; |
| 3577 | } |
| 3578 | |
| 3579 | static unsigned regBankBoolUnion(unsigned RB0, unsigned RB1) { |
| 3580 | if (RB0 == AMDGPU::InvalidRegBankID) |
| 3581 | return RB1; |
| 3582 | if (RB1 == AMDGPU::InvalidRegBankID) |
| 3583 | return RB0; |
| 3584 | |
| 3585 | // vcc, vcc -> vcc |
| 3586 | // vcc, sgpr -> vcc |
| 3587 | // vcc, vgpr -> vcc |
| 3588 | if (RB0 == AMDGPU::VCCRegBankID || RB1 == AMDGPU::VCCRegBankID) |
| 3589 | return AMDGPU::VCCRegBankID; |
| 3590 | |
| 3591 | // Otherwise fall back to the non-boolean bank union (sgpr/vgpr/agpr).
| 3592 | return regBankUnion(RB0, RB1); |
| 3593 | } |
| 3594 | |
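|      | // Union the banks of every register operand of MI, giving up with the
|      | // VGPR bank as soon as any operand requires it.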
| 3595 | unsigned AMDGPURegisterBankInfo::getMappingType(const MachineRegisterInfo &MRI, |
| 3596 | const MachineInstr &MI) const { |
| 3597 | unsigned RegBank = AMDGPU::InvalidRegBankID; |
| 3598 | |
| 3599 | for (const MachineOperand &MO : MI.operands()) { |
| 3600 | if (!MO.isReg()) |
| 3601 | continue; |
| 3602 | Register Reg = MO.getReg(); |
| 3603 | if (const RegisterBank *Bank = getRegBank(Reg, MRI, TRI: *TRI)) { |
| 3604 | RegBank = regBankUnion(RB0: RegBank, RB1: Bank->getID()); |
| 3605 | if (RegBank == AMDGPU::VGPRRegBankID) |
| 3606 | break; |
| 3607 | } |
| 3608 | } |
| 3609 | |
| 3610 | return RegBank; |
| 3611 | } |
| 3612 | |
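|      | // True unless some register operand has already been assigned to a bank
|      | // other than SGPR.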
| 3613 | bool AMDGPURegisterBankInfo::isSALUMapping(const MachineInstr &MI) const { |
| 3614 | const MachineFunction &MF = *MI.getMF(); |
| 3615 | const MachineRegisterInfo &MRI = MF.getRegInfo(); |
| 3616 | for (const MachineOperand &MO : MI.operands()) { |
| 3617 | if (!MO.isReg()) |
| 3618 | continue; |
| 3619 | Register Reg = MO.getReg(); |
| 3620 | if (const RegisterBank *Bank = getRegBank(Reg, MRI, TRI: *TRI)) { |
| 3621 | if (Bank->getID() != AMDGPU::SGPRRegBankID) |
| 3622 | return false; |
| 3623 | } |
| 3624 | } |
| 3625 | return true; |
| 3626 | } |
| 3627 | |
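|      | // Map every register operand to the SGPR bank at its natural size.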
| 3628 | const RegisterBankInfo::InstructionMapping & |
| 3629 | AMDGPURegisterBankInfo::getDefaultMappingSOP(const MachineInstr &MI) const { |
| 3630 | const MachineFunction &MF = *MI.getMF(); |
| 3631 | const MachineRegisterInfo &MRI = MF.getRegInfo(); |
| 3632 | SmallVector<const ValueMapping*, 8> OpdsMapping(MI.getNumOperands()); |
| 3633 | |
| 3634 | for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) { |
| 3635 | const MachineOperand &SrcOp = MI.getOperand(i); |
| 3636 | if (!SrcOp.isReg()) |
| 3637 | continue; |
| 3638 | |
| 3639 | unsigned Size = getSizeInBits(Reg: SrcOp.getReg(), MRI, TRI: *TRI); |
| 3640 | OpdsMapping[i] = AMDGPU::getValueMapping(BankID: AMDGPU::SGPRRegBankID, Size); |
| 3641 | } |
| 3642 | return getInstructionMapping(ID: 1, Cost: 1, OperandsMapping: getOperandsMapping(OpdsMapping), |
| 3643 | NumOperands: MI.getNumOperands()); |
| 3644 | } |
| 3645 | |
| 3646 | const RegisterBankInfo::InstructionMapping & |
| 3647 | AMDGPURegisterBankInfo::getDefaultMappingVOP(const MachineInstr &MI) const { |
| 3648 | const MachineFunction &MF = *MI.getMF(); |
| 3649 | const MachineRegisterInfo &MRI = MF.getRegInfo(); |
| 3650 | SmallVector<const ValueMapping*, 8> OpdsMapping(MI.getNumOperands()); |
| 3651 | |
| 3652 | // Even though we technically could use SGPRs, this would require knowledge of |
| 3653 | // the constant bus restriction. Force all sources to VGPR (except for VCC). |
| 3654 | // |
| 3655 | // TODO: Unary ops are trivially OK, so accept SGPRs? |
| 3656 | for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) { |
| 3657 | const MachineOperand &Src = MI.getOperand(i); |
| 3658 | if (!Src.isReg()) |
| 3659 | continue; |
| 3660 | |
| 3661 | unsigned Size = getSizeInBits(Reg: Src.getReg(), MRI, TRI: *TRI); |
| 3662 | unsigned BankID = Size == 1 ? AMDGPU::VCCRegBankID : AMDGPU::VGPRRegBankID; |
| 3663 | OpdsMapping[i] = AMDGPU::getValueMapping(BankID, Size); |
| 3664 | } |
| 3665 | |
| 3666 | return getInstructionMapping(ID: 1, Cost: 1, OperandsMapping: getOperandsMapping(OpdsMapping), |
| 3667 | NumOperands: MI.getNumOperands()); |
| 3668 | } |
| 3669 | |
| 3670 | const RegisterBankInfo::InstructionMapping & |
| 3671 | AMDGPURegisterBankInfo::getDefaultMappingAllVGPR(const MachineInstr &MI) const { |
| 3672 | const MachineFunction &MF = *MI.getMF(); |
| 3673 | const MachineRegisterInfo &MRI = MF.getRegInfo(); |
| 3674 | SmallVector<const ValueMapping*, 8> OpdsMapping(MI.getNumOperands()); |
| 3675 | |
| 3676 | for (unsigned I = 0, E = MI.getNumOperands(); I != E; ++I) { |
| 3677 | const MachineOperand &Op = MI.getOperand(i: I); |
| 3678 | if (!Op.isReg()) |
| 3679 | continue; |
| 3680 | |
| 3681 | unsigned Size = getSizeInBits(Reg: Op.getReg(), MRI, TRI: *TRI); |
| 3682 | OpdsMapping[I] = AMDGPU::getValueMapping(BankID: AMDGPU::VGPRRegBankID, Size); |
| 3683 | } |
| 3684 | |
| 3685 | return getInstructionMapping(ID: 1, Cost: 1, OperandsMapping: getOperandsMapping(OpdsMapping), |
| 3686 | NumOperands: MI.getNumOperands()); |
| 3687 | } |
| 3688 | |
| 3689 | const RegisterBankInfo::InstructionMapping & |
| 3690 | AMDGPURegisterBankInfo::getImageMapping(const MachineRegisterInfo &MRI, |
| 3691 | const MachineInstr &MI, |
| 3692 | int RsrcIdx) const { |
| 3693 | // The reported argument index is relative to the IR intrinsic call arguments, |
| 3694 | // so we need to shift by the number of defs and the intrinsic ID. |
| 3695 | RsrcIdx += MI.getNumExplicitDefs() + 1; |
| 3696 | |
| 3697 | const int NumOps = MI.getNumOperands(); |
| 3698 | SmallVector<const ValueMapping *, 8> OpdsMapping(NumOps); |
| 3699 | |
| 3700 | // TODO: Should packed/unpacked D16 difference be reported here as part of |
| 3701 | // the value mapping? |
| 3702 | for (int I = 0; I != NumOps; ++I) { |
| 3703 | if (!MI.getOperand(i: I).isReg()) |
| 3704 | continue; |
| 3705 | |
| 3706 | Register OpReg = MI.getOperand(i: I).getReg(); |
| 3707 | // We replace some dead address operands with $noreg |
| 3708 | if (!OpReg) |
| 3709 | continue; |
| 3710 | |
| 3711 | unsigned Size = getSizeInBits(Reg: OpReg, MRI, TRI: *TRI); |
| 3712 | |
| 3713 | // FIXME: Probably need a new intrinsic register bank searchable table to |
| 3714 | // handle arbitrary intrinsics easily. |
| 3715 | // |
| 3716 | // If this has a sampler, it immediately follows rsrc. |
| 3717 | const bool MustBeSGPR = I == RsrcIdx || I == RsrcIdx + 1; |
| 3718 | |
| 3719 | if (MustBeSGPR) { |
| 3720 | // If this must be an SGPR, we must report whatever bank it currently has as legal. |
| 3721 | unsigned NewBank = getRegBankID(Reg: OpReg, MRI, Default: AMDGPU::SGPRRegBankID); |
| 3722 | OpdsMapping[I] = AMDGPU::getValueMapping(BankID: NewBank, Size); |
| 3723 | } else { |
| 3724 | // Some operands must be VGPR, and these are easy to copy to. |
| 3725 | OpdsMapping[I] = AMDGPU::getValueMapping(BankID: AMDGPU::VGPRRegBankID, Size); |
| 3726 | } |
| 3727 | } |
| 3728 | |
| 3729 | return getInstructionMapping(ID: 1, Cost: 1, OperandsMapping: getOperandsMapping(OpdsMapping), NumOperands: NumOps); |
| 3730 | } |
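|      | // Rough sketch of the intended behavior (commentary, not code): for an image |
|      | // sample intrinsic the rsrc (and, if present, sampler) operand is reported |
|      | // with whatever bank it already has, so a divergent descriptor is still |
|      | // "legal" here; applyMappingImpl is then expected to wrap the instruction in |
|      | // a waterfall loop to materialize uniform descriptors per iteration. |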
| 3731 | |
| 3732 | /// Return the mapping for a pointer argument. |
| 3733 | const RegisterBankInfo::ValueMapping * |
| 3734 | AMDGPURegisterBankInfo::getValueMappingForPtr(const MachineRegisterInfo &MRI, |
| 3735 | Register PtrReg) const { |
| 3736 | LLT PtrTy = MRI.getType(Reg: PtrReg); |
| 3737 | unsigned Size = PtrTy.getSizeInBits(); |
| 3738 | if (Subtarget.useFlatForGlobal() || |
| 3739 | !AMDGPU::isFlatGlobalAddrSpace(AS: PtrTy.getAddressSpace())) |
| 3740 | return AMDGPU::getValueMapping(BankID: AMDGPU::VGPRRegBankID, Size); |
| 3741 | |
| 3742 | // If we're using MUBUF instructions for global memory, an SGPR base register |
| 3743 | // is possible. Otherwise this needs to be a VGPR. |
| 3744 | const RegisterBank *PtrBank = getRegBank(Reg: PtrReg, MRI, TRI: *TRI); |
| 3745 | return AMDGPU::getValueMapping(BankID: PtrBank->getID(), Size); |
| 3746 | } |
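|      | // For example (illustrative): a uniform global (p1) pointer keeps its sgpr |
|      | // mapping here when MUBUF addressing is usable, while private/LDS pointers, |
|      | // or any pointer when useFlatForGlobal() is set, are mapped to vgpr. |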
| 3747 | |
| 3748 | const RegisterBankInfo::InstructionMapping & |
| 3749 | AMDGPURegisterBankInfo::getInstrMappingForLoad(const MachineInstr &MI) const { |
| 3750 | |
| 3751 | const MachineFunction &MF = *MI.getMF(); |
| 3752 | const MachineRegisterInfo &MRI = MF.getRegInfo(); |
| 3753 | SmallVector<const ValueMapping*, 2> OpdsMapping(2); |
| 3754 | unsigned Size = getSizeInBits(Reg: MI.getOperand(i: 0).getReg(), MRI, TRI: *TRI); |
| 3755 | Register PtrReg = MI.getOperand(i: 1).getReg(); |
| 3756 | LLT PtrTy = MRI.getType(Reg: PtrReg); |
| 3757 | unsigned AS = PtrTy.getAddressSpace(); |
| 3758 | unsigned PtrSize = PtrTy.getSizeInBits(); |
| 3759 | |
| 3760 | const ValueMapping *ValMapping; |
| 3761 | const ValueMapping *PtrMapping; |
| 3762 | |
| 3763 | const RegisterBank *PtrBank = getRegBank(Reg: PtrReg, MRI, TRI: *TRI); |
| 3764 | |
| 3765 | if (PtrBank == &AMDGPU::SGPRRegBank && AMDGPU::isFlatGlobalAddrSpace(AS)) { |
| 3766 | if (isScalarLoadLegal(MI)) { |
| 3767 | // We have a uniform instruction so we want to use an SMRD load |
| 3768 | ValMapping = AMDGPU::getValueMapping(BankID: AMDGPU::SGPRRegBankID, Size); |
| 3769 | PtrMapping = AMDGPU::getValueMapping(BankID: AMDGPU::SGPRRegBankID, Size: PtrSize); |
| 3770 | } else { |
| 3771 | ValMapping = AMDGPU::getValueMapping(BankID: AMDGPU::VGPRRegBankID, Size); |
| 3772 | |
| 3773 | // If we're using MUBUF instructions for global memory, an SGPR base |
| 3774 | // register is possible. Otherwise this needs to be a VGPR. |
| 3775 | unsigned PtrBankID = Subtarget.useFlatForGlobal() ? |
| 3776 | AMDGPU::VGPRRegBankID : AMDGPU::SGPRRegBankID; |
| 3777 | |
| 3778 | PtrMapping = AMDGPU::getValueMapping(BankID: PtrBankID, Size: PtrSize); |
| 3779 | } |
| 3780 | } else { |
| 3781 | ValMapping = AMDGPU::getValueMapping(BankID: AMDGPU::VGPRRegBankID, Size); |
| 3782 | PtrMapping = AMDGPU::getValueMapping(BankID: AMDGPU::VGPRRegBankID, Size: PtrSize); |
| 3783 | } |
| 3784 | |
| 3785 | OpdsMapping[0] = ValMapping; |
| 3786 | OpdsMapping[1] = PtrMapping; |
| 3787 | const RegisterBankInfo::InstructionMapping &Mapping = getInstructionMapping( |
| 3788 | ID: 1, Cost: 1, OperandsMapping: getOperandsMapping(OpdsMapping), NumOperands: MI.getNumOperands()); |
| 3789 | return Mapping; |
| 3790 | |
| 3791 | // FIXME: Do we want to add a mapping for FLAT load, or should we just |
| 3792 | // handle that during instruction selection? |
| 3793 | } |
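|      | // Illustrative outcomes (assumed MIR): a uniform global load that passes |
|      | // isScalarLoadLegal() maps fully to SGPRs, roughly |
|      | //   %val:sgpr(s32) = G_LOAD %ptr:sgpr(p1) :: (load (s32), addrspace 1) |
|      | // while a divergent pointer (or a load that fails the scalar-load checks) |
|      | // forces the result to vgpr, with the pointer bank chosen as above. |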
| 3794 | |
| 3795 | unsigned |
| 3796 | AMDGPURegisterBankInfo::getRegBankID(Register Reg, |
| 3797 | const MachineRegisterInfo &MRI, |
| 3798 | unsigned Default) const { |
| 3799 | const RegisterBank *Bank = getRegBank(Reg, MRI, TRI: *TRI); |
| 3800 | return Bank ? Bank->getID() : Default; |
| 3801 | } |
| 3802 | |
| 3803 | const RegisterBankInfo::ValueMapping * |
| 3804 | AMDGPURegisterBankInfo::getSGPROpMapping(Register Reg, |
| 3805 | const MachineRegisterInfo &MRI, |
| 3806 | const TargetRegisterInfo &TRI) const { |
| 3807 | // Lie and claim anything is legal, even though this needs to be an SGPR; |
| 3808 | // applyMapping will have to deal with it as a waterfall loop. |
| 3809 | unsigned Bank = getRegBankID(Reg, MRI, Default: AMDGPU::SGPRRegBankID); |
| 3810 | unsigned Size = getSizeInBits(Reg, MRI, TRI); |
| 3811 | return AMDGPU::getValueMapping(BankID: Bank, Size); |
| 3812 | } |
| 3813 | |
| 3814 | const RegisterBankInfo::ValueMapping * |
| 3815 | AMDGPURegisterBankInfo::getVGPROpMapping(Register Reg, |
| 3816 | const MachineRegisterInfo &MRI, |
| 3817 | const TargetRegisterInfo &TRI) const { |
| 3818 | unsigned Size = getSizeInBits(Reg, MRI, TRI); |
| 3819 | return AMDGPU::getValueMapping(BankID: AMDGPU::VGPRRegBankID, Size); |
| 3820 | } |
| 3821 | |
| 3822 | const RegisterBankInfo::ValueMapping * |
| 3823 | AMDGPURegisterBankInfo::getAGPROpMapping(Register Reg, |
| 3824 | const MachineRegisterInfo &MRI, |
| 3825 | const TargetRegisterInfo &TRI) const { |
| 3826 | unsigned Size = getSizeInBits(Reg, MRI, TRI); |
| 3827 | return AMDGPU::getValueMapping(BankID: AMDGPU::AGPRRegBankID, Size); |
| 3828 | } |
| 3829 | |
| 3830 | /// |
| 3831 | /// This function must return a legal mapping, because |
| 3832 | /// AMDGPURegisterBankInfo::getInstrAlternativeMappings() is not called |
| 3833 | /// in RegBankSelect::Mode::Fast. Any mapping that would cause a |
| 3834 | /// VGPR to SGPR copy to be generated is illegal. |
| 3835 | /// |
| 3836 | // Operands that must be SGPRs must accept potentially divergent VGPRs as |
| 3837 | // legal. These will be dealt with in applyMappingImpl. |
| 3838 | // |
| 3839 | const RegisterBankInfo::InstructionMapping & |
| 3840 | AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { |
| 3841 | const MachineFunction &MF = *MI.getMF(); |
| 3842 | const MachineRegisterInfo &MRI = MF.getRegInfo(); |
| 3843 | |
| 3844 | if (MI.isCopy() || MI.getOpcode() == AMDGPU::G_FREEZE) { |
| 3845 | Register DstReg = MI.getOperand(i: 0).getReg(); |
| 3846 | Register SrcReg = MI.getOperand(i: 1).getReg(); |
| 3847 | |
| 3848 | // The default logic bothers to analyze impossible alternative mappings. We |
| 3849 | // want the most straightforward mapping, so just directly handle this. |
| 3850 | const RegisterBank *DstBank = getRegBank(Reg: DstReg, MRI, TRI: *TRI); |
| 3851 | const RegisterBank *SrcBank = getRegBank(Reg: SrcReg, MRI, TRI: *TRI); |
| 3852 | |
| 3853 | // For a COPY between a physical reg and an s1 virtual reg, the physical side |
| 3854 | // has no associated type, so we take the virtual register's type as a hint on |
| 3855 | // how to interpret s1 values. |
| 3856 | unsigned Size; |
| 3857 | if (!SrcReg.isVirtual() && !DstBank && |
| 3858 | MRI.getType(Reg: DstReg) == LLT::scalar(SizeInBits: 1)) { |
| 3859 | DstBank = &AMDGPU::VCCRegBank; |
| 3860 | Size = 1; |
| 3861 | } else if (!DstReg.isVirtual() && MRI.getType(Reg: SrcReg) == LLT::scalar(SizeInBits: 1)) { |
| 3862 | DstBank = &AMDGPU::VCCRegBank; |
| 3863 | Size = 1; |
| 3864 | } else { |
| 3865 | Size = getSizeInBits(Reg: DstReg, MRI, TRI: *TRI); |
| 3866 | } |
| 3867 | |
| 3868 | if (!DstBank) |
| 3869 | DstBank = SrcBank; |
| 3870 | else if (!SrcBank) |
| 3871 | SrcBank = DstBank; |
| 3872 | |
| 3873 | if (MI.getOpcode() != AMDGPU::G_FREEZE && |
| 3874 | cannotCopy(Dst: *DstBank, Src: *SrcBank, Size: TypeSize::getFixed(ExactSize: Size))) |
| 3875 | return getInvalidInstructionMapping(); |
| 3876 | |
| 3877 | const ValueMapping &ValMap = getValueMapping(StartIdx: 0, Length: Size, RegBank: *DstBank); |
| 3878 | unsigned OpdsMappingSize = MI.isCopy() ? 1 : 2; |
| 3879 | SmallVector<const ValueMapping *, 1> OpdsMapping(OpdsMappingSize); |
| 3880 | OpdsMapping[0] = &ValMap; |
| 3881 | if (MI.getOpcode() == AMDGPU::G_FREEZE) |
| 3882 | OpdsMapping[1] = &ValMap; |
| 3883 | |
| 3884 | return getInstructionMapping( |
| 3885 | ID: 1, /*Cost*/ 1, |
| 3886 | /*OperandsMapping*/ getOperandsMapping(OpdsMapping), NumOperands: OpdsMappingSize); |
| 3887 | } |
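|      | // Hypothetical example: a lane-mask value copied from a physical register, |
|      | //   %mask:_(s1) = COPY $sgpr0_sgpr1 |
|      | // has no type on the physical side, so the s1 on the virtual side is taken |
|      | // to mean a vcc-bank value here (wave64 shown; a single SGPR plays the same |
|      | // role in wave32). |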
| 3888 | |
| 3889 | if (MI.isRegSequence()) { |
| 3890 | // If any input is a VGPR, the result must be a VGPR. The default handling |
| 3891 | // assumes any copy between banks is legal. |
| 3892 | unsigned BankID = AMDGPU::SGPRRegBankID; |
| 3893 | |
| 3894 | for (unsigned I = 1, E = MI.getNumOperands(); I != E; I += 2) { |
| 3895 | auto OpBank = getRegBankID(Reg: MI.getOperand(i: I).getReg(), MRI); |
| 3896 | // It doesn't make sense to use vcc or scc banks here, so just ignore |
| 3897 | // them. |
| 3898 | if (OpBank != AMDGPU::SGPRRegBankID) { |
| 3899 | BankID = AMDGPU::VGPRRegBankID; |
| 3900 | break; |
| 3901 | } |
| 3902 | } |
| 3903 | unsigned Size = getSizeInBits(Reg: MI.getOperand(i: 0).getReg(), MRI, TRI: *TRI); |
| 3904 | |
| 3905 | const ValueMapping &ValMap = getValueMapping(StartIdx: 0, Length: Size, RegBank: getRegBank(ID: BankID)); |
| 3906 | return getInstructionMapping( |
| 3907 | ID: 1, /*Cost*/ 1, |
| 3908 | /*OperandsMapping*/ getOperandsMapping(OpdsMapping: {&ValMap}), NumOperands: 1); |
| 3909 | } |
| 3910 | |
| 3911 | // The default handling is broken and doesn't handle illegal VGPR->SGPR copies |
| 3912 | // properly. |
| 3913 | // |
| 3914 | // TODO: There are additional exec masking dependencies to analyze. |
| 3915 | if (auto *PHI = dyn_cast<GPhi>(Val: &MI)) { |
| 3916 | unsigned ResultBank = AMDGPU::InvalidRegBankID; |
| 3917 | Register DstReg = PHI->getReg(Idx: 0); |
| 3918 | |
| 3919 | // Sometimes the result may have already been assigned a bank. |
| 3920 | if (const RegisterBank *DstBank = getRegBank(Reg: DstReg, MRI, TRI: *TRI)) |
| 3921 | ResultBank = DstBank->getID(); |
| 3922 | |
| 3923 | for (unsigned I = 0; I < PHI->getNumIncomingValues(); ++I) { |
| 3924 | Register Reg = PHI->getIncomingValue(I); |
| 3925 | const RegisterBank *Bank = getRegBank(Reg, MRI, TRI: *TRI); |
| 3926 | |
| 3927 | // FIXME: Assuming VGPR for any undetermined inputs. |
| 3928 | if (!Bank || Bank->getID() == AMDGPU::VGPRRegBankID) { |
| 3929 | ResultBank = AMDGPU::VGPRRegBankID; |
| 3930 | break; |
| 3931 | } |
| 3932 | |
| 3933 | // FIXME: Need to promote SGPR case to s32 |
| 3934 | unsigned OpBank = Bank->getID(); |
| 3935 | ResultBank = regBankBoolUnion(RB0: ResultBank, RB1: OpBank); |
| 3936 | } |
| 3937 | |
| 3938 | assert(ResultBank != AMDGPU::InvalidRegBankID); |
| 3939 | |
| 3940 | unsigned Size = MRI.getType(Reg: DstReg).getSizeInBits(); |
| 3941 | |
| 3942 | const ValueMapping &ValMap = |
| 3943 | getValueMapping(StartIdx: 0, Length: Size, RegBank: getRegBank(ID: ResultBank)); |
| 3944 | return getInstructionMapping( |
| 3945 | ID: 1, /*Cost*/ 1, |
| 3946 | /*OperandsMapping*/ getOperandsMapping(OpdsMapping: {&ValMap}), NumOperands: 1); |
| 3947 | } |
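|      | // For example (illustrative only): a phi merging one sgpr and one vgpr input |
|      | // is given a vgpr result, since an sgpr result would require an illegal |
|      | // VGPR->SGPR copy on the vgpr edge; for s1 lane-mask phis the boolean bank |
|      | // union above picks vcc once any incoming value is vcc. |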
| 3948 | |
| 3949 | const RegisterBankInfo::InstructionMapping &Mapping = getInstrMappingImpl(MI); |
| 3950 | if (Mapping.isValid()) |
| 3951 | return Mapping; |
| 3952 | |
| 3953 | SmallVector<const ValueMapping*, 8> OpdsMapping(MI.getNumOperands()); |
| 3954 | |
| 3955 | switch (MI.getOpcode()) { |
| 3956 | default: |
| 3957 | return getInvalidInstructionMapping(); |
| 3958 | |
| 3959 | case AMDGPU::G_AND: |
| 3960 | case AMDGPU::G_OR: |
| 3961 | case AMDGPU::G_XOR: |
| 3962 | case AMDGPU::G_MUL: { |
| 3963 | unsigned Size = MRI.getType(Reg: MI.getOperand(i: 0).getReg()).getSizeInBits(); |
| 3964 | if (Size == 1) { |
| 3965 | const RegisterBank *DstBank |
| 3966 | = getRegBank(Reg: MI.getOperand(i: 0).getReg(), MRI, TRI: *TRI); |
| 3967 | |
| 3968 | unsigned TargetBankID = AMDGPU::InvalidRegBankID; |
| 3969 | unsigned BankLHS = AMDGPU::InvalidRegBankID; |
| 3970 | unsigned BankRHS = AMDGPU::InvalidRegBankID; |
| 3971 | if (DstBank) { |
| 3972 | TargetBankID = DstBank->getID(); |
| 3973 | if (DstBank == &AMDGPU::VCCRegBank) { |
| 3974 | TargetBankID = AMDGPU::VCCRegBankID; |
| 3975 | BankLHS = AMDGPU::VCCRegBankID; |
| 3976 | BankRHS = AMDGPU::VCCRegBankID; |
| 3977 | } else { |
| 3978 | BankLHS = getRegBankID(Reg: MI.getOperand(i: 1).getReg(), MRI, |
| 3979 | Default: AMDGPU::SGPRRegBankID); |
| 3980 | BankRHS = getRegBankID(Reg: MI.getOperand(i: 2).getReg(), MRI, |
| 3981 | Default: AMDGPU::SGPRRegBankID); |
| 3982 | } |
| 3983 | } else { |
| 3984 | BankLHS = getRegBankID(Reg: MI.getOperand(i: 1).getReg(), MRI, |
| 3985 | Default: AMDGPU::VCCRegBankID); |
| 3986 | BankRHS = getRegBankID(Reg: MI.getOperand(i: 2).getReg(), MRI, |
| 3987 | Default: AMDGPU::VCCRegBankID); |
| 3988 | |
| 3989 | // Both inputs should be true booleans to produce a boolean result. |
| 3990 | if (BankLHS == AMDGPU::VGPRRegBankID || BankRHS == AMDGPU::VGPRRegBankID) { |
| 3991 | TargetBankID = AMDGPU::VGPRRegBankID; |
| 3992 | } else if (BankLHS == AMDGPU::VCCRegBankID || BankRHS == AMDGPU::VCCRegBankID) { |
| 3993 | TargetBankID = AMDGPU::VCCRegBankID; |
| 3994 | BankLHS = AMDGPU::VCCRegBankID; |
| 3995 | BankRHS = AMDGPU::VCCRegBankID; |
| 3996 | } else if (BankLHS == AMDGPU::SGPRRegBankID && BankRHS == AMDGPU::SGPRRegBankID) { |
| 3997 | TargetBankID = AMDGPU::SGPRRegBankID; |
| 3998 | } |
| 3999 | } |
| 4000 | |
| 4001 | OpdsMapping[0] = AMDGPU::getValueMapping(BankID: TargetBankID, Size); |
| 4002 | OpdsMapping[1] = AMDGPU::getValueMapping(BankID: BankLHS, Size); |
| 4003 | OpdsMapping[2] = AMDGPU::getValueMapping(BankID: BankRHS, Size); |
| 4004 | break; |
| 4005 | } |
| 4006 | |
| 4007 | if (Size == 64) { |
| 4009 | if (isSALUMapping(MI)) { |
| 4010 | OpdsMapping[0] = getValueMappingSGPR64Only(BankID: AMDGPU::SGPRRegBankID, Size); |
| 4011 | OpdsMapping[1] = OpdsMapping[2] = OpdsMapping[0]; |
| 4012 | } else { |
| 4013 | if (MI.getOpcode() == AMDGPU::G_MUL && Subtarget.hasVectorMulU64()) |
| 4014 | OpdsMapping[0] = AMDGPU::getValueMapping(BankID: AMDGPU::VGPRRegBankID, Size); |
| 4015 | else |
| 4016 | OpdsMapping[0] = |
| 4017 | getValueMappingSGPR64Only(BankID: AMDGPU::VGPRRegBankID, Size); |
| 4018 | unsigned Bank1 = getRegBankID(Reg: MI.getOperand(i: 1).getReg(), MRI /*, DefaultBankID*/); |
| 4019 | OpdsMapping[1] = AMDGPU::getValueMapping(BankID: Bank1, Size); |
| 4020 | |
| 4021 | unsigned Bank2 = getRegBankID(Reg: MI.getOperand(i: 2).getReg(), MRI /*, DefaultBankID*/); |
| 4022 | OpdsMapping[2] = AMDGPU::getValueMapping(BankID: Bank2, Size); |
| 4023 | } |
| 4024 | |
| 4025 | break; |
| 4026 | } |
| 4027 | |
| 4028 | [[fallthrough]]; |
| 4029 | } |
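|      | // Worked example for the s1 case above (assumed, not from a test): if the |
|      | // destination is already constrained to vcc, both sources are forced to vcc |
|      | // so the whole op stays a lane-mask operation; if the result is |
|      | // unconstrained and neither operand is a true boolean (both plain sgpr), the |
|      | // op is kept on the SGPR bank instead. |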
| 4030 | case AMDGPU::G_PTR_ADD: |
| 4031 | case AMDGPU::G_PTRMASK: |
| 4032 | case AMDGPU::G_ADD: |
| 4033 | case AMDGPU::G_SUB: |
| 4034 | case AMDGPU::G_SHL: |
| 4035 | case AMDGPU::G_LSHR: |
| 4036 | case AMDGPU::G_ASHR: |
| 4037 | case AMDGPU::G_UADDO: |
| 4038 | case AMDGPU::G_USUBO: |
| 4039 | case AMDGPU::G_UADDE: |
| 4040 | case AMDGPU::G_SADDE: |
| 4041 | case AMDGPU::G_USUBE: |
| 4042 | case AMDGPU::G_SSUBE: |
| 4043 | case AMDGPU::G_ABS: |
| 4044 | case AMDGPU::G_SHUFFLE_VECTOR: |
| 4045 | case AMDGPU::G_SBFX: |
| 4046 | case AMDGPU::G_UBFX: |
| 4047 | case AMDGPU::G_AMDGPU_S_MUL_I64_I32: |
| 4048 | case AMDGPU::G_AMDGPU_S_MUL_U64_U32: |
| 4049 | if (isSALUMapping(MI)) |
| 4050 | return getDefaultMappingSOP(MI); |
| 4051 | return getDefaultMappingVOP(MI); |
| 4052 | case AMDGPU::G_SMIN: |
| 4053 | case AMDGPU::G_SMAX: |
| 4054 | case AMDGPU::G_UMIN: |
| 4055 | case AMDGPU::G_UMAX: |
| 4056 | if (isSALUMapping(MI)) { |
| 4057 | // There are no scalar 64-bit min and max instructions; use the vector form instead. |
| 4058 | if (MRI.getType(Reg: MI.getOperand(i: 0).getReg()).getSizeInBits() == 64 && |
| 4059 | Subtarget.hasIntMinMax64()) |
| 4060 | return getDefaultMappingVOP(MI); |
| 4061 | return getDefaultMappingSOP(MI); |
| 4062 | } |
| 4063 | return getDefaultMappingVOP(MI); |
| 4064 | case AMDGPU::G_FADD: |
| 4065 | case AMDGPU::G_FSUB: |
| 4066 | case AMDGPU::G_FMUL: |
| 4067 | case AMDGPU::G_FMA: |
| 4068 | case AMDGPU::G_FFLOOR: |
| 4069 | case AMDGPU::G_FCEIL: |
| 4070 | case AMDGPU::G_INTRINSIC_ROUNDEVEN: |
| 4071 | case AMDGPU::G_FMINNUM: |
| 4072 | case AMDGPU::G_FMAXNUM: |
| 4073 | case AMDGPU::G_FMINIMUM: |
| 4074 | case AMDGPU::G_FMAXIMUM: |
| 4075 | case AMDGPU::G_FMINIMUMNUM: |
| 4076 | case AMDGPU::G_FMAXIMUMNUM: |
| 4077 | case AMDGPU::G_INTRINSIC_TRUNC: |
| 4078 | case AMDGPU::G_STRICT_FADD: |
| 4079 | case AMDGPU::G_STRICT_FSUB: |
| 4080 | case AMDGPU::G_STRICT_FMUL: |
| 4081 | case AMDGPU::G_STRICT_FMA: { |
| 4082 | LLT Ty = MRI.getType(Reg: MI.getOperand(i: 0).getReg()); |
| 4083 | unsigned Size = Ty.getSizeInBits(); |
| 4084 | if (Subtarget.hasSALUFloatInsts() && Ty.isScalar() && |
| 4085 | (Size == 32 || Size == 16) && isSALUMapping(MI)) |
| 4086 | return getDefaultMappingSOP(MI); |
| 4087 | return getDefaultMappingVOP(MI); |
| 4088 | } |
| 4089 | case AMDGPU::G_FPTOSI: |
| 4090 | case AMDGPU::G_FPTOUI: |
| 4091 | case AMDGPU::G_FPTOSI_SAT: |
| 4092 | case AMDGPU::G_FPTOUI_SAT: |
| 4093 | case AMDGPU::G_SITOFP: |
| 4094 | case AMDGPU::G_UITOFP: { |
| 4095 | unsigned SizeDst = MRI.getType(Reg: MI.getOperand(i: 0).getReg()).getSizeInBits(); |
| 4096 | unsigned SizeSrc = MRI.getType(Reg: MI.getOperand(i: 1).getReg()).getSizeInBits(); |
| 4097 | if (Subtarget.hasSALUFloatInsts() && SizeDst == 32 && SizeSrc == 32 && |
| 4098 | isSALUMapping(MI)) |
| 4099 | return getDefaultMappingSOP(MI); |
| 4100 | return getDefaultMappingVOP(MI); |
| 4101 | } |
| 4102 | case AMDGPU::G_FPTRUNC: |
| 4103 | case AMDGPU::G_FPEXT: { |
| 4104 | unsigned SizeDst = MRI.getType(Reg: MI.getOperand(i: 0).getReg()).getSizeInBits(); |
| 4105 | unsigned SizeSrc = MRI.getType(Reg: MI.getOperand(i: 1).getReg()).getSizeInBits(); |
| 4106 | if (Subtarget.hasSALUFloatInsts() && SizeDst != 64 && SizeSrc != 64 && |
| 4107 | isSALUMapping(MI)) |
| 4108 | return getDefaultMappingSOP(MI); |
| 4109 | return getDefaultMappingVOP(MI); |
| 4110 | } |
| 4111 | case AMDGPU::G_FSQRT: |
| 4112 | case AMDGPU::G_FEXP2: |
| 4113 | case AMDGPU::G_FLOG2: { |
| 4114 | unsigned Size = MRI.getType(Reg: MI.getOperand(i: 0).getReg()).getSizeInBits(); |
| 4115 | if (Subtarget.hasPseudoScalarTrans() && (Size == 16 || Size == 32) && |
| 4116 | isSALUMapping(MI)) |
| 4117 | return getDefaultMappingSOP(MI); |
| 4118 | return getDefaultMappingVOP(MI); |
| 4119 | } |
| 4120 | case AMDGPU::G_SADDSAT: // FIXME: Could lower sat ops for SALU |
| 4121 | case AMDGPU::G_SSUBSAT: |
| 4122 | case AMDGPU::G_UADDSAT: |
| 4123 | case AMDGPU::G_USUBSAT: |
| 4124 | case AMDGPU::G_FMAD: |
| 4125 | case AMDGPU::G_FLDEXP: |
| 4126 | case AMDGPU::G_FMINNUM_IEEE: |
| 4127 | case AMDGPU::G_FMAXNUM_IEEE: |
| 4128 | case AMDGPU::G_FCANONICALIZE: |
| 4129 | case AMDGPU::G_STRICT_FLDEXP: |
| 4130 | case AMDGPU::G_BSWAP: // TODO: Somehow expand for scalar? |
| 4131 | case AMDGPU::G_FSHR: // TODO: Expand for scalar |
| 4132 | case AMDGPU::G_AMDGPU_FMIN_LEGACY: |
| 4133 | case AMDGPU::G_AMDGPU_FMAX_LEGACY: |
| 4134 | case AMDGPU::G_AMDGPU_RCP_IFLAG: |
| 4135 | case AMDGPU::G_AMDGPU_CVT_F32_UBYTE0: |
| 4136 | case AMDGPU::G_AMDGPU_CVT_F32_UBYTE1: |
| 4137 | case AMDGPU::G_AMDGPU_CVT_F32_UBYTE2: |
| 4138 | case AMDGPU::G_AMDGPU_CVT_F32_UBYTE3: |
| 4139 | case AMDGPU::G_AMDGPU_CVT_PK_I16_I32: |
| 4140 | case AMDGPU::G_AMDGPU_SMED3: |
| 4141 | case AMDGPU::G_AMDGPU_FMED3: |
| 4142 | return getDefaultMappingVOP(MI); |
| 4143 | case AMDGPU::G_UMULH: |
| 4144 | case AMDGPU::G_SMULH: { |
| 4145 | if (Subtarget.hasScalarMulHiInsts() && isSALUMapping(MI)) |
| 4146 | return getDefaultMappingSOP(MI); |
| 4147 | return getDefaultMappingVOP(MI); |
| 4148 | } |
| 4149 | case AMDGPU::G_AMDGPU_MAD_U64_U32: |
| 4150 | case AMDGPU::G_AMDGPU_MAD_I64_I32: { |
| 4151 | // Three possible mappings: |
| 4152 | // |
| 4153 | // - Default SOP |
| 4154 | // - Default VOP |
| 4155 | // - Scalar multiply: src0 and src1 are SGPRs, the rest is VOP. |
| 4156 | // |
| 4157 | // This allows instruction selection to keep the multiplication part of the |
| 4158 | // instruction on the SALU. |
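|      | // A rough picture of the third (mixed) mapping, using assumed operand order |
|      | // (dst, carry-out, src0, src1, src2): src0/src1 stay sgpr for the scalar |
|      | // multiply, while the 64-bit accumulator input and the result are vgpr and |
|      | // the carry-out is a vcc lane mask, matching the mapping built below. |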
| 4159 | bool AllSalu = true; |
| 4160 | bool MulSalu = true; |
| 4161 | for (unsigned i = 0; i < 5; ++i) { |
| 4162 | Register Reg = MI.getOperand(i).getReg(); |
| 4163 | if (const RegisterBank *Bank = getRegBank(Reg, MRI, TRI: *TRI)) { |
| 4164 | if (Bank->getID() != AMDGPU::SGPRRegBankID) { |
| 4165 | AllSalu = false; |
| 4166 | if (i == 2 || i == 3) { |
| 4167 | MulSalu = false; |
| 4168 | break; |
| 4169 | } |
| 4170 | } |
| 4171 | } |
| 4172 | } |
| 4173 | |
| 4174 | if (AllSalu) |
| 4175 | return getDefaultMappingSOP(MI); |
| 4176 | |
| 4177 | // If the multiply-add is full-rate in VALU, use that even if the |
| 4178 | // multiplication part is scalar. Accumulating separately on the VALU would |
| 4179 | // take two instructions. |
| 4180 | if (!MulSalu || Subtarget.hasFullRate64Ops()) |
| 4181 | return getDefaultMappingVOP(MI); |
| 4182 | |
| 4183 | // Keep the multiplication on the SALU, then accumulate on the VALU. |
| 4184 | OpdsMapping[0] = AMDGPU::getValueMapping(BankID: AMDGPU::VGPRRegBankID, Size: 64); |
| 4185 | OpdsMapping[1] = AMDGPU::getValueMapping(BankID: AMDGPU::VCCRegBankID, Size: 1); |
| 4186 | OpdsMapping[2] = AMDGPU::getValueMapping(BankID: AMDGPU::SGPRRegBankID, Size: 32); |
| 4187 | OpdsMapping[3] = AMDGPU::getValueMapping(BankID: AMDGPU::SGPRRegBankID, Size: 32); |
| 4188 | OpdsMapping[4] = AMDGPU::getValueMapping(BankID: AMDGPU::VGPRRegBankID, Size: 64); |
| 4189 | break; |
| 4190 | } |
| 4191 | case AMDGPU::G_IMPLICIT_DEF: { |
| 4192 | unsigned Size = MRI.getType(Reg: MI.getOperand(i: 0).getReg()).getSizeInBits(); |
| 4193 | OpdsMapping[0] = AMDGPU::getValueMapping(BankID: AMDGPU::SGPRRegBankID, Size); |
| 4194 | break; |
| 4195 | } |
| 4196 | case AMDGPU::G_FCONSTANT: |
| 4197 | case AMDGPU::G_CONSTANT: |
| 4198 | case AMDGPU::G_GLOBAL_VALUE: |
| 4199 | case AMDGPU::G_FRAME_INDEX: |
| 4200 | case AMDGPU::G_BLOCK_ADDR: |
| 4201 | case AMDGPU::G_READSTEADYCOUNTER: |
| 4202 | case AMDGPU::G_READCYCLECOUNTER: { |
| 4203 | unsigned Size = MRI.getType(Reg: MI.getOperand(i: 0).getReg()).getSizeInBits(); |
| 4204 | OpdsMapping[0] = AMDGPU::getValueMapping(BankID: AMDGPU::SGPRRegBankID, Size); |
| 4205 | break; |
| 4206 | } |
| 4207 | case AMDGPU::G_DYN_STACKALLOC: { |
| 4208 | // Result is always uniform, and a wave reduction is needed for the source. |
| 4209 | OpdsMapping[0] = AMDGPU::getValueMapping(BankID: AMDGPU::SGPRRegBankID, Size: 32); |
| 4210 | unsigned SrcBankID = getRegBankID(Reg: MI.getOperand(i: 1).getReg(), MRI); |
| 4211 | OpdsMapping[1] = AMDGPU::getValueMapping(BankID: SrcBankID, Size: 32); |
| 4212 | break; |
| 4213 | } |
| 4214 | case AMDGPU::G_AMDGPU_WAVE_ADDRESS: { |
| 4215 | // This case is weird because we expect a physical register in the source, |
| 4216 | // but need to set a bank anyway. |
| 4217 | // |
| 4218 | // TODO: We could select the result to SGPR or VGPR |
| 4219 | OpdsMapping[0] = AMDGPU::getValueMapping(BankID: AMDGPU::SGPRRegBankID, Size: 32); |
| 4220 | OpdsMapping[1] = AMDGPU::getValueMapping(BankID: AMDGPU::SGPRRegBankID, Size: 32); |
| 4221 | break; |
| 4222 | } |
| 4223 | case AMDGPU::G_INSERT: { |
| 4224 | unsigned BankID = getMappingType(MRI, MI); |
| 4225 | unsigned DstSize = getSizeInBits(Reg: MI.getOperand(i: 0).getReg(), MRI, TRI: *TRI); |
| 4226 | unsigned SrcSize = getSizeInBits(Reg: MI.getOperand(i: 1).getReg(), MRI, TRI: *TRI); |
| 4227 | unsigned EltSize = getSizeInBits(Reg: MI.getOperand(i: 2).getReg(), MRI, TRI: *TRI); |
| 4228 | OpdsMapping[0] = AMDGPU::getValueMapping(BankID, Size: DstSize); |
| 4229 | OpdsMapping[1] = AMDGPU::getValueMapping(BankID, Size: SrcSize); |
| 4230 | OpdsMapping[2] = AMDGPU::getValueMapping(BankID, Size: EltSize); |
| 4231 | OpdsMapping[3] = nullptr; |
| 4232 | break; |
| 4233 | } |
| 4234 | case AMDGPU::G_EXTRACT: { |
| 4235 | unsigned BankID = getRegBankID(Reg: MI.getOperand(i: 1).getReg(), MRI); |
| 4236 | unsigned DstSize = getSizeInBits(Reg: MI.getOperand(i: 0).getReg(), MRI, TRI: *TRI); |
| 4237 | unsigned SrcSize = getSizeInBits(Reg: MI.getOperand(i: 1).getReg(), MRI, TRI: *TRI); |
| 4238 | OpdsMapping[0] = AMDGPU::getValueMapping(BankID, Size: DstSize); |
| 4239 | OpdsMapping[1] = AMDGPU::getValueMapping(BankID, Size: SrcSize); |
| 4240 | OpdsMapping[2] = nullptr; |
| 4241 | break; |
| 4242 | } |
| 4243 | case AMDGPU::G_BUILD_VECTOR: |
| 4244 | case AMDGPU::G_BUILD_VECTOR_TRUNC: { |
| 4245 | LLT DstTy = MRI.getType(Reg: MI.getOperand(i: 0).getReg()); |
| 4246 | if (DstTy == LLT::fixed_vector(NumElements: 2, ScalarSizeInBits: 16)) { |
| 4247 | unsigned DstSize = DstTy.getSizeInBits(); |
| 4248 | unsigned SrcSize = MRI.getType(Reg: MI.getOperand(i: 1).getReg()).getSizeInBits(); |
| 4249 | unsigned Src0BankID = getRegBankID(Reg: MI.getOperand(i: 1).getReg(), MRI); |
| 4250 | unsigned Src1BankID = getRegBankID(Reg: MI.getOperand(i: 2).getReg(), MRI); |
| 4251 | unsigned DstBankID = regBankUnion(RB0: Src0BankID, RB1: Src1BankID); |
| 4252 | |
| 4253 | OpdsMapping[0] = AMDGPU::getValueMapping(BankID: DstBankID, Size: DstSize); |
| 4254 | OpdsMapping[1] = AMDGPU::getValueMapping(BankID: Src0BankID, Size: SrcSize); |
| 4255 | OpdsMapping[2] = AMDGPU::getValueMapping(BankID: Src1BankID, Size: SrcSize); |
| 4256 | break; |
| 4257 | } |
| 4258 | |
| 4259 | [[fallthrough]]; |
| 4260 | } |
| 4261 | case AMDGPU::G_MERGE_VALUES: |
| 4262 | case AMDGPU::G_CONCAT_VECTORS: { |
| 4263 | unsigned Bank = getMappingType(MRI, MI); |
| 4264 | unsigned DstSize = MRI.getType(Reg: MI.getOperand(i: 0).getReg()).getSizeInBits(); |
| 4265 | unsigned SrcSize = MRI.getType(Reg: MI.getOperand(i: 1).getReg()).getSizeInBits(); |
| 4266 | |
| 4267 | OpdsMapping[0] = AMDGPU::getValueMapping(BankID: Bank, Size: DstSize); |
| 4268 | // Op1 and Dst should use the same register bank. |
| 4269 | for (unsigned i = 1, e = MI.getNumOperands(); i != e; ++i) |
| 4270 | OpdsMapping[i] = AMDGPU::getValueMapping(BankID: Bank, Size: SrcSize); |
| 4271 | break; |
| 4272 | } |
| 4273 | case AMDGPU::G_BITREVERSE: |
| 4274 | case AMDGPU::G_BITCAST: |
| 4275 | case AMDGPU::G_INTTOPTR: |
| 4276 | case AMDGPU::G_PTRTOINT: |
| 4277 | case AMDGPU::G_FABS: |
| 4278 | case AMDGPU::G_FNEG: { |
| 4279 | unsigned Size = MRI.getType(Reg: MI.getOperand(i: 0).getReg()).getSizeInBits(); |
| 4280 | unsigned BankID = getRegBankID(Reg: MI.getOperand(i: 1).getReg(), MRI); |
| 4281 | OpdsMapping[0] = OpdsMapping[1] = AMDGPU::getValueMapping(BankID, Size); |
| 4282 | break; |
| 4283 | } |
| 4284 | case AMDGPU::G_AMDGPU_FFBH_U32: |
| 4285 | case AMDGPU::G_AMDGPU_FFBL_B32: |
| 4286 | case AMDGPU::G_CTLZ_ZERO_UNDEF: |
| 4287 | case AMDGPU::G_CTTZ_ZERO_UNDEF: { |
| 4288 | unsigned Size = MRI.getType(Reg: MI.getOperand(i: 1).getReg()).getSizeInBits(); |
| 4289 | unsigned BankID = getRegBankID(Reg: MI.getOperand(i: 1).getReg(), MRI); |
| 4290 | OpdsMapping[0] = AMDGPU::getValueMapping(BankID, Size: 32); |
| 4291 | OpdsMapping[1] = AMDGPU::getValueMappingSGPR64Only(BankID, Size); |
| 4292 | break; |
| 4293 | } |
| 4294 | case AMDGPU::G_CTPOP: { |
| 4295 | unsigned Size = MRI.getType(Reg: MI.getOperand(i: 1).getReg()).getSizeInBits(); |
| 4296 | unsigned BankID = getRegBankID(Reg: MI.getOperand(i: 1).getReg(), MRI); |
| 4297 | OpdsMapping[0] = AMDGPU::getValueMapping(BankID, Size: 32); |
| 4298 | |
| 4299 | // This should really be getValueMappingSGPR64Only, but allowing the generic |
| 4300 | // code to handle the register split just makes using LegalizerHelper more |
| 4301 | // difficult. |
| 4302 | OpdsMapping[1] = AMDGPU::getValueMapping(BankID, Size); |
| 4303 | break; |
| 4304 | } |
| 4305 | case AMDGPU::G_TRUNC: { |
| 4306 | Register Dst = MI.getOperand(i: 0).getReg(); |
| 4307 | Register Src = MI.getOperand(i: 1).getReg(); |
| 4308 | unsigned Bank = getRegBankID(Reg: Src, MRI); |
| 4309 | unsigned DstSize = getSizeInBits(Reg: Dst, MRI, TRI: *TRI); |
| 4310 | unsigned SrcSize = getSizeInBits(Reg: Src, MRI, TRI: *TRI); |
| 4311 | OpdsMapping[0] = AMDGPU::getValueMapping(BankID: Bank, Size: DstSize); |
| 4312 | OpdsMapping[1] = AMDGPU::getValueMapping(BankID: Bank, Size: SrcSize); |
| 4313 | break; |
| 4314 | } |
| 4315 | case AMDGPU::G_ZEXT: |
| 4316 | case AMDGPU::G_SEXT: |
| 4317 | case AMDGPU::G_ANYEXT: |
| 4318 | case AMDGPU::G_SEXT_INREG: { |
| 4319 | Register Dst = MI.getOperand(i: 0).getReg(); |
| 4320 | Register Src = MI.getOperand(i: 1).getReg(); |
| 4321 | unsigned DstSize = getSizeInBits(Reg: Dst, MRI, TRI: *TRI); |
| 4322 | unsigned SrcSize = getSizeInBits(Reg: Src, MRI, TRI: *TRI); |
| 4323 | |
| 4324 | unsigned DstBank; |
| 4325 | const RegisterBank *SrcBank = getRegBank(Reg: Src, MRI, TRI: *TRI); |
| 4326 | assert(SrcBank); |
| 4327 | switch (SrcBank->getID()) { |
| 4328 | case AMDGPU::SGPRRegBankID: |
| 4329 | DstBank = AMDGPU::SGPRRegBankID; |
| 4330 | break; |
| 4331 | default: |
| 4332 | DstBank = AMDGPU::VGPRRegBankID; |
| 4333 | break; |
| 4334 | } |
| 4335 | |
| 4336 | // Scalar extend can use 64-bit BFE, but VGPRs require extending to |
| 4337 | // 32-bits, and then to 64. |
| 4338 | OpdsMapping[0] = AMDGPU::getValueMappingSGPR64Only(BankID: DstBank, Size: DstSize); |
| 4339 | OpdsMapping[1] = AMDGPU::getValueMappingSGPR64Only(BankID: SrcBank->getID(), |
| 4340 | Size: SrcSize); |
| 4341 | break; |
| 4342 | } |
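|      | // Illustrative consequence (not from a test): a uniform s32 -> s64 G_SEXT can |
|      | // stay on the SGPR bank as a single 64-bit operation, whereas the same extend |
|      | // of a vgpr value is described with the split mapping so it can be selected |
|      | // as a 32-bit extend plus construction of the high half. |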
| 4343 | case AMDGPU::G_IS_FPCLASS: { |
| 4344 | Register SrcReg = MI.getOperand(i: 1).getReg(); |
| 4345 | unsigned SrcSize = MRI.getType(Reg: SrcReg).getSizeInBits(); |
| 4346 | unsigned DstSize = MRI.getType(Reg: MI.getOperand(i: 0).getReg()).getSizeInBits(); |
| 4347 | OpdsMapping[0] = AMDGPU::getValueMapping(BankID: AMDGPU::VCCRegBankID, Size: DstSize); |
| 4348 | OpdsMapping[1] = AMDGPU::getValueMapping(BankID: AMDGPU::VGPRRegBankID, Size: SrcSize); |
| 4349 | break; |
| 4350 | } |
| 4351 | case AMDGPU::G_STORE: { |
| 4352 | assert(MI.getOperand(0).isReg()); |
| 4353 | unsigned Size = MRI.getType(Reg: MI.getOperand(i: 0).getReg()).getSizeInBits(); |
| 4354 | |
| 4355 | // FIXME: We need to specify a different reg bank once scalar stores are |
| 4356 | // supported. |
| 4357 | const ValueMapping *ValMapping = |
| 4358 | AMDGPU::getValueMapping(BankID: AMDGPU::VGPRRegBankID, Size); |
| 4359 | OpdsMapping[0] = ValMapping; |
| 4360 | OpdsMapping[1] = getValueMappingForPtr(MRI, PtrReg: MI.getOperand(i: 1).getReg()); |
| 4361 | break; |
| 4362 | } |
| 4363 | case AMDGPU::G_ICMP: |
| 4364 | case AMDGPU::G_FCMP: { |
| 4365 | unsigned Size = MRI.getType(Reg: MI.getOperand(i: 2).getReg()).getSizeInBits(); |
| 4366 | |
| 4367 | // See if the result register has already been constrained to vcc, which may |
| 4368 | // happen due to control flow intrinsic lowering. |
| 4369 | unsigned DstBank = getRegBankID(Reg: MI.getOperand(i: 0).getReg(), MRI, |
| 4370 | Default: AMDGPU::SGPRRegBankID); |
| 4371 | unsigned Op2Bank = getRegBankID(Reg: MI.getOperand(i: 2).getReg(), MRI); |
| 4372 | unsigned Op3Bank = getRegBankID(Reg: MI.getOperand(i: 3).getReg(), MRI); |
| 4373 | |
| 4374 | auto canUseSCCICMP = [&]() { |
| 4375 | auto Pred = |
| 4376 | static_cast<CmpInst::Predicate>(MI.getOperand(i: 1).getPredicate()); |
| 4377 | return Size == 32 || |
| 4378 | (Size == 64 && |
| 4379 | (Pred == CmpInst::ICMP_EQ || Pred == CmpInst::ICMP_NE) && |
| 4380 | Subtarget.hasScalarCompareEq64()); |
| 4381 | }; |
| 4382 | auto canUseSCCFCMP = [&]() { |
| 4383 | return Subtarget.hasSALUFloatInsts() && (Size == 32 || Size == 16); |
| 4384 | }; |
| 4385 | |
| 4386 | bool isICMP = MI.getOpcode() == AMDGPU::G_ICMP; |
| 4387 | bool CanUseSCC = DstBank == AMDGPU::SGPRRegBankID && |
| 4388 | Op2Bank == AMDGPU::SGPRRegBankID && |
| 4389 | Op3Bank == AMDGPU::SGPRRegBankID && |
| 4390 | (isICMP ? canUseSCCICMP() : canUseSCCFCMP()); |
| 4391 | |
| 4392 | DstBank = CanUseSCC ? AMDGPU::SGPRRegBankID : AMDGPU::VCCRegBankID; |
| 4393 | unsigned SrcBank = CanUseSCC ? AMDGPU::SGPRRegBankID : AMDGPU::VGPRRegBankID; |
| 4394 | |
| 4395 | // TODO: Use 32-bit for scalar output size. |
| 4396 | // SCC results will need to be copied to a 32-bit SGPR virtual register. |
| 4397 | const unsigned ResultSize = 1; |
| 4398 | |
| 4399 | OpdsMapping[0] = AMDGPU::getValueMapping(BankID: DstBank, Size: ResultSize); |
| 4400 | OpdsMapping[1] = nullptr; // Predicate Operand. |
| 4401 | OpdsMapping[2] = AMDGPU::getValueMapping(BankID: SrcBank, Size); |
| 4402 | OpdsMapping[3] = AMDGPU::getValueMapping(BankID: SrcBank, Size); |
| 4403 | break; |
| 4404 | } |
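|      | // Example of the two outcomes (assumed MIR): with uniform 32-bit operands the |
|      | // compare can stay scalar, roughly |
|      | //   %c:sgpr(s1) = G_ICMP intpred(eq), %a:sgpr(s32), %b:sgpr(s32) |
|      | // while divergent operands (or unsupported scalar cases) yield a vcc lane |
|      | // mask with vgpr sources. |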
| 4405 | case AMDGPU::G_EXTRACT_VECTOR_ELT: { |
| 4406 | // A VGPR index can be handled with a waterfall loop when indexing an SGPR vector. |
| 4407 | unsigned SrcBankID = getRegBankID(Reg: MI.getOperand(i: 1).getReg(), MRI); |
| 4408 | unsigned DstSize = MRI.getType(Reg: MI.getOperand(i: 0).getReg()).getSizeInBits(); |
| 4409 | unsigned SrcSize = MRI.getType(Reg: MI.getOperand(i: 1).getReg()).getSizeInBits(); |
| 4410 | unsigned IdxSize = MRI.getType(Reg: MI.getOperand(i: 2).getReg()).getSizeInBits(); |
| 4411 | unsigned IdxBank = getRegBankID(Reg: MI.getOperand(i: 2).getReg(), MRI); |
| 4412 | unsigned OutputBankID = regBankUnion(RB0: SrcBankID, RB1: IdxBank); |
| 4413 | |
| 4414 | OpdsMapping[0] = AMDGPU::getValueMappingSGPR64Only(BankID: OutputBankID, Size: DstSize); |
| 4415 | OpdsMapping[1] = AMDGPU::getValueMapping(BankID: SrcBankID, Size: SrcSize); |
| 4416 | |
| 4417 | // The index can be in either bank if the source vector is in VGPRs. |
| 4418 | OpdsMapping[2] = AMDGPU::getValueMapping(BankID: IdxBank, Size: IdxSize); |
| 4419 | break; |
| 4420 | } |
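|      | // Sketch (illustrative): extracting an element from an sgpr vector with a |
|      | // vgpr index is still reported as legal here; the expectation is that the |
|      | // apply step handles it (e.g. with a waterfall loop over the index) rather |
|      | // than this mapping forcing an illegal VGPR->SGPR copy. |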
| 4421 | case AMDGPU::G_INSERT_VECTOR_ELT: { |
| 4422 | unsigned OutputBankID = isSALUMapping(MI) ? |
| 4423 | AMDGPU::SGPRRegBankID : AMDGPU::VGPRRegBankID; |
| 4424 | |
| 4425 | unsigned VecSize = MRI.getType(Reg: MI.getOperand(i: 0).getReg()).getSizeInBits(); |
| 4426 | unsigned InsertSize = MRI.getType(Reg: MI.getOperand(i: 2).getReg()).getSizeInBits(); |
| 4427 | unsigned IdxSize = MRI.getType(Reg: MI.getOperand(i: 3).getReg()).getSizeInBits(); |
| 4428 | unsigned InsertEltBankID = getRegBankID(Reg: MI.getOperand(i: 2).getReg(), MRI); |
| 4429 | unsigned IdxBankID = getRegBankID(Reg: MI.getOperand(i: 3).getReg(), MRI); |
| 4430 | |
| 4431 | OpdsMapping[0] = AMDGPU::getValueMapping(BankID: OutputBankID, Size: VecSize); |
| 4432 | OpdsMapping[1] = AMDGPU::getValueMapping(BankID: OutputBankID, Size: VecSize); |
| 4433 | |
| 4434 | // This is a weird case, because we need to break down the mapping based on |
| 4435 | // the register bank of a different operand. |
| 4436 | if (InsertSize == 64 && OutputBankID == AMDGPU::VGPRRegBankID) { |
| 4437 | OpdsMapping[2] = AMDGPU::getValueMappingSplit64(BankID: InsertEltBankID, |
| 4438 | Size: InsertSize); |
| 4439 | } else { |
| 4440 | assert(InsertSize == 32 || InsertSize == 64); |
| 4441 | OpdsMapping[2] = AMDGPU::getValueMapping(BankID: InsertEltBankID, Size: InsertSize); |
| 4442 | } |
| 4443 | |
| 4444 | // The index can be in either bank if the source vector is in VGPRs. |
| 4445 | OpdsMapping[3] = AMDGPU::getValueMapping(BankID: IdxBankID, Size: IdxSize); |
| 4446 | break; |
| 4447 | } |
| 4448 | case AMDGPU::G_UNMERGE_VALUES: { |
| 4449 | unsigned Bank = getMappingType(MRI, MI); |
| 4450 | |
| 4451 | // Op1 and Dst should use the same register bank. |
| 4452 | // FIXME: Shouldn't this be the default? Why do we need to handle this? |
| 4453 | for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) { |
| 4454 | unsigned Size = getSizeInBits(Reg: MI.getOperand(i).getReg(), MRI, TRI: *TRI); |
| 4455 | OpdsMapping[i] = AMDGPU::getValueMapping(BankID: Bank, Size); |
| 4456 | } |
| 4457 | break; |
| 4458 | } |
| 4459 | case AMDGPU::G_AMDGPU_BUFFER_LOAD: |
| 4460 | case AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE: |
| 4461 | case AMDGPU::G_AMDGPU_BUFFER_LOAD_SBYTE: |
| 4462 | case AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT: |
| 4463 | case AMDGPU::G_AMDGPU_BUFFER_LOAD_SSHORT: |
| 4464 | case AMDGPU::G_AMDGPU_BUFFER_LOAD_TFE: |
| 4465 | case AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE_TFE: |
| 4466 | case AMDGPU::G_AMDGPU_BUFFER_LOAD_SBYTE_TFE: |
| 4467 | case AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT_TFE: |
| 4468 | case AMDGPU::G_AMDGPU_BUFFER_LOAD_SSHORT_TFE: |
| 4469 | case AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT: |
| 4470 | case AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_TFE: |
| 4471 | case AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_D16: |
| 4472 | case AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT: |
| 4473 | case AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT_D16: |
| 4474 | case AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT: |
| 4475 | case AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT_D16: |
| 4476 | case AMDGPU::G_AMDGPU_BUFFER_STORE: |
| 4477 | case AMDGPU::G_AMDGPU_BUFFER_STORE_BYTE: |
| 4478 | case AMDGPU::G_AMDGPU_BUFFER_STORE_SHORT: |
| 4479 | case AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT: |
| 4480 | case AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT_D16: { |
| 4481 | OpdsMapping[0] = getVGPROpMapping(Reg: MI.getOperand(i: 0).getReg(), MRI, TRI: *TRI); |
| 4482 | |
| 4483 | // rsrc |
| 4484 | OpdsMapping[1] = getSGPROpMapping(Reg: MI.getOperand(i: 1).getReg(), MRI, TRI: *TRI); |
| 4485 | |
| 4486 | // vindex |
| 4487 | OpdsMapping[2] = getVGPROpMapping(Reg: MI.getOperand(i: 2).getReg(), MRI, TRI: *TRI); |
| 4488 | |
| 4489 | // voffset |
| 4490 | OpdsMapping[3] = getVGPROpMapping(Reg: MI.getOperand(i: 3).getReg(), MRI, TRI: *TRI); |
| 4491 | |
| 4492 | // soffset |
| 4493 | OpdsMapping[4] = getSGPROpMapping(Reg: MI.getOperand(i: 4).getReg(), MRI, TRI: *TRI); |
| 4494 | |
| 4495 | // Any remaining operands are immediates and were correctly null |
| 4496 | // initialized. |
| 4497 | break; |
| 4498 | } |
| 4499 | case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SWAP: |
| 4500 | case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_ADD: |
| 4501 | case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB: |
| 4502 | case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMIN: |
| 4503 | case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMIN: |
| 4504 | case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMAX: |
| 4505 | case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMAX: |
| 4506 | case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_AND: |
| 4507 | case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_OR: |
| 4508 | case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_XOR: |
| 4509 | case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC: |
| 4510 | case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC: |
| 4511 | case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB_CLAMP_U32: |
| 4512 | case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_COND_SUB_U32: |
| 4513 | case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD: |
| 4514 | case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMIN: |
| 4515 | case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMAX: { |
| 4516 | // vdata_out |
| 4517 | OpdsMapping[0] = getVGPROpMapping(Reg: MI.getOperand(i: 0).getReg(), MRI, TRI: *TRI); |
| 4518 | |
| 4519 | // vdata_in |
| 4520 | OpdsMapping[1] = getVGPROpMapping(Reg: MI.getOperand(i: 1).getReg(), MRI, TRI: *TRI); |
| 4521 | |
| 4522 | // rsrc |
| 4523 | OpdsMapping[2] = getSGPROpMapping(Reg: MI.getOperand(i: 2).getReg(), MRI, TRI: *TRI); |
| 4524 | |
| 4525 | // vindex |
| 4526 | OpdsMapping[3] = getVGPROpMapping(Reg: MI.getOperand(i: 3).getReg(), MRI, TRI: *TRI); |
| 4527 | |
| 4528 | // voffset |
| 4529 | OpdsMapping[4] = getVGPROpMapping(Reg: MI.getOperand(i: 4).getReg(), MRI, TRI: *TRI); |
| 4530 | |
| 4531 | // soffset |
| 4532 | OpdsMapping[5] = getSGPROpMapping(Reg: MI.getOperand(i: 5).getReg(), MRI, TRI: *TRI); |
| 4533 | |
| 4534 | // Any remaining operands are immediates and were correctly null |
| 4535 | // initialized. |
| 4536 | break; |
| 4537 | } |
| 4538 | case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_CMPSWAP: { |
| 4539 | // vdata_out |
| 4540 | OpdsMapping[0] = getVGPROpMapping(Reg: MI.getOperand(i: 0).getReg(), MRI, TRI: *TRI); |
| 4541 | |
| 4542 | // vdata_in |
| 4543 | OpdsMapping[1] = getVGPROpMapping(Reg: MI.getOperand(i: 1).getReg(), MRI, TRI: *TRI); |
| 4544 | |
| 4545 | // cmp |
| 4546 | OpdsMapping[2] = getVGPROpMapping(Reg: MI.getOperand(i: 2).getReg(), MRI, TRI: *TRI); |
| 4547 | |
| 4548 | // rsrc |
| 4549 | OpdsMapping[3] = getSGPROpMapping(Reg: MI.getOperand(i: 3).getReg(), MRI, TRI: *TRI); |
| 4550 | |
| 4551 | // vindex |
| 4552 | OpdsMapping[4] = getVGPROpMapping(Reg: MI.getOperand(i: 4).getReg(), MRI, TRI: *TRI); |
| 4553 | |
| 4554 | // voffset |
| 4555 | OpdsMapping[5] = getVGPROpMapping(Reg: MI.getOperand(i: 5).getReg(), MRI, TRI: *TRI); |
| 4556 | |
| 4557 | // soffset |
| 4558 | OpdsMapping[6] = getSGPROpMapping(Reg: MI.getOperand(i: 6).getReg(), MRI, TRI: *TRI); |
| 4559 | |
| 4560 | // Any remaining operands are immediates and were correctly null |
| 4561 | // initialized. |
| 4562 | break; |
| 4563 | } |
| 4564 | case AMDGPU::G_AMDGPU_S_BUFFER_LOAD: |
| 4565 | case AMDGPU::G_AMDGPU_S_BUFFER_LOAD_UBYTE: |
| 4566 | case AMDGPU::G_AMDGPU_S_BUFFER_LOAD_SBYTE: |
| 4567 | case AMDGPU::G_AMDGPU_S_BUFFER_LOAD_USHORT: |
| 4568 | case AMDGPU::G_AMDGPU_S_BUFFER_LOAD_SSHORT: { |
| 4569 | // Lie and claim everything is legal, even though some need to be |
| 4570 | // SGPRs. applyMapping will have to deal with it as a waterfall loop. |
| 4571 | OpdsMapping[1] = getSGPROpMapping(Reg: MI.getOperand(i: 1).getReg(), MRI, TRI: *TRI); |
| 4572 | OpdsMapping[2] = getSGPROpMapping(Reg: MI.getOperand(i: 2).getReg(), MRI, TRI: *TRI); |
| 4573 | |
| 4574 | // We need to convert this to a MUBUF if either the resource or offset is |
| 4575 | // VGPR. |
| 4576 | unsigned RSrcBank = OpdsMapping[1]->BreakDown[0].RegBank->getID(); |
| 4577 | unsigned OffsetBank = OpdsMapping[2]->BreakDown[0].RegBank->getID(); |
| 4578 | unsigned ResultBank = regBankUnion(RB0: RSrcBank, RB1: OffsetBank); |
| 4579 | |
| 4580 | unsigned Size0 = MRI.getType(Reg: MI.getOperand(i: 0).getReg()).getSizeInBits(); |
| 4581 | OpdsMapping[0] = AMDGPU::getValueMapping(BankID: ResultBank, Size: Size0); |
| 4582 | break; |
| 4583 | } |
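|      | // Illustrative note: if either the resource or the offset turns out to be |
|      | // vgpr, the result bank computed above becomes vgpr as well, and the later |
|      | // apply step is expected to rewrite the scalar buffer load into an |
|      | // equivalent VMEM (MUBUF-style) access rather than rejecting the mapping. |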
| 4584 | case AMDGPU::G_AMDGPU_S_BUFFER_PREFETCH: |
| 4585 | OpdsMapping[0] = getSGPROpMapping(Reg: MI.getOperand(i: 0).getReg(), MRI, TRI: *TRI); |
| 4586 | OpdsMapping[2] = getSGPROpMapping(Reg: MI.getOperand(i: 2).getReg(), MRI, TRI: *TRI); |
| 4587 | break; |
| 4588 | case AMDGPU::G_AMDGPU_SPONENTRY: { |
| 4589 | unsigned Size = MRI.getType(Reg: MI.getOperand(i: 0).getReg()).getSizeInBits(); |
| 4590 | OpdsMapping[0] = AMDGPU::getValueMapping(BankID: AMDGPU::SGPRRegBankID, Size); |
| 4591 | break; |
| 4592 | } |
| 4593 | case AMDGPU::G_INTRINSIC: |
| 4594 | case AMDGPU::G_INTRINSIC_CONVERGENT: { |
| 4595 | switch (cast<GIntrinsic>(Val: MI).getIntrinsicID()) { |
| 4596 | default: |
| 4597 | return getInvalidInstructionMapping(); |
| 4598 | case Intrinsic::amdgcn_div_fmas: |
| 4599 | case Intrinsic::amdgcn_div_fixup: |
| 4600 | case Intrinsic::amdgcn_trig_preop: |
| 4601 | case Intrinsic::amdgcn_sin: |
| 4602 | case Intrinsic::amdgcn_cos: |
| 4603 | case Intrinsic::amdgcn_log_clamp: |
| 4604 | case Intrinsic::amdgcn_rcp_legacy: |
| 4605 | case Intrinsic::amdgcn_rsq_legacy: |
| 4606 | case Intrinsic::amdgcn_rsq_clamp: |
| 4607 | case Intrinsic::amdgcn_tanh: |
| 4608 | case Intrinsic::amdgcn_fmul_legacy: |
| 4609 | case Intrinsic::amdgcn_fma_legacy: |
| 4610 | case Intrinsic::amdgcn_frexp_mant: |
| 4611 | case Intrinsic::amdgcn_frexp_exp: |
| 4612 | case Intrinsic::amdgcn_fract: |
| 4613 | case Intrinsic::amdgcn_cvt_pknorm_i16: |
| 4614 | case Intrinsic::amdgcn_cvt_pknorm_u16: |
| 4615 | case Intrinsic::amdgcn_cvt_pk_i16: |
| 4616 | case Intrinsic::amdgcn_cvt_pk_u16: |
| 4617 | case Intrinsic::amdgcn_cvt_sr_pk_f16_f32: |
| 4618 | case Intrinsic::amdgcn_cvt_sr_pk_bf16_f32: |
| 4619 | case Intrinsic::amdgcn_cvt_pk_f16_fp8: |
| 4620 | case Intrinsic::amdgcn_cvt_pk_f16_bf8: |
| 4621 | case Intrinsic::amdgcn_cvt_pk_fp8_f16: |
| 4622 | case Intrinsic::amdgcn_cvt_pk_bf8_f16: |
| 4623 | case Intrinsic::amdgcn_cvt_sr_fp8_f16: |
| 4624 | case Intrinsic::amdgcn_cvt_sr_bf8_f16: |
| 4625 | case Intrinsic::amdgcn_cvt_scale_pk8_f16_fp8: |
| 4626 | case Intrinsic::amdgcn_cvt_scale_pk8_bf16_fp8: |
| 4627 | case Intrinsic::amdgcn_cvt_scale_pk8_f16_bf8: |
| 4628 | case Intrinsic::amdgcn_cvt_scale_pk8_bf16_bf8: |
| 4629 | case Intrinsic::amdgcn_cvt_scale_pk8_f16_fp4: |
| 4630 | case Intrinsic::amdgcn_cvt_scale_pk8_bf16_fp4: |
| 4631 | case Intrinsic::amdgcn_cvt_scale_pk8_f32_fp8: |
| 4632 | case Intrinsic::amdgcn_cvt_scale_pk8_f32_bf8: |
| 4633 | case Intrinsic::amdgcn_cvt_scale_pk8_f32_fp4: |
| 4634 | case Intrinsic::amdgcn_cvt_scale_pk16_f16_fp6: |
| 4635 | case Intrinsic::amdgcn_cvt_scale_pk16_bf16_fp6: |
| 4636 | case Intrinsic::amdgcn_cvt_scale_pk16_f16_bf6: |
| 4637 | case Intrinsic::amdgcn_cvt_scale_pk16_bf16_bf6: |
| 4638 | case Intrinsic::amdgcn_cvt_scale_pk16_f32_fp6: |
| 4639 | case Intrinsic::amdgcn_cvt_scale_pk16_f32_bf6: |
| 4640 | case Intrinsic::amdgcn_cvt_scalef32_pk8_fp8_bf16: |
| 4641 | case Intrinsic::amdgcn_cvt_scalef32_pk8_bf8_bf16: |
| 4642 | case Intrinsic::amdgcn_cvt_scalef32_pk8_fp8_f16: |
| 4643 | case Intrinsic::amdgcn_cvt_scalef32_pk8_bf8_f16: |
| 4644 | case Intrinsic::amdgcn_cvt_scalef32_pk8_fp8_f32: |
| 4645 | case Intrinsic::amdgcn_cvt_scalef32_pk8_bf8_f32: |
| 4646 | case Intrinsic::amdgcn_cvt_scalef32_pk8_fp4_f32: |
| 4647 | case Intrinsic::amdgcn_cvt_scalef32_pk8_fp4_f16: |
| 4648 | case Intrinsic::amdgcn_cvt_scalef32_pk8_fp4_bf16: |
| 4649 | case Intrinsic::amdgcn_cvt_scalef32_pk16_fp6_f32: |
| 4650 | case Intrinsic::amdgcn_cvt_scalef32_pk16_bf6_f32: |
| 4651 | case Intrinsic::amdgcn_cvt_scalef32_pk16_fp6_f16: |
| 4652 | case Intrinsic::amdgcn_cvt_scalef32_pk16_bf6_f16: |
| 4653 | case Intrinsic::amdgcn_cvt_scalef32_pk16_fp6_bf16: |
| 4654 | case Intrinsic::amdgcn_cvt_scalef32_pk16_bf6_bf16: |
| 4655 | case Intrinsic::amdgcn_cvt_scalef32_sr_pk8_fp8_bf16: |
| 4656 | case Intrinsic::amdgcn_cvt_scalef32_sr_pk8_bf8_bf16: |
| 4657 | case Intrinsic::amdgcn_cvt_scalef32_sr_pk8_fp8_f16: |
| 4658 | case Intrinsic::amdgcn_cvt_scalef32_sr_pk8_bf8_f16: |
| 4659 | case Intrinsic::amdgcn_cvt_scalef32_sr_pk8_fp8_f32: |
| 4660 | case Intrinsic::amdgcn_cvt_scalef32_sr_pk8_bf8_f32: |
| 4661 | case Intrinsic::amdgcn_cvt_scalef32_sr_pk8_fp4_f32: |
| 4662 | case Intrinsic::amdgcn_cvt_scalef32_sr_pk8_fp4_f16: |
| 4663 | case Intrinsic::amdgcn_cvt_scalef32_sr_pk8_fp4_bf16: |
| 4664 | case Intrinsic::amdgcn_cvt_scalef32_sr_pk16_fp6_f32: |
| 4665 | case Intrinsic::amdgcn_cvt_scalef32_sr_pk16_bf6_f32: |
| 4666 | case Intrinsic::amdgcn_cvt_scalef32_sr_pk16_fp6_f16: |
| 4667 | case Intrinsic::amdgcn_cvt_scalef32_sr_pk16_bf6_f16: |
| 4668 | case Intrinsic::amdgcn_cvt_scalef32_sr_pk16_fp6_bf16: |
| 4669 | case Intrinsic::amdgcn_cvt_scalef32_sr_pk16_bf6_bf16: |
| 4670 | case Intrinsic::amdgcn_sat_pk4_i4_i8: |
| 4671 | case Intrinsic::amdgcn_sat_pk4_u4_u8: |
| 4672 | case Intrinsic::amdgcn_fmed3: |
| 4673 | case Intrinsic::amdgcn_cubeid: |
| 4674 | case Intrinsic::amdgcn_cubema: |
| 4675 | case Intrinsic::amdgcn_cubesc: |
| 4676 | case Intrinsic::amdgcn_cubetc: |
| 4677 | case Intrinsic::amdgcn_sffbh: |
| 4678 | case Intrinsic::amdgcn_fmad_ftz: |
| 4679 | case Intrinsic::amdgcn_mbcnt_lo: |
| 4680 | case Intrinsic::amdgcn_mbcnt_hi: |
| 4681 | case Intrinsic::amdgcn_mul_u24: |
| 4682 | case Intrinsic::amdgcn_mul_i24: |
| 4683 | case Intrinsic::amdgcn_mulhi_u24: |
| 4684 | case Intrinsic::amdgcn_mulhi_i24: |
| 4685 | case Intrinsic::amdgcn_lerp: |
| 4686 | case Intrinsic::amdgcn_sad_u8: |
| 4687 | case Intrinsic::amdgcn_msad_u8: |
| 4688 | case Intrinsic::amdgcn_sad_hi_u8: |
| 4689 | case Intrinsic::amdgcn_sad_u16: |
| 4690 | case Intrinsic::amdgcn_qsad_pk_u16_u8: |
| 4691 | case Intrinsic::amdgcn_mqsad_pk_u16_u8: |
| 4692 | case Intrinsic::amdgcn_mqsad_u32_u8: |
| 4693 | case Intrinsic::amdgcn_cvt_pk_u8_f32: |
| 4694 | case Intrinsic::amdgcn_alignbyte: |
| 4695 | case Intrinsic::amdgcn_perm: |
| 4696 | case Intrinsic::amdgcn_prng_b32: |
| 4697 | case Intrinsic::amdgcn_fdot2: |
| 4698 | case Intrinsic::amdgcn_sdot2: |
| 4699 | case Intrinsic::amdgcn_udot2: |
| 4700 | case Intrinsic::amdgcn_sdot4: |
| 4701 | case Intrinsic::amdgcn_udot4: |
| 4702 | case Intrinsic::amdgcn_sdot8: |
| 4703 | case Intrinsic::amdgcn_udot8: |
| 4704 | case Intrinsic::amdgcn_fdot2_bf16_bf16: |
| 4705 | case Intrinsic::amdgcn_fdot2_f16_f16: |
| 4706 | case Intrinsic::amdgcn_fdot2_f32_bf16: |
| 4707 | case Intrinsic::amdgcn_fdot2c_f32_bf16: |
| 4708 | case Intrinsic::amdgcn_sudot4: |
| 4709 | case Intrinsic::amdgcn_sudot8: |
| 4710 | case Intrinsic::amdgcn_dot4_f32_fp8_bf8: |
| 4711 | case Intrinsic::amdgcn_dot4_f32_bf8_fp8: |
| 4712 | case Intrinsic::amdgcn_dot4_f32_fp8_fp8: |
| 4713 | case Intrinsic::amdgcn_dot4_f32_bf8_bf8: |
| 4714 | case Intrinsic::amdgcn_cvt_f32_fp8: |
| 4715 | case Intrinsic::amdgcn_cvt_f32_fp8_e5m3: |
| 4716 | case Intrinsic::amdgcn_cvt_f32_bf8: |
| 4717 | case Intrinsic::amdgcn_cvt_off_f32_i4: |
| 4718 | case Intrinsic::amdgcn_cvt_pk_f32_fp8: |
| 4719 | case Intrinsic::amdgcn_cvt_pk_f32_bf8: |
| 4720 | case Intrinsic::amdgcn_cvt_pk_fp8_f32: |
| 4721 | case Intrinsic::amdgcn_cvt_pk_fp8_f32_e5m3: |
| 4722 | case Intrinsic::amdgcn_cvt_pk_bf8_f32: |
| 4723 | case Intrinsic::amdgcn_cvt_sr_fp8_f32: |
| 4724 | case Intrinsic::amdgcn_cvt_sr_fp8_f32_e5m3: |
| 4725 | case Intrinsic::amdgcn_cvt_sr_bf8_f32: |
| 4726 | case Intrinsic::amdgcn_cvt_sr_bf16_f32: |
| 4727 | case Intrinsic::amdgcn_cvt_sr_f16_f32: |
| 4728 | case Intrinsic::amdgcn_cvt_f16_fp8: |
| 4729 | case Intrinsic::amdgcn_cvt_f16_bf8: |
| 4730 | case Intrinsic::amdgcn_cvt_scalef32_pk32_fp6_f16: |
| 4731 | case Intrinsic::amdgcn_cvt_scalef32_pk32_bf6_f16: |
| 4732 | case Intrinsic::amdgcn_cvt_scalef32_pk32_fp6_bf16: |
| 4733 | case Intrinsic::amdgcn_cvt_scalef32_pk32_bf6_bf16: |
| 4734 | case Intrinsic::amdgcn_cvt_scalef32_f16_fp8: |
| 4735 | case Intrinsic::amdgcn_cvt_scalef32_f16_bf8: |
| 4736 | case Intrinsic::amdgcn_cvt_scalef32_f32_fp8: |
| 4737 | case Intrinsic::amdgcn_cvt_scalef32_f32_bf8: |
| 4738 | case Intrinsic::amdgcn_cvt_scalef32_pk_fp8_f32: |
| 4739 | case Intrinsic::amdgcn_cvt_scalef32_pk_bf8_f32: |
| 4740 | case Intrinsic::amdgcn_cvt_scalef32_pk_f32_fp8: |
| 4741 | case Intrinsic::amdgcn_cvt_scalef32_pk_f32_bf8: |
| 4742 | case Intrinsic::amdgcn_cvt_scalef32_pk_fp8_f16: |
| 4743 | case Intrinsic::amdgcn_cvt_scalef32_pk_fp8_bf16: |
| 4744 | case Intrinsic::amdgcn_cvt_scalef32_pk_bf8_f16: |
| 4745 | case Intrinsic::amdgcn_cvt_scalef32_pk_bf8_bf16: |
| 4746 | case Intrinsic::amdgcn_cvt_scalef32_pk_f32_fp4: |
| 4747 | case Intrinsic::amdgcn_cvt_scalef32_pk_fp4_f32: |
| 4748 | case Intrinsic::amdgcn_cvt_scalef32_pk_f16_fp4: |
| 4749 | case Intrinsic::amdgcn_cvt_scalef32_pk_bf16_fp4: |
| 4750 | case Intrinsic::amdgcn_cvt_scalef32_pk32_f32_fp6: |
| 4751 | case Intrinsic::amdgcn_cvt_scalef32_pk32_f32_bf6: |
| 4752 | case Intrinsic::amdgcn_cvt_scalef32_pk32_f16_bf6: |
| 4753 | case Intrinsic::amdgcn_cvt_scalef32_pk32_bf16_bf6: |
| 4754 | case Intrinsic::amdgcn_cvt_scalef32_pk32_f16_fp6: |
| 4755 | case Intrinsic::amdgcn_cvt_scalef32_pk32_bf16_fp6: |
| 4756 | case Intrinsic::amdgcn_cvt_scalef32_pk_f16_bf8: |
| 4757 | case Intrinsic::amdgcn_cvt_scalef32_pk_bf16_bf8: |
| 4758 | case Intrinsic::amdgcn_cvt_scalef32_pk_f16_fp8: |
| 4759 | case Intrinsic::amdgcn_cvt_scalef32_pk_bf16_fp8: |
| 4760 | case Intrinsic::amdgcn_cvt_scalef32_pk_fp4_f16: |
| 4761 | case Intrinsic::amdgcn_cvt_scalef32_pk_fp4_bf16: |
| 4762 | case Intrinsic::amdgcn_cvt_scalef32_sr_pk_fp4_f16: |
| 4763 | case Intrinsic::amdgcn_cvt_scalef32_sr_pk_fp4_bf16: |
| 4764 | case Intrinsic::amdgcn_cvt_scalef32_sr_pk_fp4_f32: |
| 4765 | case Intrinsic::amdgcn_cvt_scalef32_sr_pk32_bf6_bf16: |
| 4766 | case Intrinsic::amdgcn_cvt_scalef32_sr_pk32_bf6_f16: |
| 4767 | case Intrinsic::amdgcn_cvt_scalef32_sr_pk32_bf6_f32: |
| 4768 | case Intrinsic::amdgcn_cvt_scalef32_sr_pk32_fp6_bf16: |
| 4769 | case Intrinsic::amdgcn_cvt_scalef32_sr_pk32_fp6_f16: |
| 4770 | case Intrinsic::amdgcn_cvt_scalef32_sr_pk32_fp6_f32: |
| 4771 | case Intrinsic::amdgcn_cvt_scalef32_sr_bf8_bf16: |
| 4772 | case Intrinsic::amdgcn_cvt_scalef32_sr_bf8_f16: |
| 4773 | case Intrinsic::amdgcn_cvt_scalef32_sr_bf8_f32: |
| 4774 | case Intrinsic::amdgcn_cvt_scalef32_sr_fp8_bf16: |
| 4775 | case Intrinsic::amdgcn_cvt_scalef32_sr_fp8_f16: |
| 4776 | case Intrinsic::amdgcn_cvt_scalef32_sr_fp8_f32: |
| 4777 | case Intrinsic::amdgcn_ashr_pk_i8_i32: |
| 4778 | case Intrinsic::amdgcn_ashr_pk_u8_i32: |
| 4779 | case Intrinsic::amdgcn_cvt_scalef32_2xpk16_fp6_f32: |
| 4780 | case Intrinsic::amdgcn_cvt_scalef32_2xpk16_bf6_f32: |
| 4781 | case Intrinsic::amdgcn_wmma_bf16_16x16x16_bf16: |
| 4782 | case Intrinsic::amdgcn_wmma_f16_16x16x16_f16: |
| 4783 | case Intrinsic::amdgcn_wmma_bf16_16x16x16_bf16_tied: |
| 4784 | case Intrinsic::amdgcn_wmma_f16_16x16x16_f16_tied: |
| 4785 | case Intrinsic::amdgcn_wmma_f32_16x16x16_bf16: |
| 4786 | case Intrinsic::amdgcn_wmma_f32_16x16x16_f16: |
| 4787 | case Intrinsic::amdgcn_wmma_i32_16x16x16_iu4: |
| 4788 | case Intrinsic::amdgcn_wmma_i32_16x16x16_iu8: |
| 4789 | case Intrinsic::amdgcn_wmma_f32_16x16x16_fp8_fp8: |
| 4790 | case Intrinsic::amdgcn_wmma_f32_16x16x16_fp8_bf8: |
| 4791 | case Intrinsic::amdgcn_wmma_f32_16x16x16_bf8_fp8: |
| 4792 | case Intrinsic::amdgcn_wmma_f32_16x16x16_bf8_bf8: |
| 4793 | case Intrinsic::amdgcn_wmma_i32_16x16x32_iu4: |
| 4794 | case Intrinsic::amdgcn_swmmac_f32_16x16x32_f16: |
| 4795 | case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf16: |
| 4796 | case Intrinsic::amdgcn_swmmac_f16_16x16x32_f16: |
| 4797 | case Intrinsic::amdgcn_swmmac_bf16_16x16x32_bf16: |
| 4798 | case Intrinsic::amdgcn_swmmac_i32_16x16x32_iu8: |
| 4799 | case Intrinsic::amdgcn_swmmac_i32_16x16x32_iu4: |
| 4800 | case Intrinsic::amdgcn_swmmac_i32_16x16x64_iu4: |
| 4801 | case Intrinsic::amdgcn_swmmac_f32_16x16x32_fp8_fp8: |
| 4802 | case Intrinsic::amdgcn_swmmac_f32_16x16x32_fp8_bf8: |
| 4803 | case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_fp8: |
| 4804 | case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_bf8: |
| 4805 | case Intrinsic::amdgcn_wmma_f32_16x16x4_f32: |
| 4806 | case Intrinsic::amdgcn_wmma_f32_16x16x32_bf16: |
| 4807 | case Intrinsic::amdgcn_wmma_f32_16x16x32_f16: |
| 4808 | case Intrinsic::amdgcn_wmma_f16_16x16x32_f16: |
| 4809 | case Intrinsic::amdgcn_wmma_bf16_16x16x32_bf16: |
| 4810 | case Intrinsic::amdgcn_wmma_bf16f32_16x16x32_bf16: |
| 4811 | case Intrinsic::amdgcn_wmma_f32_16x16x64_fp8_fp8: |
| 4812 | case Intrinsic::amdgcn_wmma_f32_16x16x64_fp8_bf8: |
| 4813 | case Intrinsic::amdgcn_wmma_f32_16x16x64_bf8_fp8: |
| 4814 | case Intrinsic::amdgcn_wmma_f32_16x16x64_bf8_bf8: |
| 4815 | case Intrinsic::amdgcn_wmma_f16_16x16x64_fp8_fp8: |
| 4816 | case Intrinsic::amdgcn_wmma_f16_16x16x64_fp8_bf8: |
| 4817 | case Intrinsic::amdgcn_wmma_f16_16x16x64_bf8_fp8: |
| 4818 | case Intrinsic::amdgcn_wmma_f16_16x16x64_bf8_bf8: |
| 4819 | case Intrinsic::amdgcn_wmma_f16_16x16x128_fp8_fp8: |
| 4820 | case Intrinsic::amdgcn_wmma_f16_16x16x128_fp8_bf8: |
| 4821 | case Intrinsic::amdgcn_wmma_f16_16x16x128_bf8_fp8: |
| 4822 | case Intrinsic::amdgcn_wmma_f16_16x16x128_bf8_bf8: |
| 4823 | case Intrinsic::amdgcn_wmma_f32_16x16x128_fp8_fp8: |
| 4824 | case Intrinsic::amdgcn_wmma_f32_16x16x128_fp8_bf8: |
| 4825 | case Intrinsic::amdgcn_wmma_f32_16x16x128_bf8_fp8: |
| 4826 | case Intrinsic::amdgcn_wmma_f32_16x16x128_bf8_bf8: |
| 4827 | case Intrinsic::amdgcn_wmma_i32_16x16x64_iu8: |
| 4828 | case Intrinsic::amdgcn_wmma_f32_16x16x128_f8f6f4: |
| 4829 | case Intrinsic::amdgcn_wmma_scale_f32_16x16x128_f8f6f4: |
| 4830 | case Intrinsic::amdgcn_wmma_scale16_f32_16x16x128_f8f6f4: |
| 4831 | case Intrinsic::amdgcn_wmma_f32_32x16x128_f4: |
| 4832 | case Intrinsic::amdgcn_wmma_scale_f32_32x16x128_f4: |
| 4833 | case Intrinsic::amdgcn_wmma_scale16_f32_32x16x128_f4: |
| 4834 | case Intrinsic::amdgcn_swmmac_f16_16x16x64_f16: |
| 4835 | case Intrinsic::amdgcn_swmmac_bf16_16x16x64_bf16: |
| 4836 | case Intrinsic::amdgcn_swmmac_f32_16x16x64_bf16: |
| 4837 | case Intrinsic::amdgcn_swmmac_bf16f32_16x16x64_bf16: |
| 4838 | case Intrinsic::amdgcn_swmmac_f32_16x16x64_f16: |
| 4839 | case Intrinsic::amdgcn_swmmac_f32_16x16x128_fp8_fp8: |
| 4840 | case Intrinsic::amdgcn_swmmac_f32_16x16x128_fp8_bf8: |
| 4841 | case Intrinsic::amdgcn_swmmac_f32_16x16x128_bf8_fp8: |
| 4842 | case Intrinsic::amdgcn_swmmac_f32_16x16x128_bf8_bf8: |
| 4843 | case Intrinsic::amdgcn_swmmac_f16_16x16x128_fp8_fp8: |
| 4844 | case Intrinsic::amdgcn_swmmac_f16_16x16x128_fp8_bf8: |
| 4845 | case Intrinsic::amdgcn_swmmac_f16_16x16x128_bf8_fp8: |
| 4846 | case Intrinsic::amdgcn_swmmac_f16_16x16x128_bf8_bf8: |
| 4847 | case Intrinsic::amdgcn_swmmac_i32_16x16x128_iu8: |
| 4848 | case Intrinsic::amdgcn_perm_pk16_b4_u4: |
| 4849 | case Intrinsic::amdgcn_perm_pk16_b6_u4: |
| 4850 | case Intrinsic::amdgcn_perm_pk16_b8_u4: |
| 4851 | case Intrinsic::amdgcn_add_max_i32: |
| 4852 | case Intrinsic::amdgcn_add_max_u32: |
| 4853 | case Intrinsic::amdgcn_add_min_i32: |
| 4854 | case Intrinsic::amdgcn_add_min_u32: |
| 4855 | case Intrinsic::amdgcn_pk_add_max_i16: |
| 4856 | case Intrinsic::amdgcn_pk_add_max_u16: |
| 4857 | case Intrinsic::amdgcn_pk_add_min_i16: |
| 4858 | case Intrinsic::amdgcn_pk_add_min_u16: |
| 4859 | return getDefaultMappingVOP(MI); |
| 4860 | case Intrinsic::amdgcn_log: |
| 4861 | case Intrinsic::amdgcn_exp2: |
| 4862 | case Intrinsic::amdgcn_rcp: |
| 4863 | case Intrinsic::amdgcn_rsq: |
| 4864 | case Intrinsic::amdgcn_sqrt: { |
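|      | // Subtargets with pseudo-scalar transcendental instructions can produce a |
|      | // uniform (SGPR) result for 16- and 32-bit types, so keep the scalar mapping |
|      | // when every input is already an SGPR; otherwise use the default VALU mapping. |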
| 4865 | unsigned Size = MRI.getType(Reg: MI.getOperand(i: 0).getReg()).getSizeInBits(); |
| 4866 | if (Subtarget.hasPseudoScalarTrans() && (Size == 16 || Size == 32) && |
| 4867 | isSALUMapping(MI)) |
| 4868 | return getDefaultMappingSOP(MI); |
| 4869 | return getDefaultMappingVOP(MI); |
| 4870 | } |
| 4871 | case Intrinsic::amdgcn_sbfe: |
| 4872 | case Intrinsic::amdgcn_ubfe: |
| 4873 | if (isSALUMapping(MI)) |
| 4874 | return getDefaultMappingSOP(MI); |
| 4875 | return getDefaultMappingVOP(MI); |
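|      | // Lane-crossing, DPP, and whole-wave-mode operations only exist as VALU |
|      | // instructions, so all register operands must be VGPRs. |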
| 4876 | case Intrinsic::amdgcn_ds_swizzle: |
| 4877 | case Intrinsic::amdgcn_ds_permute: |
| 4878 | case Intrinsic::amdgcn_ds_bpermute: |
| 4879 | case Intrinsic::amdgcn_update_dpp: |
| 4880 | case Intrinsic::amdgcn_mov_dpp8: |
| 4881 | case Intrinsic::amdgcn_mov_dpp: |
| 4882 | case Intrinsic::amdgcn_strict_wwm: |
| 4883 | case Intrinsic::amdgcn_wwm: |
| 4884 | case Intrinsic::amdgcn_strict_wqm: |
| 4885 | case Intrinsic::amdgcn_wqm: |
| 4886 | case Intrinsic::amdgcn_softwqm: |
| 4887 | case Intrinsic::amdgcn_set_inactive: |
| 4888 | case Intrinsic::amdgcn_set_inactive_chain_arg: |
| 4889 | case Intrinsic::amdgcn_permlane64: |
| 4890 | case Intrinsic::amdgcn_ds_bpermute_fi_b32: |
| 4891 | return getDefaultMappingAllVGPR(MI); |
| 4892 | case Intrinsic::amdgcn_cvt_pkrtz: |
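|      | // With SALU float instructions available, a fully uniform cvt_pkrtz can stay |
|      | // scalar; otherwise it is a VALU operation. |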
| 4893 | if (Subtarget.hasSALUFloatInsts() && isSALUMapping(MI)) |
| 4894 | return getDefaultMappingSOP(MI); |
| 4895 | return getDefaultMappingVOP(MI); |
| 4896 | case Intrinsic::amdgcn_kernarg_segment_ptr: |
| 4897 | case Intrinsic::amdgcn_s_getpc: |
| 4898 | case Intrinsic::amdgcn_groupstaticsize: |
| 4899 | case Intrinsic::amdgcn_reloc_constant: |
| 4900 | case Intrinsic::returnaddress: { |
| 4901 | unsigned Size = MRI.getType(Reg: MI.getOperand(i: 0).getReg()).getSizeInBits(); |
| 4902 | OpdsMapping[0] = AMDGPU::getValueMapping(BankID: AMDGPU::SGPRRegBankID, Size); |
| 4903 | break; |
| 4904 | } |
| 4905 | case Intrinsic::amdgcn_wqm_vote: { |
| 4906 | unsigned Size = MRI.getType(Reg: MI.getOperand(i: 0).getReg()).getSizeInBits(); |
| 4907 | OpdsMapping[0] = OpdsMapping[2] |
| 4908 | = AMDGPU::getValueMapping(BankID: AMDGPU::VCCRegBankID, Size); |
| 4909 | break; |
| 4910 | } |
| 4911 | case Intrinsic::amdgcn_ps_live: { |
| 4912 | OpdsMapping[0] = AMDGPU::getValueMapping(BankID: AMDGPU::VCCRegBankID, Size: 1); |
| 4913 | break; |
| 4914 | } |
| 4915 | case Intrinsic::amdgcn_div_scale: { |
| 4916 | unsigned Dst0Size = MRI.getType(Reg: MI.getOperand(i: 0).getReg()).getSizeInBits(); |
| 4917 | unsigned Dst1Size = MRI.getType(Reg: MI.getOperand(i: 1).getReg()).getSizeInBits(); |
| 4918 | OpdsMapping[0] = AMDGPU::getValueMapping(BankID: AMDGPU::VGPRRegBankID, Size: Dst0Size); |
| 4919 | OpdsMapping[1] = AMDGPU::getValueMapping(BankID: AMDGPU::VCCRegBankID, Size: Dst1Size); |
| 4920 | |
| 4921 | unsigned SrcSize = MRI.getType(Reg: MI.getOperand(i: 3).getReg()).getSizeInBits(); |
| 4922 | OpdsMapping[3] = AMDGPU::getValueMapping(BankID: AMDGPU::VGPRRegBankID, Size: SrcSize); |
| 4923 | OpdsMapping[4] = AMDGPU::getValueMapping(BankID: AMDGPU::VGPRRegBankID, Size: SrcSize); |
| 4924 | break; |
| 4925 | } |
| 4926 | case Intrinsic::amdgcn_class: { |
| 4927 | Register Src0Reg = MI.getOperand(i: 2).getReg(); |
| 4928 | Register Src1Reg = MI.getOperand(i: 3).getReg(); |
| 4929 | unsigned Src0Size = MRI.getType(Reg: Src0Reg).getSizeInBits(); |
| 4930 | unsigned Src1Size = MRI.getType(Reg: Src1Reg).getSizeInBits(); |
| 4931 | unsigned DstSize = MRI.getType(Reg: MI.getOperand(i: 0).getReg()).getSizeInBits(); |
| 4932 | OpdsMapping[0] = AMDGPU::getValueMapping(BankID: AMDGPU::VCCRegBankID, Size: DstSize); |
| 4933 | OpdsMapping[2] = AMDGPU::getValueMapping(BankID: AMDGPU::VGPRRegBankID, Size: Src0Size); |
| 4934 | OpdsMapping[3] = AMDGPU::getValueMapping(BankID: AMDGPU::VGPRRegBankID, Size: Src1Size); |
| 4935 | break; |
| 4936 | } |
| 4937 | case Intrinsic::amdgcn_icmp: |
| 4938 | case Intrinsic::amdgcn_fcmp: { |
| 4939 | unsigned DstSize = MRI.getType(Reg: MI.getOperand(i: 0).getReg()).getSizeInBits(); |
| 4940 | // This is not VCCRegBank because the result is not used in a boolean context. |
| 4941 | OpdsMapping[0] = AMDGPU::getValueMapping(BankID: AMDGPU::SGPRRegBankID, Size: DstSize); |
| 4942 | unsigned OpSize = MRI.getType(Reg: MI.getOperand(i: 2).getReg()).getSizeInBits(); |
| 4943 | OpdsMapping[2] = AMDGPU::getValueMapping(BankID: AMDGPU::VGPRRegBankID, Size: OpSize); |
| 4944 | OpdsMapping[3] = AMDGPU::getValueMapping(BankID: AMDGPU::VGPRRegBankID, Size: OpSize); |
| 4945 | break; |
| 4946 | } |
| 4947 | case Intrinsic::amdgcn_readlane: { |
| 4948 | // This must be an SGPR, but accept a VGPR. |
| 4949 | Register IdxReg = MI.getOperand(i: 3).getReg(); |
| 4950 | unsigned IdxSize = MRI.getType(Reg: IdxReg).getSizeInBits(); |
| 4951 | unsigned IdxBank = getRegBankID(Reg: IdxReg, MRI, Default: AMDGPU::SGPRRegBankID); |
| 4952 | OpdsMapping[3] = AMDGPU::getValueMapping(BankID: IdxBank, Size: IdxSize); |
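|      | // The destination and source value operands are mapped the same way as for |
|      | // readfirstlane below. |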
| 4953 | [[fallthrough]]; |
| 4954 | } |
| 4955 | case Intrinsic::amdgcn_readfirstlane: { |
| 4956 | unsigned DstSize = MRI.getType(Reg: MI.getOperand(i: 0).getReg()).getSizeInBits(); |
| 4957 | unsigned SrcSize = MRI.getType(Reg: MI.getOperand(i: 2).getReg()).getSizeInBits(); |
| 4958 | OpdsMapping[0] = AMDGPU::getValueMapping(BankID: AMDGPU::SGPRRegBankID, Size: DstSize); |
| 4959 | OpdsMapping[2] = AMDGPU::getValueMapping(BankID: AMDGPU::VGPRRegBankID, Size: SrcSize); |
| 4960 | break; |
| 4961 | } |
| 4962 | case Intrinsic::amdgcn_writelane: { |
| 4963 | unsigned DstSize = MRI.getType(Reg: MI.getOperand(i: 0).getReg()).getSizeInBits(); |
| 4964 | Register SrcReg = MI.getOperand(i: 2).getReg(); |
| 4965 | unsigned SrcSize = MRI.getType(Reg: SrcReg).getSizeInBits(); |
| 4966 | unsigned SrcBank = getRegBankID(Reg: SrcReg, MRI, Default: AMDGPU::SGPRRegBankID); |
| 4967 | Register IdxReg = MI.getOperand(i: 3).getReg(); |
| 4968 | unsigned IdxSize = MRI.getType(Reg: IdxReg).getSizeInBits(); |
| 4969 | unsigned IdxBank = getRegBankID(Reg: IdxReg, MRI, Default: AMDGPU::SGPRRegBankID); |
| 4970 | OpdsMapping[0] = AMDGPU::getValueMapping(BankID: AMDGPU::VGPRRegBankID, Size: DstSize); |
| 4971 | |
| 4972 | // These two operands must be SGPRs, but accept VGPRs. A readfirstlane will be |
| 4973 | // inserted to legalize them. |
| 4974 | OpdsMapping[2] = AMDGPU::getValueMapping(BankID: SrcBank, Size: SrcSize); |
| 4975 | OpdsMapping[3] = AMDGPU::getValueMapping(BankID: IdxBank, Size: IdxSize); |
| 4976 | OpdsMapping[4] = AMDGPU::getValueMapping(BankID: AMDGPU::VGPRRegBankID, Size: SrcSize); |
| 4977 | break; |
| 4978 | } |
| 4979 | case Intrinsic::amdgcn_if_break: { |
| 4980 | unsigned Size = getSizeInBits(Reg: MI.getOperand(i: 0).getReg(), MRI, TRI: *TRI); |
| 4981 | OpdsMapping[0] = AMDGPU::getValueMapping(BankID: AMDGPU::SGPRRegBankID, Size); |
| 4982 | OpdsMapping[2] = AMDGPU::getValueMapping(BankID: AMDGPU::VCCRegBankID, Size: 1); |
| 4983 | OpdsMapping[3] = AMDGPU::getValueMapping(BankID: AMDGPU::SGPRRegBankID, Size); |
| 4984 | break; |
| 4985 | } |
| 4986 | case Intrinsic::amdgcn_permlane16: |
| 4987 | case Intrinsic::amdgcn_permlanex16: { |
| 4988 | unsigned Size = getSizeInBits(Reg: MI.getOperand(i: 0).getReg(), MRI, TRI: *TRI); |
| 4989 | OpdsMapping[0] = AMDGPU::getValueMapping(BankID: AMDGPU::VGPRRegBankID, Size); |
| 4990 | OpdsMapping[2] = AMDGPU::getValueMapping(BankID: AMDGPU::VGPRRegBankID, Size); |
| 4991 | OpdsMapping[3] = AMDGPU::getValueMapping(BankID: AMDGPU::VGPRRegBankID, Size); |
| 4992 | OpdsMapping[4] = getSGPROpMapping(Reg: MI.getOperand(i: 3).getReg(), MRI, TRI: *TRI); |
| 4993 | OpdsMapping[5] = getSGPROpMapping(Reg: MI.getOperand(i: 4).getReg(), MRI, TRI: *TRI); |
| 4994 | break; |
| 4995 | } |
| 4996 | case Intrinsic::amdgcn_permlane_bcast: |
| 4997 | case Intrinsic::amdgcn_permlane_up: |
| 4998 | case Intrinsic::amdgcn_permlane_down: |
| 4999 | case Intrinsic::amdgcn_permlane_xor: { |
| 5000 | unsigned Size = getSizeInBits(Reg: MI.getOperand(i: 0).getReg(), MRI, TRI: *TRI); |
| 5001 | OpdsMapping[0] = AMDGPU::getValueMapping(BankID: AMDGPU::VGPRRegBankID, Size); |
| 5002 | OpdsMapping[2] = AMDGPU::getValueMapping(BankID: AMDGPU::VGPRRegBankID, Size); |
| 5003 | OpdsMapping[3] = getSGPROpMapping(Reg: MI.getOperand(i: 2).getReg(), MRI, TRI: *TRI); |
| 5004 | OpdsMapping[4] = getSGPROpMapping(Reg: MI.getOperand(i: 3).getReg(), MRI, TRI: *TRI); |
| 5005 | break; |
| 5006 | } |
| 5007 | case Intrinsic::amdgcn_permlane_idx_gen: { |
| 5008 | unsigned Size = getSizeInBits(Reg: MI.getOperand(i: 0).getReg(), MRI, TRI: *TRI); |
| 5009 | OpdsMapping[0] = AMDGPU::getValueMapping(BankID: AMDGPU::VGPRRegBankID, Size); |
| 5010 | OpdsMapping[2] = AMDGPU::getValueMapping(BankID: AMDGPU::VGPRRegBankID, Size); |
| 5011 | OpdsMapping[3] = getSGPROpMapping(Reg: MI.getOperand(i: 2).getReg(), MRI, TRI: *TRI); |
| 5012 | break; |
| 5013 | } |
| 5014 | case Intrinsic::amdgcn_permlane16_var: |
| 5015 | case Intrinsic::amdgcn_permlanex16_var: { |
| 5016 | unsigned Size = getSizeInBits(Reg: MI.getOperand(i: 0).getReg(), MRI, TRI: *TRI); |
| 5017 | OpdsMapping[0] = AMDGPU::getValueMapping(BankID: AMDGPU::VGPRRegBankID, Size); |
| 5018 | OpdsMapping[2] = AMDGPU::getValueMapping(BankID: AMDGPU::VGPRRegBankID, Size); |
| 5019 | OpdsMapping[3] = AMDGPU::getValueMapping(BankID: AMDGPU::VGPRRegBankID, Size); |
| 5020 | OpdsMapping[4] = AMDGPU::getValueMapping(BankID: AMDGPU::VGPRRegBankID, Size); |
| 5021 | break; |
| 5022 | } |
| 5023 | case Intrinsic::amdgcn_mfma_f32_4x4x1f32: |
| 5024 | case Intrinsic::amdgcn_mfma_f32_4x4x4f16: |
| 5025 | case Intrinsic::amdgcn_mfma_i32_4x4x4i8: |
| 5026 | case Intrinsic::amdgcn_mfma_f32_4x4x2bf16: |
| 5027 | case Intrinsic::amdgcn_mfma_f32_16x16x1f32: |
| 5028 | case Intrinsic::amdgcn_mfma_f32_16x16x4f32: |
| 5029 | case Intrinsic::amdgcn_mfma_f32_16x16x4f16: |
| 5030 | case Intrinsic::amdgcn_mfma_f32_16x16x16f16: |
| 5031 | case Intrinsic::amdgcn_mfma_i32_16x16x4i8: |
| 5032 | case Intrinsic::amdgcn_mfma_i32_16x16x16i8: |
| 5033 | case Intrinsic::amdgcn_mfma_f32_16x16x2bf16: |
| 5034 | case Intrinsic::amdgcn_mfma_f32_16x16x8bf16: |
| 5035 | case Intrinsic::amdgcn_mfma_f32_32x32x1f32: |
| 5036 | case Intrinsic::amdgcn_mfma_f32_32x32x2f32: |
| 5037 | case Intrinsic::amdgcn_mfma_f32_32x32x4f16: |
| 5038 | case Intrinsic::amdgcn_mfma_f32_32x32x8f16: |
| 5039 | case Intrinsic::amdgcn_mfma_i32_32x32x4i8: |
| 5040 | case Intrinsic::amdgcn_mfma_i32_32x32x8i8: |
| 5041 | case Intrinsic::amdgcn_mfma_f32_32x32x2bf16: |
| 5042 | case Intrinsic::amdgcn_mfma_f32_32x32x4bf16: |
| 5043 | case Intrinsic::amdgcn_mfma_f32_32x32x4bf16_1k: |
| 5044 | case Intrinsic::amdgcn_mfma_f32_16x16x4bf16_1k: |
| 5045 | case Intrinsic::amdgcn_mfma_f32_4x4x4bf16_1k: |
| 5046 | case Intrinsic::amdgcn_mfma_f32_32x32x8bf16_1k: |
| 5047 | case Intrinsic::amdgcn_mfma_f32_16x16x16bf16_1k: |
| 5048 | case Intrinsic::amdgcn_mfma_f64_16x16x4f64: |
| 5049 | case Intrinsic::amdgcn_mfma_f64_4x4x4f64: |
| 5050 | case Intrinsic::amdgcn_mfma_i32_16x16x32_i8: |
| 5051 | case Intrinsic::amdgcn_mfma_i32_32x32x16_i8: |
| 5052 | case Intrinsic::amdgcn_mfma_f32_16x16x8_xf32: |
| 5053 | case Intrinsic::amdgcn_mfma_f32_32x32x4_xf32: |
| 5054 | case Intrinsic::amdgcn_mfma_f32_16x16x32_bf8_bf8: |
| 5055 | case Intrinsic::amdgcn_mfma_f32_16x16x32_bf8_fp8: |
| 5056 | case Intrinsic::amdgcn_mfma_f32_16x16x32_fp8_bf8: |
| 5057 | case Intrinsic::amdgcn_mfma_f32_16x16x32_fp8_fp8: |
| 5058 | case Intrinsic::amdgcn_mfma_f32_32x32x16_bf8_bf8: |
| 5059 | case Intrinsic::amdgcn_mfma_f32_32x32x16_bf8_fp8: |
| 5060 | case Intrinsic::amdgcn_mfma_f32_32x32x16_fp8_bf8: |
| 5061 | case Intrinsic::amdgcn_mfma_f32_32x32x16_fp8_fp8: |
| 5062 | case Intrinsic::amdgcn_mfma_f32_16x16x32_f16: |
| 5063 | case Intrinsic::amdgcn_mfma_f32_32x32x16_f16: |
| 5064 | case Intrinsic::amdgcn_mfma_i32_16x16x64_i8: |
| 5065 | case Intrinsic::amdgcn_mfma_i32_32x32x32_i8: |
| 5066 | case Intrinsic::amdgcn_mfma_f32_16x16x32_bf16: { |
| 5067 | unsigned DstSize = MRI.getType(Reg: MI.getOperand(i: 0).getReg()).getSizeInBits(); |
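|      | // Minimum number of 32-bit registers needed to hold the accumulator result. |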
| 5068 | unsigned MinNumRegsRequired = DstSize / 32; |
| 5069 | |
| 5070 | // Default for MAI intrinsics. |
| 5071 | // srcC can also be an immediate which can be folded later. |
| 5072 | // FIXME: Should we eventually add an alternative mapping with AGPR src |
| 5073 | // for srcA/srcB? |
| 5074 | // |
| 5075 | // vdst, srcA, srcB, srcC |
| 5076 | const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>(); |
| 5077 | |
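|      | // gfx908 only has MFMA forms that take AGPRs for vdst/srcC. From gfx90a |
|      | // onwards a VGPR form also exists, and SIMachineFunctionInfo decides which |
|      | // form to prefer for this result size. |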
| 5078 | bool UseAGPRForm = !Subtarget.hasGFX90AInsts() || |
| 5079 | Info->selectAGPRFormMFMA(NumRegs: MinNumRegsRequired); |
| 5080 | |
| 5081 | OpdsMapping[0] = |
| 5082 | UseAGPRForm ? getAGPROpMapping(Reg: MI.getOperand(i: 0).getReg(), MRI, TRI: *TRI) |
| 5083 | : getVGPROpMapping(Reg: MI.getOperand(i: 0).getReg(), MRI, TRI: *TRI); |
| 5084 | OpdsMapping[2] = getVGPROpMapping(Reg: MI.getOperand(i: 2).getReg(), MRI, TRI: *TRI); |
| 5085 | OpdsMapping[3] = getVGPROpMapping(Reg: MI.getOperand(i: 3).getReg(), MRI, TRI: *TRI); |
| 5086 | OpdsMapping[4] = |
| 5087 | UseAGPRForm ? getAGPROpMapping(Reg: MI.getOperand(i: 4).getReg(), MRI, TRI: *TRI) |
| 5088 | : getVGPROpMapping(Reg: MI.getOperand(i: 4).getReg(), MRI, TRI: *TRI); |
| 5089 | break; |
| 5090 | } |
| 5091 | case Intrinsic::amdgcn_mfma_scale_f32_16x16x128_f8f6f4: |
| 5092 | case Intrinsic::amdgcn_mfma_scale_f32_32x32x64_f8f6f4: { |
| 5093 | unsigned DstSize = MRI.getType(Reg: MI.getOperand(i: 0).getReg()).getSizeInBits(); |
| 5094 | unsigned MinNumRegsRequired = DstSize / 32; |
| 5095 | |
| 5096 | const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>(); |
| 5097 | bool UseAGPRForm = Info->selectAGPRFormMFMA(NumRegs: MinNumRegsRequired); |
| 5098 | |
| 5099 | OpdsMapping[0] = |
| 5100 | UseAGPRForm ? getAGPROpMapping(Reg: MI.getOperand(i: 0).getReg(), MRI, TRI: *TRI) |
| 5101 | : getVGPROpMapping(Reg: MI.getOperand(i: 0).getReg(), MRI, TRI: *TRI); |
| 5102 | |
| 5103 | OpdsMapping[2] = getVGPROpMapping(Reg: MI.getOperand(i: 2).getReg(), MRI, TRI: *TRI); |
| 5104 | OpdsMapping[3] = getVGPROpMapping(Reg: MI.getOperand(i: 3).getReg(), MRI, TRI: *TRI); |
| 5105 | OpdsMapping[4] = |
| 5106 | UseAGPRForm ? getAGPROpMapping(Reg: MI.getOperand(i: 4).getReg(), MRI, TRI: *TRI) |
| 5107 | : getVGPROpMapping(Reg: MI.getOperand(i: 4).getReg(), MRI, TRI: *TRI); |
| 5108 | |
| 5109 | OpdsMapping[8] = getVGPROpMapping(Reg: MI.getOperand(i: 8).getReg(), MRI, TRI: *TRI); |
| 5110 | OpdsMapping[10] = getVGPROpMapping(Reg: MI.getOperand(i: 10).getReg(), MRI, TRI: *TRI); |
| 5111 | break; |
| 5112 | } |
| 5113 | case Intrinsic::amdgcn_smfmac_f32_16x16x32_f16: |
| 5114 | case Intrinsic::amdgcn_smfmac_f32_32x32x16_f16: |
| 5115 | case Intrinsic::amdgcn_smfmac_f32_16x16x32_bf16: |
| 5116 | case Intrinsic::amdgcn_smfmac_f32_32x32x16_bf16: |
| 5117 | case Intrinsic::amdgcn_smfmac_i32_16x16x64_i8: |
| 5118 | case Intrinsic::amdgcn_smfmac_i32_32x32x32_i8: |
| 5119 | case Intrinsic::amdgcn_smfmac_f32_16x16x64_bf8_bf8: |
| 5120 | case Intrinsic::amdgcn_smfmac_f32_16x16x64_bf8_fp8: |
| 5121 | case Intrinsic::amdgcn_smfmac_f32_16x16x64_fp8_bf8: |
| 5122 | case Intrinsic::amdgcn_smfmac_f32_16x16x64_fp8_fp8: |
| 5123 | case Intrinsic::amdgcn_smfmac_f32_32x32x32_bf8_bf8: |
| 5124 | case Intrinsic::amdgcn_smfmac_f32_32x32x32_bf8_fp8: |
| 5125 | case Intrinsic::amdgcn_smfmac_f32_32x32x32_fp8_bf8: |
| 5126 | case Intrinsic::amdgcn_smfmac_f32_32x32x32_fp8_fp8: |
| 5127 | case Intrinsic::amdgcn_smfmac_f32_16x16x64_f16: |
| 5128 | case Intrinsic::amdgcn_smfmac_f32_32x32x32_f16: |
| 5129 | case Intrinsic::amdgcn_smfmac_f32_16x16x64_bf16: |
| 5130 | case Intrinsic::amdgcn_smfmac_f32_32x32x32_bf16: |
| 5131 | case Intrinsic::amdgcn_smfmac_i32_16x16x128_i8: |
| 5132 | case Intrinsic::amdgcn_smfmac_i32_32x32x64_i8: |
| 5133 | case Intrinsic::amdgcn_smfmac_f32_16x16x128_bf8_bf8: |
| 5134 | case Intrinsic::amdgcn_smfmac_f32_16x16x128_bf8_fp8: |
| 5135 | case Intrinsic::amdgcn_smfmac_f32_16x16x128_fp8_bf8: |
| 5136 | case Intrinsic::amdgcn_smfmac_f32_16x16x128_fp8_fp8: |
| 5137 | case Intrinsic::amdgcn_smfmac_f32_32x32x64_bf8_bf8: |
| 5138 | case Intrinsic::amdgcn_smfmac_f32_32x32x64_bf8_fp8: |
| 5139 | case Intrinsic::amdgcn_smfmac_f32_32x32x64_fp8_bf8: |
| 5140 | case Intrinsic::amdgcn_smfmac_f32_32x32x64_fp8_fp8: { |
| 5141 | Register DstReg = MI.getOperand(i: 0).getReg(); |
| 5142 | unsigned DstSize = MRI.getType(Reg: DstReg).getSizeInBits(); |
| 5143 | unsigned MinNumRegsRequired = DstSize / 32; |
| 5144 | const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>(); |
| 5145 | bool UseAGPRForm = Info->selectAGPRFormMFMA(NumRegs: MinNumRegsRequired); |
| 5146 | |
| 5147 | // vdst, srcA, srcB, srcC, idx |
| 5148 | OpdsMapping[0] = UseAGPRForm ? getAGPROpMapping(Reg: DstReg, MRI, TRI: *TRI) |
| 5149 | : getVGPROpMapping(Reg: DstReg, MRI, TRI: *TRI); |
| 5150 | |
| 5151 | OpdsMapping[2] = getVGPROpMapping(Reg: MI.getOperand(i: 2).getReg(), MRI, TRI: *TRI); |
| 5152 | OpdsMapping[3] = getVGPROpMapping(Reg: MI.getOperand(i: 3).getReg(), MRI, TRI: *TRI); |
| 5153 | OpdsMapping[4] = |
| 5154 | UseAGPRForm ? getAGPROpMapping(Reg: MI.getOperand(i: 4).getReg(), MRI, TRI: *TRI) |
| 5155 | : getVGPROpMapping(Reg: MI.getOperand(i: 4).getReg(), MRI, TRI: *TRI); |
| 5156 | OpdsMapping[5] = getVGPROpMapping(Reg: MI.getOperand(i: 5).getReg(), MRI, TRI: *TRI); |
| 5157 | break; |
| 5158 | } |
| 5159 | case Intrinsic::amdgcn_interp_p1: |
| 5160 | case Intrinsic::amdgcn_interp_p2: |
| 5161 | case Intrinsic::amdgcn_interp_mov: |
| 5162 | case Intrinsic::amdgcn_interp_p1_f16: |
| 5163 | case Intrinsic::amdgcn_interp_p2_f16: |
| 5164 | case Intrinsic::amdgcn_lds_param_load: { |
| 5165 | const int M0Idx = MI.getNumOperands() - 1; |
| 5166 | Register M0Reg = MI.getOperand(i: M0Idx).getReg(); |
| 5167 | unsigned M0Bank = getRegBankID(Reg: M0Reg, MRI, Default: AMDGPU::SGPRRegBankID); |
| 5168 | unsigned DstSize = MRI.getType(Reg: MI.getOperand(i: 0).getReg()).getSizeInBits(); |
| 5169 | |
| 5170 | OpdsMapping[0] = AMDGPU::getValueMapping(BankID: AMDGPU::VGPRRegBankID, Size: DstSize); |
| 5171 | for (int I = 2; I != M0Idx && MI.getOperand(i: I).isReg(); ++I) |
| 5172 | OpdsMapping[I] = AMDGPU::getValueMapping(BankID: AMDGPU::VGPRRegBankID, Size: 32); |
| 5173 | |
| 5174 | // This must be an SGPR, but take whatever the original bank is and fix it |
| 5175 | // later. |
| 5176 | OpdsMapping[M0Idx] = AMDGPU::getValueMapping(BankID: M0Bank, Size: 32); |
| 5177 | break; |
| 5178 | } |
| 5179 | case Intrinsic::amdgcn_interp_inreg_p10: |
| 5180 | case Intrinsic::amdgcn_interp_inreg_p2: |
| 5181 | case Intrinsic::amdgcn_interp_inreg_p10_f16: |
| 5182 | case Intrinsic::amdgcn_interp_inreg_p2_f16: |
| 5183 | case Intrinsic::amdgcn_interp_p10_rtz_f16: |
| 5184 | case Intrinsic::amdgcn_interp_p2_rtz_f16: { |
| 5185 | unsigned DstSize = MRI.getType(Reg: MI.getOperand(i: 0).getReg()).getSizeInBits(); |
| 5186 | OpdsMapping[0] = AMDGPU::getValueMapping(BankID: AMDGPU::VGPRRegBankID, Size: DstSize); |
| 5187 | OpdsMapping[2] = AMDGPU::getValueMapping(BankID: AMDGPU::VGPRRegBankID, Size: 32); |
| 5188 | OpdsMapping[3] = AMDGPU::getValueMapping(BankID: AMDGPU::VGPRRegBankID, Size: 32); |
| 5189 | OpdsMapping[4] = AMDGPU::getValueMapping(BankID: AMDGPU::VGPRRegBankID, Size: 32); |
| 5190 | break; |
| 5191 | } |
| 5192 | case Intrinsic::amdgcn_permlane16_swap: |
| 5193 | case Intrinsic::amdgcn_permlane32_swap: { |
| 5194 | unsigned DstSize = MRI.getType(Reg: MI.getOperand(i: 0).getReg()).getSizeInBits(); |
| 5195 | OpdsMapping[0] = OpdsMapping[1] = OpdsMapping[3] = OpdsMapping[4] = |
| 5196 | AMDGPU::getValueMapping(BankID: AMDGPU::VGPRRegBankID, Size: DstSize); |
| 5197 | break; |
| 5198 | } |
| 5199 | case Intrinsic::amdgcn_ballot: { |
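|      | // The ballot result is a wave-wide mask held in SGPRs rather than a boolean, |
|      | // so the destination uses the SGPR bank; the source condition is a VCC-bank |
|      | // boolean. |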
| 5200 | unsigned DstSize = MRI.getType(Reg: MI.getOperand(i: 0).getReg()).getSizeInBits(); |
| 5201 | unsigned SrcSize = MRI.getType(Reg: MI.getOperand(i: 2).getReg()).getSizeInBits(); |
| 5202 | OpdsMapping[0] = AMDGPU::getValueMapping(BankID: AMDGPU::SGPRRegBankID, Size: DstSize); |
| 5203 | OpdsMapping[2] = AMDGPU::getValueMapping(BankID: AMDGPU::VCCRegBankID, Size: SrcSize); |
| 5204 | break; |
| 5205 | } |
| 5206 | case Intrinsic::amdgcn_inverse_ballot: { |
| 5207 | // This must be an SGPR, but accept a VGPR. |
| 5208 | Register MaskReg = MI.getOperand(i: 2).getReg(); |
| 5209 | unsigned MaskSize = MRI.getType(Reg: MaskReg).getSizeInBits(); |
| 5210 | unsigned MaskBank = getRegBankID(Reg: MaskReg, MRI, Default: AMDGPU::SGPRRegBankID); |
| 5211 | OpdsMapping[0] = AMDGPU::getValueMapping(BankID: AMDGPU::VCCRegBankID, Size: 1); |
| 5212 | OpdsMapping[2] = AMDGPU::getValueMapping(BankID: MaskBank, Size: MaskSize); |
| 5213 | break; |
| 5214 | } |
| 5215 | case Intrinsic::amdgcn_bitop3: { |
| 5216 | unsigned Size = getSizeInBits(Reg: MI.getOperand(i: 0).getReg(), MRI, TRI: *TRI); |
| 5217 | OpdsMapping[0] = AMDGPU::getValueMapping(BankID: AMDGPU::VGPRRegBankID, Size); |
| 5218 | OpdsMapping[2] = AMDGPU::getValueMapping(BankID: AMDGPU::VGPRRegBankID, Size); |
| 5219 | OpdsMapping[3] = AMDGPU::getValueMapping(BankID: AMDGPU::VGPRRegBankID, Size); |
| 5220 | OpdsMapping[4] = AMDGPU::getValueMapping(BankID: AMDGPU::VGPRRegBankID, Size); |
| 5221 | break; |
| 5222 | } |
| 5223 | case Intrinsic::amdgcn_s_quadmask: |
| 5224 | case Intrinsic::amdgcn_s_wqm: { |
| 5225 | Register MaskReg = MI.getOperand(i: 2).getReg(); |
| 5226 | unsigned MaskSize = MRI.getType(Reg: MaskReg).getSizeInBits(); |
| 5227 | unsigned MaskBank = getRegBankID(Reg: MaskReg, MRI, Default: AMDGPU::SGPRRegBankID); |
| 5228 | OpdsMapping[0] = AMDGPU::getValueMapping(BankID: AMDGPU::SGPRRegBankID, Size: MaskSize); |
| 5229 | OpdsMapping[2] = AMDGPU::getValueMapping(BankID: MaskBank, Size: MaskSize); |
| 5230 | break; |
| 5231 | } |
| 5232 | case Intrinsic::amdgcn_wave_reduce_add: |
| 5233 | case Intrinsic::amdgcn_wave_reduce_fadd: |
| 5234 | case Intrinsic::amdgcn_wave_reduce_sub: |
| 5235 | case Intrinsic::amdgcn_wave_reduce_fsub: |
| 5236 | case Intrinsic::amdgcn_wave_reduce_min: |
| 5237 | case Intrinsic::amdgcn_wave_reduce_umin: |
| 5238 | case Intrinsic::amdgcn_wave_reduce_fmin: |
| 5239 | case Intrinsic::amdgcn_wave_reduce_max: |
| 5240 | case Intrinsic::amdgcn_wave_reduce_umax: |
| 5241 | case Intrinsic::amdgcn_wave_reduce_fmax: |
| 5242 | case Intrinsic::amdgcn_wave_reduce_and: |
| 5243 | case Intrinsic::amdgcn_wave_reduce_or: |
| 5244 | case Intrinsic::amdgcn_wave_reduce_xor: { |
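|      | // Wave reductions produce a single uniform value, so the result is always an |
|      | // SGPR; the source only keeps a scalar mapping if it is already uniform. |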
| 5245 | unsigned DstSize = MRI.getType(Reg: MI.getOperand(i: 0).getReg()).getSizeInBits(); |
| 5246 | OpdsMapping[0] = AMDGPU::getValueMapping(BankID: AMDGPU::SGPRRegBankID, Size: DstSize); |
| 5247 | unsigned OpSize = MRI.getType(Reg: MI.getOperand(i: 2).getReg()).getSizeInBits(); |
| 5248 | unsigned RegBankID = |
| 5249 | isSALUMapping(MI) ? AMDGPU::SGPRRegBankID : AMDGPU::VGPRRegBankID; |
| 5250 | OpdsMapping[2] = AMDGPU::getValueMapping(BankID: RegBankID, Size: OpSize); |
| 5251 | break; |
| 5252 | } |
| 5253 | case Intrinsic::amdgcn_s_bitreplicate: { |
| 5254 | Register MaskReg = MI.getOperand(i: 2).getReg(); |
| 5255 | unsigned MaskBank = getRegBankID(Reg: MaskReg, MRI, Default: AMDGPU::SGPRRegBankID); |
| 5256 | OpdsMapping[0] = AMDGPU::getValueMapping(BankID: AMDGPU::SGPRRegBankID, Size: 64); |
| 5257 | OpdsMapping[2] = AMDGPU::getValueMapping(BankID: MaskBank, Size: 32); |
| 5258 | break; |
| 5259 | } |
| 5260 | case Intrinsic::amdgcn_wave_shuffle: { |
| 5261 | unsigned OpSize = MRI.getType(Reg: MI.getOperand(i: 0).getReg()).getSizeInBits(); |
| 5262 | OpdsMapping[0] = AMDGPU::getValueMapping(BankID: AMDGPU::VGPRRegBankID, Size: OpSize); |
| 5263 | OpdsMapping[2] = AMDGPU::getValueMapping(BankID: AMDGPU::VGPRRegBankID, Size: OpSize); |
| 5264 | OpdsMapping[3] = AMDGPU::getValueMapping(BankID: AMDGPU::VGPRRegBankID, Size: OpSize); |
| 5265 | break; |
| 5266 | } |
| 5267 | } |
| 5268 | break; |
| 5269 | } |
| 5270 | case AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD: |
| 5271 | case AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD_D16: |
| 5272 | case AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD_NORET: |
| 5273 | case AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE: |
| 5274 | case AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE_D16: { |
| 5275 | auto IntrID = AMDGPU::getIntrinsicID(I: MI); |
| 5276 | const AMDGPU::RsrcIntrinsic *RSrcIntrin = AMDGPU::lookupRsrcIntrinsic(Intr: IntrID); |
| 5277 | assert(RSrcIntrin && "missing RsrcIntrinsic for image intrinsic"); |
| 5278 | // Non-images can have complications from operands that allow both SGPR |
| 5279 | // and VGPR. For now it's too complicated to figure out the final opcode |
| 5280 | // to derive the register bank from the MCInstrDesc. |
| 5281 | assert(RSrcIntrin->IsImage); |
| 5282 | return getImageMapping(MRI, MI, RsrcIdx: RSrcIntrin->RsrcArg); |
| 5283 | } |
| 5284 | case AMDGPU::G_AMDGPU_BVH_INTERSECT_RAY: |
| 5285 | case AMDGPU::G_AMDGPU_BVH8_INTERSECT_RAY: |
| 5286 | case AMDGPU::G_AMDGPU_BVH_DUAL_INTERSECT_RAY: { |
| 5287 | bool IsDualOrBVH8 = |
| 5288 | MI.getOpcode() == AMDGPU::G_AMDGPU_BVH_DUAL_INTERSECT_RAY || |
| 5289 | MI.getOpcode() == AMDGPU::G_AMDGPU_BVH8_INTERSECT_RAY; |
| 5290 | unsigned NumMods = IsDualOrBVH8 ? 0 : 1; // Has A16 modifier |
| 5291 | unsigned LastRegOpIdx = MI.getNumExplicitOperands() - 1 - NumMods; |
| 5292 | unsigned DstSize = MRI.getType(Reg: MI.getOperand(i: 0).getReg()).getSizeInBits(); |
| 5293 | OpdsMapping[0] = AMDGPU::getValueMapping(BankID: AMDGPU::VGPRRegBankID, Size: DstSize); |
| 5294 | if (IsDualOrBVH8) { |
| 5295 | OpdsMapping[1] = AMDGPU::getValueMapping( |
| 5296 | BankID: AMDGPU::VGPRRegBankID, |
| 5297 | Size: MRI.getType(Reg: MI.getOperand(i: 1).getReg()).getSizeInBits()); |
| 5298 | OpdsMapping[2] = AMDGPU::getValueMapping( |
| 5299 | BankID: AMDGPU::VGPRRegBankID, |
| 5300 | Size: MRI.getType(Reg: MI.getOperand(i: 2).getReg()).getSizeInBits()); |
| 5301 | } |
| 5302 | OpdsMapping[LastRegOpIdx] = |
| 5303 | getSGPROpMapping(Reg: MI.getOperand(i: LastRegOpIdx).getReg(), MRI, TRI: *TRI); |
| 5304 | if (LastRegOpIdx == 3) { |
| 5305 | // Sequential form: all operands combined into VGPR256/VGPR512 |
| 5306 | unsigned Size = MRI.getType(Reg: MI.getOperand(i: 2).getReg()).getSizeInBits(); |
| 5307 | if (Size > 256) |
| 5308 | Size = 512; |
| 5309 | OpdsMapping[2] = AMDGPU::getValueMapping(BankID: AMDGPU::VGPRRegBankID, Size); |
| 5310 | } else { |
| 5311 | // NSA form |
| 5312 | unsigned FirstSrcOpIdx = IsDualOrBVH8 ? 4 : 2; |
| 5313 | for (unsigned I = FirstSrcOpIdx; I < LastRegOpIdx; ++I) { |
| 5314 | unsigned Size = MRI.getType(Reg: MI.getOperand(i: I).getReg()).getSizeInBits(); |
| 5315 | OpdsMapping[I] = AMDGPU::getValueMapping(BankID: AMDGPU::VGPRRegBankID, Size); |
| 5316 | } |
| 5317 | } |
| 5318 | break; |
| 5319 | } |
| 5320 | case AMDGPU::G_INTRINSIC_W_SIDE_EFFECTS: |
| 5321 | case AMDGPU::G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS: { |
| 5322 | auto IntrID = cast<GIntrinsic>(Val: MI).getIntrinsicID(); |
| 5323 | switch (IntrID) { |
| 5324 | case Intrinsic::amdgcn_s_getreg: |
| 5325 | case Intrinsic::amdgcn_s_memtime: |
| 5326 | case Intrinsic::amdgcn_s_memrealtime: |
| 5327 | case Intrinsic::amdgcn_s_get_waveid_in_workgroup: |
| 5328 | case Intrinsic::amdgcn_s_sendmsg_rtn: { |
| 5329 | unsigned Size = MRI.getType(Reg: MI.getOperand(i: 0).getReg()).getSizeInBits(); |
| 5330 | OpdsMapping[0] = AMDGPU::getValueMapping(BankID: AMDGPU::SGPRRegBankID, Size); |
| 5331 | break; |
| 5332 | } |
| 5333 | case Intrinsic::amdgcn_global_atomic_fmin_num: |
| 5334 | case Intrinsic::amdgcn_global_atomic_fmax_num: |
| 5335 | case Intrinsic::amdgcn_flat_atomic_fmin_num: |
| 5336 | case Intrinsic::amdgcn_flat_atomic_fmax_num: |
| 5337 | case Intrinsic::amdgcn_global_atomic_ordered_add_b64: |
| 5338 | case Intrinsic::amdgcn_global_load_tr_b64: |
| 5339 | case Intrinsic::amdgcn_global_load_tr_b128: |
| 5340 | case Intrinsic::amdgcn_global_load_tr4_b64: |
| 5341 | case Intrinsic::amdgcn_global_load_tr6_b96: |
| 5342 | case Intrinsic::amdgcn_ds_load_tr8_b64: |
| 5343 | case Intrinsic::amdgcn_ds_load_tr16_b128: |
| 5344 | case Intrinsic::amdgcn_ds_load_tr4_b64: |
| 5345 | case Intrinsic::amdgcn_ds_load_tr6_b96: |
| 5346 | case Intrinsic::amdgcn_flat_load_monitor_b32: |
| 5347 | case Intrinsic::amdgcn_flat_load_monitor_b64: |
| 5348 | case Intrinsic::amdgcn_flat_load_monitor_b128: |
| 5349 | case Intrinsic::amdgcn_global_load_monitor_b32: |
| 5350 | case Intrinsic::amdgcn_global_load_monitor_b64: |
| 5351 | case Intrinsic::amdgcn_global_load_monitor_b128: |
| 5352 | case Intrinsic::amdgcn_ds_read_tr4_b64: |
| 5353 | case Intrinsic::amdgcn_ds_read_tr6_b96: |
| 5354 | case Intrinsic::amdgcn_ds_read_tr8_b64: |
| 5355 | case Intrinsic::amdgcn_ds_read_tr16_b64: |
| 5356 | case Intrinsic::amdgcn_ds_atomic_async_barrier_arrive_b64: |
| 5357 | case Intrinsic::amdgcn_ds_atomic_barrier_arrive_rtn_b64: |
| 5358 | return getDefaultMappingAllVGPR(MI); |
| 5359 | case Intrinsic::amdgcn_ds_ordered_add: |
| 5360 | case Intrinsic::amdgcn_ds_ordered_swap: { |
| 5361 | unsigned DstSize = MRI.getType(Reg: MI.getOperand(i: 0).getReg()).getSizeInBits(); |
| 5362 | OpdsMapping[0] = AMDGPU::getValueMapping(BankID: AMDGPU::VGPRRegBankID, Size: DstSize); |
| 5363 | unsigned M0Bank = getRegBankID(Reg: MI.getOperand(i: 2).getReg(), MRI, |
| 5364 | Default: AMDGPU::SGPRRegBankID); |
| 5365 | OpdsMapping[2] = AMDGPU::getValueMapping(BankID: M0Bank, Size: 32); |
| 5366 | OpdsMapping[3] = AMDGPU::getValueMapping(BankID: AMDGPU::VGPRRegBankID, Size: 32); |
| 5367 | break; |
| 5368 | } |
| 5369 | case Intrinsic::amdgcn_ds_append: |
| 5370 | case Intrinsic::amdgcn_ds_consume: { |
| 5371 | unsigned DstSize = MRI.getType(Reg: MI.getOperand(i: 0).getReg()).getSizeInBits(); |
| 5372 | OpdsMapping[0] = AMDGPU::getValueMapping(BankID: AMDGPU::VGPRRegBankID, Size: DstSize); |
| 5373 | OpdsMapping[2] = getSGPROpMapping(Reg: MI.getOperand(i: 2).getReg(), MRI, TRI: *TRI); |
| 5374 | break; |
| 5375 | } |
| 5376 | case Intrinsic::amdgcn_exp_compr: |
| 5377 | OpdsMapping[3] = AMDGPU::getValueMapping(BankID: AMDGPU::VGPRRegBankID, Size: 32); |
| 5378 | OpdsMapping[4] = AMDGPU::getValueMapping(BankID: AMDGPU::VGPRRegBankID, Size: 32); |
| 5379 | break; |
| 5380 | case Intrinsic::amdgcn_exp: |
| 5381 | // FIXME: Could we support packed types here? |
| 5382 | OpdsMapping[3] = AMDGPU::getValueMapping(BankID: AMDGPU::VGPRRegBankID, Size: 32); |
| 5383 | OpdsMapping[4] = AMDGPU::getValueMapping(BankID: AMDGPU::VGPRRegBankID, Size: 32); |
| 5384 | OpdsMapping[5] = AMDGPU::getValueMapping(BankID: AMDGPU::VGPRRegBankID, Size: 32); |
| 5385 | OpdsMapping[6] = AMDGPU::getValueMapping(BankID: AMDGPU::VGPRRegBankID, Size: 32); |
| 5386 | break; |
| 5387 | case Intrinsic::amdgcn_exp_row: |
| 5388 | OpdsMapping[3] = AMDGPU::getValueMapping(BankID: AMDGPU::VGPRRegBankID, Size: 32); |
| 5389 | OpdsMapping[4] = AMDGPU::getValueMapping(BankID: AMDGPU::VGPRRegBankID, Size: 32); |
| 5390 | OpdsMapping[5] = AMDGPU::getValueMapping(BankID: AMDGPU::VGPRRegBankID, Size: 32); |
| 5391 | OpdsMapping[6] = AMDGPU::getValueMapping(BankID: AMDGPU::VGPRRegBankID, Size: 32); |
| 5392 | OpdsMapping[8] = getSGPROpMapping(Reg: MI.getOperand(i: 8).getReg(), MRI, TRI: *TRI); |
| 5393 | break; |
| 5394 | case Intrinsic::amdgcn_s_sendmsg: |
| 5395 | case Intrinsic::amdgcn_s_sendmsghalt: { |
| 5396 | // This must be an SGPR, but accept a VGPR. |
| 5397 | unsigned Bank = getRegBankID(Reg: MI.getOperand(i: 2).getReg(), MRI, |
| 5398 | Default: AMDGPU::SGPRRegBankID); |
| 5399 | OpdsMapping[2] = AMDGPU::getValueMapping(BankID: Bank, Size: 32); |
| 5400 | break; |
| 5401 | } |
| 5402 | case Intrinsic::amdgcn_s_setreg: { |
| 5403 | // This must be an SGPR, but accept a VGPR. |
| 5404 | unsigned Bank = getRegBankID(Reg: MI.getOperand(i: 2).getReg(), MRI, |
| 5405 | Default: AMDGPU::SGPRRegBankID); |
| 5406 | OpdsMapping[2] = AMDGPU::getValueMapping(BankID: Bank, Size: 32); |
| 5407 | break; |
| 5408 | } |
| 5409 | case Intrinsic::amdgcn_s_ttracedata: { |
| 5410 | // This must be an SGPR, but accept a VGPR. |
| 5411 | unsigned Bank = |
| 5412 | getRegBankID(Reg: MI.getOperand(i: 1).getReg(), MRI, Default: AMDGPU::SGPRRegBankID); |
| 5413 | OpdsMapping[1] = AMDGPU::getValueMapping(BankID: Bank, Size: 32); |
| 5414 | break; |
| 5415 | } |
| 5416 | case Intrinsic::amdgcn_end_cf: { |
| 5417 | unsigned Size = getSizeInBits(Reg: MI.getOperand(i: 1).getReg(), MRI, TRI: *TRI); |
| 5418 | OpdsMapping[1] = AMDGPU::getValueMapping(BankID: AMDGPU::SGPRRegBankID, Size); |
| 5419 | break; |
| 5420 | } |
| 5421 | case Intrinsic::amdgcn_else: { |
| 5422 | unsigned WaveSize = getSizeInBits(Reg: MI.getOperand(i: 1).getReg(), MRI, TRI: *TRI); |
| 5423 | OpdsMapping[0] = AMDGPU::getValueMapping(BankID: AMDGPU::VCCRegBankID, Size: 1); |
| 5424 | OpdsMapping[1] = AMDGPU::getValueMapping(BankID: AMDGPU::SGPRRegBankID, Size: WaveSize); |
| 5425 | OpdsMapping[3] = AMDGPU::getValueMapping(BankID: AMDGPU::SGPRRegBankID, Size: WaveSize); |
| 5426 | break; |
| 5427 | } |
| 5428 | case Intrinsic::amdgcn_init_whole_wave: |
| 5429 | case Intrinsic::amdgcn_live_mask: { |
| 5430 | OpdsMapping[0] = AMDGPU::getValueMapping(BankID: AMDGPU::VCCRegBankID, Size: 1); |
| 5431 | break; |
| 5432 | } |
| 5433 | case Intrinsic::amdgcn_wqm_demote: |
| 5434 | case Intrinsic::amdgcn_kill: { |
| 5435 | OpdsMapping[1] = AMDGPU::getValueMapping(BankID: AMDGPU::VCCRegBankID, Size: 1); |
| 5436 | break; |
| 5437 | } |
| 5438 | case Intrinsic::amdgcn_raw_buffer_load: |
| 5439 | case Intrinsic::amdgcn_raw_ptr_buffer_load: |
| 5440 | case Intrinsic::amdgcn_raw_atomic_buffer_load: |
| 5441 | case Intrinsic::amdgcn_raw_ptr_atomic_buffer_load: |
| 5442 | case Intrinsic::amdgcn_raw_tbuffer_load: |
| 5443 | case Intrinsic::amdgcn_raw_ptr_tbuffer_load: { |
| 5444 | // FIXME: Should make intrinsic ID the last operand of the instruction, |
| 5445 | // then this would be the same as store |
| 5446 | OpdsMapping[0] = getVGPROpMapping(Reg: MI.getOperand(i: 0).getReg(), MRI, TRI: *TRI); |
| 5447 | OpdsMapping[2] = getSGPROpMapping(Reg: MI.getOperand(i: 2).getReg(), MRI, TRI: *TRI); |
| 5448 | OpdsMapping[3] = getVGPROpMapping(Reg: MI.getOperand(i: 3).getReg(), MRI, TRI: *TRI); |
| 5449 | OpdsMapping[4] = getSGPROpMapping(Reg: MI.getOperand(i: 4).getReg(), MRI, TRI: *TRI); |
| 5450 | break; |
| 5451 | } |
| 5452 | case Intrinsic::amdgcn_raw_buffer_load_lds: |
| 5453 | case Intrinsic::amdgcn_raw_ptr_buffer_load_lds: { |
| 5454 | OpdsMapping[1] = getSGPROpMapping(Reg: MI.getOperand(i: 1).getReg(), MRI, TRI: *TRI); |
| 5455 | OpdsMapping[2] = getSGPROpMapping(Reg: MI.getOperand(i: 2).getReg(), MRI, TRI: *TRI); |
| 5456 | OpdsMapping[4] = getVGPROpMapping(Reg: MI.getOperand(i: 4).getReg(), MRI, TRI: *TRI); |
| 5457 | OpdsMapping[5] = getSGPROpMapping(Reg: MI.getOperand(i: 5).getReg(), MRI, TRI: *TRI); |
| 5458 | break; |
| 5459 | } |
| 5460 | case Intrinsic::amdgcn_raw_buffer_store: |
| 5461 | case Intrinsic::amdgcn_raw_ptr_buffer_store: |
| 5462 | case Intrinsic::amdgcn_raw_buffer_store_format: |
| 5463 | case Intrinsic::amdgcn_raw_ptr_buffer_store_format: |
| 5464 | case Intrinsic::amdgcn_raw_tbuffer_store: |
| 5465 | case Intrinsic::amdgcn_raw_ptr_tbuffer_store: { |
| 5466 | OpdsMapping[1] = getVGPROpMapping(Reg: MI.getOperand(i: 1).getReg(), MRI, TRI: *TRI); |
| 5467 | OpdsMapping[2] = getSGPROpMapping(Reg: MI.getOperand(i: 2).getReg(), MRI, TRI: *TRI); |
| 5468 | OpdsMapping[3] = getVGPROpMapping(Reg: MI.getOperand(i: 3).getReg(), MRI, TRI: *TRI); |
| 5469 | OpdsMapping[4] = getSGPROpMapping(Reg: MI.getOperand(i: 4).getReg(), MRI, TRI: *TRI); |
| 5470 | break; |
| 5471 | } |
| 5472 | case Intrinsic::amdgcn_struct_buffer_load: |
| 5473 | case Intrinsic::amdgcn_struct_ptr_buffer_load: |
| 5474 | case Intrinsic::amdgcn_struct_tbuffer_load: |
| 5475 | case Intrinsic::amdgcn_struct_ptr_tbuffer_load: |
| 5476 | case Intrinsic::amdgcn_struct_atomic_buffer_load: |
| 5477 | case Intrinsic::amdgcn_struct_ptr_atomic_buffer_load: { |
| 5478 | OpdsMapping[0] = getVGPROpMapping(Reg: MI.getOperand(i: 0).getReg(), MRI, TRI: *TRI); |
| 5479 | OpdsMapping[2] = getSGPROpMapping(Reg: MI.getOperand(i: 2).getReg(), MRI, TRI: *TRI); |
| 5480 | OpdsMapping[3] = getVGPROpMapping(Reg: MI.getOperand(i: 3).getReg(), MRI, TRI: *TRI); |
| 5481 | OpdsMapping[4] = getVGPROpMapping(Reg: MI.getOperand(i: 4).getReg(), MRI, TRI: *TRI); |
| 5482 | OpdsMapping[5] = getSGPROpMapping(Reg: MI.getOperand(i: 5).getReg(), MRI, TRI: *TRI); |
| 5483 | break; |
| 5484 | } |
| 5485 | case Intrinsic::amdgcn_struct_buffer_load_lds: |
| 5486 | case Intrinsic::amdgcn_struct_ptr_buffer_load_lds: { |
| 5487 | OpdsMapping[1] = getSGPROpMapping(Reg: MI.getOperand(i: 1).getReg(), MRI, TRI: *TRI); |
| 5488 | OpdsMapping[2] = getSGPROpMapping(Reg: MI.getOperand(i: 2).getReg(), MRI, TRI: *TRI); |
| 5489 | OpdsMapping[4] = getVGPROpMapping(Reg: MI.getOperand(i: 4).getReg(), MRI, TRI: *TRI); |
| 5490 | OpdsMapping[5] = getVGPROpMapping(Reg: MI.getOperand(i: 5).getReg(), MRI, TRI: *TRI); |
| 5491 | OpdsMapping[6] = getSGPROpMapping(Reg: MI.getOperand(i: 6).getReg(), MRI, TRI: *TRI); |
| 5492 | break; |
| 5493 | } |
| 5494 | case Intrinsic::amdgcn_struct_buffer_store: |
| 5495 | case Intrinsic::amdgcn_struct_ptr_buffer_store: |
| 5496 | case Intrinsic::amdgcn_struct_tbuffer_store: |
| 5497 | case Intrinsic::amdgcn_struct_ptr_tbuffer_store: { |
| 5498 | OpdsMapping[1] = getVGPROpMapping(Reg: MI.getOperand(i: 1).getReg(), MRI, TRI: *TRI); |
| 5499 | OpdsMapping[2] = getSGPROpMapping(Reg: MI.getOperand(i: 2).getReg(), MRI, TRI: *TRI); |
| 5500 | OpdsMapping[3] = getVGPROpMapping(Reg: MI.getOperand(i: 3).getReg(), MRI, TRI: *TRI); |
| 5501 | OpdsMapping[4] = getVGPROpMapping(Reg: MI.getOperand(i: 4).getReg(), MRI, TRI: *TRI); |
| 5502 | OpdsMapping[5] = getSGPROpMapping(Reg: MI.getOperand(i: 5).getReg(), MRI, TRI: *TRI); |
| 5503 | break; |
| 5504 | } |
| 5505 | case Intrinsic::amdgcn_init_exec_from_input: { |
| 5506 | unsigned Size = getSizeInBits(Reg: MI.getOperand(i: 1).getReg(), MRI, TRI: *TRI); |
| 5507 | OpdsMapping[1] = AMDGPU::getValueMapping(BankID: AMDGPU::SGPRRegBankID, Size); |
| 5508 | break; |
| 5509 | } |
| 5510 | case Intrinsic::amdgcn_ds_gws_init: |
| 5511 | case Intrinsic::amdgcn_ds_gws_barrier: |
| 5512 | case Intrinsic::amdgcn_ds_gws_sema_br: { |
| 5513 | OpdsMapping[1] = AMDGPU::getValueMapping(BankID: AMDGPU::VGPRRegBankID, Size: 32); |
| 5514 | |
| 5515 | // This must be an SGPR, but accept a VGPR. |
| 5516 | unsigned Bank = getRegBankID(Reg: MI.getOperand(i: 2).getReg(), MRI, |
| 5517 | Default: AMDGPU::SGPRRegBankID); |
| 5518 | OpdsMapping[2] = AMDGPU::getValueMapping(BankID: Bank, Size: 32); |
| 5519 | break; |
| 5520 | } |
| 5521 | case Intrinsic::amdgcn_ds_gws_sema_v: |
| 5522 | case Intrinsic::amdgcn_ds_gws_sema_p: |
| 5523 | case Intrinsic::amdgcn_ds_gws_sema_release_all: { |
| 5524 | // This must be an SGPR, but accept a VGPR. |
| 5525 | unsigned Bank = getRegBankID(Reg: MI.getOperand(i: 1).getReg(), MRI, |
| 5526 | Default: AMDGPU::SGPRRegBankID); |
| 5527 | OpdsMapping[1] = AMDGPU::getValueMapping(BankID: Bank, Size: 32); |
| 5528 | break; |
| 5529 | } |
| 5530 | case Intrinsic::amdgcn_cluster_load_b32: |
| 5531 | case Intrinsic::amdgcn_cluster_load_b64: |
| 5532 | case Intrinsic::amdgcn_cluster_load_b128: { |
| 5533 | OpdsMapping[0] = getVGPROpMapping(Reg: MI.getOperand(i: 0).getReg(), MRI, TRI: *TRI); |
| 5534 | OpdsMapping[2] = getVGPROpMapping(Reg: MI.getOperand(i: 2).getReg(), MRI, TRI: *TRI); |
| 5535 | unsigned M0Bank = |
| 5536 | getRegBankID(Reg: MI.getOperand(i: 4).getReg(), MRI, Default: AMDGPU::SGPRRegBankID); |
| 5537 | OpdsMapping[4] = AMDGPU::getValueMapping(BankID: M0Bank, Size: 32); |
| 5538 | break; |
| 5539 | } |
| 5540 | case Intrinsic::amdgcn_cluster_load_async_to_lds_b8: |
| 5541 | case Intrinsic::amdgcn_cluster_load_async_to_lds_b32: |
| 5542 | case Intrinsic::amdgcn_cluster_load_async_to_lds_b64: |
| 5543 | case Intrinsic::amdgcn_cluster_load_async_to_lds_b128: { |
| 5544 | OpdsMapping[1] = getVGPROpMapping(Reg: MI.getOperand(i: 1).getReg(), MRI, TRI: *TRI); |
| 5545 | OpdsMapping[2] = getSGPROpMapping(Reg: MI.getOperand(i: 2).getReg(), MRI, TRI: *TRI); |
| 5546 | unsigned M0Bank = |
| 5547 | getRegBankID(Reg: MI.getOperand(i: 5).getReg(), MRI, Default: AMDGPU::SGPRRegBankID); |
| 5548 | OpdsMapping[5] = AMDGPU::getValueMapping(BankID: M0Bank, Size: 32); |
| 5549 | break; |
| 5550 | } |
| 5551 | case Intrinsic::amdgcn_global_store_async_from_lds_b8: |
| 5552 | case Intrinsic::amdgcn_global_store_async_from_lds_b32: |
| 5553 | case Intrinsic::amdgcn_global_store_async_from_lds_b64: |
| 5554 | case Intrinsic::amdgcn_global_store_async_from_lds_b128: |
| 5555 | case Intrinsic::amdgcn_global_load_async_to_lds_b8: |
| 5556 | case Intrinsic::amdgcn_global_load_async_to_lds_b32: |
| 5557 | case Intrinsic::amdgcn_global_load_async_to_lds_b64: |
| 5558 | case Intrinsic::amdgcn_global_load_async_to_lds_b128: |
| 5559 | case Intrinsic::amdgcn_load_to_lds: |
| 5560 | case Intrinsic::amdgcn_global_load_lds: { |
| 5561 | OpdsMapping[1] = getVGPROpMapping(Reg: MI.getOperand(i: 1).getReg(), MRI, TRI: *TRI); |
| 5562 | OpdsMapping[2] = getSGPROpMapping(Reg: MI.getOperand(i: 2).getReg(), MRI, TRI: *TRI); |
| 5563 | break; |
| 5564 | } |
| 5565 | case Intrinsic::amdgcn_lds_direct_load: { |
| 5566 | const int M0Idx = MI.getNumOperands() - 1; |
| 5567 | Register M0Reg = MI.getOperand(i: M0Idx).getReg(); |
| 5568 | unsigned M0Bank = getRegBankID(Reg: M0Reg, MRI, Default: AMDGPU::SGPRRegBankID); |
| 5569 | unsigned DstSize = MRI.getType(Reg: MI.getOperand(i: 0).getReg()).getSizeInBits(); |
| 5570 | |
| 5571 | OpdsMapping[0] = AMDGPU::getValueMapping(BankID: AMDGPU::VGPRRegBankID, Size: DstSize); |
| 5572 | for (int I = 2; I != M0Idx && MI.getOperand(i: I).isReg(); ++I) |
| 5573 | OpdsMapping[I] = AMDGPU::getValueMapping(BankID: AMDGPU::VGPRRegBankID, Size: 32); |
| 5574 | |
| 5575 | // This must be an SGPR, but take whatever the original bank is and fix it |
| 5576 | // later. |
| 5577 | OpdsMapping[M0Idx] = AMDGPU::getValueMapping(BankID: M0Bank, Size: 32); |
| 5578 | break; |
| 5579 | } |
| 5580 | case Intrinsic::amdgcn_ds_add_gs_reg_rtn: |
| 5581 | case Intrinsic::amdgcn_ds_sub_gs_reg_rtn: |
| 5582 | OpdsMapping[0] = getVGPROpMapping(Reg: MI.getOperand(i: 0).getReg(), MRI, TRI: *TRI); |
| 5583 | OpdsMapping[2] = getVGPROpMapping(Reg: MI.getOperand(i: 2).getReg(), MRI, TRI: *TRI); |
| 5584 | break; |
| 5585 | case Intrinsic::amdgcn_ds_bvh_stack_rtn: |
| 5586 | case Intrinsic::amdgcn_ds_bvh_stack_push4_pop1_rtn: |
| 5587 | case Intrinsic::amdgcn_ds_bvh_stack_push8_pop1_rtn: |
| 5588 | case Intrinsic::amdgcn_ds_bvh_stack_push8_pop2_rtn: { |
| 5589 | OpdsMapping[0] = |
| 5590 | getVGPROpMapping(Reg: MI.getOperand(i: 0).getReg(), MRI, TRI: *TRI); // %vdst |
| 5591 | OpdsMapping[1] = |
| 5592 | getVGPROpMapping(Reg: MI.getOperand(i: 1).getReg(), MRI, TRI: *TRI); // %addr |
| 5593 | OpdsMapping[3] = |
| 5594 | getVGPROpMapping(Reg: MI.getOperand(i: 3).getReg(), MRI, TRI: *TRI); // %addr |
| 5595 | OpdsMapping[4] = |
| 5596 | getVGPROpMapping(Reg: MI.getOperand(i: 4).getReg(), MRI, TRI: *TRI); // %data0 |
| 5597 | OpdsMapping[5] = |
| 5598 | getVGPROpMapping(Reg: MI.getOperand(i: 5).getReg(), MRI, TRI: *TRI); // %data1 |
| 5599 | break; |
| 5600 | } |
| 5601 | case Intrinsic::amdgcn_s_sleep_var: |
| 5602 | OpdsMapping[1] = getSGPROpMapping(Reg: MI.getOperand(i: 1).getReg(), MRI, TRI: *TRI); |
| 5603 | break; |
| 5604 | case Intrinsic::amdgcn_s_barrier_join: |
| 5605 | case Intrinsic::amdgcn_s_wakeup_barrier: |
| 5606 | OpdsMapping[1] = getSGPROpMapping(Reg: MI.getOperand(i: 1).getReg(), MRI, TRI: *TRI); |
| 5607 | break; |
| 5608 | case Intrinsic::amdgcn_s_barrier_init: |
| 5609 | case Intrinsic::amdgcn_s_barrier_signal_var: |
| 5610 | OpdsMapping[1] = getSGPROpMapping(Reg: MI.getOperand(i: 1).getReg(), MRI, TRI: *TRI); |
| 5611 | OpdsMapping[2] = getSGPROpMapping(Reg: MI.getOperand(i: 2).getReg(), MRI, TRI: *TRI); |
| 5612 | break; |
| 5613 | case Intrinsic::amdgcn_s_barrier_signal_isfirst: { |
| 5614 | const unsigned ResultSize = 1; |
| 5615 | OpdsMapping[0] = |
| 5616 | AMDGPU::getValueMapping(BankID: AMDGPU::SGPRRegBankID, Size: ResultSize); |
| 5617 | break; |
| 5618 | } |
| 5619 | case Intrinsic::amdgcn_s_get_barrier_state: |
| 5620 | case Intrinsic::amdgcn_s_get_named_barrier_state: { |
| 5621 | OpdsMapping[0] = getSGPROpMapping(Reg: MI.getOperand(i: 0).getReg(), MRI, TRI: *TRI); |
| 5622 | OpdsMapping[2] = getSGPROpMapping(Reg: MI.getOperand(i: 2).getReg(), MRI, TRI: *TRI); |
| 5623 | break; |
| 5624 | } |
| 5625 | case Intrinsic::amdgcn_pops_exiting_wave_id: |
| 5626 | return getDefaultMappingSOP(MI); |
| 5627 | case Intrinsic::amdgcn_tensor_load_to_lds_d2: |
| 5628 | case Intrinsic::amdgcn_tensor_store_from_lds_d2: |
| 5629 | case Intrinsic::amdgcn_tensor_load_to_lds: |
| 5630 | case Intrinsic::amdgcn_tensor_store_from_lds: { |
| 5631 | // Lie and claim everything is legal, even though all operands need to be |
| 5632 | // SGPRs. applyMapping will have to deal with it using readfirstlane. |
| 5633 | for (unsigned I = 1; I < MI.getNumOperands(); ++I) { |
| 5634 | if (MI.getOperand(i: I).isReg()) { |
| 5635 | Register Reg = MI.getOperand(i: I).getReg(); |
| 5636 | auto OpBank = getRegBankID(Reg, MRI); |
| 5637 | unsigned Size = getSizeInBits(Reg, MRI, TRI: *TRI); |
| 5638 | OpdsMapping[I] = AMDGPU::getValueMapping(BankID: OpBank, Size); |
| 5639 | } |
| 5640 | } |
| 5641 | break; |
| 5642 | } |
| 5643 | case Intrinsic::amdgcn_s_prefetch_data: { |
| 5644 | OpdsMapping[1] = getSGPROpMapping(Reg: MI.getOperand(i: 1).getReg(), MRI, TRI: *TRI); |
| 5645 | OpdsMapping[2] = getSGPROpMapping(Reg: MI.getOperand(i: 2).getReg(), MRI, TRI: *TRI); |
| 5646 | break; |
| 5647 | } |
| 5648 | case Intrinsic::amdgcn_flat_prefetch: |
| 5649 | case Intrinsic::amdgcn_global_prefetch: |
| 5650 | return getDefaultMappingVOP(MI); |
| 5651 | default: |
| 5652 | return getInvalidInstructionMapping(); |
| 5653 | } |
| 5654 | break; |
| 5655 | } |
| 5656 | case AMDGPU::G_SELECT: { |
| 5657 | unsigned Size = MRI.getType(Reg: MI.getOperand(i: 0).getReg()).getSizeInBits(); |
| 5658 | unsigned Op2Bank = getRegBankID(Reg: MI.getOperand(i: 2).getReg(), MRI, |
| 5659 | Default: AMDGPU::SGPRRegBankID); |
| 5660 | unsigned Op3Bank = getRegBankID(Reg: MI.getOperand(i: 3).getReg(), MRI, |
| 5661 | Default: AMDGPU::SGPRRegBankID); |
| 5662 | bool SGPRSrcs = Op2Bank == AMDGPU::SGPRRegBankID && |
| 5663 | Op3Bank == AMDGPU::SGPRRegBankID; |
| 5664 | |
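|      | // If both value operands are already scalar, this can be a scalar select with |
|      | // an SCC/SGPR condition; otherwise the condition must be a VCC mask and the |
|      | // result is produced in VGPRs. |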
| 5665 | unsigned CondBankDefault = SGPRSrcs ? |
| 5666 | AMDGPU::SGPRRegBankID : AMDGPU::VCCRegBankID; |
| 5667 | unsigned CondBank = getRegBankID(Reg: MI.getOperand(i: 1).getReg(), MRI, |
| 5668 | Default: CondBankDefault); |
| 5669 | if (CondBank == AMDGPU::SGPRRegBankID) |
| 5670 | CondBank = SGPRSrcs ? AMDGPU::SGPRRegBankID : AMDGPU::VCCRegBankID; |
| 5671 | else if (CondBank == AMDGPU::VGPRRegBankID) |
| 5672 | CondBank = AMDGPU::VCCRegBankID; |
| 5673 | |
| 5674 | unsigned Bank = SGPRSrcs && CondBank == AMDGPU::SGPRRegBankID ? |
| 5675 | AMDGPU::SGPRRegBankID : AMDGPU::VGPRRegBankID; |
| 5676 | |
| 5677 | assert(CondBank == AMDGPU::VCCRegBankID || CondBank == AMDGPU::SGPRRegBankID); |
| 5678 | |
| 5679 | // TODO: Should report 32-bit for scalar condition type. |
| 5680 | if (Size == 64) { |
| 5681 | OpdsMapping[0] = AMDGPU::getValueMappingSGPR64Only(BankID: Bank, Size); |
| 5682 | OpdsMapping[1] = AMDGPU::getValueMapping(BankID: CondBank, Size: 1); |
| 5683 | OpdsMapping[2] = AMDGPU::getValueMappingSGPR64Only(BankID: Bank, Size); |
| 5684 | OpdsMapping[3] = AMDGPU::getValueMappingSGPR64Only(BankID: Bank, Size); |
| 5685 | } else { |
| 5686 | OpdsMapping[0] = AMDGPU::getValueMapping(BankID: Bank, Size); |
| 5687 | OpdsMapping[1] = AMDGPU::getValueMapping(BankID: CondBank, Size: 1); |
| 5688 | OpdsMapping[2] = AMDGPU::getValueMapping(BankID: Bank, Size); |
| 5689 | OpdsMapping[3] = AMDGPU::getValueMapping(BankID: Bank, Size); |
| 5690 | } |
| 5691 | |
| 5692 | break; |
| 5693 | } |
| 5694 | |
| 5695 | case AMDGPU::G_SI_CALL: { |
| 5696 | OpdsMapping[0] = AMDGPU::getValueMapping(BankID: AMDGPU::SGPRRegBankID, Size: 64); |
| 5697 | // Lie and claim everything is legal, even though some need to be |
| 5698 | // SGPRs. applyMapping will have to deal with it as a waterfall loop. |
| 5699 | OpdsMapping[1] = getSGPROpMapping(Reg: MI.getOperand(i: 1).getReg(), MRI, TRI: *TRI); |
| 5700 | |
| 5701 | // Allow anything for implicit arguments |
| 5702 | for (unsigned I = 4; I < MI.getNumOperands(); ++I) { |
| 5703 | if (MI.getOperand(i: I).isReg()) { |
| 5704 | Register Reg = MI.getOperand(i: I).getReg(); |
| 5705 | auto OpBank = getRegBankID(Reg, MRI); |
| 5706 | unsigned Size = getSizeInBits(Reg, MRI, TRI: *TRI); |
| 5707 | OpdsMapping[I] = AMDGPU::getValueMapping(BankID: OpBank, Size); |
| 5708 | } |
| 5709 | } |
| 5710 | break; |
| 5711 | } |
| 5712 | case AMDGPU::G_LOAD: |
| 5713 | case AMDGPU::G_ZEXTLOAD: |
| 5714 | case AMDGPU::G_SEXTLOAD: |
| 5715 | return getInstrMappingForLoad(MI); |
| 5716 | |
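|      | // Atomic data operands and results must be VGPRs; the pointer keeps whatever |
|      | // bank it already has. |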
| 5717 | case AMDGPU::G_ATOMICRMW_XCHG: |
| 5718 | case AMDGPU::G_ATOMICRMW_ADD: |
| 5719 | case AMDGPU::G_ATOMICRMW_SUB: |
| 5720 | case AMDGPU::G_ATOMICRMW_AND: |
| 5721 | case AMDGPU::G_ATOMICRMW_OR: |
| 5722 | case AMDGPU::G_ATOMICRMW_XOR: |
| 5723 | case AMDGPU::G_ATOMICRMW_MAX: |
| 5724 | case AMDGPU::G_ATOMICRMW_MIN: |
| 5725 | case AMDGPU::G_ATOMICRMW_UMAX: |
| 5726 | case AMDGPU::G_ATOMICRMW_UMIN: |
| 5727 | case AMDGPU::G_ATOMICRMW_FADD: |
| 5728 | case AMDGPU::G_ATOMICRMW_FMIN: |
| 5729 | case AMDGPU::G_ATOMICRMW_FMAX: |
| 5730 | case AMDGPU::G_ATOMICRMW_UINC_WRAP: |
| 5731 | case AMDGPU::G_ATOMICRMW_UDEC_WRAP: |
| 5732 | case AMDGPU::G_ATOMICRMW_USUB_COND: |
| 5733 | case AMDGPU::G_ATOMICRMW_USUB_SAT: |
| 5734 | case AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG: { |
| 5735 | OpdsMapping[0] = getVGPROpMapping(Reg: MI.getOperand(i: 0).getReg(), MRI, TRI: *TRI); |
| 5736 | OpdsMapping[1] = getValueMappingForPtr(MRI, PtrReg: MI.getOperand(i: 1).getReg()); |
| 5737 | OpdsMapping[2] = getVGPROpMapping(Reg: MI.getOperand(i: 2).getReg(), MRI, TRI: *TRI); |
| 5738 | break; |
| 5739 | } |
| 5740 | case AMDGPU::G_ATOMIC_CMPXCHG: { |
| 5741 | OpdsMapping[0] = getVGPROpMapping(Reg: MI.getOperand(i: 0).getReg(), MRI, TRI: *TRI); |
| 5742 | OpdsMapping[1] = getValueMappingForPtr(MRI, PtrReg: MI.getOperand(i: 1).getReg()); |
| 5743 | OpdsMapping[2] = getVGPROpMapping(Reg: MI.getOperand(i: 2).getReg(), MRI, TRI: *TRI); |
| 5744 | OpdsMapping[3] = getVGPROpMapping(Reg: MI.getOperand(i: 3).getReg(), MRI, TRI: *TRI); |
| 5745 | break; |
| 5746 | } |
| 5747 | case AMDGPU::G_BRCOND: { |
| 5748 | unsigned Bank = getRegBankID(Reg: MI.getOperand(i: 0).getReg(), MRI, |
| 5749 | Default: AMDGPU::SGPRRegBankID); |
| 5750 | assert(MRI.getType(MI.getOperand(0).getReg()).getSizeInBits() == 1); |
| 5751 | if (Bank != AMDGPU::SGPRRegBankID) |
| 5752 | Bank = AMDGPU::VCCRegBankID; |
| 5753 | |
| 5754 | OpdsMapping[0] = AMDGPU::getValueMapping(BankID: Bank, Size: 1); |
| 5755 | break; |
| 5756 | } |
| 5757 | case AMDGPU::G_INTRINSIC_FPTRUNC_ROUND: |
| 5758 | return getDefaultMappingVOP(MI); |
| 5759 | case AMDGPU::G_PREFETCH: |
| 5760 | OpdsMapping[0] = getSGPROpMapping(Reg: MI.getOperand(i: 0).getReg(), MRI, TRI: *TRI); |
| 5761 | break; |
| 5762 | case AMDGPU::G_AMDGPU_WHOLE_WAVE_FUNC_SETUP: |
| 5763 | case AMDGPU::G_AMDGPU_WHOLE_WAVE_FUNC_RETURN: |
| 5764 | OpdsMapping[0] = AMDGPU::getValueMapping(BankID: AMDGPU::VCCRegBankID, Size: 1); |
| 5765 | break; |
| 5766 | } |
| 5767 | |
| 5768 | return getInstructionMapping(/*ID*/1, /*Cost*/1, |
| 5769 | OperandsMapping: getOperandsMapping(OpdsMapping), |
| 5770 | NumOperands: MI.getNumOperands()); |
| 5771 | } |
| 5772 | |