1//===- AMDGPURegisterBankInfo.cpp -------------------------------*- C++ -*-==//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8/// \file
9/// This file implements the targeting of the RegisterBankInfo class for
10/// AMDGPU.
11///
12/// \par
13///
14/// AMDGPU has unique register bank constraints that require special high level
15/// strategies to deal with. There are two main true physical register banks
16/// VGPR (vector), and SGPR (scalar). Additionally the VCC register bank is a
17/// sort of pseudo-register bank needed to represent SGPRs used in a vector
18/// boolean context. There is also the AGPR bank, which is a special purpose
19/// physical register bank present on some subtargets.
20///
21/// Copying from VGPR to SGPR is generally illegal, unless the value is known to
22/// be uniform. It is generally not valid to legalize operands by inserting
23/// copies as on other targets. Operations which require uniform, SGPR operands
24/// generally require scalarization by repeatedly executing the instruction,
25/// activating each set of lanes using a unique set of input values. This is
26/// referred to as a waterfall loop.
27///
28/// \par Booleans
29///
/// Booleans (s1 values) require special consideration. A vector compare result
31/// is naturally a bitmask with one bit per lane, in a 32 or 64-bit
32/// register. These are represented with the VCC bank. During selection, we need
33/// to be able to unambiguously go back from a register class to a register
34/// bank. To distinguish whether an SGPR should use the SGPR or VCC register
35/// bank, we need to know the use context type. An SGPR s1 value always means a
36/// VCC bank value, otherwise it will be the SGPR bank. A scalar compare sets
37/// SCC, which is a 1-bit unaddressable register. This will need to be copied to
38/// a 32-bit virtual register. Taken together, this means we need to adjust the
39/// type of boolean operations to be regbank legal. All SALU booleans need to be
40/// widened to 32-bits, and all VALU booleans need to be s1 values.
41///
42/// A noteworthy exception to the s1-means-vcc rule is for legalization artifact
43/// casts. G_TRUNC s1 results, and G_SEXT/G_ZEXT/G_ANYEXT sources are never vcc
44/// bank. A non-boolean source (such as a truncate from a 1-bit load from
45/// memory) will require a copy to the VCC bank which will require clearing the
46/// high bits and inserting a compare.
47///
48/// \par Constant bus restriction
49///
/// VALU instructions have a limitation known as the constant bus
/// restriction. Most VALU instructions can use SGPR operands, but may read at
/// most 1 SGPR or constant literal value (this was raised to 2 in gfx10 for
/// most instructions). This is one unique SGPR, so the same SGPR may be used
/// for multiple operands. From a register bank perspective, any combination of
/// operands should be legal as an SGPR, but this is contextually dependent on
/// the SGPR operands all being the same register. It is therefore optimal to
/// choose the SGPR with the most uses to minimize the number of copies.
58///
59/// We avoid trying to solve this problem in RegBankSelect. Any VALU G_*
60/// operation should have its source operands all mapped to VGPRs (except for
/// VCC), inserting copies from any SGPR operands. This is the most trivial
/// legal mapping. Anything beyond the simplest 1:1 instruction selection would
/// be too complicated to solve here. Every optimization pattern or instruction
/// selected to multiple outputs would have to enforce this rule, and there
/// would be additional complexity in tracking this rule for every G_*
/// operation. By forcing all inputs to VGPRs, it also simplifies the task of
/// picking the optimal operand combination from a post-isel optimization pass.
68///
69//===----------------------------------------------------------------------===//
70
71#include "AMDGPURegisterBankInfo.h"
72
73#include "AMDGPU.h"
74#include "AMDGPUGlobalISelUtils.h"
75#include "AMDGPUInstrInfo.h"
76#include "AMDGPULaneMaskUtils.h"
77#include "GCNSubtarget.h"
78#include "SIMachineFunctionInfo.h"
79#include "SIRegisterInfo.h"
80#include "llvm/CodeGen/GlobalISel/GenericMachineInstrs.h"
81#include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
82#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
83#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
84#include "llvm/CodeGen/RegisterBank.h"
85#include "llvm/IR/IntrinsicsAMDGPU.h"
86
87#define GET_TARGET_REGBANK_IMPL
88#include "AMDGPUGenRegisterBank.inc"
89
90// This file will be TableGen'ed at some point.
91#include "AMDGPUGenRegisterBankInfo.def"
92
93using namespace llvm;
94using namespace MIPatternMatch;
95
96namespace {
97
98// Observer to apply a register bank to new registers created by LegalizerHelper.
99class ApplyRegBankMapping final : public GISelChangeObserver {
100private:
101 MachineIRBuilder &B;
102 const AMDGPURegisterBankInfo &RBI;
103 MachineRegisterInfo &MRI;
104 const RegisterBank *NewBank;
105 SmallVector<MachineInstr *, 4> NewInsts;
106
107public:
108 ApplyRegBankMapping(MachineIRBuilder &B, const AMDGPURegisterBankInfo &RBI_,
109 MachineRegisterInfo &MRI_, const RegisterBank *RB)
110 : B(B), RBI(RBI_), MRI(MRI_), NewBank(RB) {
111 assert(!B.isObservingChanges());
112 B.setChangeObserver(*this);
113 }
114
115 ~ApplyRegBankMapping() override {
116 for (MachineInstr *MI : NewInsts)
117 applyBank(MI&: *MI);
118
119 B.stopObservingChanges();
120 }
121
122 /// Set any registers that don't have a set register class or bank to SALU.
123 void applyBank(MachineInstr &MI) {
124 const unsigned Opc = MI.getOpcode();
125 if (Opc == AMDGPU::G_ANYEXT || Opc == AMDGPU::G_ZEXT ||
126 Opc == AMDGPU::G_SEXT) {
127 // LegalizerHelper wants to use the basic legalization artifacts when
128 // widening etc. We don't handle selection with vcc in artifact sources,
129 // so we need to use a select instead to handle these properly.
130 Register DstReg = MI.getOperand(i: 0).getReg();
131 Register SrcReg = MI.getOperand(i: 1).getReg();
132 const RegisterBank *SrcBank = RBI.getRegBank(Reg: SrcReg, MRI, TRI: *RBI.TRI);
133 if (SrcBank == &AMDGPU::VCCRegBank) {
134 const LLT S32 = LLT::scalar(SizeInBits: 32);
135 assert(MRI.getType(SrcReg) == LLT::scalar(1));
136 assert(MRI.getType(DstReg) == S32);
137 assert(NewBank == &AMDGPU::VGPRRegBank);
138
139 // Replace the extension with a select, which really uses the boolean
140 // source.
141 B.setInsertPt(MBB&: *MI.getParent(), II: MI);
142
143 auto True = B.buildConstant(Res: S32, Val: Opc == AMDGPU::G_SEXT ? -1 : 1);
144 auto False = B.buildConstant(Res: S32, Val: 0);
145 B.buildSelect(Res: DstReg, Tst: SrcReg, Op0: True, Op1: False);
146 MRI.setRegBank(Reg: True.getReg(Idx: 0), RegBank: *NewBank);
147 MRI.setRegBank(Reg: False.getReg(Idx: 0), RegBank: *NewBank);
148 MI.eraseFromParent();
149 }
150
151 assert(!MRI.getRegClassOrRegBank(DstReg));
152 MRI.setRegBank(Reg: DstReg, RegBank: *NewBank);
153 return;
154 }
155
156#ifndef NDEBUG
157 if (Opc == AMDGPU::G_TRUNC) {
158 Register DstReg = MI.getOperand(0).getReg();
159 const RegisterBank *DstBank = RBI.getRegBank(DstReg, MRI, *RBI.TRI);
160 assert(DstBank != &AMDGPU::VCCRegBank);
161 }
162#endif
163
164 for (MachineOperand &Op : MI.operands()) {
165 if (!Op.isReg())
166 continue;
167
168 // We may see physical registers if building a real MI
169 Register Reg = Op.getReg();
170 if (Reg.isPhysical() || MRI.getRegClassOrRegBank(Reg))
171 continue;
172
173 const RegisterBank *RB = NewBank;
174 if (MRI.getType(Reg) == LLT::scalar(SizeInBits: 1)) {
175 assert(NewBank == &AMDGPU::VGPRRegBank &&
176 "s1 operands should only be used for vector bools");
177 assert((MI.getOpcode() != AMDGPU::G_TRUNC &&
178 MI.getOpcode() != AMDGPU::G_ANYEXT) &&
179 "not expecting legalization artifacts here");
180 RB = &AMDGPU::VCCRegBank;
181 }
182
183 MRI.setRegBank(Reg, RegBank: *RB);
184 }
185 }
186
187 void erasingInstr(MachineInstr &MI) override {}
188
189 void createdInstr(MachineInstr &MI) override {
190 // At this point, the instruction was just inserted and has no operands.
191 NewInsts.push_back(Elt: &MI);
192 }
193
194 void changingInstr(MachineInstr &MI) override {}
195 void changedInstr(MachineInstr &MI) override {
196 // FIXME: In principle we should probably add the instruction to NewInsts,
197 // but the way the LegalizerHelper uses the observer, we will always see the
198 // registers we need to set the regbank on also referenced in a new
199 // instruction.
200 }
201};
202
203} // anonymous namespace
204
205AMDGPURegisterBankInfo::AMDGPURegisterBankInfo(const GCNSubtarget &ST)
206 : Subtarget(ST), TRI(Subtarget.getRegisterInfo()),
207 TII(Subtarget.getInstrInfo()) {
208
209 // HACK: Until this is fully tablegen'd.
210 static llvm::once_flag InitializeRegisterBankFlag;
211
212 static auto InitializeRegisterBankOnce = [this]() {
213 assert(&getRegBank(AMDGPU::SGPRRegBankID) == &AMDGPU::SGPRRegBank &&
214 &getRegBank(AMDGPU::VGPRRegBankID) == &AMDGPU::VGPRRegBank &&
215 &getRegBank(AMDGPU::AGPRRegBankID) == &AMDGPU::AGPRRegBank);
216 (void)this;
217 };
218
219 llvm::call_once(flag&: InitializeRegisterBankFlag, F&: InitializeRegisterBankOnce);
220}
221
222static bool isVectorRegisterBank(const RegisterBank &Bank) {
223 unsigned BankID = Bank.getID();
224 return BankID == AMDGPU::VGPRRegBankID || BankID == AMDGPU::AGPRRegBankID;
225}
226
227bool AMDGPURegisterBankInfo::isDivergentRegBank(const RegisterBank *RB) const {
228 return RB != &AMDGPU::SGPRRegBank;
229}
230
231unsigned AMDGPURegisterBankInfo::copyCost(const RegisterBank &Dst,
232 const RegisterBank &Src,
233 TypeSize Size) const {
234 // TODO: Should there be a UniformVGPRRegBank which can use readfirstlane?
235 if (Dst.getID() == AMDGPU::SGPRRegBankID &&
236 (isVectorRegisterBank(Bank: Src) || Src.getID() == AMDGPU::VCCRegBankID)) {
237 return std::numeric_limits<unsigned>::max();
238 }
239
240 // Bool values are tricky, because the meaning is based on context. The SCC
241 // and VCC banks are for the natural scalar and vector conditions produced by
242 // a compare.
243 //
244 // Legalization doesn't know about the necessary context, so an s1 use may
245 // have been a truncate from an arbitrary value, in which case a copy (lowered
246 // as a compare with 0) needs to be inserted.
247 if (Size == 1 &&
248 (Dst.getID() == AMDGPU::SGPRRegBankID) &&
249 (isVectorRegisterBank(Bank: Src) ||
250 Src.getID() == AMDGPU::SGPRRegBankID ||
251 Src.getID() == AMDGPU::VCCRegBankID))
252 return std::numeric_limits<unsigned>::max();
253
254 // There is no direct copy between AGPRs.
255 if (Dst.getID() == AMDGPU::AGPRRegBankID &&
256 Src.getID() == AMDGPU::AGPRRegBankID)
257 return 4;
258
259 return RegisterBankInfo::copyCost(A: Dst, B: Src, Size);
260}
261
262unsigned AMDGPURegisterBankInfo::getBreakDownCost(
263 const ValueMapping &ValMapping,
264 const RegisterBank *CurBank) const {
265 // Check if this is a breakdown for G_LOAD to move the pointer from SGPR to
266 // VGPR.
267 // FIXME: Is there a better way to do this?
268 if (ValMapping.NumBreakDowns >= 2 || ValMapping.BreakDown[0].Length >= 64)
269 return 10; // This is expensive.
270
271 assert(ValMapping.NumBreakDowns == 2 &&
272 ValMapping.BreakDown[0].Length == 32 &&
273 ValMapping.BreakDown[0].StartIdx == 0 &&
274 ValMapping.BreakDown[1].Length == 32 &&
275 ValMapping.BreakDown[1].StartIdx == 32 &&
276 ValMapping.BreakDown[0].RegBank == ValMapping.BreakDown[1].RegBank);
277
278 // 32-bit extract of a 64-bit value is just access of a subregister, so free.
279 // TODO: Cost of 0 hits assert, though it's not clear it's what we really
280 // want.
281
282 // TODO: 32-bit insert to a 64-bit SGPR may incur a non-free copy due to SGPR
283 // alignment restrictions, but this probably isn't important.
284 return 1;
285}
286
287const RegisterBank &
288AMDGPURegisterBankInfo::getRegBankFromRegClass(const TargetRegisterClass &RC,
289 LLT Ty) const {
290 // We promote real scalar booleans to SReg_32. Any SGPR using s1 is really a
291 // VCC-like use.
292 if (TRI->isSGPRClass(RC: &RC)) {
293 // FIXME: This probably came from a copy from a physical register, which
294 // should be inferable from the copied to-type. We don't have many boolean
295 // physical register constraints so just assume a normal SGPR for now.
296 if (!Ty.isValid())
297 return AMDGPU::SGPRRegBank;
298
299 return Ty == LLT::scalar(SizeInBits: 1) ? AMDGPU::VCCRegBank : AMDGPU::SGPRRegBank;
300 }
301
302 return TRI->isAGPRClass(RC: &RC) ? AMDGPU::AGPRRegBank : AMDGPU::VGPRRegBank;
303}
304
305template <unsigned NumOps>
306RegisterBankInfo::InstructionMappings
307AMDGPURegisterBankInfo::addMappingFromTable(
308 const MachineInstr &MI, const MachineRegisterInfo &MRI,
309 const std::array<unsigned, NumOps> RegSrcOpIdx,
310 ArrayRef<OpRegBankEntry<NumOps>> Table) const {
311
312 InstructionMappings AltMappings;
313
314 SmallVector<const ValueMapping *, 10> Operands(MI.getNumOperands());
315
316 unsigned Sizes[NumOps];
317 for (unsigned I = 0; I < NumOps; ++I) {
318 Register Reg = MI.getOperand(RegSrcOpIdx[I]).getReg();
319 Sizes[I] = getSizeInBits(Reg, MRI, TRI: *TRI);
320 }
321
322 for (unsigned I = 0, E = MI.getNumExplicitDefs(); I != E; ++I) {
323 unsigned SizeI = getSizeInBits(Reg: MI.getOperand(i: I).getReg(), MRI, TRI: *TRI);
324 Operands[I] = AMDGPU::getValueMapping(BankID: AMDGPU::VGPRRegBankID, Size: SizeI);
325 }
326
327 // getInstrMapping's default mapping uses ID 1, so start at 2.
328 unsigned MappingID = 2;
329 for (const auto &Entry : Table) {
330 for (unsigned I = 0; I < NumOps; ++I) {
331 int OpIdx = RegSrcOpIdx[I];
332 Operands[OpIdx] = AMDGPU::getValueMapping(BankID: Entry.RegBanks[I], Size: Sizes[I]);
333 }
334
335 AltMappings.push_back(Elt: &getInstructionMapping(ID: MappingID++, Cost: Entry.Cost,
336 OperandsMapping: getOperandsMapping(OpdsMapping: Operands),
337 NumOperands: Operands.size()));
338 }
339
340 return AltMappings;
341}
342
343RegisterBankInfo::InstructionMappings
344AMDGPURegisterBankInfo::getInstrAlternativeMappingsIntrinsic(
345 const MachineInstr &MI, const MachineRegisterInfo &MRI) const {
346 switch (cast<GIntrinsic>(Val: MI).getIntrinsicID()) {
347 case Intrinsic::amdgcn_readlane: {
348 static const OpRegBankEntry<3> Table[2] = {
349 // Perfectly legal.
350 { .RegBanks: { AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID }, .Cost: 1 },
351
352 // Need a readfirstlane for the index.
353 { .RegBanks: { AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID }, .Cost: 2 }
354 };
355
356 const std::array<unsigned, 3> RegSrcOpIdx = { ._M_elems: { 0, 2, 3 } };
357 return addMappingFromTable<3>(MI, MRI, RegSrcOpIdx, Table);
358 }
359 case Intrinsic::amdgcn_writelane: {
360 static const OpRegBankEntry<4> Table[4] = {
361 // Perfectly legal.
362 { .RegBanks: { AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID, AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID }, .Cost: 1 },
363
364 // Need readfirstlane of first op
365 { .RegBanks: { AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID }, .Cost: 2 },
366
367 // Need readfirstlane of second op
368 { .RegBanks: { AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID }, .Cost: 2 },
369
370 // Need readfirstlane of both ops
371 { .RegBanks: { AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID }, .Cost: 3 }
372 };
373
374 // rsrc, voffset, offset
375 const std::array<unsigned, 4> RegSrcOpIdx = { ._M_elems: { 0, 2, 3, 4 } };
376 return addMappingFromTable<4>(MI, MRI, RegSrcOpIdx, Table);
377 }
378 default:
379 return RegisterBankInfo::getInstrAlternativeMappings(MI);
380 }
381}
382
383RegisterBankInfo::InstructionMappings
384AMDGPURegisterBankInfo::getInstrAlternativeMappingsIntrinsicWSideEffects(
385 const MachineInstr &MI, const MachineRegisterInfo &MRI) const {
386
387 switch (cast<GIntrinsic>(Val: MI).getIntrinsicID()) {
388 case Intrinsic::amdgcn_s_buffer_load: {
389 static const OpRegBankEntry<2> Table[4] = {
390 // Perfectly legal.
391 { .RegBanks: { AMDGPU::SGPRRegBankID, AMDGPU::SGPRRegBankID }, .Cost: 1 },
392
393 // Only need 1 register in loop
394 { .RegBanks: { AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID }, .Cost: 300 },
395
396 // Have to waterfall the resource.
397 { .RegBanks: { AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID }, .Cost: 1000 },
398
399 // Have to waterfall the resource, and the offset.
400 { .RegBanks: { AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID }, .Cost: 1500 }
401 };
402
403 // rsrc, offset
404 const std::array<unsigned, 2> RegSrcOpIdx = { ._M_elems: { 2, 3 } };
405 return addMappingFromTable<2>(MI, MRI, RegSrcOpIdx, Table);
406 }
407 case Intrinsic::amdgcn_ds_ordered_add:
408 case Intrinsic::amdgcn_ds_ordered_swap: {
409 // VGPR = M0, VGPR
410 static const OpRegBankEntry<3> Table[2] = {
411 // Perfectly legal.
412 { .RegBanks: { AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID }, .Cost: 1 },
413
414 // Need a readfirstlane for m0
415 { .RegBanks: { AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID }, .Cost: 2 }
416 };
417
418 const std::array<unsigned, 3> RegSrcOpIdx = { ._M_elems: { 0, 2, 3 } };
419 return addMappingFromTable<3>(MI, MRI, RegSrcOpIdx, Table);
420 }
421 case Intrinsic::amdgcn_s_sendmsg:
422 case Intrinsic::amdgcn_s_sendmsghalt: {
423 // FIXME: Should have no register for immediate
424 static const OpRegBankEntry<1> Table[2] = {
425 // Perfectly legal.
426 { .RegBanks: { AMDGPU::SGPRRegBankID }, .Cost: 1 },
427
428 // Need readlane
429 { .RegBanks: { AMDGPU::VGPRRegBankID }, .Cost: 3 }
430 };
431
432 const std::array<unsigned, 1> RegSrcOpIdx = { ._M_elems: { 2 } };
433 return addMappingFromTable<1>(MI, MRI, RegSrcOpIdx, Table);
434 }
435 default:
436 return RegisterBankInfo::getInstrAlternativeMappings(MI);
437 }
438}
439
440// FIXME: Returns uniform if there's no source value information. This is
441// probably wrong.
442bool AMDGPURegisterBankInfo::isScalarLoadLegal(const MachineInstr &MI) const {
443 if (!MI.hasOneMemOperand())
444 return false;
445
446 const MachineMemOperand *MMO = *MI.memoperands_begin();
447 const unsigned AS = MMO->getAddrSpace();
448 const bool IsConst = AS == AMDGPUAS::CONSTANT_ADDRESS ||
449 AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT;
450 const unsigned MemSize = 8 * MMO->getSize().getValue();
451
452 // Require 4-byte alignment.
453 return (MMO->getAlign() >= Align(4) ||
454 (Subtarget.hasScalarSubwordLoads() &&
455 ((MemSize == 16 && MMO->getAlign() >= Align(2)) ||
456 (MemSize == 8 && MMO->getAlign() >= Align(1))))) &&
457 // Can't do a scalar atomic load.
458 !MMO->isAtomic() &&
459 // Don't use scalar loads for volatile accesses to non-constant address
460 // spaces.
461 (IsConst || !MMO->isVolatile()) &&
462 // Memory must be known constant, or not written before this load.
463 (IsConst || MMO->isInvariant() || (MMO->getFlags() & MONoClobber)) &&
464 AMDGPU::isUniformMMO(MMO);
465}
466
467RegisterBankInfo::InstructionMappings
468AMDGPURegisterBankInfo::getInstrAlternativeMappings(
469 const MachineInstr &MI) const {
470
471 const MachineFunction &MF = *MI.getMF();
472 const MachineRegisterInfo &MRI = MF.getRegInfo();
473
474
475 InstructionMappings AltMappings;
476 switch (MI.getOpcode()) {
477 case TargetOpcode::G_CONSTANT:
478 case TargetOpcode::G_IMPLICIT_DEF: {
479 unsigned Size = getSizeInBits(Reg: MI.getOperand(i: 0).getReg(), MRI, TRI: *TRI);
480 if (Size == 1) {
481 static const OpRegBankEntry<1> Table[3] = {
482 { .RegBanks: { AMDGPU::VGPRRegBankID }, .Cost: 1 },
483 { .RegBanks: { AMDGPU::SGPRRegBankID }, .Cost: 1 },
484 { .RegBanks: { AMDGPU::VCCRegBankID }, .Cost: 1 }
485 };
486
487 return addMappingFromTable<1>(MI, MRI, RegSrcOpIdx: {._M_elems: { 0 }}, Table);
488 }
489
490 [[fallthrough]];
491 }
492 case TargetOpcode::G_FCONSTANT:
493 case TargetOpcode::G_FRAME_INDEX:
494 case TargetOpcode::G_GLOBAL_VALUE: {
495 static const OpRegBankEntry<1> Table[2] = {
496 { .RegBanks: { AMDGPU::VGPRRegBankID }, .Cost: 1 },
497 { .RegBanks: { AMDGPU::SGPRRegBankID }, .Cost: 1 }
498 };
499
500 return addMappingFromTable<1>(MI, MRI, RegSrcOpIdx: {._M_elems: { 0 }}, Table);
501 }
502 case TargetOpcode::G_AND:
503 case TargetOpcode::G_OR:
504 case TargetOpcode::G_XOR: {
505 unsigned Size = getSizeInBits(Reg: MI.getOperand(i: 0).getReg(), MRI, TRI: *TRI);
506
507 if (Size == 1) {
508 // s_{and|or|xor}_b32 set scc when the result of the 32-bit op is not 0.
509 const InstructionMapping &SCCMapping = getInstructionMapping(
510 ID: 1, Cost: 1, OperandsMapping: getOperandsMapping(
511 OpdsMapping: {AMDGPU::getValueMapping(BankID: AMDGPU::SGPRRegBankID, Size: 32),
512 AMDGPU::getValueMapping(BankID: AMDGPU::SGPRRegBankID, Size: 32),
513 AMDGPU::getValueMapping(BankID: AMDGPU::SGPRRegBankID, Size: 32)}),
514 NumOperands: 3); // Num Operands
515 AltMappings.push_back(Elt: &SCCMapping);
516
517 const InstructionMapping &VCCMapping0 = getInstructionMapping(
518 ID: 2, Cost: 1, OperandsMapping: getOperandsMapping(
519 OpdsMapping: {AMDGPU::getValueMapping(BankID: AMDGPU::VCCRegBankID, Size),
520 AMDGPU::getValueMapping(BankID: AMDGPU::VCCRegBankID, Size),
521 AMDGPU::getValueMapping(BankID: AMDGPU::VCCRegBankID, Size)}),
522 NumOperands: 3); // Num Operands
523 AltMappings.push_back(Elt: &VCCMapping0);
524 return AltMappings;
525 }
526
527 if (Size != 64)
528 break;
529
530 const InstructionMapping &SSMapping = getInstructionMapping(
531 ID: 1, Cost: 1, OperandsMapping: getOperandsMapping(
532 OpdsMapping: {AMDGPU::getValueMapping(BankID: AMDGPU::SGPRRegBankID, Size),
533 AMDGPU::getValueMapping(BankID: AMDGPU::SGPRRegBankID, Size),
534 AMDGPU::getValueMapping(BankID: AMDGPU::SGPRRegBankID, Size)}),
535 NumOperands: 3); // Num Operands
536 AltMappings.push_back(Elt: &SSMapping);
537
538 const InstructionMapping &VVMapping = getInstructionMapping(
539 ID: 2, Cost: 2, OperandsMapping: getOperandsMapping(
540 OpdsMapping: {AMDGPU::getValueMappingSGPR64Only(BankID: AMDGPU::VGPRRegBankID, Size),
541 AMDGPU::getValueMappingSGPR64Only(BankID: AMDGPU::VGPRRegBankID, Size),
542 AMDGPU::getValueMappingSGPR64Only(BankID: AMDGPU::VGPRRegBankID, Size)}),
543 NumOperands: 3); // Num Operands
544 AltMappings.push_back(Elt: &VVMapping);
545 break;
546 }
547 case TargetOpcode::G_LOAD:
548 case TargetOpcode::G_ZEXTLOAD:
549 case TargetOpcode::G_SEXTLOAD: {
550 unsigned Size = getSizeInBits(Reg: MI.getOperand(i: 0).getReg(), MRI, TRI: *TRI);
551 LLT PtrTy = MRI.getType(Reg: MI.getOperand(i: 1).getReg());
552 unsigned PtrSize = PtrTy.getSizeInBits();
553 unsigned AS = PtrTy.getAddressSpace();
554
555 if ((AS != AMDGPUAS::LOCAL_ADDRESS && AS != AMDGPUAS::REGION_ADDRESS &&
556 AS != AMDGPUAS::PRIVATE_ADDRESS) &&
557 isScalarLoadLegal(MI)) {
558 const InstructionMapping &SSMapping = getInstructionMapping(
559 ID: 1, Cost: 1, OperandsMapping: getOperandsMapping(
560 OpdsMapping: {AMDGPU::getValueMapping(BankID: AMDGPU::SGPRRegBankID, Size),
561 AMDGPU::getValueMapping(BankID: AMDGPU::SGPRRegBankID, Size: PtrSize)}),
562 NumOperands: 2); // Num Operands
563 AltMappings.push_back(Elt: &SSMapping);
564 }
565
566 const InstructionMapping &VVMapping = getInstructionMapping(
567 ID: 2, Cost: 1,
568 OperandsMapping: getOperandsMapping(
569 OpdsMapping: {AMDGPU::getValueMapping(BankID: AMDGPU::VGPRRegBankID, Size),
570 AMDGPU::getValueMapping(BankID: AMDGPU::VGPRRegBankID, Size: PtrSize)}),
571 NumOperands: 2); // Num Operands
572 AltMappings.push_back(Elt: &VVMapping);
573
574 // It may be possible to have a vgpr = load sgpr mapping here, because
575 // the mubuf instructions support this kind of load, but probably for only
576 // gfx7 and older. However, the addressing mode matching in the instruction
577 // selector should be able to do a better job of detecting and selecting
578 // these kinds of loads from the vgpr = load vgpr mapping.
579
580 return AltMappings;
581
582 }
583 case TargetOpcode::G_SELECT: {
584 unsigned Size = getSizeInBits(Reg: MI.getOperand(i: 0).getReg(), MRI, TRI: *TRI);
585 const InstructionMapping &SSMapping = getInstructionMapping(ID: 1, Cost: 1,
586 OperandsMapping: getOperandsMapping(OpdsMapping: {AMDGPU::getValueMapping(BankID: AMDGPU::SGPRRegBankID, Size),
587 AMDGPU::getValueMapping(BankID: AMDGPU::SGPRRegBankID, Size: 1),
588 AMDGPU::getValueMapping(BankID: AMDGPU::SGPRRegBankID, Size),
589 AMDGPU::getValueMapping(BankID: AMDGPU::SGPRRegBankID, Size)}),
590 NumOperands: 4); // Num Operands
591 AltMappings.push_back(Elt: &SSMapping);
592
593 const InstructionMapping &VVMapping = getInstructionMapping(ID: 2, Cost: 1,
594 OperandsMapping: getOperandsMapping(OpdsMapping: {AMDGPU::getValueMappingSGPR64Only(BankID: AMDGPU::VGPRRegBankID, Size),
595 AMDGPU::getValueMapping(BankID: AMDGPU::VCCRegBankID, Size: 1),
596 AMDGPU::getValueMappingSGPR64Only(BankID: AMDGPU::VGPRRegBankID, Size),
597 AMDGPU::getValueMappingSGPR64Only(BankID: AMDGPU::VGPRRegBankID, Size)}),
598 NumOperands: 4); // Num Operands
599 AltMappings.push_back(Elt: &VVMapping);
600
601 return AltMappings;
602 }
603 case TargetOpcode::G_UADDE:
604 case TargetOpcode::G_USUBE:
605 case TargetOpcode::G_SADDE:
606 case TargetOpcode::G_SSUBE: {
607 unsigned Size = getSizeInBits(Reg: MI.getOperand(i: 0).getReg(), MRI, TRI: *TRI);
608 const InstructionMapping &SSMapping = getInstructionMapping(ID: 1, Cost: 1,
609 OperandsMapping: getOperandsMapping(
610 OpdsMapping: {AMDGPU::getValueMapping(BankID: AMDGPU::SGPRRegBankID, Size),
611 AMDGPU::getValueMapping(BankID: AMDGPU::SGPRRegBankID, Size: 1),
612 AMDGPU::getValueMapping(BankID: AMDGPU::SGPRRegBankID, Size),
613 AMDGPU::getValueMapping(BankID: AMDGPU::SGPRRegBankID, Size),
614 AMDGPU::getValueMapping(BankID: AMDGPU::SGPRRegBankID, Size: 1)}),
615 NumOperands: 5); // Num Operands
616 AltMappings.push_back(Elt: &SSMapping);
617
618 const InstructionMapping &VVMapping = getInstructionMapping(ID: 2, Cost: 1,
619 OperandsMapping: getOperandsMapping(OpdsMapping: {AMDGPU::getValueMapping(BankID: AMDGPU::VGPRRegBankID, Size),
620 AMDGPU::getValueMapping(BankID: AMDGPU::VCCRegBankID, Size: 1),
621 AMDGPU::getValueMapping(BankID: AMDGPU::VGPRRegBankID, Size),
622 AMDGPU::getValueMapping(BankID: AMDGPU::VGPRRegBankID, Size),
623 AMDGPU::getValueMapping(BankID: AMDGPU::VCCRegBankID, Size: 1)}),
624 NumOperands: 5); // Num Operands
625 AltMappings.push_back(Elt: &VVMapping);
626 return AltMappings;
627 }
628 case AMDGPU::G_BRCOND: {
629 assert(MRI.getType(MI.getOperand(0).getReg()).getSizeInBits() == 1);
630
631 // TODO: Change type to 32 for scalar
632 const InstructionMapping &SMapping = getInstructionMapping(
633 ID: 1, Cost: 1, OperandsMapping: getOperandsMapping(
634 OpdsMapping: {AMDGPU::getValueMapping(BankID: AMDGPU::SGPRRegBankID, Size: 1), nullptr}),
635 NumOperands: 2); // Num Operands
636 AltMappings.push_back(Elt: &SMapping);
637
638 const InstructionMapping &VMapping = getInstructionMapping(
639 ID: 1, Cost: 1, OperandsMapping: getOperandsMapping(
640 OpdsMapping: {AMDGPU::getValueMapping(BankID: AMDGPU::VCCRegBankID, Size: 1), nullptr }),
641 NumOperands: 2); // Num Operands
642 AltMappings.push_back(Elt: &VMapping);
643 return AltMappings;
644 }
645 case AMDGPU::G_INTRINSIC:
646 case AMDGPU::G_INTRINSIC_CONVERGENT:
647 return getInstrAlternativeMappingsIntrinsic(MI, MRI);
648 case AMDGPU::G_INTRINSIC_W_SIDE_EFFECTS:
649 case AMDGPU::G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS:
650 return getInstrAlternativeMappingsIntrinsicWSideEffects(MI, MRI);
651 default:
652 break;
653 }
654 return RegisterBankInfo::getInstrAlternativeMappings(MI);
655}
656
657void AMDGPURegisterBankInfo::split64BitValueForMapping(
658 MachineIRBuilder &B,
659 SmallVector<Register, 2> &Regs,
660 LLT HalfTy,
661 Register Reg) const {
662 assert(HalfTy.getSizeInBits() == 32);
663 MachineRegisterInfo *MRI = B.getMRI();
664 Register LoLHS = MRI->createGenericVirtualRegister(Ty: HalfTy);
665 Register HiLHS = MRI->createGenericVirtualRegister(Ty: HalfTy);
666 const RegisterBank *Bank = getRegBank(Reg, MRI: *MRI, TRI: *TRI);
667 MRI->setRegBank(Reg: LoLHS, RegBank: *Bank);
668 MRI->setRegBank(Reg: HiLHS, RegBank: *Bank);
669
670 Regs.push_back(Elt: LoLHS);
671 Regs.push_back(Elt: HiLHS);
672
673 B.buildInstr(Opcode: AMDGPU::G_UNMERGE_VALUES)
674 .addDef(RegNo: LoLHS)
675 .addDef(RegNo: HiLHS)
676 .addUse(RegNo: Reg);
677}
678
679/// Replace the current type each register in \p Regs has with \p NewTy
680static void setRegsToType(MachineRegisterInfo &MRI, ArrayRef<Register> Regs,
681 LLT NewTy) {
682 for (Register Reg : Regs) {
683 assert(MRI.getType(Reg).getSizeInBits() == NewTy.getSizeInBits());
684 MRI.setType(VReg: Reg, Ty: NewTy);
685 }
686}
687
688static LLT getHalfSizedType(LLT Ty) {
689 if (Ty.isVector()) {
690 assert(Ty.getElementCount().isKnownMultipleOf(2));
691 return LLT::scalarOrVector(EC: Ty.getElementCount().divideCoefficientBy(RHS: 2),
692 ScalarTy: Ty.getElementType());
693 }
694
695 assert(Ty.getScalarSizeInBits() % 2 == 0);
696 return LLT::scalar(SizeInBits: Ty.getScalarSizeInBits() / 2);
697}
698
699// Build one or more V_READFIRSTLANE_B32 instructions to move the given vector
700// source value into a scalar register.
701Register AMDGPURegisterBankInfo::buildReadFirstLane(MachineIRBuilder &B,
702 MachineRegisterInfo &MRI,
703 Register Src) const {
704 LLT Ty = MRI.getType(Reg: Src);
705 const RegisterBank *Bank = getRegBank(Reg: Src, MRI, TRI: *TRI);
706
707 if (Bank == &AMDGPU::SGPRRegBank)
708 return Src;
709
710 unsigned Bits = Ty.getSizeInBits();
711 assert(Bits % 32 == 0);
712
713 if (Bank != &AMDGPU::VGPRRegBank) {
714 // We need to copy from AGPR to VGPR
715 Src = B.buildCopy(Res: Ty, Op: Src).getReg(Idx: 0);
716 MRI.setRegBank(Reg: Src, RegBank: AMDGPU::VGPRRegBank);
717 }
718
719 LLT S32 = LLT::scalar(SizeInBits: 32);
720 unsigned NumParts = Bits / 32;
721 SmallVector<Register, 8> SrcParts;
722 SmallVector<Register, 8> DstParts;
723
724 if (Bits == 32) {
725 SrcParts.push_back(Elt: Src);
726 } else {
727 auto Unmerge = B.buildUnmerge(Res: S32, Op: Src);
728 for (unsigned i = 0; i < NumParts; ++i)
729 SrcParts.push_back(Elt: Unmerge.getReg(Idx: i));
730 }
731
732 for (unsigned i = 0; i < NumParts; ++i) {
733 Register SrcPart = SrcParts[i];
734 Register DstPart = MRI.createVirtualRegister(RegClass: &AMDGPU::SReg_32_XM0RegClass);
735 MRI.setType(VReg: DstPart, Ty: NumParts == 1 ? Ty : S32);
736
737 const TargetRegisterClass *Constrained =
738 constrainGenericRegister(Reg: SrcPart, RC: AMDGPU::VGPR_32RegClass, MRI);
739 (void)Constrained;
740 assert(Constrained && "Failed to constrain readfirstlane src reg");
741
742 B.buildInstr(Opc: AMDGPU::V_READFIRSTLANE_B32, DstOps: {DstPart}, SrcOps: {SrcPart});
743
744 DstParts.push_back(Elt: DstPart);
745 }
746
747 if (Bits == 32)
748 return DstParts[0];
749
750 Register Dst = B.buildMergeLikeInstr(Res: Ty, Ops: DstParts).getReg(Idx: 0);
751 MRI.setRegBank(Reg: Dst, RegBank: AMDGPU::SGPRRegBank);
752 return Dst;
753}
754
755/// Legalize instruction \p MI where operands in \p OpIndices must be SGPRs. If
756/// any of the required SGPR operands are VGPRs, perform a waterfall loop to
757/// execute the instruction for each unique combination of values in all lanes
758/// in the wave. The block will be split such that rest of the instructions are
759/// moved to a new block.
760///
761/// Essentially performs this loop:
///
763/// Save Execution Mask
764/// For (Lane : Wavefront) {
765/// Enable Lane, Disable all other lanes
766/// SGPR = read SGPR value for current lane from VGPR
767/// VGPRResult[Lane] = use_op SGPR
768/// }
769/// Restore Execution Mask
770///
/// There is additional complexity in comparing values to identify the unique
/// values used.
773bool AMDGPURegisterBankInfo::executeInWaterfallLoop(
774 MachineIRBuilder &B, iterator_range<MachineBasicBlock::iterator> Range,
775 SmallSet<Register, 4> &SGPROperandRegs) const {
776 // Track use registers which have already been expanded with a readfirstlane
777 // sequence. This may have multiple uses if moving a sequence.
778 DenseMap<Register, Register> WaterfalledRegMap;
779
780 MachineBasicBlock &MBB = B.getMBB();
781 MachineFunction *MF = &B.getMF();
782
783 const TargetRegisterClass *WaveRC = TRI->getWaveMaskRegClass();
784 const AMDGPU::LaneMaskConstants &LMC =
785 AMDGPU::LaneMaskConstants::get(ST: Subtarget);
786
787#ifndef NDEBUG
788 const int OrigRangeSize = std::distance(Range.begin(), Range.end());
789#endif
790
791 MachineRegisterInfo &MRI = *B.getMRI();
792 Register SaveExecReg = MRI.createVirtualRegister(RegClass: WaveRC);
793 Register InitSaveExecReg = MRI.createVirtualRegister(RegClass: WaveRC);
794
795 // Don't bother using generic instructions/registers for the exec mask.
796 B.buildInstr(Opcode: TargetOpcode::IMPLICIT_DEF)
797 .addDef(RegNo: InitSaveExecReg);
798
799 Register PhiExec = MRI.createVirtualRegister(RegClass: WaveRC);
800 Register NewExec = MRI.createVirtualRegister(RegClass: WaveRC);
801
802 // To insert the loop we need to split the block. Move everything before this
803 // point to a new block, and insert a new empty block before this instruction.
804 MachineBasicBlock *LoopBB = MF->CreateMachineBasicBlock();
805 MachineBasicBlock *BodyBB = MF->CreateMachineBasicBlock();
806 MachineBasicBlock *RemainderBB = MF->CreateMachineBasicBlock();
807 MachineBasicBlock *RestoreExecBB = MF->CreateMachineBasicBlock();
808 MachineFunction::iterator MBBI(MBB);
809 ++MBBI;
810 MF->insert(MBBI, MBB: LoopBB);
811 MF->insert(MBBI, MBB: BodyBB);
812 MF->insert(MBBI, MBB: RestoreExecBB);
813 MF->insert(MBBI, MBB: RemainderBB);
814
815 LoopBB->addSuccessor(Succ: BodyBB);
816 BodyBB->addSuccessor(Succ: RestoreExecBB);
817 BodyBB->addSuccessor(Succ: LoopBB);
818
819 // Move the rest of the block into a new block.
820 RemainderBB->transferSuccessorsAndUpdatePHIs(FromMBB: &MBB);
821 RemainderBB->splice(Where: RemainderBB->begin(), Other: &MBB, From: Range.end(), To: MBB.end());
822
823 MBB.addSuccessor(Succ: LoopBB);
824 RestoreExecBB->addSuccessor(Succ: RemainderBB);
825
826 B.setInsertPt(MBB&: *LoopBB, II: LoopBB->end());
827
828 B.buildInstr(Opcode: TargetOpcode::PHI)
829 .addDef(RegNo: PhiExec)
830 .addReg(RegNo: InitSaveExecReg)
831 .addMBB(MBB: &MBB)
832 .addReg(RegNo: NewExec)
833 .addMBB(MBB: BodyBB);
834
835 const DebugLoc &DL = B.getDL();
836
837 MachineInstr &FirstInst = *Range.begin();
838
839 // Move the instruction into the loop body. Note we moved everything after
840 // Range.end() already into a new block, so Range.end() is no longer valid.
841 BodyBB->splice(Where: BodyBB->end(), Other: &MBB, From: Range.begin(), To: MBB.end());
842
843 // Figure out the iterator range after splicing the instructions.
844 MachineBasicBlock::iterator NewBegin = FirstInst.getIterator();
845 auto NewEnd = BodyBB->end();
846
847 B.setMBB(*LoopBB);
848
849 LLT S1 = LLT::scalar(SizeInBits: 1);
850 Register CondReg;
851
852 assert(std::distance(NewBegin, NewEnd) == OrigRangeSize);
853
854 for (MachineInstr &MI : make_range(x: NewBegin, y: NewEnd)) {
855 for (MachineOperand &Op : MI.all_uses()) {
856 Register OldReg = Op.getReg();
857 if (!SGPROperandRegs.count(V: OldReg))
858 continue;
859
860 // See if we already processed this register in another instruction in the
861 // sequence.
862 auto OldVal = WaterfalledRegMap.find(Val: OldReg);
863 if (OldVal != WaterfalledRegMap.end()) {
864 Op.setReg(OldVal->second);
865 continue;
866 }
867
868 Register OpReg = Op.getReg();
869 LLT OpTy = MRI.getType(Reg: OpReg);
870
871 const RegisterBank *OpBank = getRegBank(Reg: OpReg, MRI, TRI: *TRI);
872 if (OpBank != &AMDGPU::VGPRRegBank) {
873 // Insert copy from AGPR to VGPR before the loop.
874 B.setMBB(MBB);
875 OpReg = B.buildCopy(Res: OpTy, Op: OpReg).getReg(Idx: 0);
876 MRI.setRegBank(Reg: OpReg, RegBank: AMDGPU::VGPRRegBank);
877 B.setMBB(*LoopBB);
878 }
879
880 Register CurrentLaneReg = buildReadFirstLane(B, MRI, Src: OpReg);
881
882 // Build the comparison(s).
883 unsigned OpSize = OpTy.getSizeInBits();
884 bool Is64 = OpSize % 64 == 0;
885 unsigned PartSize = Is64 ? 64 : 32;
886 LLT PartTy = LLT::scalar(SizeInBits: PartSize);
887 unsigned NumParts = OpSize / PartSize;
888 SmallVector<Register, 8> OpParts;
889 SmallVector<Register, 8> CurrentLaneParts;
890
891 if (NumParts == 1) {
892 OpParts.push_back(Elt: OpReg);
893 CurrentLaneParts.push_back(Elt: CurrentLaneReg);
894 } else {
895 auto UnmergeOp = B.buildUnmerge(Res: PartTy, Op: OpReg);
896 auto UnmergeCurrentLane = B.buildUnmerge(Res: PartTy, Op: CurrentLaneReg);
897 for (unsigned i = 0; i < NumParts; ++i) {
898 OpParts.push_back(Elt: UnmergeOp.getReg(Idx: i));
899 CurrentLaneParts.push_back(Elt: UnmergeCurrentLane.getReg(Idx: i));
900 MRI.setRegBank(Reg: OpParts[i], RegBank: AMDGPU::VGPRRegBank);
901 MRI.setRegBank(Reg: CurrentLaneParts[i], RegBank: AMDGPU::SGPRRegBank);
902 }
903 }
904
905 for (unsigned i = 0; i < NumParts; ++i) {
906 auto CmpReg = B.buildICmp(Pred: CmpInst::ICMP_EQ, Res: S1, Op0: CurrentLaneParts[i],
907 Op1: OpParts[i]).getReg(Idx: 0);
908 MRI.setRegBank(Reg: CmpReg, RegBank: AMDGPU::VCCRegBank);
909
910 if (!CondReg) {
911 CondReg = CmpReg;
912 } else {
913 CondReg = B.buildAnd(Dst: S1, Src0: CondReg, Src1: CmpReg).getReg(Idx: 0);
914 MRI.setRegBank(Reg: CondReg, RegBank: AMDGPU::VCCRegBank);
915 }
916 }
917
918 Op.setReg(CurrentLaneReg);
919
920 // Make sure we don't re-process this register again.
921 WaterfalledRegMap.insert(KV: std::pair(OldReg, Op.getReg()));
922 }
923 }
924
925 // The ballot becomes a no-op during instruction selection.
926 CondReg = B.buildIntrinsic(ID: Intrinsic::amdgcn_ballot,
927 Res: {LLT::scalar(SizeInBits: Subtarget.isWave32() ? 32 : 64)})
928 .addReg(RegNo: CondReg)
929 .getReg(Idx: 0);
930 MRI.setRegClass(Reg: CondReg, RC: WaveRC);
931
932 // Update EXEC, save the original EXEC value to VCC.
933 B.buildInstr(Opcode: LMC.AndSaveExecOpc)
934 .addDef(RegNo: NewExec)
935 .addReg(RegNo: CondReg, Flags: RegState::Kill);
936
937 MRI.setSimpleHint(VReg: NewExec, PrefReg: CondReg);
938
939 B.setInsertPt(MBB&: *BodyBB, II: BodyBB->end());
940
941 // Update EXEC, switch all done bits to 0 and all todo bits to 1.
942 B.buildInstr(Opcode: LMC.XorTermOpc)
943 .addDef(RegNo: LMC.ExecReg)
944 .addReg(RegNo: LMC.ExecReg)
945 .addReg(RegNo: NewExec);
946
947 // XXX - s_xor_b64 sets scc to 1 if the result is nonzero, so can we use
948 // s_cbranch_scc0?
949
950 // Loop back to V_READFIRSTLANE_B32 if there are still variants to cover.
951 B.buildInstr(Opcode: AMDGPU::SI_WATERFALL_LOOP).addMBB(MBB: LoopBB);
952
953 // Save the EXEC mask before the loop.
954 BuildMI(BB&: MBB, I: MBB.end(), MIMD: DL, MCID: TII->get(Opcode: LMC.MovOpc), DestReg: SaveExecReg)
955 .addReg(RegNo: LMC.ExecReg);
956
957 // Restore the EXEC mask after the loop.
958 B.setMBB(*RestoreExecBB);
959 B.buildInstr(Opcode: LMC.MovTermOpc).addDef(RegNo: LMC.ExecReg).addReg(RegNo: SaveExecReg);
960
961 // Set the insert point after the original instruction, so any new
962 // instructions will be in the remainder.
963 B.setInsertPt(MBB&: *RemainderBB, II: RemainderBB->begin());
964
965 return true;
966}
967
968// Return any unique registers used by \p MI at \p OpIndices that need to be
969// handled in a waterfall loop. Returns these registers in \p
970// SGPROperandRegs. Returns true if there are any operands to handle and a
971// waterfall loop is necessary.
972bool AMDGPURegisterBankInfo::collectWaterfallOperands(
973 SmallSet<Register, 4> &SGPROperandRegs, MachineInstr &MI,
974 MachineRegisterInfo &MRI, ArrayRef<unsigned> OpIndices) const {
975 for (unsigned Op : OpIndices) {
976 assert(MI.getOperand(Op).isUse());
977 Register Reg = MI.getOperand(i: Op).getReg();
978 const RegisterBank *OpBank = getRegBank(Reg, MRI, TRI: *TRI);
979 if (OpBank->getID() != AMDGPU::SGPRRegBankID)
980 SGPROperandRegs.insert(V: Reg);
981 }
982
983 // No operands need to be replaced, so no need to loop.
984 return !SGPROperandRegs.empty();
985}
986
987bool AMDGPURegisterBankInfo::executeInWaterfallLoop(
988 MachineIRBuilder &B, MachineInstr &MI, ArrayRef<unsigned> OpIndices) const {
989 // Use a set to avoid extra readfirstlanes in the case where multiple operands
990 // are the same register.
991 SmallSet<Register, 4> SGPROperandRegs;
992
993 if (!collectWaterfallOperands(SGPROperandRegs, MI, MRI&: *B.getMRI(), OpIndices))
994 return false;
995
996 MachineBasicBlock::iterator I = MI.getIterator();
997 return executeInWaterfallLoop(B, Range: make_range(x: I, y: std::next(x: I)),
998 SGPROperandRegs);
999}
1000
1001// Legalize an operand that must be an SGPR by inserting a readfirstlane.
1002void AMDGPURegisterBankInfo::constrainOpWithReadfirstlane(
1003 MachineIRBuilder &B, MachineInstr &MI, unsigned OpIdx) const {
1004 Register Reg = MI.getOperand(i: OpIdx).getReg();
1005 MachineRegisterInfo &MRI = *B.getMRI();
1006 const RegisterBank *Bank = getRegBank(Reg, MRI, TRI: *TRI);
1007 if (Bank == &AMDGPU::SGPRRegBank)
1008 return;
1009
1010 Reg = buildReadFirstLane(B, MRI, Src: Reg);
1011 MI.getOperand(i: OpIdx).setReg(Reg);
1012}
1013
1014/// Split \p Ty into 2 pieces. The first will have \p FirstSize bits, and the
1015/// rest will be in the remainder.
1016static std::pair<LLT, LLT> splitUnequalType(LLT Ty, unsigned FirstSize) {
1017 unsigned TotalSize = Ty.getSizeInBits();
1018 if (!Ty.isVector())
1019 return {LLT::scalar(SizeInBits: FirstSize), LLT::scalar(SizeInBits: TotalSize - FirstSize)};
1020
1021 LLT EltTy = Ty.getElementType();
1022 unsigned EltSize = EltTy.getSizeInBits();
1023 assert(FirstSize % EltSize == 0);
1024
1025 unsigned FirstPartNumElts = FirstSize / EltSize;
1026 unsigned RemainderElts = (TotalSize - FirstSize) / EltSize;
1027
1028 return {LLT::scalarOrVector(EC: ElementCount::getFixed(MinVal: FirstPartNumElts), ScalarTy: EltTy),
1029 LLT::scalarOrVector(EC: ElementCount::getFixed(MinVal: RemainderElts), ScalarTy: EltTy)};
1030}
1031
1032static LLT widen96To128(LLT Ty) {
1033 if (!Ty.isVector())
1034 return LLT::scalar(SizeInBits: 128);
1035
1036 LLT EltTy = Ty.getElementType();
1037 assert(128 % EltTy.getSizeInBits() == 0);
1038 return LLT::fixed_vector(NumElements: 128 / EltTy.getSizeInBits(), ScalarTy: EltTy);
1039}
1040
/// Rewrite the load \p MI where its assigned register bank requires it.
///
/// SGPR destination: only 32-bit results and 96-bit results (the latter only
/// when the subtarget lacks scalar dwordx3 loads) are considered. A 32-bit
/// result with a narrower memory access is replaced by a 32-bit load followed
/// by a sign/zero extension in register for G_SEXTLOAD/G_ZEXTLOAD. A 96-bit
/// load is either split into a 64-bit and a 32-bit part (alignment below 16)
/// or widened to 128 bits and then truncated / trimmed back.
///
/// Other destinations: loads wider than 128 bits from an extended global or
/// buffer-resource address space are split into 128-bit pieces, and the
/// destination is assigned the VGPR bank.
///
/// \returns false when nothing was rewritten here (or a legalizer helper
/// failed), true otherwise.
1041bool AMDGPURegisterBankInfo::applyMappingLoad(
1042 MachineIRBuilder &B,
1043 const AMDGPURegisterBankInfo::OperandsMapper &OpdMapper,
1044 MachineInstr &MI) const {
1045 MachineRegisterInfo &MRI = *B.getMRI();
1046 Register DstReg = MI.getOperand(i: 0).getReg();
1047 const LLT LoadTy = MRI.getType(Reg: DstReg);
1048 unsigned LoadSize = LoadTy.getSizeInBits();
1049 MachineMemOperand *MMO = *MI.memoperands_begin();
1050 const unsigned MaxNonSmrdLoadSize = 128;
1051
1052 const RegisterBank *DstBank =
1053 OpdMapper.getInstrMapping().getOperandMapping(i: 0).BreakDown[0].RegBank;
1054 if (DstBank == &AMDGPU::SGPRRegBank) {
1055 // There are some special cases that we need to look at for 32 bit and 96
1056 // bit SGPR loads otherwise we have nothing to do.
1057 if (LoadSize != 32 && (LoadSize != 96 || Subtarget.hasScalarDwordx3Loads()))
1058 return false;
1059
 // Memory access size in bits.
1060 const unsigned MemSize = 8 * MMO->getSize().getValue();
1061 // Scalar loads of size 8 or 16 bit with proper alignment may be widened to
1062 // 32 bit. Check to see if we need to widen the memory access, 8 or 16 bit
1063 // scalar loads should have a load size of 32 but memory access size of less
1064 // than 32.
1065 if (LoadSize == 32 &&
1066 (MemSize == 32 || LoadTy.isVector() || !isScalarLoadLegal(MI)))
1067 return false;
1068
 // On GFX12 and later, legal 8- and 16-bit scalar loads are left untouched.
1069 if (LoadSize == 32 &&
1070 ((MemSize == 8 && MMO->getAlign() >= Align(1)) ||
1071 (MemSize == 16 && MMO->getAlign() >= Align(2))) &&
1072 isScalarLoadLegal(MI) &&
1073 Subtarget.getGeneration() >= AMDGPUSubtarget::GFX12)
1074 return false;
1075
1076 Register PtrReg = MI.getOperand(i: 1).getReg();
1077
1078 ApplyRegBankMapping ApplyBank(B, *this, MRI, DstBank);
1079
1080 if (LoadSize == 32) {
1081 // This is an extending load from a sub-dword size. Widen the memory
1082 // access size to 4 bytes and clear the extra high bits appropriately
1083 const LLT S32 = LLT::scalar(SizeInBits: 32);
1084 if (MI.getOpcode() == AMDGPU::G_SEXTLOAD) {
1085 // Must extend the sign bit into higher bits for a G_SEXTLOAD
1086 auto WideLoad = B.buildLoadFromOffset(Dst: S32, BasePtr: PtrReg, BaseMMO&: *MMO, Offset: 0);
1087 B.buildSExtInReg(Res: MI.getOperand(i: 0), Op: WideLoad, ImmOp: MemSize);
1088 } else if (MI.getOpcode() == AMDGPU::G_ZEXTLOAD) {
1089 // Must extend zero into higher bits with an AND for a G_ZEXTLOAD
1090 auto WideLoad = B.buildLoadFromOffset(Dst: S32, BasePtr: PtrReg, BaseMMO&: *MMO, Offset: 0);
1091 B.buildZExtInReg(Res: MI.getOperand(i: 0), Op: WideLoad, ImmOp: MemSize);
1092 } else
1093 // We do not need to touch the higher bits for regular loads.
1094 B.buildLoadFromOffset(Dst: MI.getOperand(i: 0), BasePtr: PtrReg, BaseMMO&: *MMO, Offset: 0);
1095 } else {
1096 // 96-bit loads are only available for vector loads. We need to split this
1097 // into a 64-bit part, and 32 (unless we can widen to a 128-bit load).
1098 if (MMO->getAlign() < Align(16)) {
1099 LegalizerHelper Helper(B.getMF(), ApplyBank, B);
1100 LLT Part64, Part32;
1101 std::tie(args&: Part64, args&: Part32) = splitUnequalType(Ty: LoadTy, FirstSize: 64);
 // Note: this path returns without reaching the eraseFromParent below.
1102 if (Helper.reduceLoadStoreWidth(MI&: cast<GAnyLoad>(Val&: MI), TypeIdx: 0, NarrowTy: Part64) !=
1103 LegalizerHelper::Legalized)
1104 return false;
1105 return true;
1106 }
 // Alignment is at least 16: load 128 bits and drop the extra bits.
1107 LLT WiderTy = widen96To128(Ty: LoadTy);
1108 auto WideLoad = B.buildLoadFromOffset(Dst: WiderTy, BasePtr: PtrReg, BaseMMO&: *MMO, Offset: 0);
1109 if (WiderTy.isScalar()) {
1110 B.buildTrunc(Res: MI.getOperand(i: 0), Op: WideLoad);
1111 } else {
1112 B.buildDeleteTrailingVectorElements(Res: MI.getOperand(i: 0).getReg(),
1113 Op0: WideLoad);
1114 }
1115 }
1116
1117 MI.eraseFromParent();
1118 return true;
1119 }
1120
1121 // 128-bit loads are supported for all instruction types.
1122 if (LoadSize <= MaxNonSmrdLoadSize)
1123 return false;
1124
1125 SmallVector<Register, 1> SrcRegs(OpdMapper.getVRegs(OpIdx: 1));
1126
 // Fall back to the original pointer operand when the mapper created no
 // new registers for it.
1127 if (SrcRegs.empty())
1128 SrcRegs.push_back(Elt: MI.getOperand(i: 1).getReg());
1129
1130 // RegBankSelect only emits scalar types, so we need to reset the pointer
1131 // operand to a pointer type.
1132 Register BasePtrReg = SrcRegs[0];
1133 LLT PtrTy = MRI.getType(Reg: MI.getOperand(i: 1).getReg());
1134 MRI.setType(VReg: BasePtrReg, Ty: PtrTy);
1135
1136 // The following loads were not split enough during legalization because it
1137 // was not clear whether they are smem or vmem loads.
1138 if (AMDGPU::isExtendedGlobalAddrSpace(AS: MMO->getAddrSpace()) ||
1139 MMO->getAddrSpace() == AMDGPUAS::BUFFER_RESOURCE) {
1140 assert(LoadSize % MaxNonSmrdLoadSize == 0);
1141 unsigned NumSplitParts = LoadTy.getSizeInBits() / MaxNonSmrdLoadSize;
1142 const LLT LoadSplitTy = LoadTy.divide(Factor: NumSplitParts);
1143 ApplyRegBankMapping O(B, *this, MRI, &AMDGPU::VGPRRegBank);
1144 LegalizerHelper Helper(B.getMF(), O, B);
1145 if (LoadTy.isVector()) {
1146 if (Helper.fewerElementsVector(MI, TypeIdx: 0, NarrowTy: LoadSplitTy) !=
1147 LegalizerHelper::Legalized)
1148 return false;
1149 } else {
1150 if (Helper.narrowScalar(MI, TypeIdx: 0, NarrowTy: LoadSplitTy) != LegalizerHelper::Legalized)
1151 return false;
1152 }
1153 }
1154
1155 MRI.setRegBank(Reg: DstReg, RegBank: AMDGPU::VGPRRegBank);
1156 return true;
1157}
1158
1159bool AMDGPURegisterBankInfo::applyMappingDynStackAlloc(
1160 MachineIRBuilder &B,
1161 const AMDGPURegisterBankInfo::OperandsMapper &OpdMapper,
1162 MachineInstr &MI) const {
1163 MachineRegisterInfo &MRI = *B.getMRI();
1164 const MachineFunction &MF = B.getMF();
1165 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
1166 const auto &TFI = *ST.getFrameLowering();
1167
1168 // Guard in case the stack growth direction ever changes with scratch
1169 // instructions.
1170 assert(TFI.getStackGrowthDirection() == TargetFrameLowering::StackGrowsUp &&
1171 "Stack grows upwards for AMDGPU");
1172
1173 Register Dst = MI.getOperand(i: 0).getReg();
1174 Register AllocSize = MI.getOperand(i: 1).getReg();
1175 Align Alignment = assumeAligned(Value: MI.getOperand(i: 2).getImm());
1176
1177 // When using flat-scratch, the stack offset is unscaled.
1178 const bool HasFlatScratch = ST.hasFlatScratchEnabled();
1179 const unsigned WavefrontSizeLog2 = ST.getWavefrontSizeLog2();
1180
1181 const RegisterBank *SizeBank = getRegBank(Reg: AllocSize, MRI, TRI: *TRI);
1182
1183 if (SizeBank != &AMDGPU::SGPRRegBank) {
1184 auto WaveReduction =
1185 B.buildIntrinsic(ID: Intrinsic::amdgcn_wave_reduce_umax, Res: {LLT::scalar(SizeInBits: 32)})
1186 .addUse(RegNo: AllocSize)
1187 .addImm(Val: 0);
1188 AllocSize = WaveReduction.getReg(Idx: 0);
1189 }
1190
1191 LLT PtrTy = MRI.getType(Reg: Dst);
1192 LLT IntPtrTy = LLT::scalar(SizeInBits: PtrTy.getSizeInBits());
1193
1194 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
1195 Register SPReg = Info->getStackPtrOffsetReg();
1196 ApplyRegBankMapping ApplyBank(B, *this, MRI, &AMDGPU::SGPRRegBank);
1197
1198 Register ScaledSize = AllocSize;
1199 if (!HasFlatScratch) {
1200 auto WaveSize = B.buildConstant(Res: LLT::scalar(SizeInBits: 32), Val: WavefrontSizeLog2);
1201 ScaledSize = B.buildShl(Dst: IntPtrTy, Src0: AllocSize, Src1: WaveSize).getReg(Idx: 0);
1202 }
1203
1204 auto OldSP = B.buildCopy(Res: PtrTy, Op: SPReg);
1205 if (Alignment > TFI.getStackAlign()) {
1206 const uint64_t ScaledAlignment =
1207 HasFlatScratch ? Alignment.value()
1208 : (Alignment.value() << WavefrontSizeLog2);
1209 const uint64_t StackAlignMask = ScaledAlignment - 1;
1210 auto Tmp1 = B.buildPtrAdd(Res: PtrTy, Op0: OldSP,
1211 Op1: B.buildConstant(Res: LLT::scalar(SizeInBits: 32), Val: StackAlignMask));
1212 B.buildMaskLowPtrBits(Res: Dst, Op0: Tmp1,
1213 NumBits: (HasFlatScratch
1214 ? Log2(A: Alignment)
1215 : Log2(A: Alignment) + WavefrontSizeLog2));
1216 } else {
1217 B.buildCopy(Res: Dst, Op: OldSP);
1218 }
1219 auto PtrAdd = B.buildPtrAdd(Res: PtrTy, Op0: Dst, Op1: ScaledSize);
1220 B.buildCopy(Res: SPReg, Op: PtrAdd);
1221 MI.eraseFromParent();
1222 return true;
1223}
1224
1225bool AMDGPURegisterBankInfo::applyMappingImage(
1226 MachineIRBuilder &B, MachineInstr &MI,
1227 const AMDGPURegisterBankInfo::OperandsMapper &OpdMapper,
1228 int RsrcIdx) const {
1229 const int NumDefs = MI.getNumExplicitDefs();
1230
1231 // The reported argument index is relative to the IR intrinsic call arguments,
1232 // so we need to shift by the number of defs and the intrinsic ID.
1233 RsrcIdx += NumDefs + 1;
1234
1235 // Insert copies to VGPR arguments.
1236 applyDefaultMapping(OpdMapper);
1237
1238 // Fixup any SGPR arguments.
1239 SmallVector<unsigned, 4> SGPRIndexes;
1240 for (int I = NumDefs, NumOps = MI.getNumOperands(); I != NumOps; ++I) {
1241 if (!MI.getOperand(i: I).isReg())
1242 continue;
1243
1244 // If this intrinsic has a sampler, it immediately follows rsrc.
1245 if (I == RsrcIdx || I == RsrcIdx + 1)
1246 SGPRIndexes.push_back(Elt: I);
1247 }
1248
1249 executeInWaterfallLoop(B, MI, OpIndices: SGPRIndexes);
1250 return true;
1251}
1252
1253// Analyze a combined offset from an llvm.amdgcn.s.buffer intrinsic and store
1254// the three offsets (voffset, soffset and instoffset)
// in \p VOffsetReg, \p SOffsetReg and \p InstOffsetVal respectively. Registers
// created here are assigned the VGPR bank (voffset) or the SGPR bank (soffset).
//
// Returns SOffset + ImmOffset when \p CombinedOffset is a constant that
// splitMUBUFOffset accepts; every other path returns 0. \p InstOffsetVal is
// only written on the paths that split a constant offset, so callers should
// initialize it beforehand.
1255unsigned AMDGPURegisterBankInfo::setBufferOffsets(
1256 MachineIRBuilder &B, Register CombinedOffset, Register &VOffsetReg,
1257 Register &SOffsetReg, int64_t &InstOffsetVal, Align Alignment) const {
1258 const LLT S32 = LLT::scalar(SizeInBits: 32);
1259 MachineRegisterInfo *MRI = B.getMRI();
1260
 // Case 1: the whole offset is a known constant.
1261 if (std::optional<int64_t> Imm =
1262 getIConstantVRegSExtVal(VReg: CombinedOffset, MRI: *MRI)) {
1263 uint32_t SOffset, ImmOffset;
1264 if (TII->splitMUBUFOffset(Imm: *Imm, SOffset, ImmOffset, Alignment)) {
1265 VOffsetReg = B.buildConstant(Res: S32, Val: 0).getReg(Idx: 0);
1266 SOffsetReg = B.buildConstant(Res: S32, Val: SOffset).getReg(Idx: 0);
1267 InstOffsetVal = ImmOffset;
1268
1269 B.getMRI()->setRegBank(Reg: VOffsetReg, RegBank: AMDGPU::VGPRRegBank);
1270 B.getMRI()->setRegBank(Reg: SOffsetReg, RegBank: AMDGPU::SGPRRegBank);
1271 return SOffset + ImmOffset;
1272 }
1273 }
1274
 // Case 2: base register plus a positive constant offset.
1275 const bool CheckNUW = Subtarget.hasGFX1250Insts();
1276 Register Base;
1277 unsigned Offset;
1278
1279 std::tie(args&: Base, args&: Offset) =
1280 AMDGPU::getBaseWithConstantOffset(MRI&: *MRI, Reg: CombinedOffset,
1281 /*KnownBits=*/ValueTracking: nullptr,
1282 /*CheckNUW=*/CheckNUW);
1283
1284 uint32_t SOffset, ImmOffset;
1285 if (static_cast<int32_t>(Offset) > 0 &&
1286 TII->splitMUBUFOffset(Imm: Offset, SOffset, ImmOffset, Alignment)) {
 // A VGPR base becomes voffset; the constant is split between soffset and
 // the immediate.
1287 if (getRegBank(Reg: Base, MRI: *MRI, TRI: *TRI) == &AMDGPU::VGPRRegBank) {
1288 VOffsetReg = Base;
1289 SOffsetReg = B.buildConstant(Res: S32, Val: SOffset).getReg(Idx: 0);
1290 B.getMRI()->setRegBank(Reg: SOffsetReg, RegBank: AMDGPU::SGPRRegBank);
1291 InstOffsetVal = ImmOffset;
1292 return 0; // XXX - Why is this 0?
1293 }
1294
1295 // If we have SGPR base, we can use it for soffset.
 // This only works when the constant fits entirely in the immediate
 // (SOffset == 0), since soffset is taken by the base.
1296 if (SOffset == 0) {
1297 VOffsetReg = B.buildConstant(Res: S32, Val: 0).getReg(Idx: 0);
1298 B.getMRI()->setRegBank(Reg: VOffsetReg, RegBank: AMDGPU::VGPRRegBank);
1299 SOffsetReg = Base;
1300 InstOffsetVal = ImmOffset;
1301 return 0; // XXX - Why is this 0?
1302 }
1303 }
1304
1305 // Handle the variable sgpr + vgpr case.
 // When CheckNUW is set, the add must carry the no-unsigned-wrap flag.
1306 MachineInstr *Add = getOpcodeDef(Opcode: AMDGPU::G_ADD, Reg: CombinedOffset, MRI: *MRI);
1307 if (Add && static_cast<int32_t>(Offset) >= 0 &&
1308 (!CheckNUW || Add->getFlag(Flag: MachineInstr::NoUWrap))) {
1309 Register Src0 = getSrcRegIgnoringCopies(Reg: Add->getOperand(i: 1).getReg(), MRI: *MRI);
1310 Register Src1 = getSrcRegIgnoringCopies(Reg: Add->getOperand(i: 2).getReg(), MRI: *MRI);
1311
1312 const RegisterBank *Src0Bank = getRegBank(Reg: Src0, MRI: *MRI, TRI: *TRI);
1313 const RegisterBank *Src1Bank = getRegBank(Reg: Src1, MRI: *MRI, TRI: *TRI);
1314
1315 if (Src0Bank == &AMDGPU::VGPRRegBank && Src1Bank == &AMDGPU::SGPRRegBank) {
1316 VOffsetReg = Src0;
1317 SOffsetReg = Src1;
1318 return 0;
1319 }
1320
1321 if (Src0Bank == &AMDGPU::SGPRRegBank && Src1Bank == &AMDGPU::VGPRRegBank) {
1322 VOffsetReg = Src1;
1323 SOffsetReg = Src0;
1324 return 0;
1325 }
1326 }
1327
 // Fallback: use the whole combined offset as voffset with a zero soffset.
1328 // Ensure we have a VGPR for the combined offset. This could be an issue if we
1329 // have an SGPR offset and a VGPR resource.
1330 if (getRegBank(Reg: CombinedOffset, MRI: *MRI, TRI: *TRI) == &AMDGPU::VGPRRegBank) {
1331 VOffsetReg = CombinedOffset;
1332 } else {
1333 VOffsetReg = B.buildCopy(Res: S32, Op: CombinedOffset).getReg(Idx: 0);
1334 B.getMRI()->setRegBank(Reg: VOffsetReg, RegBank: AMDGPU::VGPRRegBank);
1335 }
1336
1337 SOffsetReg = B.buildConstant(Res: S32, Val: 0).getReg(Idx: 0);
1338 B.getMRI()->setRegBank(Reg: SOffsetReg, RegBank: AMDGPU::SGPRRegBank);
1339 return 0;
1340}
1341
1342static unsigned getSBufferLoadCorrespondingBufferLoadOpcode(unsigned Opc) {
1343 switch (Opc) {
1344 case AMDGPU::G_AMDGPU_S_BUFFER_LOAD:
1345 return AMDGPU::G_AMDGPU_BUFFER_LOAD;
1346 case AMDGPU::G_AMDGPU_S_BUFFER_LOAD_UBYTE:
1347 return AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE;
1348 case AMDGPU::G_AMDGPU_S_BUFFER_LOAD_SBYTE:
1349 return AMDGPU::G_AMDGPU_BUFFER_LOAD_SBYTE;
1350 case AMDGPU::G_AMDGPU_S_BUFFER_LOAD_USHORT:
1351 return AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT;
1352 case AMDGPU::G_AMDGPU_S_BUFFER_LOAD_SSHORT:
1353 return AMDGPU::G_AMDGPU_BUFFER_LOAD_SSHORT;
1354 default:
1355 break;
1356 }
1357 llvm_unreachable("Unexpected s_buffer_load opcode");
1358}
1359
/// Apply the register bank mapping to an s_buffer_load whose resource and/or
/// offset operand was not mapped to the SGPR bank.
///
/// When both operands are SGPRs the instruction is left untouched. Otherwise
/// it is replaced by the corresponding G_AMDGPU_BUFFER_LOAD* opcode (256- and
/// 512-bit results are split into 128-bit pieces that are merged back into
/// the destination), and if the resource is not an SGPR the new loads are
/// additionally executed in a waterfall loop. Always returns true.
1360bool AMDGPURegisterBankInfo::applyMappingSBufferLoad(
1361 MachineIRBuilder &B, const OperandsMapper &OpdMapper) const {
1362 MachineInstr &MI = OpdMapper.getMI();
1363 MachineRegisterInfo &MRI = OpdMapper.getMRI();
1364
1365 const LLT S32 = LLT::scalar(SizeInBits: 32);
1366 Register Dst = MI.getOperand(i: 0).getReg();
1367 LLT Ty = MRI.getType(Reg: Dst);
1368
1369 const RegisterBank *RSrcBank =
1370 OpdMapper.getInstrMapping().getOperandMapping(i: 1).BreakDown[0].RegBank;
1371 const RegisterBank *OffsetBank =
1372 OpdMapper.getInstrMapping().getOperandMapping(i: 2).BreakDown[0].RegBank;
1373 if (RSrcBank == &AMDGPU::SGPRRegBank &&
1374 OffsetBank == &AMDGPU::SGPRRegBank)
1375 return true; // Legal mapping
1376
1377 // FIXME: 96-bit case was widened during legalize. We need to narrow it back
1378 // here but don't have an MMO.
1379
 // 256- and 512-bit results are loaded as several 128-bit pieces; from here
 // on Ty is the type of a single piece.
1380 unsigned LoadSize = Ty.getSizeInBits();
1381 int NumLoads = 1;
1382 if (LoadSize == 256 || LoadSize == 512) {
1383 NumLoads = LoadSize / 128;
1384 Ty = Ty.divide(Factor: NumLoads);
1385 }
1386
1387 // Use the alignment to ensure that the required offsets will fit into the
1388 // immediate offsets.
1389 const Align Alignment = NumLoads > 1 ? Align(16 * NumLoads) : Align(1);
1390
1391 MachineFunction &MF = B.getMF();
1392
1393 Register SOffset;
1394 Register VOffset;
1395 int64_t ImmOffset = 0;
1396
1397 unsigned MMOOffset = setBufferOffsets(B, CombinedOffset: MI.getOperand(i: 2).getReg(), VOffsetReg&: VOffset,
1398 SOffsetReg&: SOffset, InstOffsetVal&: ImmOffset, Alignment);
1399
1400 // TODO: 96-bit loads were widened to 128-bit results. Shrink the result if we
1401 // can, but we need to track an MMO for that.
 // Memory size of one piece in bytes, rounded up.
1402 const unsigned MemSize = (Ty.getSizeInBits() + 7) / 8;
1403 const Align MemAlign(4); // FIXME: ABI type alignment?
1404 MachineMemOperand *BaseMMO = MF.getMachineMemOperand(
1405 PtrInfo: MachinePointerInfo(),
1406 F: MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
1407 MachineMemOperand::MOInvariant,
1408 Size: MemSize, BaseAlignment: MemAlign);
1409 if (MMOOffset != 0)
1410 BaseMMO = MF.getMachineMemOperand(MMO: BaseMMO, Offset: MMOOffset, Size: MemSize);
1411
1412 // If only the offset is divergent, emit a MUBUF buffer load instead. We can
1413 // assume that the buffer is unswizzled.
1414
1415 Register RSrc = MI.getOperand(i: 1).getReg();
1416 Register VIndex = B.buildConstant(Res: S32, Val: 0).getReg(Idx: 0);
1417 B.getMRI()->setRegBank(Reg: VIndex, RegBank: AMDGPU::VGPRRegBank);
1418
1419 SmallVector<Register, 4> LoadParts(NumLoads);
1420
 // Span tracks the instructions created around MI so that the new loads can
 // be handed to the waterfall loop as a range below.
1421 MachineBasicBlock::iterator MII = MI.getIterator();
1422 MachineInstrSpan Span(MII, &B.getMBB());
1423
1424 for (int i = 0; i < NumLoads; ++i) {
1425 if (NumLoads == 1) {
1426 LoadParts[i] = Dst;
1427 } else {
1428 LoadParts[i] = MRI.createGenericVirtualRegister(Ty);
1429 MRI.setRegBank(Reg: LoadParts[i], RegBank: AMDGPU::VGPRRegBank);
1430 }
1431
 // Each subsequent piece is 16 bytes past the previous one.
1432 if (i != 0)
1433 BaseMMO = MF.getMachineMemOperand(MMO: BaseMMO, Offset: 16, Size: MemSize);
1434
1435 B.buildInstr(Opcode: getSBufferLoadCorrespondingBufferLoadOpcode(Opc: MI.getOpcode()))
1436 .addDef(RegNo: LoadParts[i]) // vdata
1437 .addUse(RegNo: RSrc) // rsrc
1438 .addUse(RegNo: VIndex) // vindex
1439 .addUse(RegNo: VOffset) // voffset
1440 .addUse(RegNo: SOffset) // soffset
1441 .addImm(Val: ImmOffset + 16 * i) // offset(imm)
1442 .addImm(Val: 0) // cachepolicy, swizzled buffer(imm)
1443 .addImm(Val: 0) // idxen(imm)
1444 .addMemOperand(MMO: BaseMMO);
1445 }
1446
1447 // TODO: If only the resource is a VGPR, it may be better to execute the
1448 // scalar load in the waterfall loop if the resource is expected to frequently
1449 // be dynamically uniform.
1450 if (RSrcBank != &AMDGPU::SGPRRegBank) {
1451 // Remove the original instruction to avoid potentially confusing the
1452 // waterfall loop logic.
1453 B.setInstr(*Span.begin());
1454 MI.eraseFromParent();
1455
1456 SmallSet<Register, 4> OpsToWaterfall;
1457
1458 OpsToWaterfall.insert(V: RSrc);
1459 executeInWaterfallLoop(B, Range: make_range(x: Span.begin(), y: Span.end()),
1460 SGPROperandRegs&: OpsToWaterfall);
1461 }
1462
 // Reassemble the full-width result from the 128-bit pieces.
1463 if (NumLoads != 1) {
1464 if (Ty.isVector())
1465 B.buildConcatVectors(Res: Dst, Ops: LoadParts);
1466 else
1467 B.buildMergeLikeInstr(Res: Dst, Ops: LoadParts);
1468 }
1469
1470 // In the waterfall case MI was already erased above; otherwise erase it now.
1471 if (RSrcBank == &AMDGPU::SGPRRegBank)
1472 MI.eraseFromParent();
1473
1474 return true;
1475}
1476
1477bool AMDGPURegisterBankInfo::applyMappingBFE(MachineIRBuilder &B,
1478 const OperandsMapper &OpdMapper,
1479 bool Signed) const {
1480 MachineInstr &MI = OpdMapper.getMI();
1481 MachineRegisterInfo &MRI = OpdMapper.getMRI();
1482
1483 // Insert basic copies
1484 applyDefaultMapping(OpdMapper);
1485
1486 Register DstReg = MI.getOperand(i: 0).getReg();
1487 LLT Ty = MRI.getType(Reg: DstReg);
1488
1489 const LLT S32 = LLT::scalar(SizeInBits: 32);
1490
1491 unsigned FirstOpnd = isa<GIntrinsic>(Val: MI) ? 2 : 1;
1492 Register SrcReg = MI.getOperand(i: FirstOpnd).getReg();
1493 Register OffsetReg = MI.getOperand(i: FirstOpnd + 1).getReg();
1494 Register WidthReg = MI.getOperand(i: FirstOpnd + 2).getReg();
1495
1496 const RegisterBank *DstBank =
1497 OpdMapper.getInstrMapping().getOperandMapping(i: 0).BreakDown[0].RegBank;
1498 if (DstBank == &AMDGPU::VGPRRegBank) {
1499 if (Ty == S32)
1500 return true;
1501
1502 // There is no 64-bit vgpr bitfield extract instructions so the operation
1503 // is expanded to a sequence of instructions that implement the operation.
1504 ApplyRegBankMapping ApplyBank(B, *this, MRI, &AMDGPU::VGPRRegBank);
1505
1506 const LLT S64 = LLT::scalar(SizeInBits: 64);
1507 // Shift the source operand so that extracted bits start at bit 0.
1508 auto ShiftOffset = Signed ? B.buildAShr(Dst: S64, Src0: SrcReg, Src1: OffsetReg)
1509 : B.buildLShr(Dst: S64, Src0: SrcReg, Src1: OffsetReg);
1510 auto UnmergeSOffset = B.buildUnmerge(Res: {S32, S32}, Op: ShiftOffset);
1511
1512 // A 64-bit bitfield extract uses the 32-bit bitfield extract instructions
1513 // if the width is a constant.
1514 if (auto ConstWidth = getIConstantVRegValWithLookThrough(VReg: WidthReg, MRI)) {
1515 // Use the 32-bit bitfield extract instruction if the width is a constant.
1516 // Depending on the width size, use either the low or high 32-bits.
1517 auto Zero = B.buildConstant(Res: S32, Val: 0);
1518 auto WidthImm = ConstWidth->Value.getZExtValue();
1519 if (WidthImm <= 32) {
1520 // Use bitfield extract on the lower 32-bit source, and then sign-extend
1521 // or clear the upper 32-bits.
1522 auto Extract =
1523 Signed ? B.buildSbfx(Dst: S32, Src: UnmergeSOffset.getReg(Idx: 0), LSB: Zero, Width: WidthReg)
1524 : B.buildUbfx(Dst: S32, Src: UnmergeSOffset.getReg(Idx: 0), LSB: Zero, Width: WidthReg);
1525 auto Extend =
1526 Signed ? B.buildAShr(Dst: S32, Src0: Extract, Src1: B.buildConstant(Res: S32, Val: 31)) : Zero;
1527 B.buildMergeLikeInstr(Res: DstReg, Ops: {Extract, Extend});
1528 } else {
1529 // Use bitfield extract on upper 32-bit source, and combine with lower
1530 // 32-bit source.
1531 auto UpperWidth = B.buildConstant(Res: S32, Val: WidthImm - 32);
1532 auto Extract =
1533 Signed
1534 ? B.buildSbfx(Dst: S32, Src: UnmergeSOffset.getReg(Idx: 1), LSB: Zero, Width: UpperWidth)
1535 : B.buildUbfx(Dst: S32, Src: UnmergeSOffset.getReg(Idx: 1), LSB: Zero, Width: UpperWidth);
1536 B.buildMergeLikeInstr(Res: DstReg, Ops: {UnmergeSOffset.getReg(Idx: 0), Extract});
1537 }
1538 MI.eraseFromParent();
1539 return true;
1540 }
1541
1542 // Expand to Src >> Offset << (64 - Width) >> (64 - Width) using 64-bit
1543 // operations.
1544 auto ExtShift = B.buildSub(Dst: S32, Src0: B.buildConstant(Res: S32, Val: 64), Src1: WidthReg);
1545 auto SignBit = B.buildShl(Dst: S64, Src0: ShiftOffset, Src1: ExtShift);
1546 if (Signed)
1547 B.buildAShr(Dst: S64, Src0: SignBit, Src1: ExtShift);
1548 else
1549 B.buildLShr(Dst: S64, Src0: SignBit, Src1: ExtShift);
1550 MI.eraseFromParent();
1551 return true;
1552 }
1553
1554 // The scalar form packs the offset and width in a single operand.
1555
1556 ApplyRegBankMapping ApplyBank(B, *this, MRI, &AMDGPU::SGPRRegBank);
1557
1558 // Ensure the high bits are clear to insert the offset.
1559 auto OffsetMask = B.buildConstant(Res: S32, Val: maskTrailingOnes<unsigned>(N: 6));
1560 auto ClampOffset = B.buildAnd(Dst: S32, Src0: OffsetReg, Src1: OffsetMask);
1561
1562 // Zeros out the low bits, so don't bother clamping the input value.
1563 auto ShiftWidth = B.buildShl(Dst: S32, Src0: WidthReg, Src1: B.buildConstant(Res: S32, Val: 16));
1564
1565 // Transformation function, pack the offset and width of a BFE into
1566 // the format expected by the S_BFE_I32 / S_BFE_U32. In the second
1567 // source, bits [5:0] contain the offset and bits [22:16] the width.
1568 auto MergedInputs = B.buildOr(Dst: S32, Src0: ClampOffset, Src1: ShiftWidth);
1569
1570 // TODO: It might be worth using a pseudo here to avoid scc clobber and
1571 // register class constraints.
1572 unsigned Opc = Ty == S32 ? (Signed ? AMDGPU::S_BFE_I32 : AMDGPU::S_BFE_U32) :
1573 (Signed ? AMDGPU::S_BFE_I64 : AMDGPU::S_BFE_U64);
1574
1575 auto MIB = B.buildInstr(Opc, DstOps: {DstReg}, SrcOps: {SrcReg, MergedInputs});
1576 constrainSelectedInstRegOperands(I&: *MIB, TII: *TII, TRI: *TRI, RBI: *this);
1577
1578 MI.eraseFromParent();
1579 return true;
1580}
1581
1582bool AMDGPURegisterBankInfo::applyMappingMAD_64_32(
1583 MachineIRBuilder &B, const OperandsMapper &OpdMapper) const {
1584 MachineInstr &MI = OpdMapper.getMI();
1585 MachineRegisterInfo &MRI = OpdMapper.getMRI();
1586
1587 // Insert basic copies.
1588 applyDefaultMapping(OpdMapper);
1589
1590 Register Dst0 = MI.getOperand(i: 0).getReg();
1591 Register Dst1 = MI.getOperand(i: 1).getReg();
1592 Register Src0 = MI.getOperand(i: 2).getReg();
1593 Register Src1 = MI.getOperand(i: 3).getReg();
1594 Register Src2 = MI.getOperand(i: 4).getReg();
1595
1596 if (MRI.getRegBankOrNull(Reg: Src0) == &AMDGPU::VGPRRegBank)
1597 return true;
1598
1599 bool IsUnsigned = MI.getOpcode() == AMDGPU::G_AMDGPU_MAD_U64_U32;
1600 LLT S1 = LLT::scalar(SizeInBits: 1);
1601 LLT S32 = LLT::scalar(SizeInBits: 32);
1602
1603 bool DstOnValu = MRI.getRegBankOrNull(Reg: Src2) == &AMDGPU::VGPRRegBank;
1604 bool Accumulate = true;
1605
1606 if (!DstOnValu) {
1607 if (mi_match(R: Src2, MRI, P: m_ZeroInt()))
1608 Accumulate = false;
1609 }
1610
1611 // Keep the multiplication on the SALU.
1612 Register DstHi;
1613 Register DstLo = B.buildMul(Dst: S32, Src0, Src1).getReg(Idx: 0);
1614 bool MulHiInVgpr = false;
1615
1616 MRI.setRegBank(Reg: DstLo, RegBank: AMDGPU::SGPRRegBank);
1617
1618 if (Subtarget.hasSMulHi()) {
1619 DstHi = IsUnsigned ? B.buildUMulH(Dst: S32, Src0, Src1).getReg(Idx: 0)
1620 : B.buildSMulH(Dst: S32, Src0, Src1).getReg(Idx: 0);
1621 MRI.setRegBank(Reg: DstHi, RegBank: AMDGPU::SGPRRegBank);
1622 } else {
1623 Register VSrc0 = B.buildCopy(Res: S32, Op: Src0).getReg(Idx: 0);
1624 Register VSrc1 = B.buildCopy(Res: S32, Op: Src1).getReg(Idx: 0);
1625
1626 MRI.setRegBank(Reg: VSrc0, RegBank: AMDGPU::VGPRRegBank);
1627 MRI.setRegBank(Reg: VSrc1, RegBank: AMDGPU::VGPRRegBank);
1628
1629 DstHi = IsUnsigned ? B.buildUMulH(Dst: S32, Src0: VSrc0, Src1: VSrc1).getReg(Idx: 0)
1630 : B.buildSMulH(Dst: S32, Src0: VSrc0, Src1: VSrc1).getReg(Idx: 0);
1631 MRI.setRegBank(Reg: DstHi, RegBank: AMDGPU::VGPRRegBank);
1632
1633 if (!DstOnValu) {
1634 DstHi = buildReadFirstLane(B, MRI, Src: DstHi);
1635 } else {
1636 MulHiInVgpr = true;
1637 }
1638 }
1639
1640 // Accumulate and produce the "carry-out" bit.
1641 //
1642 // The "carry-out" is defined as bit 64 of the result when computed as a
1643 // big integer. For unsigned multiply-add, this matches the usual definition
1644 // of carry-out. For signed multiply-add, bit 64 is the sign bit of the
1645 // result, which is determined as:
1646 // sign(Src0 * Src1) + sign(Src2) + carry-out from unsigned 64-bit add
1647 LLT CarryType = DstOnValu ? S1 : S32;
1648 const RegisterBank &CarryBank =
1649 DstOnValu ? AMDGPU::VCCRegBank : AMDGPU::SGPRRegBank;
1650 const RegisterBank &DstBank =
1651 DstOnValu ? AMDGPU::VGPRRegBank : AMDGPU::SGPRRegBank;
1652 Register Carry;
1653 Register Zero;
1654
1655 if (!IsUnsigned) {
1656 Zero = B.buildConstant(Res: S32, Val: 0).getReg(Idx: 0);
1657 MRI.setRegBank(Reg: Zero,
1658 RegBank: MulHiInVgpr ? AMDGPU::VGPRRegBank : AMDGPU::SGPRRegBank);
1659
1660 Carry = B.buildICmp(Pred: CmpInst::ICMP_SLT, Res: MulHiInVgpr ? S1 : S32, Op0: DstHi, Op1: Zero)
1661 .getReg(Idx: 0);
1662 MRI.setRegBank(Reg: Carry, RegBank: MulHiInVgpr ? AMDGPU::VCCRegBank
1663 : AMDGPU::SGPRRegBank);
1664
1665 if (DstOnValu && !MulHiInVgpr) {
1666 Carry = B.buildTrunc(Res: S1, Op: Carry).getReg(Idx: 0);
1667 MRI.setRegBank(Reg: Carry, RegBank: AMDGPU::VCCRegBank);
1668 }
1669 }
1670
1671 if (Accumulate) {
1672 if (DstOnValu) {
1673 DstLo = B.buildCopy(Res: S32, Op: DstLo).getReg(Idx: 0);
1674 DstHi = B.buildCopy(Res: S32, Op: DstHi).getReg(Idx: 0);
1675 MRI.setRegBank(Reg: DstLo, RegBank: AMDGPU::VGPRRegBank);
1676 MRI.setRegBank(Reg: DstHi, RegBank: AMDGPU::VGPRRegBank);
1677 }
1678
1679 auto Unmerge = B.buildUnmerge(Res: S32, Op: Src2);
1680 Register Src2Lo = Unmerge.getReg(Idx: 0);
1681 Register Src2Hi = Unmerge.getReg(Idx: 1);
1682 MRI.setRegBank(Reg: Src2Lo, RegBank: DstBank);
1683 MRI.setRegBank(Reg: Src2Hi, RegBank: DstBank);
1684
1685 if (!IsUnsigned) {
1686 auto Src2Sign = B.buildICmp(Pred: CmpInst::ICMP_SLT, Res: CarryType, Op0: Src2Hi, Op1: Zero);
1687 MRI.setRegBank(Reg: Src2Sign.getReg(Idx: 0), RegBank: CarryBank);
1688
1689 Carry = B.buildXor(Dst: CarryType, Src0: Carry, Src1: Src2Sign).getReg(Idx: 0);
1690 MRI.setRegBank(Reg: Carry, RegBank: CarryBank);
1691 }
1692
1693 auto AddLo = B.buildUAddo(Res: S32, CarryOut: CarryType, Op0: DstLo, Op1: Src2Lo);
1694 DstLo = AddLo.getReg(Idx: 0);
1695 Register CarryLo = AddLo.getReg(Idx: 1);
1696 MRI.setRegBank(Reg: DstLo, RegBank: DstBank);
1697 MRI.setRegBank(Reg: CarryLo, RegBank: CarryBank);
1698
1699 auto AddHi = B.buildUAdde(Res: S32, CarryOut: CarryType, Op0: DstHi, Op1: Src2Hi, CarryIn: CarryLo);
1700 DstHi = AddHi.getReg(Idx: 0);
1701 MRI.setRegBank(Reg: DstHi, RegBank: DstBank);
1702
1703 Register CarryHi = AddHi.getReg(Idx: 1);
1704 MRI.setRegBank(Reg: CarryHi, RegBank: CarryBank);
1705
1706 if (IsUnsigned) {
1707 Carry = CarryHi;
1708 } else {
1709 Carry = B.buildXor(Dst: CarryType, Src0: Carry, Src1: CarryHi).getReg(Idx: 0);
1710 MRI.setRegBank(Reg: Carry, RegBank: CarryBank);
1711 }
1712 } else {
1713 if (IsUnsigned) {
1714 Carry = B.buildConstant(Res: CarryType, Val: 0).getReg(Idx: 0);
1715 MRI.setRegBank(Reg: Carry, RegBank: CarryBank);
1716 }
1717 }
1718
1719 B.buildMergeLikeInstr(Res: Dst0, Ops: {DstLo, DstHi});
1720
1721 if (DstOnValu) {
1722 B.buildCopy(Res: Dst1, Op: Carry);
1723 } else {
1724 B.buildTrunc(Res: Dst1, Op: Carry);
1725 }
1726
1727 MI.eraseFromParent();
1728 return true;
1729}
1730
1731// Return a suitable opcode for extending the operands of Opc when widening.
1732static unsigned getExtendOp(unsigned Opc) {
1733 switch (Opc) {
1734 case TargetOpcode::G_ASHR:
1735 case TargetOpcode::G_SMIN:
1736 case TargetOpcode::G_SMAX:
1737 return TargetOpcode::G_SEXT;
1738 case TargetOpcode::G_LSHR:
1739 case TargetOpcode::G_UMIN:
1740 case TargetOpcode::G_UMAX:
1741 return TargetOpcode::G_ZEXT;
1742 default:
1743 return TargetOpcode::G_ANYEXT;
1744 }
1745}
1746
1747// Emit a legalized extension from <2 x s16> to 2 32-bit components, avoiding
1748// any illegal vector extend or unmerge operations.
1749static std::pair<Register, Register>
1750unpackV2S16ToS32(MachineIRBuilder &B, Register Src, unsigned ExtOpcode) {
1751 const LLT S32 = LLT::scalar(SizeInBits: 32);
1752 auto Bitcast = B.buildBitcast(Dst: S32, Src);
1753
1754 if (ExtOpcode == TargetOpcode::G_SEXT) {
1755 auto ExtLo = B.buildSExtInReg(Res: S32, Op: Bitcast, ImmOp: 16);
1756 auto ShiftHi = B.buildAShr(Dst: S32, Src0: Bitcast, Src1: B.buildConstant(Res: S32, Val: 16));
1757 return std::pair(ExtLo.getReg(Idx: 0), ShiftHi.getReg(Idx: 0));
1758 }
1759
1760 auto ShiftHi = B.buildLShr(Dst: S32, Src0: Bitcast, Src1: B.buildConstant(Res: S32, Val: 16));
1761 if (ExtOpcode == TargetOpcode::G_ZEXT) {
1762 auto ExtLo = B.buildAnd(Dst: S32, Src0: Bitcast, Src1: B.buildConstant(Res: S32, Val: 0xffff));
1763 return std::pair(ExtLo.getReg(Idx: 0), ShiftHi.getReg(Idx: 0));
1764 }
1765
1766 assert(ExtOpcode == TargetOpcode::G_ANYEXT);
1767 return std::pair(Bitcast.getReg(Idx: 0), ShiftHi.getReg(Idx: 0));
1768}
1769
1770// For cases where only a single copy is inserted for matching register banks.
1771// Replace the register in the instruction operand
1772static bool substituteSimpleCopyRegs(
1773 const AMDGPURegisterBankInfo::OperandsMapper &OpdMapper, unsigned OpIdx) {
1774 SmallVector<unsigned, 1> SrcReg(OpdMapper.getVRegs(OpIdx));
1775 if (!SrcReg.empty()) {
1776 assert(SrcReg.size() == 1);
1777 OpdMapper.getMI().getOperand(i: OpIdx).setReg(SrcReg[0]);
1778 return true;
1779 }
1780
1781 return false;
1782}
1783
1784/// Handle register layout difference for f16 images for some subtargets.
1785Register AMDGPURegisterBankInfo::handleD16VData(MachineIRBuilder &B,
1786 MachineRegisterInfo &MRI,
1787 Register Reg) const {
1788 if (!Subtarget.hasUnpackedD16VMem())
1789 return Reg;
1790
1791 const LLT S16 = LLT::scalar(SizeInBits: 16);
1792 LLT StoreVT = MRI.getType(Reg);
1793 if (!StoreVT.isVector() || StoreVT.getElementType() != S16)
1794 return Reg;
1795
1796 auto Unmerge = B.buildUnmerge(Res: S16, Op: Reg);
1797
1798
1799 SmallVector<Register, 4> WideRegs;
1800 for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
1801 WideRegs.push_back(Elt: Unmerge.getReg(Idx: I));
1802
1803 const LLT S32 = LLT::scalar(SizeInBits: 32);
1804 int NumElts = StoreVT.getNumElements();
1805
1806 return B.buildMergeLikeInstr(Res: LLT::fixed_vector(NumElements: NumElts, ScalarTy: S32), Ops: WideRegs)
1807 .getReg(Idx: 0);
1808}
1809
1810static std::pair<Register, unsigned>
1811getBaseWithConstantOffset(MachineRegisterInfo &MRI, Register Reg) {
1812 int64_t Const;
1813 if (mi_match(R: Reg, MRI, P: m_ICst(Cst&: Const)))
1814 return std::pair(Register(), Const);
1815
1816 Register Base;
1817 if (mi_match(R: Reg, MRI, P: m_GAdd(L: m_Reg(R&: Base), R: m_ICst(Cst&: Const))))
1818 return std::pair(Base, Const);
1819
1820 // TODO: Handle G_OR used for add case
1821 return std::pair(Reg, 0);
1822}
1823
1824std::pair<Register, unsigned>
1825AMDGPURegisterBankInfo::splitBufferOffsets(MachineIRBuilder &B,
1826 Register OrigOffset) const {
1827 const unsigned MaxImm = SIInstrInfo::getMaxMUBUFImmOffset(ST: Subtarget);
1828 Register BaseReg;
1829 unsigned ImmOffset;
1830 const LLT S32 = LLT::scalar(SizeInBits: 32);
1831
1832 // TODO: Use AMDGPU::getBaseWithConstantOffset() instead.
1833 std::tie(args&: BaseReg, args&: ImmOffset) = getBaseWithConstantOffset(MRI&: *B.getMRI(),
1834 Reg: OrigOffset);
1835
1836 unsigned C1 = 0;
1837 if (ImmOffset != 0) {
1838 // If the immediate value is too big for the immoffset field, put only bits
1839 // that would normally fit in the immoffset field. The remaining value that
1840 // is copied/added for the voffset field is a large power of 2, and it
1841 // stands more chance of being CSEd with the copy/add for another similar
1842 // load/store.
1843 // However, do not do that rounding down if that is a negative
1844 // number, as it appears to be illegal to have a negative offset in the
1845 // vgpr, even if adding the immediate offset makes it positive.
1846 unsigned Overflow = ImmOffset & ~MaxImm;
1847 ImmOffset -= Overflow;
1848 if (static_cast<int32_t>(Overflow) < 0) {
1849 Overflow += ImmOffset;
1850 ImmOffset = 0;
1851 }
1852
1853 C1 = ImmOffset;
1854 if (Overflow != 0) {
1855 if (!BaseReg)
1856 BaseReg = B.buildConstant(Res: S32, Val: Overflow).getReg(Idx: 0);
1857 else {
1858 auto OverflowVal = B.buildConstant(Res: S32, Val: Overflow);
1859 BaseReg = B.buildAdd(Dst: S32, Src0: BaseReg, Src1: OverflowVal).getReg(Idx: 0);
1860 }
1861 }
1862 }
1863
1864 if (!BaseReg)
1865 BaseReg = B.buildConstant(Res: S32, Val: 0).getReg(Idx: 0);
1866
1867 return {BaseReg, C1};
1868}
1869
1870bool AMDGPURegisterBankInfo::buildVCopy(MachineIRBuilder &B, Register DstReg,
1871 Register SrcReg) const {
1872 MachineRegisterInfo &MRI = *B.getMRI();
1873 LLT SrcTy = MRI.getType(Reg: SrcReg);
1874 if (SrcTy.getSizeInBits() == 32) {
1875 // Use a v_mov_b32 here to make the exec dependency explicit.
1876 B.buildInstr(Opcode: AMDGPU::V_MOV_B32_e32)
1877 .addDef(RegNo: DstReg)
1878 .addUse(RegNo: SrcReg);
1879 return constrainGenericRegister(Reg: DstReg, RC: AMDGPU::VGPR_32RegClass, MRI) &&
1880 constrainGenericRegister(Reg: SrcReg, RC: AMDGPU::SReg_32RegClass, MRI);
1881 }
1882
1883 Register TmpReg0 = MRI.createVirtualRegister(RegClass: &AMDGPU::VGPR_32RegClass);
1884 Register TmpReg1 = MRI.createVirtualRegister(RegClass: &AMDGPU::VGPR_32RegClass);
1885
1886 B.buildInstr(Opcode: AMDGPU::V_MOV_B32_e32)
1887 .addDef(RegNo: TmpReg0)
1888 .addUse(RegNo: SrcReg, Flags: {}, SubReg: AMDGPU::sub0);
1889 B.buildInstr(Opcode: AMDGPU::V_MOV_B32_e32)
1890 .addDef(RegNo: TmpReg1)
1891 .addUse(RegNo: SrcReg, Flags: {}, SubReg: AMDGPU::sub1);
1892 B.buildInstr(Opcode: AMDGPU::REG_SEQUENCE)
1893 .addDef(RegNo: DstReg)
1894 .addUse(RegNo: TmpReg0)
1895 .addImm(Val: AMDGPU::sub0)
1896 .addUse(RegNo: TmpReg1)
1897 .addImm(Val: AMDGPU::sub1);
1898
1899 return constrainGenericRegister(Reg: SrcReg, RC: AMDGPU::SReg_64RegClass, MRI) &&
1900 constrainGenericRegister(Reg: DstReg, RC: AMDGPU::VReg_64RegClass, MRI);
1901}
1902
1903/// Utility function for pushing dynamic vector indexes with a constant offset
1904/// into waterfall loops.
1905static void reinsertVectorIndexAdd(MachineIRBuilder &B,
1906 MachineInstr &IdxUseInstr,
1907 unsigned OpIdx,
1908 unsigned ConstOffset) {
1909 MachineRegisterInfo &MRI = *B.getMRI();
1910 const LLT S32 = LLT::scalar(SizeInBits: 32);
1911 Register WaterfallIdx = IdxUseInstr.getOperand(i: OpIdx).getReg();
1912 B.setInsertPt(MBB&: *IdxUseInstr.getParent(), II: IdxUseInstr.getIterator());
1913
1914 auto MaterializedOffset = B.buildConstant(Res: S32, Val: ConstOffset);
1915
1916 auto Add = B.buildAdd(Dst: S32, Src0: WaterfallIdx, Src1: MaterializedOffset);
1917 MRI.setRegBank(Reg: MaterializedOffset.getReg(Idx: 0), RegBank: AMDGPU::SGPRRegBank);
1918 MRI.setRegBank(Reg: Add.getReg(Idx: 0), RegBank: AMDGPU::SGPRRegBank);
1919 IdxUseInstr.getOperand(i: OpIdx).setReg(Add.getReg(Idx: 0));
1920}
1921
1922/// Implement extending a 32-bit value to a 64-bit value. \p Lo32Reg is the
1923/// original 32-bit source value (to be inserted in the low part of the combined
1924/// 64-bit result), and \p Hi32Reg is the high half of the combined 64-bit
1925/// value.
1926static void extendLow32IntoHigh32(MachineIRBuilder &B,
1927 Register Hi32Reg, Register Lo32Reg,
1928 unsigned ExtOpc,
1929 const RegisterBank &RegBank,
1930 bool IsBooleanSrc = false) {
1931 if (ExtOpc == AMDGPU::G_ZEXT) {
1932 B.buildConstant(Res: Hi32Reg, Val: 0);
1933 } else if (ExtOpc == AMDGPU::G_SEXT) {
1934 if (IsBooleanSrc) {
1935 // If we know the original source was an s1, the high half is the same as
1936 // the low.
1937 B.buildCopy(Res: Hi32Reg, Op: Lo32Reg);
1938 } else {
1939 // Replicate sign bit from 32-bit extended part.
1940 auto ShiftAmt = B.buildConstant(Res: LLT::scalar(SizeInBits: 32), Val: 31);
1941 B.getMRI()->setRegBank(Reg: ShiftAmt.getReg(Idx: 0), RegBank);
1942 B.buildAShr(Dst: Hi32Reg, Src0: Lo32Reg, Src1: ShiftAmt);
1943 }
1944 } else {
1945 assert(ExtOpc == AMDGPU::G_ANYEXT && "not an integer extension");
1946 B.buildUndef(Res: Hi32Reg);
1947 }
1948}
1949
1950bool AMDGPURegisterBankInfo::foldExtractEltToCmpSelect(
1951 MachineIRBuilder &B, MachineInstr &MI,
1952 const OperandsMapper &OpdMapper) const {
1953 MachineRegisterInfo &MRI = *B.getMRI();
1954
1955 Register VecReg = MI.getOperand(i: 1).getReg();
1956 Register Idx = MI.getOperand(i: 2).getReg();
1957
1958 const RegisterBank &IdxBank =
1959 *OpdMapper.getInstrMapping().getOperandMapping(i: 2).BreakDown[0].RegBank;
1960
1961 bool IsDivergentIdx = IdxBank != AMDGPU::SGPRRegBank;
1962
1963 LLT VecTy = MRI.getType(Reg: VecReg);
1964 unsigned EltSize = VecTy.getScalarSizeInBits();
1965 unsigned NumElem = VecTy.getNumElements();
1966
1967 if (!SITargetLowering::shouldExpandVectorDynExt(EltSize, NumElem,
1968 IsDivergentIdx, Subtarget: &Subtarget))
1969 return false;
1970
1971 LLT S32 = LLT::scalar(SizeInBits: 32);
1972
1973 const RegisterBank &DstBank =
1974 *OpdMapper.getInstrMapping().getOperandMapping(i: 0).BreakDown[0].RegBank;
1975 const RegisterBank &SrcBank =
1976 *OpdMapper.getInstrMapping().getOperandMapping(i: 1).BreakDown[0].RegBank;
1977
1978 const RegisterBank &CCBank =
1979 (DstBank == AMDGPU::SGPRRegBank &&
1980 SrcBank == AMDGPU::SGPRRegBank &&
1981 IdxBank == AMDGPU::SGPRRegBank) ? AMDGPU::SGPRRegBank
1982 : AMDGPU::VCCRegBank;
1983 LLT CCTy = (CCBank == AMDGPU::SGPRRegBank) ? S32 : LLT::scalar(SizeInBits: 1);
1984
1985 if (CCBank == AMDGPU::VCCRegBank && IdxBank == AMDGPU::SGPRRegBank) {
1986 Idx = B.buildCopy(Res: S32, Op: Idx)->getOperand(i: 0).getReg();
1987 MRI.setRegBank(Reg: Idx, RegBank: AMDGPU::VGPRRegBank);
1988 }
1989
1990 LLT EltTy = VecTy.getScalarType();
1991 SmallVector<Register, 2> DstRegs(OpdMapper.getVRegs(OpIdx: 0));
1992 unsigned NumLanes = DstRegs.size();
1993 if (!NumLanes)
1994 NumLanes = 1;
1995 else
1996 EltTy = MRI.getType(Reg: DstRegs[0]);
1997
1998 auto UnmergeToEltTy = B.buildUnmerge(Res: EltTy, Op: VecReg);
1999 SmallVector<Register, 2> Res(NumLanes);
2000 for (unsigned L = 0; L < NumLanes; ++L)
2001 Res[L] = UnmergeToEltTy.getReg(Idx: L);
2002
2003 for (unsigned I = 1; I < NumElem; ++I) {
2004 auto IC = B.buildConstant(Res: S32, Val: I);
2005 MRI.setRegBank(Reg: IC->getOperand(i: 0).getReg(), RegBank: AMDGPU::SGPRRegBank);
2006 auto Cmp = B.buildICmp(Pred: CmpInst::ICMP_EQ, Res: CCTy, Op0: Idx, Op1: IC);
2007 MRI.setRegBank(Reg: Cmp->getOperand(i: 0).getReg(), RegBank: CCBank);
2008
2009 for (unsigned L = 0; L < NumLanes; ++L) {
2010 auto S = B.buildSelect(Res: EltTy, Tst: Cmp,
2011 Op0: UnmergeToEltTy.getReg(Idx: I * NumLanes + L), Op1: Res[L]);
2012
2013 for (unsigned N : { 0, 2, 3 })
2014 MRI.setRegBank(Reg: S->getOperand(i: N).getReg(), RegBank: DstBank);
2015
2016 Res[L] = S->getOperand(i: 0).getReg();
2017 }
2018 }
2019
2020 for (unsigned L = 0; L < NumLanes; ++L) {
2021 Register DstReg = (NumLanes == 1) ? MI.getOperand(i: 0).getReg() : DstRegs[L];
2022 B.buildCopy(Res: DstReg, Op: Res[L]);
2023 MRI.setRegBank(Reg: DstReg, RegBank: DstBank);
2024 }
2025
2026 MRI.setRegBank(Reg: MI.getOperand(i: 0).getReg(), RegBank: DstBank);
2027 MI.eraseFromParent();
2028
2029 return true;
2030}
2031
2032// Insert a cross regbank copy for a register if it already has a bank that
2033// differs from the one we want to set.
2034static Register constrainRegToBank(MachineRegisterInfo &MRI,
2035 MachineIRBuilder &B, Register &Reg,
2036 const RegisterBank &Bank) {
2037 const RegisterBank *CurrBank = MRI.getRegBankOrNull(Reg);
2038 if (CurrBank && *CurrBank != Bank) {
2039 Register Copy = B.buildCopy(Res: MRI.getType(Reg), Op: Reg).getReg(Idx: 0);
2040 MRI.setRegBank(Reg: Copy, RegBank: Bank);
2041 return Copy;
2042 }
2043
2044 MRI.setRegBank(Reg, RegBank: Bank);
2045 return Reg;
2046}
2047
2048bool AMDGPURegisterBankInfo::foldInsertEltToCmpSelect(
2049 MachineIRBuilder &B, MachineInstr &MI,
2050 const OperandsMapper &OpdMapper) const {
2051
2052 MachineRegisterInfo &MRI = *B.getMRI();
2053 Register VecReg = MI.getOperand(i: 1).getReg();
2054 Register Idx = MI.getOperand(i: 3).getReg();
2055
2056 const RegisterBank &IdxBank =
2057 *OpdMapper.getInstrMapping().getOperandMapping(i: 3).BreakDown[0].RegBank;
2058
2059 bool IsDivergentIdx = IdxBank != AMDGPU::SGPRRegBank;
2060
2061 LLT VecTy = MRI.getType(Reg: VecReg);
2062 unsigned EltSize = VecTy.getScalarSizeInBits();
2063 unsigned NumElem = VecTy.getNumElements();
2064
2065 if (!SITargetLowering::shouldExpandVectorDynExt(EltSize, NumElem,
2066 IsDivergentIdx, Subtarget: &Subtarget))
2067 return false;
2068
2069 LLT S32 = LLT::scalar(SizeInBits: 32);
2070
2071 const RegisterBank &DstBank =
2072 *OpdMapper.getInstrMapping().getOperandMapping(i: 0).BreakDown[0].RegBank;
2073 const RegisterBank &SrcBank =
2074 *OpdMapper.getInstrMapping().getOperandMapping(i: 1).BreakDown[0].RegBank;
2075 const RegisterBank &InsBank =
2076 *OpdMapper.getInstrMapping().getOperandMapping(i: 2).BreakDown[0].RegBank;
2077
2078 const RegisterBank &CCBank =
2079 (DstBank == AMDGPU::SGPRRegBank &&
2080 SrcBank == AMDGPU::SGPRRegBank &&
2081 InsBank == AMDGPU::SGPRRegBank &&
2082 IdxBank == AMDGPU::SGPRRegBank) ? AMDGPU::SGPRRegBank
2083 : AMDGPU::VCCRegBank;
2084 LLT CCTy = (CCBank == AMDGPU::SGPRRegBank) ? S32 : LLT::scalar(SizeInBits: 1);
2085
2086 if (CCBank == AMDGPU::VCCRegBank && IdxBank == AMDGPU::SGPRRegBank) {
2087 Idx = B.buildCopy(Res: S32, Op: Idx)->getOperand(i: 0).getReg();
2088 MRI.setRegBank(Reg: Idx, RegBank: AMDGPU::VGPRRegBank);
2089 }
2090
2091 LLT EltTy = VecTy.getScalarType();
2092 SmallVector<Register, 2> InsRegs(OpdMapper.getVRegs(OpIdx: 2));
2093 unsigned NumLanes = InsRegs.size();
2094 if (!NumLanes) {
2095 NumLanes = 1;
2096 InsRegs.push_back(Elt: MI.getOperand(i: 2).getReg());
2097 } else {
2098 EltTy = MRI.getType(Reg: InsRegs[0]);
2099 }
2100
2101 auto UnmergeToEltTy = B.buildUnmerge(Res: EltTy, Op: VecReg);
2102 SmallVector<Register, 16> Ops(NumElem * NumLanes);
2103
2104 for (unsigned I = 0; I < NumElem; ++I) {
2105 auto IC = B.buildConstant(Res: S32, Val: I);
2106 MRI.setRegBank(Reg: IC->getOperand(i: 0).getReg(), RegBank: AMDGPU::SGPRRegBank);
2107 auto Cmp = B.buildICmp(Pred: CmpInst::ICMP_EQ, Res: CCTy, Op0: Idx, Op1: IC);
2108 MRI.setRegBank(Reg: Cmp->getOperand(i: 0).getReg(), RegBank: CCBank);
2109
2110 for (unsigned L = 0; L < NumLanes; ++L) {
2111 Register Op0 = constrainRegToBank(MRI, B, Reg&: InsRegs[L], Bank: DstBank);
2112 Register Op1 = UnmergeToEltTy.getReg(Idx: I * NumLanes + L);
2113 Op1 = constrainRegToBank(MRI, B, Reg&: Op1, Bank: DstBank);
2114
2115 Register Select = B.buildSelect(Res: EltTy, Tst: Cmp, Op0, Op1).getReg(Idx: 0);
2116 MRI.setRegBank(Reg: Select, RegBank: DstBank);
2117
2118 Ops[I * NumLanes + L] = Select;
2119 }
2120 }
2121
2122 LLT MergeTy = LLT::fixed_vector(NumElements: Ops.size(), ScalarTy: EltTy);
2123 if (MergeTy == MRI.getType(Reg: MI.getOperand(i: 0).getReg())) {
2124 B.buildBuildVector(Res: MI.getOperand(i: 0), Ops);
2125 } else {
2126 auto Vec = B.buildBuildVector(Res: MergeTy, Ops);
2127 MRI.setRegBank(Reg: Vec->getOperand(i: 0).getReg(), RegBank: DstBank);
2128 B.buildBitcast(Dst: MI.getOperand(i: 0).getReg(), Src: Vec);
2129 }
2130
2131 MRI.setRegBank(Reg: MI.getOperand(i: 0).getReg(), RegBank: DstBank);
2132 MI.eraseFromParent();
2133
2134 return true;
2135}
2136
2137// Break s_mul_u64 into 32-bit vector operations.
2138void AMDGPURegisterBankInfo::applyMappingSMULU64(
2139 MachineIRBuilder &B, const OperandsMapper &OpdMapper) const {
2140 SmallVector<Register, 2> DefRegs(OpdMapper.getVRegs(OpIdx: 0));
2141 SmallVector<Register, 2> Src0Regs(OpdMapper.getVRegs(OpIdx: 1));
2142 SmallVector<Register, 2> Src1Regs(OpdMapper.getVRegs(OpIdx: 2));
2143
2144 // All inputs are SGPRs, nothing special to do.
2145 if (DefRegs.empty()) {
2146 assert(Src0Regs.empty() && Src1Regs.empty());
2147 applyDefaultMapping(OpdMapper);
2148 return;
2149 }
2150
2151 assert(DefRegs.size() == 2);
2152 assert(Src0Regs.size() == Src1Regs.size() &&
2153 (Src0Regs.empty() || Src0Regs.size() == 2));
2154
2155 MachineRegisterInfo &MRI = OpdMapper.getMRI();
2156 MachineInstr &MI = OpdMapper.getMI();
2157 Register DstReg = MI.getOperand(i: 0).getReg();
2158 LLT HalfTy = LLT::scalar(SizeInBits: 32);
2159
2160 // Depending on where the source registers came from, the generic code may
2161 // have decided to split the inputs already or not. If not, we still need to
2162 // extract the values.
2163
2164 if (Src0Regs.empty())
2165 split64BitValueForMapping(B, Regs&: Src0Regs, HalfTy, Reg: MI.getOperand(i: 1).getReg());
2166 else
2167 setRegsToType(MRI, Regs: Src0Regs, NewTy: HalfTy);
2168
2169 if (Src1Regs.empty())
2170 split64BitValueForMapping(B, Regs&: Src1Regs, HalfTy, Reg: MI.getOperand(i: 2).getReg());
2171 else
2172 setRegsToType(MRI, Regs: Src1Regs, NewTy: HalfTy);
2173
2174 setRegsToType(MRI, Regs: DefRegs, NewTy: HalfTy);
2175
2176 // The multiplication is done as follows:
2177 //
2178 // Op1H Op1L
2179 // * Op0H Op0L
2180 // --------------------
2181 // Op1H*Op0L Op1L*Op0L
2182 // + Op1H*Op0H Op1L*Op0H
2183 // -----------------------------------------
2184 // (Op1H*Op0L + Op1L*Op0H + carry) Op1L*Op0L
2185 //
2186 // We drop Op1H*Op0H because the result of the multiplication is a 64-bit
2187 // value and that would overflow.
2188 // The low 32-bit value is Op1L*Op0L.
2189 // The high 32-bit value is Op1H*Op0L + Op1L*Op0H + carry (from
2190 // Op1L*Op0L).
2191
2192 ApplyRegBankMapping ApplyBank(B, *this, MRI, &AMDGPU::VGPRRegBank);
2193
2194 Register Hi = B.buildUMulH(Dst: HalfTy, Src0: Src0Regs[0], Src1: Src1Regs[0]).getReg(Idx: 0);
2195 Register MulLoHi = B.buildMul(Dst: HalfTy, Src0: Src0Regs[0], Src1: Src1Regs[1]).getReg(Idx: 0);
2196 Register Add = B.buildAdd(Dst: HalfTy, Src0: Hi, Src1: MulLoHi).getReg(Idx: 0);
2197 Register MulHiLo = B.buildMul(Dst: HalfTy, Src0: Src0Regs[1], Src1: Src1Regs[0]).getReg(Idx: 0);
2198 B.buildAdd(Dst: DefRegs[1], Src0: Add, Src1: MulHiLo);
2199 B.buildMul(Dst: DefRegs[0], Src0: Src0Regs[0], Src1: Src1Regs[0]);
2200
2201 MRI.setRegBank(Reg: DstReg, RegBank: AMDGPU::VGPRRegBank);
2202 MI.eraseFromParent();
2203}
2204
2205void AMDGPURegisterBankInfo::applyMappingImpl(
2206 MachineIRBuilder &B, const OperandsMapper &OpdMapper) const {
2207 MachineInstr &MI = OpdMapper.getMI();
2208 B.setInstrAndDebugLoc(MI);
2209 unsigned Opc = MI.getOpcode();
2210 MachineRegisterInfo &MRI = OpdMapper.getMRI();
2211 switch (Opc) {
2212 case AMDGPU::G_CONSTANT:
2213 case AMDGPU::G_IMPLICIT_DEF: {
2214 Register DstReg = MI.getOperand(i: 0).getReg();
2215 LLT DstTy = MRI.getType(Reg: DstReg);
2216 if (DstTy != LLT::scalar(SizeInBits: 1))
2217 break;
2218
2219 const RegisterBank *DstBank =
2220 OpdMapper.getInstrMapping().getOperandMapping(i: 0).BreakDown[0].RegBank;
2221 if (DstBank == &AMDGPU::VCCRegBank)
2222 break;
2223 SmallVector<Register, 1> DefRegs(OpdMapper.getVRegs(OpIdx: 0));
2224 if (DefRegs.empty())
2225 DefRegs.push_back(Elt: DstReg);
2226
2227 B.setInsertPt(MBB&: *MI.getParent(), II: ++MI.getIterator());
2228
2229 Register NewDstReg = MRI.createGenericVirtualRegister(Ty: LLT::scalar(SizeInBits: 32));
2230 LLVMContext &Ctx = B.getMF().getFunction().getContext();
2231
2232 MI.getOperand(i: 0).setReg(NewDstReg);
2233 if (Opc != AMDGPU::G_IMPLICIT_DEF) {
2234 uint64_t ConstVal = MI.getOperand(i: 1).getCImm()->getZExtValue();
2235 MI.getOperand(i: 1).setCImm(
2236 ConstantInt::get(Ty: IntegerType::getInt32Ty(C&: Ctx), V: ConstVal));
2237 }
2238
2239 MRI.setRegBank(Reg: NewDstReg, RegBank: *DstBank);
2240 B.buildTrunc(Res: DefRegs[0], Op: NewDstReg);
2241 return;
2242 }
2243 case AMDGPU::G_PHI: {
2244 Register DstReg = MI.getOperand(i: 0).getReg();
2245 LLT DstTy = MRI.getType(Reg: DstReg);
2246 if (DstTy != LLT::scalar(SizeInBits: 1))
2247 break;
2248
2249 const LLT S32 = LLT::scalar(SizeInBits: 32);
2250 const RegisterBank *DstBank =
2251 OpdMapper.getInstrMapping().getOperandMapping(i: 0).BreakDown[0].RegBank;
2252 if (DstBank == &AMDGPU::VCCRegBank) {
2253 applyDefaultMapping(OpdMapper);
2254 // The standard handling only considers the result register bank for
2255 // phis. For VCC, blindly inserting a copy when the phi is lowered will
2256 // produce an invalid copy. We can only copy with some kind of compare to
2257 // get a vector boolean result. Insert a register bank copy that will be
2258 // correctly lowered to a compare.
2259 for (unsigned I = 1, E = MI.getNumOperands(); I != E; I += 2) {
2260 Register SrcReg = MI.getOperand(i: I).getReg();
2261 const RegisterBank *SrcBank = getRegBank(Reg: SrcReg, MRI, TRI: *TRI);
2262
2263 if (SrcBank != &AMDGPU::VCCRegBank) {
2264 MachineBasicBlock *SrcMBB = MI.getOperand(i: I + 1).getMBB();
2265 B.setInsertPt(MBB&: *SrcMBB, II: SrcMBB->getFirstTerminator());
2266
2267 auto Copy = B.buildCopy(Res: LLT::scalar(SizeInBits: 1), Op: SrcReg);
2268 MRI.setRegBank(Reg: Copy.getReg(Idx: 0), RegBank: AMDGPU::VCCRegBank);
2269 MI.getOperand(i: I).setReg(Copy.getReg(Idx: 0));
2270 }
2271 }
2272
2273 return;
2274 }
2275
2276 // Phi handling is strange and only considers the bank of the destination.
2277 substituteSimpleCopyRegs(OpdMapper, OpIdx: 0);
2278
2279 // Promote SGPR/VGPR booleans to s32
2280 ApplyRegBankMapping ApplyBank(B, *this, MRI, DstBank);
2281 B.setInsertPt(MBB&: B.getMBB(), II: MI);
2282 LegalizerHelper Helper(B.getMF(), ApplyBank, B);
2283
2284 if (Helper.widenScalar(MI, TypeIdx: 0, WideTy: S32) != LegalizerHelper::Legalized)
2285 llvm_unreachable("widen scalar should have succeeded");
2286
2287 return;
2288 }
2289 case AMDGPU::G_FCMP:
2290 if (!Subtarget.hasSALUFloatInsts())
2291 break;
2292 [[fallthrough]];
2293 case AMDGPU::G_ICMP:
2294 case AMDGPU::G_UADDO:
2295 case AMDGPU::G_USUBO:
2296 case AMDGPU::G_UADDE:
2297 case AMDGPU::G_SADDE:
2298 case AMDGPU::G_USUBE:
2299 case AMDGPU::G_SSUBE: {
2300 unsigned BoolDstOp =
2301 (Opc == AMDGPU::G_ICMP || Opc == AMDGPU::G_FCMP) ? 0 : 1;
2302 Register DstReg = MI.getOperand(i: BoolDstOp).getReg();
2303
2304 const RegisterBank *DstBank =
2305 OpdMapper.getInstrMapping().getOperandMapping(i: 0).BreakDown[0].RegBank;
2306 if (DstBank != &AMDGPU::SGPRRegBank)
2307 break;
2308
2309 const bool HasCarryIn = MI.getNumOperands() == 5;
2310
2311 // If this is a scalar compare, promote the result to s32, as the selection
2312 // will end up using a copy to a 32-bit vreg.
2313 const LLT S32 = LLT::scalar(SizeInBits: 32);
2314 Register NewDstReg = MRI.createGenericVirtualRegister(Ty: S32);
2315 MRI.setRegBank(Reg: NewDstReg, RegBank: AMDGPU::SGPRRegBank);
2316 MI.getOperand(i: BoolDstOp).setReg(NewDstReg);
2317
2318 if (HasCarryIn) {
2319 Register NewSrcReg = MRI.createGenericVirtualRegister(Ty: S32);
2320 MRI.setRegBank(Reg: NewSrcReg, RegBank: AMDGPU::SGPRRegBank);
2321 B.buildZExt(Res: NewSrcReg, Op: MI.getOperand(i: 4).getReg());
2322 MI.getOperand(i: 4).setReg(NewSrcReg);
2323 }
2324
2325 MachineBasicBlock *MBB = MI.getParent();
2326 B.setInsertPt(MBB&: *MBB, II: std::next(x: MI.getIterator()));
2327
2328 // If we had a constrained VCC result register, a copy was inserted to VCC
2329 // from SGPR.
2330 SmallVector<Register, 1> DefRegs(OpdMapper.getVRegs(OpIdx: 0));
2331 if (DefRegs.empty())
2332 DefRegs.push_back(Elt: DstReg);
2333 B.buildTrunc(Res: DefRegs[0], Op: NewDstReg);
2334 return;
2335 }
2336 case AMDGPU::G_SELECT: {
2337 Register DstReg = MI.getOperand(i: 0).getReg();
2338 LLT DstTy = MRI.getType(Reg: DstReg);
2339
2340 SmallVector<Register, 1> CondRegs(OpdMapper.getVRegs(OpIdx: 1));
2341 if (CondRegs.empty())
2342 CondRegs.push_back(Elt: MI.getOperand(i: 1).getReg());
2343 else {
2344 assert(CondRegs.size() == 1);
2345 }
2346
2347 const RegisterBank *CondBank = getRegBank(Reg: CondRegs[0], MRI, TRI: *TRI);
2348 if (CondBank == &AMDGPU::SGPRRegBank) {
2349 const LLT S32 = LLT::scalar(SizeInBits: 32);
2350 Register NewCondReg = MRI.createGenericVirtualRegister(Ty: S32);
2351 MRI.setRegBank(Reg: NewCondReg, RegBank: AMDGPU::SGPRRegBank);
2352
2353 MI.getOperand(i: 1).setReg(NewCondReg);
2354 B.buildZExt(Res: NewCondReg, Op: CondRegs[0]);
2355 }
2356
2357 if (DstTy.getSizeInBits() != 64)
2358 break;
2359
2360 LLT HalfTy = getHalfSizedType(Ty: DstTy);
2361
2362 SmallVector<Register, 2> DefRegs(OpdMapper.getVRegs(OpIdx: 0));
2363 SmallVector<Register, 2> Src1Regs(OpdMapper.getVRegs(OpIdx: 2));
2364 SmallVector<Register, 2> Src2Regs(OpdMapper.getVRegs(OpIdx: 3));
2365
2366 // All inputs are SGPRs, nothing special to do.
2367 if (DefRegs.empty()) {
2368 assert(Src1Regs.empty() && Src2Regs.empty());
2369 break;
2370 }
2371
2372 if (Src1Regs.empty())
2373 split64BitValueForMapping(B, Regs&: Src1Regs, HalfTy, Reg: MI.getOperand(i: 2).getReg());
2374 else {
2375 setRegsToType(MRI, Regs: Src1Regs, NewTy: HalfTy);
2376 }
2377
2378 if (Src2Regs.empty())
2379 split64BitValueForMapping(B, Regs&: Src2Regs, HalfTy, Reg: MI.getOperand(i: 3).getReg());
2380 else
2381 setRegsToType(MRI, Regs: Src2Regs, NewTy: HalfTy);
2382
2383 setRegsToType(MRI, Regs: DefRegs, NewTy: HalfTy);
2384
2385 auto Flags = MI.getFlags();
2386 B.buildSelect(Res: DefRegs[0], Tst: CondRegs[0], Op0: Src1Regs[0], Op1: Src2Regs[0], Flags);
2387 B.buildSelect(Res: DefRegs[1], Tst: CondRegs[0], Op0: Src1Regs[1], Op1: Src2Regs[1], Flags);
2388
2389 MRI.setRegBank(Reg: DstReg, RegBank: AMDGPU::VGPRRegBank);
2390 MI.eraseFromParent();
2391 return;
2392 }
2393 case AMDGPU::G_BRCOND: {
2394 Register CondReg = MI.getOperand(i: 0).getReg();
2395 // FIXME: Should use legalizer helper, but should change bool ext type.
2396 const RegisterBank *CondBank =
2397 OpdMapper.getInstrMapping().getOperandMapping(i: 0).BreakDown[0].RegBank;
2398
2399 if (CondBank == &AMDGPU::SGPRRegBank) {
2400 const LLT S32 = LLT::scalar(SizeInBits: 32);
2401 Register NewCondReg = MRI.createGenericVirtualRegister(Ty: S32);
2402 MRI.setRegBank(Reg: NewCondReg, RegBank: AMDGPU::SGPRRegBank);
2403
2404 MI.getOperand(i: 0).setReg(NewCondReg);
2405 B.buildZExt(Res: NewCondReg, Op: CondReg);
2406 return;
2407 }
2408
2409 break;
2410 }
2411 case AMDGPU::G_AND:
2412 case AMDGPU::G_OR:
2413 case AMDGPU::G_XOR: {
2414 // 64-bit and is only available on the SALU, so split into 2 32-bit ops if
2415 // there is a VGPR input.
2416 Register DstReg = MI.getOperand(i: 0).getReg();
2417 LLT DstTy = MRI.getType(Reg: DstReg);
2418
2419 const RegisterBank *DstBank =
2420 OpdMapper.getInstrMapping().getOperandMapping(i: 0).BreakDown[0].RegBank;
2421
2422 if (DstTy.getSizeInBits() == 1) {
2423 if (DstBank == &AMDGPU::VCCRegBank)
2424 break;
2425
2426 MachineFunction *MF = MI.getMF();
2427 ApplyRegBankMapping ApplyBank(B, *this, MRI, DstBank);
2428 LegalizerHelper Helper(*MF, ApplyBank, B);
2429
2430 if (Helper.widenScalar(MI, TypeIdx: 0, WideTy: LLT::scalar(SizeInBits: 32)) !=
2431 LegalizerHelper::Legalized)
2432 llvm_unreachable("widen scalar should have succeeded");
2433 return;
2434 }
2435
2436 if (DstTy.getSizeInBits() == 16 && DstBank == &AMDGPU::SGPRRegBank) {
2437 const LLT S32 = LLT::scalar(SizeInBits: 32);
2438 MachineBasicBlock *MBB = MI.getParent();
2439 MachineFunction *MF = MBB->getParent();
2440 ApplyRegBankMapping ApplySALU(B, *this, MRI, &AMDGPU::SGPRRegBank);
2441 LegalizerHelper Helper(*MF, ApplySALU, B);
2442 // Widen to S32, but handle `G_XOR x, -1` differently. Legalizer widening
2443 // will use a G_ANYEXT to extend the -1 which prevents matching G_XOR -1
2444 // as "not".
2445 if (MI.getOpcode() == AMDGPU::G_XOR &&
2446 mi_match(R: MI.getOperand(i: 2).getReg(), MRI, P: m_SpecificICstOrSplat(RequestedValue: -1))) {
2447 Helper.widenScalarSrc(MI, WideTy: S32, OpIdx: 1, ExtOpcode: AMDGPU::G_ANYEXT);
2448 Helper.widenScalarSrc(MI, WideTy: S32, OpIdx: 2, ExtOpcode: AMDGPU::G_SEXT);
2449 Helper.widenScalarDst(MI, WideTy: S32);
2450 } else {
2451 if (Helper.widenScalar(MI, TypeIdx: 0, WideTy: S32) != LegalizerHelper::Legalized)
2452 llvm_unreachable("widen scalar should have succeeded");
2453 }
2454 return;
2455 }
2456
2457 if (DstTy.getSizeInBits() != 64)
2458 break;
2459
2460 LLT HalfTy = getHalfSizedType(Ty: DstTy);
2461 SmallVector<Register, 2> DefRegs(OpdMapper.getVRegs(OpIdx: 0));
2462 SmallVector<Register, 2> Src0Regs(OpdMapper.getVRegs(OpIdx: 1));
2463 SmallVector<Register, 2> Src1Regs(OpdMapper.getVRegs(OpIdx: 2));
2464
2465 // All inputs are SGPRs, nothing special to do.
2466 if (DefRegs.empty()) {
2467 assert(Src0Regs.empty() && Src1Regs.empty());
2468 break;
2469 }
2470
2471 assert(DefRegs.size() == 2);
2472 assert(Src0Regs.size() == Src1Regs.size() &&
2473 (Src0Regs.empty() || Src0Regs.size() == 2));
2474
2475 // Depending on where the source registers came from, the generic code may
2476 // have decided to split the inputs already or not. If not, we still need to
2477 // extract the values.
2478
2479 if (Src0Regs.empty())
2480 split64BitValueForMapping(B, Regs&: Src0Regs, HalfTy, Reg: MI.getOperand(i: 1).getReg());
2481 else
2482 setRegsToType(MRI, Regs: Src0Regs, NewTy: HalfTy);
2483
2484 if (Src1Regs.empty())
2485 split64BitValueForMapping(B, Regs&: Src1Regs, HalfTy, Reg: MI.getOperand(i: 2).getReg());
2486 else
2487 setRegsToType(MRI, Regs: Src1Regs, NewTy: HalfTy);
2488
2489 setRegsToType(MRI, Regs: DefRegs, NewTy: HalfTy);
2490
2491 auto Flags = MI.getFlags();
2492 B.buildInstr(Opc, DstOps: {DefRegs[0]}, SrcOps: {Src0Regs[0], Src1Regs[0]}, Flags);
2493 B.buildInstr(Opc, DstOps: {DefRegs[1]}, SrcOps: {Src0Regs[1], Src1Regs[1]}, Flags);
2494
2495 MRI.setRegBank(Reg: DstReg, RegBank: AMDGPU::VGPRRegBank);
2496 MI.eraseFromParent();
2497 return;
2498 }
2499 case AMDGPU::G_ABS: {
2500 Register SrcReg = MI.getOperand(i: 1).getReg();
2501 const RegisterBank *SrcBank = MRI.getRegBankOrNull(Reg: SrcReg);
2502
2503 // There is no VALU abs instruction so we need to replace it with a sub and
2504 // max combination.
2505 if (SrcBank && SrcBank == &AMDGPU::VGPRRegBank) {
2506 MachineFunction *MF = MI.getMF();
2507 ApplyRegBankMapping Apply(B, *this, MRI, &AMDGPU::VGPRRegBank);
2508 LegalizerHelper Helper(*MF, Apply, B);
2509
2510 if (Helper.lowerAbsToMaxNeg(MI) != LegalizerHelper::Legalized)
2511 llvm_unreachable("lowerAbsToMaxNeg should have succeeded");
2512 return;
2513 }
2514 [[fallthrough]];
2515 }
2516 case AMDGPU::G_ADD:
2517 case AMDGPU::G_SUB:
2518 case AMDGPU::G_MUL:
2519 case AMDGPU::G_SHL:
2520 case AMDGPU::G_LSHR:
2521 case AMDGPU::G_ASHR:
2522 case AMDGPU::G_SMIN:
2523 case AMDGPU::G_SMAX:
2524 case AMDGPU::G_UMIN:
2525 case AMDGPU::G_UMAX: {
2526 Register DstReg = MI.getOperand(i: 0).getReg();
2527 LLT DstTy = MRI.getType(Reg: DstReg);
2528
2529 // Special case for s_mul_u64. There is not a vector equivalent of
2530 // s_mul_u64. Hence, we have to break down s_mul_u64 into 32-bit vector
2531 // multiplications.
2532 if (!Subtarget.hasVMulU64Inst() && Opc == AMDGPU::G_MUL &&
2533 DstTy.getSizeInBits() == 64) {
2534 applyMappingSMULU64(B, OpdMapper);
2535 return;
2536 }
2537
2538 // 16-bit operations are VALU only, but can be promoted to 32-bit SALU.
2539 // Packed 16-bit operations need to be scalarized and promoted.
2540 if (DstTy != LLT::scalar(SizeInBits: 16) && DstTy != LLT::fixed_vector(NumElements: 2, ScalarSizeInBits: 16))
2541 break;
2542
2543 const RegisterBank *DstBank =
2544 OpdMapper.getInstrMapping().getOperandMapping(i: 0).BreakDown[0].RegBank;
2545 if (DstBank == &AMDGPU::VGPRRegBank)
2546 break;
2547
2548 const LLT S32 = LLT::scalar(SizeInBits: 32);
2549 MachineBasicBlock *MBB = MI.getParent();
2550 MachineFunction *MF = MBB->getParent();
2551 ApplyRegBankMapping ApplySALU(B, *this, MRI, &AMDGPU::SGPRRegBank);
2552
2553 if (DstTy.isVector() && Opc == AMDGPU::G_ABS) {
2554 Register WideSrcLo, WideSrcHi;
2555
2556 std::tie(args&: WideSrcLo, args&: WideSrcHi) =
2557 unpackV2S16ToS32(B, Src: MI.getOperand(i: 1).getReg(), ExtOpcode: TargetOpcode::G_SEXT);
2558 auto Lo = B.buildInstr(Opc: AMDGPU::G_ABS, DstOps: {S32}, SrcOps: {WideSrcLo});
2559 auto Hi = B.buildInstr(Opc: AMDGPU::G_ABS, DstOps: {S32}, SrcOps: {WideSrcHi});
2560 B.buildBuildVectorTrunc(Res: DstReg, Ops: {Lo.getReg(Idx: 0), Hi.getReg(Idx: 0)});
2561 MI.eraseFromParent();
2562 return;
2563 }
2564
2565 if (DstTy.isVector()) {
2566 Register WideSrc0Lo, WideSrc0Hi;
2567 Register WideSrc1Lo, WideSrc1Hi;
2568
2569 unsigned ExtendOp = getExtendOp(Opc: MI.getOpcode());
2570 std::tie(args&: WideSrc0Lo, args&: WideSrc0Hi)
2571 = unpackV2S16ToS32(B, Src: MI.getOperand(i: 1).getReg(), ExtOpcode: ExtendOp);
2572 std::tie(args&: WideSrc1Lo, args&: WideSrc1Hi)
2573 = unpackV2S16ToS32(B, Src: MI.getOperand(i: 2).getReg(), ExtOpcode: ExtendOp);
2574 auto Lo = B.buildInstr(Opc: MI.getOpcode(), DstOps: {S32}, SrcOps: {WideSrc0Lo, WideSrc1Lo});
2575 auto Hi = B.buildInstr(Opc: MI.getOpcode(), DstOps: {S32}, SrcOps: {WideSrc0Hi, WideSrc1Hi});
2576 B.buildBuildVectorTrunc(Res: DstReg, Ops: {Lo.getReg(Idx: 0), Hi.getReg(Idx: 0)});
2577 MI.eraseFromParent();
2578 } else {
2579 LegalizerHelper Helper(*MF, ApplySALU, B);
2580
2581 if (Helper.widenScalar(MI, TypeIdx: 0, WideTy: S32) != LegalizerHelper::Legalized)
2582 llvm_unreachable("widen scalar should have succeeded");
2583
2584 // FIXME: s16 shift amounts should be legal.
2585 if (Opc == AMDGPU::G_SHL || Opc == AMDGPU::G_LSHR ||
2586 Opc == AMDGPU::G_ASHR) {
2587 B.setInsertPt(MBB&: *MBB, II: MI.getIterator());
2588 if (Helper.widenScalar(MI, TypeIdx: 1, WideTy: S32) != LegalizerHelper::Legalized)
2589 llvm_unreachable("widen scalar should have succeeded");
2590 }
2591 }
2592
2593 return;
2594 }
2595 case AMDGPU::G_AMDGPU_S_MUL_I64_I32:
2596 case AMDGPU::G_AMDGPU_S_MUL_U64_U32: {
2597 // This is a special case for s_mul_u64. We use
2598 // G_AMDGPU_S_MUL_I64_I32 opcode to represent an s_mul_u64 operation
2599 // where the 33 higher bits are sign-extended and
2600 // G_AMDGPU_S_MUL_U64_U32 opcode to represent an s_mul_u64 operation
2601 // where the 32 higher bits are zero-extended. In case scalar registers are
2602 // selected, both opcodes are lowered as s_mul_u64. If the vector registers
2603 // are selected, then G_AMDGPU_S_MUL_I64_I32 and
2604 // G_AMDGPU_S_MUL_U64_U32 are lowered with a vector mad instruction.
2605
2606 // Insert basic copies.
2607 applyDefaultMapping(OpdMapper);
2608
2609 Register DstReg = MI.getOperand(i: 0).getReg();
2610 Register SrcReg0 = MI.getOperand(i: 1).getReg();
2611 Register SrcReg1 = MI.getOperand(i: 2).getReg();
2612 const LLT S32 = LLT::scalar(SizeInBits: 32);
2613 const LLT S64 = LLT::scalar(SizeInBits: 64);
2614 assert(MRI.getType(DstReg) == S64 && "This is a special case for s_mul_u64 "
2615 "that handles only 64-bit operands.");
2616 const RegisterBank *DstBank =
2617 OpdMapper.getInstrMapping().getOperandMapping(i: 0).BreakDown[0].RegBank;
2618
2619 // Replace G_AMDGPU_S_MUL_I64_I32 and G_AMDGPU_S_MUL_U64_U32
2620 // with s_mul_u64 operation.
2621 if (DstBank == &AMDGPU::SGPRRegBank) {
2622 MI.setDesc(TII->get(Opcode: AMDGPU::S_MUL_U64));
2623 MRI.setRegClass(Reg: DstReg, RC: &AMDGPU::SGPR_64RegClass);
2624 MRI.setRegClass(Reg: SrcReg0, RC: &AMDGPU::SGPR_64RegClass);
2625 MRI.setRegClass(Reg: SrcReg1, RC: &AMDGPU::SGPR_64RegClass);
2626 return;
2627 }
2628
2629 // Replace G_AMDGPU_S_MUL_I64_I32 and G_AMDGPU_S_MUL_U64_U32
2630 // with a vector mad.
2631 assert(MRI.getRegBankOrNull(DstReg) == &AMDGPU::VGPRRegBank &&
2632 "The destination operand should be in vector registers.");
2633
2634 // Extract the lower subregister from the first operand.
2635 Register Op0L = MRI.createVirtualRegister(RegClass: &AMDGPU::VGPR_32RegClass);
2636 MRI.setRegClass(Reg: Op0L, RC: &AMDGPU::VGPR_32RegClass);
2637 MRI.setType(VReg: Op0L, Ty: S32);
2638 B.buildTrunc(Res: Op0L, Op: SrcReg0);
2639
2640 // Extract the lower subregister from the second operand.
2641 Register Op1L = MRI.createVirtualRegister(RegClass: &AMDGPU::VGPR_32RegClass);
2642 MRI.setRegClass(Reg: Op1L, RC: &AMDGPU::VGPR_32RegClass);
2643 MRI.setType(VReg: Op1L, Ty: S32);
2644 B.buildTrunc(Res: Op1L, Op: SrcReg1);
2645
2646 unsigned NewOpc = Opc == AMDGPU::G_AMDGPU_S_MUL_U64_U32
2647 ? AMDGPU::G_AMDGPU_MAD_U64_U32
2648 : AMDGPU::G_AMDGPU_MAD_I64_I32;
2649
2650 MachineIRBuilder B(MI);
2651 Register Zero64 = B.buildConstant(Res: S64, Val: 0).getReg(Idx: 0);
2652 MRI.setRegClass(Reg: Zero64, RC: &AMDGPU::VReg_64RegClass);
2653 Register CarryOut = MRI.createVirtualRegister(RegClass: &AMDGPU::VReg_64RegClass);
2654 MRI.setRegClass(Reg: CarryOut, RC: &AMDGPU::VReg_64RegClass);
2655 B.buildInstr(Opc: NewOpc, DstOps: {DstReg, CarryOut}, SrcOps: {Op0L, Op1L, Zero64});
2656 MI.eraseFromParent();
2657 return;
2658 }
2659 case AMDGPU::G_SEXT_INREG: {
2660 SmallVector<Register, 2> SrcRegs(OpdMapper.getVRegs(OpIdx: 1));
2661 if (SrcRegs.empty())
2662 break; // Nothing to repair
2663
2664 const LLT S32 = LLT::scalar(SizeInBits: 32);
2665 ApplyRegBankMapping O(B, *this, MRI, &AMDGPU::VGPRRegBank);
2666
2667 // Don't use LegalizerHelper's narrowScalar. It produces unwanted G_SEXTs
2668 // we would need to further expand, and doesn't let us directly set the
2669 // result registers.
2670 SmallVector<Register, 2> DstRegs(OpdMapper.getVRegs(OpIdx: 0));
2671
2672 int Amt = MI.getOperand(i: 2).getImm();
2673 if (Amt <= 32) {
2674 // Downstream users have expectations for the high bit behavior, so freeze
2675 // incoming undefined bits.
2676 if (Amt == 32) {
2677 // The low bits are unchanged.
2678 B.buildFreeze(Dst: DstRegs[0], Src: SrcRegs[0]);
2679 } else {
2680 auto Freeze = B.buildFreeze(Dst: S32, Src: SrcRegs[0]);
2681 // Extend in the low bits and propagate the sign bit to the high half.
2682 B.buildSExtInReg(Res: DstRegs[0], Op: Freeze, ImmOp: Amt);
2683 }
2684
2685 B.buildAShr(Dst: DstRegs[1], Src0: DstRegs[0], Src1: B.buildConstant(Res: S32, Val: 31));
2686 } else {
2687 // The low bits are unchanged, and extend in the high bits.
2688 // No freeze required
2689 B.buildCopy(Res: DstRegs[0], Op: SrcRegs[0]);
2690 B.buildSExtInReg(Res: DstRegs[1], Op: DstRegs[0], ImmOp: Amt - 32);
2691 }
2692
2693 Register DstReg = MI.getOperand(i: 0).getReg();
2694 MRI.setRegBank(Reg: DstReg, RegBank: AMDGPU::VGPRRegBank);
2695 MI.eraseFromParent();
2696 return;
2697 }
2698 case AMDGPU::G_CTPOP:
2699 case AMDGPU::G_BITREVERSE: {
2700 const RegisterBank *DstBank =
2701 OpdMapper.getInstrMapping().getOperandMapping(i: 0).BreakDown[0].RegBank;
2702 if (DstBank == &AMDGPU::SGPRRegBank)
2703 break;
2704
2705 Register SrcReg = MI.getOperand(i: 1).getReg();
2706 const LLT S32 = LLT::scalar(SizeInBits: 32);
2707 LLT Ty = MRI.getType(Reg: SrcReg);
2708 if (Ty == S32)
2709 break;
2710
2711 ApplyRegBankMapping ApplyVALU(B, *this, MRI, &AMDGPU::VGPRRegBank);
2712
2713 MachineFunction &MF = B.getMF();
2714 LegalizerHelper Helper(MF, ApplyVALU, B);
2715
2716 if (Helper.narrowScalar(MI, TypeIdx: 1, NarrowTy: S32) != LegalizerHelper::Legalized)
2717 llvm_unreachable("narrowScalar should have succeeded");
2718 return;
2719 }
2720 case AMDGPU::G_AMDGPU_FFBH_U32:
2721 case AMDGPU::G_AMDGPU_FFBL_B32:
2722 case AMDGPU::G_CTLZ_ZERO_POISON:
2723 case AMDGPU::G_CTTZ_ZERO_POISON: {
2724 const RegisterBank *DstBank =
2725 OpdMapper.getInstrMapping().getOperandMapping(i: 0).BreakDown[0].RegBank;
2726 if (DstBank == &AMDGPU::SGPRRegBank)
2727 break;
2728
2729 Register SrcReg = MI.getOperand(i: 1).getReg();
2730 const LLT S32 = LLT::scalar(SizeInBits: 32);
2731 LLT Ty = MRI.getType(Reg: SrcReg);
2732 if (Ty == S32)
2733 break;
2734
2735 // We can narrow this more efficiently than Helper can by using ffbh/ffbl
2736 // which return -1 when the input is zero:
2737 // (ctlz_zero_poison hi:lo) -> (umin (ffbh hi), (add (ffbh lo), 32))
2738 // (cttz_zero_poison hi:lo) -> (umin (add (ffbl hi), 32), (ffbl lo))
2739 // (ffbh hi:lo) -> (umin (ffbh hi), (uaddsat (ffbh lo), 32))
2740 // (ffbl hi:lo) -> (umin (uaddsat (ffbh hi), 32), (ffbh lo))
2741 ApplyRegBankMapping ApplyVALU(B, *this, MRI, &AMDGPU::VGPRRegBank);
2742 SmallVector<Register, 2> SrcRegs(OpdMapper.getVRegs(OpIdx: 1));
2743 unsigned NewOpc = Opc == AMDGPU::G_CTLZ_ZERO_POISON
2744 ? (unsigned)AMDGPU::G_AMDGPU_FFBH_U32
2745 : Opc == AMDGPU::G_CTTZ_ZERO_POISON
2746 ? (unsigned)AMDGPU::G_AMDGPU_FFBL_B32
2747 : Opc;
2748 unsigned Idx = NewOpc == AMDGPU::G_AMDGPU_FFBH_U32;
2749 auto X = B.buildInstr(Opc: NewOpc, DstOps: {S32}, SrcOps: {SrcRegs[Idx]});
2750 auto Y = B.buildInstr(Opc: NewOpc, DstOps: {S32}, SrcOps: {SrcRegs[Idx ^ 1]});
2751 unsigned AddOpc =
2752 Opc == AMDGPU::G_CTLZ_ZERO_POISON || Opc == AMDGPU::G_CTTZ_ZERO_POISON
2753 ? AMDGPU::G_ADD
2754 : AMDGPU::G_UADDSAT;
2755 Y = B.buildInstr(Opc: AddOpc, DstOps: {S32}, SrcOps: {Y, B.buildConstant(Res: S32, Val: 32)});
2756 Register DstReg = MI.getOperand(i: 0).getReg();
2757 B.buildUMin(Dst: DstReg, Src0: X, Src1: Y);
2758 MI.eraseFromParent();
2759 return;
2760 }
2761 case AMDGPU::G_SEXT:
2762 case AMDGPU::G_ZEXT:
2763 case AMDGPU::G_ANYEXT: {
2764 Register SrcReg = MI.getOperand(i: 1).getReg();
2765 LLT SrcTy = MRI.getType(Reg: SrcReg);
2766 const bool Signed = Opc == AMDGPU::G_SEXT;
2767
2768 assert(OpdMapper.getVRegs(1).empty());
2769
2770 const RegisterBank *SrcBank =
2771 OpdMapper.getInstrMapping().getOperandMapping(i: 1).BreakDown[0].RegBank;
2772
2773 Register DstReg = MI.getOperand(i: 0).getReg();
2774 LLT DstTy = MRI.getType(Reg: DstReg);
2775 if (DstTy.isScalar() &&
2776 SrcBank != &AMDGPU::SGPRRegBank &&
2777 SrcBank != &AMDGPU::VCCRegBank &&
2778 // FIXME: Should handle any type that round to s64 when irregular
2779 // breakdowns supported.
2780 DstTy.getSizeInBits() == 64 &&
2781 SrcTy.getSizeInBits() <= 32) {
2782 SmallVector<Register, 2> DefRegs(OpdMapper.getVRegs(OpIdx: 0));
2783
2784 // Extend to 32-bit, and then extend the low half.
2785 if (Signed) {
2786 // TODO: Should really be buildSExtOrCopy
2787 B.buildSExtOrTrunc(Res: DefRegs[0], Op: SrcReg);
2788 } else if (Opc == AMDGPU::G_ZEXT) {
2789 B.buildZExtOrTrunc(Res: DefRegs[0], Op: SrcReg);
2790 } else {
2791 B.buildAnyExtOrTrunc(Res: DefRegs[0], Op: SrcReg);
2792 }
2793
2794 extendLow32IntoHigh32(B, Hi32Reg: DefRegs[1], Lo32Reg: DefRegs[0], ExtOpc: Opc, RegBank: *SrcBank);
2795 MRI.setRegBank(Reg: DstReg, RegBank: *SrcBank);
2796 MI.eraseFromParent();
2797 return;
2798 }
2799
2800 if (SrcTy != LLT::scalar(SizeInBits: 1))
2801 return;
2802
2803 // It is not legal to have a legalization artifact with a VCC source. Rather
2804 // than introducing a copy, insert the select we would have to select the
2805 // copy to.
2806 if (SrcBank == &AMDGPU::VCCRegBank) {
2807 SmallVector<Register, 2> DefRegs(OpdMapper.getVRegs(OpIdx: 0));
2808
2809 const RegisterBank *DstBank = &AMDGPU::VGPRRegBank;
2810
2811 unsigned DstSize = DstTy.getSizeInBits();
2812 // 64-bit select is SGPR only
2813 const bool UseSel64 = DstSize > 32 &&
2814 SrcBank->getID() == AMDGPU::SGPRRegBankID;
2815
2816 // TODO: Should s16 select be legal?
2817 LLT SelType = UseSel64 ? LLT::scalar(SizeInBits: 64) : LLT::scalar(SizeInBits: 32);
2818 auto True = B.buildConstant(Res: SelType, Val: Signed ? -1 : 1);
2819 auto False = B.buildConstant(Res: SelType, Val: 0);
2820
2821 MRI.setRegBank(Reg: True.getReg(Idx: 0), RegBank: *DstBank);
2822 MRI.setRegBank(Reg: False.getReg(Idx: 0), RegBank: *DstBank);
2823 MRI.setRegBank(Reg: DstReg, RegBank: *DstBank);
2824
2825 if (DstSize > 32) {
2826 B.buildSelect(Res: DefRegs[0], Tst: SrcReg, Op0: True, Op1: False);
2827 extendLow32IntoHigh32(B, Hi32Reg: DefRegs[1], Lo32Reg: DefRegs[0], ExtOpc: Opc, RegBank: *SrcBank, IsBooleanSrc: true);
2828 } else if (DstSize < 32) {
2829 auto Sel = B.buildSelect(Res: SelType, Tst: SrcReg, Op0: True, Op1: False);
2830 MRI.setRegBank(Reg: Sel.getReg(Idx: 0), RegBank: *DstBank);
2831 B.buildTrunc(Res: DstReg, Op: Sel);
2832 } else {
2833 B.buildSelect(Res: DstReg, Tst: SrcReg, Op0: True, Op1: False);
2834 }
2835
2836 MI.eraseFromParent();
2837 return;
2838 }
2839
2840 break;
2841 }
2842 case AMDGPU::G_EXTRACT_VECTOR_ELT: {
// Repairs a vector element extract. If the mapper did not split the result
// (DstRegs is empty), the instruction gets the default mapping and is run
// in a waterfall loop over its index operand (operand 2). Otherwise the
// 64-bit result is produced as two 32-bit extracts from a bitcast of the
// source vector, and those new instructions are run in the waterfall loop
// in place of the original one.
2843 SmallVector<Register, 2> DstRegs(OpdMapper.getVRegs(OpIdx: 0));
2844
2845 assert(OpdMapper.getVRegs(1).empty() && OpdMapper.getVRegs(2).empty());
2846
2847 Register DstReg = MI.getOperand(i: 0).getReg();
2848 Register SrcReg = MI.getOperand(i: 1).getReg();
2849
2850 const LLT S32 = LLT::scalar(SizeInBits: 32);
2851 LLT DstTy = MRI.getType(Reg: DstReg);
2852 LLT SrcTy = MRI.getType(Reg: SrcReg);
2853
// Nothing more to do if the helper reports that it handled the instruction.
2854 if (foldExtractEltToCmpSelect(B, MI, OpdMapper))
2855 return;
2856
2857 const ValueMapping &DstMapping
2858 = OpdMapper.getInstrMapping().getOperandMapping(i: 0);
2859 const RegisterBank *DstBank = DstMapping.BreakDown[0].RegBank;
2860 const RegisterBank *SrcBank =
2861 OpdMapper.getInstrMapping().getOperandMapping(i: 1).BreakDown[0].RegBank;
2862 const RegisterBank *IdxBank =
2863 OpdMapper.getInstrMapping().getOperandMapping(i: 2).BreakDown[0].RegBank;
2864
2865 Register BaseIdxReg;
2866 unsigned ConstOffset;
2867 std::tie(args&: BaseIdxReg, args&: ConstOffset) =
2868 AMDGPU::getBaseWithConstantOffset(MRI, Reg: MI.getOperand(i: 2).getReg());
2869
2870 // See if the index is an add of a constant which will be foldable by moving
2871 // the base register of the index later if this is going to be executed in a
2872 // waterfall loop. This is essentially to reassociate the add of a constant
2873 // with the readfirstlane.
2874 bool ShouldMoveIndexIntoLoop = IdxBank != &AMDGPU::SGPRRegBank &&
2875 ConstOffset > 0 &&
2876 ConstOffset < SrcTy.getNumElements();
2877
2878 // Move the base register. We'll re-insert the add later.
2879 if (ShouldMoveIndexIntoLoop)
2880 MI.getOperand(i: 2).setReg(BaseIdxReg);
2881
2882 // If this is a VGPR result only because the index was a VGPR result, the
2883 // actual indexing will be done on the SGPR source vector, which will
2884 // produce a scalar result. We need to copy to the VGPR result inside the
2885 // waterfall loop.
2886 const bool NeedCopyToVGPR = DstBank == &AMDGPU::VGPRRegBank &&
2887 SrcBank == &AMDGPU::SGPRRegBank;
// Unsplit result: keep the original instruction and waterfall over its index.
2888 if (DstRegs.empty()) {
2889 applyDefaultMapping(OpdMapper);
2890
2891 executeInWaterfallLoop(B, MI, OpIndices: {2});
2892
2893 if (NeedCopyToVGPR) {
2894 // We don't want a phi for this temporary reg.
2895 Register TmpReg = MRI.createGenericVirtualRegister(Ty: DstTy);
2896 MRI.setRegBank(Reg: TmpReg, RegBank: AMDGPU::SGPRRegBank);
2897 MI.getOperand(i: 0).setReg(TmpReg);
2898 B.setInsertPt(MBB&: *MI.getParent(), II: ++MI.getIterator());
2899
2900 // Use a v_mov_b32 here to make the exec dependency explicit.
2901 buildVCopy(B, DstReg, SrcReg: TmpReg);
2902 }
2903
2904 // Re-insert the constant offset add inside the waterfall loop.
2905 if (ShouldMoveIndexIntoLoop)
2906 reinsertVectorIndexAdd(B, IdxUseInstr&: MI, OpIdx: 2, ConstOffset);
2907
2908 return;
2909 }
2910
// Split result: only 64-bit results are expected from here on.
2911 assert(DstTy.getSizeInBits() == 64);
2912
// View the source as a vector with twice as many 32-bit elements.
2913 LLT Vec32 = LLT::fixed_vector(NumElements: 2 * SrcTy.getNumElements(), ScalarSizeInBits: 32);
2914
2915 auto CastSrc = B.buildBitcast(Dst: Vec32, Src: SrcReg);
2916 auto One = B.buildConstant(Res: S32, Val: 1);
2917
2918 MachineBasicBlock::iterator MII = MI.getIterator();
2919
2920 // Split the vector index into 32-bit pieces. Prepare to move all of the
2921 // new instructions into a waterfall loop if necessary.
2922 //
2923 // Don't put the bitcast or constant in the loop.
2924 MachineInstrSpan Span(MII, &B.getMBB());
2925
2926 // Compute 32-bit element indices, (2 * OrigIdx, 2 * OrigIdx + 1).
// NOTE(review): the index is taken from BaseIdxReg even when
// ShouldMoveIndexIntoLoop is false, and the constant part is only re-added
// (at the end of this case) when that flag is true. If ConstOffset can be
// non-zero with the flag false, the offset looks dropped here - confirm
// against getBaseWithConstantOffset and the mappings that reach this path.
2927 auto IdxLo = B.buildShl(Dst: S32, Src0: BaseIdxReg, Src1: One);
2928 auto IdxHi = B.buildAdd(Dst: S32, Src0: IdxLo, Src1: One);
2929
2930 auto Extract0 = B.buildExtractVectorElement(Res: DstRegs[0], Val: CastSrc, Idx: IdxLo);
2931 auto Extract1 = B.buildExtractVectorElement(Res: DstRegs[1], Val: CastSrc, Idx: IdxHi);
2932
// Assign banks by hand to the registers involved in the new sequence.
2933 MRI.setRegBank(Reg: DstReg, RegBank: *DstBank);
2934 MRI.setRegBank(Reg: CastSrc.getReg(Idx: 0), RegBank: *SrcBank);
2935 MRI.setRegBank(Reg: One.getReg(Idx: 0), RegBank: AMDGPU::SGPRRegBank);
2936 MRI.setRegBank(Reg: IdxLo.getReg(Idx: 0), RegBank: AMDGPU::SGPRRegBank);
2937 MRI.setRegBank(Reg: IdxHi.getReg(Idx: 0), RegBank: AMDGPU::SGPRRegBank);
2938
// If no operand needs a waterfall loop, the new sequence stands as built and
// only the original instruction has to be removed.
2939 SmallSet<Register, 4> OpsToWaterfall;
2940 if (!collectWaterfallOperands(SGPROperandRegs&: OpsToWaterfall, MI, MRI, OpIndices: { 2 })) {
2941 MI.eraseFromParent();
2942 return;
2943 }
2944
2945 // Remove the original instruction to avoid potentially confusing the
2946 // waterfall loop logic.
2947 B.setInstr(*Span.begin());
2948 MI.eraseFromParent();
2949 executeInWaterfallLoop(B, Range: make_range(x: Span.begin(), y: Span.end()),
2950 SGPROperandRegs&: OpsToWaterfall);
2951
// Redirect both extracts to SGPR temporaries and copy those to the real
// result pieces right after the second extract.
2952 if (NeedCopyToVGPR) {
2953 MachineBasicBlock *LoopBB = Extract1->getParent();
2954 Register TmpReg0 = MRI.createGenericVirtualRegister(Ty: S32);
2955 Register TmpReg1 = MRI.createGenericVirtualRegister(Ty: S32);
2956 MRI.setRegBank(Reg: TmpReg0, RegBank: AMDGPU::SGPRRegBank);
2957 MRI.setRegBank(Reg: TmpReg1, RegBank: AMDGPU::SGPRRegBank);
2958
2959 Extract0->getOperand(i: 0).setReg(TmpReg0);
2960 Extract1->getOperand(i: 0).setReg(TmpReg1);
2961
2962 B.setInsertPt(MBB&: *LoopBB, II: ++Extract1->getIterator());
2963
2964 buildVCopy(B, DstReg: DstRegs[0], SrcReg: TmpReg0);
2965 buildVCopy(B, DstReg: DstRegs[1], SrcReg: TmpReg1);
2966 }
2967
// Here the add is re-attached to operand 1 of the shift that computes IdxLo.
2968 if (ShouldMoveIndexIntoLoop)
2969 reinsertVectorIndexAdd(B, IdxUseInstr&: *IdxLo, OpIdx: 1, ConstOffset);
2970
2971 return;
2972 }
2973 case AMDGPU::G_INSERT_VECTOR_ELT: {
// Repairs a vector element insert. If the mapper did not split the inserted
// value (InsRegs is empty), the instruction is run in a waterfall loop over
// its index operand (operand 3). Otherwise the 64-bit value is inserted as
// two 32-bit inserts into a bitcast of the source vector, and the result is
// bitcast back to the original vector type.
2974 SmallVector<Register, 2> InsRegs(OpdMapper.getVRegs(OpIdx: 2));
2975
2976 Register DstReg = MI.getOperand(i: 0).getReg();
2977 LLT VecTy = MRI.getType(Reg: DstReg);
2978
2979 assert(OpdMapper.getVRegs(0).empty());
2980 assert(OpdMapper.getVRegs(3).empty());
2981
// When operand 1 is substituted, give the register now on the instruction
// the vector type.
2982 if (substituteSimpleCopyRegs(OpdMapper, OpIdx: 1))
2983 MRI.setType(VReg: MI.getOperand(i: 1).getReg(), Ty: VecTy);
2984
// Nothing more to do if the helper reports that it handled the instruction.
2985 if (foldInsertEltToCmpSelect(B, MI, OpdMapper))
2986 return;
2987
2988 const RegisterBank *IdxBank =
2989 OpdMapper.getInstrMapping().getOperandMapping(i: 3).BreakDown[0].RegBank;
2990
2991 Register SrcReg = MI.getOperand(i: 1).getReg();
2992 Register InsReg = MI.getOperand(i: 2).getReg();
2993 LLT InsTy = MRI.getType(Reg: InsReg);
// InsTy is only read by the assert further down.
2994 (void)InsTy;
2995
2996 Register BaseIdxReg;
2997 unsigned ConstOffset;
2998 std::tie(args&: BaseIdxReg, args&: ConstOffset) =
2999 AMDGPU::getBaseWithConstantOffset(MRI, Reg: MI.getOperand(i: 3).getReg());
3000
3001 // See if the index is an add of a constant which will be foldable by moving
3002 // the base register of the index later if this is going to be executed in a
3003 // waterfall loop. This is essentially to reassociate the add of a constant
3004 // with the readfirstlane.
3005 bool ShouldMoveIndexIntoLoop = IdxBank != &AMDGPU::SGPRRegBank &&
3006 ConstOffset > 0 &&
3007 ConstOffset < VecTy.getNumElements();
3008
3009 // Move the base register. We'll re-insert the add later.
3010 if (ShouldMoveIndexIntoLoop)
3011 MI.getOperand(i: 3).setReg(BaseIdxReg);
3012
3013
// Unsplit value: keep the original instruction and waterfall over its index.
3014 if (InsRegs.empty()) {
3015 executeInWaterfallLoop(B, MI, OpIndices: {3});
3016
3017 // Re-insert the constant offset add inside the waterfall loop.
3018 if (ShouldMoveIndexIntoLoop) {
3019 reinsertVectorIndexAdd(B, IdxUseInstr&: MI, OpIdx: 3, ConstOffset);
3020 }
3021
3022 return;
3023 }
3024
// Split value: only 64-bit inserted values are expected from here on.
3025 assert(InsTy.getSizeInBits() == 64);
3026
3027 const LLT S32 = LLT::scalar(SizeInBits: 32);
// View the vector as one with twice as many 32-bit elements.
3028 LLT Vec32 = LLT::fixed_vector(NumElements: 2 * VecTy.getNumElements(), ScalarSizeInBits: 32);
3029
3030 auto CastSrc = B.buildBitcast(Dst: Vec32, Src: SrcReg);
3031 auto One = B.buildConstant(Res: S32, Val: 1);
3032
3033 // Split the vector index into 32-bit pieces. Prepare to move all of the
3034 // new instructions into a waterfall loop if necessary.
3035 //
3036 // Don't put the bitcast or constant in the loop.
3037 MachineInstrSpan Span(MachineBasicBlock::iterator(&MI), &B.getMBB());
3038
3039 // Compute 32-bit element indices, (2 * OrigIdx, 2 * OrigIdx + 1).
// NOTE(review): the index is taken from BaseIdxReg even when
// ShouldMoveIndexIntoLoop is false, and the constant part is only re-added
// (at the end of this case) when that flag is true. If ConstOffset can be
// non-zero with the flag false, the offset looks dropped here - confirm
// against getBaseWithConstantOffset and the mappings that reach this path.
3040 auto IdxLo = B.buildShl(Dst: S32, Src0: BaseIdxReg, Src1: One);
3041 auto IdxHi = B.buildAdd(Dst: S32, Src0: IdxLo, Src1: One);
3042
// Insert the low piece first, then the high piece into that result.
3043 auto InsLo = B.buildInsertVectorElement(Res: Vec32, Val: CastSrc, Elt: InsRegs[0], Idx: IdxLo);
3044 auto InsHi = B.buildInsertVectorElement(Res: Vec32, Val: InsLo, Elt: InsRegs[1], Idx: IdxHi);
3045
3046 const RegisterBank *DstBank =
3047 OpdMapper.getInstrMapping().getOperandMapping(i: 0).BreakDown[0].RegBank;
3048 const RegisterBank *SrcBank =
3049 OpdMapper.getInstrMapping().getOperandMapping(i: 1).BreakDown[0].RegBank;
3050 const RegisterBank *InsSrcBank =
3051 OpdMapper.getInstrMapping().getOperandMapping(i: 2).BreakDown[0].RegBank;
3052
// Assign banks by hand to the registers involved in the new sequence.
3053 MRI.setRegBank(Reg: InsReg, RegBank: *InsSrcBank);
3054 MRI.setRegBank(Reg: CastSrc.getReg(Idx: 0), RegBank: *SrcBank);
3055 MRI.setRegBank(Reg: InsLo.getReg(Idx: 0), RegBank: *DstBank);
3056 MRI.setRegBank(Reg: InsHi.getReg(Idx: 0), RegBank: *DstBank);
3057 MRI.setRegBank(Reg: One.getReg(Idx: 0), RegBank: AMDGPU::SGPRRegBank);
3058 MRI.setRegBank(Reg: IdxLo.getReg(Idx: 0), RegBank: AMDGPU::SGPRRegBank);
3059 MRI.setRegBank(Reg: IdxHi.getReg(Idx: 0), RegBank: AMDGPU::SGPRRegBank);
3060
3061
// If no operand needs a waterfall loop, emit the final bitcast in front of
// the original instruction and remove it.
3062 SmallSet<Register, 4> OpsToWaterfall;
3063 if (!collectWaterfallOperands(SGPROperandRegs&: OpsToWaterfall, MI, MRI, OpIndices: { 3 })) {
3064 B.setInsertPt(MBB&: B.getMBB(), II: MI);
3065 B.buildBitcast(Dst: DstReg, Src: InsHi);
3066 MI.eraseFromParent();
3067 return;
3068 }
3069
3070 B.setInstr(*Span.begin());
3071 MI.eraseFromParent();
3072
3073 // Figure out the point after the waterfall loop before mangling the control
3074 // flow.
3075 executeInWaterfallLoop(B, Range: make_range(x: Span.begin(), y: Span.end()),
3076 SGPROperandRegs&: OpsToWaterfall);
3077
3078 // The insertion point is now right after the original instruction.
3079 //
3080 // Keep the bitcast to the original vector type out of the loop. Doing this
3081 // saved an extra phi we don't need inside the loop.
3082 B.buildBitcast(Dst: DstReg, Src: InsHi);
3083
3084 // Re-insert the constant offset add inside the waterfall loop.
// Here the add is re-attached to operand 1 of the shift that computes IdxLo.
3085 if (ShouldMoveIndexIntoLoop)
3086 reinsertVectorIndexAdd(B, IdxUseInstr&: *IdxLo, OpIdx: 1, ConstOffset);
3087
3088 return;
3089 }
3090 case AMDGPU::G_AMDGPU_BUFFER_LOAD:
3091 case AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT:
3092 case AMDGPU::G_AMDGPU_BUFFER_LOAD_SSHORT:
3093 case AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE:
3094 case AMDGPU::G_AMDGPU_BUFFER_LOAD_SBYTE:
3095 case AMDGPU::G_AMDGPU_BUFFER_LOAD_TFE:
3096 case AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT_TFE:
3097 case AMDGPU::G_AMDGPU_BUFFER_LOAD_SSHORT_TFE:
3098 case AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE_TFE:
3099 case AMDGPU::G_AMDGPU_BUFFER_LOAD_SBYTE_TFE:
3100 case AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT:
3101 case AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_TFE:
3102 case AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_D16:
3103 case AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT:
3104 case AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT_D16:
3105 case AMDGPU::G_AMDGPU_BUFFER_STORE:
3106 case AMDGPU::G_AMDGPU_BUFFER_STORE_BYTE:
3107 case AMDGPU::G_AMDGPU_BUFFER_STORE_SHORT:
3108 case AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT:
3109 case AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT_D16:
3110 case AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT:
3111 case AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT_D16: {
3112 applyDefaultMapping(OpdMapper);
3113 executeInWaterfallLoop(B, MI, OpIndices: {1, 4});
3114 return;
3115 }
3116 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SWAP:
3117 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_ADD:
3118 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB:
3119 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMIN:
3120 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMIN:
3121 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMAX:
3122 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMAX:
3123 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_AND:
3124 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_OR:
3125 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_XOR:
3126 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC:
3127 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC:
3128 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB_CLAMP_U32:
3129 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_COND_SUB_U32:
3130 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD:
3131 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMIN:
3132 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMAX: {
3133 applyDefaultMapping(OpdMapper);
3134 executeInWaterfallLoop(B, MI, OpIndices: {2, 5});
3135 return;
3136 }
3137 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_CMPSWAP: {
3138 applyDefaultMapping(OpdMapper);
3139 executeInWaterfallLoop(B, MI, OpIndices: {3, 6});
3140 return;
3141 }
3142 case AMDGPU::G_AMDGPU_S_BUFFER_LOAD:
3143 case AMDGPU::G_AMDGPU_S_BUFFER_LOAD_UBYTE:
3144 case AMDGPU::G_AMDGPU_S_BUFFER_LOAD_SBYTE:
3145 case AMDGPU::G_AMDGPU_S_BUFFER_LOAD_USHORT:
3146 case AMDGPU::G_AMDGPU_S_BUFFER_LOAD_SSHORT: {
3147 applyMappingSBufferLoad(B, OpdMapper);
3148 return;
3149 }
3150 case AMDGPU::G_AMDGPU_S_BUFFER_PREFETCH:
3151 constrainOpWithReadfirstlane(B, MI, OpIdx: 0);
3152 constrainOpWithReadfirstlane(B, MI, OpIdx: 2);
3153 return;
3154 case AMDGPU::G_INTRINSIC:
3155 case AMDGPU::G_INTRINSIC_CONVERGENT: {
3156 switch (cast<GIntrinsic>(Val&: MI).getIntrinsicID()) {
3157 case Intrinsic::amdgcn_readlane: {
3158 substituteSimpleCopyRegs(OpdMapper, OpIdx: 2);
3159
3160 assert(OpdMapper.getVRegs(0).empty());
3161 assert(OpdMapper.getVRegs(3).empty());
3162
3163 // Make sure the index is an SGPR. It doesn't make sense to run this in a
3164 // waterfall loop, so assume it's a uniform value.
3165 constrainOpWithReadfirstlane(B, MI, OpIdx: 3); // Index
3166 return;
3167 }
3168 case Intrinsic::amdgcn_writelane: {
3169 assert(OpdMapper.getVRegs(0).empty());
3170 assert(OpdMapper.getVRegs(2).empty());
3171 assert(OpdMapper.getVRegs(3).empty());
3172
3173 substituteSimpleCopyRegs(OpdMapper, OpIdx: 4); // VGPR input val
3174 constrainOpWithReadfirstlane(B, MI, OpIdx: 2); // Source value
3175 constrainOpWithReadfirstlane(B, MI, OpIdx: 3); // Index
3176 return;
3177 }
3178 case Intrinsic::amdgcn_interp_p1:
3179 case Intrinsic::amdgcn_interp_p2:
3180 case Intrinsic::amdgcn_interp_mov:
3181 case Intrinsic::amdgcn_interp_p1_f16:
3182 case Intrinsic::amdgcn_interp_p2_f16:
3183 case Intrinsic::amdgcn_lds_param_load: {
3184 applyDefaultMapping(OpdMapper);
3185
3186 // Readlane for m0 value, which is always the last operand.
3187 // FIXME: Should this be a waterfall loop instead?
3188 constrainOpWithReadfirstlane(B, MI, OpIdx: MI.getNumOperands() - 1); // Index
3189 return;
3190 }
3191 case Intrinsic::amdgcn_interp_inreg_p10:
3192 case Intrinsic::amdgcn_interp_inreg_p2:
3193 case Intrinsic::amdgcn_interp_inreg_p10_f16:
3194 case Intrinsic::amdgcn_interp_inreg_p2_f16:
3195 case Intrinsic::amdgcn_interp_p10_rtz_f16:
3196 case Intrinsic::amdgcn_interp_p2_rtz_f16:
3197 case Intrinsic::amdgcn_permlane16_swap:
3198 case Intrinsic::amdgcn_permlane32_swap:
3199 applyDefaultMapping(OpdMapper);
3200 return;
3201 case Intrinsic::amdgcn_permlane16:
3202 case Intrinsic::amdgcn_permlanex16: {
3203 // Doing a waterfall loop over these wouldn't make any sense.
3204 substituteSimpleCopyRegs(OpdMapper, OpIdx: 2);
3205 substituteSimpleCopyRegs(OpdMapper, OpIdx: 3);
3206 constrainOpWithReadfirstlane(B, MI, OpIdx: 4);
3207 constrainOpWithReadfirstlane(B, MI, OpIdx: 5);
3208 return;
3209 }
3210 case Intrinsic::amdgcn_permlane_bcast:
3211 case Intrinsic::amdgcn_permlane_up:
3212 case Intrinsic::amdgcn_permlane_down:
3213 case Intrinsic::amdgcn_permlane_xor:
3214 // Doing a waterfall loop over these wouldn't make any sense.
3215 constrainOpWithReadfirstlane(B, MI, OpIdx: 3);
3216 constrainOpWithReadfirstlane(B, MI, OpIdx: 4);
3217 return;
3218 case Intrinsic::amdgcn_permlane_idx_gen: {
3219 constrainOpWithReadfirstlane(B, MI, OpIdx: 3);
3220 return;
3221 }
3222 case Intrinsic::amdgcn_sbfe:
3223 applyMappingBFE(B, OpdMapper, Signed: true);
3224 return;
3225 case Intrinsic::amdgcn_ubfe:
3226 applyMappingBFE(B, OpdMapper, Signed: false);
3227 return;
3228 case Intrinsic::amdgcn_inverse_ballot:
3229 case Intrinsic::amdgcn_s_bitreplicate:
3230 case Intrinsic::amdgcn_s_quadmask:
3231 case Intrinsic::amdgcn_s_wqm:
3232 applyDefaultMapping(OpdMapper);
3233 constrainOpWithReadfirstlane(B, MI, OpIdx: 2); // Mask
3234 return;
3235 case Intrinsic::amdgcn_ballot:
3236 // Use default handling and insert copy to vcc source.
3237 break;
3238 }
3239 break;
3240 }
3241 case AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD:
3242 case AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD_D16:
3243 case AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD_NORET:
3244 case AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE:
3245 case AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE_D16: {
3246 const AMDGPU::RsrcIntrinsic *RSrcIntrin =
3247 AMDGPU::lookupRsrcIntrinsic(Intr: AMDGPU::getIntrinsicID(I: MI));
3248 assert(RSrcIntrin && RSrcIntrin->IsImage);
3249 // Non-images can have complications from operands that allow both SGPR
3250 // and VGPR. For now it's too complicated to figure out the final opcode
3251 // to derive the register bank from the MCInstrDesc.
3252 applyMappingImage(B, MI, OpdMapper, RsrcIdx: RSrcIntrin->RsrcArg);
3253 return;
3254 }
3255 case AMDGPU::G_AMDGPU_BVH_INTERSECT_RAY:
3256 case AMDGPU::G_AMDGPU_BVH8_INTERSECT_RAY:
3257 case AMDGPU::G_AMDGPU_BVH_DUAL_INTERSECT_RAY: {
3258 bool IsDualOrBVH8 =
3259 MI.getOpcode() == AMDGPU::G_AMDGPU_BVH_DUAL_INTERSECT_RAY ||
3260 MI.getOpcode() == AMDGPU::G_AMDGPU_BVH8_INTERSECT_RAY;
3261 unsigned NumMods = IsDualOrBVH8 ? 0 : 1; // Has A16 modifier
3262 unsigned LastRegOpIdx = MI.getNumExplicitOperands() - 1 - NumMods;
3263 applyDefaultMapping(OpdMapper);
3264 executeInWaterfallLoop(B, MI, OpIndices: {LastRegOpIdx});
3265 return;
3266 }
3267 case AMDGPU::G_INTRINSIC_W_SIDE_EFFECTS:
3268 case AMDGPU::G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS: {
3269 auto IntrID = cast<GIntrinsic>(Val&: MI).getIntrinsicID();
3270 switch (IntrID) {
3271 case Intrinsic::amdgcn_ds_ordered_add:
3272 case Intrinsic::amdgcn_ds_ordered_swap: {
3273 // This is only allowed to execute with 1 lane, so readfirstlane is safe.
3274 assert(OpdMapper.getVRegs(0).empty());
3275 substituteSimpleCopyRegs(OpdMapper, OpIdx: 3);
3276 constrainOpWithReadfirstlane(B, MI, OpIdx: 2); // M0
3277 return;
3278 }
3279 case Intrinsic::amdgcn_ds_gws_init:
3280 case Intrinsic::amdgcn_ds_gws_barrier:
3281 case Intrinsic::amdgcn_ds_gws_sema_br: {
      // Only the first lane executes, so readfirstlane is safe.
3283 substituteSimpleCopyRegs(OpdMapper, OpIdx: 1);
3284 constrainOpWithReadfirstlane(B, MI, OpIdx: 2); // M0
3285 return;
3286 }
3287 case Intrinsic::amdgcn_ds_gws_sema_v:
3288 case Intrinsic::amdgcn_ds_gws_sema_p:
3289 case Intrinsic::amdgcn_ds_gws_sema_release_all: {
      // Only the first lane executes, so readfirstlane is safe.
3291 constrainOpWithReadfirstlane(B, MI, OpIdx: 1); // M0
3292 return;
3293 }
3294 case Intrinsic::amdgcn_ds_append:
3295 case Intrinsic::amdgcn_ds_consume: {
3296 constrainOpWithReadfirstlane(B, MI, OpIdx: 2); // M0
3297 return;
3298 }
3299 case Intrinsic::amdgcn_s_alloc_vgpr:
3300 constrainOpWithReadfirstlane(B, MI, OpIdx: 2);
3301 return;
3302 case Intrinsic::amdgcn_s_sendmsg:
3303 case Intrinsic::amdgcn_s_sendmsghalt: {
3304 // FIXME: Should this use a waterfall loop?
3305 constrainOpWithReadfirstlane(B, MI, OpIdx: 2); // M0
3306 return;
3307 }
3308 case Intrinsic::amdgcn_s_setreg: {
3309 constrainOpWithReadfirstlane(B, MI, OpIdx: 2);
3310 return;
3311 }
3312 case Intrinsic::amdgcn_s_ttracedata:
3313 constrainOpWithReadfirstlane(B, MI, OpIdx: 1); // M0
3314 return;
3315 case Intrinsic::amdgcn_raw_buffer_load_lds:
3316 case Intrinsic::amdgcn_raw_buffer_load_async_lds:
3317 case Intrinsic::amdgcn_raw_ptr_buffer_load_lds:
3318 case Intrinsic::amdgcn_raw_ptr_buffer_load_async_lds: {
3319 applyDefaultMapping(OpdMapper);
3320 constrainOpWithReadfirstlane(B, MI, OpIdx: 1); // rsrc
3321 constrainOpWithReadfirstlane(B, MI, OpIdx: 2); // M0
3322 constrainOpWithReadfirstlane(B, MI, OpIdx: 5); // soffset
3323 return;
3324 }
3325 case Intrinsic::amdgcn_struct_buffer_load_lds:
3326 case Intrinsic::amdgcn_struct_buffer_load_async_lds:
3327 case Intrinsic::amdgcn_struct_ptr_buffer_load_lds:
3328 case Intrinsic::amdgcn_struct_ptr_buffer_load_async_lds: {
3329 applyDefaultMapping(OpdMapper);
3330 constrainOpWithReadfirstlane(B, MI, OpIdx: 1); // rsrc
3331 constrainOpWithReadfirstlane(B, MI, OpIdx: 2); // M0
3332 constrainOpWithReadfirstlane(B, MI, OpIdx: 6); // soffset
3333 return;
3334 }
3335 case Intrinsic::amdgcn_cluster_load_async_to_lds_b8:
3336 case Intrinsic::amdgcn_cluster_load_async_to_lds_b32:
3337 case Intrinsic::amdgcn_cluster_load_async_to_lds_b64:
3338 case Intrinsic::amdgcn_cluster_load_async_to_lds_b128: {
3339 applyDefaultMapping(OpdMapper);
3340 constrainOpWithReadfirstlane(B, MI, OpIdx: 5);
3341 return;
3342 }
3343 case Intrinsic::amdgcn_load_to_lds:
3344 case Intrinsic::amdgcn_load_async_to_lds:
3345 case Intrinsic::amdgcn_global_load_lds:
3346 case Intrinsic::amdgcn_global_load_async_lds: {
3347 applyDefaultMapping(OpdMapper);
3348 constrainOpWithReadfirstlane(B, MI, OpIdx: 2);
3349 return;
3350 }
3351 case Intrinsic::amdgcn_lds_direct_load: {
3352 applyDefaultMapping(OpdMapper);
3353 // Readlane for m0 value, which is always the last operand.
3354 constrainOpWithReadfirstlane(B, MI, OpIdx: MI.getNumOperands() - 1); // Index
3355 return;
3356 }
3357 case Intrinsic::amdgcn_exp_row:
3358 applyDefaultMapping(OpdMapper);
3359 constrainOpWithReadfirstlane(B, MI, OpIdx: 8); // M0
3360 return;
3361 case Intrinsic::amdgcn_cluster_load_b32:
3362 case Intrinsic::amdgcn_cluster_load_b64:
3363 case Intrinsic::amdgcn_cluster_load_b128: {
3364 applyDefaultMapping(OpdMapper);
3365 constrainOpWithReadfirstlane(B, MI, OpIdx: 4); // M0
3366 return;
3367 }
3368 case Intrinsic::amdgcn_s_sleep_var:
3369 assert(OpdMapper.getVRegs(1).empty());
3370 constrainOpWithReadfirstlane(B, MI, OpIdx: 1);
3371 return;
3372 case Intrinsic::amdgcn_s_barrier_join:
3373 case Intrinsic::amdgcn_s_wakeup_barrier:
3374 constrainOpWithReadfirstlane(B, MI, OpIdx: 1);
3375 return;
3376 case Intrinsic::amdgcn_s_barrier_init:
3377 case Intrinsic::amdgcn_s_barrier_signal_var:
3378 constrainOpWithReadfirstlane(B, MI, OpIdx: 1);
3379 constrainOpWithReadfirstlane(B, MI, OpIdx: 2);
3380 return;
3381 case Intrinsic::amdgcn_s_get_barrier_state:
3382 case Intrinsic::amdgcn_s_get_named_barrier_state: {
3383 constrainOpWithReadfirstlane(B, MI, OpIdx: 2);
3384 return;
3385 }
3386 case Intrinsic::amdgcn_s_prefetch_data:
3387 case Intrinsic::amdgcn_s_prefetch_inst: {
3388 Register PtrReg = MI.getOperand(i: 1).getReg();
3389 unsigned AS = MRI.getType(Reg: PtrReg).getAddressSpace();
3390 if (AMDGPU::isFlatGlobalAddrSpace(AS)) {
3391 constrainOpWithReadfirstlane(B, MI, OpIdx: 1);
3392 constrainOpWithReadfirstlane(B, MI, OpIdx: 2);
3393 } else
3394 MI.eraseFromParent();
3395 return;
3396 }
3397 case Intrinsic::amdgcn_tensor_load_to_lds:
3398 case Intrinsic::amdgcn_tensor_store_from_lds: {
3399 constrainOpWithReadfirstlane(B, MI, OpIdx: 1);
3400 constrainOpWithReadfirstlane(B, MI, OpIdx: 2);
3401 constrainOpWithReadfirstlane(B, MI, OpIdx: 3);
3402 constrainOpWithReadfirstlane(B, MI, OpIdx: 4);
3403 constrainOpWithReadfirstlane(B, MI, OpIdx: 5);
3404 return;
3405 }
3406 default: {
3407 if (const AMDGPU::RsrcIntrinsic *RSrcIntrin =
3408 AMDGPU::lookupRsrcIntrinsic(Intr: IntrID)) {
3409 // Non-images can have complications from operands that allow both SGPR
3410 // and VGPR. For now it's too complicated to figure out the final opcode
3411 // to derive the register bank from the MCInstrDesc.
3412 if (RSrcIntrin->IsImage) {
3413 applyMappingImage(B, MI, OpdMapper, RsrcIdx: RSrcIntrin->RsrcArg);
3414 return;
3415 }
3416 }
3417
3418 break;
3419 }
3420 }
3421 break;
3422 }
3423 case AMDGPU::G_SI_CALL: {
3424 // Use a set to avoid extra readfirstlanes in the case where multiple
3425 // operands are the same register.
3426 SmallSet<Register, 4> SGPROperandRegs;
3427
3428 if (!collectWaterfallOperands(SGPROperandRegs, MI, MRI, OpIndices: {1}))
3429 break;
3430
3431 // Move all copies to physical SGPRs that are used by the call instruction
3432 // into the loop block. Start searching for these copies until the
3433 // ADJCALLSTACKUP.
3434 unsigned FrameSetupOpcode = AMDGPU::ADJCALLSTACKUP;
3435 unsigned FrameDestroyOpcode = AMDGPU::ADJCALLSTACKDOWN;
3436
3437 // Move all non-copies before the copies, so that a complete range can be
3438 // moved into the waterfall loop.
3439 SmallVector<MachineInstr *, 4> NonCopyInstrs;
3440 // Count of NonCopyInstrs found until the current LastCopy.
3441 unsigned NonCopyInstrsLen = 0;
3442 MachineBasicBlock::iterator Start(&MI);
3443 MachineBasicBlock::iterator LastCopy = Start;
3444 MachineBasicBlock *MBB = MI.getParent();
3445 const SIMachineFunctionInfo *Info =
3446 MBB->getParent()->getInfo<SIMachineFunctionInfo>();
3447 while (Start->getOpcode() != FrameSetupOpcode) {
3448 --Start;
3449 bool IsCopy = false;
3450 if (Start->getOpcode() == AMDGPU::COPY) {
3451 auto &Dst = Start->getOperand(i: 0);
3452 if (Dst.isReg()) {
3453 Register Reg = Dst.getReg();
3454 if (Reg.isPhysical() && MI.readsRegister(Reg, TRI)) {
3455 IsCopy = true;
3456 } else {
3457 // Also move the copy from the scratch rsrc descriptor into the loop
3458 // to allow it to be optimized away.
3459 auto &Src = Start->getOperand(i: 1);
3460 if (Src.isReg()) {
3461 Reg = Src.getReg();
3462 IsCopy = Info->getScratchRSrcReg() == Reg;
3463 }
3464 }
3465 }
3466 }
3467
3468 if (IsCopy) {
3469 LastCopy = Start;
3470 NonCopyInstrsLen = NonCopyInstrs.size();
3471 } else {
3472 NonCopyInstrs.push_back(Elt: &*Start);
3473 }
3474 }
3475 NonCopyInstrs.resize(N: NonCopyInstrsLen);
3476
3477 for (auto *NonCopy : reverse(C&: NonCopyInstrs)) {
3478 MBB->splice(Where: LastCopy, Other: MBB, From: NonCopy->getIterator());
3479 }
3480 Start = LastCopy;
3481
3482 // Do the same for copies after the loop
3483 NonCopyInstrs.clear();
3484 NonCopyInstrsLen = 0;
3485 MachineBasicBlock::iterator End(&MI);
3486 LastCopy = End;
3487 while (End->getOpcode() != FrameDestroyOpcode) {
3488 ++End;
3489 bool IsCopy = false;
3490 if (End->getOpcode() == AMDGPU::COPY) {
3491 auto &Src = End->getOperand(i: 1);
3492 if (Src.isReg()) {
3493 Register Reg = Src.getReg();
3494 IsCopy = Reg.isPhysical() && MI.modifiesRegister(Reg, TRI);
3495 }
3496 }
3497
3498 if (IsCopy) {
3499 LastCopy = End;
3500 NonCopyInstrsLen = NonCopyInstrs.size();
3501 } else {
3502 NonCopyInstrs.push_back(Elt: &*End);
3503 }
3504 }
3505 NonCopyInstrs.resize(N: NonCopyInstrsLen);
3506
3507 End = LastCopy;
3508 ++LastCopy;
3509 for (auto *NonCopy : reverse(C&: NonCopyInstrs)) {
3510 MBB->splice(Where: LastCopy, Other: MBB, From: NonCopy->getIterator());
3511 }
3512
3513 ++End;
3514 B.setInsertPt(MBB&: B.getMBB(), II: Start);
3515 executeInWaterfallLoop(B, Range: make_range(x: Start, y: End), SGPROperandRegs);
3516 break;
3517 }
3518 case AMDGPU::G_AMDGPU_FLAT_LOAD_MONITOR:
3519 case AMDGPU::G_AMDGPU_GLOBAL_LOAD_MONITOR:
3520 case AMDGPU::G_LOAD:
3521 case AMDGPU::G_ZEXTLOAD:
3522 case AMDGPU::G_SEXTLOAD: {
3523 if (applyMappingLoad(B, OpdMapper, MI))
3524 return;
3525 break;
3526 }
3527 case AMDGPU::G_DYN_STACKALLOC:
3528 applyMappingDynStackAlloc(B, OpdMapper, MI);
3529 return;
3530 case AMDGPU::G_STACKRESTORE: {
3531 applyDefaultMapping(OpdMapper);
3532 constrainOpWithReadfirstlane(B, MI, OpIdx: 0);
3533 return;
3534 }
3535 case AMDGPU::G_SBFX:
3536 applyMappingBFE(B, OpdMapper, /*Signed*/ true);
3537 return;
3538 case AMDGPU::G_UBFX:
3539 applyMappingBFE(B, OpdMapper, /*Signed*/ false);
3540 return;
3541 case AMDGPU::G_AMDGPU_MAD_U64_U32:
3542 case AMDGPU::G_AMDGPU_MAD_I64_I32:
3543 applyMappingMAD_64_32(B, OpdMapper);
3544 return;
3545 case AMDGPU::G_PREFETCH: {
3546 if (!Subtarget.hasSafeSmemPrefetch() && !Subtarget.hasVmemPrefInsts()) {
3547 MI.eraseFromParent();
3548 return;
3549 }
3550 Register PtrReg = MI.getOperand(i: 0).getReg();
3551 unsigned PtrBank = getRegBankID(Reg: PtrReg, MRI, Default: AMDGPU::SGPRRegBankID);
3552 if (PtrBank == AMDGPU::VGPRRegBankID &&
3553 (!Subtarget.hasVmemPrefInsts() || !MI.getOperand(i: 3).getImm())) {
3554 // Cannot do I$ prefetch with divergent pointer.
3555 MI.eraseFromParent();
3556 return;
3557 }
3558 unsigned AS = MRI.getType(Reg: PtrReg).getAddressSpace();
3559 if ((!AMDGPU::isFlatGlobalAddrSpace(AS) &&
3560 AS != AMDGPUAS::CONSTANT_ADDRESS_32BIT) ||
3561 (!Subtarget.hasSafeSmemPrefetch() &&
3562 (AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT ||
3563 !MI.getOperand(i: 3).getImm() /* I$ prefetch */))) {
3564 MI.eraseFromParent();
3565 return;
3566 }
3567 applyDefaultMapping(OpdMapper);
3568 return;
3569 }
3570 default:
3571 break;
3572 }
3573
3574 return applyDefaultMapping(OpdMapper);
3575}
3576
3577// vgpr, sgpr -> vgpr
3578// vgpr, agpr -> vgpr
3579// agpr, agpr -> agpr
3580// agpr, sgpr -> vgpr
3581static unsigned regBankUnion(unsigned RB0, unsigned RB1) {
3582 if (RB0 == AMDGPU::InvalidRegBankID)
3583 return RB1;
3584 if (RB1 == AMDGPU::InvalidRegBankID)
3585 return RB0;
3586
3587 if (RB0 == AMDGPU::SGPRRegBankID && RB1 == AMDGPU::SGPRRegBankID)
3588 return AMDGPU::SGPRRegBankID;
3589
3590 if (RB0 == AMDGPU::AGPRRegBankID && RB1 == AMDGPU::AGPRRegBankID)
3591 return AMDGPU::AGPRRegBankID;
3592
3593 return AMDGPU::VGPRRegBankID;
3594}
3595
3596static unsigned regBankBoolUnion(unsigned RB0, unsigned RB1) {
3597 if (RB0 == AMDGPU::InvalidRegBankID)
3598 return RB1;
3599 if (RB1 == AMDGPU::InvalidRegBankID)
3600 return RB0;
3601
3602 // vcc, vcc -> vcc
3603 // vcc, sgpr -> vcc
3604 // vcc, vgpr -> vcc
3605 if (RB0 == AMDGPU::VCCRegBankID || RB1 == AMDGPU::VCCRegBankID)
3606 return AMDGPU::VCCRegBankID;
3607
3608 // vcc, vgpr -> vgpr
3609 return regBankUnion(RB0, RB1);
3610}
3611
3612unsigned AMDGPURegisterBankInfo::getMappingType(const MachineRegisterInfo &MRI,
3613 const MachineInstr &MI) const {
3614 unsigned RegBank = AMDGPU::InvalidRegBankID;
3615
3616 for (const MachineOperand &MO : MI.operands()) {
3617 if (!MO.isReg())
3618 continue;
3619 Register Reg = MO.getReg();
3620 if (const RegisterBank *Bank = getRegBank(Reg, MRI, TRI: *TRI)) {
3621 RegBank = regBankUnion(RB0: RegBank, RB1: Bank->getID());
3622 if (RegBank == AMDGPU::VGPRRegBankID)
3623 break;
3624 }
3625 }
3626
3627 return RegBank;
3628}
3629
3630bool AMDGPURegisterBankInfo::isSALUMapping(const MachineInstr &MI) const {
3631 const MachineFunction &MF = *MI.getMF();
3632 const MachineRegisterInfo &MRI = MF.getRegInfo();
3633 for (const MachineOperand &MO : MI.operands()) {
3634 if (!MO.isReg())
3635 continue;
3636 Register Reg = MO.getReg();
3637 if (const RegisterBank *Bank = getRegBank(Reg, MRI, TRI: *TRI)) {
3638 if (Bank->getID() != AMDGPU::SGPRRegBankID)
3639 return false;
3640 }
3641 }
3642 return true;
3643}
3644
3645const RegisterBankInfo::InstructionMapping &
3646AMDGPURegisterBankInfo::getDefaultMappingSOP(const MachineInstr &MI) const {
3647 const MachineFunction &MF = *MI.getMF();
3648 const MachineRegisterInfo &MRI = MF.getRegInfo();
3649 SmallVector<const ValueMapping*, 8> OpdsMapping(MI.getNumOperands());
3650
3651 for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
3652 const MachineOperand &SrcOp = MI.getOperand(i);
3653 if (!SrcOp.isReg())
3654 continue;
3655
3656 unsigned Size = getSizeInBits(Reg: SrcOp.getReg(), MRI, TRI: *TRI);
3657 OpdsMapping[i] = AMDGPU::getValueMapping(BankID: AMDGPU::SGPRRegBankID, Size);
3658 }
3659 return getInstructionMapping(ID: 1, Cost: 1, OperandsMapping: getOperandsMapping(OpdsMapping),
3660 NumOperands: MI.getNumOperands());
3661}
3662
3663const RegisterBankInfo::InstructionMapping &
3664AMDGPURegisterBankInfo::getDefaultMappingVOP(const MachineInstr &MI) const {
3665 const MachineFunction &MF = *MI.getMF();
3666 const MachineRegisterInfo &MRI = MF.getRegInfo();
3667 SmallVector<const ValueMapping*, 8> OpdsMapping(MI.getNumOperands());
3668
3669 // Even though we technically could use SGPRs, this would require knowledge of
3670 // the constant bus restriction. Force all sources to VGPR (except for VCC).
3671 //
3672 // TODO: Unary ops are trivially OK, so accept SGPRs?
3673 for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
3674 const MachineOperand &Src = MI.getOperand(i);
3675 if (!Src.isReg())
3676 continue;
3677
3678 unsigned Size = getSizeInBits(Reg: Src.getReg(), MRI, TRI: *TRI);
3679 unsigned BankID = Size == 1 ? AMDGPU::VCCRegBankID : AMDGPU::VGPRRegBankID;
3680 OpdsMapping[i] = AMDGPU::getValueMapping(BankID, Size);
3681 }
3682
3683 return getInstructionMapping(ID: 1, Cost: 1, OperandsMapping: getOperandsMapping(OpdsMapping),
3684 NumOperands: MI.getNumOperands());
3685}
3686
3687const RegisterBankInfo::InstructionMapping &
3688AMDGPURegisterBankInfo::getDefaultMappingAllVGPR(const MachineInstr &MI) const {
3689 const MachineFunction &MF = *MI.getMF();
3690 const MachineRegisterInfo &MRI = MF.getRegInfo();
3691 SmallVector<const ValueMapping*, 8> OpdsMapping(MI.getNumOperands());
3692
3693 for (unsigned I = 0, E = MI.getNumOperands(); I != E; ++I) {
3694 const MachineOperand &Op = MI.getOperand(i: I);
3695 if (!Op.isReg())
3696 continue;
3697
3698 unsigned Size = getSizeInBits(Reg: Op.getReg(), MRI, TRI: *TRI);
3699 OpdsMapping[I] = AMDGPU::getValueMapping(BankID: AMDGPU::VGPRRegBankID, Size);
3700 }
3701
3702 return getInstructionMapping(ID: 1, Cost: 1, OperandsMapping: getOperandsMapping(OpdsMapping),
3703 NumOperands: MI.getNumOperands());
3704}
3705
3706const RegisterBankInfo::InstructionMapping &
3707AMDGPURegisterBankInfo::getImageMapping(const MachineRegisterInfo &MRI,
3708 const MachineInstr &MI,
3709 int RsrcIdx) const {
3710 // The reported argument index is relative to the IR intrinsic call arguments,
3711 // so we need to shift by the number of defs and the intrinsic ID.
3712 RsrcIdx += MI.getNumExplicitDefs() + 1;
3713
3714 const int NumOps = MI.getNumOperands();
3715 SmallVector<const ValueMapping *, 8> OpdsMapping(NumOps);
3716
3717 // TODO: Should packed/unpacked D16 difference be reported here as part of
3718 // the value mapping?
3719 for (int I = 0; I != NumOps; ++I) {
3720 if (!MI.getOperand(i: I).isReg())
3721 continue;
3722
3723 Register OpReg = MI.getOperand(i: I).getReg();
3724 // We replace some dead address operands with $noreg
3725 if (!OpReg)
3726 continue;
3727
3728 unsigned Size = getSizeInBits(Reg: OpReg, MRI, TRI: *TRI);
3729
3730 // FIXME: Probably need a new intrinsic register bank searchable table to
3731 // handle arbitrary intrinsics easily.
3732 //
3733 // If this has a sampler, it immediately follows rsrc.
3734 const bool MustBeSGPR = I == RsrcIdx || I == RsrcIdx + 1;
3735
3736 if (MustBeSGPR) {
3737 // If this must be an SGPR, so we must report whatever it is as legal.
3738 unsigned NewBank = getRegBankID(Reg: OpReg, MRI, Default: AMDGPU::SGPRRegBankID);
3739 OpdsMapping[I] = AMDGPU::getValueMapping(BankID: NewBank, Size);
3740 } else {
3741 // Some operands must be VGPR, and these are easy to copy to.
3742 OpdsMapping[I] = AMDGPU::getValueMapping(BankID: AMDGPU::VGPRRegBankID, Size);
3743 }
3744 }
3745
3746 return getInstructionMapping(ID: 1, Cost: 1, OperandsMapping: getOperandsMapping(OpdsMapping), NumOperands: NumOps);
3747}
3748
3749/// Return the mapping for a pointer argument.
3750const RegisterBankInfo::ValueMapping *
3751AMDGPURegisterBankInfo::getValueMappingForPtr(const MachineRegisterInfo &MRI,
3752 Register PtrReg) const {
3753 LLT PtrTy = MRI.getType(Reg: PtrReg);
3754 unsigned Size = PtrTy.getSizeInBits();
3755 if (Subtarget.useFlatForGlobal() ||
3756 !AMDGPU::isFlatGlobalAddrSpace(AS: PtrTy.getAddressSpace()))
3757 return AMDGPU::getValueMapping(BankID: AMDGPU::VGPRRegBankID, Size);
3758
3759 // If we're using MUBUF instructions for global memory, an SGPR base register
3760 // is possible. Otherwise this needs to be a VGPR.
3761 const RegisterBank *PtrBank = getRegBank(Reg: PtrReg, MRI, TRI: *TRI);
3762 return AMDGPU::getValueMapping(BankID: PtrBank->getID(), Size);
3763}
3764
3765const RegisterBankInfo::InstructionMapping &
3766AMDGPURegisterBankInfo::getInstrMappingForLoad(const MachineInstr &MI) const {
3767
3768 const MachineFunction &MF = *MI.getMF();
3769 const MachineRegisterInfo &MRI = MF.getRegInfo();
3770 SmallVector<const ValueMapping*, 2> OpdsMapping(2);
3771 unsigned Size = getSizeInBits(Reg: MI.getOperand(i: 0).getReg(), MRI, TRI: *TRI);
3772 Register PtrReg = MI.getOperand(i: 1).getReg();
3773 LLT PtrTy = MRI.getType(Reg: PtrReg);
3774 unsigned AS = PtrTy.getAddressSpace();
3775 unsigned PtrSize = PtrTy.getSizeInBits();
3776
3777 const ValueMapping *ValMapping;
3778 const ValueMapping *PtrMapping;
3779
3780 const RegisterBank *PtrBank = getRegBank(Reg: PtrReg, MRI, TRI: *TRI);
3781
3782 if (PtrBank == &AMDGPU::SGPRRegBank && AMDGPU::isFlatGlobalAddrSpace(AS)) {
3783 if (isScalarLoadLegal(MI)) {
3784 // We have a uniform instruction so we want to use an SMRD load
3785 ValMapping = AMDGPU::getValueMapping(BankID: AMDGPU::SGPRRegBankID, Size);
3786 PtrMapping = AMDGPU::getValueMapping(BankID: AMDGPU::SGPRRegBankID, Size: PtrSize);
3787 } else {
3788 ValMapping = AMDGPU::getValueMapping(BankID: AMDGPU::VGPRRegBankID, Size);
3789
3790 // If we're using MUBUF instructions for global memory, an SGPR base
3791 // register is possible. Otherwise this needs to be a VGPR.
3792 unsigned PtrBankID = Subtarget.useFlatForGlobal() ?
3793 AMDGPU::VGPRRegBankID : AMDGPU::SGPRRegBankID;
3794
3795 PtrMapping = AMDGPU::getValueMapping(BankID: PtrBankID, Size: PtrSize);
3796 }
3797 } else {
3798 ValMapping = AMDGPU::getValueMapping(BankID: AMDGPU::VGPRRegBankID, Size);
3799 PtrMapping = AMDGPU::getValueMapping(BankID: AMDGPU::VGPRRegBankID, Size: PtrSize);
3800 }
3801
3802 OpdsMapping[0] = ValMapping;
3803 OpdsMapping[1] = PtrMapping;
3804 const RegisterBankInfo::InstructionMapping &Mapping = getInstructionMapping(
3805 ID: 1, Cost: 1, OperandsMapping: getOperandsMapping(OpdsMapping), NumOperands: MI.getNumOperands());
3806 return Mapping;
3807
3808 // FIXME: Do we want to add a mapping for FLAT load, or should we just
3809 // handle that during instruction selection?
3810}
3811
3812unsigned
3813AMDGPURegisterBankInfo::getRegBankID(Register Reg,
3814 const MachineRegisterInfo &MRI,
3815 unsigned Default) const {
3816 const RegisterBank *Bank = getRegBank(Reg, MRI, TRI: *TRI);
3817 return Bank ? Bank->getID() : Default;
3818}
3819
3820const RegisterBankInfo::ValueMapping *
3821AMDGPURegisterBankInfo::getSGPROpMapping(Register Reg,
3822 const MachineRegisterInfo &MRI,
3823 const TargetRegisterInfo &TRI) const {
3824 // Lie and claim anything is legal, even though this needs to be an SGPR
3825 // applyMapping will have to deal with it as a waterfall loop.
3826 unsigned Bank = getRegBankID(Reg, MRI, Default: AMDGPU::SGPRRegBankID);
3827 unsigned Size = getSizeInBits(Reg, MRI, TRI);
3828 return AMDGPU::getValueMapping(BankID: Bank, Size);
3829}
3830
3831const RegisterBankInfo::ValueMapping *
3832AMDGPURegisterBankInfo::getVGPROpMapping(Register Reg,
3833 const MachineRegisterInfo &MRI,
3834 const TargetRegisterInfo &TRI) const {
3835 unsigned Size = getSizeInBits(Reg, MRI, TRI);
3836 return AMDGPU::getValueMapping(BankID: AMDGPU::VGPRRegBankID, Size);
3837}
3838
3839const RegisterBankInfo::ValueMapping *
3840AMDGPURegisterBankInfo::getAGPROpMapping(Register Reg,
3841 const MachineRegisterInfo &MRI,
3842 const TargetRegisterInfo &TRI) const {
3843 unsigned Size = getSizeInBits(Reg, MRI, TRI);
3844 return AMDGPU::getValueMapping(BankID: AMDGPU::AGPRRegBankID, Size);
3845}
3846
3847///
/// This function must return a legal mapping, because
/// AMDGPURegisterBankInfo::getInstrAlternativeMappings() is not called
/// in RegBankSelect::Mode::Fast. Any mapping that would cause a
/// VGPR to SGPR copy to be generated is illegal.
3852///
3853// Operands that must be SGPRs must accept potentially divergent VGPRs as
3854// legal. These will be dealt with in applyMappingImpl.
3855//
3856const RegisterBankInfo::InstructionMapping &
3857AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
3858 const MachineFunction &MF = *MI.getMF();
3859 const MachineRegisterInfo &MRI = MF.getRegInfo();
3860
3861 if (MI.isCopy() || MI.getOpcode() == AMDGPU::G_FREEZE) {
3862 Register DstReg = MI.getOperand(i: 0).getReg();
3863 Register SrcReg = MI.getOperand(i: 1).getReg();
3864
3865 // The default logic bothers to analyze impossible alternative mappings. We
3866 // want the most straightforward mapping, so just directly handle this.
3867 const RegisterBank *DstBank = getRegBank(Reg: DstReg, MRI, TRI: *TRI);
3868 const RegisterBank *SrcBank = getRegBank(Reg: SrcReg, MRI, TRI: *TRI);
3869
3870 // For COPY between a physical reg and an s1, there is no type associated so
3871 // we need to take the virtual register's type as a hint on how to interpret
3872 // s1 values.
3873 unsigned Size;
3874 if (!SrcReg.isVirtual() && !DstBank &&
3875 MRI.getType(Reg: DstReg) == LLT::scalar(SizeInBits: 1)) {
3876 DstBank = &AMDGPU::VCCRegBank;
3877 Size = 1;
3878 } else if (!DstReg.isVirtual() && MRI.getType(Reg: SrcReg) == LLT::scalar(SizeInBits: 1)) {
3879 DstBank = &AMDGPU::VCCRegBank;
3880 Size = 1;
3881 } else {
3882 Size = getSizeInBits(Reg: DstReg, MRI, TRI: *TRI);
3883 }
3884
3885 if (!DstBank)
3886 DstBank = SrcBank;
3887 else if (!SrcBank)
3888 SrcBank = DstBank;
3889
3890 if (MI.getOpcode() != AMDGPU::G_FREEZE &&
3891 cannotCopy(Dst: *DstBank, Src: *SrcBank, Size: TypeSize::getFixed(ExactSize: Size)))
3892 return getInvalidInstructionMapping();
3893
3894 const ValueMapping &ValMap = getValueMapping(StartIdx: 0, Length: Size, RegBank: *DstBank);
3895 unsigned OpdsMappingSize = MI.isCopy() ? 1 : 2;
3896 SmallVector<const ValueMapping *, 1> OpdsMapping(OpdsMappingSize);
3897 OpdsMapping[0] = &ValMap;
3898 if (MI.getOpcode() == AMDGPU::G_FREEZE)
3899 OpdsMapping[1] = &ValMap;
3900
3901 return getInstructionMapping(
3902 ID: 1, /*Cost*/ 1,
3903 /*OperandsMapping*/ getOperandsMapping(OpdsMapping), NumOperands: OpdsMappingSize);
3904 }
3905
3906 if (MI.isRegSequence()) {
3907 // If any input is a VGPR, the result must be a VGPR. The default handling
3908 // assumes any copy between banks is legal.
3909 unsigned BankID = AMDGPU::SGPRRegBankID;
3910
3911 for (unsigned I = 1, E = MI.getNumOperands(); I != E; I += 2) {
3912 auto OpBank = getRegBankID(Reg: MI.getOperand(i: I).getReg(), MRI);
3913 // It doesn't make sense to use vcc or scc banks here, so just ignore
3914 // them.
3915 if (OpBank != AMDGPU::SGPRRegBankID) {
3916 BankID = AMDGPU::VGPRRegBankID;
3917 break;
3918 }
3919 }
3920 unsigned Size = getSizeInBits(Reg: MI.getOperand(i: 0).getReg(), MRI, TRI: *TRI);
3921
3922 const ValueMapping &ValMap = getValueMapping(StartIdx: 0, Length: Size, RegBank: getRegBank(ID: BankID));
3923 return getInstructionMapping(
3924 ID: 1, /*Cost*/ 1,
3925 /*OperandsMapping*/ getOperandsMapping(OpdsMapping: {&ValMap}), NumOperands: 1);
3926 }
3927
3928 // The default handling is broken and doesn't handle illegal SGPR->VGPR copies
3929 // properly.
3930 //
3931 // TODO: There are additional exec masking dependencies to analyze.
3932 if (auto *PHI = dyn_cast<GPhi>(Val: &MI)) {
3933 unsigned ResultBank = AMDGPU::InvalidRegBankID;
3934 Register DstReg = PHI->getReg(Idx: 0);
3935
3936 // Sometimes the result may have already been assigned a bank.
3937 if (const RegisterBank *DstBank = getRegBank(Reg: DstReg, MRI, TRI: *TRI))
3938 ResultBank = DstBank->getID();
3939
3940 for (unsigned I = 0; I < PHI->getNumIncomingValues(); ++I) {
3941 Register Reg = PHI->getIncomingValue(I);
3942 const RegisterBank *Bank = getRegBank(Reg, MRI, TRI: *TRI);
3943
3944 // FIXME: Assuming VGPR for any undetermined inputs.
3945 if (!Bank || Bank->getID() == AMDGPU::VGPRRegBankID) {
3946 ResultBank = AMDGPU::VGPRRegBankID;
3947 break;
3948 }
3949
3950 // FIXME: Need to promote SGPR case to s32
3951 unsigned OpBank = Bank->getID();
3952 ResultBank = regBankBoolUnion(RB0: ResultBank, RB1: OpBank);
3953 }
3954
3955 assert(ResultBank != AMDGPU::InvalidRegBankID);
3956
3957 unsigned Size = MRI.getType(Reg: DstReg).getSizeInBits();
3958
3959 const ValueMapping &ValMap =
3960 getValueMapping(StartIdx: 0, Length: Size, RegBank: getRegBank(ID: ResultBank));
3961 return getInstructionMapping(
3962 ID: 1, /*Cost*/ 1,
3963 /*OperandsMapping*/ getOperandsMapping(OpdsMapping: {&ValMap}), NumOperands: 1);
3964 }
3965
3966 const RegisterBankInfo::InstructionMapping &Mapping = getInstrMappingImpl(MI);
3967 if (Mapping.isValid())
3968 return Mapping;
3969
3970 SmallVector<const ValueMapping*, 8> OpdsMapping(MI.getNumOperands());
3971
3972 switch (MI.getOpcode()) {
3973 default:
3974 return getInvalidInstructionMapping();
3975
3976 case AMDGPU::G_AND:
3977 case AMDGPU::G_OR:
3978 case AMDGPU::G_XOR:
3979 case AMDGPU::G_MUL: {
3980 unsigned Size = MRI.getType(Reg: MI.getOperand(i: 0).getReg()).getSizeInBits();
3981 if (Size == 1) {
3982 const RegisterBank *DstBank
3983 = getRegBank(Reg: MI.getOperand(i: 0).getReg(), MRI, TRI: *TRI);
3984
3985 unsigned TargetBankID = AMDGPU::InvalidRegBankID;
3986 unsigned BankLHS = AMDGPU::InvalidRegBankID;
3987 unsigned BankRHS = AMDGPU::InvalidRegBankID;
3988 if (DstBank) {
3989 TargetBankID = DstBank->getID();
3990 if (DstBank == &AMDGPU::VCCRegBank) {
3991 TargetBankID = AMDGPU::VCCRegBankID;
3992 BankLHS = AMDGPU::VCCRegBankID;
3993 BankRHS = AMDGPU::VCCRegBankID;
3994 } else {
3995 BankLHS = getRegBankID(Reg: MI.getOperand(i: 1).getReg(), MRI,
3996 Default: AMDGPU::SGPRRegBankID);
3997 BankRHS = getRegBankID(Reg: MI.getOperand(i: 2).getReg(), MRI,
3998 Default: AMDGPU::SGPRRegBankID);
3999 }
4000 } else {
4001 BankLHS = getRegBankID(Reg: MI.getOperand(i: 1).getReg(), MRI,
4002 Default: AMDGPU::VCCRegBankID);
4003 BankRHS = getRegBankID(Reg: MI.getOperand(i: 2).getReg(), MRI,
4004 Default: AMDGPU::VCCRegBankID);
4005
4006 // Both inputs should be true booleans to produce a boolean result.
4007 if (BankLHS == AMDGPU::VGPRRegBankID || BankRHS == AMDGPU::VGPRRegBankID) {
4008 TargetBankID = AMDGPU::VGPRRegBankID;
4009 } else if (BankLHS == AMDGPU::VCCRegBankID || BankRHS == AMDGPU::VCCRegBankID) {
4010 TargetBankID = AMDGPU::VCCRegBankID;
4011 BankLHS = AMDGPU::VCCRegBankID;
4012 BankRHS = AMDGPU::VCCRegBankID;
4013 } else if (BankLHS == AMDGPU::SGPRRegBankID && BankRHS == AMDGPU::SGPRRegBankID) {
4014 TargetBankID = AMDGPU::SGPRRegBankID;
4015 }
4016 }
4017
4018 OpdsMapping[0] = AMDGPU::getValueMapping(BankID: TargetBankID, Size);
4019 OpdsMapping[1] = AMDGPU::getValueMapping(BankID: BankLHS, Size);
4020 OpdsMapping[2] = AMDGPU::getValueMapping(BankID: BankRHS, Size);
4021 break;
4022 }
4023
4024 if (Size == 64) {
4025
4026 if (isSALUMapping(MI)) {
4027 OpdsMapping[0] = getValueMappingSGPR64Only(BankID: AMDGPU::SGPRRegBankID, Size);
4028 OpdsMapping[1] = OpdsMapping[2] = OpdsMapping[0];
4029 } else {
4030 if (MI.getOpcode() == AMDGPU::G_MUL && Subtarget.hasVMulU64Inst())
4031 OpdsMapping[0] = AMDGPU::getValueMapping(BankID: AMDGPU::VGPRRegBankID, Size);
4032 else
4033 OpdsMapping[0] =
4034 getValueMappingSGPR64Only(BankID: AMDGPU::VGPRRegBankID, Size);
4035 unsigned Bank1 = getRegBankID(Reg: MI.getOperand(i: 1).getReg(), MRI /*, DefaultBankID*/);
4036 OpdsMapping[1] = AMDGPU::getValueMapping(BankID: Bank1, Size);
4037
4038 unsigned Bank2 = getRegBankID(Reg: MI.getOperand(i: 2).getReg(), MRI /*, DefaultBankID*/);
4039 OpdsMapping[2] = AMDGPU::getValueMapping(BankID: Bank2, Size);
4040 }
4041
4042 break;
4043 }
4044
4045 [[fallthrough]];
4046 }
4047 case AMDGPU::G_PTR_ADD:
4048 case AMDGPU::G_PTRMASK:
4049 case AMDGPU::G_ADD:
4050 case AMDGPU::G_SUB:
4051 case AMDGPU::G_SHL:
4052 case AMDGPU::G_LSHR:
4053 case AMDGPU::G_ASHR:
4054 case AMDGPU::G_UADDO:
4055 case AMDGPU::G_USUBO:
4056 case AMDGPU::G_UADDE:
4057 case AMDGPU::G_SADDE:
4058 case AMDGPU::G_USUBE:
4059 case AMDGPU::G_SSUBE:
4060 case AMDGPU::G_ABS:
4061 case AMDGPU::G_SHUFFLE_VECTOR:
4062 case AMDGPU::G_SBFX:
4063 case AMDGPU::G_UBFX:
4064 case AMDGPU::G_AMDGPU_S_MUL_I64_I32:
4065 case AMDGPU::G_AMDGPU_S_MUL_U64_U32:
4066 if (isSALUMapping(MI)) {
4067 LLT Ty = MRI.getType(Reg: MI.getOperand(i: 0).getReg());
4068 unsigned Size = Ty.getSizeInBits();
4069 // Packed add and sub are VALU only.
4070 if (Subtarget.hasPackedU64Ops() && Ty.isVector() && Size == 128)
4071 return getDefaultMappingVOP(MI);
4072 return getDefaultMappingSOP(MI);
4073 }
4074 return getDefaultMappingVOP(MI);
4075 case AMDGPU::G_SMIN:
4076 case AMDGPU::G_SMAX:
4077 case AMDGPU::G_UMIN:
4078 case AMDGPU::G_UMAX:
4079 if (isSALUMapping(MI)) {
4080 // There are no scalar 64-bit min and max, use vector instruction instead.
4081 if (MRI.getType(Reg: MI.getOperand(i: 0).getReg()).getSizeInBits() == 64 &&
4082 Subtarget.hasMinMaxI64Insts())
4083 return getDefaultMappingVOP(MI);
4084 return getDefaultMappingSOP(MI);
4085 }
4086 return getDefaultMappingVOP(MI);
4087 case AMDGPU::G_FADD:
4088 case AMDGPU::G_FSUB:
4089 case AMDGPU::G_FMUL:
4090 case AMDGPU::G_FMA:
4091 case AMDGPU::G_FFLOOR:
4092 case AMDGPU::G_FCEIL:
4093 case AMDGPU::G_INTRINSIC_ROUNDEVEN:
4094 case AMDGPU::G_FMINNUM:
4095 case AMDGPU::G_FMAXNUM:
4096 case AMDGPU::G_FMINIMUMNUM:
4097 case AMDGPU::G_FMAXIMUMNUM:
4098 case AMDGPU::G_INTRINSIC_TRUNC:
4099 case AMDGPU::G_STRICT_FADD:
4100 case AMDGPU::G_STRICT_FSUB:
4101 case AMDGPU::G_STRICT_FMUL:
4102 case AMDGPU::G_STRICT_FMA: {
4103 LLT Ty = MRI.getType(Reg: MI.getOperand(i: 0).getReg());
4104 unsigned Size = Ty.getSizeInBits();
4105 if (Subtarget.hasSALUFloatInsts() && Ty.isScalar() &&
4106 (Size == 32 || Size == 16) && isSALUMapping(MI))
4107 return getDefaultMappingSOP(MI);
4108 return getDefaultMappingVOP(MI);
4109 }
4110 case AMDGPU::G_FMINIMUM:
4111 case AMDGPU::G_FMAXIMUM: {
4112 LLT Ty = MRI.getType(Reg: MI.getOperand(i: 0).getReg());
4113 unsigned Size = Ty.getSizeInBits();
4114 if (Subtarget.hasSALUMinimumMaximumInsts() && Ty.isScalar() &&
4115 (Size == 32 || Size == 16) && isSALUMapping(MI))
4116 return getDefaultMappingSOP(MI);
4117 return getDefaultMappingVOP(MI);
4118 }
4119 case AMDGPU::G_FPTOSI:
4120 case AMDGPU::G_FPTOUI:
4121 case AMDGPU::G_FPTOSI_SAT:
4122 case AMDGPU::G_FPTOUI_SAT:
4123 case AMDGPU::G_SITOFP:
4124 case AMDGPU::G_UITOFP: {
4125 unsigned SizeDst = MRI.getType(Reg: MI.getOperand(i: 0).getReg()).getSizeInBits();
4126 unsigned SizeSrc = MRI.getType(Reg: MI.getOperand(i: 1).getReg()).getSizeInBits();
4127 if (Subtarget.hasSALUFloatInsts() && SizeDst == 32 && SizeSrc == 32 &&
4128 isSALUMapping(MI))
4129 return getDefaultMappingSOP(MI);
4130 return getDefaultMappingVOP(MI);
4131 }
4132 case AMDGPU::G_FPTRUNC:
4133 case AMDGPU::G_FPEXT: {
4134 unsigned SizeDst = MRI.getType(Reg: MI.getOperand(i: 0).getReg()).getSizeInBits();
4135 unsigned SizeSrc = MRI.getType(Reg: MI.getOperand(i: 1).getReg()).getSizeInBits();
4136 if (Subtarget.hasSALUFloatInsts() && SizeDst != 64 && SizeSrc != 64 &&
4137 isSALUMapping(MI))
4138 return getDefaultMappingSOP(MI);
4139 return getDefaultMappingVOP(MI);
4140 }
4141 case AMDGPU::G_FSQRT:
4142 case AMDGPU::G_FEXP2:
4143 case AMDGPU::G_FLOG2: {
4144 unsigned Size = MRI.getType(Reg: MI.getOperand(i: 0).getReg()).getSizeInBits();
4145 if (Subtarget.hasPseudoScalarTrans() && (Size == 16 || Size == 32) &&
4146 isSALUMapping(MI))
4147 return getDefaultMappingSOP(MI);
4148 return getDefaultMappingVOP(MI);
4149 }
4150 case AMDGPU::G_SADDSAT: // FIXME: Could lower sat ops for SALU
4151 case AMDGPU::G_SSUBSAT:
4152 case AMDGPU::G_UADDSAT:
4153 case AMDGPU::G_USUBSAT:
4154 case AMDGPU::G_FMAD:
4155 case AMDGPU::G_FLDEXP:
4156 case AMDGPU::G_FMINNUM_IEEE:
4157 case AMDGPU::G_FMAXNUM_IEEE:
4158 case AMDGPU::G_FCANONICALIZE:
4159 case AMDGPU::G_STRICT_FLDEXP:
4160 case AMDGPU::G_BSWAP: // TODO: Somehow expand for scalar?
4161 case AMDGPU::G_FSHR: // TODO: Expand for scalar
4162 case AMDGPU::G_AMDGPU_FMIN_LEGACY:
4163 case AMDGPU::G_AMDGPU_FMAX_LEGACY:
4164 case AMDGPU::G_AMDGPU_RCP_IFLAG:
4165 case AMDGPU::G_AMDGPU_CVT_F32_UBYTE0:
4166 case AMDGPU::G_AMDGPU_CVT_F32_UBYTE1:
4167 case AMDGPU::G_AMDGPU_CVT_F32_UBYTE2:
4168 case AMDGPU::G_AMDGPU_CVT_F32_UBYTE3:
4169 case AMDGPU::G_AMDGPU_CVT_PK_I16_I32:
4170 case AMDGPU::G_AMDGPU_SMED3:
4171 case AMDGPU::G_AMDGPU_FMED3:
4172 return getDefaultMappingVOP(MI);
4173 case AMDGPU::G_UMULH:
4174 case AMDGPU::G_SMULH: {
4175 if (Subtarget.hasScalarMulHiInsts() && isSALUMapping(MI))
4176 return getDefaultMappingSOP(MI);
4177 return getDefaultMappingVOP(MI);
4178 }
4179 case AMDGPU::G_AMDGPU_MAD_U64_U32:
4180 case AMDGPU::G_AMDGPU_MAD_I64_I32: {
4181 // Three possible mappings:
4182 //
4183 // - Default SOP
4184 // - Default VOP
4185 // - Scalar multiply: src0 and src1 are SGPRs, the rest is VOP.
4186 //
4187 // This allows instruction selection to keep the multiplication part of the
4188 // instruction on the SALU.
4189 bool AllSalu = true;
4190 bool MulSalu = true;
4191 for (unsigned i = 0; i < 5; ++i) {
4192 Register Reg = MI.getOperand(i).getReg();
4193 if (const RegisterBank *Bank = getRegBank(Reg, MRI, TRI: *TRI)) {
4194 if (Bank->getID() != AMDGPU::SGPRRegBankID) {
4195 AllSalu = false;
4196 if (i == 2 || i == 3) {
4197 MulSalu = false;
4198 break;
4199 }
4200 }
4201 }
4202 }
4203
4204 if (AllSalu)
4205 return getDefaultMappingSOP(MI);
4206
4207 // If the multiply-add is full-rate in VALU, use that even if the
4208 // multiplication part is scalar. Accumulating separately on the VALU would
4209 // take two instructions.
4210 if (!MulSalu || Subtarget.hasFullRate64Ops())
4211 return getDefaultMappingVOP(MI);
4212
4213 // Keep the multiplication on the SALU, then accumulate on the VALU.
4214 OpdsMapping[0] = AMDGPU::getValueMapping(BankID: AMDGPU::VGPRRegBankID, Size: 64);
4215 OpdsMapping[1] = AMDGPU::getValueMapping(BankID: AMDGPU::VCCRegBankID, Size: 1);
4216 OpdsMapping[2] = AMDGPU::getValueMapping(BankID: AMDGPU::SGPRRegBankID, Size: 32);
4217 OpdsMapping[3] = AMDGPU::getValueMapping(BankID: AMDGPU::SGPRRegBankID, Size: 32);
4218 OpdsMapping[4] = AMDGPU::getValueMapping(BankID: AMDGPU::VGPRRegBankID, Size: 64);
4219 break;
4220 }
4221 case AMDGPU::G_IMPLICIT_DEF: {
4222 unsigned Size = MRI.getType(Reg: MI.getOperand(i: 0).getReg()).getSizeInBits();
4223 OpdsMapping[0] = AMDGPU::getValueMapping(BankID: AMDGPU::SGPRRegBankID, Size);
4224 break;
4225 }
4226 case AMDGPU::G_FCONSTANT:
4227 case AMDGPU::G_CONSTANT:
4228 case AMDGPU::G_GLOBAL_VALUE:
4229 case AMDGPU::G_FRAME_INDEX:
4230 case AMDGPU::G_BLOCK_ADDR:
4231 case AMDGPU::G_READSTEADYCOUNTER:
4232 case AMDGPU::G_READCYCLECOUNTER: {
4233 unsigned Size = MRI.getType(Reg: MI.getOperand(i: 0).getReg()).getSizeInBits();
4234 OpdsMapping[0] = AMDGPU::getValueMapping(BankID: AMDGPU::SGPRRegBankID, Size);
4235 break;
4236 }
4237 case AMDGPU::G_DYN_STACKALLOC: {
4238 // Result is always uniform, and a wave reduction is needed for the source.
4239 OpdsMapping[0] = AMDGPU::getValueMapping(BankID: AMDGPU::SGPRRegBankID, Size: 32);
4240 unsigned SrcBankID = getRegBankID(Reg: MI.getOperand(i: 1).getReg(), MRI);
4241 OpdsMapping[1] = AMDGPU::getValueMapping(BankID: SrcBankID, Size: 32);
4242 break;
4243 }
4244 case AMDGPU::G_AMDGPU_WAVE_ADDRESS: {
4245 // This case is weird because we expect a physical register in the source,
4246 // but need to set a bank anyway.
4247 //
4248 // TODO: We could select the result to SGPR or VGPR
4249 OpdsMapping[0] = AMDGPU::getValueMapping(BankID: AMDGPU::SGPRRegBankID, Size: 32);
4250 OpdsMapping[1] = AMDGPU::getValueMapping(BankID: AMDGPU::SGPRRegBankID, Size: 32);
4251 break;
4252 }
4253 case AMDGPU::G_INSERT: {
4254 unsigned BankID = getMappingType(MRI, MI);
4255 unsigned DstSize = getSizeInBits(Reg: MI.getOperand(i: 0).getReg(), MRI, TRI: *TRI);
4256 unsigned SrcSize = getSizeInBits(Reg: MI.getOperand(i: 1).getReg(), MRI, TRI: *TRI);
4257 unsigned EltSize = getSizeInBits(Reg: MI.getOperand(i: 2).getReg(), MRI, TRI: *TRI);
4258 OpdsMapping[0] = AMDGPU::getValueMapping(BankID, Size: DstSize);
4259 OpdsMapping[1] = AMDGPU::getValueMapping(BankID, Size: SrcSize);
4260 OpdsMapping[2] = AMDGPU::getValueMapping(BankID, Size: EltSize);
4261 OpdsMapping[3] = nullptr;
4262 break;
4263 }
4264 case AMDGPU::G_EXTRACT: {
4265 unsigned BankID = getRegBankID(Reg: MI.getOperand(i: 1).getReg(), MRI);
4266 unsigned DstSize = getSizeInBits(Reg: MI.getOperand(i: 0).getReg(), MRI, TRI: *TRI);
4267 unsigned SrcSize = getSizeInBits(Reg: MI.getOperand(i: 1).getReg(), MRI, TRI: *TRI);
4268 OpdsMapping[0] = AMDGPU::getValueMapping(BankID, Size: DstSize);
4269 OpdsMapping[1] = AMDGPU::getValueMapping(BankID, Size: SrcSize);
4270 OpdsMapping[2] = nullptr;
4271 break;
4272 }
4273 case AMDGPU::G_BUILD_VECTOR:
4274 case AMDGPU::G_BUILD_VECTOR_TRUNC: {
4275 LLT DstTy = MRI.getType(Reg: MI.getOperand(i: 0).getReg());
4276 if (DstTy == LLT::fixed_vector(NumElements: 2, ScalarSizeInBits: 16)) {
4277 unsigned DstSize = DstTy.getSizeInBits();
4278 unsigned SrcSize = MRI.getType(Reg: MI.getOperand(i: 1).getReg()).getSizeInBits();
4279 unsigned Src0BankID = getRegBankID(Reg: MI.getOperand(i: 1).getReg(), MRI);
4280 unsigned Src1BankID = getRegBankID(Reg: MI.getOperand(i: 2).getReg(), MRI);
4281 unsigned DstBankID = regBankUnion(RB0: Src0BankID, RB1: Src1BankID);
4282
4283 OpdsMapping[0] = AMDGPU::getValueMapping(BankID: DstBankID, Size: DstSize);
4284 OpdsMapping[1] = AMDGPU::getValueMapping(BankID: Src0BankID, Size: SrcSize);
4285 OpdsMapping[2] = AMDGPU::getValueMapping(BankID: Src1BankID, Size: SrcSize);
4286 break;
4287 }
4288
4289 [[fallthrough]];
4290 }
4291 case AMDGPU::G_MERGE_VALUES:
4292 case AMDGPU::G_CONCAT_VECTORS: {
4293 unsigned Bank = getMappingType(MRI, MI);
4294 unsigned DstSize = MRI.getType(Reg: MI.getOperand(i: 0).getReg()).getSizeInBits();
4295 unsigned SrcSize = MRI.getType(Reg: MI.getOperand(i: 1).getReg()).getSizeInBits();
4296
4297 OpdsMapping[0] = AMDGPU::getValueMapping(BankID: Bank, Size: DstSize);
4298 // Op1 and Dst should use the same register bank.
4299 for (unsigned i = 1, e = MI.getNumOperands(); i != e; ++i)
4300 OpdsMapping[i] = AMDGPU::getValueMapping(BankID: Bank, Size: SrcSize);
4301 break;
4302 }
4303 case AMDGPU::G_BITREVERSE:
4304 case AMDGPU::G_BITCAST:
4305 case AMDGPU::G_INTTOPTR:
4306 case AMDGPU::G_PTRTOINT:
4307 case AMDGPU::G_FABS:
4308 case AMDGPU::G_FNEG: {
4309 unsigned Size = MRI.getType(Reg: MI.getOperand(i: 0).getReg()).getSizeInBits();
4310 unsigned BankID = getRegBankID(Reg: MI.getOperand(i: 1).getReg(), MRI);
4311 OpdsMapping[0] = OpdsMapping[1] = AMDGPU::getValueMapping(BankID, Size);
4312 break;
4313 }
4314 case AMDGPU::G_AMDGPU_FFBH_U32:
4315 case AMDGPU::G_AMDGPU_FFBL_B32:
4316 case AMDGPU::G_CTLZ_ZERO_POISON:
4317 case AMDGPU::G_CTTZ_ZERO_POISON: {
4318 unsigned Size = MRI.getType(Reg: MI.getOperand(i: 1).getReg()).getSizeInBits();
4319 unsigned BankID = getRegBankID(Reg: MI.getOperand(i: 1).getReg(), MRI);
4320 OpdsMapping[0] = AMDGPU::getValueMapping(BankID, Size: 32);
4321 OpdsMapping[1] = AMDGPU::getValueMappingSGPR64Only(BankID, Size);
4322 break;
4323 }
4324 case AMDGPU::G_CTPOP: {
4325 unsigned Size = MRI.getType(Reg: MI.getOperand(i: 1).getReg()).getSizeInBits();
4326 unsigned BankID = getRegBankID(Reg: MI.getOperand(i: 1).getReg(), MRI);
4327 OpdsMapping[0] = AMDGPU::getValueMapping(BankID, Size: 32);
4328
4329 // This should really be getValueMappingSGPR64Only, but allowing the generic
4330 // code to handle the register split just makes using LegalizerHelper more
4331 // difficult.
4332 OpdsMapping[1] = AMDGPU::getValueMapping(BankID, Size);
4333 break;
4334 }
4335 case AMDGPU::G_TRUNC: {
4336 Register Dst = MI.getOperand(i: 0).getReg();
4337 Register Src = MI.getOperand(i: 1).getReg();
4338 unsigned Bank = getRegBankID(Reg: Src, MRI);
4339 unsigned DstSize = getSizeInBits(Reg: Dst, MRI, TRI: *TRI);
4340 unsigned SrcSize = getSizeInBits(Reg: Src, MRI, TRI: *TRI);
4341 OpdsMapping[0] = AMDGPU::getValueMapping(BankID: Bank, Size: DstSize);
4342 OpdsMapping[1] = AMDGPU::getValueMapping(BankID: Bank, Size: SrcSize);
4343 break;
4344 }
4345 case AMDGPU::G_ZEXT:
4346 case AMDGPU::G_SEXT:
4347 case AMDGPU::G_ANYEXT:
4348 case AMDGPU::G_SEXT_INREG: {
4349 Register Dst = MI.getOperand(i: 0).getReg();
4350 Register Src = MI.getOperand(i: 1).getReg();
4351 unsigned DstSize = getSizeInBits(Reg: Dst, MRI, TRI: *TRI);
4352 unsigned SrcSize = getSizeInBits(Reg: Src, MRI, TRI: *TRI);
4353
4354 unsigned DstBank;
4355 const RegisterBank *SrcBank = getRegBank(Reg: Src, MRI, TRI: *TRI);
4356 assert(SrcBank);
4357 switch (SrcBank->getID()) {
4358 case AMDGPU::SGPRRegBankID:
4359 DstBank = AMDGPU::SGPRRegBankID;
4360 break;
4361 default:
4362 DstBank = AMDGPU::VGPRRegBankID;
4363 break;
4364 }
4365
4366 // Scalar extend can use 64-bit BFE, but VGPRs require extending to
4367 // 32-bits, and then to 64.
4368 OpdsMapping[0] = AMDGPU::getValueMappingSGPR64Only(BankID: DstBank, Size: DstSize);
4369 OpdsMapping[1] = AMDGPU::getValueMappingSGPR64Only(BankID: SrcBank->getID(),
4370 Size: SrcSize);
4371 break;
4372 }
4373 case AMDGPU::G_IS_FPCLASS: {
4374 Register SrcReg = MI.getOperand(i: 1).getReg();
4375 unsigned SrcSize = MRI.getType(Reg: SrcReg).getSizeInBits();
4376 unsigned DstSize = MRI.getType(Reg: MI.getOperand(i: 0).getReg()).getSizeInBits();
4377 OpdsMapping[0] = AMDGPU::getValueMapping(BankID: AMDGPU::VCCRegBankID, Size: DstSize);
4378 OpdsMapping[1] = AMDGPU::getValueMapping(BankID: AMDGPU::VGPRRegBankID, Size: SrcSize);
4379 break;
4380 }
4381 case AMDGPU::G_STORE: {
4382 assert(MI.getOperand(0).isReg());
4383 unsigned Size = MRI.getType(Reg: MI.getOperand(i: 0).getReg()).getSizeInBits();
4384
4385 // FIXME: We need to specify a different reg bank once scalar stores are
4386 // supported.
4387 const ValueMapping *ValMapping =
4388 AMDGPU::getValueMapping(BankID: AMDGPU::VGPRRegBankID, Size);
4389 OpdsMapping[0] = ValMapping;
4390 OpdsMapping[1] = getValueMappingForPtr(MRI, PtrReg: MI.getOperand(i: 1).getReg());
4391 break;
4392 }
4393 case AMDGPU::G_ICMP:
4394 case AMDGPU::G_FCMP: {
4395 unsigned Size = MRI.getType(Reg: MI.getOperand(i: 2).getReg()).getSizeInBits();
4396
4397 // See if the result register has already been constrained to vcc, which may
4398 // happen due to control flow intrinsic lowering.
4399 unsigned DstBank = getRegBankID(Reg: MI.getOperand(i: 0).getReg(), MRI,
4400 Default: AMDGPU::SGPRRegBankID);
4401 unsigned Op2Bank = getRegBankID(Reg: MI.getOperand(i: 2).getReg(), MRI);
4402 unsigned Op3Bank = getRegBankID(Reg: MI.getOperand(i: 3).getReg(), MRI);
4403
4404 auto canUseSCCICMP = [&]() {
4405 auto Pred =
4406 static_cast<CmpInst::Predicate>(MI.getOperand(i: 1).getPredicate());
4407 return Size == 32 ||
4408 (Size == 64 &&
4409 (Pred == CmpInst::ICMP_EQ || Pred == CmpInst::ICMP_NE) &&
4410 Subtarget.hasScalarCompareEq64());
4411 };
4412 auto canUseSCCFCMP = [&]() {
4413 return Subtarget.hasSALUFloatInsts() && (Size == 32 || Size == 16);
4414 };
4415
4416 bool isICMP = MI.getOpcode() == AMDGPU::G_ICMP;
4417 bool CanUseSCC = DstBank == AMDGPU::SGPRRegBankID &&
4418 Op2Bank == AMDGPU::SGPRRegBankID &&
4419 Op3Bank == AMDGPU::SGPRRegBankID &&
4420 (isICMP ? canUseSCCICMP() : canUseSCCFCMP());
4421
4422 DstBank = CanUseSCC ? AMDGPU::SGPRRegBankID : AMDGPU::VCCRegBankID;
4423 unsigned SrcBank = CanUseSCC ? AMDGPU::SGPRRegBankID : AMDGPU::VGPRRegBankID;
4424
4425 // TODO: Use 32-bit for scalar output size.
4426 // SCC results will need to be copied to a 32-bit SGPR virtual register.
4427 const unsigned ResultSize = 1;
4428
4429 OpdsMapping[0] = AMDGPU::getValueMapping(BankID: DstBank, Size: ResultSize);
4430 OpdsMapping[1] = nullptr; // Predicate Operand.
4431 OpdsMapping[2] = AMDGPU::getValueMapping(BankID: SrcBank, Size);
4432 OpdsMapping[3] = AMDGPU::getValueMapping(BankID: SrcBank, Size);
4433 break;
4434 }
4435 case AMDGPU::G_EXTRACT_VECTOR_ELT: {
4436 // VGPR index can be used for waterfall when indexing a SGPR vector.
4437 unsigned SrcBankID = getRegBankID(Reg: MI.getOperand(i: 1).getReg(), MRI);
4438 unsigned DstSize = MRI.getType(Reg: MI.getOperand(i: 0).getReg()).getSizeInBits();
4439 unsigned SrcSize = MRI.getType(Reg: MI.getOperand(i: 1).getReg()).getSizeInBits();
4440 unsigned IdxSize = MRI.getType(Reg: MI.getOperand(i: 2).getReg()).getSizeInBits();
4441 unsigned IdxBank = getRegBankID(Reg: MI.getOperand(i: 2).getReg(), MRI);
4442 unsigned OutputBankID = regBankUnion(RB0: SrcBankID, RB1: IdxBank);
4443
4444 OpdsMapping[0] = AMDGPU::getValueMappingSGPR64Only(BankID: OutputBankID, Size: DstSize);
4445 OpdsMapping[1] = AMDGPU::getValueMapping(BankID: SrcBankID, Size: SrcSize);
4446
4447 // The index can be either if the source vector is VGPR.
4448 OpdsMapping[2] = AMDGPU::getValueMapping(BankID: IdxBank, Size: IdxSize);
4449 break;
4450 }
4451 case AMDGPU::G_INSERT_VECTOR_ELT: {
4452 unsigned OutputBankID = isSALUMapping(MI) ?
4453 AMDGPU::SGPRRegBankID : AMDGPU::VGPRRegBankID;
4454
4455 unsigned VecSize = MRI.getType(Reg: MI.getOperand(i: 0).getReg()).getSizeInBits();
4456 unsigned InsertSize = MRI.getType(Reg: MI.getOperand(i: 2).getReg()).getSizeInBits();
4457 unsigned IdxSize = MRI.getType(Reg: MI.getOperand(i: 3).getReg()).getSizeInBits();
4458 unsigned InsertEltBankID = getRegBankID(Reg: MI.getOperand(i: 2).getReg(), MRI);
4459 unsigned IdxBankID = getRegBankID(Reg: MI.getOperand(i: 3).getReg(), MRI);
4460
4461 OpdsMapping[0] = AMDGPU::getValueMapping(BankID: OutputBankID, Size: VecSize);
4462 OpdsMapping[1] = AMDGPU::getValueMapping(BankID: OutputBankID, Size: VecSize);
4463
4464 // This is a weird case, because we need to break down the mapping based on
4465 // the register bank of a different operand.
4466 if (InsertSize == 64 && OutputBankID == AMDGPU::VGPRRegBankID) {
4467 OpdsMapping[2] = AMDGPU::getValueMappingSplit64(BankID: InsertEltBankID,
4468 Size: InsertSize);
4469 } else {
4470 assert(InsertSize == 32 || InsertSize == 64);
4471 OpdsMapping[2] = AMDGPU::getValueMapping(BankID: InsertEltBankID, Size: InsertSize);
4472 }
4473
4474 // The index can be either if the source vector is VGPR.
4475 OpdsMapping[3] = AMDGPU::getValueMapping(BankID: IdxBankID, Size: IdxSize);
4476 break;
4477 }
4478 case AMDGPU::G_UNMERGE_VALUES: {
4479 unsigned Bank = getMappingType(MRI, MI);
4480
4481 // Op1 and Dst should use the same register bank.
4482 // FIXME: Shouldn't this be the default? Why do we need to handle this?
4483 for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
4484 unsigned Size = getSizeInBits(Reg: MI.getOperand(i).getReg(), MRI, TRI: *TRI);
4485 OpdsMapping[i] = AMDGPU::getValueMapping(BankID: Bank, Size);
4486 }
4487 break;
4488 }
4489 case AMDGPU::G_AMDGPU_BUFFER_LOAD:
4490 case AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE:
4491 case AMDGPU::G_AMDGPU_BUFFER_LOAD_SBYTE:
4492 case AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT:
4493 case AMDGPU::G_AMDGPU_BUFFER_LOAD_SSHORT:
4494 case AMDGPU::G_AMDGPU_BUFFER_LOAD_TFE:
4495 case AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE_TFE:
4496 case AMDGPU::G_AMDGPU_BUFFER_LOAD_SBYTE_TFE:
4497 case AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT_TFE:
4498 case AMDGPU::G_AMDGPU_BUFFER_LOAD_SSHORT_TFE:
4499 case AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT:
4500 case AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_TFE:
4501 case AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_D16:
4502 case AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT:
4503 case AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT_D16:
4504 case AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT:
4505 case AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT_D16:
4506 case AMDGPU::G_AMDGPU_BUFFER_STORE:
4507 case AMDGPU::G_AMDGPU_BUFFER_STORE_BYTE:
4508 case AMDGPU::G_AMDGPU_BUFFER_STORE_SHORT:
4509 case AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT:
4510 case AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT_D16: {
4511 OpdsMapping[0] = getVGPROpMapping(Reg: MI.getOperand(i: 0).getReg(), MRI, TRI: *TRI);
4512
4513 // rsrc
4514 OpdsMapping[1] = getSGPROpMapping(Reg: MI.getOperand(i: 1).getReg(), MRI, TRI: *TRI);
4515
4516 // vindex
4517 OpdsMapping[2] = getVGPROpMapping(Reg: MI.getOperand(i: 2).getReg(), MRI, TRI: *TRI);
4518
4519 // voffset
4520 OpdsMapping[3] = getVGPROpMapping(Reg: MI.getOperand(i: 3).getReg(), MRI, TRI: *TRI);
4521
4522 // soffset
4523 OpdsMapping[4] = getSGPROpMapping(Reg: MI.getOperand(i: 4).getReg(), MRI, TRI: *TRI);
4524
4525 // Any remaining operands are immediates and were correctly null
4526 // initialized.
4527 break;
4528 }
4529 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SWAP:
4530 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_ADD:
4531 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB:
4532 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMIN:
4533 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMIN:
4534 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMAX:
4535 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMAX:
4536 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_AND:
4537 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_OR:
4538 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_XOR:
4539 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC:
4540 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC:
4541 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB_CLAMP_U32:
4542 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_COND_SUB_U32:
4543 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD:
4544 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMIN:
4545 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMAX: {
4546 // vdata_out
4547 OpdsMapping[0] = getVGPROpMapping(Reg: MI.getOperand(i: 0).getReg(), MRI, TRI: *TRI);
4548
4549 // vdata_in
4550 OpdsMapping[1] = getVGPROpMapping(Reg: MI.getOperand(i: 1).getReg(), MRI, TRI: *TRI);
4551
4552 // rsrc
4553 OpdsMapping[2] = getSGPROpMapping(Reg: MI.getOperand(i: 2).getReg(), MRI, TRI: *TRI);
4554
4555 // vindex
4556 OpdsMapping[3] = getVGPROpMapping(Reg: MI.getOperand(i: 3).getReg(), MRI, TRI: *TRI);
4557
4558 // voffset
4559 OpdsMapping[4] = getVGPROpMapping(Reg: MI.getOperand(i: 4).getReg(), MRI, TRI: *TRI);
4560
4561 // soffset
4562 OpdsMapping[5] = getSGPROpMapping(Reg: MI.getOperand(i: 5).getReg(), MRI, TRI: *TRI);
4563
4564 // Any remaining operands are immediates and were correctly null
4565 // initialized.
4566 break;
4567 }
4568 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_CMPSWAP: {
4569 // vdata_out
4570 OpdsMapping[0] = getVGPROpMapping(Reg: MI.getOperand(i: 0).getReg(), MRI, TRI: *TRI);
4571
4572 // vdata_in
4573 OpdsMapping[1] = getVGPROpMapping(Reg: MI.getOperand(i: 1).getReg(), MRI, TRI: *TRI);
4574
4575 // cmp
4576 OpdsMapping[2] = getVGPROpMapping(Reg: MI.getOperand(i: 2).getReg(), MRI, TRI: *TRI);
4577
4578 // rsrc
4579 OpdsMapping[3] = getSGPROpMapping(Reg: MI.getOperand(i: 3).getReg(), MRI, TRI: *TRI);
4580
4581 // vindex
4582 OpdsMapping[4] = getVGPROpMapping(Reg: MI.getOperand(i: 4).getReg(), MRI, TRI: *TRI);
4583
4584 // voffset
4585 OpdsMapping[5] = getVGPROpMapping(Reg: MI.getOperand(i: 5).getReg(), MRI, TRI: *TRI);
4586
4587 // soffset
4588 OpdsMapping[6] = getSGPROpMapping(Reg: MI.getOperand(i: 6).getReg(), MRI, TRI: *TRI);
4589
4590 // Any remaining operands are immediates and were correctly null
4591 // initialized.
4592 break;
4593 }
4594 case AMDGPU::G_AMDGPU_S_BUFFER_LOAD:
4595 case AMDGPU::G_AMDGPU_S_BUFFER_LOAD_UBYTE:
4596 case AMDGPU::G_AMDGPU_S_BUFFER_LOAD_SBYTE:
4597 case AMDGPU::G_AMDGPU_S_BUFFER_LOAD_USHORT:
4598 case AMDGPU::G_AMDGPU_S_BUFFER_LOAD_SSHORT: {
4599 // Lie and claim everything is legal, even though some need to be
4600 // SGPRs. applyMapping will have to deal with it as a waterfall loop.
4601 OpdsMapping[1] = getSGPROpMapping(Reg: MI.getOperand(i: 1).getReg(), MRI, TRI: *TRI);
4602 OpdsMapping[2] = getSGPROpMapping(Reg: MI.getOperand(i: 2).getReg(), MRI, TRI: *TRI);
4603
4604 // We need to convert this to a MUBUF if either the resource or offset is
4605 // VGPR.
4606 unsigned RSrcBank = OpdsMapping[1]->BreakDown[0].RegBank->getID();
4607 unsigned OffsetBank = OpdsMapping[2]->BreakDown[0].RegBank->getID();
4608 unsigned ResultBank = regBankUnion(RB0: RSrcBank, RB1: OffsetBank);
4609
4610 unsigned Size0 = MRI.getType(Reg: MI.getOperand(i: 0).getReg()).getSizeInBits();
4611 OpdsMapping[0] = AMDGPU::getValueMapping(BankID: ResultBank, Size: Size0);
4612 break;
4613 }
4614 case AMDGPU::G_AMDGPU_S_BUFFER_PREFETCH:
4615 OpdsMapping[0] = getSGPROpMapping(Reg: MI.getOperand(i: 0).getReg(), MRI, TRI: *TRI);
4616 OpdsMapping[2] = getSGPROpMapping(Reg: MI.getOperand(i: 2).getReg(), MRI, TRI: *TRI);
4617 break;
4618 case AMDGPU::G_AMDGPU_SPONENTRY: {
4619 unsigned Size = MRI.getType(Reg: MI.getOperand(i: 0).getReg()).getSizeInBits();
4620 OpdsMapping[0] = AMDGPU::getValueMapping(BankID: AMDGPU::SGPRRegBankID, Size);
4621 break;
4622 }
4623 case AMDGPU::G_INTRINSIC:
4624 case AMDGPU::G_INTRINSIC_CONVERGENT: {
4625 switch (cast<GIntrinsic>(Val: MI).getIntrinsicID()) {
4626 default:
4627 return getInvalidInstructionMapping();
4628 case Intrinsic::amdgcn_div_fmas:
4629 case Intrinsic::amdgcn_div_fixup:
4630 case Intrinsic::amdgcn_trig_preop:
4631 case Intrinsic::amdgcn_sin:
4632 case Intrinsic::amdgcn_cos:
4633 case Intrinsic::amdgcn_log_clamp:
4634 case Intrinsic::amdgcn_rcp_legacy:
4635 case Intrinsic::amdgcn_rsq_legacy:
4636 case Intrinsic::amdgcn_rsq_clamp:
4637 case Intrinsic::amdgcn_tanh:
4638 case Intrinsic::amdgcn_fmul_legacy:
4639 case Intrinsic::amdgcn_fma_legacy:
4640 case Intrinsic::amdgcn_frexp_mant:
4641 case Intrinsic::amdgcn_frexp_exp:
4642 case Intrinsic::amdgcn_fract:
4643 case Intrinsic::amdgcn_cvt_pknorm_i16:
4644 case Intrinsic::amdgcn_cvt_pknorm_u16:
4645 case Intrinsic::amdgcn_cvt_pk_i16:
4646 case Intrinsic::amdgcn_cvt_pk_u16:
4647 case Intrinsic::amdgcn_cvt_sr_pk_f16_f32:
4648 case Intrinsic::amdgcn_cvt_sr_pk_bf16_f32:
4649 case Intrinsic::amdgcn_cvt_pk_f16_fp8:
4650 case Intrinsic::amdgcn_cvt_pk_f16_bf8:
4651 case Intrinsic::amdgcn_cvt_pk_fp8_f16:
4652 case Intrinsic::amdgcn_cvt_pk_bf8_f16:
4653 case Intrinsic::amdgcn_cvt_sr_fp8_f16:
4654 case Intrinsic::amdgcn_cvt_sr_bf8_f16:
4655 case Intrinsic::amdgcn_cvt_scale_pk8_f16_fp8:
4656 case Intrinsic::amdgcn_cvt_scale_pk8_bf16_fp8:
4657 case Intrinsic::amdgcn_cvt_scale_pk8_f16_bf8:
4658 case Intrinsic::amdgcn_cvt_scale_pk8_bf16_bf8:
4659 case Intrinsic::amdgcn_cvt_scale_pk8_f16_fp4:
4660 case Intrinsic::amdgcn_cvt_scale_pk8_bf16_fp4:
4661 case Intrinsic::amdgcn_cvt_scale_pk8_f32_fp8:
4662 case Intrinsic::amdgcn_cvt_scale_pk8_f32_bf8:
4663 case Intrinsic::amdgcn_cvt_scale_pk8_f32_fp4:
4664 case Intrinsic::amdgcn_cvt_scale_pk16_f16_fp6:
4665 case Intrinsic::amdgcn_cvt_scale_pk16_bf16_fp6:
4666 case Intrinsic::amdgcn_cvt_scale_pk16_f16_bf6:
4667 case Intrinsic::amdgcn_cvt_scale_pk16_bf16_bf6:
4668 case Intrinsic::amdgcn_cvt_scale_pk16_f32_fp6:
4669 case Intrinsic::amdgcn_cvt_scale_pk16_f32_bf6:
4670 case Intrinsic::amdgcn_cvt_scalef32_pk8_fp8_bf16:
4671 case Intrinsic::amdgcn_cvt_scalef32_pk8_bf8_bf16:
4672 case Intrinsic::amdgcn_cvt_scalef32_pk8_fp8_f16:
4673 case Intrinsic::amdgcn_cvt_scalef32_pk8_bf8_f16:
4674 case Intrinsic::amdgcn_cvt_scalef32_pk8_fp8_f32:
4675 case Intrinsic::amdgcn_cvt_scalef32_pk8_bf8_f32:
4676 case Intrinsic::amdgcn_cvt_scalef32_pk8_fp4_f32:
4677 case Intrinsic::amdgcn_cvt_scalef32_pk8_fp4_f16:
4678 case Intrinsic::amdgcn_cvt_scalef32_pk8_fp4_bf16:
4679 case Intrinsic::amdgcn_cvt_scalef32_pk16_fp6_f32:
4680 case Intrinsic::amdgcn_cvt_scalef32_pk16_bf6_f32:
4681 case Intrinsic::amdgcn_cvt_scalef32_pk16_fp6_f16:
4682 case Intrinsic::amdgcn_cvt_scalef32_pk16_bf6_f16:
4683 case Intrinsic::amdgcn_cvt_scalef32_pk16_fp6_bf16:
4684 case Intrinsic::amdgcn_cvt_scalef32_pk16_bf6_bf16:
4685 case Intrinsic::amdgcn_cvt_scalef32_sr_pk8_fp8_bf16:
4686 case Intrinsic::amdgcn_cvt_scalef32_sr_pk8_bf8_bf16:
4687 case Intrinsic::amdgcn_cvt_scalef32_sr_pk8_fp8_f16:
4688 case Intrinsic::amdgcn_cvt_scalef32_sr_pk8_bf8_f16:
4689 case Intrinsic::amdgcn_cvt_scalef32_sr_pk8_fp8_f32:
4690 case Intrinsic::amdgcn_cvt_scalef32_sr_pk8_bf8_f32:
4691 case Intrinsic::amdgcn_cvt_scalef32_sr_pk8_fp4_f32:
4692 case Intrinsic::amdgcn_cvt_scalef32_sr_pk8_fp4_f16:
4693 case Intrinsic::amdgcn_cvt_scalef32_sr_pk8_fp4_bf16:
4694 case Intrinsic::amdgcn_cvt_scalef32_sr_pk16_fp6_f32:
4695 case Intrinsic::amdgcn_cvt_scalef32_sr_pk16_bf6_f32:
4696 case Intrinsic::amdgcn_cvt_scalef32_sr_pk16_fp6_f16:
4697 case Intrinsic::amdgcn_cvt_scalef32_sr_pk16_bf6_f16:
4698 case Intrinsic::amdgcn_cvt_scalef32_sr_pk16_fp6_bf16:
4699 case Intrinsic::amdgcn_cvt_scalef32_sr_pk16_bf6_bf16:
4700 case Intrinsic::amdgcn_sat_pk4_i4_i8:
4701 case Intrinsic::amdgcn_sat_pk4_u4_u8:
4702 case Intrinsic::amdgcn_fmed3:
4703 case Intrinsic::amdgcn_cubeid:
4704 case Intrinsic::amdgcn_cubema:
4705 case Intrinsic::amdgcn_cubesc:
4706 case Intrinsic::amdgcn_cubetc:
4707 case Intrinsic::amdgcn_sffbh:
4708 case Intrinsic::amdgcn_fmad_ftz:
4709 case Intrinsic::amdgcn_mbcnt_lo:
4710 case Intrinsic::amdgcn_mbcnt_hi:
4711 case Intrinsic::amdgcn_mul_u24:
4712 case Intrinsic::amdgcn_mul_i24:
4713 case Intrinsic::amdgcn_mulhi_u24:
4714 case Intrinsic::amdgcn_mulhi_i24:
4715 case Intrinsic::amdgcn_lerp:
4716 case Intrinsic::amdgcn_sad_u8:
4717 case Intrinsic::amdgcn_msad_u8:
4718 case Intrinsic::amdgcn_sad_hi_u8:
4719 case Intrinsic::amdgcn_sad_u16:
4720 case Intrinsic::amdgcn_qsad_pk_u16_u8:
4721 case Intrinsic::amdgcn_mqsad_pk_u16_u8:
4722 case Intrinsic::amdgcn_mqsad_u32_u8:
4723 case Intrinsic::amdgcn_cvt_pk_u8_f32:
4724 case Intrinsic::amdgcn_alignbyte:
4725 case Intrinsic::amdgcn_perm:
4726 case Intrinsic::amdgcn_prng_b32:
4727 case Intrinsic::amdgcn_fdot2:
4728 case Intrinsic::amdgcn_sdot2:
4729 case Intrinsic::amdgcn_udot2:
4730 case Intrinsic::amdgcn_sdot4:
4731 case Intrinsic::amdgcn_udot4:
4732 case Intrinsic::amdgcn_sdot8:
4733 case Intrinsic::amdgcn_udot8:
4734 case Intrinsic::amdgcn_fdot2_bf16_bf16:
4735 case Intrinsic::amdgcn_fdot2_f16_f16:
4736 case Intrinsic::amdgcn_fdot2_f32_bf16:
4737 case Intrinsic::amdgcn_fdot2c_f32_bf16:
4738 case Intrinsic::amdgcn_sudot4:
4739 case Intrinsic::amdgcn_sudot8:
4740 case Intrinsic::amdgcn_dot4_f32_fp8_bf8:
4741 case Intrinsic::amdgcn_dot4_f32_bf8_fp8:
4742 case Intrinsic::amdgcn_dot4_f32_fp8_fp8:
4743 case Intrinsic::amdgcn_dot4_f32_bf8_bf8:
4744 case Intrinsic::amdgcn_cvt_f32_fp8:
4745 case Intrinsic::amdgcn_cvt_f32_fp8_e5m3:
4746 case Intrinsic::amdgcn_cvt_f32_bf8:
4747 case Intrinsic::amdgcn_cvt_off_f32_i4:
4748 case Intrinsic::amdgcn_cvt_pk_f32_fp8:
4749 case Intrinsic::amdgcn_cvt_pk_f32_bf8:
4750 case Intrinsic::amdgcn_cvt_pk_fp8_f32:
4751 case Intrinsic::amdgcn_cvt_pk_fp8_f32_e5m3:
4752 case Intrinsic::amdgcn_cvt_pk_bf8_f32:
4753 case Intrinsic::amdgcn_cvt_sr_fp8_f32:
4754 case Intrinsic::amdgcn_cvt_sr_fp8_f32_e5m3:
4755 case Intrinsic::amdgcn_cvt_sr_bf8_f32:
4756 case Intrinsic::amdgcn_cvt_sr_bf16_f32:
4757 case Intrinsic::amdgcn_cvt_sr_f16_f32:
4758 case Intrinsic::amdgcn_cvt_f16_fp8:
4759 case Intrinsic::amdgcn_cvt_f16_bf8:
4760 case Intrinsic::amdgcn_cvt_scalef32_pk32_fp6_f16:
4761 case Intrinsic::amdgcn_cvt_scalef32_pk32_bf6_f16:
4762 case Intrinsic::amdgcn_cvt_scalef32_pk32_fp6_bf16:
4763 case Intrinsic::amdgcn_cvt_scalef32_pk32_bf6_bf16:
4764 case Intrinsic::amdgcn_cvt_scalef32_f16_fp8:
4765 case Intrinsic::amdgcn_cvt_scalef32_f16_bf8:
4766 case Intrinsic::amdgcn_cvt_scalef32_f32_fp8:
4767 case Intrinsic::amdgcn_cvt_scalef32_f32_bf8:
4768 case Intrinsic::amdgcn_cvt_scalef32_pk_fp8_f32:
4769 case Intrinsic::amdgcn_cvt_scalef32_pk_bf8_f32:
4770 case Intrinsic::amdgcn_cvt_scalef32_pk_f32_fp8:
4771 case Intrinsic::amdgcn_cvt_scalef32_pk_f32_bf8:
4772 case Intrinsic::amdgcn_cvt_scalef32_pk_fp8_f16:
4773 case Intrinsic::amdgcn_cvt_scalef32_pk_fp8_bf16:
4774 case Intrinsic::amdgcn_cvt_scalef32_pk_bf8_f16:
4775 case Intrinsic::amdgcn_cvt_scalef32_pk_bf8_bf16:
4776 case Intrinsic::amdgcn_cvt_scalef32_pk_f32_fp4:
4777 case Intrinsic::amdgcn_cvt_scalef32_pk_fp4_f32:
4778 case Intrinsic::amdgcn_cvt_scalef32_pk_f16_fp4:
4779 case Intrinsic::amdgcn_cvt_scalef32_pk_bf16_fp4:
4780 case Intrinsic::amdgcn_cvt_scalef32_pk32_f32_fp6:
4781 case Intrinsic::amdgcn_cvt_scalef32_pk32_f32_bf6:
4782 case Intrinsic::amdgcn_cvt_scalef32_pk32_f16_bf6:
4783 case Intrinsic::amdgcn_cvt_scalef32_pk32_bf16_bf6:
4784 case Intrinsic::amdgcn_cvt_scalef32_pk32_f16_fp6:
4785 case Intrinsic::amdgcn_cvt_scalef32_pk32_bf16_fp6:
4786 case Intrinsic::amdgcn_cvt_scalef32_pk_f16_bf8:
4787 case Intrinsic::amdgcn_cvt_scalef32_pk_bf16_bf8:
4788 case Intrinsic::amdgcn_cvt_scalef32_pk_f16_fp8:
4789 case Intrinsic::amdgcn_cvt_scalef32_pk_bf16_fp8:
4790 case Intrinsic::amdgcn_cvt_scalef32_pk_fp4_f16:
4791 case Intrinsic::amdgcn_cvt_scalef32_pk_fp4_bf16:
4792 case Intrinsic::amdgcn_cvt_scalef32_sr_pk_fp4_f16:
4793 case Intrinsic::amdgcn_cvt_scalef32_sr_pk_fp4_bf16:
4794 case Intrinsic::amdgcn_cvt_scalef32_sr_pk_fp4_f32:
4795 case Intrinsic::amdgcn_cvt_scalef32_sr_pk32_bf6_bf16:
4796 case Intrinsic::amdgcn_cvt_scalef32_sr_pk32_bf6_f16:
4797 case Intrinsic::amdgcn_cvt_scalef32_sr_pk32_bf6_f32:
4798 case Intrinsic::amdgcn_cvt_scalef32_sr_pk32_fp6_bf16:
4799 case Intrinsic::amdgcn_cvt_scalef32_sr_pk32_fp6_f16:
4800 case Intrinsic::amdgcn_cvt_scalef32_sr_pk32_fp6_f32:
4801 case Intrinsic::amdgcn_cvt_scalef32_sr_bf8_bf16:
4802 case Intrinsic::amdgcn_cvt_scalef32_sr_bf8_f16:
4803 case Intrinsic::amdgcn_cvt_scalef32_sr_bf8_f32:
4804 case Intrinsic::amdgcn_cvt_scalef32_sr_fp8_bf16:
4805 case Intrinsic::amdgcn_cvt_scalef32_sr_fp8_f16:
4806 case Intrinsic::amdgcn_cvt_scalef32_sr_fp8_f32:
4807 case Intrinsic::amdgcn_ashr_pk_i8_i32:
4808 case Intrinsic::amdgcn_ashr_pk_u8_i32:
4809 case Intrinsic::amdgcn_cvt_scalef32_2xpk16_fp6_f32:
4810 case Intrinsic::amdgcn_cvt_scalef32_2xpk16_bf6_f32:
4811 case Intrinsic::amdgcn_wmma_bf16_16x16x16_bf16:
4812 case Intrinsic::amdgcn_wmma_f16_16x16x16_f16:
4813 case Intrinsic::amdgcn_wmma_bf16_16x16x16_bf16_tied:
4814 case Intrinsic::amdgcn_wmma_f16_16x16x16_f16_tied:
4815 case Intrinsic::amdgcn_wmma_f32_16x16x16_bf16:
4816 case Intrinsic::amdgcn_wmma_f32_16x16x16_f16:
4817 case Intrinsic::amdgcn_wmma_i32_16x16x16_iu4:
4818 case Intrinsic::amdgcn_wmma_i32_16x16x16_iu8:
4819 case Intrinsic::amdgcn_wmma_f32_16x16x16_fp8_fp8:
4820 case Intrinsic::amdgcn_wmma_f32_16x16x16_fp8_bf8:
4821 case Intrinsic::amdgcn_wmma_f32_16x16x16_bf8_fp8:
4822 case Intrinsic::amdgcn_wmma_f32_16x16x16_bf8_bf8:
4823 case Intrinsic::amdgcn_wmma_i32_16x16x32_iu4:
4824 case Intrinsic::amdgcn_swmmac_f32_16x16x32_f16:
4825 case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf16:
4826 case Intrinsic::amdgcn_swmmac_f16_16x16x32_f16:
4827 case Intrinsic::amdgcn_swmmac_bf16_16x16x32_bf16:
4828 case Intrinsic::amdgcn_swmmac_i32_16x16x32_iu8:
4829 case Intrinsic::amdgcn_swmmac_i32_16x16x32_iu4:
4830 case Intrinsic::amdgcn_swmmac_i32_16x16x64_iu4:
4831 case Intrinsic::amdgcn_swmmac_f32_16x16x32_fp8_fp8:
4832 case Intrinsic::amdgcn_swmmac_f32_16x16x32_fp8_bf8:
4833 case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_fp8:
4834 case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_bf8:
4835 case Intrinsic::amdgcn_wmma_f64_16x16x4_f64:
4836 case Intrinsic::amdgcn_wmma_f32_16x16x4_f32:
4837 case Intrinsic::amdgcn_wmma_f32_16x16x32_bf16:
4838 case Intrinsic::amdgcn_wmma_f32_16x16x32_f16:
4839 case Intrinsic::amdgcn_wmma_f16_16x16x32_f16:
4840 case Intrinsic::amdgcn_wmma_bf16_16x16x32_bf16:
4841 case Intrinsic::amdgcn_wmma_bf16f32_16x16x32_bf16:
4842 case Intrinsic::amdgcn_wmma_f32_16x16x64_fp8_fp8:
4843 case Intrinsic::amdgcn_wmma_f32_16x16x64_fp8_bf8:
4844 case Intrinsic::amdgcn_wmma_f32_16x16x64_bf8_fp8:
4845 case Intrinsic::amdgcn_wmma_f32_16x16x64_bf8_bf8:
4846 case Intrinsic::amdgcn_wmma_f16_16x16x64_fp8_fp8:
4847 case Intrinsic::amdgcn_wmma_f16_16x16x64_fp8_bf8:
4848 case Intrinsic::amdgcn_wmma_f16_16x16x64_bf8_fp8:
4849 case Intrinsic::amdgcn_wmma_f16_16x16x64_bf8_bf8:
4850 case Intrinsic::amdgcn_wmma_f16_16x16x128_fp8_fp8:
4851 case Intrinsic::amdgcn_wmma_f16_16x16x128_fp8_bf8:
4852 case Intrinsic::amdgcn_wmma_f16_16x16x128_bf8_fp8:
4853 case Intrinsic::amdgcn_wmma_f16_16x16x128_bf8_bf8:
4854 case Intrinsic::amdgcn_wmma_f32_16x16x128_fp8_fp8:
4855 case Intrinsic::amdgcn_wmma_f32_16x16x128_fp8_bf8:
4856 case Intrinsic::amdgcn_wmma_f32_16x16x128_bf8_fp8:
4857 case Intrinsic::amdgcn_wmma_f32_16x16x128_bf8_bf8:
4858 case Intrinsic::amdgcn_wmma_i32_16x16x64_iu8:
4859 case Intrinsic::amdgcn_wmma_f32_16x16x128_f8f6f4:
4860 case Intrinsic::amdgcn_wmma_scale_f32_16x16x128_f8f6f4:
4861 case Intrinsic::amdgcn_wmma_scale16_f32_16x16x128_f8f6f4:
4862 case Intrinsic::amdgcn_wmma_f32_32x16x128_f4:
4863 case Intrinsic::amdgcn_wmma_scale_f32_32x16x128_f4:
4864 case Intrinsic::amdgcn_wmma_scale16_f32_32x16x128_f4:
4865 case Intrinsic::amdgcn_swmmac_f16_16x16x64_f16:
4866 case Intrinsic::amdgcn_swmmac_bf16_16x16x64_bf16:
4867 case Intrinsic::amdgcn_swmmac_f32_16x16x64_bf16:
4868 case Intrinsic::amdgcn_swmmac_bf16f32_16x16x64_bf16:
4869 case Intrinsic::amdgcn_swmmac_f32_16x16x64_f16:
4870 case Intrinsic::amdgcn_swmmac_f32_16x16x128_fp8_fp8:
4871 case Intrinsic::amdgcn_swmmac_f32_16x16x128_fp8_bf8:
4872 case Intrinsic::amdgcn_swmmac_f32_16x16x128_bf8_fp8:
4873 case Intrinsic::amdgcn_swmmac_f32_16x16x128_bf8_bf8:
4874 case Intrinsic::amdgcn_swmmac_f16_16x16x128_fp8_fp8:
4875 case Intrinsic::amdgcn_swmmac_f16_16x16x128_fp8_bf8:
4876 case Intrinsic::amdgcn_swmmac_f16_16x16x128_bf8_fp8:
4877 case Intrinsic::amdgcn_swmmac_f16_16x16x128_bf8_bf8:
4878 case Intrinsic::amdgcn_swmmac_i32_16x16x128_iu8:
4879 case Intrinsic::amdgcn_perm_pk16_b4_u4:
4880 case Intrinsic::amdgcn_perm_pk16_b6_u4:
4881 case Intrinsic::amdgcn_perm_pk16_b8_u4:
4882 case Intrinsic::amdgcn_add_max_i32:
4883 case Intrinsic::amdgcn_add_max_u32:
4884 case Intrinsic::amdgcn_add_min_i32:
4885 case Intrinsic::amdgcn_add_min_u32:
4886 case Intrinsic::amdgcn_pk_add_max_i16:
4887 case Intrinsic::amdgcn_pk_add_max_u16:
4888 case Intrinsic::amdgcn_pk_add_min_i16:
4889 case Intrinsic::amdgcn_pk_add_min_u16:
4890 return getDefaultMappingVOP(MI);
4891 case Intrinsic::amdgcn_log:
4892 case Intrinsic::amdgcn_exp2:
4893 case Intrinsic::amdgcn_rcp:
4894 case Intrinsic::amdgcn_rsq:
4895 case Intrinsic::amdgcn_sqrt: {
4896 unsigned Size = MRI.getType(Reg: MI.getOperand(i: 0).getReg()).getSizeInBits();
4897 if (Subtarget.hasPseudoScalarTrans() && (Size == 16 || Size == 32) &&
4898 isSALUMapping(MI))
4899 return getDefaultMappingSOP(MI);
4900 return getDefaultMappingVOP(MI);
4901 }
4902 case Intrinsic::amdgcn_sbfe:
4903 case Intrinsic::amdgcn_ubfe:
4904 if (isSALUMapping(MI))
4905 return getDefaultMappingSOP(MI);
4906 return getDefaultMappingVOP(MI);
4907 case Intrinsic::amdgcn_ds_swizzle:
4908 case Intrinsic::amdgcn_ds_permute:
4909 case Intrinsic::amdgcn_ds_bpermute:
4910 case Intrinsic::amdgcn_update_dpp:
4911 case Intrinsic::amdgcn_mov_dpp8:
4912 case Intrinsic::amdgcn_mov_dpp:
4913 case Intrinsic::amdgcn_strict_wwm:
4914 case Intrinsic::amdgcn_wwm:
4915 case Intrinsic::amdgcn_strict_wqm:
4916 case Intrinsic::amdgcn_wqm:
4917 case Intrinsic::amdgcn_softwqm:
4918 case Intrinsic::amdgcn_set_inactive:
4919 case Intrinsic::amdgcn_set_inactive_chain_arg:
4920 case Intrinsic::amdgcn_permlane64:
4921 case Intrinsic::amdgcn_ds_bpermute_fi_b32:
4922 return getDefaultMappingAllVGPR(MI);
4923 case Intrinsic::amdgcn_cvt_pkrtz:
4924 if (Subtarget.hasSALUFloatInsts() && isSALUMapping(MI))
4925 return getDefaultMappingSOP(MI);
4926 return getDefaultMappingVOP(MI);
4927 case Intrinsic::amdgcn_kernarg_segment_ptr:
4928 case Intrinsic::amdgcn_s_getpc:
4929 case Intrinsic::amdgcn_groupstaticsize:
4930 case Intrinsic::amdgcn_reloc_constant:
4931 case Intrinsic::returnaddress: {
4932 unsigned Size = MRI.getType(Reg: MI.getOperand(i: 0).getReg()).getSizeInBits();
4933 OpdsMapping[0] = AMDGPU::getValueMapping(BankID: AMDGPU::SGPRRegBankID, Size);
4934 break;
4935 }
4936 case Intrinsic::amdgcn_wqm_vote: {
4937 unsigned Size = MRI.getType(Reg: MI.getOperand(i: 0).getReg()).getSizeInBits();
4938 OpdsMapping[0] = OpdsMapping[2]
4939 = AMDGPU::getValueMapping(BankID: AMDGPU::VCCRegBankID, Size);
4940 break;
4941 }
4942 case Intrinsic::amdgcn_ps_live: {
4943 OpdsMapping[0] = AMDGPU::getValueMapping(BankID: AMDGPU::VCCRegBankID, Size: 1);
4944 break;
4945 }
4946 case Intrinsic::amdgcn_div_scale: {
4947 unsigned Dst0Size = MRI.getType(Reg: MI.getOperand(i: 0).getReg()).getSizeInBits();
4948 unsigned Dst1Size = MRI.getType(Reg: MI.getOperand(i: 1).getReg()).getSizeInBits();
4949 OpdsMapping[0] = AMDGPU::getValueMapping(BankID: AMDGPU::VGPRRegBankID, Size: Dst0Size);
4950 OpdsMapping[1] = AMDGPU::getValueMapping(BankID: AMDGPU::VCCRegBankID, Size: Dst1Size);
4951
4952 unsigned SrcSize = MRI.getType(Reg: MI.getOperand(i: 3).getReg()).getSizeInBits();
4953 OpdsMapping[3] = AMDGPU::getValueMapping(BankID: AMDGPU::VGPRRegBankID, Size: SrcSize);
4954 OpdsMapping[4] = AMDGPU::getValueMapping(BankID: AMDGPU::VGPRRegBankID, Size: SrcSize);
4955 break;
4956 }
4957 case Intrinsic::amdgcn_class: {
4958 Register Src0Reg = MI.getOperand(i: 2).getReg();
4959 Register Src1Reg = MI.getOperand(i: 3).getReg();
4960 unsigned Src0Size = MRI.getType(Reg: Src0Reg).getSizeInBits();
4961 unsigned Src1Size = MRI.getType(Reg: Src1Reg).getSizeInBits();
4962 unsigned DstSize = MRI.getType(Reg: MI.getOperand(i: 0).getReg()).getSizeInBits();
4963 OpdsMapping[0] = AMDGPU::getValueMapping(BankID: AMDGPU::VCCRegBankID, Size: DstSize);
4964 OpdsMapping[2] = AMDGPU::getValueMapping(BankID: AMDGPU::VGPRRegBankID, Size: Src0Size);
4965 OpdsMapping[3] = AMDGPU::getValueMapping(BankID: AMDGPU::VGPRRegBankID, Size: Src1Size);
4966 break;
4967 }
4968 case Intrinsic::amdgcn_icmp:
4969 case Intrinsic::amdgcn_fcmp: {
4970 unsigned DstSize = MRI.getType(Reg: MI.getOperand(i: 0).getReg()).getSizeInBits();
4971 // This is not VCCRegBank because this is not used in boolean contexts.
4972 OpdsMapping[0] = AMDGPU::getValueMapping(BankID: AMDGPU::SGPRRegBankID, Size: DstSize);
4973 unsigned OpSize = MRI.getType(Reg: MI.getOperand(i: 2).getReg()).getSizeInBits();
4974 OpdsMapping[2] = AMDGPU::getValueMapping(BankID: AMDGPU::VGPRRegBankID, Size: OpSize);
4975 OpdsMapping[3] = AMDGPU::getValueMapping(BankID: AMDGPU::VGPRRegBankID, Size: OpSize);
4976 break;
4977 }
4978 case Intrinsic::amdgcn_readlane: {
4979 // This must be an SGPR, but accept a VGPR.
4980 Register IdxReg = MI.getOperand(i: 3).getReg();
4981 unsigned IdxSize = MRI.getType(Reg: IdxReg).getSizeInBits();
4982 unsigned IdxBank = getRegBankID(Reg: IdxReg, MRI, Default: AMDGPU::SGPRRegBankID);
4983 OpdsMapping[3] = AMDGPU::getValueMapping(BankID: IdxBank, Size: IdxSize);
4984 [[fallthrough]];
4985 }
4986 case Intrinsic::amdgcn_readfirstlane: {
4987 unsigned DstSize = MRI.getType(Reg: MI.getOperand(i: 0).getReg()).getSizeInBits();
4988 unsigned SrcSize = MRI.getType(Reg: MI.getOperand(i: 2).getReg()).getSizeInBits();
4989 OpdsMapping[0] = AMDGPU::getValueMapping(BankID: AMDGPU::SGPRRegBankID, Size: DstSize);
4990 OpdsMapping[2] = AMDGPU::getValueMapping(BankID: AMDGPU::VGPRRegBankID, Size: SrcSize);
4991 break;
4992 }
4993 case Intrinsic::amdgcn_writelane: {
4994 unsigned DstSize = MRI.getType(Reg: MI.getOperand(i: 0).getReg()).getSizeInBits();
4995 Register SrcReg = MI.getOperand(i: 2).getReg();
4996 unsigned SrcSize = MRI.getType(Reg: SrcReg).getSizeInBits();
4997 unsigned SrcBank = getRegBankID(Reg: SrcReg, MRI, Default: AMDGPU::SGPRRegBankID);
4998 Register IdxReg = MI.getOperand(i: 3).getReg();
4999 unsigned IdxSize = MRI.getType(Reg: IdxReg).getSizeInBits();
5000 unsigned IdxBank = getRegBankID(Reg: IdxReg, MRI, Default: AMDGPU::SGPRRegBankID);
5001 OpdsMapping[0] = AMDGPU::getValueMapping(BankID: AMDGPU::VGPRRegBankID, Size: DstSize);
5002
5003 // These 2 must be SGPRs, but accept VGPRs. Readfirstlane will be inserted
5004 // to legalize.
5005 OpdsMapping[2] = AMDGPU::getValueMapping(BankID: SrcBank, Size: SrcSize);
5006 OpdsMapping[3] = AMDGPU::getValueMapping(BankID: IdxBank, Size: IdxSize);
5007 OpdsMapping[4] = AMDGPU::getValueMapping(BankID: AMDGPU::VGPRRegBankID, Size: SrcSize);
5008 break;
5009 }
5010 case Intrinsic::amdgcn_if_break: {
5011 unsigned Size = getSizeInBits(Reg: MI.getOperand(i: 0).getReg(), MRI, TRI: *TRI);
5012 OpdsMapping[0] = AMDGPU::getValueMapping(BankID: AMDGPU::SGPRRegBankID, Size);
5013 OpdsMapping[2] = AMDGPU::getValueMapping(BankID: AMDGPU::VCCRegBankID, Size: 1);
5014 OpdsMapping[3] = AMDGPU::getValueMapping(BankID: AMDGPU::SGPRRegBankID, Size);
5015 break;
5016 }
5017 case Intrinsic::amdgcn_permlane16:
5018 case Intrinsic::amdgcn_permlanex16: {
5019 unsigned Size = getSizeInBits(Reg: MI.getOperand(i: 0).getReg(), MRI, TRI: *TRI);
5020 OpdsMapping[0] = AMDGPU::getValueMapping(BankID: AMDGPU::VGPRRegBankID, Size);
5021 OpdsMapping[2] = AMDGPU::getValueMapping(BankID: AMDGPU::VGPRRegBankID, Size);
5022 OpdsMapping[3] = AMDGPU::getValueMapping(BankID: AMDGPU::VGPRRegBankID, Size);
5023 OpdsMapping[4] = getSGPROpMapping(Reg: MI.getOperand(i: 3).getReg(), MRI, TRI: *TRI);
5024 OpdsMapping[5] = getSGPROpMapping(Reg: MI.getOperand(i: 4).getReg(), MRI, TRI: *TRI);
5025 break;
5026 }
5027 case Intrinsic::amdgcn_permlane_bcast:
5028 case Intrinsic::amdgcn_permlane_up:
5029 case Intrinsic::amdgcn_permlane_down:
5030 case Intrinsic::amdgcn_permlane_xor: {
5031 unsigned Size = getSizeInBits(Reg: MI.getOperand(i: 0).getReg(), MRI, TRI: *TRI);
5032 OpdsMapping[0] = AMDGPU::getValueMapping(BankID: AMDGPU::VGPRRegBankID, Size);
5033 OpdsMapping[2] = AMDGPU::getValueMapping(BankID: AMDGPU::VGPRRegBankID, Size);
5034 OpdsMapping[3] = getSGPROpMapping(Reg: MI.getOperand(i: 2).getReg(), MRI, TRI: *TRI);
5035 OpdsMapping[4] = getSGPROpMapping(Reg: MI.getOperand(i: 3).getReg(), MRI, TRI: *TRI);
5036 break;
5037 }
5038 case Intrinsic::amdgcn_permlane_idx_gen: {
5039 unsigned Size = getSizeInBits(Reg: MI.getOperand(i: 0).getReg(), MRI, TRI: *TRI);
5040 OpdsMapping[0] = AMDGPU::getValueMapping(BankID: AMDGPU::VGPRRegBankID, Size);
5041 OpdsMapping[2] = AMDGPU::getValueMapping(BankID: AMDGPU::VGPRRegBankID, Size);
5042 OpdsMapping[3] = getSGPROpMapping(Reg: MI.getOperand(i: 2).getReg(), MRI, TRI: *TRI);
5043 break;
5044 }
5045 case Intrinsic::amdgcn_permlane16_var:
5046 case Intrinsic::amdgcn_permlanex16_var: {
5047 unsigned Size = getSizeInBits(Reg: MI.getOperand(i: 0).getReg(), MRI, TRI: *TRI);
5048 OpdsMapping[0] = AMDGPU::getValueMapping(BankID: AMDGPU::VGPRRegBankID, Size);
5049 OpdsMapping[2] = AMDGPU::getValueMapping(BankID: AMDGPU::VGPRRegBankID, Size);
5050 OpdsMapping[3] = AMDGPU::getValueMapping(BankID: AMDGPU::VGPRRegBankID, Size);
5051 OpdsMapping[4] = AMDGPU::getValueMapping(BankID: AMDGPU::VGPRRegBankID, Size);
5052 break;
5053 }
5054 case Intrinsic::amdgcn_mfma_f32_4x4x1f32:
5055 case Intrinsic::amdgcn_mfma_f32_4x4x4f16:
5056 case Intrinsic::amdgcn_mfma_i32_4x4x4i8:
5057 case Intrinsic::amdgcn_mfma_f32_4x4x2bf16:
5058 case Intrinsic::amdgcn_mfma_f32_16x16x1f32:
5059 case Intrinsic::amdgcn_mfma_f32_16x16x4f32:
5060 case Intrinsic::amdgcn_mfma_f32_16x16x4f16:
5061 case Intrinsic::amdgcn_mfma_f32_16x16x16f16:
5062 case Intrinsic::amdgcn_mfma_i32_16x16x4i8:
5063 case Intrinsic::amdgcn_mfma_i32_16x16x16i8:
5064 case Intrinsic::amdgcn_mfma_f32_16x16x2bf16:
5065 case Intrinsic::amdgcn_mfma_f32_16x16x8bf16:
5066 case Intrinsic::amdgcn_mfma_f32_32x32x1f32:
5067 case Intrinsic::amdgcn_mfma_f32_32x32x2f32:
5068 case Intrinsic::amdgcn_mfma_f32_32x32x4f16:
5069 case Intrinsic::amdgcn_mfma_f32_32x32x8f16:
5070 case Intrinsic::amdgcn_mfma_i32_32x32x4i8:
5071 case Intrinsic::amdgcn_mfma_i32_32x32x8i8:
5072 case Intrinsic::amdgcn_mfma_f32_32x32x2bf16:
5073 case Intrinsic::amdgcn_mfma_f32_32x32x4bf16:
5074 case Intrinsic::amdgcn_mfma_f32_32x32x4bf16_1k:
5075 case Intrinsic::amdgcn_mfma_f32_16x16x4bf16_1k:
5076 case Intrinsic::amdgcn_mfma_f32_4x4x4bf16_1k:
5077 case Intrinsic::amdgcn_mfma_f32_32x32x8bf16_1k:
5078 case Intrinsic::amdgcn_mfma_f32_16x16x16bf16_1k:
5079 case Intrinsic::amdgcn_mfma_f64_16x16x4f64:
5080 case Intrinsic::amdgcn_mfma_f64_4x4x4f64:
5081 case Intrinsic::amdgcn_mfma_i32_16x16x32_i8:
5082 case Intrinsic::amdgcn_mfma_i32_32x32x16_i8:
5083 case Intrinsic::amdgcn_mfma_f32_16x16x8_xf32:
5084 case Intrinsic::amdgcn_mfma_f32_32x32x4_xf32:
5085 case Intrinsic::amdgcn_mfma_f32_16x16x32_bf8_bf8:
5086 case Intrinsic::amdgcn_mfma_f32_16x16x32_bf8_fp8:
5087 case Intrinsic::amdgcn_mfma_f32_16x16x32_fp8_bf8:
5088 case Intrinsic::amdgcn_mfma_f32_16x16x32_fp8_fp8:
5089 case Intrinsic::amdgcn_mfma_f32_32x32x16_bf8_bf8:
5090 case Intrinsic::amdgcn_mfma_f32_32x32x16_bf8_fp8:
5091 case Intrinsic::amdgcn_mfma_f32_32x32x16_fp8_bf8:
5092 case Intrinsic::amdgcn_mfma_f32_32x32x16_fp8_fp8:
5093 case Intrinsic::amdgcn_mfma_f32_16x16x32_f16:
5094 case Intrinsic::amdgcn_mfma_f32_32x32x16_f16:
5095 case Intrinsic::amdgcn_mfma_i32_16x16x64_i8:
5096 case Intrinsic::amdgcn_mfma_i32_32x32x32_i8:
5097 case Intrinsic::amdgcn_mfma_f32_16x16x32_bf16: {
5098 unsigned DstSize = MRI.getType(Reg: MI.getOperand(i: 0).getReg()).getSizeInBits();
5099 unsigned MinNumRegsRequired = DstSize / 32;
5100
5101 // Default for MAI intrinsics.
5102 // srcC can also be an immediate which can be folded later.
5103 // FIXME: Should we eventually add an alternative mapping with AGPR src
5104 // for srcA/srcB?
5105 //
5106 // vdst, srcA, srcB, srcC
5107 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
5108
5109 bool UseAGPRForm = !Subtarget.hasGFX90AInsts() ||
5110 Info->selectAGPRFormMFMA(NumRegs: MinNumRegsRequired);
5111
5112 OpdsMapping[0] =
5113 UseAGPRForm ? getAGPROpMapping(Reg: MI.getOperand(i: 0).getReg(), MRI, TRI: *TRI)
5114 : getVGPROpMapping(Reg: MI.getOperand(i: 0).getReg(), MRI, TRI: *TRI);
5115 OpdsMapping[2] = getVGPROpMapping(Reg: MI.getOperand(i: 2).getReg(), MRI, TRI: *TRI);
5116 OpdsMapping[3] = getVGPROpMapping(Reg: MI.getOperand(i: 3).getReg(), MRI, TRI: *TRI);
5117 OpdsMapping[4] =
5118 UseAGPRForm ? getAGPROpMapping(Reg: MI.getOperand(i: 4).getReg(), MRI, TRI: *TRI)
5119 : getVGPROpMapping(Reg: MI.getOperand(i: 4).getReg(), MRI, TRI: *TRI);
5120 break;
5121 }
5122 case Intrinsic::amdgcn_mfma_scale_f32_16x16x128_f8f6f4:
5123 case Intrinsic::amdgcn_mfma_scale_f32_32x32x64_f8f6f4: {
5124 unsigned DstSize = MRI.getType(Reg: MI.getOperand(i: 0).getReg()).getSizeInBits();
5125 unsigned MinNumRegsRequired = DstSize / 32;
5126
5127 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
5128 bool UseAGPRForm = Info->selectAGPRFormMFMA(NumRegs: MinNumRegsRequired);
5129
5130 OpdsMapping[0] =
5131 UseAGPRForm ? getAGPROpMapping(Reg: MI.getOperand(i: 0).getReg(), MRI, TRI: *TRI)
5132 : getVGPROpMapping(Reg: MI.getOperand(i: 0).getReg(), MRI, TRI: *TRI);
5133
5134 OpdsMapping[2] = getVGPROpMapping(Reg: MI.getOperand(i: 2).getReg(), MRI, TRI: *TRI);
5135 OpdsMapping[3] = getVGPROpMapping(Reg: MI.getOperand(i: 3).getReg(), MRI, TRI: *TRI);
5136 OpdsMapping[4] =
5137 UseAGPRForm ? getAGPROpMapping(Reg: MI.getOperand(i: 4).getReg(), MRI, TRI: *TRI)
5138 : getVGPROpMapping(Reg: MI.getOperand(i: 4).getReg(), MRI, TRI: *TRI);
5139
5140 OpdsMapping[8] = getVGPROpMapping(Reg: MI.getOperand(i: 8).getReg(), MRI, TRI: *TRI);
5141 OpdsMapping[10] = getVGPROpMapping(Reg: MI.getOperand(i: 10).getReg(), MRI, TRI: *TRI);
5142 break;
5143 }
5144 case Intrinsic::amdgcn_smfmac_f32_16x16x32_f16:
5145 case Intrinsic::amdgcn_smfmac_f32_32x32x16_f16:
5146 case Intrinsic::amdgcn_smfmac_f32_16x16x32_bf16:
5147 case Intrinsic::amdgcn_smfmac_f32_32x32x16_bf16:
5148 case Intrinsic::amdgcn_smfmac_i32_16x16x64_i8:
5149 case Intrinsic::amdgcn_smfmac_i32_32x32x32_i8:
5150 case Intrinsic::amdgcn_smfmac_f32_16x16x64_bf8_bf8:
5151 case Intrinsic::amdgcn_smfmac_f32_16x16x64_bf8_fp8:
5152 case Intrinsic::amdgcn_smfmac_f32_16x16x64_fp8_bf8:
5153 case Intrinsic::amdgcn_smfmac_f32_16x16x64_fp8_fp8:
5154 case Intrinsic::amdgcn_smfmac_f32_32x32x32_bf8_bf8:
5155 case Intrinsic::amdgcn_smfmac_f32_32x32x32_bf8_fp8:
5156 case Intrinsic::amdgcn_smfmac_f32_32x32x32_fp8_bf8:
5157 case Intrinsic::amdgcn_smfmac_f32_32x32x32_fp8_fp8:
5158 case Intrinsic::amdgcn_smfmac_f32_16x16x64_f16:
5159 case Intrinsic::amdgcn_smfmac_f32_32x32x32_f16:
5160 case Intrinsic::amdgcn_smfmac_f32_16x16x64_bf16:
5161 case Intrinsic::amdgcn_smfmac_f32_32x32x32_bf16:
5162 case Intrinsic::amdgcn_smfmac_i32_16x16x128_i8:
5163 case Intrinsic::amdgcn_smfmac_i32_32x32x64_i8:
5164 case Intrinsic::amdgcn_smfmac_f32_16x16x128_bf8_bf8:
5165 case Intrinsic::amdgcn_smfmac_f32_16x16x128_bf8_fp8:
5166 case Intrinsic::amdgcn_smfmac_f32_16x16x128_fp8_bf8:
5167 case Intrinsic::amdgcn_smfmac_f32_16x16x128_fp8_fp8:
5168 case Intrinsic::amdgcn_smfmac_f32_32x32x64_bf8_bf8:
5169 case Intrinsic::amdgcn_smfmac_f32_32x32x64_bf8_fp8:
5170 case Intrinsic::amdgcn_smfmac_f32_32x32x64_fp8_bf8:
5171 case Intrinsic::amdgcn_smfmac_f32_32x32x64_fp8_fp8: {
5172 Register DstReg = MI.getOperand(i: 0).getReg();
5173 unsigned DstSize = MRI.getType(Reg: DstReg).getSizeInBits();
5174 unsigned MinNumRegsRequired = DstSize / 32;
5175 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
5176 bool UseAGPRForm = Info->selectAGPRFormMFMA(NumRegs: MinNumRegsRequired);
5177
5178 // vdst, srcA, srcB, srcC, idx
5179 OpdsMapping[0] = UseAGPRForm ? getAGPROpMapping(Reg: DstReg, MRI, TRI: *TRI)
5180 : getVGPROpMapping(Reg: DstReg, MRI, TRI: *TRI);
5181
5182 OpdsMapping[2] = getVGPROpMapping(Reg: MI.getOperand(i: 2).getReg(), MRI, TRI: *TRI);
5183 OpdsMapping[3] = getVGPROpMapping(Reg: MI.getOperand(i: 3).getReg(), MRI, TRI: *TRI);
5184 OpdsMapping[4] =
5185 UseAGPRForm ? getAGPROpMapping(Reg: MI.getOperand(i: 4).getReg(), MRI, TRI: *TRI)
5186 : getVGPROpMapping(Reg: MI.getOperand(i: 4).getReg(), MRI, TRI: *TRI);
5187 OpdsMapping[5] = getVGPROpMapping(Reg: MI.getOperand(i: 5).getReg(), MRI, TRI: *TRI);
5188 break;
5189 }
5190 case Intrinsic::amdgcn_interp_p1:
5191 case Intrinsic::amdgcn_interp_p2:
5192 case Intrinsic::amdgcn_interp_mov:
5193 case Intrinsic::amdgcn_interp_p1_f16:
5194 case Intrinsic::amdgcn_interp_p2_f16:
5195 case Intrinsic::amdgcn_lds_param_load: {
5196 const int M0Idx = MI.getNumOperands() - 1;
5197 Register M0Reg = MI.getOperand(i: M0Idx).getReg();
5198 unsigned M0Bank = getRegBankID(Reg: M0Reg, MRI, Default: AMDGPU::SGPRRegBankID);
5199 unsigned DstSize = MRI.getType(Reg: MI.getOperand(i: 0).getReg()).getSizeInBits();
5200
5201 OpdsMapping[0] = AMDGPU::getValueMapping(BankID: AMDGPU::VGPRRegBankID, Size: DstSize);
5202 for (int I = 2; I != M0Idx && MI.getOperand(i: I).isReg(); ++I)
5203 OpdsMapping[I] = AMDGPU::getValueMapping(BankID: AMDGPU::VGPRRegBankID, Size: 32);
5204
5205 // Must be SGPR, but we must take whatever the original bank is and fix it
5206 // later.
5207 OpdsMapping[M0Idx] = AMDGPU::getValueMapping(BankID: M0Bank, Size: 32);
5208 break;
5209 }
5210 case Intrinsic::amdgcn_interp_inreg_p10:
5211 case Intrinsic::amdgcn_interp_inreg_p2:
5212 case Intrinsic::amdgcn_interp_inreg_p10_f16:
5213 case Intrinsic::amdgcn_interp_inreg_p2_f16:
5214 case Intrinsic::amdgcn_interp_p10_rtz_f16:
5215 case Intrinsic::amdgcn_interp_p2_rtz_f16: {
5216 unsigned DstSize = MRI.getType(Reg: MI.getOperand(i: 0).getReg()).getSizeInBits();
5217 OpdsMapping[0] = AMDGPU::getValueMapping(BankID: AMDGPU::VGPRRegBankID, Size: DstSize);
5218 OpdsMapping[2] = AMDGPU::getValueMapping(BankID: AMDGPU::VGPRRegBankID, Size: 32);
5219 OpdsMapping[3] = AMDGPU::getValueMapping(BankID: AMDGPU::VGPRRegBankID, Size: 32);
5220 OpdsMapping[4] = AMDGPU::getValueMapping(BankID: AMDGPU::VGPRRegBankID, Size: 32);
5221 break;
5222 }
5223 case Intrinsic::amdgcn_permlane16_swap:
5224 case Intrinsic::amdgcn_permlane32_swap: {
5225 unsigned DstSize = MRI.getType(Reg: MI.getOperand(i: 0).getReg()).getSizeInBits();
5226 OpdsMapping[0] = OpdsMapping[1] = OpdsMapping[3] = OpdsMapping[4] =
5227 AMDGPU::getValueMapping(BankID: AMDGPU::VGPRRegBankID, Size: DstSize);
5228 break;
5229 }
5230 case Intrinsic::amdgcn_ballot: {
5231 unsigned DstSize = MRI.getType(Reg: MI.getOperand(i: 0).getReg()).getSizeInBits();
5232 unsigned SrcSize = MRI.getType(Reg: MI.getOperand(i: 2).getReg()).getSizeInBits();
5233 OpdsMapping[0] = AMDGPU::getValueMapping(BankID: AMDGPU::SGPRRegBankID, Size: DstSize);
5234 OpdsMapping[2] = AMDGPU::getValueMapping(BankID: AMDGPU::VCCRegBankID, Size: SrcSize);
5235 break;
5236 }
5237 case Intrinsic::amdgcn_inverse_ballot: {
5238 // This must be an SGPR, but accept a VGPR.
5239 Register MaskReg = MI.getOperand(i: 2).getReg();
5240 unsigned MaskSize = MRI.getType(Reg: MaskReg).getSizeInBits();
5241 unsigned MaskBank = getRegBankID(Reg: MaskReg, MRI, Default: AMDGPU::SGPRRegBankID);
5242 OpdsMapping[0] = AMDGPU::getValueMapping(BankID: AMDGPU::VCCRegBankID, Size: 1);
5243 OpdsMapping[2] = AMDGPU::getValueMapping(BankID: MaskBank, Size: MaskSize);
5244 break;
5245 }
5246 case Intrinsic::amdgcn_bitop3: {
5247 unsigned Size = getSizeInBits(Reg: MI.getOperand(i: 0).getReg(), MRI, TRI: *TRI);
5248 OpdsMapping[0] = AMDGPU::getValueMapping(BankID: AMDGPU::VGPRRegBankID, Size);
5249 OpdsMapping[2] = AMDGPU::getValueMapping(BankID: AMDGPU::VGPRRegBankID, Size);
5250 OpdsMapping[3] = AMDGPU::getValueMapping(BankID: AMDGPU::VGPRRegBankID, Size);
5251 OpdsMapping[4] = AMDGPU::getValueMapping(BankID: AMDGPU::VGPRRegBankID, Size);
5252 break;
5253 }
5254 case Intrinsic::amdgcn_s_quadmask:
5255 case Intrinsic::amdgcn_s_wqm: {
5256 Register MaskReg = MI.getOperand(i: 2).getReg();
5257 unsigned MaskSize = MRI.getType(Reg: MaskReg).getSizeInBits();
5258 unsigned MaskBank = getRegBankID(Reg: MaskReg, MRI, Default: AMDGPU::SGPRRegBankID);
5259 OpdsMapping[0] = AMDGPU::getValueMapping(BankID: AMDGPU::SGPRRegBankID, Size: MaskSize);
5260 OpdsMapping[2] = AMDGPU::getValueMapping(BankID: MaskBank, Size: MaskSize);
5261 break;
5262 }
5263 case Intrinsic::amdgcn_wave_reduce_add:
5264 case Intrinsic::amdgcn_wave_reduce_fadd:
5265 case Intrinsic::amdgcn_wave_reduce_sub:
5266 case Intrinsic::amdgcn_wave_reduce_fsub:
5267 case Intrinsic::amdgcn_wave_reduce_min:
5268 case Intrinsic::amdgcn_wave_reduce_umin:
5269 case Intrinsic::amdgcn_wave_reduce_fmin:
5270 case Intrinsic::amdgcn_wave_reduce_max:
5271 case Intrinsic::amdgcn_wave_reduce_umax:
5272 case Intrinsic::amdgcn_wave_reduce_fmax:
5273 case Intrinsic::amdgcn_wave_reduce_and:
5274 case Intrinsic::amdgcn_wave_reduce_or:
5275 case Intrinsic::amdgcn_wave_reduce_xor: {
5276 unsigned DstSize = MRI.getType(Reg: MI.getOperand(i: 0).getReg()).getSizeInBits();
5277 OpdsMapping[0] = AMDGPU::getValueMapping(BankID: AMDGPU::SGPRRegBankID, Size: DstSize);
5278 unsigned OpSize = MRI.getType(Reg: MI.getOperand(i: 2).getReg()).getSizeInBits();
5279 auto regBankID =
5280 isSALUMapping(MI) ? AMDGPU::SGPRRegBankID : AMDGPU::VGPRRegBankID;
5281 OpdsMapping[2] = AMDGPU::getValueMapping(BankID: regBankID, Size: OpSize);
5282 break;
5283 }
5284 case Intrinsic::amdgcn_s_bitreplicate: {
5285 Register MaskReg = MI.getOperand(i: 2).getReg();
5286 unsigned MaskBank = getRegBankID(Reg: MaskReg, MRI, Default: AMDGPU::SGPRRegBankID);
5287 OpdsMapping[0] = AMDGPU::getValueMapping(BankID: AMDGPU::SGPRRegBankID, Size: 64);
5288 OpdsMapping[2] = AMDGPU::getValueMapping(BankID: MaskBank, Size: 32);
5289 break;
5290 }
5291 case Intrinsic::amdgcn_wave_shuffle: {
5292 unsigned OpSize = MRI.getType(Reg: MI.getOperand(i: 0).getReg()).getSizeInBits();
5293 OpdsMapping[0] = AMDGPU::getValueMapping(BankID: AMDGPU::VGPRRegBankID, Size: OpSize);
5294 OpdsMapping[2] = AMDGPU::getValueMapping(BankID: AMDGPU::VGPRRegBankID, Size: OpSize);
5295 OpdsMapping[3] = AMDGPU::getValueMapping(BankID: AMDGPU::VGPRRegBankID, Size: OpSize);
5296 break;
5297 }
5298 }
5299 break;
5300 }
5301 case AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD:
5302 case AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD_D16:
5303 case AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD_NORET:
5304 case AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE:
5305 case AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE_D16: {
5306 auto IntrID = AMDGPU::getIntrinsicID(I: MI);
5307 const AMDGPU::RsrcIntrinsic *RSrcIntrin = AMDGPU::lookupRsrcIntrinsic(Intr: IntrID);
5308 assert(RSrcIntrin && "missing RsrcIntrinsic for image intrinsic");
5309 // Non-images can have complications from operands that allow both SGPR
5310 // and VGPR. For now it's too complicated to figure out the final opcode
5311 // to derive the register bank from the MCInstrDesc.
5312 assert(RSrcIntrin->IsImage);
5313 return getImageMapping(MRI, MI, RsrcIdx: RSrcIntrin->RsrcArg);
5314 }
5315 case AMDGPU::G_AMDGPU_BVH_INTERSECT_RAY:
5316 case AMDGPU::G_AMDGPU_BVH8_INTERSECT_RAY:
5317 case AMDGPU::G_AMDGPU_BVH_DUAL_INTERSECT_RAY: {
5318 bool IsDualOrBVH8 =
5319 MI.getOpcode() == AMDGPU::G_AMDGPU_BVH_DUAL_INTERSECT_RAY ||
5320 MI.getOpcode() == AMDGPU::G_AMDGPU_BVH8_INTERSECT_RAY;
5321 unsigned NumMods = IsDualOrBVH8 ? 0 : 1; // Has A16 modifier
5322 unsigned LastRegOpIdx = MI.getNumExplicitOperands() - 1 - NumMods;
5323 unsigned DstSize = MRI.getType(Reg: MI.getOperand(i: 0).getReg()).getSizeInBits();
5324 OpdsMapping[0] = AMDGPU::getValueMapping(BankID: AMDGPU::VGPRRegBankID, Size: DstSize);
5325 if (IsDualOrBVH8) {
5326 OpdsMapping[1] = AMDGPU::getValueMapping(
5327 BankID: AMDGPU::VGPRRegBankID,
5328 Size: MRI.getType(Reg: MI.getOperand(i: 1).getReg()).getSizeInBits());
5329 OpdsMapping[2] = AMDGPU::getValueMapping(
5330 BankID: AMDGPU::VGPRRegBankID,
5331 Size: MRI.getType(Reg: MI.getOperand(i: 2).getReg()).getSizeInBits());
5332 }
5333 OpdsMapping[LastRegOpIdx] =
5334 getSGPROpMapping(Reg: MI.getOperand(i: LastRegOpIdx).getReg(), MRI, TRI: *TRI);
5335 if (LastRegOpIdx == 3) {
5336 // Sequential form: all operands combined into VGPR256/VGPR512
5337 unsigned Size = MRI.getType(Reg: MI.getOperand(i: 2).getReg()).getSizeInBits();
5338 if (Size > 256)
5339 Size = 512;
5340 OpdsMapping[2] = AMDGPU::getValueMapping(BankID: AMDGPU::VGPRRegBankID, Size);
5341 } else {
5342 // NSA form
5343 unsigned FirstSrcOpIdx = IsDualOrBVH8 ? 4 : 2;
5344 for (unsigned I = FirstSrcOpIdx; I < LastRegOpIdx; ++I) {
5345 unsigned Size = MRI.getType(Reg: MI.getOperand(i: I).getReg()).getSizeInBits();
5346 OpdsMapping[I] = AMDGPU::getValueMapping(BankID: AMDGPU::VGPRRegBankID, Size);
5347 }
5348 }
5349 break;
5350 }
5351 case AMDGPU::G_INTRINSIC_W_SIDE_EFFECTS:
5352 case AMDGPU::G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS: {
5353 auto IntrID = cast<GIntrinsic>(Val: MI).getIntrinsicID();
5354 switch (IntrID) {
5355 case Intrinsic::amdgcn_s_getreg:
5356 case Intrinsic::amdgcn_s_memtime:
5357 case Intrinsic::amdgcn_s_memrealtime:
5358 case Intrinsic::amdgcn_s_get_waveid_in_workgroup:
5359 case Intrinsic::amdgcn_s_sendmsg_rtn: {
5360 unsigned Size = MRI.getType(Reg: MI.getOperand(i: 0).getReg()).getSizeInBits();
5361 OpdsMapping[0] = AMDGPU::getValueMapping(BankID: AMDGPU::SGPRRegBankID, Size);
5362 break;
5363 }
5364 case Intrinsic::amdgcn_global_atomic_fmin_num:
5365 case Intrinsic::amdgcn_global_atomic_fmax_num:
5366 case Intrinsic::amdgcn_flat_atomic_fmin_num:
5367 case Intrinsic::amdgcn_flat_atomic_fmax_num:
5368 case Intrinsic::amdgcn_global_atomic_ordered_add_b64:
5369 case Intrinsic::amdgcn_global_load_tr_b64:
5370 case Intrinsic::amdgcn_global_load_tr_b128:
5371 case Intrinsic::amdgcn_global_load_tr4_b64:
5372 case Intrinsic::amdgcn_global_load_tr6_b96:
5373 case Intrinsic::amdgcn_ds_load_tr8_b64:
5374 case Intrinsic::amdgcn_ds_load_tr16_b128:
5375 case Intrinsic::amdgcn_ds_load_tr4_b64:
5376 case Intrinsic::amdgcn_ds_load_tr6_b96:
5377 case Intrinsic::amdgcn_ds_read_tr4_b64:
5378 case Intrinsic::amdgcn_ds_read_tr6_b96:
5379 case Intrinsic::amdgcn_ds_read_tr8_b64:
5380 case Intrinsic::amdgcn_ds_read_tr16_b64:
5381 case Intrinsic::amdgcn_ds_atomic_async_barrier_arrive_b64:
5382 case Intrinsic::amdgcn_ds_atomic_barrier_arrive_rtn_b64:
5383 return getDefaultMappingAllVGPR(MI);
5384 case Intrinsic::amdgcn_ds_ordered_add:
5385 case Intrinsic::amdgcn_ds_ordered_swap: {
5386 unsigned DstSize = MRI.getType(Reg: MI.getOperand(i: 0).getReg()).getSizeInBits();
5387 OpdsMapping[0] = AMDGPU::getValueMapping(BankID: AMDGPU::VGPRRegBankID, Size: DstSize);
5388 unsigned M0Bank = getRegBankID(Reg: MI.getOperand(i: 2).getReg(), MRI,
5389 Default: AMDGPU::SGPRRegBankID);
5390 OpdsMapping[2] = AMDGPU::getValueMapping(BankID: M0Bank, Size: 32);
5391 OpdsMapping[3] = AMDGPU::getValueMapping(BankID: AMDGPU::VGPRRegBankID, Size: 32);
5392 break;
5393 }
5394 case Intrinsic::amdgcn_ds_append:
5395 case Intrinsic::amdgcn_ds_consume: {
5396 unsigned DstSize = MRI.getType(Reg: MI.getOperand(i: 0).getReg()).getSizeInBits();
5397 OpdsMapping[0] = AMDGPU::getValueMapping(BankID: AMDGPU::VGPRRegBankID, Size: DstSize);
5398 OpdsMapping[2] = getSGPROpMapping(Reg: MI.getOperand(i: 2).getReg(), MRI, TRI: *TRI);
5399 break;
5400 }
5401 case Intrinsic::amdgcn_exp_compr:
5402 OpdsMapping[3] = AMDGPU::getValueMapping(BankID: AMDGPU::VGPRRegBankID, Size: 32);
5403 OpdsMapping[4] = AMDGPU::getValueMapping(BankID: AMDGPU::VGPRRegBankID, Size: 32);
5404 break;
5405 case Intrinsic::amdgcn_exp:
5406 // FIXME: Could we support packed types here?
5407 OpdsMapping[3] = AMDGPU::getValueMapping(BankID: AMDGPU::VGPRRegBankID, Size: 32);
5408 OpdsMapping[4] = AMDGPU::getValueMapping(BankID: AMDGPU::VGPRRegBankID, Size: 32);
5409 OpdsMapping[5] = AMDGPU::getValueMapping(BankID: AMDGPU::VGPRRegBankID, Size: 32);
5410 OpdsMapping[6] = AMDGPU::getValueMapping(BankID: AMDGPU::VGPRRegBankID, Size: 32);
5411 break;
5412 case Intrinsic::amdgcn_exp_row:
5413 OpdsMapping[3] = AMDGPU::getValueMapping(BankID: AMDGPU::VGPRRegBankID, Size: 32);
5414 OpdsMapping[4] = AMDGPU::getValueMapping(BankID: AMDGPU::VGPRRegBankID, Size: 32);
5415 OpdsMapping[5] = AMDGPU::getValueMapping(BankID: AMDGPU::VGPRRegBankID, Size: 32);
5416 OpdsMapping[6] = AMDGPU::getValueMapping(BankID: AMDGPU::VGPRRegBankID, Size: 32);
5417 OpdsMapping[8] = getSGPROpMapping(Reg: MI.getOperand(i: 8).getReg(), MRI, TRI: *TRI);
5418 break;
5419 case Intrinsic::amdgcn_s_alloc_vgpr:
5420 OpdsMapping[0] = AMDGPU::getValueMapping(BankID: AMDGPU::SGPRRegBankID, Size: 1);
5421 OpdsMapping[2] = AMDGPU::getValueMapping(BankID: AMDGPU::SGPRRegBankID, Size: 32);
5422 break;
5423 case Intrinsic::amdgcn_s_sendmsg:
5424 case Intrinsic::amdgcn_s_sendmsghalt: {
5425 // This must be an SGPR, but accept a VGPR.
5426 unsigned Bank = getRegBankID(Reg: MI.getOperand(i: 2).getReg(), MRI,
5427 Default: AMDGPU::SGPRRegBankID);
5428 OpdsMapping[2] = AMDGPU::getValueMapping(BankID: Bank, Size: 32);
5429 break;
5430 }
5431 case Intrinsic::amdgcn_s_setreg: {
5432 // This must be an SGPR, but accept a VGPR.
5433 unsigned Bank = getRegBankID(Reg: MI.getOperand(i: 2).getReg(), MRI,
5434 Default: AMDGPU::SGPRRegBankID);
5435 OpdsMapping[2] = AMDGPU::getValueMapping(BankID: Bank, Size: 32);
5436 break;
5437 }
5438 case Intrinsic::amdgcn_s_ttracedata: {
5439 // This must be an SGPR, but accept a VGPR.
5440 unsigned Bank =
5441 getRegBankID(Reg: MI.getOperand(i: 1).getReg(), MRI, Default: AMDGPU::SGPRRegBankID);
5442 OpdsMapping[1] = AMDGPU::getValueMapping(BankID: Bank, Size: 32);
5443 break;
5444 }
5445 case Intrinsic::amdgcn_end_cf: {
5446 unsigned Size = getSizeInBits(Reg: MI.getOperand(i: 1).getReg(), MRI, TRI: *TRI);
5447 OpdsMapping[1] = AMDGPU::getValueMapping(BankID: AMDGPU::SGPRRegBankID, Size);
5448 break;
5449 }
5450 case Intrinsic::amdgcn_else: {
5451 unsigned WaveSize = getSizeInBits(Reg: MI.getOperand(i: 1).getReg(), MRI, TRI: *TRI);
5452 OpdsMapping[0] = AMDGPU::getValueMapping(BankID: AMDGPU::VCCRegBankID, Size: 1);
5453 OpdsMapping[1] = AMDGPU::getValueMapping(BankID: AMDGPU::SGPRRegBankID, Size: WaveSize);
5454 OpdsMapping[3] = AMDGPU::getValueMapping(BankID: AMDGPU::SGPRRegBankID, Size: WaveSize);
5455 break;
5456 }
5457 case Intrinsic::amdgcn_init_whole_wave:
5458 case Intrinsic::amdgcn_live_mask: {
5459 OpdsMapping[0] = AMDGPU::getValueMapping(BankID: AMDGPU::VCCRegBankID, Size: 1);
5460 break;
5461 }
5462 case Intrinsic::amdgcn_wqm_demote:
5463 case Intrinsic::amdgcn_kill: {
5464 OpdsMapping[1] = AMDGPU::getValueMapping(BankID: AMDGPU::VCCRegBankID, Size: 1);
5465 break;
5466 }
5467 case Intrinsic::amdgcn_raw_buffer_load:
5468 case Intrinsic::amdgcn_raw_ptr_buffer_load:
5469 case Intrinsic::amdgcn_raw_atomic_buffer_load:
5470 case Intrinsic::amdgcn_raw_ptr_atomic_buffer_load:
5471 case Intrinsic::amdgcn_raw_tbuffer_load:
5472 case Intrinsic::amdgcn_raw_ptr_tbuffer_load: {
5473 // FIXME: Should make intrinsic ID the last operand of the instruction,
5474 // then this would be the same as store
5475 OpdsMapping[0] = getVGPROpMapping(Reg: MI.getOperand(i: 0).getReg(), MRI, TRI: *TRI);
5476 OpdsMapping[2] = getSGPROpMapping(Reg: MI.getOperand(i: 2).getReg(), MRI, TRI: *TRI);
5477 OpdsMapping[3] = getVGPROpMapping(Reg: MI.getOperand(i: 3).getReg(), MRI, TRI: *TRI);
5478 OpdsMapping[4] = getSGPROpMapping(Reg: MI.getOperand(i: 4).getReg(), MRI, TRI: *TRI);
5479 break;
5480 }
5481 case Intrinsic::amdgcn_raw_buffer_load_lds:
5482 case Intrinsic::amdgcn_raw_buffer_load_async_lds:
5483 case Intrinsic::amdgcn_raw_ptr_buffer_load_lds:
5484 case Intrinsic::amdgcn_raw_ptr_buffer_load_async_lds: {
5485 OpdsMapping[1] = getSGPROpMapping(Reg: MI.getOperand(i: 1).getReg(), MRI, TRI: *TRI);
5486 OpdsMapping[2] = getSGPROpMapping(Reg: MI.getOperand(i: 2).getReg(), MRI, TRI: *TRI);
5487 OpdsMapping[4] = getVGPROpMapping(Reg: MI.getOperand(i: 4).getReg(), MRI, TRI: *TRI);
5488 OpdsMapping[5] = getSGPROpMapping(Reg: MI.getOperand(i: 5).getReg(), MRI, TRI: *TRI);
5489 break;
5490 }
5491 case Intrinsic::amdgcn_raw_buffer_store:
5492 case Intrinsic::amdgcn_raw_ptr_buffer_store:
5493 case Intrinsic::amdgcn_raw_buffer_store_format:
5494 case Intrinsic::amdgcn_raw_ptr_buffer_store_format:
5495 case Intrinsic::amdgcn_raw_tbuffer_store:
5496 case Intrinsic::amdgcn_raw_ptr_tbuffer_store: {
5497 OpdsMapping[1] = getVGPROpMapping(Reg: MI.getOperand(i: 1).getReg(), MRI, TRI: *TRI);
5498 OpdsMapping[2] = getSGPROpMapping(Reg: MI.getOperand(i: 2).getReg(), MRI, TRI: *TRI);
5499 OpdsMapping[3] = getVGPROpMapping(Reg: MI.getOperand(i: 3).getReg(), MRI, TRI: *TRI);
5500 OpdsMapping[4] = getSGPROpMapping(Reg: MI.getOperand(i: 4).getReg(), MRI, TRI: *TRI);
5501 break;
5502 }
5503 case Intrinsic::amdgcn_struct_buffer_load:
5504 case Intrinsic::amdgcn_struct_ptr_buffer_load:
5505 case Intrinsic::amdgcn_struct_tbuffer_load:
5506 case Intrinsic::amdgcn_struct_ptr_tbuffer_load:
5507 case Intrinsic::amdgcn_struct_atomic_buffer_load:
5508 case Intrinsic::amdgcn_struct_ptr_atomic_buffer_load: {
5509 OpdsMapping[0] = getVGPROpMapping(Reg: MI.getOperand(i: 0).getReg(), MRI, TRI: *TRI);
5510 OpdsMapping[2] = getSGPROpMapping(Reg: MI.getOperand(i: 2).getReg(), MRI, TRI: *TRI);
5511 OpdsMapping[3] = getVGPROpMapping(Reg: MI.getOperand(i: 3).getReg(), MRI, TRI: *TRI);
5512 OpdsMapping[4] = getVGPROpMapping(Reg: MI.getOperand(i: 4).getReg(), MRI, TRI: *TRI);
5513 OpdsMapping[5] = getSGPROpMapping(Reg: MI.getOperand(i: 5).getReg(), MRI, TRI: *TRI);
5514 break;
5515 }
5516 case Intrinsic::amdgcn_struct_buffer_load_lds:
5517 case Intrinsic::amdgcn_struct_buffer_load_async_lds:
5518 case Intrinsic::amdgcn_struct_ptr_buffer_load_lds:
5519 case Intrinsic::amdgcn_struct_ptr_buffer_load_async_lds: {
5520 OpdsMapping[1] = getSGPROpMapping(Reg: MI.getOperand(i: 1).getReg(), MRI, TRI: *TRI);
5521 OpdsMapping[2] = getSGPROpMapping(Reg: MI.getOperand(i: 2).getReg(), MRI, TRI: *TRI);
5522 OpdsMapping[4] = getVGPROpMapping(Reg: MI.getOperand(i: 4).getReg(), MRI, TRI: *TRI);
5523 OpdsMapping[5] = getVGPROpMapping(Reg: MI.getOperand(i: 5).getReg(), MRI, TRI: *TRI);
5524 OpdsMapping[6] = getSGPROpMapping(Reg: MI.getOperand(i: 6).getReg(), MRI, TRI: *TRI);
5525 break;
5526 }
5527 case Intrinsic::amdgcn_struct_buffer_store:
5528 case Intrinsic::amdgcn_struct_ptr_buffer_store:
5529 case Intrinsic::amdgcn_struct_tbuffer_store:
5530 case Intrinsic::amdgcn_struct_ptr_tbuffer_store: {
5531 OpdsMapping[1] = getVGPROpMapping(Reg: MI.getOperand(i: 1).getReg(), MRI, TRI: *TRI);
5532 OpdsMapping[2] = getSGPROpMapping(Reg: MI.getOperand(i: 2).getReg(), MRI, TRI: *TRI);
5533 OpdsMapping[3] = getVGPROpMapping(Reg: MI.getOperand(i: 3).getReg(), MRI, TRI: *TRI);
5534 OpdsMapping[4] = getVGPROpMapping(Reg: MI.getOperand(i: 4).getReg(), MRI, TRI: *TRI);
5535 OpdsMapping[5] = getSGPROpMapping(Reg: MI.getOperand(i: 5).getReg(), MRI, TRI: *TRI);
5536 break;
5537 }
5538 case Intrinsic::amdgcn_init_exec_from_input: {
5539 unsigned Size = getSizeInBits(Reg: MI.getOperand(i: 1).getReg(), MRI, TRI: *TRI);
5540 OpdsMapping[1] = AMDGPU::getValueMapping(BankID: AMDGPU::SGPRRegBankID, Size);
5541 break;
5542 }
5543 case Intrinsic::amdgcn_ds_gws_init:
5544 case Intrinsic::amdgcn_ds_gws_barrier:
5545 case Intrinsic::amdgcn_ds_gws_sema_br: {
5546 OpdsMapping[1] = AMDGPU::getValueMapping(BankID: AMDGPU::VGPRRegBankID, Size: 32);
5547
5548 // This must be an SGPR, but accept a VGPR.
5549 unsigned Bank = getRegBankID(Reg: MI.getOperand(i: 2).getReg(), MRI,
5550 Default: AMDGPU::SGPRRegBankID);
5551 OpdsMapping[2] = AMDGPU::getValueMapping(BankID: Bank, Size: 32);
5552 break;
5553 }
5554 case Intrinsic::amdgcn_ds_gws_sema_v:
5555 case Intrinsic::amdgcn_ds_gws_sema_p:
5556 case Intrinsic::amdgcn_ds_gws_sema_release_all: {
5557 // This must be an SGPR, but accept a VGPR.
5558 unsigned Bank = getRegBankID(Reg: MI.getOperand(i: 1).getReg(), MRI,
5559 Default: AMDGPU::SGPRRegBankID);
5560 OpdsMapping[1] = AMDGPU::getValueMapping(BankID: Bank, Size: 32);
5561 break;
5562 }
5563 case Intrinsic::amdgcn_cluster_load_b32:
5564 case Intrinsic::amdgcn_cluster_load_b64:
5565 case Intrinsic::amdgcn_cluster_load_b128: {
5566 OpdsMapping[0] = getVGPROpMapping(Reg: MI.getOperand(i: 0).getReg(), MRI, TRI: *TRI);
5567 OpdsMapping[2] = getVGPROpMapping(Reg: MI.getOperand(i: 2).getReg(), MRI, TRI: *TRI);
5568 unsigned M0Bank =
5569 getRegBankID(Reg: MI.getOperand(i: 4).getReg(), MRI, Default: AMDGPU::SGPRRegBankID);
5570 OpdsMapping[4] = AMDGPU::getValueMapping(BankID: M0Bank, Size: 32);
5571 break;
5572 }
5573 case Intrinsic::amdgcn_cluster_load_async_to_lds_b8:
5574 case Intrinsic::amdgcn_cluster_load_async_to_lds_b32:
5575 case Intrinsic::amdgcn_cluster_load_async_to_lds_b64:
5576 case Intrinsic::amdgcn_cluster_load_async_to_lds_b128: {
5577 OpdsMapping[1] = getVGPROpMapping(Reg: MI.getOperand(i: 1).getReg(), MRI, TRI: *TRI);
5578 // LDS address goes into $vdst (VGPR).
5579 OpdsMapping[2] = getVGPROpMapping(Reg: MI.getOperand(i: 2).getReg(), MRI, TRI: *TRI);
5580 unsigned M0Bank =
5581 getRegBankID(Reg: MI.getOperand(i: 5).getReg(), MRI, Default: AMDGPU::SGPRRegBankID);
5582 OpdsMapping[5] = AMDGPU::getValueMapping(BankID: M0Bank, Size: 32);
5583 break;
5584 }
5585 case Intrinsic::amdgcn_global_store_async_from_lds_b8:
5586 case Intrinsic::amdgcn_global_store_async_from_lds_b32:
5587 case Intrinsic::amdgcn_global_store_async_from_lds_b64:
5588 case Intrinsic::amdgcn_global_store_async_from_lds_b128:
5589 case Intrinsic::amdgcn_global_load_async_to_lds_b8:
5590 case Intrinsic::amdgcn_global_load_async_to_lds_b32:
5591 case Intrinsic::amdgcn_global_load_async_to_lds_b64:
5592 case Intrinsic::amdgcn_global_load_async_to_lds_b128: {
5593 OpdsMapping[1] = getVGPROpMapping(Reg: MI.getOperand(i: 1).getReg(), MRI, TRI: *TRI);
5594 // LDS address goes into $vdst/$vdata (VGPR).
5595 OpdsMapping[2] = getVGPROpMapping(Reg: MI.getOperand(i: 2).getReg(), MRI, TRI: *TRI);
5596 break;
5597 }
5598 case Intrinsic::amdgcn_load_to_lds:
5599 case Intrinsic::amdgcn_load_async_to_lds:
5600 case Intrinsic::amdgcn_global_load_lds:
5601 case Intrinsic::amdgcn_global_load_async_lds: {
5602 OpdsMapping[1] = getVGPROpMapping(Reg: MI.getOperand(i: 1).getReg(), MRI, TRI: *TRI);
5603 // LDS address goes into M0 (SGPR).
5604 OpdsMapping[2] = getSGPROpMapping(Reg: MI.getOperand(i: 2).getReg(), MRI, TRI: *TRI);
5605 break;
5606 }
5607 case Intrinsic::amdgcn_lds_direct_load: {
5608 const int M0Idx = MI.getNumOperands() - 1;
5609 Register M0Reg = MI.getOperand(i: M0Idx).getReg();
5610 unsigned M0Bank = getRegBankID(Reg: M0Reg, MRI, Default: AMDGPU::SGPRRegBankID);
5611 unsigned DstSize = MRI.getType(Reg: MI.getOperand(i: 0).getReg()).getSizeInBits();
5612
5613 OpdsMapping[0] = AMDGPU::getValueMapping(BankID: AMDGPU::VGPRRegBankID, Size: DstSize);
5614 for (int I = 2; I != M0Idx && MI.getOperand(i: I).isReg(); ++I)
5615 OpdsMapping[I] = AMDGPU::getValueMapping(BankID: AMDGPU::VGPRRegBankID, Size: 32);
5616
5617 // Must be SGPR, but we must take whatever the original bank is and fix it
5618 // later.
5619 OpdsMapping[M0Idx] = AMDGPU::getValueMapping(BankID: M0Bank, Size: 32);
5620 break;
5621 }
5622 case Intrinsic::amdgcn_ds_add_gs_reg_rtn:
5623 case Intrinsic::amdgcn_ds_sub_gs_reg_rtn:
5624 OpdsMapping[0] = getVGPROpMapping(Reg: MI.getOperand(i: 0).getReg(), MRI, TRI: *TRI);
5625 OpdsMapping[2] = getVGPROpMapping(Reg: MI.getOperand(i: 2).getReg(), MRI, TRI: *TRI);
5626 break;
5627 case Intrinsic::amdgcn_ds_bvh_stack_rtn:
5628 case Intrinsic::amdgcn_ds_bvh_stack_push4_pop1_rtn:
5629 case Intrinsic::amdgcn_ds_bvh_stack_push8_pop1_rtn:
5630 case Intrinsic::amdgcn_ds_bvh_stack_push8_pop2_rtn: {
5631 OpdsMapping[0] =
5632 getVGPROpMapping(Reg: MI.getOperand(i: 0).getReg(), MRI, TRI: *TRI); // %vdst
5633 OpdsMapping[1] =
5634 getVGPROpMapping(Reg: MI.getOperand(i: 1).getReg(), MRI, TRI: *TRI); // %addr
5635 OpdsMapping[3] =
5636 getVGPROpMapping(Reg: MI.getOperand(i: 3).getReg(), MRI, TRI: *TRI); // %addr
5637 OpdsMapping[4] =
5638 getVGPROpMapping(Reg: MI.getOperand(i: 4).getReg(), MRI, TRI: *TRI); // %data0
5639 OpdsMapping[5] =
5640 getVGPROpMapping(Reg: MI.getOperand(i: 5).getReg(), MRI, TRI: *TRI); // %data1
5641 break;
5642 }
5643 case Intrinsic::amdgcn_s_sleep_var:
5644 OpdsMapping[1] = getSGPROpMapping(Reg: MI.getOperand(i: 1).getReg(), MRI, TRI: *TRI);
5645 break;
5646 case Intrinsic::amdgcn_s_barrier_join:
5647 case Intrinsic::amdgcn_s_wakeup_barrier:
5648 OpdsMapping[1] = getSGPROpMapping(Reg: MI.getOperand(i: 1).getReg(), MRI, TRI: *TRI);
5649 break;
5650 case Intrinsic::amdgcn_s_barrier_init:
5651 case Intrinsic::amdgcn_s_barrier_signal_var:
5652 OpdsMapping[1] = getSGPROpMapping(Reg: MI.getOperand(i: 1).getReg(), MRI, TRI: *TRI);
5653 OpdsMapping[2] = getSGPROpMapping(Reg: MI.getOperand(i: 2).getReg(), MRI, TRI: *TRI);
5654 break;
5655 case Intrinsic::amdgcn_s_barrier_signal_isfirst: {
5656 const unsigned ResultSize = 1;
5657 OpdsMapping[0] =
5658 AMDGPU::getValueMapping(BankID: AMDGPU::SGPRRegBankID, Size: ResultSize);
5659 break;
5660 }
5661 case Intrinsic::amdgcn_s_get_barrier_state:
5662 case Intrinsic::amdgcn_s_get_named_barrier_state: {
5663 OpdsMapping[0] = getSGPROpMapping(Reg: MI.getOperand(i: 0).getReg(), MRI, TRI: *TRI);
5664 OpdsMapping[2] = getSGPROpMapping(Reg: MI.getOperand(i: 2).getReg(), MRI, TRI: *TRI);
5665 break;
5666 }
5667 case Intrinsic::amdgcn_pops_exiting_wave_id:
5668 return getDefaultMappingSOP(MI);
5669 case Intrinsic::amdgcn_tensor_load_to_lds:
5670 case Intrinsic::amdgcn_tensor_store_from_lds: {
5671 // Lie and claim everything is legal, even all operands need to be
5672 // SGPRs. applyMapping will have to deal with it with readfirstlane.
5673 for (unsigned I = 1; I < MI.getNumOperands(); ++I) {
5674 if (MI.getOperand(i: I).isReg()) {
5675 Register Reg = MI.getOperand(i: I).getReg();
5676 auto OpBank = getRegBankID(Reg, MRI);
5677 unsigned Size = getSizeInBits(Reg, MRI, TRI: *TRI);
5678 OpdsMapping[I] = AMDGPU::getValueMapping(BankID: OpBank, Size);
5679 }
5680 }
5681 break;
5682 }
5683 case Intrinsic::amdgcn_s_prefetch_data:
5684 case Intrinsic::amdgcn_s_prefetch_inst: {
5685 OpdsMapping[1] = getSGPROpMapping(Reg: MI.getOperand(i: 1).getReg(), MRI, TRI: *TRI);
5686 OpdsMapping[2] = getSGPROpMapping(Reg: MI.getOperand(i: 2).getReg(), MRI, TRI: *TRI);
5687 break;
5688 }
5689 case Intrinsic::amdgcn_flat_prefetch:
5690 case Intrinsic::amdgcn_global_prefetch:
5691 return getDefaultMappingVOP(MI);
5692 default:
5693 return getInvalidInstructionMapping();
5694 }
5695 break;
5696 }
5697 case AMDGPU::G_SELECT: {
5698 unsigned Size = MRI.getType(Reg: MI.getOperand(i: 0).getReg()).getSizeInBits();
5699 unsigned Op2Bank = getRegBankID(Reg: MI.getOperand(i: 2).getReg(), MRI,
5700 Default: AMDGPU::SGPRRegBankID);
5701 unsigned Op3Bank = getRegBankID(Reg: MI.getOperand(i: 3).getReg(), MRI,
5702 Default: AMDGPU::SGPRRegBankID);
5703 bool SGPRSrcs = Op2Bank == AMDGPU::SGPRRegBankID &&
5704 Op3Bank == AMDGPU::SGPRRegBankID;
5705
5706 unsigned CondBankDefault = SGPRSrcs ?
5707 AMDGPU::SGPRRegBankID : AMDGPU::VCCRegBankID;
5708 unsigned CondBank = getRegBankID(Reg: MI.getOperand(i: 1).getReg(), MRI,
5709 Default: CondBankDefault);
5710 if (CondBank == AMDGPU::SGPRRegBankID)
5711 CondBank = SGPRSrcs ? AMDGPU::SGPRRegBankID : AMDGPU::VCCRegBankID;
5712 else if (CondBank == AMDGPU::VGPRRegBankID)
5713 CondBank = AMDGPU::VCCRegBankID;
5714
5715 unsigned Bank = SGPRSrcs && CondBank == AMDGPU::SGPRRegBankID ?
5716 AMDGPU::SGPRRegBankID : AMDGPU::VGPRRegBankID;
5717
5718 assert(CondBank == AMDGPU::VCCRegBankID || CondBank == AMDGPU::SGPRRegBankID);
5719
5720 // TODO: Should report 32-bit for scalar condition type.
5721 if (Size == 64) {
5722 OpdsMapping[0] = AMDGPU::getValueMappingSGPR64Only(BankID: Bank, Size);
5723 OpdsMapping[1] = AMDGPU::getValueMapping(BankID: CondBank, Size: 1);
5724 OpdsMapping[2] = AMDGPU::getValueMappingSGPR64Only(BankID: Bank, Size);
5725 OpdsMapping[3] = AMDGPU::getValueMappingSGPR64Only(BankID: Bank, Size);
5726 } else {
5727 OpdsMapping[0] = AMDGPU::getValueMapping(BankID: Bank, Size);
5728 OpdsMapping[1] = AMDGPU::getValueMapping(BankID: CondBank, Size: 1);
5729 OpdsMapping[2] = AMDGPU::getValueMapping(BankID: Bank, Size);
5730 OpdsMapping[3] = AMDGPU::getValueMapping(BankID: Bank, Size);
5731 }
5732
5733 break;
5734 }
5735
5736 case AMDGPU::G_SI_CALL: {
5737 OpdsMapping[0] = AMDGPU::getValueMapping(BankID: AMDGPU::SGPRRegBankID, Size: 64);
5738 // Lie and claim everything is legal, even though some need to be
5739 // SGPRs. applyMapping will have to deal with it as a waterfall loop.
5740 OpdsMapping[1] = getSGPROpMapping(Reg: MI.getOperand(i: 1).getReg(), MRI, TRI: *TRI);
5741
5742 // Allow anything for implicit arguments
5743 for (unsigned I = 4; I < MI.getNumOperands(); ++I) {
5744 if (MI.getOperand(i: I).isReg()) {
5745 Register Reg = MI.getOperand(i: I).getReg();
5746 auto OpBank = getRegBankID(Reg, MRI);
5747 unsigned Size = getSizeInBits(Reg, MRI, TRI: *TRI);
5748 OpdsMapping[I] = AMDGPU::getValueMapping(BankID: OpBank, Size);
5749 }
5750 }
5751 break;
5752 }
5753 case AMDGPU::G_LOAD:
5754 case AMDGPU::G_ZEXTLOAD:
5755 case AMDGPU::G_SEXTLOAD:
5756 return getInstrMappingForLoad(MI);
5757
5758 case AMDGPU::G_ATOMICRMW_XCHG:
5759 case AMDGPU::G_ATOMICRMW_ADD:
5760 case AMDGPU::G_ATOMICRMW_SUB:
5761 case AMDGPU::G_ATOMICRMW_AND:
5762 case AMDGPU::G_ATOMICRMW_OR:
5763 case AMDGPU::G_ATOMICRMW_XOR:
5764 case AMDGPU::G_ATOMICRMW_MAX:
5765 case AMDGPU::G_ATOMICRMW_MIN:
5766 case AMDGPU::G_ATOMICRMW_UMAX:
5767 case AMDGPU::G_ATOMICRMW_UMIN:
5768 case AMDGPU::G_ATOMICRMW_FADD:
5769 case AMDGPU::G_ATOMICRMW_FMIN:
5770 case AMDGPU::G_ATOMICRMW_FMAX:
5771 case AMDGPU::G_ATOMICRMW_UINC_WRAP:
5772 case AMDGPU::G_ATOMICRMW_UDEC_WRAP:
5773 case AMDGPU::G_ATOMICRMW_USUB_COND:
5774 case AMDGPU::G_ATOMICRMW_USUB_SAT:
5775 case AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG: {
5776 OpdsMapping[0] = getVGPROpMapping(Reg: MI.getOperand(i: 0).getReg(), MRI, TRI: *TRI);
5777 OpdsMapping[1] = getValueMappingForPtr(MRI, PtrReg: MI.getOperand(i: 1).getReg());
5778 OpdsMapping[2] = getVGPROpMapping(Reg: MI.getOperand(i: 2).getReg(), MRI, TRI: *TRI);
5779 break;
5780 }
5781 case AMDGPU::G_ATOMIC_CMPXCHG: {
5782 OpdsMapping[0] = getVGPROpMapping(Reg: MI.getOperand(i: 0).getReg(), MRI, TRI: *TRI);
5783 OpdsMapping[1] = getValueMappingForPtr(MRI, PtrReg: MI.getOperand(i: 1).getReg());
5784 OpdsMapping[2] = getVGPROpMapping(Reg: MI.getOperand(i: 2).getReg(), MRI, TRI: *TRI);
5785 OpdsMapping[3] = getVGPROpMapping(Reg: MI.getOperand(i: 3).getReg(), MRI, TRI: *TRI);
5786 break;
5787 }
5788 case AMDGPU::G_BRCOND: {
5789 unsigned Bank = getRegBankID(Reg: MI.getOperand(i: 0).getReg(), MRI,
5790 Default: AMDGPU::SGPRRegBankID);
5791 assert(MRI.getType(MI.getOperand(0).getReg()).getSizeInBits() == 1);
5792 if (Bank != AMDGPU::SGPRRegBankID)
5793 Bank = AMDGPU::VCCRegBankID;
5794
5795 OpdsMapping[0] = AMDGPU::getValueMapping(BankID: Bank, Size: 1);
5796 break;
5797 }
5798 case AMDGPU::G_INTRINSIC_FPTRUNC_ROUND:
5799 return getDefaultMappingVOP(MI);
5800 case AMDGPU::G_PREFETCH:
5801 OpdsMapping[0] = getSGPROpMapping(Reg: MI.getOperand(i: 0).getReg(), MRI, TRI: *TRI);
5802 break;
5803 case AMDGPU::G_AMDGPU_WHOLE_WAVE_FUNC_SETUP:
5804 case AMDGPU::G_AMDGPU_WHOLE_WAVE_FUNC_RETURN:
5805 OpdsMapping[0] = AMDGPU::getValueMapping(BankID: AMDGPU::VCCRegBankID, Size: 1);
5806 break;
5807 case AMDGPU::G_AMDGPU_FLAT_LOAD_MONITOR:
5808 case AMDGPU::G_AMDGPU_GLOBAL_LOAD_MONITOR: {
5809 unsigned Size = getSizeInBits(Reg: MI.getOperand(i: 0).getReg(), MRI, TRI: *TRI);
5810 unsigned PtrSize = getSizeInBits(Reg: MI.getOperand(i: 1).getReg(), MRI, TRI: *TRI);
5811 OpdsMapping[0] = AMDGPU::getValueMapping(BankID: AMDGPU::VGPRRegBankID, Size);
5812 OpdsMapping[1] = AMDGPU::getValueMapping(BankID: AMDGPU::VGPRRegBankID, Size: PtrSize);
5813 break;
5814 }
5815 }
5816
5817 return getInstructionMapping(/*ID*/1, /*Cost*/1,
5818 OperandsMapping: getOperandsMapping(OpdsMapping),
5819 NumOperands: MI.getNumOperands());
5820}
5821