//===- AMDGPURegisterBankInfo.cpp -------------------------------*- C++ -*-==//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
/// \file
/// This file implements the targeting of the RegisterBankInfo class for
/// AMDGPU.
///
/// \par
///
/// AMDGPU has unique register bank constraints that require special high level
/// strategies to deal with. There are two main true physical register banks,
/// VGPR (vector) and SGPR (scalar). Additionally, the VCC register bank is a
/// sort of pseudo-register bank needed to represent SGPRs used in a vector
/// boolean context. There is also the AGPR bank, which is a special purpose
/// physical register bank present on some subtargets.
///
/// Copying from VGPR to SGPR is generally illegal, unless the value is known to
/// be uniform. It is generally not valid to legalize operands by inserting
/// copies as on other targets. Operations which require uniform SGPR operands
/// generally require scalarization by repeatedly executing the instruction,
/// activating each set of lanes using a unique set of input values. This is
/// referred to as a waterfall loop.
///
/// \par Booleans
///
/// Booleans (s1 values) require special consideration. A vector compare result
/// is naturally a bitmask with one bit per lane, in a 32- or 64-bit
/// register. These are represented with the VCC bank. During selection, we need
/// to be able to unambiguously go back from a register class to a register
/// bank. To distinguish whether an SGPR should use the SGPR or VCC register
/// bank, we need to know the use context type. An SGPR-class register with an
/// s1 type always means a VCC bank value; any other type uses the SGPR bank. A
/// scalar compare sets SCC, which is a 1-bit unaddressable register. This will
/// need to be copied to a 32-bit virtual register. Taken together, this means
/// we need to adjust the type of boolean operations to be regbank legal. All
/// SALU booleans need to be widened to 32 bits, and all VALU booleans need to
/// be s1 values.
///
/// A noteworthy exception to the s1-means-vcc rule is for legalization artifact
/// casts. G_TRUNC s1 results and G_SEXT/G_ZEXT/G_ANYEXT sources are never vcc
/// bank. A non-boolean source (such as a truncate from a 1-bit load from
/// memory) will require a copy to the VCC bank, which will require clearing the
/// high bits and inserting a compare.
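///
/// As a minimal sketch of such a copy (assuming a MachineIRBuilder \c B and a
/// 32-bit VGPR value \c Loaded holding the 1-bit load result in its low bit;
/// the names here are illustrative, not taken from this file):
///
/// \code
///   const LLT S32 = LLT::scalar(32);
///   const LLT S1 = LLT::scalar(1);
///   auto Masked = B.buildAnd(S32, Loaded, B.buildConstant(S32, 1));
///   auto Bool = B.buildICmp(CmpInst::ICMP_NE, S1, Masked,
///                           B.buildConstant(S32, 0)); // lands in the VCC bank
/// \endcode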
///
/// \par Constant bus restriction
///
/// VALU instructions have a limitation known as the constant bus
/// restriction. Most VALU instructions can use SGPR operands, but may read at
/// most 1 SGPR or constant literal value (this was raised to 2 in gfx10 for
/// most instructions). The limit counts unique SGPRs, so the same SGPR may be
/// used for multiple operands. From a register bank perspective, any
/// combination of operands should be legal as an SGPR, but this is contextually
/// dependent on the SGPR operands all being the same register. It is therefore
/// optimal to choose the SGPR with the most uses to minimize the number of
/// copies.
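///
/// For illustration (the assembly is schematic; exact operand rules vary by
/// encoding and subtarget):
///
/// \code
///   v_fma_f32 v0, s1, v2, s1   ; ok: one unique SGPR (s1 read twice)
///   v_fma_f32 v0, s1, v2, s2   ; pre-gfx10: two unique SGPRs, so one operand
///                              ; must first be copied to a VGPR
/// \endcode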
///
/// We avoid trying to solve this problem in RegBankSelect. Any VALU G_*
/// operation should have its source operands all mapped to VGPRs (except for
/// VCC), inserting copies from any SGPR operands. This is the most trivial
/// legal mapping. Anything beyond the simplest 1:1 instruction selection would
/// be too complicated to solve here. Every optimization pattern or instruction
/// selected to multiple outputs would have to enforce this rule, and there
/// would be additional complexity in tracking this rule for every G_*
/// operation. By forcing all inputs to VGPRs, it also simplifies the task of
/// picking the optimal operand combination from a post-isel optimization pass.
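///
/// As a schematic example (generic MIR shown loosely, not exact printer
/// output), applying the VGPR mapping to a divergent G_ADD whose second input
/// was assigned to the SGPR bank simply inserts a copy:
///
/// \code
///   %2:vgpr(s32) = COPY %1:sgpr(s32)
///   %3:vgpr(s32) = G_ADD %0:vgpr(s32), %2:vgpr(s32)
/// \endcode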
///
//===----------------------------------------------------------------------===//

#include "AMDGPURegisterBankInfo.h"

#include "AMDGPU.h"
#include "AMDGPUGlobalISelUtils.h"
#include "AMDGPUInstrInfo.h"
#include "AMDGPULaneMaskUtils.h"
#include "GCNSubtarget.h"
#include "SIMachineFunctionInfo.h"
#include "SIRegisterInfo.h"
#include "llvm/CodeGen/GlobalISel/GenericMachineInstrs.h"
#include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
#include "llvm/CodeGen/RegisterBank.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"

#define GET_TARGET_REGBANK_IMPL
#include "AMDGPUGenRegisterBank.inc"

// This file will be TableGen'ed at some point.
#include "AMDGPUGenRegisterBankInfo.def"

using namespace llvm;
using namespace MIPatternMatch;

namespace {

// Observer to apply a register bank to new registers created by
// LegalizerHelper.
class ApplyRegBankMapping final : public GISelChangeObserver {
private:
  MachineIRBuilder &B;
  const AMDGPURegisterBankInfo &RBI;
  MachineRegisterInfo &MRI;
  const RegisterBank *NewBank;
  SmallVector<MachineInstr *, 4> NewInsts;

public:
  ApplyRegBankMapping(MachineIRBuilder &B, const AMDGPURegisterBankInfo &RBI_,
                      MachineRegisterInfo &MRI_, const RegisterBank *RB)
      : B(B), RBI(RBI_), MRI(MRI_), NewBank(RB) {
    assert(!B.isObservingChanges());
    B.setChangeObserver(*this);
  }

  ~ApplyRegBankMapping() override {
    for (MachineInstr *MI : NewInsts)
      applyBank(*MI);

    B.stopObservingChanges();
  }

  /// Set any registers that don't have a set register class or bank to SALU.
  void applyBank(MachineInstr &MI) {
    const unsigned Opc = MI.getOpcode();
    if (Opc == AMDGPU::G_ANYEXT || Opc == AMDGPU::G_ZEXT ||
        Opc == AMDGPU::G_SEXT) {
      // LegalizerHelper wants to use the basic legalization artifacts when
      // widening etc. We don't handle selection with vcc in artifact sources,
      // so we need to use a select instead to handle these properly.
      Register DstReg = MI.getOperand(0).getReg();
      Register SrcReg = MI.getOperand(1).getReg();
      const RegisterBank *SrcBank = RBI.getRegBank(SrcReg, MRI, *RBI.TRI);
      if (SrcBank == &AMDGPU::VCCRegBank) {
        const LLT S32 = LLT::scalar(32);
        assert(MRI.getType(SrcReg) == LLT::scalar(1));
        assert(MRI.getType(DstReg) == S32);
        assert(NewBank == &AMDGPU::VGPRRegBank);

        // Replace the extension with a select, which really uses the boolean
        // source.
        B.setInsertPt(*MI.getParent(), MI);

        auto True = B.buildConstant(S32, Opc == AMDGPU::G_SEXT ? -1 : 1);
        auto False = B.buildConstant(S32, 0);
        B.buildSelect(DstReg, SrcReg, True, False);
        MRI.setRegBank(True.getReg(0), *NewBank);
        MRI.setRegBank(False.getReg(0), *NewBank);
        MI.eraseFromParent();
      }

      assert(!MRI.getRegClassOrRegBank(DstReg));
      MRI.setRegBank(DstReg, *NewBank);
      return;
    }

#ifndef NDEBUG
    if (Opc == AMDGPU::G_TRUNC) {
      Register DstReg = MI.getOperand(0).getReg();
      const RegisterBank *DstBank = RBI.getRegBank(DstReg, MRI, *RBI.TRI);
      assert(DstBank != &AMDGPU::VCCRegBank);
    }
#endif

    for (MachineOperand &Op : MI.operands()) {
      if (!Op.isReg())
        continue;

      // We may see physical registers if building a real MI
      Register Reg = Op.getReg();
      if (Reg.isPhysical() || MRI.getRegClassOrRegBank(Reg))
        continue;

      const RegisterBank *RB = NewBank;
      if (MRI.getType(Reg) == LLT::scalar(1)) {
        assert(NewBank == &AMDGPU::VGPRRegBank &&
               "s1 operands should only be used for vector bools");
        assert((MI.getOpcode() != AMDGPU::G_TRUNC &&
                MI.getOpcode() != AMDGPU::G_ANYEXT) &&
               "not expecting legalization artifacts here");
        RB = &AMDGPU::VCCRegBank;
      }

      MRI.setRegBank(Reg, *RB);
    }
  }

  void erasingInstr(MachineInstr &MI) override {}

  void createdInstr(MachineInstr &MI) override {
    // At this point, the instruction was just inserted and has no operands.
    NewInsts.push_back(&MI);
  }

  void changingInstr(MachineInstr &MI) override {}
  void changedInstr(MachineInstr &MI) override {
    // FIXME: In principle we should probably add the instruction to NewInsts,
    // but the way the LegalizerHelper uses the observer, we will always see the
    // registers we need to set the regbank on also referenced in a new
    // instruction.
  }
};

} // anonymous namespace

AMDGPURegisterBankInfo::AMDGPURegisterBankInfo(const GCNSubtarget &ST)
    : Subtarget(ST), TRI(Subtarget.getRegisterInfo()),
      TII(Subtarget.getInstrInfo()) {

  // HACK: Until this is fully tablegen'd.
  static llvm::once_flag InitializeRegisterBankFlag;

  static auto InitializeRegisterBankOnce = [this]() {
    assert(&getRegBank(AMDGPU::SGPRRegBankID) == &AMDGPU::SGPRRegBank &&
           &getRegBank(AMDGPU::VGPRRegBankID) == &AMDGPU::VGPRRegBank &&
           &getRegBank(AMDGPU::AGPRRegBankID) == &AMDGPU::AGPRRegBank);
    (void)this;
  };

  llvm::call_once(InitializeRegisterBankFlag, InitializeRegisterBankOnce);
}

static bool isVectorRegisterBank(const RegisterBank &Bank) {
  unsigned BankID = Bank.getID();
  return BankID == AMDGPU::VGPRRegBankID || BankID == AMDGPU::AGPRRegBankID;
}

bool AMDGPURegisterBankInfo::isDivergentRegBank(const RegisterBank *RB) const {
  return RB != &AMDGPU::SGPRRegBank;
}

unsigned AMDGPURegisterBankInfo::copyCost(const RegisterBank &Dst,
                                          const RegisterBank &Src,
                                          TypeSize Size) const {
  // TODO: Should there be a UniformVGPRRegBank which can use readfirstlane?
  if (Dst.getID() == AMDGPU::SGPRRegBankID &&
      (isVectorRegisterBank(Src) || Src.getID() == AMDGPU::VCCRegBankID)) {
    return std::numeric_limits<unsigned>::max();
  }

  // Bool values are tricky, because the meaning is based on context. The SCC
  // and VCC banks are for the natural scalar and vector conditions produced by
  // a compare.
  //
  // Legalization doesn't know about the necessary context, so an s1 use may
  // have been a truncate from an arbitrary value, in which case a copy (lowered
  // as a compare with 0) needs to be inserted.
  if (Size == 1 &&
      (Dst.getID() == AMDGPU::SGPRRegBankID) &&
      (isVectorRegisterBank(Src) ||
       Src.getID() == AMDGPU::SGPRRegBankID ||
       Src.getID() == AMDGPU::VCCRegBankID))
    return std::numeric_limits<unsigned>::max();

  // There is no direct copy between AGPRs.
  if (Dst.getID() == AMDGPU::AGPRRegBankID &&
      Src.getID() == AMDGPU::AGPRRegBankID)
    return 4;

  return RegisterBankInfo::copyCost(Dst, Src, Size);
}

unsigned AMDGPURegisterBankInfo::getBreakDownCost(
  const ValueMapping &ValMapping,
  const RegisterBank *CurBank) const {
  // Check if this is a breakdown for G_LOAD to move the pointer from SGPR to
  // VGPR.
  // FIXME: Is there a better way to do this?
  if (ValMapping.NumBreakDowns >= 2 || ValMapping.BreakDown[0].Length >= 64)
    return 10; // This is expensive.

  assert(ValMapping.NumBreakDowns == 2 &&
         ValMapping.BreakDown[0].Length == 32 &&
         ValMapping.BreakDown[0].StartIdx == 0 &&
         ValMapping.BreakDown[1].Length == 32 &&
         ValMapping.BreakDown[1].StartIdx == 32 &&
         ValMapping.BreakDown[0].RegBank == ValMapping.BreakDown[1].RegBank);

  // 32-bit extract of a 64-bit value is just access of a subregister, so free.
  // TODO: Cost of 0 hits assert, though it's not clear it's what we really
  // want.

  // TODO: 32-bit insert to a 64-bit SGPR may incur a non-free copy due to SGPR
  // alignment restrictions, but this probably isn't important.
  return 1;
}

const RegisterBank &
AMDGPURegisterBankInfo::getRegBankFromRegClass(const TargetRegisterClass &RC,
                                               LLT Ty) const {
  // We promote real scalar booleans to SReg_32. Any SGPR using s1 is really a
  // VCC-like use.
  if (TRI->isSGPRClass(&RC)) {
    // FIXME: This probably came from a copy from a physical register, which
    // should be inferable from the copied to-type. We don't have many boolean
    // physical register constraints so just assume a normal SGPR for now.
    if (!Ty.isValid())
      return AMDGPU::SGPRRegBank;

    return Ty == LLT::scalar(1) ? AMDGPU::VCCRegBank : AMDGPU::SGPRRegBank;
  }

  return TRI->isAGPRClass(&RC) ? AMDGPU::AGPRRegBank : AMDGPU::VGPRRegBank;
}
304
305template <unsigned NumOps>
306RegisterBankInfo::InstructionMappings
307AMDGPURegisterBankInfo::addMappingFromTable(
308 const MachineInstr &MI, const MachineRegisterInfo &MRI,
309 const std::array<unsigned, NumOps> RegSrcOpIdx,
310 ArrayRef<OpRegBankEntry<NumOps>> Table) const {
311
312 InstructionMappings AltMappings;
313
314 SmallVector<const ValueMapping *, 10> Operands(MI.getNumOperands());
315
316 unsigned Sizes[NumOps];
317 for (unsigned I = 0; I < NumOps; ++I) {
318 Register Reg = MI.getOperand(RegSrcOpIdx[I]).getReg();
319 Sizes[I] = getSizeInBits(Reg, MRI, TRI: *TRI);
320 }
321
322 for (unsigned I = 0, E = MI.getNumExplicitDefs(); I != E; ++I) {
323 unsigned SizeI = getSizeInBits(Reg: MI.getOperand(i: I).getReg(), MRI, TRI: *TRI);
324 Operands[I] = AMDGPU::getValueMapping(BankID: AMDGPU::VGPRRegBankID, Size: SizeI);
325 }
326
327 // getInstrMapping's default mapping uses ID 1, so start at 2.
328 unsigned MappingID = 2;
329 for (const auto &Entry : Table) {
330 for (unsigned I = 0; I < NumOps; ++I) {
331 int OpIdx = RegSrcOpIdx[I];
332 Operands[OpIdx] = AMDGPU::getValueMapping(BankID: Entry.RegBanks[I], Size: Sizes[I]);
333 }
334
335 AltMappings.push_back(Elt: &getInstructionMapping(ID: MappingID++, Cost: Entry.Cost,
336 OperandsMapping: getOperandsMapping(OpdsMapping: Operands),
337 NumOperands: Operands.size()));
338 }
339
340 return AltMappings;
341}
342
343RegisterBankInfo::InstructionMappings
344AMDGPURegisterBankInfo::getInstrAlternativeMappingsIntrinsic(
345 const MachineInstr &MI, const MachineRegisterInfo &MRI) const {
346 switch (cast<GIntrinsic>(Val: MI).getIntrinsicID()) {
347 case Intrinsic::amdgcn_readlane: {
348 static const OpRegBankEntry<3> Table[2] = {
349 // Perfectly legal.
350 { .RegBanks: { AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID }, .Cost: 1 },
351
352 // Need a readfirstlane for the index.
353 { .RegBanks: { AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID }, .Cost: 2 }
354 };
355
356 const std::array<unsigned, 3> RegSrcOpIdx = { ._M_elems: { 0, 2, 3 } };
357 return addMappingFromTable<3>(MI, MRI, RegSrcOpIdx, Table);
358 }
359 case Intrinsic::amdgcn_writelane: {
360 static const OpRegBankEntry<4> Table[4] = {
361 // Perfectly legal.
362 { .RegBanks: { AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID, AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID }, .Cost: 1 },
363
364 // Need readfirstlane of first op
365 { .RegBanks: { AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID }, .Cost: 2 },
366
367 // Need readfirstlane of second op
368 { .RegBanks: { AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID }, .Cost: 2 },
369
370 // Need readfirstlane of both ops
371 { .RegBanks: { AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID }, .Cost: 3 }
372 };
373
374 // rsrc, voffset, offset
375 const std::array<unsigned, 4> RegSrcOpIdx = { ._M_elems: { 0, 2, 3, 4 } };
376 return addMappingFromTable<4>(MI, MRI, RegSrcOpIdx, Table);
377 }
378 default:
379 return RegisterBankInfo::getInstrAlternativeMappings(MI);
380 }
381}
382
383RegisterBankInfo::InstructionMappings
384AMDGPURegisterBankInfo::getInstrAlternativeMappingsIntrinsicWSideEffects(
385 const MachineInstr &MI, const MachineRegisterInfo &MRI) const {
386
387 switch (cast<GIntrinsic>(Val: MI).getIntrinsicID()) {
388 case Intrinsic::amdgcn_s_buffer_load: {
389 static const OpRegBankEntry<2> Table[4] = {
390 // Perfectly legal.
391 { .RegBanks: { AMDGPU::SGPRRegBankID, AMDGPU::SGPRRegBankID }, .Cost: 1 },
392
393 // Only need 1 register in loop
394 { .RegBanks: { AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID }, .Cost: 300 },
395
396 // Have to waterfall the resource.
397 { .RegBanks: { AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID }, .Cost: 1000 },
398
399 // Have to waterfall the resource, and the offset.
400 { .RegBanks: { AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID }, .Cost: 1500 }
401 };
402
403 // rsrc, offset
404 const std::array<unsigned, 2> RegSrcOpIdx = { ._M_elems: { 2, 3 } };
405 return addMappingFromTable<2>(MI, MRI, RegSrcOpIdx, Table);
406 }
407 case Intrinsic::amdgcn_ds_ordered_add:
408 case Intrinsic::amdgcn_ds_ordered_swap: {
409 // VGPR = M0, VGPR
410 static const OpRegBankEntry<3> Table[2] = {
411 // Perfectly legal.
412 { .RegBanks: { AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID }, .Cost: 1 },
413
414 // Need a readfirstlane for m0
415 { .RegBanks: { AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID }, .Cost: 2 }
416 };
417
418 const std::array<unsigned, 3> RegSrcOpIdx = { ._M_elems: { 0, 2, 3 } };
419 return addMappingFromTable<3>(MI, MRI, RegSrcOpIdx, Table);
420 }
421 case Intrinsic::amdgcn_s_sendmsg:
422 case Intrinsic::amdgcn_s_sendmsghalt: {
423 // FIXME: Should have no register for immediate
424 static const OpRegBankEntry<1> Table[2] = {
425 // Perfectly legal.
426 { .RegBanks: { AMDGPU::SGPRRegBankID }, .Cost: 1 },
427
428 // Need readlane
429 { .RegBanks: { AMDGPU::VGPRRegBankID }, .Cost: 3 }
430 };
431
432 const std::array<unsigned, 1> RegSrcOpIdx = { ._M_elems: { 2 } };
433 return addMappingFromTable<1>(MI, MRI, RegSrcOpIdx, Table);
434 }
435 default:
436 return RegisterBankInfo::getInstrAlternativeMappings(MI);
437 }
438}
439
440// FIXME: Returns uniform if there's no source value information. This is
441// probably wrong.
442bool AMDGPURegisterBankInfo::isScalarLoadLegal(const MachineInstr &MI) const {
443 if (!MI.hasOneMemOperand())
444 return false;
445
446 const MachineMemOperand *MMO = *MI.memoperands_begin();
447 const unsigned AS = MMO->getAddrSpace();
448 const bool IsConst = AS == AMDGPUAS::CONSTANT_ADDRESS ||
449 AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT;
450 const unsigned MemSize = 8 * MMO->getSize().getValue();
451
452 // Require 4-byte alignment.
453 return (MMO->getAlign() >= Align(4) ||
454 (Subtarget.hasScalarSubwordLoads() &&
455 ((MemSize == 16 && MMO->getAlign() >= Align(2)) ||
456 (MemSize == 8 && MMO->getAlign() >= Align(1))))) &&
457 // Can't do a scalar atomic load.
458 !MMO->isAtomic() &&
459 // Don't use scalar loads for volatile accesses to non-constant address
460 // spaces.
461 (IsConst || !MMO->isVolatile()) &&
462 // Memory must be known constant, or not written before this load.
463 (IsConst || MMO->isInvariant() || (MMO->getFlags() & MONoClobber)) &&
464 AMDGPU::isUniformMMO(MMO);
465}
466
467RegisterBankInfo::InstructionMappings
468AMDGPURegisterBankInfo::getInstrAlternativeMappings(
469 const MachineInstr &MI) const {
470
471 const MachineFunction &MF = *MI.getMF();
472 const MachineRegisterInfo &MRI = MF.getRegInfo();
473
474
475 InstructionMappings AltMappings;
476 switch (MI.getOpcode()) {
477 case TargetOpcode::G_CONSTANT:
478 case TargetOpcode::G_IMPLICIT_DEF: {
479 unsigned Size = getSizeInBits(Reg: MI.getOperand(i: 0).getReg(), MRI, TRI: *TRI);
480 if (Size == 1) {
481 static const OpRegBankEntry<1> Table[3] = {
482 { .RegBanks: { AMDGPU::VGPRRegBankID }, .Cost: 1 },
483 { .RegBanks: { AMDGPU::SGPRRegBankID }, .Cost: 1 },
484 { .RegBanks: { AMDGPU::VCCRegBankID }, .Cost: 1 }
485 };
486
487 return addMappingFromTable<1>(MI, MRI, RegSrcOpIdx: {._M_elems: { 0 }}, Table);
488 }
489
490 [[fallthrough]];
491 }
492 case TargetOpcode::G_FCONSTANT:
493 case TargetOpcode::G_FRAME_INDEX:
494 case TargetOpcode::G_GLOBAL_VALUE: {
495 static const OpRegBankEntry<1> Table[2] = {
496 { .RegBanks: { AMDGPU::VGPRRegBankID }, .Cost: 1 },
497 { .RegBanks: { AMDGPU::SGPRRegBankID }, .Cost: 1 }
498 };
499
500 return addMappingFromTable<1>(MI, MRI, RegSrcOpIdx: {._M_elems: { 0 }}, Table);
501 }
502 case TargetOpcode::G_AND:
503 case TargetOpcode::G_OR:
504 case TargetOpcode::G_XOR: {
505 unsigned Size = getSizeInBits(Reg: MI.getOperand(i: 0).getReg(), MRI, TRI: *TRI);
506
507 if (Size == 1) {
508 // s_{and|or|xor}_b32 set scc when the result of the 32-bit op is not 0.
509 const InstructionMapping &SCCMapping = getInstructionMapping(
510 ID: 1, Cost: 1, OperandsMapping: getOperandsMapping(
511 OpdsMapping: {AMDGPU::getValueMapping(BankID: AMDGPU::SGPRRegBankID, Size: 32),
512 AMDGPU::getValueMapping(BankID: AMDGPU::SGPRRegBankID, Size: 32),
513 AMDGPU::getValueMapping(BankID: AMDGPU::SGPRRegBankID, Size: 32)}),
514 NumOperands: 3); // Num Operands
515 AltMappings.push_back(Elt: &SCCMapping);
516
517 const InstructionMapping &VCCMapping0 = getInstructionMapping(
518 ID: 2, Cost: 1, OperandsMapping: getOperandsMapping(
519 OpdsMapping: {AMDGPU::getValueMapping(BankID: AMDGPU::VCCRegBankID, Size),
520 AMDGPU::getValueMapping(BankID: AMDGPU::VCCRegBankID, Size),
521 AMDGPU::getValueMapping(BankID: AMDGPU::VCCRegBankID, Size)}),
522 NumOperands: 3); // Num Operands
523 AltMappings.push_back(Elt: &VCCMapping0);
524 return AltMappings;
525 }
526
527 if (Size != 64)
528 break;
529
530 const InstructionMapping &SSMapping = getInstructionMapping(
531 ID: 1, Cost: 1, OperandsMapping: getOperandsMapping(
532 OpdsMapping: {AMDGPU::getValueMapping(BankID: AMDGPU::SGPRRegBankID, Size),
533 AMDGPU::getValueMapping(BankID: AMDGPU::SGPRRegBankID, Size),
534 AMDGPU::getValueMapping(BankID: AMDGPU::SGPRRegBankID, Size)}),
535 NumOperands: 3); // Num Operands
536 AltMappings.push_back(Elt: &SSMapping);
537
538 const InstructionMapping &VVMapping = getInstructionMapping(
539 ID: 2, Cost: 2, OperandsMapping: getOperandsMapping(
540 OpdsMapping: {AMDGPU::getValueMappingSGPR64Only(BankID: AMDGPU::VGPRRegBankID, Size),
541 AMDGPU::getValueMappingSGPR64Only(BankID: AMDGPU::VGPRRegBankID, Size),
542 AMDGPU::getValueMappingSGPR64Only(BankID: AMDGPU::VGPRRegBankID, Size)}),
543 NumOperands: 3); // Num Operands
544 AltMappings.push_back(Elt: &VVMapping);
545 break;
546 }
547 case TargetOpcode::G_LOAD:
548 case TargetOpcode::G_ZEXTLOAD:
549 case TargetOpcode::G_SEXTLOAD: {
550 unsigned Size = getSizeInBits(Reg: MI.getOperand(i: 0).getReg(), MRI, TRI: *TRI);
551 LLT PtrTy = MRI.getType(Reg: MI.getOperand(i: 1).getReg());
552 unsigned PtrSize = PtrTy.getSizeInBits();
553 unsigned AS = PtrTy.getAddressSpace();
554
555 if ((AS != AMDGPUAS::LOCAL_ADDRESS && AS != AMDGPUAS::REGION_ADDRESS &&
556 AS != AMDGPUAS::PRIVATE_ADDRESS) &&
557 isScalarLoadLegal(MI)) {
558 const InstructionMapping &SSMapping = getInstructionMapping(
559 ID: 1, Cost: 1, OperandsMapping: getOperandsMapping(
560 OpdsMapping: {AMDGPU::getValueMapping(BankID: AMDGPU::SGPRRegBankID, Size),
561 AMDGPU::getValueMapping(BankID: AMDGPU::SGPRRegBankID, Size: PtrSize)}),
562 NumOperands: 2); // Num Operands
563 AltMappings.push_back(Elt: &SSMapping);
564 }
565
566 const InstructionMapping &VVMapping = getInstructionMapping(
567 ID: 2, Cost: 1,
568 OperandsMapping: getOperandsMapping(
569 OpdsMapping: {AMDGPU::getValueMapping(BankID: AMDGPU::VGPRRegBankID, Size),
570 AMDGPU::getValueMapping(BankID: AMDGPU::VGPRRegBankID, Size: PtrSize)}),
571 NumOperands: 2); // Num Operands
572 AltMappings.push_back(Elt: &VVMapping);
573
574 // It may be possible to have a vgpr = load sgpr mapping here, because
575 // the mubuf instructions support this kind of load, but probably for only
576 // gfx7 and older. However, the addressing mode matching in the instruction
577 // selector should be able to do a better job of detecting and selecting
578 // these kinds of loads from the vgpr = load vgpr mapping.
579
580 return AltMappings;
581
582 }
583 case TargetOpcode::G_SELECT: {
584 unsigned Size = getSizeInBits(Reg: MI.getOperand(i: 0).getReg(), MRI, TRI: *TRI);
585 const InstructionMapping &SSMapping = getInstructionMapping(ID: 1, Cost: 1,
586 OperandsMapping: getOperandsMapping(OpdsMapping: {AMDGPU::getValueMapping(BankID: AMDGPU::SGPRRegBankID, Size),
587 AMDGPU::getValueMapping(BankID: AMDGPU::SGPRRegBankID, Size: 1),
588 AMDGPU::getValueMapping(BankID: AMDGPU::SGPRRegBankID, Size),
589 AMDGPU::getValueMapping(BankID: AMDGPU::SGPRRegBankID, Size)}),
590 NumOperands: 4); // Num Operands
591 AltMappings.push_back(Elt: &SSMapping);
592
593 const InstructionMapping &VVMapping = getInstructionMapping(ID: 2, Cost: 1,
594 OperandsMapping: getOperandsMapping(OpdsMapping: {AMDGPU::getValueMappingSGPR64Only(BankID: AMDGPU::VGPRRegBankID, Size),
595 AMDGPU::getValueMapping(BankID: AMDGPU::VCCRegBankID, Size: 1),
596 AMDGPU::getValueMappingSGPR64Only(BankID: AMDGPU::VGPRRegBankID, Size),
597 AMDGPU::getValueMappingSGPR64Only(BankID: AMDGPU::VGPRRegBankID, Size)}),
598 NumOperands: 4); // Num Operands
599 AltMappings.push_back(Elt: &VVMapping);
600
601 return AltMappings;
602 }
603 case TargetOpcode::G_UADDE:
604 case TargetOpcode::G_USUBE:
605 case TargetOpcode::G_SADDE:
606 case TargetOpcode::G_SSUBE: {
607 unsigned Size = getSizeInBits(Reg: MI.getOperand(i: 0).getReg(), MRI, TRI: *TRI);
608 const InstructionMapping &SSMapping = getInstructionMapping(ID: 1, Cost: 1,
609 OperandsMapping: getOperandsMapping(
610 OpdsMapping: {AMDGPU::getValueMapping(BankID: AMDGPU::SGPRRegBankID, Size),
611 AMDGPU::getValueMapping(BankID: AMDGPU::SGPRRegBankID, Size: 1),
612 AMDGPU::getValueMapping(BankID: AMDGPU::SGPRRegBankID, Size),
613 AMDGPU::getValueMapping(BankID: AMDGPU::SGPRRegBankID, Size),
614 AMDGPU::getValueMapping(BankID: AMDGPU::SGPRRegBankID, Size: 1)}),
615 NumOperands: 5); // Num Operands
616 AltMappings.push_back(Elt: &SSMapping);
617
618 const InstructionMapping &VVMapping = getInstructionMapping(ID: 2, Cost: 1,
619 OperandsMapping: getOperandsMapping(OpdsMapping: {AMDGPU::getValueMapping(BankID: AMDGPU::VGPRRegBankID, Size),
620 AMDGPU::getValueMapping(BankID: AMDGPU::VCCRegBankID, Size: 1),
621 AMDGPU::getValueMapping(BankID: AMDGPU::VGPRRegBankID, Size),
622 AMDGPU::getValueMapping(BankID: AMDGPU::VGPRRegBankID, Size),
623 AMDGPU::getValueMapping(BankID: AMDGPU::VCCRegBankID, Size: 1)}),
624 NumOperands: 5); // Num Operands
625 AltMappings.push_back(Elt: &VVMapping);
626 return AltMappings;
627 }
628 case AMDGPU::G_BRCOND: {
629 assert(MRI.getType(MI.getOperand(0).getReg()).getSizeInBits() == 1);
630
631 // TODO: Change type to 32 for scalar
632 const InstructionMapping &SMapping = getInstructionMapping(
633 ID: 1, Cost: 1, OperandsMapping: getOperandsMapping(
634 OpdsMapping: {AMDGPU::getValueMapping(BankID: AMDGPU::SGPRRegBankID, Size: 1), nullptr}),
635 NumOperands: 2); // Num Operands
636 AltMappings.push_back(Elt: &SMapping);
637
638 const InstructionMapping &VMapping = getInstructionMapping(
639 ID: 1, Cost: 1, OperandsMapping: getOperandsMapping(
640 OpdsMapping: {AMDGPU::getValueMapping(BankID: AMDGPU::VCCRegBankID, Size: 1), nullptr }),
641 NumOperands: 2); // Num Operands
642 AltMappings.push_back(Elt: &VMapping);
643 return AltMappings;
644 }
645 case AMDGPU::G_INTRINSIC:
646 case AMDGPU::G_INTRINSIC_CONVERGENT:
647 return getInstrAlternativeMappingsIntrinsic(MI, MRI);
648 case AMDGPU::G_INTRINSIC_W_SIDE_EFFECTS:
649 case AMDGPU::G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS:
650 return getInstrAlternativeMappingsIntrinsicWSideEffects(MI, MRI);
651 default:
652 break;
653 }
654 return RegisterBankInfo::getInstrAlternativeMappings(MI);
655}
656
657void AMDGPURegisterBankInfo::split64BitValueForMapping(
658 MachineIRBuilder &B,
659 SmallVector<Register, 2> &Regs,
660 LLT HalfTy,
661 Register Reg) const {
662 assert(HalfTy.getSizeInBits() == 32);
663 MachineRegisterInfo *MRI = B.getMRI();
664 Register LoLHS = MRI->createGenericVirtualRegister(Ty: HalfTy);
665 Register HiLHS = MRI->createGenericVirtualRegister(Ty: HalfTy);
666 const RegisterBank *Bank = getRegBank(Reg, MRI: *MRI, TRI: *TRI);
667 MRI->setRegBank(Reg: LoLHS, RegBank: *Bank);
668 MRI->setRegBank(Reg: HiLHS, RegBank: *Bank);
669
670 Regs.push_back(Elt: LoLHS);
671 Regs.push_back(Elt: HiLHS);
672
673 B.buildInstr(Opcode: AMDGPU::G_UNMERGE_VALUES)
674 .addDef(RegNo: LoLHS)
675 .addDef(RegNo: HiLHS)
676 .addUse(RegNo: Reg);
677}
678
679/// Replace the current type each register in \p Regs has with \p NewTy
680static void setRegsToType(MachineRegisterInfo &MRI, ArrayRef<Register> Regs,
681 LLT NewTy) {
682 for (Register Reg : Regs) {
683 assert(MRI.getType(Reg).getSizeInBits() == NewTy.getSizeInBits());
684 MRI.setType(VReg: Reg, Ty: NewTy);
685 }
686}
687
688static LLT getHalfSizedType(LLT Ty) {
689 if (Ty.isVector()) {
690 assert(Ty.getElementCount().isKnownMultipleOf(2));
691 return LLT::scalarOrVector(EC: Ty.getElementCount().divideCoefficientBy(RHS: 2),
692 ScalarTy: Ty.getElementType());
693 }
694
695 assert(Ty.getScalarSizeInBits() % 2 == 0);
696 return LLT::scalar(SizeInBits: Ty.getScalarSizeInBits() / 2);
697}
698
// Build one or more V_READFIRSTLANE_B32 instructions to move the given vector
// source value into a scalar register.
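//
// For example, for a 64-bit VGPR source this builds roughly (schematic, not
// exact MIR):
//   %lo:vgpr(s32), %hi:vgpr(s32) = G_UNMERGE_VALUES %src:vgpr(s64)
//   %slo:sreg_32_xm0 = V_READFIRSTLANE_B32 %lo
//   %shi:sreg_32_xm0 = V_READFIRSTLANE_B32 %hi
//   %dst:sgpr(s64) = G_MERGE_VALUES %slo, %shi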
701Register AMDGPURegisterBankInfo::buildReadFirstLane(MachineIRBuilder &B,
702 MachineRegisterInfo &MRI,
703 Register Src) const {
704 LLT Ty = MRI.getType(Reg: Src);
705 const RegisterBank *Bank = getRegBank(Reg: Src, MRI, TRI: *TRI);
706
707 if (Bank == &AMDGPU::SGPRRegBank)
708 return Src;
709
710 unsigned Bits = Ty.getSizeInBits();
711 assert(Bits % 32 == 0);
712
713 if (Bank != &AMDGPU::VGPRRegBank) {
714 // We need to copy from AGPR to VGPR
715 Src = B.buildCopy(Res: Ty, Op: Src).getReg(Idx: 0);
716 MRI.setRegBank(Reg: Src, RegBank: AMDGPU::VGPRRegBank);
717 }
718
719 LLT S32 = LLT::scalar(SizeInBits: 32);
720 unsigned NumParts = Bits / 32;
721 SmallVector<Register, 8> SrcParts;
722 SmallVector<Register, 8> DstParts;
723
724 if (Bits == 32) {
725 SrcParts.push_back(Elt: Src);
726 } else {
727 auto Unmerge = B.buildUnmerge(Res: S32, Op: Src);
728 for (unsigned i = 0; i < NumParts; ++i)
729 SrcParts.push_back(Elt: Unmerge.getReg(Idx: i));
730 }
731
732 for (unsigned i = 0; i < NumParts; ++i) {
733 Register SrcPart = SrcParts[i];
734 Register DstPart = MRI.createVirtualRegister(RegClass: &AMDGPU::SReg_32_XM0RegClass);
735 MRI.setType(VReg: DstPart, Ty: NumParts == 1 ? Ty : S32);
736
737 const TargetRegisterClass *Constrained =
738 constrainGenericRegister(Reg: SrcPart, RC: AMDGPU::VGPR_32RegClass, MRI);
739 (void)Constrained;
740 assert(Constrained && "Failed to constrain readfirstlane src reg");
741
742 B.buildInstr(Opc: AMDGPU::V_READFIRSTLANE_B32, DstOps: {DstPart}, SrcOps: {SrcPart});
743
744 DstParts.push_back(Elt: DstPart);
745 }
746
747 if (Bits == 32)
748 return DstParts[0];
749
750 Register Dst = B.buildMergeLikeInstr(Res: Ty, Ops: DstParts).getReg(Idx: 0);
751 MRI.setRegBank(Reg: Dst, RegBank: AMDGPU::SGPRRegBank);
752 return Dst;
753}
754
/// Legalize instruction \p MI where operands in \p OpIndices must be SGPRs. If
/// any of the required SGPR operands are VGPRs, perform a waterfall loop to
/// execute the instruction for each unique combination of values in all lanes
/// in the wave. The block will be split such that the rest of the instructions
/// are moved to a new block.
///
/// Essentially performs this loop:
///
/// Save Execution Mask
/// For (Lane : Wavefront) {
///   Enable Lane, Disable all other lanes
///   SGPR = read SGPR value for current lane from VGPR
///   VGPRResult[Lane] = use_op SGPR
/// }
/// Restore Execution Mask
///
/// There is additional complexity: the values are compared so that the loop
/// body executes only once for each unique combination of values, rather than
/// once per lane.
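///
/// Schematically, the control flow produced below looks like this (block names
/// match the ones used in the implementation):
///
///   MBB:           save EXEC, fall through to the loop
///   LoopBB:        V_READFIRSTLANE_B32 each required operand, compare against
///                  the per-lane values, ballot, and s_and_saveexec
///   BodyBB:        the rewritten instruction(s); xor EXEC and loop back to
///                  LoopBB while any lanes remain
///   RestoreExecBB: restore the saved EXEC mask
///   RemainderBB:   the rest of the original block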
773bool AMDGPURegisterBankInfo::executeInWaterfallLoop(
774 MachineIRBuilder &B, iterator_range<MachineBasicBlock::iterator> Range,
775 SmallSet<Register, 4> &SGPROperandRegs) const {
776 // Track use registers which have already been expanded with a readfirstlane
777 // sequence. This may have multiple uses if moving a sequence.
778 DenseMap<Register, Register> WaterfalledRegMap;
779
780 MachineBasicBlock &MBB = B.getMBB();
781 MachineFunction *MF = &B.getMF();
782
783 const TargetRegisterClass *WaveRC = TRI->getWaveMaskRegClass();
784 const AMDGPU::LaneMaskConstants &LMC =
785 AMDGPU::LaneMaskConstants::get(ST: Subtarget);
786
787#ifndef NDEBUG
788 const int OrigRangeSize = std::distance(Range.begin(), Range.end());
789#endif
790
791 MachineRegisterInfo &MRI = *B.getMRI();
792 Register SaveExecReg = MRI.createVirtualRegister(RegClass: WaveRC);
793 Register InitSaveExecReg = MRI.createVirtualRegister(RegClass: WaveRC);
794
795 // Don't bother using generic instructions/registers for the exec mask.
796 B.buildInstr(Opcode: TargetOpcode::IMPLICIT_DEF)
797 .addDef(RegNo: InitSaveExecReg);
798
799 Register PhiExec = MRI.createVirtualRegister(RegClass: WaveRC);
800 Register NewExec = MRI.createVirtualRegister(RegClass: WaveRC);
801
802 // To insert the loop we need to split the block. Move everything before this
803 // point to a new block, and insert a new empty block before this instruction.
804 MachineBasicBlock *LoopBB = MF->CreateMachineBasicBlock();
805 MachineBasicBlock *BodyBB = MF->CreateMachineBasicBlock();
806 MachineBasicBlock *RemainderBB = MF->CreateMachineBasicBlock();
807 MachineBasicBlock *RestoreExecBB = MF->CreateMachineBasicBlock();
808 MachineFunction::iterator MBBI(MBB);
809 ++MBBI;
810 MF->insert(MBBI, MBB: LoopBB);
811 MF->insert(MBBI, MBB: BodyBB);
812 MF->insert(MBBI, MBB: RestoreExecBB);
813 MF->insert(MBBI, MBB: RemainderBB);
814
815 LoopBB->addSuccessor(Succ: BodyBB);
816 BodyBB->addSuccessor(Succ: RestoreExecBB);
817 BodyBB->addSuccessor(Succ: LoopBB);
818
819 // Move the rest of the block into a new block.
820 RemainderBB->transferSuccessorsAndUpdatePHIs(FromMBB: &MBB);
821 RemainderBB->splice(Where: RemainderBB->begin(), Other: &MBB, From: Range.end(), To: MBB.end());
822
823 MBB.addSuccessor(Succ: LoopBB);
824 RestoreExecBB->addSuccessor(Succ: RemainderBB);
825
826 B.setInsertPt(MBB&: *LoopBB, II: LoopBB->end());
827
828 B.buildInstr(Opcode: TargetOpcode::PHI)
829 .addDef(RegNo: PhiExec)
830 .addReg(RegNo: InitSaveExecReg)
831 .addMBB(MBB: &MBB)
832 .addReg(RegNo: NewExec)
833 .addMBB(MBB: BodyBB);
834
835 const DebugLoc &DL = B.getDL();
836
837 MachineInstr &FirstInst = *Range.begin();
838
839 // Move the instruction into the loop body. Note we moved everything after
840 // Range.end() already into a new block, so Range.end() is no longer valid.
841 BodyBB->splice(Where: BodyBB->end(), Other: &MBB, From: Range.begin(), To: MBB.end());
842
843 // Figure out the iterator range after splicing the instructions.
844 MachineBasicBlock::iterator NewBegin = FirstInst.getIterator();
845 auto NewEnd = BodyBB->end();
846
847 B.setMBB(*LoopBB);
848
849 LLT S1 = LLT::scalar(SizeInBits: 1);
850 Register CondReg;
851
852 assert(std::distance(NewBegin, NewEnd) == OrigRangeSize);
853
854 for (MachineInstr &MI : make_range(x: NewBegin, y: NewEnd)) {
855 for (MachineOperand &Op : MI.all_uses()) {
856 Register OldReg = Op.getReg();
857 if (!SGPROperandRegs.count(V: OldReg))
858 continue;
859
860 // See if we already processed this register in another instruction in the
861 // sequence.
862 auto OldVal = WaterfalledRegMap.find(Val: OldReg);
863 if (OldVal != WaterfalledRegMap.end()) {
864 Op.setReg(OldVal->second);
865 continue;
866 }
867
868 Register OpReg = Op.getReg();
869 LLT OpTy = MRI.getType(Reg: OpReg);
870
871 const RegisterBank *OpBank = getRegBank(Reg: OpReg, MRI, TRI: *TRI);
872 if (OpBank != &AMDGPU::VGPRRegBank) {
873 // Insert copy from AGPR to VGPR before the loop.
874 B.setMBB(MBB);
875 OpReg = B.buildCopy(Res: OpTy, Op: OpReg).getReg(Idx: 0);
876 MRI.setRegBank(Reg: OpReg, RegBank: AMDGPU::VGPRRegBank);
877 B.setMBB(*LoopBB);
878 }
879
880 Register CurrentLaneReg = buildReadFirstLane(B, MRI, Src: OpReg);
881
882 // Build the comparison(s).
883 unsigned OpSize = OpTy.getSizeInBits();
884 bool Is64 = OpSize % 64 == 0;
885 unsigned PartSize = Is64 ? 64 : 32;
886 LLT PartTy = LLT::scalar(SizeInBits: PartSize);
887 unsigned NumParts = OpSize / PartSize;
888 SmallVector<Register, 8> OpParts;
889 SmallVector<Register, 8> CurrentLaneParts;
890
891 if (NumParts == 1) {
892 OpParts.push_back(Elt: OpReg);
893 CurrentLaneParts.push_back(Elt: CurrentLaneReg);
894 } else {
895 auto UnmergeOp = B.buildUnmerge(Res: PartTy, Op: OpReg);
896 auto UnmergeCurrentLane = B.buildUnmerge(Res: PartTy, Op: CurrentLaneReg);
897 for (unsigned i = 0; i < NumParts; ++i) {
898 OpParts.push_back(Elt: UnmergeOp.getReg(Idx: i));
899 CurrentLaneParts.push_back(Elt: UnmergeCurrentLane.getReg(Idx: i));
900 MRI.setRegBank(Reg: OpParts[i], RegBank: AMDGPU::VGPRRegBank);
901 MRI.setRegBank(Reg: CurrentLaneParts[i], RegBank: AMDGPU::SGPRRegBank);
902 }
903 }
904
905 for (unsigned i = 0; i < NumParts; ++i) {
906 auto CmpReg = B.buildICmp(Pred: CmpInst::ICMP_EQ, Res: S1, Op0: CurrentLaneParts[i],
907 Op1: OpParts[i]).getReg(Idx: 0);
908 MRI.setRegBank(Reg: CmpReg, RegBank: AMDGPU::VCCRegBank);
909
910 if (!CondReg) {
911 CondReg = CmpReg;
912 } else {
913 CondReg = B.buildAnd(Dst: S1, Src0: CondReg, Src1: CmpReg).getReg(Idx: 0);
914 MRI.setRegBank(Reg: CondReg, RegBank: AMDGPU::VCCRegBank);
915 }
916 }
917
918 Op.setReg(CurrentLaneReg);
919
920 // Make sure we don't re-process this register again.
921 WaterfalledRegMap.insert(KV: std::pair(OldReg, Op.getReg()));
922 }
923 }
924
925 // The ballot becomes a no-op during instruction selection.
926 CondReg = B.buildIntrinsic(ID: Intrinsic::amdgcn_ballot,
927 Res: {LLT::scalar(SizeInBits: Subtarget.isWave32() ? 32 : 64)})
928 .addReg(RegNo: CondReg)
929 .getReg(Idx: 0);
930 MRI.setRegClass(Reg: CondReg, RC: WaveRC);
931
932 // Update EXEC, save the original EXEC value to VCC.
933 B.buildInstr(Opcode: LMC.AndSaveExecOpc)
934 .addDef(RegNo: NewExec)
935 .addReg(RegNo: CondReg, Flags: RegState::Kill);
936
937 MRI.setSimpleHint(VReg: NewExec, PrefReg: CondReg);
938
939 B.setInsertPt(MBB&: *BodyBB, II: BodyBB->end());
940
941 // Update EXEC, switch all done bits to 0 and all todo bits to 1.
942 B.buildInstr(Opcode: LMC.XorTermOpc)
943 .addDef(RegNo: LMC.ExecReg)
944 .addReg(RegNo: LMC.ExecReg)
945 .addReg(RegNo: NewExec);
946
947 // XXX - s_xor_b64 sets scc to 1 if the result is nonzero, so can we use
948 // s_cbranch_scc0?
949
950 // Loop back to V_READFIRSTLANE_B32 if there are still variants to cover.
951 B.buildInstr(Opcode: AMDGPU::SI_WATERFALL_LOOP).addMBB(MBB: LoopBB);
952
953 // Save the EXEC mask before the loop.
954 BuildMI(BB&: MBB, I: MBB.end(), MIMD: DL, MCID: TII->get(Opcode: LMC.MovOpc), DestReg: SaveExecReg)
955 .addReg(RegNo: LMC.ExecReg);
956
957 // Restore the EXEC mask after the loop.
958 B.setMBB(*RestoreExecBB);
959 B.buildInstr(Opcode: LMC.MovTermOpc).addDef(RegNo: LMC.ExecReg).addReg(RegNo: SaveExecReg);
960
961 // Set the insert point after the original instruction, so any new
962 // instructions will be in the remainder.
963 B.setInsertPt(MBB&: *RemainderBB, II: RemainderBB->begin());
964
965 return true;
966}
967
968// Return any unique registers used by \p MI at \p OpIndices that need to be
969// handled in a waterfall loop. Returns these registers in \p
970// SGPROperandRegs. Returns true if there are any operands to handle and a
971// waterfall loop is necessary.
972bool AMDGPURegisterBankInfo::collectWaterfallOperands(
973 SmallSet<Register, 4> &SGPROperandRegs, MachineInstr &MI,
974 MachineRegisterInfo &MRI, ArrayRef<unsigned> OpIndices) const {
975 for (unsigned Op : OpIndices) {
976 assert(MI.getOperand(Op).isUse());
977 Register Reg = MI.getOperand(i: Op).getReg();
978 const RegisterBank *OpBank = getRegBank(Reg, MRI, TRI: *TRI);
979 if (OpBank->getID() != AMDGPU::SGPRRegBankID)
980 SGPROperandRegs.insert(V: Reg);
981 }
982
983 // No operands need to be replaced, so no need to loop.
984 return !SGPROperandRegs.empty();
985}
986
987bool AMDGPURegisterBankInfo::executeInWaterfallLoop(
988 MachineIRBuilder &B, MachineInstr &MI, ArrayRef<unsigned> OpIndices) const {
989 // Use a set to avoid extra readfirstlanes in the case where multiple operands
990 // are the same register.
991 SmallSet<Register, 4> SGPROperandRegs;
992
993 if (!collectWaterfallOperands(SGPROperandRegs, MI, MRI&: *B.getMRI(), OpIndices))
994 return false;
995
996 MachineBasicBlock::iterator I = MI.getIterator();
997 return executeInWaterfallLoop(B, Range: make_range(x: I, y: std::next(x: I)),
998 SGPROperandRegs);
999}
1000
1001// Legalize an operand that must be an SGPR by inserting a readfirstlane.
1002void AMDGPURegisterBankInfo::constrainOpWithReadfirstlane(
1003 MachineIRBuilder &B, MachineInstr &MI, unsigned OpIdx) const {
1004 Register Reg = MI.getOperand(i: OpIdx).getReg();
1005 MachineRegisterInfo &MRI = *B.getMRI();
1006 const RegisterBank *Bank = getRegBank(Reg, MRI, TRI: *TRI);
1007 if (Bank == &AMDGPU::SGPRRegBank)
1008 return;
1009
1010 Reg = buildReadFirstLane(B, MRI, Src: Reg);
1011 MI.getOperand(i: OpIdx).setReg(Reg);
1012}
1013
/// Split \p Ty into 2 pieces. The first will have \p FirstSize bits, and the
/// rest will be in the remainder.
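///
/// For example, splitUnequalType(s96, 64) gives {s64, s32}, and
/// splitUnequalType(<3 x s32>, 64) gives {<2 x s32>, s32}.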
1016static std::pair<LLT, LLT> splitUnequalType(LLT Ty, unsigned FirstSize) {
1017 unsigned TotalSize = Ty.getSizeInBits();
1018 if (!Ty.isVector())
1019 return {LLT::scalar(SizeInBits: FirstSize), LLT::scalar(SizeInBits: TotalSize - FirstSize)};
1020
1021 LLT EltTy = Ty.getElementType();
1022 unsigned EltSize = EltTy.getSizeInBits();
1023 assert(FirstSize % EltSize == 0);
1024
1025 unsigned FirstPartNumElts = FirstSize / EltSize;
1026 unsigned RemainderElts = (TotalSize - FirstSize) / EltSize;
1027
1028 return {LLT::scalarOrVector(EC: ElementCount::getFixed(MinVal: FirstPartNumElts), ScalarTy: EltTy),
1029 LLT::scalarOrVector(EC: ElementCount::getFixed(MinVal: RemainderElts), ScalarTy: EltTy)};
1030}
1031
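// For example, widen96To128(s96) gives s128 and widen96To128(<3 x s32>) gives
// <4 x s32>.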
1032static LLT widen96To128(LLT Ty) {
1033 if (!Ty.isVector())
1034 return LLT::scalar(SizeInBits: 128);
1035
1036 LLT EltTy = Ty.getElementType();
1037 assert(128 % EltTy.getSizeInBits() == 0);
1038 return LLT::fixed_vector(NumElements: 128 / EltTy.getSizeInBits(), ScalarTy: EltTy);
1039}
1040
1041bool AMDGPURegisterBankInfo::applyMappingLoad(
1042 MachineIRBuilder &B,
1043 const AMDGPURegisterBankInfo::OperandsMapper &OpdMapper,
1044 MachineInstr &MI) const {
1045 MachineRegisterInfo &MRI = *B.getMRI();
1046 Register DstReg = MI.getOperand(i: 0).getReg();
1047 const LLT LoadTy = MRI.getType(Reg: DstReg);
1048 unsigned LoadSize = LoadTy.getSizeInBits();
1049 MachineMemOperand *MMO = *MI.memoperands_begin();
1050 const unsigned MaxNonSmrdLoadSize = 128;
1051
1052 const RegisterBank *DstBank =
1053 OpdMapper.getInstrMapping().getOperandMapping(i: 0).BreakDown[0].RegBank;
1054 if (DstBank == &AMDGPU::SGPRRegBank) {
    // There are some special cases that we need to look at for 32-bit and
    // 96-bit SGPR loads; otherwise we have nothing to do.
1057 if (LoadSize != 32 && (LoadSize != 96 || Subtarget.hasScalarDwordx3Loads()))
1058 return false;
1059
1060 const unsigned MemSize = 8 * MMO->getSize().getValue();
    // Scalar loads of size 8 or 16 bits with proper alignment may be widened
    // to 32 bits. Check to see if we need to widen the memory access; 8- or
    // 16-bit scalar loads should have a load size of 32 but a memory access
    // size of less than 32.
1065 if (LoadSize == 32 &&
1066 (MemSize == 32 || LoadTy.isVector() || !isScalarLoadLegal(MI)))
1067 return false;
1068
1069 if (LoadSize == 32 &&
1070 ((MemSize == 8 && MMO->getAlign() >= Align(1)) ||
1071 (MemSize == 16 && MMO->getAlign() >= Align(2))) &&
1072 isScalarLoadLegal(MI) &&
1073 Subtarget.getGeneration() >= AMDGPUSubtarget::GFX12)
1074 return false;
1075
1076 Register PtrReg = MI.getOperand(i: 1).getReg();
1077
1078 ApplyRegBankMapping ApplyBank(B, *this, MRI, DstBank);
1079
1080 if (LoadSize == 32) {
1081 // This is an extending load from a sub-dword size. Widen the memory
1082 // access size to 4 bytes and clear the extra high bits appropriately
1083 const LLT S32 = LLT::scalar(SizeInBits: 32);
1084 if (MI.getOpcode() == AMDGPU::G_SEXTLOAD) {
1085 // Must extend the sign bit into higher bits for a G_SEXTLOAD
1086 auto WideLoad = B.buildLoadFromOffset(Dst: S32, BasePtr: PtrReg, BaseMMO&: *MMO, Offset: 0);
1087 B.buildSExtInReg(Res: MI.getOperand(i: 0), Op: WideLoad, ImmOp: MemSize);
1088 } else if (MI.getOpcode() == AMDGPU::G_ZEXTLOAD) {
1089 // Must extend zero into higher bits with an AND for a G_ZEXTLOAD
1090 auto WideLoad = B.buildLoadFromOffset(Dst: S32, BasePtr: PtrReg, BaseMMO&: *MMO, Offset: 0);
1091 B.buildZExtInReg(Res: MI.getOperand(i: 0), Op: WideLoad, ImmOp: MemSize);
1092 } else
1093 // We do not need to touch the higher bits for regular loads.
1094 B.buildLoadFromOffset(Dst: MI.getOperand(i: 0), BasePtr: PtrReg, BaseMMO&: *MMO, Offset: 0);
1095 } else {
      // 96-bit loads are only available for vector loads. We need to split
      // this into a 64-bit part and a 32-bit part (unless we can widen it to a
      // 128-bit load).
1098 if (MMO->getAlign() < Align(16)) {
1099 LegalizerHelper Helper(B.getMF(), ApplyBank, B);
1100 LLT Part64, Part32;
1101 std::tie(args&: Part64, args&: Part32) = splitUnequalType(Ty: LoadTy, FirstSize: 64);
1102 if (Helper.reduceLoadStoreWidth(MI&: cast<GAnyLoad>(Val&: MI), TypeIdx: 0, NarrowTy: Part64) !=
1103 LegalizerHelper::Legalized)
1104 return false;
1105 return true;
1106 }
1107 LLT WiderTy = widen96To128(Ty: LoadTy);
1108 auto WideLoad = B.buildLoadFromOffset(Dst: WiderTy, BasePtr: PtrReg, BaseMMO&: *MMO, Offset: 0);
1109 if (WiderTy.isScalar()) {
1110 B.buildTrunc(Res: MI.getOperand(i: 0), Op: WideLoad);
1111 } else {
1112 B.buildDeleteTrailingVectorElements(Res: MI.getOperand(i: 0).getReg(),
1113 Op0: WideLoad);
1114 }
1115 }
1116
1117 MI.eraseFromParent();
1118 return true;
1119 }
1120
1121 // 128-bit loads are supported for all instruction types.
1122 if (LoadSize <= MaxNonSmrdLoadSize)
1123 return false;
1124
1125 SmallVector<Register, 1> SrcRegs(OpdMapper.getVRegs(OpIdx: 1));
1126
1127 if (SrcRegs.empty())
1128 SrcRegs.push_back(Elt: MI.getOperand(i: 1).getReg());
1129
1130 // RegBankSelect only emits scalar types, so we need to reset the pointer
1131 // operand to a pointer type.
1132 Register BasePtrReg = SrcRegs[0];
1133 LLT PtrTy = MRI.getType(Reg: MI.getOperand(i: 1).getReg());
1134 MRI.setType(VReg: BasePtrReg, Ty: PtrTy);
1135
  // The following are loads that were not split enough during legalization
  // because it was not clear whether they are SMEM loads or VMEM loads.
1138 if (AMDGPU::isExtendedGlobalAddrSpace(AS: MMO->getAddrSpace()) ||
1139 MMO->getAddrSpace() == AMDGPUAS::BUFFER_RESOURCE) {
1140 assert(LoadSize % MaxNonSmrdLoadSize == 0);
1141 unsigned NumSplitParts = LoadTy.getSizeInBits() / MaxNonSmrdLoadSize;
1142 const LLT LoadSplitTy = LoadTy.divide(Factor: NumSplitParts);
1143 ApplyRegBankMapping O(B, *this, MRI, &AMDGPU::VGPRRegBank);
1144 LegalizerHelper Helper(B.getMF(), O, B);
1145 if (LoadTy.isVector()) {
1146 if (Helper.fewerElementsVector(MI, TypeIdx: 0, NarrowTy: LoadSplitTy) !=
1147 LegalizerHelper::Legalized)
1148 return false;
1149 } else {
1150 if (Helper.narrowScalar(MI, TypeIdx: 0, NarrowTy: LoadSplitTy) != LegalizerHelper::Legalized)
1151 return false;
1152 }
1153 }
1154
1155 MRI.setRegBank(Reg: DstReg, RegBank: AMDGPU::VGPRRegBank);
1156 return true;
1157}
1158
1159bool AMDGPURegisterBankInfo::applyMappingDynStackAlloc(
1160 MachineIRBuilder &B,
1161 const AMDGPURegisterBankInfo::OperandsMapper &OpdMapper,
1162 MachineInstr &MI) const {
1163 MachineRegisterInfo &MRI = *B.getMRI();
1164 const MachineFunction &MF = B.getMF();
1165 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
1166 const auto &TFI = *ST.getFrameLowering();
1167
1168 // Guard in case the stack growth direction ever changes with scratch
1169 // instructions.
1170 assert(TFI.getStackGrowthDirection() == TargetFrameLowering::StackGrowsUp &&
1171 "Stack grows upwards for AMDGPU");
1172
1173 Register Dst = MI.getOperand(i: 0).getReg();
1174 Register AllocSize = MI.getOperand(i: 1).getReg();
1175 Align Alignment = assumeAligned(Value: MI.getOperand(i: 2).getImm());
1176
1177 const RegisterBank *SizeBank = getRegBank(Reg: AllocSize, MRI, TRI: *TRI);
1178
1179 if (SizeBank != &AMDGPU::SGPRRegBank) {
1180 auto WaveReduction =
1181 B.buildIntrinsic(ID: Intrinsic::amdgcn_wave_reduce_umax, Res: {LLT::scalar(SizeInBits: 32)})
1182 .addUse(RegNo: AllocSize)
1183 .addImm(Val: 0);
1184 AllocSize = WaveReduction.getReg(Idx: 0);
1185 }
1186
1187 LLT PtrTy = MRI.getType(Reg: Dst);
1188 LLT IntPtrTy = LLT::scalar(SizeInBits: PtrTy.getSizeInBits());
1189
1190 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
1191 Register SPReg = Info->getStackPtrOffsetReg();
1192 ApplyRegBankMapping ApplyBank(B, *this, MRI, &AMDGPU::SGPRRegBank);
1193
1194 auto WaveSize = B.buildConstant(Res: LLT::scalar(SizeInBits: 32), Val: ST.getWavefrontSizeLog2());
1195 auto ScaledSize = B.buildShl(Dst: IntPtrTy, Src0: AllocSize, Src1: WaveSize);
1196
1197 auto OldSP = B.buildCopy(Res: PtrTy, Op: SPReg);
1198 if (Alignment > TFI.getStackAlign()) {
1199 auto StackAlignMask = (Alignment.value() << ST.getWavefrontSizeLog2()) - 1;
1200 auto Tmp1 = B.buildPtrAdd(Res: PtrTy, Op0: OldSP,
1201 Op1: B.buildConstant(Res: LLT::scalar(SizeInBits: 32), Val: StackAlignMask));
1202 B.buildMaskLowPtrBits(Res: Dst, Op0: Tmp1,
1203 NumBits: Log2(A: Alignment) + ST.getWavefrontSizeLog2());
1204 } else {
1205 B.buildCopy(Res: Dst, Op: OldSP);
1206 }
1207 auto PtrAdd = B.buildPtrAdd(Res: PtrTy, Op0: Dst, Op1: ScaledSize);
1208 B.buildCopy(Res: SPReg, Op: PtrAdd);
1209 MI.eraseFromParent();
1210 return true;
1211}
1212
1213bool AMDGPURegisterBankInfo::applyMappingImage(
1214 MachineIRBuilder &B, MachineInstr &MI,
1215 const AMDGPURegisterBankInfo::OperandsMapper &OpdMapper,
1216 int RsrcIdx) const {
1217 const int NumDefs = MI.getNumExplicitDefs();
1218
1219 // The reported argument index is relative to the IR intrinsic call arguments,
1220 // so we need to shift by the number of defs and the intrinsic ID.
1221 RsrcIdx += NumDefs + 1;
1222
1223 // Insert copies to VGPR arguments.
1224 applyDefaultMapping(OpdMapper);
1225
1226 // Fixup any SGPR arguments.
1227 SmallVector<unsigned, 4> SGPRIndexes;
1228 for (int I = NumDefs, NumOps = MI.getNumOperands(); I != NumOps; ++I) {
1229 if (!MI.getOperand(i: I).isReg())
1230 continue;
1231
1232 // If this intrinsic has a sampler, it immediately follows rsrc.
1233 if (I == RsrcIdx || I == RsrcIdx + 1)
1234 SGPRIndexes.push_back(Elt: I);
1235 }
1236
1237 executeInWaterfallLoop(B, MI, OpIndices: SGPRIndexes);
1238 return true;
1239}
1240
// Analyze a combined offset from an llvm.amdgcn.s.buffer intrinsic and store
// the three offsets (voffset, soffset and instoffset).
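//
// For example, a small constant combined offset is materialized as voffset = 0
// with the remainder split between soffset and the instruction's immediate
// offset field, while an SGPR + VGPR G_ADD is split directly into the soffset
// and voffset operands.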
1243unsigned AMDGPURegisterBankInfo::setBufferOffsets(
1244 MachineIRBuilder &B, Register CombinedOffset, Register &VOffsetReg,
1245 Register &SOffsetReg, int64_t &InstOffsetVal, Align Alignment) const {
1246 const LLT S32 = LLT::scalar(SizeInBits: 32);
1247 MachineRegisterInfo *MRI = B.getMRI();
1248
1249 if (std::optional<int64_t> Imm =
1250 getIConstantVRegSExtVal(VReg: CombinedOffset, MRI: *MRI)) {
1251 uint32_t SOffset, ImmOffset;
1252 if (TII->splitMUBUFOffset(Imm: *Imm, SOffset, ImmOffset, Alignment)) {
1253 VOffsetReg = B.buildConstant(Res: S32, Val: 0).getReg(Idx: 0);
1254 SOffsetReg = B.buildConstant(Res: S32, Val: SOffset).getReg(Idx: 0);
1255 InstOffsetVal = ImmOffset;
1256
1257 B.getMRI()->setRegBank(Reg: VOffsetReg, RegBank: AMDGPU::VGPRRegBank);
1258 B.getMRI()->setRegBank(Reg: SOffsetReg, RegBank: AMDGPU::SGPRRegBank);
1259 return SOffset + ImmOffset;
1260 }
1261 }
1262
1263 const bool CheckNUW = Subtarget.hasGFX1250Insts();
1264 Register Base;
1265 unsigned Offset;
1266
1267 std::tie(args&: Base, args&: Offset) =
1268 AMDGPU::getBaseWithConstantOffset(MRI&: *MRI, Reg: CombinedOffset,
1269 /*KnownBits=*/ValueTracking: nullptr,
1270 /*CheckNUW=*/CheckNUW);
1271
1272 uint32_t SOffset, ImmOffset;
1273 if ((int)Offset > 0 &&
1274 TII->splitMUBUFOffset(Imm: Offset, SOffset, ImmOffset, Alignment)) {
1275 if (getRegBank(Reg: Base, MRI: *MRI, TRI: *TRI) == &AMDGPU::VGPRRegBank) {
1276 VOffsetReg = Base;
1277 SOffsetReg = B.buildConstant(Res: S32, Val: SOffset).getReg(Idx: 0);
1278 B.getMRI()->setRegBank(Reg: SOffsetReg, RegBank: AMDGPU::SGPRRegBank);
1279 InstOffsetVal = ImmOffset;
1280 return 0; // XXX - Why is this 0?
1281 }
1282
1283 // If we have SGPR base, we can use it for soffset.
1284 if (SOffset == 0) {
1285 VOffsetReg = B.buildConstant(Res: S32, Val: 0).getReg(Idx: 0);
1286 B.getMRI()->setRegBank(Reg: VOffsetReg, RegBank: AMDGPU::VGPRRegBank);
1287 SOffsetReg = Base;
1288 InstOffsetVal = ImmOffset;
1289 return 0; // XXX - Why is this 0?
1290 }
1291 }
1292
1293 // Handle the variable sgpr + vgpr case.
1294 MachineInstr *Add = getOpcodeDef(Opcode: AMDGPU::G_ADD, Reg: CombinedOffset, MRI: *MRI);
1295 if (Add && (int)Offset >= 0 &&
1296 (!CheckNUW || Add->getFlag(Flag: MachineInstr::NoUWrap))) {
1297 Register Src0 = getSrcRegIgnoringCopies(Reg: Add->getOperand(i: 1).getReg(), MRI: *MRI);
1298 Register Src1 = getSrcRegIgnoringCopies(Reg: Add->getOperand(i: 2).getReg(), MRI: *MRI);
1299
1300 const RegisterBank *Src0Bank = getRegBank(Reg: Src0, MRI: *MRI, TRI: *TRI);
1301 const RegisterBank *Src1Bank = getRegBank(Reg: Src1, MRI: *MRI, TRI: *TRI);
1302
1303 if (Src0Bank == &AMDGPU::VGPRRegBank && Src1Bank == &AMDGPU::SGPRRegBank) {
1304 VOffsetReg = Src0;
1305 SOffsetReg = Src1;
1306 return 0;
1307 }
1308
1309 if (Src0Bank == &AMDGPU::SGPRRegBank && Src1Bank == &AMDGPU::VGPRRegBank) {
1310 VOffsetReg = Src1;
1311 SOffsetReg = Src0;
1312 return 0;
1313 }
1314 }
1315
1316 // Ensure we have a VGPR for the combined offset. This could be an issue if we
1317 // have an SGPR offset and a VGPR resource.
1318 if (getRegBank(Reg: CombinedOffset, MRI: *MRI, TRI: *TRI) == &AMDGPU::VGPRRegBank) {
1319 VOffsetReg = CombinedOffset;
1320 } else {
1321 VOffsetReg = B.buildCopy(Res: S32, Op: CombinedOffset).getReg(Idx: 0);
1322 B.getMRI()->setRegBank(Reg: VOffsetReg, RegBank: AMDGPU::VGPRRegBank);
1323 }
1324
1325 SOffsetReg = B.buildConstant(Res: S32, Val: 0).getReg(Idx: 0);
1326 B.getMRI()->setRegBank(Reg: SOffsetReg, RegBank: AMDGPU::SGPRRegBank);
1327 return 0;
1328}
1329
1330static unsigned getSBufferLoadCorrespondingBufferLoadOpcode(unsigned Opc) {
1331 switch (Opc) {
1332 case AMDGPU::G_AMDGPU_S_BUFFER_LOAD:
1333 return AMDGPU::G_AMDGPU_BUFFER_LOAD;
1334 case AMDGPU::G_AMDGPU_S_BUFFER_LOAD_UBYTE:
1335 return AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE;
1336 case AMDGPU::G_AMDGPU_S_BUFFER_LOAD_SBYTE:
1337 return AMDGPU::G_AMDGPU_BUFFER_LOAD_SBYTE;
1338 case AMDGPU::G_AMDGPU_S_BUFFER_LOAD_USHORT:
1339 return AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT;
1340 case AMDGPU::G_AMDGPU_S_BUFFER_LOAD_SSHORT:
1341 return AMDGPU::G_AMDGPU_BUFFER_LOAD_SSHORT;
1342 default:
1343 break;
1344 }
1345 llvm_unreachable("Unexpected s_buffer_load opcode");
1346}
1347
1348bool AMDGPURegisterBankInfo::applyMappingSBufferLoad(
1349 MachineIRBuilder &B, const OperandsMapper &OpdMapper) const {
1350 MachineInstr &MI = OpdMapper.getMI();
1351 MachineRegisterInfo &MRI = OpdMapper.getMRI();
1352
1353 const LLT S32 = LLT::scalar(SizeInBits: 32);
1354 Register Dst = MI.getOperand(i: 0).getReg();
1355 LLT Ty = MRI.getType(Reg: Dst);
1356
1357 const RegisterBank *RSrcBank =
1358 OpdMapper.getInstrMapping().getOperandMapping(i: 1).BreakDown[0].RegBank;
1359 const RegisterBank *OffsetBank =
1360 OpdMapper.getInstrMapping().getOperandMapping(i: 2).BreakDown[0].RegBank;
1361 if (RSrcBank == &AMDGPU::SGPRRegBank &&
1362 OffsetBank == &AMDGPU::SGPRRegBank)
1363 return true; // Legal mapping
1364
1365 // FIXME: 96-bit case was widened during legalize. We need to narrow it back
1366 // here but don't have an MMO.
1367
1368 unsigned LoadSize = Ty.getSizeInBits();
1369 int NumLoads = 1;
1370 if (LoadSize == 256 || LoadSize == 512) {
1371 NumLoads = LoadSize / 128;
1372 Ty = Ty.divide(Factor: NumLoads);
1373 }
1374
1375 // Use the alignment to ensure that the required offsets will fit into the
1376 // immediate offsets.
1377 const Align Alignment = NumLoads > 1 ? Align(16 * NumLoads) : Align(1);
1378
1379 MachineFunction &MF = B.getMF();
1380
1381 Register SOffset;
1382 Register VOffset;
1383 int64_t ImmOffset = 0;
1384
1385 unsigned MMOOffset = setBufferOffsets(B, CombinedOffset: MI.getOperand(i: 2).getReg(), VOffsetReg&: VOffset,
1386 SOffsetReg&: SOffset, InstOffsetVal&: ImmOffset, Alignment);
1387
1388 // TODO: 96-bit loads were widened to 128-bit results. Shrink the result if we
1389 // can, but we need to track an MMO for that.
1390 const unsigned MemSize = (Ty.getSizeInBits() + 7) / 8;
1391 const Align MemAlign(4); // FIXME: ABI type alignment?
1392 MachineMemOperand *BaseMMO = MF.getMachineMemOperand(
1393 PtrInfo: MachinePointerInfo(),
1394 F: MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
1395 MachineMemOperand::MOInvariant,
1396 Size: MemSize, BaseAlignment: MemAlign);
1397 if (MMOOffset != 0)
1398 BaseMMO = MF.getMachineMemOperand(MMO: BaseMMO, Offset: MMOOffset, Size: MemSize);
1399
1400 // If only the offset is divergent, emit a MUBUF buffer load instead. We can
1401 // assume that the buffer is unswizzled.
1402
1403 Register RSrc = MI.getOperand(i: 1).getReg();
1404 Register VIndex = B.buildConstant(Res: S32, Val: 0).getReg(Idx: 0);
1405 B.getMRI()->setRegBank(Reg: VIndex, RegBank: AMDGPU::VGPRRegBank);
1406
1407 SmallVector<Register, 4> LoadParts(NumLoads);
1408
1409 MachineBasicBlock::iterator MII = MI.getIterator();
1410 MachineInstrSpan Span(MII, &B.getMBB());
1411
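// Emit one buffer load per 128-bit (or smaller) piece, stepping both the
// memory operand offset and the immediate offset by 16 bytes per iteration.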
1412 for (int i = 0; i < NumLoads; ++i) {
1413 if (NumLoads == 1) {
1414 LoadParts[i] = Dst;
1415 } else {
1416 LoadParts[i] = MRI.createGenericVirtualRegister(Ty);
1417 MRI.setRegBank(Reg: LoadParts[i], RegBank: AMDGPU::VGPRRegBank);
1418 }
1419
1420 MachineMemOperand *MMO = BaseMMO;
1421 if (i != 0)
1422 BaseMMO = MF.getMachineMemOperand(MMO: BaseMMO, Offset: MMOOffset + 16 * i, Size: MemSize);
1423
1424 B.buildInstr(Opcode: getSBufferLoadCorrespondingBufferLoadOpcode(Opc: MI.getOpcode()))
1425 .addDef(RegNo: LoadParts[i]) // vdata
1426 .addUse(RegNo: RSrc) // rsrc
1427 .addUse(RegNo: VIndex) // vindex
1428 .addUse(RegNo: VOffset) // voffset
1429 .addUse(RegNo: SOffset) // soffset
1430 .addImm(Val: ImmOffset + 16 * i) // offset(imm)
1431 .addImm(Val: 0) // cachepolicy, swizzled buffer(imm)
1432 .addImm(Val: 0) // idxen(imm)
1433 .addMemOperand(MMO);
1434 }
1435
1436 // TODO: If only the resource is a VGPR, it may be better to execute the
1437 // scalar load in the waterfall loop if the resource is expected to frequently
1438 // be dynamically uniform.
1439 if (RSrcBank != &AMDGPU::SGPRRegBank) {
1440 // Remove the original instruction to avoid potentially confusing the
1441 // waterfall loop logic.
1442 B.setInstr(*Span.begin());
1443 MI.eraseFromParent();
1444
1445 SmallSet<Register, 4> OpsToWaterfall;
1446
1447 OpsToWaterfall.insert(V: RSrc);
1448 executeInWaterfallLoop(B, Range: make_range(x: Span.begin(), y: Span.end()),
1449 SGPROperandRegs&: OpsToWaterfall);
1450 }
1451
1452 if (NumLoads != 1) {
1453 if (Ty.isVector())
1454 B.buildConcatVectors(Res: Dst, Ops: LoadParts);
1455 else
1456 B.buildMergeLikeInstr(Res: Dst, Ops: LoadParts);
1457 }
1458
1459 // If we emitted a waterfall loop, the original instruction was already
// removed above; otherwise erase it now.
1460 if (RSrcBank == &AMDGPU::SGPRRegBank)
1461 MI.eraseFromParent();
1462
1463 return true;
1464}
1465
1466bool AMDGPURegisterBankInfo::applyMappingBFE(MachineIRBuilder &B,
1467 const OperandsMapper &OpdMapper,
1468 bool Signed) const {
1469 MachineInstr &MI = OpdMapper.getMI();
1470 MachineRegisterInfo &MRI = OpdMapper.getMRI();
1471
1472 // Insert basic copies
1473 applyDefaultMapping(OpdMapper);
1474
1475 Register DstReg = MI.getOperand(i: 0).getReg();
1476 LLT Ty = MRI.getType(Reg: DstReg);
1477
1478 const LLT S32 = LLT::scalar(SizeInBits: 32);
1479
1480 unsigned FirstOpnd = isa<GIntrinsic>(Val: MI) ? 2 : 1;
1481 Register SrcReg = MI.getOperand(i: FirstOpnd).getReg();
1482 Register OffsetReg = MI.getOperand(i: FirstOpnd + 1).getReg();
1483 Register WidthReg = MI.getOperand(i: FirstOpnd + 2).getReg();
1484
1485 const RegisterBank *DstBank =
1486 OpdMapper.getInstrMapping().getOperandMapping(i: 0).BreakDown[0].RegBank;
1487 if (DstBank == &AMDGPU::VGPRRegBank) {
1488 if (Ty == S32)
1489 return true;
1490
1491 // There is no 64-bit VGPR bitfield extract instruction, so the operation
1492 // is expanded to an equivalent sequence of 32-bit operations.
1493 ApplyRegBankMapping ApplyBank(B, *this, MRI, &AMDGPU::VGPRRegBank);
1494
1495 const LLT S64 = LLT::scalar(SizeInBits: 64);
1496 // Shift the source operand so that extracted bits start at bit 0.
1497 auto ShiftOffset = Signed ? B.buildAShr(Dst: S64, Src0: SrcReg, Src1: OffsetReg)
1498 : B.buildLShr(Dst: S64, Src0: SrcReg, Src1: OffsetReg);
1499 auto UnmergeSOffset = B.buildUnmerge(Res: {S32, S32}, Op: ShiftOffset);
1500
1501 // A 64-bit bitfield extract uses the 32-bit bitfield extract instructions
1502 // if the width is a constant.
1503 if (auto ConstWidth = getIConstantVRegValWithLookThrough(VReg: WidthReg, MRI)) {
1504 // Depending on the constant width, use bitfield extract on either the low
1505 // or the high 32 bits of the shifted source.
1506 auto Zero = B.buildConstant(Res: S32, Val: 0);
1507 auto WidthImm = ConstWidth->Value.getZExtValue();
1508 if (WidthImm <= 32) {
1509 // Use bitfield extract on the low 32 bits of the source, then sign-extend
1510 // or clear the upper 32 bits.
1511 auto Extract =
1512 Signed ? B.buildSbfx(Dst: S32, Src: UnmergeSOffset.getReg(Idx: 0), LSB: Zero, Width: WidthReg)
1513 : B.buildUbfx(Dst: S32, Src: UnmergeSOffset.getReg(Idx: 0), LSB: Zero, Width: WidthReg);
1514 auto Extend =
1515 Signed ? B.buildAShr(Dst: S32, Src0: Extract, Src1: B.buildConstant(Res: S32, Val: 31)) : Zero;
1516 B.buildMergeLikeInstr(Res: DstReg, Ops: {Extract, Extend});
1517 } else {
1518 // Use bitfield extract on the high 32 bits of the source, and combine it
1519 // with the low 32 bits.
1520 auto UpperWidth = B.buildConstant(Res: S32, Val: WidthImm - 32);
1521 auto Extract =
1522 Signed
1523 ? B.buildSbfx(Dst: S32, Src: UnmergeSOffset.getReg(Idx: 1), LSB: Zero, Width: UpperWidth)
1524 : B.buildUbfx(Dst: S32, Src: UnmergeSOffset.getReg(Idx: 1), LSB: Zero, Width: UpperWidth);
1525 B.buildMergeLikeInstr(Res: DstReg, Ops: {UnmergeSOffset.getReg(Idx: 0), Extract});
1526 }
1527 MI.eraseFromParent();
1528 return true;
1529 }
1530
1531 // Expand to Src >> Offset << (64 - Width) >> (64 - Width) using 64-bit
1532 // operations.
1533 auto ExtShift = B.buildSub(Dst: S32, Src0: B.buildConstant(Res: S32, Val: 64), Src1: WidthReg);
1534 auto SignBit = B.buildShl(Dst: S64, Src0: ShiftOffset, Src1: ExtShift);
1535 if (Signed)
1536 B.buildAShr(Dst: DstReg, Src0: SignBit, Src1: ExtShift);
1537 else
1538 B.buildLShr(Dst: DstReg, Src0: SignBit, Src1: ExtShift);
1539 MI.eraseFromParent();
1540 return true;
1541 }
1542
1543 // The scalar form packs the offset and width in a single operand.
1544
1545 ApplyRegBankMapping ApplyBank(B, *this, MRI, &AMDGPU::SGPRRegBank);
1546
1547 // Ensure the high bits are clear to insert the offset.
1548 auto OffsetMask = B.buildConstant(Res: S32, Val: maskTrailingOnes<unsigned>(N: 6));
1549 auto ClampOffset = B.buildAnd(Dst: S32, Src0: OffsetReg, Src1: OffsetMask);
1550
1551 // The left shift zeros out the low bits, so don't bother clamping the input width.
1552 auto ShiftWidth = B.buildShl(Dst: S32, Src0: WidthReg, Src1: B.buildConstant(Res: S32, Val: 16));
1553
1554 // Pack the offset and width of the BFE into the format expected by
1555 // S_BFE_I32 / S_BFE_U32: in the second source operand, bits [5:0] contain
1556 // the offset and bits [22:16] the width.
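// For example, an offset of 3 and a width of 5 pack to
// (3 & 0x3f) | (5 << 16) == 0x50003.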
1557 auto MergedInputs = B.buildOr(Dst: S32, Src0: ClampOffset, Src1: ShiftWidth);
1558
1559 // TODO: It might be worth using a pseudo here to avoid scc clobber and
1560 // register class constraints.
1561 unsigned Opc = Ty == S32 ? (Signed ? AMDGPU::S_BFE_I32 : AMDGPU::S_BFE_U32) :
1562 (Signed ? AMDGPU::S_BFE_I64 : AMDGPU::S_BFE_U64);
1563
1564 auto MIB = B.buildInstr(Opc, DstOps: {DstReg}, SrcOps: {SrcReg, MergedInputs});
1565 if (!constrainSelectedInstRegOperands(I&: *MIB, TII: *TII, TRI: *TRI, RBI: *this))
1566 llvm_unreachable("failed to constrain BFE");
1567
1568 MI.eraseFromParent();
1569 return true;
1570}
1571
1572bool AMDGPURegisterBankInfo::applyMappingMAD_64_32(
1573 MachineIRBuilder &B, const OperandsMapper &OpdMapper) const {
1574 MachineInstr &MI = OpdMapper.getMI();
1575 MachineRegisterInfo &MRI = OpdMapper.getMRI();
1576
1577 // Insert basic copies.
1578 applyDefaultMapping(OpdMapper);
1579
1580 Register Dst0 = MI.getOperand(i: 0).getReg();
1581 Register Dst1 = MI.getOperand(i: 1).getReg();
1582 Register Src0 = MI.getOperand(i: 2).getReg();
1583 Register Src1 = MI.getOperand(i: 3).getReg();
1584 Register Src2 = MI.getOperand(i: 4).getReg();
1585
1586 if (MRI.getRegBankOrNull(Reg: Src0) == &AMDGPU::VGPRRegBank)
1587 return true;
1588
1589 bool IsUnsigned = MI.getOpcode() == AMDGPU::G_AMDGPU_MAD_U64_U32;
1590 LLT S1 = LLT::scalar(SizeInBits: 1);
1591 LLT S32 = LLT::scalar(SizeInBits: 32);
1592
1593 bool DstOnValu = MRI.getRegBankOrNull(Reg: Src2) == &AMDGPU::VGPRRegBank;
1594 bool Accumulate = true;
1595
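// On the SALU path, a known-zero addend lets us skip the 64-bit add below.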
1596 if (!DstOnValu) {
1597 if (mi_match(R: Src2, MRI, P: m_ZeroInt()))
1598 Accumulate = false;
1599 }
1600
1601 // Keep the multiplication on the SALU.
1602 Register DstHi;
1603 Register DstLo = B.buildMul(Dst: S32, Src0, Src1).getReg(Idx: 0);
1604 bool MulHiInVgpr = false;
1605
1606 MRI.setRegBank(Reg: DstLo, RegBank: AMDGPU::SGPRRegBank);
1607
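// Compute the high half of the 32x32 multiply. Without a scalar s_mul_hi we
// have to do it on the VALU and, if the final result is scalar, read the
// value back into an SGPR with a readfirstlane.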
1608 if (Subtarget.hasSMulHi()) {
1609 DstHi = IsUnsigned ? B.buildUMulH(Dst: S32, Src0, Src1).getReg(Idx: 0)
1610 : B.buildSMulH(Dst: S32, Src0, Src1).getReg(Idx: 0);
1611 MRI.setRegBank(Reg: DstHi, RegBank: AMDGPU::SGPRRegBank);
1612 } else {
1613 Register VSrc0 = B.buildCopy(Res: S32, Op: Src0).getReg(Idx: 0);
1614 Register VSrc1 = B.buildCopy(Res: S32, Op: Src1).getReg(Idx: 0);
1615
1616 MRI.setRegBank(Reg: VSrc0, RegBank: AMDGPU::VGPRRegBank);
1617 MRI.setRegBank(Reg: VSrc1, RegBank: AMDGPU::VGPRRegBank);
1618
1619 DstHi = IsUnsigned ? B.buildUMulH(Dst: S32, Src0: VSrc0, Src1: VSrc1).getReg(Idx: 0)
1620 : B.buildSMulH(Dst: S32, Src0: VSrc0, Src1: VSrc1).getReg(Idx: 0);
1621 MRI.setRegBank(Reg: DstHi, RegBank: AMDGPU::VGPRRegBank);
1622
1623 if (!DstOnValu) {
1624 DstHi = buildReadFirstLane(B, MRI, Src: DstHi);
1625 } else {
1626 MulHiInVgpr = true;
1627 }
1628 }
1629
1630 // Accumulate and produce the "carry-out" bit.
1631 //
1632 // The "carry-out" is defined as bit 64 of the result when computed as a
1633 // big integer. For unsigned multiply-add, this matches the usual definition
1634 // of carry-out. For signed multiply-add, bit 64 is the sign bit of the
1635 // result, which is determined as:
1636 // sign(Src0 * Src1) + sign(Src2) + carry-out from unsigned 64-bit add
1637 LLT CarryType = DstOnValu ? S1 : S32;
1638 const RegisterBank &CarryBank =
1639 DstOnValu ? AMDGPU::VCCRegBank : AMDGPU::SGPRRegBank;
1640 const RegisterBank &DstBank =
1641 DstOnValu ? AMDGPU::VGPRRegBank : AMDGPU::SGPRRegBank;
1642 Register Carry;
1643 Register Zero;
1644
1645 if (!IsUnsigned) {
1646 Zero = B.buildConstant(Res: S32, Val: 0).getReg(Idx: 0);
1647 MRI.setRegBank(Reg: Zero,
1648 RegBank: MulHiInVgpr ? AMDGPU::VGPRRegBank : AMDGPU::SGPRRegBank);
1649
1650 Carry = B.buildICmp(Pred: CmpInst::ICMP_SLT, Res: MulHiInVgpr ? S1 : S32, Op0: DstHi, Op1: Zero)
1651 .getReg(Idx: 0);
1652 MRI.setRegBank(Reg: Carry, RegBank: MulHiInVgpr ? AMDGPU::VCCRegBank
1653 : AMDGPU::SGPRRegBank);
1654
1655 if (DstOnValu && !MulHiInVgpr) {
1656 Carry = B.buildTrunc(Res: S1, Op: Carry).getReg(Idx: 0);
1657 MRI.setRegBank(Reg: Carry, RegBank: AMDGPU::VCCRegBank);
1658 }
1659 }
1660
1661 if (Accumulate) {
1662 if (DstOnValu) {
1663 DstLo = B.buildCopy(Res: S32, Op: DstLo).getReg(Idx: 0);
1664 DstHi = B.buildCopy(Res: S32, Op: DstHi).getReg(Idx: 0);
1665 MRI.setRegBank(Reg: DstLo, RegBank: AMDGPU::VGPRRegBank);
1666 MRI.setRegBank(Reg: DstHi, RegBank: AMDGPU::VGPRRegBank);
1667 }
1668
1669 auto Unmerge = B.buildUnmerge(Res: S32, Op: Src2);
1670 Register Src2Lo = Unmerge.getReg(Idx: 0);
1671 Register Src2Hi = Unmerge.getReg(Idx: 1);
1672 MRI.setRegBank(Reg: Src2Lo, RegBank: DstBank);
1673 MRI.setRegBank(Reg: Src2Hi, RegBank: DstBank);
1674
1675 if (!IsUnsigned) {
1676 auto Src2Sign = B.buildICmp(Pred: CmpInst::ICMP_SLT, Res: CarryType, Op0: Src2Hi, Op1: Zero);
1677 MRI.setRegBank(Reg: Src2Sign.getReg(Idx: 0), RegBank: CarryBank);
1678
1679 Carry = B.buildXor(Dst: CarryType, Src0: Carry, Src1: Src2Sign).getReg(Idx: 0);
1680 MRI.setRegBank(Reg: Carry, RegBank: CarryBank);
1681 }
1682
1683 auto AddLo = B.buildUAddo(Res: S32, CarryOut: CarryType, Op0: DstLo, Op1: Src2Lo);
1684 DstLo = AddLo.getReg(Idx: 0);
1685 Register CarryLo = AddLo.getReg(Idx: 1);
1686 MRI.setRegBank(Reg: DstLo, RegBank: DstBank);
1687 MRI.setRegBank(Reg: CarryLo, RegBank: CarryBank);
1688
1689 auto AddHi = B.buildUAdde(Res: S32, CarryOut: CarryType, Op0: DstHi, Op1: Src2Hi, CarryIn: CarryLo);
1690 DstHi = AddHi.getReg(Idx: 0);
1691 MRI.setRegBank(Reg: DstHi, RegBank: DstBank);
1692
1693 Register CarryHi = AddHi.getReg(Idx: 1);
1694 MRI.setRegBank(Reg: CarryHi, RegBank: CarryBank);
1695
1696 if (IsUnsigned) {
1697 Carry = CarryHi;
1698 } else {
1699 Carry = B.buildXor(Dst: CarryType, Src0: Carry, Src1: CarryHi).getReg(Idx: 0);
1700 MRI.setRegBank(Reg: Carry, RegBank: CarryBank);
1701 }
1702 } else {
1703 if (IsUnsigned) {
1704 Carry = B.buildConstant(Res: CarryType, Val: 0).getReg(Idx: 0);
1705 MRI.setRegBank(Reg: Carry, RegBank: CarryBank);
1706 }
1707 }
1708
1709 B.buildMergeLikeInstr(Res: Dst0, Ops: {DstLo, DstHi});
1710
1711 if (DstOnValu) {
1712 B.buildCopy(Res: Dst1, Op: Carry);
1713 } else {
1714 B.buildTrunc(Res: Dst1, Op: Carry);
1715 }
1716
1717 MI.eraseFromParent();
1718 return true;
1719}
1720
1721// Return a suitable opcode for extending the operands of Opc when widening.
1722static unsigned getExtendOp(unsigned Opc) {
1723 switch (Opc) {
1724 case TargetOpcode::G_ASHR:
1725 case TargetOpcode::G_SMIN:
1726 case TargetOpcode::G_SMAX:
1727 return TargetOpcode::G_SEXT;
1728 case TargetOpcode::G_LSHR:
1729 case TargetOpcode::G_UMIN:
1730 case TargetOpcode::G_UMAX:
1731 return TargetOpcode::G_ZEXT;
1732 default:
1733 return TargetOpcode::G_ANYEXT;
1734 }
1735}
1736
1737// Emit a legalized extension from <2 x s16> to 2 32-bit components, avoiding
1738// any illegal vector extend or unmerge operations.
1739static std::pair<Register, Register>
1740unpackV2S16ToS32(MachineIRBuilder &B, Register Src, unsigned ExtOpcode) {
1741 const LLT S32 = LLT::scalar(SizeInBits: 32);
1742 auto Bitcast = B.buildBitcast(Dst: S32, Src);
1743
1744 if (ExtOpcode == TargetOpcode::G_SEXT) {
1745 auto ExtLo = B.buildSExtInReg(Res: S32, Op: Bitcast, ImmOp: 16);
1746 auto ShiftHi = B.buildAShr(Dst: S32, Src0: Bitcast, Src1: B.buildConstant(Res: S32, Val: 16));
1747 return std::pair(ExtLo.getReg(Idx: 0), ShiftHi.getReg(Idx: 0));
1748 }
1749
1750 auto ShiftHi = B.buildLShr(Dst: S32, Src0: Bitcast, Src1: B.buildConstant(Res: S32, Val: 16));
1751 if (ExtOpcode == TargetOpcode::G_ZEXT) {
1752 auto ExtLo = B.buildAnd(Dst: S32, Src0: Bitcast, Src1: B.buildConstant(Res: S32, Val: 0xffff));
1753 return std::pair(ExtLo.getReg(Idx: 0), ShiftHi.getReg(Idx: 0));
1754 }
1755
1756 assert(ExtOpcode == TargetOpcode::G_ANYEXT);
1757 return std::pair(Bitcast.getReg(Idx: 0), ShiftHi.getReg(Idx: 0));
1758}
1759
1760 // For cases where only a single copy is inserted for matching register
1761 // banks, replace the register in the instruction operand with the copy.
1762static bool substituteSimpleCopyRegs(
1763 const AMDGPURegisterBankInfo::OperandsMapper &OpdMapper, unsigned OpIdx) {
1764 SmallVector<unsigned, 1> SrcReg(OpdMapper.getVRegs(OpIdx));
1765 if (!SrcReg.empty()) {
1766 assert(SrcReg.size() == 1);
1767 OpdMapper.getMI().getOperand(i: OpIdx).setReg(SrcReg[0]);
1768 return true;
1769 }
1770
1771 return false;
1772}
1773
1774 /// Handle the register layout difference for f16 images on some subtargets.
1775Register AMDGPURegisterBankInfo::handleD16VData(MachineIRBuilder &B,
1776 MachineRegisterInfo &MRI,
1777 Register Reg) const {
1778 if (!Subtarget.hasUnpackedD16VMem())
1779 return Reg;
1780
1781 const LLT S16 = LLT::scalar(SizeInBits: 16);
1782 LLT StoreVT = MRI.getType(Reg);
1783 if (!StoreVT.isVector() || StoreVT.getElementType() != S16)
1784 return Reg;
1785
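// With unpacked D16 VMEM, each 16-bit component occupies its own 32-bit
// register, so unmerge the s16 elements and re-merge them as a vector of s32
// values.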
1786 auto Unmerge = B.buildUnmerge(Res: S16, Op: Reg);
1787
1789 SmallVector<Register, 4> WideRegs;
1790 for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
1791 WideRegs.push_back(Elt: Unmerge.getReg(Idx: I));
1792
1793 const LLT S32 = LLT::scalar(SizeInBits: 32);
1794 int NumElts = StoreVT.getNumElements();
1795
1796 return B.buildMergeLikeInstr(Res: LLT::fixed_vector(NumElements: NumElts, ScalarTy: S32), Ops: WideRegs)
1797 .getReg(Idx: 0);
1798}
1799
1800static std::pair<Register, unsigned>
1801getBaseWithConstantOffset(MachineRegisterInfo &MRI, Register Reg) {
1802 int64_t Const;
1803 if (mi_match(R: Reg, MRI, P: m_ICst(Cst&: Const)))
1804 return std::pair(Register(), Const);
1805
1806 Register Base;
1807 if (mi_match(R: Reg, MRI, P: m_GAdd(L: m_Reg(R&: Base), R: m_ICst(Cst&: Const))))
1808 return std::pair(Base, Const);
1809
1810 // TODO: Handle G_OR used for add case
1811 return std::pair(Reg, 0);
1812}
1813
1814std::pair<Register, unsigned>
1815AMDGPURegisterBankInfo::splitBufferOffsets(MachineIRBuilder &B,
1816 Register OrigOffset) const {
1817 const unsigned MaxImm = SIInstrInfo::getMaxMUBUFImmOffset(ST: Subtarget);
1818 Register BaseReg;
1819 unsigned ImmOffset;
1820 const LLT S32 = LLT::scalar(SizeInBits: 32);
1821
1822 // TODO: Use AMDGPU::getBaseWithConstantOffset() instead.
1823 std::tie(args&: BaseReg, args&: ImmOffset) = getBaseWithConstantOffset(MRI&: *B.getMRI(),
1824 Reg: OrigOffset);
1825
1826 unsigned C1 = 0;
1827 if (ImmOffset != 0) {
1828 // If the immediate value is too big for the immoffset field, put only bits
1829 // that would normally fit in the immoffset field. The remaining value that
1830 // is copied/added for the voffset field is a large power of 2, and it
1831 // stands more chance of being CSEd with the copy/add for another similar
1832 // load/store.
1833 // However, do not do that rounding down if it leaves a negative value, as
1834 // it appears to be illegal to have a negative offset in the vgpr, even if
1835 // adding the immediate offset makes it positive.
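// For example, assuming a 4095-byte maximum immediate, an incoming constant
// offset of 8200 is split into an overflow of 8192 (added into the base
// register) and an immediate offset of 8.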
1836 unsigned Overflow = ImmOffset & ~MaxImm;
1837 ImmOffset -= Overflow;
1838 if ((int32_t)Overflow < 0) {
1839 Overflow += ImmOffset;
1840 ImmOffset = 0;
1841 }
1842
1843 C1 = ImmOffset;
1844 if (Overflow != 0) {
1845 if (!BaseReg)
1846 BaseReg = B.buildConstant(Res: S32, Val: Overflow).getReg(Idx: 0);
1847 else {
1848 auto OverflowVal = B.buildConstant(Res: S32, Val: Overflow);
1849 BaseReg = B.buildAdd(Dst: S32, Src0: BaseReg, Src1: OverflowVal).getReg(Idx: 0);
1850 }
1851 }
1852 }
1853
1854 if (!BaseReg)
1855 BaseReg = B.buildConstant(Res: S32, Val: 0).getReg(Idx: 0);
1856
1857 return {BaseReg, C1};
1858}
1859
1860bool AMDGPURegisterBankInfo::buildVCopy(MachineIRBuilder &B, Register DstReg,
1861 Register SrcReg) const {
1862 MachineRegisterInfo &MRI = *B.getMRI();
1863 LLT SrcTy = MRI.getType(Reg: SrcReg);
1864 if (SrcTy.getSizeInBits() == 32) {
1865 // Use a v_mov_b32 here to make the exec dependency explicit.
1866 B.buildInstr(Opcode: AMDGPU::V_MOV_B32_e32)
1867 .addDef(RegNo: DstReg)
1868 .addUse(RegNo: SrcReg);
1869 return constrainGenericRegister(Reg: DstReg, RC: AMDGPU::VGPR_32RegClass, MRI) &&
1870 constrainGenericRegister(Reg: SrcReg, RC: AMDGPU::SReg_32RegClass, MRI);
1871 }
1872
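// For 64-bit values, copy each 32-bit half with a v_mov_b32 and reassemble
// the result with a REG_SEQUENCE.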
1873 Register TmpReg0 = MRI.createVirtualRegister(RegClass: &AMDGPU::VGPR_32RegClass);
1874 Register TmpReg1 = MRI.createVirtualRegister(RegClass: &AMDGPU::VGPR_32RegClass);
1875
1876 B.buildInstr(Opcode: AMDGPU::V_MOV_B32_e32)
1877 .addDef(RegNo: TmpReg0)
1878 .addUse(RegNo: SrcReg, Flags: {}, SubReg: AMDGPU::sub0);
1879 B.buildInstr(Opcode: AMDGPU::V_MOV_B32_e32)
1880 .addDef(RegNo: TmpReg1)
1881 .addUse(RegNo: SrcReg, Flags: {}, SubReg: AMDGPU::sub1);
1882 B.buildInstr(Opcode: AMDGPU::REG_SEQUENCE)
1883 .addDef(RegNo: DstReg)
1884 .addUse(RegNo: TmpReg0)
1885 .addImm(Val: AMDGPU::sub0)
1886 .addUse(RegNo: TmpReg1)
1887 .addImm(Val: AMDGPU::sub1);
1888
1889 return constrainGenericRegister(Reg: SrcReg, RC: AMDGPU::SReg_64RegClass, MRI) &&
1890 constrainGenericRegister(Reg: DstReg, RC: AMDGPU::VReg_64RegClass, MRI);
1891}
1892
1893/// Utility function for pushing dynamic vector indexes with a constant offset
1894/// into waterfall loops.
1895static void reinsertVectorIndexAdd(MachineIRBuilder &B,
1896 MachineInstr &IdxUseInstr,
1897 unsigned OpIdx,
1898 unsigned ConstOffset) {
1899 MachineRegisterInfo &MRI = *B.getMRI();
1900 const LLT S32 = LLT::scalar(SizeInBits: 32);
1901 Register WaterfallIdx = IdxUseInstr.getOperand(i: OpIdx).getReg();
1902 B.setInsertPt(MBB&: *IdxUseInstr.getParent(), II: IdxUseInstr.getIterator());
1903
1904 auto MaterializedOffset = B.buildConstant(Res: S32, Val: ConstOffset);
1905
1906 auto Add = B.buildAdd(Dst: S32, Src0: WaterfallIdx, Src1: MaterializedOffset);
1907 MRI.setRegBank(Reg: MaterializedOffset.getReg(Idx: 0), RegBank: AMDGPU::SGPRRegBank);
1908 MRI.setRegBank(Reg: Add.getReg(Idx: 0), RegBank: AMDGPU::SGPRRegBank);
1909 IdxUseInstr.getOperand(i: OpIdx).setReg(Add.getReg(Idx: 0));
1910}
1911
1912/// Implement extending a 32-bit value to a 64-bit value. \p Lo32Reg is the
1913/// original 32-bit source value (to be inserted in the low part of the combined
1914/// 64-bit result), and \p Hi32Reg is the high half of the combined 64-bit
1915/// value.
1916static void extendLow32IntoHigh32(MachineIRBuilder &B,
1917 Register Hi32Reg, Register Lo32Reg,
1918 unsigned ExtOpc,
1919 const RegisterBank &RegBank,
1920 bool IsBooleanSrc = false) {
1921 if (ExtOpc == AMDGPU::G_ZEXT) {
1922 B.buildConstant(Res: Hi32Reg, Val: 0);
1923 } else if (ExtOpc == AMDGPU::G_SEXT) {
1924 if (IsBooleanSrc) {
1925 // If we know the original source was an s1, the high half is the same as
1926 // the low.
1927 B.buildCopy(Res: Hi32Reg, Op: Lo32Reg);
1928 } else {
1929 // Replicate sign bit from 32-bit extended part.
1930 auto ShiftAmt = B.buildConstant(Res: LLT::scalar(SizeInBits: 32), Val: 31);
1931 B.getMRI()->setRegBank(Reg: ShiftAmt.getReg(Idx: 0), RegBank);
1932 B.buildAShr(Dst: Hi32Reg, Src0: Lo32Reg, Src1: ShiftAmt);
1933 }
1934 } else {
1935 assert(ExtOpc == AMDGPU::G_ANYEXT && "not an integer extension");
1936 B.buildUndef(Res: Hi32Reg);
1937 }
1938}
1939
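// Lower a dynamically indexed extract_vector_elt into a chain of compares and
// selects over the vector elements when
// SITargetLowering::shouldExpandVectorDynExt decides the expansion is
// profitable.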
1940bool AMDGPURegisterBankInfo::foldExtractEltToCmpSelect(
1941 MachineIRBuilder &B, MachineInstr &MI,
1942 const OperandsMapper &OpdMapper) const {
1943 MachineRegisterInfo &MRI = *B.getMRI();
1944
1945 Register VecReg = MI.getOperand(i: 1).getReg();
1946 Register Idx = MI.getOperand(i: 2).getReg();
1947
1948 const RegisterBank &IdxBank =
1949 *OpdMapper.getInstrMapping().getOperandMapping(i: 2).BreakDown[0].RegBank;
1950
1951 bool IsDivergentIdx = IdxBank != AMDGPU::SGPRRegBank;
1952
1953 LLT VecTy = MRI.getType(Reg: VecReg);
1954 unsigned EltSize = VecTy.getScalarSizeInBits();
1955 unsigned NumElem = VecTy.getNumElements();
1956
1957 if (!SITargetLowering::shouldExpandVectorDynExt(EltSize, NumElem,
1958 IsDivergentIdx, Subtarget: &Subtarget))
1959 return false;
1960
1961 LLT S32 = LLT::scalar(SizeInBits: 32);
1962
1963 const RegisterBank &DstBank =
1964 *OpdMapper.getInstrMapping().getOperandMapping(i: 0).BreakDown[0].RegBank;
1965 const RegisterBank &SrcBank =
1966 *OpdMapper.getInstrMapping().getOperandMapping(i: 1).BreakDown[0].RegBank;
1967
1968 const RegisterBank &CCBank =
1969 (DstBank == AMDGPU::SGPRRegBank &&
1970 SrcBank == AMDGPU::SGPRRegBank &&
1971 IdxBank == AMDGPU::SGPRRegBank) ? AMDGPU::SGPRRegBank
1972 : AMDGPU::VCCRegBank;
1973 LLT CCTy = (CCBank == AMDGPU::SGPRRegBank) ? S32 : LLT::scalar(SizeInBits: 1);
1974
1975 if (CCBank == AMDGPU::VCCRegBank && IdxBank == AMDGPU::SGPRRegBank) {
1976 Idx = B.buildCopy(Res: S32, Op: Idx)->getOperand(i: 0).getReg();
1977 MRI.setRegBank(Reg: Idx, RegBank: AMDGPU::VGPRRegBank);
1978 }
1979
1980 LLT EltTy = VecTy.getScalarType();
1981 SmallVector<Register, 2> DstRegs(OpdMapper.getVRegs(OpIdx: 0));
1982 unsigned NumLanes = DstRegs.size();
1983 if (!NumLanes)
1984 NumLanes = 1;
1985 else
1986 EltTy = MRI.getType(Reg: DstRegs[0]);
1987
1988 auto UnmergeToEltTy = B.buildUnmerge(Res: EltTy, Op: VecReg);
1989 SmallVector<Register, 2> Res(NumLanes);
1990 for (unsigned L = 0; L < NumLanes; ++L)
1991 Res[L] = UnmergeToEltTy.getReg(Idx: L);
1992
1993 for (unsigned I = 1; I < NumElem; ++I) {
1994 auto IC = B.buildConstant(Res: S32, Val: I);
1995 MRI.setRegBank(Reg: IC->getOperand(i: 0).getReg(), RegBank: AMDGPU::SGPRRegBank);
1996 auto Cmp = B.buildICmp(Pred: CmpInst::ICMP_EQ, Res: CCTy, Op0: Idx, Op1: IC);
1997 MRI.setRegBank(Reg: Cmp->getOperand(i: 0).getReg(), RegBank: CCBank);
1998
1999 for (unsigned L = 0; L < NumLanes; ++L) {
2000 auto S = B.buildSelect(Res: EltTy, Tst: Cmp,
2001 Op0: UnmergeToEltTy.getReg(Idx: I * NumLanes + L), Op1: Res[L]);
2002
2003 for (unsigned N : { 0, 2, 3 })
2004 MRI.setRegBank(Reg: S->getOperand(i: N).getReg(), RegBank: DstBank);
2005
2006 Res[L] = S->getOperand(i: 0).getReg();
2007 }
2008 }
2009
2010 for (unsigned L = 0; L < NumLanes; ++L) {
2011 Register DstReg = (NumLanes == 1) ? MI.getOperand(i: 0).getReg() : DstRegs[L];
2012 B.buildCopy(Res: DstReg, Op: Res[L]);
2013 MRI.setRegBank(Reg: DstReg, RegBank: DstBank);
2014 }
2015
2016 MRI.setRegBank(Reg: MI.getOperand(i: 0).getReg(), RegBank: DstBank);
2017 MI.eraseFromParent();
2018
2019 return true;
2020}
2021
2022// Insert a cross regbank copy for a register if it already has a bank that
2023// differs from the one we want to set.
2024static Register constrainRegToBank(MachineRegisterInfo &MRI,
2025 MachineIRBuilder &B, Register &Reg,
2026 const RegisterBank &Bank) {
2027 const RegisterBank *CurrBank = MRI.getRegBankOrNull(Reg);
2028 if (CurrBank && *CurrBank != Bank) {
2029 Register Copy = B.buildCopy(Res: MRI.getType(Reg), Op: Reg).getReg(Idx: 0);
2030 MRI.setRegBank(Reg: Copy, RegBank: Bank);
2031 return Copy;
2032 }
2033
2034 MRI.setRegBank(Reg, RegBank: Bank);
2035 return Reg;
2036}
2037
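// Lower a dynamically indexed insert_vector_elt the same way: compare the
// index against each element position, select between the original element
// and the inserted value, and rebuild the vector.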
2038bool AMDGPURegisterBankInfo::foldInsertEltToCmpSelect(
2039 MachineIRBuilder &B, MachineInstr &MI,
2040 const OperandsMapper &OpdMapper) const {
2041
2042 MachineRegisterInfo &MRI = *B.getMRI();
2043 Register VecReg = MI.getOperand(i: 1).getReg();
2044 Register Idx = MI.getOperand(i: 3).getReg();
2045
2046 const RegisterBank &IdxBank =
2047 *OpdMapper.getInstrMapping().getOperandMapping(i: 3).BreakDown[0].RegBank;
2048
2049 bool IsDivergentIdx = IdxBank != AMDGPU::SGPRRegBank;
2050
2051 LLT VecTy = MRI.getType(Reg: VecReg);
2052 unsigned EltSize = VecTy.getScalarSizeInBits();
2053 unsigned NumElem = VecTy.getNumElements();
2054
2055 if (!SITargetLowering::shouldExpandVectorDynExt(EltSize, NumElem,
2056 IsDivergentIdx, Subtarget: &Subtarget))
2057 return false;
2058
2059 LLT S32 = LLT::scalar(SizeInBits: 32);
2060
2061 const RegisterBank &DstBank =
2062 *OpdMapper.getInstrMapping().getOperandMapping(i: 0).BreakDown[0].RegBank;
2063 const RegisterBank &SrcBank =
2064 *OpdMapper.getInstrMapping().getOperandMapping(i: 1).BreakDown[0].RegBank;
2065 const RegisterBank &InsBank =
2066 *OpdMapper.getInstrMapping().getOperandMapping(i: 2).BreakDown[0].RegBank;
2067
2068 const RegisterBank &CCBank =
2069 (DstBank == AMDGPU::SGPRRegBank &&
2070 SrcBank == AMDGPU::SGPRRegBank &&
2071 InsBank == AMDGPU::SGPRRegBank &&
2072 IdxBank == AMDGPU::SGPRRegBank) ? AMDGPU::SGPRRegBank
2073 : AMDGPU::VCCRegBank;
2074 LLT CCTy = (CCBank == AMDGPU::SGPRRegBank) ? S32 : LLT::scalar(SizeInBits: 1);
2075
2076 if (CCBank == AMDGPU::VCCRegBank && IdxBank == AMDGPU::SGPRRegBank) {
2077 Idx = B.buildCopy(Res: S32, Op: Idx)->getOperand(i: 0).getReg();
2078 MRI.setRegBank(Reg: Idx, RegBank: AMDGPU::VGPRRegBank);
2079 }
2080
2081 LLT EltTy = VecTy.getScalarType();
2082 SmallVector<Register, 2> InsRegs(OpdMapper.getVRegs(OpIdx: 2));
2083 unsigned NumLanes = InsRegs.size();
2084 if (!NumLanes) {
2085 NumLanes = 1;
2086 InsRegs.push_back(Elt: MI.getOperand(i: 2).getReg());
2087 } else {
2088 EltTy = MRI.getType(Reg: InsRegs[0]);
2089 }
2090
2091 auto UnmergeToEltTy = B.buildUnmerge(Res: EltTy, Op: VecReg);
2092 SmallVector<Register, 16> Ops(NumElem * NumLanes);
2093
2094 for (unsigned I = 0; I < NumElem; ++I) {
2095 auto IC = B.buildConstant(Res: S32, Val: I);
2096 MRI.setRegBank(Reg: IC->getOperand(i: 0).getReg(), RegBank: AMDGPU::SGPRRegBank);
2097 auto Cmp = B.buildICmp(Pred: CmpInst::ICMP_EQ, Res: CCTy, Op0: Idx, Op1: IC);
2098 MRI.setRegBank(Reg: Cmp->getOperand(i: 0).getReg(), RegBank: CCBank);
2099
2100 for (unsigned L = 0; L < NumLanes; ++L) {
2101 Register Op0 = constrainRegToBank(MRI, B, Reg&: InsRegs[L], Bank: DstBank);
2102 Register Op1 = UnmergeToEltTy.getReg(Idx: I * NumLanes + L);
2103 Op1 = constrainRegToBank(MRI, B, Reg&: Op1, Bank: DstBank);
2104
2105 Register Select = B.buildSelect(Res: EltTy, Tst: Cmp, Op0, Op1).getReg(Idx: 0);
2106 MRI.setRegBank(Reg: Select, RegBank: DstBank);
2107
2108 Ops[I * NumLanes + L] = Select;
2109 }
2110 }
2111
2112 LLT MergeTy = LLT::fixed_vector(NumElements: Ops.size(), ScalarTy: EltTy);
2113 if (MergeTy == MRI.getType(Reg: MI.getOperand(i: 0).getReg())) {
2114 B.buildBuildVector(Res: MI.getOperand(i: 0), Ops);
2115 } else {
2116 auto Vec = B.buildBuildVector(Res: MergeTy, Ops);
2117 MRI.setRegBank(Reg: Vec->getOperand(i: 0).getReg(), RegBank: DstBank);
2118 B.buildBitcast(Dst: MI.getOperand(i: 0).getReg(), Src: Vec);
2119 }
2120
2121 MRI.setRegBank(Reg: MI.getOperand(i: 0).getReg(), RegBank: DstBank);
2122 MI.eraseFromParent();
2123
2124 return true;
2125}
2126
2127// Break s_mul_u64 into 32-bit vector operations.
2128void AMDGPURegisterBankInfo::applyMappingSMULU64(
2129 MachineIRBuilder &B, const OperandsMapper &OpdMapper) const {
2130 SmallVector<Register, 2> DefRegs(OpdMapper.getVRegs(OpIdx: 0));
2131 SmallVector<Register, 2> Src0Regs(OpdMapper.getVRegs(OpIdx: 1));
2132 SmallVector<Register, 2> Src1Regs(OpdMapper.getVRegs(OpIdx: 2));
2133
2134 // All inputs are SGPRs, nothing special to do.
2135 if (DefRegs.empty()) {
2136 assert(Src0Regs.empty() && Src1Regs.empty());
2137 applyDefaultMapping(OpdMapper);
2138 return;
2139 }
2140
2141 assert(DefRegs.size() == 2);
2142 assert(Src0Regs.size() == Src1Regs.size() &&
2143 (Src0Regs.empty() || Src0Regs.size() == 2));
2144
2145 MachineRegisterInfo &MRI = OpdMapper.getMRI();
2146 MachineInstr &MI = OpdMapper.getMI();
2147 Register DstReg = MI.getOperand(i: 0).getReg();
2148 LLT HalfTy = LLT::scalar(SizeInBits: 32);
2149
2150 // Depending on where the source registers came from, the generic code may
2151 // have decided to split the inputs already or not. If not, we still need to
2152 // extract the values.
2153
2154 if (Src0Regs.empty())
2155 split64BitValueForMapping(B, Regs&: Src0Regs, HalfTy, Reg: MI.getOperand(i: 1).getReg());
2156 else
2157 setRegsToType(MRI, Regs: Src0Regs, NewTy: HalfTy);
2158
2159 if (Src1Regs.empty())
2160 split64BitValueForMapping(B, Regs&: Src1Regs, HalfTy, Reg: MI.getOperand(i: 2).getReg());
2161 else
2162 setRegsToType(MRI, Regs: Src1Regs, NewTy: HalfTy);
2163
2164 setRegsToType(MRI, Regs: DefRegs, NewTy: HalfTy);
2165
2166 // The multiplication is done as follows:
2167 //
2168 // Op1H Op1L
2169 // * Op0H Op0L
2170 // --------------------
2171 // Op1H*Op0L Op1L*Op0L
2172 // + Op1H*Op0H Op1L*Op0H
2173 // -----------------------------------------
2174 // (Op1H*Op0L + Op1L*Op0H + carry) Op1L*Op0L
2175 //
2176 // We drop Op1H*Op0H because the result of the multiplication is a 64-bit
2177 // value and that would overflow.
2178 // The low 32-bit value is Op1L*Op0L.
2179 // The high 32-bit value is Op1H*Op0L + Op1L*Op0H + carry (from
2180 // Op1L*Op0L).
2181
2182 ApplyRegBankMapping ApplyBank(B, *this, MRI, &AMDGPU::VGPRRegBank);
2183
2184 Register Hi = B.buildUMulH(Dst: HalfTy, Src0: Src0Regs[0], Src1: Src1Regs[0]).getReg(Idx: 0);
2185 Register MulLoHi = B.buildMul(Dst: HalfTy, Src0: Src0Regs[0], Src1: Src1Regs[1]).getReg(Idx: 0);
2186 Register Add = B.buildAdd(Dst: HalfTy, Src0: Hi, Src1: MulLoHi).getReg(Idx: 0);
2187 Register MulHiLo = B.buildMul(Dst: HalfTy, Src0: Src0Regs[1], Src1: Src1Regs[0]).getReg(Idx: 0);
2188 B.buildAdd(Dst: DefRegs[1], Src0: Add, Src1: MulHiLo);
2189 B.buildMul(Dst: DefRegs[0], Src0: Src0Regs[0], Src1: Src1Regs[0]);
2190
2191 MRI.setRegBank(Reg: DstReg, RegBank: AMDGPU::VGPRRegBank);
2192 MI.eraseFromParent();
2193}
2194
2195void AMDGPURegisterBankInfo::applyMappingImpl(
2196 MachineIRBuilder &B, const OperandsMapper &OpdMapper) const {
2197 MachineInstr &MI = OpdMapper.getMI();
2198 B.setInstrAndDebugLoc(MI);
2199 unsigned Opc = MI.getOpcode();
2200 MachineRegisterInfo &MRI = OpdMapper.getMRI();
2201 switch (Opc) {
2202 case AMDGPU::G_CONSTANT:
2203 case AMDGPU::G_IMPLICIT_DEF: {
2204 Register DstReg = MI.getOperand(i: 0).getReg();
2205 LLT DstTy = MRI.getType(Reg: DstReg);
2206 if (DstTy != LLT::scalar(SizeInBits: 1))
2207 break;
2208
2209 const RegisterBank *DstBank =
2210 OpdMapper.getInstrMapping().getOperandMapping(i: 0).BreakDown[0].RegBank;
2211 if (DstBank == &AMDGPU::VCCRegBank)
2212 break;
2213 SmallVector<Register, 1> DefRegs(OpdMapper.getVRegs(OpIdx: 0));
2214 if (DefRegs.empty())
2215 DefRegs.push_back(Elt: DstReg);
2216
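// Rewrite the s1 def as a 32-bit def and truncate back down to s1 for its
// uses.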
2217 B.setInsertPt(MBB&: *MI.getParent(), II: ++MI.getIterator());
2218
2219 Register NewDstReg = MRI.createGenericVirtualRegister(Ty: LLT::scalar(SizeInBits: 32));
2220 LLVMContext &Ctx = B.getMF().getFunction().getContext();
2221
2222 MI.getOperand(i: 0).setReg(NewDstReg);
2223 if (Opc != AMDGPU::G_IMPLICIT_DEF) {
2224 uint64_t ConstVal = MI.getOperand(i: 1).getCImm()->getZExtValue();
2225 MI.getOperand(i: 1).setCImm(
2226 ConstantInt::get(Ty: IntegerType::getInt32Ty(C&: Ctx), V: ConstVal));
2227 }
2228
2229 MRI.setRegBank(Reg: NewDstReg, RegBank: *DstBank);
2230 B.buildTrunc(Res: DefRegs[0], Op: NewDstReg);
2231 return;
2232 }
2233 case AMDGPU::G_PHI: {
2234 Register DstReg = MI.getOperand(i: 0).getReg();
2235 LLT DstTy = MRI.getType(Reg: DstReg);
2236 if (DstTy != LLT::scalar(SizeInBits: 1))
2237 break;
2238
2239 const LLT S32 = LLT::scalar(SizeInBits: 32);
2240 const RegisterBank *DstBank =
2241 OpdMapper.getInstrMapping().getOperandMapping(i: 0).BreakDown[0].RegBank;
2242 if (DstBank == &AMDGPU::VCCRegBank) {
2243 applyDefaultMapping(OpdMapper);
2244 // The standard handling only considers the result register bank for
2245 // phis. For VCC, blindly inserting a copy when the phi is lowered will
2246 // produce an invalid copy. We can only copy with some kind of compare to
2247 // get a vector boolean result. Insert a register bank copy that will be
2248 // correctly lowered to a compare.
2249 for (unsigned I = 1, E = MI.getNumOperands(); I != E; I += 2) {
2250 Register SrcReg = MI.getOperand(i: I).getReg();
2251 const RegisterBank *SrcBank = getRegBank(Reg: SrcReg, MRI, TRI: *TRI);
2252
2253 if (SrcBank != &AMDGPU::VCCRegBank) {
2254 MachineBasicBlock *SrcMBB = MI.getOperand(i: I + 1).getMBB();
2255 B.setInsertPt(MBB&: *SrcMBB, II: SrcMBB->getFirstTerminator());
2256
2257 auto Copy = B.buildCopy(Res: LLT::scalar(SizeInBits: 1), Op: SrcReg);
2258 MRI.setRegBank(Reg: Copy.getReg(Idx: 0), RegBank: AMDGPU::VCCRegBank);
2259 MI.getOperand(i: I).setReg(Copy.getReg(Idx: 0));
2260 }
2261 }
2262
2263 return;
2264 }
2265
2266 // Phi handling is strange and only considers the bank of the destination.
2267 substituteSimpleCopyRegs(OpdMapper, OpIdx: 0);
2268
2269 // Promote SGPR/VGPR booleans to s32
2270 ApplyRegBankMapping ApplyBank(B, *this, MRI, DstBank);
2271 B.setInsertPt(MBB&: B.getMBB(), II: MI);
2272 LegalizerHelper Helper(B.getMF(), ApplyBank, B);
2273
2274 if (Helper.widenScalar(MI, TypeIdx: 0, WideTy: S32) != LegalizerHelper::Legalized)
2275 llvm_unreachable("widen scalar should have succeeded");
2276
2277 return;
2278 }
2279 case AMDGPU::G_FCMP:
2280 if (!Subtarget.hasSALUFloatInsts())
2281 break;
2282 [[fallthrough]];
2283 case AMDGPU::G_ICMP:
2284 case AMDGPU::G_UADDO:
2285 case AMDGPU::G_USUBO:
2286 case AMDGPU::G_UADDE:
2287 case AMDGPU::G_SADDE:
2288 case AMDGPU::G_USUBE:
2289 case AMDGPU::G_SSUBE: {
2290 unsigned BoolDstOp =
2291 (Opc == AMDGPU::G_ICMP || Opc == AMDGPU::G_FCMP) ? 0 : 1;
2292 Register DstReg = MI.getOperand(i: BoolDstOp).getReg();
2293
2294 const RegisterBank *DstBank =
2295 OpdMapper.getInstrMapping().getOperandMapping(i: 0).BreakDown[0].RegBank;
2296 if (DstBank != &AMDGPU::SGPRRegBank)
2297 break;
2298
2299 const bool HasCarryIn = MI.getNumOperands() == 5;
2300
2301 // If this is a scalar compare, promote the result to s32, as the selection
2302 // will end up using a copy to a 32-bit vreg.
2303 const LLT S32 = LLT::scalar(SizeInBits: 32);
2304 Register NewDstReg = MRI.createGenericVirtualRegister(Ty: S32);
2305 MRI.setRegBank(Reg: NewDstReg, RegBank: AMDGPU::SGPRRegBank);
2306 MI.getOperand(i: BoolDstOp).setReg(NewDstReg);
2307
2308 if (HasCarryIn) {
2309 Register NewSrcReg = MRI.createGenericVirtualRegister(Ty: S32);
2310 MRI.setRegBank(Reg: NewSrcReg, RegBank: AMDGPU::SGPRRegBank);
2311 B.buildZExt(Res: NewSrcReg, Op: MI.getOperand(i: 4).getReg());
2312 MI.getOperand(i: 4).setReg(NewSrcReg);
2313 }
2314
2315 MachineBasicBlock *MBB = MI.getParent();
2316 B.setInsertPt(MBB&: *MBB, II: std::next(x: MI.getIterator()));
2317
2318 // If we had a constrained VCC result register, a copy was inserted to VCC
2319 // from SGPR.
2320 SmallVector<Register, 1> DefRegs(OpdMapper.getVRegs(OpIdx: 0));
2321 if (DefRegs.empty())
2322 DefRegs.push_back(Elt: DstReg);
2323 B.buildTrunc(Res: DefRegs[0], Op: NewDstReg);
2324 return;
2325 }
2326 case AMDGPU::G_SELECT: {
2327 Register DstReg = MI.getOperand(i: 0).getReg();
2328 LLT DstTy = MRI.getType(Reg: DstReg);
2329
2330 SmallVector<Register, 1> CondRegs(OpdMapper.getVRegs(OpIdx: 1));
2331 if (CondRegs.empty())
2332 CondRegs.push_back(Elt: MI.getOperand(i: 1).getReg());
2333 else {
2334 assert(CondRegs.size() == 1);
2335 }
2336
2337 const RegisterBank *CondBank = getRegBank(Reg: CondRegs[0], MRI, TRI: *TRI);
2338 if (CondBank == &AMDGPU::SGPRRegBank) {
2339 const LLT S32 = LLT::scalar(SizeInBits: 32);
2340 Register NewCondReg = MRI.createGenericVirtualRegister(Ty: S32);
2341 MRI.setRegBank(Reg: NewCondReg, RegBank: AMDGPU::SGPRRegBank);
2342
2343 MI.getOperand(i: 1).setReg(NewCondReg);
2344 B.buildZExt(Res: NewCondReg, Op: CondRegs[0]);
2345 }
2346
2347 if (DstTy.getSizeInBits() != 64)
2348 break;
2349
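// Split a 64-bit select with a VGPR result into two 32-bit selects that share
// the same condition.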
2350 LLT HalfTy = getHalfSizedType(Ty: DstTy);
2351
2352 SmallVector<Register, 2> DefRegs(OpdMapper.getVRegs(OpIdx: 0));
2353 SmallVector<Register, 2> Src1Regs(OpdMapper.getVRegs(OpIdx: 2));
2354 SmallVector<Register, 2> Src2Regs(OpdMapper.getVRegs(OpIdx: 3));
2355
2356 // All inputs are SGPRs, nothing special to do.
2357 if (DefRegs.empty()) {
2358 assert(Src1Regs.empty() && Src2Regs.empty());
2359 break;
2360 }
2361
2362 if (Src1Regs.empty())
2363 split64BitValueForMapping(B, Regs&: Src1Regs, HalfTy, Reg: MI.getOperand(i: 2).getReg());
2364 else {
2365 setRegsToType(MRI, Regs: Src1Regs, NewTy: HalfTy);
2366 }
2367
2368 if (Src2Regs.empty())
2369 split64BitValueForMapping(B, Regs&: Src2Regs, HalfTy, Reg: MI.getOperand(i: 3).getReg());
2370 else
2371 setRegsToType(MRI, Regs: Src2Regs, NewTy: HalfTy);
2372
2373 setRegsToType(MRI, Regs: DefRegs, NewTy: HalfTy);
2374
2375 auto Flags = MI.getFlags();
2376 B.buildSelect(Res: DefRegs[0], Tst: CondRegs[0], Op0: Src1Regs[0], Op1: Src2Regs[0], Flags);
2377 B.buildSelect(Res: DefRegs[1], Tst: CondRegs[0], Op0: Src1Regs[1], Op1: Src2Regs[1], Flags);
2378
2379 MRI.setRegBank(Reg: DstReg, RegBank: AMDGPU::VGPRRegBank);
2380 MI.eraseFromParent();
2381 return;
2382 }
2383 case AMDGPU::G_BRCOND: {
2384 Register CondReg = MI.getOperand(i: 0).getReg();
2385 // FIXME: Should use legalizer helper, but should change bool ext type.
2386 const RegisterBank *CondBank =
2387 OpdMapper.getInstrMapping().getOperandMapping(i: 0).BreakDown[0].RegBank;
2388
2389 if (CondBank == &AMDGPU::SGPRRegBank) {
2390 const LLT S32 = LLT::scalar(SizeInBits: 32);
2391 Register NewCondReg = MRI.createGenericVirtualRegister(Ty: S32);
2392 MRI.setRegBank(Reg: NewCondReg, RegBank: AMDGPU::SGPRRegBank);
2393
2394 MI.getOperand(i: 0).setReg(NewCondReg);
2395 B.buildZExt(Res: NewCondReg, Op: CondReg);
2396 return;
2397 }
2398
2399 break;
2400 }
2401 case AMDGPU::G_AND:
2402 case AMDGPU::G_OR:
2403 case AMDGPU::G_XOR: {
2404 // 64-bit G_AND/G_OR/G_XOR is only available on the SALU, so split into 2
2405 // 32-bit ops if there is a VGPR input.
2406 Register DstReg = MI.getOperand(i: 0).getReg();
2407 LLT DstTy = MRI.getType(Reg: DstReg);
2408
2409 const RegisterBank *DstBank =
2410 OpdMapper.getInstrMapping().getOperandMapping(i: 0).BreakDown[0].RegBank;
2411
2412 if (DstTy.getSizeInBits() == 1) {
2413 if (DstBank == &AMDGPU::VCCRegBank)
2414 break;
2415
2416 MachineFunction *MF = MI.getMF();
2417 ApplyRegBankMapping ApplyBank(B, *this, MRI, DstBank);
2418 LegalizerHelper Helper(*MF, ApplyBank, B);
2419
2420 if (Helper.widenScalar(MI, TypeIdx: 0, WideTy: LLT::scalar(SizeInBits: 32)) !=
2421 LegalizerHelper::Legalized)
2422 llvm_unreachable("widen scalar should have succeeded");
2423 return;
2424 }
2425
2426 if (DstTy.getSizeInBits() == 16 && DstBank == &AMDGPU::SGPRRegBank) {
2427 const LLT S32 = LLT::scalar(SizeInBits: 32);
2428 MachineBasicBlock *MBB = MI.getParent();
2429 MachineFunction *MF = MBB->getParent();
2430 ApplyRegBankMapping ApplySALU(B, *this, MRI, &AMDGPU::SGPRRegBank);
2431 LegalizerHelper Helper(*MF, ApplySALU, B);
2432 // Widen to S32, but handle `G_XOR x, -1` differently. Legalizer widening
2433 // will use a G_ANYEXT to extend the -1 which prevents matching G_XOR -1
2434 // as "not".
2435 if (MI.getOpcode() == AMDGPU::G_XOR &&
2436 mi_match(R: MI.getOperand(i: 2).getReg(), MRI, P: m_SpecificICstOrSplat(RequestedValue: -1))) {
2437 Helper.widenScalarSrc(MI, WideTy: S32, OpIdx: 1, ExtOpcode: AMDGPU::G_ANYEXT);
2438 Helper.widenScalarSrc(MI, WideTy: S32, OpIdx: 2, ExtOpcode: AMDGPU::G_SEXT);
2439 Helper.widenScalarDst(MI, WideTy: S32);
2440 } else {
2441 if (Helper.widenScalar(MI, TypeIdx: 0, WideTy: S32) != LegalizerHelper::Legalized)
2442 llvm_unreachable("widen scalar should have succeeded");
2443 }
2444 return;
2445 }
2446
2447 if (DstTy.getSizeInBits() != 64)
2448 break;
2449
2450 LLT HalfTy = getHalfSizedType(Ty: DstTy);
2451 SmallVector<Register, 2> DefRegs(OpdMapper.getVRegs(OpIdx: 0));
2452 SmallVector<Register, 2> Src0Regs(OpdMapper.getVRegs(OpIdx: 1));
2453 SmallVector<Register, 2> Src1Regs(OpdMapper.getVRegs(OpIdx: 2));
2454
2455 // All inputs are SGPRs, nothing special to do.
2456 if (DefRegs.empty()) {
2457 assert(Src0Regs.empty() && Src1Regs.empty());
2458 break;
2459 }
2460
2461 assert(DefRegs.size() == 2);
2462 assert(Src0Regs.size() == Src1Regs.size() &&
2463 (Src0Regs.empty() || Src0Regs.size() == 2));
2464
2465 // Depending on where the source registers came from, the generic code may
2466 // have decided to split the inputs already or not. If not, we still need to
2467 // extract the values.
2468
2469 if (Src0Regs.empty())
2470 split64BitValueForMapping(B, Regs&: Src0Regs, HalfTy, Reg: MI.getOperand(i: 1).getReg());
2471 else
2472 setRegsToType(MRI, Regs: Src0Regs, NewTy: HalfTy);
2473
2474 if (Src1Regs.empty())
2475 split64BitValueForMapping(B, Regs&: Src1Regs, HalfTy, Reg: MI.getOperand(i: 2).getReg());
2476 else
2477 setRegsToType(MRI, Regs: Src1Regs, NewTy: HalfTy);
2478
2479 setRegsToType(MRI, Regs: DefRegs, NewTy: HalfTy);
2480
2481 auto Flags = MI.getFlags();
2482 B.buildInstr(Opc, DstOps: {DefRegs[0]}, SrcOps: {Src0Regs[0], Src1Regs[0]}, Flags);
2483 B.buildInstr(Opc, DstOps: {DefRegs[1]}, SrcOps: {Src0Regs[1], Src1Regs[1]}, Flags);
2484
2485 MRI.setRegBank(Reg: DstReg, RegBank: AMDGPU::VGPRRegBank);
2486 MI.eraseFromParent();
2487 return;
2488 }
2489 case AMDGPU::G_ABS: {
2490 Register SrcReg = MI.getOperand(i: 1).getReg();
2491 const RegisterBank *SrcBank = MRI.getRegBankOrNull(Reg: SrcReg);
2492
2493 // There is no VALU abs instruction so we need to replace it with a sub and
2494 // max combination.
2495 if (SrcBank && SrcBank == &AMDGPU::VGPRRegBank) {
2496 MachineFunction *MF = MI.getMF();
2497 ApplyRegBankMapping Apply(B, *this, MRI, &AMDGPU::VGPRRegBank);
2498 LegalizerHelper Helper(*MF, Apply, B);
2499
2500 if (Helper.lowerAbsToMaxNeg(MI) != LegalizerHelper::Legalized)
2501 llvm_unreachable("lowerAbsToMaxNeg should have succeeded");
2502 return;
2503 }
2504 [[fallthrough]];
2505 }
2506 case AMDGPU::G_ADD:
2507 case AMDGPU::G_SUB:
2508 case AMDGPU::G_MUL:
2509 case AMDGPU::G_SHL:
2510 case AMDGPU::G_LSHR:
2511 case AMDGPU::G_ASHR:
2512 case AMDGPU::G_SMIN:
2513 case AMDGPU::G_SMAX:
2514 case AMDGPU::G_UMIN:
2515 case AMDGPU::G_UMAX: {
2516 Register DstReg = MI.getOperand(i: 0).getReg();
2517 LLT DstTy = MRI.getType(Reg: DstReg);
2518
2519 // Special case for s_mul_u64. On subtargets without a 64-bit vector
2520 // multiply, we have to break s_mul_u64 down into 32-bit vector
2521 // multiplications.
2522 if (!Subtarget.hasVectorMulU64() && Opc == AMDGPU::G_MUL &&
2523 DstTy.getSizeInBits() == 64) {
2524 applyMappingSMULU64(B, OpdMapper);
2525 return;
2526 }
2527
2528 // 16-bit operations are VALU only, but can be promoted to 32-bit SALU.
2529 // Packed 16-bit operations need to be scalarized and promoted.
2530 if (DstTy != LLT::scalar(SizeInBits: 16) && DstTy != LLT::fixed_vector(NumElements: 2, ScalarSizeInBits: 16))
2531 break;
2532
2533 const RegisterBank *DstBank =
2534 OpdMapper.getInstrMapping().getOperandMapping(i: 0).BreakDown[0].RegBank;
2535 if (DstBank == &AMDGPU::VGPRRegBank)
2536 break;
2537
2538 const LLT S32 = LLT::scalar(SizeInBits: 32);
2539 MachineBasicBlock *MBB = MI.getParent();
2540 MachineFunction *MF = MBB->getParent();
2541 ApplyRegBankMapping ApplySALU(B, *this, MRI, &AMDGPU::SGPRRegBank);
2542
2543 if (DstTy.isVector() && Opc == AMDGPU::G_ABS) {
2544 Register WideSrcLo, WideSrcHi;
2545
2546 std::tie(args&: WideSrcLo, args&: WideSrcHi) =
2547 unpackV2S16ToS32(B, Src: MI.getOperand(i: 1).getReg(), ExtOpcode: TargetOpcode::G_SEXT);
2548 auto Lo = B.buildInstr(Opc: AMDGPU::G_ABS, DstOps: {S32}, SrcOps: {WideSrcLo});
2549 auto Hi = B.buildInstr(Opc: AMDGPU::G_ABS, DstOps: {S32}, SrcOps: {WideSrcHi});
2550 B.buildBuildVectorTrunc(Res: DstReg, Ops: {Lo.getReg(Idx: 0), Hi.getReg(Idx: 0)});
2551 MI.eraseFromParent();
2552 return;
2553 }
2554
2555 if (DstTy.isVector()) {
2556 Register WideSrc0Lo, WideSrc0Hi;
2557 Register WideSrc1Lo, WideSrc1Hi;
2558
2559 unsigned ExtendOp = getExtendOp(Opc: MI.getOpcode());
2560 std::tie(args&: WideSrc0Lo, args&: WideSrc0Hi)
2561 = unpackV2S16ToS32(B, Src: MI.getOperand(i: 1).getReg(), ExtOpcode: ExtendOp);
2562 std::tie(args&: WideSrc1Lo, args&: WideSrc1Hi)
2563 = unpackV2S16ToS32(B, Src: MI.getOperand(i: 2).getReg(), ExtOpcode: ExtendOp);
2564 auto Lo = B.buildInstr(Opc: MI.getOpcode(), DstOps: {S32}, SrcOps: {WideSrc0Lo, WideSrc1Lo});
2565 auto Hi = B.buildInstr(Opc: MI.getOpcode(), DstOps: {S32}, SrcOps: {WideSrc0Hi, WideSrc1Hi});
2566 B.buildBuildVectorTrunc(Res: DstReg, Ops: {Lo.getReg(Idx: 0), Hi.getReg(Idx: 0)});
2567 MI.eraseFromParent();
2568 } else {
2569 LegalizerHelper Helper(*MF, ApplySALU, B);
2570
2571 if (Helper.widenScalar(MI, TypeIdx: 0, WideTy: S32) != LegalizerHelper::Legalized)
2572 llvm_unreachable("widen scalar should have succeeded");
2573
2574 // FIXME: s16 shift amounts should be legal.
2575 if (Opc == AMDGPU::G_SHL || Opc == AMDGPU::G_LSHR ||
2576 Opc == AMDGPU::G_ASHR) {
2577 B.setInsertPt(MBB&: *MBB, II: MI.getIterator());
2578 if (Helper.widenScalar(MI, TypeIdx: 1, WideTy: S32) != LegalizerHelper::Legalized)
2579 llvm_unreachable("widen scalar should have succeeded");
2580 }
2581 }
2582
2583 return;
2584 }
2585 case AMDGPU::G_AMDGPU_S_MUL_I64_I32:
2586 case AMDGPU::G_AMDGPU_S_MUL_U64_U32: {
2587 // This is a special case for s_mul_u64. The G_AMDGPU_S_MUL_I64_I32 opcode
2588 // represents an s_mul_u64 operation where the 33 higher bits are
2589 // sign-extended, and the G_AMDGPU_S_MUL_U64_U32 opcode represents one where
2590 // the 32 higher bits are zero-extended. If scalar registers are selected,
2591 // both opcodes are lowered to s_mul_u64. If vector registers are selected,
2592 // they are lowered with a vector mad instruction.
2595
2596 // Insert basic copies.
2597 applyDefaultMapping(OpdMapper);
2598
2599 Register DstReg = MI.getOperand(i: 0).getReg();
2600 Register SrcReg0 = MI.getOperand(i: 1).getReg();
2601 Register SrcReg1 = MI.getOperand(i: 2).getReg();
2602 const LLT S32 = LLT::scalar(SizeInBits: 32);
2603 const LLT S64 = LLT::scalar(SizeInBits: 64);
2604 assert(MRI.getType(DstReg) == S64 && "This is a special case for s_mul_u64 "
2605 "that handles only 64-bit operands.");
2606 const RegisterBank *DstBank =
2607 OpdMapper.getInstrMapping().getOperandMapping(i: 0).BreakDown[0].RegBank;
2608
2609 // Replace G_AMDGPU_S_MUL_I64_I32 and G_AMDGPU_S_MUL_U64_U32
2610 // with s_mul_u64 operation.
2611 if (DstBank == &AMDGPU::SGPRRegBank) {
2612 MI.setDesc(TII->get(Opcode: AMDGPU::S_MUL_U64));
2613 MRI.setRegClass(Reg: DstReg, RC: &AMDGPU::SGPR_64RegClass);
2614 MRI.setRegClass(Reg: SrcReg0, RC: &AMDGPU::SGPR_64RegClass);
2615 MRI.setRegClass(Reg: SrcReg1, RC: &AMDGPU::SGPR_64RegClass);
2616 return;
2617 }
2618
2619 // Replace G_AMDGPU_S_MUL_I64_I32 and G_AMDGPU_S_MUL_U64_U32
2620 // with a vector mad.
2621 assert(MRI.getRegBankOrNull(DstReg) == &AMDGPU::VGPRRegBank &&
2622 "The destination operand should be in vector registers.");
2623
2624 // Extract the lower subregister from the first operand.
2625 Register Op0L = MRI.createVirtualRegister(RegClass: &AMDGPU::VGPR_32RegClass);
2626 MRI.setRegClass(Reg: Op0L, RC: &AMDGPU::VGPR_32RegClass);
2627 MRI.setType(VReg: Op0L, Ty: S32);
2628 B.buildTrunc(Res: Op0L, Op: SrcReg0);
2629
2630 // Extract the lower subregister from the second operand.
2631 Register Op1L = MRI.createVirtualRegister(RegClass: &AMDGPU::VGPR_32RegClass);
2632 MRI.setRegClass(Reg: Op1L, RC: &AMDGPU::VGPR_32RegClass);
2633 MRI.setType(VReg: Op1L, Ty: S32);
2634 B.buildTrunc(Res: Op1L, Op: SrcReg1);
2635
2636 unsigned NewOpc = Opc == AMDGPU::G_AMDGPU_S_MUL_U64_U32
2637 ? AMDGPU::G_AMDGPU_MAD_U64_U32
2638 : AMDGPU::G_AMDGPU_MAD_I64_I32;
2639
2640 MachineIRBuilder B(MI);
2641 Register Zero64 = B.buildConstant(Res: S64, Val: 0).getReg(Idx: 0);
2642 MRI.setRegClass(Reg: Zero64, RC: &AMDGPU::VReg_64RegClass);
2643 Register CarryOut = MRI.createVirtualRegister(RegClass: &AMDGPU::VReg_64RegClass);
2644 MRI.setRegClass(Reg: CarryOut, RC: &AMDGPU::VReg_64RegClass);
2645 B.buildInstr(Opc: NewOpc, DstOps: {DstReg, CarryOut}, SrcOps: {Op0L, Op1L, Zero64});
2646 MI.eraseFromParent();
2647 return;
2648 }
2649 case AMDGPU::G_SEXT_INREG: {
2650 SmallVector<Register, 2> SrcRegs(OpdMapper.getVRegs(OpIdx: 1));
2651 if (SrcRegs.empty())
2652 break; // Nothing to repair
2653
2654 const LLT S32 = LLT::scalar(SizeInBits: 32);
2655 ApplyRegBankMapping O(B, *this, MRI, &AMDGPU::VGPRRegBank);
2656
2657 // Don't use LegalizerHelper's narrowScalar. It produces unwanted G_SEXTs
2658 // we would need to further expand, and doesn't let us directly set the
2659 // result registers.
2660 SmallVector<Register, 2> DstRegs(OpdMapper.getVRegs(OpIdx: 0));
2661
2662 int Amt = MI.getOperand(i: 2).getImm();
2663 if (Amt <= 32) {
2664 // Downstream users have expectations for the high bit behavior, so freeze
2665 // incoming undefined bits.
2666 if (Amt == 32) {
2667 // The low bits are unchanged.
2668 B.buildFreeze(Dst: DstRegs[0], Src: SrcRegs[0]);
2669 } else {
2670 auto Freeze = B.buildFreeze(Dst: S32, Src: SrcRegs[0]);
2671 // Extend in the low bits and propagate the sign bit to the high half.
2672 B.buildSExtInReg(Res: DstRegs[0], Op: Freeze, ImmOp: Amt);
2673 }
2674
2675 B.buildAShr(Dst: DstRegs[1], Src0: DstRegs[0], Src1: B.buildConstant(Res: S32, Val: 31));
2676 } else {
2677 // The low bits are unchanged, and extend in the high bits.
2678 // No freeze required
2679 B.buildCopy(Res: DstRegs[0], Op: SrcRegs[0]);
2680 B.buildSExtInReg(Res: DstRegs[1], Op: SrcRegs[1], ImmOp: Amt - 32);
2681 }
2682
2683 Register DstReg = MI.getOperand(i: 0).getReg();
2684 MRI.setRegBank(Reg: DstReg, RegBank: AMDGPU::VGPRRegBank);
2685 MI.eraseFromParent();
2686 return;
2687 }
2688 case AMDGPU::G_CTPOP:
2689 case AMDGPU::G_BITREVERSE: {
2690 const RegisterBank *DstBank =
2691 OpdMapper.getInstrMapping().getOperandMapping(i: 0).BreakDown[0].RegBank;
2692 if (DstBank == &AMDGPU::SGPRRegBank)
2693 break;
2694
2695 Register SrcReg = MI.getOperand(i: 1).getReg();
2696 const LLT S32 = LLT::scalar(SizeInBits: 32);
2697 LLT Ty = MRI.getType(Reg: SrcReg);
2698 if (Ty == S32)
2699 break;
2700
2701 ApplyRegBankMapping ApplyVALU(B, *this, MRI, &AMDGPU::VGPRRegBank);
2702
2703 MachineFunction &MF = B.getMF();
2704 LegalizerHelper Helper(MF, ApplyVALU, B);
2705
2706 if (Helper.narrowScalar(MI, TypeIdx: 1, NarrowTy: S32) != LegalizerHelper::Legalized)
2707 llvm_unreachable("narrowScalar should have succeeded");
2708 return;
2709 }
2710 case AMDGPU::G_AMDGPU_FFBH_U32:
2711 case AMDGPU::G_AMDGPU_FFBL_B32:
2712 case AMDGPU::G_CTLZ_ZERO_UNDEF:
2713 case AMDGPU::G_CTTZ_ZERO_UNDEF: {
2714 const RegisterBank *DstBank =
2715 OpdMapper.getInstrMapping().getOperandMapping(i: 0).BreakDown[0].RegBank;
2716 if (DstBank == &AMDGPU::SGPRRegBank)
2717 break;
2718
2719 Register SrcReg = MI.getOperand(i: 1).getReg();
2720 const LLT S32 = LLT::scalar(SizeInBits: 32);
2721 LLT Ty = MRI.getType(Reg: SrcReg);
2722 if (Ty == S32)
2723 break;
2724
2725 // We can narrow this more efficiently than Helper can by using ffbh/ffbl
2726 // which return -1 when the input is zero:
2727 // (ctlz_zero_undef hi:lo) -> (umin (ffbh hi), (add (ffbh lo), 32))
2728 // (cttz_zero_undef hi:lo) -> (umin (add (ffbl hi), 32), (ffbl lo))
2729 // (ffbh hi:lo) -> (umin (ffbh hi), (uaddsat (ffbh lo), 32))
2730 // (ffbl hi:lo) -> (umin (uaddsat (ffbl hi), 32), (ffbl lo))
2731 ApplyRegBankMapping ApplyVALU(B, *this, MRI, &AMDGPU::VGPRRegBank);
2732 SmallVector<Register, 2> SrcRegs(OpdMapper.getVRegs(OpIdx: 1));
2733 unsigned NewOpc = Opc == AMDGPU::G_CTLZ_ZERO_UNDEF
2734 ? (unsigned)AMDGPU::G_AMDGPU_FFBH_U32
2735 : Opc == AMDGPU::G_CTTZ_ZERO_UNDEF
2736 ? (unsigned)AMDGPU::G_AMDGPU_FFBL_B32
2737 : Opc;
2738 unsigned Idx = NewOpc == AMDGPU::G_AMDGPU_FFBH_U32;
2739 auto X = B.buildInstr(Opc: NewOpc, DstOps: {S32}, SrcOps: {SrcRegs[Idx]});
2740 auto Y = B.buildInstr(Opc: NewOpc, DstOps: {S32}, SrcOps: {SrcRegs[Idx ^ 1]});
2741 unsigned AddOpc =
2742 Opc == AMDGPU::G_CTLZ_ZERO_UNDEF || Opc == AMDGPU::G_CTTZ_ZERO_UNDEF
2743 ? AMDGPU::G_ADD
2744 : AMDGPU::G_UADDSAT;
2745 Y = B.buildInstr(Opc: AddOpc, DstOps: {S32}, SrcOps: {Y, B.buildConstant(Res: S32, Val: 32)});
2746 Register DstReg = MI.getOperand(i: 0).getReg();
2747 B.buildUMin(Dst: DstReg, Src0: X, Src1: Y);
2748 MI.eraseFromParent();
2749 return;
2750 }
2751 case AMDGPU::G_SEXT:
2752 case AMDGPU::G_ZEXT:
2753 case AMDGPU::G_ANYEXT: {
2754 Register SrcReg = MI.getOperand(i: 1).getReg();
2755 LLT SrcTy = MRI.getType(Reg: SrcReg);
2756 const bool Signed = Opc == AMDGPU::G_SEXT;
2757
2758 assert(OpdMapper.getVRegs(1).empty());
2759
2760 const RegisterBank *SrcBank =
2761 OpdMapper.getInstrMapping().getOperandMapping(i: 1).BreakDown[0].RegBank;
2762
2763 Register DstReg = MI.getOperand(i: 0).getReg();
2764 LLT DstTy = MRI.getType(Reg: DstReg);
2765 if (DstTy.isScalar() &&
2766 SrcBank != &AMDGPU::SGPRRegBank &&
2767 SrcBank != &AMDGPU::VCCRegBank &&
2768 // FIXME: Should handle any type that rounds to s64 when irregular
2769 // breakdowns are supported.
2770 DstTy.getSizeInBits() == 64 &&
2771 SrcTy.getSizeInBits() <= 32) {
2772 SmallVector<Register, 2> DefRegs(OpdMapper.getVRegs(OpIdx: 0));
2773
2774 // Extend to 32-bit, and then extend the low half.
2775 if (Signed) {
2776 // TODO: Should really be buildSExtOrCopy
2777 B.buildSExtOrTrunc(Res: DefRegs[0], Op: SrcReg);
2778 } else if (Opc == AMDGPU::G_ZEXT) {
2779 B.buildZExtOrTrunc(Res: DefRegs[0], Op: SrcReg);
2780 } else {
2781 B.buildAnyExtOrTrunc(Res: DefRegs[0], Op: SrcReg);
2782 }
2783
2784 extendLow32IntoHigh32(B, Hi32Reg: DefRegs[1], Lo32Reg: DefRegs[0], ExtOpc: Opc, RegBank: *SrcBank);
2785 MRI.setRegBank(Reg: DstReg, RegBank: *SrcBank);
2786 MI.eraseFromParent();
2787 return;
2788 }
2789
2790 if (SrcTy != LLT::scalar(SizeInBits: 1))
2791 return;
2792
2793 // It is not legal to have a legalization artifact with a VCC source. Rather
2794 // than introducing a copy, directly insert the select that such a copy
2795 // would have been selected to.
2796 if (SrcBank == &AMDGPU::VCCRegBank) {
2797 SmallVector<Register, 2> DefRegs(OpdMapper.getVRegs(OpIdx: 0));
2798
2799 const RegisterBank *DstBank = &AMDGPU::VGPRRegBank;
2800
2801 unsigned DstSize = DstTy.getSizeInBits();
2802 // 64-bit select is SGPR only
2803 const bool UseSel64 = DstSize > 32 &&
2804 SrcBank->getID() == AMDGPU::SGPRRegBankID;
2805
2806 // TODO: Should s16 select be legal?
2807 LLT SelType = UseSel64 ? LLT::scalar(SizeInBits: 64) : LLT::scalar(SizeInBits: 32);
2808 auto True = B.buildConstant(Res: SelType, Val: Signed ? -1 : 1);
2809 auto False = B.buildConstant(Res: SelType, Val: 0);
2810
2811 MRI.setRegBank(Reg: True.getReg(Idx: 0), RegBank: *DstBank);
2812 MRI.setRegBank(Reg: False.getReg(Idx: 0), RegBank: *DstBank);
2813 MRI.setRegBank(Reg: DstReg, RegBank: *DstBank);
2814
2815 if (DstSize > 32) {
2816 B.buildSelect(Res: DefRegs[0], Tst: SrcReg, Op0: True, Op1: False);
2817 extendLow32IntoHigh32(B, Hi32Reg: DefRegs[1], Lo32Reg: DefRegs[0], ExtOpc: Opc, RegBank: *SrcBank, IsBooleanSrc: true);
2818 } else if (DstSize < 32) {
2819 auto Sel = B.buildSelect(Res: SelType, Tst: SrcReg, Op0: True, Op1: False);
2820 MRI.setRegBank(Reg: Sel.getReg(Idx: 0), RegBank: *DstBank);
2821 B.buildTrunc(Res: DstReg, Op: Sel);
2822 } else {
2823 B.buildSelect(Res: DstReg, Tst: SrcReg, Op0: True, Op1: False);
2824 }
2825
2826 MI.eraseFromParent();
2827 return;
2828 }
2829
2830 break;
2831 }
2832 case AMDGPU::G_EXTRACT_VECTOR_ELT: {
2833 SmallVector<Register, 2> DstRegs(OpdMapper.getVRegs(OpIdx: 0));
2834
2835 assert(OpdMapper.getVRegs(1).empty() && OpdMapper.getVRegs(2).empty());
2836
2837 Register DstReg = MI.getOperand(i: 0).getReg();
2838 Register SrcReg = MI.getOperand(i: 1).getReg();
2839
2840 const LLT S32 = LLT::scalar(SizeInBits: 32);
2841 LLT DstTy = MRI.getType(Reg: DstReg);
2842 LLT SrcTy = MRI.getType(Reg: SrcReg);
2843
2844 if (foldExtractEltToCmpSelect(B, MI, OpdMapper))
2845 return;
2846
2847 const ValueMapping &DstMapping
2848 = OpdMapper.getInstrMapping().getOperandMapping(i: 0);
2849 const RegisterBank *DstBank = DstMapping.BreakDown[0].RegBank;
2850 const RegisterBank *SrcBank =
2851 OpdMapper.getInstrMapping().getOperandMapping(i: 1).BreakDown[0].RegBank;
2852 const RegisterBank *IdxBank =
2853 OpdMapper.getInstrMapping().getOperandMapping(i: 2).BreakDown[0].RegBank;
2854
2855 Register BaseIdxReg;
2856 unsigned ConstOffset;
2857 std::tie(args&: BaseIdxReg, args&: ConstOffset) =
2858 AMDGPU::getBaseWithConstantOffset(MRI, Reg: MI.getOperand(i: 2).getReg());
2859
2860 // See if the index is an add of a constant which will be foldable by moving
2861 // the base register of the index later if this is going to be executed in a
2862 // waterfall loop. This is essentially to reassociate the add of a constant
2863 // with the readfirstlane.
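// For example (illustrative sketch), with a divergent index of the form
//   %idx = G_ADD %base, K
// the waterfall loop only needs to readfirstlane %base; the "+ K" is re-added
// to the index inside the loop afterwards.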
2864 bool ShouldMoveIndexIntoLoop = IdxBank != &AMDGPU::SGPRRegBank &&
2865 ConstOffset > 0 &&
2866 ConstOffset < SrcTy.getNumElements();
2867
2868 // Move the base register. We'll re-insert the add later.
2869 if (ShouldMoveIndexIntoLoop)
2870 MI.getOperand(i: 2).setReg(BaseIdxReg);
2871
2872 // If this is a VGPR result only because the index was a VGPR, the
2873 // actual indexing will be done on the SGPR source vector, which will
2874 // produce a scalar result. We need to copy to the VGPR result inside the
2875 // waterfall loop.
2876 const bool NeedCopyToVGPR = DstBank == &AMDGPU::VGPRRegBank &&
2877 SrcBank == &AMDGPU::SGPRRegBank;
2878 if (DstRegs.empty()) {
2879 applyDefaultMapping(OpdMapper);
2880
2881 executeInWaterfallLoop(B, MI, OpIndices: {2});
2882
2883 if (NeedCopyToVGPR) {
2884 // We don't want a phi for this temporary reg.
2885 Register TmpReg = MRI.createGenericVirtualRegister(Ty: DstTy);
2886 MRI.setRegBank(Reg: TmpReg, RegBank: AMDGPU::SGPRRegBank);
2887 MI.getOperand(i: 0).setReg(TmpReg);
2888 B.setInsertPt(MBB&: *MI.getParent(), II: ++MI.getIterator());
2889
2890 // Use a v_mov_b32 here to make the exec dependency explicit.
2891 buildVCopy(B, DstReg, SrcReg: TmpReg);
2892 }
2893
2894 // Re-insert the constant offset add inside the waterfall loop.
2895 if (ShouldMoveIndexIntoLoop)
2896 reinsertVectorIndexAdd(B, IdxUseInstr&: MI, OpIdx: 2, ConstOffset);
2897
2898 return;
2899 }
2900
2901 assert(DstTy.getSizeInBits() == 64);
2902
2903 LLT Vec32 = LLT::fixed_vector(NumElements: 2 * SrcTy.getNumElements(), ScalarSizeInBits: 32);
2904
2905 auto CastSrc = B.buildBitcast(Dst: Vec32, Src: SrcReg);
2906 auto One = B.buildConstant(Res: S32, Val: 1);
2907
2908 MachineBasicBlock::iterator MII = MI.getIterator();
2909
2910 // Split the vector index into 32-bit pieces. Prepare to move all of the
2911 // new instructions into a waterfall loop if necessary.
2912 //
2913 // Don't put the bitcast or constant in the loop.
2914 MachineInstrSpan Span(MII, &B.getMBB());
2915
2916 // Compute 32-bit element indices, (2 * OrigIdx, 2 * OrigIdx + 1).
2917 auto IdxLo = B.buildShl(Dst: S32, Src0: BaseIdxReg, Src1: One);
2918 auto IdxHi = B.buildAdd(Dst: S32, Src0: IdxLo, Src1: One);
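// For example, extracting s64 element 3 reads 32-bit elements 6 and 7 of the
// bitcast vector.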
2919
2920 auto Extract0 = B.buildExtractVectorElement(Res: DstRegs[0], Val: CastSrc, Idx: IdxLo);
2921 auto Extract1 = B.buildExtractVectorElement(Res: DstRegs[1], Val: CastSrc, Idx: IdxHi);
2922
2923 MRI.setRegBank(Reg: DstReg, RegBank: *DstBank);
2924 MRI.setRegBank(Reg: CastSrc.getReg(Idx: 0), RegBank: *SrcBank);
2925 MRI.setRegBank(Reg: One.getReg(Idx: 0), RegBank: AMDGPU::SGPRRegBank);
2926 MRI.setRegBank(Reg: IdxLo.getReg(Idx: 0), RegBank: AMDGPU::SGPRRegBank);
2927 MRI.setRegBank(Reg: IdxHi.getReg(Idx: 0), RegBank: AMDGPU::SGPRRegBank);
2928
2929 SmallSet<Register, 4> OpsToWaterfall;
2930 if (!collectWaterfallOperands(SGPROperandRegs&: OpsToWaterfall, MI, MRI, OpIndices: { 2 })) {
2931 MI.eraseFromParent();
2932 return;
2933 }
2934
2935 // Remove the original instruction to avoid potentially confusing the
2936 // waterfall loop logic.
2937 B.setInstr(*Span.begin());
2938 MI.eraseFromParent();
2939 executeInWaterfallLoop(B, Range: make_range(x: Span.begin(), y: Span.end()),
2940 SGPROperandRegs&: OpsToWaterfall);
2941
2942 if (NeedCopyToVGPR) {
2943 MachineBasicBlock *LoopBB = Extract1->getParent();
2944 Register TmpReg0 = MRI.createGenericVirtualRegister(Ty: S32);
2945 Register TmpReg1 = MRI.createGenericVirtualRegister(Ty: S32);
2946 MRI.setRegBank(Reg: TmpReg0, RegBank: AMDGPU::SGPRRegBank);
2947 MRI.setRegBank(Reg: TmpReg1, RegBank: AMDGPU::SGPRRegBank);
2948
2949 Extract0->getOperand(i: 0).setReg(TmpReg0);
2950 Extract1->getOperand(i: 0).setReg(TmpReg1);
2951
2952 B.setInsertPt(MBB&: *LoopBB, II: ++Extract1->getIterator());
2953
2954 buildVCopy(B, DstReg: DstRegs[0], SrcReg: TmpReg0);
2955 buildVCopy(B, DstReg: DstRegs[1], SrcReg: TmpReg1);
2956 }
2957
2958 if (ShouldMoveIndexIntoLoop)
2959 reinsertVectorIndexAdd(B, IdxUseInstr&: *IdxLo, OpIdx: 1, ConstOffset);
2960
2961 return;
2962 }
2963 case AMDGPU::G_INSERT_VECTOR_ELT: {
2964 SmallVector<Register, 2> InsRegs(OpdMapper.getVRegs(OpIdx: 2));
2965
2966 Register DstReg = MI.getOperand(i: 0).getReg();
2967 LLT VecTy = MRI.getType(Reg: DstReg);
2968
2969 assert(OpdMapper.getVRegs(0).empty());
2970 assert(OpdMapper.getVRegs(3).empty());
2971
2972 if (substituteSimpleCopyRegs(OpdMapper, OpIdx: 1))
2973 MRI.setType(VReg: MI.getOperand(i: 1).getReg(), Ty: VecTy);
2974
2975 if (foldInsertEltToCmpSelect(B, MI, OpdMapper))
2976 return;
2977
2978 const RegisterBank *IdxBank =
2979 OpdMapper.getInstrMapping().getOperandMapping(i: 3).BreakDown[0].RegBank;
2980
2981 Register SrcReg = MI.getOperand(i: 1).getReg();
2982 Register InsReg = MI.getOperand(i: 2).getReg();
2983 LLT InsTy = MRI.getType(Reg: InsReg);
2984 (void)InsTy;
2985
2986 Register BaseIdxReg;
2987 unsigned ConstOffset;
2988 std::tie(args&: BaseIdxReg, args&: ConstOffset) =
2989 AMDGPU::getBaseWithConstantOffset(MRI, Reg: MI.getOperand(i: 3).getReg());
2990
2991 // See if the index is an add of a constant which will be foldable by moving
2992 // the base register of the index later if this is going to be executed in a
2993 // waterfall loop. This is essentially to reassociate the add of a constant
2994 // with the readfirstlane.
2995 bool ShouldMoveIndexIntoLoop = IdxBank != &AMDGPU::SGPRRegBank &&
2996 ConstOffset > 0 &&
2997 ConstOffset < VecTy.getNumElements();
2998
2999 // Move the base register. We'll re-insert the add later.
3000 if (ShouldMoveIndexIntoLoop)
3001 MI.getOperand(i: 3).setReg(BaseIdxReg);
3002
3003
3004 if (InsRegs.empty()) {
3005 executeInWaterfallLoop(B, MI, OpIndices: {3});
3006
3007 // Re-insert the constant offset add inside the waterfall loop.
3008 if (ShouldMoveIndexIntoLoop) {
3009 reinsertVectorIndexAdd(B, IdxUseInstr&: MI, OpIdx: 3, ConstOffset);
3010 }
3011
3012 return;
3013 }
3014
3015 assert(InsTy.getSizeInBits() == 64);
3016
3017 const LLT S32 = LLT::scalar(SizeInBits: 32);
3018 LLT Vec32 = LLT::fixed_vector(NumElements: 2 * VecTy.getNumElements(), ScalarSizeInBits: 32);
3019
3020 auto CastSrc = B.buildBitcast(Dst: Vec32, Src: SrcReg);
3021 auto One = B.buildConstant(Res: S32, Val: 1);
3022
3023 // Split the vector index into 32-bit pieces. Prepare to move all of the
3024 // new instructions into a waterfall loop if necessary.
3025 //
3026 // Don't put the bitcast or constant in the loop.
3027 MachineInstrSpan Span(MachineBasicBlock::iterator(&MI), &B.getMBB());
3028
3029 // Compute 32-bit element indices, (2 * OrigIdx, 2 * OrigIdx + 1).
3030 auto IdxLo = B.buildShl(Dst: S32, Src0: BaseIdxReg, Src1: One);
3031 auto IdxHi = B.buildAdd(Dst: S32, Src0: IdxLo, Src1: One);
3032
3033 auto InsLo = B.buildInsertVectorElement(Res: Vec32, Val: CastSrc, Elt: InsRegs[0], Idx: IdxLo);
3034 auto InsHi = B.buildInsertVectorElement(Res: Vec32, Val: InsLo, Elt: InsRegs[1], Idx: IdxHi);
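// The 64-bit insert is done as two chained 32-bit inserts into the bitcast
// vector; the bitcast back to the original vector type is emitted after the
// waterfall loop (if any) below.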
3035
3036 const RegisterBank *DstBank =
3037 OpdMapper.getInstrMapping().getOperandMapping(i: 0).BreakDown[0].RegBank;
3038 const RegisterBank *SrcBank =
3039 OpdMapper.getInstrMapping().getOperandMapping(i: 1).BreakDown[0].RegBank;
3040 const RegisterBank *InsSrcBank =
3041 OpdMapper.getInstrMapping().getOperandMapping(i: 2).BreakDown[0].RegBank;
3042
3043 MRI.setRegBank(Reg: InsReg, RegBank: *InsSrcBank);
3044 MRI.setRegBank(Reg: CastSrc.getReg(Idx: 0), RegBank: *SrcBank);
3045 MRI.setRegBank(Reg: InsLo.getReg(Idx: 0), RegBank: *DstBank);
3046 MRI.setRegBank(Reg: InsHi.getReg(Idx: 0), RegBank: *DstBank);
3047 MRI.setRegBank(Reg: One.getReg(Idx: 0), RegBank: AMDGPU::SGPRRegBank);
3048 MRI.setRegBank(Reg: IdxLo.getReg(Idx: 0), RegBank: AMDGPU::SGPRRegBank);
3049 MRI.setRegBank(Reg: IdxHi.getReg(Idx: 0), RegBank: AMDGPU::SGPRRegBank);
3050
3051
3052 SmallSet<Register, 4> OpsToWaterfall;
3053 if (!collectWaterfallOperands(SGPROperandRegs&: OpsToWaterfall, MI, MRI, OpIndices: { 3 })) {
3054 B.setInsertPt(MBB&: B.getMBB(), II: MI);
3055 B.buildBitcast(Dst: DstReg, Src: InsHi);
3056 MI.eraseFromParent();
3057 return;
3058 }
3059
3060 B.setInstr(*Span.begin());
3061 MI.eraseFromParent();
3062
3063 // Figure out the point after the waterfall loop before mangling the control
3064 // flow.
3065 executeInWaterfallLoop(B, Range: make_range(x: Span.begin(), y: Span.end()),
3066 SGPROperandRegs&: OpsToWaterfall);
3067
3068 // The insertion point is now right after the original instruction.
3069 //
3070 // Keep the bitcast to the original vector type out of the loop. Doing this
3071 // saves an extra phi that would otherwise be needed inside the loop.
3072 B.buildBitcast(Dst: DstReg, Src: InsHi);
3073
3074 // Re-insert the constant offset add inside the waterfall loop.
3075 if (ShouldMoveIndexIntoLoop)
3076 reinsertVectorIndexAdd(B, IdxUseInstr&: *IdxLo, OpIdx: 1, ConstOffset);
3077
3078 return;
3079 }
3080 case AMDGPU::G_AMDGPU_BUFFER_LOAD:
3081 case AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT:
3082 case AMDGPU::G_AMDGPU_BUFFER_LOAD_SSHORT:
3083 case AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE:
3084 case AMDGPU::G_AMDGPU_BUFFER_LOAD_SBYTE:
3085 case AMDGPU::G_AMDGPU_BUFFER_LOAD_TFE:
3086 case AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT_TFE:
3087 case AMDGPU::G_AMDGPU_BUFFER_LOAD_SSHORT_TFE:
3088 case AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE_TFE:
3089 case AMDGPU::G_AMDGPU_BUFFER_LOAD_SBYTE_TFE:
3090 case AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT:
3091 case AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_TFE:
3092 case AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_D16:
3093 case AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT:
3094 case AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT_D16:
3095 case AMDGPU::G_AMDGPU_BUFFER_STORE:
3096 case AMDGPU::G_AMDGPU_BUFFER_STORE_BYTE:
3097 case AMDGPU::G_AMDGPU_BUFFER_STORE_SHORT:
3098 case AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT:
3099 case AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT_D16:
3100 case AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT:
3101 case AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT_D16: {
3102 applyDefaultMapping(OpdMapper);
3103 executeInWaterfallLoop(B, MI, OpIndices: {1, 4});
3104 return;
3105 }
3106 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SWAP:
3107 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_ADD:
3108 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB:
3109 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMIN:
3110 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMIN:
3111 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMAX:
3112 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMAX:
3113 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_AND:
3114 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_OR:
3115 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_XOR:
3116 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC:
3117 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC:
3118 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB_CLAMP_U32:
3119 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_COND_SUB_U32:
3120 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD:
3121 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMIN:
3122 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMAX: {
3123 applyDefaultMapping(OpdMapper);
3124 executeInWaterfallLoop(B, MI, OpIndices: {2, 5});
3125 return;
3126 }
3127 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_CMPSWAP: {
3128 applyDefaultMapping(OpdMapper);
3129 executeInWaterfallLoop(B, MI, OpIndices: {3, 6});
3130 return;
3131 }
3132 case AMDGPU::G_AMDGPU_S_BUFFER_LOAD:
3133 case AMDGPU::G_AMDGPU_S_BUFFER_LOAD_UBYTE:
3134 case AMDGPU::G_AMDGPU_S_BUFFER_LOAD_SBYTE:
3135 case AMDGPU::G_AMDGPU_S_BUFFER_LOAD_USHORT:
3136 case AMDGPU::G_AMDGPU_S_BUFFER_LOAD_SSHORT: {
3137 applyMappingSBufferLoad(B, OpdMapper);
3138 return;
3139 }
3140 case AMDGPU::G_AMDGPU_S_BUFFER_PREFETCH:
3141 constrainOpWithReadfirstlane(B, MI, OpIdx: 0);
3142 constrainOpWithReadfirstlane(B, MI, OpIdx: 2);
3143 return;
3144 case AMDGPU::G_INTRINSIC:
3145 case AMDGPU::G_INTRINSIC_CONVERGENT: {
3146 switch (cast<GIntrinsic>(Val&: MI).getIntrinsicID()) {
3147 case Intrinsic::amdgcn_readlane: {
3148 substituteSimpleCopyRegs(OpdMapper, OpIdx: 2);
3149
3150 assert(OpdMapper.getVRegs(0).empty());
3151 assert(OpdMapper.getVRegs(3).empty());
3152
3153 // Make sure the index is an SGPR. It doesn't make sense to run this in a
3154 // waterfall loop, so assume it's a uniform value.
3155 constrainOpWithReadfirstlane(B, MI, OpIdx: 3); // Index
3156 return;
3157 }
3158 case Intrinsic::amdgcn_writelane: {
3159 assert(OpdMapper.getVRegs(0).empty());
3160 assert(OpdMapper.getVRegs(2).empty());
3161 assert(OpdMapper.getVRegs(3).empty());
3162
3163 substituteSimpleCopyRegs(OpdMapper, OpIdx: 4); // VGPR input val
3164 constrainOpWithReadfirstlane(B, MI, OpIdx: 2); // Source value
3165 constrainOpWithReadfirstlane(B, MI, OpIdx: 3); // Index
3166 return;
3167 }
3168 case Intrinsic::amdgcn_interp_p1:
3169 case Intrinsic::amdgcn_interp_p2:
3170 case Intrinsic::amdgcn_interp_mov:
3171 case Intrinsic::amdgcn_interp_p1_f16:
3172 case Intrinsic::amdgcn_interp_p2_f16:
3173 case Intrinsic::amdgcn_lds_param_load: {
3174 applyDefaultMapping(OpdMapper);
3175
3176 // Readlane for m0 value, which is always the last operand.
3177 // FIXME: Should this be a waterfall loop instead?
3178 constrainOpWithReadfirstlane(B, MI, OpIdx: MI.getNumOperands() - 1); // Index
3179 return;
3180 }
3181 case Intrinsic::amdgcn_interp_inreg_p10:
3182 case Intrinsic::amdgcn_interp_inreg_p2:
3183 case Intrinsic::amdgcn_interp_inreg_p10_f16:
3184 case Intrinsic::amdgcn_interp_inreg_p2_f16:
3185 case Intrinsic::amdgcn_interp_p10_rtz_f16:
3186 case Intrinsic::amdgcn_interp_p2_rtz_f16:
3187 case Intrinsic::amdgcn_permlane16_swap:
3188 case Intrinsic::amdgcn_permlane32_swap:
3189 applyDefaultMapping(OpdMapper);
3190 return;
3191 case Intrinsic::amdgcn_permlane16:
3192 case Intrinsic::amdgcn_permlanex16: {
3193 // Doing a waterfall loop over these wouldn't make any sense.
3194 substituteSimpleCopyRegs(OpdMapper, OpIdx: 2);
3195 substituteSimpleCopyRegs(OpdMapper, OpIdx: 3);
3196 constrainOpWithReadfirstlane(B, MI, OpIdx: 4);
3197 constrainOpWithReadfirstlane(B, MI, OpIdx: 5);
3198 return;
3199 }
3200 case Intrinsic::amdgcn_permlane_bcast:
3201 case Intrinsic::amdgcn_permlane_up:
3202 case Intrinsic::amdgcn_permlane_down:
3203 case Intrinsic::amdgcn_permlane_xor:
3204 // Doing a waterfall loop over these wouldn't make any sense.
3205 constrainOpWithReadfirstlane(B, MI, OpIdx: 3);
3206 constrainOpWithReadfirstlane(B, MI, OpIdx: 4);
3207 return;
3208 case Intrinsic::amdgcn_permlane_idx_gen: {
3209 constrainOpWithReadfirstlane(B, MI, OpIdx: 3);
3210 return;
3211 }
3212 case Intrinsic::amdgcn_sbfe:
3213 applyMappingBFE(B, OpdMapper, Signed: true);
3214 return;
3215 case Intrinsic::amdgcn_ubfe:
3216 applyMappingBFE(B, OpdMapper, Signed: false);
3217 return;
3218 case Intrinsic::amdgcn_inverse_ballot:
3219 case Intrinsic::amdgcn_s_bitreplicate:
3220 case Intrinsic::amdgcn_s_quadmask:
3221 case Intrinsic::amdgcn_s_wqm:
3222 applyDefaultMapping(OpdMapper);
3223 constrainOpWithReadfirstlane(B, MI, OpIdx: 2); // Mask
3224 return;
3225 case Intrinsic::amdgcn_ballot:
3226 // Use default handling and insert copy to vcc source.
3227 break;
3228 }
3229 break;
3230 }
3231 case AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD:
3232 case AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD_D16:
3233 case AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD_NORET:
3234 case AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE:
3235 case AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE_D16: {
3236 const AMDGPU::RsrcIntrinsic *RSrcIntrin =
3237 AMDGPU::lookupRsrcIntrinsic(Intr: AMDGPU::getIntrinsicID(I: MI));
3238 assert(RSrcIntrin && RSrcIntrin->IsImage);
3239 // Non-images can have complications from operands that allow both SGPR
3240 // and VGPR. For now it's too complicated to figure out the final opcode
3241 // to derive the register bank from the MCInstrDesc.
3242 applyMappingImage(B, MI, OpdMapper, RsrcIdx: RSrcIntrin->RsrcArg);
3243 return;
3244 }
3245 case AMDGPU::G_AMDGPU_BVH_INTERSECT_RAY:
3246 case AMDGPU::G_AMDGPU_BVH8_INTERSECT_RAY:
3247 case AMDGPU::G_AMDGPU_BVH_DUAL_INTERSECT_RAY: {
3248 bool IsDualOrBVH8 =
3249 MI.getOpcode() == AMDGPU::G_AMDGPU_BVH_DUAL_INTERSECT_RAY ||
3250 MI.getOpcode() == AMDGPU::G_AMDGPU_BVH8_INTERSECT_RAY;
3251 unsigned NumMods = IsDualOrBVH8 ? 0 : 1; // Has A16 modifier
3252 unsigned LastRegOpIdx = MI.getNumExplicitOperands() - 1 - NumMods;
3253 applyDefaultMapping(OpdMapper);
3254 executeInWaterfallLoop(B, MI, OpIndices: {LastRegOpIdx});
3255 return;
3256 }
3257 case AMDGPU::G_INTRINSIC_W_SIDE_EFFECTS:
3258 case AMDGPU::G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS: {
3259 auto IntrID = cast<GIntrinsic>(Val&: MI).getIntrinsicID();
3260 switch (IntrID) {
3261 case Intrinsic::amdgcn_ds_ordered_add:
3262 case Intrinsic::amdgcn_ds_ordered_swap: {
3263 // This is only allowed to execute with 1 lane, so readfirstlane is safe.
3264 assert(OpdMapper.getVRegs(0).empty());
3265 substituteSimpleCopyRegs(OpdMapper, OpIdx: 3);
3266 constrainOpWithReadfirstlane(B, MI, OpIdx: 2); // M0
3267 return;
3268 }
3269 case Intrinsic::amdgcn_ds_gws_init:
3270 case Intrinsic::amdgcn_ds_gws_barrier:
3271 case Intrinsic::amdgcn_ds_gws_sema_br: {
3272 // Only the first lane executes, so readfirstlane is safe.
3273 substituteSimpleCopyRegs(OpdMapper, OpIdx: 1);
3274 constrainOpWithReadfirstlane(B, MI, OpIdx: 2); // M0
3275 return;
3276 }
3277 case Intrinsic::amdgcn_ds_gws_sema_v:
3278 case Intrinsic::amdgcn_ds_gws_sema_p:
3279 case Intrinsic::amdgcn_ds_gws_sema_release_all: {
3280 // Only the first lane executes, so readfirstlane is safe.
3281 constrainOpWithReadfirstlane(B, MI, OpIdx: 1); // M0
3282 return;
3283 }
3284 case Intrinsic::amdgcn_ds_append:
3285 case Intrinsic::amdgcn_ds_consume: {
3286 constrainOpWithReadfirstlane(B, MI, OpIdx: 2); // M0
3287 return;
3288 }
3289 case Intrinsic::amdgcn_s_sendmsg:
3290 case Intrinsic::amdgcn_s_sendmsghalt: {
3291 // FIXME: Should this use a waterfall loop?
3292 constrainOpWithReadfirstlane(B, MI, OpIdx: 2); // M0
3293 return;
3294 }
3295 case Intrinsic::amdgcn_s_setreg: {
3296 constrainOpWithReadfirstlane(B, MI, OpIdx: 2);
3297 return;
3298 }
3299 case Intrinsic::amdgcn_s_ttracedata:
3300 constrainOpWithReadfirstlane(B, MI, OpIdx: 1); // M0
3301 return;
3302 case Intrinsic::amdgcn_raw_buffer_load_lds:
3303 case Intrinsic::amdgcn_raw_ptr_buffer_load_lds: {
3304 applyDefaultMapping(OpdMapper);
3305 constrainOpWithReadfirstlane(B, MI, OpIdx: 1); // rsrc
3306 constrainOpWithReadfirstlane(B, MI, OpIdx: 2); // M0
3307 constrainOpWithReadfirstlane(B, MI, OpIdx: 5); // soffset
3308 return;
3309 }
3310 case Intrinsic::amdgcn_struct_buffer_load_lds:
3311 case Intrinsic::amdgcn_struct_ptr_buffer_load_lds: {
3312 applyDefaultMapping(OpdMapper);
3313 constrainOpWithReadfirstlane(B, MI, OpIdx: 1); // rsrc
3314 constrainOpWithReadfirstlane(B, MI, OpIdx: 2); // M0
3315 constrainOpWithReadfirstlane(B, MI, OpIdx: 6); // soffset
3316 return;
3317 }
3318 case Intrinsic::amdgcn_cluster_load_async_to_lds_b8:
3319 case Intrinsic::amdgcn_cluster_load_async_to_lds_b32:
3320 case Intrinsic::amdgcn_cluster_load_async_to_lds_b64:
3321 case Intrinsic::amdgcn_cluster_load_async_to_lds_b128: {
3322 applyDefaultMapping(OpdMapper);
3323 constrainOpWithReadfirstlane(B, MI, OpIdx: 5);
3324 return;
3325 }
3326 case Intrinsic::amdgcn_load_to_lds:
3327 case Intrinsic::amdgcn_global_load_lds: {
3328 applyDefaultMapping(OpdMapper);
3329 constrainOpWithReadfirstlane(B, MI, OpIdx: 2);
3330 return;
3331 }
3332 case Intrinsic::amdgcn_lds_direct_load: {
3333 applyDefaultMapping(OpdMapper);
3334 // Readlane for m0 value, which is always the last operand.
3335 constrainOpWithReadfirstlane(B, MI, OpIdx: MI.getNumOperands() - 1); // Index
3336 return;
3337 }
3338 case Intrinsic::amdgcn_exp_row:
3339 applyDefaultMapping(OpdMapper);
3340 constrainOpWithReadfirstlane(B, MI, OpIdx: 8); // M0
3341 return;
3342 case Intrinsic::amdgcn_cluster_load_b32:
3343 case Intrinsic::amdgcn_cluster_load_b64:
3344 case Intrinsic::amdgcn_cluster_load_b128: {
3345 applyDefaultMapping(OpdMapper);
3346 constrainOpWithReadfirstlane(B, MI, OpIdx: 4); // M0
3347 return;
3348 }
3349 case Intrinsic::amdgcn_s_sleep_var:
3350 assert(OpdMapper.getVRegs(1).empty());
3351 constrainOpWithReadfirstlane(B, MI, OpIdx: 1);
3352 return;
3353 case Intrinsic::amdgcn_s_barrier_join:
3354 case Intrinsic::amdgcn_s_wakeup_barrier:
3355 constrainOpWithReadfirstlane(B, MI, OpIdx: 1);
3356 return;
3357 case Intrinsic::amdgcn_s_barrier_init:
3358 case Intrinsic::amdgcn_s_barrier_signal_var:
3359 constrainOpWithReadfirstlane(B, MI, OpIdx: 1);
3360 constrainOpWithReadfirstlane(B, MI, OpIdx: 2);
3361 return;
3362 case Intrinsic::amdgcn_s_get_barrier_state:
3363 case Intrinsic::amdgcn_s_get_named_barrier_state: {
3364 constrainOpWithReadfirstlane(B, MI, OpIdx: 2);
3365 return;
3366 }
3367 case Intrinsic::amdgcn_s_prefetch_data: {
3368 Register PtrReg = MI.getOperand(i: 1).getReg();
3369 unsigned AS = MRI.getType(Reg: PtrReg).getAddressSpace();
3370 if (AMDGPU::isFlatGlobalAddrSpace(AS)) {
3371 constrainOpWithReadfirstlane(B, MI, OpIdx: 1);
3372 constrainOpWithReadfirstlane(B, MI, OpIdx: 2);
3373 } else
3374 MI.eraseFromParent();
3375 return;
3376 }
3377 case Intrinsic::amdgcn_tensor_load_to_lds:
3378 case Intrinsic::amdgcn_tensor_store_from_lds: {
3379 constrainOpWithReadfirstlane(B, MI, OpIdx: 1);
3380 constrainOpWithReadfirstlane(B, MI, OpIdx: 2);
3381 constrainOpWithReadfirstlane(B, MI, OpIdx: 3);
3382 constrainOpWithReadfirstlane(B, MI, OpIdx: 4);
3383 return;
3384 }
3385 case Intrinsic::amdgcn_tensor_load_to_lds_d2:
3386 case Intrinsic::amdgcn_tensor_store_from_lds_d2: {
3387 constrainOpWithReadfirstlane(B, MI, OpIdx: 1);
3388 constrainOpWithReadfirstlane(B, MI, OpIdx: 2);
3389 return;
3390 }
3391 default: {
3392 if (const AMDGPU::RsrcIntrinsic *RSrcIntrin =
3393 AMDGPU::lookupRsrcIntrinsic(Intr: IntrID)) {
3394 // Non-images can have complications from operands that allow both SGPR
3395 // and VGPR. For now it's too complicated to figure out the final opcode
3396 // to derive the register bank from the MCInstrDesc.
3397 if (RSrcIntrin->IsImage) {
3398 applyMappingImage(B, MI, OpdMapper, RsrcIdx: RSrcIntrin->RsrcArg);
3399 return;
3400 }
3401 }
3402
3403 break;
3404 }
3405 }
3406 break;
3407 }
3408 case AMDGPU::G_SI_CALL: {
3409 // Use a set to avoid extra readfirstlanes in the case where multiple
3410 // operands are the same register.
3411 SmallSet<Register, 4> SGPROperandRegs;
3412
3413 if (!collectWaterfallOperands(SGPROperandRegs, MI, MRI, OpIndices: {1}))
3414 break;
3415
3416 // Move all copies to physical SGPRs that are used by the call instruction
3417 // into the loop block. Search backwards from the call until the
3418 // ADJCALLSTACKUP to find these copies.
3419 unsigned FrameSetupOpcode = AMDGPU::ADJCALLSTACKUP;
3420 unsigned FrameDestroyOpcode = AMDGPU::ADJCALLSTACKDOWN;
3421
3422 // Move all non-copies before the copies, so that a complete range can be
3423 // moved into the waterfall loop.
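// The region being rearranged looks roughly like this (illustrative sketch):
//   ADJCALLSTACKUP
//   ... COPYs into physical SGPR argument registers ...
//   G_SI_CALL
//   ... COPYs out of physical result registers ...
//   ADJCALLSTACKDOWN
// The copies on either side of the call and the call itself form the range
// spliced into the waterfall loop; anything else found in that window is
// moved out of the range first.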
3424 SmallVector<MachineInstr *, 4> NonCopyInstrs;
3425 // Count of NonCopyInstrs found until the current LastCopy.
3426 unsigned NonCopyInstrsLen = 0;
3427 MachineBasicBlock::iterator Start(&MI);
3428 MachineBasicBlock::iterator LastCopy = Start;
3429 MachineBasicBlock *MBB = MI.getParent();
3430 const SIMachineFunctionInfo *Info =
3431 MBB->getParent()->getInfo<SIMachineFunctionInfo>();
3432 while (Start->getOpcode() != FrameSetupOpcode) {
3433 --Start;
3434 bool IsCopy = false;
3435 if (Start->getOpcode() == AMDGPU::COPY) {
3436 auto &Dst = Start->getOperand(i: 0);
3437 if (Dst.isReg()) {
3438 Register Reg = Dst.getReg();
3439 if (Reg.isPhysical() && MI.readsRegister(Reg, TRI)) {
3440 IsCopy = true;
3441 } else {
3442 // Also move the copy from the scratch rsrc descriptor into the loop
3443 // to allow it to be optimized away.
3444 auto &Src = Start->getOperand(i: 1);
3445 if (Src.isReg()) {
3446 Reg = Src.getReg();
3447 IsCopy = Info->getScratchRSrcReg() == Reg;
3448 }
3449 }
3450 }
3451 }
3452
3453 if (IsCopy) {
3454 LastCopy = Start;
3455 NonCopyInstrsLen = NonCopyInstrs.size();
3456 } else {
3457 NonCopyInstrs.push_back(Elt: &*Start);
3458 }
3459 }
3460 NonCopyInstrs.resize(N: NonCopyInstrsLen);
3461
3462 for (auto *NonCopy : reverse(C&: NonCopyInstrs)) {
3463 MBB->splice(Where: LastCopy, Other: MBB, From: NonCopy->getIterator());
3464 }
3465 Start = LastCopy;
3466
3467 // Do the same for copies after the loop
3468 NonCopyInstrs.clear();
3469 NonCopyInstrsLen = 0;
3470 MachineBasicBlock::iterator End(&MI);
3471 LastCopy = End;
3472 while (End->getOpcode() != FrameDestroyOpcode) {
3473 ++End;
3474 bool IsCopy = false;
3475 if (End->getOpcode() == AMDGPU::COPY) {
3476 auto &Src = End->getOperand(i: 1);
3477 if (Src.isReg()) {
3478 Register Reg = Src.getReg();
3479 IsCopy = Reg.isPhysical() && MI.modifiesRegister(Reg, TRI);
3480 }
3481 }
3482
3483 if (IsCopy) {
3484 LastCopy = End;
3485 NonCopyInstrsLen = NonCopyInstrs.size();
3486 } else {
3487 NonCopyInstrs.push_back(Elt: &*End);
3488 }
3489 }
3490 NonCopyInstrs.resize(N: NonCopyInstrsLen);
3491
3492 End = LastCopy;
3493 ++LastCopy;
3494 for (auto *NonCopy : reverse(C&: NonCopyInstrs)) {
3495 MBB->splice(Where: LastCopy, Other: MBB, From: NonCopy->getIterator());
3496 }
3497
3498 ++End;
3499 B.setInsertPt(MBB&: B.getMBB(), II: Start);
3500 executeInWaterfallLoop(B, Range: make_range(x: Start, y: End), SGPROperandRegs);
3501 break;
3502 }
3503 case AMDGPU::G_LOAD:
3504 case AMDGPU::G_ZEXTLOAD:
3505 case AMDGPU::G_SEXTLOAD: {
3506 if (applyMappingLoad(B, OpdMapper, MI))
3507 return;
3508 break;
3509 }
3510 case AMDGPU::G_DYN_STACKALLOC:
3511 applyMappingDynStackAlloc(B, OpdMapper, MI);
3512 return;
3513 case AMDGPU::G_STACKRESTORE: {
3514 applyDefaultMapping(OpdMapper);
3515 constrainOpWithReadfirstlane(B, MI, OpIdx: 0);
3516 return;
3517 }
3518 case AMDGPU::G_SBFX:
3519 applyMappingBFE(B, OpdMapper, /*Signed*/ true);
3520 return;
3521 case AMDGPU::G_UBFX:
3522 applyMappingBFE(B, OpdMapper, /*Signed*/ false);
3523 return;
3524 case AMDGPU::G_AMDGPU_MAD_U64_U32:
3525 case AMDGPU::G_AMDGPU_MAD_I64_I32:
3526 applyMappingMAD_64_32(B, OpdMapper);
3527 return;
3528 case AMDGPU::G_PREFETCH: {
3529 if (!Subtarget.hasSafeSmemPrefetch() && !Subtarget.hasVmemPrefInsts()) {
3530 MI.eraseFromParent();
3531 return;
3532 }
3533 Register PtrReg = MI.getOperand(i: 0).getReg();
3534 unsigned PtrBank = getRegBankID(Reg: PtrReg, MRI, Default: AMDGPU::SGPRRegBankID);
3535 if (PtrBank == AMDGPU::VGPRRegBankID &&
3536 (!Subtarget.hasVmemPrefInsts() || !MI.getOperand(i: 3).getImm())) {
3537 // Cannot do I$ prefetch with divergent pointer.
3538 MI.eraseFromParent();
3539 return;
3540 }
3541 unsigned AS = MRI.getType(Reg: PtrReg).getAddressSpace();
3542 if ((!AMDGPU::isFlatGlobalAddrSpace(AS) &&
3543 AS != AMDGPUAS::CONSTANT_ADDRESS_32BIT) ||
3544 (!Subtarget.hasSafeSmemPrefetch() &&
3545 (AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT ||
3546 !MI.getOperand(i: 3).getImm() /* I$ prefetch */))) {
3547 MI.eraseFromParent();
3548 return;
3549 }
3550 applyDefaultMapping(OpdMapper);
3551 return;
3552 }
3553 default:
3554 break;
3555 }
3556
3557 return applyDefaultMapping(OpdMapper);
3558}
3559
3560// vgpr, sgpr -> vgpr
3561// vgpr, agpr -> vgpr
3562// agpr, agpr -> agpr
3563// agpr, sgpr -> vgpr
3564static unsigned regBankUnion(unsigned RB0, unsigned RB1) {
3565 if (RB0 == AMDGPU::InvalidRegBankID)
3566 return RB1;
3567 if (RB1 == AMDGPU::InvalidRegBankID)
3568 return RB0;
3569
3570 if (RB0 == AMDGPU::SGPRRegBankID && RB1 == AMDGPU::SGPRRegBankID)
3571 return AMDGPU::SGPRRegBankID;
3572
3573 if (RB0 == AMDGPU::AGPRRegBankID && RB1 == AMDGPU::AGPRRegBankID)
3574 return AMDGPU::AGPRRegBankID;
3575
3576 return AMDGPU::VGPRRegBankID;
3577}
3578
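// Like regBankUnion, but any vcc input forces a vcc result. Used for boolean
// (s1) values such as phi results: e.g. a phi of a vcc value and an sgpr
// value must end up in vcc.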
3579static unsigned regBankBoolUnion(unsigned RB0, unsigned RB1) {
3580 if (RB0 == AMDGPU::InvalidRegBankID)
3581 return RB1;
3582 if (RB1 == AMDGPU::InvalidRegBankID)
3583 return RB0;
3584
3585 // vcc, vcc -> vcc
3586 // vcc, sgpr -> vcc
3587 // vcc, vgpr -> vcc
3588 if (RB0 == AMDGPU::VCCRegBankID || RB1 == AMDGPU::VCCRegBankID)
3589 return AMDGPU::VCCRegBankID;
3590
3591 // sgpr, vgpr -> vgpr
3592 return regBankUnion(RB0, RB1);
3593}
3594
3595unsigned AMDGPURegisterBankInfo::getMappingType(const MachineRegisterInfo &MRI,
3596 const MachineInstr &MI) const {
3597 unsigned RegBank = AMDGPU::InvalidRegBankID;
3598
3599 for (const MachineOperand &MO : MI.operands()) {
3600 if (!MO.isReg())
3601 continue;
3602 Register Reg = MO.getReg();
3603 if (const RegisterBank *Bank = getRegBank(Reg, MRI, TRI: *TRI)) {
3604 RegBank = regBankUnion(RB0: RegBank, RB1: Bank->getID());
3605 if (RegBank == AMDGPU::VGPRRegBankID)
3606 break;
3607 }
3608 }
3609
3610 return RegBank;
3611}
3612
3613bool AMDGPURegisterBankInfo::isSALUMapping(const MachineInstr &MI) const {
3614 const MachineFunction &MF = *MI.getMF();
3615 const MachineRegisterInfo &MRI = MF.getRegInfo();
3616 for (const MachineOperand &MO : MI.operands()) {
3617 if (!MO.isReg())
3618 continue;
3619 Register Reg = MO.getReg();
3620 if (const RegisterBank *Bank = getRegBank(Reg, MRI, TRI: *TRI)) {
3621 if (Bank->getID() != AMDGPU::SGPRRegBankID)
3622 return false;
3623 }
3624 }
3625 return true;
3626}
3627
3628const RegisterBankInfo::InstructionMapping &
3629AMDGPURegisterBankInfo::getDefaultMappingSOP(const MachineInstr &MI) const {
3630 const MachineFunction &MF = *MI.getMF();
3631 const MachineRegisterInfo &MRI = MF.getRegInfo();
3632 SmallVector<const ValueMapping*, 8> OpdsMapping(MI.getNumOperands());
3633
3634 for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
3635 const MachineOperand &SrcOp = MI.getOperand(i);
3636 if (!SrcOp.isReg())
3637 continue;
3638
3639 unsigned Size = getSizeInBits(Reg: SrcOp.getReg(), MRI, TRI: *TRI);
3640 OpdsMapping[i] = AMDGPU::getValueMapping(BankID: AMDGPU::SGPRRegBankID, Size);
3641 }
3642 return getInstructionMapping(ID: 1, Cost: 1, OperandsMapping: getOperandsMapping(OpdsMapping),
3643 NumOperands: MI.getNumOperands());
3644}
3645
3646const RegisterBankInfo::InstructionMapping &
3647AMDGPURegisterBankInfo::getDefaultMappingVOP(const MachineInstr &MI) const {
3648 const MachineFunction &MF = *MI.getMF();
3649 const MachineRegisterInfo &MRI = MF.getRegInfo();
3650 SmallVector<const ValueMapping*, 8> OpdsMapping(MI.getNumOperands());
3651
3652 // Even though we technically could use SGPRs, this would require knowledge of
3653 // the constant bus restriction. Force all sources to VGPR (except for VCC).
3654 //
3655 // TODO: Unary ops are trivially OK, so accept SGPRs?
3656 for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
3657 const MachineOperand &Src = MI.getOperand(i);
3658 if (!Src.isReg())
3659 continue;
3660
3661 unsigned Size = getSizeInBits(Reg: Src.getReg(), MRI, TRI: *TRI);
3662 unsigned BankID = Size == 1 ? AMDGPU::VCCRegBankID : AMDGPU::VGPRRegBankID;
3663 OpdsMapping[i] = AMDGPU::getValueMapping(BankID, Size);
3664 }
3665
3666 return getInstructionMapping(ID: 1, Cost: 1, OperandsMapping: getOperandsMapping(OpdsMapping),
3667 NumOperands: MI.getNumOperands());
3668}
3669
3670const RegisterBankInfo::InstructionMapping &
3671AMDGPURegisterBankInfo::getDefaultMappingAllVGPR(const MachineInstr &MI) const {
3672 const MachineFunction &MF = *MI.getMF();
3673 const MachineRegisterInfo &MRI = MF.getRegInfo();
3674 SmallVector<const ValueMapping*, 8> OpdsMapping(MI.getNumOperands());
3675
3676 for (unsigned I = 0, E = MI.getNumOperands(); I != E; ++I) {
3677 const MachineOperand &Op = MI.getOperand(i: I);
3678 if (!Op.isReg())
3679 continue;
3680
3681 unsigned Size = getSizeInBits(Reg: Op.getReg(), MRI, TRI: *TRI);
3682 OpdsMapping[I] = AMDGPU::getValueMapping(BankID: AMDGPU::VGPRRegBankID, Size);
3683 }
3684
3685 return getInstructionMapping(ID: 1, Cost: 1, OperandsMapping: getOperandsMapping(OpdsMapping),
3686 NumOperands: MI.getNumOperands());
3687}
3688
3689const RegisterBankInfo::InstructionMapping &
3690AMDGPURegisterBankInfo::getImageMapping(const MachineRegisterInfo &MRI,
3691 const MachineInstr &MI,
3692 int RsrcIdx) const {
3693 // The reported argument index is relative to the IR intrinsic call arguments,
3694 // so we need to shift by the number of defs and the intrinsic ID.
3695 RsrcIdx += MI.getNumExplicitDefs() + 1;
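// For example, with one result def the intrinsic's IR argument 0 is machine
// operand 2 (operands are laid out as [defs..., intrinsic ID, args...]).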
3696
3697 const int NumOps = MI.getNumOperands();
3698 SmallVector<const ValueMapping *, 8> OpdsMapping(NumOps);
3699
3700 // TODO: Should packed/unpacked D16 difference be reported here as part of
3701 // the value mapping?
3702 for (int I = 0; I != NumOps; ++I) {
3703 if (!MI.getOperand(i: I).isReg())
3704 continue;
3705
3706 Register OpReg = MI.getOperand(i: I).getReg();
3707 // We replace some dead address operands with $noreg
3708 if (!OpReg)
3709 continue;
3710
3711 unsigned Size = getSizeInBits(Reg: OpReg, MRI, TRI: *TRI);
3712
3713 // FIXME: Probably need a new intrinsic register bank searchable table to
3714 // handle arbitrary intrinsics easily.
3715 //
3716 // If this has a sampler, it immediately follows rsrc.
3717 const bool MustBeSGPR = I == RsrcIdx || I == RsrcIdx + 1;
3718
3719 if (MustBeSGPR) {
3720 // This must be an SGPR, but report whatever bank it currently has as legal.
3721 unsigned NewBank = getRegBankID(Reg: OpReg, MRI, Default: AMDGPU::SGPRRegBankID);
3722 OpdsMapping[I] = AMDGPU::getValueMapping(BankID: NewBank, Size);
3723 } else {
3724 // Some operands must be VGPR, and these are easy to copy to.
3725 OpdsMapping[I] = AMDGPU::getValueMapping(BankID: AMDGPU::VGPRRegBankID, Size);
3726 }
3727 }
3728
3729 return getInstructionMapping(ID: 1, Cost: 1, OperandsMapping: getOperandsMapping(OpdsMapping), NumOperands: NumOps);
3730}
3731
3732/// Return the mapping for a pointer argument.
3733const RegisterBankInfo::ValueMapping *
3734AMDGPURegisterBankInfo::getValueMappingForPtr(const MachineRegisterInfo &MRI,
3735 Register PtrReg) const {
3736 LLT PtrTy = MRI.getType(Reg: PtrReg);
3737 unsigned Size = PtrTy.getSizeInBits();
3738 if (Subtarget.useFlatForGlobal() ||
3739 !AMDGPU::isFlatGlobalAddrSpace(AS: PtrTy.getAddressSpace()))
3740 return AMDGPU::getValueMapping(BankID: AMDGPU::VGPRRegBankID, Size);
3741
3742 // If we're using MUBUF instructions for global memory, an SGPR base register
3743 // is possible. Otherwise this needs to be a VGPR.
3744 const RegisterBank *PtrBank = getRegBank(Reg: PtrReg, MRI, TRI: *TRI);
3745 return AMDGPU::getValueMapping(BankID: PtrBank->getID(), Size);
3746}
3747
3748const RegisterBankInfo::InstructionMapping &
3749AMDGPURegisterBankInfo::getInstrMappingForLoad(const MachineInstr &MI) const {
3750
3751 const MachineFunction &MF = *MI.getMF();
3752 const MachineRegisterInfo &MRI = MF.getRegInfo();
3753 SmallVector<const ValueMapping*, 2> OpdsMapping(2);
3754 unsigned Size = getSizeInBits(Reg: MI.getOperand(i: 0).getReg(), MRI, TRI: *TRI);
3755 Register PtrReg = MI.getOperand(i: 1).getReg();
3756 LLT PtrTy = MRI.getType(Reg: PtrReg);
3757 unsigned AS = PtrTy.getAddressSpace();
3758 unsigned PtrSize = PtrTy.getSizeInBits();
3759
3760 const ValueMapping *ValMapping;
3761 const ValueMapping *PtrMapping;
3762
3763 const RegisterBank *PtrBank = getRegBank(Reg: PtrReg, MRI, TRI: *TRI);
3764
3765 if (PtrBank == &AMDGPU::SGPRRegBank && AMDGPU::isFlatGlobalAddrSpace(AS)) {
3766 if (isScalarLoadLegal(MI)) {
3767 // We have a uniform instruction so we want to use an SMRD load
3768 ValMapping = AMDGPU::getValueMapping(BankID: AMDGPU::SGPRRegBankID, Size);
3769 PtrMapping = AMDGPU::getValueMapping(BankID: AMDGPU::SGPRRegBankID, Size: PtrSize);
3770 } else {
3771 ValMapping = AMDGPU::getValueMapping(BankID: AMDGPU::VGPRRegBankID, Size);
3772
3773 // If we're using MUBUF instructions for global memory, an SGPR base
3774 // register is possible. Otherwise this needs to be a VGPR.
3775 unsigned PtrBankID = Subtarget.useFlatForGlobal() ?
3776 AMDGPU::VGPRRegBankID : AMDGPU::SGPRRegBankID;
3777
3778 PtrMapping = AMDGPU::getValueMapping(BankID: PtrBankID, Size: PtrSize);
3779 }
3780 } else {
3781 ValMapping = AMDGPU::getValueMapping(BankID: AMDGPU::VGPRRegBankID, Size);
3782 PtrMapping = AMDGPU::getValueMapping(BankID: AMDGPU::VGPRRegBankID, Size: PtrSize);
3783 }
3784
3785 OpdsMapping[0] = ValMapping;
3786 OpdsMapping[1] = PtrMapping;
3787 const RegisterBankInfo::InstructionMapping &Mapping = getInstructionMapping(
3788 ID: 1, Cost: 1, OperandsMapping: getOperandsMapping(OpdsMapping), NumOperands: MI.getNumOperands());
3789 return Mapping;
3790
3791 // FIXME: Do we want to add a mapping for FLAT load, or should we just
3792 // handle that during instruction selection?
3793}
3794
3795unsigned
3796AMDGPURegisterBankInfo::getRegBankID(Register Reg,
3797 const MachineRegisterInfo &MRI,
3798 unsigned Default) const {
3799 const RegisterBank *Bank = getRegBank(Reg, MRI, TRI: *TRI);
3800 return Bank ? Bank->getID() : Default;
3801}
3802
3803const RegisterBankInfo::ValueMapping *
3804AMDGPURegisterBankInfo::getSGPROpMapping(Register Reg,
3805 const MachineRegisterInfo &MRI,
3806 const TargetRegisterInfo &TRI) const {
3807 // Lie and claim anything is legal, even though this needs to be an SGPR;
3808 // applyMapping will have to deal with it as a waterfall loop.
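// For example, a buffer rsrc operand that currently lives in a VGPR is still
// reported with its VGPR bank here; applyMappingImpl later fixes it up with a
// readfirstlane or a waterfall loop.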
3809 unsigned Bank = getRegBankID(Reg, MRI, Default: AMDGPU::SGPRRegBankID);
3810 unsigned Size = getSizeInBits(Reg, MRI, TRI);
3811 return AMDGPU::getValueMapping(BankID: Bank, Size);
3812}
3813
3814const RegisterBankInfo::ValueMapping *
3815AMDGPURegisterBankInfo::getVGPROpMapping(Register Reg,
3816 const MachineRegisterInfo &MRI,
3817 const TargetRegisterInfo &TRI) const {
3818 unsigned Size = getSizeInBits(Reg, MRI, TRI);
3819 return AMDGPU::getValueMapping(BankID: AMDGPU::VGPRRegBankID, Size);
3820}
3821
3822const RegisterBankInfo::ValueMapping *
3823AMDGPURegisterBankInfo::getAGPROpMapping(Register Reg,
3824 const MachineRegisterInfo &MRI,
3825 const TargetRegisterInfo &TRI) const {
3826 unsigned Size = getSizeInBits(Reg, MRI, TRI);
3827 return AMDGPU::getValueMapping(BankID: AMDGPU::AGPRRegBankID, Size);
3828}
3829
3830///
3831/// This function must return a legal mapping, because
3832/// AMDGPURegisterBankInfo::getInstrAlternativeMappings() is not called
3833 /// in RegBankSelect::Mode::Fast. Any mapping that would cause a
3834 /// VGPR to SGPR copy to be generated is illegal.
3835///
3836// Operands that must be SGPRs must accept potentially divergent VGPRs as
3837// legal. These will be dealt with in applyMappingImpl.
3838//
3839const RegisterBankInfo::InstructionMapping &
3840AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
3841 const MachineFunction &MF = *MI.getMF();
3842 const MachineRegisterInfo &MRI = MF.getRegInfo();
3843
3844 if (MI.isCopy() || MI.getOpcode() == AMDGPU::G_FREEZE) {
3845 Register DstReg = MI.getOperand(i: 0).getReg();
3846 Register SrcReg = MI.getOperand(i: 1).getReg();
3847
3848 // The default logic bothers to analyze impossible alternative mappings. We
3849 // want the most straightforward mapping, so just directly handle this.
3850 const RegisterBank *DstBank = getRegBank(Reg: DstReg, MRI, TRI: *TRI);
3851 const RegisterBank *SrcBank = getRegBank(Reg: SrcReg, MRI, TRI: *TRI);
3852
3853 // For COPY between a physical reg and an s1, there is no type associated so
3854 // we need to take the virtual register's type as a hint on how to interpret
3855 // s1 values.
3856 unsigned Size;
3857 if (!SrcReg.isVirtual() && !DstBank &&
3858 MRI.getType(Reg: DstReg) == LLT::scalar(SizeInBits: 1)) {
3859 DstBank = &AMDGPU::VCCRegBank;
3860 Size = 1;
3861 } else if (!DstReg.isVirtual() && MRI.getType(Reg: SrcReg) == LLT::scalar(SizeInBits: 1)) {
3862 DstBank = &AMDGPU::VCCRegBank;
3863 Size = 1;
3864 } else {
3865 Size = getSizeInBits(Reg: DstReg, MRI, TRI: *TRI);
3866 }
3867
3868 if (!DstBank)
3869 DstBank = SrcBank;
3870 else if (!SrcBank)
3871 SrcBank = DstBank;
3872
3873 if (MI.getOpcode() != AMDGPU::G_FREEZE &&
3874 cannotCopy(Dst: *DstBank, Src: *SrcBank, Size: TypeSize::getFixed(ExactSize: Size)))
3875 return getInvalidInstructionMapping();
3876
3877 const ValueMapping &ValMap = getValueMapping(StartIdx: 0, Length: Size, RegBank: *DstBank);
3878 unsigned OpdsMappingSize = MI.isCopy() ? 1 : 2;
3879 SmallVector<const ValueMapping *, 1> OpdsMapping(OpdsMappingSize);
3880 OpdsMapping[0] = &ValMap;
3881 if (MI.getOpcode() == AMDGPU::G_FREEZE)
3882 OpdsMapping[1] = &ValMap;
3883
3884 return getInstructionMapping(
3885 ID: 1, /*Cost*/ 1,
3886 /*OperandsMapping*/ getOperandsMapping(OpdsMapping), NumOperands: OpdsMappingSize);
3887 }
3888
3889 if (MI.isRegSequence()) {
3890 // If any input is a VGPR, the result must be a VGPR. The default handling
3891 // assumes any copy between banks is legal.
3892 unsigned BankID = AMDGPU::SGPRRegBankID;
3893
3894 for (unsigned I = 1, E = MI.getNumOperands(); I != E; I += 2) {
3895 auto OpBank = getRegBankID(Reg: MI.getOperand(i: I).getReg(), MRI);
3896 // It doesn't make sense to use vcc or scc banks here, so just ignore
3897 // them.
3898 if (OpBank != AMDGPU::SGPRRegBankID) {
3899 BankID = AMDGPU::VGPRRegBankID;
3900 break;
3901 }
3902 }
3903 unsigned Size = getSizeInBits(Reg: MI.getOperand(i: 0).getReg(), MRI, TRI: *TRI);
3904
3905 const ValueMapping &ValMap = getValueMapping(StartIdx: 0, Length: Size, RegBank: getRegBank(ID: BankID));
3906 return getInstructionMapping(
3907 ID: 1, /*Cost*/ 1,
3908 /*OperandsMapping*/ getOperandsMapping(OpdsMapping: {&ValMap}), NumOperands: 1);
3909 }
3910
3911 // The default handling is broken and doesn't handle illegal VGPR->SGPR copies
3912 // properly.
3913 //
3914 // TODO: There are additional exec masking dependencies to analyze.
3915 if (auto *PHI = dyn_cast<GPhi>(Val: &MI)) {
3916 unsigned ResultBank = AMDGPU::InvalidRegBankID;
3917 Register DstReg = PHI->getReg(Idx: 0);
3918
3919 // Sometimes the result may have already been assigned a bank.
3920 if (const RegisterBank *DstBank = getRegBank(Reg: DstReg, MRI, TRI: *TRI))
3921 ResultBank = DstBank->getID();
3922
3923 for (unsigned I = 0; I < PHI->getNumIncomingValues(); ++I) {
3924 Register Reg = PHI->getIncomingValue(I);
3925 const RegisterBank *Bank = getRegBank(Reg, MRI, TRI: *TRI);
3926
3927 // FIXME: Assuming VGPR for any undetermined inputs.
3928 if (!Bank || Bank->getID() == AMDGPU::VGPRRegBankID) {
3929 ResultBank = AMDGPU::VGPRRegBankID;
3930 break;
3931 }
3932
3933 // FIXME: Need to promote SGPR case to s32
3934 unsigned OpBank = Bank->getID();
3935 ResultBank = regBankBoolUnion(RB0: ResultBank, RB1: OpBank);
3936 }
3937
3938 assert(ResultBank != AMDGPU::InvalidRegBankID);
3939
3940 unsigned Size = MRI.getType(Reg: DstReg).getSizeInBits();
3941
3942 const ValueMapping &ValMap =
3943 getValueMapping(StartIdx: 0, Length: Size, RegBank: getRegBank(ID: ResultBank));
3944 return getInstructionMapping(
3945 ID: 1, /*Cost*/ 1,
3946 /*OperandsMapping*/ getOperandsMapping(OpdsMapping: {&ValMap}), NumOperands: 1);
3947 }
3948
3949 const RegisterBankInfo::InstructionMapping &Mapping = getInstrMappingImpl(MI);
3950 if (Mapping.isValid())
3951 return Mapping;
3952
3953 SmallVector<const ValueMapping*, 8> OpdsMapping(MI.getNumOperands());
3954
3955 switch (MI.getOpcode()) {
3956 default:
3957 return getInvalidInstructionMapping();
3958
3959 case AMDGPU::G_AND:
3960 case AMDGPU::G_OR:
3961 case AMDGPU::G_XOR:
3962 case AMDGPU::G_MUL: {
3963 unsigned Size = MRI.getType(Reg: MI.getOperand(i: 0).getReg()).getSizeInBits();
3964 if (Size == 1) {
3965 const RegisterBank *DstBank
3966 = getRegBank(Reg: MI.getOperand(i: 0).getReg(), MRI, TRI: *TRI);
3967
3968 unsigned TargetBankID = AMDGPU::InvalidRegBankID;
3969 unsigned BankLHS = AMDGPU::InvalidRegBankID;
3970 unsigned BankRHS = AMDGPU::InvalidRegBankID;
3971 if (DstBank) {
3972 TargetBankID = DstBank->getID();
3973 if (DstBank == &AMDGPU::VCCRegBank) {
3974 TargetBankID = AMDGPU::VCCRegBankID;
3975 BankLHS = AMDGPU::VCCRegBankID;
3976 BankRHS = AMDGPU::VCCRegBankID;
3977 } else {
3978 BankLHS = getRegBankID(Reg: MI.getOperand(i: 1).getReg(), MRI,
3979 Default: AMDGPU::SGPRRegBankID);
3980 BankRHS = getRegBankID(Reg: MI.getOperand(i: 2).getReg(), MRI,
3981 Default: AMDGPU::SGPRRegBankID);
3982 }
3983 } else {
3984 BankLHS = getRegBankID(Reg: MI.getOperand(i: 1).getReg(), MRI,
3985 Default: AMDGPU::VCCRegBankID);
3986 BankRHS = getRegBankID(Reg: MI.getOperand(i: 2).getReg(), MRI,
3987 Default: AMDGPU::VCCRegBankID);
3988
3989 // Both inputs should be true booleans to produce a boolean result.
3990 if (BankLHS == AMDGPU::VGPRRegBankID || BankRHS == AMDGPU::VGPRRegBankID) {
3991 TargetBankID = AMDGPU::VGPRRegBankID;
3992 } else if (BankLHS == AMDGPU::VCCRegBankID || BankRHS == AMDGPU::VCCRegBankID) {
3993 TargetBankID = AMDGPU::VCCRegBankID;
3994 BankLHS = AMDGPU::VCCRegBankID;
3995 BankRHS = AMDGPU::VCCRegBankID;
3996 } else if (BankLHS == AMDGPU::SGPRRegBankID && BankRHS == AMDGPU::SGPRRegBankID) {
3997 TargetBankID = AMDGPU::SGPRRegBankID;
3998 }
3999 }
4000
4001 OpdsMapping[0] = AMDGPU::getValueMapping(BankID: TargetBankID, Size);
4002 OpdsMapping[1] = AMDGPU::getValueMapping(BankID: BankLHS, Size);
4003 OpdsMapping[2] = AMDGPU::getValueMapping(BankID: BankRHS, Size);
4004 break;
4005 }
4006
4007 if (Size == 64) {
4008
4009 if (isSALUMapping(MI)) {
4010 OpdsMapping[0] = getValueMappingSGPR64Only(BankID: AMDGPU::SGPRRegBankID, Size);
4011 OpdsMapping[1] = OpdsMapping[2] = OpdsMapping[0];
4012 } else {
4013 if (MI.getOpcode() == AMDGPU::G_MUL && Subtarget.hasVectorMulU64())
4014 OpdsMapping[0] = AMDGPU::getValueMapping(BankID: AMDGPU::VGPRRegBankID, Size);
4015 else
4016 OpdsMapping[0] =
4017 getValueMappingSGPR64Only(BankID: AMDGPU::VGPRRegBankID, Size);
4018 unsigned Bank1 = getRegBankID(Reg: MI.getOperand(i: 1).getReg(), MRI /*, DefaultBankID*/);
4019 OpdsMapping[1] = AMDGPU::getValueMapping(BankID: Bank1, Size);
4020
4021 unsigned Bank2 = getRegBankID(Reg: MI.getOperand(i: 2).getReg(), MRI /*, DefaultBankID*/);
4022 OpdsMapping[2] = AMDGPU::getValueMapping(BankID: Bank2, Size);
4023 }
4024
4025 break;
4026 }
4027
4028 [[fallthrough]];
4029 }
4030 case AMDGPU::G_PTR_ADD:
4031 case AMDGPU::G_PTRMASK:
4032 case AMDGPU::G_ADD:
4033 case AMDGPU::G_SUB:
4034 case AMDGPU::G_SHL:
4035 case AMDGPU::G_LSHR:
4036 case AMDGPU::G_ASHR:
4037 case AMDGPU::G_UADDO:
4038 case AMDGPU::G_USUBO:
4039 case AMDGPU::G_UADDE:
4040 case AMDGPU::G_SADDE:
4041 case AMDGPU::G_USUBE:
4042 case AMDGPU::G_SSUBE:
4043 case AMDGPU::G_ABS:
4044 case AMDGPU::G_SHUFFLE_VECTOR:
4045 case AMDGPU::G_SBFX:
4046 case AMDGPU::G_UBFX:
4047 case AMDGPU::G_AMDGPU_S_MUL_I64_I32:
4048 case AMDGPU::G_AMDGPU_S_MUL_U64_U32:
4049 if (isSALUMapping(MI))
4050 return getDefaultMappingSOP(MI);
4051 return getDefaultMappingVOP(MI);
4052 case AMDGPU::G_SMIN:
4053 case AMDGPU::G_SMAX:
4054 case AMDGPU::G_UMIN:
4055 case AMDGPU::G_UMAX:
4056 if (isSALUMapping(MI)) {
4057 // There is no scalar 64-bit min/max; use the vector instruction instead.
4058 if (MRI.getType(Reg: MI.getOperand(i: 0).getReg()).getSizeInBits() == 64 &&
4059 Subtarget.hasIntMinMax64())
4060 return getDefaultMappingVOP(MI);
4061 return getDefaultMappingSOP(MI);
4062 }
4063 return getDefaultMappingVOP(MI);
4064 case AMDGPU::G_FADD:
4065 case AMDGPU::G_FSUB:
4066 case AMDGPU::G_FMUL:
4067 case AMDGPU::G_FMA:
4068 case AMDGPU::G_FFLOOR:
4069 case AMDGPU::G_FCEIL:
4070 case AMDGPU::G_INTRINSIC_ROUNDEVEN:
4071 case AMDGPU::G_FMINNUM:
4072 case AMDGPU::G_FMAXNUM:
4073 case AMDGPU::G_FMINIMUM:
4074 case AMDGPU::G_FMAXIMUM:
4075 case AMDGPU::G_FMINIMUMNUM:
4076 case AMDGPU::G_FMAXIMUMNUM:
4077 case AMDGPU::G_INTRINSIC_TRUNC:
4078 case AMDGPU::G_STRICT_FADD:
4079 case AMDGPU::G_STRICT_FSUB:
4080 case AMDGPU::G_STRICT_FMUL:
4081 case AMDGPU::G_STRICT_FMA: {
4082 LLT Ty = MRI.getType(Reg: MI.getOperand(i: 0).getReg());
4083 unsigned Size = Ty.getSizeInBits();
4084 if (Subtarget.hasSALUFloatInsts() && Ty.isScalar() &&
4085 (Size == 32 || Size == 16) && isSALUMapping(MI))
4086 return getDefaultMappingSOP(MI);
4087 return getDefaultMappingVOP(MI);
4088 }
4089 case AMDGPU::G_FPTOSI:
4090 case AMDGPU::G_FPTOUI:
4091 case AMDGPU::G_FPTOSI_SAT:
4092 case AMDGPU::G_FPTOUI_SAT:
4093 case AMDGPU::G_SITOFP:
4094 case AMDGPU::G_UITOFP: {
4095 unsigned SizeDst = MRI.getType(Reg: MI.getOperand(i: 0).getReg()).getSizeInBits();
4096 unsigned SizeSrc = MRI.getType(Reg: MI.getOperand(i: 1).getReg()).getSizeInBits();
4097 if (Subtarget.hasSALUFloatInsts() && SizeDst == 32 && SizeSrc == 32 &&
4098 isSALUMapping(MI))
4099 return getDefaultMappingSOP(MI);
4100 return getDefaultMappingVOP(MI);
4101 }
4102 case AMDGPU::G_FPTRUNC:
4103 case AMDGPU::G_FPEXT: {
4104 unsigned SizeDst = MRI.getType(Reg: MI.getOperand(i: 0).getReg()).getSizeInBits();
4105 unsigned SizeSrc = MRI.getType(Reg: MI.getOperand(i: 1).getReg()).getSizeInBits();
4106 if (Subtarget.hasSALUFloatInsts() && SizeDst != 64 && SizeSrc != 64 &&
4107 isSALUMapping(MI))
4108 return getDefaultMappingSOP(MI);
4109 return getDefaultMappingVOP(MI);
4110 }
4111 case AMDGPU::G_FSQRT:
4112 case AMDGPU::G_FEXP2:
4113 case AMDGPU::G_FLOG2: {
4114 unsigned Size = MRI.getType(Reg: MI.getOperand(i: 0).getReg()).getSizeInBits();
4115 if (Subtarget.hasPseudoScalarTrans() && (Size == 16 || Size == 32) &&
4116 isSALUMapping(MI))
4117 return getDefaultMappingSOP(MI);
4118 return getDefaultMappingVOP(MI);
4119 }
4120 case AMDGPU::G_SADDSAT: // FIXME: Could lower sat ops for SALU
4121 case AMDGPU::G_SSUBSAT:
4122 case AMDGPU::G_UADDSAT:
4123 case AMDGPU::G_USUBSAT:
4124 case AMDGPU::G_FMAD:
4125 case AMDGPU::G_FLDEXP:
4126 case AMDGPU::G_FMINNUM_IEEE:
4127 case AMDGPU::G_FMAXNUM_IEEE:
4128 case AMDGPU::G_FCANONICALIZE:
4129 case AMDGPU::G_STRICT_FLDEXP:
4130 case AMDGPU::G_BSWAP: // TODO: Somehow expand for scalar?
4131 case AMDGPU::G_FSHR: // TODO: Expand for scalar
4132 case AMDGPU::G_AMDGPU_FMIN_LEGACY:
4133 case AMDGPU::G_AMDGPU_FMAX_LEGACY:
4134 case AMDGPU::G_AMDGPU_RCP_IFLAG:
4135 case AMDGPU::G_AMDGPU_CVT_F32_UBYTE0:
4136 case AMDGPU::G_AMDGPU_CVT_F32_UBYTE1:
4137 case AMDGPU::G_AMDGPU_CVT_F32_UBYTE2:
4138 case AMDGPU::G_AMDGPU_CVT_F32_UBYTE3:
4139 case AMDGPU::G_AMDGPU_CVT_PK_I16_I32:
4140 case AMDGPU::G_AMDGPU_SMED3:
4141 case AMDGPU::G_AMDGPU_FMED3:
4142 return getDefaultMappingVOP(MI);
4143 case AMDGPU::G_UMULH:
4144 case AMDGPU::G_SMULH: {
4145 if (Subtarget.hasScalarMulHiInsts() && isSALUMapping(MI))
4146 return getDefaultMappingSOP(MI);
4147 return getDefaultMappingVOP(MI);
4148 }
4149 case AMDGPU::G_AMDGPU_MAD_U64_U32:
4150 case AMDGPU::G_AMDGPU_MAD_I64_I32: {
4151 // Three possible mappings:
4152 //
4153 // - Default SOP
4154 // - Default VOP
4155 // - Scalar multiply: src0 and src1 are SGPRs, the rest is VOP.
4156 //
4157 // This allows instruction selection to keep the multiplication part of the
4158 // instruction on the SALU.
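// For example, for the split mapping of
//   %dst(s64), %carry(s1) = G_AMDGPU_MAD_U64_U32 %src0(s32), %src1(s32), %src2(s64)
// %src0 and %src1 stay SGPR so the multiply can select to a scalar multiply,
// while %dst, %carry and the %src2 addend are mapped to VGPR/VCC.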
4159 bool AllSalu = true;
4160 bool MulSalu = true;
4161 for (unsigned i = 0; i < 5; ++i) {
4162 Register Reg = MI.getOperand(i).getReg();
4163 if (const RegisterBank *Bank = getRegBank(Reg, MRI, TRI: *TRI)) {
4164 if (Bank->getID() != AMDGPU::SGPRRegBankID) {
4165 AllSalu = false;
4166 if (i == 2 || i == 3) {
4167 MulSalu = false;
4168 break;
4169 }
4170 }
4171 }
4172 }
4173
4174 if (AllSalu)
4175 return getDefaultMappingSOP(MI);
4176
4177 // If the multiply-add is full-rate in VALU, use that even if the
4178 // multiplication part is scalar. Accumulating separately on the VALU would
4179 // take two instructions.
4180 if (!MulSalu || Subtarget.hasFullRate64Ops())
4181 return getDefaultMappingVOP(MI);
4182
4183 // Keep the multiplication on the SALU, then accumulate on the VALU.
4184 OpdsMapping[0] = AMDGPU::getValueMapping(BankID: AMDGPU::VGPRRegBankID, Size: 64);
4185 OpdsMapping[1] = AMDGPU::getValueMapping(BankID: AMDGPU::VCCRegBankID, Size: 1);
4186 OpdsMapping[2] = AMDGPU::getValueMapping(BankID: AMDGPU::SGPRRegBankID, Size: 32);
4187 OpdsMapping[3] = AMDGPU::getValueMapping(BankID: AMDGPU::SGPRRegBankID, Size: 32);
4188 OpdsMapping[4] = AMDGPU::getValueMapping(BankID: AMDGPU::VGPRRegBankID, Size: 64);
4189 break;
4190 }
4191 case AMDGPU::G_IMPLICIT_DEF: {
4192 unsigned Size = MRI.getType(Reg: MI.getOperand(i: 0).getReg()).getSizeInBits();
4193 OpdsMapping[0] = AMDGPU::getValueMapping(BankID: AMDGPU::SGPRRegBankID, Size);
4194 break;
4195 }
4196 case AMDGPU::G_FCONSTANT:
4197 case AMDGPU::G_CONSTANT:
4198 case AMDGPU::G_GLOBAL_VALUE:
4199 case AMDGPU::G_FRAME_INDEX:
4200 case AMDGPU::G_BLOCK_ADDR:
4201 case AMDGPU::G_READSTEADYCOUNTER:
4202 case AMDGPU::G_READCYCLECOUNTER: {
4203 unsigned Size = MRI.getType(Reg: MI.getOperand(i: 0).getReg()).getSizeInBits();
4204 OpdsMapping[0] = AMDGPU::getValueMapping(BankID: AMDGPU::SGPRRegBankID, Size);
4205 break;
4206 }
4207 case AMDGPU::G_DYN_STACKALLOC: {
4208 // Result is always uniform, and a wave reduction is needed for the source.
4209 OpdsMapping[0] = AMDGPU::getValueMapping(BankID: AMDGPU::SGPRRegBankID, Size: 32);
4210 unsigned SrcBankID = getRegBankID(Reg: MI.getOperand(i: 1).getReg(), MRI);
4211 OpdsMapping[1] = AMDGPU::getValueMapping(BankID: SrcBankID, Size: 32);
4212 break;
4213 }
4214 case AMDGPU::G_AMDGPU_WAVE_ADDRESS: {
4215 // This case is weird because we expect a physical register in the source,
4216 // but need to set a bank anyway.
4217 //
4218 // TODO: We could select the result to SGPR or VGPR
4219 OpdsMapping[0] = AMDGPU::getValueMapping(BankID: AMDGPU::SGPRRegBankID, Size: 32);
4220 OpdsMapping[1] = AMDGPU::getValueMapping(BankID: AMDGPU::SGPRRegBankID, Size: 32);
4221 break;
4222 }
4223 case AMDGPU::G_INSERT: {
4224 unsigned BankID = getMappingType(MRI, MI);
4225 unsigned DstSize = getSizeInBits(Reg: MI.getOperand(i: 0).getReg(), MRI, TRI: *TRI);
4226 unsigned SrcSize = getSizeInBits(Reg: MI.getOperand(i: 1).getReg(), MRI, TRI: *TRI);
4227 unsigned EltSize = getSizeInBits(Reg: MI.getOperand(i: 2).getReg(), MRI, TRI: *TRI);
4228 OpdsMapping[0] = AMDGPU::getValueMapping(BankID, Size: DstSize);
4229 OpdsMapping[1] = AMDGPU::getValueMapping(BankID, Size: SrcSize);
4230 OpdsMapping[2] = AMDGPU::getValueMapping(BankID, Size: EltSize);
4231 OpdsMapping[3] = nullptr;
4232 break;
4233 }
4234 case AMDGPU::G_EXTRACT: {
4235 unsigned BankID = getRegBankID(Reg: MI.getOperand(i: 1).getReg(), MRI);
4236 unsigned DstSize = getSizeInBits(Reg: MI.getOperand(i: 0).getReg(), MRI, TRI: *TRI);
4237 unsigned SrcSize = getSizeInBits(Reg: MI.getOperand(i: 1).getReg(), MRI, TRI: *TRI);
4238 OpdsMapping[0] = AMDGPU::getValueMapping(BankID, Size: DstSize);
4239 OpdsMapping[1] = AMDGPU::getValueMapping(BankID, Size: SrcSize);
4240 OpdsMapping[2] = nullptr;
4241 break;
4242 }
4243 case AMDGPU::G_BUILD_VECTOR:
4244 case AMDGPU::G_BUILD_VECTOR_TRUNC: {
4245 LLT DstTy = MRI.getType(Reg: MI.getOperand(i: 0).getReg());
4246 if (DstTy == LLT::fixed_vector(NumElements: 2, ScalarSizeInBits: 16)) {
4247 unsigned DstSize = DstTy.getSizeInBits();
4248 unsigned SrcSize = MRI.getType(Reg: MI.getOperand(i: 1).getReg()).getSizeInBits();
4249 unsigned Src0BankID = getRegBankID(Reg: MI.getOperand(i: 1).getReg(), MRI);
4250 unsigned Src1BankID = getRegBankID(Reg: MI.getOperand(i: 2).getReg(), MRI);
4251 unsigned DstBankID = regBankUnion(RB0: Src0BankID, RB1: Src1BankID);
4252
4253 OpdsMapping[0] = AMDGPU::getValueMapping(BankID: DstBankID, Size: DstSize);
4254 OpdsMapping[1] = AMDGPU::getValueMapping(BankID: Src0BankID, Size: SrcSize);
4255 OpdsMapping[2] = AMDGPU::getValueMapping(BankID: Src1BankID, Size: SrcSize);
4256 break;
4257 }
4258
4259 [[fallthrough]];
4260 }
4261 case AMDGPU::G_MERGE_VALUES:
4262 case AMDGPU::G_CONCAT_VECTORS: {
4263 unsigned Bank = getMappingType(MRI, MI);
4264 unsigned DstSize = MRI.getType(Reg: MI.getOperand(i: 0).getReg()).getSizeInBits();
4265 unsigned SrcSize = MRI.getType(Reg: MI.getOperand(i: 1).getReg()).getSizeInBits();
4266
4267 OpdsMapping[0] = AMDGPU::getValueMapping(BankID: Bank, Size: DstSize);
4268 // Op1 and Dst should use the same register bank.
4269 for (unsigned i = 1, e = MI.getNumOperands(); i != e; ++i)
4270 OpdsMapping[i] = AMDGPU::getValueMapping(BankID: Bank, Size: SrcSize);
4271 break;
4272 }
4273 case AMDGPU::G_BITREVERSE:
4274 case AMDGPU::G_BITCAST:
4275 case AMDGPU::G_INTTOPTR:
4276 case AMDGPU::G_PTRTOINT:
4277 case AMDGPU::G_FABS:
4278 case AMDGPU::G_FNEG: {
4279 unsigned Size = MRI.getType(Reg: MI.getOperand(i: 0).getReg()).getSizeInBits();
4280 unsigned BankID = getRegBankID(Reg: MI.getOperand(i: 1).getReg(), MRI);
4281 OpdsMapping[0] = OpdsMapping[1] = AMDGPU::getValueMapping(BankID, Size);
4282 break;
4283 }
4284 case AMDGPU::G_AMDGPU_FFBH_U32:
4285 case AMDGPU::G_AMDGPU_FFBL_B32:
4286 case AMDGPU::G_CTLZ_ZERO_UNDEF:
4287 case AMDGPU::G_CTTZ_ZERO_UNDEF: {
4288 unsigned Size = MRI.getType(Reg: MI.getOperand(i: 1).getReg()).getSizeInBits();
4289 unsigned BankID = getRegBankID(Reg: MI.getOperand(i: 1).getReg(), MRI);
4290 OpdsMapping[0] = AMDGPU::getValueMapping(BankID, Size: 32);
4291 OpdsMapping[1] = AMDGPU::getValueMappingSGPR64Only(BankID, Size);
4292 break;
4293 }
4294 case AMDGPU::G_CTPOP: {
4295 unsigned Size = MRI.getType(Reg: MI.getOperand(i: 1).getReg()).getSizeInBits();
4296 unsigned BankID = getRegBankID(Reg: MI.getOperand(i: 1).getReg(), MRI);
4297 OpdsMapping[0] = AMDGPU::getValueMapping(BankID, Size: 32);
4298
4299 // This should really be getValueMappingSGPR64Only, but allowing the generic
4300 // code to handle the register split just makes using LegalizerHelper more
4301 // difficult.
4302 OpdsMapping[1] = AMDGPU::getValueMapping(BankID, Size);
4303 break;
4304 }
4305 case AMDGPU::G_TRUNC: {
4306 Register Dst = MI.getOperand(i: 0).getReg();
4307 Register Src = MI.getOperand(i: 1).getReg();
4308 unsigned Bank = getRegBankID(Reg: Src, MRI);
4309 unsigned DstSize = getSizeInBits(Reg: Dst, MRI, TRI: *TRI);
4310 unsigned SrcSize = getSizeInBits(Reg: Src, MRI, TRI: *TRI);
4311 OpdsMapping[0] = AMDGPU::getValueMapping(BankID: Bank, Size: DstSize);
4312 OpdsMapping[1] = AMDGPU::getValueMapping(BankID: Bank, Size: SrcSize);
4313 break;
4314 }
4315 case AMDGPU::G_ZEXT:
4316 case AMDGPU::G_SEXT:
4317 case AMDGPU::G_ANYEXT:
4318 case AMDGPU::G_SEXT_INREG: {
4319 Register Dst = MI.getOperand(i: 0).getReg();
4320 Register Src = MI.getOperand(i: 1).getReg();
4321 unsigned DstSize = getSizeInBits(Reg: Dst, MRI, TRI: *TRI);
4322 unsigned SrcSize = getSizeInBits(Reg: Src, MRI, TRI: *TRI);
4323
4324 unsigned DstBank;
4325 const RegisterBank *SrcBank = getRegBank(Reg: Src, MRI, TRI: *TRI);
4326 assert(SrcBank);
4327 switch (SrcBank->getID()) {
4328 case AMDGPU::SGPRRegBankID:
4329 DstBank = AMDGPU::SGPRRegBankID;
4330 break;
4331 default:
4332 DstBank = AMDGPU::VGPRRegBankID;
4333 break;
4334 }
4335
4336 // Scalar extend can use 64-bit BFE, but VGPRs require extending to
4337 // 32 bits, and then to 64.
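// Rough example of why the split mapping is used: a uniform s32 -> s64
// G_SEXT can select to a single 64-bit scalar op (e.g. s_bfe_i64), while a
// VGPR extend is assumed to produce the low and high halves as two 32-bit
// values (e.g. the high half via v_ashrrev_i32) that are then re-merged.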
4338 OpdsMapping[0] = AMDGPU::getValueMappingSGPR64Only(BankID: DstBank, Size: DstSize);
4339 OpdsMapping[1] = AMDGPU::getValueMappingSGPR64Only(BankID: SrcBank->getID(),
4340 Size: SrcSize);
4341 break;
4342 }
4343 case AMDGPU::G_IS_FPCLASS: {
4344 Register SrcReg = MI.getOperand(i: 1).getReg();
4345 unsigned SrcSize = MRI.getType(Reg: SrcReg).getSizeInBits();
4346 unsigned DstSize = MRI.getType(Reg: MI.getOperand(i: 0).getReg()).getSizeInBits();
4347 OpdsMapping[0] = AMDGPU::getValueMapping(BankID: AMDGPU::VCCRegBankID, Size: DstSize);
4348 OpdsMapping[1] = AMDGPU::getValueMapping(BankID: AMDGPU::VGPRRegBankID, Size: SrcSize);
4349 break;
4350 }
4351 case AMDGPU::G_STORE: {
4352 assert(MI.getOperand(0).isReg());
4353 unsigned Size = MRI.getType(Reg: MI.getOperand(i: 0).getReg()).getSizeInBits();
4354
4355 // FIXME: We need to specify a different reg bank once scalar stores are
4356 // supported.
4357 const ValueMapping *ValMapping =
4358 AMDGPU::getValueMapping(BankID: AMDGPU::VGPRRegBankID, Size);
4359 OpdsMapping[0] = ValMapping;
4360 OpdsMapping[1] = getValueMappingForPtr(MRI, PtrReg: MI.getOperand(i: 1).getReg());
4361 break;
4362 }
4363 case AMDGPU::G_ICMP:
4364 case AMDGPU::G_FCMP: {
4365 unsigned Size = MRI.getType(Reg: MI.getOperand(i: 2).getReg()).getSizeInBits();
4366
4367 // See if the result register has already been constrained to vcc, which may
4368 // happen due to control flow intrinsic lowering.
4369 unsigned DstBank = getRegBankID(Reg: MI.getOperand(i: 0).getReg(), MRI,
4370 Default: AMDGPU::SGPRRegBankID);
4371 unsigned Op2Bank = getRegBankID(Reg: MI.getOperand(i: 2).getReg(), MRI);
4372 unsigned Op3Bank = getRegBankID(Reg: MI.getOperand(i: 3).getReg(), MRI);
4373
4374 auto canUseSCCICMP = [&]() {
4375 auto Pred =
4376 static_cast<CmpInst::Predicate>(MI.getOperand(i: 1).getPredicate());
4377 return Size == 32 ||
4378 (Size == 64 &&
4379 (Pred == CmpInst::ICMP_EQ || Pred == CmpInst::ICMP_NE) &&
4380 Subtarget.hasScalarCompareEq64());
4381 };
4382 auto canUseSCCFCMP = [&]() {
4383 return Subtarget.hasSALUFloatInsts() && (Size == 32 || Size == 16);
4384 };
4385
4386 bool isICMP = MI.getOpcode() == AMDGPU::G_ICMP;
4387 bool CanUseSCC = DstBank == AMDGPU::SGPRRegBankID &&
4388 Op2Bank == AMDGPU::SGPRRegBankID &&
4389 Op3Bank == AMDGPU::SGPRRegBankID &&
4390 (isICMP ? canUseSCCICMP() : canUseSCCFCMP());
4391
4392 DstBank = CanUseSCC ? AMDGPU::SGPRRegBankID : AMDGPU::VCCRegBankID;
4393 unsigned SrcBank = CanUseSCC ? AMDGPU::SGPRRegBankID : AMDGPU::VGPRRegBankID;
4394
4395 // TODO: Use 32-bit for scalar output size.
4396 // SCC results will need to be copied to a 32-bit SGPR virtual register.
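// Rough example: a uniform 32-bit G_ICMP can select to s_cmp_eq_u32, which
// writes SCC; since SCC is not directly addressable, the boolean is expected
// to be rematerialized into a 32-bit SGPR (e.g. via an s_cselect) where a
// real register is needed. A divergent compare instead produces a lane mask
// in the VCC bank.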
4397 const unsigned ResultSize = 1;
4398
4399 OpdsMapping[0] = AMDGPU::getValueMapping(BankID: DstBank, Size: ResultSize);
4400 OpdsMapping[1] = nullptr; // Predicate Operand.
4401 OpdsMapping[2] = AMDGPU::getValueMapping(BankID: SrcBank, Size);
4402 OpdsMapping[3] = AMDGPU::getValueMapping(BankID: SrcBank, Size);
4403 break;
4404 }
4405 case AMDGPU::G_EXTRACT_VECTOR_ELT: {
4406 // A VGPR index can be used (via a waterfall loop) when indexing an SGPR vector.
4407 unsigned SrcBankID = getRegBankID(Reg: MI.getOperand(i: 1).getReg(), MRI);
4408 unsigned DstSize = MRI.getType(Reg: MI.getOperand(i: 0).getReg()).getSizeInBits();
4409 unsigned SrcSize = MRI.getType(Reg: MI.getOperand(i: 1).getReg()).getSizeInBits();
4410 unsigned IdxSize = MRI.getType(Reg: MI.getOperand(i: 2).getReg()).getSizeInBits();
4411 unsigned IdxBank = getRegBankID(Reg: MI.getOperand(i: 2).getReg(), MRI);
4412 unsigned OutputBankID = regBankUnion(RB0: SrcBankID, RB1: IdxBank);
4413
4414 OpdsMapping[0] = AMDGPU::getValueMappingSGPR64Only(BankID: OutputBankID, Size: DstSize);
4415 OpdsMapping[1] = AMDGPU::getValueMapping(BankID: SrcBankID, Size: SrcSize);
4416
4417 // The index can be in either bank if the source vector is VGPR.
4418 OpdsMapping[2] = AMDGPU::getValueMapping(BankID: IdxBank, Size: IdxSize);
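// Note: if the vector is in SGPRs but the index is divergent, the mapping
// alone does not make this legal; applyMapping is expected to fix it up
// (e.g. with a waterfall loop over the index).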
4419 break;
4420 }
4421 case AMDGPU::G_INSERT_VECTOR_ELT: {
4422 unsigned OutputBankID = isSALUMapping(MI) ?
4423 AMDGPU::SGPRRegBankID : AMDGPU::VGPRRegBankID;
4424
4425 unsigned VecSize = MRI.getType(Reg: MI.getOperand(i: 0).getReg()).getSizeInBits();
4426 unsigned InsertSize = MRI.getType(Reg: MI.getOperand(i: 2).getReg()).getSizeInBits();
4427 unsigned IdxSize = MRI.getType(Reg: MI.getOperand(i: 3).getReg()).getSizeInBits();
4428 unsigned InsertEltBankID = getRegBankID(Reg: MI.getOperand(i: 2).getReg(), MRI);
4429 unsigned IdxBankID = getRegBankID(Reg: MI.getOperand(i: 3).getReg(), MRI);
4430
4431 OpdsMapping[0] = AMDGPU::getValueMapping(BankID: OutputBankID, Size: VecSize);
4432 OpdsMapping[1] = AMDGPU::getValueMapping(BankID: OutputBankID, Size: VecSize);
4433
4434 // This is a weird case, because we need to break down the mapping based on
4435 // the register bank of a different operand.
4436 if (InsertSize == 64 && OutputBankID == AMDGPU::VGPRRegBankID) {
4437 OpdsMapping[2] = AMDGPU::getValueMappingSplit64(BankID: InsertEltBankID,
4438 Size: InsertSize);
4439 } else {
4440 assert(InsertSize == 32 || InsertSize == 64);
4441 OpdsMapping[2] = AMDGPU::getValueMapping(BankID: InsertEltBankID, Size: InsertSize);
4442 }
4443
4444 // The index can be in either bank if the source vector is VGPR.
4445 OpdsMapping[3] = AMDGPU::getValueMapping(BankID: IdxBankID, Size: IdxSize);
4446 break;
4447 }
4448 case AMDGPU::G_UNMERGE_VALUES: {
4449 unsigned Bank = getMappingType(MRI, MI);
4450
4451 // Op1 and Dst should use the same register bank.
4452 // FIXME: Shouldn't this be the default? Why do we need to handle this?
4453 for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
4454 unsigned Size = getSizeInBits(Reg: MI.getOperand(i).getReg(), MRI, TRI: *TRI);
4455 OpdsMapping[i] = AMDGPU::getValueMapping(BankID: Bank, Size);
4456 }
4457 break;
4458 }
4459 case AMDGPU::G_AMDGPU_BUFFER_LOAD:
4460 case AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE:
4461 case AMDGPU::G_AMDGPU_BUFFER_LOAD_SBYTE:
4462 case AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT:
4463 case AMDGPU::G_AMDGPU_BUFFER_LOAD_SSHORT:
4464 case AMDGPU::G_AMDGPU_BUFFER_LOAD_TFE:
4465 case AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE_TFE:
4466 case AMDGPU::G_AMDGPU_BUFFER_LOAD_SBYTE_TFE:
4467 case AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT_TFE:
4468 case AMDGPU::G_AMDGPU_BUFFER_LOAD_SSHORT_TFE:
4469 case AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT:
4470 case AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_TFE:
4471 case AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_D16:
4472 case AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT:
4473 case AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT_D16:
4474 case AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT:
4475 case AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT_D16:
4476 case AMDGPU::G_AMDGPU_BUFFER_STORE:
4477 case AMDGPU::G_AMDGPU_BUFFER_STORE_BYTE:
4478 case AMDGPU::G_AMDGPU_BUFFER_STORE_SHORT:
4479 case AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT:
4480 case AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT_D16: {
4481 OpdsMapping[0] = getVGPROpMapping(Reg: MI.getOperand(i: 0).getReg(), MRI, TRI: *TRI);
4482
4483 // rsrc
4484 OpdsMapping[1] = getSGPROpMapping(Reg: MI.getOperand(i: 1).getReg(), MRI, TRI: *TRI);
4485
4486 // vindex
4487 OpdsMapping[2] = getVGPROpMapping(Reg: MI.getOperand(i: 2).getReg(), MRI, TRI: *TRI);
4488
4489 // voffset
4490 OpdsMapping[3] = getVGPROpMapping(Reg: MI.getOperand(i: 3).getReg(), MRI, TRI: *TRI);
4491
4492 // soffset
4493 OpdsMapping[4] = getSGPROpMapping(Reg: MI.getOperand(i: 4).getReg(), MRI, TRI: *TRI);
4494
4495 // Any remaining operands are immediates and were correctly null
4496 // initialized.
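//
// Note (assumed behavior): if rsrc or soffset are actually assigned to VGPRs
// at this point, applyMapping is expected to legalize them with a waterfall
// loop rather than a plain copy, since VGPR->SGPR copies of divergent values
// are not valid.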
4497 break;
4498 }
4499 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SWAP:
4500 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_ADD:
4501 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB:
4502 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMIN:
4503 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMIN:
4504 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMAX:
4505 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMAX:
4506 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_AND:
4507 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_OR:
4508 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_XOR:
4509 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC:
4510 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC:
4511 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB_CLAMP_U32:
4512 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_COND_SUB_U32:
4513 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD:
4514 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMIN:
4515 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMAX: {
4516 // vdata_out
4517 OpdsMapping[0] = getVGPROpMapping(Reg: MI.getOperand(i: 0).getReg(), MRI, TRI: *TRI);
4518
4519 // vdata_in
4520 OpdsMapping[1] = getVGPROpMapping(Reg: MI.getOperand(i: 1).getReg(), MRI, TRI: *TRI);
4521
4522 // rsrc
4523 OpdsMapping[2] = getSGPROpMapping(Reg: MI.getOperand(i: 2).getReg(), MRI, TRI: *TRI);
4524
4525 // vindex
4526 OpdsMapping[3] = getVGPROpMapping(Reg: MI.getOperand(i: 3).getReg(), MRI, TRI: *TRI);
4527
4528 // voffset
4529 OpdsMapping[4] = getVGPROpMapping(Reg: MI.getOperand(i: 4).getReg(), MRI, TRI: *TRI);
4530
4531 // soffset
4532 OpdsMapping[5] = getSGPROpMapping(Reg: MI.getOperand(i: 5).getReg(), MRI, TRI: *TRI);
4533
4534 // Any remaining operands are immediates and were correctly null
4535 // initialized.
4536 break;
4537 }
4538 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_CMPSWAP: {
4539 // vdata_out
4540 OpdsMapping[0] = getVGPROpMapping(Reg: MI.getOperand(i: 0).getReg(), MRI, TRI: *TRI);
4541
4542 // vdata_in
4543 OpdsMapping[1] = getVGPROpMapping(Reg: MI.getOperand(i: 1).getReg(), MRI, TRI: *TRI);
4544
4545 // cmp
4546 OpdsMapping[2] = getVGPROpMapping(Reg: MI.getOperand(i: 2).getReg(), MRI, TRI: *TRI);
4547
4548 // rsrc
4549 OpdsMapping[3] = getSGPROpMapping(Reg: MI.getOperand(i: 3).getReg(), MRI, TRI: *TRI);
4550
4551 // vindex
4552 OpdsMapping[4] = getVGPROpMapping(Reg: MI.getOperand(i: 4).getReg(), MRI, TRI: *TRI);
4553
4554 // voffset
4555 OpdsMapping[5] = getVGPROpMapping(Reg: MI.getOperand(i: 5).getReg(), MRI, TRI: *TRI);
4556
4557 // soffset
4558 OpdsMapping[6] = getSGPROpMapping(Reg: MI.getOperand(i: 6).getReg(), MRI, TRI: *TRI);
4559
4560 // Any remaining operands are immediates and were correctly null
4561 // initialized.
4562 break;
4563 }
4564 case AMDGPU::G_AMDGPU_S_BUFFER_LOAD:
4565 case AMDGPU::G_AMDGPU_S_BUFFER_LOAD_UBYTE:
4566 case AMDGPU::G_AMDGPU_S_BUFFER_LOAD_SBYTE:
4567 case AMDGPU::G_AMDGPU_S_BUFFER_LOAD_USHORT:
4568 case AMDGPU::G_AMDGPU_S_BUFFER_LOAD_SSHORT: {
4569 // Lie and claim everything is legal, even though some need to be
4570 // SGPRs. applyMapping will have to deal with it as a waterfall loop.
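// (Assumed behavior: when the resource or offset ends up in a VGPR, the load
// is rewritten later into an equivalent MUBUF-style VGPR load, with a
// waterfall loop over the resource if needed.)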
4571 OpdsMapping[1] = getSGPROpMapping(Reg: MI.getOperand(i: 1).getReg(), MRI, TRI: *TRI);
4572 OpdsMapping[2] = getSGPROpMapping(Reg: MI.getOperand(i: 2).getReg(), MRI, TRI: *TRI);
4573
4574 // We need to convert this to a MUBUF if either the resource or offset is
4575 // VGPR.
4576 unsigned RSrcBank = OpdsMapping[1]->BreakDown[0].RegBank->getID();
4577 unsigned OffsetBank = OpdsMapping[2]->BreakDown[0].RegBank->getID();
4578 unsigned ResultBank = regBankUnion(RB0: RSrcBank, RB1: OffsetBank);
4579
4580 unsigned Size0 = MRI.getType(Reg: MI.getOperand(i: 0).getReg()).getSizeInBits();
4581 OpdsMapping[0] = AMDGPU::getValueMapping(BankID: ResultBank, Size: Size0);
4582 break;
4583 }
4584 case AMDGPU::G_AMDGPU_S_BUFFER_PREFETCH:
4585 OpdsMapping[0] = getSGPROpMapping(Reg: MI.getOperand(i: 0).getReg(), MRI, TRI: *TRI);
4586 OpdsMapping[2] = getSGPROpMapping(Reg: MI.getOperand(i: 2).getReg(), MRI, TRI: *TRI);
4587 break;
4588 case AMDGPU::G_AMDGPU_SPONENTRY: {
4589 unsigned Size = MRI.getType(Reg: MI.getOperand(i: 0).getReg()).getSizeInBits();
4590 OpdsMapping[0] = AMDGPU::getValueMapping(BankID: AMDGPU::SGPRRegBankID, Size);
4591 break;
4592 }
4593 case AMDGPU::G_INTRINSIC:
4594 case AMDGPU::G_INTRINSIC_CONVERGENT: {
4595 switch (cast<GIntrinsic>(Val: MI).getIntrinsicID()) {
4596 default:
4597 return getInvalidInstructionMapping();
4598 case Intrinsic::amdgcn_div_fmas:
4599 case Intrinsic::amdgcn_div_fixup:
4600 case Intrinsic::amdgcn_trig_preop:
4601 case Intrinsic::amdgcn_sin:
4602 case Intrinsic::amdgcn_cos:
4603 case Intrinsic::amdgcn_log_clamp:
4604 case Intrinsic::amdgcn_rcp_legacy:
4605 case Intrinsic::amdgcn_rsq_legacy:
4606 case Intrinsic::amdgcn_rsq_clamp:
4607 case Intrinsic::amdgcn_tanh:
4608 case Intrinsic::amdgcn_fmul_legacy:
4609 case Intrinsic::amdgcn_fma_legacy:
4610 case Intrinsic::amdgcn_frexp_mant:
4611 case Intrinsic::amdgcn_frexp_exp:
4612 case Intrinsic::amdgcn_fract:
4613 case Intrinsic::amdgcn_cvt_pknorm_i16:
4614 case Intrinsic::amdgcn_cvt_pknorm_u16:
4615 case Intrinsic::amdgcn_cvt_pk_i16:
4616 case Intrinsic::amdgcn_cvt_pk_u16:
4617 case Intrinsic::amdgcn_cvt_sr_pk_f16_f32:
4618 case Intrinsic::amdgcn_cvt_sr_pk_bf16_f32:
4619 case Intrinsic::amdgcn_cvt_pk_f16_fp8:
4620 case Intrinsic::amdgcn_cvt_pk_f16_bf8:
4621 case Intrinsic::amdgcn_cvt_pk_fp8_f16:
4622 case Intrinsic::amdgcn_cvt_pk_bf8_f16:
4623 case Intrinsic::amdgcn_cvt_sr_fp8_f16:
4624 case Intrinsic::amdgcn_cvt_sr_bf8_f16:
4625 case Intrinsic::amdgcn_cvt_scale_pk8_f16_fp8:
4626 case Intrinsic::amdgcn_cvt_scale_pk8_bf16_fp8:
4627 case Intrinsic::amdgcn_cvt_scale_pk8_f16_bf8:
4628 case Intrinsic::amdgcn_cvt_scale_pk8_bf16_bf8:
4629 case Intrinsic::amdgcn_cvt_scale_pk8_f16_fp4:
4630 case Intrinsic::amdgcn_cvt_scale_pk8_bf16_fp4:
4631 case Intrinsic::amdgcn_cvt_scale_pk8_f32_fp8:
4632 case Intrinsic::amdgcn_cvt_scale_pk8_f32_bf8:
4633 case Intrinsic::amdgcn_cvt_scale_pk8_f32_fp4:
4634 case Intrinsic::amdgcn_cvt_scale_pk16_f16_fp6:
4635 case Intrinsic::amdgcn_cvt_scale_pk16_bf16_fp6:
4636 case Intrinsic::amdgcn_cvt_scale_pk16_f16_bf6:
4637 case Intrinsic::amdgcn_cvt_scale_pk16_bf16_bf6:
4638 case Intrinsic::amdgcn_cvt_scale_pk16_f32_fp6:
4639 case Intrinsic::amdgcn_cvt_scale_pk16_f32_bf6:
4640 case Intrinsic::amdgcn_cvt_scalef32_pk8_fp8_bf16:
4641 case Intrinsic::amdgcn_cvt_scalef32_pk8_bf8_bf16:
4642 case Intrinsic::amdgcn_cvt_scalef32_pk8_fp8_f16:
4643 case Intrinsic::amdgcn_cvt_scalef32_pk8_bf8_f16:
4644 case Intrinsic::amdgcn_cvt_scalef32_pk8_fp8_f32:
4645 case Intrinsic::amdgcn_cvt_scalef32_pk8_bf8_f32:
4646 case Intrinsic::amdgcn_cvt_scalef32_pk8_fp4_f32:
4647 case Intrinsic::amdgcn_cvt_scalef32_pk8_fp4_f16:
4648 case Intrinsic::amdgcn_cvt_scalef32_pk8_fp4_bf16:
4649 case Intrinsic::amdgcn_cvt_scalef32_pk16_fp6_f32:
4650 case Intrinsic::amdgcn_cvt_scalef32_pk16_bf6_f32:
4651 case Intrinsic::amdgcn_cvt_scalef32_pk16_fp6_f16:
4652 case Intrinsic::amdgcn_cvt_scalef32_pk16_bf6_f16:
4653 case Intrinsic::amdgcn_cvt_scalef32_pk16_fp6_bf16:
4654 case Intrinsic::amdgcn_cvt_scalef32_pk16_bf6_bf16:
4655 case Intrinsic::amdgcn_cvt_scalef32_sr_pk8_fp8_bf16:
4656 case Intrinsic::amdgcn_cvt_scalef32_sr_pk8_bf8_bf16:
4657 case Intrinsic::amdgcn_cvt_scalef32_sr_pk8_fp8_f16:
4658 case Intrinsic::amdgcn_cvt_scalef32_sr_pk8_bf8_f16:
4659 case Intrinsic::amdgcn_cvt_scalef32_sr_pk8_fp8_f32:
4660 case Intrinsic::amdgcn_cvt_scalef32_sr_pk8_bf8_f32:
4661 case Intrinsic::amdgcn_cvt_scalef32_sr_pk8_fp4_f32:
4662 case Intrinsic::amdgcn_cvt_scalef32_sr_pk8_fp4_f16:
4663 case Intrinsic::amdgcn_cvt_scalef32_sr_pk8_fp4_bf16:
4664 case Intrinsic::amdgcn_cvt_scalef32_sr_pk16_fp6_f32:
4665 case Intrinsic::amdgcn_cvt_scalef32_sr_pk16_bf6_f32:
4666 case Intrinsic::amdgcn_cvt_scalef32_sr_pk16_fp6_f16:
4667 case Intrinsic::amdgcn_cvt_scalef32_sr_pk16_bf6_f16:
4668 case Intrinsic::amdgcn_cvt_scalef32_sr_pk16_fp6_bf16:
4669 case Intrinsic::amdgcn_cvt_scalef32_sr_pk16_bf6_bf16:
4670 case Intrinsic::amdgcn_sat_pk4_i4_i8:
4671 case Intrinsic::amdgcn_sat_pk4_u4_u8:
4672 case Intrinsic::amdgcn_fmed3:
4673 case Intrinsic::amdgcn_cubeid:
4674 case Intrinsic::amdgcn_cubema:
4675 case Intrinsic::amdgcn_cubesc:
4676 case Intrinsic::amdgcn_cubetc:
4677 case Intrinsic::amdgcn_sffbh:
4678 case Intrinsic::amdgcn_fmad_ftz:
4679 case Intrinsic::amdgcn_mbcnt_lo:
4680 case Intrinsic::amdgcn_mbcnt_hi:
4681 case Intrinsic::amdgcn_mul_u24:
4682 case Intrinsic::amdgcn_mul_i24:
4683 case Intrinsic::amdgcn_mulhi_u24:
4684 case Intrinsic::amdgcn_mulhi_i24:
4685 case Intrinsic::amdgcn_lerp:
4686 case Intrinsic::amdgcn_sad_u8:
4687 case Intrinsic::amdgcn_msad_u8:
4688 case Intrinsic::amdgcn_sad_hi_u8:
4689 case Intrinsic::amdgcn_sad_u16:
4690 case Intrinsic::amdgcn_qsad_pk_u16_u8:
4691 case Intrinsic::amdgcn_mqsad_pk_u16_u8:
4692 case Intrinsic::amdgcn_mqsad_u32_u8:
4693 case Intrinsic::amdgcn_cvt_pk_u8_f32:
4694 case Intrinsic::amdgcn_alignbyte:
4695 case Intrinsic::amdgcn_perm:
4696 case Intrinsic::amdgcn_prng_b32:
4697 case Intrinsic::amdgcn_fdot2:
4698 case Intrinsic::amdgcn_sdot2:
4699 case Intrinsic::amdgcn_udot2:
4700 case Intrinsic::amdgcn_sdot4:
4701 case Intrinsic::amdgcn_udot4:
4702 case Intrinsic::amdgcn_sdot8:
4703 case Intrinsic::amdgcn_udot8:
4704 case Intrinsic::amdgcn_fdot2_bf16_bf16:
4705 case Intrinsic::amdgcn_fdot2_f16_f16:
4706 case Intrinsic::amdgcn_fdot2_f32_bf16:
4707 case Intrinsic::amdgcn_fdot2c_f32_bf16:
4708 case Intrinsic::amdgcn_sudot4:
4709 case Intrinsic::amdgcn_sudot8:
4710 case Intrinsic::amdgcn_dot4_f32_fp8_bf8:
4711 case Intrinsic::amdgcn_dot4_f32_bf8_fp8:
4712 case Intrinsic::amdgcn_dot4_f32_fp8_fp8:
4713 case Intrinsic::amdgcn_dot4_f32_bf8_bf8:
4714 case Intrinsic::amdgcn_cvt_f32_fp8:
4715 case Intrinsic::amdgcn_cvt_f32_fp8_e5m3:
4716 case Intrinsic::amdgcn_cvt_f32_bf8:
4717 case Intrinsic::amdgcn_cvt_off_f32_i4:
4718 case Intrinsic::amdgcn_cvt_pk_f32_fp8:
4719 case Intrinsic::amdgcn_cvt_pk_f32_bf8:
4720 case Intrinsic::amdgcn_cvt_pk_fp8_f32:
4721 case Intrinsic::amdgcn_cvt_pk_fp8_f32_e5m3:
4722 case Intrinsic::amdgcn_cvt_pk_bf8_f32:
4723 case Intrinsic::amdgcn_cvt_sr_fp8_f32:
4724 case Intrinsic::amdgcn_cvt_sr_fp8_f32_e5m3:
4725 case Intrinsic::amdgcn_cvt_sr_bf8_f32:
4726 case Intrinsic::amdgcn_cvt_sr_bf16_f32:
4727 case Intrinsic::amdgcn_cvt_sr_f16_f32:
4728 case Intrinsic::amdgcn_cvt_f16_fp8:
4729 case Intrinsic::amdgcn_cvt_f16_bf8:
4730 case Intrinsic::amdgcn_cvt_scalef32_pk32_fp6_f16:
4731 case Intrinsic::amdgcn_cvt_scalef32_pk32_bf6_f16:
4732 case Intrinsic::amdgcn_cvt_scalef32_pk32_fp6_bf16:
4733 case Intrinsic::amdgcn_cvt_scalef32_pk32_bf6_bf16:
4734 case Intrinsic::amdgcn_cvt_scalef32_f16_fp8:
4735 case Intrinsic::amdgcn_cvt_scalef32_f16_bf8:
4736 case Intrinsic::amdgcn_cvt_scalef32_f32_fp8:
4737 case Intrinsic::amdgcn_cvt_scalef32_f32_bf8:
4738 case Intrinsic::amdgcn_cvt_scalef32_pk_fp8_f32:
4739 case Intrinsic::amdgcn_cvt_scalef32_pk_bf8_f32:
4740 case Intrinsic::amdgcn_cvt_scalef32_pk_f32_fp8:
4741 case Intrinsic::amdgcn_cvt_scalef32_pk_f32_bf8:
4742 case Intrinsic::amdgcn_cvt_scalef32_pk_fp8_f16:
4743 case Intrinsic::amdgcn_cvt_scalef32_pk_fp8_bf16:
4744 case Intrinsic::amdgcn_cvt_scalef32_pk_bf8_f16:
4745 case Intrinsic::amdgcn_cvt_scalef32_pk_bf8_bf16:
4746 case Intrinsic::amdgcn_cvt_scalef32_pk_f32_fp4:
4747 case Intrinsic::amdgcn_cvt_scalef32_pk_fp4_f32:
4748 case Intrinsic::amdgcn_cvt_scalef32_pk_f16_fp4:
4749 case Intrinsic::amdgcn_cvt_scalef32_pk_bf16_fp4:
4750 case Intrinsic::amdgcn_cvt_scalef32_pk32_f32_fp6:
4751 case Intrinsic::amdgcn_cvt_scalef32_pk32_f32_bf6:
4752 case Intrinsic::amdgcn_cvt_scalef32_pk32_f16_bf6:
4753 case Intrinsic::amdgcn_cvt_scalef32_pk32_bf16_bf6:
4754 case Intrinsic::amdgcn_cvt_scalef32_pk32_f16_fp6:
4755 case Intrinsic::amdgcn_cvt_scalef32_pk32_bf16_fp6:
4756 case Intrinsic::amdgcn_cvt_scalef32_pk_f16_bf8:
4757 case Intrinsic::amdgcn_cvt_scalef32_pk_bf16_bf8:
4758 case Intrinsic::amdgcn_cvt_scalef32_pk_f16_fp8:
4759 case Intrinsic::amdgcn_cvt_scalef32_pk_bf16_fp8:
4760 case Intrinsic::amdgcn_cvt_scalef32_pk_fp4_f16:
4761 case Intrinsic::amdgcn_cvt_scalef32_pk_fp4_bf16:
4762 case Intrinsic::amdgcn_cvt_scalef32_sr_pk_fp4_f16:
4763 case Intrinsic::amdgcn_cvt_scalef32_sr_pk_fp4_bf16:
4764 case Intrinsic::amdgcn_cvt_scalef32_sr_pk_fp4_f32:
4765 case Intrinsic::amdgcn_cvt_scalef32_sr_pk32_bf6_bf16:
4766 case Intrinsic::amdgcn_cvt_scalef32_sr_pk32_bf6_f16:
4767 case Intrinsic::amdgcn_cvt_scalef32_sr_pk32_bf6_f32:
4768 case Intrinsic::amdgcn_cvt_scalef32_sr_pk32_fp6_bf16:
4769 case Intrinsic::amdgcn_cvt_scalef32_sr_pk32_fp6_f16:
4770 case Intrinsic::amdgcn_cvt_scalef32_sr_pk32_fp6_f32:
4771 case Intrinsic::amdgcn_cvt_scalef32_sr_bf8_bf16:
4772 case Intrinsic::amdgcn_cvt_scalef32_sr_bf8_f16:
4773 case Intrinsic::amdgcn_cvt_scalef32_sr_bf8_f32:
4774 case Intrinsic::amdgcn_cvt_scalef32_sr_fp8_bf16:
4775 case Intrinsic::amdgcn_cvt_scalef32_sr_fp8_f16:
4776 case Intrinsic::amdgcn_cvt_scalef32_sr_fp8_f32:
4777 case Intrinsic::amdgcn_ashr_pk_i8_i32:
4778 case Intrinsic::amdgcn_ashr_pk_u8_i32:
4779 case Intrinsic::amdgcn_cvt_scalef32_2xpk16_fp6_f32:
4780 case Intrinsic::amdgcn_cvt_scalef32_2xpk16_bf6_f32:
4781 case Intrinsic::amdgcn_wmma_bf16_16x16x16_bf16:
4782 case Intrinsic::amdgcn_wmma_f16_16x16x16_f16:
4783 case Intrinsic::amdgcn_wmma_bf16_16x16x16_bf16_tied:
4784 case Intrinsic::amdgcn_wmma_f16_16x16x16_f16_tied:
4785 case Intrinsic::amdgcn_wmma_f32_16x16x16_bf16:
4786 case Intrinsic::amdgcn_wmma_f32_16x16x16_f16:
4787 case Intrinsic::amdgcn_wmma_i32_16x16x16_iu4:
4788 case Intrinsic::amdgcn_wmma_i32_16x16x16_iu8:
4789 case Intrinsic::amdgcn_wmma_f32_16x16x16_fp8_fp8:
4790 case Intrinsic::amdgcn_wmma_f32_16x16x16_fp8_bf8:
4791 case Intrinsic::amdgcn_wmma_f32_16x16x16_bf8_fp8:
4792 case Intrinsic::amdgcn_wmma_f32_16x16x16_bf8_bf8:
4793 case Intrinsic::amdgcn_wmma_i32_16x16x32_iu4:
4794 case Intrinsic::amdgcn_swmmac_f32_16x16x32_f16:
4795 case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf16:
4796 case Intrinsic::amdgcn_swmmac_f16_16x16x32_f16:
4797 case Intrinsic::amdgcn_swmmac_bf16_16x16x32_bf16:
4798 case Intrinsic::amdgcn_swmmac_i32_16x16x32_iu8:
4799 case Intrinsic::amdgcn_swmmac_i32_16x16x32_iu4:
4800 case Intrinsic::amdgcn_swmmac_i32_16x16x64_iu4:
4801 case Intrinsic::amdgcn_swmmac_f32_16x16x32_fp8_fp8:
4802 case Intrinsic::amdgcn_swmmac_f32_16x16x32_fp8_bf8:
4803 case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_fp8:
4804 case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_bf8:
4805 case Intrinsic::amdgcn_wmma_f32_16x16x4_f32:
4806 case Intrinsic::amdgcn_wmma_f32_16x16x32_bf16:
4807 case Intrinsic::amdgcn_wmma_f32_16x16x32_f16:
4808 case Intrinsic::amdgcn_wmma_f16_16x16x32_f16:
4809 case Intrinsic::amdgcn_wmma_bf16_16x16x32_bf16:
4810 case Intrinsic::amdgcn_wmma_bf16f32_16x16x32_bf16:
4811 case Intrinsic::amdgcn_wmma_f32_16x16x64_fp8_fp8:
4812 case Intrinsic::amdgcn_wmma_f32_16x16x64_fp8_bf8:
4813 case Intrinsic::amdgcn_wmma_f32_16x16x64_bf8_fp8:
4814 case Intrinsic::amdgcn_wmma_f32_16x16x64_bf8_bf8:
4815 case Intrinsic::amdgcn_wmma_f16_16x16x64_fp8_fp8:
4816 case Intrinsic::amdgcn_wmma_f16_16x16x64_fp8_bf8:
4817 case Intrinsic::amdgcn_wmma_f16_16x16x64_bf8_fp8:
4818 case Intrinsic::amdgcn_wmma_f16_16x16x64_bf8_bf8:
4819 case Intrinsic::amdgcn_wmma_f16_16x16x128_fp8_fp8:
4820 case Intrinsic::amdgcn_wmma_f16_16x16x128_fp8_bf8:
4821 case Intrinsic::amdgcn_wmma_f16_16x16x128_bf8_fp8:
4822 case Intrinsic::amdgcn_wmma_f16_16x16x128_bf8_bf8:
4823 case Intrinsic::amdgcn_wmma_f32_16x16x128_fp8_fp8:
4824 case Intrinsic::amdgcn_wmma_f32_16x16x128_fp8_bf8:
4825 case Intrinsic::amdgcn_wmma_f32_16x16x128_bf8_fp8:
4826 case Intrinsic::amdgcn_wmma_f32_16x16x128_bf8_bf8:
4827 case Intrinsic::amdgcn_wmma_i32_16x16x64_iu8:
4828 case Intrinsic::amdgcn_wmma_f32_16x16x128_f8f6f4:
4829 case Intrinsic::amdgcn_wmma_scale_f32_16x16x128_f8f6f4:
4830 case Intrinsic::amdgcn_wmma_scale16_f32_16x16x128_f8f6f4:
4831 case Intrinsic::amdgcn_wmma_f32_32x16x128_f4:
4832 case Intrinsic::amdgcn_wmma_scale_f32_32x16x128_f4:
4833 case Intrinsic::amdgcn_wmma_scale16_f32_32x16x128_f4:
4834 case Intrinsic::amdgcn_swmmac_f16_16x16x64_f16:
4835 case Intrinsic::amdgcn_swmmac_bf16_16x16x64_bf16:
4836 case Intrinsic::amdgcn_swmmac_f32_16x16x64_bf16:
4837 case Intrinsic::amdgcn_swmmac_bf16f32_16x16x64_bf16:
4838 case Intrinsic::amdgcn_swmmac_f32_16x16x64_f16:
4839 case Intrinsic::amdgcn_swmmac_f32_16x16x128_fp8_fp8:
4840 case Intrinsic::amdgcn_swmmac_f32_16x16x128_fp8_bf8:
4841 case Intrinsic::amdgcn_swmmac_f32_16x16x128_bf8_fp8:
4842 case Intrinsic::amdgcn_swmmac_f32_16x16x128_bf8_bf8:
4843 case Intrinsic::amdgcn_swmmac_f16_16x16x128_fp8_fp8:
4844 case Intrinsic::amdgcn_swmmac_f16_16x16x128_fp8_bf8:
4845 case Intrinsic::amdgcn_swmmac_f16_16x16x128_bf8_fp8:
4846 case Intrinsic::amdgcn_swmmac_f16_16x16x128_bf8_bf8:
4847 case Intrinsic::amdgcn_swmmac_i32_16x16x128_iu8:
4848 case Intrinsic::amdgcn_perm_pk16_b4_u4:
4849 case Intrinsic::amdgcn_perm_pk16_b6_u4:
4850 case Intrinsic::amdgcn_perm_pk16_b8_u4:
4851 case Intrinsic::amdgcn_add_max_i32:
4852 case Intrinsic::amdgcn_add_max_u32:
4853 case Intrinsic::amdgcn_add_min_i32:
4854 case Intrinsic::amdgcn_add_min_u32:
4855 case Intrinsic::amdgcn_pk_add_max_i16:
4856 case Intrinsic::amdgcn_pk_add_max_u16:
4857 case Intrinsic::amdgcn_pk_add_min_i16:
4858 case Intrinsic::amdgcn_pk_add_min_u16:
4859 return getDefaultMappingVOP(MI);
4860 case Intrinsic::amdgcn_log:
4861 case Intrinsic::amdgcn_exp2:
4862 case Intrinsic::amdgcn_rcp:
4863 case Intrinsic::amdgcn_rsq:
4864 case Intrinsic::amdgcn_sqrt: {
4865 unsigned Size = MRI.getType(Reg: MI.getOperand(i: 0).getReg()).getSizeInBits();
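// Targets with pseudo-scalar transcendental instructions (e.g. v_s_rcp_f32)
// can presumably keep a fully uniform 16/32-bit operation on the scalar
// mapping; otherwise fall back to the usual VALU mapping.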
4866 if (Subtarget.hasPseudoScalarTrans() && (Size == 16 || Size == 32) &&
4867 isSALUMapping(MI))
4868 return getDefaultMappingSOP(MI);
4869 return getDefaultMappingVOP(MI);
4870 }
4871 case Intrinsic::amdgcn_sbfe:
4872 case Intrinsic::amdgcn_ubfe:
4873 if (isSALUMapping(MI))
4874 return getDefaultMappingSOP(MI);
4875 return getDefaultMappingVOP(MI);
4876 case Intrinsic::amdgcn_ds_swizzle:
4877 case Intrinsic::amdgcn_ds_permute:
4878 case Intrinsic::amdgcn_ds_bpermute:
4879 case Intrinsic::amdgcn_update_dpp:
4880 case Intrinsic::amdgcn_mov_dpp8:
4881 case Intrinsic::amdgcn_mov_dpp:
4882 case Intrinsic::amdgcn_strict_wwm:
4883 case Intrinsic::amdgcn_wwm:
4884 case Intrinsic::amdgcn_strict_wqm:
4885 case Intrinsic::amdgcn_wqm:
4886 case Intrinsic::amdgcn_softwqm:
4887 case Intrinsic::amdgcn_set_inactive:
4888 case Intrinsic::amdgcn_set_inactive_chain_arg:
4889 case Intrinsic::amdgcn_permlane64:
4890 case Intrinsic::amdgcn_ds_bpermute_fi_b32:
4891 return getDefaultMappingAllVGPR(MI);
4892 case Intrinsic::amdgcn_cvt_pkrtz:
4893 if (Subtarget.hasSALUFloatInsts() && isSALUMapping(MI))
4894 return getDefaultMappingSOP(MI);
4895 return getDefaultMappingVOP(MI);
4896 case Intrinsic::amdgcn_kernarg_segment_ptr:
4897 case Intrinsic::amdgcn_s_getpc:
4898 case Intrinsic::amdgcn_groupstaticsize:
4899 case Intrinsic::amdgcn_reloc_constant:
4900 case Intrinsic::returnaddress: {
4901 unsigned Size = MRI.getType(Reg: MI.getOperand(i: 0).getReg()).getSizeInBits();
4902 OpdsMapping[0] = AMDGPU::getValueMapping(BankID: AMDGPU::SGPRRegBankID, Size);
4903 break;
4904 }
4905 case Intrinsic::amdgcn_wqm_vote: {
4906 unsigned Size = MRI.getType(Reg: MI.getOperand(i: 0).getReg()).getSizeInBits();
4907 OpdsMapping[0] = OpdsMapping[2]
4908 = AMDGPU::getValueMapping(BankID: AMDGPU::VCCRegBankID, Size);
4909 break;
4910 }
4911 case Intrinsic::amdgcn_ps_live: {
4912 OpdsMapping[0] = AMDGPU::getValueMapping(BankID: AMDGPU::VCCRegBankID, Size: 1);
4913 break;
4914 }
4915 case Intrinsic::amdgcn_div_scale: {
4916 unsigned Dst0Size = MRI.getType(Reg: MI.getOperand(i: 0).getReg()).getSizeInBits();
4917 unsigned Dst1Size = MRI.getType(Reg: MI.getOperand(i: 1).getReg()).getSizeInBits();
4918 OpdsMapping[0] = AMDGPU::getValueMapping(BankID: AMDGPU::VGPRRegBankID, Size: Dst0Size);
4919 OpdsMapping[1] = AMDGPU::getValueMapping(BankID: AMDGPU::VCCRegBankID, Size: Dst1Size);
4920
4921 unsigned SrcSize = MRI.getType(Reg: MI.getOperand(i: 3).getReg()).getSizeInBits();
4922 OpdsMapping[3] = AMDGPU::getValueMapping(BankID: AMDGPU::VGPRRegBankID, Size: SrcSize);
4923 OpdsMapping[4] = AMDGPU::getValueMapping(BankID: AMDGPU::VGPRRegBankID, Size: SrcSize);
4924 break;
4925 }
4926 case Intrinsic::amdgcn_class: {
4927 Register Src0Reg = MI.getOperand(i: 2).getReg();
4928 Register Src1Reg = MI.getOperand(i: 3).getReg();
4929 unsigned Src0Size = MRI.getType(Reg: Src0Reg).getSizeInBits();
4930 unsigned Src1Size = MRI.getType(Reg: Src1Reg).getSizeInBits();
4931 unsigned DstSize = MRI.getType(Reg: MI.getOperand(i: 0).getReg()).getSizeInBits();
4932 OpdsMapping[0] = AMDGPU::getValueMapping(BankID: AMDGPU::VCCRegBankID, Size: DstSize);
4933 OpdsMapping[2] = AMDGPU::getValueMapping(BankID: AMDGPU::VGPRRegBankID, Size: Src0Size);
4934 OpdsMapping[3] = AMDGPU::getValueMapping(BankID: AMDGPU::VGPRRegBankID, Size: Src1Size);
4935 break;
4936 }
4937 case Intrinsic::amdgcn_icmp:
4938 case Intrinsic::amdgcn_fcmp: {
4939 unsigned DstSize = MRI.getType(Reg: MI.getOperand(i: 0).getReg()).getSizeInBits();
4940 // This is not VCCRegBank because this is not used in boolean contexts.
4941 OpdsMapping[0] = AMDGPU::getValueMapping(BankID: AMDGPU::SGPRRegBankID, Size: DstSize);
4942 unsigned OpSize = MRI.getType(Reg: MI.getOperand(i: 2).getReg()).getSizeInBits();
4943 OpdsMapping[2] = AMDGPU::getValueMapping(BankID: AMDGPU::VGPRRegBankID, Size: OpSize);
4944 OpdsMapping[3] = AMDGPU::getValueMapping(BankID: AMDGPU::VGPRRegBankID, Size: OpSize);
4945 break;
4946 }
4947 case Intrinsic::amdgcn_readlane: {
4948 // This must be an SGPR, but accept a VGPR.
4949 Register IdxReg = MI.getOperand(i: 3).getReg();
4950 unsigned IdxSize = MRI.getType(Reg: IdxReg).getSizeInBits();
4951 unsigned IdxBank = getRegBankID(Reg: IdxReg, MRI, Default: AMDGPU::SGPRRegBankID);
4952 OpdsMapping[3] = AMDGPU::getValueMapping(BankID: IdxBank, Size: IdxSize);
4953 [[fallthrough]];
4954 }
4955 case Intrinsic::amdgcn_readfirstlane: {
4956 unsigned DstSize = MRI.getType(Reg: MI.getOperand(i: 0).getReg()).getSizeInBits();
4957 unsigned SrcSize = MRI.getType(Reg: MI.getOperand(i: 2).getReg()).getSizeInBits();
4958 OpdsMapping[0] = AMDGPU::getValueMapping(BankID: AMDGPU::SGPRRegBankID, Size: DstSize);
4959 OpdsMapping[2] = AMDGPU::getValueMapping(BankID: AMDGPU::VGPRRegBankID, Size: SrcSize);
4960 break;
4961 }
4962 case Intrinsic::amdgcn_writelane: {
4963 unsigned DstSize = MRI.getType(Reg: MI.getOperand(i: 0).getReg()).getSizeInBits();
4964 Register SrcReg = MI.getOperand(i: 2).getReg();
4965 unsigned SrcSize = MRI.getType(Reg: SrcReg).getSizeInBits();
4966 unsigned SrcBank = getRegBankID(Reg: SrcReg, MRI, Default: AMDGPU::SGPRRegBankID);
4967 Register IdxReg = MI.getOperand(i: 3).getReg();
4968 unsigned IdxSize = MRI.getType(Reg: IdxReg).getSizeInBits();
4969 unsigned IdxBank = getRegBankID(Reg: IdxReg, MRI, Default: AMDGPU::SGPRRegBankID);
4970 OpdsMapping[0] = AMDGPU::getValueMapping(BankID: AMDGPU::VGPRRegBankID, Size: DstSize);
4971
4972 // These two must be SGPRs, but accept VGPRs. Readfirstlane will be inserted
4973 // to legalize them.
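// (E.g. a VGPR lane index here is expected to be legalized by copying it to
// an SGPR with v_readfirstlane_b32, which is valid because these operands
// are required to be uniform.)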
4974 OpdsMapping[2] = AMDGPU::getValueMapping(BankID: SrcBank, Size: SrcSize);
4975 OpdsMapping[3] = AMDGPU::getValueMapping(BankID: IdxBank, Size: IdxSize);
4976 OpdsMapping[4] = AMDGPU::getValueMapping(BankID: AMDGPU::VGPRRegBankID, Size: SrcSize);
4977 break;
4978 }
4979 case Intrinsic::amdgcn_if_break: {
4980 unsigned Size = getSizeInBits(Reg: MI.getOperand(i: 0).getReg(), MRI, TRI: *TRI);
4981 OpdsMapping[0] = AMDGPU::getValueMapping(BankID: AMDGPU::SGPRRegBankID, Size);
4982 OpdsMapping[2] = AMDGPU::getValueMapping(BankID: AMDGPU::VCCRegBankID, Size: 1);
4983 OpdsMapping[3] = AMDGPU::getValueMapping(BankID: AMDGPU::SGPRRegBankID, Size);
4984 break;
4985 }
4986 case Intrinsic::amdgcn_permlane16:
4987 case Intrinsic::amdgcn_permlanex16: {
4988 unsigned Size = getSizeInBits(Reg: MI.getOperand(i: 0).getReg(), MRI, TRI: *TRI);
4989 OpdsMapping[0] = AMDGPU::getValueMapping(BankID: AMDGPU::VGPRRegBankID, Size);
4990 OpdsMapping[2] = AMDGPU::getValueMapping(BankID: AMDGPU::VGPRRegBankID, Size);
4991 OpdsMapping[3] = AMDGPU::getValueMapping(BankID: AMDGPU::VGPRRegBankID, Size);
4992 OpdsMapping[4] = getSGPROpMapping(Reg: MI.getOperand(i: 3).getReg(), MRI, TRI: *TRI);
4993 OpdsMapping[5] = getSGPROpMapping(Reg: MI.getOperand(i: 4).getReg(), MRI, TRI: *TRI);
4994 break;
4995 }
4996 case Intrinsic::amdgcn_permlane_bcast:
4997 case Intrinsic::amdgcn_permlane_up:
4998 case Intrinsic::amdgcn_permlane_down:
4999 case Intrinsic::amdgcn_permlane_xor: {
5000 unsigned Size = getSizeInBits(Reg: MI.getOperand(i: 0).getReg(), MRI, TRI: *TRI);
5001 OpdsMapping[0] = AMDGPU::getValueMapping(BankID: AMDGPU::VGPRRegBankID, Size);
5002 OpdsMapping[2] = AMDGPU::getValueMapping(BankID: AMDGPU::VGPRRegBankID, Size);
5003 OpdsMapping[3] = getSGPROpMapping(Reg: MI.getOperand(i: 2).getReg(), MRI, TRI: *TRI);
5004 OpdsMapping[4] = getSGPROpMapping(Reg: MI.getOperand(i: 3).getReg(), MRI, TRI: *TRI);
5005 break;
5006 }
5007 case Intrinsic::amdgcn_permlane_idx_gen: {
5008 unsigned Size = getSizeInBits(Reg: MI.getOperand(i: 0).getReg(), MRI, TRI: *TRI);
5009 OpdsMapping[0] = AMDGPU::getValueMapping(BankID: AMDGPU::VGPRRegBankID, Size);
5010 OpdsMapping[2] = AMDGPU::getValueMapping(BankID: AMDGPU::VGPRRegBankID, Size);
5011 OpdsMapping[3] = getSGPROpMapping(Reg: MI.getOperand(i: 2).getReg(), MRI, TRI: *TRI);
5012 break;
5013 }
5014 case Intrinsic::amdgcn_permlane16_var:
5015 case Intrinsic::amdgcn_permlanex16_var: {
5016 unsigned Size = getSizeInBits(Reg: MI.getOperand(i: 0).getReg(), MRI, TRI: *TRI);
5017 OpdsMapping[0] = AMDGPU::getValueMapping(BankID: AMDGPU::VGPRRegBankID, Size);
5018 OpdsMapping[2] = AMDGPU::getValueMapping(BankID: AMDGPU::VGPRRegBankID, Size);
5019 OpdsMapping[3] = AMDGPU::getValueMapping(BankID: AMDGPU::VGPRRegBankID, Size);
5020 OpdsMapping[4] = AMDGPU::getValueMapping(BankID: AMDGPU::VGPRRegBankID, Size);
5021 break;
5022 }
5023 case Intrinsic::amdgcn_mfma_f32_4x4x1f32:
5024 case Intrinsic::amdgcn_mfma_f32_4x4x4f16:
5025 case Intrinsic::amdgcn_mfma_i32_4x4x4i8:
5026 case Intrinsic::amdgcn_mfma_f32_4x4x2bf16:
5027 case Intrinsic::amdgcn_mfma_f32_16x16x1f32:
5028 case Intrinsic::amdgcn_mfma_f32_16x16x4f32:
5029 case Intrinsic::amdgcn_mfma_f32_16x16x4f16:
5030 case Intrinsic::amdgcn_mfma_f32_16x16x16f16:
5031 case Intrinsic::amdgcn_mfma_i32_16x16x4i8:
5032 case Intrinsic::amdgcn_mfma_i32_16x16x16i8:
5033 case Intrinsic::amdgcn_mfma_f32_16x16x2bf16:
5034 case Intrinsic::amdgcn_mfma_f32_16x16x8bf16:
5035 case Intrinsic::amdgcn_mfma_f32_32x32x1f32:
5036 case Intrinsic::amdgcn_mfma_f32_32x32x2f32:
5037 case Intrinsic::amdgcn_mfma_f32_32x32x4f16:
5038 case Intrinsic::amdgcn_mfma_f32_32x32x8f16:
5039 case Intrinsic::amdgcn_mfma_i32_32x32x4i8:
5040 case Intrinsic::amdgcn_mfma_i32_32x32x8i8:
5041 case Intrinsic::amdgcn_mfma_f32_32x32x2bf16:
5042 case Intrinsic::amdgcn_mfma_f32_32x32x4bf16:
5043 case Intrinsic::amdgcn_mfma_f32_32x32x4bf16_1k:
5044 case Intrinsic::amdgcn_mfma_f32_16x16x4bf16_1k:
5045 case Intrinsic::amdgcn_mfma_f32_4x4x4bf16_1k:
5046 case Intrinsic::amdgcn_mfma_f32_32x32x8bf16_1k:
5047 case Intrinsic::amdgcn_mfma_f32_16x16x16bf16_1k:
5048 case Intrinsic::amdgcn_mfma_f64_16x16x4f64:
5049 case Intrinsic::amdgcn_mfma_f64_4x4x4f64:
5050 case Intrinsic::amdgcn_mfma_i32_16x16x32_i8:
5051 case Intrinsic::amdgcn_mfma_i32_32x32x16_i8:
5052 case Intrinsic::amdgcn_mfma_f32_16x16x8_xf32:
5053 case Intrinsic::amdgcn_mfma_f32_32x32x4_xf32:
5054 case Intrinsic::amdgcn_mfma_f32_16x16x32_bf8_bf8:
5055 case Intrinsic::amdgcn_mfma_f32_16x16x32_bf8_fp8:
5056 case Intrinsic::amdgcn_mfma_f32_16x16x32_fp8_bf8:
5057 case Intrinsic::amdgcn_mfma_f32_16x16x32_fp8_fp8:
5058 case Intrinsic::amdgcn_mfma_f32_32x32x16_bf8_bf8:
5059 case Intrinsic::amdgcn_mfma_f32_32x32x16_bf8_fp8:
5060 case Intrinsic::amdgcn_mfma_f32_32x32x16_fp8_bf8:
5061 case Intrinsic::amdgcn_mfma_f32_32x32x16_fp8_fp8:
5062 case Intrinsic::amdgcn_mfma_f32_16x16x32_f16:
5063 case Intrinsic::amdgcn_mfma_f32_32x32x16_f16:
5064 case Intrinsic::amdgcn_mfma_i32_16x16x64_i8:
5065 case Intrinsic::amdgcn_mfma_i32_32x32x32_i8:
5066 case Intrinsic::amdgcn_mfma_f32_16x16x32_bf16: {
5067 unsigned DstSize = MRI.getType(Reg: MI.getOperand(i: 0).getReg()).getSizeInBits();
5068 unsigned MinNumRegsRequired = DstSize / 32;
5069
5070 // Default for MAI intrinsics.
5071 // srcC can also be an immediate which can be folded later.
5072 // FIXME: Should we eventually add an alternative mapping with AGPR src
5073 // for srcA/srcB?
5074 //
5075 // vdst, srcA, srcB, srcC
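//
// Assumed heuristic: on subtargets without the VGPR-form MFMAs (pre-gfx90a)
// the accumulator operands must live in AGPRs; otherwise selectAGPRFormMFMA
// decides, presumably based on expected AGPR/VGPR register pressure.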
5076 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
5077
5078 bool UseAGPRForm = !Subtarget.hasGFX90AInsts() ||
5079 Info->selectAGPRFormMFMA(NumRegs: MinNumRegsRequired);
5080
5081 OpdsMapping[0] =
5082 UseAGPRForm ? getAGPROpMapping(Reg: MI.getOperand(i: 0).getReg(), MRI, TRI: *TRI)
5083 : getVGPROpMapping(Reg: MI.getOperand(i: 0).getReg(), MRI, TRI: *TRI);
5084 OpdsMapping[2] = getVGPROpMapping(Reg: MI.getOperand(i: 2).getReg(), MRI, TRI: *TRI);
5085 OpdsMapping[3] = getVGPROpMapping(Reg: MI.getOperand(i: 3).getReg(), MRI, TRI: *TRI);
5086 OpdsMapping[4] =
5087 UseAGPRForm ? getAGPROpMapping(Reg: MI.getOperand(i: 4).getReg(), MRI, TRI: *TRI)
5088 : getVGPROpMapping(Reg: MI.getOperand(i: 4).getReg(), MRI, TRI: *TRI);
5089 break;
5090 }
5091 case Intrinsic::amdgcn_mfma_scale_f32_16x16x128_f8f6f4:
5092 case Intrinsic::amdgcn_mfma_scale_f32_32x32x64_f8f6f4: {
5093 unsigned DstSize = MRI.getType(Reg: MI.getOperand(i: 0).getReg()).getSizeInBits();
5094 unsigned MinNumRegsRequired = DstSize / 32;
5095
5096 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
5097 bool UseAGPRForm = Info->selectAGPRFormMFMA(NumRegs: MinNumRegsRequired);
5098
5099 OpdsMapping[0] =
5100 UseAGPRForm ? getAGPROpMapping(Reg: MI.getOperand(i: 0).getReg(), MRI, TRI: *TRI)
5101 : getVGPROpMapping(Reg: MI.getOperand(i: 0).getReg(), MRI, TRI: *TRI);
5102
5103 OpdsMapping[2] = getVGPROpMapping(Reg: MI.getOperand(i: 2).getReg(), MRI, TRI: *TRI);
5104 OpdsMapping[3] = getVGPROpMapping(Reg: MI.getOperand(i: 3).getReg(), MRI, TRI: *TRI);
5105 OpdsMapping[4] =
5106 UseAGPRForm ? getAGPROpMapping(Reg: MI.getOperand(i: 4).getReg(), MRI, TRI: *TRI)
5107 : getVGPROpMapping(Reg: MI.getOperand(i: 4).getReg(), MRI, TRI: *TRI);
5108
5109 OpdsMapping[8] = getVGPROpMapping(Reg: MI.getOperand(i: 8).getReg(), MRI, TRI: *TRI);
5110 OpdsMapping[10] = getVGPROpMapping(Reg: MI.getOperand(i: 10).getReg(), MRI, TRI: *TRI);
5111 break;
5112 }
5113 case Intrinsic::amdgcn_smfmac_f32_16x16x32_f16:
5114 case Intrinsic::amdgcn_smfmac_f32_32x32x16_f16:
5115 case Intrinsic::amdgcn_smfmac_f32_16x16x32_bf16:
5116 case Intrinsic::amdgcn_smfmac_f32_32x32x16_bf16:
5117 case Intrinsic::amdgcn_smfmac_i32_16x16x64_i8:
5118 case Intrinsic::amdgcn_smfmac_i32_32x32x32_i8:
5119 case Intrinsic::amdgcn_smfmac_f32_16x16x64_bf8_bf8:
5120 case Intrinsic::amdgcn_smfmac_f32_16x16x64_bf8_fp8:
5121 case Intrinsic::amdgcn_smfmac_f32_16x16x64_fp8_bf8:
5122 case Intrinsic::amdgcn_smfmac_f32_16x16x64_fp8_fp8:
5123 case Intrinsic::amdgcn_smfmac_f32_32x32x32_bf8_bf8:
5124 case Intrinsic::amdgcn_smfmac_f32_32x32x32_bf8_fp8:
5125 case Intrinsic::amdgcn_smfmac_f32_32x32x32_fp8_bf8:
5126 case Intrinsic::amdgcn_smfmac_f32_32x32x32_fp8_fp8:
5127 case Intrinsic::amdgcn_smfmac_f32_16x16x64_f16:
5128 case Intrinsic::amdgcn_smfmac_f32_32x32x32_f16:
5129 case Intrinsic::amdgcn_smfmac_f32_16x16x64_bf16:
5130 case Intrinsic::amdgcn_smfmac_f32_32x32x32_bf16:
5131 case Intrinsic::amdgcn_smfmac_i32_16x16x128_i8:
5132 case Intrinsic::amdgcn_smfmac_i32_32x32x64_i8:
5133 case Intrinsic::amdgcn_smfmac_f32_16x16x128_bf8_bf8:
5134 case Intrinsic::amdgcn_smfmac_f32_16x16x128_bf8_fp8:
5135 case Intrinsic::amdgcn_smfmac_f32_16x16x128_fp8_bf8:
5136 case Intrinsic::amdgcn_smfmac_f32_16x16x128_fp8_fp8:
5137 case Intrinsic::amdgcn_smfmac_f32_32x32x64_bf8_bf8:
5138 case Intrinsic::amdgcn_smfmac_f32_32x32x64_bf8_fp8:
5139 case Intrinsic::amdgcn_smfmac_f32_32x32x64_fp8_bf8:
5140 case Intrinsic::amdgcn_smfmac_f32_32x32x64_fp8_fp8: {
5141 Register DstReg = MI.getOperand(i: 0).getReg();
5142 unsigned DstSize = MRI.getType(Reg: DstReg).getSizeInBits();
5143 unsigned MinNumRegsRequired = DstSize / 32;
5144 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
5145 bool UseAGPRForm = Info->selectAGPRFormMFMA(NumRegs: MinNumRegsRequired);
5146
5147 // vdst, srcA, srcB, srcC, idx
5148 OpdsMapping[0] = UseAGPRForm ? getAGPROpMapping(Reg: DstReg, MRI, TRI: *TRI)
5149 : getVGPROpMapping(Reg: DstReg, MRI, TRI: *TRI);
5150
5151 OpdsMapping[2] = getVGPROpMapping(Reg: MI.getOperand(i: 2).getReg(), MRI, TRI: *TRI);
5152 OpdsMapping[3] = getVGPROpMapping(Reg: MI.getOperand(i: 3).getReg(), MRI, TRI: *TRI);
5153 OpdsMapping[4] =
5154 UseAGPRForm ? getAGPROpMapping(Reg: MI.getOperand(i: 4).getReg(), MRI, TRI: *TRI)
5155 : getVGPROpMapping(Reg: MI.getOperand(i: 4).getReg(), MRI, TRI: *TRI);
5156 OpdsMapping[5] = getVGPROpMapping(Reg: MI.getOperand(i: 5).getReg(), MRI, TRI: *TRI);
5157 break;
5158 }
5159 case Intrinsic::amdgcn_interp_p1:
5160 case Intrinsic::amdgcn_interp_p2:
5161 case Intrinsic::amdgcn_interp_mov:
5162 case Intrinsic::amdgcn_interp_p1_f16:
5163 case Intrinsic::amdgcn_interp_p2_f16:
5164 case Intrinsic::amdgcn_lds_param_load: {
5165 const int M0Idx = MI.getNumOperands() - 1;
5166 Register M0Reg = MI.getOperand(i: M0Idx).getReg();
5167 unsigned M0Bank = getRegBankID(Reg: M0Reg, MRI, Default: AMDGPU::SGPRRegBankID);
5168 unsigned DstSize = MRI.getType(Reg: MI.getOperand(i: 0).getReg()).getSizeInBits();
5169
5170 OpdsMapping[0] = AMDGPU::getValueMapping(BankID: AMDGPU::VGPRRegBankID, Size: DstSize);
5171 for (int I = 2; I != M0Idx && MI.getOperand(i: I).isReg(); ++I)
5172 OpdsMapping[I] = AMDGPU::getValueMapping(BankID: AMDGPU::VGPRRegBankID, Size: 32);
5173
5174 // This must be an SGPR, but take whatever the original bank is and fix it
5175 // later.
5176 OpdsMapping[M0Idx] = AMDGPU::getValueMapping(BankID: M0Bank, Size: 32);
5177 break;
5178 }
5179 case Intrinsic::amdgcn_interp_inreg_p10:
5180 case Intrinsic::amdgcn_interp_inreg_p2:
5181 case Intrinsic::amdgcn_interp_inreg_p10_f16:
5182 case Intrinsic::amdgcn_interp_inreg_p2_f16:
5183 case Intrinsic::amdgcn_interp_p10_rtz_f16:
5184 case Intrinsic::amdgcn_interp_p2_rtz_f16: {
5185 unsigned DstSize = MRI.getType(Reg: MI.getOperand(i: 0).getReg()).getSizeInBits();
5186 OpdsMapping[0] = AMDGPU::getValueMapping(BankID: AMDGPU::VGPRRegBankID, Size: DstSize);
5187 OpdsMapping[2] = AMDGPU::getValueMapping(BankID: AMDGPU::VGPRRegBankID, Size: 32);
5188 OpdsMapping[3] = AMDGPU::getValueMapping(BankID: AMDGPU::VGPRRegBankID, Size: 32);
5189 OpdsMapping[4] = AMDGPU::getValueMapping(BankID: AMDGPU::VGPRRegBankID, Size: 32);
5190 break;
5191 }
5192 case Intrinsic::amdgcn_permlane16_swap:
5193 case Intrinsic::amdgcn_permlane32_swap: {
5194 unsigned DstSize = MRI.getType(Reg: MI.getOperand(i: 0).getReg()).getSizeInBits();
5195 OpdsMapping[0] = OpdsMapping[1] = OpdsMapping[3] = OpdsMapping[4] =
5196 AMDGPU::getValueMapping(BankID: AMDGPU::VGPRRegBankID, Size: DstSize);
5197 break;
5198 }
5199 case Intrinsic::amdgcn_ballot: {
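// The ballot result is the raw lane-mask value (wave32/wave64 wide) rather
// than a boolean used in a boolean context, so it gets the SGPR bank instead
// of VCC; the condition input itself is a VCC-bank boolean.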
5200 unsigned DstSize = MRI.getType(Reg: MI.getOperand(i: 0).getReg()).getSizeInBits();
5201 unsigned SrcSize = MRI.getType(Reg: MI.getOperand(i: 2).getReg()).getSizeInBits();
5202 OpdsMapping[0] = AMDGPU::getValueMapping(BankID: AMDGPU::SGPRRegBankID, Size: DstSize);
5203 OpdsMapping[2] = AMDGPU::getValueMapping(BankID: AMDGPU::VCCRegBankID, Size: SrcSize);
5204 break;
5205 }
5206 case Intrinsic::amdgcn_inverse_ballot: {
5207 // This must be an SGPR, but accept a VGPR.
5208 Register MaskReg = MI.getOperand(i: 2).getReg();
5209 unsigned MaskSize = MRI.getType(Reg: MaskReg).getSizeInBits();
5210 unsigned MaskBank = getRegBankID(Reg: MaskReg, MRI, Default: AMDGPU::SGPRRegBankID);
5211 OpdsMapping[0] = AMDGPU::getValueMapping(BankID: AMDGPU::VCCRegBankID, Size: 1);
5212 OpdsMapping[2] = AMDGPU::getValueMapping(BankID: MaskBank, Size: MaskSize);
5213 break;
5214 }
5215 case Intrinsic::amdgcn_bitop3: {
5216 unsigned Size = getSizeInBits(Reg: MI.getOperand(i: 0).getReg(), MRI, TRI: *TRI);
5217 OpdsMapping[0] = AMDGPU::getValueMapping(BankID: AMDGPU::VGPRRegBankID, Size);
5218 OpdsMapping[2] = AMDGPU::getValueMapping(BankID: AMDGPU::VGPRRegBankID, Size);
5219 OpdsMapping[3] = AMDGPU::getValueMapping(BankID: AMDGPU::VGPRRegBankID, Size);
5220 OpdsMapping[4] = AMDGPU::getValueMapping(BankID: AMDGPU::VGPRRegBankID, Size);
5221 break;
5222 }
5223 case Intrinsic::amdgcn_s_quadmask:
5224 case Intrinsic::amdgcn_s_wqm: {
5225 Register MaskReg = MI.getOperand(i: 2).getReg();
5226 unsigned MaskSize = MRI.getType(Reg: MaskReg).getSizeInBits();
5227 unsigned MaskBank = getRegBankID(Reg: MaskReg, MRI, Default: AMDGPU::SGPRRegBankID);
5228 OpdsMapping[0] = AMDGPU::getValueMapping(BankID: AMDGPU::SGPRRegBankID, Size: MaskSize);
5229 OpdsMapping[2] = AMDGPU::getValueMapping(BankID: MaskBank, Size: MaskSize);
5230 break;
5231 }
5232 case Intrinsic::amdgcn_wave_reduce_add:
5233 case Intrinsic::amdgcn_wave_reduce_fadd:
5234 case Intrinsic::amdgcn_wave_reduce_sub:
5235 case Intrinsic::amdgcn_wave_reduce_fsub:
5236 case Intrinsic::amdgcn_wave_reduce_min:
5237 case Intrinsic::amdgcn_wave_reduce_umin:
5238 case Intrinsic::amdgcn_wave_reduce_fmin:
5239 case Intrinsic::amdgcn_wave_reduce_max:
5240 case Intrinsic::amdgcn_wave_reduce_umax:
5241 case Intrinsic::amdgcn_wave_reduce_fmax:
5242 case Intrinsic::amdgcn_wave_reduce_and:
5243 case Intrinsic::amdgcn_wave_reduce_or:
5244 case Intrinsic::amdgcn_wave_reduce_xor: {
5245 unsigned DstSize = MRI.getType(Reg: MI.getOperand(i: 0).getReg()).getSizeInBits();
5246 OpdsMapping[0] = AMDGPU::getValueMapping(BankID: AMDGPU::SGPRRegBankID, Size: DstSize);
5247 unsigned OpSize = MRI.getType(Reg: MI.getOperand(i: 2).getReg()).getSizeInBits();
5248 auto regBankID =
5249 isSALUMapping(MI) ? AMDGPU::SGPRRegBankID : AMDGPU::VGPRRegBankID;
5250 OpdsMapping[2] = AMDGPU::getValueMapping(BankID: regBankID, Size: OpSize);
5251 break;
5252 }
5253 case Intrinsic::amdgcn_s_bitreplicate: {
5254 Register MaskReg = MI.getOperand(i: 2).getReg();
5255 unsigned MaskBank = getRegBankID(Reg: MaskReg, MRI, Default: AMDGPU::SGPRRegBankID);
5256 OpdsMapping[0] = AMDGPU::getValueMapping(BankID: AMDGPU::SGPRRegBankID, Size: 64);
5257 OpdsMapping[2] = AMDGPU::getValueMapping(BankID: MaskBank, Size: 32);
5258 break;
5259 }
5260 case Intrinsic::amdgcn_wave_shuffle: {
5261 unsigned OpSize = MRI.getType(Reg: MI.getOperand(i: 0).getReg()).getSizeInBits();
5262 OpdsMapping[0] = AMDGPU::getValueMapping(BankID: AMDGPU::VGPRRegBankID, Size: OpSize);
5263 OpdsMapping[2] = AMDGPU::getValueMapping(BankID: AMDGPU::VGPRRegBankID, Size: OpSize);
5264 OpdsMapping[3] = AMDGPU::getValueMapping(BankID: AMDGPU::VGPRRegBankID, Size: OpSize);
5265 break;
5266 }
5267 }
5268 break;
5269 }
5270 case AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD:
5271 case AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD_D16:
5272 case AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD_NORET:
5273 case AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE:
5274 case AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE_D16: {
5275 auto IntrID = AMDGPU::getIntrinsicID(I: MI);
5276 const AMDGPU::RsrcIntrinsic *RSrcIntrin = AMDGPU::lookupRsrcIntrinsic(Intr: IntrID);
5277 assert(RSrcIntrin && "missing RsrcIntrinsic for image intrinsic");
5278 // Non-images can have complications from operands that allow both SGPR
5279 // and VGPR. For now it's too complicated to figure out the final opcode
5280 // to derive the register bank from the MCInstrDesc.
5281 assert(RSrcIntrin->IsImage);
5282 return getImageMapping(MRI, MI, RsrcIdx: RSrcIntrin->RsrcArg);
5283 }
5284 case AMDGPU::G_AMDGPU_BVH_INTERSECT_RAY:
5285 case AMDGPU::G_AMDGPU_BVH8_INTERSECT_RAY:
5286 case AMDGPU::G_AMDGPU_BVH_DUAL_INTERSECT_RAY: {
5287 bool IsDualOrBVH8 =
5288 MI.getOpcode() == AMDGPU::G_AMDGPU_BVH_DUAL_INTERSECT_RAY ||
5289 MI.getOpcode() == AMDGPU::G_AMDGPU_BVH8_INTERSECT_RAY;
5290 unsigned NumMods = IsDualOrBVH8 ? 0 : 1; // Has A16 modifier
5291 unsigned LastRegOpIdx = MI.getNumExplicitOperands() - 1 - NumMods;
5292 unsigned DstSize = MRI.getType(Reg: MI.getOperand(i: 0).getReg()).getSizeInBits();
5293 OpdsMapping[0] = AMDGPU::getValueMapping(BankID: AMDGPU::VGPRRegBankID, Size: DstSize);
5294 if (IsDualOrBVH8) {
5295 OpdsMapping[1] = AMDGPU::getValueMapping(
5296 BankID: AMDGPU::VGPRRegBankID,
5297 Size: MRI.getType(Reg: MI.getOperand(i: 1).getReg()).getSizeInBits());
5298 OpdsMapping[2] = AMDGPU::getValueMapping(
5299 BankID: AMDGPU::VGPRRegBankID,
5300 Size: MRI.getType(Reg: MI.getOperand(i: 2).getReg()).getSizeInBits());
5301 }
5302 OpdsMapping[LastRegOpIdx] =
5303 getSGPROpMapping(Reg: MI.getOperand(i: LastRegOpIdx).getReg(), MRI, TRI: *TRI);
5304 if (LastRegOpIdx == 3) {
5305 // Sequential form: all operands combined into VGPR256/VGPR512
5306 unsigned Size = MRI.getType(Reg: MI.getOperand(i: 2).getReg()).getSizeInBits();
5307 if (Size > 256)
5308 Size = 512;
5309 OpdsMapping[2] = AMDGPU::getValueMapping(BankID: AMDGPU::VGPRRegBankID, Size);
5310 } else {
5311 // NSA form
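// (NSA = non-sequential address: each ray/address component is passed as a
// separate VGPR operand instead of one contiguous register tuple.)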
5312 unsigned FirstSrcOpIdx = IsDualOrBVH8 ? 4 : 2;
5313 for (unsigned I = FirstSrcOpIdx; I < LastRegOpIdx; ++I) {
5314 unsigned Size = MRI.getType(Reg: MI.getOperand(i: I).getReg()).getSizeInBits();
5315 OpdsMapping[I] = AMDGPU::getValueMapping(BankID: AMDGPU::VGPRRegBankID, Size);
5316 }
5317 }
5318 break;
5319 }
5320 case AMDGPU::G_INTRINSIC_W_SIDE_EFFECTS:
5321 case AMDGPU::G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS: {
5322 auto IntrID = cast<GIntrinsic>(Val: MI).getIntrinsicID();
5323 switch (IntrID) {
5324 case Intrinsic::amdgcn_s_getreg:
5325 case Intrinsic::amdgcn_s_memtime:
5326 case Intrinsic::amdgcn_s_memrealtime:
5327 case Intrinsic::amdgcn_s_get_waveid_in_workgroup:
5328 case Intrinsic::amdgcn_s_sendmsg_rtn: {
5329 unsigned Size = MRI.getType(Reg: MI.getOperand(i: 0).getReg()).getSizeInBits();
5330 OpdsMapping[0] = AMDGPU::getValueMapping(BankID: AMDGPU::SGPRRegBankID, Size);
5331 break;
5332 }
5333 case Intrinsic::amdgcn_global_atomic_fmin_num:
5334 case Intrinsic::amdgcn_global_atomic_fmax_num:
5335 case Intrinsic::amdgcn_flat_atomic_fmin_num:
5336 case Intrinsic::amdgcn_flat_atomic_fmax_num:
5337 case Intrinsic::amdgcn_global_atomic_ordered_add_b64:
5338 case Intrinsic::amdgcn_global_load_tr_b64:
5339 case Intrinsic::amdgcn_global_load_tr_b128:
5340 case Intrinsic::amdgcn_global_load_tr4_b64:
5341 case Intrinsic::amdgcn_global_load_tr6_b96:
5342 case Intrinsic::amdgcn_ds_load_tr8_b64:
5343 case Intrinsic::amdgcn_ds_load_tr16_b128:
5344 case Intrinsic::amdgcn_ds_load_tr4_b64:
5345 case Intrinsic::amdgcn_ds_load_tr6_b96:
5346 case Intrinsic::amdgcn_flat_load_monitor_b32:
5347 case Intrinsic::amdgcn_flat_load_monitor_b64:
5348 case Intrinsic::amdgcn_flat_load_monitor_b128:
5349 case Intrinsic::amdgcn_global_load_monitor_b32:
5350 case Intrinsic::amdgcn_global_load_monitor_b64:
5351 case Intrinsic::amdgcn_global_load_monitor_b128:
5352 case Intrinsic::amdgcn_ds_read_tr4_b64:
5353 case Intrinsic::amdgcn_ds_read_tr6_b96:
5354 case Intrinsic::amdgcn_ds_read_tr8_b64:
5355 case Intrinsic::amdgcn_ds_read_tr16_b64:
5356 case Intrinsic::amdgcn_ds_atomic_async_barrier_arrive_b64:
5357 case Intrinsic::amdgcn_ds_atomic_barrier_arrive_rtn_b64:
5358 return getDefaultMappingAllVGPR(MI);
5359 case Intrinsic::amdgcn_ds_ordered_add:
5360 case Intrinsic::amdgcn_ds_ordered_swap: {
5361 unsigned DstSize = MRI.getType(Reg: MI.getOperand(i: 0).getReg()).getSizeInBits();
5362 OpdsMapping[0] = AMDGPU::getValueMapping(BankID: AMDGPU::VGPRRegBankID, Size: DstSize);
5363 unsigned M0Bank = getRegBankID(Reg: MI.getOperand(i: 2).getReg(), MRI,
5364 Default: AMDGPU::SGPRRegBankID);
5365 OpdsMapping[2] = AMDGPU::getValueMapping(BankID: M0Bank, Size: 32);
5366 OpdsMapping[3] = AMDGPU::getValueMapping(BankID: AMDGPU::VGPRRegBankID, Size: 32);
5367 break;
5368 }
5369 case Intrinsic::amdgcn_ds_append:
5370 case Intrinsic::amdgcn_ds_consume: {
5371 unsigned DstSize = MRI.getType(Reg: MI.getOperand(i: 0).getReg()).getSizeInBits();
5372 OpdsMapping[0] = AMDGPU::getValueMapping(BankID: AMDGPU::VGPRRegBankID, Size: DstSize);
5373 OpdsMapping[2] = getSGPROpMapping(Reg: MI.getOperand(i: 2).getReg(), MRI, TRI: *TRI);
5374 break;
5375 }
    case Intrinsic::amdgcn_exp_compr:
      OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
      OpdsMapping[4] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
      break;
    case Intrinsic::amdgcn_exp:
      // FIXME: Could we support packed types here?
      OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
      OpdsMapping[4] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
      OpdsMapping[5] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
      OpdsMapping[6] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
      break;
    case Intrinsic::amdgcn_exp_row:
      OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
      OpdsMapping[4] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
      OpdsMapping[5] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
      OpdsMapping[6] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
      OpdsMapping[8] = getSGPROpMapping(MI.getOperand(8).getReg(), MRI, *TRI);
      break;
    case Intrinsic::amdgcn_s_sendmsg:
    case Intrinsic::amdgcn_s_sendmsghalt: {
      // This must be an SGPR, but accept a VGPR.
      unsigned Bank = getRegBankID(MI.getOperand(2).getReg(), MRI,
                                   AMDGPU::SGPRRegBankID);
      OpdsMapping[2] = AMDGPU::getValueMapping(Bank, 32);
      break;
    }
    case Intrinsic::amdgcn_s_setreg: {
      // This must be an SGPR, but accept a VGPR.
      unsigned Bank = getRegBankID(MI.getOperand(2).getReg(), MRI,
                                   AMDGPU::SGPRRegBankID);
      OpdsMapping[2] = AMDGPU::getValueMapping(Bank, 32);
      break;
    }
    case Intrinsic::amdgcn_s_ttracedata: {
      // This must be an SGPR, but accept a VGPR.
      unsigned Bank =
          getRegBankID(MI.getOperand(1).getReg(), MRI, AMDGPU::SGPRRegBankID);
      OpdsMapping[1] = AMDGPU::getValueMapping(Bank, 32);
      break;
    }
    case Intrinsic::amdgcn_end_cf: {
      unsigned Size = getSizeInBits(MI.getOperand(1).getReg(), MRI, *TRI);
      OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
      break;
    }
    case Intrinsic::amdgcn_else: {
      unsigned WaveSize = getSizeInBits(MI.getOperand(1).getReg(), MRI, *TRI);
      OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1);
      OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, WaveSize);
      OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, WaveSize);
      break;
    }
    case Intrinsic::amdgcn_init_whole_wave:
    case Intrinsic::amdgcn_live_mask: {
      OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1);
      break;
    }
    case Intrinsic::amdgcn_wqm_demote:
    case Intrinsic::amdgcn_kill: {
      OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1);
      break;
    }
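    // Buffer operations want the resource descriptor and scalar offset in
    // SGPRs and the vindex/voffset in VGPRs; SGPR-mapped operands that turn
    // out to be divergent are legalized later with a waterfall loop. The same
    // layout applies to the struct forms below.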
    case Intrinsic::amdgcn_raw_buffer_load:
    case Intrinsic::amdgcn_raw_ptr_buffer_load:
    case Intrinsic::amdgcn_raw_atomic_buffer_load:
    case Intrinsic::amdgcn_raw_ptr_atomic_buffer_load:
    case Intrinsic::amdgcn_raw_tbuffer_load:
    case Intrinsic::amdgcn_raw_ptr_tbuffer_load: {
      // FIXME: Should make intrinsic ID the last operand of the instruction,
      // then this would be the same as store
      OpdsMapping[0] = getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
      OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
      OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);
      OpdsMapping[4] = getSGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
      break;
    }
    case Intrinsic::amdgcn_raw_buffer_load_lds:
    case Intrinsic::amdgcn_raw_ptr_buffer_load_lds: {
      OpdsMapping[1] = getSGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);
      OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
      OpdsMapping[4] = getVGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
      OpdsMapping[5] = getSGPROpMapping(MI.getOperand(5).getReg(), MRI, *TRI);
      break;
    }
    case Intrinsic::amdgcn_raw_buffer_store:
    case Intrinsic::amdgcn_raw_ptr_buffer_store:
    case Intrinsic::amdgcn_raw_buffer_store_format:
    case Intrinsic::amdgcn_raw_ptr_buffer_store_format:
    case Intrinsic::amdgcn_raw_tbuffer_store:
    case Intrinsic::amdgcn_raw_ptr_tbuffer_store: {
      OpdsMapping[1] = getVGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);
      OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
      OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);
      OpdsMapping[4] = getSGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
      break;
    }
    case Intrinsic::amdgcn_struct_buffer_load:
    case Intrinsic::amdgcn_struct_ptr_buffer_load:
    case Intrinsic::amdgcn_struct_tbuffer_load:
    case Intrinsic::amdgcn_struct_ptr_tbuffer_load:
    case Intrinsic::amdgcn_struct_atomic_buffer_load:
    case Intrinsic::amdgcn_struct_ptr_atomic_buffer_load: {
      OpdsMapping[0] = getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
      OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
      OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);
      OpdsMapping[4] = getVGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
      OpdsMapping[5] = getSGPROpMapping(MI.getOperand(5).getReg(), MRI, *TRI);
      break;
    }
    case Intrinsic::amdgcn_struct_buffer_load_lds:
    case Intrinsic::amdgcn_struct_ptr_buffer_load_lds: {
      OpdsMapping[1] = getSGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);
      OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
      OpdsMapping[4] = getVGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
      OpdsMapping[5] = getVGPROpMapping(MI.getOperand(5).getReg(), MRI, *TRI);
      OpdsMapping[6] = getSGPROpMapping(MI.getOperand(6).getReg(), MRI, *TRI);
      break;
    }
    case Intrinsic::amdgcn_struct_buffer_store:
    case Intrinsic::amdgcn_struct_ptr_buffer_store:
    case Intrinsic::amdgcn_struct_tbuffer_store:
    case Intrinsic::amdgcn_struct_ptr_tbuffer_store: {
      OpdsMapping[1] = getVGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);
      OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
      OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);
      OpdsMapping[4] = getVGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
      OpdsMapping[5] = getSGPROpMapping(MI.getOperand(5).getReg(), MRI, *TRI);
      break;
    }
    case Intrinsic::amdgcn_init_exec_from_input: {
      unsigned Size = getSizeInBits(MI.getOperand(1).getReg(), MRI, *TRI);
      OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
      break;
    }
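    // GWS operations take their data value in a VGPR; the offset operand is
    // read through M0 and so must ultimately be scalar.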
    case Intrinsic::amdgcn_ds_gws_init:
    case Intrinsic::amdgcn_ds_gws_barrier:
    case Intrinsic::amdgcn_ds_gws_sema_br: {
      OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);

      // This must be an SGPR, but accept a VGPR.
      unsigned Bank = getRegBankID(MI.getOperand(2).getReg(), MRI,
                                   AMDGPU::SGPRRegBankID);
      OpdsMapping[2] = AMDGPU::getValueMapping(Bank, 32);
      break;
    }
    case Intrinsic::amdgcn_ds_gws_sema_v:
    case Intrinsic::amdgcn_ds_gws_sema_p:
    case Intrinsic::amdgcn_ds_gws_sema_release_all: {
      // This must be an SGPR, but accept a VGPR.
      unsigned Bank = getRegBankID(MI.getOperand(1).getReg(), MRI,
                                   AMDGPU::SGPRRegBankID);
      OpdsMapping[1] = AMDGPU::getValueMapping(Bank, 32);
      break;
    }
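    // The operand mapped via M0Bank below must end up scalar; accept a VGPR
    // for now and fix it later.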
    case Intrinsic::amdgcn_cluster_load_b32:
    case Intrinsic::amdgcn_cluster_load_b64:
    case Intrinsic::amdgcn_cluster_load_b128: {
      OpdsMapping[0] = getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
      OpdsMapping[2] = getVGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
      unsigned M0Bank =
          getRegBankID(MI.getOperand(4).getReg(), MRI, AMDGPU::SGPRRegBankID);
      OpdsMapping[4] = AMDGPU::getValueMapping(M0Bank, 32);
      break;
    }
    case Intrinsic::amdgcn_cluster_load_async_to_lds_b8:
    case Intrinsic::amdgcn_cluster_load_async_to_lds_b32:
    case Intrinsic::amdgcn_cluster_load_async_to_lds_b64:
    case Intrinsic::amdgcn_cluster_load_async_to_lds_b128: {
      OpdsMapping[1] = getVGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);
      OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
      unsigned M0Bank =
          getRegBankID(MI.getOperand(5).getReg(), MRI, AMDGPU::SGPRRegBankID);
      OpdsMapping[5] = AMDGPU::getValueMapping(M0Bank, 32);
      break;
    }
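    // For the global <-> LDS transfer forms, the memory address is per-lane
    // (VGPR) while the LDS pointer operand is mapped as a scalar.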
    case Intrinsic::amdgcn_global_store_async_from_lds_b8:
    case Intrinsic::amdgcn_global_store_async_from_lds_b32:
    case Intrinsic::amdgcn_global_store_async_from_lds_b64:
    case Intrinsic::amdgcn_global_store_async_from_lds_b128:
    case Intrinsic::amdgcn_global_load_async_to_lds_b8:
    case Intrinsic::amdgcn_global_load_async_to_lds_b32:
    case Intrinsic::amdgcn_global_load_async_to_lds_b64:
    case Intrinsic::amdgcn_global_load_async_to_lds_b128:
    case Intrinsic::amdgcn_load_to_lds:
    case Intrinsic::amdgcn_global_load_lds: {
      OpdsMapping[1] = getVGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);
      OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
      break;
    }
    case Intrinsic::amdgcn_lds_direct_load: {
      const int M0Idx = MI.getNumOperands() - 1;
      Register M0Reg = MI.getOperand(M0Idx).getReg();
      unsigned M0Bank = getRegBankID(M0Reg, MRI, AMDGPU::SGPRRegBankID);
      unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();

      OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, DstSize);
      for (int I = 2; I != M0Idx && MI.getOperand(I).isReg(); ++I)
        OpdsMapping[I] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);

      // This must be an SGPR, but take whatever the original bank is and fix
      // it up later.
      OpdsMapping[M0Idx] = AMDGPU::getValueMapping(M0Bank, 32);
      break;
    }
    case Intrinsic::amdgcn_ds_add_gs_reg_rtn:
    case Intrinsic::amdgcn_ds_sub_gs_reg_rtn:
      OpdsMapping[0] = getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
      OpdsMapping[2] = getVGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
      break;
    case Intrinsic::amdgcn_ds_bvh_stack_rtn:
    case Intrinsic::amdgcn_ds_bvh_stack_push4_pop1_rtn:
    case Intrinsic::amdgcn_ds_bvh_stack_push8_pop1_rtn:
    case Intrinsic::amdgcn_ds_bvh_stack_push8_pop2_rtn: {
      OpdsMapping[0] =
          getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI); // %vdst
      OpdsMapping[1] =
          getVGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI); // %addr
      OpdsMapping[3] =
          getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI); // %addr
      OpdsMapping[4] =
          getVGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI); // %data0
      OpdsMapping[5] =
          getVGPROpMapping(MI.getOperand(5).getReg(), MRI, *TRI); // %data1
      break;
    }
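    // The s_sleep_var and named-barrier management intrinsics are scalar
    // (SALU) operations, so their register inputs are mapped to SGPRs.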
    case Intrinsic::amdgcn_s_sleep_var:
      OpdsMapping[1] = getSGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);
      break;
    case Intrinsic::amdgcn_s_barrier_join:
    case Intrinsic::amdgcn_s_wakeup_barrier:
      OpdsMapping[1] = getSGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);
      break;
    case Intrinsic::amdgcn_s_barrier_init:
    case Intrinsic::amdgcn_s_barrier_signal_var:
      OpdsMapping[1] = getSGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);
      OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
      break;
    case Intrinsic::amdgcn_s_barrier_signal_isfirst: {
      const unsigned ResultSize = 1;
      OpdsMapping[0] =
          AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, ResultSize);
      break;
    }
    case Intrinsic::amdgcn_s_get_barrier_state:
    case Intrinsic::amdgcn_s_get_named_barrier_state: {
      OpdsMapping[0] = getSGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
      OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
      break;
    }
    case Intrinsic::amdgcn_pops_exiting_wave_id:
      return getDefaultMappingSOP(MI);
    case Intrinsic::amdgcn_tensor_load_to_lds_d2:
    case Intrinsic::amdgcn_tensor_store_from_lds_d2:
    case Intrinsic::amdgcn_tensor_load_to_lds:
    case Intrinsic::amdgcn_tensor_store_from_lds: {
      // Lie and claim everything is legal, even though all operands need to
      // be SGPRs. applyMapping will have to deal with it with readfirstlane.
      for (unsigned I = 1; I < MI.getNumOperands(); ++I) {
        if (MI.getOperand(I).isReg()) {
          Register Reg = MI.getOperand(I).getReg();
          auto OpBank = getRegBankID(Reg, MRI);
          unsigned Size = getSizeInBits(Reg, MRI, *TRI);
          OpdsMapping[I] = AMDGPU::getValueMapping(OpBank, Size);
        }
      }
      break;
    }
    case Intrinsic::amdgcn_s_prefetch_data: {
      OpdsMapping[1] = getSGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);
      OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
      break;
    }
    case Intrinsic::amdgcn_flat_prefetch:
    case Intrinsic::amdgcn_global_prefetch:
      return getDefaultMappingVOP(MI);
    default:
      return getInvalidInstructionMapping();
    }
    break;
  }
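  // A select stays on the SALU only when both value operands are already
  // SGPRs and the condition is scalar; otherwise the values are mapped to
  // VGPRs and the condition becomes a VCC mask.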
  case AMDGPU::G_SELECT: {
    unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
    unsigned Op2Bank = getRegBankID(MI.getOperand(2).getReg(), MRI,
                                    AMDGPU::SGPRRegBankID);
    unsigned Op3Bank = getRegBankID(MI.getOperand(3).getReg(), MRI,
                                    AMDGPU::SGPRRegBankID);
    bool SGPRSrcs = Op2Bank == AMDGPU::SGPRRegBankID &&
                    Op3Bank == AMDGPU::SGPRRegBankID;

    unsigned CondBankDefault = SGPRSrcs ?
      AMDGPU::SGPRRegBankID : AMDGPU::VCCRegBankID;
    unsigned CondBank = getRegBankID(MI.getOperand(1).getReg(), MRI,
                                     CondBankDefault);
    if (CondBank == AMDGPU::SGPRRegBankID)
      CondBank = SGPRSrcs ? AMDGPU::SGPRRegBankID : AMDGPU::VCCRegBankID;
    else if (CondBank == AMDGPU::VGPRRegBankID)
      CondBank = AMDGPU::VCCRegBankID;

    unsigned Bank = SGPRSrcs && CondBank == AMDGPU::SGPRRegBankID ?
      AMDGPU::SGPRRegBankID : AMDGPU::VGPRRegBankID;

    assert(CondBank == AMDGPU::VCCRegBankID || CondBank == AMDGPU::SGPRRegBankID);

    // TODO: Should report 32-bit for scalar condition type.
    if (Size == 64) {
      OpdsMapping[0] = AMDGPU::getValueMappingSGPR64Only(Bank, Size);
      OpdsMapping[1] = AMDGPU::getValueMapping(CondBank, 1);
      OpdsMapping[2] = AMDGPU::getValueMappingSGPR64Only(Bank, Size);
      OpdsMapping[3] = AMDGPU::getValueMappingSGPR64Only(Bank, Size);
    } else {
      OpdsMapping[0] = AMDGPU::getValueMapping(Bank, Size);
      OpdsMapping[1] = AMDGPU::getValueMapping(CondBank, 1);
      OpdsMapping[2] = AMDGPU::getValueMapping(Bank, Size);
      OpdsMapping[3] = AMDGPU::getValueMapping(Bank, Size);
    }

    break;
  }

  case AMDGPU::G_SI_CALL: {
    OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 64);
    // Lie and claim everything is legal, even though some need to be
    // SGPRs. applyMapping will have to deal with it as a waterfall loop.
    OpdsMapping[1] = getSGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);

    // Allow anything for implicit arguments
    for (unsigned I = 4; I < MI.getNumOperands(); ++I) {
      if (MI.getOperand(I).isReg()) {
        Register Reg = MI.getOperand(I).getReg();
        auto OpBank = getRegBankID(Reg, MRI);
        unsigned Size = getSizeInBits(Reg, MRI, *TRI);
        OpdsMapping[I] = AMDGPU::getValueMapping(OpBank, Size);
      }
    }
    break;
  }
  case AMDGPU::G_LOAD:
  case AMDGPU::G_ZEXTLOAD:
  case AMDGPU::G_SEXTLOAD:
    return getInstrMappingForLoad(MI);

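  // RMW atomics and cmpxchg always produce and consume VGPR values; the
  // pointer may only stay in an SGPR where the addressing mode allows a
  // scalar base (see getValueMappingForPtr).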
  case AMDGPU::G_ATOMICRMW_XCHG:
  case AMDGPU::G_ATOMICRMW_ADD:
  case AMDGPU::G_ATOMICRMW_SUB:
  case AMDGPU::G_ATOMICRMW_AND:
  case AMDGPU::G_ATOMICRMW_OR:
  case AMDGPU::G_ATOMICRMW_XOR:
  case AMDGPU::G_ATOMICRMW_MAX:
  case AMDGPU::G_ATOMICRMW_MIN:
  case AMDGPU::G_ATOMICRMW_UMAX:
  case AMDGPU::G_ATOMICRMW_UMIN:
  case AMDGPU::G_ATOMICRMW_FADD:
  case AMDGPU::G_ATOMICRMW_FMIN:
  case AMDGPU::G_ATOMICRMW_FMAX:
  case AMDGPU::G_ATOMICRMW_UINC_WRAP:
  case AMDGPU::G_ATOMICRMW_UDEC_WRAP:
  case AMDGPU::G_ATOMICRMW_USUB_COND:
  case AMDGPU::G_ATOMICRMW_USUB_SAT:
  case AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG: {
    OpdsMapping[0] = getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
    OpdsMapping[1] = getValueMappingForPtr(MRI, MI.getOperand(1).getReg());
    OpdsMapping[2] = getVGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
    break;
  }
  case AMDGPU::G_ATOMIC_CMPXCHG: {
    OpdsMapping[0] = getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
    OpdsMapping[1] = getValueMappingForPtr(MRI, MI.getOperand(1).getReg());
    OpdsMapping[2] = getVGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
    OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);
    break;
  }
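  // A uniform (SGPR) condition selects to a scalar branch; any other bank is
  // treated as a VCC mask.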
  case AMDGPU::G_BRCOND: {
    unsigned Bank = getRegBankID(MI.getOperand(0).getReg(), MRI,
                                 AMDGPU::SGPRRegBankID);
    assert(MRI.getType(MI.getOperand(0).getReg()).getSizeInBits() == 1);
    if (Bank != AMDGPU::SGPRRegBankID)
      Bank = AMDGPU::VCCRegBankID;

    OpdsMapping[0] = AMDGPU::getValueMapping(Bank, 1);
    break;
  }
  case AMDGPU::G_INTRINSIC_FPTRUNC_ROUND:
    return getDefaultMappingVOP(MI);
  case AMDGPU::G_PREFETCH:
    OpdsMapping[0] = getSGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
    break;
  case AMDGPU::G_AMDGPU_WHOLE_WAVE_FUNC_SETUP:
  case AMDGPU::G_AMDGPU_WHOLE_WAVE_FUNC_RETURN:
    OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1);
    break;
  }
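  // Every case that did not return early filled in OpdsMapping; package it as
  // a single mapping covering all operands of MI.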
  return getInstructionMapping(/*ID*/1, /*Cost*/1,
                               getOperandsMapping(OpdsMapping),
                               MI.getNumOperands());
}