| 1 | //===- AMDGPURegisterBankInfo.cpp -------------------------------*- C++ -*-==// |
| 2 | // |
| 3 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
| 4 | // See https://llvm.org/LICENSE.txt for license information. |
| 5 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
| 6 | // |
| 7 | //===----------------------------------------------------------------------===// |
| 8 | /// \file |
| 9 | /// This file implements the targeting of the RegisterBankInfo class for |
| 10 | /// AMDGPU. |
| 11 | /// |
| 12 | /// \par |
| 13 | /// |
| 14 | /// AMDGPU has unique register bank constraints that require special high level |
/// strategies to deal with. There are two main true physical register banks:
/// VGPR (vector) and SGPR (scalar). Additionally, the VCC register bank is a
| 17 | /// sort of pseudo-register bank needed to represent SGPRs used in a vector |
| 18 | /// boolean context. There is also the AGPR bank, which is a special purpose |
| 19 | /// physical register bank present on some subtargets. |
| 20 | /// |
| 21 | /// Copying from VGPR to SGPR is generally illegal, unless the value is known to |
| 22 | /// be uniform. It is generally not valid to legalize operands by inserting |
| 23 | /// copies as on other targets. Operations which require uniform, SGPR operands |
| 24 | /// generally require scalarization by repeatedly executing the instruction, |
| 25 | /// activating each set of lanes using a unique set of input values. This is |
| 26 | /// referred to as a waterfall loop. |
| 27 | /// |
| 28 | /// \par Booleans |
| 29 | /// |
/// Booleans (s1 values) require special consideration. A vector compare result
| 31 | /// is naturally a bitmask with one bit per lane, in a 32 or 64-bit |
| 32 | /// register. These are represented with the VCC bank. During selection, we need |
| 33 | /// to be able to unambiguously go back from a register class to a register |
| 34 | /// bank. To distinguish whether an SGPR should use the SGPR or VCC register |
| 35 | /// bank, we need to know the use context type. An SGPR s1 value always means a |
| 36 | /// VCC bank value, otherwise it will be the SGPR bank. A scalar compare sets |
| 37 | /// SCC, which is a 1-bit unaddressable register. This will need to be copied to |
| 38 | /// a 32-bit virtual register. Taken together, this means we need to adjust the |
| 39 | /// type of boolean operations to be regbank legal. All SALU booleans need to be |
| 40 | /// widened to 32-bits, and all VALU booleans need to be s1 values. |
| 41 | /// |
| 42 | /// A noteworthy exception to the s1-means-vcc rule is for legalization artifact |
| 43 | /// casts. G_TRUNC s1 results, and G_SEXT/G_ZEXT/G_ANYEXT sources are never vcc |
| 44 | /// bank. A non-boolean source (such as a truncate from a 1-bit load from |
| 45 | /// memory) will require a copy to the VCC bank which will require clearing the |
| 46 | /// high bits and inserting a compare. |
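///
/// As a rough, illustrative sketch (assumed register names and shapes, not
/// verbatim output from any test): a divergent compare keeps its s1 result in
/// the vcc bank, while the equivalent uniform code is widened to a 32-bit
/// value in the sgpr bank:
///
///   %vcond:vcc(s1) = G_ICMP intpred(eq), %va(s32), %vb(s32)
///   %vres:vgpr(s32) = G_SELECT %vcond(s1), %x(s32), %y(s32)
///
///   %scond:sgpr(s32) = G_ICMP intpred(eq), %sa(s32), %sb(s32)
///   %sres:sgpr(s32) = G_SELECT %scond(s32), %x(s32), %y(s32)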
| 47 | /// |
| 48 | /// \par Constant bus restriction |
| 49 | /// |
| 50 | /// VALU instructions have a limitation known as the constant bus |
| 51 | /// restriction. Most VALU instructions can use SGPR operands, but may read at |
/// most 1 SGPR or constant literal value (this is relaxed to 2 in gfx10 for
/// most instructions). The limit is on unique SGPRs, so the same SGPR may be
/// used for multiple operands. From a register bank perspective, any
/// combination of operands should be legal as an SGPR, but this is contextually
/// dependent on the SGPR operands all being the same register. It is therefore
/// optimal to choose the SGPR with the most uses to minimize the number of
/// copies.
| 58 | /// |
| 59 | /// We avoid trying to solve this problem in RegBankSelect. Any VALU G_* |
| 60 | /// operation should have its source operands all mapped to VGPRs (except for |
/// VCC), inserting copies from any SGPR operands. This is the most trivial legal
| 62 | /// mapping. Anything beyond the simplest 1:1 instruction selection would be too |
| 63 | /// complicated to solve here. Every optimization pattern or instruction |
| 64 | /// selected to multiple outputs would have to enforce this rule, and there |
| 65 | /// would be additional complexity in tracking this rule for every G_* |
| 66 | /// operation. By forcing all inputs to VGPRs, it also simplifies the task of |
| 67 | /// picking the optimal operand combination from a post-isel optimization pass. |
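///
/// As a schematic example (assumed register names, not verbatim output): a
/// VALU G_ADD with two uniform inputs simply gets VGPR copies inserted for its
/// SGPR operands rather than being constrained here:
///
///   %a_v:vgpr(s32) = COPY %a(s32)
///   %b_v:vgpr(s32) = COPY %b(s32)
///   %sum:vgpr(s32) = G_ADD %a_v, %b_v
///
/// Folding one of those copies back into an SGPR operand, to make use of the
/// constant bus slot, is left to a post-isel optimization pass.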
| 68 | /// |
| 69 | //===----------------------------------------------------------------------===// |
| 70 | |
| 71 | #include "AMDGPURegisterBankInfo.h" |
| 72 | |
| 73 | #include "AMDGPU.h" |
| 74 | #include "AMDGPUGlobalISelUtils.h" |
| 75 | #include "AMDGPUInstrInfo.h" |
| 76 | #include "AMDGPULaneMaskUtils.h" |
| 77 | #include "GCNSubtarget.h" |
| 78 | #include "SIMachineFunctionInfo.h" |
| 79 | #include "SIRegisterInfo.h" |
| 80 | #include "llvm/CodeGen/GlobalISel/GenericMachineInstrs.h" |
| 81 | #include "llvm/CodeGen/GlobalISel/LegalizerHelper.h" |
| 82 | #include "llvm/CodeGen/GlobalISel/MIPatternMatch.h" |
| 83 | #include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h" |
| 84 | #include "llvm/CodeGen/RegisterBank.h" |
| 85 | #include "llvm/IR/IntrinsicsAMDGPU.h" |
| 86 | |
| 87 | #define GET_TARGET_REGBANK_IMPL |
| 88 | #include "AMDGPUGenRegisterBank.inc" |
| 89 | |
| 90 | // This file will be TableGen'ed at some point. |
| 91 | #include "AMDGPUGenRegisterBankInfo.def" |
| 92 | |
| 93 | using namespace llvm; |
| 94 | using namespace MIPatternMatch; |
| 95 | |
| 96 | namespace { |
| 97 | |
| 98 | // Observer to apply a register bank to new registers created by LegalizerHelper. |
| 99 | class ApplyRegBankMapping final : public GISelChangeObserver { |
| 100 | private: |
| 101 | MachineIRBuilder &B; |
| 102 | const AMDGPURegisterBankInfo &RBI; |
| 103 | MachineRegisterInfo &MRI; |
| 104 | const RegisterBank *NewBank; |
| 105 | SmallVector<MachineInstr *, 4> NewInsts; |
| 106 | |
| 107 | public: |
| 108 | ApplyRegBankMapping(MachineIRBuilder &B, const AMDGPURegisterBankInfo &RBI_, |
| 109 | MachineRegisterInfo &MRI_, const RegisterBank *RB) |
| 110 | : B(B), RBI(RBI_), MRI(MRI_), NewBank(RB) { |
| 111 | assert(!B.isObservingChanges()); |
| 112 | B.setChangeObserver(*this); |
| 113 | } |
| 114 | |
| 115 | ~ApplyRegBankMapping() override { |
| 116 | for (MachineInstr *MI : NewInsts) |
      applyBank(*MI);
| 118 | |
| 119 | B.stopObservingChanges(); |
| 120 | } |
| 121 | |
| 122 | /// Set any registers that don't have a set register class or bank to SALU. |
| 123 | void applyBank(MachineInstr &MI) { |
| 124 | const unsigned Opc = MI.getOpcode(); |
| 125 | if (Opc == AMDGPU::G_ANYEXT || Opc == AMDGPU::G_ZEXT || |
| 126 | Opc == AMDGPU::G_SEXT) { |
| 127 | // LegalizerHelper wants to use the basic legalization artifacts when |
| 128 | // widening etc. We don't handle selection with vcc in artifact sources, |
| 129 | // so we need to use a select instead to handle these properly. |
      Register DstReg = MI.getOperand(0).getReg();
      Register SrcReg = MI.getOperand(1).getReg();
      const RegisterBank *SrcBank = RBI.getRegBank(SrcReg, MRI, *RBI.TRI);
      if (SrcBank == &AMDGPU::VCCRegBank) {
        const LLT S32 = LLT::scalar(32);
| 135 | assert(MRI.getType(SrcReg) == LLT::scalar(1)); |
| 136 | assert(MRI.getType(DstReg) == S32); |
| 137 | assert(NewBank == &AMDGPU::VGPRRegBank); |
| 138 | |
| 139 | // Replace the extension with a select, which really uses the boolean |
| 140 | // source. |
        B.setInsertPt(*MI.getParent(), MI);

        auto True = B.buildConstant(S32, Opc == AMDGPU::G_SEXT ? -1 : 1);
        auto False = B.buildConstant(S32, 0);
        B.buildSelect(DstReg, SrcReg, True, False);
        MRI.setRegBank(True.getReg(0), *NewBank);
        MRI.setRegBank(False.getReg(0), *NewBank);
| 148 | MI.eraseFromParent(); |
| 149 | } |
| 150 | |
| 151 | assert(!MRI.getRegClassOrRegBank(DstReg)); |
      MRI.setRegBank(DstReg, *NewBank);
| 153 | return; |
| 154 | } |
| 155 | |
| 156 | #ifndef NDEBUG |
| 157 | if (Opc == AMDGPU::G_TRUNC) { |
| 158 | Register DstReg = MI.getOperand(0).getReg(); |
| 159 | const RegisterBank *DstBank = RBI.getRegBank(DstReg, MRI, *RBI.TRI); |
| 160 | assert(DstBank != &AMDGPU::VCCRegBank); |
| 161 | } |
| 162 | #endif |
| 163 | |
| 164 | for (MachineOperand &Op : MI.operands()) { |
| 165 | if (!Op.isReg()) |
| 166 | continue; |
| 167 | |
| 168 | // We may see physical registers if building a real MI |
| 169 | Register Reg = Op.getReg(); |
| 170 | if (Reg.isPhysical() || MRI.getRegClassOrRegBank(Reg)) |
| 171 | continue; |
| 172 | |
| 173 | const RegisterBank *RB = NewBank; |
      if (MRI.getType(Reg) == LLT::scalar(1)) {
        assert(NewBank == &AMDGPU::VGPRRegBank &&
               "s1 operands should only be used for vector bools");
        assert((MI.getOpcode() != AMDGPU::G_TRUNC &&
                MI.getOpcode() != AMDGPU::G_ANYEXT) &&
               "not expecting legalization artifacts here");
        RB = &AMDGPU::VCCRegBank;
      }

      MRI.setRegBank(Reg, *RB);
| 184 | } |
| 185 | } |
| 186 | |
| 187 | void erasingInstr(MachineInstr &MI) override {} |
| 188 | |
| 189 | void createdInstr(MachineInstr &MI) override { |
| 190 | // At this point, the instruction was just inserted and has no operands. |
    NewInsts.push_back(&MI);
| 192 | } |
| 193 | |
| 194 | void changingInstr(MachineInstr &MI) override {} |
| 195 | void changedInstr(MachineInstr &MI) override { |
| 196 | // FIXME: In principle we should probably add the instruction to NewInsts, |
| 197 | // but the way the LegalizerHelper uses the observer, we will always see the |
| 198 | // registers we need to set the regbank on also referenced in a new |
| 199 | // instruction. |
| 200 | } |
| 201 | }; |
| 202 | |
| 203 | } // anonymous namespace |
| 204 | |
| 205 | AMDGPURegisterBankInfo::AMDGPURegisterBankInfo(const GCNSubtarget &ST) |
| 206 | : Subtarget(ST), TRI(Subtarget.getRegisterInfo()), |
| 207 | TII(Subtarget.getInstrInfo()) { |
| 208 | |
| 209 | // HACK: Until this is fully tablegen'd. |
| 210 | static llvm::once_flag InitializeRegisterBankFlag; |
| 211 | |
| 212 | static auto InitializeRegisterBankOnce = [this]() { |
| 213 | assert(&getRegBank(AMDGPU::SGPRRegBankID) == &AMDGPU::SGPRRegBank && |
| 214 | &getRegBank(AMDGPU::VGPRRegBankID) == &AMDGPU::VGPRRegBank && |
| 215 | &getRegBank(AMDGPU::AGPRRegBankID) == &AMDGPU::AGPRRegBank); |
| 216 | (void)this; |
| 217 | }; |
| 218 | |
  llvm::call_once(InitializeRegisterBankFlag, InitializeRegisterBankOnce);
| 220 | } |
| 221 | |
| 222 | static bool isVectorRegisterBank(const RegisterBank &Bank) { |
| 223 | unsigned BankID = Bank.getID(); |
| 224 | return BankID == AMDGPU::VGPRRegBankID || BankID == AMDGPU::AGPRRegBankID; |
| 225 | } |
| 226 | |
| 227 | bool AMDGPURegisterBankInfo::isDivergentRegBank(const RegisterBank *RB) const { |
| 228 | return RB != &AMDGPU::SGPRRegBank; |
| 229 | } |
| 230 | |
| 231 | unsigned AMDGPURegisterBankInfo::copyCost(const RegisterBank &Dst, |
| 232 | const RegisterBank &Src, |
| 233 | TypeSize Size) const { |
| 234 | // TODO: Should there be a UniformVGPRRegBank which can use readfirstlane? |
| 235 | if (Dst.getID() == AMDGPU::SGPRRegBankID && |
      (isVectorRegisterBank(Src) || Src.getID() == AMDGPU::VCCRegBankID)) {
| 237 | return std::numeric_limits<unsigned>::max(); |
| 238 | } |
| 239 | |
| 240 | // Bool values are tricky, because the meaning is based on context. The SCC |
| 241 | // and VCC banks are for the natural scalar and vector conditions produced by |
| 242 | // a compare. |
| 243 | // |
| 244 | // Legalization doesn't know about the necessary context, so an s1 use may |
| 245 | // have been a truncate from an arbitrary value, in which case a copy (lowered |
| 246 | // as a compare with 0) needs to be inserted. |
| 247 | if (Size == 1 && |
| 248 | (Dst.getID() == AMDGPU::SGPRRegBankID) && |
      (isVectorRegisterBank(Src) ||
| 250 | Src.getID() == AMDGPU::SGPRRegBankID || |
| 251 | Src.getID() == AMDGPU::VCCRegBankID)) |
| 252 | return std::numeric_limits<unsigned>::max(); |
| 253 | |
| 254 | // There is no direct copy between AGPRs. |
| 255 | if (Dst.getID() == AMDGPU::AGPRRegBankID && |
| 256 | Src.getID() == AMDGPU::AGPRRegBankID) |
| 257 | return 4; |
| 258 | |
  return RegisterBankInfo::copyCost(Dst, Src, Size);
| 260 | } |
| 261 | |
| 262 | unsigned AMDGPURegisterBankInfo::getBreakDownCost( |
| 263 | const ValueMapping &ValMapping, |
| 264 | const RegisterBank *CurBank) const { |
| 265 | // Check if this is a breakdown for G_LOAD to move the pointer from SGPR to |
| 266 | // VGPR. |
| 267 | // FIXME: Is there a better way to do this? |
| 268 | if (ValMapping.NumBreakDowns >= 2 || ValMapping.BreakDown[0].Length >= 64) |
| 269 | return 10; // This is expensive. |
| 270 | |
| 271 | assert(ValMapping.NumBreakDowns == 2 && |
| 272 | ValMapping.BreakDown[0].Length == 32 && |
| 273 | ValMapping.BreakDown[0].StartIdx == 0 && |
| 274 | ValMapping.BreakDown[1].Length == 32 && |
| 275 | ValMapping.BreakDown[1].StartIdx == 32 && |
| 276 | ValMapping.BreakDown[0].RegBank == ValMapping.BreakDown[1].RegBank); |
| 277 | |
| 278 | // 32-bit extract of a 64-bit value is just access of a subregister, so free. |
| 279 | // TODO: Cost of 0 hits assert, though it's not clear it's what we really |
| 280 | // want. |
| 281 | |
| 282 | // TODO: 32-bit insert to a 64-bit SGPR may incur a non-free copy due to SGPR |
| 283 | // alignment restrictions, but this probably isn't important. |
| 284 | return 1; |
| 285 | } |
| 286 | |
| 287 | const RegisterBank & |
| 288 | AMDGPURegisterBankInfo::getRegBankFromRegClass(const TargetRegisterClass &RC, |
| 289 | LLT Ty) const { |
| 290 | // We promote real scalar booleans to SReg_32. Any SGPR using s1 is really a |
| 291 | // VCC-like use. |
  if (TRI->isSGPRClass(&RC)) {
| 293 | // FIXME: This probably came from a copy from a physical register, which |
| 294 | // should be inferable from the copied to-type. We don't have many boolean |
| 295 | // physical register constraints so just assume a normal SGPR for now. |
| 296 | if (!Ty.isValid()) |
| 297 | return AMDGPU::SGPRRegBank; |
| 298 | |
    return Ty == LLT::scalar(1) ? AMDGPU::VCCRegBank : AMDGPU::SGPRRegBank;
| 300 | } |
| 301 | |
  return TRI->isAGPRClass(&RC) ? AMDGPU::AGPRRegBank : AMDGPU::VGPRRegBank;
| 303 | } |
| 304 | |
| 305 | template <unsigned NumOps> |
| 306 | RegisterBankInfo::InstructionMappings |
| 307 | AMDGPURegisterBankInfo::addMappingFromTable( |
| 308 | const MachineInstr &MI, const MachineRegisterInfo &MRI, |
| 309 | const std::array<unsigned, NumOps> RegSrcOpIdx, |
| 310 | ArrayRef<OpRegBankEntry<NumOps>> Table) const { |
| 311 | |
| 312 | InstructionMappings AltMappings; |
| 313 | |
| 314 | SmallVector<const ValueMapping *, 10> Operands(MI.getNumOperands()); |
| 315 | |
| 316 | unsigned Sizes[NumOps]; |
| 317 | for (unsigned I = 0; I < NumOps; ++I) { |
| 318 | Register Reg = MI.getOperand(RegSrcOpIdx[I]).getReg(); |
    Sizes[I] = getSizeInBits(Reg, MRI, *TRI);
| 320 | } |
| 321 | |
| 322 | for (unsigned I = 0, E = MI.getNumExplicitDefs(); I != E; ++I) { |
    unsigned SizeI = getSizeInBits(MI.getOperand(I).getReg(), MRI, *TRI);
    Operands[I] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, SizeI);
| 325 | } |
| 326 | |
| 327 | // getInstrMapping's default mapping uses ID 1, so start at 2. |
| 328 | unsigned MappingID = 2; |
| 329 | for (const auto &Entry : Table) { |
| 330 | for (unsigned I = 0; I < NumOps; ++I) { |
| 331 | int OpIdx = RegSrcOpIdx[I]; |
      Operands[OpIdx] = AMDGPU::getValueMapping(Entry.RegBanks[I], Sizes[I]);
| 333 | } |
| 334 | |
    AltMappings.push_back(&getInstructionMapping(MappingID++, Entry.Cost,
                                                 getOperandsMapping(Operands),
                                                 Operands.size()));
| 338 | } |
| 339 | |
| 340 | return AltMappings; |
| 341 | } |
| 342 | |
| 343 | RegisterBankInfo::InstructionMappings |
| 344 | AMDGPURegisterBankInfo::getInstrAlternativeMappingsIntrinsic( |
| 345 | const MachineInstr &MI, const MachineRegisterInfo &MRI) const { |
  switch (cast<GIntrinsic>(MI).getIntrinsicID()) {
| 347 | case Intrinsic::amdgcn_readlane: { |
    static const OpRegBankEntry<3> Table[2] = {
      // Perfectly legal.
      { { AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID }, 1 },

      // Need a readfirstlane for the index.
      { { AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID }, 2 }
    };

    const std::array<unsigned, 3> RegSrcOpIdx = { { 0, 2, 3 } };
| 357 | return addMappingFromTable<3>(MI, MRI, RegSrcOpIdx, Table); |
| 358 | } |
| 359 | case Intrinsic::amdgcn_writelane: { |
    static const OpRegBankEntry<4> Table[4] = {
      // Perfectly legal.
      { { AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID, AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID }, 1 },

      // Need readfirstlane of first op
      { { AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID }, 2 },

      // Need readfirstlane of second op
      { { AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID }, 2 },

      // Need readfirstlane of both ops
      { { AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID }, 3 }
    };

    // rsrc, voffset, offset
    const std::array<unsigned, 4> RegSrcOpIdx = { { 0, 2, 3, 4 } };
| 376 | return addMappingFromTable<4>(MI, MRI, RegSrcOpIdx, Table); |
| 377 | } |
| 378 | default: |
| 379 | return RegisterBankInfo::getInstrAlternativeMappings(MI); |
| 380 | } |
| 381 | } |
| 382 | |
| 383 | RegisterBankInfo::InstructionMappings |
| 384 | AMDGPURegisterBankInfo::getInstrAlternativeMappingsIntrinsicWSideEffects( |
| 385 | const MachineInstr &MI, const MachineRegisterInfo &MRI) const { |
| 386 | |
  switch (cast<GIntrinsic>(MI).getIntrinsicID()) {
| 388 | case Intrinsic::amdgcn_s_buffer_load: { |
    static const OpRegBankEntry<2> Table[4] = {
      // Perfectly legal.
      { { AMDGPU::SGPRRegBankID, AMDGPU::SGPRRegBankID }, 1 },

      // Only need 1 register in loop
      { { AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID }, 300 },

      // Have to waterfall the resource.
      { { AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID }, 1000 },

      // Have to waterfall the resource, and the offset.
      { { AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID }, 1500 }
    };

    // rsrc, offset
    const std::array<unsigned, 2> RegSrcOpIdx = { { 2, 3 } };
| 405 | return addMappingFromTable<2>(MI, MRI, RegSrcOpIdx, Table); |
| 406 | } |
| 407 | case Intrinsic::amdgcn_ds_ordered_add: |
| 408 | case Intrinsic::amdgcn_ds_ordered_swap: { |
| 409 | // VGPR = M0, VGPR |
    static const OpRegBankEntry<3> Table[2] = {
      // Perfectly legal.
      { { AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID }, 1 },

      // Need a readfirstlane for m0
      { { AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID }, 2 }
    };

    const std::array<unsigned, 3> RegSrcOpIdx = { { 0, 2, 3 } };
| 419 | return addMappingFromTable<3>(MI, MRI, RegSrcOpIdx, Table); |
| 420 | } |
| 421 | case Intrinsic::amdgcn_s_sendmsg: |
| 422 | case Intrinsic::amdgcn_s_sendmsghalt: { |
| 423 | // FIXME: Should have no register for immediate |
    static const OpRegBankEntry<1> Table[2] = {
      // Perfectly legal.
      { { AMDGPU::SGPRRegBankID }, 1 },

      // Need readlane
      { { AMDGPU::VGPRRegBankID }, 3 }
    };

    const std::array<unsigned, 1> RegSrcOpIdx = { { 2 } };
| 433 | return addMappingFromTable<1>(MI, MRI, RegSrcOpIdx, Table); |
| 434 | } |
| 435 | default: |
| 436 | return RegisterBankInfo::getInstrAlternativeMappings(MI); |
| 437 | } |
| 438 | } |
| 439 | |
| 440 | // FIXME: Returns uniform if there's no source value information. This is |
| 441 | // probably wrong. |
| 442 | bool AMDGPURegisterBankInfo::isScalarLoadLegal(const MachineInstr &MI) const { |
| 443 | if (!MI.hasOneMemOperand()) |
| 444 | return false; |
| 445 | |
| 446 | const MachineMemOperand *MMO = *MI.memoperands_begin(); |
| 447 | const unsigned AS = MMO->getAddrSpace(); |
| 448 | const bool IsConst = AS == AMDGPUAS::CONSTANT_ADDRESS || |
| 449 | AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT; |
| 450 | const unsigned MemSize = 8 * MMO->getSize().getValue(); |
| 451 | |
| 452 | // Require 4-byte alignment. |
| 453 | return (MMO->getAlign() >= Align(4) || |
| 454 | (Subtarget.hasScalarSubwordLoads() && |
| 455 | ((MemSize == 16 && MMO->getAlign() >= Align(2)) || |
| 456 | (MemSize == 8 && MMO->getAlign() >= Align(1))))) && |
| 457 | // Can't do a scalar atomic load. |
| 458 | !MMO->isAtomic() && |
| 459 | // Don't use scalar loads for volatile accesses to non-constant address |
| 460 | // spaces. |
| 461 | (IsConst || !MMO->isVolatile()) && |
| 462 | // Memory must be known constant, or not written before this load. |
| 463 | (IsConst || MMO->isInvariant() || (MMO->getFlags() & MONoClobber)) && |
| 464 | AMDGPU::isUniformMMO(MMO); |
| 465 | } |
| 466 | |
| 467 | RegisterBankInfo::InstructionMappings |
| 468 | AMDGPURegisterBankInfo::getInstrAlternativeMappings( |
| 469 | const MachineInstr &MI) const { |
| 470 | |
| 471 | const MachineFunction &MF = *MI.getMF(); |
| 472 | const MachineRegisterInfo &MRI = MF.getRegInfo(); |
| 473 | |
| 474 | |
| 475 | InstructionMappings AltMappings; |
| 476 | switch (MI.getOpcode()) { |
| 477 | case TargetOpcode::G_CONSTANT: |
| 478 | case TargetOpcode::G_IMPLICIT_DEF: { |
    unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
| 480 | if (Size == 1) { |
      static const OpRegBankEntry<1> Table[3] = {
        { { AMDGPU::VGPRRegBankID }, 1 },
        { { AMDGPU::SGPRRegBankID }, 1 },
        { { AMDGPU::VCCRegBankID }, 1 }
      };

      return addMappingFromTable<1>(MI, MRI, {{ 0 }}, Table);
| 488 | } |
| 489 | |
| 490 | [[fallthrough]]; |
| 491 | } |
| 492 | case TargetOpcode::G_FCONSTANT: |
| 493 | case TargetOpcode::G_FRAME_INDEX: |
| 494 | case TargetOpcode::G_GLOBAL_VALUE: { |
    static const OpRegBankEntry<1> Table[2] = {
      { { AMDGPU::VGPRRegBankID }, 1 },
      { { AMDGPU::SGPRRegBankID }, 1 }
    };

    return addMappingFromTable<1>(MI, MRI, {{ 0 }}, Table);
| 501 | } |
| 502 | case TargetOpcode::G_AND: |
| 503 | case TargetOpcode::G_OR: |
| 504 | case TargetOpcode::G_XOR: { |
    unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
| 506 | |
| 507 | if (Size == 1) { |
| 508 | // s_{and|or|xor}_b32 set scc when the result of the 32-bit op is not 0. |
      const InstructionMapping &SCCMapping = getInstructionMapping(
        1, 1, getOperandsMapping(
          {AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 32),
           AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 32),
           AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 32)}),
        3); // Num Operands
      AltMappings.push_back(&SCCMapping);

      const InstructionMapping &VCCMapping0 = getInstructionMapping(
        2, 1, getOperandsMapping(
          {AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, Size),
           AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, Size),
           AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, Size)}),
        3); // Num Operands
      AltMappings.push_back(&VCCMapping0);
| 524 | return AltMappings; |
| 525 | } |
| 526 | |
| 527 | if (Size != 64) |
| 528 | break; |
| 529 | |
    const InstructionMapping &SSMapping = getInstructionMapping(
      1, 1, getOperandsMapping(
        {AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
         AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
         AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size)}),
      3); // Num Operands
    AltMappings.push_back(&SSMapping);

    const InstructionMapping &VVMapping = getInstructionMapping(
      2, 2, getOperandsMapping(
        {AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size),
         AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size),
         AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size)}),
      3); // Num Operands
    AltMappings.push_back(&VVMapping);
| 545 | break; |
| 546 | } |
| 547 | case TargetOpcode::G_LOAD: |
| 548 | case TargetOpcode::G_ZEXTLOAD: |
| 549 | case TargetOpcode::G_SEXTLOAD: { |
    unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
    LLT PtrTy = MRI.getType(MI.getOperand(1).getReg());
| 552 | unsigned PtrSize = PtrTy.getSizeInBits(); |
| 553 | unsigned AS = PtrTy.getAddressSpace(); |
| 554 | |
| 555 | if ((AS != AMDGPUAS::LOCAL_ADDRESS && AS != AMDGPUAS::REGION_ADDRESS && |
| 556 | AS != AMDGPUAS::PRIVATE_ADDRESS) && |
| 557 | isScalarLoadLegal(MI)) { |
      const InstructionMapping &SSMapping = getInstructionMapping(
        1, 1, getOperandsMapping(
          {AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
           AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, PtrSize)}),
        2); // Num Operands
      AltMappings.push_back(&SSMapping);
| 564 | } |
| 565 | |
    const InstructionMapping &VVMapping = getInstructionMapping(
      2, 1,
      getOperandsMapping(
        {AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size),
         AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, PtrSize)}),
      2); // Num Operands
    AltMappings.push_back(&VVMapping);
| 573 | |
| 574 | // It may be possible to have a vgpr = load sgpr mapping here, because |
| 575 | // the mubuf instructions support this kind of load, but probably for only |
| 576 | // gfx7 and older. However, the addressing mode matching in the instruction |
| 577 | // selector should be able to do a better job of detecting and selecting |
| 578 | // these kinds of loads from the vgpr = load vgpr mapping. |
| 579 | |
| 580 | return AltMappings; |
| 581 | |
| 582 | } |
| 583 | case TargetOpcode::G_SELECT: { |
    unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
    const InstructionMapping &SSMapping = getInstructionMapping(1, 1,
      getOperandsMapping({AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
                          AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 1),
                          AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
                          AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size)}),
      4); // Num Operands
    AltMappings.push_back(&SSMapping);

    const InstructionMapping &VVMapping = getInstructionMapping(2, 1,
      getOperandsMapping({AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size),
                          AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1),
                          AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size),
                          AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size)}),
      4); // Num Operands
    AltMappings.push_back(&VVMapping);
| 600 | |
| 601 | return AltMappings; |
| 602 | } |
| 603 | case TargetOpcode::G_UADDE: |
| 604 | case TargetOpcode::G_USUBE: |
| 605 | case TargetOpcode::G_SADDE: |
| 606 | case TargetOpcode::G_SSUBE: { |
    unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
    const InstructionMapping &SSMapping = getInstructionMapping(1, 1,
      getOperandsMapping(
        {AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
         AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 1),
         AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
         AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
         AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 1)}),
      5); // Num Operands
    AltMappings.push_back(&SSMapping);

    const InstructionMapping &VVMapping = getInstructionMapping(2, 1,
      getOperandsMapping({AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size),
                          AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1),
                          AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size),
                          AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size),
                          AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1)}),
      5); // Num Operands
    AltMappings.push_back(&VVMapping);
| 626 | return AltMappings; |
| 627 | } |
| 628 | case AMDGPU::G_BRCOND: { |
| 629 | assert(MRI.getType(MI.getOperand(0).getReg()).getSizeInBits() == 1); |
| 630 | |
| 631 | // TODO: Change type to 32 for scalar |
    const InstructionMapping &SMapping = getInstructionMapping(
      1, 1, getOperandsMapping(
        {AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 1), nullptr}),
      2); // Num Operands
    AltMappings.push_back(&SMapping);

    const InstructionMapping &VMapping = getInstructionMapping(
      1, 1, getOperandsMapping(
        {AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1), nullptr}),
      2); // Num Operands
    AltMappings.push_back(&VMapping);
| 643 | return AltMappings; |
| 644 | } |
| 645 | case AMDGPU::G_INTRINSIC: |
| 646 | case AMDGPU::G_INTRINSIC_CONVERGENT: |
| 647 | return getInstrAlternativeMappingsIntrinsic(MI, MRI); |
| 648 | case AMDGPU::G_INTRINSIC_W_SIDE_EFFECTS: |
| 649 | case AMDGPU::G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS: |
| 650 | return getInstrAlternativeMappingsIntrinsicWSideEffects(MI, MRI); |
| 651 | default: |
| 652 | break; |
| 653 | } |
| 654 | return RegisterBankInfo::getInstrAlternativeMappings(MI); |
| 655 | } |
| 656 | |
| 657 | void AMDGPURegisterBankInfo::split64BitValueForMapping( |
| 658 | MachineIRBuilder &B, |
| 659 | SmallVector<Register, 2> &Regs, |
| 660 | LLT HalfTy, |
| 661 | Register Reg) const { |
| 662 | assert(HalfTy.getSizeInBits() == 32); |
| 663 | MachineRegisterInfo *MRI = B.getMRI(); |
  Register LoLHS = MRI->createGenericVirtualRegister(HalfTy);
  Register HiLHS = MRI->createGenericVirtualRegister(HalfTy);
  const RegisterBank *Bank = getRegBank(Reg, *MRI, *TRI);
  MRI->setRegBank(LoLHS, *Bank);
  MRI->setRegBank(HiLHS, *Bank);

  Regs.push_back(LoLHS);
  Regs.push_back(HiLHS);

  B.buildInstr(AMDGPU::G_UNMERGE_VALUES)
    .addDef(LoLHS)
    .addDef(HiLHS)
    .addUse(Reg);
| 677 | } |
| 678 | |
| 679 | /// Replace the current type each register in \p Regs has with \p NewTy |
| 680 | static void setRegsToType(MachineRegisterInfo &MRI, ArrayRef<Register> Regs, |
| 681 | LLT NewTy) { |
| 682 | for (Register Reg : Regs) { |
| 683 | assert(MRI.getType(Reg).getSizeInBits() == NewTy.getSizeInBits()); |
    MRI.setType(Reg, NewTy);
| 685 | } |
| 686 | } |
| 687 | |
| 688 | static LLT getHalfSizedType(LLT Ty) { |
| 689 | if (Ty.isVector()) { |
| 690 | assert(Ty.getElementCount().isKnownMultipleOf(2)); |
    return LLT::scalarOrVector(Ty.getElementCount().divideCoefficientBy(2),
                               Ty.getElementType());
| 693 | } |
| 694 | |
| 695 | assert(Ty.getScalarSizeInBits() % 2 == 0); |
  return LLT::scalar(Ty.getScalarSizeInBits() / 2);
| 697 | } |
| 698 | |
| 699 | // Build one or more V_READFIRSTLANE_B32 instructions to move the given vector |
| 700 | // source value into a scalar register. |
| 701 | Register AMDGPURegisterBankInfo::buildReadFirstLane(MachineIRBuilder &B, |
| 702 | MachineRegisterInfo &MRI, |
| 703 | Register Src) const { |
  LLT Ty = MRI.getType(Src);
  const RegisterBank *Bank = getRegBank(Src, MRI, *TRI);
| 706 | |
| 707 | if (Bank == &AMDGPU::SGPRRegBank) |
| 708 | return Src; |
| 709 | |
| 710 | unsigned Bits = Ty.getSizeInBits(); |
| 711 | assert(Bits % 32 == 0); |
| 712 | |
| 713 | if (Bank != &AMDGPU::VGPRRegBank) { |
| 714 | // We need to copy from AGPR to VGPR |
    Src = B.buildCopy(Ty, Src).getReg(0);
    MRI.setRegBank(Src, AMDGPU::VGPRRegBank);
| 717 | } |
| 718 | |
  LLT S32 = LLT::scalar(32);
| 720 | unsigned NumParts = Bits / 32; |
| 721 | SmallVector<Register, 8> SrcParts; |
| 722 | SmallVector<Register, 8> DstParts; |
| 723 | |
| 724 | if (Bits == 32) { |
    SrcParts.push_back(Src);
  } else {
    auto Unmerge = B.buildUnmerge(S32, Src);
    for (unsigned i = 0; i < NumParts; ++i)
      SrcParts.push_back(Unmerge.getReg(i));
| 730 | } |
| 731 | |
| 732 | for (unsigned i = 0; i < NumParts; ++i) { |
| 733 | Register SrcPart = SrcParts[i]; |
    Register DstPart = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
    MRI.setType(DstPart, NumParts == 1 ? Ty : S32);

    const TargetRegisterClass *Constrained =
        constrainGenericRegister(SrcPart, AMDGPU::VGPR_32RegClass, MRI);
    (void)Constrained;
    assert(Constrained && "Failed to constrain readfirstlane src reg");

    B.buildInstr(AMDGPU::V_READFIRSTLANE_B32, {DstPart}, {SrcPart});

    DstParts.push_back(DstPart);
| 745 | } |
| 746 | |
| 747 | if (Bits == 32) |
| 748 | return DstParts[0]; |
| 749 | |
  Register Dst = B.buildMergeLikeInstr(Ty, DstParts).getReg(0);
  MRI.setRegBank(Dst, AMDGPU::SGPRRegBank);
| 752 | return Dst; |
| 753 | } |
| 754 | |
| 755 | /// Legalize instruction \p MI where operands in \p OpIndices must be SGPRs. If |
| 756 | /// any of the required SGPR operands are VGPRs, perform a waterfall loop to |
| 757 | /// execute the instruction for each unique combination of values in all lanes |
| 758 | /// in the wave. The block will be split such that rest of the instructions are |
| 759 | /// moved to a new block. |
| 760 | /// |
| 761 | /// Essentially performs this loop: |
| 762 | // |
| 763 | /// Save Execution Mask |
| 764 | /// For (Lane : Wavefront) { |
| 765 | /// Enable Lane, Disable all other lanes |
| 766 | /// SGPR = read SGPR value for current lane from VGPR |
| 767 | /// VGPRResult[Lane] = use_op SGPR |
| 768 | /// } |
| 769 | /// Restore Execution Mask |
| 770 | /// |
/// There is additional complexity from comparing the operand values to
/// identify the unique values actually used.
| 773 | bool AMDGPURegisterBankInfo::executeInWaterfallLoop( |
| 774 | MachineIRBuilder &B, iterator_range<MachineBasicBlock::iterator> Range, |
| 775 | SmallSet<Register, 4> &SGPROperandRegs) const { |
| 776 | // Track use registers which have already been expanded with a readfirstlane |
| 777 | // sequence. This may have multiple uses if moving a sequence. |
| 778 | DenseMap<Register, Register> WaterfalledRegMap; |
| 779 | |
| 780 | MachineBasicBlock &MBB = B.getMBB(); |
| 781 | MachineFunction *MF = &B.getMF(); |
| 782 | |
| 783 | const TargetRegisterClass *WaveRC = TRI->getWaveMaskRegClass(); |
  const AMDGPU::LaneMaskConstants &LMC =
      AMDGPU::LaneMaskConstants::get(Subtarget);
| 786 | |
| 787 | #ifndef NDEBUG |
| 788 | const int OrigRangeSize = std::distance(Range.begin(), Range.end()); |
| 789 | #endif |
| 790 | |
| 791 | MachineRegisterInfo &MRI = *B.getMRI(); |
  Register SaveExecReg = MRI.createVirtualRegister(WaveRC);
  Register InitSaveExecReg = MRI.createVirtualRegister(WaveRC);

  // Don't bother using generic instructions/registers for the exec mask.
  B.buildInstr(TargetOpcode::IMPLICIT_DEF)
    .addDef(InitSaveExecReg);
| 798 | |
  Register PhiExec = MRI.createVirtualRegister(WaveRC);
  Register NewExec = MRI.createVirtualRegister(WaveRC);
| 801 | |
| 802 | // To insert the loop we need to split the block. Move everything before this |
| 803 | // point to a new block, and insert a new empty block before this instruction. |
| 804 | MachineBasicBlock *LoopBB = MF->CreateMachineBasicBlock(); |
| 805 | MachineBasicBlock *BodyBB = MF->CreateMachineBasicBlock(); |
| 806 | MachineBasicBlock *RemainderBB = MF->CreateMachineBasicBlock(); |
| 807 | MachineBasicBlock *RestoreExecBB = MF->CreateMachineBasicBlock(); |
| 808 | MachineFunction::iterator MBBI(MBB); |
| 809 | ++MBBI; |
  MF->insert(MBBI, LoopBB);
  MF->insert(MBBI, BodyBB);
  MF->insert(MBBI, RestoreExecBB);
  MF->insert(MBBI, RemainderBB);

  LoopBB->addSuccessor(BodyBB);
  BodyBB->addSuccessor(RestoreExecBB);
  BodyBB->addSuccessor(LoopBB);

  // Move the rest of the block into a new block.
  RemainderBB->transferSuccessorsAndUpdatePHIs(&MBB);
  RemainderBB->splice(RemainderBB->begin(), &MBB, Range.end(), MBB.end());

  MBB.addSuccessor(LoopBB);
  RestoreExecBB->addSuccessor(RemainderBB);

  B.setInsertPt(*LoopBB, LoopBB->end());
| 827 | |
  B.buildInstr(TargetOpcode::PHI)
    .addDef(PhiExec)
    .addReg(InitSaveExecReg)
    .addMBB(&MBB)
    .addReg(NewExec)
    .addMBB(BodyBB);
| 834 | |
| 835 | const DebugLoc &DL = B.getDL(); |
| 836 | |
| 837 | MachineInstr &FirstInst = *Range.begin(); |
| 838 | |
| 839 | // Move the instruction into the loop body. Note we moved everything after |
| 840 | // Range.end() already into a new block, so Range.end() is no longer valid. |
  BodyBB->splice(BodyBB->end(), &MBB, Range.begin(), MBB.end());
| 842 | |
| 843 | // Figure out the iterator range after splicing the instructions. |
| 844 | MachineBasicBlock::iterator NewBegin = FirstInst.getIterator(); |
| 845 | auto NewEnd = BodyBB->end(); |
| 846 | |
| 847 | B.setMBB(*LoopBB); |
| 848 | |
  LLT S1 = LLT::scalar(1);
| 850 | Register CondReg; |
| 851 | |
| 852 | assert(std::distance(NewBegin, NewEnd) == OrigRangeSize); |
| 853 | |
  for (MachineInstr &MI : make_range(NewBegin, NewEnd)) {
    for (MachineOperand &Op : MI.all_uses()) {
      Register OldReg = Op.getReg();
      if (!SGPROperandRegs.count(OldReg))
| 858 | continue; |
| 859 | |
| 860 | // See if we already processed this register in another instruction in the |
| 861 | // sequence. |
      auto OldVal = WaterfalledRegMap.find(OldReg);
| 863 | if (OldVal != WaterfalledRegMap.end()) { |
| 864 | Op.setReg(OldVal->second); |
| 865 | continue; |
| 866 | } |
| 867 | |
| 868 | Register OpReg = Op.getReg(); |
      LLT OpTy = MRI.getType(OpReg);

      const RegisterBank *OpBank = getRegBank(OpReg, MRI, *TRI);
| 872 | if (OpBank != &AMDGPU::VGPRRegBank) { |
| 873 | // Insert copy from AGPR to VGPR before the loop. |
| 874 | B.setMBB(MBB); |
        OpReg = B.buildCopy(OpTy, OpReg).getReg(0);
        MRI.setRegBank(OpReg, AMDGPU::VGPRRegBank);
| 877 | B.setMBB(*LoopBB); |
| 878 | } |
| 879 | |
      Register CurrentLaneReg = buildReadFirstLane(B, MRI, OpReg);
| 881 | |
| 882 | // Build the comparison(s). |
| 883 | unsigned OpSize = OpTy.getSizeInBits(); |
| 884 | bool Is64 = OpSize % 64 == 0; |
| 885 | unsigned PartSize = Is64 ? 64 : 32; |
      LLT PartTy = LLT::scalar(PartSize);
| 887 | unsigned NumParts = OpSize / PartSize; |
| 888 | SmallVector<Register, 8> OpParts; |
| 889 | SmallVector<Register, 8> CurrentLaneParts; |
| 890 | |
| 891 | if (NumParts == 1) { |
        OpParts.push_back(OpReg);
        CurrentLaneParts.push_back(CurrentLaneReg);
      } else {
        auto UnmergeOp = B.buildUnmerge(PartTy, OpReg);
        auto UnmergeCurrentLane = B.buildUnmerge(PartTy, CurrentLaneReg);
        for (unsigned i = 0; i < NumParts; ++i) {
          OpParts.push_back(UnmergeOp.getReg(i));
          CurrentLaneParts.push_back(UnmergeCurrentLane.getReg(i));
          MRI.setRegBank(OpParts[i], AMDGPU::VGPRRegBank);
          MRI.setRegBank(CurrentLaneParts[i], AMDGPU::SGPRRegBank);
| 902 | } |
| 903 | } |
| 904 | |
| 905 | for (unsigned i = 0; i < NumParts; ++i) { |
        auto CmpReg = B.buildICmp(CmpInst::ICMP_EQ, S1, CurrentLaneParts[i],
                                  OpParts[i]).getReg(0);
        MRI.setRegBank(CmpReg, AMDGPU::VCCRegBank);

        if (!CondReg) {
          CondReg = CmpReg;
        } else {
          CondReg = B.buildAnd(S1, CondReg, CmpReg).getReg(0);
          MRI.setRegBank(CondReg, AMDGPU::VCCRegBank);
| 915 | } |
| 916 | } |
| 917 | |
| 918 | Op.setReg(CurrentLaneReg); |
| 919 | |
| 920 | // Make sure we don't re-process this register again. |
      WaterfalledRegMap.insert(std::pair(OldReg, Op.getReg()));
| 922 | } |
| 923 | } |
| 924 | |
| 925 | // The ballot becomes a no-op during instruction selection. |
  CondReg = B.buildIntrinsic(Intrinsic::amdgcn_ballot,
                             {LLT::scalar(Subtarget.isWave32() ? 32 : 64)})
                .addReg(CondReg)
                .getReg(0);
  MRI.setRegClass(CondReg, WaveRC);
| 931 | |
| 932 | // Update EXEC, save the original EXEC value to VCC. |
  B.buildInstr(LMC.AndSaveExecOpc)
    .addDef(NewExec)
    .addReg(CondReg, RegState::Kill);
| 936 | |
  MRI.setSimpleHint(NewExec, CondReg);
| 938 | |
  B.setInsertPt(*BodyBB, BodyBB->end());
| 940 | |
| 941 | // Update EXEC, switch all done bits to 0 and all todo bits to 1. |
  B.buildInstr(LMC.XorTermOpc)
    .addDef(LMC.ExecReg)
    .addReg(LMC.ExecReg)
    .addReg(NewExec);
| 946 | |
| 947 | // XXX - s_xor_b64 sets scc to 1 if the result is nonzero, so can we use |
| 948 | // s_cbranch_scc0? |
| 949 | |
| 950 | // Loop back to V_READFIRSTLANE_B32 if there are still variants to cover. |
  B.buildInstr(AMDGPU::SI_WATERFALL_LOOP).addMBB(LoopBB);
| 952 | |
| 953 | // Save the EXEC mask before the loop. |
  BuildMI(MBB, MBB.end(), DL, TII->get(LMC.MovOpc), SaveExecReg)
      .addReg(LMC.ExecReg);
| 956 | |
| 957 | // Restore the EXEC mask after the loop. |
| 958 | B.setMBB(*RestoreExecBB); |
  B.buildInstr(LMC.MovTermOpc).addDef(LMC.ExecReg).addReg(SaveExecReg);
| 960 | |
| 961 | // Set the insert point after the original instruction, so any new |
| 962 | // instructions will be in the remainder. |
  B.setInsertPt(*RemainderBB, RemainderBB->begin());
| 964 | |
| 965 | return true; |
| 966 | } |
| 967 | |
| 968 | // Return any unique registers used by \p MI at \p OpIndices that need to be |
| 969 | // handled in a waterfall loop. Returns these registers in \p |
| 970 | // SGPROperandRegs. Returns true if there are any operands to handle and a |
| 971 | // waterfall loop is necessary. |
| 972 | bool AMDGPURegisterBankInfo::collectWaterfallOperands( |
| 973 | SmallSet<Register, 4> &SGPROperandRegs, MachineInstr &MI, |
| 974 | MachineRegisterInfo &MRI, ArrayRef<unsigned> OpIndices) const { |
| 975 | for (unsigned Op : OpIndices) { |
| 976 | assert(MI.getOperand(Op).isUse()); |
    Register Reg = MI.getOperand(Op).getReg();
    const RegisterBank *OpBank = getRegBank(Reg, MRI, *TRI);
    if (OpBank->getID() != AMDGPU::SGPRRegBankID)
      SGPROperandRegs.insert(Reg);
| 981 | } |
| 982 | |
| 983 | // No operands need to be replaced, so no need to loop. |
| 984 | return !SGPROperandRegs.empty(); |
| 985 | } |
| 986 | |
| 987 | bool AMDGPURegisterBankInfo::executeInWaterfallLoop( |
| 988 | MachineIRBuilder &B, MachineInstr &MI, ArrayRef<unsigned> OpIndices) const { |
| 989 | // Use a set to avoid extra readfirstlanes in the case where multiple operands |
| 990 | // are the same register. |
| 991 | SmallSet<Register, 4> SGPROperandRegs; |
| 992 | |
  if (!collectWaterfallOperands(SGPROperandRegs, MI, *B.getMRI(), OpIndices))
| 994 | return false; |
| 995 | |
| 996 | MachineBasicBlock::iterator I = MI.getIterator(); |
  return executeInWaterfallLoop(B, make_range(I, std::next(I)),
                                SGPROperandRegs);
| 999 | } |
| 1000 | |
| 1001 | // Legalize an operand that must be an SGPR by inserting a readfirstlane. |
| 1002 | void AMDGPURegisterBankInfo::constrainOpWithReadfirstlane( |
| 1003 | MachineIRBuilder &B, MachineInstr &MI, unsigned OpIdx) const { |
  Register Reg = MI.getOperand(OpIdx).getReg();
  MachineRegisterInfo &MRI = *B.getMRI();
  const RegisterBank *Bank = getRegBank(Reg, MRI, *TRI);
  if (Bank == &AMDGPU::SGPRRegBank)
    return;

  Reg = buildReadFirstLane(B, MRI, Reg);
  MI.getOperand(OpIdx).setReg(Reg);
| 1012 | } |
| 1013 | |
| 1014 | /// Split \p Ty into 2 pieces. The first will have \p FirstSize bits, and the |
| 1015 | /// rest will be in the remainder. |
| 1016 | static std::pair<LLT, LLT> splitUnequalType(LLT Ty, unsigned FirstSize) { |
| 1017 | unsigned TotalSize = Ty.getSizeInBits(); |
| 1018 | if (!Ty.isVector()) |
    return {LLT::scalar(FirstSize), LLT::scalar(TotalSize - FirstSize)};
| 1020 | |
| 1021 | LLT EltTy = Ty.getElementType(); |
| 1022 | unsigned EltSize = EltTy.getSizeInBits(); |
| 1023 | assert(FirstSize % EltSize == 0); |
| 1024 | |
| 1025 | unsigned FirstPartNumElts = FirstSize / EltSize; |
| 1026 | unsigned RemainderElts = (TotalSize - FirstSize) / EltSize; |
| 1027 | |
  return {LLT::scalarOrVector(ElementCount::getFixed(FirstPartNumElts), EltTy),
          LLT::scalarOrVector(ElementCount::getFixed(RemainderElts), EltTy)};
| 1030 | } |
| 1031 | |
| 1032 | static LLT widen96To128(LLT Ty) { |
| 1033 | if (!Ty.isVector()) |
    return LLT::scalar(128);
| 1035 | |
| 1036 | LLT EltTy = Ty.getElementType(); |
| 1037 | assert(128 % EltTy.getSizeInBits() == 0); |
  return LLT::fixed_vector(128 / EltTy.getSizeInBits(), EltTy);
| 1039 | } |
| 1040 | |
| 1041 | bool AMDGPURegisterBankInfo::applyMappingLoad( |
| 1042 | MachineIRBuilder &B, |
| 1043 | const AMDGPURegisterBankInfo::OperandsMapper &OpdMapper, |
| 1044 | MachineInstr &MI) const { |
| 1045 | MachineRegisterInfo &MRI = *B.getMRI(); |
  Register DstReg = MI.getOperand(0).getReg();
  const LLT LoadTy = MRI.getType(DstReg);
| 1048 | unsigned LoadSize = LoadTy.getSizeInBits(); |
| 1049 | MachineMemOperand *MMO = *MI.memoperands_begin(); |
| 1050 | const unsigned MaxNonSmrdLoadSize = 128; |
| 1051 | |
| 1052 | const RegisterBank *DstBank = |
      OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
| 1054 | if (DstBank == &AMDGPU::SGPRRegBank) { |
    // There are some special cases that we need to look at for 32 bit and 96
    // bit SGPR loads; otherwise we have nothing to do.
| 1057 | if (LoadSize != 32 && (LoadSize != 96 || Subtarget.hasScalarDwordx3Loads())) |
| 1058 | return false; |
| 1059 | |
| 1060 | const unsigned MemSize = 8 * MMO->getSize().getValue(); |
| 1061 | // Scalar loads of size 8 or 16 bit with proper alignment may be widened to |
    // 32 bit. Check to see if we need to widen the memory access: 8 or 16 bit
    // scalar loads should have a load size of 32 but a memory access size of
    // less than 32.
| 1065 | if (LoadSize == 32 && |
| 1066 | (MemSize == 32 || LoadTy.isVector() || !isScalarLoadLegal(MI))) |
| 1067 | return false; |
| 1068 | |
| 1069 | if (LoadSize == 32 && |
| 1070 | ((MemSize == 8 && MMO->getAlign() >= Align(1)) || |
| 1071 | (MemSize == 16 && MMO->getAlign() >= Align(2))) && |
| 1072 | isScalarLoadLegal(MI) && |
| 1073 | Subtarget.getGeneration() >= AMDGPUSubtarget::GFX12) |
| 1074 | return false; |
| 1075 | |
    Register PtrReg = MI.getOperand(1).getReg();
| 1077 | |
| 1078 | ApplyRegBankMapping ApplyBank(B, *this, MRI, DstBank); |
| 1079 | |
| 1080 | if (LoadSize == 32) { |
| 1081 | // This is an extending load from a sub-dword size. Widen the memory |
| 1082 | // access size to 4 bytes and clear the extra high bits appropriately |
      const LLT S32 = LLT::scalar(32);
      if (MI.getOpcode() == AMDGPU::G_SEXTLOAD) {
        // Must extend the sign bit into higher bits for a G_SEXTLOAD
        auto WideLoad = B.buildLoadFromOffset(S32, PtrReg, *MMO, 0);
        B.buildSExtInReg(MI.getOperand(0), WideLoad, MemSize);
      } else if (MI.getOpcode() == AMDGPU::G_ZEXTLOAD) {
        // Must extend zero into higher bits with an AND for a G_ZEXTLOAD
        auto WideLoad = B.buildLoadFromOffset(S32, PtrReg, *MMO, 0);
        B.buildZExtInReg(MI.getOperand(0), WideLoad, MemSize);
      } else
        // We do not need to touch the higher bits for regular loads.
        B.buildLoadFromOffset(MI.getOperand(0), PtrReg, *MMO, 0);
| 1095 | } else { |
| 1096 | // 96-bit loads are only available for vector loads. We need to split this |
| 1097 | // into a 64-bit part, and 32 (unless we can widen to a 128-bit load). |
| 1098 | if (MMO->getAlign() < Align(16)) { |
| 1099 | LegalizerHelper Helper(B.getMF(), ApplyBank, B); |
| 1100 | LLT Part64, Part32; |
        std::tie(Part64, Part32) = splitUnequalType(LoadTy, 64);
        if (Helper.reduceLoadStoreWidth(cast<GAnyLoad>(MI), 0, Part64) !=
| 1103 | LegalizerHelper::Legalized) |
| 1104 | return false; |
| 1105 | return true; |
| 1106 | } |
      LLT WiderTy = widen96To128(LoadTy);
      auto WideLoad = B.buildLoadFromOffset(WiderTy, PtrReg, *MMO, 0);
      if (WiderTy.isScalar()) {
        B.buildTrunc(MI.getOperand(0), WideLoad);
      } else {
        B.buildDeleteTrailingVectorElements(MI.getOperand(0).getReg(),
                                            WideLoad);
| 1114 | } |
| 1115 | } |
| 1116 | |
| 1117 | MI.eraseFromParent(); |
| 1118 | return true; |
| 1119 | } |
| 1120 | |
| 1121 | // 128-bit loads are supported for all instruction types. |
| 1122 | if (LoadSize <= MaxNonSmrdLoadSize) |
| 1123 | return false; |
| 1124 | |
  SmallVector<Register, 1> SrcRegs(OpdMapper.getVRegs(1));

  if (SrcRegs.empty())
    SrcRegs.push_back(MI.getOperand(1).getReg());
| 1129 | |
| 1130 | // RegBankSelect only emits scalar types, so we need to reset the pointer |
| 1131 | // operand to a pointer type. |
| 1132 | Register BasePtrReg = SrcRegs[0]; |
  LLT PtrTy = MRI.getType(MI.getOperand(1).getReg());
  MRI.setType(BasePtrReg, PtrTy);
| 1135 | |
  // The following are loads that were not split enough during legalization
  // because it was not clear whether they would be smem or vmem loads.
  if (AMDGPU::isExtendedGlobalAddrSpace(MMO->getAddrSpace()) ||
| 1139 | MMO->getAddrSpace() == AMDGPUAS::BUFFER_RESOURCE) { |
| 1140 | assert(LoadSize % MaxNonSmrdLoadSize == 0); |
| 1141 | unsigned NumSplitParts = LoadTy.getSizeInBits() / MaxNonSmrdLoadSize; |
    const LLT LoadSplitTy = LoadTy.divide(NumSplitParts);
| 1143 | ApplyRegBankMapping O(B, *this, MRI, &AMDGPU::VGPRRegBank); |
| 1144 | LegalizerHelper Helper(B.getMF(), O, B); |
| 1145 | if (LoadTy.isVector()) { |
      if (Helper.fewerElementsVector(MI, 0, LoadSplitTy) !=
| 1147 | LegalizerHelper::Legalized) |
| 1148 | return false; |
| 1149 | } else { |
      if (Helper.narrowScalar(MI, 0, LoadSplitTy) != LegalizerHelper::Legalized)
| 1151 | return false; |
| 1152 | } |
| 1153 | } |
| 1154 | |
  MRI.setRegBank(DstReg, AMDGPU::VGPRRegBank);
| 1156 | return true; |
| 1157 | } |
| 1158 | |
| 1159 | bool AMDGPURegisterBankInfo::applyMappingDynStackAlloc( |
| 1160 | MachineIRBuilder &B, |
| 1161 | const AMDGPURegisterBankInfo::OperandsMapper &OpdMapper, |
| 1162 | MachineInstr &MI) const { |
| 1163 | MachineRegisterInfo &MRI = *B.getMRI(); |
| 1164 | const MachineFunction &MF = B.getMF(); |
| 1165 | const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); |
| 1166 | const auto &TFI = *ST.getFrameLowering(); |
| 1167 | |
| 1168 | // Guard in case the stack growth direction ever changes with scratch |
| 1169 | // instructions. |
  assert(TFI.getStackGrowthDirection() == TargetFrameLowering::StackGrowsUp &&
         "Stack grows upwards for AMDGPU");
| 1172 | |
  Register Dst = MI.getOperand(0).getReg();
  Register AllocSize = MI.getOperand(1).getReg();
  Align Alignment = assumeAligned(MI.getOperand(2).getImm());
| 1176 | |
  const RegisterBank *SizeBank = getRegBank(AllocSize, MRI, *TRI);
| 1178 | |
| 1179 | if (SizeBank != &AMDGPU::SGPRRegBank) { |
| 1180 | auto WaveReduction = |
        B.buildIntrinsic(Intrinsic::amdgcn_wave_reduce_umax, {LLT::scalar(32)})
            .addUse(AllocSize)
            .addImm(0);
    AllocSize = WaveReduction.getReg(0);
| 1185 | } |
| 1186 | |
  LLT PtrTy = MRI.getType(Dst);
  LLT IntPtrTy = LLT::scalar(PtrTy.getSizeInBits());
| 1189 | |
| 1190 | const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>(); |
| 1191 | Register SPReg = Info->getStackPtrOffsetReg(); |
| 1192 | ApplyRegBankMapping ApplyBank(B, *this, MRI, &AMDGPU::SGPRRegBank); |
| 1193 | |
  auto WaveSize = B.buildConstant(LLT::scalar(32), ST.getWavefrontSizeLog2());
  auto ScaledSize = B.buildShl(IntPtrTy, AllocSize, WaveSize);
| 1196 | |
  auto OldSP = B.buildCopy(PtrTy, SPReg);
| 1198 | if (Alignment > TFI.getStackAlign()) { |
| 1199 | auto StackAlignMask = (Alignment.value() << ST.getWavefrontSizeLog2()) - 1; |
| 1200 | auto Tmp1 = B.buildPtrAdd(Res: PtrTy, Op0: OldSP, |
| 1201 | Op1: B.buildConstant(Res: LLT::scalar(SizeInBits: 32), Val: StackAlignMask)); |
| 1202 | B.buildMaskLowPtrBits(Res: Dst, Op0: Tmp1, |
| 1203 | NumBits: Log2(A: Alignment) + ST.getWavefrontSizeLog2()); |
| 1204 | } else { |
| 1205 | B.buildCopy(Res: Dst, Op: OldSP); |
| 1206 | } |
| 1207 | auto PtrAdd = B.buildPtrAdd(Res: PtrTy, Op0: Dst, Op1: ScaledSize); |
| 1208 | B.buildCopy(Res: SPReg, Op: PtrAdd); |
| 1209 | MI.eraseFromParent(); |
| 1210 | return true; |
| 1211 | } |
| 1212 | |
| 1213 | bool AMDGPURegisterBankInfo::applyMappingImage( |
| 1214 | MachineIRBuilder &B, MachineInstr &MI, |
| 1215 | const AMDGPURegisterBankInfo::OperandsMapper &OpdMapper, |
| 1216 | int RsrcIdx) const { |
| 1217 | const int NumDefs = MI.getNumExplicitDefs(); |
| 1218 | |
| 1219 | // The reported argument index is relative to the IR intrinsic call arguments, |
| 1220 | // so we need to shift by the number of defs and the intrinsic ID. |
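| | // For example, with one def, IR argument index 2 becomes machine operand |
| | // index 2 + 1 (def) + 1 (intrinsic ID) = 4. |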
| 1221 | RsrcIdx += NumDefs + 1; |
| 1222 | |
| 1223 | // Insert copies to VGPR arguments. |
| 1224 | applyDefaultMapping(OpdMapper); |
| 1225 | |
| 1226 | // Fixup any SGPR arguments. |
| 1227 | SmallVector<unsigned, 4> SGPRIndexes; |
| 1228 | for (int I = NumDefs, NumOps = MI.getNumOperands(); I != NumOps; ++I) { |
| 1229 | if (!MI.getOperand(i: I).isReg()) |
| 1230 | continue; |
| 1231 | |
| 1232 | // If this intrinsic has a sampler, it immediately follows rsrc. |
| 1233 | if (I == RsrcIdx || I == RsrcIdx + 1) |
| 1234 | SGPRIndexes.push_back(Elt: I); |
| 1235 | } |
| 1236 | |
| 1237 | executeInWaterfallLoop(B, MI, OpIndices: SGPRIndexes); |
| 1238 | return true; |
| 1239 | } |
| 1240 | |
| 1241 | // Analyze a combined offset from an llvm.amdgcn.s.buffer intrinsic and store |
| 1242 | // the three offsets (voffset, soffset and instoffset) |
| 1243 | unsigned AMDGPURegisterBankInfo::setBufferOffsets( |
| 1244 | MachineIRBuilder &B, Register CombinedOffset, Register &VOffsetReg, |
| 1245 | Register &SOffsetReg, int64_t &InstOffsetVal, Align Alignment) const { |
| 1246 | const LLT S32 = LLT::scalar(SizeInBits: 32); |
| 1247 | MachineRegisterInfo *MRI = B.getMRI(); |
| 1248 | |
| 1249 | if (std::optional<int64_t> Imm = |
| 1250 | getIConstantVRegSExtVal(VReg: CombinedOffset, MRI: *MRI)) { |
| 1251 | uint32_t SOffset, ImmOffset; |
| 1252 | if (TII->splitMUBUFOffset(Imm: *Imm, SOffset, ImmOffset, Alignment)) { |
| 1253 | VOffsetReg = B.buildConstant(Res: S32, Val: 0).getReg(Idx: 0); |
| 1254 | SOffsetReg = B.buildConstant(Res: S32, Val: SOffset).getReg(Idx: 0); |
| 1255 | InstOffsetVal = ImmOffset; |
| 1256 | |
| 1257 | B.getMRI()->setRegBank(Reg: VOffsetReg, RegBank: AMDGPU::VGPRRegBank); |
| 1258 | B.getMRI()->setRegBank(Reg: SOffsetReg, RegBank: AMDGPU::SGPRRegBank); |
| 1259 | return SOffset + ImmOffset; |
| 1260 | } |
| 1261 | } |
| 1262 | |
| 1263 | const bool CheckNUW = Subtarget.hasGFX1250Insts(); |
| 1264 | Register Base; |
| 1265 | unsigned Offset; |
| 1266 | |
| 1267 | std::tie(args&: Base, args&: Offset) = |
| 1268 | AMDGPU::getBaseWithConstantOffset(MRI&: *MRI, Reg: CombinedOffset, |
| 1269 | /*KnownBits=*/ValueTracking: nullptr, |
| 1270 | /*CheckNUW=*/CheckNUW); |
| 1271 | |
| 1272 | uint32_t SOffset, ImmOffset; |
| 1273 | if ((int)Offset > 0 && |
| 1274 | TII->splitMUBUFOffset(Imm: Offset, SOffset, ImmOffset, Alignment)) { |
| 1275 | if (getRegBank(Reg: Base, MRI: *MRI, TRI: *TRI) == &AMDGPU::VGPRRegBank) { |
| 1276 | VOffsetReg = Base; |
| 1277 | SOffsetReg = B.buildConstant(Res: S32, Val: SOffset).getReg(Idx: 0); |
| 1278 | B.getMRI()->setRegBank(Reg: SOffsetReg, RegBank: AMDGPU::SGPRRegBank); |
| 1279 | InstOffsetVal = ImmOffset; |
| 1280 | return 0; // XXX - Why is this 0? |
| 1281 | } |
| 1282 | |
| 1283 | // If we have SGPR base, we can use it for soffset. |
| 1284 | if (SOffset == 0) { |
| 1285 | VOffsetReg = B.buildConstant(Res: S32, Val: 0).getReg(Idx: 0); |
| 1286 | B.getMRI()->setRegBank(Reg: VOffsetReg, RegBank: AMDGPU::VGPRRegBank); |
| 1287 | SOffsetReg = Base; |
| 1288 | InstOffsetVal = ImmOffset; |
| 1289 | return 0; // XXX - Why is this 0? |
| 1290 | } |
| 1291 | } |
| 1292 | |
| 1293 | // Handle the variable sgpr + vgpr case. |
| 1294 | MachineInstr *Add = getOpcodeDef(Opcode: AMDGPU::G_ADD, Reg: CombinedOffset, MRI: *MRI); |
| 1295 | if (Add && (int)Offset >= 0 && |
| 1296 | (!CheckNUW || Add->getFlag(Flag: MachineInstr::NoUWrap))) { |
| 1297 | Register Src0 = getSrcRegIgnoringCopies(Reg: Add->getOperand(i: 1).getReg(), MRI: *MRI); |
| 1298 | Register Src1 = getSrcRegIgnoringCopies(Reg: Add->getOperand(i: 2).getReg(), MRI: *MRI); |
| 1299 | |
| 1300 | const RegisterBank *Src0Bank = getRegBank(Reg: Src0, MRI: *MRI, TRI: *TRI); |
| 1301 | const RegisterBank *Src1Bank = getRegBank(Reg: Src1, MRI: *MRI, TRI: *TRI); |
| 1302 | |
| 1303 | if (Src0Bank == &AMDGPU::VGPRRegBank && Src1Bank == &AMDGPU::SGPRRegBank) { |
| 1304 | VOffsetReg = Src0; |
| 1305 | SOffsetReg = Src1; |
| 1306 | return 0; |
| 1307 | } |
| 1308 | |
| 1309 | if (Src0Bank == &AMDGPU::SGPRRegBank && Src1Bank == &AMDGPU::VGPRRegBank) { |
| 1310 | VOffsetReg = Src1; |
| 1311 | SOffsetReg = Src0; |
| 1312 | return 0; |
| 1313 | } |
| 1314 | } |
| 1315 | |
| 1316 | // Ensure we have a VGPR for the combined offset. This could be an issue if we |
| 1317 | // have an SGPR offset and a VGPR resource. |
| 1318 | if (getRegBank(Reg: CombinedOffset, MRI: *MRI, TRI: *TRI) == &AMDGPU::VGPRRegBank) { |
| 1319 | VOffsetReg = CombinedOffset; |
| 1320 | } else { |
| 1321 | VOffsetReg = B.buildCopy(Res: S32, Op: CombinedOffset).getReg(Idx: 0); |
| 1322 | B.getMRI()->setRegBank(Reg: VOffsetReg, RegBank: AMDGPU::VGPRRegBank); |
| 1323 | } |
| 1324 | |
| 1325 | SOffsetReg = B.buildConstant(Res: S32, Val: 0).getReg(Idx: 0); |
| 1326 | B.getMRI()->setRegBank(Reg: SOffsetReg, RegBank: AMDGPU::SGPRRegBank); |
| 1327 | return 0; |
| 1328 | } |
| 1329 | |
| 1330 | static unsigned getSBufferLoadCorrespondingBufferLoadOpcode(unsigned Opc) { |
| 1331 | switch (Opc) { |
| 1332 | case AMDGPU::G_AMDGPU_S_BUFFER_LOAD: |
| 1333 | return AMDGPU::G_AMDGPU_BUFFER_LOAD; |
| 1334 | case AMDGPU::G_AMDGPU_S_BUFFER_LOAD_UBYTE: |
| 1335 | return AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE; |
| 1336 | case AMDGPU::G_AMDGPU_S_BUFFER_LOAD_SBYTE: |
| 1337 | return AMDGPU::G_AMDGPU_BUFFER_LOAD_SBYTE; |
| 1338 | case AMDGPU::G_AMDGPU_S_BUFFER_LOAD_USHORT: |
| 1339 | return AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT; |
| 1340 | case AMDGPU::G_AMDGPU_S_BUFFER_LOAD_SSHORT: |
| 1341 | return AMDGPU::G_AMDGPU_BUFFER_LOAD_SSHORT; |
| 1342 | default: |
| 1343 | break; |
| 1344 | } |
| 1345 | llvm_unreachable("Unexpected s_buffer_load opcode" ); |
| 1346 | } |
| 1347 | |
| 1348 | bool AMDGPURegisterBankInfo::applyMappingSBufferLoad( |
| 1349 | MachineIRBuilder &B, const OperandsMapper &OpdMapper) const { |
| 1350 | MachineInstr &MI = OpdMapper.getMI(); |
| 1351 | MachineRegisterInfo &MRI = OpdMapper.getMRI(); |
| 1352 | |
| 1353 | const LLT S32 = LLT::scalar(SizeInBits: 32); |
| 1354 | Register Dst = MI.getOperand(i: 0).getReg(); |
| 1355 | LLT Ty = MRI.getType(Reg: Dst); |
| 1356 | |
| 1357 | const RegisterBank *RSrcBank = |
| 1358 | OpdMapper.getInstrMapping().getOperandMapping(i: 1).BreakDown[0].RegBank; |
| 1359 | const RegisterBank *OffsetBank = |
| 1360 | OpdMapper.getInstrMapping().getOperandMapping(i: 2).BreakDown[0].RegBank; |
| 1361 | if (RSrcBank == &AMDGPU::SGPRRegBank && |
| 1362 | OffsetBank == &AMDGPU::SGPRRegBank) |
| 1363 | return true; // Legal mapping |
| 1364 | |
| 1365 | // FIXME: 96-bit case was widened during legalize. We need to narrow it back |
| 1366 | // here but don't have an MMO. |
| 1367 | |
| 1368 | unsigned LoadSize = Ty.getSizeInBits(); |
| 1369 | int NumLoads = 1; |
| 1370 | if (LoadSize == 256 || LoadSize == 512) { |
| 1371 | NumLoads = LoadSize / 128; |
| 1372 | Ty = Ty.divide(Factor: NumLoads); |
| 1373 | } |
| 1374 | |
| 1375 | // Use the alignment to ensure that the required offsets will fit into the |
| 1376 | // immediate offsets. |
| 1377 | const Align Alignment = NumLoads > 1 ? Align(16 * NumLoads) : Align(1); |
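| | // For instance, a 512-bit result becomes four 128-bit pieces loaded at |
| | // ImmOffset + 0, +16, +32 and +48 below, so the offset split is computed |
| | // with 64-byte alignment to keep each piece's immediate offset valid. |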
| 1378 | |
| 1379 | MachineFunction &MF = B.getMF(); |
| 1380 | |
| 1381 | Register SOffset; |
| 1382 | Register VOffset; |
| 1383 | int64_t ImmOffset = 0; |
| 1384 | |
| 1385 | unsigned MMOOffset = setBufferOffsets(B, CombinedOffset: MI.getOperand(i: 2).getReg(), VOffsetReg&: VOffset, |
| 1386 | SOffsetReg&: SOffset, InstOffsetVal&: ImmOffset, Alignment); |
| 1387 | |
| 1388 | // TODO: 96-bit loads were widened to 128-bit results. Shrink the result if we |
| 1389 | // can, but we need to track an MMO for that. |
| 1390 | const unsigned MemSize = (Ty.getSizeInBits() + 7) / 8; |
| 1391 | const Align MemAlign(4); // FIXME: ABI type alignment? |
| 1392 | MachineMemOperand *BaseMMO = MF.getMachineMemOperand( |
| 1393 | PtrInfo: MachinePointerInfo(), |
| 1394 | F: MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable | |
| 1395 | MachineMemOperand::MOInvariant, |
| 1396 | Size: MemSize, BaseAlignment: MemAlign); |
| 1397 | if (MMOOffset != 0) |
| 1398 | BaseMMO = MF.getMachineMemOperand(MMO: BaseMMO, Offset: MMOOffset, Size: MemSize); |
| 1399 | |
| 1400 | // If only the offset is divergent, emit a MUBUF buffer load instead. We can |
| 1401 | // assume that the buffer is unswizzled. |
| 1402 | |
| 1403 | Register RSrc = MI.getOperand(i: 1).getReg(); |
| 1404 | Register VIndex = B.buildConstant(Res: S32, Val: 0).getReg(Idx: 0); |
| 1405 | B.getMRI()->setRegBank(Reg: VIndex, RegBank: AMDGPU::VGPRRegBank); |
| 1406 | |
| 1407 | SmallVector<Register, 4> LoadParts(NumLoads); |
| 1408 | |
| 1409 | MachineBasicBlock::iterator MII = MI.getIterator(); |
| 1410 | MachineInstrSpan Span(MII, &B.getMBB()); |
| 1411 | |
| 1412 | for (int i = 0; i < NumLoads; ++i) { |
| 1413 | if (NumLoads == 1) { |
| 1414 | LoadParts[i] = Dst; |
| 1415 | } else { |
| 1416 | LoadParts[i] = MRI.createGenericVirtualRegister(Ty); |
| 1417 | MRI.setRegBank(Reg: LoadParts[i], RegBank: AMDGPU::VGPRRegBank); |
| 1418 | } |
| 1419 | |
| 1420 | MachineMemOperand *MMO = BaseMMO; |
| 1421 | if (i != 0) |
| 1422 | BaseMMO = MF.getMachineMemOperand(MMO: BaseMMO, Offset: MMOOffset + 16 * i, Size: MemSize); |
| 1423 | |
| 1424 | B.buildInstr(Opcode: getSBufferLoadCorrespondingBufferLoadOpcode(Opc: MI.getOpcode())) |
| 1425 | .addDef(RegNo: LoadParts[i]) // vdata |
| 1426 | .addUse(RegNo: RSrc) // rsrc |
| 1427 | .addUse(RegNo: VIndex) // vindex |
| 1428 | .addUse(RegNo: VOffset) // voffset |
| 1429 | .addUse(RegNo: SOffset) // soffset |
| 1430 | .addImm(Val: ImmOffset + 16 * i) // offset(imm) |
| 1431 | .addImm(Val: 0) // cachepolicy, swizzled buffer(imm) |
| 1432 | .addImm(Val: 0) // idxen(imm) |
| 1433 | .addMemOperand(MMO); |
| 1434 | } |
| 1435 | |
| 1436 | // TODO: If only the resource is a VGPR, it may be better to execute the |
| 1437 | // scalar load in the waterfall loop if the resource is expected to frequently |
| 1438 | // be dynamically uniform. |
| 1439 | if (RSrcBank != &AMDGPU::SGPRRegBank) { |
| 1440 | // Remove the original instruction to avoid potentially confusing the |
| 1441 | // waterfall loop logic. |
| 1442 | B.setInstr(*Span.begin()); |
| 1443 | MI.eraseFromParent(); |
| 1444 | |
| 1445 | SmallSet<Register, 4> OpsToWaterfall; |
| 1446 | |
| 1447 | OpsToWaterfall.insert(V: RSrc); |
| 1448 | executeInWaterfallLoop(B, Range: make_range(x: Span.begin(), y: Span.end()), |
| 1449 | SGPROperandRegs&: OpsToWaterfall); |
| 1450 | } |
| 1451 | |
| 1452 | if (NumLoads != 1) { |
| 1453 | if (Ty.isVector()) |
| 1454 | B.buildConcatVectors(Res: Dst, Ops: LoadParts); |
| 1455 | else |
| 1456 | B.buildMergeLikeInstr(Res: Dst, Ops: LoadParts); |
| 1457 | } |
| 1458 | |
| 1459 | // The original instruction was already erased above in the waterfall loop case. |
| 1460 | if (RSrcBank == &AMDGPU::SGPRRegBank) |
| 1461 | MI.eraseFromParent(); |
| 1462 | |
| 1463 | return true; |
| 1464 | } |
| 1465 | |
| 1466 | bool AMDGPURegisterBankInfo::applyMappingBFE(MachineIRBuilder &B, |
| 1467 | const OperandsMapper &OpdMapper, |
| 1468 | bool Signed) const { |
| 1469 | MachineInstr &MI = OpdMapper.getMI(); |
| 1470 | MachineRegisterInfo &MRI = OpdMapper.getMRI(); |
| 1471 | |
| 1472 | // Insert basic copies |
| 1473 | applyDefaultMapping(OpdMapper); |
| 1474 | |
| 1475 | Register DstReg = MI.getOperand(i: 0).getReg(); |
| 1476 | LLT Ty = MRI.getType(Reg: DstReg); |
| 1477 | |
| 1478 | const LLT S32 = LLT::scalar(SizeInBits: 32); |
| 1479 | |
| 1480 | unsigned FirstOpnd = isa<GIntrinsic>(Val: MI) ? 2 : 1; |
| 1481 | Register SrcReg = MI.getOperand(i: FirstOpnd).getReg(); |
| 1482 | Register OffsetReg = MI.getOperand(i: FirstOpnd + 1).getReg(); |
| 1483 | Register WidthReg = MI.getOperand(i: FirstOpnd + 2).getReg(); |
| 1484 | |
| 1485 | const RegisterBank *DstBank = |
| 1486 | OpdMapper.getInstrMapping().getOperandMapping(i: 0).BreakDown[0].RegBank; |
| 1487 | if (DstBank == &AMDGPU::VGPRRegBank) { |
| 1488 | if (Ty == S32) |
| 1489 | return true; |
| 1490 | |
| 1491 | // There are no 64-bit VGPR bitfield extract instructions, so the operation |
| 1492 | // is expanded into a sequence of instructions that implements it. |
| 1493 | ApplyRegBankMapping ApplyBank(B, *this, MRI, &AMDGPU::VGPRRegBank); |
| 1494 | |
| 1495 | const LLT S64 = LLT::scalar(SizeInBits: 64); |
| 1496 | // Shift the source operand so that extracted bits start at bit 0. |
| 1497 | auto ShiftOffset = Signed ? B.buildAShr(Dst: S64, Src0: SrcReg, Src1: OffsetReg) |
| 1498 | : B.buildLShr(Dst: S64, Src0: SrcReg, Src1: OffsetReg); |
| 1499 | auto UnmergeSOffset = B.buildUnmerge(Res: {S32, S32}, Op: ShiftOffset); |
| 1500 | |
| 1501 | // A 64-bit bitfield extract uses the 32-bit bitfield extract instructions |
| 1502 | // if the width is a constant. |
| 1503 | if (auto ConstWidth = getIConstantVRegValWithLookThrough(VReg: WidthReg, MRI)) { |
| 1504 | // Use the 32-bit bitfield extract instruction if the width is a constant. |
| 1505 | // Depending on the width size, use either the low or high 32-bits. |
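| | // For example, a constant width of 40 keeps the low half of the shifted |
| | // source as-is and extracts the remaining 40 - 32 = 8 bits from the high half. |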
| 1506 | auto Zero = B.buildConstant(Res: S32, Val: 0); |
| 1507 | auto WidthImm = ConstWidth->Value.getZExtValue(); |
| 1508 | if (WidthImm <= 32) { |
| 1509 | // Use bitfield extract on the lower 32-bit source, and then sign-extend |
| 1510 | // or clear the upper 32-bits. |
| 1511 | auto Extract = |
| 1512 | Signed ? B.buildSbfx(Dst: S32, Src: UnmergeSOffset.getReg(Idx: 0), LSB: Zero, Width: WidthReg) |
| 1513 | : B.buildUbfx(Dst: S32, Src: UnmergeSOffset.getReg(Idx: 0), LSB: Zero, Width: WidthReg); |
| 1514 | auto Extend = |
| 1515 | Signed ? B.buildAShr(Dst: S32, Src0: Extract, Src1: B.buildConstant(Res: S32, Val: 31)) : Zero; |
| 1516 | B.buildMergeLikeInstr(Res: DstReg, Ops: {Extract, Extend}); |
| 1517 | } else { |
| 1518 | // Use bitfield extract on upper 32-bit source, and combine with lower |
| 1519 | // 32-bit source. |
| 1520 | auto UpperWidth = B.buildConstant(Res: S32, Val: WidthImm - 32); |
| 1521 | auto Extract = |
| 1522 | Signed |
| 1523 | ? B.buildSbfx(Dst: S32, Src: UnmergeSOffset.getReg(Idx: 1), LSB: Zero, Width: UpperWidth) |
| 1524 | : B.buildUbfx(Dst: S32, Src: UnmergeSOffset.getReg(Idx: 1), LSB: Zero, Width: UpperWidth); |
| 1525 | B.buildMergeLikeInstr(Res: DstReg, Ops: {UnmergeSOffset.getReg(Idx: 0), Extract}); |
| 1526 | } |
| 1527 | MI.eraseFromParent(); |
| 1528 | return true; |
| 1529 | } |
| 1530 | |
| 1531 | // Expand to Src >> Offset << (64 - Width) >> (64 - Width) using 64-bit |
| 1532 | // operations. |
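| | // For example, with Offset = 4 and Width = 20: shift right by 4, left by |
| | // 64 - 20 = 44, then right by 44 again (arithmetic if signed, logical |
| | // otherwise), leaving the 20 extracted bits sign- or zero-extended. |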
| 1533 | auto ExtShift = B.buildSub(Dst: S32, Src0: B.buildConstant(Res: S32, Val: 64), Src1: WidthReg); |
| 1534 | auto SignBit = B.buildShl(Dst: S64, Src0: ShiftOffset, Src1: ExtShift); |
| 1535 | if (Signed) |
| 1536 | B.buildAShr(Dst: S64, Src0: SignBit, Src1: ExtShift); |
| 1537 | else |
| 1538 | B.buildLShr(Dst: S64, Src0: SignBit, Src1: ExtShift); |
| 1539 | MI.eraseFromParent(); |
| 1540 | return true; |
| 1541 | } |
| 1542 | |
| 1543 | // The scalar form packs the offset and width in a single operand. |
| 1544 | |
| 1545 | ApplyRegBankMapping ApplyBank(B, *this, MRI, &AMDGPU::SGPRRegBank); |
| 1546 | |
| 1547 | // Ensure the high bits are clear to insert the offset. |
| 1548 | auto OffsetMask = B.buildConstant(Res: S32, Val: maskTrailingOnes<unsigned>(N: 6)); |
| 1549 | auto ClampOffset = B.buildAnd(Dst: S32, Src0: OffsetReg, Src1: OffsetMask); |
| 1550 | |
| 1551 | // The shift zeroes out the low bits, so don't bother clamping the width input. |
| 1552 | auto ShiftWidth = B.buildShl(Dst: S32, Src0: WidthReg, Src1: B.buildConstant(Res: S32, Val: 16)); |
| 1553 | |
| 1554 | // Pack the offset and width of the BFE into the format expected by |
| 1555 | // S_BFE_I32 / S_BFE_U32: in the second source operand, bits [5:0] hold the |
| 1556 | // offset and bits [22:16] the width. |
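| | // For instance, an offset of 8 and a width of 16 pack to |
| | // (16 << 16) | 8 = 0x00100008. |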
| 1557 | auto MergedInputs = B.buildOr(Dst: S32, Src0: ClampOffset, Src1: ShiftWidth); |
| 1558 | |
| 1559 | // TODO: It might be worth using a pseudo here to avoid scc clobber and |
| 1560 | // register class constraints. |
| 1561 | unsigned Opc = Ty == S32 ? (Signed ? AMDGPU::S_BFE_I32 : AMDGPU::S_BFE_U32) : |
| 1562 | (Signed ? AMDGPU::S_BFE_I64 : AMDGPU::S_BFE_U64); |
| 1563 | |
| 1564 | auto MIB = B.buildInstr(Opc, DstOps: {DstReg}, SrcOps: {SrcReg, MergedInputs}); |
| 1565 | if (!constrainSelectedInstRegOperands(I&: *MIB, TII: *TII, TRI: *TRI, RBI: *this)) |
| 1566 | llvm_unreachable("failed to constrain BFE" ); |
| 1567 | |
| 1568 | MI.eraseFromParent(); |
| 1569 | return true; |
| 1570 | } |
| 1571 | |
| 1572 | bool AMDGPURegisterBankInfo::applyMappingMAD_64_32( |
| 1573 | MachineIRBuilder &B, const OperandsMapper &OpdMapper) const { |
| 1574 | MachineInstr &MI = OpdMapper.getMI(); |
| 1575 | MachineRegisterInfo &MRI = OpdMapper.getMRI(); |
| 1576 | |
| 1577 | // Insert basic copies. |
| 1578 | applyDefaultMapping(OpdMapper); |
| 1579 | |
| 1580 | Register Dst0 = MI.getOperand(i: 0).getReg(); |
| 1581 | Register Dst1 = MI.getOperand(i: 1).getReg(); |
| 1582 | Register Src0 = MI.getOperand(i: 2).getReg(); |
| 1583 | Register Src1 = MI.getOperand(i: 3).getReg(); |
| 1584 | Register Src2 = MI.getOperand(i: 4).getReg(); |
| 1585 | |
| 1586 | if (MRI.getRegBankOrNull(Reg: Src0) == &AMDGPU::VGPRRegBank) |
| 1587 | return true; |
| 1588 | |
| 1589 | bool IsUnsigned = MI.getOpcode() == AMDGPU::G_AMDGPU_MAD_U64_U32; |
| 1590 | LLT S1 = LLT::scalar(SizeInBits: 1); |
| 1591 | LLT S32 = LLT::scalar(SizeInBits: 32); |
| 1592 | |
| 1593 | bool DstOnValu = MRI.getRegBankOrNull(Reg: Src2) == &AMDGPU::VGPRRegBank; |
| 1594 | bool Accumulate = true; |
| 1595 | |
| 1596 | if (!DstOnValu) { |
| 1597 | if (mi_match(R: Src2, MRI, P: m_ZeroInt())) |
| 1598 | Accumulate = false; |
| 1599 | } |
| 1600 | |
| 1601 | // Keep the multiplication on the SALU. |
| 1602 | Register DstHi; |
| 1603 | Register DstLo = B.buildMul(Dst: S32, Src0, Src1).getReg(Idx: 0); |
| 1604 | bool MulHiInVgpr = false; |
| 1605 | |
| 1606 | MRI.setRegBank(Reg: DstLo, RegBank: AMDGPU::SGPRRegBank); |
| 1607 | |
| 1608 | if (Subtarget.hasSMulHi()) { |
| 1609 | DstHi = IsUnsigned ? B.buildUMulH(Dst: S32, Src0, Src1).getReg(Idx: 0) |
| 1610 | : B.buildSMulH(Dst: S32, Src0, Src1).getReg(Idx: 0); |
| 1611 | MRI.setRegBank(Reg: DstHi, RegBank: AMDGPU::SGPRRegBank); |
| 1612 | } else { |
| 1613 | Register VSrc0 = B.buildCopy(Res: S32, Op: Src0).getReg(Idx: 0); |
| 1614 | Register VSrc1 = B.buildCopy(Res: S32, Op: Src1).getReg(Idx: 0); |
| 1615 | |
| 1616 | MRI.setRegBank(Reg: VSrc0, RegBank: AMDGPU::VGPRRegBank); |
| 1617 | MRI.setRegBank(Reg: VSrc1, RegBank: AMDGPU::VGPRRegBank); |
| 1618 | |
| 1619 | DstHi = IsUnsigned ? B.buildUMulH(Dst: S32, Src0: VSrc0, Src1: VSrc1).getReg(Idx: 0) |
| 1620 | : B.buildSMulH(Dst: S32, Src0: VSrc0, Src1: VSrc1).getReg(Idx: 0); |
| 1621 | MRI.setRegBank(Reg: DstHi, RegBank: AMDGPU::VGPRRegBank); |
| 1622 | |
| 1623 | if (!DstOnValu) { |
| 1624 | DstHi = buildReadFirstLane(B, MRI, Src: DstHi); |
| 1625 | } else { |
| 1626 | MulHiInVgpr = true; |
| 1627 | } |
| 1628 | } |
| 1629 | |
| 1630 | // Accumulate and produce the "carry-out" bit. |
| 1631 | // |
| 1632 | // The "carry-out" is defined as bit 64 of the result when computed as a |
| 1633 | // big integer. For unsigned multiply-add, this matches the usual definition |
| 1634 | // of carry-out. For signed multiply-add, bit 64 is the sign bit of the |
| 1635 | // result, which is determined as: |
| 1636 | // sign(Src0 * Src1) + sign(Src2) + carry-out from unsigned 64-bit add |
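| | // Since bit 64 is a single bit, the sum of those three terms modulo 2 is |
| | // simply their XOR, which is how the signed case combines them below. |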
| 1637 | LLT CarryType = DstOnValu ? S1 : S32; |
| 1638 | const RegisterBank &CarryBank = |
| 1639 | DstOnValu ? AMDGPU::VCCRegBank : AMDGPU::SGPRRegBank; |
| 1640 | const RegisterBank &DstBank = |
| 1641 | DstOnValu ? AMDGPU::VGPRRegBank : AMDGPU::SGPRRegBank; |
| 1642 | Register Carry; |
| 1643 | Register Zero; |
| 1644 | |
| 1645 | if (!IsUnsigned) { |
| 1646 | Zero = B.buildConstant(Res: S32, Val: 0).getReg(Idx: 0); |
| 1647 | MRI.setRegBank(Reg: Zero, |
| 1648 | RegBank: MulHiInVgpr ? AMDGPU::VGPRRegBank : AMDGPU::SGPRRegBank); |
| 1649 | |
| 1650 | Carry = B.buildICmp(Pred: CmpInst::ICMP_SLT, Res: MulHiInVgpr ? S1 : S32, Op0: DstHi, Op1: Zero) |
| 1651 | .getReg(Idx: 0); |
| 1652 | MRI.setRegBank(Reg: Carry, RegBank: MulHiInVgpr ? AMDGPU::VCCRegBank |
| 1653 | : AMDGPU::SGPRRegBank); |
| 1654 | |
| 1655 | if (DstOnValu && !MulHiInVgpr) { |
| 1656 | Carry = B.buildTrunc(Res: S1, Op: Carry).getReg(Idx: 0); |
| 1657 | MRI.setRegBank(Reg: Carry, RegBank: AMDGPU::VCCRegBank); |
| 1658 | } |
| 1659 | } |
| 1660 | |
| 1661 | if (Accumulate) { |
| 1662 | if (DstOnValu) { |
| 1663 | DstLo = B.buildCopy(Res: S32, Op: DstLo).getReg(Idx: 0); |
| 1664 | DstHi = B.buildCopy(Res: S32, Op: DstHi).getReg(Idx: 0); |
| 1665 | MRI.setRegBank(Reg: DstLo, RegBank: AMDGPU::VGPRRegBank); |
| 1666 | MRI.setRegBank(Reg: DstHi, RegBank: AMDGPU::VGPRRegBank); |
| 1667 | } |
| 1668 | |
| 1669 | auto Unmerge = B.buildUnmerge(Res: S32, Op: Src2); |
| 1670 | Register Src2Lo = Unmerge.getReg(Idx: 0); |
| 1671 | Register Src2Hi = Unmerge.getReg(Idx: 1); |
| 1672 | MRI.setRegBank(Reg: Src2Lo, RegBank: DstBank); |
| 1673 | MRI.setRegBank(Reg: Src2Hi, RegBank: DstBank); |
| 1674 | |
| 1675 | if (!IsUnsigned) { |
| 1676 | auto Src2Sign = B.buildICmp(Pred: CmpInst::ICMP_SLT, Res: CarryType, Op0: Src2Hi, Op1: Zero); |
| 1677 | MRI.setRegBank(Reg: Src2Sign.getReg(Idx: 0), RegBank: CarryBank); |
| 1678 | |
| 1679 | Carry = B.buildXor(Dst: CarryType, Src0: Carry, Src1: Src2Sign).getReg(Idx: 0); |
| 1680 | MRI.setRegBank(Reg: Carry, RegBank: CarryBank); |
| 1681 | } |
| 1682 | |
| 1683 | auto AddLo = B.buildUAddo(Res: S32, CarryOut: CarryType, Op0: DstLo, Op1: Src2Lo); |
| 1684 | DstLo = AddLo.getReg(Idx: 0); |
| 1685 | Register CarryLo = AddLo.getReg(Idx: 1); |
| 1686 | MRI.setRegBank(Reg: DstLo, RegBank: DstBank); |
| 1687 | MRI.setRegBank(Reg: CarryLo, RegBank: CarryBank); |
| 1688 | |
| 1689 | auto AddHi = B.buildUAdde(Res: S32, CarryOut: CarryType, Op0: DstHi, Op1: Src2Hi, CarryIn: CarryLo); |
| 1690 | DstHi = AddHi.getReg(Idx: 0); |
| 1691 | MRI.setRegBank(Reg: DstHi, RegBank: DstBank); |
| 1692 | |
| 1693 | Register CarryHi = AddHi.getReg(Idx: 1); |
| 1694 | MRI.setRegBank(Reg: CarryHi, RegBank: CarryBank); |
| 1695 | |
| 1696 | if (IsUnsigned) { |
| 1697 | Carry = CarryHi; |
| 1698 | } else { |
| 1699 | Carry = B.buildXor(Dst: CarryType, Src0: Carry, Src1: CarryHi).getReg(Idx: 0); |
| 1700 | MRI.setRegBank(Reg: Carry, RegBank: CarryBank); |
| 1701 | } |
| 1702 | } else { |
| 1703 | if (IsUnsigned) { |
| 1704 | Carry = B.buildConstant(Res: CarryType, Val: 0).getReg(Idx: 0); |
| 1705 | MRI.setRegBank(Reg: Carry, RegBank: CarryBank); |
| 1706 | } |
| 1707 | } |
| 1708 | |
| 1709 | B.buildMergeLikeInstr(Res: Dst0, Ops: {DstLo, DstHi}); |
| 1710 | |
| 1711 | if (DstOnValu) { |
| 1712 | B.buildCopy(Res: Dst1, Op: Carry); |
| 1713 | } else { |
| 1714 | B.buildTrunc(Res: Dst1, Op: Carry); |
| 1715 | } |
| 1716 | |
| 1717 | MI.eraseFromParent(); |
| 1718 | return true; |
| 1719 | } |
| 1720 | |
| 1721 | // Return a suitable opcode for extending the operands of Opc when widening. |
| 1722 | static unsigned getExtendOp(unsigned Opc) { |
| 1723 | switch (Opc) { |
| 1724 | case TargetOpcode::G_ASHR: |
| 1725 | case TargetOpcode::G_SMIN: |
| 1726 | case TargetOpcode::G_SMAX: |
| 1727 | return TargetOpcode::G_SEXT; |
| 1728 | case TargetOpcode::G_LSHR: |
| 1729 | case TargetOpcode::G_UMIN: |
| 1730 | case TargetOpcode::G_UMAX: |
| 1731 | return TargetOpcode::G_ZEXT; |
| 1732 | default: |
| 1733 | return TargetOpcode::G_ANYEXT; |
| 1734 | } |
| 1735 | } |
| 1736 | |
| 1737 | // Emit a legalized extension from <2 x s16> to 2 32-bit components, avoiding |
| 1738 | // any illegal vector extend or unmerge operations. |
| 1739 | static std::pair<Register, Register> |
| 1740 | unpackV2S16ToS32(MachineIRBuilder &B, Register Src, unsigned ExtOpcode) { |
| 1741 | const LLT S32 = LLT::scalar(SizeInBits: 32); |
| 1742 | auto Bitcast = B.buildBitcast(Dst: S32, Src); |
| 1743 | |
| 1744 | if (ExtOpcode == TargetOpcode::G_SEXT) { |
| 1745 | auto ExtLo = B.buildSExtInReg(Res: S32, Op: Bitcast, ImmOp: 16); |
| 1746 | auto ShiftHi = B.buildAShr(Dst: S32, Src0: Bitcast, Src1: B.buildConstant(Res: S32, Val: 16)); |
| 1747 | return std::pair(ExtLo.getReg(Idx: 0), ShiftHi.getReg(Idx: 0)); |
| 1748 | } |
| 1749 | |
| 1750 | auto ShiftHi = B.buildLShr(Dst: S32, Src0: Bitcast, Src1: B.buildConstant(Res: S32, Val: 16)); |
| 1751 | if (ExtOpcode == TargetOpcode::G_ZEXT) { |
| 1752 | auto ExtLo = B.buildAnd(Dst: S32, Src0: Bitcast, Src1: B.buildConstant(Res: S32, Val: 0xffff)); |
| 1753 | return std::pair(ExtLo.getReg(Idx: 0), ShiftHi.getReg(Idx: 0)); |
| 1754 | } |
| 1755 | |
| 1756 | assert(ExtOpcode == TargetOpcode::G_ANYEXT); |
| 1757 | return std::pair(Bitcast.getReg(Idx: 0), ShiftHi.getReg(Idx: 0)); |
| 1758 | } |
| 1759 | |
| 1760 | // For cases where only a single copy is inserted for matching register banks, |
| 1761 | // replace the register in the instruction operand. |
| 1762 | static bool substituteSimpleCopyRegs( |
| 1763 | const AMDGPURegisterBankInfo::OperandsMapper &OpdMapper, unsigned OpIdx) { |
| 1764 | SmallVector<unsigned, 1> SrcReg(OpdMapper.getVRegs(OpIdx)); |
| 1765 | if (!SrcReg.empty()) { |
| 1766 | assert(SrcReg.size() == 1); |
| 1767 | OpdMapper.getMI().getOperand(i: OpIdx).setReg(SrcReg[0]); |
| 1768 | return true; |
| 1769 | } |
| 1770 | |
| 1771 | return false; |
| 1772 | } |
| 1773 | |
| 1774 | /// Handle register layout difference for f16 images for some subtargets. |
| 1775 | Register AMDGPURegisterBankInfo::handleD16VData(MachineIRBuilder &B, |
| 1776 | MachineRegisterInfo &MRI, |
| 1777 | Register Reg) const { |
| 1778 | if (!Subtarget.hasUnpackedD16VMem()) |
| 1779 | return Reg; |
| 1780 | |
| 1781 | const LLT S16 = LLT::scalar(SizeInBits: 16); |
| 1782 | LLT StoreVT = MRI.getType(Reg); |
| 1783 | if (!StoreVT.isVector() || StoreVT.getElementType() != S16) |
| 1784 | return Reg; |
| 1785 | |
| 1786 | auto Unmerge = B.buildUnmerge(Res: S16, Op: Reg); |
| 1787 | |
| 1789 | SmallVector<Register, 4> WideRegs; |
| 1790 | for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I) |
| 1791 | WideRegs.push_back(Elt: Unmerge.getReg(Idx: I)); |
| 1792 | |
| 1793 | const LLT S32 = LLT::scalar(SizeInBits: 32); |
| 1794 | int NumElts = StoreVT.getNumElements(); |
| 1795 | |
| 1796 | return B.buildMergeLikeInstr(Res: LLT::fixed_vector(NumElements: NumElts, ScalarTy: S32), Ops: WideRegs) |
| 1797 | .getReg(Idx: 0); |
| 1798 | } |
| 1799 | |
| 1800 | static std::pair<Register, unsigned> |
| 1801 | getBaseWithConstantOffset(MachineRegisterInfo &MRI, Register Reg) { |
| 1802 | int64_t Const; |
| 1803 | if (mi_match(R: Reg, MRI, P: m_ICst(Cst&: Const))) |
| 1804 | return std::pair(Register(), Const); |
| 1805 | |
| 1806 | Register Base; |
| 1807 | if (mi_match(R: Reg, MRI, P: m_GAdd(L: m_Reg(R&: Base), R: m_ICst(Cst&: Const)))) |
| 1808 | return std::pair(Base, Const); |
| 1809 | |
| 1810 | // TODO: Handle G_OR used for add case |
| 1811 | return std::pair(Reg, 0); |
| 1812 | } |
| 1813 | |
| 1814 | std::pair<Register, unsigned> |
| 1815 | AMDGPURegisterBankInfo::splitBufferOffsets(MachineIRBuilder &B, |
| 1816 | Register OrigOffset) const { |
| 1817 | const unsigned MaxImm = SIInstrInfo::getMaxMUBUFImmOffset(ST: Subtarget); |
| 1818 | Register BaseReg; |
| 1819 | unsigned ImmOffset; |
| 1820 | const LLT S32 = LLT::scalar(SizeInBits: 32); |
| 1821 | |
| 1822 | // TODO: Use AMDGPU::getBaseWithConstantOffset() instead. |
| 1823 | std::tie(args&: BaseReg, args&: ImmOffset) = getBaseWithConstantOffset(MRI&: *B.getMRI(), |
| 1824 | Reg: OrigOffset); |
| 1825 | |
| 1826 | unsigned C1 = 0; |
| 1827 | if (ImmOffset != 0) { |
| 1828 | // If the immediate value is too big for the immoffset field, put only bits |
| 1829 | // that would normally fit in the immoffset field. The remaining value that |
| 1830 | // is copied/added for the voffset field is a large power of 2, and it |
| 1831 | // stands more chance of being CSEd with the copy/add for another similar |
| 1832 | // load/store. |
| 1833 | // However, do not do that rounding down if that is a negative |
| 1834 | // number, as it appears to be illegal to have a negative offset in the |
| 1835 | // vgpr, even if adding the immediate offset makes it positive. |
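| | // Illustration (assuming a 4095 maximum immediate, which is subtarget |
| | // dependent): an offset of 4100 keeps ImmOffset = 4 and folds the remaining |
| | // 4096 into the base register. |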
| 1836 | unsigned Overflow = ImmOffset & ~MaxImm; |
| 1837 | ImmOffset -= Overflow; |
| 1838 | if ((int32_t)Overflow < 0) { |
| 1839 | Overflow += ImmOffset; |
| 1840 | ImmOffset = 0; |
| 1841 | } |
| 1842 | |
| 1843 | C1 = ImmOffset; |
| 1844 | if (Overflow != 0) { |
| 1845 | if (!BaseReg) |
| 1846 | BaseReg = B.buildConstant(Res: S32, Val: Overflow).getReg(Idx: 0); |
| 1847 | else { |
| 1848 | auto OverflowVal = B.buildConstant(Res: S32, Val: Overflow); |
| 1849 | BaseReg = B.buildAdd(Dst: S32, Src0: BaseReg, Src1: OverflowVal).getReg(Idx: 0); |
| 1850 | } |
| 1851 | } |
| 1852 | } |
| 1853 | |
| 1854 | if (!BaseReg) |
| 1855 | BaseReg = B.buildConstant(Res: S32, Val: 0).getReg(Idx: 0); |
| 1856 | |
| 1857 | return {BaseReg, C1}; |
| 1858 | } |
| 1859 | |
| 1860 | bool AMDGPURegisterBankInfo::buildVCopy(MachineIRBuilder &B, Register DstReg, |
| 1861 | Register SrcReg) const { |
| 1862 | MachineRegisterInfo &MRI = *B.getMRI(); |
| 1863 | LLT SrcTy = MRI.getType(Reg: SrcReg); |
| 1864 | if (SrcTy.getSizeInBits() == 32) { |
| 1865 | // Use a v_mov_b32 here to make the exec dependency explicit. |
| 1866 | B.buildInstr(Opcode: AMDGPU::V_MOV_B32_e32) |
| 1867 | .addDef(RegNo: DstReg) |
| 1868 | .addUse(RegNo: SrcReg); |
| 1869 | return constrainGenericRegister(Reg: DstReg, RC: AMDGPU::VGPR_32RegClass, MRI) && |
| 1870 | constrainGenericRegister(Reg: SrcReg, RC: AMDGPU::SReg_32RegClass, MRI); |
| 1871 | } |
| 1872 | |
| 1873 | Register TmpReg0 = MRI.createVirtualRegister(RegClass: &AMDGPU::VGPR_32RegClass); |
| 1874 | Register TmpReg1 = MRI.createVirtualRegister(RegClass: &AMDGPU::VGPR_32RegClass); |
| 1875 | |
| 1876 | B.buildInstr(Opcode: AMDGPU::V_MOV_B32_e32) |
| 1877 | .addDef(RegNo: TmpReg0) |
| 1878 | .addUse(RegNo: SrcReg, Flags: {}, SubReg: AMDGPU::sub0); |
| 1879 | B.buildInstr(Opcode: AMDGPU::V_MOV_B32_e32) |
| 1880 | .addDef(RegNo: TmpReg1) |
| 1881 | .addUse(RegNo: SrcReg, Flags: {}, SubReg: AMDGPU::sub1); |
| 1882 | B.buildInstr(Opcode: AMDGPU::REG_SEQUENCE) |
| 1883 | .addDef(RegNo: DstReg) |
| 1884 | .addUse(RegNo: TmpReg0) |
| 1885 | .addImm(Val: AMDGPU::sub0) |
| 1886 | .addUse(RegNo: TmpReg1) |
| 1887 | .addImm(Val: AMDGPU::sub1); |
| 1888 | |
| 1889 | return constrainGenericRegister(Reg: SrcReg, RC: AMDGPU::SReg_64RegClass, MRI) && |
| 1890 | constrainGenericRegister(Reg: DstReg, RC: AMDGPU::VReg_64RegClass, MRI); |
| 1891 | } |
| 1892 | |
| 1893 | /// Utility function for pushing dynamic vector indexes with a constant offset |
| 1894 | /// into waterfall loops. |
| 1895 | static void reinsertVectorIndexAdd(MachineIRBuilder &B, |
| 1896 | MachineInstr &IdxUseInstr, |
| 1897 | unsigned OpIdx, |
| 1898 | unsigned ConstOffset) { |
| 1899 | MachineRegisterInfo &MRI = *B.getMRI(); |
| 1900 | const LLT S32 = LLT::scalar(SizeInBits: 32); |
| 1901 | Register WaterfallIdx = IdxUseInstr.getOperand(i: OpIdx).getReg(); |
| 1902 | B.setInsertPt(MBB&: *IdxUseInstr.getParent(), II: IdxUseInstr.getIterator()); |
| 1903 | |
| 1904 | auto MaterializedOffset = B.buildConstant(Res: S32, Val: ConstOffset); |
| 1905 | |
| 1906 | auto Add = B.buildAdd(Dst: S32, Src0: WaterfallIdx, Src1: MaterializedOffset); |
| 1907 | MRI.setRegBank(Reg: MaterializedOffset.getReg(Idx: 0), RegBank: AMDGPU::SGPRRegBank); |
| 1908 | MRI.setRegBank(Reg: Add.getReg(Idx: 0), RegBank: AMDGPU::SGPRRegBank); |
| 1909 | IdxUseInstr.getOperand(i: OpIdx).setReg(Add.getReg(Idx: 0)); |
| 1910 | } |
| 1911 | |
| 1912 | /// Implement extending a 32-bit value to a 64-bit value. \p Lo32Reg is the |
| 1913 | /// original 32-bit source value (to be inserted in the low part of the combined |
| 1914 | /// 64-bit result), and \p Hi32Reg is the high half of the combined 64-bit |
| 1915 | /// value. |
| 1916 | static void extendLow32IntoHigh32(MachineIRBuilder &B, |
| 1917 | Register Hi32Reg, Register Lo32Reg, |
| 1918 | unsigned ExtOpc, |
| 1919 | const RegisterBank &RegBank, |
| 1920 | bool IsBooleanSrc = false) { |
| 1921 | if (ExtOpc == AMDGPU::G_ZEXT) { |
| 1922 | B.buildConstant(Res: Hi32Reg, Val: 0); |
| 1923 | } else if (ExtOpc == AMDGPU::G_SEXT) { |
| 1924 | if (IsBooleanSrc) { |
| 1925 | // If we know the original source was an s1, the high half is the same as |
| 1926 | // the low. |
| 1927 | B.buildCopy(Res: Hi32Reg, Op: Lo32Reg); |
| 1928 | } else { |
| 1929 | // Replicate sign bit from 32-bit extended part. |
| 1930 | auto ShiftAmt = B.buildConstant(Res: LLT::scalar(SizeInBits: 32), Val: 31); |
| 1931 | B.getMRI()->setRegBank(Reg: ShiftAmt.getReg(Idx: 0), RegBank); |
| 1932 | B.buildAShr(Dst: Hi32Reg, Src0: Lo32Reg, Src1: ShiftAmt); |
| 1933 | } |
| 1934 | } else { |
| 1935 | assert(ExtOpc == AMDGPU::G_ANYEXT && "not an integer extension"); |
| 1936 | B.buildUndef(Res: Hi32Reg); |
| 1937 | } |
| 1938 | } |
| 1939 | |
| 1940 | bool AMDGPURegisterBankInfo::foldExtractEltToCmpSelect( |
| 1941 | MachineIRBuilder &B, MachineInstr &MI, |
| 1942 | const OperandsMapper &OpdMapper) const { |
| 1943 | MachineRegisterInfo &MRI = *B.getMRI(); |
| 1944 | |
| 1945 | Register VecReg = MI.getOperand(i: 1).getReg(); |
| 1946 | Register Idx = MI.getOperand(i: 2).getReg(); |
| 1947 | |
| 1948 | const RegisterBank &IdxBank = |
| 1949 | *OpdMapper.getInstrMapping().getOperandMapping(i: 2).BreakDown[0].RegBank; |
| 1950 | |
| 1951 | bool IsDivergentIdx = IdxBank != AMDGPU::SGPRRegBank; |
| 1952 | |
| 1953 | LLT VecTy = MRI.getType(Reg: VecReg); |
| 1954 | unsigned EltSize = VecTy.getScalarSizeInBits(); |
| 1955 | unsigned NumElem = VecTy.getNumElements(); |
| 1956 | |
| 1957 | if (!SITargetLowering::shouldExpandVectorDynExt(EltSize, NumElem, |
| 1958 | IsDivergentIdx, Subtarget: &Subtarget)) |
| 1959 | return false; |
| 1960 | |
| 1961 | LLT S32 = LLT::scalar(SizeInBits: 32); |
| 1962 | |
| 1963 | const RegisterBank &DstBank = |
| 1964 | *OpdMapper.getInstrMapping().getOperandMapping(i: 0).BreakDown[0].RegBank; |
| 1965 | const RegisterBank &SrcBank = |
| 1966 | *OpdMapper.getInstrMapping().getOperandMapping(i: 1).BreakDown[0].RegBank; |
| 1967 | |
| 1968 | const RegisterBank &CCBank = |
| 1969 | (DstBank == AMDGPU::SGPRRegBank && |
| 1970 | SrcBank == AMDGPU::SGPRRegBank && |
| 1971 | IdxBank == AMDGPU::SGPRRegBank) ? AMDGPU::SGPRRegBank |
| 1972 | : AMDGPU::VCCRegBank; |
| 1973 | LLT CCTy = (CCBank == AMDGPU::SGPRRegBank) ? S32 : LLT::scalar(SizeInBits: 1); |
| 1974 | |
| 1975 | if (CCBank == AMDGPU::VCCRegBank && IdxBank == AMDGPU::SGPRRegBank) { |
| 1976 | Idx = B.buildCopy(Res: S32, Op: Idx)->getOperand(i: 0).getReg(); |
| 1977 | MRI.setRegBank(Reg: Idx, RegBank: AMDGPU::VGPRRegBank); |
| 1978 | } |
| 1979 | |
| 1980 | LLT EltTy = VecTy.getScalarType(); |
| 1981 | SmallVector<Register, 2> DstRegs(OpdMapper.getVRegs(OpIdx: 0)); |
| 1982 | unsigned NumLanes = DstRegs.size(); |
| 1983 | if (!NumLanes) |
| 1984 | NumLanes = 1; |
| 1985 | else |
| 1986 | EltTy = MRI.getType(Reg: DstRegs[0]); |
| 1987 | |
| 1988 | auto UnmergeToEltTy = B.buildUnmerge(Res: EltTy, Op: VecReg); |
| 1989 | SmallVector<Register, 2> Res(NumLanes); |
| 1990 | for (unsigned L = 0; L < NumLanes; ++L) |
| 1991 | Res[L] = UnmergeToEltTy.getReg(Idx: L); |
| 1992 | |
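| | // The dynamic extract becomes a chain of compares and selects: start from |
| | // element 0, then for each remaining element I select it when Idx == I. |
| | // A 4-element vector therefore needs three compare/select steps. |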
| 1993 | for (unsigned I = 1; I < NumElem; ++I) { |
| 1994 | auto IC = B.buildConstant(Res: S32, Val: I); |
| 1995 | MRI.setRegBank(Reg: IC->getOperand(i: 0).getReg(), RegBank: AMDGPU::SGPRRegBank); |
| 1996 | auto Cmp = B.buildICmp(Pred: CmpInst::ICMP_EQ, Res: CCTy, Op0: Idx, Op1: IC); |
| 1997 | MRI.setRegBank(Reg: Cmp->getOperand(i: 0).getReg(), RegBank: CCBank); |
| 1998 | |
| 1999 | for (unsigned L = 0; L < NumLanes; ++L) { |
| 2000 | auto S = B.buildSelect(Res: EltTy, Tst: Cmp, |
| 2001 | Op0: UnmergeToEltTy.getReg(Idx: I * NumLanes + L), Op1: Res[L]); |
| 2002 | |
| 2003 | for (unsigned N : { 0, 2, 3 }) |
| 2004 | MRI.setRegBank(Reg: S->getOperand(i: N).getReg(), RegBank: DstBank); |
| 2005 | |
| 2006 | Res[L] = S->getOperand(i: 0).getReg(); |
| 2007 | } |
| 2008 | } |
| 2009 | |
| 2010 | for (unsigned L = 0; L < NumLanes; ++L) { |
| 2011 | Register DstReg = (NumLanes == 1) ? MI.getOperand(i: 0).getReg() : DstRegs[L]; |
| 2012 | B.buildCopy(Res: DstReg, Op: Res[L]); |
| 2013 | MRI.setRegBank(Reg: DstReg, RegBank: DstBank); |
| 2014 | } |
| 2015 | |
| 2016 | MRI.setRegBank(Reg: MI.getOperand(i: 0).getReg(), RegBank: DstBank); |
| 2017 | MI.eraseFromParent(); |
| 2018 | |
| 2019 | return true; |
| 2020 | } |
| 2021 | |
| 2022 | // Insert a cross regbank copy for a register if it already has a bank that |
| 2023 | // differs from the one we want to set. |
| 2024 | static Register constrainRegToBank(MachineRegisterInfo &MRI, |
| 2025 | MachineIRBuilder &B, Register &Reg, |
| 2026 | const RegisterBank &Bank) { |
| 2027 | const RegisterBank *CurrBank = MRI.getRegBankOrNull(Reg); |
| 2028 | if (CurrBank && *CurrBank != Bank) { |
| 2029 | Register Copy = B.buildCopy(Res: MRI.getType(Reg), Op: Reg).getReg(Idx: 0); |
| 2030 | MRI.setRegBank(Reg: Copy, RegBank: Bank); |
| 2031 | return Copy; |
| 2032 | } |
| 2033 | |
| 2034 | MRI.setRegBank(Reg, RegBank: Bank); |
| 2035 | return Reg; |
| 2036 | } |
| 2037 | |
| 2038 | bool AMDGPURegisterBankInfo::foldInsertEltToCmpSelect( |
| 2039 | MachineIRBuilder &B, MachineInstr &MI, |
| 2040 | const OperandsMapper &OpdMapper) const { |
| 2041 | |
| 2042 | MachineRegisterInfo &MRI = *B.getMRI(); |
| 2043 | Register VecReg = MI.getOperand(i: 1).getReg(); |
| 2044 | Register Idx = MI.getOperand(i: 3).getReg(); |
| 2045 | |
| 2046 | const RegisterBank &IdxBank = |
| 2047 | *OpdMapper.getInstrMapping().getOperandMapping(i: 3).BreakDown[0].RegBank; |
| 2048 | |
| 2049 | bool IsDivergentIdx = IdxBank != AMDGPU::SGPRRegBank; |
| 2050 | |
| 2051 | LLT VecTy = MRI.getType(Reg: VecReg); |
| 2052 | unsigned EltSize = VecTy.getScalarSizeInBits(); |
| 2053 | unsigned NumElem = VecTy.getNumElements(); |
| 2054 | |
| 2055 | if (!SITargetLowering::shouldExpandVectorDynExt(EltSize, NumElem, |
| 2056 | IsDivergentIdx, Subtarget: &Subtarget)) |
| 2057 | return false; |
| 2058 | |
| 2059 | LLT S32 = LLT::scalar(SizeInBits: 32); |
| 2060 | |
| 2061 | const RegisterBank &DstBank = |
| 2062 | *OpdMapper.getInstrMapping().getOperandMapping(i: 0).BreakDown[0].RegBank; |
| 2063 | const RegisterBank &SrcBank = |
| 2064 | *OpdMapper.getInstrMapping().getOperandMapping(i: 1).BreakDown[0].RegBank; |
| 2065 | const RegisterBank &InsBank = |
| 2066 | *OpdMapper.getInstrMapping().getOperandMapping(i: 2).BreakDown[0].RegBank; |
| 2067 | |
| 2068 | const RegisterBank &CCBank = |
| 2069 | (DstBank == AMDGPU::SGPRRegBank && |
| 2070 | SrcBank == AMDGPU::SGPRRegBank && |
| 2071 | InsBank == AMDGPU::SGPRRegBank && |
| 2072 | IdxBank == AMDGPU::SGPRRegBank) ? AMDGPU::SGPRRegBank |
| 2073 | : AMDGPU::VCCRegBank; |
| 2074 | LLT CCTy = (CCBank == AMDGPU::SGPRRegBank) ? S32 : LLT::scalar(SizeInBits: 1); |
| 2075 | |
| 2076 | if (CCBank == AMDGPU::VCCRegBank && IdxBank == AMDGPU::SGPRRegBank) { |
| 2077 | Idx = B.buildCopy(Res: S32, Op: Idx)->getOperand(i: 0).getReg(); |
| 2078 | MRI.setRegBank(Reg: Idx, RegBank: AMDGPU::VGPRRegBank); |
| 2079 | } |
| 2080 | |
| 2081 | LLT EltTy = VecTy.getScalarType(); |
| 2082 | SmallVector<Register, 2> InsRegs(OpdMapper.getVRegs(OpIdx: 2)); |
| 2083 | unsigned NumLanes = InsRegs.size(); |
| 2084 | if (!NumLanes) { |
| 2085 | NumLanes = 1; |
| 2086 | InsRegs.push_back(Elt: MI.getOperand(i: 2).getReg()); |
| 2087 | } else { |
| 2088 | EltTy = MRI.getType(Reg: InsRegs[0]); |
| 2089 | } |
| 2090 | |
| 2091 | auto UnmergeToEltTy = B.buildUnmerge(Res: EltTy, Op: VecReg); |
| 2092 | SmallVector<Register, 16> Ops(NumElem * NumLanes); |
| 2093 | |
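| | // Each result element I is rebuilt as (Idx == I) ? InsVal : Vec[I]; a |
| | // divergent insert into a 4-element vector thus emits four compare/select |
| | // steps before the vector is reassembled below. |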
| 2094 | for (unsigned I = 0; I < NumElem; ++I) { |
| 2095 | auto IC = B.buildConstant(Res: S32, Val: I); |
| 2096 | MRI.setRegBank(Reg: IC->getOperand(i: 0).getReg(), RegBank: AMDGPU::SGPRRegBank); |
| 2097 | auto Cmp = B.buildICmp(Pred: CmpInst::ICMP_EQ, Res: CCTy, Op0: Idx, Op1: IC); |
| 2098 | MRI.setRegBank(Reg: Cmp->getOperand(i: 0).getReg(), RegBank: CCBank); |
| 2099 | |
| 2100 | for (unsigned L = 0; L < NumLanes; ++L) { |
| 2101 | Register Op0 = constrainRegToBank(MRI, B, Reg&: InsRegs[L], Bank: DstBank); |
| 2102 | Register Op1 = UnmergeToEltTy.getReg(Idx: I * NumLanes + L); |
| 2103 | Op1 = constrainRegToBank(MRI, B, Reg&: Op1, Bank: DstBank); |
| 2104 | |
| 2105 | Register Select = B.buildSelect(Res: EltTy, Tst: Cmp, Op0, Op1).getReg(Idx: 0); |
| 2106 | MRI.setRegBank(Reg: Select, RegBank: DstBank); |
| 2107 | |
| 2108 | Ops[I * NumLanes + L] = Select; |
| 2109 | } |
| 2110 | } |
| 2111 | |
| 2112 | LLT MergeTy = LLT::fixed_vector(NumElements: Ops.size(), ScalarTy: EltTy); |
| 2113 | if (MergeTy == MRI.getType(Reg: MI.getOperand(i: 0).getReg())) { |
| 2114 | B.buildBuildVector(Res: MI.getOperand(i: 0), Ops); |
| 2115 | } else { |
| 2116 | auto Vec = B.buildBuildVector(Res: MergeTy, Ops); |
| 2117 | MRI.setRegBank(Reg: Vec->getOperand(i: 0).getReg(), RegBank: DstBank); |
| 2118 | B.buildBitcast(Dst: MI.getOperand(i: 0).getReg(), Src: Vec); |
| 2119 | } |
| 2120 | |
| 2121 | MRI.setRegBank(Reg: MI.getOperand(i: 0).getReg(), RegBank: DstBank); |
| 2122 | MI.eraseFromParent(); |
| 2123 | |
| 2124 | return true; |
| 2125 | } |
| 2126 | |
| 2127 | // Break s_mul_u64 into 32-bit vector operations. |
| 2128 | void AMDGPURegisterBankInfo::applyMappingSMULU64( |
| 2129 | MachineIRBuilder &B, const OperandsMapper &OpdMapper) const { |
| 2130 | SmallVector<Register, 2> DefRegs(OpdMapper.getVRegs(OpIdx: 0)); |
| 2131 | SmallVector<Register, 2> Src0Regs(OpdMapper.getVRegs(OpIdx: 1)); |
| 2132 | SmallVector<Register, 2> Src1Regs(OpdMapper.getVRegs(OpIdx: 2)); |
| 2133 | |
| 2134 | // All inputs are SGPRs, nothing special to do. |
| 2135 | if (DefRegs.empty()) { |
| 2136 | assert(Src0Regs.empty() && Src1Regs.empty()); |
| 2137 | applyDefaultMapping(OpdMapper); |
| 2138 | return; |
| 2139 | } |
| 2140 | |
| 2141 | assert(DefRegs.size() == 2); |
| 2142 | assert(Src0Regs.size() == Src1Regs.size() && |
| 2143 | (Src0Regs.empty() || Src0Regs.size() == 2)); |
| 2144 | |
| 2145 | MachineRegisterInfo &MRI = OpdMapper.getMRI(); |
| 2146 | MachineInstr &MI = OpdMapper.getMI(); |
| 2147 | Register DstReg = MI.getOperand(i: 0).getReg(); |
| 2148 | LLT HalfTy = LLT::scalar(SizeInBits: 32); |
| 2149 | |
| 2150 | // Depending on where the source registers came from, the generic code may |
| 2151 | // have decided to split the inputs already or not. If not, we still need to |
| 2152 | // extract the values. |
| 2153 | |
| 2154 | if (Src0Regs.empty()) |
| 2155 | split64BitValueForMapping(B, Regs&: Src0Regs, HalfTy, Reg: MI.getOperand(i: 1).getReg()); |
| 2156 | else |
| 2157 | setRegsToType(MRI, Regs: Src0Regs, NewTy: HalfTy); |
| 2158 | |
| 2159 | if (Src1Regs.empty()) |
| 2160 | split64BitValueForMapping(B, Regs&: Src1Regs, HalfTy, Reg: MI.getOperand(i: 2).getReg()); |
| 2161 | else |
| 2162 | setRegsToType(MRI, Regs: Src1Regs, NewTy: HalfTy); |
| 2163 | |
| 2164 | setRegsToType(MRI, Regs: DefRegs, NewTy: HalfTy); |
| 2165 | |
| 2166 | // The multiplication is done as follows: |
| 2167 | // |
| 2168 | // Op1H Op1L |
| 2169 | // * Op0H Op0L |
| 2170 | // -------------------- |
| 2171 | // Op1H*Op0L Op1L*Op0L |
| 2172 | // + Op1H*Op0H Op1L*Op0H |
| 2173 | // ----------------------------------------- |
| 2174 | // (Op1H*Op0L + Op1L*Op0H + carry) Op1L*Op0L |
| 2175 | // |
| 2176 | // We drop Op1H*Op0H because the result of the multiplication is a 64-bit |
| 2177 | // value and that would overflow. |
| 2178 | // The low 32-bit value is Op1L*Op0L. |
| 2179 | // The high 32-bit value is Op1H*Op0L + Op1L*Op0H + carry (from |
| 2180 | // Op1L*Op0L). |
| 2181 | |
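| | // Note the carry out of Op1L*Op0L is taken directly as the high half of |
| | // that product via G_UMULH below, rather than through an add-with-carry. |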
| 2182 | ApplyRegBankMapping ApplyBank(B, *this, MRI, &AMDGPU::VGPRRegBank); |
| 2183 | |
| 2184 | Register Hi = B.buildUMulH(Dst: HalfTy, Src0: Src0Regs[0], Src1: Src1Regs[0]).getReg(Idx: 0); |
| 2185 | Register MulLoHi = B.buildMul(Dst: HalfTy, Src0: Src0Regs[0], Src1: Src1Regs[1]).getReg(Idx: 0); |
| 2186 | Register Add = B.buildAdd(Dst: HalfTy, Src0: Hi, Src1: MulLoHi).getReg(Idx: 0); |
| 2187 | Register MulHiLo = B.buildMul(Dst: HalfTy, Src0: Src0Regs[1], Src1: Src1Regs[0]).getReg(Idx: 0); |
| 2188 | B.buildAdd(Dst: DefRegs[1], Src0: Add, Src1: MulHiLo); |
| 2189 | B.buildMul(Dst: DefRegs[0], Src0: Src0Regs[0], Src1: Src1Regs[0]); |
| 2190 | |
| 2191 | MRI.setRegBank(Reg: DstReg, RegBank: AMDGPU::VGPRRegBank); |
| 2192 | MI.eraseFromParent(); |
| 2193 | } |
| 2194 | |
| 2195 | void AMDGPURegisterBankInfo::applyMappingImpl( |
| 2196 | MachineIRBuilder &B, const OperandsMapper &OpdMapper) const { |
| 2197 | MachineInstr &MI = OpdMapper.getMI(); |
| 2198 | B.setInstrAndDebugLoc(MI); |
| 2199 | unsigned Opc = MI.getOpcode(); |
| 2200 | MachineRegisterInfo &MRI = OpdMapper.getMRI(); |
| 2201 | switch (Opc) { |
| 2202 | case AMDGPU::G_CONSTANT: |
| 2203 | case AMDGPU::G_IMPLICIT_DEF: { |
| 2204 | Register DstReg = MI.getOperand(i: 0).getReg(); |
| 2205 | LLT DstTy = MRI.getType(Reg: DstReg); |
| 2206 | if (DstTy != LLT::scalar(SizeInBits: 1)) |
| 2207 | break; |
| 2208 | |
| 2209 | const RegisterBank *DstBank = |
| 2210 | OpdMapper.getInstrMapping().getOperandMapping(i: 0).BreakDown[0].RegBank; |
| 2211 | if (DstBank == &AMDGPU::VCCRegBank) |
| 2212 | break; |
| 2213 | SmallVector<Register, 1> DefRegs(OpdMapper.getVRegs(OpIdx: 0)); |
| 2214 | if (DefRegs.empty()) |
| 2215 | DefRegs.push_back(Elt: DstReg); |
| 2216 | |
| 2217 | B.setInsertPt(MBB&: *MI.getParent(), II: ++MI.getIterator()); |
| 2218 | |
| 2219 | Register NewDstReg = MRI.createGenericVirtualRegister(Ty: LLT::scalar(SizeInBits: 32)); |
| 2220 | LLVMContext &Ctx = B.getMF().getFunction().getContext(); |
| 2221 | |
| 2222 | MI.getOperand(i: 0).setReg(NewDstReg); |
| 2223 | if (Opc != AMDGPU::G_IMPLICIT_DEF) { |
| 2224 | uint64_t ConstVal = MI.getOperand(i: 1).getCImm()->getZExtValue(); |
| 2225 | MI.getOperand(i: 1).setCImm( |
| 2226 | ConstantInt::get(Ty: IntegerType::getInt32Ty(C&: Ctx), V: ConstVal)); |
| 2227 | } |
| 2228 | |
| 2229 | MRI.setRegBank(Reg: NewDstReg, RegBank: *DstBank); |
| 2230 | B.buildTrunc(Res: DefRegs[0], Op: NewDstReg); |
| 2231 | return; |
| 2232 | } |
| 2233 | case AMDGPU::G_PHI: { |
| 2234 | Register DstReg = MI.getOperand(i: 0).getReg(); |
| 2235 | LLT DstTy = MRI.getType(Reg: DstReg); |
| 2236 | if (DstTy != LLT::scalar(SizeInBits: 1)) |
| 2237 | break; |
| 2238 | |
| 2239 | const LLT S32 = LLT::scalar(SizeInBits: 32); |
| 2240 | const RegisterBank *DstBank = |
| 2241 | OpdMapper.getInstrMapping().getOperandMapping(i: 0).BreakDown[0].RegBank; |
| 2242 | if (DstBank == &AMDGPU::VCCRegBank) { |
| 2243 | applyDefaultMapping(OpdMapper); |
| 2244 | // The standard handling only considers the result register bank for |
| 2245 | // phis. For VCC, blindly inserting a copy when the phi is lowered will |
| 2246 | // produce an invalid copy. We can only copy with some kind of compare to |
| 2247 | // get a vector boolean result. Insert a register bank copy that will be |
| 2248 | // correctly lowered to a compare. |
| 2249 | for (unsigned I = 1, E = MI.getNumOperands(); I != E; I += 2) { |
| 2250 | Register SrcReg = MI.getOperand(i: I).getReg(); |
| 2251 | const RegisterBank *SrcBank = getRegBank(Reg: SrcReg, MRI, TRI: *TRI); |
| 2252 | |
| 2253 | if (SrcBank != &AMDGPU::VCCRegBank) { |
| 2254 | MachineBasicBlock *SrcMBB = MI.getOperand(i: I + 1).getMBB(); |
| 2255 | B.setInsertPt(MBB&: *SrcMBB, II: SrcMBB->getFirstTerminator()); |
| 2256 | |
| 2257 | auto Copy = B.buildCopy(Res: LLT::scalar(SizeInBits: 1), Op: SrcReg); |
| 2258 | MRI.setRegBank(Reg: Copy.getReg(Idx: 0), RegBank: AMDGPU::VCCRegBank); |
| 2259 | MI.getOperand(i: I).setReg(Copy.getReg(Idx: 0)); |
| 2260 | } |
| 2261 | } |
| 2262 | |
| 2263 | return; |
| 2264 | } |
| 2265 | |
| 2266 | // Phi handling is strange and only considers the bank of the destination. |
| 2267 | substituteSimpleCopyRegs(OpdMapper, OpIdx: 0); |
| 2268 | |
| 2269 | // Promote SGPR/VGPR booleans to s32 |
| 2270 | ApplyRegBankMapping ApplyBank(B, *this, MRI, DstBank); |
| 2271 | B.setInsertPt(MBB&: B.getMBB(), II: MI); |
| 2272 | LegalizerHelper Helper(B.getMF(), ApplyBank, B); |
| 2273 | |
| 2274 | if (Helper.widenScalar(MI, TypeIdx: 0, WideTy: S32) != LegalizerHelper::Legalized) |
| 2275 | llvm_unreachable("widen scalar should have succeeded" ); |
| 2276 | |
| 2277 | return; |
| 2278 | } |
| 2279 | case AMDGPU::G_FCMP: |
| 2280 | if (!Subtarget.hasSALUFloatInsts()) |
| 2281 | break; |
| 2282 | [[fallthrough]]; |
| 2283 | case AMDGPU::G_ICMP: |
| 2284 | case AMDGPU::G_UADDO: |
| 2285 | case AMDGPU::G_USUBO: |
| 2286 | case AMDGPU::G_UADDE: |
| 2287 | case AMDGPU::G_SADDE: |
| 2288 | case AMDGPU::G_USUBE: |
| 2289 | case AMDGPU::G_SSUBE: { |
| 2290 | unsigned BoolDstOp = |
| 2291 | (Opc == AMDGPU::G_ICMP || Opc == AMDGPU::G_FCMP) ? 0 : 1; |
| 2292 | Register DstReg = MI.getOperand(i: BoolDstOp).getReg(); |
| 2293 | |
| 2294 | const RegisterBank *DstBank = |
| 2295 | OpdMapper.getInstrMapping().getOperandMapping(i: 0).BreakDown[0].RegBank; |
| 2296 | if (DstBank != &AMDGPU::SGPRRegBank) |
| 2297 | break; |
| 2298 | |
| 2299 | const bool HasCarryIn = MI.getNumOperands() == 5; |
| 2300 | |
| 2301 | // If this is a scalar compare, promote the result to s32, as the selection |
| 2302 | // will end up using a copy to a 32-bit vreg. |
| 2303 | const LLT S32 = LLT::scalar(SizeInBits: 32); |
| 2304 | Register NewDstReg = MRI.createGenericVirtualRegister(Ty: S32); |
| 2305 | MRI.setRegBank(Reg: NewDstReg, RegBank: AMDGPU::SGPRRegBank); |
| 2306 | MI.getOperand(i: BoolDstOp).setReg(NewDstReg); |
| 2307 | |
| 2308 | if (HasCarryIn) { |
| 2309 | Register NewSrcReg = MRI.createGenericVirtualRegister(Ty: S32); |
| 2310 | MRI.setRegBank(Reg: NewSrcReg, RegBank: AMDGPU::SGPRRegBank); |
| 2311 | B.buildZExt(Res: NewSrcReg, Op: MI.getOperand(i: 4).getReg()); |
| 2312 | MI.getOperand(i: 4).setReg(NewSrcReg); |
| 2313 | } |
| 2314 | |
| 2315 | MachineBasicBlock *MBB = MI.getParent(); |
| 2316 | B.setInsertPt(MBB&: *MBB, II: std::next(x: MI.getIterator())); |
| 2317 | |
| 2318 | // If we had a constrained VCC result register, a copy was inserted to VCC |
| 2319 | // from SGPR. |
| 2320 | SmallVector<Register, 1> DefRegs(OpdMapper.getVRegs(OpIdx: 0)); |
| 2321 | if (DefRegs.empty()) |
| 2322 | DefRegs.push_back(Elt: DstReg); |
| 2323 | B.buildTrunc(Res: DefRegs[0], Op: NewDstReg); |
| 2324 | return; |
| 2325 | } |
| 2326 | case AMDGPU::G_SELECT: { |
| 2327 | Register DstReg = MI.getOperand(i: 0).getReg(); |
| 2328 | LLT DstTy = MRI.getType(Reg: DstReg); |
| 2329 | |
| 2330 | SmallVector<Register, 1> CondRegs(OpdMapper.getVRegs(OpIdx: 1)); |
| 2331 | if (CondRegs.empty()) |
| 2332 | CondRegs.push_back(Elt: MI.getOperand(i: 1).getReg()); |
| 2333 | else { |
| 2334 | assert(CondRegs.size() == 1); |
| 2335 | } |
| 2336 | |
| 2337 | const RegisterBank *CondBank = getRegBank(Reg: CondRegs[0], MRI, TRI: *TRI); |
| 2338 | if (CondBank == &AMDGPU::SGPRRegBank) { |
| 2339 | const LLT S32 = LLT::scalar(SizeInBits: 32); |
| 2340 | Register NewCondReg = MRI.createGenericVirtualRegister(Ty: S32); |
| 2341 | MRI.setRegBank(Reg: NewCondReg, RegBank: AMDGPU::SGPRRegBank); |
| 2342 | |
| 2343 | MI.getOperand(i: 1).setReg(NewCondReg); |
| 2344 | B.buildZExt(Res: NewCondReg, Op: CondRegs[0]); |
| 2345 | } |
| 2346 | |
| 2347 | if (DstTy.getSizeInBits() != 64) |
| 2348 | break; |
| 2349 | |
| 2350 | LLT HalfTy = getHalfSizedType(Ty: DstTy); |
| 2351 | |
| 2352 | SmallVector<Register, 2> DefRegs(OpdMapper.getVRegs(OpIdx: 0)); |
| 2353 | SmallVector<Register, 2> Src1Regs(OpdMapper.getVRegs(OpIdx: 2)); |
| 2354 | SmallVector<Register, 2> Src2Regs(OpdMapper.getVRegs(OpIdx: 3)); |
| 2355 | |
| 2356 | // All inputs are SGPRs, nothing special to do. |
| 2357 | if (DefRegs.empty()) { |
| 2358 | assert(Src1Regs.empty() && Src2Regs.empty()); |
| 2359 | break; |
| 2360 | } |
| 2361 | |
| 2362 | if (Src1Regs.empty()) |
| 2363 | split64BitValueForMapping(B, Regs&: Src1Regs, HalfTy, Reg: MI.getOperand(i: 2).getReg()); |
| 2364 | else { |
| 2365 | setRegsToType(MRI, Regs: Src1Regs, NewTy: HalfTy); |
| 2366 | } |
| 2367 | |
| 2368 | if (Src2Regs.empty()) |
| 2369 | split64BitValueForMapping(B, Regs&: Src2Regs, HalfTy, Reg: MI.getOperand(i: 3).getReg()); |
| 2370 | else |
| 2371 | setRegsToType(MRI, Regs: Src2Regs, NewTy: HalfTy); |
| 2372 | |
| 2373 | setRegsToType(MRI, Regs: DefRegs, NewTy: HalfTy); |
| 2374 | |
| 2375 | auto Flags = MI.getFlags(); |
| 2376 | B.buildSelect(Res: DefRegs[0], Tst: CondRegs[0], Op0: Src1Regs[0], Op1: Src2Regs[0], Flags); |
| 2377 | B.buildSelect(Res: DefRegs[1], Tst: CondRegs[0], Op0: Src1Regs[1], Op1: Src2Regs[1], Flags); |
| 2378 | |
| 2379 | MRI.setRegBank(Reg: DstReg, RegBank: AMDGPU::VGPRRegBank); |
| 2380 | MI.eraseFromParent(); |
| 2381 | return; |
| 2382 | } |
| 2383 | case AMDGPU::G_BRCOND: { |
| 2384 | Register CondReg = MI.getOperand(i: 0).getReg(); |
| 2385 | // FIXME: Should use legalizer helper, but should change bool ext type. |
| 2386 | const RegisterBank *CondBank = |
| 2387 | OpdMapper.getInstrMapping().getOperandMapping(i: 0).BreakDown[0].RegBank; |
| 2388 | |
| 2389 | if (CondBank == &AMDGPU::SGPRRegBank) { |
| 2390 | const LLT S32 = LLT::scalar(SizeInBits: 32); |
| 2391 | Register NewCondReg = MRI.createGenericVirtualRegister(Ty: S32); |
| 2392 | MRI.setRegBank(Reg: NewCondReg, RegBank: AMDGPU::SGPRRegBank); |
| 2393 | |
| 2394 | MI.getOperand(i: 0).setReg(NewCondReg); |
| 2395 | B.buildZExt(Res: NewCondReg, Op: CondReg); |
| 2396 | return; |
| 2397 | } |
| 2398 | |
| 2399 | break; |
| 2400 | } |
| 2401 | case AMDGPU::G_AND: |
| 2402 | case AMDGPU::G_OR: |
| 2403 | case AMDGPU::G_XOR: { |
| 2404 | // 64-bit G_AND/G_OR/G_XOR are only available on the SALU, so split into 2 |
| 2405 | // 32-bit ops if there is a VGPR input. |
| 2406 | Register DstReg = MI.getOperand(i: 0).getReg(); |
| 2407 | LLT DstTy = MRI.getType(Reg: DstReg); |
| 2408 | |
| 2409 | const RegisterBank *DstBank = |
| 2410 | OpdMapper.getInstrMapping().getOperandMapping(i: 0).BreakDown[0].RegBank; |
| 2411 | |
| 2412 | if (DstTy.getSizeInBits() == 1) { |
| 2413 | if (DstBank == &AMDGPU::VCCRegBank) |
| 2414 | break; |
| 2415 | |
| 2416 | MachineFunction *MF = MI.getMF(); |
| 2417 | ApplyRegBankMapping ApplyBank(B, *this, MRI, DstBank); |
| 2418 | LegalizerHelper Helper(*MF, ApplyBank, B); |
| 2419 | |
| 2420 | if (Helper.widenScalar(MI, TypeIdx: 0, WideTy: LLT::scalar(SizeInBits: 32)) != |
| 2421 | LegalizerHelper::Legalized) |
| 2422 | llvm_unreachable("widen scalar should have succeeded" ); |
| 2423 | return; |
| 2424 | } |
| 2425 | |
| 2426 | if (DstTy.getSizeInBits() == 16 && DstBank == &AMDGPU::SGPRRegBank) { |
| 2427 | const LLT S32 = LLT::scalar(SizeInBits: 32); |
| 2428 | MachineBasicBlock *MBB = MI.getParent(); |
| 2429 | MachineFunction *MF = MBB->getParent(); |
| 2430 | ApplyRegBankMapping ApplySALU(B, *this, MRI, &AMDGPU::SGPRRegBank); |
| 2431 | LegalizerHelper Helper(*MF, ApplySALU, B); |
| 2432 | // Widen to S32, but handle `G_XOR x, -1` differently. Legalizer widening |
| 2433 | // will use a G_ANYEXT to extend the -1 which prevents matching G_XOR -1 |
| 2434 | // as "not". |
| 2435 | if (MI.getOpcode() == AMDGPU::G_XOR && |
| 2436 | mi_match(R: MI.getOperand(i: 2).getReg(), MRI, P: m_SpecificICstOrSplat(RequestedValue: -1))) { |
| 2437 | Helper.widenScalarSrc(MI, WideTy: S32, OpIdx: 1, ExtOpcode: AMDGPU::G_ANYEXT); |
| 2438 | Helper.widenScalarSrc(MI, WideTy: S32, OpIdx: 2, ExtOpcode: AMDGPU::G_SEXT); |
| 2439 | Helper.widenScalarDst(MI, WideTy: S32); |
| 2440 | } else { |
| 2441 | if (Helper.widenScalar(MI, TypeIdx: 0, WideTy: S32) != LegalizerHelper::Legalized) |
| 2442 | llvm_unreachable("widen scalar should have succeeded" ); |
| 2443 | } |
| 2444 | return; |
| 2445 | } |
| 2446 | |
| 2447 | if (DstTy.getSizeInBits() != 64) |
| 2448 | break; |
| 2449 | |
| 2450 | LLT HalfTy = getHalfSizedType(Ty: DstTy); |
| 2451 | SmallVector<Register, 2> DefRegs(OpdMapper.getVRegs(OpIdx: 0)); |
| 2452 | SmallVector<Register, 2> Src0Regs(OpdMapper.getVRegs(OpIdx: 1)); |
| 2453 | SmallVector<Register, 2> Src1Regs(OpdMapper.getVRegs(OpIdx: 2)); |
| 2454 | |
| 2455 | // All inputs are SGPRs, nothing special to do. |
| 2456 | if (DefRegs.empty()) { |
| 2457 | assert(Src0Regs.empty() && Src1Regs.empty()); |
| 2458 | break; |
| 2459 | } |
| 2460 | |
| 2461 | assert(DefRegs.size() == 2); |
| 2462 | assert(Src0Regs.size() == Src1Regs.size() && |
| 2463 | (Src0Regs.empty() || Src0Regs.size() == 2)); |
| 2464 | |
| 2465 | // Depending on where the source registers came from, the generic code may |
| 2466 | // have decided to split the inputs already or not. If not, we still need to |
| 2467 | // extract the values. |
| 2468 | |
| 2469 | if (Src0Regs.empty()) |
| 2470 | split64BitValueForMapping(B, Regs&: Src0Regs, HalfTy, Reg: MI.getOperand(i: 1).getReg()); |
| 2471 | else |
| 2472 | setRegsToType(MRI, Regs: Src0Regs, NewTy: HalfTy); |
| 2473 | |
| 2474 | if (Src1Regs.empty()) |
| 2475 | split64BitValueForMapping(B, Regs&: Src1Regs, HalfTy, Reg: MI.getOperand(i: 2).getReg()); |
| 2476 | else |
| 2477 | setRegsToType(MRI, Regs: Src1Regs, NewTy: HalfTy); |
| 2478 | |
| 2479 | setRegsToType(MRI, Regs: DefRegs, NewTy: HalfTy); |
| 2480 | |
| 2481 | auto Flags = MI.getFlags(); |
| 2482 | B.buildInstr(Opc, DstOps: {DefRegs[0]}, SrcOps: {Src0Regs[0], Src1Regs[0]}, Flags); |
| 2483 | B.buildInstr(Opc, DstOps: {DefRegs[1]}, SrcOps: {Src0Regs[1], Src1Regs[1]}, Flags); |
| 2484 | |
| 2485 | MRI.setRegBank(Reg: DstReg, RegBank: AMDGPU::VGPRRegBank); |
| 2486 | MI.eraseFromParent(); |
| 2487 | return; |
| 2488 | } |
| 2489 | case AMDGPU::G_ABS: { |
| 2490 | Register SrcReg = MI.getOperand(i: 1).getReg(); |
| 2491 | const RegisterBank *SrcBank = MRI.getRegBankOrNull(Reg: SrcReg); |
| 2492 | |
| 2493 | // There is no VALU abs instruction so we need to replace it with a sub and |
| 2494 | // max combination. |
| 2495 | if (SrcBank && SrcBank == &AMDGPU::VGPRRegBank) { |
| 2496 | MachineFunction *MF = MI.getMF(); |
| 2497 | ApplyRegBankMapping Apply(B, *this, MRI, &AMDGPU::VGPRRegBank); |
| 2498 | LegalizerHelper Helper(*MF, Apply, B); |
| 2499 | |
| 2500 | if (Helper.lowerAbsToMaxNeg(MI) != LegalizerHelper::Legalized) |
| 2501 | llvm_unreachable("lowerAbsToMaxNeg should have succeeded");
| 2502 | return; |
| 2503 | } |
| 2504 | [[fallthrough]]; |
| 2505 | } |
| 2506 | case AMDGPU::G_ADD: |
| 2507 | case AMDGPU::G_SUB: |
| 2508 | case AMDGPU::G_MUL: |
| 2509 | case AMDGPU::G_SHL: |
| 2510 | case AMDGPU::G_LSHR: |
| 2511 | case AMDGPU::G_ASHR: |
| 2512 | case AMDGPU::G_SMIN: |
| 2513 | case AMDGPU::G_SMAX: |
| 2514 | case AMDGPU::G_UMIN: |
| 2515 | case AMDGPU::G_UMAX: { |
| 2516 | Register DstReg = MI.getOperand(i: 0).getReg(); |
| 2517 | LLT DstTy = MRI.getType(Reg: DstReg); |
| 2518 | |
| 2519 | // Special case for s_mul_u64. There is no vector equivalent of
| 2520 | // s_mul_u64, so we have to break it down into 32-bit vector
| 2521 | // multiplications.
| 2522 | if (!Subtarget.hasVectorMulU64() && Opc == AMDGPU::G_MUL && |
| 2523 | DstTy.getSizeInBits() == 64) { |
| 2524 | applyMappingSMULU64(B, OpdMapper); |
| 2525 | return; |
| 2526 | } |
| 2527 | |
| 2528 | // 16-bit operations are VALU only, but can be promoted to 32-bit SALU. |
| 2529 | // Packed 16-bit operations need to be scalarized and promoted. |
| 2530 | if (DstTy != LLT::scalar(SizeInBits: 16) && DstTy != LLT::fixed_vector(NumElements: 2, ScalarSizeInBits: 16)) |
| 2531 | break; |
| 2532 | |
| 2533 | const RegisterBank *DstBank = |
| 2534 | OpdMapper.getInstrMapping().getOperandMapping(i: 0).BreakDown[0].RegBank; |
| 2535 | if (DstBank == &AMDGPU::VGPRRegBank) |
| 2536 | break; |
| 2537 | |
| 2538 | const LLT S32 = LLT::scalar(SizeInBits: 32); |
| 2539 | MachineBasicBlock *MBB = MI.getParent(); |
| 2540 | MachineFunction *MF = MBB->getParent(); |
| 2541 | ApplyRegBankMapping ApplySALU(B, *this, MRI, &AMDGPU::SGPRRegBank); |
| 2542 | |
| 2543 | if (DstTy.isVector() && Opc == AMDGPU::G_ABS) { |
| 2544 | Register WideSrcLo, WideSrcHi; |
| 2545 | |
| 2546 | std::tie(args&: WideSrcLo, args&: WideSrcHi) = |
| 2547 | unpackV2S16ToS32(B, Src: MI.getOperand(i: 1).getReg(), ExtOpcode: TargetOpcode::G_SEXT); |
| 2548 | auto Lo = B.buildInstr(Opc: AMDGPU::G_ABS, DstOps: {S32}, SrcOps: {WideSrcLo}); |
| 2549 | auto Hi = B.buildInstr(Opc: AMDGPU::G_ABS, DstOps: {S32}, SrcOps: {WideSrcHi}); |
| 2550 | B.buildBuildVectorTrunc(Res: DstReg, Ops: {Lo.getReg(Idx: 0), Hi.getReg(Idx: 0)}); |
| 2551 | MI.eraseFromParent(); |
| 2552 | return; |
| 2553 | } |
| 2554 | |
| 2555 | if (DstTy.isVector()) { |
| 2556 | Register WideSrc0Lo, WideSrc0Hi; |
| 2557 | Register WideSrc1Lo, WideSrc1Hi; |
| 2558 | |
| 2559 | unsigned ExtendOp = getExtendOp(Opc: MI.getOpcode()); |
| 2560 | std::tie(args&: WideSrc0Lo, args&: WideSrc0Hi) |
| 2561 | = unpackV2S16ToS32(B, Src: MI.getOperand(i: 1).getReg(), ExtOpcode: ExtendOp); |
| 2562 | std::tie(args&: WideSrc1Lo, args&: WideSrc1Hi) |
| 2563 | = unpackV2S16ToS32(B, Src: MI.getOperand(i: 2).getReg(), ExtOpcode: ExtendOp); |
| 2564 | auto Lo = B.buildInstr(Opc: MI.getOpcode(), DstOps: {S32}, SrcOps: {WideSrc0Lo, WideSrc1Lo}); |
| 2565 | auto Hi = B.buildInstr(Opc: MI.getOpcode(), DstOps: {S32}, SrcOps: {WideSrc0Hi, WideSrc1Hi}); |
| 2566 | B.buildBuildVectorTrunc(Res: DstReg, Ops: {Lo.getReg(Idx: 0), Hi.getReg(Idx: 0)}); |
| 2567 | MI.eraseFromParent(); |
| 2568 | } else { |
| 2569 | LegalizerHelper Helper(*MF, ApplySALU, B); |
| 2570 | |
| 2571 | if (Helper.widenScalar(MI, TypeIdx: 0, WideTy: S32) != LegalizerHelper::Legalized) |
| 2572 | llvm_unreachable("widen scalar should have succeeded");
| 2573 | |
| 2574 | // FIXME: s16 shift amounts should be legal. |
| 2575 | if (Opc == AMDGPU::G_SHL || Opc == AMDGPU::G_LSHR || |
| 2576 | Opc == AMDGPU::G_ASHR) { |
| 2577 | B.setInsertPt(MBB&: *MBB, II: MI.getIterator()); |
| 2578 | if (Helper.widenScalar(MI, TypeIdx: 1, WideTy: S32) != LegalizerHelper::Legalized) |
| 2579 | llvm_unreachable("widen scalar should have succeeded");
| 2580 | } |
| 2581 | } |
| 2582 | |
| 2583 | return; |
| 2584 | } |
| 2585 | case AMDGPU::G_AMDGPU_S_MUL_I64_I32: |
| 2586 | case AMDGPU::G_AMDGPU_S_MUL_U64_U32: { |
| 2587 | // This is a special case for s_mul_u64. We use |
| 2588 | // G_AMDGPU_S_MUL_I64_I32 opcode to represent an s_mul_u64 operation |
| 2589 | // where the 33 higher bits are sign-extended and |
| 2590 | // G_AMDGPU_S_MUL_U64_U32 opcode to represent an s_mul_u64 operation |
| 2591 | // where the 32 higher bits are zero-extended. In case scalar registers are |
| 2592 | // selected, both opcodes are lowered as s_mul_u64. If the vector registers |
| 2593 | // are selected, then G_AMDGPU_S_MUL_I64_I32 and |
| 2594 | // G_AMDGPU_S_MUL_U64_U32 are lowered with a vector mad instruction. |
| 2595 | |
| 2596 | // Insert basic copies. |
| 2597 | applyDefaultMapping(OpdMapper); |
| 2598 | |
| 2599 | Register DstReg = MI.getOperand(i: 0).getReg(); |
| 2600 | Register SrcReg0 = MI.getOperand(i: 1).getReg(); |
| 2601 | Register SrcReg1 = MI.getOperand(i: 2).getReg(); |
| 2602 | const LLT S32 = LLT::scalar(SizeInBits: 32); |
| 2603 | const LLT S64 = LLT::scalar(SizeInBits: 64); |
| 2604 | assert(MRI.getType(DstReg) == S64 && "This is a special case for s_mul_u64 " |
| 2605 | "that handles only 64-bit operands." ); |
| 2606 | const RegisterBank *DstBank = |
| 2607 | OpdMapper.getInstrMapping().getOperandMapping(i: 0).BreakDown[0].RegBank; |
| 2608 | |
| 2609 | // Replace G_AMDGPU_S_MUL_I64_I32 and G_AMDGPU_S_MUL_U64_U32 |
| 2610 | // with s_mul_u64 operation. |
| 2611 | if (DstBank == &AMDGPU::SGPRRegBank) { |
| 2612 | MI.setDesc(TII->get(Opcode: AMDGPU::S_MUL_U64)); |
| 2613 | MRI.setRegClass(Reg: DstReg, RC: &AMDGPU::SGPR_64RegClass); |
| 2614 | MRI.setRegClass(Reg: SrcReg0, RC: &AMDGPU::SGPR_64RegClass); |
| 2615 | MRI.setRegClass(Reg: SrcReg1, RC: &AMDGPU::SGPR_64RegClass); |
| 2616 | return; |
| 2617 | } |
| 2618 | |
| 2619 | // Replace G_AMDGPU_S_MUL_I64_I32 and G_AMDGPU_S_MUL_U64_U32 |
| 2620 | // with a vector mad. |
| 2621 | assert(MRI.getRegBankOrNull(DstReg) == &AMDGPU::VGPRRegBank && |
| 2622 | "The destination operand should be in vector registers." ); |
| 2623 | |
| 2624 | // Extract the lower subregister from the first operand. |
| 2625 | Register Op0L = MRI.createVirtualRegister(RegClass: &AMDGPU::VGPR_32RegClass); |
| 2626 | MRI.setRegClass(Reg: Op0L, RC: &AMDGPU::VGPR_32RegClass); |
| 2627 | MRI.setType(VReg: Op0L, Ty: S32); |
| 2628 | B.buildTrunc(Res: Op0L, Op: SrcReg0); |
| 2629 | |
| 2630 | // Extract the lower subregister from the second operand. |
| 2631 | Register Op1L = MRI.createVirtualRegister(RegClass: &AMDGPU::VGPR_32RegClass); |
| 2632 | MRI.setRegClass(Reg: Op1L, RC: &AMDGPU::VGPR_32RegClass); |
| 2633 | MRI.setType(VReg: Op1L, Ty: S32); |
| 2634 | B.buildTrunc(Res: Op1L, Op: SrcReg1); |
| 2635 | |
| 2636 | unsigned NewOpc = Opc == AMDGPU::G_AMDGPU_S_MUL_U64_U32 |
| 2637 | ? AMDGPU::G_AMDGPU_MAD_U64_U32 |
| 2638 | : AMDGPU::G_AMDGPU_MAD_I64_I32; |
| 2639 | |
| 2640 | MachineIRBuilder B(MI); |
| 2641 | Register Zero64 = B.buildConstant(Res: S64, Val: 0).getReg(Idx: 0); |
| 2642 | MRI.setRegClass(Reg: Zero64, RC: &AMDGPU::VReg_64RegClass); |
| 2643 | Register CarryOut = MRI.createVirtualRegister(RegClass: &AMDGPU::VReg_64RegClass); |
| 2644 | MRI.setRegClass(Reg: CarryOut, RC: &AMDGPU::VReg_64RegClass); |
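|      | // The mad computes the full 64-bit product of the two 32-bit low halves
|      | // plus a 64-bit addend (zero here); the carry-out def is unused.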
| 2645 | B.buildInstr(Opc: NewOpc, DstOps: {DstReg, CarryOut}, SrcOps: {Op0L, Op1L, Zero64}); |
| 2646 | MI.eraseFromParent(); |
| 2647 | return; |
| 2648 | } |
| 2649 | case AMDGPU::G_SEXT_INREG: { |
| 2650 | SmallVector<Register, 2> SrcRegs(OpdMapper.getVRegs(OpIdx: 1)); |
| 2651 | if (SrcRegs.empty()) |
| 2652 | break; // Nothing to repair |
| 2653 | |
| 2654 | const LLT S32 = LLT::scalar(SizeInBits: 32); |
| 2655 | ApplyRegBankMapping O(B, *this, MRI, &AMDGPU::VGPRRegBank); |
| 2656 | |
| 2657 | // Don't use LegalizerHelper's narrowScalar. It produces unwanted G_SEXTs |
| 2658 | // we would need to further expand, and doesn't let us directly set the |
| 2659 | // result registers. |
| 2660 | SmallVector<Register, 2> DstRegs(OpdMapper.getVRegs(OpIdx: 0)); |
| 2661 | |
| 2662 | int Amt = MI.getOperand(i: 2).getImm(); |
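|      | // Amt is relative to the full 64-bit value, so the split depends on
|      | // whether the sign bit lands in the low or the high half.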
| 2663 | if (Amt <= 32) { |
| 2664 | // Downstream users have expectations for the high bit behavior, so freeze |
| 2665 | // incoming undefined bits. |
| 2666 | if (Amt == 32) { |
| 2667 | // The low bits are unchanged. |
| 2668 | B.buildFreeze(Dst: DstRegs[0], Src: SrcRegs[0]); |
| 2669 | } else { |
| 2670 | auto Freeze = B.buildFreeze(Dst: S32, Src: SrcRegs[0]); |
| 2671 | // Extend in the low bits and propagate the sign bit to the high half. |
| 2672 | B.buildSExtInReg(Res: DstRegs[0], Op: Freeze, ImmOp: Amt); |
| 2673 | } |
| 2674 | |
| 2675 | B.buildAShr(Dst: DstRegs[1], Src0: DstRegs[0], Src1: B.buildConstant(Res: S32, Val: 31)); |
| 2676 | } else { |
| 2677 | // The low bits are unchanged; only the high half needs the sign extend.
| 2678 | // No freeze required.
| 2679 | B.buildCopy(Res: DstRegs[0], Op: SrcRegs[0]); |
| 2680 | B.buildSExtInReg(Res: DstRegs[1], Op: DstRegs[0], ImmOp: Amt - 32); |
| 2681 | } |
| 2682 | |
| 2683 | Register DstReg = MI.getOperand(i: 0).getReg(); |
| 2684 | MRI.setRegBank(Reg: DstReg, RegBank: AMDGPU::VGPRRegBank); |
| 2685 | MI.eraseFromParent(); |
| 2686 | return; |
| 2687 | } |
| 2688 | case AMDGPU::G_CTPOP: |
| 2689 | case AMDGPU::G_BITREVERSE: { |
| 2690 | const RegisterBank *DstBank = |
| 2691 | OpdMapper.getInstrMapping().getOperandMapping(i: 0).BreakDown[0].RegBank; |
| 2692 | if (DstBank == &AMDGPU::SGPRRegBank) |
| 2693 | break; |
| 2694 | |
| 2695 | Register SrcReg = MI.getOperand(i: 1).getReg(); |
| 2696 | const LLT S32 = LLT::scalar(SizeInBits: 32); |
| 2697 | LLT Ty = MRI.getType(Reg: SrcReg); |
| 2698 | if (Ty == S32) |
| 2699 | break; |
| 2700 | |
| 2701 | ApplyRegBankMapping ApplyVALU(B, *this, MRI, &AMDGPU::VGPRRegBank); |
| 2702 | |
| 2703 | MachineFunction &MF = B.getMF(); |
| 2704 | LegalizerHelper Helper(MF, ApplyVALU, B); |
| 2705 | |
| 2706 | if (Helper.narrowScalar(MI, TypeIdx: 1, NarrowTy: S32) != LegalizerHelper::Legalized) |
| 2707 | llvm_unreachable("narrowScalar should have succeeded");
| 2708 | return; |
| 2709 | } |
| 2710 | case AMDGPU::G_AMDGPU_FFBH_U32: |
| 2711 | case AMDGPU::G_AMDGPU_FFBL_B32: |
| 2712 | case AMDGPU::G_CTLZ_ZERO_UNDEF: |
| 2713 | case AMDGPU::G_CTTZ_ZERO_UNDEF: { |
| 2714 | const RegisterBank *DstBank = |
| 2715 | OpdMapper.getInstrMapping().getOperandMapping(i: 0).BreakDown[0].RegBank; |
| 2716 | if (DstBank == &AMDGPU::SGPRRegBank) |
| 2717 | break; |
| 2718 | |
| 2719 | Register SrcReg = MI.getOperand(i: 1).getReg(); |
| 2720 | const LLT S32 = LLT::scalar(SizeInBits: 32); |
| 2721 | LLT Ty = MRI.getType(Reg: SrcReg); |
| 2722 | if (Ty == S32) |
| 2723 | break; |
| 2724 | |
| 2725 | // We can narrow this more efficiently than Helper can by using ffbh/ffbl |
| 2726 | // which return -1 when the input is zero: |
| 2727 | // (ctlz_zero_undef hi:lo) -> (umin (ffbh hi), (add (ffbh lo), 32)) |
| 2728 | // (cttz_zero_undef hi:lo) -> (umin (add (ffbl hi), 32), (ffbl lo)) |
| 2729 | // (ffbh hi:lo) -> (umin (ffbh hi), (uaddsat (ffbh lo), 32)) |
| 2730 | // (ffbl hi:lo) -> (umin (uaddsat (ffbl hi), 32), (ffbl lo))
| 2731 | ApplyRegBankMapping ApplyVALU(B, *this, MRI, &AMDGPU::VGPRRegBank); |
| 2732 | SmallVector<Register, 2> SrcRegs(OpdMapper.getVRegs(OpIdx: 1)); |
| 2733 | unsigned NewOpc = Opc == AMDGPU::G_CTLZ_ZERO_UNDEF |
| 2734 | ? (unsigned)AMDGPU::G_AMDGPU_FFBH_U32 |
| 2735 | : Opc == AMDGPU::G_CTTZ_ZERO_UNDEF |
| 2736 | ? (unsigned)AMDGPU::G_AMDGPU_FFBL_B32 |
| 2737 | : Opc; |
| 2738 | unsigned Idx = NewOpc == AMDGPU::G_AMDGPU_FFBH_U32; |
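|      | // ffbh scans from the top, so it gets the high half directly and the
|      | // low-half result is biased by 32; ffbl scans from the bottom, so the
|      | // roles of the two halves are swapped.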
| 2739 | auto X = B.buildInstr(Opc: NewOpc, DstOps: {S32}, SrcOps: {SrcRegs[Idx]}); |
| 2740 | auto Y = B.buildInstr(Opc: NewOpc, DstOps: {S32}, SrcOps: {SrcRegs[Idx ^ 1]}); |
| 2741 | unsigned AddOpc = |
| 2742 | Opc == AMDGPU::G_CTLZ_ZERO_UNDEF || Opc == AMDGPU::G_CTTZ_ZERO_UNDEF |
| 2743 | ? AMDGPU::G_ADD |
| 2744 | : AMDGPU::G_UADDSAT; |
| 2745 | Y = B.buildInstr(Opc: AddOpc, DstOps: {S32}, SrcOps: {Y, B.buildConstant(Res: S32, Val: 32)}); |
| 2746 | Register DstReg = MI.getOperand(i: 0).getReg(); |
| 2747 | B.buildUMin(Dst: DstReg, Src0: X, Src1: Y); |
| 2748 | MI.eraseFromParent(); |
| 2749 | return; |
| 2750 | } |
| 2751 | case AMDGPU::G_SEXT: |
| 2752 | case AMDGPU::G_ZEXT: |
| 2753 | case AMDGPU::G_ANYEXT: { |
| 2754 | Register SrcReg = MI.getOperand(i: 1).getReg(); |
| 2755 | LLT SrcTy = MRI.getType(Reg: SrcReg); |
| 2756 | const bool Signed = Opc == AMDGPU::G_SEXT; |
| 2757 | |
| 2758 | assert(OpdMapper.getVRegs(1).empty()); |
| 2759 | |
| 2760 | const RegisterBank *SrcBank = |
| 2761 | OpdMapper.getInstrMapping().getOperandMapping(i: 1).BreakDown[0].RegBank; |
| 2762 | |
| 2763 | Register DstReg = MI.getOperand(i: 0).getReg(); |
| 2764 | LLT DstTy = MRI.getType(Reg: DstReg); |
| 2765 | if (DstTy.isScalar() && |
| 2766 | SrcBank != &AMDGPU::SGPRRegBank && |
| 2767 | SrcBank != &AMDGPU::VCCRegBank && |
| 2768 | // FIXME: Should handle any type that round to s64 when irregular |
| 2769 | // breakdowns supported. |
| 2770 | DstTy.getSizeInBits() == 64 && |
| 2771 | SrcTy.getSizeInBits() <= 32) { |
| 2772 | SmallVector<Register, 2> DefRegs(OpdMapper.getVRegs(OpIdx: 0)); |
| 2773 | |
| 2774 | // Extend to 32-bit, and then extend the low half. |
| 2775 | if (Signed) { |
| 2776 | // TODO: Should really be buildSExtOrCopy |
| 2777 | B.buildSExtOrTrunc(Res: DefRegs[0], Op: SrcReg); |
| 2778 | } else if (Opc == AMDGPU::G_ZEXT) { |
| 2779 | B.buildZExtOrTrunc(Res: DefRegs[0], Op: SrcReg); |
| 2780 | } else { |
| 2781 | B.buildAnyExtOrTrunc(Res: DefRegs[0], Op: SrcReg); |
| 2782 | } |
| 2783 | |
| 2784 | extendLow32IntoHigh32(B, Hi32Reg: DefRegs[1], Lo32Reg: DefRegs[0], ExtOpc: Opc, RegBank: *SrcBank); |
| 2785 | MRI.setRegBank(Reg: DstReg, RegBank: *SrcBank); |
| 2786 | MI.eraseFromParent(); |
| 2787 | return; |
| 2788 | } |
| 2789 | |
| 2790 | if (SrcTy != LLT::scalar(SizeInBits: 1)) |
| 2791 | return; |
| 2792 | |
| 2793 | // It is not legal to have a legalization artifact with a VCC source. Rather |
| 2794 | // than introducing a copy, insert the select we would have to select the |
| 2795 | // copy to. |
| 2796 | if (SrcBank == &AMDGPU::VCCRegBank) { |
| 2797 | SmallVector<Register, 2> DefRegs(OpdMapper.getVRegs(OpIdx: 0)); |
| 2798 | |
| 2799 | const RegisterBank *DstBank = &AMDGPU::VGPRRegBank; |
| 2800 | |
| 2801 | unsigned DstSize = DstTy.getSizeInBits(); |
| 2802 | // 64-bit select is SGPR only |
| 2803 | const bool UseSel64 = DstSize > 32 && |
| 2804 | SrcBank->getID() == AMDGPU::SGPRRegBankID; |
| 2805 | |
| 2806 | // TODO: Should s16 select be legal? |
| 2807 | LLT SelType = UseSel64 ? LLT::scalar(SizeInBits: 64) : LLT::scalar(SizeInBits: 32); |
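|      | // A signed extend of a true boolean yields all ones; a zero or any
|      | // extend yields 1.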
| 2808 | auto True = B.buildConstant(Res: SelType, Val: Signed ? -1 : 1); |
| 2809 | auto False = B.buildConstant(Res: SelType, Val: 0); |
| 2810 | |
| 2811 | MRI.setRegBank(Reg: True.getReg(Idx: 0), RegBank: *DstBank); |
| 2812 | MRI.setRegBank(Reg: False.getReg(Idx: 0), RegBank: *DstBank); |
| 2813 | MRI.setRegBank(Reg: DstReg, RegBank: *DstBank); |
| 2814 | |
| 2815 | if (DstSize > 32) { |
| 2816 | B.buildSelect(Res: DefRegs[0], Tst: SrcReg, Op0: True, Op1: False); |
| 2817 | extendLow32IntoHigh32(B, Hi32Reg: DefRegs[1], Lo32Reg: DefRegs[0], ExtOpc: Opc, RegBank: *SrcBank, IsBooleanSrc: true); |
| 2818 | } else if (DstSize < 32) { |
| 2819 | auto Sel = B.buildSelect(Res: SelType, Tst: SrcReg, Op0: True, Op1: False); |
| 2820 | MRI.setRegBank(Reg: Sel.getReg(Idx: 0), RegBank: *DstBank); |
| 2821 | B.buildTrunc(Res: DstReg, Op: Sel); |
| 2822 | } else { |
| 2823 | B.buildSelect(Res: DstReg, Tst: SrcReg, Op0: True, Op1: False); |
| 2824 | } |
| 2825 | |
| 2826 | MI.eraseFromParent(); |
| 2827 | return; |
| 2828 | } |
| 2829 | |
| 2830 | break; |
| 2831 | } |
| 2832 | case AMDGPU::G_EXTRACT_VECTOR_ELT: { |
| 2833 | SmallVector<Register, 2> DstRegs(OpdMapper.getVRegs(OpIdx: 0)); |
| 2834 | |
| 2835 | assert(OpdMapper.getVRegs(1).empty() && OpdMapper.getVRegs(2).empty()); |
| 2836 | |
| 2837 | Register DstReg = MI.getOperand(i: 0).getReg(); |
| 2838 | Register SrcReg = MI.getOperand(i: 1).getReg(); |
| 2839 | |
| 2840 | const LLT S32 = LLT::scalar(SizeInBits: 32); |
| 2841 | LLT DstTy = MRI.getType(Reg: DstReg); |
| 2842 | LLT SrcTy = MRI.getType(Reg: SrcReg); |
| 2843 | |
| 2844 | if (foldExtractEltToCmpSelect(B, MI, OpdMapper)) |
| 2845 | return; |
| 2846 | |
| 2847 | const ValueMapping &DstMapping |
| 2848 | = OpdMapper.getInstrMapping().getOperandMapping(i: 0); |
| 2849 | const RegisterBank *DstBank = DstMapping.BreakDown[0].RegBank; |
| 2850 | const RegisterBank *SrcBank = |
| 2851 | OpdMapper.getInstrMapping().getOperandMapping(i: 1).BreakDown[0].RegBank; |
| 2852 | const RegisterBank *IdxBank = |
| 2853 | OpdMapper.getInstrMapping().getOperandMapping(i: 2).BreakDown[0].RegBank; |
| 2854 | |
| 2855 | Register BaseIdxReg; |
| 2856 | unsigned ConstOffset; |
| 2857 | std::tie(args&: BaseIdxReg, args&: ConstOffset) = |
| 2858 | AMDGPU::getBaseWithConstantOffset(MRI, Reg: MI.getOperand(i: 2).getReg()); |
| 2859 | |
| 2860 | // See if the index is an add of a constant which will be foldable by moving |
| 2861 | // the base register of the index later if this is going to be executed in a |
| 2862 | // waterfall loop. This is essentially to reassociate the add of a constant |
| 2863 | // with the readfirstlane. |
| 2864 | bool ShouldMoveIndexIntoLoop = IdxBank != &AMDGPU::SGPRRegBank && |
| 2865 | ConstOffset > 0 && |
| 2866 | ConstOffset < SrcTy.getNumElements(); |
| 2867 | |
| 2868 | // Move the base register. We'll re-insert the add later. |
| 2869 | if (ShouldMoveIndexIntoLoop) |
| 2870 | MI.getOperand(i: 2).setReg(BaseIdxReg); |
| 2871 | |
| 2872 | // If this is a VGPR result only because the index was a VGPR result, the |
| 2873 | // actual indexing will be done on the SGPR source vector, which will |
| 2874 | // produce a scalar result. We need to copy to the VGPR result inside the |
| 2875 | // waterfall loop. |
| 2876 | const bool NeedCopyToVGPR = DstBank == &AMDGPU::VGPRRegBank && |
| 2877 | SrcBank == &AMDGPU::SGPRRegBank; |
| 2878 | if (DstRegs.empty()) { |
| 2879 | applyDefaultMapping(OpdMapper); |
| 2880 | |
| 2881 | executeInWaterfallLoop(B, MI, OpIndices: {2}); |
| 2882 | |
| 2883 | if (NeedCopyToVGPR) { |
| 2884 | // We don't want a phi for this temporary reg. |
| 2885 | Register TmpReg = MRI.createGenericVirtualRegister(Ty: DstTy); |
| 2886 | MRI.setRegBank(Reg: TmpReg, RegBank: AMDGPU::SGPRRegBank); |
| 2887 | MI.getOperand(i: 0).setReg(TmpReg); |
| 2888 | B.setInsertPt(MBB&: *MI.getParent(), II: ++MI.getIterator()); |
| 2889 | |
| 2890 | // Use a v_mov_b32 here to make the exec dependency explicit. |
| 2891 | buildVCopy(B, DstReg, SrcReg: TmpReg); |
| 2892 | } |
| 2893 | |
| 2894 | // Re-insert the constant offset add inside the waterfall loop. |
| 2895 | if (ShouldMoveIndexIntoLoop) |
| 2896 | reinsertVectorIndexAdd(B, IdxUseInstr&: MI, OpIdx: 2, ConstOffset); |
| 2897 | |
| 2898 | return; |
| 2899 | } |
| 2900 | |
| 2901 | assert(DstTy.getSizeInBits() == 64); |
| 2902 | |
| 2903 | LLT Vec32 = LLT::fixed_vector(NumElements: 2 * SrcTy.getNumElements(), ScalarSizeInBits: 32); |
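|      | // The mapping split the 64-bit result into two 32-bit halves, so view the
|      | // source as twice as many 32-bit elements and extract each half separately.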
| 2904 | |
| 2905 | auto CastSrc = B.buildBitcast(Dst: Vec32, Src: SrcReg); |
| 2906 | auto One = B.buildConstant(Res: S32, Val: 1); |
| 2907 | |
| 2908 | MachineBasicBlock::iterator MII = MI.getIterator(); |
| 2909 | |
| 2910 | // Split the vector index into 32-bit pieces. Prepare to move all of the |
| 2911 | // new instructions into a waterfall loop if necessary. |
| 2912 | // |
| 2913 | // Don't put the bitcast or constant in the loop. |
| 2914 | MachineInstrSpan Span(MII, &B.getMBB()); |
| 2915 | |
| 2916 | // Compute 32-bit element indices, (2 * OrigIdx, 2 * OrigIdx + 1). |
| 2917 | auto IdxLo = B.buildShl(Dst: S32, Src0: BaseIdxReg, Src1: One); |
| 2918 | auto IdxHi = B.buildAdd(Dst: S32, Src0: IdxLo, Src1: One); |
| 2919 | |
| 2920 | auto Extract0 = B.buildExtractVectorElement(Res: DstRegs[0], Val: CastSrc, Idx: IdxLo);
| 2921 | auto Extract1 = B.buildExtractVectorElement(Res: DstRegs[1], Val: CastSrc, Idx: IdxHi);
| 2922 | |
| 2923 | MRI.setRegBank(Reg: DstReg, RegBank: *DstBank); |
| 2924 | MRI.setRegBank(Reg: CastSrc.getReg(Idx: 0), RegBank: *SrcBank); |
| 2925 | MRI.setRegBank(Reg: One.getReg(Idx: 0), RegBank: AMDGPU::SGPRRegBank); |
| 2926 | MRI.setRegBank(Reg: IdxLo.getReg(Idx: 0), RegBank: AMDGPU::SGPRRegBank); |
| 2927 | MRI.setRegBank(Reg: IdxHi.getReg(Idx: 0), RegBank: AMDGPU::SGPRRegBank); |
| 2928 | |
| 2929 | SmallSet<Register, 4> OpsToWaterfall; |
| 2930 | if (!collectWaterfallOperands(SGPROperandRegs&: OpsToWaterfall, MI, MRI, OpIndices: { 2 })) { |
| 2931 | MI.eraseFromParent(); |
| 2932 | return; |
| 2933 | } |
| 2934 | |
| 2935 | // Remove the original instruction to avoid potentially confusing the |
| 2936 | // waterfall loop logic. |
| 2937 | B.setInstr(*Span.begin()); |
| 2938 | MI.eraseFromParent(); |
| 2939 | executeInWaterfallLoop(B, Range: make_range(x: Span.begin(), y: Span.end()), |
| 2940 | SGPROperandRegs&: OpsToWaterfall); |
| 2941 | |
| 2942 | if (NeedCopyToVGPR) { |
| 2943 | MachineBasicBlock *LoopBB = Extract1->getParent(); |
| 2944 | Register TmpReg0 = MRI.createGenericVirtualRegister(Ty: S32); |
| 2945 | Register TmpReg1 = MRI.createGenericVirtualRegister(Ty: S32); |
| 2946 | MRI.setRegBank(Reg: TmpReg0, RegBank: AMDGPU::SGPRRegBank); |
| 2947 | MRI.setRegBank(Reg: TmpReg1, RegBank: AMDGPU::SGPRRegBank); |
| 2948 | |
| 2949 | Extract0->getOperand(i: 0).setReg(TmpReg0); |
| 2950 | Extract1->getOperand(i: 0).setReg(TmpReg1); |
| 2951 | |
| 2952 | B.setInsertPt(MBB&: *LoopBB, II: ++Extract1->getIterator()); |
| 2953 | |
| 2954 | buildVCopy(B, DstReg: DstRegs[0], SrcReg: TmpReg0); |
| 2955 | buildVCopy(B, DstReg: DstRegs[1], SrcReg: TmpReg1); |
| 2956 | } |
| 2957 | |
| 2958 | if (ShouldMoveIndexIntoLoop) |
| 2959 | reinsertVectorIndexAdd(B, IdxUseInstr&: *IdxLo, OpIdx: 1, ConstOffset); |
| 2960 | |
| 2961 | return; |
| 2962 | } |
| 2963 | case AMDGPU::G_INSERT_VECTOR_ELT: { |
| 2964 | SmallVector<Register, 2> InsRegs(OpdMapper.getVRegs(OpIdx: 2)); |
| 2965 | |
| 2966 | Register DstReg = MI.getOperand(i: 0).getReg(); |
| 2967 | LLT VecTy = MRI.getType(Reg: DstReg); |
| 2968 | |
| 2969 | assert(OpdMapper.getVRegs(0).empty()); |
| 2970 | assert(OpdMapper.getVRegs(3).empty()); |
| 2971 | |
| 2972 | if (substituteSimpleCopyRegs(OpdMapper, OpIdx: 1)) |
| 2973 | MRI.setType(VReg: MI.getOperand(i: 1).getReg(), Ty: VecTy); |
| 2974 | |
| 2975 | if (foldInsertEltToCmpSelect(B, MI, OpdMapper)) |
| 2976 | return; |
| 2977 | |
| 2978 | const RegisterBank *IdxBank = |
| 2979 | OpdMapper.getInstrMapping().getOperandMapping(i: 3).BreakDown[0].RegBank; |
| 2980 | |
| 2981 | Register SrcReg = MI.getOperand(i: 1).getReg(); |
| 2982 | Register InsReg = MI.getOperand(i: 2).getReg(); |
| 2983 | LLT InsTy = MRI.getType(Reg: InsReg); |
| 2984 | (void)InsTy; |
| 2985 | |
| 2986 | Register BaseIdxReg; |
| 2987 | unsigned ConstOffset; |
| 2988 | std::tie(args&: BaseIdxReg, args&: ConstOffset) = |
| 2989 | AMDGPU::getBaseWithConstantOffset(MRI, Reg: MI.getOperand(i: 3).getReg()); |
| 2990 | |
| 2991 | // See if the index is an add of a constant which will be foldable by moving |
| 2992 | // the base register of the index later if this is going to be executed in a |
| 2993 | // waterfall loop. This is essentially to reassociate the add of a constant |
| 2994 | // with the readfirstlane. |
| 2995 | bool ShouldMoveIndexIntoLoop = IdxBank != &AMDGPU::SGPRRegBank && |
| 2996 | ConstOffset > 0 && |
| 2997 | ConstOffset < VecTy.getNumElements(); |
| 2998 | |
| 2999 | // Move the base register. We'll re-insert the add later. |
| 3000 | if (ShouldMoveIndexIntoLoop) |
| 3001 | MI.getOperand(i: 3).setReg(BaseIdxReg); |
| 3002 | |
| 3003 | |
| 3004 | if (InsRegs.empty()) { |
| 3005 | executeInWaterfallLoop(B, MI, OpIndices: {3}); |
| 3006 | |
| 3007 | // Re-insert the constant offset add inside the waterfall loop. |
| 3008 | if (ShouldMoveIndexIntoLoop) { |
| 3009 | reinsertVectorIndexAdd(B, IdxUseInstr&: MI, OpIdx: 3, ConstOffset); |
| 3010 | } |
| 3011 | |
| 3012 | return; |
| 3013 | } |
| 3014 | |
| 3015 | assert(InsTy.getSizeInBits() == 64); |
| 3016 | |
| 3017 | const LLT S32 = LLT::scalar(SizeInBits: 32); |
| 3018 | LLT Vec32 = LLT::fixed_vector(NumElements: 2 * VecTy.getNumElements(), ScalarSizeInBits: 32); |
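|      | // As in the extract case, view the vector as 32-bit elements and insert
|      | // the 64-bit value as two 32-bit halves.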
| 3019 | |
| 3020 | auto CastSrc = B.buildBitcast(Dst: Vec32, Src: SrcReg); |
| 3021 | auto One = B.buildConstant(Res: S32, Val: 1); |
| 3022 | |
| 3023 | // Split the vector index into 32-bit pieces. Prepare to move all of the |
| 3024 | // new instructions into a waterfall loop if necessary. |
| 3025 | // |
| 3026 | // Don't put the bitcast or constant in the loop. |
| 3027 | MachineInstrSpan Span(MachineBasicBlock::iterator(&MI), &B.getMBB()); |
| 3028 | |
| 3029 | // Compute 32-bit element indices, (2 * OrigIdx, 2 * OrigIdx + 1). |
| 3030 | auto IdxLo = B.buildShl(Dst: S32, Src0: BaseIdxReg, Src1: One); |
| 3031 | auto IdxHi = B.buildAdd(Dst: S32, Src0: IdxLo, Src1: One); |
| 3032 | |
| 3033 | auto InsLo = B.buildInsertVectorElement(Res: Vec32, Val: CastSrc, Elt: InsRegs[0], Idx: IdxLo); |
| 3034 | auto InsHi = B.buildInsertVectorElement(Res: Vec32, Val: InsLo, Elt: InsRegs[1], Idx: IdxHi); |
| 3035 | |
| 3036 | const RegisterBank *DstBank = |
| 3037 | OpdMapper.getInstrMapping().getOperandMapping(i: 0).BreakDown[0].RegBank; |
| 3038 | const RegisterBank *SrcBank = |
| 3039 | OpdMapper.getInstrMapping().getOperandMapping(i: 1).BreakDown[0].RegBank; |
| 3040 | const RegisterBank *InsSrcBank = |
| 3041 | OpdMapper.getInstrMapping().getOperandMapping(i: 2).BreakDown[0].RegBank; |
| 3042 | |
| 3043 | MRI.setRegBank(Reg: InsReg, RegBank: *InsSrcBank); |
| 3044 | MRI.setRegBank(Reg: CastSrc.getReg(Idx: 0), RegBank: *SrcBank); |
| 3045 | MRI.setRegBank(Reg: InsLo.getReg(Idx: 0), RegBank: *DstBank); |
| 3046 | MRI.setRegBank(Reg: InsHi.getReg(Idx: 0), RegBank: *DstBank); |
| 3047 | MRI.setRegBank(Reg: One.getReg(Idx: 0), RegBank: AMDGPU::SGPRRegBank); |
| 3048 | MRI.setRegBank(Reg: IdxLo.getReg(Idx: 0), RegBank: AMDGPU::SGPRRegBank); |
| 3049 | MRI.setRegBank(Reg: IdxHi.getReg(Idx: 0), RegBank: AMDGPU::SGPRRegBank); |
| 3050 | |
| 3051 | |
| 3052 | SmallSet<Register, 4> OpsToWaterfall; |
| 3053 | if (!collectWaterfallOperands(SGPROperandRegs&: OpsToWaterfall, MI, MRI, OpIndices: { 3 })) { |
| 3054 | B.setInsertPt(MBB&: B.getMBB(), II: MI); |
| 3055 | B.buildBitcast(Dst: DstReg, Src: InsHi); |
| 3056 | MI.eraseFromParent(); |
| 3057 | return; |
| 3058 | } |
| 3059 | |
| 3060 | B.setInstr(*Span.begin()); |
| 3061 | MI.eraseFromParent(); |
| 3062 | |
| 3063 | // Figure out the point after the waterfall loop before mangling the control |
| 3064 | // flow. |
| 3065 | executeInWaterfallLoop(B, Range: make_range(x: Span.begin(), y: Span.end()), |
| 3066 | SGPROperandRegs&: OpsToWaterfall); |
| 3067 | |
| 3068 | // The insertion point is now right after the original instruction. |
| 3069 | // |
| 3070 | // Keep the bitcast to the original vector type out of the loop. Doing this
| 3071 | // saves an extra phi we don't need inside the loop.
| 3072 | B.buildBitcast(Dst: DstReg, Src: InsHi); |
| 3073 | |
| 3074 | // Re-insert the constant offset add inside the waterfall loop. |
| 3075 | if (ShouldMoveIndexIntoLoop) |
| 3076 | reinsertVectorIndexAdd(B, IdxUseInstr&: *IdxLo, OpIdx: 1, ConstOffset); |
| 3077 | |
| 3078 | return; |
| 3079 | } |
| 3080 | case AMDGPU::G_AMDGPU_BUFFER_LOAD: |
| 3081 | case AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT: |
| 3082 | case AMDGPU::G_AMDGPU_BUFFER_LOAD_SSHORT: |
| 3083 | case AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE: |
| 3084 | case AMDGPU::G_AMDGPU_BUFFER_LOAD_SBYTE: |
| 3085 | case AMDGPU::G_AMDGPU_BUFFER_LOAD_TFE: |
| 3086 | case AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT_TFE: |
| 3087 | case AMDGPU::G_AMDGPU_BUFFER_LOAD_SSHORT_TFE: |
| 3088 | case AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE_TFE: |
| 3089 | case AMDGPU::G_AMDGPU_BUFFER_LOAD_SBYTE_TFE: |
| 3090 | case AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT: |
| 3091 | case AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_TFE: |
| 3092 | case AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_D16: |
| 3093 | case AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT: |
| 3094 | case AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT_D16: |
| 3095 | case AMDGPU::G_AMDGPU_BUFFER_STORE: |
| 3096 | case AMDGPU::G_AMDGPU_BUFFER_STORE_BYTE: |
| 3097 | case AMDGPU::G_AMDGPU_BUFFER_STORE_SHORT: |
| 3098 | case AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT: |
| 3099 | case AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT_D16: |
| 3100 | case AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT: |
| 3101 | case AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT_D16: { |
| 3102 | applyDefaultMapping(OpdMapper); |
| 3103 | executeInWaterfallLoop(B, MI, OpIndices: {1, 4}); |
| 3104 | return; |
| 3105 | } |
| 3106 | case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SWAP: |
| 3107 | case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_ADD: |
| 3108 | case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB: |
| 3109 | case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMIN: |
| 3110 | case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMIN: |
| 3111 | case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMAX: |
| 3112 | case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMAX: |
| 3113 | case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_AND: |
| 3114 | case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_OR: |
| 3115 | case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_XOR: |
| 3116 | case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC: |
| 3117 | case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC: |
| 3118 | case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB_CLAMP_U32: |
| 3119 | case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_COND_SUB_U32: |
| 3120 | case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD: |
| 3121 | case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMIN: |
| 3122 | case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMAX: { |
| 3123 | applyDefaultMapping(OpdMapper); |
| 3124 | executeInWaterfallLoop(B, MI, OpIndices: {2, 5}); |
| 3125 | return; |
| 3126 | } |
| 3127 | case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_CMPSWAP: { |
| 3128 | applyDefaultMapping(OpdMapper); |
| 3129 | executeInWaterfallLoop(B, MI, OpIndices: {3, 6}); |
| 3130 | return; |
| 3131 | } |
| 3132 | case AMDGPU::G_AMDGPU_S_BUFFER_LOAD: |
| 3133 | case AMDGPU::G_AMDGPU_S_BUFFER_LOAD_UBYTE: |
| 3134 | case AMDGPU::G_AMDGPU_S_BUFFER_LOAD_SBYTE: |
| 3135 | case AMDGPU::G_AMDGPU_S_BUFFER_LOAD_USHORT: |
| 3136 | case AMDGPU::G_AMDGPU_S_BUFFER_LOAD_SSHORT: { |
| 3137 | applyMappingSBufferLoad(B, OpdMapper); |
| 3138 | return; |
| 3139 | } |
| 3140 | case AMDGPU::G_AMDGPU_S_BUFFER_PREFETCH: |
| 3141 | constrainOpWithReadfirstlane(B, MI, OpIdx: 0); |
| 3142 | constrainOpWithReadfirstlane(B, MI, OpIdx: 2); |
| 3143 | return; |
| 3144 | case AMDGPU::G_INTRINSIC: |
| 3145 | case AMDGPU::G_INTRINSIC_CONVERGENT: { |
| 3146 | switch (cast<GIntrinsic>(Val&: MI).getIntrinsicID()) { |
| 3147 | case Intrinsic::amdgcn_readlane: { |
| 3148 | substituteSimpleCopyRegs(OpdMapper, OpIdx: 2); |
| 3149 | |
| 3150 | assert(OpdMapper.getVRegs(0).empty()); |
| 3151 | assert(OpdMapper.getVRegs(3).empty()); |
| 3152 | |
| 3153 | // Make sure the index is an SGPR. It doesn't make sense to run this in a |
| 3154 | // waterfall loop, so assume it's a uniform value. |
| 3155 | constrainOpWithReadfirstlane(B, MI, OpIdx: 3); // Index |
| 3156 | return; |
| 3157 | } |
| 3158 | case Intrinsic::amdgcn_writelane: { |
| 3159 | assert(OpdMapper.getVRegs(0).empty()); |
| 3160 | assert(OpdMapper.getVRegs(2).empty()); |
| 3161 | assert(OpdMapper.getVRegs(3).empty()); |
| 3162 | |
| 3163 | substituteSimpleCopyRegs(OpdMapper, OpIdx: 4); // VGPR input val |
| 3164 | constrainOpWithReadfirstlane(B, MI, OpIdx: 2); // Source value |
| 3165 | constrainOpWithReadfirstlane(B, MI, OpIdx: 3); // Index |
| 3166 | return; |
| 3167 | } |
| 3168 | case Intrinsic::amdgcn_interp_p1: |
| 3169 | case Intrinsic::amdgcn_interp_p2: |
| 3170 | case Intrinsic::amdgcn_interp_mov: |
| 3171 | case Intrinsic::amdgcn_interp_p1_f16: |
| 3172 | case Intrinsic::amdgcn_interp_p2_f16: |
| 3173 | case Intrinsic::amdgcn_lds_param_load: { |
| 3174 | applyDefaultMapping(OpdMapper); |
| 3175 | |
| 3176 | // Readfirstlane for the m0 value, which is always the last operand.
| 3177 | // FIXME: Should this be a waterfall loop instead? |
| 3178 | constrainOpWithReadfirstlane(B, MI, OpIdx: MI.getNumOperands() - 1); // Index |
| 3179 | return; |
| 3180 | } |
| 3181 | case Intrinsic::amdgcn_interp_inreg_p10: |
| 3182 | case Intrinsic::amdgcn_interp_inreg_p2: |
| 3183 | case Intrinsic::amdgcn_interp_inreg_p10_f16: |
| 3184 | case Intrinsic::amdgcn_interp_inreg_p2_f16: |
| 3185 | case Intrinsic::amdgcn_interp_p10_rtz_f16: |
| 3186 | case Intrinsic::amdgcn_interp_p2_rtz_f16: |
| 3187 | case Intrinsic::amdgcn_permlane16_swap: |
| 3188 | case Intrinsic::amdgcn_permlane32_swap: |
| 3189 | applyDefaultMapping(OpdMapper); |
| 3190 | return; |
| 3191 | case Intrinsic::amdgcn_permlane16: |
| 3192 | case Intrinsic::amdgcn_permlanex16: { |
| 3193 | // Doing a waterfall loop over these wouldn't make any sense. |
| 3194 | substituteSimpleCopyRegs(OpdMapper, OpIdx: 2); |
| 3195 | substituteSimpleCopyRegs(OpdMapper, OpIdx: 3); |
| 3196 | constrainOpWithReadfirstlane(B, MI, OpIdx: 4); |
| 3197 | constrainOpWithReadfirstlane(B, MI, OpIdx: 5); |
| 3198 | return; |
| 3199 | } |
| 3200 | case Intrinsic::amdgcn_permlane_bcast: |
| 3201 | case Intrinsic::amdgcn_permlane_up: |
| 3202 | case Intrinsic::amdgcn_permlane_down: |
| 3203 | case Intrinsic::amdgcn_permlane_xor: |
| 3204 | // Doing a waterfall loop over these wouldn't make any sense. |
| 3205 | constrainOpWithReadfirstlane(B, MI, OpIdx: 3); |
| 3206 | constrainOpWithReadfirstlane(B, MI, OpIdx: 4); |
| 3207 | return; |
| 3208 | case Intrinsic::amdgcn_permlane_idx_gen: { |
| 3209 | constrainOpWithReadfirstlane(B, MI, OpIdx: 3); |
| 3210 | return; |
| 3211 | } |
| 3212 | case Intrinsic::amdgcn_sbfe: |
| 3213 | applyMappingBFE(B, OpdMapper, Signed: true); |
| 3214 | return; |
| 3215 | case Intrinsic::amdgcn_ubfe: |
| 3216 | applyMappingBFE(B, OpdMapper, Signed: false); |
| 3217 | return; |
| 3218 | case Intrinsic::amdgcn_inverse_ballot: |
| 3219 | case Intrinsic::amdgcn_s_bitreplicate: |
| 3220 | case Intrinsic::amdgcn_s_quadmask: |
| 3221 | case Intrinsic::amdgcn_s_wqm: |
| 3222 | applyDefaultMapping(OpdMapper); |
| 3223 | constrainOpWithReadfirstlane(B, MI, OpIdx: 2); // Mask |
| 3224 | return; |
| 3225 | case Intrinsic::amdgcn_ballot: |
| 3226 | // Use default handling and insert copy to vcc source. |
| 3227 | break; |
| 3228 | } |
| 3229 | break; |
| 3230 | } |
| 3231 | case AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD: |
| 3232 | case AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD_D16: |
| 3233 | case AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD_NORET: |
| 3234 | case AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE: |
| 3235 | case AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE_D16: { |
| 3236 | const AMDGPU::RsrcIntrinsic *RSrcIntrin = |
| 3237 | AMDGPU::lookupRsrcIntrinsic(Intr: AMDGPU::getIntrinsicID(I: MI)); |
| 3238 | assert(RSrcIntrin && RSrcIntrin->IsImage); |
| 3239 | // Non-images can have complications from operands that allow both SGPR |
| 3240 | // and VGPR. For now it's too complicated to figure out the final opcode |
| 3241 | // to derive the register bank from the MCInstrDesc. |
| 3242 | applyMappingImage(B, MI, OpdMapper, RsrcIdx: RSrcIntrin->RsrcArg); |
| 3243 | return; |
| 3244 | } |
| 3245 | case AMDGPU::G_AMDGPU_BVH_INTERSECT_RAY: |
| 3246 | case AMDGPU::G_AMDGPU_BVH8_INTERSECT_RAY: |
| 3247 | case AMDGPU::G_AMDGPU_BVH_DUAL_INTERSECT_RAY: { |
| 3248 | bool IsDualOrBVH8 = |
| 3249 | MI.getOpcode() == AMDGPU::G_AMDGPU_BVH_DUAL_INTERSECT_RAY || |
| 3250 | MI.getOpcode() == AMDGPU::G_AMDGPU_BVH8_INTERSECT_RAY; |
| 3251 | unsigned NumMods = IsDualOrBVH8 ? 0 : 1; // Has A16 modifier |
| 3252 | unsigned LastRegOpIdx = MI.getNumExplicitOperands() - 1 - NumMods; |
| 3253 | applyDefaultMapping(OpdMapper); |
| 3254 | executeInWaterfallLoop(B, MI, OpIndices: {LastRegOpIdx}); |
| 3255 | return; |
| 3256 | } |
| 3257 | case AMDGPU::G_INTRINSIC_W_SIDE_EFFECTS: |
| 3258 | case AMDGPU::G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS: { |
| 3259 | auto IntrID = cast<GIntrinsic>(Val&: MI).getIntrinsicID(); |
| 3260 | switch (IntrID) { |
| 3261 | case Intrinsic::amdgcn_ds_ordered_add: |
| 3262 | case Intrinsic::amdgcn_ds_ordered_swap: { |
| 3263 | // This is only allowed to execute with 1 lane, so readfirstlane is safe. |
| 3264 | assert(OpdMapper.getVRegs(0).empty()); |
| 3265 | substituteSimpleCopyRegs(OpdMapper, OpIdx: 3); |
| 3266 | constrainOpWithReadfirstlane(B, MI, OpIdx: 2); // M0 |
| 3267 | return; |
| 3268 | } |
| 3269 | case Intrinsic::amdgcn_ds_gws_init: |
| 3270 | case Intrinsic::amdgcn_ds_gws_barrier: |
| 3271 | case Intrinsic::amdgcn_ds_gws_sema_br: { |
| 3272 | // Only the first lane executes, so readfirstlane is safe.
| 3273 | substituteSimpleCopyRegs(OpdMapper, OpIdx: 1); |
| 3274 | constrainOpWithReadfirstlane(B, MI, OpIdx: 2); // M0 |
| 3275 | return; |
| 3276 | } |
| 3277 | case Intrinsic::amdgcn_ds_gws_sema_v: |
| 3278 | case Intrinsic::amdgcn_ds_gws_sema_p: |
| 3279 | case Intrinsic::amdgcn_ds_gws_sema_release_all: { |
| 3280 | // Only the first lane executes, so readfirstlane is safe.
| 3281 | constrainOpWithReadfirstlane(B, MI, OpIdx: 1); // M0 |
| 3282 | return; |
| 3283 | } |
| 3284 | case Intrinsic::amdgcn_ds_append: |
| 3285 | case Intrinsic::amdgcn_ds_consume: { |
| 3286 | constrainOpWithReadfirstlane(B, MI, OpIdx: 2); // M0 |
| 3287 | return; |
| 3288 | } |
| 3289 | case Intrinsic::amdgcn_s_sendmsg: |
| 3290 | case Intrinsic::amdgcn_s_sendmsghalt: { |
| 3291 | // FIXME: Should this use a waterfall loop? |
| 3292 | constrainOpWithReadfirstlane(B, MI, OpIdx: 2); // M0 |
| 3293 | return; |
| 3294 | } |
| 3295 | case Intrinsic::amdgcn_s_setreg: { |
| 3296 | constrainOpWithReadfirstlane(B, MI, OpIdx: 2); |
| 3297 | return; |
| 3298 | } |
| 3299 | case Intrinsic::amdgcn_s_ttracedata: |
| 3300 | constrainOpWithReadfirstlane(B, MI, OpIdx: 1); // M0 |
| 3301 | return; |
| 3302 | case Intrinsic::amdgcn_raw_buffer_load_lds: |
| 3303 | case Intrinsic::amdgcn_raw_ptr_buffer_load_lds: { |
| 3304 | applyDefaultMapping(OpdMapper); |
| 3305 | constrainOpWithReadfirstlane(B, MI, OpIdx: 1); // rsrc |
| 3306 | constrainOpWithReadfirstlane(B, MI, OpIdx: 2); // M0 |
| 3307 | constrainOpWithReadfirstlane(B, MI, OpIdx: 5); // soffset |
| 3308 | return; |
| 3309 | } |
| 3310 | case Intrinsic::amdgcn_struct_buffer_load_lds: |
| 3311 | case Intrinsic::amdgcn_struct_ptr_buffer_load_lds: { |
| 3312 | applyDefaultMapping(OpdMapper); |
| 3313 | constrainOpWithReadfirstlane(B, MI, OpIdx: 1); // rsrc |
| 3314 | constrainOpWithReadfirstlane(B, MI, OpIdx: 2); // M0 |
| 3315 | constrainOpWithReadfirstlane(B, MI, OpIdx: 6); // soffset |
| 3316 | return; |
| 3317 | } |
| 3318 | case Intrinsic::amdgcn_cluster_load_async_to_lds_b8: |
| 3319 | case Intrinsic::amdgcn_cluster_load_async_to_lds_b32: |
| 3320 | case Intrinsic::amdgcn_cluster_load_async_to_lds_b64: |
| 3321 | case Intrinsic::amdgcn_cluster_load_async_to_lds_b128: { |
| 3322 | applyDefaultMapping(OpdMapper); |
| 3323 | constrainOpWithReadfirstlane(B, MI, OpIdx: 5); |
| 3324 | return; |
| 3325 | } |
| 3326 | case Intrinsic::amdgcn_load_to_lds: |
| 3327 | case Intrinsic::amdgcn_global_load_lds: { |
| 3328 | applyDefaultMapping(OpdMapper); |
| 3329 | constrainOpWithReadfirstlane(B, MI, OpIdx: 2); |
| 3330 | return; |
| 3331 | } |
| 3332 | case Intrinsic::amdgcn_lds_direct_load: { |
| 3333 | applyDefaultMapping(OpdMapper); |
| 3334 | // Readfirstlane for the m0 value, which is always the last operand.
| 3335 | constrainOpWithReadfirstlane(B, MI, OpIdx: MI.getNumOperands() - 1); // Index |
| 3336 | return; |
| 3337 | } |
| 3338 | case Intrinsic::amdgcn_exp_row: |
| 3339 | applyDefaultMapping(OpdMapper); |
| 3340 | constrainOpWithReadfirstlane(B, MI, OpIdx: 8); // M0 |
| 3341 | return; |
| 3342 | case Intrinsic::amdgcn_cluster_load_b32: |
| 3343 | case Intrinsic::amdgcn_cluster_load_b64: |
| 3344 | case Intrinsic::amdgcn_cluster_load_b128: { |
| 3345 | applyDefaultMapping(OpdMapper); |
| 3346 | constrainOpWithReadfirstlane(B, MI, OpIdx: 4); // M0 |
| 3347 | return; |
| 3348 | } |
| 3349 | case Intrinsic::amdgcn_s_sleep_var: |
| 3350 | assert(OpdMapper.getVRegs(1).empty()); |
| 3351 | constrainOpWithReadfirstlane(B, MI, OpIdx: 1); |
| 3352 | return; |
| 3353 | case Intrinsic::amdgcn_s_barrier_join: |
| 3354 | case Intrinsic::amdgcn_s_wakeup_barrier: |
| 3355 | constrainOpWithReadfirstlane(B, MI, OpIdx: 1); |
| 3356 | return; |
| 3357 | case Intrinsic::amdgcn_s_barrier_init: |
| 3358 | case Intrinsic::amdgcn_s_barrier_signal_var: |
| 3359 | constrainOpWithReadfirstlane(B, MI, OpIdx: 1); |
| 3360 | constrainOpWithReadfirstlane(B, MI, OpIdx: 2); |
| 3361 | return; |
| 3362 | case Intrinsic::amdgcn_s_get_barrier_state: |
| 3363 | case Intrinsic::amdgcn_s_get_named_barrier_state: { |
| 3364 | constrainOpWithReadfirstlane(B, MI, OpIdx: 2); |
| 3365 | return; |
| 3366 | } |
| 3367 | case Intrinsic::amdgcn_s_prefetch_data: { |
| 3368 | Register PtrReg = MI.getOperand(i: 1).getReg(); |
| 3369 | unsigned AS = MRI.getType(Reg: PtrReg).getAddressSpace(); |
| 3370 | if (AMDGPU::isFlatGlobalAddrSpace(AS)) { |
| 3371 | constrainOpWithReadfirstlane(B, MI, OpIdx: 1); |
| 3372 | constrainOpWithReadfirstlane(B, MI, OpIdx: 2); |
| 3373 | } else |
| 3374 | MI.eraseFromParent(); |
| 3375 | return; |
| 3376 | } |
| 3377 | case Intrinsic::amdgcn_tensor_load_to_lds: |
| 3378 | case Intrinsic::amdgcn_tensor_store_from_lds: { |
| 3379 | constrainOpWithReadfirstlane(B, MI, OpIdx: 1); |
| 3380 | constrainOpWithReadfirstlane(B, MI, OpIdx: 2); |
| 3381 | constrainOpWithReadfirstlane(B, MI, OpIdx: 3); |
| 3382 | constrainOpWithReadfirstlane(B, MI, OpIdx: 4); |
| 3383 | return; |
| 3384 | } |
| 3385 | case Intrinsic::amdgcn_tensor_load_to_lds_d2: |
| 3386 | case Intrinsic::amdgcn_tensor_store_from_lds_d2: { |
| 3387 | constrainOpWithReadfirstlane(B, MI, OpIdx: 1); |
| 3388 | constrainOpWithReadfirstlane(B, MI, OpIdx: 2); |
| 3389 | return; |
| 3390 | } |
| 3391 | default: { |
| 3392 | if (const AMDGPU::RsrcIntrinsic *RSrcIntrin = |
| 3393 | AMDGPU::lookupRsrcIntrinsic(Intr: IntrID)) { |
| 3394 | // Non-images can have complications from operands that allow both SGPR |
| 3395 | // and VGPR. For now it's too complicated to figure out the final opcode |
| 3396 | // to derive the register bank from the MCInstrDesc. |
| 3397 | if (RSrcIntrin->IsImage) { |
| 3398 | applyMappingImage(B, MI, OpdMapper, RsrcIdx: RSrcIntrin->RsrcArg); |
| 3399 | return; |
| 3400 | } |
| 3401 | } |
| 3402 | |
| 3403 | break; |
| 3404 | } |
| 3405 | } |
| 3406 | break; |
| 3407 | } |
| 3408 | case AMDGPU::G_SI_CALL: { |
| 3409 | // Use a set to avoid extra readfirstlanes in the case where multiple |
| 3410 | // operands are the same register. |
| 3411 | SmallSet<Register, 4> SGPROperandRegs; |
| 3412 | |
| 3413 | if (!collectWaterfallOperands(SGPROperandRegs, MI, MRI, OpIndices: {1})) |
| 3414 | break; |
| 3415 | |
| 3416 | // Move all copies to physical SGPRs that are used by the call instruction
| 3417 | // into the loop block. Search backwards from the call for these copies,
| 3418 | // stopping at the ADJCALLSTACKUP.
| 3419 | unsigned FrameSetupOpcode = AMDGPU::ADJCALLSTACKUP; |
| 3420 | unsigned FrameDestroyOpcode = AMDGPU::ADJCALLSTACKDOWN; |
| 3421 | |
| 3422 | // Move all non-copies before the copies, so that a complete range can be |
| 3423 | // moved into the waterfall loop. |
| 3424 | SmallVector<MachineInstr *, 4> NonCopyInstrs; |
| 3425 | // Count of NonCopyInstrs found until the current LastCopy. |
| 3426 | unsigned NonCopyInstrsLen = 0; |
| 3427 | MachineBasicBlock::iterator Start(&MI); |
| 3428 | MachineBasicBlock::iterator LastCopy = Start; |
| 3429 | MachineBasicBlock *MBB = MI.getParent(); |
| 3430 | const SIMachineFunctionInfo *Info = |
| 3431 | MBB->getParent()->getInfo<SIMachineFunctionInfo>(); |
| 3432 | while (Start->getOpcode() != FrameSetupOpcode) { |
| 3433 | --Start; |
| 3434 | bool IsCopy = false; |
| 3435 | if (Start->getOpcode() == AMDGPU::COPY) { |
| 3436 | auto &Dst = Start->getOperand(i: 0); |
| 3437 | if (Dst.isReg()) { |
| 3438 | Register Reg = Dst.getReg(); |
| 3439 | if (Reg.isPhysical() && MI.readsRegister(Reg, TRI)) { |
| 3440 | IsCopy = true; |
| 3441 | } else { |
| 3442 | // Also move the copy from the scratch rsrc descriptor into the loop |
| 3443 | // to allow it to be optimized away. |
| 3444 | auto &Src = Start->getOperand(i: 1); |
| 3445 | if (Src.isReg()) { |
| 3446 | Reg = Src.getReg(); |
| 3447 | IsCopy = Info->getScratchRSrcReg() == Reg; |
| 3448 | } |
| 3449 | } |
| 3450 | } |
| 3451 | } |
| 3452 | |
| 3453 | if (IsCopy) { |
| 3454 | LastCopy = Start; |
| 3455 | NonCopyInstrsLen = NonCopyInstrs.size(); |
| 3456 | } else { |
| 3457 | NonCopyInstrs.push_back(Elt: &*Start); |
| 3458 | } |
| 3459 | } |
| 3460 | NonCopyInstrs.resize(N: NonCopyInstrsLen); |
| 3461 | |
| 3462 | for (auto *NonCopy : reverse(C&: NonCopyInstrs)) { |
| 3463 | MBB->splice(Where: LastCopy, Other: MBB, From: NonCopy->getIterator()); |
| 3464 | } |
| 3465 | Start = LastCopy; |
| 3466 | |
| 3467 | // Do the same for copies after the loop |
| 3468 | NonCopyInstrs.clear(); |
| 3469 | NonCopyInstrsLen = 0; |
| 3470 | MachineBasicBlock::iterator End(&MI); |
| 3471 | LastCopy = End; |
| 3472 | while (End->getOpcode() != FrameDestroyOpcode) { |
| 3473 | ++End; |
| 3474 | bool IsCopy = false; |
| 3475 | if (End->getOpcode() == AMDGPU::COPY) { |
| 3476 | auto &Src = End->getOperand(i: 1); |
| 3477 | if (Src.isReg()) { |
| 3478 | Register Reg = Src.getReg(); |
| 3479 | IsCopy = Reg.isPhysical() && MI.modifiesRegister(Reg, TRI); |
| 3480 | } |
| 3481 | } |
| 3482 | |
| 3483 | if (IsCopy) { |
| 3484 | LastCopy = End; |
| 3485 | NonCopyInstrsLen = NonCopyInstrs.size(); |
| 3486 | } else { |
| 3487 | NonCopyInstrs.push_back(Elt: &*End); |
| 3488 | } |
| 3489 | } |
| 3490 | NonCopyInstrs.resize(N: NonCopyInstrsLen); |
| 3491 | |
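|      | // Splice the collected non-copies to just after the last trailing copy so
|      | // the waterfall range runs contiguously from the first leading copy
|      | // through the last trailing copy.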
| 3492 | End = LastCopy; |
| 3493 | ++LastCopy; |
| 3494 | for (auto *NonCopy : reverse(C&: NonCopyInstrs)) { |
| 3495 | MBB->splice(Where: LastCopy, Other: MBB, From: NonCopy->getIterator()); |
| 3496 | } |
| 3497 | |
| 3498 | ++End; |
| 3499 | B.setInsertPt(MBB&: B.getMBB(), II: Start); |
| 3500 | executeInWaterfallLoop(B, Range: make_range(x: Start, y: End), SGPROperandRegs); |
| 3501 | break; |
| 3502 | } |
| 3503 | case AMDGPU::G_LOAD: |
| 3504 | case AMDGPU::G_ZEXTLOAD: |
| 3505 | case AMDGPU::G_SEXTLOAD: { |
| 3506 | if (applyMappingLoad(B, OpdMapper, MI)) |
| 3507 | return; |
| 3508 | break; |
| 3509 | } |
| 3510 | case AMDGPU::G_DYN_STACKALLOC: |
| 3511 | applyMappingDynStackAlloc(B, OpdMapper, MI); |
| 3512 | return; |
| 3513 | case AMDGPU::G_STACKRESTORE: { |
| 3514 | applyDefaultMapping(OpdMapper); |
| 3515 | constrainOpWithReadfirstlane(B, MI, OpIdx: 0); |
| 3516 | return; |
| 3517 | } |
| 3518 | case AMDGPU::G_SBFX: |
| 3519 | applyMappingBFE(B, OpdMapper, /*Signed*/ true); |
| 3520 | return; |
| 3521 | case AMDGPU::G_UBFX: |
| 3522 | applyMappingBFE(B, OpdMapper, /*Signed*/ false); |
| 3523 | return; |
| 3524 | case AMDGPU::G_AMDGPU_MAD_U64_U32: |
| 3525 | case AMDGPU::G_AMDGPU_MAD_I64_I32: |
| 3526 | applyMappingMAD_64_32(B, OpdMapper); |
| 3527 | return; |
| 3528 | case AMDGPU::G_PREFETCH: { |
| 3529 | if (!Subtarget.hasSafeSmemPrefetch() && !Subtarget.hasVmemPrefInsts()) { |
| 3530 | MI.eraseFromParent(); |
| 3531 | return; |
| 3532 | } |
| 3533 | Register PtrReg = MI.getOperand(i: 0).getReg(); |
| 3534 | unsigned PtrBank = getRegBankID(Reg: PtrReg, MRI, Default: AMDGPU::SGPRRegBankID); |
| 3535 | if (PtrBank == AMDGPU::VGPRRegBankID && |
| 3536 | (!Subtarget.hasVmemPrefInsts() || !MI.getOperand(i: 3).getImm())) { |
| 3537 | // Cannot do I$ prefetch with divergent pointer. |
| 3538 | MI.eraseFromParent(); |
| 3539 | return; |
| 3540 | } |
| 3541 | unsigned AS = MRI.getType(Reg: PtrReg).getAddressSpace(); |
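|      | // Drop prefetches the target cannot express: unsupported address spaces,
|      | // and prefetches that would have to use SMEM (I$ prefetch or a 32-bit
|      | // constant address) when safe SMEM prefetch is unavailable.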
| 3542 | if ((!AMDGPU::isFlatGlobalAddrSpace(AS) && |
| 3543 | AS != AMDGPUAS::CONSTANT_ADDRESS_32BIT) || |
| 3544 | (!Subtarget.hasSafeSmemPrefetch() && |
| 3545 | (AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT || |
| 3546 | !MI.getOperand(i: 3).getImm() /* I$ prefetch */))) { |
| 3547 | MI.eraseFromParent(); |
| 3548 | return; |
| 3549 | } |
| 3550 | applyDefaultMapping(OpdMapper); |
| 3551 | return; |
| 3552 | } |
| 3553 | default: |
| 3554 | break; |
| 3555 | } |
| 3556 | |
| 3557 | return applyDefaultMapping(OpdMapper); |
| 3558 | } |
| 3559 | |
| 3560 | // vgpr, sgpr -> vgpr |
| 3561 | // vgpr, agpr -> vgpr |
| 3562 | // agpr, agpr -> agpr |
| 3563 | // agpr, sgpr -> vgpr |
| 3564 | static unsigned regBankUnion(unsigned RB0, unsigned RB1) { |
| 3565 | if (RB0 == AMDGPU::InvalidRegBankID) |
| 3566 | return RB1; |
| 3567 | if (RB1 == AMDGPU::InvalidRegBankID) |
| 3568 | return RB0; |
| 3569 | |
| 3570 | if (RB0 == AMDGPU::SGPRRegBankID && RB1 == AMDGPU::SGPRRegBankID) |
| 3571 | return AMDGPU::SGPRRegBankID; |
| 3572 | |
| 3573 | if (RB0 == AMDGPU::AGPRRegBankID && RB1 == AMDGPU::AGPRRegBankID) |
| 3574 | return AMDGPU::AGPRRegBankID; |
| 3575 | |
| 3576 | return AMDGPU::VGPRRegBankID; |
| 3577 | } |
| 3578 | |
| 3579 | static unsigned regBankBoolUnion(unsigned RB0, unsigned RB1) { |
| 3580 | if (RB0 == AMDGPU::InvalidRegBankID) |
| 3581 | return RB1; |
| 3582 | if (RB1 == AMDGPU::InvalidRegBankID) |
| 3583 | return RB0; |
| 3584 | |
| 3585 | // vcc, vcc -> vcc |
| 3586 | // vcc, sgpr -> vcc |
| 3587 | // vcc, vgpr -> vcc |
| 3588 | if (RB0 == AMDGPU::VCCRegBankID || RB1 == AMDGPU::VCCRegBankID) |
| 3589 | return AMDGPU::VCCRegBankID; |
| 3590 | |
| 3591 | // Otherwise fall back to the non-boolean bank union (sgpr/vgpr/agpr).
| 3592 | return regBankUnion(RB0, RB1); |
| 3593 | } |
| 3594 | |
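|      | // Union the banks of every register operand of MI, giving up with the
|      | // VGPR bank as soon as any operand requires it.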
| 3595 | unsigned AMDGPURegisterBankInfo::getMappingType(const MachineRegisterInfo &MRI, |
| 3596 | const MachineInstr &MI) const { |
| 3597 | unsigned RegBank = AMDGPU::InvalidRegBankID; |
| 3598 | |
| 3599 | for (const MachineOperand &MO : MI.operands()) { |
| 3600 | if (!MO.isReg()) |
| 3601 | continue; |
| 3602 | Register Reg = MO.getReg(); |
| 3603 | if (const RegisterBank *Bank = getRegBank(Reg, MRI, TRI: *TRI)) { |
| 3604 | RegBank = regBankUnion(RB0: RegBank, RB1: Bank->getID()); |
| 3605 | if (RegBank == AMDGPU::VGPRRegBankID) |
| 3606 | break; |
| 3607 | } |
| 3608 | } |
| 3609 | |
| 3610 | return RegBank; |
| 3611 | } |
| 3612 | |
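|      | // True unless some register operand has already been assigned to a bank
|      | // other than SGPR.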
| 3613 | bool AMDGPURegisterBankInfo::isSALUMapping(const MachineInstr &MI) const { |
| 3614 | const MachineFunction &MF = *MI.getMF(); |
| 3615 | const MachineRegisterInfo &MRI = MF.getRegInfo(); |
| 3616 | for (const MachineOperand &MO : MI.operands()) { |
| 3617 | if (!MO.isReg()) |
| 3618 | continue; |
| 3619 | Register Reg = MO.getReg(); |
| 3620 | if (const RegisterBank *Bank = getRegBank(Reg, MRI, TRI: *TRI)) { |
| 3621 | if (Bank->getID() != AMDGPU::SGPRRegBankID) |
| 3622 | return false; |
| 3623 | } |
| 3624 | } |
| 3625 | return true; |
| 3626 | } |
| 3627 | |
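|      | // Map every register operand to the SGPR bank at its natural size.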
| 3628 | const RegisterBankInfo::InstructionMapping & |
| 3629 | AMDGPURegisterBankInfo::getDefaultMappingSOP(const MachineInstr &MI) const { |
| 3630 | const MachineFunction &MF = *MI.getMF(); |
| 3631 | const MachineRegisterInfo &MRI = MF.getRegInfo(); |
| 3632 | SmallVector<const ValueMapping*, 8> OpdsMapping(MI.getNumOperands()); |
| 3633 | |
| 3634 | for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) { |
| 3635 | const MachineOperand &SrcOp = MI.getOperand(i); |
| 3636 | if (!SrcOp.isReg()) |
| 3637 | continue; |
| 3638 | |
| 3639 | unsigned Size = getSizeInBits(Reg: SrcOp.getReg(), MRI, TRI: *TRI); |
| 3640 | OpdsMapping[i] = AMDGPU::getValueMapping(BankID: AMDGPU::SGPRRegBankID, Size); |
| 3641 | } |
| 3642 | return getInstructionMapping(ID: 1, Cost: 1, OperandsMapping: getOperandsMapping(OpdsMapping), |
| 3643 | NumOperands: MI.getNumOperands()); |
| 3644 | } |
| 3645 | |
| 3646 | const RegisterBankInfo::InstructionMapping & |
| 3647 | AMDGPURegisterBankInfo::getDefaultMappingVOP(const MachineInstr &MI) const { |
| 3648 | const MachineFunction &MF = *MI.getMF(); |
| 3649 | const MachineRegisterInfo &MRI = MF.getRegInfo(); |
| 3650 | SmallVector<const ValueMapping*, 8> OpdsMapping(MI.getNumOperands()); |
| 3651 | |
| 3652 | // Even though we technically could use SGPRs, this would require knowledge of |
| 3653 | // the constant bus restriction. Force all sources to VGPR (except for VCC). |
| 3654 | // |
| 3655 | // TODO: Unary ops are trivially OK, so accept SGPRs? |
| 3656 | for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) { |
| 3657 | const MachineOperand &Src = MI.getOperand(i); |
| 3658 | if (!Src.isReg()) |
| 3659 | continue; |
| 3660 | |
| 3661 | unsigned Size = getSizeInBits(Reg: Src.getReg(), MRI, TRI: *TRI); |
| 3662 | unsigned BankID = Size == 1 ? AMDGPU::VCCRegBankID : AMDGPU::VGPRRegBankID; |
| 3663 | OpdsMapping[i] = AMDGPU::getValueMapping(BankID, Size); |
| 3664 | } |
| 3665 | |
| 3666 | return getInstructionMapping(ID: 1, Cost: 1, OperandsMapping: getOperandsMapping(OpdsMapping), |
| 3667 | NumOperands: MI.getNumOperands()); |
| 3668 | } |
| 3669 | |
| 3670 | const RegisterBankInfo::InstructionMapping & |
| 3671 | AMDGPURegisterBankInfo::getDefaultMappingAllVGPR(const MachineInstr &MI) const { |
| 3672 | const MachineFunction &MF = *MI.getMF(); |
| 3673 | const MachineRegisterInfo &MRI = MF.getRegInfo(); |
| 3674 | SmallVector<const ValueMapping*, 8> OpdsMapping(MI.getNumOperands()); |
| 3675 | |
| 3676 | for (unsigned I = 0, E = MI.getNumOperands(); I != E; ++I) { |
| 3677 | const MachineOperand &Op = MI.getOperand(i: I); |
| 3678 | if (!Op.isReg()) |
| 3679 | continue; |
| 3680 | |
| 3681 | unsigned Size = getSizeInBits(Reg: Op.getReg(), MRI, TRI: *TRI); |
| 3682 | OpdsMapping[I] = AMDGPU::getValueMapping(BankID: AMDGPU::VGPRRegBankID, Size); |
| 3683 | } |
| 3684 | |
| 3685 | return getInstructionMapping(ID: 1, Cost: 1, OperandsMapping: getOperandsMapping(OpdsMapping), |
| 3686 | NumOperands: MI.getNumOperands()); |
| 3687 | } |
| 3688 | |
| 3689 | const RegisterBankInfo::InstructionMapping & |
| 3690 | AMDGPURegisterBankInfo::getImageMapping(const MachineRegisterInfo &MRI, |
| 3691 | const MachineInstr &MI, |
| 3692 | int RsrcIdx) const { |
| 3693 | // The reported argument index is relative to the IR intrinsic call arguments, |
| 3694 | // so we need to shift by the number of defs and the intrinsic ID. |
| 3695 | RsrcIdx += MI.getNumExplicitDefs() + 1; |
| 3696 | |
| 3697 | const int NumOps = MI.getNumOperands(); |
| 3698 | SmallVector<const ValueMapping *, 8> OpdsMapping(NumOps); |
| 3699 | |
| 3700 | // TODO: Should packed/unpacked D16 difference be reported here as part of |
| 3701 | // the value mapping? |
| 3702 | for (int I = 0; I != NumOps; ++I) { |
| 3703 | if (!MI.getOperand(i: I).isReg()) |
| 3704 | continue; |
| 3705 | |
| 3706 | Register OpReg = MI.getOperand(i: I).getReg(); |
| 3707 | // We replace some dead address operands with $noreg |
| 3708 | if (!OpReg) |
| 3709 | continue; |
| 3710 | |
| 3711 | unsigned Size = getSizeInBits(Reg: OpReg, MRI, TRI: *TRI); |
| 3712 | |
| 3713 | // FIXME: Probably need a new intrinsic register bank searchable table to |
| 3714 | // handle arbitrary intrinsics easily. |
| 3715 | // |
| 3716 | // If this has a sampler, it immediately follows rsrc. |
| 3717 | const bool MustBeSGPR = I == RsrcIdx || I == RsrcIdx + 1; |
| 3718 | |
| 3719 | if (MustBeSGPR) { |
| 3720 | // If this must be an SGPR, we must report whatever bank it currently has as legal. |
| 3721 | unsigned NewBank = getRegBankID(Reg: OpReg, MRI, Default: AMDGPU::SGPRRegBankID); |
| 3722 | OpdsMapping[I] = AMDGPU::getValueMapping(BankID: NewBank, Size); |
| 3723 | } else { |
| 3724 | // Some operands must be VGPR, and these are easy to copy to. |
| 3725 | OpdsMapping[I] = AMDGPU::getValueMapping(BankID: AMDGPU::VGPRRegBankID, Size); |
| 3726 | } |
| 3727 | } |
| 3728 | |
| 3729 | return getInstructionMapping(ID: 1, Cost: 1, OperandsMapping: getOperandsMapping(OpdsMapping), NumOperands: NumOps); |
| 3730 | } |
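|      | // Rough sketch of the intended behavior (commentary, not code): for an image |
|      | // sample intrinsic the rsrc (and, if present, sampler) operand is reported |
|      | // with whatever bank it already has, so a divergent descriptor is still |
|      | // "legal" here; applyMappingImpl is then expected to wrap the instruction in |
|      | // a waterfall loop to materialize uniform descriptors per iteration. |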
| 3731 | |
| 3732 | /// Return the mapping for a pointer argument. |
| 3733 | const RegisterBankInfo::ValueMapping * |
| 3734 | AMDGPURegisterBankInfo::getValueMappingForPtr(const MachineRegisterInfo &MRI, |
| 3735 | Register PtrReg) const { |
| 3736 | LLT PtrTy = MRI.getType(Reg: PtrReg); |
| 3737 | unsigned Size = PtrTy.getSizeInBits(); |
| 3738 | if (Subtarget.useFlatForGlobal() || |
| 3739 | !AMDGPU::isFlatGlobalAddrSpace(AS: PtrTy.getAddressSpace())) |
| 3740 | return AMDGPU::getValueMapping(BankID: AMDGPU::VGPRRegBankID, Size); |
| 3741 | |
| 3742 | // If we're using MUBUF instructions for global memory, an SGPR base register |
| 3743 | // is possible. Otherwise this needs to be a VGPR. |
| 3744 | const RegisterBank *PtrBank = getRegBank(Reg: PtrReg, MRI, TRI: *TRI); |
| 3745 | return AMDGPU::getValueMapping(BankID: PtrBank->getID(), Size); |
| 3746 | } |
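|      | // For example (illustrative): a uniform global (p1) pointer keeps its sgpr |
|      | // mapping here when MUBUF addressing is usable, while private/LDS pointers, |
|      | // or any pointer when useFlatForGlobal() is set, are mapped to vgpr. |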
| 3747 | |
| 3748 | const RegisterBankInfo::InstructionMapping & |
| 3749 | AMDGPURegisterBankInfo::getInstrMappingForLoad(const MachineInstr &MI) const { |
| 3750 | |
| 3751 | const MachineFunction &MF = *MI.getMF(); |
| 3752 | const MachineRegisterInfo &MRI = MF.getRegInfo(); |
| 3753 | SmallVector<const ValueMapping*, 2> OpdsMapping(2); |
| 3754 | unsigned Size = getSizeInBits(Reg: MI.getOperand(i: 0).getReg(), MRI, TRI: *TRI); |
| 3755 | Register PtrReg = MI.getOperand(i: 1).getReg(); |
| 3756 | LLT PtrTy = MRI.getType(Reg: PtrReg); |
| 3757 | unsigned AS = PtrTy.getAddressSpace(); |
| 3758 | unsigned PtrSize = PtrTy.getSizeInBits(); |
| 3759 | |
| 3760 | const ValueMapping *ValMapping; |
| 3761 | const ValueMapping *PtrMapping; |
| 3762 | |
| 3763 | const RegisterBank *PtrBank = getRegBank(Reg: PtrReg, MRI, TRI: *TRI); |
| 3764 | |
| 3765 | if (PtrBank == &AMDGPU::SGPRRegBank && AMDGPU::isFlatGlobalAddrSpace(AS)) { |
| 3766 | if (isScalarLoadLegal(MI)) { |
| 3767 | // We have a uniform instruction so we want to use an SMRD load |
| 3768 | ValMapping = AMDGPU::getValueMapping(BankID: AMDGPU::SGPRRegBankID, Size); |
| 3769 | PtrMapping = AMDGPU::getValueMapping(BankID: AMDGPU::SGPRRegBankID, Size: PtrSize); |
| 3770 | } else { |
| 3771 | ValMapping = AMDGPU::getValueMapping(BankID: AMDGPU::VGPRRegBankID, Size); |
| 3772 | |
| 3773 | // If we're using MUBUF instructions for global memory, an SGPR base |
| 3774 | // register is possible. Otherwise this needs to be a VGPR. |
| 3775 | unsigned PtrBankID = Subtarget.useFlatForGlobal() ? |
| 3776 | AMDGPU::VGPRRegBankID : AMDGPU::SGPRRegBankID; |
| 3777 | |
| 3778 | PtrMapping = AMDGPU::getValueMapping(BankID: PtrBankID, Size: PtrSize); |
| 3779 | } |
| 3780 | } else { |
| 3781 | ValMapping = AMDGPU::getValueMapping(BankID: AMDGPU::VGPRRegBankID, Size); |
| 3782 | PtrMapping = AMDGPU::getValueMapping(BankID: AMDGPU::VGPRRegBankID, Size: PtrSize); |
| 3783 | } |
| 3784 | |
| 3785 | OpdsMapping[0] = ValMapping; |
| 3786 | OpdsMapping[1] = PtrMapping; |
| 3787 | const RegisterBankInfo::InstructionMapping &Mapping = getInstructionMapping( |
| 3788 | ID: 1, Cost: 1, OperandsMapping: getOperandsMapping(OpdsMapping), NumOperands: MI.getNumOperands()); |
| 3789 | return Mapping; |
| 3790 | |
| 3791 | // FIXME: Do we want to add a mapping for FLAT load, or should we just |
| 3792 | // handle that during instruction selection? |
| 3793 | } |
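|      | // Illustrative outcomes (assumed MIR): a uniform global load that passes |
|      | // isScalarLoadLegal() maps fully to SGPRs, roughly |
|      | //   %val:sgpr(s32) = G_LOAD %ptr:sgpr(p1) :: (load (s32), addrspace 1) |
|      | // while a divergent pointer (or a load that fails the scalar-load checks) |
|      | // forces the result to vgpr, with the pointer bank chosen as above. |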
| 3794 | |
| 3795 | unsigned |
| 3796 | AMDGPURegisterBankInfo::getRegBankID(Register Reg, |
| 3797 | const MachineRegisterInfo &MRI, |
| 3798 | unsigned Default) const { |
| 3799 | const RegisterBank *Bank = getRegBank(Reg, MRI, TRI: *TRI); |
| 3800 | return Bank ? Bank->getID() : Default; |
| 3801 | } |
| 3802 | |
| 3803 | const RegisterBankInfo::ValueMapping * |
| 3804 | AMDGPURegisterBankInfo::getSGPROpMapping(Register Reg, |
| 3805 | const MachineRegisterInfo &MRI, |
| 3806 | const TargetRegisterInfo &TRI) const { |
| 3807 | // Lie and claim anything is legal, even though this needs to be an SGPR; |
| 3808 | // applyMapping will have to deal with it as a waterfall loop. |
| 3809 | unsigned Bank = getRegBankID(Reg, MRI, Default: AMDGPU::SGPRRegBankID); |
| 3810 | unsigned Size = getSizeInBits(Reg, MRI, TRI); |
| 3811 | return AMDGPU::getValueMapping(BankID: Bank, Size); |
| 3812 | } |
| 3813 | |
| 3814 | const RegisterBankInfo::ValueMapping * |
| 3815 | AMDGPURegisterBankInfo::getVGPROpMapping(Register Reg, |
| 3816 | const MachineRegisterInfo &MRI, |
| 3817 | const TargetRegisterInfo &TRI) const { |
| 3818 | unsigned Size = getSizeInBits(Reg, MRI, TRI); |
| 3819 | return AMDGPU::getValueMapping(BankID: AMDGPU::VGPRRegBankID, Size); |
| 3820 | } |
| 3821 | |
| 3822 | const RegisterBankInfo::ValueMapping * |
| 3823 | AMDGPURegisterBankInfo::getAGPROpMapping(Register Reg, |
| 3824 | const MachineRegisterInfo &MRI, |
| 3825 | const TargetRegisterInfo &TRI) const { |
| 3826 | unsigned Size = getSizeInBits(Reg, MRI, TRI); |
| 3827 | return AMDGPU::getValueMapping(BankID: AMDGPU::AGPRRegBankID, Size); |
| 3828 | } |
| 3829 | |
| 3830 | /// |
| 3831 | /// This function must return a legal mapping, because |
| 3832 | /// AMDGPURegisterBankInfo::getInstrAlternativeMappings() is not called |
| 3833 | /// in RegBankSelect::Mode::Fast. Any mapping that would cause a |
| 3834 | /// VGPR to SGPR copy to be generated is illegal. |
| 3835 | /// |
| 3836 | // Operands that must be SGPRs must accept potentially divergent VGPRs as |
| 3837 | // legal. These will be dealt with in applyMappingImpl. |
| 3838 | // |
| 3839 | const RegisterBankInfo::InstructionMapping & |
| 3840 | AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { |
| 3841 | const MachineFunction &MF = *MI.getMF(); |
| 3842 | const MachineRegisterInfo &MRI = MF.getRegInfo(); |
| 3843 | |
| 3844 | if (MI.isCopy() || MI.getOpcode() == AMDGPU::G_FREEZE) { |
| 3845 | Register DstReg = MI.getOperand(i: 0).getReg(); |
| 3846 | Register SrcReg = MI.getOperand(i: 1).getReg(); |
| 3847 | |
| 3848 | // The default logic bothers to analyze impossible alternative mappings. We |
| 3849 | // want the most straightforward mapping, so just directly handle this. |
| 3850 | const RegisterBank *DstBank = getRegBank(Reg: DstReg, MRI, TRI: *TRI); |
| 3851 | const RegisterBank *SrcBank = getRegBank(Reg: SrcReg, MRI, TRI: *TRI); |
| 3852 | |
| 3853 | // For a COPY between a physical reg and an s1 virtual reg, the physical side |
| 3854 | // has no associated type, so we take the virtual register's type as a hint on |
| 3855 | // how to interpret s1 values. |
| 3856 | unsigned Size; |
| 3857 | if (!SrcReg.isVirtual() && !DstBank && |
| 3858 | MRI.getType(Reg: DstReg) == LLT::scalar(SizeInBits: 1)) { |
| 3859 | DstBank = &AMDGPU::VCCRegBank; |
| 3860 | Size = 1; |
| 3861 | } else if (!DstReg.isVirtual() && MRI.getType(Reg: SrcReg) == LLT::scalar(SizeInBits: 1)) { |
| 3862 | DstBank = &AMDGPU::VCCRegBank; |
| 3863 | Size = 1; |
| 3864 | } else { |
| 3865 | Size = getSizeInBits(Reg: DstReg, MRI, TRI: *TRI); |
| 3866 | } |
| 3867 | |
| 3868 | if (!DstBank) |
| 3869 | DstBank = SrcBank; |
| 3870 | else if (!SrcBank) |
| 3871 | SrcBank = DstBank; |
| 3872 | |
| 3873 | if (MI.getOpcode() != AMDGPU::G_FREEZE && |
| 3874 | cannotCopy(Dst: *DstBank, Src: *SrcBank, Size: TypeSize::getFixed(ExactSize: Size))) |
| 3875 | return getInvalidInstructionMapping(); |
| 3876 | |
| 3877 | const ValueMapping &ValMap = getValueMapping(StartIdx: 0, Length: Size, RegBank: *DstBank); |
| 3878 | unsigned OpdsMappingSize = MI.isCopy() ? 1 : 2; |
| 3879 | SmallVector<const ValueMapping *, 1> OpdsMapping(OpdsMappingSize); |
| 3880 | OpdsMapping[0] = &ValMap; |
| 3881 | if (MI.getOpcode() == AMDGPU::G_FREEZE) |
| 3882 | OpdsMapping[1] = &ValMap; |
| 3883 | |
| 3884 | return getInstructionMapping( |
| 3885 | ID: 1, /*Cost*/ 1, |
| 3886 | /*OperandsMapping*/ getOperandsMapping(OpdsMapping), NumOperands: OpdsMappingSize); |
| 3887 | } |
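|      | // Hypothetical example: a lane-mask value copied from a physical register, |
|      | //   %mask:_(s1) = COPY $sgpr0_sgpr1 |
|      | // has no type on the physical side, so the s1 on the virtual side is taken |
|      | // to mean a vcc-bank value here (wave64 shown; a single SGPR plays the same |
|      | // role in wave32). |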
| 3888 | |
| 3889 | if (MI.isRegSequence()) { |
| 3890 | // If any input is a VGPR, the result must be a VGPR. The default handling |
| 3891 | // assumes any copy between banks is legal. |
| 3892 | unsigned BankID = AMDGPU::SGPRRegBankID; |
| 3893 | |
| 3894 | for (unsigned I = 1, E = MI.getNumOperands(); I != E; I += 2) { |
| 3895 | auto OpBank = getRegBankID(Reg: MI.getOperand(i: I).getReg(), MRI); |
| 3896 | // It doesn't make sense to use vcc or scc banks here, so just ignore |
| 3897 | // them. |
| 3898 | if (OpBank != AMDGPU::SGPRRegBankID) { |
| 3899 | BankID = AMDGPU::VGPRRegBankID; |
| 3900 | break; |
| 3901 | } |
| 3902 | } |
| 3903 | unsigned Size = getSizeInBits(Reg: MI.getOperand(i: 0).getReg(), MRI, TRI: *TRI); |
| 3904 | |
| 3905 | const ValueMapping &ValMap = getValueMapping(StartIdx: 0, Length: Size, RegBank: getRegBank(ID: BankID)); |
| 3906 | return getInstructionMapping( |
| 3907 | ID: 1, /*Cost*/ 1, |
| 3908 | /*OperandsMapping*/ getOperandsMapping(OpdsMapping: {&ValMap}), NumOperands: 1); |
| 3909 | } |
| 3910 | |
| 3911 | // The default handling is broken and doesn't handle illegal VGPR->SGPR copies |
| 3912 | // properly. |
| 3913 | // |
| 3914 | // TODO: There are additional exec masking dependencies to analyze. |
| 3915 | if (auto *PHI = dyn_cast<GPhi>(Val: &MI)) { |
| 3916 | unsigned ResultBank = AMDGPU::InvalidRegBankID; |
| 3917 | Register DstReg = PHI->getReg(Idx: 0); |
| 3918 | |
| 3919 | // Sometimes the result may have already been assigned a bank. |
| 3920 | if (const RegisterBank *DstBank = getRegBank(Reg: DstReg, MRI, TRI: *TRI)) |
| 3921 | ResultBank = DstBank->getID(); |
| 3922 | |
| 3923 | for (unsigned I = 0; I < PHI->getNumIncomingValues(); ++I) { |
| 3924 | Register Reg = PHI->getIncomingValue(I); |
| 3925 | const RegisterBank *Bank = getRegBank(Reg, MRI, TRI: *TRI); |
| 3926 | |
| 3927 | // FIXME: Assuming VGPR for any undetermined inputs. |
| 3928 | if (!Bank || Bank->getID() == AMDGPU::VGPRRegBankID) { |
| 3929 | ResultBank = AMDGPU::VGPRRegBankID; |
| 3930 | break; |
| 3931 | } |
| 3932 | |
| 3933 | // FIXME: Need to promote SGPR case to s32 |
| 3934 | unsigned OpBank = Bank->getID(); |
| 3935 | ResultBank = regBankBoolUnion(RB0: ResultBank, RB1: OpBank); |
| 3936 | } |
| 3937 | |
| 3938 | assert(ResultBank != AMDGPU::InvalidRegBankID); |
| 3939 | |
| 3940 | unsigned Size = MRI.getType(Reg: DstReg).getSizeInBits(); |
| 3941 | |
| 3942 | const ValueMapping &ValMap = |
| 3943 | getValueMapping(StartIdx: 0, Length: Size, RegBank: getRegBank(ID: ResultBank)); |
| 3944 | return getInstructionMapping( |
| 3945 | ID: 1, /*Cost*/ 1, |
| 3946 | /*OperandsMapping*/ getOperandsMapping(OpdsMapping: {&ValMap}), NumOperands: 1); |
| 3947 | } |
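|      | // For example (illustrative only): a phi merging one sgpr and one vgpr input |
|      | // is given a vgpr result, since an sgpr result would require an illegal |
|      | // VGPR->SGPR copy on the vgpr edge; for s1 lane-mask phis the boolean bank |
|      | // union above picks vcc once any incoming value is vcc. |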
| 3948 | |
| 3949 | const RegisterBankInfo::InstructionMapping &Mapping = getInstrMappingImpl(MI); |
| 3950 | if (Mapping.isValid()) |
| 3951 | return Mapping; |
| 3952 | |
| 3953 | SmallVector<const ValueMapping*, 8> OpdsMapping(MI.getNumOperands()); |
| 3954 | |
| 3955 | switch (MI.getOpcode()) { |
| 3956 | default: |
| 3957 | return getInvalidInstructionMapping(); |
| 3958 | |
| 3959 | case AMDGPU::G_AND: |
| 3960 | case AMDGPU::G_OR: |
| 3961 | case AMDGPU::G_XOR: |
| 3962 | case AMDGPU::G_MUL: { |
| 3963 | unsigned Size = MRI.getType(Reg: MI.getOperand(i: 0).getReg()).getSizeInBits(); |
| 3964 | if (Size == 1) { |
| 3965 | const RegisterBank *DstBank |
| 3966 | = getRegBank(Reg: MI.getOperand(i: 0).getReg(), MRI, TRI: *TRI); |
| 3967 | |
| 3968 | unsigned TargetBankID = AMDGPU::InvalidRegBankID; |
| 3969 | unsigned BankLHS = AMDGPU::InvalidRegBankID; |
| 3970 | unsigned BankRHS = AMDGPU::InvalidRegBankID; |
| 3971 | if (DstBank) { |
| 3972 | TargetBankID = DstBank->getID(); |
| 3973 | if (DstBank == &AMDGPU::VCCRegBank) { |
| 3974 | TargetBankID = AMDGPU::VCCRegBankID; |
| 3975 | BankLHS = AMDGPU::VCCRegBankID; |
| 3976 | BankRHS = AMDGPU::VCCRegBankID; |
| 3977 | } else { |
| 3978 | BankLHS = getRegBankID(Reg: MI.getOperand(i: 1).getReg(), MRI, |
| 3979 | Default: AMDGPU::SGPRRegBankID); |
| 3980 | BankRHS = getRegBankID(Reg: MI.getOperand(i: 2).getReg(), MRI, |
| 3981 | Default: AMDGPU::SGPRRegBankID); |
| 3982 | } |
| 3983 | } else { |
| 3984 | BankLHS = getRegBankID(Reg: MI.getOperand(i: 1).getReg(), MRI, |
| 3985 | Default: AMDGPU::VCCRegBankID); |
| 3986 | BankRHS = getRegBankID(Reg: MI.getOperand(i: 2).getReg(), MRI, |
| 3987 | Default: AMDGPU::VCCRegBankID); |
| 3988 | |
| 3989 | // Both inputs should be true booleans to produce a boolean result. |
| 3990 | if (BankLHS == AMDGPU::VGPRRegBankID || BankRHS == AMDGPU::VGPRRegBankID) { |
| 3991 | TargetBankID = AMDGPU::VGPRRegBankID; |
| 3992 | } else if (BankLHS == AMDGPU::VCCRegBankID || BankRHS == AMDGPU::VCCRegBankID) { |
| 3993 | TargetBankID = AMDGPU::VCCRegBankID; |
| 3994 | BankLHS = AMDGPU::VCCRegBankID; |
| 3995 | BankRHS = AMDGPU::VCCRegBankID; |
| 3996 | } else if (BankLHS == AMDGPU::SGPRRegBankID && BankRHS == AMDGPU::SGPRRegBankID) { |
| 3997 | TargetBankID = AMDGPU::SGPRRegBankID; |
| 3998 | } |
| 3999 | } |
| 4000 | |
| 4001 | OpdsMapping[0] = AMDGPU::getValueMapping(BankID: TargetBankID, Size); |
| 4002 | OpdsMapping[1] = AMDGPU::getValueMapping(BankID: BankLHS, Size); |
| 4003 | OpdsMapping[2] = AMDGPU::getValueMapping(BankID: BankRHS, Size); |
| 4004 | break; |
| 4005 | } |
| 4006 | |
| 4007 | if (Size == 64) { |
| 4009 | if (isSALUMapping(MI)) { |
| 4010 | OpdsMapping[0] = getValueMappingSGPR64Only(BankID: AMDGPU::SGPRRegBankID, Size); |
| 4011 | OpdsMapping[1] = OpdsMapping[2] = OpdsMapping[0]; |
| 4012 | } else { |
| 4013 | if (MI.getOpcode() == AMDGPU::G_MUL && Subtarget.hasVectorMulU64()) |
| 4014 | OpdsMapping[0] = AMDGPU::getValueMapping(BankID: AMDGPU::VGPRRegBankID, Size); |
| 4015 | else |
| 4016 | OpdsMapping[0] = |
| 4017 | getValueMappingSGPR64Only(BankID: AMDGPU::VGPRRegBankID, Size); |
| 4018 | unsigned Bank1 = getRegBankID(Reg: MI.getOperand(i: 1).getReg(), MRI /*, DefaultBankID*/); |
| 4019 | OpdsMapping[1] = AMDGPU::getValueMapping(BankID: Bank1, Size); |
| 4020 | |
| 4021 | unsigned Bank2 = getRegBankID(Reg: MI.getOperand(i: 2).getReg(), MRI /*, DefaultBankID*/); |
| 4022 | OpdsMapping[2] = AMDGPU::getValueMapping(BankID: Bank2, Size); |
| 4023 | } |
| 4024 | |
| 4025 | break; |
| 4026 | } |
| 4027 | |
| 4028 | [[fallthrough]]; |
| 4029 | } |
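|      | // Worked example for the s1 case above (assumed, not from a test): if the |
|      | // destination is already constrained to vcc, both sources are forced to vcc |
|      | // so the whole op stays a lane-mask operation; if the result is |
|      | // unconstrained and neither operand is a true boolean (both plain sgpr), the |
|      | // op is kept on the SGPR bank instead. |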
| 4030 | case AMDGPU::G_PTR_ADD: |
| 4031 | case AMDGPU::G_PTRMASK: |
| 4032 | case AMDGPU::G_ADD: |
| 4033 | case AMDGPU::G_SUB: |
| 4034 | case AMDGPU::G_SHL: |
| 4035 | case AMDGPU::G_LSHR: |
| 4036 | case AMDGPU::G_ASHR: |
| 4037 | case AMDGPU::G_UADDO: |
| 4038 | case AMDGPU::G_USUBO: |
| 4039 | case AMDGPU::G_UADDE: |
| 4040 | case AMDGPU::G_SADDE: |
| 4041 | case AMDGPU::G_USUBE: |
| 4042 | case AMDGPU::G_SSUBE: |
| 4043 | case AMDGPU::G_ABS: |
| 4044 | case AMDGPU::G_SHUFFLE_VECTOR: |
| 4045 | case AMDGPU::G_SBFX: |
| 4046 | case AMDGPU::G_UBFX: |
| 4047 | case AMDGPU::G_AMDGPU_S_MUL_I64_I32: |
| 4048 | case AMDGPU::G_AMDGPU_S_MUL_U64_U32: |
| 4049 | if (isSALUMapping(MI)) |
| 4050 | return getDefaultMappingSOP(MI); |
| 4051 | return getDefaultMappingVOP(MI); |
| 4052 | case AMDGPU::G_SMIN: |
| 4053 | case AMDGPU::G_SMAX: |
| 4054 | case AMDGPU::G_UMIN: |
| 4055 | case AMDGPU::G_UMAX: |
| 4056 | if (isSALUMapping(MI)) { |
| 4057 | // There are no scalar 64-bit min and max instructions; use the vector form instead. |
| 4058 | if (MRI.getType(Reg: MI.getOperand(i: 0).getReg()).getSizeInBits() == 64 && |
| 4059 | Subtarget.hasIntMinMax64()) |
| 4060 | return getDefaultMappingVOP(MI); |
| 4061 | return getDefaultMappingSOP(MI); |
| 4062 | } |
| 4063 | return getDefaultMappingVOP(MI); |
| 4064 | case AMDGPU::G_FADD: |
| 4065 | case AMDGPU::G_FSUB: |
| 4066 | case AMDGPU::G_FMUL: |
| 4067 | case AMDGPU::G_FMA: |
| 4068 | case AMDGPU::G_FFLOOR: |
| 4069 | case AMDGPU::G_FCEIL: |
| 4070 | case AMDGPU::G_INTRINSIC_ROUNDEVEN: |
| 4071 | case AMDGPU::G_FMINNUM: |
| 4072 | case AMDGPU::G_FMAXNUM: |
| 4073 | case AMDGPU::G_FMINIMUM: |
| 4074 | case AMDGPU::G_FMAXIMUM: |
| 4075 | case AMDGPU::G_FMINIMUMNUM: |
| 4076 | case AMDGPU::G_FMAXIMUMNUM: |
| 4077 | case AMDGPU::G_INTRINSIC_TRUNC: |
| 4078 | case AMDGPU::G_STRICT_FADD: |
| 4079 | case AMDGPU::G_STRICT_FSUB: |
| 4080 | case AMDGPU::G_STRICT_FMUL: |
| 4081 | case AMDGPU::G_STRICT_FMA: { |
| 4082 | LLT Ty = MRI.getType(Reg: MI.getOperand(i: 0).getReg()); |
| 4083 | unsigned Size = Ty.getSizeInBits(); |
| 4084 | if (Subtarget.hasSALUFloatInsts() && Ty.isScalar() && |
| 4085 | (Size == 32 || Size == 16) && isSALUMapping(MI)) |
| 4086 | return getDefaultMappingSOP(MI); |
| 4087 | return getDefaultMappingVOP(MI); |
| 4088 | } |
| 4089 | case AMDGPU::G_FPTOSI: |
| 4090 | case AMDGPU::G_FPTOUI: |
| 4091 | case AMDGPU::G_FPTOSI_SAT: |
| 4092 | case AMDGPU::G_FPTOUI_SAT: |
| 4093 | case AMDGPU::G_SITOFP: |
| 4094 | case AMDGPU::G_UITOFP: { |
| 4095 | unsigned SizeDst = MRI.getType(Reg: MI.getOperand(i: 0).getReg()).getSizeInBits(); |
| 4096 | unsigned SizeSrc = MRI.getType(Reg: MI.getOperand(i: 1).getReg()).getSizeInBits(); |
| 4097 | if (Subtarget.hasSALUFloatInsts() && SizeDst == 32 && SizeSrc == 32 && |
| 4098 | isSALUMapping(MI)) |
| 4099 | return getDefaultMappingSOP(MI); |
| 4100 | return getDefaultMappingVOP(MI); |
| 4101 | } |
| 4102 | case AMDGPU::G_FPTRUNC: |
| 4103 | case AMDGPU::G_FPEXT: { |
| 4104 | unsigned SizeDst = MRI.getType(Reg: MI.getOperand(i: 0).getReg()).getSizeInBits(); |
| 4105 | unsigned SizeSrc = MRI.getType(Reg: MI.getOperand(i: 1).getReg()).getSizeInBits(); |
| 4106 | if (Subtarget.hasSALUFloatInsts() && SizeDst != 64 && SizeSrc != 64 && |
| 4107 | isSALUMapping(MI)) |
| 4108 | return getDefaultMappingSOP(MI); |
| 4109 | return getDefaultMappingVOP(MI); |
| 4110 | } |
| 4111 | case AMDGPU::G_FSQRT: |
| 4112 | case AMDGPU::G_FEXP2: |
| 4113 | case AMDGPU::G_FLOG2: { |
| 4114 | unsigned Size = MRI.getType(Reg: MI.getOperand(i: 0).getReg()).getSizeInBits(); |
| 4115 | if (Subtarget.hasPseudoScalarTrans() && (Size == 16 || Size == 32) && |
| 4116 | isSALUMapping(MI)) |
| 4117 | return getDefaultMappingSOP(MI); |
| 4118 | return getDefaultMappingVOP(MI); |
| 4119 | } |
| 4120 | case AMDGPU::G_SADDSAT: // FIXME: Could lower sat ops for SALU |
| 4121 | case AMDGPU::G_SSUBSAT: |
| 4122 | case AMDGPU::G_UADDSAT: |
| 4123 | case AMDGPU::G_USUBSAT: |
| 4124 | case AMDGPU::G_FMAD: |
| 4125 | case AMDGPU::G_FLDEXP: |
| 4126 | case AMDGPU::G_FMINNUM_IEEE: |
| 4127 | case AMDGPU::G_FMAXNUM_IEEE: |
| 4128 | case AMDGPU::G_FCANONICALIZE: |
| 4129 | case AMDGPU::G_STRICT_FLDEXP: |
| 4130 | case AMDGPU::G_BSWAP: // TODO: Somehow expand for scalar? |
| 4131 | case AMDGPU::G_FSHR: // TODO: Expand for scalar |
| 4132 | case AMDGPU::G_AMDGPU_FMIN_LEGACY: |
| 4133 | case AMDGPU::G_AMDGPU_FMAX_LEGACY: |
| 4134 | case AMDGPU::G_AMDGPU_RCP_IFLAG: |
| 4135 | case AMDGPU::G_AMDGPU_CVT_F32_UBYTE0: |
| 4136 | case AMDGPU::G_AMDGPU_CVT_F32_UBYTE1: |
| 4137 | case AMDGPU::G_AMDGPU_CVT_F32_UBYTE2: |
| 4138 | case AMDGPU::G_AMDGPU_CVT_F32_UBYTE3: |
| 4139 | case AMDGPU::G_AMDGPU_CVT_PK_I16_I32: |
| 4140 | case AMDGPU::G_AMDGPU_SMED3: |
| 4141 | case AMDGPU::G_AMDGPU_FMED3: |
| 4142 | return getDefaultMappingVOP(MI); |
| 4143 | case AMDGPU::G_UMULH: |
| 4144 | case AMDGPU::G_SMULH: { |
| 4145 | if (Subtarget.hasScalarMulHiInsts() && isSALUMapping(MI)) |
| 4146 | return getDefaultMappingSOP(MI); |
| 4147 | return getDefaultMappingVOP(MI); |
| 4148 | } |
| 4149 | case AMDGPU::G_AMDGPU_MAD_U64_U32: |
| 4150 | case AMDGPU::G_AMDGPU_MAD_I64_I32: { |
| 4151 | // Three possible mappings: |
| 4152 | // |
| 4153 | // - Default SOP |
| 4154 | // - Default VOP |
| 4155 | // - Scalar multiply: src0 and src1 are SGPRs, the rest is VOP. |
| 4156 | // |
| 4157 | // This allows instruction selection to keep the multiplication part of the |
| 4158 | // instruction on the SALU. |
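|      | // A rough picture of the third (mixed) mapping, using assumed operand order |
|      | // (dst, carry-out, src0, src1, src2): src0/src1 stay sgpr for the scalar |
|      | // multiply, while the 64-bit accumulator input and the result are vgpr and |
|      | // the carry-out is a vcc lane mask, matching the mapping built below. |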
| 4159 | bool AllSalu = true; |
| 4160 | bool MulSalu = true; |
| 4161 | for (unsigned i = 0; i < 5; ++i) { |
| 4162 | Register Reg = MI.getOperand(i).getReg(); |
| 4163 | if (const RegisterBank *Bank = getRegBank(Reg, MRI, TRI: *TRI)) { |
| 4164 | if (Bank->getID() != AMDGPU::SGPRRegBankID) { |
| 4165 | AllSalu = false; |
| 4166 | if (i == 2 || i == 3) { |
| 4167 | MulSalu = false; |
| 4168 | break; |
| 4169 | } |
| 4170 | } |
| 4171 | } |
| 4172 | } |
| 4173 | |
| 4174 | if (AllSalu) |
| 4175 | return getDefaultMappingSOP(MI); |
| 4176 | |
| 4177 | // If the multiply-add is full-rate in VALU, use that even if the |
| 4178 | // multiplication part is scalar. Accumulating separately on the VALU would |
| 4179 | // take two instructions. |
| 4180 | if (!MulSalu || Subtarget.hasFullRate64Ops()) |
| 4181 | return getDefaultMappingVOP(MI); |
| 4182 | |
| 4183 | // Keep the multiplication on the SALU, then accumulate on the VALU. |
| 4184 | OpdsMapping[0] = AMDGPU::getValueMapping(BankID: AMDGPU::VGPRRegBankID, Size: 64); |
| 4185 | OpdsMapping[1] = AMDGPU::getValueMapping(BankID: AMDGPU::VCCRegBankID, Size: 1); |
| 4186 | OpdsMapping[2] = AMDGPU::getValueMapping(BankID: AMDGPU::SGPRRegBankID, Size: 32); |
| 4187 | OpdsMapping[3] = AMDGPU::getValueMapping(BankID: AMDGPU::SGPRRegBankID, Size: 32); |
| 4188 | OpdsMapping[4] = AMDGPU::getValueMapping(BankID: AMDGPU::VGPRRegBankID, Size: 64); |
| 4189 | break; |
| 4190 | } |
| 4191 | case AMDGPU::G_IMPLICIT_DEF: { |
| 4192 | unsigned Size = MRI.getType(Reg: MI.getOperand(i: 0).getReg()).getSizeInBits(); |
| 4193 | OpdsMapping[0] = AMDGPU::getValueMapping(BankID: AMDGPU::SGPRRegBankID, Size); |
| 4194 | break; |
| 4195 | } |
| 4196 | case AMDGPU::G_FCONSTANT: |
| 4197 | case AMDGPU::G_CONSTANT: |
| 4198 | case AMDGPU::G_GLOBAL_VALUE: |
| 4199 | case AMDGPU::G_FRAME_INDEX: |
| 4200 | case AMDGPU::G_BLOCK_ADDR: |
| 4201 | case AMDGPU::G_READSTEADYCOUNTER: |
| 4202 | case AMDGPU::G_READCYCLECOUNTER: { |
| 4203 | unsigned Size = MRI.getType(Reg: MI.getOperand(i: 0).getReg()).getSizeInBits(); |
| 4204 | OpdsMapping[0] = AMDGPU::getValueMapping(BankID: AMDGPU::SGPRRegBankID, Size); |
| 4205 | break; |
| 4206 | } |
| 4207 | case AMDGPU::G_DYN_STACKALLOC: { |
| 4208 | // Result is always uniform, and a wave reduction is needed for the source. |
| 4209 | OpdsMapping[0] = AMDGPU::getValueMapping(BankID: AMDGPU::SGPRRegBankID, Size: 32); |
| 4210 | unsigned SrcBankID = getRegBankID(Reg: MI.getOperand(i: 1).getReg(), MRI); |
| 4211 | OpdsMapping[1] = AMDGPU::getValueMapping(BankID: SrcBankID, Size: 32); |
| 4212 | break; |
| 4213 | } |
| 4214 | case AMDGPU::G_AMDGPU_WAVE_ADDRESS: { |
| 4215 | // This case is weird because we expect a physical register in the source, |
| 4216 | // but need to set a bank anyway. |
| 4217 | // |
| 4218 | // TODO: We could select the result to SGPR or VGPR |
| 4219 | OpdsMapping[0] = AMDGPU::getValueMapping(BankID: AMDGPU::SGPRRegBankID, Size: 32); |
| 4220 | OpdsMapping[1] = AMDGPU::getValueMapping(BankID: AMDGPU::SGPRRegBankID, Size: 32); |
| 4221 | break; |
| 4222 | } |
| 4223 | case AMDGPU::G_INSERT: { |
| 4224 | unsigned BankID = getMappingType(MRI, MI); |
| 4225 | unsigned DstSize = getSizeInBits(Reg: MI.getOperand(i: 0).getReg(), MRI, TRI: *TRI); |
| 4226 | unsigned SrcSize = getSizeInBits(Reg: MI.getOperand(i: 1).getReg(), MRI, TRI: *TRI); |
| 4227 | unsigned EltSize = getSizeInBits(Reg: MI.getOperand(i: 2).getReg(), MRI, TRI: *TRI); |
| 4228 | OpdsMapping[0] = AMDGPU::getValueMapping(BankID, Size: DstSize); |
| 4229 | OpdsMapping[1] = AMDGPU::getValueMapping(BankID, Size: SrcSize); |
| 4230 | OpdsMapping[2] = AMDGPU::getValueMapping(BankID, Size: EltSize); |
| 4231 | OpdsMapping[3] = nullptr; |
| 4232 | break; |
| 4233 | } |
| 4234 | case AMDGPU::G_EXTRACT: { |
| 4235 | unsigned BankID = getRegBankID(Reg: MI.getOperand(i: 1).getReg(), MRI); |
| 4236 | unsigned DstSize = getSizeInBits(Reg: MI.getOperand(i: 0).getReg(), MRI, TRI: *TRI); |
| 4237 | unsigned SrcSize = getSizeInBits(Reg: MI.getOperand(i: 1).getReg(), MRI, TRI: *TRI); |
| 4238 | OpdsMapping[0] = AMDGPU::getValueMapping(BankID, Size: DstSize); |
| 4239 | OpdsMapping[1] = AMDGPU::getValueMapping(BankID, Size: SrcSize); |
| 4240 | OpdsMapping[2] = nullptr; |
| 4241 | break; |
| 4242 | } |
| 4243 | case AMDGPU::G_BUILD_VECTOR: |
| 4244 | case AMDGPU::G_BUILD_VECTOR_TRUNC: { |
| 4245 | LLT DstTy = MRI.getType(Reg: MI.getOperand(i: 0).getReg()); |
| 4246 | if (DstTy == LLT::fixed_vector(NumElements: 2, ScalarSizeInBits: 16)) { |
| 4247 | unsigned DstSize = DstTy.getSizeInBits(); |
| 4248 | unsigned SrcSize = MRI.getType(Reg: MI.getOperand(i: 1).getReg()).getSizeInBits(); |
| 4249 | unsigned Src0BankID = getRegBankID(Reg: MI.getOperand(i: 1).getReg(), MRI); |
| 4250 | unsigned Src1BankID = getRegBankID(Reg: MI.getOperand(i: 2).getReg(), MRI); |
| 4251 | unsigned DstBankID = regBankUnion(RB0: Src0BankID, RB1: Src1BankID); |
| 4252 | |
| 4253 | OpdsMapping[0] = AMDGPU::getValueMapping(BankID: DstBankID, Size: DstSize); |
| 4254 | OpdsMapping[1] = AMDGPU::getValueMapping(BankID: Src0BankID, Size: SrcSize); |
| 4255 | OpdsMapping[2] = AMDGPU::getValueMapping(BankID: Src1BankID, Size: SrcSize); |
| 4256 | break; |
| 4257 | } |
| 4258 | |
| 4259 | [[fallthrough]]; |
| 4260 | } |
| 4261 | case AMDGPU::G_MERGE_VALUES: |
| 4262 | case AMDGPU::G_CONCAT_VECTORS: { |
| 4263 | unsigned Bank = getMappingType(MRI, MI); |
| 4264 | unsigned DstSize = MRI.getType(Reg: MI.getOperand(i: 0).getReg()).getSizeInBits(); |
| 4265 | unsigned SrcSize = MRI.getType(Reg: MI.getOperand(i: 1).getReg()).getSizeInBits(); |
| 4266 | |
| 4267 | OpdsMapping[0] = AMDGPU::getValueMapping(BankID: Bank, Size: DstSize); |
| 4268 | // Op1 and Dst should use the same register bank. |
| 4269 | for (unsigned i = 1, e = MI.getNumOperands(); i != e; ++i) |
| 4270 | OpdsMapping[i] = AMDGPU::getValueMapping(BankID: Bank, Size: SrcSize); |
| 4271 | break; |
| 4272 | } |
| 4273 | case AMDGPU::G_BITREVERSE: |
| 4274 | case AMDGPU::G_BITCAST: |
| 4275 | case AMDGPU::G_INTTOPTR: |
| 4276 | case AMDGPU::G_PTRTOINT: |
| 4277 | case AMDGPU::G_FABS: |
| 4278 | case AMDGPU::G_FNEG: { |
| 4279 | unsigned Size = MRI.getType(Reg: MI.getOperand(i: 0).getReg()).getSizeInBits(); |
| 4280 | unsigned BankID = getRegBankID(Reg: MI.getOperand(i: 1).getReg(), MRI); |
| 4281 | OpdsMapping[0] = OpdsMapping[1] = AMDGPU::getValueMapping(BankID, Size); |
| 4282 | break; |
| 4283 | } |
| 4284 | case AMDGPU::G_AMDGPU_FFBH_U32: |
| 4285 | case AMDGPU::G_AMDGPU_FFBL_B32: |
| 4286 | case AMDGPU::G_CTLZ_ZERO_UNDEF: |
| 4287 | case AMDGPU::G_CTTZ_ZERO_UNDEF: { |
| 4288 | unsigned Size = MRI.getType(Reg: MI.getOperand(i: 1).getReg()).getSizeInBits(); |
| 4289 | unsigned BankID = getRegBankID(Reg: MI.getOperand(i: 1).getReg(), MRI); |
| 4290 | OpdsMapping[0] = AMDGPU::getValueMapping(BankID, Size: 32); |
| 4291 | OpdsMapping[1] = AMDGPU::getValueMappingSGPR64Only(BankID, Size); |
| 4292 | break; |
| 4293 | } |
| 4294 | case AMDGPU::G_CTPOP: { |
| 4295 | unsigned Size = MRI.getType(Reg: MI.getOperand(i: 1).getReg()).getSizeInBits(); |
| 4296 | unsigned BankID = getRegBankID(Reg: MI.getOperand(i: 1).getReg(), MRI); |
| 4297 | OpdsMapping[0] = AMDGPU::getValueMapping(BankID, Size: 32); |
| 4298 | |
| 4299 | // This should really be getValueMappingSGPR64Only, but allowing the generic |
| 4300 | // code to handle the register split just makes using LegalizerHelper more |
| 4301 | // difficult. |
| 4302 | OpdsMapping[1] = AMDGPU::getValueMapping(BankID, Size); |
| 4303 | break; |
| 4304 | } |
| 4305 | case AMDGPU::G_TRUNC: { |
| 4306 | Register Dst = MI.getOperand(i: 0).getReg(); |
| 4307 | Register Src = MI.getOperand(i: 1).getReg(); |
| 4308 | unsigned Bank = getRegBankID(Reg: Src, MRI); |
| 4309 | unsigned DstSize = getSizeInBits(Reg: Dst, MRI, TRI: *TRI); |
| 4310 | unsigned SrcSize = getSizeInBits(Reg: Src, MRI, TRI: *TRI); |
| 4311 | OpdsMapping[0] = AMDGPU::getValueMapping(BankID: Bank, Size: DstSize); |
| 4312 | OpdsMapping[1] = AMDGPU::getValueMapping(BankID: Bank, Size: SrcSize); |
| 4313 | break; |
| 4314 | } |
| 4315 | case AMDGPU::G_ZEXT: |
| 4316 | case AMDGPU::G_SEXT: |
| 4317 | case AMDGPU::G_ANYEXT: |
| 4318 | case AMDGPU::G_SEXT_INREG: { |
| 4319 | Register Dst = MI.getOperand(i: 0).getReg(); |
| 4320 | Register Src = MI.getOperand(i: 1).getReg(); |
| 4321 | unsigned DstSize = getSizeInBits(Reg: Dst, MRI, TRI: *TRI); |
| 4322 | unsigned SrcSize = getSizeInBits(Reg: Src, MRI, TRI: *TRI); |
| 4323 | |
| 4324 | unsigned DstBank; |
| 4325 | const RegisterBank *SrcBank = getRegBank(Reg: Src, MRI, TRI: *TRI); |
| 4326 | assert(SrcBank); |
| 4327 | switch (SrcBank->getID()) { |
| 4328 | case AMDGPU::SGPRRegBankID: |
| 4329 | DstBank = AMDGPU::SGPRRegBankID; |
| 4330 | break; |
| 4331 | default: |
| 4332 | DstBank = AMDGPU::VGPRRegBankID; |
| 4333 | break; |
| 4334 | } |
| 4335 | |
| 4336 | // Scalar extend can use 64-bit BFE, but VGPRs require extending to |
| 4337 | // 32-bits, and then to 64. |
| 4338 | OpdsMapping[0] = AMDGPU::getValueMappingSGPR64Only(BankID: DstBank, Size: DstSize); |
| 4339 | OpdsMapping[1] = AMDGPU::getValueMappingSGPR64Only(BankID: SrcBank->getID(), |
| 4340 | Size: SrcSize); |
| 4341 | break; |
| 4342 | } |
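|      | // Illustrative consequence (not from a test): a uniform s32 -> s64 G_SEXT can |
|      | // stay on the SGPR bank as a single 64-bit operation, whereas the same extend |
|      | // of a vgpr value is described with the split mapping so it can be selected |
|      | // as a 32-bit extend plus construction of the high half. |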
| 4343 | case AMDGPU::G_IS_FPCLASS: { |
| 4344 | Register SrcReg = MI.getOperand(i: 1).getReg(); |
| 4345 | unsigned SrcSize = MRI.getType(Reg: SrcReg).getSizeInBits(); |
| 4346 | unsigned DstSize = MRI.getType(Reg: MI.getOperand(i: 0).getReg()).getSizeInBits(); |
| 4347 | OpdsMapping[0] = AMDGPU::getValueMapping(BankID: AMDGPU::VCCRegBankID, Size: DstSize); |
| 4348 | OpdsMapping[1] = AMDGPU::getValueMapping(BankID: AMDGPU::VGPRRegBankID, Size: SrcSize); |
| 4349 | break; |
| 4350 | } |
| 4351 | case AMDGPU::G_STORE: { |
| 4352 | assert(MI.getOperand(0).isReg()); |
| 4353 | unsigned Size = MRI.getType(Reg: MI.getOperand(i: 0).getReg()).getSizeInBits(); |
| 4354 | |
| 4355 | // FIXME: We need to specify a different reg bank once scalar stores are |
| 4356 | // supported. |
| 4357 | const ValueMapping *ValMapping = |
| 4358 | AMDGPU::getValueMapping(BankID: AMDGPU::VGPRRegBankID, Size); |
| 4359 | OpdsMapping[0] = ValMapping; |
| 4360 | OpdsMapping[1] = getValueMappingForPtr(MRI, PtrReg: MI.getOperand(i: 1).getReg()); |
| 4361 | break; |
| 4362 | } |
| 4363 | case AMDGPU::G_ICMP: |
| 4364 | case AMDGPU::G_FCMP: { |
| 4365 | unsigned Size = MRI.getType(Reg: MI.getOperand(i: 2).getReg()).getSizeInBits(); |
| 4366 | |
| 4367 | // See if the result register has already been constrained to vcc, which may |
| 4368 | // happen due to control flow intrinsic lowering. |
| 4369 | unsigned DstBank = getRegBankID(Reg: MI.getOperand(i: 0).getReg(), MRI, |
| 4370 | Default: AMDGPU::SGPRRegBankID); |
| 4371 | unsigned Op2Bank = getRegBankID(Reg: MI.getOperand(i: 2).getReg(), MRI); |
| 4372 | unsigned Op3Bank = getRegBankID(Reg: MI.getOperand(i: 3).getReg(), MRI); |
| 4373 | |
| 4374 | auto canUseSCCICMP = [&]() { |
| 4375 | auto Pred = |
| 4376 | static_cast<CmpInst::Predicate>(MI.getOperand(i: 1).getPredicate()); |
| 4377 | return Size == 32 || |
| 4378 | (Size == 64 && |
| 4379 | (Pred == CmpInst::ICMP_EQ || Pred == CmpInst::ICMP_NE) && |
| 4380 | Subtarget.hasScalarCompareEq64()); |
| 4381 | }; |
| 4382 | auto canUseSCCFCMP = [&]() { |
| 4383 | return Subtarget.hasSALUFloatInsts() && (Size == 32 || Size == 16); |
| 4384 | }; |
| 4385 | |
| 4386 | bool isICMP = MI.getOpcode() == AMDGPU::G_ICMP; |
| 4387 | bool CanUseSCC = DstBank == AMDGPU::SGPRRegBankID && |
| 4388 | Op2Bank == AMDGPU::SGPRRegBankID && |
| 4389 | Op3Bank == AMDGPU::SGPRRegBankID && |
| 4390 | (isICMP ? canUseSCCICMP() : canUseSCCFCMP()); |
| 4391 | |
| 4392 | DstBank = CanUseSCC ? AMDGPU::SGPRRegBankID : AMDGPU::VCCRegBankID; |
| 4393 | unsigned SrcBank = CanUseSCC ? AMDGPU::SGPRRegBankID : AMDGPU::VGPRRegBankID; |
| 4394 | |
| 4395 | // TODO: Use 32-bit for scalar output size. |
| 4396 | // SCC results will need to be copied to a 32-bit SGPR virtual register. |
| 4397 | const unsigned ResultSize = 1; |
| 4398 | |
| 4399 | OpdsMapping[0] = AMDGPU::getValueMapping(BankID: DstBank, Size: ResultSize); |
| 4400 | OpdsMapping[1] = nullptr; // Predicate Operand. |
| 4401 | OpdsMapping[2] = AMDGPU::getValueMapping(BankID: SrcBank, Size); |
| 4402 | OpdsMapping[3] = AMDGPU::getValueMapping(BankID: SrcBank, Size); |
| 4403 | break; |
| 4404 | } |
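|      | // Example of the two outcomes (assumed MIR): with uniform 32-bit operands the |
|      | // compare can stay scalar, roughly |
|      | //   %c:sgpr(s1) = G_ICMP intpred(eq), %a:sgpr(s32), %b:sgpr(s32) |
|      | // while divergent operands (or unsupported scalar cases) yield a vcc lane |
|      | // mask with vgpr sources. |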
| 4405 | case AMDGPU::G_EXTRACT_VECTOR_ELT: { |
| 4406 | // A VGPR index can be handled with a waterfall loop when indexing an SGPR vector. |
| 4407 | unsigned SrcBankID = getRegBankID(Reg: MI.getOperand(i: 1).getReg(), MRI); |
| 4408 | unsigned DstSize = MRI.getType(Reg: MI.getOperand(i: 0).getReg()).getSizeInBits(); |
| 4409 | unsigned SrcSize = MRI.getType(Reg: MI.getOperand(i: 1).getReg()).getSizeInBits(); |
| 4410 | unsigned IdxSize = MRI.getType(Reg: MI.getOperand(i: 2).getReg()).getSizeInBits(); |
| 4411 | unsigned IdxBank = getRegBankID(Reg: MI.getOperand(i: 2).getReg(), MRI); |
| 4412 | unsigned OutputBankID = regBankUnion(RB0: SrcBankID, RB1: IdxBank); |
| 4413 | |
| 4414 | OpdsMapping[0] = AMDGPU::getValueMappingSGPR64Only(BankID: OutputBankID, Size: DstSize); |
| 4415 | OpdsMapping[1] = AMDGPU::getValueMapping(BankID: SrcBankID, Size: SrcSize); |
| 4416 | |
| 4417 | // The index can be in either bank if the source vector is in VGPRs. |
| 4418 | OpdsMapping[2] = AMDGPU::getValueMapping(BankID: IdxBank, Size: IdxSize); |
| 4419 | break; |
| 4420 | } |
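|      | // Sketch (illustrative): extracting an element from an sgpr vector with a |
|      | // vgpr index is still reported as legal here; the expectation is that the |
|      | // apply step handles it (e.g. with a waterfall loop over the index) rather |
|      | // than this mapping forcing an illegal VGPR->SGPR copy. |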
| 4421 | case AMDGPU::G_INSERT_VECTOR_ELT: { |
| 4422 | unsigned OutputBankID = isSALUMapping(MI) ? |
| 4423 | AMDGPU::SGPRRegBankID : AMDGPU::VGPRRegBankID; |
| 4424 | |
| 4425 | unsigned VecSize = MRI.getType(Reg: MI.getOperand(i: 0).getReg()).getSizeInBits(); |
| 4426 | unsigned InsertSize = MRI.getType(Reg: MI.getOperand(i: 2).getReg()).getSizeInBits(); |
| 4427 | unsigned IdxSize = MRI.getType(Reg: MI.getOperand(i: 3).getReg()).getSizeInBits(); |
| 4428 | unsigned InsertEltBankID = getRegBankID(Reg: MI.getOperand(i: 2).getReg(), MRI); |
| 4429 | unsigned IdxBankID = getRegBankID(Reg: MI.getOperand(i: 3).getReg(), MRI); |
| 4430 | |
| 4431 | OpdsMapping[0] = AMDGPU::getValueMapping(BankID: OutputBankID, Size: VecSize); |
| 4432 | OpdsMapping[1] = AMDGPU::getValueMapping(BankID: OutputBankID, Size: VecSize); |
| 4433 | |
| 4434 | // This is a weird case, because we need to break down the mapping based on |
| 4435 | // the register bank of a different operand. |
| 4436 | if (InsertSize == 64 && OutputBankID == AMDGPU::VGPRRegBankID) { |
| 4437 | OpdsMapping[2] = AMDGPU::getValueMappingSplit64(BankID: InsertEltBankID, |
| 4438 | Size: InsertSize); |
| 4439 | } else { |
| 4440 | assert(InsertSize == 32 || InsertSize == 64); |
| 4441 | OpdsMapping[2] = AMDGPU::getValueMapping(BankID: InsertEltBankID, Size: InsertSize); |
| 4442 | } |
| 4443 | |
| 4444 | // The index can be in either bank if the source vector is in VGPRs. |
| 4445 | OpdsMapping[3] = AMDGPU::getValueMapping(BankID: IdxBankID, Size: IdxSize); |
| 4446 | break; |
| 4447 | } |
| 4448 | case AMDGPU::G_UNMERGE_VALUES: { |
| 4449 | unsigned Bank = getMappingType(MRI, MI); |
| 4450 | |
| 4451 | // Op1 and Dst should use the same register bank. |
| 4452 | // FIXME: Shouldn't this be the default? Why do we need to handle this? |
| 4453 | for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) { |
| 4454 | unsigned Size = getSizeInBits(Reg: MI.getOperand(i).getReg(), MRI, TRI: *TRI); |
| 4455 | OpdsMapping[i] = AMDGPU::getValueMapping(BankID: Bank, Size); |
| 4456 | } |
| 4457 | break; |
| 4458 | } |
| 4459 | case AMDGPU::G_AMDGPU_BUFFER_LOAD: |
| 4460 | case AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE: |
| 4461 | case AMDGPU::G_AMDGPU_BUFFER_LOAD_SBYTE: |
| 4462 | case AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT: |
| 4463 | case AMDGPU::G_AMDGPU_BUFFER_LOAD_SSHORT: |
| 4464 | case AMDGPU::G_AMDGPU_BUFFER_LOAD_TFE: |
| 4465 | case AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE_TFE: |
| 4466 | case AMDGPU::G_AMDGPU_BUFFER_LOAD_SBYTE_TFE: |
| 4467 | case AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT_TFE: |
| 4468 | case AMDGPU::G_AMDGPU_BUFFER_LOAD_SSHORT_TFE: |
| 4469 | case AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT: |
| 4470 | case AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_TFE: |
| 4471 | case AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_D16: |
| 4472 | case AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT: |
| 4473 | case AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT_D16: |
| 4474 | case AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT: |
| 4475 | case AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT_D16: |
| 4476 | case AMDGPU::G_AMDGPU_BUFFER_STORE: |
| 4477 | case AMDGPU::G_AMDGPU_BUFFER_STORE_BYTE: |
| 4478 | case AMDGPU::G_AMDGPU_BUFFER_STORE_SHORT: |
| 4479 | case AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT: |
| 4480 | case AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT_D16: { |
| 4481 | OpdsMapping[0] = getVGPROpMapping(Reg: MI.getOperand(i: 0).getReg(), MRI, TRI: *TRI); |
| 4482 | |
| 4483 | // rsrc |
| 4484 | OpdsMapping[1] = getSGPROpMapping(Reg: MI.getOperand(i: 1).getReg(), MRI, TRI: *TRI); |
| 4485 | |
| 4486 | // vindex |
| 4487 | OpdsMapping[2] = getVGPROpMapping(Reg: MI.getOperand(i: 2).getReg(), MRI, TRI: *TRI); |
| 4488 | |
| 4489 | // voffset |
| 4490 | OpdsMapping[3] = getVGPROpMapping(Reg: MI.getOperand(i: 3).getReg(), MRI, TRI: *TRI); |
| 4491 | |
| 4492 | // soffset |
| 4493 | OpdsMapping[4] = getSGPROpMapping(Reg: MI.getOperand(i: 4).getReg(), MRI, TRI: *TRI); |
| 4494 | |
| 4495 | // Any remaining operands are immediates and were correctly null |
| 4496 | // initialized. |
| 4497 | break; |
| 4498 | } |
| 4499 | case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SWAP: |
| 4500 | case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_ADD: |
| 4501 | case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB: |
| 4502 | case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMIN: |
| 4503 | case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMIN: |
| 4504 | case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMAX: |
| 4505 | case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMAX: |
| 4506 | case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_AND: |
| 4507 | case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_OR: |
| 4508 | case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_XOR: |
| 4509 | case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC: |
| 4510 | case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC: |
| 4511 | case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB_CLAMP_U32: |
| 4512 | case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_COND_SUB_U32: |
| 4513 | case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD: |
| 4514 | case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMIN: |
| 4515 | case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMAX: { |
| 4516 | // vdata_out |
| 4517 | OpdsMapping[0] = getVGPROpMapping(Reg: MI.getOperand(i: 0).getReg(), MRI, TRI: *TRI); |
| 4518 | |
| 4519 | // vdata_in |
| 4520 | OpdsMapping[1] = getVGPROpMapping(Reg: MI.getOperand(i: 1).getReg(), MRI, TRI: *TRI); |
| 4521 | |
| 4522 | // rsrc |
| 4523 | OpdsMapping[2] = getSGPROpMapping(Reg: MI.getOperand(i: 2).getReg(), MRI, TRI: *TRI); |
| 4524 | |
| 4525 | // vindex |
| 4526 | OpdsMapping[3] = getVGPROpMapping(Reg: MI.getOperand(i: 3).getReg(), MRI, TRI: *TRI); |
| 4527 | |
| 4528 | // voffset |
| 4529 | OpdsMapping[4] = getVGPROpMapping(Reg: MI.getOperand(i: 4).getReg(), MRI, TRI: *TRI); |
| 4530 | |
| 4531 | // soffset |
| 4532 | OpdsMapping[5] = getSGPROpMapping(Reg: MI.getOperand(i: 5).getReg(), MRI, TRI: *TRI); |
| 4533 | |
| 4534 | // Any remaining operands are immediates and were correctly null |
| 4535 | // initialized. |
| 4536 | break; |
| 4537 | } |
| 4538 | case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_CMPSWAP: { |
| 4539 | // vdata_out |
| 4540 | OpdsMapping[0] = getVGPROpMapping(Reg: MI.getOperand(i: 0).getReg(), MRI, TRI: *TRI); |
| 4541 | |
| 4542 | // vdata_in |
| 4543 | OpdsMapping[1] = getVGPROpMapping(Reg: MI.getOperand(i: 1).getReg(), MRI, TRI: *TRI); |
| 4544 | |
| 4545 | // cmp |
| 4546 | OpdsMapping[2] = getVGPROpMapping(Reg: MI.getOperand(i: 2).getReg(), MRI, TRI: *TRI); |
| 4547 | |
| 4548 | // rsrc |
| 4549 | OpdsMapping[3] = getSGPROpMapping(Reg: MI.getOperand(i: 3).getReg(), MRI, TRI: *TRI); |
| 4550 | |
| 4551 | // vindex |
| 4552 | OpdsMapping[4] = getVGPROpMapping(Reg: MI.getOperand(i: 4).getReg(), MRI, TRI: *TRI); |
| 4553 | |
| 4554 | // voffset |
| 4555 | OpdsMapping[5] = getVGPROpMapping(Reg: MI.getOperand(i: 5).getReg(), MRI, TRI: *TRI); |
| 4556 | |
| 4557 | // soffset |
| 4558 | OpdsMapping[6] = getSGPROpMapping(Reg: MI.getOperand(i: 6).getReg(), MRI, TRI: *TRI); |
| 4559 | |
| 4560 | // Any remaining operands are immediates and were correctly null |
| 4561 | // initialized. |
| 4562 | break; |
| 4563 | } |
| 4564 | case AMDGPU::G_AMDGPU_S_BUFFER_LOAD: |
| 4565 | case AMDGPU::G_AMDGPU_S_BUFFER_LOAD_UBYTE: |
| 4566 | case AMDGPU::G_AMDGPU_S_BUFFER_LOAD_SBYTE: |
| 4567 | case AMDGPU::G_AMDGPU_S_BUFFER_LOAD_USHORT: |
| 4568 | case AMDGPU::G_AMDGPU_S_BUFFER_LOAD_SSHORT: { |
| 4569 | // Lie and claim everything is legal, even though some need to be |
| 4570 | // SGPRs. applyMapping will have to deal with it as a waterfall loop. |
| 4571 | OpdsMapping[1] = getSGPROpMapping(Reg: MI.getOperand(i: 1).getReg(), MRI, TRI: *TRI); |
| 4572 | OpdsMapping[2] = getSGPROpMapping(Reg: MI.getOperand(i: 2).getReg(), MRI, TRI: *TRI); |
| 4573 | |
| 4574 | // We need to convert this to a MUBUF if either the resource or offset is |
| 4575 | // VGPR. |
| 4576 | unsigned RSrcBank = OpdsMapping[1]->BreakDown[0].RegBank->getID(); |
| 4577 | unsigned OffsetBank = OpdsMapping[2]->BreakDown[0].RegBank->getID(); |
| 4578 | unsigned ResultBank = regBankUnion(RB0: RSrcBank, RB1: OffsetBank); |
| 4579 | |
| 4580 | unsigned Size0 = MRI.getType(Reg: MI.getOperand(i: 0).getReg()).getSizeInBits(); |
| 4581 | OpdsMapping[0] = AMDGPU::getValueMapping(BankID: ResultBank, Size: Size0); |
| 4582 | break; |
| 4583 | } |
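|      | // Illustrative note: if either the resource or the offset turns out to be |
|      | // vgpr, the result bank computed above becomes vgpr as well, and the later |
|      | // apply step is expected to rewrite the scalar buffer load into an |
|      | // equivalent VMEM (MUBUF-style) access rather than rejecting the mapping. |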
| 4584 | case AMDGPU::G_AMDGPU_S_BUFFER_PREFETCH: |
| 4585 | OpdsMapping[0] = getSGPROpMapping(Reg: MI.getOperand(i: 0).getReg(), MRI, TRI: *TRI); |
| 4586 | OpdsMapping[2] = getSGPROpMapping(Reg: MI.getOperand(i: 2).getReg(), MRI, TRI: *TRI); |
| 4587 | break; |
| 4588 | case AMDGPU::G_AMDGPU_SPONENTRY: { |
| 4589 | unsigned Size = MRI.getType(Reg: MI.getOperand(i: 0).getReg()).getSizeInBits(); |
| 4590 | OpdsMapping[0] = AMDGPU::getValueMapping(BankID: AMDGPU::SGPRRegBankID, Size); |
| 4591 | break; |
| 4592 | } |
| 4593 | case AMDGPU::G_INTRINSIC: |
| 4594 | case AMDGPU::G_INTRINSIC_CONVERGENT: { |
| 4595 | switch (cast<GIntrinsic>(Val: MI).getIntrinsicID()) { |
| 4596 | default: |
| 4597 | return getInvalidInstructionMapping(); |
| 4598 | case Intrinsic::amdgcn_div_fmas: |
| 4599 | case Intrinsic::amdgcn_div_fixup: |
| 4600 | case Intrinsic::amdgcn_trig_preop: |
| 4601 | case Intrinsic::amdgcn_sin: |
| 4602 | case Intrinsic::amdgcn_cos: |
| 4603 | case Intrinsic::amdgcn_log_clamp: |
| 4604 | case Intrinsic::amdgcn_rcp_legacy: |
| 4605 | case Intrinsic::amdgcn_rsq_legacy: |
| 4606 | case Intrinsic::amdgcn_rsq_clamp: |
| 4607 | case Intrinsic::amdgcn_tanh: |
| 4608 | case Intrinsic::amdgcn_fmul_legacy: |
| 4609 | case Intrinsic::amdgcn_fma_legacy: |
| 4610 | case Intrinsic::amdgcn_frexp_mant: |
| 4611 | case Intrinsic::amdgcn_frexp_exp: |
| 4612 | case Intrinsic::amdgcn_fract: |
| 4613 | case Intrinsic::amdgcn_cvt_pknorm_i16: |
| 4614 | case Intrinsic::amdgcn_cvt_pknorm_u16: |
| 4615 | case Intrinsic::amdgcn_cvt_pk_i16: |
| 4616 | case Intrinsic::amdgcn_cvt_pk_u16: |
| 4617 | case Intrinsic::amdgcn_cvt_sr_pk_f16_f32: |
| 4618 | case Intrinsic::amdgcn_cvt_sr_pk_bf16_f32: |
| 4619 | case Intrinsic::amdgcn_cvt_pk_f16_fp8: |
| 4620 | case Intrinsic::amdgcn_cvt_pk_f16_bf8: |
| 4621 | case Intrinsic::amdgcn_cvt_pk_fp8_f16: |
| 4622 | case Intrinsic::amdgcn_cvt_pk_bf8_f16: |
| 4623 | case Intrinsic::amdgcn_cvt_sr_fp8_f16: |
| 4624 | case Intrinsic::amdgcn_cvt_sr_bf8_f16: |
| 4625 | case Intrinsic::amdgcn_cvt_scale_pk8_f16_fp8: |
| 4626 | case Intrinsic::amdgcn_cvt_scale_pk8_bf16_fp8: |
| 4627 | case Intrinsic::amdgcn_cvt_scale_pk8_f16_bf8: |
| 4628 | case Intrinsic::amdgcn_cvt_scale_pk8_bf16_bf8: |
| 4629 | case Intrinsic::amdgcn_cvt_scale_pk8_f16_fp4: |
| 4630 | case Intrinsic::amdgcn_cvt_scale_pk8_bf16_fp4: |
| 4631 | case Intrinsic::amdgcn_cvt_scale_pk8_f32_fp8: |
| 4632 | case Intrinsic::amdgcn_cvt_scale_pk8_f32_bf8: |
| 4633 | case Intrinsic::amdgcn_cvt_scale_pk8_f32_fp4: |
| 4634 | case Intrinsic::amdgcn_cvt_scale_pk16_f16_fp6: |
| 4635 | case Intrinsic::amdgcn_cvt_scale_pk16_bf16_fp6: |
| 4636 | case Intrinsic::amdgcn_cvt_scale_pk16_f16_bf6: |
| 4637 | case Intrinsic::amdgcn_cvt_scale_pk16_bf16_bf6: |
| 4638 | case Intrinsic::amdgcn_cvt_scale_pk16_f32_fp6: |
| 4639 | case Intrinsic::amdgcn_cvt_scale_pk16_f32_bf6: |
| 4640 | case Intrinsic::amdgcn_cvt_scalef32_pk8_fp8_bf16: |
| 4641 | case Intrinsic::amdgcn_cvt_scalef32_pk8_bf8_bf16: |
| 4642 | case Intrinsic::amdgcn_cvt_scalef32_pk8_fp8_f16: |
| 4643 | case Intrinsic::amdgcn_cvt_scalef32_pk8_bf8_f16: |
| 4644 | case Intrinsic::amdgcn_cvt_scalef32_pk8_fp8_f32: |
| 4645 | case Intrinsic::amdgcn_cvt_scalef32_pk8_bf8_f32: |
| 4646 | case Intrinsic::amdgcn_cvt_scalef32_pk8_fp4_f32: |
| 4647 | case Intrinsic::amdgcn_cvt_scalef32_pk8_fp4_f16: |
| 4648 | case Intrinsic::amdgcn_cvt_scalef32_pk8_fp4_bf16: |
| 4649 | case Intrinsic::amdgcn_cvt_scalef32_pk16_fp6_f32: |
| 4650 | case Intrinsic::amdgcn_cvt_scalef32_pk16_bf6_f32: |
| 4651 | case Intrinsic::amdgcn_cvt_scalef32_pk16_fp6_f16: |
| 4652 | case Intrinsic::amdgcn_cvt_scalef32_pk16_bf6_f16: |
| 4653 | case Intrinsic::amdgcn_cvt_scalef32_pk16_fp6_bf16: |
| 4654 | case Intrinsic::amdgcn_cvt_scalef32_pk16_bf6_bf16: |
| 4655 | case Intrinsic::amdgcn_cvt_scalef32_sr_pk8_fp8_bf16: |
| 4656 | case Intrinsic::amdgcn_cvt_scalef32_sr_pk8_bf8_bf16: |
| 4657 | case Intrinsic::amdgcn_cvt_scalef32_sr_pk8_fp8_f16: |
| 4658 | case Intrinsic::amdgcn_cvt_scalef32_sr_pk8_bf8_f16: |
| 4659 | case Intrinsic::amdgcn_cvt_scalef32_sr_pk8_fp8_f32: |
| 4660 | case Intrinsic::amdgcn_cvt_scalef32_sr_pk8_bf8_f32: |
| 4661 | case Intrinsic::amdgcn_cvt_scalef32_sr_pk8_fp4_f32: |
| 4662 | case Intrinsic::amdgcn_cvt_scalef32_sr_pk8_fp4_f16: |
| 4663 | case Intrinsic::amdgcn_cvt_scalef32_sr_pk8_fp4_bf16: |
| 4664 | case Intrinsic::amdgcn_cvt_scalef32_sr_pk16_fp6_f32: |
| 4665 | case Intrinsic::amdgcn_cvt_scalef32_sr_pk16_bf6_f32: |
| 4666 | case Intrinsic::amdgcn_cvt_scalef32_sr_pk16_fp6_f16: |
| 4667 | case Intrinsic::amdgcn_cvt_scalef32_sr_pk16_bf6_f16: |
| 4668 | case Intrinsic::amdgcn_cvt_scalef32_sr_pk16_fp6_bf16: |
| 4669 | case Intrinsic::amdgcn_cvt_scalef32_sr_pk16_bf6_bf16: |
| 4670 | case Intrinsic::amdgcn_sat_pk4_i4_i8: |
| 4671 | case Intrinsic::amdgcn_sat_pk4_u4_u8: |
| 4672 | case Intrinsic::amdgcn_fmed3: |
| 4673 | case Intrinsic::amdgcn_cubeid: |
| 4674 | case Intrinsic::amdgcn_cubema: |
| 4675 | case Intrinsic::amdgcn_cubesc: |
| 4676 | case Intrinsic::amdgcn_cubetc: |
| 4677 | case Intrinsic::amdgcn_sffbh: |
| 4678 | case Intrinsic::amdgcn_fmad_ftz: |
| 4679 | case Intrinsic::amdgcn_mbcnt_lo: |
| 4680 | case Intrinsic::amdgcn_mbcnt_hi: |
| 4681 | case Intrinsic::amdgcn_mul_u24: |
| 4682 | case Intrinsic::amdgcn_mul_i24: |
| 4683 | case Intrinsic::amdgcn_mulhi_u24: |
| 4684 | case Intrinsic::amdgcn_mulhi_i24: |
| 4685 | case Intrinsic::amdgcn_lerp: |
| 4686 | case Intrinsic::amdgcn_sad_u8: |
| 4687 | case Intrinsic::amdgcn_msad_u8: |
| 4688 | case Intrinsic::amdgcn_sad_hi_u8: |
| 4689 | case Intrinsic::amdgcn_sad_u16: |
| 4690 | case Intrinsic::amdgcn_qsad_pk_u16_u8: |
| 4691 | case Intrinsic::amdgcn_mqsad_pk_u16_u8: |
| 4692 | case Intrinsic::amdgcn_mqsad_u32_u8: |
| 4693 | case Intrinsic::amdgcn_cvt_pk_u8_f32: |
| 4694 | case Intrinsic::amdgcn_alignbyte: |
| 4695 | case Intrinsic::amdgcn_perm: |
| 4696 | case Intrinsic::amdgcn_prng_b32: |
| 4697 | case Intrinsic::amdgcn_fdot2: |
| 4698 | case Intrinsic::amdgcn_sdot2: |
| 4699 | case Intrinsic::amdgcn_udot2: |
| 4700 | case Intrinsic::amdgcn_sdot4: |
| 4701 | case Intrinsic::amdgcn_udot4: |
| 4702 | case Intrinsic::amdgcn_sdot8: |
| 4703 | case Intrinsic::amdgcn_udot8: |
| 4704 | case Intrinsic::amdgcn_fdot2_bf16_bf16: |
| 4705 | case Intrinsic::amdgcn_fdot2_f16_f16: |
| 4706 | case Intrinsic::amdgcn_fdot2_f32_bf16: |
| 4707 | case Intrinsic::amdgcn_fdot2c_f32_bf16: |
| 4708 | case Intrinsic::amdgcn_sudot4: |
| 4709 | case Intrinsic::amdgcn_sudot8: |
| 4710 | case Intrinsic::amdgcn_dot4_f32_fp8_bf8: |
| 4711 | case Intrinsic::amdgcn_dot4_f32_bf8_fp8: |
| 4712 | case Intrinsic::amdgcn_dot4_f32_fp8_fp8: |
| 4713 | case Intrinsic::amdgcn_dot4_f32_bf8_bf8: |
| 4714 | case Intrinsic::amdgcn_cvt_f32_fp8: |
| 4715 | case Intrinsic::amdgcn_cvt_f32_fp8_e5m3: |
| 4716 | case Intrinsic::amdgcn_cvt_f32_bf8: |
| 4717 | case Intrinsic::amdgcn_cvt_off_f32_i4: |
| 4718 | case Intrinsic::amdgcn_cvt_pk_f32_fp8: |
| 4719 | case Intrinsic::amdgcn_cvt_pk_f32_bf8: |
| 4720 | case Intrinsic::amdgcn_cvt_pk_fp8_f32: |
| 4721 | case Intrinsic::amdgcn_cvt_pk_fp8_f32_e5m3: |
| 4722 | case Intrinsic::amdgcn_cvt_pk_bf8_f32: |
| 4723 | case Intrinsic::amdgcn_cvt_sr_fp8_f32: |
| 4724 | case Intrinsic::amdgcn_cvt_sr_fp8_f32_e5m3: |
| 4725 | case Intrinsic::amdgcn_cvt_sr_bf8_f32: |
| 4726 | case Intrinsic::amdgcn_cvt_sr_bf16_f32: |
| 4727 | case Intrinsic::amdgcn_cvt_sr_f16_f32: |
| 4728 | case Intrinsic::amdgcn_cvt_f16_fp8: |
| 4729 | case Intrinsic::amdgcn_cvt_f16_bf8: |
| 4730 | case Intrinsic::amdgcn_cvt_scalef32_pk32_fp6_f16: |
| 4731 | case Intrinsic::amdgcn_cvt_scalef32_pk32_bf6_f16: |
| 4732 | case Intrinsic::amdgcn_cvt_scalef32_pk32_fp6_bf16: |
| 4733 | case Intrinsic::amdgcn_cvt_scalef32_pk32_bf6_bf16: |
| 4734 | case Intrinsic::amdgcn_cvt_scalef32_f16_fp8: |
| 4735 | case Intrinsic::amdgcn_cvt_scalef32_f16_bf8: |
| 4736 | case Intrinsic::amdgcn_cvt_scalef32_f32_fp8: |
| 4737 | case Intrinsic::amdgcn_cvt_scalef32_f32_bf8: |
| 4738 | case Intrinsic::amdgcn_cvt_scalef32_pk_fp8_f32: |
| 4739 | case Intrinsic::amdgcn_cvt_scalef32_pk_bf8_f32: |
| 4740 | case Intrinsic::amdgcn_cvt_scalef32_pk_f32_fp8: |
| 4741 | case Intrinsic::amdgcn_cvt_scalef32_pk_f32_bf8: |
| 4742 | case Intrinsic::amdgcn_cvt_scalef32_pk_fp8_f16: |
| 4743 | case Intrinsic::amdgcn_cvt_scalef32_pk_fp8_bf16: |
| 4744 | case Intrinsic::amdgcn_cvt_scalef32_pk_bf8_f16: |
| 4745 | case Intrinsic::amdgcn_cvt_scalef32_pk_bf8_bf16: |
| 4746 | case Intrinsic::amdgcn_cvt_scalef32_pk_f32_fp4: |
| 4747 | case Intrinsic::amdgcn_cvt_scalef32_pk_fp4_f32: |
| 4748 | case Intrinsic::amdgcn_cvt_scalef32_pk_f16_fp4: |
| 4749 | case Intrinsic::amdgcn_cvt_scalef32_pk_bf16_fp4: |
| 4750 | case Intrinsic::amdgcn_cvt_scalef32_pk32_f32_fp6: |
| 4751 | case Intrinsic::amdgcn_cvt_scalef32_pk32_f32_bf6: |
| 4752 | case Intrinsic::amdgcn_cvt_scalef32_pk32_f16_bf6: |
| 4753 | case Intrinsic::amdgcn_cvt_scalef32_pk32_bf16_bf6: |
| 4754 | case Intrinsic::amdgcn_cvt_scalef32_pk32_f16_fp6: |
| 4755 | case Intrinsic::amdgcn_cvt_scalef32_pk32_bf16_fp6: |
| 4756 | case Intrinsic::amdgcn_cvt_scalef32_pk_f16_bf8: |
| 4757 | case Intrinsic::amdgcn_cvt_scalef32_pk_bf16_bf8: |
| 4758 | case Intrinsic::amdgcn_cvt_scalef32_pk_f16_fp8: |
| 4759 | case Intrinsic::amdgcn_cvt_scalef32_pk_bf16_fp8: |
| 4760 | case Intrinsic::amdgcn_cvt_scalef32_pk_fp4_f16: |
| 4761 | case Intrinsic::amdgcn_cvt_scalef32_pk_fp4_bf16: |
| 4762 | case Intrinsic::amdgcn_cvt_scalef32_sr_pk_fp4_f16: |
| 4763 | case Intrinsic::amdgcn_cvt_scalef32_sr_pk_fp4_bf16: |
| 4764 | case Intrinsic::amdgcn_cvt_scalef32_sr_pk_fp4_f32: |
| 4765 | case Intrinsic::amdgcn_cvt_scalef32_sr_pk32_bf6_bf16: |
| 4766 | case Intrinsic::amdgcn_cvt_scalef32_sr_pk32_bf6_f16: |
| 4767 | case Intrinsic::amdgcn_cvt_scalef32_sr_pk32_bf6_f32: |
| 4768 | case Intrinsic::amdgcn_cvt_scalef32_sr_pk32_fp6_bf16: |
| 4769 | case Intrinsic::amdgcn_cvt_scalef32_sr_pk32_fp6_f16: |
| 4770 | case Intrinsic::amdgcn_cvt_scalef32_sr_pk32_fp6_f32: |
| 4771 | case Intrinsic::amdgcn_cvt_scalef32_sr_bf8_bf16: |
| 4772 | case Intrinsic::amdgcn_cvt_scalef32_sr_bf8_f16: |
| 4773 | case Intrinsic::amdgcn_cvt_scalef32_sr_bf8_f32: |
| 4774 | case Intrinsic::amdgcn_cvt_scalef32_sr_fp8_bf16: |
| 4775 | case Intrinsic::amdgcn_cvt_scalef32_sr_fp8_f16: |
| 4776 | case Intrinsic::amdgcn_cvt_scalef32_sr_fp8_f32: |
| 4777 | case Intrinsic::amdgcn_ashr_pk_i8_i32: |
| 4778 | case Intrinsic::amdgcn_ashr_pk_u8_i32: |
| 4779 | case Intrinsic::amdgcn_cvt_scalef32_2xpk16_fp6_f32: |
| 4780 | case Intrinsic::amdgcn_cvt_scalef32_2xpk16_bf6_f32: |
| 4781 | case Intrinsic::amdgcn_wmma_bf16_16x16x16_bf16: |
| 4782 | case Intrinsic::amdgcn_wmma_f16_16x16x16_f16: |
| 4783 | case Intrinsic::amdgcn_wmma_bf16_16x16x16_bf16_tied: |
| 4784 | case Intrinsic::amdgcn_wmma_f16_16x16x16_f16_tied: |
| 4785 | case Intrinsic::amdgcn_wmma_f32_16x16x16_bf16: |
| 4786 | case Intrinsic::amdgcn_wmma_f32_16x16x16_f16: |
| 4787 | case Intrinsic::amdgcn_wmma_i32_16x16x16_iu4: |
| 4788 | case Intrinsic::amdgcn_wmma_i32_16x16x16_iu8: |
| 4789 | case Intrinsic::amdgcn_wmma_f32_16x16x16_fp8_fp8: |
| 4790 | case Intrinsic::amdgcn_wmma_f32_16x16x16_fp8_bf8: |
| 4791 | case Intrinsic::amdgcn_wmma_f32_16x16x16_bf8_fp8: |
| 4792 | case Intrinsic::amdgcn_wmma_f32_16x16x16_bf8_bf8: |
| 4793 | case Intrinsic::amdgcn_wmma_i32_16x16x32_iu4: |
| 4794 | case Intrinsic::amdgcn_swmmac_f32_16x16x32_f16: |
| 4795 | case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf16: |
| 4796 | case Intrinsic::amdgcn_swmmac_f16_16x16x32_f16: |
| 4797 | case Intrinsic::amdgcn_swmmac_bf16_16x16x32_bf16: |
| 4798 | case Intrinsic::amdgcn_swmmac_i32_16x16x32_iu8: |
| 4799 | case Intrinsic::amdgcn_swmmac_i32_16x16x32_iu4: |
| 4800 | case Intrinsic::amdgcn_swmmac_i32_16x16x64_iu4: |
| 4801 | case Intrinsic::amdgcn_swmmac_f32_16x16x32_fp8_fp8: |
| 4802 | case Intrinsic::amdgcn_swmmac_f32_16x16x32_fp8_bf8: |
| 4803 | case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_fp8: |
| 4804 | case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_bf8: |
| 4805 | case Intrinsic::amdgcn_wmma_f32_16x16x4_f32: |
| 4806 | case Intrinsic::amdgcn_wmma_f32_16x16x32_bf16: |
| 4807 | case Intrinsic::amdgcn_wmma_f32_16x16x32_f16: |
| 4808 | case Intrinsic::amdgcn_wmma_f16_16x16x32_f16: |
| 4809 | case Intrinsic::amdgcn_wmma_bf16_16x16x32_bf16: |
| 4810 | case Intrinsic::amdgcn_wmma_bf16f32_16x16x32_bf16: |
| 4811 | case Intrinsic::amdgcn_wmma_f32_16x16x64_fp8_fp8: |
| 4812 | case Intrinsic::amdgcn_wmma_f32_16x16x64_fp8_bf8: |
| 4813 | case Intrinsic::amdgcn_wmma_f32_16x16x64_bf8_fp8: |
| 4814 | case Intrinsic::amdgcn_wmma_f32_16x16x64_bf8_bf8: |
| 4815 | case Intrinsic::amdgcn_wmma_f16_16x16x64_fp8_fp8: |
| 4816 | case Intrinsic::amdgcn_wmma_f16_16x16x64_fp8_bf8: |
| 4817 | case Intrinsic::amdgcn_wmma_f16_16x16x64_bf8_fp8: |
| 4818 | case Intrinsic::amdgcn_wmma_f16_16x16x64_bf8_bf8: |
| 4819 | case Intrinsic::amdgcn_wmma_f16_16x16x128_fp8_fp8: |
| 4820 | case Intrinsic::amdgcn_wmma_f16_16x16x128_fp8_bf8: |
| 4821 | case Intrinsic::amdgcn_wmma_f16_16x16x128_bf8_fp8: |
| 4822 | case Intrinsic::amdgcn_wmma_f16_16x16x128_bf8_bf8: |
| 4823 | case Intrinsic::amdgcn_wmma_f32_16x16x128_fp8_fp8: |
| 4824 | case Intrinsic::amdgcn_wmma_f32_16x16x128_fp8_bf8: |
| 4825 | case Intrinsic::amdgcn_wmma_f32_16x16x128_bf8_fp8: |
| 4826 | case Intrinsic::amdgcn_wmma_f32_16x16x128_bf8_bf8: |
| 4827 | case Intrinsic::amdgcn_wmma_i32_16x16x64_iu8: |
| 4828 | case Intrinsic::amdgcn_wmma_f32_16x16x128_f8f6f4: |
| 4829 | case Intrinsic::amdgcn_wmma_scale_f32_16x16x128_f8f6f4: |
| 4830 | case Intrinsic::amdgcn_wmma_scale16_f32_16x16x128_f8f6f4: |
| 4831 | case Intrinsic::amdgcn_wmma_f32_32x16x128_f4: |
| 4832 | case Intrinsic::amdgcn_wmma_scale_f32_32x16x128_f4: |
| 4833 | case Intrinsic::amdgcn_wmma_scale16_f32_32x16x128_f4: |
| 4834 | case Intrinsic::amdgcn_swmmac_f16_16x16x64_f16: |
| 4835 | case Intrinsic::amdgcn_swmmac_bf16_16x16x64_bf16: |
| 4836 | case Intrinsic::amdgcn_swmmac_f32_16x16x64_bf16: |
| 4837 | case Intrinsic::amdgcn_swmmac_bf16f32_16x16x64_bf16: |
| 4838 | case Intrinsic::amdgcn_swmmac_f32_16x16x64_f16: |
| 4839 | case Intrinsic::amdgcn_swmmac_f32_16x16x128_fp8_fp8: |
| 4840 | case Intrinsic::amdgcn_swmmac_f32_16x16x128_fp8_bf8: |
| 4841 | case Intrinsic::amdgcn_swmmac_f32_16x16x128_bf8_fp8: |
| 4842 | case Intrinsic::amdgcn_swmmac_f32_16x16x128_bf8_bf8: |
| 4843 | case Intrinsic::amdgcn_swmmac_f16_16x16x128_fp8_fp8: |
| 4844 | case Intrinsic::amdgcn_swmmac_f16_16x16x128_fp8_bf8: |
| 4845 | case Intrinsic::amdgcn_swmmac_f16_16x16x128_bf8_fp8: |
| 4846 | case Intrinsic::amdgcn_swmmac_f16_16x16x128_bf8_bf8: |
| 4847 | case Intrinsic::amdgcn_swmmac_i32_16x16x128_iu8: |
| 4848 | case Intrinsic::amdgcn_perm_pk16_b4_u4: |
| 4849 | case Intrinsic::amdgcn_perm_pk16_b6_u4: |
| 4850 | case Intrinsic::amdgcn_perm_pk16_b8_u4: |
| 4851 | case Intrinsic::amdgcn_add_max_i32: |
| 4852 | case Intrinsic::amdgcn_add_max_u32: |
| 4853 | case Intrinsic::amdgcn_add_min_i32: |
| 4854 | case Intrinsic::amdgcn_add_min_u32: |
| 4855 | case Intrinsic::amdgcn_pk_add_max_i16: |
| 4856 | case Intrinsic::amdgcn_pk_add_max_u16: |
| 4857 | case Intrinsic::amdgcn_pk_add_min_i16: |
| 4858 | case Intrinsic::amdgcn_pk_add_min_u16: |
| 4859 | return getDefaultMappingVOP(MI); |
| 4860 | case Intrinsic::amdgcn_log: |
| 4861 | case Intrinsic::amdgcn_exp2: |
| 4862 | case Intrinsic::amdgcn_rcp: |
| 4863 | case Intrinsic::amdgcn_rsq: |
| 4864 | case Intrinsic::amdgcn_sqrt: { |
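|      | // Subtargets with pseudo-scalar transcendental instructions can produce a |
|      | // uniform (SGPR) result for 16- and 32-bit types, so keep the scalar mapping |
|      | // when every input is already an SGPR; otherwise use the default VALU mapping. |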
| 4865 | unsigned Size = MRI.getType(Reg: MI.getOperand(i: 0).getReg()).getSizeInBits(); |
| 4866 | if (Subtarget.hasPseudoScalarTrans() && (Size == 16 || Size == 32) && |
| 4867 | isSALUMapping(MI)) |
| 4868 | return getDefaultMappingSOP(MI); |
| 4869 | return getDefaultMappingVOP(MI); |
| 4870 | } |
| 4871 | case Intrinsic::amdgcn_sbfe: |
| 4872 | case Intrinsic::amdgcn_ubfe: |
| 4873 | if (isSALUMapping(MI)) |
| 4874 | return getDefaultMappingSOP(MI); |
| 4875 | return getDefaultMappingVOP(MI); |
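|      | // Lane-crossing, DPP, and whole-wave-mode operations only exist as VALU |
|      | // instructions, so all register operands must be VGPRs. |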
| 4876 | case Intrinsic::amdgcn_ds_swizzle: |
| 4877 | case Intrinsic::amdgcn_ds_permute: |
| 4878 | case Intrinsic::amdgcn_ds_bpermute: |
| 4879 | case Intrinsic::amdgcn_update_dpp: |
| 4880 | case Intrinsic::amdgcn_mov_dpp8: |
| 4881 | case Intrinsic::amdgcn_mov_dpp: |
| 4882 | case Intrinsic::amdgcn_strict_wwm: |
| 4883 | case Intrinsic::amdgcn_wwm: |
| 4884 | case Intrinsic::amdgcn_strict_wqm: |
| 4885 | case Intrinsic::amdgcn_wqm: |
| 4886 | case Intrinsic::amdgcn_softwqm: |
| 4887 | case Intrinsic::amdgcn_set_inactive: |
| 4888 | case Intrinsic::amdgcn_set_inactive_chain_arg: |
| 4889 | case Intrinsic::amdgcn_permlane64: |
| 4890 | case Intrinsic::amdgcn_ds_bpermute_fi_b32: |
| 4891 | return getDefaultMappingAllVGPR(MI); |
| 4892 | case Intrinsic::amdgcn_cvt_pkrtz: |
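|      | // With SALU float instructions available, a fully uniform cvt_pkrtz can stay |
|      | // scalar; otherwise it is a VALU operation. |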
| 4893 | if (Subtarget.hasSALUFloatInsts() && isSALUMapping(MI)) |
| 4894 | return getDefaultMappingSOP(MI); |
| 4895 | return getDefaultMappingVOP(MI); |
| 4896 | case Intrinsic::amdgcn_kernarg_segment_ptr: |
| 4897 | case Intrinsic::amdgcn_s_getpc: |
| 4898 | case Intrinsic::amdgcn_groupstaticsize: |
| 4899 | case Intrinsic::amdgcn_reloc_constant: |
| 4900 | case Intrinsic::returnaddress: { |
| 4901 | unsigned Size = MRI.getType(Reg: MI.getOperand(i: 0).getReg()).getSizeInBits(); |
| 4902 | OpdsMapping[0] = AMDGPU::getValueMapping(BankID: AMDGPU::SGPRRegBankID, Size); |
| 4903 | break; |
| 4904 | } |
| 4905 | case Intrinsic::amdgcn_wqm_vote: { |
| 4906 | unsigned Size = MRI.getType(Reg: MI.getOperand(i: 0).getReg()).getSizeInBits(); |
| 4907 | OpdsMapping[0] = OpdsMapping[2] |
| 4908 | = AMDGPU::getValueMapping(BankID: AMDGPU::VCCRegBankID, Size); |
| 4909 | break; |
| 4910 | } |
| 4911 | case Intrinsic::amdgcn_ps_live: { |
| 4912 | OpdsMapping[0] = AMDGPU::getValueMapping(BankID: AMDGPU::VCCRegBankID, Size: 1); |
| 4913 | break; |
| 4914 | } |
| 4915 | case Intrinsic::amdgcn_div_scale: { |
| 4916 | unsigned Dst0Size = MRI.getType(Reg: MI.getOperand(i: 0).getReg()).getSizeInBits(); |
| 4917 | unsigned Dst1Size = MRI.getType(Reg: MI.getOperand(i: 1).getReg()).getSizeInBits(); |
| 4918 | OpdsMapping[0] = AMDGPU::getValueMapping(BankID: AMDGPU::VGPRRegBankID, Size: Dst0Size); |
| 4919 | OpdsMapping[1] = AMDGPU::getValueMapping(BankID: AMDGPU::VCCRegBankID, Size: Dst1Size); |
| 4920 | |
| 4921 | unsigned SrcSize = MRI.getType(Reg: MI.getOperand(i: 3).getReg()).getSizeInBits(); |
| 4922 | OpdsMapping[3] = AMDGPU::getValueMapping(BankID: AMDGPU::VGPRRegBankID, Size: SrcSize); |
| 4923 | OpdsMapping[4] = AMDGPU::getValueMapping(BankID: AMDGPU::VGPRRegBankID, Size: SrcSize); |
| 4924 | break; |
| 4925 | } |
| 4926 | case Intrinsic::amdgcn_class: { |
| 4927 | Register Src0Reg = MI.getOperand(i: 2).getReg(); |
| 4928 | Register Src1Reg = MI.getOperand(i: 3).getReg(); |
| 4929 | unsigned Src0Size = MRI.getType(Reg: Src0Reg).getSizeInBits(); |
| 4930 | unsigned Src1Size = MRI.getType(Reg: Src1Reg).getSizeInBits(); |
| 4931 | unsigned DstSize = MRI.getType(Reg: MI.getOperand(i: 0).getReg()).getSizeInBits(); |
| 4932 | OpdsMapping[0] = AMDGPU::getValueMapping(BankID: AMDGPU::VCCRegBankID, Size: DstSize); |
| 4933 | OpdsMapping[2] = AMDGPU::getValueMapping(BankID: AMDGPU::VGPRRegBankID, Size: Src0Size); |
| 4934 | OpdsMapping[3] = AMDGPU::getValueMapping(BankID: AMDGPU::VGPRRegBankID, Size: Src1Size); |
| 4935 | break; |
| 4936 | } |
| 4937 | case Intrinsic::amdgcn_icmp: |
| 4938 | case Intrinsic::amdgcn_fcmp: { |
| 4939 | unsigned DstSize = MRI.getType(Reg: MI.getOperand(i: 0).getReg()).getSizeInBits(); |
| 4940 | // This is not VCCRegBank because the result is not used in a boolean context. |
| 4941 | OpdsMapping[0] = AMDGPU::getValueMapping(BankID: AMDGPU::SGPRRegBankID, Size: DstSize); |
| 4942 | unsigned OpSize = MRI.getType(Reg: MI.getOperand(i: 2).getReg()).getSizeInBits(); |
| 4943 | OpdsMapping[2] = AMDGPU::getValueMapping(BankID: AMDGPU::VGPRRegBankID, Size: OpSize); |
| 4944 | OpdsMapping[3] = AMDGPU::getValueMapping(BankID: AMDGPU::VGPRRegBankID, Size: OpSize); |
| 4945 | break; |
| 4946 | } |
| 4947 | case Intrinsic::amdgcn_readlane: { |
| 4948 | // This must be an SGPR, but accept a VGPR. |
| 4949 | Register IdxReg = MI.getOperand(i: 3).getReg(); |
| 4950 | unsigned IdxSize = MRI.getType(Reg: IdxReg).getSizeInBits(); |
| 4951 | unsigned IdxBank = getRegBankID(Reg: IdxReg, MRI, Default: AMDGPU::SGPRRegBankID); |
| 4952 | OpdsMapping[3] = AMDGPU::getValueMapping(BankID: IdxBank, Size: IdxSize); |
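|      | // The destination and source value operands are mapped the same way as for |
|      | // readfirstlane below. |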
| 4953 | [[fallthrough]]; |
| 4954 | } |
| 4955 | case Intrinsic::amdgcn_readfirstlane: { |
| 4956 | unsigned DstSize = MRI.getType(Reg: MI.getOperand(i: 0).getReg()).getSizeInBits(); |
| 4957 | unsigned SrcSize = MRI.getType(Reg: MI.getOperand(i: 2).getReg()).getSizeInBits(); |
| 4958 | OpdsMapping[0] = AMDGPU::getValueMapping(BankID: AMDGPU::SGPRRegBankID, Size: DstSize); |
| 4959 | OpdsMapping[2] = AMDGPU::getValueMapping(BankID: AMDGPU::VGPRRegBankID, Size: SrcSize); |
| 4960 | break; |
| 4961 | } |
| 4962 | case Intrinsic::amdgcn_writelane: { |
| 4963 | unsigned DstSize = MRI.getType(Reg: MI.getOperand(i: 0).getReg()).getSizeInBits(); |
| 4964 | Register SrcReg = MI.getOperand(i: 2).getReg(); |
| 4965 | unsigned SrcSize = MRI.getType(Reg: SrcReg).getSizeInBits(); |
| 4966 | unsigned SrcBank = getRegBankID(Reg: SrcReg, MRI, Default: AMDGPU::SGPRRegBankID); |
| 4967 | Register IdxReg = MI.getOperand(i: 3).getReg(); |
| 4968 | unsigned IdxSize = MRI.getType(Reg: IdxReg).getSizeInBits(); |
| 4969 | unsigned IdxBank = getRegBankID(Reg: IdxReg, MRI, Default: AMDGPU::SGPRRegBankID); |
| 4970 | OpdsMapping[0] = AMDGPU::getValueMapping(BankID: AMDGPU::VGPRRegBankID, Size: DstSize); |
| 4971 | |
| 4972 | // These two operands must be SGPRs, but accept VGPRs. A readfirstlane will be |
| 4973 | // inserted to legalize them. |
| 4974 | OpdsMapping[2] = AMDGPU::getValueMapping(BankID: SrcBank, Size: SrcSize); |
| 4975 | OpdsMapping[3] = AMDGPU::getValueMapping(BankID: IdxBank, Size: IdxSize); |
| 4976 | OpdsMapping[4] = AMDGPU::getValueMapping(BankID: AMDGPU::VGPRRegBankID, Size: SrcSize); |
| 4977 | break; |
| 4978 | } |
| 4979 | case Intrinsic::amdgcn_if_break: { |
| 4980 | unsigned Size = getSizeInBits(Reg: MI.getOperand(i: 0).getReg(), MRI, TRI: *TRI); |
| 4981 | OpdsMapping[0] = AMDGPU::getValueMapping(BankID: AMDGPU::SGPRRegBankID, Size); |
| 4982 | OpdsMapping[2] = AMDGPU::getValueMapping(BankID: AMDGPU::VCCRegBankID, Size: 1); |
| 4983 | OpdsMapping[3] = AMDGPU::getValueMapping(BankID: AMDGPU::SGPRRegBankID, Size); |
| 4984 | break; |
| 4985 | } |
| 4986 | case Intrinsic::amdgcn_permlane16: |
| 4987 | case Intrinsic::amdgcn_permlanex16: { |
| 4988 | unsigned Size = getSizeInBits(Reg: MI.getOperand(i: 0).getReg(), MRI, TRI: *TRI); |
| 4989 | OpdsMapping[0] = AMDGPU::getValueMapping(BankID: AMDGPU::VGPRRegBankID, Size); |
| 4990 | OpdsMapping[2] = AMDGPU::getValueMapping(BankID: AMDGPU::VGPRRegBankID, Size); |
| 4991 | OpdsMapping[3] = AMDGPU::getValueMapping(BankID: AMDGPU::VGPRRegBankID, Size); |
| 4992 | OpdsMapping[4] = getSGPROpMapping(Reg: MI.getOperand(i: 3).getReg(), MRI, TRI: *TRI); |
| 4993 | OpdsMapping[5] = getSGPROpMapping(Reg: MI.getOperand(i: 4).getReg(), MRI, TRI: *TRI); |
| 4994 | break; |
| 4995 | } |
| 4996 | case Intrinsic::amdgcn_permlane_bcast: |
| 4997 | case Intrinsic::amdgcn_permlane_up: |
| 4998 | case Intrinsic::amdgcn_permlane_down: |
| 4999 | case Intrinsic::amdgcn_permlane_xor: { |
| 5000 | unsigned Size = getSizeInBits(Reg: MI.getOperand(i: 0).getReg(), MRI, TRI: *TRI); |
| 5001 | OpdsMapping[0] = AMDGPU::getValueMapping(BankID: AMDGPU::VGPRRegBankID, Size); |
| 5002 | OpdsMapping[2] = AMDGPU::getValueMapping(BankID: AMDGPU::VGPRRegBankID, Size); |
| 5003 | OpdsMapping[3] = getSGPROpMapping(Reg: MI.getOperand(i: 2).getReg(), MRI, TRI: *TRI); |
| 5004 | OpdsMapping[4] = getSGPROpMapping(Reg: MI.getOperand(i: 3).getReg(), MRI, TRI: *TRI); |
| 5005 | break; |
| 5006 | } |
| 5007 | case Intrinsic::amdgcn_permlane_idx_gen: { |
| 5008 | unsigned Size = getSizeInBits(Reg: MI.getOperand(i: 0).getReg(), MRI, TRI: *TRI); |
| 5009 | OpdsMapping[0] = AMDGPU::getValueMapping(BankID: AMDGPU::VGPRRegBankID, Size); |
| 5010 | OpdsMapping[2] = AMDGPU::getValueMapping(BankID: AMDGPU::VGPRRegBankID, Size); |
| 5011 | OpdsMapping[3] = getSGPROpMapping(Reg: MI.getOperand(i: 2).getReg(), MRI, TRI: *TRI); |
| 5012 | break; |
| 5013 | } |
| 5014 | case Intrinsic::amdgcn_permlane16_var: |
| 5015 | case Intrinsic::amdgcn_permlanex16_var: { |
| 5016 | unsigned Size = getSizeInBits(Reg: MI.getOperand(i: 0).getReg(), MRI, TRI: *TRI); |
| 5017 | OpdsMapping[0] = AMDGPU::getValueMapping(BankID: AMDGPU::VGPRRegBankID, Size); |
| 5018 | OpdsMapping[2] = AMDGPU::getValueMapping(BankID: AMDGPU::VGPRRegBankID, Size); |
| 5019 | OpdsMapping[3] = AMDGPU::getValueMapping(BankID: AMDGPU::VGPRRegBankID, Size); |
| 5020 | OpdsMapping[4] = AMDGPU::getValueMapping(BankID: AMDGPU::VGPRRegBankID, Size); |
| 5021 | break; |
| 5022 | } |
| 5023 | case Intrinsic::amdgcn_mfma_f32_4x4x1f32: |
| 5024 | case Intrinsic::amdgcn_mfma_f32_4x4x4f16: |
| 5025 | case Intrinsic::amdgcn_mfma_i32_4x4x4i8: |
| 5026 | case Intrinsic::amdgcn_mfma_f32_4x4x2bf16: |
| 5027 | case Intrinsic::amdgcn_mfma_f32_16x16x1f32: |
| 5028 | case Intrinsic::amdgcn_mfma_f32_16x16x4f32: |
| 5029 | case Intrinsic::amdgcn_mfma_f32_16x16x4f16: |
| 5030 | case Intrinsic::amdgcn_mfma_f32_16x16x16f16: |
| 5031 | case Intrinsic::amdgcn_mfma_i32_16x16x4i8: |
| 5032 | case Intrinsic::amdgcn_mfma_i32_16x16x16i8: |
| 5033 | case Intrinsic::amdgcn_mfma_f32_16x16x2bf16: |
| 5034 | case Intrinsic::amdgcn_mfma_f32_16x16x8bf16: |
| 5035 | case Intrinsic::amdgcn_mfma_f32_32x32x1f32: |
| 5036 | case Intrinsic::amdgcn_mfma_f32_32x32x2f32: |
| 5037 | case Intrinsic::amdgcn_mfma_f32_32x32x4f16: |
| 5038 | case Intrinsic::amdgcn_mfma_f32_32x32x8f16: |
| 5039 | case Intrinsic::amdgcn_mfma_i32_32x32x4i8: |
| 5040 | case Intrinsic::amdgcn_mfma_i32_32x32x8i8: |
| 5041 | case Intrinsic::amdgcn_mfma_f32_32x32x2bf16: |
| 5042 | case Intrinsic::amdgcn_mfma_f32_32x32x4bf16: |
| 5043 | case Intrinsic::amdgcn_mfma_f32_32x32x4bf16_1k: |
| 5044 | case Intrinsic::amdgcn_mfma_f32_16x16x4bf16_1k: |
| 5045 | case Intrinsic::amdgcn_mfma_f32_4x4x4bf16_1k: |
| 5046 | case Intrinsic::amdgcn_mfma_f32_32x32x8bf16_1k: |
| 5047 | case Intrinsic::amdgcn_mfma_f32_16x16x16bf16_1k: |
| 5048 | case Intrinsic::amdgcn_mfma_f64_16x16x4f64: |
| 5049 | case Intrinsic::amdgcn_mfma_f64_4x4x4f64: |
| 5050 | case Intrinsic::amdgcn_mfma_i32_16x16x32_i8: |
| 5051 | case Intrinsic::amdgcn_mfma_i32_32x32x16_i8: |
| 5052 | case Intrinsic::amdgcn_mfma_f32_16x16x8_xf32: |
| 5053 | case Intrinsic::amdgcn_mfma_f32_32x32x4_xf32: |
| 5054 | case Intrinsic::amdgcn_mfma_f32_16x16x32_bf8_bf8: |
| 5055 | case Intrinsic::amdgcn_mfma_f32_16x16x32_bf8_fp8: |
| 5056 | case Intrinsic::amdgcn_mfma_f32_16x16x32_fp8_bf8: |
| 5057 | case Intrinsic::amdgcn_mfma_f32_16x16x32_fp8_fp8: |
| 5058 | case Intrinsic::amdgcn_mfma_f32_32x32x16_bf8_bf8: |
| 5059 | case Intrinsic::amdgcn_mfma_f32_32x32x16_bf8_fp8: |
| 5060 | case Intrinsic::amdgcn_mfma_f32_32x32x16_fp8_bf8: |
| 5061 | case Intrinsic::amdgcn_mfma_f32_32x32x16_fp8_fp8: |
| 5062 | case Intrinsic::amdgcn_mfma_f32_16x16x32_f16: |
| 5063 | case Intrinsic::amdgcn_mfma_f32_32x32x16_f16: |
| 5064 | case Intrinsic::amdgcn_mfma_i32_16x16x64_i8: |
| 5065 | case Intrinsic::amdgcn_mfma_i32_32x32x32_i8: |
| 5066 | case Intrinsic::amdgcn_mfma_f32_16x16x32_bf16: { |
| 5067 | unsigned DstSize = MRI.getType(Reg: MI.getOperand(i: 0).getReg()).getSizeInBits(); |
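|      | // Minimum number of 32-bit registers needed to hold the accumulator result. |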
| 5068 | unsigned MinNumRegsRequired = DstSize / 32; |
| 5069 | |
| 5070 | // Default for MAI intrinsics. |
| 5071 | // srcC can also be an immediate which can be folded later. |
| 5072 | // FIXME: Should we eventually add an alternative mapping with AGPR src |
| 5073 | // for srcA/srcB? |
| 5074 | // |
| 5075 | // vdst, srcA, srcB, srcC |
| 5076 | const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>(); |
| 5077 | |
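|      | // gfx908 only has MFMA forms that take AGPRs for vdst/srcC. From gfx90a |
|      | // onwards a VGPR form also exists, and SIMachineFunctionInfo decides which |
|      | // form to prefer for this result size. |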
| 5078 | bool UseAGPRForm = !Subtarget.hasGFX90AInsts() || |
| 5079 | Info->selectAGPRFormMFMA(NumRegs: MinNumRegsRequired); |
| 5080 | |
| 5081 | OpdsMapping[0] = |
| 5082 | UseAGPRForm ? getAGPROpMapping(Reg: MI.getOperand(i: 0).getReg(), MRI, TRI: *TRI) |
| 5083 | : getVGPROpMapping(Reg: MI.getOperand(i: 0).getReg(), MRI, TRI: *TRI); |
| 5084 | OpdsMapping[2] = getVGPROpMapping(Reg: MI.getOperand(i: 2).getReg(), MRI, TRI: *TRI); |
| 5085 | OpdsMapping[3] = getVGPROpMapping(Reg: MI.getOperand(i: 3).getReg(), MRI, TRI: *TRI); |
| 5086 | OpdsMapping[4] = |
| 5087 | UseAGPRForm ? getAGPROpMapping(Reg: MI.getOperand(i: 4).getReg(), MRI, TRI: *TRI) |
| 5088 | : getVGPROpMapping(Reg: MI.getOperand(i: 4).getReg(), MRI, TRI: *TRI); |
| 5089 | break; |
| 5090 | } |
| 5091 | case Intrinsic::amdgcn_mfma_scale_f32_16x16x128_f8f6f4: |
| 5092 | case Intrinsic::amdgcn_mfma_scale_f32_32x32x64_f8f6f4: { |
| 5093 | unsigned DstSize = MRI.getType(Reg: MI.getOperand(i: 0).getReg()).getSizeInBits(); |
| 5094 | unsigned MinNumRegsRequired = DstSize / 32; |
| 5095 | |
| 5096 | const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>(); |
| 5097 | bool UseAGPRForm = Info->selectAGPRFormMFMA(NumRegs: MinNumRegsRequired); |
| 5098 | |
| 5099 | OpdsMapping[0] = |
| 5100 | UseAGPRForm ? getAGPROpMapping(Reg: MI.getOperand(i: 0).getReg(), MRI, TRI: *TRI) |
| 5101 | : getVGPROpMapping(Reg: MI.getOperand(i: 0).getReg(), MRI, TRI: *TRI); |
| 5102 | |
| 5103 | OpdsMapping[2] = getVGPROpMapping(Reg: MI.getOperand(i: 2).getReg(), MRI, TRI: *TRI); |
| 5104 | OpdsMapping[3] = getVGPROpMapping(Reg: MI.getOperand(i: 3).getReg(), MRI, TRI: *TRI); |
| 5105 | OpdsMapping[4] = |
| 5106 | UseAGPRForm ? getAGPROpMapping(Reg: MI.getOperand(i: 4).getReg(), MRI, TRI: *TRI) |
| 5107 | : getVGPROpMapping(Reg: MI.getOperand(i: 4).getReg(), MRI, TRI: *TRI); |
| 5108 | |
| 5109 | OpdsMapping[8] = getVGPROpMapping(Reg: MI.getOperand(i: 8).getReg(), MRI, TRI: *TRI); |
| 5110 | OpdsMapping[10] = getVGPROpMapping(Reg: MI.getOperand(i: 10).getReg(), MRI, TRI: *TRI); |
| 5111 | break; |
| 5112 | } |
| 5113 | case Intrinsic::amdgcn_smfmac_f32_16x16x32_f16: |
| 5114 | case Intrinsic::amdgcn_smfmac_f32_32x32x16_f16: |
| 5115 | case Intrinsic::amdgcn_smfmac_f32_16x16x32_bf16: |
| 5116 | case Intrinsic::amdgcn_smfmac_f32_32x32x16_bf16: |
| 5117 | case Intrinsic::amdgcn_smfmac_i32_16x16x64_i8: |
| 5118 | case Intrinsic::amdgcn_smfmac_i32_32x32x32_i8: |
| 5119 | case Intrinsic::amdgcn_smfmac_f32_16x16x64_bf8_bf8: |
| 5120 | case Intrinsic::amdgcn_smfmac_f32_16x16x64_bf8_fp8: |
| 5121 | case Intrinsic::amdgcn_smfmac_f32_16x16x64_fp8_bf8: |
| 5122 | case Intrinsic::amdgcn_smfmac_f32_16x16x64_fp8_fp8: |
| 5123 | case Intrinsic::amdgcn_smfmac_f32_32x32x32_bf8_bf8: |
| 5124 | case Intrinsic::amdgcn_smfmac_f32_32x32x32_bf8_fp8: |
| 5125 | case Intrinsic::amdgcn_smfmac_f32_32x32x32_fp8_bf8: |
| 5126 | case Intrinsic::amdgcn_smfmac_f32_32x32x32_fp8_fp8: |
| 5127 | case Intrinsic::amdgcn_smfmac_f32_16x16x64_f16: |
| 5128 | case Intrinsic::amdgcn_smfmac_f32_32x32x32_f16: |
| 5129 | case Intrinsic::amdgcn_smfmac_f32_16x16x64_bf16: |
| 5130 | case Intrinsic::amdgcn_smfmac_f32_32x32x32_bf16: |
| 5131 | case Intrinsic::amdgcn_smfmac_i32_16x16x128_i8: |
| 5132 | case Intrinsic::amdgcn_smfmac_i32_32x32x64_i8: |
| 5133 | case Intrinsic::amdgcn_smfmac_f32_16x16x128_bf8_bf8: |
| 5134 | case Intrinsic::amdgcn_smfmac_f32_16x16x128_bf8_fp8: |
| 5135 | case Intrinsic::amdgcn_smfmac_f32_16x16x128_fp8_bf8: |
| 5136 | case Intrinsic::amdgcn_smfmac_f32_16x16x128_fp8_fp8: |
| 5137 | case Intrinsic::amdgcn_smfmac_f32_32x32x64_bf8_bf8: |
| 5138 | case Intrinsic::amdgcn_smfmac_f32_32x32x64_bf8_fp8: |
| 5139 | case Intrinsic::amdgcn_smfmac_f32_32x32x64_fp8_bf8: |
| 5140 | case Intrinsic::amdgcn_smfmac_f32_32x32x64_fp8_fp8: { |
| 5141 | Register DstReg = MI.getOperand(i: 0).getReg(); |
| 5142 | unsigned DstSize = MRI.getType(Reg: DstReg).getSizeInBits(); |
| 5143 | unsigned MinNumRegsRequired = DstSize / 32; |
| 5144 | const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>(); |
| 5145 | bool UseAGPRForm = Info->selectAGPRFormMFMA(NumRegs: MinNumRegsRequired); |
| 5146 | |
| 5147 | // vdst, srcA, srcB, srcC, idx |
| 5148 | OpdsMapping[0] = UseAGPRForm ? getAGPROpMapping(Reg: DstReg, MRI, TRI: *TRI) |
| 5149 | : getVGPROpMapping(Reg: DstReg, MRI, TRI: *TRI); |
| 5150 | |
| 5151 | OpdsMapping[2] = getVGPROpMapping(Reg: MI.getOperand(i: 2).getReg(), MRI, TRI: *TRI); |
| 5152 | OpdsMapping[3] = getVGPROpMapping(Reg: MI.getOperand(i: 3).getReg(), MRI, TRI: *TRI); |
| 5153 | OpdsMapping[4] = |
| 5154 | UseAGPRForm ? getAGPROpMapping(Reg: MI.getOperand(i: 4).getReg(), MRI, TRI: *TRI) |
| 5155 | : getVGPROpMapping(Reg: MI.getOperand(i: 4).getReg(), MRI, TRI: *TRI); |
| 5156 | OpdsMapping[5] = getVGPROpMapping(Reg: MI.getOperand(i: 5).getReg(), MRI, TRI: *TRI); |
| 5157 | break; |
| 5158 | } |
| 5159 | case Intrinsic::amdgcn_interp_p1: |
| 5160 | case Intrinsic::amdgcn_interp_p2: |
| 5161 | case Intrinsic::amdgcn_interp_mov: |
| 5162 | case Intrinsic::amdgcn_interp_p1_f16: |
| 5163 | case Intrinsic::amdgcn_interp_p2_f16: |
| 5164 | case Intrinsic::amdgcn_lds_param_load: { |
| 5165 | const int M0Idx = MI.getNumOperands() - 1; |
| 5166 | Register M0Reg = MI.getOperand(i: M0Idx).getReg(); |
| 5167 | unsigned M0Bank = getRegBankID(Reg: M0Reg, MRI, Default: AMDGPU::SGPRRegBankID); |
| 5168 | unsigned DstSize = MRI.getType(Reg: MI.getOperand(i: 0).getReg()).getSizeInBits(); |
| 5169 | |
| 5170 | OpdsMapping[0] = AMDGPU::getValueMapping(BankID: AMDGPU::VGPRRegBankID, Size: DstSize); |
| 5171 | for (int I = 2; I != M0Idx && MI.getOperand(i: I).isReg(); ++I) |
| 5172 | OpdsMapping[I] = AMDGPU::getValueMapping(BankID: AMDGPU::VGPRRegBankID, Size: 32); |
| 5173 | |
| 5174 | // This must be an SGPR, but take whatever the original bank is and fix it |
| 5175 | // later. |
| 5176 | OpdsMapping[M0Idx] = AMDGPU::getValueMapping(BankID: M0Bank, Size: 32); |
| 5177 | break; |
| 5178 | } |
| 5179 | case Intrinsic::amdgcn_interp_inreg_p10: |
| 5180 | case Intrinsic::amdgcn_interp_inreg_p2: |
| 5181 | case Intrinsic::amdgcn_interp_inreg_p10_f16: |
| 5182 | case Intrinsic::amdgcn_interp_inreg_p2_f16: |
| 5183 | case Intrinsic::amdgcn_interp_p10_rtz_f16: |
| 5184 | case Intrinsic::amdgcn_interp_p2_rtz_f16: { |
| 5185 | unsigned DstSize = MRI.getType(Reg: MI.getOperand(i: 0).getReg()).getSizeInBits(); |
| 5186 | OpdsMapping[0] = AMDGPU::getValueMapping(BankID: AMDGPU::VGPRRegBankID, Size: DstSize); |
| 5187 | OpdsMapping[2] = AMDGPU::getValueMapping(BankID: AMDGPU::VGPRRegBankID, Size: 32); |
| 5188 | OpdsMapping[3] = AMDGPU::getValueMapping(BankID: AMDGPU::VGPRRegBankID, Size: 32); |
| 5189 | OpdsMapping[4] = AMDGPU::getValueMapping(BankID: AMDGPU::VGPRRegBankID, Size: 32); |
| 5190 | break; |
| 5191 | } |
| 5192 | case Intrinsic::amdgcn_permlane16_swap: |
| 5193 | case Intrinsic::amdgcn_permlane32_swap: { |
| 5194 | unsigned DstSize = MRI.getType(Reg: MI.getOperand(i: 0).getReg()).getSizeInBits(); |
| 5195 | OpdsMapping[0] = OpdsMapping[1] = OpdsMapping[3] = OpdsMapping[4] = |
| 5196 | AMDGPU::getValueMapping(BankID: AMDGPU::VGPRRegBankID, Size: DstSize); |
| 5197 | break; |
| 5198 | } |
| 5199 | case Intrinsic::amdgcn_ballot: { |
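|      | // The ballot result is a wave-wide mask held in SGPRs rather than a boolean, |
|      | // so the destination uses the SGPR bank; the source condition is a VCC-bank |
|      | // boolean. |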
| 5200 | unsigned DstSize = MRI.getType(Reg: MI.getOperand(i: 0).getReg()).getSizeInBits(); |
| 5201 | unsigned SrcSize = MRI.getType(Reg: MI.getOperand(i: 2).getReg()).getSizeInBits(); |
| 5202 | OpdsMapping[0] = AMDGPU::getValueMapping(BankID: AMDGPU::SGPRRegBankID, Size: DstSize); |
| 5203 | OpdsMapping[2] = AMDGPU::getValueMapping(BankID: AMDGPU::VCCRegBankID, Size: SrcSize); |
| 5204 | break; |
| 5205 | } |
| 5206 | case Intrinsic::amdgcn_inverse_ballot: { |
| 5207 | // This must be an SGPR, but accept a VGPR. |
| 5208 | Register MaskReg = MI.getOperand(i: 2).getReg(); |
| 5209 | unsigned MaskSize = MRI.getType(Reg: MaskReg).getSizeInBits(); |
| 5210 | unsigned MaskBank = getRegBankID(Reg: MaskReg, MRI, Default: AMDGPU::SGPRRegBankID); |
| 5211 | OpdsMapping[0] = AMDGPU::getValueMapping(BankID: AMDGPU::VCCRegBankID, Size: 1); |
| 5212 | OpdsMapping[2] = AMDGPU::getValueMapping(BankID: MaskBank, Size: MaskSize); |
| 5213 | break; |
| 5214 | } |
| 5215 | case Intrinsic::amdgcn_bitop3: { |
| 5216 | unsigned Size = getSizeInBits(Reg: MI.getOperand(i: 0).getReg(), MRI, TRI: *TRI); |
| 5217 | OpdsMapping[0] = AMDGPU::getValueMapping(BankID: AMDGPU::VGPRRegBankID, Size); |
| 5218 | OpdsMapping[2] = AMDGPU::getValueMapping(BankID: AMDGPU::VGPRRegBankID, Size); |
| 5219 | OpdsMapping[3] = AMDGPU::getValueMapping(BankID: AMDGPU::VGPRRegBankID, Size); |
| 5220 | OpdsMapping[4] = AMDGPU::getValueMapping(BankID: AMDGPU::VGPRRegBankID, Size); |
| 5221 | break; |
| 5222 | } |
| 5223 | case Intrinsic::amdgcn_s_quadmask: |
| 5224 | case Intrinsic::amdgcn_s_wqm: { |
| 5225 | Register MaskReg = MI.getOperand(i: 2).getReg(); |
| 5226 | unsigned MaskSize = MRI.getType(Reg: MaskReg).getSizeInBits(); |
| 5227 | unsigned MaskBank = getRegBankID(Reg: MaskReg, MRI, Default: AMDGPU::SGPRRegBankID); |
| 5228 | OpdsMapping[0] = AMDGPU::getValueMapping(BankID: AMDGPU::SGPRRegBankID, Size: MaskSize); |
| 5229 | OpdsMapping[2] = AMDGPU::getValueMapping(BankID: MaskBank, Size: MaskSize); |
| 5230 | break; |
| 5231 | } |
| 5232 | case Intrinsic::amdgcn_wave_reduce_add: |
| 5233 | case Intrinsic::amdgcn_wave_reduce_fadd: |
| 5234 | case Intrinsic::amdgcn_wave_reduce_sub: |
| 5235 | case Intrinsic::amdgcn_wave_reduce_fsub: |
| 5236 | case Intrinsic::amdgcn_wave_reduce_min: |
| 5237 | case Intrinsic::amdgcn_wave_reduce_umin: |
| 5238 | case Intrinsic::amdgcn_wave_reduce_fmin: |
| 5239 | case Intrinsic::amdgcn_wave_reduce_max: |
| 5240 | case Intrinsic::amdgcn_wave_reduce_umax: |
| 5241 | case Intrinsic::amdgcn_wave_reduce_fmax: |
| 5242 | case Intrinsic::amdgcn_wave_reduce_and: |
| 5243 | case Intrinsic::amdgcn_wave_reduce_or: |
| 5244 | case Intrinsic::amdgcn_wave_reduce_xor: { |
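|      | // Wave reductions produce a single uniform value, so the result is always an |
|      | // SGPR; the source only keeps a scalar mapping if it is already uniform. |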
| 5245 | unsigned DstSize = MRI.getType(Reg: MI.getOperand(i: 0).getReg()).getSizeInBits(); |
| 5246 | OpdsMapping[0] = AMDGPU::getValueMapping(BankID: AMDGPU::SGPRRegBankID, Size: DstSize); |
| 5247 | unsigned OpSize = MRI.getType(Reg: MI.getOperand(i: 2).getReg()).getSizeInBits(); |
| 5248 | unsigned RegBankID = |
| 5249 | isSALUMapping(MI) ? AMDGPU::SGPRRegBankID : AMDGPU::VGPRRegBankID; |
| 5250 | OpdsMapping[2] = AMDGPU::getValueMapping(BankID: RegBankID, Size: OpSize); |
| 5251 | break; |
| 5252 | } |
| 5253 | case Intrinsic::amdgcn_s_bitreplicate: { |
| 5254 | Register MaskReg = MI.getOperand(i: 2).getReg(); |
| 5255 | unsigned MaskBank = getRegBankID(Reg: MaskReg, MRI, Default: AMDGPU::SGPRRegBankID); |
| 5256 | OpdsMapping[0] = AMDGPU::getValueMapping(BankID: AMDGPU::SGPRRegBankID, Size: 64); |
| 5257 | OpdsMapping[2] = AMDGPU::getValueMapping(BankID: MaskBank, Size: 32); |
| 5258 | break; |
| 5259 | } |
| 5260 | case Intrinsic::amdgcn_wave_shuffle: { |
| 5261 | unsigned OpSize = MRI.getType(Reg: MI.getOperand(i: 0).getReg()).getSizeInBits(); |
| 5262 | OpdsMapping[0] = AMDGPU::getValueMapping(BankID: AMDGPU::VGPRRegBankID, Size: OpSize); |
| 5263 | OpdsMapping[2] = AMDGPU::getValueMapping(BankID: AMDGPU::VGPRRegBankID, Size: OpSize); |
| 5264 | OpdsMapping[3] = AMDGPU::getValueMapping(BankID: AMDGPU::VGPRRegBankID, Size: OpSize); |
| 5265 | break; |
| 5266 | } |
| 5267 | } |
| 5268 | break; |
| 5269 | } |
| 5270 | case AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD: |
| 5271 | case AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD_D16: |
| 5272 | case AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD_NORET: |
| 5273 | case AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE: |
| 5274 | case AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE_D16: { |
| 5275 | auto IntrID = AMDGPU::getIntrinsicID(I: MI); |
| 5276 | const AMDGPU::RsrcIntrinsic *RSrcIntrin = AMDGPU::lookupRsrcIntrinsic(Intr: IntrID); |
| 5277 | assert(RSrcIntrin && "missing RsrcIntrinsic for image intrinsic"); |
| 5278 | // Non-images can have complications from operands that allow both SGPR |
| 5279 | // and VGPR. For now it's too complicated to figure out the final opcode |
| 5280 | // to derive the register bank from the MCInstrDesc. |
| 5281 | assert(RSrcIntrin->IsImage); |
| 5282 | return getImageMapping(MRI, MI, RsrcIdx: RSrcIntrin->RsrcArg); |
| 5283 | } |
| 5284 | case AMDGPU::G_AMDGPU_BVH_INTERSECT_RAY: |
| 5285 | case AMDGPU::G_AMDGPU_BVH8_INTERSECT_RAY: |
| 5286 | case AMDGPU::G_AMDGPU_BVH_DUAL_INTERSECT_RAY: { |
| 5287 | bool IsDualOrBVH8 = |
| 5288 | MI.getOpcode() == AMDGPU::G_AMDGPU_BVH_DUAL_INTERSECT_RAY || |
| 5289 | MI.getOpcode() == AMDGPU::G_AMDGPU_BVH8_INTERSECT_RAY; |
| 5290 | unsigned NumMods = IsDualOrBVH8 ? 0 : 1; // Has A16 modifier |
| 5291 | unsigned LastRegOpIdx = MI.getNumExplicitOperands() - 1 - NumMods; |
| 5292 | unsigned DstSize = MRI.getType(Reg: MI.getOperand(i: 0).getReg()).getSizeInBits(); |
| 5293 | OpdsMapping[0] = AMDGPU::getValueMapping(BankID: AMDGPU::VGPRRegBankID, Size: DstSize); |
| 5294 | if (IsDualOrBVH8) { |
| 5295 | OpdsMapping[1] = AMDGPU::getValueMapping( |
| 5296 | BankID: AMDGPU::VGPRRegBankID, |
| 5297 | Size: MRI.getType(Reg: MI.getOperand(i: 1).getReg()).getSizeInBits()); |
| 5298 | OpdsMapping[2] = AMDGPU::getValueMapping( |
| 5299 | BankID: AMDGPU::VGPRRegBankID, |
| 5300 | Size: MRI.getType(Reg: MI.getOperand(i: 2).getReg()).getSizeInBits()); |
| 5301 | } |
| 5302 | OpdsMapping[LastRegOpIdx] = |
| 5303 | getSGPROpMapping(Reg: MI.getOperand(i: LastRegOpIdx).getReg(), MRI, TRI: *TRI); |
| 5304 | if (LastRegOpIdx == 3) { |
| 5305 | // Sequential form: all operands combined into VGPR256/VGPR512 |
| 5306 | unsigned Size = MRI.getType(Reg: MI.getOperand(i: 2).getReg()).getSizeInBits(); |
| 5307 | if (Size > 256) |
| 5308 | Size = 512; |
| 5309 | OpdsMapping[2] = AMDGPU::getValueMapping(BankID: AMDGPU::VGPRRegBankID, Size); |
| 5310 | } else { |
| 5311 | // NSA form |
| 5312 | unsigned FirstSrcOpIdx = IsDualOrBVH8 ? 4 : 2; |
| 5313 | for (unsigned I = FirstSrcOpIdx; I < LastRegOpIdx; ++I) { |
| 5314 | unsigned Size = MRI.getType(Reg: MI.getOperand(i: I).getReg()).getSizeInBits(); |
| 5315 | OpdsMapping[I] = AMDGPU::getValueMapping(BankID: AMDGPU::VGPRRegBankID, Size); |
| 5316 | } |
| 5317 | } |
| 5318 | break; |
| 5319 | } |
| 5320 | case AMDGPU::G_INTRINSIC_W_SIDE_EFFECTS: |
| 5321 | case AMDGPU::G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS: { |
| 5322 | auto IntrID = cast<GIntrinsic>(Val: MI).getIntrinsicID(); |
| 5323 | switch (IntrID) { |
| 5324 | case Intrinsic::amdgcn_s_getreg: |
| 5325 | case Intrinsic::amdgcn_s_memtime: |
| 5326 | case Intrinsic::amdgcn_s_memrealtime: |
| 5327 | case Intrinsic::amdgcn_s_get_waveid_in_workgroup: |
| 5328 | case Intrinsic::amdgcn_s_sendmsg_rtn: { |
| 5329 | unsigned Size = MRI.getType(Reg: MI.getOperand(i: 0).getReg()).getSizeInBits(); |
| 5330 | OpdsMapping[0] = AMDGPU::getValueMapping(BankID: AMDGPU::SGPRRegBankID, Size); |
| 5331 | break; |
| 5332 | } |
| 5333 | case Intrinsic::amdgcn_global_atomic_fmin_num: |
| 5334 | case Intrinsic::amdgcn_global_atomic_fmax_num: |
| 5335 | case Intrinsic::amdgcn_flat_atomic_fmin_num: |
| 5336 | case Intrinsic::amdgcn_flat_atomic_fmax_num: |
| 5337 | case Intrinsic::amdgcn_global_atomic_ordered_add_b64: |
| 5338 | case Intrinsic::amdgcn_global_load_tr_b64: |
| 5339 | case Intrinsic::amdgcn_global_load_tr_b128: |
| 5340 | case Intrinsic::amdgcn_global_load_tr4_b64: |
| 5341 | case Intrinsic::amdgcn_global_load_tr6_b96: |
| 5342 | case Intrinsic::amdgcn_ds_load_tr8_b64: |
| 5343 | case Intrinsic::amdgcn_ds_load_tr16_b128: |
| 5344 | case Intrinsic::amdgcn_ds_load_tr4_b64: |
| 5345 | case Intrinsic::amdgcn_ds_load_tr6_b96: |
| 5346 | case Intrinsic::amdgcn_flat_load_monitor_b32: |
| 5347 | case Intrinsic::amdgcn_flat_load_monitor_b64: |
| 5348 | case Intrinsic::amdgcn_flat_load_monitor_b128: |
| 5349 | case Intrinsic::amdgcn_global_load_monitor_b32: |
| 5350 | case Intrinsic::amdgcn_global_load_monitor_b64: |
| 5351 | case Intrinsic::amdgcn_global_load_monitor_b128: |
| 5352 | case Intrinsic::amdgcn_ds_read_tr4_b64: |
| 5353 | case Intrinsic::amdgcn_ds_read_tr6_b96: |
| 5354 | case Intrinsic::amdgcn_ds_read_tr8_b64: |
| 5355 | case Intrinsic::amdgcn_ds_read_tr16_b64: |
| 5356 | case Intrinsic::amdgcn_ds_atomic_async_barrier_arrive_b64: |
| 5357 | case Intrinsic::amdgcn_ds_atomic_barrier_arrive_rtn_b64: |
| 5358 | return getDefaultMappingAllVGPR(MI); |
| 5359 | case Intrinsic::amdgcn_ds_ordered_add: |
| 5360 | case Intrinsic::amdgcn_ds_ordered_swap: { |
| 5361 | unsigned DstSize = MRI.getType(Reg: MI.getOperand(i: 0).getReg()).getSizeInBits(); |
| 5362 | OpdsMapping[0] = AMDGPU::getValueMapping(BankID: AMDGPU::VGPRRegBankID, Size: DstSize); |
| 5363 | unsigned M0Bank = getRegBankID(Reg: MI.getOperand(i: 2).getReg(), MRI, |
| 5364 | Default: AMDGPU::SGPRRegBankID); |
| 5365 | OpdsMapping[2] = AMDGPU::getValueMapping(BankID: M0Bank, Size: 32); |
| 5366 | OpdsMapping[3] = AMDGPU::getValueMapping(BankID: AMDGPU::VGPRRegBankID, Size: 32); |
| 5367 | break; |
| 5368 | } |
| 5369 | case Intrinsic::amdgcn_ds_append: |
| 5370 | case Intrinsic::amdgcn_ds_consume: { |
| 5371 | unsigned DstSize = MRI.getType(Reg: MI.getOperand(i: 0).getReg()).getSizeInBits(); |
| 5372 | OpdsMapping[0] = AMDGPU::getValueMapping(BankID: AMDGPU::VGPRRegBankID, Size: DstSize); |
| 5373 | OpdsMapping[2] = getSGPROpMapping(Reg: MI.getOperand(i: 2).getReg(), MRI, TRI: *TRI); |
| 5374 | break; |
| 5375 | } |
| 5376 | case Intrinsic::amdgcn_exp_compr: |
| 5377 | OpdsMapping[3] = AMDGPU::getValueMapping(BankID: AMDGPU::VGPRRegBankID, Size: 32); |
| 5378 | OpdsMapping[4] = AMDGPU::getValueMapping(BankID: AMDGPU::VGPRRegBankID, Size: 32); |
| 5379 | break; |
| 5380 | case Intrinsic::amdgcn_exp: |
| 5381 | // FIXME: Could we support packed types here? |
| 5382 | OpdsMapping[3] = AMDGPU::getValueMapping(BankID: AMDGPU::VGPRRegBankID, Size: 32); |
| 5383 | OpdsMapping[4] = AMDGPU::getValueMapping(BankID: AMDGPU::VGPRRegBankID, Size: 32); |
| 5384 | OpdsMapping[5] = AMDGPU::getValueMapping(BankID: AMDGPU::VGPRRegBankID, Size: 32); |
| 5385 | OpdsMapping[6] = AMDGPU::getValueMapping(BankID: AMDGPU::VGPRRegBankID, Size: 32); |
| 5386 | break; |
| 5387 | case Intrinsic::amdgcn_exp_row: |
| 5388 | OpdsMapping[3] = AMDGPU::getValueMapping(BankID: AMDGPU::VGPRRegBankID, Size: 32); |
| 5389 | OpdsMapping[4] = AMDGPU::getValueMapping(BankID: AMDGPU::VGPRRegBankID, Size: 32); |
| 5390 | OpdsMapping[5] = AMDGPU::getValueMapping(BankID: AMDGPU::VGPRRegBankID, Size: 32); |
| 5391 | OpdsMapping[6] = AMDGPU::getValueMapping(BankID: AMDGPU::VGPRRegBankID, Size: 32); |
| 5392 | OpdsMapping[8] = getSGPROpMapping(Reg: MI.getOperand(i: 8).getReg(), MRI, TRI: *TRI); |
| 5393 | break; |
| 5394 | case Intrinsic::amdgcn_s_sendmsg: |
| 5395 | case Intrinsic::amdgcn_s_sendmsghalt: { |
| 5396 | // This must be an SGPR, but accept a VGPR. |
| 5397 | unsigned Bank = getRegBankID(Reg: MI.getOperand(i: 2).getReg(), MRI, |
| 5398 | Default: AMDGPU::SGPRRegBankID); |
| 5399 | OpdsMapping[2] = AMDGPU::getValueMapping(BankID: Bank, Size: 32); |
| 5400 | break; |
| 5401 | } |
| 5402 | case Intrinsic::amdgcn_s_setreg: { |
| 5403 | // This must be an SGPR, but accept a VGPR. |
| 5404 | unsigned Bank = getRegBankID(Reg: MI.getOperand(i: 2).getReg(), MRI, |
| 5405 | Default: AMDGPU::SGPRRegBankID); |
| 5406 | OpdsMapping[2] = AMDGPU::getValueMapping(BankID: Bank, Size: 32); |
| 5407 | break; |
| 5408 | } |
| 5409 | case Intrinsic::amdgcn_s_ttracedata: { |
| 5410 | // This must be an SGPR, but accept a VGPR. |
| 5411 | unsigned Bank = |
| 5412 | getRegBankID(Reg: MI.getOperand(i: 1).getReg(), MRI, Default: AMDGPU::SGPRRegBankID); |
| 5413 | OpdsMapping[1] = AMDGPU::getValueMapping(BankID: Bank, Size: 32); |
| 5414 | break; |
| 5415 | } |
| 5416 | case Intrinsic::amdgcn_end_cf: { |
| 5417 | unsigned Size = getSizeInBits(Reg: MI.getOperand(i: 1).getReg(), MRI, TRI: *TRI); |
| 5418 | OpdsMapping[1] = AMDGPU::getValueMapping(BankID: AMDGPU::SGPRRegBankID, Size); |
| 5419 | break; |
| 5420 | } |
| 5421 | case Intrinsic::amdgcn_else: { |
| 5422 | unsigned WaveSize = getSizeInBits(Reg: MI.getOperand(i: 1).getReg(), MRI, TRI: *TRI); |
| 5423 | OpdsMapping[0] = AMDGPU::getValueMapping(BankID: AMDGPU::VCCRegBankID, Size: 1); |
| 5424 | OpdsMapping[1] = AMDGPU::getValueMapping(BankID: AMDGPU::SGPRRegBankID, Size: WaveSize); |
| 5425 | OpdsMapping[3] = AMDGPU::getValueMapping(BankID: AMDGPU::SGPRRegBankID, Size: WaveSize); |
| 5426 | break; |
| 5427 | } |
| 5428 | case Intrinsic::amdgcn_init_whole_wave: |
| 5429 | case Intrinsic::amdgcn_live_mask: { |
| 5430 | OpdsMapping[0] = AMDGPU::getValueMapping(BankID: AMDGPU::VCCRegBankID, Size: 1); |
| 5431 | break; |
| 5432 | } |
| 5433 | case Intrinsic::amdgcn_wqm_demote: |
| 5434 | case Intrinsic::amdgcn_kill: { |
| 5435 | OpdsMapping[1] = AMDGPU::getValueMapping(BankID: AMDGPU::VCCRegBankID, Size: 1); |
| 5436 | break; |
| 5437 | } |
| 5438 | case Intrinsic::amdgcn_raw_buffer_load: |
| 5439 | case Intrinsic::amdgcn_raw_ptr_buffer_load: |
| 5440 | case Intrinsic::amdgcn_raw_atomic_buffer_load: |
| 5441 | case Intrinsic::amdgcn_raw_ptr_atomic_buffer_load: |
| 5442 | case Intrinsic::amdgcn_raw_tbuffer_load: |
| 5443 | case Intrinsic::amdgcn_raw_ptr_tbuffer_load: { |
| 5444 | // FIXME: Should make intrinsic ID the last operand of the instruction, |
| 5445 | // then this would be the same as store |
| 5446 | OpdsMapping[0] = getVGPROpMapping(Reg: MI.getOperand(i: 0).getReg(), MRI, TRI: *TRI); |
| 5447 | OpdsMapping[2] = getSGPROpMapping(Reg: MI.getOperand(i: 2).getReg(), MRI, TRI: *TRI); |
| 5448 | OpdsMapping[3] = getVGPROpMapping(Reg: MI.getOperand(i: 3).getReg(), MRI, TRI: *TRI); |
| 5449 | OpdsMapping[4] = getSGPROpMapping(Reg: MI.getOperand(i: 4).getReg(), MRI, TRI: *TRI); |
| 5450 | break; |
| 5451 | } |
| 5452 | case Intrinsic::amdgcn_raw_buffer_load_lds: |
| 5453 | case Intrinsic::amdgcn_raw_ptr_buffer_load_lds: { |
| 5454 | OpdsMapping[1] = getSGPROpMapping(Reg: MI.getOperand(i: 1).getReg(), MRI, TRI: *TRI); |
| 5455 | OpdsMapping[2] = getSGPROpMapping(Reg: MI.getOperand(i: 2).getReg(), MRI, TRI: *TRI); |
| 5456 | OpdsMapping[4] = getVGPROpMapping(Reg: MI.getOperand(i: 4).getReg(), MRI, TRI: *TRI); |
| 5457 | OpdsMapping[5] = getSGPROpMapping(Reg: MI.getOperand(i: 5).getReg(), MRI, TRI: *TRI); |
| 5458 | break; |
| 5459 | } |
| 5460 | case Intrinsic::amdgcn_raw_buffer_store: |
| 5461 | case Intrinsic::amdgcn_raw_ptr_buffer_store: |
| 5462 | case Intrinsic::amdgcn_raw_buffer_store_format: |
| 5463 | case Intrinsic::amdgcn_raw_ptr_buffer_store_format: |
| 5464 | case Intrinsic::amdgcn_raw_tbuffer_store: |
| 5465 | case Intrinsic::amdgcn_raw_ptr_tbuffer_store: { |
| 5466 | OpdsMapping[1] = getVGPROpMapping(Reg: MI.getOperand(i: 1).getReg(), MRI, TRI: *TRI); |
| 5467 | OpdsMapping[2] = getSGPROpMapping(Reg: MI.getOperand(i: 2).getReg(), MRI, TRI: *TRI); |
| 5468 | OpdsMapping[3] = getVGPROpMapping(Reg: MI.getOperand(i: 3).getReg(), MRI, TRI: *TRI); |
| 5469 | OpdsMapping[4] = getSGPROpMapping(Reg: MI.getOperand(i: 4).getReg(), MRI, TRI: *TRI); |
| 5470 | break; |
| 5471 | } |
| 5472 | case Intrinsic::amdgcn_struct_buffer_load: |
| 5473 | case Intrinsic::amdgcn_struct_ptr_buffer_load: |
| 5474 | case Intrinsic::amdgcn_struct_tbuffer_load: |
| 5475 | case Intrinsic::amdgcn_struct_ptr_tbuffer_load: |
| 5476 | case Intrinsic::amdgcn_struct_atomic_buffer_load: |
| 5477 | case Intrinsic::amdgcn_struct_ptr_atomic_buffer_load: { |
| 5478 | OpdsMapping[0] = getVGPROpMapping(Reg: MI.getOperand(i: 0).getReg(), MRI, TRI: *TRI); |
| 5479 | OpdsMapping[2] = getSGPROpMapping(Reg: MI.getOperand(i: 2).getReg(), MRI, TRI: *TRI); |
| 5480 | OpdsMapping[3] = getVGPROpMapping(Reg: MI.getOperand(i: 3).getReg(), MRI, TRI: *TRI); |
| 5481 | OpdsMapping[4] = getVGPROpMapping(Reg: MI.getOperand(i: 4).getReg(), MRI, TRI: *TRI); |
| 5482 | OpdsMapping[5] = getSGPROpMapping(Reg: MI.getOperand(i: 5).getReg(), MRI, TRI: *TRI); |
| 5483 | break; |
| 5484 | } |
| 5485 | case Intrinsic::amdgcn_struct_buffer_load_lds: |
| 5486 | case Intrinsic::amdgcn_struct_ptr_buffer_load_lds: { |
| 5487 | OpdsMapping[1] = getSGPROpMapping(Reg: MI.getOperand(i: 1).getReg(), MRI, TRI: *TRI); |
| 5488 | OpdsMapping[2] = getSGPROpMapping(Reg: MI.getOperand(i: 2).getReg(), MRI, TRI: *TRI); |
| 5489 | OpdsMapping[4] = getVGPROpMapping(Reg: MI.getOperand(i: 4).getReg(), MRI, TRI: *TRI); |
| 5490 | OpdsMapping[5] = getVGPROpMapping(Reg: MI.getOperand(i: 5).getReg(), MRI, TRI: *TRI); |
| 5491 | OpdsMapping[6] = getSGPROpMapping(Reg: MI.getOperand(i: 6).getReg(), MRI, TRI: *TRI); |
| 5492 | break; |
| 5493 | } |
| 5494 | case Intrinsic::amdgcn_struct_buffer_store: |
| 5495 | case Intrinsic::amdgcn_struct_ptr_buffer_store: |
| 5496 | case Intrinsic::amdgcn_struct_tbuffer_store: |
| 5497 | case Intrinsic::amdgcn_struct_ptr_tbuffer_store: { |
| 5498 | OpdsMapping[1] = getVGPROpMapping(Reg: MI.getOperand(i: 1).getReg(), MRI, TRI: *TRI); |
| 5499 | OpdsMapping[2] = getSGPROpMapping(Reg: MI.getOperand(i: 2).getReg(), MRI, TRI: *TRI); |
| 5500 | OpdsMapping[3] = getVGPROpMapping(Reg: MI.getOperand(i: 3).getReg(), MRI, TRI: *TRI); |
| 5501 | OpdsMapping[4] = getVGPROpMapping(Reg: MI.getOperand(i: 4).getReg(), MRI, TRI: *TRI); |
| 5502 | OpdsMapping[5] = getSGPROpMapping(Reg: MI.getOperand(i: 5).getReg(), MRI, TRI: *TRI); |
| 5503 | break; |
| 5504 | } |
| 5505 | case Intrinsic::amdgcn_init_exec_from_input: { |
| 5506 | unsigned Size = getSizeInBits(Reg: MI.getOperand(i: 1).getReg(), MRI, TRI: *TRI); |
| 5507 | OpdsMapping[1] = AMDGPU::getValueMapping(BankID: AMDGPU::SGPRRegBankID, Size); |
| 5508 | break; |
| 5509 | } |
| 5510 | case Intrinsic::amdgcn_ds_gws_init: |
| 5511 | case Intrinsic::amdgcn_ds_gws_barrier: |
| 5512 | case Intrinsic::amdgcn_ds_gws_sema_br: { |
| 5513 | OpdsMapping[1] = AMDGPU::getValueMapping(BankID: AMDGPU::VGPRRegBankID, Size: 32); |
| 5514 | |
| 5515 | // This must be an SGPR, but accept a VGPR. |
| 5516 | unsigned Bank = getRegBankID(Reg: MI.getOperand(i: 2).getReg(), MRI, |
| 5517 | Default: AMDGPU::SGPRRegBankID); |
| 5518 | OpdsMapping[2] = AMDGPU::getValueMapping(BankID: Bank, Size: 32); |
| 5519 | break; |
| 5520 | } |
| 5521 | case Intrinsic::amdgcn_ds_gws_sema_v: |
| 5522 | case Intrinsic::amdgcn_ds_gws_sema_p: |
| 5523 | case Intrinsic::amdgcn_ds_gws_sema_release_all: { |
| 5524 | // This must be an SGPR, but accept a VGPR. |
| 5525 | unsigned Bank = getRegBankID(Reg: MI.getOperand(i: 1).getReg(), MRI, |
| 5526 | Default: AMDGPU::SGPRRegBankID); |
| 5527 | OpdsMapping[1] = AMDGPU::getValueMapping(BankID: Bank, Size: 32); |
| 5528 | break; |
| 5529 | } |
| 5530 | case Intrinsic::amdgcn_cluster_load_b32: |
| 5531 | case Intrinsic::amdgcn_cluster_load_b64: |
| 5532 | case Intrinsic::amdgcn_cluster_load_b128: { |
| 5533 | OpdsMapping[0] = getVGPROpMapping(Reg: MI.getOperand(i: 0).getReg(), MRI, TRI: *TRI); |
| 5534 | OpdsMapping[2] = getVGPROpMapping(Reg: MI.getOperand(i: 2).getReg(), MRI, TRI: *TRI); |
| 5535 | unsigned M0Bank = |
| 5536 | getRegBankID(Reg: MI.getOperand(i: 4).getReg(), MRI, Default: AMDGPU::SGPRRegBankID); |
| 5537 | OpdsMapping[4] = AMDGPU::getValueMapping(BankID: M0Bank, Size: 32); |
| 5538 | break; |
| 5539 | } |
| 5540 | case Intrinsic::amdgcn_cluster_load_async_to_lds_b8: |
| 5541 | case Intrinsic::amdgcn_cluster_load_async_to_lds_b32: |
| 5542 | case Intrinsic::amdgcn_cluster_load_async_to_lds_b64: |
| 5543 | case Intrinsic::amdgcn_cluster_load_async_to_lds_b128: { |
| 5544 | OpdsMapping[1] = getVGPROpMapping(Reg: MI.getOperand(i: 1).getReg(), MRI, TRI: *TRI); |
| 5545 | OpdsMapping[2] = getSGPROpMapping(Reg: MI.getOperand(i: 2).getReg(), MRI, TRI: *TRI); |
| 5546 | unsigned M0Bank = |
| 5547 | getRegBankID(Reg: MI.getOperand(i: 5).getReg(), MRI, Default: AMDGPU::SGPRRegBankID); |
| 5548 | OpdsMapping[5] = AMDGPU::getValueMapping(BankID: M0Bank, Size: 32); |
| 5549 | break; |
| 5550 | } |
| 5551 | case Intrinsic::amdgcn_global_store_async_from_lds_b8: |
| 5552 | case Intrinsic::amdgcn_global_store_async_from_lds_b32: |
| 5553 | case Intrinsic::amdgcn_global_store_async_from_lds_b64: |
| 5554 | case Intrinsic::amdgcn_global_store_async_from_lds_b128: |
| 5555 | case Intrinsic::amdgcn_global_load_async_to_lds_b8: |
| 5556 | case Intrinsic::amdgcn_global_load_async_to_lds_b32: |
| 5557 | case Intrinsic::amdgcn_global_load_async_to_lds_b64: |
| 5558 | case Intrinsic::amdgcn_global_load_async_to_lds_b128: |
| 5559 | case Intrinsic::amdgcn_load_to_lds: |
| 5560 | case Intrinsic::amdgcn_global_load_lds: { |
| 5561 | OpdsMapping[1] = getVGPROpMapping(Reg: MI.getOperand(i: 1).getReg(), MRI, TRI: *TRI); |
| 5562 | OpdsMapping[2] = getSGPROpMapping(Reg: MI.getOperand(i: 2).getReg(), MRI, TRI: *TRI); |
| 5563 | break; |
| 5564 | } |
| 5565 | case Intrinsic::amdgcn_lds_direct_load: { |
| 5566 | const int M0Idx = MI.getNumOperands() - 1; |
| 5567 | Register M0Reg = MI.getOperand(i: M0Idx).getReg(); |
| 5568 | unsigned M0Bank = getRegBankID(Reg: M0Reg, MRI, Default: AMDGPU::SGPRRegBankID); |
| 5569 | unsigned DstSize = MRI.getType(Reg: MI.getOperand(i: 0).getReg()).getSizeInBits(); |
| 5570 | |
| 5571 | OpdsMapping[0] = AMDGPU::getValueMapping(BankID: AMDGPU::VGPRRegBankID, Size: DstSize); |
| 5572 | for (int I = 2; I != M0Idx && MI.getOperand(i: I).isReg(); ++I) |
| 5573 | OpdsMapping[I] = AMDGPU::getValueMapping(BankID: AMDGPU::VGPRRegBankID, Size: 32); |
| 5574 | |
| 5575 | // This must be an SGPR, but take whatever the original bank is and fix it |
| 5576 | // later. |
| 5577 | OpdsMapping[M0Idx] = AMDGPU::getValueMapping(BankID: M0Bank, Size: 32); |
| 5578 | break; |
| 5579 | } |
| 5580 | case Intrinsic::amdgcn_ds_add_gs_reg_rtn: |
| 5581 | case Intrinsic::amdgcn_ds_sub_gs_reg_rtn: |
| 5582 | OpdsMapping[0] = getVGPROpMapping(Reg: MI.getOperand(i: 0).getReg(), MRI, TRI: *TRI); |
| 5583 | OpdsMapping[2] = getVGPROpMapping(Reg: MI.getOperand(i: 2).getReg(), MRI, TRI: *TRI); |
| 5584 | break; |
| 5585 | case Intrinsic::amdgcn_ds_bvh_stack_rtn: |
| 5586 | case Intrinsic::amdgcn_ds_bvh_stack_push4_pop1_rtn: |
| 5587 | case Intrinsic::amdgcn_ds_bvh_stack_push8_pop1_rtn: |
| 5588 | case Intrinsic::amdgcn_ds_bvh_stack_push8_pop2_rtn: { |
| 5589 | OpdsMapping[0] = |
| 5590 | getVGPROpMapping(Reg: MI.getOperand(i: 0).getReg(), MRI, TRI: *TRI); // %vdst |
| 5591 | OpdsMapping[1] = |
| 5592 | getVGPROpMapping(Reg: MI.getOperand(i: 1).getReg(), MRI, TRI: *TRI); // %addr |
| 5593 | OpdsMapping[3] = |
| 5594 | getVGPROpMapping(Reg: MI.getOperand(i: 3).getReg(), MRI, TRI: *TRI); // %addr |
| 5595 | OpdsMapping[4] = |
| 5596 | getVGPROpMapping(Reg: MI.getOperand(i: 4).getReg(), MRI, TRI: *TRI); // %data0 |
| 5597 | OpdsMapping[5] = |
| 5598 | getVGPROpMapping(Reg: MI.getOperand(i: 5).getReg(), MRI, TRI: *TRI); // %data1 |
| 5599 | break; |
| 5600 | } |
| 5601 | case Intrinsic::amdgcn_s_sleep_var: |
| 5602 | OpdsMapping[1] = getSGPROpMapping(Reg: MI.getOperand(i: 1).getReg(), MRI, TRI: *TRI); |
| 5603 | break; |
| 5604 | case Intrinsic::amdgcn_s_barrier_join: |
| 5605 | case Intrinsic::amdgcn_s_wakeup_barrier: |
| 5606 | OpdsMapping[1] = getSGPROpMapping(Reg: MI.getOperand(i: 1).getReg(), MRI, TRI: *TRI); |
| 5607 | break; |
| 5608 | case Intrinsic::amdgcn_s_barrier_init: |
| 5609 | case Intrinsic::amdgcn_s_barrier_signal_var: |
| 5610 | OpdsMapping[1] = getSGPROpMapping(Reg: MI.getOperand(i: 1).getReg(), MRI, TRI: *TRI); |
| 5611 | OpdsMapping[2] = getSGPROpMapping(Reg: MI.getOperand(i: 2).getReg(), MRI, TRI: *TRI); |
| 5612 | break; |
| 5613 | case Intrinsic::amdgcn_s_barrier_signal_isfirst: { |
| 5614 | const unsigned ResultSize = 1; |
| 5615 | OpdsMapping[0] = |
| 5616 | AMDGPU::getValueMapping(BankID: AMDGPU::SGPRRegBankID, Size: ResultSize); |
| 5617 | break; |
| 5618 | } |
| 5619 | case Intrinsic::amdgcn_s_get_barrier_state: |
| 5620 | case Intrinsic::amdgcn_s_get_named_barrier_state: { |
| 5621 | OpdsMapping[0] = getSGPROpMapping(Reg: MI.getOperand(i: 0).getReg(), MRI, TRI: *TRI); |
| 5622 | OpdsMapping[2] = getSGPROpMapping(Reg: MI.getOperand(i: 2).getReg(), MRI, TRI: *TRI); |
| 5623 | break; |
| 5624 | } |
| 5625 | case Intrinsic::amdgcn_pops_exiting_wave_id: |
| 5626 | return getDefaultMappingSOP(MI); |
| 5627 | case Intrinsic::amdgcn_tensor_load_to_lds_d2: |
| 5628 | case Intrinsic::amdgcn_tensor_store_from_lds_d2: |
| 5629 | case Intrinsic::amdgcn_tensor_load_to_lds: |
| 5630 | case Intrinsic::amdgcn_tensor_store_from_lds: { |
| 5631 | // Lie and claim everything is legal, even though all operands need to be |
| 5632 | // SGPRs. applyMapping will have to deal with it using readfirstlane. |
| 5633 | for (unsigned I = 1; I < MI.getNumOperands(); ++I) { |
| 5634 | if (MI.getOperand(i: I).isReg()) { |
| 5635 | Register Reg = MI.getOperand(i: I).getReg(); |
| 5636 | auto OpBank = getRegBankID(Reg, MRI); |
| 5637 | unsigned Size = getSizeInBits(Reg, MRI, TRI: *TRI); |
| 5638 | OpdsMapping[I] = AMDGPU::getValueMapping(BankID: OpBank, Size); |
| 5639 | } |
| 5640 | } |
| 5641 | break; |
| 5642 | } |
| 5643 | case Intrinsic::amdgcn_s_prefetch_data: { |
| 5644 | OpdsMapping[1] = getSGPROpMapping(Reg: MI.getOperand(i: 1).getReg(), MRI, TRI: *TRI); |
| 5645 | OpdsMapping[2] = getSGPROpMapping(Reg: MI.getOperand(i: 2).getReg(), MRI, TRI: *TRI); |
| 5646 | break; |
| 5647 | } |
| 5648 | case Intrinsic::amdgcn_flat_prefetch: |
| 5649 | case Intrinsic::amdgcn_global_prefetch: |
| 5650 | return getDefaultMappingVOP(MI); |
| 5651 | default: |
| 5652 | return getInvalidInstructionMapping(); |
| 5653 | } |
| 5654 | break; |
| 5655 | } |
| 5656 | case AMDGPU::G_SELECT: { |
| 5657 | unsigned Size = MRI.getType(Reg: MI.getOperand(i: 0).getReg()).getSizeInBits(); |
| 5658 | unsigned Op2Bank = getRegBankID(Reg: MI.getOperand(i: 2).getReg(), MRI, |
| 5659 | Default: AMDGPU::SGPRRegBankID); |
| 5660 | unsigned Op3Bank = getRegBankID(Reg: MI.getOperand(i: 3).getReg(), MRI, |
| 5661 | Default: AMDGPU::SGPRRegBankID); |
| 5662 | bool SGPRSrcs = Op2Bank == AMDGPU::SGPRRegBankID && |
| 5663 | Op3Bank == AMDGPU::SGPRRegBankID; |
| 5664 | |
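|      | // If both value operands are already scalar, this can be a scalar select with |
|      | // an SCC/SGPR condition; otherwise the condition must be a VCC mask and the |
|      | // result is produced in VGPRs. |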
| 5665 | unsigned CondBankDefault = SGPRSrcs ? |
| 5666 | AMDGPU::SGPRRegBankID : AMDGPU::VCCRegBankID; |
| 5667 | unsigned CondBank = getRegBankID(Reg: MI.getOperand(i: 1).getReg(), MRI, |
| 5668 | Default: CondBankDefault); |
| 5669 | if (CondBank == AMDGPU::SGPRRegBankID) |
| 5670 | CondBank = SGPRSrcs ? AMDGPU::SGPRRegBankID : AMDGPU::VCCRegBankID; |
| 5671 | else if (CondBank == AMDGPU::VGPRRegBankID) |
| 5672 | CondBank = AMDGPU::VCCRegBankID; |
| 5673 | |
| 5674 | unsigned Bank = SGPRSrcs && CondBank == AMDGPU::SGPRRegBankID ? |
| 5675 | AMDGPU::SGPRRegBankID : AMDGPU::VGPRRegBankID; |
| 5676 | |
| 5677 | assert(CondBank == AMDGPU::VCCRegBankID || CondBank == AMDGPU::SGPRRegBankID); |
| 5678 | |
| 5679 | // TODO: Should report 32-bit for scalar condition type. |
| 5680 | if (Size == 64) { |
| 5681 | OpdsMapping[0] = AMDGPU::getValueMappingSGPR64Only(BankID: Bank, Size); |
| 5682 | OpdsMapping[1] = AMDGPU::getValueMapping(BankID: CondBank, Size: 1); |
| 5683 | OpdsMapping[2] = AMDGPU::getValueMappingSGPR64Only(BankID: Bank, Size); |
| 5684 | OpdsMapping[3] = AMDGPU::getValueMappingSGPR64Only(BankID: Bank, Size); |
| 5685 | } else { |
| 5686 | OpdsMapping[0] = AMDGPU::getValueMapping(BankID: Bank, Size); |
| 5687 | OpdsMapping[1] = AMDGPU::getValueMapping(BankID: CondBank, Size: 1); |
| 5688 | OpdsMapping[2] = AMDGPU::getValueMapping(BankID: Bank, Size); |
| 5689 | OpdsMapping[3] = AMDGPU::getValueMapping(BankID: Bank, Size); |
| 5690 | } |
| 5691 | |
| 5692 | break; |
| 5693 | } |
| 5694 | |
| 5695 | case AMDGPU::G_SI_CALL: { |
| 5696 | OpdsMapping[0] = AMDGPU::getValueMapping(BankID: AMDGPU::SGPRRegBankID, Size: 64); |
| 5697 | // Lie and claim everything is legal, even though some need to be |
| 5698 | // SGPRs. applyMapping will have to deal with it as a waterfall loop. |
| 5699 | OpdsMapping[1] = getSGPROpMapping(Reg: MI.getOperand(i: 1).getReg(), MRI, TRI: *TRI); |
| 5700 | |
| 5701 | // Allow anything for implicit arguments |
| 5702 | for (unsigned I = 4; I < MI.getNumOperands(); ++I) { |
| 5703 | if (MI.getOperand(i: I).isReg()) { |
| 5704 | Register Reg = MI.getOperand(i: I).getReg(); |
| 5705 | auto OpBank = getRegBankID(Reg, MRI); |
| 5706 | unsigned Size = getSizeInBits(Reg, MRI, TRI: *TRI); |
| 5707 | OpdsMapping[I] = AMDGPU::getValueMapping(BankID: OpBank, Size); |
| 5708 | } |
| 5709 | } |
| 5710 | break; |
| 5711 | } |
| 5712 | case AMDGPU::G_LOAD: |
| 5713 | case AMDGPU::G_ZEXTLOAD: |
| 5714 | case AMDGPU::G_SEXTLOAD: |
| 5715 | return getInstrMappingForLoad(MI); |
| 5716 | |
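|      | // Atomic data operands and results must be VGPRs; the pointer keeps whatever |
|      | // bank it already has. |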
| 5717 | case AMDGPU::G_ATOMICRMW_XCHG: |
| 5718 | case AMDGPU::G_ATOMICRMW_ADD: |
| 5719 | case AMDGPU::G_ATOMICRMW_SUB: |
| 5720 | case AMDGPU::G_ATOMICRMW_AND: |
| 5721 | case AMDGPU::G_ATOMICRMW_OR: |
| 5722 | case AMDGPU::G_ATOMICRMW_XOR: |
| 5723 | case AMDGPU::G_ATOMICRMW_MAX: |
| 5724 | case AMDGPU::G_ATOMICRMW_MIN: |
| 5725 | case AMDGPU::G_ATOMICRMW_UMAX: |
| 5726 | case AMDGPU::G_ATOMICRMW_UMIN: |
| 5727 | case AMDGPU::G_ATOMICRMW_FADD: |
| 5728 | case AMDGPU::G_ATOMICRMW_FMIN: |
| 5729 | case AMDGPU::G_ATOMICRMW_FMAX: |
| 5730 | case AMDGPU::G_ATOMICRMW_UINC_WRAP: |
| 5731 | case AMDGPU::G_ATOMICRMW_UDEC_WRAP: |
| 5732 | case AMDGPU::G_ATOMICRMW_USUB_COND: |
| 5733 | case AMDGPU::G_ATOMICRMW_USUB_SAT: |
| 5734 | case AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG: { |
| 5735 | OpdsMapping[0] = getVGPROpMapping(Reg: MI.getOperand(i: 0).getReg(), MRI, TRI: *TRI); |
| 5736 | OpdsMapping[1] = getValueMappingForPtr(MRI, PtrReg: MI.getOperand(i: 1).getReg()); |
| 5737 | OpdsMapping[2] = getVGPROpMapping(Reg: MI.getOperand(i: 2).getReg(), MRI, TRI: *TRI); |
| 5738 | break; |
| 5739 | } |
| 5740 | case AMDGPU::G_ATOMIC_CMPXCHG: { |
| 5741 | OpdsMapping[0] = getVGPROpMapping(Reg: MI.getOperand(i: 0).getReg(), MRI, TRI: *TRI); |
| 5742 | OpdsMapping[1] = getValueMappingForPtr(MRI, PtrReg: MI.getOperand(i: 1).getReg()); |
| 5743 | OpdsMapping[2] = getVGPROpMapping(Reg: MI.getOperand(i: 2).getReg(), MRI, TRI: *TRI); |
| 5744 | OpdsMapping[3] = getVGPROpMapping(Reg: MI.getOperand(i: 3).getReg(), MRI, TRI: *TRI); |
| 5745 | break; |
| 5746 | } |
| 5747 | case AMDGPU::G_BRCOND: { |
| 5748 | unsigned Bank = getRegBankID(Reg: MI.getOperand(i: 0).getReg(), MRI, |
| 5749 | Default: AMDGPU::SGPRRegBankID); |
| 5750 | assert(MRI.getType(MI.getOperand(0).getReg()).getSizeInBits() == 1); |
| 5751 | if (Bank != AMDGPU::SGPRRegBankID) |
| 5752 | Bank = AMDGPU::VCCRegBankID; |
| 5753 | |
| 5754 | OpdsMapping[0] = AMDGPU::getValueMapping(BankID: Bank, Size: 1); |
| 5755 | break; |
| 5756 | } |
| 5757 | case AMDGPU::G_INTRINSIC_FPTRUNC_ROUND: |
| 5758 | return getDefaultMappingVOP(MI); |
| 5759 | case AMDGPU::G_PREFETCH: |
| 5760 | OpdsMapping[0] = getSGPROpMapping(Reg: MI.getOperand(i: 0).getReg(), MRI, TRI: *TRI); |
| 5761 | break; |
| 5762 | case AMDGPU::G_AMDGPU_WHOLE_WAVE_FUNC_SETUP: |
| 5763 | case AMDGPU::G_AMDGPU_WHOLE_WAVE_FUNC_RETURN: |
| 5764 | OpdsMapping[0] = AMDGPU::getValueMapping(BankID: AMDGPU::VCCRegBankID, Size: 1); |
| 5765 | break; |
| 5766 | } |
| 5767 | |
| 5768 | return getInstructionMapping(/*ID*/1, /*Cost*/1, |
| 5769 | OperandsMapping: getOperandsMapping(OpdsMapping), |
| 5770 | NumOperands: MI.getNumOperands()); |
| 5771 | } |
| 5772 | |