1//===-- AMDGPURegBankLegalizeHelper.cpp -----------------------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9/// Implements actual lowering algorithms for each ID that can be used in
10/// Rule.OperandMapping. Similar to legalizer helper but with register banks.
11//
12//===----------------------------------------------------------------------===//
13
14#include "AMDGPURegBankLegalizeHelper.h"
15#include "AMDGPUGlobalISelUtils.h"
16#include "AMDGPUInstrInfo.h"
17#include "AMDGPULaneMaskUtils.h"
18#include "AMDGPURegBankLegalizeRules.h"
19#include "AMDGPURegisterBankInfo.h"
20#include "GCNSubtarget.h"
21#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
22#include "SIMachineFunctionInfo.h"
23#include "llvm/CodeGen/GlobalISel/GISelValueTracking.h"
24#include "llvm/CodeGen/GlobalISel/GenericMachineInstrs.h"
25#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
26#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
27#include "llvm/CodeGen/MachineInstr.h"
28#include "llvm/CodeGen/MachineUniformityAnalysis.h"
29#include "llvm/IR/IntrinsicsAMDGPU.h"
30
31#define DEBUG_TYPE "amdgpu-regbanklegalize"
32
33using namespace llvm;
34using namespace AMDGPU;
35
/// Builds a helper bound to the function that \p B is currently working on.
///
/// Caches the machine function, its SIMachineFunctionInfo, the GCN subtarget
/// and its instruction info, the builder and its MachineRegisterInfo, the
/// uniformity info, value tracking, the register bank info and the rule set.
/// Also records whether the subtarget is wave32 and looks up the SGPR, VGPR,
/// AGPR and VCC register banks from \p RBI once so later code can compare
/// bank pointers directly.
36RegBankLegalizeHelper::RegBankLegalizeHelper(
37 MachineIRBuilder &B, const MachineUniformityInfo &MUI,
38 GISelValueTracking *VT, const RegisterBankInfo &RBI,
39 const RegBankLegalizeRules &RBLRules)
40 : MF(B.getMF()), MFI(MF.getInfo<SIMachineFunctionInfo>()),
41 ST(MF.getSubtarget<GCNSubtarget>()), TII(*ST.getInstrInfo()), B(B),
42 MRI(*B.getMRI()), MUI(MUI), VT(VT), RBI(RBI), MORE(MF, nullptr),
43 RBLRules(RBLRules), IsWave32(ST.isWave32()),
44 SgprRB(&RBI.getRegBank(ID: AMDGPU::SGPRRegBankID)),
45 VgprRB(&RBI.getRegBank(ID: AMDGPU::VGPRRegBankID)),
46 AgprRB(&RBI.getRegBank(ID: AMDGPU::AGPRRegBankID)),
47 VccRB(&RBI.getRegBank(ID: AMDGPU::VCCRegBankID)) {}
48
49bool RegBankLegalizeHelper::findRuleAndApplyMapping(MachineInstr &MI) {
50 const SetOfRulesForOpcode *RuleSet = RBLRules.getRulesForOpc(MI);
51 if (!RuleSet) {
52 reportGISelFailure(MF, MORE, PassName: "amdgpu-regbanklegalize",
53 Msg: "No AMDGPU RegBankLegalize rules defined for opcode",
54 MI);
55 return false;
56 }
57
58 const RegBankLLTMapping *Mapping = RuleSet->findMappingForMI(MI, MRI, MUI);
59 if (!Mapping) {
60 reportGISelFailure(MF, MORE, PassName: "amdgpu-regbanklegalize",
61 Msg: "AMDGPU RegBankLegalize: none of the rules defined with "
62 "'Any' for MI's opcode matched MI",
63 MI);
64 return false;
65 }
66
67 WaterfallInfo WFI;
68 unsigned OpIdx = 0;
69 if (!Mapping->DstOpMapping.empty()) {
70 B.setInsertPt(MBB&: *MI.getParent(), II: std::next(x: MI.getIterator()));
71 if (!applyMappingDst(MI, OpIdx, MethodIDs: Mapping->DstOpMapping))
72 return false;
73 }
74 if (!Mapping->SrcOpMapping.empty()) {
75 B.setInstr(MI);
76 if (!applyMappingSrc(MI, OpIdx, MethodIDs: Mapping->SrcOpMapping, WFI))
77 return false;
78 }
79
80 if (!lower(MI, Mapping: *Mapping, WFI))
81 return false;
82
83 if (!WFI.SgprWaterfallOperandRegs.empty()) {
84 if (!executeInWaterfallLoop(B, WFI))
85 return false;
86 }
87
88 return true;
89}
90
/// Wraps the instruction range [WFI.Start, WFI.End) in a waterfall loop so
/// that the registers listed in WFI.SgprWaterfallOperandRegs, which live in
/// VGPRs, are supplied to those instructions as SGPR values.
///
/// The builder's current block is split into
///   MBB -> LoopBB -> BodyBB -> RestoreExecBB -> RemainderBB
/// with a back edge from BodyBB to LoopBB. LoopBB reads each waterfalled
/// register with a readfirstlane, compares the result against the VGPR value
/// and narrows exec to the matching lanes with the and-saveexec opcode.
/// BodyBB receives the moved instructions, rewritten to use the SGPR copies,
/// followed by the exec update and the SI_WATERFALL_LOOP branch.
///
/// On return the builder's insert point is at the beginning of RemainderBB.
/// \returns true; this function has no failure path.
91bool RegBankLegalizeHelper::executeInWaterfallLoop(MachineIRBuilder &B,
92 const WaterfallInfo &WFI) {
93 assert(WFI.Start.isValid() && WFI.End.isValid() &&
94 "Waterfall range not initialized");
95
96 // Track use registers which have already been expanded with a readfirstlane
97 // sequence. This may have multiple uses if moving a sequence.
98 DenseMap<Register, Register> WaterfalledRegMap;
99
100 MachineBasicBlock &MBB = B.getMBB();
101 MachineFunction &MF = B.getMF();
102
103 MachineBasicBlock::iterator BeginIt = WFI.Start;
104 MachineBasicBlock::iterator EndIt = WFI.End;
105
106 const SIRegisterInfo *TRI = ST.getRegisterInfo();
107 const TargetRegisterClass *WaveRC = TRI->getWaveMaskRegClass();
108 const AMDGPU::LaneMaskConstants &LMC = AMDGPU::LaneMaskConstants::get(ST);
109
 // The original range length is only needed for the debug-build sanity check
 // performed after the instructions have been spliced into BodyBB.
110#ifndef NDEBUG
111 const int OrigRangeSize = std::distance(BeginIt, EndIt);
112#endif
113
114 MachineRegisterInfo &MRI = *B.getMRI();
115 Register SaveExecReg = MRI.createVirtualRegister(RegClass: WaveRC);
116 Register InitSaveExecReg = MRI.createVirtualRegister(RegClass: WaveRC);
117
118 // Don't bother using generic instructions/registers for the exec mask.
 // NOTE(review): InitSaveExecReg is defined by this IMPLICIT_DEF but is not
 // referenced again anywhere in this function.
119 B.setInstr(*WFI.Start);
120 B.buildInstr(Opcode: TargetOpcode::IMPLICIT_DEF).addDef(RegNo: InitSaveExecReg);
121
122 Register SavedExec = MRI.createVirtualRegister(RegClass: WaveRC);
123
124 // To insert the loop we need to split the block. Move everything before
125 // this point to a new block, and insert a new empty block before this
126 // instruction.
127 MachineBasicBlock *LoopBB = MF.CreateMachineBasicBlock();
128 MachineBasicBlock *BodyBB = MF.CreateMachineBasicBlock();
129 MachineBasicBlock *RestoreExecBB = MF.CreateMachineBasicBlock();
130 MachineBasicBlock *RemainderBB = MF.CreateMachineBasicBlock();
131 MachineFunction::iterator MBBI(MBB);
132 ++MBBI;
133 MF.insert(MBBI, MBB: LoopBB);
134 MF.insert(MBBI, MBB: BodyBB);
135 MF.insert(MBBI, MBB: RestoreExecBB);
136 MF.insert(MBBI, MBB: RemainderBB);
137
138 LoopBB->addSuccessor(Succ: BodyBB);
139 BodyBB->addSuccessor(Succ: RestoreExecBB);
140 BodyBB->addSuccessor(Succ: LoopBB);
141
142 // Move the rest of the block into a new block.
143 RemainderBB->transferSuccessorsAndUpdatePHIs(FromMBB: &MBB);
144 RemainderBB->splice(Where: RemainderBB->begin(), Other: &MBB, From: EndIt, To: MBB.end());
145
146 MBB.addSuccessor(Succ: LoopBB);
147 RestoreExecBB->addSuccessor(Succ: RemainderBB);
148
149 B.setInsertPt(MBB&: *LoopBB, II: LoopBB->end());
150
151 // +-MBB:------------+
152 // | ... |
153 // | %0 = G_INST_1 |
154 // | %Dst = MI %Vgpr |
155 // | %1 = G_INST_2 |
156 // | ... |
157 // +-----------------+
158 // ->
159 // +-MBB-------------------------------+
160 // | ... |
161 // | %0 = G_INST_1 |
162 // | %SaveExecReg = S_MOV_B32 $exec_lo |
163 // +----------------|------------------+
164 // | /------------------------------|
165 // V V |
166 // +-LoopBB---------------------------------------------------------------+ |
167 // | %CurrentLaneReg:sgpr(s32) = READFIRSTLANE %Vgpr | |
168 // | instead of executing for each lane, see if other lanes had | |
169 // | same value for %Vgpr and execute for them also. | |
170 // | %CondReg:vcc(s1) = G_ICMP eq %CurrentLaneReg, %Vgpr | |
171 // | %CondRegLM:sreg_32 = ballot %CondReg // copy vcc to sreg32 lane mask | |
172 // | %SavedExec = S_AND_SAVEEXEC_B32 %CondRegLM | |
173 // | exec is active for lanes with the same "CurrentLane value" in Vgpr | |
174 // +----------------|-----------------------------------------------------+ |
175 // V |
176 // +-BodyBB------------------------------------------------------------+ |
177 // | %Dst = MI %CurrentLaneReg:sgpr(s32) | |
178 // | executed only for active lanes and written to Dst | |
179 // | $exec = S_XOR_B32 $exec, %SavedExec | |
180 // | set active lanes to 0 in SavedExec, lanes that did not write to | |
181 // | Dst yet, and set this as new exec (for READFIRSTLANE and ICMP) | |
182 // | SI_WATERFALL_LOOP LoopBB |-----|
183 // +----------------|--------------------------------------------------+
184 // V
185 // +-RestoreExecBB--------------------------+
186 // | $exec_lo = S_MOV_B32_term %SaveExecReg |
187 // +----------------|-----------------------+
188 // V
189 // +-RemainderBB:----------------------+
190 // | %1 = G_INST_2 |
191 // | ... |
192 // +---------------------------------- +
193
194 // Move the instruction into the loop body. Note we moved everything after
195 // Range.end() already into a new block, so Range.end() is no longer valid.
196 BodyBB->splice(Where: BodyBB->end(), Other: &MBB, From: BeginIt, To: MBB.end());
197
198 // Figure out the iterator range after splicing the instructions.
199 MachineBasicBlock::iterator NewBegin = BeginIt;
200 auto NewEnd = BodyBB->end();
201 assert(std::distance(NewBegin, NewEnd) == OrigRangeSize);
202
203 B.setMBB(*LoopBB);
 // Accumulates the AND of every per-part equality compare, across all
 // waterfalled operands of all instructions in the moved range.
204 Register CondReg;
205
206 for (MachineInstr &MI : make_range(x: NewBegin, y: NewEnd)) {
207 for (MachineOperand &Op : MI.all_uses()) {
208 Register OldReg = Op.getReg();
209 if (!WFI.SgprWaterfallOperandRegs.count(V: OldReg))
210 continue;
211
212 // See if we already processed this register in another instruction in
213 // the sequence.
214 auto OldVal = WaterfalledRegMap.find(Val: OldReg);
215 if (OldVal != WaterfalledRegMap.end()) {
216 Op.setReg(OldVal->second);
217 continue;
218 }
219
220 Register OpReg = Op.getReg();
221 LLT OpTy = MRI.getType(Reg: OpReg);
222
223 // TODO: support for agpr
224 assert(MRI.getRegBank(OpReg) == VgprRB);
225 Register CurrentLaneReg = MRI.createVirtualRegister(RegAttr: {.RCOrRB: SgprRB, .Ty: OpTy});
226 buildReadFirstLane(B, SgprDst: CurrentLaneReg, VgprSrc: OpReg, RBI);
227
228 // Build the comparison(s), CurrentLaneReg == OpReg.
 // Compare in 64-bit parts when the size is a multiple of 64 bits and in
 // 32-bit parts otherwise. NOTE(review): assumes the size is a multiple of
 // 32 bits; NumParts is computed with a truncating division.
229 unsigned OpSize = OpTy.getSizeInBits();
230 unsigned PartSize = (OpSize % 64 == 0) ? 64 : 32;
231 LLT PartTy = LLT::scalar(SizeInBits: PartSize);
232 unsigned NumParts = OpSize / PartSize;
233 SmallVector<Register, 8> OpParts;
234 SmallVector<Register, 8> CurrentLaneParts;
235
236 if (NumParts == 1) {
237 OpParts.push_back(Elt: OpReg);
238 CurrentLaneParts.push_back(Elt: CurrentLaneReg);
239 } else {
240 auto UnmergeOp = B.buildUnmerge(Attrs: {.RCOrRB: VgprRB, .Ty: PartTy}, Op: OpReg);
241 auto UnmergeCurrLane = B.buildUnmerge(Attrs: {.RCOrRB: SgprRB, .Ty: PartTy}, Op: CurrentLaneReg);
242 for (unsigned i = 0; i < NumParts; ++i) {
243 OpParts.push_back(Elt: UnmergeOp.getReg(Idx: i));
244 CurrentLaneParts.push_back(Elt: UnmergeCurrLane.getReg(Idx: i));
245 }
246 }
247
248 for (unsigned i = 0; i < NumParts; ++i) {
249 Register CmpReg = MRI.createVirtualRegister(RegAttr: VccRB_S1);
250 B.buildICmp(Pred: CmpInst::ICMP_EQ, Res: CmpReg, Op0: CurrentLaneParts[i], Op1: OpParts[i]);
251
252 if (!CondReg)
253 CondReg = CmpReg;
254 else
255 CondReg = B.buildAnd(Dst: VccRB_S1, Src0: CondReg, Src1: CmpReg).getReg(Idx: 0);
256 }
257
258 Op.setReg(CurrentLaneReg);
259
260 // Make sure we don't re-process this register again.
261 WaterfalledRegMap.insert(KV: std::pair(OldReg, Op.getReg()));
262 }
263 }
264
265 // Copy vcc to sgpr32/64, ballot becomes a no-op during instruction selection.
 // NOTE(review): CondReg is still invalid here if no use operand in the range
 // matched WFI.SgprWaterfallOperandRegs; presumably callers guarantee at
 // least one match - confirm.
266 Register CondRegLM =
267 MRI.createVirtualRegister(RegAttr: {.RCOrRB: WaveRC, .Ty: LLT::scalar(SizeInBits: IsWave32 ? 32 : 64)});
268 B.buildIntrinsic(ID: Intrinsic::amdgcn_ballot, Res: CondRegLM).addReg(RegNo: CondReg);
269
270 // Update EXEC, save the original EXEC value to SavedExec.
271 B.buildInstr(Opcode: LMC.AndSaveExecOpc)
272 .addDef(RegNo: SavedExec)
273 .addReg(RegNo: CondRegLM, Flags: RegState::Kill);
274 MRI.setSimpleHint(VReg: SavedExec, PrefReg: CondRegLM);
275
276 B.setInsertPt(MBB&: *BodyBB, II: BodyBB->end());
277
278 // Update EXEC, switch all done bits to 0 and all todo bits to 1.
279 B.buildInstr(Opcode: LMC.XorTermOpc)
280 .addDef(RegNo: LMC.ExecReg)
281 .addReg(RegNo: LMC.ExecReg)
282 .addReg(RegNo: SavedExec);
283
284 // XXX - s_xor_b64 sets scc to 1 if the result is nonzero, so can we use
285 // s_cbranch_scc0?
286
287 // Loop back to V_READFIRSTLANE_B32 if there are still variants to cover.
288 B.buildInstr(Opcode: AMDGPU::SI_WATERFALL_LOOP).addMBB(MBB: LoopBB);
289
290 // Save the EXEC mask before the loop.
291 B.setInsertPt(MBB, II: MBB.end());
292 B.buildInstr(Opcode: LMC.MovOpc).addDef(RegNo: SaveExecReg).addReg(RegNo: LMC.ExecReg);
293
294 // Restore the EXEC mask after the loop.
295 B.setInsertPt(MBB&: *RestoreExecBB, II: RestoreExecBB->begin());
296 B.buildInstr(Opcode: LMC.MovTermOpc).addDef(RegNo: LMC.ExecReg).addReg(RegNo: SaveExecReg);
297
298 // Set the insert point after the original instruction, so any new
299 // instructions will be in the remainder.
300 B.setInsertPt(MBB&: *RemainderBB, II: RemainderBB->begin());
301
302 return true;
303}
304
305// Analyze a combined offset from an llvm.amdgcn.s.buffer intrinsic and store
306// the three offsets (voffset, soffset and instoffset)
307unsigned RegBankLegalizeHelper::setBufferOffsets(
308 MachineIRBuilder &B, Register CombinedOffset, Register &VOffsetReg,
309 Register &SOffsetReg, int64_t &InstOffsetVal, Align Alignment) {
310 if (std::optional<int64_t> Imm =
311 getIConstantVRegSExtVal(VReg: CombinedOffset, MRI)) {
312 uint32_t SOffset, ImmOffset;
313 if (TII.splitMUBUFOffset(Imm: *Imm, SOffset, ImmOffset, Alignment)) {
314 VOffsetReg = B.buildConstant(Res: {VgprRB, S32}, Val: 0).getReg(Idx: 0);
315 SOffsetReg = B.buildConstant(Res: {SgprRB, S32}, Val: SOffset).getReg(Idx: 0);
316 InstOffsetVal = ImmOffset;
317 return SOffset + ImmOffset;
318 }
319 }
320 const bool CheckNUW = ST.hasGFX1250Insts();
321 auto [Base, Offset] = AMDGPU::getBaseWithConstantOffset(
322 MRI, Reg: CombinedOffset, /*KnownBits=*/ValueTracking: nullptr,
323 /*CheckNUW=*/CheckNUW);
324 uint32_t SOffset, ImmOffset;
325 if (static_cast<int32_t>(Offset) > 0 &&
326 TII.splitMUBUFOffset(Imm: Offset, SOffset, ImmOffset, Alignment)) {
327 if (Base.isValid() && MRI.getRegBank(Reg: Base) == VgprRB) {
328 VOffsetReg = Base;
329 SOffsetReg = B.buildConstant(Res: {SgprRB, S32}, Val: SOffset).getReg(Idx: 0);
330 InstOffsetVal = ImmOffset;
331 return 0;
332 }
333 // If we have SGPR base, we can use it for soffset.
334 if (SOffset == 0) {
335 VOffsetReg = B.buildConstant(Res: {VgprRB, S32}, Val: 0).getReg(Idx: 0);
336 SOffsetReg = Base;
337 InstOffsetVal = ImmOffset;
338 return 0;
339 }
340 }
341 // Handle the variable sgpr + vgpr case.
342 MachineInstr *Add = getOpcodeDef(Opcode: AMDGPU::G_ADD, Reg: CombinedOffset, MRI);
343 if (Add && static_cast<int32_t>(Offset) >= 0 &&
344 (!CheckNUW || Add->getFlag(Flag: MachineInstr::NoUWrap))) {
345 Register Src0 = getSrcRegIgnoringCopies(Reg: Add->getOperand(i: 1).getReg(), MRI);
346 Register Src1 = getSrcRegIgnoringCopies(Reg: Add->getOperand(i: 2).getReg(), MRI);
347 const RegisterBank *Src0Bank = MRI.getRegBank(Reg: Src0);
348 const RegisterBank *Src1Bank = MRI.getRegBank(Reg: Src1);
349 if (Src0Bank == VgprRB && Src1Bank == SgprRB) {
350 VOffsetReg = Src0;
351 SOffsetReg = Src1;
352 return 0;
353 }
354 if (Src0Bank == SgprRB && Src1Bank == VgprRB) {
355 VOffsetReg = Src1;
356 SOffsetReg = Src0;
357 return 0;
358 }
359 }
360 // Ensure we have a VGPR for the combined offset. This could be an issue if we
361 // have an SGPR offset and a VGPR resource.
362 if (MRI.getRegBank(Reg: CombinedOffset) == VgprRB) {
363 VOffsetReg = CombinedOffset;
364 } else {
365 VOffsetReg = B.buildCopy(Res: {VgprRB, S32}, Op: CombinedOffset).getReg(Idx: 0);
366 }
367 SOffsetReg = B.buildConstant(Res: {SgprRB, S32}, Val: 0).getReg(Idx: 0);
368 return 0;
369}
370
371bool RegBankLegalizeHelper::splitLoad(MachineInstr &MI,
372 ArrayRef<LLT> LLTBreakdown, LLT MergeTy) {
373 MachineFunction &MF = B.getMF();
374 assert(MI.getNumMemOperands() == 1);
375 MachineMemOperand &BaseMMO = **MI.memoperands_begin();
376 Register Dst = MI.getOperand(i: 0).getReg();
377 const RegisterBank *DstRB = MRI.getRegBankOrNull(Reg: Dst);
378 Register Base = MI.getOperand(i: 1).getReg();
379 LLT PtrTy = MRI.getType(Reg: Base);
380 const RegisterBank *PtrRB = MRI.getRegBankOrNull(Reg: Base);
381 LLT OffsetTy = LLT::scalar(SizeInBits: PtrTy.getSizeInBits());
382 SmallVector<Register, 4> LoadPartRegs;
383
384 unsigned ByteOffset = 0;
385 for (LLT PartTy : LLTBreakdown) {
386 Register BasePlusOffset;
387 if (ByteOffset == 0) {
388 BasePlusOffset = Base;
389 } else {
390 auto Offset = B.buildConstant(Res: {PtrRB, OffsetTy}, Val: ByteOffset);
391 BasePlusOffset =
392 B.buildObjectPtrOffset(Res: {PtrRB, PtrTy}, Op0: Base, Op1: Offset).getReg(Idx: 0);
393 }
394 auto *OffsetMMO = MF.getMachineMemOperand(MMO: &BaseMMO, Offset: ByteOffset, Ty: PartTy);
395 auto LoadPart = B.buildLoad(Res: {DstRB, PartTy}, Addr: BasePlusOffset, MMO&: *OffsetMMO);
396 LoadPartRegs.push_back(Elt: LoadPart.getReg(Idx: 0));
397 ByteOffset += PartTy.getSizeInBytes();
398 }
399
400 if (!MergeTy.isValid()) {
401 // Loads are of same size, concat or merge them together.
402 B.buildMergeLikeInstr(Res: Dst, Ops: LoadPartRegs);
403 } else {
404 // Loads are not all of same size, need to unmerge them to smaller pieces
405 // of MergeTy type, then merge pieces to Dst.
406 SmallVector<Register, 4> MergeTyParts;
407 for (Register Reg : LoadPartRegs) {
408 if (MRI.getType(Reg) == MergeTy) {
409 MergeTyParts.push_back(Elt: Reg);
410 } else {
411 auto Unmerge = B.buildUnmerge(Attrs: {.RCOrRB: DstRB, .Ty: MergeTy}, Op: Reg);
412 for (unsigned i = 0; i < Unmerge->getNumOperands() - 1; ++i)
413 MergeTyParts.push_back(Elt: Unmerge.getReg(Idx: i));
414 }
415 }
416 B.buildMergeLikeInstr(Res: Dst, Ops: MergeTyParts);
417 }
418 MI.eraseFromParent();
419 return true;
420}
421
422bool RegBankLegalizeHelper::widenLoad(MachineInstr &MI, LLT WideTy,
423 LLT MergeTy) {
424 MachineFunction &MF = B.getMF();
425 assert(MI.getNumMemOperands() == 1);
426 MachineMemOperand &BaseMMO = **MI.memoperands_begin();
427 Register Dst = MI.getOperand(i: 0).getReg();
428 const RegisterBank *DstRB = MRI.getRegBankOrNull(Reg: Dst);
429 Register Base = MI.getOperand(i: 1).getReg();
430
431 MachineMemOperand *WideMMO = MF.getMachineMemOperand(MMO: &BaseMMO, Offset: 0, Ty: WideTy);
432 auto WideLoad = B.buildLoad(Res: {DstRB, WideTy}, Addr: Base, MMO&: *WideMMO);
433
434 if (WideTy.isScalar()) {
435 B.buildTrunc(Res: Dst, Op: WideLoad);
436 } else {
437 SmallVector<Register, 4> MergeTyParts;
438 auto Unmerge = B.buildUnmerge(Attrs: {.RCOrRB: DstRB, .Ty: MergeTy}, Op: WideLoad);
439
440 LLT DstTy = MRI.getType(Reg: Dst);
441 unsigned NumElts = DstTy.getSizeInBits() / MergeTy.getSizeInBits();
442 for (unsigned i = 0; i < NumElts; ++i) {
443 MergeTyParts.push_back(Elt: Unmerge.getReg(Idx: i));
444 }
445 B.buildMergeLikeInstr(Res: Dst, Ops: MergeTyParts);
446 }
447 MI.eraseFromParent();
448 return true;
449}
450
451bool RegBankLegalizeHelper::widenMMOToS32(GAnyLoad &MI) const {
452 Register Dst = MI.getDstReg();
453 Register Ptr = MI.getPointerReg();
454 MachineMemOperand &MMO = MI.getMMO();
455 unsigned MemSize = 8 * MMO.getSize().getValue();
456
457 MachineMemOperand *WideMMO = B.getMF().getMachineMemOperand(MMO: &MMO, Offset: 0, Ty: S32);
458
459 if (MI.getOpcode() == G_LOAD) {
460 B.buildLoad(Res: Dst, Addr: Ptr, MMO&: *WideMMO);
461 } else {
462 auto Load = B.buildLoad(Res: SgprRB_S32, Addr: Ptr, MMO&: *WideMMO);
463
464 if (MI.getOpcode() == G_ZEXTLOAD) {
465 APInt Mask = APInt::getLowBitsSet(numBits: S32.getSizeInBits(), loBitsSet: MemSize);
466 auto MaskCst = B.buildConstant(Res: SgprRB_S32, Val: Mask);
467 B.buildAnd(Dst, Src0: Load, Src1: MaskCst);
468 } else {
469 assert(MI.getOpcode() == G_SEXTLOAD);
470 B.buildSExtInReg(Res: Dst, Op: Load, ImmOp: MemSize);
471 }
472 }
473
474 MI.eraseFromParent();
475 return true;
476}
477
478bool RegBankLegalizeHelper::lowerVccExtToSel(MachineInstr &MI) {
479 Register Dst = MI.getOperand(i: 0).getReg();
480 LLT Ty = MRI.getType(Reg: Dst);
481 Register Src = MI.getOperand(i: 1).getReg();
482 unsigned Opc = MI.getOpcode();
483 int TrueExtCst = Opc == G_SEXT ? -1 : 1;
484 if (Ty == S32 || Ty == S16) {
485 auto True = B.buildConstant(Res: {VgprRB, Ty}, Val: TrueExtCst);
486 auto False = B.buildConstant(Res: {VgprRB, Ty}, Val: 0);
487 B.buildSelect(Res: Dst, Tst: Src, Op0: True, Op1: False);
488 } else if (Ty == S64) {
489 auto True = B.buildConstant(Res: {VgprRB_S32}, Val: TrueExtCst);
490 auto False = B.buildConstant(Res: {VgprRB_S32}, Val: 0);
491 auto Lo = B.buildSelect(Res: {VgprRB_S32}, Tst: Src, Op0: True, Op1: False);
492 MachineInstrBuilder Hi;
493 switch (Opc) {
494 case G_SEXT:
495 Hi = Lo;
496 break;
497 case G_ZEXT:
498 Hi = False;
499 break;
500 case G_ANYEXT:
501 Hi = B.buildUndef(Res: {VgprRB_S32});
502 break;
503 default:
504 reportGISelFailure(
505 MF, MORE, PassName: "amdgpu-regbanklegalize",
506 Msg: "AMDGPU RegBankLegalize: lowerVccExtToSel, Opcode not supported", MI);
507 return false;
508 }
509
510 B.buildMergeValues(Res: Dst, Ops: {Lo.getReg(Idx: 0), Hi.getReg(Idx: 0)});
511 } else {
512 reportGISelFailure(
513 MF, MORE, PassName: "amdgpu-regbanklegalize",
514 Msg: "AMDGPU RegBankLegalize: lowerVccExtToSel, Type not supported", MI);
515 return false;
516 }
517
518 MI.eraseFromParent();
519 return true;
520}
521
522std::pair<Register, Register> RegBankLegalizeHelper::unpackZExt(Register Reg) {
523 auto PackedS32 = B.buildBitcast(Dst: SgprRB_S32, Src: Reg);
524 auto Mask = B.buildConstant(Res: SgprRB_S32, Val: 0x0000ffff);
525 auto Lo = B.buildAnd(Dst: SgprRB_S32, Src0: PackedS32, Src1: Mask);
526 auto Hi = B.buildLShr(Dst: SgprRB_S32, Src0: PackedS32, Src1: B.buildConstant(Res: SgprRB_S32, Val: 16));
527 return {Lo.getReg(Idx: 0), Hi.getReg(Idx: 0)};
528}
529
530std::pair<Register, Register> RegBankLegalizeHelper::unpackSExt(Register Reg) {
531 auto PackedS32 = B.buildBitcast(Dst: SgprRB_S32, Src: Reg);
532 auto Lo = B.buildSExtInReg(Res: SgprRB_S32, Op: PackedS32, ImmOp: 16);
533 auto Hi = B.buildAShr(Dst: SgprRB_S32, Src0: PackedS32, Src1: B.buildConstant(Res: SgprRB_S32, Val: 16));
534 return {Lo.getReg(Idx: 0), Hi.getReg(Idx: 0)};
535}
536
537std::pair<Register, Register> RegBankLegalizeHelper::unpackAExt(Register Reg) {
538 auto PackedS32 = B.buildBitcast(Dst: SgprRB_S32, Src: Reg);
539 auto Lo = PackedS32;
540 auto Hi = B.buildLShr(Dst: SgprRB_S32, Src0: PackedS32, Src1: B.buildConstant(Res: SgprRB_S32, Val: 16));
541 return {Lo.getReg(Idx: 0), Hi.getReg(Idx: 0)};
542}
543
544std::pair<Register, Register>
545RegBankLegalizeHelper::unpackAExtTruncS16(Register Reg) {
546 auto [Lo32, Hi32] = unpackAExt(Reg);
547 return {B.buildTrunc(Res: SgprRB_S16, Op: Lo32).getReg(Idx: 0),
548 B.buildTrunc(Res: SgprRB_S16, Op: Hi32).getReg(Idx: 0)};
549}
550
551bool RegBankLegalizeHelper::lowerUnpackBitShift(MachineInstr &MI) {
552 Register Lo, Hi;
553 switch (MI.getOpcode()) {
554 case AMDGPU::G_SHL: {
555 auto [Val0, Val1] = unpackAExt(Reg: MI.getOperand(i: 1).getReg());
556 auto [Amt0, Amt1] = unpackAExt(Reg: MI.getOperand(i: 2).getReg());
557 Lo = B.buildInstr(Opc: MI.getOpcode(), DstOps: {SgprRB_S32}, SrcOps: {Val0, Amt0}).getReg(Idx: 0);
558 Hi = B.buildInstr(Opc: MI.getOpcode(), DstOps: {SgprRB_S32}, SrcOps: {Val1, Amt1}).getReg(Idx: 0);
559 break;
560 }
561 case AMDGPU::G_LSHR: {
562 auto [Val0, Val1] = unpackZExt(Reg: MI.getOperand(i: 1).getReg());
563 auto [Amt0, Amt1] = unpackZExt(Reg: MI.getOperand(i: 2).getReg());
564 Lo = B.buildInstr(Opc: MI.getOpcode(), DstOps: {SgprRB_S32}, SrcOps: {Val0, Amt0}).getReg(Idx: 0);
565 Hi = B.buildInstr(Opc: MI.getOpcode(), DstOps: {SgprRB_S32}, SrcOps: {Val1, Amt1}).getReg(Idx: 0);
566 break;
567 }
568 case AMDGPU::G_ASHR: {
569 auto [Val0, Val1] = unpackSExt(Reg: MI.getOperand(i: 1).getReg());
570 auto [Amt0, Amt1] = unpackSExt(Reg: MI.getOperand(i: 2).getReg());
571 Lo = B.buildAShr(Dst: SgprRB_S32, Src0: Val0, Src1: Amt0).getReg(Idx: 0);
572 Hi = B.buildAShr(Dst: SgprRB_S32, Src0: Val1, Src1: Amt1).getReg(Idx: 0);
573 break;
574 }
575 default:
576 reportGISelFailure(
577 MF, MORE, PassName: "amdgpu-regbanklegalize",
578 Msg: "AMDGPU RegBankLegalize: lowerUnpackBitShift, case not implemented",
579 MI);
580 return false;
581 }
582 B.buildBuildVectorTrunc(Res: MI.getOperand(i: 0).getReg(), Ops: {Lo, Hi});
583 MI.eraseFromParent();
584 return true;
585}
586
587bool RegBankLegalizeHelper::lowerUnpackMinMax(MachineInstr &MI) {
588 Register Lo, Hi;
589 switch (MI.getOpcode()) {
590 case AMDGPU::G_SMIN:
591 case AMDGPU::G_SMAX: {
592 // For signed operations, use sign extension
593 auto [Val0_Lo, Val0_Hi] = unpackSExt(Reg: MI.getOperand(i: 1).getReg());
594 auto [Val1_Lo, Val1_Hi] = unpackSExt(Reg: MI.getOperand(i: 2).getReg());
595 Lo = B.buildInstr(Opc: MI.getOpcode(), DstOps: {SgprRB_S32}, SrcOps: {Val0_Lo, Val1_Lo})
596 .getReg(Idx: 0);
597 Hi = B.buildInstr(Opc: MI.getOpcode(), DstOps: {SgprRB_S32}, SrcOps: {Val0_Hi, Val1_Hi})
598 .getReg(Idx: 0);
599 break;
600 }
601 case AMDGPU::G_UMIN:
602 case AMDGPU::G_UMAX: {
603 // For unsigned operations, use zero extension
604 auto [Val0_Lo, Val0_Hi] = unpackZExt(Reg: MI.getOperand(i: 1).getReg());
605 auto [Val1_Lo, Val1_Hi] = unpackZExt(Reg: MI.getOperand(i: 2).getReg());
606 Lo = B.buildInstr(Opc: MI.getOpcode(), DstOps: {SgprRB_S32}, SrcOps: {Val0_Lo, Val1_Lo})
607 .getReg(Idx: 0);
608 Hi = B.buildInstr(Opc: MI.getOpcode(), DstOps: {SgprRB_S32}, SrcOps: {Val0_Hi, Val1_Hi})
609 .getReg(Idx: 0);
610 break;
611 }
612 default:
613 reportGISelFailure(
614 MF, MORE, PassName: "amdgpu-regbanklegalize",
615 Msg: "AMDGPU RegBankLegalize: lowerUnpackMinMax, case not implemented", MI);
616 return false;
617 }
618 B.buildBuildVectorTrunc(Res: MI.getOperand(i: 0).getReg(), Ops: {Lo, Hi});
619 MI.eraseFromParent();
620 return true;
621}
622
623bool RegBankLegalizeHelper::lowerUnpackAExt(MachineInstr &MI) {
624 auto [Op1Lo, Op1Hi] = unpackAExt(Reg: MI.getOperand(i: 1).getReg());
625 auto [Op2Lo, Op2Hi] = unpackAExt(Reg: MI.getOperand(i: 2).getReg());
626 auto ResLo = B.buildInstr(Opc: MI.getOpcode(), DstOps: {SgprRB_S32}, SrcOps: {Op1Lo, Op2Lo});
627 auto ResHi = B.buildInstr(Opc: MI.getOpcode(), DstOps: {SgprRB_S32}, SrcOps: {Op1Hi, Op2Hi});
628 B.buildBuildVectorTrunc(Res: MI.getOperand(i: 0).getReg(),
629 Ops: {ResLo.getReg(Idx: 0), ResHi.getReg(Idx: 0)});
630 MI.eraseFromParent();
631 return true;
632}
633
/// Lowers an S_BUFFER_LOAD-family instruction \p MI to the matching
/// G_AMDGPU_BUFFER_LOAD* instruction(s) with VGPR results.
///
/// 256- and 512-bit results are loaded as 128-bit pieces at 16-byte steps and
/// merged into the destination; every other size uses one load whose result
/// is copied into the destination. The combined offset (operand 2) is
/// decomposed by setBufferOffsets into voffset, soffset and an immediate.
///
/// When the resource (operand 1) is not in the SGPR bank, it is added to
/// \p WFI.SgprWaterfallOperandRegs and \p WFI.Start / \p WFI.End are set to
/// cover the emitted loads, so they can be waterfalled later. \p MI is erased.
/// \returns true always.
634bool RegBankLegalizeHelper::lowerSBufToBuf(MachineInstr &MI,
635 WaterfallInfo &WFI) {
636 Register Dst = MI.getOperand(i: 0).getReg();
637 LLT Ty = MRI.getType(Reg: Dst);
638 const RegisterBank *RSrcBank = MRI.getRegBank(Reg: MI.getOperand(i: 1).getReg());
639 unsigned LoadSize = Ty.getSizeInBits();
640 int NumLoads = 1;
641 SmallVector<Register, 4> LoadParts;
 // Wide results are split into 128-bit loads; Ty becomes the per-load type.
642 if (LoadSize == 256 || LoadSize == 512) {
643 NumLoads = LoadSize / 128;
644 Ty = Ty.divide(Factor: NumLoads);
645 }
646 for (int i = 0; i < NumLoads; ++i)
647 LoadParts.emplace_back(Args: MRI.createVirtualRegister(RegAttr: {.RCOrRB: VgprRB, .Ty: Ty}));
 // NOTE(review): assumes MI carries at least one memory operand; there is no
 // check before the dereference below.
648 MachineMemOperand *OrigMMO = *MI.memoperands_begin();
649 const Align Alignment = OrigMMO->getAlign();
650 MachineFunction &MF = B.getMF();
651 Register SOffset;
652 Register VOffset;
 // ImmOffset starts at 0 because setBufferOffsets does not write it on every
 // path.
653 int64_t ImmOffset = 0;
654 unsigned MMOOffset = setBufferOffsets(B, CombinedOffset: MI.getOperand(i: 2).getReg(), VOffsetReg&: VOffset,
655 SOffsetReg&: SOffset, InstOffsetVal&: ImmOffset, Alignment);
656 // Use the MMO size from the original instruction rather than the (possibly
657 // widened) register type. E.g. 96-bit loads are widened to 128-bit during
658 // legalization but the MMO still reflects the original 96-bit access size.
659 const unsigned MemSize = divideCeil(Numerator: OrigMMO->getSize().getValue(), Denominator: NumLoads);
660 MachineMemOperand *BaseMMO = MF.getMachineMemOperand(MMO: OrigMMO, Offset: 0, Size: MemSize);
661 if (MMOOffset != 0)
662 BaseMMO = MF.getMachineMemOperand(MMO: BaseMMO, Offset: MMOOffset, Size: MemSize);
663 // If only the offset is divergent, emit a MUBUF buffer load
664 // instead. We can assume that the buffer is unswizzled.
665 Register RSrc = MI.getOperand(i: 1).getReg();
666 Register VIndex = B.buildConstant(Res: VgprRB_S32, Val: 0).getReg(Idx: 0);
 // Sub-dword scalar loads map to the same-named vector buffer loads; every
 // other opcode uses the plain buffer load.
667 unsigned Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD;
668 switch (MI.getOpcode()) {
669 case AMDGPU::G_AMDGPU_S_BUFFER_LOAD_SBYTE:
670 Opc = G_AMDGPU_BUFFER_LOAD_SBYTE;
671 break;
672 case AMDGPU::G_AMDGPU_S_BUFFER_LOAD_UBYTE:
673 Opc = G_AMDGPU_BUFFER_LOAD_UBYTE;
674 break;
675 case AMDGPU::G_AMDGPU_S_BUFFER_LOAD_SSHORT:
676 Opc = G_AMDGPU_BUFFER_LOAD_SSHORT;
677 break;
678 case AMDGPU::G_AMDGPU_S_BUFFER_LOAD_USHORT:
679 Opc = G_AMDGPU_BUFFER_LOAD_USHORT;
680 break;
681 default:
682 break;
683 }
 // Each load advances both the immediate offset and the memory operand
 // offset by 16 bytes.
684 for (int i = 0; i < NumLoads; ++i) {
685 B.buildInstr(Opcode: Opc)
686 .addDef(RegNo: LoadParts[i]) // vdata
687 .addUse(RegNo: RSrc) // rsrc
688 .addUse(RegNo: VIndex) // vindex
689 .addUse(RegNo: VOffset) // voffset
690 .addUse(RegNo: SOffset) // soffset
691 .addImm(Val: ImmOffset + 16 * i) // offset(imm)
692 .addImm(Val: 0) // cachepolicy, swizzled buffer(imm)
693 .addImm(Val: 0) // idxen(imm)
694 .addMemOperand(MMO: MF.getMachineMemOperand(MMO: BaseMMO, Offset: 16 * i, Size: MemSize));
695 }
696 if (NumLoads == 1)
697 B.buildCopy(Res: Dst, Op: LoadParts[0]);
698 else
699 B.buildMergeLikeInstr(Res: Dst, Ops: LoadParts);
 // Leave the builder positioned at the first emitted load.
700 B.setInstr(*MRI.getVRegDef(Reg: LoadParts[0]));
 // A resource outside the SGPR bank is recorded for waterfalling; the range
 // runs from the first load up to, but not including, the instruction after
 // the last load.
701 if (RSrcBank != SgprRB) {
702 WFI.SgprWaterfallOperandRegs.insert(V: RSrc);
703 WFI.Start = MRI.getVRegDef(Reg: LoadParts.front());
704 WFI.End = std::next(x: MRI.getVRegDef(Reg: LoadParts.back())->getIterator());
705 }
706 MI.eraseFromParent();
707 return true;
708}
709
710static bool isSignedBFE(MachineInstr &MI) {
711 if (GIntrinsic *GI = dyn_cast<GIntrinsic>(Val: &MI))
712 return (GI->is(ID: Intrinsic::amdgcn_sbfe));
713
714 return MI.getOpcode() == AMDGPU::G_SBFX;
715}
716
717bool RegBankLegalizeHelper::lowerV_BFE(MachineInstr &MI) {
718 Register Dst = MI.getOperand(i: 0).getReg();
719 assert(MRI.getType(Dst) == LLT::scalar(64));
720 bool Signed = isSignedBFE(MI);
721 unsigned FirstOpnd = isa<GIntrinsic>(Val: MI) ? 2 : 1;
722 // Extract bitfield from Src, LSBit is the least-significant bit for the
723 // extraction (field offset) and Width is size of bitfield.
724 Register Src = MI.getOperand(i: FirstOpnd).getReg();
725 Register LSBit = MI.getOperand(i: FirstOpnd + 1).getReg();
726 Register Width = MI.getOperand(i: FirstOpnd + 2).getReg();
727 // Comments are for signed bitfield extract, similar for unsigned. x is sign
728 // bit. s is sign, l is LSB and y are remaining bits of bitfield to extract.
729
730 // Src >> LSBit Hi|Lo: x?????syyyyyyl??? -> xxxx?????syyyyyyl
731 unsigned SHROpc = Signed ? AMDGPU::G_ASHR : AMDGPU::G_LSHR;
732 auto SHRSrc = B.buildInstr(Opc: SHROpc, DstOps: {{VgprRB, S64}}, SrcOps: {Src, LSBit});
733
734 auto ConstWidth = getIConstantVRegValWithLookThrough(VReg: Width, MRI);
735
736 // Expand to Src >> LSBit << (64 - Width) >> (64 - Width)
737 // << (64 - Width): Hi|Lo: xxxx?????syyyyyyl -> syyyyyyl000000000
738 // >> (64 - Width): Hi|Lo: syyyyyyl000000000 -> ssssssssssyyyyyyl
739 if (!ConstWidth) {
740 auto Amt = B.buildSub(Dst: VgprRB_S32, Src0: B.buildConstant(Res: SgprRB_S32, Val: 64), Src1: Width);
741 auto SignBit = B.buildShl(Dst: {VgprRB, S64}, Src0: SHRSrc, Src1: Amt);
742 B.buildInstr(Opc: SHROpc, DstOps: {Dst}, SrcOps: {SignBit, Amt});
743 MI.eraseFromParent();
744 return true;
745 }
746
747 uint64_t WidthImm = ConstWidth->Value.getZExtValue();
748 auto UnmergeSHRSrc = B.buildUnmerge(Attrs: VgprRB_S32, Op: SHRSrc);
749 Register SHRSrcLo = UnmergeSHRSrc.getReg(Idx: 0);
750 Register SHRSrcHi = UnmergeSHRSrc.getReg(Idx: 1);
751 auto Zero = B.buildConstant(Res: {VgprRB, S32}, Val: 0);
752 unsigned BFXOpc = Signed ? AMDGPU::G_SBFX : AMDGPU::G_UBFX;
753
754 if (WidthImm <= 32) {
755 // SHRSrc Hi|Lo: ????????|???syyyl -> ????????|ssssyyyl
756 auto Lo = B.buildInstr(Opc: BFXOpc, DstOps: {VgprRB_S32}, SrcOps: {SHRSrcLo, Zero, Width});
757 MachineInstrBuilder Hi;
758 if (Signed) {
759 // SHRSrc Hi|Lo: ????????|ssssyyyl -> ssssssss|ssssyyyl
760 Hi = B.buildAShr(Dst: VgprRB_S32, Src0: Lo, Src1: B.buildConstant(Res: VgprRB_S32, Val: 31));
761 } else {
762 // SHRSrc Hi|Lo: ????????|000syyyl -> 00000000|000syyyl
763 Hi = Zero;
764 }
765 B.buildMergeLikeInstr(Res: Dst, Ops: {Lo, Hi});
766 } else {
767 auto Amt = B.buildConstant(Res: VgprRB_S32, Val: WidthImm - 32);
768 // SHRSrc Hi|Lo: ??????sy|yyyyyyyl -> sssssssy|yyyyyyyl
769 auto Hi = B.buildInstr(Opc: BFXOpc, DstOps: {VgprRB_S32}, SrcOps: {SHRSrcHi, Zero, Amt});
770 B.buildMergeLikeInstr(Res: Dst, Ops: {SHRSrcLo, Hi});
771 }
772
773 MI.eraseFromParent();
774 return true;
775}
776
777bool RegBankLegalizeHelper::lowerS_BFE(MachineInstr &MI) {
778 Register DstReg = MI.getOperand(i: 0).getReg();
779 LLT Ty = MRI.getType(Reg: DstReg);
780 bool Signed = isSignedBFE(MI);
781 unsigned FirstOpnd = isa<GIntrinsic>(Val: MI) ? 2 : 1;
782 Register Src = MI.getOperand(i: FirstOpnd).getReg();
783 Register LSBit = MI.getOperand(i: FirstOpnd + 1).getReg();
784 Register Width = MI.getOperand(i: FirstOpnd + 2).getReg();
785 // For uniform bit field extract there are 4 available instructions, but
786 // LSBit(field offset) and Width(size of bitfield) need to be packed in S32,
787 // field offset in low and size in high 16 bits.
788
789 // Src1 Hi16|Lo16 = Size|FieldOffset
790 auto Mask = B.buildConstant(Res: SgprRB_S32, Val: maskTrailingOnes<unsigned>(N: 6));
791 auto FieldOffset = B.buildAnd(Dst: SgprRB_S32, Src0: LSBit, Src1: Mask);
792 auto Size = B.buildShl(Dst: SgprRB_S32, Src0: Width, Src1: B.buildConstant(Res: SgprRB_S32, Val: 16));
793 auto Src1 = B.buildOr(Dst: SgprRB_S32, Src0: FieldOffset, Src1: Size);
794 unsigned Opc32 = Signed ? AMDGPU::S_BFE_I32 : AMDGPU::S_BFE_U32;
795 unsigned Opc64 = Signed ? AMDGPU::S_BFE_I64 : AMDGPU::S_BFE_U64;
796 unsigned Opc = Ty == S32 ? Opc32 : Opc64;
797
798 // Select machine instruction, because of reg class constraining, insert
799 // copies from reg class to reg bank.
800 auto S_BFE = B.buildInstr(Opc, DstOps: {{SgprRB, Ty}},
801 SrcOps: {B.buildCopy(Res: Ty, Op: Src), B.buildCopy(Res: S32, Op: Src1)});
802 constrainSelectedInstRegOperands(I&: *S_BFE, TII: *ST.getInstrInfo(),
803 TRI: *ST.getRegisterInfo(), RBI);
804
805 B.buildCopy(Res: DstReg, Op: S_BFE->getOperand(i: 0).getReg());
806 MI.eraseFromParent();
807 return true;
808}
809
810bool RegBankLegalizeHelper::lowerSplitTo32(MachineInstr &MI) {
811 Register Dst = MI.getOperand(i: 0).getReg();
812 LLT DstTy = MRI.getType(Reg: Dst);
813 assert(DstTy == V4S16 || DstTy == V2S32 || DstTy == S64);
814 LLT Ty = DstTy == V4S16 ? V2S16 : S32;
815 auto Op1 = B.buildUnmerge(Attrs: {.RCOrRB: VgprRB, .Ty: Ty}, Op: MI.getOperand(i: 1).getReg());
816 auto Op2 = B.buildUnmerge(Attrs: {.RCOrRB: VgprRB, .Ty: Ty}, Op: MI.getOperand(i: 2).getReg());
817 unsigned Opc = MI.getOpcode();
818 auto Flags = MI.getFlags();
819 auto Lo =
820 B.buildInstr(Opc, DstOps: {{VgprRB, Ty}}, SrcOps: {Op1.getReg(Idx: 0), Op2.getReg(Idx: 0)}, Flags);
821 auto Hi =
822 B.buildInstr(Opc, DstOps: {{VgprRB, Ty}}, SrcOps: {Op1.getReg(Idx: 1), Op2.getReg(Idx: 1)}, Flags);
823 B.buildMergeLikeInstr(Res: Dst, Ops: {Lo, Hi});
824 MI.eraseFromParent();
825 return true;
826}
827
828bool RegBankLegalizeHelper::lowerSplitTo32Mul(MachineInstr &MI) {
829 Register Dst = MI.getOperand(i: 0).getReg();
830 assert(MRI.getType(Dst) == S64);
831 auto Op1 = B.buildUnmerge(Attrs: {VgprRB_S32}, Op: MI.getOperand(i: 1).getReg());
832 auto Op2 = B.buildUnmerge(Attrs: {VgprRB_S32}, Op: MI.getOperand(i: 2).getReg());
833
834 // TODO: G_AMDGPU_MAD_* optimizations for G_MUL divergent S64 operation to
835 // match GlobalISel with old regbankselect.
836 auto Lo = B.buildMul(Dst: VgprRB_S32, Src0: Op1.getReg(Idx: 0), Src1: Op2.getReg(Idx: 0));
837 auto Carry = B.buildUMulH(Dst: VgprRB_S32, Src0: Op1.getReg(Idx: 0), Src1: Op2.getReg(Idx: 0));
838 auto MulLo0Hi1 = B.buildMul(Dst: VgprRB_S32, Src0: Op1.getReg(Idx: 0), Src1: Op2.getReg(Idx: 1));
839 auto MulHi0Lo1 = B.buildMul(Dst: VgprRB_S32, Src0: Op1.getReg(Idx: 1), Src1: Op2.getReg(Idx: 0));
840 auto Sum = B.buildAdd(Dst: VgprRB_S32, Src0: MulLo0Hi1, Src1: MulHi0Lo1);
841 auto Hi = B.buildAdd(Dst: VgprRB_S32, Src0: Sum, Src1: Carry);
842
843 B.buildMergeLikeInstr(Res: Dst, Ops: {Lo, Hi});
844 MI.eraseFromParent();
845 return true;
846}
847
848bool RegBankLegalizeHelper::lowerSplitTo16(MachineInstr &MI) {
849 Register Dst = MI.getOperand(i: 0).getReg();
850 assert(MRI.getType(Dst) == V2S16);
851 unsigned Opc = MI.getOpcode();
852 unsigned NumOps = MI.getNumOperands();
853 auto Flags = MI.getFlags();
854
855 auto [Op1Lo, Op1Hi] = unpackAExtTruncS16(Reg: MI.getOperand(i: 1).getReg());
856
857 if (NumOps == 2) {
858 auto Lo = B.buildInstr(Opc, DstOps: {SgprRB_S16}, SrcOps: {Op1Lo}, Flags);
859 auto Hi = B.buildInstr(Opc, DstOps: {SgprRB_S16}, SrcOps: {Op1Hi}, Flags);
860 B.buildMergeLikeInstr(Res: Dst, Ops: {Lo, Hi});
861 MI.eraseFromParent();
862 return true;
863 }
864
865 auto [Op2Lo, Op2Hi] = unpackAExtTruncS16(Reg: MI.getOperand(i: 2).getReg());
866
867 if (NumOps == 3) {
868 auto Lo = B.buildInstr(Opc, DstOps: {SgprRB_S16}, SrcOps: {Op1Lo, Op2Lo}, Flags);
869 auto Hi = B.buildInstr(Opc, DstOps: {SgprRB_S16}, SrcOps: {Op1Hi, Op2Hi}, Flags);
870 B.buildMergeLikeInstr(Res: Dst, Ops: {Lo, Hi});
871 MI.eraseFromParent();
872 return true;
873 }
874
875 assert(NumOps == 4);
876 auto [Op3Lo, Op3Hi] = unpackAExtTruncS16(Reg: MI.getOperand(i: 3).getReg());
877 auto Lo = B.buildInstr(Opc, DstOps: {SgprRB_S16}, SrcOps: {Op1Lo, Op2Lo, Op3Lo}, Flags);
878 auto Hi = B.buildInstr(Opc, DstOps: {SgprRB_S16}, SrcOps: {Op1Hi, Op2Hi, Op3Hi}, Flags);
879 B.buildMergeLikeInstr(Res: Dst, Ops: {Lo, Hi});
880 MI.eraseFromParent();
881 return true;
882}
883
884bool RegBankLegalizeHelper::lowerUniMAD64(MachineInstr &MI) {
885 Register Dst0 = MI.getOperand(i: 0).getReg();
886 Register Dst1 = MI.getOperand(i: 1).getReg();
887 Register Src0 = MI.getOperand(i: 2).getReg();
888 Register Src1 = MI.getOperand(i: 3).getReg();
889 Register Src2 = MI.getOperand(i: 4).getReg();
890
891 const GCNSubtarget &ST = B.getMF().getSubtarget<GCNSubtarget>();
892
893 // Keep the multiplication on the SALU.
894 Register DstLo = B.buildMul(Dst: SgprRB_S32, Src0, Src1).getReg(Idx: 0);
895 Register DstHi = MRI.createVirtualRegister(RegAttr: SgprRB_S32);
896 if (ST.hasScalarMulHiInsts()) {
897 B.buildInstr(Opc: AMDGPU::G_UMULH, DstOps: {{DstHi}}, SrcOps: {Src0, Src1});
898 } else {
899 auto VSrc0 = B.buildCopy(Res: VgprRB_S32, Op: Src0);
900 auto VSrc1 = B.buildCopy(Res: VgprRB_S32, Op: Src1);
901 auto MulHi = B.buildInstr(Opc: AMDGPU::G_UMULH, DstOps: {VgprRB_S32}, SrcOps: {VSrc0, VSrc1});
902 buildReadAnyLane(B, SgprDst: DstHi, VgprSrc: MulHi.getReg(Idx: 0), RBI);
903 }
904
905 // Accumulate and produce the "carry-out" bit.
906
907 // The "carry-out" is defined as bit 64 of the result when computed as a
908 // big integer. For unsigned multiply-add, this matches the usual
909 // definition of carry-out.
910 if (mi_match(R: Src2, MRI, P: MIPatternMatch::m_ZeroInt())) {
911 // No accumulate: result is just the multiplication, carry is 0.
912 B.buildMergeLikeInstr(Res: Dst0, Ops: {DstLo, DstHi});
913 B.buildConstant(Res: Dst1, Val: 0);
914 } else {
915 // Accumulate: add Src2 to the multiplication result with carry chain.
916 Register Src2Lo = MRI.createVirtualRegister(RegAttr: SgprRB_S32);
917 Register Src2Hi = MRI.createVirtualRegister(RegAttr: SgprRB_S32);
918 B.buildUnmerge(Res: {Src2Lo, Src2Hi}, Op: Src2);
919
920 auto AddLo = B.buildUAddo(Res: SgprRB_S32, CarryOut: SgprRB_S32, Op0: DstLo, Op1: Src2Lo);
921 auto AddHi =
922 B.buildUAdde(Res: SgprRB_S32, CarryOut: SgprRB_S32, Op0: DstHi, Op1: Src2Hi, CarryIn: AddLo.getReg(Idx: 1));
923 B.buildMergeLikeInstr(Res: Dst0, Ops: {AddLo.getReg(Idx: 0), AddHi.getReg(Idx: 0)});
924 B.buildCopy(Res: Dst1, Op: AddHi.getReg(Idx: 1));
925 }
926
927 MI.eraseFromParent();
928 return true;
929}
930
931bool RegBankLegalizeHelper::lowerSplitTo32Select(MachineInstr &MI) {
932 Register Dst = MI.getOperand(i: 0).getReg();
933 LLT DstTy = MRI.getType(Reg: Dst);
934 assert(DstTy == V4S16 || DstTy == V2S32 || DstTy == S64 ||
935 (DstTy.isPointer() && DstTy.getSizeInBits() == 64));
936 LLT Ty = DstTy == V4S16 ? V2S16 : S32;
937 auto Op2 = B.buildUnmerge(Attrs: {.RCOrRB: VgprRB, .Ty: Ty}, Op: MI.getOperand(i: 2).getReg());
938 auto Op3 = B.buildUnmerge(Attrs: {.RCOrRB: VgprRB, .Ty: Ty}, Op: MI.getOperand(i: 3).getReg());
939 Register Cond = MI.getOperand(i: 1).getReg();
940 auto Flags = MI.getFlags();
941 auto Lo =
942 B.buildSelect(Res: {VgprRB, Ty}, Tst: Cond, Op0: Op2.getReg(Idx: 0), Op1: Op3.getReg(Idx: 0), Flags);
943 auto Hi =
944 B.buildSelect(Res: {VgprRB, Ty}, Tst: Cond, Op0: Op2.getReg(Idx: 1), Op1: Op3.getReg(Idx: 1), Flags);
945
946 B.buildMergeLikeInstr(Res: Dst, Ops: {Lo, Hi});
947 MI.eraseFromParent();
948 return true;
949}
950
951bool RegBankLegalizeHelper::lowerSplitTo32SExtInReg(MachineInstr &MI) {
952 auto Op1 = B.buildUnmerge(Attrs: VgprRB_S32, Op: MI.getOperand(i: 1).getReg());
953 int Amt = MI.getOperand(i: 2).getImm();
954 Register Lo, Hi;
955 // Hi|Lo: s sign bit, ?/x bits changed/not changed by sign-extend
956 if (Amt <= 32) {
957 auto Freeze = B.buildFreeze(Dst: VgprRB_S32, Src: Op1.getReg(Idx: 0));
958 if (Amt == 32) {
959 // Hi|Lo: ????????|sxxxxxxx -> ssssssss|sxxxxxxx
960 Lo = Freeze.getReg(Idx: 0);
961 } else {
962 // Hi|Lo: ????????|???sxxxx -> ssssssss|ssssxxxx
963 Lo = B.buildSExtInReg(Res: VgprRB_S32, Op: Freeze, ImmOp: Amt).getReg(Idx: 0);
964 }
965
966 auto SignExtCst = B.buildConstant(Res: SgprRB_S32, Val: 31);
967 Hi = B.buildAShr(Dst: VgprRB_S32, Src0: Lo, Src1: SignExtCst).getReg(Idx: 0);
968 } else {
969 // Hi|Lo: ?????sxx|xxxxxxxx -> ssssssxx|xxxxxxxx
970 Lo = Op1.getReg(Idx: 0);
971 Hi = B.buildSExtInReg(Res: VgprRB_S32, Op: Op1.getReg(Idx: 1), ImmOp: Amt - 32).getReg(Idx: 0);
972 }
973
974 B.buildMergeLikeInstr(Res: MI.getOperand(i: 0).getReg(), Ops: {Lo, Hi});
975 MI.eraseFromParent();
976 return true;
977}
978
979bool RegBankLegalizeHelper::lowerSplitBitCount64To32(MachineInstr &MI) {
980 // Split 64-bit find-first-bit operations into 32-bit halves:
981 // (ffbh hi:lo) -> umin(ffbh(hi), uaddsat(ffbh(lo), 32))
982 // (ffbl hi:lo) -> umin(ffbl(lo), uaddsat(ffbl(hi), 32))
983 // (ctlz_zero_poison hi:lo) -> umin(ffbh(hi), add(ffbh(lo), 32))
984 // (cttz_zero_poison hi:lo) -> umin(ffbl(lo), add(ffbl(hi), 32))
985 unsigned Opc = MI.getOpcode();
986
987 // FFBH/FFBL return 0xFFFFFFFF on zero input, using uaddsat to avoid
988 // wrapping. CTLZ/CTTZ guarantee non-zero input (zero_poison), so plain add
989 // is fine.
990 unsigned FFBOpc;
991 unsigned AddOpc;
992 bool SearchFromMSB;
993 switch (Opc) {
994 case AMDGPU::G_AMDGPU_FFBH_U32:
995 FFBOpc = Opc;
996 AddOpc = AMDGPU::G_UADDSAT;
997 SearchFromMSB = true;
998 break;
999 case AMDGPU::G_AMDGPU_FFBL_B32:
1000 FFBOpc = Opc;
1001 AddOpc = AMDGPU::G_UADDSAT;
1002 SearchFromMSB = false;
1003 break;
1004 case AMDGPU::G_CTLZ_ZERO_POISON:
1005 FFBOpc = AMDGPU::G_AMDGPU_FFBH_U32;
1006 AddOpc = AMDGPU::G_ADD;
1007 SearchFromMSB = true;
1008 break;
1009 case AMDGPU::G_CTTZ_ZERO_POISON:
1010 FFBOpc = AMDGPU::G_AMDGPU_FFBL_B32;
1011 AddOpc = AMDGPU::G_ADD;
1012 SearchFromMSB = false;
1013 break;
1014 default:
1015 llvm_unreachable("unexpected opcode in lowerSplitBitCount64To32");
1016 }
1017
1018 auto Unmerge = B.buildUnmerge(Attrs: VgprRB_S32, Op: MI.getOperand(i: 1).getReg());
1019 Register Lo = Unmerge.getReg(Idx: 0);
1020 Register Hi = Unmerge.getReg(Idx: 1);
1021
1022 // MSB-first (FFBH/CTLZ) searches hi first; LSB-first (FFBL/CTTZ) searches
1023 // lo first. The secondary half adds 32 to account for the primary half's
1024 // width.
1025 auto Primary = B.buildInstr(Opc: FFBOpc, DstOps: {VgprRB_S32}, SrcOps: {SearchFromMSB ? Hi : Lo});
1026 auto Secondary =
1027 B.buildInstr(Opc: FFBOpc, DstOps: {VgprRB_S32}, SrcOps: {SearchFromMSB ? Lo : Hi});
1028
1029 auto Adjusted = B.buildInstr(Opc: AddOpc, DstOps: {VgprRB_S32},
1030 SrcOps: {Secondary, B.buildConstant(Res: VgprRB_S32, Val: 32)});
1031 B.buildUMin(Dst: MI.getOperand(i: 0).getReg(), Src0: Primary, Src1: Adjusted);
1032
1033 MI.eraseFromParent();
1034 return true;
1035}
1036
1037bool RegBankLegalizeHelper::lowerExtrVecEltToSel(MachineInstr &MI) {
1038 // Lower extract vector element to a compare-select chain:
1039 // result = elt[0]
1040 // for i in 1..N-1:
1041 // result = (idx == i) ? elt[i] : result
1042 //
1043 // When the index is divergent, each lane may want a different element, so
1044 // we must check every element per lane.
1045 Register Dst = MI.getOperand(i: 0).getReg();
1046 Register Src = MI.getOperand(i: 1).getReg();
1047 Register Idx = MI.getOperand(i: 2).getReg();
1048
1049 LLT VecTy = MRI.getType(Reg: Src);
1050 LLT ScalarTy = VecTy.getScalarType();
1051 unsigned NumElts = VecTy.getNumElements();
1052 MachineRegisterInfo::VRegAttrs VgprRB_EltTy = {.RCOrRB: VgprRB, .Ty: ScalarTy};
1053
1054 auto Unmerge = B.buildUnmerge(Attrs: VgprRB_EltTy, Op: Src);
1055
1056 if (ScalarTy.getSizeInBits() == 32) {
1057 Register PrevSelect = Unmerge.getReg(Idx: 0);
1058 for (unsigned I = 1; I < NumElts; ++I) {
1059 auto IdxConst = B.buildConstant(Res: {SgprRB, MRI.getType(Reg: Idx)}, Val: I);
1060 auto Cmp = B.buildICmp(Pred: CmpInst::ICMP_EQ, Res: VccRB_S1, Op0: Idx, Op1: IdxConst);
1061 PrevSelect =
1062 B.buildSelect(Res: VgprRB_EltTy, Tst: Cmp, Op0: Unmerge.getReg(Idx: I), Op1: PrevSelect)
1063 .getReg(Idx: 0);
1064 }
1065 B.buildCopy(Res: Dst, Op: PrevSelect);
1066 } else if (ScalarTy.getSizeInBits() == 64) {
1067 auto InitUnmerge = B.buildUnmerge(Attrs: VgprRB_S32, Op: Unmerge.getReg(Idx: 0));
1068 Register PrevLo = InitUnmerge.getReg(Idx: 0);
1069 Register PrevHi = InitUnmerge.getReg(Idx: 1);
1070 for (unsigned I = 1; I < NumElts; ++I) {
1071 auto IdxConst = B.buildConstant(Res: {SgprRB, MRI.getType(Reg: Idx)}, Val: I);
1072 auto Cmp = B.buildICmp(Pred: CmpInst::ICMP_EQ, Res: VccRB_S1, Op0: Idx, Op1: IdxConst);
1073 auto EltUnmerge = B.buildUnmerge(Attrs: VgprRB_S32, Op: Unmerge.getReg(Idx: I));
1074 PrevLo = B.buildSelect(Res: VgprRB_S32, Tst: Cmp, Op0: EltUnmerge.getReg(Idx: 0), Op1: PrevLo)
1075 .getReg(Idx: 0);
1076 PrevHi = B.buildSelect(Res: VgprRB_S32, Tst: Cmp, Op0: EltUnmerge.getReg(Idx: 1), Op1: PrevHi)
1077 .getReg(Idx: 0);
1078 }
1079 B.buildMergeLikeInstr(Res: Dst, Ops: {PrevLo, PrevHi});
1080 } else {
1081 reportGISelFailure(
1082 MF, MORE, PassName: "amdgpu-regbanklegalize",
1083 Msg: "AMDGPU RegBankLegalize: ExtrVecEltToSel unsupported element type", MI);
1084 return false;
1085 }
1086
1087 MI.eraseFromParent();
1088 return true;
1089}
1090
1091bool RegBankLegalizeHelper::lowerExtrVecEltTo32(MachineInstr &MI) {
1092 // Reduce a 64-bit element extract to two 32-bit extracts:
1093 // vec32 = bitcast <N x s64> to <2N x s32>
1094 // lo = vec32[idx * 2]
1095 // hi = vec32[idx * 2 + 1]
1096 // result = merge(lo, hi)
1097 //
1098 // When the index is uniform, all lanes extract the same element, so we can
1099 // just split the s64 extract into two s32 extracts which lower to MOVREL.
1100 Register Dst = MI.getOperand(i: 0).getReg();
1101 Register Src = MI.getOperand(i: 1).getReg();
1102 Register Idx = MI.getOperand(i: 2).getReg();
1103
1104 LLT SrcTy = MRI.getType(Reg: Src);
1105 LLT Vec32Ty = LLT::fixed_vector(NumElements: 2 * SrcTy.getNumElements(), ScalarSizeInBits: 32);
1106
1107 assert(MRI.getRegBank(Src) == VgprRB && MRI.getRegBank(Idx) == SgprRB &&
1108 "expected VGPR src and SGPR idx");
1109
1110 auto CastSrc = B.buildBitcast(Dst: {VgprRB, Vec32Ty}, Src);
1111
1112 // Calculate new Lo and Hi indices
1113 auto One = B.buildConstant(Res: SgprRB_S32, Val: 1);
1114 auto IdxLo = B.buildShl(Dst: SgprRB_S32, Src0: Idx, Src1: One);
1115 auto IdxHi = B.buildAdd(Dst: SgprRB_S32, Src0: IdxLo, Src1: One);
1116
1117 auto ExtLo = B.buildExtractVectorElement(Res: VgprRB_S32, Val: CastSrc, Idx: IdxLo);
1118 auto ExtHi = B.buildExtractVectorElement(Res: VgprRB_S32, Val: CastSrc, Idx: IdxHi);
1119
1120 B.buildMergeLikeInstr(Res: Dst, Ops: {ExtLo.getReg(Idx: 0), ExtHi.getReg(Idx: 0)});
1121
1122 MI.eraseFromParent();
1123 return true;
1124}
1125
1126bool RegBankLegalizeHelper::lowerInsVecEltToSel(MachineInstr &MI) {
1127 // Lower insert vector element to a compare-select chain:
1128 // for i in 0..N-1:
1129 // result[i] = (idx == i) ? elt : srcVec[i]
1130 // dst = merge(result[0..N-1])
1131 //
1132 // VGPR B64 requires splitting to lo/hi s32 pairs since there is no
1133 // v_cndmask_b64. SGPR B64/B32 and VGPR B32 can be handled natively.
1134 Register Dst = MI.getOperand(i: 0).getReg();
1135 Register Src = MI.getOperand(i: 1).getReg();
1136 Register Elt = MI.getOperand(i: 2).getReg();
1137 Register Idx = MI.getOperand(i: 3).getReg();
1138
1139 LLT VecTy = MRI.getType(Reg: Src);
1140 LLT ScalarTy = VecTy.getScalarType();
1141 unsigned NumElts = VecTy.getNumElements();
1142 const RegisterBank *SrcRB = MRI.getRegBank(Reg: Src);
1143 bool IsSGPR = (SrcRB == SgprRB);
1144 SmallVector<Register, 16> Selects;
1145
1146 if (!IsSGPR && ScalarTy.getSizeInBits() == 64) {
1147 // VGPR B64: split to 32-bit lo/hi since there is no v_cndmask_b64.
1148 auto Unmerge = B.buildUnmerge(Attrs: VgprRB_S32, Op: Src);
1149 auto EltUnmerge = B.buildUnmerge(Attrs: VgprRB_S32, Op: Elt);
1150 Register EltLo = EltUnmerge.getReg(Idx: 0);
1151 Register EltHi = EltUnmerge.getReg(Idx: 1);
1152 for (unsigned I = 0; I < NumElts; ++I) {
1153 auto IdxConst = B.buildConstant(Res: VgprRB_S32, Val: I);
1154 auto Cmp = B.buildICmp(Pred: CmpInst::ICMP_EQ, Res: VccRB_S1, Op0: Idx, Op1: IdxConst);
1155 Selects.push_back(
1156 Elt: B.buildSelect(Res: VgprRB_S32, Tst: Cmp, Op0: EltLo, Op1: Unmerge.getReg(Idx: 2 * I))
1157 .getReg(Idx: 0));
1158 Selects.push_back(
1159 Elt: B.buildSelect(Res: VgprRB_S32, Tst: Cmp, Op0: EltHi, Op1: Unmerge.getReg(Idx: 2 * I + 1))
1160 .getReg(Idx: 0));
1161 }
1162 LLT Vec32Ty = LLT::fixed_vector(NumElements: 2 * NumElts, ScalarSizeInBits: 32);
1163 auto Vec32 = B.buildBuildVector(Res: {VgprRB, Vec32Ty}, Ops: Selects);
1164 B.buildBitcast(Dst, Src: Vec32);
1165 } else if (ScalarTy.getSizeInBits() == 32 || ScalarTy.getSizeInBits() == 64) {
1166 // B32 (any bank) and SGPR B64: element-wise select at native width.
1167 MachineRegisterInfo::VRegAttrs SrcRB_EltTy = {.RCOrRB: SrcRB, .Ty: ScalarTy};
1168 MachineRegisterInfo::VRegAttrs CmpTy = IsSGPR ? SgprRB_S32 : VccRB_S1;
1169 auto Unmerge = B.buildUnmerge(Attrs: SrcRB_EltTy, Op: Src);
1170 for (unsigned I = 0; I < NumElts; ++I) {
1171 auto IdxConst = B.buildConstant(Res: SgprRB_S32, Val: I);
1172 auto Cmp = B.buildICmp(Pred: CmpInst::ICMP_EQ, Res: CmpTy, Op0: Idx, Op1: IdxConst);
1173 Selects.push_back(
1174 Elt: B.buildSelect(Res: SrcRB_EltTy, Tst: Cmp, Op0: Elt, Op1: Unmerge.getReg(Idx: I)).getReg(Idx: 0));
1175 }
1176 B.buildMergeLikeInstr(Res: Dst, Ops: Selects);
1177 } else {
1178 reportGISelFailure(
1179 MF, MORE, PassName: "amdgpu-regbanklegalize",
1180 Msg: "AMDGPU RegBankLegalize: InsVecEltToSel unsupported element type", MI);
1181 return false;
1182 }
1183
1184 MI.eraseFromParent();
1185 return true;
1186}
1187
1188bool RegBankLegalizeHelper::lowerInsVecEltTo32(MachineInstr &MI) {
1189 // Reduce a 64-bit element insert to two 32-bit inserts:
1190 // vec32 = bitcast <N x s64> to <2N x s32>
1191 // lo, hi = unmerge elt
1192 // vec32[idx * 2] = lo
1193 // vec32[idx * 2 + 1] = hi
1194 // dst = bitcast <2N x s32> to <N x s64>
1195 //
1196 // When the index is uniform, all lanes insert at the same position, so we
1197 // can split the s64 insert into two s32 inserts which lower to MOVREL/GPRIDX.
1198 Register Dst = MI.getOperand(i: 0).getReg();
1199 Register Src = MI.getOperand(i: 1).getReg();
1200 Register Elt = MI.getOperand(i: 2).getReg();
1201 Register Idx = MI.getOperand(i: 3).getReg();
1202
1203 LLT SrcTy = MRI.getType(Reg: Src);
1204 LLT Vec32Ty = LLT::fixed_vector(NumElements: 2 * SrcTy.getNumElements(), ScalarSizeInBits: 32);
1205
1206 assert(MRI.getRegBank(Src) == VgprRB && MRI.getRegBank(Idx) == SgprRB &&
1207 "expected VGPR src and SGPR idx");
1208
1209 MachineRegisterInfo::VRegAttrs VgprRB_Vec32Ty = {.RCOrRB: VgprRB, .Ty: Vec32Ty};
1210
1211 auto CastSrc = B.buildBitcast(Dst: VgprRB_Vec32Ty, Src);
1212 auto EltUnmerge = B.buildUnmerge(Attrs: VgprRB_S32, Op: Elt);
1213
1214 // Calculate new Lo and Hi indices
1215 auto One = B.buildConstant(Res: SgprRB_S32, Val: 1);
1216 auto IdxLo = B.buildShl(Dst: SgprRB_S32, Src0: Idx, Src1: One);
1217 auto IdxHi = B.buildAdd(Dst: SgprRB_S32, Src0: IdxLo, Src1: One);
1218
1219 auto InsLo = B.buildInsertVectorElement(Res: VgprRB_Vec32Ty, Val: CastSrc,
1220 Elt: EltUnmerge.getReg(Idx: 0), Idx: IdxLo);
1221 auto InsHi = B.buildInsertVectorElement(Res: VgprRB_Vec32Ty, Val: InsLo,
1222 Elt: EltUnmerge.getReg(Idx: 1), Idx: IdxHi);
1223
1224 B.buildBitcast(Dst, Src: InsHi);
1225
1226 MI.eraseFromParent();
1227 return true;
1228}
1229
1230bool RegBankLegalizeHelper::lowerAbsToNegMax(MachineInstr &MI) {
1231 // Lower divergent G_ABS to smax(x, 0 - x) in the VGPR bank:
1232 // zero = 0
1233 // neg = G_SUB zero, x
1234 // dst = G_SMAX x, neg
1235 //
1236 // There is no integer v_abs instruction on AMDGPU, so divergent G_ABS is
1237 // expanded to this sub/smax pair.
1238 Register DstReg = MI.getOperand(i: 0).getReg();
1239 Register SrcReg = MI.getOperand(i: 1).getReg();
1240 LLT Ty = MRI.getType(Reg: DstReg);
1241
1242 Register Zero;
1243 if (Ty == V2S16) {
1244 // buildConstant cannot produce a V2S16 directly; pack two S16 zeros.
1245 Register Zero16 = B.buildConstant(Res: {VgprRB, S16}, Val: 0).getReg(Idx: 0);
1246 Zero = B.buildBuildVector(Res: {VgprRB, Ty}, Ops: {Zero16, Zero16}).getReg(Idx: 0);
1247 } else {
1248 assert((Ty == S32 || Ty == S16) && "unexpected type for AbsToNegMax");
1249 Zero = B.buildConstant(Res: {VgprRB, Ty}, Val: 0).getReg(Idx: 0);
1250 }
1251
1252 auto Neg = B.buildSub(Dst: {VgprRB, Ty}, Src0: Zero, Src1: SrcReg);
1253 B.buildSMax(Dst: DstReg, Src0: SrcReg, Src1: Neg);
1254 MI.eraseFromParent();
1255 return true;
1256}
1257
1258bool RegBankLegalizeHelper::lowerAbsToS32(MachineInstr &MI) {
1259 // Lower uniform V2S16 abs by unpacking the values to two separate SGPR
1260 // registers and re-emitting G_ABS on each:
1261 // packed = bitcast <2 x s16> src to s32
1262 // lo = sext_inreg packed, 16
1263 // hi = ashr packed, 16
1264 // dst = build_vector_trunc G_ABS(lo), G_ABS(hi)
1265 //
1266 // SALU only has s_abs_i32, with no direct uniform V2S16 abs. The
1267 // re-emitted G_ABS(SgprRB, S32) selects to s_abs_i32 on each value.
1268 auto Bitcast = B.buildBitcast(Dst: {SgprRB_S32}, Src: MI.getOperand(i: 1).getReg());
1269 auto SextInReg = B.buildSExtInReg(Res: {SgprRB_S32}, Op: Bitcast, ImmOp: 16);
1270 auto ShiftHi =
1271 B.buildAShr(Dst: {SgprRB_S32}, Src0: Bitcast, Src1: B.buildConstant(Res: {SgprRB_S32}, Val: 16));
1272
1273 auto AbsLo = B.buildInstr(Opc: AMDGPU::G_ABS, DstOps: {{SgprRB_S32}}, SrcOps: {SextInReg});
1274 auto AbsHi = B.buildInstr(Opc: AMDGPU::G_ABS, DstOps: {{SgprRB_S32}}, SrcOps: {ShiftHi});
1275 B.buildBuildVectorTrunc(Res: MI.getOperand(i: 0).getReg(),
1276 Ops: {AbsLo.getReg(Idx: 0), AbsHi.getReg(Idx: 0)});
1277
1278 MI.eraseFromParent();
1279 return true;
1280}
1281
1282// Ported from SITargetLowering::lowerSET_ROUNDING in SIISelLowering.cpp.
1283// Keep the mapping logic and conversion tables aligned with the SDAG lowering.
1284bool RegBankLegalizeHelper::lowerSetRounding(MachineInstr &MI) {
1285 Register NewMode = MI.getOperand(i: 0).getReg();
1286
1287 // Index a table of 4-bit entries mapping from the C FLT_ROUNDS values to the
1288 // hardware MODE.fp_round values.
1289 if (auto ConstMode = getIConstantVRegValWithLookThrough(VReg: NewMode, MRI)) {
1290 uint32_t ClampedVal = std::min(
1291 a: static_cast<uint32_t>(ConstMode->Value.getZExtValue()),
1292 b: static_cast<uint32_t>(AMDGPU::TowardZeroF32_TowardNegativeF64));
1293 uint32_t DecodedVal = AMDGPU::decodeFltRoundToHWConversionTable(FltRounds: ClampedVal);
1294 NewMode = B.buildConstant(Res: SgprRB_S32, Val: DecodedVal).getReg(Idx: 0);
1295 } else {
1296 // If we know the input can only be one of the supported standard modes in
1297 // the range 0-3, we can use a simplified mapping to hardware values.
1298 KnownBits Known = VT->getKnownBits(R: NewMode);
1299 const bool UseReducedTable = Known.countMinLeadingZeros() >= 30;
1300 // The supported standard values are 0-3. The extended values start at 8. We
1301 // need to offset by 4 if the value is in the extended range.
1302
1303 if (UseReducedTable) {
1304 // Truncate to the low 32-bits.
1305 auto BitTable = B.buildConstant(
1306 Res: SgprRB_S32, Val: AMDGPU::FltRoundToHWConversionTable & 0xffff);
1307
1308 auto Two = B.buildConstant(Res: SgprRB_S32, Val: 2);
1309 auto RoundModeTimesNumBits = B.buildShl(Dst: SgprRB_S32, Src0: NewMode, Src1: Two);
1310
1311 NewMode =
1312 B.buildLShr(Dst: SgprRB_S32, Src0: BitTable, Src1: RoundModeTimesNumBits).getReg(Idx: 0);
1313
1314 // TODO: A demanded-bits simplification on the setreg source here could
1315 // likely reduce the table extracted bits into inline immediates.
1316 } else {
1317 // table_index = umin(value, value - 4)
1318 // MODE.fp_round = (bit_table >> (table_index << 2)) & 0xf
1319 auto NegFour = B.buildConstant(Res: SgprRB_S32, Val: -4);
1320 auto OffsetEnum = B.buildAdd(Dst: SgprRB_S32, Src0: NewMode, Src1: NegFour);
1321 auto IndexVal = B.buildUMin(Dst: SgprRB_S32, Src0: NewMode, Src1: OffsetEnum);
1322
1323 auto Two = B.buildConstant(Res: SgprRB_S32, Val: 2);
1324 auto RoundModeTimesNumBits = B.buildShl(Dst: SgprRB_S32, Src0: IndexVal, Src1: Two);
1325
1326 auto BitTable =
1327 B.buildConstant(Res: {SgprRB, S64}, Val: AMDGPU::FltRoundToHWConversionTable);
1328 auto TableValue =
1329 B.buildLShr(Dst: {SgprRB, S64}, Src0: BitTable, Src1: RoundModeTimesNumBits);
1330 // No need to mask out the high bits since the setreg will ignore them
1331 // anyway.
1332 NewMode = B.buildTrunc(Res: SgprRB_S32, Op: TableValue).getReg(Idx: 0);
1333 }
1334 }
1335
1336 // N.B. The setreg will be later folded into s_round_mode on supported
1337 // targets.
1338 uint32_t BothRoundHwReg =
1339 AMDGPU::Hwreg::HwregEncoding::encode(Values: AMDGPU::Hwreg::ID_MODE, Values: 0, Values: 4);
1340 B.buildIntrinsic(ID: Intrinsic::amdgcn_s_setreg, Res: ArrayRef<DstOp>(),
1341 /*HasSideEffects=*/true, /*isConvergent=*/false)
1342 .addImm(Val: static_cast<int16_t>(BothRoundHwReg))
1343 .addReg(RegNo: NewMode);
1344
1345 MI.eraseFromParent();
1346 return true;
1347}
1348
1349// Ported from SITargetLowering::lowerGET_ROUNDING in SIISelLowering.cpp.
1350// Keep the mapping logic and conversion tables aligned with the SDAG lowering.
1351bool RegBankLegalizeHelper::lowerGetRounding(MachineInstr &MI) {
1352 Register Dst = MI.getOperand(i: 0).getReg();
1353
1354 uint32_t BothRoundHwReg =
1355 AMDGPU::Hwreg::HwregEncoding::encode(Values: AMDGPU::Hwreg::ID_MODE, Values: 0, Values: 4);
1356 auto GetReg =
1357 B.buildIntrinsic(ID: Intrinsic::amdgcn_s_getreg, Res: {SgprRB_S32},
1358 /*HasSideEffects=*/true, /*isConvergent=*/false)
1359 .addImm(Val: BothRoundHwReg);
1360
1361 // There are two rounding modes, one for f32 and one for f64/f16. We only
1362 // report in the standard value range if both are the same.
1363 //
1364 // The raw values also differ from the expected FLT_ROUNDS values. Nearest
1365 // ties away from zero is not supported, and the other values are rotated by
1366 // 1.
1367 //
1368 // If the two rounding modes are not the same, report a target defined value.
1369
1370 // Mode register rounding mode fields:
1371 //
1372 // [1:0] Single-precision round mode.
1373 // [3:2] Double/Half-precision round mode.
1374 //
1375 // 0=nearest even; 1= +infinity; 2= -infinity, 3= toward zero.
1376 //
1377 // Hardware Spec
1378 // Toward-0 3 0
1379 // Nearest Even 0 1
1380 // +Inf 1 2
1381 // -Inf 2 3
1382 // NearestAway0 N/A 4
1383 //
1384 // We have to handle 16 permutations of a 4-bit value, so we create a 64-bit
1385 // table we can index by the raw hardware mode.
1386 //
1387 // (trunc (FltRoundConversionTable >> MODE.fp_round)) & 0xf
1388 auto BitTable =
1389 B.buildConstant(Res: {SgprRB, S64}, Val: AMDGPU::FltRoundConversionTable);
1390
1391 auto Two = B.buildConstant(Res: SgprRB_S32, Val: 2);
1392 auto RoundModeTimesNumBits = B.buildShl(Dst: SgprRB_S32, Src0: GetReg, Src1: Two);
1393
1394 // TODO: We could possibly avoid a 64-bit shift and use a simpler table if we
1395 // knew only one mode was demanded.
1396 auto TableValue = B.buildLShr(Dst: {SgprRB, S64}, Src0: BitTable, Src1: RoundModeTimesNumBits);
1397 auto TruncTable = B.buildTrunc(Res: SgprRB_S32, Op: TableValue);
1398
1399 auto EntryMask = B.buildConstant(Res: SgprRB_S32, Val: 0xf);
1400 auto TableEntry = B.buildAnd(Dst: SgprRB_S32, Src0: TruncTable, Src1: EntryMask);
1401
1402 // There's a gap in the 4-bit encoded table and actual enum values, so offset
1403 // if it's an extended value.
1404 auto Four = B.buildConstant(Res: SgprRB_S32, Val: 4);
1405 auto EnumOffset = B.buildAdd(Dst: SgprRB_S32, Src0: TableEntry, Src1: Four);
1406 auto IsStandardMode =
1407 B.buildICmp(Pred: CmpInst::ICMP_ULT, Res: SgprRB_S32, Op0: TableEntry, Op1: Four);
1408 B.buildSelect(Res: Dst, Tst: IsStandardMode, Op0: TableEntry, Op1: EnumOffset);
1409
1410 MI.eraseFromParent();
1411 return true;
1412}
1413
1414bool RegBankLegalizeHelper::lower(MachineInstr &MI,
1415 const RegBankLLTMapping &Mapping,
1416 WaterfallInfo &WFI) {
1417
1418 switch (Mapping.LoweringMethod) {
1419 case DoNotLower:
1420 break;
1421 case VccExtToSel:
1422 return lowerVccExtToSel(MI);
1423 case UniExtToSel: {
1424 LLT Ty = MRI.getType(Reg: MI.getOperand(i: 0).getReg());
1425 auto True = B.buildConstant(Res: {SgprRB, Ty},
1426 Val: MI.getOpcode() == AMDGPU::G_SEXT ? -1 : 1);
1427 auto False = B.buildConstant(Res: {SgprRB, Ty}, Val: 0);
1428 // Input to G_{Z|S}EXT is 'Legalizer legal' S1. Most common case is compare.
1429 // We are making select here. S1 cond was already 'any-extended to S32' +
1430 // 'AND with 1 to clean high bits' by Sgpr32AExtBoolInReg.
1431 B.buildSelect(Res: MI.getOperand(i: 0).getReg(), Tst: MI.getOperand(i: 1).getReg(), Op0: True,
1432 Op1: False);
1433 MI.eraseFromParent();
1434 return true;
1435 }
1436 case UnpackBitShift:
1437 return lowerUnpackBitShift(MI);
1438 case UnpackMinMax:
1439 return lowerUnpackMinMax(MI);
1440 case ScalarizeToS16:
1441 return lowerSplitTo16(MI);
1442 case Ext32To64: {
1443 const RegisterBank *RB = MRI.getRegBank(Reg: MI.getOperand(i: 0).getReg());
1444 MachineInstrBuilder Hi;
1445 switch (MI.getOpcode()) {
1446 case AMDGPU::G_ZEXT: {
1447 Hi = B.buildConstant(Res: {RB, S32}, Val: 0);
1448 break;
1449 }
1450 case AMDGPU::G_SEXT: {
1451 // Replicate sign bit from 32-bit extended part.
1452 auto ShiftAmt = B.buildConstant(Res: {RB, S32}, Val: 31);
1453 Hi = B.buildAShr(Dst: {RB, S32}, Src0: MI.getOperand(i: 1).getReg(), Src1: ShiftAmt);
1454 break;
1455 }
1456 case AMDGPU::G_ANYEXT: {
1457 Hi = B.buildUndef(Res: {RB, S32});
1458 break;
1459 }
1460 default:
1461 reportGISelFailure(MF, MORE, PassName: "amdgpu-regbanklegalize",
1462 Msg: "AMDGPU RegBankLegalize: Ext32To64, unsuported opcode",
1463 MI);
1464 return false;
1465 }
1466
1467 B.buildMergeLikeInstr(Res: MI.getOperand(i: 0).getReg(),
1468 Ops: {MI.getOperand(i: 1).getReg(), Hi});
1469 MI.eraseFromParent();
1470 return true;
1471 }
1472 case UniCstExt: {
1473 uint64_t ConstVal = MI.getOperand(i: 1).getCImm()->getZExtValue();
1474 B.buildConstant(Res: MI.getOperand(i: 0).getReg(), Val: ConstVal);
1475
1476 MI.eraseFromParent();
1477 return true;
1478 }
1479 case VgprToVccCopy: {
1480 Register Src = MI.getOperand(i: 1).getReg();
1481 LLT Ty = MRI.getType(Reg: Src);
1482 // Take lowest bit from each lane and put it in lane mask.
1483 // Lowering via compare, but we need to clean high bits first as compare
1484 // compares all bits in register.
1485 Register BoolSrc = MRI.createVirtualRegister(RegAttr: {.RCOrRB: VgprRB, .Ty: Ty});
1486 if (Ty == S64) {
1487 auto Src64 = B.buildUnmerge(Attrs: VgprRB_S32, Op: Src);
1488 auto One = B.buildConstant(Res: VgprRB_S32, Val: 1);
1489 auto AndLo = B.buildAnd(Dst: VgprRB_S32, Src0: Src64.getReg(Idx: 0), Src1: One);
1490 auto Zero = B.buildConstant(Res: VgprRB_S32, Val: 0);
1491 auto AndHi = B.buildAnd(Dst: VgprRB_S32, Src0: Src64.getReg(Idx: 1), Src1: Zero);
1492 B.buildMergeLikeInstr(Res: BoolSrc, Ops: {AndLo, AndHi});
1493 } else {
1494 assert(Ty == S32 || Ty == S16);
1495 auto One = B.buildConstant(Res: {VgprRB, Ty}, Val: 1);
1496 B.buildAnd(Dst: BoolSrc, Src0: Src, Src1: One);
1497 }
1498 auto Zero = B.buildConstant(Res: {VgprRB, Ty}, Val: 0);
1499 B.buildICmp(Pred: CmpInst::ICMP_NE, Res: MI.getOperand(i: 0).getReg(), Op0: BoolSrc, Op1: Zero);
1500 MI.eraseFromParent();
1501 return true;
1502 }
1503 case V_BFE:
1504 return lowerV_BFE(MI);
1505 case S_BFE:
1506 return lowerS_BFE(MI);
1507 case UniMAD64:
1508 return lowerUniMAD64(MI);
1509 case UniMul64: {
1510 B.buildMul(Dst: MI.getOperand(i: 0), Src0: MI.getOperand(i: 1), Src1: MI.getOperand(i: 2));
1511 MI.eraseFromParent();
1512 return true;
1513 }
1514 case DivSMulToMAD: {
1515 auto Op1 = B.buildTrunc(Res: VgprRB_S32, Op: MI.getOperand(i: 1));
1516 auto Op2 = B.buildTrunc(Res: VgprRB_S32, Op: MI.getOperand(i: 2));
1517 auto Zero = B.buildConstant(Res: {VgprRB, S64}, Val: 0);
1518
1519 unsigned NewOpc = MI.getOpcode() == AMDGPU::G_AMDGPU_S_MUL_U64_U32
1520 ? AMDGPU::G_AMDGPU_MAD_U64_U32
1521 : AMDGPU::G_AMDGPU_MAD_I64_I32;
1522
1523 B.buildInstr(Opc: NewOpc, DstOps: {MI.getOperand(i: 0).getReg(), {SgprRB, S32}},
1524 SrcOps: {Op1, Op2, Zero});
1525 MI.eraseFromParent();
1526 return true;
1527 }
1528 case SplitTo32:
1529 return lowerSplitTo32(MI);
1530 case SplitTo32Mul:
1531 return lowerSplitTo32Mul(MI);
1532 case SplitTo32Select:
1533 return lowerSplitTo32Select(MI);
1534 case SplitTo32SExtInReg:
1535 return lowerSplitTo32SExtInReg(MI);
1536 case CtPop64To32: {
1537 auto Unmerge = B.buildUnmerge(Attrs: {.RCOrRB: VgprRB, .Ty: S32}, Op: MI.getOperand(i: 1).getReg());
1538 auto LoPopCnt = B.buildCTPOP(Dst: {VgprRB, S32}, Src0: Unmerge.getReg(Idx: 0));
1539 auto HiPopCnt = B.buildCTPOP(Dst: {VgprRB, S32}, Src0: Unmerge.getReg(Idx: 1));
1540 // Max popcount of two 32-bit values is 64, so this add cannot overflow.
1541 B.buildAdd(Dst: MI.getOperand(i: 0).getReg(), Src0: LoPopCnt, Src1: HiPopCnt,
1542 Flags: MachineInstr::NoSWrap | MachineInstr::NoUWrap);
1543
1544 MI.eraseFromParent();
1545 break;
1546 }
1547 case S_BUF_to_BUF:
1548 return lowerSBufToBuf(MI, WFI);
1549 case SplitLoad: {
1550 LLT DstTy = MRI.getType(Reg: MI.getOperand(i: 0).getReg());
1551 unsigned Size = DstTy.getSizeInBits();
1552 // Even split to 128-bit loads
1553 if (Size > 128) {
1554 LLT B128;
1555 if (DstTy.isVector()) {
1556 LLT EltTy = DstTy.getElementType();
1557 B128 = LLT::fixed_vector(NumElements: 128 / EltTy.getSizeInBits(), ScalarTy: EltTy);
1558 } else {
1559 B128 = LLT::scalar(SizeInBits: 128);
1560 }
1561 if (Size / 128 == 2)
1562 splitLoad(MI, LLTBreakdown: {B128, B128});
1563 else if (Size / 128 == 4)
1564 splitLoad(MI, LLTBreakdown: {B128, B128, B128, B128});
1565 else {
1566 reportGISelFailure(MF, MORE, PassName: "amdgpu-regbanklegalize",
1567 Msg: "AMDGPU RegBankLegalize: SplitLoad, unsuported type",
1568 MI);
1569 return false;
1570 }
1571 }
1572 // 64 and 32 bit load
1573 else if (DstTy == S96)
1574 splitLoad(MI, LLTBreakdown: {S64, S32}, MergeTy: S32);
1575 else if (DstTy == V3S32)
1576 splitLoad(MI, LLTBreakdown: {V2S32, S32}, MergeTy: S32);
1577 else if (DstTy == V6S16)
1578 splitLoad(MI, LLTBreakdown: {V4S16, V2S16}, MergeTy: V2S16);
1579 else {
1580 reportGISelFailure(MF, MORE, PassName: "amdgpu-regbanklegalize",
1581 Msg: "AMDGPU RegBankLegalize: SplitLoad, unsuported type",
1582 MI);
1583 return false;
1584 }
1585 return true;
1586 }
1587 case DynStackAlloc: {
1588 const auto &TFI = *ST.getFrameLowering();
1589 // Guard in case the stack growth direction ever changes with scratch
1590 // instructions.
1591 assert(TFI.getStackGrowthDirection() == TargetFrameLowering::StackGrowsUp &&
1592 "Stack grows upwards for AMDGPU");
1593
1594 Register Dst = MI.getOperand(i: 0).getReg();
1595 Register AllocSize = MI.getOperand(i: 1).getReg();
1596 Align Alignment = assumeAligned(Value: MI.getOperand(i: 2).getImm());
1597
1598 // Erase before building new instrs to avoid hitting multiple Dst assert
1599 // with CSE.
1600 B.setInsertPt(MBB&: *MI.getParent(), II: std::next(x: MI.getIterator()));
1601 MI.eraseFromParent();
1602
1603 if (MRI.getRegBank(Reg: AllocSize) != SgprRB) {
1604 auto WaveReduction =
1605 B.buildIntrinsic(ID: Intrinsic::amdgcn_wave_reduce_umax, Res: {SgprRB_S32})
1606 .addUse(RegNo: AllocSize)
1607 .addImm(Val: 0);
1608 AllocSize = WaveReduction.getReg(Idx: 0);
1609 }
1610
1611 LLT PtrTy = MRI.getType(Reg: Dst);
1612 assert(PtrTy.getSizeInBits() == 32 &&
1613 "Expected 32-bit pointer for stack allocation");
1614 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
1615 Register SPReg = Info->getStackPtrOffsetReg();
1616
1617 // When using flat-scratch, the stack offset is unscaled.
1618 const bool HasFlatScratch = ST.hasFlatScratchEnabled();
1619 const unsigned WavefrontSizeLog2 = ST.getWavefrontSizeLog2();
1620
1621 Register AdjustedSize = AllocSize;
1622 if (!HasFlatScratch) {
1623 auto WaveSize = B.buildConstant(Res: SgprRB_S32, Val: WavefrontSizeLog2);
1624 AdjustedSize = B.buildShl(Dst: SgprRB_S32, Src0: AllocSize, Src1: WaveSize).getReg(Idx: 0);
1625 }
1626 if (Alignment > TFI.getStackAlign()) {
1627 const uint64_t EffectiveAlignment =
1628 Alignment.value() << (HasFlatScratch ? 0 : WavefrontSizeLog2);
1629 auto OldSP = B.buildCopy(Res: {SgprRB, PtrTy}, Op: SPReg);
1630 auto Tmp1 =
1631 B.buildPtrAdd(Res: {SgprRB, PtrTy}, Op0: OldSP,
1632 Op1: B.buildConstant(Res: SgprRB_S32, Val: EffectiveAlignment - 1));
1633 uint64_t Mask = maskTrailingZeros<uint64_t>(N: Log2_64(Value: EffectiveAlignment));
1634 B.buildPtrMask(Res: Dst, Op0: Tmp1, Op1: B.buildConstant(Res: SgprRB_S32, Val: Mask));
1635 } else {
1636 B.buildCopy(Res: Dst, Op: SPReg);
1637 }
1638 auto PtrAdd = B.buildPtrAdd(Res: {SgprRB, PtrTy}, Op0: Dst, Op1: AdjustedSize);
1639 B.buildCopy(Res: SPReg, Op: PtrAdd);
1640 return true;
1641 }
1642 case WidenLoad: {
1643 LLT DstTy = MRI.getType(Reg: MI.getOperand(i: 0).getReg());
1644 if (DstTy == S96)
1645 widenLoad(MI, WideTy: S128);
1646 else if (DstTy == V3S32)
1647 widenLoad(MI, WideTy: V4S32, MergeTy: S32);
1648 else if (DstTy == V6S16)
1649 widenLoad(MI, WideTy: V8S16, MergeTy: V2S16);
1650 else {
1651 reportGISelFailure(MF, MORE, PassName: "amdgpu-regbanklegalize",
1652 Msg: "AMDGPU RegBankLegalize: WidenLoad, unsuported type",
1653 MI);
1654 return false;
1655 }
1656 return true;
1657 }
1658 case UnpackAExt:
1659 return lowerUnpackAExt(MI);
1660 case WidenMMOToS32:
1661 return widenMMOToS32(MI&: cast<GAnyLoad>(Val&: MI));
1662 case VerifyAllSgpr: {
1663 assert(llvm::all_of(MI.operands(), [&](const MachineOperand &Op) {
1664 return MRI.getRegBankOrNull(Op.getReg()) == SgprRB;
1665 }));
1666 return true;
1667 }
1668 case ApplyAllVgpr: {
1669 assert(llvm::all_of(MI.defs(), [&](const MachineOperand &Op) {
1670 return MRI.getRegBankOrNull(Op.getReg()) == VgprRB;
1671 }));
1672 B.setInstrAndDebugLoc(MI);
1673 for (unsigned i = MI.getNumDefs(); i < MI.getNumOperands(); ++i) {
1674 MachineOperand &Op = MI.getOperand(i);
1675 if (!Op.isReg())
1676 continue;
1677 Register Reg = Op.getReg();
1678 if (MRI.getRegBank(Reg) != VgprRB) {
1679 auto Copy = B.buildCopy(Res: {VgprRB, MRI.getType(Reg)}, Op: Reg);
1680 Op.setReg(Copy.getReg(Idx: 0));
1681 }
1682 }
1683 return true;
1684 }
1685 case UnmergeToShiftTrunc: {
1686 GUnmerge *Unmerge = dyn_cast<GUnmerge>(Val: &MI);
1687 LLT Ty = MRI.getType(Reg: Unmerge->getSourceReg());
1688 if (Ty.getSizeInBits() % 32 != 0) {
1689 reportGISelFailure(MF, MORE, PassName: "amdgpu-regbanklegalize",
1690 Msg: "AMDGPU RegBankLegalize: unmerge not multiple of 32",
1691 MI);
1692 return false;
1693 }
1694
1695 B.setInstrAndDebugLoc(MI);
1696 if (Ty.getSizeInBits() > 32) {
1697 auto UnmergeV2S16 =
1698 B.buildUnmerge(Attrs: {.RCOrRB: SgprRB, .Ty: V2S16}, Op: Unmerge->getSourceReg());
1699 for (unsigned i = 0; i < UnmergeV2S16->getNumDefs(); ++i) {
1700 auto [Dst0S32, Dst1S32] =
1701 unpackAExt(Reg: UnmergeV2S16->getOperand(i).getReg());
1702 B.buildTrunc(Res: MI.getOperand(i: i * 2).getReg(), Op: Dst0S32);
1703 B.buildTrunc(Res: MI.getOperand(i: i * 2 + 1).getReg(), Op: Dst1S32);
1704 }
1705 } else {
1706 auto [Dst0S32, Dst1S32] = unpackAExt(Reg: MI.getOperand(i: 2).getReg());
1707 B.buildTrunc(Res: MI.getOperand(i: 0).getReg(), Op: Dst0S32);
1708 B.buildTrunc(Res: MI.getOperand(i: 1).getReg(), Op: Dst1S32);
1709 }
1710
1711 MI.eraseFromParent();
1712 return true;
1713 }
1714 case AextToS32InIncomingBlockGPHI: {
1715 Register Dst = MI.getOperand(i: 0).getReg();
1716 Register NewDst = MRI.createVirtualRegister(RegAttr: SgprRB_S32);
1717 B.setInsertPt(MBB&: *MI.getParent(), II: MI.getParent()->getFirstNonPHI());
1718 MI.getOperand(i: 0).setReg(NewDst);
1719 B.buildTrunc(Res: Dst, Op: NewDst);
1720
1721 for (unsigned i = 1; i < MI.getNumOperands(); i += 2) {
1722 Register UseReg = MI.getOperand(i).getReg();
1723
1724 auto DefMI = MRI.getVRegDef(Reg: UseReg)->getIterator();
1725 MachineBasicBlock *DefMBB = DefMI->getParent();
1726
1727 B.setInsertPt(MBB&: *DefMBB, II: DefMBB->SkipPHIsAndLabels(I: std::next(x: DefMI)));
1728
1729 auto NewUse = B.buildAnyExt(Res: SgprRB_S32, Op: UseReg);
1730 MI.getOperand(i).setReg(NewUse.getReg(Idx: 0));
1731 }
1732 break;
1733 }
1734 case VerifyAllSgprGPHI: {
1735 assert(llvm::all_of(MI.operands(), [&](const MachineOperand &Op) {
1736 if (Op.isMBB())
1737 return true;
1738 return MRI.getRegBankOrNull(Op.getReg()) == SgprRB;
1739 }));
1740 return true;
1741 }
1742 case VerifyAllSgprOrVgprGPHI: {
1743 assert(MRI.getRegBankOrNull(MI.getOperand(0).getReg()) == VgprRB);
1744 assert(llvm::all_of(MI.operands(), [&](const MachineOperand &Op) {
1745 if (Op.isMBB())
1746 return true;
1747 const RegisterBank *RB = MRI.getRegBankOrNull(Op.getReg());
1748 return RB == VgprRB || RB == SgprRB;
1749 }));
1750 return true;
1751 }
1752 case ApplyINTRIN_IMAGE: {
1753 const AMDGPU::RsrcIntrinsic *RSrcIntrin =
1754 AMDGPU::lookupRsrcIntrinsic(Intr: AMDGPU::getIntrinsicID(I: MI));
1755 assert(RSrcIntrin && RSrcIntrin->IsImage);
1756 // The reported argument index is relative to the IR intrinsic call
1757 // arguments, so shift by the number of defs and the intrinsic ID.
1758 unsigned RsrcIdx = RSrcIntrin->RsrcArg + MI.getNumExplicitDefs() + 1;
1759 return applyRegisterBanksVgprWithSgprRsrc(MI, RsrcIdx);
1760 }
1761 case ApplyBVH_INTERSECT_RAY: {
1762 // Rsrc is the last register operand. Base BVH trails an A16 immediate
1763 // after rsrc; dual/BVH8 do not. Scan backwards for the last virtual
1764 // register.
1765 unsigned RsrcIdx = MI.getNumOperands();
1766 while (RsrcIdx-- > MI.getNumExplicitDefs()) {
1767 const MachineOperand &Op = MI.getOperand(i: RsrcIdx);
1768 if (Op.isReg() && Op.getReg().isVirtual())
1769 break;
1770 }
1771 return applyRegisterBanksVgprWithSgprRsrc(MI, RsrcIdx);
1772 }
1773 case SplitBitCount64To32:
1774 return lowerSplitBitCount64To32(MI);
1775 case ExtrVecEltToSel:
1776 return lowerExtrVecEltToSel(MI);
1777 case ExtrVecEltTo32:
1778 return lowerExtrVecEltTo32(MI);
1779 case InsVecEltToSel:
1780 return lowerInsVecEltToSel(MI);
1781 case InsVecEltTo32:
1782 return lowerInsVecEltTo32(MI);
1783 case AbsToNegMax:
1784 return lowerAbsToNegMax(MI);
1785 case AbsToS32:
1786 return lowerAbsToS32(MI);
1787 case DeletePrefetch:
1788 MI.eraseFromParent();
1789 return true;
1790 case LowerSetRounding:
1791 return lowerSetRounding(MI);
1792 case LowerGetRounding:
1793 return lowerGetRounding(MI);
1794 }
1795
1796 return true;
1797}
1798
1799LLT RegBankLegalizeHelper::getTyFromID(RegBankLLTMappingApplyID ID) {
1800 switch (ID) {
1801 case Vcc:
1802 case UniInVcc:
1803 return LLT::scalar(SizeInBits: 1);
1804 case Sgpr16:
1805 case Vgpr16:
1806 case UniInVgprS16:
1807 return LLT::scalar(SizeInBits: 16);
1808 case Sgpr32:
1809 case Sgpr32_WF:
1810 case Sgpr32Trunc:
1811 case Sgpr32AExt:
1812 case Sgpr32AExtBoolInReg:
1813 case Sgpr32SExt:
1814 case Sgpr32ZExt:
1815 case UniInVgprS32:
1816 case Sgpr32ToVgprDst:
1817 case Vgpr32:
1818 case Vgpr32AExt:
1819 case Vgpr32SExt:
1820 case Vgpr32ZExt:
1821 return LLT::scalar(SizeInBits: 32);
1822 case Sgpr64:
1823 case Vgpr64:
1824 case UniInVgprS64:
1825 case Sgpr64ToVgprDst:
1826 return LLT::scalar(SizeInBits: 64);
1827 case Sgpr128:
1828 case Vgpr128:
1829 return LLT::scalar(SizeInBits: 128);
1830 case SgprP0:
1831 case SgprP0Call_WF:
1832 case VgprP0:
1833 return LLT::pointer(AddressSpace: 0, SizeInBits: 64);
1834 case SgprP1:
1835 case VgprP1:
1836 return LLT::pointer(AddressSpace: 1, SizeInBits: 64);
1837 case SgprP2:
1838 case VgprP2:
1839 return LLT::pointer(AddressSpace: 2, SizeInBits: 32);
1840 case SgprP3:
1841 case VgprP3:
1842 return LLT::pointer(AddressSpace: 3, SizeInBits: 32);
1843 case SgprP4:
1844 case SgprP4Call_WF:
1845 case VgprP4:
1846 return LLT::pointer(AddressSpace: 4, SizeInBits: 64);
1847 case SgprP5:
1848 case VgprP5:
1849 return LLT::pointer(AddressSpace: 5, SizeInBits: 32);
1850 case SgprP6:
1851 return LLT::pointer(AddressSpace: 6, SizeInBits: 32);
1852 case SgprP8:
1853 return LLT::pointer(AddressSpace: 8, SizeInBits: 128);
1854 case SgprV2S16:
1855 case VgprV2S16:
1856 case UniInVgprV2S16:
1857 return LLT::fixed_vector(NumElements: 2, ScalarSizeInBits: 16);
1858 case SgprV2S32:
1859 case VgprV2S32:
1860 case UniInVgprV2S32:
1861 return LLT::fixed_vector(NumElements: 2, ScalarSizeInBits: 32);
1862 case VgprV3S32:
1863 case UniInVgprV3S32:
1864 return LLT::fixed_vector(NumElements: 3, ScalarSizeInBits: 32);
1865 case VgprV4S16:
1866 return LLT::fixed_vector(NumElements: 4, ScalarSizeInBits: 16);
1867 case VgprV8S16:
1868 case UniInVgprV8S16:
1869 return LLT::fixed_vector(NumElements: 8, ScalarSizeInBits: 16);
1870 case VgprV16S16:
1871 case UniInVgprV16S16:
1872 return LLT::fixed_vector(NumElements: 16, ScalarSizeInBits: 16);
1873 case SgprV4S32:
1874 case SgprV4S32_WF:
1875 case SgprV4S32_ReadFirstLane:
1876 case VgprV4S32:
1877 case UniInVgprV4S32:
1878 return LLT::fixed_vector(NumElements: 4, ScalarSizeInBits: 32);
1879 case VgprV8S32:
1880 case UniInVgprV8S32:
1881 case SgprV8S32_ReadFirstLane:
1882 return LLT::fixed_vector(NumElements: 8, ScalarSizeInBits: 32);
1883 case VgprV2S64:
1884 case UniInVgprV2S64:
1885 return LLT::fixed_vector(NumElements: 2, ScalarSizeInBits: 64);
1886 case VgprV6S32:
1887 case UniInVgprV6S32:
1888 return LLT::fixed_vector(NumElements: 6, ScalarSizeInBits: 32);
1889 case VgprV16S32:
1890 case UniInVgprV16S32:
1891 return LLT::fixed_vector(NumElements: 16, ScalarSizeInBits: 32);
1892 case VgprV32S16:
1893 case UniInVgprV32S16:
1894 return LLT::fixed_vector(NumElements: 32, ScalarSizeInBits: 16);
1895 case VgprV32S32:
1896 case UniInVgprV32S32:
1897 return LLT::fixed_vector(NumElements: 32, ScalarSizeInBits: 32);
1898 default:
1899 return LLT();
1900 }
1901}
1902
1903LLT RegBankLegalizeHelper::getBTyFromID(RegBankLLTMappingApplyID ID, LLT Ty) {
1904 switch (ID) {
1905 case SgprB32:
1906 case VgprB32:
1907 case SgprB32_M0:
1908 case SgprB32_ReadFirstLane:
1909 case UniInVgprB32:
1910 if (Ty == LLT::scalar(SizeInBits: 32) || Ty == LLT::fixed_vector(NumElements: 2, ScalarSizeInBits: 16) ||
1911 isAnyPtr(Ty, Width: 32))
1912 return Ty;
1913 return LLT();
1914 case SgprPtr32:
1915 case VgprPtr32:
1916 return isAnyPtr(Ty, Width: 32) ? Ty : LLT();
1917 case SgprPtr64:
1918 case VgprPtr64:
1919 return isAnyPtr(Ty, Width: 64) ? Ty : LLT();
1920 case SgprPtr128:
1921 case VgprPtr128:
1922 return isAnyPtr(Ty, Width: 128) ? Ty : LLT();
1923 case SgprB64:
1924 case VgprB64:
1925 case SgprB64_ReadFirstLane:
1926 case UniInVgprB64:
1927 if (Ty == LLT::scalar(SizeInBits: 64) || Ty == LLT::fixed_vector(NumElements: 2, ScalarSizeInBits: 32) ||
1928 Ty == LLT::fixed_vector(NumElements: 4, ScalarSizeInBits: 16) || isAnyPtr(Ty, Width: 64))
1929 return Ty;
1930 return LLT();
1931 case SgprB96:
1932 case VgprB96:
1933 case UniInVgprB96:
1934 if (Ty == LLT::scalar(SizeInBits: 96) || Ty == LLT::fixed_vector(NumElements: 3, ScalarSizeInBits: 32) ||
1935 Ty == LLT::fixed_vector(NumElements: 6, ScalarSizeInBits: 16))
1936 return Ty;
1937 return LLT();
1938 case SgprB128:
1939 case VgprB128:
1940 case UniInVgprB128:
1941 if (Ty.getSizeInBits() == 128)
1942 return Ty;
1943 return LLT();
1944 case VgprB160:
1945 case UniInVgprB160:
1946 if (Ty.getSizeInBits() == 160)
1947 return Ty;
1948 return LLT();
1949 case SgprB256:
1950 case VgprB256:
1951 case UniInVgprB256:
1952 if (Ty.getSizeInBits() == 256)
1953 return Ty;
1954 return LLT();
1955 case SgprB512:
1956 case VgprB512:
1957 case UniInVgprB512:
1958 if (Ty.getSizeInBits() == 512)
1959 return Ty;
1960 return LLT();
1961 case SgprBRC: {
1962 const SIRegisterInfo *TRI =
1963 static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
1964 unsigned LLTSize = Ty.getSizeInBits();
1965 if (LLTSize >= 32 && TRI->getSGPRClassForBitWidth(BitWidth: LLTSize))
1966 return Ty;
1967 return LLT();
1968 }
1969 case VgprBRC: {
1970 const SIRegisterInfo *TRI =
1971 static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
1972 if (TRI->getSGPRClassForBitWidth(BitWidth: Ty.getSizeInBits()))
1973 return Ty;
1974 return LLT();
1975 }
1976 default:
1977 return LLT();
1978 }
1979}
1980
1981const RegisterBank *
1982RegBankLegalizeHelper::getRegBankFromID(RegBankLLTMappingApplyID ID) {
1983 switch (ID) {
1984 case Vcc:
1985 return VccRB;
1986 case Sgpr16:
1987 case Sgpr32:
1988 case Sgpr32_WF:
1989 case Sgpr64:
1990 case Sgpr128:
1991 case SgprP0:
1992 case SgprP0Call_WF:
1993 case SgprP1:
1994 case SgprP2:
1995 case SgprP3:
1996 case SgprP4:
1997 case SgprP4Call_WF:
1998 case SgprP5:
1999 case SgprP6:
2000 case SgprP8:
2001 case SgprPtr32:
2002 case SgprPtr64:
2003 case SgprPtr128:
2004 case SgprV2S16:
2005 case SgprV2S32:
2006 case SgprV4S32:
2007 case SgprV4S32_WF:
2008 case SgprV4S32_ReadFirstLane:
2009 case SgprV8S32_ReadFirstLane:
2010 case SgprB32:
2011 case SgprB64:
2012 case SgprB96:
2013 case SgprB128:
2014 case SgprB256:
2015 case SgprB512:
2016 case SgprBRC:
2017 case UniInVcc:
2018 case UniInVgprS16:
2019 case UniInVgprS32:
2020 case UniInVgprS64:
2021 case UniInVgprV2S16:
2022 case UniInVgprV2S32:
2023 case UniInVgprV3S32:
2024 case UniInVgprV4S32:
2025 case UniInVgprV2S64:
2026 case UniInVgprV6S32:
2027 case UniInVgprV8S16:
2028 case UniInVgprV8S32:
2029 case UniInVgprV16S16:
2030 case UniInVgprV16S32:
2031 case UniInVgprV32S16:
2032 case UniInVgprV32S32:
2033 case UniInVgprB32:
2034 case UniInVgprB64:
2035 case UniInVgprB96:
2036 case UniInVgprB128:
2037 case UniInVgprB160:
2038 case UniInVgprB256:
2039 case UniInVgprB512:
2040 case Sgpr32Trunc:
2041 case Sgpr32AExt:
2042 case Sgpr32AExtBoolInReg:
2043 case Sgpr32SExt:
2044 case Sgpr32ZExt:
2045 return SgprRB;
2046 case AgprAnyTy:
2047 return AgprRB;
2048 case Vgpr16:
2049 case Vgpr32:
2050 case Vgpr64:
2051 case Vgpr128:
2052 case VgprP0:
2053 case VgprP1:
2054 case VgprP2:
2055 case VgprP3:
2056 case VgprP4:
2057 case VgprP5:
2058 case VgprPtr32:
2059 case VgprPtr64:
2060 case VgprPtr128:
2061 case VgprV2S16:
2062 case VgprV2S32:
2063 case VgprV2S64:
2064 case VgprV3S32:
2065 case VgprV4S16:
2066 case VgprV8S16:
2067 case VgprV16S16:
2068 case VgprV4S32:
2069 case VgprV6S32:
2070 case VgprV8S32:
2071 case VgprV16S32:
2072 case VgprV32S16:
2073 case VgprV32S32:
2074 case VgprB32:
2075 case VgprB64:
2076 case VgprB96:
2077 case VgprB128:
2078 case VgprB160:
2079 case VgprB256:
2080 case VgprB512:
2081 case VgprBRC:
2082 case VgprAnyTy:
2083 case Vgpr32AExt:
2084 case Vgpr32SExt:
2085 case Vgpr32ZExt:
2086 case Sgpr32ToVgprDst:
2087 case Sgpr64ToVgprDst:
2088 return VgprRB;
2089 default:
2090 return nullptr;
2091 }
2092}
2093
2094bool RegBankLegalizeHelper::applyMappingDst(
2095 MachineInstr &MI, unsigned &OpIdx,
2096 const SmallVectorImpl<RegBankLLTMappingApplyID> &MethodIDs) {
2097 // Defs start from operand 0
2098 for (; OpIdx < MethodIDs.size(); ++OpIdx) {
2099 if (MethodIDs[OpIdx] == None)
2100 continue;
2101 MachineOperand &Op = MI.getOperand(i: OpIdx);
2102 Register Reg = Op.getReg();
2103 LLT Ty = MRI.getType(Reg);
2104 [[maybe_unused]] const RegisterBank *RB = MRI.getRegBank(Reg);
2105
2106 switch (MethodIDs[OpIdx]) {
2107 // vcc, sgpr and vgpr scalars, pointers and vectors
2108 case Vcc:
2109 case Sgpr16:
2110 case Sgpr32:
2111 case Sgpr64:
2112 case Sgpr128:
2113 case SgprP0:
2114 case SgprP1:
2115 case SgprP3:
2116 case SgprP4:
2117 case SgprP5:
2118 case SgprP6:
2119 case SgprP8:
2120 case SgprV2S16:
2121 case SgprV2S32:
2122 case SgprV4S32:
2123 case Vgpr16:
2124 case Vgpr32:
2125 case Vgpr64:
2126 case Vgpr128:
2127 case VgprP0:
2128 case VgprP1:
2129 case VgprP2:
2130 case VgprP3:
2131 case VgprP4:
2132 case VgprP5:
2133 case VgprV2S16:
2134 case VgprV2S32:
2135 case VgprV2S64:
2136 case VgprV3S32:
2137 case VgprV4S16:
2138 case VgprV8S16:
2139 case VgprV16S16:
2140 case VgprV4S32:
2141 case VgprV6S32:
2142 case VgprV8S32:
2143 case VgprV16S32:
2144 case VgprV32S16:
2145 case VgprV32S32: {
2146 assert(Ty == getTyFromID(MethodIDs[OpIdx]));
2147 assert(RB == getRegBankFromID(MethodIDs[OpIdx]));
2148 break;
2149 }
2150 // sgpr and vgpr B-types
2151 case SgprB32:
2152 case SgprB64:
2153 case SgprB96:
2154 case SgprB128:
2155 case SgprB256:
2156 case SgprB512:
2157 case SgprBRC:
2158 case SgprPtr32:
2159 case SgprPtr64:
2160 case SgprPtr128:
2161 case VgprB32:
2162 case VgprB64:
2163 case VgprB96:
2164 case VgprB128:
2165 case VgprB160:
2166 case VgprB256:
2167 case VgprB512:
2168 case VgprBRC:
2169 case VgprPtr32:
2170 case VgprPtr64:
2171 case VgprPtr128: {
2172 assert(Ty == getBTyFromID(MethodIDs[OpIdx], Ty));
2173 assert(RB == getRegBankFromID(MethodIDs[OpIdx]));
2174 break;
2175 }
2176 case VgprAnyTy: {
2177 assert(RB == VgprRB);
2178 break;
2179 }
2180 case AgprAnyTy: {
2181 if (RB == AgprRB)
2182 break;
2183 Register NewAgprDst = MRI.createVirtualRegister(RegAttr: {.RCOrRB: AgprRB, .Ty: Ty});
2184 Op.setReg(NewAgprDst);
2185 if (!MRI.use_nodbg_empty(RegNo: Reg))
2186 B.buildCopy(Res: Reg, Op: NewAgprDst);
2187 break;
2188 }
2189 case VgprOrAgprAnyTy: {
2190 const unsigned NumRegs = Ty.getSizeInBits() / 32;
2191 const RegisterBank *DstRB =
2192 MFI->selectAGPRFormMFMA(NumRegs) ? AgprRB : VgprRB;
2193 if (RB == DstRB)
2194 break;
2195 Register NewDst = MRI.createVirtualRegister(RegAttr: {.RCOrRB: DstRB, .Ty: Ty});
2196 Op.setReg(NewDst);
2197 if (!MRI.use_nodbg_empty(RegNo: Reg))
2198 B.buildCopy(Res: Reg, Op: NewDst);
2199 break;
2200 }
2201 // uniform in vcc/vgpr: scalars, vectors and B-types
2202 case UniInVcc: {
2203 assert(Ty == S1);
2204 assert(RB == SgprRB);
2205 Register NewDst = MRI.createVirtualRegister(RegAttr: VccRB_S1);
2206 Op.setReg(NewDst);
2207 if (!MRI.use_empty(RegNo: Reg)) {
2208 auto CopyS32_Vcc =
2209 B.buildInstr(Opc: AMDGPU::G_AMDGPU_COPY_SCC_VCC, DstOps: {SgprRB_S32}, SrcOps: {NewDst});
2210 B.buildTrunc(Res: Reg, Op: CopyS32_Vcc);
2211 }
2212 break;
2213 }
2214 case UniInVgprS16: {
2215 assert(Ty == getTyFromID(MethodIDs[OpIdx]));
2216 assert(RB == SgprRB);
2217 Register NewVgprDstS16 = MRI.createVirtualRegister(RegAttr: {.RCOrRB: VgprRB, .Ty: S16});
2218 Register NewVgprDstS32 = MRI.createVirtualRegister(RegAttr: {.RCOrRB: VgprRB, .Ty: S32});
2219 Register NewSgprDstS32 = MRI.createVirtualRegister(RegAttr: {.RCOrRB: SgprRB, .Ty: S32});
2220 Op.setReg(NewVgprDstS16);
2221 B.buildAnyExt(Res: NewVgprDstS32, Op: NewVgprDstS16);
2222 buildReadAnyLane(B, SgprDst: NewSgprDstS32, VgprSrc: NewVgprDstS32, RBI);
2223 B.buildTrunc(Res: Reg, Op: NewSgprDstS32);
2224 break;
2225 }
2226 case UniInVgprS32:
2227 case UniInVgprS64:
2228 case UniInVgprV2S16:
2229 case UniInVgprV2S32:
2230 case UniInVgprV3S32:
2231 case UniInVgprV4S32:
2232 case UniInVgprV2S64:
2233 case UniInVgprV6S32:
2234 case UniInVgprV8S16:
2235 case UniInVgprV8S32:
2236 case UniInVgprV16S16:
2237 case UniInVgprV16S32:
2238 case UniInVgprV32S16:
2239 case UniInVgprV32S32: {
2240 assert(Ty == getTyFromID(MethodIDs[OpIdx]));
2241 assert(RB == SgprRB);
2242 Register NewVgprDst = MRI.createVirtualRegister(RegAttr: {.RCOrRB: VgprRB, .Ty: Ty});
2243 Op.setReg(NewVgprDst);
2244 buildReadAnyLane(B, SgprDst: Reg, VgprSrc: NewVgprDst, RBI);
2245 break;
2246 }
2247 case UniInVgprB32:
2248 case UniInVgprB64:
2249 case UniInVgprB96:
2250 case UniInVgprB128:
2251 case UniInVgprB160:
2252 case UniInVgprB256:
2253 case UniInVgprB512: {
2254 assert(Ty == getBTyFromID(MethodIDs[OpIdx], Ty));
2255 assert(RB == SgprRB);
2256 Register NewVgprDst = MRI.createVirtualRegister(RegAttr: {.RCOrRB: VgprRB, .Ty: Ty});
2257 Op.setReg(NewVgprDst);
2258 AMDGPU::buildReadAnyLane(B, SgprDst: Reg, VgprSrc: NewVgprDst, RBI);
2259 break;
2260 }
2261 // sgpr trunc
2262 case Sgpr32Trunc: {
2263 assert(Ty.getSizeInBits() < 32);
2264 assert(RB == SgprRB);
2265 Register NewDst = MRI.createVirtualRegister(RegAttr: SgprRB_S32);
2266 Op.setReg(NewDst);
2267 if (!MRI.use_empty(RegNo: Reg))
2268 B.buildTrunc(Res: Reg, Op: NewDst);
2269 break;
2270 }
2271 case Sgpr32ToVgprDst:
2272 case Sgpr64ToVgprDst: {
2273 assert(Ty == getTyFromID(MethodIDs[OpIdx]));
2274 assert(RB == VgprRB);
2275 Op.setReg(MRI.createVirtualRegister(RegAttr: {.RCOrRB: SgprRB, .Ty: Ty}));
2276 B.buildCopy(Res: Reg, Op: Op.getReg());
2277 break;
2278 }
2279 case InvalidMapping: {
2280 reportGISelFailure(
2281 MF, MORE, PassName: "amdgpu-regbanklegalize",
2282 Msg: "AMDGPU RegBankLegalize: missing fast rule ('Div' or 'Uni') for", MI);
2283 return false;
2284 }
2285 default:
2286 reportGISelFailure(
2287 MF, MORE, PassName: "amdgpu-regbanklegalize",
2288 Msg: "AMDGPU RegBankLegalize: applyMappingDst, ID not supported", MI);
2289 return false;
2290 }
2291 }
2292
2293 return true;
2294}
2295
2296bool RegBankLegalizeHelper::applyMappingSrc(
2297 MachineInstr &MI, unsigned &OpIdx,
2298 const SmallVectorImpl<RegBankLLTMappingApplyID> &MethodIDs,
2299 WaterfallInfo &WFI) {
2300 for (unsigned i = 0; i < MethodIDs.size(); ++OpIdx, ++i) {
2301 if (MethodIDs[i] == None || MethodIDs[i] == IntrId || MethodIDs[i] == Imm)
2302 continue;
2303
2304 MachineOperand &Op = MI.getOperand(i: OpIdx);
2305 Register Reg = Op.getReg();
2306 LLT Ty = MRI.getType(Reg);
2307 const RegisterBank *RB = MRI.getRegBank(Reg);
2308
2309 switch (MethodIDs[i]) {
2310 case Vcc: {
2311 assert(Ty == S1);
2312 assert(RB == VccRB || RB == SgprRB);
2313 if (RB == SgprRB) {
2314 auto Aext = B.buildAnyExt(Res: SgprRB_S32, Op: Reg);
2315 auto CopyVcc_Scc =
2316 B.buildInstr(Opc: AMDGPU::G_AMDGPU_COPY_VCC_SCC, DstOps: {VccRB_S1}, SrcOps: {Aext});
2317 Op.setReg(CopyVcc_Scc.getReg(Idx: 0));
2318 }
2319 break;
2320 }
2321 // sgpr scalars, pointers and vectors
2322 case Sgpr16:
2323 case Sgpr32:
2324 case Sgpr64:
2325 case Sgpr128:
2326 case SgprP0:
2327 case SgprP1:
2328 case SgprP3:
2329 case SgprP4:
2330 case SgprP5:
2331 case SgprP6:
2332 case SgprP8:
2333 case SgprV2S16:
2334 case SgprV2S32:
2335 case SgprV4S32: {
2336 assert(Ty == getTyFromID(MethodIDs[i]));
2337 assert(RB == getRegBankFromID(MethodIDs[i]));
2338 break;
2339 }
2340 // sgpr B-types
2341 case SgprB32:
2342 case SgprB64:
2343 case SgprB96:
2344 case SgprB128:
2345 case SgprB256:
2346 case SgprB512:
2347 case SgprBRC:
2348 case SgprPtr32:
2349 case SgprPtr64:
2350 case SgprPtr128: {
2351 assert(Ty == getBTyFromID(MethodIDs[i], Ty));
2352 assert(RB == getRegBankFromID(MethodIDs[i]));
2353 break;
2354 }
2355 // vgpr scalars, pointers and vectors
2356 case Vgpr16:
2357 case Vgpr32:
2358 case Vgpr64:
2359 case Vgpr128:
2360 case VgprP0:
2361 case VgprP1:
2362 case VgprP2:
2363 case VgprP3:
2364 case VgprP4:
2365 case VgprP5:
2366 case VgprV2S16:
2367 case VgprV2S32:
2368 case VgprV2S64:
2369 case VgprV3S32:
2370 case VgprV4S16:
2371 case VgprV8S16:
2372 case VgprV16S16:
2373 case VgprV4S32:
2374 case VgprV6S32:
2375 case VgprV8S32:
2376 case VgprV16S32:
2377 case VgprV32S16:
2378 case VgprV32S32: {
2379 assert(Ty == getTyFromID(MethodIDs[i]));
2380 if (RB != VgprRB) {
2381 auto CopyToVgpr = B.buildCopy(Res: {VgprRB, Ty}, Op: Reg);
2382 Op.setReg(CopyToVgpr.getReg(Idx: 0));
2383 }
2384 break;
2385 }
2386 // vgpr B-types
2387 case VgprB32:
2388 case VgprB64:
2389 case VgprB96:
2390 case VgprB128:
2391 case VgprB160:
2392 case VgprB256:
2393 case VgprB512:
2394 case VgprBRC:
2395 case VgprPtr32:
2396 case VgprPtr64:
2397 case VgprPtr128: {
2398 assert(Ty == getBTyFromID(MethodIDs[i], Ty));
2399 if (RB != VgprRB) {
2400 auto CopyToVgpr = B.buildCopy(Res: {VgprRB, Ty}, Op: Reg);
2401 Op.setReg(CopyToVgpr.getReg(Idx: 0));
2402 }
2403 break;
2404 }
2405 case VgprAnyTy: {
2406 if (RB != VgprRB) {
2407 auto CopyToVgpr = B.buildCopy(Res: {VgprRB, Ty}, Op: Reg);
2408 Op.setReg(CopyToVgpr.getReg(Idx: 0));
2409 }
2410 break;
2411 }
2412 case AgprAnyTy: {
2413 if (RB != AgprRB) {
2414 auto CopyToAgpr = B.buildCopy(Res: {AgprRB, Ty}, Op: Reg);
2415 Op.setReg(CopyToAgpr.getReg(Idx: 0));
2416 }
2417 break;
2418 }
2419 case VgprOrAgprAnyTy: {
2420 const unsigned NumRegs = Ty.getSizeInBits() / 32;
2421 const RegisterBank *SrcRB =
2422 MFI->selectAGPRFormMFMA(NumRegs) ? AgprRB : VgprRB;
2423 if (RB != SrcRB)
2424 Op.setReg(B.buildCopy(Res: {SrcRB, Ty}, Op: Reg).getReg(Idx: 0));
2425 break;
2426 }
2427 // sgpr waterfall, scalars, and vectors
2428 case Sgpr32_WF:
2429 case SgprV4S32_WF: {
2430 assert(Ty == getTyFromID(MethodIDs[i]));
2431 if (RB != SgprRB) {
2432 WFI.SgprWaterfallOperandRegs.insert(V: Reg);
2433 if (!WFI.Start.isValid()) {
2434 WFI.Start = MI.getIterator();
2435 WFI.End = std::next(x: MI.getIterator());
2436 }
2437 }
2438 break;
2439 }
2440 case SgprP0Call_WF:
2441 case SgprP4Call_WF: {
2442 assert(Ty == getTyFromID(MethodIDs[i]));
2443 if (RB != SgprRB) {
2444 WFI.SgprWaterfallOperandRegs.insert(V: Reg);
2445
2446 // Find the ADJCALLSTACKUP before the call.
2447 MachineBasicBlock::iterator Start = MI.getIterator();
2448 while (Start->getOpcode() != AMDGPU::ADJCALLSTACKUP)
2449 --Start;
2450
2451 // Find the ADJCALLSTACKDOWN after the call (include it in range).
2452 MachineBasicBlock::iterator End = MI.getIterator();
2453 while (End->getOpcode() != AMDGPU::ADJCALLSTACKDOWN)
2454 ++End;
2455 ++End;
2456
2457 WFI.Start = Start;
2458 WFI.End = End;
2459 }
2460 break;
2461 }
2462 case SgprB32_M0:
2463 case SgprB32_ReadFirstLane:
2464 case SgprB64_ReadFirstLane: {
2465 assert(Ty == getBTyFromID(MethodIDs[i], Ty));
2466 if (RB == SgprRB)
2467 break;
2468 assert(RB == VgprRB);
2469 Register NewSGPR = MRI.createVirtualRegister(RegAttr: {.RCOrRB: SgprRB, .Ty: Ty});
2470 buildReadFirstLane(B, SgprDst: NewSGPR, VgprSrc: Op.getReg(), RBI);
2471 Op.setReg(NewSGPR);
2472 break;
2473 }
2474 case SgprV4S32_ReadFirstLane:
2475 case SgprV8S32_ReadFirstLane: {
2476 assert(Ty == getTyFromID(MethodIDs[i]));
2477 if (RB == SgprRB)
2478 break;
2479 assert(RB == VgprRB);
2480 Register NewSGPR = MRI.createVirtualRegister(RegAttr: {.RCOrRB: SgprRB, .Ty: Ty});
2481 buildReadFirstLane(B, SgprDst: NewSGPR, VgprSrc: Op.getReg(), RBI);
2482 Op.setReg(NewSGPR);
2483 break;
2484 }
2485 // sgpr and vgpr scalars with extend
2486 case Sgpr32AExt: {
2487 // Note: this ext allows S1, and it is meant to be combined away.
2488 assert(Ty.getSizeInBits() < 32);
2489 assert(RB == SgprRB);
2490 auto Aext = B.buildAnyExt(Res: SgprRB_S32, Op: Reg);
2491 Op.setReg(Aext.getReg(Idx: 0));
2492 break;
2493 }
2494 case Sgpr32AExtBoolInReg: {
2495 // Note: this ext allows S1, and it is meant to be combined away.
2496 assert(Ty.getSizeInBits() == 1);
2497 assert(RB == SgprRB);
2498 auto Aext = B.buildAnyExt(Res: SgprRB_S32, Op: Reg);
2499 // Zext SgprS1 is not legal, make AND with 1 instead. This instruction is
2500 // most of times meant to be combined away in AMDGPURegBankCombiner.
2501 auto Cst1 = B.buildConstant(Res: SgprRB_S32, Val: 1);
2502 auto BoolInReg = B.buildAnd(Dst: SgprRB_S32, Src0: Aext, Src1: Cst1);
2503 Op.setReg(BoolInReg.getReg(Idx: 0));
2504 break;
2505 }
2506 case Sgpr32SExt: {
2507 assert(1 < Ty.getSizeInBits() && Ty.getSizeInBits() < 32);
2508 assert(RB == SgprRB);
2509 auto Sext = B.buildSExt(Res: SgprRB_S32, Op: Reg);
2510 Op.setReg(Sext.getReg(Idx: 0));
2511 break;
2512 }
2513 case Sgpr32ZExt: {
2514 assert(1 < Ty.getSizeInBits() && Ty.getSizeInBits() < 32);
2515 assert(RB == SgprRB);
2516 auto Zext = B.buildZExt(Res: {SgprRB, S32}, Op: Reg);
2517 Op.setReg(Zext.getReg(Idx: 0));
2518 break;
2519 }
2520 case Vgpr32AExt: {
2521 assert(Ty.getSizeInBits() < 32);
2522 assert(RB == VgprRB);
2523 auto Aext = B.buildAnyExt(Res: {VgprRB, S32}, Op: Reg);
2524 Op.setReg(Aext.getReg(Idx: 0));
2525 break;
2526 }
2527 case Vgpr32SExt: {
2528 // Note this ext allows S1, and it is meant to be combined away.
2529 assert(Ty.getSizeInBits() < 32);
2530 assert(RB == VgprRB);
2531 auto Sext = B.buildSExt(Res: {VgprRB, S32}, Op: Reg);
2532 Op.setReg(Sext.getReg(Idx: 0));
2533 break;
2534 }
2535 case Vgpr32ZExt: {
2536 // Note this ext allows S1, and it is meant to be combined away.
2537 assert(Ty.getSizeInBits() < 32);
2538 assert(RB == VgprRB);
2539 auto Zext = B.buildZExt(Res: {VgprRB, S32}, Op: Reg);
2540 Op.setReg(Zext.getReg(Idx: 0));
2541 break;
2542 }
2543 default:
2544 reportGISelFailure(
2545 MF, MORE, PassName: "amdgpu-regbanklegalize",
2546 Msg: "AMDGPU RegBankLegalize: applyMappingSrc, ID not supported", MI);
2547 return false;
2548 }
2549 }
2550 return true;
2551}
2552
2553[[maybe_unused]] static bool verifyRegBankOnOperands(MachineInstr &MI,
2554 const RegisterBank *RB,
2555 MachineRegisterInfo &MRI,
2556 unsigned StartOpIdx,
2557 unsigned EndOpIdx) {
2558 for (unsigned i = StartOpIdx; i <= EndOpIdx; ++i) {
2559 if (MRI.getRegBankOrNull(Reg: MI.getOperand(i).getReg()) != RB)
2560 return false;
2561 }
2562 return true;
2563}
2564
2565bool RegBankLegalizeHelper::applyRegisterBanksVgprWithSgprRsrc(
2566 MachineInstr &MI, unsigned RsrcIdx) {
2567 const unsigned NumDefs = MI.getNumExplicitDefs();
2568
2569 MachineBasicBlock *MBB = MI.getParent();
2570 B.setInsertPt(MBB&: *MBB, II: MBB->SkipPHIsAndLabels(I: std::next(x: MI.getIterator())));
2571
2572 // Defs are vgpr.
2573 for (unsigned i = 0; i < NumDefs; ++i) {
2574 Register Reg = MI.getOperand(i).getReg();
2575 if (MRI.getRegBank(Reg) == VgprRB)
2576 continue;
2577
2578 Register NewVgprDst = MRI.createVirtualRegister(RegAttr: {.RCOrRB: VgprRB, .Ty: MRI.getType(Reg)});
2579 MI.getOperand(i).setReg(NewVgprDst);
2580 buildReadAnyLane(B, SgprDst: Reg, VgprSrc: NewVgprDst, RBI);
2581 }
2582
2583 B.setInstrAndDebugLoc(MI);
2584
2585 // Register uses before RsrcIdx are vgpr.
2586 for (unsigned i = NumDefs; i < RsrcIdx; ++i) {
2587 MachineOperand &Op = MI.getOperand(i);
2588 if (!Op.isReg())
2589 continue;
2590
2591 Register Reg = Op.getReg();
2592 if (!Reg.isVirtual())
2593 continue;
2594
2595 if (MRI.getRegBank(Reg) == VgprRB)
2596 continue;
2597
2598 auto Copy = B.buildCopy(Res: {VgprRB, MRI.getType(Reg)}, Op: Reg);
2599 Op.setReg(Copy.getReg(Idx: 0));
2600 }
2601
2602 SmallSet<Register, 4> OpsToWaterfall;
2603
2604 // Register use RsrcIdx (and later register operands) is sgpr.
2605 for (unsigned i = RsrcIdx; i < MI.getNumOperands(); ++i) {
2606 MachineOperand &Op = MI.getOperand(i);
2607 if (!Op.isReg())
2608 continue;
2609
2610 Register Reg = Op.getReg();
2611 if (MRI.getRegBank(Reg) != SgprRB)
2612 OpsToWaterfall.insert(V: Reg);
2613 }
2614
2615 if (!OpsToWaterfall.empty()) {
2616 MachineBasicBlock::iterator MII = MI.getIterator();
2617 executeInWaterfallLoop(B, WFI: {.SgprWaterfallOperandRegs: OpsToWaterfall, .Start: MII, .End: std::next(x: MII)});
2618 }
2619
2620 return true;
2621}
2622