//===-- AMDGPURegBankLegalizeHelper.cpp -----------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// Implements actual lowering algorithms for each ID that can be used in
/// Rule.OperandMapping. Similar to legalizer helper but with register banks.
//
//===----------------------------------------------------------------------===//
13
14#include "AMDGPURegBankLegalizeHelper.h"
15#include "AMDGPUGlobalISelUtils.h"
16#include "AMDGPUInstrInfo.h"
17#include "AMDGPURegBankLegalizeRules.h"
18#include "AMDGPURegisterBankInfo.h"
19#include "GCNSubtarget.h"
20#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
21#include "llvm/CodeGen/GlobalISel/GenericMachineInstrs.h"
22#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
23#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
24#include "llvm/CodeGen/MachineInstr.h"
25#include "llvm/CodeGen/MachineUniformityAnalysis.h"
26#include "llvm/IR/IntrinsicsAMDGPU.h"
27
28#define DEBUG_TYPE "amdgpu-regbanklegalize"
29
30using namespace llvm;
31using namespace AMDGPU;
32
33RegBankLegalizeHelper::RegBankLegalizeHelper(
34 MachineIRBuilder &B, const MachineUniformityInfo &MUI,
35 const RegisterBankInfo &RBI, const RegBankLegalizeRules &RBLRules)
36 : MF(B.getMF()), ST(MF.getSubtarget<GCNSubtarget>()), B(B),
37 MRI(*B.getMRI()), MUI(MUI), RBI(RBI), MORE(MF, nullptr),
38 RBLRules(RBLRules), IsWave32(ST.isWave32()),
39 SgprRB(&RBI.getRegBank(ID: AMDGPU::SGPRRegBankID)),
40 VgprRB(&RBI.getRegBank(ID: AMDGPU::VGPRRegBankID)),
41 VccRB(&RBI.getRegBank(ID: AMDGPU::VCCRegBankID)) {}
42
43bool RegBankLegalizeHelper::findRuleAndApplyMapping(MachineInstr &MI) {
44 const SetOfRulesForOpcode *RuleSet = RBLRules.getRulesForOpc(MI);
45 if (!RuleSet) {
46 reportGISelFailure(MF, MORE, PassName: "amdgpu-regbanklegalize",
47 Msg: "No AMDGPU RegBankLegalize rules defined for opcode",
48 MI);
49 return false;
50 }
51
52 const RegBankLLTMapping *Mapping = RuleSet->findMappingForMI(MI, MRI, MUI);
53 if (!Mapping) {
54 reportGISelFailure(MF, MORE, PassName: "amdgpu-regbanklegalize",
55 Msg: "AMDGPU RegBankLegalize: none of the rules defined with "
56 "'Any' for MI's opcode matched MI",
57 MI);
58 return false;
59 }
60
61 WaterfallInfo WFI;
62 unsigned OpIdx = 0;
63 if (Mapping->DstOpMapping.size() > 0) {
64 B.setInsertPt(MBB&: *MI.getParent(), II: std::next(x: MI.getIterator()));
65 if (!applyMappingDst(MI, OpIdx, MethodIDs: Mapping->DstOpMapping))
66 return false;
67 }
68 if (Mapping->SrcOpMapping.size() > 0) {
69 B.setInstr(MI);
70 if (!applyMappingSrc(MI, OpIdx, MethodIDs: Mapping->SrcOpMapping, WFI))
71 return false;
72 }
73
74 if (!lower(MI, Mapping: *Mapping, WFI))
75 return false;
76
77 return true;
78}
79
80bool RegBankLegalizeHelper::executeInWaterfallLoop(MachineIRBuilder &B,
81 const WaterfallInfo &WFI) {
82 assert(WFI.Start.isValid() && WFI.End.isValid() &&
83 "Waterfall range not initialized");
84
85 // Track use registers which have already been expanded with a readfirstlane
86 // sequence. This may have multiple uses if moving a sequence.
87 DenseMap<Register, Register> WaterfalledRegMap;
88
89 MachineBasicBlock &MBB = B.getMBB();
90 MachineFunction &MF = B.getMF();
91
92 MachineBasicBlock::iterator BeginIt = WFI.Start;
93 MachineBasicBlock::iterator EndIt = WFI.End;
94
95 const SIRegisterInfo *TRI = ST.getRegisterInfo();
96 const TargetRegisterClass *WaveRC = TRI->getWaveMaskRegClass();
97 unsigned MovExecOpc, MovExecTermOpc, XorTermOpc, AndSaveExecOpc, ExecReg;
98 if (IsWave32) {
99 MovExecOpc = AMDGPU::S_MOV_B32;
100 MovExecTermOpc = AMDGPU::S_MOV_B32_term;
101 XorTermOpc = AMDGPU::S_XOR_B32_term;
102 AndSaveExecOpc = AMDGPU::S_AND_SAVEEXEC_B32;
103 ExecReg = AMDGPU::EXEC_LO;
104 } else {
105 MovExecOpc = AMDGPU::S_MOV_B64;
106 MovExecTermOpc = AMDGPU::S_MOV_B64_term;
107 XorTermOpc = AMDGPU::S_XOR_B64_term;
108 AndSaveExecOpc = AMDGPU::S_AND_SAVEEXEC_B64;
109 ExecReg = AMDGPU::EXEC;
110 }
111
112#ifndef NDEBUG
113 const int OrigRangeSize = std::distance(BeginIt, EndIt);
114#endif
115
116 MachineRegisterInfo &MRI = *B.getMRI();
117 Register SaveExecReg = MRI.createVirtualRegister(RegClass: WaveRC);
118 Register InitSaveExecReg = MRI.createVirtualRegister(RegClass: WaveRC);
119
120 // Don't bother using generic instructions/registers for the exec mask.
121 B.buildInstr(Opcode: TargetOpcode::IMPLICIT_DEF).addDef(RegNo: InitSaveExecReg);
122
123 Register SavedExec = MRI.createVirtualRegister(RegClass: WaveRC);
124
125 // To insert the loop we need to split the block. Move everything before
126 // this point to a new block, and insert a new empty block before this
127 // instruction.
128 MachineBasicBlock *LoopBB = MF.CreateMachineBasicBlock();
129 MachineBasicBlock *BodyBB = MF.CreateMachineBasicBlock();
130 MachineBasicBlock *RestoreExecBB = MF.CreateMachineBasicBlock();
131 MachineBasicBlock *RemainderBB = MF.CreateMachineBasicBlock();
132 MachineFunction::iterator MBBI(MBB);
133 ++MBBI;
134 MF.insert(MBBI, MBB: LoopBB);
135 MF.insert(MBBI, MBB: BodyBB);
136 MF.insert(MBBI, MBB: RestoreExecBB);
137 MF.insert(MBBI, MBB: RemainderBB);
138
139 LoopBB->addSuccessor(Succ: BodyBB);
140 BodyBB->addSuccessor(Succ: RestoreExecBB);
141 BodyBB->addSuccessor(Succ: LoopBB);
142
143 // Move the rest of the block into a new block.
144 RemainderBB->transferSuccessorsAndUpdatePHIs(FromMBB: &MBB);
145 RemainderBB->splice(Where: RemainderBB->begin(), Other: &MBB, From: EndIt, To: MBB.end());
146
147 MBB.addSuccessor(Succ: LoopBB);
148 RestoreExecBB->addSuccessor(Succ: RemainderBB);
149
150 B.setInsertPt(MBB&: *LoopBB, II: LoopBB->end());
151
152 // +-MBB:------------+
153 // | ... |
154 // | %0 = G_INST_1 |
155 // | %Dst = MI %Vgpr |
156 // | %1 = G_INST_2 |
157 // | ... |
158 // +-----------------+
159 // ->
160 // +-MBB-------------------------------+
161 // | ... |
162 // | %0 = G_INST_1 |
163 // | %SaveExecReg = S_MOV_B32 $exec_lo |
164 // +----------------|------------------+
165 // | /------------------------------|
166 // V V |
167 // +-LoopBB---------------------------------------------------------------+ |
168 // | %CurrentLaneReg:sgpr(s32) = READFIRSTLANE %Vgpr | |
169 // | instead of executing for each lane, see if other lanes had | |
170 // | same value for %Vgpr and execute for them also. | |
171 // | %CondReg:vcc(s1) = G_ICMP eq %CurrentLaneReg, %Vgpr | |
172 // | %CondRegLM:sreg_32 = ballot %CondReg // copy vcc to sreg32 lane mask | |
173 // | %SavedExec = S_AND_SAVEEXEC_B32 %CondRegLM | |
174 // | exec is active for lanes with the same "CurrentLane value" in Vgpr | |
175 // +----------------|-----------------------------------------------------+ |
176 // V |
177 // +-BodyBB------------------------------------------------------------+ |
178 // | %Dst = MI %CurrentLaneReg:sgpr(s32) | |
179 // | executed only for active lanes and written to Dst | |
180 // | $exec = S_XOR_B32 $exec, %SavedExec | |
181 // | set active lanes to 0 in SavedExec, lanes that did not write to | |
182 // | Dst yet, and set this as new exec (for READFIRSTLANE and ICMP) | |
183 // | SI_WATERFALL_LOOP LoopBB |-----|
184 // +----------------|--------------------------------------------------+
185 // V
186 // +-RestoreExecBB--------------------------+
187 // | $exec_lo = S_MOV_B32_term %SaveExecReg |
188 // +----------------|-----------------------+
189 // V
190 // +-RemainderBB:----------------------+
191 // | %1 = G_INST_2 |
192 // | ... |
193 // +---------------------------------- +
194
195 // Move the instruction into the loop body. Note we moved everything after
196 // Range.end() already into a new block, so Range.end() is no longer valid.
197 BodyBB->splice(Where: BodyBB->end(), Other: &MBB, From: BeginIt, To: MBB.end());
198
199 // Figure out the iterator range after splicing the instructions.
200 MachineBasicBlock::iterator NewBegin = BeginIt;
201 auto NewEnd = BodyBB->end();
202 assert(std::distance(NewBegin, NewEnd) == OrigRangeSize);
203
204 B.setMBB(*LoopBB);
205 Register CondReg;
206
207 for (MachineInstr &MI : make_range(x: NewBegin, y: NewEnd)) {
208 for (MachineOperand &Op : MI.all_uses()) {
209 Register OldReg = Op.getReg();
210 if (!WFI.SgprWaterfallOperandRegs.count(V: OldReg))
211 continue;
212
213 // See if we already processed this register in another instruction in
214 // the sequence.
215 auto OldVal = WaterfalledRegMap.find(Val: OldReg);
216 if (OldVal != WaterfalledRegMap.end()) {
217 Op.setReg(OldVal->second);
218 continue;
219 }
220
221 Register OpReg = Op.getReg();
222 LLT OpTy = MRI.getType(Reg: OpReg);
223
224 // TODO: support for agpr
225 assert(MRI.getRegBank(OpReg) == VgprRB);
226 Register CurrentLaneReg = MRI.createVirtualRegister(RegAttr: {.RCOrRB: SgprRB, .Ty: OpTy});
227 buildReadFirstLane(B, SgprDst: CurrentLaneReg, VgprSrc: OpReg, RBI);
228
229 // Build the comparison(s), CurrentLaneReg == OpReg.
230 unsigned OpSize = OpTy.getSizeInBits();
231 unsigned PartSize = (OpSize % 64 == 0) ? 64 : 32;
232 LLT PartTy = LLT::scalar(SizeInBits: PartSize);
233 unsigned NumParts = OpSize / PartSize;
234 SmallVector<Register, 8> OpParts;
235 SmallVector<Register, 8> CurrentLaneParts;
236
237 if (NumParts == 1) {
238 OpParts.push_back(Elt: OpReg);
239 CurrentLaneParts.push_back(Elt: CurrentLaneReg);
240 } else {
241 auto UnmergeOp = B.buildUnmerge(Attrs: {.RCOrRB: VgprRB, .Ty: PartTy}, Op: OpReg);
242 auto UnmergeCurrLane = B.buildUnmerge(Attrs: {.RCOrRB: SgprRB, .Ty: PartTy}, Op: CurrentLaneReg);
243 for (unsigned i = 0; i < NumParts; ++i) {
244 OpParts.push_back(Elt: UnmergeOp.getReg(Idx: i));
245 CurrentLaneParts.push_back(Elt: UnmergeCurrLane.getReg(Idx: i));
246 }
247 }
248
249 for (unsigned i = 0; i < NumParts; ++i) {
250 Register CmpReg = MRI.createVirtualRegister(RegAttr: VccRB_S1);
251 B.buildICmp(Pred: CmpInst::ICMP_EQ, Res: CmpReg, Op0: CurrentLaneParts[i], Op1: OpParts[i]);
252
253 if (!CondReg)
254 CondReg = CmpReg;
255 else
256 CondReg = B.buildAnd(Dst: VccRB_S1, Src0: CondReg, Src1: CmpReg).getReg(Idx: 0);
257 }
258
259 Op.setReg(CurrentLaneReg);
260
261 // Make sure we don't re-process this register again.
262 WaterfalledRegMap.insert(KV: std::pair(OldReg, Op.getReg()));
263 }
264 }
265
266 // Copy vcc to sgpr32/64, ballot becomes a no-op during instruction selection.
267 Register CondRegLM =
268 MRI.createVirtualRegister(RegAttr: {.RCOrRB: WaveRC, .Ty: LLT::scalar(SizeInBits: IsWave32 ? 32 : 64)});
269 B.buildIntrinsic(ID: Intrinsic::amdgcn_ballot, Res: CondRegLM).addReg(RegNo: CondReg);
270
271 // Update EXEC, save the original EXEC value to SavedExec.
272 B.buildInstr(Opcode: AndSaveExecOpc)
273 .addDef(RegNo: SavedExec)
274 .addReg(RegNo: CondRegLM, Flags: RegState::Kill);
275 MRI.setSimpleHint(VReg: SavedExec, PrefReg: CondRegLM);
276
277 B.setInsertPt(MBB&: *BodyBB, II: BodyBB->end());
278
279 // Update EXEC, switch all done bits to 0 and all todo bits to 1.
280 B.buildInstr(Opcode: XorTermOpc).addDef(RegNo: ExecReg).addReg(RegNo: ExecReg).addReg(RegNo: SavedExec);
281
282 // XXX - s_xor_b64 sets scc to 1 if the result is nonzero, so can we use
283 // s_cbranch_scc0?
284
285 // Loop back to V_READFIRSTLANE_B32 if there are still variants to cover.
286 B.buildInstr(Opcode: AMDGPU::SI_WATERFALL_LOOP).addMBB(MBB: LoopBB);
287
288 // Save the EXEC mask before the loop.
289 B.setInsertPt(MBB, II: MBB.end());
290 B.buildInstr(Opcode: MovExecOpc).addDef(RegNo: SaveExecReg).addReg(RegNo: ExecReg);
291
292 // Restore the EXEC mask after the loop.
293 B.setInsertPt(MBB&: *RestoreExecBB, II: RestoreExecBB->begin());
294 B.buildInstr(Opcode: MovExecTermOpc).addDef(RegNo: ExecReg).addReg(RegNo: SaveExecReg);
295
296 // Set the insert point after the original instruction, so any new
297 // instructions will be in the remainder.
298 B.setInsertPt(MBB&: *RemainderBB, II: RemainderBB->begin());
299
300 return true;
301}
302
303bool RegBankLegalizeHelper::splitLoad(MachineInstr &MI,
304 ArrayRef<LLT> LLTBreakdown, LLT MergeTy) {
305 MachineFunction &MF = B.getMF();
306 assert(MI.getNumMemOperands() == 1);
307 MachineMemOperand &BaseMMO = **MI.memoperands_begin();
308 Register Dst = MI.getOperand(i: 0).getReg();
309 const RegisterBank *DstRB = MRI.getRegBankOrNull(Reg: Dst);
310 Register Base = MI.getOperand(i: 1).getReg();
311 LLT PtrTy = MRI.getType(Reg: Base);
312 const RegisterBank *PtrRB = MRI.getRegBankOrNull(Reg: Base);
313 LLT OffsetTy = LLT::scalar(SizeInBits: PtrTy.getSizeInBits());
314 SmallVector<Register, 4> LoadPartRegs;
315
316 unsigned ByteOffset = 0;
317 for (LLT PartTy : LLTBreakdown) {
318 Register BasePlusOffset;
319 if (ByteOffset == 0) {
320 BasePlusOffset = Base;
321 } else {
322 auto Offset = B.buildConstant(Res: {PtrRB, OffsetTy}, Val: ByteOffset);
323 BasePlusOffset =
324 B.buildObjectPtrOffset(Res: {PtrRB, PtrTy}, Op0: Base, Op1: Offset).getReg(Idx: 0);
325 }
326 auto *OffsetMMO = MF.getMachineMemOperand(MMO: &BaseMMO, Offset: ByteOffset, Ty: PartTy);
327 auto LoadPart = B.buildLoad(Res: {DstRB, PartTy}, Addr: BasePlusOffset, MMO&: *OffsetMMO);
328 LoadPartRegs.push_back(Elt: LoadPart.getReg(Idx: 0));
329 ByteOffset += PartTy.getSizeInBytes();
330 }
331
332 if (!MergeTy.isValid()) {
333 // Loads are of same size, concat or merge them together.
334 B.buildMergeLikeInstr(Res: Dst, Ops: LoadPartRegs);
335 } else {
336 // Loads are not all of same size, need to unmerge them to smaller pieces
337 // of MergeTy type, then merge pieces to Dst.
338 SmallVector<Register, 4> MergeTyParts;
339 for (Register Reg : LoadPartRegs) {
340 if (MRI.getType(Reg) == MergeTy) {
341 MergeTyParts.push_back(Elt: Reg);
342 } else {
343 auto Unmerge = B.buildUnmerge(Attrs: {.RCOrRB: DstRB, .Ty: MergeTy}, Op: Reg);
344 for (unsigned i = 0; i < Unmerge->getNumOperands() - 1; ++i)
345 MergeTyParts.push_back(Elt: Unmerge.getReg(Idx: i));
346 }
347 }
348 B.buildMergeLikeInstr(Res: Dst, Ops: MergeTyParts);
349 }
350 MI.eraseFromParent();
351 return true;
352}
353
354bool RegBankLegalizeHelper::widenLoad(MachineInstr &MI, LLT WideTy,
355 LLT MergeTy) {
356 MachineFunction &MF = B.getMF();
357 assert(MI.getNumMemOperands() == 1);
358 MachineMemOperand &BaseMMO = **MI.memoperands_begin();
359 Register Dst = MI.getOperand(i: 0).getReg();
360 const RegisterBank *DstRB = MRI.getRegBankOrNull(Reg: Dst);
361 Register Base = MI.getOperand(i: 1).getReg();
362
363 MachineMemOperand *WideMMO = MF.getMachineMemOperand(MMO: &BaseMMO, Offset: 0, Ty: WideTy);
364 auto WideLoad = B.buildLoad(Res: {DstRB, WideTy}, Addr: Base, MMO&: *WideMMO);
365
366 if (WideTy.isScalar()) {
367 B.buildTrunc(Res: Dst, Op: WideLoad);
368 } else {
369 SmallVector<Register, 4> MergeTyParts;
370 auto Unmerge = B.buildUnmerge(Attrs: {.RCOrRB: DstRB, .Ty: MergeTy}, Op: WideLoad);
371
372 LLT DstTy = MRI.getType(Reg: Dst);
373 unsigned NumElts = DstTy.getSizeInBits() / MergeTy.getSizeInBits();
374 for (unsigned i = 0; i < NumElts; ++i) {
375 MergeTyParts.push_back(Elt: Unmerge.getReg(Idx: i));
376 }
377 B.buildMergeLikeInstr(Res: Dst, Ops: MergeTyParts);
378 }
379 MI.eraseFromParent();
380 return true;
381}
382
383bool RegBankLegalizeHelper::widenMMOToS32(GAnyLoad &MI) const {
384 Register Dst = MI.getDstReg();
385 Register Ptr = MI.getPointerReg();
386 MachineMemOperand &MMO = MI.getMMO();
387 unsigned MemSize = 8 * MMO.getSize().getValue();
388
389 MachineMemOperand *WideMMO = B.getMF().getMachineMemOperand(MMO: &MMO, Offset: 0, Ty: S32);
390
391 if (MI.getOpcode() == G_LOAD) {
392 B.buildLoad(Res: Dst, Addr: Ptr, MMO&: *WideMMO);
393 } else {
394 auto Load = B.buildLoad(Res: SgprRB_S32, Addr: Ptr, MMO&: *WideMMO);
395
396 if (MI.getOpcode() == G_ZEXTLOAD) {
397 APInt Mask = APInt::getLowBitsSet(numBits: S32.getSizeInBits(), loBitsSet: MemSize);
398 auto MaskCst = B.buildConstant(Res: SgprRB_S32, Val: Mask);
399 B.buildAnd(Dst, Src0: Load, Src1: MaskCst);
400 } else {
401 assert(MI.getOpcode() == G_SEXTLOAD);
402 B.buildSExtInReg(Res: Dst, Op: Load, ImmOp: MemSize);
403 }
404 }
405
406 MI.eraseFromParent();
407 return true;
408}
409
410bool RegBankLegalizeHelper::lowerVccExtToSel(MachineInstr &MI) {
411 Register Dst = MI.getOperand(i: 0).getReg();
412 LLT Ty = MRI.getType(Reg: Dst);
413 Register Src = MI.getOperand(i: 1).getReg();
414 unsigned Opc = MI.getOpcode();
415 int TrueExtCst = Opc == G_SEXT ? -1 : 1;
416 if (Ty == S32 || Ty == S16) {
417 auto True = B.buildConstant(Res: {VgprRB, Ty}, Val: TrueExtCst);
418 auto False = B.buildConstant(Res: {VgprRB, Ty}, Val: 0);
419 B.buildSelect(Res: Dst, Tst: Src, Op0: True, Op1: False);
420 } else if (Ty == S64) {
421 auto True = B.buildConstant(Res: {VgprRB_S32}, Val: TrueExtCst);
422 auto False = B.buildConstant(Res: {VgprRB_S32}, Val: 0);
423 auto Lo = B.buildSelect(Res: {VgprRB_S32}, Tst: Src, Op0: True, Op1: False);
424 MachineInstrBuilder Hi;
425 switch (Opc) {
426 case G_SEXT:
427 Hi = Lo;
428 break;
429 case G_ZEXT:
430 Hi = False;
431 break;
432 case G_ANYEXT:
433 Hi = B.buildUndef(Res: {VgprRB_S32});
434 break;
435 default:
436 reportGISelFailure(
437 MF, MORE, PassName: "amdgpu-regbanklegalize",
438 Msg: "AMDGPU RegBankLegalize: lowerVccExtToSel, Opcode not supported", MI);
439 return false;
440 }
441
442 B.buildMergeValues(Res: Dst, Ops: {Lo.getReg(Idx: 0), Hi.getReg(Idx: 0)});
443 } else {
444 reportGISelFailure(
445 MF, MORE, PassName: "amdgpu-regbanklegalize",
446 Msg: "AMDGPU RegBankLegalize: lowerVccExtToSel, Type not supported", MI);
447 return false;
448 }
449
450 MI.eraseFromParent();
451 return true;
452}
453
454std::pair<Register, Register> RegBankLegalizeHelper::unpackZExt(Register Reg) {
455 auto PackedS32 = B.buildBitcast(Dst: SgprRB_S32, Src: Reg);
456 auto Mask = B.buildConstant(Res: SgprRB_S32, Val: 0x0000ffff);
457 auto Lo = B.buildAnd(Dst: SgprRB_S32, Src0: PackedS32, Src1: Mask);
458 auto Hi = B.buildLShr(Dst: SgprRB_S32, Src0: PackedS32, Src1: B.buildConstant(Res: SgprRB_S32, Val: 16));
459 return {Lo.getReg(Idx: 0), Hi.getReg(Idx: 0)};
460}
461
462std::pair<Register, Register> RegBankLegalizeHelper::unpackSExt(Register Reg) {
463 auto PackedS32 = B.buildBitcast(Dst: SgprRB_S32, Src: Reg);
464 auto Lo = B.buildSExtInReg(Res: SgprRB_S32, Op: PackedS32, ImmOp: 16);
465 auto Hi = B.buildAShr(Dst: SgprRB_S32, Src0: PackedS32, Src1: B.buildConstant(Res: SgprRB_S32, Val: 16));
466 return {Lo.getReg(Idx: 0), Hi.getReg(Idx: 0)};
467}
468
469std::pair<Register, Register> RegBankLegalizeHelper::unpackAExt(Register Reg) {
470 auto PackedS32 = B.buildBitcast(Dst: SgprRB_S32, Src: Reg);
471 auto Lo = PackedS32;
472 auto Hi = B.buildLShr(Dst: SgprRB_S32, Src0: PackedS32, Src1: B.buildConstant(Res: SgprRB_S32, Val: 16));
473 return {Lo.getReg(Idx: 0), Hi.getReg(Idx: 0)};
474}
475
476std::pair<Register, Register>
477RegBankLegalizeHelper::unpackAExtTruncS16(Register Reg) {
478 auto [Lo32, Hi32] = unpackAExt(Reg);
479 return {B.buildTrunc(Res: SgprRB_S16, Op: Lo32).getReg(Idx: 0),
480 B.buildTrunc(Res: SgprRB_S16, Op: Hi32).getReg(Idx: 0)};
481}
482
483bool RegBankLegalizeHelper::lowerUnpackBitShift(MachineInstr &MI) {
484 Register Lo, Hi;
485 switch (MI.getOpcode()) {
486 case AMDGPU::G_SHL: {
487 auto [Val0, Val1] = unpackAExt(Reg: MI.getOperand(i: 1).getReg());
488 auto [Amt0, Amt1] = unpackAExt(Reg: MI.getOperand(i: 2).getReg());
489 Lo = B.buildInstr(Opc: MI.getOpcode(), DstOps: {SgprRB_S32}, SrcOps: {Val0, Amt0}).getReg(Idx: 0);
490 Hi = B.buildInstr(Opc: MI.getOpcode(), DstOps: {SgprRB_S32}, SrcOps: {Val1, Amt1}).getReg(Idx: 0);
491 break;
492 }
493 case AMDGPU::G_LSHR: {
494 auto [Val0, Val1] = unpackZExt(Reg: MI.getOperand(i: 1).getReg());
495 auto [Amt0, Amt1] = unpackZExt(Reg: MI.getOperand(i: 2).getReg());
496 Lo = B.buildInstr(Opc: MI.getOpcode(), DstOps: {SgprRB_S32}, SrcOps: {Val0, Amt0}).getReg(Idx: 0);
497 Hi = B.buildInstr(Opc: MI.getOpcode(), DstOps: {SgprRB_S32}, SrcOps: {Val1, Amt1}).getReg(Idx: 0);
498 break;
499 }
500 case AMDGPU::G_ASHR: {
501 auto [Val0, Val1] = unpackSExt(Reg: MI.getOperand(i: 1).getReg());
502 auto [Amt0, Amt1] = unpackSExt(Reg: MI.getOperand(i: 2).getReg());
503 Lo = B.buildAShr(Dst: SgprRB_S32, Src0: Val0, Src1: Amt0).getReg(Idx: 0);
504 Hi = B.buildAShr(Dst: SgprRB_S32, Src0: Val1, Src1: Amt1).getReg(Idx: 0);
505 break;
506 }
507 default:
508 reportGISelFailure(
509 MF, MORE, PassName: "amdgpu-regbanklegalize",
510 Msg: "AMDGPU RegBankLegalize: lowerUnpackBitShift, case not implemented",
511 MI);
512 return false;
513 }
514 B.buildBuildVectorTrunc(Res: MI.getOperand(i: 0).getReg(), Ops: {Lo, Hi});
515 MI.eraseFromParent();
516 return true;
517}
518
519bool RegBankLegalizeHelper::lowerUnpackMinMax(MachineInstr &MI) {
520 Register Lo, Hi;
521 switch (MI.getOpcode()) {
522 case AMDGPU::G_SMIN:
523 case AMDGPU::G_SMAX: {
524 // For signed operations, use sign extension
525 auto [Val0_Lo, Val0_Hi] = unpackSExt(Reg: MI.getOperand(i: 1).getReg());
526 auto [Val1_Lo, Val1_Hi] = unpackSExt(Reg: MI.getOperand(i: 2).getReg());
527 Lo = B.buildInstr(Opc: MI.getOpcode(), DstOps: {SgprRB_S32}, SrcOps: {Val0_Lo, Val1_Lo})
528 .getReg(Idx: 0);
529 Hi = B.buildInstr(Opc: MI.getOpcode(), DstOps: {SgprRB_S32}, SrcOps: {Val0_Hi, Val1_Hi})
530 .getReg(Idx: 0);
531 break;
532 }
533 case AMDGPU::G_UMIN:
534 case AMDGPU::G_UMAX: {
535 // For unsigned operations, use zero extension
536 auto [Val0_Lo, Val0_Hi] = unpackZExt(Reg: MI.getOperand(i: 1).getReg());
537 auto [Val1_Lo, Val1_Hi] = unpackZExt(Reg: MI.getOperand(i: 2).getReg());
538 Lo = B.buildInstr(Opc: MI.getOpcode(), DstOps: {SgprRB_S32}, SrcOps: {Val0_Lo, Val1_Lo})
539 .getReg(Idx: 0);
540 Hi = B.buildInstr(Opc: MI.getOpcode(), DstOps: {SgprRB_S32}, SrcOps: {Val0_Hi, Val1_Hi})
541 .getReg(Idx: 0);
542 break;
543 }
544 default:
545 reportGISelFailure(
546 MF, MORE, PassName: "amdgpu-regbanklegalize",
547 Msg: "AMDGPU RegBankLegalize: lowerUnpackMinMax, case not implemented", MI);
548 return false;
549 }
550 B.buildBuildVectorTrunc(Res: MI.getOperand(i: 0).getReg(), Ops: {Lo, Hi});
551 MI.eraseFromParent();
552 return true;
553}
554
555bool RegBankLegalizeHelper::lowerUnpackAExt(MachineInstr &MI) {
556 auto [Op1Lo, Op1Hi] = unpackAExt(Reg: MI.getOperand(i: 1).getReg());
557 auto [Op2Lo, Op2Hi] = unpackAExt(Reg: MI.getOperand(i: 2).getReg());
558 auto ResLo = B.buildInstr(Opc: MI.getOpcode(), DstOps: {SgprRB_S32}, SrcOps: {Op1Lo, Op2Lo});
559 auto ResHi = B.buildInstr(Opc: MI.getOpcode(), DstOps: {SgprRB_S32}, SrcOps: {Op1Hi, Op2Hi});
560 B.buildBuildVectorTrunc(Res: MI.getOperand(i: 0).getReg(),
561 Ops: {ResLo.getReg(Idx: 0), ResHi.getReg(Idx: 0)});
562 MI.eraseFromParent();
563 return true;
564}
565
566static bool isSignedBFE(MachineInstr &MI) {
567 if (GIntrinsic *GI = dyn_cast<GIntrinsic>(Val: &MI))
568 return (GI->is(ID: Intrinsic::amdgcn_sbfe));
569
570 return MI.getOpcode() == AMDGPU::G_SBFX;
571}
572
573bool RegBankLegalizeHelper::lowerV_BFE(MachineInstr &MI) {
574 Register Dst = MI.getOperand(i: 0).getReg();
575 assert(MRI.getType(Dst) == LLT::scalar(64));
576 bool Signed = isSignedBFE(MI);
577 unsigned FirstOpnd = isa<GIntrinsic>(Val: MI) ? 2 : 1;
578 // Extract bitfield from Src, LSBit is the least-significant bit for the
579 // extraction (field offset) and Width is size of bitfield.
580 Register Src = MI.getOperand(i: FirstOpnd).getReg();
581 Register LSBit = MI.getOperand(i: FirstOpnd + 1).getReg();
582 Register Width = MI.getOperand(i: FirstOpnd + 2).getReg();
583 // Comments are for signed bitfield extract, similar for unsigned. x is sign
584 // bit. s is sign, l is LSB and y are remaining bits of bitfield to extract.
585
586 // Src >> LSBit Hi|Lo: x?????syyyyyyl??? -> xxxx?????syyyyyyl
587 unsigned SHROpc = Signed ? AMDGPU::G_ASHR : AMDGPU::G_LSHR;
588 auto SHRSrc = B.buildInstr(Opc: SHROpc, DstOps: {{VgprRB, S64}}, SrcOps: {Src, LSBit});
589
590 auto ConstWidth = getIConstantVRegValWithLookThrough(VReg: Width, MRI);
591
592 // Expand to Src >> LSBit << (64 - Width) >> (64 - Width)
593 // << (64 - Width): Hi|Lo: xxxx?????syyyyyyl -> syyyyyyl000000000
594 // >> (64 - Width): Hi|Lo: syyyyyyl000000000 -> ssssssssssyyyyyyl
595 if (!ConstWidth) {
596 auto Amt = B.buildSub(Dst: VgprRB_S32, Src0: B.buildConstant(Res: SgprRB_S32, Val: 64), Src1: Width);
597 auto SignBit = B.buildShl(Dst: {VgprRB, S64}, Src0: SHRSrc, Src1: Amt);
598 B.buildInstr(Opc: SHROpc, DstOps: {Dst}, SrcOps: {SignBit, Amt});
599 MI.eraseFromParent();
600 return true;
601 }
602
603 uint64_t WidthImm = ConstWidth->Value.getZExtValue();
604 auto UnmergeSHRSrc = B.buildUnmerge(Attrs: VgprRB_S32, Op: SHRSrc);
605 Register SHRSrcLo = UnmergeSHRSrc.getReg(Idx: 0);
606 Register SHRSrcHi = UnmergeSHRSrc.getReg(Idx: 1);
607 auto Zero = B.buildConstant(Res: {VgprRB, S32}, Val: 0);
608 unsigned BFXOpc = Signed ? AMDGPU::G_SBFX : AMDGPU::G_UBFX;
609
610 if (WidthImm <= 32) {
611 // SHRSrc Hi|Lo: ????????|???syyyl -> ????????|ssssyyyl
612 auto Lo = B.buildInstr(Opc: BFXOpc, DstOps: {VgprRB_S32}, SrcOps: {SHRSrcLo, Zero, Width});
613 MachineInstrBuilder Hi;
614 if (Signed) {
615 // SHRSrc Hi|Lo: ????????|ssssyyyl -> ssssssss|ssssyyyl
616 Hi = B.buildAShr(Dst: VgprRB_S32, Src0: Lo, Src1: B.buildConstant(Res: VgprRB_S32, Val: 31));
617 } else {
618 // SHRSrc Hi|Lo: ????????|000syyyl -> 00000000|000syyyl
619 Hi = Zero;
620 }
621 B.buildMergeLikeInstr(Res: Dst, Ops: {Lo, Hi});
622 } else {
623 auto Amt = B.buildConstant(Res: VgprRB_S32, Val: WidthImm - 32);
624 // SHRSrc Hi|Lo: ??????sy|yyyyyyyl -> sssssssy|yyyyyyyl
625 auto Hi = B.buildInstr(Opc: BFXOpc, DstOps: {VgprRB_S32}, SrcOps: {SHRSrcHi, Zero, Amt});
626 B.buildMergeLikeInstr(Res: Dst, Ops: {SHRSrcLo, Hi});
627 }
628
629 MI.eraseFromParent();
630 return true;
631}
632
633bool RegBankLegalizeHelper::lowerS_BFE(MachineInstr &MI) {
634 Register DstReg = MI.getOperand(i: 0).getReg();
635 LLT Ty = MRI.getType(Reg: DstReg);
636 bool Signed = isSignedBFE(MI);
637 unsigned FirstOpnd = isa<GIntrinsic>(Val: MI) ? 2 : 1;
638 Register Src = MI.getOperand(i: FirstOpnd).getReg();
639 Register LSBit = MI.getOperand(i: FirstOpnd + 1).getReg();
640 Register Width = MI.getOperand(i: FirstOpnd + 2).getReg();
641 // For uniform bit field extract there are 4 available instructions, but
642 // LSBit(field offset) and Width(size of bitfield) need to be packed in S32,
643 // field offset in low and size in high 16 bits.
644
645 // Src1 Hi16|Lo16 = Size|FieldOffset
646 auto Mask = B.buildConstant(Res: SgprRB_S32, Val: maskTrailingOnes<unsigned>(N: 6));
647 auto FieldOffset = B.buildAnd(Dst: SgprRB_S32, Src0: LSBit, Src1: Mask);
648 auto Size = B.buildShl(Dst: SgprRB_S32, Src0: Width, Src1: B.buildConstant(Res: SgprRB_S32, Val: 16));
649 auto Src1 = B.buildOr(Dst: SgprRB_S32, Src0: FieldOffset, Src1: Size);
650 unsigned Opc32 = Signed ? AMDGPU::S_BFE_I32 : AMDGPU::S_BFE_U32;
651 unsigned Opc64 = Signed ? AMDGPU::S_BFE_I64 : AMDGPU::S_BFE_U64;
652 unsigned Opc = Ty == S32 ? Opc32 : Opc64;
653
654 // Select machine instruction, because of reg class constraining, insert
655 // copies from reg class to reg bank.
656 auto S_BFE = B.buildInstr(Opc, DstOps: {{SgprRB, Ty}},
657 SrcOps: {B.buildCopy(Res: Ty, Op: Src), B.buildCopy(Res: S32, Op: Src1)});
658 constrainSelectedInstRegOperands(I&: *S_BFE, TII: *ST.getInstrInfo(),
659 TRI: *ST.getRegisterInfo(), RBI);
660
661 B.buildCopy(Res: DstReg, Op: S_BFE->getOperand(i: 0).getReg());
662 MI.eraseFromParent();
663 return true;
664}
665
666bool RegBankLegalizeHelper::lowerSplitTo32(MachineInstr &MI) {
667 Register Dst = MI.getOperand(i: 0).getReg();
668 LLT DstTy = MRI.getType(Reg: Dst);
669 assert(DstTy == V4S16 || DstTy == V2S32 || DstTy == S64);
670 LLT Ty = DstTy == V4S16 ? V2S16 : S32;
671 auto Op1 = B.buildUnmerge(Attrs: {.RCOrRB: VgprRB, .Ty: Ty}, Op: MI.getOperand(i: 1).getReg());
672 auto Op2 = B.buildUnmerge(Attrs: {.RCOrRB: VgprRB, .Ty: Ty}, Op: MI.getOperand(i: 2).getReg());
673 unsigned Opc = MI.getOpcode();
674 auto Flags = MI.getFlags();
675 auto Lo =
676 B.buildInstr(Opc, DstOps: {{VgprRB, Ty}}, SrcOps: {Op1.getReg(Idx: 0), Op2.getReg(Idx: 0)}, Flags);
677 auto Hi =
678 B.buildInstr(Opc, DstOps: {{VgprRB, Ty}}, SrcOps: {Op1.getReg(Idx: 1), Op2.getReg(Idx: 1)}, Flags);
679 B.buildMergeLikeInstr(Res: Dst, Ops: {Lo, Hi});
680 MI.eraseFromParent();
681 return true;
682}
683
684bool RegBankLegalizeHelper::lowerSplitTo32Mul(MachineInstr &MI) {
685 Register Dst = MI.getOperand(i: 0).getReg();
686 assert(MRI.getType(Dst) == S64);
687 auto Op1 = B.buildUnmerge(Attrs: {VgprRB_S32}, Op: MI.getOperand(i: 1).getReg());
688 auto Op2 = B.buildUnmerge(Attrs: {VgprRB_S32}, Op: MI.getOperand(i: 2).getReg());
689
690 // TODO: G_AMDGPU_MAD_* optimizations for G_MUL divergent S64 operation to
691 // match GlobalISel with old regbankselect.
692 auto Lo = B.buildMul(Dst: VgprRB_S32, Src0: Op1.getReg(Idx: 0), Src1: Op2.getReg(Idx: 0));
693 auto Carry = B.buildUMulH(Dst: VgprRB_S32, Src0: Op1.getReg(Idx: 0), Src1: Op2.getReg(Idx: 0));
694 auto MulLo0Hi1 = B.buildMul(Dst: VgprRB_S32, Src0: Op1.getReg(Idx: 0), Src1: Op2.getReg(Idx: 1));
695 auto MulHi0Lo1 = B.buildMul(Dst: VgprRB_S32, Src0: Op1.getReg(Idx: 1), Src1: Op2.getReg(Idx: 0));
696 auto Sum = B.buildAdd(Dst: VgprRB_S32, Src0: MulLo0Hi1, Src1: MulHi0Lo1);
697 auto Hi = B.buildAdd(Dst: VgprRB_S32, Src0: Sum, Src1: Carry);
698
699 B.buildMergeLikeInstr(Res: Dst, Ops: {Lo, Hi});
700 MI.eraseFromParent();
701 return true;
702}
703
704bool RegBankLegalizeHelper::lowerSplitTo16(MachineInstr &MI) {
705 Register Dst = MI.getOperand(i: 0).getReg();
706 assert(MRI.getType(Dst) == V2S16);
707 unsigned Opc = MI.getOpcode();
708 unsigned NumOps = MI.getNumOperands();
709 auto Flags = MI.getFlags();
710
711 auto [Op1Lo, Op1Hi] = unpackAExtTruncS16(Reg: MI.getOperand(i: 1).getReg());
712
713 if (NumOps == 2) {
714 auto Lo = B.buildInstr(Opc, DstOps: {SgprRB_S16}, SrcOps: {Op1Lo}, Flags);
715 auto Hi = B.buildInstr(Opc, DstOps: {SgprRB_S16}, SrcOps: {Op1Hi}, Flags);
716 B.buildMergeLikeInstr(Res: Dst, Ops: {Lo, Hi});
717 MI.eraseFromParent();
718 return true;
719 }
720
721 auto [Op2Lo, Op2Hi] = unpackAExtTruncS16(Reg: MI.getOperand(i: 2).getReg());
722
723 if (NumOps == 3) {
724 auto Lo = B.buildInstr(Opc, DstOps: {SgprRB_S16}, SrcOps: {Op1Lo, Op2Lo}, Flags);
725 auto Hi = B.buildInstr(Opc, DstOps: {SgprRB_S16}, SrcOps: {Op1Hi, Op2Hi}, Flags);
726 B.buildMergeLikeInstr(Res: Dst, Ops: {Lo, Hi});
727 MI.eraseFromParent();
728 return true;
729 }
730
731 assert(NumOps == 4);
732 auto [Op3Lo, Op3Hi] = unpackAExtTruncS16(Reg: MI.getOperand(i: 3).getReg());
733 auto Lo = B.buildInstr(Opc, DstOps: {SgprRB_S16}, SrcOps: {Op1Lo, Op2Lo, Op3Lo}, Flags);
734 auto Hi = B.buildInstr(Opc, DstOps: {SgprRB_S16}, SrcOps: {Op1Hi, Op2Hi, Op3Hi}, Flags);
735 B.buildMergeLikeInstr(Res: Dst, Ops: {Lo, Hi});
736 MI.eraseFromParent();
737 return true;
738}
739
740bool RegBankLegalizeHelper::lowerUniMAD64(MachineInstr &MI) {
741 Register Dst0 = MI.getOperand(i: 0).getReg();
742 Register Dst1 = MI.getOperand(i: 1).getReg();
743 Register Src0 = MI.getOperand(i: 2).getReg();
744 Register Src1 = MI.getOperand(i: 3).getReg();
745 Register Src2 = MI.getOperand(i: 4).getReg();
746
747 const GCNSubtarget &ST = B.getMF().getSubtarget<GCNSubtarget>();
748
749 // Keep the multiplication on the SALU.
750 Register DstLo = B.buildMul(Dst: SgprRB_S32, Src0, Src1).getReg(Idx: 0);
751 Register DstHi = MRI.createVirtualRegister(RegAttr: SgprRB_S32);
752 if (ST.hasScalarMulHiInsts()) {
753 B.buildInstr(Opc: AMDGPU::G_UMULH, DstOps: {{DstHi}}, SrcOps: {Src0, Src1});
754 } else {
755 auto VSrc0 = B.buildCopy(Res: VgprRB_S32, Op: Src0);
756 auto VSrc1 = B.buildCopy(Res: VgprRB_S32, Op: Src1);
757 auto MulHi = B.buildInstr(Opc: AMDGPU::G_UMULH, DstOps: {VgprRB_S32}, SrcOps: {VSrc0, VSrc1});
758 buildReadAnyLane(B, SgprDst: DstHi, VgprSrc: MulHi.getReg(Idx: 0), RBI);
759 }
760
761 // Accumulate and produce the "carry-out" bit.
762
763 // The "carry-out" is defined as bit 64 of the result when computed as a
764 // big integer. For unsigned multiply-add, this matches the usual
765 // definition of carry-out.
766 if (mi_match(R: Src2, MRI, P: MIPatternMatch::m_ZeroInt())) {
767 // No accumulate: result is just the multiplication, carry is 0.
768 B.buildMergeLikeInstr(Res: Dst0, Ops: {DstLo, DstHi});
769 B.buildConstant(Res: Dst1, Val: 0);
770 } else {
771 // Accumulate: add Src2 to the multiplication result with carry chain.
772 Register Src2Lo = MRI.createVirtualRegister(RegAttr: SgprRB_S32);
773 Register Src2Hi = MRI.createVirtualRegister(RegAttr: SgprRB_S32);
774 B.buildUnmerge(Res: {Src2Lo, Src2Hi}, Op: Src2);
775
776 auto AddLo = B.buildUAddo(Res: SgprRB_S32, CarryOut: SgprRB_S32, Op0: DstLo, Op1: Src2Lo);
777 auto AddHi =
778 B.buildUAdde(Res: SgprRB_S32, CarryOut: SgprRB_S32, Op0: DstHi, Op1: Src2Hi, CarryIn: AddLo.getReg(Idx: 1));
779 B.buildMergeLikeInstr(Res: Dst0, Ops: {AddLo.getReg(Idx: 0), AddHi.getReg(Idx: 0)});
780 B.buildCopy(Res: Dst1, Op: AddHi.getReg(Idx: 1));
781 }
782
783 MI.eraseFromParent();
784 return true;
785}
786
/// Lower a 64-bit-wide select (s64, v2s32, v4s16, or a 64-bit pointer) by
/// unmerging both value operands into two halves, selecting each half with the
/// shared condition, and re-merging the two selected halves into Dst.
bool RegBankLegalizeHelper::lowerSplitTo32Select(MachineInstr &MI) {
  Register Dst = MI.getOperand(i: 0).getReg();
  LLT DstTy = MRI.getType(Reg: Dst);
  assert(DstTy == V4S16 || DstTy == V2S32 || DstTy == S64 ||
         (DstTy.isPointer() && DstTy.getSizeInBits() == 64));
  // Half type: v2s16 for v4s16 destinations, otherwise plain s32.
  LLT Ty = DstTy == V4S16 ? V2S16 : S32;
  auto Op2 = B.buildUnmerge(Attrs: {.RCOrRB: VgprRB, .Ty: Ty}, Op: MI.getOperand(i: 2).getReg());
  auto Op3 = B.buildUnmerge(Attrs: {.RCOrRB: VgprRB, .Ty: Ty}, Op: MI.getOperand(i: 3).getReg());
  Register Cond = MI.getOperand(i: 1).getReg();
  // Preserve MI flags (e.g. fast-math) on both half-selects.
  auto Flags = MI.getFlags();
  auto Lo =
      B.buildSelect(Res: {VgprRB, Ty}, Tst: Cond, Op0: Op2.getReg(Idx: 0), Op1: Op3.getReg(Idx: 0), Flags);
  auto Hi =
      B.buildSelect(Res: {VgprRB, Ty}, Tst: Cond, Op0: Op2.getReg(Idx: 1), Op1: Op3.getReg(Idx: 1), Flags);

  B.buildMergeLikeInstr(Res: Dst, Ops: {Lo, Hi});
  MI.eraseFromParent();
  return true;
}
806
/// Lower a 64-bit G_SEXT_INREG (on VGPRs) into operations on the two 32-bit
/// halves. The sign-extension amount (Amt) decides which half actually needs
/// the in-register extend and how the high half is produced.
bool RegBankLegalizeHelper::lowerSplitTo32SExtInReg(MachineInstr &MI) {
  auto Op1 = B.buildUnmerge(Attrs: VgprRB_S32, Op: MI.getOperand(i: 1).getReg());
  int Amt = MI.getOperand(i: 2).getImm();
  Register Lo, Hi;
  // Hi|Lo: s sign bit, ?/x bits changed/not changed by sign-extend
  if (Amt <= 32) {
    // Freeze the low half: its (possibly undef) bits feed both halves of the
    // result, so they must be consistent.
    auto Freeze = B.buildFreeze(Dst: VgprRB_S32, Src: Op1.getReg(Idx: 0));
    if (Amt == 32) {
      // Hi|Lo: ????????|sxxxxxxx -> ssssssss|sxxxxxxx
      Lo = Freeze.getReg(Idx: 0);
    } else {
      // Hi|Lo: ????????|???sxxxx -> ssssssss|ssssxxxx
      Lo = B.buildSExtInReg(Res: VgprRB_S32, Op: Freeze, ImmOp: Amt).getReg(Idx: 0);
    }

    // High half is the low half's sign bit replicated (arithmetic shift by
    // 31).
    auto SignExtCst = B.buildConstant(Res: SgprRB_S32, Val: 31);
    Hi = B.buildAShr(Dst: VgprRB_S32, Src0: Lo, Src1: SignExtCst).getReg(Idx: 0);
  } else {
    // Hi|Lo: ?????sxx|xxxxxxxx -> ssssssxx|xxxxxxxx
    Lo = Op1.getReg(Idx: 0);
    Hi = B.buildSExtInReg(Res: VgprRB_S32, Op: Op1.getReg(Idx: 1), ImmOp: Amt - 32).getReg(Idx: 0);
  }

  B.buildMergeLikeInstr(Res: MI.getOperand(i: 0).getReg(), Ops: {Lo, Hi});
  MI.eraseFromParent();
  return true;
}
834
835bool RegBankLegalizeHelper::lower(MachineInstr &MI,
836 const RegBankLLTMapping &Mapping,
837 WaterfallInfo &WFI) {
838
839 switch (Mapping.LoweringMethod) {
840 case DoNotLower:
841 break;
842 case VccExtToSel:
843 return lowerVccExtToSel(MI);
844 case UniExtToSel: {
845 LLT Ty = MRI.getType(Reg: MI.getOperand(i: 0).getReg());
846 auto True = B.buildConstant(Res: {SgprRB, Ty},
847 Val: MI.getOpcode() == AMDGPU::G_SEXT ? -1 : 1);
848 auto False = B.buildConstant(Res: {SgprRB, Ty}, Val: 0);
849 // Input to G_{Z|S}EXT is 'Legalizer legal' S1. Most common case is compare.
850 // We are making select here. S1 cond was already 'any-extended to S32' +
851 // 'AND with 1 to clean high bits' by Sgpr32AExtBoolInReg.
852 B.buildSelect(Res: MI.getOperand(i: 0).getReg(), Tst: MI.getOperand(i: 1).getReg(), Op0: True,
853 Op1: False);
854 MI.eraseFromParent();
855 return true;
856 }
857 case UnpackBitShift:
858 return lowerUnpackBitShift(MI);
859 case UnpackMinMax:
860 return lowerUnpackMinMax(MI);
861 case ScalarizeToS16:
862 return lowerSplitTo16(MI);
863 case Ext32To64: {
864 const RegisterBank *RB = MRI.getRegBank(Reg: MI.getOperand(i: 0).getReg());
865 MachineInstrBuilder Hi;
866 switch (MI.getOpcode()) {
867 case AMDGPU::G_ZEXT: {
868 Hi = B.buildConstant(Res: {RB, S32}, Val: 0);
869 break;
870 }
871 case AMDGPU::G_SEXT: {
872 // Replicate sign bit from 32-bit extended part.
873 auto ShiftAmt = B.buildConstant(Res: {RB, S32}, Val: 31);
874 Hi = B.buildAShr(Dst: {RB, S32}, Src0: MI.getOperand(i: 1).getReg(), Src1: ShiftAmt);
875 break;
876 }
877 case AMDGPU::G_ANYEXT: {
878 Hi = B.buildUndef(Res: {RB, S32});
879 break;
880 }
881 default:
882 reportGISelFailure(MF, MORE, PassName: "amdgpu-regbanklegalize",
883 Msg: "AMDGPU RegBankLegalize: Ext32To64, unsuported opcode",
884 MI);
885 return false;
886 }
887
888 B.buildMergeLikeInstr(Res: MI.getOperand(i: 0).getReg(),
889 Ops: {MI.getOperand(i: 1).getReg(), Hi});
890 MI.eraseFromParent();
891 return true;
892 }
893 case UniCstExt: {
894 uint64_t ConstVal = MI.getOperand(i: 1).getCImm()->getZExtValue();
895 B.buildConstant(Res: MI.getOperand(i: 0).getReg(), Val: ConstVal);
896
897 MI.eraseFromParent();
898 return true;
899 }
900 case VgprToVccCopy: {
901 Register Src = MI.getOperand(i: 1).getReg();
902 LLT Ty = MRI.getType(Reg: Src);
903 // Take lowest bit from each lane and put it in lane mask.
904 // Lowering via compare, but we need to clean high bits first as compare
905 // compares all bits in register.
906 Register BoolSrc = MRI.createVirtualRegister(RegAttr: {.RCOrRB: VgprRB, .Ty: Ty});
907 if (Ty == S64) {
908 auto Src64 = B.buildUnmerge(Attrs: VgprRB_S32, Op: Src);
909 auto One = B.buildConstant(Res: VgprRB_S32, Val: 1);
910 auto AndLo = B.buildAnd(Dst: VgprRB_S32, Src0: Src64.getReg(Idx: 0), Src1: One);
911 auto Zero = B.buildConstant(Res: VgprRB_S32, Val: 0);
912 auto AndHi = B.buildAnd(Dst: VgprRB_S32, Src0: Src64.getReg(Idx: 1), Src1: Zero);
913 B.buildMergeLikeInstr(Res: BoolSrc, Ops: {AndLo, AndHi});
914 } else {
915 assert(Ty == S32 || Ty == S16);
916 auto One = B.buildConstant(Res: {VgprRB, Ty}, Val: 1);
917 B.buildAnd(Dst: BoolSrc, Src0: Src, Src1: One);
918 }
919 auto Zero = B.buildConstant(Res: {VgprRB, Ty}, Val: 0);
920 B.buildICmp(Pred: CmpInst::ICMP_NE, Res: MI.getOperand(i: 0).getReg(), Op0: BoolSrc, Op1: Zero);
921 MI.eraseFromParent();
922 return true;
923 }
924 case V_BFE:
925 return lowerV_BFE(MI);
926 case S_BFE:
927 return lowerS_BFE(MI);
928 case UniMAD64:
929 return lowerUniMAD64(MI);
930 case UniMul64: {
931 B.buildMul(Dst: MI.getOperand(i: 0), Src0: MI.getOperand(i: 1), Src1: MI.getOperand(i: 2));
932 MI.eraseFromParent();
933 return true;
934 }
935 case DivSMulToMAD: {
936 auto Op1 = B.buildTrunc(Res: VgprRB_S32, Op: MI.getOperand(i: 1));
937 auto Op2 = B.buildTrunc(Res: VgprRB_S32, Op: MI.getOperand(i: 2));
938 auto Zero = B.buildConstant(Res: {VgprRB, S64}, Val: 0);
939
940 unsigned NewOpc = MI.getOpcode() == AMDGPU::G_AMDGPU_S_MUL_U64_U32
941 ? AMDGPU::G_AMDGPU_MAD_U64_U32
942 : AMDGPU::G_AMDGPU_MAD_I64_I32;
943
944 B.buildInstr(Opc: NewOpc, DstOps: {MI.getOperand(i: 0).getReg(), {SgprRB, S32}},
945 SrcOps: {Op1, Op2, Zero});
946 MI.eraseFromParent();
947 return true;
948 }
949 case SplitTo32:
950 return lowerSplitTo32(MI);
951 case SplitTo32Mul:
952 return lowerSplitTo32Mul(MI);
953 case SplitTo32Select:
954 return lowerSplitTo32Select(MI);
955 case SplitTo32SExtInReg:
956 return lowerSplitTo32SExtInReg(MI);
957 case SplitLoad: {
958 LLT DstTy = MRI.getType(Reg: MI.getOperand(i: 0).getReg());
959 unsigned Size = DstTy.getSizeInBits();
960 // Even split to 128-bit loads
961 if (Size > 128) {
962 LLT B128;
963 if (DstTy.isVector()) {
964 LLT EltTy = DstTy.getElementType();
965 B128 = LLT::fixed_vector(NumElements: 128 / EltTy.getSizeInBits(), ScalarTy: EltTy);
966 } else {
967 B128 = LLT::scalar(SizeInBits: 128);
968 }
969 if (Size / 128 == 2)
970 splitLoad(MI, LLTBreakdown: {B128, B128});
971 else if (Size / 128 == 4)
972 splitLoad(MI, LLTBreakdown: {B128, B128, B128, B128});
973 else {
974 reportGISelFailure(MF, MORE, PassName: "amdgpu-regbanklegalize",
975 Msg: "AMDGPU RegBankLegalize: SplitLoad, unsuported type",
976 MI);
977 return false;
978 }
979 }
980 // 64 and 32 bit load
981 else if (DstTy == S96)
982 splitLoad(MI, LLTBreakdown: {S64, S32}, MergeTy: S32);
983 else if (DstTy == V3S32)
984 splitLoad(MI, LLTBreakdown: {V2S32, S32}, MergeTy: S32);
985 else if (DstTy == V6S16)
986 splitLoad(MI, LLTBreakdown: {V4S16, V2S16}, MergeTy: V2S16);
987 else {
988 reportGISelFailure(MF, MORE, PassName: "amdgpu-regbanklegalize",
989 Msg: "AMDGPU RegBankLegalize: SplitLoad, unsuported type",
990 MI);
991 return false;
992 }
993 return true;
994 }
995 case WidenLoad: {
996 LLT DstTy = MRI.getType(Reg: MI.getOperand(i: 0).getReg());
997 if (DstTy == S96)
998 widenLoad(MI, WideTy: S128);
999 else if (DstTy == V3S32)
1000 widenLoad(MI, WideTy: V4S32, MergeTy: S32);
1001 else if (DstTy == V6S16)
1002 widenLoad(MI, WideTy: V8S16, MergeTy: V2S16);
1003 else {
1004 reportGISelFailure(MF, MORE, PassName: "amdgpu-regbanklegalize",
1005 Msg: "AMDGPU RegBankLegalize: WidenLoad, unsuported type",
1006 MI);
1007 return false;
1008 }
1009 return true;
1010 }
1011 case UnpackAExt:
1012 return lowerUnpackAExt(MI);
1013 case WidenMMOToS32:
1014 return widenMMOToS32(MI&: cast<GAnyLoad>(Val&: MI));
1015 case VerifyAllSgpr: {
1016 assert(llvm::all_of(MI.operands(), [&](const MachineOperand &Op) {
1017 return MRI.getRegBankOrNull(Op.getReg()) == SgprRB;
1018 }));
1019 return true;
1020 }
1021 case ApplyAllVgpr: {
1022 assert(llvm::all_of(MI.defs(), [&](const MachineOperand &Op) {
1023 return MRI.getRegBankOrNull(Op.getReg()) == VgprRB;
1024 }));
1025 B.setInstrAndDebugLoc(MI);
1026 for (unsigned i = MI.getNumDefs(); i < MI.getNumOperands(); ++i) {
1027 Register Reg = MI.getOperand(i).getReg();
1028 if (MRI.getRegBank(Reg) != VgprRB) {
1029 auto Copy = B.buildCopy(Res: {VgprRB, MRI.getType(Reg)}, Op: Reg);
1030 MI.getOperand(i).setReg(Copy.getReg(Idx: 0));
1031 }
1032 }
1033 return true;
1034 }
1035 case UnmergeToShiftTrunc: {
1036 GUnmerge *Unmerge = dyn_cast<GUnmerge>(Val: &MI);
1037 LLT Ty = MRI.getType(Reg: Unmerge->getSourceReg());
1038 if (Ty.getSizeInBits() % 32 != 0) {
1039 reportGISelFailure(MF, MORE, PassName: "amdgpu-regbanklegalize",
1040 Msg: "AMDGPU RegBankLegalize: unmerge not multiple of 32",
1041 MI);
1042 return false;
1043 }
1044
1045 B.setInstrAndDebugLoc(MI);
1046 if (Ty.getSizeInBits() > 32) {
1047 auto UnmergeV2S16 =
1048 B.buildUnmerge(Attrs: {.RCOrRB: SgprRB, .Ty: V2S16}, Op: Unmerge->getSourceReg());
1049 for (unsigned i = 0; i < UnmergeV2S16->getNumDefs(); ++i) {
1050 auto [Dst0S32, Dst1S32] =
1051 unpackAExt(Reg: UnmergeV2S16->getOperand(i).getReg());
1052 B.buildTrunc(Res: MI.getOperand(i: i * 2).getReg(), Op: Dst0S32);
1053 B.buildTrunc(Res: MI.getOperand(i: i * 2 + 1).getReg(), Op: Dst1S32);
1054 }
1055 } else {
1056 auto [Dst0S32, Dst1S32] = unpackAExt(Reg: MI.getOperand(i: 2).getReg());
1057 B.buildTrunc(Res: MI.getOperand(i: 0).getReg(), Op: Dst0S32);
1058 B.buildTrunc(Res: MI.getOperand(i: 1).getReg(), Op: Dst1S32);
1059 }
1060
1061 MI.eraseFromParent();
1062 return true;
1063 }
1064 case ApplyINTRIN_IMAGE:
1065 return applyRegisterBanksINTRIN_IMAGE(MI);
1066 }
1067
1068 if (!WFI.SgprWaterfallOperandRegs.empty()) {
1069 if (!executeInWaterfallLoop(B, WFI))
1070 return false;
1071 }
1072 return true;
1073}
1074
/// Return the fixed LLT associated with a mapping ID (scalar, pointer and
/// vector IDs that imply an exact type). IDs with size-only constraints
/// (B-types) or no type at all fall through to the default and return an
/// invalid LLT.
LLT RegBankLegalizeHelper::getTyFromID(RegBankLLTMappingApplyID ID) {
  switch (ID) {
  case Vcc:
  case UniInVcc:
    return LLT::scalar(SizeInBits: 1);
  case Sgpr16:
  case Vgpr16:
  case UniInVgprS16:
    return LLT::scalar(SizeInBits: 16);
  case Sgpr32:
  case Sgpr32_WF:
  case Sgpr32Trunc:
  case Sgpr32AExt:
  case Sgpr32AExtBoolInReg:
  case Sgpr32SExt:
  case Sgpr32ZExt:
  case UniInVgprS32:
  case Vgpr32:
  case Vgpr32AExt:
  case Vgpr32SExt:
  case Vgpr32ZExt:
    return LLT::scalar(SizeInBits: 32);
  case Sgpr64:
  case Vgpr64:
  case UniInVgprS64:
    return LLT::scalar(SizeInBits: 64);
  case Sgpr128:
  case Vgpr128:
    return LLT::scalar(SizeInBits: 128);
  case SgprP0:
  case SgprP0Call_WF:
  case VgprP0:
    return LLT::pointer(AddressSpace: 0, SizeInBits: 64);
  case SgprP1:
  case VgprP1:
    return LLT::pointer(AddressSpace: 1, SizeInBits: 64);
  case SgprP2:
  case VgprP2:
    return LLT::pointer(AddressSpace: 2, SizeInBits: 32);
  case SgprP3:
  case VgprP3:
    return LLT::pointer(AddressSpace: 3, SizeInBits: 32);
  case SgprP4:
  case SgprP4Call_WF:
  case VgprP4:
    return LLT::pointer(AddressSpace: 4, SizeInBits: 64);
  case SgprP5:
  case VgprP5:
    return LLT::pointer(AddressSpace: 5, SizeInBits: 32);
  case SgprP8:
    return LLT::pointer(AddressSpace: 8, SizeInBits: 128);
  case SgprV2S16:
  case VgprV2S16:
  case UniInVgprV2S16:
    return LLT::fixed_vector(NumElements: 2, ScalarSizeInBits: 16);
  case SgprV2S32:
  case VgprV2S32:
  case UniInVgprV2S32:
    return LLT::fixed_vector(NumElements: 2, ScalarSizeInBits: 32);
  case VgprV3S32:
    return LLT::fixed_vector(NumElements: 3, ScalarSizeInBits: 32);
  case SgprV4S32:
  case SgprV4S32_WF:
  case VgprV4S32:
  case UniInVgprV4S32:
    return LLT::fixed_vector(NumElements: 4, ScalarSizeInBits: 32);
  case VgprV2S64:
  case UniInVgprV2S64:
    return LLT::fixed_vector(NumElements: 2, ScalarSizeInBits: 64);
  default:
    // B-types and non-type IDs: no single fixed type.
    return LLT();
  }
}
1148
/// For size-constrained "B-type" mapping IDs, validate that Ty is one of the
/// LLTs accepted by ID and return it unchanged; otherwise return an invalid
/// LLT. Unlike getTyFromID, these IDs accept a family of same-sized types.
LLT RegBankLegalizeHelper::getBTyFromID(RegBankLLTMappingApplyID ID, LLT Ty) {
  switch (ID) {
  case SgprB32:
  case VgprB32:
  case UniInVgprB32:
    if (Ty == LLT::scalar(SizeInBits: 32) || Ty == LLT::fixed_vector(NumElements: 2, ScalarSizeInBits: 16) ||
        isAnyPtr(Ty, Width: 32))
      return Ty;
    return LLT();
  case SgprPtr32:
  case VgprPtr32:
    return isAnyPtr(Ty, Width: 32) ? Ty : LLT();
  case SgprPtr64:
  case VgprPtr64:
    return isAnyPtr(Ty, Width: 64) ? Ty : LLT();
  case SgprPtr128:
  case VgprPtr128:
    return isAnyPtr(Ty, Width: 128) ? Ty : LLT();
  case SgprB64:
  case VgprB64:
  case UniInVgprB64:
    if (Ty == LLT::scalar(SizeInBits: 64) || Ty == LLT::fixed_vector(NumElements: 2, ScalarSizeInBits: 32) ||
        Ty == LLT::fixed_vector(NumElements: 4, ScalarSizeInBits: 16) || isAnyPtr(Ty, Width: 64))
      return Ty;
    return LLT();
  case SgprB96:
  case VgprB96:
  case UniInVgprB96:
    if (Ty == LLT::scalar(SizeInBits: 96) || Ty == LLT::fixed_vector(NumElements: 3, ScalarSizeInBits: 32) ||
        Ty == LLT::fixed_vector(NumElements: 6, ScalarSizeInBits: 16))
      return Ty;
    return LLT();
  case SgprB128:
  case VgprB128:
  case UniInVgprB128:
    if (Ty == LLT::scalar(SizeInBits: 128) || Ty == LLT::fixed_vector(NumElements: 4, ScalarSizeInBits: 32) ||
        Ty == LLT::fixed_vector(NumElements: 2, ScalarSizeInBits: 64) || Ty == LLT::fixed_vector(NumElements: 8, ScalarSizeInBits: 16) ||
        isAnyPtr(Ty, Width: 128))
      return Ty;
    return LLT();
  case VgprB160:
  case UniInVgprB160:
    // 160-bit: any type of that size is accepted (no fixed shape list).
    if (Ty.getSizeInBits() == 160)
      return Ty;
    return LLT();
  case SgprB256:
  case VgprB256:
  case UniInVgprB256:
    if (Ty == LLT::scalar(SizeInBits: 256) || Ty == LLT::fixed_vector(NumElements: 8, ScalarSizeInBits: 32) ||
        Ty == LLT::fixed_vector(NumElements: 4, ScalarSizeInBits: 64) || Ty == LLT::fixed_vector(NumElements: 16, ScalarSizeInBits: 16))
      return Ty;
    return LLT();
  case SgprB512:
  case VgprB512:
  case UniInVgprB512:
    if (Ty == LLT::scalar(SizeInBits: 512) || Ty == LLT::fixed_vector(NumElements: 16, ScalarSizeInBits: 32) ||
        Ty == LLT::fixed_vector(NumElements: 8, ScalarSizeInBits: 64))
      return Ty;
    return LLT();
  case SgprBRC: {
    // Accept any size for which an SGPR register class exists.
    const SIRegisterInfo *TRI =
        static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
    unsigned LLTSize = Ty.getSizeInBits();
    if (LLTSize >= 32 && TRI->getSGPRClassForBitWidth(BitWidth: LLTSize))
      return Ty;
    return LLT();
  }
  case VgprBRC: {
    const SIRegisterInfo *TRI =
        static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
    // NOTE(review): this queries the SGPR class even for the Vgpr mapping;
    // presumably the set of SGPR-class bit widths matches the intended VGPR
    // widths — confirm this is intentional (vs. a VGPR-class query). Unlike
    // SgprBRC there is also no explicit `>= 32` guard here.
    if (TRI->getSGPRClassForBitWidth(BitWidth: Ty.getSizeInBits()))
      return Ty;
    return LLT();
  }
  default:
    return LLT();
  }
}
1227
/// Map a mapping-apply ID to the register bank it implies: VCC for lane-mask
/// IDs, SGPR for all Sgpr*/UniIn* IDs, VGPR for all Vgpr* IDs, and nullptr for
/// IDs that carry no bank (None, Imm, IntrId, ...).
const RegisterBank *
RegBankLegalizeHelper::getRegBankFromID(RegBankLLTMappingApplyID ID) {
  switch (ID) {
  case Vcc:
    return VccRB;
  case Sgpr16:
  case Sgpr32:
  case Sgpr32_WF:
  case Sgpr64:
  case Sgpr128:
  case SgprP0:
  case SgprP0Call_WF:
  case SgprP1:
  case SgprP2:
  case SgprP3:
  case SgprP4:
  case SgprP4Call_WF:
  case SgprP5:
  case SgprP8:
  case SgprPtr32:
  case SgprPtr64:
  case SgprPtr128:
  case SgprV2S16:
  case SgprV2S32:
  case SgprV4S32:
  case SgprV4S32_WF:
  case SgprB32:
  case SgprB64:
  case SgprB96:
  case SgprB128:
  case SgprB256:
  case SgprB512:
  case SgprBRC:
  case UniInVcc:
  case UniInVgprS16:
  case UniInVgprS32:
  case UniInVgprS64:
  case UniInVgprV2S16:
  case UniInVgprV2S32:
  case UniInVgprV4S32:
  case UniInVgprV2S64:
  case UniInVgprB32:
  case UniInVgprB64:
  case UniInVgprB96:
  case UniInVgprB128:
  case UniInVgprB160:
  case UniInVgprB256:
  case UniInVgprB512:
  case Sgpr32Trunc:
  case Sgpr32AExt:
  case Sgpr32AExtBoolInReg:
  case Sgpr32SExt:
  case Sgpr32ZExt:
    // Note: UniIn* IDs describe a uniform (SGPR) final value even though the
    // instruction itself computes into VGPRs.
    return SgprRB;
  case Vgpr16:
  case Vgpr32:
  case Vgpr64:
  case Vgpr128:
  case VgprP0:
  case VgprP1:
  case VgprP2:
  case VgprP3:
  case VgprP4:
  case VgprP5:
  case VgprPtr32:
  case VgprPtr64:
  case VgprPtr128:
  case VgprV2S16:
  case VgprV2S32:
  case VgprV2S64:
  case VgprV3S32:
  case VgprV4S32:
  case VgprB32:
  case VgprB64:
  case VgprB96:
  case VgprB128:
  case VgprB160:
  case VgprB256:
  case VgprB512:
  case VgprBRC:
  case Vgpr32AExt:
  case Vgpr32SExt:
  case Vgpr32ZExt:
    return VgprRB;
  default:
    return nullptr;
  }
}
1316
/// Apply the rule's destination mappings to MI's def operands. Plain Sgpr/Vgpr
/// IDs only assert that the type and bank already match; UniIn* IDs reroute
/// the def through a new VGPR/VCC register and copy/read-any-lane the uniform
/// result back into the original SGPR def; Sgpr32Trunc defs are widened to S32
/// and truncated back. OpIdx is advanced past the defs for applyMappingSrc.
bool RegBankLegalizeHelper::applyMappingDst(
    MachineInstr &MI, unsigned &OpIdx,
    const SmallVectorImpl<RegBankLLTMappingApplyID> &MethodIDs) {
  // Defs start from operand 0
  for (; OpIdx < MethodIDs.size(); ++OpIdx) {
    if (MethodIDs[OpIdx] == None)
      continue;
    MachineOperand &Op = MI.getOperand(i: OpIdx);
    Register Reg = Op.getReg();
    LLT Ty = MRI.getType(Reg);
    [[maybe_unused]] const RegisterBank *RB = MRI.getRegBank(Reg);

    switch (MethodIDs[OpIdx]) {
    // vcc, sgpr and vgpr scalars, pointers and vectors
    case Vcc:
    case Sgpr16:
    case Sgpr32:
    case Sgpr64:
    case Sgpr128:
    case SgprP0:
    case SgprP1:
    case SgprP3:
    case SgprP4:
    case SgprP5:
    case SgprP8:
    case SgprV2S16:
    case SgprV2S32:
    case SgprV4S32:
    case Vgpr16:
    case Vgpr32:
    case Vgpr64:
    case Vgpr128:
    case VgprP0:
    case VgprP1:
    case VgprP2:
    case VgprP3:
    case VgprP4:
    case VgprP5:
    case VgprV2S16:
    case VgprV2S32:
    case VgprV2S64:
    case VgprV3S32:
    case VgprV4S32: {
      // Already correct: verify only.
      assert(Ty == getTyFromID(MethodIDs[OpIdx]));
      assert(RB == getRegBankFromID(MethodIDs[OpIdx]));
      break;
    }
    // sgpr and vgpr B-types
    case SgprB32:
    case SgprB64:
    case SgprB96:
    case SgprB128:
    case SgprB256:
    case SgprB512:
    case SgprBRC:
    case SgprPtr32:
    case SgprPtr64:
    case SgprPtr128:
    case VgprB32:
    case VgprB64:
    case VgprB96:
    case VgprB128:
    case VgprB160:
    case VgprB256:
    case VgprB512:
    case VgprBRC:
    case VgprPtr32:
    case VgprPtr64:
    case VgprPtr128: {
      assert(Ty == getBTyFromID(MethodIDs[OpIdx], Ty));
      assert(RB == getRegBankFromID(MethodIDs[OpIdx]));
      break;
    }
    // uniform in vcc/vgpr: scalars, vectors and B-types
    case UniInVcc: {
      // Def becomes a VCC lane mask; the original uniform S1 is recreated
      // via COPY_SCC_VCC + trunc (only when it has uses).
      assert(Ty == S1);
      assert(RB == SgprRB);
      Register NewDst = MRI.createVirtualRegister(RegAttr: VccRB_S1);
      Op.setReg(NewDst);
      if (!MRI.use_empty(RegNo: Reg)) {
        auto CopyS32_Vcc =
            B.buildInstr(Opc: AMDGPU::G_AMDGPU_COPY_SCC_VCC, DstOps: {SgprRB_S32}, SrcOps: {NewDst});
        B.buildTrunc(Res: Reg, Op: CopyS32_Vcc);
      }
      break;
    }
    case UniInVgprS16: {
      // S16 has no direct read-any-lane: any-extend to S32 first, read back,
      // then truncate to the original S16 def.
      assert(Ty == getTyFromID(MethodIDs[OpIdx]));
      assert(RB == SgprRB);
      Register NewVgprDstS16 = MRI.createVirtualRegister(RegAttr: {.RCOrRB: VgprRB, .Ty: S16});
      Register NewVgprDstS32 = MRI.createVirtualRegister(RegAttr: {.RCOrRB: VgprRB, .Ty: S32});
      Register NewSgprDstS32 = MRI.createVirtualRegister(RegAttr: {.RCOrRB: SgprRB, .Ty: S32});
      Op.setReg(NewVgprDstS16);
      B.buildAnyExt(Res: NewVgprDstS32, Op: NewVgprDstS16);
      buildReadAnyLane(B, SgprDst: NewSgprDstS32, VgprSrc: NewVgprDstS32, RBI);
      B.buildTrunc(Res: Reg, Op: NewSgprDstS32);
      break;
    }
    case UniInVgprS32:
    case UniInVgprS64:
    case UniInVgprV2S16:
    case UniInVgprV2S32:
    case UniInVgprV4S32:
    case UniInVgprV2S64: {
      // Compute into a fresh VGPR, read the uniform value back to the SGPR.
      assert(Ty == getTyFromID(MethodIDs[OpIdx]));
      assert(RB == SgprRB);
      Register NewVgprDst = MRI.createVirtualRegister(RegAttr: {.RCOrRB: VgprRB, .Ty: Ty});
      Op.setReg(NewVgprDst);
      buildReadAnyLane(B, SgprDst: Reg, VgprSrc: NewVgprDst, RBI);
      break;
    }
    case UniInVgprB32:
    case UniInVgprB64:
    case UniInVgprB96:
    case UniInVgprB128:
    case UniInVgprB160:
    case UniInVgprB256:
    case UniInVgprB512: {
      assert(Ty == getBTyFromID(MethodIDs[OpIdx], Ty));
      assert(RB == SgprRB);
      Register NewVgprDst = MRI.createVirtualRegister(RegAttr: {.RCOrRB: VgprRB, .Ty: Ty});
      Op.setReg(NewVgprDst);
      AMDGPU::buildReadAnyLane(B, SgprDst: Reg, VgprSrc: NewVgprDst, RBI);
      break;
    }
    // sgpr trunc
    case Sgpr32Trunc: {
      // Narrow SGPR def: compute into S32 and truncate back (if used).
      assert(Ty.getSizeInBits() < 32);
      assert(RB == SgprRB);
      Register NewDst = MRI.createVirtualRegister(RegAttr: SgprRB_S32);
      Op.setReg(NewDst);
      if (!MRI.use_empty(RegNo: Reg))
        B.buildTrunc(Res: Reg, Op: NewDst);
      break;
    }
    case InvalidMapping: {
      reportGISelFailure(
          MF, MORE, PassName: "amdgpu-regbanklegalize",
          Msg: "AMDGPU RegBankLegalize: missing fast rule ('Div' or 'Uni') for", MI);
      return false;
    }
    default:
      reportGISelFailure(
          MF, MORE, PassName: "amdgpu-regbanklegalize",
          Msg: "AMDGPU RegBankLegalize: applyMappingDst, ID not supported", MI);
      return false;
    }
  }

  return true;
}
1468
/// Apply the rule's source mappings to MI's use operands (OpIdx continues
/// from where applyMappingDst stopped). SGPR IDs only assert; VGPR IDs insert
/// sgpr->vgpr copies as needed; *_WF IDs register the operand for a later
/// waterfall loop (recording the instruction range to wrap); *Ext IDs insert
/// the appropriate extend of a narrow source to S32.
bool RegBankLegalizeHelper::applyMappingSrc(
    MachineInstr &MI, unsigned &OpIdx,
    const SmallVectorImpl<RegBankLLTMappingApplyID> &MethodIDs,
    WaterfallInfo &WFI) {
  for (unsigned i = 0; i < MethodIDs.size(); ++OpIdx, ++i) {
    if (MethodIDs[i] == None || MethodIDs[i] == IntrId || MethodIDs[i] == Imm)
      continue;

    MachineOperand &Op = MI.getOperand(i: OpIdx);
    Register Reg = Op.getReg();
    LLT Ty = MRI.getType(Reg);
    const RegisterBank *RB = MRI.getRegBank(Reg);

    switch (MethodIDs[i]) {
    case Vcc: {
      // A uniform (SGPR) S1 used where a lane mask is expected: any-extend
      // and copy SCC -> VCC.
      assert(Ty == S1);
      assert(RB == VccRB || RB == SgprRB);
      if (RB == SgprRB) {
        auto Aext = B.buildAnyExt(Res: SgprRB_S32, Op: Reg);
        auto CopyVcc_Scc =
            B.buildInstr(Opc: AMDGPU::G_AMDGPU_COPY_VCC_SCC, DstOps: {VccRB_S1}, SrcOps: {Aext});
        Op.setReg(CopyVcc_Scc.getReg(Idx: 0));
      }
      break;
    }
    // sgpr scalars, pointers and vectors
    case Sgpr16:
    case Sgpr32:
    case Sgpr64:
    case Sgpr128:
    case SgprP0:
    case SgprP1:
    case SgprP3:
    case SgprP4:
    case SgprP5:
    case SgprP8:
    case SgprV2S16:
    case SgprV2S32:
    case SgprV4S32: {
      // Already correct: verify only.
      assert(Ty == getTyFromID(MethodIDs[i]));
      assert(RB == getRegBankFromID(MethodIDs[i]));
      break;
    }
    // sgpr B-types
    case SgprB32:
    case SgprB64:
    case SgprB96:
    case SgprB128:
    case SgprB256:
    case SgprB512:
    case SgprBRC:
    case SgprPtr32:
    case SgprPtr64:
    case SgprPtr128: {
      assert(Ty == getBTyFromID(MethodIDs[i], Ty));
      assert(RB == getRegBankFromID(MethodIDs[i]));
      break;
    }
    // vgpr scalars, pointers and vectors
    case Vgpr16:
    case Vgpr32:
    case Vgpr64:
    case Vgpr128:
    case VgprP0:
    case VgprP1:
    case VgprP2:
    case VgprP3:
    case VgprP4:
    case VgprP5:
    case VgprV2S16:
    case VgprV2S32:
    case VgprV2S64:
    case VgprV3S32:
    case VgprV4S32: {
      // Non-VGPR source: insert a copy into a VGPR.
      assert(Ty == getTyFromID(MethodIDs[i]));
      if (RB != VgprRB) {
        auto CopyToVgpr = B.buildCopy(Res: {VgprRB, Ty}, Op: Reg);
        Op.setReg(CopyToVgpr.getReg(Idx: 0));
      }
      break;
    }
    // vgpr B-types
    case VgprB32:
    case VgprB64:
    case VgprB96:
    case VgprB128:
    case VgprB160:
    case VgprB256:
    case VgprB512:
    case VgprBRC:
    case VgprPtr32:
    case VgprPtr64:
    case VgprPtr128: {
      assert(Ty == getBTyFromID(MethodIDs[i], Ty));
      if (RB != VgprRB) {
        auto CopyToVgpr = B.buildCopy(Res: {VgprRB, Ty}, Op: Reg);
        Op.setReg(CopyToVgpr.getReg(Idx: 0));
      }
      break;
    }
    // sgpr waterfall, scalars, and vectors
    case Sgpr32_WF:
    case SgprV4S32_WF: {
      // Divergent value used where SGPR is required: defer to a waterfall
      // loop around this instruction.
      assert(Ty == getTyFromID(MethodIDs[i]));
      if (RB != SgprRB) {
        WFI.SgprWaterfallOperandRegs.insert(V: Reg);
        if (!WFI.Start.isValid()) {
          WFI.Start = MI.getIterator();
          WFI.End = std::next(x: MI.getIterator());
        }
      }
      break;
    }
    case SgprP0Call_WF:
    case SgprP4Call_WF: {
      // Indirect call with divergent callee pointer: waterfall the whole
      // call sequence, including the call-frame setup/teardown pseudos.
      assert(Ty == getTyFromID(MethodIDs[i]));
      if (RB != SgprRB) {
        WFI.SgprWaterfallOperandRegs.insert(V: Reg);

        // Find the ADJCALLSTACKUP before the call.
        MachineBasicBlock::iterator Start = MI.getIterator();
        while (Start->getOpcode() != AMDGPU::ADJCALLSTACKUP)
          --Start;

        // Find the ADJCALLSTACKDOWN after the call (include it in range).
        MachineBasicBlock::iterator End = MI.getIterator();
        while (End->getOpcode() != AMDGPU::ADJCALLSTACKDOWN)
          ++End;
        ++End;

        B.setInsertPt(MBB&: *MI.getParent(), II: Start);
        WFI.Start = Start;
        WFI.End = End;
      }
      break;
    }
    // sgpr and vgpr scalars with extend
    case Sgpr32AExt: {
      // Note: this ext allows S1, and it is meant to be combined away.
      assert(Ty.getSizeInBits() < 32);
      assert(RB == SgprRB);
      auto Aext = B.buildAnyExt(Res: SgprRB_S32, Op: Reg);
      Op.setReg(Aext.getReg(Idx: 0));
      break;
    }
    case Sgpr32AExtBoolInReg: {
      // Note: this ext allows S1, and it is meant to be combined away.
      assert(Ty.getSizeInBits() == 1);
      assert(RB == SgprRB);
      auto Aext = B.buildAnyExt(Res: SgprRB_S32, Op: Reg);
      // Zext SgprS1 is not legal, make AND with 1 instead. This instruction is
      // most of times meant to be combined away in AMDGPURegBankCombiner.
      auto Cst1 = B.buildConstant(Res: SgprRB_S32, Val: 1);
      auto BoolInReg = B.buildAnd(Dst: SgprRB_S32, Src0: Aext, Src1: Cst1);
      Op.setReg(BoolInReg.getReg(Idx: 0));
      break;
    }
    case Sgpr32SExt: {
      // S1 excluded: sign-extending S1 on SGPR is not handled here.
      assert(1 < Ty.getSizeInBits() && Ty.getSizeInBits() < 32);
      assert(RB == SgprRB);
      auto Sext = B.buildSExt(Res: SgprRB_S32, Op: Reg);
      Op.setReg(Sext.getReg(Idx: 0));
      break;
    }
    case Sgpr32ZExt: {
      assert(1 < Ty.getSizeInBits() && Ty.getSizeInBits() < 32);
      assert(RB == SgprRB);
      auto Zext = B.buildZExt(Res: {SgprRB, S32}, Op: Reg);
      Op.setReg(Zext.getReg(Idx: 0));
      break;
    }
    case Vgpr32AExt: {
      assert(Ty.getSizeInBits() < 32);
      assert(RB == VgprRB);
      auto Aext = B.buildAnyExt(Res: {VgprRB, S32}, Op: Reg);
      Op.setReg(Aext.getReg(Idx: 0));
      break;
    }
    case Vgpr32SExt: {
      // Note this ext allows S1, and it is meant to be combined away.
      assert(Ty.getSizeInBits() < 32);
      assert(RB == VgprRB);
      auto Sext = B.buildSExt(Res: {VgprRB, S32}, Op: Reg);
      Op.setReg(Sext.getReg(Idx: 0));
      break;
    }
    case Vgpr32ZExt: {
      // Note this ext allows S1, and it is meant to be combined away.
      assert(Ty.getSizeInBits() < 32);
      assert(RB == VgprRB);
      auto Zext = B.buildZExt(Res: {VgprRB, S32}, Op: Reg);
      Op.setReg(Zext.getReg(Idx: 0));
      break;
    }
    default:
      reportGISelFailure(
          MF, MORE, PassName: "amdgpu-regbanklegalize",
          Msg: "AMDGPU RegBankLegalize: applyMappingSrc, ID not supported", MI);
      return false;
    }
  }
  return true;
}
1672
/// Assign register banks for G_PHI. Uniform S1 phis are widened to SGPR S32
/// (any-extending each incoming value in its def block, truncating the result
/// after the phi). Divergent S1 phis must already have been lowered earlier
/// and are reported as a failure. Other supported types pass through
/// unchanged.
bool RegBankLegalizeHelper::applyMappingPHI(MachineInstr &MI) {
  Register Dst = MI.getOperand(i: 0).getReg();
  LLT Ty = MRI.getType(Reg: Dst);

  if (Ty == LLT::scalar(SizeInBits: 1) && MUI.isUniform(V: Dst)) {
    // Trunc must come after all phis at the top of the block.
    B.setInsertPt(MBB&: *MI.getParent(), II: MI.getParent()->getFirstNonPHI());

    Register NewDst = MRI.createVirtualRegister(RegAttr: SgprRB_S32);
    MI.getOperand(i: 0).setReg(NewDst);
    B.buildTrunc(Res: Dst, Op: NewDst);

    // Operands come in (value, pred-block) pairs starting at index 1.
    for (unsigned i = 1; i < MI.getNumOperands(); i += 2) {
      Register UseReg = MI.getOperand(i).getReg();

      // Extend each incoming value right after its definition (skipping any
      // phis/labels at the top of the defining block).
      auto DefMI = MRI.getVRegDef(Reg: UseReg)->getIterator();
      MachineBasicBlock *DefMBB = DefMI->getParent();

      B.setInsertPt(MBB&: *DefMBB, II: DefMBB->SkipPHIsAndLabels(I: std::next(x: DefMI)));

      auto NewUse = B.buildAnyExt(Res: SgprRB_S32, Op: UseReg);
      MI.getOperand(i).setReg(NewUse.getReg(Idx: 0));
    }

    return true;
  }

  // ALL divergent i1 phis should have been lowered and inst-selected into PHI
  // with sgpr reg class and S1 LLT in AMDGPUGlobalISelDivergenceLowering pass.
  // Note: this includes divergent phis that don't require lowering.
  if (Ty == LLT::scalar(SizeInBits: 1) && MUI.isDivergent(V: Dst)) {
    reportGISelFailure(MF, MORE, PassName: "amdgpu-regbanklegalize",
                       Msg: "AMDGPU RegBankLegalize: Can't lower divergent S1 G_PHI",
                       MI);
    return false;
  }

  // We accept all types that can fit in some register class.
  // Uniform G_PHIs have all sgpr registers.
  // Divergent G_PHIs have vgpr dst but inputs can be sgpr or vgpr.
  if (Ty == LLT::scalar(SizeInBits: 32) || Ty == LLT::pointer(AddressSpace: 1, SizeInBits: 64) ||
      Ty == LLT::pointer(AddressSpace: 4, SizeInBits: 64)) {
    return true;
  }

  reportGISelFailure(MF, MORE, PassName: "amdgpu-regbanklegalize",
                     Msg: "AMDGPU RegBankLegalize: type not supported for G_PHI",
                     MI);
  return false;
}
1722
1723[[maybe_unused]] static bool verifyRegBankOnOperands(MachineInstr &MI,
1724 const RegisterBank *RB,
1725 MachineRegisterInfo &MRI,
1726 unsigned StartOpIdx,
1727 unsigned EndOpIdx) {
1728 for (unsigned i = StartOpIdx; i <= EndOpIdx; ++i) {
1729 if (MRI.getRegBankOrNull(Reg: MI.getOperand(i).getReg()) != RB)
1730 return false;
1731 }
1732 return true;
1733}
1734
/// Trivial mapping: all operands take the bank of the first def. For SGPR the
/// operands must already be on the right bank (assert-only); for VGPR any
/// non-VGPR use is rewritten through a copy into a VGPR.
void RegBankLegalizeHelper::applyMappingTrivial(MachineInstr &MI) {
  const RegisterBank *RB = MRI.getRegBank(Reg: MI.getOperand(i: 0).getReg());
  // Put RB on all registers
  unsigned NumDefs = MI.getNumDefs();
  unsigned NumOperands = MI.getNumOperands();

  // All defs must share the chosen bank; with SGPR, uses must too.
  assert(verifyRegBankOnOperands(MI, RB, MRI, 0, NumDefs - 1));
  if (RB == SgprRB)
    assert(verifyRegBankOnOperands(MI, RB, MRI, NumDefs, NumOperands - 1));

  if (RB == VgprRB) {
    B.setInstr(MI);
    for (unsigned i = NumDefs; i < NumOperands; ++i) {
      Register Reg = MI.getOperand(i).getReg();
      if (MRI.getRegBank(Reg) != RB) {
        // sgpr (or other) use of a vgpr instruction: copy into a vgpr.
        auto Copy = B.buildCopy(Res: {VgprRB, MRI.getType(Reg)}, Op: Reg);
        MI.getOperand(i).setReg(Copy.getReg(Idx: 0));
      }
    }
  }
}
1756
// Assign register banks for an image intrinsic: defs and the vaddr/vdata
// operands become vgpr, while the resource (and, for sampled ops, sampler)
// descriptors starting at RsrcIdx must be sgpr. Descriptors that are not
// already uniform are handled by wrapping the instruction in a waterfall
// loop. Always returns true (success).
bool RegBankLegalizeHelper::applyRegisterBanksINTRIN_IMAGE(MachineInstr &MI) {
  // Look up the TableGen-generated descriptor for this image intrinsic to
  // find which IR argument is the resource descriptor.
  const AMDGPU::RsrcIntrinsic *RSrcIntrin =
      AMDGPU::lookupRsrcIntrinsic(Intr: AMDGPU::getIntrinsicID(I: MI));
  assert(RSrcIntrin && RSrcIntrin->IsImage);

  unsigned RsrcIdx = RSrcIntrin->RsrcArg;
  const unsigned NumDefs = MI.getNumExplicitDefs();

  // The reported argument index is relative to the IR intrinsic call arguments,
  // so we need to shift by the number of defs and the intrinsic ID operand to
  // get a machine-operand index.
  RsrcIdx += NumDefs + 1;

  // Fix-up code for defs is emitted after MI (skipping PHIs/labels so the
  // insertion point is valid at a block boundary).
  MachineBasicBlock *MBB = MI.getParent();
  B.setInsertPt(MBB&: *MBB, II: MBB->SkipPHIsAndLabels(I: std::next(x: MI.getIterator())));

  // Defs(for image loads with return) are vgpr. A def currently expected in
  // another bank is rewritten: MI writes a fresh vgpr, and a read-any-lane is
  // emitted after MI to move the value into the original (sgpr) register.
  for (unsigned i = 0; i < NumDefs; ++i) {
    const RegisterBank *RB = MRI.getRegBank(Reg: MI.getOperand(i).getReg());
    if (RB == VgprRB)
      continue;

    Register Reg = MI.getOperand(i).getReg();
    Register NewVgprDst = MRI.createVirtualRegister(RegAttr: {.RCOrRB: VgprRB, .Ty: MRI.getType(Reg)});
    MI.getOperand(i).setReg(NewVgprDst);
    buildReadAnyLane(B, SgprDst: Reg, VgprSrc: NewVgprDst, RBI);
  }

  // Fix-up code for uses is inserted before MI, with MI's debug location.
  B.setInstrAndDebugLoc(MI);

  // Register uses(before RsrcIdx) are vgpr: copy any non-vgpr virtual
  // register use into a vgpr. Operand 0 after the defs is the intrinsic ID,
  // hence the loop starts at 1 relative to... NOTE(review): loop starts at
  // i = 1, which overlaps defs when NumDefs > 1 — presumably benign because
  // def operands are already vgpr after the loop above, but confirm.
  for (unsigned i = 1; i < RsrcIdx; ++i) {
    MachineOperand &Op = MI.getOperand(i);
    if (!Op.isReg())
      continue;

    Register Reg = Op.getReg();
    // Physical registers (e.g. implicit uses) are left untouched.
    if (!Reg.isVirtual())
      continue;

    if (MRI.getRegBank(Reg) == VgprRB)
      continue;

    auto Copy = B.buildCopy(Res: {VgprRB, MRI.getType(Reg)}, Op: Reg);
    Op.setReg(Copy.getReg(Idx: 0));
  }

  SmallSet<Register, 4> OpsToWaterfall;

  // Register use RsrcIdx(and RsrcIdx+1 in some cases) is sgpr. Collect every
  // descriptor register that is not already in the sgpr bank; those need to
  // be made uniform via a waterfall loop rather than a plain copy.
  for (unsigned i = RsrcIdx; i < MI.getNumOperands(); ++i) {
    MachineOperand &Op = MI.getOperand(i);
    if (!Op.isReg())
      continue;

    Register Reg = Op.getReg();
    if (MRI.getRegBank(Reg) != SgprRB)
      OpsToWaterfall.insert(V: Reg);
  }

  // Wrap just this instruction (half-open range [MI, next)) in a waterfall
  // loop that iterates over the unique lane values of the divergent
  // descriptors.
  if (!OpsToWaterfall.empty()) {
    MachineBasicBlock::iterator MII = MI.getIterator();
    executeInWaterfallLoop(B, WFI: {.SgprWaterfallOperandRegs: OpsToWaterfall, .Start: MII, .End: std::next(x: MII)});
  }

  return true;
}
1823