//===-- AMDGPURegBankLegalizeHelper.cpp -----------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// Implements actual lowering algorithms for each ID that can be used in
/// Rule.OperandMapping. Similar to legalizer helper but with register banks.
//
//===----------------------------------------------------------------------===//
13
14#include "AMDGPURegBankLegalizeHelper.h"
15#include "AMDGPUGlobalISelUtils.h"
16#include "AMDGPUInstrInfo.h"
17#include "AMDGPURegBankLegalizeRules.h"
18#include "AMDGPURegisterBankInfo.h"
19#include "GCNSubtarget.h"
20#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
21#include "llvm/CodeGen/GlobalISel/GenericMachineInstrs.h"
22#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
23#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
24#include "llvm/CodeGen/MachineInstr.h"
25#include "llvm/CodeGen/MachineUniformityAnalysis.h"
26#include "llvm/IR/IntrinsicsAMDGPU.h"
27
28#define DEBUG_TYPE "amdgpu-regbanklegalize"
29
30using namespace llvm;
31using namespace AMDGPU;
32
33RegBankLegalizeHelper::RegBankLegalizeHelper(
34 MachineIRBuilder &B, const MachineUniformityInfo &MUI,
35 const RegisterBankInfo &RBI, const RegBankLegalizeRules &RBLRules)
36 : MF(B.getMF()), ST(MF.getSubtarget<GCNSubtarget>()), B(B),
37 MRI(*B.getMRI()), MUI(MUI), RBI(RBI), MORE(MF, nullptr),
38 RBLRules(RBLRules), IsWave32(ST.isWave32()),
39 SgprRB(&RBI.getRegBank(ID: AMDGPU::SGPRRegBankID)),
40 VgprRB(&RBI.getRegBank(ID: AMDGPU::VGPRRegBankID)),
41 VccRB(&RBI.getRegBank(ID: AMDGPU::VCCRegBankID)) {}
42
43bool RegBankLegalizeHelper::findRuleAndApplyMapping(MachineInstr &MI) {
44 const SetOfRulesForOpcode *RuleSet = RBLRules.getRulesForOpc(MI);
45 if (!RuleSet) {
46 reportGISelFailure(MF, MORE, PassName: "amdgpu-regbanklegalize",
47 Msg: "No AMDGPU RegBankLegalize rules defined for opcode",
48 MI);
49 return false;
50 }
51
52 const RegBankLLTMapping *Mapping = RuleSet->findMappingForMI(MI, MRI, MUI);
53 if (!Mapping) {
54 reportGISelFailure(MF, MORE, PassName: "amdgpu-regbanklegalize",
55 Msg: "AMDGPU RegBankLegalize: none of the rules defined with "
56 "'Any' for MI's opcode matched MI",
57 MI);
58 return false;
59 }
60
61 SmallSet<Register, 4> WaterfallSgprs;
62 unsigned OpIdx = 0;
63 if (Mapping->DstOpMapping.size() > 0) {
64 B.setInsertPt(MBB&: *MI.getParent(), II: std::next(x: MI.getIterator()));
65 if (!applyMappingDst(MI, OpIdx, MethodIDs: Mapping->DstOpMapping))
66 return false;
67 }
68 if (Mapping->SrcOpMapping.size() > 0) {
69 B.setInstr(MI);
70 if (!applyMappingSrc(MI, OpIdx, MethodIDs: Mapping->SrcOpMapping, SgprWaterfallOperandRegs&: WaterfallSgprs))
71 return false;
72 }
73
74 if (!lower(MI, Mapping: *Mapping, SgprWaterfallOperandRegs&: WaterfallSgprs))
75 return false;
76
77 return true;
78}
79
80bool RegBankLegalizeHelper::executeInWaterfallLoop(
81 MachineIRBuilder &B, iterator_range<MachineBasicBlock::iterator> Range,
82 SmallSet<Register, 4> &SGPROperandRegs) {
83 // Track use registers which have already been expanded with a readfirstlane
84 // sequence. This may have multiple uses if moving a sequence.
85 DenseMap<Register, Register> WaterfalledRegMap;
86
87 MachineBasicBlock &MBB = B.getMBB();
88 MachineFunction &MF = B.getMF();
89
90 const SIRegisterInfo *TRI = ST.getRegisterInfo();
91 const TargetRegisterClass *WaveRC = TRI->getWaveMaskRegClass();
92 unsigned MovExecOpc, MovExecTermOpc, XorTermOpc, AndSaveExecOpc, ExecReg;
93 if (IsWave32) {
94 MovExecOpc = AMDGPU::S_MOV_B32;
95 MovExecTermOpc = AMDGPU::S_MOV_B32_term;
96 XorTermOpc = AMDGPU::S_XOR_B32_term;
97 AndSaveExecOpc = AMDGPU::S_AND_SAVEEXEC_B32;
98 ExecReg = AMDGPU::EXEC_LO;
99 } else {
100 MovExecOpc = AMDGPU::S_MOV_B64;
101 MovExecTermOpc = AMDGPU::S_MOV_B64_term;
102 XorTermOpc = AMDGPU::S_XOR_B64_term;
103 AndSaveExecOpc = AMDGPU::S_AND_SAVEEXEC_B64;
104 ExecReg = AMDGPU::EXEC;
105 }
106
107#ifndef NDEBUG
108 const int OrigRangeSize = std::distance(Range.begin(), Range.end());
109#endif
110
111 MachineRegisterInfo &MRI = *B.getMRI();
112 Register SaveExecReg = MRI.createVirtualRegister(RegClass: WaveRC);
113 Register InitSaveExecReg = MRI.createVirtualRegister(RegClass: WaveRC);
114
115 // Don't bother using generic instructions/registers for the exec mask.
116 B.buildInstr(Opcode: TargetOpcode::IMPLICIT_DEF).addDef(RegNo: InitSaveExecReg);
117
118 Register SavedExec = MRI.createVirtualRegister(RegClass: WaveRC);
119
120 // To insert the loop we need to split the block. Move everything before
121 // this point to a new block, and insert a new empty block before this
122 // instruction.
123 MachineBasicBlock *LoopBB = MF.CreateMachineBasicBlock();
124 MachineBasicBlock *BodyBB = MF.CreateMachineBasicBlock();
125 MachineBasicBlock *RestoreExecBB = MF.CreateMachineBasicBlock();
126 MachineBasicBlock *RemainderBB = MF.CreateMachineBasicBlock();
127 MachineFunction::iterator MBBI(MBB);
128 ++MBBI;
129 MF.insert(MBBI, MBB: LoopBB);
130 MF.insert(MBBI, MBB: BodyBB);
131 MF.insert(MBBI, MBB: RestoreExecBB);
132 MF.insert(MBBI, MBB: RemainderBB);
133
134 LoopBB->addSuccessor(Succ: BodyBB);
135 BodyBB->addSuccessor(Succ: RestoreExecBB);
136 BodyBB->addSuccessor(Succ: LoopBB);
137
138 // Move the rest of the block into a new block.
139 RemainderBB->transferSuccessorsAndUpdatePHIs(FromMBB: &MBB);
140 RemainderBB->splice(Where: RemainderBB->begin(), Other: &MBB, From: Range.end(), To: MBB.end());
141
142 MBB.addSuccessor(Succ: LoopBB);
143 RestoreExecBB->addSuccessor(Succ: RemainderBB);
144
145 B.setInsertPt(MBB&: *LoopBB, II: LoopBB->end());
146
147 // +-MBB:------------+
148 // | ... |
149 // | %0 = G_INST_1 |
150 // | %Dst = MI %Vgpr |
151 // | %1 = G_INST_2 |
152 // | ... |
153 // +-----------------+
154 // ->
155 // +-MBB-------------------------------+
156 // | ... |
157 // | %0 = G_INST_1 |
158 // | %SaveExecReg = S_MOV_B32 $exec_lo |
159 // +----------------|------------------+
160 // | /------------------------------|
161 // V V |
162 // +-LoopBB---------------------------------------------------------------+ |
163 // | %CurrentLaneReg:sgpr(s32) = READFIRSTLANE %Vgpr | |
164 // | instead of executing for each lane, see if other lanes had | |
165 // | same value for %Vgpr and execute for them also. | |
166 // | %CondReg:vcc(s1) = G_ICMP eq %CurrentLaneReg, %Vgpr | |
167 // | %CondRegLM:sreg_32 = ballot %CondReg // copy vcc to sreg32 lane mask | |
168 // | %SavedExec = S_AND_SAVEEXEC_B32 %CondRegLM | |
169 // | exec is active for lanes with the same "CurrentLane value" in Vgpr | |
170 // +----------------|-----------------------------------------------------+ |
171 // V |
172 // +-BodyBB------------------------------------------------------------+ |
173 // | %Dst = MI %CurrentLaneReg:sgpr(s32) | |
174 // | executed only for active lanes and written to Dst | |
175 // | $exec = S_XOR_B32 $exec, %SavedExec | |
176 // | set active lanes to 0 in SavedExec, lanes that did not write to | |
177 // | Dst yet, and set this as new exec (for READFIRSTLANE and ICMP) | |
178 // | SI_WATERFALL_LOOP LoopBB |-----|
179 // +----------------|--------------------------------------------------+
180 // V
181 // +-RestoreExecBB--------------------------+
182 // | $exec_lo = S_MOV_B32_term %SaveExecReg |
183 // +----------------|-----------------------+
184 // V
185 // +-RemainderBB:----------------------+
186 // | %1 = G_INST_2 |
187 // | ... |
188 // +---------------------------------- +
189
190 // Move the instruction into the loop body. Note we moved everything after
191 // Range.end() already into a new block, so Range.end() is no longer valid.
192 BodyBB->splice(Where: BodyBB->end(), Other: &MBB, From: Range.begin(), To: MBB.end());
193
194 // Figure out the iterator range after splicing the instructions.
195 MachineBasicBlock::iterator NewBegin = Range.begin()->getIterator();
196 auto NewEnd = BodyBB->end();
197 assert(std::distance(NewBegin, NewEnd) == OrigRangeSize);
198
199 B.setMBB(*LoopBB);
200 Register CondReg;
201
202 for (MachineInstr &MI : make_range(x: NewBegin, y: NewEnd)) {
203 for (MachineOperand &Op : MI.all_uses()) {
204 Register OldReg = Op.getReg();
205 if (!SGPROperandRegs.count(V: OldReg))
206 continue;
207
208 // See if we already processed this register in another instruction in
209 // the sequence.
210 auto OldVal = WaterfalledRegMap.find(Val: OldReg);
211 if (OldVal != WaterfalledRegMap.end()) {
212 Op.setReg(OldVal->second);
213 continue;
214 }
215
216 Register OpReg = Op.getReg();
217 LLT OpTy = MRI.getType(Reg: OpReg);
218
219 // TODO: support for agpr
220 assert(MRI.getRegBank(OpReg) == VgprRB);
221 Register CurrentLaneReg = MRI.createVirtualRegister(RegAttr: {.RCOrRB: SgprRB, .Ty: OpTy});
222 buildReadFirstLane(B, SgprDst: CurrentLaneReg, VgprSrc: OpReg, RBI);
223
224 // Build the comparison(s), CurrentLaneReg == OpReg.
225 unsigned OpSize = OpTy.getSizeInBits();
226 unsigned PartSize = (OpSize % 64 == 0) ? 64 : 32;
227 LLT PartTy = LLT::scalar(SizeInBits: PartSize);
228 unsigned NumParts = OpSize / PartSize;
229 SmallVector<Register, 8> OpParts;
230 SmallVector<Register, 8> CurrentLaneParts;
231
232 if (NumParts == 1) {
233 OpParts.push_back(Elt: OpReg);
234 CurrentLaneParts.push_back(Elt: CurrentLaneReg);
235 } else {
236 auto UnmergeOp = B.buildUnmerge(Attrs: {.RCOrRB: VgprRB, .Ty: PartTy}, Op: OpReg);
237 auto UnmergeCurrLane = B.buildUnmerge(Attrs: {.RCOrRB: SgprRB, .Ty: PartTy}, Op: CurrentLaneReg);
238 for (unsigned i = 0; i < NumParts; ++i) {
239 OpParts.push_back(Elt: UnmergeOp.getReg(Idx: i));
240 CurrentLaneParts.push_back(Elt: UnmergeCurrLane.getReg(Idx: i));
241 }
242 }
243
244 for (unsigned i = 0; i < NumParts; ++i) {
245 Register CmpReg = MRI.createVirtualRegister(RegAttr: VccRB_S1);
246 B.buildICmp(Pred: CmpInst::ICMP_EQ, Res: CmpReg, Op0: CurrentLaneParts[i], Op1: OpParts[i]);
247
248 if (!CondReg)
249 CondReg = CmpReg;
250 else
251 CondReg = B.buildAnd(Dst: VccRB_S1, Src0: CondReg, Src1: CmpReg).getReg(Idx: 0);
252 }
253
254 Op.setReg(CurrentLaneReg);
255
256 // Make sure we don't re-process this register again.
257 WaterfalledRegMap.insert(KV: std::pair(OldReg, Op.getReg()));
258 }
259 }
260
261 // Copy vcc to sgpr32/64, ballot becomes a no-op during instruction selection.
262 Register CondRegLM =
263 MRI.createVirtualRegister(RegAttr: {.RCOrRB: WaveRC, .Ty: LLT::scalar(SizeInBits: IsWave32 ? 32 : 64)});
264 B.buildIntrinsic(ID: Intrinsic::amdgcn_ballot, Res: CondRegLM).addReg(RegNo: CondReg);
265
266 // Update EXEC, save the original EXEC value to SavedExec.
267 B.buildInstr(Opcode: AndSaveExecOpc)
268 .addDef(RegNo: SavedExec)
269 .addReg(RegNo: CondRegLM, Flags: RegState::Kill);
270 MRI.setSimpleHint(VReg: SavedExec, PrefReg: CondRegLM);
271
272 B.setInsertPt(MBB&: *BodyBB, II: BodyBB->end());
273
274 // Update EXEC, switch all done bits to 0 and all todo bits to 1.
275 B.buildInstr(Opcode: XorTermOpc).addDef(RegNo: ExecReg).addReg(RegNo: ExecReg).addReg(RegNo: SavedExec);
276
277 // XXX - s_xor_b64 sets scc to 1 if the result is nonzero, so can we use
278 // s_cbranch_scc0?
279
280 // Loop back to V_READFIRSTLANE_B32 if there are still variants to cover.
281 B.buildInstr(Opcode: AMDGPU::SI_WATERFALL_LOOP).addMBB(MBB: LoopBB);
282
283 // Save the EXEC mask before the loop.
284 B.setInsertPt(MBB, II: MBB.end());
285 B.buildInstr(Opcode: MovExecOpc).addDef(RegNo: SaveExecReg).addReg(RegNo: ExecReg);
286
287 // Restore the EXEC mask after the loop.
288 B.setInsertPt(MBB&: *RestoreExecBB, II: RestoreExecBB->begin());
289 B.buildInstr(Opcode: MovExecTermOpc).addDef(RegNo: ExecReg).addReg(RegNo: SaveExecReg);
290
291 // Set the insert point after the original instruction, so any new
292 // instructions will be in the remainder.
293 B.setInsertPt(MBB&: *RemainderBB, II: RemainderBB->begin());
294
295 return true;
296}
297
298bool RegBankLegalizeHelper::splitLoad(MachineInstr &MI,
299 ArrayRef<LLT> LLTBreakdown, LLT MergeTy) {
300 MachineFunction &MF = B.getMF();
301 assert(MI.getNumMemOperands() == 1);
302 MachineMemOperand &BaseMMO = **MI.memoperands_begin();
303 Register Dst = MI.getOperand(i: 0).getReg();
304 const RegisterBank *DstRB = MRI.getRegBankOrNull(Reg: Dst);
305 Register Base = MI.getOperand(i: 1).getReg();
306 LLT PtrTy = MRI.getType(Reg: Base);
307 const RegisterBank *PtrRB = MRI.getRegBankOrNull(Reg: Base);
308 LLT OffsetTy = LLT::scalar(SizeInBits: PtrTy.getSizeInBits());
309 SmallVector<Register, 4> LoadPartRegs;
310
311 unsigned ByteOffset = 0;
312 for (LLT PartTy : LLTBreakdown) {
313 Register BasePlusOffset;
314 if (ByteOffset == 0) {
315 BasePlusOffset = Base;
316 } else {
317 auto Offset = B.buildConstant(Res: {PtrRB, OffsetTy}, Val: ByteOffset);
318 BasePlusOffset =
319 B.buildObjectPtrOffset(Res: {PtrRB, PtrTy}, Op0: Base, Op1: Offset).getReg(Idx: 0);
320 }
321 auto *OffsetMMO = MF.getMachineMemOperand(MMO: &BaseMMO, Offset: ByteOffset, Ty: PartTy);
322 auto LoadPart = B.buildLoad(Res: {DstRB, PartTy}, Addr: BasePlusOffset, MMO&: *OffsetMMO);
323 LoadPartRegs.push_back(Elt: LoadPart.getReg(Idx: 0));
324 ByteOffset += PartTy.getSizeInBytes();
325 }
326
327 if (!MergeTy.isValid()) {
328 // Loads are of same size, concat or merge them together.
329 B.buildMergeLikeInstr(Res: Dst, Ops: LoadPartRegs);
330 } else {
331 // Loads are not all of same size, need to unmerge them to smaller pieces
332 // of MergeTy type, then merge pieces to Dst.
333 SmallVector<Register, 4> MergeTyParts;
334 for (Register Reg : LoadPartRegs) {
335 if (MRI.getType(Reg) == MergeTy) {
336 MergeTyParts.push_back(Elt: Reg);
337 } else {
338 auto Unmerge = B.buildUnmerge(Attrs: {.RCOrRB: DstRB, .Ty: MergeTy}, Op: Reg);
339 for (unsigned i = 0; i < Unmerge->getNumOperands() - 1; ++i)
340 MergeTyParts.push_back(Elt: Unmerge.getReg(Idx: i));
341 }
342 }
343 B.buildMergeLikeInstr(Res: Dst, Ops: MergeTyParts);
344 }
345 MI.eraseFromParent();
346 return true;
347}
348
349bool RegBankLegalizeHelper::widenLoad(MachineInstr &MI, LLT WideTy,
350 LLT MergeTy) {
351 MachineFunction &MF = B.getMF();
352 assert(MI.getNumMemOperands() == 1);
353 MachineMemOperand &BaseMMO = **MI.memoperands_begin();
354 Register Dst = MI.getOperand(i: 0).getReg();
355 const RegisterBank *DstRB = MRI.getRegBankOrNull(Reg: Dst);
356 Register Base = MI.getOperand(i: 1).getReg();
357
358 MachineMemOperand *WideMMO = MF.getMachineMemOperand(MMO: &BaseMMO, Offset: 0, Ty: WideTy);
359 auto WideLoad = B.buildLoad(Res: {DstRB, WideTy}, Addr: Base, MMO&: *WideMMO);
360
361 if (WideTy.isScalar()) {
362 B.buildTrunc(Res: Dst, Op: WideLoad);
363 } else {
364 SmallVector<Register, 4> MergeTyParts;
365 auto Unmerge = B.buildUnmerge(Attrs: {.RCOrRB: DstRB, .Ty: MergeTy}, Op: WideLoad);
366
367 LLT DstTy = MRI.getType(Reg: Dst);
368 unsigned NumElts = DstTy.getSizeInBits() / MergeTy.getSizeInBits();
369 for (unsigned i = 0; i < NumElts; ++i) {
370 MergeTyParts.push_back(Elt: Unmerge.getReg(Idx: i));
371 }
372 B.buildMergeLikeInstr(Res: Dst, Ops: MergeTyParts);
373 }
374 MI.eraseFromParent();
375 return true;
376}
377
378bool RegBankLegalizeHelper::widenMMOToS32(GAnyLoad &MI) const {
379 Register Dst = MI.getDstReg();
380 Register Ptr = MI.getPointerReg();
381 MachineMemOperand &MMO = MI.getMMO();
382 unsigned MemSize = 8 * MMO.getSize().getValue();
383
384 MachineMemOperand *WideMMO = B.getMF().getMachineMemOperand(MMO: &MMO, Offset: 0, Ty: S32);
385
386 if (MI.getOpcode() == G_LOAD) {
387 B.buildLoad(Res: Dst, Addr: Ptr, MMO&: *WideMMO);
388 } else {
389 auto Load = B.buildLoad(Res: SgprRB_S32, Addr: Ptr, MMO&: *WideMMO);
390
391 if (MI.getOpcode() == G_ZEXTLOAD) {
392 APInt Mask = APInt::getLowBitsSet(numBits: S32.getSizeInBits(), loBitsSet: MemSize);
393 auto MaskCst = B.buildConstant(Res: SgprRB_S32, Val: Mask);
394 B.buildAnd(Dst, Src0: Load, Src1: MaskCst);
395 } else {
396 assert(MI.getOpcode() == G_SEXTLOAD);
397 B.buildSExtInReg(Res: Dst, Op: Load, ImmOp: MemSize);
398 }
399 }
400
401 MI.eraseFromParent();
402 return true;
403}
404
405bool RegBankLegalizeHelper::lowerVccExtToSel(MachineInstr &MI) {
406 Register Dst = MI.getOperand(i: 0).getReg();
407 LLT Ty = MRI.getType(Reg: Dst);
408 Register Src = MI.getOperand(i: 1).getReg();
409 unsigned Opc = MI.getOpcode();
410 int TrueExtCst = Opc == G_SEXT ? -1 : 1;
411 if (Ty == S32 || Ty == S16) {
412 auto True = B.buildConstant(Res: {VgprRB, Ty}, Val: TrueExtCst);
413 auto False = B.buildConstant(Res: {VgprRB, Ty}, Val: 0);
414 B.buildSelect(Res: Dst, Tst: Src, Op0: True, Op1: False);
415 } else if (Ty == S64) {
416 auto True = B.buildConstant(Res: {VgprRB_S32}, Val: TrueExtCst);
417 auto False = B.buildConstant(Res: {VgprRB_S32}, Val: 0);
418 auto Lo = B.buildSelect(Res: {VgprRB_S32}, Tst: Src, Op0: True, Op1: False);
419 MachineInstrBuilder Hi;
420 switch (Opc) {
421 case G_SEXT:
422 Hi = Lo;
423 break;
424 case G_ZEXT:
425 Hi = False;
426 break;
427 case G_ANYEXT:
428 Hi = B.buildUndef(Res: {VgprRB_S32});
429 break;
430 default:
431 reportGISelFailure(
432 MF, MORE, PassName: "amdgpu-regbanklegalize",
433 Msg: "AMDGPU RegBankLegalize: lowerVccExtToSel, Opcode not supported", MI);
434 return false;
435 }
436
437 B.buildMergeValues(Res: Dst, Ops: {Lo.getReg(Idx: 0), Hi.getReg(Idx: 0)});
438 } else {
439 reportGISelFailure(
440 MF, MORE, PassName: "amdgpu-regbanklegalize",
441 Msg: "AMDGPU RegBankLegalize: lowerVccExtToSel, Type not supported", MI);
442 return false;
443 }
444
445 MI.eraseFromParent();
446 return true;
447}
448
449std::pair<Register, Register> RegBankLegalizeHelper::unpackZExt(Register Reg) {
450 auto PackedS32 = B.buildBitcast(Dst: SgprRB_S32, Src: Reg);
451 auto Mask = B.buildConstant(Res: SgprRB_S32, Val: 0x0000ffff);
452 auto Lo = B.buildAnd(Dst: SgprRB_S32, Src0: PackedS32, Src1: Mask);
453 auto Hi = B.buildLShr(Dst: SgprRB_S32, Src0: PackedS32, Src1: B.buildConstant(Res: SgprRB_S32, Val: 16));
454 return {Lo.getReg(Idx: 0), Hi.getReg(Idx: 0)};
455}
456
457std::pair<Register, Register> RegBankLegalizeHelper::unpackSExt(Register Reg) {
458 auto PackedS32 = B.buildBitcast(Dst: SgprRB_S32, Src: Reg);
459 auto Lo = B.buildSExtInReg(Res: SgprRB_S32, Op: PackedS32, ImmOp: 16);
460 auto Hi = B.buildAShr(Dst: SgprRB_S32, Src0: PackedS32, Src1: B.buildConstant(Res: SgprRB_S32, Val: 16));
461 return {Lo.getReg(Idx: 0), Hi.getReg(Idx: 0)};
462}
463
464std::pair<Register, Register> RegBankLegalizeHelper::unpackAExt(Register Reg) {
465 auto PackedS32 = B.buildBitcast(Dst: SgprRB_S32, Src: Reg);
466 auto Lo = PackedS32;
467 auto Hi = B.buildLShr(Dst: SgprRB_S32, Src0: PackedS32, Src1: B.buildConstant(Res: SgprRB_S32, Val: 16));
468 return {Lo.getReg(Idx: 0), Hi.getReg(Idx: 0)};
469}
470
471std::pair<Register, Register>
472RegBankLegalizeHelper::unpackAExtTruncS16(Register Reg) {
473 auto [Lo32, Hi32] = unpackAExt(Reg);
474 return {B.buildTrunc(Res: SgprRB_S16, Op: Lo32).getReg(Idx: 0),
475 B.buildTrunc(Res: SgprRB_S16, Op: Hi32).getReg(Idx: 0)};
476}
477
478bool RegBankLegalizeHelper::lowerUnpackBitShift(MachineInstr &MI) {
479 Register Lo, Hi;
480 switch (MI.getOpcode()) {
481 case AMDGPU::G_SHL: {
482 auto [Val0, Val1] = unpackAExt(Reg: MI.getOperand(i: 1).getReg());
483 auto [Amt0, Amt1] = unpackAExt(Reg: MI.getOperand(i: 2).getReg());
484 Lo = B.buildInstr(Opc: MI.getOpcode(), DstOps: {SgprRB_S32}, SrcOps: {Val0, Amt0}).getReg(Idx: 0);
485 Hi = B.buildInstr(Opc: MI.getOpcode(), DstOps: {SgprRB_S32}, SrcOps: {Val1, Amt1}).getReg(Idx: 0);
486 break;
487 }
488 case AMDGPU::G_LSHR: {
489 auto [Val0, Val1] = unpackZExt(Reg: MI.getOperand(i: 1).getReg());
490 auto [Amt0, Amt1] = unpackZExt(Reg: MI.getOperand(i: 2).getReg());
491 Lo = B.buildInstr(Opc: MI.getOpcode(), DstOps: {SgprRB_S32}, SrcOps: {Val0, Amt0}).getReg(Idx: 0);
492 Hi = B.buildInstr(Opc: MI.getOpcode(), DstOps: {SgprRB_S32}, SrcOps: {Val1, Amt1}).getReg(Idx: 0);
493 break;
494 }
495 case AMDGPU::G_ASHR: {
496 auto [Val0, Val1] = unpackSExt(Reg: MI.getOperand(i: 1).getReg());
497 auto [Amt0, Amt1] = unpackSExt(Reg: MI.getOperand(i: 2).getReg());
498 Lo = B.buildAShr(Dst: SgprRB_S32, Src0: Val0, Src1: Amt0).getReg(Idx: 0);
499 Hi = B.buildAShr(Dst: SgprRB_S32, Src0: Val1, Src1: Amt1).getReg(Idx: 0);
500 break;
501 }
502 default:
503 reportGISelFailure(
504 MF, MORE, PassName: "amdgpu-regbanklegalize",
505 Msg: "AMDGPU RegBankLegalize: lowerUnpackBitShift, case not implemented",
506 MI);
507 return false;
508 }
509 B.buildBuildVectorTrunc(Res: MI.getOperand(i: 0).getReg(), Ops: {Lo, Hi});
510 MI.eraseFromParent();
511 return true;
512}
513
514bool RegBankLegalizeHelper::lowerUnpackMinMax(MachineInstr &MI) {
515 Register Lo, Hi;
516 switch (MI.getOpcode()) {
517 case AMDGPU::G_SMIN:
518 case AMDGPU::G_SMAX: {
519 // For signed operations, use sign extension
520 auto [Val0_Lo, Val0_Hi] = unpackSExt(Reg: MI.getOperand(i: 1).getReg());
521 auto [Val1_Lo, Val1_Hi] = unpackSExt(Reg: MI.getOperand(i: 2).getReg());
522 Lo = B.buildInstr(Opc: MI.getOpcode(), DstOps: {SgprRB_S32}, SrcOps: {Val0_Lo, Val1_Lo})
523 .getReg(Idx: 0);
524 Hi = B.buildInstr(Opc: MI.getOpcode(), DstOps: {SgprRB_S32}, SrcOps: {Val0_Hi, Val1_Hi})
525 .getReg(Idx: 0);
526 break;
527 }
528 case AMDGPU::G_UMIN:
529 case AMDGPU::G_UMAX: {
530 // For unsigned operations, use zero extension
531 auto [Val0_Lo, Val0_Hi] = unpackZExt(Reg: MI.getOperand(i: 1).getReg());
532 auto [Val1_Lo, Val1_Hi] = unpackZExt(Reg: MI.getOperand(i: 2).getReg());
533 Lo = B.buildInstr(Opc: MI.getOpcode(), DstOps: {SgprRB_S32}, SrcOps: {Val0_Lo, Val1_Lo})
534 .getReg(Idx: 0);
535 Hi = B.buildInstr(Opc: MI.getOpcode(), DstOps: {SgprRB_S32}, SrcOps: {Val0_Hi, Val1_Hi})
536 .getReg(Idx: 0);
537 break;
538 }
539 default:
540 reportGISelFailure(
541 MF, MORE, PassName: "amdgpu-regbanklegalize",
542 Msg: "AMDGPU RegBankLegalize: lowerUnpackMinMax, case not implemented", MI);
543 return false;
544 }
545 B.buildBuildVectorTrunc(Res: MI.getOperand(i: 0).getReg(), Ops: {Lo, Hi});
546 MI.eraseFromParent();
547 return true;
548}
549
550bool RegBankLegalizeHelper::lowerUnpackAExt(MachineInstr &MI) {
551 auto [Op1Lo, Op1Hi] = unpackAExt(Reg: MI.getOperand(i: 1).getReg());
552 auto [Op2Lo, Op2Hi] = unpackAExt(Reg: MI.getOperand(i: 2).getReg());
553 auto ResLo = B.buildInstr(Opc: MI.getOpcode(), DstOps: {SgprRB_S32}, SrcOps: {Op1Lo, Op2Lo});
554 auto ResHi = B.buildInstr(Opc: MI.getOpcode(), DstOps: {SgprRB_S32}, SrcOps: {Op1Hi, Op2Hi});
555 B.buildBuildVectorTrunc(Res: MI.getOperand(i: 0).getReg(),
556 Ops: {ResLo.getReg(Idx: 0), ResHi.getReg(Idx: 0)});
557 MI.eraseFromParent();
558 return true;
559}
560
561static bool isSignedBFE(MachineInstr &MI) {
562 if (GIntrinsic *GI = dyn_cast<GIntrinsic>(Val: &MI))
563 return (GI->is(ID: Intrinsic::amdgcn_sbfe));
564
565 return MI.getOpcode() == AMDGPU::G_SBFX;
566}
567
568bool RegBankLegalizeHelper::lowerV_BFE(MachineInstr &MI) {
569 Register Dst = MI.getOperand(i: 0).getReg();
570 assert(MRI.getType(Dst) == LLT::scalar(64));
571 bool Signed = isSignedBFE(MI);
572 unsigned FirstOpnd = isa<GIntrinsic>(Val: MI) ? 2 : 1;
573 // Extract bitfield from Src, LSBit is the least-significant bit for the
574 // extraction (field offset) and Width is size of bitfield.
575 Register Src = MI.getOperand(i: FirstOpnd).getReg();
576 Register LSBit = MI.getOperand(i: FirstOpnd + 1).getReg();
577 Register Width = MI.getOperand(i: FirstOpnd + 2).getReg();
578 // Comments are for signed bitfield extract, similar for unsigned. x is sign
579 // bit. s is sign, l is LSB and y are remaining bits of bitfield to extract.
580
581 // Src >> LSBit Hi|Lo: x?????syyyyyyl??? -> xxxx?????syyyyyyl
582 unsigned SHROpc = Signed ? AMDGPU::G_ASHR : AMDGPU::G_LSHR;
583 auto SHRSrc = B.buildInstr(Opc: SHROpc, DstOps: {{VgprRB, S64}}, SrcOps: {Src, LSBit});
584
585 auto ConstWidth = getIConstantVRegValWithLookThrough(VReg: Width, MRI);
586
587 // Expand to Src >> LSBit << (64 - Width) >> (64 - Width)
588 // << (64 - Width): Hi|Lo: xxxx?????syyyyyyl -> syyyyyyl000000000
589 // >> (64 - Width): Hi|Lo: syyyyyyl000000000 -> ssssssssssyyyyyyl
590 if (!ConstWidth) {
591 auto Amt = B.buildSub(Dst: VgprRB_S32, Src0: B.buildConstant(Res: SgprRB_S32, Val: 64), Src1: Width);
592 auto SignBit = B.buildShl(Dst: {VgprRB, S64}, Src0: SHRSrc, Src1: Amt);
593 B.buildInstr(Opc: SHROpc, DstOps: {Dst}, SrcOps: {SignBit, Amt});
594 MI.eraseFromParent();
595 return true;
596 }
597
598 uint64_t WidthImm = ConstWidth->Value.getZExtValue();
599 auto UnmergeSHRSrc = B.buildUnmerge(Attrs: VgprRB_S32, Op: SHRSrc);
600 Register SHRSrcLo = UnmergeSHRSrc.getReg(Idx: 0);
601 Register SHRSrcHi = UnmergeSHRSrc.getReg(Idx: 1);
602 auto Zero = B.buildConstant(Res: {VgprRB, S32}, Val: 0);
603 unsigned BFXOpc = Signed ? AMDGPU::G_SBFX : AMDGPU::G_UBFX;
604
605 if (WidthImm <= 32) {
606 // SHRSrc Hi|Lo: ????????|???syyyl -> ????????|ssssyyyl
607 auto Lo = B.buildInstr(Opc: BFXOpc, DstOps: {VgprRB_S32}, SrcOps: {SHRSrcLo, Zero, Width});
608 MachineInstrBuilder Hi;
609 if (Signed) {
610 // SHRSrc Hi|Lo: ????????|ssssyyyl -> ssssssss|ssssyyyl
611 Hi = B.buildAShr(Dst: VgprRB_S32, Src0: Lo, Src1: B.buildConstant(Res: VgprRB_S32, Val: 31));
612 } else {
613 // SHRSrc Hi|Lo: ????????|000syyyl -> 00000000|000syyyl
614 Hi = Zero;
615 }
616 B.buildMergeLikeInstr(Res: Dst, Ops: {Lo, Hi});
617 } else {
618 auto Amt = B.buildConstant(Res: VgprRB_S32, Val: WidthImm - 32);
619 // SHRSrc Hi|Lo: ??????sy|yyyyyyyl -> sssssssy|yyyyyyyl
620 auto Hi = B.buildInstr(Opc: BFXOpc, DstOps: {VgprRB_S32}, SrcOps: {SHRSrcHi, Zero, Amt});
621 B.buildMergeLikeInstr(Res: Dst, Ops: {SHRSrcLo, Hi});
622 }
623
624 MI.eraseFromParent();
625 return true;
626}
627
628bool RegBankLegalizeHelper::lowerS_BFE(MachineInstr &MI) {
629 Register DstReg = MI.getOperand(i: 0).getReg();
630 LLT Ty = MRI.getType(Reg: DstReg);
631 bool Signed = isSignedBFE(MI);
632 unsigned FirstOpnd = isa<GIntrinsic>(Val: MI) ? 2 : 1;
633 Register Src = MI.getOperand(i: FirstOpnd).getReg();
634 Register LSBit = MI.getOperand(i: FirstOpnd + 1).getReg();
635 Register Width = MI.getOperand(i: FirstOpnd + 2).getReg();
636 // For uniform bit field extract there are 4 available instructions, but
637 // LSBit(field offset) and Width(size of bitfield) need to be packed in S32,
638 // field offset in low and size in high 16 bits.
639
640 // Src1 Hi16|Lo16 = Size|FieldOffset
641 auto Mask = B.buildConstant(Res: SgprRB_S32, Val: maskTrailingOnes<unsigned>(N: 6));
642 auto FieldOffset = B.buildAnd(Dst: SgprRB_S32, Src0: LSBit, Src1: Mask);
643 auto Size = B.buildShl(Dst: SgprRB_S32, Src0: Width, Src1: B.buildConstant(Res: SgprRB_S32, Val: 16));
644 auto Src1 = B.buildOr(Dst: SgprRB_S32, Src0: FieldOffset, Src1: Size);
645 unsigned Opc32 = Signed ? AMDGPU::S_BFE_I32 : AMDGPU::S_BFE_U32;
646 unsigned Opc64 = Signed ? AMDGPU::S_BFE_I64 : AMDGPU::S_BFE_U64;
647 unsigned Opc = Ty == S32 ? Opc32 : Opc64;
648
649 // Select machine instruction, because of reg class constraining, insert
650 // copies from reg class to reg bank.
651 auto S_BFE = B.buildInstr(Opc, DstOps: {{SgprRB, Ty}},
652 SrcOps: {B.buildCopy(Res: Ty, Op: Src), B.buildCopy(Res: S32, Op: Src1)});
653 if (!constrainSelectedInstRegOperands(I&: *S_BFE, TII: *ST.getInstrInfo(),
654 TRI: *ST.getRegisterInfo(), RBI)) {
655 reportGISelFailure(
656 MF, MORE, PassName: "amdgpu-regbanklegalize",
657 Msg: "AMDGPU RegBankLegalize: lowerS_BFE, failed to constrain BFE", MI);
658 return false;
659 }
660
661 B.buildCopy(Res: DstReg, Op: S_BFE->getOperand(i: 0).getReg());
662 MI.eraseFromParent();
663 return true;
664}
665
666bool RegBankLegalizeHelper::lowerSplitTo32(MachineInstr &MI) {
667 Register Dst = MI.getOperand(i: 0).getReg();
668 LLT DstTy = MRI.getType(Reg: Dst);
669 assert(DstTy == V4S16 || DstTy == V2S32 || DstTy == S64);
670 LLT Ty = DstTy == V4S16 ? V2S16 : S32;
671 auto Op1 = B.buildUnmerge(Attrs: {.RCOrRB: VgprRB, .Ty: Ty}, Op: MI.getOperand(i: 1).getReg());
672 auto Op2 = B.buildUnmerge(Attrs: {.RCOrRB: VgprRB, .Ty: Ty}, Op: MI.getOperand(i: 2).getReg());
673 unsigned Opc = MI.getOpcode();
674 auto Flags = MI.getFlags();
675 auto Lo =
676 B.buildInstr(Opc, DstOps: {{VgprRB, Ty}}, SrcOps: {Op1.getReg(Idx: 0), Op2.getReg(Idx: 0)}, Flags);
677 auto Hi =
678 B.buildInstr(Opc, DstOps: {{VgprRB, Ty}}, SrcOps: {Op1.getReg(Idx: 1), Op2.getReg(Idx: 1)}, Flags);
679 B.buildMergeLikeInstr(Res: Dst, Ops: {Lo, Hi});
680 MI.eraseFromParent();
681 return true;
682}
683
684bool RegBankLegalizeHelper::lowerSplitTo32Mul(MachineInstr &MI) {
685 Register Dst = MI.getOperand(i: 0).getReg();
686 assert(MRI.getType(Dst) == S64);
687 auto Op1 = B.buildUnmerge(Attrs: {VgprRB_S32}, Op: MI.getOperand(i: 1).getReg());
688 auto Op2 = B.buildUnmerge(Attrs: {VgprRB_S32}, Op: MI.getOperand(i: 2).getReg());
689
690 // TODO: G_AMDGPU_MAD_* optimizations for G_MUL divergent S64 operation to
691 // match GlobalISel with old regbankselect.
692 auto Lo = B.buildMul(Dst: VgprRB_S32, Src0: Op1.getReg(Idx: 0), Src1: Op2.getReg(Idx: 0));
693 auto Carry = B.buildUMulH(Dst: VgprRB_S32, Src0: Op1.getReg(Idx: 0), Src1: Op2.getReg(Idx: 0));
694 auto MulLo0Hi1 = B.buildMul(Dst: VgprRB_S32, Src0: Op1.getReg(Idx: 0), Src1: Op2.getReg(Idx: 1));
695 auto MulHi0Lo1 = B.buildMul(Dst: VgprRB_S32, Src0: Op1.getReg(Idx: 1), Src1: Op2.getReg(Idx: 0));
696 auto Sum = B.buildAdd(Dst: VgprRB_S32, Src0: MulLo0Hi1, Src1: MulHi0Lo1);
697 auto Hi = B.buildAdd(Dst: VgprRB_S32, Src0: Sum, Src1: Carry);
698
699 B.buildMergeLikeInstr(Res: Dst, Ops: {Lo, Hi});
700 MI.eraseFromParent();
701 return true;
702}
703
704bool RegBankLegalizeHelper::lowerSplitTo16(MachineInstr &MI) {
705 Register Dst = MI.getOperand(i: 0).getReg();
706 assert(MRI.getType(Dst) == V2S16);
707 unsigned Opc = MI.getOpcode();
708 unsigned NumOps = MI.getNumOperands();
709 auto Flags = MI.getFlags();
710
711 auto [Op1Lo, Op1Hi] = unpackAExtTruncS16(Reg: MI.getOperand(i: 1).getReg());
712
713 if (NumOps == 2) {
714 auto Lo = B.buildInstr(Opc, DstOps: {SgprRB_S16}, SrcOps: {Op1Lo}, Flags);
715 auto Hi = B.buildInstr(Opc, DstOps: {SgprRB_S16}, SrcOps: {Op1Hi}, Flags);
716 B.buildMergeLikeInstr(Res: Dst, Ops: {Lo, Hi});
717 MI.eraseFromParent();
718 return true;
719 }
720
721 auto [Op2Lo, Op2Hi] = unpackAExtTruncS16(Reg: MI.getOperand(i: 2).getReg());
722
723 if (NumOps == 3) {
724 auto Lo = B.buildInstr(Opc, DstOps: {SgprRB_S16}, SrcOps: {Op1Lo, Op2Lo}, Flags);
725 auto Hi = B.buildInstr(Opc, DstOps: {SgprRB_S16}, SrcOps: {Op1Hi, Op2Hi}, Flags);
726 B.buildMergeLikeInstr(Res: Dst, Ops: {Lo, Hi});
727 MI.eraseFromParent();
728 return true;
729 }
730
731 assert(NumOps == 4);
732 auto [Op3Lo, Op3Hi] = unpackAExtTruncS16(Reg: MI.getOperand(i: 3).getReg());
733 auto Lo = B.buildInstr(Opc, DstOps: {SgprRB_S16}, SrcOps: {Op1Lo, Op2Lo, Op3Lo}, Flags);
734 auto Hi = B.buildInstr(Opc, DstOps: {SgprRB_S16}, SrcOps: {Op1Hi, Op2Hi, Op3Hi}, Flags);
735 B.buildMergeLikeInstr(Res: Dst, Ops: {Lo, Hi});
736 MI.eraseFromParent();
737 return true;
738}
739
740bool RegBankLegalizeHelper::lowerUniMAD64(MachineInstr &MI) {
741 Register Dst0 = MI.getOperand(i: 0).getReg();
742 Register Dst1 = MI.getOperand(i: 1).getReg();
743 Register Src0 = MI.getOperand(i: 2).getReg();
744 Register Src1 = MI.getOperand(i: 3).getReg();
745 Register Src2 = MI.getOperand(i: 4).getReg();
746
747 const GCNSubtarget &ST = B.getMF().getSubtarget<GCNSubtarget>();
748
749 // Keep the multiplication on the SALU.
750 Register DstLo = B.buildMul(Dst: SgprRB_S32, Src0, Src1).getReg(Idx: 0);
751 Register DstHi = MRI.createVirtualRegister(RegAttr: SgprRB_S32);
752 if (ST.hasScalarMulHiInsts()) {
753 B.buildInstr(Opc: AMDGPU::G_UMULH, DstOps: {{DstHi}}, SrcOps: {Src0, Src1});
754 } else {
755 auto VSrc0 = B.buildCopy(Res: VgprRB_S32, Op: Src0);
756 auto VSrc1 = B.buildCopy(Res: VgprRB_S32, Op: Src1);
757 auto MulHi = B.buildInstr(Opc: AMDGPU::G_UMULH, DstOps: {VgprRB_S32}, SrcOps: {VSrc0, VSrc1});
758 buildReadAnyLane(B, SgprDst: DstHi, VgprSrc: MulHi.getReg(Idx: 0), RBI);
759 }
760
761 // Accumulate and produce the "carry-out" bit.
762
763 // The "carry-out" is defined as bit 64 of the result when computed as a
764 // big integer. For unsigned multiply-add, this matches the usual
765 // definition of carry-out.
766 if (mi_match(R: Src2, MRI, P: MIPatternMatch::m_ZeroInt())) {
767 // No accumulate: result is just the multiplication, carry is 0.
768 B.buildMergeLikeInstr(Res: Dst0, Ops: {DstLo, DstHi});
769 B.buildConstant(Res: Dst1, Val: 0);
770 } else {
771 // Accumulate: add Src2 to the multiplication result with carry chain.
772 Register Src2Lo = MRI.createVirtualRegister(RegAttr: SgprRB_S32);
773 Register Src2Hi = MRI.createVirtualRegister(RegAttr: SgprRB_S32);
774 B.buildUnmerge(Res: {Src2Lo, Src2Hi}, Op: Src2);
775
776 auto AddLo = B.buildUAddo(Res: SgprRB_S32, CarryOut: SgprRB_S32, Op0: DstLo, Op1: Src2Lo);
777 auto AddHi =
778 B.buildUAdde(Res: SgprRB_S32, CarryOut: SgprRB_S32, Op0: DstHi, Op1: Src2Hi, CarryIn: AddLo.getReg(Idx: 1));
779 B.buildMergeLikeInstr(Res: Dst0, Ops: {AddLo.getReg(Idx: 0), AddHi.getReg(Idx: 0)});
780 B.buildCopy(Res: Dst1, Op: AddHi.getReg(Idx: 1));
781 }
782
783 MI.eraseFromParent();
784 return true;
785}
786
787bool RegBankLegalizeHelper::lowerSplitTo32Select(MachineInstr &MI) {
788 Register Dst = MI.getOperand(i: 0).getReg();
789 LLT DstTy = MRI.getType(Reg: Dst);
790 assert(DstTy == V4S16 || DstTy == V2S32 || DstTy == S64 ||
791 (DstTy.isPointer() && DstTy.getSizeInBits() == 64));
792 LLT Ty = DstTy == V4S16 ? V2S16 : S32;
793 auto Op2 = B.buildUnmerge(Attrs: {.RCOrRB: VgprRB, .Ty: Ty}, Op: MI.getOperand(i: 2).getReg());
794 auto Op3 = B.buildUnmerge(Attrs: {.RCOrRB: VgprRB, .Ty: Ty}, Op: MI.getOperand(i: 3).getReg());
795 Register Cond = MI.getOperand(i: 1).getReg();
796 auto Flags = MI.getFlags();
797 auto Lo =
798 B.buildSelect(Res: {VgprRB, Ty}, Tst: Cond, Op0: Op2.getReg(Idx: 0), Op1: Op3.getReg(Idx: 0), Flags);
799 auto Hi =
800 B.buildSelect(Res: {VgprRB, Ty}, Tst: Cond, Op0: Op2.getReg(Idx: 1), Op1: Op3.getReg(Idx: 1), Flags);
801
802 B.buildMergeLikeInstr(Res: Dst, Ops: {Lo, Hi});
803 MI.eraseFromParent();
804 return true;
805}
806
807bool RegBankLegalizeHelper::lowerSplitTo32SExtInReg(MachineInstr &MI) {
808 auto Op1 = B.buildUnmerge(Attrs: VgprRB_S32, Op: MI.getOperand(i: 1).getReg());
809 int Amt = MI.getOperand(i: 2).getImm();
810 Register Lo, Hi;
811 // Hi|Lo: s sign bit, ?/x bits changed/not changed by sign-extend
812 if (Amt <= 32) {
813 auto Freeze = B.buildFreeze(Dst: VgprRB_S32, Src: Op1.getReg(Idx: 0));
814 if (Amt == 32) {
815 // Hi|Lo: ????????|sxxxxxxx -> ssssssss|sxxxxxxx
816 Lo = Freeze.getReg(Idx: 0);
817 } else {
818 // Hi|Lo: ????????|???sxxxx -> ssssssss|ssssxxxx
819 Lo = B.buildSExtInReg(Res: VgprRB_S32, Op: Freeze, ImmOp: Amt).getReg(Idx: 0);
820 }
821
822 auto SignExtCst = B.buildConstant(Res: SgprRB_S32, Val: 31);
823 Hi = B.buildAShr(Dst: VgprRB_S32, Src0: Lo, Src1: SignExtCst).getReg(Idx: 0);
824 } else {
825 // Hi|Lo: ?????sxx|xxxxxxxx -> ssssssxx|xxxxxxxx
826 Lo = Op1.getReg(Idx: 0);
827 Hi = B.buildSExtInReg(Res: VgprRB_S32, Op: Op1.getReg(Idx: 1), ImmOp: Amt - 32).getReg(Idx: 0);
828 }
829
830 B.buildMergeLikeInstr(Res: MI.getOperand(i: 0).getReg(), Ops: {Lo, Hi});
831 MI.eraseFromParent();
832 return true;
833}
834
// Apply the lowering method selected by the matched rule. Each case either
// rewrites MI in place (erasing it) or dispatches to a dedicated lower*
// helper; unsupported shapes report a GlobalISel failure and return false.
// If applyMappingSrc collected waterfall sgpr operands, the instruction is
// wrapped in a waterfall loop at the end.
bool RegBankLegalizeHelper::lower(MachineInstr &MI,
                                  const RegBankLLTMapping &Mapping,
                                  SmallSet<Register, 4> &WaterfallSgprs) {

  switch (Mapping.LoweringMethod) {
  case DoNotLower:
    break;
  case VccExtToSel:
    return lowerVccExtToSel(MI);
  case UniExtToSel: {
    LLT Ty = MRI.getType(Reg: MI.getOperand(i: 0).getReg());
    // G_SEXT of true yields all-ones (-1); G_ZEXT/G_ANYEXT yield 1.
    auto True = B.buildConstant(Res: {SgprRB, Ty},
                                Val: MI.getOpcode() == AMDGPU::G_SEXT ? -1 : 1);
    auto False = B.buildConstant(Res: {SgprRB, Ty}, Val: 0);
    // Input to G_{Z|S}EXT is 'Legalizer legal' S1. Most common case is compare.
    // We are making select here. S1 cond was already 'any-extended to S32' +
    // 'AND with 1 to clean high bits' by Sgpr32AExtBoolInReg.
    B.buildSelect(Res: MI.getOperand(i: 0).getReg(), Tst: MI.getOperand(i: 1).getReg(), Op0: True,
                  Op1: False);
    MI.eraseFromParent();
    return true;
  }
  case UnpackBitShift:
    return lowerUnpackBitShift(MI);
  case UnpackMinMax:
    return lowerUnpackMinMax(MI);
  case ScalarizeToS16:
    return lowerSplitTo16(MI);
  case Ext32To64: {
    // Extend a 32-bit value to 64 bits: build the high 32 bits according to
    // the extend kind and merge with the (unchanged) low half.
    const RegisterBank *RB = MRI.getRegBank(Reg: MI.getOperand(i: 0).getReg());
    MachineInstrBuilder Hi;
    switch (MI.getOpcode()) {
    case AMDGPU::G_ZEXT: {
      Hi = B.buildConstant(Res: {RB, S32}, Val: 0);
      break;
    }
    case AMDGPU::G_SEXT: {
      // Replicate sign bit from 32-bit extended part.
      auto ShiftAmt = B.buildConstant(Res: {RB, S32}, Val: 31);
      Hi = B.buildAShr(Dst: {RB, S32}, Src0: MI.getOperand(i: 1).getReg(), Src1: ShiftAmt);
      break;
    }
    case AMDGPU::G_ANYEXT: {
      Hi = B.buildUndef(Res: {RB, S32});
      break;
    }
    default:
      reportGISelFailure(MF, MORE, PassName: "amdgpu-regbanklegalize",
                         Msg: "AMDGPU RegBankLegalize: Ext32To64, unsuported opcode",
                         MI);
      return false;
    }

    B.buildMergeLikeInstr(Res: MI.getOperand(i: 0).getReg(),
                          Ops: {MI.getOperand(i: 1).getReg(), Hi});
    MI.eraseFromParent();
    return true;
  }
  case UniCstExt: {
    // Re-materialize the constant at the destination type instead of
    // extending it.
    uint64_t ConstVal = MI.getOperand(i: 1).getCImm()->getZExtValue();
    B.buildConstant(Res: MI.getOperand(i: 0).getReg(), Val: ConstVal);

    MI.eraseFromParent();
    return true;
  }
  case VgprToVccCopy: {
    Register Src = MI.getOperand(i: 1).getReg();
    LLT Ty = MRI.getType(Reg: Src);
    // Take lowest bit from each lane and put it in lane mask.
    // Lowering via compare, but we need to clean high bits first as compare
    // compares all bits in register.
    Register BoolSrc = MRI.createVirtualRegister(RegAttr: {.RCOrRB: VgprRB, .Ty: Ty});
    if (Ty == S64) {
      // Only bit 0 of the low half can be set; the high half is zeroed.
      auto Src64 = B.buildUnmerge(Attrs: VgprRB_S32, Op: Src);
      auto One = B.buildConstant(Res: VgprRB_S32, Val: 1);
      auto AndLo = B.buildAnd(Dst: VgprRB_S32, Src0: Src64.getReg(Idx: 0), Src1: One);
      auto Zero = B.buildConstant(Res: VgprRB_S32, Val: 0);
      auto AndHi = B.buildAnd(Dst: VgprRB_S32, Src0: Src64.getReg(Idx: 1), Src1: Zero);
      B.buildMergeLikeInstr(Res: BoolSrc, Ops: {AndLo, AndHi});
    } else {
      assert(Ty == S32 || Ty == S16);
      auto One = B.buildConstant(Res: {VgprRB, Ty}, Val: 1);
      B.buildAnd(Dst: BoolSrc, Src0: Src, Src1: One);
    }
    auto Zero = B.buildConstant(Res: {VgprRB, Ty}, Val: 0);
    B.buildICmp(Pred: CmpInst::ICMP_NE, Res: MI.getOperand(i: 0).getReg(), Op0: BoolSrc, Op1: Zero);
    MI.eraseFromParent();
    return true;
  }
  case V_BFE:
    return lowerV_BFE(MI);
  case S_BFE:
    return lowerS_BFE(MI);
  case UniMAD64:
    return lowerUniMAD64(MI);
  case UniMul64: {
    B.buildMul(Dst: MI.getOperand(i: 0), Src0: MI.getOperand(i: 1), Src1: MI.getOperand(i: 2));
    MI.eraseFromParent();
    return true;
  }
  case DivSMulToMAD: {
    // Rewrite a divergent 64 = 32 x 32 multiply as MAD with zero addend.
    auto Op1 = B.buildTrunc(Res: VgprRB_S32, Op: MI.getOperand(i: 1));
    auto Op2 = B.buildTrunc(Res: VgprRB_S32, Op: MI.getOperand(i: 2));
    auto Zero = B.buildConstant(Res: {VgprRB, S64}, Val: 0);

    unsigned NewOpc = MI.getOpcode() == AMDGPU::G_AMDGPU_S_MUL_U64_U32
                          ? AMDGPU::G_AMDGPU_MAD_U64_U32
                          : AMDGPU::G_AMDGPU_MAD_I64_I32;

    B.buildInstr(Opc: NewOpc, DstOps: {MI.getOperand(i: 0).getReg(), {SgprRB, S32}},
                 SrcOps: {Op1, Op2, Zero});
    MI.eraseFromParent();
    return true;
  }
  case SplitTo32:
    return lowerSplitTo32(MI);
  case SplitTo32Mul:
    return lowerSplitTo32Mul(MI);
  case SplitTo32Select:
    return lowerSplitTo32Select(MI);
  case SplitTo32SExtInReg:
    return lowerSplitTo32SExtInReg(MI);
  case SplitLoad: {
    LLT DstTy = MRI.getType(Reg: MI.getOperand(i: 0).getReg());
    unsigned Size = DstTy.getSizeInBits();
    // Even split to 128-bit loads
    if (Size > 128) {
      LLT B128;
      if (DstTy.isVector()) {
        LLT EltTy = DstTy.getElementType();
        B128 = LLT::fixed_vector(NumElements: 128 / EltTy.getSizeInBits(), ScalarTy: EltTy);
      } else {
        B128 = LLT::scalar(SizeInBits: 128);
      }
      if (Size / 128 == 2)
        splitLoad(MI, LLTBreakdown: {B128, B128});
      else if (Size / 128 == 4)
        splitLoad(MI, LLTBreakdown: {B128, B128, B128, B128});
      else {
        reportGISelFailure(MF, MORE, PassName: "amdgpu-regbanklegalize",
                           Msg: "AMDGPU RegBankLegalize: SplitLoad, unsuported type",
                           MI);
        return false;
      }
    }
    // 64 and 32 bit load
    else if (DstTy == S96)
      splitLoad(MI, LLTBreakdown: {S64, S32}, MergeTy: S32);
    else if (DstTy == V3S32)
      splitLoad(MI, LLTBreakdown: {V2S32, S32}, MergeTy: S32);
    else if (DstTy == V6S16)
      splitLoad(MI, LLTBreakdown: {V4S16, V2S16}, MergeTy: V2S16);
    else {
      reportGISelFailure(MF, MORE, PassName: "amdgpu-regbanklegalize",
                         Msg: "AMDGPU RegBankLegalize: SplitLoad, unsuported type",
                         MI);
      return false;
    }
    return true;
  }
  case WidenLoad: {
    LLT DstTy = MRI.getType(Reg: MI.getOperand(i: 0).getReg());
    if (DstTy == S96)
      widenLoad(MI, WideTy: S128);
    else if (DstTy == V3S32)
      widenLoad(MI, WideTy: V4S32, MergeTy: S32);
    else if (DstTy == V6S16)
      widenLoad(MI, WideTy: V8S16, MergeTy: V2S16);
    else {
      reportGISelFailure(MF, MORE, PassName: "amdgpu-regbanklegalize",
                         Msg: "AMDGPU RegBankLegalize: WidenLoad, unsuported type",
                         MI);
      return false;
    }
    return true;
  }
  case UnpackAExt:
    return lowerUnpackAExt(MI);
  case WidenMMOToS32:
    return widenMMOToS32(MI&: cast<GAnyLoad>(Val&: MI));
  case VerifyAllSgpr: {
    // No rewrite; assert-only check that every operand is on the sgpr bank.
    assert(llvm::all_of(MI.operands(), [&](const MachineOperand &Op) {
      return MRI.getRegBankOrNull(Op.getReg()) == SgprRB;
    }));
    return true;
  }
  case ApplyAllVgpr: {
    assert(llvm::all_of(MI.defs(), [&](const MachineOperand &Op) {
      return MRI.getRegBankOrNull(Op.getReg()) == VgprRB;
    }));
    B.setInstrAndDebugLoc(MI);
    // Copy any non-vgpr use into a vgpr so the whole instruction is VALU.
    for (unsigned i = MI.getNumDefs(); i < MI.getNumOperands(); ++i) {
      Register Reg = MI.getOperand(i).getReg();
      if (MRI.getRegBank(Reg) != VgprRB) {
        auto Copy = B.buildCopy(Res: {VgprRB, MRI.getType(Reg)}, Op: Reg);
        MI.getOperand(i).setReg(Copy.getReg(Idx: 0));
      }
    }
    return true;
  }
  case UnmergeToShiftTrunc: {
    // NOTE(review): the dyn_cast result is dereferenced without a null
    // check; if MI is guaranteed to be a G_UNMERGE_VALUES here, cast<>
    // would document and assert that invariant.
    GUnmerge *Unmerge = dyn_cast<GUnmerge>(Val: &MI);
    LLT Ty = MRI.getType(Reg: Unmerge->getSourceReg());
    if (Ty.getSizeInBits() % 32 != 0) {
      reportGISelFailure(MF, MORE, PassName: "amdgpu-regbanklegalize",
                         Msg: "AMDGPU RegBankLegalize: unmerge not multiple of 32",
                         MI);
      return false;
    }

    B.setInstrAndDebugLoc(MI);
    if (Ty.getSizeInBits() > 32) {
      // First unmerge to S32 pieces, then split each piece into two 16-bit
      // results via shift + trunc (unpackAExt).
      auto Unmerge32 = B.buildUnmerge(Attrs: SgprRB_S32, Op: Unmerge->getSourceReg());
      for (unsigned i = 0; i < Unmerge32->getNumDefs(); ++i) {
        auto [Dst0S32, Dst1S32] = unpackAExt(Reg: Unmerge32->getOperand(i).getReg());
        B.buildTrunc(Res: MI.getOperand(i: i * 2).getReg(), Op: Dst0S32);
        B.buildTrunc(Res: MI.getOperand(i: i * 2 + 1).getReg(), Op: Dst1S32);
      }
    } else {
      auto [Dst0S32, Dst1S32] = unpackAExt(Reg: MI.getOperand(i: 2).getReg());
      B.buildTrunc(Res: MI.getOperand(i: 0).getReg(), Op: Dst0S32);
      B.buildTrunc(Res: MI.getOperand(i: 1).getReg(), Op: Dst1S32);
    }

    MI.eraseFromParent();
    return true;
  }
  }

  if (!WaterfallSgprs.empty()) {
    MachineBasicBlock::iterator I = MI.getIterator();
    if (!executeInWaterfallLoop(B, Range: make_range(x: I, y: std::next(x: I)), SGPROperandRegs&: WaterfallSgprs))
      return false;
  }
  return true;
}
1071
1072LLT RegBankLegalizeHelper::getTyFromID(RegBankLLTMappingApplyID ID) {
1073 switch (ID) {
1074 case Vcc:
1075 case UniInVcc:
1076 return LLT::scalar(SizeInBits: 1);
1077 case Sgpr16:
1078 case Vgpr16:
1079 case UniInVgprS16:
1080 return LLT::scalar(SizeInBits: 16);
1081 case Sgpr32:
1082 case Sgpr32_WF:
1083 case Sgpr32Trunc:
1084 case Sgpr32AExt:
1085 case Sgpr32AExtBoolInReg:
1086 case Sgpr32SExt:
1087 case Sgpr32ZExt:
1088 case UniInVgprS32:
1089 case Vgpr32:
1090 case Vgpr32AExt:
1091 case Vgpr32SExt:
1092 case Vgpr32ZExt:
1093 return LLT::scalar(SizeInBits: 32);
1094 case Sgpr64:
1095 case Vgpr64:
1096 case UniInVgprS64:
1097 return LLT::scalar(SizeInBits: 64);
1098 case Sgpr128:
1099 case Vgpr128:
1100 return LLT::scalar(SizeInBits: 128);
1101 case SgprP0:
1102 case VgprP0:
1103 return LLT::pointer(AddressSpace: 0, SizeInBits: 64);
1104 case SgprP1:
1105 case VgprP1:
1106 return LLT::pointer(AddressSpace: 1, SizeInBits: 64);
1107 case SgprP2:
1108 case VgprP2:
1109 return LLT::pointer(AddressSpace: 2, SizeInBits: 32);
1110 case SgprP3:
1111 case VgprP3:
1112 return LLT::pointer(AddressSpace: 3, SizeInBits: 32);
1113 case SgprP4:
1114 case VgprP4:
1115 return LLT::pointer(AddressSpace: 4, SizeInBits: 64);
1116 case SgprP5:
1117 case VgprP5:
1118 return LLT::pointer(AddressSpace: 5, SizeInBits: 32);
1119 case SgprP8:
1120 return LLT::pointer(AddressSpace: 8, SizeInBits: 128);
1121 case SgprV2S16:
1122 case VgprV2S16:
1123 case UniInVgprV2S16:
1124 return LLT::fixed_vector(NumElements: 2, ScalarSizeInBits: 16);
1125 case SgprV2S32:
1126 case VgprV2S32:
1127 case UniInVgprV2S32:
1128 return LLT::fixed_vector(NumElements: 2, ScalarSizeInBits: 32);
1129 case VgprV3S32:
1130 return LLT::fixed_vector(NumElements: 3, ScalarSizeInBits: 32);
1131 case SgprV4S32:
1132 case SgprV4S32_WF:
1133 case VgprV4S32:
1134 case UniInVgprV4S32:
1135 return LLT::fixed_vector(NumElements: 4, ScalarSizeInBits: 32);
1136 case VgprV2S64:
1137 case UniInVgprV2S64:
1138 return LLT::fixed_vector(NumElements: 2, ScalarSizeInBits: 64);
1139 default:
1140 return LLT();
1141 }
1142}
1143
1144LLT RegBankLegalizeHelper::getBTyFromID(RegBankLLTMappingApplyID ID, LLT Ty) {
1145 switch (ID) {
1146 case SgprB32:
1147 case VgprB32:
1148 case UniInVgprB32:
1149 if (Ty == LLT::scalar(SizeInBits: 32) || Ty == LLT::fixed_vector(NumElements: 2, ScalarSizeInBits: 16) ||
1150 isAnyPtr(Ty, Width: 32))
1151 return Ty;
1152 return LLT();
1153 case SgprPtr32:
1154 case VgprPtr32:
1155 return isAnyPtr(Ty, Width: 32) ? Ty : LLT();
1156 case SgprPtr64:
1157 case VgprPtr64:
1158 return isAnyPtr(Ty, Width: 64) ? Ty : LLT();
1159 case SgprPtr128:
1160 case VgprPtr128:
1161 return isAnyPtr(Ty, Width: 128) ? Ty : LLT();
1162 case SgprB64:
1163 case VgprB64:
1164 case UniInVgprB64:
1165 if (Ty == LLT::scalar(SizeInBits: 64) || Ty == LLT::fixed_vector(NumElements: 2, ScalarSizeInBits: 32) ||
1166 Ty == LLT::fixed_vector(NumElements: 4, ScalarSizeInBits: 16) || isAnyPtr(Ty, Width: 64))
1167 return Ty;
1168 return LLT();
1169 case SgprB96:
1170 case VgprB96:
1171 case UniInVgprB96:
1172 if (Ty == LLT::scalar(SizeInBits: 96) || Ty == LLT::fixed_vector(NumElements: 3, ScalarSizeInBits: 32) ||
1173 Ty == LLT::fixed_vector(NumElements: 6, ScalarSizeInBits: 16))
1174 return Ty;
1175 return LLT();
1176 case SgprB128:
1177 case VgprB128:
1178 case UniInVgprB128:
1179 if (Ty == LLT::scalar(SizeInBits: 128) || Ty == LLT::fixed_vector(NumElements: 4, ScalarSizeInBits: 32) ||
1180 Ty == LLT::fixed_vector(NumElements: 2, ScalarSizeInBits: 64) || Ty == LLT::fixed_vector(NumElements: 8, ScalarSizeInBits: 16) ||
1181 isAnyPtr(Ty, Width: 128))
1182 return Ty;
1183 return LLT();
1184 case SgprB256:
1185 case VgprB256:
1186 case UniInVgprB256:
1187 if (Ty == LLT::scalar(SizeInBits: 256) || Ty == LLT::fixed_vector(NumElements: 8, ScalarSizeInBits: 32) ||
1188 Ty == LLT::fixed_vector(NumElements: 4, ScalarSizeInBits: 64) || Ty == LLT::fixed_vector(NumElements: 16, ScalarSizeInBits: 16))
1189 return Ty;
1190 return LLT();
1191 case SgprB512:
1192 case VgprB512:
1193 case UniInVgprB512:
1194 if (Ty == LLT::scalar(SizeInBits: 512) || Ty == LLT::fixed_vector(NumElements: 16, ScalarSizeInBits: 32) ||
1195 Ty == LLT::fixed_vector(NumElements: 8, ScalarSizeInBits: 64))
1196 return Ty;
1197 return LLT();
1198 default:
1199 return LLT();
1200 }
1201}
1202
// Map a mapping ID to the register bank it constrains the operand to:
// VccRB for Vcc, SgprRB for all sgpr-flavored IDs (including the UniIn*
// IDs, whose results end up in sgpr), VgprRB for all vgpr-flavored IDs.
// Returns nullptr for IDs with no associated bank.
const RegisterBank *
RegBankLegalizeHelper::getRegBankFromID(RegBankLLTMappingApplyID ID) {
  switch (ID) {
  case Vcc:
    return VccRB;
  // Scalar bank: plain sgpr types, B-types, waterfall variants, UniIn*
  // (uniform value materialized via vcc/vgpr) and sgpr extend/trunc IDs.
  case Sgpr16:
  case Sgpr32:
  case Sgpr32_WF:
  case Sgpr64:
  case Sgpr128:
  case SgprP0:
  case SgprP1:
  case SgprP2:
  case SgprP3:
  case SgprP4:
  case SgprP5:
  case SgprP8:
  case SgprPtr32:
  case SgprPtr64:
  case SgprPtr128:
  case SgprV2S16:
  case SgprV2S32:
  case SgprV4S32:
  case SgprV4S32_WF:
  case SgprB32:
  case SgprB64:
  case SgprB96:
  case SgprB128:
  case SgprB256:
  case SgprB512:
  case UniInVcc:
  case UniInVgprS16:
  case UniInVgprS32:
  case UniInVgprS64:
  case UniInVgprV2S16:
  case UniInVgprV2S32:
  case UniInVgprV4S32:
  case UniInVgprV2S64:
  case UniInVgprB32:
  case UniInVgprB64:
  case UniInVgprB96:
  case UniInVgprB128:
  case UniInVgprB256:
  case UniInVgprB512:
  case Sgpr32Trunc:
  case Sgpr32AExt:
  case Sgpr32AExtBoolInReg:
  case Sgpr32SExt:
  case Sgpr32ZExt:
    return SgprRB;
  // Vector bank: plain vgpr types, B-types and vgpr extend IDs.
  case Vgpr16:
  case Vgpr32:
  case Vgpr64:
  case Vgpr128:
  case VgprP0:
  case VgprP1:
  case VgprP2:
  case VgprP3:
  case VgprP4:
  case VgprP5:
  case VgprPtr32:
  case VgprPtr64:
  case VgprPtr128:
  case VgprV2S16:
  case VgprV2S32:
  case VgprV2S64:
  case VgprV3S32:
  case VgprV4S32:
  case VgprB32:
  case VgprB64:
  case VgprB96:
  case VgprB128:
  case VgprB256:
  case VgprB512:
  case Vgpr32AExt:
  case Vgpr32SExt:
  case Vgpr32ZExt:
    return VgprRB;
  default:
    return nullptr;
  }
}
1285
// Apply MethodIDs to MI's def operands (defs start at operand 0), advancing
// OpIdx past them so applyMappingSrc can continue from the first use. Plain
// sgpr/vgpr IDs are assert-only checks that the type and bank already
// match; UniIn* and Sgpr32Trunc IDs redefine the def in a temporary
// register and emit repair code (copy / read-any-lane / trunc) to produce
// the original register. Returns false on an unsupported or invalid ID.
bool RegBankLegalizeHelper::applyMappingDst(
    MachineInstr &MI, unsigned &OpIdx,
    const SmallVectorImpl<RegBankLLTMappingApplyID> &MethodIDs) {
  // Defs start from operand 0
  for (; OpIdx < MethodIDs.size(); ++OpIdx) {
    if (MethodIDs[OpIdx] == None)
      continue;
    MachineOperand &Op = MI.getOperand(i: OpIdx);
    Register Reg = Op.getReg();
    LLT Ty = MRI.getType(Reg);
    [[maybe_unused]] const RegisterBank *RB = MRI.getRegBank(Reg);

    switch (MethodIDs[OpIdx]) {
    // vcc, sgpr and vgpr scalars, pointers and vectors
    case Vcc:
    case Sgpr16:
    case Sgpr32:
    case Sgpr64:
    case Sgpr128:
    case SgprP0:
    case SgprP1:
    case SgprP3:
    case SgprP4:
    case SgprP5:
    case SgprP8:
    case SgprV2S16:
    case SgprV2S32:
    case SgprV4S32:
    case Vgpr16:
    case Vgpr32:
    case Vgpr64:
    case Vgpr128:
    case VgprP0:
    case VgprP1:
    case VgprP2:
    case VgprP3:
    case VgprP4:
    case VgprP5:
    case VgprV2S16:
    case VgprV2S32:
    case VgprV2S64:
    case VgprV3S32:
    case VgprV4S32: {
      // Nothing to do: def must already have the mapped type and bank.
      assert(Ty == getTyFromID(MethodIDs[OpIdx]));
      assert(RB == getRegBankFromID(MethodIDs[OpIdx]));
      break;
    }
    // sgpr and vgpr B-types
    case SgprB32:
    case SgprB64:
    case SgprB96:
    case SgprB128:
    case SgprB256:
    case SgprB512:
    case SgprPtr32:
    case SgprPtr64:
    case SgprPtr128:
    case VgprB32:
    case VgprB64:
    case VgprB96:
    case VgprB128:
    case VgprB256:
    case VgprB512:
    case VgprPtr32:
    case VgprPtr64:
    case VgprPtr128: {
      assert(Ty == getBTyFromID(MethodIDs[OpIdx], Ty));
      assert(RB == getRegBankFromID(MethodIDs[OpIdx]));
      break;
    }
    // uniform in vcc/vgpr: scalars, vectors and B-types
    case UniInVcc: {
      // Def becomes a vcc S1; the original uniform S1 is reconstructed via
      // G_AMDGPU_COPY_SCC_VCC + trunc (only if it has uses).
      assert(Ty == S1);
      assert(RB == SgprRB);
      Register NewDst = MRI.createVirtualRegister(RegAttr: VccRB_S1);
      Op.setReg(NewDst);
      if (!MRI.use_empty(RegNo: Reg)) {
        auto CopyS32_Vcc =
            B.buildInstr(Opc: AMDGPU::G_AMDGPU_COPY_SCC_VCC, DstOps: {SgprRB_S32}, SrcOps: {NewDst});
        B.buildTrunc(Res: Reg, Op: CopyS32_Vcc);
      }
      break;
    }
    case UniInVgprS16: {
      // S16 has no direct read-any-lane: any-extend to S32 in vgpr, read
      // back to sgpr, then trunc to the original S16 register.
      assert(Ty == getTyFromID(MethodIDs[OpIdx]));
      assert(RB == SgprRB);
      Register NewVgprDstS16 = MRI.createVirtualRegister(RegAttr: {.RCOrRB: VgprRB, .Ty: S16});
      Register NewVgprDstS32 = MRI.createVirtualRegister(RegAttr: {.RCOrRB: VgprRB, .Ty: S32});
      Register NewSgprDstS32 = MRI.createVirtualRegister(RegAttr: {.RCOrRB: SgprRB, .Ty: S32});
      Op.setReg(NewVgprDstS16);
      B.buildAnyExt(Res: NewVgprDstS32, Op: NewVgprDstS16);
      buildReadAnyLane(B, SgprDst: NewSgprDstS32, VgprSrc: NewVgprDstS32, RBI);
      B.buildTrunc(Res: Reg, Op: NewSgprDstS32);
      break;
    }
    case UniInVgprS32:
    case UniInVgprS64:
    case UniInVgprV2S16:
    case UniInVgprV2S32:
    case UniInVgprV4S32:
    case UniInVgprV2S64: {
      // Def is computed in a vgpr; read the uniform value back into Reg.
      assert(Ty == getTyFromID(MethodIDs[OpIdx]));
      assert(RB == SgprRB);
      Register NewVgprDst = MRI.createVirtualRegister(RegAttr: {.RCOrRB: VgprRB, .Ty: Ty});
      Op.setReg(NewVgprDst);
      buildReadAnyLane(B, SgprDst: Reg, VgprSrc: NewVgprDst, RBI);
      break;
    }
    case UniInVgprB32:
    case UniInVgprB64:
    case UniInVgprB96:
    case UniInVgprB128:
    case UniInVgprB256:
    case UniInVgprB512: {
      assert(Ty == getBTyFromID(MethodIDs[OpIdx], Ty));
      assert(RB == SgprRB);
      Register NewVgprDst = MRI.createVirtualRegister(RegAttr: {.RCOrRB: VgprRB, .Ty: Ty});
      Op.setReg(NewVgprDst);
      AMDGPU::buildReadAnyLane(B, SgprDst: Reg, VgprSrc: NewVgprDst, RBI);
      break;
    }
    // sgpr trunc
    case Sgpr32Trunc: {
      // Sub-32-bit def is widened to S32; original register gets a trunc
      // (only if used).
      assert(Ty.getSizeInBits() < 32);
      assert(RB == SgprRB);
      Register NewDst = MRI.createVirtualRegister(RegAttr: SgprRB_S32);
      Op.setReg(NewDst);
      if (!MRI.use_empty(RegNo: Reg))
        B.buildTrunc(Res: Reg, Op: NewDst);
      break;
    }
    case InvalidMapping: {
      reportGISelFailure(
          MF, MORE, PassName: "amdgpu-regbanklegalize",
          Msg: "AMDGPU RegBankLegalize: missing fast rule ('Div' or 'Uni') for", MI);
      return false;
    }
    default:
      reportGISelFailure(
          MF, MORE, PassName: "amdgpu-regbanklegalize",
          Msg: "AMDGPU RegBankLegalize: applyMappingDst, ID not supported", MI);
      return false;
    }
  }

  return true;
}
1433
// Apply MethodIDs to MI's use operands starting at OpIdx (continuing right
// after the defs handled by applyMappingDst). None/IntrId/Imm entries are
// skipped. Sgpr IDs are assert-only; vgpr IDs insert sgpr->vgpr copies when
// needed; *_WF IDs collect registers that must be made uniform through a
// waterfall loop; the extend IDs widen sub-32-bit sources to 32 bits.
bool RegBankLegalizeHelper::applyMappingSrc(
    MachineInstr &MI, unsigned &OpIdx,
    const SmallVectorImpl<RegBankLLTMappingApplyID> &MethodIDs,
    SmallSet<Register, 4> &SgprWaterfallOperandRegs) {
  for (unsigned i = 0; i < MethodIDs.size(); ++OpIdx, ++i) {
    if (MethodIDs[i] == None || MethodIDs[i] == IntrId || MethodIDs[i] == Imm)
      continue;

    MachineOperand &Op = MI.getOperand(i: OpIdx);
    Register Reg = Op.getReg();
    LLT Ty = MRI.getType(Reg);
    const RegisterBank *RB = MRI.getRegBank(Reg);

    switch (MethodIDs[i]) {
    case Vcc: {
      // A uniform S1 in sgpr is converted to a lane mask via any-ext +
      // G_AMDGPU_COPY_VCC_SCC; an existing vcc source is used as-is.
      assert(Ty == S1);
      assert(RB == VccRB || RB == SgprRB);
      if (RB == SgprRB) {
        auto Aext = B.buildAnyExt(Res: SgprRB_S32, Op: Reg);
        auto CopyVcc_Scc =
            B.buildInstr(Opc: AMDGPU::G_AMDGPU_COPY_VCC_SCC, DstOps: {VccRB_S1}, SrcOps: {Aext});
        Op.setReg(CopyVcc_Scc.getReg(Idx: 0));
      }
      break;
    }
    // sgpr scalars, pointers and vectors
    case Sgpr16:
    case Sgpr32:
    case Sgpr64:
    case Sgpr128:
    case SgprP0:
    case SgprP1:
    case SgprP3:
    case SgprP4:
    case SgprP5:
    case SgprP8:
    case SgprV2S16:
    case SgprV2S32:
    case SgprV4S32: {
      assert(Ty == getTyFromID(MethodIDs[i]));
      assert(RB == getRegBankFromID(MethodIDs[i]));
      break;
    }
    // sgpr B-types
    case SgprB32:
    case SgprB64:
    case SgprB96:
    case SgprB128:
    case SgprB256:
    case SgprB512:
    case SgprPtr32:
    case SgprPtr64:
    case SgprPtr128: {
      assert(Ty == getBTyFromID(MethodIDs[i], Ty));
      assert(RB == getRegBankFromID(MethodIDs[i]));
      break;
    }
    // vgpr scalars, pointers and vectors
    case Vgpr16:
    case Vgpr32:
    case Vgpr64:
    case Vgpr128:
    case VgprP0:
    case VgprP1:
    case VgprP2:
    case VgprP3:
    case VgprP4:
    case VgprP5:
    case VgprV2S16:
    case VgprV2S32:
    case VgprV2S64:
    case VgprV3S32:
    case VgprV4S32: {
      // Any non-vgpr source gets a copy into a vgpr of the same type.
      assert(Ty == getTyFromID(MethodIDs[i]));
      if (RB != VgprRB) {
        auto CopyToVgpr = B.buildCopy(Res: {VgprRB, Ty}, Op: Reg);
        Op.setReg(CopyToVgpr.getReg(Idx: 0));
      }
      break;
    }
    // vgpr B-types
    case VgprB32:
    case VgprB64:
    case VgprB96:
    case VgprB128:
    case VgprB256:
    case VgprB512:
    case VgprPtr32:
    case VgprPtr64:
    case VgprPtr128: {
      assert(Ty == getBTyFromID(MethodIDs[i], Ty));
      if (RB != VgprRB) {
        auto CopyToVgpr = B.buildCopy(Res: {VgprRB, Ty}, Op: Reg);
        Op.setReg(CopyToVgpr.getReg(Idx: 0));
      }
      break;
    }
    // sgpr waterfall, scalars and vectors
    case Sgpr32_WF:
    case SgprV4S32_WF: {
      // Operand must be uniform; a divergent register is recorded so the
      // caller can wrap MI in a waterfall loop.
      assert(Ty == getTyFromID(MethodIDs[i]));
      if (RB != SgprRB)
        SgprWaterfallOperandRegs.insert(V: Reg);
      break;
    }
    // sgpr and vgpr scalars with extend
    case Sgpr32AExt: {
      // Note: this ext allows S1, and it is meant to be combined away.
      assert(Ty.getSizeInBits() < 32);
      assert(RB == SgprRB);
      auto Aext = B.buildAnyExt(Res: SgprRB_S32, Op: Reg);
      Op.setReg(Aext.getReg(Idx: 0));
      break;
    }
    case Sgpr32AExtBoolInReg: {
      // Note: this ext allows S1, and it is meant to be combined away.
      assert(Ty.getSizeInBits() == 1);
      assert(RB == SgprRB);
      auto Aext = B.buildAnyExt(Res: SgprRB_S32, Op: Reg);
      // Zext SgprS1 is not legal, make AND with 1 instead. This instruction is
      // most of times meant to be combined away in AMDGPURegBankCombiner.
      auto Cst1 = B.buildConstant(Res: SgprRB_S32, Val: 1);
      auto BoolInReg = B.buildAnd(Dst: SgprRB_S32, Src0: Aext, Src1: Cst1);
      Op.setReg(BoolInReg.getReg(Idx: 0));
      break;
    }
    case Sgpr32SExt: {
      assert(1 < Ty.getSizeInBits() && Ty.getSizeInBits() < 32);
      assert(RB == SgprRB);
      auto Sext = B.buildSExt(Res: SgprRB_S32, Op: Reg);
      Op.setReg(Sext.getReg(Idx: 0));
      break;
    }
    case Sgpr32ZExt: {
      assert(1 < Ty.getSizeInBits() && Ty.getSizeInBits() < 32);
      assert(RB == SgprRB);
      auto Zext = B.buildZExt(Res: {SgprRB, S32}, Op: Reg);
      Op.setReg(Zext.getReg(Idx: 0));
      break;
    }
    case Vgpr32AExt: {
      assert(Ty.getSizeInBits() < 32);
      assert(RB == VgprRB);
      auto Aext = B.buildAnyExt(Res: {VgprRB, S32}, Op: Reg);
      Op.setReg(Aext.getReg(Idx: 0));
      break;
    }
    case Vgpr32SExt: {
      // Note this ext allows S1, and it is meant to be combined away.
      assert(Ty.getSizeInBits() < 32);
      assert(RB == VgprRB);
      auto Sext = B.buildSExt(Res: {VgprRB, S32}, Op: Reg);
      Op.setReg(Sext.getReg(Idx: 0));
      break;
    }
    case Vgpr32ZExt: {
      // Note this ext allows S1, and it is meant to be combined away.
      assert(Ty.getSizeInBits() < 32);
      assert(RB == VgprRB);
      auto Zext = B.buildZExt(Res: {VgprRB, S32}, Op: Reg);
      Op.setReg(Zext.getReg(Idx: 0));
      break;
    }
    default:
      reportGISelFailure(
          MF, MORE, PassName: "amdgpu-regbanklegalize",
          Msg: "AMDGPU RegBankLegalize: applyMappingSrc, ID not supported", MI);
      return false;
    }
  }
  return true;
}
1606
1607bool RegBankLegalizeHelper::applyMappingPHI(MachineInstr &MI) {
1608 Register Dst = MI.getOperand(i: 0).getReg();
1609 LLT Ty = MRI.getType(Reg: Dst);
1610
1611 if (Ty == LLT::scalar(SizeInBits: 1) && MUI.isUniform(V: Dst)) {
1612 B.setInsertPt(MBB&: *MI.getParent(), II: MI.getParent()->getFirstNonPHI());
1613
1614 Register NewDst = MRI.createVirtualRegister(RegAttr: SgprRB_S32);
1615 MI.getOperand(i: 0).setReg(NewDst);
1616 B.buildTrunc(Res: Dst, Op: NewDst);
1617
1618 for (unsigned i = 1; i < MI.getNumOperands(); i += 2) {
1619 Register UseReg = MI.getOperand(i).getReg();
1620
1621 auto DefMI = MRI.getVRegDef(Reg: UseReg)->getIterator();
1622 MachineBasicBlock *DefMBB = DefMI->getParent();
1623
1624 B.setInsertPt(MBB&: *DefMBB, II: DefMBB->SkipPHIsAndLabels(I: std::next(x: DefMI)));
1625
1626 auto NewUse = B.buildAnyExt(Res: SgprRB_S32, Op: UseReg);
1627 MI.getOperand(i).setReg(NewUse.getReg(Idx: 0));
1628 }
1629
1630 return true;
1631 }
1632
1633 // ALL divergent i1 phis should have been lowered and inst-selected into PHI
1634 // with sgpr reg class and S1 LLT in AMDGPUGlobalISelDivergenceLowering pass.
1635 // Note: this includes divergent phis that don't require lowering.
1636 if (Ty == LLT::scalar(SizeInBits: 1) && MUI.isDivergent(V: Dst)) {
1637 reportGISelFailure(MF, MORE, PassName: "amdgpu-regbanklegalize",
1638 Msg: "AMDGPU RegBankLegalize: Can't lower divergent S1 G_PHI",
1639 MI);
1640 return false;
1641 }
1642
1643 // We accept all types that can fit in some register class.
1644 // Uniform G_PHIs have all sgpr registers.
1645 // Divergent G_PHIs have vgpr dst but inputs can be sgpr or vgpr.
1646 if (Ty == LLT::scalar(SizeInBits: 32) || Ty == LLT::pointer(AddressSpace: 1, SizeInBits: 64) ||
1647 Ty == LLT::pointer(AddressSpace: 4, SizeInBits: 64)) {
1648 return true;
1649 }
1650
1651 reportGISelFailure(MF, MORE, PassName: "amdgpu-regbanklegalize",
1652 Msg: "AMDGPU RegBankLegalize: type not supported for G_PHI",
1653 MI);
1654 return false;
1655}
1656
1657[[maybe_unused]] static bool verifyRegBankOnOperands(MachineInstr &MI,
1658 const RegisterBank *RB,
1659 MachineRegisterInfo &MRI,
1660 unsigned StartOpIdx,
1661 unsigned EndOpIdx) {
1662 for (unsigned i = StartOpIdx; i <= EndOpIdx; ++i) {
1663 if (MRI.getRegBankOrNull(Reg: MI.getOperand(i).getReg()) != RB)
1664 return false;
1665 }
1666 return true;
1667}
1668
1669void RegBankLegalizeHelper::applyMappingTrivial(MachineInstr &MI) {
1670 const RegisterBank *RB = MRI.getRegBank(Reg: MI.getOperand(i: 0).getReg());
1671 // Put RB on all registers
1672 unsigned NumDefs = MI.getNumDefs();
1673 unsigned NumOperands = MI.getNumOperands();
1674
1675 assert(verifyRegBankOnOperands(MI, RB, MRI, 0, NumDefs - 1));
1676 if (RB == SgprRB)
1677 assert(verifyRegBankOnOperands(MI, RB, MRI, NumDefs, NumOperands - 1));
1678
1679 if (RB == VgprRB) {
1680 B.setInstr(MI);
1681 for (unsigned i = NumDefs; i < NumOperands; ++i) {
1682 Register Reg = MI.getOperand(i).getReg();
1683 if (MRI.getRegBank(Reg) != RB) {
1684 auto Copy = B.buildCopy(Res: {VgprRB, MRI.getType(Reg)}, Op: Reg);
1685 MI.getOperand(i).setReg(Copy.getReg(Idx: 0));
1686 }
1687 }
1688 }
1689}
1690