//===-- AMDGPURegBankLegalizeHelper.cpp -----------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// Implements actual lowering algorithms for each ID that can be used in
/// Rule.OperandMapping. Similar to legalizer helper but with register banks.
//
//===----------------------------------------------------------------------===//

#include "AMDGPURegBankLegalizeHelper.h"
#include "AMDGPUGlobalISelUtils.h"
#include "AMDGPUInstrInfo.h"
#include "AMDGPURegBankLegalizeRules.h"
#include "AMDGPURegisterBankInfo.h"
#include "GCNSubtarget.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "llvm/CodeGen/GlobalISel/GenericMachineInstrs.h"
#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineUniformityAnalysis.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/Support/AMDGPUAddrSpace.h"

#define DEBUG_TYPE "amdgpu-regbanklegalize"

using namespace llvm;
using namespace AMDGPU;

RegBankLegalizeHelper::RegBankLegalizeHelper(
    MachineIRBuilder &B, const MachineUniformityInfo &MUI,
    const RegisterBankInfo &RBI, const RegBankLegalizeRules &RBLRules)
    : ST(B.getMF().getSubtarget<GCNSubtarget>()), B(B), MRI(*B.getMRI()),
      MUI(MUI), RBI(RBI), RBLRules(RBLRules),
      SgprRB(&RBI.getRegBank(AMDGPU::SGPRRegBankID)),
      VgprRB(&RBI.getRegBank(AMDGPU::VGPRRegBankID)),
      VccRB(&RBI.getRegBank(AMDGPU::VCCRegBankID)) {}

void RegBankLegalizeHelper::findRuleAndApplyMapping(MachineInstr &MI) {
  const SetOfRulesForOpcode &RuleSet = RBLRules.getRulesForOpc(MI);
  const RegBankLLTMapping &Mapping = RuleSet.findMappingForMI(MI, MRI, MUI);

  SmallSet<Register, 4> WaterfallSgprs;
  unsigned OpIdx = 0;
  if (Mapping.DstOpMapping.size() > 0) {
    B.setInsertPt(*MI.getParent(), std::next(MI.getIterator()));
    applyMappingDst(MI, OpIdx, Mapping.DstOpMapping);
  }
  if (Mapping.SrcOpMapping.size() > 0) {
    B.setInstr(MI);
    applyMappingSrc(MI, OpIdx, Mapping.SrcOpMapping, WaterfallSgprs);
  }

  lower(MI, Mapping, WaterfallSgprs);
}

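// Split a load that is too wide for the rules into the smaller loads given by
// LLTBreakdown, offsetting the base pointer and the memory operand for each
// part. If all parts have the same type they are merged directly into Dst;
// otherwise each part is first unmerged into MergeTy pieces and the pieces
// are merged into Dst.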
void RegBankLegalizeHelper::splitLoad(MachineInstr &MI,
                                      ArrayRef<LLT> LLTBreakdown, LLT MergeTy) {
  MachineFunction &MF = B.getMF();
  assert(MI.getNumMemOperands() == 1);
  MachineMemOperand &BaseMMO = **MI.memoperands_begin();
  Register Dst = MI.getOperand(0).getReg();
  const RegisterBank *DstRB = MRI.getRegBankOrNull(Dst);
  Register Base = MI.getOperand(1).getReg();
  LLT PtrTy = MRI.getType(Base);
  const RegisterBank *PtrRB = MRI.getRegBankOrNull(Base);
  LLT OffsetTy = LLT::scalar(PtrTy.getSizeInBits());
  SmallVector<Register, 4> LoadPartRegs;

  unsigned ByteOffset = 0;
  for (LLT PartTy : LLTBreakdown) {
    Register BasePlusOffset;
    if (ByteOffset == 0) {
      BasePlusOffset = Base;
    } else {
      auto Offset = B.buildConstant({PtrRB, OffsetTy}, ByteOffset);
      BasePlusOffset = B.buildPtrAdd({PtrRB, PtrTy}, Base, Offset).getReg(0);
    }
    auto *OffsetMMO = MF.getMachineMemOperand(&BaseMMO, ByteOffset, PartTy);
    auto LoadPart = B.buildLoad({DstRB, PartTy}, BasePlusOffset, *OffsetMMO);
    LoadPartRegs.push_back(LoadPart.getReg(0));
    ByteOffset += PartTy.getSizeInBytes();
  }

  if (!MergeTy.isValid()) {
    // Loads are of same size, concat or merge them together.
    B.buildMergeLikeInstr(Dst, LoadPartRegs);
  } else {
    // Loads are not all of same size, need to unmerge them to smaller pieces
    // of MergeTy type, then merge pieces to Dst.
    SmallVector<Register, 4> MergeTyParts;
    for (Register Reg : LoadPartRegs) {
      if (MRI.getType(Reg) == MergeTy) {
        MergeTyParts.push_back(Reg);
      } else {
        auto Unmerge = B.buildUnmerge({DstRB, MergeTy}, Reg);
        for (unsigned i = 0; i < Unmerge->getNumOperands() - 1; ++i)
          MergeTyParts.push_back(Unmerge.getReg(i));
      }
    }
    B.buildMergeLikeInstr(Dst, MergeTyParts);
  }
  MI.eraseFromParent();
}

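// Replace a load with a single wider WideTy load at the same base address,
// then truncate (scalar case) or unmerge into MergeTy elements and re-merge
// (vector case) to produce the original narrower Dst.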
void RegBankLegalizeHelper::widenLoad(MachineInstr &MI, LLT WideTy,
                                      LLT MergeTy) {
  MachineFunction &MF = B.getMF();
  assert(MI.getNumMemOperands() == 1);
  MachineMemOperand &BaseMMO = **MI.memoperands_begin();
  Register Dst = MI.getOperand(0).getReg();
  const RegisterBank *DstRB = MRI.getRegBankOrNull(Dst);
  Register Base = MI.getOperand(1).getReg();

  MachineMemOperand *WideMMO = MF.getMachineMemOperand(&BaseMMO, 0, WideTy);
  auto WideLoad = B.buildLoad({DstRB, WideTy}, Base, *WideMMO);

  if (WideTy.isScalar()) {
    B.buildTrunc(Dst, WideLoad);
  } else {
    SmallVector<Register, 4> MergeTyParts;
    auto Unmerge = B.buildUnmerge({DstRB, MergeTy}, WideLoad);

    LLT DstTy = MRI.getType(Dst);
    unsigned NumElts = DstTy.getSizeInBits() / MergeTy.getSizeInBits();
    for (unsigned i = 0; i < NumElts; ++i) {
      MergeTyParts.push_back(Unmerge.getReg(i));
    }
    B.buildMergeLikeInstr(Dst, MergeTyParts);
  }
  MI.eraseFromParent();
}

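// Lower G_SEXT/G_ZEXT/G_ANYEXT of a vcc (lane mask) source into a select:
// the S1 condition picks between a true constant (-1 for sext, 1 otherwise)
// and zero. S64 destinations are assembled from two S32 halves.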
void RegBankLegalizeHelper::lowerVccExtToSel(MachineInstr &MI) {
  Register Dst = MI.getOperand(0).getReg();
  LLT Ty = MRI.getType(Dst);
  Register Src = MI.getOperand(1).getReg();
  unsigned Opc = MI.getOpcode();
  int TrueExtCst = Opc == G_SEXT ? -1 : 1;
  if (Ty == S32 || Ty == S16) {
    auto True = B.buildConstant({VgprRB, Ty}, TrueExtCst);
    auto False = B.buildConstant({VgprRB, Ty}, 0);
    B.buildSelect(Dst, Src, True, False);
  } else if (Ty == S64) {
    auto True = B.buildConstant({VgprRB_S32}, TrueExtCst);
    auto False = B.buildConstant({VgprRB_S32}, 0);
    auto Lo = B.buildSelect({VgprRB_S32}, Src, True, False);
    MachineInstrBuilder Hi;
    switch (Opc) {
    case G_SEXT:
      Hi = Lo;
      break;
    case G_ZEXT:
      Hi = False;
      break;
    case G_ANYEXT:
      Hi = B.buildUndef({VgprRB_S32});
      break;
    default:
      llvm_unreachable("Opcode not supported");
    }

    B.buildMergeValues(Dst, {Lo.getReg(0), Hi.getReg(0)});
  } else {
    llvm_unreachable("Type not supported");
  }

  MI.eraseFromParent();
}

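// The unpack helpers split a packed V2S16 sgpr value into two S32 registers
// holding the low and high 16-bit elements, zero-, sign-, or any-extended
// respectively.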
std::pair<Register, Register> RegBankLegalizeHelper::unpackZExt(Register Reg) {
  auto PackedS32 = B.buildBitcast(SgprRB_S32, Reg);
  auto Mask = B.buildConstant(SgprRB_S32, 0x0000ffff);
  auto Lo = B.buildAnd(SgprRB_S32, PackedS32, Mask);
  auto Hi = B.buildLShr(SgprRB_S32, PackedS32, B.buildConstant(SgprRB_S32, 16));
  return {Lo.getReg(0), Hi.getReg(0)};
}

std::pair<Register, Register> RegBankLegalizeHelper::unpackSExt(Register Reg) {
  auto PackedS32 = B.buildBitcast(SgprRB_S32, Reg);
  auto Lo = B.buildSExtInReg(SgprRB_S32, PackedS32, 16);
  auto Hi = B.buildAShr(SgprRB_S32, PackedS32, B.buildConstant(SgprRB_S32, 16));
  return {Lo.getReg(0), Hi.getReg(0)};
}

std::pair<Register, Register> RegBankLegalizeHelper::unpackAExt(Register Reg) {
  auto PackedS32 = B.buildBitcast(SgprRB_S32, Reg);
  auto Lo = PackedS32;
  auto Hi = B.buildLShr(SgprRB_S32, PackedS32, B.buildConstant(SgprRB_S32, 16));
  return {Lo.getReg(0), Hi.getReg(0)};
}

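// Lower a uniform V2S16 shift by unpacking both the value and the shift
// amount into S32 halves (any-extended for shl, zero-extended for lshr,
// sign-extended for ashr), shifting each half, and repacking the results
// with G_BUILD_VECTOR_TRUNC.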
void RegBankLegalizeHelper::lowerUnpackBitShift(MachineInstr &MI) {
  Register Lo, Hi;
  switch (MI.getOpcode()) {
  case AMDGPU::G_SHL: {
    auto [Val0, Val1] = unpackAExt(MI.getOperand(1).getReg());
    auto [Amt0, Amt1] = unpackAExt(MI.getOperand(2).getReg());
    Lo = B.buildInstr(MI.getOpcode(), {SgprRB_S32}, {Val0, Amt0}).getReg(0);
    Hi = B.buildInstr(MI.getOpcode(), {SgprRB_S32}, {Val1, Amt1}).getReg(0);
    break;
  }
  case AMDGPU::G_LSHR: {
    auto [Val0, Val1] = unpackZExt(MI.getOperand(1).getReg());
    auto [Amt0, Amt1] = unpackZExt(MI.getOperand(2).getReg());
    Lo = B.buildInstr(MI.getOpcode(), {SgprRB_S32}, {Val0, Amt0}).getReg(0);
    Hi = B.buildInstr(MI.getOpcode(), {SgprRB_S32}, {Val1, Amt1}).getReg(0);
    break;
  }
  case AMDGPU::G_ASHR: {
    auto [Val0, Val1] = unpackSExt(MI.getOperand(1).getReg());
    auto [Amt0, Amt1] = unpackSExt(MI.getOperand(2).getReg());
    Lo = B.buildAShr(SgprRB_S32, Val0, Amt0).getReg(0);
    Hi = B.buildAShr(SgprRB_S32, Val1, Amt1).getReg(0);
    break;
  }
  default:
    llvm_unreachable("Unpack lowering not implemented");
  }
  B.buildBuildVectorTrunc(MI.getOperand(0).getReg(), {Lo, Hi});
  MI.eraseFromParent();
}

static bool isSignedBFE(MachineInstr &MI) {
  if (GIntrinsic *GI = dyn_cast<GIntrinsic>(&MI))
    return GI->is(Intrinsic::amdgcn_sbfe);

  return MI.getOpcode() == AMDGPU::G_SBFX;
}

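// Lower a divergent 64-bit bitfield extract (G_SBFX/G_UBFX or the
// corresponding intrinsic). With a constant Width the extract is done with a
// 32-bit G_SBFX/G_UBFX on the appropriate half of the shifted source; with a
// dynamic Width it expands to (Src >> LSBit) << (64 - Width) >> (64 - Width).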
void RegBankLegalizeHelper::lowerV_BFE(MachineInstr &MI) {
  Register Dst = MI.getOperand(0).getReg();
  assert(MRI.getType(Dst) == LLT::scalar(64));
  bool Signed = isSignedBFE(MI);
  unsigned FirstOpnd = isa<GIntrinsic>(MI) ? 2 : 1;
  // Extract bitfield from Src, LSBit is the least-significant bit for the
  // extraction (field offset) and Width is size of bitfield.
  Register Src = MI.getOperand(FirstOpnd).getReg();
  Register LSBit = MI.getOperand(FirstOpnd + 1).getReg();
  Register Width = MI.getOperand(FirstOpnd + 2).getReg();
  // Comments are for signed bitfield extract, unsigned is similar. In the
  // diagrams, 'x' is the sign bit of Src, 's' is the sign bit of the
  // bitfield, 'l' is its LSB and 'y' are the remaining bits of the bitfield
  // to extract.

  // Src >> LSBit Hi|Lo: x?????syyyyyyl??? -> xxxx?????syyyyyyl
  unsigned SHROpc = Signed ? AMDGPU::G_ASHR : AMDGPU::G_LSHR;
  auto SHRSrc = B.buildInstr(SHROpc, {{VgprRB, S64}}, {Src, LSBit});

  auto ConstWidth = getIConstantVRegValWithLookThrough(Width, MRI);

  // Expand to Src >> LSBit << (64 - Width) >> (64 - Width)
  // << (64 - Width): Hi|Lo: xxxx?????syyyyyyl -> syyyyyyl000000000
  // >> (64 - Width): Hi|Lo: syyyyyyl000000000 -> ssssssssssyyyyyyl
  if (!ConstWidth) {
    auto Amt = B.buildSub(VgprRB_S32, B.buildConstant(SgprRB_S32, 64), Width);
    auto SignBit = B.buildShl({VgprRB, S64}, SHRSrc, Amt);
    B.buildInstr(SHROpc, {Dst}, {SignBit, Amt});
    MI.eraseFromParent();
    return;
  }

  uint64_t WidthImm = ConstWidth->Value.getZExtValue();
  auto UnmergeSHRSrc = B.buildUnmerge(VgprRB_S32, SHRSrc);
  Register SHRSrcLo = UnmergeSHRSrc.getReg(0);
  Register SHRSrcHi = UnmergeSHRSrc.getReg(1);
  auto Zero = B.buildConstant({VgprRB, S32}, 0);
  unsigned BFXOpc = Signed ? AMDGPU::G_SBFX : AMDGPU::G_UBFX;

  if (WidthImm <= 32) {
    // SHRSrc Hi|Lo: ????????|???syyyl -> ????????|ssssyyyl
    auto Lo = B.buildInstr(BFXOpc, {VgprRB_S32}, {SHRSrcLo, Zero, Width});
    MachineInstrBuilder Hi;
    if (Signed) {
      // SHRSrc Hi|Lo: ????????|ssssyyyl -> ssssssss|ssssyyyl
      Hi = B.buildAShr(VgprRB_S32, Lo, B.buildConstant(VgprRB_S32, 31));
    } else {
      // SHRSrc Hi|Lo: ????????|000syyyl -> 00000000|000syyyl
      Hi = Zero;
    }
    B.buildMergeLikeInstr(Dst, {Lo, Hi});
  } else {
    auto Amt = B.buildConstant(VgprRB_S32, WidthImm - 32);
    // SHRSrc Hi|Lo: ??????sy|yyyyyyyl -> sssssssy|yyyyyyyl
    auto Hi = B.buildInstr(BFXOpc, {VgprRB_S32}, {SHRSrcHi, Zero, Amt});
    B.buildMergeLikeInstr(Dst, {SHRSrcLo, Hi});
  }

  MI.eraseFromParent();
}

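// Lower a uniform bitfield extract to S_BFE_{I|U}{32|64}, which is selected
// here directly; the second source operand packs the field offset in its low
// 16 bits and the field width in its high 16 bits.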
void RegBankLegalizeHelper::lowerS_BFE(MachineInstr &MI) {
  Register DstReg = MI.getOperand(0).getReg();
  LLT Ty = MRI.getType(DstReg);
  bool Signed = isSignedBFE(MI);
  unsigned FirstOpnd = isa<GIntrinsic>(MI) ? 2 : 1;
  Register Src = MI.getOperand(FirstOpnd).getReg();
  Register LSBit = MI.getOperand(FirstOpnd + 1).getReg();
  Register Width = MI.getOperand(FirstOpnd + 2).getReg();
  // For uniform bitfield extract there are 4 available instructions, but
  // LSBit (field offset) and Width (size of bitfield) need to be packed in
  // S32, with the field offset in the low and the size in the high 16 bits.

  // Src1 Hi16|Lo16 = Size|FieldOffset
  auto Mask = B.buildConstant(SgprRB_S32, maskTrailingOnes<unsigned>(6));
  auto FieldOffset = B.buildAnd(SgprRB_S32, LSBit, Mask);
  auto Size = B.buildShl(SgprRB_S32, Width, B.buildConstant(SgprRB_S32, 16));
  auto Src1 = B.buildOr(SgprRB_S32, FieldOffset, Size);
  unsigned Opc32 = Signed ? AMDGPU::S_BFE_I32 : AMDGPU::S_BFE_U32;
  unsigned Opc64 = Signed ? AMDGPU::S_BFE_I64 : AMDGPU::S_BFE_U64;
  unsigned Opc = Ty == S32 ? Opc32 : Opc64;

  // Select the machine instruction here; because of register class
  // constraining, insert copies between register classes and register banks.
  auto S_BFE = B.buildInstr(Opc, {{SgprRB, Ty}},
                            {B.buildCopy(Ty, Src), B.buildCopy(S32, Src1)});
  if (!constrainSelectedInstRegOperands(*S_BFE, *ST.getInstrInfo(),
                                        *ST.getRegisterInfo(), RBI))
    llvm_unreachable("failed to constrain BFE");

  B.buildCopy(DstReg, S_BFE->getOperand(0).getReg());
  MI.eraseFromParent();
}

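// Split a 64-bit binary vgpr operation into two 32-bit halves (V2S16 halves
// for V4S16) by unmerging both sources, applying the opcode per half, and
// merging the results.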
void RegBankLegalizeHelper::lowerSplitTo32(MachineInstr &MI) {
  Register Dst = MI.getOperand(0).getReg();
  LLT DstTy = MRI.getType(Dst);
  assert(DstTy == V4S16 || DstTy == V2S32 || DstTy == S64);
  LLT Ty = DstTy == V4S16 ? V2S16 : S32;
  auto Op1 = B.buildUnmerge({VgprRB, Ty}, MI.getOperand(1).getReg());
  auto Op2 = B.buildUnmerge({VgprRB, Ty}, MI.getOperand(2).getReg());
  unsigned Opc = MI.getOpcode();
  auto Flags = MI.getFlags();
  auto Lo =
      B.buildInstr(Opc, {{VgprRB, Ty}}, {Op1.getReg(0), Op2.getReg(0)}, Flags);
  auto Hi =
      B.buildInstr(Opc, {{VgprRB, Ty}}, {Op1.getReg(1), Op2.getReg(1)}, Flags);
  B.buildMergeLikeInstr(Dst, {Lo, Hi});
  MI.eraseFromParent();
}

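// Same 32-bit splitting as lowerSplitTo32 but for G_SELECT: the single S1
// condition is reused for both the low- and high-half selects.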
void RegBankLegalizeHelper::lowerSplitTo32Select(MachineInstr &MI) {
  Register Dst = MI.getOperand(0).getReg();
  LLT DstTy = MRI.getType(Dst);
  assert(DstTy == V4S16 || DstTy == V2S32 || DstTy == S64 ||
         (DstTy.isPointer() && DstTy.getSizeInBits() == 64));
  LLT Ty = DstTy == V4S16 ? V2S16 : S32;
  auto Op2 = B.buildUnmerge({VgprRB, Ty}, MI.getOperand(2).getReg());
  auto Op3 = B.buildUnmerge({VgprRB, Ty}, MI.getOperand(3).getReg());
  Register Cond = MI.getOperand(1).getReg();
  auto Flags = MI.getFlags();
  auto Lo =
      B.buildSelect({VgprRB, Ty}, Cond, Op2.getReg(0), Op3.getReg(0), Flags);
  auto Hi =
      B.buildSelect({VgprRB, Ty}, Cond, Op2.getReg(1), Op3.getReg(1), Flags);

  B.buildMergeLikeInstr(Dst, {Lo, Hi});
  MI.eraseFromParent();
}

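// Split a 64-bit G_SEXT_INREG into 32-bit pieces. For Amt <= 32 the high half
// is the sign of the (possibly sign-extended) low half; for Amt > 32 the low
// half is unchanged and only the high half needs sign-extension.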
void RegBankLegalizeHelper::lowerSplitTo32SExtInReg(MachineInstr &MI) {
  auto Op1 = B.buildUnmerge(VgprRB_S32, MI.getOperand(1).getReg());
  int Amt = MI.getOperand(2).getImm();
  Register Lo, Hi;
  // Hi|Lo: s sign bit, ?/x bits changed/not changed by sign-extend
  if (Amt <= 32) {
    auto Freeze = B.buildFreeze(VgprRB_S32, Op1.getReg(0));
    if (Amt == 32) {
      // Hi|Lo: ????????|sxxxxxxx -> ssssssss|sxxxxxxx
      Lo = Freeze.getReg(0);
    } else {
      // Hi|Lo: ????????|???sxxxx -> ssssssss|ssssxxxx
      Lo = B.buildSExtInReg(VgprRB_S32, Freeze, Amt).getReg(0);
    }

    auto SignExtCst = B.buildConstant(SgprRB_S32, 31);
    Hi = B.buildAShr(VgprRB_S32, Lo, SignExtCst).getReg(0);
  } else {
    // Hi|Lo: ?????sxx|xxxxxxxx -> ssssssxx|xxxxxxxx
    Lo = Op1.getReg(0);
    Hi = B.buildSExtInReg(VgprRB_S32, Op1.getReg(1), Amt - 32).getReg(0);
  }

  B.buildMergeLikeInstr(MI.getOperand(0).getReg(), {Lo, Hi});
  MI.eraseFromParent();
}

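// Dispatch on the rule's LoweringMethod and rewrite MI accordingly. Most
// cases delegate to the dedicated lower* helpers above; the rest are lowered
// inline.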
void RegBankLegalizeHelper::lower(MachineInstr &MI,
                                  const RegBankLLTMapping &Mapping,
                                  SmallSet<Register, 4> &WaterfallSgprs) {

  switch (Mapping.LoweringMethod) {
  case DoNotLower:
    return;
  case VccExtToSel:
    return lowerVccExtToSel(MI);
  case UniExtToSel: {
    LLT Ty = MRI.getType(MI.getOperand(0).getReg());
    auto True = B.buildConstant({SgprRB, Ty},
                                MI.getOpcode() == AMDGPU::G_SEXT ? -1 : 1);
    auto False = B.buildConstant({SgprRB, Ty}, 0);
    // Input to G_{Z|S}EXT is 'Legalizer legal' S1. Most common case is
    // compare. We are making a select here. The S1 condition was already
    // 'any-extended to S32' and 'ANDed with 1 to clear the high bits' by
    // Sgpr32AExtBoolInReg.
    B.buildSelect(MI.getOperand(0).getReg(), MI.getOperand(1).getReg(), True,
                  False);
    MI.eraseFromParent();
    return;
  }
  case UnpackBitShift:
    return lowerUnpackBitShift(MI);
  case Ext32To64: {
    const RegisterBank *RB = MRI.getRegBank(MI.getOperand(0).getReg());
    MachineInstrBuilder Hi;
    switch (MI.getOpcode()) {
    case AMDGPU::G_ZEXT: {
      Hi = B.buildConstant({RB, S32}, 0);
      break;
    }
    case AMDGPU::G_SEXT: {
      // Replicate sign bit from 32-bit extended part.
      auto ShiftAmt = B.buildConstant({RB, S32}, 31);
      Hi = B.buildAShr({RB, S32}, MI.getOperand(1).getReg(), ShiftAmt);
      break;
    }
    case AMDGPU::G_ANYEXT: {
      Hi = B.buildUndef({RB, S32});
      break;
    }
    default:
      llvm_unreachable("Unsupported Opcode in Ext32To64");
    }

    B.buildMergeLikeInstr(MI.getOperand(0).getReg(),
                          {MI.getOperand(1).getReg(), Hi});
    MI.eraseFromParent();
    return;
  }
  case UniCstExt: {
    uint64_t ConstVal = MI.getOperand(1).getCImm()->getZExtValue();
    B.buildConstant(MI.getOperand(0).getReg(), ConstVal);

    MI.eraseFromParent();
    return;
  }
  case VgprToVccCopy: {
    Register Src = MI.getOperand(1).getReg();
    LLT Ty = MRI.getType(Src);
    // Take the lowest bit from each lane and put it in a lane mask. Lower via
    // a compare, but clear the high bits first since the compare looks at all
    // bits in the register.
    Register BoolSrc = MRI.createVirtualRegister({VgprRB, Ty});
    if (Ty == S64) {
      auto Src64 = B.buildUnmerge(VgprRB_S32, Src);
      auto One = B.buildConstant(VgprRB_S32, 1);
      auto AndLo = B.buildAnd(VgprRB_S32, Src64.getReg(0), One);
      auto Zero = B.buildConstant(VgprRB_S32, 0);
      auto AndHi = B.buildAnd(VgprRB_S32, Src64.getReg(1), Zero);
      B.buildMergeLikeInstr(BoolSrc, {AndLo, AndHi});
    } else {
      assert(Ty == S32 || Ty == S16);
      auto One = B.buildConstant({VgprRB, Ty}, 1);
      B.buildAnd(BoolSrc, Src, One);
    }
    auto Zero = B.buildConstant({VgprRB, Ty}, 0);
    B.buildICmp(CmpInst::ICMP_NE, MI.getOperand(0).getReg(), BoolSrc, Zero);
    MI.eraseFromParent();
    return;
  }
  case V_BFE:
    return lowerV_BFE(MI);
  case S_BFE:
    return lowerS_BFE(MI);
  case SplitTo32:
    return lowerSplitTo32(MI);
  case SplitTo32Select:
    return lowerSplitTo32Select(MI);
  case SplitTo32SExtInReg:
    return lowerSplitTo32SExtInReg(MI);
  case SplitLoad: {
    LLT DstTy = MRI.getType(MI.getOperand(0).getReg());
    unsigned Size = DstTy.getSizeInBits();
    // Even split to 128-bit loads
    if (Size > 128) {
      LLT B128;
      if (DstTy.isVector()) {
        LLT EltTy = DstTy.getElementType();
        B128 = LLT::fixed_vector(128 / EltTy.getSizeInBits(), EltTy);
      } else {
        B128 = LLT::scalar(128);
      }
      if (Size / 128 == 2)
        splitLoad(MI, {B128, B128});
      else if (Size / 128 == 4)
        splitLoad(MI, {B128, B128, B128, B128});
      else {
        LLVM_DEBUG(dbgs() << "MI: "; MI.dump(););
        llvm_unreachable("SplitLoad type not supported for MI");
      }
    }
    // 64 and 32 bit load
    else if (DstTy == S96)
      splitLoad(MI, {S64, S32}, S32);
    else if (DstTy == V3S32)
      splitLoad(MI, {V2S32, S32}, S32);
    else if (DstTy == V6S16)
      splitLoad(MI, {V4S16, V2S16}, V2S16);
    else {
      LLVM_DEBUG(dbgs() << "MI: "; MI.dump(););
      llvm_unreachable("SplitLoad type not supported for MI");
    }
    break;
  }
  case WidenLoad: {
    LLT DstTy = MRI.getType(MI.getOperand(0).getReg());
    if (DstTy == S96)
      widenLoad(MI, S128);
    else if (DstTy == V3S32)
      widenLoad(MI, V4S32, S32);
    else if (DstTy == V6S16)
      widenLoad(MI, V8S16, V2S16);
    else {
      LLVM_DEBUG(dbgs() << "MI: "; MI.dump(););
      llvm_unreachable("WidenLoad type not supported for MI");
    }
    break;
  }
  }

  // TODO: executeInWaterfallLoop(... WaterfallSgprs)
}

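// Return the LLT that a mapping ID implies, or an invalid LLT for IDs that do
// not determine a unique type (e.g. the size-only B-type IDs).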
LLT RegBankLegalizeHelper::getTyFromID(RegBankLLTMappingApplyID ID) {
  switch (ID) {
  case Vcc:
  case UniInVcc:
    return LLT::scalar(1);
  case Sgpr16:
  case Vgpr16:
    return LLT::scalar(16);
  case Sgpr32:
  case Sgpr32Trunc:
  case Sgpr32AExt:
  case Sgpr32AExtBoolInReg:
  case Sgpr32SExt:
  case Sgpr32ZExt:
  case UniInVgprS32:
  case Vgpr32:
  case Vgpr32SExt:
  case Vgpr32ZExt:
    return LLT::scalar(32);
  case Sgpr64:
  case Vgpr64:
    return LLT::scalar(64);
  case Sgpr128:
  case Vgpr128:
    return LLT::scalar(128);
  case VgprP0:
    return LLT::pointer(0, 64);
  case SgprP1:
  case VgprP1:
    return LLT::pointer(1, 64);
  case SgprP3:
  case VgprP3:
    return LLT::pointer(3, 32);
  case SgprP4:
  case VgprP4:
    return LLT::pointer(4, 64);
  case SgprP5:
  case VgprP5:
    return LLT::pointer(5, 32);
  case SgprV2S16:
  case VgprV2S16:
  case UniInVgprV2S16:
    return LLT::fixed_vector(2, 16);
  case SgprV2S32:
  case VgprV2S32:
    return LLT::fixed_vector(2, 32);
  case SgprV4S32:
  case VgprV4S32:
  case UniInVgprV4S32:
    return LLT::fixed_vector(4, 32);
  default:
    return LLT();
  }
}

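// B-type IDs only fix the size in bits: return Ty unchanged if it is one of
// the accepted layouts (scalar, vector or pointer) of that size, and an
// invalid LLT otherwise.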
LLT RegBankLegalizeHelper::getBTyFromID(RegBankLLTMappingApplyID ID, LLT Ty) {
  switch (ID) {
  case SgprB32:
  case VgprB32:
  case UniInVgprB32:
    if (Ty == LLT::scalar(32) || Ty == LLT::fixed_vector(2, 16) ||
        isAnyPtr(Ty, 32))
      return Ty;
    return LLT();
  case SgprPtr32:
  case VgprPtr32:
    return isAnyPtr(Ty, 32) ? Ty : LLT();
  case SgprPtr64:
  case VgprPtr64:
    return isAnyPtr(Ty, 64) ? Ty : LLT();
  case SgprPtr128:
  case VgprPtr128:
    return isAnyPtr(Ty, 128) ? Ty : LLT();
  case SgprB64:
  case VgprB64:
  case UniInVgprB64:
    if (Ty == LLT::scalar(64) || Ty == LLT::fixed_vector(2, 32) ||
        Ty == LLT::fixed_vector(4, 16) || isAnyPtr(Ty, 64))
      return Ty;
    return LLT();
  case SgprB96:
  case VgprB96:
  case UniInVgprB96:
    if (Ty == LLT::scalar(96) || Ty == LLT::fixed_vector(3, 32) ||
        Ty == LLT::fixed_vector(6, 16))
      return Ty;
    return LLT();
  case SgprB128:
  case VgprB128:
  case UniInVgprB128:
    if (Ty == LLT::scalar(128) || Ty == LLT::fixed_vector(4, 32) ||
        Ty == LLT::fixed_vector(2, 64) || isAnyPtr(Ty, 128))
      return Ty;
    return LLT();
  case SgprB256:
  case VgprB256:
  case UniInVgprB256:
    if (Ty == LLT::scalar(256) || Ty == LLT::fixed_vector(8, 32) ||
        Ty == LLT::fixed_vector(4, 64) || Ty == LLT::fixed_vector(16, 16))
      return Ty;
    return LLT();
  case SgprB512:
  case VgprB512:
  case UniInVgprB512:
    if (Ty == LLT::scalar(512) || Ty == LLT::fixed_vector(16, 32) ||
        Ty == LLT::fixed_vector(8, 64))
      return Ty;
    return LLT();
  default:
    return LLT();
  }
}

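// Map a mapping ID to the register bank it requires: vcc, sgpr or vgpr
// (nullptr for IDs with no bank).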
const RegisterBank *
RegBankLegalizeHelper::getRegBankFromID(RegBankLLTMappingApplyID ID) {
  switch (ID) {
  case Vcc:
    return VccRB;
  case Sgpr16:
  case Sgpr32:
  case Sgpr64:
  case Sgpr128:
  case SgprP1:
  case SgprP3:
  case SgprP4:
  case SgprP5:
  case SgprPtr32:
  case SgprPtr64:
  case SgprPtr128:
  case SgprV2S16:
  case SgprV2S32:
  case SgprV4S32:
  case SgprB32:
  case SgprB64:
  case SgprB96:
  case SgprB128:
  case SgprB256:
  case SgprB512:
  case UniInVcc:
  case UniInVgprS32:
  case UniInVgprV2S16:
  case UniInVgprV4S32:
  case UniInVgprB32:
  case UniInVgprB64:
  case UniInVgprB96:
  case UniInVgprB128:
  case UniInVgprB256:
  case UniInVgprB512:
  case Sgpr32Trunc:
  case Sgpr32AExt:
  case Sgpr32AExtBoolInReg:
  case Sgpr32SExt:
  case Sgpr32ZExt:
    return SgprRB;
  case Vgpr16:
  case Vgpr32:
  case Vgpr64:
  case Vgpr128:
  case VgprP0:
  case VgprP1:
  case VgprP3:
  case VgprP4:
  case VgprP5:
  case VgprPtr32:
  case VgprPtr64:
  case VgprPtr128:
  case VgprV2S16:
  case VgprV2S32:
  case VgprV4S32:
  case VgprB32:
  case VgprB64:
  case VgprB96:
  case VgprB128:
  case VgprB256:
  case VgprB512:
  case Vgpr32SExt:
  case Vgpr32ZExt:
    return VgprRB;
  default:
    return nullptr;
  }
}

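// Apply the mapping's method IDs to MI's defs: assert that already-correct
// defs match the expected type and bank, and for the UniIn* and Sgpr32Trunc
// IDs rewrite the def through a new register plus the appropriate
// copy/read-any-lane/trunc sequence.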
void RegBankLegalizeHelper::applyMappingDst(
    MachineInstr &MI, unsigned &OpIdx,
    const SmallVectorImpl<RegBankLLTMappingApplyID> &MethodIDs) {
  // Defs start from operand 0
  for (; OpIdx < MethodIDs.size(); ++OpIdx) {
    if (MethodIDs[OpIdx] == None)
      continue;
    MachineOperand &Op = MI.getOperand(OpIdx);
    Register Reg = Op.getReg();
    LLT Ty = MRI.getType(Reg);
    [[maybe_unused]] const RegisterBank *RB = MRI.getRegBank(Reg);

    switch (MethodIDs[OpIdx]) {
    // vcc, sgpr and vgpr scalars, pointers and vectors
    case Vcc:
    case Sgpr16:
    case Sgpr32:
    case Sgpr64:
    case Sgpr128:
    case SgprP1:
    case SgprP3:
    case SgprP4:
    case SgprP5:
    case SgprV2S16:
    case SgprV2S32:
    case SgprV4S32:
    case Vgpr16:
    case Vgpr32:
    case Vgpr64:
    case Vgpr128:
    case VgprP0:
    case VgprP1:
    case VgprP3:
    case VgprP4:
    case VgprP5:
    case VgprV2S16:
    case VgprV2S32:
    case VgprV4S32: {
      assert(Ty == getTyFromID(MethodIDs[OpIdx]));
      assert(RB == getRegBankFromID(MethodIDs[OpIdx]));
      break;
    }
    // sgpr and vgpr B-types
    case SgprB32:
    case SgprB64:
    case SgprB96:
    case SgprB128:
    case SgprB256:
    case SgprB512:
    case SgprPtr32:
    case SgprPtr64:
    case SgprPtr128:
    case VgprB32:
    case VgprB64:
    case VgprB96:
    case VgprB128:
    case VgprB256:
    case VgprB512:
    case VgprPtr32:
    case VgprPtr64:
    case VgprPtr128: {
      assert(Ty == getBTyFromID(MethodIDs[OpIdx], Ty));
      assert(RB == getRegBankFromID(MethodIDs[OpIdx]));
      break;
    }
    // uniform in vcc/vgpr: scalars, vectors and B-types
    case UniInVcc: {
      assert(Ty == S1);
      assert(RB == SgprRB);
      Register NewDst = MRI.createVirtualRegister(VccRB_S1);
      Op.setReg(NewDst);
      auto CopyS32_Vcc =
          B.buildInstr(AMDGPU::G_AMDGPU_COPY_SCC_VCC, {SgprRB_S32}, {NewDst});
      B.buildTrunc(Reg, CopyS32_Vcc);
      break;
    }
    case UniInVgprS32:
    case UniInVgprV2S16:
    case UniInVgprV4S32: {
      assert(Ty == getTyFromID(MethodIDs[OpIdx]));
      assert(RB == SgprRB);
      Register NewVgprDst = MRI.createVirtualRegister({VgprRB, Ty});
      Op.setReg(NewVgprDst);
      buildReadAnyLane(B, Reg, NewVgprDst, RBI);
      break;
    }
    case UniInVgprB32:
    case UniInVgprB64:
    case UniInVgprB96:
    case UniInVgprB128:
    case UniInVgprB256:
    case UniInVgprB512: {
      assert(Ty == getBTyFromID(MethodIDs[OpIdx], Ty));
      assert(RB == SgprRB);
      Register NewVgprDst = MRI.createVirtualRegister({VgprRB, Ty});
      Op.setReg(NewVgprDst);
      AMDGPU::buildReadAnyLane(B, Reg, NewVgprDst, RBI);
      break;
    }
    // sgpr trunc
    case Sgpr32Trunc: {
      assert(Ty.getSizeInBits() < 32);
      assert(RB == SgprRB);
      Register NewDst = MRI.createVirtualRegister(SgprRB_S32);
      Op.setReg(NewDst);
      B.buildTrunc(Reg, NewDst);
      break;
    }
    case InvalidMapping: {
      LLVM_DEBUG(dbgs() << "Instruction with Invalid mapping: "; MI.dump(););
      llvm_unreachable("missing fast rule for MI");
    }
    default:
      llvm_unreachable("ID not supported");
    }
  }
}

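// Apply the mapping's method IDs to MI's uses: assert matching types and
// banks where required, copy sgpr values to vgpr where a vgpr operand is
// needed, and materialize the requested extensions for narrow scalars.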
void RegBankLegalizeHelper::applyMappingSrc(
    MachineInstr &MI, unsigned &OpIdx,
    const SmallVectorImpl<RegBankLLTMappingApplyID> &MethodIDs,
    SmallSet<Register, 4> &SgprWaterfallOperandRegs) {
  for (unsigned i = 0; i < MethodIDs.size(); ++OpIdx, ++i) {
    if (MethodIDs[i] == None || MethodIDs[i] == IntrId || MethodIDs[i] == Imm)
      continue;

    MachineOperand &Op = MI.getOperand(OpIdx);
    Register Reg = Op.getReg();
    LLT Ty = MRI.getType(Reg);
    const RegisterBank *RB = MRI.getRegBank(Reg);

    switch (MethodIDs[i]) {
    case Vcc: {
      assert(Ty == S1);
      assert(RB == VccRB || RB == SgprRB);
      if (RB == SgprRB) {
        auto Aext = B.buildAnyExt(SgprRB_S32, Reg);
        auto CopyVcc_Scc =
            B.buildInstr(AMDGPU::G_AMDGPU_COPY_VCC_SCC, {VccRB_S1}, {Aext});
        Op.setReg(CopyVcc_Scc.getReg(0));
      }
      break;
    }
    // sgpr scalars, pointers and vectors
    case Sgpr16:
    case Sgpr32:
    case Sgpr64:
    case Sgpr128:
    case SgprP1:
    case SgprP3:
    case SgprP4:
    case SgprP5:
    case SgprV2S16:
    case SgprV2S32:
    case SgprV4S32: {
      assert(Ty == getTyFromID(MethodIDs[i]));
      assert(RB == getRegBankFromID(MethodIDs[i]));
      break;
    }
    // sgpr B-types
    case SgprB32:
    case SgprB64:
    case SgprB96:
    case SgprB128:
    case SgprB256:
    case SgprB512:
    case SgprPtr32:
    case SgprPtr64:
    case SgprPtr128: {
      assert(Ty == getBTyFromID(MethodIDs[i], Ty));
      assert(RB == getRegBankFromID(MethodIDs[i]));
      break;
    }
    // vgpr scalars, pointers and vectors
    case Vgpr16:
    case Vgpr32:
    case Vgpr64:
    case Vgpr128:
    case VgprP0:
    case VgprP1:
    case VgprP3:
    case VgprP4:
    case VgprP5:
    case VgprV2S16:
    case VgprV2S32:
    case VgprV4S32: {
      assert(Ty == getTyFromID(MethodIDs[i]));
      if (RB != VgprRB) {
        auto CopyToVgpr = B.buildCopy({VgprRB, Ty}, Reg);
        Op.setReg(CopyToVgpr.getReg(0));
      }
      break;
    }
    // vgpr B-types
    case VgprB32:
    case VgprB64:
    case VgprB96:
    case VgprB128:
    case VgprB256:
    case VgprB512:
    case VgprPtr32:
    case VgprPtr64:
    case VgprPtr128: {
      assert(Ty == getBTyFromID(MethodIDs[i], Ty));
      if (RB != VgprRB) {
        auto CopyToVgpr = B.buildCopy({VgprRB, Ty}, Reg);
        Op.setReg(CopyToVgpr.getReg(0));
      }
      break;
    }
    // sgpr and vgpr scalars with extend
    case Sgpr32AExt: {
      // Note: this ext allows S1, and it is meant to be combined away.
      assert(Ty.getSizeInBits() < 32);
      assert(RB == SgprRB);
      auto Aext = B.buildAnyExt(SgprRB_S32, Reg);
      Op.setReg(Aext.getReg(0));
      break;
    }
    case Sgpr32AExtBoolInReg: {
      // Note: this ext allows S1, and it is meant to be combined away.
      assert(Ty.getSizeInBits() == 1);
      assert(RB == SgprRB);
      auto Aext = B.buildAnyExt(SgprRB_S32, Reg);
      // Zext of SgprS1 is not legal; AND with 1 instead. This instruction is
      // most of the time meant to be combined away in AMDGPURegBankCombiner.
      auto Cst1 = B.buildConstant(SgprRB_S32, 1);
      auto BoolInReg = B.buildAnd(SgprRB_S32, Aext, Cst1);
      Op.setReg(BoolInReg.getReg(0));
      break;
    }
    case Sgpr32SExt: {
      assert(1 < Ty.getSizeInBits() && Ty.getSizeInBits() < 32);
      assert(RB == SgprRB);
      auto Sext = B.buildSExt(SgprRB_S32, Reg);
      Op.setReg(Sext.getReg(0));
      break;
    }
    case Sgpr32ZExt: {
      assert(1 < Ty.getSizeInBits() && Ty.getSizeInBits() < 32);
      assert(RB == SgprRB);
      auto Zext = B.buildZExt({SgprRB, S32}, Reg);
      Op.setReg(Zext.getReg(0));
      break;
    }
    case Vgpr32SExt: {
      // Note: this ext allows S1, and it is meant to be combined away.
      assert(Ty.getSizeInBits() < 32);
      assert(RB == VgprRB);
      auto Sext = B.buildSExt({VgprRB, S32}, Reg);
      Op.setReg(Sext.getReg(0));
      break;
    }
    case Vgpr32ZExt: {
      // Note: this ext allows S1, and it is meant to be combined away.
      assert(Ty.getSizeInBits() < 32);
      assert(RB == VgprRB);
      auto Zext = B.buildZExt({VgprRB, S32}, Reg);
      Op.setReg(Zext.getReg(0));
      break;
    }
    default:
      llvm_unreachable("ID not supported");
    }
  }
}

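// G_PHI handling: uniform S1 phis are promoted to S32, replacing the def
// with a new S32 register truncated back to S1 and any-extending each
// incoming value right after its def. Divergent S1 phis must have been
// lowered earlier; other supported types pass through unchanged.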
void RegBankLegalizeHelper::applyMappingPHI(MachineInstr &MI) {
  Register Dst = MI.getOperand(0).getReg();
  LLT Ty = MRI.getType(Dst);

  if (Ty == LLT::scalar(1) && MUI.isUniform(Dst)) {
    B.setInsertPt(*MI.getParent(), MI.getParent()->getFirstNonPHI());

    Register NewDst = MRI.createVirtualRegister(SgprRB_S32);
    MI.getOperand(0).setReg(NewDst);
    B.buildTrunc(Dst, NewDst);

    for (unsigned i = 1; i < MI.getNumOperands(); i += 2) {
      Register UseReg = MI.getOperand(i).getReg();

      auto DefMI = MRI.getVRegDef(UseReg)->getIterator();
      MachineBasicBlock *DefMBB = DefMI->getParent();

      B.setInsertPt(*DefMBB, DefMBB->SkipPHIsAndLabels(std::next(DefMI)));

      auto NewUse = B.buildAnyExt(SgprRB_S32, UseReg);
      MI.getOperand(i).setReg(NewUse.getReg(0));
    }

    return;
  }

  // All divergent i1 phis should already be lowered and inst-selected into
  // PHIs with sgpr reg class and S1 LLT.
  // Note: this includes divergent phis that don't require lowering.
  if (Ty == LLT::scalar(1) && MUI.isDivergent(Dst)) {
    LLVM_DEBUG(dbgs() << "Divergent S1 G_PHI: "; MI.dump(););
    llvm_unreachable("Make sure to run AMDGPUGlobalISelDivergenceLowering "
                     "before RegBankLegalize to lower lane mask (vcc) phis");
  }

  // We accept all types that can fit in some register class.
  // Uniform G_PHIs have all sgpr registers.
  // Divergent G_PHIs have vgpr dst but inputs can be sgpr or vgpr.
  if (Ty == LLT::scalar(32) || Ty == LLT::pointer(1, 64) ||
      Ty == LLT::pointer(4, 64)) {
    return;
  }

  LLVM_DEBUG(dbgs() << "G_PHI not handled: "; MI.dump(););
  llvm_unreachable("type not supported");
}

[[maybe_unused]] static bool verifyRegBankOnOperands(MachineInstr &MI,
                                                     const RegisterBank *RB,
                                                     MachineRegisterInfo &MRI,
                                                     unsigned StartOpIdx,
                                                     unsigned EndOpIdx) {
  for (unsigned i = StartOpIdx; i <= EndOpIdx; ++i) {
    if (MRI.getRegBankOrNull(MI.getOperand(i).getReg()) != RB)
      return false;
  }
  return true;
}

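// For instructions whose mapping is trivially determined by the bank of the
// first def: verify the defs (and, for sgpr, all operands) are already in
// that bank, and for vgpr copy any non-vgpr uses into vgprs.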
void RegBankLegalizeHelper::applyMappingTrivial(MachineInstr &MI) {
  const RegisterBank *RB = MRI.getRegBank(MI.getOperand(0).getReg());
  // Put RB on all registers
  unsigned NumDefs = MI.getNumDefs();
  unsigned NumOperands = MI.getNumOperands();

  assert(verifyRegBankOnOperands(MI, RB, MRI, 0, NumDefs - 1));
  if (RB == SgprRB)
    assert(verifyRegBankOnOperands(MI, RB, MRI, NumDefs, NumOperands - 1));

  if (RB == VgprRB) {
    B.setInstr(MI);
    for (unsigned i = NumDefs; i < NumOperands; ++i) {
      Register Reg = MI.getOperand(i).getReg();
      if (MRI.getRegBank(Reg) != RB) {
        auto Copy = B.buildCopy({VgprRB, MRI.getType(Reg)}, Reg);
        MI.getOperand(i).setReg(Copy.getReg(0));
      }
    }
  }
}
1064