1//===-- AMDGPURegBankLegalizeHelper.cpp -----------------------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9/// Implements actual lowering algorithms for each ID that can be used in
10/// Rule.OperandMapping. Similar to legalizer helper but with register banks.
11//
12//===----------------------------------------------------------------------===//
13
14#include "AMDGPURegBankLegalizeHelper.h"
15#include "AMDGPUGlobalISelUtils.h"
16#include "AMDGPUInstrInfo.h"
17#include "AMDGPURegBankLegalizeRules.h"
18#include "AMDGPURegisterBankInfo.h"
19#include "GCNSubtarget.h"
20#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
21#include "llvm/CodeGen/GlobalISel/GenericMachineInstrs.h"
22#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
23#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
24#include "llvm/CodeGen/MachineInstr.h"
25#include "llvm/CodeGen/MachineUniformityAnalysis.h"
26#include "llvm/IR/IntrinsicsAMDGPU.h"
27
28#define DEBUG_TYPE "amdgpu-regbanklegalize"
29
30using namespace llvm;
31using namespace AMDGPU;
32
// Cache the function, subtarget, analyses and the three AMDGPU register
// banks (SGPR, VGPR, VCC) so the lowering helpers below do not re-query
// them per instruction. MORE is used for remarks when reporting failures.
RegBankLegalizeHelper::RegBankLegalizeHelper(
    MachineIRBuilder &B, const MachineUniformityInfo &MUI,
    const RegisterBankInfo &RBI, const RegBankLegalizeRules &RBLRules)
    : MF(B.getMF()), ST(MF.getSubtarget<GCNSubtarget>()), B(B),
      MRI(*B.getMRI()), MUI(MUI), RBI(RBI), MORE(MF, nullptr),
      RBLRules(RBLRules), IsWave32(ST.isWave32()),
      SgprRB(&RBI.getRegBank(AMDGPU::SGPRRegBankID)),
      VgprRB(&RBI.getRegBank(AMDGPU::VGPRRegBankID)),
      VccRB(&RBI.getRegBank(AMDGPU::VCCRegBankID)) {}
42
43bool RegBankLegalizeHelper::findRuleAndApplyMapping(MachineInstr &MI) {
44 const SetOfRulesForOpcode *RuleSet = RBLRules.getRulesForOpc(MI);
45 if (!RuleSet) {
46 reportGISelFailure(MF, MORE, PassName: "amdgpu-regbanklegalize",
47 Msg: "No AMDGPU RegBankLegalize rules defined for opcode",
48 MI);
49 return false;
50 }
51
52 const RegBankLLTMapping *Mapping = RuleSet->findMappingForMI(MI, MRI, MUI);
53 if (!Mapping) {
54 reportGISelFailure(MF, MORE, PassName: "amdgpu-regbanklegalize",
55 Msg: "AMDGPU RegBankLegalize: none of the rules defined with "
56 "'Any' for MI's opcode matched MI",
57 MI);
58 return false;
59 }
60
61 WaterfallInfo WFI;
62 unsigned OpIdx = 0;
63 if (Mapping->DstOpMapping.size() > 0) {
64 B.setInsertPt(MBB&: *MI.getParent(), II: std::next(x: MI.getIterator()));
65 if (!applyMappingDst(MI, OpIdx, MethodIDs: Mapping->DstOpMapping))
66 return false;
67 }
68 if (Mapping->SrcOpMapping.size() > 0) {
69 B.setInstr(MI);
70 if (!applyMappingSrc(MI, OpIdx, MethodIDs: Mapping->SrcOpMapping, WFI))
71 return false;
72 }
73
74 if (!lower(MI, Mapping: *Mapping, WFI))
75 return false;
76
77 return true;
78}
79
// Rewrite the instructions in the half-open range [WFI.Start, WFI.End) so
// that each use register listed in WFI.SgprWaterfallOperandRegs (divergent
// VGPR values that must be uniform) is replaced, inside a newly built
// waterfall loop, by a per-iteration readfirstlane value. The CFG is split
// into MBB -> LoopBB -> BodyBB -> (LoopBB | RestoreExecBB) -> RemainderBB;
// see the diagram below. Returns true (this lowering cannot fail).
bool RegBankLegalizeHelper::executeInWaterfallLoop(MachineIRBuilder &B,
                                                   const WaterfallInfo &WFI) {
  assert(WFI.Start.isValid() && WFI.End.isValid() &&
         "Waterfall range not initialized");

  // Track use registers which have already been expanded with a readfirstlane
  // sequence. This may have multiple uses if moving a sequence.
  DenseMap<Register, Register> WaterfalledRegMap;

  MachineBasicBlock &MBB = B.getMBB();
  MachineFunction &MF = B.getMF();

  MachineBasicBlock::iterator BeginIt = WFI.Start;
  MachineBasicBlock::iterator EndIt = WFI.End;

  // Wave-size dependent opcodes and the exec register to use below.
  const SIRegisterInfo *TRI = ST.getRegisterInfo();
  const TargetRegisterClass *WaveRC = TRI->getWaveMaskRegClass();
  unsigned MovExecOpc, MovExecTermOpc, XorTermOpc, AndSaveExecOpc, ExecReg;
  if (IsWave32) {
    MovExecOpc = AMDGPU::S_MOV_B32;
    MovExecTermOpc = AMDGPU::S_MOV_B32_term;
    XorTermOpc = AMDGPU::S_XOR_B32_term;
    AndSaveExecOpc = AMDGPU::S_AND_SAVEEXEC_B32;
    ExecReg = AMDGPU::EXEC_LO;
  } else {
    MovExecOpc = AMDGPU::S_MOV_B64;
    MovExecTermOpc = AMDGPU::S_MOV_B64_term;
    XorTermOpc = AMDGPU::S_XOR_B64_term;
    AndSaveExecOpc = AMDGPU::S_AND_SAVEEXEC_B64;
    ExecReg = AMDGPU::EXEC;
  }

#ifndef NDEBUG
  const int OrigRangeSize = std::distance(BeginIt, EndIt);
#endif

  MachineRegisterInfo &MRI = *B.getMRI();
  Register SaveExecReg = MRI.createVirtualRegister(WaveRC);
  Register InitSaveExecReg = MRI.createVirtualRegister(WaveRC);

  // Don't bother using generic instructions/registers for the exec mask.
  B.buildInstr(TargetOpcode::IMPLICIT_DEF).addDef(InitSaveExecReg);

  Register SavedExec = MRI.createVirtualRegister(WaveRC);

  // To insert the loop we need to split the block. Move everything before
  // this point to a new block, and insert a new empty block before this
  // instruction.
  MachineBasicBlock *LoopBB = MF.CreateMachineBasicBlock();
  MachineBasicBlock *BodyBB = MF.CreateMachineBasicBlock();
  MachineBasicBlock *RestoreExecBB = MF.CreateMachineBasicBlock();
  MachineBasicBlock *RemainderBB = MF.CreateMachineBasicBlock();
  MachineFunction::iterator MBBI(MBB);
  ++MBBI;
  MF.insert(MBBI, LoopBB);
  MF.insert(MBBI, BodyBB);
  MF.insert(MBBI, RestoreExecBB);
  MF.insert(MBBI, RemainderBB);

  LoopBB->addSuccessor(BodyBB);
  BodyBB->addSuccessor(RestoreExecBB);
  BodyBB->addSuccessor(LoopBB);

  // Move the rest of the block into a new block.
  RemainderBB->transferSuccessorsAndUpdatePHIs(&MBB);
  RemainderBB->splice(RemainderBB->begin(), &MBB, EndIt, MBB.end());

  MBB.addSuccessor(LoopBB);
  RestoreExecBB->addSuccessor(RemainderBB);

  B.setInsertPt(*LoopBB, LoopBB->end());

  // +-MBB:------------+
  // | ...             |
  // | %0 = G_INST_1   |
  // | %Dst = MI %Vgpr |
  // | %1 = G_INST_2   |
  // | ...             |
  // +-----------------+
  // ->
  // +-MBB-------------------------------+
  // | ...                               |
  // | %0 = G_INST_1                     |
  // | %SaveExecReg = S_MOV_B32 $exec_lo |
  // +----------------|------------------+
  //                  |    /------------------------------|
  //                  V    V                              |
  // +-LoopBB---------------------------------------------------------------+ |
  // | %CurrentLaneReg:sgpr(s32) = READFIRSTLANE %Vgpr                      | |
  // |   instead of executing for each lane, see if other lanes had         | |
  // |   same value for %Vgpr and execute for them also.                    | |
  // | %CondReg:vcc(s1) = G_ICMP eq %CurrentLaneReg, %Vgpr                  | |
  // | %CondRegLM:sreg_32 = ballot %CondReg // copy vcc to sreg32 lane mask | |
  // | %SavedExec = S_AND_SAVEEXEC_B32 %CondRegLM                           | |
  // |   exec is active for lanes with the same "CurrentLane value" in Vgpr | |
  // +----------------|-----------------------------------------------------+ |
  //                  V                                                       |
  // +-BodyBB------------------------------------------------------------+   |
  // | %Dst = MI %CurrentLaneReg:sgpr(s32)                               |   |
  // |   executed only for active lanes and written to Dst               |   |
  // | $exec = S_XOR_B32 $exec, %SavedExec                               |   |
  // |   set active lanes to 0 in SavedExec, lanes that did not write to |   |
  // |   Dst yet, and set this as new exec (for READFIRSTLANE and ICMP)  |   |
  // | SI_WATERFALL_LOOP LoopBB                                          |-----|
  // +----------------|--------------------------------------------------+
  //                  V
  // +-RestoreExecBB--------------------------+
  // | $exec_lo = S_MOV_B32_term %SaveExecReg |
  // +----------------|-----------------------+
  //                  V
  // +-RemainderBB:----------------------+
  // | %1 = G_INST_2                     |
  // | ...                               |
  // +-----------------------------------+

  // Move the instruction into the loop body. Note we moved everything after
  // Range.end() already into a new block, so Range.end() is no longer valid.
  BodyBB->splice(BodyBB->end(), &MBB, BeginIt, MBB.end());

  // Figure out the iterator range after splicing the instructions.
  MachineBasicBlock::iterator NewBegin = BeginIt;
  auto NewEnd = BodyBB->end();
  assert(std::distance(NewBegin, NewEnd) == OrigRangeSize);

  B.setMBB(*LoopBB);
  Register CondReg;

  for (MachineInstr &MI : make_range(NewBegin, NewEnd)) {
    for (MachineOperand &Op : MI.all_uses()) {
      Register OldReg = Op.getReg();
      if (!WFI.SgprWaterfallOperandRegs.count(OldReg))
        continue;

      // See if we already processed this register in another instruction in
      // the sequence.
      auto OldVal = WaterfalledRegMap.find(OldReg);
      if (OldVal != WaterfalledRegMap.end()) {
        Op.setReg(OldVal->second);
        continue;
      }

      Register OpReg = Op.getReg();
      LLT OpTy = MRI.getType(OpReg);

      // TODO: support for agpr
      assert(MRI.getRegBank(OpReg) == VgprRB);
      Register CurrentLaneReg = MRI.createVirtualRegister({SgprRB, OpTy});
      buildReadFirstLane(B, CurrentLaneReg, OpReg, RBI);

      // Build the comparison(s), CurrentLaneReg == OpReg. Wide values are
      // compared in 64-bit pieces when evenly divisible, else 32-bit pieces.
      unsigned OpSize = OpTy.getSizeInBits();
      unsigned PartSize = (OpSize % 64 == 0) ? 64 : 32;
      LLT PartTy = LLT::scalar(PartSize);
      unsigned NumParts = OpSize / PartSize;
      SmallVector<Register, 8> OpParts;
      SmallVector<Register, 8> CurrentLaneParts;

      if (NumParts == 1) {
        OpParts.push_back(OpReg);
        CurrentLaneParts.push_back(CurrentLaneReg);
      } else {
        auto UnmergeOp = B.buildUnmerge({VgprRB, PartTy}, OpReg);
        auto UnmergeCurrLane = B.buildUnmerge({SgprRB, PartTy}, CurrentLaneReg);
        for (unsigned i = 0; i < NumParts; ++i) {
          OpParts.push_back(UnmergeOp.getReg(i));
          CurrentLaneParts.push_back(UnmergeCurrLane.getReg(i));
        }
      }

      // AND all part-compares (including those of other waterfalled operands)
      // into one loop condition in CondReg.
      for (unsigned i = 0; i < NumParts; ++i) {
        Register CmpReg = MRI.createVirtualRegister(VccRB_S1);
        B.buildICmp(CmpInst::ICMP_EQ, CmpReg, CurrentLaneParts[i], OpParts[i]);

        if (!CondReg)
          CondReg = CmpReg;
        else
          CondReg = B.buildAnd(VccRB_S1, CondReg, CmpReg).getReg(0);
      }

      Op.setReg(CurrentLaneReg);

      // Make sure we don't re-process this register again.
      WaterfalledRegMap.insert(std::pair(OldReg, Op.getReg()));
    }
  }

  // Copy vcc to sgpr32/64, ballot becomes a no-op during instruction selection.
  Register CondRegLM =
      MRI.createVirtualRegister({WaveRC, LLT::scalar(IsWave32 ? 32 : 64)});
  B.buildIntrinsic(Intrinsic::amdgcn_ballot, CondRegLM).addReg(CondReg);

  // Update EXEC, save the original EXEC value to SavedExec.
  B.buildInstr(AndSaveExecOpc)
      .addDef(SavedExec)
      .addReg(CondRegLM, RegState::Kill);
  MRI.setSimpleHint(SavedExec, CondRegLM);

  B.setInsertPt(*BodyBB, BodyBB->end());

  // Update EXEC, switch all done bits to 0 and all todo bits to 1.
  B.buildInstr(XorTermOpc).addDef(ExecReg).addReg(ExecReg).addReg(SavedExec);

  // XXX - s_xor_b64 sets scc to 1 if the result is nonzero, so can we use
  // s_cbranch_scc0?

  // Loop back to V_READFIRSTLANE_B32 if there are still variants to cover.
  B.buildInstr(AMDGPU::SI_WATERFALL_LOOP).addMBB(LoopBB);

  // Save the EXEC mask before the loop.
  B.setInsertPt(MBB, MBB.end());
  B.buildInstr(MovExecOpc).addDef(SaveExecReg).addReg(ExecReg);

  // Restore the EXEC mask after the loop.
  B.setInsertPt(*RestoreExecBB, RestoreExecBB->begin());
  B.buildInstr(MovExecTermOpc).addDef(ExecReg).addReg(SaveExecReg);

  // Set the insert point after the original instruction, so any new
  // instructions will be in the remainder.
  B.setInsertPt(*RemainderBB, RemainderBB->begin());

  return true;
}
302
303bool RegBankLegalizeHelper::splitLoad(MachineInstr &MI,
304 ArrayRef<LLT> LLTBreakdown, LLT MergeTy) {
305 MachineFunction &MF = B.getMF();
306 assert(MI.getNumMemOperands() == 1);
307 MachineMemOperand &BaseMMO = **MI.memoperands_begin();
308 Register Dst = MI.getOperand(i: 0).getReg();
309 const RegisterBank *DstRB = MRI.getRegBankOrNull(Reg: Dst);
310 Register Base = MI.getOperand(i: 1).getReg();
311 LLT PtrTy = MRI.getType(Reg: Base);
312 const RegisterBank *PtrRB = MRI.getRegBankOrNull(Reg: Base);
313 LLT OffsetTy = LLT::scalar(SizeInBits: PtrTy.getSizeInBits());
314 SmallVector<Register, 4> LoadPartRegs;
315
316 unsigned ByteOffset = 0;
317 for (LLT PartTy : LLTBreakdown) {
318 Register BasePlusOffset;
319 if (ByteOffset == 0) {
320 BasePlusOffset = Base;
321 } else {
322 auto Offset = B.buildConstant(Res: {PtrRB, OffsetTy}, Val: ByteOffset);
323 BasePlusOffset =
324 B.buildObjectPtrOffset(Res: {PtrRB, PtrTy}, Op0: Base, Op1: Offset).getReg(Idx: 0);
325 }
326 auto *OffsetMMO = MF.getMachineMemOperand(MMO: &BaseMMO, Offset: ByteOffset, Ty: PartTy);
327 auto LoadPart = B.buildLoad(Res: {DstRB, PartTy}, Addr: BasePlusOffset, MMO&: *OffsetMMO);
328 LoadPartRegs.push_back(Elt: LoadPart.getReg(Idx: 0));
329 ByteOffset += PartTy.getSizeInBytes();
330 }
331
332 if (!MergeTy.isValid()) {
333 // Loads are of same size, concat or merge them together.
334 B.buildMergeLikeInstr(Res: Dst, Ops: LoadPartRegs);
335 } else {
336 // Loads are not all of same size, need to unmerge them to smaller pieces
337 // of MergeTy type, then merge pieces to Dst.
338 SmallVector<Register, 4> MergeTyParts;
339 for (Register Reg : LoadPartRegs) {
340 if (MRI.getType(Reg) == MergeTy) {
341 MergeTyParts.push_back(Elt: Reg);
342 } else {
343 auto Unmerge = B.buildUnmerge(Attrs: {.RCOrRB: DstRB, .Ty: MergeTy}, Op: Reg);
344 for (unsigned i = 0; i < Unmerge->getNumOperands() - 1; ++i)
345 MergeTyParts.push_back(Elt: Unmerge.getReg(Idx: i));
346 }
347 }
348 B.buildMergeLikeInstr(Res: Dst, Ops: MergeTyParts);
349 }
350 MI.eraseFromParent();
351 return true;
352}
353
354bool RegBankLegalizeHelper::widenLoad(MachineInstr &MI, LLT WideTy,
355 LLT MergeTy) {
356 MachineFunction &MF = B.getMF();
357 assert(MI.getNumMemOperands() == 1);
358 MachineMemOperand &BaseMMO = **MI.memoperands_begin();
359 Register Dst = MI.getOperand(i: 0).getReg();
360 const RegisterBank *DstRB = MRI.getRegBankOrNull(Reg: Dst);
361 Register Base = MI.getOperand(i: 1).getReg();
362
363 MachineMemOperand *WideMMO = MF.getMachineMemOperand(MMO: &BaseMMO, Offset: 0, Ty: WideTy);
364 auto WideLoad = B.buildLoad(Res: {DstRB, WideTy}, Addr: Base, MMO&: *WideMMO);
365
366 if (WideTy.isScalar()) {
367 B.buildTrunc(Res: Dst, Op: WideLoad);
368 } else {
369 SmallVector<Register, 4> MergeTyParts;
370 auto Unmerge = B.buildUnmerge(Attrs: {.RCOrRB: DstRB, .Ty: MergeTy}, Op: WideLoad);
371
372 LLT DstTy = MRI.getType(Reg: Dst);
373 unsigned NumElts = DstTy.getSizeInBits() / MergeTy.getSizeInBits();
374 for (unsigned i = 0; i < NumElts; ++i) {
375 MergeTyParts.push_back(Elt: Unmerge.getReg(Idx: i));
376 }
377 B.buildMergeLikeInstr(Res: Dst, Ops: MergeTyParts);
378 }
379 MI.eraseFromParent();
380 return true;
381}
382
383bool RegBankLegalizeHelper::widenMMOToS32(GAnyLoad &MI) const {
384 Register Dst = MI.getDstReg();
385 Register Ptr = MI.getPointerReg();
386 MachineMemOperand &MMO = MI.getMMO();
387 unsigned MemSize = 8 * MMO.getSize().getValue();
388
389 MachineMemOperand *WideMMO = B.getMF().getMachineMemOperand(MMO: &MMO, Offset: 0, Ty: S32);
390
391 if (MI.getOpcode() == G_LOAD) {
392 B.buildLoad(Res: Dst, Addr: Ptr, MMO&: *WideMMO);
393 } else {
394 auto Load = B.buildLoad(Res: SgprRB_S32, Addr: Ptr, MMO&: *WideMMO);
395
396 if (MI.getOpcode() == G_ZEXTLOAD) {
397 APInt Mask = APInt::getLowBitsSet(numBits: S32.getSizeInBits(), loBitsSet: MemSize);
398 auto MaskCst = B.buildConstant(Res: SgprRB_S32, Val: Mask);
399 B.buildAnd(Dst, Src0: Load, Src1: MaskCst);
400 } else {
401 assert(MI.getOpcode() == G_SEXTLOAD);
402 B.buildSExtInReg(Res: Dst, Op: Load, ImmOp: MemSize);
403 }
404 }
405
406 MI.eraseFromParent();
407 return true;
408}
409
410bool RegBankLegalizeHelper::lowerVccExtToSel(MachineInstr &MI) {
411 Register Dst = MI.getOperand(i: 0).getReg();
412 LLT Ty = MRI.getType(Reg: Dst);
413 Register Src = MI.getOperand(i: 1).getReg();
414 unsigned Opc = MI.getOpcode();
415 int TrueExtCst = Opc == G_SEXT ? -1 : 1;
416 if (Ty == S32 || Ty == S16) {
417 auto True = B.buildConstant(Res: {VgprRB, Ty}, Val: TrueExtCst);
418 auto False = B.buildConstant(Res: {VgprRB, Ty}, Val: 0);
419 B.buildSelect(Res: Dst, Tst: Src, Op0: True, Op1: False);
420 } else if (Ty == S64) {
421 auto True = B.buildConstant(Res: {VgprRB_S32}, Val: TrueExtCst);
422 auto False = B.buildConstant(Res: {VgprRB_S32}, Val: 0);
423 auto Lo = B.buildSelect(Res: {VgprRB_S32}, Tst: Src, Op0: True, Op1: False);
424 MachineInstrBuilder Hi;
425 switch (Opc) {
426 case G_SEXT:
427 Hi = Lo;
428 break;
429 case G_ZEXT:
430 Hi = False;
431 break;
432 case G_ANYEXT:
433 Hi = B.buildUndef(Res: {VgprRB_S32});
434 break;
435 default:
436 reportGISelFailure(
437 MF, MORE, PassName: "amdgpu-regbanklegalize",
438 Msg: "AMDGPU RegBankLegalize: lowerVccExtToSel, Opcode not supported", MI);
439 return false;
440 }
441
442 B.buildMergeValues(Res: Dst, Ops: {Lo.getReg(Idx: 0), Hi.getReg(Idx: 0)});
443 } else {
444 reportGISelFailure(
445 MF, MORE, PassName: "amdgpu-regbanklegalize",
446 Msg: "AMDGPU RegBankLegalize: lowerVccExtToSel, Type not supported", MI);
447 return false;
448 }
449
450 MI.eraseFromParent();
451 return true;
452}
453
454std::pair<Register, Register> RegBankLegalizeHelper::unpackZExt(Register Reg) {
455 auto PackedS32 = B.buildBitcast(Dst: SgprRB_S32, Src: Reg);
456 auto Mask = B.buildConstant(Res: SgprRB_S32, Val: 0x0000ffff);
457 auto Lo = B.buildAnd(Dst: SgprRB_S32, Src0: PackedS32, Src1: Mask);
458 auto Hi = B.buildLShr(Dst: SgprRB_S32, Src0: PackedS32, Src1: B.buildConstant(Res: SgprRB_S32, Val: 16));
459 return {Lo.getReg(Idx: 0), Hi.getReg(Idx: 0)};
460}
461
462std::pair<Register, Register> RegBankLegalizeHelper::unpackSExt(Register Reg) {
463 auto PackedS32 = B.buildBitcast(Dst: SgprRB_S32, Src: Reg);
464 auto Lo = B.buildSExtInReg(Res: SgprRB_S32, Op: PackedS32, ImmOp: 16);
465 auto Hi = B.buildAShr(Dst: SgprRB_S32, Src0: PackedS32, Src1: B.buildConstant(Res: SgprRB_S32, Val: 16));
466 return {Lo.getReg(Idx: 0), Hi.getReg(Idx: 0)};
467}
468
469std::pair<Register, Register> RegBankLegalizeHelper::unpackAExt(Register Reg) {
470 auto PackedS32 = B.buildBitcast(Dst: SgprRB_S32, Src: Reg);
471 auto Lo = PackedS32;
472 auto Hi = B.buildLShr(Dst: SgprRB_S32, Src0: PackedS32, Src1: B.buildConstant(Res: SgprRB_S32, Val: 16));
473 return {Lo.getReg(Idx: 0), Hi.getReg(Idx: 0)};
474}
475
476std::pair<Register, Register>
477RegBankLegalizeHelper::unpackAExtTruncS16(Register Reg) {
478 auto [Lo32, Hi32] = unpackAExt(Reg);
479 return {B.buildTrunc(Res: SgprRB_S16, Op: Lo32).getReg(Idx: 0),
480 B.buildTrunc(Res: SgprRB_S16, Op: Hi32).getReg(Idx: 0)};
481}
482
483bool RegBankLegalizeHelper::lowerUnpackBitShift(MachineInstr &MI) {
484 Register Lo, Hi;
485 switch (MI.getOpcode()) {
486 case AMDGPU::G_SHL: {
487 auto [Val0, Val1] = unpackAExt(Reg: MI.getOperand(i: 1).getReg());
488 auto [Amt0, Amt1] = unpackAExt(Reg: MI.getOperand(i: 2).getReg());
489 Lo = B.buildInstr(Opc: MI.getOpcode(), DstOps: {SgprRB_S32}, SrcOps: {Val0, Amt0}).getReg(Idx: 0);
490 Hi = B.buildInstr(Opc: MI.getOpcode(), DstOps: {SgprRB_S32}, SrcOps: {Val1, Amt1}).getReg(Idx: 0);
491 break;
492 }
493 case AMDGPU::G_LSHR: {
494 auto [Val0, Val1] = unpackZExt(Reg: MI.getOperand(i: 1).getReg());
495 auto [Amt0, Amt1] = unpackZExt(Reg: MI.getOperand(i: 2).getReg());
496 Lo = B.buildInstr(Opc: MI.getOpcode(), DstOps: {SgprRB_S32}, SrcOps: {Val0, Amt0}).getReg(Idx: 0);
497 Hi = B.buildInstr(Opc: MI.getOpcode(), DstOps: {SgprRB_S32}, SrcOps: {Val1, Amt1}).getReg(Idx: 0);
498 break;
499 }
500 case AMDGPU::G_ASHR: {
501 auto [Val0, Val1] = unpackSExt(Reg: MI.getOperand(i: 1).getReg());
502 auto [Amt0, Amt1] = unpackSExt(Reg: MI.getOperand(i: 2).getReg());
503 Lo = B.buildAShr(Dst: SgprRB_S32, Src0: Val0, Src1: Amt0).getReg(Idx: 0);
504 Hi = B.buildAShr(Dst: SgprRB_S32, Src0: Val1, Src1: Amt1).getReg(Idx: 0);
505 break;
506 }
507 default:
508 reportGISelFailure(
509 MF, MORE, PassName: "amdgpu-regbanklegalize",
510 Msg: "AMDGPU RegBankLegalize: lowerUnpackBitShift, case not implemented",
511 MI);
512 return false;
513 }
514 B.buildBuildVectorTrunc(Res: MI.getOperand(i: 0).getReg(), Ops: {Lo, Hi});
515 MI.eraseFromParent();
516 return true;
517}
518
519bool RegBankLegalizeHelper::lowerUnpackMinMax(MachineInstr &MI) {
520 Register Lo, Hi;
521 switch (MI.getOpcode()) {
522 case AMDGPU::G_SMIN:
523 case AMDGPU::G_SMAX: {
524 // For signed operations, use sign extension
525 auto [Val0_Lo, Val0_Hi] = unpackSExt(Reg: MI.getOperand(i: 1).getReg());
526 auto [Val1_Lo, Val1_Hi] = unpackSExt(Reg: MI.getOperand(i: 2).getReg());
527 Lo = B.buildInstr(Opc: MI.getOpcode(), DstOps: {SgprRB_S32}, SrcOps: {Val0_Lo, Val1_Lo})
528 .getReg(Idx: 0);
529 Hi = B.buildInstr(Opc: MI.getOpcode(), DstOps: {SgprRB_S32}, SrcOps: {Val0_Hi, Val1_Hi})
530 .getReg(Idx: 0);
531 break;
532 }
533 case AMDGPU::G_UMIN:
534 case AMDGPU::G_UMAX: {
535 // For unsigned operations, use zero extension
536 auto [Val0_Lo, Val0_Hi] = unpackZExt(Reg: MI.getOperand(i: 1).getReg());
537 auto [Val1_Lo, Val1_Hi] = unpackZExt(Reg: MI.getOperand(i: 2).getReg());
538 Lo = B.buildInstr(Opc: MI.getOpcode(), DstOps: {SgprRB_S32}, SrcOps: {Val0_Lo, Val1_Lo})
539 .getReg(Idx: 0);
540 Hi = B.buildInstr(Opc: MI.getOpcode(), DstOps: {SgprRB_S32}, SrcOps: {Val0_Hi, Val1_Hi})
541 .getReg(Idx: 0);
542 break;
543 }
544 default:
545 reportGISelFailure(
546 MF, MORE, PassName: "amdgpu-regbanklegalize",
547 Msg: "AMDGPU RegBankLegalize: lowerUnpackMinMax, case not implemented", MI);
548 return false;
549 }
550 B.buildBuildVectorTrunc(Res: MI.getOperand(i: 0).getReg(), Ops: {Lo, Hi});
551 MI.eraseFromParent();
552 return true;
553}
554
555bool RegBankLegalizeHelper::lowerUnpackAExt(MachineInstr &MI) {
556 auto [Op1Lo, Op1Hi] = unpackAExt(Reg: MI.getOperand(i: 1).getReg());
557 auto [Op2Lo, Op2Hi] = unpackAExt(Reg: MI.getOperand(i: 2).getReg());
558 auto ResLo = B.buildInstr(Opc: MI.getOpcode(), DstOps: {SgprRB_S32}, SrcOps: {Op1Lo, Op2Lo});
559 auto ResHi = B.buildInstr(Opc: MI.getOpcode(), DstOps: {SgprRB_S32}, SrcOps: {Op1Hi, Op2Hi});
560 B.buildBuildVectorTrunc(Res: MI.getOperand(i: 0).getReg(),
561 Ops: {ResLo.getReg(Idx: 0), ResHi.getReg(Idx: 0)});
562 MI.eraseFromParent();
563 return true;
564}
565
566static bool isSignedBFE(MachineInstr &MI) {
567 if (GIntrinsic *GI = dyn_cast<GIntrinsic>(Val: &MI))
568 return (GI->is(ID: Intrinsic::amdgcn_sbfe));
569
570 return MI.getOpcode() == AMDGPU::G_SBFX;
571}
572
// Lower a divergent (VALU) 64-bit bitfield extract. The VALU has only 32-bit
// BFE instructions, so the 64-bit extract is built from a 64-bit shift plus
// either a variable-width shift pair or a 32-bit BFE on one half when the
// width is a known constant.
bool RegBankLegalizeHelper::lowerV_BFE(MachineInstr &MI) {
  Register Dst = MI.getOperand(0).getReg();
  assert(MRI.getType(Dst) == LLT::scalar(64));
  bool Signed = isSignedBFE(MI);
  // The intrinsic form carries the intrinsic ID operand, so sources start
  // one operand later than for G_SBFX/G_UBFX.
  unsigned FirstOpnd = isa<GIntrinsic>(MI) ? 2 : 1;
  // Extract bitfield from Src, LSBit is the least-significant bit for the
  // extraction (field offset) and Width is size of bitfield.
  Register Src = MI.getOperand(FirstOpnd).getReg();
  Register LSBit = MI.getOperand(FirstOpnd + 1).getReg();
  Register Width = MI.getOperand(FirstOpnd + 2).getReg();
  // Comments are for signed bitfield extract, similar for unsigned. x is sign
  // bit. s is sign, l is LSB and y are remaining bits of bitfield to extract.

  // Src >> LSBit Hi|Lo: x?????syyyyyyl??? -> xxxx?????syyyyyyl
  unsigned SHROpc = Signed ? AMDGPU::G_ASHR : AMDGPU::G_LSHR;
  auto SHRSrc = B.buildInstr(SHROpc, {{VgprRB, S64}}, {Src, LSBit});

  auto ConstWidth = getIConstantVRegValWithLookThrough(Width, MRI);

  // Expand to Src >> LSBit << (64 - Width) >> (64 - Width)
  // << (64 - Width): Hi|Lo: xxxx?????syyyyyyl -> syyyyyyl000000000
  // >> (64 - Width): Hi|Lo: syyyyyyl000000000 -> ssssssssssyyyyyyl
  if (!ConstWidth) {
    auto Amt = B.buildSub(VgprRB_S32, B.buildConstant(SgprRB_S32, 64), Width);
    auto SignBit = B.buildShl({VgprRB, S64}, SHRSrc, Amt);
    B.buildInstr(SHROpc, {Dst}, {SignBit, Amt});
    MI.eraseFromParent();
    return true;
  }

  // Constant width: do a 32-bit BFE on whichever half contains the top of
  // the bitfield.
  uint64_t WidthImm = ConstWidth->Value.getZExtValue();
  auto UnmergeSHRSrc = B.buildUnmerge(VgprRB_S32, SHRSrc);
  Register SHRSrcLo = UnmergeSHRSrc.getReg(0);
  Register SHRSrcHi = UnmergeSHRSrc.getReg(1);
  auto Zero = B.buildConstant({VgprRB, S32}, 0);
  unsigned BFXOpc = Signed ? AMDGPU::G_SBFX : AMDGPU::G_UBFX;

  if (WidthImm <= 32) {
    // SHRSrc Hi|Lo: ????????|???syyyl -> ????????|ssssyyyl
    auto Lo = B.buildInstr(BFXOpc, {VgprRB_S32}, {SHRSrcLo, Zero, Width});
    MachineInstrBuilder Hi;
    if (Signed) {
      // SHRSrc Hi|Lo: ????????|ssssyyyl -> ssssssss|ssssyyyl
      Hi = B.buildAShr(VgprRB_S32, Lo, B.buildConstant(VgprRB_S32, 31));
    } else {
      // SHRSrc Hi|Lo: ????????|000syyyl -> 00000000|000syyyl
      Hi = Zero;
    }
    B.buildMergeLikeInstr(Dst, {Lo, Hi});
  } else {
    auto Amt = B.buildConstant(VgprRB_S32, WidthImm - 32);
    // SHRSrc Hi|Lo: ??????sy|yyyyyyyl -> sssssssy|yyyyyyyl
    auto Hi = B.buildInstr(BFXOpc, {VgprRB_S32}, {SHRSrcHi, Zero, Amt});
    B.buildMergeLikeInstr(Dst, {SHRSrcLo, Hi});
  }

  MI.eraseFromParent();
  return true;
}
632
633bool RegBankLegalizeHelper::lowerS_BFE(MachineInstr &MI) {
634 Register DstReg = MI.getOperand(i: 0).getReg();
635 LLT Ty = MRI.getType(Reg: DstReg);
636 bool Signed = isSignedBFE(MI);
637 unsigned FirstOpnd = isa<GIntrinsic>(Val: MI) ? 2 : 1;
638 Register Src = MI.getOperand(i: FirstOpnd).getReg();
639 Register LSBit = MI.getOperand(i: FirstOpnd + 1).getReg();
640 Register Width = MI.getOperand(i: FirstOpnd + 2).getReg();
641 // For uniform bit field extract there are 4 available instructions, but
642 // LSBit(field offset) and Width(size of bitfield) need to be packed in S32,
643 // field offset in low and size in high 16 bits.
644
645 // Src1 Hi16|Lo16 = Size|FieldOffset
646 auto Mask = B.buildConstant(Res: SgprRB_S32, Val: maskTrailingOnes<unsigned>(N: 6));
647 auto FieldOffset = B.buildAnd(Dst: SgprRB_S32, Src0: LSBit, Src1: Mask);
648 auto Size = B.buildShl(Dst: SgprRB_S32, Src0: Width, Src1: B.buildConstant(Res: SgprRB_S32, Val: 16));
649 auto Src1 = B.buildOr(Dst: SgprRB_S32, Src0: FieldOffset, Src1: Size);
650 unsigned Opc32 = Signed ? AMDGPU::S_BFE_I32 : AMDGPU::S_BFE_U32;
651 unsigned Opc64 = Signed ? AMDGPU::S_BFE_I64 : AMDGPU::S_BFE_U64;
652 unsigned Opc = Ty == S32 ? Opc32 : Opc64;
653
654 // Select machine instruction, because of reg class constraining, insert
655 // copies from reg class to reg bank.
656 auto S_BFE = B.buildInstr(Opc, DstOps: {{SgprRB, Ty}},
657 SrcOps: {B.buildCopy(Res: Ty, Op: Src), B.buildCopy(Res: S32, Op: Src1)});
658 constrainSelectedInstRegOperands(I&: *S_BFE, TII: *ST.getInstrInfo(),
659 TRI: *ST.getRegisterInfo(), RBI);
660
661 B.buildCopy(Res: DstReg, Op: S_BFE->getOperand(i: 0).getReg());
662 MI.eraseFromParent();
663 return true;
664}
665
666bool RegBankLegalizeHelper::lowerSplitTo32(MachineInstr &MI) {
667 Register Dst = MI.getOperand(i: 0).getReg();
668 LLT DstTy = MRI.getType(Reg: Dst);
669 assert(DstTy == V4S16 || DstTy == V2S32 || DstTy == S64);
670 LLT Ty = DstTy == V4S16 ? V2S16 : S32;
671 auto Op1 = B.buildUnmerge(Attrs: {.RCOrRB: VgprRB, .Ty: Ty}, Op: MI.getOperand(i: 1).getReg());
672 auto Op2 = B.buildUnmerge(Attrs: {.RCOrRB: VgprRB, .Ty: Ty}, Op: MI.getOperand(i: 2).getReg());
673 unsigned Opc = MI.getOpcode();
674 auto Flags = MI.getFlags();
675 auto Lo =
676 B.buildInstr(Opc, DstOps: {{VgprRB, Ty}}, SrcOps: {Op1.getReg(Idx: 0), Op2.getReg(Idx: 0)}, Flags);
677 auto Hi =
678 B.buildInstr(Opc, DstOps: {{VgprRB, Ty}}, SrcOps: {Op1.getReg(Idx: 1), Op2.getReg(Idx: 1)}, Flags);
679 B.buildMergeLikeInstr(Res: Dst, Ops: {Lo, Hi});
680 MI.eraseFromParent();
681 return true;
682}
683
684bool RegBankLegalizeHelper::lowerSplitTo32Mul(MachineInstr &MI) {
685 Register Dst = MI.getOperand(i: 0).getReg();
686 assert(MRI.getType(Dst) == S64);
687 auto Op1 = B.buildUnmerge(Attrs: {VgprRB_S32}, Op: MI.getOperand(i: 1).getReg());
688 auto Op2 = B.buildUnmerge(Attrs: {VgprRB_S32}, Op: MI.getOperand(i: 2).getReg());
689
690 // TODO: G_AMDGPU_MAD_* optimizations for G_MUL divergent S64 operation to
691 // match GlobalISel with old regbankselect.
692 auto Lo = B.buildMul(Dst: VgprRB_S32, Src0: Op1.getReg(Idx: 0), Src1: Op2.getReg(Idx: 0));
693 auto Carry = B.buildUMulH(Dst: VgprRB_S32, Src0: Op1.getReg(Idx: 0), Src1: Op2.getReg(Idx: 0));
694 auto MulLo0Hi1 = B.buildMul(Dst: VgprRB_S32, Src0: Op1.getReg(Idx: 0), Src1: Op2.getReg(Idx: 1));
695 auto MulHi0Lo1 = B.buildMul(Dst: VgprRB_S32, Src0: Op1.getReg(Idx: 1), Src1: Op2.getReg(Idx: 0));
696 auto Sum = B.buildAdd(Dst: VgprRB_S32, Src0: MulLo0Hi1, Src1: MulHi0Lo1);
697 auto Hi = B.buildAdd(Dst: VgprRB_S32, Src0: Sum, Src1: Carry);
698
699 B.buildMergeLikeInstr(Res: Dst, Ops: {Lo, Hi});
700 MI.eraseFromParent();
701 return true;
702}
703
704bool RegBankLegalizeHelper::lowerSplitTo16(MachineInstr &MI) {
705 Register Dst = MI.getOperand(i: 0).getReg();
706 assert(MRI.getType(Dst) == V2S16);
707 unsigned Opc = MI.getOpcode();
708 unsigned NumOps = MI.getNumOperands();
709 auto Flags = MI.getFlags();
710
711 auto [Op1Lo, Op1Hi] = unpackAExtTruncS16(Reg: MI.getOperand(i: 1).getReg());
712
713 if (NumOps == 2) {
714 auto Lo = B.buildInstr(Opc, DstOps: {SgprRB_S16}, SrcOps: {Op1Lo}, Flags);
715 auto Hi = B.buildInstr(Opc, DstOps: {SgprRB_S16}, SrcOps: {Op1Hi}, Flags);
716 B.buildMergeLikeInstr(Res: Dst, Ops: {Lo, Hi});
717 MI.eraseFromParent();
718 return true;
719 }
720
721 auto [Op2Lo, Op2Hi] = unpackAExtTruncS16(Reg: MI.getOperand(i: 2).getReg());
722
723 if (NumOps == 3) {
724 auto Lo = B.buildInstr(Opc, DstOps: {SgprRB_S16}, SrcOps: {Op1Lo, Op2Lo}, Flags);
725 auto Hi = B.buildInstr(Opc, DstOps: {SgprRB_S16}, SrcOps: {Op1Hi, Op2Hi}, Flags);
726 B.buildMergeLikeInstr(Res: Dst, Ops: {Lo, Hi});
727 MI.eraseFromParent();
728 return true;
729 }
730
731 assert(NumOps == 4);
732 auto [Op3Lo, Op3Hi] = unpackAExtTruncS16(Reg: MI.getOperand(i: 3).getReg());
733 auto Lo = B.buildInstr(Opc, DstOps: {SgprRB_S16}, SrcOps: {Op1Lo, Op2Lo, Op3Lo}, Flags);
734 auto Hi = B.buildInstr(Opc, DstOps: {SgprRB_S16}, SrcOps: {Op1Hi, Op2Hi, Op3Hi}, Flags);
735 B.buildMergeLikeInstr(Res: Dst, Ops: {Lo, Hi});
736 MI.eraseFromParent();
737 return true;
738}
739
740bool RegBankLegalizeHelper::lowerUniMAD64(MachineInstr &MI) {
741 Register Dst0 = MI.getOperand(i: 0).getReg();
742 Register Dst1 = MI.getOperand(i: 1).getReg();
743 Register Src0 = MI.getOperand(i: 2).getReg();
744 Register Src1 = MI.getOperand(i: 3).getReg();
745 Register Src2 = MI.getOperand(i: 4).getReg();
746
747 const GCNSubtarget &ST = B.getMF().getSubtarget<GCNSubtarget>();
748
749 // Keep the multiplication on the SALU.
750 Register DstLo = B.buildMul(Dst: SgprRB_S32, Src0, Src1).getReg(Idx: 0);
751 Register DstHi = MRI.createVirtualRegister(RegAttr: SgprRB_S32);
752 if (ST.hasScalarMulHiInsts()) {
753 B.buildInstr(Opc: AMDGPU::G_UMULH, DstOps: {{DstHi}}, SrcOps: {Src0, Src1});
754 } else {
755 auto VSrc0 = B.buildCopy(Res: VgprRB_S32, Op: Src0);
756 auto VSrc1 = B.buildCopy(Res: VgprRB_S32, Op: Src1);
757 auto MulHi = B.buildInstr(Opc: AMDGPU::G_UMULH, DstOps: {VgprRB_S32}, SrcOps: {VSrc0, VSrc1});
758 buildReadAnyLane(B, SgprDst: DstHi, VgprSrc: MulHi.getReg(Idx: 0), RBI);
759 }
760
761 // Accumulate and produce the "carry-out" bit.
762
763 // The "carry-out" is defined as bit 64 of the result when computed as a
764 // big integer. For unsigned multiply-add, this matches the usual
765 // definition of carry-out.
766 if (mi_match(R: Src2, MRI, P: MIPatternMatch::m_ZeroInt())) {
767 // No accumulate: result is just the multiplication, carry is 0.
768 B.buildMergeLikeInstr(Res: Dst0, Ops: {DstLo, DstHi});
769 B.buildConstant(Res: Dst1, Val: 0);
770 } else {
771 // Accumulate: add Src2 to the multiplication result with carry chain.
772 Register Src2Lo = MRI.createVirtualRegister(RegAttr: SgprRB_S32);
773 Register Src2Hi = MRI.createVirtualRegister(RegAttr: SgprRB_S32);
774 B.buildUnmerge(Res: {Src2Lo, Src2Hi}, Op: Src2);
775
776 auto AddLo = B.buildUAddo(Res: SgprRB_S32, CarryOut: SgprRB_S32, Op0: DstLo, Op1: Src2Lo);
777 auto AddHi =
778 B.buildUAdde(Res: SgprRB_S32, CarryOut: SgprRB_S32, Op0: DstHi, Op1: Src2Hi, CarryIn: AddLo.getReg(Idx: 1));
779 B.buildMergeLikeInstr(Res: Dst0, Ops: {AddLo.getReg(Idx: 0), AddHi.getReg(Idx: 0)});
780 B.buildCopy(Res: Dst1, Op: AddHi.getReg(Idx: 1));
781 }
782
783 MI.eraseFromParent();
784 return true;
785}
786
787bool RegBankLegalizeHelper::lowerSplitTo32Select(MachineInstr &MI) {
788 Register Dst = MI.getOperand(i: 0).getReg();
789 LLT DstTy = MRI.getType(Reg: Dst);
790 assert(DstTy == V4S16 || DstTy == V2S32 || DstTy == S64 ||
791 (DstTy.isPointer() && DstTy.getSizeInBits() == 64));
792 LLT Ty = DstTy == V4S16 ? V2S16 : S32;
793 auto Op2 = B.buildUnmerge(Attrs: {.RCOrRB: VgprRB, .Ty: Ty}, Op: MI.getOperand(i: 2).getReg());
794 auto Op3 = B.buildUnmerge(Attrs: {.RCOrRB: VgprRB, .Ty: Ty}, Op: MI.getOperand(i: 3).getReg());
795 Register Cond = MI.getOperand(i: 1).getReg();
796 auto Flags = MI.getFlags();
797 auto Lo =
798 B.buildSelect(Res: {VgprRB, Ty}, Tst: Cond, Op0: Op2.getReg(Idx: 0), Op1: Op3.getReg(Idx: 0), Flags);
799 auto Hi =
800 B.buildSelect(Res: {VgprRB, Ty}, Tst: Cond, Op0: Op2.getReg(Idx: 1), Op1: Op3.getReg(Idx: 1), Flags);
801
802 B.buildMergeLikeInstr(Res: Dst, Ops: {Lo, Hi});
803 MI.eraseFromParent();
804 return true;
805}
806
bool RegBankLegalizeHelper::lowerSplitTo32SExtInReg(MachineInstr &MI) {
  // Lower a 64-bit sign-extend-in-register by splitting the source into two
  // 32-bit halves and sign-extending in whichever half holds the sign bit.
  auto Op1 = B.buildUnmerge(Attrs: VgprRB_S32, Op: MI.getOperand(i: 1).getReg());
  int Amt = MI.getOperand(i: 2).getImm();
  Register Lo, Hi;
  // Hi|Lo: s sign bit, ?/x bits changed/not changed by sign-extend
  if (Amt <= 32) {
    // Sign bit lives in the low half; the whole high half is derived from it.
    // NOTE(review): the freeze presumably guards against observing poison
    // twice, since Lo feeds both the result's low half and (via AShr) its
    // high half -- confirm.
    auto Freeze = B.buildFreeze(Dst: VgprRB_S32, Src: Op1.getReg(Idx: 0));
    if (Amt == 32) {
      // Hi|Lo: ????????|sxxxxxxx -> ssssssss|sxxxxxxx
      Lo = Freeze.getReg(Idx: 0);
    } else {
      // Hi|Lo: ????????|???sxxxx -> ssssssss|ssssxxxx
      Lo = B.buildSExtInReg(Res: VgprRB_S32, Op: Freeze, ImmOp: Amt).getReg(Idx: 0);
    }

    // High half is 32 copies of the sign bit: arithmetic shift Lo right 31.
    auto SignExtCst = B.buildConstant(Res: SgprRB_S32, Val: 31);
    Hi = B.buildAShr(Dst: VgprRB_S32, Src0: Lo, Src1: SignExtCst).getReg(Idx: 0);
  } else {
    // Sign bit lives in the high half; the low half passes through unchanged.
    // Hi|Lo: ?????sxx|xxxxxxxx -> ssssssxx|xxxxxxxx
    Lo = Op1.getReg(Idx: 0);
    Hi = B.buildSExtInReg(Res: VgprRB_S32, Op: Op1.getReg(Idx: 1), ImmOp: Amt - 32).getReg(Idx: 0);
  }

  B.buildMergeLikeInstr(Res: MI.getOperand(i: 0).getReg(), Ops: {Lo, Hi});
  MI.eraseFromParent();
  return true;
}
834
835bool RegBankLegalizeHelper::lowerSplitBitCount64To32(MachineInstr &MI) {
836 // Split 64-bit find-first-bit operations into 32-bit halves:
837 // (ffbh hi:lo) -> umin(ffbh(hi), uaddsat(ffbh(lo), 32))
838 // (ffbl hi:lo) -> umin(ffbl(lo), uaddsat(ffbl(hi), 32))
839 // (ctlz_zero_undef hi:lo) -> umin(ffbh(hi), add(ffbh(lo), 32))
840 // (cttz_zero_undef hi:lo) -> umin(ffbl(lo), add(ffbl(hi), 32))
841 unsigned Opc = MI.getOpcode();
842
843 // FFBH/FFBL return 0xFFFFFFFF on zero input, using uaddsat to avoid
844 // wrapping. CTLZ/CTTZ guarantee non-zero input (zero_undef), so plain add
845 // is fine.
846 unsigned FFBOpc;
847 unsigned AddOpc;
848 bool SearchFromMSB;
849 switch (Opc) {
850 case AMDGPU::G_AMDGPU_FFBH_U32:
851 FFBOpc = Opc;
852 AddOpc = AMDGPU::G_UADDSAT;
853 SearchFromMSB = true;
854 break;
855 case AMDGPU::G_AMDGPU_FFBL_B32:
856 FFBOpc = Opc;
857 AddOpc = AMDGPU::G_UADDSAT;
858 SearchFromMSB = false;
859 break;
860 case AMDGPU::G_CTLZ_ZERO_UNDEF:
861 FFBOpc = AMDGPU::G_AMDGPU_FFBH_U32;
862 AddOpc = AMDGPU::G_ADD;
863 SearchFromMSB = true;
864 break;
865 case AMDGPU::G_CTTZ_ZERO_UNDEF:
866 FFBOpc = AMDGPU::G_AMDGPU_FFBL_B32;
867 AddOpc = AMDGPU::G_ADD;
868 SearchFromMSB = false;
869 break;
870 default:
871 llvm_unreachable("unexpected opcode in lowerSplitBitCount64To32");
872 }
873
874 auto Unmerge = B.buildUnmerge(Attrs: VgprRB_S32, Op: MI.getOperand(i: 1).getReg());
875 Register Lo = Unmerge.getReg(Idx: 0);
876 Register Hi = Unmerge.getReg(Idx: 1);
877
878 // MSB-first (FFBH/CTLZ) searches hi first; LSB-first (FFBL/CTTZ) searches
879 // lo first. The secondary half adds 32 to account for the primary half's
880 // width.
881 auto Primary = B.buildInstr(Opc: FFBOpc, DstOps: {VgprRB_S32}, SrcOps: {SearchFromMSB ? Hi : Lo});
882 auto Secondary =
883 B.buildInstr(Opc: FFBOpc, DstOps: {VgprRB_S32}, SrcOps: {SearchFromMSB ? Lo : Hi});
884
885 auto Adjusted = B.buildInstr(Opc: AddOpc, DstOps: {VgprRB_S32},
886 SrcOps: {Secondary, B.buildConstant(Res: VgprRB_S32, Val: 32)});
887 B.buildUMin(Dst: MI.getOperand(i: 0).getReg(), Src0: Primary, Src1: Adjusted);
888
889 MI.eraseFromParent();
890 return true;
891}
892
893bool RegBankLegalizeHelper::lower(MachineInstr &MI,
894 const RegBankLLTMapping &Mapping,
895 WaterfallInfo &WFI) {
896
897 switch (Mapping.LoweringMethod) {
898 case DoNotLower:
899 break;
900 case VccExtToSel:
901 return lowerVccExtToSel(MI);
902 case UniExtToSel: {
903 LLT Ty = MRI.getType(Reg: MI.getOperand(i: 0).getReg());
904 auto True = B.buildConstant(Res: {SgprRB, Ty},
905 Val: MI.getOpcode() == AMDGPU::G_SEXT ? -1 : 1);
906 auto False = B.buildConstant(Res: {SgprRB, Ty}, Val: 0);
907 // Input to G_{Z|S}EXT is 'Legalizer legal' S1. Most common case is compare.
908 // We are making select here. S1 cond was already 'any-extended to S32' +
909 // 'AND with 1 to clean high bits' by Sgpr32AExtBoolInReg.
910 B.buildSelect(Res: MI.getOperand(i: 0).getReg(), Tst: MI.getOperand(i: 1).getReg(), Op0: True,
911 Op1: False);
912 MI.eraseFromParent();
913 return true;
914 }
915 case UnpackBitShift:
916 return lowerUnpackBitShift(MI);
917 case UnpackMinMax:
918 return lowerUnpackMinMax(MI);
919 case ScalarizeToS16:
920 return lowerSplitTo16(MI);
921 case Ext32To64: {
922 const RegisterBank *RB = MRI.getRegBank(Reg: MI.getOperand(i: 0).getReg());
923 MachineInstrBuilder Hi;
924 switch (MI.getOpcode()) {
925 case AMDGPU::G_ZEXT: {
926 Hi = B.buildConstant(Res: {RB, S32}, Val: 0);
927 break;
928 }
929 case AMDGPU::G_SEXT: {
930 // Replicate sign bit from 32-bit extended part.
931 auto ShiftAmt = B.buildConstant(Res: {RB, S32}, Val: 31);
932 Hi = B.buildAShr(Dst: {RB, S32}, Src0: MI.getOperand(i: 1).getReg(), Src1: ShiftAmt);
933 break;
934 }
935 case AMDGPU::G_ANYEXT: {
936 Hi = B.buildUndef(Res: {RB, S32});
937 break;
938 }
939 default:
940 reportGISelFailure(MF, MORE, PassName: "amdgpu-regbanklegalize",
941 Msg: "AMDGPU RegBankLegalize: Ext32To64, unsuported opcode",
942 MI);
943 return false;
944 }
945
946 B.buildMergeLikeInstr(Res: MI.getOperand(i: 0).getReg(),
947 Ops: {MI.getOperand(i: 1).getReg(), Hi});
948 MI.eraseFromParent();
949 return true;
950 }
951 case UniCstExt: {
952 uint64_t ConstVal = MI.getOperand(i: 1).getCImm()->getZExtValue();
953 B.buildConstant(Res: MI.getOperand(i: 0).getReg(), Val: ConstVal);
954
955 MI.eraseFromParent();
956 return true;
957 }
958 case VgprToVccCopy: {
959 Register Src = MI.getOperand(i: 1).getReg();
960 LLT Ty = MRI.getType(Reg: Src);
961 // Take lowest bit from each lane and put it in lane mask.
962 // Lowering via compare, but we need to clean high bits first as compare
963 // compares all bits in register.
964 Register BoolSrc = MRI.createVirtualRegister(RegAttr: {.RCOrRB: VgprRB, .Ty: Ty});
965 if (Ty == S64) {
966 auto Src64 = B.buildUnmerge(Attrs: VgprRB_S32, Op: Src);
967 auto One = B.buildConstant(Res: VgprRB_S32, Val: 1);
968 auto AndLo = B.buildAnd(Dst: VgprRB_S32, Src0: Src64.getReg(Idx: 0), Src1: One);
969 auto Zero = B.buildConstant(Res: VgprRB_S32, Val: 0);
970 auto AndHi = B.buildAnd(Dst: VgprRB_S32, Src0: Src64.getReg(Idx: 1), Src1: Zero);
971 B.buildMergeLikeInstr(Res: BoolSrc, Ops: {AndLo, AndHi});
972 } else {
973 assert(Ty == S32 || Ty == S16);
974 auto One = B.buildConstant(Res: {VgprRB, Ty}, Val: 1);
975 B.buildAnd(Dst: BoolSrc, Src0: Src, Src1: One);
976 }
977 auto Zero = B.buildConstant(Res: {VgprRB, Ty}, Val: 0);
978 B.buildICmp(Pred: CmpInst::ICMP_NE, Res: MI.getOperand(i: 0).getReg(), Op0: BoolSrc, Op1: Zero);
979 MI.eraseFromParent();
980 return true;
981 }
982 case V_BFE:
983 return lowerV_BFE(MI);
984 case S_BFE:
985 return lowerS_BFE(MI);
986 case UniMAD64:
987 return lowerUniMAD64(MI);
988 case UniMul64: {
989 B.buildMul(Dst: MI.getOperand(i: 0), Src0: MI.getOperand(i: 1), Src1: MI.getOperand(i: 2));
990 MI.eraseFromParent();
991 return true;
992 }
993 case DivSMulToMAD: {
994 auto Op1 = B.buildTrunc(Res: VgprRB_S32, Op: MI.getOperand(i: 1));
995 auto Op2 = B.buildTrunc(Res: VgprRB_S32, Op: MI.getOperand(i: 2));
996 auto Zero = B.buildConstant(Res: {VgprRB, S64}, Val: 0);
997
998 unsigned NewOpc = MI.getOpcode() == AMDGPU::G_AMDGPU_S_MUL_U64_U32
999 ? AMDGPU::G_AMDGPU_MAD_U64_U32
1000 : AMDGPU::G_AMDGPU_MAD_I64_I32;
1001
1002 B.buildInstr(Opc: NewOpc, DstOps: {MI.getOperand(i: 0).getReg(), {SgprRB, S32}},
1003 SrcOps: {Op1, Op2, Zero});
1004 MI.eraseFromParent();
1005 return true;
1006 }
1007 case SplitTo32:
1008 return lowerSplitTo32(MI);
1009 case SplitTo32Mul:
1010 return lowerSplitTo32Mul(MI);
1011 case SplitTo32Select:
1012 return lowerSplitTo32Select(MI);
1013 case SplitTo32SExtInReg:
1014 return lowerSplitTo32SExtInReg(MI);
1015 case SplitLoad: {
1016 LLT DstTy = MRI.getType(Reg: MI.getOperand(i: 0).getReg());
1017 unsigned Size = DstTy.getSizeInBits();
1018 // Even split to 128-bit loads
1019 if (Size > 128) {
1020 LLT B128;
1021 if (DstTy.isVector()) {
1022 LLT EltTy = DstTy.getElementType();
1023 B128 = LLT::fixed_vector(NumElements: 128 / EltTy.getSizeInBits(), ScalarTy: EltTy);
1024 } else {
1025 B128 = LLT::scalar(SizeInBits: 128);
1026 }
1027 if (Size / 128 == 2)
1028 splitLoad(MI, LLTBreakdown: {B128, B128});
1029 else if (Size / 128 == 4)
1030 splitLoad(MI, LLTBreakdown: {B128, B128, B128, B128});
1031 else {
1032 reportGISelFailure(MF, MORE, PassName: "amdgpu-regbanklegalize",
1033 Msg: "AMDGPU RegBankLegalize: SplitLoad, unsuported type",
1034 MI);
1035 return false;
1036 }
1037 }
1038 // 64 and 32 bit load
1039 else if (DstTy == S96)
1040 splitLoad(MI, LLTBreakdown: {S64, S32}, MergeTy: S32);
1041 else if (DstTy == V3S32)
1042 splitLoad(MI, LLTBreakdown: {V2S32, S32}, MergeTy: S32);
1043 else if (DstTy == V6S16)
1044 splitLoad(MI, LLTBreakdown: {V4S16, V2S16}, MergeTy: V2S16);
1045 else {
1046 reportGISelFailure(MF, MORE, PassName: "amdgpu-regbanklegalize",
1047 Msg: "AMDGPU RegBankLegalize: SplitLoad, unsuported type",
1048 MI);
1049 return false;
1050 }
1051 return true;
1052 }
1053 case WidenLoad: {
1054 LLT DstTy = MRI.getType(Reg: MI.getOperand(i: 0).getReg());
1055 if (DstTy == S96)
1056 widenLoad(MI, WideTy: S128);
1057 else if (DstTy == V3S32)
1058 widenLoad(MI, WideTy: V4S32, MergeTy: S32);
1059 else if (DstTy == V6S16)
1060 widenLoad(MI, WideTy: V8S16, MergeTy: V2S16);
1061 else {
1062 reportGISelFailure(MF, MORE, PassName: "amdgpu-regbanklegalize",
1063 Msg: "AMDGPU RegBankLegalize: WidenLoad, unsuported type",
1064 MI);
1065 return false;
1066 }
1067 return true;
1068 }
1069 case UnpackAExt:
1070 return lowerUnpackAExt(MI);
1071 case WidenMMOToS32:
1072 return widenMMOToS32(MI&: cast<GAnyLoad>(Val&: MI));
1073 case VerifyAllSgpr: {
1074 assert(llvm::all_of(MI.operands(), [&](const MachineOperand &Op) {
1075 return MRI.getRegBankOrNull(Op.getReg()) == SgprRB;
1076 }));
1077 return true;
1078 }
1079 case ApplyAllVgpr: {
1080 assert(llvm::all_of(MI.defs(), [&](const MachineOperand &Op) {
1081 return MRI.getRegBankOrNull(Op.getReg()) == VgprRB;
1082 }));
1083 B.setInstrAndDebugLoc(MI);
1084 for (unsigned i = MI.getNumDefs(); i < MI.getNumOperands(); ++i) {
1085 Register Reg = MI.getOperand(i).getReg();
1086 if (MRI.getRegBank(Reg) != VgprRB) {
1087 auto Copy = B.buildCopy(Res: {VgprRB, MRI.getType(Reg)}, Op: Reg);
1088 MI.getOperand(i).setReg(Copy.getReg(Idx: 0));
1089 }
1090 }
1091 return true;
1092 }
1093 case UnmergeToShiftTrunc: {
1094 GUnmerge *Unmerge = dyn_cast<GUnmerge>(Val: &MI);
1095 LLT Ty = MRI.getType(Reg: Unmerge->getSourceReg());
1096 if (Ty.getSizeInBits() % 32 != 0) {
1097 reportGISelFailure(MF, MORE, PassName: "amdgpu-regbanklegalize",
1098 Msg: "AMDGPU RegBankLegalize: unmerge not multiple of 32",
1099 MI);
1100 return false;
1101 }
1102
1103 B.setInstrAndDebugLoc(MI);
1104 if (Ty.getSizeInBits() > 32) {
1105 auto UnmergeV2S16 =
1106 B.buildUnmerge(Attrs: {.RCOrRB: SgprRB, .Ty: V2S16}, Op: Unmerge->getSourceReg());
1107 for (unsigned i = 0; i < UnmergeV2S16->getNumDefs(); ++i) {
1108 auto [Dst0S32, Dst1S32] =
1109 unpackAExt(Reg: UnmergeV2S16->getOperand(i).getReg());
1110 B.buildTrunc(Res: MI.getOperand(i: i * 2).getReg(), Op: Dst0S32);
1111 B.buildTrunc(Res: MI.getOperand(i: i * 2 + 1).getReg(), Op: Dst1S32);
1112 }
1113 } else {
1114 auto [Dst0S32, Dst1S32] = unpackAExt(Reg: MI.getOperand(i: 2).getReg());
1115 B.buildTrunc(Res: MI.getOperand(i: 0).getReg(), Op: Dst0S32);
1116 B.buildTrunc(Res: MI.getOperand(i: 1).getReg(), Op: Dst1S32);
1117 }
1118
1119 MI.eraseFromParent();
1120 return true;
1121 }
1122 case AextToS32InIncomingBlockGPHI: {
1123 Register Dst = MI.getOperand(i: 0).getReg();
1124 Register NewDst = MRI.createVirtualRegister(RegAttr: SgprRB_S32);
1125 B.setInsertPt(MBB&: *MI.getParent(), II: MI.getParent()->getFirstNonPHI());
1126 MI.getOperand(i: 0).setReg(NewDst);
1127 B.buildTrunc(Res: Dst, Op: NewDst);
1128
1129 for (unsigned i = 1; i < MI.getNumOperands(); i += 2) {
1130 Register UseReg = MI.getOperand(i).getReg();
1131
1132 auto DefMI = MRI.getVRegDef(Reg: UseReg)->getIterator();
1133 MachineBasicBlock *DefMBB = DefMI->getParent();
1134
1135 B.setInsertPt(MBB&: *DefMBB, II: DefMBB->SkipPHIsAndLabels(I: std::next(x: DefMI)));
1136
1137 auto NewUse = B.buildAnyExt(Res: SgprRB_S32, Op: UseReg);
1138 MI.getOperand(i).setReg(NewUse.getReg(Idx: 0));
1139 }
1140 break;
1141 }
1142 case VerifyAllSgprGPHI: {
1143 assert(llvm::all_of(MI.operands(), [&](const MachineOperand &Op) {
1144 if (Op.isMBB())
1145 return true;
1146 return MRI.getRegBankOrNull(Op.getReg()) == SgprRB;
1147 }));
1148 return true;
1149 }
1150 case VerifyAllSgprOrVgprGPHI: {
1151 assert(MRI.getRegBankOrNull(MI.getOperand(0).getReg()) == VgprRB);
1152 assert(llvm::all_of(MI.operands(), [&](const MachineOperand &Op) {
1153 if (Op.isMBB())
1154 return true;
1155 const RegisterBank *RB = MRI.getRegBankOrNull(Op.getReg());
1156 return RB == VgprRB || RB == SgprRB;
1157 }));
1158 return true;
1159 }
1160 case ApplyINTRIN_IMAGE:
1161 return applyRegisterBanksINTRIN_IMAGE(MI);
1162 case SplitBitCount64To32:
1163 return lowerSplitBitCount64To32(MI);
1164 }
1165
1166 if (!WFI.SgprWaterfallOperandRegs.empty()) {
1167 if (!executeInWaterfallLoop(B, WFI))
1168 return false;
1169 }
1170 return true;
1171}
1172
// Return the LLT implied by a mapping ID for IDs that name a fixed scalar,
// pointer or vector type. IDs with no fixed type return an invalid LLT.
LLT RegBankLegalizeHelper::getTyFromID(RegBankLLTMappingApplyID ID) {
  switch (ID) {
  case Vcc:
  case UniInVcc:
    return LLT::scalar(SizeInBits: 1);
  case Sgpr16:
  case Vgpr16:
  case UniInVgprS16:
    return LLT::scalar(SizeInBits: 16);
  case Sgpr32:
  case Sgpr32_WF:
  case Sgpr32Trunc:
  case Sgpr32AExt:
  case Sgpr32AExtBoolInReg:
  case Sgpr32SExt:
  case Sgpr32ZExt:
  case UniInVgprS32:
  case Vgpr32:
  case Vgpr32AExt:
  case Vgpr32SExt:
  case Vgpr32ZExt:
    return LLT::scalar(SizeInBits: 32);
  case Sgpr64:
  case Vgpr64:
  case UniInVgprS64:
    return LLT::scalar(SizeInBits: 64);
  case Sgpr128:
  case Vgpr128:
    return LLT::scalar(SizeInBits: 128);
  case SgprP0:
  case SgprP0Call_WF:
  case VgprP0:
    return LLT::pointer(AddressSpace: 0, SizeInBits: 64);
  case SgprP1:
  case VgprP1:
    return LLT::pointer(AddressSpace: 1, SizeInBits: 64);
  case SgprP2:
  case VgprP2:
    return LLT::pointer(AddressSpace: 2, SizeInBits: 32);
  case SgprP3:
  case VgprP3:
    return LLT::pointer(AddressSpace: 3, SizeInBits: 32);
  case SgprP4:
  case SgprP4Call_WF:
  case VgprP4:
    return LLT::pointer(AddressSpace: 4, SizeInBits: 64);
  case SgprP5:
  case VgprP5:
    return LLT::pointer(AddressSpace: 5, SizeInBits: 32);
  case SgprP8:
    return LLT::pointer(AddressSpace: 8, SizeInBits: 128);
  case SgprV2S16:
  case VgprV2S16:
  case UniInVgprV2S16:
    return LLT::fixed_vector(NumElements: 2, ScalarSizeInBits: 16);
  case SgprV2S32:
  case VgprV2S32:
  case UniInVgprV2S32:
    return LLT::fixed_vector(NumElements: 2, ScalarSizeInBits: 32);
  case VgprV3S32:
    return LLT::fixed_vector(NumElements: 3, ScalarSizeInBits: 32);
  case VgprV4S16:
    return LLT::fixed_vector(NumElements: 4, ScalarSizeInBits: 16);
  case SgprV4S32:
  case SgprV4S32_WF:
  case VgprV4S32:
  case UniInVgprV4S32:
    return LLT::fixed_vector(NumElements: 4, ScalarSizeInBits: 32);
  case VgprV8S32:
    return LLT::fixed_vector(NumElements: 8, ScalarSizeInBits: 32);
  case VgprV2S64:
  case UniInVgprV2S64:
    return LLT::fixed_vector(NumElements: 2, ScalarSizeInBits: 64);
  default:
    // B-type and other IDs do not imply a single fixed type.
    return LLT();
  }
}
1250
// For "B-type" mapping IDs (one ID covers several layouts of the same bit
// width), return \p Ty unchanged if it is one of the layouts the ID accepts,
// otherwise return an invalid LLT.
LLT RegBankLegalizeHelper::getBTyFromID(RegBankLLTMappingApplyID ID, LLT Ty) {
  switch (ID) {
  case SgprB32:
  case VgprB32:
  case SgprB32_M0:
  case UniInVgprB32:
    if (Ty == LLT::scalar(SizeInBits: 32) || Ty == LLT::fixed_vector(NumElements: 2, ScalarSizeInBits: 16) ||
        isAnyPtr(Ty, Width: 32))
      return Ty;
    return LLT();
  case SgprPtr32:
  case VgprPtr32:
    return isAnyPtr(Ty, Width: 32) ? Ty : LLT();
  case SgprPtr64:
  case VgprPtr64:
    return isAnyPtr(Ty, Width: 64) ? Ty : LLT();
  case SgprPtr128:
  case VgprPtr128:
    return isAnyPtr(Ty, Width: 128) ? Ty : LLT();
  case SgprB64:
  case VgprB64:
  case UniInVgprB64:
    if (Ty == LLT::scalar(SizeInBits: 64) || Ty == LLT::fixed_vector(NumElements: 2, ScalarSizeInBits: 32) ||
        Ty == LLT::fixed_vector(NumElements: 4, ScalarSizeInBits: 16) || isAnyPtr(Ty, Width: 64))
      return Ty;
    return LLT();
  case SgprB96:
  case VgprB96:
  case UniInVgprB96:
    if (Ty == LLT::scalar(SizeInBits: 96) || Ty == LLT::fixed_vector(NumElements: 3, ScalarSizeInBits: 32) ||
        Ty == LLT::fixed_vector(NumElements: 6, ScalarSizeInBits: 16))
      return Ty;
    return LLT();
  case SgprB128:
  case VgprB128:
  case UniInVgprB128:
    if (Ty == LLT::scalar(SizeInBits: 128) || Ty == LLT::fixed_vector(NumElements: 4, ScalarSizeInBits: 32) ||
        Ty == LLT::fixed_vector(NumElements: 2, ScalarSizeInBits: 64) || Ty == LLT::fixed_vector(NumElements: 8, ScalarSizeInBits: 16) ||
        isAnyPtr(Ty, Width: 128))
      return Ty;
    return LLT();
  case VgprB160:
  case UniInVgprB160:
    // 160-bit accepts any layout of that width.
    if (Ty.getSizeInBits() == 160)
      return Ty;
    return LLT();
  case SgprB256:
  case VgprB256:
  case UniInVgprB256:
    if (Ty == LLT::scalar(SizeInBits: 256) || Ty == LLT::fixed_vector(NumElements: 8, ScalarSizeInBits: 32) ||
        Ty == LLT::fixed_vector(NumElements: 4, ScalarSizeInBits: 64) || Ty == LLT::fixed_vector(NumElements: 16, ScalarSizeInBits: 16))
      return Ty;
    return LLT();
  case SgprB512:
  case VgprB512:
  case UniInVgprB512:
    if (Ty == LLT::scalar(SizeInBits: 512) || Ty == LLT::fixed_vector(NumElements: 16, ScalarSizeInBits: 32) ||
        Ty == LLT::fixed_vector(NumElements: 8, ScalarSizeInBits: 64))
      return Ty;
    return LLT();
  case SgprBRC: {
    // Accept any width for which an SGPR register class exists (>= 32 bits).
    const SIRegisterInfo *TRI =
        static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
    unsigned LLTSize = Ty.getSizeInBits();
    if (LLTSize >= 32 && TRI->getSGPRClassForBitWidth(BitWidth: LLTSize))
      return Ty;
    return LLT();
  }
  case VgprBRC: {
    const SIRegisterInfo *TRI =
        static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
    // NOTE(review): queries getSGPRClassForBitWidth even for the VGPR
    // variant, and unlike SgprBRC has no LLTSize >= 32 guard -- confirm this
    // is intentional (same accepted widths) rather than a copy-paste of the
    // SgprBRC case.
    if (TRI->getSGPRClassForBitWidth(BitWidth: Ty.getSizeInBits()))
      return Ty;
    return LLT();
  }
  default:
    return LLT();
  }
}
1330
// Map a mapping ID to the register bank it assigns: VCC for lane masks,
// SGPR for all Sgpr*/UniIn* IDs, VGPR for all Vgpr* IDs, nullptr for IDs
// that do not name a bank.
const RegisterBank *
RegBankLegalizeHelper::getRegBankFromID(RegBankLLTMappingApplyID ID) {
  switch (ID) {
  case Vcc:
    return VccRB;
  case Sgpr16:
  case Sgpr32:
  case Sgpr32_WF:
  case Sgpr64:
  case Sgpr128:
  case SgprP0:
  case SgprP0Call_WF:
  case SgprP1:
  case SgprP2:
  case SgprP3:
  case SgprP4:
  case SgprP4Call_WF:
  case SgprP5:
  case SgprP8:
  case SgprPtr32:
  case SgprPtr64:
  case SgprPtr128:
  case SgprV2S16:
  case SgprV2S32:
  case SgprV4S32:
  case SgprV4S32_WF:
  case SgprB32:
  case SgprB64:
  case SgprB96:
  case SgprB128:
  case SgprB256:
  case SgprB512:
  case SgprBRC:
  case UniInVcc:
  case UniInVgprS16:
  case UniInVgprS32:
  case UniInVgprS64:
  case UniInVgprV2S16:
  case UniInVgprV2S32:
  case UniInVgprV4S32:
  case UniInVgprV2S64:
  case UniInVgprB32:
  case UniInVgprB64:
  case UniInVgprB96:
  case UniInVgprB128:
  case UniInVgprB160:
  case UniInVgprB256:
  case UniInVgprB512:
  case Sgpr32Trunc:
  case Sgpr32AExt:
  case Sgpr32AExtBoolInReg:
  case Sgpr32SExt:
  case Sgpr32ZExt:
    return SgprRB;
  case Vgpr16:
  case Vgpr32:
  case Vgpr64:
  case Vgpr128:
  case VgprP0:
  case VgprP1:
  case VgprP2:
  case VgprP3:
  case VgprP4:
  case VgprP5:
  case VgprPtr32:
  case VgprPtr64:
  case VgprPtr128:
  case VgprV2S16:
  case VgprV2S32:
  case VgprV2S64:
  case VgprV3S32:
  case VgprV4S16:
  case VgprV4S32:
  case VgprV8S32:
  case VgprB32:
  case VgprB64:
  case VgprB96:
  case VgprB128:
  case VgprB160:
  case VgprB256:
  case VgprB512:
  case VgprBRC:
  case Vgpr32AExt:
  case Vgpr32SExt:
  case Vgpr32ZExt:
    return VgprRB;
  default:
    return nullptr;
  }
}
1421
// Apply per-def mapping IDs to MI's def operands (operands 0..). For plain
// IDs the def is only verified (type and bank must already match). For the
// UniInVcc/UniInVgpr*/Sgpr32Trunc IDs the def is rewritten to a temporary
// register of the required bank/width and the code that converts it back to
// the original register (copy from VCC, read-any-lane, trunc) is inserted.
// OpIdx is advanced past the defs; returns false on an unsupported ID.
bool RegBankLegalizeHelper::applyMappingDst(
    MachineInstr &MI, unsigned &OpIdx,
    const SmallVectorImpl<RegBankLLTMappingApplyID> &MethodIDs) {
  // Defs start from operand 0
  for (; OpIdx < MethodIDs.size(); ++OpIdx) {
    if (MethodIDs[OpIdx] == None)
      continue;
    MachineOperand &Op = MI.getOperand(i: OpIdx);
    Register Reg = Op.getReg();
    LLT Ty = MRI.getType(Reg);
    [[maybe_unused]] const RegisterBank *RB = MRI.getRegBank(Reg);

    switch (MethodIDs[OpIdx]) {
    // vcc, sgpr and vgpr scalars, pointers and vectors
    case Vcc:
    case Sgpr16:
    case Sgpr32:
    case Sgpr64:
    case Sgpr128:
    case SgprP0:
    case SgprP1:
    case SgprP3:
    case SgprP4:
    case SgprP5:
    case SgprP8:
    case SgprV2S16:
    case SgprV2S32:
    case SgprV4S32:
    case Vgpr16:
    case Vgpr32:
    case Vgpr64:
    case Vgpr128:
    case VgprP0:
    case VgprP1:
    case VgprP2:
    case VgprP3:
    case VgprP4:
    case VgprP5:
    case VgprV2S16:
    case VgprV2S32:
    case VgprV2S64:
    case VgprV3S32:
    case VgprV4S16:
    case VgprV4S32:
    case VgprV8S32: {
      // Already in the required bank and type; nothing to rewrite.
      assert(Ty == getTyFromID(MethodIDs[OpIdx]));
      assert(RB == getRegBankFromID(MethodIDs[OpIdx]));
      break;
    }
    // sgpr and vgpr B-types
    case SgprB32:
    case SgprB64:
    case SgprB96:
    case SgprB128:
    case SgprB256:
    case SgprB512:
    case SgprBRC:
    case SgprPtr32:
    case SgprPtr64:
    case SgprPtr128:
    case VgprB32:
    case VgprB64:
    case VgprB96:
    case VgprB128:
    case VgprB160:
    case VgprB256:
    case VgprB512:
    case VgprBRC:
    case VgprPtr32:
    case VgprPtr64:
    case VgprPtr128: {
      assert(Ty == getBTyFromID(MethodIDs[OpIdx], Ty));
      assert(RB == getRegBankFromID(MethodIDs[OpIdx]));
      break;
    }
    // uniform in vcc/vgpr: scalars, vectors and B-types
    case UniInVcc: {
      // Instruction defines a lane mask but users expect a uniform S1:
      // def a new VCC S1, then copy it to SCC-style S32 and trunc back to
      // the original register (only if it has users).
      assert(Ty == S1);
      assert(RB == SgprRB);
      Register NewDst = MRI.createVirtualRegister(RegAttr: VccRB_S1);
      Op.setReg(NewDst);
      if (!MRI.use_empty(RegNo: Reg)) {
        auto CopyS32_Vcc =
            B.buildInstr(Opc: AMDGPU::G_AMDGPU_COPY_SCC_VCC, DstOps: {SgprRB_S32}, SrcOps: {NewDst});
        B.buildTrunc(Res: Reg, Op: CopyS32_Vcc);
      }
      break;
    }
    case UniInVgprS16: {
      // S16 has no direct read-any-lane: go VGPR S16 -> any-ext to VGPR S32
      // -> read-any-lane to SGPR S32 -> trunc to the original SGPR S16.
      assert(Ty == getTyFromID(MethodIDs[OpIdx]));
      assert(RB == SgprRB);
      Register NewVgprDstS16 = MRI.createVirtualRegister(RegAttr: {.RCOrRB: VgprRB, .Ty: S16});
      Register NewVgprDstS32 = MRI.createVirtualRegister(RegAttr: {.RCOrRB: VgprRB, .Ty: S32});
      Register NewSgprDstS32 = MRI.createVirtualRegister(RegAttr: {.RCOrRB: SgprRB, .Ty: S32});
      Op.setReg(NewVgprDstS16);
      B.buildAnyExt(Res: NewVgprDstS32, Op: NewVgprDstS16);
      buildReadAnyLane(B, SgprDst: NewSgprDstS32, VgprSrc: NewVgprDstS32, RBI);
      B.buildTrunc(Res: Reg, Op: NewSgprDstS32);
      break;
    }
    case UniInVgprS32:
    case UniInVgprS64:
    case UniInVgprV2S16:
    case UniInVgprV2S32:
    case UniInVgprV4S32:
    case UniInVgprV2S64: {
      // Uniform result computed on the VALU: def a VGPR temp and read the
      // (uniform) value back into the original SGPR.
      assert(Ty == getTyFromID(MethodIDs[OpIdx]));
      assert(RB == SgprRB);
      Register NewVgprDst = MRI.createVirtualRegister(RegAttr: {.RCOrRB: VgprRB, .Ty: Ty});
      Op.setReg(NewVgprDst);
      buildReadAnyLane(B, SgprDst: Reg, VgprSrc: NewVgprDst, RBI);
      break;
    }
    case UniInVgprB32:
    case UniInVgprB64:
    case UniInVgprB96:
    case UniInVgprB128:
    case UniInVgprB160:
    case UniInVgprB256:
    case UniInVgprB512: {
      // Same as above for B-typed defs.
      assert(Ty == getBTyFromID(MethodIDs[OpIdx], Ty));
      assert(RB == SgprRB);
      Register NewVgprDst = MRI.createVirtualRegister(RegAttr: {.RCOrRB: VgprRB, .Ty: Ty});
      Op.setReg(NewVgprDst);
      AMDGPU::buildReadAnyLane(B, SgprDst: Reg, VgprSrc: NewVgprDst, RBI);
      break;
    }
    // sgpr trunc
    case Sgpr32Trunc: {
      // Narrow SGPR def is produced as S32 and truncated back (only if the
      // original register has users).
      assert(Ty.getSizeInBits() < 32);
      assert(RB == SgprRB);
      Register NewDst = MRI.createVirtualRegister(RegAttr: SgprRB_S32);
      Op.setReg(NewDst);
      if (!MRI.use_empty(RegNo: Reg))
        B.buildTrunc(Res: Reg, Op: NewDst);
      break;
    }
    case InvalidMapping: {
      reportGISelFailure(
          MF, MORE, PassName: "amdgpu-regbanklegalize",
          Msg: "AMDGPU RegBankLegalize: missing fast rule ('Div' or 'Uni') for", MI);
      return false;
    }
    default:
      reportGISelFailure(
          MF, MORE, PassName: "amdgpu-regbanklegalize",
          Msg: "AMDGPU RegBankLegalize: applyMappingDst, ID not supported", MI);
      return false;
    }
  }

  return true;
}
1575
// Apply per-use mapping IDs to MI's use operands, starting at OpIdx (defs
// were consumed by applyMappingDst). Plain IDs are verified; other IDs
// insert the copies/extends needed to move the operand into the required
// bank/width, or record the register for a waterfall loop (the *_WF IDs).
// Returns false on an unsupported ID.
bool RegBankLegalizeHelper::applyMappingSrc(
    MachineInstr &MI, unsigned &OpIdx,
    const SmallVectorImpl<RegBankLLTMappingApplyID> &MethodIDs,
    WaterfallInfo &WFI) {
  for (unsigned i = 0; i < MethodIDs.size(); ++OpIdx, ++i) {
    if (MethodIDs[i] == None || MethodIDs[i] == IntrId || MethodIDs[i] == Imm)
      continue;

    MachineOperand &Op = MI.getOperand(i: OpIdx);
    Register Reg = Op.getReg();
    LLT Ty = MRI.getType(Reg);
    const RegisterBank *RB = MRI.getRegBank(Reg);

    switch (MethodIDs[i]) {
    case Vcc: {
      // A uniform (SGPR) S1 source must be converted into a lane mask:
      // any-extend to S32, then copy SCC-style value into VCC.
      assert(Ty == S1);
      assert(RB == VccRB || RB == SgprRB);
      if (RB == SgprRB) {
        auto Aext = B.buildAnyExt(Res: SgprRB_S32, Op: Reg);
        auto CopyVcc_Scc =
            B.buildInstr(Opc: AMDGPU::G_AMDGPU_COPY_VCC_SCC, DstOps: {VccRB_S1}, SrcOps: {Aext});
        Op.setReg(CopyVcc_Scc.getReg(Idx: 0));
      }
      break;
    }
    // sgpr scalars, pointers and vectors
    case Sgpr16:
    case Sgpr32:
    case Sgpr64:
    case Sgpr128:
    case SgprP0:
    case SgprP1:
    case SgprP3:
    case SgprP4:
    case SgprP5:
    case SgprP8:
    case SgprV2S16:
    case SgprV2S32:
    case SgprV4S32: {
      // Must already be in an SGPR of the right type; nothing to insert.
      assert(Ty == getTyFromID(MethodIDs[i]));
      assert(RB == getRegBankFromID(MethodIDs[i]));
      break;
    }
    // sgpr B-types
    case SgprB32:
    case SgprB64:
    case SgprB96:
    case SgprB128:
    case SgprB256:
    case SgprB512:
    case SgprBRC:
    case SgprPtr32:
    case SgprPtr64:
    case SgprPtr128: {
      assert(Ty == getBTyFromID(MethodIDs[i], Ty));
      assert(RB == getRegBankFromID(MethodIDs[i]));
      break;
    }
    // vgpr scalars, pointers and vectors
    case Vgpr16:
    case Vgpr32:
    case Vgpr64:
    case Vgpr128:
    case VgprP0:
    case VgprP1:
    case VgprP2:
    case VgprP3:
    case VgprP4:
    case VgprP5:
    case VgprV2S16:
    case VgprV2S32:
    case VgprV2S64:
    case VgprV3S32:
    case VgprV4S16:
    case VgprV4S32:
    case VgprV8S32: {
      // SGPR (or other) sources are copied into a VGPR.
      assert(Ty == getTyFromID(MethodIDs[i]));
      if (RB != VgprRB) {
        auto CopyToVgpr = B.buildCopy(Res: {VgprRB, Ty}, Op: Reg);
        Op.setReg(CopyToVgpr.getReg(Idx: 0));
      }
      break;
    }
    // vgpr B-types
    case VgprB32:
    case VgprB64:
    case VgprB96:
    case VgprB128:
    case VgprB160:
    case VgprB256:
    case VgprB512:
    case VgprBRC:
    case VgprPtr32:
    case VgprPtr64:
    case VgprPtr128: {
      assert(Ty == getBTyFromID(MethodIDs[i], Ty));
      if (RB != VgprRB) {
        auto CopyToVgpr = B.buildCopy(Res: {VgprRB, Ty}, Op: Reg);
        Op.setReg(CopyToVgpr.getReg(Idx: 0));
      }
      break;
    }
    // sgpr waterfall, scalars, and vectors
    case Sgpr32_WF:
    case SgprV4S32_WF: {
      // Divergent source that must be uniform: record it so the instruction
      // is executed in a waterfall loop (range defaults to just MI).
      assert(Ty == getTyFromID(MethodIDs[i]));
      if (RB != SgprRB) {
        WFI.SgprWaterfallOperandRegs.insert(V: Reg);
        if (!WFI.Start.isValid()) {
          WFI.Start = MI.getIterator();
          WFI.End = std::next(x: MI.getIterator());
        }
      }
      break;
    }
    case SgprP0Call_WF:
    case SgprP4Call_WF: {
      // Divergent call target: the waterfall range must cover the whole
      // call sequence, bracketed by the call-frame pseudos.
      assert(Ty == getTyFromID(MethodIDs[i]));
      if (RB != SgprRB) {
        WFI.SgprWaterfallOperandRegs.insert(V: Reg);

        // Find the ADJCALLSTACKUP before the call.
        MachineBasicBlock::iterator Start = MI.getIterator();
        while (Start->getOpcode() != AMDGPU::ADJCALLSTACKUP)
          --Start;

        // Find the ADJCALLSTACKDOWN after the call (include it in range).
        MachineBasicBlock::iterator End = MI.getIterator();
        while (End->getOpcode() != AMDGPU::ADJCALLSTACKDOWN)
          ++End;
        ++End;

        B.setInsertPt(MBB&: *MI.getParent(), II: Start);
        WFI.Start = Start;
        WFI.End = End;
      }
      break;
    }
    case SgprB32_M0: {
      // Divergent source is forced uniform with readfirstlane.
      // NOTE(review): the _M0 suffix suggests the value ultimately feeds M0,
      // which is not visible here -- confirm with the rules table.
      assert(Ty == getBTyFromID(MethodIDs[i], Ty));
      if (RB == SgprRB)
        break;
      assert(RB == VgprRB);
      Register NewSGPR32 = MRI.createVirtualRegister(RegAttr: {.RCOrRB: SgprRB, .Ty: Ty});
      buildReadFirstLane(B, SgprDst: NewSGPR32, VgprSrc: Op.getReg(), RBI);
      Op.setReg(NewSGPR32);
      break;
    }
    // sgpr and vgpr scalars with extend
    case Sgpr32AExt: {
      // Note: this ext allows S1, and it is meant to be combined away.
      assert(Ty.getSizeInBits() < 32);
      assert(RB == SgprRB);
      auto Aext = B.buildAnyExt(Res: SgprRB_S32, Op: Reg);
      Op.setReg(Aext.getReg(Idx: 0));
      break;
    }
    case Sgpr32AExtBoolInReg: {
      // Note: this ext allows S1, and it is meant to be combined away.
      assert(Ty.getSizeInBits() == 1);
      assert(RB == SgprRB);
      auto Aext = B.buildAnyExt(Res: SgprRB_S32, Op: Reg);
      // Zext SgprS1 is not legal, make AND with 1 instead. This instruction is
      // most of times meant to be combined away in AMDGPURegBankCombiner.
      auto Cst1 = B.buildConstant(Res: SgprRB_S32, Val: 1);
      auto BoolInReg = B.buildAnd(Dst: SgprRB_S32, Src0: Aext, Src1: Cst1);
      Op.setReg(BoolInReg.getReg(Idx: 0));
      break;
    }
    case Sgpr32SExt: {
      assert(1 < Ty.getSizeInBits() && Ty.getSizeInBits() < 32);
      assert(RB == SgprRB);
      auto Sext = B.buildSExt(Res: SgprRB_S32, Op: Reg);
      Op.setReg(Sext.getReg(Idx: 0));
      break;
    }
    case Sgpr32ZExt: {
      assert(1 < Ty.getSizeInBits() && Ty.getSizeInBits() < 32);
      assert(RB == SgprRB);
      auto Zext = B.buildZExt(Res: {SgprRB, S32}, Op: Reg);
      Op.setReg(Zext.getReg(Idx: 0));
      break;
    }
    case Vgpr32AExt: {
      assert(Ty.getSizeInBits() < 32);
      assert(RB == VgprRB);
      auto Aext = B.buildAnyExt(Res: {VgprRB, S32}, Op: Reg);
      Op.setReg(Aext.getReg(Idx: 0));
      break;
    }
    case Vgpr32SExt: {
      // Note this ext allows S1, and it is meant to be combined away.
      assert(Ty.getSizeInBits() < 32);
      assert(RB == VgprRB);
      auto Sext = B.buildSExt(Res: {VgprRB, S32}, Op: Reg);
      Op.setReg(Sext.getReg(Idx: 0));
      break;
    }
    case Vgpr32ZExt: {
      // Note this ext allows S1, and it is meant to be combined away.
      assert(Ty.getSizeInBits() < 32);
      assert(RB == VgprRB);
      auto Zext = B.buildZExt(Res: {VgprRB, S32}, Op: Reg);
      Op.setReg(Zext.getReg(Idx: 0));
      break;
    }
    default:
      reportGISelFailure(
          MF, MORE, PassName: "amdgpu-regbanklegalize",
          Msg: "AMDGPU RegBankLegalize: applyMappingSrc, ID not supported", MI);
      return false;
    }
  }
  return true;
}
1791
1792[[maybe_unused]] static bool verifyRegBankOnOperands(MachineInstr &MI,
1793 const RegisterBank *RB,
1794 MachineRegisterInfo &MRI,
1795 unsigned StartOpIdx,
1796 unsigned EndOpIdx) {
1797 for (unsigned i = StartOpIdx; i <= EndOpIdx; ++i) {
1798 if (MRI.getRegBankOrNull(Reg: MI.getOperand(i).getReg()) != RB)
1799 return false;
1800 }
1801 return true;
1802}
1803
1804void RegBankLegalizeHelper::applyMappingTrivial(MachineInstr &MI) {
1805 const RegisterBank *RB = MRI.getRegBank(Reg: MI.getOperand(i: 0).getReg());
1806 // Put RB on all registers
1807 unsigned NumDefs = MI.getNumDefs();
1808 unsigned NumOperands = MI.getNumOperands();
1809
1810 assert(verifyRegBankOnOperands(MI, RB, MRI, 0, NumDefs - 1));
1811 if (RB == SgprRB)
1812 assert(verifyRegBankOnOperands(MI, RB, MRI, NumDefs, NumOperands - 1));
1813
1814 if (RB == VgprRB) {
1815 B.setInstr(MI);
1816 for (unsigned i = NumDefs; i < NumOperands; ++i) {
1817 Register Reg = MI.getOperand(i).getReg();
1818 if (MRI.getRegBank(Reg) != RB) {
1819 auto Copy = B.buildCopy(Res: {VgprRB, MRI.getType(Reg)}, Op: Reg);
1820 MI.getOperand(i).setReg(Copy.getReg(Idx: 0));
1821 }
1822 }
1823 }
1824}
1825
// Assign register banks for an image rsrc intrinsic: defs and the vaddr/vdata
// operands before the resource descriptor go to vgpr; the resource (and, in
// some cases, sampler) descriptor operands must be sgpr. Divergent descriptor
// operands are made uniform by wrapping MI in a waterfall loop.
// Always returns true (the lowering itself cannot fail).
bool RegBankLegalizeHelper::applyRegisterBanksINTRIN_IMAGE(MachineInstr &MI) {
  const AMDGPU::RsrcIntrinsic *RSrcIntrin =
      AMDGPU::lookupRsrcIntrinsic(Intr: AMDGPU::getIntrinsicID(I: MI));
  assert(RSrcIntrin && RSrcIntrin->IsImage);

  unsigned RsrcIdx = RSrcIntrin->RsrcArg;
  const unsigned NumDefs = MI.getNumExplicitDefs();

  // The reported argument index is relative to the IR intrinsic call arguments,
  // so we need to shift by the number of defs and the intrinsic ID.
  RsrcIdx += NumDefs + 1;

  // Def fix-up copies must be emitted *after* MI so they read its results.
  MachineBasicBlock *MBB = MI.getParent();
  B.setInsertPt(MBB&: *MBB, II: MBB->SkipPHIsAndLabels(I: std::next(x: MI.getIterator())));

  // Defs(for image loads with return) are vgpr.
  for (unsigned i = 0; i < NumDefs; ++i) {
    const RegisterBank *RB = MRI.getRegBank(Reg: MI.getOperand(i).getReg());
    if (RB == VgprRB)
      continue;

    // Def was expected on a non-vgpr bank: let MI define a fresh vgpr and
    // lower the original def to a read from that vgpr after MI.
    Register Reg = MI.getOperand(i).getReg();
    Register NewVgprDst = MRI.createVirtualRegister(RegAttr: {.RCOrRB: VgprRB, .Ty: MRI.getType(Reg)});
    MI.getOperand(i).setReg(NewVgprDst);
    buildReadAnyLane(B, SgprDst: Reg, VgprSrc: NewVgprDst, RBI);
  }

  // Switch the insertion point back to before MI for the use copies below.
  B.setInstrAndDebugLoc(MI);

  // Register uses(before RsrcIdx) are vgpr.
  for (unsigned i = 1; i < RsrcIdx; ++i) {
    MachineOperand &Op = MI.getOperand(i);
    if (!Op.isReg())
      continue;

    Register Reg = Op.getReg();
    // Leave physical registers (e.g. implicit operands) untouched.
    if (!Reg.isVirtual())
      continue;

    if (MRI.getRegBank(Reg) == VgprRB)
      continue;

    auto Copy = B.buildCopy(Res: {VgprRB, MRI.getType(Reg)}, Op: Reg);
    Op.setReg(Copy.getReg(Idx: 0));
  }

  SmallSet<Register, 4> OpsToWaterfall;

  // Register use RsrcIdx(and RsrcIdx+1 in some cases) is sgpr.
  // Collect descriptor operands that are not uniform sgpr values yet; they
  // cannot simply be copied and need a waterfall loop instead.
  for (unsigned i = RsrcIdx; i < MI.getNumOperands(); ++i) {
    MachineOperand &Op = MI.getOperand(i);
    if (!Op.isReg())
      continue;

    Register Reg = Op.getReg();
    if (MRI.getRegBank(Reg) != SgprRB)
      OpsToWaterfall.insert(V: Reg);
  }

  if (!OpsToWaterfall.empty()) {
    // Waterfall range covers exactly MI (half-open [MI, next(MI))).
    MachineBasicBlock::iterator MII = MI.getIterator();
    executeInWaterfallLoop(B, WFI: {.SgprWaterfallOperandRegs: OpsToWaterfall, .Start: MII, .End: std::next(x: MII)});
  }

  return true;
}
1892