1//===-- AMDGPURegBankLegalize.cpp -----------------------------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9/// Lower G_ instructions that can't be inst-selected with register bank
10/// assignment from AMDGPURegBankSelect based on machine uniformity info.
11/// Given types on all operands, some register bank assignments require lowering
12/// while others do not.
13/// Note: cases where all register bank assignments would require lowering are
14/// lowered in legalizer.
15/// For example vgpr S64 G_AND requires lowering to S32 while sgpr S64 does not.
16/// Eliminate sgpr S1 by lowering to sgpr S32.
17//
18//===----------------------------------------------------------------------===//
19
20#include "AMDGPU.h"
21#include "AMDGPUGlobalISelUtils.h"
22#include "AMDGPURegBankLegalizeHelper.h"
23#include "GCNSubtarget.h"
24#include "llvm/CodeGen/GlobalISel/CSEInfo.h"
25#include "llvm/CodeGen/GlobalISel/CSEMIRBuilder.h"
26#include "llvm/CodeGen/MachineFunctionPass.h"
27#include "llvm/CodeGen/MachineUniformityAnalysis.h"
28#include "llvm/CodeGen/TargetPassConfig.h"
29#include "llvm/InitializePasses.h"
30
31#define DEBUG_TYPE "amdgpu-regbanklegalize"
32
33using namespace llvm;
34using namespace AMDGPU;
35
36namespace {
37
38class AMDGPURegBankLegalize : public MachineFunctionPass {
39public:
40 static char ID;
41
42public:
43 AMDGPURegBankLegalize() : MachineFunctionPass(ID) {}
44
45 bool runOnMachineFunction(MachineFunction &MF) override;
46
47 StringRef getPassName() const override {
48 return "AMDGPU Register Bank Legalize";
49 }
50
51 void getAnalysisUsage(AnalysisUsage &AU) const override {
52 AU.addRequired<TargetPassConfig>();
53 AU.addRequired<GISelCSEAnalysisWrapperPass>();
54 AU.addRequired<MachineUniformityAnalysisPass>();
55 MachineFunctionPass::getAnalysisUsage(AU);
56 }
57
58 // If there were no phis and we do waterfall expansion machine verifier would
59 // fail.
60 MachineFunctionProperties getClearedProperties() const override {
61 return MachineFunctionProperties().setNoPHIs();
62 }
63};
64
65} // End anonymous namespace.
66
// Pass registration. The pass is registered under DEBUG_TYPE
// ("amdgpu-regbanklegalize") with the display name "AMDGPU Register Bank
// Legalize". The three INITIALIZE_PASS_DEPENDENCY entries name the same
// analyses that getAnalysisUsage() marks as required.
// NOTE(review): the two trailing `false` arguments are presumably the
// cfg-only / is-analysis flags of the INITIALIZE_PASS macros - confirm against
// llvm/PassSupport.h.
67INITIALIZE_PASS_BEGIN(AMDGPURegBankLegalize, DEBUG_TYPE,
68 "AMDGPU Register Bank Legalize", false, false)
69INITIALIZE_PASS_DEPENDENCY(TargetPassConfig)
70INITIALIZE_PASS_DEPENDENCY(GISelCSEAnalysisWrapperPass)
71INITIALIZE_PASS_DEPENDENCY(MachineUniformityAnalysisPass)
72INITIALIZE_PASS_END(AMDGPURegBankLegalize, DEBUG_TYPE,
73 "AMDGPU Register Bank Legalize", false, false)
74
// Definition of the static ID member declared in the class; the constructor
// passes it to the MachineFunctionPass base.
75char AMDGPURegBankLegalize::ID = 0;
76
// Reference to the pass ID, defined in namespace llvm so it is reachable
// outside this file's anonymous namespace.
77char &llvm::AMDGPURegBankLegalizeID = AMDGPURegBankLegalize::ID;
78
79FunctionPass *llvm::createAMDGPURegBankLegalizePass() {
80 return new AMDGPURegBankLegalize();
81}
82
83const RegBankLegalizeRules &getRules(const GCNSubtarget &ST,
84 MachineRegisterInfo &MRI) {
85 static std::mutex GlobalMutex;
86 static SmallDenseMap<unsigned, std::unique_ptr<RegBankLegalizeRules>>
87 CacheForRuleSet;
88 std::lock_guard<std::mutex> Lock(GlobalMutex);
89 auto [It, Inserted] = CacheForRuleSet.try_emplace(Key: ST.getGeneration());
90 if (Inserted)
91 It->second = std::make_unique<RegBankLegalizeRules>(args: ST, args&: MRI);
92 else
93 It->second->refreshRefs(ST: ST, MRI&: MRI);
94 return *It->second;
95}
96
97class AMDGPURegBankLegalizeCombiner {
98 MachineIRBuilder &B;
99 MachineRegisterInfo &MRI;
100 const SIRegisterInfo &TRI;
101 const RegisterBank *SgprRB;
102 const RegisterBank *VgprRB;
103 const RegisterBank *VccRB;
104
105 static constexpr LLT S1 = LLT::scalar(SizeInBits: 1);
106 static constexpr LLT S16 = LLT::scalar(SizeInBits: 16);
107 static constexpr LLT S32 = LLT::scalar(SizeInBits: 32);
108 static constexpr LLT S64 = LLT::scalar(SizeInBits: 64);
109
110public:
111 AMDGPURegBankLegalizeCombiner(MachineIRBuilder &B, const SIRegisterInfo &TRI,
112 const RegisterBankInfo &RBI)
113 : B(B), MRI(*B.getMRI()), TRI(TRI),
114 SgprRB(&RBI.getRegBank(ID: AMDGPU::SGPRRegBankID)),
115 VgprRB(&RBI.getRegBank(ID: AMDGPU::VGPRRegBankID)),
116 VccRB(&RBI.getRegBank(ID: AMDGPU::VCCRegBankID)) {};
117
118 bool isLaneMask(Register Reg) {
119 const RegisterBank *RB = MRI.getRegBankOrNull(Reg);
120 if (RB && RB->getID() == AMDGPU::VCCRegBankID)
121 return true;
122
123 const TargetRegisterClass *RC = MRI.getRegClassOrNull(Reg);
124 return RC && TRI.isSGPRClass(RC) && MRI.getType(Reg) == LLT::scalar(SizeInBits: 1);
125 }
126
127 void cleanUpAfterCombine(MachineInstr &MI, MachineInstr *Optional0) {
128 MI.eraseFromParent();
129 if (Optional0 && isTriviallyDead(MI: *Optional0, MRI))
130 Optional0->eraseFromParent();
131 }
132
133 std::pair<MachineInstr *, Register> tryMatch(Register Src, unsigned Opcode) {
134 MachineInstr *MatchMI = MRI.getVRegDef(Reg: Src);
135 if (MatchMI->getOpcode() != Opcode)
136 return {nullptr, Register()};
137 return {MatchMI, MatchMI->getOperand(i: 1).getReg()};
138 }
139
140 void tryCombineCopy(MachineInstr &MI) {
141 Register Dst = MI.getOperand(i: 0).getReg();
142 Register Src = MI.getOperand(i: 1).getReg();
143 // Skip copies of physical registers.
144 if (!Dst.isVirtual() || !Src.isVirtual())
145 return;
146
147 // This is a cross bank copy, sgpr S1 to lane mask.
148 //
149 // %Src:sgpr(s1) = G_TRUNC %TruncS32Src:sgpr(s32)
150 // %Dst:lane-mask(s1) = COPY %Src:sgpr(s1)
151 // ->
152 // %Dst:lane-mask(s1) = G_AMDGPU_COPY_VCC_SCC %TruncS32Src:sgpr(s32)
153 if (isLaneMask(Reg: Dst) && MRI.getRegBankOrNull(Reg: Src) == SgprRB) {
154 auto [Trunc, TruncS32Src] = tryMatch(Src, Opcode: AMDGPU::G_TRUNC);
155 assert(Trunc && MRI.getType(TruncS32Src) == S32 &&
156 "sgpr S1 must be result of G_TRUNC of sgpr S32");
157
158 B.setInstr(MI);
159 // Ensure that truncated bits in BoolSrc are 0.
160 auto One = B.buildConstant(Res: {SgprRB, S32}, Val: 1);
161 auto BoolSrc = B.buildAnd(Dst: {SgprRB, S32}, Src0: TruncS32Src, Src1: One);
162 B.buildInstr(Opc: AMDGPU::G_AMDGPU_COPY_VCC_SCC, DstOps: {Dst}, SrcOps: {BoolSrc});
163 cleanUpAfterCombine(MI, Optional0: Trunc);
164 return;
165 }
166
167 // Src = G_AMDGPU_READANYLANE RALSrc
168 // Dst = COPY Src
169 // ->
170 // Dst = RALSrc
171 if (MRI.getRegBankOrNull(Reg: Dst) == VgprRB &&
172 MRI.getRegBankOrNull(Reg: Src) == SgprRB) {
173 auto [RAL, RALSrc] = tryMatch(Src, Opcode: AMDGPU::G_AMDGPU_READANYLANE);
174 if (!RAL)
175 return;
176
177 assert(MRI.getRegBank(RALSrc) == VgprRB);
178 MRI.replaceRegWith(FromReg: Dst, ToReg: RALSrc);
179 cleanUpAfterCombine(MI, Optional0: RAL);
180 return;
181 }
182 }
183
184 void tryCombineS1AnyExt(MachineInstr &MI) {
185 // %Src:sgpr(S1) = G_TRUNC %TruncSrc
186 // %Dst = G_ANYEXT %Src:sgpr(S1)
187 // ->
188 // %Dst = G_... %TruncSrc
189 Register Dst = MI.getOperand(i: 0).getReg();
190 Register Src = MI.getOperand(i: 1).getReg();
191 if (MRI.getType(Reg: Src) != S1)
192 return;
193
194 auto [Trunc, TruncSrc] = tryMatch(Src, Opcode: AMDGPU::G_TRUNC);
195 if (!Trunc)
196 return;
197
198 LLT DstTy = MRI.getType(Reg: Dst);
199 LLT TruncSrcTy = MRI.getType(Reg: TruncSrc);
200
201 if (DstTy == TruncSrcTy) {
202 MRI.replaceRegWith(FromReg: Dst, ToReg: TruncSrc);
203 cleanUpAfterCombine(MI, Optional0: Trunc);
204 return;
205 }
206
207 B.setInstr(MI);
208
209 if (DstTy == S32 && TruncSrcTy == S64) {
210 auto Unmerge = B.buildUnmerge(Attrs: {.RCOrRB: SgprRB, .Ty: S32}, Op: TruncSrc);
211 MRI.replaceRegWith(FromReg: Dst, ToReg: Unmerge.getReg(Idx: 0));
212 cleanUpAfterCombine(MI, Optional0: Trunc);
213 return;
214 }
215
216 if (DstTy == S64 && TruncSrcTy == S32) {
217 B.buildMergeLikeInstr(Res: MI.getOperand(i: 0).getReg(),
218 Ops: {TruncSrc, B.buildUndef(Res: {SgprRB, S32})});
219 cleanUpAfterCombine(MI, Optional0: Trunc);
220 return;
221 }
222
223 if (DstTy == S32 && TruncSrcTy == S16) {
224 B.buildAnyExt(Res: Dst, Op: TruncSrc);
225 cleanUpAfterCombine(MI, Optional0: Trunc);
226 return;
227 }
228
229 if (DstTy == S16 && TruncSrcTy == S32) {
230 B.buildTrunc(Res: Dst, Op: TruncSrc);
231 cleanUpAfterCombine(MI, Optional0: Trunc);
232 return;
233 }
234
235 llvm_unreachable("missing anyext + trunc combine");
236 }
237};
238
239// Search through MRI for virtual registers with sgpr register bank and S1 LLT.
240[[maybe_unused]] static Register getAnySgprS1(const MachineRegisterInfo &MRI) {
241 const LLT S1 = LLT::scalar(SizeInBits: 1);
242 for (unsigned i = 0; i < MRI.getNumVirtRegs(); ++i) {
243 Register Reg = Register::index2VirtReg(Index: i);
244 if (MRI.def_empty(RegNo: Reg) || MRI.getType(Reg) != S1)
245 continue;
246
247 const RegisterBank *RB = MRI.getRegBankOrNull(Reg);
248 if (RB && RB->getID() == AMDGPU::SGPRRegBankID) {
249 LLVM_DEBUG(dbgs() << "Warning: detected sgpr S1 register in: ";
250 MRI.getVRegDef(Reg)->dump(););
251 return Reg;
252 }
253 }
254
255 return {};
256}
257
258bool AMDGPURegBankLegalize::runOnMachineFunction(MachineFunction &MF) {
259 if (MF.getProperties().hasFailedISel())
260 return false;
261
262 // Setup the instruction builder with CSE.
263 const TargetPassConfig &TPC = getAnalysis<TargetPassConfig>();
264 GISelCSEAnalysisWrapper &Wrapper =
265 getAnalysis<GISelCSEAnalysisWrapperPass>().getCSEWrapper();
266 GISelCSEInfo &CSEInfo = Wrapper.get(CSEOpt: TPC.getCSEConfig());
267 GISelObserverWrapper Observer;
268 Observer.addObserver(O: &CSEInfo);
269
270 CSEMIRBuilder B(MF);
271 B.setCSEInfo(&CSEInfo);
272 B.setChangeObserver(Observer);
273
274 RAIIDelegateInstaller DelegateInstaller(MF, &Observer);
275 RAIIMFObserverInstaller MFObserverInstaller(MF, Observer);
276
277 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
278 MachineRegisterInfo &MRI = MF.getRegInfo();
279 const RegisterBankInfo &RBI = *ST.getRegBankInfo();
280 const MachineUniformityInfo &MUI =
281 getAnalysis<MachineUniformityAnalysisPass>().getUniformityInfo();
282
283 // RegBankLegalizeRules is initialized with assigning sets of IDs to opcodes.
284 const RegBankLegalizeRules &RBLRules = getRules(ST, MRI);
285
286 // Logic that does legalization based on IDs assigned to Opcode.
287 RegBankLegalizeHelper RBLHelper(B, MUI, RBI, RBLRules);
288
289 SmallVector<MachineInstr *> AllInst;
290
291 for (MachineBasicBlock &MBB : MF) {
292 for (MachineInstr &MI : MBB) {
293 AllInst.push_back(Elt: &MI);
294 }
295 }
296
297 for (MachineInstr *MI : AllInst) {
298 if (!MI->isPreISelOpcode())
299 continue;
300
301 unsigned Opc = MI->getOpcode();
302 // Insert point for use operands needs some calculation.
303 if (Opc == AMDGPU::G_PHI) {
304 RBLHelper.applyMappingPHI(MI&: *MI);
305 continue;
306 }
307
308 // Opcodes that support pretty much all combinations of reg banks and LLTs
309 // (except S1). There is no point in writing rules for them.
310 if (Opc == AMDGPU::G_BUILD_VECTOR || Opc == AMDGPU::G_UNMERGE_VALUES ||
311 Opc == AMDGPU::G_MERGE_VALUES || Opc == AMDGPU::G_BITCAST) {
312 RBLHelper.applyMappingTrivial(MI&: *MI);
313 continue;
314 }
315
316 // Opcodes that also support S1.
317 if (Opc == G_FREEZE &&
318 MRI.getType(Reg: MI->getOperand(i: 0).getReg()) != LLT::scalar(SizeInBits: 1)) {
319 RBLHelper.applyMappingTrivial(MI&: *MI);
320 continue;
321 }
322
323 if ((Opc == AMDGPU::G_CONSTANT || Opc == AMDGPU::G_FCONSTANT ||
324 Opc == AMDGPU::G_IMPLICIT_DEF)) {
325 Register Dst = MI->getOperand(i: 0).getReg();
326 // Non S1 types are trivially accepted.
327 if (MRI.getType(Reg: Dst) != LLT::scalar(SizeInBits: 1)) {
328 assert(MRI.getRegBank(Dst)->getID() == AMDGPU::SGPRRegBankID);
329 continue;
330 }
331
332 // S1 rules are in RegBankLegalizeRules.
333 }
334
335 RBLHelper.findRuleAndApplyMapping(MI&: *MI);
336 }
337
338 // Sgpr S1 clean up combines:
339 // - Sgpr S1(S32) to sgpr S1(S32) Copy: anyext + trunc combine.
340 // In RegBankLegalize 'S1 Dst' are legalized into S32 as
341 // 'S1Dst = Trunc S32Dst' and 'S1 Src' into 'S32Src = Anyext S1Src'.
342 // S1 Truncs and Anyexts that come from legalizer, that can have non-S32
343 // types e.g. S16 = Anyext S1 or S1 = Trunc S64, will also be cleaned up.
344 // - Sgpr S1(S32) to vcc Copy: G_AMDGPU_COPY_VCC_SCC combine.
345 // Divergent instruction uses sgpr S1 as input that should be lane mask(vcc)
346 // Legalizing this use creates sgpr S1(S32) to vcc Copy.
347
348 // Note: Remaining S1 copies, S1s are either sgpr S1(S32) or vcc S1:
349 // - Vcc to vcc Copy: nothing to do here, just a regular copy.
350 // - Vcc to sgpr S1 Copy: Should not exist in a form of COPY instruction(*).
351 // Note: For 'uniform-in-vcc to sgpr-S1 copy' G_AMDGPU_COPY_SCC_VCC is used
352 // instead. When only available instruction creates vcc result, use of
353 // UniformInVcc results in creating G_AMDGPU_COPY_SCC_VCC.
354
355 // (*)Explanation for 'sgpr S1(uniform) = COPY vcc(divergent)':
356 // Copy from divergent to uniform register indicates an error in either:
357 // - Uniformity analysis: Uniform instruction has divergent input. If one of
358 // the inputs is divergent, instruction should be divergent!
359 // - RegBankLegalizer not executing in waterfall loop (missing implementation)
360
361 AMDGPURegBankLegalizeCombiner Combiner(B, *ST.getRegisterInfo(), RBI);
362
363 for (MachineBasicBlock &MBB : MF) {
364 for (MachineInstr &MI : make_early_inc_range(Range&: MBB)) {
365 if (MI.getOpcode() == AMDGPU::COPY) {
366 Combiner.tryCombineCopy(MI);
367 continue;
368 }
369 if (MI.getOpcode() == AMDGPU::G_ANYEXT) {
370 Combiner.tryCombineS1AnyExt(MI);
371 continue;
372 }
373 }
374 }
375
376 assert(!getAnySgprS1(MRI).isValid() &&
377 "Registers with sgpr reg bank and S1 LLT are not legal after "
378 "AMDGPURegBankLegalize. Should lower to sgpr S32");
379
380 return true;
381}
382