1 | //===-- AMDGPURegBankLegalize.cpp -----------------------------------------===// |
2 | // |
3 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
4 | // See https://llvm.org/LICENSE.txt for license information. |
5 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
6 | // |
7 | //===----------------------------------------------------------------------===// |
8 | // |
9 | /// Lower G_ instructions that can't be inst-selected with register bank |
10 | /// assignment from AMDGPURegBankSelect based on machine uniformity info. |
11 | /// Given types on all operands, some register bank assignments require lowering |
12 | /// while others do not. |
13 | /// Note: cases where all register bank assignments would require lowering are |
14 | /// lowered in legalizer. |
15 | /// For example vgpr S64 G_AND requires lowering to S32 while sgpr S64 does not. |
16 | /// Eliminate sgpr S1 by lowering to sgpr S32. |
17 | // |
18 | //===----------------------------------------------------------------------===// |
19 | |
20 | #include "AMDGPU.h" |
21 | #include "AMDGPUGlobalISelUtils.h" |
22 | #include "AMDGPURegBankLegalizeHelper.h" |
23 | #include "GCNSubtarget.h" |
24 | #include "llvm/CodeGen/GlobalISel/CSEInfo.h" |
25 | #include "llvm/CodeGen/GlobalISel/CSEMIRBuilder.h" |
26 | #include "llvm/CodeGen/MachineFunctionPass.h" |
27 | #include "llvm/CodeGen/MachineUniformityAnalysis.h" |
28 | #include "llvm/CodeGen/TargetPassConfig.h" |
29 | #include "llvm/InitializePasses.h" |
30 | |
31 | #define DEBUG_TYPE "amdgpu-regbanklegalize" |
32 | |
33 | using namespace llvm; |
34 | using namespace AMDGPU; |
35 | |
36 | namespace { |
37 | |
38 | class AMDGPURegBankLegalize : public MachineFunctionPass { |
39 | public: |
40 | static char ID; |
41 | |
42 | public: |
43 | AMDGPURegBankLegalize() : MachineFunctionPass(ID) {} |
44 | |
45 | bool runOnMachineFunction(MachineFunction &MF) override; |
46 | |
47 | StringRef getPassName() const override { |
48 | return "AMDGPU Register Bank Legalize" ; |
49 | } |
50 | |
51 | void getAnalysisUsage(AnalysisUsage &AU) const override { |
52 | AU.addRequired<TargetPassConfig>(); |
53 | AU.addRequired<GISelCSEAnalysisWrapperPass>(); |
54 | AU.addRequired<MachineUniformityAnalysisPass>(); |
55 | MachineFunctionPass::getAnalysisUsage(AU); |
56 | } |
57 | |
58 | // If there were no phis and we do waterfall expansion machine verifier would |
59 | // fail. |
60 | MachineFunctionProperties getClearedProperties() const override { |
61 | return MachineFunctionProperties().setNoPHIs(); |
62 | } |
63 | }; |
64 | |
65 | } // End anonymous namespace. |
66 | |
67 | INITIALIZE_PASS_BEGIN(AMDGPURegBankLegalize, DEBUG_TYPE, |
68 | "AMDGPU Register Bank Legalize" , false, false) |
69 | INITIALIZE_PASS_DEPENDENCY(TargetPassConfig) |
70 | INITIALIZE_PASS_DEPENDENCY(GISelCSEAnalysisWrapperPass) |
71 | INITIALIZE_PASS_DEPENDENCY(MachineUniformityAnalysisPass) |
72 | INITIALIZE_PASS_END(AMDGPURegBankLegalize, DEBUG_TYPE, |
73 | "AMDGPU Register Bank Legalize" , false, false) |
74 | |
75 | char AMDGPURegBankLegalize::ID = 0; |
76 | |
77 | char &llvm::AMDGPURegBankLegalizeID = AMDGPURegBankLegalize::ID; |
78 | |
79 | FunctionPass *llvm::createAMDGPURegBankLegalizePass() { |
80 | return new AMDGPURegBankLegalize(); |
81 | } |
82 | |
83 | const RegBankLegalizeRules &getRules(const GCNSubtarget &ST, |
84 | MachineRegisterInfo &MRI) { |
85 | static std::mutex GlobalMutex; |
86 | static SmallDenseMap<unsigned, std::unique_ptr<RegBankLegalizeRules>> |
87 | CacheForRuleSet; |
88 | std::lock_guard<std::mutex> Lock(GlobalMutex); |
89 | auto [It, Inserted] = CacheForRuleSet.try_emplace(Key: ST.getGeneration()); |
90 | if (Inserted) |
91 | It->second = std::make_unique<RegBankLegalizeRules>(args: ST, args&: MRI); |
92 | else |
93 | It->second->refreshRefs(ST: ST, MRI&: MRI); |
94 | return *It->second; |
95 | } |
96 | |
97 | class AMDGPURegBankLegalizeCombiner { |
98 | MachineIRBuilder &B; |
99 | MachineRegisterInfo &MRI; |
100 | const SIRegisterInfo &TRI; |
101 | const RegisterBank *SgprRB; |
102 | const RegisterBank *VgprRB; |
103 | const RegisterBank *VccRB; |
104 | |
105 | static constexpr LLT S1 = LLT::scalar(SizeInBits: 1); |
106 | static constexpr LLT S16 = LLT::scalar(SizeInBits: 16); |
107 | static constexpr LLT S32 = LLT::scalar(SizeInBits: 32); |
108 | static constexpr LLT S64 = LLT::scalar(SizeInBits: 64); |
109 | |
110 | public: |
111 | AMDGPURegBankLegalizeCombiner(MachineIRBuilder &B, const SIRegisterInfo &TRI, |
112 | const RegisterBankInfo &RBI) |
113 | : B(B), MRI(*B.getMRI()), TRI(TRI), |
114 | SgprRB(&RBI.getRegBank(ID: AMDGPU::SGPRRegBankID)), |
115 | VgprRB(&RBI.getRegBank(ID: AMDGPU::VGPRRegBankID)), |
116 | VccRB(&RBI.getRegBank(ID: AMDGPU::VCCRegBankID)) {}; |
117 | |
118 | bool isLaneMask(Register Reg) { |
119 | const RegisterBank *RB = MRI.getRegBankOrNull(Reg); |
120 | if (RB && RB->getID() == AMDGPU::VCCRegBankID) |
121 | return true; |
122 | |
123 | const TargetRegisterClass *RC = MRI.getRegClassOrNull(Reg); |
124 | return RC && TRI.isSGPRClass(RC) && MRI.getType(Reg) == LLT::scalar(SizeInBits: 1); |
125 | } |
126 | |
127 | void cleanUpAfterCombine(MachineInstr &MI, MachineInstr *Optional0) { |
128 | MI.eraseFromParent(); |
129 | if (Optional0 && isTriviallyDead(MI: *Optional0, MRI)) |
130 | Optional0->eraseFromParent(); |
131 | } |
132 | |
133 | std::pair<MachineInstr *, Register> tryMatch(Register Src, unsigned Opcode) { |
134 | MachineInstr *MatchMI = MRI.getVRegDef(Reg: Src); |
135 | if (MatchMI->getOpcode() != Opcode) |
136 | return {nullptr, Register()}; |
137 | return {MatchMI, MatchMI->getOperand(i: 1).getReg()}; |
138 | } |
139 | |
140 | void tryCombineCopy(MachineInstr &MI) { |
141 | Register Dst = MI.getOperand(i: 0).getReg(); |
142 | Register Src = MI.getOperand(i: 1).getReg(); |
143 | // Skip copies of physical registers. |
144 | if (!Dst.isVirtual() || !Src.isVirtual()) |
145 | return; |
146 | |
147 | // This is a cross bank copy, sgpr S1 to lane mask. |
148 | // |
149 | // %Src:sgpr(s1) = G_TRUNC %TruncS32Src:sgpr(s32) |
150 | // %Dst:lane-mask(s1) = COPY %Src:sgpr(s1) |
151 | // -> |
152 | // %Dst:lane-mask(s1) = G_AMDGPU_COPY_VCC_SCC %TruncS32Src:sgpr(s32) |
153 | if (isLaneMask(Reg: Dst) && MRI.getRegBankOrNull(Reg: Src) == SgprRB) { |
154 | auto [Trunc, TruncS32Src] = tryMatch(Src, Opcode: AMDGPU::G_TRUNC); |
155 | assert(Trunc && MRI.getType(TruncS32Src) == S32 && |
156 | "sgpr S1 must be result of G_TRUNC of sgpr S32" ); |
157 | |
158 | B.setInstr(MI); |
159 | // Ensure that truncated bits in BoolSrc are 0. |
160 | auto One = B.buildConstant(Res: {SgprRB, S32}, Val: 1); |
161 | auto BoolSrc = B.buildAnd(Dst: {SgprRB, S32}, Src0: TruncS32Src, Src1: One); |
162 | B.buildInstr(Opc: AMDGPU::G_AMDGPU_COPY_VCC_SCC, DstOps: {Dst}, SrcOps: {BoolSrc}); |
163 | cleanUpAfterCombine(MI, Optional0: Trunc); |
164 | return; |
165 | } |
166 | |
167 | // Src = G_AMDGPU_READANYLANE RALSrc |
168 | // Dst = COPY Src |
169 | // -> |
170 | // Dst = RALSrc |
171 | if (MRI.getRegBankOrNull(Reg: Dst) == VgprRB && |
172 | MRI.getRegBankOrNull(Reg: Src) == SgprRB) { |
173 | auto [RAL, RALSrc] = tryMatch(Src, Opcode: AMDGPU::G_AMDGPU_READANYLANE); |
174 | if (!RAL) |
175 | return; |
176 | |
177 | assert(MRI.getRegBank(RALSrc) == VgprRB); |
178 | MRI.replaceRegWith(FromReg: Dst, ToReg: RALSrc); |
179 | cleanUpAfterCombine(MI, Optional0: RAL); |
180 | return; |
181 | } |
182 | } |
183 | |
184 | void tryCombineS1AnyExt(MachineInstr &MI) { |
185 | // %Src:sgpr(S1) = G_TRUNC %TruncSrc |
186 | // %Dst = G_ANYEXT %Src:sgpr(S1) |
187 | // -> |
188 | // %Dst = G_... %TruncSrc |
189 | Register Dst = MI.getOperand(i: 0).getReg(); |
190 | Register Src = MI.getOperand(i: 1).getReg(); |
191 | if (MRI.getType(Reg: Src) != S1) |
192 | return; |
193 | |
194 | auto [Trunc, TruncSrc] = tryMatch(Src, Opcode: AMDGPU::G_TRUNC); |
195 | if (!Trunc) |
196 | return; |
197 | |
198 | LLT DstTy = MRI.getType(Reg: Dst); |
199 | LLT TruncSrcTy = MRI.getType(Reg: TruncSrc); |
200 | |
201 | if (DstTy == TruncSrcTy) { |
202 | MRI.replaceRegWith(FromReg: Dst, ToReg: TruncSrc); |
203 | cleanUpAfterCombine(MI, Optional0: Trunc); |
204 | return; |
205 | } |
206 | |
207 | B.setInstr(MI); |
208 | |
209 | if (DstTy == S32 && TruncSrcTy == S64) { |
210 | auto Unmerge = B.buildUnmerge(Attrs: {.RCOrRB: SgprRB, .Ty: S32}, Op: TruncSrc); |
211 | MRI.replaceRegWith(FromReg: Dst, ToReg: Unmerge.getReg(Idx: 0)); |
212 | cleanUpAfterCombine(MI, Optional0: Trunc); |
213 | return; |
214 | } |
215 | |
216 | if (DstTy == S64 && TruncSrcTy == S32) { |
217 | B.buildMergeLikeInstr(Res: MI.getOperand(i: 0).getReg(), |
218 | Ops: {TruncSrc, B.buildUndef(Res: {SgprRB, S32})}); |
219 | cleanUpAfterCombine(MI, Optional0: Trunc); |
220 | return; |
221 | } |
222 | |
223 | if (DstTy == S32 && TruncSrcTy == S16) { |
224 | B.buildAnyExt(Res: Dst, Op: TruncSrc); |
225 | cleanUpAfterCombine(MI, Optional0: Trunc); |
226 | return; |
227 | } |
228 | |
229 | if (DstTy == S16 && TruncSrcTy == S32) { |
230 | B.buildTrunc(Res: Dst, Op: TruncSrc); |
231 | cleanUpAfterCombine(MI, Optional0: Trunc); |
232 | return; |
233 | } |
234 | |
235 | llvm_unreachable("missing anyext + trunc combine" ); |
236 | } |
237 | }; |
238 | |
239 | // Search through MRI for virtual registers with sgpr register bank and S1 LLT. |
240 | [[maybe_unused]] static Register getAnySgprS1(const MachineRegisterInfo &MRI) { |
241 | const LLT S1 = LLT::scalar(SizeInBits: 1); |
242 | for (unsigned i = 0; i < MRI.getNumVirtRegs(); ++i) { |
243 | Register Reg = Register::index2VirtReg(Index: i); |
244 | if (MRI.def_empty(RegNo: Reg) || MRI.getType(Reg) != S1) |
245 | continue; |
246 | |
247 | const RegisterBank *RB = MRI.getRegBankOrNull(Reg); |
248 | if (RB && RB->getID() == AMDGPU::SGPRRegBankID) { |
249 | LLVM_DEBUG(dbgs() << "Warning: detected sgpr S1 register in: " ; |
250 | MRI.getVRegDef(Reg)->dump();); |
251 | return Reg; |
252 | } |
253 | } |
254 | |
255 | return {}; |
256 | } |
257 | |
258 | bool AMDGPURegBankLegalize::runOnMachineFunction(MachineFunction &MF) { |
259 | if (MF.getProperties().hasFailedISel()) |
260 | return false; |
261 | |
262 | // Setup the instruction builder with CSE. |
263 | const TargetPassConfig &TPC = getAnalysis<TargetPassConfig>(); |
264 | GISelCSEAnalysisWrapper &Wrapper = |
265 | getAnalysis<GISelCSEAnalysisWrapperPass>().getCSEWrapper(); |
266 | GISelCSEInfo &CSEInfo = Wrapper.get(CSEOpt: TPC.getCSEConfig()); |
267 | GISelObserverWrapper Observer; |
268 | Observer.addObserver(O: &CSEInfo); |
269 | |
270 | CSEMIRBuilder B(MF); |
271 | B.setCSEInfo(&CSEInfo); |
272 | B.setChangeObserver(Observer); |
273 | |
274 | RAIIDelegateInstaller DelegateInstaller(MF, &Observer); |
275 | RAIIMFObserverInstaller MFObserverInstaller(MF, Observer); |
276 | |
277 | const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); |
278 | MachineRegisterInfo &MRI = MF.getRegInfo(); |
279 | const RegisterBankInfo &RBI = *ST.getRegBankInfo(); |
280 | const MachineUniformityInfo &MUI = |
281 | getAnalysis<MachineUniformityAnalysisPass>().getUniformityInfo(); |
282 | |
283 | // RegBankLegalizeRules is initialized with assigning sets of IDs to opcodes. |
284 | const RegBankLegalizeRules &RBLRules = getRules(ST, MRI); |
285 | |
286 | // Logic that does legalization based on IDs assigned to Opcode. |
287 | RegBankLegalizeHelper RBLHelper(B, MUI, RBI, RBLRules); |
288 | |
289 | SmallVector<MachineInstr *> AllInst; |
290 | |
291 | for (MachineBasicBlock &MBB : MF) { |
292 | for (MachineInstr &MI : MBB) { |
293 | AllInst.push_back(Elt: &MI); |
294 | } |
295 | } |
296 | |
297 | for (MachineInstr *MI : AllInst) { |
298 | if (!MI->isPreISelOpcode()) |
299 | continue; |
300 | |
301 | unsigned Opc = MI->getOpcode(); |
302 | // Insert point for use operands needs some calculation. |
303 | if (Opc == AMDGPU::G_PHI) { |
304 | RBLHelper.applyMappingPHI(MI&: *MI); |
305 | continue; |
306 | } |
307 | |
308 | // Opcodes that support pretty much all combinations of reg banks and LLTs |
309 | // (except S1). There is no point in writing rules for them. |
310 | if (Opc == AMDGPU::G_BUILD_VECTOR || Opc == AMDGPU::G_UNMERGE_VALUES || |
311 | Opc == AMDGPU::G_MERGE_VALUES || Opc == AMDGPU::G_BITCAST) { |
312 | RBLHelper.applyMappingTrivial(MI&: *MI); |
313 | continue; |
314 | } |
315 | |
316 | // Opcodes that also support S1. |
317 | if (Opc == G_FREEZE && |
318 | MRI.getType(Reg: MI->getOperand(i: 0).getReg()) != LLT::scalar(SizeInBits: 1)) { |
319 | RBLHelper.applyMappingTrivial(MI&: *MI); |
320 | continue; |
321 | } |
322 | |
323 | if ((Opc == AMDGPU::G_CONSTANT || Opc == AMDGPU::G_FCONSTANT || |
324 | Opc == AMDGPU::G_IMPLICIT_DEF)) { |
325 | Register Dst = MI->getOperand(i: 0).getReg(); |
326 | // Non S1 types are trivially accepted. |
327 | if (MRI.getType(Reg: Dst) != LLT::scalar(SizeInBits: 1)) { |
328 | assert(MRI.getRegBank(Dst)->getID() == AMDGPU::SGPRRegBankID); |
329 | continue; |
330 | } |
331 | |
332 | // S1 rules are in RegBankLegalizeRules. |
333 | } |
334 | |
335 | RBLHelper.findRuleAndApplyMapping(MI&: *MI); |
336 | } |
337 | |
338 | // Sgpr S1 clean up combines: |
339 | // - Sgpr S1(S32) to sgpr S1(S32) Copy: anyext + trunc combine. |
340 | // In RegBankLegalize 'S1 Dst' are legalized into S32 as |
341 | // 'S1Dst = Trunc S32Dst' and 'S1 Src' into 'S32Src = Anyext S1Src'. |
342 | // S1 Truncs and Anyexts that come from legalizer, that can have non-S32 |
343 | // types e.g. S16 = Anyext S1 or S1 = Trunc S64, will also be cleaned up. |
344 | // - Sgpr S1(S32) to vcc Copy: G_AMDGPU_COPY_VCC_SCC combine. |
345 | // Divergent instruction uses sgpr S1 as input that should be lane mask(vcc) |
346 | // Legalizing this use creates sgpr S1(S32) to vcc Copy. |
347 | |
348 | // Note: Remaining S1 copies, S1s are either sgpr S1(S32) or vcc S1: |
349 | // - Vcc to vcc Copy: nothing to do here, just a regular copy. |
350 | // - Vcc to sgpr S1 Copy: Should not exist in a form of COPY instruction(*). |
351 | // Note: For 'uniform-in-vcc to sgpr-S1 copy' G_AMDGPU_COPY_SCC_VCC is used |
352 | // instead. When only available instruction creates vcc result, use of |
353 | // UniformInVcc results in creating G_AMDGPU_COPY_SCC_VCC. |
354 | |
355 | // (*)Explanation for 'sgpr S1(uniform) = COPY vcc(divergent)': |
356 | // Copy from divergent to uniform register indicates an error in either: |
357 | // - Uniformity analysis: Uniform instruction has divergent input. If one of |
358 | // the inputs is divergent, instruction should be divergent! |
359 | // - RegBankLegalizer not executing in waterfall loop (missing implementation) |
360 | |
361 | AMDGPURegBankLegalizeCombiner Combiner(B, *ST.getRegisterInfo(), RBI); |
362 | |
363 | for (MachineBasicBlock &MBB : MF) { |
364 | for (MachineInstr &MI : make_early_inc_range(Range&: MBB)) { |
365 | if (MI.getOpcode() == AMDGPU::COPY) { |
366 | Combiner.tryCombineCopy(MI); |
367 | continue; |
368 | } |
369 | if (MI.getOpcode() == AMDGPU::G_ANYEXT) { |
370 | Combiner.tryCombineS1AnyExt(MI); |
371 | continue; |
372 | } |
373 | } |
374 | } |
375 | |
376 | assert(!getAnySgprS1(MRI).isValid() && |
377 | "Registers with sgpr reg bank and S1 LLT are not legal after " |
378 | "AMDGPURegBankLegalize. Should lower to sgpr S32" ); |
379 | |
380 | return true; |
381 | } |
382 | |