1//===-- GCNPreRAOptimizations.cpp -----------------------------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9/// \file
10/// This pass combines split register tuple initialization into a single pseudo:
11///
12/// undef %0.sub1:sreg_64 = S_MOV_B32 1
13/// %0.sub0:sreg_64 = S_MOV_B32 2
14/// =>
15/// %0:sreg_64 = S_MOV_B64_IMM_PSEUDO 0x200000001
16///
17/// This is to allow rematerialization of a value instead of spilling. It is
18/// supposed to be done after register coalescer to allow it to do its job and
19/// before actual register allocation to allow rematerialization.
20///
21/// Right now the pass only handles 64 bit SGPRs with immediate initializers,
22/// although the same shall be possible with other register classes and
23/// instructions if necessary.
24///
25/// This pass also adds register allocation hints to COPY.
26/// The hints will be post-processed by SIRegisterInfo::getRegAllocationHints.
27/// When using True16, we often see COPY moving a 16-bit value between a VGPR_32
28/// and a VGPR_16. If we use the VGPR_16 that corresponds to the lo16 bits of
29/// the VGPR_32, the COPY can be completely eliminated.
30///
31//===----------------------------------------------------------------------===//
32
33#include "GCNPreRAOptimizations.h"
34#include "AMDGPU.h"
35#include "GCNSubtarget.h"
36#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
37#include "SIRegisterInfo.h"
38#include "llvm/CodeGen/LiveIntervals.h"
39#include "llvm/CodeGen/MachineFunctionPass.h"
40#include "llvm/InitializePasses.h"
41
42using namespace llvm;
43
44#define DEBUG_TYPE "amdgpu-pre-ra-optimizations"
45
46namespace {
47
48class GCNPreRAOptimizationsImpl {
49private:
50 const SIInstrInfo *TII;
51 const SIRegisterInfo *TRI;
52 MachineRegisterInfo *MRI;
53 LiveIntervals *LIS;
54
55 bool processReg(Register Reg);
56 void hintTrue16Copy(const MachineInstr &MI);
57 bool optimizeBVHStack(MachineInstr &MI);
58
59public:
60 GCNPreRAOptimizationsImpl(LiveIntervals *LS) : LIS(LS) {}
61 bool run(MachineFunction &MF);
62};
63
64class GCNPreRAOptimizationsLegacy : public MachineFunctionPass {
65public:
66 static char ID;
67
68 GCNPreRAOptimizationsLegacy() : MachineFunctionPass(ID) {}
69
70 bool runOnMachineFunction(MachineFunction &MF) override;
71
72 StringRef getPassName() const override {
73 return "AMDGPU Pre-RA optimizations";
74 }
75
76 void getAnalysisUsage(AnalysisUsage &AU) const override {
77 AU.addRequired<LiveIntervalsWrapperPass>();
78 AU.setPreservesAll();
79 MachineFunctionPass::getAnalysisUsage(AU);
80 }
81};
82} // End anonymous namespace.
83
84INITIALIZE_PASS_BEGIN(GCNPreRAOptimizationsLegacy, DEBUG_TYPE,
85 "AMDGPU Pre-RA optimizations", false, false)
86INITIALIZE_PASS_DEPENDENCY(LiveIntervalsWrapperPass)
87INITIALIZE_PASS_END(GCNPreRAOptimizationsLegacy, DEBUG_TYPE,
88 "Pre-RA optimizations", false, false)
89
90char GCNPreRAOptimizationsLegacy::ID = 0;
91
92char &llvm::GCNPreRAOptimizationsID = GCNPreRAOptimizationsLegacy::ID;
93
94FunctionPass *llvm::createGCNPreRAOptimizationsLegacyPass() {
95 return new GCNPreRAOptimizationsLegacy();
96}
97
98bool GCNPreRAOptimizationsImpl::processReg(Register Reg) {
99 MachineInstr *Def0 = nullptr;
100 MachineInstr *Def1 = nullptr;
101 uint64_t Init = 0;
102 bool Changed = false;
103 SmallSet<Register, 32> ModifiedRegs;
104 bool IsAGPRDst = TRI->isAGPRClass(RC: MRI->getRegClass(Reg));
105
106 for (MachineInstr &I : MRI->def_instructions(Reg)) {
107 switch (I.getOpcode()) {
108 default:
109 return false;
110 case AMDGPU::V_ACCVGPR_WRITE_B32_e64:
111 break;
112 case AMDGPU::COPY: {
113 // Some subtargets cannot do an AGPR to AGPR copy directly, and need an
114 // intermdiate temporary VGPR register. Try to find the defining
115 // accvgpr_write to avoid temporary registers.
116
117 if (!IsAGPRDst)
118 return false;
119
120 Register SrcReg = I.getOperand(i: 1).getReg();
121
122 if (!SrcReg.isVirtual())
123 break;
124
125 // Check if source of copy is from another AGPR.
126 bool IsAGPRSrc = TRI->isAGPRClass(RC: MRI->getRegClass(Reg: SrcReg));
127 if (!IsAGPRSrc)
128 break;
129
130 // def_instructions() does not look at subregs so it may give us a
131 // different instruction that defines the same vreg but different subreg
132 // so we have to manually check subreg.
133 Register SrcSubReg = I.getOperand(i: 1).getSubReg();
134 for (auto &Def : MRI->def_instructions(Reg: SrcReg)) {
135 if (SrcSubReg != Def.getOperand(i: 0).getSubReg())
136 continue;
137
138 if (Def.getOpcode() == AMDGPU::V_ACCVGPR_WRITE_B32_e64) {
139 const MachineOperand &DefSrcMO = Def.getOperand(i: 1);
140
141 // Immediates are not an issue and can be propagated in
142 // postrapseudos pass. Only handle cases where defining
143 // accvgpr_write source is a vreg.
144 if (DefSrcMO.isReg() && DefSrcMO.getReg().isVirtual()) {
145 // Propagate source reg of accvgpr write to this copy instruction
146 I.getOperand(i: 1).setReg(DefSrcMO.getReg());
147 I.getOperand(i: 1).setSubReg(DefSrcMO.getSubReg());
148
149 // Reg uses were changed, collect unique set of registers to update
150 // live intervals at the end.
151 ModifiedRegs.insert(V: DefSrcMO.getReg());
152 ModifiedRegs.insert(V: SrcReg);
153
154 Changed = true;
155 }
156
157 // Found the defining accvgpr_write, stop looking any further.
158 break;
159 }
160 }
161 break;
162 }
163 case AMDGPU::S_MOV_B32:
164 if (I.getOperand(i: 0).getReg() != Reg || !I.getOperand(i: 1).isImm() ||
165 I.getNumOperands() != 2)
166 return false;
167
168 switch (I.getOperand(i: 0).getSubReg()) {
169 default:
170 return false;
171 case AMDGPU::sub0:
172 if (Def0)
173 return false;
174 Def0 = &I;
175 Init |= Lo_32(Value: I.getOperand(i: 1).getImm());
176 break;
177 case AMDGPU::sub1:
178 if (Def1)
179 return false;
180 Def1 = &I;
181 Init |= static_cast<uint64_t>(I.getOperand(i: 1).getImm()) << 32;
182 break;
183 }
184 break;
185 }
186 }
187
188 // For AGPR reg, check if live intervals need to be updated.
189 if (IsAGPRDst) {
190 if (Changed) {
191 for (Register RegToUpdate : ModifiedRegs) {
192 LIS->removeInterval(Reg: RegToUpdate);
193 LIS->createAndComputeVirtRegInterval(Reg: RegToUpdate);
194 }
195 }
196
197 return Changed;
198 }
199
200 // For SGPR reg, check if we can combine instructions.
201 if (!Def0 || !Def1 || Def0->getParent() != Def1->getParent())
202 return Changed;
203
204 LLVM_DEBUG(dbgs() << "Combining:\n " << *Def0 << " " << *Def1
205 << " =>\n");
206
207 if (SlotIndex::isEarlierInstr(A: LIS->getInstructionIndex(Instr: *Def1),
208 B: LIS->getInstructionIndex(Instr: *Def0)))
209 std::swap(a&: Def0, b&: Def1);
210
211 LIS->RemoveMachineInstrFromMaps(MI&: *Def0);
212 LIS->RemoveMachineInstrFromMaps(MI&: *Def1);
213 auto NewI = BuildMI(BB&: *Def0->getParent(), I&: *Def0, MIMD: Def0->getDebugLoc(),
214 MCID: TII->get(Opcode: AMDGPU::S_MOV_B64_IMM_PSEUDO), DestReg: Reg)
215 .addImm(Val: Init);
216
217 Def0->eraseFromParent();
218 Def1->eraseFromParent();
219 LIS->InsertMachineInstrInMaps(MI&: *NewI);
220 LIS->removeInterval(Reg);
221 LIS->createAndComputeVirtRegInterval(Reg);
222
223 LLVM_DEBUG(dbgs() << " " << *NewI);
224
225 return true;
226}
227
228bool GCNPreRAOptimizationsLegacy::runOnMachineFunction(MachineFunction &MF) {
229 if (skipFunction(F: MF.getFunction()))
230 return false;
231 LiveIntervals *LIS = &getAnalysis<LiveIntervalsWrapperPass>().getLIS();
232 return GCNPreRAOptimizationsImpl(LIS).run(MF);
233}
234
235PreservedAnalyses
236GCNPreRAOptimizationsPass::run(MachineFunction &MF,
237 MachineFunctionAnalysisManager &MFAM) {
238 LiveIntervals *LIS = &MFAM.getResult<LiveIntervalsAnalysis>(IR&: MF);
239 GCNPreRAOptimizationsImpl(LIS).run(MF);
240 return PreservedAnalyses::all();
241}
242
243void GCNPreRAOptimizationsImpl::hintTrue16Copy(const MachineInstr &MI) {
244 Register Dst = MI.getOperand(i: 0).getReg();
245 Register Src = MI.getOperand(i: 1).getReg();
246 const TargetRegisterClass *DstRC = TRI->getRegClassForReg(MRI: *MRI, Reg: Dst);
247 bool IsDst16Bit = AMDGPU::VGPR_16RegClass.hasSubClassEq(RC: DstRC);
248 if (Dst.isVirtual() && IsDst16Bit && Src.isPhysical() &&
249 TRI->getRegClassForReg(MRI: *MRI, Reg: Src) == &AMDGPU::VGPR_32RegClass)
250 MRI->setRegAllocationHint(VReg: Dst, Type: 0, PrefReg: TRI->getSubReg(Reg: Src, Idx: AMDGPU::lo16));
251 if (Src.isVirtual() && MRI->getRegClass(Reg: Src) == &AMDGPU::VGPR_16RegClass &&
252 Dst.isPhysical() && DstRC == &AMDGPU::VGPR_32RegClass)
253 MRI->setRegAllocationHint(VReg: Src, Type: 0, PrefReg: TRI->getSubReg(Reg: Dst, Idx: AMDGPU::lo16));
254 if (!Dst.isVirtual() || !Src.isVirtual())
255 return;
256 if (MRI->getRegClass(Reg: Dst) == &AMDGPU::VGPR_32RegClass &&
257 MRI->getRegClass(Reg: Src) == &AMDGPU::VGPR_16RegClass) {
258 MRI->setRegAllocationHint(VReg: Dst, Type: AMDGPURI::Size32, PrefReg: Src);
259 MRI->setRegAllocationHint(VReg: Src, Type: AMDGPURI::Size16, PrefReg: Dst);
260 }
261 if (IsDst16Bit && MRI->getRegClass(Reg: Src) == &AMDGPU::VGPR_32RegClass)
262 MRI->setRegAllocationHint(VReg: Dst, Type: AMDGPURI::Size16, PrefReg: Src);
263}
264
265bool GCNPreRAOptimizationsImpl::optimizeBVHStack(MachineInstr &MI) {
266 SmallVector<Register, 2> UseRegs;
267
268 // Find BVH sources for this DS_BVH_STACK instruction.
269 auto CheckUse = [&](MachineOperand &Use) {
270 Register Reg = Use.getReg();
271 for (const MachineInstr &Src : MRI->def_instructions(Reg)) {
272 if (!SIInstrInfo::isImage(MI: Src))
273 continue;
274 const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(Opc: Src.getOpcode());
275 const AMDGPU::MIMGBaseOpcodeInfo *BaseInfo =
276 AMDGPU::getMIMGBaseOpcodeInfo(BaseOpcode: Info->BaseOpcode);
277 if (!BaseInfo->BVH)
278 continue;
279 UseRegs.push_back(Elt: Reg);
280 break;
281 }
282 };
283 CheckUse(*TII->getNamedOperand(MI, OperandName: AMDGPU::OpName::data0));
284 CheckUse(*TII->getNamedOperand(MI, OperandName: AMDGPU::OpName::data1));
285
286 if (UseRegs.empty())
287 return false;
288
289 // Add implicit uses for entire BVH source registers.
290 // This avoids partial reallocation of register which could
291 // introduce a premature s_wait_bvhcnt.
292 for (Register Reg : UseRegs) {
293 MI.addOperand(Op: MachineOperand::CreateReg(Reg, isDef: false, isImp: true));
294 LIS->removeInterval(Reg);
295 LIS->createAndComputeVirtRegInterval(Reg);
296 }
297 LLVM_DEBUG(dbgs() << "Added implicit uses to: " << MI);
298
299 return true;
300}
301
302bool GCNPreRAOptimizationsImpl::run(MachineFunction &MF) {
303 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
304 TII = ST.getInstrInfo();
305 MRI = &MF.getRegInfo();
306 TRI = ST.getRegisterInfo();
307
308 bool Changed = false;
309
310 for (unsigned I = 0, E = MRI->getNumVirtRegs(); I != E; ++I) {
311 Register Reg = Register::index2VirtReg(Index: I);
312 if (!LIS->hasInterval(Reg))
313 continue;
314 const TargetRegisterClass *RC = MRI->getRegClass(Reg);
315 if ((RC->MC->getSizeInBits() != 64 || !TRI->isSGPRClass(RC)) &&
316 (ST.hasGFX90AInsts() || !TRI->isAGPRClass(RC)))
317 continue;
318
319 Changed |= processReg(Reg);
320 }
321
322 const bool HasBVHStack = ST.hasBVHDualAndBVH8Insts();
323 const bool HasRealTrue16 = ST.useRealTrue16Insts();
324
325 if (!HasRealTrue16 && !HasBVHStack)
326 return Changed;
327
328 for (MachineBasicBlock &MBB : MF) {
329 for (MachineInstr &MI : MBB) {
330 // Add RA hints to improve True16 COPY elimination.
331 if (HasRealTrue16 && MI.getOpcode() == AMDGPU::COPY) {
332 hintTrue16Copy(MI);
333 continue;
334 }
335 // Add implicit uses to avoid early wait on intersect ray instructions.
336 if (HasBVHStack &&
337 (MI.getOpcode() == AMDGPU::DS_BVH_STACK_RTN_B32 ||
338 MI.getOpcode() == AMDGPU::DS_BVH_STACK_PUSH8_POP1_RTN_B32 ||
339 MI.getOpcode() == AMDGPU::DS_BVH_STACK_PUSH8_POP2_RTN_B64)) {
340 Changed |= optimizeBVHStack(MI);
341 continue;
342 }
343 }
344 }
345
346 return Changed;
347}
348