//===-- GCNPreRAOptimizations.cpp -----------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// This pass combines split register tuple initialization into a single pseudo:
///
///   undef %0.sub1:sreg_64 = S_MOV_B32 1
///   %0.sub0:sreg_64 = S_MOV_B32 2
/// =>
///   %0:sreg_64 = S_MOV_B64_IMM_PSEUDO 0x200000001
///
/// This is to allow rematerialization of a value instead of spilling. It is
/// supposed to be done after register coalescer to allow it to do its job and
/// before actual register allocation to allow rematerialization.
///
/// Right now the pass only handles 64 bit SGPRs with immediate initializers,
/// although the same shall be possible with other register classes and
/// instructions if necessary.
///
/// This pass also adds register allocation hints to COPY.
/// The hints will be post-processed by SIRegisterInfo::getRegAllocationHints.
/// When using True16, we often see COPY moving a 16-bit value between a VGPR_32
/// and a VGPR_16. If we use the VGPR_16 that corresponds to the lo16 bits of
/// the VGPR_32, the COPY can be completely eliminated.
///
//===----------------------------------------------------------------------===//
32
#include "GCNPreRAOptimizations.h"
#include "AMDGPU.h"
#include "GCNSubtarget.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "SIRegisterInfo.h"
#include "llvm/ADT/SmallSet.h"
#include "llvm/CodeGen/LiveIntervals.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/InitializePasses.h"
41
42using namespace llvm;
43
44#define DEBUG_TYPE "amdgpu-pre-ra-optimizations"
45
46namespace {
47
48class GCNPreRAOptimizationsImpl {
49private:
50 const SIInstrInfo *TII;
51 const SIRegisterInfo *TRI;
52 MachineRegisterInfo *MRI;
53 LiveIntervals *LIS;
54
55 bool processReg(Register Reg);
56
57public:
58 GCNPreRAOptimizationsImpl(LiveIntervals *LS) : LIS(LS) {}
59 bool run(MachineFunction &MF);
60};
61
62class GCNPreRAOptimizationsLegacy : public MachineFunctionPass {
63public:
64 static char ID;
65
66 GCNPreRAOptimizationsLegacy() : MachineFunctionPass(ID) {}
67
68 bool runOnMachineFunction(MachineFunction &MF) override;
69
70 StringRef getPassName() const override {
71 return "AMDGPU Pre-RA optimizations";
72 }
73
74 void getAnalysisUsage(AnalysisUsage &AU) const override {
75 AU.addRequired<LiveIntervalsWrapperPass>();
76 AU.setPreservesAll();
77 MachineFunctionPass::getAnalysisUsage(AU);
78 }
79};
80} // End anonymous namespace.
81
82INITIALIZE_PASS_BEGIN(GCNPreRAOptimizationsLegacy, DEBUG_TYPE,
83 "AMDGPU Pre-RA optimizations", false, false)
84INITIALIZE_PASS_DEPENDENCY(LiveIntervalsWrapperPass)
85INITIALIZE_PASS_END(GCNPreRAOptimizationsLegacy, DEBUG_TYPE,
86 "Pre-RA optimizations", false, false)
87
88char GCNPreRAOptimizationsLegacy::ID = 0;
89
90char &llvm::GCNPreRAOptimizationsID = GCNPreRAOptimizationsLegacy::ID;
91
92FunctionPass *llvm::createGCNPreRAOptimizationsLegacyPass() {
93 return new GCNPreRAOptimizationsLegacy();
94}
95
96bool GCNPreRAOptimizationsImpl::processReg(Register Reg) {
97 MachineInstr *Def0 = nullptr;
98 MachineInstr *Def1 = nullptr;
99 uint64_t Init = 0;
100 bool Changed = false;
101 SmallSet<Register, 32> ModifiedRegs;
102 bool IsAGPRDst = TRI->isAGPRClass(RC: MRI->getRegClass(Reg));
103
104 for (MachineInstr &I : MRI->def_instructions(Reg)) {
105 switch (I.getOpcode()) {
106 default:
107 return false;
108 case AMDGPU::V_ACCVGPR_WRITE_B32_e64:
109 break;
110 case AMDGPU::COPY: {
111 // Some subtargets cannot do an AGPR to AGPR copy directly, and need an
112 // intermdiate temporary VGPR register. Try to find the defining
113 // accvgpr_write to avoid temporary registers.
114
115 if (!IsAGPRDst)
116 return false;
117
118 Register SrcReg = I.getOperand(i: 1).getReg();
119
120 if (!SrcReg.isVirtual())
121 break;
122
123 // Check if source of copy is from another AGPR.
124 bool IsAGPRSrc = TRI->isAGPRClass(RC: MRI->getRegClass(Reg: SrcReg));
125 if (!IsAGPRSrc)
126 break;
127
128 // def_instructions() does not look at subregs so it may give us a
129 // different instruction that defines the same vreg but different subreg
130 // so we have to manually check subreg.
131 Register SrcSubReg = I.getOperand(i: 1).getSubReg();
132 for (auto &Def : MRI->def_instructions(Reg: SrcReg)) {
133 if (SrcSubReg != Def.getOperand(i: 0).getSubReg())
134 continue;
135
136 if (Def.getOpcode() == AMDGPU::V_ACCVGPR_WRITE_B32_e64) {
137 const MachineOperand &DefSrcMO = Def.getOperand(i: 1);
138
139 // Immediates are not an issue and can be propagated in
140 // postrapseudos pass. Only handle cases where defining
141 // accvgpr_write source is a vreg.
142 if (DefSrcMO.isReg() && DefSrcMO.getReg().isVirtual()) {
143 // Propagate source reg of accvgpr write to this copy instruction
144 I.getOperand(i: 1).setReg(DefSrcMO.getReg());
145 I.getOperand(i: 1).setSubReg(DefSrcMO.getSubReg());
146
147 // Reg uses were changed, collect unique set of registers to update
148 // live intervals at the end.
149 ModifiedRegs.insert(V: DefSrcMO.getReg());
150 ModifiedRegs.insert(V: SrcReg);
151
152 Changed = true;
153 }
154
155 // Found the defining accvgpr_write, stop looking any further.
156 break;
157 }
158 }
159 break;
160 }
161 case AMDGPU::S_MOV_B32:
162 if (I.getOperand(i: 0).getReg() != Reg || !I.getOperand(i: 1).isImm() ||
163 I.getNumOperands() != 2)
164 return false;
165
166 switch (I.getOperand(i: 0).getSubReg()) {
167 default:
168 return false;
169 case AMDGPU::sub0:
170 if (Def0)
171 return false;
172 Def0 = &I;
173 Init |= Lo_32(Value: I.getOperand(i: 1).getImm());
174 break;
175 case AMDGPU::sub1:
176 if (Def1)
177 return false;
178 Def1 = &I;
179 Init |= static_cast<uint64_t>(I.getOperand(i: 1).getImm()) << 32;
180 break;
181 }
182 break;
183 }
184 }
185
186 // For AGPR reg, check if live intervals need to be updated.
187 if (IsAGPRDst) {
188 if (Changed) {
189 for (Register RegToUpdate : ModifiedRegs) {
190 LIS->removeInterval(Reg: RegToUpdate);
191 LIS->createAndComputeVirtRegInterval(Reg: RegToUpdate);
192 }
193 }
194
195 return Changed;
196 }
197
198 // For SGPR reg, check if we can combine instructions.
199 if (!Def0 || !Def1 || Def0->getParent() != Def1->getParent())
200 return Changed;
201
202 LLVM_DEBUG(dbgs() << "Combining:\n " << *Def0 << " " << *Def1
203 << " =>\n");
204
205 if (SlotIndex::isEarlierInstr(A: LIS->getInstructionIndex(Instr: *Def1),
206 B: LIS->getInstructionIndex(Instr: *Def0)))
207 std::swap(a&: Def0, b&: Def1);
208
209 LIS->RemoveMachineInstrFromMaps(MI&: *Def0);
210 LIS->RemoveMachineInstrFromMaps(MI&: *Def1);
211 auto NewI = BuildMI(BB&: *Def0->getParent(), I&: *Def0, MIMD: Def0->getDebugLoc(),
212 MCID: TII->get(Opcode: AMDGPU::S_MOV_B64_IMM_PSEUDO), DestReg: Reg)
213 .addImm(Val: Init);
214
215 Def0->eraseFromParent();
216 Def1->eraseFromParent();
217 LIS->InsertMachineInstrInMaps(MI&: *NewI);
218 LIS->removeInterval(Reg);
219 LIS->createAndComputeVirtRegInterval(Reg);
220
221 LLVM_DEBUG(dbgs() << " " << *NewI);
222
223 return true;
224}
225
226bool GCNPreRAOptimizationsLegacy::runOnMachineFunction(MachineFunction &MF) {
227 if (skipFunction(F: MF.getFunction()))
228 return false;
229 LiveIntervals *LIS = &getAnalysis<LiveIntervalsWrapperPass>().getLIS();
230 return GCNPreRAOptimizationsImpl(LIS).run(MF);
231}
232
233PreservedAnalyses
234GCNPreRAOptimizationsPass::run(MachineFunction &MF,
235 MachineFunctionAnalysisManager &MFAM) {
236 LiveIntervals *LIS = &MFAM.getResult<LiveIntervalsAnalysis>(IR&: MF);
237 GCNPreRAOptimizationsImpl(LIS).run(MF);
238 return PreservedAnalyses::all();
239}
240
241bool GCNPreRAOptimizationsImpl::run(MachineFunction &MF) {
242 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
243 TII = ST.getInstrInfo();
244 MRI = &MF.getRegInfo();
245 TRI = ST.getRegisterInfo();
246
247 bool Changed = false;
248
249 for (unsigned I = 0, E = MRI->getNumVirtRegs(); I != E; ++I) {
250 Register Reg = Register::index2VirtReg(Index: I);
251 if (!LIS->hasInterval(Reg))
252 continue;
253 const TargetRegisterClass *RC = MRI->getRegClass(Reg);
254 if ((RC->MC->getSizeInBits() != 64 || !TRI->isSGPRClass(RC)) &&
255 (ST.hasGFX90AInsts() || !TRI->isAGPRClass(RC)))
256 continue;
257
258 Changed |= processReg(Reg);
259 }
260
261 if (!ST.useRealTrue16Insts())
262 return Changed;
263
264 // Add RA hints to improve True16 COPY elimination.
265 for (const MachineBasicBlock &MBB : MF) {
266 for (const MachineInstr &MI : MBB) {
267 if (MI.getOpcode() != AMDGPU::COPY)
268 continue;
269 Register Dst = MI.getOperand(i: 0).getReg();
270 Register Src = MI.getOperand(i: 1).getReg();
271 const TargetRegisterClass *DstRC = TRI->getRegClassForReg(MRI: *MRI, Reg: Dst);
272 bool IsDst16Bit = AMDGPU::VGPR_16RegClass.hasSubClassEq(RC: DstRC);
273 if (Dst.isVirtual() && IsDst16Bit && Src.isPhysical() &&
274 TRI->getRegClassForReg(MRI: *MRI, Reg: Src) == &AMDGPU::VGPR_32RegClass)
275 MRI->setRegAllocationHint(VReg: Dst, Type: 0, PrefReg: TRI->getSubReg(Reg: Src, Idx: AMDGPU::lo16));
276 if (Src.isVirtual() &&
277 MRI->getRegClass(Reg: Src) == &AMDGPU::VGPR_16RegClass &&
278 Dst.isPhysical() && DstRC == &AMDGPU::VGPR_32RegClass)
279 MRI->setRegAllocationHint(VReg: Src, Type: 0, PrefReg: TRI->getSubReg(Reg: Dst, Idx: AMDGPU::lo16));
280 if (!Dst.isVirtual() || !Src.isVirtual())
281 continue;
282 if (MRI->getRegClass(Reg: Dst) == &AMDGPU::VGPR_32RegClass &&
283 MRI->getRegClass(Reg: Src) == &AMDGPU::VGPR_16RegClass) {
284 MRI->setRegAllocationHint(VReg: Dst, Type: AMDGPURI::Size32, PrefReg: Src);
285 MRI->setRegAllocationHint(VReg: Src, Type: AMDGPURI::Size16, PrefReg: Dst);
286 }
287 if (IsDst16Bit && MRI->getRegClass(Reg: Src) == &AMDGPU::VGPR_32RegClass)
288 MRI->setRegAllocationHint(VReg: Dst, Type: AMDGPURI::Size16, PrefReg: Src);
289 }
290 }
291
292 return Changed;
293}
294