1 | //===- AMDGPUInsertSingleUseVDST.cpp - Insert s_singleuse_vdst instructions ==// |
2 | // |
3 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
4 | // See https://llvm.org/LICENSE.txt for license information. |
5 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
6 | // |
7 | //===----------------------------------------------------------------------===// |
8 | // |
9 | /// \file |
10 | /// Insert s_singleuse_vdst instructions on GFX11.5+ to mark regions of VALU |
11 | /// instructions that produce single-use VGPR values. If the value is forwarded |
12 | /// to the consumer instruction prior to VGPR writeback, the hardware can |
13 | /// then skip (kill) the VGPR write. |
14 | // |
15 | //===----------------------------------------------------------------------===// |
16 | |
17 | #include "AMDGPU.h" |
18 | #include "AMDGPUGenSearchableTables.inc" |
19 | #include "GCNSubtarget.h" |
20 | #include "SIInstrInfo.h" |
21 | #include "SIRegisterInfo.h" |
22 | #include "llvm/ADT/DenseMap.h" |
23 | #include "llvm/ADT/STLExtras.h" |
24 | #include "llvm/ADT/SmallVector.h" |
25 | #include "llvm/ADT/StringRef.h" |
26 | #include "llvm/CodeGen/MachineBasicBlock.h" |
27 | #include "llvm/CodeGen/MachineFunction.h" |
28 | #include "llvm/CodeGen/MachineFunctionPass.h" |
29 | #include "llvm/CodeGen/MachineInstr.h" |
30 | #include "llvm/CodeGen/MachineInstrBuilder.h" |
31 | #include "llvm/CodeGen/MachineOperand.h" |
32 | #include "llvm/CodeGen/Register.h" |
33 | #include "llvm/IR/DebugLoc.h" |
34 | #include "llvm/MC/MCRegister.h" |
35 | #include "llvm/MC/MCRegisterInfo.h" |
36 | #include "llvm/Pass.h" |
37 | #include <array> |
38 | |
39 | using namespace llvm; |
40 | |
41 | #define DEBUG_TYPE "amdgpu-insert-single-use-vdst" |
42 | |
43 | namespace { |
44 | class AMDGPUInsertSingleUseVDST : public MachineFunctionPass { |
45 | private: |
46 | const SIInstrInfo *SII; |
47 | class SingleUseInstruction { |
48 | private: |
49 | static const unsigned MaxSkipRange = 0b111; |
50 | static const unsigned MaxNumberOfSkipRegions = 2; |
51 | |
52 | unsigned LastEncodedPositionEnd; |
53 | MachineInstr *ProducerInstr; |
54 | |
55 | std::array<unsigned, MaxNumberOfSkipRegions + 1> SingleUseRegions; |
56 | SmallVector<unsigned, MaxNumberOfSkipRegions> SkipRegions; |
57 | |
58 | // Adds a skip region into the instruction. |
59 | void skip(const unsigned ProducerPosition) { |
60 | while (LastEncodedPositionEnd + MaxSkipRange < ProducerPosition) { |
61 | SkipRegions.push_back(Elt: MaxSkipRange); |
62 | LastEncodedPositionEnd += MaxSkipRange; |
63 | } |
64 | SkipRegions.push_back(Elt: ProducerPosition - LastEncodedPositionEnd); |
65 | LastEncodedPositionEnd = ProducerPosition; |
66 | } |
67 | |
68 | bool currentRegionHasSpace() { |
69 | const auto Region = SkipRegions.size(); |
70 | // The first region has an extra bit of encoding space. |
71 | return SingleUseRegions[Region] < |
72 | ((Region == MaxNumberOfSkipRegions) ? 0b1111U : 0b111U); |
73 | } |
74 | |
75 | unsigned encodeImm() { |
76 | // Handle the first Single Use Region separately as it has an extra bit |
77 | // of encoding space. |
78 | unsigned Imm = SingleUseRegions[SkipRegions.size()]; |
79 | unsigned ShiftAmount = 4; |
80 | for (unsigned i = SkipRegions.size(); i > 0; i--) { |
81 | Imm |= SkipRegions[i - 1] << ShiftAmount; |
82 | ShiftAmount += 3; |
83 | Imm |= SingleUseRegions[i - 1] << ShiftAmount; |
84 | ShiftAmount += 3; |
85 | } |
86 | return Imm; |
87 | } |
88 | |
89 | public: |
90 | SingleUseInstruction(const unsigned ProducerPosition, |
91 | MachineInstr *Producer) |
92 | : LastEncodedPositionEnd(ProducerPosition + 1), ProducerInstr(Producer), |
93 | SingleUseRegions({1, 0, 0}) {} |
94 | |
95 | // Returns false if adding a new single use producer failed. This happens |
96 | // because it could not be encoded, either because there is no room to |
97 | // encode another single use producer region or that this single use |
98 | // producer is too far away to encode the amount of instructions to skip. |
99 | bool tryAddProducer(const unsigned ProducerPosition, MachineInstr *MI) { |
100 | // Producer is too far away to encode into this instruction or another |
101 | // skip region is needed and SkipRegions.size() = 2 so there's no room for |
102 | // another skip region, therefore a new instruction is needed. |
103 | if (LastEncodedPositionEnd + |
104 | (MaxSkipRange * (MaxNumberOfSkipRegions - SkipRegions.size())) < |
105 | ProducerPosition) |
106 | return false; |
107 | |
108 | // If a skip region is needed. |
109 | if (LastEncodedPositionEnd != ProducerPosition || |
110 | !currentRegionHasSpace()) { |
111 | // If the current region is out of space therefore a skip region would |
112 | // be needed, but there is no room for another skip region. |
113 | if (SkipRegions.size() == MaxNumberOfSkipRegions) |
114 | return false; |
115 | skip(ProducerPosition); |
116 | } |
117 | |
118 | SingleUseRegions[SkipRegions.size()]++; |
119 | LastEncodedPositionEnd = ProducerPosition + 1; |
120 | ProducerInstr = MI; |
121 | return true; |
122 | } |
123 | |
124 | auto emit(const SIInstrInfo *SII) { |
125 | return BuildMI(BB&: *ProducerInstr->getParent(), I: ProducerInstr, MIMD: DebugLoc(), |
126 | MCID: SII->get(Opcode: AMDGPU::S_SINGLEUSE_VDST)) |
127 | .addImm(Val: encodeImm()); |
128 | } |
129 | }; |
130 | |
131 | public: |
132 | static char ID; |
133 | |
134 | AMDGPUInsertSingleUseVDST() : MachineFunctionPass(ID) {} |
135 | |
136 | void insertSingleUseInstructions( |
137 | ArrayRef<std::pair<unsigned, MachineInstr *>> SingleUseProducers) const { |
138 | SmallVector<SingleUseInstruction> Instructions; |
139 | |
140 | for (auto &[Position, MI] : SingleUseProducers) { |
141 | // Encode this position into the last single use instruction if possible. |
142 | if (Instructions.empty() || |
143 | !Instructions.back().tryAddProducer(ProducerPosition: Position, MI)) { |
144 | // If not, add a new instruction. |
145 | Instructions.push_back(Elt: SingleUseInstruction(Position, MI)); |
146 | } |
147 | } |
148 | |
149 | for (auto &Instruction : Instructions) |
150 | Instruction.emit(SII); |
151 | } |
152 | |
153 | bool runOnMachineFunction(MachineFunction &MF) override { |
154 | const auto &ST = MF.getSubtarget<GCNSubtarget>(); |
155 | if (!ST.hasVGPRSingleUseHintInsts()) |
156 | return false; |
157 | |
158 | SII = ST.getInstrInfo(); |
159 | const auto *TRI = &SII->getRegisterInfo(); |
160 | bool InstructionEmitted = false; |
161 | |
162 | for (MachineBasicBlock &MBB : MF) { |
163 | DenseMap<MCRegUnit, unsigned> RegisterUseCount; |
164 | |
165 | // Handle boundaries at the end of basic block separately to avoid |
166 | // false positives. If they are live at the end of a basic block then |
167 | // assume it has more uses later on. |
168 | for (const auto &Liveout : MBB.liveouts()) { |
169 | for (MCRegUnitMaskIterator Units(Liveout.PhysReg, TRI); Units.isValid(); |
170 | ++Units) { |
171 | const auto [Unit, Mask] = *Units; |
172 | if ((Mask & Liveout.LaneMask).any()) |
173 | RegisterUseCount[Unit] = 2; |
174 | } |
175 | } |
176 | |
177 | SmallVector<std::pair<unsigned, MachineInstr *>> |
178 | SingleUseProducerPositions; |
179 | |
180 | unsigned VALUInstrCount = 0; |
181 | for (MachineInstr &MI : reverse(C: MBB.instrs())) { |
182 | // All registers in all operands need to be single use for an |
183 | // instruction to be marked as a single use producer. |
184 | bool AllProducerOperandsAreSingleUse = true; |
185 | |
186 | // Gather a list of Registers used before updating use counts to avoid |
187 | // double counting registers that appear multiple times in a single |
188 | // MachineInstr. |
189 | SmallVector<MCRegUnit> RegistersUsed; |
190 | |
191 | for (const auto &Operand : MI.all_defs()) { |
192 | const auto Reg = Operand.getReg(); |
193 | |
194 | const auto RegUnits = TRI->regunits(Reg); |
195 | if (any_of(Range: RegUnits, P: [&RegisterUseCount](const MCRegUnit Unit) { |
196 | return RegisterUseCount[Unit] > 1; |
197 | })) |
198 | AllProducerOperandsAreSingleUse = false; |
199 | |
200 | // Reset uses count when a register is no longer live. |
201 | for (const MCRegUnit Unit : RegUnits) |
202 | RegisterUseCount.erase(Val: Unit); |
203 | } |
204 | |
205 | for (const auto &Operand : MI.all_uses()) { |
206 | const auto Reg = Operand.getReg(); |
207 | |
208 | // Count the number of times each register is read. |
209 | for (const MCRegUnit Unit : TRI->regunits(Reg)) { |
210 | if (!is_contained(Range&: RegistersUsed, Element: Unit)) |
211 | RegistersUsed.push_back(Elt: Unit); |
212 | } |
213 | } |
214 | for (const MCRegUnit Unit : RegistersUsed) |
215 | RegisterUseCount[Unit]++; |
216 | |
217 | // Do not attempt to optimise across exec mask changes. |
218 | if (MI.modifiesRegister(Reg: AMDGPU::EXEC, TRI) || |
219 | AMDGPU::isInvalidSingleUseConsumerInst(Opc: MI.getOpcode())) { |
220 | for (auto &UsedReg : RegisterUseCount) |
221 | UsedReg.second = 2; |
222 | } |
223 | |
224 | if (!SIInstrInfo::isVALU(MI) || |
225 | AMDGPU::isInvalidSingleUseProducerInst(Opc: MI.getOpcode())) |
226 | continue; |
227 | if (AllProducerOperandsAreSingleUse) { |
228 | SingleUseProducerPositions.push_back(Elt: {VALUInstrCount, &MI}); |
229 | InstructionEmitted = true; |
230 | } |
231 | VALUInstrCount++; |
232 | } |
233 | insertSingleUseInstructions(SingleUseProducers: SingleUseProducerPositions); |
234 | } |
235 | return InstructionEmitted; |
236 | } |
237 | }; |
238 | } // namespace |
239 | |
240 | char AMDGPUInsertSingleUseVDST::ID = 0; |
241 | |
242 | char &llvm::AMDGPUInsertSingleUseVDSTID = AMDGPUInsertSingleUseVDST::ID; |
243 | |
244 | INITIALIZE_PASS(AMDGPUInsertSingleUseVDST, DEBUG_TYPE, |
245 | "AMDGPU Insert SingleUseVDST" , false, false) |
246 | |