1//===- AMDGPUInsertSingleUseVDST.cpp - Insert s_singleuse_vdst instructions ==//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9/// \file
10/// Insert s_singleuse_vdst instructions on GFX11.5+ to mark regions of VALU
11/// instructions that produce single-use VGPR values. If the value is forwarded
12/// to the consumer instruction prior to VGPR writeback, the hardware can
13/// then skip (kill) the VGPR write.
14//
15//===----------------------------------------------------------------------===//
16
17#include "AMDGPU.h"
18#include "AMDGPUGenSearchableTables.inc"
19#include "GCNSubtarget.h"
20#include "SIInstrInfo.h"
21#include "SIRegisterInfo.h"
22#include "llvm/ADT/DenseMap.h"
23#include "llvm/ADT/STLExtras.h"
24#include "llvm/ADT/SmallVector.h"
25#include "llvm/ADT/StringRef.h"
26#include "llvm/CodeGen/MachineBasicBlock.h"
27#include "llvm/CodeGen/MachineFunction.h"
28#include "llvm/CodeGen/MachineFunctionPass.h"
29#include "llvm/CodeGen/MachineInstr.h"
30#include "llvm/CodeGen/MachineInstrBuilder.h"
31#include "llvm/CodeGen/MachineOperand.h"
32#include "llvm/CodeGen/Register.h"
33#include "llvm/IR/DebugLoc.h"
34#include "llvm/MC/MCRegister.h"
35#include "llvm/MC/MCRegisterInfo.h"
36#include "llvm/Pass.h"
37#include <array>
38
39using namespace llvm;
40
41#define DEBUG_TYPE "amdgpu-insert-single-use-vdst"
42
43namespace {
44class AMDGPUInsertSingleUseVDST : public MachineFunctionPass {
45private:
46 const SIInstrInfo *SII;
47 class SingleUseInstruction {
48 private:
49 static const unsigned MaxSkipRange = 0b111;
50 static const unsigned MaxNumberOfSkipRegions = 2;
51
52 unsigned LastEncodedPositionEnd;
53 MachineInstr *ProducerInstr;
54
55 std::array<unsigned, MaxNumberOfSkipRegions + 1> SingleUseRegions;
56 SmallVector<unsigned, MaxNumberOfSkipRegions> SkipRegions;
57
58 // Adds a skip region into the instruction.
59 void skip(const unsigned ProducerPosition) {
60 while (LastEncodedPositionEnd + MaxSkipRange < ProducerPosition) {
61 SkipRegions.push_back(Elt: MaxSkipRange);
62 LastEncodedPositionEnd += MaxSkipRange;
63 }
64 SkipRegions.push_back(Elt: ProducerPosition - LastEncodedPositionEnd);
65 LastEncodedPositionEnd = ProducerPosition;
66 }
67
68 bool currentRegionHasSpace() {
69 const auto Region = SkipRegions.size();
70 // The first region has an extra bit of encoding space.
71 return SingleUseRegions[Region] <
72 ((Region == MaxNumberOfSkipRegions) ? 0b1111U : 0b111U);
73 }
74
75 unsigned encodeImm() {
76 // Handle the first Single Use Region separately as it has an extra bit
77 // of encoding space.
78 unsigned Imm = SingleUseRegions[SkipRegions.size()];
79 unsigned ShiftAmount = 4;
80 for (unsigned i = SkipRegions.size(); i > 0; i--) {
81 Imm |= SkipRegions[i - 1] << ShiftAmount;
82 ShiftAmount += 3;
83 Imm |= SingleUseRegions[i - 1] << ShiftAmount;
84 ShiftAmount += 3;
85 }
86 return Imm;
87 }
88
89 public:
90 SingleUseInstruction(const unsigned ProducerPosition,
91 MachineInstr *Producer)
92 : LastEncodedPositionEnd(ProducerPosition + 1), ProducerInstr(Producer),
93 SingleUseRegions({1, 0, 0}) {}
94
95 // Returns false if adding a new single use producer failed. This happens
96 // because it could not be encoded, either because there is no room to
97 // encode another single use producer region or that this single use
98 // producer is too far away to encode the amount of instructions to skip.
99 bool tryAddProducer(const unsigned ProducerPosition, MachineInstr *MI) {
100 // Producer is too far away to encode into this instruction or another
101 // skip region is needed and SkipRegions.size() = 2 so there's no room for
102 // another skip region, therefore a new instruction is needed.
103 if (LastEncodedPositionEnd +
104 (MaxSkipRange * (MaxNumberOfSkipRegions - SkipRegions.size())) <
105 ProducerPosition)
106 return false;
107
108 // If a skip region is needed.
109 if (LastEncodedPositionEnd != ProducerPosition ||
110 !currentRegionHasSpace()) {
111 // If the current region is out of space therefore a skip region would
112 // be needed, but there is no room for another skip region.
113 if (SkipRegions.size() == MaxNumberOfSkipRegions)
114 return false;
115 skip(ProducerPosition);
116 }
117
118 SingleUseRegions[SkipRegions.size()]++;
119 LastEncodedPositionEnd = ProducerPosition + 1;
120 ProducerInstr = MI;
121 return true;
122 }
123
124 auto emit(const SIInstrInfo *SII) {
125 return BuildMI(BB&: *ProducerInstr->getParent(), I: ProducerInstr, MIMD: DebugLoc(),
126 MCID: SII->get(Opcode: AMDGPU::S_SINGLEUSE_VDST))
127 .addImm(Val: encodeImm());
128 }
129 };
130
131public:
132 static char ID;
133
134 AMDGPUInsertSingleUseVDST() : MachineFunctionPass(ID) {}
135
136 void insertSingleUseInstructions(
137 ArrayRef<std::pair<unsigned, MachineInstr *>> SingleUseProducers) const {
138 SmallVector<SingleUseInstruction> Instructions;
139
140 for (auto &[Position, MI] : SingleUseProducers) {
141 // Encode this position into the last single use instruction if possible.
142 if (Instructions.empty() ||
143 !Instructions.back().tryAddProducer(ProducerPosition: Position, MI)) {
144 // If not, add a new instruction.
145 Instructions.push_back(Elt: SingleUseInstruction(Position, MI));
146 }
147 }
148
149 for (auto &Instruction : Instructions)
150 Instruction.emit(SII);
151 }
152
153 bool runOnMachineFunction(MachineFunction &MF) override {
154 const auto &ST = MF.getSubtarget<GCNSubtarget>();
155 if (!ST.hasVGPRSingleUseHintInsts())
156 return false;
157
158 SII = ST.getInstrInfo();
159 const auto *TRI = &SII->getRegisterInfo();
160 bool InstructionEmitted = false;
161
162 for (MachineBasicBlock &MBB : MF) {
163 DenseMap<MCRegUnit, unsigned> RegisterUseCount;
164
165 // Handle boundaries at the end of basic block separately to avoid
166 // false positives. If they are live at the end of a basic block then
167 // assume it has more uses later on.
168 for (const auto &Liveout : MBB.liveouts()) {
169 for (MCRegUnitMaskIterator Units(Liveout.PhysReg, TRI); Units.isValid();
170 ++Units) {
171 const auto [Unit, Mask] = *Units;
172 if ((Mask & Liveout.LaneMask).any())
173 RegisterUseCount[Unit] = 2;
174 }
175 }
176
177 SmallVector<std::pair<unsigned, MachineInstr *>>
178 SingleUseProducerPositions;
179
180 unsigned VALUInstrCount = 0;
181 for (MachineInstr &MI : reverse(C: MBB.instrs())) {
182 // All registers in all operands need to be single use for an
183 // instruction to be marked as a single use producer.
184 bool AllProducerOperandsAreSingleUse = true;
185
186 // Gather a list of Registers used before updating use counts to avoid
187 // double counting registers that appear multiple times in a single
188 // MachineInstr.
189 SmallVector<MCRegUnit> RegistersUsed;
190
191 for (const auto &Operand : MI.all_defs()) {
192 const auto Reg = Operand.getReg();
193
194 const auto RegUnits = TRI->regunits(Reg);
195 if (any_of(Range: RegUnits, P: [&RegisterUseCount](const MCRegUnit Unit) {
196 return RegisterUseCount[Unit] > 1;
197 }))
198 AllProducerOperandsAreSingleUse = false;
199
200 // Reset uses count when a register is no longer live.
201 for (const MCRegUnit Unit : RegUnits)
202 RegisterUseCount.erase(Val: Unit);
203 }
204
205 for (const auto &Operand : MI.all_uses()) {
206 const auto Reg = Operand.getReg();
207
208 // Count the number of times each register is read.
209 for (const MCRegUnit Unit : TRI->regunits(Reg)) {
210 if (!is_contained(Range&: RegistersUsed, Element: Unit))
211 RegistersUsed.push_back(Elt: Unit);
212 }
213 }
214 for (const MCRegUnit Unit : RegistersUsed)
215 RegisterUseCount[Unit]++;
216
217 // Do not attempt to optimise across exec mask changes.
218 if (MI.modifiesRegister(Reg: AMDGPU::EXEC, TRI) ||
219 AMDGPU::isInvalidSingleUseConsumerInst(Opc: MI.getOpcode())) {
220 for (auto &UsedReg : RegisterUseCount)
221 UsedReg.second = 2;
222 }
223
224 if (!SIInstrInfo::isVALU(MI) ||
225 AMDGPU::isInvalidSingleUseProducerInst(Opc: MI.getOpcode()))
226 continue;
227 if (AllProducerOperandsAreSingleUse) {
228 SingleUseProducerPositions.push_back(Elt: {VALUInstrCount, &MI});
229 InstructionEmitted = true;
230 }
231 VALUInstrCount++;
232 }
233 insertSingleUseInstructions(SingleUseProducers: SingleUseProducerPositions);
234 }
235 return InstructionEmitted;
236 }
237};
238} // namespace
239
// Out-of-class definition of the pass's static ID member, initialised to 0.
240char AMDGPUInsertSingleUseVDST::ID = 0;
241
// Reference in namespace llvm bound to the same ID object, so the pass can be
// referred to without access to the class in the anonymous namespace above.
242char &llvm::AMDGPUInsertSingleUseVDSTID = AMDGPUInsertSingleUseVDST::ID;
243
// Registers the pass under DEBUG_TYPE ("amdgpu-insert-single-use-vdst") with
// the description string below.
// NOTE(review): the two trailing `false` arguments are presumably the macro's
// "CFG only" and "is analysis" flags — confirm against INITIALIZE_PASS.
244INITIALIZE_PASS(AMDGPUInsertSingleUseVDST, DEBUG_TYPE,
245 "AMDGPU Insert SingleUseVDST", false, false)
246