1 | //===- SIInsertHardClauses.cpp - Insert Hard Clauses ----------------------===// |
2 | // |
3 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
4 | // See https://llvm.org/LICENSE.txt for license information. |
5 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
6 | // |
7 | //===----------------------------------------------------------------------===// |
8 | // |
9 | /// \file |
10 | /// Insert s_clause instructions to form hard clauses. |
11 | /// |
12 | /// Clausing load instructions can give cache coherency benefits. Before gfx10, |
13 | /// the hardware automatically detected "soft clauses", which were sequences of |
14 | /// memory instructions of the same type. In gfx10 this detection was removed, |
15 | /// and the s_clause instruction was introduced to explicitly mark "hard |
16 | /// clauses". |
17 | /// |
18 | /// It's the scheduler's job to form the clauses by putting similar memory |
19 | /// instructions next to each other. Our job is just to insert an s_clause |
20 | /// instruction to mark the start of each clause. |
21 | /// |
22 | /// Note that hard clauses are very similar to, but logically distinct from, the |
23 | /// groups of instructions that have to be restartable when XNACK is enabled. |
24 | /// The rules are slightly different in each case. For example an s_nop |
25 | /// instruction breaks a restartable group, but can appear in the middle of a |
26 | /// hard clause. (Before gfx10 there wasn't a distinction, and both were called |
27 | /// "soft clauses" or just "clauses".) |
28 | /// |
29 | /// The SIFormMemoryClauses pass and GCNHazardRecognizer deal with restartable |
30 | /// groups, not hard clauses. |
31 | // |
32 | //===----------------------------------------------------------------------===// |
33 | |
34 | #include "AMDGPU.h" |
35 | #include "GCNSubtarget.h" |
36 | #include "MCTargetDesc/AMDGPUMCTargetDesc.h" |
37 | #include "llvm/ADT/SmallVector.h" |
38 | #include "llvm/CodeGen/MachineFunctionPass.h" |
39 | #include "llvm/CodeGen/MachinePassManager.h" |
40 | |
41 | using namespace llvm; |
42 | |
43 | #define DEBUG_TYPE "si-insert-hard-clauses" |
44 | |
45 | static cl::opt<unsigned> |
46 | HardClauseLengthLimit("amdgpu-hard-clause-length-limit" , |
47 | cl::desc("Maximum number of memory instructions to " |
48 | "place in the same hard clause" ), |
49 | cl::Hidden); |
50 | |
51 | namespace { |
52 | |
// Classification of an instruction with respect to hard clauses.
//
// The enumerator order is significant: every value up to and including
// LAST_REAL_HARDCLAUSE_TYPE names a category that can actually form a clause,
// and the values after it are bookkeeping categories.
enum HardClauseType {
  // ---- GFX10 only ----

  // Texture, buffer, global and scratch memory accesses.
  HARDCLAUSE_VMEM,
  // Flat memory accesses that are neither global nor scratch.
  HARDCLAUSE_FLAT,

  // ---- GFX11 only ----

  // Texture memory accesses, split by kind of access.
  HARDCLAUSE_MIMG_LOAD,
  HARDCLAUSE_MIMG_STORE,
  HARDCLAUSE_MIMG_ATOMIC,
  HARDCLAUSE_MIMG_SAMPLE,
  // Buffer, global and scratch memory accesses, split by kind of access.
  HARDCLAUSE_VMEM_LOAD,
  HARDCLAUSE_VMEM_STORE,
  HARDCLAUSE_VMEM_ATOMIC,
  // Flat (neither global nor scratch) memory accesses, split by kind of
  // access.
  HARDCLAUSE_FLAT_LOAD,
  HARDCLAUSE_FLAT_STORE,
  HARDCLAUSE_FLAT_ATOMIC,
  // BVH instructions.
  HARDCLAUSE_BVH,

  // ---- Shared by all generations ----

  // Accesses to LDS.
  HARDCLAUSE_LDS,
  // Scalar memory accesses.
  HARDCLAUSE_SMEM,
  // VALU instructions.
  HARDCLAUSE_VALU,
  // Alias marking the end of the categories that can form a clause.
  LAST_REAL_HARDCLAUSE_TYPE = HARDCLAUSE_VALU,

  // Internal instructions. They may sit in the middle of a hard clause, with
  // the exception of s_waitcnt.
  HARDCLAUSE_INTERNAL,
  // Meta instructions such as KILL, which produce no ISA at all.
  HARDCLAUSE_IGNORE,
  // Everything that must not appear in a hard clause: SALU, export, branch,
  // message, GDS, s_waitcnt, and any instruction not covered above.
  HARDCLAUSE_ILLEGAL,
};
98 | |
99 | class SIInsertHardClauses { |
100 | public: |
101 | const GCNSubtarget *ST = nullptr; |
102 | |
103 | HardClauseType getHardClauseType(const MachineInstr &MI) { |
104 | if (MI.mayLoad() || (MI.mayStore() && ST->shouldClusterStores())) { |
105 | if (ST->getGeneration() == AMDGPUSubtarget::GFX10) { |
106 | if ((SIInstrInfo::isVMEM(MI) && !SIInstrInfo::isFLAT(MI)) || |
107 | SIInstrInfo::isSegmentSpecificFLAT(MI)) { |
108 | if (ST->hasNSAClauseBug()) { |
109 | const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(Opc: MI.getOpcode()); |
110 | if (Info && Info->MIMGEncoding == AMDGPU::MIMGEncGfx10NSA) |
111 | return HARDCLAUSE_ILLEGAL; |
112 | } |
113 | return HARDCLAUSE_VMEM; |
114 | } |
115 | if (SIInstrInfo::isFLAT(MI)) |
116 | return HARDCLAUSE_FLAT; |
117 | } else { |
118 | assert(ST->getGeneration() >= AMDGPUSubtarget::GFX11); |
119 | if (SIInstrInfo::isMIMG(MI)) { |
120 | const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(Opc: MI.getOpcode()); |
121 | const AMDGPU::MIMGBaseOpcodeInfo *BaseInfo = |
122 | AMDGPU::getMIMGBaseOpcodeInfo(BaseOpcode: Info->BaseOpcode); |
123 | if (BaseInfo->BVH) |
124 | return HARDCLAUSE_BVH; |
125 | if (BaseInfo->Sampler || BaseInfo->MSAA) |
126 | return HARDCLAUSE_MIMG_SAMPLE; |
127 | return MI.mayLoad() ? MI.mayStore() ? HARDCLAUSE_MIMG_ATOMIC |
128 | : HARDCLAUSE_MIMG_LOAD |
129 | : HARDCLAUSE_MIMG_STORE; |
130 | } |
131 | if ((SIInstrInfo::isVMEM(MI) && !SIInstrInfo::isFLAT(MI)) || |
132 | SIInstrInfo::isSegmentSpecificFLAT(MI)) { |
133 | return MI.mayLoad() ? MI.mayStore() ? HARDCLAUSE_VMEM_ATOMIC |
134 | : HARDCLAUSE_VMEM_LOAD |
135 | : HARDCLAUSE_VMEM_STORE; |
136 | } |
137 | if (SIInstrInfo::isFLAT(MI)) { |
138 | return MI.mayLoad() ? MI.mayStore() ? HARDCLAUSE_FLAT_ATOMIC |
139 | : HARDCLAUSE_FLAT_LOAD |
140 | : HARDCLAUSE_FLAT_STORE; |
141 | } |
142 | } |
143 | // TODO: LDS |
144 | if (SIInstrInfo::isSMRD(MI)) |
145 | return HARDCLAUSE_SMEM; |
146 | } |
147 | |
148 | // Don't form VALU clauses. It's not clear what benefit they give, if any. |
149 | |
150 | // In practice s_nop is the only internal instruction we're likely to see. |
151 | // It's safe to treat the rest as illegal. |
152 | if (MI.getOpcode() == AMDGPU::S_NOP) |
153 | return HARDCLAUSE_INTERNAL; |
154 | if (MI.isMetaInstruction()) |
155 | return HARDCLAUSE_IGNORE; |
156 | return HARDCLAUSE_ILLEGAL; |
157 | } |
158 | |
159 | // Track information about a clause as we discover it. |
160 | struct ClauseInfo { |
161 | // The type of all (non-internal) instructions in the clause. |
162 | HardClauseType Type = HARDCLAUSE_ILLEGAL; |
163 | // The first (necessarily non-internal) instruction in the clause. |
164 | MachineInstr *First = nullptr; |
165 | // The last non-internal instruction in the clause. |
166 | MachineInstr *Last = nullptr; |
167 | // The length of the clause including any internal instructions in the |
168 | // middle (but not at the end) of the clause. |
169 | unsigned Length = 0; |
170 | // Internal instructions at the and of a clause should not be included in |
171 | // the clause. Count them in TrailingInternalLength until a new memory |
172 | // instruction is added. |
173 | unsigned TrailingInternalLength = 0; |
174 | // The base operands of *Last. |
175 | SmallVector<const MachineOperand *, 4> BaseOps; |
176 | }; |
177 | |
178 | bool emitClause(const ClauseInfo &CI, const SIInstrInfo *SII) { |
179 | if (CI.First == CI.Last) |
180 | return false; |
181 | assert(CI.Length <= ST->maxHardClauseLength() && |
182 | "Hard clause is too long!" ); |
183 | |
184 | auto &MBB = *CI.First->getParent(); |
185 | auto ClauseMI = |
186 | BuildMI(BB&: MBB, I&: *CI.First, MIMD: DebugLoc(), MCID: SII->get(Opcode: AMDGPU::S_CLAUSE)) |
187 | .addImm(Val: CI.Length - 1); |
188 | finalizeBundle(MBB, FirstMI: ClauseMI->getIterator(), |
189 | LastMI: std::next(x: CI.Last->getIterator())); |
190 | return true; |
191 | } |
192 | |
193 | bool run(MachineFunction &MF) { |
194 | ST = &MF.getSubtarget<GCNSubtarget>(); |
195 | if (!ST->hasHardClauses()) |
196 | return false; |
197 | |
198 | unsigned MaxClauseLength = MF.getFunction().getFnAttributeAsParsedInteger( |
199 | Kind: "amdgpu-hard-clause-length-limit" , Default: 255); |
200 | if (HardClauseLengthLimit.getNumOccurrences()) |
201 | MaxClauseLength = HardClauseLengthLimit; |
202 | MaxClauseLength = std::min(a: MaxClauseLength, b: ST->maxHardClauseLength()); |
203 | if (MaxClauseLength <= 1) |
204 | return false; |
205 | |
206 | const SIInstrInfo *SII = ST->getInstrInfo(); |
207 | const TargetRegisterInfo *TRI = ST->getRegisterInfo(); |
208 | |
209 | bool Changed = false; |
210 | for (auto &MBB : MF) { |
211 | ClauseInfo CI; |
212 | for (auto &MI : MBB) { |
213 | HardClauseType Type = getHardClauseType(MI); |
214 | |
215 | int64_t Dummy1; |
216 | bool Dummy2; |
217 | LocationSize Dummy3 = LocationSize::precise(Value: 0); |
218 | SmallVector<const MachineOperand *, 4> BaseOps; |
219 | if (Type <= LAST_REAL_HARDCLAUSE_TYPE) { |
220 | if (!SII->getMemOperandsWithOffsetWidth(LdSt: MI, BaseOps, Offset&: Dummy1, OffsetIsScalable&: Dummy2, |
221 | Width&: Dummy3, TRI)) { |
222 | // We failed to get the base operands, so we'll never clause this |
223 | // instruction with any other, so pretend it's illegal. |
224 | Type = HARDCLAUSE_ILLEGAL; |
225 | } |
226 | } |
227 | |
228 | if (CI.Length == MaxClauseLength || |
229 | (CI.Length && Type != HARDCLAUSE_INTERNAL && |
230 | Type != HARDCLAUSE_IGNORE && |
231 | (Type != CI.Type || |
232 | // Note that we lie to shouldClusterMemOps about the size of the |
233 | // cluster. When shouldClusterMemOps is called from the machine |
234 | // scheduler it limits the size of the cluster to avoid increasing |
235 | // register pressure too much, but this pass runs after register |
236 | // allocation so there is no need for that kind of limit. |
237 | // We also lie about the Offset and OffsetIsScalable parameters, |
238 | // as they aren't used in the SIInstrInfo implementation. |
239 | !SII->shouldClusterMemOps(BaseOps1: CI.BaseOps, Offset1: 0, OffsetIsScalable1: false, BaseOps2: BaseOps, Offset2: 0, OffsetIsScalable2: false, |
240 | ClusterSize: 2, NumBytes: 2)))) { |
241 | // Finish the current clause. |
242 | Changed |= emitClause(CI, SII); |
243 | CI = ClauseInfo(); |
244 | } |
245 | |
246 | if (CI.Length) { |
247 | // Extend the current clause. |
248 | if (Type != HARDCLAUSE_IGNORE) { |
249 | if (Type == HARDCLAUSE_INTERNAL) { |
250 | ++CI.TrailingInternalLength; |
251 | } else { |
252 | ++CI.Length; |
253 | CI.Length += CI.TrailingInternalLength; |
254 | CI.TrailingInternalLength = 0; |
255 | CI.Last = &MI; |
256 | CI.BaseOps = std::move(BaseOps); |
257 | } |
258 | } |
259 | } else if (Type <= LAST_REAL_HARDCLAUSE_TYPE) { |
260 | // Start a new clause. |
261 | CI = ClauseInfo{.Type: Type, .First: &MI, .Last: &MI, .Length: 1, .TrailingInternalLength: 0, .BaseOps: std::move(BaseOps)}; |
262 | } |
263 | } |
264 | |
265 | // Finish the last clause in the basic block if any. |
266 | if (CI.Length) |
267 | Changed |= emitClause(CI, SII); |
268 | } |
269 | |
270 | return Changed; |
271 | } |
272 | }; |
273 | |
274 | class SIInsertHardClausesLegacy : public MachineFunctionPass { |
275 | public: |
276 | static char ID; |
277 | SIInsertHardClausesLegacy() : MachineFunctionPass(ID) {} |
278 | |
279 | bool runOnMachineFunction(MachineFunction &MF) override { |
280 | if (skipFunction(F: MF.getFunction())) |
281 | return false; |
282 | |
283 | return SIInsertHardClauses().run(MF); |
284 | } |
285 | |
286 | void getAnalysisUsage(AnalysisUsage &AU) const override { |
287 | AU.setPreservesCFG(); |
288 | MachineFunctionPass::getAnalysisUsage(AU); |
289 | } |
290 | }; |
291 | |
292 | } // namespace |
293 | |
294 | PreservedAnalyses |
295 | llvm::SIInsertHardClausesPass::run(MachineFunction &MF, |
296 | MachineFunctionAnalysisManager &MFAM) { |
297 | if (!SIInsertHardClauses().run(MF)) |
298 | return PreservedAnalyses::all(); |
299 | |
300 | auto PA = getMachineFunctionPassPreservedAnalyses(); |
301 | PA.preserveSet<CFGAnalyses>(); |
302 | return PA; |
303 | } |
304 | |
305 | char SIInsertHardClausesLegacy::ID = 0; |
306 | |
307 | char &llvm::SIInsertHardClausesID = SIInsertHardClausesLegacy::ID; |
308 | |
309 | INITIALIZE_PASS(SIInsertHardClausesLegacy, DEBUG_TYPE, "SI Insert Hard Clauses" , |
310 | false, false) |
311 | |