| 1 | //===- SIInsertHardClauses.cpp - Insert Hard Clauses ----------------------===// |
| 2 | // |
| 3 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
| 4 | // See https://llvm.org/LICENSE.txt for license information. |
| 5 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
| 6 | // |
| 7 | //===----------------------------------------------------------------------===// |
| 8 | // |
| 9 | /// \file |
| 10 | /// Insert s_clause instructions to form hard clauses. |
| 11 | /// |
| 12 | /// Clausing load instructions can give cache coherency benefits. Before gfx10, |
| 13 | /// the hardware automatically detected "soft clauses", which were sequences of |
| 14 | /// memory instructions of the same type. In gfx10 this detection was removed, |
| 15 | /// and the s_clause instruction was introduced to explicitly mark "hard |
| 16 | /// clauses". |
| 17 | /// |
| 18 | /// It's the scheduler's job to form the clauses by putting similar memory |
| 19 | /// instructions next to each other. Our job is just to insert an s_clause |
| 20 | /// instruction to mark the start of each clause. |
| 21 | /// |
| 22 | /// Note that hard clauses are very similar to, but logically distinct from, the |
| 23 | /// groups of instructions that have to be restartable when XNACK is enabled. |
| 24 | /// The rules are slightly different in each case. For example an s_nop |
| 25 | /// instruction breaks a restartable group, but can appear in the middle of a |
| 26 | /// hard clause. (Before gfx10 there wasn't a distinction, and both were called |
| 27 | /// "soft clauses" or just "clauses".) |
| 28 | /// |
| 29 | /// The SIFormMemoryClauses pass and GCNHazardRecognizer deal with restartable |
| 30 | /// groups, not hard clauses. |
| 31 | // |
| 32 | //===----------------------------------------------------------------------===// |
| 33 | |
| 34 | #include "AMDGPU.h" |
| 35 | #include "GCNSubtarget.h" |
| 36 | #include "MCTargetDesc/AMDGPUMCTargetDesc.h" |
| 37 | #include "llvm/ADT/SmallVector.h" |
| 38 | #include "llvm/CodeGen/MachineFunctionPass.h" |
| 39 | #include "llvm/CodeGen/MachinePassManager.h" |
| 40 | |
| 41 | using namespace llvm; |
| 42 | |
| 43 | #define DEBUG_TYPE "si-insert-hard-clauses" |
| 44 | |
| 45 | static cl::opt<unsigned> |
| 46 | HardClauseLengthLimit("amdgpu-hard-clause-length-limit" , |
| 47 | cl::desc("Maximum number of memory instructions to " |
| 48 | "place in the same hard clause" ), |
| 49 | cl::Hidden); |
| 50 | |
| 51 | namespace { |
| 52 | |
// Classification of an instruction for the purpose of hard clause formation.
//
// The enumerators are ordered: everything up to and including
// LAST_REAL_HARDCLAUSE_TYPE is a "real" clause type that can start or extend
// a clause; the enumerators after it describe instructions that are never
// the reason a clause exists.
enum HardClauseType {
  // ---- GFX10 only ----

  // Texture, buffer, global or scratch memory instructions.
  HARDCLAUSE_VMEM,
  // Flat memory instructions, excluding global and scratch.
  HARDCLAUSE_FLAT,

  // ---- GFX11 only ----

  // Texture memory instructions, split by access kind.
  HARDCLAUSE_MIMG_LOAD,
  HARDCLAUSE_MIMG_STORE,
  HARDCLAUSE_MIMG_ATOMIC,
  HARDCLAUSE_MIMG_SAMPLE,
  // Buffer, global or scratch memory instructions, split by access kind.
  HARDCLAUSE_VMEM_LOAD,
  HARDCLAUSE_VMEM_STORE,
  HARDCLAUSE_VMEM_ATOMIC,
  // Flat memory instructions (excluding global and scratch), split by access
  // kind.
  HARDCLAUSE_FLAT_LOAD,
  HARDCLAUSE_FLAT_STORE,
  HARDCLAUSE_FLAT_ATOMIC,
  // BVH instructions.
  HARDCLAUSE_BVH,

  // ---- Common to all generations ----

  // Instructions that access LDS.
  HARDCLAUSE_LDS,
  // Scalar memory instructions.
  HARDCLAUSE_SMEM,
  // VALU instructions.
  HARDCLAUSE_VALU,
  // Marker for the last type that can actually form a clause.
  LAST_REAL_HARDCLAUSE_TYPE = HARDCLAUSE_VALU,

  // Internal instructions. Apart from s_waitcnt, these may appear in the
  // middle of a hard clause.
  HARDCLAUSE_INTERNAL,
  // Meta instructions such as KILL that do not produce any ISA.
  HARDCLAUSE_IGNORE,
  // Everything that may not appear in a hard clause: SALU, export, branch,
  // message, GDS, s_waitcnt and any instruction not covered above.
  HARDCLAUSE_ILLEGAL,
};
| 98 | |
| 99 | class SIInsertHardClauses { |
| 100 | public: |
| 101 | const GCNSubtarget *ST = nullptr; |
| 102 | |
| 103 | HardClauseType getHardClauseType(const MachineInstr &MI) { |
| 104 | if (MI.mayLoad() || (MI.mayStore() && ST->shouldClusterStores())) { |
| 105 | if (ST->getGeneration() == AMDGPUSubtarget::GFX10) { |
| 106 | if ((SIInstrInfo::isVMEM(MI) && !SIInstrInfo::isFLAT(MI)) || |
| 107 | SIInstrInfo::isSegmentSpecificFLAT(MI)) { |
| 108 | if (ST->hasNSAClauseBug()) { |
| 109 | const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(Opc: MI.getOpcode()); |
| 110 | if (Info && Info->MIMGEncoding == AMDGPU::MIMGEncGfx10NSA) |
| 111 | return HARDCLAUSE_ILLEGAL; |
| 112 | } |
| 113 | return HARDCLAUSE_VMEM; |
| 114 | } |
| 115 | if (SIInstrInfo::isFLAT(MI)) |
| 116 | return HARDCLAUSE_FLAT; |
| 117 | } else { |
| 118 | assert(ST->getGeneration() >= AMDGPUSubtarget::GFX11); |
| 119 | if (SIInstrInfo::isMIMG(MI)) { |
| 120 | const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(Opc: MI.getOpcode()); |
| 121 | const AMDGPU::MIMGBaseOpcodeInfo *BaseInfo = |
| 122 | AMDGPU::getMIMGBaseOpcodeInfo(BaseOpcode: Info->BaseOpcode); |
| 123 | if (BaseInfo->BVH) |
| 124 | return HARDCLAUSE_BVH; |
| 125 | if (BaseInfo->Sampler || BaseInfo->MSAA) |
| 126 | return HARDCLAUSE_MIMG_SAMPLE; |
| 127 | return MI.mayLoad() ? MI.mayStore() ? HARDCLAUSE_MIMG_ATOMIC |
| 128 | : HARDCLAUSE_MIMG_LOAD |
| 129 | : HARDCLAUSE_MIMG_STORE; |
| 130 | } |
| 131 | if ((SIInstrInfo::isVMEM(MI) && !SIInstrInfo::isFLAT(MI)) || |
| 132 | SIInstrInfo::isSegmentSpecificFLAT(MI)) { |
| 133 | return MI.mayLoad() ? MI.mayStore() ? HARDCLAUSE_VMEM_ATOMIC |
| 134 | : HARDCLAUSE_VMEM_LOAD |
| 135 | : HARDCLAUSE_VMEM_STORE; |
| 136 | } |
| 137 | if (SIInstrInfo::isFLAT(MI)) { |
| 138 | return MI.mayLoad() ? MI.mayStore() ? HARDCLAUSE_FLAT_ATOMIC |
| 139 | : HARDCLAUSE_FLAT_LOAD |
| 140 | : HARDCLAUSE_FLAT_STORE; |
| 141 | } |
| 142 | } |
| 143 | // TODO: LDS |
| 144 | if (SIInstrInfo::isSMRD(MI)) |
| 145 | return HARDCLAUSE_SMEM; |
| 146 | } |
| 147 | |
| 148 | // Don't form VALU clauses. It's not clear what benefit they give, if any. |
| 149 | |
| 150 | // In practice s_nop is the only internal instruction we're likely to see. |
| 151 | // It's safe to treat the rest as illegal. |
| 152 | if (MI.getOpcode() == AMDGPU::S_NOP) |
| 153 | return HARDCLAUSE_INTERNAL; |
| 154 | if (MI.isMetaInstruction()) |
| 155 | return HARDCLAUSE_IGNORE; |
| 156 | return HARDCLAUSE_ILLEGAL; |
| 157 | } |
| 158 | |
| 159 | // Track information about a clause as we discover it. |
| 160 | struct ClauseInfo { |
| 161 | // The type of all (non-internal) instructions in the clause. |
| 162 | HardClauseType Type = HARDCLAUSE_ILLEGAL; |
| 163 | // The first (necessarily non-internal) instruction in the clause. |
| 164 | MachineInstr *First = nullptr; |
| 165 | // The last non-internal instruction in the clause. |
| 166 | MachineInstr *Last = nullptr; |
| 167 | // The length of the clause including any internal instructions in the |
| 168 | // middle (but not at the end) of the clause. |
| 169 | unsigned Length = 0; |
| 170 | // Internal instructions at the and of a clause should not be included in |
| 171 | // the clause. Count them in TrailingInternalLength until a new memory |
| 172 | // instruction is added. |
| 173 | unsigned TrailingInternalLength = 0; |
| 174 | // The base operands of *Last. |
| 175 | SmallVector<const MachineOperand *, 4> BaseOps; |
| 176 | }; |
| 177 | |
| 178 | bool emitClause(const ClauseInfo &CI, const SIInstrInfo *SII) { |
| 179 | if (CI.First == CI.Last) |
| 180 | return false; |
| 181 | assert(CI.Length <= ST->maxHardClauseLength() && |
| 182 | "Hard clause is too long!" ); |
| 183 | |
| 184 | auto &MBB = *CI.First->getParent(); |
| 185 | auto ClauseMI = |
| 186 | BuildMI(BB&: MBB, I&: *CI.First, MIMD: DebugLoc(), MCID: SII->get(Opcode: AMDGPU::S_CLAUSE)) |
| 187 | .addImm(Val: CI.Length - 1); |
| 188 | finalizeBundle(MBB, FirstMI: ClauseMI->getIterator(), |
| 189 | LastMI: std::next(x: CI.Last->getIterator())); |
| 190 | return true; |
| 191 | } |
| 192 | |
| 193 | bool run(MachineFunction &MF) { |
| 194 | ST = &MF.getSubtarget<GCNSubtarget>(); |
| 195 | if (!ST->hasHardClauses()) |
| 196 | return false; |
| 197 | |
| 198 | unsigned MaxClauseLength = MF.getFunction().getFnAttributeAsParsedInteger( |
| 199 | Kind: "amdgpu-hard-clause-length-limit" , Default: 255); |
| 200 | if (HardClauseLengthLimit.getNumOccurrences()) |
| 201 | MaxClauseLength = HardClauseLengthLimit; |
| 202 | MaxClauseLength = std::min(a: MaxClauseLength, b: ST->maxHardClauseLength()); |
| 203 | if (MaxClauseLength <= 1) |
| 204 | return false; |
| 205 | |
| 206 | const SIInstrInfo *SII = ST->getInstrInfo(); |
| 207 | const TargetRegisterInfo *TRI = ST->getRegisterInfo(); |
| 208 | |
| 209 | bool Changed = false; |
| 210 | for (auto &MBB : MF) { |
| 211 | ClauseInfo CI; |
| 212 | for (auto &MI : MBB) { |
| 213 | HardClauseType Type = getHardClauseType(MI); |
| 214 | |
| 215 | int64_t Dummy1; |
| 216 | bool Dummy2; |
| 217 | LocationSize Dummy3 = LocationSize::precise(Value: 0); |
| 218 | SmallVector<const MachineOperand *, 4> BaseOps; |
| 219 | if (Type <= LAST_REAL_HARDCLAUSE_TYPE) { |
| 220 | if (!SII->getMemOperandsWithOffsetWidth(LdSt: MI, BaseOps, Offset&: Dummy1, OffsetIsScalable&: Dummy2, |
| 221 | Width&: Dummy3, TRI)) { |
| 222 | // We failed to get the base operands, so we'll never clause this |
| 223 | // instruction with any other, so pretend it's illegal. |
| 224 | Type = HARDCLAUSE_ILLEGAL; |
| 225 | } |
| 226 | } |
| 227 | |
| 228 | if (CI.Length == MaxClauseLength || |
| 229 | (CI.Length && Type != HARDCLAUSE_INTERNAL && |
| 230 | Type != HARDCLAUSE_IGNORE && |
| 231 | (Type != CI.Type || |
| 232 | // Note that we lie to shouldClusterMemOps about the size of the |
| 233 | // cluster. When shouldClusterMemOps is called from the machine |
| 234 | // scheduler it limits the size of the cluster to avoid increasing |
| 235 | // register pressure too much, but this pass runs after register |
| 236 | // allocation so there is no need for that kind of limit. |
| 237 | // We also lie about the Offset and OffsetIsScalable parameters, |
| 238 | // as they aren't used in the SIInstrInfo implementation. |
| 239 | !SII->shouldClusterMemOps(BaseOps1: CI.BaseOps, Offset1: 0, OffsetIsScalable1: false, BaseOps2: BaseOps, Offset2: 0, OffsetIsScalable2: false, |
| 240 | ClusterSize: 2, NumBytes: 2)))) { |
| 241 | // Finish the current clause. |
| 242 | Changed |= emitClause(CI, SII); |
| 243 | CI = ClauseInfo(); |
| 244 | } |
| 245 | |
| 246 | if (CI.Length) { |
| 247 | // Extend the current clause. |
| 248 | if (Type != HARDCLAUSE_IGNORE) { |
| 249 | if (Type == HARDCLAUSE_INTERNAL) { |
| 250 | ++CI.TrailingInternalLength; |
| 251 | } else { |
| 252 | ++CI.Length; |
| 253 | CI.Length += CI.TrailingInternalLength; |
| 254 | CI.TrailingInternalLength = 0; |
| 255 | CI.Last = &MI; |
| 256 | CI.BaseOps = std::move(BaseOps); |
| 257 | } |
| 258 | } |
| 259 | } else if (Type <= LAST_REAL_HARDCLAUSE_TYPE) { |
| 260 | // Start a new clause. |
| 261 | CI = ClauseInfo{.Type: Type, .First: &MI, .Last: &MI, .Length: 1, .TrailingInternalLength: 0, .BaseOps: std::move(BaseOps)}; |
| 262 | } |
| 263 | } |
| 264 | |
| 265 | // Finish the last clause in the basic block if any. |
| 266 | if (CI.Length) |
| 267 | Changed |= emitClause(CI, SII); |
| 268 | } |
| 269 | |
| 270 | return Changed; |
| 271 | } |
| 272 | }; |
| 273 | |
| 274 | class SIInsertHardClausesLegacy : public MachineFunctionPass { |
| 275 | public: |
| 276 | static char ID; |
| 277 | SIInsertHardClausesLegacy() : MachineFunctionPass(ID) {} |
| 278 | |
| 279 | bool runOnMachineFunction(MachineFunction &MF) override { |
| 280 | if (skipFunction(F: MF.getFunction())) |
| 281 | return false; |
| 282 | |
| 283 | return SIInsertHardClauses().run(MF); |
| 284 | } |
| 285 | |
| 286 | void getAnalysisUsage(AnalysisUsage &AU) const override { |
| 287 | AU.setPreservesCFG(); |
| 288 | MachineFunctionPass::getAnalysisUsage(AU); |
| 289 | } |
| 290 | }; |
| 291 | |
| 292 | } // namespace |
| 293 | |
| 294 | PreservedAnalyses |
| 295 | llvm::SIInsertHardClausesPass::run(MachineFunction &MF, |
| 296 | MachineFunctionAnalysisManager &MFAM) { |
| 297 | if (!SIInsertHardClauses().run(MF)) |
| 298 | return PreservedAnalyses::all(); |
| 299 | |
| 300 | auto PA = getMachineFunctionPassPreservedAnalyses(); |
| 301 | PA.preserveSet<CFGAnalyses>(); |
| 302 | return PA; |
| 303 | } |
| 304 | |
| 305 | char SIInsertHardClausesLegacy::ID = 0; |
| 306 | |
| 307 | char &llvm::SIInsertHardClausesID = SIInsertHardClausesLegacy::ID; |
| 308 | |
| 309 | INITIALIZE_PASS(SIInsertHardClausesLegacy, DEBUG_TYPE, "SI Insert Hard Clauses" , |
| 310 | false, false) |
| 311 | |