//===------------------ AMDGPUCustomBehaviour.cpp ---------------*-C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
/// \file
///
/// This file implements methods from the AMDGPUCustomBehaviour class.
///
//===----------------------------------------------------------------------===//

#include "AMDGPUCustomBehaviour.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "TargetInfo/AMDGPUTargetInfo.h"
#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/MC/TargetRegistry.h"
#include "llvm/Support/Compiler.h"
#include "llvm/Support/WithColor.h"

namespace llvm::mca {

void AMDGPUInstrPostProcess::postProcessInstruction(
    std::unique_ptr<Instruction> &Inst, const MCInst &MCI) {
  switch (MCI.getOpcode()) {
  case AMDGPU::S_WAITCNT:
  case AMDGPU::S_WAITCNT_soft:
  case AMDGPU::S_WAITCNT_EXPCNT:
  case AMDGPU::S_WAITCNT_LGKMCNT:
  case AMDGPU::S_WAITCNT_VMCNT:
  case AMDGPU::S_WAITCNT_VSCNT:
  case AMDGPU::S_WAITCNT_VSCNT_soft:
  case AMDGPU::S_WAITCNT_EXPCNT_gfx10:
  case AMDGPU::S_WAITCNT_LGKMCNT_gfx10:
  case AMDGPU::S_WAITCNT_VMCNT_gfx10:
  case AMDGPU::S_WAITCNT_VSCNT_gfx10:
  case AMDGPU::S_WAITCNT_gfx10:
  case AMDGPU::S_WAITCNT_gfx6_gfx7:
  case AMDGPU::S_WAITCNT_vi:
    return processWaitCnt(Inst, MCI);
  }
}

// s_waitcnt instructions encode important information as immediate operands
// which are lost during the MCInst -> mca::Instruction lowering.
void AMDGPUInstrPostProcess::processWaitCnt(std::unique_ptr<Instruction> &Inst,
                                            const MCInst &MCI) {
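  // Re-attach each MCInst operand to the mca::Instruction so that
  // handleWaitCnt() can read the wait-count immediates back out later.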
  for (int Idx = 0, N = MCI.size(); Idx < N; Idx++) {
    MCAOperand Op;
    const MCOperand &MCOp = MCI.getOperand(Idx);
    if (MCOp.isReg()) {
      Op = MCAOperand::createReg(MCOp.getReg());
    } else if (MCOp.isImm()) {
      Op = MCAOperand::createImm(MCOp.getImm());
    }
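    // Operands that are neither a register nor an immediate are added as
    // empty (invalid) MCAOperands so that operand indices stay aligned with
    // the MCInst.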
    Op.setIndex(Idx);
    Inst->addOperand(Op);
  }
}

AMDGPUCustomBehaviour::AMDGPUCustomBehaviour(const MCSubtargetInfo &STI,
                                             const mca::SourceMgr &SrcMgr,
                                             const MCInstrInfo &MCII)
    : CustomBehaviour(STI, SrcMgr, MCII) {
  generateWaitCntInfo();
}

unsigned AMDGPUCustomBehaviour::checkCustomHazard(ArrayRef<InstRef> IssuedInst,
                                                  const InstRef &IR) {
  const Instruction &Inst = *IR.getInstruction();
  unsigned Opcode = Inst.getOpcode();

  // llvm-mca is generally run on fully compiled assembly so we wouldn't see any
  // pseudo instructions here. However, there are plans for the future to make
  // it possible to use mca within backend passes. As such, I have left the
  // pseudo version of s_waitcnt within this switch statement.
  switch (Opcode) {
  default:
    return 0;
  case AMDGPU::S_WAITCNT: // This instruction
  case AMDGPU::S_WAITCNT_soft:
  case AMDGPU::S_WAITCNT_EXPCNT:
  case AMDGPU::S_WAITCNT_LGKMCNT:
  case AMDGPU::S_WAITCNT_VMCNT:
  case AMDGPU::S_WAITCNT_VSCNT:
  case AMDGPU::S_WAITCNT_VSCNT_soft: // to this instruction are all pseudo.
  case AMDGPU::S_WAITCNT_EXPCNT_gfx10:
  case AMDGPU::S_WAITCNT_LGKMCNT_gfx10:
  case AMDGPU::S_WAITCNT_VMCNT_gfx10:
  case AMDGPU::S_WAITCNT_VSCNT_gfx10:
  case AMDGPU::S_WAITCNT_gfx10:
  case AMDGPU::S_WAITCNT_gfx6_gfx7:
  case AMDGPU::S_WAITCNT_vi:
    // s_endpgm also behaves as if there is an implicit
    // s_waitcnt 0, but I'm not sure if it would be appropriate
    // to model this in llvm-mca based on how the iterations work
    // while simulating the pipeline over and over.
    return handleWaitCnt(IssuedInst, IR);
  }

  return 0;
}

unsigned AMDGPUCustomBehaviour::handleWaitCnt(ArrayRef<InstRef> IssuedInst,
                                              const InstRef &IR) {
  // Currently, all s_waitcnt instructions are handled except s_waitcnt_depctr.
  // I do not know how that instruction works so I did not attempt to model it.
  // set the max values to begin
  unsigned Vmcnt = 63;
  unsigned Expcnt = 7;
  unsigned Lgkmcnt = 31;
  unsigned Vscnt = 63;
  unsigned CurrVmcnt = 0;
  unsigned CurrExpcnt = 0;
  unsigned CurrLgkmcnt = 0;
  unsigned CurrVscnt = 0;
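  // ~0U is a sentinel meaning that no in-flight instruction contributes to
  // the corresponding counter yet.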
  unsigned CyclesToWaitVm = ~0U;
  unsigned CyclesToWaitExp = ~0U;
  unsigned CyclesToWaitLgkm = ~0U;
  unsigned CyclesToWaitVs = ~0U;

  computeWaitCnt(IR, Vmcnt, Expcnt, Lgkmcnt, Vscnt);

  // We will now look at each of the currently executing instructions
  // to find out if this wait instruction still needs to wait.
  for (const InstRef &PrevIR : IssuedInst) {
    const Instruction &PrevInst = *PrevIR.getInstruction();
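    // Source indices keep growing across simulated iterations of the input,
    // so wrap them to index the per-instruction wait-count table.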
    const unsigned PrevInstIndex = PrevIR.getSourceIndex() % SrcMgr.size();
    const WaitCntInfo &PrevInstWaitInfo = InstrWaitCntInfo[PrevInstIndex];
    const int CyclesLeft = PrevInst.getCyclesLeft();
    assert(CyclesLeft != UNKNOWN_CYCLES &&
           "We should know how many cycles are left for this instruction");
    if (PrevInstWaitInfo.VmCnt) {
      CurrVmcnt++;
      if ((unsigned)CyclesLeft < CyclesToWaitVm)
        CyclesToWaitVm = CyclesLeft;
    }
    if (PrevInstWaitInfo.ExpCnt) {
      CurrExpcnt++;
      if ((unsigned)CyclesLeft < CyclesToWaitExp)
        CyclesToWaitExp = CyclesLeft;
    }
    if (PrevInstWaitInfo.LgkmCnt) {
      CurrLgkmcnt++;
      if ((unsigned)CyclesLeft < CyclesToWaitLgkm)
        CyclesToWaitLgkm = CyclesLeft;
    }
    if (PrevInstWaitInfo.VsCnt) {
      CurrVscnt++;
      if ((unsigned)CyclesLeft < CyclesToWaitVs)
        CyclesToWaitVs = CyclesLeft;
    }
  }

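  // Take the smallest remaining latency among the counters whose number of
  // outstanding operations exceeds what this s_waitcnt allows.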
  unsigned CyclesToWait = ~0U;
  if (CurrVmcnt > Vmcnt && CyclesToWaitVm < CyclesToWait)
    CyclesToWait = CyclesToWaitVm;
  if (CurrExpcnt > Expcnt && CyclesToWaitExp < CyclesToWait)
    CyclesToWait = CyclesToWaitExp;
  if (CurrLgkmcnt > Lgkmcnt && CyclesToWaitLgkm < CyclesToWait)
    CyclesToWait = CyclesToWaitLgkm;
  if (CurrVscnt > Vscnt && CyclesToWaitVs < CyclesToWait)
    CyclesToWait = CyclesToWaitVs;

  // We may underestimate how many cycles we need to wait, but this
  // isn't a big deal. Our return value is just how many cycles until
  // this function gets run again. So as long as we don't overestimate
  // the wait time, we'll still end up stalling at this instruction
  // for the correct number of cycles.

  if (CyclesToWait == ~0U)
    return 0;
  return CyclesToWait;
}

void AMDGPUCustomBehaviour::computeWaitCnt(const InstRef &IR, unsigned &Vmcnt,
                                           unsigned &Expcnt, unsigned &Lgkmcnt,
                                           unsigned &Vscnt) {
  AMDGPU::IsaVersion IV = AMDGPU::getIsaVersion(STI.getCPU());
  const Instruction &Inst = *IR.getInstruction();
  unsigned Opcode = Inst.getOpcode();

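  // The gfx10 single-counter forms carry a register plus an immediate operand,
  // while the combined forms pack all counters into one immediate that
  // decodeWaitcnt() splits apart.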
  switch (Opcode) {
  case AMDGPU::S_WAITCNT_EXPCNT_gfx10:
  case AMDGPU::S_WAITCNT_LGKMCNT_gfx10:
  case AMDGPU::S_WAITCNT_VMCNT_gfx10:
  case AMDGPU::S_WAITCNT_VSCNT_gfx10: {
    // Should probably be checking for nullptr
    // here, but I'm not sure how I should handle the case
    // where we see a nullptr.
    const MCAOperand *OpReg = Inst.getOperand(0);
    const MCAOperand *OpImm = Inst.getOperand(1);
    assert(OpReg && OpReg->isReg() && "First operand should be a register.");
    assert(OpImm && OpImm->isImm() && "Second operand should be an immediate.");
    if (OpReg->getReg() != AMDGPU::SGPR_NULL) {
      // Instruction is using a real register.
      // Since we can't know what value this register will have,
      // we can't compute what the value of this wait should be.
      WithColor::warning() << "The register component of "
                           << MCII.getName(Opcode) << " will be completely "
                           << "ignored, so the wait may not be accurate.\n";
    }
    switch (Opcode) {
    // Redundant switch so I don't have to repeat the code above
    // for each case. There are more clever ways to avoid this
    // extra switch and anyone can feel free to implement one of them.
    case AMDGPU::S_WAITCNT_EXPCNT_gfx10:
      Expcnt = OpImm->getImm();
      break;
    case AMDGPU::S_WAITCNT_LGKMCNT_gfx10:
      Lgkmcnt = OpImm->getImm();
      break;
    case AMDGPU::S_WAITCNT_VMCNT_gfx10:
      Vmcnt = OpImm->getImm();
      break;
    case AMDGPU::S_WAITCNT_VSCNT_gfx10:
      Vscnt = OpImm->getImm();
      break;
    }
    return;
  }
  case AMDGPU::S_WAITCNT_gfx10:
  case AMDGPU::S_WAITCNT_gfx6_gfx7:
  case AMDGPU::S_WAITCNT_vi:
    unsigned WaitCnt = Inst.getOperand(0)->getImm();
    AMDGPU::decodeWaitcnt(IV, WaitCnt, Vmcnt, Expcnt, Lgkmcnt);
    return;
  }
}

void AMDGPUCustomBehaviour::generateWaitCntInfo() {
  // The core logic from this function is taken from
  // SIInsertWaitcnts::updateEventWaitcntAfter(). In that pass, the instructions
  // that are being looked at are in the MachineInstr format, whereas we have
  // access to the MCInst format. The side effects of this are that we can't use
  // the mayAccessVMEMThroughFlat(Inst) or mayAccessLDSThroughFlat(Inst)
  // functions. Therefore, we conservatively assume that these functions will
  // return true. This may cause a few instructions to be incorrectly tagged
  // with an extra CNT. However, these are instructions that do interact with at
  // least one CNT so giving them an extra CNT shouldn't cause issues in most
  // scenarios.
  AMDGPU::IsaVersion IV = AMDGPU::getIsaVersion(STI.getCPU());
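  // One WaitCntInfo entry is kept per instruction in the source region.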
  InstrWaitCntInfo.resize(SrcMgr.size());

  for (const auto &EN : llvm::enumerate(SrcMgr.getInstructions())) {
    const std::unique_ptr<Instruction> &Inst = EN.value();
    unsigned Index = EN.index();
    unsigned Opcode = Inst->getOpcode();
    const MCInstrDesc &MCID = MCII.get(Opcode);
    if ((MCID.TSFlags & SIInstrFlags::DS) &&
        (MCID.TSFlags & SIInstrFlags::LGKM_CNT)) {
      InstrWaitCntInfo[Index].LgkmCnt = true;
      if (isAlwaysGDS(Opcode) || hasModifiersSet(Inst, AMDGPU::OpName::gds))
        InstrWaitCntInfo[Index].ExpCnt = true;
    } else if (MCID.TSFlags & SIInstrFlags::FLAT) {
      // We conservatively assume that mayAccessVMEMThroughFlat(Inst)
      // and mayAccessLDSThroughFlat(Inst) would both return true for this
      // instruction. We have to do this because those functions use
      // information about the memory operands that we don't have access to.
      InstrWaitCntInfo[Index].LgkmCnt = true;
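      // Without a separate vscnt counter, everything is tracked by vmcnt.
      // Otherwise, loads (and returning atomics) count against vmcnt while
      // stores and no-return atomics count against vscnt.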
      if (!STI.hasFeature(AMDGPU::FeatureVscnt))
        InstrWaitCntInfo[Index].VmCnt = true;
      else if (MCID.mayLoad() && !(MCID.TSFlags & SIInstrFlags::IsAtomicNoRet))
        InstrWaitCntInfo[Index].VmCnt = true;
      else
        InstrWaitCntInfo[Index].VsCnt = true;
    } else if (isVMEM(MCID) && !AMDGPU::getMUBUFIsBufferInv(Opcode)) {
      if (!STI.hasFeature(AMDGPU::FeatureVscnt))
        InstrWaitCntInfo[Index].VmCnt = true;
      else if ((MCID.mayLoad() &&
                !(MCID.TSFlags & SIInstrFlags::IsAtomicNoRet)) ||
               ((MCID.TSFlags & SIInstrFlags::MIMG) && !MCID.mayLoad() &&
                !MCID.mayStore()))
        InstrWaitCntInfo[Index].VmCnt = true;
      else if (MCID.mayStore())
        InstrWaitCntInfo[Index].VsCnt = true;

      // (IV.Major < 7) is meant to represent
      // GCNTarget.vmemWriteNeedsExpWaitcnt()
      // which is defined as
      // { return getGeneration() < SEA_ISLANDS; }
      if (IV.Major < 7 &&
          (MCID.mayStore() || (MCID.TSFlags & SIInstrFlags::IsAtomicRet)))
        InstrWaitCntInfo[Index].ExpCnt = true;
    } else if (MCID.TSFlags & SIInstrFlags::SMRD) {
      InstrWaitCntInfo[Index].LgkmCnt = true;
    } else if (MCID.TSFlags & SIInstrFlags::EXP) {
      InstrWaitCntInfo[Index].ExpCnt = true;
    } else {
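      // A few scalar opcodes (message sends and timestamp reads) are also
      // tagged as incrementing lgkmcnt.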
      switch (Opcode) {
      case AMDGPU::S_SENDMSG:
      case AMDGPU::S_SENDMSGHALT:
      case AMDGPU::S_MEMTIME:
      case AMDGPU::S_MEMREALTIME:
        InstrWaitCntInfo[Index].LgkmCnt = true;
        break;
      }
    }
  }
}

// taken from SIInstrInfo::isVMEM()
bool AMDGPUCustomBehaviour::isVMEM(const MCInstrDesc &MCID) {
  return MCID.TSFlags & SIInstrFlags::MUBUF ||
         MCID.TSFlags & SIInstrFlags::MTBUF ||
         MCID.TSFlags & SIInstrFlags::MIMG || MCID.TSFlags & SIInstrFlags::FLAT;
}

// taken from SIInstrInfo::hasModifiersSet()
bool AMDGPUCustomBehaviour::hasModifiersSet(
    const std::unique_ptr<Instruction> &Inst, AMDGPU::OpName OpName) const {
  int Idx = AMDGPU::getNamedOperandIdx(Inst->getOpcode(), OpName);
  if (Idx == -1)
    return false;

  const MCAOperand *Op = Inst->getOperand(Idx);
  if (Op == nullptr || !Op->isImm() || !Op->getImm())
    return false;

  return true;
}

// taken from SIInstrInfo::isGWS()
bool AMDGPUCustomBehaviour::isGWS(uint16_t Opcode) const {
  const MCInstrDesc &MCID = MCII.get(Opcode);
  return MCID.TSFlags & SIInstrFlags::GWS;
}

// taken from SIInstrInfo::isAlwaysGDS()
bool AMDGPUCustomBehaviour::isAlwaysGDS(uint16_t Opcode) const {
  return Opcode == AMDGPU::DS_ORDERED_COUNT ||
         Opcode == AMDGPU::DS_ADD_GS_REG_RTN ||
         Opcode == AMDGPU::DS_SUB_GS_REG_RTN || isGWS(Opcode);
}

} // namespace llvm::mca

using namespace llvm;
using namespace mca;

static CustomBehaviour *
createAMDGPUCustomBehaviour(const MCSubtargetInfo &STI,
                            const mca::SourceMgr &SrcMgr,
                            const MCInstrInfo &MCII) {
  return new AMDGPUCustomBehaviour(STI, SrcMgr, MCII);
}

static InstrPostProcess *
createAMDGPUInstrPostProcess(const MCSubtargetInfo &STI,
                             const MCInstrInfo &MCII) {
  return new AMDGPUInstrPostProcess(STI, MCII);
}
/// Extern function to initialize the targets for the AMDGPU backend
extern "C" LLVM_ABI LLVM_EXTERNAL_VISIBILITY void
LLVMInitializeAMDGPUTargetMCA() {
  TargetRegistry::RegisterCustomBehaviour(getTheR600Target(),
                                          createAMDGPUCustomBehaviour);
  TargetRegistry::RegisterInstrPostProcess(getTheR600Target(),
                                           createAMDGPUInstrPostProcess);

  TargetRegistry::RegisterCustomBehaviour(getTheGCNTarget(),
                                          createAMDGPUCustomBehaviour);
  TargetRegistry::RegisterInstrPostProcess(getTheGCNTarget(),
                                           createAMDGPUInstrPostProcess);
}