| 1 | //===-- R600MachineScheduler.cpp - R600 Scheduler Interface -*- C++ -*-----===// |
| 2 | // |
| 3 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
| 4 | // See https://llvm.org/LICENSE.txt for license information. |
| 5 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
| 6 | // |
| 7 | //===----------------------------------------------------------------------===// |
| 8 | // |
| 9 | /// \file |
| 10 | /// R600 Machine Scheduler interface |
| 11 | // |
| 12 | //===----------------------------------------------------------------------===// |
| 13 | |
| 14 | #include "R600MachineScheduler.h" |
| 15 | #include "MCTargetDesc/R600MCTargetDesc.h" |
| 16 | #include "R600Subtarget.h" |
| 17 | |
| 18 | using namespace llvm; |
| 19 | |
| 20 | #define DEBUG_TYPE "machine-scheduler" |
| 21 | |
| 22 | void R600SchedStrategy::initialize(ScheduleDAGMI *dag) { |
| 23 | assert(dag->hasVRegLiveness() && "R600SchedStrategy needs vreg liveness" ); |
| 24 | DAG = static_cast<ScheduleDAGMILive*>(dag); |
| 25 | const R600Subtarget &ST = DAG->MF.getSubtarget<R600Subtarget>(); |
| 26 | TII = static_cast<const R600InstrInfo*>(DAG->TII); |
| 27 | TRI = static_cast<const R600RegisterInfo*>(DAG->TRI); |
| 28 | VLIW5 = !ST.hasCaymanISA(); |
| 29 | MRI = &DAG->MRI; |
| 30 | CurInstKind = IDOther; |
| 31 | CurEmitted = 0; |
| 32 | OccupiedSlotsMask = 31; |
| 33 | InstKindLimit[IDAlu] = TII->getMaxAlusPerClause(); |
| 34 | InstKindLimit[IDOther] = 32; |
| 35 | InstKindLimit[IDFetch] = ST.getTexVTXClauseSize(); |
| 36 | AluInstCount = 0; |
| 37 | FetchInstCount = 0; |
| 38 | } |
| 39 | |
| 40 | void R600SchedStrategy::MoveUnits(std::vector<SUnit *> &QSrc, |
| 41 | std::vector<SUnit *> &QDst) |
| 42 | { |
| 43 | llvm::append_range(C&: QDst, R&: QSrc); |
| 44 | QSrc.clear(); |
| 45 | } |
| 46 | |
| 47 | static unsigned getWFCountLimitedByGPR(unsigned GPRCount) { |
| 48 | assert (GPRCount && "GPRCount cannot be 0" ); |
| 49 | return 248 / GPRCount; |
| 50 | } |
| 51 | |
| 52 | SUnit* R600SchedStrategy::pickNode(bool &IsTopNode) { |
| 53 | SUnit *SU = nullptr; |
| 54 | NextInstKind = IDOther; |
| 55 | |
| 56 | IsTopNode = false; |
| 57 | |
| 58 | // check if we might want to switch current clause type |
| 59 | bool AllowSwitchToAlu = (CurEmitted >= InstKindLimit[CurInstKind]) || |
| 60 | (Available[CurInstKind].empty()); |
| 61 | bool AllowSwitchFromAlu = (CurEmitted >= InstKindLimit[CurInstKind]) && |
| 62 | (!Available[IDFetch].empty() || !Available[IDOther].empty()); |
| 63 | |
| 64 | if (CurInstKind == IDAlu && !Available[IDFetch].empty()) { |
| 65 | // We use the heuristic provided by AMD Accelerated Parallel Processing |
| 66 | // OpenCL Programming Guide : |
| 67 | // The approx. number of WF that allows TEX inst to hide ALU inst is : |
| 68 | // 500 (cycles for TEX) / (AluFetchRatio * 8 (cycles for ALU)) |
| 69 | float ALUFetchRationEstimate = |
| 70 | (AluInstCount + AvailablesAluCount() + Pending[IDAlu].size()) / |
| 71 | (FetchInstCount + Available[IDFetch].size()); |
| 72 | if (ALUFetchRationEstimate == 0) { |
| 73 | AllowSwitchFromAlu = true; |
| 74 | } else { |
| 75 | unsigned NeededWF = 62.5f / ALUFetchRationEstimate; |
| 76 | LLVM_DEBUG(dbgs() << NeededWF << " approx. Wavefronts Required\n" ); |
| 77 | // We assume the local GPR requirements to be "dominated" by the requirement |
| 78 | // of the TEX clause (which consumes 128 bits regs) ; ALU inst before and |
| 79 | // after TEX are indeed likely to consume or generate values from/for the |
| 80 | // TEX clause. |
| 81 | // Available[IDFetch].size() * 2 : GPRs required in the Fetch clause |
| 82 | // We assume that fetch instructions are either TnXYZW = TEX TnXYZW (need |
| 83 | // one GPR) or TmXYZW = TnXYZW (need 2 GPR). |
| 84 | // (TODO : use RegisterPressure) |
| 85 | // If we are going too use too many GPR, we flush Fetch instruction to lower |
| 86 | // register pressure on 128 bits regs. |
| 87 | unsigned NearRegisterRequirement = 2 * Available[IDFetch].size(); |
| 88 | if (NeededWF > getWFCountLimitedByGPR(GPRCount: NearRegisterRequirement)) |
| 89 | AllowSwitchFromAlu = true; |
| 90 | } |
| 91 | } |
| 92 | |
| 93 | if (!SU && ((AllowSwitchToAlu && CurInstKind != IDAlu) || |
| 94 | (!AllowSwitchFromAlu && CurInstKind == IDAlu))) { |
| 95 | // try to pick ALU |
| 96 | SU = pickAlu(); |
| 97 | if (!SU && !PhysicalRegCopy.empty()) { |
| 98 | SU = PhysicalRegCopy.front(); |
| 99 | PhysicalRegCopy.erase(position: PhysicalRegCopy.begin()); |
| 100 | } |
| 101 | if (SU) { |
| 102 | if (CurEmitted >= InstKindLimit[IDAlu]) |
| 103 | CurEmitted = 0; |
| 104 | NextInstKind = IDAlu; |
| 105 | } |
| 106 | } |
| 107 | |
| 108 | if (!SU) { |
| 109 | // try to pick FETCH |
| 110 | SU = pickOther(QID: IDFetch); |
| 111 | if (SU) |
| 112 | NextInstKind = IDFetch; |
| 113 | } |
| 114 | |
| 115 | // try to pick other |
| 116 | if (!SU) { |
| 117 | SU = pickOther(QID: IDOther); |
| 118 | if (SU) |
| 119 | NextInstKind = IDOther; |
| 120 | } |
| 121 | |
| 122 | LLVM_DEBUG(if (SU) { |
| 123 | dbgs() << " ** Pick node **\n" ; |
| 124 | DAG->dumpNode(*SU); |
| 125 | } else { |
| 126 | dbgs() << "NO NODE \n" ; |
| 127 | for (const SUnit &S : DAG->SUnits) |
| 128 | if (!S.isScheduled) |
| 129 | DAG->dumpNode(S); |
| 130 | }); |
| 131 | |
| 132 | return SU; |
| 133 | } |
| 134 | |
| 135 | void R600SchedStrategy::schedNode(SUnit *SU, bool IsTopNode) { |
| 136 | if (NextInstKind != CurInstKind) { |
| 137 | LLVM_DEBUG(dbgs() << "Instruction Type Switch\n" ); |
| 138 | if (NextInstKind != IDAlu) |
| 139 | OccupiedSlotsMask |= 31; |
| 140 | CurEmitted = 0; |
| 141 | CurInstKind = NextInstKind; |
| 142 | } |
| 143 | |
| 144 | if (CurInstKind == IDAlu) { |
| 145 | AluInstCount ++; |
| 146 | switch (getAluKind(SU)) { |
| 147 | case AluT_XYZW: |
| 148 | CurEmitted += 4; |
| 149 | break; |
| 150 | case AluDiscarded: |
| 151 | break; |
| 152 | default: { |
| 153 | ++CurEmitted; |
| 154 | for (MachineInstr::mop_iterator It = SU->getInstr()->operands_begin(), |
| 155 | E = SU->getInstr()->operands_end(); It != E; ++It) { |
| 156 | MachineOperand &MO = *It; |
| 157 | if (MO.isReg() && MO.getReg() == R600::ALU_LITERAL_X) |
| 158 | ++CurEmitted; |
| 159 | } |
| 160 | } |
| 161 | } |
| 162 | } else { |
| 163 | ++CurEmitted; |
| 164 | } |
| 165 | |
| 166 | LLVM_DEBUG(dbgs() << CurEmitted << " Instructions Emitted in this clause\n" ); |
| 167 | |
| 168 | if (CurInstKind != IDFetch) { |
| 169 | MoveUnits(QSrc&: Pending[IDFetch], QDst&: Available[IDFetch]); |
| 170 | } else |
| 171 | FetchInstCount++; |
| 172 | } |
| 173 | |
| 174 | static bool |
| 175 | isPhysicalRegCopy(MachineInstr *MI) { |
| 176 | if (MI->getOpcode() != R600::COPY) |
| 177 | return false; |
| 178 | |
| 179 | return !MI->getOperand(i: 1).getReg().isVirtual(); |
| 180 | } |
| 181 | |
| 182 | void R600SchedStrategy::releaseTopNode(SUnit *SU) { |
| 183 | LLVM_DEBUG(dbgs() << "Top Releasing " ; DAG->dumpNode(*SU)); |
| 184 | } |
| 185 | |
| 186 | void R600SchedStrategy::releaseBottomNode(SUnit *SU) { |
| 187 | LLVM_DEBUG(dbgs() << "Bottom Releasing " ; DAG->dumpNode(*SU)); |
| 188 | if (isPhysicalRegCopy(MI: SU->getInstr())) { |
| 189 | PhysicalRegCopy.push_back(x: SU); |
| 190 | return; |
| 191 | } |
| 192 | |
| 193 | int IK = getInstKind(SU); |
| 194 | |
| 195 | // There is no export clause, we can schedule one as soon as its ready |
| 196 | if (IK == IDOther) |
| 197 | Available[IDOther].push_back(x: SU); |
| 198 | else |
| 199 | Pending[IK].push_back(x: SU); |
| 200 | |
| 201 | } |
| 202 | |
| 203 | bool R600SchedStrategy::regBelongsToClass(Register Reg, |
| 204 | const TargetRegisterClass *RC) const { |
| 205 | if (!Reg.isVirtual()) |
| 206 | return RC->contains(Reg); |
| 207 | return MRI->getRegClass(Reg) == RC; |
| 208 | } |
| 209 | |
| 210 | R600SchedStrategy::AluKind R600SchedStrategy::getAluKind(SUnit *SU) const { |
| 211 | MachineInstr *MI = SU->getInstr(); |
| 212 | |
| 213 | if (TII->isTransOnly(MI: *MI)) |
| 214 | return AluTrans; |
| 215 | |
| 216 | switch (MI->getOpcode()) { |
| 217 | case R600::PRED_X: |
| 218 | return AluPredX; |
| 219 | case R600::INTERP_PAIR_XY: |
| 220 | case R600::INTERP_PAIR_ZW: |
| 221 | case R600::INTERP_VEC_LOAD: |
| 222 | case R600::DOT_4: |
| 223 | return AluT_XYZW; |
| 224 | case R600::COPY: |
| 225 | if (MI->getOperand(i: 1).isUndef()) { |
| 226 | // MI will become a KILL, don't considers it in scheduling |
| 227 | return AluDiscarded; |
| 228 | } |
| 229 | break; |
| 230 | default: |
| 231 | break; |
| 232 | } |
| 233 | |
| 234 | // Does the instruction take a whole IG ? |
| 235 | // XXX: Is it possible to add a helper function in R600InstrInfo that can |
| 236 | // be used here and in R600PacketizerList::isSoloInstruction() ? |
| 237 | if(TII->isVector(MI: *MI) || |
| 238 | TII->isCubeOp(opcode: MI->getOpcode()) || |
| 239 | TII->isReductionOp(opcode: MI->getOpcode()) || |
| 240 | MI->getOpcode() == R600::GROUP_BARRIER) { |
| 241 | return AluT_XYZW; |
| 242 | } |
| 243 | |
| 244 | if (TII->isLDSInstr(Opcode: MI->getOpcode())) { |
| 245 | return AluT_X; |
| 246 | } |
| 247 | |
| 248 | // Is the result already assigned to a channel ? |
| 249 | unsigned DestSubReg = MI->getOperand(i: 0).getSubReg(); |
| 250 | switch (DestSubReg) { |
| 251 | case R600::sub0: |
| 252 | return AluT_X; |
| 253 | case R600::sub1: |
| 254 | return AluT_Y; |
| 255 | case R600::sub2: |
| 256 | return AluT_Z; |
| 257 | case R600::sub3: |
| 258 | return AluT_W; |
| 259 | default: |
| 260 | break; |
| 261 | } |
| 262 | |
| 263 | // Is the result already member of a X/Y/Z/W class ? |
| 264 | Register DestReg = MI->getOperand(i: 0).getReg(); |
| 265 | if (regBelongsToClass(Reg: DestReg, RC: &R600::R600_TReg32_XRegClass) || |
| 266 | regBelongsToClass(Reg: DestReg, RC: &R600::R600_AddrRegClass)) |
| 267 | return AluT_X; |
| 268 | if (regBelongsToClass(Reg: DestReg, RC: &R600::R600_TReg32_YRegClass)) |
| 269 | return AluT_Y; |
| 270 | if (regBelongsToClass(Reg: DestReg, RC: &R600::R600_TReg32_ZRegClass)) |
| 271 | return AluT_Z; |
| 272 | if (regBelongsToClass(Reg: DestReg, RC: &R600::R600_TReg32_WRegClass)) |
| 273 | return AluT_W; |
| 274 | if (regBelongsToClass(Reg: DestReg, RC: &R600::R600_Reg128RegClass)) |
| 275 | return AluT_XYZW; |
| 276 | |
| 277 | // LDS src registers cannot be used in the Trans slot. |
| 278 | if (TII->readsLDSSrcReg(MI: *MI)) |
| 279 | return AluT_XYZW; |
| 280 | |
| 281 | return AluAny; |
| 282 | } |
| 283 | |
| 284 | int R600SchedStrategy::getInstKind(SUnit* SU) { |
| 285 | int Opcode = SU->getInstr()->getOpcode(); |
| 286 | |
| 287 | if (TII->usesTextureCache(Opcode) || TII->usesVertexCache(Opcode)) |
| 288 | return IDFetch; |
| 289 | |
| 290 | if (TII->isALUInstr(Opcode)) { |
| 291 | return IDAlu; |
| 292 | } |
| 293 | |
| 294 | switch (Opcode) { |
| 295 | case R600::PRED_X: |
| 296 | case R600::COPY: |
| 297 | case R600::CONST_COPY: |
| 298 | case R600::INTERP_PAIR_XY: |
| 299 | case R600::INTERP_PAIR_ZW: |
| 300 | case R600::INTERP_VEC_LOAD: |
| 301 | case R600::DOT_4: |
| 302 | return IDAlu; |
| 303 | default: |
| 304 | return IDOther; |
| 305 | } |
| 306 | } |
| 307 | |
| 308 | SUnit *R600SchedStrategy::PopInst(std::vector<SUnit *> &Q, bool AnyALU) { |
| 309 | if (Q.empty()) |
| 310 | return nullptr; |
| 311 | for (std::vector<SUnit *>::reverse_iterator It = Q.rbegin(), E = Q.rend(); |
| 312 | It != E; ++It) { |
| 313 | SUnit *SU = *It; |
| 314 | InstructionsGroupCandidate.push_back(x: SU->getInstr()); |
| 315 | if (TII->fitsConstReadLimitations(InstructionsGroupCandidate) && |
| 316 | (!AnyALU || !TII->isVectorOnly(MI: *SU->getInstr()))) { |
| 317 | InstructionsGroupCandidate.pop_back(); |
| 318 | Q.erase(position: (It + 1).base()); |
| 319 | return SU; |
| 320 | } |
| 321 | InstructionsGroupCandidate.pop_back(); |
| 322 | } |
| 323 | return nullptr; |
| 324 | } |
| 325 | |
| 326 | void R600SchedStrategy::LoadAlu() { |
| 327 | std::vector<SUnit *> &QSrc = Pending[IDAlu]; |
| 328 | for (SUnit *SU : QSrc) { |
| 329 | AluKind AK = getAluKind(SU); |
| 330 | AvailableAlus[AK].push_back(x: SU); |
| 331 | } |
| 332 | QSrc.clear(); |
| 333 | } |
| 334 | |
| 335 | void R600SchedStrategy::PrepareNextSlot() { |
| 336 | LLVM_DEBUG(dbgs() << "New Slot\n" ); |
| 337 | assert(OccupiedSlotsMask && "Slot wasn't filled" ); |
| 338 | OccupiedSlotsMask = 0; |
| 339 | // if (HwGen == AMDGPUSubtarget::NORTHERN_ISLANDS) |
| 340 | // OccupiedSlotsMask |= 16; |
| 341 | InstructionsGroupCandidate.clear(); |
| 342 | LoadAlu(); |
| 343 | } |
| 344 | |
| 345 | void R600SchedStrategy::AssignSlot(MachineInstr* MI, unsigned Slot) { |
| 346 | int DstIndex = TII->getOperandIdx(Opcode: MI->getOpcode(), Op: R600::OpName::dst); |
| 347 | if (DstIndex == -1) { |
| 348 | return; |
| 349 | } |
| 350 | Register DestReg = MI->getOperand(i: DstIndex).getReg(); |
| 351 | // PressureRegister crashes if an operand is def and used in the same inst |
| 352 | // and we try to constraint its regclass |
| 353 | for (const MachineOperand &MO : MI->all_uses()) |
| 354 | if (MO.getReg() == DestReg) |
| 355 | return; |
| 356 | // Constrains the regclass of DestReg to assign it to Slot |
| 357 | switch (Slot) { |
| 358 | case 0: |
| 359 | MRI->constrainRegClass(Reg: DestReg, RC: &R600::R600_TReg32_XRegClass); |
| 360 | break; |
| 361 | case 1: |
| 362 | MRI->constrainRegClass(Reg: DestReg, RC: &R600::R600_TReg32_YRegClass); |
| 363 | break; |
| 364 | case 2: |
| 365 | MRI->constrainRegClass(Reg: DestReg, RC: &R600::R600_TReg32_ZRegClass); |
| 366 | break; |
| 367 | case 3: |
| 368 | MRI->constrainRegClass(Reg: DestReg, RC: &R600::R600_TReg32_WRegClass); |
| 369 | break; |
| 370 | } |
| 371 | } |
| 372 | |
| 373 | SUnit *R600SchedStrategy::AttemptFillSlot(unsigned Slot, bool AnyAlu) { |
| 374 | static const AluKind IndexToID[] = {AluT_X, AluT_Y, AluT_Z, AluT_W}; |
| 375 | SUnit *SlotedSU = PopInst(Q&: AvailableAlus[IndexToID[Slot]], AnyALU: AnyAlu); |
| 376 | if (SlotedSU) |
| 377 | return SlotedSU; |
| 378 | SUnit *UnslotedSU = PopInst(Q&: AvailableAlus[AluAny], AnyALU: AnyAlu); |
| 379 | if (UnslotedSU) |
| 380 | AssignSlot(MI: UnslotedSU->getInstr(), Slot); |
| 381 | return UnslotedSU; |
| 382 | } |
| 383 | |
| 384 | unsigned R600SchedStrategy::AvailablesAluCount() const { |
| 385 | return AvailableAlus[AluAny].size() + AvailableAlus[AluT_XYZW].size() + |
| 386 | AvailableAlus[AluT_X].size() + AvailableAlus[AluT_Y].size() + |
| 387 | AvailableAlus[AluT_Z].size() + AvailableAlus[AluT_W].size() + |
| 388 | AvailableAlus[AluTrans].size() + AvailableAlus[AluDiscarded].size() + |
| 389 | AvailableAlus[AluPredX].size(); |
| 390 | } |
| 391 | |
| 392 | SUnit* R600SchedStrategy::pickAlu() { |
| 393 | while (AvailablesAluCount() || !Pending[IDAlu].empty()) { |
| 394 | if (!OccupiedSlotsMask) { |
| 395 | // Bottom up scheduling : predX must comes first |
| 396 | if (!AvailableAlus[AluPredX].empty()) { |
| 397 | OccupiedSlotsMask |= 31; |
| 398 | return PopInst(Q&: AvailableAlus[AluPredX], AnyALU: false); |
| 399 | } |
| 400 | // Flush physical reg copies (RA will discard them) |
| 401 | if (!AvailableAlus[AluDiscarded].empty()) { |
| 402 | OccupiedSlotsMask |= 31; |
| 403 | return PopInst(Q&: AvailableAlus[AluDiscarded], AnyALU: false); |
| 404 | } |
| 405 | // If there is a T_XYZW alu available, use it |
| 406 | if (!AvailableAlus[AluT_XYZW].empty()) { |
| 407 | OccupiedSlotsMask |= 15; |
| 408 | return PopInst(Q&: AvailableAlus[AluT_XYZW], AnyALU: false); |
| 409 | } |
| 410 | } |
| 411 | bool TransSlotOccupied = OccupiedSlotsMask & 16; |
| 412 | if (!TransSlotOccupied && VLIW5) { |
| 413 | if (!AvailableAlus[AluTrans].empty()) { |
| 414 | OccupiedSlotsMask |= 16; |
| 415 | return PopInst(Q&: AvailableAlus[AluTrans], AnyALU: false); |
| 416 | } |
| 417 | SUnit *SU = AttemptFillSlot(Slot: 3, AnyAlu: true); |
| 418 | if (SU) { |
| 419 | OccupiedSlotsMask |= 16; |
| 420 | return SU; |
| 421 | } |
| 422 | } |
| 423 | for (int Chan = 3; Chan > -1; --Chan) { |
| 424 | bool isOccupied = OccupiedSlotsMask & (1 << Chan); |
| 425 | if (!isOccupied) { |
| 426 | SUnit *SU = AttemptFillSlot(Slot: Chan, AnyAlu: false); |
| 427 | if (SU) { |
| 428 | OccupiedSlotsMask |= (1 << Chan); |
| 429 | InstructionsGroupCandidate.push_back(x: SU->getInstr()); |
| 430 | return SU; |
| 431 | } |
| 432 | } |
| 433 | } |
| 434 | PrepareNextSlot(); |
| 435 | } |
| 436 | return nullptr; |
| 437 | } |
| 438 | |
| 439 | SUnit* R600SchedStrategy::pickOther(int QID) { |
| 440 | SUnit *SU = nullptr; |
| 441 | std::vector<SUnit *> &AQ = Available[QID]; |
| 442 | |
| 443 | if (AQ.empty()) { |
| 444 | MoveUnits(QSrc&: Pending[QID], QDst&: AQ); |
| 445 | } |
| 446 | if (!AQ.empty()) { |
| 447 | SU = AQ.back(); |
| 448 | AQ.pop_back(); |
| 449 | } |
| 450 | return SU; |
| 451 | } |
| 452 | |