| 1 | //===-- SIShrinkInstructions.cpp - Shrink Instructions --------------------===// |
| 2 | // |
| 3 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
| 4 | // See https://llvm.org/LICENSE.txt for license information. |
| 5 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
| 6 | // |
| 7 | /// The pass tries to use the 32-bit encoding for instructions when possible. |
| 8 | //===----------------------------------------------------------------------===// |
| 9 | // |
| 10 | |
| 11 | #include "SIShrinkInstructions.h" |
| 12 | #include "AMDGPU.h" |
| 13 | #include "GCNSubtarget.h" |
| 14 | #include "MCTargetDesc/AMDGPUMCTargetDesc.h" |
| 15 | #include "Utils/AMDGPUBaseInfo.h" |
| 16 | #include "llvm/ADT/Statistic.h" |
| 17 | #include "llvm/CodeGen/MachineFunctionPass.h" |
| 18 | |
| 19 | #define DEBUG_TYPE "si-shrink-instructions" |
| 20 | |
| 21 | STATISTIC(NumInstructionsShrunk, |
| 22 | "Number of 64-bit instruction reduced to 32-bit." ); |
| 23 | STATISTIC(NumLiteralConstantsFolded, |
| 24 | "Number of literal constants folded into 32-bit instructions." ); |
| 25 | |
| 26 | using namespace llvm; |
| 27 | |
| 28 | namespace { |
| 29 | |
| 30 | class SIShrinkInstructions { |
| 31 | MachineFunction *MF; |
| 32 | MachineRegisterInfo *MRI; |
| 33 | const GCNSubtarget *ST; |
| 34 | const SIInstrInfo *TII; |
| 35 | const SIRegisterInfo *TRI; |
| 36 | bool IsPostRA; |
| 37 | |
| 38 | bool foldImmediates(MachineInstr &MI, bool TryToCommute = true) const; |
| 39 | bool shouldShrinkTrue16(MachineInstr &MI) const; |
| 40 | bool isKImmOperand(const MachineOperand &Src) const; |
| 41 | bool isKUImmOperand(const MachineOperand &Src) const; |
| 42 | bool isKImmOrKUImmOperand(const MachineOperand &Src, bool &IsUnsigned) const; |
| 43 | void copyExtraImplicitOps(MachineInstr &NewMI, MachineInstr &MI) const; |
| 44 | void shrinkScalarCompare(MachineInstr &MI) const; |
| 45 | void shrinkMIMG(MachineInstr &MI) const; |
| 46 | void shrinkMadFma(MachineInstr &MI) const; |
| 47 | bool shrinkScalarLogicOp(MachineInstr &MI) const; |
| 48 | bool tryReplaceDeadSDST(MachineInstr &MI) const; |
| 49 | bool instAccessReg(iterator_range<MachineInstr::const_mop_iterator> &&R, |
| 50 | Register Reg, unsigned SubReg) const; |
| 51 | bool instReadsReg(const MachineInstr *MI, unsigned Reg, |
| 52 | unsigned SubReg) const; |
| 53 | bool instModifiesReg(const MachineInstr *MI, unsigned Reg, |
| 54 | unsigned SubReg) const; |
| 55 | TargetInstrInfo::RegSubRegPair getSubRegForIndex(Register Reg, unsigned Sub, |
| 56 | unsigned I) const; |
| 57 | void dropInstructionKeepingImpDefs(MachineInstr &MI) const; |
| 58 | MachineInstr *matchSwap(MachineInstr &MovT) const; |
| 59 | |
| 60 | public: |
| 61 | SIShrinkInstructions() = default; |
| 62 | bool run(MachineFunction &MF); |
| 63 | }; |
| 64 | |
| 65 | class SIShrinkInstructionsLegacy : public MachineFunctionPass { |
| 66 | |
| 67 | public: |
| 68 | static char ID; |
| 69 | |
| 70 | SIShrinkInstructionsLegacy() : MachineFunctionPass(ID) {} |
| 71 | |
| 72 | bool runOnMachineFunction(MachineFunction &MF) override; |
| 73 | |
| 74 | StringRef getPassName() const override { return "SI Shrink Instructions"; } |
| 75 | |
| 76 | void getAnalysisUsage(AnalysisUsage &AU) const override { |
| 77 | AU.setPreservesCFG(); |
| 78 | MachineFunctionPass::getAnalysisUsage(AU); |
| 79 | } |
| 80 | }; |
| 81 | |
| 82 | } // End anonymous namespace. |
| 83 | |
| 84 | INITIALIZE_PASS(SIShrinkInstructionsLegacy, DEBUG_TYPE, |
| 85 | "SI Shrink Instructions" , false, false) |
| 86 | |
| 87 | char SIShrinkInstructionsLegacy::ID = 0; |
| 88 | |
| 89 | FunctionPass *llvm::createSIShrinkInstructionsLegacyPass() { |
| 90 | return new SIShrinkInstructionsLegacy(); |
| 91 | } |
| 92 | |
| 93 | /// This function checks \p MI for operands defined by a move immediate |
| 94 | /// instruction and then folds the literal constant into the instruction if it |
| 95 | /// can. This function assumes that \p MI is a VOP1, VOP2, or VOPC instruction. |
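///
/// For example (illustrative MIR, not taken from an actual test):
///   %1:vgpr_32 = V_MOV_B32_e32 0xbeef, implicit $exec
///   %2:vgpr_32 = V_ADD_U32_e32 %1, %0, implicit $exec
/// becomes
///   %2:vgpr_32 = V_ADD_U32_e32 0xbeef, %0, implicit $exec
/// and the V_MOV_B32 is erased once it has no remaining non-debug uses.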
| 96 | bool SIShrinkInstructions::foldImmediates(MachineInstr &MI, |
| 97 | bool TryToCommute) const { |
| 98 | assert(TII->isVOP1(MI) || TII->isVOP2(MI) || TII->isVOPC(MI)); |
| 99 | |
| 100 | int Src0Idx = AMDGPU::getNamedOperandIdx(Opcode: MI.getOpcode(), Name: AMDGPU::OpName::src0); |
| 101 | |
| 102 | // Try to fold Src0 |
| 103 | MachineOperand &Src0 = MI.getOperand(i: Src0Idx); |
| 104 | if (Src0.isReg()) { |
| 105 | Register Reg = Src0.getReg(); |
| 106 | if (Reg.isVirtual()) { |
| 107 | MachineInstr *Def = MRI->getUniqueVRegDef(Reg); |
| 108 | if (Def && Def->isMoveImmediate()) { |
| 109 | MachineOperand &MovSrc = Def->getOperand(i: 1); |
| 110 | bool ConstantFolded = false; |
| 111 | |
| 112 | if (TII->isOperandLegal(MI, OpIdx: Src0Idx, MO: &MovSrc)) { |
| 113 | if (MovSrc.isImm()) { |
| 114 | Src0.ChangeToImmediate(ImmVal: MovSrc.getImm()); |
| 115 | ConstantFolded = true; |
| 116 | } else if (MovSrc.isFI()) { |
| 117 | Src0.ChangeToFrameIndex(Idx: MovSrc.getIndex()); |
| 118 | ConstantFolded = true; |
| 119 | } else if (MovSrc.isGlobal()) { |
| 120 | Src0.ChangeToGA(GV: MovSrc.getGlobal(), Offset: MovSrc.getOffset(), |
| 121 | TargetFlags: MovSrc.getTargetFlags()); |
| 122 | ConstantFolded = true; |
| 123 | } |
| 124 | } |
| 125 | |
| 126 | if (ConstantFolded) { |
| 127 | if (MRI->use_nodbg_empty(RegNo: Reg)) |
| 128 | Def->eraseFromParent(); |
| 129 | ++NumLiteralConstantsFolded; |
| 130 | return true; |
| 131 | } |
| 132 | } |
| 133 | } |
| 134 | } |
| 135 | |
| 136 | // We have failed to fold src0, so commute the instruction and try again. |
| 137 | if (TryToCommute && MI.isCommutable()) { |
| 138 | if (TII->commuteInstruction(MI)) { |
| 139 | if (foldImmediates(MI, TryToCommute: false)) |
| 140 | return true; |
| 141 | |
| 142 | // Commute back. |
| 143 | TII->commuteInstruction(MI); |
| 144 | } |
| 145 | } |
| 146 | |
| 147 | return false; |
| 148 | } |
| 149 | |
| 150 | /// Do not shrink the instruction if its registers are not expressible in the |
| 151 | /// shrunk encoding. |
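///
/// The shrunk (true16) encodings can only address the low 128 VGPRs, so, as a
/// rough sketch of the check, any operand outside VGPR_32_Lo128 or
/// VGPR_16_Lo128 (i.e. in v128..v255) blocks the shrink.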
| 152 | bool SIShrinkInstructions::shouldShrinkTrue16(MachineInstr &MI) const { |
| 153 | for (unsigned I = 0, E = MI.getNumExplicitOperands(); I != E; ++I) { |
| 154 | const MachineOperand &MO = MI.getOperand(i: I); |
| 155 | if (MO.isReg()) { |
| 156 | Register Reg = MO.getReg(); |
| 157 | assert(!Reg.isVirtual() && "Prior checks should ensure we only shrink " |
| 158 | "True16 Instructions post-RA" ); |
| 159 | if (AMDGPU::VGPR_32RegClass.contains(Reg) && |
| 160 | !AMDGPU::VGPR_32_Lo128RegClass.contains(Reg)) |
| 161 | return false; |
| 162 | |
| 163 | if (AMDGPU::VGPR_16RegClass.contains(Reg) && |
| 164 | !AMDGPU::VGPR_16_Lo128RegClass.contains(Reg)) |
| 165 | return false; |
| 166 | } |
| 167 | } |
| 168 | return true; |
| 169 | } |
| 170 | |
| 171 | bool SIShrinkInstructions::isKImmOperand(const MachineOperand &Src) const { |
| 172 | return isInt<16>(x: SignExtend64(X: Src.getImm(), B: 32)) && |
| 173 | !TII->isInlineConstant(MI: *Src.getParent(), OpIdx: Src.getOperandNo()); |
| 174 | } |
| 175 | |
| 176 | bool SIShrinkInstructions::isKUImmOperand(const MachineOperand &Src) const { |
| 177 | return isUInt<16>(x: Src.getImm()) && |
| 178 | !TII->isInlineConstant(MI: *Src.getParent(), OpIdx: Src.getOperandNo()); |
| 179 | } |
| 180 | |
| 181 | bool SIShrinkInstructions::isKImmOrKUImmOperand(const MachineOperand &Src, |
| 182 | bool &IsUnsigned) const { |
| 183 | if (isInt<16>(x: SignExtend64(X: Src.getImm(), B: 32))) { |
| 184 | IsUnsigned = false; |
| 185 | return !TII->isInlineConstant(MO: Src); |
| 186 | } |
| 187 | |
| 188 | if (isUInt<16>(x: Src.getImm())) { |
| 189 | IsUnsigned = true; |
| 190 | return !TII->isInlineConstant(MO: Src); |
| 191 | } |
| 192 | |
| 193 | return false; |
| 194 | } |
| 195 | |
| 196 | /// \returns the opcode of an instruction a move immediate of the constant \p |
| 197 | /// Src can be replaced with if the constant is replaced with \p ModifiedImm. |
| 198 | /// i.e. |
| 199 | /// |
| 200 | /// If the bitreverse of a constant is an inline immediate, reverse the |
| 201 | /// immediate and return the bitreverse opcode. |
| 202 | /// |
| 203 | /// If the bitwise negation of a constant is an inline immediate, reverse the |
| 204 | /// immediate and return the bitwise not opcode. |
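///
/// For example (a sketch, not from an actual test):
///   v_mov_b32 v0, 0xffffffc6  ->  v_not_b32_e32 v0, 57    (~57 == 0xffffffc6)
///   v_mov_b32 v0, 0x80000000  ->  v_bfrev_b32_e32 v0, 1   (bitreverse of 1)
/// For the scalar case only the bitreverse form (S_BREV_B32) is produced here.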
| 205 | static unsigned canModifyToInlineImmOp32(const SIInstrInfo *TII, |
| 206 | const MachineOperand &Src, |
| 207 | int32_t &ModifiedImm, bool Scalar) { |
| 208 | if (TII->isInlineConstant(MO: Src)) |
| 209 | return 0; |
| 210 | int32_t SrcImm = static_cast<int32_t>(Src.getImm()); |
| 211 | |
| 212 | if (!Scalar) { |
| 213 | // We could handle the scalar case here, but we would need to check |
| 214 | // that SCC is not live as S_NOT_B32 clobbers it. It's probably not worth |
| 215 | // it, as the reasonable values are already covered by s_movk_i32. |
| 216 | ModifiedImm = ~SrcImm; |
| 217 | if (TII->isInlineConstant(Imm: APInt(32, ModifiedImm, true))) |
| 218 | return AMDGPU::V_NOT_B32_e32; |
| 219 | } |
| 220 | |
| 221 | ModifiedImm = reverseBits<int32_t>(Val: SrcImm); |
| 222 | if (TII->isInlineConstant(Imm: APInt(32, ModifiedImm, true))) |
| 223 | return Scalar ? AMDGPU::S_BREV_B32 : AMDGPU::V_BFREV_B32_e32; |
| 224 | |
| 225 | return 0; |
| 226 | } |
| 227 | |
| 228 | /// Copy implicit register operands from specified instruction to this |
| 229 | /// instruction that are not part of the instruction definition. |
| 230 | void SIShrinkInstructions::copyExtraImplicitOps(MachineInstr &NewMI, |
| 231 | MachineInstr &MI) const { |
| 232 | MachineFunction &MF = *MI.getMF(); |
| 233 | for (unsigned i = MI.getDesc().getNumOperands() + |
| 234 | MI.getDesc().implicit_uses().size() + |
| 235 | MI.getDesc().implicit_defs().size(), |
| 236 | e = MI.getNumOperands(); |
| 237 | i != e; ++i) { |
| 238 | const MachineOperand &MO = MI.getOperand(i); |
| 239 | if ((MO.isReg() && MO.isImplicit()) || MO.isRegMask()) |
| 240 | NewMI.addOperand(MF, Op: MO); |
| 241 | } |
| 242 | } |
| 243 | |
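// Rewrite a 32-bit scalar compare with a non-inline literal into the SOPK form,
// which encodes the immediate in 16 bits, e.g. (a sketch, not from an actual
// test):
//   s_cmp_lg_u32 s0, 0x3e8  ->  s_cmpk_lg_u32 s0, 0x3e8
// saving the extra literal dword.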
| 244 | void SIShrinkInstructions::shrinkScalarCompare(MachineInstr &MI) const { |
| 245 | if (!ST->hasSCmpK()) |
| 246 | return; |
| 247 | |
| 248 | // cmpk instructions do scc = dst <cc op> imm16, so commute the instruction to |
| 249 | // get constants on the RHS. |
| 250 | if (!MI.getOperand(i: 0).isReg()) |
| 251 | TII->commuteInstruction(MI, NewMI: false, OpIdx1: 0, OpIdx2: 1); |
| 252 | |
| 253 | // cmpk requires src0 to be a register |
| 254 | const MachineOperand &Src0 = MI.getOperand(i: 0); |
| 255 | if (!Src0.isReg()) |
| 256 | return; |
| 257 | |
| 258 | MachineOperand &Src1 = MI.getOperand(i: 1); |
| 259 | if (!Src1.isImm()) |
| 260 | return; |
| 261 | |
| 262 | int SOPKOpc = AMDGPU::getSOPKOp(Opcode: MI.getOpcode()); |
| 263 | if (SOPKOpc == -1) |
| 264 | return; |
| 265 | |
| 266 | // eq/ne is special because the imm16 can be treated as signed or unsigned, |
| 267 | // and initially selected to the unsigned versions. |
| 268 | if (SOPKOpc == AMDGPU::S_CMPK_EQ_U32 || SOPKOpc == AMDGPU::S_CMPK_LG_U32) { |
| 269 | bool HasUImm; |
| 270 | if (isKImmOrKUImmOperand(Src: Src1, IsUnsigned&: HasUImm)) { |
| 271 | if (!HasUImm) { |
| 272 | SOPKOpc = (SOPKOpc == AMDGPU::S_CMPK_EQ_U32) ? |
| 273 | AMDGPU::S_CMPK_EQ_I32 : AMDGPU::S_CMPK_LG_I32; |
| 274 | Src1.setImm(SignExtend32(X: Src1.getImm(), B: 32)); |
| 275 | } |
| 276 | |
| 277 | MI.setDesc(TII->get(Opcode: SOPKOpc)); |
| 278 | } |
| 279 | |
| 280 | return; |
| 281 | } |
| 282 | |
| 283 | const MCInstrDesc &NewDesc = TII->get(Opcode: SOPKOpc); |
| 284 | |
| 285 | if ((SIInstrInfo::sopkIsZext(Opcode: SOPKOpc) && isKUImmOperand(Src: Src1)) || |
| 286 | (!SIInstrInfo::sopkIsZext(Opcode: SOPKOpc) && isKImmOperand(Src: Src1))) { |
| 287 | if (!SIInstrInfo::sopkIsZext(Opcode: SOPKOpc)) |
| 288 | Src1.setImm(SignExtend64(X: Src1.getImm(), B: 32)); |
| 289 | MI.setDesc(NewDesc); |
| 290 | } |
| 291 | } |
| 292 | |
| 293 | // Shrink NSA encoded instructions with contiguous VGPRs to non-NSA encoding. |
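// A sketch of the idea (not from an actual test): if an NSA image instruction
// has its address operands in consecutive registers, e.g. vaddr0=v4, vaddr1=v5,
// vaddr2=v6, it can be re-encoded to take the single contiguous tuple v[4:6].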
| 294 | void SIShrinkInstructions::shrinkMIMG(MachineInstr &MI) const { |
| 295 | const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(Opc: MI.getOpcode()); |
| 296 | if (!Info) |
| 297 | return; |
| 298 | |
| 299 | uint8_t NewEncoding; |
| 300 | switch (Info->MIMGEncoding) { |
| 301 | case AMDGPU::MIMGEncGfx10NSA: |
| 302 | NewEncoding = AMDGPU::MIMGEncGfx10Default; |
| 303 | break; |
| 304 | case AMDGPU::MIMGEncGfx11NSA: |
| 305 | NewEncoding = AMDGPU::MIMGEncGfx11Default; |
| 306 | break; |
| 307 | default: |
| 308 | return; |
| 309 | } |
| 310 | |
| 311 | int VAddr0Idx = |
| 312 | AMDGPU::getNamedOperandIdx(Opcode: MI.getOpcode(), Name: AMDGPU::OpName::vaddr0); |
| 313 | unsigned NewAddrDwords = Info->VAddrDwords; |
| 314 | const TargetRegisterClass *RC; |
| 315 | |
| 316 | if (Info->VAddrDwords == 2) { |
| 317 | RC = &AMDGPU::VReg_64RegClass; |
| 318 | } else if (Info->VAddrDwords == 3) { |
| 319 | RC = &AMDGPU::VReg_96RegClass; |
| 320 | } else if (Info->VAddrDwords == 4) { |
| 321 | RC = &AMDGPU::VReg_128RegClass; |
| 322 | } else if (Info->VAddrDwords == 5) { |
| 323 | RC = &AMDGPU::VReg_160RegClass; |
| 324 | } else if (Info->VAddrDwords == 6) { |
| 325 | RC = &AMDGPU::VReg_192RegClass; |
| 326 | } else if (Info->VAddrDwords == 7) { |
| 327 | RC = &AMDGPU::VReg_224RegClass; |
| 328 | } else if (Info->VAddrDwords == 8) { |
| 329 | RC = &AMDGPU::VReg_256RegClass; |
| 330 | } else if (Info->VAddrDwords == 9) { |
| 331 | RC = &AMDGPU::VReg_288RegClass; |
| 332 | } else if (Info->VAddrDwords == 10) { |
| 333 | RC = &AMDGPU::VReg_320RegClass; |
| 334 | } else if (Info->VAddrDwords == 11) { |
| 335 | RC = &AMDGPU::VReg_352RegClass; |
| 336 | } else if (Info->VAddrDwords == 12) { |
| 337 | RC = &AMDGPU::VReg_384RegClass; |
| 338 | } else { |
| 339 | RC = &AMDGPU::VReg_512RegClass; |
| 340 | NewAddrDwords = 16; |
| 341 | } |
| 342 | |
| 343 | unsigned VgprBase = 0; |
| 344 | unsigned NextVgpr = 0; |
| 345 | bool IsUndef = true; |
| 346 | bool IsKill = NewAddrDwords == Info->VAddrDwords; |
| 347 | const unsigned NSAMaxSize = ST->getNSAMaxSize(); |
| 348 | const bool IsPartialNSA = NewAddrDwords > NSAMaxSize; |
| 349 | const unsigned EndVAddr = IsPartialNSA ? NSAMaxSize : Info->VAddrOperands; |
| 350 | for (unsigned Idx = 0; Idx < EndVAddr; ++Idx) { |
| 351 | const MachineOperand &Op = MI.getOperand(i: VAddr0Idx + Idx); |
| 352 | unsigned Vgpr = TRI->getHWRegIndex(Reg: Op.getReg()); |
| 353 | unsigned Dwords = TRI->getRegSizeInBits(Reg: Op.getReg(), MRI: *MRI) / 32; |
| 354 | assert(Dwords > 0 && "Un-implemented for less than 32 bit regs"); |
| 355 | |
| 356 | if (Idx == 0) { |
| 357 | VgprBase = Vgpr; |
| 358 | NextVgpr = Vgpr + Dwords; |
| 359 | } else if (Vgpr == NextVgpr) { |
| 360 | NextVgpr = Vgpr + Dwords; |
| 361 | } else { |
| 362 | return; |
| 363 | } |
| 364 | |
| 365 | if (!Op.isUndef()) |
| 366 | IsUndef = false; |
| 367 | if (!Op.isKill()) |
| 368 | IsKill = false; |
| 369 | } |
| 370 | |
| 371 | if (VgprBase + NewAddrDwords > 256) |
| 372 | return; |
| 373 | |
| 374 | // Further check for implicit tied operands - this may be present if TFE is |
| 375 | // enabled |
| 376 | int TFEIdx = AMDGPU::getNamedOperandIdx(Opcode: MI.getOpcode(), Name: AMDGPU::OpName::tfe); |
| 377 | int LWEIdx = AMDGPU::getNamedOperandIdx(Opcode: MI.getOpcode(), Name: AMDGPU::OpName::lwe); |
| 378 | unsigned TFEVal = (TFEIdx == -1) ? 0 : MI.getOperand(i: TFEIdx).getImm(); |
| 379 | unsigned LWEVal = (LWEIdx == -1) ? 0 : MI.getOperand(i: LWEIdx).getImm(); |
| 380 | int ToUntie = -1; |
| 381 | if (TFEVal || LWEVal) { |
| 382 | // TFE/LWE is enabled so we need to deal with an implicit tied operand |
| 383 | for (unsigned i = LWEIdx + 1, e = MI.getNumOperands(); i != e; ++i) { |
| 384 | if (MI.getOperand(i).isReg() && MI.getOperand(i).isTied() && |
| 385 | MI.getOperand(i).isImplicit()) { |
| 386 | // This is the tied operand |
| 387 | assert( |
| 388 | ToUntie == -1 && |
| 389 | "found more than one tied implicit operand when expecting only 1" ); |
| 390 | ToUntie = i; |
| 391 | MI.untieRegOperand(OpIdx: ToUntie); |
| 392 | } |
| 393 | } |
| 394 | } |
| 395 | |
| 396 | unsigned NewOpcode = AMDGPU::getMIMGOpcode(BaseOpcode: Info->BaseOpcode, MIMGEncoding: NewEncoding, |
| 397 | VDataDwords: Info->VDataDwords, VAddrDwords: NewAddrDwords); |
| 398 | MI.setDesc(TII->get(Opcode: NewOpcode)); |
| 399 | MI.getOperand(i: VAddr0Idx).setReg(RC->getRegister(i: VgprBase)); |
| 400 | MI.getOperand(i: VAddr0Idx).setIsUndef(IsUndef); |
| 401 | MI.getOperand(i: VAddr0Idx).setIsKill(IsKill); |
| 402 | |
| 403 | for (unsigned i = 1; i < EndVAddr; ++i) |
| 404 | MI.removeOperand(OpNo: VAddr0Idx + 1); |
| 405 | |
| 406 | if (ToUntie >= 0) { |
| 407 | MI.tieOperands( |
| 408 | DefIdx: AMDGPU::getNamedOperandIdx(Opcode: MI.getOpcode(), Name: AMDGPU::OpName::vdata), |
| 409 | UseIdx: ToUntie - (EndVAddr - 1)); |
| 410 | } |
| 411 | } |
| 412 | |
| 413 | // Shrink MAD to MADAK/MADMK and FMA to FMAAK/FMAMK. |
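// A sketch of the two forms (illustrative, not from an actual test):
//   v_fma_f32 v0, v1, v2, lit  ->  v_fmaak_f32 v0, v1, v2, lit  (D = S0*S1 + K)
//   v_fma_f32 v0, v1, lit, v2  ->  v_fmamk_f32 v0, v1, lit, v2  (D = S0*K + S1)
// where "lit" is a 32-bit literal that is not an inline constant.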
| 414 | void SIShrinkInstructions::shrinkMadFma(MachineInstr &MI) const { |
| 415 | // Pre-GFX10 VOP3 instructions like MAD/FMA cannot take a literal operand so |
| 416 | // there is no reason to try to shrink them. |
| 417 | if (!ST->hasVOP3Literal()) |
| 418 | return; |
| 419 | |
| 420 | // There is no advantage to doing this pre-RA. |
| 421 | if (!IsPostRA) |
| 422 | return; |
| 423 | |
| 424 | if (TII->hasAnyModifiersSet(MI)) |
| 425 | return; |
| 426 | |
| 427 | const unsigned Opcode = MI.getOpcode(); |
| 428 | MachineOperand &Src0 = *TII->getNamedOperand(MI, OperandName: AMDGPU::OpName::src0); |
| 429 | MachineOperand &Src1 = *TII->getNamedOperand(MI, OperandName: AMDGPU::OpName::src1); |
| 430 | MachineOperand &Src2 = *TII->getNamedOperand(MI, OperandName: AMDGPU::OpName::src2); |
| 431 | unsigned NewOpcode = AMDGPU::INSTRUCTION_LIST_END; |
| 432 | |
| 433 | bool Swap; |
| 434 | |
| 435 | // Detect "Dst = VSrc * VGPR + Imm" and convert to AK form. |
| 436 | if (Src2.isImm() && !TII->isInlineConstant(MO: Src2)) { |
| 437 | if (Src1.isReg() && TRI->isVGPR(MRI: *MRI, Reg: Src1.getReg())) |
| 438 | Swap = false; |
| 439 | else if (Src0.isReg() && TRI->isVGPR(MRI: *MRI, Reg: Src0.getReg())) |
| 440 | Swap = true; |
| 441 | else |
| 442 | return; |
| 443 | |
| 444 | switch (Opcode) { |
| 445 | default: |
| 446 | llvm_unreachable("Unexpected mad/fma opcode!"); |
| 447 | case AMDGPU::V_MAD_F32_e64: |
| 448 | NewOpcode = AMDGPU::V_MADAK_F32; |
| 449 | break; |
| 450 | case AMDGPU::V_FMA_F32_e64: |
| 451 | NewOpcode = AMDGPU::V_FMAAK_F32; |
| 452 | break; |
| 453 | case AMDGPU::V_MAD_F16_e64: |
| 454 | NewOpcode = AMDGPU::V_MADAK_F16; |
| 455 | break; |
| 456 | case AMDGPU::V_FMA_F16_e64: |
| 457 | case AMDGPU::V_FMA_F16_gfx9_e64: |
| 458 | NewOpcode = AMDGPU::V_FMAAK_F16; |
| 459 | break; |
| 460 | case AMDGPU::V_FMA_F16_gfx9_t16_e64: |
| 461 | NewOpcode = AMDGPU::V_FMAAK_F16_t16; |
| 462 | break; |
| 463 | case AMDGPU::V_FMA_F16_gfx9_fake16_e64: |
| 464 | NewOpcode = AMDGPU::V_FMAAK_F16_fake16; |
| 465 | break; |
| 466 | } |
| 467 | } |
| 468 | |
| 469 | // Detect "Dst = VSrc * Imm + VGPR" and convert to MK form. |
| 470 | if (Src2.isReg() && TRI->isVGPR(MRI: *MRI, Reg: Src2.getReg())) { |
| 471 | if (Src1.isImm() && !TII->isInlineConstant(MO: Src1)) |
| 472 | Swap = false; |
| 473 | else if (Src0.isImm() && !TII->isInlineConstant(MO: Src0)) |
| 474 | Swap = true; |
| 475 | else |
| 476 | return; |
| 477 | |
| 478 | switch (Opcode) { |
| 479 | default: |
| 480 | llvm_unreachable("Unexpected mad/fma opcode!"); |
| 481 | case AMDGPU::V_MAD_F32_e64: |
| 482 | NewOpcode = AMDGPU::V_MADMK_F32; |
| 483 | break; |
| 484 | case AMDGPU::V_FMA_F32_e64: |
| 485 | NewOpcode = AMDGPU::V_FMAMK_F32; |
| 486 | break; |
| 487 | case AMDGPU::V_MAD_F16_e64: |
| 488 | NewOpcode = AMDGPU::V_MADMK_F16; |
| 489 | break; |
| 490 | case AMDGPU::V_FMA_F16_e64: |
| 491 | case AMDGPU::V_FMA_F16_gfx9_e64: |
| 492 | NewOpcode = AMDGPU::V_FMAMK_F16; |
| 493 | break; |
| 494 | case AMDGPU::V_FMA_F16_gfx9_t16_e64: |
| 495 | NewOpcode = AMDGPU::V_FMAMK_F16_t16; |
| 496 | break; |
| 497 | case AMDGPU::V_FMA_F16_gfx9_fake16_e64: |
| 498 | NewOpcode = AMDGPU::V_FMAMK_F16_fake16; |
| 499 | break; |
| 500 | } |
| 501 | } |
| 502 | |
| 503 | if (NewOpcode == AMDGPU::INSTRUCTION_LIST_END) |
| 504 | return; |
| 505 | |
| 506 | if (AMDGPU::isTrue16Inst(Opc: NewOpcode) && !shouldShrinkTrue16(MI)) |
| 507 | return; |
| 508 | |
| 509 | if (Swap) { |
| 510 | // Swap Src0 and Src1 by building a new instruction. |
| 511 | BuildMI(BB&: *MI.getParent(), I&: MI, MIMD: MI.getDebugLoc(), MCID: TII->get(Opcode: NewOpcode), |
| 512 | DestReg: MI.getOperand(i: 0).getReg()) |
| 513 | .add(MO: Src1) |
| 514 | .add(MO: Src0) |
| 515 | .add(MO: Src2) |
| 516 | .setMIFlags(MI.getFlags()); |
| 517 | MI.eraseFromParent(); |
| 518 | } else { |
| 519 | TII->removeModOperands(MI); |
| 520 | MI.setDesc(TII->get(Opcode: NewOpcode)); |
| 521 | } |
| 522 | } |
| 523 | |
| 524 | /// Attempt to shrink AND/OR/XOR operations requiring non-inlineable literals. |
| 525 | /// For AND or OR, try using S_BITSET{0,1} to clear or set bits. |
| 526 | /// If the inverse of the immediate is legal, use ANDN2, ORN2 or |
| 527 | /// XNOR (as a ^ b == ~(a ^ ~b)). |
| 528 | /// \returns true if the caller should continue the machine function iterator |
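///
/// A sketch of the rewrites (illustrative, not from actual tests):
///   s_and_b32 s0, s0, 0xffffdfff  ->  s_bitset0_b32 s0, 13
///   s_or_b32  s0, s0, 0x00010000  ->  s_bitset1_b32 s0, 16
///   s_and_b32 s0, s0, 0xffffffc0  ->  s_andn2_b32 s0, s0, 63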
| 529 | bool SIShrinkInstructions::shrinkScalarLogicOp(MachineInstr &MI) const { |
| 530 | unsigned Opc = MI.getOpcode(); |
| 531 | const MachineOperand *Dest = &MI.getOperand(i: 0); |
| 532 | MachineOperand *Src0 = &MI.getOperand(i: 1); |
| 533 | MachineOperand *Src1 = &MI.getOperand(i: 2); |
| 534 | MachineOperand *SrcReg = Src0; |
| 535 | MachineOperand *SrcImm = Src1; |
| 536 | |
| 537 | if (!SrcImm->isImm() || |
| 538 | AMDGPU::isInlinableLiteral32(Literal: SrcImm->getImm(), HasInv2Pi: ST->hasInv2PiInlineImm())) |
| 539 | return false; |
| 540 | |
| 541 | uint32_t Imm = static_cast<uint32_t>(SrcImm->getImm()); |
| 542 | uint32_t NewImm = 0; |
| 543 | |
| 544 | if (Opc == AMDGPU::S_AND_B32) { |
| 545 | if (isPowerOf2_32(Value: ~Imm)) { |
| 546 | NewImm = llvm::countr_one(Value: Imm); |
| 547 | Opc = AMDGPU::S_BITSET0_B32; |
| 548 | } else if (AMDGPU::isInlinableLiteral32(Literal: ~Imm, HasInv2Pi: ST->hasInv2PiInlineImm())) { |
| 549 | NewImm = ~Imm; |
| 550 | Opc = AMDGPU::S_ANDN2_B32; |
| 551 | } |
| 552 | } else if (Opc == AMDGPU::S_OR_B32) { |
| 553 | if (isPowerOf2_32(Value: Imm)) { |
| 554 | NewImm = llvm::countr_zero(Val: Imm); |
| 555 | Opc = AMDGPU::S_BITSET1_B32; |
| 556 | } else if (AMDGPU::isInlinableLiteral32(Literal: ~Imm, HasInv2Pi: ST->hasInv2PiInlineImm())) { |
| 557 | NewImm = ~Imm; |
| 558 | Opc = AMDGPU::S_ORN2_B32; |
| 559 | } |
| 560 | } else if (Opc == AMDGPU::S_XOR_B32) { |
| 561 | if (AMDGPU::isInlinableLiteral32(Literal: ~Imm, HasInv2Pi: ST->hasInv2PiInlineImm())) { |
| 562 | NewImm = ~Imm; |
| 563 | Opc = AMDGPU::S_XNOR_B32; |
| 564 | } |
| 565 | } else { |
| 566 | llvm_unreachable("unexpected opcode"); |
| 567 | } |
| 568 | |
| 569 | if (NewImm != 0) { |
| 570 | if (Dest->getReg().isVirtual() && SrcReg->isReg()) { |
| 571 | MRI->setRegAllocationHint(VReg: Dest->getReg(), Type: 0, PrefReg: SrcReg->getReg()); |
| 572 | MRI->setRegAllocationHint(VReg: SrcReg->getReg(), Type: 0, PrefReg: Dest->getReg()); |
| 573 | return true; |
| 574 | } |
| 575 | |
| 576 | if (SrcReg->isReg() && SrcReg->getReg() == Dest->getReg()) { |
| 577 | const bool IsUndef = SrcReg->isUndef(); |
| 578 | const bool IsKill = SrcReg->isKill(); |
| 579 | MI.setDesc(TII->get(Opcode: Opc)); |
| 580 | if (Opc == AMDGPU::S_BITSET0_B32 || |
| 581 | Opc == AMDGPU::S_BITSET1_B32) { |
| 582 | Src0->ChangeToImmediate(ImmVal: NewImm); |
| 583 | // Remove the immediate and add the tied input. |
| 584 | MI.getOperand(i: 2).ChangeToRegister(Reg: Dest->getReg(), /*IsDef*/ isDef: false, |
| 585 | /*isImp*/ false, isKill: IsKill, |
| 586 | /*isDead*/ false, isUndef: IsUndef); |
| 587 | MI.tieOperands(DefIdx: 0, UseIdx: 2); |
| 588 | } else { |
| 589 | SrcImm->setImm(NewImm); |
| 590 | } |
| 591 | } |
| 592 | } |
| 593 | |
| 594 | return false; |
| 595 | } |
| 596 | |
| 597 | // This is the same as MachineInstr::readsRegister/modifiesRegister except |
| 598 | // it takes subregs into account. |
| 599 | bool SIShrinkInstructions::instAccessReg( |
| 600 | iterator_range<MachineInstr::const_mop_iterator> &&R, Register Reg, |
| 601 | unsigned SubReg) const { |
| 602 | for (const MachineOperand &MO : R) { |
| 603 | if (!MO.isReg()) |
| 604 | continue; |
| 605 | |
| 606 | if (Reg.isPhysical() && MO.getReg().isPhysical()) { |
| 607 | if (TRI->regsOverlap(RegA: Reg, RegB: MO.getReg())) |
| 608 | return true; |
| 609 | } else if (MO.getReg() == Reg && Reg.isVirtual()) { |
| 610 | LaneBitmask Overlap = TRI->getSubRegIndexLaneMask(SubIdx: SubReg) & |
| 611 | TRI->getSubRegIndexLaneMask(SubIdx: MO.getSubReg()); |
| 612 | if (Overlap.any()) |
| 613 | return true; |
| 614 | } |
| 615 | } |
| 616 | return false; |
| 617 | } |
| 618 | |
| 619 | bool SIShrinkInstructions::instReadsReg(const MachineInstr *MI, unsigned Reg, |
| 620 | unsigned SubReg) const { |
| 621 | return instAccessReg(R: MI->uses(), Reg, SubReg); |
| 622 | } |
| 623 | |
| 624 | bool SIShrinkInstructions::instModifiesReg(const MachineInstr *MI, unsigned Reg, |
| 625 | unsigned SubReg) const { |
| 626 | return instAccessReg(R: MI->defs(), Reg, SubReg); |
| 627 | } |
| 628 | |
| 629 | TargetInstrInfo::RegSubRegPair |
| 630 | SIShrinkInstructions::getSubRegForIndex(Register Reg, unsigned Sub, |
| 631 | unsigned I) const { |
| 632 | if (TRI->getRegSizeInBits(Reg, MRI: *MRI) != 32) { |
| 633 | if (Reg.isPhysical()) { |
| 634 | Reg = TRI->getSubReg(Reg, Idx: TRI->getSubRegFromChannel(Channel: I)); |
| 635 | } else { |
| 636 | Sub = TRI->getSubRegFromChannel(Channel: I + TRI->getChannelFromSubReg(SubReg: Sub)); |
| 637 | } |
| 638 | } |
| 639 | return TargetInstrInfo::RegSubRegPair(Reg, Sub); |
| 640 | } |
| 641 | |
| 642 | void SIShrinkInstructions::dropInstructionKeepingImpDefs( |
| 643 | MachineInstr &MI) const { |
| 644 | for (unsigned i = MI.getDesc().getNumOperands() + |
| 645 | MI.getDesc().implicit_uses().size() + |
| 646 | MI.getDesc().implicit_defs().size(), |
| 647 | e = MI.getNumOperands(); |
| 648 | i != e; ++i) { |
| 649 | const MachineOperand &Op = MI.getOperand(i); |
| 650 | if (!Op.isDef()) |
| 651 | continue; |
| 652 | BuildMI(BB&: *MI.getParent(), I: MI.getIterator(), MIMD: MI.getDebugLoc(), |
| 653 | MCID: TII->get(Opcode: AMDGPU::IMPLICIT_DEF), DestReg: Op.getReg()); |
| 654 | } |
| 655 | |
| 656 | MI.eraseFromParent(); |
| 657 | } |
| 658 | |
| 659 | // Match: |
| 660 | // mov t, x |
| 661 | // mov x, y |
| 662 | // mov y, t |
| 663 | // |
| 664 | // => |
| 665 | // |
| 666 | // mov t, x (t is potentially dead and move eliminated) |
| 667 | // v_swap_b32 x, y |
| 668 | // |
| 669 | // Returns next valid instruction pointer if was able to create v_swap_b32. |
| 670 | // |
| 671 | // This should not be done too early, so as not to block folding that may |
| 672 | // remove the matched moves. It should preferably be done before RA, to |
| 673 | // release the saved registers, and also possibly after RA, which can insert |
| 674 | // copies too. |
| 675 | // |
| 676 | // This is really just a generic peephole that is not a canonical shrinking, |
| 677 | // although requirements match the pass placement and it reduces code size too. |
| 678 | MachineInstr *SIShrinkInstructions::matchSwap(MachineInstr &MovT) const { |
| 679 | assert(MovT.getOpcode() == AMDGPU::V_MOV_B32_e32 || |
| 680 | MovT.getOpcode() == AMDGPU::V_MOV_B16_t16_e32 || |
| 681 | MovT.getOpcode() == AMDGPU::COPY); |
| 682 | |
| 683 | Register T = MovT.getOperand(i: 0).getReg(); |
| 684 | unsigned Tsub = MovT.getOperand(i: 0).getSubReg(); |
| 685 | MachineOperand &Xop = MovT.getOperand(i: 1); |
| 686 | |
| 687 | if (!Xop.isReg()) |
| 688 | return nullptr; |
| 689 | Register X = Xop.getReg(); |
| 690 | unsigned Xsub = Xop.getSubReg(); |
| 691 | |
| 692 | unsigned Size = TII->getOpSize(MI: MovT, OpNo: 0); |
| 693 | |
| 694 | // We can't match v_swap_b16 pre-RA, because VGPR_16_Lo128 registers |
| 695 | // are not allocatable. |
| 696 | if (Size == 2 && X.isVirtual()) |
| 697 | return nullptr; |
| 698 | |
| 699 | if (!TRI->isVGPR(MRI: *MRI, Reg: X)) |
| 700 | return nullptr; |
| 701 | |
| 702 | const unsigned SearchLimit = 16; |
| 703 | unsigned Count = 0; |
| 704 | bool KilledT = false; |
| 705 | for (auto Iter = std::next(x: MovT.getIterator()), |
| 706 | E = MovT.getParent()->instr_end(); |
| 707 | Iter != E && Count < SearchLimit && !KilledT; ++Iter, ++Count) { |
| 708 | |
| 709 | MachineInstr *MovY = &*Iter; |
| 710 | KilledT = MovY->killsRegister(Reg: T, TRI); |
| 711 | |
| 712 | if ((MovY->getOpcode() != AMDGPU::V_MOV_B32_e32 && |
| 713 | MovY->getOpcode() != AMDGPU::V_MOV_B16_t16_e32 && |
| 714 | MovY->getOpcode() != AMDGPU::COPY) || |
| 715 | !MovY->getOperand(i: 1).isReg() || MovY->getOperand(i: 1).getReg() != T || |
| 716 | MovY->getOperand(i: 1).getSubReg() != Tsub) |
| 717 | continue; |
| 718 | |
| 719 | Register Y = MovY->getOperand(i: 0).getReg(); |
| 720 | unsigned Ysub = MovY->getOperand(i: 0).getSubReg(); |
| 721 | |
| 722 | if (!TRI->isVGPR(MRI: *MRI, Reg: Y)) |
| 723 | continue; |
| 724 | |
| 725 | MachineInstr *MovX = nullptr; |
| 726 | for (auto IY = MovY->getIterator(), I = std::next(x: MovT.getIterator()); |
| 727 | I != IY; ++I) { |
| 728 | if (instReadsReg(MI: &*I, Reg: X, SubReg: Xsub) || instModifiesReg(MI: &*I, Reg: Y, SubReg: Ysub) || |
| 729 | instModifiesReg(MI: &*I, Reg: T, SubReg: Tsub) || |
| 730 | (MovX && instModifiesReg(MI: &*I, Reg: X, SubReg: Xsub))) { |
| 731 | MovX = nullptr; |
| 732 | break; |
| 733 | } |
| 734 | if (!instReadsReg(MI: &*I, Reg: Y, SubReg: Ysub)) { |
| 735 | if (!MovX && instModifiesReg(MI: &*I, Reg: X, SubReg: Xsub)) { |
| 736 | MovX = nullptr; |
| 737 | break; |
| 738 | } |
| 739 | continue; |
| 740 | } |
| 741 | if (MovX || |
| 742 | (I->getOpcode() != AMDGPU::V_MOV_B32_e32 && |
| 743 | I->getOpcode() != AMDGPU::V_MOV_B16_t16_e32 && |
| 744 | I->getOpcode() != AMDGPU::COPY) || |
| 745 | I->getOperand(i: 0).getReg() != X || |
| 746 | I->getOperand(i: 0).getSubReg() != Xsub) { |
| 747 | MovX = nullptr; |
| 748 | break; |
| 749 | } |
| 750 | |
| 751 | if (Size > 4 && (I->getNumImplicitOperands() > (I->isCopy() ? 0U : 1U))) |
| 752 | continue; |
| 753 | |
| 754 | MovX = &*I; |
| 755 | } |
| 756 | |
| 757 | if (!MovX) |
| 758 | continue; |
| 759 | |
| 760 | LLVM_DEBUG(dbgs() << "Matched v_swap:\n" << MovT << *MovX << *MovY); |
| 761 | |
| 762 | MachineBasicBlock &MBB = *MovT.getParent(); |
| 763 | SmallVector<MachineInstr *, 4> Swaps; |
| 764 | if (Size == 2) { |
| 765 | auto *MIB = BuildMI(BB&: MBB, I: MovX->getIterator(), MIMD: MovT.getDebugLoc(), |
| 766 | MCID: TII->get(Opcode: AMDGPU::V_SWAP_B16)) |
| 767 | .addDef(RegNo: X) |
| 768 | .addDef(RegNo: Y) |
| 769 | .addReg(RegNo: Y) |
| 770 | .addReg(RegNo: X) |
| 771 | .getInstr(); |
| 772 | Swaps.push_back(Elt: MIB); |
| 773 | } else { |
| 774 | assert(Size > 0 && Size % 4 == 0); |
| 775 | for (unsigned I = 0; I < Size / 4; ++I) { |
| 776 | TargetInstrInfo::RegSubRegPair X1, Y1; |
| 777 | X1 = getSubRegForIndex(Reg: X, Sub: Xsub, I); |
| 778 | Y1 = getSubRegForIndex(Reg: Y, Sub: Ysub, I); |
| 779 | auto *MIB = BuildMI(BB&: MBB, I: MovX->getIterator(), MIMD: MovT.getDebugLoc(), |
| 780 | MCID: TII->get(Opcode: AMDGPU::V_SWAP_B32)) |
| 781 | .addDef(RegNo: X1.Reg, Flags: 0, SubReg: X1.SubReg) |
| 782 | .addDef(RegNo: Y1.Reg, Flags: 0, SubReg: Y1.SubReg) |
| 783 | .addReg(RegNo: Y1.Reg, flags: 0, SubReg: Y1.SubReg) |
| 784 | .addReg(RegNo: X1.Reg, flags: 0, SubReg: X1.SubReg) |
| 785 | .getInstr(); |
| 786 | Swaps.push_back(Elt: MIB); |
| 787 | } |
| 788 | } |
| 789 | // Drop implicit EXEC. |
| 790 | if (MovX->hasRegisterImplicitUseOperand(Reg: AMDGPU::EXEC)) { |
| 791 | for (MachineInstr *Swap : Swaps) { |
| 792 | Swap->removeOperand(OpNo: Swap->getNumExplicitOperands()); |
| 793 | Swap->copyImplicitOps(MF&: *MBB.getParent(), MI: *MovX); |
| 794 | } |
| 795 | } |
| 796 | MovX->eraseFromParent(); |
| 797 | dropInstructionKeepingImpDefs(MI&: *MovY); |
| 798 | MachineInstr *Next = &*std::next(x: MovT.getIterator()); |
| 799 | |
| 800 | if (T.isVirtual() && MRI->use_nodbg_empty(RegNo: T)) { |
| 801 | dropInstructionKeepingImpDefs(MI&: MovT); |
| 802 | } else { |
| 803 | Xop.setIsKill(false); |
| 804 | for (int I = MovT.getNumImplicitOperands() - 1; I >= 0; --I) { |
| 805 | unsigned OpNo = MovT.getNumExplicitOperands() + I; |
| 806 | const MachineOperand &Op = MovT.getOperand(i: OpNo); |
| 807 | if (Op.isKill() && TRI->regsOverlap(RegA: X, RegB: Op.getReg())) |
| 808 | MovT.removeOperand(OpNo); |
| 809 | } |
| 810 | } |
| 811 | |
| 812 | return Next; |
| 813 | } |
| 814 | |
| 815 | return nullptr; |
| 816 | } |
| 817 | |
| 818 | // If an instruction has a dead sdst, replace it with the NULL register on gfx1030+. |
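// For example (a sketch, not from an actual test), the unused carry-out sdst of
// a V_ADD_CO_U32_e64 can be redirected to the null register so it no longer
// occupies a real SGPR (SGPR_NULL64 in wave64).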
| 819 | bool SIShrinkInstructions::tryReplaceDeadSDST(MachineInstr &MI) const { |
| 820 | if (!ST->hasGFX10_3Insts()) |
| 821 | return false; |
| 822 | |
| 823 | MachineOperand *Op = TII->getNamedOperand(MI, OperandName: AMDGPU::OpName::sdst); |
| 824 | if (!Op) |
| 825 | return false; |
| 826 | Register SDstReg = Op->getReg(); |
| 827 | if (SDstReg.isPhysical() || !MRI->use_nodbg_empty(RegNo: SDstReg)) |
| 828 | return false; |
| 829 | |
| 830 | Op->setReg(ST->isWave32() ? AMDGPU::SGPR_NULL : AMDGPU::SGPR_NULL64); |
| 831 | return true; |
| 832 | } |
| 833 | |
| 834 | bool SIShrinkInstructions::run(MachineFunction &MF) { |
| 835 | |
| 836 | this->MF = &MF; |
| 837 | MRI = &MF.getRegInfo(); |
| 838 | ST = &MF.getSubtarget<GCNSubtarget>(); |
| 839 | TII = ST->getInstrInfo(); |
| 840 | TRI = &TII->getRegisterInfo(); |
| 841 | IsPostRA = MF.getProperties().hasNoVRegs(); |
| 842 | |
| 843 | unsigned VCCReg = ST->isWave32() ? AMDGPU::VCC_LO : AMDGPU::VCC; |
| 844 | |
| 845 | for (MachineBasicBlock &MBB : MF) { |
| 846 | MachineBasicBlock::iterator I, Next; |
| 847 | for (I = MBB.begin(); I != MBB.end(); I = Next) { |
| 848 | Next = std::next(x: I); |
| 849 | MachineInstr &MI = *I; |
| 850 | |
| 851 | if (MI.getOpcode() == AMDGPU::V_MOV_B32_e32) { |
| 852 | // If this has a literal constant source that is the same as the |
| 853 | // reversed bits of an inline immediate, replace with a bitreverse of |
| 854 | // that constant. This saves 4 bytes in the common case of materializing |
| 855 | // sign bits. |
| 856 | |
| 857 | // Test if we are after regalloc. We only want to do this after any |
| 858 | // optimizations happen because this will confuse them. |
| 859 | MachineOperand &Src = MI.getOperand(i: 1); |
| 860 | if (Src.isImm() && IsPostRA) { |
| 861 | int32_t ModImm; |
| 862 | unsigned ModOpcode = |
| 863 | canModifyToInlineImmOp32(TII, Src, ModifiedImm&: ModImm, /*Scalar=*/false); |
| 864 | if (ModOpcode != 0) { |
| 865 | MI.setDesc(TII->get(Opcode: ModOpcode)); |
| 866 | Src.setImm(static_cast<int64_t>(ModImm)); |
| 867 | continue; |
| 868 | } |
| 869 | } |
| 870 | } |
| 871 | |
| 872 | if (ST->hasSwap() && (MI.getOpcode() == AMDGPU::V_MOV_B32_e32 || |
| 873 | MI.getOpcode() == AMDGPU::V_MOV_B16_t16_e32 || |
| 874 | MI.getOpcode() == AMDGPU::COPY)) { |
| 875 | if (auto *NextMI = matchSwap(MovT&: MI)) { |
| 876 | Next = NextMI->getIterator(); |
| 877 | continue; |
| 878 | } |
| 879 | } |
| 880 | |
| 881 | // Try to use S_ADDK_I32 and S_MULK_I32. |
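// A sketch (not from an actual test): s_add_i32 s0, s0, 0x1234 becomes
// s_addk_i32 s0, 0x1234 when the destination matches a source register.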
| 882 | if (MI.getOpcode() == AMDGPU::S_ADD_I32 || |
| 883 | MI.getOpcode() == AMDGPU::S_MUL_I32) { |
| 884 | const MachineOperand *Dest = &MI.getOperand(i: 0); |
| 885 | MachineOperand *Src0 = &MI.getOperand(i: 1); |
| 886 | MachineOperand *Src1 = &MI.getOperand(i: 2); |
| 887 | |
| 888 | if (!Src0->isReg() && Src1->isReg()) { |
| 889 | if (TII->commuteInstruction(MI, NewMI: false, OpIdx1: 1, OpIdx2: 2)) |
| 890 | std::swap(a&: Src0, b&: Src1); |
| 891 | } |
| 892 | |
| 893 | // FIXME: This could work better if hints worked with subregisters. If |
| 894 | // we have a vector add of a constant, we usually don't get the correct |
| 895 | // allocation due to the subregister usage. |
| 896 | if (Dest->getReg().isVirtual() && Src0->isReg()) { |
| 897 | MRI->setRegAllocationHint(VReg: Dest->getReg(), Type: 0, PrefReg: Src0->getReg()); |
| 898 | MRI->setRegAllocationHint(VReg: Src0->getReg(), Type: 0, PrefReg: Dest->getReg()); |
| 899 | continue; |
| 900 | } |
| 901 | |
| 902 | if (Src0->isReg() && Src0->getReg() == Dest->getReg()) { |
| 903 | if (Src1->isImm() && isKImmOperand(Src: *Src1)) { |
| 904 | unsigned Opc = (MI.getOpcode() == AMDGPU::S_ADD_I32) ? |
| 905 | AMDGPU::S_ADDK_I32 : AMDGPU::S_MULK_I32; |
| 906 | |
| 907 | Src1->setImm(SignExtend64(X: Src1->getImm(), B: 32)); |
| 908 | MI.setDesc(TII->get(Opcode: Opc)); |
| 909 | MI.tieOperands(DefIdx: 0, UseIdx: 1); |
| 910 | } |
| 911 | } |
| 912 | } |
| 913 | |
| 914 | // Try to use s_cmpk_* |
| 915 | if (MI.isCompare() && TII->isSOPC(MI)) { |
| 916 | shrinkScalarCompare(MI); |
| 917 | continue; |
| 918 | } |
| 919 | |
| 920 | // Try to use S_MOVK_I32, which will save 4 bytes for small immediates. |
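// A sketch (not from an actual test): s_mov_b32 s0, 0xffff8000 becomes
// s_movk_i32 s0, 0x8000, which sign-extends the 16-bit immediate.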
| 921 | if (MI.getOpcode() == AMDGPU::S_MOV_B32) { |
| 922 | const MachineOperand &Dst = MI.getOperand(i: 0); |
| 923 | MachineOperand &Src = MI.getOperand(i: 1); |
| 924 | |
| 925 | if (Src.isImm() && Dst.getReg().isPhysical()) { |
| 926 | unsigned ModOpc; |
| 927 | int32_t ModImm; |
| 928 | if (isKImmOperand(Src)) { |
| 929 | MI.setDesc(TII->get(Opcode: AMDGPU::S_MOVK_I32)); |
| 930 | Src.setImm(SignExtend64(X: Src.getImm(), B: 32)); |
| 931 | } else if ((ModOpc = canModifyToInlineImmOp32(TII, Src, ModifiedImm&: ModImm, |
| 932 | /*Scalar=*/true))) { |
| 933 | MI.setDesc(TII->get(Opcode: ModOpc)); |
| 934 | Src.setImm(static_cast<int64_t>(ModImm)); |
| 935 | } |
| 936 | } |
| 937 | |
| 938 | continue; |
| 939 | } |
| 940 | |
| 941 | // Shrink scalar logic operations. |
| 942 | if (MI.getOpcode() == AMDGPU::S_AND_B32 || |
| 943 | MI.getOpcode() == AMDGPU::S_OR_B32 || |
| 944 | MI.getOpcode() == AMDGPU::S_XOR_B32) { |
| 945 | if (shrinkScalarLogicOp(MI)) |
| 946 | continue; |
| 947 | } |
| 948 | |
| 949 | if (IsPostRA && TII->isMIMG(Opcode: MI.getOpcode()) && |
| 950 | ST->getGeneration() >= AMDGPUSubtarget::GFX10) { |
| 951 | shrinkMIMG(MI); |
| 952 | continue; |
| 953 | } |
| 954 | |
| 955 | if (!TII->isVOP3(MI)) |
| 956 | continue; |
| 957 | |
| 958 | if (MI.getOpcode() == AMDGPU::V_MAD_F32_e64 || |
| 959 | MI.getOpcode() == AMDGPU::V_FMA_F32_e64 || |
| 960 | MI.getOpcode() == AMDGPU::V_MAD_F16_e64 || |
| 961 | MI.getOpcode() == AMDGPU::V_FMA_F16_e64 || |
| 962 | MI.getOpcode() == AMDGPU::V_FMA_F16_gfx9_e64 || |
| 963 | MI.getOpcode() == AMDGPU::V_FMA_F16_gfx9_t16_e64 || |
| 964 | MI.getOpcode() == AMDGPU::V_FMA_F16_gfx9_fake16_e64) { |
| 965 | shrinkMadFma(MI); |
| 966 | continue; |
| 967 | } |
| 968 | |
| 969 | // If there is no chance we will shrink it and use VCC as sdst to get |
| 970 | // a 32-bit form, try to replace a dead sdst with NULL. |
| 971 | if (TII->isVOP3(Opcode: MI.getOpcode())) { |
| 972 | tryReplaceDeadSDST(MI); |
| 973 | if (!TII->hasVALU32BitEncoding(Opcode: MI.getOpcode())) { |
| 974 | continue; |
| 975 | } |
| 976 | } |
| 977 | |
| 978 | if (!TII->canShrink(MI, MRI: *MRI)) { |
| 979 | // Try commuting the instruction and see if that enables us to shrink |
| 980 | // it. |
| 981 | if (!MI.isCommutable() || !TII->commuteInstruction(MI) || |
| 982 | !TII->canShrink(MI, MRI: *MRI)) { |
| 983 | tryReplaceDeadSDST(MI); |
| 984 | continue; |
| 985 | } |
| 986 | } |
| 987 | |
| 988 | int Op32 = AMDGPU::getVOPe32(Opcode: MI.getOpcode()); |
| 989 | |
| 990 | if (TII->isVOPC(Opcode: Op32)) { |
| 991 | MachineOperand &Op0 = MI.getOperand(i: 0); |
| 992 | if (Op0.isReg()) { |
| 993 | // Exclude VOPCX instructions as these don't explicitly write a |
| 994 | // dst. |
| 995 | Register DstReg = Op0.getReg(); |
| 996 | if (DstReg.isVirtual()) { |
| 997 | // VOPC instructions can only write to the VCC register. We can't |
| 998 | // force them to use VCC here, because this is only one register and |
| 999 | // cannot deal with sequences which would require multiple copies of |
| 1000 | // VCC, e.g. S_AND_B64 (vcc = V_CMP_...), (vcc = V_CMP_...) |
| 1001 | // |
| 1002 | // So, instead of forcing the instruction to write to VCC, we |
| 1003 | // provide a hint to the register allocator to use VCC and then we |
| 1004 | // will run this pass again after RA and shrink it if it outputs to |
| 1005 | // VCC. |
| 1006 | MRI->setRegAllocationHint(VReg: DstReg, Type: 0, PrefReg: VCCReg); |
| 1007 | continue; |
| 1008 | } |
| 1009 | if (DstReg != VCCReg) |
| 1010 | continue; |
| 1011 | } |
| 1012 | } |
| 1013 | |
| 1014 | if (Op32 == AMDGPU::V_CNDMASK_B32_e32) { |
| 1015 | // We shrink V_CNDMASK_B32_e64 using regalloc hints like we do for VOPC |
| 1016 | // instructions. |
| 1017 | const MachineOperand *Src2 = |
| 1018 | TII->getNamedOperand(MI, OperandName: AMDGPU::OpName::src2); |
| 1019 | if (!Src2->isReg()) |
| 1020 | continue; |
| 1021 | Register SReg = Src2->getReg(); |
| 1022 | if (SReg.isVirtual()) { |
| 1023 | MRI->setRegAllocationHint(VReg: SReg, Type: 0, PrefReg: VCCReg); |
| 1024 | continue; |
| 1025 | } |
| 1026 | if (SReg != VCCReg) |
| 1027 | continue; |
| 1028 | } |
| 1029 | |
| 1030 | // Check for the bool flag output for instructions like V_ADD_I32_e64. |
| 1031 | const MachineOperand *SDst = TII->getNamedOperand(MI, |
| 1032 | OperandName: AMDGPU::OpName::sdst); |
| 1033 | |
| 1034 | if (SDst) { |
| 1035 | bool Next = false; |
| 1036 | |
| 1037 | if (SDst->getReg() != VCCReg) { |
| 1038 | if (SDst->getReg().isVirtual()) |
| 1039 | MRI->setRegAllocationHint(VReg: SDst->getReg(), Type: 0, PrefReg: VCCReg); |
| 1040 | Next = true; |
| 1041 | } |
| 1042 | |
| 1043 | // All of the instructions with carry outs also have an SGPR input in |
| 1044 | // src2. |
| 1045 | const MachineOperand *Src2 = TII->getNamedOperand(MI, |
| 1046 | OperandName: AMDGPU::OpName::src2); |
| 1047 | if (Src2 && Src2->getReg() != VCCReg) { |
| 1048 | if (Src2->getReg().isVirtual()) |
| 1049 | MRI->setRegAllocationHint(VReg: Src2->getReg(), Type: 0, PrefReg: VCCReg); |
| 1050 | Next = true; |
| 1051 | } |
| 1052 | |
| 1053 | if (Next) |
| 1054 | continue; |
| 1055 | } |
| 1056 | |
| 1057 | // Pre-GFX10, shrinking VOP3 instructions pre-RA gave us the chance to |
| 1058 | // fold an immediate into the shrunk instruction as a literal operand. In |
| 1059 | // GFX10 VOP3 instructions can take a literal operand anyway, so there is |
| 1060 | // no advantage to doing this. |
| 1061 | if (ST->hasVOP3Literal() && !IsPostRA) |
| 1062 | continue; |
| 1063 | |
| 1064 | if (ST->hasTrue16BitInsts() && AMDGPU::isTrue16Inst(Opc: MI.getOpcode()) && |
| 1065 | !shouldShrinkTrue16(MI)) |
| 1066 | continue; |
| 1067 | |
| 1068 | // We can shrink this instruction |
| 1069 | LLVM_DEBUG(dbgs() << "Shrinking " << MI); |
| 1070 | |
| 1071 | MachineInstr *Inst32 = TII->buildShrunkInst(MI, NewOpcode: Op32); |
| 1072 | ++NumInstructionsShrunk; |
| 1073 | |
| 1074 | // Copy extra operands not present in the instruction definition. |
| 1075 | copyExtraImplicitOps(NewMI&: *Inst32, MI); |
| 1076 | |
| 1077 | // Copy deadness from the old explicit vcc def to the new implicit def. |
| 1078 | if (SDst && SDst->isDead()) |
| 1079 | Inst32->findRegisterDefOperand(Reg: VCCReg, /*TRI=*/nullptr)->setIsDead(); |
| 1080 | |
| 1081 | MI.eraseFromParent(); |
| 1082 | foldImmediates(MI&: *Inst32); |
| 1083 | |
| 1084 | LLVM_DEBUG(dbgs() << "e32 MI = " << *Inst32 << '\n'); |
| 1085 | } |
| 1086 | } |
| 1087 | return false; |
| 1088 | } |
| 1089 | |
| 1090 | bool SIShrinkInstructionsLegacy::runOnMachineFunction(MachineFunction &MF) { |
| 1091 | if (skipFunction(F: MF.getFunction())) |
| 1092 | return false; |
| 1093 | |
| 1094 | return SIShrinkInstructions().run(MF); |
| 1095 | } |
| 1096 | |
| 1097 | PreservedAnalyses |
| 1098 | SIShrinkInstructionsPass::run(MachineFunction &MF, |
| 1099 | MachineFunctionAnalysisManager &) { |
| 1100 | if (MF.getFunction().hasOptNone() || !SIShrinkInstructions().run(MF)) |
| 1101 | return PreservedAnalyses::all(); |
| 1102 | |
| 1103 | auto PA = getMachineFunctionPassPreservedAnalyses(); |
| 1104 | PA.preserveSet<CFGAnalyses>(); |
| 1105 | return PA; |
| 1106 | } |
| 1107 | |