//===-- SIShrinkInstructions.cpp - Shrink Instructions --------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
/// The pass tries to use the 32-bit encoding for instructions when possible.
//===----------------------------------------------------------------------===//
//

#include "SIShrinkInstructions.h"
#include "AMDGPU.h"
#include "GCNSubtarget.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/CodeGen/MachineFunctionPass.h"

#define DEBUG_TYPE "si-shrink-instructions"

STATISTIC(NumInstructionsShrunk,
          "Number of 64-bit instructions reduced to 32-bit.");
STATISTIC(NumLiteralConstantsFolded,
          "Number of literal constants folded into 32-bit instructions.");

using namespace llvm;

namespace {

class SIShrinkInstructions {
  MachineFunction *MF;
  MachineRegisterInfo *MRI;
  const GCNSubtarget *ST;
  const SIInstrInfo *TII;
  const SIRegisterInfo *TRI;
  bool IsPostRA;

  bool foldImmediates(MachineInstr &MI, bool TryToCommute = true) const;
  bool shouldShrinkTrue16(MachineInstr &MI) const;
  bool isKImmOperand(const MachineOperand &Src) const;
  bool isKUImmOperand(const MachineOperand &Src) const;
  bool isKImmOrKUImmOperand(const MachineOperand &Src, bool &IsUnsigned) const;
  void copyExtraImplicitOps(MachineInstr &NewMI, MachineInstr &MI) const;
  void shrinkScalarCompare(MachineInstr &MI) const;
  void shrinkMIMG(MachineInstr &MI) const;
  void shrinkMadFma(MachineInstr &MI) const;
  bool shrinkScalarLogicOp(MachineInstr &MI) const;
  bool tryReplaceDeadSDST(MachineInstr &MI) const;
  bool instAccessReg(iterator_range<MachineInstr::const_mop_iterator> &&R,
                     Register Reg, unsigned SubReg) const;
  bool instReadsReg(const MachineInstr *MI, unsigned Reg,
                    unsigned SubReg) const;
  bool instModifiesReg(const MachineInstr *MI, unsigned Reg,
                       unsigned SubReg) const;
  TargetInstrInfo::RegSubRegPair getSubRegForIndex(Register Reg, unsigned Sub,
                                                   unsigned I) const;
  void dropInstructionKeepingImpDefs(MachineInstr &MI) const;
  MachineInstr *matchSwap(MachineInstr &MovT) const;

public:
  SIShrinkInstructions() = default;
  bool run(MachineFunction &MF);
};

class SIShrinkInstructionsLegacy : public MachineFunctionPass {

public:
  static char ID;

  SIShrinkInstructionsLegacy() : MachineFunctionPass(ID) {}

  bool runOnMachineFunction(MachineFunction &MF) override;

  StringRef getPassName() const override { return "SI Shrink Instructions"; }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.setPreservesCFG();
    MachineFunctionPass::getAnalysisUsage(AU);
  }
};

} // End anonymous namespace.

INITIALIZE_PASS(SIShrinkInstructionsLegacy, DEBUG_TYPE,
                "SI Shrink Instructions", false, false)

char SIShrinkInstructionsLegacy::ID = 0;

FunctionPass *llvm::createSIShrinkInstructionsLegacyPass() {
  return new SIShrinkInstructionsLegacy();
}

/// This function checks \p MI for operands defined by a move immediate
/// instruction and then folds the literal constant into the instruction if it
/// can. This function assumes that \p MI is a VOP1, VOP2, or VOPC instruction.
bool SIShrinkInstructions::foldImmediates(MachineInstr &MI,
                                          bool TryToCommute) const {
  assert(TII->isVOP1(MI) || TII->isVOP2(MI) || TII->isVOPC(MI));

  int Src0Idx =
      AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::src0);

  // Try to fold Src0
  MachineOperand &Src0 = MI.getOperand(Src0Idx);
  if (Src0.isReg()) {
    Register Reg = Src0.getReg();
    if (Reg.isVirtual()) {
      MachineInstr *Def = MRI->getUniqueVRegDef(Reg);
      if (Def && Def->isMoveImmediate()) {
        MachineOperand &MovSrc = Def->getOperand(1);
        bool ConstantFolded = false;

        if (TII->isOperandLegal(MI, Src0Idx, &MovSrc)) {
          if (MovSrc.isImm()) {
            Src0.ChangeToImmediate(MovSrc.getImm());
            ConstantFolded = true;
          } else if (MovSrc.isFI()) {
            Src0.ChangeToFrameIndex(MovSrc.getIndex());
            ConstantFolded = true;
          } else if (MovSrc.isGlobal()) {
            Src0.ChangeToGA(MovSrc.getGlobal(), MovSrc.getOffset(),
                            MovSrc.getTargetFlags());
            ConstantFolded = true;
          }
        }

        if (ConstantFolded) {
          if (MRI->use_nodbg_empty(Reg))
            Def->eraseFromParent();
          ++NumLiteralConstantsFolded;
          return true;
        }
      }
    }
  }

  // We have failed to fold src0, so commute the instruction and try again.
  if (TryToCommute && MI.isCommutable()) {
    if (TII->commuteInstruction(MI)) {
      if (foldImmediates(MI, false))
        return true;

      // Commute back.
      TII->commuteInstruction(MI);
    }
  }

  return false;
}

/// Do not shrink the instruction if its registers are not expressible in the
/// shrunk encoding.
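/// True16 instructions in their shrunk 32-bit encodings can only address the
/// low half of the VGPR file, so every register operand must be in the Lo128
/// register classes.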
bool SIShrinkInstructions::shouldShrinkTrue16(MachineInstr &MI) const {
  for (unsigned I = 0, E = MI.getNumExplicitOperands(); I != E; ++I) {
    const MachineOperand &MO = MI.getOperand(I);
    if (MO.isReg()) {
      Register Reg = MO.getReg();
      assert(!Reg.isVirtual() && "Prior checks should ensure we only shrink "
                                 "True16 Instructions post-RA");
      if (AMDGPU::VGPR_32RegClass.contains(Reg) &&
          !AMDGPU::VGPR_32_Lo128RegClass.contains(Reg))
        return false;

      if (AMDGPU::VGPR_16RegClass.contains(Reg) &&
          !AMDGPU::VGPR_16_Lo128RegClass.contains(Reg))
        return false;
    }
  }
  return true;
}

bool SIShrinkInstructions::isKImmOperand(const MachineOperand &Src) const {
  return isInt<16>(SignExtend64(Src.getImm(), 32)) &&
         !TII->isInlineConstant(*Src.getParent(), Src.getOperandNo());
}

bool SIShrinkInstructions::isKUImmOperand(const MachineOperand &Src) const {
  return isUInt<16>(Src.getImm()) &&
         !TII->isInlineConstant(*Src.getParent(), Src.getOperandNo());
}

bool SIShrinkInstructions::isKImmOrKUImmOperand(const MachineOperand &Src,
                                                bool &IsUnsigned) const {
  if (isInt<16>(SignExtend64(Src.getImm(), 32))) {
    IsUnsigned = false;
    return !TII->isInlineConstant(Src);
  }

  if (isUInt<16>(Src.getImm())) {
    IsUnsigned = true;
    return !TII->isInlineConstant(Src);
  }

  return false;
}

/// \returns the opcode of an instruction a move immediate of the constant \p
/// Src can be replaced with if the constant is replaced with \p ModifiedImm.
/// i.e.
///
/// If the bitreverse of a constant is an inline immediate, reverse the
/// immediate and return the bitreverse opcode.
///
/// If the bitwise negation of a constant is an inline immediate, invert the
/// immediate and return the bitwise not opcode.
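///
/// For example, a move of the literal 0x3fffffff can instead be encoded as
/// "v_bfrev_b32 -4", and a move of 0xffffffc0 as "v_not_b32 63"; both
/// replacement source operands are inline constants.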
static unsigned canModifyToInlineImmOp32(const SIInstrInfo *TII,
                                         const MachineOperand &Src,
                                         int32_t &ModifiedImm, bool Scalar) {
  if (TII->isInlineConstant(Src))
    return 0;
  int32_t SrcImm = static_cast<int32_t>(Src.getImm());

  if (!Scalar) {
    // We could handle the scalar case here, but we would need to check that
    // SCC is not live as S_NOT_B32 clobbers it. It's probably not worth it,
    // as the reasonable values are already covered by s_movk_i32.
    ModifiedImm = ~SrcImm;
    if (TII->isInlineConstant(APInt(32, ModifiedImm, true)))
      return AMDGPU::V_NOT_B32_e32;
  }

  ModifiedImm = reverseBits<int32_t>(SrcImm);
  if (TII->isInlineConstant(APInt(32, ModifiedImm, true)))
    return Scalar ? AMDGPU::S_BREV_B32 : AMDGPU::V_BFREV_B32_e32;

  return 0;
}

/// Copy implicit register operands from specified instruction to this
/// instruction that are not part of the instruction definition.
void SIShrinkInstructions::copyExtraImplicitOps(MachineInstr &NewMI,
                                                MachineInstr &MI) const {
  MachineFunction &MF = *MI.getMF();
  for (unsigned i = MI.getDesc().getNumOperands() +
                    MI.getDesc().implicit_uses().size() +
                    MI.getDesc().implicit_defs().size(),
                e = MI.getNumOperands();
       i != e; ++i) {
    const MachineOperand &MO = MI.getOperand(i);
    if ((MO.isReg() && MO.isImplicit()) || MO.isRegMask())
      NewMI.addOperand(MF, MO);
  }
}

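// Shrink scalar compares to the SOPK (s_cmpk_*) forms when the immediate fits
// in 16 bits, e.g. "s_cmp_eq_u32 s0, 0x1234" becomes "s_cmpk_eq_u32 s0,
// 0x1234", dropping the 32-bit literal dword.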
void SIShrinkInstructions::shrinkScalarCompare(MachineInstr &MI) const {
  if (!ST->hasSCmpK())
    return;

  // cmpk instructions do scc = dst <cc op> imm16, so commute the instruction
  // to get constants on the RHS.
  if (!MI.getOperand(0).isReg())
    TII->commuteInstruction(MI, false, 0, 1);

  // cmpk requires src0 to be a register
  const MachineOperand &Src0 = MI.getOperand(0);
  if (!Src0.isReg())
    return;

  MachineOperand &Src1 = MI.getOperand(1);
  if (!Src1.isImm())
    return;

  int SOPKOpc = AMDGPU::getSOPKOp(MI.getOpcode());
  if (SOPKOpc == -1)
    return;

  // eq/ne is special because the imm16 can be treated as signed or unsigned,
  // and initially selected to the unsigned versions.
  if (SOPKOpc == AMDGPU::S_CMPK_EQ_U32 || SOPKOpc == AMDGPU::S_CMPK_LG_U32) {
    bool HasUImm;
    if (isKImmOrKUImmOperand(Src1, HasUImm)) {
      if (!HasUImm) {
        SOPKOpc = (SOPKOpc == AMDGPU::S_CMPK_EQ_U32) ? AMDGPU::S_CMPK_EQ_I32
                                                     : AMDGPU::S_CMPK_LG_I32;
        Src1.setImm(SignExtend32(Src1.getImm(), 32));
      }

      MI.setDesc(TII->get(SOPKOpc));
    }

    return;
  }

  const MCInstrDesc &NewDesc = TII->get(SOPKOpc);

  if ((SIInstrInfo::sopkIsZext(SOPKOpc) && isKUImmOperand(Src1)) ||
      (!SIInstrInfo::sopkIsZext(SOPKOpc) && isKImmOperand(Src1))) {
    if (!SIInstrInfo::sopkIsZext(SOPKOpc))
      Src1.setImm(SignExtend64(Src1.getImm(), 32));
    MI.setDesc(NewDesc);
  }
}

// Shrink NSA encoded instructions with contiguous VGPRs to non-NSA encoding.
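// For example, if the NSA form "image_sample v0, [v4, v5, v6], ..." happens to
// have its address registers allocated contiguously, it can be re-encoded in
// the shorter non-NSA form "image_sample v0, v[4:6], ...".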
void SIShrinkInstructions::shrinkMIMG(MachineInstr &MI) const {
  const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(MI.getOpcode());
  if (!Info)
    return;

  uint8_t NewEncoding;
  switch (Info->MIMGEncoding) {
  case AMDGPU::MIMGEncGfx10NSA:
    NewEncoding = AMDGPU::MIMGEncGfx10Default;
    break;
  case AMDGPU::MIMGEncGfx11NSA:
    NewEncoding = AMDGPU::MIMGEncGfx11Default;
    break;
  default:
    return;
  }

  int VAddr0Idx =
      AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::vaddr0);
  unsigned NewAddrDwords = Info->VAddrDwords;
  const TargetRegisterClass *RC;

  if (Info->VAddrDwords == 2) {
    RC = &AMDGPU::VReg_64RegClass;
  } else if (Info->VAddrDwords == 3) {
    RC = &AMDGPU::VReg_96RegClass;
  } else if (Info->VAddrDwords == 4) {
    RC = &AMDGPU::VReg_128RegClass;
  } else if (Info->VAddrDwords == 5) {
    RC = &AMDGPU::VReg_160RegClass;
  } else if (Info->VAddrDwords == 6) {
    RC = &AMDGPU::VReg_192RegClass;
  } else if (Info->VAddrDwords == 7) {
    RC = &AMDGPU::VReg_224RegClass;
  } else if (Info->VAddrDwords == 8) {
    RC = &AMDGPU::VReg_256RegClass;
  } else if (Info->VAddrDwords == 9) {
    RC = &AMDGPU::VReg_288RegClass;
  } else if (Info->VAddrDwords == 10) {
    RC = &AMDGPU::VReg_320RegClass;
  } else if (Info->VAddrDwords == 11) {
    RC = &AMDGPU::VReg_352RegClass;
  } else if (Info->VAddrDwords == 12) {
    RC = &AMDGPU::VReg_384RegClass;
  } else {
    RC = &AMDGPU::VReg_512RegClass;
    NewAddrDwords = 16;
  }

  unsigned VgprBase = 0;
  unsigned NextVgpr = 0;
  bool IsUndef = true;
  bool IsKill = NewAddrDwords == Info->VAddrDwords;
  const unsigned NSAMaxSize = ST->getNSAMaxSize();
  const bool IsPartialNSA = NewAddrDwords > NSAMaxSize;
  const unsigned EndVAddr = IsPartialNSA ? NSAMaxSize : Info->VAddrOperands;
  for (unsigned Idx = 0; Idx < EndVAddr; ++Idx) {
    const MachineOperand &Op = MI.getOperand(VAddr0Idx + Idx);
    unsigned Vgpr = TRI->getHWRegIndex(Op.getReg());
    unsigned Dwords = TRI->getRegSizeInBits(Op.getReg(), *MRI) / 32;
    assert(Dwords > 0 && "Un-implemented for less than 32 bit regs");

    if (Idx == 0) {
      VgprBase = Vgpr;
      NextVgpr = Vgpr + Dwords;
    } else if (Vgpr == NextVgpr) {
      NextVgpr = Vgpr + Dwords;
    } else {
      return;
    }

    if (!Op.isUndef())
      IsUndef = false;
    if (!Op.isKill())
      IsKill = false;
  }

  if (VgprBase + NewAddrDwords > 256)
    return;

  // Further check for an implicit tied operand - this may be present if TFE
  // is enabled
  int TFEIdx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::tfe);
  int LWEIdx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::lwe);
  unsigned TFEVal = (TFEIdx == -1) ? 0 : MI.getOperand(TFEIdx).getImm();
  unsigned LWEVal = (LWEIdx == -1) ? 0 : MI.getOperand(LWEIdx).getImm();
  int ToUntie = -1;
  if (TFEVal || LWEVal) {
    // TFE/LWE is enabled so we need to deal with an implicit tied operand
    for (unsigned i = LWEIdx + 1, e = MI.getNumOperands(); i != e; ++i) {
      if (MI.getOperand(i).isReg() && MI.getOperand(i).isTied() &&
          MI.getOperand(i).isImplicit()) {
        // This is the tied operand
        assert(
            ToUntie == -1 &&
            "found more than one tied implicit operand when expecting only 1");
        ToUntie = i;
        MI.untieRegOperand(ToUntie);
      }
    }
  }

  unsigned NewOpcode = AMDGPU::getMIMGOpcode(Info->BaseOpcode, NewEncoding,
                                             Info->VDataDwords, NewAddrDwords);
  MI.setDesc(TII->get(NewOpcode));
  MI.getOperand(VAddr0Idx).setReg(RC->getRegister(VgprBase));
  MI.getOperand(VAddr0Idx).setIsUndef(IsUndef);
  MI.getOperand(VAddr0Idx).setIsKill(IsKill);

  for (unsigned i = 1; i < EndVAddr; ++i)
    MI.removeOperand(VAddr0Idx + 1);

  if (ToUntie >= 0) {
    MI.tieOperands(
        AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::vdata),
        ToUntie - (EndVAddr - 1));
  }
}

// Shrink MAD to MADAK/MADMK and FMA to FMAAK/FMAMK.
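// For example, with a non-inline 32-bit literal K:
//   v_fma_f32 v0, v1, v2, K  ->  v_fmaak_f32 v0, v1, v2, K
//   v_fma_f32 v0, v1, K, v2  ->  v_fmamk_f32 v0, v1, K, v2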
void SIShrinkInstructions::shrinkMadFma(MachineInstr &MI) const {
  // Pre-GFX10 VOP3 instructions like MAD/FMA cannot take a literal operand so
  // there is no reason to try to shrink them.
  if (!ST->hasVOP3Literal())
    return;

  // There is no advantage to doing this pre-RA.
  if (!IsPostRA)
    return;

  if (TII->hasAnyModifiersSet(MI))
    return;

  const unsigned Opcode = MI.getOpcode();
  MachineOperand &Src0 = *TII->getNamedOperand(MI, AMDGPU::OpName::src0);
  MachineOperand &Src1 = *TII->getNamedOperand(MI, AMDGPU::OpName::src1);
  MachineOperand &Src2 = *TII->getNamedOperand(MI, AMDGPU::OpName::src2);
  unsigned NewOpcode = AMDGPU::INSTRUCTION_LIST_END;

  bool Swap;

  // Detect "Dst = VSrc * VGPR + Imm" and convert to AK form.
  if (Src2.isImm() && !TII->isInlineConstant(Src2)) {
    if (Src1.isReg() && TRI->isVGPR(*MRI, Src1.getReg()))
      Swap = false;
    else if (Src0.isReg() && TRI->isVGPR(*MRI, Src0.getReg()))
      Swap = true;
    else
      return;

    switch (Opcode) {
    default:
      llvm_unreachable("Unexpected mad/fma opcode!");
    case AMDGPU::V_MAD_F32_e64:
      NewOpcode = AMDGPU::V_MADAK_F32;
      break;
    case AMDGPU::V_FMA_F32_e64:
      NewOpcode = AMDGPU::V_FMAAK_F32;
      break;
    case AMDGPU::V_MAD_F16_e64:
      NewOpcode = AMDGPU::V_MADAK_F16;
      break;
    case AMDGPU::V_FMA_F16_e64:
    case AMDGPU::V_FMA_F16_gfx9_e64:
      NewOpcode = AMDGPU::V_FMAAK_F16;
      break;
    case AMDGPU::V_FMA_F16_gfx9_t16_e64:
      NewOpcode = AMDGPU::V_FMAAK_F16_t16;
      break;
    case AMDGPU::V_FMA_F16_gfx9_fake16_e64:
      NewOpcode = AMDGPU::V_FMAAK_F16_fake16;
      break;
    }
  }

  // Detect "Dst = VSrc * Imm + VGPR" and convert to MK form.
  if (Src2.isReg() && TRI->isVGPR(*MRI, Src2.getReg())) {
    if (Src1.isImm() && !TII->isInlineConstant(Src1))
      Swap = false;
    else if (Src0.isImm() && !TII->isInlineConstant(Src0))
      Swap = true;
    else
      return;

    switch (Opcode) {
    default:
      llvm_unreachable("Unexpected mad/fma opcode!");
    case AMDGPU::V_MAD_F32_e64:
      NewOpcode = AMDGPU::V_MADMK_F32;
      break;
    case AMDGPU::V_FMA_F32_e64:
      NewOpcode = AMDGPU::V_FMAMK_F32;
      break;
    case AMDGPU::V_MAD_F16_e64:
      NewOpcode = AMDGPU::V_MADMK_F16;
      break;
    case AMDGPU::V_FMA_F16_e64:
    case AMDGPU::V_FMA_F16_gfx9_e64:
      NewOpcode = AMDGPU::V_FMAMK_F16;
      break;
    case AMDGPU::V_FMA_F16_gfx9_t16_e64:
      NewOpcode = AMDGPU::V_FMAMK_F16_t16;
      break;
    case AMDGPU::V_FMA_F16_gfx9_fake16_e64:
      NewOpcode = AMDGPU::V_FMAMK_F16_fake16;
      break;
    }
  }

  if (NewOpcode == AMDGPU::INSTRUCTION_LIST_END)
    return;

  if (AMDGPU::isTrue16Inst(NewOpcode) && !shouldShrinkTrue16(MI))
    return;

  if (Swap) {
    // Swap Src0 and Src1 by building a new instruction.
    BuildMI(*MI.getParent(), MI, MI.getDebugLoc(), TII->get(NewOpcode),
            MI.getOperand(0).getReg())
        .add(Src1)
        .add(Src0)
        .add(Src2)
        .setMIFlags(MI.getFlags());
    MI.eraseFromParent();
  } else {
    TII->removeModOperands(MI);
    MI.setDesc(TII->get(NewOpcode));
  }
}

/// Attempt to shrink AND/OR/XOR operations requiring non-inlineable literals.
/// For AND or OR, try using S_BITSET{0,1} to clear or set bits.
/// If the inverse of the immediate is legal, use ANDN2, ORN2 or
/// XNOR (as a ^ b == ~(a ^ ~b)).
/// \returns true if the caller should continue the machine function iterator
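///
/// For example, "s_and_b32 s0, s0, 0xffffdfff" can be rewritten as
/// "s_bitset0_b32 s0, 13", and "s_xor_b32 s0, s0, 0xfffffff0" as
/// "s_xnor_b32 s0, s0, 15".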
bool SIShrinkInstructions::shrinkScalarLogicOp(MachineInstr &MI) const {
  unsigned Opc = MI.getOpcode();
  const MachineOperand *Dest = &MI.getOperand(0);
  MachineOperand *Src0 = &MI.getOperand(1);
  MachineOperand *Src1 = &MI.getOperand(2);
  MachineOperand *SrcReg = Src0;
  MachineOperand *SrcImm = Src1;

  if (!SrcImm->isImm() ||
      AMDGPU::isInlinableLiteral32(SrcImm->getImm(), ST->hasInv2PiInlineImm()))
    return false;

  uint32_t Imm = static_cast<uint32_t>(SrcImm->getImm());
  uint32_t NewImm = 0;

  if (Opc == AMDGPU::S_AND_B32) {
    if (isPowerOf2_32(~Imm)) {
      NewImm = llvm::countr_one(Imm);
      Opc = AMDGPU::S_BITSET0_B32;
    } else if (AMDGPU::isInlinableLiteral32(~Imm, ST->hasInv2PiInlineImm())) {
      NewImm = ~Imm;
      Opc = AMDGPU::S_ANDN2_B32;
    }
  } else if (Opc == AMDGPU::S_OR_B32) {
    if (isPowerOf2_32(Imm)) {
      NewImm = llvm::countr_zero(Imm);
      Opc = AMDGPU::S_BITSET1_B32;
    } else if (AMDGPU::isInlinableLiteral32(~Imm, ST->hasInv2PiInlineImm())) {
      NewImm = ~Imm;
      Opc = AMDGPU::S_ORN2_B32;
    }
  } else if (Opc == AMDGPU::S_XOR_B32) {
    if (AMDGPU::isInlinableLiteral32(~Imm, ST->hasInv2PiInlineImm())) {
      NewImm = ~Imm;
      Opc = AMDGPU::S_XNOR_B32;
    }
  } else {
    llvm_unreachable("unexpected opcode");
  }

  if (NewImm != 0) {
    if (Dest->getReg().isVirtual() && SrcReg->isReg()) {
      MRI->setRegAllocationHint(Dest->getReg(), 0, SrcReg->getReg());
      MRI->setRegAllocationHint(SrcReg->getReg(), 0, Dest->getReg());
      return true;
    }

    if (SrcReg->isReg() && SrcReg->getReg() == Dest->getReg()) {
      const bool IsUndef = SrcReg->isUndef();
      const bool IsKill = SrcReg->isKill();
      MI.setDesc(TII->get(Opc));
      if (Opc == AMDGPU::S_BITSET0_B32 || Opc == AMDGPU::S_BITSET1_B32) {
        Src0->ChangeToImmediate(NewImm);
        // Remove the immediate and add the tied input.
        MI.getOperand(2).ChangeToRegister(Dest->getReg(), /*IsDef*/ false,
                                          /*isImp*/ false, IsKill,
                                          /*isDead*/ false, IsUndef);
        MI.tieOperands(0, 2);
      } else {
        SrcImm->setImm(NewImm);
      }
    }
  }

  return false;
}

// This is the same as MachineInstr::readsRegister/modifiesRegister except
// it takes subregs into account.
bool SIShrinkInstructions::instAccessReg(
    iterator_range<MachineInstr::const_mop_iterator> &&R, Register Reg,
    unsigned SubReg) const {
  for (const MachineOperand &MO : R) {
    if (!MO.isReg())
      continue;

    if (Reg.isPhysical() && MO.getReg().isPhysical()) {
      if (TRI->regsOverlap(Reg, MO.getReg()))
        return true;
    } else if (MO.getReg() == Reg && Reg.isVirtual()) {
      LaneBitmask Overlap = TRI->getSubRegIndexLaneMask(SubReg) &
                            TRI->getSubRegIndexLaneMask(MO.getSubReg());
      if (Overlap.any())
        return true;
    }
  }
  return false;
}

bool SIShrinkInstructions::instReadsReg(const MachineInstr *MI, unsigned Reg,
                                        unsigned SubReg) const {
  return instAccessReg(MI->uses(), Reg, SubReg);
}

bool SIShrinkInstructions::instModifiesReg(const MachineInstr *MI, unsigned Reg,
                                           unsigned SubReg) const {
  return instAccessReg(MI->defs(), Reg, SubReg);
}

TargetInstrInfo::RegSubRegPair
SIShrinkInstructions::getSubRegForIndex(Register Reg, unsigned Sub,
                                        unsigned I) const {
  if (TRI->getRegSizeInBits(Reg, *MRI) != 32) {
    if (Reg.isPhysical()) {
      Reg = TRI->getSubReg(Reg, TRI->getSubRegFromChannel(I));
    } else {
      Sub = TRI->getSubRegFromChannel(I + TRI->getChannelFromSubReg(Sub));
    }
  }
  return TargetInstrInfo::RegSubRegPair(Reg, Sub);
}

void SIShrinkInstructions::dropInstructionKeepingImpDefs(
    MachineInstr &MI) const {
  for (unsigned i = MI.getDesc().getNumOperands() +
                    MI.getDesc().implicit_uses().size() +
                    MI.getDesc().implicit_defs().size(),
                e = MI.getNumOperands();
       i != e; ++i) {
    const MachineOperand &Op = MI.getOperand(i);
    if (!Op.isDef())
      continue;
    BuildMI(*MI.getParent(), MI.getIterator(), MI.getDebugLoc(),
            TII->get(AMDGPU::IMPLICIT_DEF), Op.getReg());
  }

  MI.eraseFromParent();
}

// Match:
// mov t, x
// mov x, y
// mov y, t
//
// =>
//
// mov t, x (t is potentially dead and move eliminated)
// v_swap_b32 x, y
//
// Returns a pointer to the next valid instruction if it was able to create
// v_swap_b32.
//
// This should not be done too early, so as not to prevent folding which may
// remove the matched moves. It is preferably done before RA to release saved
// registers, and also possibly after RA, which can insert copies too.
//
// This is really just a generic peephole that is not a canonical shrinking,
// although requirements match the pass placement and it reduces code size too.
MachineInstr *SIShrinkInstructions::matchSwap(MachineInstr &MovT) const {
  assert(MovT.getOpcode() == AMDGPU::V_MOV_B32_e32 ||
         MovT.getOpcode() == AMDGPU::V_MOV_B16_t16_e32 ||
         MovT.getOpcode() == AMDGPU::COPY);

  Register T = MovT.getOperand(0).getReg();
  unsigned Tsub = MovT.getOperand(0).getSubReg();
  MachineOperand &Xop = MovT.getOperand(1);

  if (!Xop.isReg())
    return nullptr;
  Register X = Xop.getReg();
  unsigned Xsub = Xop.getSubReg();

  unsigned Size = TII->getOpSize(MovT, 0);

  // We can't match v_swap_b16 pre-RA, because VGPR_16_Lo128 registers
  // are not allocatable.
  if (Size == 2 && X.isVirtual())
    return nullptr;

  if (!TRI->isVGPR(*MRI, X))
    return nullptr;

  const unsigned SearchLimit = 16;
  unsigned Count = 0;
  bool KilledT = false;
  for (auto Iter = std::next(MovT.getIterator()),
            E = MovT.getParent()->instr_end();
       Iter != E && Count < SearchLimit && !KilledT; ++Iter, ++Count) {

    MachineInstr *MovY = &*Iter;
    KilledT = MovY->killsRegister(T, TRI);

    if ((MovY->getOpcode() != AMDGPU::V_MOV_B32_e32 &&
         MovY->getOpcode() != AMDGPU::V_MOV_B16_t16_e32 &&
         MovY->getOpcode() != AMDGPU::COPY) ||
        !MovY->getOperand(1).isReg() || MovY->getOperand(1).getReg() != T ||
        MovY->getOperand(1).getSubReg() != Tsub)
      continue;

    Register Y = MovY->getOperand(0).getReg();
    unsigned Ysub = MovY->getOperand(0).getSubReg();

    if (!TRI->isVGPR(*MRI, Y))
      continue;

    MachineInstr *MovX = nullptr;
    for (auto IY = MovY->getIterator(), I = std::next(MovT.getIterator());
         I != IY; ++I) {
      if (instReadsReg(&*I, X, Xsub) || instModifiesReg(&*I, Y, Ysub) ||
          instModifiesReg(&*I, T, Tsub) ||
          (MovX && instModifiesReg(&*I, X, Xsub))) {
        MovX = nullptr;
        break;
      }
      if (!instReadsReg(&*I, Y, Ysub)) {
        if (!MovX && instModifiesReg(&*I, X, Xsub)) {
          MovX = nullptr;
          break;
        }
        continue;
      }
      if (MovX ||
          (I->getOpcode() != AMDGPU::V_MOV_B32_e32 &&
           I->getOpcode() != AMDGPU::V_MOV_B16_t16_e32 &&
           I->getOpcode() != AMDGPU::COPY) ||
          I->getOperand(0).getReg() != X ||
          I->getOperand(0).getSubReg() != Xsub) {
        MovX = nullptr;
        break;
      }

      if (Size > 4 && (I->getNumImplicitOperands() > (I->isCopy() ? 0U : 1U)))
        continue;

      MovX = &*I;
    }

    if (!MovX)
      continue;

    LLVM_DEBUG(dbgs() << "Matched v_swap:\n" << MovT << *MovX << *MovY);

    MachineBasicBlock &MBB = *MovT.getParent();
    SmallVector<MachineInstr *, 4> Swaps;
    if (Size == 2) {
      auto *MIB = BuildMI(MBB, MovX->getIterator(), MovT.getDebugLoc(),
                          TII->get(AMDGPU::V_SWAP_B16))
                      .addDef(X)
                      .addDef(Y)
                      .addReg(Y)
                      .addReg(X)
                      .getInstr();
      Swaps.push_back(MIB);
    } else {
      assert(Size > 0 && Size % 4 == 0);
      for (unsigned I = 0; I < Size / 4; ++I) {
        TargetInstrInfo::RegSubRegPair X1, Y1;
        X1 = getSubRegForIndex(X, Xsub, I);
        Y1 = getSubRegForIndex(Y, Ysub, I);
        auto *MIB = BuildMI(MBB, MovX->getIterator(), MovT.getDebugLoc(),
                            TII->get(AMDGPU::V_SWAP_B32))
                        .addDef(X1.Reg, 0, X1.SubReg)
                        .addDef(Y1.Reg, 0, Y1.SubReg)
                        .addReg(Y1.Reg, 0, Y1.SubReg)
                        .addReg(X1.Reg, 0, X1.SubReg)
                        .getInstr();
        Swaps.push_back(MIB);
      }
    }
    // Drop implicit EXEC.
    if (MovX->hasRegisterImplicitUseOperand(AMDGPU::EXEC)) {
      for (MachineInstr *Swap : Swaps) {
        Swap->removeOperand(Swap->getNumExplicitOperands());
        Swap->copyImplicitOps(*MBB.getParent(), *MovX);
      }
    }
    MovX->eraseFromParent();
    dropInstructionKeepingImpDefs(*MovY);
    MachineInstr *Next = &*std::next(MovT.getIterator());

    if (T.isVirtual() && MRI->use_nodbg_empty(T)) {
      dropInstructionKeepingImpDefs(MovT);
    } else {
      Xop.setIsKill(false);
      for (int I = MovT.getNumImplicitOperands() - 1; I >= 0; --I) {
        unsigned OpNo = MovT.getNumExplicitOperands() + I;
        const MachineOperand &Op = MovT.getOperand(OpNo);
        if (Op.isKill() && TRI->regsOverlap(X, Op.getReg()))
          MovT.removeOperand(OpNo);
      }
    }

    return Next;
  }

  return nullptr;
}

// If an instruction has a dead sdst, replace it with the NULL register on
// gfx1030+.
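// For example, an unused carry-out of v_add_co_u32 can be written to the null
// register instead of occupying an SGPR (pair).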
bool SIShrinkInstructions::tryReplaceDeadSDST(MachineInstr &MI) const {
  if (!ST->hasGFX10_3Insts())
    return false;

  MachineOperand *Op = TII->getNamedOperand(MI, AMDGPU::OpName::sdst);
  if (!Op)
    return false;
  Register SDstReg = Op->getReg();
  if (SDstReg.isPhysical() || !MRI->use_nodbg_empty(SDstReg))
    return false;

  Op->setReg(ST->isWave32() ? AMDGPU::SGPR_NULL : AMDGPU::SGPR_NULL64);
  return true;
}

bool SIShrinkInstructions::run(MachineFunction &MF) {

  this->MF = &MF;
  MRI = &MF.getRegInfo();
  ST = &MF.getSubtarget<GCNSubtarget>();
  TII = ST->getInstrInfo();
  TRI = &TII->getRegisterInfo();
  IsPostRA = MF.getProperties().hasNoVRegs();

  unsigned VCCReg = ST->isWave32() ? AMDGPU::VCC_LO : AMDGPU::VCC;

  for (MachineBasicBlock &MBB : MF) {
    MachineBasicBlock::iterator I, Next;
    for (I = MBB.begin(); I != MBB.end(); I = Next) {
      Next = std::next(I);
      MachineInstr &MI = *I;

      if (MI.getOpcode() == AMDGPU::V_MOV_B32_e32) {
        // If this has a literal constant source that is the same as the
        // reversed bits of an inline immediate, replace with a bitreverse of
        // that constant. This saves 4 bytes in the common case of
        // materializing sign bits.

        // Test if we are after regalloc. We only want to do this after any
        // optimizations happen because this will confuse them.
        MachineOperand &Src = MI.getOperand(1);
        if (Src.isImm() && IsPostRA) {
          int32_t ModImm;
          unsigned ModOpcode =
              canModifyToInlineImmOp32(TII, Src, ModImm, /*Scalar=*/false);
          if (ModOpcode != 0) {
            MI.setDesc(TII->get(ModOpcode));
            Src.setImm(static_cast<int64_t>(ModImm));
            continue;
          }
        }
      }

      if (ST->hasSwap() && (MI.getOpcode() == AMDGPU::V_MOV_B32_e32 ||
                            MI.getOpcode() == AMDGPU::V_MOV_B16_t16_e32 ||
                            MI.getOpcode() == AMDGPU::COPY)) {
        if (auto *NextMI = matchSwap(MI)) {
          Next = NextMI->getIterator();
          continue;
        }
      }

      // Try to use S_ADDK_I32 and S_MULK_I32.
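      // For example, "s_add_i32 s0, s0, 0x1234" becomes the tied two-operand
      // form "s_addk_i32 s0, 0x1234".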
      if (MI.getOpcode() == AMDGPU::S_ADD_I32 ||
          MI.getOpcode() == AMDGPU::S_MUL_I32) {
        const MachineOperand *Dest = &MI.getOperand(0);
        MachineOperand *Src0 = &MI.getOperand(1);
        MachineOperand *Src1 = &MI.getOperand(2);

        if (!Src0->isReg() && Src1->isReg()) {
          if (TII->commuteInstruction(MI, false, 1, 2))
            std::swap(Src0, Src1);
        }

        // FIXME: This could work better if hints worked with subregisters. If
        // we have a vector add of a constant, we usually don't get the correct
        // allocation due to the subregister usage.
        if (Dest->getReg().isVirtual() && Src0->isReg()) {
          MRI->setRegAllocationHint(Dest->getReg(), 0, Src0->getReg());
          MRI->setRegAllocationHint(Src0->getReg(), 0, Dest->getReg());
          continue;
        }

        if (Src0->isReg() && Src0->getReg() == Dest->getReg()) {
          if (Src1->isImm() && isKImmOperand(*Src1)) {
            unsigned Opc = (MI.getOpcode() == AMDGPU::S_ADD_I32)
                               ? AMDGPU::S_ADDK_I32
                               : AMDGPU::S_MULK_I32;

            Src1->setImm(SignExtend64(Src1->getImm(), 32));
            MI.setDesc(TII->get(Opc));
            MI.tieOperands(0, 1);
          }
        }
      }

      // Try to use s_cmpk_*
      if (MI.isCompare() && TII->isSOPC(MI)) {
        shrinkScalarCompare(MI);
        continue;
      }

      // Try to use S_MOVK_I32, which will save 4 bytes for small immediates.
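      // For example, "s_mov_b32 s0, 0xffff8000" becomes "s_movk_i32 s0,
      // 0x8000"; immediates whose bitwise-not or bit-reverse is an inline
      // constant are handled by canModifyToInlineImmOp32 below.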
      if (MI.getOpcode() == AMDGPU::S_MOV_B32) {
        const MachineOperand &Dst = MI.getOperand(0);
        MachineOperand &Src = MI.getOperand(1);

        if (Src.isImm() && Dst.getReg().isPhysical()) {
          unsigned ModOpc;
          int32_t ModImm;
          if (isKImmOperand(Src)) {
            MI.setDesc(TII->get(AMDGPU::S_MOVK_I32));
            Src.setImm(SignExtend64(Src.getImm(), 32));
          } else if ((ModOpc = canModifyToInlineImmOp32(TII, Src, ModImm,
                                                        /*Scalar=*/true))) {
            MI.setDesc(TII->get(ModOpc));
            Src.setImm(static_cast<int64_t>(ModImm));
          }
        }

        continue;
      }

      // Shrink scalar logic operations.
      if (MI.getOpcode() == AMDGPU::S_AND_B32 ||
          MI.getOpcode() == AMDGPU::S_OR_B32 ||
          MI.getOpcode() == AMDGPU::S_XOR_B32) {
        if (shrinkScalarLogicOp(MI))
          continue;
      }

      if (IsPostRA && TII->isMIMG(MI.getOpcode()) &&
          ST->getGeneration() >= AMDGPUSubtarget::GFX10) {
        shrinkMIMG(MI);
        continue;
      }

      if (!TII->isVOP3(MI))
        continue;

      if (MI.getOpcode() == AMDGPU::V_MAD_F32_e64 ||
          MI.getOpcode() == AMDGPU::V_FMA_F32_e64 ||
          MI.getOpcode() == AMDGPU::V_MAD_F16_e64 ||
          MI.getOpcode() == AMDGPU::V_FMA_F16_e64 ||
          MI.getOpcode() == AMDGPU::V_FMA_F16_gfx9_e64 ||
          MI.getOpcode() == AMDGPU::V_FMA_F16_gfx9_t16_e64 ||
          MI.getOpcode() == AMDGPU::V_FMA_F16_gfx9_fake16_e64) {
        shrinkMadFma(MI);
        continue;
      }

      // If there is no chance we will shrink it and use VCC as sdst to get
      // a 32-bit form, try to replace the dead sdst with NULL.
      if (TII->isVOP3(MI.getOpcode())) {
        tryReplaceDeadSDST(MI);
        if (!TII->hasVALU32BitEncoding(MI.getOpcode())) {
          continue;
        }
      }

      if (!TII->canShrink(MI, *MRI)) {
        // Try commuting the instruction and see if that enables us to shrink
        // it.
        if (!MI.isCommutable() || !TII->commuteInstruction(MI) ||
            !TII->canShrink(MI, *MRI)) {
          tryReplaceDeadSDST(MI);
          continue;
        }
      }

      int Op32 = AMDGPU::getVOPe32(MI.getOpcode());

      if (TII->isVOPC(Op32)) {
        MachineOperand &Op0 = MI.getOperand(0);
        if (Op0.isReg()) {
          // Exclude VOPCX instructions as these don't explicitly write a
          // dst.
          Register DstReg = Op0.getReg();
          if (DstReg.isVirtual()) {
            // VOPC instructions can only write to the VCC register. We can't
            // force them to use VCC here, because this is only one register
            // and cannot deal with sequences which would require multiple
            // copies of VCC, e.g. S_AND_B64 (vcc = V_CMP_...), (vcc = V_CMP_...)
            //
            // So, instead of forcing the instruction to write to VCC, we
            // provide a hint to the register allocator to use VCC and then we
            // will run this pass again after RA and shrink it if it outputs to
            // VCC.
            MRI->setRegAllocationHint(DstReg, 0, VCCReg);
            continue;
          }
          if (DstReg != VCCReg)
            continue;
        }
      }

      if (Op32 == AMDGPU::V_CNDMASK_B32_e32) {
        // We shrink V_CNDMASK_B32_e64 using regalloc hints like we do for VOPC
        // instructions.
        const MachineOperand *Src2 =
            TII->getNamedOperand(MI, AMDGPU::OpName::src2);
        if (!Src2->isReg())
          continue;
        Register SReg = Src2->getReg();
        if (SReg.isVirtual()) {
          MRI->setRegAllocationHint(SReg, 0, VCCReg);
          continue;
        }
        if (SReg != VCCReg)
          continue;
      }

      // Check for the bool flag output for instructions like V_ADD_I32_e64.
      const MachineOperand *SDst =
          TII->getNamedOperand(MI, AMDGPU::OpName::sdst);

      if (SDst) {
        bool Next = false;

        if (SDst->getReg() != VCCReg) {
          if (SDst->getReg().isVirtual())
            MRI->setRegAllocationHint(SDst->getReg(), 0, VCCReg);
          Next = true;
        }

        // All of the instructions with carry outs also have an SGPR input in
        // src2.
        const MachineOperand *Src2 =
            TII->getNamedOperand(MI, AMDGPU::OpName::src2);
        if (Src2 && Src2->getReg() != VCCReg) {
          if (Src2->getReg().isVirtual())
            MRI->setRegAllocationHint(Src2->getReg(), 0, VCCReg);
          Next = true;
        }

        if (Next)
          continue;
      }

      // Pre-GFX10, shrinking VOP3 instructions pre-RA gave us the chance to
      // fold an immediate into the shrunk instruction as a literal operand. In
      // GFX10 VOP3 instructions can take a literal operand anyway, so there is
      // no advantage to doing this.
      if (ST->hasVOP3Literal() && !IsPostRA)
        continue;

      if (ST->hasTrue16BitInsts() && AMDGPU::isTrue16Inst(MI.getOpcode()) &&
          !shouldShrinkTrue16(MI))
        continue;

      // We can shrink this instruction
      LLVM_DEBUG(dbgs() << "Shrinking " << MI);

      MachineInstr *Inst32 = TII->buildShrunkInst(MI, Op32);
      ++NumInstructionsShrunk;

      // Copy extra operands not present in the instruction definition.
      copyExtraImplicitOps(*Inst32, MI);

      // Copy deadness from the old explicit vcc def to the new implicit def.
      if (SDst && SDst->isDead())
        Inst32->findRegisterDefOperand(VCCReg, /*TRI=*/nullptr)->setIsDead();

      MI.eraseFromParent();
      foldImmediates(*Inst32);

      LLVM_DEBUG(dbgs() << "e32 MI = " << *Inst32 << '\n');
    }
  }
  return false;
}

bool SIShrinkInstructionsLegacy::runOnMachineFunction(MachineFunction &MF) {
  if (skipFunction(MF.getFunction()))
    return false;

  return SIShrinkInstructions().run(MF);
}

PreservedAnalyses
SIShrinkInstructionsPass::run(MachineFunction &MF,
                              MachineFunctionAnalysisManager &) {
  if (MF.getFunction().hasOptNone() || !SIShrinkInstructions().run(MF))
    return PreservedAnalyses::all();

  auto PA = getMachineFunctionPassPreservedAnalyses();
  PA.preserveSet<CFGAnalyses>();
  return PA;
}