//===-- SIShrinkInstructions.cpp - Shrink Instructions --------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
/// The pass tries to use the 32-bit encoding for instructions when possible.
//===----------------------------------------------------------------------===//
//

#include "AMDGPU.h"
#include "GCNSubtarget.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/CodeGen/MachineFunctionPass.h"

#define DEBUG_TYPE "si-shrink-instructions"

STATISTIC(NumInstructionsShrunk,
          "Number of 64-bit instructions reduced to 32-bit.");
STATISTIC(NumLiteralConstantsFolded,
          "Number of literal constants folded into 32-bit instructions.");

using namespace llvm;

namespace {

class SIShrinkInstructions : public MachineFunctionPass {
  MachineFunction *MF;
  MachineRegisterInfo *MRI;
  const GCNSubtarget *ST;
  const SIInstrInfo *TII;
  const SIRegisterInfo *TRI;

public:
  static char ID;

public:
  SIShrinkInstructions() : MachineFunctionPass(ID) {
  }

  bool foldImmediates(MachineInstr &MI, bool TryToCommute = true) const;
  bool shouldShrinkTrue16(MachineInstr &MI) const;
  bool isKImmOperand(const MachineOperand &Src) const;
  bool isKUImmOperand(const MachineOperand &Src) const;
  bool isKImmOrKUImmOperand(const MachineOperand &Src, bool &IsUnsigned) const;
  void copyExtraImplicitOps(MachineInstr &NewMI, MachineInstr &MI) const;
  void shrinkScalarCompare(MachineInstr &MI) const;
  void shrinkMIMG(MachineInstr &MI) const;
  void shrinkMadFma(MachineInstr &MI) const;
  bool shrinkScalarLogicOp(MachineInstr &MI) const;
  bool tryReplaceDeadSDST(MachineInstr &MI) const;
  bool instAccessReg(iterator_range<MachineInstr::const_mop_iterator> &&R,
                     Register Reg, unsigned SubReg) const;
  bool instReadsReg(const MachineInstr *MI, unsigned Reg,
                    unsigned SubReg) const;
  bool instModifiesReg(const MachineInstr *MI, unsigned Reg,
                       unsigned SubReg) const;
  TargetInstrInfo::RegSubRegPair getSubRegForIndex(Register Reg, unsigned Sub,
                                                   unsigned I) const;
  void dropInstructionKeepingImpDefs(MachineInstr &MI) const;
  MachineInstr *matchSwap(MachineInstr &MovT) const;

  bool runOnMachineFunction(MachineFunction &MF) override;

  StringRef getPassName() const override { return "SI Shrink Instructions"; }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.setPreservesCFG();
    MachineFunctionPass::getAnalysisUsage(AU);
  }
};

} // End anonymous namespace.

INITIALIZE_PASS(SIShrinkInstructions, DEBUG_TYPE,
                "SI Shrink Instructions", false, false)

char SIShrinkInstructions::ID = 0;

FunctionPass *llvm::createSIShrinkInstructionsPass() {
  return new SIShrinkInstructions();
}

/// This function checks \p MI for operands defined by a move immediate
/// instruction and then folds the literal constant into the instruction if it
/// can. This function assumes that \p MI is a VOP1, VOP2, or VOPC instruction.
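///
/// A minimal illustrative example: if src0 of a v_add_f32_e32 is defined by a
/// v_mov_b32 of the literal 0x12345678, the literal is folded directly into
/// src0 and the v_mov_b32 is erased once its result has no remaining
/// non-debug uses.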
bool SIShrinkInstructions::foldImmediates(MachineInstr &MI,
                                          bool TryToCommute) const {
  assert(TII->isVOP1(MI) || TII->isVOP2(MI) || TII->isVOPC(MI));

  int Src0Idx =
      AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::src0);

  // Try to fold Src0
  MachineOperand &Src0 = MI.getOperand(Src0Idx);
  if (Src0.isReg()) {
    Register Reg = Src0.getReg();
    if (Reg.isVirtual()) {
      MachineInstr *Def = MRI->getUniqueVRegDef(Reg);
      if (Def && Def->isMoveImmediate()) {
        MachineOperand &MovSrc = Def->getOperand(1);
        bool ConstantFolded = false;

        if (TII->isOperandLegal(MI, Src0Idx, &MovSrc)) {
          if (MovSrc.isImm()) {
            Src0.ChangeToImmediate(MovSrc.getImm());
            ConstantFolded = true;
          } else if (MovSrc.isFI()) {
            Src0.ChangeToFrameIndex(MovSrc.getIndex());
            ConstantFolded = true;
          } else if (MovSrc.isGlobal()) {
            Src0.ChangeToGA(MovSrc.getGlobal(), MovSrc.getOffset(),
                            MovSrc.getTargetFlags());
            ConstantFolded = true;
          }
        }

        if (ConstantFolded) {
          if (MRI->use_nodbg_empty(Reg))
            Def->eraseFromParent();
          ++NumLiteralConstantsFolded;
          return true;
        }
      }
    }
  }

  // We have failed to fold src0, so commute the instruction and try again.
  if (TryToCommute && MI.isCommutable()) {
    if (TII->commuteInstruction(MI)) {
      if (foldImmediates(MI, false))
        return true;

      // Commute back.
      TII->commuteInstruction(MI);
    }
  }

  return false;
}

/// Do not shrink the instruction if its registers are not expressible in the
/// shrunk encoding.
bool SIShrinkInstructions::shouldShrinkTrue16(MachineInstr &MI) const {
  for (unsigned I = 0, E = MI.getNumExplicitOperands(); I != E; ++I) {
    const MachineOperand &MO = MI.getOperand(I);
    if (MO.isReg()) {
      Register Reg = MO.getReg();
      assert(!Reg.isVirtual() && "Prior checks should ensure we only shrink "
                                 "True16 Instructions post-RA");
      if (AMDGPU::VGPR_32RegClass.contains(Reg) &&
          !AMDGPU::VGPR_32_Lo128RegClass.contains(Reg))
        return false;
    }
  }
  return true;
}

bool SIShrinkInstructions::isKImmOperand(const MachineOperand &Src) const {
  return isInt<16>(SignExtend64(Src.getImm(), 32)) &&
         !TII->isInlineConstant(*Src.getParent(), Src.getOperandNo());
}

bool SIShrinkInstructions::isKUImmOperand(const MachineOperand &Src) const {
  return isUInt<16>(Src.getImm()) &&
         !TII->isInlineConstant(*Src.getParent(), Src.getOperandNo());
}

bool SIShrinkInstructions::isKImmOrKUImmOperand(const MachineOperand &Src,
                                                bool &IsUnsigned) const {
  if (isInt<16>(SignExtend64(Src.getImm(), 32))) {
    IsUnsigned = false;
    return !TII->isInlineConstant(Src);
  }

  if (isUInt<16>(Src.getImm())) {
    IsUnsigned = true;
    return !TII->isInlineConstant(Src);
  }

  return false;
}

/// \returns the opcode of an instruction a move immediate of the constant \p
/// Src can be replaced with if the constant is replaced with \p ModifiedImm.
/// i.e.
///
/// If the bitreverse of a constant is an inline immediate, reverse the
/// immediate and return the bitreverse opcode.
///
/// If the bitwise negation of a constant is an inline immediate, reverse the
/// immediate and return the bitwise not opcode.
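///
/// For example (illustrative): 0xFFFFFFC6 is not an inline immediate, but its
/// bitwise NOT is 57, which is, so a v_mov_b32 of it can become a v_not_b32
/// of 57. Likewise 0x3FFFFFFF bit-reversed is 0xFFFFFFFC (-4), so the mov can
/// become a v_bfrev_b32 (or s_brev_b32 in the scalar case) of -4.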
static unsigned canModifyToInlineImmOp32(const SIInstrInfo *TII,
                                         const MachineOperand &Src,
                                         int32_t &ModifiedImm, bool Scalar) {
  if (TII->isInlineConstant(Src))
    return 0;
  int32_t SrcImm = static_cast<int32_t>(Src.getImm());

  if (!Scalar) {
    // We could handle the scalar case here, but we would need to check
    // that SCC is not live as S_NOT_B32 clobbers it. It's probably not worth
    // it, as the reasonable values are already covered by s_movk_i32.
    ModifiedImm = ~SrcImm;
    if (TII->isInlineConstant(APInt(32, ModifiedImm)))
      return AMDGPU::V_NOT_B32_e32;
  }

  ModifiedImm = reverseBits<int32_t>(SrcImm);
  if (TII->isInlineConstant(APInt(32, ModifiedImm)))
    return Scalar ? AMDGPU::S_BREV_B32 : AMDGPU::V_BFREV_B32_e32;

  return 0;
}

/// Copy implicit register operands from specified instruction to this
/// instruction that are not part of the instruction definition.
void SIShrinkInstructions::copyExtraImplicitOps(MachineInstr &NewMI,
                                                MachineInstr &MI) const {
  MachineFunction &MF = *MI.getMF();
  for (unsigned i = MI.getDesc().getNumOperands() +
                    MI.getDesc().implicit_uses().size() +
                    MI.getDesc().implicit_defs().size(),
                e = MI.getNumOperands();
       i != e; ++i) {
    const MachineOperand &MO = MI.getOperand(i);
    if ((MO.isReg() && MO.isImplicit()) || MO.isRegMask())
      NewMI.addOperand(MF, MO);
  }
}

void SIShrinkInstructions::shrinkScalarCompare(MachineInstr &MI) const {
  if (!ST->hasSCmpK())
    return;

  // cmpk instructions do scc = dst <cc op> imm16, so commute the instruction
  // to get constants on the RHS.
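  // For example (illustrative): s_cmp_lg_u32 0x3039, s0 is commuted to
  // s_cmp_lg_u32 s0, 0x3039 so the immediate sits where the SOPK form expects
  // it; it can then be rewritten as s_cmpk_lg_i32 s0, 0x3039.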
  if (!MI.getOperand(0).isReg())
    TII->commuteInstruction(MI, false, 0, 1);

  // cmpk requires src0 to be a register
  const MachineOperand &Src0 = MI.getOperand(0);
  if (!Src0.isReg())
    return;

  MachineOperand &Src1 = MI.getOperand(1);
  if (!Src1.isImm())
    return;

  int SOPKOpc = AMDGPU::getSOPKOp(MI.getOpcode());
  if (SOPKOpc == -1)
    return;

  // eq/ne is special because the imm16 can be treated as signed or unsigned,
  // and initially selected to the unsigned versions.
  if (SOPKOpc == AMDGPU::S_CMPK_EQ_U32 || SOPKOpc == AMDGPU::S_CMPK_LG_U32) {
    bool HasUImm;
    if (isKImmOrKUImmOperand(Src1, HasUImm)) {
      if (!HasUImm) {
        SOPKOpc = (SOPKOpc == AMDGPU::S_CMPK_EQ_U32) ?
          AMDGPU::S_CMPK_EQ_I32 : AMDGPU::S_CMPK_LG_I32;
        Src1.setImm(SignExtend32(Src1.getImm(), 32));
      }

      MI.setDesc(TII->get(SOPKOpc));
    }

    return;
  }

  const MCInstrDesc &NewDesc = TII->get(SOPKOpc);

  if ((SIInstrInfo::sopkIsZext(SOPKOpc) && isKUImmOperand(Src1)) ||
      (!SIInstrInfo::sopkIsZext(SOPKOpc) && isKImmOperand(Src1))) {
    if (!SIInstrInfo::sopkIsZext(SOPKOpc))
      Src1.setImm(SignExtend64(Src1.getImm(), 32));
    MI.setDesc(NewDesc);
  }
}

// Shrink NSA encoded instructions with contiguous VGPRs to non-NSA encoding.
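// For example (illustrative): an NSA image_sample whose address operands live
// in the contiguous registers v3, v4 and v5 can have them folded into a single
// VReg_96 operand starting at v3 and be re-encoded in the shorter non-NSA
// form.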
void SIShrinkInstructions::shrinkMIMG(MachineInstr &MI) const {
  const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(MI.getOpcode());
  if (!Info)
    return;

  uint8_t NewEncoding;
  switch (Info->MIMGEncoding) {
  case AMDGPU::MIMGEncGfx10NSA:
    NewEncoding = AMDGPU::MIMGEncGfx10Default;
    break;
  case AMDGPU::MIMGEncGfx11NSA:
    NewEncoding = AMDGPU::MIMGEncGfx11Default;
    break;
  default:
    return;
  }

  int VAddr0Idx =
      AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::vaddr0);
  unsigned NewAddrDwords = Info->VAddrDwords;
  const TargetRegisterClass *RC;

  if (Info->VAddrDwords == 2) {
    RC = &AMDGPU::VReg_64RegClass;
  } else if (Info->VAddrDwords == 3) {
    RC = &AMDGPU::VReg_96RegClass;
  } else if (Info->VAddrDwords == 4) {
    RC = &AMDGPU::VReg_128RegClass;
  } else if (Info->VAddrDwords == 5) {
    RC = &AMDGPU::VReg_160RegClass;
  } else if (Info->VAddrDwords == 6) {
    RC = &AMDGPU::VReg_192RegClass;
  } else if (Info->VAddrDwords == 7) {
    RC = &AMDGPU::VReg_224RegClass;
  } else if (Info->VAddrDwords == 8) {
    RC = &AMDGPU::VReg_256RegClass;
  } else if (Info->VAddrDwords == 9) {
    RC = &AMDGPU::VReg_288RegClass;
  } else if (Info->VAddrDwords == 10) {
    RC = &AMDGPU::VReg_320RegClass;
  } else if (Info->VAddrDwords == 11) {
    RC = &AMDGPU::VReg_352RegClass;
  } else if (Info->VAddrDwords == 12) {
    RC = &AMDGPU::VReg_384RegClass;
  } else {
    RC = &AMDGPU::VReg_512RegClass;
    NewAddrDwords = 16;
  }

  unsigned VgprBase = 0;
  unsigned NextVgpr = 0;
  bool IsUndef = true;
  bool IsKill = NewAddrDwords == Info->VAddrDwords;
  const unsigned NSAMaxSize = ST->getNSAMaxSize();
  const bool IsPartialNSA = NewAddrDwords > NSAMaxSize;
  const unsigned EndVAddr = IsPartialNSA ? NSAMaxSize : Info->VAddrOperands;
  for (unsigned Idx = 0; Idx < EndVAddr; ++Idx) {
    const MachineOperand &Op = MI.getOperand(VAddr0Idx + Idx);
    unsigned Vgpr = TRI->getHWRegIndex(Op.getReg());
    unsigned Dwords = TRI->getRegSizeInBits(Op.getReg(), *MRI) / 32;
    assert(Dwords > 0 && "Un-implemented for less than 32 bit regs");

    if (Idx == 0) {
      VgprBase = Vgpr;
      NextVgpr = Vgpr + Dwords;
    } else if (Vgpr == NextVgpr) {
      NextVgpr = Vgpr + Dwords;
    } else {
      return;
    }

    if (!Op.isUndef())
      IsUndef = false;
    if (!Op.isKill())
      IsKill = false;
  }

  if (VgprBase + NewAddrDwords > 256)
    return;

  // Further check for implicit tied operands - this may be present if TFE is
  // enabled
  int TFEIdx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::tfe);
  int LWEIdx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::lwe);
  unsigned TFEVal = (TFEIdx == -1) ? 0 : MI.getOperand(TFEIdx).getImm();
  unsigned LWEVal = (LWEIdx == -1) ? 0 : MI.getOperand(LWEIdx).getImm();
  int ToUntie = -1;
  if (TFEVal || LWEVal) {
    // TFE/LWE is enabled so we need to deal with an implicit tied operand
    for (unsigned i = LWEIdx + 1, e = MI.getNumOperands(); i != e; ++i) {
      if (MI.getOperand(i).isReg() && MI.getOperand(i).isTied() &&
          MI.getOperand(i).isImplicit()) {
        // This is the tied operand
        assert(
            ToUntie == -1 &&
            "found more than one tied implicit operand when expecting only 1");
        ToUntie = i;
        MI.untieRegOperand(ToUntie);
      }
    }
  }

  unsigned NewOpcode = AMDGPU::getMIMGOpcode(Info->BaseOpcode, NewEncoding,
                                             Info->VDataDwords, NewAddrDwords);
  MI.setDesc(TII->get(NewOpcode));
  MI.getOperand(VAddr0Idx).setReg(RC->getRegister(VgprBase));
  MI.getOperand(VAddr0Idx).setIsUndef(IsUndef);
  MI.getOperand(VAddr0Idx).setIsKill(IsKill);

  for (unsigned i = 1; i < EndVAddr; ++i)
    MI.removeOperand(VAddr0Idx + 1);

  if (ToUntie >= 0) {
    MI.tieOperands(
        AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::vdata),
        ToUntie - (EndVAddr - 1));
  }
}

// Shrink MAD to MADAK/MADMK and FMA to FMAAK/FMAMK.
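// For example (illustrative), assuming the literal is not an inline constant
// and no source modifiers are set:
//   v_fma_f32 v0, v1, v2, 0x42f60000  ->  v_fmaak_f32 v0, v1, v2, 0x42f60000
//   v_fma_f32 v0, v1, 0x42f60000, v2  ->  v_fmamk_f32 v0, v1, 0x42f60000, v2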
void SIShrinkInstructions::shrinkMadFma(MachineInstr &MI) const {
  // Pre-GFX10 VOP3 instructions like MAD/FMA cannot take a literal operand so
  // there is no reason to try to shrink them.
  if (!ST->hasVOP3Literal())
    return;

  // There is no advantage to doing this pre-RA.
  if (!MF->getProperties().hasProperty(
          MachineFunctionProperties::Property::NoVRegs))
    return;

  if (TII->hasAnyModifiersSet(MI))
    return;

  const unsigned Opcode = MI.getOpcode();
  MachineOperand &Src0 = *TII->getNamedOperand(MI, AMDGPU::OpName::src0);
  MachineOperand &Src1 = *TII->getNamedOperand(MI, AMDGPU::OpName::src1);
  MachineOperand &Src2 = *TII->getNamedOperand(MI, AMDGPU::OpName::src2);
  unsigned NewOpcode = AMDGPU::INSTRUCTION_LIST_END;

  bool Swap;

  // Detect "Dst = VSrc * VGPR + Imm" and convert to AK form.
  if (Src2.isImm() && !TII->isInlineConstant(Src2)) {
    if (Src1.isReg() && TRI->isVGPR(*MRI, Src1.getReg()))
      Swap = false;
    else if (Src0.isReg() && TRI->isVGPR(*MRI, Src0.getReg()))
      Swap = true;
    else
      return;

    switch (Opcode) {
    default:
      llvm_unreachable("Unexpected mad/fma opcode!");
    case AMDGPU::V_MAD_F32_e64:
      NewOpcode = AMDGPU::V_MADAK_F32;
      break;
    case AMDGPU::V_FMA_F32_e64:
      NewOpcode = AMDGPU::V_FMAAK_F32;
      break;
    case AMDGPU::V_MAD_F16_e64:
      NewOpcode = AMDGPU::V_MADAK_F16;
      break;
    case AMDGPU::V_FMA_F16_e64:
    case AMDGPU::V_FMA_F16_gfx9_e64:
      NewOpcode = ST->hasTrue16BitInsts() ? AMDGPU::V_FMAAK_F16_t16
                                          : AMDGPU::V_FMAAK_F16;
      break;
    }
  }

  // Detect "Dst = VSrc * Imm + VGPR" and convert to MK form.
  if (Src2.isReg() && TRI->isVGPR(*MRI, Src2.getReg())) {
    if (Src1.isImm() && !TII->isInlineConstant(Src1))
      Swap = false;
    else if (Src0.isImm() && !TII->isInlineConstant(Src0))
      Swap = true;
    else
      return;

    switch (Opcode) {
    default:
      llvm_unreachable("Unexpected mad/fma opcode!");
    case AMDGPU::V_MAD_F32_e64:
      NewOpcode = AMDGPU::V_MADMK_F32;
      break;
    case AMDGPU::V_FMA_F32_e64:
      NewOpcode = AMDGPU::V_FMAMK_F32;
      break;
    case AMDGPU::V_MAD_F16_e64:
      NewOpcode = AMDGPU::V_MADMK_F16;
      break;
    case AMDGPU::V_FMA_F16_e64:
    case AMDGPU::V_FMA_F16_gfx9_e64:
      NewOpcode = ST->hasTrue16BitInsts() ? AMDGPU::V_FMAMK_F16_t16
                                          : AMDGPU::V_FMAMK_F16;
      break;
    }
  }

  if (NewOpcode == AMDGPU::INSTRUCTION_LIST_END)
    return;

  if (AMDGPU::isTrue16Inst(NewOpcode) && !shouldShrinkTrue16(MI))
    return;

  if (Swap) {
    // Swap Src0 and Src1 by building a new instruction.
    BuildMI(*MI.getParent(), MI, MI.getDebugLoc(), TII->get(NewOpcode),
            MI.getOperand(0).getReg())
        .add(Src1)
        .add(Src0)
        .add(Src2)
        .setMIFlags(MI.getFlags());
    MI.eraseFromParent();
  } else {
    TII->removeModOperands(MI);
    MI.setDesc(TII->get(NewOpcode));
  }
}

/// Attempt to shrink AND/OR/XOR operations requiring non-inlineable literals.
/// For AND or OR, try using S_BITSET{0,1} to clear or set bits.
/// If the inverse of the immediate is legal, use ANDN2, ORN2 or
/// XNOR (as a ^ b == ~(a ^ ~b)).
/// \returns true if the caller should continue the machine function iterator
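///
/// For example (illustrative): s_and_b32 s0, s0, 0xffffbfff clears only bit 14
/// and can become s_bitset0_b32 s0, 14, while s_and_b32 s0, s0, 0xffffffe0 can
/// become s_andn2_b32 s0, s0, 31 since 31 is an inline constant.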
bool SIShrinkInstructions::shrinkScalarLogicOp(MachineInstr &MI) const {
  unsigned Opc = MI.getOpcode();
  const MachineOperand *Dest = &MI.getOperand(0);
  MachineOperand *Src0 = &MI.getOperand(1);
  MachineOperand *Src1 = &MI.getOperand(2);
  MachineOperand *SrcReg = Src0;
  MachineOperand *SrcImm = Src1;

  if (!SrcImm->isImm() ||
      AMDGPU::isInlinableLiteral32(SrcImm->getImm(), ST->hasInv2PiInlineImm()))
    return false;

  uint32_t Imm = static_cast<uint32_t>(SrcImm->getImm());
  uint32_t NewImm = 0;

  if (Opc == AMDGPU::S_AND_B32) {
    if (isPowerOf2_32(~Imm)) {
      NewImm = llvm::countr_one(Imm);
      Opc = AMDGPU::S_BITSET0_B32;
    } else if (AMDGPU::isInlinableLiteral32(~Imm, ST->hasInv2PiInlineImm())) {
      NewImm = ~Imm;
      Opc = AMDGPU::S_ANDN2_B32;
    }
  } else if (Opc == AMDGPU::S_OR_B32) {
    if (isPowerOf2_32(Imm)) {
      NewImm = llvm::countr_zero(Imm);
      Opc = AMDGPU::S_BITSET1_B32;
    } else if (AMDGPU::isInlinableLiteral32(~Imm, ST->hasInv2PiInlineImm())) {
      NewImm = ~Imm;
      Opc = AMDGPU::S_ORN2_B32;
    }
  } else if (Opc == AMDGPU::S_XOR_B32) {
    if (AMDGPU::isInlinableLiteral32(~Imm, ST->hasInv2PiInlineImm())) {
      NewImm = ~Imm;
      Opc = AMDGPU::S_XNOR_B32;
    }
  } else {
    llvm_unreachable("unexpected opcode");
  }

  if (NewImm != 0) {
    if (Dest->getReg().isVirtual() && SrcReg->isReg()) {
      MRI->setRegAllocationHint(Dest->getReg(), 0, SrcReg->getReg());
      MRI->setRegAllocationHint(SrcReg->getReg(), 0, Dest->getReg());
      return true;
    }

    if (SrcReg->isReg() && SrcReg->getReg() == Dest->getReg()) {
      const bool IsUndef = SrcReg->isUndef();
      const bool IsKill = SrcReg->isKill();
      MI.setDesc(TII->get(Opc));
      if (Opc == AMDGPU::S_BITSET0_B32 ||
          Opc == AMDGPU::S_BITSET1_B32) {
        Src0->ChangeToImmediate(NewImm);
        // Remove the immediate and add the tied input.
        MI.getOperand(2).ChangeToRegister(Dest->getReg(), /*IsDef*/ false,
                                          /*isImp*/ false, IsKill,
                                          /*isDead*/ false, IsUndef);
        MI.tieOperands(0, 2);
      } else {
        SrcImm->setImm(NewImm);
      }
    }
  }

  return false;
}

// This is the same as MachineInstr::readsRegister/modifiesRegister except
// it takes subregs into account.
bool SIShrinkInstructions::instAccessReg(
    iterator_range<MachineInstr::const_mop_iterator> &&R, Register Reg,
    unsigned SubReg) const {
  for (const MachineOperand &MO : R) {
    if (!MO.isReg())
      continue;

    if (Reg.isPhysical() && MO.getReg().isPhysical()) {
      if (TRI->regsOverlap(Reg, MO.getReg()))
        return true;
    } else if (MO.getReg() == Reg && Reg.isVirtual()) {
      LaneBitmask Overlap = TRI->getSubRegIndexLaneMask(SubReg) &
                            TRI->getSubRegIndexLaneMask(MO.getSubReg());
      if (Overlap.any())
        return true;
    }
  }
  return false;
}

bool SIShrinkInstructions::instReadsReg(const MachineInstr *MI, unsigned Reg,
                                        unsigned SubReg) const {
  return instAccessReg(MI->uses(), Reg, SubReg);
}

bool SIShrinkInstructions::instModifiesReg(const MachineInstr *MI, unsigned Reg,
                                           unsigned SubReg) const {
  return instAccessReg(MI->defs(), Reg, SubReg);
}

TargetInstrInfo::RegSubRegPair
SIShrinkInstructions::getSubRegForIndex(Register Reg, unsigned Sub,
                                        unsigned I) const {
  if (TRI->getRegSizeInBits(Reg, *MRI) != 32) {
    if (Reg.isPhysical()) {
      Reg = TRI->getSubReg(Reg, TRI->getSubRegFromChannel(I));
    } else {
      Sub = TRI->getSubRegFromChannel(I + TRI->getChannelFromSubReg(Sub));
    }
  }
  return TargetInstrInfo::RegSubRegPair(Reg, Sub);
}

void SIShrinkInstructions::dropInstructionKeepingImpDefs(
    MachineInstr &MI) const {
  for (unsigned i = MI.getDesc().getNumOperands() +
                    MI.getDesc().implicit_uses().size() +
                    MI.getDesc().implicit_defs().size(),
                e = MI.getNumOperands();
       i != e; ++i) {
    const MachineOperand &Op = MI.getOperand(i);
    if (!Op.isDef())
      continue;
    BuildMI(*MI.getParent(), MI.getIterator(), MI.getDebugLoc(),
            TII->get(AMDGPU::IMPLICIT_DEF), Op.getReg());
  }

  MI.eraseFromParent();
}

// Match:
// mov t, x
// mov x, y
// mov y, t
//
// =>
//
// mov t, x (t is potentially dead and move eliminated)
// v_swap_b32 x, y
//
// Returns next valid instruction pointer if was able to create v_swap_b32.
//
// This should not be done too early, so as not to prevent folding that may
// remove the matched moves; it should preferably run before RA to release the
// saved register, and also possibly after RA, which can insert copies too.
//
// This is really just a generic peephole that is not a canonical shrinking,
// although requirements match the pass placement and it reduces code size too.
MachineInstr *SIShrinkInstructions::matchSwap(MachineInstr &MovT) const {
  assert(MovT.getOpcode() == AMDGPU::V_MOV_B32_e32 ||
         MovT.getOpcode() == AMDGPU::COPY);

  Register T = MovT.getOperand(0).getReg();
  unsigned Tsub = MovT.getOperand(0).getSubReg();
  MachineOperand &Xop = MovT.getOperand(1);

  if (!Xop.isReg())
    return nullptr;
  Register X = Xop.getReg();
  unsigned Xsub = Xop.getSubReg();

  unsigned Size = TII->getOpSize(MovT, 0) / 4;

  if (!TRI->isVGPR(*MRI, X))
    return nullptr;

  const unsigned SearchLimit = 16;
  unsigned Count = 0;
  bool KilledT = false;
  for (auto Iter = std::next(MovT.getIterator()),
            E = MovT.getParent()->instr_end();
       Iter != E && Count < SearchLimit && !KilledT; ++Iter, ++Count) {

    MachineInstr *MovY = &*Iter;
    KilledT = MovY->killsRegister(T, TRI);

    if ((MovY->getOpcode() != AMDGPU::V_MOV_B32_e32 &&
         MovY->getOpcode() != AMDGPU::COPY) ||
        !MovY->getOperand(1).isReg() ||
        MovY->getOperand(1).getReg() != T ||
        MovY->getOperand(1).getSubReg() != Tsub)
      continue;

    Register Y = MovY->getOperand(0).getReg();
    unsigned Ysub = MovY->getOperand(0).getSubReg();

    if (!TRI->isVGPR(*MRI, Y))
      continue;

    MachineInstr *MovX = nullptr;
    for (auto IY = MovY->getIterator(), I = std::next(MovT.getIterator());
         I != IY; ++I) {
      if (instReadsReg(&*I, X, Xsub) || instModifiesReg(&*I, Y, Ysub) ||
          instModifiesReg(&*I, T, Tsub) ||
          (MovX && instModifiesReg(&*I, X, Xsub))) {
        MovX = nullptr;
        break;
      }
      if (!instReadsReg(&*I, Y, Ysub)) {
        if (!MovX && instModifiesReg(&*I, X, Xsub)) {
          MovX = nullptr;
          break;
        }
        continue;
      }
      if (MovX ||
          (I->getOpcode() != AMDGPU::V_MOV_B32_e32 &&
           I->getOpcode() != AMDGPU::COPY) ||
          I->getOperand(0).getReg() != X ||
          I->getOperand(0).getSubReg() != Xsub) {
        MovX = nullptr;
        break;
      }

      if (Size > 1 && (I->getNumImplicitOperands() > (I->isCopy() ? 0U : 1U)))
        continue;

      MovX = &*I;
    }

    if (!MovX)
      continue;

    LLVM_DEBUG(dbgs() << "Matched v_swap_b32:\n" << MovT << *MovX << *MovY);

    for (unsigned I = 0; I < Size; ++I) {
      TargetInstrInfo::RegSubRegPair X1, Y1;
      X1 = getSubRegForIndex(X, Xsub, I);
      Y1 = getSubRegForIndex(Y, Ysub, I);
      MachineBasicBlock &MBB = *MovT.getParent();
      auto MIB = BuildMI(MBB, MovX->getIterator(), MovT.getDebugLoc(),
                         TII->get(AMDGPU::V_SWAP_B32))
                     .addDef(X1.Reg, 0, X1.SubReg)
                     .addDef(Y1.Reg, 0, Y1.SubReg)
                     .addReg(Y1.Reg, 0, Y1.SubReg)
                     .addReg(X1.Reg, 0, X1.SubReg).getInstr();
      if (MovX->hasRegisterImplicitUseOperand(AMDGPU::EXEC)) {
        // Drop implicit EXEC.
        MIB->removeOperand(MIB->getNumExplicitOperands());
        MIB->copyImplicitOps(*MBB.getParent(), *MovX);
      }
    }
    MovX->eraseFromParent();
    dropInstructionKeepingImpDefs(*MovY);
    MachineInstr *Next = &*std::next(MovT.getIterator());

    if (T.isVirtual() && MRI->use_nodbg_empty(T)) {
      dropInstructionKeepingImpDefs(MovT);
    } else {
      Xop.setIsKill(false);
      for (int I = MovT.getNumImplicitOperands() - 1; I >= 0; --I) {
        unsigned OpNo = MovT.getNumExplicitOperands() + I;
        const MachineOperand &Op = MovT.getOperand(OpNo);
        if (Op.isKill() && TRI->regsOverlap(X, Op.getReg()))
          MovT.removeOperand(OpNo);
      }
    }

    return Next;
  }

  return nullptr;
}

// If an instruction has dead sdst replace it with NULL register on gfx1030+
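// For example (illustrative): if the carry-out of a v_add_co_u32_e64 is never
// read, its sdst operand can be rewritten to the null register so no SGPR pair
// needs to be allocated for it.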
bool SIShrinkInstructions::tryReplaceDeadSDST(MachineInstr &MI) const {
  if (!ST->hasGFX10_3Insts())
    return false;

  MachineOperand *Op = TII->getNamedOperand(MI, AMDGPU::OpName::sdst);
  if (!Op)
    return false;
  Register SDstReg = Op->getReg();
  if (SDstReg.isPhysical() || !MRI->use_nodbg_empty(SDstReg))
    return false;

  Op->setReg(ST->isWave32() ? AMDGPU::SGPR_NULL : AMDGPU::SGPR_NULL64);
  return true;
}

bool SIShrinkInstructions::runOnMachineFunction(MachineFunction &MF) {
  if (skipFunction(MF.getFunction()))
    return false;

  this->MF = &MF;
  MRI = &MF.getRegInfo();
  ST = &MF.getSubtarget<GCNSubtarget>();
  TII = ST->getInstrInfo();
  TRI = &TII->getRegisterInfo();

  unsigned VCCReg = ST->isWave32() ? AMDGPU::VCC_LO : AMDGPU::VCC;

  std::vector<unsigned> I1Defs;

  for (MachineFunction::iterator BI = MF.begin(), BE = MF.end();
       BI != BE; ++BI) {

    MachineBasicBlock &MBB = *BI;
    MachineBasicBlock::iterator I, Next;
    for (I = MBB.begin(); I != MBB.end(); I = Next) {
      Next = std::next(I);
      MachineInstr &MI = *I;

      if (MI.getOpcode() == AMDGPU::V_MOV_B32_e32) {
        // If this has a literal constant source that is the same as the
        // reversed bits of an inline immediate, replace with a bitreverse of
        // that constant. This saves 4 bytes in the common case of
        // materializing sign bits.

        // Test if we are after regalloc. We only want to do this after any
        // optimizations happen because this will confuse them.
        // XXX - not exactly a check for post-regalloc run.
        MachineOperand &Src = MI.getOperand(1);
        if (Src.isImm() && MI.getOperand(0).getReg().isPhysical()) {
          int32_t ModImm;
          unsigned ModOpcode =
              canModifyToInlineImmOp32(TII, Src, ModImm, /*Scalar=*/false);
          if (ModOpcode != 0) {
            MI.setDesc(TII->get(ModOpcode));
            Src.setImm(static_cast<int64_t>(ModImm));
            continue;
          }
        }
      }

      if (ST->hasSwap() && (MI.getOpcode() == AMDGPU::V_MOV_B32_e32 ||
                            MI.getOpcode() == AMDGPU::COPY)) {
        if (auto *NextMI = matchSwap(MI)) {
          Next = NextMI->getIterator();
          continue;
        }
      }

      // Try to use S_ADDK_I32 and S_MULK_I32.
      if (MI.getOpcode() == AMDGPU::S_ADD_I32 ||
          MI.getOpcode() == AMDGPU::S_MUL_I32) {
        const MachineOperand *Dest = &MI.getOperand(0);
        MachineOperand *Src0 = &MI.getOperand(1);
        MachineOperand *Src1 = &MI.getOperand(2);

        if (!Src0->isReg() && Src1->isReg()) {
          if (TII->commuteInstruction(MI, false, 1, 2))
            std::swap(Src0, Src1);
        }

        // FIXME: This could work better if hints worked with subregisters. If
        // we have a vector add of a constant, we usually don't get the correct
        // allocation due to the subregister usage.
        if (Dest->getReg().isVirtual() && Src0->isReg()) {
          MRI->setRegAllocationHint(Dest->getReg(), 0, Src0->getReg());
          MRI->setRegAllocationHint(Src0->getReg(), 0, Dest->getReg());
          continue;
        }

        if (Src0->isReg() && Src0->getReg() == Dest->getReg()) {
          if (Src1->isImm() && isKImmOperand(*Src1)) {
            unsigned Opc = (MI.getOpcode() == AMDGPU::S_ADD_I32) ?
              AMDGPU::S_ADDK_I32 : AMDGPU::S_MULK_I32;

            Src1->setImm(SignExtend64(Src1->getImm(), 32));
            MI.setDesc(TII->get(Opc));
            MI.tieOperands(0, 1);
          }
        }
      }

      // Try to use s_cmpk_*
      if (MI.isCompare() && TII->isSOPC(MI)) {
        shrinkScalarCompare(MI);
        continue;
      }

      // Try to use S_MOVK_I32, which will save 4 bytes for small immediates.
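      // For example (illustrative): s_mov_b32 s0, 0xffff8000 can become
      // s_movk_i32 s0, 0x8000, since the 16-bit immediate is sign-extended.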
      if (MI.getOpcode() == AMDGPU::S_MOV_B32) {
        const MachineOperand &Dst = MI.getOperand(0);
        MachineOperand &Src = MI.getOperand(1);

        if (Src.isImm() && Dst.getReg().isPhysical()) {
          unsigned ModOpc;
          int32_t ModImm;
          if (isKImmOperand(Src)) {
            MI.setDesc(TII->get(AMDGPU::S_MOVK_I32));
            Src.setImm(SignExtend64(Src.getImm(), 32));
          } else if ((ModOpc = canModifyToInlineImmOp32(TII, Src, ModImm,
                                                        /*Scalar=*/true))) {
            MI.setDesc(TII->get(ModOpc));
            Src.setImm(static_cast<int64_t>(ModImm));
          }
        }

        continue;
      }

      // Shrink scalar logic operations.
      if (MI.getOpcode() == AMDGPU::S_AND_B32 ||
          MI.getOpcode() == AMDGPU::S_OR_B32 ||
          MI.getOpcode() == AMDGPU::S_XOR_B32) {
        if (shrinkScalarLogicOp(MI))
          continue;
      }

      if (TII->isMIMG(MI.getOpcode()) &&
          ST->getGeneration() >= AMDGPUSubtarget::GFX10 &&
          MF.getProperties().hasProperty(
              MachineFunctionProperties::Property::NoVRegs)) {
        shrinkMIMG(MI);
        continue;
      }

      if (!TII->isVOP3(MI))
        continue;

      if (MI.getOpcode() == AMDGPU::V_MAD_F32_e64 ||
          MI.getOpcode() == AMDGPU::V_FMA_F32_e64 ||
          MI.getOpcode() == AMDGPU::V_MAD_F16_e64 ||
          MI.getOpcode() == AMDGPU::V_FMA_F16_e64 ||
          MI.getOpcode() == AMDGPU::V_FMA_F16_gfx9_e64) {
        shrinkMadFma(MI);
        continue;
      }

      if (!TII->hasVALU32BitEncoding(MI.getOpcode())) {
        // If there is no chance we will shrink it and use VCC as sdst to get
        // a 32 bit form try to replace dead sdst with NULL.
        tryReplaceDeadSDST(MI);
        continue;
      }

      if (!TII->canShrink(MI, *MRI)) {
        // Try commuting the instruction and see if that enables us to shrink
        // it.
        if (!MI.isCommutable() || !TII->commuteInstruction(MI) ||
            !TII->canShrink(MI, *MRI)) {
          tryReplaceDeadSDST(MI);
          continue;
        }
      }

      int Op32 = AMDGPU::getVOPe32(MI.getOpcode());

      if (TII->isVOPC(Op32)) {
        MachineOperand &Op0 = MI.getOperand(0);
        if (Op0.isReg()) {
          // Exclude VOPCX instructions as these don't explicitly write a
          // dst.
          Register DstReg = Op0.getReg();
          if (DstReg.isVirtual()) {
            // VOPC instructions can only write to the VCC register. We can't
            // force them to use VCC here, because this is only one register
            // and cannot deal with sequences which would require multiple
            // copies of VCC, e.g. S_AND_B64 (vcc = V_CMP_...), (vcc = V_CMP_...)
            //
            // So, instead of forcing the instruction to write to VCC, we
            // provide a hint to the register allocator to use VCC and then we
            // will run this pass again after RA and shrink it if it outputs to
            // VCC.
            MRI->setRegAllocationHint(DstReg, 0, VCCReg);
            continue;
          }
          if (DstReg != VCCReg)
            continue;
        }
      }

      if (Op32 == AMDGPU::V_CNDMASK_B32_e32) {
        // We shrink V_CNDMASK_B32_e64 using regalloc hints like we do for VOPC
        // instructions.
        const MachineOperand *Src2 =
            TII->getNamedOperand(MI, AMDGPU::OpName::src2);
        if (!Src2->isReg())
          continue;
        Register SReg = Src2->getReg();
        if (SReg.isVirtual()) {
          MRI->setRegAllocationHint(SReg, 0, VCCReg);
          continue;
        }
        if (SReg != VCCReg)
          continue;
      }

      // Check for the bool flag output for instructions like V_ADD_I32_e64.
      const MachineOperand *SDst =
          TII->getNamedOperand(MI, AMDGPU::OpName::sdst);

      if (SDst) {
        bool Next = false;

        if (SDst->getReg() != VCCReg) {
          if (SDst->getReg().isVirtual())
            MRI->setRegAllocationHint(SDst->getReg(), 0, VCCReg);
          Next = true;
        }

        // All of the instructions with carry outs also have an SGPR input in
        // src2.
        const MachineOperand *Src2 =
            TII->getNamedOperand(MI, AMDGPU::OpName::src2);
        if (Src2 && Src2->getReg() != VCCReg) {
          if (Src2->getReg().isVirtual())
            MRI->setRegAllocationHint(Src2->getReg(), 0, VCCReg);
          Next = true;
        }

        if (Next)
          continue;
      }

      // Pre-GFX10, shrinking VOP3 instructions pre-RA gave us the chance to
      // fold an immediate into the shrunk instruction as a literal operand. In
      // GFX10 VOP3 instructions can take a literal operand anyway, so there is
      // no advantage to doing this.
      if (ST->hasVOP3Literal() &&
          !MF.getProperties().hasProperty(
              MachineFunctionProperties::Property::NoVRegs))
        continue;

      if (ST->hasTrue16BitInsts() && AMDGPU::isTrue16Inst(MI.getOpcode()) &&
          !shouldShrinkTrue16(MI))
        continue;

      // We can shrink this instruction
      LLVM_DEBUG(dbgs() << "Shrinking " << MI);

      MachineInstr *Inst32 = TII->buildShrunkInst(MI, Op32);
      ++NumInstructionsShrunk;

      // Copy extra operands not present in the instruction definition.
      copyExtraImplicitOps(*Inst32, MI);

      // Copy deadness from the old explicit vcc def to the new implicit def.
      if (SDst && SDst->isDead())
        Inst32->findRegisterDefOperand(VCCReg, /*TRI=*/nullptr)->setIsDead();

      MI.eraseFromParent();
      foldImmediates(*Inst32);

      LLVM_DEBUG(dbgs() << "e32 MI = " << *Inst32 << '\n');
    }
  }
  return false;
}