1 | //===-- SIFoldOperands.cpp - Fold operands --- ----------------------------===// |
2 | // |
3 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
4 | // See https://llvm.org/LICENSE.txt for license information. |
5 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
6 | // |
7 | /// \file |
8 | //===----------------------------------------------------------------------===// |
9 | // |
10 | |
11 | #include "AMDGPU.h" |
12 | #include "GCNSubtarget.h" |
13 | #include "MCTargetDesc/AMDGPUMCTargetDesc.h" |
14 | #include "SIMachineFunctionInfo.h" |
15 | #include "llvm/ADT/DepthFirstIterator.h" |
16 | #include "llvm/CodeGen/MachineFunctionPass.h" |
17 | #include "llvm/CodeGen/MachineOperand.h" |
18 | |
19 | #define DEBUG_TYPE "si-fold-operands" |
20 | using namespace llvm; |
21 | |
22 | namespace { |
23 | |
24 | struct FoldCandidate { |
25 | MachineInstr *UseMI; |
26 | union { |
27 | MachineOperand *OpToFold; |
28 | uint64_t ImmToFold; |
29 | int FrameIndexToFold; |
30 | }; |
31 | int ShrinkOpcode; |
32 | unsigned UseOpNo; |
33 | MachineOperand::MachineOperandType Kind; |
34 | bool Commuted; |
35 | |
36 | FoldCandidate(MachineInstr *MI, unsigned OpNo, MachineOperand *FoldOp, |
37 | bool Commuted_ = false, |
38 | int ShrinkOp = -1) : |
39 | UseMI(MI), OpToFold(nullptr), ShrinkOpcode(ShrinkOp), UseOpNo(OpNo), |
40 | Kind(FoldOp->getType()), |
41 | Commuted(Commuted_) { |
42 | if (FoldOp->isImm()) { |
43 | ImmToFold = FoldOp->getImm(); |
44 | } else if (FoldOp->isFI()) { |
45 | FrameIndexToFold = FoldOp->getIndex(); |
46 | } else { |
47 | assert(FoldOp->isReg() || FoldOp->isGlobal()); |
48 | OpToFold = FoldOp; |
49 | } |
50 | } |
51 | |
52 | bool isFI() const { |
53 | return Kind == MachineOperand::MO_FrameIndex; |
54 | } |
55 | |
56 | bool isImm() const { |
57 | return Kind == MachineOperand::MO_Immediate; |
58 | } |
59 | |
60 | bool isReg() const { |
61 | return Kind == MachineOperand::MO_Register; |
62 | } |
63 | |
64 | bool isGlobal() const { return Kind == MachineOperand::MO_GlobalAddress; } |
65 | |
66 | bool needsShrink() const { return ShrinkOpcode != -1; } |
67 | }; |
68 | |
69 | class SIFoldOperands : public MachineFunctionPass { |
70 | public: |
71 | static char ID; |
72 | MachineRegisterInfo *MRI; |
73 | const SIInstrInfo *TII; |
74 | const SIRegisterInfo *TRI; |
75 | const GCNSubtarget *ST; |
76 | const SIMachineFunctionInfo *MFI; |
77 | |
78 | bool frameIndexMayFold(const MachineInstr &UseMI, int OpNo, |
79 | const MachineOperand &OpToFold) const; |
80 | |
81 | bool updateOperand(FoldCandidate &Fold) const; |
82 | |
83 | bool canUseImmWithOpSel(FoldCandidate &Fold) const; |
84 | |
85 | bool tryFoldImmWithOpSel(FoldCandidate &Fold) const; |
86 | |
87 | bool tryAddToFoldList(SmallVectorImpl<FoldCandidate> &FoldList, |
88 | MachineInstr *MI, unsigned OpNo, |
89 | MachineOperand *OpToFold) const; |
90 | bool isUseSafeToFold(const MachineInstr &MI, |
91 | const MachineOperand &UseMO) const; |
92 | bool |
93 | getRegSeqInit(SmallVectorImpl<std::pair<MachineOperand *, unsigned>> &Defs, |
94 | Register UseReg, uint8_t OpTy) const; |
95 | bool tryToFoldACImm(const MachineOperand &OpToFold, MachineInstr *UseMI, |
96 | unsigned UseOpIdx, |
97 | SmallVectorImpl<FoldCandidate> &FoldList) const; |
98 | void foldOperand(MachineOperand &OpToFold, |
99 | MachineInstr *UseMI, |
100 | int UseOpIdx, |
101 | SmallVectorImpl<FoldCandidate> &FoldList, |
102 | SmallVectorImpl<MachineInstr *> &CopiesToReplace) const; |
103 | |
104 | MachineOperand *getImmOrMaterializedImm(MachineOperand &Op) const; |
105 | bool tryConstantFoldOp(MachineInstr *MI) const; |
106 | bool tryFoldCndMask(MachineInstr &MI) const; |
107 | bool tryFoldZeroHighBits(MachineInstr &MI) const; |
108 | bool foldInstOperand(MachineInstr &MI, MachineOperand &OpToFold) const; |
109 | bool tryFoldFoldableCopy(MachineInstr &MI, |
110 | MachineOperand *&CurrentKnownM0Val) const; |
111 | |
112 | const MachineOperand *isClamp(const MachineInstr &MI) const; |
113 | bool tryFoldClamp(MachineInstr &MI); |
114 | |
115 | std::pair<const MachineOperand *, int> isOMod(const MachineInstr &MI) const; |
116 | bool tryFoldOMod(MachineInstr &MI); |
117 | bool tryFoldRegSequence(MachineInstr &MI); |
118 | bool tryFoldPhiAGPR(MachineInstr &MI); |
119 | bool tryFoldLoad(MachineInstr &MI); |
120 | |
121 | bool tryOptimizeAGPRPhis(MachineBasicBlock &MBB); |
122 | |
123 | public: |
124 | SIFoldOperands() : MachineFunctionPass(ID) { |
125 | initializeSIFoldOperandsPass(*PassRegistry::getPassRegistry()); |
126 | } |
127 | |
128 | bool runOnMachineFunction(MachineFunction &MF) override; |
129 | |
  StringRef getPassName() const override { return "SI Fold Operands"; }
131 | |
132 | void getAnalysisUsage(AnalysisUsage &AU) const override { |
133 | AU.setPreservesCFG(); |
134 | MachineFunctionPass::getAnalysisUsage(AU); |
135 | } |
136 | }; |
137 | |
138 | } // End anonymous namespace. |
139 | |
140 | INITIALIZE_PASS(SIFoldOperands, DEBUG_TYPE, |
                "SI Fold Operands", false, false)
142 | |
143 | char SIFoldOperands::ID = 0; |
144 | |
145 | char &llvm::SIFoldOperandsID = SIFoldOperands::ID; |
146 | |
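// Return the register class of MO, narrowed to the class of its subregister
// if MO carries a subregister index.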
147 | static const TargetRegisterClass *getRegOpRC(const MachineRegisterInfo &MRI, |
148 | const TargetRegisterInfo &TRI, |
149 | const MachineOperand &MO) { |
150 | const TargetRegisterClass *RC = MRI.getRegClass(Reg: MO.getReg()); |
151 | if (const TargetRegisterClass *SubRC = |
152 | TRI.getSubRegisterClass(SuperRC: RC, SubRegIdx: MO.getSubReg())) |
153 | RC = SubRC; |
154 | return RC; |
155 | } |
156 | |
157 | // Map multiply-accumulate opcode to corresponding multiply-add opcode if any. |
158 | static unsigned macToMad(unsigned Opc) { |
159 | switch (Opc) { |
160 | case AMDGPU::V_MAC_F32_e64: |
161 | return AMDGPU::V_MAD_F32_e64; |
162 | case AMDGPU::V_MAC_F16_e64: |
163 | return AMDGPU::V_MAD_F16_e64; |
164 | case AMDGPU::V_FMAC_F32_e64: |
165 | return AMDGPU::V_FMA_F32_e64; |
166 | case AMDGPU::V_FMAC_F16_e64: |
167 | return AMDGPU::V_FMA_F16_gfx9_e64; |
168 | case AMDGPU::V_FMAC_F16_t16_e64: |
169 | return AMDGPU::V_FMA_F16_gfx9_e64; |
170 | case AMDGPU::V_FMAC_LEGACY_F32_e64: |
171 | return AMDGPU::V_FMA_LEGACY_F32_e64; |
172 | case AMDGPU::V_FMAC_F64_e64: |
173 | return AMDGPU::V_FMA_F64_e64; |
174 | } |
175 | return AMDGPU::INSTRUCTION_LIST_END; |
176 | } |
177 | |
178 | // TODO: Add heuristic that the frame index might not fit in the addressing mode |
179 | // immediate offset to avoid materializing in loops. |
180 | bool SIFoldOperands::frameIndexMayFold(const MachineInstr &UseMI, int OpNo, |
181 | const MachineOperand &OpToFold) const { |
182 | if (!OpToFold.isFI()) |
183 | return false; |
184 | |
185 | const unsigned Opc = UseMI.getOpcode(); |
186 | if (TII->isMUBUF(MI: UseMI)) |
187 | return OpNo == AMDGPU::getNamedOperandIdx(Opcode: Opc, NamedIdx: AMDGPU::OpName::vaddr); |
188 | if (!TII->isFLATScratch(MI: UseMI)) |
189 | return false; |
190 | |
191 | int SIdx = AMDGPU::getNamedOperandIdx(Opcode: Opc, NamedIdx: AMDGPU::OpName::saddr); |
192 | if (OpNo == SIdx) |
193 | return true; |
194 | |
195 | int VIdx = AMDGPU::getNamedOperandIdx(Opcode: Opc, NamedIdx: AMDGPU::OpName::vaddr); |
196 | return OpNo == VIdx && SIdx == -1; |
197 | } |
198 | |
199 | FunctionPass *llvm::createSIFoldOperandsPass() { |
200 | return new SIFoldOperands(); |
201 | } |
202 | |
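// Check whether the immediate in Fold targets a packed (v2i16/v2f16/v2bf16)
// operand where op_sel could be used to encode it as an inline constant.
// MAI, WMMA and SWMMAC instructions, and DOT instructions on subtargets with
// the op_sel hazard, are excluded.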
203 | bool SIFoldOperands::canUseImmWithOpSel(FoldCandidate &Fold) const { |
204 | MachineInstr *MI = Fold.UseMI; |
205 | MachineOperand &Old = MI->getOperand(i: Fold.UseOpNo); |
206 | const uint64_t TSFlags = MI->getDesc().TSFlags; |
207 | |
208 | assert(Old.isReg() && Fold.isImm()); |
209 | |
210 | if (!(TSFlags & SIInstrFlags::IsPacked) || (TSFlags & SIInstrFlags::IsMAI) || |
211 | (TSFlags & SIInstrFlags::IsWMMA) || (TSFlags & SIInstrFlags::IsSWMMAC) || |
212 | (ST->hasDOTOpSelHazard() && (TSFlags & SIInstrFlags::IsDOT))) |
213 | return false; |
214 | |
215 | unsigned Opcode = MI->getOpcode(); |
216 | int OpNo = MI->getOperandNo(I: &Old); |
217 | uint8_t OpType = TII->get(Opcode).operands()[OpNo].OperandType; |
218 | switch (OpType) { |
219 | default: |
220 | return false; |
221 | case AMDGPU::OPERAND_REG_IMM_V2FP16: |
222 | case AMDGPU::OPERAND_REG_IMM_V2BF16: |
223 | case AMDGPU::OPERAND_REG_IMM_V2INT16: |
224 | case AMDGPU::OPERAND_REG_INLINE_C_V2FP16: |
225 | case AMDGPU::OPERAND_REG_INLINE_C_V2BF16: |
226 | case AMDGPU::OPERAND_REG_INLINE_C_V2INT16: |
227 | break; |
228 | } |
229 | |
230 | return true; |
231 | } |
232 | |
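// Try to encode the immediate in Fold as an inline constant for a packed
// operand by adjusting op_sel/op_sel_hi, and for packed unsigned add/sub
// possibly by negating the immediate and swapping the opcode.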
233 | bool SIFoldOperands::tryFoldImmWithOpSel(FoldCandidate &Fold) const { |
234 | MachineInstr *MI = Fold.UseMI; |
235 | MachineOperand &Old = MI->getOperand(i: Fold.UseOpNo); |
236 | unsigned Opcode = MI->getOpcode(); |
237 | int OpNo = MI->getOperandNo(I: &Old); |
238 | uint8_t OpType = TII->get(Opcode).operands()[OpNo].OperandType; |
239 | |
240 | // If the literal can be inlined as-is, apply it and short-circuit the |
241 | // tests below. The main motivation for this is to avoid unintuitive |
242 | // uses of opsel. |
243 | if (AMDGPU::isInlinableLiteralV216(Literal: Fold.ImmToFold, OpType)) { |
244 | Old.ChangeToImmediate(ImmVal: Fold.ImmToFold); |
245 | return true; |
246 | } |
247 | |
248 | // Refer to op_sel/op_sel_hi and check if we can change the immediate and |
249 | // op_sel in a way that allows an inline constant. |
250 | int ModIdx = -1; |
251 | unsigned SrcIdx = ~0; |
252 | if (OpNo == AMDGPU::getNamedOperandIdx(Opcode, NamedIdx: AMDGPU::OpName::src0)) { |
253 | ModIdx = AMDGPU::OpName::src0_modifiers; |
254 | SrcIdx = 0; |
255 | } else if (OpNo == AMDGPU::getNamedOperandIdx(Opcode, NamedIdx: AMDGPU::OpName::src1)) { |
256 | ModIdx = AMDGPU::OpName::src1_modifiers; |
257 | SrcIdx = 1; |
258 | } else if (OpNo == AMDGPU::getNamedOperandIdx(Opcode, NamedIdx: AMDGPU::OpName::src2)) { |
259 | ModIdx = AMDGPU::OpName::src2_modifiers; |
260 | SrcIdx = 2; |
261 | } |
262 | assert(ModIdx != -1); |
263 | ModIdx = AMDGPU::getNamedOperandIdx(Opcode, NamedIdx: ModIdx); |
264 | MachineOperand &Mod = MI->getOperand(i: ModIdx); |
265 | unsigned ModVal = Mod.getImm(); |
266 | |
267 | uint16_t ImmLo = static_cast<uint16_t>( |
268 | Fold.ImmToFold >> (ModVal & SISrcMods::OP_SEL_0 ? 16 : 0)); |
269 | uint16_t ImmHi = static_cast<uint16_t>( |
270 | Fold.ImmToFold >> (ModVal & SISrcMods::OP_SEL_1 ? 16 : 0)); |
271 | uint32_t Imm = (static_cast<uint32_t>(ImmHi) << 16) | ImmLo; |
272 | unsigned NewModVal = ModVal & ~(SISrcMods::OP_SEL_0 | SISrcMods::OP_SEL_1); |
273 | |
274 | // Helper function that attempts to inline the given value with a newly |
275 | // chosen opsel pattern. |
276 | auto tryFoldToInline = [&](uint32_t Imm) -> bool { |
277 | if (AMDGPU::isInlinableLiteralV216(Literal: Imm, OpType)) { |
278 | Mod.setImm(NewModVal | SISrcMods::OP_SEL_1); |
279 | Old.ChangeToImmediate(ImmVal: Imm); |
280 | return true; |
281 | } |
282 | |
283 | // Try to shuffle the halves around and leverage opsel to get an inline |
284 | // constant. |
285 | uint16_t Lo = static_cast<uint16_t>(Imm); |
286 | uint16_t Hi = static_cast<uint16_t>(Imm >> 16); |
287 | if (Lo == Hi) { |
288 | if (AMDGPU::isInlinableLiteralV216(Literal: Lo, OpType)) { |
289 | Mod.setImm(NewModVal); |
290 | Old.ChangeToImmediate(ImmVal: Lo); |
291 | return true; |
292 | } |
293 | |
294 | if (static_cast<int16_t>(Lo) < 0) { |
295 | int32_t SExt = static_cast<int16_t>(Lo); |
296 | if (AMDGPU::isInlinableLiteralV216(Literal: SExt, OpType)) { |
297 | Mod.setImm(NewModVal); |
298 | Old.ChangeToImmediate(ImmVal: SExt); |
299 | return true; |
300 | } |
301 | } |
302 | |
303 | // This check is only useful for integer instructions |
304 | if (OpType == AMDGPU::OPERAND_REG_IMM_V2INT16 || |
305 | OpType == AMDGPU::OPERAND_REG_INLINE_AC_V2INT16) { |
306 | if (AMDGPU::isInlinableLiteralV216(Literal: Lo << 16, OpType)) { |
307 | Mod.setImm(NewModVal | SISrcMods::OP_SEL_0 | SISrcMods::OP_SEL_1); |
308 | Old.ChangeToImmediate(ImmVal: static_cast<uint32_t>(Lo) << 16); |
309 | return true; |
310 | } |
311 | } |
312 | } else { |
313 | uint32_t Swapped = (static_cast<uint32_t>(Lo) << 16) | Hi; |
314 | if (AMDGPU::isInlinableLiteralV216(Literal: Swapped, OpType)) { |
315 | Mod.setImm(NewModVal | SISrcMods::OP_SEL_0); |
316 | Old.ChangeToImmediate(ImmVal: Swapped); |
317 | return true; |
318 | } |
319 | } |
320 | |
321 | return false; |
322 | }; |
323 | |
324 | if (tryFoldToInline(Imm)) |
325 | return true; |
326 | |
327 | // Replace integer addition by subtraction and vice versa if it allows |
328 | // folding the immediate to an inline constant. |
329 | // |
330 | // We should only ever get here for SrcIdx == 1 due to canonicalization |
331 | // earlier in the pipeline, but we double-check here to be safe / fully |
332 | // general. |
333 | bool IsUAdd = Opcode == AMDGPU::V_PK_ADD_U16; |
334 | bool IsUSub = Opcode == AMDGPU::V_PK_SUB_U16; |
335 | if (SrcIdx == 1 && (IsUAdd || IsUSub)) { |
336 | unsigned ClampIdx = |
337 | AMDGPU::getNamedOperandIdx(Opcode, NamedIdx: AMDGPU::OpName::clamp); |
338 | bool Clamp = MI->getOperand(i: ClampIdx).getImm() != 0; |
339 | |
340 | if (!Clamp) { |
341 | uint16_t NegLo = -static_cast<uint16_t>(Imm); |
342 | uint16_t NegHi = -static_cast<uint16_t>(Imm >> 16); |
343 | uint32_t NegImm = (static_cast<uint32_t>(NegHi) << 16) | NegLo; |
344 | |
345 | if (tryFoldToInline(NegImm)) { |
346 | unsigned NegOpcode = |
347 | IsUAdd ? AMDGPU::V_PK_SUB_U16 : AMDGPU::V_PK_ADD_U16; |
348 | MI->setDesc(TII->get(Opcode: NegOpcode)); |
349 | return true; |
350 | } |
351 | } |
352 | } |
353 | |
354 | return false; |
355 | } |
356 | |
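// Apply the fold described by Fold to its use instruction: rewrite the operand
// to the immediate, frame index, global address or register being folded,
// shrinking the instruction to its 32-bit form or switching to the
// early-clobber MFMA variant where that is needed to make the fold legal.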
357 | bool SIFoldOperands::updateOperand(FoldCandidate &Fold) const { |
358 | MachineInstr *MI = Fold.UseMI; |
359 | MachineOperand &Old = MI->getOperand(i: Fold.UseOpNo); |
360 | assert(Old.isReg()); |
361 | |
362 | if (Fold.isImm() && canUseImmWithOpSel(Fold)) { |
363 | if (tryFoldImmWithOpSel(Fold)) |
364 | return true; |
365 | |
366 | // We can't represent the candidate as an inline constant. Try as a literal |
367 | // with the original opsel, checking constant bus limitations. |
368 | MachineOperand New = MachineOperand::CreateImm(Val: Fold.ImmToFold); |
369 | int OpNo = MI->getOperandNo(I: &Old); |
370 | if (!TII->isOperandLegal(MI: *MI, OpIdx: OpNo, MO: &New)) |
371 | return false; |
372 | Old.ChangeToImmediate(ImmVal: Fold.ImmToFold); |
373 | return true; |
374 | } |
375 | |
376 | if ((Fold.isImm() || Fold.isFI() || Fold.isGlobal()) && Fold.needsShrink()) { |
377 | MachineBasicBlock *MBB = MI->getParent(); |
378 | auto Liveness = MBB->computeRegisterLiveness(TRI, Reg: AMDGPU::VCC, Before: MI, Neighborhood: 16); |
379 | if (Liveness != MachineBasicBlock::LQR_Dead) { |
      LLVM_DEBUG(dbgs() << "Not shrinking " << MI << " due to vcc liveness\n");
381 | return false; |
382 | } |
383 | |
384 | int Op32 = Fold.ShrinkOpcode; |
385 | MachineOperand &Dst0 = MI->getOperand(i: 0); |
386 | MachineOperand &Dst1 = MI->getOperand(i: 1); |
387 | assert(Dst0.isDef() && Dst1.isDef()); |
388 | |
389 | bool HaveNonDbgCarryUse = !MRI->use_nodbg_empty(RegNo: Dst1.getReg()); |
390 | |
391 | const TargetRegisterClass *Dst0RC = MRI->getRegClass(Reg: Dst0.getReg()); |
392 | Register NewReg0 = MRI->createVirtualRegister(RegClass: Dst0RC); |
393 | |
394 | MachineInstr *Inst32 = TII->buildShrunkInst(MI&: *MI, NewOpcode: Op32); |
395 | |
396 | if (HaveNonDbgCarryUse) { |
397 | BuildMI(BB&: *MBB, I: MI, MIMD: MI->getDebugLoc(), MCID: TII->get(Opcode: AMDGPU::COPY), |
398 | DestReg: Dst1.getReg()) |
399 | .addReg(RegNo: AMDGPU::VCC, flags: RegState::Kill); |
400 | } |
401 | |
402 | // Keep the old instruction around to avoid breaking iterators, but |
403 | // replace it with a dummy instruction to remove uses. |
404 | // |
405 | // FIXME: We should not invert how this pass looks at operands to avoid |
406 | // this. Should track set of foldable movs instead of looking for uses |
407 | // when looking at a use. |
408 | Dst0.setReg(NewReg0); |
409 | for (unsigned I = MI->getNumOperands() - 1; I > 0; --I) |
410 | MI->removeOperand(OpNo: I); |
411 | MI->setDesc(TII->get(Opcode: AMDGPU::IMPLICIT_DEF)); |
412 | |
413 | if (Fold.Commuted) |
414 | TII->commuteInstruction(MI&: *Inst32, NewMI: false); |
415 | return true; |
416 | } |
417 | |
  assert(!Fold.needsShrink() && "not handled");
419 | |
420 | if (Fold.isImm()) { |
421 | if (Old.isTied()) { |
422 | int NewMFMAOpc = AMDGPU::getMFMAEarlyClobberOp(Opcode: MI->getOpcode()); |
423 | if (NewMFMAOpc == -1) |
424 | return false; |
425 | MI->setDesc(TII->get(Opcode: NewMFMAOpc)); |
426 | MI->untieRegOperand(OpIdx: 0); |
427 | } |
428 | Old.ChangeToImmediate(ImmVal: Fold.ImmToFold); |
429 | return true; |
430 | } |
431 | |
432 | if (Fold.isGlobal()) { |
433 | Old.ChangeToGA(GV: Fold.OpToFold->getGlobal(), Offset: Fold.OpToFold->getOffset(), |
434 | TargetFlags: Fold.OpToFold->getTargetFlags()); |
435 | return true; |
436 | } |
437 | |
438 | if (Fold.isFI()) { |
439 | Old.ChangeToFrameIndex(Idx: Fold.FrameIndexToFold); |
440 | return true; |
441 | } |
442 | |
443 | MachineOperand *New = Fold.OpToFold; |
444 | Old.substVirtReg(Reg: New->getReg(), SubIdx: New->getSubReg(), *TRI); |
445 | Old.setIsUndef(New->isUndef()); |
446 | return true; |
447 | } |
448 | |
449 | static bool isUseMIInFoldList(ArrayRef<FoldCandidate> FoldList, |
450 | const MachineInstr *MI) { |
451 | return any_of(Range&: FoldList, P: [&](const auto &C) { return C.UseMI == MI; }); |
452 | } |
453 | |
454 | static void appendFoldCandidate(SmallVectorImpl<FoldCandidate> &FoldList, |
455 | MachineInstr *MI, unsigned OpNo, |
456 | MachineOperand *FoldOp, bool Commuted = false, |
457 | int ShrinkOp = -1) { |
458 | // Skip additional folding on the same operand. |
459 | for (FoldCandidate &Fold : FoldList) |
460 | if (Fold.UseMI == MI && Fold.UseOpNo == OpNo) |
461 | return; |
  LLVM_DEBUG(dbgs() << "Append " << (Commuted ? "commuted" : "normal")
                    << " operand " << OpNo << "\n " << *MI);
464 | FoldList.emplace_back(Args&: MI, Args&: OpNo, Args&: FoldOp, Args&: Commuted, Args&: ShrinkOp); |
465 | } |
466 | |
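// Try to record OpToFold as a fold candidate for operand OpNo of MI. If the
// operand is not legal as-is, this may mutate MI (mac -> mad, s_fmac ->
// s_fmaak/s_fmamk, s_setreg -> s_setreg_imm32) or commute it to make the fold
// possible.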
467 | bool SIFoldOperands::tryAddToFoldList(SmallVectorImpl<FoldCandidate> &FoldList, |
468 | MachineInstr *MI, unsigned OpNo, |
469 | MachineOperand *OpToFold) const { |
470 | const unsigned Opc = MI->getOpcode(); |
471 | |
472 | auto tryToFoldAsFMAAKorMK = [&]() { |
473 | if (!OpToFold->isImm()) |
474 | return false; |
475 | |
476 | const bool TryAK = OpNo == 3; |
477 | const unsigned NewOpc = TryAK ? AMDGPU::S_FMAAK_F32 : AMDGPU::S_FMAMK_F32; |
478 | MI->setDesc(TII->get(Opcode: NewOpc)); |
479 | |
    // We have to fold into the operand that will hold the immediate, not into
    // OpNo.
481 | bool FoldAsFMAAKorMK = |
482 | tryAddToFoldList(FoldList, MI, OpNo: TryAK ? 3 : 2, OpToFold); |
483 | if (FoldAsFMAAKorMK) { |
484 | // Untie Src2 of fmac. |
485 | MI->untieRegOperand(OpIdx: 3); |
486 | // For fmamk swap operands 1 and 2 if OpToFold was meant for operand 1. |
487 | if (OpNo == 1) { |
488 | MachineOperand &Op1 = MI->getOperand(i: 1); |
489 | MachineOperand &Op2 = MI->getOperand(i: 2); |
490 | Register OldReg = Op1.getReg(); |
491 | // Operand 2 might be an inlinable constant |
492 | if (Op2.isImm()) { |
493 | Op1.ChangeToImmediate(ImmVal: Op2.getImm()); |
494 | Op2.ChangeToRegister(Reg: OldReg, isDef: false); |
495 | } else { |
496 | Op1.setReg(Op2.getReg()); |
497 | Op2.setReg(OldReg); |
498 | } |
499 | } |
500 | return true; |
501 | } |
502 | MI->setDesc(TII->get(Opcode: Opc)); |
503 | return false; |
504 | }; |
505 | |
506 | bool IsLegal = TII->isOperandLegal(MI: *MI, OpIdx: OpNo, MO: OpToFold); |
507 | if (!IsLegal && OpToFold->isImm()) { |
508 | FoldCandidate Fold(MI, OpNo, OpToFold); |
509 | IsLegal = canUseImmWithOpSel(Fold); |
510 | } |
511 | |
512 | if (!IsLegal) { |
513 | // Special case for v_mac_{f16, f32}_e64 if we are trying to fold into src2 |
514 | unsigned NewOpc = macToMad(Opc); |
515 | if (NewOpc != AMDGPU::INSTRUCTION_LIST_END) { |
516 | // Check if changing this to a v_mad_{f16, f32} instruction will allow us |
517 | // to fold the operand. |
518 | MI->setDesc(TII->get(Opcode: NewOpc)); |
519 | bool AddOpSel = !AMDGPU::hasNamedOperand(Opcode: Opc, NamedIdx: AMDGPU::OpName::op_sel) && |
520 | AMDGPU::hasNamedOperand(Opcode: NewOpc, NamedIdx: AMDGPU::OpName::op_sel); |
521 | if (AddOpSel) |
522 | MI->addOperand(Op: MachineOperand::CreateImm(Val: 0)); |
523 | bool FoldAsMAD = tryAddToFoldList(FoldList, MI, OpNo, OpToFold); |
524 | if (FoldAsMAD) { |
525 | MI->untieRegOperand(OpIdx: OpNo); |
526 | return true; |
527 | } |
528 | if (AddOpSel) |
529 | MI->removeOperand(OpNo: MI->getNumExplicitOperands() - 1); |
530 | MI->setDesc(TII->get(Opcode: Opc)); |
531 | } |
532 | |
533 | // Special case for s_fmac_f32 if we are trying to fold into Src2. |
534 | // By transforming into fmaak we can untie Src2 and make folding legal. |
535 | if (Opc == AMDGPU::S_FMAC_F32 && OpNo == 3) { |
536 | if (tryToFoldAsFMAAKorMK()) |
537 | return true; |
538 | } |
539 | |
540 | // Special case for s_setreg_b32 |
541 | if (OpToFold->isImm()) { |
542 | unsigned ImmOpc = 0; |
543 | if (Opc == AMDGPU::S_SETREG_B32) |
544 | ImmOpc = AMDGPU::S_SETREG_IMM32_B32; |
545 | else if (Opc == AMDGPU::S_SETREG_B32_mode) |
546 | ImmOpc = AMDGPU::S_SETREG_IMM32_B32_mode; |
547 | if (ImmOpc) { |
548 | MI->setDesc(TII->get(Opcode: ImmOpc)); |
549 | appendFoldCandidate(FoldList, MI, OpNo, FoldOp: OpToFold); |
550 | return true; |
551 | } |
552 | } |
553 | |
554 | // If we are already folding into another operand of MI, then |
555 | // we can't commute the instruction, otherwise we risk making the |
556 | // other fold illegal. |
557 | if (isUseMIInFoldList(FoldList, MI)) |
558 | return false; |
559 | |
560 | // Operand is not legal, so try to commute the instruction to |
561 | // see if this makes it possible to fold. |
562 | unsigned CommuteOpNo = TargetInstrInfo::CommuteAnyOperandIndex; |
563 | bool CanCommute = TII->findCommutedOpIndices(MI: *MI, SrcOpIdx0&: OpNo, SrcOpIdx1&: CommuteOpNo); |
564 | if (!CanCommute) |
565 | return false; |
566 | |
    // One of the operands might be an Imm operand, and OpNo may refer to it
    // after the call to commuteInstruction() below. Such situations are
    // avoided here explicitly, as OpNo must be a register operand to be a
    // candidate for memory folding.
571 | if (!MI->getOperand(i: OpNo).isReg() || !MI->getOperand(i: CommuteOpNo).isReg()) |
572 | return false; |
573 | |
574 | if (!TII->commuteInstruction(MI&: *MI, NewMI: false, OpIdx1: OpNo, OpIdx2: CommuteOpNo)) |
575 | return false; |
576 | |
577 | int Op32 = -1; |
578 | if (!TII->isOperandLegal(MI: *MI, OpIdx: CommuteOpNo, MO: OpToFold)) { |
579 | if ((Opc != AMDGPU::V_ADD_CO_U32_e64 && Opc != AMDGPU::V_SUB_CO_U32_e64 && |
580 | Opc != AMDGPU::V_SUBREV_CO_U32_e64) || // FIXME |
581 | (!OpToFold->isImm() && !OpToFold->isFI() && !OpToFold->isGlobal())) { |
582 | TII->commuteInstruction(MI&: *MI, NewMI: false, OpIdx1: OpNo, OpIdx2: CommuteOpNo); |
583 | return false; |
584 | } |
585 | |
586 | // Verify the other operand is a VGPR, otherwise we would violate the |
587 | // constant bus restriction. |
588 | MachineOperand &OtherOp = MI->getOperand(i: OpNo); |
589 | if (!OtherOp.isReg() || |
590 | !TII->getRegisterInfo().isVGPR(MRI: *MRI, Reg: OtherOp.getReg())) |
591 | return false; |
592 | |
593 | assert(MI->getOperand(1).isDef()); |
594 | |
595 | // Make sure to get the 32-bit version of the commuted opcode. |
596 | unsigned MaybeCommutedOpc = MI->getOpcode(); |
597 | Op32 = AMDGPU::getVOPe32(Opcode: MaybeCommutedOpc); |
598 | } |
599 | |
600 | appendFoldCandidate(FoldList, MI, OpNo: CommuteOpNo, FoldOp: OpToFold, Commuted: true, ShrinkOp: Op32); |
601 | return true; |
602 | } |
603 | |
  // An inlinable constant might already have been folded into the Imm operand
  // of fmaak or fmamk while we are trying to fold a non-inlinable constant.
606 | if ((Opc == AMDGPU::S_FMAAK_F32 || Opc == AMDGPU::S_FMAMK_F32) && |
607 | !OpToFold->isReg() && !TII->isInlineConstant(MO: *OpToFold)) { |
608 | unsigned ImmIdx = Opc == AMDGPU::S_FMAAK_F32 ? 3 : 2; |
609 | MachineOperand &OpImm = MI->getOperand(i: ImmIdx); |
610 | if (!OpImm.isReg() && |
611 | TII->isInlineConstant(MI: *MI, UseMO: MI->getOperand(i: OpNo), DefMO: OpImm)) |
612 | return tryToFoldAsFMAAKorMK(); |
613 | } |
614 | |
  // Special case for s_fmac_f32 if we are trying to fold into Src0 or Src1.
  // By changing into fmamk we can untie Src2.
  // If folding for Src0 happens first and Src0 is identical to Src1, avoid
  // transforming into fmamk, which requires commuting: it would cause the
  // later fold into Src1 to fail because the wrong OpNo would be used.
620 | if (Opc == AMDGPU::S_FMAC_F32 && |
621 | (OpNo != 1 || !MI->getOperand(i: 1).isIdenticalTo(Other: MI->getOperand(i: 2)))) { |
622 | if (tryToFoldAsFMAAKorMK()) |
623 | return true; |
624 | } |
625 | |
626 | // Check the case where we might introduce a second constant operand to a |
627 | // scalar instruction |
628 | if (TII->isSALU(Opcode: MI->getOpcode())) { |
629 | const MCInstrDesc &InstDesc = MI->getDesc(); |
630 | const MCOperandInfo &OpInfo = InstDesc.operands()[OpNo]; |
631 | |
632 | // Fine if the operand can be encoded as an inline constant |
633 | if (!OpToFold->isReg() && !TII->isInlineConstant(MO: *OpToFold, OpInfo)) { |
634 | // Otherwise check for another constant |
635 | for (unsigned i = 0, e = InstDesc.getNumOperands(); i != e; ++i) { |
636 | auto &Op = MI->getOperand(i); |
637 | if (OpNo != i && !Op.isReg() && |
638 | !TII->isInlineConstant(MO: Op, OpInfo: InstDesc.operands()[i])) |
639 | return false; |
640 | } |
641 | } |
642 | } |
643 | |
644 | appendFoldCandidate(FoldList, MI, OpNo, FoldOp: OpToFold); |
645 | return true; |
646 | } |
647 | |
648 | bool SIFoldOperands::isUseSafeToFold(const MachineInstr &MI, |
649 | const MachineOperand &UseMO) const { |
650 | // Operands of SDWA instructions must be registers. |
651 | return !TII->isSDWA(MI); |
652 | } |
653 | |
// Find the def of UseReg, check that it is a reg_sequence, and collect the
// initializer for each subreg, tracking each back to a foldable inline
// immediate where possible. Returns true on success.
657 | bool SIFoldOperands::getRegSeqInit( |
658 | SmallVectorImpl<std::pair<MachineOperand *, unsigned>> &Defs, |
659 | Register UseReg, uint8_t OpTy) const { |
660 | MachineInstr *Def = MRI->getVRegDef(Reg: UseReg); |
661 | if (!Def || !Def->isRegSequence()) |
662 | return false; |
663 | |
664 | for (unsigned I = 1, E = Def->getNumExplicitOperands(); I < E; I += 2) { |
665 | MachineOperand *Sub = &Def->getOperand(i: I); |
666 | assert(Sub->isReg()); |
667 | |
668 | for (MachineInstr *SubDef = MRI->getVRegDef(Reg: Sub->getReg()); |
669 | SubDef && Sub->isReg() && Sub->getReg().isVirtual() && |
670 | !Sub->getSubReg() && TII->isFoldableCopy(MI: *SubDef); |
671 | SubDef = MRI->getVRegDef(Reg: Sub->getReg())) { |
672 | MachineOperand *Op = &SubDef->getOperand(i: 1); |
673 | if (Op->isImm()) { |
674 | if (TII->isInlineConstant(MO: *Op, OperandType: OpTy)) |
675 | Sub = Op; |
676 | break; |
677 | } |
678 | if (!Op->isReg() || Op->getReg().isPhysical()) |
679 | break; |
680 | Sub = Op; |
681 | } |
682 | |
683 | Defs.emplace_back(Args&: Sub, Args: Def->getOperand(i: I + 1).getImm()); |
684 | } |
685 | |
686 | return true; |
687 | } |
688 | |
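// Try to fold OpToFold into an operand of UseMI that only accepts inline
// constants: either directly as an immediate, through a foldable copy of an
// immediate, or through a reg_sequence whose initializers form a splat of a
// single inline constant.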
689 | bool SIFoldOperands::tryToFoldACImm( |
690 | const MachineOperand &OpToFold, MachineInstr *UseMI, unsigned UseOpIdx, |
691 | SmallVectorImpl<FoldCandidate> &FoldList) const { |
692 | const MCInstrDesc &Desc = UseMI->getDesc(); |
693 | if (UseOpIdx >= Desc.getNumOperands()) |
694 | return false; |
695 | |
696 | if (!AMDGPU::isSISrcInlinableOperand(Desc, OpNo: UseOpIdx)) |
697 | return false; |
698 | |
699 | uint8_t OpTy = Desc.operands()[UseOpIdx].OperandType; |
700 | if (OpToFold.isImm() && TII->isInlineConstant(MO: OpToFold, OperandType: OpTy) && |
701 | TII->isOperandLegal(MI: *UseMI, OpIdx: UseOpIdx, MO: &OpToFold)) { |
702 | UseMI->getOperand(i: UseOpIdx).ChangeToImmediate(ImmVal: OpToFold.getImm()); |
703 | return true; |
704 | } |
705 | |
706 | if (!OpToFold.isReg()) |
707 | return false; |
708 | |
709 | Register UseReg = OpToFold.getReg(); |
710 | if (!UseReg.isVirtual()) |
711 | return false; |
712 | |
713 | if (isUseMIInFoldList(FoldList, MI: UseMI)) |
714 | return false; |
715 | |
716 | // Maybe it is just a COPY of an immediate itself. |
717 | MachineInstr *Def = MRI->getVRegDef(Reg: UseReg); |
718 | MachineOperand &UseOp = UseMI->getOperand(i: UseOpIdx); |
719 | if (!UseOp.getSubReg() && Def && TII->isFoldableCopy(MI: *Def)) { |
720 | MachineOperand &DefOp = Def->getOperand(i: 1); |
721 | if (DefOp.isImm() && TII->isInlineConstant(MO: DefOp, OperandType: OpTy) && |
722 | TII->isOperandLegal(MI: *UseMI, OpIdx: UseOpIdx, MO: &DefOp)) { |
723 | UseMI->getOperand(i: UseOpIdx).ChangeToImmediate(ImmVal: DefOp.getImm()); |
724 | return true; |
725 | } |
726 | } |
727 | |
728 | SmallVector<std::pair<MachineOperand*, unsigned>, 32> Defs; |
729 | if (!getRegSeqInit(Defs, UseReg, OpTy)) |
730 | return false; |
731 | |
732 | int32_t Imm; |
733 | for (unsigned I = 0, E = Defs.size(); I != E; ++I) { |
734 | const MachineOperand *Op = Defs[I].first; |
735 | if (!Op->isImm()) |
736 | return false; |
737 | |
738 | auto SubImm = Op->getImm(); |
739 | if (!I) { |
740 | Imm = SubImm; |
741 | if (!TII->isInlineConstant(MO: *Op, OperandType: OpTy) || |
742 | !TII->isOperandLegal(MI: *UseMI, OpIdx: UseOpIdx, MO: Op)) |
743 | return false; |
744 | |
745 | continue; |
746 | } |
747 | if (Imm != SubImm) |
748 | return false; // Can only fold splat constants |
749 | } |
750 | |
751 | appendFoldCandidate(FoldList, MI: UseMI, OpNo: UseOpIdx, FoldOp: Defs[0].first); |
752 | return true; |
753 | } |
754 | |
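// Try to fold OpToFold, the source of a foldable copy or materialized
// immediate, into operand UseOpIdx of UseMI. Some cases (frame indices,
// copies, readfirstlane of an immediate) are rewritten immediately; otherwise
// a candidate is appended to FoldList, and any copies turned into movs are
// recorded in CopiesToReplace.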
755 | void SIFoldOperands::foldOperand( |
756 | MachineOperand &OpToFold, |
757 | MachineInstr *UseMI, |
758 | int UseOpIdx, |
759 | SmallVectorImpl<FoldCandidate> &FoldList, |
760 | SmallVectorImpl<MachineInstr *> &CopiesToReplace) const { |
761 | const MachineOperand *UseOp = &UseMI->getOperand(i: UseOpIdx); |
762 | |
763 | if (!isUseSafeToFold(MI: *UseMI, UseMO: *UseOp)) |
764 | return; |
765 | |
766 | // FIXME: Fold operands with subregs. |
767 | if (UseOp->isReg() && OpToFold.isReg() && |
768 | (UseOp->isImplicit() || UseOp->getSubReg() != AMDGPU::NoSubRegister)) |
769 | return; |
770 | |
771 | // Special case for REG_SEQUENCE: We can't fold literals into |
772 | // REG_SEQUENCE instructions, so we have to fold them into the |
773 | // uses of REG_SEQUENCE. |
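  // For example (illustrative):
  //   %0 = V_MOV_B32 imm
  //   %rs = REG_SEQUENCE %0, sub0, %1, sub1
  //   ... = USE %rs ...
  // The immediate is folded into the users of %rs rather than into the
  // REG_SEQUENCE itself.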
774 | if (UseMI->isRegSequence()) { |
775 | Register RegSeqDstReg = UseMI->getOperand(i: 0).getReg(); |
776 | unsigned RegSeqDstSubReg = UseMI->getOperand(i: UseOpIdx + 1).getImm(); |
777 | |
778 | // Grab the use operands first |
779 | SmallVector<MachineOperand *, 4> UsesToProcess; |
780 | for (auto &Use : MRI->use_nodbg_operands(Reg: RegSeqDstReg)) |
781 | UsesToProcess.push_back(Elt: &Use); |
782 | for (auto *RSUse : UsesToProcess) { |
783 | MachineInstr *RSUseMI = RSUse->getParent(); |
784 | |
785 | if (tryToFoldACImm(OpToFold: UseMI->getOperand(i: 0), UseMI: RSUseMI, |
786 | UseOpIdx: RSUseMI->getOperandNo(I: RSUse), FoldList)) |
787 | continue; |
788 | |
789 | if (RSUse->getSubReg() != RegSeqDstSubReg) |
790 | continue; |
791 | |
792 | foldOperand(OpToFold, UseMI: RSUseMI, UseOpIdx: RSUseMI->getOperandNo(I: RSUse), FoldList, |
793 | CopiesToReplace); |
794 | } |
795 | return; |
796 | } |
797 | |
798 | if (tryToFoldACImm(OpToFold, UseMI, UseOpIdx, FoldList)) |
799 | return; |
800 | |
801 | if (frameIndexMayFold(UseMI: *UseMI, OpNo: UseOpIdx, OpToFold)) { |
802 | // Verify that this is a stack access. |
803 | // FIXME: Should probably use stack pseudos before frame lowering. |
804 | |
805 | if (TII->isMUBUF(MI: *UseMI)) { |
806 | if (TII->getNamedOperand(MI&: *UseMI, OperandName: AMDGPU::OpName::srsrc)->getReg() != |
807 | MFI->getScratchRSrcReg()) |
808 | return; |
809 | |
810 | // Ensure this is either relative to the current frame or the current |
811 | // wave. |
812 | MachineOperand &SOff = |
813 | *TII->getNamedOperand(MI&: *UseMI, OperandName: AMDGPU::OpName::soffset); |
814 | if (!SOff.isImm() || SOff.getImm() != 0) |
815 | return; |
816 | } |
817 | |
818 | // A frame index will resolve to a positive constant, so it should always be |
819 | // safe to fold the addressing mode, even pre-GFX9. |
820 | UseMI->getOperand(i: UseOpIdx).ChangeToFrameIndex(Idx: OpToFold.getIndex()); |
821 | |
822 | const unsigned Opc = UseMI->getOpcode(); |
823 | if (TII->isFLATScratch(MI: *UseMI) && |
824 | AMDGPU::hasNamedOperand(Opcode: Opc, NamedIdx: AMDGPU::OpName::vaddr) && |
825 | !AMDGPU::hasNamedOperand(Opcode: Opc, NamedIdx: AMDGPU::OpName::saddr)) { |
826 | unsigned NewOpc = AMDGPU::getFlatScratchInstSSfromSV(Opcode: Opc); |
827 | UseMI->setDesc(TII->get(Opcode: NewOpc)); |
828 | } |
829 | |
830 | return; |
831 | } |
832 | |
833 | bool FoldingImmLike = |
834 | OpToFold.isImm() || OpToFold.isFI() || OpToFold.isGlobal(); |
835 | |
836 | if (FoldingImmLike && UseMI->isCopy()) { |
837 | Register DestReg = UseMI->getOperand(i: 0).getReg(); |
838 | Register SrcReg = UseMI->getOperand(i: 1).getReg(); |
839 | assert(SrcReg.isVirtual()); |
840 | |
841 | const TargetRegisterClass *SrcRC = MRI->getRegClass(Reg: SrcReg); |
842 | |
    // Don't fold into a copy to a physical register with the same class. Doing
    // so would interfere with the register coalescer's logic, which avoids
    // redundant initializations.
846 | if (DestReg.isPhysical() && SrcRC->contains(Reg: DestReg)) |
847 | return; |
848 | |
849 | const TargetRegisterClass *DestRC = TRI->getRegClassForReg(MRI: *MRI, Reg: DestReg); |
850 | if (!DestReg.isPhysical()) { |
851 | if (DestRC == &AMDGPU::AGPR_32RegClass && |
852 | TII->isInlineConstant(MO: OpToFold, OperandType: AMDGPU::OPERAND_REG_INLINE_C_INT32)) { |
853 | UseMI->setDesc(TII->get(Opcode: AMDGPU::V_ACCVGPR_WRITE_B32_e64)); |
854 | UseMI->getOperand(i: 1).ChangeToImmediate(ImmVal: OpToFold.getImm()); |
855 | CopiesToReplace.push_back(Elt: UseMI); |
856 | return; |
857 | } |
858 | } |
859 | |
860 | // In order to fold immediates into copies, we need to change the |
861 | // copy to a MOV. |
862 | |
863 | unsigned MovOp = TII->getMovOpcode(DstRC: DestRC); |
864 | if (MovOp == AMDGPU::COPY) |
865 | return; |
866 | |
867 | MachineInstr::mop_iterator ImpOpI = UseMI->implicit_operands().begin(); |
868 | MachineInstr::mop_iterator ImpOpE = UseMI->implicit_operands().end(); |
869 | while (ImpOpI != ImpOpE) { |
870 | MachineInstr::mop_iterator Tmp = ImpOpI; |
871 | ImpOpI++; |
872 | UseMI->removeOperand(OpNo: UseMI->getOperandNo(I: Tmp)); |
873 | } |
874 | UseMI->setDesc(TII->get(Opcode: MovOp)); |
875 | |
876 | if (MovOp == AMDGPU::V_MOV_B16_t16_e64) { |
877 | const auto &SrcOp = UseMI->getOperand(i: UseOpIdx); |
878 | MachineOperand NewSrcOp(SrcOp); |
879 | MachineFunction *MF = UseMI->getParent()->getParent(); |
880 | UseMI->removeOperand(OpNo: 1); |
881 | UseMI->addOperand(MF&: *MF, Op: MachineOperand::CreateImm(Val: 0)); // src0_modifiers |
882 | UseMI->addOperand(Op: NewSrcOp); // src0 |
883 | UseMI->addOperand(MF&: *MF, Op: MachineOperand::CreateImm(Val: 0)); // op_sel |
884 | UseOpIdx = 2; |
885 | UseOp = &UseMI->getOperand(i: UseOpIdx); |
886 | } |
887 | CopiesToReplace.push_back(Elt: UseMI); |
888 | } else { |
889 | if (UseMI->isCopy() && OpToFold.isReg() && |
890 | UseMI->getOperand(i: 0).getReg().isVirtual() && |
891 | !UseMI->getOperand(i: 1).getSubReg()) { |
892 | LLVM_DEBUG(dbgs() << "Folding " << OpToFold << "\n into " << *UseMI); |
893 | unsigned Size = TII->getOpSize(MI: *UseMI, OpNo: 1); |
894 | Register UseReg = OpToFold.getReg(); |
895 | UseMI->getOperand(i: 1).setReg(UseReg); |
896 | UseMI->getOperand(i: 1).setSubReg(OpToFold.getSubReg()); |
897 | UseMI->getOperand(i: 1).setIsKill(false); |
898 | CopiesToReplace.push_back(Elt: UseMI); |
899 | OpToFold.setIsKill(false); |
900 | |
901 | // Remove kill flags as kills may now be out of order with uses. |
902 | MRI->clearKillFlags(Reg: OpToFold.getReg()); |
903 | |
      // Storing a value into an AGPR is tricky: v_accvgpr_write_b32 can only
      // accept a VGPR or an inline immediate. Recreate the reg_sequence with
      // its initializers right here, so that immediates are rematerialized and
      // copies via different reg classes are avoided.
908 | SmallVector<std::pair<MachineOperand*, unsigned>, 32> Defs; |
909 | if (Size > 4 && TRI->isAGPR(MRI: *MRI, Reg: UseMI->getOperand(i: 0).getReg()) && |
910 | getRegSeqInit(Defs, UseReg, OpTy: AMDGPU::OPERAND_REG_INLINE_C_INT32)) { |
911 | const DebugLoc &DL = UseMI->getDebugLoc(); |
912 | MachineBasicBlock &MBB = *UseMI->getParent(); |
913 | |
914 | UseMI->setDesc(TII->get(Opcode: AMDGPU::REG_SEQUENCE)); |
915 | for (unsigned I = UseMI->getNumOperands() - 1; I > 0; --I) |
916 | UseMI->removeOperand(OpNo: I); |
917 | |
918 | MachineInstrBuilder B(*MBB.getParent(), UseMI); |
919 | DenseMap<TargetInstrInfo::RegSubRegPair, Register> VGPRCopies; |
920 | SmallSetVector<TargetInstrInfo::RegSubRegPair, 32> SeenAGPRs; |
921 | for (unsigned I = 0; I < Size / 4; ++I) { |
922 | MachineOperand *Def = Defs[I].first; |
923 | TargetInstrInfo::RegSubRegPair CopyToVGPR; |
924 | if (Def->isImm() && |
925 | TII->isInlineConstant(MO: *Def, OperandType: AMDGPU::OPERAND_REG_INLINE_C_INT32)) { |
926 | int64_t Imm = Def->getImm(); |
927 | |
928 | auto Tmp = MRI->createVirtualRegister(RegClass: &AMDGPU::AGPR_32RegClass); |
929 | BuildMI(BB&: MBB, I: UseMI, MIMD: DL, |
930 | MCID: TII->get(Opcode: AMDGPU::V_ACCVGPR_WRITE_B32_e64), DestReg: Tmp).addImm(Val: Imm); |
931 | B.addReg(RegNo: Tmp); |
932 | } else if (Def->isReg() && TRI->isAGPR(MRI: *MRI, Reg: Def->getReg())) { |
933 | auto Src = getRegSubRegPair(O: *Def); |
934 | Def->setIsKill(false); |
935 | if (!SeenAGPRs.insert(X: Src)) { |
              // We cannot build a reg_sequence out of the same registers; they
              // must be copied. Better to do it here, before copyPhysReg()
              // later creates several reads to do the AGPR->VGPR->AGPR copy.
939 | CopyToVGPR = Src; |
940 | } else { |
941 | B.addReg(RegNo: Src.Reg, flags: Def->isUndef() ? RegState::Undef : 0, |
942 | SubReg: Src.SubReg); |
943 | } |
944 | } else { |
945 | assert(Def->isReg()); |
946 | Def->setIsKill(false); |
947 | auto Src = getRegSubRegPair(O: *Def); |
948 | |
            // A direct copy from SGPR to AGPR is not possible. To avoid
            // copyPhysReg() later exploding it into SGPR->VGPR->AGPR copies,
            // create the copy here and track whether we already have one.
952 | if (TRI->isSGPRReg(MRI: *MRI, Reg: Src.Reg)) { |
953 | CopyToVGPR = Src; |
954 | } else { |
955 | auto Tmp = MRI->createVirtualRegister(RegClass: &AMDGPU::AGPR_32RegClass); |
956 | BuildMI(BB&: MBB, I: UseMI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::COPY), DestReg: Tmp).add(MO: *Def); |
957 | B.addReg(RegNo: Tmp); |
958 | } |
959 | } |
960 | |
961 | if (CopyToVGPR.Reg) { |
962 | Register Vgpr; |
963 | if (VGPRCopies.count(Val: CopyToVGPR)) { |
964 | Vgpr = VGPRCopies[CopyToVGPR]; |
965 | } else { |
966 | Vgpr = MRI->createVirtualRegister(RegClass: &AMDGPU::VGPR_32RegClass); |
967 | BuildMI(BB&: MBB, I: UseMI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::COPY), DestReg: Vgpr).add(MO: *Def); |
968 | VGPRCopies[CopyToVGPR] = Vgpr; |
969 | } |
970 | auto Tmp = MRI->createVirtualRegister(RegClass: &AMDGPU::AGPR_32RegClass); |
971 | BuildMI(BB&: MBB, I: UseMI, MIMD: DL, |
972 | MCID: TII->get(Opcode: AMDGPU::V_ACCVGPR_WRITE_B32_e64), DestReg: Tmp).addReg(RegNo: Vgpr); |
973 | B.addReg(RegNo: Tmp); |
974 | } |
975 | |
976 | B.addImm(Val: Defs[I].second); |
977 | } |
978 | LLVM_DEBUG(dbgs() << "Folded " << *UseMI); |
979 | return; |
980 | } |
981 | |
982 | if (Size != 4) |
983 | return; |
984 | |
985 | Register Reg0 = UseMI->getOperand(i: 0).getReg(); |
986 | Register Reg1 = UseMI->getOperand(i: 1).getReg(); |
987 | if (TRI->isAGPR(MRI: *MRI, Reg: Reg0) && TRI->isVGPR(MRI: *MRI, Reg: Reg1)) |
988 | UseMI->setDesc(TII->get(Opcode: AMDGPU::V_ACCVGPR_WRITE_B32_e64)); |
989 | else if (TRI->isVGPR(MRI: *MRI, Reg: Reg0) && TRI->isAGPR(MRI: *MRI, Reg: Reg1)) |
990 | UseMI->setDesc(TII->get(Opcode: AMDGPU::V_ACCVGPR_READ_B32_e64)); |
991 | else if (ST->hasGFX90AInsts() && TRI->isAGPR(MRI: *MRI, Reg: Reg0) && |
992 | TRI->isAGPR(MRI: *MRI, Reg: Reg1)) |
993 | UseMI->setDesc(TII->get(Opcode: AMDGPU::V_ACCVGPR_MOV_B32)); |
994 | return; |
995 | } |
996 | |
997 | unsigned UseOpc = UseMI->getOpcode(); |
998 | if (UseOpc == AMDGPU::V_READFIRSTLANE_B32 || |
999 | (UseOpc == AMDGPU::V_READLANE_B32 && |
1000 | (int)UseOpIdx == |
1001 | AMDGPU::getNamedOperandIdx(Opcode: UseOpc, NamedIdx: AMDGPU::OpName::src0))) { |
1002 | // %vgpr = V_MOV_B32 imm |
1003 | // %sgpr = V_READFIRSTLANE_B32 %vgpr |
1004 | // => |
1005 | // %sgpr = S_MOV_B32 imm |
1006 | if (FoldingImmLike) { |
1007 | if (execMayBeModifiedBeforeUse(MRI: *MRI, |
1008 | VReg: UseMI->getOperand(i: UseOpIdx).getReg(), |
1009 | DefMI: *OpToFold.getParent(), |
1010 | UseMI: *UseMI)) |
1011 | return; |
1012 | |
1013 | UseMI->setDesc(TII->get(Opcode: AMDGPU::S_MOV_B32)); |
1014 | |
1015 | if (OpToFold.isImm()) |
1016 | UseMI->getOperand(i: 1).ChangeToImmediate(ImmVal: OpToFold.getImm()); |
1017 | else |
1018 | UseMI->getOperand(i: 1).ChangeToFrameIndex(Idx: OpToFold.getIndex()); |
1019 | UseMI->removeOperand(OpNo: 2); // Remove exec read (or src1 for readlane) |
1020 | return; |
1021 | } |
1022 | |
1023 | if (OpToFold.isReg() && TRI->isSGPRReg(MRI: *MRI, Reg: OpToFold.getReg())) { |
1024 | if (execMayBeModifiedBeforeUse(MRI: *MRI, |
1025 | VReg: UseMI->getOperand(i: UseOpIdx).getReg(), |
1026 | DefMI: *OpToFold.getParent(), |
1027 | UseMI: *UseMI)) |
1028 | return; |
1029 | |
1030 | // %vgpr = COPY %sgpr0 |
1031 | // %sgpr1 = V_READFIRSTLANE_B32 %vgpr |
1032 | // => |
1033 | // %sgpr1 = COPY %sgpr0 |
1034 | UseMI->setDesc(TII->get(Opcode: AMDGPU::COPY)); |
1035 | UseMI->getOperand(i: 1).setReg(OpToFold.getReg()); |
1036 | UseMI->getOperand(i: 1).setSubReg(OpToFold.getSubReg()); |
1037 | UseMI->getOperand(i: 1).setIsKill(false); |
1038 | UseMI->removeOperand(OpNo: 2); // Remove exec read (or src1 for readlane) |
1039 | return; |
1040 | } |
1041 | } |
1042 | |
1043 | const MCInstrDesc &UseDesc = UseMI->getDesc(); |
1044 | |
1045 | // Don't fold into target independent nodes. Target independent opcodes |
1046 | // don't have defined register classes. |
1047 | if (UseDesc.isVariadic() || UseOp->isImplicit() || |
1048 | UseDesc.operands()[UseOpIdx].RegClass == -1) |
1049 | return; |
1050 | } |
1051 | |
1052 | if (!FoldingImmLike) { |
1053 | if (OpToFold.isReg() && ST->needsAlignedVGPRs()) { |
1054 | // Don't fold if OpToFold doesn't hold an aligned register. |
1055 | const TargetRegisterClass *RC = |
1056 | TRI->getRegClassForReg(MRI: *MRI, Reg: OpToFold.getReg()); |
1057 | assert(RC); |
1058 | if (TRI->hasVectorRegisters(RC) && OpToFold.getSubReg()) { |
1059 | unsigned SubReg = OpToFold.getSubReg(); |
1060 | if (const TargetRegisterClass *SubRC = |
1061 | TRI->getSubRegisterClass(RC, SubReg)) |
1062 | RC = SubRC; |
1063 | } |
1064 | |
1065 | if (!RC || !TRI->isProperlyAlignedRC(RC: *RC)) |
1066 | return; |
1067 | } |
1068 | |
1069 | tryAddToFoldList(FoldList, MI: UseMI, OpNo: UseOpIdx, OpToFold: &OpToFold); |
1070 | |
1071 | // FIXME: We could try to change the instruction from 64-bit to 32-bit |
1072 | // to enable more folding opportunities. The shrink operands pass |
1073 | // already does this. |
1074 | return; |
1075 | } |
1076 | |
1077 | |
1078 | const MCInstrDesc &FoldDesc = OpToFold.getParent()->getDesc(); |
1079 | const TargetRegisterClass *FoldRC = |
1080 | TRI->getRegClass(RCID: FoldDesc.operands()[0].RegClass); |
1081 | |
1082 | // Split 64-bit constants into 32-bits for folding. |
1083 | if (UseOp->getSubReg() && AMDGPU::getRegBitWidth(RC: *FoldRC) == 64) { |
1084 | Register UseReg = UseOp->getReg(); |
1085 | const TargetRegisterClass *UseRC = MRI->getRegClass(Reg: UseReg); |
1086 | if (AMDGPU::getRegBitWidth(RC: *UseRC) != 64) |
1087 | return; |
1088 | |
1089 | APInt Imm(64, OpToFold.getImm()); |
1090 | if (UseOp->getSubReg() == AMDGPU::sub0) { |
1091 | Imm = Imm.getLoBits(numBits: 32); |
1092 | } else { |
1093 | assert(UseOp->getSubReg() == AMDGPU::sub1); |
1094 | Imm = Imm.getHiBits(numBits: 32); |
1095 | } |
1096 | |
1097 | MachineOperand ImmOp = MachineOperand::CreateImm(Val: Imm.getSExtValue()); |
1098 | tryAddToFoldList(FoldList, MI: UseMI, OpNo: UseOpIdx, OpToFold: &ImmOp); |
1099 | return; |
1100 | } |
1101 | |
1102 | tryAddToFoldList(FoldList, MI: UseMI, OpNo: UseOpIdx, OpToFold: &OpToFold); |
1103 | } |
1104 | |
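// Evaluate a 32-bit bitwise or shift instruction with constant operands.
// Returns true and sets Result for the handled opcodes.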
1105 | static bool evalBinaryInstruction(unsigned Opcode, int32_t &Result, |
1106 | uint32_t LHS, uint32_t RHS) { |
1107 | switch (Opcode) { |
1108 | case AMDGPU::V_AND_B32_e64: |
1109 | case AMDGPU::V_AND_B32_e32: |
1110 | case AMDGPU::S_AND_B32: |
1111 | Result = LHS & RHS; |
1112 | return true; |
1113 | case AMDGPU::V_OR_B32_e64: |
1114 | case AMDGPU::V_OR_B32_e32: |
1115 | case AMDGPU::S_OR_B32: |
1116 | Result = LHS | RHS; |
1117 | return true; |
1118 | case AMDGPU::V_XOR_B32_e64: |
1119 | case AMDGPU::V_XOR_B32_e32: |
1120 | case AMDGPU::S_XOR_B32: |
1121 | Result = LHS ^ RHS; |
1122 | return true; |
1123 | case AMDGPU::S_XNOR_B32: |
1124 | Result = ~(LHS ^ RHS); |
1125 | return true; |
1126 | case AMDGPU::S_NAND_B32: |
1127 | Result = ~(LHS & RHS); |
1128 | return true; |
1129 | case AMDGPU::S_NOR_B32: |
1130 | Result = ~(LHS | RHS); |
1131 | return true; |
1132 | case AMDGPU::S_ANDN2_B32: |
1133 | Result = LHS & ~RHS; |
1134 | return true; |
1135 | case AMDGPU::S_ORN2_B32: |
1136 | Result = LHS | ~RHS; |
1137 | return true; |
1138 | case AMDGPU::V_LSHL_B32_e64: |
1139 | case AMDGPU::V_LSHL_B32_e32: |
1140 | case AMDGPU::S_LSHL_B32: |
1141 | // The instruction ignores the high bits for out of bounds shifts. |
1142 | Result = LHS << (RHS & 31); |
1143 | return true; |
1144 | case AMDGPU::V_LSHLREV_B32_e64: |
1145 | case AMDGPU::V_LSHLREV_B32_e32: |
1146 | Result = RHS << (LHS & 31); |
1147 | return true; |
1148 | case AMDGPU::V_LSHR_B32_e64: |
1149 | case AMDGPU::V_LSHR_B32_e32: |
1150 | case AMDGPU::S_LSHR_B32: |
1151 | Result = LHS >> (RHS & 31); |
1152 | return true; |
1153 | case AMDGPU::V_LSHRREV_B32_e64: |
1154 | case AMDGPU::V_LSHRREV_B32_e32: |
1155 | Result = RHS >> (LHS & 31); |
1156 | return true; |
1157 | case AMDGPU::V_ASHR_I32_e64: |
1158 | case AMDGPU::V_ASHR_I32_e32: |
1159 | case AMDGPU::S_ASHR_I32: |
1160 | Result = static_cast<int32_t>(LHS) >> (RHS & 31); |
1161 | return true; |
1162 | case AMDGPU::V_ASHRREV_I32_e64: |
1163 | case AMDGPU::V_ASHRREV_I32_e32: |
1164 | Result = static_cast<int32_t>(RHS) >> (LHS & 31); |
1165 | return true; |
1166 | default: |
1167 | return false; |
1168 | } |
1169 | } |
1170 | |
1171 | static unsigned getMovOpc(bool IsScalar) { |
1172 | return IsScalar ? AMDGPU::S_MOV_B32 : AMDGPU::V_MOV_B32_e32; |
1173 | } |
1174 | |
1175 | static void mutateCopyOp(MachineInstr &MI, const MCInstrDesc &NewDesc) { |
1176 | MI.setDesc(NewDesc); |
1177 | |
1178 | // Remove any leftover implicit operands from mutating the instruction. e.g. |
1179 | // if we replace an s_and_b32 with a copy, we don't need the implicit scc def |
1180 | // anymore. |
1181 | const MCInstrDesc &Desc = MI.getDesc(); |
1182 | unsigned NumOps = Desc.getNumOperands() + Desc.implicit_uses().size() + |
1183 | Desc.implicit_defs().size(); |
1184 | |
1185 | for (unsigned I = MI.getNumOperands() - 1; I >= NumOps; --I) |
1186 | MI.removeOperand(OpNo: I); |
1187 | } |
1188 | |
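// If Op is a virtual register (without a subregister) defined by a
// move-immediate, return that immediate operand; otherwise return Op itself.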
1189 | MachineOperand * |
1190 | SIFoldOperands::getImmOrMaterializedImm(MachineOperand &Op) const { |
1191 | // If this has a subregister, it obviously is a register source. |
1192 | if (!Op.isReg() || Op.getSubReg() != AMDGPU::NoSubRegister || |
1193 | !Op.getReg().isVirtual()) |
1194 | return &Op; |
1195 | |
1196 | MachineInstr *Def = MRI->getVRegDef(Reg: Op.getReg()); |
1197 | if (Def && Def->isMoveImmediate()) { |
1198 | MachineOperand &ImmSrc = Def->getOperand(i: 1); |
1199 | if (ImmSrc.isImm()) |
1200 | return &ImmSrc; |
1201 | } |
1202 | |
1203 | return &Op; |
1204 | } |
1205 | |
1206 | // Try to simplify operations with a constant that may appear after instruction |
1207 | // selection. |
1208 | // TODO: See if a frame index with a fixed offset can fold. |
1209 | bool SIFoldOperands::tryConstantFoldOp(MachineInstr *MI) const { |
1210 | if (!MI->allImplicitDefsAreDead()) |
1211 | return false; |
1212 | |
1213 | unsigned Opc = MI->getOpcode(); |
1214 | |
1215 | int Src0Idx = AMDGPU::getNamedOperandIdx(Opcode: Opc, NamedIdx: AMDGPU::OpName::src0); |
1216 | if (Src0Idx == -1) |
1217 | return false; |
1218 | MachineOperand *Src0 = getImmOrMaterializedImm(Op&: MI->getOperand(i: Src0Idx)); |
1219 | |
1220 | if ((Opc == AMDGPU::V_NOT_B32_e64 || Opc == AMDGPU::V_NOT_B32_e32 || |
1221 | Opc == AMDGPU::S_NOT_B32) && |
1222 | Src0->isImm()) { |
1223 | MI->getOperand(i: 1).ChangeToImmediate(ImmVal: ~Src0->getImm()); |
1224 | mutateCopyOp(MI&: *MI, NewDesc: TII->get(Opcode: getMovOpc(IsScalar: Opc == AMDGPU::S_NOT_B32))); |
1225 | return true; |
1226 | } |
1227 | |
1228 | int Src1Idx = AMDGPU::getNamedOperandIdx(Opcode: Opc, NamedIdx: AMDGPU::OpName::src1); |
1229 | if (Src1Idx == -1) |
1230 | return false; |
1231 | MachineOperand *Src1 = getImmOrMaterializedImm(Op&: MI->getOperand(i: Src1Idx)); |
1232 | |
1233 | if (!Src0->isImm() && !Src1->isImm()) |
1234 | return false; |
1235 | |
1236 | // and k0, k1 -> v_mov_b32 (k0 & k1) |
1237 | // or k0, k1 -> v_mov_b32 (k0 | k1) |
1238 | // xor k0, k1 -> v_mov_b32 (k0 ^ k1) |
1239 | if (Src0->isImm() && Src1->isImm()) { |
1240 | int32_t NewImm; |
1241 | if (!evalBinaryInstruction(Opcode: Opc, Result&: NewImm, LHS: Src0->getImm(), RHS: Src1->getImm())) |
1242 | return false; |
1243 | |
1244 | bool IsSGPR = TRI->isSGPRReg(MRI: *MRI, Reg: MI->getOperand(i: 0).getReg()); |
1245 | |
1246 | // Be careful to change the right operand, src0 may belong to a different |
1247 | // instruction. |
1248 | MI->getOperand(i: Src0Idx).ChangeToImmediate(ImmVal: NewImm); |
1249 | MI->removeOperand(OpNo: Src1Idx); |
1250 | mutateCopyOp(MI&: *MI, NewDesc: TII->get(Opcode: getMovOpc(IsScalar: IsSGPR))); |
1251 | return true; |
1252 | } |
1253 | |
1254 | if (!MI->isCommutable()) |
1255 | return false; |
1256 | |
1257 | if (Src0->isImm() && !Src1->isImm()) { |
1258 | std::swap(a&: Src0, b&: Src1); |
1259 | std::swap(a&: Src0Idx, b&: Src1Idx); |
1260 | } |
1261 | |
1262 | int32_t Src1Val = static_cast<int32_t>(Src1->getImm()); |
1263 | if (Opc == AMDGPU::V_OR_B32_e64 || |
1264 | Opc == AMDGPU::V_OR_B32_e32 || |
1265 | Opc == AMDGPU::S_OR_B32) { |
1266 | if (Src1Val == 0) { |
1267 | // y = or x, 0 => y = copy x |
1268 | MI->removeOperand(OpNo: Src1Idx); |
1269 | mutateCopyOp(MI&: *MI, NewDesc: TII->get(Opcode: AMDGPU::COPY)); |
1270 | } else if (Src1Val == -1) { |
1271 | // y = or x, -1 => y = v_mov_b32 -1 |
1272 | MI->removeOperand(OpNo: Src1Idx); |
1273 | mutateCopyOp(MI&: *MI, NewDesc: TII->get(Opcode: getMovOpc(IsScalar: Opc == AMDGPU::S_OR_B32))); |
1274 | } else |
1275 | return false; |
1276 | |
1277 | return true; |
1278 | } |
1279 | |
1280 | if (Opc == AMDGPU::V_AND_B32_e64 || Opc == AMDGPU::V_AND_B32_e32 || |
1281 | Opc == AMDGPU::S_AND_B32) { |
1282 | if (Src1Val == 0) { |
1283 | // y = and x, 0 => y = v_mov_b32 0 |
1284 | MI->removeOperand(OpNo: Src0Idx); |
1285 | mutateCopyOp(MI&: *MI, NewDesc: TII->get(Opcode: getMovOpc(IsScalar: Opc == AMDGPU::S_AND_B32))); |
1286 | } else if (Src1Val == -1) { |
1287 | // y = and x, -1 => y = copy x |
1288 | MI->removeOperand(OpNo: Src1Idx); |
1289 | mutateCopyOp(MI&: *MI, NewDesc: TII->get(Opcode: AMDGPU::COPY)); |
1290 | } else |
1291 | return false; |
1292 | |
1293 | return true; |
1294 | } |
1295 | |
1296 | if (Opc == AMDGPU::V_XOR_B32_e64 || Opc == AMDGPU::V_XOR_B32_e32 || |
1297 | Opc == AMDGPU::S_XOR_B32) { |
1298 | if (Src1Val == 0) { |
1299 | // y = xor x, 0 => y = copy x |
1300 | MI->removeOperand(OpNo: Src1Idx); |
1301 | mutateCopyOp(MI&: *MI, NewDesc: TII->get(Opcode: AMDGPU::COPY)); |
1302 | return true; |
1303 | } |
1304 | } |
1305 | |
1306 | return false; |
1307 | } |
1308 | |
// Try to fold a V_CNDMASK whose two source operands are identical (possibly
// via materialized immediates) into a copy or mov of that source.
1310 | bool SIFoldOperands::tryFoldCndMask(MachineInstr &MI) const { |
1311 | unsigned Opc = MI.getOpcode(); |
1312 | if (Opc != AMDGPU::V_CNDMASK_B32_e32 && Opc != AMDGPU::V_CNDMASK_B32_e64 && |
1313 | Opc != AMDGPU::V_CNDMASK_B64_PSEUDO) |
1314 | return false; |
1315 | |
1316 | MachineOperand *Src0 = TII->getNamedOperand(MI, OperandName: AMDGPU::OpName::src0); |
1317 | MachineOperand *Src1 = TII->getNamedOperand(MI, OperandName: AMDGPU::OpName::src1); |
1318 | if (!Src1->isIdenticalTo(Other: *Src0)) { |
1319 | auto *Src0Imm = getImmOrMaterializedImm(Op&: *Src0); |
1320 | auto *Src1Imm = getImmOrMaterializedImm(Op&: *Src1); |
1321 | if (!Src1Imm->isIdenticalTo(Other: *Src0Imm)) |
1322 | return false; |
1323 | } |
1324 | |
1325 | int Src1ModIdx = |
1326 | AMDGPU::getNamedOperandIdx(Opcode: Opc, NamedIdx: AMDGPU::OpName::src1_modifiers); |
1327 | int Src0ModIdx = |
1328 | AMDGPU::getNamedOperandIdx(Opcode: Opc, NamedIdx: AMDGPU::OpName::src0_modifiers); |
1329 | if ((Src1ModIdx != -1 && MI.getOperand(i: Src1ModIdx).getImm() != 0) || |
1330 | (Src0ModIdx != -1 && MI.getOperand(i: Src0ModIdx).getImm() != 0)) |
1331 | return false; |
1332 | |
1333 | LLVM_DEBUG(dbgs() << "Folded " << MI << " into " ); |
1334 | auto &NewDesc = |
1335 | TII->get(Opcode: Src0->isReg() ? (unsigned)AMDGPU::COPY : getMovOpc(IsScalar: false)); |
1336 | int Src2Idx = AMDGPU::getNamedOperandIdx(Opcode: Opc, NamedIdx: AMDGPU::OpName::src2); |
1337 | if (Src2Idx != -1) |
1338 | MI.removeOperand(OpNo: Src2Idx); |
1339 | MI.removeOperand(OpNo: AMDGPU::getNamedOperandIdx(Opcode: Opc, NamedIdx: AMDGPU::OpName::src1)); |
1340 | if (Src1ModIdx != -1) |
1341 | MI.removeOperand(OpNo: Src1ModIdx); |
1342 | if (Src0ModIdx != -1) |
1343 | MI.removeOperand(OpNo: Src0ModIdx); |
1344 | mutateCopyOp(MI, NewDesc); |
1345 | LLVM_DEBUG(dbgs() << MI); |
1346 | return true; |
1347 | } |
1348 | |
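// Erase a "v_and_b32 dst, 0xffff, src" when the instruction defining src
// already zeroes the high 16 bits of its result, replacing dst with src.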
1349 | bool SIFoldOperands::tryFoldZeroHighBits(MachineInstr &MI) const { |
1350 | if (MI.getOpcode() != AMDGPU::V_AND_B32_e64 && |
1351 | MI.getOpcode() != AMDGPU::V_AND_B32_e32) |
1352 | return false; |
1353 | |
1354 | MachineOperand *Src0 = getImmOrMaterializedImm(Op&: MI.getOperand(i: 1)); |
1355 | if (!Src0->isImm() || Src0->getImm() != 0xffff) |
1356 | return false; |
1357 | |
1358 | Register Src1 = MI.getOperand(i: 2).getReg(); |
1359 | MachineInstr *SrcDef = MRI->getVRegDef(Reg: Src1); |
1360 | if (!ST->zeroesHigh16BitsOfDest(Opcode: SrcDef->getOpcode())) |
1361 | return false; |
1362 | |
1363 | Register Dst = MI.getOperand(i: 0).getReg(); |
1364 | MRI->replaceRegWith(FromReg: Dst, ToReg: Src1); |
1365 | if (!MI.getOperand(i: 2).isKill()) |
1366 | MRI->clearKillFlags(Reg: Src1); |
1367 | MI.eraseFromParent(); |
1368 | return true; |
1369 | } |
1370 | |
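// Fold OpToFold, the source of the foldable definition MI, into the uses of
// MI's destination register: constant-fold users where possible, collect fold
// candidates for the rest, and commit them via updateOperand().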
1371 | bool SIFoldOperands::foldInstOperand(MachineInstr &MI, |
1372 | MachineOperand &OpToFold) const { |
  // We need to mutate the operands of new mov instructions to add implicit
  // uses of EXEC, but adding them invalidates the use_iterator, so defer
  // this.
1376 | SmallVector<MachineInstr *, 4> CopiesToReplace; |
1377 | SmallVector<FoldCandidate, 4> FoldList; |
1378 | MachineOperand &Dst = MI.getOperand(i: 0); |
1379 | bool Changed = false; |
1380 | |
1381 | if (OpToFold.isImm()) { |
1382 | for (auto &UseMI : |
1383 | make_early_inc_range(Range: MRI->use_nodbg_instructions(Reg: Dst.getReg()))) { |
1384 | // Folding the immediate may reveal operations that can be constant |
1385 | // folded or replaced with a copy. This can happen for example after |
1386 | // frame indices are lowered to constants or from splitting 64-bit |
1387 | // constants. |
1388 | // |
1389 | // We may also encounter cases where one or both operands are |
1390 | // immediates materialized into a register, which would ordinarily not |
1391 | // be folded due to multiple uses or operand constraints. |
      if (tryConstantFoldOp(&UseMI)) {
        LLVM_DEBUG(dbgs() << "Constant folded " << UseMI);
        Changed = true;
      }
    }
  }

  SmallVector<MachineOperand *, 4> UsesToProcess;
  for (auto &Use : MRI->use_nodbg_operands(Dst.getReg()))
    UsesToProcess.push_back(&Use);
  for (auto *U : UsesToProcess) {
    MachineInstr *UseMI = U->getParent();
    foldOperand(OpToFold, UseMI, UseMI->getOperandNo(U), FoldList,
                CopiesToReplace);
  }

  if (CopiesToReplace.empty() && FoldList.empty())
    return Changed;

  MachineFunction *MF = MI.getParent()->getParent();
  // Make sure we add EXEC uses to any new v_mov instructions created.
  for (MachineInstr *Copy : CopiesToReplace)
    Copy->addImplicitDefUseOperands(*MF);

  for (FoldCandidate &Fold : FoldList) {
    assert(!Fold.isReg() || Fold.OpToFold);
    if (Fold.isReg() && Fold.OpToFold->getReg().isVirtual()) {
      Register Reg = Fold.OpToFold->getReg();
      MachineInstr *DefMI = Fold.OpToFold->getParent();
      if (DefMI->readsRegister(AMDGPU::EXEC, TRI) &&
          execMayBeModifiedBeforeUse(*MRI, Reg, *DefMI, *Fold.UseMI))
        continue;
    }
    if (updateOperand(Fold)) {
      // Clear kill flags.
      if (Fold.isReg()) {
        assert(Fold.OpToFold && Fold.OpToFold->isReg());
        // FIXME: Probably shouldn't bother trying to fold if not an
        // SGPR. PeepholeOptimizer can eliminate redundant VGPR->VGPR
        // copies.
        MRI->clearKillFlags(Fold.OpToFold->getReg());
      }
      LLVM_DEBUG(dbgs() << "Folded source from " << MI << " into OpNo "
                        << static_cast<int>(Fold.UseOpNo) << " of "
                        << *Fold.UseMI);
    } else if (Fold.Commuted) {
      // Restore the instruction's original operand order if the fold failed.
      TII->commuteInstruction(*Fold.UseMI, false);
    }
  }
  return true;
}

bool SIFoldOperands::tryFoldFoldableCopy(
    MachineInstr &MI, MachineOperand *&CurrentKnownM0Val) const {
  // Specially track simple redefs of m0 to the same value in a block, so we
  // can erase the later ones.
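  //
  // For example (illustrative), the second write below is redundant and can
  // be deleted once the first one has been seen in the same block:
  //   $m0 = S_MOV_B32 %x
  //   ...                      ; no other clobber of $m0
  //   $m0 = S_MOV_B32 %x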
  if (MI.getOperand(0).getReg() == AMDGPU::M0) {
    MachineOperand &NewM0Val = MI.getOperand(1);
    if (CurrentKnownM0Val && CurrentKnownM0Val->isIdenticalTo(NewM0Val)) {
      MI.eraseFromParent();
      return true;
    }

    // We aren't tracking other physical registers.
    CurrentKnownM0Val = (NewM0Val.isReg() && NewM0Val.getReg().isPhysical())
                            ? nullptr
                            : &NewM0Val;
    return false;
  }

  MachineOperand &OpToFold = MI.getOperand(1);
  bool FoldingImm = OpToFold.isImm() || OpToFold.isFI() || OpToFold.isGlobal();

  // FIXME: We could also be folding things like TargetIndexes.
  if (!FoldingImm && !OpToFold.isReg())
    return false;

  if (OpToFold.isReg() && !OpToFold.getReg().isVirtual())
    return false;

  // Prevent folding operands backwards in the function. For example,
  // the COPY opcode must not be replaced by 1 in this example:
  //
  //    %3 = COPY %vgpr0; VGPR_32:%3
  //    ...
  //    %vgpr0 = V_MOV_B32_e32 1, implicit %exec
  if (!MI.getOperand(0).getReg().isVirtual())
    return false;

  bool Changed = foldInstOperand(MI, OpToFold);

  // If we managed to fold all uses of this copy then we might as well
  // delete it now.
  // The only reason we need to follow chains of copies here is that
  // tryFoldRegSequence looks forward through copies before folding a
  // REG_SEQUENCE into its eventual users.
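  //
  // A small illustrative sketch: after tryFoldRegSequence has rewritten the
  // eventual user, the whole chain below becomes dead and is erased here:
  //   %0:vreg_64 = REG_SEQUENCE ...
  //   %1:vreg_64 = COPY %0
  //   %2:vreg_64 = COPY %1          ; this is MI, now with no remaining uses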
  auto *InstToErase = &MI;
  while (MRI->use_nodbg_empty(InstToErase->getOperand(0).getReg())) {
    auto &SrcOp = InstToErase->getOperand(1);
    auto SrcReg = SrcOp.isReg() ? SrcOp.getReg() : Register();
    InstToErase->eraseFromParent();
    Changed = true;
    InstToErase = nullptr;
    if (!SrcReg || SrcReg.isPhysical())
      break;
    InstToErase = MRI->getVRegDef(SrcReg);
    if (!InstToErase || !TII->isFoldableCopy(*InstToErase))
      break;
  }

  if (InstToErase && InstToErase->isRegSequence() &&
      MRI->use_nodbg_empty(InstToErase->getOperand(0).getReg())) {
    InstToErase->eraseFromParent();
    Changed = true;
  }

  return Changed;
}

// Clamp patterns are canonically selected to v_max_* instructions, so only
// handle them.
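//
// A minimal MIR sketch (illustrative) of the clamp fold performed by
// tryFoldClamp, with VOP3 operands shown positionally:
//   %1:vgpr_32 = V_ADD_F32_e64 0, %a, 0, %b, 0, 0
//   %2:vgpr_32 = V_MAX_F32_e64 0, %1, 0, %1, /*clamp*/ 1, 0
// =>
//   %1:vgpr_32 = V_ADD_F32_e64 0, %a, 0, %b, /*clamp*/ 1, 0
//   (uses of %2 are rewritten to %1 and the V_MAX is erased)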
const MachineOperand *SIFoldOperands::isClamp(const MachineInstr &MI) const {
  unsigned Op = MI.getOpcode();
  switch (Op) {
  case AMDGPU::V_MAX_F32_e64:
  case AMDGPU::V_MAX_F16_e64:
  case AMDGPU::V_MAX_F16_t16_e64:
  case AMDGPU::V_MAX_F16_fake16_e64:
  case AMDGPU::V_MAX_F64_e64:
  case AMDGPU::V_MAX_NUM_F64_e64:
  case AMDGPU::V_PK_MAX_F16: {
    if (MI.mayRaiseFPException())
      return nullptr;

    if (!TII->getNamedOperand(MI, AMDGPU::OpName::clamp)->getImm())
      return nullptr;

    // Make sure sources are identical.
    const MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
    const MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
    if (!Src0->isReg() || !Src1->isReg() ||
        Src0->getReg() != Src1->getReg() ||
        Src0->getSubReg() != Src1->getSubReg() ||
        Src0->getSubReg() != AMDGPU::NoSubRegister)
      return nullptr;

    // Can't fold up if we have modifiers.
    if (TII->hasModifiersSet(MI, AMDGPU::OpName::omod))
      return nullptr;

    unsigned Src0Mods =
        TII->getNamedOperand(MI, AMDGPU::OpName::src0_modifiers)->getImm();
    unsigned Src1Mods =
        TII->getNamedOperand(MI, AMDGPU::OpName::src1_modifiers)->getImm();

    // Having a 0 op_sel_hi would require swizzling the output in the source
    // instruction, which we can't do.
    unsigned UnsetMods =
        (Op == AMDGPU::V_PK_MAX_F16) ? SISrcMods::OP_SEL_1 : 0u;
    if (Src0Mods != UnsetMods && Src1Mods != UnsetMods)
      return nullptr;
    return Src0;
  }
  default:
    return nullptr;
  }
}

// FIXME: Clamp for v_mad_mixhi_f16 handled during isel.
bool SIFoldOperands::tryFoldClamp(MachineInstr &MI) {
  const MachineOperand *ClampSrc = isClamp(MI);
  if (!ClampSrc || !MRI->hasOneNonDBGUser(ClampSrc->getReg()))
    return false;

  MachineInstr *Def = MRI->getVRegDef(ClampSrc->getReg());

  // The type of clamp must be compatible.
  if (TII->getClampMask(*Def) != TII->getClampMask(MI))
    return false;

  if (Def->mayRaiseFPException())
    return false;

  MachineOperand *DefClamp = TII->getNamedOperand(*Def, AMDGPU::OpName::clamp);
  if (!DefClamp)
    return false;

  LLVM_DEBUG(dbgs() << "Folding clamp " << *DefClamp << " into " << *Def);

  // Clamp is applied after omod, so it is OK if omod is set.
  DefClamp->setImm(1);

  Register DefReg = Def->getOperand(0).getReg();
  Register MIDstReg = MI.getOperand(0).getReg();
  if (TRI->isSGPRReg(*MRI, DefReg)) {
    // Pseudo scalar instructions have an SGPR for dst and clamp is a v_max*
    // instruction with a VGPR dst.
    BuildMI(*MI.getParent(), MI, MI.getDebugLoc(), TII->get(AMDGPU::COPY),
            MIDstReg)
        .addReg(DefReg);
  } else {
    MRI->replaceRegWith(MIDstReg, DefReg);
  }
  MI.eraseFromParent();

  // Use of output modifiers forces VOP3 encoding for a VOP2 mac/fmac
  // instruction, so we might as well convert it to the more flexible VOP3-only
  // mad/fma form.
  if (TII->convertToThreeAddress(*Def, nullptr, nullptr))
    Def->eraseFromParent();

  return true;
}

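// Map a literal multiplier immediate to the corresponding output-modifier
// encoding; for f32, for example (illustrative):
//   0x3f000000 (0.5) -> SIOutMods::DIV2
//   0x40000000 (2.0) -> SIOutMods::MUL2
//   0x40800000 (4.0) -> SIOutMods::MUL4
// Anything else maps to SIOutMods::NONE and blocks the fold.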
static int getOModValue(unsigned Opc, int64_t Val) {
  switch (Opc) {
  case AMDGPU::V_MUL_F64_e64:
  case AMDGPU::V_MUL_F64_pseudo_e64: {
    switch (Val) {
    case 0x3fe0000000000000: // 0.5
      return SIOutMods::DIV2;
    case 0x4000000000000000: // 2.0
      return SIOutMods::MUL2;
    case 0x4010000000000000: // 4.0
      return SIOutMods::MUL4;
    default:
      return SIOutMods::NONE;
    }
  }
  case AMDGPU::V_MUL_F32_e64: {
    switch (static_cast<uint32_t>(Val)) {
    case 0x3f000000: // 0.5
      return SIOutMods::DIV2;
    case 0x40000000: // 2.0
      return SIOutMods::MUL2;
    case 0x40800000: // 4.0
      return SIOutMods::MUL4;
    default:
      return SIOutMods::NONE;
    }
  }
  case AMDGPU::V_MUL_F16_e64:
  case AMDGPU::V_MUL_F16_t16_e64:
  case AMDGPU::V_MUL_F16_fake16_e64: {
    switch (static_cast<uint16_t>(Val)) {
    case 0x3800: // 0.5
      return SIOutMods::DIV2;
    case 0x4000: // 2.0
      return SIOutMods::MUL2;
    case 0x4400: // 4.0
      return SIOutMods::MUL4;
    default:
      return SIOutMods::NONE;
    }
  }
  default:
    llvm_unreachable("invalid mul opcode");
  }
}

// FIXME: Does this really not support denormals with f16?
// FIXME: Does this need to check IEEE mode bit? SNaNs are generally not
// handled, so will anything other than that break?
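//
// A minimal illustrative sketch of the omod fold (MIR, modifiers elided):
//   %1:vgpr_32 = V_ADD_F32_e64 0, %a, 0, %b, 0, 0
//   %2:vgpr_32 = V_MUL_F32_e64 0, 2.0, 0, %1, 0, 0
// =>
//   %1:vgpr_32 = V_ADD_F32_e64 0, %a, 0, %b, 0, /*omod*/ SIOutMods::MUL2
//   (uses of %2 are rewritten to %1 and the V_MUL is erased)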
std::pair<const MachineOperand *, int>
SIFoldOperands::isOMod(const MachineInstr &MI) const {
  unsigned Op = MI.getOpcode();
  switch (Op) {
  case AMDGPU::V_MUL_F64_e64:
  case AMDGPU::V_MUL_F64_pseudo_e64:
  case AMDGPU::V_MUL_F32_e64:
  case AMDGPU::V_MUL_F16_t16_e64:
  case AMDGPU::V_MUL_F16_fake16_e64:
  case AMDGPU::V_MUL_F16_e64: {
    // If output denormals are enabled, omod is ignored.
    if ((Op == AMDGPU::V_MUL_F32_e64 &&
         MFI->getMode().FP32Denormals.Output != DenormalMode::PreserveSign) ||
        ((Op == AMDGPU::V_MUL_F64_e64 || Op == AMDGPU::V_MUL_F64_pseudo_e64 ||
          Op == AMDGPU::V_MUL_F16_e64 || Op == AMDGPU::V_MUL_F16_t16_e64 ||
          Op == AMDGPU::V_MUL_F16_fake16_e64) &&
         MFI->getMode().FP64FP16Denormals.Output !=
             DenormalMode::PreserveSign) ||
        MI.mayRaiseFPException())
      return std::pair(nullptr, SIOutMods::NONE);

    const MachineOperand *RegOp = nullptr;
    const MachineOperand *ImmOp = nullptr;
    const MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
    const MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
    if (Src0->isImm()) {
      ImmOp = Src0;
      RegOp = Src1;
    } else if (Src1->isImm()) {
      ImmOp = Src1;
      RegOp = Src0;
    } else
      return std::pair(nullptr, SIOutMods::NONE);

    int OMod = getOModValue(Op, ImmOp->getImm());
    if (OMod == SIOutMods::NONE ||
        TII->hasModifiersSet(MI, AMDGPU::OpName::src0_modifiers) ||
        TII->hasModifiersSet(MI, AMDGPU::OpName::src1_modifiers) ||
        TII->hasModifiersSet(MI, AMDGPU::OpName::omod) ||
        TII->hasModifiersSet(MI, AMDGPU::OpName::clamp))
      return std::pair(nullptr, SIOutMods::NONE);

    return std::pair(RegOp, OMod);
  }
  case AMDGPU::V_ADD_F64_e64:
  case AMDGPU::V_ADD_F64_pseudo_e64:
  case AMDGPU::V_ADD_F32_e64:
  case AMDGPU::V_ADD_F16_e64:
  case AMDGPU::V_ADD_F16_t16_e64:
  case AMDGPU::V_ADD_F16_fake16_e64: {
    // If output denormals are enabled, omod is ignored.
    if ((Op == AMDGPU::V_ADD_F32_e64 &&
         MFI->getMode().FP32Denormals.Output != DenormalMode::PreserveSign) ||
        ((Op == AMDGPU::V_ADD_F64_e64 || Op == AMDGPU::V_ADD_F64_pseudo_e64 ||
          Op == AMDGPU::V_ADD_F16_e64 || Op == AMDGPU::V_ADD_F16_t16_e64 ||
          Op == AMDGPU::V_ADD_F16_fake16_e64) &&
         MFI->getMode().FP64FP16Denormals.Output != DenormalMode::PreserveSign))
      return std::pair(nullptr, SIOutMods::NONE);

    // Look through the DAGCombiner canonicalization fmul x, 2 -> fadd x, x.
    const MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
    const MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);

    if (Src0->isReg() && Src1->isReg() && Src0->getReg() == Src1->getReg() &&
        Src0->getSubReg() == Src1->getSubReg() &&
        !TII->hasModifiersSet(MI, AMDGPU::OpName::src0_modifiers) &&
        !TII->hasModifiersSet(MI, AMDGPU::OpName::src1_modifiers) &&
        !TII->hasModifiersSet(MI, AMDGPU::OpName::clamp) &&
        !TII->hasModifiersSet(MI, AMDGPU::OpName::omod))
      return std::pair(Src0, SIOutMods::MUL2);

    return std::pair(nullptr, SIOutMods::NONE);
  }
  default:
    return std::pair(nullptr, SIOutMods::NONE);
  }
}

// FIXME: Does this need to check IEEE bit on function?
bool SIFoldOperands::tryFoldOMod(MachineInstr &MI) {
  const MachineOperand *RegOp;
  int OMod;
  std::tie(RegOp, OMod) = isOMod(MI);
  if (OMod == SIOutMods::NONE || !RegOp->isReg() ||
      RegOp->getSubReg() != AMDGPU::NoSubRegister ||
      !MRI->hasOneNonDBGUser(RegOp->getReg()))
    return false;

  MachineInstr *Def = MRI->getVRegDef(RegOp->getReg());
  MachineOperand *DefOMod = TII->getNamedOperand(*Def, AMDGPU::OpName::omod);
  if (!DefOMod || DefOMod->getImm() != SIOutMods::NONE)
    return false;

  if (Def->mayRaiseFPException())
    return false;

  // Clamp is applied after omod. If the source already has clamp set, don't
  // fold it.
  if (TII->hasModifiersSet(*Def, AMDGPU::OpName::clamp))
    return false;

  LLVM_DEBUG(dbgs() << "Folding omod " << MI << " into " << *Def);

  DefOMod->setImm(OMod);
  MRI->replaceRegWith(MI.getOperand(0).getReg(), Def->getOperand(0).getReg());
  MI.eraseFromParent();

  // Use of output modifiers forces VOP3 encoding for a VOP2 mac/fmac
  // instruction, so we might as well convert it to the more flexible VOP3-only
  // mad/fma form.
  if (TII->convertToThreeAddress(*Def, nullptr, nullptr))
    Def->eraseFromParent();

  return true;
}

// Try to fold a reg_sequence with vgpr output and agpr inputs into an
// instruction which can take an agpr. So far that means a store.
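//
// A minimal illustrative sketch (MIR, operand details elided):
//   %0:vgpr_32 = COPY %a:agpr_32
//   %1:vgpr_32 = COPY %b:agpr_32
//   %2:vreg_64 = REG_SEQUENCE %0, %subreg.sub0, %1, %subreg.sub1
//   GLOBAL_STORE_DWORDX2 %ptr, %2, ...
// =>
//   %3:areg_64 = REG_SEQUENCE %a, %subreg.sub0, %b, %subreg.sub1
//   GLOBAL_STORE_DWORDX2 %ptr, %3, ...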
bool SIFoldOperands::tryFoldRegSequence(MachineInstr &MI) {
  assert(MI.isRegSequence());
  auto Reg = MI.getOperand(0).getReg();

  if (!ST->hasGFX90AInsts() || !TRI->isVGPR(*MRI, Reg) ||
      !MRI->hasOneNonDBGUse(Reg))
    return false;

  SmallVector<std::pair<MachineOperand*, unsigned>, 32> Defs;
  if (!getRegSeqInit(Defs, Reg, MCOI::OPERAND_REGISTER))
    return false;

  for (auto &[Op, SubIdx] : Defs) {
    if (!Op->isReg())
      return false;
    if (TRI->isAGPR(*MRI, Op->getReg()))
      continue;
    // Maybe this is a COPY from AREG.
    const MachineInstr *SubDef = MRI->getVRegDef(Op->getReg());
    if (!SubDef || !SubDef->isCopy() || SubDef->getOperand(1).getSubReg())
      return false;
    if (!TRI->isAGPR(*MRI, SubDef->getOperand(1).getReg()))
      return false;
  }

  MachineOperand *Op = &*MRI->use_nodbg_begin(Reg);
  MachineInstr *UseMI = Op->getParent();
  while (UseMI->isCopy() && !Op->getSubReg()) {
    Reg = UseMI->getOperand(0).getReg();
    if (!TRI->isVGPR(*MRI, Reg) || !MRI->hasOneNonDBGUse(Reg))
      return false;
    Op = &*MRI->use_nodbg_begin(Reg);
    UseMI = Op->getParent();
  }

  if (Op->getSubReg())
    return false;

  unsigned OpIdx = Op - &UseMI->getOperand(0);
  const MCInstrDesc &InstDesc = UseMI->getDesc();
  const TargetRegisterClass *OpRC =
      TII->getRegClass(InstDesc, OpIdx, TRI, *MI.getMF());
  if (!OpRC || !TRI->isVectorSuperClass(OpRC))
    return false;

  const auto *NewDstRC = TRI->getEquivalentAGPRClass(MRI->getRegClass(Reg));
  auto Dst = MRI->createVirtualRegister(NewDstRC);
  auto RS = BuildMI(*MI.getParent(), MI, MI.getDebugLoc(),
                    TII->get(AMDGPU::REG_SEQUENCE), Dst);

  for (auto &[Def, SubIdx] : Defs) {
    Def->setIsKill(false);
    if (TRI->isAGPR(*MRI, Def->getReg())) {
      RS.add(*Def);
    } else { // This is a copy.
      MachineInstr *SubDef = MRI->getVRegDef(Def->getReg());
      SubDef->getOperand(1).setIsKill(false);
      RS.addReg(SubDef->getOperand(1).getReg(), 0, Def->getSubReg());
    }
    RS.addImm(SubIdx);
  }

  Op->setReg(Dst);
  if (!TII->isOperandLegal(*UseMI, OpIdx, Op)) {
    Op->setReg(Reg);
    RS->eraseFromParent();
    return false;
  }

  LLVM_DEBUG(dbgs() << "Folded " << *RS << " into " << *UseMI);

  // Erase the REG_SEQUENCE eagerly, unless we followed a chain of COPY users,
  // in which case we can erase them all later in runOnMachineFunction.
  if (MRI->use_nodbg_empty(MI.getOperand(0).getReg()))
    MI.eraseFromParent();
  return true;
}

/// Checks whether \p Copy is an AGPR -> VGPR copy. Returns `true` on success
/// and stores the AGPR register in \p OutReg and the subreg in \p OutSubReg.
static bool isAGPRCopy(const SIRegisterInfo &TRI,
                       const MachineRegisterInfo &MRI, const MachineInstr &Copy,
                       Register &OutReg, unsigned &OutSubReg) {
  assert(Copy.isCopy());

  const MachineOperand &CopySrc = Copy.getOperand(1);
  Register CopySrcReg = CopySrc.getReg();
  if (!CopySrcReg.isVirtual())
    return false;

  // Common case: copy from AGPR directly, e.g.
  //  %1:vgpr_32 = COPY %0:agpr_32
  if (TRI.isAGPR(MRI, CopySrcReg)) {
    OutReg = CopySrcReg;
    OutSubReg = CopySrc.getSubReg();
    return true;
  }

  // Sometimes it can also involve two copies, e.g.
  //  %1:vgpr_256 = COPY %0:agpr_256
  //  %2:vgpr_32 = COPY %1:vgpr_256.sub0
  const MachineInstr *CopySrcDef = MRI.getVRegDef(CopySrcReg);
  if (!CopySrcDef || !CopySrcDef->isCopy())
    return false;

  const MachineOperand &OtherCopySrc = CopySrcDef->getOperand(1);
  Register OtherCopySrcReg = OtherCopySrc.getReg();
  if (!OtherCopySrcReg.isVirtual() ||
      CopySrcDef->getOperand(0).getSubReg() != AMDGPU::NoSubRegister ||
      OtherCopySrc.getSubReg() != AMDGPU::NoSubRegister ||
      !TRI.isAGPR(MRI, OtherCopySrcReg))
    return false;

  OutReg = OtherCopySrcReg;
  OutSubReg = CopySrc.getSubReg();
  return true;
}

// Try to hoist an AGPR to VGPR copy across a PHI.
// This should allow folding of an AGPR into a consumer which may support it.
//
// Example 1: LCSSA PHI
//   loop:
//     %1:vreg = COPY %0:areg
//   exit:
//     %2:vreg = PHI %1:vreg, %loop
// =>
//   loop:
//   exit:
//     %1:areg = PHI %0:areg, %loop
//     %2:vreg = COPY %1:areg
//
// Example 2: PHI with multiple incoming values:
//   entry:
//     %1:vreg = GLOBAL_LOAD(..)
//   loop:
//     %2:vreg = PHI %1:vreg, %entry, %5:vreg, %loop
//     %3:areg = COPY %2:vreg
//     %4:areg = (instr using %3:areg)
//     %5:vreg = COPY %4:areg
// =>
//   entry:
//     %1:vreg = GLOBAL_LOAD(..)
//     %2:areg = COPY %1:vreg
//   loop:
//     %3:areg = PHI %2:areg, %entry, %X:areg, %loop
//     %4:areg = (instr using %3:areg)
bool SIFoldOperands::tryFoldPhiAGPR(MachineInstr &PHI) {
  assert(PHI.isPHI());

  Register PhiOut = PHI.getOperand(0).getReg();
  if (!TRI->isVGPR(*MRI, PhiOut))
    return false;

  // Iterate once over all incoming values of the PHI to check if this PHI is
  // eligible, and determine the exact AGPR RC we'll target.
  const TargetRegisterClass *ARC = nullptr;
  for (unsigned K = 1; K < PHI.getNumExplicitOperands(); K += 2) {
    MachineOperand &MO = PHI.getOperand(K);
    MachineInstr *Copy = MRI->getVRegDef(MO.getReg());
    if (!Copy || !Copy->isCopy())
      continue;

    Register AGPRSrc;
    unsigned AGPRRegMask = AMDGPU::NoSubRegister;
    if (!isAGPRCopy(*TRI, *MRI, *Copy, AGPRSrc, AGPRRegMask))
      continue;

    const TargetRegisterClass *CopyInRC = MRI->getRegClass(AGPRSrc);
    if (const auto *SubRC = TRI->getSubRegisterClass(CopyInRC, AGPRRegMask))
      CopyInRC = SubRC;

    if (ARC && !ARC->hasSubClassEq(CopyInRC))
      return false;
    ARC = CopyInRC;
  }

  if (!ARC)
    return false;

  bool IsAGPR32 = (ARC == &AMDGPU::AGPR_32RegClass);

  // Rewrite the PHI's incoming values to ARC.
  LLVM_DEBUG(dbgs() << "Folding AGPR copies into: " << PHI);
  for (unsigned K = 1; K < PHI.getNumExplicitOperands(); K += 2) {
    MachineOperand &MO = PHI.getOperand(K);
    Register Reg = MO.getReg();

    MachineBasicBlock::iterator InsertPt;
    MachineBasicBlock *InsertMBB = nullptr;

    // Look at the def of Reg, ignoring all copies.
    unsigned CopyOpc = AMDGPU::COPY;
    if (MachineInstr *Def = MRI->getVRegDef(Reg)) {

      // Look at pre-existing COPY instructions from ARC: Steal the operand. If
      // the copy was single-use, it will be removed by DCE later.
      if (Def->isCopy()) {
        Register AGPRSrc;
        unsigned AGPRSubReg = AMDGPU::NoSubRegister;
        if (isAGPRCopy(*TRI, *MRI, *Def, AGPRSrc, AGPRSubReg)) {
          MO.setReg(AGPRSrc);
          MO.setSubReg(AGPRSubReg);
          continue;
        }

        // If this is a multi-use SGPR -> VGPR copy, use V_ACCVGPR_WRITE on
        // GFX908 directly instead of a COPY. Otherwise, SIFoldOperand may try
        // to fold the sgpr -> vgpr -> agpr copy into a sgpr -> agpr copy which
        // is unlikely to be profitable.
        //
        // Note that V_ACCVGPR_WRITE is only used for AGPR_32.
        MachineOperand &CopyIn = Def->getOperand(1);
        if (IsAGPR32 && !ST->hasGFX90AInsts() && !MRI->hasOneNonDBGUse(Reg) &&
            TRI->isSGPRReg(*MRI, CopyIn.getReg()))
          CopyOpc = AMDGPU::V_ACCVGPR_WRITE_B32_e64;
      }

      InsertMBB = Def->getParent();
      InsertPt = InsertMBB->SkipPHIsLabelsAndDebug(++Def->getIterator());
    } else {
      InsertMBB = PHI.getOperand(MO.getOperandNo() + 1).getMBB();
      InsertPt = InsertMBB->getFirstTerminator();
    }

    Register NewReg = MRI->createVirtualRegister(ARC);
    MachineInstr *MI = BuildMI(*InsertMBB, InsertPt, PHI.getDebugLoc(),
                               TII->get(CopyOpc), NewReg)
                           .addReg(Reg);
    MO.setReg(NewReg);

    (void)MI;
    LLVM_DEBUG(dbgs() << "  Created COPY: " << *MI);
  }

  // Replace the PHI's result with a new register.
  Register NewReg = MRI->createVirtualRegister(ARC);
  PHI.getOperand(0).setReg(NewReg);

  // COPY that new register back to the original PhiOut register. This COPY
  // will usually be folded out later.
  MachineBasicBlock *MBB = PHI.getParent();
  BuildMI(*MBB, MBB->getFirstNonPHI(), PHI.getDebugLoc(),
          TII->get(AMDGPU::COPY), PhiOut)
      .addReg(NewReg);

  LLVM_DEBUG(dbgs() << "  Done: Folded " << PHI);
  return true;
}

// Attempt to convert a VGPR load into an AGPR load.
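//
// A minimal illustrative sketch (MIR, gfx90a+):
//   %0:vgpr_32 = GLOBAL_LOAD_DWORD %ptr, ...
//   %1:agpr_32 = COPY %0
// =>
//   %0:agpr_32 = GLOBAL_LOAD_DWORD %ptr, ...
//   %1:agpr_32 = COPY %0
// The register class of the load result is simply retyped to AGPR; the
// remaining AGPR-AGPR copy is expected to be cleaned up by later passes.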
bool SIFoldOperands::tryFoldLoad(MachineInstr &MI) {
  assert(MI.mayLoad());
  if (!ST->hasGFX90AInsts() || MI.getNumExplicitDefs() != 1)
    return false;

  MachineOperand &Def = MI.getOperand(0);
  if (!Def.isDef())
    return false;

  Register DefReg = Def.getReg();

  if (DefReg.isPhysical() || !TRI->isVGPR(*MRI, DefReg))
    return false;

  SmallVector<const MachineInstr*, 8> Users;
  SmallVector<Register, 8> MoveRegs;
  for (const MachineInstr &I : MRI->use_nodbg_instructions(DefReg))
    Users.push_back(&I);

  if (Users.empty())
    return false;

  // Check that all uses are copies to an agpr or a reg_sequence producing an
  // agpr.
  while (!Users.empty()) {
    const MachineInstr *I = Users.pop_back_val();
    if (!I->isCopy() && !I->isRegSequence())
      return false;
    Register DstReg = I->getOperand(0).getReg();
    // Physical registers may have more than one defining instruction.
    if (DstReg.isPhysical())
      return false;
    if (TRI->isAGPR(*MRI, DstReg))
      continue;
    MoveRegs.push_back(DstReg);
    for (const MachineInstr &U : MRI->use_nodbg_instructions(DstReg))
      Users.push_back(&U);
  }

  const TargetRegisterClass *RC = MRI->getRegClass(DefReg);
  MRI->setRegClass(DefReg, TRI->getEquivalentAGPRClass(RC));
  if (!TII->isOperandLegal(MI, 0, &Def)) {
    MRI->setRegClass(DefReg, RC);
    return false;
  }

  while (!MoveRegs.empty()) {
    Register Reg = MoveRegs.pop_back_val();
    MRI->setRegClass(Reg, TRI->getEquivalentAGPRClass(MRI->getRegClass(Reg)));
  }

  LLVM_DEBUG(dbgs() << "Folded " << MI);

  return true;
}

// tryFoldPhiAGPR will aggressively try to create AGPR PHIs.
// For GFX90A and later, this is pretty much always a good thing, but for
// GFX908 there are cases where it can create a lot more AGPR-AGPR copies,
// which are expensive on this architecture due to the lack of V_ACCVGPR_MOV.
//
// This function looks at all AGPR PHIs in a basic block and collects their
// operands. Then, it checks for registers that are used more than once across
// all PHIs and caches them in a VGPR. This prevents ExpandPostRAPseudo from
// having to create one VGPR temporary per use, which can get very messy if
// these PHIs come from a broken-up large PHI (e.g. 32 AGPR phis, one per
// vector element).
//
// Example
//  a:
//    %in:agpr_256 = COPY %foo:vgpr_256
//  c:
//    %x:agpr_32 = ..
//  b:
//    %0:areg = PHI %in.sub0:agpr_32, %a, %x, %c
//    %1:areg = PHI %in.sub0:agpr_32, %a, %y, %c
//    %2:areg = PHI %in.sub0:agpr_32, %a, %z, %c
// =>
//  a:
//    %in:agpr_256 = COPY %foo:vgpr_256
//    %tmp:vgpr_32 = V_ACCVGPR_READ_B32_e64 %in.sub0:agpr_32
//    %tmp_agpr:agpr_32 = COPY %tmp
//  c:
//    %x:agpr_32 = ..
//  b:
//    %0:areg = PHI %tmp_agpr, %a, %x, %c
//    %1:areg = PHI %tmp_agpr, %a, %y, %c
//    %2:areg = PHI %tmp_agpr, %a, %z, %c
bool SIFoldOperands::tryOptimizeAGPRPhis(MachineBasicBlock &MBB) {
  // This is only really needed on GFX908 where AGPR-AGPR copies are
  // unreasonably difficult.
  if (ST->hasGFX90AInsts())
    return false;

  // Look at all AGPR Phis and collect the register + subregister used.
  DenseMap<std::pair<Register, unsigned>, std::vector<MachineOperand *>>
      RegToMO;

  for (auto &MI : MBB) {
    if (!MI.isPHI())
      break;

    if (!TRI->isAGPR(*MRI, MI.getOperand(0).getReg()))
      continue;

    for (unsigned K = 1; K < MI.getNumOperands(); K += 2) {
      MachineOperand &PhiMO = MI.getOperand(K);
      if (!PhiMO.getSubReg())
        continue;
      RegToMO[{PhiMO.getReg(), PhiMO.getSubReg()}].push_back(&PhiMO);
    }
  }

  // For all (Reg, SubReg) pairs that are used more than once, cache the value
  // in a VGPR.
  bool Changed = false;
  for (const auto &[Entry, MOs] : RegToMO) {
    if (MOs.size() == 1)
      continue;

    const auto [Reg, SubReg] = Entry;
    MachineInstr *Def = MRI->getVRegDef(Reg);
    MachineBasicBlock *DefMBB = Def->getParent();

    // Create a copy in a VGPR using V_ACCVGPR_READ_B32_e64 so it's not folded
    // out.
    const TargetRegisterClass *ARC = getRegOpRC(*MRI, *TRI, *MOs.front());
    Register TempVGPR =
        MRI->createVirtualRegister(TRI->getEquivalentVGPRClass(ARC));
    MachineInstr *VGPRCopy =
        BuildMI(*DefMBB, ++Def->getIterator(), Def->getDebugLoc(),
                TII->get(AMDGPU::V_ACCVGPR_READ_B32_e64), TempVGPR)
            .addReg(Reg, /* flags */ 0, SubReg);

    // Copy back to an AGPR and use that instead of the AGPR subreg in all MOs.
    Register TempAGPR = MRI->createVirtualRegister(ARC);
    BuildMI(*DefMBB, ++VGPRCopy->getIterator(), Def->getDebugLoc(),
            TII->get(AMDGPU::COPY), TempAGPR)
        .addReg(TempVGPR);

    LLVM_DEBUG(dbgs() << "Caching AGPR into VGPR: " << *VGPRCopy);
    for (MachineOperand *MO : MOs) {
      MO->setReg(TempAGPR);
      MO->setSubReg(AMDGPU::NoSubRegister);
      LLVM_DEBUG(dbgs() << "  Changed PHI Operand: " << *MO << "\n");
    }

    Changed = true;
  }

  return Changed;
}

bool SIFoldOperands::runOnMachineFunction(MachineFunction &MF) {
  if (skipFunction(MF.getFunction()))
    return false;

  MRI = &MF.getRegInfo();
  ST = &MF.getSubtarget<GCNSubtarget>();
  TII = ST->getInstrInfo();
  TRI = &TII->getRegisterInfo();
  MFI = MF.getInfo<SIMachineFunctionInfo>();

  // omod is ignored by hardware if IEEE bit is enabled. omod also does not
  // correctly handle signed zeros.
  //
  // FIXME: Also need to check strictfp
  bool IsIEEEMode = MFI->getMode().IEEE;
  bool HasNSZ = MFI->hasNoSignedZerosFPMath();

  bool Changed = false;
  for (MachineBasicBlock *MBB : depth_first(&MF)) {
    MachineOperand *CurrentKnownM0Val = nullptr;
    for (auto &MI : make_early_inc_range(*MBB)) {
      Changed |= tryFoldCndMask(MI);

      if (tryFoldZeroHighBits(MI)) {
        Changed = true;
        continue;
      }

      if (MI.isRegSequence() && tryFoldRegSequence(MI)) {
        Changed = true;
        continue;
      }

      if (MI.isPHI() && tryFoldPhiAGPR(MI)) {
        Changed = true;
        continue;
      }

      if (MI.mayLoad() && tryFoldLoad(MI)) {
        Changed = true;
        continue;
      }

      if (TII->isFoldableCopy(MI)) {
        Changed |= tryFoldFoldableCopy(MI, CurrentKnownM0Val);
        continue;
      }

      // Saw an unknown clobber of m0, so we no longer know what it is.
      if (CurrentKnownM0Val && MI.modifiesRegister(AMDGPU::M0, TRI))
        CurrentKnownM0Val = nullptr;

      // TODO: Omod might be OK if there is NSZ only on the source
      // instruction, and not the omod multiply.
      if (IsIEEEMode || (!HasNSZ && !MI.getFlag(MachineInstr::FmNsz)) ||
          !tryFoldOMod(MI))
        Changed |= tryFoldClamp(MI);
    }

    Changed |= tryOptimizeAGPRPhis(*MBB);
  }

  return Changed;
}