//===- GCNDPPCombine.cpp - optimization for DPP instructions -------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
// The pass combines V_MOV_B32_dpp instruction with its VALU uses as a DPP src0
// operand. If any of the use instructions cannot be combined with the mov, the
// whole sequence is reverted.
//
// $old = ...
// $dpp_value = V_MOV_B32_dpp $old, $vgpr_to_be_read_from_other_lane,
//                            dpp_controls..., $row_mask, $bank_mask, $bound_ctrl
// $res = VALU $dpp_value [, src1]
//
// to
//
// $res = VALU_DPP $combined_old, $vgpr_to_be_read_from_other_lane, [src1,]
//                 dpp_controls..., $row_mask, $bank_mask, $combined_bound_ctrl
//
// Combining rules:
//
// if $row_mask and $bank_mask are fully enabled (0xF) and
//    $bound_ctrl==DPP_BOUND_ZERO or $old==0
// -> $combined_old = undef,
//    $combined_bound_ctrl = DPP_BOUND_ZERO
//
// if the VALU op is binary and
//    $bound_ctrl==DPP_BOUND_OFF and
//    $old==identity value (immediate) for the VALU op
// -> $combined_old = src1,
//    $combined_bound_ctrl = DPP_BOUND_OFF
//
// Otherwise cancel.
//
// The mov_dpp instruction should reside in the same BB as all its uses.
//===----------------------------------------------------------------------===//

#include "GCNDPPCombine.h"
#include "AMDGPU.h"
#include "GCNSubtarget.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include <limits>

using namespace llvm;

#define DEBUG_TYPE "gcn-dpp-combine"

STATISTIC(NumDPPMovsCombined, "Number of DPP moves combined.");

namespace {

class GCNDPPCombine {
  MachineRegisterInfo *MRI;
  const SIInstrInfo *TII;
  const GCNSubtarget *ST;

  using RegSubRegPair = TargetInstrInfo::RegSubRegPair;

  MachineOperand *getOldOpndValue(MachineOperand &OldOpnd) const;

  MachineInstr *createDPPInst(MachineInstr &OrigMI, MachineInstr &MovMI,
                              RegSubRegPair CombOldVGPR,
                              MachineOperand *OldOpnd, bool CombBCZ,
                              bool IsShrinkable) const;

  MachineInstr *createDPPInst(MachineInstr &OrigMI, MachineInstr &MovMI,
                              RegSubRegPair CombOldVGPR, bool CombBCZ,
                              bool IsShrinkable) const;

  bool hasNoImmOrEqual(MachineInstr &MI, AMDGPU::OpName OpndName,
                       int64_t Value, int64_t Mask = -1) const;

  bool combineDPPMov(MachineInstr &MI) const;

  int getDPPOp(unsigned Op, bool IsShrinkable) const;
  bool isShrinkable(MachineInstr &MI) const;

public:
  bool run(MachineFunction &MF);
};

class GCNDPPCombineLegacy : public MachineFunctionPass {
public:
  static char ID;

  GCNDPPCombineLegacy() : MachineFunctionPass(ID) {}

  bool runOnMachineFunction(MachineFunction &MF) override;

  StringRef getPassName() const override { return "GCN DPP Combine"; }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.setPreservesCFG();
    MachineFunctionPass::getAnalysisUsage(AU);
  }

  MachineFunctionProperties getRequiredProperties() const override {
    return MachineFunctionProperties().setIsSSA();
  }
};

} // end anonymous namespace

INITIALIZE_PASS(GCNDPPCombineLegacy, DEBUG_TYPE, "GCN DPP Combine", false,
                false)

char GCNDPPCombineLegacy::ID = 0;

char &llvm::GCNDPPCombineLegacyID = GCNDPPCombineLegacy::ID;

FunctionPass *llvm::createGCNDPPCombinePass() {
  return new GCNDPPCombineLegacy();
}

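// Returns true if MI is a VOP3 instruction that can be shrunk to its e32 form
// for DPP purposes: it has a 32-bit encoding, is not a True16 instruction,
// has no users of its carry-out (sdst), and uses no modifiers beyond abs/neg.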
bool GCNDPPCombine::isShrinkable(MachineInstr &MI) const {
  unsigned Op = MI.getOpcode();
  if (!TII->isVOP3(Op)) {
    return false;
  }
  if (!TII->hasVALU32BitEncoding(Op)) {
    LLVM_DEBUG(dbgs() << "  Inst hasn't e32 equivalent\n");
    return false;
  }
  // Do not shrink True16 instructions pre-RA to avoid restricting register
  // allocation to only the first 128 VGPRs.
  if (AMDGPU::isTrue16Inst(Op))
    return false;
  if (const auto *SDst = TII->getNamedOperand(MI, AMDGPU::OpName::sdst)) {
    // Give up if there are any uses of the sdst in carry-out or VOPC.
    // The shrunken form of the instruction would write it to vcc instead of to
    // a virtual register. If we rewrote the uses the shrinking would be
    // possible.
    if (!MRI->use_nodbg_empty(SDst->getReg()))
      return false;
  }
  // Check whether modifiers other than abs|neg are set (opsel, for example).
  const int64_t Mask = ~(SISrcMods::ABS | SISrcMods::NEG);
  if (!hasNoImmOrEqual(MI, AMDGPU::OpName::src0_modifiers, 0, Mask) ||
      !hasNoImmOrEqual(MI, AMDGPU::OpName::src1_modifiers, 0, Mask) ||
      !hasNoImmOrEqual(MI, AMDGPU::OpName::clamp, 0) ||
      !hasNoImmOrEqual(MI, AMDGPU::OpName::omod, 0) ||
      !hasNoImmOrEqual(MI, AMDGPU::OpName::byte_sel, 0)) {
    LLVM_DEBUG(dbgs() << "  Inst has non-default modifiers\n");
    return false;
  }
  return true;
}

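// Returns the DPP pseudo opcode to use for Op: first the 32-bit DPP form
// (shrinking a VOP3 to its e32 equivalent when IsShrinkable), then, on
// subtargets with VOP3 DPP, the 64-bit DPP form. Returns -1 if neither maps
// to a real MC opcode.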
int GCNDPPCombine::getDPPOp(unsigned Op, bool IsShrinkable) const {
  int DPP32 = AMDGPU::getDPPOp32(Op);
  if (IsShrinkable) {
    assert(DPP32 == -1);
    int E32 = AMDGPU::getVOPe32(Op);
    DPP32 = (E32 == -1) ? -1 : AMDGPU::getDPPOp32(E32);
  }
  if (DPP32 != -1 && TII->pseudoToMCOpcode(DPP32) != -1)
    return DPP32;
  int DPP64 = -1;
  if (ST->hasVOP3DPP())
    DPP64 = AMDGPU::getDPPOp64(Op);
  if (DPP64 != -1 && TII->pseudoToMCOpcode(DPP64) != -1)
    return DPP64;
  return -1;
}

// Walks to the definition of the register operand and returns:
// 1. the immediate operand used to initialize the register, if found,
// 2. nullptr if the register operand is undef,
// 3. the operand itself otherwise.
MachineOperand *GCNDPPCombine::getOldOpndValue(MachineOperand &OldOpnd) const {
  auto *Def = getVRegSubRegDef(getRegSubRegPair(OldOpnd), *MRI);
  if (!Def)
    return nullptr;

  switch (Def->getOpcode()) {
  default: break;
  case AMDGPU::IMPLICIT_DEF:
    return nullptr;
  case AMDGPU::COPY:
  case AMDGPU::V_MOV_B32_e32:
  case AMDGPU::V_MOV_B64_PSEUDO:
  case AMDGPU::V_MOV_B64_e32:
  case AMDGPU::V_MOV_B64_e64: {
    auto &Op1 = Def->getOperand(1);
    if (Op1.isImm())
      return &Op1;
    break;
  }
  }
  return &OldOpnd;
}

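// Returns the size in bits of the register class of operand Idx of MI, or 0
// if the operand has no fixed register class. Used only in asserts below.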
[[maybe_unused]] static unsigned getOperandSize(MachineInstr &MI, unsigned Idx,
                                                MachineRegisterInfo &MRI) {
  int16_t RegClass = MI.getDesc().operands()[Idx].RegClass;
  if (RegClass == -1)
    return 0;

  const TargetRegisterInfo *TRI = MRI.getTargetRegisterInfo();
  return TRI->getRegSizeInBits(*TRI->getRegClass(RegClass));
}

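// Builds the DPP form of OrigMI, taking the DPP controls (dpp_ctrl, row_mask,
// bank_mask, bound_ctrl) from MovMI and the old operand from CombOldVGPR.
// Operands are appended in the order the DPP opcode expects; on any failure
// the partially built instruction is erased and nullptr is returned.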
MachineInstr *GCNDPPCombine::createDPPInst(MachineInstr &OrigMI,
                                           MachineInstr &MovMI,
                                           RegSubRegPair CombOldVGPR,
                                           bool CombBCZ,
                                           bool IsShrinkable) const {
  assert(MovMI.getOpcode() == AMDGPU::V_MOV_B32_dpp ||
         MovMI.getOpcode() == AMDGPU::V_MOV_B64_dpp ||
         MovMI.getOpcode() == AMDGPU::V_MOV_B64_DPP_PSEUDO);

  bool HasVOP3DPP = ST->hasVOP3DPP();
  auto OrigOp = OrigMI.getOpcode();
  if (ST->useRealTrue16Insts() && AMDGPU::isTrue16Inst(OrigOp)) {
    LLVM_DEBUG(
        dbgs() << "  failed: Did not expect any 16-bit uses of dpp values\n");
    return nullptr;
  }
  auto DPPOp = getDPPOp(OrigOp, IsShrinkable);
  if (DPPOp == -1) {
    LLVM_DEBUG(dbgs() << "  failed: no DPP opcode\n");
    return nullptr;
  }
  int OrigOpE32 = AMDGPU::getVOPe32(OrigOp);
  // The checks in combineDPPMov happen to guarantee a full mask for VOPC, but
  // only incidentally, so assert it here explicitly.
  auto *RowMaskOpnd = TII->getNamedOperand(MovMI, AMDGPU::OpName::row_mask);
  assert(RowMaskOpnd && RowMaskOpnd->isImm());
  auto *BankMaskOpnd = TII->getNamedOperand(MovMI, AMDGPU::OpName::bank_mask);
  assert(BankMaskOpnd && BankMaskOpnd->isImm());
  const bool MaskAllLanes =
      RowMaskOpnd->getImm() == 0xF && BankMaskOpnd->getImm() == 0xF;
  (void)MaskAllLanes;
  assert((MaskAllLanes ||
          !(TII->isVOPC(DPPOp) || (TII->isVOP3(DPPOp) && OrigOpE32 != -1 &&
                                   TII->isVOPC(OrigOpE32)))) &&
         "VOPC cannot form DPP unless mask is full");

  auto DPPInst = BuildMI(*OrigMI.getParent(), OrigMI,
                         OrigMI.getDebugLoc(), TII->get(DPPOp))
                     .setMIFlags(OrigMI.getFlags());

  bool Fail = false;
  do {
    int NumOperands = 0;
    if (auto *Dst = TII->getNamedOperand(OrigMI, AMDGPU::OpName::vdst)) {
      DPPInst.add(*Dst);
      ++NumOperands;
    }
    if (auto *SDst = TII->getNamedOperand(OrigMI, AMDGPU::OpName::sdst)) {
      if (TII->isOperandLegal(*DPPInst.getInstr(), NumOperands, SDst)) {
        DPPInst.add(*SDst);
        ++NumOperands;
      }
      // If we shrunk a 64-bit VOP3B to 32 bits, just ignore the sdst.
    }

    const int OldIdx = AMDGPU::getNamedOperandIdx(DPPOp, AMDGPU::OpName::old);
    if (OldIdx != -1) {
      assert(OldIdx == NumOperands);
      assert(isOfRegClass(
          CombOldVGPR,
          *MRI->getRegClass(
              TII->getNamedOperand(MovMI, AMDGPU::OpName::vdst)->getReg()),
          *MRI));
      auto *Def = getVRegSubRegDef(CombOldVGPR, *MRI);
      DPPInst.addReg(CombOldVGPR.Reg, Def ? 0 : RegState::Undef,
                     CombOldVGPR.SubReg);
      ++NumOperands;
    } else if (TII->isVOPC(DPPOp) || (TII->isVOP3(DPPOp) && OrigOpE32 != -1 &&
                                      TII->isVOPC(OrigOpE32))) {
      // VOPC DPP and VOPC promoted to VOP3 DPP do not have an old operand
      // because they write to SGPRs not VGPRs.
    } else {
      // TODO: this discards MAC/FMA instructions for now, let's add it later
      LLVM_DEBUG(dbgs() << "  failed: no old operand in DPP instruction,"
                           " TBD\n");
      Fail = true;
      break;
    }

    auto *Mod0 = TII->getNamedOperand(OrigMI, AMDGPU::OpName::src0_modifiers);
    if (Mod0) {
      assert(NumOperands ==
             AMDGPU::getNamedOperandIdx(DPPOp, AMDGPU::OpName::src0_modifiers));
      assert(HasVOP3DPP ||
             (0LL == (Mod0->getImm() & ~(SISrcMods::ABS | SISrcMods::NEG))));
      DPPInst.addImm(Mod0->getImm());
      ++NumOperands;
    } else if (AMDGPU::hasNamedOperand(DPPOp, AMDGPU::OpName::src0_modifiers)) {
      DPPInst.addImm(0);
      ++NumOperands;
    }
    auto *Src0 = TII->getNamedOperand(MovMI, AMDGPU::OpName::src0);
    assert(Src0);
    int Src0Idx = NumOperands;
    if (!TII->isOperandLegal(*DPPInst.getInstr(), NumOperands, Src0)) {
      LLVM_DEBUG(dbgs() << "  failed: src0 is illegal\n");
      Fail = true;
      break;
    }
    DPPInst.add(*Src0);
    DPPInst->getOperand(NumOperands).setIsKill(false);
    ++NumOperands;

    auto *Mod1 = TII->getNamedOperand(OrigMI, AMDGPU::OpName::src1_modifiers);
    if (Mod1) {
      assert(NumOperands ==
             AMDGPU::getNamedOperandIdx(DPPOp, AMDGPU::OpName::src1_modifiers));
      assert(HasVOP3DPP ||
             (0LL == (Mod1->getImm() & ~(SISrcMods::ABS | SISrcMods::NEG))));
      DPPInst.addImm(Mod1->getImm());
      ++NumOperands;
    } else if (AMDGPU::hasNamedOperand(DPPOp, AMDGPU::OpName::src1_modifiers)) {
      DPPInst.addImm(0);
      ++NumOperands;
    }
    auto *Src1 = TII->getNamedOperand(OrigMI, AMDGPU::OpName::src1);
    if (Src1) {
      int OpNum = NumOperands;
      // If subtarget does not support SGPRs for src1 operand then the
      // requirements are the same as for src0. We check src0 instead because
      // pseudos are shared between subtargets and allow SGPR for src1 on all.
      if (!ST->hasDPPSrc1SGPR()) {
        assert(getOperandSize(*DPPInst, Src0Idx, *MRI) ==
                   getOperandSize(*DPPInst, NumOperands, *MRI) &&
               "Src0 and Src1 operands should have the same size");
        OpNum = Src0Idx;
      }
      if (!TII->isOperandLegal(*DPPInst.getInstr(), OpNum, Src1)) {
        LLVM_DEBUG(dbgs() << "  failed: src1 is illegal\n");
        Fail = true;
        break;
      }
      DPPInst.add(*Src1);
      ++NumOperands;
    }

    auto *Mod2 = TII->getNamedOperand(OrigMI, AMDGPU::OpName::src2_modifiers);
    if (Mod2) {
      assert(NumOperands ==
             AMDGPU::getNamedOperandIdx(DPPOp, AMDGPU::OpName::src2_modifiers));
      assert(HasVOP3DPP ||
             (0LL == (Mod2->getImm() & ~(SISrcMods::ABS | SISrcMods::NEG))));
      DPPInst.addImm(Mod2->getImm());
      ++NumOperands;
    }
    auto *Src2 = TII->getNamedOperand(OrigMI, AMDGPU::OpName::src2);
    if (Src2) {
      if (!TII->getNamedOperand(*DPPInst.getInstr(), AMDGPU::OpName::src2) ||
          !TII->isOperandLegal(*DPPInst.getInstr(), NumOperands, Src2)) {
        LLVM_DEBUG(dbgs() << "  failed: src2 is illegal\n");
        Fail = true;
        break;
      }
      DPPInst.add(*Src2);
      ++NumOperands;
    }

    if (HasVOP3DPP) {
      auto *ClampOpr = TII->getNamedOperand(OrigMI, AMDGPU::OpName::clamp);
      if (ClampOpr && AMDGPU::hasNamedOperand(DPPOp, AMDGPU::OpName::clamp)) {
        DPPInst.addImm(ClampOpr->getImm());
      }
      auto *VdstInOpr = TII->getNamedOperand(OrigMI, AMDGPU::OpName::vdst_in);
      if (VdstInOpr &&
          AMDGPU::hasNamedOperand(DPPOp, AMDGPU::OpName::vdst_in)) {
        DPPInst.add(*VdstInOpr);
      }
      auto *OmodOpr = TII->getNamedOperand(OrigMI, AMDGPU::OpName::omod);
      if (OmodOpr && AMDGPU::hasNamedOperand(DPPOp, AMDGPU::OpName::omod)) {
        DPPInst.addImm(OmodOpr->getImm());
      }
      // Validate that OP_SEL is all zeros and OP_SEL_HI is all ones.
      if (TII->getNamedOperand(OrigMI, AMDGPU::OpName::op_sel)) {
        int64_t OpSel = 0;
        OpSel |= (Mod0 ? (!!(Mod0->getImm() & SISrcMods::OP_SEL_0) << 0) : 0);
        OpSel |= (Mod1 ? (!!(Mod1->getImm() & SISrcMods::OP_SEL_0) << 1) : 0);
        OpSel |= (Mod2 ? (!!(Mod2->getImm() & SISrcMods::OP_SEL_0) << 2) : 0);
        if (Mod0 && TII->isVOP3(OrigMI) && !TII->isVOP3P(OrigMI))
          OpSel |= !!(Mod0->getImm() & SISrcMods::DST_OP_SEL) << 3;

        if (OpSel != 0) {
          LLVM_DEBUG(dbgs() << "  failed: op_sel must be zero\n");
          Fail = true;
          break;
        }
        if (AMDGPU::hasNamedOperand(DPPOp, AMDGPU::OpName::op_sel))
          DPPInst.addImm(OpSel);
      }
      if (TII->getNamedOperand(OrigMI, AMDGPU::OpName::op_sel_hi)) {
        int64_t OpSelHi = 0;
        OpSelHi |= (Mod0 ? (!!(Mod0->getImm() & SISrcMods::OP_SEL_1) << 0) : 0);
        OpSelHi |= (Mod1 ? (!!(Mod1->getImm() & SISrcMods::OP_SEL_1) << 1) : 0);
        OpSelHi |= (Mod2 ? (!!(Mod2->getImm() & SISrcMods::OP_SEL_1) << 2) : 0);

        // Only vop3p has op_sel_hi, and all vop3p have 3 operands, so check
        // the bitmask for 3 op_sel_hi bits set.
        assert(Src2 && "Expected vop3p with 3 operands");
        if (OpSelHi != 7) {
          LLVM_DEBUG(dbgs() << "  failed: op_sel_hi must be all set to one\n");
          Fail = true;
          break;
        }
        if (AMDGPU::hasNamedOperand(DPPOp, AMDGPU::OpName::op_sel_hi))
          DPPInst.addImm(OpSelHi);
      }
      auto *NegOpr = TII->getNamedOperand(OrigMI, AMDGPU::OpName::neg_lo);
      if (NegOpr && AMDGPU::hasNamedOperand(DPPOp, AMDGPU::OpName::neg_lo)) {
        DPPInst.addImm(NegOpr->getImm());
      }
      auto *NegHiOpr = TII->getNamedOperand(OrigMI, AMDGPU::OpName::neg_hi);
      if (NegHiOpr && AMDGPU::hasNamedOperand(DPPOp, AMDGPU::OpName::neg_hi)) {
        DPPInst.addImm(NegHiOpr->getImm());
      }
      auto *ByteSelOpr = TII->getNamedOperand(OrigMI, AMDGPU::OpName::byte_sel);
      if (ByteSelOpr &&
          AMDGPU::hasNamedOperand(DPPOp, AMDGPU::OpName::byte_sel)) {
        DPPInst.addImm(ByteSelOpr->getImm());
      }
    }
    DPPInst.add(*TII->getNamedOperand(MovMI, AMDGPU::OpName::dpp_ctrl));
    DPPInst.add(*TII->getNamedOperand(MovMI, AMDGPU::OpName::row_mask));
    DPPInst.add(*TII->getNamedOperand(MovMI, AMDGPU::OpName::bank_mask));
    DPPInst.addImm(CombBCZ ? 1 : 0);
  } while (false);

  if (Fail) {
    DPPInst.getInstr()->eraseFromParent();
    return nullptr;
  }
  LLVM_DEBUG(dbgs() << "  combined: " << *DPPInst.getInstr());
  return DPPInst.getInstr();
}

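// Returns true if the immediate OldOpnd is the identity value of the binary
// operation OrigMIOp, i.e. op(OldOpnd, x) == x for any x: for example 0 for
// V_ADD_U32 (0 + x == x) and all-ones for V_AND_B32 (~0 & x == x).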
static bool isIdentityValue(unsigned OrigMIOp, MachineOperand *OldOpnd) {
  assert(OldOpnd->isImm());
  switch (OrigMIOp) {
  default: break;
  case AMDGPU::V_ADD_U32_e32:
  case AMDGPU::V_ADD_U32_e64:
  case AMDGPU::V_ADD_CO_U32_e32:
  case AMDGPU::V_ADD_CO_U32_e64:
  case AMDGPU::V_OR_B32_e32:
  case AMDGPU::V_OR_B32_e64:
  case AMDGPU::V_SUBREV_U32_e32:
  case AMDGPU::V_SUBREV_U32_e64:
  case AMDGPU::V_SUBREV_CO_U32_e32:
  case AMDGPU::V_SUBREV_CO_U32_e64:
  case AMDGPU::V_MAX_U32_e32:
  case AMDGPU::V_MAX_U32_e64:
  case AMDGPU::V_XOR_B32_e32:
  case AMDGPU::V_XOR_B32_e64:
    if (OldOpnd->getImm() == 0)
      return true;
    break;
  case AMDGPU::V_AND_B32_e32:
  case AMDGPU::V_AND_B32_e64:
  case AMDGPU::V_MIN_U32_e32:
  case AMDGPU::V_MIN_U32_e64:
    if (static_cast<uint32_t>(OldOpnd->getImm()) ==
        std::numeric_limits<uint32_t>::max())
      return true;
    break;
  case AMDGPU::V_MIN_I32_e32:
  case AMDGPU::V_MIN_I32_e64:
    if (static_cast<int32_t>(OldOpnd->getImm()) ==
        std::numeric_limits<int32_t>::max())
      return true;
    break;
  case AMDGPU::V_MAX_I32_e32:
  case AMDGPU::V_MAX_I32_e64:
    if (static_cast<int32_t>(OldOpnd->getImm()) ==
        std::numeric_limits<int32_t>::min())
      return true;
    break;
  case AMDGPU::V_MUL_I32_I24_e32:
  case AMDGPU::V_MUL_I32_I24_e64:
  case AMDGPU::V_MUL_U32_U24_e32:
  case AMDGPU::V_MUL_U32_U24_e64:
    if (OldOpnd->getImm() == 1)
      return true;
    break;
  }
  return false;
}

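// Wrapper around the builder above implementing the second combining rule
// from the file header: with bound_ctrl off and an old value that is an
// immediate identity of the VALU op, src1 can serve as the combined old
// operand.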
MachineInstr *GCNDPPCombine::createDPPInst(
    MachineInstr &OrigMI, MachineInstr &MovMI, RegSubRegPair CombOldVGPR,
    MachineOperand *OldOpndValue, bool CombBCZ, bool IsShrinkable) const {
  assert(CombOldVGPR.Reg);
  if (!CombBCZ && OldOpndValue && OldOpndValue->isImm()) {
    auto *Src1 = TII->getNamedOperand(OrigMI, AMDGPU::OpName::src1);
    if (!Src1 || !Src1->isReg()) {
      LLVM_DEBUG(dbgs() << "  failed: no src1 or it isn't a register\n");
      return nullptr;
    }
    if (!isIdentityValue(OrigMI.getOpcode(), OldOpndValue)) {
      LLVM_DEBUG(dbgs() << "  failed: old immediate isn't an identity\n");
      return nullptr;
    }
    CombOldVGPR = getRegSubRegPair(*Src1);
    auto *MovDst = TII->getNamedOperand(MovMI, AMDGPU::OpName::vdst);
    const TargetRegisterClass *RC = MRI->getRegClass(MovDst->getReg());
    if (!isOfRegClass(CombOldVGPR, *RC, *MRI)) {
      LLVM_DEBUG(dbgs() << "  failed: src1 has wrong register class\n");
      return nullptr;
    }
  }
  return createDPPInst(OrigMI, MovMI, CombOldVGPR, CombBCZ, IsShrinkable);
}

// Returns true if MI has no immediate operand named OpndName, or if that
// operand, masked with Mask, equals Value.
bool GCNDPPCombine::hasNoImmOrEqual(MachineInstr &MI, AMDGPU::OpName OpndName,
                                    int64_t Value, int64_t Mask) const {
  auto *Imm = TII->getNamedOperand(MI, OpndName);
  if (!Imm)
    return true;

  assert(Imm->isImm());
  return (Imm->getImm() & Mask) == Value;
}

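// Attempts to fold MovMI into every VALU use of its result, applying the
// combining rules from the file header. For instance (illustrative MIR
// sketch, not taken from a real test):
//
//   %1 = V_MOV_B32_dpp undef %0, %2, row_shr:1, row_mask:0xf, bank_mask:0xf,
//        bound_ctrl:1
//   %3 = V_ADD_U32_e32 %1, %4
// becomes
//   %3 = V_ADD_U32_dpp undef %0, %2, %4, row_shr:1, row_mask:0xf,
//        bank_mask:0xf, bound_ctrl:1
//
// Either all uses are rewritten and the mov is erased, or nothing changes.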
bool GCNDPPCombine::combineDPPMov(MachineInstr &MovMI) const {
  assert(MovMI.getOpcode() == AMDGPU::V_MOV_B32_dpp ||
         MovMI.getOpcode() == AMDGPU::V_MOV_B64_dpp ||
         MovMI.getOpcode() == AMDGPU::V_MOV_B64_DPP_PSEUDO);
  LLVM_DEBUG(dbgs() << "\nDPP combine: " << MovMI);

  auto *DstOpnd = TII->getNamedOperand(MovMI, AMDGPU::OpName::vdst);
  assert(DstOpnd && DstOpnd->isReg());
  auto DPPMovReg = DstOpnd->getReg();
  if (DPPMovReg.isPhysical()) {
    LLVM_DEBUG(dbgs() << "  failed: dpp move writes physreg\n");
    return false;
  }
  if (execMayBeModifiedBeforeAnyUse(*MRI, DPPMovReg, MovMI)) {
    LLVM_DEBUG(dbgs() << "  failed: EXEC mask should remain the same"
                         " for all uses\n");
    return false;
  }

  if (MovMI.getOpcode() == AMDGPU::V_MOV_B64_DPP_PSEUDO ||
      MovMI.getOpcode() == AMDGPU::V_MOV_B64_dpp) {
    auto *DppCtrl = TII->getNamedOperand(MovMI, AMDGPU::OpName::dpp_ctrl);
    assert(DppCtrl && DppCtrl->isImm());
    if (!AMDGPU::isLegalDPALU_DPPControl(DppCtrl->getImm())) {
      LLVM_DEBUG(dbgs() << "  failed: 64 bit dpp move uses unsupported"
                           " control value\n");
      // Let it split, then control may become legal.
      return false;
    }
  }

  auto *RowMaskOpnd = TII->getNamedOperand(MovMI, AMDGPU::OpName::row_mask);
  assert(RowMaskOpnd && RowMaskOpnd->isImm());
  auto *BankMaskOpnd = TII->getNamedOperand(MovMI, AMDGPU::OpName::bank_mask);
  assert(BankMaskOpnd && BankMaskOpnd->isImm());
  const bool MaskAllLanes = RowMaskOpnd->getImm() == 0xF &&
                            BankMaskOpnd->getImm() == 0xF;

  auto *BCZOpnd = TII->getNamedOperand(MovMI, AMDGPU::OpName::bound_ctrl);
  assert(BCZOpnd && BCZOpnd->isImm());
  bool BoundCtrlZero = BCZOpnd->getImm();

  auto *OldOpnd = TII->getNamedOperand(MovMI, AMDGPU::OpName::old);
  auto *SrcOpnd = TII->getNamedOperand(MovMI, AMDGPU::OpName::src0);
  assert(OldOpnd && OldOpnd->isReg());
  assert(SrcOpnd && SrcOpnd->isReg());
  if (OldOpnd->getReg().isPhysical() || SrcOpnd->getReg().isPhysical()) {
    LLVM_DEBUG(dbgs() << "  failed: dpp move reads physreg\n");
    return false;
  }

  auto *const OldOpndValue = getOldOpndValue(*OldOpnd);
  // OldOpndValue is either undef (IMPLICIT_DEF) or immediate or something
  // else. We could use: assert(!OldOpndValue || OldOpndValue->isImm())
  // but the third option is used to distinguish undef from non-immediate
  // to reuse the IMPLICIT_DEF instruction later.
  assert(!OldOpndValue || OldOpndValue->isImm() || OldOpndValue == OldOpnd);

  bool CombBCZ = false;

  if (MaskAllLanes && BoundCtrlZero) { // [1]
    CombBCZ = true;
  } else {
    if (!OldOpndValue || !OldOpndValue->isImm()) {
      LLVM_DEBUG(dbgs() << "  failed: the DPP mov isn't combinable\n");
      return false;
    }

    if (OldOpndValue->getImm() == 0) {
      if (MaskAllLanes) {
        assert(!BoundCtrlZero); // by check [1]
        CombBCZ = true;
      }
    } else if (BoundCtrlZero) {
      assert(!MaskAllLanes); // by check [1]
      LLVM_DEBUG(dbgs() <<
        "  failed: old!=0 and bctrl:0 and not all lanes isn't combinable\n");
      return false;
    }
  }

  LLVM_DEBUG(dbgs() << "  old=";
    if (!OldOpndValue)
      dbgs() << "undef";
    else
      dbgs() << *OldOpndValue;
    dbgs() << ", bound_ctrl=" << CombBCZ << '\n');

  SmallVector<MachineInstr*, 4> OrigMIs, DPPMIs;
  DenseMap<MachineInstr*, SmallVector<unsigned, 4>> RegSeqWithOpNos;
  auto CombOldVGPR = getRegSubRegPair(*OldOpnd);
  // Try to reuse the previous old reg if it's undefined (IMPLICIT_DEF).
  if (CombBCZ && OldOpndValue) { // CombOldVGPR should be undef
    const TargetRegisterClass *RC = MRI->getRegClass(DPPMovReg);
    CombOldVGPR = RegSubRegPair(MRI->createVirtualRegister(RC));
    auto UndefInst = BuildMI(*MovMI.getParent(), MovMI, MovMI.getDebugLoc(),
                             TII->get(AMDGPU::IMPLICIT_DEF), CombOldVGPR.Reg);
    DPPMIs.push_back(UndefInst.getInstr());
  }

  OrigMIs.push_back(&MovMI);
  bool Rollback = true;
  SmallVector<MachineOperand *, 16> Uses(
      llvm::make_pointer_range(MRI->use_nodbg_operands(DPPMovReg)));

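  // Process all uses of the DPP mov. A use inside a REG_SEQUENCE is not
  // combined directly; instead the uses of the REG_SEQUENCE result that read
  // the matching subregister are pushed onto the worklist.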
  while (!Uses.empty()) {
    MachineOperand *Use = Uses.pop_back_val();
    Rollback = true;

    auto &OrigMI = *Use->getParent();
    LLVM_DEBUG(dbgs() << "  try: " << OrigMI);

    auto OrigOp = OrigMI.getOpcode();
    assert((TII->get(OrigOp).getSize() != 4 || !AMDGPU::isTrue16Inst(OrigOp)) &&
           "There should not be e32 True16 instructions pre-RA");
    if (OrigOp == AMDGPU::REG_SEQUENCE) {
      Register FwdReg = OrigMI.getOperand(0).getReg();
      unsigned FwdSubReg = 0;

      if (execMayBeModifiedBeforeAnyUse(*MRI, FwdReg, OrigMI)) {
        LLVM_DEBUG(dbgs() << "  failed: EXEC mask should remain the same"
                             " for all uses\n");
        break;
      }

      unsigned OpNo, E = OrigMI.getNumOperands();
      for (OpNo = 1; OpNo < E; OpNo += 2) {
        if (OrigMI.getOperand(OpNo).getReg() == DPPMovReg) {
          FwdSubReg = OrigMI.getOperand(OpNo + 1).getImm();
          break;
        }
      }

      if (!FwdSubReg)
        break;

      for (auto &Op : MRI->use_nodbg_operands(FwdReg)) {
        if (Op.getSubReg() == FwdSubReg)
          Uses.push_back(&Op);
      }
      RegSeqWithOpNos[&OrigMI].push_back(OpNo);
      continue;
    }

    bool IsShrinkable = isShrinkable(OrigMI);
    if (!(IsShrinkable ||
          ((TII->isVOP3P(OrigOp) || TII->isVOPC(OrigOp) ||
            TII->isVOP3(OrigOp)) &&
           ST->hasVOP3DPP()) ||
          TII->isVOP1(OrigOp) || TII->isVOP2(OrigOp))) {
      LLVM_DEBUG(dbgs() << "  failed: not VOP1/2/3/3P/C\n");
      break;
    }
    if (OrigMI.modifiesRegister(AMDGPU::EXEC, ST->getRegisterInfo())) {
      LLVM_DEBUG(dbgs() << "  failed: can't combine v_cmpx\n");
      break;
    }

    auto *Src0 = TII->getNamedOperand(OrigMI, AMDGPU::OpName::src0);
    auto *Src1 = TII->getNamedOperand(OrigMI, AMDGPU::OpName::src1);
    if (Use != Src0 && !(Use == Src1 && OrigMI.isCommutable())) { // [1]
      LLVM_DEBUG(dbgs() << "  failed: no suitable operands\n");
      break;
    }

    auto *Src2 = TII->getNamedOperand(OrigMI, AMDGPU::OpName::src2);
    assert(Src0 && "Src1 without Src0?");
    if ((Use == Src0 && ((Src1 && Src1->isIdenticalTo(*Src0)) ||
                         (Src2 && Src2->isIdenticalTo(*Src0)))) ||
        (Use == Src1 && (Src1->isIdenticalTo(*Src0) ||
                         (Src2 && Src2->isIdenticalTo(*Src1))))) {
      LLVM_DEBUG(
          dbgs()
          << "  " << OrigMI
          << "  failed: DPP register is used more than once per instruction\n");
      break;
    }

    LLVM_DEBUG(dbgs() << "  combining: " << OrigMI);
    if (Use == Src0) {
      if (auto *DPPInst = createDPPInst(OrigMI, MovMI, CombOldVGPR,
                                        OldOpndValue, CombBCZ, IsShrinkable)) {
        DPPMIs.push_back(DPPInst);
        Rollback = false;
      }
    } else {
      assert(Use == Src1 && OrigMI.isCommutable()); // by check [1]
      auto *BB = OrigMI.getParent();
      auto *NewMI = BB->getParent()->CloneMachineInstr(&OrigMI);
      BB->insert(OrigMI, NewMI);
      if (TII->commuteInstruction(*NewMI)) {
        LLVM_DEBUG(dbgs() << "  commuted: " << *NewMI);
        if (auto *DPPInst =
                createDPPInst(*NewMI, MovMI, CombOldVGPR, OldOpndValue, CombBCZ,
                              IsShrinkable)) {
          DPPMIs.push_back(DPPInst);
          Rollback = false;
        }
      } else
        LLVM_DEBUG(dbgs() << "  failed: cannot be commuted\n");
      NewMI->eraseFromParent();
    }
    if (Rollback)
      break;
    OrigMIs.push_back(&OrigMI);
  }

  Rollback |= !Uses.empty();

  // On failure erase the DPP instructions built so far; on success erase the
  // original mov and its combined uses instead.
  for (auto *MI : *(Rollback ? &DPPMIs : &OrigMIs))
    MI->eraseFromParent();

  if (!Rollback) {
    for (auto &S : RegSeqWithOpNos) {
      if (MRI->use_nodbg_empty(S.first->getOperand(0).getReg())) {
        S.first->eraseFromParent();
        continue;
      }
      while (!S.second.empty())
        S.first->getOperand(S.second.pop_back_val()).setIsUndef();
    }
  }

  return !Rollback;
}

bool GCNDPPCombineLegacy::runOnMachineFunction(MachineFunction &MF) {
  if (skipFunction(MF.getFunction()))
    return false;

  return GCNDPPCombine().run(MF);
}

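// Scans every block bottom-up for DPP movs. A 64-bit mov that cannot be
// combined as a whole is split by expandMovDPP64 into two 32-bit movs, and
// each half is then tried individually.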
bool GCNDPPCombine::run(MachineFunction &MF) {
  ST = &MF.getSubtarget<GCNSubtarget>();
  if (!ST->hasDPP())
    return false;

  MRI = &MF.getRegInfo();
  TII = ST->getInstrInfo();

  bool Changed = false;
  for (auto &MBB : MF) {
    for (MachineInstr &MI : llvm::make_early_inc_range(llvm::reverse(MBB))) {
      if (MI.getOpcode() == AMDGPU::V_MOV_B32_dpp && combineDPPMov(MI)) {
        Changed = true;
        ++NumDPPMovsCombined;
      } else if (MI.getOpcode() == AMDGPU::V_MOV_B64_DPP_PSEUDO ||
                 MI.getOpcode() == AMDGPU::V_MOV_B64_dpp) {
        if (ST->hasDPALU_DPP() && combineDPPMov(MI)) {
          Changed = true;
          ++NumDPPMovsCombined;
        } else {
          auto Split = TII->expandMovDPP64(MI);
          for (auto *M : {Split.first, Split.second}) {
            if (M && combineDPPMov(*M))
              ++NumDPPMovsCombined;
          }
          Changed = true;
        }
      }
    }
  }
  return Changed;
}

PreservedAnalyses GCNDPPCombinePass::run(MachineFunction &MF,
                                         MachineFunctionAnalysisManager &) {
  MFPropsModifier _(*this, MF);

  if (MF.getFunction().hasOptNone())
    return PreservedAnalyses::all();

  bool Changed = GCNDPPCombine().run(MF);
  if (!Changed)
    return PreservedAnalyses::all();

  auto PA = getMachineFunctionPassPreservedAnalyses();
  PA.preserveSet<CFGAnalyses>();
  return PA;
}