1 | //=======- GCNDPPCombine.cpp - optimization for DPP instructions ---==========// |
2 | // |
3 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
4 | // See https://llvm.org/LICENSE.txt for license information. |
5 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
6 | // |
7 | //===----------------------------------------------------------------------===// |
// The pass combines the V_MOV_B32_dpp instruction with its VALU uses as a DPP
// src0 operand. If any of the use instructions cannot be combined with the
// mov, the whole sequence is reverted.
11 | // |
12 | // $old = ... |
13 | // $dpp_value = V_MOV_B32_dpp $old, $vgpr_to_be_read_from_other_lane, |
14 | // dpp_controls..., $row_mask, $bank_mask, $bound_ctrl |
15 | // $res = VALU $dpp_value [, src1] |
16 | // |
17 | // to |
18 | // |
19 | // $res = VALU_DPP $combined_old, $vgpr_to_be_read_from_other_lane, [src1,] |
20 | // dpp_controls..., $row_mask, $bank_mask, $combined_bound_ctrl |
21 | // |
// Combining rules:
23 | // |
24 | // if $row_mask and $bank_mask are fully enabled (0xF) and |
25 | // $bound_ctrl==DPP_BOUND_ZERO or $old==0 |
26 | // -> $combined_old = undef, |
27 | // $combined_bound_ctrl = DPP_BOUND_ZERO |
28 | // |
29 | // if the VALU op is binary and |
30 | // $bound_ctrl==DPP_BOUND_OFF and |
31 | // $old==identity value (immediate) for the VALU op |
32 | // -> $combined_old = src1, |
33 | // $combined_bound_ctrl = DPP_BOUND_OFF |
34 | // |
35 | // Otherwise cancel. |
36 | // |
// The mov_dpp instruction should reside in the same BB as all its uses.
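//
// As an illustrative sketch of the second rule (using the notation above):
//
//   $old = V_MOV_B32_e32 0
//   $dpp_value = V_MOV_B32_dpp $old, $vgpr_to_be_read_from_other_lane,
//                dpp_controls..., $row_mask, $bank_mask, DPP_BOUND_OFF
//   $res = V_ADD_U32_e32 $dpp_value, $src1
//
// Zero is the identity for the unsigned add, so $combined_old = $src1 and the
// sequence folds to
//
//   $res = V_ADD_U32_dpp $src1, $vgpr_to_be_read_from_other_lane, $src1,
//          dpp_controls..., $row_mask, $bank_mask, DPP_BOUND_OFF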
38 | //===----------------------------------------------------------------------===// |
39 | |
40 | #include "GCNDPPCombine.h" |
41 | #include "AMDGPU.h" |
42 | #include "GCNSubtarget.h" |
43 | #include "MCTargetDesc/AMDGPUMCTargetDesc.h" |
44 | #include "llvm/ADT/Statistic.h" |
45 | #include "llvm/CodeGen/MachineFunctionPass.h" |
46 | |
47 | using namespace llvm; |
48 | |
49 | #define DEBUG_TYPE "gcn-dpp-combine" |
50 | |
STATISTIC(NumDPPMovsCombined, "Number of DPP moves combined.");
52 | |
53 | namespace { |
54 | |
55 | class GCNDPPCombine { |
56 | MachineRegisterInfo *MRI; |
57 | const SIInstrInfo *TII; |
58 | const GCNSubtarget *ST; |
59 | |
60 | using RegSubRegPair = TargetInstrInfo::RegSubRegPair; |
61 | |
62 | MachineOperand *getOldOpndValue(MachineOperand &OldOpnd) const; |
63 | |
64 | MachineInstr *createDPPInst(MachineInstr &OrigMI, MachineInstr &MovMI, |
65 | RegSubRegPair CombOldVGPR, |
66 | MachineOperand *OldOpnd, bool CombBCZ, |
67 | bool IsShrinkable) const; |
68 | |
69 | MachineInstr *createDPPInst(MachineInstr &OrigMI, MachineInstr &MovMI, |
70 | RegSubRegPair CombOldVGPR, bool CombBCZ, |
71 | bool IsShrinkable) const; |
72 | |
73 | bool hasNoImmOrEqual(MachineInstr &MI, AMDGPU::OpName OpndName, int64_t Value, |
74 | int64_t Mask = -1) const; |
75 | |
76 | bool combineDPPMov(MachineInstr &MI) const; |
77 | |
78 | int getDPPOp(unsigned Op, bool IsShrinkable) const; |
79 | bool isShrinkable(MachineInstr &MI) const; |
80 | |
81 | public: |
82 | bool run(MachineFunction &MF); |
83 | }; |
84 | |
85 | class GCNDPPCombineLegacy : public MachineFunctionPass { |
86 | public: |
87 | static char ID; |
88 | |
89 | GCNDPPCombineLegacy() : MachineFunctionPass(ID) {} |
90 | |
91 | bool runOnMachineFunction(MachineFunction &MF) override; |
92 | |
  StringRef getPassName() const override { return "GCN DPP Combine"; }
94 | |
95 | void getAnalysisUsage(AnalysisUsage &AU) const override { |
96 | AU.setPreservesCFG(); |
97 | MachineFunctionPass::getAnalysisUsage(AU); |
98 | } |
99 | |
100 | MachineFunctionProperties getRequiredProperties() const override { |
101 | return MachineFunctionProperties().setIsSSA(); |
102 | } |
103 | }; |
104 | |
105 | } // end anonymous namespace |
106 | |
INITIALIZE_PASS(GCNDPPCombineLegacy, DEBUG_TYPE, "GCN DPP Combine", false,
108 | false) |
109 | |
110 | char GCNDPPCombineLegacy::ID = 0; |
111 | |
112 | char &llvm::GCNDPPCombineLegacyID = GCNDPPCombineLegacy::ID; |
113 | |
114 | FunctionPass *llvm::createGCNDPPCombinePass() { |
115 | return new GCNDPPCombineLegacy(); |
116 | } |
117 | |
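// Returns true if the VOP3 instruction MI can be shrunk to its 32-bit (e32)
// encoding, which getDPPOp below relies on to form a VOP2-style DPP opcode.
// A hypothetical qualifying case: a V_ADD_U32_e64 with no clamp/omod and no
// source modifiers beyond abs/neg can shrink to V_ADD_U32_e32.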
118 | bool GCNDPPCombine::isShrinkable(MachineInstr &MI) const { |
119 | unsigned Op = MI.getOpcode(); |
  if (!TII->isVOP3(Op)) {
    return false;
  }
  if (!TII->hasVALU32BitEncoding(Op)) {
    LLVM_DEBUG(dbgs() << " Inst has no e32 equivalent\n");
    return false;
  }
  // Do not shrink True16 instructions pre-RA to avoid the restriction in
  // register allocation from only being able to use 128 VGPRs.
  if (AMDGPU::isTrue16Inst(Op))
    return false;
  if (const auto *SDst = TII->getNamedOperand(MI, AMDGPU::OpName::sdst)) {
    // Give up if there are any uses of the sdst in carry-out or VOPC.
    // The shrunken form of the instruction would write it to vcc instead of to
    // a virtual register. If we rewrote the uses the shrinking would be
    // possible.
    if (!MRI->use_nodbg_empty(SDst->getReg()))
      return false;
  }
  // Check whether modifiers other than abs|neg are set (opsel, for example).
  const int64_t Mask = ~(SISrcMods::ABS | SISrcMods::NEG);
  if (!hasNoImmOrEqual(MI, AMDGPU::OpName::src0_modifiers, 0, Mask) ||
      !hasNoImmOrEqual(MI, AMDGPU::OpName::src1_modifiers, 0, Mask) ||
      !hasNoImmOrEqual(MI, AMDGPU::OpName::clamp, 0) ||
      !hasNoImmOrEqual(MI, AMDGPU::OpName::omod, 0) ||
      !hasNoImmOrEqual(MI, AMDGPU::OpName::byte_sel, 0)) {
    LLVM_DEBUG(dbgs() << " Inst has non-default modifiers\n");
    return false;
  }
149 | return true; |
150 | } |
151 | |
152 | int GCNDPPCombine::getDPPOp(unsigned Op, bool IsShrinkable) const { |
  int DPP32 = AMDGPU::getDPPOp32(Op);
  if (IsShrinkable) {
    assert(DPP32 == -1);
    int E32 = AMDGPU::getVOPe32(Op);
    DPP32 = (E32 == -1) ? -1 : AMDGPU::getDPPOp32(E32);
  }
  if (DPP32 != -1 && TII->pseudoToMCOpcode(DPP32) != -1)
    return DPP32;
  int DPP64 = -1;
  if (ST->hasVOP3DPP())
    DPP64 = AMDGPU::getDPPOp64(Op);
  if (DPP64 != -1 && TII->pseudoToMCOpcode(DPP64) != -1)
165 | return DPP64; |
166 | return -1; |
167 | } |
168 | |
// Tracks the register operand's definition and returns:
// 1. the immediate operand used to initialize the register, if found,
// 2. nullptr if the register operand is undef,
// 3. the operand itself otherwise.
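//
// For example (hypothetical MIR): for "%old = V_MOV_B32_e32 0" it returns the
// immediate operand holding 0; for "%old = IMPLICIT_DEF" it returns nullptr;
// for a def produced by any other instruction it returns OldOpnd itself.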
173 | MachineOperand *GCNDPPCombine::getOldOpndValue(MachineOperand &OldOpnd) const { |
  auto *Def = getVRegSubRegDef(getRegSubRegPair(OldOpnd), *MRI);
  if (!Def)
    return nullptr;

  switch (Def->getOpcode()) {
  default: break;
  case AMDGPU::IMPLICIT_DEF:
    return nullptr;
  case AMDGPU::COPY:
  case AMDGPU::V_MOV_B32_e32:
  case AMDGPU::V_MOV_B64_PSEUDO:
  case AMDGPU::V_MOV_B64_e32:
  case AMDGPU::V_MOV_B64_e64: {
    auto &Op1 = Def->getOperand(1);
188 | if (Op1.isImm()) |
189 | return &Op1; |
190 | break; |
191 | } |
192 | } |
193 | return &OldOpnd; |
194 | } |
195 | |
196 | [[maybe_unused]] static unsigned getOperandSize(MachineInstr &MI, unsigned Idx, |
197 | MachineRegisterInfo &MRI) { |
198 | int16_t RegClass = MI.getDesc().operands()[Idx].RegClass; |
199 | if (RegClass == -1) |
200 | return 0; |
201 | |
202 | const TargetRegisterInfo *TRI = MRI.getTargetRegisterInfo(); |
  return TRI->getRegSizeInBits(*TRI->getRegClass(RegClass));
204 | } |
205 | |
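// Builds the DPP form of OrigMI in place before it, taking dpp_ctrl, row_mask
// and bank_mask from MovMI and CombBCZ as the combined bound_ctrl. Returns the
// new instruction, or nullptr (erasing the partially built one) if an operand
// cannot be encoded legally.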
206 | MachineInstr *GCNDPPCombine::createDPPInst(MachineInstr &OrigMI, |
207 | MachineInstr &MovMI, |
208 | RegSubRegPair CombOldVGPR, |
209 | bool CombBCZ, |
210 | bool IsShrinkable) const { |
211 | assert(MovMI.getOpcode() == AMDGPU::V_MOV_B32_dpp || |
212 | MovMI.getOpcode() == AMDGPU::V_MOV_B64_dpp || |
213 | MovMI.getOpcode() == AMDGPU::V_MOV_B64_DPP_PSEUDO); |
214 | |
215 | bool HasVOP3DPP = ST->hasVOP3DPP(); |
216 | auto OrigOp = OrigMI.getOpcode(); |
  if (ST->useRealTrue16Insts() && AMDGPU::isTrue16Inst(OrigOp)) {
    LLVM_DEBUG(
        dbgs() << " failed: Did not expect any 16-bit uses of dpp values\n");
    return nullptr;
  }
  auto DPPOp = getDPPOp(OrigOp, IsShrinkable);
  if (DPPOp == -1) {
    LLVM_DEBUG(dbgs() << " failed: no DPP opcode\n");
    return nullptr;
  }
  int OrigOpE32 = AMDGPU::getVOPe32(OrigOp);
  // Prior checks happen to guarantee full masks for VOPC uses, though not by
  // design, so assert it explicitly below.
  auto *RowMaskOpnd = TII->getNamedOperand(MovMI, AMDGPU::OpName::row_mask);
  assert(RowMaskOpnd && RowMaskOpnd->isImm());
  auto *BankMaskOpnd = TII->getNamedOperand(MovMI, AMDGPU::OpName::bank_mask);
232 | assert(BankMaskOpnd && BankMaskOpnd->isImm()); |
233 | const bool MaskAllLanes = |
234 | RowMaskOpnd->getImm() == 0xF && BankMaskOpnd->getImm() == 0xF; |
235 | (void)MaskAllLanes; |
  assert((MaskAllLanes ||
          !(TII->isVOPC(DPPOp) || (TII->isVOP3(DPPOp) && OrigOpE32 != -1 &&
                                   TII->isVOPC(OrigOpE32)))) &&
         "VOPC cannot form DPP unless mask is full");

  auto DPPInst = BuildMI(*OrigMI.getParent(), OrigMI, OrigMI.getDebugLoc(),
                         TII->get(DPPOp))
                     .setMIFlags(OrigMI.getFlags());
244 | |
245 | bool Fail = false; |
246 | do { |
247 | int NumOperands = 0; |
    if (auto *Dst = TII->getNamedOperand(OrigMI, AMDGPU::OpName::vdst)) {
      DPPInst.add(*Dst);
      ++NumOperands;
    }
    if (auto *SDst = TII->getNamedOperand(OrigMI, AMDGPU::OpName::sdst)) {
      if (TII->isOperandLegal(*DPPInst.getInstr(), NumOperands, SDst)) {
        DPPInst.add(*SDst);
        ++NumOperands;
      }
      // If we shrunk a 64-bit VOP3b to 32 bits, just ignore the sdst.
    }
259 | |
    const int OldIdx = AMDGPU::getNamedOperandIdx(DPPOp, AMDGPU::OpName::old);
    if (OldIdx != -1) {
      assert(OldIdx == NumOperands);
      assert(isOfRegClass(
          CombOldVGPR,
          *MRI->getRegClass(
              TII->getNamedOperand(MovMI, AMDGPU::OpName::vdst)->getReg()),
          *MRI));
      auto *Def = getVRegSubRegDef(CombOldVGPR, *MRI);
      DPPInst.addReg(CombOldVGPR.Reg, Def ? 0 : RegState::Undef,
                     CombOldVGPR.SubReg);
      ++NumOperands;
    } else if (TII->isVOPC(DPPOp) || (TII->isVOP3(DPPOp) && OrigOpE32 != -1 &&
                                      TII->isVOPC(OrigOpE32))) {
274 | // VOPC DPP and VOPC promoted to VOP3 DPP do not have an old operand |
275 | // because they write to SGPRs not VGPRs |
276 | } else { |
      // TODO: this currently rejects MAC/FMA instructions; add support later.
      LLVM_DEBUG(dbgs() << " failed: no old operand in DPP instruction,"
                           " TBD\n");
280 | Fail = true; |
281 | break; |
282 | } |
283 | |
    auto *Mod0 = TII->getNamedOperand(OrigMI, AMDGPU::OpName::src0_modifiers);
    if (Mod0) {
      assert(NumOperands ==
             AMDGPU::getNamedOperandIdx(DPPOp, AMDGPU::OpName::src0_modifiers));
      assert(HasVOP3DPP ||
             (0LL == (Mod0->getImm() & ~(SISrcMods::ABS | SISrcMods::NEG))));
      DPPInst.addImm(Mod0->getImm());
      ++NumOperands;
    } else if (AMDGPU::hasNamedOperand(DPPOp, AMDGPU::OpName::src0_modifiers)) {
      DPPInst.addImm(0);
      ++NumOperands;
    }
    auto *Src0 = TII->getNamedOperand(MovMI, AMDGPU::OpName::src0);
    assert(Src0);
    int Src0Idx = NumOperands;
    if (!TII->isOperandLegal(*DPPInst.getInstr(), NumOperands, Src0)) {
      LLVM_DEBUG(dbgs() << " failed: src0 is illegal\n");
      Fail = true;
      break;
    }
    DPPInst.add(*Src0);
    DPPInst->getOperand(NumOperands).setIsKill(false);
    ++NumOperands;
307 | |
    auto *Mod1 = TII->getNamedOperand(OrigMI, AMDGPU::OpName::src1_modifiers);
    if (Mod1) {
      assert(NumOperands ==
             AMDGPU::getNamedOperandIdx(DPPOp, AMDGPU::OpName::src1_modifiers));
      assert(HasVOP3DPP ||
             (0LL == (Mod1->getImm() & ~(SISrcMods::ABS | SISrcMods::NEG))));
      DPPInst.addImm(Mod1->getImm());
      ++NumOperands;
    } else if (AMDGPU::hasNamedOperand(DPPOp, AMDGPU::OpName::src1_modifiers)) {
      DPPInst.addImm(0);
      ++NumOperands;
    }
    auto *Src1 = TII->getNamedOperand(OrigMI, AMDGPU::OpName::src1);
    if (Src1) {
      int OpNum = NumOperands;
      // If the subtarget does not support SGPRs for the src1 operand, then the
      // requirements are the same as for src0. We check src0 instead because
      // pseudos are shared between subtargets and allow an SGPR for src1 on
      // all of them.
      if (!ST->hasDPPSrc1SGPR()) {
        assert(getOperandSize(*DPPInst, Src0Idx, *MRI) ==
                   getOperandSize(*DPPInst, NumOperands, *MRI) &&
               "Src0 and Src1 operands should have the same size");
        OpNum = Src0Idx;
      }
      if (!TII->isOperandLegal(*DPPInst.getInstr(), OpNum, Src1)) {
        LLVM_DEBUG(dbgs() << " failed: src1 is illegal\n");
        Fail = true;
        break;
      }
      DPPInst.add(*Src1);
      ++NumOperands;
339 | } |
340 | |
    auto *Mod2 = TII->getNamedOperand(OrigMI, AMDGPU::OpName::src2_modifiers);
    if (Mod2) {
      assert(NumOperands ==
             AMDGPU::getNamedOperandIdx(DPPOp, AMDGPU::OpName::src2_modifiers));
      assert(HasVOP3DPP ||
             (0LL == (Mod2->getImm() & ~(SISrcMods::ABS | SISrcMods::NEG))));
      DPPInst.addImm(Mod2->getImm());
      ++NumOperands;
    }
    auto *Src2 = TII->getNamedOperand(OrigMI, AMDGPU::OpName::src2);
    if (Src2) {
      if (!TII->getNamedOperand(*DPPInst.getInstr(), AMDGPU::OpName::src2) ||
          !TII->isOperandLegal(*DPPInst.getInstr(), NumOperands, Src2)) {
        LLVM_DEBUG(dbgs() << " failed: src2 is illegal\n");
        Fail = true;
        break;
      }
      DPPInst.add(*Src2);
      ++NumOperands;
    }
361 | |
    if (HasVOP3DPP) {
      auto *ClampOpr = TII->getNamedOperand(OrigMI, AMDGPU::OpName::clamp);
      if (ClampOpr && AMDGPU::hasNamedOperand(DPPOp, AMDGPU::OpName::clamp)) {
        DPPInst.addImm(ClampOpr->getImm());
      }
      auto *VdstInOpr = TII->getNamedOperand(OrigMI, AMDGPU::OpName::vdst_in);
      if (VdstInOpr &&
          AMDGPU::hasNamedOperand(DPPOp, AMDGPU::OpName::vdst_in)) {
        DPPInst.add(*VdstInOpr);
      }
      auto *OmodOpr = TII->getNamedOperand(OrigMI, AMDGPU::OpName::omod);
      if (OmodOpr && AMDGPU::hasNamedOperand(DPPOp, AMDGPU::OpName::omod)) {
        DPPInst.addImm(OmodOpr->getImm());
      }
      // Validate that OP_SEL is all zeros and OP_SEL_HI is all ones.
      if (TII->getNamedOperand(OrigMI, AMDGPU::OpName::op_sel)) {
        int64_t OpSel = 0;
        OpSel |= (Mod0 ? (!!(Mod0->getImm() & SISrcMods::OP_SEL_0) << 0) : 0);
        OpSel |= (Mod1 ? (!!(Mod1->getImm() & SISrcMods::OP_SEL_0) << 1) : 0);
        OpSel |= (Mod2 ? (!!(Mod2->getImm() & SISrcMods::OP_SEL_0) << 2) : 0);
        if (Mod0 && TII->isVOP3(OrigMI) && !TII->isVOP3P(OrigMI))
          OpSel |= !!(Mod0->getImm() & SISrcMods::DST_OP_SEL) << 3;

        if (OpSel != 0) {
          LLVM_DEBUG(dbgs() << " failed: op_sel must be zero\n");
          Fail = true;
          break;
        }
        if (AMDGPU::hasNamedOperand(DPPOp, AMDGPU::OpName::op_sel))
          DPPInst.addImm(OpSel);
      }
      if (TII->getNamedOperand(OrigMI, AMDGPU::OpName::op_sel_hi)) {
        int64_t OpSelHi = 0;
        OpSelHi |= (Mod0 ? (!!(Mod0->getImm() & SISrcMods::OP_SEL_1) << 0) : 0);
        OpSelHi |= (Mod1 ? (!!(Mod1->getImm() & SISrcMods::OP_SEL_1) << 1) : 0);
        OpSelHi |= (Mod2 ? (!!(Mod2->getImm() & SISrcMods::OP_SEL_1) << 2) : 0);

        // Only VOP3P has op_sel_hi, and all VOP3P instructions have three
        // operands, so check that all three op_sel_hi bits are set.
        assert(Src2 && "Expected vop3p with 3 operands");
        if (OpSelHi != 7) {
          LLVM_DEBUG(dbgs() << " failed: op_sel_hi must be all set to one\n");
          Fail = true;
          break;
        }
        if (AMDGPU::hasNamedOperand(DPPOp, AMDGPU::OpName::op_sel_hi))
          DPPInst.addImm(OpSelHi);
      }
      auto *NegOpr = TII->getNamedOperand(OrigMI, AMDGPU::OpName::neg_lo);
      if (NegOpr && AMDGPU::hasNamedOperand(DPPOp, AMDGPU::OpName::neg_lo)) {
        DPPInst.addImm(NegOpr->getImm());
      }
      auto *NegHiOpr = TII->getNamedOperand(OrigMI, AMDGPU::OpName::neg_hi);
      if (NegHiOpr && AMDGPU::hasNamedOperand(DPPOp, AMDGPU::OpName::neg_hi)) {
        DPPInst.addImm(NegHiOpr->getImm());
      }
      auto *ByteSelOpr = TII->getNamedOperand(OrigMI, AMDGPU::OpName::byte_sel);
      if (ByteSelOpr &&
          AMDGPU::hasNamedOperand(DPPOp, AMDGPU::OpName::byte_sel)) {
        DPPInst.addImm(ByteSelOpr->getImm());
      }
    }
    DPPInst.add(*TII->getNamedOperand(MovMI, AMDGPU::OpName::dpp_ctrl));
    DPPInst.add(*TII->getNamedOperand(MovMI, AMDGPU::OpName::row_mask));
    DPPInst.add(*TII->getNamedOperand(MovMI, AMDGPU::OpName::bank_mask));
    DPPInst.addImm(CombBCZ ? 1 : 0);
429 | } while (false); |
430 | |
431 | if (Fail) { |
432 | DPPInst.getInstr()->eraseFromParent(); |
433 | return nullptr; |
434 | } |
435 | LLVM_DEBUG(dbgs() << " combined: " << *DPPInst.getInstr()); |
436 | return DPPInst.getInstr(); |
437 | } |
438 | |
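// Returns true if the immediate in OldOpnd is the identity element of the
// operation OrigMIOp: 0 for add/or/xor/umax/subrev, all ones for and/umin,
// INT_MAX for smin, INT_MIN for smax, and 1 for the 24-bit multiplies. With
// bound_ctrl off, a lane with an invalid read keeps the old value, so picking
// old == identity lets src1 stand in for the combined old operand.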
439 | static bool isIdentityValue(unsigned OrigMIOp, MachineOperand *OldOpnd) { |
440 | assert(OldOpnd->isImm()); |
441 | switch (OrigMIOp) { |
442 | default: break; |
443 | case AMDGPU::V_ADD_U32_e32: |
444 | case AMDGPU::V_ADD_U32_e64: |
445 | case AMDGPU::V_ADD_CO_U32_e32: |
446 | case AMDGPU::V_ADD_CO_U32_e64: |
447 | case AMDGPU::V_OR_B32_e32: |
448 | case AMDGPU::V_OR_B32_e64: |
449 | case AMDGPU::V_SUBREV_U32_e32: |
450 | case AMDGPU::V_SUBREV_U32_e64: |
451 | case AMDGPU::V_SUBREV_CO_U32_e32: |
452 | case AMDGPU::V_SUBREV_CO_U32_e64: |
453 | case AMDGPU::V_MAX_U32_e32: |
454 | case AMDGPU::V_MAX_U32_e64: |
455 | case AMDGPU::V_XOR_B32_e32: |
456 | case AMDGPU::V_XOR_B32_e64: |
457 | if (OldOpnd->getImm() == 0) |
458 | return true; |
459 | break; |
460 | case AMDGPU::V_AND_B32_e32: |
461 | case AMDGPU::V_AND_B32_e64: |
462 | case AMDGPU::V_MIN_U32_e32: |
463 | case AMDGPU::V_MIN_U32_e64: |
464 | if (static_cast<uint32_t>(OldOpnd->getImm()) == |
465 | std::numeric_limits<uint32_t>::max()) |
466 | return true; |
467 | break; |
468 | case AMDGPU::V_MIN_I32_e32: |
469 | case AMDGPU::V_MIN_I32_e64: |
470 | if (static_cast<int32_t>(OldOpnd->getImm()) == |
471 | std::numeric_limits<int32_t>::max()) |
472 | return true; |
473 | break; |
474 | case AMDGPU::V_MAX_I32_e32: |
475 | case AMDGPU::V_MAX_I32_e64: |
476 | if (static_cast<int32_t>(OldOpnd->getImm()) == |
477 | std::numeric_limits<int32_t>::min()) |
478 | return true; |
479 | break; |
480 | case AMDGPU::V_MUL_I32_I24_e32: |
481 | case AMDGPU::V_MUL_I32_I24_e64: |
482 | case AMDGPU::V_MUL_U32_U24_e32: |
483 | case AMDGPU::V_MUL_U32_U24_e64: |
484 | if (OldOpnd->getImm() == 1) |
485 | return true; |
486 | break; |
487 | } |
488 | return false; |
489 | } |
490 | |
491 | MachineInstr *GCNDPPCombine::createDPPInst( |
492 | MachineInstr &OrigMI, MachineInstr &MovMI, RegSubRegPair CombOldVGPR, |
493 | MachineOperand *OldOpndValue, bool CombBCZ, bool IsShrinkable) const { |
494 | assert(CombOldVGPR.Reg); |
  if (!CombBCZ && OldOpndValue && OldOpndValue->isImm()) {
    auto *Src1 = TII->getNamedOperand(OrigMI, AMDGPU::OpName::src1);
    if (!Src1 || !Src1->isReg()) {
      LLVM_DEBUG(dbgs() << " failed: no src1 or it isn't a register\n");
      return nullptr;
    }
    if (!isIdentityValue(OrigMI.getOpcode(), OldOpndValue)) {
      LLVM_DEBUG(dbgs() << " failed: old immediate isn't an identity\n");
      return nullptr;
    }
    CombOldVGPR = getRegSubRegPair(*Src1);
    auto *MovDst = TII->getNamedOperand(MovMI, AMDGPU::OpName::vdst);
    const TargetRegisterClass *RC = MRI->getRegClass(MovDst->getReg());
    if (!isOfRegClass(CombOldVGPR, *RC, *MRI)) {
      LLVM_DEBUG(dbgs() << " failed: src1 has wrong register class\n");
      return nullptr;
    }
  }
  return createDPPInst(OrigMI, MovMI, CombOldVGPR, CombBCZ, IsShrinkable);
514 | } |
515 | |
// Returns true if MI has no OpndName immediate operand, or if that operand's
// value (masked by Mask) equals Value.
518 | bool GCNDPPCombine::hasNoImmOrEqual(MachineInstr &MI, AMDGPU::OpName OpndName, |
519 | int64_t Value, int64_t Mask) const { |
  auto *Imm = TII->getNamedOperand(MI, OpndName);
521 | if (!Imm) |
522 | return true; |
523 | |
524 | assert(Imm->isImm()); |
525 | return (Imm->getImm() & Mask) == Value; |
526 | } |
527 | |
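// Attempts to fold MovMI into every use of its result, including uses
// forwarded through REG_SEQUENCE. Newly built DPP instructions are collected
// in DPPMIs and the replaced originals in OrigMIs; if any use fails to
// combine, the new instructions are erased and the originals kept (rollback),
// otherwise the originals are erased.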
528 | bool GCNDPPCombine::combineDPPMov(MachineInstr &MovMI) const { |
529 | assert(MovMI.getOpcode() == AMDGPU::V_MOV_B32_dpp || |
530 | MovMI.getOpcode() == AMDGPU::V_MOV_B64_dpp || |
531 | MovMI.getOpcode() == AMDGPU::V_MOV_B64_DPP_PSEUDO); |
532 | LLVM_DEBUG(dbgs() << "\nDPP combine: " << MovMI); |
533 | |
  auto *DstOpnd = TII->getNamedOperand(MovMI, AMDGPU::OpName::vdst);
  assert(DstOpnd && DstOpnd->isReg());
  auto DPPMovReg = DstOpnd->getReg();
  if (DPPMovReg.isPhysical()) {
    LLVM_DEBUG(dbgs() << " failed: dpp move writes physreg\n");
    return false;
  }
  if (execMayBeModifiedBeforeAnyUse(*MRI, DPPMovReg, MovMI)) {
    LLVM_DEBUG(dbgs() << " failed: EXEC mask should remain the same"
                         " for all uses\n");
    return false;
  }
546 | |
547 | if (MovMI.getOpcode() == AMDGPU::V_MOV_B64_DPP_PSEUDO || |
548 | MovMI.getOpcode() == AMDGPU::V_MOV_B64_dpp) { |
    auto *DppCtrl = TII->getNamedOperand(MovMI, AMDGPU::OpName::dpp_ctrl);
    assert(DppCtrl && DppCtrl->isImm());
    if (!AMDGPU::isLegalDPALU_DPPControl(DppCtrl->getImm())) {
      LLVM_DEBUG(dbgs() << " failed: 64-bit dpp move uses unsupported"
                           " control value\n");
      // Let the move be split first; the control value may then become legal.
      return false;
    }
557 | } |
558 | |
  auto *RowMaskOpnd = TII->getNamedOperand(MovMI, AMDGPU::OpName::row_mask);
  assert(RowMaskOpnd && RowMaskOpnd->isImm());
  auto *BankMaskOpnd = TII->getNamedOperand(MovMI, AMDGPU::OpName::bank_mask);
562 | assert(BankMaskOpnd && BankMaskOpnd->isImm()); |
563 | const bool MaskAllLanes = RowMaskOpnd->getImm() == 0xF && |
564 | BankMaskOpnd->getImm() == 0xF; |
565 | |
  auto *BCZOpnd = TII->getNamedOperand(MovMI, AMDGPU::OpName::bound_ctrl);
567 | assert(BCZOpnd && BCZOpnd->isImm()); |
568 | bool BoundCtrlZero = BCZOpnd->getImm(); |
569 | |
  auto *OldOpnd = TII->getNamedOperand(MovMI, AMDGPU::OpName::old);
  auto *SrcOpnd = TII->getNamedOperand(MovMI, AMDGPU::OpName::src0);
  assert(OldOpnd && OldOpnd->isReg());
  assert(SrcOpnd && SrcOpnd->isReg());
  if (OldOpnd->getReg().isPhysical() || SrcOpnd->getReg().isPhysical()) {
    LLVM_DEBUG(dbgs() << " failed: dpp move reads physreg\n");
    return false;
  }
578 | |
  auto *const OldOpndValue = getOldOpndValue(*OldOpnd);
  // OldOpndValue is either undef (IMPLICIT_DEF), an immediate, or something
  // else. We could assert(!OldOpndValue || OldOpndValue->isImm()) instead, but
  // the third option is kept to distinguish undef from non-immediate so the
  // IMPLICIT_DEF instruction can be reused later.
584 | assert(!OldOpndValue || OldOpndValue->isImm() || OldOpndValue == OldOpnd); |
585 | |
586 | bool CombBCZ = false; |
587 | |
588 | if (MaskAllLanes && BoundCtrlZero) { // [1] |
589 | CombBCZ = true; |
590 | } else { |
591 | if (!OldOpndValue || !OldOpndValue->isImm()) { |
592 | LLVM_DEBUG(dbgs() << " failed: the DPP mov isn't combinable\n" ); |
593 | return false; |
594 | } |
595 | |
596 | if (OldOpndValue->getImm() == 0) { |
597 | if (MaskAllLanes) { |
598 | assert(!BoundCtrlZero); // by check [1] |
599 | CombBCZ = true; |
600 | } |
    } else if (BoundCtrlZero) {
      assert(!MaskAllLanes); // by check [1]
      LLVM_DEBUG(dbgs() << " failed: old!=0 with bound_ctrl:0 but not all"
                           " lanes masked isn't combinable\n");
      return false;
606 | } |
607 | } |
608 | |
609 | LLVM_DEBUG(dbgs() << " old=" ; |
610 | if (!OldOpndValue) |
611 | dbgs() << "undef" ; |
612 | else |
613 | dbgs() << *OldOpndValue; |
614 | dbgs() << ", bound_ctrl=" << CombBCZ << '\n'); |
615 | |
  SmallVector<MachineInstr*, 4> OrigMIs, DPPMIs;
  DenseMap<MachineInstr*, SmallVector<unsigned, 4>> RegSeqWithOpNos;
  auto CombOldVGPR = getRegSubRegPair(*OldOpnd);
  // Try to reuse the previous old reg if it is undefined (IMPLICIT_DEF).
  if (CombBCZ && OldOpndValue) { // CombOldVGPR should be undef
    const TargetRegisterClass *RC = MRI->getRegClass(DPPMovReg);
    CombOldVGPR = RegSubRegPair(MRI->createVirtualRegister(RC));
    auto UndefInst = BuildMI(*MovMI.getParent(), MovMI, MovMI.getDebugLoc(),
                             TII->get(AMDGPU::IMPLICIT_DEF), CombOldVGPR.Reg);
    DPPMIs.push_back(UndefInst.getInstr());
  }

  OrigMIs.push_back(&MovMI);
  bool Rollback = true;
  SmallVector<MachineOperand *, 16> Uses(
      llvm::make_pointer_range(MRI->use_nodbg_operands(DPPMovReg)));
633 | |
634 | while (!Uses.empty()) { |
635 | MachineOperand *Use = Uses.pop_back_val(); |
636 | Rollback = true; |
637 | |
638 | auto &OrigMI = *Use->getParent(); |
639 | LLVM_DEBUG(dbgs() << " try: " << OrigMI); |
640 | |
641 | auto OrigOp = OrigMI.getOpcode(); |
642 | assert((TII->get(OrigOp).getSize() != 4 || !AMDGPU::isTrue16Inst(OrigOp)) && |
643 | "There should not be e32 True16 instructions pre-RA" ); |
644 | if (OrigOp == AMDGPU::REG_SEQUENCE) { |
      Register FwdReg = OrigMI.getOperand(0).getReg();
646 | unsigned FwdSubReg = 0; |
647 | |
      if (execMayBeModifiedBeforeAnyUse(*MRI, FwdReg, OrigMI)) {
        LLVM_DEBUG(dbgs() << " failed: EXEC mask should remain the same"
                             " for all uses\n");
651 | break; |
652 | } |
653 | |
654 | unsigned OpNo, E = OrigMI.getNumOperands(); |
655 | for (OpNo = 1; OpNo < E; OpNo += 2) { |
        if (OrigMI.getOperand(OpNo).getReg() == DPPMovReg) {
          FwdSubReg = OrigMI.getOperand(OpNo + 1).getImm();
658 | break; |
659 | } |
660 | } |
661 | |
662 | if (!FwdSubReg) |
663 | break; |
664 | |
      for (auto &Op : MRI->use_nodbg_operands(FwdReg)) {
        if (Op.getSubReg() == FwdSubReg)
          Uses.push_back(&Op);
      }
      RegSeqWithOpNos[&OrigMI].push_back(OpNo);
670 | continue; |
671 | } |
672 | |
    bool IsShrinkable = isShrinkable(OrigMI);
    if (!(IsShrinkable ||
          ((TII->isVOP3P(OrigOp) || TII->isVOPC(OrigOp) ||
            TII->isVOP3(OrigOp)) &&
           ST->hasVOP3DPP()) ||
          TII->isVOP1(OrigOp) || TII->isVOP2(OrigOp))) {
      LLVM_DEBUG(dbgs() << " failed: not VOP1/2/3/3P/C\n");
680 | break; |
681 | } |
    if (OrigMI.modifiesRegister(AMDGPU::EXEC, ST->getRegisterInfo())) {
      LLVM_DEBUG(dbgs() << " failed: can't combine v_cmpx\n");
684 | break; |
685 | } |
686 | |
    auto *Src0 = TII->getNamedOperand(OrigMI, AMDGPU::OpName::src0);
    auto *Src1 = TII->getNamedOperand(OrigMI, AMDGPU::OpName::src1);
    if (Use != Src0 && !(Use == Src1 && OrigMI.isCommutable())) { // [1]
      LLVM_DEBUG(dbgs() << " failed: no suitable operands\n");
691 | break; |
692 | } |
693 | |
    auto *Src2 = TII->getNamedOperand(OrigMI, AMDGPU::OpName::src2);
    assert(Src0 && "Src1 without Src0?");
    if ((Use == Src0 && ((Src1 && Src1->isIdenticalTo(*Src0)) ||
                         (Src2 && Src2->isIdenticalTo(*Src0)))) ||
        (Use == Src1 && (Src1->isIdenticalTo(*Src0) ||
                         (Src2 && Src2->isIdenticalTo(*Src1))))) {
      LLVM_DEBUG(
          dbgs()
          << " " << OrigMI
          << " failed: DPP register is used more than once per instruction\n");
704 | break; |
705 | } |
706 | |
707 | LLVM_DEBUG(dbgs() << " combining: " << OrigMI); |
708 | if (Use == Src0) { |
      if (auto *DPPInst = createDPPInst(OrigMI, MovMI, CombOldVGPR,
                                        OldOpndValue, CombBCZ, IsShrinkable)) {
        DPPMIs.push_back(DPPInst);
        Rollback = false;
      }
    } else {
      assert(Use == Src1 && OrigMI.isCommutable()); // by check [1]
      auto *BB = OrigMI.getParent();
      auto *NewMI = BB->getParent()->CloneMachineInstr(&OrigMI);
      BB->insert(OrigMI, NewMI);
      if (TII->commuteInstruction(*NewMI)) {
        LLVM_DEBUG(dbgs() << " commuted: " << *NewMI);
        if (auto *DPPInst =
                createDPPInst(*NewMI, MovMI, CombOldVGPR, OldOpndValue, CombBCZ,
                              IsShrinkable)) {
          DPPMIs.push_back(DPPInst);
          Rollback = false;
        }
      } else
        LLVM_DEBUG(dbgs() << " failed: cannot be commuted\n");
729 | NewMI->eraseFromParent(); |
730 | } |
731 | if (Rollback) |
732 | break; |
    OrigMIs.push_back(&OrigMI);
734 | } |
735 | |
736 | Rollback |= !Uses.empty(); |
737 | |
  for (auto *MI : *(Rollback ? &DPPMIs : &OrigMIs))
739 | MI->eraseFromParent(); |
740 | |
741 | if (!Rollback) { |
742 | for (auto &S : RegSeqWithOpNos) { |
      if (MRI->use_nodbg_empty(S.first->getOperand(0).getReg())) {
744 | S.first->eraseFromParent(); |
745 | continue; |
746 | } |
747 | while (!S.second.empty()) |
        S.first->getOperand(S.second.pop_back_val()).setIsUndef();
749 | } |
750 | } |
751 | |
752 | return !Rollback; |
753 | } |
754 | |
755 | bool GCNDPPCombineLegacy::runOnMachineFunction(MachineFunction &MF) { |
  if (skipFunction(MF.getFunction()))
757 | return false; |
758 | |
759 | return GCNDPPCombine().run(MF); |
760 | } |
761 | |
762 | bool GCNDPPCombine::run(MachineFunction &MF) { |
763 | ST = &MF.getSubtarget<GCNSubtarget>(); |
764 | if (!ST->hasDPP()) |
765 | return false; |
766 | |
767 | MRI = &MF.getRegInfo(); |
768 | TII = ST->getInstrInfo(); |
769 | |
770 | bool Changed = false; |
771 | for (auto &MBB : MF) { |
    for (MachineInstr &MI : llvm::make_early_inc_range(llvm::reverse(MBB))) {
      if (MI.getOpcode() == AMDGPU::V_MOV_B32_dpp && combineDPPMov(MI)) {
774 | Changed = true; |
775 | ++NumDPPMovsCombined; |
776 | } else if (MI.getOpcode() == AMDGPU::V_MOV_B64_DPP_PSEUDO || |
777 | MI.getOpcode() == AMDGPU::V_MOV_B64_dpp) { |
        if (ST->hasDPALU_DPP() && combineDPPMov(MI)) {
779 | Changed = true; |
780 | ++NumDPPMovsCombined; |
781 | } else { |
782 | auto Split = TII->expandMovDPP64(MI); |
783 | for (auto *M : {Split.first, Split.second}) { |
            if (M && combineDPPMov(*M))
785 | ++NumDPPMovsCombined; |
786 | } |
787 | Changed = true; |
788 | } |
789 | } |
790 | } |
791 | } |
792 | return Changed; |
793 | } |
794 | |
795 | PreservedAnalyses GCNDPPCombinePass::run(MachineFunction &MF, |
796 | MachineFunctionAnalysisManager &) { |
797 | MFPropsModifier _(*this, MF); |
798 | |
799 | if (MF.getFunction().hasOptNone()) |
800 | return PreservedAnalyses::all(); |
801 | |
802 | bool Changed = GCNDPPCombine().run(MF); |
803 | if (!Changed) |
804 | return PreservedAnalyses::all(); |
805 | |
806 | auto PA = getMachineFunctionPassPreservedAnalyses(); |
807 | PA.preserveSet<CFGAnalyses>(); |
808 | return PA; |
809 | } |
810 | |