//=======- GCNDPPCombine.cpp - optimization for DPP instructions ---=========//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
// The pass combines V_MOV_B32_dpp instruction with its VALU uses as a DPP src0
// operand. If any of the use instructions cannot be combined with the mov, the
// whole sequence is reverted.
//
// $old = ...
// $dpp_value = V_MOV_B32_dpp $old, $vgpr_to_be_read_from_other_lane,
//                            dpp_controls..., $row_mask, $bank_mask, $bound_ctrl
// $res = VALU $dpp_value [, src1]
//
// to
//
// $res = VALU_DPP $combined_old, $vgpr_to_be_read_from_other_lane, [src1,]
//                 dpp_controls..., $row_mask, $bank_mask, $combined_bound_ctrl
//
// Combining rules:
//
// if $row_mask and $bank_mask are fully enabled (0xF) and
//    $bound_ctrl==DPP_BOUND_ZERO or $old==0
// -> $combined_old = undef,
//    $combined_bound_ctrl = DPP_BOUND_ZERO
//
// if the VALU op is binary and
//    $bound_ctrl==DPP_BOUND_OFF and
//    $old==identity value (immediate) for the VALU op
// -> $combined_old = src1,
//    $combined_bound_ctrl = DPP_BOUND_OFF
//
// Otherwise cancel.
//
// The mov_dpp instruction should reside in the same BB as all its uses.
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "GCNSubtarget.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/CodeGen/MachineFunctionPass.h"

using namespace llvm;

#define DEBUG_TYPE "gcn-dpp-combine"

STATISTIC(NumDPPMovsCombined, "Number of DPP moves combined.");

namespace {

class GCNDPPCombine : public MachineFunctionPass {
  MachineRegisterInfo *MRI;
  const SIInstrInfo *TII;
  const GCNSubtarget *ST;

  using RegSubRegPair = TargetInstrInfo::RegSubRegPair;

  MachineOperand *getOldOpndValue(MachineOperand &OldOpnd) const;

  MachineInstr *createDPPInst(MachineInstr &OrigMI, MachineInstr &MovMI,
                              RegSubRegPair CombOldVGPR,
                              MachineOperand *OldOpnd, bool CombBCZ,
                              bool IsShrinkable) const;

  MachineInstr *createDPPInst(MachineInstr &OrigMI, MachineInstr &MovMI,
                              RegSubRegPair CombOldVGPR, bool CombBCZ,
                              bool IsShrinkable) const;

  bool hasNoImmOrEqual(MachineInstr &MI, unsigned OpndName, int64_t Value,
                       int64_t Mask = -1) const;

  bool combineDPPMov(MachineInstr &MI) const;

public:
  static char ID;

  GCNDPPCombine() : MachineFunctionPass(ID) {
    initializeGCNDPPCombinePass(*PassRegistry::getPassRegistry());
  }

  bool runOnMachineFunction(MachineFunction &MF) override;

  StringRef getPassName() const override { return "GCN DPP Combine"; }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.setPreservesCFG();
    MachineFunctionPass::getAnalysisUsage(AU);
  }

  MachineFunctionProperties getRequiredProperties() const override {
    return MachineFunctionProperties().set(
        MachineFunctionProperties::Property::IsSSA);
  }

private:
  int getDPPOp(unsigned Op, bool IsShrinkable) const;
  bool isShrinkable(MachineInstr &MI) const;
};

} // end anonymous namespace

INITIALIZE_PASS(GCNDPPCombine, DEBUG_TYPE, "GCN DPP Combine", false, false)

char GCNDPPCombine::ID = 0;

char &llvm::GCNDPPCombineID = GCNDPPCombine::ID;

FunctionPass *llvm::createGCNDPPCombinePass() {
  return new GCNDPPCombine();
}

bool GCNDPPCombine::isShrinkable(MachineInstr &MI) const {
  unsigned Op = MI.getOpcode();
  if (!TII->isVOP3(Op)) {
    return false;
  }
  if (!TII->hasVALU32BitEncoding(Op)) {
    LLVM_DEBUG(dbgs() << "  Inst has no e32 equivalent\n");
    return false;
  }
  // Do not shrink True16 instructions pre-RA; the shrunken encoding can only
  // address 128 VGPRs, which would needlessly restrict register allocation.
  if (AMDGPU::isTrue16Inst(Op))
    return false;
  if (const auto *SDst = TII->getNamedOperand(MI, AMDGPU::OpName::sdst)) {
    // Give up if there are any uses of the sdst in carry-out or VOPC.
    // The shrunken form of the instruction would write it to vcc instead of
    // to a virtual register. If we rewrote the uses the shrinking would be
    // possible.
    if (!MRI->use_nodbg_empty(SDst->getReg()))
      return false;
  }
  // Check whether modifiers other than abs|neg are set (opsel, for example).
  const int64_t Mask = ~(SISrcMods::ABS | SISrcMods::NEG);
  if (!hasNoImmOrEqual(MI, AMDGPU::OpName::src0_modifiers, 0, Mask) ||
      !hasNoImmOrEqual(MI, AMDGPU::OpName::src1_modifiers, 0, Mask) ||
      !hasNoImmOrEqual(MI, AMDGPU::OpName::clamp, 0) ||
      !hasNoImmOrEqual(MI, AMDGPU::OpName::omod, 0) ||
      !hasNoImmOrEqual(MI, AMDGPU::OpName::byte_sel, 0)) {
    LLVM_DEBUG(dbgs() << "  Inst has non-default modifiers\n");
    return false;
  }
  return true;
}

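// Returns the DPP variant of Op: prefer the 32-bit DPP encoding (shrinking a
// VOP3 to its e32 form first when IsShrinkable), fall back to VOP3 DPP on
// subtargets that support it, or return -1 if no usable DPP opcode exists.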
int GCNDPPCombine::getDPPOp(unsigned Op, bool IsShrinkable) const {
  int DPP32 = AMDGPU::getDPPOp32(Op);
  if (IsShrinkable) {
    assert(DPP32 == -1);
    int E32 = AMDGPU::getVOPe32(Op);
    DPP32 = (E32 == -1) ? -1 : AMDGPU::getDPPOp32(E32);
  }
  if (DPP32 != -1 && TII->pseudoToMCOpcode(DPP32) != -1)
    return DPP32;
  int DPP64 = -1;
  if (ST->hasVOP3DPP())
    DPP64 = AMDGPU::getDPPOp64(Op);
  if (DPP64 != -1 && TII->pseudoToMCOpcode(DPP64) != -1)
    return DPP64;
  return -1;
}

// Tracks the register operand's definition and returns:
// 1. the immediate operand used to initialize the register, if found
// 2. nullptr if the register operand is undef
// 3. the operand itself otherwise
MachineOperand *GCNDPPCombine::getOldOpndValue(MachineOperand &OldOpnd) const {
  auto *Def = getVRegSubRegDef(getRegSubRegPair(OldOpnd), *MRI);
  if (!Def)
    return nullptr;

  switch (Def->getOpcode()) {
  default: break;
  case AMDGPU::IMPLICIT_DEF:
    return nullptr;
  case AMDGPU::COPY:
  case AMDGPU::V_MOV_B32_e32:
  case AMDGPU::V_MOV_B64_PSEUDO:
  case AMDGPU::V_MOV_B64_e32:
  case AMDGPU::V_MOV_B64_e64: {
    auto &Op1 = Def->getOperand(1);
    if (Op1.isImm())
      return &Op1;
    break;
  }
  }
  return &OldOpnd;
}

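// Returns the size in bits of the register class expected at operand Idx of
// MI, or 0 if the operand has no register class constraint.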
[[maybe_unused]] static unsigned getOperandSize(MachineInstr &MI, unsigned Idx,
                                                MachineRegisterInfo &MRI) {
  int16_t RegClass = MI.getDesc().operands()[Idx].RegClass;
  if (RegClass == -1)
    return 0;

  const TargetRegisterInfo *TRI = MRI.getTargetRegisterInfo();
  return TRI->getRegSizeInBits(*TRI->getRegClass(RegClass));
}

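// Builds the DPP variant of OrigMI, transplanting OrigMI's operands and the
// dpp_ctrl/row_mask/bank_mask operands of MovMI into the new instruction.
// Returns the new instruction, or nullptr if any operand turns out to be
// illegal for the DPP encoding.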
MachineInstr *GCNDPPCombine::createDPPInst(MachineInstr &OrigMI,
                                           MachineInstr &MovMI,
                                           RegSubRegPair CombOldVGPR,
                                           bool CombBCZ,
                                           bool IsShrinkable) const {
  assert(MovMI.getOpcode() == AMDGPU::V_MOV_B32_dpp ||
         MovMI.getOpcode() == AMDGPU::V_MOV_B64_dpp ||
         MovMI.getOpcode() == AMDGPU::V_MOV_B64_DPP_PSEUDO);

  bool HasVOP3DPP = ST->hasVOP3DPP();
  auto OrigOp = OrigMI.getOpcode();
  auto DPPOp = getDPPOp(OrigOp, IsShrinkable);
  if (DPPOp == -1) {
    LLVM_DEBUG(dbgs() << "  failed: no DPP opcode\n");
    return nullptr;
  }
  int OrigOpE32 = AMDGPU::getVOPe32(OrigOp);
  // Prior checks in combineDPPMov happen to cover the mask-with-VOPC
  // condition, though not by design; assert it here.
  auto *RowMaskOpnd = TII->getNamedOperand(MovMI, AMDGPU::OpName::row_mask);
  assert(RowMaskOpnd && RowMaskOpnd->isImm());
  auto *BankMaskOpnd = TII->getNamedOperand(MovMI, AMDGPU::OpName::bank_mask);
  assert(BankMaskOpnd && BankMaskOpnd->isImm());
  const bool MaskAllLanes =
      RowMaskOpnd->getImm() == 0xF && BankMaskOpnd->getImm() == 0xF;
  (void)MaskAllLanes;
  assert((MaskAllLanes ||
          !(TII->isVOPC(DPPOp) || (TII->isVOP3(DPPOp) && OrigOpE32 != -1 &&
                                   TII->isVOPC(OrigOpE32)))) &&
         "VOPC cannot form DPP unless mask is full");

  auto DPPInst = BuildMI(*OrigMI.getParent(), OrigMI, OrigMI.getDebugLoc(),
                         TII->get(DPPOp))
                     .setMIFlags(OrigMI.getFlags());

  bool Fail = false;
  do {
    int NumOperands = 0;
    if (auto *Dst = TII->getNamedOperand(OrigMI, AMDGPU::OpName::vdst)) {
      DPPInst.add(*Dst);
      ++NumOperands;
    }
    if (auto *SDst = TII->getNamedOperand(OrigMI, AMDGPU::OpName::sdst)) {
      if (TII->isOperandLegal(*DPPInst.getInstr(), NumOperands, SDst)) {
        DPPInst.add(*SDst);
        ++NumOperands;
      }
      // If we shrunk a 64-bit VOP3B to 32 bits, just ignore the sdst.
    }

    const int OldIdx = AMDGPU::getNamedOperandIdx(DPPOp, AMDGPU::OpName::old);
    if (OldIdx != -1) {
      assert(OldIdx == NumOperands);
      assert(isOfRegClass(
          CombOldVGPR,
          *MRI->getRegClass(
              TII->getNamedOperand(MovMI, AMDGPU::OpName::vdst)->getReg()),
          *MRI));
      auto *Def = getVRegSubRegDef(CombOldVGPR, *MRI);
      DPPInst.addReg(CombOldVGPR.Reg, Def ? 0 : RegState::Undef,
                     CombOldVGPR.SubReg);
      ++NumOperands;
    } else if (TII->isVOPC(DPPOp) || (TII->isVOP3(DPPOp) && OrigOpE32 != -1 &&
                                      TII->isVOPC(OrigOpE32))) {
      // VOPC DPP and VOPC promoted to VOP3 DPP do not have an old operand
      // because they write to SGPRs, not VGPRs.
    } else {
      // TODO: this discards MAC/FMA instructions for now; add them later.
      LLVM_DEBUG(dbgs() << "  failed: no old operand in DPP instruction,"
                           " TBD\n");
      Fail = true;
      break;
    }

    auto *Mod0 = TII->getNamedOperand(OrigMI, AMDGPU::OpName::src0_modifiers);
    if (Mod0) {
      assert(NumOperands ==
             AMDGPU::getNamedOperandIdx(DPPOp, AMDGPU::OpName::src0_modifiers));
      assert(HasVOP3DPP ||
             (0LL == (Mod0->getImm() & ~(SISrcMods::ABS | SISrcMods::NEG))));
      DPPInst.addImm(Mod0->getImm());
      ++NumOperands;
    } else if (AMDGPU::hasNamedOperand(DPPOp, AMDGPU::OpName::src0_modifiers)) {
      DPPInst.addImm(0);
      ++NumOperands;
    }
    auto *Src0 = TII->getNamedOperand(MovMI, AMDGPU::OpName::src0);
    assert(Src0);
    int Src0Idx = NumOperands;
    if (!TII->isOperandLegal(*DPPInst.getInstr(), NumOperands, Src0)) {
      LLVM_DEBUG(dbgs() << "  failed: src0 is illegal\n");
      Fail = true;
      break;
    }
    DPPInst.add(*Src0);
    DPPInst->getOperand(NumOperands).setIsKill(false);
    ++NumOperands;

    auto *Mod1 = TII->getNamedOperand(OrigMI, AMDGPU::OpName::src1_modifiers);
    if (Mod1) {
      assert(NumOperands ==
             AMDGPU::getNamedOperandIdx(DPPOp, AMDGPU::OpName::src1_modifiers));
      assert(HasVOP3DPP ||
             (0LL == (Mod1->getImm() & ~(SISrcMods::ABS | SISrcMods::NEG))));
      DPPInst.addImm(Mod1->getImm());
      ++NumOperands;
    } else if (AMDGPU::hasNamedOperand(DPPOp, AMDGPU::OpName::src1_modifiers)) {
      DPPInst.addImm(0);
      ++NumOperands;
    }
    auto *Src1 = TII->getNamedOperand(OrigMI, AMDGPU::OpName::src1);
    if (Src1) {
      int OpNum = NumOperands;
      // If the subtarget does not support SGPRs for the src1 operand then the
      // requirements are the same as for src0. We check src0 instead because
      // pseudos are shared between subtargets and allow SGPR for src1 on all.
      if (!ST->hasDPPSrc1SGPR()) {
        assert(getOperandSize(*DPPInst, Src0Idx, *MRI) ==
                   getOperandSize(*DPPInst, NumOperands, *MRI) &&
               "Src0 and Src1 operands should have the same size");
        OpNum = Src0Idx;
      }
      if (!TII->isOperandLegal(*DPPInst.getInstr(), OpNum, Src1)) {
        LLVM_DEBUG(dbgs() << "  failed: src1 is illegal\n");
        Fail = true;
        break;
      }
      DPPInst.add(*Src1);
      ++NumOperands;
    }

    auto *Mod2 = TII->getNamedOperand(OrigMI, AMDGPU::OpName::src2_modifiers);
    if (Mod2) {
      assert(NumOperands ==
             AMDGPU::getNamedOperandIdx(DPPOp, AMDGPU::OpName::src2_modifiers));
      assert(HasVOP3DPP ||
             (0LL == (Mod2->getImm() & ~(SISrcMods::ABS | SISrcMods::NEG))));
      DPPInst.addImm(Mod2->getImm());
      ++NumOperands;
    }
    auto *Src2 = TII->getNamedOperand(OrigMI, AMDGPU::OpName::src2);
    if (Src2) {
      if (!TII->getNamedOperand(*DPPInst.getInstr(), AMDGPU::OpName::src2) ||
          !TII->isOperandLegal(*DPPInst.getInstr(), NumOperands, Src2)) {
        LLVM_DEBUG(dbgs() << "  failed: src2 is illegal\n");
        Fail = true;
        break;
      }
      DPPInst.add(*Src2);
      ++NumOperands;
    }

    if (HasVOP3DPP) {
      auto *ClampOpr = TII->getNamedOperand(OrigMI, AMDGPU::OpName::clamp);
      if (ClampOpr && AMDGPU::hasNamedOperand(DPPOp, AMDGPU::OpName::clamp)) {
        DPPInst.addImm(ClampOpr->getImm());
      }
      auto *VdstInOpr = TII->getNamedOperand(OrigMI, AMDGPU::OpName::vdst_in);
      if (VdstInOpr &&
          AMDGPU::hasNamedOperand(DPPOp, AMDGPU::OpName::vdst_in)) {
        DPPInst.add(*VdstInOpr);
      }
      auto *OmodOpr = TII->getNamedOperand(OrigMI, AMDGPU::OpName::omod);
      if (OmodOpr && AMDGPU::hasNamedOperand(DPPOp, AMDGPU::OpName::omod)) {
        DPPInst.addImm(OmodOpr->getImm());
      }
      // Validate that OP_SEL is all zeros and OP_SEL_HI is all ones.
      if (TII->getNamedOperand(OrigMI, AMDGPU::OpName::op_sel)) {
        int64_t OpSel = 0;
        OpSel |= (Mod0 ? (!!(Mod0->getImm() & SISrcMods::OP_SEL_0) << 0) : 0);
        OpSel |= (Mod1 ? (!!(Mod1->getImm() & SISrcMods::OP_SEL_0) << 1) : 0);
        OpSel |= (Mod2 ? (!!(Mod2->getImm() & SISrcMods::OP_SEL_0) << 2) : 0);
        if (Mod0 && TII->isVOP3(OrigMI) && !TII->isVOP3P(OrigMI))
          OpSel |= !!(Mod0->getImm() & SISrcMods::DST_OP_SEL) << 3;

        if (OpSel != 0) {
          LLVM_DEBUG(dbgs() << "  failed: op_sel must be zero\n");
          Fail = true;
          break;
        }
        if (AMDGPU::hasNamedOperand(DPPOp, AMDGPU::OpName::op_sel))
          DPPInst.addImm(OpSel);
      }
      if (TII->getNamedOperand(OrigMI, AMDGPU::OpName::op_sel_hi)) {
        int64_t OpSelHi = 0;
        OpSelHi |= (Mod0 ? (!!(Mod0->getImm() & SISrcMods::OP_SEL_1) << 0) : 0);
        OpSelHi |= (Mod1 ? (!!(Mod1->getImm() & SISrcMods::OP_SEL_1) << 1) : 0);
        OpSelHi |= (Mod2 ? (!!(Mod2->getImm() & SISrcMods::OP_SEL_1) << 2) : 0);

        // Only VOP3P has op_sel_hi, and all VOP3P instructions have three
        // operands, so check that all three op_sel_hi bits are set.
        assert(Src2 && "Expected vop3p with 3 operands");
        if (OpSelHi != 7) {
          LLVM_DEBUG(dbgs() << "  failed: op_sel_hi must be all set to one\n");
          Fail = true;
          break;
        }
        if (AMDGPU::hasNamedOperand(DPPOp, AMDGPU::OpName::op_sel_hi))
          DPPInst.addImm(OpSelHi);
      }
      auto *NegOpr = TII->getNamedOperand(OrigMI, AMDGPU::OpName::neg_lo);
      if (NegOpr && AMDGPU::hasNamedOperand(DPPOp, AMDGPU::OpName::neg_lo)) {
        DPPInst.addImm(NegOpr->getImm());
      }
      auto *NegHiOpr = TII->getNamedOperand(OrigMI, AMDGPU::OpName::neg_hi);
      if (NegHiOpr && AMDGPU::hasNamedOperand(DPPOp, AMDGPU::OpName::neg_hi)) {
        DPPInst.addImm(NegHiOpr->getImm());
      }
      auto *ByteSelOpr = TII->getNamedOperand(OrigMI, AMDGPU::OpName::byte_sel);
      if (ByteSelOpr &&
          AMDGPU::hasNamedOperand(DPPOp, AMDGPU::OpName::byte_sel)) {
        DPPInst.addImm(ByteSelOpr->getImm());
      }
    }
    DPPInst.add(*TII->getNamedOperand(MovMI, AMDGPU::OpName::dpp_ctrl));
    DPPInst.add(*TII->getNamedOperand(MovMI, AMDGPU::OpName::row_mask));
    DPPInst.add(*TII->getNamedOperand(MovMI, AMDGPU::OpName::bank_mask));
    DPPInst.addImm(CombBCZ ? 1 : 0);
  } while (false);

  if (Fail) {
    DPPInst.getInstr()->eraseFromParent();
    return nullptr;
  }
  LLVM_DEBUG(dbgs() << "  combined: " << *DPPInst.getInstr());
  return DPPInst.getInstr();
}

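// Returns true if the immediate in OldOpnd is the identity value for the
// binary operation OrigMIOp, e.g. 0 for add/or/xor/umax, ~0 for and/umin,
// INT_MAX for smin, INT_MIN for smax, and 1 for the 24-bit multiplies.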
static bool isIdentityValue(unsigned OrigMIOp, MachineOperand *OldOpnd) {
  assert(OldOpnd->isImm());
  switch (OrigMIOp) {
  default: break;
  case AMDGPU::V_ADD_U32_e32:
  case AMDGPU::V_ADD_U32_e64:
  case AMDGPU::V_ADD_CO_U32_e32:
  case AMDGPU::V_ADD_CO_U32_e64:
  case AMDGPU::V_OR_B32_e32:
  case AMDGPU::V_OR_B32_e64:
  case AMDGPU::V_SUBREV_U32_e32:
  case AMDGPU::V_SUBREV_U32_e64:
  case AMDGPU::V_SUBREV_CO_U32_e32:
  case AMDGPU::V_SUBREV_CO_U32_e64:
  case AMDGPU::V_MAX_U32_e32:
  case AMDGPU::V_MAX_U32_e64:
  case AMDGPU::V_XOR_B32_e32:
  case AMDGPU::V_XOR_B32_e64:
    if (OldOpnd->getImm() == 0)
      return true;
    break;
  case AMDGPU::V_AND_B32_e32:
  case AMDGPU::V_AND_B32_e64:
  case AMDGPU::V_MIN_U32_e32:
  case AMDGPU::V_MIN_U32_e64:
    if (static_cast<uint32_t>(OldOpnd->getImm()) ==
        std::numeric_limits<uint32_t>::max())
      return true;
    break;
  case AMDGPU::V_MIN_I32_e32:
  case AMDGPU::V_MIN_I32_e64:
    if (static_cast<int32_t>(OldOpnd->getImm()) ==
        std::numeric_limits<int32_t>::max())
      return true;
    break;
  case AMDGPU::V_MAX_I32_e32:
  case AMDGPU::V_MAX_I32_e64:
    if (static_cast<int32_t>(OldOpnd->getImm()) ==
        std::numeric_limits<int32_t>::min())
      return true;
    break;
  case AMDGPU::V_MUL_I32_I24_e32:
  case AMDGPU::V_MUL_I32_I24_e64:
  case AMDGPU::V_MUL_U32_U24_e32:
  case AMDGPU::V_MUL_U32_U24_e64:
    if (OldOpnd->getImm() == 1)
      return true;
    break;
  }
  return false;
}

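// Wrapper around createDPPInst that additionally handles a non-zero old
// immediate: when bound_ctrl is off and the old value is the identity for
// OrigMI's operation, src1 can be substituted as the combined old operand.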
MachineInstr *GCNDPPCombine::createDPPInst(
    MachineInstr &OrigMI, MachineInstr &MovMI, RegSubRegPair CombOldVGPR,
    MachineOperand *OldOpndValue, bool CombBCZ, bool IsShrinkable) const {
  assert(CombOldVGPR.Reg);
  if (!CombBCZ && OldOpndValue && OldOpndValue->isImm()) {
    auto *Src1 = TII->getNamedOperand(OrigMI, AMDGPU::OpName::src1);
    if (!Src1 || !Src1->isReg()) {
      LLVM_DEBUG(dbgs() << "  failed: no src1 or it isn't a register\n");
      return nullptr;
    }
    if (!isIdentityValue(OrigMI.getOpcode(), OldOpndValue)) {
      LLVM_DEBUG(dbgs() << "  failed: old immediate isn't an identity\n");
      return nullptr;
    }
    CombOldVGPR = getRegSubRegPair(*Src1);
    auto MovDst = TII->getNamedOperand(MovMI, AMDGPU::OpName::vdst);
    const TargetRegisterClass *RC = MRI->getRegClass(MovDst->getReg());
    if (!isOfRegClass(CombOldVGPR, *RC, *MRI)) {
      LLVM_DEBUG(dbgs() << "  failed: src1 has wrong register class\n");
      return nullptr;
    }
  }
  return createDPPInst(OrigMI, MovMI, CombOldVGPR, CombBCZ, IsShrinkable);
}

// Returns true if MI doesn't have the OpndName immediate operand, or if that
// operand (masked by Mask) equals Value.
bool GCNDPPCombine::hasNoImmOrEqual(MachineInstr &MI, unsigned OpndName,
                                    int64_t Value, int64_t Mask) const {
  auto *Imm = TII->getNamedOperand(MI, OpndName);
  if (!Imm)
    return true;

  assert(Imm->isImm());
  return (Imm->getImm() & Mask) == Value;
}

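// Tries to fold MovMI into every VALU use of its result as a DPP src0
// operand, choosing the combined old value and bound_ctrl per the rules in
// the file header. On any failure the whole attempt is rolled back and the
// original instructions are kept; returns true if the combine succeeded.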
bool GCNDPPCombine::combineDPPMov(MachineInstr &MovMI) const {
  assert(MovMI.getOpcode() == AMDGPU::V_MOV_B32_dpp ||
         MovMI.getOpcode() == AMDGPU::V_MOV_B64_dpp ||
         MovMI.getOpcode() == AMDGPU::V_MOV_B64_DPP_PSEUDO);
  LLVM_DEBUG(dbgs() << "\nDPP combine: " << MovMI);

  auto *DstOpnd = TII->getNamedOperand(MovMI, AMDGPU::OpName::vdst);
  assert(DstOpnd && DstOpnd->isReg());
  auto DPPMovReg = DstOpnd->getReg();
  if (DPPMovReg.isPhysical()) {
    LLVM_DEBUG(dbgs() << "  failed: dpp move writes physreg\n");
    return false;
  }
  if (execMayBeModifiedBeforeAnyUse(*MRI, DPPMovReg, MovMI)) {
    LLVM_DEBUG(dbgs() << "  failed: EXEC mask should remain the same"
                         " for all uses\n");
    return false;
  }

  if (MovMI.getOpcode() == AMDGPU::V_MOV_B64_DPP_PSEUDO ||
      MovMI.getOpcode() == AMDGPU::V_MOV_B64_dpp) {
    auto *DppCtrl = TII->getNamedOperand(MovMI, AMDGPU::OpName::dpp_ctrl);
    assert(DppCtrl && DppCtrl->isImm());
    if (!AMDGPU::isLegalDPALU_DPPControl(DppCtrl->getImm())) {
      LLVM_DEBUG(dbgs() << "  failed: 64 bit dpp move uses unsupported"
                           " control value\n");
      // Let it split; the control may become legal after splitting.
      return false;
    }
  }

  auto *RowMaskOpnd = TII->getNamedOperand(MovMI, AMDGPU::OpName::row_mask);
  assert(RowMaskOpnd && RowMaskOpnd->isImm());
  auto *BankMaskOpnd = TII->getNamedOperand(MovMI, AMDGPU::OpName::bank_mask);
  assert(BankMaskOpnd && BankMaskOpnd->isImm());
  const bool MaskAllLanes =
      RowMaskOpnd->getImm() == 0xF && BankMaskOpnd->getImm() == 0xF;

  auto *BCZOpnd = TII->getNamedOperand(MovMI, AMDGPU::OpName::bound_ctrl);
  assert(BCZOpnd && BCZOpnd->isImm());
  bool BoundCtrlZero = BCZOpnd->getImm();

  auto *OldOpnd = TII->getNamedOperand(MovMI, AMDGPU::OpName::old);
  auto *SrcOpnd = TII->getNamedOperand(MovMI, AMDGPU::OpName::src0);
  assert(OldOpnd && OldOpnd->isReg());
  assert(SrcOpnd && SrcOpnd->isReg());
  if (OldOpnd->getReg().isPhysical() || SrcOpnd->getReg().isPhysical()) {
    LLVM_DEBUG(dbgs() << "  failed: dpp move reads physreg\n");
    return false;
  }

  auto *const OldOpndValue = getOldOpndValue(*OldOpnd);
  // OldOpndValue is either undef (IMPLICIT_DEF), an immediate, or something
  // else. We could use: assert(!OldOpndValue || OldOpndValue->isImm())
  // but the third option is used to distinguish undef from non-immediate
  // so that the IMPLICIT_DEF instruction can be reused later.
  assert(!OldOpndValue || OldOpndValue->isImm() || OldOpndValue == OldOpnd);

  bool CombBCZ = false;

  if (MaskAllLanes && BoundCtrlZero) { // [1]
    CombBCZ = true;
  } else {
    if (!OldOpndValue || !OldOpndValue->isImm()) {
      LLVM_DEBUG(dbgs() << "  failed: the DPP mov isn't combinable\n");
      return false;
    }

    if (OldOpndValue->getImm() == 0) {
      if (MaskAllLanes) {
        assert(!BoundCtrlZero); // by check [1]
        CombBCZ = true;
      }
    } else if (BoundCtrlZero) {
      assert(!MaskAllLanes); // by check [1]
      LLVM_DEBUG(dbgs() << "  failed: old!=0 with bound_ctrl:0 and not all"
                           " lanes enabled isn't combinable\n");
      return false;
    }
  }

  LLVM_DEBUG(dbgs() << "  old=";
             if (!OldOpndValue)
               dbgs() << "undef";
             else
               dbgs() << *OldOpndValue;
             dbgs() << ", bound_ctrl=" << CombBCZ << '\n');

  SmallVector<MachineInstr *, 4> OrigMIs, DPPMIs;
  DenseMap<MachineInstr *, SmallVector<unsigned, 4>> RegSeqWithOpNos;
  auto CombOldVGPR = getRegSubRegPair(*OldOpnd);
  // Try to reuse the previous old reg if it's undefined (IMPLICIT_DEF).
  if (CombBCZ && OldOpndValue) { // CombOldVGPR should be undef
    const TargetRegisterClass *RC = MRI->getRegClass(DPPMovReg);
    CombOldVGPR = RegSubRegPair(MRI->createVirtualRegister(RC));
    auto UndefInst = BuildMI(*MovMI.getParent(), MovMI, MovMI.getDebugLoc(),
                             TII->get(AMDGPU::IMPLICIT_DEF), CombOldVGPR.Reg);
    DPPMIs.push_back(UndefInst.getInstr());
  }

  OrigMIs.push_back(&MovMI);
  bool Rollback = true;
  SmallVector<MachineOperand *, 16> Uses;

  for (auto &Use : MRI->use_nodbg_operands(DPPMovReg)) {
    Uses.push_back(&Use);
  }

  while (!Uses.empty()) {
    MachineOperand *Use = Uses.pop_back_val();
    Rollback = true;

    auto &OrigMI = *Use->getParent();
    LLVM_DEBUG(dbgs() << "  try: " << OrigMI);

    auto OrigOp = OrigMI.getOpcode();
    assert((TII->get(OrigOp).getSize() != 4 || !AMDGPU::isTrue16Inst(OrigOp)) &&
           "There should not be e32 True16 instructions pre-RA");
    if (OrigOp == AMDGPU::REG_SEQUENCE) {
      Register FwdReg = OrigMI.getOperand(0).getReg();
      unsigned FwdSubReg = 0;

      if (execMayBeModifiedBeforeAnyUse(*MRI, FwdReg, OrigMI)) {
        LLVM_DEBUG(dbgs() << "  failed: EXEC mask should remain the same"
                             " for all uses\n");
        break;
      }

      unsigned OpNo, E = OrigMI.getNumOperands();
      for (OpNo = 1; OpNo < E; OpNo += 2) {
        if (OrigMI.getOperand(OpNo).getReg() == DPPMovReg) {
          FwdSubReg = OrigMI.getOperand(OpNo + 1).getImm();
          break;
        }
      }

      if (!FwdSubReg)
        break;

      for (auto &Op : MRI->use_nodbg_operands(FwdReg)) {
        if (Op.getSubReg() == FwdSubReg)
          Uses.push_back(&Op);
      }
      RegSeqWithOpNos[&OrigMI].push_back(OpNo);
      continue;
    }

    bool IsShrinkable = isShrinkable(OrigMI);
    if (!(IsShrinkable ||
          ((TII->isVOP3P(OrigOp) || TII->isVOPC(OrigOp) ||
            TII->isVOP3(OrigOp)) &&
           ST->hasVOP3DPP()) ||
          TII->isVOP1(OrigOp) || TII->isVOP2(OrigOp))) {
      LLVM_DEBUG(dbgs() << "  failed: not VOP1/2/3/3P/C\n");
      break;
    }
    if (OrigMI.modifiesRegister(AMDGPU::EXEC, ST->getRegisterInfo())) {
      LLVM_DEBUG(dbgs() << "  failed: can't combine v_cmpx\n");
      break;
    }

    auto *Src0 = TII->getNamedOperand(OrigMI, AMDGPU::OpName::src0);
    auto *Src1 = TII->getNamedOperand(OrigMI, AMDGPU::OpName::src1);
    if (Use != Src0 && !(Use == Src1 && OrigMI.isCommutable())) { // [1]
      LLVM_DEBUG(dbgs() << "  failed: no suitable operands\n");
      break;
    }

    auto *Src2 = TII->getNamedOperand(OrigMI, AMDGPU::OpName::src2);
    assert(Src0 && "Src1 without Src0?");
    if ((Use == Src0 && ((Src1 && Src1->isIdenticalTo(*Src0)) ||
                         (Src2 && Src2->isIdenticalTo(*Src0)))) ||
        (Use == Src1 && (Src1->isIdenticalTo(*Src0) ||
                         (Src2 && Src2->isIdenticalTo(*Src1))))) {
      LLVM_DEBUG(
          dbgs()
          << "  " << OrigMI
          << "  failed: DPP register is used more than once per instruction\n");
      break;
    }

    LLVM_DEBUG(dbgs() << "  combining: " << OrigMI);
    if (Use == Src0) {
      if (auto *DPPInst = createDPPInst(OrigMI, MovMI, CombOldVGPR,
                                        OldOpndValue, CombBCZ, IsShrinkable)) {
        DPPMIs.push_back(DPPInst);
        Rollback = false;
      }
    } else {
      assert(Use == Src1 && OrigMI.isCommutable()); // by check [1]
      auto *BB = OrigMI.getParent();
      auto *NewMI = BB->getParent()->CloneMachineInstr(&OrigMI);
      BB->insert(OrigMI, NewMI);
      if (TII->commuteInstruction(*NewMI)) {
        LLVM_DEBUG(dbgs() << "  commuted: " << *NewMI);
        if (auto *DPPInst =
                createDPPInst(*NewMI, MovMI, CombOldVGPR, OldOpndValue, CombBCZ,
                              IsShrinkable)) {
          DPPMIs.push_back(DPPInst);
          Rollback = false;
        }
      } else
        LLVM_DEBUG(dbgs() << "  failed: cannot be commuted\n");
      NewMI->eraseFromParent();
    }
    if (Rollback)
      break;
    OrigMIs.push_back(&OrigMI);
  }

  Rollback |= !Uses.empty();

  for (auto *MI : *(Rollback ? &DPPMIs : &OrigMIs))
    MI->eraseFromParent();

  if (!Rollback) {
    for (auto &S : RegSeqWithOpNos) {
      if (MRI->use_nodbg_empty(S.first->getOperand(0).getReg())) {
        S.first->eraseFromParent();
        continue;
      }
      while (!S.second.empty())
        S.first->getOperand(S.second.pop_back_val()).setIsUndef();
    }
  }

  return !Rollback;
}

bool GCNDPPCombine::runOnMachineFunction(MachineFunction &MF) {
  ST = &MF.getSubtarget<GCNSubtarget>();
  if (!ST->hasDPP() || skipFunction(MF.getFunction()))
    return false;

  MRI = &MF.getRegInfo();
  TII = ST->getInstrInfo();

  bool Changed = false;
  for (auto &MBB : MF) {
    for (MachineInstr &MI : llvm::make_early_inc_range(llvm::reverse(MBB))) {
      if (MI.getOpcode() == AMDGPU::V_MOV_B32_dpp && combineDPPMov(MI)) {
        Changed = true;
        ++NumDPPMovsCombined;
      } else if (MI.getOpcode() == AMDGPU::V_MOV_B64_DPP_PSEUDO ||
                 MI.getOpcode() == AMDGPU::V_MOV_B64_dpp) {
        if (ST->hasDPALU_DPP() && combineDPPMov(MI)) {
          Changed = true;
          ++NumDPPMovsCombined;
        } else {
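          // Split the 64-bit mov into two 32-bit halves and try to combine
          // each half independently.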
          auto Split = TII->expandMovDPP64(MI);
          for (auto *M : {Split.first, Split.second}) {
            if (M && combineDPPMov(*M))
              ++NumDPPMovsCombined;
          }
          Changed = true;
        }
      }
    }
  }
  return Changed;
}