1 | //===- SIFixSGPRCopies.cpp - Remove potential VGPR => SGPR copies ---------===// |
2 | // |
3 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
4 | // See https://llvm.org/LICENSE.txt for license information. |
5 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
6 | // |
7 | //===----------------------------------------------------------------------===// |
8 | // |
9 | /// \file |
10 | /// Copies from VGPR to SGPR registers are illegal and the register coalescer |
11 | /// will sometimes generate these illegal copies in situations like this: |
12 | /// |
13 | /// Register Class <vsrc> is the union of <vgpr> and <sgpr> |
14 | /// |
15 | /// BB0: |
16 | /// %0 <sgpr> = SCALAR_INST |
17 | /// %1 <vsrc> = COPY %0 <sgpr> |
18 | /// ... |
19 | /// BRANCH %cond BB1, BB2 |
20 | /// BB1: |
21 | /// %2 <vgpr> = VECTOR_INST |
22 | /// %3 <vsrc> = COPY %2 <vgpr> |
23 | /// BB2: |
/// %4 <vsrc> = PHI %1 <vsrc>, <%bb.0>, %3 <vsrc>, <%bb.1>
25 | /// %5 <vgpr> = VECTOR_INST %4 <vsrc> |
26 | /// |
27 | /// |
28 | /// The coalescer will begin at BB0 and eliminate its copy, then the resulting |
29 | /// code will look like this: |
30 | /// |
31 | /// BB0: |
32 | /// %0 <sgpr> = SCALAR_INST |
33 | /// ... |
34 | /// BRANCH %cond BB1, BB2 |
35 | /// BB1: |
36 | /// %2 <vgpr> = VECTOR_INST |
37 | /// %3 <vsrc> = COPY %2 <vgpr> |
38 | /// BB2: |
39 | /// %4 <sgpr> = PHI %0 <sgpr>, <%bb.0>, %3 <vsrc>, <%bb.1> |
40 | /// %5 <vgpr> = VECTOR_INST %4 <sgpr> |
41 | /// |
/// Now that the result of the PHI instruction is an SGPR, the register
/// allocator is forced to constrain the register class of %3 to
/// <sgpr>, so we end up with final code like this:
45 | /// |
46 | /// BB0: |
47 | /// %0 <sgpr> = SCALAR_INST |
48 | /// ... |
49 | /// BRANCH %cond BB1, BB2 |
50 | /// BB1: |
51 | /// %2 <vgpr> = VECTOR_INST |
52 | /// %3 <sgpr> = COPY %2 <vgpr> |
53 | /// BB2: |
54 | /// %4 <sgpr> = PHI %0 <sgpr>, <%bb.0>, %3 <sgpr>, <%bb.1> |
55 | /// %5 <vgpr> = VECTOR_INST %4 <sgpr> |
56 | /// |
57 | /// Now this code contains an illegal copy from a VGPR to an SGPR. |
58 | /// |
/// In order to avoid this problem, this pass searches for PHI instructions
/// that define a <vsrc> register and constrains their definition class to
/// <vgpr> if a user of the PHI's result is a vector instruction.
/// If the PHI's definition class is constrained to <vgpr>, the coalescer
/// will be unable to perform the COPY removal from the above example, which
/// ultimately led to the creation of an illegal COPY.
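///
/// In addition to constraining PHIs, this pass analyzes the VGPR to SGPR
/// copies that do remain and, based on a heuristic score, lowers each of
/// them either to a V_READFIRSTLANE_B32 or by moving its users to the VALU
/// (see lowerVGPR2SGPRCopies below).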
65 | //===----------------------------------------------------------------------===// |
66 | |
67 | #include "SIFixSGPRCopies.h" |
68 | #include "AMDGPU.h" |
69 | #include "GCNSubtarget.h" |
70 | #include "MCTargetDesc/AMDGPUMCTargetDesc.h" |
71 | #include "llvm/CodeGen/MachineDominators.h" |
72 | #include "llvm/InitializePasses.h" |
73 | #include "llvm/Target/TargetMachine.h" |
74 | |
75 | using namespace llvm; |
76 | |
77 | #define DEBUG_TYPE "si-fix-sgpr-copies" |
78 | |
79 | static cl::opt<bool> EnableM0Merge( |
"amdgpu-enable-merge-m0",
cl::desc("Merge and hoist M0 initializations"),
cl::init(true));
83 | |
84 | namespace { |
85 | |
86 | class V2SCopyInfo { |
87 | public: |
88 | // VGPR to SGPR copy being processed |
89 | MachineInstr *Copy; |
90 | // All SALU instructions reachable from this copy in SSA graph |
91 | SetVector<MachineInstr *> SChain; |
92 | // Number of SGPR to VGPR copies that are used to put the SALU computation |
93 | // results back to VALU. |
94 | unsigned NumSVCopies = 0; |
95 | |
96 | unsigned Score = 0; |
// Number of v_readfirstlane_b32 instructions that need to be inserted to
// keep the SChain on the SALU.
99 | unsigned NumReadfirstlanes = 0; |
// Current score state, cached to speed up selecting V2SCopyInfos for processing.
101 | bool NeedToBeConvertedToVALU = false; |
102 | // Unique ID. Used as a key for mapping to keep permanent order. |
103 | unsigned ID; |
104 | |
// Number of other VGPR to SGPR copies that contribute to the
// current copy's SChain.
107 | unsigned SiblingPenalty = 0; |
108 | SetVector<unsigned> Siblings; |
109 | V2SCopyInfo() : Copy(nullptr), ID(0){}; |
110 | V2SCopyInfo(unsigned Id, MachineInstr *C, unsigned Width) |
111 | : Copy(C), NumReadfirstlanes(Width / 32), ID(Id){}; |
112 | #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) |
113 | void dump() { |
114 | dbgs() << ID << " : " << *Copy << "\n\tS:" << SChain.size() |
115 | << "\n\tSV:" << NumSVCopies << "\n\tSP: " << SiblingPenalty |
<< "\nScore: " << Score << "\n";
117 | } |
118 | #endif |
119 | }; |
120 | |
121 | class SIFixSGPRCopies { |
122 | MachineDominatorTree *MDT; |
123 | SmallVector<MachineInstr*, 4> SCCCopies; |
124 | SmallVector<MachineInstr*, 4> RegSequences; |
125 | SmallVector<MachineInstr*, 4> PHINodes; |
126 | SmallVector<MachineInstr*, 4> S2VCopies; |
127 | unsigned NextVGPRToSGPRCopyID = 0; |
128 | MapVector<unsigned, V2SCopyInfo> V2SCopies; |
129 | DenseMap<MachineInstr *, SetVector<unsigned>> SiblingPenalty; |
130 | DenseSet<MachineInstr *> PHISources; |
131 | |
132 | public: |
133 | MachineRegisterInfo *MRI; |
134 | const SIRegisterInfo *TRI; |
135 | const SIInstrInfo *TII; |
136 | |
137 | SIFixSGPRCopies(MachineDominatorTree *MDT) : MDT(MDT) {} |
138 | |
139 | bool run(MachineFunction &MF); |
140 | void fixSCCCopies(MachineFunction &MF); |
141 | void prepareRegSequenceAndPHIs(MachineFunction &MF); |
142 | unsigned getNextVGPRToSGPRCopyId() { return ++NextVGPRToSGPRCopyID; } |
143 | bool needToBeConvertedToVALU(V2SCopyInfo *I); |
144 | void analyzeVGPRToSGPRCopy(MachineInstr *MI); |
145 | void lowerVGPR2SGPRCopies(MachineFunction &MF); |
// Handles copies whose source register is:
// 1. A physical register
// 2. An AGPR
// 3. Defined by an instruction that merely moves an immediate
150 | bool lowerSpecialCase(MachineInstr &MI, MachineBasicBlock::iterator &I); |
151 | |
152 | void processPHINode(MachineInstr &MI); |
153 | |
154 | // Check if MO is an immediate materialized into a VGPR, and if so replace it |
155 | // with an SGPR immediate. The VGPR immediate is also deleted if it does not |
156 | // have any other uses. |
157 | bool tryMoveVGPRConstToSGPR(MachineOperand &MO, Register NewDst, |
158 | MachineBasicBlock *BlockToInsertTo, |
159 | MachineBasicBlock::iterator PointToInsertTo, |
160 | const DebugLoc &DL); |
161 | }; |
162 | |
163 | class SIFixSGPRCopiesLegacy : public MachineFunctionPass { |
164 | public: |
165 | static char ID; |
166 | |
167 | SIFixSGPRCopiesLegacy() : MachineFunctionPass(ID) {} |
168 | |
169 | bool runOnMachineFunction(MachineFunction &MF) override { |
170 | MachineDominatorTree *MDT = |
171 | &getAnalysis<MachineDominatorTreeWrapperPass>().getDomTree(); |
172 | SIFixSGPRCopies Impl(MDT); |
173 | return Impl.run(MF); |
174 | } |
175 | |
StringRef getPassName() const override { return "SI Fix SGPR copies"; }
177 | |
178 | void getAnalysisUsage(AnalysisUsage &AU) const override { |
179 | AU.addRequired<MachineDominatorTreeWrapperPass>(); |
180 | AU.addPreserved<MachineDominatorTreeWrapperPass>(); |
181 | AU.setPreservesCFG(); |
182 | MachineFunctionPass::getAnalysisUsage(AU); |
183 | } |
184 | }; |
185 | |
186 | } // end anonymous namespace |
187 | |
INITIALIZE_PASS_BEGIN(SIFixSGPRCopiesLegacy, DEBUG_TYPE, "SI Fix SGPR copies",
189 | false, false) |
190 | INITIALIZE_PASS_DEPENDENCY(MachineDominatorTreeWrapperPass) |
INITIALIZE_PASS_END(SIFixSGPRCopiesLegacy, DEBUG_TYPE, "SI Fix SGPR copies",
192 | false, false) |
193 | |
194 | char SIFixSGPRCopiesLegacy::ID = 0; |
195 | |
196 | char &llvm::SIFixSGPRCopiesLegacyID = SIFixSGPRCopiesLegacy::ID; |
197 | |
198 | FunctionPass *llvm::createSIFixSGPRCopiesLegacyPass() { |
199 | return new SIFixSGPRCopiesLegacy(); |
200 | } |
201 | |
202 | static std::pair<const TargetRegisterClass *, const TargetRegisterClass *> |
203 | getCopyRegClasses(const MachineInstr &Copy, |
204 | const SIRegisterInfo &TRI, |
205 | const MachineRegisterInfo &MRI) { |
206 | Register DstReg = Copy.getOperand(i: 0).getReg(); |
207 | Register SrcReg = Copy.getOperand(i: 1).getReg(); |
208 | |
209 | const TargetRegisterClass *SrcRC = SrcReg.isVirtual() |
210 | ? MRI.getRegClass(Reg: SrcReg) |
211 | : TRI.getPhysRegBaseClass(Reg: SrcReg); |
212 | |
213 | // We don't really care about the subregister here. |
214 | // SrcRC = TRI.getSubRegClass(SrcRC, Copy.getOperand(1).getSubReg()); |
215 | |
216 | const TargetRegisterClass *DstRC = DstReg.isVirtual() |
217 | ? MRI.getRegClass(Reg: DstReg) |
218 | : TRI.getPhysRegBaseClass(Reg: DstReg); |
219 | |
220 | return std::pair(SrcRC, DstRC); |
221 | } |
222 | |
223 | static bool isVGPRToSGPRCopy(const TargetRegisterClass *SrcRC, |
224 | const TargetRegisterClass *DstRC, |
225 | const SIRegisterInfo &TRI) { |
226 | return SrcRC != &AMDGPU::VReg_1RegClass && TRI.isSGPRClass(RC: DstRC) && |
227 | TRI.hasVectorRegisters(RC: SrcRC); |
228 | } |
229 | |
230 | static bool isSGPRToVGPRCopy(const TargetRegisterClass *SrcRC, |
231 | const TargetRegisterClass *DstRC, |
232 | const SIRegisterInfo &TRI) { |
233 | return DstRC != &AMDGPU::VReg_1RegClass && TRI.isSGPRClass(RC: SrcRC) && |
234 | TRI.hasVectorRegisters(RC: DstRC); |
235 | } |
236 | |
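// If every other use of the copy's destination is in the same block and can
// legally take the copy's SGPR source as an operand, retype the destination
// from VGPR to SGPR so this becomes an SGPR-to-SGPR copy.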
237 | static bool tryChangeVGPRtoSGPRinCopy(MachineInstr &MI, |
238 | const SIRegisterInfo *TRI, |
239 | const SIInstrInfo *TII) { |
240 | MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo(); |
241 | auto &Src = MI.getOperand(i: 1); |
242 | Register DstReg = MI.getOperand(i: 0).getReg(); |
243 | Register SrcReg = Src.getReg(); |
244 | if (!SrcReg.isVirtual() || !DstReg.isVirtual()) |
245 | return false; |
246 | |
247 | for (const auto &MO : MRI.reg_nodbg_operands(Reg: DstReg)) { |
248 | const auto *UseMI = MO.getParent(); |
249 | if (UseMI == &MI) |
250 | continue; |
251 | if (MO.isDef() || UseMI->getParent() != MI.getParent() || |
252 | UseMI->getOpcode() <= TargetOpcode::GENERIC_OP_END) |
253 | return false; |
254 | |
255 | unsigned OpIdx = MO.getOperandNo(); |
256 | if (OpIdx >= UseMI->getDesc().getNumOperands() || |
257 | !TII->isOperandLegal(MI: *UseMI, OpIdx, MO: &Src)) |
258 | return false; |
259 | } |
260 | // Change VGPR to SGPR destination. |
261 | MRI.setRegClass(Reg: DstReg, RC: TRI->getEquivalentSGPRClass(VRC: MRI.getRegClass(Reg: DstReg))); |
262 | return true; |
263 | } |
264 | |
265 | // Distribute an SGPR->VGPR copy of a REG_SEQUENCE into a VGPR REG_SEQUENCE. |
266 | // |
267 | // SGPRx = ... |
268 | // SGPRy = REG_SEQUENCE SGPRx, sub0 ... |
269 | // VGPRz = COPY SGPRy |
270 | // |
271 | // ==> |
272 | // |
273 | // VGPRx = COPY SGPRx |
274 | // VGPRz = REG_SEQUENCE VGPRx, sub0 |
275 | // |
276 | // This exposes immediate folding opportunities when materializing 64-bit |
277 | // immediates. |
278 | static bool foldVGPRCopyIntoRegSequence(MachineInstr &MI, |
279 | const SIRegisterInfo *TRI, |
280 | const SIInstrInfo *TII, |
281 | MachineRegisterInfo &MRI) { |
282 | assert(MI.isRegSequence()); |
283 | |
284 | Register DstReg = MI.getOperand(i: 0).getReg(); |
285 | if (!TRI->isSGPRClass(RC: MRI.getRegClass(Reg: DstReg))) |
286 | return false; |
287 | |
288 | if (!MRI.hasOneUse(RegNo: DstReg)) |
289 | return false; |
290 | |
291 | MachineInstr &CopyUse = *MRI.use_instr_begin(RegNo: DstReg); |
292 | if (!CopyUse.isCopy()) |
293 | return false; |
294 | |
295 | // It is illegal to have vreg inputs to a physreg defining reg_sequence. |
296 | if (CopyUse.getOperand(i: 0).getReg().isPhysical()) |
297 | return false; |
298 | |
299 | const TargetRegisterClass *SrcRC, *DstRC; |
300 | std::tie(args&: SrcRC, args&: DstRC) = getCopyRegClasses(Copy: CopyUse, TRI: *TRI, MRI); |
301 | |
302 | if (!isSGPRToVGPRCopy(SrcRC, DstRC, TRI: *TRI)) |
303 | return false; |
304 | |
305 | if (tryChangeVGPRtoSGPRinCopy(MI&: CopyUse, TRI, TII)) |
306 | return true; |
307 | |
308 | // TODO: Could have multiple extracts? |
309 | unsigned SubReg = CopyUse.getOperand(i: 1).getSubReg(); |
310 | if (SubReg != AMDGPU::NoSubRegister) |
311 | return false; |
312 | |
313 | MRI.setRegClass(Reg: DstReg, RC: DstRC); |
314 | |
315 | // SGPRx = ... |
316 | // SGPRy = REG_SEQUENCE SGPRx, sub0 ... |
317 | // VGPRz = COPY SGPRy |
318 | |
319 | // => |
320 | // VGPRx = COPY SGPRx |
321 | // VGPRz = REG_SEQUENCE VGPRx, sub0 |
322 | |
323 | MI.getOperand(i: 0).setReg(CopyUse.getOperand(i: 0).getReg()); |
324 | bool IsAGPR = TRI->isAGPRClass(RC: DstRC); |
325 | |
326 | for (unsigned I = 1, N = MI.getNumOperands(); I != N; I += 2) { |
327 | const TargetRegisterClass *SrcRC = |
328 | TRI->getRegClassForOperandReg(MRI, MO: MI.getOperand(i: I)); |
329 | assert(TRI->isSGPRClass(SrcRC) && |
"Expected SGPR REG_SEQUENCE to only have SGPR inputs");
331 | const TargetRegisterClass *NewSrcRC = TRI->getEquivalentVGPRClass(SRC: SrcRC); |
332 | |
333 | Register TmpReg = MRI.createVirtualRegister(RegClass: NewSrcRC); |
334 | |
335 | BuildMI(BB&: *MI.getParent(), I: &MI, MIMD: MI.getDebugLoc(), MCID: TII->get(Opcode: AMDGPU::COPY), |
336 | DestReg: TmpReg) |
337 | .add(MO: MI.getOperand(i: I)); |
338 | |
339 | if (IsAGPR) { |
340 | const TargetRegisterClass *NewSrcRC = TRI->getEquivalentAGPRClass(SRC: SrcRC); |
341 | Register TmpAReg = MRI.createVirtualRegister(RegClass: NewSrcRC); |
342 | unsigned Opc = NewSrcRC == &AMDGPU::AGPR_32RegClass ? |
343 | AMDGPU::V_ACCVGPR_WRITE_B32_e64 : AMDGPU::COPY; |
344 | BuildMI(BB&: *MI.getParent(), I: &MI, MIMD: MI.getDebugLoc(), MCID: TII->get(Opcode: Opc), |
345 | DestReg: TmpAReg) |
346 | .addReg(RegNo: TmpReg, flags: RegState::Kill); |
347 | TmpReg = TmpAReg; |
348 | } |
349 | |
350 | MI.getOperand(i: I).setReg(TmpReg); |
351 | } |
352 | |
353 | CopyUse.eraseFromParent(); |
354 | return true; |
355 | } |
356 | |
357 | static bool isSafeToFoldImmIntoCopy(const MachineInstr *Copy, |
358 | const MachineInstr *MoveImm, |
359 | const SIInstrInfo *TII, |
360 | unsigned &SMovOp, |
361 | int64_t &Imm) { |
362 | if (Copy->getOpcode() != AMDGPU::COPY) |
363 | return false; |
364 | |
365 | if (!MoveImm->isMoveImmediate()) |
366 | return false; |
367 | |
368 | const MachineOperand *ImmOp = |
369 | TII->getNamedOperand(MI: *MoveImm, OperandName: AMDGPU::OpName::src0); |
370 | if (!ImmOp->isImm()) |
371 | return false; |
372 | |
373 | // FIXME: Handle copies with sub-regs. |
374 | if (Copy->getOperand(i: 1).getSubReg()) |
375 | return false; |
376 | |
377 | switch (MoveImm->getOpcode()) { |
378 | default: |
379 | return false; |
380 | case AMDGPU::V_MOV_B32_e32: |
381 | SMovOp = AMDGPU::S_MOV_B32; |
382 | break; |
383 | case AMDGPU::V_MOV_B64_PSEUDO: |
384 | SMovOp = AMDGPU::S_MOV_B64_IMM_PSEUDO; |
385 | break; |
386 | } |
387 | Imm = ImmOp->getImm(); |
388 | return true; |
389 | } |
390 | |
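// Walk the predecessor graph of MBB (never crossing CutOff) and return true
// if any visited predecessor satisfies Predicate.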
391 | template <class UnaryPredicate> |
392 | bool searchPredecessors(const MachineBasicBlock *MBB, |
393 | const MachineBasicBlock *CutOff, |
394 | UnaryPredicate Predicate) { |
395 | if (MBB == CutOff) |
396 | return false; |
397 | |
398 | DenseSet<const MachineBasicBlock *> Visited; |
399 | SmallVector<MachineBasicBlock *, 4> Worklist(MBB->predecessors()); |
400 | |
401 | while (!Worklist.empty()) { |
402 | MachineBasicBlock *MBB = Worklist.pop_back_val(); |
403 | |
404 | if (!Visited.insert(V: MBB).second) |
405 | continue; |
406 | if (MBB == CutOff) |
407 | continue; |
408 | if (Predicate(MBB)) |
409 | return true; |
410 | |
411 | Worklist.append(in_start: MBB->pred_begin(), in_end: MBB->pred_end()); |
412 | } |
413 | |
414 | return false; |
415 | } |
416 | |
// Checks if there is a potential path from instruction From to instruction To.
// If CutOff is specified and sits on that path, we ignore the portion of the
// path above it and report To as not reachable.
420 | static bool isReachable(const MachineInstr *From, |
421 | const MachineInstr *To, |
422 | const MachineBasicBlock *CutOff, |
423 | MachineDominatorTree &MDT) { |
424 | if (MDT.dominates(A: From, B: To)) |
425 | return true; |
426 | |
427 | const MachineBasicBlock *MBBFrom = From->getParent(); |
428 | const MachineBasicBlock *MBBTo = To->getParent(); |
429 | |
430 | // Do predecessor search. |
431 | // We should almost never get here since we do not usually produce M0 stores |
432 | // other than -1. |
433 | return searchPredecessors(MBB: MBBTo, CutOff, Predicate: [MBBFrom] |
434 | (const MachineBasicBlock *MBB) { return MBB == MBBFrom; }); |
435 | } |
436 | |
437 | // Return the first non-prologue instruction in the block. |
438 | static MachineBasicBlock::iterator |
439 | getFirstNonPrologue(MachineBasicBlock *MBB, const TargetInstrInfo *TII) { |
440 | MachineBasicBlock::iterator I = MBB->getFirstNonPHI(); |
441 | while (I != MBB->end() && TII->isBasicBlockPrologue(MI: *I)) |
442 | ++I; |
443 | |
444 | return I; |
445 | } |
446 | |
447 | // Hoist and merge identical SGPR initializations into a common predecessor. |
448 | // This is intended to combine M0 initializations, but can work with any |
449 | // SGPR. A VGPR cannot be processed since we cannot guarantee vector |
// execution.
451 | static bool hoistAndMergeSGPRInits(unsigned Reg, |
452 | const MachineRegisterInfo &MRI, |
453 | const TargetRegisterInfo *TRI, |
454 | MachineDominatorTree &MDT, |
455 | const TargetInstrInfo *TII) { |
456 | // List of inits by immediate value. |
457 | using InitListMap = std::map<unsigned, std::list<MachineInstr *>>; |
458 | InitListMap Inits; |
459 | // List of clobbering instructions. |
460 | SmallVector<MachineInstr*, 8> Clobbers; |
461 | // List of instructions marked for deletion. |
462 | SmallSet<MachineInstr*, 8> MergedInstrs; |
463 | |
464 | bool Changed = false; |
465 | |
466 | for (auto &MI : MRI.def_instructions(Reg)) { |
467 | MachineOperand *Imm = nullptr; |
468 | for (auto &MO : MI.operands()) { |
469 | if ((MO.isReg() && ((MO.isDef() && MO.getReg() != Reg) || !MO.isDef())) || |
470 | (!MO.isImm() && !MO.isReg()) || (MO.isImm() && Imm)) { |
471 | Imm = nullptr; |
472 | break; |
473 | } |
474 | if (MO.isImm()) |
475 | Imm = &MO; |
476 | } |
477 | if (Imm) |
478 | Inits[Imm->getImm()].push_front(x: &MI); |
479 | else |
480 | Clobbers.push_back(Elt: &MI); |
481 | } |
482 | |
483 | for (auto &Init : Inits) { |
484 | auto &Defs = Init.second; |
485 | |
486 | for (auto I1 = Defs.begin(), E = Defs.end(); I1 != E; ) { |
487 | MachineInstr *MI1 = *I1; |
488 | |
489 | for (auto I2 = std::next(x: I1); I2 != E; ) { |
490 | MachineInstr *MI2 = *I2; |
491 | |
492 | // Check any possible interference |
493 | auto interferes = [&](MachineBasicBlock::iterator From, |
494 | MachineBasicBlock::iterator To) -> bool { |
495 | |
496 | assert(MDT.dominates(&*To, &*From)); |
497 | |
498 | auto interferes = [&MDT, From, To](MachineInstr* &Clobber) -> bool { |
499 | const MachineBasicBlock *MBBFrom = From->getParent(); |
500 | const MachineBasicBlock *MBBTo = To->getParent(); |
501 | bool MayClobberFrom = isReachable(From: Clobber, To: &*From, CutOff: MBBTo, MDT); |
502 | bool MayClobberTo = isReachable(From: Clobber, To: &*To, CutOff: MBBTo, MDT); |
503 | if (!MayClobberFrom && !MayClobberTo) |
504 | return false; |
505 | if ((MayClobberFrom && !MayClobberTo) || |
506 | (!MayClobberFrom && MayClobberTo)) |
507 | return true; |
// Clobber may reach both From and To. This is not an interference only if
// both are dominated by Clobber and belong to the same block, or if Clobber
// properly dominates To; given that To >> From, Clobber then dominates
// both and is located in a common dominator.
512 | return !((MBBFrom == MBBTo && |
513 | MDT.dominates(A: Clobber, B: &*From) && |
514 | MDT.dominates(A: Clobber, B: &*To)) || |
515 | MDT.properlyDominates(A: Clobber->getParent(), B: MBBTo)); |
516 | }; |
517 | |
518 | return (llvm::any_of(Range&: Clobbers, P: interferes)) || |
519 | (llvm::any_of(Range&: Inits, P: [&](InitListMap::value_type &C) { |
520 | return C.first != Init.first && |
521 | llvm::any_of(Range&: C.second, P: interferes); |
522 | })); |
523 | }; |
524 | |
525 | if (MDT.dominates(A: MI1, B: MI2)) { |
526 | if (!interferes(MI2, MI1)) { |
527 | LLVM_DEBUG(dbgs() |
528 | << "Erasing from " |
529 | << printMBBReference(*MI2->getParent()) << " " << *MI2); |
530 | MergedInstrs.insert(Ptr: MI2); |
531 | Changed = true; |
532 | ++I2; |
533 | continue; |
534 | } |
535 | } else if (MDT.dominates(A: MI2, B: MI1)) { |
536 | if (!interferes(MI1, MI2)) { |
537 | LLVM_DEBUG(dbgs() |
538 | << "Erasing from " |
539 | << printMBBReference(*MI1->getParent()) << " " << *MI1); |
540 | MergedInstrs.insert(Ptr: MI1); |
541 | Changed = true; |
542 | ++I1; |
543 | break; |
544 | } |
545 | } else { |
546 | auto *MBB = MDT.findNearestCommonDominator(A: MI1->getParent(), |
547 | B: MI2->getParent()); |
548 | if (!MBB) { |
549 | ++I2; |
550 | continue; |
551 | } |
552 | |
553 | MachineBasicBlock::iterator I = getFirstNonPrologue(MBB, TII); |
554 | if (!interferes(MI1, I) && !interferes(MI2, I)) { |
555 | LLVM_DEBUG(dbgs() |
556 | << "Erasing from " |
557 | << printMBBReference(*MI1->getParent()) << " " << *MI1 |
558 | << "and moving from " |
559 | << printMBBReference(*MI2->getParent()) << " to " |
560 | << printMBBReference(*I->getParent()) << " " << *MI2); |
561 | I->getParent()->splice(Where: I, Other: MI2->getParent(), From: MI2); |
562 | MergedInstrs.insert(Ptr: MI1); |
563 | Changed = true; |
564 | ++I1; |
565 | break; |
566 | } |
567 | } |
568 | ++I2; |
569 | } |
570 | ++I1; |
571 | } |
572 | } |
573 | |
574 | // Remove initializations that were merged into another. |
575 | for (auto &Init : Inits) { |
576 | auto &Defs = Init.second; |
577 | auto I = Defs.begin(); |
578 | while (I != Defs.end()) { |
579 | if (MergedInstrs.count(Ptr: *I)) { |
580 | (*I)->eraseFromParent(); |
581 | I = Defs.erase(position: I); |
582 | } else |
583 | ++I; |
584 | } |
585 | } |
586 | |
587 | // Try to schedule SGPR initializations as early as possible in the MBB. |
588 | for (auto &Init : Inits) { |
589 | auto &Defs = Init.second; |
590 | for (auto *MI : Defs) { |
591 | auto *MBB = MI->getParent(); |
592 | MachineInstr &BoundaryMI = *getFirstNonPrologue(MBB, TII); |
593 | MachineBasicBlock::reverse_iterator B(BoundaryMI); |
// Check if B should actually be a boundary. If not, set the previous
// instruction as the boundary instead.
596 | if (!TII->isBasicBlockPrologue(MI: *B)) |
597 | B++; |
598 | |
599 | auto R = std::next(x: MI->getReverseIterator()); |
600 | const unsigned Threshold = 50; |
601 | // Search until B or Threshold for a place to insert the initialization. |
602 | for (unsigned I = 0; R != B && I < Threshold; ++R, ++I) |
603 | if (R->readsRegister(Reg, TRI) || R->definesRegister(Reg, TRI) || |
604 | TII->isSchedulingBoundary(MI: *R, MBB, MF: *MBB->getParent())) |
605 | break; |
606 | |
607 | // Move to directly after R. |
608 | if (&*--R != MI) |
609 | MBB->splice(Where: *R, Other: MBB, From: MI); |
610 | } |
611 | } |
612 | |
613 | if (Changed) |
614 | MRI.clearKillFlags(Reg); |
615 | |
616 | return Changed; |
617 | } |
618 | |
619 | bool SIFixSGPRCopies::run(MachineFunction &MF) { |
// Only need to run this for the SelectionDAG path.
621 | if (MF.getProperties().hasSelected()) |
622 | return false; |
623 | |
624 | const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); |
625 | MRI = &MF.getRegInfo(); |
626 | TRI = ST.getRegisterInfo(); |
627 | TII = ST.getInstrInfo(); |
628 | |
629 | for (MachineBasicBlock &MBB : MF) { |
630 | for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end(); I != E; |
631 | ++I) { |
632 | MachineInstr &MI = *I; |
633 | |
634 | switch (MI.getOpcode()) { |
635 | default: |
636 | continue; |
637 | case AMDGPU::COPY: { |
638 | const TargetRegisterClass *SrcRC, *DstRC; |
639 | std::tie(args&: SrcRC, args&: DstRC) = getCopyRegClasses(Copy: MI, TRI: *TRI, MRI: *MRI); |
640 | |
641 | if (isSGPRToVGPRCopy(SrcRC, DstRC, TRI: *TRI)) { |
// Since SGPR to VGPR copies affect the VGPR to SGPR copy
// score and, hence, the lowering decision, let's try to get rid of
// them as early as possible.
645 | if (tryChangeVGPRtoSGPRinCopy(MI, TRI, TII)) |
646 | continue; |
647 | |
// Collect those not changed so we can retry them after VGPR to SGPR copy
// lowering, when there will be more opportunities.
650 | S2VCopies.push_back(Elt: &MI); |
651 | } |
652 | if (!isVGPRToSGPRCopy(SrcRC, DstRC, TRI: *TRI)) |
653 | continue; |
654 | if (lowerSpecialCase(MI, I)) |
655 | continue; |
656 | |
657 | analyzeVGPRToSGPRCopy(MI: &MI); |
658 | |
659 | break; |
660 | } |
661 | case AMDGPU::WQM: |
662 | case AMDGPU::STRICT_WQM: |
663 | case AMDGPU::SOFT_WQM: |
664 | case AMDGPU::STRICT_WWM: |
665 | case AMDGPU::INSERT_SUBREG: |
666 | case AMDGPU::PHI: |
667 | case AMDGPU::REG_SEQUENCE: { |
668 | if (TRI->isSGPRClass(RC: TII->getOpRegClass(MI, OpNo: 0))) { |
669 | for (MachineOperand &MO : MI.operands()) { |
670 | if (!MO.isReg() || !MO.getReg().isVirtual()) |
671 | continue; |
672 | const TargetRegisterClass *SrcRC = MRI->getRegClass(Reg: MO.getReg()); |
673 | if (SrcRC == &AMDGPU::VReg_1RegClass) |
674 | continue; |
675 | |
676 | if (TRI->hasVectorRegisters(RC: SrcRC)) { |
677 | const TargetRegisterClass *DestRC = |
678 | TRI->getEquivalentSGPRClass(VRC: SrcRC); |
679 | Register NewDst = MRI->createVirtualRegister(RegClass: DestRC); |
680 | MachineBasicBlock *BlockToInsertCopy = |
681 | MI.isPHI() ? MI.getOperand(i: MO.getOperandNo() + 1).getMBB() |
682 | : &MBB; |
683 | MachineBasicBlock::iterator PointToInsertCopy = |
684 | MI.isPHI() ? BlockToInsertCopy->getFirstInstrTerminator() : I; |
685 | |
686 | const DebugLoc &DL = MI.getDebugLoc(); |
687 | if (!tryMoveVGPRConstToSGPR(MO, NewDst, BlockToInsertTo: BlockToInsertCopy, |
688 | PointToInsertTo: PointToInsertCopy, DL)) { |
689 | MachineInstr *NewCopy = |
690 | BuildMI(BB&: *BlockToInsertCopy, I: PointToInsertCopy, MIMD: DL, |
691 | MCID: TII->get(Opcode: AMDGPU::COPY), DestReg: NewDst) |
692 | .addReg(RegNo: MO.getReg()); |
693 | MO.setReg(NewDst); |
694 | analyzeVGPRToSGPRCopy(MI: NewCopy); |
695 | PHISources.insert(V: NewCopy); |
696 | } |
697 | } |
698 | } |
699 | } |
700 | |
701 | if (MI.isPHI()) |
702 | PHINodes.push_back(Elt: &MI); |
703 | else if (MI.isRegSequence()) |
704 | RegSequences.push_back(Elt: &MI); |
705 | |
706 | break; |
707 | } |
708 | case AMDGPU::V_WRITELANE_B32: { |
709 | // Some architectures allow more than one constant bus access without |
710 | // SGPR restriction |
711 | if (ST.getConstantBusLimit(Opcode: MI.getOpcode()) != 1) |
712 | break; |
713 | |
714 | // Writelane is special in that it can use SGPR and M0 (which would |
715 | // normally count as using the constant bus twice - but in this case it |
716 | // is allowed since the lane selector doesn't count as a use of the |
717 | // constant bus). However, it is still required to abide by the 1 SGPR |
718 | // rule. Apply a fix here as we might have multiple SGPRs after |
719 | // legalizing VGPRs to SGPRs |
720 | int Src0Idx = |
721 | AMDGPU::getNamedOperandIdx(Opcode: MI.getOpcode(), Name: AMDGPU::OpName::src0); |
722 | int Src1Idx = |
723 | AMDGPU::getNamedOperandIdx(Opcode: MI.getOpcode(), Name: AMDGPU::OpName::src1); |
724 | MachineOperand &Src0 = MI.getOperand(i: Src0Idx); |
725 | MachineOperand &Src1 = MI.getOperand(i: Src1Idx); |
726 | |
727 | // Check to see if the instruction violates the 1 SGPR rule |
728 | if ((Src0.isReg() && TRI->isSGPRReg(MRI: *MRI, Reg: Src0.getReg()) && |
729 | Src0.getReg() != AMDGPU::M0) && |
730 | (Src1.isReg() && TRI->isSGPRReg(MRI: *MRI, Reg: Src1.getReg()) && |
731 | Src1.getReg() != AMDGPU::M0)) { |
732 | |
// Check for a trivially easy constant propagation into one of the operands.
// If this is the case, perform it now to resolve the SGPR issue; if we don't
// do it here, we will always insert a mov to m0 that can't be resolved by
// the later operand-folding pass.
737 | bool Resolved = false; |
738 | for (MachineOperand *MO : {&Src0, &Src1}) { |
739 | if (MO->getReg().isVirtual()) { |
740 | MachineInstr *DefMI = MRI->getVRegDef(Reg: MO->getReg()); |
741 | if (DefMI && TII->isFoldableCopy(MI: *DefMI)) { |
742 | const MachineOperand &Def = DefMI->getOperand(i: 0); |
743 | if (Def.isReg() && |
744 | MO->getReg() == Def.getReg() && |
745 | MO->getSubReg() == Def.getSubReg()) { |
746 | const MachineOperand &Copied = DefMI->getOperand(i: 1); |
747 | if (Copied.isImm() && |
748 | TII->isInlineConstant(Imm: APInt(64, Copied.getImm(), true))) { |
749 | MO->ChangeToImmediate(ImmVal: Copied.getImm()); |
750 | Resolved = true; |
751 | break; |
752 | } |
753 | } |
754 | } |
755 | } |
756 | } |
757 | |
758 | if (!Resolved) { |
// We haven't managed to resolve this by replacing an SGPR with an
// immediate, so move src1 into M0.
761 | BuildMI(BB&: *MI.getParent(), I&: MI, MIMD: MI.getDebugLoc(), |
762 | MCID: TII->get(Opcode: AMDGPU::COPY), DestReg: AMDGPU::M0) |
763 | .add(MO: Src1); |
764 | Src1.ChangeToRegister(Reg: AMDGPU::M0, isDef: false); |
765 | } |
766 | } |
767 | break; |
768 | } |
769 | } |
770 | } |
771 | } |
772 | |
773 | lowerVGPR2SGPRCopies(MF); |
774 | // Postprocessing |
775 | fixSCCCopies(MF); |
776 | for (auto *MI : S2VCopies) { |
777 | // Check if it is still valid |
778 | if (MI->isCopy()) { |
779 | const TargetRegisterClass *SrcRC, *DstRC; |
780 | std::tie(args&: SrcRC, args&: DstRC) = getCopyRegClasses(Copy: *MI, TRI: *TRI, MRI: *MRI); |
781 | if (isSGPRToVGPRCopy(SrcRC, DstRC, TRI: *TRI)) |
782 | tryChangeVGPRtoSGPRinCopy(MI&: *MI, TRI, TII); |
783 | } |
784 | } |
785 | for (auto *MI : RegSequences) { |
786 | // Check if it is still valid |
787 | if (MI->isRegSequence()) |
788 | foldVGPRCopyIntoRegSequence(MI&: *MI, TRI, TII, MRI&: *MRI); |
789 | } |
790 | for (auto *MI : PHINodes) { |
791 | processPHINode(MI&: *MI); |
792 | } |
793 | if (MF.getTarget().getOptLevel() > CodeGenOptLevel::None && EnableM0Merge) |
794 | hoistAndMergeSGPRInits(Reg: AMDGPU::M0, MRI: *MRI, TRI, MDT&: *MDT, TII); |
795 | |
796 | SiblingPenalty.clear(); |
797 | V2SCopies.clear(); |
798 | SCCCopies.clear(); |
799 | RegSequences.clear(); |
800 | PHINodes.clear(); |
801 | S2VCopies.clear(); |
802 | PHISources.clear(); |
803 | |
804 | return true; |
805 | } |
806 | |
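// If every transitive use of a PHI's result (looking through copies and
// REG_SEQUENCEs) is an AGPR, move the PHI result into the equivalent AGPR
// class. PHIs producing vector registers (or VReg_1) also get their operands
// legalized, and the class change is propagated to operands that are
// themselves PHIs.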
807 | void SIFixSGPRCopies::processPHINode(MachineInstr &MI) { |
808 | bool AllAGPRUses = true; |
809 | SetVector<const MachineInstr *> worklist; |
810 | SmallSet<const MachineInstr *, 4> Visited; |
811 | SetVector<MachineInstr *> PHIOperands; |
812 | worklist.insert(X: &MI); |
813 | Visited.insert(Ptr: &MI); |
814 | // HACK to make MIR tests with no uses happy |
815 | bool HasUses = false; |
816 | while (!worklist.empty()) { |
817 | const MachineInstr *Instr = worklist.pop_back_val(); |
818 | Register Reg = Instr->getOperand(i: 0).getReg(); |
819 | for (const auto &Use : MRI->use_operands(Reg)) { |
820 | HasUses = true; |
821 | const MachineInstr *UseMI = Use.getParent(); |
822 | AllAGPRUses &= (UseMI->isCopy() && |
823 | TRI->isAGPR(MRI: *MRI, Reg: UseMI->getOperand(i: 0).getReg())) || |
824 | TRI->isAGPR(MRI: *MRI, Reg: Use.getReg()); |
825 | if (UseMI->isCopy() || UseMI->isRegSequence()) { |
826 | if (Visited.insert(Ptr: UseMI).second) |
827 | worklist.insert(X: UseMI); |
828 | |
829 | continue; |
830 | } |
831 | } |
832 | } |
833 | |
834 | Register PHIRes = MI.getOperand(i: 0).getReg(); |
835 | const TargetRegisterClass *RC0 = MRI->getRegClass(Reg: PHIRes); |
836 | if (HasUses && AllAGPRUses && !TRI->isAGPRClass(RC: RC0)) { |
837 | LLVM_DEBUG(dbgs() << "Moving PHI to AGPR: " << MI); |
838 | MRI->setRegClass(Reg: PHIRes, RC: TRI->getEquivalentAGPRClass(SRC: RC0)); |
839 | for (unsigned I = 1, N = MI.getNumOperands(); I != N; I += 2) { |
840 | MachineInstr *DefMI = MRI->getVRegDef(Reg: MI.getOperand(i: I).getReg()); |
841 | if (DefMI && DefMI->isPHI()) |
842 | PHIOperands.insert(X: DefMI); |
843 | } |
844 | } |
845 | |
846 | if (TRI->isVectorRegister(MRI: *MRI, Reg: PHIRes) || |
847 | RC0 == &AMDGPU::VReg_1RegClass) { |
848 | LLVM_DEBUG(dbgs() << "Legalizing PHI: " << MI); |
849 | TII->legalizeOperands(MI, MDT); |
850 | } |
851 | |
852 | // Propagate register class back to PHI operands which are PHI themselves. |
853 | while (!PHIOperands.empty()) { |
854 | processPHINode(MI&: *PHIOperands.pop_back_val()); |
855 | } |
856 | } |
857 | |
858 | bool SIFixSGPRCopies::tryMoveVGPRConstToSGPR( |
859 | MachineOperand &MaybeVGPRConstMO, Register DstReg, |
860 | MachineBasicBlock *BlockToInsertTo, |
861 | MachineBasicBlock::iterator PointToInsertTo, const DebugLoc &DL) { |
862 | |
863 | MachineInstr *DefMI = MRI->getVRegDef(Reg: MaybeVGPRConstMO.getReg()); |
864 | if (!DefMI || !DefMI->isMoveImmediate()) |
865 | return false; |
866 | |
867 | MachineOperand *SrcConst = TII->getNamedOperand(MI&: *DefMI, OperandName: AMDGPU::OpName::src0); |
868 | if (SrcConst->isReg()) |
869 | return false; |
870 | |
871 | const TargetRegisterClass *SrcRC = |
872 | MRI->getRegClass(Reg: MaybeVGPRConstMO.getReg()); |
873 | unsigned MoveSize = TRI->getRegSizeInBits(RC: *SrcRC); |
874 | unsigned MoveOp = MoveSize == 64 ? AMDGPU::S_MOV_B64 : AMDGPU::S_MOV_B32; |
875 | BuildMI(BB&: *BlockToInsertTo, I: PointToInsertTo, MIMD: DL, MCID: TII->get(Opcode: MoveOp), DestReg: DstReg) |
876 | .add(MO: *SrcConst); |
877 | if (MRI->hasOneUse(RegNo: MaybeVGPRConstMO.getReg())) |
878 | DefMI->eraseFromParent(); |
879 | MaybeVGPRConstMO.setReg(DstReg); |
880 | return true; |
881 | } |
882 | |
883 | bool SIFixSGPRCopies::lowerSpecialCase(MachineInstr &MI, |
884 | MachineBasicBlock::iterator &I) { |
885 | Register DstReg = MI.getOperand(i: 0).getReg(); |
886 | Register SrcReg = MI.getOperand(i: 1).getReg(); |
887 | if (!DstReg.isVirtual()) { |
888 | // If the destination register is a physical register there isn't |
889 | // really much we can do to fix this. |
890 | // Some special instructions use M0 as an input. Some even only use |
891 | // the first lane. Insert a readfirstlane and hope for the best. |
892 | if (DstReg == AMDGPU::M0 && |
893 | TRI->hasVectorRegisters(RC: MRI->getRegClass(Reg: SrcReg))) { |
894 | Register TmpReg = |
895 | MRI->createVirtualRegister(RegClass: &AMDGPU::SReg_32_XM0RegClass); |
896 | BuildMI(BB&: *MI.getParent(), I&: MI, MIMD: MI.getDebugLoc(), |
897 | MCID: TII->get(Opcode: AMDGPU::V_READFIRSTLANE_B32), DestReg: TmpReg) |
898 | .add(MO: MI.getOperand(i: 1)); |
899 | MI.getOperand(i: 1).setReg(TmpReg); |
900 | } else if (tryMoveVGPRConstToSGPR(MaybeVGPRConstMO&: MI.getOperand(i: 1), DstReg, BlockToInsertTo: MI.getParent(), |
901 | PointToInsertTo: MI, DL: MI.getDebugLoc())) { |
902 | I = std::next(x: I); |
903 | MI.eraseFromParent(); |
904 | } |
905 | return true; |
906 | } |
907 | if (!SrcReg.isVirtual() || TRI->isAGPR(MRI: *MRI, Reg: SrcReg)) { |
908 | SIInstrWorklist worklist; |
909 | worklist.insert(MI: &MI); |
910 | TII->moveToVALU(Worklist&: worklist, MDT); |
911 | return true; |
912 | } |
913 | |
914 | unsigned SMovOp; |
915 | int64_t Imm; |
916 | // If we are just copying an immediate, we can replace the copy with |
917 | // s_mov_b32. |
918 | if (isSafeToFoldImmIntoCopy(Copy: &MI, MoveImm: MRI->getVRegDef(Reg: SrcReg), TII, SMovOp, Imm)) { |
919 | MI.getOperand(i: 1).ChangeToImmediate(ImmVal: Imm); |
920 | MI.addImplicitDefUseOperands(MF&: *MI.getParent()->getParent()); |
921 | MI.setDesc(TII->get(Opcode: SMovOp)); |
922 | return true; |
923 | } |
924 | return false; |
925 | } |
926 | |
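// Build a V2SCopyInfo for MI: walk the SSA users reachable from the copy,
// recording the SALU chain it feeds (SChain), the SGPR to VGPR copies that
// put results back on the VALU (NumSVCopies), and which other V2S copies
// share SALU instructions with it (via the SiblingPenalty map).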
927 | void SIFixSGPRCopies::analyzeVGPRToSGPRCopy(MachineInstr* MI) { |
928 | if (PHISources.contains(V: MI)) |
929 | return; |
930 | Register DstReg = MI->getOperand(i: 0).getReg(); |
931 | const TargetRegisterClass *DstRC = MRI->getRegClass(Reg: DstReg); |
932 | |
933 | V2SCopyInfo Info(getNextVGPRToSGPRCopyId(), MI, |
934 | TRI->getRegSizeInBits(RC: *DstRC)); |
935 | SmallVector<MachineInstr *, 8> AnalysisWorklist; |
// Needed because the SSA use graph is not a tree but a DAG with
// forks and joins, so we must not walk the same path twice.
938 | DenseSet<MachineInstr *> Visited; |
939 | AnalysisWorklist.push_back(Elt: Info.Copy); |
940 | while (!AnalysisWorklist.empty()) { |
941 | |
942 | MachineInstr *Inst = AnalysisWorklist.pop_back_val(); |
943 | |
944 | if (!Visited.insert(V: Inst).second) |
945 | continue; |
946 | |
// Copies and REG_SEQUENCEs do not contribute to the final assembly,
// so skip them, but take care of the SGPR to VGPR copy bookkeeping.
949 | if (Inst->isCopy() || Inst->isRegSequence()) { |
950 | if (TRI->isVGPR(MRI: *MRI, Reg: Inst->getOperand(i: 0).getReg())) { |
951 | if (!Inst->isCopy() || |
952 | !tryChangeVGPRtoSGPRinCopy(MI&: *Inst, TRI, TII)) { |
953 | Info.NumSVCopies++; |
954 | continue; |
955 | } |
956 | } |
957 | } |
958 | |
959 | SiblingPenalty[Inst].insert(X: Info.ID); |
960 | |
961 | SmallVector<MachineInstr *, 4> Users; |
962 | if ((TII->isSALU(MI: *Inst) && Inst->isCompare()) || |
963 | (Inst->isCopy() && Inst->getOperand(i: 0).getReg() == AMDGPU::SCC)) { |
964 | auto I = Inst->getIterator(); |
965 | auto E = Inst->getParent()->end(); |
966 | while (++I != E && |
967 | !I->findRegisterDefOperand(Reg: AMDGPU::SCC, /*TRI=*/nullptr)) { |
968 | if (I->readsRegister(Reg: AMDGPU::SCC, /*TRI=*/nullptr)) |
969 | Users.push_back(Elt: &*I); |
970 | } |
971 | } else if (Inst->getNumExplicitDefs() != 0) { |
972 | Register Reg = Inst->getOperand(i: 0).getReg(); |
973 | if (Reg.isVirtual() && TRI->isSGPRReg(MRI: *MRI, Reg) && !TII->isVALU(MI: *Inst)) { |
974 | for (auto &U : MRI->use_instructions(Reg)) |
975 | Users.push_back(Elt: &U); |
976 | } |
977 | } |
978 | for (auto *U : Users) { |
979 | if (TII->isSALU(MI: *U)) |
980 | Info.SChain.insert(X: U); |
981 | AnalysisWorklist.push_back(Elt: U); |
982 | } |
983 | } |
984 | V2SCopies[Info.ID] = Info; |
985 | } |
986 | |
// Computes the VGPR to SGPR copy score and decides how the copy should be
// lowered further: via v_readfirstlane_b32 or by moveToVALU.
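// The heuristic boils down to:
//   Penalty = NumSVCopies + SiblingPenalty + NumReadfirstlanes
//   Profit  = SChain.size()
//   Score   = Penalty > Profit ? 0 : Profit - Penalty
// and the copy is converted to VALU when Score < 3. For example, a copy that
// feeds 2 SALU instructions and needs one readfirstlane (Profit = 2,
// Penalty = 1, Score = 1) is converted, while one feeding 10 SALU
// instructions with the same penalty (Score = 9) is kept scalar.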
989 | bool SIFixSGPRCopies::needToBeConvertedToVALU(V2SCopyInfo *Info) { |
990 | if (Info->SChain.empty()) { |
991 | Info->Score = 0; |
992 | return true; |
993 | } |
994 | Info->Siblings = SiblingPenalty[*llvm::max_element( |
995 | Range&: Info->SChain, C: [&](MachineInstr *A, MachineInstr *B) -> bool { |
996 | return SiblingPenalty[A].size() < SiblingPenalty[B].size(); |
997 | })]; |
998 | Info->Siblings.remove_if(P: [&](unsigned ID) { return ID == Info->ID; }); |
// The loop below computes the number of other VGPR to SGPR copies (V2SCopies)
// that contribute to the current copy's SALU chain. We assume that all
// V2SCopies with the same source virtual register will be squashed into one
// by regalloc. We also take care of V2SCopies of different subregs
// of the same register.
1004 | SmallSet<std::pair<Register, unsigned>, 4> SrcRegs; |
1005 | for (auto J : Info->Siblings) { |
1006 | auto *InfoIt = V2SCopies.find(Key: J); |
1007 | if (InfoIt != V2SCopies.end()) { |
1008 | MachineInstr *SiblingCopy = InfoIt->second.Copy; |
1009 | if (SiblingCopy->isImplicitDef()) |
1010 | // the COPY has already been MoveToVALUed |
1011 | continue; |
1012 | |
1013 | SrcRegs.insert(V: std::pair(SiblingCopy->getOperand(i: 1).getReg(), |
1014 | SiblingCopy->getOperand(i: 1).getSubReg())); |
1015 | } |
1016 | } |
1017 | Info->SiblingPenalty = SrcRegs.size(); |
1018 | |
1019 | unsigned Penalty = |
1020 | Info->NumSVCopies + Info->SiblingPenalty + Info->NumReadfirstlanes; |
1021 | unsigned Profit = Info->SChain.size(); |
1022 | Info->Score = Penalty > Profit ? 0 : Profit - Penalty; |
1023 | Info->NeedToBeConvertedToVALU = Info->Score < 3; |
1024 | return Info->NeedToBeConvertedToVALU; |
1025 | } |
1026 | |
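// Lower the collected VGPR to SGPR copies: copies whose score says VALU
// conversion is cheaper are handed to moveToVALU (possibly dragging their
// siblings along), while the remaining copies are rewritten as
// V_READFIRSTLANE_B32, splitting wide copies into 32-bit pieces.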
1027 | void SIFixSGPRCopies::lowerVGPR2SGPRCopies(MachineFunction &MF) { |
1028 | |
1029 | SmallVector<unsigned, 8> LoweringWorklist; |
1030 | for (auto &C : V2SCopies) { |
1031 | if (needToBeConvertedToVALU(Info: &C.second)) |
1032 | LoweringWorklist.push_back(Elt: C.second.ID); |
1033 | } |
1034 | |
1035 | // Store all the V2S copy instructions that need to be moved to VALU |
1036 | // in the Copies worklist. |
1037 | SIInstrWorklist Copies; |
1038 | |
1039 | while (!LoweringWorklist.empty()) { |
1040 | unsigned CurID = LoweringWorklist.pop_back_val(); |
1041 | auto *CurInfoIt = V2SCopies.find(Key: CurID); |
1042 | if (CurInfoIt != V2SCopies.end()) { |
1043 | V2SCopyInfo C = CurInfoIt->second; |
LLVM_DEBUG(dbgs() << "Processing ...\n"; C.dump());
1045 | for (auto S : C.Siblings) { |
1046 | auto *SibInfoIt = V2SCopies.find(Key: S); |
1047 | if (SibInfoIt != V2SCopies.end()) { |
1048 | V2SCopyInfo &SI = SibInfoIt->second; |
LLVM_DEBUG(dbgs() << "Sibling:\n"; SI.dump());
1050 | if (!SI.NeedToBeConvertedToVALU) { |
1051 | SI.SChain.set_subtract(C.SChain); |
1052 | if (needToBeConvertedToVALU(Info: &SI)) |
1053 | LoweringWorklist.push_back(Elt: SI.ID); |
1054 | } |
1055 | SI.Siblings.remove_if(P: [&](unsigned ID) { return ID == C.ID; }); |
1056 | } |
1057 | } |
1058 | LLVM_DEBUG(dbgs() << "V2S copy " << *C.Copy |
<< " is being turned to VALU\n");
1060 | // TODO: MapVector::erase is inefficient. Do bulk removal with remove_if |
1061 | // instead. |
1062 | V2SCopies.erase(Key: C.ID); |
1063 | Copies.insert(MI: C.Copy); |
1064 | } |
1065 | } |
1066 | |
1067 | TII->moveToVALU(Worklist&: Copies, MDT); |
1068 | Copies.clear(); |
1069 | |
1070 | // Now do actual lowering |
1071 | for (auto C : V2SCopies) { |
1072 | MachineInstr *MI = C.second.Copy; |
1073 | MachineBasicBlock *MBB = MI->getParent(); |
// We decided to keep this V2S copy; turn it into v_readfirstlane_b32
// (copies selected for VALU conversion were removed from V2SCopies above).
1076 | LLVM_DEBUG(dbgs() << "V2S copy " << *MI |
1077 | << " is being turned to v_readfirstlane_b32" |
<< " Score: " << C.second.Score << "\n");
1079 | Register DstReg = MI->getOperand(i: 0).getReg(); |
1080 | MRI->constrainRegClass(Reg: DstReg, RC: &AMDGPU::SReg_32_XM0RegClass); |
1081 | |
1082 | Register SrcReg = MI->getOperand(i: 1).getReg(); |
1083 | unsigned SubReg = MI->getOperand(i: 1).getSubReg(); |
1084 | const TargetRegisterClass *SrcRC = |
1085 | TRI->getRegClassForOperandReg(MRI: *MRI, MO: MI->getOperand(i: 1)); |
1086 | size_t SrcSize = TRI->getRegSizeInBits(RC: *SrcRC); |
1087 | if (SrcSize == 16) { |
1088 | assert(MF.getSubtarget<GCNSubtarget>().useRealTrue16Insts() && |
1089 | "We do not expect to see 16-bit copies from VGPR to SGPR unless " |
"we have 16-bit VGPRs");
1091 | assert(MRI->getRegClass(DstReg) == &AMDGPU::SGPR_LO16RegClass || |
1092 | MRI->getRegClass(DstReg) == &AMDGPU::SReg_32RegClass || |
1093 | MRI->getRegClass(DstReg) == &AMDGPU::SReg_32_XM0RegClass); |
1094 | // There is no V_READFIRSTLANE_B16, so legalize the dst/src reg to 32 bits |
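// The sequence emitted below is, roughly:
//   %undef:vgpr_16 = IMPLICIT_DEF
//   %tmp:vgpr_32 = REG_SEQUENCE %src.sub:vgpr_16, %subreg.lo16,
//                               %undef, %subreg.hi16
//   %dst:sreg_32_xm0 = V_READFIRSTLANE_B32 %tmp
// (%tmp, %undef, %src, and %dst are illustrative names.)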
1095 | MRI->setRegClass(Reg: DstReg, RC: &AMDGPU::SReg_32_XM0RegClass); |
1096 | Register VReg32 = MRI->createVirtualRegister(RegClass: &AMDGPU::VGPR_32RegClass); |
1097 | const DebugLoc &DL = MI->getDebugLoc(); |
1098 | Register Undef = MRI->createVirtualRegister(RegClass: &AMDGPU::VGPR_16RegClass); |
1099 | BuildMI(BB&: *MBB, I: MI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::IMPLICIT_DEF), DestReg: Undef); |
1100 | BuildMI(BB&: *MBB, I: MI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::REG_SEQUENCE), DestReg: VReg32) |
1101 | .addReg(RegNo: SrcReg, flags: 0, SubReg) |
1102 | .addImm(Val: AMDGPU::lo16) |
1103 | .addReg(RegNo: Undef) |
1104 | .addImm(Val: AMDGPU::hi16); |
1105 | BuildMI(BB&: *MBB, I: MI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::V_READFIRSTLANE_B32), DestReg: DstReg) |
1106 | .addReg(RegNo: VReg32); |
1107 | } else if (SrcSize == 32) { |
1108 | auto MIB = BuildMI(BB&: *MBB, I: MI, MIMD: MI->getDebugLoc(), |
1109 | MCID: TII->get(Opcode: AMDGPU::V_READFIRSTLANE_B32), DestReg: DstReg); |
1110 | MIB.addReg(RegNo: SrcReg, flags: 0, SubReg); |
1111 | } else { |
1112 | auto Result = BuildMI(BB&: *MBB, I: MI, MIMD: MI->getDebugLoc(), |
1113 | MCID: TII->get(Opcode: AMDGPU::REG_SEQUENCE), DestReg: DstReg); |
1114 | int N = TRI->getRegSizeInBits(RC: *SrcRC) / 32; |
1115 | for (int i = 0; i < N; i++) { |
1116 | Register PartialSrc = TII->buildExtractSubReg( |
1117 | MI: Result, MRI&: *MRI, SuperReg: MI->getOperand(i: 1), SuperRC: SrcRC, |
1118 | SubIdx: TRI->getSubRegFromChannel(Channel: i), SubRC: &AMDGPU::VGPR_32RegClass); |
1119 | Register PartialDst = |
1120 | MRI->createVirtualRegister(RegClass: &AMDGPU::SReg_32_XM0RegClass); |
1121 | BuildMI(BB&: *MBB, I&: *Result, MIMD: Result->getDebugLoc(), |
1122 | MCID: TII->get(Opcode: AMDGPU::V_READFIRSTLANE_B32), DestReg: PartialDst) |
1123 | .addReg(RegNo: PartialSrc); |
1124 | Result.addReg(RegNo: PartialDst).addImm(Val: TRI->getSubRegFromChannel(Channel: i)); |
1125 | } |
1126 | } |
1127 | MI->eraseFromParent(); |
1128 | } |
1129 | } |
1130 | |
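// Expand COPYs to and from SCC: a copy from SCC becomes an S_CSELECT that
// materializes a full wave mask, and a copy to SCC becomes an S_AND of the
// source with EXEC, whose SCC def supplies the value.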
1131 | void SIFixSGPRCopies::fixSCCCopies(MachineFunction &MF) { |
1132 | bool IsWave32 = MF.getSubtarget<GCNSubtarget>().isWave32(); |
1133 | for (MachineBasicBlock &MBB : MF) { |
1134 | for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end(); I != E; |
1135 | ++I) { |
1136 | MachineInstr &MI = *I; |
1137 | // May already have been lowered. |
1138 | if (!MI.isCopy()) |
1139 | continue; |
1140 | Register SrcReg = MI.getOperand(i: 1).getReg(); |
1141 | Register DstReg = MI.getOperand(i: 0).getReg(); |
1142 | if (SrcReg == AMDGPU::SCC) { |
1143 | Register SCCCopy = |
1144 | MRI->createVirtualRegister(RegClass: TRI->getWaveMaskRegClass()); |
1145 | I = BuildMI(BB&: *MI.getParent(), I: std::next(x: MachineBasicBlock::iterator(MI)), |
1146 | MIMD: MI.getDebugLoc(), |
1147 | MCID: TII->get(Opcode: IsWave32 ? AMDGPU::S_CSELECT_B32 |
1148 | : AMDGPU::S_CSELECT_B64), |
1149 | DestReg: SCCCopy) |
1150 | .addImm(Val: -1) |
1151 | .addImm(Val: 0); |
1152 | I = BuildMI(BB&: *MI.getParent(), I: std::next(x: I), MIMD: I->getDebugLoc(), |
1153 | MCID: TII->get(Opcode: AMDGPU::COPY), DestReg: DstReg) |
1154 | .addReg(RegNo: SCCCopy); |
1155 | MI.eraseFromParent(); |
1156 | continue; |
1157 | } |
1158 | if (DstReg == AMDGPU::SCC) { |
1159 | unsigned Opcode = IsWave32 ? AMDGPU::S_AND_B32 : AMDGPU::S_AND_B64; |
1160 | Register Exec = IsWave32 ? AMDGPU::EXEC_LO : AMDGPU::EXEC; |
1161 | Register Tmp = MRI->createVirtualRegister(RegClass: TRI->getBoolRC()); |
1162 | I = BuildMI(BB&: *MI.getParent(), I: std::next(x: MachineBasicBlock::iterator(MI)), |
1163 | MIMD: MI.getDebugLoc(), MCID: TII->get(Opcode)) |
1164 | .addReg(RegNo: Tmp, flags: getDefRegState(B: true)) |
1165 | .addReg(RegNo: SrcReg) |
1166 | .addReg(RegNo: Exec); |
1167 | MI.eraseFromParent(); |
1168 | } |
1169 | } |
1170 | } |
1171 | } |
1172 | |
1173 | PreservedAnalyses |
1174 | SIFixSGPRCopiesPass::run(MachineFunction &MF, |
1175 | MachineFunctionAnalysisManager &MFAM) { |
1176 | MachineDominatorTree &MDT = MFAM.getResult<MachineDominatorTreeAnalysis>(IR&: MF); |
1177 | SIFixSGPRCopies Impl(&MDT); |
1178 | bool Changed = Impl.run(MF); |
1179 | if (!Changed) |
1180 | return PreservedAnalyses::all(); |
1181 | |
1182 | // TODO: We could detect CFG changed. |
1183 | auto PA = getMachineFunctionPassPreservedAnalyses(); |
1184 | return PA; |
1185 | } |
1186 | |