1 | //===- SIFixSGPRCopies.cpp - Remove potential VGPR => SGPR copies ---------===// |
2 | // |
3 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
4 | // See https://llvm.org/LICENSE.txt for license information. |
5 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
6 | // |
7 | //===----------------------------------------------------------------------===// |
8 | // |
9 | /// \file |
10 | /// Copies from VGPR to SGPR registers are illegal and the register coalescer |
11 | /// will sometimes generate these illegal copies in situations like this: |
12 | /// |
13 | /// Register Class <vsrc> is the union of <vgpr> and <sgpr> |
14 | /// |
15 | /// BB0: |
16 | /// %0 <sgpr> = SCALAR_INST |
17 | /// %1 <vsrc> = COPY %0 <sgpr> |
18 | /// ... |
19 | /// BRANCH %cond BB1, BB2 |
20 | /// BB1: |
21 | /// %2 <vgpr> = VECTOR_INST |
22 | /// %3 <vsrc> = COPY %2 <vgpr> |
23 | /// BB2: |
///   %4 <vsrc> = PHI %1 <vsrc>, <%bb.0>, %3 <vsrc>, <%bb.1>
25 | /// %5 <vgpr> = VECTOR_INST %4 <vsrc> |
26 | /// |
27 | /// |
28 | /// The coalescer will begin at BB0 and eliminate its copy, then the resulting |
29 | /// code will look like this: |
30 | /// |
31 | /// BB0: |
32 | /// %0 <sgpr> = SCALAR_INST |
33 | /// ... |
34 | /// BRANCH %cond BB1, BB2 |
35 | /// BB1: |
36 | /// %2 <vgpr> = VECTOR_INST |
37 | /// %3 <vsrc> = COPY %2 <vgpr> |
38 | /// BB2: |
39 | /// %4 <sgpr> = PHI %0 <sgpr>, <%bb.0>, %3 <vsrc>, <%bb.1> |
40 | /// %5 <vgpr> = VECTOR_INST %4 <sgpr> |
41 | /// |
/// Now that the result of the PHI instruction is an SGPR, the register
/// allocator is forced to constrain the register class of %3 to
/// <sgpr>, so we end up with final code like this:
45 | /// |
46 | /// BB0: |
47 | /// %0 <sgpr> = SCALAR_INST |
48 | /// ... |
49 | /// BRANCH %cond BB1, BB2 |
50 | /// BB1: |
51 | /// %2 <vgpr> = VECTOR_INST |
52 | /// %3 <sgpr> = COPY %2 <vgpr> |
53 | /// BB2: |
54 | /// %4 <sgpr> = PHI %0 <sgpr>, <%bb.0>, %3 <sgpr>, <%bb.1> |
55 | /// %5 <vgpr> = VECTOR_INST %4 <sgpr> |
56 | /// |
57 | /// Now this code contains an illegal copy from a VGPR to an SGPR. |
58 | /// |
/// To avoid this problem, this pass searches for PHI instructions that
/// define a <vsrc> register and constrains the definition class to <vgpr>
/// when a user of the PHI's result is a vector instruction. If the PHI's
/// definition class is constrained to <vgpr>, the coalescer is unable to
/// perform the COPY removal from the example above, which is what
/// ultimately led to the creation of the illegal COPY.
65 | //===----------------------------------------------------------------------===// |
66 | |
67 | #include "AMDGPU.h" |
68 | #include "GCNSubtarget.h" |
69 | #include "MCTargetDesc/AMDGPUMCTargetDesc.h" |
70 | #include "llvm/ADT/SetOperations.h" |
71 | #include "llvm/CodeGen/MachineDominators.h" |
72 | #include "llvm/InitializePasses.h" |
73 | #include "llvm/Target/TargetMachine.h" |
74 | |
75 | using namespace llvm; |
76 | |
77 | #define DEBUG_TYPE "si-fix-sgpr-copies" |
78 | |
static cl::opt<bool> EnableM0Merge(
    "amdgpu-enable-merge-m0",
    cl::desc("Merge and hoist M0 initializations"),
    cl::init(true));
83 | |
84 | namespace { |
85 | |
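// Bookkeeping for a single VGPR to SGPR copy: the SALU chain reachable from
// it and the quantities used to decide whether keeping that chain scalar
// (via v_readfirstlane_b32) is cheaper than moving it to the VALU.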
86 | class V2SCopyInfo { |
87 | public: |
88 | // VGPR to SGPR copy being processed |
89 | MachineInstr *Copy; |
90 | // All SALU instructions reachable from this copy in SSA graph |
91 | SetVector<MachineInstr *> SChain; |
92 | // Number of SGPR to VGPR copies that are used to put the SALU computation |
93 | // results back to VALU. |
94 | unsigned NumSVCopies; |
95 | |
96 | unsigned Score; |
// Number of v_readfirstlane_b32 instructions that would need to be
// inserted to keep the SChain on the SALU.
99 | unsigned NumReadfirstlanes; |
// Cached scoring decision, used to speed up selecting V2SCopyInfos for
// processing.
101 | bool NeedToBeConvertedToVALU = false; |
// Unique ID, used as a map key to keep a stable processing order.
103 | unsigned ID; |
104 | |
// Number of other VGPR to SGPR copies that contribute to the
// current copy's SChain.
107 | unsigned SiblingPenalty = 0; |
108 | SetVector<unsigned> Siblings; |
109 | V2SCopyInfo() : Copy(nullptr), ID(0){}; |
110 | V2SCopyInfo(unsigned Id, MachineInstr *C, unsigned Width) |
111 | : Copy(C), NumSVCopies(0), NumReadfirstlanes(Width / 32), ID(Id){}; |
112 | #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) |
113 | void dump() { |
114 | dbgs() << ID << " : " << *Copy << "\n\tS:" << SChain.size() |
115 | << "\n\tSV:" << NumSVCopies << "\n\tSP: " << SiblingPenalty |
116 | << "\nScore: " << Score << "\n" ; |
117 | } |
118 | #endif |
119 | }; |
120 | |
121 | class SIFixSGPRCopies : public MachineFunctionPass { |
122 | MachineDominatorTree *MDT; |
123 | SmallVector<MachineInstr*, 4> SCCCopies; |
124 | SmallVector<MachineInstr*, 4> RegSequences; |
125 | SmallVector<MachineInstr*, 4> PHINodes; |
126 | SmallVector<MachineInstr*, 4> S2VCopies; |
127 | unsigned NextVGPRToSGPRCopyID = 0; |
128 | MapVector<unsigned, V2SCopyInfo> V2SCopies; |
129 | DenseMap<MachineInstr *, SetVector<unsigned>> SiblingPenalty; |
130 | |
131 | public: |
132 | static char ID; |
133 | |
134 | MachineRegisterInfo *MRI; |
135 | const SIRegisterInfo *TRI; |
136 | const SIInstrInfo *TII; |
137 | |
138 | SIFixSGPRCopies() : MachineFunctionPass(ID) {} |
139 | |
140 | bool runOnMachineFunction(MachineFunction &MF) override; |
141 | void fixSCCCopies(MachineFunction &MF); |
142 | void prepareRegSequenceAndPHIs(MachineFunction &MF); |
143 | unsigned getNextVGPRToSGPRCopyId() { return ++NextVGPRToSGPRCopyID; } |
144 | bool needToBeConvertedToVALU(V2SCopyInfo *I); |
145 | void analyzeVGPRToSGPRCopy(MachineInstr *MI); |
146 | void lowerVGPR2SGPRCopies(MachineFunction &MF); |
// Handles copies whose source register is:
// 1. A physical register
// 2. An AGPR
// 3. Defined by an instruction that merely moves an immediate
151 | bool lowerSpecialCase(MachineInstr &MI, MachineBasicBlock::iterator &I); |
152 | |
153 | void processPHINode(MachineInstr &MI); |
154 | |
155 | // Check if MO is an immediate materialized into a VGPR, and if so replace it |
156 | // with an SGPR immediate. The VGPR immediate is also deleted if it does not |
157 | // have any other uses. |
158 | bool tryMoveVGPRConstToSGPR(MachineOperand &MO, Register NewDst, |
159 | MachineBasicBlock *BlockToInsertTo, |
160 | MachineBasicBlock::iterator PointToInsertTo); |
161 | |
162 | StringRef getPassName() const override { return "SI Fix SGPR copies" ; } |
163 | |
164 | void getAnalysisUsage(AnalysisUsage &AU) const override { |
165 | AU.addRequired<MachineDominatorTreeWrapperPass>(); |
166 | AU.addPreserved<MachineDominatorTreeWrapperPass>(); |
167 | AU.setPreservesCFG(); |
168 | MachineFunctionPass::getAnalysisUsage(AU); |
169 | } |
170 | }; |
171 | |
172 | } // end anonymous namespace |
173 | |
174 | INITIALIZE_PASS_BEGIN(SIFixSGPRCopies, DEBUG_TYPE, |
175 | "SI Fix SGPR copies" , false, false) |
176 | INITIALIZE_PASS_DEPENDENCY(MachineDominatorTreeWrapperPass) |
177 | INITIALIZE_PASS_END(SIFixSGPRCopies, DEBUG_TYPE, |
178 | "SI Fix SGPR copies" , false, false) |
179 | |
180 | char SIFixSGPRCopies::ID = 0; |
181 | |
182 | char &llvm::SIFixSGPRCopiesID = SIFixSGPRCopies::ID; |
183 | |
184 | FunctionPass *llvm::createSIFixSGPRCopiesPass() { |
185 | return new SIFixSGPRCopies(); |
186 | } |
187 | |
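// Return the {source, destination} register classes of a copy-like
// instruction, using the virtual register's class when available and the
// physical register's base class otherwise.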
188 | static std::pair<const TargetRegisterClass *, const TargetRegisterClass *> |
189 | getCopyRegClasses(const MachineInstr &Copy, |
190 | const SIRegisterInfo &TRI, |
191 | const MachineRegisterInfo &MRI) { |
192 | Register DstReg = Copy.getOperand(i: 0).getReg(); |
193 | Register SrcReg = Copy.getOperand(i: 1).getReg(); |
194 | |
195 | const TargetRegisterClass *SrcRC = SrcReg.isVirtual() |
196 | ? MRI.getRegClass(Reg: SrcReg) |
197 | : TRI.getPhysRegBaseClass(Reg: SrcReg); |
198 | |
199 | // We don't really care about the subregister here. |
200 | // SrcRC = TRI.getSubRegClass(SrcRC, Copy.getOperand(1).getSubReg()); |
201 | |
202 | const TargetRegisterClass *DstRC = DstReg.isVirtual() |
203 | ? MRI.getRegClass(Reg: DstReg) |
204 | : TRI.getPhysRegBaseClass(Reg: DstReg); |
205 | |
206 | return std::pair(SrcRC, DstRC); |
207 | } |
208 | |
209 | static bool isVGPRToSGPRCopy(const TargetRegisterClass *SrcRC, |
210 | const TargetRegisterClass *DstRC, |
211 | const SIRegisterInfo &TRI) { |
212 | return SrcRC != &AMDGPU::VReg_1RegClass && TRI.isSGPRClass(RC: DstRC) && |
213 | TRI.hasVectorRegisters(RC: SrcRC); |
214 | } |
215 | |
216 | static bool isSGPRToVGPRCopy(const TargetRegisterClass *SrcRC, |
217 | const TargetRegisterClass *DstRC, |
218 | const SIRegisterInfo &TRI) { |
219 | return DstRC != &AMDGPU::VReg_1RegClass && TRI.isSGPRClass(RC: SrcRC) && |
220 | TRI.hasVectorRegisters(RC: DstRC); |
221 | } |
222 | |
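// Try to turn an SGPR to VGPR copy into an SGPR to SGPR copy by giving its
// destination an SGPR register class. This is only done when every user of
// the destination is a target instruction in the same block that accepts an
// SGPR in that operand. Returns true if the register class was changed.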
223 | static bool tryChangeVGPRtoSGPRinCopy(MachineInstr &MI, |
224 | const SIRegisterInfo *TRI, |
225 | const SIInstrInfo *TII) { |
226 | MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo(); |
227 | auto &Src = MI.getOperand(i: 1); |
228 | Register DstReg = MI.getOperand(i: 0).getReg(); |
229 | Register SrcReg = Src.getReg(); |
230 | if (!SrcReg.isVirtual() || !DstReg.isVirtual()) |
231 | return false; |
232 | |
233 | for (const auto &MO : MRI.reg_nodbg_operands(Reg: DstReg)) { |
234 | const auto *UseMI = MO.getParent(); |
235 | if (UseMI == &MI) |
236 | continue; |
237 | if (MO.isDef() || UseMI->getParent() != MI.getParent() || |
238 | UseMI->getOpcode() <= TargetOpcode::GENERIC_OP_END) |
239 | return false; |
240 | |
241 | unsigned OpIdx = MO.getOperandNo(); |
242 | if (OpIdx >= UseMI->getDesc().getNumOperands() || |
243 | !TII->isOperandLegal(MI: *UseMI, OpIdx, MO: &Src)) |
244 | return false; |
245 | } |
246 | // Change VGPR to SGPR destination. |
247 | MRI.setRegClass(Reg: DstReg, RC: TRI->getEquivalentSGPRClass(VRC: MRI.getRegClass(Reg: DstReg))); |
248 | return true; |
249 | } |
250 | |
251 | // Distribute an SGPR->VGPR copy of a REG_SEQUENCE into a VGPR REG_SEQUENCE. |
252 | // |
253 | // SGPRx = ... |
254 | // SGPRy = REG_SEQUENCE SGPRx, sub0 ... |
255 | // VGPRz = COPY SGPRy |
256 | // |
257 | // ==> |
258 | // |
259 | // VGPRx = COPY SGPRx |
260 | // VGPRz = REG_SEQUENCE VGPRx, sub0 |
261 | // |
262 | // This exposes immediate folding opportunities when materializing 64-bit |
263 | // immediates. |
264 | static bool foldVGPRCopyIntoRegSequence(MachineInstr &MI, |
265 | const SIRegisterInfo *TRI, |
266 | const SIInstrInfo *TII, |
267 | MachineRegisterInfo &MRI) { |
268 | assert(MI.isRegSequence()); |
269 | |
270 | Register DstReg = MI.getOperand(i: 0).getReg(); |
271 | if (!TRI->isSGPRClass(RC: MRI.getRegClass(Reg: DstReg))) |
272 | return false; |
273 | |
274 | if (!MRI.hasOneUse(RegNo: DstReg)) |
275 | return false; |
276 | |
277 | MachineInstr &CopyUse = *MRI.use_instr_begin(RegNo: DstReg); |
278 | if (!CopyUse.isCopy()) |
279 | return false; |
280 | |
281 | // It is illegal to have vreg inputs to a physreg defining reg_sequence. |
282 | if (CopyUse.getOperand(i: 0).getReg().isPhysical()) |
283 | return false; |
284 | |
285 | const TargetRegisterClass *SrcRC, *DstRC; |
286 | std::tie(args&: SrcRC, args&: DstRC) = getCopyRegClasses(Copy: CopyUse, TRI: *TRI, MRI); |
287 | |
288 | if (!isSGPRToVGPRCopy(SrcRC, DstRC, TRI: *TRI)) |
289 | return false; |
290 | |
291 | if (tryChangeVGPRtoSGPRinCopy(MI&: CopyUse, TRI, TII)) |
292 | return true; |
293 | |
294 | // TODO: Could have multiple extracts? |
295 | unsigned SubReg = CopyUse.getOperand(i: 1).getSubReg(); |
296 | if (SubReg != AMDGPU::NoSubRegister) |
297 | return false; |
298 | |
299 | MRI.setRegClass(Reg: DstReg, RC: DstRC); |
300 | |
301 | // SGPRx = ... |
302 | // SGPRy = REG_SEQUENCE SGPRx, sub0 ... |
303 | // VGPRz = COPY SGPRy |
304 | |
305 | // => |
306 | // VGPRx = COPY SGPRx |
307 | // VGPRz = REG_SEQUENCE VGPRx, sub0 |
308 | |
309 | MI.getOperand(i: 0).setReg(CopyUse.getOperand(i: 0).getReg()); |
310 | bool IsAGPR = TRI->isAGPRClass(RC: DstRC); |
311 | |
312 | for (unsigned I = 1, N = MI.getNumOperands(); I != N; I += 2) { |
313 | const TargetRegisterClass *SrcRC = |
314 | TRI->getRegClassForOperandReg(MRI, MO: MI.getOperand(i: I)); |
315 | assert(TRI->isSGPRClass(SrcRC) && |
316 | "Expected SGPR REG_SEQUENCE to only have SGPR inputs" ); |
317 | const TargetRegisterClass *NewSrcRC = TRI->getEquivalentVGPRClass(SRC: SrcRC); |
318 | |
319 | Register TmpReg = MRI.createVirtualRegister(RegClass: NewSrcRC); |
320 | |
321 | BuildMI(BB&: *MI.getParent(), I: &MI, MIMD: MI.getDebugLoc(), MCID: TII->get(Opcode: AMDGPU::COPY), |
322 | DestReg: TmpReg) |
323 | .add(MO: MI.getOperand(i: I)); |
324 | |
325 | if (IsAGPR) { |
326 | const TargetRegisterClass *NewSrcRC = TRI->getEquivalentAGPRClass(SRC: SrcRC); |
327 | Register TmpAReg = MRI.createVirtualRegister(RegClass: NewSrcRC); |
328 | unsigned Opc = NewSrcRC == &AMDGPU::AGPR_32RegClass ? |
329 | AMDGPU::V_ACCVGPR_WRITE_B32_e64 : AMDGPU::COPY; |
330 | BuildMI(BB&: *MI.getParent(), I: &MI, MIMD: MI.getDebugLoc(), MCID: TII->get(Opcode: Opc), |
331 | DestReg: TmpAReg) |
332 | .addReg(RegNo: TmpReg, flags: RegState::Kill); |
333 | TmpReg = TmpAReg; |
334 | } |
335 | |
336 | MI.getOperand(i: I).setReg(TmpReg); |
337 | } |
338 | |
339 | CopyUse.eraseFromParent(); |
340 | return true; |
341 | } |
342 | |
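// Check whether a COPY whose source is defined by a V_MOV of an immediate
// can instead be rewritten as a scalar move of that immediate. On success,
// SMovOp receives the S_MOV opcode to use and Imm the immediate value.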
343 | static bool isSafeToFoldImmIntoCopy(const MachineInstr *Copy, |
344 | const MachineInstr *MoveImm, |
345 | const SIInstrInfo *TII, |
346 | unsigned &SMovOp, |
347 | int64_t &Imm) { |
348 | if (Copy->getOpcode() != AMDGPU::COPY) |
349 | return false; |
350 | |
351 | if (!MoveImm->isMoveImmediate()) |
352 | return false; |
353 | |
354 | const MachineOperand *ImmOp = |
355 | TII->getNamedOperand(MI: *MoveImm, OpName: AMDGPU::OpName::src0); |
356 | if (!ImmOp->isImm()) |
357 | return false; |
358 | |
359 | // FIXME: Handle copies with sub-regs. |
360 | if (Copy->getOperand(i: 1).getSubReg()) |
361 | return false; |
362 | |
363 | switch (MoveImm->getOpcode()) { |
364 | default: |
365 | return false; |
366 | case AMDGPU::V_MOV_B32_e32: |
367 | SMovOp = AMDGPU::S_MOV_B32; |
368 | break; |
369 | case AMDGPU::V_MOV_B64_PSEUDO: |
370 | SMovOp = AMDGPU::S_MOV_B64_IMM_PSEUDO; |
371 | break; |
372 | } |
373 | Imm = ImmOp->getImm(); |
374 | return true; |
375 | } |
376 | |
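// Walk the predecessors of MBB, stopping at CutOff, and return true if
// Predicate is satisfied by any visited block.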
377 | template <class UnaryPredicate> |
378 | bool searchPredecessors(const MachineBasicBlock *MBB, |
379 | const MachineBasicBlock *CutOff, |
380 | UnaryPredicate Predicate) { |
381 | if (MBB == CutOff) |
382 | return false; |
383 | |
384 | DenseSet<const MachineBasicBlock *> Visited; |
385 | SmallVector<MachineBasicBlock *, 4> Worklist(MBB->predecessors()); |
386 | |
387 | while (!Worklist.empty()) { |
388 | MachineBasicBlock *MBB = Worklist.pop_back_val(); |
389 | |
390 | if (!Visited.insert(V: MBB).second) |
391 | continue; |
392 | if (MBB == CutOff) |
393 | continue; |
394 | if (Predicate(MBB)) |
395 | return true; |
396 | |
397 | Worklist.append(in_start: MBB->pred_begin(), in_end: MBB->pred_end()); |
398 | } |
399 | |
400 | return false; |
401 | } |
402 | |
// Check whether there is a potential path from instruction From to
// instruction To. If CutOff is specified and lies on that path, the portion
// of the path above it is ignored and To is reported as not reachable.
406 | static bool isReachable(const MachineInstr *From, |
407 | const MachineInstr *To, |
408 | const MachineBasicBlock *CutOff, |
409 | MachineDominatorTree &MDT) { |
410 | if (MDT.dominates(A: From, B: To)) |
411 | return true; |
412 | |
413 | const MachineBasicBlock *MBBFrom = From->getParent(); |
414 | const MachineBasicBlock *MBBTo = To->getParent(); |
415 | |
416 | // Do predecessor search. |
417 | // We should almost never get here since we do not usually produce M0 stores |
418 | // other than -1. |
419 | return searchPredecessors(MBB: MBBTo, CutOff, Predicate: [MBBFrom] |
420 | (const MachineBasicBlock *MBB) { return MBB == MBBFrom; }); |
421 | } |
422 | |
423 | // Return the first non-prologue instruction in the block. |
424 | static MachineBasicBlock::iterator |
425 | getFirstNonPrologue(MachineBasicBlock *MBB, const TargetInstrInfo *TII) { |
426 | MachineBasicBlock::iterator I = MBB->getFirstNonPHI(); |
427 | while (I != MBB->end() && TII->isBasicBlockPrologue(MI: *I)) |
428 | ++I; |
429 | |
430 | return I; |
431 | } |
432 | |
433 | // Hoist and merge identical SGPR initializations into a common predecessor. |
434 | // This is intended to combine M0 initializations, but can work with any |
435 | // SGPR. A VGPR cannot be processed since we cannot guarantee vector |
// execution.
437 | static bool hoistAndMergeSGPRInits(unsigned Reg, |
438 | const MachineRegisterInfo &MRI, |
439 | const TargetRegisterInfo *TRI, |
440 | MachineDominatorTree &MDT, |
441 | const TargetInstrInfo *TII) { |
442 | // List of inits by immediate value. |
443 | using InitListMap = std::map<unsigned, std::list<MachineInstr *>>; |
444 | InitListMap Inits; |
445 | // List of clobbering instructions. |
446 | SmallVector<MachineInstr*, 8> Clobbers; |
447 | // List of instructions marked for deletion. |
448 | SmallSet<MachineInstr*, 8> MergedInstrs; |
449 | |
450 | bool Changed = false; |
451 | |
452 | for (auto &MI : MRI.def_instructions(Reg)) { |
453 | MachineOperand *Imm = nullptr; |
454 | for (auto &MO : MI.operands()) { |
455 | if ((MO.isReg() && ((MO.isDef() && MO.getReg() != Reg) || !MO.isDef())) || |
456 | (!MO.isImm() && !MO.isReg()) || (MO.isImm() && Imm)) { |
457 | Imm = nullptr; |
458 | break; |
459 | } |
460 | if (MO.isImm()) |
461 | Imm = &MO; |
462 | } |
463 | if (Imm) |
464 | Inits[Imm->getImm()].push_front(x: &MI); |
465 | else |
466 | Clobbers.push_back(Elt: &MI); |
467 | } |
468 | |
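// For every pair of identical initializations, either delete the dominated
// one, or hoist one of them to the nearest common dominator and delete the
// other, provided no clobbering definition or differently-valued
// initialization can interfere along the way.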
469 | for (auto &Init : Inits) { |
470 | auto &Defs = Init.second; |
471 | |
472 | for (auto I1 = Defs.begin(), E = Defs.end(); I1 != E; ) { |
473 | MachineInstr *MI1 = *I1; |
474 | |
475 | for (auto I2 = std::next(x: I1); I2 != E; ) { |
476 | MachineInstr *MI2 = *I2; |
477 | |
478 | // Check any possible interference |
479 | auto interferes = [&](MachineBasicBlock::iterator From, |
480 | MachineBasicBlock::iterator To) -> bool { |
481 | |
482 | assert(MDT.dominates(&*To, &*From)); |
483 | |
484 | auto interferes = [&MDT, From, To](MachineInstr* &Clobber) -> bool { |
485 | const MachineBasicBlock *MBBFrom = From->getParent(); |
486 | const MachineBasicBlock *MBBTo = To->getParent(); |
487 | bool MayClobberFrom = isReachable(From: Clobber, To: &*From, CutOff: MBBTo, MDT); |
488 | bool MayClobberTo = isReachable(From: Clobber, To: &*To, CutOff: MBBTo, MDT); |
489 | if (!MayClobberFrom && !MayClobberTo) |
490 | return false; |
491 | if ((MayClobberFrom && !MayClobberTo) || |
492 | (!MayClobberFrom && MayClobberTo)) |
493 | return true; |
// Both may be clobbered. This is not an interference only if both
// From and To are in the same block and dominated by Clobber, or if
// Clobber's block properly dominates To's block; since To dominates
// From, the clobber then dominates both and sits in a common
// dominator.
498 | return !((MBBFrom == MBBTo && |
499 | MDT.dominates(A: Clobber, B: &*From) && |
500 | MDT.dominates(A: Clobber, B: &*To)) || |
501 | MDT.properlyDominates(A: Clobber->getParent(), B: MBBTo)); |
502 | }; |
503 | |
504 | return (llvm::any_of(Range&: Clobbers, P: interferes)) || |
505 | (llvm::any_of(Range&: Inits, P: [&](InitListMap::value_type &C) { |
506 | return C.first != Init.first && |
507 | llvm::any_of(Range&: C.second, P: interferes); |
508 | })); |
509 | }; |
510 | |
511 | if (MDT.dominates(A: MI1, B: MI2)) { |
512 | if (!interferes(MI2, MI1)) { |
513 | LLVM_DEBUG(dbgs() |
514 | << "Erasing from " |
515 | << printMBBReference(*MI2->getParent()) << " " << *MI2); |
516 | MergedInstrs.insert(Ptr: MI2); |
517 | Changed = true; |
518 | ++I2; |
519 | continue; |
520 | } |
521 | } else if (MDT.dominates(A: MI2, B: MI1)) { |
522 | if (!interferes(MI1, MI2)) { |
523 | LLVM_DEBUG(dbgs() |
524 | << "Erasing from " |
525 | << printMBBReference(*MI1->getParent()) << " " << *MI1); |
526 | MergedInstrs.insert(Ptr: MI1); |
527 | Changed = true; |
528 | ++I1; |
529 | break; |
530 | } |
531 | } else { |
532 | auto *MBB = MDT.findNearestCommonDominator(A: MI1->getParent(), |
533 | B: MI2->getParent()); |
534 | if (!MBB) { |
535 | ++I2; |
536 | continue; |
537 | } |
538 | |
539 | MachineBasicBlock::iterator I = getFirstNonPrologue(MBB, TII); |
540 | if (!interferes(MI1, I) && !interferes(MI2, I)) { |
541 | LLVM_DEBUG(dbgs() |
542 | << "Erasing from " |
543 | << printMBBReference(*MI1->getParent()) << " " << *MI1 |
544 | << "and moving from " |
545 | << printMBBReference(*MI2->getParent()) << " to " |
546 | << printMBBReference(*I->getParent()) << " " << *MI2); |
547 | I->getParent()->splice(Where: I, Other: MI2->getParent(), From: MI2); |
548 | MergedInstrs.insert(Ptr: MI1); |
549 | Changed = true; |
550 | ++I1; |
551 | break; |
552 | } |
553 | } |
554 | ++I2; |
555 | } |
556 | ++I1; |
557 | } |
558 | } |
559 | |
560 | // Remove initializations that were merged into another. |
561 | for (auto &Init : Inits) { |
562 | auto &Defs = Init.second; |
563 | auto I = Defs.begin(); |
564 | while (I != Defs.end()) { |
565 | if (MergedInstrs.count(Ptr: *I)) { |
566 | (*I)->eraseFromParent(); |
567 | I = Defs.erase(position: I); |
568 | } else |
569 | ++I; |
570 | } |
571 | } |
572 | |
573 | // Try to schedule SGPR initializations as early as possible in the MBB. |
574 | for (auto &Init : Inits) { |
575 | auto &Defs = Init.second; |
576 | for (auto *MI : Defs) { |
577 | auto MBB = MI->getParent(); |
578 | MachineInstr &BoundaryMI = *getFirstNonPrologue(MBB, TII); |
579 | MachineBasicBlock::reverse_iterator B(BoundaryMI); |
// Check if B should actually be a boundary. If not, set the previous
// instruction as the boundary instead.
582 | if (!TII->isBasicBlockPrologue(MI: *B)) |
583 | B++; |
584 | |
585 | auto R = std::next(x: MI->getReverseIterator()); |
586 | const unsigned Threshold = 50; |
587 | // Search until B or Threshold for a place to insert the initialization. |
588 | for (unsigned I = 0; R != B && I < Threshold; ++R, ++I) |
589 | if (R->readsRegister(Reg, TRI) || R->definesRegister(Reg, TRI) || |
590 | TII->isSchedulingBoundary(MI: *R, MBB, MF: *MBB->getParent())) |
591 | break; |
592 | |
593 | // Move to directly after R. |
594 | if (&*--R != MI) |
595 | MBB->splice(Where: *R, Other: MBB, From: MI); |
596 | } |
597 | } |
598 | |
599 | if (Changed) |
600 | MRI.clearKillFlags(Reg); |
601 | |
602 | return Changed; |
603 | } |
604 | |
605 | bool SIFixSGPRCopies::runOnMachineFunction(MachineFunction &MF) { |
// Only need to run this for the SelectionDAG path.
607 | if (MF.getProperties().hasProperty( |
608 | P: MachineFunctionProperties::Property::Selected)) |
609 | return false; |
610 | |
611 | const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); |
612 | MRI = &MF.getRegInfo(); |
613 | TRI = ST.getRegisterInfo(); |
614 | TII = ST.getInstrInfo(); |
615 | MDT = &getAnalysis<MachineDominatorTreeWrapperPass>().getDomTree(); |
616 | |
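// First sweep over the function: classify COPY-like instructions, PHIs,
// REG_SEQUENCEs and V_WRITELANE_B32s. VGPR to SGPR copies are analyzed and
// scored here; PHIs, REG_SEQUENCEs and SGPR to VGPR copies that could not be
// fixed immediately are remembered and revisited after the VGPR to SGPR
// copies have been lowered.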
617 | for (MachineBasicBlock &MBB : MF) { |
618 | for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end(); I != E; |
619 | ++I) { |
620 | MachineInstr &MI = *I; |
621 | |
622 | switch (MI.getOpcode()) { |
623 | default: |
624 | continue; |
625 | case AMDGPU::COPY: |
626 | case AMDGPU::WQM: |
627 | case AMDGPU::STRICT_WQM: |
628 | case AMDGPU::SOFT_WQM: |
629 | case AMDGPU::STRICT_WWM: { |
630 | const TargetRegisterClass *SrcRC, *DstRC; |
631 | std::tie(args&: SrcRC, args&: DstRC) = getCopyRegClasses(Copy: MI, TRI: *TRI, MRI: *MRI); |
632 | |
633 | if (isSGPRToVGPRCopy(SrcRC, DstRC, TRI: *TRI)) { |
// Since SGPR to VGPR copies affect the VGPR to SGPR copy score
// and, hence, the lowering decision, try to get rid of them as
// early as possible.
637 | if (tryChangeVGPRtoSGPRinCopy(MI, TRI, TII)) |
638 | continue; |
639 | |
640 | // Collect those not changed to try them after VGPR to SGPR copies |
641 | // lowering as there will be more opportunities. |
642 | S2VCopies.push_back(Elt: &MI); |
643 | } |
644 | if (!isVGPRToSGPRCopy(SrcRC, DstRC, TRI: *TRI)) |
645 | continue; |
646 | if (lowerSpecialCase(MI, I)) |
647 | continue; |
648 | |
649 | analyzeVGPRToSGPRCopy(MI: &MI); |
650 | |
651 | break; |
652 | } |
653 | case AMDGPU::INSERT_SUBREG: |
654 | case AMDGPU::PHI: |
655 | case AMDGPU::REG_SEQUENCE: { |
656 | if (TRI->isSGPRClass(RC: TII->getOpRegClass(MI, OpNo: 0))) { |
657 | for (MachineOperand &MO : MI.operands()) { |
658 | if (!MO.isReg() || !MO.getReg().isVirtual()) |
659 | continue; |
660 | const TargetRegisterClass *SrcRC = MRI->getRegClass(Reg: MO.getReg()); |
661 | if (TRI->hasVectorRegisters(RC: SrcRC)) { |
662 | const TargetRegisterClass *DestRC = |
663 | TRI->getEquivalentSGPRClass(VRC: SrcRC); |
664 | Register NewDst = MRI->createVirtualRegister(RegClass: DestRC); |
665 | MachineBasicBlock *BlockToInsertCopy = |
666 | MI.isPHI() ? MI.getOperand(i: MO.getOperandNo() + 1).getMBB() |
667 | : &MBB; |
668 | MachineBasicBlock::iterator PointToInsertCopy = |
669 | MI.isPHI() ? BlockToInsertCopy->getFirstInstrTerminator() : I; |
670 | |
671 | if (!tryMoveVGPRConstToSGPR(MO, NewDst, BlockToInsertTo: BlockToInsertCopy, |
672 | PointToInsertTo: PointToInsertCopy)) { |
673 | MachineInstr *NewCopy = |
674 | BuildMI(BB&: *BlockToInsertCopy, I: PointToInsertCopy, |
675 | MIMD: PointToInsertCopy->getDebugLoc(), |
676 | MCID: TII->get(Opcode: AMDGPU::COPY), DestReg: NewDst) |
677 | .addReg(RegNo: MO.getReg()); |
678 | MO.setReg(NewDst); |
679 | analyzeVGPRToSGPRCopy(MI: NewCopy); |
680 | } |
681 | } |
682 | } |
683 | } |
684 | |
685 | if (MI.isPHI()) |
686 | PHINodes.push_back(Elt: &MI); |
687 | else if (MI.isRegSequence()) |
688 | RegSequences.push_back(Elt: &MI); |
689 | |
690 | break; |
691 | } |
692 | case AMDGPU::V_WRITELANE_B32: { |
693 | // Some architectures allow more than one constant bus access without |
694 | // SGPR restriction |
695 | if (ST.getConstantBusLimit(Opcode: MI.getOpcode()) != 1) |
696 | break; |
697 | |
698 | // Writelane is special in that it can use SGPR and M0 (which would |
699 | // normally count as using the constant bus twice - but in this case it |
700 | // is allowed since the lane selector doesn't count as a use of the |
701 | // constant bus). However, it is still required to abide by the 1 SGPR |
702 | // rule. Apply a fix here as we might have multiple SGPRs after |
703 | // legalizing VGPRs to SGPRs |
704 | int Src0Idx = |
705 | AMDGPU::getNamedOperandIdx(Opcode: MI.getOpcode(), NamedIdx: AMDGPU::OpName::src0); |
706 | int Src1Idx = |
707 | AMDGPU::getNamedOperandIdx(Opcode: MI.getOpcode(), NamedIdx: AMDGPU::OpName::src1); |
708 | MachineOperand &Src0 = MI.getOperand(i: Src0Idx); |
709 | MachineOperand &Src1 = MI.getOperand(i: Src1Idx); |
710 | |
711 | // Check to see if the instruction violates the 1 SGPR rule |
712 | if ((Src0.isReg() && TRI->isSGPRReg(MRI: *MRI, Reg: Src0.getReg()) && |
713 | Src0.getReg() != AMDGPU::M0) && |
714 | (Src1.isReg() && TRI->isSGPRReg(MRI: *MRI, Reg: Src1.getReg()) && |
715 | Src1.getReg() != AMDGPU::M0)) { |
716 | |
// Check for a trivially easy constant prop into one of the operands.
// If so, perform it now to resolve the SGPR issue; otherwise we will
// always insert a mov to m0 that cannot be resolved by the later
// operand-folding pass.
721 | bool Resolved = false; |
722 | for (MachineOperand *MO : {&Src0, &Src1}) { |
723 | if (MO->getReg().isVirtual()) { |
724 | MachineInstr *DefMI = MRI->getVRegDef(Reg: MO->getReg()); |
725 | if (DefMI && TII->isFoldableCopy(MI: *DefMI)) { |
726 | const MachineOperand &Def = DefMI->getOperand(i: 0); |
727 | if (Def.isReg() && |
728 | MO->getReg() == Def.getReg() && |
729 | MO->getSubReg() == Def.getSubReg()) { |
730 | const MachineOperand &Copied = DefMI->getOperand(i: 1); |
731 | if (Copied.isImm() && |
732 | TII->isInlineConstant(Imm: APInt(64, Copied.getImm(), true))) { |
733 | MO->ChangeToImmediate(ImmVal: Copied.getImm()); |
734 | Resolved = true; |
735 | break; |
736 | } |
737 | } |
738 | } |
739 | } |
740 | } |
741 | |
742 | if (!Resolved) { |
743 | // Haven't managed to resolve by replacing an SGPR with an immediate |
744 | // Move src1 to be in M0 |
745 | BuildMI(BB&: *MI.getParent(), I&: MI, MIMD: MI.getDebugLoc(), |
746 | MCID: TII->get(Opcode: AMDGPU::COPY), DestReg: AMDGPU::M0) |
747 | .add(MO: Src1); |
748 | Src1.ChangeToRegister(Reg: AMDGPU::M0, isDef: false); |
749 | } |
750 | } |
751 | break; |
752 | } |
753 | } |
754 | } |
755 | } |
756 | |
757 | lowerVGPR2SGPRCopies(MF); |
758 | // Postprocessing |
759 | fixSCCCopies(MF); |
760 | for (auto MI : S2VCopies) { |
761 | // Check if it is still valid |
762 | if (MI->isCopy()) { |
763 | const TargetRegisterClass *SrcRC, *DstRC; |
764 | std::tie(args&: SrcRC, args&: DstRC) = getCopyRegClasses(Copy: *MI, TRI: *TRI, MRI: *MRI); |
765 | if (isSGPRToVGPRCopy(SrcRC, DstRC, TRI: *TRI)) |
766 | tryChangeVGPRtoSGPRinCopy(MI&: *MI, TRI, TII); |
767 | } |
768 | } |
769 | for (auto MI : RegSequences) { |
770 | // Check if it is still valid |
771 | if (MI->isRegSequence()) |
772 | foldVGPRCopyIntoRegSequence(MI&: *MI, TRI, TII, MRI&: *MRI); |
773 | } |
774 | for (auto MI : PHINodes) { |
775 | processPHINode(MI&: *MI); |
776 | } |
777 | if (MF.getTarget().getOptLevel() > CodeGenOptLevel::None && EnableM0Merge) |
778 | hoistAndMergeSGPRInits(Reg: AMDGPU::M0, MRI: *MRI, TRI, MDT&: *MDT, TII); |
779 | |
780 | SiblingPenalty.clear(); |
781 | V2SCopies.clear(); |
782 | SCCCopies.clear(); |
783 | RegSequences.clear(); |
784 | PHINodes.clear(); |
785 | S2VCopies.clear(); |
786 | |
787 | return true; |
788 | } |
789 | |
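// Fix a PHI produced by instruction selection: PHIs whose results are used
// only as AGPRs are moved to an AGPR register class (and the change is
// propagated to PHI operands that are themselves PHIs), and PHIs producing
// vector registers get their operands legalized.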
790 | void SIFixSGPRCopies::processPHINode(MachineInstr &MI) { |
791 | bool AllAGPRUses = true; |
792 | SetVector<const MachineInstr *> worklist; |
793 | SmallSet<const MachineInstr *, 4> Visited; |
794 | SetVector<MachineInstr *> PHIOperands; |
795 | worklist.insert(X: &MI); |
796 | Visited.insert(Ptr: &MI); |
797 | // HACK to make MIR tests with no uses happy |
798 | bool HasUses = false; |
799 | while (!worklist.empty()) { |
800 | const MachineInstr *Instr = worklist.pop_back_val(); |
801 | Register Reg = Instr->getOperand(i: 0).getReg(); |
802 | for (const auto &Use : MRI->use_operands(Reg)) { |
803 | HasUses = true; |
804 | const MachineInstr *UseMI = Use.getParent(); |
805 | AllAGPRUses &= (UseMI->isCopy() && |
806 | TRI->isAGPR(MRI: *MRI, Reg: UseMI->getOperand(i: 0).getReg())) || |
807 | TRI->isAGPR(MRI: *MRI, Reg: Use.getReg()); |
808 | if (UseMI->isCopy() || UseMI->isRegSequence()) { |
809 | if (Visited.insert(Ptr: UseMI).second) |
810 | worklist.insert(X: UseMI); |
811 | |
812 | continue; |
813 | } |
814 | } |
815 | } |
816 | |
817 | Register PHIRes = MI.getOperand(i: 0).getReg(); |
818 | const TargetRegisterClass *RC0 = MRI->getRegClass(Reg: PHIRes); |
819 | if (HasUses && AllAGPRUses && !TRI->isAGPRClass(RC: RC0)) { |
820 | LLVM_DEBUG(dbgs() << "Moving PHI to AGPR: " << MI); |
821 | MRI->setRegClass(Reg: PHIRes, RC: TRI->getEquivalentAGPRClass(SRC: RC0)); |
822 | for (unsigned I = 1, N = MI.getNumOperands(); I != N; I += 2) { |
823 | MachineInstr *DefMI = MRI->getVRegDef(Reg: MI.getOperand(i: I).getReg()); |
824 | if (DefMI && DefMI->isPHI()) |
825 | PHIOperands.insert(X: DefMI); |
826 | } |
827 | } |
828 | |
829 | if (TRI->isVectorRegister(MRI: *MRI, Reg: PHIRes) || |
830 | RC0 == &AMDGPU::VReg_1RegClass) { |
831 | LLVM_DEBUG(dbgs() << "Legalizing PHI: " << MI); |
832 | TII->legalizeOperands(MI, MDT); |
833 | } |
834 | |
835 | // Propagate register class back to PHI operands which are PHI themselves. |
836 | while (!PHIOperands.empty()) { |
837 | processPHINode(MI&: *PHIOperands.pop_back_val()); |
838 | } |
839 | } |
840 | |
841 | bool SIFixSGPRCopies::tryMoveVGPRConstToSGPR( |
842 | MachineOperand &MaybeVGPRConstMO, Register DstReg, |
843 | MachineBasicBlock *BlockToInsertTo, |
844 | MachineBasicBlock::iterator PointToInsertTo) { |
845 | |
846 | MachineInstr *DefMI = MRI->getVRegDef(Reg: MaybeVGPRConstMO.getReg()); |
847 | if (!DefMI || !DefMI->isMoveImmediate()) |
848 | return false; |
849 | |
850 | MachineOperand *SrcConst = TII->getNamedOperand(MI&: *DefMI, OperandName: AMDGPU::OpName::src0); |
851 | if (SrcConst->isReg()) |
852 | return false; |
853 | |
854 | const TargetRegisterClass *SrcRC = |
855 | MRI->getRegClass(Reg: MaybeVGPRConstMO.getReg()); |
856 | unsigned MoveSize = TRI->getRegSizeInBits(RC: *SrcRC); |
857 | unsigned MoveOp = MoveSize == 64 ? AMDGPU::S_MOV_B64 : AMDGPU::S_MOV_B32; |
858 | BuildMI(BB&: *BlockToInsertTo, I: PointToInsertTo, MIMD: PointToInsertTo->getDebugLoc(), |
859 | MCID: TII->get(Opcode: MoveOp), DestReg: DstReg) |
860 | .add(MO: *SrcConst); |
861 | if (MRI->hasOneUse(RegNo: MaybeVGPRConstMO.getReg())) |
862 | DefMI->eraseFromParent(); |
863 | MaybeVGPRConstMO.setReg(DstReg); |
864 | return true; |
865 | } |
866 | |
867 | bool SIFixSGPRCopies::lowerSpecialCase(MachineInstr &MI, |
868 | MachineBasicBlock::iterator &I) { |
869 | Register DstReg = MI.getOperand(i: 0).getReg(); |
870 | Register SrcReg = MI.getOperand(i: 1).getReg(); |
871 | if (!DstReg.isVirtual()) { |
872 | // If the destination register is a physical register there isn't |
873 | // really much we can do to fix this. |
874 | // Some special instructions use M0 as an input. Some even only use |
875 | // the first lane. Insert a readfirstlane and hope for the best. |
876 | if (DstReg == AMDGPU::M0 && |
877 | TRI->hasVectorRegisters(RC: MRI->getRegClass(Reg: SrcReg))) { |
878 | Register TmpReg = |
879 | MRI->createVirtualRegister(RegClass: &AMDGPU::SReg_32_XM0RegClass); |
880 | BuildMI(BB&: *MI.getParent(), I&: MI, MIMD: MI.getDebugLoc(), |
881 | MCID: TII->get(Opcode: AMDGPU::V_READFIRSTLANE_B32), DestReg: TmpReg) |
882 | .add(MO: MI.getOperand(i: 1)); |
883 | MI.getOperand(i: 1).setReg(TmpReg); |
884 | } else if (tryMoveVGPRConstToSGPR(MaybeVGPRConstMO&: MI.getOperand(i: 1), DstReg, BlockToInsertTo: MI.getParent(), |
885 | PointToInsertTo: MI)) { |
886 | I = std::next(x: I); |
887 | MI.eraseFromParent(); |
888 | } |
889 | return true; |
890 | } |
891 | if (!SrcReg.isVirtual() || TRI->isAGPR(MRI: *MRI, Reg: SrcReg)) { |
892 | SIInstrWorklist worklist; |
893 | worklist.insert(MI: &MI); |
894 | TII->moveToVALU(Worklist&: worklist, MDT); |
895 | return true; |
896 | } |
897 | |
898 | unsigned SMovOp; |
899 | int64_t Imm; |
900 | // If we are just copying an immediate, we can replace the copy with |
901 | // s_mov_b32. |
902 | if (isSafeToFoldImmIntoCopy(Copy: &MI, MoveImm: MRI->getVRegDef(Reg: SrcReg), TII, SMovOp, Imm)) { |
903 | MI.getOperand(i: 1).ChangeToImmediate(ImmVal: Imm); |
904 | MI.addImplicitDefUseOperands(MF&: *MI.getParent()->getParent()); |
905 | MI.setDesc(TII->get(Opcode: SMovOp)); |
906 | return true; |
907 | } |
908 | return false; |
909 | } |
910 | |
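// Walk the SSA graph starting at a VGPR to SGPR copy and record every SALU
// instruction reachable from it, together with the number of copies back to
// the VALU. The collected V2SCopyInfo is later scored by
// needToBeConvertedToVALU() to pick a lowering strategy.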
911 | void SIFixSGPRCopies::analyzeVGPRToSGPRCopy(MachineInstr* MI) { |
912 | Register DstReg = MI->getOperand(i: 0).getReg(); |
913 | const TargetRegisterClass *DstRC = MRI->getRegClass(Reg: DstReg); |
914 | |
915 | V2SCopyInfo Info(getNextVGPRToSGPRCopyId(), MI, |
916 | TRI->getRegSizeInBits(RC: *DstRC)); |
917 | SmallVector<MachineInstr *, 8> AnalysisWorklist; |
// Needed because the SSA is not a tree but a graph with forks and joins;
// we must not walk the same path twice.
920 | DenseSet<MachineInstr *> Visited; |
921 | AnalysisWorklist.push_back(Elt: Info.Copy); |
922 | while (!AnalysisWorklist.empty()) { |
923 | |
924 | MachineInstr *Inst = AnalysisWorklist.pop_back_val(); |
925 | |
926 | if (!Visited.insert(V: Inst).second) |
927 | continue; |
928 | |
// Copies and REG_SEQUENCEs do not contribute to the final assembly, so
// skip them, but take care of the SGPR to VGPR copy bookkeeping.
931 | if (Inst->isCopy() || Inst->isRegSequence()) { |
932 | if (TRI->isVGPR(MRI: *MRI, Reg: Inst->getOperand(i: 0).getReg())) { |
933 | if (!Inst->isCopy() || |
934 | !tryChangeVGPRtoSGPRinCopy(MI&: *Inst, TRI, TII)) { |
935 | Info.NumSVCopies++; |
936 | continue; |
937 | } |
938 | } |
939 | } |
940 | |
941 | SiblingPenalty[Inst].insert(X: Info.ID); |
942 | |
943 | SmallVector<MachineInstr *, 4> Users; |
944 | if ((TII->isSALU(MI: *Inst) && Inst->isCompare()) || |
945 | (Inst->isCopy() && Inst->getOperand(i: 0).getReg() == AMDGPU::SCC)) { |
946 | auto I = Inst->getIterator(); |
947 | auto E = Inst->getParent()->end(); |
948 | while (++I != E && |
949 | !I->findRegisterDefOperand(Reg: AMDGPU::SCC, /*TRI=*/nullptr)) { |
950 | if (I->readsRegister(Reg: AMDGPU::SCC, /*TRI=*/nullptr)) |
951 | Users.push_back(Elt: &*I); |
952 | } |
953 | } else if (Inst->getNumExplicitDefs() != 0) { |
954 | Register Reg = Inst->getOperand(i: 0).getReg(); |
955 | if (TRI->isSGPRReg(MRI: *MRI, Reg) && !TII->isVALU(MI: *Inst)) |
956 | for (auto &U : MRI->use_instructions(Reg)) |
957 | Users.push_back(Elt: &U); |
958 | } |
959 | for (auto U : Users) { |
960 | if (TII->isSALU(MI: *U)) |
961 | Info.SChain.insert(X: U); |
962 | AnalysisWorklist.push_back(Elt: U); |
963 | } |
964 | } |
965 | V2SCopies[Info.ID] = Info; |
966 | } |
967 | |
// Compute the VGPR to SGPR copy score and decide how the copy is lowered:
// kept scalar via v_readfirstlane_b32, or moved to the VALU.
970 | bool SIFixSGPRCopies::needToBeConvertedToVALU(V2SCopyInfo *Info) { |
971 | if (Info->SChain.empty()) { |
972 | Info->Score = 0; |
973 | return true; |
974 | } |
975 | Info->Siblings = SiblingPenalty[*llvm::max_element( |
976 | Range&: Info->SChain, C: [&](MachineInstr *A, MachineInstr *B) -> bool { |
977 | return SiblingPenalty[A].size() < SiblingPenalty[B].size(); |
978 | })]; |
979 | Info->Siblings.remove_if(P: [&](unsigned ID) { return ID == Info->ID; }); |
// The loop below computes the number of other VGPR to SGPR copies that
// contribute to the current copy's SALU chain. We assume that all the
// V2SCopies with the same source virtual register will be squashed to one
// by regalloc. We also take care of V2SCopies of different subregs of the
// same register.
985 | SmallSet<std::pair<Register, unsigned>, 4> SrcRegs; |
986 | for (auto J : Info->Siblings) { |
987 | auto InfoIt = V2SCopies.find(Key: J); |
988 | if (InfoIt != V2SCopies.end()) { |
989 | MachineInstr *SiblingCopy = InfoIt->second.Copy; |
990 | if (SiblingCopy->isImplicitDef()) |
991 | // the COPY has already been MoveToVALUed |
992 | continue; |
993 | |
994 | SrcRegs.insert(V: std::pair(SiblingCopy->getOperand(i: 1).getReg(), |
995 | SiblingCopy->getOperand(i: 1).getSubReg())); |
996 | } |
997 | } |
998 | Info->SiblingPenalty = SrcRegs.size(); |
999 | |
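// Keeping the copy scalar pays off only if the SALU chain it feeds is long
// enough to outweigh the readfirstlanes, the copies back to the VALU, and
// the sibling V2S copies feeding the same chain.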
1000 | unsigned Penalty = |
1001 | Info->NumSVCopies + Info->SiblingPenalty + Info->NumReadfirstlanes; |
1002 | unsigned Profit = Info->SChain.size(); |
1003 | Info->Score = Penalty > Profit ? 0 : Profit - Penalty; |
1004 | Info->NeedToBeConvertedToVALU = Info->Score < 3; |
1005 | return Info->NeedToBeConvertedToVALU; |
1006 | } |
1007 | |
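// Lower all recorded VGPR to SGPR copies: copies whose score says the SALU
// chain is not worth keeping are handed to moveToVALU() (together with any
// siblings that become unprofitable as a result), and the remaining copies
// are rewritten as v_readfirstlane_b32, one per 32-bit piece of the source.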
1008 | void SIFixSGPRCopies::lowerVGPR2SGPRCopies(MachineFunction &MF) { |
1009 | |
1010 | SmallVector<unsigned, 8> LoweringWorklist; |
1011 | for (auto &C : V2SCopies) { |
1012 | if (needToBeConvertedToVALU(Info: &C.second)) |
1013 | LoweringWorklist.push_back(Elt: C.second.ID); |
1014 | } |
1015 | |
1016 | // Store all the V2S copy instructions that need to be moved to VALU |
1017 | // in the Copies worklist. |
1018 | SIInstrWorklist Copies; |
1019 | |
1020 | while (!LoweringWorklist.empty()) { |
1021 | unsigned CurID = LoweringWorklist.pop_back_val(); |
1022 | auto CurInfoIt = V2SCopies.find(Key: CurID); |
1023 | if (CurInfoIt != V2SCopies.end()) { |
1024 | V2SCopyInfo C = CurInfoIt->second; |
1025 | LLVM_DEBUG(dbgs() << "Processing ...\n" ; C.dump()); |
1026 | for (auto S : C.Siblings) { |
1027 | auto SibInfoIt = V2SCopies.find(Key: S); |
1028 | if (SibInfoIt != V2SCopies.end()) { |
1029 | V2SCopyInfo &SI = SibInfoIt->second; |
1030 | LLVM_DEBUG(dbgs() << "Sibling:\n" ; SI.dump()); |
1031 | if (!SI.NeedToBeConvertedToVALU) { |
1032 | SI.SChain.set_subtract(C.SChain); |
1033 | if (needToBeConvertedToVALU(Info: &SI)) |
1034 | LoweringWorklist.push_back(Elt: SI.ID); |
1035 | } |
1036 | SI.Siblings.remove_if(P: [&](unsigned ID) { return ID == C.ID; }); |
1037 | } |
1038 | } |
1039 | LLVM_DEBUG(dbgs() << "V2S copy " << *C.Copy |
1040 | << " is being turned to VALU\n" ); |
1041 | // TODO: MapVector::erase is inefficient. Do bulk removal with remove_if |
1042 | // instead. |
1043 | V2SCopies.erase(Key: C.ID); |
1044 | Copies.insert(MI: C.Copy); |
1045 | } |
1046 | } |
1047 | |
1048 | TII->moveToVALU(Worklist&: Copies, MDT); |
1049 | Copies.clear(); |
1050 | |
1051 | // Now do actual lowering |
1052 | for (auto C : V2SCopies) { |
1053 | MachineInstr *MI = C.second.Copy; |
1054 | MachineBasicBlock *MBB = MI->getParent(); |
// The copies remaining in V2SCopies are turned into v_readfirstlane_b32;
// those that needed VALU conversion have already been removed from the map
// and from their siblings' lists above.
1057 | LLVM_DEBUG(dbgs() << "V2S copy " << *MI |
1058 | << " is being turned to v_readfirstlane_b32" |
1059 | << " Score: " << C.second.Score << "\n" ); |
1060 | Register DstReg = MI->getOperand(i: 0).getReg(); |
1061 | Register SrcReg = MI->getOperand(i: 1).getReg(); |
1062 | unsigned SubReg = MI->getOperand(i: 1).getSubReg(); |
1063 | const TargetRegisterClass *SrcRC = |
1064 | TRI->getRegClassForOperandReg(MRI: *MRI, MO: MI->getOperand(i: 1)); |
1065 | size_t SrcSize = TRI->getRegSizeInBits(RC: *SrcRC); |
1066 | if (SrcSize == 16) { |
1067 | // HACK to handle possible 16bit VGPR source |
1068 | auto MIB = BuildMI(BB&: *MBB, I: MI, MIMD: MI->getDebugLoc(), |
1069 | MCID: TII->get(Opcode: AMDGPU::V_READFIRSTLANE_B32), DestReg: DstReg); |
1070 | MIB.addReg(RegNo: SrcReg, flags: 0, SubReg: AMDGPU::NoSubRegister); |
1071 | } else if (SrcSize == 32) { |
1072 | auto MIB = BuildMI(BB&: *MBB, I: MI, MIMD: MI->getDebugLoc(), |
1073 | MCID: TII->get(Opcode: AMDGPU::V_READFIRSTLANE_B32), DestReg: DstReg); |
1074 | MIB.addReg(RegNo: SrcReg, flags: 0, SubReg); |
1075 | } else { |
1076 | auto Result = BuildMI(BB&: *MBB, I: MI, MIMD: MI->getDebugLoc(), |
1077 | MCID: TII->get(Opcode: AMDGPU::REG_SEQUENCE), DestReg: DstReg); |
1078 | int N = TRI->getRegSizeInBits(RC: *SrcRC) / 32; |
1079 | for (int i = 0; i < N; i++) { |
1080 | Register PartialSrc = TII->buildExtractSubReg( |
1081 | MI: Result, MRI&: *MRI, SuperReg: MI->getOperand(i: 1), SuperRC: SrcRC, |
1082 | SubIdx: TRI->getSubRegFromChannel(Channel: i), SubRC: &AMDGPU::VGPR_32RegClass); |
1083 | Register PartialDst = |
1084 | MRI->createVirtualRegister(RegClass: &AMDGPU::SReg_32RegClass); |
1085 | BuildMI(BB&: *MBB, I&: *Result, MIMD: Result->getDebugLoc(), |
1086 | MCID: TII->get(Opcode: AMDGPU::V_READFIRSTLANE_B32), DestReg: PartialDst) |
1087 | .addReg(RegNo: PartialSrc); |
1088 | Result.addReg(RegNo: PartialDst).addImm(Val: TRI->getSubRegFromChannel(Channel: i)); |
1089 | } |
1090 | } |
1091 | MI->eraseFromParent(); |
1092 | } |
1093 | } |
1094 | |
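// Lower copies to and from SCC. A copy from SCC is materialized as an
// S_CSELECT producing a -1/0 lane mask, and a copy to SCC becomes an S_AND
// of the source with EXEC whose implicit SCC definition supplies the value.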
1095 | void SIFixSGPRCopies::fixSCCCopies(MachineFunction &MF) { |
1096 | bool IsWave32 = MF.getSubtarget<GCNSubtarget>().isWave32(); |
1097 | for (MachineBasicBlock &MBB : MF) { |
1098 | for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end(); I != E; |
1099 | ++I) { |
1100 | MachineInstr &MI = *I; |
1101 | // May already have been lowered. |
1102 | if (!MI.isCopy()) |
1103 | continue; |
1104 | Register SrcReg = MI.getOperand(i: 1).getReg(); |
1105 | Register DstReg = MI.getOperand(i: 0).getReg(); |
1106 | if (SrcReg == AMDGPU::SCC) { |
1107 | Register SCCCopy = MRI->createVirtualRegister( |
1108 | RegClass: TRI->getRegClass(RCID: AMDGPU::SReg_1_XEXECRegClassID)); |
1109 | I = BuildMI(BB&: *MI.getParent(), I: std::next(x: MachineBasicBlock::iterator(MI)), |
1110 | MIMD: MI.getDebugLoc(), |
1111 | MCID: TII->get(Opcode: IsWave32 ? AMDGPU::S_CSELECT_B32 |
1112 | : AMDGPU::S_CSELECT_B64), |
1113 | DestReg: SCCCopy) |
1114 | .addImm(Val: -1) |
1115 | .addImm(Val: 0); |
1116 | I = BuildMI(BB&: *MI.getParent(), I: std::next(x: I), MIMD: I->getDebugLoc(), |
1117 | MCID: TII->get(Opcode: AMDGPU::COPY), DestReg: DstReg) |
1118 | .addReg(RegNo: SCCCopy); |
1119 | MI.eraseFromParent(); |
1120 | continue; |
1121 | } |
1122 | if (DstReg == AMDGPU::SCC) { |
1123 | unsigned Opcode = IsWave32 ? AMDGPU::S_AND_B32 : AMDGPU::S_AND_B64; |
1124 | Register Exec = IsWave32 ? AMDGPU::EXEC_LO : AMDGPU::EXEC; |
1125 | Register Tmp = MRI->createVirtualRegister(RegClass: TRI->getBoolRC()); |
1126 | I = BuildMI(BB&: *MI.getParent(), I: std::next(x: MachineBasicBlock::iterator(MI)), |
1127 | MIMD: MI.getDebugLoc(), MCID: TII->get(Opcode)) |
1128 | .addReg(RegNo: Tmp, flags: getDefRegState(B: true)) |
1129 | .addReg(RegNo: SrcReg) |
1130 | .addReg(RegNo: Exec); |
1131 | MI.eraseFromParent(); |
1132 | } |
1133 | } |
1134 | } |
1135 | } |
1136 | |