1//===- SIInstrInfo.cpp - SI Instruction Information ----------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9/// \file
10/// SI Implementation of TargetInstrInfo.
11//
12//===----------------------------------------------------------------------===//
13
14#include "SIInstrInfo.h"
15#include "AMDGPU.h"
16#include "AMDGPUInstrInfo.h"
17#include "AMDGPULaneMaskUtils.h"
18#include "GCNHazardRecognizer.h"
19#include "GCNSubtarget.h"
20#include "SIMachineFunctionInfo.h"
21#include "Utils/AMDGPUBaseInfo.h"
22#include "llvm/ADT/STLExtras.h"
23#include "llvm/Analysis/ValueTracking.h"
24#include "llvm/CodeGen/GlobalISel/GenericMachineInstrs.h"
25#include "llvm/CodeGen/LiveIntervals.h"
26#include "llvm/CodeGen/LiveVariables.h"
27#include "llvm/CodeGen/MachineDominators.h"
28#include "llvm/CodeGen/MachineFrameInfo.h"
29#include "llvm/CodeGen/MachineScheduler.h"
30#include "llvm/CodeGen/RegisterScavenging.h"
31#include "llvm/CodeGen/ScheduleDAG.h"
32#include "llvm/IR/DiagnosticInfo.h"
33#include "llvm/IR/IntrinsicsAMDGPU.h"
34#include "llvm/MC/MCContext.h"
35#include "llvm/Support/CommandLine.h"
36#include "llvm/Target/TargetMachine.h"
37
38using namespace llvm;
39
40#define DEBUG_TYPE "si-instr-info"
41
42#define GET_INSTRINFO_CTOR_DTOR
43#include "AMDGPUGenInstrInfo.inc"
44
45namespace llvm::AMDGPU {
46#define GET_D16ImageDimIntrinsics_IMPL
47#define GET_ImageDimIntrinsicTable_IMPL
48#define GET_RsrcIntrinsics_IMPL
49#include "AMDGPUGenSearchableTables.inc"
50} // namespace llvm::AMDGPU
51
// Must be at least 4 to be able to branch over the minimum unconditional
// branch code. This exists only to make it possible to write reasonably small
// tests for long branches.
55static cl::opt<unsigned>
56BranchOffsetBits("amdgpu-s-branch-bits", cl::ReallyHidden, cl::init(Val: 16),
57 cl::desc("Restrict range of branch instructions (DEBUG)"));
58
59static cl::opt<bool> Fix16BitCopies(
60 "amdgpu-fix-16-bit-physreg-copies",
61 cl::desc("Fix copies between 32 and 16 bit registers by extending to 32 bit"),
62 cl::init(Val: true),
63 cl::ReallyHidden);
64
65SIInstrInfo::SIInstrInfo(const GCNSubtarget &ST)
66 : AMDGPUGenInstrInfo(ST, RI, AMDGPU::ADJCALLSTACKUP,
67 AMDGPU::ADJCALLSTACKDOWN),
68 RI(ST), ST(ST) {
69 SchedModel.init(TSInfo: &ST);
70}
71
72//===----------------------------------------------------------------------===//
73// TargetInstrInfo callbacks
74//===----------------------------------------------------------------------===//
75
76static unsigned getNumOperandsNoGlue(SDNode *Node) {
77 unsigned N = Node->getNumOperands();
78 while (N && Node->getOperand(Num: N - 1).getValueType() == MVT::Glue)
79 --N;
80 return N;
81}
82
/// Returns true if both nodes have the same value for the given
/// operand \p OpName, or if both nodes do not have this operand.
85static bool nodesHaveSameOperandValue(SDNode *N0, SDNode *N1,
86 AMDGPU::OpName OpName) {
87 unsigned Opc0 = N0->getMachineOpcode();
88 unsigned Opc1 = N1->getMachineOpcode();
89
90 int Op0Idx = AMDGPU::getNamedOperandIdx(Opcode: Opc0, Name: OpName);
91 int Op1Idx = AMDGPU::getNamedOperandIdx(Opcode: Opc1, Name: OpName);
92
93 if (Op0Idx == -1 && Op1Idx == -1)
94 return true;
95
96
97 if ((Op0Idx == -1 && Op1Idx != -1) ||
98 (Op1Idx == -1 && Op0Idx != -1))
99 return false;
100
101 // getNamedOperandIdx returns the index for the MachineInstr's operands,
102 // which includes the result as the first operand. We are indexing into the
103 // MachineSDNode's operands, so we need to skip the result operand to get
104 // the real index.
105 --Op0Idx;
106 --Op1Idx;
107
108 return N0->getOperand(Num: Op0Idx) == N1->getOperand(Num: Op1Idx);
109}
110
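// Returns true for instruction classes that are trivially safe to
// rematerialize: plain VOP1/VOP2/VOP3/SDWA/SALU ALU ops, and SMRD loads that
// only read invariant memory.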
111static bool canRemat(const MachineInstr &MI) {
112
113 if (SIInstrInfo::isVOP1(MI) || SIInstrInfo::isVOP2(MI) ||
114 SIInstrInfo::isVOP3(MI) || SIInstrInfo::isSDWA(MI) ||
115 SIInstrInfo::isSALU(MI))
116 return true;
117
118 if (SIInstrInfo::isSMRD(MI)) {
119 return !MI.memoperands_empty() &&
120 llvm::all_of(Range: MI.memoperands(), P: [](const MachineMemOperand *MMO) {
121 return MMO->isLoad() && MMO->isInvariant();
122 });
123 }
124
125 return false;
126}
127
128bool SIInstrInfo::isReMaterializableImpl(
129 const MachineInstr &MI) const {
130
131 if (canRemat(MI)) {
    // Normally a VALU use of exec would block rematerialization, but an
    // implicit exec read is fine here since every VALU has one. We want all
    // of the generic logic except for that check.

    // Another potential implicit use is the mode register. The core RA logic
    // will not attempt rematerialization if the mode is set anywhere in the
    // function; otherwise it is safe, since the mode is never changed.

    // This differs from the generic method, which disallows rematerialization
    // when there are virtual register uses. We allow such uses, which is why
    // this method also covers SOP instructions.
143 if (!MI.hasImplicitDef() &&
144 MI.getNumImplicitOperands() == MI.getDesc().implicit_uses().size() &&
145 !MI.mayRaiseFPException())
146 return true;
147 }
148
149 return TargetInstrInfo::isReMaterializableImpl(MI);
150}
151
152// Returns true if the scalar result of a VALU instruction depends on exec.
153bool SIInstrInfo::resultDependsOnExec(const MachineInstr &MI) const {
154 // Ignore comparisons which are only used masked with exec.
155 // This allows some hoisting/sinking of VALU comparisons.
156 if (MI.isCompare()) {
157 const MachineOperand *Dst = getNamedOperand(MI, OperandName: AMDGPU::OpName::sdst);
158 if (!Dst)
159 return true;
160
161 Register DstReg = Dst->getReg();
162 if (!DstReg.isVirtual())
163 return true;
164
165 const MachineRegisterInfo &MRI = MI.getMF()->getRegInfo();
166 for (MachineInstr &Use : MRI.use_nodbg_instructions(Reg: DstReg)) {
167 switch (Use.getOpcode()) {
168 case AMDGPU::S_AND_SAVEEXEC_B32:
169 case AMDGPU::S_AND_SAVEEXEC_B64:
170 break;
171 case AMDGPU::S_AND_B32:
172 case AMDGPU::S_AND_B64:
173 if (!Use.readsRegister(Reg: AMDGPU::EXEC, /*TRI=*/nullptr))
174 return true;
175 break;
176 default:
177 return true;
178 }
179 }
180 return false;
181 }
182
183 switch (MI.getOpcode()) {
184 default:
185 break;
186 case AMDGPU::V_READFIRSTLANE_B32:
187 return true;
188 }
189
190 return false;
191}
192
193bool SIInstrInfo::isIgnorableUse(const MachineOperand &MO) const {
194 // Any implicit use of exec by VALU is not a real register read.
195 return MO.getReg() == AMDGPU::EXEC && MO.isImplicit() &&
196 isVALU(MI: *MO.getParent()) && !resultDependsOnExec(MI: *MO.getParent());
197}
198
199bool SIInstrInfo::isSafeToSink(MachineInstr &MI,
200 MachineBasicBlock *SuccToSinkTo,
201 MachineCycleInfo *CI) const {
  // Allow sinking if MI edits the lane mask (a divergent i1 in an SGPR).
203 if (MI.getOpcode() == AMDGPU::SI_IF_BREAK)
204 return true;
205
206 MachineRegisterInfo &MRI = MI.getMF()->getRegInfo();
  // Check if sinking MI would create a temporally divergent use.
208 for (auto Op : MI.uses()) {
209 if (Op.isReg() && Op.getReg().isVirtual() &&
210 RI.isSGPRClass(RC: MRI.getRegClass(Reg: Op.getReg()))) {
211 MachineInstr *SgprDef = MRI.getVRegDef(Reg: Op.getReg());
212
213 // SgprDef defined inside cycle
214 MachineCycle *FromCycle = CI->getCycle(Block: SgprDef->getParent());
215 if (FromCycle == nullptr)
216 continue;
217
218 MachineCycle *ToCycle = CI->getCycle(Block: SuccToSinkTo);
      // Check if there is a FromCycle that contains SgprDef's basic block but
      // does not contain SuccToSinkTo and also has a divergent exit condition.
221 while (FromCycle && !FromCycle->contains(C: ToCycle)) {
222 SmallVector<MachineBasicBlock *, 1> ExitingBlocks;
223 FromCycle->getExitingBlocks(TmpStorage&: ExitingBlocks);
224
225 // FromCycle has divergent exit condition.
226 for (MachineBasicBlock *ExitingBlock : ExitingBlocks) {
227 if (hasDivergentBranch(MBB: ExitingBlock))
228 return false;
229 }
230
231 FromCycle = FromCycle->getParentCycle();
232 }
233 }
234 }
235
236 return true;
237}
238
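// SelectionDAG hook: decide whether two load nodes use the same base address
// and, if so, return their immediate offsets. Handles DS, SMRD, and
// MUBUF/MTBUF encodings.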
239bool SIInstrInfo::areLoadsFromSameBasePtr(SDNode *Load0, SDNode *Load1,
240 int64_t &Offset0,
241 int64_t &Offset1) const {
242 if (!Load0->isMachineOpcode() || !Load1->isMachineOpcode())
243 return false;
244
245 unsigned Opc0 = Load0->getMachineOpcode();
246 unsigned Opc1 = Load1->getMachineOpcode();
247
248 // Make sure both are actually loads.
249 if (!get(Opcode: Opc0).mayLoad() || !get(Opcode: Opc1).mayLoad())
250 return false;
251
252 // A mayLoad instruction without a def is not a load. Likely a prefetch.
253 if (!get(Opcode: Opc0).getNumDefs() || !get(Opcode: Opc1).getNumDefs())
254 return false;
255
256 if (isDS(Opcode: Opc0) && isDS(Opcode: Opc1)) {
257
258 // FIXME: Handle this case:
259 if (getNumOperandsNoGlue(Node: Load0) != getNumOperandsNoGlue(Node: Load1))
260 return false;
261
262 // Check base reg.
263 if (Load0->getOperand(Num: 0) != Load1->getOperand(Num: 0))
264 return false;
265
    // Skip read2 / write2 variants for simplicity.
    // TODO: We should report true if the used offsets are adjacent (excluding
    // the st64 versions).
269 int Offset0Idx = AMDGPU::getNamedOperandIdx(Opcode: Opc0, Name: AMDGPU::OpName::offset);
270 int Offset1Idx = AMDGPU::getNamedOperandIdx(Opcode: Opc1, Name: AMDGPU::OpName::offset);
271 if (Offset0Idx == -1 || Offset1Idx == -1)
272 return false;
273
274 // XXX - be careful of dataless loads
275 // getNamedOperandIdx returns the index for MachineInstrs. Since they
276 // include the output in the operand list, but SDNodes don't, we need to
277 // subtract the index by one.
278 Offset0Idx -= get(Opcode: Opc0).NumDefs;
279 Offset1Idx -= get(Opcode: Opc1).NumDefs;
280 Offset0 = Load0->getConstantOperandVal(Num: Offset0Idx);
281 Offset1 = Load1->getConstantOperandVal(Num: Offset1Idx);
282 return true;
283 }
284
285 if (isSMRD(Opcode: Opc0) && isSMRD(Opcode: Opc1)) {
286 // Skip time and cache invalidation instructions.
287 if (!AMDGPU::hasNamedOperand(Opcode: Opc0, NamedIdx: AMDGPU::OpName::sbase) ||
288 !AMDGPU::hasNamedOperand(Opcode: Opc1, NamedIdx: AMDGPU::OpName::sbase))
289 return false;
290
291 unsigned NumOps = getNumOperandsNoGlue(Node: Load0);
292 if (NumOps != getNumOperandsNoGlue(Node: Load1))
293 return false;
294
295 // Check base reg.
296 if (Load0->getOperand(Num: 0) != Load1->getOperand(Num: 0))
297 return false;
298
299 // Match register offsets, if both register and immediate offsets present.
300 assert(NumOps == 4 || NumOps == 5);
301 if (NumOps == 5 && Load0->getOperand(Num: 1) != Load1->getOperand(Num: 1))
302 return false;
303
304 const ConstantSDNode *Load0Offset =
305 dyn_cast<ConstantSDNode>(Val: Load0->getOperand(Num: NumOps - 3));
306 const ConstantSDNode *Load1Offset =
307 dyn_cast<ConstantSDNode>(Val: Load1->getOperand(Num: NumOps - 3));
308
309 if (!Load0Offset || !Load1Offset)
310 return false;
311
312 Offset0 = Load0Offset->getZExtValue();
313 Offset1 = Load1Offset->getZExtValue();
314 return true;
315 }
316
317 // MUBUF and MTBUF can access the same addresses.
318 if ((isMUBUF(Opcode: Opc0) || isMTBUF(Opcode: Opc0)) && (isMUBUF(Opcode: Opc1) || isMTBUF(Opcode: Opc1))) {
319
320 // MUBUF and MTBUF have vaddr at different indices.
321 if (!nodesHaveSameOperandValue(N0: Load0, N1: Load1, OpName: AMDGPU::OpName::soffset) ||
322 !nodesHaveSameOperandValue(N0: Load0, N1: Load1, OpName: AMDGPU::OpName::vaddr) ||
323 !nodesHaveSameOperandValue(N0: Load0, N1: Load1, OpName: AMDGPU::OpName::srsrc))
324 return false;
325
326 int OffIdx0 = AMDGPU::getNamedOperandIdx(Opcode: Opc0, Name: AMDGPU::OpName::offset);
327 int OffIdx1 = AMDGPU::getNamedOperandIdx(Opcode: Opc1, Name: AMDGPU::OpName::offset);
328
329 if (OffIdx0 == -1 || OffIdx1 == -1)
330 return false;
331
332 // getNamedOperandIdx returns the index for MachineInstrs. Since they
333 // include the output in the operand list, but SDNodes don't, we need to
334 // subtract the index by one.
335 OffIdx0 -= get(Opcode: Opc0).NumDefs;
336 OffIdx1 -= get(Opcode: Opc1).NumDefs;
337
338 SDValue Off0 = Load0->getOperand(Num: OffIdx0);
339 SDValue Off1 = Load1->getOperand(Num: OffIdx1);
340
341 // The offset might be a FrameIndexSDNode.
342 if (!isa<ConstantSDNode>(Val: Off0) || !isa<ConstantSDNode>(Val: Off1))
343 return false;
344
345 Offset0 = Off0->getAsZExtVal();
346 Offset1 = Off1->getAsZExtVal();
347 return true;
348 }
349
350 return false;
351}
352
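// The read2st64/write2st64 DS variants scale their offsets by 64 elements.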
353static bool isStride64(unsigned Opc) {
354 switch (Opc) {
355 case AMDGPU::DS_READ2ST64_B32:
356 case AMDGPU::DS_READ2ST64_B64:
357 case AMDGPU::DS_WRITE2ST64_B32:
358 case AMDGPU::DS_WRITE2ST64_B64:
359 return true;
360 default:
361 return false;
362 }
363}
364
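// Decompose a memory instruction into its base operands, byte offset, and
// access width so that passes such as load/store clustering can compare
// addresses.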
365bool SIInstrInfo::getMemOperandsWithOffsetWidth(
366 const MachineInstr &LdSt, SmallVectorImpl<const MachineOperand *> &BaseOps,
367 int64_t &Offset, bool &OffsetIsScalable, LocationSize &Width,
368 const TargetRegisterInfo *TRI) const {
369 if (!LdSt.mayLoadOrStore())
370 return false;
371
372 unsigned Opc = LdSt.getOpcode();
373 OffsetIsScalable = false;
374 const MachineOperand *BaseOp, *OffsetOp;
375 int DataOpIdx;
376
377 if (isDS(MI: LdSt)) {
378 BaseOp = getNamedOperand(MI: LdSt, OperandName: AMDGPU::OpName::addr);
379 OffsetOp = getNamedOperand(MI: LdSt, OperandName: AMDGPU::OpName::offset);
380 if (OffsetOp) {
381 // Normal, single offset LDS instruction.
382 if (!BaseOp) {
383 // DS_CONSUME/DS_APPEND use M0 for the base address.
384 // TODO: find the implicit use operand for M0 and use that as BaseOp?
385 return false;
386 }
387 BaseOps.push_back(Elt: BaseOp);
388 Offset = OffsetOp->getImm();
389 // Get appropriate operand, and compute width accordingly.
390 DataOpIdx = AMDGPU::getNamedOperandIdx(Opcode: Opc, Name: AMDGPU::OpName::vdst);
391 if (DataOpIdx == -1)
392 DataOpIdx = AMDGPU::getNamedOperandIdx(Opcode: Opc, Name: AMDGPU::OpName::data0);
393 if (Opc == AMDGPU::DS_ATOMIC_ASYNC_BARRIER_ARRIVE_B64)
394 Width = LocationSize::precise(Value: 64);
395 else
396 Width = LocationSize::precise(Value: getOpSize(MI: LdSt, OpNo: DataOpIdx));
397 } else {
398 // The 2 offset instructions use offset0 and offset1 instead. We can treat
399 // these as a load with a single offset if the 2 offsets are consecutive.
400 // We will use this for some partially aligned loads.
401 const MachineOperand *Offset0Op =
402 getNamedOperand(MI: LdSt, OperandName: AMDGPU::OpName::offset0);
403 const MachineOperand *Offset1Op =
404 getNamedOperand(MI: LdSt, OperandName: AMDGPU::OpName::offset1);
405
406 unsigned Offset0 = Offset0Op->getImm() & 0xff;
407 unsigned Offset1 = Offset1Op->getImm() & 0xff;
408 if (Offset0 + 1 != Offset1)
409 return false;
410
411 // Each of these offsets is in element sized units, so we need to convert
412 // to bytes of the individual reads.
413
414 unsigned EltSize;
415 if (LdSt.mayLoad())
416 EltSize = TRI->getRegSizeInBits(RC: *getOpRegClass(MI: LdSt, OpNo: 0)) / 16;
417 else {
418 assert(LdSt.mayStore());
419 int Data0Idx = AMDGPU::getNamedOperandIdx(Opcode: Opc, Name: AMDGPU::OpName::data0);
420 EltSize = TRI->getRegSizeInBits(RC: *getOpRegClass(MI: LdSt, OpNo: Data0Idx)) / 8;
421 }
422
423 if (isStride64(Opc))
424 EltSize *= 64;
425
426 BaseOps.push_back(Elt: BaseOp);
427 Offset = EltSize * Offset0;
428 // Get appropriate operand(s), and compute width accordingly.
429 DataOpIdx = AMDGPU::getNamedOperandIdx(Opcode: Opc, Name: AMDGPU::OpName::vdst);
430 if (DataOpIdx == -1) {
431 DataOpIdx = AMDGPU::getNamedOperandIdx(Opcode: Opc, Name: AMDGPU::OpName::data0);
432 Width = LocationSize::precise(Value: getOpSize(MI: LdSt, OpNo: DataOpIdx));
433 DataOpIdx = AMDGPU::getNamedOperandIdx(Opcode: Opc, Name: AMDGPU::OpName::data1);
434 Width = LocationSize::precise(
435 Value: Width.getValue() + TypeSize::getFixed(ExactSize: getOpSize(MI: LdSt, OpNo: DataOpIdx)));
436 } else {
437 Width = LocationSize::precise(Value: getOpSize(MI: LdSt, OpNo: DataOpIdx));
438 }
439 }
440 return true;
441 }
442
443 if (isMUBUF(MI: LdSt) || isMTBUF(MI: LdSt)) {
444 const MachineOperand *RSrc = getNamedOperand(MI: LdSt, OperandName: AMDGPU::OpName::srsrc);
445 if (!RSrc) // e.g. BUFFER_WBINVL1_VOL
446 return false;
447 BaseOps.push_back(Elt: RSrc);
448 BaseOp = getNamedOperand(MI: LdSt, OperandName: AMDGPU::OpName::vaddr);
449 if (BaseOp && !BaseOp->isFI())
450 BaseOps.push_back(Elt: BaseOp);
451 const MachineOperand *OffsetImm =
452 getNamedOperand(MI: LdSt, OperandName: AMDGPU::OpName::offset);
453 Offset = OffsetImm->getImm();
454 const MachineOperand *SOffset =
455 getNamedOperand(MI: LdSt, OperandName: AMDGPU::OpName::soffset);
456 if (SOffset) {
457 if (SOffset->isReg())
458 BaseOps.push_back(Elt: SOffset);
459 else
460 Offset += SOffset->getImm();
461 }
462 // Get appropriate operand, and compute width accordingly.
463 DataOpIdx = AMDGPU::getNamedOperandIdx(Opcode: Opc, Name: AMDGPU::OpName::vdst);
464 if (DataOpIdx == -1)
465 DataOpIdx = AMDGPU::getNamedOperandIdx(Opcode: Opc, Name: AMDGPU::OpName::vdata);
466 if (DataOpIdx == -1) // LDS DMA
467 return false;
468 Width = LocationSize::precise(Value: getOpSize(MI: LdSt, OpNo: DataOpIdx));
469 return true;
470 }
471
472 if (isImage(MI: LdSt)) {
473 auto RsrcOpName =
474 isMIMG(MI: LdSt) ? AMDGPU::OpName::srsrc : AMDGPU::OpName::rsrc;
475 int SRsrcIdx = AMDGPU::getNamedOperandIdx(Opcode: Opc, Name: RsrcOpName);
476 BaseOps.push_back(Elt: &LdSt.getOperand(i: SRsrcIdx));
477 int VAddr0Idx = AMDGPU::getNamedOperandIdx(Opcode: Opc, Name: AMDGPU::OpName::vaddr0);
478 if (VAddr0Idx >= 0) {
479 // GFX10 possible NSA encoding.
480 for (int I = VAddr0Idx; I < SRsrcIdx; ++I)
481 BaseOps.push_back(Elt: &LdSt.getOperand(i: I));
482 } else {
483 BaseOps.push_back(Elt: getNamedOperand(MI: LdSt, OperandName: AMDGPU::OpName::vaddr));
484 }
485 Offset = 0;
486 // Get appropriate operand, and compute width accordingly.
487 DataOpIdx = AMDGPU::getNamedOperandIdx(Opcode: Opc, Name: AMDGPU::OpName::vdata);
488 if (DataOpIdx == -1)
489 return false; // no return sampler
490 Width = LocationSize::precise(Value: getOpSize(MI: LdSt, OpNo: DataOpIdx));
491 return true;
492 }
493
494 if (isSMRD(MI: LdSt)) {
495 BaseOp = getNamedOperand(MI: LdSt, OperandName: AMDGPU::OpName::sbase);
496 if (!BaseOp) // e.g. S_MEMTIME
497 return false;
498 BaseOps.push_back(Elt: BaseOp);
499 OffsetOp = getNamedOperand(MI: LdSt, OperandName: AMDGPU::OpName::offset);
500 Offset = OffsetOp ? OffsetOp->getImm() : 0;
501 // Get appropriate operand, and compute width accordingly.
502 DataOpIdx = AMDGPU::getNamedOperandIdx(Opcode: Opc, Name: AMDGPU::OpName::sdst);
503 if (DataOpIdx == -1)
504 return false;
505 Width = LocationSize::precise(Value: getOpSize(MI: LdSt, OpNo: DataOpIdx));
506 return true;
507 }
508
509 if (isFLAT(MI: LdSt)) {
510 // Instructions have either vaddr or saddr or both or none.
511 BaseOp = getNamedOperand(MI: LdSt, OperandName: AMDGPU::OpName::vaddr);
512 if (BaseOp)
513 BaseOps.push_back(Elt: BaseOp);
514 BaseOp = getNamedOperand(MI: LdSt, OperandName: AMDGPU::OpName::saddr);
515 if (BaseOp)
516 BaseOps.push_back(Elt: BaseOp);
517 Offset = getNamedOperand(MI: LdSt, OperandName: AMDGPU::OpName::offset)->getImm();
518 // Get appropriate operand, and compute width accordingly.
519 DataOpIdx = AMDGPU::getNamedOperandIdx(Opcode: Opc, Name: AMDGPU::OpName::vdst);
520 if (DataOpIdx == -1)
521 DataOpIdx = AMDGPU::getNamedOperandIdx(Opcode: Opc, Name: AMDGPU::OpName::vdata);
522 if (DataOpIdx == -1) // LDS DMA
523 return false;
524 Width = LocationSize::precise(Value: getOpSize(MI: LdSt, OpNo: DataOpIdx));
525 return true;
526 }
527
528 return false;
529}
530
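// Returns true if the two memory ops are known to share a base address,
// either because their first base operands are identical or because they
// access the same underlying IR object.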
531static bool memOpsHaveSameBasePtr(const MachineInstr &MI1,
532 ArrayRef<const MachineOperand *> BaseOps1,
533 const MachineInstr &MI2,
534 ArrayRef<const MachineOperand *> BaseOps2) {
535 // Only examine the first "base" operand of each instruction, on the
536 // assumption that it represents the real base address of the memory access.
537 // Other operands are typically offsets or indices from this base address.
538 if (BaseOps1.front()->isIdenticalTo(Other: *BaseOps2.front()))
539 return true;
540
541 if (!MI1.hasOneMemOperand() || !MI2.hasOneMemOperand())
542 return false;
543
544 auto *MO1 = *MI1.memoperands_begin();
545 auto *MO2 = *MI2.memoperands_begin();
546 if (MO1->getAddrSpace() != MO2->getAddrSpace())
547 return false;
548
549 const auto *Base1 = MO1->getValue();
550 const auto *Base2 = MO2->getValue();
551 if (!Base1 || !Base2)
552 return false;
553 Base1 = getUnderlyingObject(V: Base1);
554 Base2 = getUnderlyingObject(V: Base2);
555
556 if (isa<UndefValue>(Val: Base1) || isa<UndefValue>(Val: Base2))
557 return false;
558
559 return Base1 == Base2;
560}
561
562bool SIInstrInfo::shouldClusterMemOps(ArrayRef<const MachineOperand *> BaseOps1,
563 int64_t Offset1, bool OffsetIsScalable1,
564 ArrayRef<const MachineOperand *> BaseOps2,
565 int64_t Offset2, bool OffsetIsScalable2,
566 unsigned ClusterSize,
567 unsigned NumBytes) const {
568 // If the mem ops (to be clustered) do not have the same base ptr, then they
569 // should not be clustered
570 unsigned MaxMemoryClusterDWords = DefaultMemoryClusterDWordsLimit;
571 if (!BaseOps1.empty() && !BaseOps2.empty()) {
572 const MachineInstr &FirstLdSt = *BaseOps1.front()->getParent();
573 const MachineInstr &SecondLdSt = *BaseOps2.front()->getParent();
574 if (!memOpsHaveSameBasePtr(MI1: FirstLdSt, BaseOps1, MI2: SecondLdSt, BaseOps2))
575 return false;
576
577 const SIMachineFunctionInfo *MFI =
578 FirstLdSt.getMF()->getInfo<SIMachineFunctionInfo>();
579 MaxMemoryClusterDWords = MFI->getMaxMemoryClusterDWords();
580 } else if (!BaseOps1.empty() || !BaseOps2.empty()) {
581 // If only one base op is empty, they do not have the same base ptr
582 return false;
583 }
584
  // To avoid register pressure, on average the number of DWORDs loaded
  // together by all clustered mem ops should not exceed
  // MaxMemoryClusterDWords. This is an empirical value based on certain
  // observations and performance-related experiments.
589 // The good thing about this heuristic is - it avoids clustering of too many
590 // sub-word loads, and also avoids clustering of wide loads. Below is the
591 // brief summary of how the heuristic behaves for various `LoadSize` when
592 // MaxMemoryClusterDWords is 8.
593 //
594 // (1) 1 <= LoadSize <= 4: cluster at max 8 mem ops
595 // (2) 5 <= LoadSize <= 8: cluster at max 4 mem ops
596 // (3) 9 <= LoadSize <= 12: cluster at max 2 mem ops
597 // (4) 13 <= LoadSize <= 16: cluster at max 2 mem ops
598 // (5) LoadSize >= 17: do not cluster
599 const unsigned LoadSize = NumBytes / ClusterSize;
600 const unsigned NumDWords = ((LoadSize + 3) / 4) * ClusterSize;
601 return NumDWords <= MaxMemoryClusterDWords;
602}
603
// FIXME: This behaves strangely. If, for example, you have 32 loads + stores,
// the first 16 loads will be interleaved with the stores, and the next 16 will
// be clustered as expected. It should really split into two batches of 16.
//
// Loads are clustered until this returns false, rather than trying to schedule
// groups of stores. This also means we have to decide whether loads from
// different address spaces should be clustered, and whether to cluster ones
// that might cause bank conflicts.
//
// This might be deprecated, so it might not be worth that much effort to fix.
614bool SIInstrInfo::shouldScheduleLoadsNear(SDNode *Load0, SDNode *Load1,
615 int64_t Offset0, int64_t Offset1,
616 unsigned NumLoads) const {
617 assert(Offset1 > Offset0 &&
618 "Second offset should be larger than first offset!");
619 // If we have less than 16 loads in a row, and the offsets are within 64
620 // bytes, then schedule together.
621
622 // A cacheline is 64 bytes (for global memory).
623 return (NumLoads <= 16 && (Offset1 - Offset0) < 64);
624}
625
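// Diagnose an unsupported copy (e.g. VGPR to SGPR) and emit SI_ILLEGAL_COPY so
// code generation can continue after reporting the error.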
626static void reportIllegalCopy(const SIInstrInfo *TII, MachineBasicBlock &MBB,
627 MachineBasicBlock::iterator MI,
628 const DebugLoc &DL, MCRegister DestReg,
629 MCRegister SrcReg, bool KillSrc,
630 const char *Msg = "illegal VGPR to SGPR copy") {
631 MachineFunction *MF = MBB.getParent();
632
633 LLVMContext &C = MF->getFunction().getContext();
634 C.diagnose(DI: DiagnosticInfoUnsupported(MF->getFunction(), Msg, DL, DS_Error));
635
636 BuildMI(BB&: MBB, I: MI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::SI_ILLEGAL_COPY), DestReg)
637 .addReg(RegNo: SrcReg, Flags: getKillRegState(B: KillSrc));
638}
639
640/// Handle copying from SGPR to AGPR, or from AGPR to AGPR on GFX908. It is not
641/// possible to have a direct copy in these cases on GFX908, so an intermediate
642/// VGPR copy is required.
643static void indirectCopyToAGPR(const SIInstrInfo &TII,
644 MachineBasicBlock &MBB,
645 MachineBasicBlock::iterator MI,
646 const DebugLoc &DL, MCRegister DestReg,
647 MCRegister SrcReg, bool KillSrc,
648 RegScavenger &RS, bool RegsOverlap,
649 Register ImpDefSuperReg = Register(),
650 Register ImpUseSuperReg = Register()) {
651 assert((TII.getSubtarget().hasMAIInsts() &&
652 !TII.getSubtarget().hasGFX90AInsts()) &&
653 "Expected GFX908 subtarget.");
654
655 assert((AMDGPU::SReg_32RegClass.contains(SrcReg) ||
656 AMDGPU::AGPR_32RegClass.contains(SrcReg)) &&
657 "Source register of the copy should be either an SGPR or an AGPR.");
658
659 assert(AMDGPU::AGPR_32RegClass.contains(DestReg) &&
660 "Destination register of the copy should be an AGPR.");
661
662 const SIRegisterInfo &RI = TII.getRegisterInfo();
663
  // First try to find the defining accvgpr_write to avoid temporary registers.
  // In the case of copies of overlapping AGPRs, we conservatively do not
  // reuse previous accvgpr_writes. Otherwise, we may incorrectly pick up
  // an accvgpr_write used for this same copy due to implicit-defs.
668 if (!RegsOverlap) {
669 for (auto Def = MI, E = MBB.begin(); Def != E; ) {
670 --Def;
671
672 if (!Def->modifiesRegister(Reg: SrcReg, TRI: &RI))
673 continue;
674
675 if (Def->getOpcode() != AMDGPU::V_ACCVGPR_WRITE_B32_e64 ||
676 Def->getOperand(i: 0).getReg() != SrcReg)
677 break;
678
679 MachineOperand &DefOp = Def->getOperand(i: 1);
680 assert(DefOp.isReg() || DefOp.isImm());
681
682 if (DefOp.isReg()) {
683 bool SafeToPropagate = true;
684 // Check that register source operand is not clobbered before MI.
685 // Immediate operands are always safe to propagate.
686 for (auto I = Def; I != MI && SafeToPropagate; ++I)
687 if (I->modifiesRegister(Reg: DefOp.getReg(), TRI: &RI))
688 SafeToPropagate = false;
689
690 if (!SafeToPropagate)
691 break;
692
693 for (auto I = Def; I != MI; ++I)
694 I->clearRegisterKills(Reg: DefOp.getReg(), RegInfo: &RI);
695 }
696
697 MachineInstrBuilder Builder =
698 BuildMI(BB&: MBB, I: MI, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::V_ACCVGPR_WRITE_B32_e64), DestReg)
699 .add(MO: DefOp);
700 if (ImpDefSuperReg)
701 Builder.addReg(RegNo: ImpDefSuperReg, Flags: RegState::Define | RegState::Implicit);
702
703 if (ImpUseSuperReg) {
704 Builder.addReg(RegNo: ImpUseSuperReg,
705 Flags: getKillRegState(B: KillSrc) | RegState::Implicit);
706 }
707
708 return;
709 }
710 }
711
712 RS.enterBasicBlockEnd(MBB);
713 RS.backward(I: std::next(x: MI));
714
715 // Ideally we want to have three registers for a long reg_sequence copy
716 // to hide 2 waitstates between v_mov_b32 and accvgpr_write.
717 unsigned MaxVGPRs = RI.getRegPressureLimit(RC: &AMDGPU::VGPR_32RegClass,
718 MF&: *MBB.getParent());
719
720 // Registers in the sequence are allocated contiguously so we can just
721 // use register number to pick one of three round-robin temps.
722 unsigned RegNo = (DestReg - AMDGPU::AGPR0) % 3;
723 Register Tmp =
724 MBB.getParent()->getInfo<SIMachineFunctionInfo>()->getVGPRForAGPRCopy();
725 assert(MBB.getParent()->getRegInfo().isReserved(Tmp) &&
726 "VGPR used for an intermediate copy should have been reserved.");
727
728 // Only loop through if there are any free registers left. We don't want to
729 // spill.
730 while (RegNo--) {
731 Register Tmp2 = RS.scavengeRegisterBackwards(RC: AMDGPU::VGPR_32RegClass, To: MI,
732 /* RestoreAfter */ false, SPAdj: 0,
733 /* AllowSpill */ false);
734 if (!Tmp2 || RI.getHWRegIndex(Reg: Tmp2) >= MaxVGPRs)
735 break;
736 Tmp = Tmp2;
737 RS.setRegUsed(Reg: Tmp);
738 }
739
740 // Insert copy to temporary VGPR.
741 unsigned TmpCopyOp = AMDGPU::V_MOV_B32_e32;
742 if (AMDGPU::AGPR_32RegClass.contains(Reg: SrcReg)) {
743 TmpCopyOp = AMDGPU::V_ACCVGPR_READ_B32_e64;
744 } else {
745 assert(AMDGPU::SReg_32RegClass.contains(SrcReg));
746 }
747
748 MachineInstrBuilder UseBuilder = BuildMI(BB&: MBB, I: MI, MIMD: DL, MCID: TII.get(Opcode: TmpCopyOp), DestReg: Tmp)
749 .addReg(RegNo: SrcReg, Flags: getKillRegState(B: KillSrc));
750 if (ImpUseSuperReg) {
751 UseBuilder.addReg(RegNo: ImpUseSuperReg,
752 Flags: getKillRegState(B: KillSrc) | RegState::Implicit);
753 }
754
755 MachineInstrBuilder DefBuilder
756 = BuildMI(BB&: MBB, I: MI, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::V_ACCVGPR_WRITE_B32_e64), DestReg)
757 .addReg(RegNo: Tmp, Flags: RegState::Kill);
758
759 if (ImpDefSuperReg)
760 DefBuilder.addReg(RegNo: ImpDefSuperReg, Flags: RegState::Define | RegState::Implicit);
761}
762
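// Lower a multi-dword SGPR-to-SGPR copy into S_MOV_B32/S_MOV_B64 pieces,
// combining aligned subregister pairs into 64-bit moves and walking in the
// direction that avoids clobbering an overlapping source.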
763static void expandSGPRCopy(const SIInstrInfo &TII, MachineBasicBlock &MBB,
764 MachineBasicBlock::iterator MI, const DebugLoc &DL,
765 MCRegister DestReg, MCRegister SrcReg, bool KillSrc,
766 const TargetRegisterClass *RC, bool Forward) {
767 const SIRegisterInfo &RI = TII.getRegisterInfo();
768 ArrayRef<int16_t> BaseIndices = RI.getRegSplitParts(RC, EltSize: 4);
769 MachineBasicBlock::iterator I = MI;
770 MachineInstr *FirstMI = nullptr, *LastMI = nullptr;
771
772 for (unsigned Idx = 0; Idx < BaseIndices.size(); ++Idx) {
773 int16_t SubIdx = BaseIndices[Idx];
774 Register DestSubReg = RI.getSubReg(Reg: DestReg, Idx: SubIdx);
775 Register SrcSubReg = RI.getSubReg(Reg: SrcReg, Idx: SubIdx);
776 assert(DestSubReg && SrcSubReg && "Failed to find subregs!");
777 unsigned Opcode = AMDGPU::S_MOV_B32;
778
779 // Is SGPR aligned? If so try to combine with next.
780 bool AlignedDest = ((DestSubReg - AMDGPU::SGPR0) % 2) == 0;
781 bool AlignedSrc = ((SrcSubReg - AMDGPU::SGPR0) % 2) == 0;
782 if (AlignedDest && AlignedSrc && (Idx + 1 < BaseIndices.size())) {
783 // Can use SGPR64 copy
784 unsigned Channel = RI.getChannelFromSubReg(SubReg: SubIdx);
785 SubIdx = RI.getSubRegFromChannel(Channel, NumRegs: 2);
786 DestSubReg = RI.getSubReg(Reg: DestReg, Idx: SubIdx);
787 SrcSubReg = RI.getSubReg(Reg: SrcReg, Idx: SubIdx);
788 assert(DestSubReg && SrcSubReg && "Failed to find subregs!");
789 Opcode = AMDGPU::S_MOV_B64;
790 Idx++;
791 }
792
793 LastMI = BuildMI(BB&: MBB, I, MIMD: DL, MCID: TII.get(Opcode), DestReg: DestSubReg)
794 .addReg(RegNo: SrcSubReg)
795 .addReg(RegNo: SrcReg, Flags: RegState::Implicit);
796
797 if (!FirstMI)
798 FirstMI = LastMI;
799
800 if (!Forward)
801 I--;
802 }
803
804 assert(FirstMI && LastMI);
805 if (!Forward)
806 std::swap(a&: FirstMI, b&: LastMI);
807
808 FirstMI->addOperand(
809 Op: MachineOperand::CreateReg(Reg: DestReg, isDef: true /*IsDef*/, isImp: true /*IsImp*/));
810
811 if (KillSrc)
812 LastMI->addRegisterKilled(IncomingReg: SrcReg, RegInfo: &RI);
813}
814
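// TargetInstrInfo hook: emit whatever instruction sequence is needed to copy
// SrcReg into DestReg, dispatching on register class (SGPR, VGPR, AGPR,
// 16-bit, and multi-dword cases).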
815void SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
816 MachineBasicBlock::iterator MI,
817 const DebugLoc &DL, Register DestReg,
818 Register SrcReg, bool KillSrc, bool RenamableDest,
819 bool RenamableSrc) const {
820 const TargetRegisterClass *RC = RI.getPhysRegBaseClass(Reg: DestReg);
821 unsigned Size = RI.getRegSizeInBits(RC: *RC);
822 const TargetRegisterClass *SrcRC = RI.getPhysRegBaseClass(Reg: SrcReg);
823 unsigned SrcSize = RI.getRegSizeInBits(RC: *SrcRC);
824
  // The rest of copyPhysReg assumes Src and Dst are the same size.
  // TODO-GFX11_16BIT: Once all true 16-bit instruction patterns are complete,
  // can we remove Fix16BitCopies and this code block?
828 if (Fix16BitCopies) {
829 if (((Size == 16) != (SrcSize == 16))) {
830 // Non-VGPR Src and Dst will later be expanded back to 32 bits.
831 assert(ST.useRealTrue16Insts());
832 Register &RegToFix = (Size == 32) ? DestReg : SrcReg;
833 MCRegister SubReg = RI.getSubReg(Reg: RegToFix, Idx: AMDGPU::lo16);
834 RegToFix = SubReg;
835
836 if (DestReg == SrcReg) {
837 // Identity copy. Insert empty bundle since ExpandPostRA expects an
838 // instruction here.
839 BuildMI(BB&: MBB, I: MI, MIMD: DL, MCID: get(Opcode: AMDGPU::BUNDLE));
840 return;
841 }
842 RC = RI.getPhysRegBaseClass(Reg: DestReg);
843 Size = RI.getRegSizeInBits(RC: *RC);
844 SrcRC = RI.getPhysRegBaseClass(Reg: SrcReg);
845 SrcSize = RI.getRegSizeInBits(RC: *SrcRC);
846 }
847 }
848
849 if (RC == &AMDGPU::VGPR_32RegClass) {
850 assert(AMDGPU::VGPR_32RegClass.contains(SrcReg) ||
851 AMDGPU::SReg_32RegClass.contains(SrcReg) ||
852 AMDGPU::AGPR_32RegClass.contains(SrcReg));
853 unsigned Opc = AMDGPU::AGPR_32RegClass.contains(Reg: SrcReg) ?
854 AMDGPU::V_ACCVGPR_READ_B32_e64 : AMDGPU::V_MOV_B32_e32;
855 BuildMI(BB&: MBB, I: MI, MIMD: DL, MCID: get(Opcode: Opc), DestReg)
856 .addReg(RegNo: SrcReg, Flags: getKillRegState(B: KillSrc));
857 return;
858 }
859
860 if (RC == &AMDGPU::SReg_32_XM0RegClass ||
861 RC == &AMDGPU::SReg_32RegClass) {
862 if (SrcReg == AMDGPU::SCC) {
863 BuildMI(BB&: MBB, I: MI, MIMD: DL, MCID: get(Opcode: AMDGPU::S_CSELECT_B32), DestReg)
864 .addImm(Val: 1)
865 .addImm(Val: 0);
866 return;
867 }
868
869 if (!AMDGPU::SReg_32RegClass.contains(Reg: SrcReg)) {
870 if (DestReg == AMDGPU::VCC_LO) {
871 // FIXME: Hack until VReg_1 removed.
872 assert(AMDGPU::VGPR_32RegClass.contains(SrcReg));
873 BuildMI(BB&: MBB, I: MI, MIMD: DL, MCID: get(Opcode: AMDGPU::V_CMP_NE_U32_e32))
874 .addImm(Val: 0)
875 .addReg(RegNo: SrcReg, Flags: getKillRegState(B: KillSrc));
876 return;
877 }
878
879 reportIllegalCopy(TII: this, MBB, MI, DL, DestReg, SrcReg, KillSrc);
880 return;
881 }
882
883 BuildMI(BB&: MBB, I: MI, MIMD: DL, MCID: get(Opcode: AMDGPU::S_MOV_B32), DestReg)
884 .addReg(RegNo: SrcReg, Flags: getKillRegState(B: KillSrc));
885 return;
886 }
887
888 if (RC == &AMDGPU::SReg_64RegClass) {
889 if (SrcReg == AMDGPU::SCC) {
890 BuildMI(BB&: MBB, I: MI, MIMD: DL, MCID: get(Opcode: AMDGPU::S_CSELECT_B64), DestReg)
891 .addImm(Val: 1)
892 .addImm(Val: 0);
893 return;
894 }
895
896 if (!AMDGPU::SReg_64_EncodableRegClass.contains(Reg: SrcReg)) {
897 if (DestReg == AMDGPU::VCC) {
898 // FIXME: Hack until VReg_1 removed.
899 assert(AMDGPU::VGPR_32RegClass.contains(SrcReg));
900 BuildMI(BB&: MBB, I: MI, MIMD: DL, MCID: get(Opcode: AMDGPU::V_CMP_NE_U32_e32))
901 .addImm(Val: 0)
902 .addReg(RegNo: SrcReg, Flags: getKillRegState(B: KillSrc));
903 return;
904 }
905
906 reportIllegalCopy(TII: this, MBB, MI, DL, DestReg, SrcReg, KillSrc);
907 return;
908 }
909
910 BuildMI(BB&: MBB, I: MI, MIMD: DL, MCID: get(Opcode: AMDGPU::S_MOV_B64), DestReg)
911 .addReg(RegNo: SrcReg, Flags: getKillRegState(B: KillSrc));
912 return;
913 }
914
915 if (DestReg == AMDGPU::SCC) {
916 // Copying 64-bit or 32-bit sources to SCC barely makes sense,
917 // but SelectionDAG emits such copies for i1 sources.
918 if (AMDGPU::SReg_64RegClass.contains(Reg: SrcReg)) {
919 // This copy can only be produced by patterns
920 // with explicit SCC, which are known to be enabled
921 // only for subtargets with S_CMP_LG_U64 present.
922 assert(ST.hasScalarCompareEq64());
923 BuildMI(BB&: MBB, I: MI, MIMD: DL, MCID: get(Opcode: AMDGPU::S_CMP_LG_U64))
924 .addReg(RegNo: SrcReg, Flags: getKillRegState(B: KillSrc))
925 .addImm(Val: 0);
926 } else {
927 assert(AMDGPU::SReg_32RegClass.contains(SrcReg));
928 BuildMI(BB&: MBB, I: MI, MIMD: DL, MCID: get(Opcode: AMDGPU::S_CMP_LG_U32))
929 .addReg(RegNo: SrcReg, Flags: getKillRegState(B: KillSrc))
930 .addImm(Val: 0);
931 }
932
933 return;
934 }
935
936 if (RC == &AMDGPU::AGPR_32RegClass) {
937 if (AMDGPU::VGPR_32RegClass.contains(Reg: SrcReg) ||
938 (ST.hasGFX90AInsts() && AMDGPU::SReg_32RegClass.contains(Reg: SrcReg))) {
939 BuildMI(BB&: MBB, I: MI, MIMD: DL, MCID: get(Opcode: AMDGPU::V_ACCVGPR_WRITE_B32_e64), DestReg)
940 .addReg(RegNo: SrcReg, Flags: getKillRegState(B: KillSrc));
941 return;
942 }
943
944 if (AMDGPU::AGPR_32RegClass.contains(Reg: SrcReg) && ST.hasGFX90AInsts()) {
945 BuildMI(BB&: MBB, I: MI, MIMD: DL, MCID: get(Opcode: AMDGPU::V_ACCVGPR_MOV_B32), DestReg)
946 .addReg(RegNo: SrcReg, Flags: getKillRegState(B: KillSrc));
947 return;
948 }
949
950 // FIXME: Pass should maintain scavenger to avoid scan through the block on
951 // every AGPR spill.
952 RegScavenger RS;
953 const bool Overlap = RI.regsOverlap(RegA: SrcReg, RegB: DestReg);
954 indirectCopyToAGPR(TII: *this, MBB, MI, DL, DestReg, SrcReg, KillSrc, RS, RegsOverlap: Overlap);
955 return;
956 }
957
958 if (Size == 16) {
959 assert(AMDGPU::VGPR_16RegClass.contains(SrcReg) ||
960 AMDGPU::SReg_LO16RegClass.contains(SrcReg) ||
961 AMDGPU::AGPR_LO16RegClass.contains(SrcReg));
962
963 bool IsSGPRDst = AMDGPU::SReg_LO16RegClass.contains(Reg: DestReg);
964 bool IsSGPRSrc = AMDGPU::SReg_LO16RegClass.contains(Reg: SrcReg);
965 bool IsAGPRDst = AMDGPU::AGPR_LO16RegClass.contains(Reg: DestReg);
966 bool IsAGPRSrc = AMDGPU::AGPR_LO16RegClass.contains(Reg: SrcReg);
967 bool DstLow = !AMDGPU::isHi16Reg(Reg: DestReg, MRI: RI);
968 bool SrcLow = !AMDGPU::isHi16Reg(Reg: SrcReg, MRI: RI);
969 MCRegister NewDestReg = RI.get32BitRegister(Reg: DestReg);
970 MCRegister NewSrcReg = RI.get32BitRegister(Reg: SrcReg);
971
972 if (IsSGPRDst) {
973 if (!IsSGPRSrc) {
974 reportIllegalCopy(TII: this, MBB, MI, DL, DestReg, SrcReg, KillSrc);
975 return;
976 }
977
978 BuildMI(BB&: MBB, I: MI, MIMD: DL, MCID: get(Opcode: AMDGPU::S_MOV_B32), DestReg: NewDestReg)
979 .addReg(RegNo: NewSrcReg, Flags: getKillRegState(B: KillSrc));
980 return;
981 }
982
983 if (IsAGPRDst || IsAGPRSrc) {
984 if (!DstLow || !SrcLow) {
985 reportIllegalCopy(TII: this, MBB, MI, DL, DestReg, SrcReg, KillSrc,
986 Msg: "Cannot use hi16 subreg with an AGPR!");
987 }
988
989 copyPhysReg(MBB, MI, DL, DestReg: NewDestReg, SrcReg: NewSrcReg, KillSrc);
990 return;
991 }
992
993 if (ST.useRealTrue16Insts()) {
994 if (IsSGPRSrc) {
995 assert(SrcLow);
996 SrcReg = NewSrcReg;
997 }
998 // Use the smaller instruction encoding if possible.
999 if (AMDGPU::VGPR_16_Lo128RegClass.contains(Reg: DestReg) &&
1000 (IsSGPRSrc || AMDGPU::VGPR_16_Lo128RegClass.contains(Reg: SrcReg))) {
1001 BuildMI(BB&: MBB, I: MI, MIMD: DL, MCID: get(Opcode: AMDGPU::V_MOV_B16_t16_e32), DestReg)
1002 .addReg(RegNo: SrcReg);
1003 } else {
1004 BuildMI(BB&: MBB, I: MI, MIMD: DL, MCID: get(Opcode: AMDGPU::V_MOV_B16_t16_e64), DestReg)
1005 .addImm(Val: 0) // src0_modifiers
1006 .addReg(RegNo: SrcReg)
1007 .addImm(Val: 0); // op_sel
1008 }
1009 return;
1010 }
1011
1012 if (IsSGPRSrc && !ST.hasSDWAScalar()) {
1013 if (!DstLow || !SrcLow) {
1014 reportIllegalCopy(TII: this, MBB, MI, DL, DestReg, SrcReg, KillSrc,
1015 Msg: "Cannot use hi16 subreg on VI!");
1016 }
1017
1018 BuildMI(BB&: MBB, I: MI, MIMD: DL, MCID: get(Opcode: AMDGPU::V_MOV_B32_e32), DestReg: NewDestReg)
1019 .addReg(RegNo: NewSrcReg, Flags: getKillRegState(B: KillSrc));
1020 return;
1021 }
1022
1023 auto MIB = BuildMI(BB&: MBB, I: MI, MIMD: DL, MCID: get(Opcode: AMDGPU::V_MOV_B32_sdwa), DestReg: NewDestReg)
1024 .addImm(Val: 0) // src0_modifiers
1025 .addReg(RegNo: NewSrcReg)
1026 .addImm(Val: 0) // clamp
1027 .addImm(Val: DstLow ? AMDGPU::SDWA::SdwaSel::WORD_0
1028 : AMDGPU::SDWA::SdwaSel::WORD_1)
1029 .addImm(Val: AMDGPU::SDWA::DstUnused::UNUSED_PRESERVE)
1030 .addImm(Val: SrcLow ? AMDGPU::SDWA::SdwaSel::WORD_0
1031 : AMDGPU::SDWA::SdwaSel::WORD_1)
1032 .addReg(RegNo: NewDestReg, Flags: RegState::Implicit | RegState::Undef);
1033 // First implicit operand is $exec.
1034 MIB->tieOperands(DefIdx: 0, UseIdx: MIB->getNumOperands() - 1);
1035 return;
1036 }
1037
1038 if (RC == RI.getVGPR64Class() && (SrcRC == RC || RI.isSGPRClass(RC: SrcRC))) {
1039 if (ST.hasMovB64()) {
1040 BuildMI(BB&: MBB, I: MI, MIMD: DL, MCID: get(Opcode: AMDGPU::V_MOV_B64_e32), DestReg)
1041 .addReg(RegNo: SrcReg, Flags: getKillRegState(B: KillSrc));
1042 return;
1043 }
1044 if (ST.hasPkMovB32()) {
1045 BuildMI(BB&: MBB, I: MI, MIMD: DL, MCID: get(Opcode: AMDGPU::V_PK_MOV_B32), DestReg)
1046 .addImm(Val: SISrcMods::OP_SEL_1)
1047 .addReg(RegNo: SrcReg)
1048 .addImm(Val: SISrcMods::OP_SEL_0 | SISrcMods::OP_SEL_1)
1049 .addReg(RegNo: SrcReg)
1050 .addImm(Val: 0) // op_sel_lo
1051 .addImm(Val: 0) // op_sel_hi
1052 .addImm(Val: 0) // neg_lo
1053 .addImm(Val: 0) // neg_hi
1054 .addImm(Val: 0) // clamp
1055 .addReg(RegNo: SrcReg, Flags: getKillRegState(B: KillSrc) | RegState::Implicit);
1056 return;
1057 }
1058 }
1059
1060 const bool Forward = RI.getHWRegIndex(Reg: DestReg) <= RI.getHWRegIndex(Reg: SrcReg);
1061 if (RI.isSGPRClass(RC)) {
1062 if (!RI.isSGPRClass(RC: SrcRC)) {
1063 reportIllegalCopy(TII: this, MBB, MI, DL, DestReg, SrcReg, KillSrc);
1064 return;
1065 }
1066 const bool CanKillSuperReg = KillSrc && !RI.regsOverlap(RegA: SrcReg, RegB: DestReg);
1067 expandSGPRCopy(TII: *this, MBB, MI, DL, DestReg, SrcReg, KillSrc: CanKillSuperReg, RC,
1068 Forward);
1069 return;
1070 }
1071
1072 unsigned EltSize = 4;
1073 unsigned Opcode = AMDGPU::V_MOV_B32_e32;
1074 if (RI.isAGPRClass(RC)) {
1075 if (ST.hasGFX90AInsts() && RI.isAGPRClass(RC: SrcRC))
1076 Opcode = AMDGPU::V_ACCVGPR_MOV_B32;
1077 else if (RI.hasVGPRs(RC: SrcRC) ||
1078 (ST.hasGFX90AInsts() && RI.isSGPRClass(RC: SrcRC)))
1079 Opcode = AMDGPU::V_ACCVGPR_WRITE_B32_e64;
1080 else
1081 Opcode = AMDGPU::INSTRUCTION_LIST_END;
1082 } else if (RI.hasVGPRs(RC) && RI.isAGPRClass(RC: SrcRC)) {
1083 Opcode = AMDGPU::V_ACCVGPR_READ_B32_e64;
1084 } else if ((Size % 64 == 0) && RI.hasVGPRs(RC) &&
1085 (RI.isProperlyAlignedRC(RC: *RC) &&
1086 (SrcRC == RC || RI.isSGPRClass(RC: SrcRC)))) {
1087 // TODO: In 96-bit case, could do a 64-bit mov and then a 32-bit mov.
1088 if (ST.hasMovB64()) {
1089 Opcode = AMDGPU::V_MOV_B64_e32;
1090 EltSize = 8;
1091 } else if (ST.hasPkMovB32()) {
1092 Opcode = AMDGPU::V_PK_MOV_B32;
1093 EltSize = 8;
1094 }
1095 }
1096
1097 // For the cases where we need an intermediate instruction/temporary register
1098 // (destination is an AGPR), we need a scavenger.
1099 //
1100 // FIXME: The pass should maintain this for us so we don't have to re-scan the
1101 // whole block for every handled copy.
1102 std::unique_ptr<RegScavenger> RS;
1103 if (Opcode == AMDGPU::INSTRUCTION_LIST_END)
1104 RS = std::make_unique<RegScavenger>();
1105
1106 ArrayRef<int16_t> SubIndices = RI.getRegSplitParts(RC, EltSize);
1107
1108 // If there is an overlap, we can't kill the super-register on the last
1109 // instruction, since it will also kill the components made live by this def.
1110 const bool Overlap = RI.regsOverlap(RegA: SrcReg, RegB: DestReg);
1111 const bool CanKillSuperReg = KillSrc && !Overlap;
1112
1113 for (unsigned Idx = 0; Idx < SubIndices.size(); ++Idx) {
1114 unsigned SubIdx;
1115 if (Forward)
1116 SubIdx = SubIndices[Idx];
1117 else
1118 SubIdx = SubIndices[SubIndices.size() - Idx - 1];
1119 Register DestSubReg = RI.getSubReg(Reg: DestReg, Idx: SubIdx);
1120 Register SrcSubReg = RI.getSubReg(Reg: SrcReg, Idx: SubIdx);
1121 assert(DestSubReg && SrcSubReg && "Failed to find subregs!");
1122
1123 bool IsFirstSubreg = Idx == 0;
1124 bool UseKill = CanKillSuperReg && Idx == SubIndices.size() - 1;
1125
1126 if (Opcode == AMDGPU::INSTRUCTION_LIST_END) {
1127 Register ImpDefSuper = IsFirstSubreg ? Register(DestReg) : Register();
1128 Register ImpUseSuper = SrcReg;
1129 indirectCopyToAGPR(TII: *this, MBB, MI, DL, DestReg: DestSubReg, SrcReg: SrcSubReg, KillSrc: UseKill,
1130 RS&: *RS, RegsOverlap: Overlap, ImpDefSuperReg: ImpDefSuper, ImpUseSuperReg: ImpUseSuper);
1131 } else if (Opcode == AMDGPU::V_PK_MOV_B32) {
1132 MachineInstrBuilder MIB =
1133 BuildMI(BB&: MBB, I: MI, MIMD: DL, MCID: get(Opcode: AMDGPU::V_PK_MOV_B32), DestReg: DestSubReg)
1134 .addImm(Val: SISrcMods::OP_SEL_1)
1135 .addReg(RegNo: SrcSubReg)
1136 .addImm(Val: SISrcMods::OP_SEL_0 | SISrcMods::OP_SEL_1)
1137 .addReg(RegNo: SrcSubReg)
1138 .addImm(Val: 0) // op_sel_lo
1139 .addImm(Val: 0) // op_sel_hi
1140 .addImm(Val: 0) // neg_lo
1141 .addImm(Val: 0) // neg_hi
1142 .addImm(Val: 0) // clamp
1143 .addReg(RegNo: SrcReg, Flags: getKillRegState(B: UseKill) | RegState::Implicit);
1144 if (IsFirstSubreg)
1145 MIB.addReg(RegNo: DestReg, Flags: RegState::Define | RegState::Implicit);
1146 } else {
1147 MachineInstrBuilder Builder =
1148 BuildMI(BB&: MBB, I: MI, MIMD: DL, MCID: get(Opcode), DestReg: DestSubReg).addReg(RegNo: SrcSubReg);
1149 if (IsFirstSubreg)
1150 Builder.addReg(RegNo: DestReg, Flags: RegState::Define | RegState::Implicit);
1151
1152 Builder.addReg(RegNo: SrcReg, Flags: getKillRegState(B: UseKill) | RegState::Implicit);
1153 }
1154 }
1155}
1156
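// Map an opcode to its commuted (_REV) counterpart or back again. Returns -1
// if the mapped opcode is not available on this subtarget, or the original
// opcode if no mapping exists.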
1157int SIInstrInfo::commuteOpcode(unsigned Opcode) const {
1158 int NewOpc;
1159
1160 // Try to map original to commuted opcode
1161 NewOpc = AMDGPU::getCommuteRev(Opcode);
1162 if (NewOpc != -1)
1163 // Check if the commuted (REV) opcode exists on the target.
1164 return pseudoToMCOpcode(Opcode: NewOpc) != -1 ? NewOpc : -1;
1165
1166 // Try to map commuted to original opcode
1167 NewOpc = AMDGPU::getCommuteOrig(Opcode);
1168 if (NewOpc != -1)
1169 // Check if the original (non-REV) opcode exists on the target.
1170 return pseudoToMCOpcode(Opcode: NewOpc) != -1 ? NewOpc : -1;
1171
1172 return Opcode;
1173}
1174
1175const TargetRegisterClass *
1176SIInstrInfo::getPreferredSelectRegClass(unsigned Size) const {
1177 return &AMDGPU::VGPR_32RegClass;
1178}
1179
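// Emit a V_CNDMASK_B32 selecting TrueReg or FalseReg into DstReg, first
// materializing the lane mask implied by Cond (a plain register, SCC, VCC, or
// EXEC based predicate).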
1180void SIInstrInfo::insertVectorSelect(MachineBasicBlock &MBB,
1181 MachineBasicBlock::iterator I,
1182 const DebugLoc &DL, Register DstReg,
1183 ArrayRef<MachineOperand> Cond,
1184 Register TrueReg,
1185 Register FalseReg) const {
1186 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
1187 const TargetRegisterClass *BoolXExecRC = RI.getWaveMaskRegClass();
1188 const AMDGPU::LaneMaskConstants &LMC = AMDGPU::LaneMaskConstants::get(ST);
1189 assert(MRI.getRegClass(DstReg) == &AMDGPU::VGPR_32RegClass &&
1190 "Not a VGPR32 reg");
1191
1192 if (Cond.size() == 1) {
1193 Register SReg = MRI.createVirtualRegister(RegClass: BoolXExecRC);
1194 BuildMI(BB&: MBB, I, MIMD: DL, MCID: get(Opcode: AMDGPU::COPY), DestReg: SReg)
1195 .add(MO: Cond[0]);
1196 BuildMI(BB&: MBB, I, MIMD: DL, MCID: get(Opcode: AMDGPU::V_CNDMASK_B32_e64), DestReg: DstReg)
1197 .addImm(Val: 0)
1198 .addReg(RegNo: FalseReg)
1199 .addImm(Val: 0)
1200 .addReg(RegNo: TrueReg)
1201 .addReg(RegNo: SReg);
1202 } else if (Cond.size() == 2) {
1203 assert(Cond[0].isImm() && "Cond[0] is not an immediate");
1204 switch (Cond[0].getImm()) {
1205 case SIInstrInfo::SCC_TRUE: {
1206 Register SReg = MRI.createVirtualRegister(RegClass: BoolXExecRC);
1207 BuildMI(BB&: MBB, I, MIMD: DL, MCID: get(Opcode: LMC.CSelectOpc), DestReg: SReg).addImm(Val: 1).addImm(Val: 0);
1208 BuildMI(BB&: MBB, I, MIMD: DL, MCID: get(Opcode: AMDGPU::V_CNDMASK_B32_e64), DestReg: DstReg)
1209 .addImm(Val: 0)
1210 .addReg(RegNo: FalseReg)
1211 .addImm(Val: 0)
1212 .addReg(RegNo: TrueReg)
1213 .addReg(RegNo: SReg);
1214 break;
1215 }
1216 case SIInstrInfo::SCC_FALSE: {
1217 Register SReg = MRI.createVirtualRegister(RegClass: BoolXExecRC);
1218 BuildMI(BB&: MBB, I, MIMD: DL, MCID: get(Opcode: LMC.CSelectOpc), DestReg: SReg).addImm(Val: 0).addImm(Val: 1);
1219 BuildMI(BB&: MBB, I, MIMD: DL, MCID: get(Opcode: AMDGPU::V_CNDMASK_B32_e64), DestReg: DstReg)
1220 .addImm(Val: 0)
1221 .addReg(RegNo: FalseReg)
1222 .addImm(Val: 0)
1223 .addReg(RegNo: TrueReg)
1224 .addReg(RegNo: SReg);
1225 break;
1226 }
1227 case SIInstrInfo::VCCNZ: {
1228 MachineOperand RegOp = Cond[1];
1229 RegOp.setImplicit(false);
1230 Register SReg = MRI.createVirtualRegister(RegClass: BoolXExecRC);
1231 BuildMI(BB&: MBB, I, MIMD: DL, MCID: get(Opcode: AMDGPU::COPY), DestReg: SReg)
1232 .add(MO: RegOp);
1233 BuildMI(BB&: MBB, I, MIMD: DL, MCID: get(Opcode: AMDGPU::V_CNDMASK_B32_e64), DestReg: DstReg)
1234 .addImm(Val: 0)
1235 .addReg(RegNo: FalseReg)
1236 .addImm(Val: 0)
1237 .addReg(RegNo: TrueReg)
1238 .addReg(RegNo: SReg);
1239 break;
1240 }
1241 case SIInstrInfo::VCCZ: {
1242 MachineOperand RegOp = Cond[1];
1243 RegOp.setImplicit(false);
1244 Register SReg = MRI.createVirtualRegister(RegClass: BoolXExecRC);
1245 BuildMI(BB&: MBB, I, MIMD: DL, MCID: get(Opcode: AMDGPU::COPY), DestReg: SReg)
1246 .add(MO: RegOp);
1247 BuildMI(BB&: MBB, I, MIMD: DL, MCID: get(Opcode: AMDGPU::V_CNDMASK_B32_e64), DestReg: DstReg)
1248 .addImm(Val: 0)
1249 .addReg(RegNo: TrueReg)
1250 .addImm(Val: 0)
1251 .addReg(RegNo: FalseReg)
1252 .addReg(RegNo: SReg);
1253 break;
1254 }
1255 case SIInstrInfo::EXECNZ: {
1256 Register SReg = MRI.createVirtualRegister(RegClass: BoolXExecRC);
1257 Register SReg2 = MRI.createVirtualRegister(RegClass: RI.getBoolRC());
1258 BuildMI(BB&: MBB, I, MIMD: DL, MCID: get(Opcode: LMC.OrSaveExecOpc), DestReg: SReg2).addImm(Val: 0);
1259 BuildMI(BB&: MBB, I, MIMD: DL, MCID: get(Opcode: LMC.CSelectOpc), DestReg: SReg).addImm(Val: 1).addImm(Val: 0);
1260 BuildMI(BB&: MBB, I, MIMD: DL, MCID: get(Opcode: AMDGPU::V_CNDMASK_B32_e64), DestReg: DstReg)
1261 .addImm(Val: 0)
1262 .addReg(RegNo: FalseReg)
1263 .addImm(Val: 0)
1264 .addReg(RegNo: TrueReg)
1265 .addReg(RegNo: SReg);
1266 break;
1267 }
1268 case SIInstrInfo::EXECZ: {
1269 Register SReg = MRI.createVirtualRegister(RegClass: BoolXExecRC);
1270 Register SReg2 = MRI.createVirtualRegister(RegClass: RI.getBoolRC());
1271 BuildMI(BB&: MBB, I, MIMD: DL, MCID: get(Opcode: LMC.OrSaveExecOpc), DestReg: SReg2).addImm(Val: 0);
1272 BuildMI(BB&: MBB, I, MIMD: DL, MCID: get(Opcode: LMC.CSelectOpc), DestReg: SReg).addImm(Val: 0).addImm(Val: 1);
1273 BuildMI(BB&: MBB, I, MIMD: DL, MCID: get(Opcode: AMDGPU::V_CNDMASK_B32_e64), DestReg: DstReg)
1274 .addImm(Val: 0)
1275 .addReg(RegNo: FalseReg)
1276 .addImm(Val: 0)
1277 .addReg(RegNo: TrueReg)
1278 .addReg(RegNo: SReg);
1279 llvm_unreachable("Unhandled branch predicate EXECZ");
1280 break;
1281 }
1282 default:
1283 llvm_unreachable("invalid branch predicate");
1284 }
1285 } else {
1286 llvm_unreachable("Can only handle Cond size 1 or 2");
1287 }
1288}
1289
1290Register SIInstrInfo::insertEQ(MachineBasicBlock *MBB,
1291 MachineBasicBlock::iterator I,
1292 const DebugLoc &DL,
1293 Register SrcReg, int Value) const {
1294 MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
1295 Register Reg = MRI.createVirtualRegister(RegClass: RI.getBoolRC());
1296 BuildMI(BB&: *MBB, I, MIMD: DL, MCID: get(Opcode: AMDGPU::V_CMP_EQ_I32_e64), DestReg: Reg)
1297 .addImm(Val: Value)
1298 .addReg(RegNo: SrcReg);
1299
1300 return Reg;
1301}
1302
1303Register SIInstrInfo::insertNE(MachineBasicBlock *MBB,
1304 MachineBasicBlock::iterator I,
1305 const DebugLoc &DL,
1306 Register SrcReg, int Value) const {
1307 MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
1308 Register Reg = MRI.createVirtualRegister(RegClass: RI.getBoolRC());
1309 BuildMI(BB&: *MBB, I, MIMD: DL, MCID: get(Opcode: AMDGPU::V_CMP_NE_I32_e64), DestReg: Reg)
1310 .addImm(Val: Value)
1311 .addReg(RegNo: SrcReg);
1312
1313 return Reg;
1314}
1315
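// If MI writes a known constant into Reg, return that value through ImmVal.
// Recognizes the move-immediate forms as well as bit-reverse and bitwise-not
// of an immediate.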
1316bool SIInstrInfo::getConstValDefinedInReg(const MachineInstr &MI,
1317 const Register Reg,
1318 int64_t &ImmVal) const {
1319 switch (MI.getOpcode()) {
1320 case AMDGPU::V_MOV_B32_e32:
1321 case AMDGPU::S_MOV_B32:
1322 case AMDGPU::S_MOVK_I32:
1323 case AMDGPU::S_MOV_B64:
1324 case AMDGPU::V_MOV_B64_e32:
1325 case AMDGPU::V_ACCVGPR_WRITE_B32_e64:
1326 case AMDGPU::AV_MOV_B32_IMM_PSEUDO:
1327 case AMDGPU::AV_MOV_B64_IMM_PSEUDO:
1328 case AMDGPU::S_MOV_B64_IMM_PSEUDO:
1329 case AMDGPU::V_MOV_B64_PSEUDO:
1330 case AMDGPU::V_MOV_B16_t16_e32: {
1331 const MachineOperand &Src0 = MI.getOperand(i: 1);
1332 if (Src0.isImm()) {
1333 ImmVal = Src0.getImm();
1334 return MI.getOperand(i: 0).getReg() == Reg;
1335 }
1336
1337 return false;
1338 }
1339 case AMDGPU::V_MOV_B16_t16_e64: {
1340 const MachineOperand &Src0 = MI.getOperand(i: 2);
1341 if (Src0.isImm() && !MI.getOperand(i: 1).getImm()) {
1342 ImmVal = Src0.getImm();
1343 return MI.getOperand(i: 0).getReg() == Reg;
1344 }
1345
1346 return false;
1347 }
1348 case AMDGPU::S_BREV_B32:
1349 case AMDGPU::V_BFREV_B32_e32:
1350 case AMDGPU::V_BFREV_B32_e64: {
1351 const MachineOperand &Src0 = MI.getOperand(i: 1);
1352 if (Src0.isImm()) {
1353 ImmVal = static_cast<int64_t>(reverseBits<int32_t>(Val: Src0.getImm()));
1354 return MI.getOperand(i: 0).getReg() == Reg;
1355 }
1356
1357 return false;
1358 }
1359 case AMDGPU::S_NOT_B32:
1360 case AMDGPU::V_NOT_B32_e32:
1361 case AMDGPU::V_NOT_B32_e64: {
1362 const MachineOperand &Src0 = MI.getOperand(i: 1);
1363 if (Src0.isImm()) {
1364 ImmVal = static_cast<int64_t>(~static_cast<int32_t>(Src0.getImm()));
1365 return MI.getOperand(i: 0).getReg() == Reg;
1366 }
1367
1368 return false;
1369 }
1370 default:
1371 return false;
1372 }
1373}
1374
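// Return the immediate for an operand that is either a literal or a virtual
// register defined by a move-immediate, extracting the requested subregister
// if needed.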
1375std::optional<int64_t>
1376SIInstrInfo::getImmOrMaterializedImm(MachineOperand &Op) const {
1377 if (Op.isImm())
1378 return Op.getImm();
1379
1380 if (!Op.isReg() || !Op.getReg().isVirtual())
1381 return std::nullopt;
1382 MachineRegisterInfo &MRI = Op.getParent()->getMF()->getRegInfo();
1383 const MachineInstr *Def = MRI.getVRegDef(Reg: Op.getReg());
1384 if (Def && Def->isMoveImmediate()) {
1385 const MachineOperand &ImmSrc = Def->getOperand(i: 1);
1386 if (ImmSrc.isImm())
1387 return extractSubregFromImm(ImmVal: ImmSrc.getImm(), SubRegIndex: Op.getSubReg());
1388 }
1389
1390 return std::nullopt;
1391}
1392
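// Choose the move opcode matching the destination register class. Returns
// COPY where a later pass must select the real instruction (e.g. AGPR
// destinations).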
1393unsigned SIInstrInfo::getMovOpcode(const TargetRegisterClass *DstRC) const {
1394
1395 if (RI.isAGPRClass(RC: DstRC))
1396 return AMDGPU::COPY;
1397 if (RI.getRegSizeInBits(RC: *DstRC) == 16) {
1398 // Assume hi bits are unneeded. Only _e64 true16 instructions are legal
1399 // before RA.
1400 return RI.isSGPRClass(RC: DstRC) ? AMDGPU::COPY : AMDGPU::V_MOV_B16_t16_e64;
1401 }
1402 if (RI.getRegSizeInBits(RC: *DstRC) == 32)
1403 return RI.isSGPRClass(RC: DstRC) ? AMDGPU::S_MOV_B32 : AMDGPU::V_MOV_B32_e32;
1404 if (RI.getRegSizeInBits(RC: *DstRC) == 64 && RI.isSGPRClass(RC: DstRC))
1405 return AMDGPU::S_MOV_B64;
1406 if (RI.getRegSizeInBits(RC: *DstRC) == 64 && !RI.isSGPRClass(RC: DstRC))
1407 return AMDGPU::V_MOV_B64_PSEUDO;
1408 return AMDGPU::COPY;
1409}
1410
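// Return the GPR-index-mode indirect read or write pseudo sized for a vector
// of VecSize bits.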
1411const MCInstrDesc &
1412SIInstrInfo::getIndirectGPRIDXPseudo(unsigned VecSize,
1413 bool IsIndirectSrc) const {
1414 if (IsIndirectSrc) {
1415 if (VecSize <= 32) // 4 bytes
1416 return get(Opcode: AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V1);
1417 if (VecSize <= 64) // 8 bytes
1418 return get(Opcode: AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V2);
1419 if (VecSize <= 96) // 12 bytes
1420 return get(Opcode: AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V3);
1421 if (VecSize <= 128) // 16 bytes
1422 return get(Opcode: AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V4);
1423 if (VecSize <= 160) // 20 bytes
1424 return get(Opcode: AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V5);
1425 if (VecSize <= 192) // 24 bytes
1426 return get(Opcode: AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V6);
1427 if (VecSize <= 224) // 28 bytes
1428 return get(Opcode: AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V7);
1429 if (VecSize <= 256) // 32 bytes
1430 return get(Opcode: AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V8);
1431 if (VecSize <= 288) // 36 bytes
1432 return get(Opcode: AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V9);
1433 if (VecSize <= 320) // 40 bytes
1434 return get(Opcode: AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V10);
1435 if (VecSize <= 352) // 44 bytes
1436 return get(Opcode: AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V11);
1437 if (VecSize <= 384) // 48 bytes
1438 return get(Opcode: AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V12);
1439 if (VecSize <= 512) // 64 bytes
1440 return get(Opcode: AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V16);
1441 if (VecSize <= 1024) // 128 bytes
1442 return get(Opcode: AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V32);
1443
1444 llvm_unreachable("unsupported size for IndirectRegReadGPRIDX pseudos");
1445 }
1446
1447 if (VecSize <= 32) // 4 bytes
1448 return get(Opcode: AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V1);
1449 if (VecSize <= 64) // 8 bytes
1450 return get(Opcode: AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V2);
1451 if (VecSize <= 96) // 12 bytes
1452 return get(Opcode: AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V3);
1453 if (VecSize <= 128) // 16 bytes
1454 return get(Opcode: AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V4);
1455 if (VecSize <= 160) // 20 bytes
1456 return get(Opcode: AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V5);
1457 if (VecSize <= 192) // 24 bytes
1458 return get(Opcode: AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V6);
1459 if (VecSize <= 224) // 28 bytes
1460 return get(Opcode: AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V7);
1461 if (VecSize <= 256) // 32 bytes
1462 return get(Opcode: AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V8);
1463 if (VecSize <= 288) // 36 bytes
1464 return get(Opcode: AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V9);
1465 if (VecSize <= 320) // 40 bytes
1466 return get(Opcode: AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V10);
1467 if (VecSize <= 352) // 44 bytes
1468 return get(Opcode: AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V11);
1469 if (VecSize <= 384) // 48 bytes
1470 return get(Opcode: AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V12);
1471 if (VecSize <= 512) // 64 bytes
1472 return get(Opcode: AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V16);
1473 if (VecSize <= 1024) // 128 bytes
1474 return get(Opcode: AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V32);
1475
1476 llvm_unreachable("unsupported size for IndirectRegWriteGPRIDX pseudos");
1477}
1478
1479static unsigned getIndirectVGPRWriteMovRelPseudoOpc(unsigned VecSize) {
1480 if (VecSize <= 32) // 4 bytes
1481 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V1;
1482 if (VecSize <= 64) // 8 bytes
1483 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V2;
1484 if (VecSize <= 96) // 12 bytes
1485 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V3;
1486 if (VecSize <= 128) // 16 bytes
1487 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V4;
1488 if (VecSize <= 160) // 20 bytes
1489 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V5;
1490 if (VecSize <= 192) // 24 bytes
1491 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V6;
1492 if (VecSize <= 224) // 28 bytes
1493 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V7;
1494 if (VecSize <= 256) // 32 bytes
1495 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V8;
1496 if (VecSize <= 288) // 36 bytes
1497 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V9;
1498 if (VecSize <= 320) // 40 bytes
1499 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V10;
1500 if (VecSize <= 352) // 44 bytes
1501 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V11;
1502 if (VecSize <= 384) // 48 bytes
1503 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V12;
1504 if (VecSize <= 512) // 64 bytes
1505 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V16;
1506 if (VecSize <= 1024) // 128 bytes
1507 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V32;
1508
1509 llvm_unreachable("unsupported size for IndirectRegWrite pseudos");
1510}
1511
1512static unsigned getIndirectSGPRWriteMovRelPseudo32(unsigned VecSize) {
1513 if (VecSize <= 32) // 4 bytes
1514 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V1;
1515 if (VecSize <= 64) // 8 bytes
1516 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V2;
1517 if (VecSize <= 96) // 12 bytes
1518 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V3;
1519 if (VecSize <= 128) // 16 bytes
1520 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V4;
1521 if (VecSize <= 160) // 20 bytes
1522 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V5;
1523 if (VecSize <= 192) // 24 bytes
1524 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V6;
1525 if (VecSize <= 224) // 28 bytes
1526 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V7;
1527 if (VecSize <= 256) // 32 bytes
1528 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V8;
1529 if (VecSize <= 288) // 36 bytes
1530 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V9;
1531 if (VecSize <= 320) // 40 bytes
1532 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V10;
1533 if (VecSize <= 352) // 44 bytes
1534 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V11;
1535 if (VecSize <= 384) // 48 bytes
1536 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V12;
1537 if (VecSize <= 512) // 64 bytes
1538 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V16;
1539 if (VecSize <= 1024) // 128 bytes
1540 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V32;
1541
1542 llvm_unreachable("unsupported size for IndirectRegWrite pseudos");
1543}
1544
1545static unsigned getIndirectSGPRWriteMovRelPseudo64(unsigned VecSize) {
1546 if (VecSize <= 64) // 8 bytes
1547 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V1;
1548 if (VecSize <= 128) // 16 bytes
1549 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V2;
1550 if (VecSize <= 256) // 32 bytes
1551 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V4;
1552 if (VecSize <= 512) // 64 bytes
1553 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V8;
1554 if (VecSize <= 1024) // 128 bytes
1555 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V16;
1556
1557 llvm_unreachable("unsupported size for IndirectRegWrite pseudos");
1558}
1559
1560const MCInstrDesc &
1561SIInstrInfo::getIndirectRegWriteMovRelPseudo(unsigned VecSize, unsigned EltSize,
1562 bool IsSGPR) const {
1563 if (IsSGPR) {
1564 switch (EltSize) {
1565 case 32:
1566 return get(Opcode: getIndirectSGPRWriteMovRelPseudo32(VecSize));
1567 case 64:
1568 return get(Opcode: getIndirectSGPRWriteMovRelPseudo64(VecSize));
1569 default:
1570 llvm_unreachable("invalid reg indexing elt size");
1571 }
1572 }
1573
1574 assert(EltSize == 32 && "invalid reg indexing elt size");
1575 return get(Opcode: getIndirectVGPRWriteMovRelPseudoOpc(VecSize));
1576}
1577
1578static unsigned getSGPRSpillSaveOpcode(unsigned Size) {
1579 switch (Size) {
1580 case 4:
1581 return AMDGPU::SI_SPILL_S32_SAVE;
1582 case 8:
1583 return AMDGPU::SI_SPILL_S64_SAVE;
1584 case 12:
1585 return AMDGPU::SI_SPILL_S96_SAVE;
1586 case 16:
1587 return AMDGPU::SI_SPILL_S128_SAVE;
1588 case 20:
1589 return AMDGPU::SI_SPILL_S160_SAVE;
1590 case 24:
1591 return AMDGPU::SI_SPILL_S192_SAVE;
1592 case 28:
1593 return AMDGPU::SI_SPILL_S224_SAVE;
1594 case 32:
1595 return AMDGPU::SI_SPILL_S256_SAVE;
1596 case 36:
1597 return AMDGPU::SI_SPILL_S288_SAVE;
1598 case 40:
1599 return AMDGPU::SI_SPILL_S320_SAVE;
1600 case 44:
1601 return AMDGPU::SI_SPILL_S352_SAVE;
1602 case 48:
1603 return AMDGPU::SI_SPILL_S384_SAVE;
1604 case 64:
1605 return AMDGPU::SI_SPILL_S512_SAVE;
1606 case 128:
1607 return AMDGPU::SI_SPILL_S1024_SAVE;
1608 default:
1609 llvm_unreachable("unknown register size");
1610 }
1611}
1612
1613static unsigned getVGPRSpillSaveOpcode(unsigned Size) {
1614 switch (Size) {
1615 case 2:
1616 return AMDGPU::SI_SPILL_V16_SAVE;
1617 case 4:
1618 return AMDGPU::SI_SPILL_V32_SAVE;
1619 case 8:
1620 return AMDGPU::SI_SPILL_V64_SAVE;
1621 case 12:
1622 return AMDGPU::SI_SPILL_V96_SAVE;
1623 case 16:
1624 return AMDGPU::SI_SPILL_V128_SAVE;
1625 case 20:
1626 return AMDGPU::SI_SPILL_V160_SAVE;
1627 case 24:
1628 return AMDGPU::SI_SPILL_V192_SAVE;
1629 case 28:
1630 return AMDGPU::SI_SPILL_V224_SAVE;
1631 case 32:
1632 return AMDGPU::SI_SPILL_V256_SAVE;
1633 case 36:
1634 return AMDGPU::SI_SPILL_V288_SAVE;
1635 case 40:
1636 return AMDGPU::SI_SPILL_V320_SAVE;
1637 case 44:
1638 return AMDGPU::SI_SPILL_V352_SAVE;
1639 case 48:
1640 return AMDGPU::SI_SPILL_V384_SAVE;
1641 case 64:
1642 return AMDGPU::SI_SPILL_V512_SAVE;
1643 case 128:
1644 return AMDGPU::SI_SPILL_V1024_SAVE;
1645 default:
1646 llvm_unreachable("unknown register size");
1647 }
1648}
1649
1650static unsigned getAVSpillSaveOpcode(unsigned Size) {
1651 switch (Size) {
1652 case 4:
1653 return AMDGPU::SI_SPILL_AV32_SAVE;
1654 case 8:
1655 return AMDGPU::SI_SPILL_AV64_SAVE;
1656 case 12:
1657 return AMDGPU::SI_SPILL_AV96_SAVE;
1658 case 16:
1659 return AMDGPU::SI_SPILL_AV128_SAVE;
1660 case 20:
1661 return AMDGPU::SI_SPILL_AV160_SAVE;
1662 case 24:
1663 return AMDGPU::SI_SPILL_AV192_SAVE;
1664 case 28:
1665 return AMDGPU::SI_SPILL_AV224_SAVE;
1666 case 32:
1667 return AMDGPU::SI_SPILL_AV256_SAVE;
1668 case 36:
1669 return AMDGPU::SI_SPILL_AV288_SAVE;
1670 case 40:
1671 return AMDGPU::SI_SPILL_AV320_SAVE;
1672 case 44:
1673 return AMDGPU::SI_SPILL_AV352_SAVE;
1674 case 48:
1675 return AMDGPU::SI_SPILL_AV384_SAVE;
1676 case 64:
1677 return AMDGPU::SI_SPILL_AV512_SAVE;
1678 case 128:
1679 return AMDGPU::SI_SPILL_AV1024_SAVE;
1680 default:
1681 llvm_unreachable("unknown register size");
1682 }
1683}
1684
1685static unsigned getWWMRegSpillSaveOpcode(unsigned Size,
1686 bool IsVectorSuperClass) {
1687 // Currently, only 32-bit WWM register spills are needed.
1688 if (Size != 4)
1689 llvm_unreachable("unknown wwm register spill size");
1690
1691 if (IsVectorSuperClass)
1692 return AMDGPU::SI_SPILL_WWM_AV32_SAVE;
1693
1694 return AMDGPU::SI_SPILL_WWM_V32_SAVE;
1695}
1696
1697unsigned SIInstrInfo::getVectorRegSpillSaveOpcode(
1698 Register Reg, const TargetRegisterClass *RC, unsigned Size,
1699 const SIMachineFunctionInfo &MFI) const {
1700 bool IsVectorSuperClass = RI.isVectorSuperClass(RC);
1701
1702 // Choose the right opcode if spilling a WWM register.
1703 if (MFI.checkFlag(Reg, Flag: AMDGPU::VirtRegFlag::WWM_REG))
1704 return getWWMRegSpillSaveOpcode(Size, IsVectorSuperClass);
1705
1706 // TODO: Check if AGPRs are available
1707 if (ST.hasMAIInsts())
1708 return getAVSpillSaveOpcode(Size);
1709
1710 return getVGPRSpillSaveOpcode(Size);
1711}
1712
1713void SIInstrInfo::storeRegToStackSlot(
1714 MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, Register SrcReg,
1715 bool isKill, int FrameIndex, const TargetRegisterClass *RC, Register VReg,
1716 MachineInstr::MIFlag Flags) const {
1717 MachineFunction *MF = MBB.getParent();
1718 SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
1719 MachineFrameInfo &FrameInfo = MF->getFrameInfo();
1720 const DebugLoc &DL = MBB.findDebugLoc(MBBI: MI);
1721
1722 MachinePointerInfo PtrInfo
1723 = MachinePointerInfo::getFixedStack(MF&: *MF, FI: FrameIndex);
1724 MachineMemOperand *MMO = MF->getMachineMemOperand(
1725 PtrInfo, F: MachineMemOperand::MOStore, Size: FrameInfo.getObjectSize(ObjectIdx: FrameIndex),
1726 BaseAlignment: FrameInfo.getObjectAlign(ObjectIdx: FrameIndex));
1727 unsigned SpillSize = RI.getSpillSize(RC: *RC);
1728
1729 MachineRegisterInfo &MRI = MF->getRegInfo();
1730 if (RI.isSGPRClass(RC)) {
1731 MFI->setHasSpilledSGPRs();
1732 assert(SrcReg != AMDGPU::M0 && "m0 should not be spilled");
1733 assert(SrcReg != AMDGPU::EXEC_LO && SrcReg != AMDGPU::EXEC_HI &&
1734 SrcReg != AMDGPU::EXEC && "exec should not be spilled");
1735
1736 // We are only allowed to create one new instruction when spilling
1737 // registers, so we need to use a pseudo instruction for spilling SGPRs.
1738 const MCInstrDesc &OpDesc = get(Opcode: getSGPRSpillSaveOpcode(Size: SpillSize));
1739
1740 // The SGPR spill/restore instructions only work on numbered SGPRs, so we
1741 // need to make sure we are using the correct register class.
1742 if (SrcReg.isVirtual() && SpillSize == 4) {
1743 MRI.constrainRegClass(Reg: SrcReg, RC: &AMDGPU::SReg_32_XM0_XEXECRegClass);
1744 }
1745
1746 BuildMI(BB&: MBB, I: MI, MIMD: DL, MCID: OpDesc)
1747 .addReg(RegNo: SrcReg, Flags: getKillRegState(B: isKill)) // data
1748 .addFrameIndex(Idx: FrameIndex) // addr
1749 .addMemOperand(MMO)
1750 .addReg(RegNo: MFI->getStackPtrOffsetReg(), Flags: RegState::Implicit);
1751
1752 if (RI.spillSGPRToVGPR())
1753 FrameInfo.setStackID(ObjectIdx: FrameIndex, ID: TargetStackID::SGPRSpill);
1754 return;
1755 }
1756
1757 unsigned Opcode =
1758 getVectorRegSpillSaveOpcode(Reg: VReg ? VReg : SrcReg, RC, Size: SpillSize, MFI: *MFI);
1759 MFI->setHasSpilledVGPRs();
1760
1761 BuildMI(BB&: MBB, I: MI, MIMD: DL, MCID: get(Opcode))
1762 .addReg(RegNo: SrcReg, Flags: getKillRegState(B: isKill)) // data
1763 .addFrameIndex(Idx: FrameIndex) // addr
1764 .addReg(RegNo: MFI->getStackPtrOffsetReg()) // scratch_offset
1765 .addImm(Val: 0) // offset
1766 .addMemOperand(MMO);
1767}
1768
1769static unsigned getSGPRSpillRestoreOpcode(unsigned Size) {
1770 switch (Size) {
1771 case 4:
1772 return AMDGPU::SI_SPILL_S32_RESTORE;
1773 case 8:
1774 return AMDGPU::SI_SPILL_S64_RESTORE;
1775 case 12:
1776 return AMDGPU::SI_SPILL_S96_RESTORE;
1777 case 16:
1778 return AMDGPU::SI_SPILL_S128_RESTORE;
1779 case 20:
1780 return AMDGPU::SI_SPILL_S160_RESTORE;
1781 case 24:
1782 return AMDGPU::SI_SPILL_S192_RESTORE;
1783 case 28:
1784 return AMDGPU::SI_SPILL_S224_RESTORE;
1785 case 32:
1786 return AMDGPU::SI_SPILL_S256_RESTORE;
1787 case 36:
1788 return AMDGPU::SI_SPILL_S288_RESTORE;
1789 case 40:
1790 return AMDGPU::SI_SPILL_S320_RESTORE;
1791 case 44:
1792 return AMDGPU::SI_SPILL_S352_RESTORE;
1793 case 48:
1794 return AMDGPU::SI_SPILL_S384_RESTORE;
1795 case 64:
1796 return AMDGPU::SI_SPILL_S512_RESTORE;
1797 case 128:
1798 return AMDGPU::SI_SPILL_S1024_RESTORE;
1799 default:
1800 llvm_unreachable("unknown register size");
1801 }
1802}
1803
1804static unsigned getVGPRSpillRestoreOpcode(unsigned Size) {
1805 switch (Size) {
1806 case 2:
1807 return AMDGPU::SI_SPILL_V16_RESTORE;
1808 case 4:
1809 return AMDGPU::SI_SPILL_V32_RESTORE;
1810 case 8:
1811 return AMDGPU::SI_SPILL_V64_RESTORE;
1812 case 12:
1813 return AMDGPU::SI_SPILL_V96_RESTORE;
1814 case 16:
1815 return AMDGPU::SI_SPILL_V128_RESTORE;
1816 case 20:
1817 return AMDGPU::SI_SPILL_V160_RESTORE;
1818 case 24:
1819 return AMDGPU::SI_SPILL_V192_RESTORE;
1820 case 28:
1821 return AMDGPU::SI_SPILL_V224_RESTORE;
1822 case 32:
1823 return AMDGPU::SI_SPILL_V256_RESTORE;
1824 case 36:
1825 return AMDGPU::SI_SPILL_V288_RESTORE;
1826 case 40:
1827 return AMDGPU::SI_SPILL_V320_RESTORE;
1828 case 44:
1829 return AMDGPU::SI_SPILL_V352_RESTORE;
1830 case 48:
1831 return AMDGPU::SI_SPILL_V384_RESTORE;
1832 case 64:
1833 return AMDGPU::SI_SPILL_V512_RESTORE;
1834 case 128:
1835 return AMDGPU::SI_SPILL_V1024_RESTORE;
1836 default:
1837 llvm_unreachable("unknown register size");
1838 }
1839}
1840
1841static unsigned getAVSpillRestoreOpcode(unsigned Size) {
1842 switch (Size) {
1843 case 4:
1844 return AMDGPU::SI_SPILL_AV32_RESTORE;
1845 case 8:
1846 return AMDGPU::SI_SPILL_AV64_RESTORE;
1847 case 12:
1848 return AMDGPU::SI_SPILL_AV96_RESTORE;
1849 case 16:
1850 return AMDGPU::SI_SPILL_AV128_RESTORE;
1851 case 20:
1852 return AMDGPU::SI_SPILL_AV160_RESTORE;
1853 case 24:
1854 return AMDGPU::SI_SPILL_AV192_RESTORE;
1855 case 28:
1856 return AMDGPU::SI_SPILL_AV224_RESTORE;
1857 case 32:
1858 return AMDGPU::SI_SPILL_AV256_RESTORE;
1859 case 36:
1860 return AMDGPU::SI_SPILL_AV288_RESTORE;
1861 case 40:
1862 return AMDGPU::SI_SPILL_AV320_RESTORE;
1863 case 44:
1864 return AMDGPU::SI_SPILL_AV352_RESTORE;
1865 case 48:
1866 return AMDGPU::SI_SPILL_AV384_RESTORE;
1867 case 64:
1868 return AMDGPU::SI_SPILL_AV512_RESTORE;
1869 case 128:
1870 return AMDGPU::SI_SPILL_AV1024_RESTORE;
1871 default:
1872 llvm_unreachable("unknown register size");
1873 }
1874}
1875
1876static unsigned getWWMRegSpillRestoreOpcode(unsigned Size,
1877 bool IsVectorSuperClass) {
1878 // Currently, only 32-bit WWM register spills are needed.
1879 if (Size != 4)
1880 llvm_unreachable("unknown wwm register spill size");
1881
1882 if (IsVectorSuperClass) // TODO: Always use this if there are AGPRs
1883 return AMDGPU::SI_SPILL_WWM_AV32_RESTORE;
1884
1885 return AMDGPU::SI_SPILL_WWM_V32_RESTORE;
1886}
1887
1888unsigned SIInstrInfo::getVectorRegSpillRestoreOpcode(
1889 Register Reg, const TargetRegisterClass *RC, unsigned Size,
1890 const SIMachineFunctionInfo &MFI) const {
1891 bool IsVectorSuperClass = RI.isVectorSuperClass(RC);
1892
1893 // Choose the right opcode if restoring a WWM register.
1894 if (MFI.checkFlag(Reg, Flag: AMDGPU::VirtRegFlag::WWM_REG))
1895 return getWWMRegSpillRestoreOpcode(Size, IsVectorSuperClass);
1896
1897 // TODO: Check if AGPRs are available
1898 if (ST.hasMAIInsts())
1899 return getAVSpillRestoreOpcode(Size);
1900
1901 assert(!RI.isAGPRClass(RC));
1902 return getVGPRSpillRestoreOpcode(Size);
1903}
1904
1905void SIInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB,
1906 MachineBasicBlock::iterator MI,
1907 Register DestReg, int FrameIndex,
1908 const TargetRegisterClass *RC,
1909 Register VReg, unsigned SubReg,
1910 MachineInstr::MIFlag Flags) const {
1911 MachineFunction *MF = MBB.getParent();
1912 SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
1913 MachineFrameInfo &FrameInfo = MF->getFrameInfo();
1914 const DebugLoc &DL = MBB.findDebugLoc(MBBI: MI);
1915 unsigned SpillSize = RI.getSpillSize(RC: *RC);
1916
1917 MachinePointerInfo PtrInfo
1918 = MachinePointerInfo::getFixedStack(MF&: *MF, FI: FrameIndex);
1919
1920 MachineMemOperand *MMO = MF->getMachineMemOperand(
1921 PtrInfo, F: MachineMemOperand::MOLoad, Size: FrameInfo.getObjectSize(ObjectIdx: FrameIndex),
1922 BaseAlignment: FrameInfo.getObjectAlign(ObjectIdx: FrameIndex));
1923
1924 if (RI.isSGPRClass(RC)) {
1925 MFI->setHasSpilledSGPRs();
1926 assert(DestReg != AMDGPU::M0 && "m0 should not be reloaded into");
1927 assert(DestReg != AMDGPU::EXEC_LO && DestReg != AMDGPU::EXEC_HI &&
1928 DestReg != AMDGPU::EXEC && "exec should not be spilled");
1929
1930 // FIXME: Maybe this should not include a memoperand because it will be
1931 // lowered to non-memory instructions.
1932 const MCInstrDesc &OpDesc = get(Opcode: getSGPRSpillRestoreOpcode(Size: SpillSize));
1933 if (DestReg.isVirtual() && SpillSize == 4) {
1934 MachineRegisterInfo &MRI = MF->getRegInfo();
1935 MRI.constrainRegClass(Reg: DestReg, RC: &AMDGPU::SReg_32_XM0_XEXECRegClass);
1936 }
1937
1938 if (RI.spillSGPRToVGPR())
1939 FrameInfo.setStackID(ObjectIdx: FrameIndex, ID: TargetStackID::SGPRSpill);
1940 BuildMI(BB&: MBB, I: MI, MIMD: DL, MCID: OpDesc, DestReg)
1941 .addFrameIndex(Idx: FrameIndex) // addr
1942 .addMemOperand(MMO)
1943 .addReg(RegNo: MFI->getStackPtrOffsetReg(), Flags: RegState::Implicit);
1944
1945 return;
1946 }
1947
1948 unsigned Opcode = getVectorRegSpillRestoreOpcode(Reg: VReg ? VReg : DestReg, RC,
1949 Size: SpillSize, MFI: *MFI);
1950 BuildMI(BB&: MBB, I: MI, MIMD: DL, MCID: get(Opcode), DestReg)
1951 .addFrameIndex(Idx: FrameIndex) // vaddr
1952 .addReg(RegNo: MFI->getStackPtrOffsetReg()) // scratch_offset
1953 .addImm(Val: 0) // offset
1954 .addMemOperand(MMO);
1955}
1956
1957void SIInstrInfo::insertNoop(MachineBasicBlock &MBB,
1958 MachineBasicBlock::iterator MI) const {
1959 insertNoops(MBB, MI, Quantity: 1);
1960}
1961
1962void SIInstrInfo::insertNoops(MachineBasicBlock &MBB,
1963 MachineBasicBlock::iterator MI,
1964 unsigned Quantity) const {
1965 DebugLoc DL = MBB.findDebugLoc(MBBI: MI);
1966 unsigned MaxSNopCount = 1u << ST.getSNopBits();
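 // S_NOP's immediate encodes (wait states - 1), so each S_NOP emitted below
 // covers up to MaxSNopCount wait states.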
1967 while (Quantity > 0) {
1968 unsigned Arg = std::min(a: Quantity, b: MaxSNopCount);
1969 Quantity -= Arg;
1970 BuildMI(BB&: MBB, I: MI, MIMD: DL, MCID: get(Opcode: AMDGPU::S_NOP)).addImm(Val: Arg - 1);
1971 }
1972}
1973
1974void SIInstrInfo::insertReturn(MachineBasicBlock &MBB) const {
1975 auto *MF = MBB.getParent();
1976 SIMachineFunctionInfo *Info = MF->getInfo<SIMachineFunctionInfo>();
1977
1978 assert(Info->isEntryFunction());
1979
1980 if (MBB.succ_empty()) {
1981 bool HasNoTerminator = MBB.getFirstTerminator() == MBB.end();
1982 if (HasNoTerminator) {
1983 if (Info->returnsVoid()) {
1984 BuildMI(BB&: MBB, I: MBB.end(), MIMD: DebugLoc(), MCID: get(Opcode: AMDGPU::S_ENDPGM)).addImm(Val: 0);
1985 } else {
1986 BuildMI(BB&: MBB, I: MBB.end(), MIMD: DebugLoc(), MCID: get(Opcode: AMDGPU::SI_RETURN_TO_EPILOG));
1987 }
1988 }
1989 }
1990}
1991
1992MachineBasicBlock *SIInstrInfo::insertSimulatedTrap(MachineRegisterInfo &MRI,
1993 MachineBasicBlock &MBB,
1994 MachineInstr &MI,
1995 const DebugLoc &DL) const {
1996 MachineFunction *MF = MBB.getParent();
1997 constexpr unsigned DoorbellIDMask = 0x3ff;
1998 constexpr unsigned ECQueueWaveAbort = 0x400;
1999
2000 MachineBasicBlock *TrapBB = &MBB;
2001 MachineBasicBlock *HaltLoopBB = MF->CreateMachineBasicBlock();
2002
2003 if (!MBB.succ_empty() || std::next(x: MI.getIterator()) != MBB.end()) {
2004 MBB.splitAt(SplitInst&: MI, /*UpdateLiveIns=*/false);
2005 TrapBB = MF->CreateMachineBasicBlock();
2006 BuildMI(BB&: MBB, I&: MI, MIMD: DL, MCID: get(Opcode: AMDGPU::S_CBRANCH_EXECNZ)).addMBB(MBB: TrapBB);
2007 MF->push_back(MBB: TrapBB);
2008 MBB.addSuccessor(Succ: TrapBB);
2009 }
2010 // Start with an `s_trap 2`; if we're in PRIV=1 and need the workaround, this
2011 // will be a nop.
2012 BuildMI(BB&: *TrapBB, I: TrapBB->end(), MIMD: DL, MCID: get(Opcode: AMDGPU::S_TRAP))
2013 .addImm(Val: static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap));
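 // Fetch the doorbell ID, stash m0 in ttmp2, then send an interrupt whose
 // payload (in m0) is the masked doorbell ID with the queue-wave-abort bit
 // set. Finally restore m0 and branch into the halt loop.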
2014 Register DoorbellReg = MRI.createVirtualRegister(RegClass: &AMDGPU::SReg_32RegClass);
2015 BuildMI(BB&: *TrapBB, I: TrapBB->end(), MIMD: DL, MCID: get(Opcode: AMDGPU::S_SENDMSG_RTN_B32),
2016 DestReg: DoorbellReg)
2017 .addImm(Val: AMDGPU::SendMsg::ID_RTN_GET_DOORBELL);
2018 BuildMI(BB&: *TrapBB, I: TrapBB->end(), MIMD: DL, MCID: get(Opcode: AMDGPU::S_MOV_B32), DestReg: AMDGPU::TTMP2)
2019 .addUse(RegNo: AMDGPU::M0);
2020 Register DoorbellRegMasked =
2021 MRI.createVirtualRegister(RegClass: &AMDGPU::SReg_32RegClass);
2022 BuildMI(BB&: *TrapBB, I: TrapBB->end(), MIMD: DL, MCID: get(Opcode: AMDGPU::S_AND_B32), DestReg: DoorbellRegMasked)
2023 .addUse(RegNo: DoorbellReg)
2024 .addImm(Val: DoorbellIDMask);
2025 Register SetWaveAbortBit =
2026 MRI.createVirtualRegister(RegClass: &AMDGPU::SReg_32RegClass);
2027 BuildMI(BB&: *TrapBB, I: TrapBB->end(), MIMD: DL, MCID: get(Opcode: AMDGPU::S_OR_B32), DestReg: SetWaveAbortBit)
2028 .addUse(RegNo: DoorbellRegMasked)
2029 .addImm(Val: ECQueueWaveAbort);
2030 BuildMI(BB&: *TrapBB, I: TrapBB->end(), MIMD: DL, MCID: get(Opcode: AMDGPU::S_MOV_B32), DestReg: AMDGPU::M0)
2031 .addUse(RegNo: SetWaveAbortBit);
2032 BuildMI(BB&: *TrapBB, I: TrapBB->end(), MIMD: DL, MCID: get(Opcode: AMDGPU::S_SENDMSG))
2033 .addImm(Val: AMDGPU::SendMsg::ID_INTERRUPT);
2034 BuildMI(BB&: *TrapBB, I: TrapBB->end(), MIMD: DL, MCID: get(Opcode: AMDGPU::S_MOV_B32), DestReg: AMDGPU::M0)
2035 .addUse(RegNo: AMDGPU::TTMP2);
2036 BuildMI(BB&: *TrapBB, I: TrapBB->end(), MIMD: DL, MCID: get(Opcode: AMDGPU::S_BRANCH)).addMBB(MBB: HaltLoopBB);
2037 TrapBB->addSuccessor(Succ: HaltLoopBB);
2038
2039 BuildMI(BB&: *HaltLoopBB, I: HaltLoopBB->end(), MIMD: DL, MCID: get(Opcode: AMDGPU::S_SETHALT)).addImm(Val: 5);
2040 BuildMI(BB&: *HaltLoopBB, I: HaltLoopBB->end(), MIMD: DL, MCID: get(Opcode: AMDGPU::S_BRANCH))
2041 .addMBB(MBB: HaltLoopBB);
2042 MF->push_back(MBB: HaltLoopBB);
2043 HaltLoopBB->addSuccessor(Succ: HaltLoopBB);
2044
2045 return MBB.getNextNode();
2046}
2047
2048unsigned SIInstrInfo::getNumWaitStates(const MachineInstr &MI) {
2049 switch (MI.getOpcode()) {
2050 default:
2051 if (MI.isMetaInstruction())
2052 return 0;
2053 return 1; // FIXME: Do wait states equal cycles?
2054
2055 case AMDGPU::S_NOP:
2056 return MI.getOperand(i: 0).getImm() + 1;
2057 // SI_RETURN_TO_EPILOG is a fallthrough to code outside of the function. The
2058 // hazard, even if one exists, won't really be visible. Should we handle it?
2059 }
2060}
2061
2062bool SIInstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
2063 MachineBasicBlock &MBB = *MI.getParent();
2064 DebugLoc DL = MBB.findDebugLoc(MBBI: MI);
2065 const AMDGPU::LaneMaskConstants &LMC = AMDGPU::LaneMaskConstants::get(ST);
2066 switch (MI.getOpcode()) {
2067 default: return TargetInstrInfo::expandPostRAPseudo(MI);
2068 case AMDGPU::S_MOV_B64_term:
2069 // This is only a terminator to get the correct spill code placement during
2070 // register allocation.
2071 MI.setDesc(get(Opcode: AMDGPU::S_MOV_B64));
2072 break;
2073
2074 case AMDGPU::S_MOV_B32_term:
2075 // This is only a terminator to get the correct spill code placement during
2076 // register allocation.
2077 MI.setDesc(get(Opcode: AMDGPU::S_MOV_B32));
2078 break;
2079
2080 case AMDGPU::S_XOR_B64_term:
2081 // This is only a terminator to get the correct spill code placement during
2082 // register allocation.
2083 MI.setDesc(get(Opcode: AMDGPU::S_XOR_B64));
2084 break;
2085
2086 case AMDGPU::S_XOR_B32_term:
2087 // This is only a terminator to get the correct spill code placement during
2088 // register allocation.
2089 MI.setDesc(get(Opcode: AMDGPU::S_XOR_B32));
2090 break;
2091 case AMDGPU::S_OR_B64_term:
2092 // This is only a terminator to get the correct spill code placement during
2093 // register allocation.
2094 MI.setDesc(get(Opcode: AMDGPU::S_OR_B64));
2095 break;
2096 case AMDGPU::S_OR_B32_term:
2097 // This is only a terminator to get the correct spill code placement during
2098 // register allocation.
2099 MI.setDesc(get(Opcode: AMDGPU::S_OR_B32));
2100 break;
2101
2102 case AMDGPU::S_ANDN2_B64_term:
2103 // This is only a terminator to get the correct spill code placement during
2104 // register allocation.
2105 MI.setDesc(get(Opcode: AMDGPU::S_ANDN2_B64));
2106 break;
2107
2108 case AMDGPU::S_ANDN2_B32_term:
2109 // This is only a terminator to get the correct spill code placement during
2110 // register allocation.
2111 MI.setDesc(get(Opcode: AMDGPU::S_ANDN2_B32));
2112 break;
2113
2114 case AMDGPU::S_AND_B64_term:
2115 // This is only a terminator to get the correct spill code placement during
2116 // register allocation.
2117 MI.setDesc(get(Opcode: AMDGPU::S_AND_B64));
2118 break;
2119
2120 case AMDGPU::S_AND_B32_term:
2121 // This is only a terminator to get the correct spill code placement during
2122 // register allocation.
2123 MI.setDesc(get(Opcode: AMDGPU::S_AND_B32));
2124 break;
2125
2126 case AMDGPU::S_AND_SAVEEXEC_B64_term:
2127 // This is only a terminator to get the correct spill code placement during
2128 // register allocation.
2129 MI.setDesc(get(Opcode: AMDGPU::S_AND_SAVEEXEC_B64));
2130 break;
2131
2132 case AMDGPU::S_AND_SAVEEXEC_B32_term:
2133 // This is only a terminator to get the correct spill code placement during
2134 // register allocation.
2135 MI.setDesc(get(Opcode: AMDGPU::S_AND_SAVEEXEC_B32));
2136 break;
2137
2138 case AMDGPU::SI_SPILL_S32_TO_VGPR:
2139 MI.setDesc(get(Opcode: AMDGPU::V_WRITELANE_B32));
2140 break;
2141
2142 case AMDGPU::SI_RESTORE_S32_FROM_VGPR:
2143 MI.setDesc(get(Opcode: AMDGPU::V_READLANE_B32));
2144 break;
2145 case AMDGPU::AV_MOV_B32_IMM_PSEUDO: {
2146 Register Dst = MI.getOperand(i: 0).getReg();
2147 bool IsAGPR = SIRegisterInfo::isAGPRClass(RC: RI.getPhysRegBaseClass(Reg: Dst));
2148 MI.setDesc(
2149 get(Opcode: IsAGPR ? AMDGPU::V_ACCVGPR_WRITE_B32_e64 : AMDGPU::V_MOV_B32_e32));
2150 break;
2151 }
2152 case AMDGPU::AV_MOV_B64_IMM_PSEUDO: {
2153 Register Dst = MI.getOperand(i: 0).getReg();
2154 if (SIRegisterInfo::isAGPRClass(RC: RI.getPhysRegBaseClass(Reg: Dst))) {
2155 int64_t Imm = MI.getOperand(i: 1).getImm();
2156
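 // Split the 64-bit immediate into two 32-bit AGPR writes; the implicit def
 // of the full 64-bit register keeps its liveness intact.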
2157 Register DstLo = RI.getSubReg(Reg: Dst, Idx: AMDGPU::sub0);
2158 Register DstHi = RI.getSubReg(Reg: Dst, Idx: AMDGPU::sub1);
2159 BuildMI(BB&: MBB, I&: MI, MIMD: DL, MCID: get(Opcode: AMDGPU::V_ACCVGPR_WRITE_B32_e64), DestReg: DstLo)
2160 .addImm(Val: SignExtend64<32>(x: Imm))
2161 .addReg(RegNo: Dst, Flags: RegState::Implicit | RegState::Define);
2162 BuildMI(BB&: MBB, I&: MI, MIMD: DL, MCID: get(Opcode: AMDGPU::V_ACCVGPR_WRITE_B32_e64), DestReg: DstHi)
2163 .addImm(Val: SignExtend64<32>(x: Imm >> 32))
2164 .addReg(RegNo: Dst, Flags: RegState::Implicit | RegState::Define);
2165 MI.eraseFromParent();
2166 break;
2167 }
2168
2169 [[fallthrough]];
2170 }
2171 case AMDGPU::V_MOV_B64_PSEUDO: {
2172 Register Dst = MI.getOperand(i: 0).getReg();
2173 Register DstLo = RI.getSubReg(Reg: Dst, Idx: AMDGPU::sub0);
2174 Register DstHi = RI.getSubReg(Reg: Dst, Idx: AMDGPU::sub1);
2175
2176 const MCInstrDesc &Mov64Desc = get(Opcode: AMDGPU::V_MOV_B64_e32);
2177 const TargetRegisterClass *Mov64RC = getRegClass(MCID: Mov64Desc, /*OpNum=*/0);
2178
2179 const MachineOperand &SrcOp = MI.getOperand(i: 1);
2180 // FIXME: Will this work for 64-bit floating point immediates?
2181 assert(!SrcOp.isFPImm());
2182 if (ST.hasMovB64() && Mov64RC->contains(Reg: Dst)) {
2183 MI.setDesc(Mov64Desc);
2184 if (SrcOp.isReg() || isInlineConstant(MI, OpIdx: 1) ||
2185 isUInt<32>(x: SrcOp.getImm()) || ST.has64BitLiterals())
2186 break;
2187 }
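 // Otherwise expand to a packed V_PK_MOV_B32 when that is legal, or to two
 // 32-bit moves of the low and high halves.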
2188 if (SrcOp.isImm()) {
2189 APInt Imm(64, SrcOp.getImm());
2190 APInt Lo(32, Imm.getLoBits(numBits: 32).getZExtValue());
2191 APInt Hi(32, Imm.getHiBits(numBits: 32).getZExtValue());
2192 const MCInstrDesc &PkMovDesc = get(Opcode: AMDGPU::V_PK_MOV_B32);
2193 const TargetRegisterClass *PkMovRC = getRegClass(MCID: PkMovDesc, /*OpNum=*/0);
2194
2195 if (ST.hasPkMovB32() && Lo == Hi && isInlineConstant(Imm: Lo) &&
2196 PkMovRC->contains(Reg: Dst)) {
2197 BuildMI(BB&: MBB, I&: MI, MIMD: DL, MCID: PkMovDesc, DestReg: Dst)
2198 .addImm(Val: SISrcMods::OP_SEL_1)
2199 .addImm(Val: Lo.getSExtValue())
2200 .addImm(Val: SISrcMods::OP_SEL_1)
2201 .addImm(Val: Lo.getSExtValue())
2202 .addImm(Val: 0) // op_sel_lo
2203 .addImm(Val: 0) // op_sel_hi
2204 .addImm(Val: 0) // neg_lo
2205 .addImm(Val: 0) // neg_hi
2206 .addImm(Val: 0); // clamp
2207 } else {
2208 BuildMI(BB&: MBB, I&: MI, MIMD: DL, MCID: get(Opcode: AMDGPU::V_MOV_B32_e32), DestReg: DstLo)
2209 .addImm(Val: Lo.getSExtValue())
2210 .addReg(RegNo: Dst, Flags: RegState::Implicit | RegState::Define);
2211 BuildMI(BB&: MBB, I&: MI, MIMD: DL, MCID: get(Opcode: AMDGPU::V_MOV_B32_e32), DestReg: DstHi)
2212 .addImm(Val: Hi.getSExtValue())
2213 .addReg(RegNo: Dst, Flags: RegState::Implicit | RegState::Define);
2214 }
2215 } else {
2216 assert(SrcOp.isReg());
2217 if (ST.hasPkMovB32() &&
2218 !RI.isAGPR(MRI: MBB.getParent()->getRegInfo(), Reg: SrcOp.getReg())) {
2219 BuildMI(BB&: MBB, I&: MI, MIMD: DL, MCID: get(Opcode: AMDGPU::V_PK_MOV_B32), DestReg: Dst)
2220 .addImm(Val: SISrcMods::OP_SEL_1) // src0_mod
2221 .addReg(RegNo: SrcOp.getReg())
2222 .addImm(Val: SISrcMods::OP_SEL_0 | SISrcMods::OP_SEL_1) // src1_mod
2223 .addReg(RegNo: SrcOp.getReg())
2224 .addImm(Val: 0) // op_sel_lo
2225 .addImm(Val: 0) // op_sel_hi
2226 .addImm(Val: 0) // neg_lo
2227 .addImm(Val: 0) // neg_hi
2228 .addImm(Val: 0); // clamp
2229 } else {
2230 BuildMI(BB&: MBB, I&: MI, MIMD: DL, MCID: get(Opcode: AMDGPU::V_MOV_B32_e32), DestReg: DstLo)
2231 .addReg(RegNo: RI.getSubReg(Reg: SrcOp.getReg(), Idx: AMDGPU::sub0))
2232 .addReg(RegNo: Dst, Flags: RegState::Implicit | RegState::Define);
2233 BuildMI(BB&: MBB, I&: MI, MIMD: DL, MCID: get(Opcode: AMDGPU::V_MOV_B32_e32), DestReg: DstHi)
2234 .addReg(RegNo: RI.getSubReg(Reg: SrcOp.getReg(), Idx: AMDGPU::sub1))
2235 .addReg(RegNo: Dst, Flags: RegState::Implicit | RegState::Define);
2236 }
2237 }
2238 MI.eraseFromParent();
2239 break;
2240 }
2241 case AMDGPU::V_MOV_B64_DPP_PSEUDO: {
2242 expandMovDPP64(MI);
2243 break;
2244 }
2245 case AMDGPU::S_MOV_B64_IMM_PSEUDO: {
2246 const MachineOperand &SrcOp = MI.getOperand(i: 1);
2247 assert(!SrcOp.isFPImm());
2248
2249 if (ST.has64BitLiterals()) {
2250 MI.setDesc(get(Opcode: AMDGPU::S_MOV_B64));
2251 break;
2252 }
2253
2254 APInt Imm(64, SrcOp.getImm());
2255 if (Imm.isIntN(N: 32) || isInlineConstant(Imm)) {
2256 MI.setDesc(get(Opcode: AMDGPU::S_MOV_B64));
2257 break;
2258 }
2259
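 // The literal does not fit a single S_MOV_B64, so materialize the low and
 // high halves with two S_MOV_B32s.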
2260 Register Dst = MI.getOperand(i: 0).getReg();
2261 Register DstLo = RI.getSubReg(Reg: Dst, Idx: AMDGPU::sub0);
2262 Register DstHi = RI.getSubReg(Reg: Dst, Idx: AMDGPU::sub1);
2263
2264 APInt Lo(32, Imm.getLoBits(numBits: 32).getZExtValue());
2265 APInt Hi(32, Imm.getHiBits(numBits: 32).getZExtValue());
2266 BuildMI(BB&: MBB, I&: MI, MIMD: DL, MCID: get(Opcode: AMDGPU::S_MOV_B32), DestReg: DstLo)
2267 .addImm(Val: Lo.getSExtValue())
2268 .addReg(RegNo: Dst, Flags: RegState::Implicit | RegState::Define);
2269 BuildMI(BB&: MBB, I&: MI, MIMD: DL, MCID: get(Opcode: AMDGPU::S_MOV_B32), DestReg: DstHi)
2270 .addImm(Val: Hi.getSExtValue())
2271 .addReg(RegNo: Dst, Flags: RegState::Implicit | RegState::Define);
2272 MI.eraseFromParent();
2273 break;
2274 }
2275 case AMDGPU::V_SET_INACTIVE_B32: {
2276 // Lower V_SET_INACTIVE_B32 to V_CNDMASK_B32.
2277 Register DstReg = MI.getOperand(i: 0).getReg();
2278 BuildMI(BB&: MBB, I&: MI, MIMD: DL, MCID: get(Opcode: AMDGPU::V_CNDMASK_B32_e64), DestReg: DstReg)
2279 .add(MO: MI.getOperand(i: 3))
2280 .add(MO: MI.getOperand(i: 4))
2281 .add(MO: MI.getOperand(i: 1))
2282 .add(MO: MI.getOperand(i: 2))
2283 .add(MO: MI.getOperand(i: 5));
2284 MI.eraseFromParent();
2285 break;
2286 }
2287 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V1:
2288 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V2:
2289 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V3:
2290 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V4:
2291 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V5:
2292 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V6:
2293 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V7:
2294 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V8:
2295 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V9:
2296 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V10:
2297 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V11:
2298 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V12:
2299 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V16:
2300 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V32:
2301 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V1:
2302 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V2:
2303 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V3:
2304 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V4:
2305 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V5:
2306 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V6:
2307 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V7:
2308 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V8:
2309 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V9:
2310 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V10:
2311 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V11:
2312 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V12:
2313 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V16:
2314 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V32:
2315 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V1:
2316 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V2:
2317 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V4:
2318 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V8:
2319 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V16: {
2320 const TargetRegisterClass *EltRC = getOpRegClass(MI, OpNo: 2);
2321
2322 unsigned Opc;
2323 if (RI.hasVGPRs(RC: EltRC)) {
2324 Opc = AMDGPU::V_MOVRELD_B32_e32;
2325 } else {
2326 Opc = RI.getRegSizeInBits(RC: *EltRC) == 64 ? AMDGPU::S_MOVRELD_B64
2327 : AMDGPU::S_MOVRELD_B32;
2328 }
2329
2330 const MCInstrDesc &OpDesc = get(Opcode: Opc);
2331 Register VecReg = MI.getOperand(i: 0).getReg();
2332 bool IsUndef = MI.getOperand(i: 1).isUndef();
2333 unsigned SubReg = MI.getOperand(i: 3).getImm();
2334 assert(VecReg == MI.getOperand(1).getReg());
2335
2336 MachineInstrBuilder MIB =
2337 BuildMI(BB&: MBB, I&: MI, MIMD: DL, MCID: OpDesc)
2338 .addReg(RegNo: RI.getSubReg(Reg: VecReg, Idx: SubReg), Flags: RegState::Undef)
2339 .add(MO: MI.getOperand(i: 2))
2340 .addReg(RegNo: VecReg, Flags: RegState::ImplicitDefine)
2341 .addReg(RegNo: VecReg, Flags: RegState::Implicit | getUndefRegState(B: IsUndef));
2342
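 // Tie the implicit def of the vector register to its implicit use so the
 // rest of the compiler treats this as a read-modify-write of the whole
 // vector.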
2343 const int ImpDefIdx =
2344 OpDesc.getNumOperands() + OpDesc.implicit_uses().size();
2345 const int ImpUseIdx = ImpDefIdx + 1;
2346 MIB->tieOperands(DefIdx: ImpDefIdx, UseIdx: ImpUseIdx);
2347 MI.eraseFromParent();
2348 break;
2349 }
2350 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V1:
2351 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V2:
2352 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V3:
2353 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V4:
2354 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V5:
2355 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V6:
2356 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V7:
2357 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V8:
2358 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V9:
2359 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V10:
2360 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V11:
2361 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V12:
2362 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V16:
2363 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V32: {
2364 assert(ST.useVGPRIndexMode());
2365 Register VecReg = MI.getOperand(i: 0).getReg();
2366 bool IsUndef = MI.getOperand(i: 1).isUndef();
2367 MachineOperand &Idx = MI.getOperand(i: 3);
2368 Register SubReg = MI.getOperand(i: 4).getImm();
2369
2370 MachineInstr *SetOn = BuildMI(BB&: MBB, I&: MI, MIMD: DL, MCID: get(Opcode: AMDGPU::S_SET_GPR_IDX_ON))
2371 .add(MO: Idx)
2372 .addImm(Val: AMDGPU::VGPRIndexMode::DST_ENABLE);
2373 SetOn->getOperand(i: 3).setIsUndef();
2374
2375 const MCInstrDesc &OpDesc = get(Opcode: AMDGPU::V_MOV_B32_indirect_write);
2376 MachineInstrBuilder MIB =
2377 BuildMI(BB&: MBB, I&: MI, MIMD: DL, MCID: OpDesc)
2378 .addReg(RegNo: RI.getSubReg(Reg: VecReg, Idx: SubReg), Flags: RegState::Undef)
2379 .add(MO: MI.getOperand(i: 2))
2380 .addReg(RegNo: VecReg, Flags: RegState::ImplicitDefine)
2381 .addReg(RegNo: VecReg, Flags: RegState::Implicit | getUndefRegState(B: IsUndef));
2382
2383 const int ImpDefIdx =
2384 OpDesc.getNumOperands() + OpDesc.implicit_uses().size();
2385 const int ImpUseIdx = ImpDefIdx + 1;
2386 MIB->tieOperands(DefIdx: ImpDefIdx, UseIdx: ImpUseIdx);
2387
2388 MachineInstr *SetOff = BuildMI(BB&: MBB, I&: MI, MIMD: DL, MCID: get(Opcode: AMDGPU::S_SET_GPR_IDX_OFF));
2389
2390 finalizeBundle(MBB, FirstMI: SetOn->getIterator(), LastMI: std::next(x: SetOff->getIterator()));
2391
2392 MI.eraseFromParent();
2393 break;
2394 }
2395 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V1:
2396 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V2:
2397 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V3:
2398 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V4:
2399 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V5:
2400 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V6:
2401 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V7:
2402 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V8:
2403 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V9:
2404 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V10:
2405 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V11:
2406 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V12:
2407 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V16:
2408 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V32: {
2409 assert(ST.useVGPRIndexMode());
2410 Register Dst = MI.getOperand(i: 0).getReg();
2411 Register VecReg = MI.getOperand(i: 1).getReg();
2412 bool IsUndef = MI.getOperand(i: 1).isUndef();
2413 Register Idx = MI.getOperand(i: 2).getReg();
2414 Register SubReg = MI.getOperand(i: 3).getImm();
2415
2416 MachineInstr *SetOn = BuildMI(BB&: MBB, I&: MI, MIMD: DL, MCID: get(Opcode: AMDGPU::S_SET_GPR_IDX_ON))
2417 .addReg(RegNo: Idx)
2418 .addImm(Val: AMDGPU::VGPRIndexMode::SRC0_ENABLE);
2419 SetOn->getOperand(i: 3).setIsUndef();
2420
2421 BuildMI(BB&: MBB, I&: MI, MIMD: DL, MCID: get(Opcode: AMDGPU::V_MOV_B32_indirect_read))
2422 .addDef(RegNo: Dst)
2423 .addReg(RegNo: RI.getSubReg(Reg: VecReg, Idx: SubReg), Flags: RegState::Undef)
2424 .addReg(RegNo: VecReg, Flags: RegState::Implicit | getUndefRegState(B: IsUndef));
2425
2426 MachineInstr *SetOff = BuildMI(BB&: MBB, I&: MI, MIMD: DL, MCID: get(Opcode: AMDGPU::S_SET_GPR_IDX_OFF));
2427
2428 finalizeBundle(MBB, FirstMI: SetOn->getIterator(), LastMI: std::next(x: SetOff->getIterator()));
2429
2430 MI.eraseFromParent();
2431 break;
2432 }
2433 case AMDGPU::SI_PC_ADD_REL_OFFSET: {
2434 MachineFunction &MF = *MBB.getParent();
2435 Register Reg = MI.getOperand(i: 0).getReg();
2436 Register RegLo = RI.getSubReg(Reg, Idx: AMDGPU::sub0);
2437 Register RegHi = RI.getSubReg(Reg, Idx: AMDGPU::sub1);
2438 MachineOperand OpLo = MI.getOperand(i: 1);
2439 MachineOperand OpHi = MI.getOperand(i: 2);
2440
2441 // Create a bundle so these instructions won't be re-ordered by the
2442 // post-RA scheduler.
2443 MIBundleBuilder Bundler(MBB, MI);
2444 Bundler.append(MI: BuildMI(MF, MIMD: DL, MCID: get(Opcode: AMDGPU::S_GETPC_B64), DestReg: Reg));
2445
2446 // What we want here is an offset from the value returned by s_getpc (which
2447 // is the address of the s_add_u32 instruction) to the global variable, but
2448 // since the encoding of $symbol starts 4 bytes after the start of the
2449 // s_add_u32 instruction, we end up with an offset that is 4 bytes too
2450 // small. This requires us to add 4 to the global variable offset in order
2451 // to compute the correct address. Similarly for the s_addc_u32 instruction,
2452 // the encoding of $symbol starts 12 bytes after the start of the s_add_u32
2453 // instruction.
2454
2455 int64_t Adjust = 0;
2456 if (ST.hasGetPCZeroExtension()) {
2457 // Fix up hardware that does not sign-extend the 48-bit PC value by
2458 // inserting: s_sext_i32_i16 reghi, reghi
2459 Bundler.append(
2460 MI: BuildMI(MF, MIMD: DL, MCID: get(Opcode: AMDGPU::S_SEXT_I32_I16), DestReg: RegHi).addReg(RegNo: RegHi));
2461 Adjust += 4;
2462 }
2463
2464 if (OpLo.isGlobal())
2465 OpLo.setOffset(OpLo.getOffset() + Adjust + 4);
2466 Bundler.append(
2467 MI: BuildMI(MF, MIMD: DL, MCID: get(Opcode: AMDGPU::S_ADD_U32), DestReg: RegLo).addReg(RegNo: RegLo).add(MO: OpLo));
2468
2469 if (OpHi.isGlobal())
2470 OpHi.setOffset(OpHi.getOffset() + Adjust + 12);
2471 Bundler.append(MI: BuildMI(MF, MIMD: DL, MCID: get(Opcode: AMDGPU::S_ADDC_U32), DestReg: RegHi)
2472 .addReg(RegNo: RegHi)
2473 .add(MO: OpHi));
2474
2475 finalizeBundle(MBB, FirstMI: Bundler.begin());
2476
2477 MI.eraseFromParent();
2478 break;
2479 }
2480 case AMDGPU::SI_PC_ADD_REL_OFFSET64: {
2481 MachineFunction &MF = *MBB.getParent();
2482 Register Reg = MI.getOperand(i: 0).getReg();
2483 MachineOperand Op = MI.getOperand(i: 1);
2484
2485 // Create a bundle so these instructions won't be re-ordered by the
2486 // post-RA scheduler.
2487 MIBundleBuilder Bundler(MBB, MI);
2488 Bundler.append(MI: BuildMI(MF, MIMD: DL, MCID: get(Opcode: AMDGPU::S_GETPC_B64), DestReg: Reg));
2489 if (Op.isGlobal())
2490 Op.setOffset(Op.getOffset() + 4);
2491 Bundler.append(
2492 MI: BuildMI(MF, MIMD: DL, MCID: get(Opcode: AMDGPU::S_ADD_U64), DestReg: Reg).addReg(RegNo: Reg).add(MO: Op));
2493
2494 finalizeBundle(MBB, FirstMI: Bundler.begin());
2495
2496 MI.eraseFromParent();
2497 break;
2498 }
2499 case AMDGPU::ENTER_STRICT_WWM: {
2500 // This only gets its own opcode so that SIPreAllocateWWMRegs can tell when
2501 // Whole Wave Mode is entered.
2502 MI.setDesc(get(Opcode: LMC.OrSaveExecOpc));
2503 break;
2504 }
2505 case AMDGPU::ENTER_STRICT_WQM: {
2506 // This only gets its own opcode so that SIPreAllocateWWMRegs can tell when
2507 // STRICT_WQM is entered.
2508 BuildMI(BB&: MBB, I&: MI, MIMD: DL, MCID: get(Opcode: LMC.MovOpc), DestReg: MI.getOperand(i: 0).getReg())
2509 .addReg(RegNo: LMC.ExecReg);
2510 BuildMI(BB&: MBB, I&: MI, MIMD: DL, MCID: get(Opcode: LMC.WQMOpc), DestReg: LMC.ExecReg).addReg(RegNo: LMC.ExecReg);
2511
2512 MI.eraseFromParent();
2513 break;
2514 }
2515 case AMDGPU::EXIT_STRICT_WWM:
2516 case AMDGPU::EXIT_STRICT_WQM: {
2517 // This only gets its own opcode so that SIPreAllocateWWMRegs can tell when
2518 // WWM/STRICT_WQM is exited.
2519 MI.setDesc(get(Opcode: LMC.MovOpc));
2520 break;
2521 }
2522 case AMDGPU::SI_RETURN: {
2523 const MachineFunction *MF = MBB.getParent();
2524 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
2525 const SIRegisterInfo *TRI = ST.getRegisterInfo();
2526 // Hiding the return address use with SI_RETURN may lead to extra kills in
2527 // the function and missing live-ins. We are fine in practice because
2528 // callee-saved register handling ensures the register value is restored before
2529 // RET, but we need the undef flag here to appease the MachineVerifier
2530 // liveness checks.
2531 MachineInstrBuilder MIB =
2532 BuildMI(BB&: MBB, I&: MI, MIMD: DL, MCID: get(Opcode: AMDGPU::S_SETPC_B64_return))
2533 .addReg(RegNo: TRI->getReturnAddressReg(MF: *MF), Flags: RegState::Undef);
2534
2535 MIB.copyImplicitOps(OtherMI: MI);
2536 MI.eraseFromParent();
2537 break;
2538 }
2539
2540 case AMDGPU::S_MUL_U64_U32_PSEUDO:
2541 case AMDGPU::S_MUL_I64_I32_PSEUDO:
2542 MI.setDesc(get(Opcode: AMDGPU::S_MUL_U64));
2543 break;
2544
2545 case AMDGPU::S_GETPC_B64_pseudo:
2546 MI.setDesc(get(Opcode: AMDGPU::S_GETPC_B64));
2547 if (ST.hasGetPCZeroExtension()) {
2548 Register Dst = MI.getOperand(i: 0).getReg();
2549 Register DstHi = RI.getSubReg(Reg: Dst, Idx: AMDGPU::sub1);
2550 // Fix up hardware that does not sign-extend the 48-bit PC value by
2551 // inserting: s_sext_i32_i16 dsthi, dsthi
2552 BuildMI(BB&: MBB, I: std::next(x: MI.getIterator()), MIMD: DL, MCID: get(Opcode: AMDGPU::S_SEXT_I32_I16),
2553 DestReg: DstHi)
2554 .addReg(RegNo: DstHi);
2555 }
2556 break;
2557
2558 case AMDGPU::V_MAX_BF16_PSEUDO_e64: {
2559 assert(ST.hasBF16PackedInsts());
2560 MI.setDesc(get(Opcode: AMDGPU::V_PK_MAX_NUM_BF16));
2561 MI.addOperand(Op: MachineOperand::CreateImm(Val: 0)); // op_sel
2562 MI.addOperand(Op: MachineOperand::CreateImm(Val: 0)); // neg_lo
2563 MI.addOperand(Op: MachineOperand::CreateImm(Val: 0)); // neg_hi
2564 auto Op0 = getNamedOperand(MI, OperandName: AMDGPU::OpName::src0_modifiers);
2565 Op0->setImm(Op0->getImm() | SISrcMods::OP_SEL_1);
2566 auto Op1 = getNamedOperand(MI, OperandName: AMDGPU::OpName::src1_modifiers);
2567 Op1->setImm(Op1->getImm() | SISrcMods::OP_SEL_1);
2568 break;
2569 }
2570
2571 case AMDGPU::GET_STACK_BASE:
2572 // The stack starts at offset 0 unless we need to reserve some space at the
2573 // bottom.
2574 if (ST.getFrameLowering()->mayReserveScratchForCWSR(MF: *MBB.getParent())) {
2575 // When CWSR is used in dynamic VGPR mode, the trap handler needs to save
2576 // some of the VGPRs. The size of the required scratch space has already
2577 // been computed by prolog epilog insertion.
2578 const SIMachineFunctionInfo *MFI =
2579 MBB.getParent()->getInfo<SIMachineFunctionInfo>();
2580 unsigned VGPRSize = MFI->getScratchReservedForDynamicVGPRs();
2581 Register DestReg = MI.getOperand(i: 0).getReg();
2582 BuildMI(BB&: MBB, I&: MI, MIMD: DL, MCID: get(Opcode: AMDGPU::S_GETREG_B32), DestReg)
2583 .addImm(Val: AMDGPU::Hwreg::HwregEncoding::encode(
2584 Values: AMDGPU::Hwreg::ID_HW_ID2, Values: AMDGPU::Hwreg::OFFSET_ME_ID, Values: 2));
2585 // The MicroEngine ID is 0 for the graphics queue, and 1 or 2 for compute
2586 // (3 is unused, so we ignore it). Unfortunately, S_GETREG doesn't set
2587 // SCC, so we need to check for 0 manually.
2588 BuildMI(BB&: MBB, I&: MI, MIMD: DL, MCID: get(Opcode: AMDGPU::S_CMP_LG_U32)).addImm(Val: 0).addReg(RegNo: DestReg);
2589 // Change the implicit def of SCC to an explicit use (but first remove
2590 // the dead flag if present).
2591 MI.getOperand(i: MI.getNumExplicitOperands()).setIsDead(false);
2592 MI.getOperand(i: MI.getNumExplicitOperands()).setIsUse();
2593 MI.setDesc(get(Opcode: AMDGPU::S_CMOVK_I32));
2594 MI.addOperand(Op: MachineOperand::CreateImm(Val: VGPRSize));
2595 } else {
2596 MI.setDesc(get(Opcode: AMDGPU::S_MOV_B32));
2597 MI.addOperand(Op: MachineOperand::CreateImm(Val: 0));
2598 MI.removeOperand(
2599 OpNo: MI.getNumExplicitOperands()); // Drop implicit def of SCC.
2600 }
2601 break;
2602 }
2603
2604 return true;
2605}
2606
2607void SIInstrInfo::reMaterialize(MachineBasicBlock &MBB,
2608 MachineBasicBlock::iterator I, Register DestReg,
2609 unsigned SubIdx,
2610 const MachineInstr &Orig) const {
2611
2612 // Try shrinking the instruction to remat only the part needed in the current
2613 // context.
2614 // TODO: Handle more cases.
2615 unsigned Opcode = Orig.getOpcode();
2616 switch (Opcode) {
2617 case AMDGPU::S_LOAD_DWORDX16_IMM:
2618 case AMDGPU::S_LOAD_DWORDX8_IMM: {
2619 if (SubIdx != 0)
2620 break;
2621
2622 if (I == MBB.end())
2623 break;
2624
2625 if (I->isBundled())
2626 break;
2627
2628 // Look for a single use of the register that is also a subreg.
2629 Register RegToFind = Orig.getOperand(i: 0).getReg();
2630 MachineOperand *UseMO = nullptr;
2631 for (auto &CandMO : I->operands()) {
2632 if (!CandMO.isReg() || CandMO.getReg() != RegToFind || CandMO.isDef())
2633 continue;
2634 if (UseMO) {
2635 UseMO = nullptr;
2636 break;
2637 }
2638 UseMO = &CandMO;
2639 }
2640 if (!UseMO || UseMO->getSubReg() == AMDGPU::NoSubRegister)
2641 break;
2642
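 // getSubRegIdxOffset/Size report bits; the memory operand size and the load
 // offset adjustment below convert back to bytes (divide by 8).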
2643 unsigned Offset = RI.getSubRegIdxOffset(Idx: UseMO->getSubReg());
2644 unsigned SubregSize = RI.getSubRegIdxSize(Idx: UseMO->getSubReg());
2645
2646 MachineFunction *MF = MBB.getParent();
2647 MachineRegisterInfo &MRI = MF->getRegInfo();
2648 assert(MRI.use_nodbg_empty(DestReg) && "DestReg should have no users yet.");
2649
2650 unsigned NewOpcode = -1;
2651 if (SubregSize == 256)
2652 NewOpcode = AMDGPU::S_LOAD_DWORDX8_IMM;
2653 else if (SubregSize == 128)
2654 NewOpcode = AMDGPU::S_LOAD_DWORDX4_IMM;
2655 else
2656 break;
2657
2658 const MCInstrDesc &TID = get(Opcode: NewOpcode);
2659 const TargetRegisterClass *NewRC =
2660 RI.getAllocatableClass(RC: getRegClass(MCID: TID, OpNum: 0));
2661 MRI.setRegClass(Reg: DestReg, RC: NewRC);
2662
2663 UseMO->setReg(DestReg);
2664 UseMO->setSubReg(AMDGPU::NoSubRegister);
2665
2666 // Use a smaller load with the desired size, possibly with updated offset.
2667 MachineInstr *MI = MF->CloneMachineInstr(Orig: &Orig);
2668 MI->setDesc(TID);
2669 MI->getOperand(i: 0).setReg(DestReg);
2670 MI->getOperand(i: 0).setSubReg(AMDGPU::NoSubRegister);
2671 if (Offset) {
2672 MachineOperand *OffsetMO = getNamedOperand(MI&: *MI, OperandName: AMDGPU::OpName::offset);
2673 int64_t FinalOffset = OffsetMO->getImm() + Offset / 8;
2674 OffsetMO->setImm(FinalOffset);
2675 }
2676 SmallVector<MachineMemOperand *> NewMMOs;
2677 for (const MachineMemOperand *MemOp : Orig.memoperands())
2678 NewMMOs.push_back(Elt: MF->getMachineMemOperand(MMO: MemOp, PtrInfo: MemOp->getPointerInfo(),
2679 Size: SubregSize / 8));
2680 MI->setMemRefs(MF&: *MF, MemRefs: NewMMOs);
2681
2682 MBB.insert(I, MI);
2683 return;
2684 }
2685
2686 default:
2687 break;
2688 }
2689
2690 TargetInstrInfo::reMaterialize(MBB, MI: I, DestReg, SubIdx, Orig);
2691}
2692
2693std::pair<MachineInstr*, MachineInstr*>
2694SIInstrInfo::expandMovDPP64(MachineInstr &MI) const {
2695 assert (MI.getOpcode() == AMDGPU::V_MOV_B64_DPP_PSEUDO);
2696
2697 if (ST.hasMovB64() && ST.hasFeature(Feature: AMDGPU::FeatureDPALU_DPP) &&
2698 AMDGPU::isLegalDPALU_DPPControl(
2699 ST, DC: getNamedOperand(MI, OperandName: AMDGPU::OpName::dpp_ctrl)->getImm())) {
2700 MI.setDesc(get(Opcode: AMDGPU::V_MOV_B64_dpp));
2701 return std::pair(&MI, nullptr);
2702 }
2703
2704 MachineBasicBlock &MBB = *MI.getParent();
2705 DebugLoc DL = MBB.findDebugLoc(MBBI: MI);
2706 MachineFunction *MF = MBB.getParent();
2707 MachineRegisterInfo &MRI = MF->getRegInfo();
2708 Register Dst = MI.getOperand(i: 0).getReg();
2709 unsigned Part = 0;
2710 MachineInstr *Split[2];
2711
2712 for (auto Sub : { AMDGPU::sub0, AMDGPU::sub1 }) {
2713 auto MovDPP = BuildMI(BB&: MBB, I&: MI, MIMD: DL, MCID: get(Opcode: AMDGPU::V_MOV_B32_dpp));
2714 if (Dst.isPhysical()) {
2715 MovDPP.addDef(RegNo: RI.getSubReg(Reg: Dst, Idx: Sub));
2716 } else {
2717 assert(MRI.isSSA());
2718 auto Tmp = MRI.createVirtualRegister(RegClass: &AMDGPU::VGPR_32RegClass);
2719 MovDPP.addDef(RegNo: Tmp);
2720 }
2721
2722 for (unsigned I = 1; I <= 2; ++I) { // old and src operands.
2723 const MachineOperand &SrcOp = MI.getOperand(i: I);
2724 assert(!SrcOp.isFPImm());
2725 if (SrcOp.isImm()) {
2726 APInt Imm(64, SrcOp.getImm());
2727 Imm.ashrInPlace(ShiftAmt: Part * 32);
2728 MovDPP.addImm(Val: Imm.getLoBits(numBits: 32).getZExtValue());
2729 } else {
2730 assert(SrcOp.isReg());
2731 Register Src = SrcOp.getReg();
2732 if (Src.isPhysical())
2733 MovDPP.addReg(RegNo: RI.getSubReg(Reg: Src, Idx: Sub));
2734 else
2735 MovDPP.addReg(RegNo: Src, Flags: getUndefRegState(B: SrcOp.isUndef()), SubReg: Sub);
2736 }
2737 }
2738
2739 for (const MachineOperand &MO : llvm::drop_begin(RangeOrContainer: MI.explicit_operands(), N: 3))
2740 MovDPP.addImm(Val: MO.getImm());
2741
2742 Split[Part] = MovDPP;
2743 ++Part;
2744 }
2745
2746 if (Dst.isVirtual())
2747 BuildMI(BB&: MBB, I&: MI, MIMD: DL, MCID: get(Opcode: AMDGPU::REG_SEQUENCE), DestReg: Dst)
2748 .addReg(RegNo: Split[0]->getOperand(i: 0).getReg())
2749 .addImm(Val: AMDGPU::sub0)
2750 .addReg(RegNo: Split[1]->getOperand(i: 0).getReg())
2751 .addImm(Val: AMDGPU::sub1);
2752
2753 MI.eraseFromParent();
2754 return std::pair(Split[0], Split[1]);
2755}
2756
2757std::optional<DestSourcePair>
2758SIInstrInfo::isCopyInstrImpl(const MachineInstr &MI) const {
2759 if (MI.getOpcode() == AMDGPU::WWM_COPY)
2760 return DestSourcePair{MI.getOperand(i: 0), MI.getOperand(i: 1)};
2761
2762 return std::nullopt;
2763}
2764
2765bool SIInstrInfo::swapSourceModifiers(MachineInstr &MI, MachineOperand &Src0,
2766 AMDGPU::OpName Src0OpName,
2767 MachineOperand &Src1,
2768 AMDGPU::OpName Src1OpName) const {
2769 MachineOperand *Src0Mods = getNamedOperand(MI, OperandName: Src0OpName);
2770 if (!Src0Mods)
2771 return false;
2772
2773 MachineOperand *Src1Mods = getNamedOperand(MI, OperandName: Src1OpName);
2774 assert(Src1Mods &&
2775 "All commutable instructions have both src0 and src1 modifiers");
2776
2777 int Src0ModsVal = Src0Mods->getImm();
2778 int Src1ModsVal = Src1Mods->getImm();
2779
2780 Src1Mods->setImm(Src0ModsVal);
2781 Src0Mods->setImm(Src1ModsVal);
2782 return true;
2783}
2784
2785static MachineInstr *swapRegAndNonRegOperand(MachineInstr &MI,
2786 MachineOperand &RegOp,
2787 MachineOperand &NonRegOp) {
2788 Register Reg = RegOp.getReg();
2789 unsigned SubReg = RegOp.getSubReg();
2790 bool IsKill = RegOp.isKill();
2791 bool IsDead = RegOp.isDead();
2792 bool IsUndef = RegOp.isUndef();
2793 bool IsDebug = RegOp.isDebug();
2794
2795 if (NonRegOp.isImm())
2796 RegOp.ChangeToImmediate(ImmVal: NonRegOp.getImm());
2797 else if (NonRegOp.isFI())
2798 RegOp.ChangeToFrameIndex(Idx: NonRegOp.getIndex());
2799 else if (NonRegOp.isGlobal()) {
2800 RegOp.ChangeToGA(GV: NonRegOp.getGlobal(), Offset: NonRegOp.getOffset(),
2801 TargetFlags: NonRegOp.getTargetFlags());
2802 } else
2803 return nullptr;
2804
2805 // Make sure we don't reinterpret a subreg index in the target flags.
2806 RegOp.setTargetFlags(NonRegOp.getTargetFlags());
2807
2808 NonRegOp.ChangeToRegister(Reg, isDef: false, isImp: false, isKill: IsKill, isDead: IsDead, isUndef: IsUndef, isDebug: IsDebug);
2809 NonRegOp.setSubReg(SubReg);
2810
2811 return &MI;
2812}
2813
2814static MachineInstr *swapImmOperands(MachineInstr &MI,
2815 MachineOperand &NonRegOp1,
2816 MachineOperand &NonRegOp2) {
2817 unsigned TargetFlags = NonRegOp1.getTargetFlags();
2818 int64_t NonRegVal = NonRegOp1.getImm();
2819
2820 NonRegOp1.setImm(NonRegOp2.getImm());
2821 NonRegOp2.setImm(NonRegVal);
2822 NonRegOp1.setTargetFlags(NonRegOp2.getTargetFlags());
2823 NonRegOp2.setTargetFlags(TargetFlags);
2824 return &MI;
2825}
2826
2827bool SIInstrInfo::isLegalToSwap(const MachineInstr &MI, unsigned OpIdx0,
2828 unsigned OpIdx1) const {
2829 const MCInstrDesc &InstDesc = MI.getDesc();
2830 const MCOperandInfo &OpInfo0 = InstDesc.operands()[OpIdx0];
2831 const MCOperandInfo &OpInfo1 = InstDesc.operands()[OpIdx1];
2832
2833 unsigned Opc = MI.getOpcode();
2834 int Src0Idx = AMDGPU::getNamedOperandIdx(Opcode: Opc, Name: AMDGPU::OpName::src0);
2835
2836 const MachineOperand &MO0 = MI.getOperand(i: OpIdx0);
2837 const MachineOperand &MO1 = MI.getOperand(i: OpIdx1);
2838
2839 // Check that the swap doesn't breach constant bus or literal limits.
2840 // It may move a literal to a position other than src0, which is not allowed
2841 // pre-gfx10. However, most test cases need literals in src0 for VOP.
2842 // FIXME: After gfx9, a literal can be placed somewhere other than src0.
2843 if (isVALU(MI)) {
2844 if ((int)OpIdx0 == Src0Idx && !MO0.isReg() &&
2845 !isInlineConstant(MO: MO0, OpInfo: OpInfo1))
2846 return false;
2847 if ((int)OpIdx1 == Src0Idx && !MO1.isReg() &&
2848 !isInlineConstant(MO: MO1, OpInfo: OpInfo0))
2849 return false;
2850 }
2851
2852 if ((int)OpIdx1 != Src0Idx && MO0.isReg()) {
2853 if (OpInfo1.RegClass == -1)
2854 return OpInfo1.OperandType == MCOI::OPERAND_UNKNOWN;
2855 return isLegalRegOperand(MI, OpIdx: OpIdx1, MO: MO0) &&
2856 (!MO1.isReg() || isLegalRegOperand(MI, OpIdx: OpIdx0, MO: MO1));
2857 }
2858 if ((int)OpIdx0 != Src0Idx && MO1.isReg()) {
2859 if (OpInfo0.RegClass == -1)
2860 return OpInfo0.OperandType == MCOI::OPERAND_UNKNOWN;
2861 return (!MO0.isReg() || isLegalRegOperand(MI, OpIdx: OpIdx1, MO: MO0)) &&
2862 isLegalRegOperand(MI, OpIdx: OpIdx0, MO: MO1);
2863 }
2864
2865 // No need to check 64-bit literals, since swapping does not bring new
2866 // 64-bit literals into the current instruction to fold to 32 bits.
2867
2868 return isImmOperandLegal(MI, OpNo: OpIdx1, MO: MO0);
2869}
2870
2871MachineInstr *SIInstrInfo::commuteInstructionImpl(MachineInstr &MI, bool NewMI,
2872 unsigned Src0Idx,
2873 unsigned Src1Idx) const {
2874 assert(!NewMI && "this should never be used");
2875
2876 unsigned Opc = MI.getOpcode();
2877 int CommutedOpcode = commuteOpcode(Opcode: Opc);
2878 if (CommutedOpcode == -1)
2879 return nullptr;
2880
2881 if (Src0Idx > Src1Idx)
2882 std::swap(a&: Src0Idx, b&: Src1Idx);
2883
2884 assert(AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0) ==
2885 static_cast<int>(Src0Idx) &&
2886 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1) ==
2887 static_cast<int>(Src1Idx) &&
2888 "inconsistency with findCommutedOpIndices");
2889
2890 if (!isLegalToSwap(MI, OpIdx0: Src0Idx, OpIdx1: Src1Idx))
2891 return nullptr;
2892
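 // Dispatch on the operand kinds: two registers commute via the generic path,
 // a register and a non-register swap in place, and two immediates simply
 // exchange their values.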
2893 MachineInstr *CommutedMI = nullptr;
2894 MachineOperand &Src0 = MI.getOperand(i: Src0Idx);
2895 MachineOperand &Src1 = MI.getOperand(i: Src1Idx);
2896 if (Src0.isReg() && Src1.isReg()) {
2897 // Be sure to copy the source modifiers to the right place.
2898 CommutedMI =
2899 TargetInstrInfo::commuteInstructionImpl(MI, NewMI, OpIdx1: Src0Idx, OpIdx2: Src1Idx);
2900 } else if (Src0.isReg() && !Src1.isReg()) {
2901 CommutedMI = swapRegAndNonRegOperand(MI, RegOp&: Src0, NonRegOp&: Src1);
2902 } else if (!Src0.isReg() && Src1.isReg()) {
2903 CommutedMI = swapRegAndNonRegOperand(MI, RegOp&: Src1, NonRegOp&: Src0);
2904 } else if (Src0.isImm() && Src1.isImm()) {
2905 CommutedMI = swapImmOperands(MI, NonRegOp1&: Src0, NonRegOp2&: Src1);
2906 } else {
2907 // FIXME: Found two non registers to commute. This does happen.
2908 return nullptr;
2909 }
2910
2911 if (CommutedMI) {
2912 swapSourceModifiers(MI, Src0, Src0OpName: AMDGPU::OpName::src0_modifiers,
2913 Src1, Src1OpName: AMDGPU::OpName::src1_modifiers);
2914
2915 swapSourceModifiers(MI, Src0, Src0OpName: AMDGPU::OpName::src0_sel, Src1,
2916 Src1OpName: AMDGPU::OpName::src1_sel);
2917
2918 CommutedMI->setDesc(get(Opcode: CommutedOpcode));
2919 }
2920
2921 return CommutedMI;
2922}
2923
2924// This needs to be implemented because the source modifiers may be inserted
2925// between the true commutable operands, and the base
2926// TargetInstrInfo::commuteInstruction uses it.
2927bool SIInstrInfo::findCommutedOpIndices(const MachineInstr &MI,
2928 unsigned &SrcOpIdx0,
2929 unsigned &SrcOpIdx1) const {
2930 return findCommutedOpIndices(Desc: MI.getDesc(), SrcOpIdx0, SrcOpIdx1);
2931}
2932
2933bool SIInstrInfo::findCommutedOpIndices(const MCInstrDesc &Desc,
2934 unsigned &SrcOpIdx0,
2935 unsigned &SrcOpIdx1) const {
2936 if (!Desc.isCommutable())
2937 return false;
2938
2939 unsigned Opc = Desc.getOpcode();
2940 int Src0Idx = AMDGPU::getNamedOperandIdx(Opcode: Opc, Name: AMDGPU::OpName::src0);
2941 if (Src0Idx == -1)
2942 return false;
2943
2944 int Src1Idx = AMDGPU::getNamedOperandIdx(Opcode: Opc, Name: AMDGPU::OpName::src1);
2945 if (Src1Idx == -1)
2946 return false;
2947
2948 return fixCommutedOpIndices(ResultIdx1&: SrcOpIdx0, ResultIdx2&: SrcOpIdx1, CommutableOpIdx1: Src0Idx, CommutableOpIdx2: Src1Idx);
2949}
2950
2951bool SIInstrInfo::isBranchOffsetInRange(unsigned BranchOp,
2952 int64_t BrOffset) const {
  // BranchRelaxation should never have to check s_setpc_b64 or s_add_pc_i64
  // because their destination blocks are unanalyzable.
2955 assert(isSOPP(BranchOp) || isSOPK(BranchOp));
2956
2957 // Convert to dwords.
2958 BrOffset /= 4;
2959
2960 // The branch instructions do PC += signext(SIMM16 * 4) + 4, so the offset is
2961 // from the next instruction.
2962 BrOffset -= 1;
2963
2964 return isIntN(N: BranchOffsetBits, x: BrOffset);
2965}
2966
2967MachineBasicBlock *
2968SIInstrInfo::getBranchDestBlock(const MachineInstr &MI) const {
2969 return MI.getOperand(i: 0).getMBB();
2970}
2971
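// Return true if the block is terminated by one of the structured
// control-flow pseudos (SI_IF, SI_ELSE, SI_LOOP), i.e. it ends in a divergent
// branch.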
2972bool SIInstrInfo::hasDivergentBranch(const MachineBasicBlock *MBB) const {
2973 for (const MachineInstr &MI : MBB->terminators()) {
2974 if (MI.getOpcode() == AMDGPU::SI_IF || MI.getOpcode() == AMDGPU::SI_ELSE ||
2975 MI.getOpcode() == AMDGPU::SI_LOOP)
2976 return true;
2977 }
2978 return false;
2979}
2980
2981void SIInstrInfo::insertIndirectBranch(MachineBasicBlock &MBB,
2982 MachineBasicBlock &DestBB,
2983 MachineBasicBlock &RestoreBB,
2984 const DebugLoc &DL, int64_t BrOffset,
2985 RegScavenger *RS) const {
2986 assert(MBB.empty() &&
2987 "new block should be inserted for expanding unconditional branch");
2988 assert(MBB.pred_size() == 1);
2989 assert(RestoreBB.empty() &&
2990 "restore block should be inserted for restoring clobbered registers");
2991
2992 MachineFunction *MF = MBB.getParent();
2993 MachineRegisterInfo &MRI = MF->getRegInfo();
2994 const SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
2995 auto I = MBB.end();
2996 auto &MCCtx = MF->getContext();
2997
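  // With S_ADD_PC_I64 the far branch expands to a single PC-relative add, so
  // no register scavenging or spill/restore block is needed.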
2998 if (ST.useAddPC64Inst()) {
2999 MCSymbol *Offset =
3000 MCCtx.createTempSymbol(Name: "offset", /*AlwaysAddSuffix=*/true);
3001 auto AddPC = BuildMI(BB&: MBB, I, MIMD: DL, MCID: get(Opcode: AMDGPU::S_ADD_PC_I64))
3002 .addSym(Sym: Offset, TargetFlags: MO_FAR_BRANCH_OFFSET);
3003 MCSymbol *PostAddPCLabel =
3004 MCCtx.createTempSymbol(Name: "post_addpc", /*AlwaysAddSuffix=*/true);
3005 AddPC->setPostInstrSymbol(MF&: *MF, Symbol: PostAddPCLabel);
3006 auto *OffsetExpr = MCBinaryExpr::createSub(
3007 LHS: MCSymbolRefExpr::create(Symbol: DestBB.getSymbol(), Ctx&: MCCtx),
3008 RHS: MCSymbolRefExpr::create(Symbol: PostAddPCLabel, Ctx&: MCCtx), Ctx&: MCCtx);
3009 Offset->setVariableValue(OffsetExpr);
3010 return;
3011 }
3012
3013 assert(RS && "RegScavenger required for long branching");
3014
3015 // FIXME: Virtual register workaround for RegScavenger not working with empty
3016 // blocks.
3017 Register PCReg = MRI.createVirtualRegister(RegClass: &AMDGPU::SReg_64RegClass);
3018
  // Note: as this is used after the hazard recognizer, we need to apply some
  // hazard workarounds directly.
3021 const bool FlushSGPRWrites = (ST.isWave64() && ST.hasVALUMaskWriteHazard()) ||
3022 ST.hasVALUReadSGPRHazard();
3023 auto ApplyHazardWorkarounds = [this, &MBB, &I, &DL, FlushSGPRWrites]() {
3024 if (FlushSGPRWrites)
3025 BuildMI(BB&: MBB, I, MIMD: DL, MCID: get(Opcode: AMDGPU::S_WAITCNT_DEPCTR))
3026 .addImm(Val: AMDGPU::DepCtr::encodeFieldSaSdst(SaSdst: 0, STI: ST));
3027 };
3028
  // We need to compute the offset relative to the instruction immediately
  // after s_getpc_b64. Insert the PC arithmetic code before the last
  // terminator.
3031 MachineInstr *GetPC = BuildMI(BB&: MBB, I, MIMD: DL, MCID: get(Opcode: AMDGPU::S_GETPC_B64), DestReg: PCReg);
3032 ApplyHazardWorkarounds();
3033
3034 MCSymbol *PostGetPCLabel =
3035 MCCtx.createTempSymbol(Name: "post_getpc", /*AlwaysAddSuffix=*/true);
3036 GetPC->setPostInstrSymbol(MF&: *MF, Symbol: PostGetPCLabel);
3037
3038 MCSymbol *OffsetLo =
3039 MCCtx.createTempSymbol(Name: "offset_lo", /*AlwaysAddSuffix=*/true);
3040 MCSymbol *OffsetHi =
3041 MCCtx.createTempSymbol(Name: "offset_hi", /*AlwaysAddSuffix=*/true);
3042 BuildMI(BB&: MBB, I, MIMD: DL, MCID: get(Opcode: AMDGPU::S_ADD_U32))
3043 .addReg(RegNo: PCReg, Flags: RegState::Define, SubReg: AMDGPU::sub0)
3044 .addReg(RegNo: PCReg, Flags: {}, SubReg: AMDGPU::sub0)
3045 .addSym(Sym: OffsetLo, TargetFlags: MO_FAR_BRANCH_OFFSET);
3046 BuildMI(BB&: MBB, I, MIMD: DL, MCID: get(Opcode: AMDGPU::S_ADDC_U32))
3047 .addReg(RegNo: PCReg, Flags: RegState::Define, SubReg: AMDGPU::sub1)
3048 .addReg(RegNo: PCReg, Flags: {}, SubReg: AMDGPU::sub1)
3049 .addSym(Sym: OffsetHi, TargetFlags: MO_FAR_BRANCH_OFFSET);
3050 ApplyHazardWorkarounds();
3051
3052 // Insert the indirect branch after the other terminator.
3053 BuildMI(BB: &MBB, MIMD: DL, MCID: get(Opcode: AMDGPU::S_SETPC_B64))
3054 .addReg(RegNo: PCReg);
3055
3056 // If a spill is needed for the pc register pair, we need to insert a spill
3057 // restore block right before the destination block, and insert a short branch
3058 // into the old destination block's fallthrough predecessor.
3059 // e.g.:
3060 //
3061 // s_cbranch_scc0 skip_long_branch:
3062 //
3063 // long_branch_bb:
3064 // spill s[8:9]
3065 // s_getpc_b64 s[8:9]
3066 // s_add_u32 s8, s8, restore_bb
3067 // s_addc_u32 s9, s9, 0
3068 // s_setpc_b64 s[8:9]
3069 //
3070 // skip_long_branch:
3071 // foo;
3072 //
3073 // .....
3074 //
3075 // dest_bb_fallthrough_predecessor:
3076 // bar;
3077 // s_branch dest_bb
3078 //
3079 // restore_bb:
3080 // restore s[8:9]
3081 // fallthrough dest_bb
  //
3083 // dest_bb:
3084 // buzz;
3085
3086 Register LongBranchReservedReg = MFI->getLongBranchReservedReg();
3087 Register Scav;
3088
  // If we've previously reserved a register for long branches, avoid running
  // the scavenger and just use that register.
3091 if (LongBranchReservedReg) {
3092 RS->enterBasicBlock(MBB);
3093 Scav = LongBranchReservedReg;
3094 } else {
3095 RS->enterBasicBlockEnd(MBB);
3096 Scav = RS->scavengeRegisterBackwards(
3097 RC: AMDGPU::SReg_64RegClass, To: MachineBasicBlock::iterator(GetPC),
3098 /* RestoreAfter */ false, SPAdj: 0, /* AllowSpill */ false);
3099 }
3100 if (Scav) {
3101 RS->setRegUsed(Reg: Scav);
3102 MRI.replaceRegWith(FromReg: PCReg, ToReg: Scav);
3103 MRI.clearVirtRegs();
3104 } else {
    // Since spilling an SGPR requires a VGPR, reuse the slot of the temporary
    // VGPR for the SGPR spill.
3107 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
3108 const SIRegisterInfo *TRI = ST.getRegisterInfo();
3109 TRI->spillEmergencySGPR(MI: GetPC, RestoreMBB&: RestoreBB, SGPR: AMDGPU::SGPR0_SGPR1, RS);
3110 MRI.replaceRegWith(FromReg: PCReg, ToReg: AMDGPU::SGPR0_SGPR1);
3111 MRI.clearVirtRegs();
3112 }
3113
3114 MCSymbol *DestLabel = Scav ? DestBB.getSymbol() : RestoreBB.getSymbol();
  // Now the distance can be defined.
3116 auto *Offset = MCBinaryExpr::createSub(
3117 LHS: MCSymbolRefExpr::create(Symbol: DestLabel, Ctx&: MCCtx),
3118 RHS: MCSymbolRefExpr::create(Symbol: PostGetPCLabel, Ctx&: MCCtx), Ctx&: MCCtx);
3119 // Add offset assignments.
3120 auto *Mask = MCConstantExpr::create(Value: 0xFFFFFFFFULL, Ctx&: MCCtx);
3121 OffsetLo->setVariableValue(MCBinaryExpr::createAnd(LHS: Offset, RHS: Mask, Ctx&: MCCtx));
3122 auto *ShAmt = MCConstantExpr::create(Value: 32, Ctx&: MCCtx);
3123 OffsetHi->setVariableValue(MCBinaryExpr::createAShr(LHS: Offset, RHS: ShAmt, Ctx&: MCCtx));
3124}
3125
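// Map a BranchPredicate to the corresponding s_cbranch opcode.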
3126unsigned SIInstrInfo::getBranchOpcode(SIInstrInfo::BranchPredicate Cond) {
3127 switch (Cond) {
3128 case SIInstrInfo::SCC_TRUE:
3129 return AMDGPU::S_CBRANCH_SCC1;
3130 case SIInstrInfo::SCC_FALSE:
3131 return AMDGPU::S_CBRANCH_SCC0;
3132 case SIInstrInfo::VCCNZ:
3133 return AMDGPU::S_CBRANCH_VCCNZ;
3134 case SIInstrInfo::VCCZ:
3135 return AMDGPU::S_CBRANCH_VCCZ;
3136 case SIInstrInfo::EXECNZ:
3137 return AMDGPU::S_CBRANCH_EXECNZ;
3138 case SIInstrInfo::EXECZ:
3139 return AMDGPU::S_CBRANCH_EXECZ;
3140 default:
3141 llvm_unreachable("invalid branch predicate");
3142 }
3143}
3144
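// Map an s_cbranch opcode back to its BranchPredicate, or INVALID_BR if the
// opcode is not a recognized conditional branch.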
3145SIInstrInfo::BranchPredicate SIInstrInfo::getBranchPredicate(unsigned Opcode) {
3146 switch (Opcode) {
3147 case AMDGPU::S_CBRANCH_SCC0:
3148 return SCC_FALSE;
3149 case AMDGPU::S_CBRANCH_SCC1:
3150 return SCC_TRUE;
3151 case AMDGPU::S_CBRANCH_VCCNZ:
3152 return VCCNZ;
3153 case AMDGPU::S_CBRANCH_VCCZ:
3154 return VCCZ;
3155 case AMDGPU::S_CBRANCH_EXECNZ:
3156 return EXECNZ;
3157 case AMDGPU::S_CBRANCH_EXECZ:
3158 return EXECZ;
3159 default:
3160 return INVALID_BR;
3161 }
3162}
3163
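// Analyze the branch sequence starting at I. On success, fill in TBB/FBB/Cond
// and return false; return true if the terminator cannot be analyzed.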
3164bool SIInstrInfo::analyzeBranchImpl(MachineBasicBlock &MBB,
3165 MachineBasicBlock::iterator I,
3166 MachineBasicBlock *&TBB,
3167 MachineBasicBlock *&FBB,
3168 SmallVectorImpl<MachineOperand> &Cond,
3169 bool AllowModify) const {
3170 if (I->getOpcode() == AMDGPU::S_BRANCH) {
3171 // Unconditional Branch
3172 TBB = I->getOperand(i: 0).getMBB();
3173 return false;
3174 }
3175
3176 BranchPredicate Pred = getBranchPredicate(Opcode: I->getOpcode());
3177 if (Pred == INVALID_BR)
3178 return true;
3179
3180 MachineBasicBlock *CondBB = I->getOperand(i: 0).getMBB();
3181 Cond.push_back(Elt: MachineOperand::CreateImm(Val: Pred));
3182 Cond.push_back(Elt: I->getOperand(i: 1)); // Save the branch register.
3183
3184 ++I;
3185
3186 if (I == MBB.end()) {
3187 // Conditional branch followed by fall-through.
3188 TBB = CondBB;
3189 return false;
3190 }
3191
3192 if (I->getOpcode() == AMDGPU::S_BRANCH) {
3193 TBB = CondBB;
3194 FBB = I->getOperand(i: 0).getMBB();
3195 return false;
3196 }
3197
3198 return true;
3199}
3200
3201bool SIInstrInfo::analyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB,
3202 MachineBasicBlock *&FBB,
3203 SmallVectorImpl<MachineOperand> &Cond,
3204 bool AllowModify) const {
3205 MachineBasicBlock::iterator I = MBB.getFirstTerminator();
3206 auto E = MBB.end();
3207 if (I == E)
3208 return false;
3209
3210 // Skip over the instructions that are artificially terminators for special
3211 // exec management.
3212 while (I != E && !I->isBranch() && !I->isReturn()) {
3213 switch (I->getOpcode()) {
3214 case AMDGPU::S_MOV_B64_term:
3215 case AMDGPU::S_XOR_B64_term:
3216 case AMDGPU::S_OR_B64_term:
3217 case AMDGPU::S_ANDN2_B64_term:
3218 case AMDGPU::S_AND_B64_term:
3219 case AMDGPU::S_AND_SAVEEXEC_B64_term:
3220 case AMDGPU::S_MOV_B32_term:
3221 case AMDGPU::S_XOR_B32_term:
3222 case AMDGPU::S_OR_B32_term:
3223 case AMDGPU::S_ANDN2_B32_term:
3224 case AMDGPU::S_AND_B32_term:
3225 case AMDGPU::S_AND_SAVEEXEC_B32_term:
3226 break;
3227 case AMDGPU::SI_IF:
3228 case AMDGPU::SI_ELSE:
3229 case AMDGPU::SI_KILL_I1_TERMINATOR:
3230 case AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR:
3231 // FIXME: It's messy that these need to be considered here at all.
3232 return true;
3233 default:
3234 llvm_unreachable("unexpected non-branch terminator inst");
3235 }
3236
3237 ++I;
3238 }
3239
3240 if (I == E)
3241 return false;
3242
3243 return analyzeBranchImpl(MBB, I, TBB, FBB, Cond, AllowModify);
3244}
3245
3246unsigned SIInstrInfo::removeBranch(MachineBasicBlock &MBB,
3247 int *BytesRemoved) const {
3248 unsigned Count = 0;
3249 unsigned RemovedSize = 0;
3250 for (MachineInstr &MI : llvm::make_early_inc_range(Range: MBB.terminators())) {
3251 // Skip over artificial terminators when removing instructions.
3252 if (MI.isBranch() || MI.isReturn()) {
3253 RemovedSize += getInstSizeInBytes(MI);
3254 MI.eraseFromParent();
3255 ++Count;
3256 }
3257 }
3258
3259 if (BytesRemoved)
3260 *BytesRemoved = RemovedSize;
3261
3262 return Count;
3263}
3264
3265// Copy the flags onto the implicit condition register operand.
3266static void preserveCondRegFlags(MachineOperand &CondReg,
3267 const MachineOperand &OrigCond) {
3268 CondReg.setIsUndef(OrigCond.isUndef());
3269 CondReg.setIsKill(OrigCond.isKill());
3270}
3271
3272unsigned SIInstrInfo::insertBranch(MachineBasicBlock &MBB,
3273 MachineBasicBlock *TBB,
3274 MachineBasicBlock *FBB,
3275 ArrayRef<MachineOperand> Cond,
3276 const DebugLoc &DL,
3277 int *BytesAdded) const {
3278 if (!FBB && Cond.empty()) {
3279 BuildMI(BB: &MBB, MIMD: DL, MCID: get(Opcode: AMDGPU::S_BRANCH))
3280 .addMBB(MBB: TBB);
3281 if (BytesAdded)
3282 *BytesAdded = ST.hasOffset3fBug() ? 8 : 4;
3283 return 1;
3284 }
3285
3286 assert(TBB && Cond[0].isImm());
3287
3288 unsigned Opcode
3289 = getBranchOpcode(Cond: static_cast<BranchPredicate>(Cond[0].getImm()));
3290
3291 if (!FBB) {
3292 MachineInstr *CondBr =
3293 BuildMI(BB: &MBB, MIMD: DL, MCID: get(Opcode))
3294 .addMBB(MBB: TBB);
3295
3296 // Copy the flags onto the implicit condition register operand.
3297 preserveCondRegFlags(CondReg&: CondBr->getOperand(i: 1), OrigCond: Cond[1]);
3298 fixImplicitOperands(MI&: *CondBr);
3299
3300 if (BytesAdded)
3301 *BytesAdded = ST.hasOffset3fBug() ? 8 : 4;
3302 return 1;
3303 }
3304
3305 assert(TBB && FBB);
3306
3307 MachineInstr *CondBr =
3308 BuildMI(BB: &MBB, MIMD: DL, MCID: get(Opcode))
3309 .addMBB(MBB: TBB);
3310 fixImplicitOperands(MI&: *CondBr);
3311 BuildMI(BB: &MBB, MIMD: DL, MCID: get(Opcode: AMDGPU::S_BRANCH))
3312 .addMBB(MBB: FBB);
3313
3314 MachineOperand &CondReg = CondBr->getOperand(i: 1);
3315 CondReg.setIsUndef(Cond[1].isUndef());
3316 CondReg.setIsKill(Cond[1].isKill());
3317
3318 if (BytesAdded)
3319 *BytesAdded = ST.hasOffset3fBug() ? 16 : 8;
3320
3321 return 2;
3322}
3323
3324bool SIInstrInfo::reverseBranchCondition(
3325 SmallVectorImpl<MachineOperand> &Cond) const {
3326 if (Cond.size() != 2) {
3327 return true;
3328 }
3329
3330 if (Cond[0].isImm()) {
3331 Cond[0].setImm(-Cond[0].getImm());
3332 return false;
3333 }
3334
3335 return true;
3336}
3337
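// Report whether a select on the given branch condition can be lowered for
// these registers, and estimate its cost (CondCycles/TrueCycles/FalseCycles)
// in instructions.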
3338bool SIInstrInfo::canInsertSelect(const MachineBasicBlock &MBB,
3339 ArrayRef<MachineOperand> Cond,
3340 Register DstReg, Register TrueReg,
3341 Register FalseReg, int &CondCycles,
3342 int &TrueCycles, int &FalseCycles) const {
3343 switch (Cond[0].getImm()) {
3344 case VCCNZ:
3345 case VCCZ: {
3346 const MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
3347 const TargetRegisterClass *RC = MRI.getRegClass(Reg: TrueReg);
3348 if (MRI.getRegClass(Reg: FalseReg) != RC)
3349 return false;
3350
3351 int NumInsts = AMDGPU::getRegBitWidth(RC: *RC) / 32;
3352 CondCycles = TrueCycles = FalseCycles = NumInsts; // ???
3353
3354 // Limit to equal cost for branch vs. N v_cndmask_b32s.
3355 return RI.hasVGPRs(RC) && NumInsts <= 6;
3356 }
3357 case SCC_TRUE:
3358 case SCC_FALSE: {
3359 // FIXME: We could insert for VGPRs if we could replace the original compare
3360 // with a vector one.
3361 const MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
3362 const TargetRegisterClass *RC = MRI.getRegClass(Reg: TrueReg);
3363 if (MRI.getRegClass(Reg: FalseReg) != RC)
3364 return false;
3365
3366 int NumInsts = AMDGPU::getRegBitWidth(RC: *RC) / 32;
3367
    // Sizes that are a multiple of 64 bits can use s_cselect_b64.
3369 if (NumInsts % 2 == 0)
3370 NumInsts /= 2;
3371
3372 CondCycles = TrueCycles = FalseCycles = NumInsts; // ???
3373 return RI.isSGPRClass(RC);
3374 }
3375 default:
3376 return false;
3377 }
3378}
3379
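// Emit the select described by Cond into DstReg: a single s_cselect/v_cndmask
// for the 32-bit (and 64-bit SCC) cases, otherwise a REG_SEQUENCE of
// per-element selects.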
3380void SIInstrInfo::insertSelect(MachineBasicBlock &MBB,
3381 MachineBasicBlock::iterator I, const DebugLoc &DL,
3382 Register DstReg, ArrayRef<MachineOperand> Cond,
3383 Register TrueReg, Register FalseReg) const {
3384 BranchPredicate Pred = static_cast<BranchPredicate>(Cond[0].getImm());
3385 if (Pred == VCCZ || Pred == SCC_FALSE) {
3386 Pred = static_cast<BranchPredicate>(-Pred);
3387 std::swap(a&: TrueReg, b&: FalseReg);
3388 }
3389
3390 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
3391 const TargetRegisterClass *DstRC = MRI.getRegClass(Reg: DstReg);
3392 unsigned DstSize = RI.getRegSizeInBits(RC: *DstRC);
3393
3394 if (DstSize == 32) {
3395 MachineInstr *Select;
3396 if (Pred == SCC_TRUE) {
3397 Select = BuildMI(BB&: MBB, I, MIMD: DL, MCID: get(Opcode: AMDGPU::S_CSELECT_B32), DestReg: DstReg)
3398 .addReg(RegNo: TrueReg)
3399 .addReg(RegNo: FalseReg);
3400 } else {
3401 // Instruction's operands are backwards from what is expected.
3402 Select = BuildMI(BB&: MBB, I, MIMD: DL, MCID: get(Opcode: AMDGPU::V_CNDMASK_B32_e32), DestReg: DstReg)
3403 .addReg(RegNo: FalseReg)
3404 .addReg(RegNo: TrueReg);
3405 }
3406
3407 preserveCondRegFlags(CondReg&: Select->getOperand(i: 3), OrigCond: Cond[1]);
3408 return;
3409 }
3410
3411 if (DstSize == 64 && Pred == SCC_TRUE) {
3412 MachineInstr *Select =
3413 BuildMI(BB&: MBB, I, MIMD: DL, MCID: get(Opcode: AMDGPU::S_CSELECT_B64), DestReg: DstReg)
3414 .addReg(RegNo: TrueReg)
3415 .addReg(RegNo: FalseReg);
3416
3417 preserveCondRegFlags(CondReg&: Select->getOperand(i: 3), OrigCond: Cond[1]);
3418 return;
3419 }
3420
3421 static const int16_t Sub0_15[] = {
3422 AMDGPU::sub0, AMDGPU::sub1, AMDGPU::sub2, AMDGPU::sub3,
3423 AMDGPU::sub4, AMDGPU::sub5, AMDGPU::sub6, AMDGPU::sub7,
3424 AMDGPU::sub8, AMDGPU::sub9, AMDGPU::sub10, AMDGPU::sub11,
3425 AMDGPU::sub12, AMDGPU::sub13, AMDGPU::sub14, AMDGPU::sub15,
3426 };
3427
3428 static const int16_t Sub0_15_64[] = {
3429 AMDGPU::sub0_sub1, AMDGPU::sub2_sub3,
3430 AMDGPU::sub4_sub5, AMDGPU::sub6_sub7,
3431 AMDGPU::sub8_sub9, AMDGPU::sub10_sub11,
3432 AMDGPU::sub12_sub13, AMDGPU::sub14_sub15,
3433 };
3434
3435 unsigned SelOp = AMDGPU::V_CNDMASK_B32_e32;
3436 const TargetRegisterClass *EltRC = &AMDGPU::VGPR_32RegClass;
3437 const int16_t *SubIndices = Sub0_15;
3438 int NElts = DstSize / 32;
3439
3440 // 64-bit select is only available for SALU.
3441 // TODO: Split 96-bit into 64-bit and 32-bit, not 3x 32-bit.
3442 if (Pred == SCC_TRUE) {
3443 if (NElts % 2) {
3444 SelOp = AMDGPU::S_CSELECT_B32;
3445 EltRC = &AMDGPU::SGPR_32RegClass;
3446 } else {
3447 SelOp = AMDGPU::S_CSELECT_B64;
3448 EltRC = &AMDGPU::SGPR_64RegClass;
3449 SubIndices = Sub0_15_64;
3450 NElts /= 2;
3451 }
3452 }
3453
3454 MachineInstrBuilder MIB = BuildMI(
3455 BB&: MBB, I, MIMD: DL, MCID: get(Opcode: AMDGPU::REG_SEQUENCE), DestReg: DstReg);
3456
3457 I = MIB->getIterator();
3458
3459 SmallVector<Register, 8> Regs;
3460 for (int Idx = 0; Idx != NElts; ++Idx) {
3461 Register DstElt = MRI.createVirtualRegister(RegClass: EltRC);
3462 Regs.push_back(Elt: DstElt);
3463
3464 unsigned SubIdx = SubIndices[Idx];
3465
3466 MachineInstr *Select;
3467 if (SelOp == AMDGPU::V_CNDMASK_B32_e32) {
3468 Select = BuildMI(BB&: MBB, I, MIMD: DL, MCID: get(Opcode: SelOp), DestReg: DstElt)
3469 .addReg(RegNo: FalseReg, Flags: {}, SubReg: SubIdx)
3470 .addReg(RegNo: TrueReg, Flags: {}, SubReg: SubIdx);
3471 } else {
3472 Select = BuildMI(BB&: MBB, I, MIMD: DL, MCID: get(Opcode: SelOp), DestReg: DstElt)
3473 .addReg(RegNo: TrueReg, Flags: {}, SubReg: SubIdx)
3474 .addReg(RegNo: FalseReg, Flags: {}, SubReg: SubIdx);
3475 }
3476
3477 preserveCondRegFlags(CondReg&: Select->getOperand(i: 3), OrigCond: Cond[1]);
3478 fixImplicitOperands(MI&: *Select);
3479
3480 MIB.addReg(RegNo: DstElt)
3481 .addImm(Val: SubIdx);
3482 }
3483}
3484
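// Return true if MI is a simple move (of a register or an immediate) whose
// source can be folded into its users.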
3485bool SIInstrInfo::isFoldableCopy(const MachineInstr &MI) {
3486 switch (MI.getOpcode()) {
3487 case AMDGPU::V_MOV_B16_t16_e32:
3488 case AMDGPU::V_MOV_B16_t16_e64:
3489 case AMDGPU::V_MOV_B32_e32:
3490 case AMDGPU::V_MOV_B32_e64:
3491 case AMDGPU::V_MOV_B64_PSEUDO:
3492 case AMDGPU::V_MOV_B64_e32:
3493 case AMDGPU::V_MOV_B64_e64:
3494 case AMDGPU::S_MOV_B32:
3495 case AMDGPU::S_MOV_B64:
3496 case AMDGPU::S_MOV_B64_IMM_PSEUDO:
3497 case AMDGPU::COPY:
3498 case AMDGPU::WWM_COPY:
3499 case AMDGPU::V_ACCVGPR_WRITE_B32_e64:
3500 case AMDGPU::V_ACCVGPR_READ_B32_e64:
3501 case AMDGPU::V_ACCVGPR_MOV_B32:
3502 case AMDGPU::AV_MOV_B32_IMM_PSEUDO:
3503 case AMDGPU::AV_MOV_B64_IMM_PSEUDO:
3504 return true;
3505 default:
3506 return false;
3507 }
3508}
3509
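// Return the operand index of the source value of a foldable copy, as
// classified by isFoldableCopy.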
3510unsigned SIInstrInfo::getFoldableCopySrcIdx(const MachineInstr &MI) {
3511 switch (MI.getOpcode()) {
3512 case AMDGPU::V_MOV_B16_t16_e32:
3513 case AMDGPU::V_MOV_B16_t16_e64:
3514 return 2;
3515 case AMDGPU::V_MOV_B32_e32:
3516 case AMDGPU::V_MOV_B32_e64:
3517 case AMDGPU::V_MOV_B64_PSEUDO:
3518 case AMDGPU::V_MOV_B64_e32:
3519 case AMDGPU::V_MOV_B64_e64:
3520 case AMDGPU::S_MOV_B32:
3521 case AMDGPU::S_MOV_B64:
3522 case AMDGPU::S_MOV_B64_IMM_PSEUDO:
3523 case AMDGPU::COPY:
3524 case AMDGPU::WWM_COPY:
3525 case AMDGPU::V_ACCVGPR_WRITE_B32_e64:
3526 case AMDGPU::V_ACCVGPR_READ_B32_e64:
3527 case AMDGPU::V_ACCVGPR_MOV_B32:
3528 case AMDGPU::AV_MOV_B32_IMM_PSEUDO:
3529 case AMDGPU::AV_MOV_B64_IMM_PSEUDO:
3530 return 1;
3531 default:
3532 llvm_unreachable("MI is not a foldable copy");
3533 }
3534}
3535
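// Modifier operand names removed by removeModOperands when an instruction is
// rewritten to an encoding that does not support them.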
3536static constexpr AMDGPU::OpName ModifierOpNames[] = {
3537 AMDGPU::OpName::src0_modifiers, AMDGPU::OpName::src1_modifiers,
3538 AMDGPU::OpName::src2_modifiers, AMDGPU::OpName::clamp,
3539 AMDGPU::OpName::omod, AMDGPU::OpName::op_sel};
3540
3541void SIInstrInfo::removeModOperands(MachineInstr &MI) const {
3542 unsigned Opc = MI.getOpcode();
3543 for (AMDGPU::OpName Name : reverse(C: ModifierOpNames)) {
3544 int Idx = AMDGPU::getNamedOperandIdx(Opcode: Opc, Name);
3545 if (Idx >= 0)
3546 MI.removeOperand(OpNo: Idx);
3547 }
3548}
3549
3550void SIInstrInfo::mutateAndCleanupImplicit(MachineInstr &MI,
3551 const MCInstrDesc &NewDesc) const {
3552 MI.setDesc(NewDesc);
3553
3554 // Remove any leftover implicit operands from mutating the instruction. e.g.
3555 // if we replace an s_and_b32 with a copy, we don't need the implicit scc def
3556 // anymore.
3557 const MCInstrDesc &Desc = MI.getDesc();
3558 unsigned NumOps = Desc.getNumOperands() + Desc.implicit_uses().size() +
3559 Desc.implicit_defs().size();
3560
3561 for (unsigned I = MI.getNumOperands() - 1; I >= NumOps; --I)
3562 MI.removeOperand(OpNo: I);
3563}
3564
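// Interpret a 64-bit immediate through a subregister index: return Imm itself
// for NoSubRegister, the sign-extended slice the index selects otherwise, or
// std::nullopt for unhandled indices.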
3565std::optional<int64_t> SIInstrInfo::extractSubregFromImm(int64_t Imm,
3566 unsigned SubRegIndex) {
3567 switch (SubRegIndex) {
3568 case AMDGPU::NoSubRegister:
3569 return Imm;
3570 case AMDGPU::sub0:
3571 return SignExtend64<32>(x: Imm);
3572 case AMDGPU::sub1:
3573 return SignExtend64<32>(x: Imm >> 32);
3574 case AMDGPU::lo16:
3575 return SignExtend64<16>(x: Imm);
3576 case AMDGPU::hi16:
3577 return SignExtend64<16>(x: Imm >> 16);
3578 case AMDGPU::sub1_lo16:
3579 return SignExtend64<16>(x: Imm >> 32);
3580 case AMDGPU::sub1_hi16:
3581 return SignExtend64<16>(x: Imm >> 48);
3582 default:
3583 return std::nullopt;
3584 }
3585
3586 llvm_unreachable("covered subregister switch");
3587}
3588
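// Map a MAC/MAD/FMA(C) opcode to the corresponding madak/fmaak form, where
// the constant takes the place of src2.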
3589static unsigned getNewFMAAKInst(const GCNSubtarget &ST, unsigned Opc) {
3590 switch (Opc) {
3591 case AMDGPU::V_MAC_F16_e32:
3592 case AMDGPU::V_MAC_F16_e64:
3593 case AMDGPU::V_MAD_F16_e64:
3594 return AMDGPU::V_MADAK_F16;
3595 case AMDGPU::V_MAC_F32_e32:
3596 case AMDGPU::V_MAC_F32_e64:
3597 case AMDGPU::V_MAD_F32_e64:
3598 return AMDGPU::V_MADAK_F32;
3599 case AMDGPU::V_FMAC_F32_e32:
3600 case AMDGPU::V_FMAC_F32_e64:
3601 case AMDGPU::V_FMA_F32_e64:
3602 return AMDGPU::V_FMAAK_F32;
3603 case AMDGPU::V_FMAC_F16_e32:
3604 case AMDGPU::V_FMAC_F16_e64:
3605 case AMDGPU::V_FMAC_F16_t16_e64:
3606 case AMDGPU::V_FMAC_F16_fake16_e64:
3607 case AMDGPU::V_FMAC_F16_t16_e32:
3608 case AMDGPU::V_FMAC_F16_fake16_e32:
3609 case AMDGPU::V_FMA_F16_e64:
3610 return ST.hasTrue16BitInsts() ? ST.useRealTrue16Insts()
3611 ? AMDGPU::V_FMAAK_F16_t16
3612 : AMDGPU::V_FMAAK_F16_fake16
3613 : AMDGPU::V_FMAAK_F16;
3614 case AMDGPU::V_FMAC_F64_e32:
3615 case AMDGPU::V_FMAC_F64_e64:
3616 case AMDGPU::V_FMA_F64_e64:
3617 return AMDGPU::V_FMAAK_F64;
3618 default:
3619 llvm_unreachable("invalid instruction");
3620 }
3621}
3622
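// Map a MAC/MAD/FMA(C) opcode to the corresponding madmk/fmamk form, where
// the constant takes the place of src1.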
3623static unsigned getNewFMAMKInst(const GCNSubtarget &ST, unsigned Opc) {
3624 switch (Opc) {
3625 case AMDGPU::V_MAC_F16_e32:
3626 case AMDGPU::V_MAC_F16_e64:
3627 case AMDGPU::V_MAD_F16_e64:
3628 return AMDGPU::V_MADMK_F16;
3629 case AMDGPU::V_MAC_F32_e32:
3630 case AMDGPU::V_MAC_F32_e64:
3631 case AMDGPU::V_MAD_F32_e64:
3632 return AMDGPU::V_MADMK_F32;
3633 case AMDGPU::V_FMAC_F32_e32:
3634 case AMDGPU::V_FMAC_F32_e64:
3635 case AMDGPU::V_FMA_F32_e64:
3636 return AMDGPU::V_FMAMK_F32;
3637 case AMDGPU::V_FMAC_F16_e32:
3638 case AMDGPU::V_FMAC_F16_e64:
3639 case AMDGPU::V_FMAC_F16_t16_e64:
3640 case AMDGPU::V_FMAC_F16_fake16_e64:
3641 case AMDGPU::V_FMAC_F16_t16_e32:
3642 case AMDGPU::V_FMAC_F16_fake16_e32:
3643 case AMDGPU::V_FMA_F16_e64:
3644 return ST.hasTrue16BitInsts() ? ST.useRealTrue16Insts()
3645 ? AMDGPU::V_FMAMK_F16_t16
3646 : AMDGPU::V_FMAMK_F16_fake16
3647 : AMDGPU::V_FMAMK_F16;
3648 case AMDGPU::V_FMAC_F64_e32:
3649 case AMDGPU::V_FMAC_F64_e64:
3650 case AMDGPU::V_FMA_F64_e64:
3651 return AMDGPU::V_FMAMK_F64;
3652 default:
3653 llvm_unreachable("invalid instruction");
3654 }
3655}
3656
3657bool SIInstrInfo::foldImmediate(MachineInstr &UseMI, MachineInstr &DefMI,
3658 Register Reg, MachineRegisterInfo *MRI) const {
3659 int64_t Imm;
3660 if (!getConstValDefinedInReg(MI: DefMI, Reg, ImmVal&: Imm))
3661 return false;
3662
3663 const bool HasMultipleUses = !MRI->hasOneNonDBGUse(RegNo: Reg);
3664
3665 assert(!DefMI.getOperand(0).getSubReg() && "Expected SSA form");
3666
3667 unsigned Opc = UseMI.getOpcode();
3668 if (Opc == AMDGPU::COPY) {
3669 assert(!UseMI.getOperand(0).getSubReg() && "Expected SSA form");
3670
3671 Register DstReg = UseMI.getOperand(i: 0).getReg();
3672 Register UseSubReg = UseMI.getOperand(i: 1).getSubReg();
3673
3674 const TargetRegisterClass *DstRC = RI.getRegClassForReg(MRI: *MRI, Reg: DstReg);
3675
3676 if (HasMultipleUses) {
      // TODO: This should fold in more cases with multiple uses, but we need
      // to consider more carefully what those uses are.
3679 unsigned ImmDefSize = RI.getRegSizeInBits(RC: *MRI->getRegClass(Reg));
3680
3681 // Avoid breaking up a 64-bit inline immediate into a subregister extract.
3682 if (UseSubReg != AMDGPU::NoSubRegister && ImmDefSize == 64)
3683 return false;
3684
3685 // Most of the time folding a 32-bit inline constant is free (though this
3686 // might not be true if we can't later fold it into a real user).
3687 //
3688 // FIXME: This isInlineConstant check is imprecise if
3689 // getConstValDefinedInReg handled the tricky non-mov cases.
3690 if (ImmDefSize == 32 &&
3691 !isInlineConstant(ImmVal: Imm, OperandType: AMDGPU::OPERAND_REG_IMM_INT32))
3692 return false;
3693 }
3694
3695 bool Is16Bit = UseSubReg != AMDGPU::NoSubRegister &&
3696 RI.getSubRegIdxSize(Idx: UseSubReg) == 16;
3697
3698 if (Is16Bit) {
3699 if (RI.hasVGPRs(RC: DstRC))
3700 return false; // Do not clobber vgpr_hi16
3701
3702 if (DstReg.isVirtual() && UseSubReg != AMDGPU::lo16)
3703 return false;
3704 }
3705
3706 MachineFunction *MF = UseMI.getMF();
3707
3708 unsigned NewOpc = AMDGPU::INSTRUCTION_LIST_END;
3709 MCRegister MovDstPhysReg =
3710 DstReg.isPhysical() ? DstReg.asMCReg() : MCRegister();
3711
3712 std::optional<int64_t> SubRegImm = extractSubregFromImm(Imm, SubRegIndex: UseSubReg);
3713
3714 // TODO: Try to fold with AMDGPU::V_MOV_B16_t16_e64
3715 for (unsigned MovOp :
3716 {AMDGPU::S_MOV_B32, AMDGPU::V_MOV_B32_e32, AMDGPU::S_MOV_B64,
3717 AMDGPU::V_MOV_B64_PSEUDO, AMDGPU::V_ACCVGPR_WRITE_B32_e64}) {
3718 const MCInstrDesc &MovDesc = get(Opcode: MovOp);
3719
3720 const TargetRegisterClass *MovDstRC = getRegClass(MCID: MovDesc, OpNum: 0);
3721 if (Is16Bit) {
3722 // We just need to find a correctly sized register class, so the
3723 // subregister index compatibility doesn't matter since we're statically
3724 // extracting the immediate value.
3725 MovDstRC = RI.getMatchingSuperRegClass(A: MovDstRC, B: DstRC, Idx: AMDGPU::lo16);
3726 if (!MovDstRC)
3727 continue;
3728
3729 if (MovDstPhysReg) {
3730 // FIXME: We probably should not do this. If there is a live value in
3731 // the high half of the register, it will be corrupted.
3732 MovDstPhysReg =
3733 RI.getMatchingSuperReg(Reg: MovDstPhysReg, SubIdx: AMDGPU::lo16, RC: MovDstRC);
3734 if (!MovDstPhysReg)
3735 continue;
3736 }
3737 }
3738
3739 // Result class isn't the right size, try the next instruction.
3740 if (MovDstPhysReg) {
3741 if (!MovDstRC->contains(Reg: MovDstPhysReg))
3742 return false;
3743 } else if (!MRI->constrainRegClass(Reg: DstReg, RC: MovDstRC)) {
3744 // TODO: This will be overly conservative in the case of 16-bit virtual
3745 // SGPRs. We could hack up the virtual register uses to use a compatible
3746 // 32-bit class.
3747 continue;
3748 }
3749
3750 const MCOperandInfo &OpInfo = MovDesc.operands()[1];
3751
3752 // Ensure the interpreted immediate value is a valid operand in the new
3753 // mov.
3754 //
3755 // FIXME: isImmOperandLegal should have form that doesn't require existing
3756 // MachineInstr or MachineOperand
3757 if (!RI.opCanUseLiteralConstant(OpType: OpInfo.OperandType) &&
3758 !isInlineConstant(ImmVal: *SubRegImm, OperandType: OpInfo.OperandType))
3759 break;
3760
3761 NewOpc = MovOp;
3762 break;
3763 }
3764
3765 if (NewOpc == AMDGPU::INSTRUCTION_LIST_END)
3766 return false;
3767
3768 if (Is16Bit) {
3769 UseMI.getOperand(i: 0).setSubReg(AMDGPU::NoSubRegister);
3770 if (MovDstPhysReg)
3771 UseMI.getOperand(i: 0).setReg(MovDstPhysReg);
3772 assert(UseMI.getOperand(1).getReg().isVirtual());
3773 }
3774
3775 const MCInstrDesc &NewMCID = get(Opcode: NewOpc);
3776 UseMI.setDesc(NewMCID);
3777 UseMI.getOperand(i: 1).ChangeToImmediate(ImmVal: *SubRegImm);
3778 UseMI.addImplicitDefUseOperands(MF&: *MF);
3779 return true;
3780 }
3781
3782 if (HasMultipleUses)
3783 return false;
3784
3785 if (Opc == AMDGPU::V_MAD_F32_e64 || Opc == AMDGPU::V_MAC_F32_e64 ||
3786 Opc == AMDGPU::V_MAD_F16_e64 || Opc == AMDGPU::V_MAC_F16_e64 ||
3787 Opc == AMDGPU::V_FMA_F32_e64 || Opc == AMDGPU::V_FMAC_F32_e64 ||
3788 Opc == AMDGPU::V_FMA_F16_e64 || Opc == AMDGPU::V_FMAC_F16_e64 ||
3789 Opc == AMDGPU::V_FMAC_F16_t16_e64 ||
3790 Opc == AMDGPU::V_FMAC_F16_fake16_e64 || Opc == AMDGPU::V_FMA_F64_e64 ||
3791 Opc == AMDGPU::V_FMAC_F64_e64) {
3792 // Don't fold if we are using source or output modifiers. The new VOP2
3793 // instructions don't have them.
3794 if (hasAnyModifiersSet(MI: UseMI))
3795 return false;
3796
3797 // If this is a free constant, there's no reason to do this.
3798 // TODO: We could fold this here instead of letting SIFoldOperands do it
3799 // later.
3800 int Src0Idx = getNamedOperandIdx(Opcode: UseMI.getOpcode(), Name: AMDGPU::OpName::src0);
3801
3802 // Any src operand can be used for the legality check.
3803 if (isInlineConstant(MI: UseMI, OpIdx: Src0Idx, ImmVal: Imm))
3804 return false;
3805
3806 MachineOperand *Src0 = &UseMI.getOperand(i: Src0Idx);
3807
3808 MachineOperand *Src1 = getNamedOperand(MI&: UseMI, OperandName: AMDGPU::OpName::src1);
3809 MachineOperand *Src2 = getNamedOperand(MI&: UseMI, OperandName: AMDGPU::OpName::src2);
3810
3811 auto CopyRegOperandToNarrowerRC =
3812 [MRI, this](MachineInstr &MI, unsigned OpNo,
3813 const TargetRegisterClass *NewRC) -> void {
3814 if (!MI.getOperand(i: OpNo).isReg())
3815 return;
3816 Register Reg = MI.getOperand(i: OpNo).getReg();
3817 const TargetRegisterClass *RC = RI.getRegClassForReg(MRI: *MRI, Reg);
3818 if (RI.getCommonSubClass(A: RC, B: NewRC) != NewRC)
3819 return;
3820 Register Tmp = MRI->createVirtualRegister(RegClass: NewRC);
3821 BuildMI(BB&: *MI.getParent(), I: MI.getIterator(), MIMD: MI.getDebugLoc(),
3822 MCID: get(Opcode: AMDGPU::COPY), DestReg: Tmp)
3823 .addReg(RegNo: Reg);
3824 MI.getOperand(i: OpNo).setReg(Tmp);
3825 MI.getOperand(i: OpNo).setIsKill();
3826 };
3827
3828 // Multiplied part is the constant: Use v_madmk_{f16, f32}.
3829 if ((Src0->isReg() && Src0->getReg() == Reg) ||
3830 (Src1->isReg() && Src1->getReg() == Reg)) {
3831 MachineOperand *RegSrc =
3832 Src1->isReg() && Src1->getReg() == Reg ? Src0 : Src1;
3833 if (!RegSrc->isReg())
3834 return false;
3835 if (RI.isSGPRClass(RC: MRI->getRegClass(Reg: RegSrc->getReg())) &&
3836 ST.getConstantBusLimit(Opcode: Opc) < 2)
3837 return false;
3838
3839 if (!Src2->isReg() || RI.isSGPRClass(RC: MRI->getRegClass(Reg: Src2->getReg())))
3840 return false;
3841
3842 // If src2 is also a literal constant then we have to choose which one to
3843 // fold. In general it is better to choose madak so that the other literal
3844 // can be materialized in an sgpr instead of a vgpr:
3845 // s_mov_b32 s0, literal
3846 // v_madak_f32 v0, s0, v0, literal
3847 // Instead of:
3848 // v_mov_b32 v1, literal
3849 // v_madmk_f32 v0, v0, literal, v1
3850 MachineInstr *Def = MRI->getUniqueVRegDef(Reg: Src2->getReg());
3851 if (Def && Def->isMoveImmediate() &&
3852 !isInlineConstant(MO: Def->getOperand(i: 1)))
3853 return false;
3854
3855 unsigned NewOpc = getNewFMAMKInst(ST, Opc);
3856 if (pseudoToMCOpcode(Opcode: NewOpc) == -1)
3857 return false;
3858
3859 const std::optional<int64_t> SubRegImm = extractSubregFromImm(
3860 Imm, SubRegIndex: RegSrc == Src1 ? Src0->getSubReg() : Src1->getSubReg());
3861
3862 // FIXME: This would be a lot easier if we could return a new instruction
3863 // instead of having to modify in place.
3864
3865 Register SrcReg = RegSrc->getReg();
3866 unsigned SrcSubReg = RegSrc->getSubReg();
3867 Src0->setReg(SrcReg);
3868 Src0->setSubReg(SrcSubReg);
3869 Src0->setIsKill(RegSrc->isKill());
3870
3871 if (Opc == AMDGPU::V_MAC_F32_e64 || Opc == AMDGPU::V_MAC_F16_e64 ||
3872 Opc == AMDGPU::V_FMAC_F32_e64 || Opc == AMDGPU::V_FMAC_F16_t16_e64 ||
3873 Opc == AMDGPU::V_FMAC_F16_fake16_e64 ||
3874 Opc == AMDGPU::V_FMAC_F16_e64 || Opc == AMDGPU::V_FMAC_F64_e64)
3875 UseMI.untieRegOperand(
3876 OpIdx: AMDGPU::getNamedOperandIdx(Opcode: Opc, Name: AMDGPU::OpName::src2));
3877
3878 Src1->ChangeToImmediate(ImmVal: *SubRegImm);
3879
3880 removeModOperands(MI&: UseMI);
3881 UseMI.setDesc(get(Opcode: NewOpc));
3882
3883 if (NewOpc == AMDGPU::V_FMAMK_F16_t16 ||
3884 NewOpc == AMDGPU::V_FMAMK_F16_fake16) {
3885 const TargetRegisterClass *NewRC = getRegClass(MCID: get(Opcode: NewOpc), OpNum: 0);
3886 Register Tmp = MRI->createVirtualRegister(RegClass: NewRC);
3887 BuildMI(BB&: *UseMI.getParent(), I: std::next(x: UseMI.getIterator()),
3888 MIMD: UseMI.getDebugLoc(), MCID: get(Opcode: AMDGPU::COPY),
3889 DestReg: UseMI.getOperand(i: 0).getReg())
3890 .addReg(RegNo: Tmp, Flags: RegState::Kill);
3891 UseMI.getOperand(i: 0).setReg(Tmp);
3892 CopyRegOperandToNarrowerRC(UseMI, 1, NewRC);
3893 CopyRegOperandToNarrowerRC(UseMI, 3, NewRC);
3894 }
3895
3896 bool DeleteDef = MRI->use_nodbg_empty(RegNo: Reg);
3897 if (DeleteDef)
3898 DefMI.eraseFromParent();
3899
3900 return true;
3901 }
3902
3903 // Added part is the constant: Use v_madak_{f16, f32}.
3904 if (Src2->isReg() && Src2->getReg() == Reg) {
3905 if (ST.getConstantBusLimit(Opcode: Opc) < 2) {
3906 // Not allowed to use constant bus for another operand.
3907 // We can however allow an inline immediate as src0.
3908 bool Src0Inlined = false;
3909 if (Src0->isReg()) {
          // Try to inline the constant if possible.
          // If the def is a move-immediate with a single use, we save a VGPR
          // here.
3913 MachineInstr *Def = MRI->getUniqueVRegDef(Reg: Src0->getReg());
3914 if (Def && Def->isMoveImmediate() &&
3915 isInlineConstant(MO: Def->getOperand(i: 1)) &&
3916 MRI->hasOneNonDBGUse(RegNo: Src0->getReg())) {
3917 Src0->ChangeToImmediate(ImmVal: Def->getOperand(i: 1).getImm());
3918 Src0Inlined = true;
3919 } else if (ST.getConstantBusLimit(Opcode: Opc) <= 1 &&
3920 RI.isSGPRReg(MRI: *MRI, Reg: Src0->getReg())) {
3921 return false;
3922 }
3923 // VGPR is okay as Src0 - fallthrough
3924 }
3925
3926 if (Src1->isReg() && !Src0Inlined) {
          // We have one slot for an inlinable constant so far - try to fill it.
3928 MachineInstr *Def = MRI->getUniqueVRegDef(Reg: Src1->getReg());
3929 if (Def && Def->isMoveImmediate() &&
3930 isInlineConstant(MO: Def->getOperand(i: 1)) &&
3931 MRI->hasOneNonDBGUse(RegNo: Src1->getReg()) && commuteInstruction(MI&: UseMI))
3932 Src0->ChangeToImmediate(ImmVal: Def->getOperand(i: 1).getImm());
3933 else if (RI.isSGPRReg(MRI: *MRI, Reg: Src1->getReg()))
3934 return false;
3935 // VGPR is okay as Src1 - fallthrough
3936 }
3937 }
3938
3939 unsigned NewOpc = getNewFMAAKInst(ST, Opc);
3940 if (pseudoToMCOpcode(Opcode: NewOpc) == -1)
3941 return false;
3942
3943 // FIXME: This would be a lot easier if we could return a new instruction
3944 // instead of having to modify in place.
3945
3946 if (Opc == AMDGPU::V_MAC_F32_e64 || Opc == AMDGPU::V_MAC_F16_e64 ||
3947 Opc == AMDGPU::V_FMAC_F32_e64 || Opc == AMDGPU::V_FMAC_F16_t16_e64 ||
3948 Opc == AMDGPU::V_FMAC_F16_fake16_e64 ||
3949 Opc == AMDGPU::V_FMAC_F16_e64 || Opc == AMDGPU::V_FMAC_F64_e64)
3950 UseMI.untieRegOperand(
3951 OpIdx: AMDGPU::getNamedOperandIdx(Opcode: Opc, Name: AMDGPU::OpName::src2));
3952
3953 const std::optional<int64_t> SubRegImm =
3954 extractSubregFromImm(Imm, SubRegIndex: Src2->getSubReg());
3955
      // ChangeToImmediate adds Src2 back to the instruction.
3957 Src2->ChangeToImmediate(ImmVal: *SubRegImm);
3958
3959 // These come before src2.
3960 removeModOperands(MI&: UseMI);
3961 UseMI.setDesc(get(Opcode: NewOpc));
3962
3963 if (NewOpc == AMDGPU::V_FMAAK_F16_t16 ||
3964 NewOpc == AMDGPU::V_FMAAK_F16_fake16) {
3965 const TargetRegisterClass *NewRC = getRegClass(MCID: get(Opcode: NewOpc), OpNum: 0);
3966 Register Tmp = MRI->createVirtualRegister(RegClass: NewRC);
3967 BuildMI(BB&: *UseMI.getParent(), I: std::next(x: UseMI.getIterator()),
3968 MIMD: UseMI.getDebugLoc(), MCID: get(Opcode: AMDGPU::COPY),
3969 DestReg: UseMI.getOperand(i: 0).getReg())
3970 .addReg(RegNo: Tmp, Flags: RegState::Kill);
3971 UseMI.getOperand(i: 0).setReg(Tmp);
3972 CopyRegOperandToNarrowerRC(UseMI, 1, NewRC);
3973 CopyRegOperandToNarrowerRC(UseMI, 2, NewRC);
3974 }
3975
      // It might happen that UseMI was commuted and we now have an SGPR as
      // src1. If so, the combination of an inlined constant and an SGPR is
      // illegal, so legalize the operands.
3979 legalizeOperands(MI&: UseMI);
3980
3981 bool DeleteDef = MRI->use_nodbg_empty(RegNo: Reg);
3982 if (DeleteDef)
3983 DefMI.eraseFromParent();
3984
3985 return true;
3986 }
3987 }
3988
3989 return false;
3990}
3991
3992static bool
3993memOpsHaveSameBaseOperands(ArrayRef<const MachineOperand *> BaseOps1,
3994 ArrayRef<const MachineOperand *> BaseOps2) {
3995 if (BaseOps1.size() != BaseOps2.size())
3996 return false;
3997 for (size_t I = 0, E = BaseOps1.size(); I < E; ++I) {
3998 if (!BaseOps1[I]->isIdenticalTo(Other: *BaseOps2[I]))
3999 return false;
4000 }
4001 return true;
4002}
4003
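// Return true if the access of WidthA at OffsetA and the access of WidthB at
// OffsetB are known not to overlap.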
4004static bool offsetsDoNotOverlap(LocationSize WidthA, int OffsetA,
4005 LocationSize WidthB, int OffsetB) {
4006 int LowOffset = OffsetA < OffsetB ? OffsetA : OffsetB;
4007 int HighOffset = OffsetA < OffsetB ? OffsetB : OffsetA;
4008 LocationSize LowWidth = (LowOffset == OffsetA) ? WidthA : WidthB;
4009 return LowWidth.hasValue() &&
4010 LowOffset + (int)LowWidth.getValue() <= HighOffset;
4011}
4012
4013bool SIInstrInfo::checkInstOffsetsDoNotOverlap(const MachineInstr &MIa,
4014 const MachineInstr &MIb) const {
4015 SmallVector<const MachineOperand *, 4> BaseOps0, BaseOps1;
4016 int64_t Offset0, Offset1;
4017 LocationSize Dummy0 = LocationSize::precise(Value: 0);
4018 LocationSize Dummy1 = LocationSize::precise(Value: 0);
4019 bool Offset0IsScalable, Offset1IsScalable;
4020 if (!getMemOperandsWithOffsetWidth(LdSt: MIa, BaseOps&: BaseOps0, Offset&: Offset0, OffsetIsScalable&: Offset0IsScalable,
4021 Width&: Dummy0, TRI: &RI) ||
4022 !getMemOperandsWithOffsetWidth(LdSt: MIb, BaseOps&: BaseOps1, Offset&: Offset1, OffsetIsScalable&: Offset1IsScalable,
4023 Width&: Dummy1, TRI: &RI))
4024 return false;
4025
4026 if (!memOpsHaveSameBaseOperands(BaseOps1: BaseOps0, BaseOps2: BaseOps1))
4027 return false;
4028
4029 if (!MIa.hasOneMemOperand() || !MIb.hasOneMemOperand()) {
4030 // FIXME: Handle ds_read2 / ds_write2.
4031 return false;
4032 }
4033 LocationSize Width0 = MIa.memoperands().front()->getSize();
4034 LocationSize Width1 = MIb.memoperands().front()->getSize();
4035 return offsetsDoNotOverlap(WidthA: Width0, OffsetA: Offset0, WidthB: Width1, OffsetB: Offset1);
4036}
4037
4038bool SIInstrInfo::areMemAccessesTriviallyDisjoint(const MachineInstr &MIa,
4039 const MachineInstr &MIb) const {
4040 assert(MIa.mayLoadOrStore() &&
4041 "MIa must load from or modify a memory location");
4042 assert(MIb.mayLoadOrStore() &&
4043 "MIb must load from or modify a memory location");
4044
4045 if (MIa.hasUnmodeledSideEffects() || MIb.hasUnmodeledSideEffects())
4046 return false;
4047
4048 // XXX - Can we relax this between address spaces?
4049 if (MIa.hasOrderedMemoryRef() || MIb.hasOrderedMemoryRef())
4050 return false;
4051
4052 if (isLDSDMA(MI: MIa) || isLDSDMA(MI: MIb))
4053 return false;
4054
4055 if (MIa.isBundle() || MIb.isBundle())
4056 return false;
4057
4058 // TODO: Should we check the address space from the MachineMemOperand? That
4059 // would allow us to distinguish objects we know don't alias based on the
4060 // underlying address space, even if it was lowered to a different one,
4061 // e.g. private accesses lowered to use MUBUF instructions on a scratch
4062 // buffer.
4063 if (isDS(MI: MIa)) {
4064 if (isDS(MI: MIb))
4065 return checkInstOffsetsDoNotOverlap(MIa, MIb);
4066
4067 return !isFLAT(MI: MIb) || isSegmentSpecificFLAT(MI: MIb);
4068 }
4069
4070 if (isMUBUF(MI: MIa) || isMTBUF(MI: MIa)) {
4071 if (isMUBUF(MI: MIb) || isMTBUF(MI: MIb))
4072 return checkInstOffsetsDoNotOverlap(MIa, MIb);
4073
4074 if (isFLAT(MI: MIb))
4075 return isFLATScratch(MI: MIb);
4076
4077 return !isSMRD(MI: MIb);
4078 }
4079
4080 if (isSMRD(MI: MIa)) {
4081 if (isSMRD(MI: MIb))
4082 return checkInstOffsetsDoNotOverlap(MIa, MIb);
4083
4084 if (isFLAT(MI: MIb))
4085 return isFLATScratch(MI: MIb);
4086
4087 return !isMUBUF(MI: MIb) && !isMTBUF(MI: MIb);
4088 }
4089
4090 if (isFLAT(MI: MIa)) {
4091 if (isFLAT(MI: MIb)) {
4092 if ((isFLATScratch(MI: MIa) && isFLATGlobal(MI: MIb)) ||
4093 (isFLATGlobal(MI: MIa) && isFLATScratch(MI: MIb)))
4094 return true;
4095
4096 return checkInstOffsetsDoNotOverlap(MIa, MIb);
4097 }
4098
4099 return false;
4100 }
4101
4102 return false;
4103}
4104
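// If Reg is defined by a foldable move of an immediate, return that immediate
// in Imm (and the defining instruction in DefMI, if requested).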
4105static bool getFoldableImm(Register Reg, const MachineRegisterInfo &MRI,
4106 int64_t &Imm, MachineInstr **DefMI = nullptr) {
4107 if (Reg.isPhysical())
4108 return false;
4109 auto *Def = MRI.getUniqueVRegDef(Reg);
4110 if (Def && SIInstrInfo::isFoldableCopy(MI: *Def) && Def->getOperand(i: 1).isImm()) {
4111 Imm = Def->getOperand(i: 1).getImm();
4112 if (DefMI)
4113 *DefMI = Def;
4114 return true;
4115 }
4116 return false;
4117}
4118
4119static bool getFoldableImm(const MachineOperand *MO, int64_t &Imm,
4120 MachineInstr **DefMI = nullptr) {
4121 if (!MO->isReg())
4122 return false;
4123 const MachineFunction *MF = MO->getParent()->getMF();
4124 const MachineRegisterInfo &MRI = MF->getRegInfo();
4125 return getFoldableImm(Reg: MO->getReg(), MRI, Imm, DefMI);
4126}
4127
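// Transfer LiveVariables kill information for MI's killed register uses to
// NewMI.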
4128static void updateLiveVariables(LiveVariables *LV, MachineInstr &MI,
4129 MachineInstr &NewMI) {
4130 if (LV) {
4131 unsigned NumOps = MI.getNumOperands();
4132 for (unsigned I = 1; I < NumOps; ++I) {
4133 MachineOperand &Op = MI.getOperand(i: I);
4134 if (Op.isReg() && Op.isKill())
4135 LV->replaceKillInstruction(Reg: Op.getReg(), OldMI&: MI, NewMI);
4136 }
4137 }
4138}
4139
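// Map a two-address MAC/FMAC opcode to its three-address MAD/FMA equivalent.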
4140static unsigned getNewFMAInst(const GCNSubtarget &ST, unsigned Opc) {
4141 switch (Opc) {
4142 case AMDGPU::V_MAC_F16_e32:
4143 case AMDGPU::V_MAC_F16_e64:
4144 return AMDGPU::V_MAD_F16_e64;
4145 case AMDGPU::V_MAC_F32_e32:
4146 case AMDGPU::V_MAC_F32_e64:
4147 return AMDGPU::V_MAD_F32_e64;
4148 case AMDGPU::V_MAC_LEGACY_F32_e32:
4149 case AMDGPU::V_MAC_LEGACY_F32_e64:
4150 return AMDGPU::V_MAD_LEGACY_F32_e64;
4151 case AMDGPU::V_FMAC_LEGACY_F32_e32:
4152 case AMDGPU::V_FMAC_LEGACY_F32_e64:
4153 return AMDGPU::V_FMA_LEGACY_F32_e64;
4154 case AMDGPU::V_FMAC_F16_e32:
4155 case AMDGPU::V_FMAC_F16_e64:
4156 case AMDGPU::V_FMAC_F16_t16_e64:
4157 case AMDGPU::V_FMAC_F16_fake16_e64:
4158 return ST.hasTrue16BitInsts() ? ST.useRealTrue16Insts()
4159 ? AMDGPU::V_FMA_F16_gfx9_t16_e64
4160 : AMDGPU::V_FMA_F16_gfx9_fake16_e64
4161 : AMDGPU::V_FMA_F16_gfx9_e64;
4162 case AMDGPU::V_FMAC_F32_e32:
4163 case AMDGPU::V_FMAC_F32_e64:
4164 return AMDGPU::V_FMA_F32_e64;
4165 case AMDGPU::V_FMAC_F64_e32:
4166 case AMDGPU::V_FMAC_F64_e64:
4167 return AMDGPU::V_FMA_F64_e64;
4168 default:
4169 llvm_unreachable("invalid instruction");
4170 }
4171}
4172
4173/// Helper struct for the implementation of 3-address conversion to communicate
4174/// updates made to instruction operands.
4175struct SIInstrInfo::ThreeAddressUpdates {
4176 /// Other instruction whose def is no longer used by the converted
4177 /// instruction.
4178 MachineInstr *RemoveMIUse = nullptr;
4179};
4180
4181MachineInstr *SIInstrInfo::convertToThreeAddress(MachineInstr &MI,
4182 LiveVariables *LV,
4183 LiveIntervals *LIS) const {
4184 MachineBasicBlock &MBB = *MI.getParent();
4185 MachineInstr *CandidateMI = &MI;
4186
4187 if (MI.isBundle()) {
4188 // This is a temporary placeholder for bundle handling that enables us to
4189 // exercise the relevant code paths in the two-address instruction pass.
4190 if (MI.getBundleSize() != 1)
4191 return nullptr;
4192 CandidateMI = MI.getNextNode();
4193 }
4194
4195 ThreeAddressUpdates U;
4196 MachineInstr *NewMI = convertToThreeAddressImpl(MI&: *CandidateMI, Updates&: U);
4197 if (!NewMI)
4198 return nullptr;
4199
4200 if (MI.isBundle()) {
4201 CandidateMI->eraseFromBundle();
4202
4203 for (MachineOperand &MO : MI.all_defs()) {
4204 if (MO.isTied())
4205 MI.untieRegOperand(OpIdx: MO.getOperandNo());
4206 }
4207 } else {
4208 updateLiveVariables(LV, MI, NewMI&: *NewMI);
4209 if (LIS) {
4210 LIS->ReplaceMachineInstrInMaps(MI, NewMI&: *NewMI);
      // The SlotIndex of the def needs to be updated when converting to an
      // early-clobber def.
4212 MachineOperand &Def = NewMI->getOperand(i: 0);
4213 if (Def.isEarlyClobber() && Def.isReg() &&
4214 LIS->hasInterval(Reg: Def.getReg())) {
4215 SlotIndex OldIndex = LIS->getInstructionIndex(Instr: *NewMI).getRegSlot(EC: false);
4216 SlotIndex NewIndex = LIS->getInstructionIndex(Instr: *NewMI).getRegSlot(EC: true);
4217 auto &LI = LIS->getInterval(Reg: Def.getReg());
4218 auto UpdateDefIndex = [&](LiveRange &LR) {
4219 auto *S = LR.find(Pos: OldIndex);
4220 if (S != LR.end() && S->start == OldIndex) {
4221 assert(S->valno && S->valno->def == OldIndex);
4222 S->start = NewIndex;
4223 S->valno->def = NewIndex;
4224 }
4225 };
4226 UpdateDefIndex(LI);
4227 for (auto &SR : LI.subranges())
4228 UpdateDefIndex(SR);
4229 }
4230 }
4231 }
4232
4233 if (U.RemoveMIUse) {
4234 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
4235 // The only user is the instruction which will be killed.
4236 Register DefReg = U.RemoveMIUse->getOperand(i: 0).getReg();
4237
4238 if (MRI.hasOneNonDBGUse(RegNo: DefReg)) {
      // We cannot just remove DefMI here; the calling pass would crash.
4240 U.RemoveMIUse->setDesc(get(Opcode: AMDGPU::IMPLICIT_DEF));
4241 U.RemoveMIUse->getOperand(i: 0).setIsDead(true);
4242 for (unsigned I = U.RemoveMIUse->getNumOperands() - 1; I != 0; --I)
4243 U.RemoveMIUse->removeOperand(OpNo: I);
4244 if (LV)
4245 LV->getVarInfo(Reg: DefReg).AliveBlocks.clear();
4246 }
4247
4248 if (MI.isBundle()) {
4249 VirtRegInfo VRI = AnalyzeVirtRegInBundle(MI, Reg: DefReg);
4250 if (!VRI.Reads && !VRI.Writes) {
4251 for (MachineOperand &MO : MI.all_uses()) {
4252 if (MO.isReg() && MO.getReg() == DefReg) {
4253 assert(MO.getSubReg() == 0 &&
4254 "tied sub-registers in bundles currently not supported");
4255 MI.removeOperand(OpNo: MO.getOperandNo());
4256 break;
4257 }
4258 }
4259
4260 if (LIS)
4261 LIS->shrinkToUses(li: &LIS->getInterval(Reg: DefReg));
4262 }
4263 } else if (LIS) {
4264 LiveInterval &DefLI = LIS->getInterval(Reg: DefReg);
4265
4266 // We cannot delete the original instruction here, so hack out the use
4267 // in the original instruction with a dummy register so we can use
4268 // shrinkToUses to deal with any multi-use edge cases. Other targets do
4269 // not have the complexity of deleting a use to consider here.
4270 Register DummyReg = MRI.cloneVirtualRegister(VReg: DefReg);
4271 for (MachineOperand &MIOp : MI.uses()) {
4272 if (MIOp.isReg() && MIOp.getReg() == DefReg) {
4273 MIOp.setIsUndef(true);
4274 MIOp.setReg(DummyReg);
4275 }
4276 }
4277
4278 if (MI.isBundle()) {
4279 VirtRegInfo VRI = AnalyzeVirtRegInBundle(MI, Reg: DefReg);
4280 if (!VRI.Reads && !VRI.Writes) {
4281 for (MachineOperand &MIOp : MI.uses()) {
4282 if (MIOp.isReg() && MIOp.getReg() == DefReg) {
4283 MIOp.setIsUndef(true);
4284 MIOp.setReg(DummyReg);
4285 }
4286 }
4287 }
4288
4289 MI.addOperand(Op: MachineOperand::CreateReg(Reg: DummyReg, isDef: false, isImp: false, isKill: false,
4290 isDead: false, /*isUndef=*/true));
4291 }
4292
4293 LIS->shrinkToUses(li: &DefLI);
4294 }
4295 }
4296
4297 return MI.isBundle() ? &MI : NewMI;
4298}
4299
4300MachineInstr *
4301SIInstrInfo::convertToThreeAddressImpl(MachineInstr &MI,
4302 ThreeAddressUpdates &U) const {
4303 MachineBasicBlock &MBB = *MI.getParent();
4304 unsigned Opc = MI.getOpcode();
4305
4306 // Handle MFMA.
4307 int NewMFMAOpc = AMDGPU::getMFMAEarlyClobberOp(Opcode: Opc);
4308 if (NewMFMAOpc != -1) {
4309 MachineInstrBuilder MIB =
4310 BuildMI(BB&: MBB, I&: MI, MIMD: MI.getDebugLoc(), MCID: get(Opcode: NewMFMAOpc));
4311 for (unsigned I = 0, E = MI.getNumExplicitOperands(); I != E; ++I)
4312 MIB.add(MO: MI.getOperand(i: I));
4313 return MIB;
4314 }
4315
4316 if (SIInstrInfo::isWMMA(MI)) {
4317 unsigned NewOpc = AMDGPU::mapWMMA2AddrTo3AddrOpcode(Opc: MI.getOpcode());
4318 MachineInstrBuilder MIB = BuildMI(BB&: MBB, I&: MI, MIMD: MI.getDebugLoc(), MCID: get(Opcode: NewOpc))
4319 .setMIFlags(MI.getFlags());
4320 for (unsigned I = 0, E = MI.getNumExplicitOperands(); I != E; ++I)
4321 MIB->addOperand(Op: MI.getOperand(i: I));
4322 return MIB;
4323 }
4324
4325 assert(Opc != AMDGPU::V_FMAC_F16_t16_e32 &&
4326 Opc != AMDGPU::V_FMAC_F16_fake16_e32 &&
4327 "V_FMAC_F16_t16/fake16_e32 is not supported and not expected to be "
4328 "present pre-RA");
4329
4330 // Handle MAC/FMAC.
4331 bool IsF64 = Opc == AMDGPU::V_FMAC_F64_e32 || Opc == AMDGPU::V_FMAC_F64_e64;
4332 bool IsLegacy = Opc == AMDGPU::V_MAC_LEGACY_F32_e32 ||
4333 Opc == AMDGPU::V_MAC_LEGACY_F32_e64 ||
4334 Opc == AMDGPU::V_FMAC_LEGACY_F32_e32 ||
4335 Opc == AMDGPU::V_FMAC_LEGACY_F32_e64;
4336 bool Src0Literal = false;
4337
4338 switch (Opc) {
4339 default:
4340 return nullptr;
4341 case AMDGPU::V_MAC_F16_e64:
4342 case AMDGPU::V_FMAC_F16_e64:
4343 case AMDGPU::V_FMAC_F16_t16_e64:
4344 case AMDGPU::V_FMAC_F16_fake16_e64:
4345 case AMDGPU::V_MAC_F32_e64:
4346 case AMDGPU::V_MAC_LEGACY_F32_e64:
4347 case AMDGPU::V_FMAC_F32_e64:
4348 case AMDGPU::V_FMAC_LEGACY_F32_e64:
4349 case AMDGPU::V_FMAC_F64_e64:
4350 break;
4351 case AMDGPU::V_MAC_F16_e32:
4352 case AMDGPU::V_FMAC_F16_e32:
4353 case AMDGPU::V_MAC_F32_e32:
4354 case AMDGPU::V_MAC_LEGACY_F32_e32:
4355 case AMDGPU::V_FMAC_F32_e32:
4356 case AMDGPU::V_FMAC_LEGACY_F32_e32:
4357 case AMDGPU::V_FMAC_F64_e32: {
4358 int Src0Idx = AMDGPU::getNamedOperandIdx(Opcode: MI.getOpcode(),
4359 Name: AMDGPU::OpName::src0);
4360 const MachineOperand *Src0 = &MI.getOperand(i: Src0Idx);
4361 if (!Src0->isReg() && !Src0->isImm())
4362 return nullptr;
4363
4364 if (Src0->isImm() && !isInlineConstant(MI, OpIdx: Src0Idx, MO: *Src0))
4365 Src0Literal = true;
4366
4367 break;
4368 }
4369 }
4370
4371 MachineInstrBuilder MIB;
4372 const MachineOperand *Dst = getNamedOperand(MI, OperandName: AMDGPU::OpName::vdst);
4373 const MachineOperand *Src0 = getNamedOperand(MI, OperandName: AMDGPU::OpName::src0);
4374 const MachineOperand *Src0Mods =
4375 getNamedOperand(MI, OperandName: AMDGPU::OpName::src0_modifiers);
4376 const MachineOperand *Src1 = getNamedOperand(MI, OperandName: AMDGPU::OpName::src1);
4377 const MachineOperand *Src1Mods =
4378 getNamedOperand(MI, OperandName: AMDGPU::OpName::src1_modifiers);
4379 const MachineOperand *Src2 = getNamedOperand(MI, OperandName: AMDGPU::OpName::src2);
4380 const MachineOperand *Src2Mods =
4381 getNamedOperand(MI, OperandName: AMDGPU::OpName::src2_modifiers);
4382 const MachineOperand *Clamp = getNamedOperand(MI, OperandName: AMDGPU::OpName::clamp);
4383 const MachineOperand *Omod = getNamedOperand(MI, OperandName: AMDGPU::OpName::omod);
4384 const MachineOperand *OpSel = getNamedOperand(MI, OperandName: AMDGPU::OpName::op_sel);
4385
4386 if (!Src0Mods && !Src1Mods && !Src2Mods && !Clamp && !Omod && !IsLegacy &&
4387 (!IsF64 || ST.hasFmaakFmamkF64Insts()) &&
4388 // If we have an SGPR input, we will violate the constant bus restriction.
4389 (ST.getConstantBusLimit(Opcode: Opc) > 1 || !Src0->isReg() ||
4390 !RI.isSGPRReg(MRI: MBB.getParent()->getRegInfo(), Reg: Src0->getReg()))) {
4391 MachineInstr *DefMI;
4392
4393 int64_t Imm;
4394 if (!Src0Literal && getFoldableImm(MO: Src2, Imm, DefMI: &DefMI)) {
4395 unsigned NewOpc = getNewFMAAKInst(ST, Opc);
4396 if (pseudoToMCOpcode(Opcode: NewOpc) != -1) {
4397 MIB = BuildMI(BB&: MBB, I&: MI, MIMD: MI.getDebugLoc(), MCID: get(Opcode: NewOpc))
4398 .add(MO: *Dst)
4399 .add(MO: *Src0)
4400 .add(MO: *Src1)
4401 .addImm(Val: Imm)
4402 .setMIFlags(MI.getFlags());
4403 U.RemoveMIUse = DefMI;
4404 return MIB;
4405 }
4406 }
4407 unsigned NewOpc = getNewFMAMKInst(ST, Opc);
4408 if (!Src0Literal && getFoldableImm(MO: Src1, Imm, DefMI: &DefMI)) {
4409 if (pseudoToMCOpcode(Opcode: NewOpc) != -1) {
4410 MIB = BuildMI(BB&: MBB, I&: MI, MIMD: MI.getDebugLoc(), MCID: get(Opcode: NewOpc))
4411 .add(MO: *Dst)
4412 .add(MO: *Src0)
4413 .addImm(Val: Imm)
4414 .add(MO: *Src2)
4415 .setMIFlags(MI.getFlags());
4416 U.RemoveMIUse = DefMI;
4417 return MIB;
4418 }
4419 }
4420 if (Src0Literal || getFoldableImm(MO: Src0, Imm, DefMI: &DefMI)) {
4421 if (Src0Literal) {
4422 Imm = Src0->getImm();
4423 DefMI = nullptr;
4424 }
4425 if (pseudoToMCOpcode(Opcode: NewOpc) != -1 &&
4426 isOperandLegal(
4427 MI, OpIdx: AMDGPU::getNamedOperandIdx(Opcode: NewOpc, Name: AMDGPU::OpName::src0),
4428 MO: Src1)) {
4429 MIB = BuildMI(BB&: MBB, I&: MI, MIMD: MI.getDebugLoc(), MCID: get(Opcode: NewOpc))
4430 .add(MO: *Dst)
4431 .add(MO: *Src1)
4432 .addImm(Val: Imm)
4433 .add(MO: *Src2)
4434 .setMIFlags(MI.getFlags());
4435 U.RemoveMIUse = DefMI;
4436 return MIB;
4437 }
4438 }
4439 }
4440
4441 // VOP2 mac/fmac with a literal operand cannot be converted to VOP3 mad/fma
4442 // if VOP3 does not allow a literal operand.
4443 if (Src0Literal && !ST.hasVOP3Literal())
4444 return nullptr;
4445
4446 unsigned NewOpc = getNewFMAInst(ST, Opc);
4447
4448 if (pseudoToMCOpcode(Opcode: NewOpc) == -1)
4449 return nullptr;
4450
4451 MIB = BuildMI(BB&: MBB, I&: MI, MIMD: MI.getDebugLoc(), MCID: get(Opcode: NewOpc))
4452 .add(MO: *Dst)
4453 .addImm(Val: Src0Mods ? Src0Mods->getImm() : 0)
4454 .add(MO: *Src0)
4455 .addImm(Val: Src1Mods ? Src1Mods->getImm() : 0)
4456 .add(MO: *Src1)
4457 .addImm(Val: Src2Mods ? Src2Mods->getImm() : 0)
4458 .add(MO: *Src2)
4459 .addImm(Val: Clamp ? Clamp->getImm() : 0)
4460 .addImm(Val: Omod ? Omod->getImm() : 0)
4461 .setMIFlags(MI.getFlags());
4462 if (AMDGPU::hasNamedOperand(Opcode: NewOpc, NamedIdx: AMDGPU::OpName::op_sel))
4463 MIB.addImm(Val: OpSel ? OpSel->getImm() : 0);
4464 return MIB;
4465}
4466
4467// It's not generally safe to move VALU instructions across these since it will
4468// start using the register as a base index rather than directly.
4469// XXX - Why isn't hasSideEffects sufficient for these?
4470static bool changesVGPRIndexingMode(const MachineInstr &MI) {
4471 switch (MI.getOpcode()) {
4472 case AMDGPU::S_SET_GPR_IDX_ON:
4473 case AMDGPU::S_SET_GPR_IDX_MODE:
4474 case AMDGPU::S_SET_GPR_IDX_OFF:
4475 return true;
4476 default:
4477 return false;
4478 }
4479}
4480
4481bool SIInstrInfo::isSchedulingBoundary(const MachineInstr &MI,
4482 const MachineBasicBlock *MBB,
4483 const MachineFunction &MF) const {
4484 // We skip the check for SP writes that the base implementation performs; it
4485 // was apparently added there due to compile time concerns.
4486 //
4487 // TODO: Do we really want this barrier? It triggers unnecessary hazard nops
4488 // but is probably avoidable.
4489
4490 // Copied from base implementation.
4491 // Terminators and labels can't be scheduled around.
4492 if (MI.isTerminator() || MI.isPosition())
4493 return true;
4494
4495 // INLINEASM_BR can jump to another block
4496 if (MI.getOpcode() == TargetOpcode::INLINEASM_BR)
4497 return true;
4498
4499 if (MI.getOpcode() == AMDGPU::SCHED_BARRIER && MI.getOperand(i: 0).getImm() == 0)
4500 return true;
4501
4502 // Target-independent instructions do not have an implicit-use of EXEC, even
4503 // when they operate on VGPRs. Treating EXEC modifications as scheduling
4504 // boundaries prevents incorrect movements of such instructions.
4505 return MI.modifiesRegister(Reg: AMDGPU::EXEC, TRI: &RI) ||
4506 MI.getOpcode() == AMDGPU::S_SETREG_IMM32_B32 ||
4507 MI.getOpcode() == AMDGPU::S_SETREG_B32 ||
4508 MI.getOpcode() == AMDGPU::S_SETPRIO ||
4509 MI.getOpcode() == AMDGPU::S_SETPRIO_INC_WG ||
4510 changesVGPRIndexingMode(MI);
4511}
4512
4513bool SIInstrInfo::isAlwaysGDS(uint16_t Opcode) const {
4514 return Opcode == AMDGPU::DS_ORDERED_COUNT ||
4515 Opcode == AMDGPU::DS_ADD_GS_REG_RTN ||
4516 Opcode == AMDGPU::DS_SUB_GS_REG_RTN || isGWS(Opcode);
4517}
4518
4519bool SIInstrInfo::mayAccessScratch(const MachineInstr &MI) const {
4520 // Instructions that access scratch use the FLAT or BUF encodings.
4521 if ((!isFLAT(MI) || isFLATGlobal(MI)) && !isBUF(MI))
4522 return false;
4523
4524 // SCRATCH instructions always access scratch.
4525 if (isFLATScratch(MI))
4526 return true;
4527
4528 // If FLAT_SCRATCH registers are not initialized, we can never access scratch
4529 // via the aperture.
4530 if (MI.getMF()->getFunction().hasFnAttribute(Kind: "amdgpu-no-flat-scratch-init"))
4531 return false;
4532
4533 // If there are no memory operands then conservatively assume the flat
4534 // operation may access scratch.
4535 if (MI.memoperands_empty())
4536 return true;
4537
4538 // See if any memory operand specifies an address space that involves scratch.
4539 return any_of(Range: MI.memoperands(), P: [](const MachineMemOperand *Memop) {
4540 unsigned AS = Memop->getAddrSpace();
4541 if (AS == AMDGPUAS::FLAT_ADDRESS) {
4542 const MDNode *MD = Memop->getAAInfo().NoAliasAddrSpace;
4543 return !MD || !AMDGPU::hasValueInRangeLikeMetadata(
4544 MD: *MD, Val: AMDGPUAS::PRIVATE_ADDRESS);
4545 }
4546 return AS == AMDGPUAS::PRIVATE_ADDRESS;
4547 });
4548}
4549
4550bool SIInstrInfo::mayAccessVMEMThroughFlat(const MachineInstr &MI) const {
4551 assert(isFLAT(MI));
4552
4553 // All flat instructions use the VMEM counter except prefetch.
4554 if (!usesVM_CNT(MI))
4555 return false;
4556
4557 // If there are no memory operands then conservatively assume the flat
4558 // operation may access VMEM.
4559 if (MI.memoperands_empty())
4560 return true;
4561
4562 // See if any memory operand specifies an address space that involves VMEM.
4563 // Flat operations only support FLAT, LOCAL (LDS), or address spaces
4564 // involving VMEM such as GLOBAL, CONSTANT, PRIVATE (SCRATCH), etc. The REGION
4565 // (GDS) address space is not supported by flat operations. Therefore, simply
4566 // return true unless only the LDS address space is found.
4567 for (const MachineMemOperand *Memop : MI.memoperands()) {
4568 unsigned AS = Memop->getAddrSpace();
4569 assert(AS != AMDGPUAS::REGION_ADDRESS);
4570 if (AS != AMDGPUAS::LOCAL_ADDRESS)
4571 return true;
4572 }
4573
4574 return false;
4575}
4576
4577bool SIInstrInfo::mayAccessLDSThroughFlat(const MachineInstr &MI) const {
4578 assert(isFLAT(MI));
4579
4580 // Flat instructions such as SCRATCH and GLOBAL do not use the lgkm counter.
4581 if (!usesLGKM_CNT(MI))
4582 return false;
4583
4584 // If in tgsplit mode then there can be no use of LDS.
4585 if (ST.isTgSplitEnabled())
4586 return false;
4587
4588 // If there are no memory operands then conservatively assume the flat
4589 // operation may access LDS.
4590 if (MI.memoperands_empty())
4591 return true;
4592
4593 // See if any memory operand specifies an address space that involves LDS.
4594 for (const MachineMemOperand *Memop : MI.memoperands()) {
4595 unsigned AS = Memop->getAddrSpace();
4596 if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::FLAT_ADDRESS)
4597 return true;
4598 }
4599
4600 return false;
4601}
4602
4603bool SIInstrInfo::modifiesModeRegister(const MachineInstr &MI) {
4604 // Skip the full operand and register alias search modifiesRegister
4605 // does. There's only a handful of instructions that touch this, it's only an
4606 // implicit def, and MODE doesn't alias any other registers.
4607 return is_contained(Range: MI.getDesc().implicit_defs(), Element: AMDGPU::MODE);
4608}
4609
4610bool SIInstrInfo::hasUnwantedEffectsWhenEXECEmpty(const MachineInstr &MI) const {
4611 unsigned Opcode = MI.getOpcode();
4612
4613 if (MI.mayStore() && isSMRD(MI))
4614 return true; // scalar store or atomic
4615
4616 // This will terminate the function when other lanes may need to continue.
4617 if (MI.isReturn())
4618 return true;
4619
4620 // These instructions cause shader I/O that may cause hardware lockups
4621 // when executed with an empty EXEC mask.
4622 //
4623 // Note: exp with VM = DONE = 0 is automatically skipped by hardware when
4624 // EXEC = 0, but checking for that case here seems not worth it
4625 // given the typical code patterns.
4626 if (Opcode == AMDGPU::S_SENDMSG || Opcode == AMDGPU::S_SENDMSGHALT ||
4627 isEXP(Opcode) || Opcode == AMDGPU::DS_ORDERED_COUNT ||
4628 Opcode == AMDGPU::S_TRAP || Opcode == AMDGPU::S_WAIT_EVENT)
4629 return true;
4630
4631 if (MI.isCall() || MI.isInlineAsm())
4632 return true; // conservative assumption
4633
4634 // Assume that barrier interactions are only intended with active lanes.
4635 if (isBarrier(Opcode))
4636 return true;
4637
4638 // A mode change is a scalar operation that influences vector instructions.
4639 if (modifiesModeRegister(MI))
4640 return true;
4641
4642 // These are like SALU instructions in terms of effects, so it's questionable
4643 // whether we should return true for those.
4644 //
4645 // However, executing them with EXEC = 0 causes them to operate on undefined
4646 // data, which we avoid by returning true here.
4647 if (Opcode == AMDGPU::V_READFIRSTLANE_B32 ||
4648 Opcode == AMDGPU::V_READLANE_B32 || Opcode == AMDGPU::V_WRITELANE_B32 ||
4649 Opcode == AMDGPU::SI_RESTORE_S32_FROM_VGPR ||
4650 Opcode == AMDGPU::SI_SPILL_S32_TO_VGPR)
4651 return true;
4652
4653 return false;
4654}
4655
4656bool SIInstrInfo::mayReadEXEC(const MachineRegisterInfo &MRI,
4657 const MachineInstr &MI) const {
4658 if (MI.isMetaInstruction())
4659 return false;
4660
4661 // This won't read exec if this is an SGPR->SGPR copy.
4662 if (MI.isCopyLike()) {
4663 if (!RI.isSGPRReg(MRI, Reg: MI.getOperand(i: 0).getReg()))
4664 return true;
4665
4666 // Make sure this isn't copying exec as a normal operand
4667 return MI.readsRegister(Reg: AMDGPU::EXEC, TRI: &RI);
4668 }
4669
4670 // Make a conservative assumption about the callee.
4671 if (MI.isCall())
4672 return true;
4673
4674 // Be conservative with any unhandled generic opcodes.
4675 if (!isTargetSpecificOpcode(Opcode: MI.getOpcode()))
4676 return true;
4677
4678 return !isSALU(MI) || MI.readsRegister(Reg: AMDGPU::EXEC, TRI: &RI);
4679}
4680
4681bool SIInstrInfo::isInlineConstant(const APInt &Imm) const {
4682 switch (Imm.getBitWidth()) {
4683 case 1: // This likely will be a condition code mask.
4684 return true;
4685
4686 case 32:
4687 return AMDGPU::isInlinableLiteral32(Literal: Imm.getSExtValue(),
4688 HasInv2Pi: ST.hasInv2PiInlineImm());
4689 case 64:
4690 return AMDGPU::isInlinableLiteral64(Literal: Imm.getSExtValue(),
4691 HasInv2Pi: ST.hasInv2PiInlineImm());
4692 case 16:
4693 return ST.has16BitInsts() &&
4694 AMDGPU::isInlinableLiteralI16(Literal: Imm.getSExtValue(),
4695 HasInv2Pi: ST.hasInv2PiInlineImm());
4696 default:
4697 llvm_unreachable("invalid bitwidth");
4698 }
4699}
4700
4701bool SIInstrInfo::isInlineConstant(const APFloat &Imm) const {
4702 APInt IntImm = Imm.bitcastToAPInt();
4703 int64_t IntImmVal = IntImm.getSExtValue();
4704 bool HasInv2Pi = ST.hasInv2PiInlineImm();
4705 switch (APFloat::SemanticsToEnum(Sem: Imm.getSemantics())) {
4706 default:
4707 llvm_unreachable("invalid fltSemantics");
4708 case APFloatBase::S_IEEEsingle:
4709 case APFloatBase::S_IEEEdouble:
4710 return isInlineConstant(Imm: IntImm);
4711 case APFloatBase::S_BFloat:
4712 return ST.has16BitInsts() &&
4713 AMDGPU::isInlinableLiteralBF16(Literal: IntImmVal, HasInv2Pi);
4714 case APFloatBase::S_IEEEhalf:
4715 return ST.has16BitInsts() &&
4716 AMDGPU::isInlinableLiteralFP16(Literal: IntImmVal, HasInv2Pi);
4717 }
4718}
4719
4720bool SIInstrInfo::isInlineConstant(int64_t Imm, uint8_t OperandType) const {
4721 // MachineOperand provides no way to tell the true operand size, since it only
4722 // records a 64-bit value. We need to know the size to determine if a 32-bit
4723 // floating point immediate bit pattern is legal for an integer immediate. It
4724 // would be for any 32-bit integer operand, but would not be for a 64-bit one.
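// Illustrative example (not part of the original comment): the bit pattern
// 0x3F800000 (1.0f) is an inline constant for a 32-bit FP operand, but the
// same value read as a 64-bit integer (1065353216) is not inlinable for a
// 64-bit operand, where 1.0 is instead encoded as 0x3FF0000000000000.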
4725 switch (OperandType) {
4726 case AMDGPU::OPERAND_REG_IMM_INT32:
4727 case AMDGPU::OPERAND_REG_IMM_FP32:
4728 case AMDGPU::OPERAND_REG_INLINE_C_INT32:
4729 case AMDGPU::OPERAND_REG_INLINE_C_FP32:
4730 case AMDGPU::OPERAND_REG_IMM_V2FP32:
4731 case AMDGPU::OPERAND_REG_IMM_V2INT32:
4732 case AMDGPU::OPERAND_REG_INLINE_AC_INT32:
4733 case AMDGPU::OPERAND_REG_INLINE_AC_FP32:
4734 case AMDGPU::OPERAND_INLINE_SPLIT_BARRIER_INT32: {
4735 int32_t Trunc = static_cast<int32_t>(Imm);
4736 return AMDGPU::isInlinableLiteral32(Literal: Trunc, HasInv2Pi: ST.hasInv2PiInlineImm());
4737 }
4738 case AMDGPU::OPERAND_REG_IMM_INT64:
4739 case AMDGPU::OPERAND_REG_IMM_FP64:
4740 case AMDGPU::OPERAND_REG_INLINE_C_INT64:
4741 case AMDGPU::OPERAND_REG_INLINE_C_FP64:
4742 case AMDGPU::OPERAND_REG_INLINE_AC_FP64:
4743 return AMDGPU::isInlinableLiteral64(Literal: Imm, HasInv2Pi: ST.hasInv2PiInlineImm());
4744 case AMDGPU::OPERAND_REG_IMM_INT16:
4745 case AMDGPU::OPERAND_REG_INLINE_C_INT16:
4746 // We would expect inline immediates to not be concerned with an integer/fp
4747 // distinction. However, in the case of 16-bit integer operations, the
4748 // "floating point" values appear to not work. It seems read the low 16-bits
4749 // of 32-bit immediates, which happens to always work for the integer
4750 // values.
4751 //
4752 // See llvm bugzilla 46302.
4753 //
4754 // TODO: Theoretically we could use op-sel to use the high bits of the
4755 // 32-bit FP values.
4756 return AMDGPU::isInlinableIntLiteral(Literal: Imm);
4757 case AMDGPU::OPERAND_REG_IMM_V2INT16:
4758 case AMDGPU::OPERAND_REG_INLINE_C_V2INT16:
4759 return AMDGPU::isInlinableLiteralV2I16(Literal: Imm);
4760 case AMDGPU::OPERAND_REG_IMM_V2FP16:
4761 case AMDGPU::OPERAND_REG_INLINE_C_V2FP16:
4762 return AMDGPU::isInlinableLiteralV2F16(Literal: Imm);
4763 case AMDGPU::OPERAND_REG_IMM_V2FP16_SPLAT:
4764 return AMDGPU::isPKFMACF16InlineConstant(Literal: Imm, IsGFX11Plus: ST.isGFX11Plus());
4765 case AMDGPU::OPERAND_REG_IMM_V2BF16:
4766 case AMDGPU::OPERAND_REG_INLINE_C_V2BF16:
4767 return AMDGPU::isInlinableLiteralV2BF16(Literal: Imm);
4768 case AMDGPU::OPERAND_REG_IMM_NOINLINE_V2FP16:
4769 return false;
4770 case AMDGPU::OPERAND_REG_IMM_FP16:
4771 case AMDGPU::OPERAND_REG_INLINE_C_FP16: {
4772 if (isInt<16>(x: Imm) || isUInt<16>(x: Imm)) {
4773 // A few special case instructions have 16-bit operands on subtargets
4774 // where 16-bit instructions are not legal.
4775 // TODO: Do the 32-bit immediates work? We shouldn't really need to handle
4776 // constants in these cases
4777 int16_t Trunc = static_cast<int16_t>(Imm);
4778 return ST.has16BitInsts() &&
4779 AMDGPU::isInlinableLiteralFP16(Literal: Trunc, HasInv2Pi: ST.hasInv2PiInlineImm());
4780 }
4781
4782 return false;
4783 }
4784 case AMDGPU::OPERAND_REG_IMM_BF16:
4785 case AMDGPU::OPERAND_REG_INLINE_C_BF16: {
4786 if (isInt<16>(x: Imm) || isUInt<16>(x: Imm)) {
4787 int16_t Trunc = static_cast<int16_t>(Imm);
4788 return ST.has16BitInsts() &&
4789 AMDGPU::isInlinableLiteralBF16(Literal: Trunc, HasInv2Pi: ST.hasInv2PiInlineImm());
4790 }
4791 return false;
4792 }
4793 case AMDGPU::OPERAND_KIMM32:
4794 case AMDGPU::OPERAND_KIMM16:
4795 case AMDGPU::OPERAND_KIMM64:
4796 return false;
4797 case AMDGPU::OPERAND_INLINE_C_AV64_PSEUDO:
4798 return isLegalAV64PseudoImm(Imm);
4799 case AMDGPU::OPERAND_INPUT_MODS:
4800 case MCOI::OPERAND_IMMEDIATE:
4801 // Always embedded in the instruction for free.
4802 return true;
4803 case MCOI::OPERAND_UNKNOWN:
4804 case MCOI::OPERAND_REGISTER:
4805 case MCOI::OPERAND_PCREL:
4806 case MCOI::OPERAND_GENERIC_0:
4807 case MCOI::OPERAND_GENERIC_1:
4808 case MCOI::OPERAND_GENERIC_2:
4809 case MCOI::OPERAND_GENERIC_3:
4810 case MCOI::OPERAND_GENERIC_4:
4811 case MCOI::OPERAND_GENERIC_5:
4812 // Just ignore anything else.
4813 return true;
4814 default:
4815 llvm_unreachable("invalid operand type");
4816 }
4817}
4818
4819static bool compareMachineOp(const MachineOperand &Op0,
4820 const MachineOperand &Op1) {
4821 if (Op0.getType() != Op1.getType())
4822 return false;
4823
4824 switch (Op0.getType()) {
4825 case MachineOperand::MO_Register:
4826 return Op0.getReg() == Op1.getReg();
4827 case MachineOperand::MO_Immediate:
4828 return Op0.getImm() == Op1.getImm();
4829 default:
4830 llvm_unreachable("Didn't expect to be comparing these operand types");
4831 }
4832}
4833
4834bool SIInstrInfo::isLiteralOperandLegal(const MCInstrDesc &InstDesc,
4835 const MCOperandInfo &OpInfo) const {
4836 if (OpInfo.OperandType == MCOI::OPERAND_IMMEDIATE)
4837 return true;
4838
4839 if (!RI.opCanUseLiteralConstant(OpType: OpInfo.OperandType))
4840 return false;
4841
4842 if (!isVOP3(Desc: InstDesc) || !AMDGPU::isSISrcOperand(OpInfo))
4843 return true;
4844
4845 return ST.hasVOP3Literal();
4846}
4847
4848bool SIInstrInfo::isImmOperandLegal(const MCInstrDesc &InstDesc, unsigned OpNo,
4849 int64_t ImmVal) const {
4850 const MCOperandInfo &OpInfo = InstDesc.operands()[OpNo];
4851 if (isInlineConstant(Imm: ImmVal, OperandType: OpInfo.OperandType)) {
4852 if (isMAI(Desc: InstDesc) && ST.hasMFMAInlineLiteralBug() &&
4853 OpNo == (unsigned)AMDGPU::getNamedOperandIdx(Opcode: InstDesc.getOpcode(),
4854 Name: AMDGPU::OpName::src2))
4855 return false;
4856 return RI.opCanUseInlineConstant(OpType: OpInfo.OperandType);
4857 }
4858
4859 return isLiteralOperandLegal(InstDesc, OpInfo);
4860}
4861
4862bool SIInstrInfo::isImmOperandLegal(const MCInstrDesc &InstDesc, unsigned OpNo,
4863 const MachineOperand &MO) const {
4864 if (MO.isImm())
4865 return isImmOperandLegal(InstDesc, OpNo, ImmVal: MO.getImm());
4866
4867 assert((MO.isTargetIndex() || MO.isFI() || MO.isGlobal()) &&
4868 "unexpected imm-like operand kind");
4869 const MCOperandInfo &OpInfo = InstDesc.operands()[OpNo];
4870 return isLiteralOperandLegal(InstDesc, OpInfo);
4871}
4872
4873bool SIInstrInfo::isLegalAV64PseudoImm(uint64_t Imm) const {
4874 // 2 32-bit inline constants packed into one.
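// Illustrative example (assumption): 0xBF800000'3F800000 packs -1.0f in the
// high half and 1.0f in the low half, both inline constants, so it is legal;
// 0x00000040'3F800001 is not, since the low half (0x3F800001) is neither a
// small integer nor an inline FP bit pattern.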
4875 return AMDGPU::isInlinableLiteral32(Literal: Lo_32(Value: Imm), HasInv2Pi: ST.hasInv2PiInlineImm()) &&
4876 AMDGPU::isInlinableLiteral32(Literal: Hi_32(Value: Imm), HasInv2Pi: ST.hasInv2PiInlineImm());
4877}
4878
4879bool SIInstrInfo::hasVALU32BitEncoding(unsigned Opcode) const {
4880 // GFX90A does not have V_MUL_LEGACY_F32_e32.
4881 if (Opcode == AMDGPU::V_MUL_LEGACY_F32_e64 && ST.hasGFX90AInsts())
4882 return false;
4883
4884 int Op32 = AMDGPU::getVOPe32(Opcode);
4885 if (Op32 == -1)
4886 return false;
4887
4888 return pseudoToMCOpcode(Opcode: Op32) != -1;
4889}
4890
4891bool SIInstrInfo::hasModifiers(unsigned Opcode) const {
4892 // The src0_modifiers operand is present on all instructions
4893 // that have modifiers.
4894
4895 return AMDGPU::hasNamedOperand(Opcode, NamedIdx: AMDGPU::OpName::src0_modifiers);
4896}
4897
4898bool SIInstrInfo::hasModifiersSet(const MachineInstr &MI,
4899 AMDGPU::OpName OpName) const {
4900 const MachineOperand *Mods = getNamedOperand(MI, OperandName: OpName);
4901 return Mods && Mods->getImm();
4902}
4903
4904bool SIInstrInfo::hasAnyModifiersSet(const MachineInstr &MI) const {
4905 return any_of(Range: ModifierOpNames,
4906 P: [&](AMDGPU::OpName Name) { return hasModifiersSet(MI, OpName: Name); });
4907}
4908
4909bool SIInstrInfo::canShrink(const MachineInstr &MI,
4910 const MachineRegisterInfo &MRI) const {
4911 const MachineOperand *Src2 = getNamedOperand(MI, OperandName: AMDGPU::OpName::src2);
4912 // Can't shrink instruction with three operands.
4913 if (Src2) {
4914 switch (MI.getOpcode()) {
4915 default: return false;
4916
4917 case AMDGPU::V_ADDC_U32_e64:
4918 case AMDGPU::V_SUBB_U32_e64:
4919 case AMDGPU::V_SUBBREV_U32_e64: {
4920 const MachineOperand *Src1
4921 = getNamedOperand(MI, OperandName: AMDGPU::OpName::src1);
4922 if (!Src1->isReg() || !RI.isVGPR(MRI, Reg: Src1->getReg()))
4923 return false;
4924 // Additional verification is needed for sdst/src2.
4925 return true;
4926 }
4927 case AMDGPU::V_MAC_F16_e64:
4928 case AMDGPU::V_MAC_F32_e64:
4929 case AMDGPU::V_MAC_LEGACY_F32_e64:
4930 case AMDGPU::V_FMAC_F16_e64:
4931 case AMDGPU::V_FMAC_F16_t16_e64:
4932 case AMDGPU::V_FMAC_F16_fake16_e64:
4933 case AMDGPU::V_FMAC_F32_e64:
4934 case AMDGPU::V_FMAC_F64_e64:
4935 case AMDGPU::V_FMAC_LEGACY_F32_e64:
4936 if (!Src2->isReg() || !RI.isVGPR(MRI, Reg: Src2->getReg()) ||
4937 hasModifiersSet(MI, OpName: AMDGPU::OpName::src2_modifiers))
4938 return false;
4939 break;
4940
4941 case AMDGPU::V_CNDMASK_B32_e64:
4942 break;
4943 }
4944 }
4945
4946 const MachineOperand *Src1 = getNamedOperand(MI, OperandName: AMDGPU::OpName::src1);
4947 if (Src1 && (!Src1->isReg() || !RI.isVGPR(MRI, Reg: Src1->getReg()) ||
4948 hasModifiersSet(MI, OpName: AMDGPU::OpName::src1_modifiers)))
4949 return false;
4950
4951 // We don't need to check src0, all input types are legal, so just make sure
4952 // src0 isn't using any modifiers.
4953 if (hasModifiersSet(MI, OpName: AMDGPU::OpName::src0_modifiers))
4954 return false;
4955
4956 // Can it be shrunk to a valid 32 bit opcode?
4957 if (!hasVALU32BitEncoding(Opcode: MI.getOpcode()))
4958 return false;
4959
4960 // Check output modifiers
4961 return !hasModifiersSet(MI, OpName: AMDGPU::OpName::omod) &&
4962 !hasModifiersSet(MI, OpName: AMDGPU::OpName::clamp) &&
4963 !hasModifiersSet(MI, OpName: AMDGPU::OpName::byte_sel) &&
4964 // TODO: Can we avoid checking bound_ctrl/fi here?
4965 // They are only used by permlane*_swap special case.
4966 !hasModifiersSet(MI, OpName: AMDGPU::OpName::bound_ctrl) &&
4967 !hasModifiersSet(MI, OpName: AMDGPU::OpName::fi);
4968}
4969
4970 // Copy the undef and kill flags from \p Orig onto the matching implicit VCC
4971 // (or VCC_LO) use operand, while keeping it marked as implicit.
4972static void copyFlagsToImplicitVCC(MachineInstr &MI,
4973 const MachineOperand &Orig) {
4975 for (MachineOperand &Use : MI.implicit_operands()) {
4976 if (Use.isUse() &&
4977 (Use.getReg() == AMDGPU::VCC || Use.getReg() == AMDGPU::VCC_LO)) {
4978 Use.setIsUndef(Orig.isUndef());
4979 Use.setIsKill(Orig.isKill());
4980 return;
4981 }
4982 }
4983}
4984
4985MachineInstr *SIInstrInfo::buildShrunkInst(MachineInstr &MI,
4986 unsigned Op32) const {
4987 MachineBasicBlock *MBB = MI.getParent();
4988
4989 const MCInstrDesc &Op32Desc = get(Opcode: Op32);
4990 MachineInstrBuilder Inst32 =
4991 BuildMI(BB&: *MBB, I&: MI, MIMD: MI.getDebugLoc(), MCID: Op32Desc)
4992 .setMIFlags(MI.getFlags());
4993
4994 // Add the dst operand if the 32-bit encoding also has an explicit $vdst.
4995 // For VOPC instructions, this is replaced by an implicit def of vcc.
4996
4997 // We assume the defs of the shrunk opcode are in the same order, and the
4998 // shrunk opcode loses the last def (SGPR def, in the VOP3->VOPC case).
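// Illustrative sketch (assumption, not tied to a particular caller):
// V_ADD_CO_U32_e64 defines (vdst, sdst); its e32 form keeps only vdst and
// instead reports the carry through an implicit def of VCC supplied by the
// MCInstrDesc.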
4999 for (int I = 0, E = Op32Desc.getNumDefs(); I != E; ++I)
5000 Inst32.add(MO: MI.getOperand(i: I));
5001
5002 const MachineOperand *Src2 = getNamedOperand(MI, OperandName: AMDGPU::OpName::src2);
5003
5004 int Idx = MI.getNumExplicitDefs();
5005 for (const MachineOperand &Use : MI.explicit_uses()) {
5006 int OpTy = MI.getDesc().operands()[Idx++].OperandType;
5007 if (OpTy == AMDGPU::OPERAND_INPUT_MODS || OpTy == MCOI::OPERAND_IMMEDIATE)
5008 continue;
5009
5010 if (&Use == Src2) {
5011 if (AMDGPU::getNamedOperandIdx(Opcode: Op32, Name: AMDGPU::OpName::src2) == -1) {
5012 // In the case of V_CNDMASK_B32_e32, the explicit operand src2 is
5013 // replaced with an implicit read of vcc or vcc_lo. The implicit read
5014 // of vcc was already added during the initial BuildMI, but we
5015 // 1) may need to change vcc to vcc_lo to preserve the original register
5016 // 2) have to preserve the original flags.
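// Roughly, as an illustrative (not exact) MIR sketch, assuming the mask
// already lives in VCC / VCC_LO:
//   V_CNDMASK_B32_e64 %d, 0, %a, 0, %b, $vcc
//     --> V_CNDMASK_B32_e32 %d, %a, %b, implicit $vcc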
5017 copyFlagsToImplicitVCC(MI&: *Inst32, Orig: *Src2);
5018 continue;
5019 }
5020 }
5021
5022 Inst32.add(MO: Use);
5023 }
5024
5025 // FIXME: Losing implicit operands
5026 fixImplicitOperands(MI&: *Inst32);
5027 return Inst32;
5028}
5029
5030bool SIInstrInfo::physRegUsesConstantBus(const MachineOperand &RegOp) const {
5031 // Null is free
5032 Register Reg = RegOp.getReg();
5033 if (Reg == AMDGPU::SGPR_NULL || Reg == AMDGPU::SGPR_NULL64)
5034 return false;
5035
5038 // FIXME: implicit registers that are not part of the MCInstrDesc's implicit
5039 // physical register operands should also count, except for exec.
5040 if (RegOp.isImplicit())
5041 return Reg == AMDGPU::VCC || Reg == AMDGPU::VCC_LO || Reg == AMDGPU::M0;
5042
5043 // SGPRs use the constant bus
5044 return AMDGPU::SReg_32RegClass.contains(Reg) ||
5045 AMDGPU::SReg_64RegClass.contains(Reg);
5046}
5047
5048bool SIInstrInfo::regUsesConstantBus(const MachineOperand &RegOp,
5049 const MachineRegisterInfo &MRI) const {
5050 Register Reg = RegOp.getReg();
5051 return Reg.isVirtual() ? RI.isSGPRClass(RC: MRI.getRegClass(Reg))
5052 : physRegUsesConstantBus(RegOp);
5053}
5054
5055bool SIInstrInfo::usesConstantBus(const MachineRegisterInfo &MRI,
5056 const MachineOperand &MO,
5057 const MCOperandInfo &OpInfo) const {
5058 // Literal constants use the constant bus.
5059 if (!MO.isReg())
5060 return !isInlineConstant(MO, OpInfo);
5061
5062 Register Reg = MO.getReg();
5063 return Reg.isVirtual() ? RI.isSGPRClass(RC: MRI.getRegClass(Reg))
5064 : physRegUsesConstantBus(RegOp: MO);
5065}
5066
5067static Register findImplicitSGPRRead(const MachineInstr &MI) {
5068 for (const MachineOperand &MO : MI.implicit_operands()) {
5069 // We only care about reads.
5070 if (MO.isDef())
5071 continue;
5072
5073 switch (MO.getReg()) {
5074 case AMDGPU::VCC:
5075 case AMDGPU::VCC_LO:
5076 case AMDGPU::VCC_HI:
5077 case AMDGPU::M0:
5078 case AMDGPU::FLAT_SCR:
5079 return MO.getReg();
5080
5081 default:
5082 break;
5083 }
5084 }
5085
5086 return Register();
5087}
5088
5089static bool shouldReadExec(const MachineInstr &MI) {
5090 if (SIInstrInfo::isVALU(MI)) {
5091 switch (MI.getOpcode()) {
5092 case AMDGPU::V_READLANE_B32:
5093 case AMDGPU::SI_RESTORE_S32_FROM_VGPR:
5094 case AMDGPU::V_WRITELANE_B32:
5095 case AMDGPU::SI_SPILL_S32_TO_VGPR:
5096 return false;
5097 }
5098
5099 return true;
5100 }
5101
5102 if (MI.isPreISelOpcode() ||
5103 SIInstrInfo::isGenericOpcode(Opc: MI.getOpcode()) ||
5104 SIInstrInfo::isSALU(MI) ||
5105 SIInstrInfo::isSMRD(MI))
5106 return false;
5107
5108 return true;
5109}
5110
5111static bool isRegOrFI(const MachineOperand &MO) {
5112 return MO.isReg() || MO.isFI();
5113}
5114
5115static bool isSubRegOf(const SIRegisterInfo &TRI,
5116 const MachineOperand &SuperVec,
5117 const MachineOperand &SubReg) {
5118 if (SubReg.getReg().isPhysical())
5119 return TRI.isSubRegister(RegA: SuperVec.getReg(), RegB: SubReg.getReg());
5120
5121 return SubReg.getSubReg() != AMDGPU::NoSubRegister &&
5122 SubReg.getReg() == SuperVec.getReg();
5123}
5124
5125 // Verify that a generic COPY does not illegally copy a vector register to an SGPR.
5126bool SIInstrInfo::verifyCopy(const MachineInstr &MI,
5127 const MachineRegisterInfo &MRI,
5128 StringRef &ErrInfo) const {
5129 Register DstReg = MI.getOperand(i: 0).getReg();
5130 Register SrcReg = MI.getOperand(i: 1).getReg();
5131 // This is a check for copy from vector register to SGPR
5132 if (RI.isVectorRegister(MRI, Reg: SrcReg) && RI.isSGPRReg(MRI, Reg: DstReg)) {
5133 ErrInfo = "illegal copy from vector register to SGPR";
5134 return false;
5135 }
5136 return true;
5137}
5138
5139bool SIInstrInfo::verifyInstruction(const MachineInstr &MI,
5140 StringRef &ErrInfo) const {
5141 uint16_t Opcode = MI.getOpcode();
5142 const MachineFunction *MF = MI.getMF();
5143 const MachineRegisterInfo &MRI = MF->getRegInfo();
5144
5145 // FIXME: At this point the COPY verify is done only for non-ssa forms.
5146 // Find a better property to recognize the point where instruction selection
5147 // is just done.
5148 // We can only enforce this check after SIFixSGPRCopies pass so that the
5149 // illegal copies are legalized and thereafter we don't expect a pass
5150 // inserting similar copies.
5151 if (!MRI.isSSA() && MI.isCopy())
5152 return verifyCopy(MI, MRI, ErrInfo);
5153
5154 if (SIInstrInfo::isGenericOpcode(Opc: Opcode))
5155 return true;
5156
5157 int Src0Idx = AMDGPU::getNamedOperandIdx(Opcode, Name: AMDGPU::OpName::src0);
5158 int Src1Idx = AMDGPU::getNamedOperandIdx(Opcode, Name: AMDGPU::OpName::src1);
5159 int Src2Idx = AMDGPU::getNamedOperandIdx(Opcode, Name: AMDGPU::OpName::src2);
5160 int Src3Idx = -1;
5161 if (Src0Idx == -1) {
5162 // VOPD V_DUAL_* instructions use different operand names.
5163 Src0Idx = AMDGPU::getNamedOperandIdx(Opcode, Name: AMDGPU::OpName::src0X);
5164 Src1Idx = AMDGPU::getNamedOperandIdx(Opcode, Name: AMDGPU::OpName::vsrc1X);
5165 Src2Idx = AMDGPU::getNamedOperandIdx(Opcode, Name: AMDGPU::OpName::src0Y);
5166 Src3Idx = AMDGPU::getNamedOperandIdx(Opcode, Name: AMDGPU::OpName::vsrc1Y);
5167 }
5168
5169 // Make sure the number of operands is correct.
5170 const MCInstrDesc &Desc = get(Opcode);
5171 if (!Desc.isVariadic() &&
5172 Desc.getNumOperands() != MI.getNumExplicitOperands()) {
5173 ErrInfo = "Instruction has wrong number of operands.";
5174 return false;
5175 }
5176
5177 if (MI.isInlineAsm()) {
5178 // Verify register classes for inlineasm constraints.
5179 for (unsigned I = InlineAsm::MIOp_FirstOperand, E = MI.getNumOperands();
5180 I != E; ++I) {
5181 const TargetRegisterClass *RC = MI.getRegClassConstraint(OpIdx: I, TII: this, TRI: &RI);
5182 if (!RC)
5183 continue;
5184
5185 const MachineOperand &Op = MI.getOperand(i: I);
5186 if (!Op.isReg())
5187 continue;
5188
5189 Register Reg = Op.getReg();
5190 if (!Reg.isVirtual() && !RC->contains(Reg)) {
5191 ErrInfo = "inlineasm operand has incorrect register class.";
5192 return false;
5193 }
5194 }
5195
5196 return true;
5197 }
5198
5199 if (isImage(MI) && MI.memoperands_empty() && MI.mayLoadOrStore()) {
5200 ErrInfo = "missing memory operand from image instruction.";
5201 return false;
5202 }
5203
5204 // Make sure the register classes are correct.
5205 for (int i = 0, e = Desc.getNumOperands(); i != e; ++i) {
5206 const MachineOperand &MO = MI.getOperand(i);
5207 if (MO.isFPImm()) {
5208 ErrInfo = "FPImm Machine Operands are not supported. ISel should bitcast "
5209 "all fp values to integers.";
5210 return false;
5211 }
5212
5213 const MCOperandInfo &OpInfo = Desc.operands()[i];
5214 int16_t RegClass = getOpRegClassID(OpInfo);
5215
5216 switch (OpInfo.OperandType) {
5217 case MCOI::OPERAND_REGISTER:
5218 if (MI.getOperand(i).isImm() || MI.getOperand(i).isGlobal()) {
5219 ErrInfo = "Illegal immediate value for operand.";
5220 return false;
5221 }
5222 break;
5223 case AMDGPU::OPERAND_REG_IMM_INT32:
5224 case AMDGPU::OPERAND_REG_IMM_INT64:
5225 case AMDGPU::OPERAND_REG_IMM_INT16:
5226 case AMDGPU::OPERAND_REG_IMM_FP32:
5227 case AMDGPU::OPERAND_REG_IMM_V2FP32:
5228 case AMDGPU::OPERAND_REG_IMM_BF16:
5229 case AMDGPU::OPERAND_REG_IMM_FP16:
5230 case AMDGPU::OPERAND_REG_IMM_FP64:
5231 case AMDGPU::OPERAND_REG_IMM_V2FP16:
5232 case AMDGPU::OPERAND_REG_IMM_V2FP16_SPLAT:
5233 case AMDGPU::OPERAND_REG_IMM_V2INT16:
5234 case AMDGPU::OPERAND_REG_IMM_V2INT32:
5235 case AMDGPU::OPERAND_REG_IMM_V2BF16:
5236 break;
5237 case AMDGPU::OPERAND_REG_IMM_NOINLINE_V2FP16:
5238 break;
5240 case AMDGPU::OPERAND_REG_INLINE_C_INT16:
5241 case AMDGPU::OPERAND_REG_INLINE_C_INT32:
5242 case AMDGPU::OPERAND_REG_INLINE_C_INT64:
5243 case AMDGPU::OPERAND_REG_INLINE_C_FP32:
5244 case AMDGPU::OPERAND_REG_INLINE_C_FP64:
5245 case AMDGPU::OPERAND_REG_INLINE_C_BF16:
5246 case AMDGPU::OPERAND_REG_INLINE_C_FP16:
5247 case AMDGPU::OPERAND_REG_INLINE_C_V2INT16:
5248 case AMDGPU::OPERAND_REG_INLINE_C_V2BF16:
5249 case AMDGPU::OPERAND_REG_INLINE_C_V2FP16:
5250 case AMDGPU::OPERAND_REG_INLINE_AC_INT32:
5251 case AMDGPU::OPERAND_REG_INLINE_AC_FP32:
5252 case AMDGPU::OPERAND_REG_INLINE_AC_FP64: {
5253 if (!MO.isReg() && (!MO.isImm() || !isInlineConstant(MI, OpIdx: i))) {
5254 ErrInfo = "Illegal immediate value for operand.";
5255 return false;
5256 }
5257 break;
5258 }
5259 case AMDGPU::OPERAND_INLINE_SPLIT_BARRIER_INT32:
5260 if (!MI.getOperand(i).isImm() || !isInlineConstant(MI, OpIdx: i)) {
5261 ErrInfo = "Expected inline constant for operand.";
5262 return false;
5263 }
5264 break;
5265 case AMDGPU::OPERAND_INPUT_MODS:
5266 case AMDGPU::OPERAND_SDWA_VOPC_DST:
5267 case AMDGPU::OPERAND_KIMM16:
5268 break;
5269 case MCOI::OPERAND_IMMEDIATE:
5270 case AMDGPU::OPERAND_KIMM32:
5271 case AMDGPU::OPERAND_KIMM64:
5272 case AMDGPU::OPERAND_INLINE_C_AV64_PSEUDO:
5273 // Check if this operand is an immediate.
5274 // FrameIndex operands will be replaced by immediates, so they are
5275 // allowed.
5276 if (!MI.getOperand(i).isImm() && !MI.getOperand(i).isFI()) {
5277 ErrInfo = "Expected immediate, but got non-immediate";
5278 return false;
5279 }
5280 break;
5281 case MCOI::OPERAND_UNKNOWN:
5282 case MCOI::OPERAND_MEMORY:
5283 case MCOI::OPERAND_PCREL:
5284 break;
5285 default:
5286 if (OpInfo.isGenericType())
5287 continue;
5288 break;
5289 }
5290
5291 if (!MO.isReg())
5292 continue;
5293 Register Reg = MO.getReg();
5294 if (!Reg)
5295 continue;
5296
5297 // FIXME: Ideally we would have separate instruction definitions with the
5298 // aligned register constraint.
5299 // FIXME: We do not verify inline asm operands, but custom inline asm
5300 // verification is broken anyway
5301 if (ST.needsAlignedVGPRs() && Opcode != AMDGPU::AV_MOV_B64_IMM_PSEUDO &&
5302 Opcode != AMDGPU::V_MOV_B64_PSEUDO) {
5303 const TargetRegisterClass *RC = RI.getRegClassForReg(MRI, Reg);
5304 if (RI.hasVectorRegisters(RC) && MO.getSubReg()) {
5305 if (const TargetRegisterClass *SubRC =
5306 RI.getSubRegisterClass(RC, MO.getSubReg())) {
5307 RC = RI.getCompatibleSubRegClass(SuperRC: RC, SubRC, SubIdx: MO.getSubReg());
5308 if (RC)
5309 RC = SubRC;
5310 }
5311 }
5312
5313 // Check that this is the aligned version of the class.
5314 if (!RC || !RI.isProperlyAlignedRC(RC: *RC)) {
5315 ErrInfo = "Subtarget requires even aligned vector registers";
5316 return false;
5317 }
5318 }
5319
5320 if (RegClass != -1) {
5321 if (Reg.isVirtual())
5322 continue;
5323
5324 const TargetRegisterClass *RC = RI.getRegClass(i: RegClass);
5325 if (!RC->contains(Reg)) {
5326 ErrInfo = "Operand has incorrect register class.";
5327 return false;
5328 }
5329 }
5330 }
5331
5332 // Verify SDWA
5333 if (isSDWA(MI)) {
5334 if (!ST.hasSDWA()) {
5335 ErrInfo = "SDWA is not supported on this target";
5336 return false;
5337 }
5338
5339 for (auto Op : {AMDGPU::OpName::src0_sel, AMDGPU::OpName::src1_sel,
5340 AMDGPU::OpName::dst_sel}) {
5341 const MachineOperand *MO = getNamedOperand(MI, OperandName: Op);
5342 if (!MO)
5343 continue;
5344 int64_t Imm = MO->getImm();
5345 if (Imm < 0 || Imm > AMDGPU::SDWA::SdwaSel::DWORD) {
5346 ErrInfo = "Invalid SDWA selection";
5347 return false;
5348 }
5349 }
5350
5351 int DstIdx = AMDGPU::getNamedOperandIdx(Opcode, Name: AMDGPU::OpName::vdst);
5352
5353 for (int OpIdx : {DstIdx, Src0Idx, Src1Idx, Src2Idx}) {
5354 if (OpIdx == -1)
5355 continue;
5356 const MachineOperand &MO = MI.getOperand(i: OpIdx);
5357
5358 if (!ST.hasSDWAScalar()) {
5359 // Only VGPRs on VI
5360 if (!MO.isReg() || !RI.hasVGPRs(RC: RI.getRegClassForReg(MRI, Reg: MO.getReg()))) {
5361 ErrInfo = "Only VGPRs allowed as operands in SDWA instructions on VI";
5362 return false;
5363 }
5364 } else {
5365 // No immediates on GFX9
5366 if (!MO.isReg()) {
5367 ErrInfo =
5368 "Only reg allowed as operands in SDWA instructions on GFX9+";
5369 return false;
5370 }
5371 }
5372 }
5373
5374 if (!ST.hasSDWAOmod()) {
5375 // No omod allowed on VI
5376 const MachineOperand *OMod = getNamedOperand(MI, OperandName: AMDGPU::OpName::omod);
5377 if (OMod != nullptr &&
5378 (!OMod->isImm() || OMod->getImm() != 0)) {
5379 ErrInfo = "OMod not allowed in SDWA instructions on VI";
5380 return false;
5381 }
5382 }
5383
5384 if (Opcode == AMDGPU::V_CVT_F32_FP8_sdwa ||
5385 Opcode == AMDGPU::V_CVT_F32_BF8_sdwa ||
5386 Opcode == AMDGPU::V_CVT_PK_F32_FP8_sdwa ||
5387 Opcode == AMDGPU::V_CVT_PK_F32_BF8_sdwa) {
5388 const MachineOperand *Src0ModsMO =
5389 getNamedOperand(MI, OperandName: AMDGPU::OpName::src0_modifiers);
5390 unsigned Mods = Src0ModsMO->getImm();
5391 if (Mods & SISrcMods::ABS || Mods & SISrcMods::NEG ||
5392 Mods & SISrcMods::SEXT) {
5393 ErrInfo = "sext, abs and neg are not allowed on this instruction";
5394 return false;
5395 }
5396 }
5397
5398 uint16_t BasicOpcode = AMDGPU::getBasicFromSDWAOp(Opcode);
5399 if (isVOPC(Opcode: BasicOpcode)) {
5400 if (!ST.hasSDWASdst() && DstIdx != -1) {
5401 // Only vcc allowed as dst on VI for VOPC
5402 const MachineOperand &Dst = MI.getOperand(i: DstIdx);
5403 if (!Dst.isReg() || Dst.getReg() != AMDGPU::VCC) {
5404 ErrInfo = "Only VCC allowed as dst in SDWA instructions on VI";
5405 return false;
5406 }
5407 } else if (!ST.hasSDWAOutModsVOPC()) {
5408 // No clamp allowed on GFX9 for VOPC
5409 const MachineOperand *Clamp = getNamedOperand(MI, OperandName: AMDGPU::OpName::clamp);
5410 if (Clamp && (!Clamp->isImm() || Clamp->getImm() != 0)) {
5411 ErrInfo = "Clamp not allowed in VOPC SDWA instructions on VI";
5412 return false;
5413 }
5414
5415 // No omod allowed on GFX9 for VOPC
5416 const MachineOperand *OMod = getNamedOperand(MI, OperandName: AMDGPU::OpName::omod);
5417 if (OMod && (!OMod->isImm() || OMod->getImm() != 0)) {
5418 ErrInfo = "OMod not allowed in VOPC SDWA instructions on VI";
5419 return false;
5420 }
5421 }
5422 }
5423
5424 const MachineOperand *DstUnused = getNamedOperand(MI, OperandName: AMDGPU::OpName::dst_unused);
5425 if (DstUnused && DstUnused->isImm() &&
5426 DstUnused->getImm() == AMDGPU::SDWA::UNUSED_PRESERVE) {
5427 const MachineOperand &Dst = MI.getOperand(i: DstIdx);
5428 if (!Dst.isReg() || !Dst.isTied()) {
5429 ErrInfo = "Dst register should have tied register";
5430 return false;
5431 }
5432
5433 const MachineOperand &TiedMO =
5434 MI.getOperand(i: MI.findTiedOperandIdx(OpIdx: DstIdx));
5435 if (!TiedMO.isReg() || !TiedMO.isImplicit() || !TiedMO.isUse()) {
5436 ErrInfo =
5437 "Dst register should be tied to implicit use of preserved register";
5438 return false;
5439 }
5440 if (TiedMO.getReg().isPhysical() && Dst.getReg() != TiedMO.getReg()) {
5441 ErrInfo = "Dst register should use same physical register as preserved";
5442 return false;
5443 }
5444 }
5445 }
5446
5447 // Verify MIMG / VIMAGE / VSAMPLE
5448 if (isImage(Opcode) && !MI.mayStore()) {
5449 // Ensure that the return type used is large enough for all the options
5450 // being used. TFE/LWE require an extra result register.
5451 const MachineOperand *DMask = getNamedOperand(MI, OperandName: AMDGPU::OpName::dmask);
5452 if (DMask) {
5453 uint64_t DMaskImm = DMask->getImm();
5454 uint32_t RegCount = isGather4(Opcode) ? 4 : llvm::popcount(Value: DMaskImm);
5455 const MachineOperand *TFE = getNamedOperand(MI, OperandName: AMDGPU::OpName::tfe);
5456 const MachineOperand *LWE = getNamedOperand(MI, OperandName: AMDGPU::OpName::lwe);
5457 const MachineOperand *D16 = getNamedOperand(MI, OperandName: AMDGPU::OpName::d16);
5458
5459 // Adjust for packed 16 bit values
5460 if (D16 && D16->getImm() && !ST.hasUnpackedD16VMem())
5461 RegCount = divideCeil(Numerator: RegCount, Denominator: 2);
5462
5463 // Adjust if using LWE or TFE
5464 if ((LWE && LWE->getImm()) || (TFE && TFE->getImm()))
5465 RegCount += 1;
5466
5467 const uint32_t DstIdx =
5468 AMDGPU::getNamedOperandIdx(Opcode, Name: AMDGPU::OpName::vdata);
5469 const MachineOperand &Dst = MI.getOperand(i: DstIdx);
5470 if (Dst.isReg()) {
5471 const TargetRegisterClass *DstRC = getOpRegClass(MI, OpNo: DstIdx);
5472 uint32_t DstSize = RI.getRegSizeInBits(RC: *DstRC) / 32;
5473 if (RegCount > DstSize) {
5474 ErrInfo = "Image instruction returns too many registers for dst "
5475 "register class";
5476 return false;
5477 }
5478 }
5479 }
5480 }
5481
5482 // Verify VOP*. Ignore multiple sgpr operands on writelane.
5483 if (isVALU(MI) && Desc.getOpcode() != AMDGPU::V_WRITELANE_B32) {
5484 unsigned ConstantBusCount = 0;
5485 bool UsesLiteral = false;
5486 const MachineOperand *LiteralVal = nullptr;
5487
5488 int ImmIdx = AMDGPU::getNamedOperandIdx(Opcode, Name: AMDGPU::OpName::imm);
5489 if (ImmIdx != -1) {
5490 ++ConstantBusCount;
5491 UsesLiteral = true;
5492 LiteralVal = &MI.getOperand(i: ImmIdx);
5493 }
5494
5495 SmallVector<Register, 2> SGPRsUsed;
5496 Register SGPRUsed;
5497
5498 // Only look at the true operands. Only a real operand can use the constant
5499 // bus, and we don't want to check pseudo-operands like the source modifier
5500 // flags.
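// Illustrative accounting (assumption): each distinct SGPR source, one
// literal, and an implicit read of M0/VCC/FLAT_SCR each take a slot; repeated
// uses of the same SGPR count once, and a frame index is treated like a
// register rather than a literal.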
5501 for (int OpIdx : {Src0Idx, Src1Idx, Src2Idx, Src3Idx}) {
5502 if (OpIdx == -1)
5503 continue;
5504 const MachineOperand &MO = MI.getOperand(i: OpIdx);
5505 if (usesConstantBus(MRI, MO, OpInfo: MI.getDesc().operands()[OpIdx])) {
5506 if (MO.isReg()) {
5507 SGPRUsed = MO.getReg();
5508 if (!llvm::is_contained(Range&: SGPRsUsed, Element: SGPRUsed)) {
5509 ++ConstantBusCount;
5510 SGPRsUsed.push_back(Elt: SGPRUsed);
5511 }
5512 } else if (!MO.isFI()) { // Treat FI like a register.
5513 if (!UsesLiteral) {
5514 ++ConstantBusCount;
5515 UsesLiteral = true;
5516 LiteralVal = &MO;
5517 } else if (!MO.isIdenticalTo(Other: *LiteralVal)) {
5518 assert(isVOP2(MI) || isVOP3(MI));
5519 ErrInfo = "VOP2/VOP3 instruction uses more than one literal";
5520 return false;
5521 }
5522 }
5523 }
5524 }
5525
5526 SGPRUsed = findImplicitSGPRRead(MI);
5527 if (SGPRUsed) {
5528 // Implicit uses may safely overlap true operands
5529 if (llvm::all_of(Range&: SGPRsUsed, P: [this, SGPRUsed](unsigned SGPR) {
5530 return !RI.regsOverlap(RegA: SGPRUsed, RegB: SGPR);
5531 })) {
5532 ++ConstantBusCount;
5533 SGPRsUsed.push_back(Elt: SGPRUsed);
5534 }
5535 }
5536
5537 // v_writelane_b32 is an exception to the constant bus restriction:
5538 // vsrc0 can be an SGPR, constant or m0, and the lane select an SGPR, m0 or inline-const
5539 if (ConstantBusCount > ST.getConstantBusLimit(Opcode) &&
5540 Opcode != AMDGPU::V_WRITELANE_B32) {
5541 ErrInfo = "VOP* instruction violates constant bus restriction";
5542 return false;
5543 }
5544
5545 if (isVOP3(MI) && UsesLiteral && !ST.hasVOP3Literal()) {
5546 ErrInfo = "VOP3 instruction uses literal";
5547 return false;
5548 }
5549 }
5550
5551 // Special case for writelane - this can break the multiple constant bus rule,
5552 // but still can't use more than one SGPR register
5553 if (Desc.getOpcode() == AMDGPU::V_WRITELANE_B32) {
5554 unsigned SGPRCount = 0;
5555 Register SGPRUsed;
5556
5557 for (int OpIdx : {Src0Idx, Src1Idx}) {
5558 if (OpIdx == -1)
5559 break;
5560
5561 const MachineOperand &MO = MI.getOperand(i: OpIdx);
5562
5563 if (usesConstantBus(MRI, MO, OpInfo: MI.getDesc().operands()[OpIdx])) {
5564 if (MO.isReg() && MO.getReg() != AMDGPU::M0) {
5565 if (MO.getReg() != SGPRUsed)
5566 ++SGPRCount;
5567 SGPRUsed = MO.getReg();
5568 }
5569 }
5570 if (SGPRCount > ST.getConstantBusLimit(Opcode)) {
5571 ErrInfo = "WRITELANE instruction violates constant bus restriction";
5572 return false;
5573 }
5574 }
5575 }
5576
5577 // Verify misc. restrictions on specific instructions.
5578 if (Desc.getOpcode() == AMDGPU::V_DIV_SCALE_F32_e64 ||
5579 Desc.getOpcode() == AMDGPU::V_DIV_SCALE_F64_e64) {
5580 const MachineOperand &Src0 = MI.getOperand(i: Src0Idx);
5581 const MachineOperand &Src1 = MI.getOperand(i: Src1Idx);
5582 const MachineOperand &Src2 = MI.getOperand(i: Src2Idx);
5583 if (Src0.isReg() && Src1.isReg() && Src2.isReg()) {
5584 if (!compareMachineOp(Op0: Src0, Op1: Src1) &&
5585 !compareMachineOp(Op0: Src0, Op1: Src2)) {
5586 ErrInfo = "v_div_scale_{f32|f64} require src0 = src1 or src2";
5587 return false;
5588 }
5589 }
5590 if ((getNamedOperand(MI, OperandName: AMDGPU::OpName::src0_modifiers)->getImm() &
5591 SISrcMods::ABS) ||
5592 (getNamedOperand(MI, OperandName: AMDGPU::OpName::src1_modifiers)->getImm() &
5593 SISrcMods::ABS) ||
5594 (getNamedOperand(MI, OperandName: AMDGPU::OpName::src2_modifiers)->getImm() &
5595 SISrcMods::ABS)) {
5596 ErrInfo = "ABS not allowed in VOP3B instructions";
5597 return false;
5598 }
5599 }
5600
5601 if (isSOP2(MI) || isSOPC(MI)) {
5602 const MachineOperand &Src0 = MI.getOperand(i: Src0Idx);
5603 const MachineOperand &Src1 = MI.getOperand(i: Src1Idx);
5604
5605 if (!isRegOrFI(MO: Src0) && !isRegOrFI(MO: Src1) &&
5606 !isInlineConstant(MO: Src0, OpInfo: Desc.operands()[Src0Idx]) &&
5607 !isInlineConstant(MO: Src1, OpInfo: Desc.operands()[Src1Idx]) &&
5608 !Src0.isIdenticalTo(Other: Src1)) {
5609 ErrInfo = "SOP2/SOPC instruction requires too many immediate constants";
5610 return false;
5611 }
5612 }
5613
5614 if (isSOPK(MI)) {
5615 const auto *Op = getNamedOperand(MI, OperandName: AMDGPU::OpName::simm16);
5616 if (Desc.isBranch()) {
5617 if (!Op->isMBB()) {
5618 ErrInfo = "invalid branch target for SOPK instruction";
5619 return false;
5620 }
5621 } else {
5622 uint64_t Imm = Op->getImm();
5623 if (sopkIsZext(Opcode)) {
5624 if (!isUInt<16>(x: Imm)) {
5625 ErrInfo = "invalid immediate for SOPK instruction";
5626 return false;
5627 }
5628 } else {
5629 if (!isInt<16>(x: Imm)) {
5630 ErrInfo = "invalid immediate for SOPK instruction";
5631 return false;
5632 }
5633 }
5634 }
5635 }
5636
5637 if (Desc.getOpcode() == AMDGPU::V_MOVRELS_B32_e32 ||
5638 Desc.getOpcode() == AMDGPU::V_MOVRELS_B32_e64 ||
5639 Desc.getOpcode() == AMDGPU::V_MOVRELD_B32_e32 ||
5640 Desc.getOpcode() == AMDGPU::V_MOVRELD_B32_e64) {
5641 const bool IsDst = Desc.getOpcode() == AMDGPU::V_MOVRELD_B32_e32 ||
5642 Desc.getOpcode() == AMDGPU::V_MOVRELD_B32_e64;
5643
5644 const unsigned StaticNumOps =
5645 Desc.getNumOperands() + Desc.implicit_uses().size();
5646 const unsigned NumImplicitOps = IsDst ? 2 : 1;
5647
5648 // Require additional implicit operands. This allows a fixup done by the
5649 // post RA scheduler where the main implicit operand is killed and
5650 // implicit-defs are added for sub-registers that remain live after this
5651 // instruction.
5652 if (MI.getNumOperands() < StaticNumOps + NumImplicitOps) {
5653 ErrInfo = "missing implicit register operands";
5654 return false;
5655 }
5656
5657 const MachineOperand *Dst = getNamedOperand(MI, OperandName: AMDGPU::OpName::vdst);
5658 if (IsDst) {
5659 if (!Dst->isUse()) {
5660 ErrInfo = "v_movreld_b32 vdst should be a use operand";
5661 return false;
5662 }
5663
5664 unsigned UseOpIdx;
5665 if (!MI.isRegTiedToUseOperand(DefOpIdx: StaticNumOps, UseOpIdx: &UseOpIdx) ||
5666 UseOpIdx != StaticNumOps + 1) {
5667 ErrInfo = "movrel implicit operands should be tied";
5668 return false;
5669 }
5670 }
5671
5672 const MachineOperand &Src0 = MI.getOperand(i: Src0Idx);
5673 const MachineOperand &ImpUse
5674 = MI.getOperand(i: StaticNumOps + NumImplicitOps - 1);
5675 if (!ImpUse.isReg() || !ImpUse.isUse() ||
5676 !isSubRegOf(TRI: RI, SuperVec: ImpUse, SubReg: IsDst ? *Dst : Src0)) {
5677 ErrInfo = "src0 should be subreg of implicit vector use";
5678 return false;
5679 }
5680 }
5681
5682 // Make sure we aren't losing exec uses in the td files. This mostly requires
5683 // being careful when using let Uses to try to add other use registers.
5684 if (shouldReadExec(MI)) {
5685 if (!MI.hasRegisterImplicitUseOperand(Reg: AMDGPU::EXEC)) {
5686 ErrInfo = "VALU instruction does not implicitly read exec mask";
5687 return false;
5688 }
5689 }
5690
5691 if (isSMRD(MI)) {
5692 if (MI.mayStore() &&
5693 ST.getGeneration() == AMDGPUSubtarget::VOLCANIC_ISLANDS) {
5694 // The register offset form of scalar stores may only use m0 as the
5695 // soffset register.
5696 const MachineOperand *Soff = getNamedOperand(MI, OperandName: AMDGPU::OpName::soffset);
5697 if (Soff && Soff->getReg() != AMDGPU::M0) {
5698 ErrInfo = "scalar stores must use m0 as offset register";
5699 return false;
5700 }
5701 }
5702 }
5703
5704 if (isFLAT(MI) && !ST.hasFlatInstOffsets()) {
5705 const MachineOperand *Offset = getNamedOperand(MI, OperandName: AMDGPU::OpName::offset);
5706 if (Offset->getImm() != 0) {
5707 ErrInfo = "subtarget does not support offsets in flat instructions";
5708 return false;
5709 }
5710 }
5711
5712 if (isDS(MI) && !ST.hasGDS()) {
5713 const MachineOperand *GDSOp = getNamedOperand(MI, OperandName: AMDGPU::OpName::gds);
5714 if (GDSOp && GDSOp->getImm() != 0) {
5715 ErrInfo = "GDS is not supported on this subtarget";
5716 return false;
5717 }
5718 }
5719
5720 if (isImage(MI)) {
5721 const MachineOperand *DimOp = getNamedOperand(MI, OperandName: AMDGPU::OpName::dim);
5722 if (DimOp) {
5723 int VAddr0Idx = AMDGPU::getNamedOperandIdx(Opcode,
5724 Name: AMDGPU::OpName::vaddr0);
5725 AMDGPU::OpName RSrcOpName =
5726 isMIMG(MI) ? AMDGPU::OpName::srsrc : AMDGPU::OpName::rsrc;
5727 int RsrcIdx = AMDGPU::getNamedOperandIdx(Opcode, Name: RSrcOpName);
5728 const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(Opc: Opcode);
5729 const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
5730 AMDGPU::getMIMGBaseOpcodeInfo(BaseOpcode: Info->BaseOpcode);
5731 const AMDGPU::MIMGDimInfo *Dim =
5732 AMDGPU::getMIMGDimInfoByEncoding(DimEnc: DimOp->getImm());
5733
5734 if (!Dim) {
5735 ErrInfo = "dim is out of range";
5736 return false;
5737 }
5738
5739 bool IsA16 = false;
5740 if (ST.hasR128A16()) {
5741 const MachineOperand *R128A16 = getNamedOperand(MI, OperandName: AMDGPU::OpName::r128);
5742 IsA16 = R128A16->getImm() != 0;
5743 } else if (ST.hasA16()) {
5744 const MachineOperand *A16 = getNamedOperand(MI, OperandName: AMDGPU::OpName::a16);
5745 IsA16 = A16->getImm() != 0;
5746 }
5747
5748 bool IsNSA = RsrcIdx - VAddr0Idx > 1;
5749
5750 unsigned AddrWords =
5751 AMDGPU::getAddrSizeMIMGOp(BaseOpcode, Dim, IsA16, IsG16Supported: ST.hasG16());
5752
5753 unsigned VAddrWords;
5754 if (IsNSA) {
5755 VAddrWords = RsrcIdx - VAddr0Idx;
5756 if (ST.hasPartialNSAEncoding() &&
5757 AddrWords > ST.getNSAMaxSize(HasSampler: isVSAMPLE(MI))) {
5758 unsigned LastVAddrIdx = RsrcIdx - 1;
5759 VAddrWords += getOpSize(MI, OpNo: LastVAddrIdx) / 4 - 1;
5760 }
5761 } else {
5762 VAddrWords = getOpSize(MI, OpNo: VAddr0Idx) / 4;
5763 if (AddrWords > 12)
5764 AddrWords = 16;
5765 }
5766
5767 if (VAddrWords != AddrWords) {
5768 LLVM_DEBUG(dbgs() << "bad vaddr size, expected " << AddrWords
5769 << " but got " << VAddrWords << "\n");
5770 ErrInfo = "bad vaddr size";
5771 return false;
5772 }
5773 }
5774 }
5775
5776 const MachineOperand *DppCt = getNamedOperand(MI, OperandName: AMDGPU::OpName::dpp_ctrl);
5777 if (DppCt) {
5778 using namespace AMDGPU::DPP;
5779
5780 unsigned DC = DppCt->getImm();
5781 if (DC == DppCtrl::DPP_UNUSED1 || DC == DppCtrl::DPP_UNUSED2 ||
5782 DC == DppCtrl::DPP_UNUSED3 || DC > DppCtrl::DPP_LAST ||
5783 (DC >= DppCtrl::DPP_UNUSED4_FIRST && DC <= DppCtrl::DPP_UNUSED4_LAST) ||
5784 (DC >= DppCtrl::DPP_UNUSED5_FIRST && DC <= DppCtrl::DPP_UNUSED5_LAST) ||
5785 (DC >= DppCtrl::DPP_UNUSED6_FIRST && DC <= DppCtrl::DPP_UNUSED6_LAST) ||
5786 (DC >= DppCtrl::DPP_UNUSED7_FIRST && DC <= DppCtrl::DPP_UNUSED7_LAST) ||
5787 (DC >= DppCtrl::DPP_UNUSED8_FIRST && DC <= DppCtrl::DPP_UNUSED8_LAST)) {
5788 ErrInfo = "Invalid dpp_ctrl value";
5789 return false;
5790 }
5791 if (DC >= DppCtrl::WAVE_SHL1 && DC <= DppCtrl::WAVE_ROR1 &&
5792 ST.getGeneration() >= AMDGPUSubtarget::GFX10) {
5793 ErrInfo = "Invalid dpp_ctrl value: "
5794 "wavefront shifts are not supported on GFX10+";
5795 return false;
5796 }
5797 if (DC >= DppCtrl::BCAST15 && DC <= DppCtrl::BCAST31 &&
5798 ST.getGeneration() >= AMDGPUSubtarget::GFX10) {
5799 ErrInfo = "Invalid dpp_ctrl value: "
5800 "broadcasts are not supported on GFX10+";
5801 return false;
5802 }
5803 if (DC >= DppCtrl::ROW_SHARE_FIRST && DC <= DppCtrl::ROW_XMASK_LAST &&
5804 ST.getGeneration() < AMDGPUSubtarget::GFX10) {
5805 if (DC >= DppCtrl::ROW_NEWBCAST_FIRST &&
5806 DC <= DppCtrl::ROW_NEWBCAST_LAST &&
5807 !ST.hasGFX90AInsts()) {
5808 ErrInfo = "Invalid dpp_ctrl value: "
5809 "row_newbroadcast/row_share is not supported before "
5810 "GFX90A/GFX10";
5811 return false;
5812 }
5813 if (DC > DppCtrl::ROW_NEWBCAST_LAST || !ST.hasGFX90AInsts()) {
5814 ErrInfo = "Invalid dpp_ctrl value: "
5815 "row_share and row_xmask are not supported before GFX10";
5816 return false;
5817 }
5818 }
5819
5820 if (Opcode != AMDGPU::V_MOV_B64_DPP_PSEUDO &&
5821 !AMDGPU::isLegalDPALU_DPPControl(ST, DC) &&
5822 AMDGPU::isDPALU_DPP(OpDesc: Desc, MII: *this, ST)) {
5823 ErrInfo = "Invalid dpp_ctrl value: "
5824 "DP ALU dpp only support row_newbcast";
5825 return false;
5826 }
5827 }
5828
5829 if ((MI.mayStore() || MI.mayLoad()) && !isVGPRSpill(MI)) {
5830 const MachineOperand *Dst = getNamedOperand(MI, OperandName: AMDGPU::OpName::vdst);
5831 AMDGPU::OpName DataName =
5832 isDS(Opcode) ? AMDGPU::OpName::data0 : AMDGPU::OpName::vdata;
5833 const MachineOperand *Data = getNamedOperand(MI, OperandName: DataName);
5834 const MachineOperand *Data2 = getNamedOperand(MI, OperandName: AMDGPU::OpName::data1);
5835 if (Data && !Data->isReg())
5836 Data = nullptr;
5837
5838 if (ST.hasGFX90AInsts()) {
5839 if (Dst && Data && !Dst->isTied() && !Data->isTied() &&
5840 (RI.isAGPR(MRI, Reg: Dst->getReg()) != RI.isAGPR(MRI, Reg: Data->getReg()))) {
5841 ErrInfo = "Invalid register class: "
5842 "vdata and vdst should be both VGPR or AGPR";
5843 return false;
5844 }
5845 if (Data && Data2 &&
5846 (RI.isAGPR(MRI, Reg: Data->getReg()) != RI.isAGPR(MRI, Reg: Data2->getReg()))) {
5847 ErrInfo = "Invalid register class: "
5848 "both data operands should be VGPR or AGPR";
5849 return false;
5850 }
5851 } else {
5852 if ((Dst && RI.isAGPR(MRI, Reg: Dst->getReg())) ||
5853 (Data && RI.isAGPR(MRI, Reg: Data->getReg())) ||
5854 (Data2 && RI.isAGPR(MRI, Reg: Data2->getReg()))) {
5855 ErrInfo = "Invalid register class: "
5856 "agpr loads and stores not supported on this GPU";
5857 return false;
5858 }
5859 }
5860 }
5861
5862 if (ST.needsAlignedVGPRs()) {
5863 const auto isAlignedReg = [&MI, &MRI, this](AMDGPU::OpName OpName) -> bool {
5864 const MachineOperand *Op = getNamedOperand(MI, OperandName: OpName);
5865 if (!Op)
5866 return true;
5867 Register Reg = Op->getReg();
5868 if (Reg.isPhysical())
5869 return !(RI.getHWRegIndex(Reg) & 1);
5870 const TargetRegisterClass &RC = *MRI.getRegClass(Reg);
5871 return RI.getRegSizeInBits(RC) > 32 && RI.isProperlyAlignedRC(RC) &&
5872 !(RI.getChannelFromSubReg(SubReg: Op->getSubReg()) & 1);
5873 };
5874
5875 if (Opcode == AMDGPU::DS_GWS_INIT || Opcode == AMDGPU::DS_GWS_SEMA_BR ||
5876 Opcode == AMDGPU::DS_GWS_BARRIER) {
5877
5878 if (!isAlignedReg(AMDGPU::OpName::data0)) {
5879 ErrInfo = "Subtarget requires even aligned vector registers "
5880 "for DS_GWS instructions";
5881 return false;
5882 }
5883 }
5884
5885 if (isMIMG(MI)) {
5886 if (!isAlignedReg(AMDGPU::OpName::vaddr)) {
5887 ErrInfo = "Subtarget requires even aligned vector registers "
5888 "for vaddr operand of image instructions";
5889 return false;
5890 }
5891 }
5892 }
5893
5894 if (Opcode == AMDGPU::V_ACCVGPR_WRITE_B32_e64 && !ST.hasGFX90AInsts()) {
5895 const MachineOperand *Src = getNamedOperand(MI, OperandName: AMDGPU::OpName::src0);
5896 if (Src->isReg() && RI.isSGPRReg(MRI, Reg: Src->getReg())) {
5897 ErrInfo = "Invalid register class: "
5898 "v_accvgpr_write with an SGPR is not supported on this GPU";
5899 return false;
5900 }
5901 }
5902
5903 if (Desc.getOpcode() == AMDGPU::G_AMDGPU_WAVE_ADDRESS) {
5904 const MachineOperand &SrcOp = MI.getOperand(i: 1);
5905 if (!SrcOp.isReg() || SrcOp.getReg().isVirtual()) {
5906 ErrInfo = "pseudo expects only physical SGPRs";
5907 return false;
5908 }
5909 }
5910
5911 if (const MachineOperand *CPol = getNamedOperand(MI, OperandName: AMDGPU::OpName::cpol)) {
5912 if (CPol->getImm() & AMDGPU::CPol::SCAL) {
5913 if (!ST.hasScaleOffset()) {
5914 ErrInfo = "Subtarget does not support offset scaling";
5915 return false;
5916 }
5917 if (!AMDGPU::supportsScaleOffset(MII: *this, Opcode: MI.getOpcode())) {
5918 ErrInfo = "Instruction does not support offset scaling";
5919 return false;
5920 }
5921 }
5922 }
5923
5924 // See SIInstrInfo::isLegalGFX12PlusPackedMathFP32Operand for more
5925 // information.
5926 if (AMDGPU::isPackedFP32Inst(Opc: Opcode) && AMDGPU::isGFX12Plus(STI: ST)) {
5927 for (unsigned I = 0; I < 3; ++I) {
5928 if (!isLegalGFX12PlusPackedMathFP32Operand(MRI, MI, SrcN: I))
5929 return false;
5930 }
5931 }
5932
5933 if (ST.hasFlatScratchHiInB64InstHazard() && isSALU(MI) &&
5934 MI.readsRegister(Reg: AMDGPU::SRC_FLAT_SCRATCH_BASE_HI, TRI: nullptr)) {
5935 const MachineOperand *Dst = getNamedOperand(MI, OperandName: AMDGPU::OpName::sdst);
5936 if ((Dst && RI.getRegClassForReg(MRI, Reg: Dst->getReg()) ==
5937 &AMDGPU::SReg_64RegClass) ||
5938 Opcode == AMDGPU::S_BITCMP0_B64 || Opcode == AMDGPU::S_BITCMP1_B64) {
5939 ErrInfo = "Instruction cannot read flat_scratch_base_hi";
5940 return false;
5941 }
5942 }
5943
5944 return true;
5945}
5946
5947// It is more readable to list mapped opcodes on the same line.
5948// clang-format off
5949
5950unsigned SIInstrInfo::getVALUOp(const MachineInstr &MI) const {
5951 switch (MI.getOpcode()) {
5952 default: return AMDGPU::INSTRUCTION_LIST_END;
5953 case AMDGPU::REG_SEQUENCE: return AMDGPU::REG_SEQUENCE;
5954 case AMDGPU::COPY: return AMDGPU::COPY;
5955 case AMDGPU::PHI: return AMDGPU::PHI;
5956 case AMDGPU::INSERT_SUBREG: return AMDGPU::INSERT_SUBREG;
5957 case AMDGPU::WQM: return AMDGPU::WQM;
5958 case AMDGPU::SOFT_WQM: return AMDGPU::SOFT_WQM;
5959 case AMDGPU::STRICT_WWM: return AMDGPU::STRICT_WWM;
5960 case AMDGPU::STRICT_WQM: return AMDGPU::STRICT_WQM;
5961 case AMDGPU::S_MOV_B32: {
5962 const MachineRegisterInfo &MRI = MI.getMF()->getRegInfo();
5963 return MI.getOperand(i: 1).isReg() ||
5964 RI.isAGPR(MRI, Reg: MI.getOperand(i: 0).getReg()) ?
5965 AMDGPU::COPY : AMDGPU::V_MOV_B32_e32;
5966 }
5967 case AMDGPU::S_ADD_I32:
5968 return ST.hasAddNoCarryInsts() ? AMDGPU::V_ADD_U32_e64 : AMDGPU::V_ADD_CO_U32_e32;
5969 case AMDGPU::S_ADDC_U32:
5970 return AMDGPU::V_ADDC_U32_e32;
5971 case AMDGPU::S_SUB_I32:
5972 return ST.hasAddNoCarryInsts() ? AMDGPU::V_SUB_U32_e64 : AMDGPU::V_SUB_CO_U32_e32;
5973 // FIXME: These are not consistently handled, and selected when the carry is
5974 // used.
5975 case AMDGPU::S_ADD_U32:
5976 return AMDGPU::V_ADD_CO_U32_e32;
5977 case AMDGPU::S_SUB_U32:
5978 return AMDGPU::V_SUB_CO_U32_e32;
5979 case AMDGPU::S_ADD_U64_PSEUDO:
5980 return AMDGPU::V_ADD_U64_PSEUDO;
5981 case AMDGPU::S_SUB_U64_PSEUDO:
5982 return AMDGPU::V_SUB_U64_PSEUDO;
5983 case AMDGPU::S_SUBB_U32: return AMDGPU::V_SUBB_U32_e32;
5984 case AMDGPU::S_MUL_I32: return AMDGPU::V_MUL_LO_U32_e64;
5985 case AMDGPU::S_MUL_HI_U32: return AMDGPU::V_MUL_HI_U32_e64;
5986 case AMDGPU::S_MUL_HI_I32: return AMDGPU::V_MUL_HI_I32_e64;
5987 case AMDGPU::S_AND_B32: return AMDGPU::V_AND_B32_e64;
5988 case AMDGPU::S_OR_B32: return AMDGPU::V_OR_B32_e64;
5989 case AMDGPU::S_XOR_B32: return AMDGPU::V_XOR_B32_e64;
5990 case AMDGPU::S_XNOR_B32:
5991 return ST.hasDLInsts() ? AMDGPU::V_XNOR_B32_e64 : AMDGPU::INSTRUCTION_LIST_END;
5992 case AMDGPU::S_MIN_I32: return AMDGPU::V_MIN_I32_e64;
5993 case AMDGPU::S_MIN_U32: return AMDGPU::V_MIN_U32_e64;
5994 case AMDGPU::S_MAX_I32: return AMDGPU::V_MAX_I32_e64;
5995 case AMDGPU::S_MAX_U32: return AMDGPU::V_MAX_U32_e64;
5996 case AMDGPU::S_ASHR_I32: return AMDGPU::V_ASHR_I32_e32;
5997 case AMDGPU::S_ASHR_I64: return AMDGPU::V_ASHR_I64_e64;
5998 case AMDGPU::S_LSHL_B32: return AMDGPU::V_LSHL_B32_e32;
5999 case AMDGPU::S_LSHL_B64: return AMDGPU::V_LSHL_B64_e64;
6000 case AMDGPU::S_LSHR_B32: return AMDGPU::V_LSHR_B32_e32;
6001 case AMDGPU::S_LSHR_B64: return AMDGPU::V_LSHR_B64_e64;
6002 case AMDGPU::S_SEXT_I32_I8: return AMDGPU::V_BFE_I32_e64;
6003 case AMDGPU::S_SEXT_I32_I16: return AMDGPU::V_BFE_I32_e64;
6004 case AMDGPU::S_BFE_U32: return AMDGPU::V_BFE_U32_e64;
6005 case AMDGPU::S_BFE_I32: return AMDGPU::V_BFE_I32_e64;
6006 case AMDGPU::S_BFM_B32: return AMDGPU::V_BFM_B32_e64;
6007 case AMDGPU::S_BREV_B32: return AMDGPU::V_BFREV_B32_e32;
6008 case AMDGPU::S_NOT_B32: return AMDGPU::V_NOT_B32_e32;
6009 case AMDGPU::S_NOT_B64: return AMDGPU::V_NOT_B32_e32;
6010 case AMDGPU::S_CMP_EQ_I32: return AMDGPU::V_CMP_EQ_I32_e64;
6011 case AMDGPU::S_CMP_LG_I32: return AMDGPU::V_CMP_NE_I32_e64;
6012 case AMDGPU::S_CMP_GT_I32: return AMDGPU::V_CMP_GT_I32_e64;
6013 case AMDGPU::S_CMP_GE_I32: return AMDGPU::V_CMP_GE_I32_e64;
6014 case AMDGPU::S_CMP_LT_I32: return AMDGPU::V_CMP_LT_I32_e64;
6015 case AMDGPU::S_CMP_LE_I32: return AMDGPU::V_CMP_LE_I32_e64;
6016 case AMDGPU::S_CMP_EQ_U32: return AMDGPU::V_CMP_EQ_U32_e64;
6017 case AMDGPU::S_CMP_LG_U32: return AMDGPU::V_CMP_NE_U32_e64;
6018 case AMDGPU::S_CMP_GT_U32: return AMDGPU::V_CMP_GT_U32_e64;
6019 case AMDGPU::S_CMP_GE_U32: return AMDGPU::V_CMP_GE_U32_e64;
6020 case AMDGPU::S_CMP_LT_U32: return AMDGPU::V_CMP_LT_U32_e64;
6021 case AMDGPU::S_CMP_LE_U32: return AMDGPU::V_CMP_LE_U32_e64;
6022 case AMDGPU::S_CMP_EQ_U64: return AMDGPU::V_CMP_EQ_U64_e64;
6023 case AMDGPU::S_CMP_LG_U64: return AMDGPU::V_CMP_NE_U64_e64;
6024 case AMDGPU::S_BCNT1_I32_B32: return AMDGPU::V_BCNT_U32_B32_e64;
6025 case AMDGPU::S_FF1_I32_B32: return AMDGPU::V_FFBL_B32_e32;
6026 case AMDGPU::S_FLBIT_I32_B32: return AMDGPU::V_FFBH_U32_e32;
6027 case AMDGPU::S_FLBIT_I32: return AMDGPU::V_FFBH_I32_e64;
6028 case AMDGPU::S_CBRANCH_SCC0: return AMDGPU::S_CBRANCH_VCCZ;
6029 case AMDGPU::S_CBRANCH_SCC1: return AMDGPU::S_CBRANCH_VCCNZ;
6030 case AMDGPU::S_CVT_F32_I32: return AMDGPU::V_CVT_F32_I32_e64;
6031 case AMDGPU::S_CVT_F32_U32: return AMDGPU::V_CVT_F32_U32_e64;
6032 case AMDGPU::S_CVT_I32_F32: return AMDGPU::V_CVT_I32_F32_e64;
6033 case AMDGPU::S_CVT_U32_F32: return AMDGPU::V_CVT_U32_F32_e64;
6034 case AMDGPU::S_CVT_F32_F16:
6035 case AMDGPU::S_CVT_HI_F32_F16:
6036 return ST.useRealTrue16Insts() ? AMDGPU::V_CVT_F32_F16_t16_e64
6037 : AMDGPU::V_CVT_F32_F16_fake16_e64;
6038 case AMDGPU::S_CVT_F16_F32:
6039 return ST.useRealTrue16Insts() ? AMDGPU::V_CVT_F16_F32_t16_e64
6040 : AMDGPU::V_CVT_F16_F32_fake16_e64;
6041 case AMDGPU::S_CEIL_F32: return AMDGPU::V_CEIL_F32_e64;
6042 case AMDGPU::S_FLOOR_F32: return AMDGPU::V_FLOOR_F32_e64;
6043 case AMDGPU::S_TRUNC_F32: return AMDGPU::V_TRUNC_F32_e64;
6044 case AMDGPU::S_RNDNE_F32: return AMDGPU::V_RNDNE_F32_e64;
6045 case AMDGPU::S_CEIL_F16:
6046 return ST.useRealTrue16Insts() ? AMDGPU::V_CEIL_F16_t16_e64
6047 : AMDGPU::V_CEIL_F16_fake16_e64;
6048 case AMDGPU::S_FLOOR_F16:
6049 return ST.useRealTrue16Insts() ? AMDGPU::V_FLOOR_F16_t16_e64
6050 : AMDGPU::V_FLOOR_F16_fake16_e64;
6051 case AMDGPU::S_TRUNC_F16:
6052 return ST.useRealTrue16Insts() ? AMDGPU::V_TRUNC_F16_t16_e64
6053 : AMDGPU::V_TRUNC_F16_fake16_e64;
6054 case AMDGPU::S_RNDNE_F16:
6055 return ST.useRealTrue16Insts() ? AMDGPU::V_RNDNE_F16_t16_e64
6056 : AMDGPU::V_RNDNE_F16_fake16_e64;
6057 case AMDGPU::S_ADD_F32: return AMDGPU::V_ADD_F32_e64;
6058 case AMDGPU::S_SUB_F32: return AMDGPU::V_SUB_F32_e64;
6059 case AMDGPU::S_MIN_F32: return AMDGPU::V_MIN_F32_e64;
6060 case AMDGPU::S_MAX_F32: return AMDGPU::V_MAX_F32_e64;
6061 case AMDGPU::S_MINIMUM_F32: return AMDGPU::V_MINIMUM_F32_e64;
6062 case AMDGPU::S_MAXIMUM_F32: return AMDGPU::V_MAXIMUM_F32_e64;
6063 case AMDGPU::S_MUL_F32: return AMDGPU::V_MUL_F32_e64;
6064 case AMDGPU::S_ADD_F16:
6065 return ST.useRealTrue16Insts() ? AMDGPU::V_ADD_F16_t16_e64
6066 : AMDGPU::V_ADD_F16_fake16_e64;
6067 case AMDGPU::S_SUB_F16:
6068 return ST.useRealTrue16Insts() ? AMDGPU::V_SUB_F16_t16_e64
6069 : AMDGPU::V_SUB_F16_fake16_e64;
6070 case AMDGPU::S_MIN_F16:
6071 return ST.useRealTrue16Insts() ? AMDGPU::V_MIN_F16_t16_e64
6072 : AMDGPU::V_MIN_F16_fake16_e64;
6073 case AMDGPU::S_MAX_F16:
6074 return ST.useRealTrue16Insts() ? AMDGPU::V_MAX_F16_t16_e64
6075 : AMDGPU::V_MAX_F16_fake16_e64;
6076 case AMDGPU::S_MINIMUM_F16:
6077 return ST.useRealTrue16Insts() ? AMDGPU::V_MINIMUM_F16_t16_e64
6078 : AMDGPU::V_MINIMUM_F16_fake16_e64;
6079 case AMDGPU::S_MAXIMUM_F16:
6080 return ST.useRealTrue16Insts() ? AMDGPU::V_MAXIMUM_F16_t16_e64
6081 : AMDGPU::V_MAXIMUM_F16_fake16_e64;
6082 case AMDGPU::S_MUL_F16:
6083 return ST.useRealTrue16Insts() ? AMDGPU::V_MUL_F16_t16_e64
6084 : AMDGPU::V_MUL_F16_fake16_e64;
6085 case AMDGPU::S_CVT_PK_RTZ_F16_F32: return AMDGPU::V_CVT_PKRTZ_F16_F32_e64;
6086 case AMDGPU::S_FMAC_F32: return AMDGPU::V_FMAC_F32_e64;
6087 case AMDGPU::S_FMAC_F16:
6088 return ST.useRealTrue16Insts() ? AMDGPU::V_FMAC_F16_t16_e64
6089 : AMDGPU::V_FMAC_F16_fake16_e64;
6090 case AMDGPU::S_FMAMK_F32: return AMDGPU::V_FMAMK_F32;
6091 case AMDGPU::S_FMAAK_F32: return AMDGPU::V_FMAAK_F32;
6092 case AMDGPU::S_CMP_LT_F32: return AMDGPU::V_CMP_LT_F32_e64;
6093 case AMDGPU::S_CMP_EQ_F32: return AMDGPU::V_CMP_EQ_F32_e64;
6094 case AMDGPU::S_CMP_LE_F32: return AMDGPU::V_CMP_LE_F32_e64;
6095 case AMDGPU::S_CMP_GT_F32: return AMDGPU::V_CMP_GT_F32_e64;
6096 case AMDGPU::S_CMP_LG_F32: return AMDGPU::V_CMP_LG_F32_e64;
6097 case AMDGPU::S_CMP_GE_F32: return AMDGPU::V_CMP_GE_F32_e64;
6098 case AMDGPU::S_CMP_O_F32: return AMDGPU::V_CMP_O_F32_e64;
6099 case AMDGPU::S_CMP_U_F32: return AMDGPU::V_CMP_U_F32_e64;
6100 case AMDGPU::S_CMP_NGE_F32: return AMDGPU::V_CMP_NGE_F32_e64;
6101 case AMDGPU::S_CMP_NLG_F32: return AMDGPU::V_CMP_NLG_F32_e64;
6102 case AMDGPU::S_CMP_NGT_F32: return AMDGPU::V_CMP_NGT_F32_e64;
6103 case AMDGPU::S_CMP_NLE_F32: return AMDGPU::V_CMP_NLE_F32_e64;
6104 case AMDGPU::S_CMP_NEQ_F32: return AMDGPU::V_CMP_NEQ_F32_e64;
6105 case AMDGPU::S_CMP_NLT_F32: return AMDGPU::V_CMP_NLT_F32_e64;
6106 case AMDGPU::S_CMP_LT_F16:
6107 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_LT_F16_t16_e64
6108 : AMDGPU::V_CMP_LT_F16_fake16_e64;
6109 case AMDGPU::S_CMP_EQ_F16:
6110 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_EQ_F16_t16_e64
6111 : AMDGPU::V_CMP_EQ_F16_fake16_e64;
6112 case AMDGPU::S_CMP_LE_F16:
6113 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_LE_F16_t16_e64
6114 : AMDGPU::V_CMP_LE_F16_fake16_e64;
6115 case AMDGPU::S_CMP_GT_F16:
6116 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_GT_F16_t16_e64
6117 : AMDGPU::V_CMP_GT_F16_fake16_e64;
6118 case AMDGPU::S_CMP_LG_F16:
6119 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_LG_F16_t16_e64
6120 : AMDGPU::V_CMP_LG_F16_fake16_e64;
6121 case AMDGPU::S_CMP_GE_F16:
6122 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_GE_F16_t16_e64
6123 : AMDGPU::V_CMP_GE_F16_fake16_e64;
6124 case AMDGPU::S_CMP_O_F16:
6125 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_O_F16_t16_e64
6126 : AMDGPU::V_CMP_O_F16_fake16_e64;
6127 case AMDGPU::S_CMP_U_F16:
6128 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_U_F16_t16_e64
6129 : AMDGPU::V_CMP_U_F16_fake16_e64;
6130 case AMDGPU::S_CMP_NGE_F16:
6131 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_NGE_F16_t16_e64
6132 : AMDGPU::V_CMP_NGE_F16_fake16_e64;
6133 case AMDGPU::S_CMP_NLG_F16:
6134 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_NLG_F16_t16_e64
6135 : AMDGPU::V_CMP_NLG_F16_fake16_e64;
6136 case AMDGPU::S_CMP_NGT_F16:
6137 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_NGT_F16_t16_e64
6138 : AMDGPU::V_CMP_NGT_F16_fake16_e64;
6139 case AMDGPU::S_CMP_NLE_F16:
6140 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_NLE_F16_t16_e64
6141 : AMDGPU::V_CMP_NLE_F16_fake16_e64;
6142 case AMDGPU::S_CMP_NEQ_F16:
6143 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_NEQ_F16_t16_e64
6144 : AMDGPU::V_CMP_NEQ_F16_fake16_e64;
6145 case AMDGPU::S_CMP_NLT_F16:
6146 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_NLT_F16_t16_e64
6147 : AMDGPU::V_CMP_NLT_F16_fake16_e64;
6148 case AMDGPU::V_S_EXP_F32_e64: return AMDGPU::V_EXP_F32_e64;
6149 case AMDGPU::V_S_EXP_F16_e64:
6150 return ST.useRealTrue16Insts() ? AMDGPU::V_EXP_F16_t16_e64
6151 : AMDGPU::V_EXP_F16_fake16_e64;
6152 case AMDGPU::V_S_LOG_F32_e64: return AMDGPU::V_LOG_F32_e64;
6153 case AMDGPU::V_S_LOG_F16_e64:
6154 return ST.useRealTrue16Insts() ? AMDGPU::V_LOG_F16_t16_e64
6155 : AMDGPU::V_LOG_F16_fake16_e64;
6156 case AMDGPU::V_S_RCP_F32_e64: return AMDGPU::V_RCP_F32_e64;
6157 case AMDGPU::V_S_RCP_F16_e64:
6158 return ST.useRealTrue16Insts() ? AMDGPU::V_RCP_F16_t16_e64
6159 : AMDGPU::V_RCP_F16_fake16_e64;
6160 case AMDGPU::V_S_RSQ_F32_e64: return AMDGPU::V_RSQ_F32_e64;
6161 case AMDGPU::V_S_RSQ_F16_e64:
6162 return ST.useRealTrue16Insts() ? AMDGPU::V_RSQ_F16_t16_e64
6163 : AMDGPU::V_RSQ_F16_fake16_e64;
6164 case AMDGPU::V_S_SQRT_F32_e64: return AMDGPU::V_SQRT_F32_e64;
6165 case AMDGPU::V_S_SQRT_F16_e64:
6166 return ST.useRealTrue16Insts() ? AMDGPU::V_SQRT_F16_t16_e64
6167 : AMDGPU::V_SQRT_F16_fake16_e64;
6168 }
6169 llvm_unreachable(
6170 "Unexpected scalar opcode without corresponding vector one!");
6171}
6172
6173// clang-format on
6174
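// Save the current value of EXEC into \p Reg and then set EXEC to all ones,
// inserting the instructions at \p MBBI. If SCC is live, two moves are used
// instead of S_OR_SAVEEXEC to avoid clobbering SCC.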
6175void SIInstrInfo::insertScratchExecCopy(MachineFunction &MF,
6176 MachineBasicBlock &MBB,
6177 MachineBasicBlock::iterator MBBI,
6178 const DebugLoc &DL, Register Reg,
6179 bool IsSCCLive,
6180 SlotIndexes *Indexes) const {
6181 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
6182 const SIInstrInfo *TII = ST.getInstrInfo();
6183 const AMDGPU::LaneMaskConstants &LMC = AMDGPU::LaneMaskConstants::get(ST);
6184 if (IsSCCLive) {
6185 // Insert two move instructions, one to save the original value of EXEC and
6186 // the other to turn on all bits in EXEC. This is required because we can't
6187 // use S_OR_SAVEEXEC here, since it would clobber SCC.
6188 auto StoreExecMI = BuildMI(BB&: MBB, I: MBBI, MIMD: DL, MCID: TII->get(Opcode: LMC.MovOpc), DestReg: Reg)
6189 .addReg(RegNo: LMC.ExecReg, Flags: RegState::Kill);
6190 auto FlipExecMI =
6191 BuildMI(BB&: MBB, I: MBBI, MIMD: DL, MCID: TII->get(Opcode: LMC.MovOpc), DestReg: LMC.ExecReg).addImm(Val: -1);
6192 if (Indexes) {
6193 Indexes->insertMachineInstrInMaps(MI&: *StoreExecMI);
6194 Indexes->insertMachineInstrInMaps(MI&: *FlipExecMI);
6195 }
6196 } else {
6197 auto SaveExec =
6198 BuildMI(BB&: MBB, I: MBBI, MIMD: DL, MCID: TII->get(Opcode: LMC.OrSaveExecOpc), DestReg: Reg).addImm(Val: -1);
6199 SaveExec->getOperand(i: 3).setIsDead(); // Mark SCC as dead.
6200 if (Indexes)
6201 Indexes->insertMachineInstrInMaps(MI&: *SaveExec);
6202 }
6203}
6204
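// Restore EXEC from \p Reg, which was previously filled by
// insertScratchExecCopy.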
6205void SIInstrInfo::restoreExec(MachineFunction &MF, MachineBasicBlock &MBB,
6206 MachineBasicBlock::iterator MBBI,
6207 const DebugLoc &DL, Register Reg,
6208 SlotIndexes *Indexes) const {
6209 const AMDGPU::LaneMaskConstants &LMC = AMDGPU::LaneMaskConstants::get(ST);
6210 auto ExecRestoreMI = BuildMI(BB&: MBB, I: MBBI, MIMD: DL, MCID: get(Opcode: LMC.MovOpc), DestReg: LMC.ExecReg)
6211 .addReg(RegNo: Reg, Flags: RegState::Kill);
6212 if (Indexes)
6213 Indexes->insertMachineInstrInMaps(MI&: *ExecRestoreMI);
6214}
6215
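// Find the SI_WHOLE_WAVE_FUNC_SETUP (or its GlobalISel equivalent) instruction
// in the entry block of a whole wave function.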
6216MachineInstr *
6217SIInstrInfo::getWholeWaveFunctionSetup(MachineFunction &MF) const {
6218 assert(MF.getInfo<SIMachineFunctionInfo>()->isWholeWaveFunction() &&
6219 "Not a whole wave func");
6220 MachineBasicBlock &MBB = *MF.begin();
6221 for (MachineInstr &MI : MBB)
6222 if (MI.getOpcode() == AMDGPU::SI_WHOLE_WAVE_FUNC_SETUP ||
6223 MI.getOpcode() == AMDGPU::G_AMDGPU_WHOLE_WAVE_FUNC_SETUP)
6224 return &MI;
6225
6226 llvm_unreachable("Couldn't find SI_WHOLE_WAVE_FUNC_SETUP instruction");
6227}
6228
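// Return the register class required for operand \p OpNo of \p MI. If the
// operand has no fixed class (or the instruction is variadic), the class is
// derived from the register currently assigned to the operand.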
6229const TargetRegisterClass *SIInstrInfo::getOpRegClass(const MachineInstr &MI,
6230 unsigned OpNo) const {
6231 const MCInstrDesc &Desc = get(Opcode: MI.getOpcode());
6232 if (MI.isVariadic() || OpNo >= Desc.getNumOperands() ||
6233 Desc.operands()[OpNo].RegClass == -1) {
6234 Register Reg = MI.getOperand(i: OpNo).getReg();
6235
6236 if (Reg.isVirtual()) {
6237 const MachineRegisterInfo &MRI = MI.getMF()->getRegInfo();
6238 return MRI.getRegClass(Reg);
6239 }
6240 return RI.getPhysRegBaseClass(Reg);
6241 }
6242
6243 int16_t RegClass = getOpRegClassID(OpInfo: Desc.operands()[OpNo]);
6244 return RegClass < 0 ? nullptr : RI.getRegClass(i: RegClass);
6245}
6246
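// Legalize operand \p OpIdx by materializing it into a new virtual register of
// the equivalent VGPR class and rewriting the operand to use that register.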
6247void SIInstrInfo::legalizeOpWithMove(MachineInstr &MI, unsigned OpIdx) const {
6248 MachineBasicBlock::iterator I = MI;
6249 MachineBasicBlock *MBB = MI.getParent();
6250 MachineOperand &MO = MI.getOperand(i: OpIdx);
6251 MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
6252 unsigned RCID = getOpRegClassID(OpInfo: get(Opcode: MI.getOpcode()).operands()[OpIdx]);
6253 const TargetRegisterClass *RC = RI.getRegClass(i: RCID);
6254 unsigned Size = RI.getRegSizeInBits(RC: *RC);
6255 unsigned Opcode = (Size == 64) ? AMDGPU::V_MOV_B64_PSEUDO
6256 : Size == 16 ? AMDGPU::V_MOV_B16_t16_e64
6257 : AMDGPU::V_MOV_B32_e32;
6258 if (MO.isReg())
6259 Opcode = AMDGPU::COPY;
6260 else if (RI.isSGPRClass(RC))
6261 Opcode = (Size == 64) ? AMDGPU::S_MOV_B64 : AMDGPU::S_MOV_B32;
6262
6263 const TargetRegisterClass *VRC = RI.getEquivalentVGPRClass(SRC: RC);
6264 Register Reg = MRI.createVirtualRegister(RegClass: VRC);
6265 DebugLoc DL = MBB->findDebugLoc(MBBI: I);
6266 BuildMI(BB&: *MI.getParent(), I, MIMD: DL, MCID: get(Opcode), DestReg: Reg).add(MO);
6267 MO.ChangeToRegister(Reg, isDef: false);
6268}
6269
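// Extract subregister \p SubIdx of \p SuperReg into a new virtual register of
// class \p SubRC. For physical registers the subregister itself is returned.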
6270unsigned SIInstrInfo::buildExtractSubReg(
6271 MachineBasicBlock::iterator MI, MachineRegisterInfo &MRI,
6272 const MachineOperand &SuperReg, const TargetRegisterClass *SuperRC,
6273 unsigned SubIdx, const TargetRegisterClass *SubRC) const {
6274 if (!SuperReg.getReg().isVirtual())
6275 return RI.getSubReg(Reg: SuperReg.getReg(), Idx: SubIdx);
6276
6277 MachineBasicBlock *MBB = MI->getParent();
6278 const DebugLoc &DL = MI->getDebugLoc();
6279 Register SubReg = MRI.createVirtualRegister(RegClass: SubRC);
6280
6281 unsigned NewSubIdx = RI.composeSubRegIndices(a: SuperReg.getSubReg(), b: SubIdx);
6282 BuildMI(BB&: *MBB, I: MI, MIMD: DL, MCID: get(Opcode: TargetOpcode::COPY), DestReg: SubReg)
6283 .addReg(RegNo: SuperReg.getReg(), Flags: {}, SubReg: NewSubIdx);
6284 return SubReg;
6285}
6286
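// Like buildExtractSubReg, but immediates are handled by returning the
// corresponding 32-bit half of the value.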
6287MachineOperand SIInstrInfo::buildExtractSubRegOrImm(
6288 MachineBasicBlock::iterator MII, MachineRegisterInfo &MRI,
6289 const MachineOperand &Op, const TargetRegisterClass *SuperRC,
6290 unsigned SubIdx, const TargetRegisterClass *SubRC) const {
6291 if (Op.isImm()) {
6292 if (SubIdx == AMDGPU::sub0)
6293 return MachineOperand::CreateImm(Val: static_cast<int32_t>(Op.getImm()));
6294 if (SubIdx == AMDGPU::sub1)
6295 return MachineOperand::CreateImm(Val: static_cast<int32_t>(Op.getImm() >> 32));
6296
6297 llvm_unreachable("Unhandled register index for immediate");
6298 }
6299
6300 unsigned SubReg = buildExtractSubReg(MI: MII, MRI, SuperReg: Op, SuperRC,
6301 SubIdx, SubRC);
6302 return MachineOperand::CreateReg(Reg: SubReg, isDef: false);
6303}
6304
6305// Change the order of operands from (0, 1, 2) to (0, 2, 1)
6306void SIInstrInfo::swapOperands(MachineInstr &Inst) const {
6307 assert(Inst.getNumExplicitOperands() == 3);
6308 MachineOperand Op1 = Inst.getOperand(i: 1);
6309 Inst.removeOperand(OpNo: 1);
6310 Inst.addOperand(Op: Op1);
6311}
6312
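// Check whether the register operand \p MO is compatible with the register
// class required by \p OpInfo, taking any subregister index into account.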
6313bool SIInstrInfo::isLegalRegOperand(const MachineRegisterInfo &MRI,
6314 const MCOperandInfo &OpInfo,
6315 const MachineOperand &MO) const {
6316 if (!MO.isReg())
6317 return false;
6318
6319 Register Reg = MO.getReg();
6320
6321 const TargetRegisterClass *DRC = RI.getRegClass(i: getOpRegClassID(OpInfo));
6322 if (Reg.isPhysical())
6323 return DRC->contains(Reg);
6324
6325 const TargetRegisterClass *RC = MRI.getRegClass(Reg);
6326
6327 if (MO.getSubReg()) {
6328 const MachineFunction *MF = MO.getParent()->getMF();
6329 const TargetRegisterClass *SuperRC = RI.getLargestLegalSuperClass(RC, MF: *MF);
6330 if (!SuperRC)
6331 return false;
6332 return RI.getMatchingSuperRegClass(A: SuperRC, B: DRC, Idx: MO.getSubReg()) != nullptr;
6333 }
6334
6335 return RI.getCommonSubClass(A: DRC, B: RC) != nullptr;
6336}
6337
6338bool SIInstrInfo::isLegalRegOperand(const MachineInstr &MI, unsigned OpIdx,
6339 const MachineOperand &MO) const {
6340 const MachineRegisterInfo &MRI = MI.getMF()->getRegInfo();
6341 const MCOperandInfo OpInfo = MI.getDesc().operands()[OpIdx];
6342 unsigned Opc = MI.getOpcode();
6343
6344 // See SIInstrInfo::isLegalGFX12PlusPackedMathFP32Operand for more
6345 // information.
6346 if (AMDGPU::isPackedFP32Inst(Opc: MI.getOpcode()) && AMDGPU::isGFX12Plus(STI: ST) &&
6347 MO.isReg() && RI.isSGPRReg(MRI, Reg: MO.getReg())) {
6348 constexpr AMDGPU::OpName OpNames[] = {
6349 AMDGPU::OpName::src0, AMDGPU::OpName::src1, AMDGPU::OpName::src2};
6350
6351 for (auto [I, OpName] : enumerate(First: OpNames)) {
6352 int SrcIdx = AMDGPU::getNamedOperandIdx(Opcode: MI.getOpcode(), Name: OpNames[I]);
6353 if (static_cast<unsigned>(SrcIdx) == OpIdx &&
6354 !isLegalGFX12PlusPackedMathFP32Operand(MRI, MI, SrcN: I, MO: &MO))
6355 return false;
6356 }
6357 }
6358
6359 if (!isLegalRegOperand(MRI, OpInfo, MO))
6360 return false;
6361
6362 // Check accumulator GPR (AGPR) operand constraints.
6363 bool IsAGPR = RI.isAGPR(MRI, Reg: MO.getReg());
6364 if (IsAGPR && !ST.hasMAIInsts())
6365 return false;
6366 if (IsAGPR && (!ST.hasGFX90AInsts() || !MRI.reservedRegsFrozen()) &&
6367 (MI.mayLoad() || MI.mayStore() || isDS(Opcode: Opc) || isMIMG(Opcode: Opc)))
6368 return false;
6369 // Atomics should have both vdst and vdata be either VGPR or AGPR.
6370 const int VDstIdx = AMDGPU::getNamedOperandIdx(Opcode: Opc, Name: AMDGPU::OpName::vdst);
6371 const int DataIdx = AMDGPU::getNamedOperandIdx(
6372 Opcode: Opc, Name: isDS(Opcode: Opc) ? AMDGPU::OpName::data0 : AMDGPU::OpName::vdata);
6373 if ((int)OpIdx == VDstIdx && DataIdx != -1 &&
6374 MI.getOperand(i: DataIdx).isReg() &&
6375 RI.isAGPR(MRI, Reg: MI.getOperand(i: DataIdx).getReg()) != IsAGPR)
6376 return false;
6377 if ((int)OpIdx == DataIdx) {
6378 if (VDstIdx != -1 &&
6379 RI.isAGPR(MRI, Reg: MI.getOperand(i: VDstIdx).getReg()) != IsAGPR)
6380 return false;
6381 // DS instructions with two data operands must also agree on AGPR vs. VGPR.
6382 const int Data1Idx = AMDGPU::getNamedOperandIdx(Opcode: Opc, Name: AMDGPU::OpName::data1);
6383 if (Data1Idx != -1 && MI.getOperand(i: Data1Idx).isReg() &&
6384 RI.isAGPR(MRI, Reg: MI.getOperand(i: Data1Idx).getReg()) != IsAGPR)
6385 return false;
6386 }
6387
6388 // Check V_ACCVGPR_WRITE_B32_e64
6389 if (Opc == AMDGPU::V_ACCVGPR_WRITE_B32_e64 && !ST.hasGFX90AInsts() &&
6390 (int)OpIdx == AMDGPU::getNamedOperandIdx(Opcode: Opc, Name: AMDGPU::OpName::src0) &&
6391 RI.isSGPRReg(MRI, Reg: MO.getReg()))
6392 return false;
6393
6394 if (ST.hasFlatScratchHiInB64InstHazard() &&
6395 MO.getReg() == AMDGPU::SRC_FLAT_SCRATCH_BASE_HI && isSALU(MI)) {
6396 if (const MachineOperand *Dst = getNamedOperand(MI, OperandName: AMDGPU::OpName::sdst)) {
6397 if (AMDGPU::getRegBitWidth(RC: *RI.getRegClassForReg(MRI, Reg: Dst->getReg())) ==
6398 64)
6399 return false;
6400 }
6401 if (Opc == AMDGPU::S_BITCMP0_B64 || Opc == AMDGPU::S_BITCMP1_B64)
6402 return false;
6403 }
6404
6405 return true;
6406}
6407
6408bool SIInstrInfo::isLegalVSrcOperand(const MachineRegisterInfo &MRI,
6409 const MCOperandInfo &OpInfo,
6410 const MachineOperand &MO) const {
6411 if (MO.isReg())
6412 return isLegalRegOperand(MRI, OpInfo, MO);
6413
6414 // Handle non-register types that are treated like immediates.
6415 assert(MO.isImm() || MO.isTargetIndex() || MO.isFI() || MO.isGlobal());
6416 return true;
6417}
6418
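// On gfx12+, packed FP32 instructions cannot independently select the low or
// high half of an SGPR source: an SGPR source operand is only legal if neither
// op_sel nor op_sel_hi is set for it.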
6419bool SIInstrInfo::isLegalGFX12PlusPackedMathFP32Operand(
6420 const MachineRegisterInfo &MRI, const MachineInstr &MI, unsigned SrcN,
6421 const MachineOperand *MO) const {
6422 constexpr unsigned NumOps = 3;
6423 constexpr AMDGPU::OpName OpNames[NumOps * 2] = {
6424 AMDGPU::OpName::src0, AMDGPU::OpName::src1,
6425 AMDGPU::OpName::src2, AMDGPU::OpName::src0_modifiers,
6426 AMDGPU::OpName::src1_modifiers, AMDGPU::OpName::src2_modifiers};
6427
6428 assert(SrcN < NumOps);
6429
6430 if (!MO) {
6431 int SrcIdx = AMDGPU::getNamedOperandIdx(Opcode: MI.getOpcode(), Name: OpNames[SrcN]);
6432 if (SrcIdx == -1)
6433 return true;
6434 MO = &MI.getOperand(i: SrcIdx);
6435 }
6436
6437 if (!MO->isReg() || !RI.isSGPRReg(MRI, Reg: MO->getReg()))
6438 return true;
6439
6440 int ModsIdx =
6441 AMDGPU::getNamedOperandIdx(Opcode: MI.getOpcode(), Name: OpNames[NumOps + SrcN]);
6442 if (ModsIdx == -1)
6443 return true;
6444
6445 unsigned Mods = MI.getOperand(i: ModsIdx).getImm();
6446 bool OpSel = Mods & SISrcMods::OP_SEL_0;
6447 bool OpSelHi = Mods & SISrcMods::OP_SEL_1;
6448
6449 return !OpSel && !OpSelHi;
6450}
6451
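// Check whether \p MO (or, if null, the existing operand at \p OpIdx) can
// legally be used as operand \p OpIdx of \p MI, enforcing constant bus,
// literal, register class and immediate encoding restrictions.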
6452bool SIInstrInfo::isOperandLegal(const MachineInstr &MI, unsigned OpIdx,
6453 const MachineOperand *MO) const {
6454 const MachineFunction &MF = *MI.getMF();
6455 const MachineRegisterInfo &MRI = MF.getRegInfo();
6456 const MCInstrDesc &InstDesc = MI.getDesc();
6457 const MCOperandInfo &OpInfo = InstDesc.operands()[OpIdx];
6458 int64_t RegClass = getOpRegClassID(OpInfo);
6459 const TargetRegisterClass *DefinedRC =
6460 RegClass != -1 ? RI.getRegClass(i: RegClass) : nullptr;
6461 if (!MO)
6462 MO = &MI.getOperand(i: OpIdx);
6463
6464 const bool IsInlineConst = !MO->isReg() && isInlineConstant(MO: *MO, OpInfo);
6465
6466 if (isVALU(MI) && !IsInlineConst && usesConstantBus(MRI, MO: *MO, OpInfo)) {
6467 const MachineOperand *UsedLiteral = nullptr;
6468
6469 int ConstantBusLimit = ST.getConstantBusLimit(Opcode: MI.getOpcode());
6470 int LiteralLimit = !isVOP3(MI) || ST.hasVOP3Literal() ? 1 : 0;
6471
6472 // TODO: Be more permissive with frame indexes.
6473 if (!MO->isReg() && !isInlineConstant(MO: *MO, OpInfo)) {
6474 if (!LiteralLimit--)
6475 return false;
6476
6477 UsedLiteral = MO;
6478 }
6479
6480 SmallDenseSet<RegSubRegPair> SGPRsUsed;
6481 if (MO->isReg())
6482 SGPRsUsed.insert(V: RegSubRegPair(MO->getReg(), MO->getSubReg()));
6483
6484 for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
6485 if (i == OpIdx)
6486 continue;
6487 const MachineOperand &Op = MI.getOperand(i);
6488 if (Op.isReg()) {
6489 if (Op.isUse()) {
6490 RegSubRegPair SGPR(Op.getReg(), Op.getSubReg());
6491 if (regUsesConstantBus(RegOp: Op, MRI) && SGPRsUsed.insert(V: SGPR).second) {
6492 if (--ConstantBusLimit <= 0)
6493 return false;
6494 }
6495 }
6496 } else if (AMDGPU::isSISrcOperand(OpInfo: InstDesc.operands()[i]) &&
6497 !isInlineConstant(MO: Op, OpInfo: InstDesc.operands()[i])) {
6498 // The same literal may be used multiple times.
6499 if (!UsedLiteral)
6500 UsedLiteral = &Op;
6501 else if (UsedLiteral->isIdenticalTo(Other: Op))
6502 continue;
6503
6504 if (!LiteralLimit--)
6505 return false;
6506 if (--ConstantBusLimit <= 0)
6507 return false;
6508 }
6509 }
6510 } else if (!IsInlineConst && !MO->isReg() && isSALU(MI)) {
6511 // There can be at most one literal operand, but it can be repeated.
6512 for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
6513 if (i == OpIdx)
6514 continue;
6515 const MachineOperand &Op = MI.getOperand(i);
6516 if (!Op.isReg() && !Op.isFI() && !Op.isRegMask() &&
6517 !isInlineConstant(MO: Op, OpInfo: InstDesc.operands()[i]) &&
6518 !Op.isIdenticalTo(Other: *MO))
6519 return false;
6520
6521 // Do not fold a non-inlineable and non-register operand into an
6522 // instruction that already has a frame index. The frame index handling
6523 // code cannot handle a frame index co-existing with another non-register
6524 // operand, unless that operand is an inlineable immediate.
6525 if (Op.isFI())
6526 return false;
6527 }
6528 } else if (IsInlineConst && ST.hasNoF16PseudoScalarTransInlineConstants() &&
6529 isF16PseudoScalarTrans(Opcode: MI.getOpcode())) {
6530 return false;
6531 }
6532
6533 if (MO->isReg()) {
6534 if (!DefinedRC)
6535 return OpInfo.OperandType == MCOI::OPERAND_UNKNOWN;
6536 return isLegalRegOperand(MI, OpIdx, MO: *MO);
6537 }
6538
6539 if (MO->isImm()) {
6540 uint64_t Imm = MO->getImm();
6541 bool Is64BitFPOp = OpInfo.OperandType == AMDGPU::OPERAND_REG_IMM_FP64;
6542 bool Is64BitOp = Is64BitFPOp ||
6543 OpInfo.OperandType == AMDGPU::OPERAND_REG_IMM_INT64 ||
6544 OpInfo.OperandType == AMDGPU::OPERAND_REG_IMM_V2INT32 ||
6545 OpInfo.OperandType == AMDGPU::OPERAND_REG_IMM_V2FP32;
6546 if (Is64BitOp &&
6547 !AMDGPU::isInlinableLiteral64(Literal: Imm, HasInv2Pi: ST.hasInv2PiInlineImm())) {
6548 if (!AMDGPU::isValid32BitLiteral(Val: Imm, IsFP64: Is64BitFPOp) &&
6549 (!ST.has64BitLiterals() || InstDesc.getSize() != 4))
6550 return false;
6551
6552 // FIXME: We can use sign-extended 64-bit literals, but only for signed
6553 // operands. At the moment we do not know if an operand is signed.
6554 // Such an operand will be encoded as its low 32 bits and then either
6555 // correctly sign extended or incorrectly zero extended by the hardware.
6556 // If 64-bit literals are supported and the literal will be encoded
6557 // as a full 64 bits, we can still use it.
6558 if (!Is64BitFPOp && (int32_t)Imm < 0 &&
6559 (!ST.has64BitLiterals() || AMDGPU::isValid32BitLiteral(Val: Imm, IsFP64: false)))
6560 return false;
6561 }
6562 }
6563
6564 // Handle non-register types that are treated like immediates.
6565 assert(MO->isImm() || MO->isTargetIndex() || MO->isFI() || MO->isGlobal());
6566
6567 if (!DefinedRC) {
6568 // This operand expects an immediate.
6569 return true;
6570 }
6571
6572 return isImmOperandLegal(MI, OpNo: OpIdx, MO: *MO);
6573}
6574
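// On gfx940/gfx950, transcendental, dot, MFMA and a number of packed VALU
// instructions can never be co-issued with another instruction.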
6575bool SIInstrInfo::isNeverCoissue(MachineInstr &MI) const {
6576 bool IsGFX950Only = ST.hasGFX950Insts();
6577 bool IsGFX940Only = ST.hasGFX940Insts();
6578
6579 if (!IsGFX950Only && !IsGFX940Only)
6580 return false;
6581
6582 if (!isVALU(MI))
6583 return false;
6584
6585 // V_COS, V_EXP, V_RCP, etc.
6586 if (isTRANS(MI))
6587 return true;
6588
6589 // DOT2, DOT2C, DOT4, etc.
6590 if (isDOT(MI))
6591 return true;
6592
6593 // MFMA, SMFMA
6594 if (isMFMA(MI))
6595 return true;
6596
6597 unsigned Opcode = MI.getOpcode();
6598 switch (Opcode) {
6599 case AMDGPU::V_CVT_PK_BF8_F32_e64:
6600 case AMDGPU::V_CVT_PK_FP8_F32_e64:
6601 case AMDGPU::V_MQSAD_PK_U16_U8_e64:
6602 case AMDGPU::V_MQSAD_U32_U8_e64:
6603 case AMDGPU::V_PK_ADD_F16:
6604 case AMDGPU::V_PK_ADD_F32:
6605 case AMDGPU::V_PK_ADD_I16:
6606 case AMDGPU::V_PK_ADD_U16:
6607 case AMDGPU::V_PK_ASHRREV_I16:
6608 case AMDGPU::V_PK_FMA_F16:
6609 case AMDGPU::V_PK_FMA_F32:
6610 case AMDGPU::V_PK_FMAC_F16_e32:
6611 case AMDGPU::V_PK_FMAC_F16_e64:
6612 case AMDGPU::V_PK_LSHLREV_B16:
6613 case AMDGPU::V_PK_LSHRREV_B16:
6614 case AMDGPU::V_PK_MAD_I16:
6615 case AMDGPU::V_PK_MAD_U16:
6616 case AMDGPU::V_PK_MAX_F16:
6617 case AMDGPU::V_PK_MAX_I16:
6618 case AMDGPU::V_PK_MAX_U16:
6619 case AMDGPU::V_PK_MIN_F16:
6620 case AMDGPU::V_PK_MIN_I16:
6621 case AMDGPU::V_PK_MIN_U16:
6622 case AMDGPU::V_PK_MOV_B32:
6623 case AMDGPU::V_PK_MUL_F16:
6624 case AMDGPU::V_PK_MUL_F32:
6625 case AMDGPU::V_PK_MUL_LO_U16:
6626 case AMDGPU::V_PK_SUB_I16:
6627 case AMDGPU::V_PK_SUB_U16:
6628 case AMDGPU::V_QSAD_PK_U16_U8_e64:
6629 return true;
6630 default:
6631 return false;
6632 }
6633}
6634
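// Legalize VOP2 operands. Illegal SGPR or literal sources are fixed up with
// V_READFIRSTLANE, by commuting the instruction, or by moving the operand into
// a VGPR.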
6635void SIInstrInfo::legalizeOperandsVOP2(MachineRegisterInfo &MRI,
6636 MachineInstr &MI) const {
6637 unsigned Opc = MI.getOpcode();
6638 const MCInstrDesc &InstrDesc = get(Opcode: Opc);
6639
6640 int Src0Idx = AMDGPU::getNamedOperandIdx(Opcode: Opc, Name: AMDGPU::OpName::src0);
6641 MachineOperand &Src0 = MI.getOperand(i: Src0Idx);
6642
6643 int Src1Idx = AMDGPU::getNamedOperandIdx(Opcode: Opc, Name: AMDGPU::OpName::src1);
6644 MachineOperand &Src1 = MI.getOperand(i: Src1Idx);
6645
6646 // If there is an implicit SGPR use such as VCC use for v_addc_u32/v_subb_u32,
6647 // we may only have one constant bus use before GFX10.
6648 bool HasImplicitSGPR = findImplicitSGPRRead(MI);
6649 if (HasImplicitSGPR && ST.getConstantBusLimit(Opcode: Opc) <= 1 && Src0.isReg() &&
6650 RI.isSGPRReg(MRI, Reg: Src0.getReg()))
6651 legalizeOpWithMove(MI, OpIdx: Src0Idx);
6652
6653 // Special case: V_WRITELANE_B32 accepts only immediate or SGPR operands for
6654 // both the value to write (src0) and lane select (src1). Fix up non-SGPR
6655 // src0/src1 with V_READFIRSTLANE.
6656 if (Opc == AMDGPU::V_WRITELANE_B32) {
6657 const DebugLoc &DL = MI.getDebugLoc();
6658 if (Src0.isReg() && RI.isVGPR(MRI, Reg: Src0.getReg())) {
6659 Register Reg = MRI.createVirtualRegister(RegClass: &AMDGPU::SReg_32_XM0RegClass);
6660 BuildMI(BB&: *MI.getParent(), I&: MI, MIMD: DL, MCID: get(Opcode: AMDGPU::V_READFIRSTLANE_B32), DestReg: Reg)
6661 .add(MO: Src0);
6662 Src0.ChangeToRegister(Reg, isDef: false);
6663 }
6664 if (Src1.isReg() && RI.isVGPR(MRI, Reg: Src1.getReg())) {
6665 Register Reg = MRI.createVirtualRegister(RegClass: &AMDGPU::SReg_32_XM0RegClass);
6666 const DebugLoc &DL = MI.getDebugLoc();
6667 BuildMI(BB&: *MI.getParent(), I&: MI, MIMD: DL, MCID: get(Opcode: AMDGPU::V_READFIRSTLANE_B32), DestReg: Reg)
6668 .add(MO: Src1);
6669 Src1.ChangeToRegister(Reg, isDef: false);
6670 }
6671 return;
6672 }
6673
6674 // Special case: V_FMAC_F32 and V_FMAC_F16 have src2.
6675 if (Opc == AMDGPU::V_FMAC_F32_e32 || Opc == AMDGPU::V_FMAC_F16_e32) {
6676 int Src2Idx = AMDGPU::getNamedOperandIdx(Opcode: Opc, Name: AMDGPU::OpName::src2);
6677 if (!RI.isVGPR(MRI, Reg: MI.getOperand(i: Src2Idx).getReg()))
6678 legalizeOpWithMove(MI, OpIdx: Src2Idx);
6679 }
6680
6681 // VOP2 instructions support all operand types for src0, so we don't need to
6682 // check its legality. If src1 is already legal, we don't need to do anything.
6683 if (isLegalRegOperand(MRI, OpInfo: InstrDesc.operands()[Src1Idx], MO: Src1))
6684 return;
6685
6686 // Special case: V_READLANE_B32 accepts only immediate or SGPR operands for
6687 // lane select. Fix up using V_READFIRSTLANE, since we assume that the lane
6688 // select is uniform.
6689 if (Opc == AMDGPU::V_READLANE_B32 && Src1.isReg() &&
6690 RI.isVGPR(MRI, Reg: Src1.getReg())) {
6691 Register Reg = MRI.createVirtualRegister(RegClass: &AMDGPU::SReg_32_XM0RegClass);
6692 const DebugLoc &DL = MI.getDebugLoc();
6693 BuildMI(BB&: *MI.getParent(), I&: MI, MIMD: DL, MCID: get(Opcode: AMDGPU::V_READFIRSTLANE_B32), DestReg: Reg)
6694 .add(MO: Src1);
6695 Src1.ChangeToRegister(Reg, isDef: false);
6696 return;
6697 }
6698
6699 // We do not use commuteInstruction here because it is too aggressive and will
6700 // commute if it is possible. We only want to commute here if it improves
6701 // legality. This can be called a fairly large number of times so don't waste
6702 // compile time pointlessly swapping and checking legality again.
6703 if (HasImplicitSGPR || !MI.isCommutable()) {
6704 legalizeOpWithMove(MI, OpIdx: Src1Idx);
6705 return;
6706 }
6707
6708 // If src0 can be used as src1, commuting will make the operands legal.
6709 // Otherwise we have to give up and insert a move.
6710 //
6711 // TODO: Other immediate-like operand kinds could be commuted if there was a
6712 // MachineOperand::ChangeTo* for them.
6713 if ((!Src1.isImm() && !Src1.isReg()) ||
6714 !isLegalRegOperand(MRI, OpInfo: InstrDesc.operands()[Src1Idx], MO: Src0)) {
6715 legalizeOpWithMove(MI, OpIdx: Src1Idx);
6716 return;
6717 }
6718
6719 int CommutedOpc = commuteOpcode(MI);
6720 if (CommutedOpc == -1) {
6721 legalizeOpWithMove(MI, OpIdx: Src1Idx);
6722 return;
6723 }
6724
6725 MI.setDesc(get(Opcode: CommutedOpc));
6726
6727 Register Src0Reg = Src0.getReg();
6728 unsigned Src0SubReg = Src0.getSubReg();
6729 bool Src0Kill = Src0.isKill();
6730
6731 if (Src1.isImm())
6732 Src0.ChangeToImmediate(ImmVal: Src1.getImm());
6733 else if (Src1.isReg()) {
6734 Src0.ChangeToRegister(Reg: Src1.getReg(), isDef: false, isImp: false, isKill: Src1.isKill());
6735 Src0.setSubReg(Src1.getSubReg());
6736 } else
6737 llvm_unreachable("Should only have register or immediate operands");
6738
6739 Src1.ChangeToRegister(Reg: Src0Reg, isDef: false, isImp: false, isKill: Src0Kill);
6740 Src1.setSubReg(Src0SubReg);
6741 fixImplicitOperands(MI);
6742}
6743
6744// Legalize VOP3 operands. All operand types are supported for any operand,
6745// but only one literal constant is allowed, and only starting from GFX10.
6746void SIInstrInfo::legalizeOperandsVOP3(MachineRegisterInfo &MRI,
6747 MachineInstr &MI) const {
6748 unsigned Opc = MI.getOpcode();
6749
6750 int VOP3Idx[3] = {
6751 AMDGPU::getNamedOperandIdx(Opcode: Opc, Name: AMDGPU::OpName::src0),
6752 AMDGPU::getNamedOperandIdx(Opcode: Opc, Name: AMDGPU::OpName::src1),
6753 AMDGPU::getNamedOperandIdx(Opcode: Opc, Name: AMDGPU::OpName::src2)
6754 };
6755
6756 if (Opc == AMDGPU::V_PERMLANE16_B32_e64 ||
6757 Opc == AMDGPU::V_PERMLANEX16_B32_e64 ||
6758 Opc == AMDGPU::V_PERMLANE_BCAST_B32_e64 ||
6759 Opc == AMDGPU::V_PERMLANE_UP_B32_e64 ||
6760 Opc == AMDGPU::V_PERMLANE_DOWN_B32_e64 ||
6761 Opc == AMDGPU::V_PERMLANE_XOR_B32_e64 ||
6762 Opc == AMDGPU::V_PERMLANE_IDX_GEN_B32_e64) {
6763 // src1 and src2 must be scalar
6764 MachineOperand &Src1 = MI.getOperand(i: VOP3Idx[1]);
6765 const DebugLoc &DL = MI.getDebugLoc();
6766 if (Src1.isReg() && !RI.isSGPRClass(RC: MRI.getRegClass(Reg: Src1.getReg()))) {
6767 Register Reg = MRI.createVirtualRegister(RegClass: &AMDGPU::SReg_32_XM0RegClass);
6768 BuildMI(BB&: *MI.getParent(), I&: MI, MIMD: DL, MCID: get(Opcode: AMDGPU::V_READFIRSTLANE_B32), DestReg: Reg)
6769 .add(MO: Src1);
6770 Src1.ChangeToRegister(Reg, isDef: false);
6771 }
6772 if (VOP3Idx[2] != -1) {
6773 MachineOperand &Src2 = MI.getOperand(i: VOP3Idx[2]);
6774 if (Src2.isReg() && !RI.isSGPRClass(RC: MRI.getRegClass(Reg: Src2.getReg()))) {
6775 Register Reg = MRI.createVirtualRegister(RegClass: &AMDGPU::SReg_32_XM0RegClass);
6776 BuildMI(BB&: *MI.getParent(), I&: MI, MIMD: DL, MCID: get(Opcode: AMDGPU::V_READFIRSTLANE_B32), DestReg: Reg)
6777 .add(MO: Src2);
6778 Src2.ChangeToRegister(Reg, isDef: false);
6779 }
6780 }
6781 }
6782
6783 // Find the one SGPR operand we are allowed to use.
6784 int ConstantBusLimit = ST.getConstantBusLimit(Opcode: Opc);
6785 int LiteralLimit = ST.hasVOP3Literal() ? 1 : 0;
6786 SmallDenseSet<unsigned> SGPRsUsed;
6787 Register SGPRReg = findUsedSGPR(MI, OpIndices: VOP3Idx);
6788 if (SGPRReg) {
6789 SGPRsUsed.insert(V: SGPRReg);
6790 --ConstantBusLimit;
6791 }
6792
6793 for (int Idx : VOP3Idx) {
6794 if (Idx == -1)
6795 break;
6796 MachineOperand &MO = MI.getOperand(i: Idx);
6797
6798 if (!MO.isReg()) {
6799 if (isInlineConstant(MO, OpInfo: get(Opcode: Opc).operands()[Idx]))
6800 continue;
6801
6802 if (LiteralLimit > 0 && ConstantBusLimit > 0) {
6803 --LiteralLimit;
6804 --ConstantBusLimit;
6805 continue;
6806 }
6807
6808 --LiteralLimit;
6809 --ConstantBusLimit;
6810 legalizeOpWithMove(MI, OpIdx: Idx);
6811 continue;
6812 }
6813
6814 if (!RI.isSGPRClass(RC: RI.getRegClassForReg(MRI, Reg: MO.getReg())))
6815 continue; // VGPRs are legal
6816
6817 // We can use one SGPR in each VOP3 instruction prior to GFX10
6818 // and two starting from GFX10.
6819 if (SGPRsUsed.count(V: MO.getReg()))
6820 continue;
6821 if (ConstantBusLimit > 0) {
6822 SGPRsUsed.insert(V: MO.getReg());
6823 --ConstantBusLimit;
6824 continue;
6825 }
6826
6827 // If we make it this far, then the operand is not legal and we must
6828 // legalize it.
6829 legalizeOpWithMove(MI, OpIdx: Idx);
6830 }
6831
6832 // Special case: V_FMAC_F32 and V_FMAC_F16 have src2 tied to vdst.
6833 if ((Opc == AMDGPU::V_FMAC_F32_e64 || Opc == AMDGPU::V_FMAC_F16_e64) &&
6834 !RI.isVGPR(MRI, Reg: MI.getOperand(i: VOP3Idx[2]).getReg()))
6835 legalizeOpWithMove(MI, OpIdx: VOP3Idx[2]);
6836
6837 // Fix the register class of packed FP32 instructions on gfx12+. See
6838 // SIInstrInfo::isLegalGFX12PlusPackedMathFP32Operand for more information.
6839 if (AMDGPU::isPackedFP32Inst(Opc) && AMDGPU::isGFX12Plus(STI: ST)) {
6840 for (unsigned I = 0; I < 3; ++I) {
6841 if (!isLegalGFX12PlusPackedMathFP32Operand(MRI, MI, /*SrcN=*/I))
6842 legalizeOpWithMove(MI, OpIdx: VOP3Idx[I]);
6843 }
6844 }
6845}
6846
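// Copy a (possibly wide) VGPR or AGPR value into a new SGPR by emitting one
// V_READFIRSTLANE_B32 per 32-bit component and combining the pieces with a
// REG_SEQUENCE.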
6847Register SIInstrInfo::readlaneVGPRToSGPR(
6848 Register SrcReg, MachineInstr &UseMI, MachineRegisterInfo &MRI,
6849 const TargetRegisterClass *DstRC /*=nullptr*/) const {
6850 const TargetRegisterClass *VRC = MRI.getRegClass(Reg: SrcReg);
6851 const TargetRegisterClass *SRC = RI.getEquivalentSGPRClass(VRC);
6852 if (DstRC)
6853 SRC = RI.getCommonSubClass(A: SRC, B: DstRC);
6854
6855 Register DstReg = MRI.createVirtualRegister(RegClass: SRC);
6856 unsigned SubRegs = RI.getRegSizeInBits(RC: *VRC) / 32;
6857
6858 if (RI.hasAGPRs(RC: VRC)) {
6859 VRC = RI.getEquivalentVGPRClass(SRC: VRC);
6860 Register NewSrcReg = MRI.createVirtualRegister(RegClass: VRC);
6861 BuildMI(BB&: *UseMI.getParent(), I&: UseMI, MIMD: UseMI.getDebugLoc(),
6862 MCID: get(Opcode: TargetOpcode::COPY), DestReg: NewSrcReg)
6863 .addReg(RegNo: SrcReg);
6864 SrcReg = NewSrcReg;
6865 }
6866
6867 if (SubRegs == 1) {
6868 BuildMI(BB&: *UseMI.getParent(), I&: UseMI, MIMD: UseMI.getDebugLoc(),
6869 MCID: get(Opcode: AMDGPU::V_READFIRSTLANE_B32), DestReg: DstReg)
6870 .addReg(RegNo: SrcReg);
6871 return DstReg;
6872 }
6873
6874 SmallVector<Register, 8> SRegs;
6875 for (unsigned i = 0; i < SubRegs; ++i) {
6876 Register SGPR = MRI.createVirtualRegister(RegClass: &AMDGPU::SGPR_32RegClass);
6877 BuildMI(BB&: *UseMI.getParent(), I&: UseMI, MIMD: UseMI.getDebugLoc(),
6878 MCID: get(Opcode: AMDGPU::V_READFIRSTLANE_B32), DestReg: SGPR)
6879 .addReg(RegNo: SrcReg, Flags: {}, SubReg: RI.getSubRegFromChannel(Channel: i));
6880 SRegs.push_back(Elt: SGPR);
6881 }
6882
6883 MachineInstrBuilder MIB =
6884 BuildMI(BB&: *UseMI.getParent(), I&: UseMI, MIMD: UseMI.getDebugLoc(),
6885 MCID: get(Opcode: AMDGPU::REG_SEQUENCE), DestReg: DstReg);
6886 for (unsigned i = 0; i < SubRegs; ++i) {
6887 MIB.addReg(RegNo: SRegs[i]);
6888 MIB.addImm(Val: RI.getSubRegFromChannel(Channel: i));
6889 }
6890 return DstReg;
6891}
6892
6893void SIInstrInfo::legalizeOperandsSMRD(MachineRegisterInfo &MRI,
6894 MachineInstr &MI) const {
6895
6896 // If the pointer is stored in VGPRs, then we need to move it to
6897 // SGPRs using v_readfirstlane. This is safe because we only select
6898 // loads with uniform pointers to SMRD instructions, so we know the
6899 // pointer value is uniform.
6900 MachineOperand *SBase = getNamedOperand(MI, OperandName: AMDGPU::OpName::sbase);
6901 if (SBase && !RI.isSGPRClass(RC: MRI.getRegClass(Reg: SBase->getReg()))) {
6902 Register SGPR = readlaneVGPRToSGPR(SrcReg: SBase->getReg(), UseMI&: MI, MRI);
6903 SBase->setReg(SGPR);
6904 }
6905 MachineOperand *SOff = getNamedOperand(MI, OperandName: AMDGPU::OpName::soffset);
6906 if (SOff && !RI.isSGPRReg(MRI, Reg: SOff->getReg())) {
6907 Register SGPR = readlaneVGPRToSGPR(SrcReg: SOff->getReg(), UseMI&: MI, MRI);
6908 SOff->setReg(SGPR);
6909 }
6910}
6911
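// If a FLAT instruction has a VGPR in its saddr operand, try to switch to the
// vaddr form of the instruction and move the address there. Returns true if
// the instruction was rewritten.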
6912bool SIInstrInfo::moveFlatAddrToVGPR(MachineInstr &Inst) const {
6913 unsigned Opc = Inst.getOpcode();
6914 int OldSAddrIdx = AMDGPU::getNamedOperandIdx(Opcode: Opc, Name: AMDGPU::OpName::saddr);
6915 if (OldSAddrIdx < 0)
6916 return false;
6917
6918 assert(isSegmentSpecificFLAT(Inst) || (isFLAT(Inst) && ST.hasFlatGVSMode()));
6919
6920 int NewOpc = AMDGPU::getGlobalVaddrOp(Opcode: Opc);
6921 if (NewOpc < 0)
6922 NewOpc = AMDGPU::getFlatScratchInstSVfromSS(Opcode: Opc);
6923 if (NewOpc < 0)
6924 return false;
6925
6926 MachineRegisterInfo &MRI = Inst.getMF()->getRegInfo();
6927 MachineOperand &SAddr = Inst.getOperand(i: OldSAddrIdx);
6928 if (RI.isSGPRReg(MRI, Reg: SAddr.getReg()))
6929 return false;
6930
6931 int NewVAddrIdx = AMDGPU::getNamedOperandIdx(Opcode: NewOpc, Name: AMDGPU::OpName::vaddr);
6932 if (NewVAddrIdx < 0)
6933 return false;
6934
6935 int OldVAddrIdx = AMDGPU::getNamedOperandIdx(Opcode: Opc, Name: AMDGPU::OpName::vaddr);
6936
6937 // Check vaddr; it must be zero or absent.
6938 MachineInstr *VAddrDef = nullptr;
6939 if (OldVAddrIdx >= 0) {
6940 MachineOperand &VAddr = Inst.getOperand(i: OldVAddrIdx);
6941 VAddrDef = MRI.getUniqueVRegDef(Reg: VAddr.getReg());
6942 if (!VAddrDef || !VAddrDef->isMoveImmediate() ||
6943 !VAddrDef->getOperand(i: 1).isImm() ||
6944 VAddrDef->getOperand(i: 1).getImm() != 0)
6945 return false;
6946 }
6947
6948 const MCInstrDesc &NewDesc = get(Opcode: NewOpc);
6949 Inst.setDesc(NewDesc);
6950
6951 // Callers expect iterator to be valid after this call, so modify the
6952 // instruction in place.
6953 if (OldVAddrIdx == NewVAddrIdx) {
6954 MachineOperand &NewVAddr = Inst.getOperand(i: NewVAddrIdx);
6955 // Clear use list from the old vaddr holding a zero register.
6956 MRI.removeRegOperandFromUseList(MO: &NewVAddr);
6957 MRI.moveOperands(Dst: &NewVAddr, Src: &SAddr, NumOps: 1);
6958 Inst.removeOperand(OpNo: OldSAddrIdx);
6959 // Update the use list with the pointer we have just moved from saddr to the
6960 // vaddr position. Otherwise the new vaddr will be missing from the use list.
6961 MRI.removeRegOperandFromUseList(MO: &NewVAddr);
6962 MRI.addRegOperandToUseList(MO: &NewVAddr);
6963 } else {
6964 assert(OldSAddrIdx == NewVAddrIdx);
6965
6966 if (OldVAddrIdx >= 0) {
6967 int NewVDstIn = AMDGPU::getNamedOperandIdx(Opcode: NewOpc,
6968 Name: AMDGPU::OpName::vdst_in);
6969
6970 // removeOperand doesn't try to fix up tied operand indexes as it goes, so
6971 // it asserts. Untie the operands for now and retie them afterwards.
6972 if (NewVDstIn != -1) {
6973 int OldVDstIn = AMDGPU::getNamedOperandIdx(Opcode: Opc, Name: AMDGPU::OpName::vdst_in);
6974 Inst.untieRegOperand(OpIdx: OldVDstIn);
6975 }
6976
6977 Inst.removeOperand(OpNo: OldVAddrIdx);
6978
6979 if (NewVDstIn != -1) {
6980 int NewVDst = AMDGPU::getNamedOperandIdx(Opcode: NewOpc, Name: AMDGPU::OpName::vdst);
6981 Inst.tieOperands(DefIdx: NewVDst, UseIdx: NewVDstIn);
6982 }
6983 }
6984 }
6985
6986 if (VAddrDef && MRI.use_nodbg_empty(RegNo: VAddrDef->getOperand(i: 0).getReg()))
6987 VAddrDef->eraseFromParent();
6988
6989 return true;
6990}
6991
6992// FIXME: Remove this when SelectionDAG is obsoleted.
6993void SIInstrInfo::legalizeOperandsFLAT(MachineRegisterInfo &MRI,
6994 MachineInstr &MI) const {
6995 if (!isSegmentSpecificFLAT(MI) && !ST.hasFlatGVSMode())
6996 return;
6997
6998 // Fix up SGPR operands in VGPRs. We only select these when the DAG divergence
6999 // analysis thinks they are uniform, so a readfirstlane should be valid.
7000 MachineOperand *SAddr = getNamedOperand(MI, OperandName: AMDGPU::OpName::saddr);
7001 if (!SAddr || RI.isSGPRClass(RC: MRI.getRegClass(Reg: SAddr->getReg())))
7002 return;
7003
7004 if (moveFlatAddrToVGPR(Inst&: MI))
7005 return;
7006
7007 const TargetRegisterClass *DeclaredRC =
7008 getRegClass(MCID: MI.getDesc(), OpNum: SAddr->getOperandNo());
7009
7010 Register ToSGPR = readlaneVGPRToSGPR(SrcReg: SAddr->getReg(), UseMI&: MI, MRI, DstRC: DeclaredRC);
7011 SAddr->setReg(ToSGPR);
7012}
7013
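// Make \p Op conform to \p DstRC by copying it into a new virtual register of
// that class, trying to fold immediate defs into the new copy.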
7014void SIInstrInfo::legalizeGenericOperand(MachineBasicBlock &InsertMBB,
7015 MachineBasicBlock::iterator I,
7016 const TargetRegisterClass *DstRC,
7017 MachineOperand &Op,
7018 MachineRegisterInfo &MRI,
7019 const DebugLoc &DL) const {
7020 Register OpReg = Op.getReg();
7021 unsigned OpSubReg = Op.getSubReg();
7022
7023 const TargetRegisterClass *OpRC = RI.getSubClassWithSubReg(
7024 RI.getRegClassForReg(MRI, Reg: OpReg), OpSubReg);
7025
7026 // Check if operand is already the correct register class.
7027 if (DstRC == OpRC)
7028 return;
7029
7030 Register DstReg = MRI.createVirtualRegister(RegClass: DstRC);
7031 auto Copy =
7032 BuildMI(BB&: InsertMBB, I, MIMD: DL, MCID: get(Opcode: AMDGPU::COPY), DestReg: DstReg).addReg(RegNo: OpReg);
7033 Op.setReg(DstReg);
7034
7035 MachineInstr *Def = MRI.getVRegDef(Reg: OpReg);
7036 if (!Def)
7037 return;
7038
7039 // Try to eliminate the copy if it is copying an immediate value.
7040 if (Def->isMoveImmediate() && DstRC != &AMDGPU::VReg_1RegClass)
7041 foldImmediate(UseMI&: *Copy, DefMI&: *Def, Reg: OpReg, MRI: &MRI);
7042
7043 bool ImpDef = Def->isImplicitDef();
7044 while (!ImpDef && Def && Def->isCopy()) {
7045 if (Def->getOperand(i: 1).getReg().isPhysical())
7046 break;
7047 Def = MRI.getUniqueVRegDef(Reg: Def->getOperand(i: 1).getReg());
7048 ImpDef = Def && Def->isImplicitDef();
7049 }
7050 if (!RI.isSGPRClass(RC: DstRC) && !Copy->readsRegister(Reg: AMDGPU::EXEC, TRI: &RI) &&
7051 !ImpDef)
7052 Copy.addReg(RegNo: AMDGPU::EXEC, Flags: RegState::Implicit);
7053}
7054
7055// Emit the actual waterfall loop, executing the wrapped instruction for each
7056// unique value of \p ScalarOps across all lanes. In the best case we execute 1
7057// iteration, in the worst case we execute one iteration per lane of the wave.
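// For a single 32-bit scalar operand the emitted loop looks roughly like:
//
//   LoopBB:
//     %sgpr     = V_READFIRSTLANE_B32 %vgpr
//     %cond     = V_CMP_EQ_U32 %sgpr, %vgpr
//     %saveexec = S_AND_SAVEEXEC_B32/_B64 %cond
//   BodyBB:
//     ... the wrapped instruction, now reading %sgpr ...
//     $exec     = S_XOR_B32/_B64_term $exec, %saveexec
//     SI_WATERFALL_LOOP %LoopBB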
7058static void
7059emitLoadScalarOpsFromVGPRLoop(const SIInstrInfo &TII,
7060 MachineRegisterInfo &MRI,
7061 MachineBasicBlock &LoopBB,
7062 MachineBasicBlock &BodyBB,
7063 const DebugLoc &DL,
7064 ArrayRef<MachineOperand *> ScalarOps) {
7065 MachineFunction &MF = *LoopBB.getParent();
7066 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
7067 const SIRegisterInfo *TRI = ST.getRegisterInfo();
7068 const AMDGPU::LaneMaskConstants &LMC = AMDGPU::LaneMaskConstants::get(ST);
7069 const auto *BoolXExecRC = TRI->getWaveMaskRegClass();
7070
7071 MachineBasicBlock::iterator I = LoopBB.begin();
7072 Register CondReg;
7073
7074 for (MachineOperand *ScalarOp : ScalarOps) {
7075 unsigned RegSize = TRI->getRegSizeInBits(Reg: ScalarOp->getReg(), MRI);
7076 unsigned NumSubRegs = RegSize / 32;
7077 Register VScalarOp = ScalarOp->getReg();
7078
7079 if (NumSubRegs == 1) {
7080 Register CurReg = MRI.createVirtualRegister(RegClass: &AMDGPU::SReg_32_XM0RegClass);
7081
7082 BuildMI(BB&: LoopBB, I, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::V_READFIRSTLANE_B32), DestReg: CurReg)
7083 .addReg(RegNo: VScalarOp);
7084
7085 Register NewCondReg = MRI.createVirtualRegister(RegClass: BoolXExecRC);
7086
7087 BuildMI(BB&: LoopBB, I, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::V_CMP_EQ_U32_e64), DestReg: NewCondReg)
7088 .addReg(RegNo: CurReg)
7089 .addReg(RegNo: VScalarOp);
7090
7091 // Combine the comparison results with AND.
7092 if (!CondReg) // First.
7093 CondReg = NewCondReg;
7094 else { // If not the first, we create an AND.
7095 Register AndReg = MRI.createVirtualRegister(RegClass: BoolXExecRC);
7096 BuildMI(BB&: LoopBB, I, MIMD: DL, MCID: TII.get(Opcode: LMC.AndOpc), DestReg: AndReg)
7097 .addReg(RegNo: CondReg)
7098 .addReg(RegNo: NewCondReg);
7099 CondReg = AndReg;
7100 }
7101
7102 // Update ScalarOp operand to use the SGPR ScalarOp.
7103 ScalarOp->setReg(CurReg);
7104 ScalarOp->setIsKill();
7105 } else {
7106 SmallVector<Register, 8> ReadlanePieces;
7107 RegState VScalarOpUndef = getUndefRegState(B: ScalarOp->isUndef());
7108 assert(NumSubRegs % 2 == 0 && NumSubRegs <= 32 &&
7109 "Unhandled register size");
7110
7111 for (unsigned Idx = 0; Idx < NumSubRegs; Idx += 2) {
7112 Register CurRegLo =
7113 MRI.createVirtualRegister(RegClass: &AMDGPU::SReg_32_XM0RegClass);
7114 Register CurRegHi =
7115 MRI.createVirtualRegister(RegClass: &AMDGPU::SReg_32_XM0RegClass);
7116
7117 // Read the next variant <- also loop target.
7118 BuildMI(BB&: LoopBB, I, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::V_READFIRSTLANE_B32), DestReg: CurRegLo)
7119 .addReg(RegNo: VScalarOp, Flags: VScalarOpUndef, SubReg: TRI->getSubRegFromChannel(Channel: Idx));
7120
7121 // Read the next variant <- also loop target.
7122 BuildMI(BB&: LoopBB, I, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::V_READFIRSTLANE_B32), DestReg: CurRegHi)
7123 .addReg(RegNo: VScalarOp, Flags: VScalarOpUndef,
7124 SubReg: TRI->getSubRegFromChannel(Channel: Idx + 1));
7125
7126 ReadlanePieces.push_back(Elt: CurRegLo);
7127 ReadlanePieces.push_back(Elt: CurRegHi);
7128
7129 // Comparison is to be done as 64-bit.
7130 Register CurReg = MRI.createVirtualRegister(RegClass: &AMDGPU::SGPR_64RegClass);
7131 BuildMI(BB&: LoopBB, I, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::REG_SEQUENCE), DestReg: CurReg)
7132 .addReg(RegNo: CurRegLo)
7133 .addImm(Val: AMDGPU::sub0)
7134 .addReg(RegNo: CurRegHi)
7135 .addImm(Val: AMDGPU::sub1);
7136
7137 Register NewCondReg = MRI.createVirtualRegister(RegClass: BoolXExecRC);
7138 auto Cmp = BuildMI(BB&: LoopBB, I, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::V_CMP_EQ_U64_e64),
7139 DestReg: NewCondReg)
7140 .addReg(RegNo: CurReg);
7141 if (NumSubRegs <= 2)
7142 Cmp.addReg(RegNo: VScalarOp);
7143 else
7144 Cmp.addReg(RegNo: VScalarOp, Flags: VScalarOpUndef,
7145 SubReg: TRI->getSubRegFromChannel(Channel: Idx, NumRegs: 2));
7146
7147 // Combine the comparison results with AND.
7148 if (!CondReg) // First.
7149 CondReg = NewCondReg;
7150 else { // If not the first, we create an AND.
7151 Register AndReg = MRI.createVirtualRegister(RegClass: BoolXExecRC);
7152 BuildMI(BB&: LoopBB, I, MIMD: DL, MCID: TII.get(Opcode: LMC.AndOpc), DestReg: AndReg)
7153 .addReg(RegNo: CondReg)
7154 .addReg(RegNo: NewCondReg);
7155 CondReg = AndReg;
7156 }
7157 } // End for loop.
7158
7159 const auto *SScalarOpRC =
7160 TRI->getEquivalentSGPRClass(VRC: MRI.getRegClass(Reg: VScalarOp));
7161 Register SScalarOp = MRI.createVirtualRegister(RegClass: SScalarOpRC);
7162
7163 // Build scalar ScalarOp.
7164 auto Merge =
7165 BuildMI(BB&: LoopBB, I, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::REG_SEQUENCE), DestReg: SScalarOp);
7166 unsigned Channel = 0;
7167 for (Register Piece : ReadlanePieces) {
7168 Merge.addReg(RegNo: Piece).addImm(Val: TRI->getSubRegFromChannel(Channel: Channel++));
7169 }
7170
7171 // Update ScalarOp operand to use the SGPR ScalarOp.
7172 ScalarOp->setReg(SScalarOp);
7173 ScalarOp->setIsKill();
7174 }
7175 }
7176
7177 Register SaveExec = MRI.createVirtualRegister(RegClass: BoolXExecRC);
7178 MRI.setSimpleHint(VReg: SaveExec, PrefReg: CondReg);
7179
7180 // Update EXEC to matching lanes, saving original to SaveExec.
7181 BuildMI(BB&: LoopBB, I, MIMD: DL, MCID: TII.get(Opcode: LMC.AndSaveExecOpc), DestReg: SaveExec)
7182 .addReg(RegNo: CondReg, Flags: RegState::Kill);
7183
7184 // The original instruction is here; we insert the terminators after it.
7185 I = BodyBB.end();
7186
7187 // Update EXEC, switch all done bits to 0 and all todo bits to 1.
7188 BuildMI(BB&: BodyBB, I, MIMD: DL, MCID: TII.get(Opcode: LMC.XorTermOpc), DestReg: LMC.ExecReg)
7189 .addReg(RegNo: LMC.ExecReg)
7190 .addReg(RegNo: SaveExec);
7191
7192 BuildMI(BB&: BodyBB, I, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::SI_WATERFALL_LOOP)).addMBB(MBB: &LoopBB);
7193}
7194
7195// Build a waterfall loop around \p MI, replacing the VGPR \p ScalarOps
7196// registers with SGPRs by iterating over all unique values across all lanes.
7197// Returns the basic block (the loop body) that now contains \p MI.
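// The original block is split so that the resulting control flow is:
//
//   MBB -> LoopBB <-> BodyBB -> RemainderBB
//
// LoopBB re-reads the scalar operands, BodyBB holds \p MI together with the
// EXEC update, and RemainderBB restores SCC and EXEC and inherits MBB's old
// successors.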
7198static MachineBasicBlock *
7199loadMBUFScalarOperandsFromVGPR(const SIInstrInfo &TII, MachineInstr &MI,
7200 ArrayRef<MachineOperand *> ScalarOps,
7201 MachineDominatorTree *MDT,
7202 MachineBasicBlock::iterator Begin = nullptr,
7203 MachineBasicBlock::iterator End = nullptr) {
7204 MachineBasicBlock &MBB = *MI.getParent();
7205 MachineFunction &MF = *MBB.getParent();
7206 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
7207 const SIRegisterInfo *TRI = ST.getRegisterInfo();
7208 MachineRegisterInfo &MRI = MF.getRegInfo();
7209 if (!Begin.isValid())
7210 Begin = &MI;
7211 if (!End.isValid()) {
7212 End = &MI;
7213 ++End;
7214 }
7215 const DebugLoc &DL = MI.getDebugLoc();
7216 const AMDGPU::LaneMaskConstants &LMC = AMDGPU::LaneMaskConstants::get(ST);
7217 const auto *BoolXExecRC = TRI->getWaveMaskRegClass();
7218
7219 // Save SCC. Waterfall Loop may overwrite SCC.
7220 Register SaveSCCReg;
7221
7222 // FIXME: We should maintain SCC liveness while doing the FixSGPRCopies walk
7223 // rather than doing an unlimited scan everywhere.
7224 bool SCCNotDead =
7225 MBB.computeRegisterLiveness(TRI, Reg: AMDGPU::SCC, Before: MI,
7226 Neighborhood: std::numeric_limits<unsigned>::max()) !=
7227 MachineBasicBlock::LQR_Dead;
7228 if (SCCNotDead) {
7229 SaveSCCReg = MRI.createVirtualRegister(RegClass: &AMDGPU::SReg_32RegClass);
7230 BuildMI(BB&: MBB, I: Begin, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::S_CSELECT_B32), DestReg: SaveSCCReg)
7231 .addImm(Val: 1)
7232 .addImm(Val: 0);
7233 }
7234
7235 Register SaveExec = MRI.createVirtualRegister(RegClass: BoolXExecRC);
7236
7237 // Save the EXEC mask
7238 BuildMI(BB&: MBB, I: Begin, MIMD: DL, MCID: TII.get(Opcode: LMC.MovOpc), DestReg: SaveExec).addReg(RegNo: LMC.ExecReg);
7239
7240 // Killed uses in the instruction we are waterfalling around will be
7241 // incorrect due to the added control-flow.
7242 MachineBasicBlock::iterator AfterMI = MI;
7243 ++AfterMI;
7244 for (auto I = Begin; I != AfterMI; I++) {
7245 for (auto &MO : I->all_uses())
7246 MRI.clearKillFlags(Reg: MO.getReg());
7247 }
7248
7249 // To insert the loop we need to split the block. Move everything after this
7250 // point to a new block, and insert a new empty block between the two.
7251 MachineBasicBlock *LoopBB = MF.CreateMachineBasicBlock();
7252 MachineBasicBlock *BodyBB = MF.CreateMachineBasicBlock();
7253 MachineBasicBlock *RemainderBB = MF.CreateMachineBasicBlock();
7254 MachineFunction::iterator MBBI(MBB);
7255 ++MBBI;
7256
7257 MF.insert(MBBI, MBB: LoopBB);
7258 MF.insert(MBBI, MBB: BodyBB);
7259 MF.insert(MBBI, MBB: RemainderBB);
7260
7261 LoopBB->addSuccessor(Succ: BodyBB);
7262 BodyBB->addSuccessor(Succ: LoopBB);
7263 BodyBB->addSuccessor(Succ: RemainderBB);
7264
7265 // Move the range [Begin, MI] into BodyBB, and the remainder of the block
7266 // into RemainderBB.
7267 RemainderBB->transferSuccessorsAndUpdatePHIs(FromMBB: &MBB);
7268 RemainderBB->splice(Where: RemainderBB->begin(), Other: &MBB, From: End, To: MBB.end());
7269 BodyBB->splice(Where: BodyBB->begin(), Other: &MBB, From: Begin, To: MBB.end());
7270
7271 MBB.addSuccessor(Succ: LoopBB);
7272
7273 // Update dominators. We know that MBB immediately dominates LoopBB, that
7274 // LoopBB immediately dominates BodyBB, and BodyBB immediately dominates
7275 // RemainderBB. RemainderBB immediately dominates all of the successors
7276 // transferred to it from MBB that MBB used to properly dominate.
7277 if (MDT) {
7278 MDT->addNewBlock(BB: LoopBB, DomBB: &MBB);
7279 MDT->addNewBlock(BB: BodyBB, DomBB: LoopBB);
7280 MDT->addNewBlock(BB: RemainderBB, DomBB: BodyBB);
7281 for (auto &Succ : RemainderBB->successors()) {
7282 if (MDT->properlyDominates(A: &MBB, B: Succ)) {
7283 MDT->changeImmediateDominator(BB: Succ, NewBB: RemainderBB);
7284 }
7285 }
7286 }
7287
7288 emitLoadScalarOpsFromVGPRLoop(TII, MRI, LoopBB&: *LoopBB, BodyBB&: *BodyBB, DL, ScalarOps);
7289
7290 MachineBasicBlock::iterator First = RemainderBB->begin();
7291 // Restore SCC
7292 if (SCCNotDead) {
7293 BuildMI(BB&: *RemainderBB, I: First, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::S_CMP_LG_U32))
7294 .addReg(RegNo: SaveSCCReg, Flags: RegState::Kill)
7295 .addImm(Val: 0);
7296 }
7297
7298 // Restore the EXEC mask
7299 BuildMI(BB&: *RemainderBB, I: First, MIMD: DL, MCID: TII.get(Opcode: LMC.MovOpc), DestReg: LMC.ExecReg)
7300 .addReg(RegNo: SaveExec);
7301 return BodyBB;
7302}
7303
7304// Extract pointer from Rsrc and return a zero-value Rsrc replacement.
7305static std::tuple<unsigned, unsigned>
7306extractRsrcPtr(const SIInstrInfo &TII, MachineInstr &MI, MachineOperand &Rsrc) {
7307 MachineBasicBlock &MBB = *MI.getParent();
7308 MachineFunction &MF = *MBB.getParent();
7309 MachineRegisterInfo &MRI = MF.getRegInfo();
7310
7311 // Extract the ptr from the resource descriptor.
7312 unsigned RsrcPtr =
7313 TII.buildExtractSubReg(MI, MRI, SuperReg: Rsrc, SuperRC: &AMDGPU::VReg_128RegClass,
7314 SubIdx: AMDGPU::sub0_sub1, SubRC: &AMDGPU::VReg_64RegClass);
7315
7316 // Create an empty resource descriptor
7317 Register Zero64 = MRI.createVirtualRegister(RegClass: &AMDGPU::SReg_64RegClass);
7318 Register SRsrcFormatLo = MRI.createVirtualRegister(RegClass: &AMDGPU::SGPR_32RegClass);
7319 Register SRsrcFormatHi = MRI.createVirtualRegister(RegClass: &AMDGPU::SGPR_32RegClass);
7320 Register NewSRsrc = MRI.createVirtualRegister(RegClass: &AMDGPU::SGPR_128RegClass);
7321 uint64_t RsrcDataFormat = TII.getDefaultRsrcDataFormat();
7322
7323 // Zero64 = 0
7324 BuildMI(BB&: MBB, I&: MI, MIMD: MI.getDebugLoc(), MCID: TII.get(Opcode: AMDGPU::S_MOV_B64), DestReg: Zero64)
7325 .addImm(Val: 0);
7326
7327 // SRsrcFormatLo = RSRC_DATA_FORMAT{31-0}
7328 BuildMI(BB&: MBB, I&: MI, MIMD: MI.getDebugLoc(), MCID: TII.get(Opcode: AMDGPU::S_MOV_B32), DestReg: SRsrcFormatLo)
7329 .addImm(Val: Lo_32(Value: RsrcDataFormat));
7330
7331 // SRsrcFormatHi = RSRC_DATA_FORMAT{63-32}
7332 BuildMI(BB&: MBB, I&: MI, MIMD: MI.getDebugLoc(), MCID: TII.get(Opcode: AMDGPU::S_MOV_B32), DestReg: SRsrcFormatHi)
7333 .addImm(Val: Hi_32(Value: RsrcDataFormat));
7334
7335 // NewSRsrc = {Zero64, SRsrcFormat}
7336 BuildMI(BB&: MBB, I&: MI, MIMD: MI.getDebugLoc(), MCID: TII.get(Opcode: AMDGPU::REG_SEQUENCE), DestReg: NewSRsrc)
7337 .addReg(RegNo: Zero64)
7338 .addImm(Val: AMDGPU::sub0_sub1)
7339 .addReg(RegNo: SRsrcFormatLo)
7340 .addImm(Val: AMDGPU::sub2)
7341 .addReg(RegNo: SRsrcFormatHi)
7342 .addImm(Val: AMDGPU::sub3);
7343
7344 return std::tuple(RsrcPtr, NewSRsrc);
7345}
7346
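// Legalize all operands of \p MI, dispatching to the format-specific helpers
// above. This may split the containing block when a waterfall loop is needed;
// in that case the block that now contains \p MI is returned.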
7347MachineBasicBlock *
7348SIInstrInfo::legalizeOperands(MachineInstr &MI,
7349 MachineDominatorTree *MDT) const {
7350 MachineFunction &MF = *MI.getMF();
7351 MachineRegisterInfo &MRI = MF.getRegInfo();
7352 MachineBasicBlock *CreatedBB = nullptr;
7353
7354 // Legalize VOP2
7355 if (isVOP2(MI) || isVOPC(MI)) {
7356 legalizeOperandsVOP2(MRI, MI);
7357 return CreatedBB;
7358 }
7359
7360 // Legalize VOP3
7361 if (isVOP3(MI)) {
7362 legalizeOperandsVOP3(MRI, MI);
7363 return CreatedBB;
7364 }
7365
7366 // Legalize SMRD
7367 if (isSMRD(MI)) {
7368 legalizeOperandsSMRD(MRI, MI);
7369 return CreatedBB;
7370 }
7371
7372 // Legalize FLAT
7373 if (isFLAT(MI)) {
7374 legalizeOperandsFLAT(MRI, MI);
7375 return CreatedBB;
7376 }
7377
7378 // Legalize PHI
7379 // The register class of the operands must be the same type as the register
7380 // class of the output.
7381 if (MI.getOpcode() == AMDGPU::PHI) {
7382 const TargetRegisterClass *VRC = getOpRegClass(MI, OpNo: 0);
7383 assert(!RI.isSGPRClass(VRC));
7384
7385 // Update all the operands so they have the same type.
7386 for (unsigned I = 1, E = MI.getNumOperands(); I != E; I += 2) {
7387 MachineOperand &Op = MI.getOperand(i: I);
7388 if (!Op.isReg() || !Op.getReg().isVirtual())
7389 continue;
7390
7391 // MI is a PHI instruction.
7392 MachineBasicBlock *InsertBB = MI.getOperand(i: I + 1).getMBB();
7393 MachineBasicBlock::iterator Insert = InsertBB->getFirstTerminator();
7394
7395 // Avoid creating no-op copies with the same src and dst reg class. These
7396 // confuse some of the machine passes.
7397 legalizeGenericOperand(InsertMBB&: *InsertBB, I: Insert, DstRC: VRC, Op, MRI, DL: MI.getDebugLoc());
7398 }
7399 }
7400
7401 // REG_SEQUENCE doesn't really require operand legalization, but if one has a
7402 // VGPR dest type and SGPR sources, insert copies so all operands are
7403 // VGPRs. This seems to help operand folding / the register coalescer.
7404 if (MI.getOpcode() == AMDGPU::REG_SEQUENCE) {
7405 MachineBasicBlock *MBB = MI.getParent();
7406 const TargetRegisterClass *DstRC = getOpRegClass(MI, OpNo: 0);
7407 if (RI.hasVGPRs(RC: DstRC)) {
7408 // Update all the operands so they are VGPR register classes. These may
7409 // not be the same register class because REG_SEQUENCE supports mixing
7410 // subregister index types e.g. sub0_sub1 + sub2 + sub3
7411 for (unsigned I = 1, E = MI.getNumOperands(); I != E; I += 2) {
7412 MachineOperand &Op = MI.getOperand(i: I);
7413 if (!Op.isReg() || !Op.getReg().isVirtual())
7414 continue;
7415
7416 const TargetRegisterClass *OpRC = MRI.getRegClass(Reg: Op.getReg());
7417 const TargetRegisterClass *VRC = RI.getEquivalentVGPRClass(SRC: OpRC);
7418 if (VRC == OpRC)
7419 continue;
7420
7421 legalizeGenericOperand(InsertMBB&: *MBB, I: MI, DstRC: VRC, Op, MRI, DL: MI.getDebugLoc());
7422 Op.setIsKill();
7423 }
7424 }
7425
7426 return CreatedBB;
7427 }
7428
7429 // Legalize INSERT_SUBREG
7430 // src0 must have the same register class as dst
7431 if (MI.getOpcode() == AMDGPU::INSERT_SUBREG) {
7432 Register Dst = MI.getOperand(i: 0).getReg();
7433 Register Src0 = MI.getOperand(i: 1).getReg();
7434 const TargetRegisterClass *DstRC = MRI.getRegClass(Reg: Dst);
7435 const TargetRegisterClass *Src0RC = MRI.getRegClass(Reg: Src0);
7436 if (DstRC != Src0RC) {
7437 MachineBasicBlock *MBB = MI.getParent();
7438 MachineOperand &Op = MI.getOperand(i: 1);
7439 legalizeGenericOperand(InsertMBB&: *MBB, I: MI, DstRC, Op, MRI, DL: MI.getDebugLoc());
7440 }
7441 return CreatedBB;
7442 }
7443
7444 // Legalize SI_INIT_M0
7445 if (MI.getOpcode() == AMDGPU::SI_INIT_M0) {
7446 MachineOperand &Src = MI.getOperand(i: 0);
7447 if (Src.isReg() && RI.hasVectorRegisters(RC: MRI.getRegClass(Reg: Src.getReg())))
7448 Src.setReg(readlaneVGPRToSGPR(SrcReg: Src.getReg(), UseMI&: MI, MRI));
7449 return CreatedBB;
7450 }
7451
7452 // Legalize S_BITREPLICATE, S_QUADMASK, S_WQM and S_INVERSE_BALLOT
7453 if (MI.getOpcode() == AMDGPU::S_BITREPLICATE_B64_B32 ||
7454 MI.getOpcode() == AMDGPU::S_QUADMASK_B32 ||
7455 MI.getOpcode() == AMDGPU::S_QUADMASK_B64 ||
7456 MI.getOpcode() == AMDGPU::S_WQM_B32 ||
7457 MI.getOpcode() == AMDGPU::S_WQM_B64 ||
7458 MI.getOpcode() == AMDGPU::S_INVERSE_BALLOT_U32 ||
7459 MI.getOpcode() == AMDGPU::S_INVERSE_BALLOT_U64) {
7460 MachineOperand &Src = MI.getOperand(i: 1);
7461 if (Src.isReg() && RI.hasVectorRegisters(RC: MRI.getRegClass(Reg: Src.getReg())))
7462 Src.setReg(readlaneVGPRToSGPR(SrcReg: Src.getReg(), UseMI&: MI, MRI));
7463 return CreatedBB;
7464 }
7465
7466 // Legalize MIMG/VIMAGE/VSAMPLE and MUBUF/MTBUF for shaders.
7467 //
7468 // Shaders only generate MUBUF/MTBUF instructions via intrinsics or via
7469 // scratch memory access. In both cases, the legalization never involves
7470 // conversion to the addr64 form.
7471 if (isImage(MI) || (AMDGPU::isGraphics(CC: MF.getFunction().getCallingConv()) &&
7472 (isMUBUF(MI) || isMTBUF(MI)))) {
7473 AMDGPU::OpName RSrcOpName = (isVIMAGE(MI) || isVSAMPLE(MI))
7474 ? AMDGPU::OpName::rsrc
7475 : AMDGPU::OpName::srsrc;
7476 MachineOperand *SRsrc = getNamedOperand(MI, OperandName: RSrcOpName);
7477 if (SRsrc && !RI.isSGPRClass(RC: MRI.getRegClass(Reg: SRsrc->getReg())))
7478 CreatedBB = loadMBUFScalarOperandsFromVGPR(TII: *this, MI, ScalarOps: {SRsrc}, MDT);
7479
7480 AMDGPU::OpName SampOpName =
7481 isMIMG(MI) ? AMDGPU::OpName::ssamp : AMDGPU::OpName::samp;
7482 MachineOperand *SSamp = getNamedOperand(MI, OperandName: SampOpName);
7483 if (SSamp && !RI.isSGPRClass(RC: MRI.getRegClass(Reg: SSamp->getReg())))
7484 CreatedBB = loadMBUFScalarOperandsFromVGPR(TII: *this, MI, ScalarOps: {SSamp}, MDT);
7485
7486 return CreatedBB;
7487 }
7488
7489 // Legalize SI_CALL
7490 if (MI.getOpcode() == AMDGPU::SI_CALL_ISEL) {
7491 MachineOperand *Dest = &MI.getOperand(i: 0);
7492 if (!RI.isSGPRClass(RC: MRI.getRegClass(Reg: Dest->getReg()))) {
7493 // Move everything between ADJCALLSTACKUP and ADJCALLSTACKDOWN, as well as
7494 // the following copies, into the loop block; copies from and to physical
7495 // registers need to move along with the call.
7496 unsigned FrameSetupOpcode = getCallFrameSetupOpcode();
7497 unsigned FrameDestroyOpcode = getCallFrameDestroyOpcode();
7498
7499 // Also move the copies to physical registers into the loop block
7500 MachineBasicBlock &MBB = *MI.getParent();
7501 MachineBasicBlock::iterator Start(&MI);
7502 while (Start->getOpcode() != FrameSetupOpcode)
7503 --Start;
7504 MachineBasicBlock::iterator End(&MI);
7505 while (End->getOpcode() != FrameDestroyOpcode)
7506 ++End;
7507 // Also include following copies of the return value
7508 ++End;
7509 while (End != MBB.end() && End->isCopy() && End->getOperand(i: 1).isReg() &&
7510 MI.definesRegister(Reg: End->getOperand(i: 1).getReg(), /*TRI=*/nullptr))
7511 ++End;
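      // Hand the whole range [Start, End) to the waterfall-loop helper so the
      // call sequence executes with an SGPR-uniform call target.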
7512 CreatedBB =
7513 loadMBUFScalarOperandsFromVGPR(TII: *this, MI, ScalarOps: {Dest}, MDT, Begin: Start, End);
7514 }
7515 }
7516
7517 // Legalize s_sleep_var.
7518 if (MI.getOpcode() == AMDGPU::S_SLEEP_VAR) {
7519 const DebugLoc &DL = MI.getDebugLoc();
7520 Register Reg = MRI.createVirtualRegister(RegClass: &AMDGPU::SReg_32_XM0RegClass);
7521 int Src0Idx =
7522 AMDGPU::getNamedOperandIdx(Opcode: MI.getOpcode(), Name: AMDGPU::OpName::src0);
7523 MachineOperand &Src0 = MI.getOperand(i: Src0Idx);
7524 BuildMI(BB&: *MI.getParent(), I&: MI, MIMD: DL, MCID: get(Opcode: AMDGPU::V_READFIRSTLANE_B32), DestReg: Reg)
7525 .add(MO: Src0);
7526 Src0.ChangeToRegister(Reg, isDef: false);
7527 return nullptr;
7528 }
7529
7530 // Legalize TENSOR_LOAD_TO_LDS, TENSOR_LOAD_TO_LDS_D2, TENSOR_STORE_FROM_LDS,
7531 // TENSOR_STORE_FROM_LDS_D2. All their operands are scalar.
7532 if (MI.getOpcode() == AMDGPU::TENSOR_LOAD_TO_LDS ||
7533 MI.getOpcode() == AMDGPU::TENSOR_LOAD_TO_LDS_D2 ||
7534 MI.getOpcode() == AMDGPU::TENSOR_STORE_FROM_LDS ||
7535 MI.getOpcode() == AMDGPU::TENSOR_STORE_FROM_LDS_D2) {
7536 for (MachineOperand &Src : MI.explicit_operands()) {
7537 if (Src.isReg() && RI.hasVectorRegisters(RC: MRI.getRegClass(Reg: Src.getReg())))
7538 Src.setReg(readlaneVGPRToSGPR(SrcReg: Src.getReg(), UseMI&: MI, MRI));
7539 }
7540 return CreatedBB;
7541 }
7542
7543 // Legalize MUBUF instructions.
7544 bool isSoffsetLegal = true;
7545 int SoffsetIdx =
7546 AMDGPU::getNamedOperandIdx(Opcode: MI.getOpcode(), Name: AMDGPU::OpName::soffset);
7547 if (SoffsetIdx != -1) {
7548 MachineOperand *Soffset = &MI.getOperand(i: SoffsetIdx);
7549 if (Soffset->isReg() && Soffset->getReg().isVirtual() &&
7550 !RI.isSGPRClass(RC: MRI.getRegClass(Reg: Soffset->getReg()))) {
7551 isSoffsetLegal = false;
7552 }
7553 }
7554
7555 bool isRsrcLegal = true;
7556 int RsrcIdx =
7557 AMDGPU::getNamedOperandIdx(Opcode: MI.getOpcode(), Name: AMDGPU::OpName::srsrc);
7558 if (RsrcIdx != -1) {
7559 MachineOperand *Rsrc = &MI.getOperand(i: RsrcIdx);
7560 if (Rsrc->isReg() && !RI.isSGPRReg(MRI, Reg: Rsrc->getReg()))
7561 isRsrcLegal = false;
7562 }
7563
7564 // The operands are legal.
7565 if (isRsrcLegal && isSoffsetLegal)
7566 return CreatedBB;
7567
7568 if (!isRsrcLegal) {
7569 // Legalize a VGPR Rsrc
7570 //
7571 // If the instruction is _ADDR64, we can avoid a waterfall by extracting
7572 // the base pointer from the VGPR Rsrc, adding it to the VAddr, then using
7573 // a zero-value SRsrc.
7574 //
7575 // If the instruction is _OFFSET (both idxen and offen disabled), and we
7576 // support ADDR64 instructions, we can convert to ADDR64 and do the same as
7577 // above.
7578 //
7579 // Otherwise we are on non-ADDR64 hardware and/or the instruction uses
7580 // idxen/offen/bothen, so we fall back to a waterfall loop.
7581
7582 MachineOperand *Rsrc = &MI.getOperand(i: RsrcIdx);
7583 MachineBasicBlock &MBB = *MI.getParent();
7584
7585 MachineOperand *VAddr = getNamedOperand(MI, OperandName: AMDGPU::OpName::vaddr);
7586 if (VAddr && AMDGPU::getIfAddr64Inst(Opcode: MI.getOpcode()) != -1) {
7587 // This is already an ADDR64 instruction so we need to add the pointer
7588 // extracted from the resource descriptor to the current value of VAddr.
7589 Register NewVAddrLo = MRI.createVirtualRegister(RegClass: &AMDGPU::VGPR_32RegClass);
7590 Register NewVAddrHi = MRI.createVirtualRegister(RegClass: &AMDGPU::VGPR_32RegClass);
7591 Register NewVAddr = MRI.createVirtualRegister(RegClass: &AMDGPU::VReg_64RegClass);
7592
7593 const auto *BoolXExecRC = RI.getWaveMaskRegClass();
7594 Register CondReg0 = MRI.createVirtualRegister(RegClass: BoolXExecRC);
7595 Register CondReg1 = MRI.createVirtualRegister(RegClass: BoolXExecRC);
7596
7597 unsigned RsrcPtr, NewSRsrc;
7598 std::tie(args&: RsrcPtr, args&: NewSRsrc) = extractRsrcPtr(TII: *this, MI, Rsrc&: *Rsrc);
7599
7600 // NewVaddrLo = RsrcPtr:sub0 + VAddr:sub0
7601 const DebugLoc &DL = MI.getDebugLoc();
7602 BuildMI(BB&: MBB, I&: MI, MIMD: DL, MCID: get(Opcode: AMDGPU::V_ADD_CO_U32_e64), DestReg: NewVAddrLo)
7603 .addDef(RegNo: CondReg0)
7604 .addReg(RegNo: RsrcPtr, Flags: {}, SubReg: AMDGPU::sub0)
7605 .addReg(RegNo: VAddr->getReg(), Flags: {}, SubReg: AMDGPU::sub0)
7606 .addImm(Val: 0);
7607
7608 // NewVaddrHi = RsrcPtr:sub1 + VAddr:sub1
7609 BuildMI(BB&: MBB, I&: MI, MIMD: DL, MCID: get(Opcode: AMDGPU::V_ADDC_U32_e64), DestReg: NewVAddrHi)
7610 .addDef(RegNo: CondReg1, Flags: RegState::Dead)
7611 .addReg(RegNo: RsrcPtr, Flags: {}, SubReg: AMDGPU::sub1)
7612 .addReg(RegNo: VAddr->getReg(), Flags: {}, SubReg: AMDGPU::sub1)
7613 .addReg(RegNo: CondReg0, Flags: RegState::Kill)
7614 .addImm(Val: 0);
7615
7616 // NewVaddr = {NewVaddrHi, NewVaddrLo}
7617 BuildMI(BB&: MBB, I&: MI, MIMD: MI.getDebugLoc(), MCID: get(Opcode: AMDGPU::REG_SEQUENCE), DestReg: NewVAddr)
7618 .addReg(RegNo: NewVAddrLo)
7619 .addImm(Val: AMDGPU::sub0)
7620 .addReg(RegNo: NewVAddrHi)
7621 .addImm(Val: AMDGPU::sub1);
7622
7623 VAddr->setReg(NewVAddr);
7624 Rsrc->setReg(NewSRsrc);
7625 } else if (!VAddr && ST.hasAddr64()) {
7626 // This instruction is the _OFFSET variant, so we need to convert it to
7627 // ADDR64.
7628 assert(ST.getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS &&
7629 "FIXME: Need to emit flat atomics here");
7630
7631 unsigned RsrcPtr, NewSRsrc;
7632 std::tie(args&: RsrcPtr, args&: NewSRsrc) = extractRsrcPtr(TII: *this, MI, Rsrc&: *Rsrc);
7633
7634 Register NewVAddr = MRI.createVirtualRegister(RegClass: &AMDGPU::VReg_64RegClass);
7635 MachineOperand *VData = getNamedOperand(MI, OperandName: AMDGPU::OpName::vdata);
7636 MachineOperand *Offset = getNamedOperand(MI, OperandName: AMDGPU::OpName::offset);
7637 MachineOperand *SOffset = getNamedOperand(MI, OperandName: AMDGPU::OpName::soffset);
7638 unsigned Addr64Opcode = AMDGPU::getAddr64Inst(Opcode: MI.getOpcode());
7639
7640 // Atomics with return have an additional tied operand and are
7641 // missing some of the special bits.
7642 MachineOperand *VDataIn = getNamedOperand(MI, OperandName: AMDGPU::OpName::vdata_in);
7643 MachineInstr *Addr64;
7644
7645 if (!VDataIn) {
7646 // Regular buffer load / store.
7647 MachineInstrBuilder MIB =
7648 BuildMI(BB&: MBB, I&: MI, MIMD: MI.getDebugLoc(), MCID: get(Opcode: Addr64Opcode))
7649 .add(MO: *VData)
7650 .addReg(RegNo: NewVAddr)
7651 .addReg(RegNo: NewSRsrc)
7652 .add(MO: *SOffset)
7653 .add(MO: *Offset);
7654
7655 if (const MachineOperand *CPol =
7656 getNamedOperand(MI, OperandName: AMDGPU::OpName::cpol)) {
7657 MIB.addImm(Val: CPol->getImm());
7658 }
7659
7660 if (const MachineOperand *TFE =
7661 getNamedOperand(MI, OperandName: AMDGPU::OpName::tfe)) {
7662 MIB.addImm(Val: TFE->getImm());
7663 }
7664
7665 MIB.addImm(Val: getNamedImmOperand(MI, OperandName: AMDGPU::OpName::swz));
7666
7667 MIB.cloneMemRefs(OtherMI: MI);
7668 Addr64 = MIB;
7669 } else {
7670 // Atomics with return.
7671 Addr64 = BuildMI(BB&: MBB, I&: MI, MIMD: MI.getDebugLoc(), MCID: get(Opcode: Addr64Opcode))
7672 .add(MO: *VData)
7673 .add(MO: *VDataIn)
7674 .addReg(RegNo: NewVAddr)
7675 .addReg(RegNo: NewSRsrc)
7676 .add(MO: *SOffset)
7677 .add(MO: *Offset)
7678 .addImm(Val: getNamedImmOperand(MI, OperandName: AMDGPU::OpName::cpol))
7679 .cloneMemRefs(OtherMI: MI);
7680 }
7681
7682 MI.removeFromParent();
7683
7684 // NewVAddr = {RsrcPtr:sub1, RsrcPtr:sub0}
7685 BuildMI(BB&: MBB, I: Addr64, MIMD: Addr64->getDebugLoc(), MCID: get(Opcode: AMDGPU::REG_SEQUENCE),
7686 DestReg: NewVAddr)
7687 .addReg(RegNo: RsrcPtr, Flags: {}, SubReg: AMDGPU::sub0)
7688 .addImm(Val: AMDGPU::sub0)
7689 .addReg(RegNo: RsrcPtr, Flags: {}, SubReg: AMDGPU::sub1)
7690 .addImm(Val: AMDGPU::sub1);
7691 } else {
7692 // Legalize a VGPR Rsrc and soffset together.
7693 if (!isSoffsetLegal) {
7694 MachineOperand *Soffset = getNamedOperand(MI, OperandName: AMDGPU::OpName::soffset);
7695 CreatedBB =
7696 loadMBUFScalarOperandsFromVGPR(TII: *this, MI, ScalarOps: {Rsrc, Soffset}, MDT);
7697 return CreatedBB;
7698 }
7699 CreatedBB = loadMBUFScalarOperandsFromVGPR(TII: *this, MI, ScalarOps: {Rsrc}, MDT);
7700 return CreatedBB;
7701 }
7702 }
7703
7704 // Legalize a VGPR soffset.
7705 if (!isSoffsetLegal) {
7706 MachineOperand *Soffset = getNamedOperand(MI, OperandName: AMDGPU::OpName::soffset);
7707 CreatedBB = loadMBUFScalarOperandsFromVGPR(TII: *this, MI, ScalarOps: {Soffset}, MDT);
7708 return CreatedBB;
7709 }
7710 return CreatedBB;
7711}
7712
7713void SIInstrWorklist::insert(MachineInstr *MI) {
7714 InstrList.insert(X: MI);
7715 // Add MBUF instructions to the deferred list.
7716 int RsrcIdx =
7717 AMDGPU::getNamedOperandIdx(Opcode: MI->getOpcode(), Name: AMDGPU::OpName::srsrc);
7718 if (RsrcIdx != -1) {
7719 DeferredList.insert(X: MI);
7720 }
7721}
7722
7723bool SIInstrWorklist::isDeferred(MachineInstr *MI) {
7724 return DeferredList.contains(key: MI);
7725}
7726
7727// Legalize size mismatches between 16-bit and 32-bit registers in v2s copy
7728// lowering (changing sgpr to vgpr).
7729// This is mainly caused by the 16-bit SALU and the 16-bit VALU using registers
7730// of different sizes. We need to legalize the operand sizes during the vgpr
7731// lowering chain. This can be removed once sgpr16 is in place.
7732void SIInstrInfo::legalizeOperandsVALUt16(MachineInstr &MI, unsigned OpIdx,
7733 MachineRegisterInfo &MRI) const {
7734 if (!ST.useRealTrue16Insts())
7735 return;
7736
7737 unsigned Opcode = MI.getOpcode();
7738 MachineBasicBlock *MBB = MI.getParent();
7739 // Legalize operands and check for size mismatch
7740 if (!OpIdx || OpIdx >= MI.getNumExplicitOperands() ||
7741 OpIdx >= get(Opcode).getNumOperands() ||
7742 get(Opcode).operands()[OpIdx].RegClass == -1)
7743 return;
7744
7745 MachineOperand &Op = MI.getOperand(i: OpIdx);
7746 if (!Op.isReg() || !Op.getReg().isVirtual())
7747 return;
7748
7749 const TargetRegisterClass *CurrRC = MRI.getRegClass(Reg: Op.getReg());
7750 if (!RI.isVGPRClass(RC: CurrRC))
7751 return;
7752
7753 int16_t RCID = getOpRegClassID(OpInfo: get(Opcode).operands()[OpIdx]);
7754 const TargetRegisterClass *ExpectedRC = RI.getRegClass(i: RCID);
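  // If the operand is a 32-bit VGPR but a 16-bit class is expected, refer to
  // its lo16 half; if it is a 16-bit VGPR but a 32-bit class is expected,
  // widen it with an undef hi16 half via a REG_SEQUENCE.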
7755 if (RI.getMatchingSuperRegClass(A: CurrRC, B: ExpectedRC, Idx: AMDGPU::lo16)) {
7756 Op.setSubReg(AMDGPU::lo16);
7757 } else if (RI.getMatchingSuperRegClass(A: ExpectedRC, B: CurrRC, Idx: AMDGPU::lo16)) {
7758 const DebugLoc &DL = MI.getDebugLoc();
7759 Register NewDstReg = MRI.createVirtualRegister(RegClass: &AMDGPU::VGPR_32RegClass);
7760 Register Undef = MRI.createVirtualRegister(RegClass: &AMDGPU::VGPR_16RegClass);
7761 BuildMI(BB&: *MBB, I&: MI, MIMD: DL, MCID: get(Opcode: AMDGPU::IMPLICIT_DEF), DestReg: Undef);
7762 BuildMI(BB&: *MBB, I&: MI, MIMD: DL, MCID: get(Opcode: AMDGPU::REG_SEQUENCE), DestReg: NewDstReg)
7763 .addReg(RegNo: Op.getReg())
7764 .addImm(Val: AMDGPU::lo16)
7765 .addReg(RegNo: Undef)
7766 .addImm(Val: AMDGPU::hi16);
7767 Op.setReg(NewDstReg);
7768 }
7769}
7770void SIInstrInfo::legalizeOperandsVALUt16(MachineInstr &MI,
7771 MachineRegisterInfo &MRI) const {
7772 for (unsigned OpIdx = 1; OpIdx < MI.getNumExplicitOperands(); OpIdx++)
7773 legalizeOperandsVALUt16(MI, OpIdx, MRI);
7774}
7775
7776void SIInstrInfo::moveToVALU(SIInstrWorklist &Worklist,
7777 MachineDominatorTree *MDT) const {
7778
7779 while (!Worklist.empty()) {
7780 MachineInstr &Inst = *Worklist.top();
7781 Worklist.erase_top();
7782 // Skip MachineInstrs in the deferred list.
7783 if (Worklist.isDeferred(MI: &Inst))
7784 continue;
7785 moveToVALUImpl(Worklist, MDT, Inst);
7786 }
7787
7788 // The deferred list of instructions will be processed once
7789 // all the MachineInstrs in the worklist are done.
7790 for (MachineInstr *Inst : Worklist.getDeferredList()) {
7791 moveToVALUImpl(Worklist, MDT, Inst&: *Inst);
7792 assert(Worklist.empty() &&
7793 "Deferred MachineInstr are not supposed to re-populate worklist");
7794 }
7795}
7796
7797void SIInstrInfo::moveToVALUImpl(SIInstrWorklist &Worklist,
7798 MachineDominatorTree *MDT,
7799 MachineInstr &Inst) const {
7800
7801 MachineBasicBlock *MBB = Inst.getParent();
7802 if (!MBB)
7803 return;
7804 MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
7805 unsigned Opcode = Inst.getOpcode();
7806 unsigned NewOpcode = getVALUOp(MI: Inst);
7807 const DebugLoc &DL = Inst.getDebugLoc();
7808
7809 // Handle some special cases
7810 switch (Opcode) {
7811 default:
7812 break;
7813 case AMDGPU::S_ADD_I32:
7814 case AMDGPU::S_SUB_I32: {
7815 // FIXME: The u32 versions currently selected use the carry.
7816 bool Changed;
7817 MachineBasicBlock *CreatedBBTmp = nullptr;
7818 std::tie(args&: Changed, args&: CreatedBBTmp) = moveScalarAddSub(Worklist, Inst, MDT);
7819 if (Changed)
7820 return;
7821
7822 // Default handling
7823 break;
7824 }
7825
7826 case AMDGPU::S_MUL_U64:
7827 if (ST.hasVectorMulU64()) {
7828 NewOpcode = AMDGPU::V_MUL_U64_e64;
7829 break;
7830 }
7831 // Split s_mul_u64 into 32-bit vector multiplications.
7832 splitScalarSMulU64(Worklist, Inst, MDT);
7833 Inst.eraseFromParent();
7834 return;
7835
7836 case AMDGPU::S_MUL_U64_U32_PSEUDO:
7837 case AMDGPU::S_MUL_I64_I32_PSEUDO:
7838 // This is a special case of s_mul_u64 where all the operands are either
7839 // zero extended or sign extended.
7840 splitScalarSMulPseudo(Worklist, Inst, MDT);
7841 Inst.eraseFromParent();
7842 return;
7843
7844 case AMDGPU::S_AND_B64:
7845 splitScalar64BitBinaryOp(Worklist, Inst, Opcode: AMDGPU::S_AND_B32, MDT);
7846 Inst.eraseFromParent();
7847 return;
7848
7849 case AMDGPU::S_OR_B64:
7850 splitScalar64BitBinaryOp(Worklist, Inst, Opcode: AMDGPU::S_OR_B32, MDT);
7851 Inst.eraseFromParent();
7852 return;
7853
7854 case AMDGPU::S_XOR_B64:
7855 splitScalar64BitBinaryOp(Worklist, Inst, Opcode: AMDGPU::S_XOR_B32, MDT);
7856 Inst.eraseFromParent();
7857 return;
7858
7859 case AMDGPU::S_NAND_B64:
7860 splitScalar64BitBinaryOp(Worklist, Inst, Opcode: AMDGPU::S_NAND_B32, MDT);
7861 Inst.eraseFromParent();
7862 return;
7863
7864 case AMDGPU::S_NOR_B64:
7865 splitScalar64BitBinaryOp(Worklist, Inst, Opcode: AMDGPU::S_NOR_B32, MDT);
7866 Inst.eraseFromParent();
7867 return;
7868
7869 case AMDGPU::S_XNOR_B64:
7870 if (ST.hasDLInsts())
7871 splitScalar64BitBinaryOp(Worklist, Inst, Opcode: AMDGPU::S_XNOR_B32, MDT);
7872 else
7873 splitScalar64BitXnor(Worklist, Inst, MDT);
7874 Inst.eraseFromParent();
7875 return;
7876
7877 case AMDGPU::S_ANDN2_B64:
7878 splitScalar64BitBinaryOp(Worklist, Inst, Opcode: AMDGPU::S_ANDN2_B32, MDT);
7879 Inst.eraseFromParent();
7880 return;
7881
7882 case AMDGPU::S_ORN2_B64:
7883 splitScalar64BitBinaryOp(Worklist, Inst, Opcode: AMDGPU::S_ORN2_B32, MDT);
7884 Inst.eraseFromParent();
7885 return;
7886
7887 case AMDGPU::S_BREV_B64:
7888 splitScalar64BitUnaryOp(Worklist, Inst, Opcode: AMDGPU::S_BREV_B32, Swap: true);
7889 Inst.eraseFromParent();
7890 return;
7891
7892 case AMDGPU::S_NOT_B64:
7893 splitScalar64BitUnaryOp(Worklist, Inst, Opcode: AMDGPU::S_NOT_B32);
7894 Inst.eraseFromParent();
7895 return;
7896
7897 case AMDGPU::S_BCNT1_I32_B64:
7898 splitScalar64BitBCNT(Worklist, Inst);
7899 Inst.eraseFromParent();
7900 return;
7901
7902 case AMDGPU::S_BFE_I64:
7903 splitScalar64BitBFE(Worklist, Inst);
7904 Inst.eraseFromParent();
7905 return;
7906
7907 case AMDGPU::S_FLBIT_I32_B64:
7908 splitScalar64BitCountOp(Worklist, Inst, Opcode: AMDGPU::V_FFBH_U32_e32);
7909 Inst.eraseFromParent();
7910 return;
7911 case AMDGPU::S_FF1_I32_B64:
7912 splitScalar64BitCountOp(Worklist, Inst, Opcode: AMDGPU::V_FFBL_B32_e32);
7913 Inst.eraseFromParent();
7914 return;
7915
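  // On targets with only the *REV VALU shifts, the shift amount is src0 rather
  // than src1, so the operands are swapped when moving these SALU shifts to
  // the VALU.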
7916 case AMDGPU::S_LSHL_B32:
7917 if (ST.hasOnlyRevVALUShifts()) {
7918 NewOpcode = AMDGPU::V_LSHLREV_B32_e64;
7919 swapOperands(Inst);
7920 }
7921 break;
7922 case AMDGPU::S_ASHR_I32:
7923 if (ST.hasOnlyRevVALUShifts()) {
7924 NewOpcode = AMDGPU::V_ASHRREV_I32_e64;
7925 swapOperands(Inst);
7926 }
7927 break;
7928 case AMDGPU::S_LSHR_B32:
7929 if (ST.hasOnlyRevVALUShifts()) {
7930 NewOpcode = AMDGPU::V_LSHRREV_B32_e64;
7931 swapOperands(Inst);
7932 }
7933 break;
7934 case AMDGPU::S_LSHL_B64:
7935 if (ST.hasOnlyRevVALUShifts()) {
7936 NewOpcode = ST.getGeneration() >= AMDGPUSubtarget::GFX12
7937 ? AMDGPU::V_LSHLREV_B64_pseudo_e64
7938 : AMDGPU::V_LSHLREV_B64_e64;
7939 swapOperands(Inst);
7940 }
7941 break;
7942 case AMDGPU::S_ASHR_I64:
7943 if (ST.hasOnlyRevVALUShifts()) {
7944 NewOpcode = AMDGPU::V_ASHRREV_I64_e64;
7945 swapOperands(Inst);
7946 }
7947 break;
7948 case AMDGPU::S_LSHR_B64:
7949 if (ST.hasOnlyRevVALUShifts()) {
7950 NewOpcode = AMDGPU::V_LSHRREV_B64_e64;
7951 swapOperands(Inst);
7952 }
7953 break;
7954
7955 case AMDGPU::S_ABS_I32:
7956 lowerScalarAbs(Worklist, Inst);
7957 Inst.eraseFromParent();
7958 return;
7959
7960 case AMDGPU::S_ABSDIFF_I32:
7961 lowerScalarAbsDiff(Worklist, Inst);
7962 Inst.eraseFromParent();
7963 return;
7964
7965 case AMDGPU::S_CBRANCH_SCC0:
7966 case AMDGPU::S_CBRANCH_SCC1: {
7967 // Clear unused bits of vcc
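    // vcc = exec & cond, so lanes outside the exec mask cannot influence the
    // branch condition.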
7968 Register CondReg = Inst.getOperand(i: 1).getReg();
7969 bool IsSCC = CondReg == AMDGPU::SCC;
7970 const AMDGPU::LaneMaskConstants &LMC = AMDGPU::LaneMaskConstants::get(ST);
7971 BuildMI(BB&: *MBB, I&: Inst, MIMD: Inst.getDebugLoc(), MCID: get(Opcode: LMC.AndOpc), DestReg: LMC.VccReg)
7972 .addReg(RegNo: LMC.ExecReg)
7973 .addReg(RegNo: IsSCC ? LMC.VccReg : CondReg);
7974 Inst.removeOperand(OpNo: 1);
7975 } break;
7976
7977 case AMDGPU::S_BFE_U64:
7978 case AMDGPU::S_BFM_B64:
7979 llvm_unreachable("Moving this op to VALU not implemented");
7980
7981 case AMDGPU::S_PACK_LL_B32_B16:
7982 case AMDGPU::S_PACK_LH_B32_B16:
7983 case AMDGPU::S_PACK_HL_B32_B16:
7984 case AMDGPU::S_PACK_HH_B32_B16:
7985 movePackToVALU(Worklist, MRI, Inst);
7986 Inst.eraseFromParent();
7987 return;
7988
7989 case AMDGPU::S_XNOR_B32:
7990 lowerScalarXnor(Worklist, Inst);
7991 Inst.eraseFromParent();
7992 return;
7993
7994 case AMDGPU::S_NAND_B32:
7995 splitScalarNotBinop(Worklist, Inst, Opcode: AMDGPU::S_AND_B32);
7996 Inst.eraseFromParent();
7997 return;
7998
7999 case AMDGPU::S_NOR_B32:
8000 splitScalarNotBinop(Worklist, Inst, Opcode: AMDGPU::S_OR_B32);
8001 Inst.eraseFromParent();
8002 return;
8003
8004 case AMDGPU::S_ANDN2_B32:
8005 splitScalarBinOpN2(Worklist, Inst, Opcode: AMDGPU::S_AND_B32);
8006 Inst.eraseFromParent();
8007 return;
8008
8009 case AMDGPU::S_ORN2_B32:
8010 splitScalarBinOpN2(Worklist, Inst, Opcode: AMDGPU::S_OR_B32);
8011 Inst.eraseFromParent();
8012 return;
8013
8014 // TODO: remove as soon as everything is ready
8015 // to replace VGPR to SGPR copy with V_READFIRSTLANEs.
8016 // S_ADD/SUB_CO_PSEUDO as well as S_UADDO/USUBO_PSEUDO
8017 // can only be selected from the uniform SDNode.
8018 case AMDGPU::S_ADD_CO_PSEUDO:
8019 case AMDGPU::S_SUB_CO_PSEUDO: {
8020 unsigned Opc = (Inst.getOpcode() == AMDGPU::S_ADD_CO_PSEUDO)
8021 ? AMDGPU::V_ADDC_U32_e64
8022 : AMDGPU::V_SUBB_U32_e64;
8023 const auto *CarryRC = RI.getWaveMaskRegClass();
8024
8025 Register CarryInReg = Inst.getOperand(i: 4).getReg();
8026 if (!MRI.constrainRegClass(Reg: CarryInReg, RC: CarryRC)) {
8027 Register NewCarryReg = MRI.createVirtualRegister(RegClass: CarryRC);
8028 BuildMI(BB&: *MBB, I&: Inst, MIMD: Inst.getDebugLoc(), MCID: get(Opcode: AMDGPU::COPY), DestReg: NewCarryReg)
8029 .addReg(RegNo: CarryInReg);
8030 }
8031
8032 Register CarryOutReg = Inst.getOperand(i: 1).getReg();
8033
8034 Register DestReg = MRI.createVirtualRegister(RegClass: RI.getEquivalentVGPRClass(
8035 SRC: MRI.getRegClass(Reg: Inst.getOperand(i: 0).getReg())));
8036 MachineInstr *CarryOp =
8037 BuildMI(BB&: *MBB, I: &Inst, MIMD: Inst.getDebugLoc(), MCID: get(Opcode: Opc), DestReg)
8038 .addReg(RegNo: CarryOutReg, Flags: RegState::Define)
8039 .add(MO: Inst.getOperand(i: 2))
8040 .add(MO: Inst.getOperand(i: 3))
8041 .addReg(RegNo: CarryInReg)
8042 .addImm(Val: 0);
8043 legalizeOperands(MI&: *CarryOp);
8044 MRI.replaceRegWith(FromReg: Inst.getOperand(i: 0).getReg(), ToReg: DestReg);
8045 addUsersToMoveToVALUWorklist(Reg: DestReg, MRI, Worklist);
8046 Inst.eraseFromParent();
8047 }
8048 return;
8049 case AMDGPU::S_UADDO_PSEUDO:
8050 case AMDGPU::S_USUBO_PSEUDO: {
8051 MachineOperand &Dest0 = Inst.getOperand(i: 0);
8052 MachineOperand &Dest1 = Inst.getOperand(i: 1);
8053 MachineOperand &Src0 = Inst.getOperand(i: 2);
8054 MachineOperand &Src1 = Inst.getOperand(i: 3);
8055
8056 unsigned Opc = (Inst.getOpcode() == AMDGPU::S_UADDO_PSEUDO)
8057 ? AMDGPU::V_ADD_CO_U32_e64
8058 : AMDGPU::V_SUB_CO_U32_e64;
8059 const TargetRegisterClass *NewRC =
8060 RI.getEquivalentVGPRClass(SRC: MRI.getRegClass(Reg: Dest0.getReg()));
8061 Register DestReg = MRI.createVirtualRegister(RegClass: NewRC);
8062 MachineInstr *NewInstr = BuildMI(BB&: *MBB, I: &Inst, MIMD: DL, MCID: get(Opcode: Opc), DestReg)
8063 .addReg(RegNo: Dest1.getReg(), Flags: RegState::Define)
8064 .add(MO: Src0)
8065 .add(MO: Src1)
8066 .addImm(Val: 0); // clamp bit
8067
8068 legalizeOperands(MI&: *NewInstr, MDT);
8069 MRI.replaceRegWith(FromReg: Dest0.getReg(), ToReg: DestReg);
8070 addUsersToMoveToVALUWorklist(Reg: DestReg, MRI, Worklist);
8071 Inst.eraseFromParent();
8072 }
8073 return;
8074 case AMDGPU::S_LSHL1_ADD_U32:
8075 case AMDGPU::S_LSHL2_ADD_U32:
8076 case AMDGPU::S_LSHL3_ADD_U32:
8077 case AMDGPU::S_LSHL4_ADD_U32: {
8078 MachineOperand &Dest = Inst.getOperand(i: 0);
8079 MachineOperand &Src0 = Inst.getOperand(i: 1);
8080 MachineOperand &Src1 = Inst.getOperand(i: 2);
8081 unsigned ShiftAmt = (Opcode == AMDGPU::S_LSHL1_ADD_U32 ? 1
8082 : Opcode == AMDGPU::S_LSHL2_ADD_U32 ? 2
8083 : Opcode == AMDGPU::S_LSHL3_ADD_U32 ? 3
8084 : 4);
8085
8086 const TargetRegisterClass *NewRC =
8087 RI.getEquivalentVGPRClass(SRC: MRI.getRegClass(Reg: Dest.getReg()));
8088 Register DestReg = MRI.createVirtualRegister(RegClass: NewRC);
8089 MachineInstr *NewInstr =
8090 BuildMI(BB&: *MBB, I: &Inst, MIMD: DL, MCID: get(Opcode: AMDGPU::V_LSHL_ADD_U32_e64), DestReg)
8091 .add(MO: Src0)
8092 .addImm(Val: ShiftAmt)
8093 .add(MO: Src1);
8094
8095 legalizeOperands(MI&: *NewInstr, MDT);
8096 MRI.replaceRegWith(FromReg: Dest.getReg(), ToReg: DestReg);
8097 addUsersToMoveToVALUWorklist(Reg: DestReg, MRI, Worklist);
8098 Inst.eraseFromParent();
8099 }
8100 return;
8101 case AMDGPU::S_CSELECT_B32:
8102 case AMDGPU::S_CSELECT_B64:
8103 lowerSelect(Worklist, Inst, MDT);
8104 Inst.eraseFromParent();
8105 return;
8106 case AMDGPU::S_CMP_EQ_I32:
8107 case AMDGPU::S_CMP_LG_I32:
8108 case AMDGPU::S_CMP_GT_I32:
8109 case AMDGPU::S_CMP_GE_I32:
8110 case AMDGPU::S_CMP_LT_I32:
8111 case AMDGPU::S_CMP_LE_I32:
8112 case AMDGPU::S_CMP_EQ_U32:
8113 case AMDGPU::S_CMP_LG_U32:
8114 case AMDGPU::S_CMP_GT_U32:
8115 case AMDGPU::S_CMP_GE_U32:
8116 case AMDGPU::S_CMP_LT_U32:
8117 case AMDGPU::S_CMP_LE_U32:
8118 case AMDGPU::S_CMP_EQ_U64:
8119 case AMDGPU::S_CMP_LG_U64:
8120 case AMDGPU::S_CMP_LT_F32:
8121 case AMDGPU::S_CMP_EQ_F32:
8122 case AMDGPU::S_CMP_LE_F32:
8123 case AMDGPU::S_CMP_GT_F32:
8124 case AMDGPU::S_CMP_LG_F32:
8125 case AMDGPU::S_CMP_GE_F32:
8126 case AMDGPU::S_CMP_O_F32:
8127 case AMDGPU::S_CMP_U_F32:
8128 case AMDGPU::S_CMP_NGE_F32:
8129 case AMDGPU::S_CMP_NLG_F32:
8130 case AMDGPU::S_CMP_NGT_F32:
8131 case AMDGPU::S_CMP_NLE_F32:
8132 case AMDGPU::S_CMP_NEQ_F32:
8133 case AMDGPU::S_CMP_NLT_F32: {
8134 Register CondReg = MRI.createVirtualRegister(RegClass: RI.getWaveMaskRegClass());
8135 auto NewInstr =
8136 BuildMI(BB&: *MBB, I&: Inst, MIMD: Inst.getDebugLoc(), MCID: get(Opcode: NewOpcode), DestReg: CondReg)
8137 .setMIFlags(Inst.getFlags());
8138 if (AMDGPU::getNamedOperandIdx(Opcode: NewOpcode, Name: AMDGPU::OpName::src0_modifiers) >=
8139 0) {
8140 NewInstr
8141 .addImm(Val: 0) // src0_modifiers
8142 .add(MO: Inst.getOperand(i: 0)) // src0
8143 .addImm(Val: 0) // src1_modifiers
8144 .add(MO: Inst.getOperand(i: 1)) // src1
8145 .addImm(Val: 0); // clamp
8146 } else {
8147 NewInstr.add(MO: Inst.getOperand(i: 0)).add(MO: Inst.getOperand(i: 1));
8148 }
8149 legalizeOperands(MI&: *NewInstr, MDT);
8150 int SCCIdx = Inst.findRegisterDefOperandIdx(Reg: AMDGPU::SCC, /*TRI=*/nullptr);
8151 const MachineOperand &SCCOp = Inst.getOperand(i: SCCIdx);
8152 addSCCDefUsersToVALUWorklist(Op: SCCOp, SCCDefInst&: Inst, Worklist, NewCond: CondReg);
8153 Inst.eraseFromParent();
8154 return;
8155 }
8156 case AMDGPU::S_CMP_LT_F16:
8157 case AMDGPU::S_CMP_EQ_F16:
8158 case AMDGPU::S_CMP_LE_F16:
8159 case AMDGPU::S_CMP_GT_F16:
8160 case AMDGPU::S_CMP_LG_F16:
8161 case AMDGPU::S_CMP_GE_F16:
8162 case AMDGPU::S_CMP_O_F16:
8163 case AMDGPU::S_CMP_U_F16:
8164 case AMDGPU::S_CMP_NGE_F16:
8165 case AMDGPU::S_CMP_NLG_F16:
8166 case AMDGPU::S_CMP_NGT_F16:
8167 case AMDGPU::S_CMP_NLE_F16:
8168 case AMDGPU::S_CMP_NEQ_F16:
8169 case AMDGPU::S_CMP_NLT_F16: {
8170 Register CondReg = MRI.createVirtualRegister(RegClass: RI.getWaveMaskRegClass());
8171 auto NewInstr =
8172 BuildMI(BB&: *MBB, I&: Inst, MIMD: Inst.getDebugLoc(), MCID: get(Opcode: NewOpcode), DestReg: CondReg)
8173 .setMIFlags(Inst.getFlags());
8174 if (AMDGPU::hasNamedOperand(Opcode: NewOpcode, NamedIdx: AMDGPU::OpName::src0_modifiers)) {
8175 NewInstr
8176 .addImm(Val: 0) // src0_modifiers
8177 .add(MO: Inst.getOperand(i: 0)) // src0
8178 .addImm(Val: 0) // src1_modifiers
8179 .add(MO: Inst.getOperand(i: 1)) // src1
8180 .addImm(Val: 0); // clamp
8181 if (AMDGPU::hasNamedOperand(Opcode: NewOpcode, NamedIdx: AMDGPU::OpName::op_sel))
8182 NewInstr.addImm(Val: 0); // op_sel0
8183 } else {
8184 NewInstr
8185 .add(MO: Inst.getOperand(i: 0))
8186 .add(MO: Inst.getOperand(i: 1));
8187 }
8188 legalizeOperandsVALUt16(MI&: *NewInstr, MRI);
8189 legalizeOperands(MI&: *NewInstr, MDT);
8190 int SCCIdx = Inst.findRegisterDefOperandIdx(Reg: AMDGPU::SCC, /*TRI=*/nullptr);
8191 const MachineOperand &SCCOp = Inst.getOperand(i: SCCIdx);
8192 addSCCDefUsersToVALUWorklist(Op: SCCOp, SCCDefInst&: Inst, Worklist, NewCond: CondReg);
8193 Inst.eraseFromParent();
8194 return;
8195 }
8196 case AMDGPU::S_CVT_HI_F32_F16: {
8197 Register TmpReg = MRI.createVirtualRegister(RegClass: &AMDGPU::VGPR_32RegClass);
8198 Register NewDst = MRI.createVirtualRegister(RegClass: &AMDGPU::VGPR_32RegClass);
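    // Convert the high 16 bits of the source: with true16 instructions the
    // hi16 subregister is read directly; otherwise the source is shifted right
    // by 16 first and the low half is converted.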
8199 if (ST.useRealTrue16Insts()) {
8200 BuildMI(BB&: *MBB, I&: Inst, MIMD: DL, MCID: get(Opcode: AMDGPU::COPY), DestReg: TmpReg)
8201 .add(MO: Inst.getOperand(i: 1));
8202 BuildMI(BB&: *MBB, I&: Inst, MIMD: DL, MCID: get(Opcode: NewOpcode), DestReg: NewDst)
8203 .addImm(Val: 0) // src0_modifiers
8204 .addReg(RegNo: TmpReg, Flags: {}, SubReg: AMDGPU::hi16)
8205 .addImm(Val: 0) // clamp
8206 .addImm(Val: 0) // omod
8207 .addImm(Val: 0); // op_sel0
8208 } else {
8209 BuildMI(BB&: *MBB, I&: Inst, MIMD: DL, MCID: get(Opcode: AMDGPU::V_LSHRREV_B32_e64), DestReg: TmpReg)
8210 .addImm(Val: 16)
8211 .add(MO: Inst.getOperand(i: 1));
8212 BuildMI(BB&: *MBB, I&: Inst, MIMD: DL, MCID: get(Opcode: NewOpcode), DestReg: NewDst)
8213 .addImm(Val: 0) // src0_modifiers
8214 .addReg(RegNo: TmpReg)
8215 .addImm(Val: 0) // clamp
8216 .addImm(Val: 0); // omod
8217 }
8218
8219 MRI.replaceRegWith(FromReg: Inst.getOperand(i: 0).getReg(), ToReg: NewDst);
8220 addUsersToMoveToVALUWorklist(Reg: NewDst, MRI, Worklist);
8221 Inst.eraseFromParent();
8222 return;
8223 }
8224 case AMDGPU::S_MINIMUM_F32:
8225 case AMDGPU::S_MAXIMUM_F32: {
8226 Register NewDst = MRI.createVirtualRegister(RegClass: &AMDGPU::VGPR_32RegClass);
8227 MachineInstr *NewInstr = BuildMI(BB&: *MBB, I&: Inst, MIMD: DL, MCID: get(Opcode: NewOpcode), DestReg: NewDst)
8228 .addImm(Val: 0) // src0_modifiers
8229 .add(MO: Inst.getOperand(i: 1))
8230 .addImm(Val: 0) // src1_modifiers
8231 .add(MO: Inst.getOperand(i: 2))
8232 .addImm(Val: 0) // clamp
8233 .addImm(Val: 0); // omod
8234 MRI.replaceRegWith(FromReg: Inst.getOperand(i: 0).getReg(), ToReg: NewDst);
8235
8236 legalizeOperands(MI&: *NewInstr, MDT);
8237 addUsersToMoveToVALUWorklist(Reg: NewDst, MRI, Worklist);
8238 Inst.eraseFromParent();
8239 return;
8240 }
8241 case AMDGPU::S_MINIMUM_F16:
8242 case AMDGPU::S_MAXIMUM_F16: {
8243 Register NewDst = MRI.createVirtualRegister(RegClass: ST.useRealTrue16Insts()
8244 ? &AMDGPU::VGPR_16RegClass
8245 : &AMDGPU::VGPR_32RegClass);
8246 MachineInstr *NewInstr = BuildMI(BB&: *MBB, I&: Inst, MIMD: DL, MCID: get(Opcode: NewOpcode), DestReg: NewDst)
8247 .addImm(Val: 0) // src0_modifiers
8248 .add(MO: Inst.getOperand(i: 1))
8249 .addImm(Val: 0) // src1_modifiers
8250 .add(MO: Inst.getOperand(i: 2))
8251 .addImm(Val: 0) // clamp
8252 .addImm(Val: 0) // omod
8253 .addImm(Val: 0); // opsel0
8254 MRI.replaceRegWith(FromReg: Inst.getOperand(i: 0).getReg(), ToReg: NewDst);
8255 legalizeOperandsVALUt16(MI&: *NewInstr, MRI);
8256 legalizeOperands(MI&: *NewInstr, MDT);
8257 addUsersToMoveToVALUWorklist(Reg: NewDst, MRI, Worklist);
8258 Inst.eraseFromParent();
8259 return;
8260 }
8261 case AMDGPU::V_S_EXP_F16_e64:
8262 case AMDGPU::V_S_LOG_F16_e64:
8263 case AMDGPU::V_S_RCP_F16_e64:
8264 case AMDGPU::V_S_RSQ_F16_e64:
8265 case AMDGPU::V_S_SQRT_F16_e64: {
8266 Register NewDst = MRI.createVirtualRegister(RegClass: ST.useRealTrue16Insts()
8267 ? &AMDGPU::VGPR_16RegClass
8268 : &AMDGPU::VGPR_32RegClass);
8269 auto NewInstr = BuildMI(BB&: *MBB, I&: Inst, MIMD: DL, MCID: get(Opcode: NewOpcode), DestReg: NewDst)
8270 .add(MO: Inst.getOperand(i: 1)) // src0_modifiers
8271 .add(MO: Inst.getOperand(i: 2))
8272 .add(MO: Inst.getOperand(i: 3)) // clamp
8273 .add(MO: Inst.getOperand(i: 4)) // omod
8274 .setMIFlags(Inst.getFlags());
8275 if (AMDGPU::hasNamedOperand(Opcode: NewOpcode, NamedIdx: AMDGPU::OpName::op_sel))
8276 NewInstr.addImm(Val: 0); // opsel0
8277 MRI.replaceRegWith(FromReg: Inst.getOperand(i: 0).getReg(), ToReg: NewDst);
8278 legalizeOperandsVALUt16(MI&: *NewInstr, MRI);
8279 legalizeOperands(MI&: *NewInstr, MDT);
8280 addUsersToMoveToVALUWorklist(Reg: NewDst, MRI, Worklist);
8281 Inst.eraseFromParent();
8282 return;
8283 }
8284 }
8285
8286 if (NewOpcode == AMDGPU::INSTRUCTION_LIST_END) {
8287 // We cannot move this instruction to the VALU, so we should try to
8288 // legalize its operands instead.
8289 legalizeOperands(MI&: Inst, MDT);
8290 return;
8291 }
8292 // Handle converting generic instructions like COPY-to-SGPR into
8293 // COPY-to-VGPR.
8294 if (NewOpcode == Opcode) {
8295 Register DstReg = Inst.getOperand(i: 0).getReg();
8296 const TargetRegisterClass *NewDstRC = getDestEquivalentVGPRClass(Inst);
8297
8298 // If it's a copy of a VGPR to a physical SGPR, insert a V_READFIRSTLANE and
8299 // hope for the best.
8300 if (Inst.isCopy() && DstReg.isPhysical() &&
8301 RI.isVGPR(MRI, Reg: Inst.getOperand(i: 1).getReg())) {
8302 Register NewDst = MRI.createVirtualRegister(RegClass: &AMDGPU::SReg_32_XM0RegClass);
8303 BuildMI(BB&: *Inst.getParent(), I: &Inst, MIMD: Inst.getDebugLoc(),
8304 MCID: get(Opcode: AMDGPU::V_READFIRSTLANE_B32), DestReg: NewDst)
8305 .add(MO: Inst.getOperand(i: 1));
8306 BuildMI(BB&: *Inst.getParent(), I: &Inst, MIMD: Inst.getDebugLoc(), MCID: get(Opcode: AMDGPU::COPY),
8307 DestReg: DstReg)
8308 .addReg(RegNo: NewDst);
8309
8310 Inst.eraseFromParent();
8311 return;
8312 }
8313
8314 if (Inst.isCopy() && Inst.getOperand(i: 1).getReg().isVirtual()) {
8315 Register NewDstReg = Inst.getOperand(i: 1).getReg();
8316 const TargetRegisterClass *SrcRC = RI.getRegClassForReg(MRI, Reg: NewDstReg);
8317 if (const TargetRegisterClass *CommonRC =
8318 RI.getCommonSubClass(A: NewDstRC, B: SrcRC)) {
8319 // Instead of creating a copy where src and dst are the same register
8320 // class, we just replace all uses of dst with src. These kinds of
8321 // copies interfere with the heuristics MachineSink uses to decide
8322 // whether or not to split a critical edge, since the pass assumes
8323 // that copies will end up as machine instructions and not be
8324 // eliminated.
8325 addUsersToMoveToVALUWorklist(Reg: DstReg, MRI, Worklist);
8326 MRI.replaceRegWith(FromReg: DstReg, ToReg: NewDstReg);
8327 MRI.clearKillFlags(Reg: NewDstReg);
8328 Inst.getOperand(i: 0).setReg(DstReg);
8329
8330 if (!MRI.constrainRegClass(Reg: NewDstReg, RC: CommonRC))
8331 llvm_unreachable("failed to constrain register");
8332
8333 Inst.eraseFromParent();
8334 // Legalize t16 operands, since replaceRegWith is called after the users have been added to the VALU worklist.
8335 for (MachineOperand &MO :
8336 make_early_inc_range(Range: MRI.use_operands(Reg: NewDstReg))) {
8337 legalizeOperandsVALUt16(MI&: *MO.getParent(), MRI);
8338 }
8339
8340 return;
8341 }
8342 }
8343
8344 // If this is a v2s copy between a 16-bit and a 32-bit register,
8345 // replace the vgpr copy with a reg_sequence / lo16 subregister access.
8346 // This can be removed once sgpr16 is in place.
8347 if (ST.useRealTrue16Insts() && Inst.isCopy() &&
8348 Inst.getOperand(i: 1).getReg().isVirtual() &&
8349 RI.isVGPR(MRI, Reg: Inst.getOperand(i: 1).getReg())) {
8350 const TargetRegisterClass *SrcRegRC = getOpRegClass(MI: Inst, OpNo: 1);
8351 if (RI.getMatchingSuperRegClass(A: NewDstRC, B: SrcRegRC, Idx: AMDGPU::lo16)) {
8352 Register NewDstReg = MRI.createVirtualRegister(RegClass: NewDstRC);
8353 Register Undef = MRI.createVirtualRegister(RegClass: &AMDGPU::VGPR_16RegClass);
8354 BuildMI(BB&: *Inst.getParent(), I: &Inst, MIMD: Inst.getDebugLoc(),
8355 MCID: get(Opcode: AMDGPU::IMPLICIT_DEF), DestReg: Undef);
8356 BuildMI(BB&: *Inst.getParent(), I: &Inst, MIMD: Inst.getDebugLoc(),
8357 MCID: get(Opcode: AMDGPU::REG_SEQUENCE), DestReg: NewDstReg)
8358 .addReg(RegNo: Inst.getOperand(i: 1).getReg())
8359 .addImm(Val: AMDGPU::lo16)
8360 .addReg(RegNo: Undef)
8361 .addImm(Val: AMDGPU::hi16);
8362 Inst.eraseFromParent();
8363 MRI.replaceRegWith(FromReg: DstReg, ToReg: NewDstReg);
8364 addUsersToMoveToVALUWorklist(Reg: NewDstReg, MRI, Worklist);
8365 return;
8366 } else if (RI.getMatchingSuperRegClass(A: SrcRegRC, B: NewDstRC,
8367 Idx: AMDGPU::lo16)) {
8368 Inst.getOperand(i: 1).setSubReg(AMDGPU::lo16);
8369 Register NewDstReg = MRI.createVirtualRegister(RegClass: NewDstRC);
8370 MRI.replaceRegWith(FromReg: DstReg, ToReg: NewDstReg);
8371 addUsersToMoveToVALUWorklist(Reg: NewDstReg, MRI, Worklist);
8372 return;
8373 }
8374 }
8375
8376 Register NewDstReg = MRI.createVirtualRegister(RegClass: NewDstRC);
8377 MRI.replaceRegWith(FromReg: DstReg, ToReg: NewDstReg);
8378 legalizeOperands(MI&: Inst, MDT);
8379 addUsersToMoveToVALUWorklist(Reg: NewDstReg, MRI, Worklist);
8380 return;
8381 }
8382
8383 // Use the new VALU Opcode.
8384 auto NewInstr = BuildMI(BB&: *MBB, I&: Inst, MIMD: Inst.getDebugLoc(), MCID: get(Opcode: NewOpcode))
8385 .setMIFlags(Inst.getFlags());
8386 if (isVOP3(Opcode: NewOpcode) && !isVOP3(Opcode)) {
8387 // Intersperse VOP3 modifiers among the SALU operands.
8388 NewInstr->addOperand(Op: Inst.getOperand(i: 0));
8389 if (AMDGPU::getNamedOperandIdx(Opcode: NewOpcode,
8390 Name: AMDGPU::OpName::src0_modifiers) >= 0)
8391 NewInstr.addImm(Val: 0);
8392 if (AMDGPU::hasNamedOperand(Opcode: NewOpcode, NamedIdx: AMDGPU::OpName::src0)) {
8393 const MachineOperand &Src = Inst.getOperand(i: 1);
8394 NewInstr->addOperand(Op: Src);
8395 }
8396
8397 if (Opcode == AMDGPU::S_SEXT_I32_I8 || Opcode == AMDGPU::S_SEXT_I32_I16) {
8398 // We are converting these to a BFE, so we need to add the missing
8399 // operands for the size and offset.
8400 unsigned Size = (Opcode == AMDGPU::S_SEXT_I32_I8) ? 8 : 16;
8401 NewInstr.addImm(Val: 0);
8402 NewInstr.addImm(Val: Size);
8403 } else if (Opcode == AMDGPU::S_BCNT1_I32_B32) {
8404 // The VALU version adds the second operand to the result, so insert an
8405 // extra 0 operand.
8406 NewInstr.addImm(Val: 0);
8407 } else if (Opcode == AMDGPU::S_BFE_I32 || Opcode == AMDGPU::S_BFE_U32) {
8408 const MachineOperand &OffsetWidthOp = Inst.getOperand(i: 2);
8409 // If we need to move this to VGPRs, we need to unpack the second
8410 // operand back into the 2 separate ones for bit offset and width.
8411 assert(OffsetWidthOp.isImm() &&
8412 "Scalar BFE is only implemented for constant width and offset");
8413 uint32_t Imm = OffsetWidthOp.getImm();
8414
8415 uint32_t Offset = Imm & 0x3f; // Extract bits [5:0].
8416 uint32_t BitWidth = (Imm & 0x7f0000) >> 16; // Extract bits [22:16].
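      // For example, an immediate of 0x00100008 encodes offset 8 and width 16.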
8417 NewInstr.addImm(Val: Offset);
8418 NewInstr.addImm(Val: BitWidth);
8419 } else {
8420 if (AMDGPU::getNamedOperandIdx(Opcode: NewOpcode,
8421 Name: AMDGPU::OpName::src1_modifiers) >= 0)
8422 NewInstr.addImm(Val: 0);
8423 if (AMDGPU::getNamedOperandIdx(Opcode: NewOpcode, Name: AMDGPU::OpName::src1) >= 0)
8424 NewInstr->addOperand(Op: Inst.getOperand(i: 2));
8425 if (AMDGPU::getNamedOperandIdx(Opcode: NewOpcode,
8426 Name: AMDGPU::OpName::src2_modifiers) >= 0)
8427 NewInstr.addImm(Val: 0);
8428 if (AMDGPU::getNamedOperandIdx(Opcode: NewOpcode, Name: AMDGPU::OpName::src2) >= 0)
8429 NewInstr->addOperand(Op: Inst.getOperand(i: 3));
8430 if (AMDGPU::getNamedOperandIdx(Opcode: NewOpcode, Name: AMDGPU::OpName::clamp) >= 0)
8431 NewInstr.addImm(Val: 0);
8432 if (AMDGPU::getNamedOperandIdx(Opcode: NewOpcode, Name: AMDGPU::OpName::omod) >= 0)
8433 NewInstr.addImm(Val: 0);
8434 if (AMDGPU::getNamedOperandIdx(Opcode: NewOpcode, Name: AMDGPU::OpName::op_sel) >= 0)
8435 NewInstr.addImm(Val: 0);
8436 }
8437 } else {
8438 // Just copy the SALU operands.
8439 for (const MachineOperand &Op : Inst.explicit_operands())
8440 NewInstr->addOperand(Op);
8441 }
8442
8443 // Remove any references to SCC. Vector instructions can't read from it, and
8444 // we're just about to add the implicit uses / defs of VCC, and we don't want
8445 // both.
8446 for (MachineOperand &Op : Inst.implicit_operands()) {
8447 if (Op.getReg() == AMDGPU::SCC) {
8448 // Only propagate through live-def of SCC.
8449 if (Op.isDef() && !Op.isDead())
8450 addSCCDefUsersToVALUWorklist(Op, SCCDefInst&: Inst, Worklist);
8451 if (Op.isUse())
8452 addSCCDefsToVALUWorklist(SCCUseInst: NewInstr, Worklist);
8453 }
8454 }
8455 Inst.eraseFromParent();
8456 Register NewDstReg;
8457 if (NewInstr->getOperand(i: 0).isReg() && NewInstr->getOperand(i: 0).isDef()) {
8458 Register DstReg = NewInstr->getOperand(i: 0).getReg();
8459 assert(DstReg.isVirtual());
8460 // Update the destination register class.
8461 const TargetRegisterClass *NewDstRC = getDestEquivalentVGPRClass(Inst: *NewInstr);
8462 assert(NewDstRC);
8463 NewDstReg = MRI.createVirtualRegister(RegClass: NewDstRC);
8464 MRI.replaceRegWith(FromReg: DstReg, ToReg: NewDstReg);
8465 }
8466 fixImplicitOperands(MI&: *NewInstr);
8467
8468 legalizeOperandsVALUt16(MI&: *NewInstr, MRI);
8469
8470 // Legalize the operands
8471 legalizeOperands(MI&: *NewInstr, MDT);
8472 if (NewDstReg)
8473 addUsersToMoveToVALUWorklist(Reg: NewDstReg, MRI, Worklist);
8474}
8475
8476// Add/sub require special handling to deal with carry outs.
8477std::pair<bool, MachineBasicBlock *>
8478SIInstrInfo::moveScalarAddSub(SIInstrWorklist &Worklist, MachineInstr &Inst,
8479 MachineDominatorTree *MDT) const {
8480 if (ST.hasAddNoCarryInsts()) {
8481 // Assume there is no user of scc since we don't select this in that case.
8482 // Since scc isn't used, it doesn't really matter if the i32 or u32 variant
8483 // is used.
8484
8485 MachineBasicBlock &MBB = *Inst.getParent();
8486 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
8487
8488 Register OldDstReg = Inst.getOperand(i: 0).getReg();
8489 Register ResultReg = MRI.createVirtualRegister(RegClass: &AMDGPU::VGPR_32RegClass);
8490
8491 unsigned Opc = Inst.getOpcode();
8492 assert(Opc == AMDGPU::S_ADD_I32 || Opc == AMDGPU::S_SUB_I32);
8493
8494 unsigned NewOpc = Opc == AMDGPU::S_ADD_I32 ?
8495 AMDGPU::V_ADD_U32_e64 : AMDGPU::V_SUB_U32_e64;
8496
8497 assert(Inst.getOperand(3).getReg() == AMDGPU::SCC);
8498 Inst.removeOperand(OpNo: 3);
8499
8500 Inst.setDesc(get(Opcode: NewOpc));
8501 Inst.addOperand(Op: MachineOperand::CreateImm(Val: 0)); // clamp bit
8502 Inst.addImplicitDefUseOperands(MF&: *MBB.getParent());
8503 MRI.replaceRegWith(FromReg: OldDstReg, ToReg: ResultReg);
8504 MachineBasicBlock *NewBB = legalizeOperands(MI&: Inst, MDT);
8505
8506 addUsersToMoveToVALUWorklist(Reg: ResultReg, MRI, Worklist);
8507 return std::pair(true, NewBB);
8508 }
8509
8510 return std::pair(false, nullptr);
8511}
8512
8513void SIInstrInfo::lowerSelect(SIInstrWorklist &Worklist, MachineInstr &Inst,
8514 MachineDominatorTree *MDT) const {
8515
8516 MachineBasicBlock &MBB = *Inst.getParent();
8517 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
8518 MachineBasicBlock::iterator MII = Inst;
8519 const DebugLoc &DL = Inst.getDebugLoc();
8520
8521 MachineOperand &Dest = Inst.getOperand(i: 0);
8522 MachineOperand &Src0 = Inst.getOperand(i: 1);
8523 MachineOperand &Src1 = Inst.getOperand(i: 2);
8524 MachineOperand &Cond = Inst.getOperand(i: 3);
8525
8526 Register CondReg = Cond.getReg();
8527 bool IsSCC = (CondReg == AMDGPU::SCC);
8528
8529 // If this is a trivial select where the condition is effectively not SCC
8530 // (CondReg is a source of copy to SCC), then the select is semantically
8531 // equivalent to copying CondReg. Hence, there is no need to create a
8532 // V_CNDMASK; we can just use CondReg and bail out.
8533 if (!IsSCC && Src0.isImm() && (Src0.getImm() == -1) && Src1.isImm() &&
8534 (Src1.getImm() == 0)) {
8535 MRI.replaceRegWith(FromReg: Dest.getReg(), ToReg: CondReg);
8536 return;
8537 }
8538
8539 Register NewCondReg = CondReg;
8540 if (IsSCC) {
8541 const TargetRegisterClass *TC = RI.getWaveMaskRegClass();
8542 NewCondReg = MRI.createVirtualRegister(RegClass: TC);
8543
8544 // Now look for the closest SCC def; if it is a copy,
8545 // replace CondReg with the COPY's source register.
8546 bool CopyFound = false;
8547 for (MachineInstr &CandI :
8548 make_range(x: std::next(x: MachineBasicBlock::reverse_iterator(Inst)),
8549 y: Inst.getParent()->rend())) {
8550 if (CandI.findRegisterDefOperandIdx(Reg: AMDGPU::SCC, TRI: &RI, isDead: false, Overlap: false) !=
8551 -1) {
8552 if (CandI.isCopy() && CandI.getOperand(i: 0).getReg() == AMDGPU::SCC) {
8553 BuildMI(BB&: MBB, I: MII, MIMD: DL, MCID: get(Opcode: AMDGPU::COPY), DestReg: NewCondReg)
8554 .addReg(RegNo: CandI.getOperand(i: 1).getReg());
8555 CopyFound = true;
8556 }
8557 break;
8558 }
8559 }
8560 if (!CopyFound) {
8561 // SCC def is not a copy
8562 // Insert a trivial select instead of creating a copy, because a copy from
8563 // SCC would semantically mean just copying a single bit, but we may need
8564 // the result to be a vector condition mask that needs preserving.
8565 unsigned Opcode =
8566 ST.isWave64() ? AMDGPU::S_CSELECT_B64 : AMDGPU::S_CSELECT_B32;
8567 auto NewSelect =
8568 BuildMI(BB&: MBB, I: MII, MIMD: DL, MCID: get(Opcode), DestReg: NewCondReg).addImm(Val: -1).addImm(Val: 0);
8569 NewSelect->getOperand(i: 3).setIsUndef(Cond.isUndef());
8570 }
8571 }
8572
8573 Register NewDestReg = MRI.createVirtualRegister(
8574 RegClass: RI.getEquivalentVGPRClass(SRC: MRI.getRegClass(Reg: Dest.getReg())));
8575 MachineInstr *NewInst;
8576 if (Inst.getOpcode() == AMDGPU::S_CSELECT_B32) {
8577 NewInst = BuildMI(BB&: MBB, I: MII, MIMD: DL, MCID: get(Opcode: AMDGPU::V_CNDMASK_B32_e64), DestReg: NewDestReg)
8578 .addImm(Val: 0)
8579 .add(MO: Src1) // False
8580 .addImm(Val: 0)
8581 .add(MO: Src0) // True
8582 .addReg(RegNo: NewCondReg);
8583 } else {
8584 NewInst =
8585 BuildMI(BB&: MBB, I: MII, MIMD: DL, MCID: get(Opcode: AMDGPU::V_CNDMASK_B64_PSEUDO), DestReg: NewDestReg)
8586 .add(MO: Src1) // False
8587 .add(MO: Src0) // True
8588 .addReg(RegNo: NewCondReg);
8589 }
8590 MRI.replaceRegWith(FromReg: Dest.getReg(), ToReg: NewDestReg);
8591 legalizeOperands(MI&: *NewInst, MDT);
8592 addUsersToMoveToVALUWorklist(Reg: NewDestReg, MRI, Worklist);
8593}
8594
8595void SIInstrInfo::lowerScalarAbs(SIInstrWorklist &Worklist,
8596 MachineInstr &Inst) const {
8597 MachineBasicBlock &MBB = *Inst.getParent();
8598 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
8599 MachineBasicBlock::iterator MII = Inst;
8600 const DebugLoc &DL = Inst.getDebugLoc();
8601
8602 MachineOperand &Dest = Inst.getOperand(i: 0);
8603 MachineOperand &Src = Inst.getOperand(i: 1);
8604 Register TmpReg = MRI.createVirtualRegister(RegClass: &AMDGPU::VGPR_32RegClass);
8605 Register ResultReg = MRI.createVirtualRegister(RegClass: &AMDGPU::VGPR_32RegClass);
8606
8607 unsigned SubOp = ST.hasAddNoCarryInsts() ? AMDGPU::V_SUB_U32_e32
8608 : AMDGPU::V_SUB_CO_U32_e32;
8609
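  // abs(x) is lowered as max(x, 0 - x).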
8610 BuildMI(BB&: MBB, I: MII, MIMD: DL, MCID: get(Opcode: SubOp), DestReg: TmpReg)
8611 .addImm(Val: 0)
8612 .addReg(RegNo: Src.getReg());
8613
8614 BuildMI(BB&: MBB, I: MII, MIMD: DL, MCID: get(Opcode: AMDGPU::V_MAX_I32_e64), DestReg: ResultReg)
8615 .addReg(RegNo: Src.getReg())
8616 .addReg(RegNo: TmpReg);
8617
8618 MRI.replaceRegWith(FromReg: Dest.getReg(), ToReg: ResultReg);
8619 addUsersToMoveToVALUWorklist(Reg: ResultReg, MRI, Worklist);
8620}
8621
8622void SIInstrInfo::lowerScalarAbsDiff(SIInstrWorklist &Worklist,
8623 MachineInstr &Inst) const {
8624 MachineBasicBlock &MBB = *Inst.getParent();
8625 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
8626 MachineBasicBlock::iterator MII = Inst;
8627 const DebugLoc &DL = Inst.getDebugLoc();
8628
8629 MachineOperand &Dest = Inst.getOperand(i: 0);
8630 MachineOperand &Src1 = Inst.getOperand(i: 1);
8631 MachineOperand &Src2 = Inst.getOperand(i: 2);
8632 Register SubResultReg = MRI.createVirtualRegister(RegClass: &AMDGPU::VGPR_32RegClass);
8633 Register TmpReg = MRI.createVirtualRegister(RegClass: &AMDGPU::VGPR_32RegClass);
8634 Register ResultReg = MRI.createVirtualRegister(RegClass: &AMDGPU::VGPR_32RegClass);
8635
8636 unsigned SubOp = ST.hasAddNoCarryInsts() ? AMDGPU::V_SUB_U32_e32
8637 : AMDGPU::V_SUB_CO_U32_e32;
8638
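  // absdiff(a, b) is lowered as max(a - b, 0 - (a - b)).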
8639 BuildMI(BB&: MBB, I: MII, MIMD: DL, MCID: get(Opcode: SubOp), DestReg: SubResultReg)
8640 .addReg(RegNo: Src1.getReg())
8641 .addReg(RegNo: Src2.getReg());
8642
8643 BuildMI(BB&: MBB, I: MII, MIMD: DL, MCID: get(Opcode: SubOp), DestReg: TmpReg).addImm(Val: 0).addReg(RegNo: SubResultReg);
8644
8645 BuildMI(BB&: MBB, I: MII, MIMD: DL, MCID: get(Opcode: AMDGPU::V_MAX_I32_e64), DestReg: ResultReg)
8646 .addReg(RegNo: SubResultReg)
8647 .addReg(RegNo: TmpReg);
8648
8649 MRI.replaceRegWith(FromReg: Dest.getReg(), ToReg: ResultReg);
8650 addUsersToMoveToVALUWorklist(Reg: ResultReg, MRI, Worklist);
8651}
8652
8653void SIInstrInfo::lowerScalarXnor(SIInstrWorklist &Worklist,
8654 MachineInstr &Inst) const {
8655 MachineBasicBlock &MBB = *Inst.getParent();
8656 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
8657 MachineBasicBlock::iterator MII = Inst;
8658 const DebugLoc &DL = Inst.getDebugLoc();
8659
8660 MachineOperand &Dest = Inst.getOperand(i: 0);
8661 MachineOperand &Src0 = Inst.getOperand(i: 1);
8662 MachineOperand &Src1 = Inst.getOperand(i: 2);
8663
8664 if (ST.hasDLInsts()) {
8665 Register NewDest = MRI.createVirtualRegister(RegClass: &AMDGPU::VGPR_32RegClass);
8666 legalizeGenericOperand(InsertMBB&: MBB, I: MII, DstRC: &AMDGPU::VGPR_32RegClass, Op&: Src0, MRI, DL);
8667 legalizeGenericOperand(InsertMBB&: MBB, I: MII, DstRC: &AMDGPU::VGPR_32RegClass, Op&: Src1, MRI, DL);
8668
8669 BuildMI(BB&: MBB, I: MII, MIMD: DL, MCID: get(Opcode: AMDGPU::V_XNOR_B32_e64), DestReg: NewDest)
8670 .add(MO: Src0)
8671 .add(MO: Src1);
8672
8673 MRI.replaceRegWith(FromReg: Dest.getReg(), ToReg: NewDest);
8674 addUsersToMoveToVALUWorklist(Reg: NewDest, MRI, Worklist);
8675 } else {
8676 // Using the identity !(x ^ y) == (!x ^ y) == (x ^ !y), we can
8677 // invert either source and then perform the XOR. If either source is a
8678 // scalar register, then we can leave the inversion on the scalar unit to
8679 // achieve a better distribution of scalar and vector instructions.
8680 bool Src0IsSGPR = Src0.isReg() &&
8681 RI.isSGPRClass(RC: MRI.getRegClass(Reg: Src0.getReg()));
8682 bool Src1IsSGPR = Src1.isReg() &&
8683 RI.isSGPRClass(RC: MRI.getRegClass(Reg: Src1.getReg()));
8684 MachineInstr *Xor;
8685 Register Temp = MRI.createVirtualRegister(RegClass: &AMDGPU::SReg_32RegClass);
8686 Register NewDest = MRI.createVirtualRegister(RegClass: &AMDGPU::SReg_32RegClass);
8687
8688 // Build a pair of scalar instructions and add them to the work list.
8689 // The next iteration over the work list will lower these to the vector
8690 // unit as necessary.
8691 if (Src0IsSGPR) {
8692 BuildMI(BB&: MBB, I: MII, MIMD: DL, MCID: get(Opcode: AMDGPU::S_NOT_B32), DestReg: Temp).add(MO: Src0);
8693 Xor = BuildMI(BB&: MBB, I: MII, MIMD: DL, MCID: get(Opcode: AMDGPU::S_XOR_B32), DestReg: NewDest)
8694 .addReg(RegNo: Temp)
8695 .add(MO: Src1);
8696 } else if (Src1IsSGPR) {
8697 BuildMI(BB&: MBB, I: MII, MIMD: DL, MCID: get(Opcode: AMDGPU::S_NOT_B32), DestReg: Temp).add(MO: Src1);
8698 Xor = BuildMI(BB&: MBB, I: MII, MIMD: DL, MCID: get(Opcode: AMDGPU::S_XOR_B32), DestReg: NewDest)
8699 .add(MO: Src0)
8700 .addReg(RegNo: Temp);
8701 } else {
8702 Xor = BuildMI(BB&: MBB, I: MII, MIMD: DL, MCID: get(Opcode: AMDGPU::S_XOR_B32), DestReg: Temp)
8703 .add(MO: Src0)
8704 .add(MO: Src1);
8705 MachineInstr *Not =
8706 BuildMI(BB&: MBB, I: MII, MIMD: DL, MCID: get(Opcode: AMDGPU::S_NOT_B32), DestReg: NewDest).addReg(RegNo: Temp);
8707 Worklist.insert(MI: Not);
8708 }
8709
8710 MRI.replaceRegWith(FromReg: Dest.getReg(), ToReg: NewDest);
8711
8712 Worklist.insert(MI: Xor);
8713
8714 addUsersToMoveToVALUWorklist(Reg: NewDest, MRI, Worklist);
8715 }
8716}
8717
8718void SIInstrInfo::splitScalarNotBinop(SIInstrWorklist &Worklist,
8719 MachineInstr &Inst,
8720 unsigned Opcode) const {
8721 MachineBasicBlock &MBB = *Inst.getParent();
8722 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
8723 MachineBasicBlock::iterator MII = Inst;
8724 const DebugLoc &DL = Inst.getDebugLoc();
8725
8726 MachineOperand &Dest = Inst.getOperand(i: 0);
8727 MachineOperand &Src0 = Inst.getOperand(i: 1);
8728 MachineOperand &Src1 = Inst.getOperand(i: 2);
8729
8730 Register NewDest = MRI.createVirtualRegister(RegClass: &AMDGPU::SReg_32RegClass);
8731 Register Interm = MRI.createVirtualRegister(RegClass: &AMDGPU::SReg_32RegClass);
8732
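  // Lower the NAND/NOR as the corresponding 32-bit AND/OR followed by S_NOT;
  // both new instructions are pushed back on the worklist so they get moved to
  // the VALU if necessary.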
8733 MachineInstr &Op = *BuildMI(BB&: MBB, I: MII, MIMD: DL, MCID: get(Opcode), DestReg: Interm)
8734 .add(MO: Src0)
8735 .add(MO: Src1);
8736
8737 MachineInstr &Not = *BuildMI(BB&: MBB, I: MII, MIMD: DL, MCID: get(Opcode: AMDGPU::S_NOT_B32), DestReg: NewDest)
8738 .addReg(RegNo: Interm);
8739
8740 Worklist.insert(MI: &Op);
8741 Worklist.insert(MI: &Not);
8742
8743 MRI.replaceRegWith(FromReg: Dest.getReg(), ToReg: NewDest);
8744 addUsersToMoveToVALUWorklist(Reg: NewDest, MRI, Worklist);
8745}
8746
8747void SIInstrInfo::splitScalarBinOpN2(SIInstrWorklist &Worklist,
8748 MachineInstr &Inst,
8749 unsigned Opcode) const {
8750 MachineBasicBlock &MBB = *Inst.getParent();
8751 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
8752 MachineBasicBlock::iterator MII = Inst;
8753 const DebugLoc &DL = Inst.getDebugLoc();
8754
8755 MachineOperand &Dest = Inst.getOperand(i: 0);
8756 MachineOperand &Src0 = Inst.getOperand(i: 1);
8757 MachineOperand &Src1 = Inst.getOperand(i: 2);
8758
8759 Register NewDest = MRI.createVirtualRegister(RegClass: &AMDGPU::SReg_32_XM0RegClass);
8760 Register Interm = MRI.createVirtualRegister(RegClass: &AMDGPU::SReg_32_XM0RegClass);
8761
8762 MachineInstr &Not = *BuildMI(BB&: MBB, I: MII, MIMD: DL, MCID: get(Opcode: AMDGPU::S_NOT_B32), DestReg: Interm)
8763 .add(MO: Src1);
8764
8765 MachineInstr &Op = *BuildMI(BB&: MBB, I: MII, MIMD: DL, MCID: get(Opcode), DestReg: NewDest)
8766 .add(MO: Src0)
8767 .addReg(RegNo: Interm);
8768
8769 Worklist.insert(MI: &Not);
8770 Worklist.insert(MI: &Op);
8771
8772 MRI.replaceRegWith(FromReg: Dest.getReg(), ToReg: NewDest);
8773 addUsersToMoveToVALUWorklist(Reg: NewDest, MRI, Worklist);
8774}
8775
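// Split a 64-bit scalar unary op into two 32-bit ops of the given opcode,
// one per 32-bit half of the source, and recombine the results with a
// REG_SEQUENCE. If Swap is set, the destination halves are exchanged
// (e.g. for s_brev_b64, where the reversed halves trade places).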
8776void SIInstrInfo::splitScalar64BitUnaryOp(SIInstrWorklist &Worklist,
8777 MachineInstr &Inst, unsigned Opcode,
8778 bool Swap) const {
8779 MachineBasicBlock &MBB = *Inst.getParent();
8780 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
8781
8782 MachineOperand &Dest = Inst.getOperand(i: 0);
8783 MachineOperand &Src0 = Inst.getOperand(i: 1);
8784 const DebugLoc &DL = Inst.getDebugLoc();
8785
8786 MachineBasicBlock::iterator MII = Inst;
8787
8788 const MCInstrDesc &InstDesc = get(Opcode);
8789 const TargetRegisterClass *Src0RC = Src0.isReg() ?
8790 MRI.getRegClass(Reg: Src0.getReg()) :
8791 &AMDGPU::SGPR_32RegClass;
8792
8793 const TargetRegisterClass *Src0SubRC =
8794 RI.getSubRegisterClass(Src0RC, AMDGPU::sub0);
8795
8796 MachineOperand SrcReg0Sub0 = buildExtractSubRegOrImm(MII, MRI, Op: Src0, SuperRC: Src0RC,
8797 SubIdx: AMDGPU::sub0, SubRC: Src0SubRC);
8798
8799 const TargetRegisterClass *DestRC = MRI.getRegClass(Reg: Dest.getReg());
8800 const TargetRegisterClass *NewDestRC = RI.getEquivalentVGPRClass(SRC: DestRC);
8801 const TargetRegisterClass *NewDestSubRC =
8802 RI.getSubRegisterClass(NewDestRC, AMDGPU::sub0);
8803
8804 Register DestSub0 = MRI.createVirtualRegister(RegClass: NewDestSubRC);
8805 MachineInstr &LoHalf = *BuildMI(BB&: MBB, I: MII, MIMD: DL, MCID: InstDesc, DestReg: DestSub0).add(MO: SrcReg0Sub0);
8806
8807 MachineOperand SrcReg0Sub1 = buildExtractSubRegOrImm(MII, MRI, Op: Src0, SuperRC: Src0RC,
8808 SubIdx: AMDGPU::sub1, SubRC: Src0SubRC);
8809
8810 Register DestSub1 = MRI.createVirtualRegister(RegClass: NewDestSubRC);
8811 MachineInstr &HiHalf = *BuildMI(BB&: MBB, I: MII, MIMD: DL, MCID: InstDesc, DestReg: DestSub1).add(MO: SrcReg0Sub1);
8812
8813 if (Swap)
8814 std::swap(a&: DestSub0, b&: DestSub1);
8815
8816 Register FullDestReg = MRI.createVirtualRegister(RegClass: NewDestRC);
8817 BuildMI(BB&: MBB, I: MII, MIMD: DL, MCID: get(Opcode: TargetOpcode::REG_SEQUENCE), DestReg: FullDestReg)
8818 .addReg(RegNo: DestSub0)
8819 .addImm(Val: AMDGPU::sub0)
8820 .addReg(RegNo: DestSub1)
8821 .addImm(Val: AMDGPU::sub1);
8822
8823 MRI.replaceRegWith(FromReg: Dest.getReg(), ToReg: FullDestReg);
8824
8825 Worklist.insert(MI: &LoHalf);
8826 Worklist.insert(MI: &HiHalf);
8827
8828 // We don't need to call legalizeOperands here because the op has a single
8829 // source, and src0 will accept any kind of input.
8830
8831 // Move all users of this moved value.
8832 addUsersToMoveToVALUWorklist(Reg: FullDestReg, MRI, Worklist);
8833}
8834
8835 // There is no vector equivalent of s_mul_u64, so we need to split it into
8836 // 32-bit vector multiplications.
8837void SIInstrInfo::splitScalarSMulU64(SIInstrWorklist &Worklist,
8838 MachineInstr &Inst,
8839 MachineDominatorTree *MDT) const {
8840 MachineBasicBlock &MBB = *Inst.getParent();
8841 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
8842
8843 Register FullDestReg = MRI.createVirtualRegister(RegClass: &AMDGPU::VReg_64RegClass);
8844 Register DestSub0 = MRI.createVirtualRegister(RegClass: &AMDGPU::VGPR_32RegClass);
8845 Register DestSub1 = MRI.createVirtualRegister(RegClass: &AMDGPU::VGPR_32RegClass);
8846
8847 MachineOperand &Dest = Inst.getOperand(i: 0);
8848 MachineOperand &Src0 = Inst.getOperand(i: 1);
8849 MachineOperand &Src1 = Inst.getOperand(i: 2);
8850 const DebugLoc &DL = Inst.getDebugLoc();
8851 MachineBasicBlock::iterator MII = Inst;
8852
8853 const TargetRegisterClass *Src0RC = MRI.getRegClass(Reg: Src0.getReg());
8854 const TargetRegisterClass *Src1RC = MRI.getRegClass(Reg: Src1.getReg());
8855 const TargetRegisterClass *Src0SubRC =
8856 RI.getSubRegisterClass(Src0RC, AMDGPU::sub0);
8857 if (RI.isSGPRClass(RC: Src0SubRC))
8858 Src0SubRC = RI.getEquivalentVGPRClass(SRC: Src0SubRC);
8859 const TargetRegisterClass *Src1SubRC =
8860 RI.getSubRegisterClass(Src1RC, AMDGPU::sub0);
8861 if (RI.isSGPRClass(RC: Src1SubRC))
8862 Src1SubRC = RI.getEquivalentVGPRClass(SRC: Src1SubRC);
8863
8864 // First, we extract the low 32-bit and high 32-bit values from each of the
8865 // operands.
8866 MachineOperand Op0L =
8867 buildExtractSubRegOrImm(MII, MRI, Op: Src0, SuperRC: Src0RC, SubIdx: AMDGPU::sub0, SubRC: Src0SubRC);
8868 MachineOperand Op1L =
8869 buildExtractSubRegOrImm(MII, MRI, Op: Src1, SuperRC: Src1RC, SubIdx: AMDGPU::sub0, SubRC: Src1SubRC);
8870 MachineOperand Op0H =
8871 buildExtractSubRegOrImm(MII, MRI, Op: Src0, SuperRC: Src0RC, SubIdx: AMDGPU::sub1, SubRC: Src0SubRC);
8872 MachineOperand Op1H =
8873 buildExtractSubRegOrImm(MII, MRI, Op: Src1, SuperRC: Src1RC, SubIdx: AMDGPU::sub1, SubRC: Src1SubRC);
8874
8875 // The multiplication is done as follows:
8876 //
8877 // Op1H Op1L
8878 // * Op0H Op0L
8879 // --------------------
8880 // Op1H*Op0L Op1L*Op0L
8881 // + Op1H*Op0H Op1L*Op0H
8882 // -----------------------------------------
8883 // (Op1H*Op0L + Op1L*Op0H + carry) Op1L*Op0L
8884 //
8885 // We drop Op1H*Op0H because it only contributes to bits 64 and above,
8886 // which do not fit in the 64-bit result.
8887 // The low 32-bit value is Op1L*Op0L.
8888 // The high 32-bit value is Op1H*Op0L + Op1L*Op0H + carry (from Op1L*Op0L).
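// In other words, the 64-bit product modulo 2^64 is
//   (Op1L*Op0L) + 2^32 * (Op1H*Op0L + Op1L*Op0H),
// and because the adds below are only 32 bits wide, the high half of
// Op1L*Op0L (the "carry") is computed separately with V_MUL_HI_U32 and
// folded into the upper word.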
8889
8890 Register Op1L_Op0H_Reg = MRI.createVirtualRegister(RegClass: &AMDGPU::VGPR_32RegClass);
8891 MachineInstr *Op1L_Op0H =
8892 BuildMI(BB&: MBB, I: MII, MIMD: DL, MCID: get(Opcode: AMDGPU::V_MUL_LO_U32_e64), DestReg: Op1L_Op0H_Reg)
8893 .add(MO: Op1L)
8894 .add(MO: Op0H);
8895
8896 Register Op1H_Op0L_Reg = MRI.createVirtualRegister(RegClass: &AMDGPU::VGPR_32RegClass);
8897 MachineInstr *Op1H_Op0L =
8898 BuildMI(BB&: MBB, I: MII, MIMD: DL, MCID: get(Opcode: AMDGPU::V_MUL_LO_U32_e64), DestReg: Op1H_Op0L_Reg)
8899 .add(MO: Op1H)
8900 .add(MO: Op0L);
8901
8902 Register CarryReg = MRI.createVirtualRegister(RegClass: &AMDGPU::VGPR_32RegClass);
8903 MachineInstr *Carry =
8904 BuildMI(BB&: MBB, I: MII, MIMD: DL, MCID: get(Opcode: AMDGPU::V_MUL_HI_U32_e64), DestReg: CarryReg)
8905 .add(MO: Op1L)
8906 .add(MO: Op0L);
8907
8908 MachineInstr *LoHalf =
8909 BuildMI(BB&: MBB, I: MII, MIMD: DL, MCID: get(Opcode: AMDGPU::V_MUL_LO_U32_e64), DestReg: DestSub0)
8910 .add(MO: Op1L)
8911 .add(MO: Op0L);
8912
8913 Register AddReg = MRI.createVirtualRegister(RegClass: &AMDGPU::VGPR_32RegClass);
8914 MachineInstr *Add = BuildMI(BB&: MBB, I: MII, MIMD: DL, MCID: get(Opcode: AMDGPU::V_ADD_U32_e32), DestReg: AddReg)
8915 .addReg(RegNo: Op1L_Op0H_Reg)
8916 .addReg(RegNo: Op1H_Op0L_Reg);
8917
8918 MachineInstr *HiHalf =
8919 BuildMI(BB&: MBB, I: MII, MIMD: DL, MCID: get(Opcode: AMDGPU::V_ADD_U32_e32), DestReg: DestSub1)
8920 .addReg(RegNo: AddReg)
8921 .addReg(RegNo: CarryReg);
8922
8923 BuildMI(BB&: MBB, I: MII, MIMD: DL, MCID: get(Opcode: TargetOpcode::REG_SEQUENCE), DestReg: FullDestReg)
8924 .addReg(RegNo: DestSub0)
8925 .addImm(Val: AMDGPU::sub0)
8926 .addReg(RegNo: DestSub1)
8927 .addImm(Val: AMDGPU::sub1);
8928
8929 MRI.replaceRegWith(FromReg: Dest.getReg(), ToReg: FullDestReg);
8930
8931 // Try to legalize the operands in case we need to swap the order to keep it
8932 // valid.
8933 legalizeOperands(MI&: *Op1L_Op0H, MDT);
8934 legalizeOperands(MI&: *Op1H_Op0L, MDT);
8935 legalizeOperands(MI&: *Carry, MDT);
8936 legalizeOperands(MI&: *LoHalf, MDT);
8937 legalizeOperands(MI&: *Add, MDT);
8938 legalizeOperands(MI&: *HiHalf, MDT);
8939
8940 // Move all users of this moved value.
8941 addUsersToMoveToVALUWorklist(Reg: FullDestReg, MRI, Worklist);
8942}
8943
8944 // Lower S_MUL_U64_U32_PSEUDO/S_MUL_I64_I32_PSEUDO into two 32-bit vector
8945 // multiplications.
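// These pseudos are only formed when both operands are known to be zero- or
// sign-extended from 32 bits, so the full 64-bit product is simply the
// {mul_hi, mul_lo} pair of the low halves and no cross terms are needed.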
8946void SIInstrInfo::splitScalarSMulPseudo(SIInstrWorklist &Worklist,
8947 MachineInstr &Inst,
8948 MachineDominatorTree *MDT) const {
8949 MachineBasicBlock &MBB = *Inst.getParent();
8950 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
8951
8952 Register FullDestReg = MRI.createVirtualRegister(RegClass: &AMDGPU::VReg_64RegClass);
8953 Register DestSub0 = MRI.createVirtualRegister(RegClass: &AMDGPU::VGPR_32RegClass);
8954 Register DestSub1 = MRI.createVirtualRegister(RegClass: &AMDGPU::VGPR_32RegClass);
8955
8956 MachineOperand &Dest = Inst.getOperand(i: 0);
8957 MachineOperand &Src0 = Inst.getOperand(i: 1);
8958 MachineOperand &Src1 = Inst.getOperand(i: 2);
8959 const DebugLoc &DL = Inst.getDebugLoc();
8960 MachineBasicBlock::iterator MII = Inst;
8961
8962 const TargetRegisterClass *Src0RC = MRI.getRegClass(Reg: Src0.getReg());
8963 const TargetRegisterClass *Src1RC = MRI.getRegClass(Reg: Src1.getReg());
8964 const TargetRegisterClass *Src0SubRC =
8965 RI.getSubRegisterClass(Src0RC, AMDGPU::sub0);
8966 if (RI.isSGPRClass(RC: Src0SubRC))
8967 Src0SubRC = RI.getEquivalentVGPRClass(SRC: Src0SubRC);
8968 const TargetRegisterClass *Src1SubRC =
8969 RI.getSubRegisterClass(Src1RC, AMDGPU::sub0);
8970 if (RI.isSGPRClass(RC: Src1SubRC))
8971 Src1SubRC = RI.getEquivalentVGPRClass(SRC: Src1SubRC);
8972
8973 // First, we extract the low 32-bit and high 32-bit values from each of the
8974 // operands.
8975 MachineOperand Op0L =
8976 buildExtractSubRegOrImm(MII, MRI, Op: Src0, SuperRC: Src0RC, SubIdx: AMDGPU::sub0, SubRC: Src0SubRC);
8977 MachineOperand Op1L =
8978 buildExtractSubRegOrImm(MII, MRI, Op: Src1, SuperRC: Src1RC, SubIdx: AMDGPU::sub0, SubRC: Src1SubRC);
8979
8980 unsigned Opc = Inst.getOpcode();
8981 unsigned NewOpc = Opc == AMDGPU::S_MUL_U64_U32_PSEUDO
8982 ? AMDGPU::V_MUL_HI_U32_e64
8983 : AMDGPU::V_MUL_HI_I32_e64;
8984 MachineInstr *HiHalf =
8985 BuildMI(BB&: MBB, I: MII, MIMD: DL, MCID: get(Opcode: NewOpc), DestReg: DestSub1).add(MO: Op1L).add(MO: Op0L);
8986
8987 MachineInstr *LoHalf =
8988 BuildMI(BB&: MBB, I: MII, MIMD: DL, MCID: get(Opcode: AMDGPU::V_MUL_LO_U32_e64), DestReg: DestSub0)
8989 .add(MO: Op1L)
8990 .add(MO: Op0L);
8991
8992 BuildMI(BB&: MBB, I: MII, MIMD: DL, MCID: get(Opcode: TargetOpcode::REG_SEQUENCE), DestReg: FullDestReg)
8993 .addReg(RegNo: DestSub0)
8994 .addImm(Val: AMDGPU::sub0)
8995 .addReg(RegNo: DestSub1)
8996 .addImm(Val: AMDGPU::sub1);
8997
8998 MRI.replaceRegWith(FromReg: Dest.getReg(), ToReg: FullDestReg);
8999
9000 // Try to legalize the operands in case we need to swap the order to keep it
9001 // valid.
9002 legalizeOperands(MI&: *HiHalf, MDT);
9003 legalizeOperands(MI&: *LoHalf, MDT);
9004
9005 // Move all users of this moved value.
9006 addUsersToMoveToVALUWorklist(Reg: FullDestReg, MRI, Worklist);
9007}
9008
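// Split a 64-bit scalar binary op that acts independently on each 32-bit
// half (such as a 64-bit and/or/xor) into two 32-bit ops of the given
// opcode and recombine the halves with a REG_SEQUENCE.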
9009void SIInstrInfo::splitScalar64BitBinaryOp(SIInstrWorklist &Worklist,
9010 MachineInstr &Inst, unsigned Opcode,
9011 MachineDominatorTree *MDT) const {
9012 MachineBasicBlock &MBB = *Inst.getParent();
9013 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
9014
9015 MachineOperand &Dest = Inst.getOperand(i: 0);
9016 MachineOperand &Src0 = Inst.getOperand(i: 1);
9017 MachineOperand &Src1 = Inst.getOperand(i: 2);
9018 const DebugLoc &DL = Inst.getDebugLoc();
9019
9020 MachineBasicBlock::iterator MII = Inst;
9021
9022 const MCInstrDesc &InstDesc = get(Opcode);
9023 const TargetRegisterClass *Src0RC = Src0.isReg() ?
9024 MRI.getRegClass(Reg: Src0.getReg()) :
9025 &AMDGPU::SGPR_32RegClass;
9026
9027 const TargetRegisterClass *Src0SubRC =
9028 RI.getSubRegisterClass(Src0RC, AMDGPU::sub0);
9029 const TargetRegisterClass *Src1RC = Src1.isReg() ?
9030 MRI.getRegClass(Reg: Src1.getReg()) :
9031 &AMDGPU::SGPR_32RegClass;
9032
9033 const TargetRegisterClass *Src1SubRC =
9034 RI.getSubRegisterClass(Src1RC, AMDGPU::sub0);
9035
9036 MachineOperand SrcReg0Sub0 = buildExtractSubRegOrImm(MII, MRI, Op: Src0, SuperRC: Src0RC,
9037 SubIdx: AMDGPU::sub0, SubRC: Src0SubRC);
9038 MachineOperand SrcReg1Sub0 = buildExtractSubRegOrImm(MII, MRI, Op: Src1, SuperRC: Src1RC,
9039 SubIdx: AMDGPU::sub0, SubRC: Src1SubRC);
9040 MachineOperand SrcReg0Sub1 = buildExtractSubRegOrImm(MII, MRI, Op: Src0, SuperRC: Src0RC,
9041 SubIdx: AMDGPU::sub1, SubRC: Src0SubRC);
9042 MachineOperand SrcReg1Sub1 = buildExtractSubRegOrImm(MII, MRI, Op: Src1, SuperRC: Src1RC,
9043 SubIdx: AMDGPU::sub1, SubRC: Src1SubRC);
9044
9045 const TargetRegisterClass *DestRC = MRI.getRegClass(Reg: Dest.getReg());
9046 const TargetRegisterClass *NewDestRC = RI.getEquivalentVGPRClass(SRC: DestRC);
9047 const TargetRegisterClass *NewDestSubRC =
9048 RI.getSubRegisterClass(NewDestRC, AMDGPU::sub0);
9049
9050 Register DestSub0 = MRI.createVirtualRegister(RegClass: NewDestSubRC);
9051 MachineInstr &LoHalf = *BuildMI(BB&: MBB, I: MII, MIMD: DL, MCID: InstDesc, DestReg: DestSub0)
9052 .add(MO: SrcReg0Sub0)
9053 .add(MO: SrcReg1Sub0);
9054
9055 Register DestSub1 = MRI.createVirtualRegister(RegClass: NewDestSubRC);
9056 MachineInstr &HiHalf = *BuildMI(BB&: MBB, I: MII, MIMD: DL, MCID: InstDesc, DestReg: DestSub1)
9057 .add(MO: SrcReg0Sub1)
9058 .add(MO: SrcReg1Sub1);
9059
9060 Register FullDestReg = MRI.createVirtualRegister(RegClass: NewDestRC);
9061 BuildMI(BB&: MBB, I: MII, MIMD: DL, MCID: get(Opcode: TargetOpcode::REG_SEQUENCE), DestReg: FullDestReg)
9062 .addReg(RegNo: DestSub0)
9063 .addImm(Val: AMDGPU::sub0)
9064 .addReg(RegNo: DestSub1)
9065 .addImm(Val: AMDGPU::sub1);
9066
9067 MRI.replaceRegWith(FromReg: Dest.getReg(), ToReg: FullDestReg);
9068
9069 Worklist.insert(MI: &LoHalf);
9070 Worklist.insert(MI: &HiHalf);
9071
9072 // Move all users of this moved value.
9073 addUsersToMoveToVALUWorklist(Reg: FullDestReg, MRI, Worklist);
9074}
9075
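// Lower a 64-bit scalar xnor as s_not_b64 of one source followed by
// s_xor_b64. The NOT is applied to the SGPR operand when possible so that,
// if the XOR is later moved to the VALU, the inversion can stay on the
// scalar unit.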
9076void SIInstrInfo::splitScalar64BitXnor(SIInstrWorklist &Worklist,
9077 MachineInstr &Inst,
9078 MachineDominatorTree *MDT) const {
9079 MachineBasicBlock &MBB = *Inst.getParent();
9080 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
9081
9082 MachineOperand &Dest = Inst.getOperand(i: 0);
9083 MachineOperand &Src0 = Inst.getOperand(i: 1);
9084 MachineOperand &Src1 = Inst.getOperand(i: 2);
9085 const DebugLoc &DL = Inst.getDebugLoc();
9086
9087 MachineBasicBlock::iterator MII = Inst;
9088
9089 const TargetRegisterClass *DestRC = MRI.getRegClass(Reg: Dest.getReg());
9090
9091 Register Interm = MRI.createVirtualRegister(RegClass: &AMDGPU::SReg_64RegClass);
9092
9093 MachineOperand* Op0;
9094 MachineOperand* Op1;
9095
9096 if (Src0.isReg() && RI.isSGPRReg(MRI, Reg: Src0.getReg())) {
9097 Op0 = &Src0;
9098 Op1 = &Src1;
9099 } else {
9100 Op0 = &Src1;
9101 Op1 = &Src0;
9102 }
9103
9104 BuildMI(BB&: MBB, I: MII, MIMD: DL, MCID: get(Opcode: AMDGPU::S_NOT_B64), DestReg: Interm)
9105 .add(MO: *Op0);
9106
9107 Register NewDest = MRI.createVirtualRegister(RegClass: DestRC);
9108
9109 MachineInstr &Xor = *BuildMI(BB&: MBB, I: MII, MIMD: DL, MCID: get(Opcode: AMDGPU::S_XOR_B64), DestReg: NewDest)
9110 .addReg(RegNo: Interm)
9111 .add(MO: *Op1);
9112
9113 MRI.replaceRegWith(FromReg: Dest.getReg(), ToReg: NewDest);
9114
9115 Worklist.insert(MI: &Xor);
9116}
9117
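// Lower a 64-bit scalar popcount using V_BCNT_U32_B32, which computes
// popcount(src0) + src1: first accumulate the low half onto 0, then
// accumulate the high half onto that partial sum.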
9118void SIInstrInfo::splitScalar64BitBCNT(SIInstrWorklist &Worklist,
9119 MachineInstr &Inst) const {
9120 MachineBasicBlock &MBB = *Inst.getParent();
9121 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
9122
9123 MachineBasicBlock::iterator MII = Inst;
9124 const DebugLoc &DL = Inst.getDebugLoc();
9125
9126 MachineOperand &Dest = Inst.getOperand(i: 0);
9127 MachineOperand &Src = Inst.getOperand(i: 1);
9128
9129 const MCInstrDesc &InstDesc = get(Opcode: AMDGPU::V_BCNT_U32_B32_e64);
9130 const TargetRegisterClass *SrcRC = Src.isReg() ?
9131 MRI.getRegClass(Reg: Src.getReg()) :
9132 &AMDGPU::SGPR_32RegClass;
9133
9134 Register MidReg = MRI.createVirtualRegister(RegClass: &AMDGPU::VGPR_32RegClass);
9135 Register ResultReg = MRI.createVirtualRegister(RegClass: &AMDGPU::VGPR_32RegClass);
9136
9137 const TargetRegisterClass *SrcSubRC =
9138 RI.getSubRegisterClass(SrcRC, AMDGPU::sub0);
9139
9140 MachineOperand SrcRegSub0 = buildExtractSubRegOrImm(MII, MRI, Op: Src, SuperRC: SrcRC,
9141 SubIdx: AMDGPU::sub0, SubRC: SrcSubRC);
9142 MachineOperand SrcRegSub1 = buildExtractSubRegOrImm(MII, MRI, Op: Src, SuperRC: SrcRC,
9143 SubIdx: AMDGPU::sub1, SubRC: SrcSubRC);
9144
9145 BuildMI(BB&: MBB, I: MII, MIMD: DL, MCID: InstDesc, DestReg: MidReg).add(MO: SrcRegSub0).addImm(Val: 0);
9146
9147 BuildMI(BB&: MBB, I: MII, MIMD: DL, MCID: InstDesc, DestReg: ResultReg).add(MO: SrcRegSub1).addReg(RegNo: MidReg);
9148
9149 MRI.replaceRegWith(FromReg: Dest.getReg(), ToReg: ResultReg);
9150
9151 // We don't need to legalize operands here. src0 for either instruction can be
9152 // an SGPR, and the second input is unused or determined here.
9153 addUsersToMoveToVALUWorklist(Reg: ResultReg, MRI, Worklist);
9154}
9155
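// Lower a 64-bit scalar sign-extending bitfield extract (s_bfe_i64 with
// offset 0 and width <= 32): sign-extend within the low half with
// V_BFE_I32 (or use the low half directly when the width is exactly 32)
// and fill the high half with the sign bit via an arithmetic shift by 31.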
9156void SIInstrInfo::splitScalar64BitBFE(SIInstrWorklist &Worklist,
9157 MachineInstr &Inst) const {
9158 MachineBasicBlock &MBB = *Inst.getParent();
9159 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
9160 MachineBasicBlock::iterator MII = Inst;
9161 const DebugLoc &DL = Inst.getDebugLoc();
9162
9163 MachineOperand &Dest = Inst.getOperand(i: 0);
9164 uint32_t Imm = Inst.getOperand(i: 2).getImm();
9165 uint32_t Offset = Imm & 0x3f; // Extract bits [5:0].
9166 uint32_t BitWidth = (Imm & 0x7f0000) >> 16; // Extract bits [22:16].
9167
9168 (void) Offset;
9169
9170 // Only sext_inreg cases handled.
9171 assert(Inst.getOpcode() == AMDGPU::S_BFE_I64 && BitWidth <= 32 &&
9172 Offset == 0 && "Not implemented");
9173
9174 if (BitWidth < 32) {
9175 Register MidRegLo = MRI.createVirtualRegister(RegClass: &AMDGPU::VGPR_32RegClass);
9176 Register MidRegHi = MRI.createVirtualRegister(RegClass: &AMDGPU::VGPR_32RegClass);
9177 Register ResultReg = MRI.createVirtualRegister(RegClass: &AMDGPU::VReg_64RegClass);
9178
9179 BuildMI(BB&: MBB, I: MII, MIMD: DL, MCID: get(Opcode: AMDGPU::V_BFE_I32_e64), DestReg: MidRegLo)
9180 .addReg(RegNo: Inst.getOperand(i: 1).getReg(), Flags: {}, SubReg: AMDGPU::sub0)
9181 .addImm(Val: 0)
9182 .addImm(Val: BitWidth);
9183
9184 BuildMI(BB&: MBB, I: MII, MIMD: DL, MCID: get(Opcode: AMDGPU::V_ASHRREV_I32_e32), DestReg: MidRegHi)
9185 .addImm(Val: 31)
9186 .addReg(RegNo: MidRegLo);
9187
9188 BuildMI(BB&: MBB, I: MII, MIMD: DL, MCID: get(Opcode: TargetOpcode::REG_SEQUENCE), DestReg: ResultReg)
9189 .addReg(RegNo: MidRegLo)
9190 .addImm(Val: AMDGPU::sub0)
9191 .addReg(RegNo: MidRegHi)
9192 .addImm(Val: AMDGPU::sub1);
9193
9194 MRI.replaceRegWith(FromReg: Dest.getReg(), ToReg: ResultReg);
9195 addUsersToMoveToVALUWorklist(Reg: ResultReg, MRI, Worklist);
9196 return;
9197 }
9198
9199 MachineOperand &Src = Inst.getOperand(i: 1);
9200 Register TmpReg = MRI.createVirtualRegister(RegClass: &AMDGPU::VGPR_32RegClass);
9201 Register ResultReg = MRI.createVirtualRegister(RegClass: &AMDGPU::VReg_64RegClass);
9202
9203 BuildMI(BB&: MBB, I: MII, MIMD: DL, MCID: get(Opcode: AMDGPU::V_ASHRREV_I32_e64), DestReg: TmpReg)
9204 .addImm(Val: 31)
9205 .addReg(RegNo: Src.getReg(), Flags: {}, SubReg: AMDGPU::sub0);
9206
9207 BuildMI(BB&: MBB, I: MII, MIMD: DL, MCID: get(Opcode: TargetOpcode::REG_SEQUENCE), DestReg: ResultReg)
9208 .addReg(RegNo: Src.getReg(), Flags: {}, SubReg: AMDGPU::sub0)
9209 .addImm(Val: AMDGPU::sub0)
9210 .addReg(RegNo: TmpReg)
9211 .addImm(Val: AMDGPU::sub1);
9212
9213 MRI.replaceRegWith(FromReg: Dest.getReg(), ToReg: ResultReg);
9214 addUsersToMoveToVALUWorklist(Reg: ResultReg, MRI, Worklist);
9215}
9216
9217void SIInstrInfo::splitScalar64BitCountOp(SIInstrWorklist &Worklist,
9218 MachineInstr &Inst, unsigned Opcode,
9219 MachineDominatorTree *MDT) const {
9220 // (S_FLBIT_I32_B64 hi:lo) ->
9221 //   (umin (V_FFBH_U32_e32 hi), (uaddsat (V_FFBH_U32_e32 lo), 32))
9222 // (S_FF1_I32_B64 hi:lo) ->
9223 //   (umin (uaddsat (V_FFBL_B32_e32 hi), 32), (V_FFBL_B32_e32 lo))
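// The saturating add (clamp) matters when the half being offset by 32 has
// no set bits: V_FFBH/V_FFBL return -1 (0xffffffff) in that case, and the
// clamp keeps the sum at 0xffffffff instead of wrapping, so the umin still
// yields the correct result (including -1 when the whole source is zero).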
9224
9225 MachineBasicBlock &MBB = *Inst.getParent();
9226 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
9227 MachineBasicBlock::iterator MII = Inst;
9228 const DebugLoc &DL = Inst.getDebugLoc();
9229
9230 MachineOperand &Dest = Inst.getOperand(i: 0);
9231 MachineOperand &Src = Inst.getOperand(i: 1);
9232
9233 const MCInstrDesc &InstDesc = get(Opcode);
9234
9235 bool IsCtlz = Opcode == AMDGPU::V_FFBH_U32_e32;
9236 unsigned OpcodeAdd = ST.hasAddNoCarryInsts() ? AMDGPU::V_ADD_U32_e64
9237 : AMDGPU::V_ADD_CO_U32_e32;
9238
9239 const TargetRegisterClass *SrcRC =
9240 Src.isReg() ? MRI.getRegClass(Reg: Src.getReg()) : &AMDGPU::SGPR_32RegClass;
9241 const TargetRegisterClass *SrcSubRC =
9242 RI.getSubRegisterClass(SrcRC, AMDGPU::sub0);
9243
9244 MachineOperand SrcRegSub0 =
9245 buildExtractSubRegOrImm(MII, MRI, Op: Src, SuperRC: SrcRC, SubIdx: AMDGPU::sub0, SubRC: SrcSubRC);
9246 MachineOperand SrcRegSub1 =
9247 buildExtractSubRegOrImm(MII, MRI, Op: Src, SuperRC: SrcRC, SubIdx: AMDGPU::sub1, SubRC: SrcSubRC);
9248
9249 Register MidReg1 = MRI.createVirtualRegister(RegClass: &AMDGPU::VGPR_32RegClass);
9250 Register MidReg2 = MRI.createVirtualRegister(RegClass: &AMDGPU::VGPR_32RegClass);
9251 Register MidReg3 = MRI.createVirtualRegister(RegClass: &AMDGPU::VGPR_32RegClass);
9252 Register MidReg4 = MRI.createVirtualRegister(RegClass: &AMDGPU::VGPR_32RegClass);
9253
9254 BuildMI(BB&: MBB, I: MII, MIMD: DL, MCID: InstDesc, DestReg: MidReg1).add(MO: SrcRegSub0);
9255
9256 BuildMI(BB&: MBB, I: MII, MIMD: DL, MCID: InstDesc, DestReg: MidReg2).add(MO: SrcRegSub1);
9257
9258 BuildMI(BB&: MBB, I: MII, MIMD: DL, MCID: get(Opcode: OpcodeAdd), DestReg: MidReg3)
9259 .addReg(RegNo: IsCtlz ? MidReg1 : MidReg2)
9260 .addImm(Val: 32)
9261 .addImm(Val: 1); // enable clamp
9262
9263 BuildMI(BB&: MBB, I: MII, MIMD: DL, MCID: get(Opcode: AMDGPU::V_MIN_U32_e64), DestReg: MidReg4)
9264 .addReg(RegNo: MidReg3)
9265 .addReg(RegNo: IsCtlz ? MidReg2 : MidReg1);
9266
9267 MRI.replaceRegWith(FromReg: Dest.getReg(), ToReg: MidReg4);
9268
9269 addUsersToMoveToVALUWorklist(Reg: MidReg4, MRI, Worklist);
9270}
9271
9272void SIInstrInfo::addUsersToMoveToVALUWorklist(
9273 Register DstReg, MachineRegisterInfo &MRI,
9274 SIInstrWorklist &Worklist) const {
9275 for (MachineOperand &MO : make_early_inc_range(Range: MRI.use_operands(Reg: DstReg))) {
9276 MachineInstr &UseMI = *MO.getParent();
9277
9278 unsigned OpNo = 0;
9279
9280 switch (UseMI.getOpcode()) {
9281 case AMDGPU::COPY:
9282 case AMDGPU::WQM:
9283 case AMDGPU::SOFT_WQM:
9284 case AMDGPU::STRICT_WWM:
9285 case AMDGPU::STRICT_WQM:
9286 case AMDGPU::REG_SEQUENCE:
9287 case AMDGPU::PHI:
9288 case AMDGPU::INSERT_SUBREG:
9289 break;
9290 default:
9291 OpNo = MO.getOperandNo();
9292 break;
9293 }
9294
9295 const TargetRegisterClass *OpRC = getOpRegClass(MI: UseMI, OpNo);
9296 MRI.constrainRegClass(Reg: DstReg, RC: OpRC);
9297
9298 if (!RI.hasVectorRegisters(RC: OpRC))
9299 Worklist.insert(MI: &UseMI);
9300 else
9301 // Legalization could change user list.
9302 legalizeOperandsVALUt16(MI&: UseMI, OpIdx: OpNo, MRI);
9303 }
9304}
9305
9306void SIInstrInfo::movePackToVALU(SIInstrWorklist &Worklist,
9307 MachineRegisterInfo &MRI,
9308 MachineInstr &Inst) const {
9309 Register ResultReg = MRI.createVirtualRegister(RegClass: &AMDGPU::VGPR_32RegClass);
9310 MachineBasicBlock *MBB = Inst.getParent();
9311 MachineOperand &Src0 = Inst.getOperand(i: 1);
9312 MachineOperand &Src1 = Inst.getOperand(i: 2);
9313 const DebugLoc &DL = Inst.getDebugLoc();
9314
9315 if (ST.useRealTrue16Insts()) {
9316 Register SrcReg0, SrcReg1;
9317 if (!Src0.isReg() || !RI.isVGPR(MRI, Reg: Src0.getReg())) {
9318 SrcReg0 = MRI.createVirtualRegister(RegClass: &AMDGPU::VGPR_32RegClass);
9319 BuildMI(BB&: *MBB, I&: Inst, MIMD: DL, MCID: get(Opcode: AMDGPU::V_MOV_B32_e32), DestReg: SrcReg0).add(MO: Src0);
9320 } else {
9321 SrcReg0 = Src0.getReg();
9322 }
9323
9324 if (!Src1.isReg() || !RI.isVGPR(MRI, Reg: Src1.getReg())) {
9325 SrcReg1 = MRI.createVirtualRegister(RegClass: &AMDGPU::VGPR_32RegClass);
9326 BuildMI(BB&: *MBB, I&: Inst, MIMD: DL, MCID: get(Opcode: AMDGPU::V_MOV_B32_e32), DestReg: SrcReg1).add(MO: Src1);
9327 } else {
9328 SrcReg1 = Src1.getReg();
9329 }
9330
9331 bool isSrc0Reg16 = MRI.constrainRegClass(Reg: SrcReg0, RC: &AMDGPU::VGPR_16RegClass);
9332 bool isSrc1Reg16 = MRI.constrainRegClass(Reg: SrcReg1, RC: &AMDGPU::VGPR_16RegClass);
9333
9334 auto NewMI = BuildMI(BB&: *MBB, I&: Inst, MIMD: DL, MCID: get(Opcode: AMDGPU::REG_SEQUENCE), DestReg: ResultReg);
9335 switch (Inst.getOpcode()) {
9336 case AMDGPU::S_PACK_LL_B32_B16:
9337 NewMI
9338 .addReg(RegNo: SrcReg0, Flags: {},
9339 SubReg: isSrc0Reg16 ? AMDGPU::NoSubRegister : AMDGPU::lo16)
9340 .addImm(Val: AMDGPU::lo16)
9341 .addReg(RegNo: SrcReg1, Flags: {},
9342 SubReg: isSrc1Reg16 ? AMDGPU::NoSubRegister : AMDGPU::lo16)
9343 .addImm(Val: AMDGPU::hi16);
9344 break;
9345 case AMDGPU::S_PACK_LH_B32_B16:
9346 NewMI
9347 .addReg(RegNo: SrcReg0, Flags: {},
9348 SubReg: isSrc0Reg16 ? AMDGPU::NoSubRegister : AMDGPU::lo16)
9349 .addImm(Val: AMDGPU::lo16)
9350 .addReg(RegNo: SrcReg1, Flags: {}, SubReg: AMDGPU::hi16)
9351 .addImm(Val: AMDGPU::hi16);
9352 break;
9353 case AMDGPU::S_PACK_HL_B32_B16:
9354 NewMI.addReg(RegNo: SrcReg0, Flags: {}, SubReg: AMDGPU::hi16)
9355 .addImm(Val: AMDGPU::lo16)
9356 .addReg(RegNo: SrcReg1, Flags: {},
9357 SubReg: isSrc1Reg16 ? AMDGPU::NoSubRegister : AMDGPU::lo16)
9358 .addImm(Val: AMDGPU::hi16);
9359 break;
9360 case AMDGPU::S_PACK_HH_B32_B16:
9361 NewMI.addReg(RegNo: SrcReg0, Flags: {}, SubReg: AMDGPU::hi16)
9362 .addImm(Val: AMDGPU::lo16)
9363 .addReg(RegNo: SrcReg1, Flags: {}, SubReg: AMDGPU::hi16)
9364 .addImm(Val: AMDGPU::hi16);
9365 break;
9366 default:
9367 llvm_unreachable("unhandled s_pack_* instruction");
9368 }
9369
9370 MachineOperand &Dest = Inst.getOperand(i: 0);
9371 MRI.replaceRegWith(FromReg: Dest.getReg(), ToReg: ResultReg);
9372 addUsersToMoveToVALUWorklist(DstReg: ResultReg, MRI, Worklist);
9373 return;
9374 }
9375
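// Without true16 instructions, expand each s_pack_* into 32-bit VALU bit
// manipulation. For example, s_pack_ll_b32_b16 d, s0, s1 becomes
//   d = (s1 << 16) | (s0 & 0xffff)
// via a V_AND_B32 followed by V_LSHL_OR_B32.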
9376 switch (Inst.getOpcode()) {
9377 case AMDGPU::S_PACK_LL_B32_B16: {
9378 Register ImmReg = MRI.createVirtualRegister(RegClass: &AMDGPU::VGPR_32RegClass);
9379 Register TmpReg = MRI.createVirtualRegister(RegClass: &AMDGPU::VGPR_32RegClass);
9380
9381 // FIXME: Can do a lot better if we know the high bits of src0 or src1 are
9382 // 0.
9383 BuildMI(BB&: *MBB, I&: Inst, MIMD: DL, MCID: get(Opcode: AMDGPU::V_MOV_B32_e32), DestReg: ImmReg)
9384 .addImm(Val: 0xffff);
9385
9386 BuildMI(BB&: *MBB, I&: Inst, MIMD: DL, MCID: get(Opcode: AMDGPU::V_AND_B32_e64), DestReg: TmpReg)
9387 .addReg(RegNo: ImmReg, Flags: RegState::Kill)
9388 .add(MO: Src0);
9389
9390 BuildMI(BB&: *MBB, I&: Inst, MIMD: DL, MCID: get(Opcode: AMDGPU::V_LSHL_OR_B32_e64), DestReg: ResultReg)
9391 .add(MO: Src1)
9392 .addImm(Val: 16)
9393 .addReg(RegNo: TmpReg, Flags: RegState::Kill);
9394 break;
9395 }
9396 case AMDGPU::S_PACK_LH_B32_B16: {
9397 Register ImmReg = MRI.createVirtualRegister(RegClass: &AMDGPU::VGPR_32RegClass);
9398 BuildMI(BB&: *MBB, I&: Inst, MIMD: DL, MCID: get(Opcode: AMDGPU::V_MOV_B32_e32), DestReg: ImmReg)
9399 .addImm(Val: 0xffff);
9400 BuildMI(BB&: *MBB, I&: Inst, MIMD: DL, MCID: get(Opcode: AMDGPU::V_BFI_B32_e64), DestReg: ResultReg)
9401 .addReg(RegNo: ImmReg, Flags: RegState::Kill)
9402 .add(MO: Src0)
9403 .add(MO: Src1);
9404 break;
9405 }
9406 case AMDGPU::S_PACK_HL_B32_B16: {
9407 Register TmpReg = MRI.createVirtualRegister(RegClass: &AMDGPU::VGPR_32RegClass);
9408 BuildMI(BB&: *MBB, I&: Inst, MIMD: DL, MCID: get(Opcode: AMDGPU::V_LSHRREV_B32_e64), DestReg: TmpReg)
9409 .addImm(Val: 16)
9410 .add(MO: Src0);
9411 BuildMI(BB&: *MBB, I&: Inst, MIMD: DL, MCID: get(Opcode: AMDGPU::V_LSHL_OR_B32_e64), DestReg: ResultReg)
9412 .add(MO: Src1)
9413 .addImm(Val: 16)
9414 .addReg(RegNo: TmpReg, Flags: RegState::Kill);
9415 break;
9416 }
9417 case AMDGPU::S_PACK_HH_B32_B16: {
9418 Register ImmReg = MRI.createVirtualRegister(RegClass: &AMDGPU::VGPR_32RegClass);
9419 Register TmpReg = MRI.createVirtualRegister(RegClass: &AMDGPU::VGPR_32RegClass);
9420 BuildMI(BB&: *MBB, I&: Inst, MIMD: DL, MCID: get(Opcode: AMDGPU::V_LSHRREV_B32_e64), DestReg: TmpReg)
9421 .addImm(Val: 16)
9422 .add(MO: Src0);
9423 BuildMI(BB&: *MBB, I&: Inst, MIMD: DL, MCID: get(Opcode: AMDGPU::V_MOV_B32_e32), DestReg: ImmReg)
9424 .addImm(Val: 0xffff0000);
9425 BuildMI(BB&: *MBB, I&: Inst, MIMD: DL, MCID: get(Opcode: AMDGPU::V_AND_OR_B32_e64), DestReg: ResultReg)
9426 .add(MO: Src1)
9427 .addReg(RegNo: ImmReg, Flags: RegState::Kill)
9428 .addReg(RegNo: TmpReg, Flags: RegState::Kill);
9429 break;
9430 }
9431 default:
9432 llvm_unreachable("unhandled s_pack_* instruction");
9433 }
9434
9435 MachineOperand &Dest = Inst.getOperand(i: 0);
9436 MRI.replaceRegWith(FromReg: Dest.getReg(), ToReg: ResultReg);
9437 addUsersToMoveToVALUWorklist(DstReg: ResultReg, MRI, Worklist);
9438}
9439
9440void SIInstrInfo::addSCCDefUsersToVALUWorklist(const MachineOperand &Op,
9441 MachineInstr &SCCDefInst,
9442 SIInstrWorklist &Worklist,
9443 Register NewCond) const {
9444
9445 // Ensure that def inst defines SCC, which is still live.
9446 assert(Op.isReg() && Op.getReg() == AMDGPU::SCC && Op.isDef() &&
9447 !Op.isDead() && Op.getParent() == &SCCDefInst);
9448 SmallVector<MachineInstr *, 4> CopyToDelete;
9449 // This assumes that all the users of SCC are in the same block
9450 // as the SCC def.
9451 for (MachineInstr &MI : // Skip the def inst itself.
9452 make_range(x: std::next(x: MachineBasicBlock::iterator(SCCDefInst)),
9453 y: SCCDefInst.getParent()->end())) {
9454 // Check if SCC is used first.
9455 int SCCIdx = MI.findRegisterUseOperandIdx(Reg: AMDGPU::SCC, TRI: &RI, isKill: false);
9456 if (SCCIdx != -1) {
9457 if (MI.isCopy()) {
9458 MachineRegisterInfo &MRI = MI.getMF()->getRegInfo();
9459 Register DestReg = MI.getOperand(i: 0).getReg();
9460
9461 MRI.replaceRegWith(FromReg: DestReg, ToReg: NewCond);
9462 CopyToDelete.push_back(Elt: &MI);
9463 } else {
9464
9465 if (NewCond.isValid())
9466 MI.getOperand(i: SCCIdx).setReg(NewCond);
9467
9468 Worklist.insert(MI: &MI);
9469 }
9470 }
9471 // Exit if we find another SCC def.
9472 if (MI.findRegisterDefOperandIdx(Reg: AMDGPU::SCC, TRI: &RI, isDead: false, Overlap: false) != -1)
9473 break;
9474 }
9475 for (auto &Copy : CopyToDelete)
9476 Copy->eraseFromParent();
9477}
9478
9479// Instructions that use SCC may be converted to VALU instructions. When that
9480// happens, the SCC register is changed to VCC_LO. The instruction that defines
9481// SCC must be changed to an instruction that defines VCC. This function makes
9482// sure that the instruction that defines SCC is added to the moveToVALU
9483// worklist.
9484void SIInstrInfo::addSCCDefsToVALUWorklist(MachineInstr *SCCUseInst,
9485 SIInstrWorklist &Worklist) const {
9486 // Look for a preceding instruction that either defines VCC or SCC. If VCC
9487 // then there is nothing to do because the defining instruction has been
9488 // converted to a VALU already. If SCC then that instruction needs to be
9489 // converted to a VALU.
9490 for (MachineInstr &MI :
9491 make_range(x: std::next(x: MachineBasicBlock::reverse_iterator(SCCUseInst)),
9492 y: SCCUseInst->getParent()->rend())) {
9493 if (MI.modifiesRegister(Reg: AMDGPU::VCC, TRI: &RI))
9494 break;
9495 if (MI.definesRegister(Reg: AMDGPU::SCC, TRI: &RI)) {
9496 Worklist.insert(MI: &MI);
9497 break;
9498 }
9499 }
9500}
9501
9502const TargetRegisterClass *SIInstrInfo::getDestEquivalentVGPRClass(
9503 const MachineInstr &Inst) const {
9504 const TargetRegisterClass *NewDstRC = getOpRegClass(MI: Inst, OpNo: 0);
9505
9506 switch (Inst.getOpcode()) {
9507 // For target instructions, getOpRegClass just returns the virtual register
9508 // class associated with the operand, so we need to find an equivalent VGPR
9509 // register class in order to move the instruction to the VALU.
9510 case AMDGPU::COPY:
9511 case AMDGPU::PHI:
9512 case AMDGPU::REG_SEQUENCE:
9513 case AMDGPU::INSERT_SUBREG:
9514 case AMDGPU::WQM:
9515 case AMDGPU::SOFT_WQM:
9516 case AMDGPU::STRICT_WWM:
9517 case AMDGPU::STRICT_WQM: {
9518 const TargetRegisterClass *SrcRC = getOpRegClass(MI: Inst, OpNo: 1);
9519 if (RI.isAGPRClass(RC: SrcRC)) {
9520 if (RI.isAGPRClass(RC: NewDstRC))
9521 return nullptr;
9522
9523 switch (Inst.getOpcode()) {
9524 case AMDGPU::PHI:
9525 case AMDGPU::REG_SEQUENCE:
9526 case AMDGPU::INSERT_SUBREG:
9527 NewDstRC = RI.getEquivalentAGPRClass(SRC: NewDstRC);
9528 break;
9529 default:
9530 NewDstRC = RI.getEquivalentVGPRClass(SRC: NewDstRC);
9531 }
9532
9533 if (!NewDstRC)
9534 return nullptr;
9535 } else {
9536 if (RI.isVGPRClass(RC: NewDstRC) || NewDstRC == &AMDGPU::VReg_1RegClass)
9537 return nullptr;
9538
9539 NewDstRC = RI.getEquivalentVGPRClass(SRC: NewDstRC);
9540 if (!NewDstRC)
9541 return nullptr;
9542 }
9543
9544 return NewDstRC;
9545 }
9546 default:
9547 return NewDstRC;
9548 }
9549}
9550
9551// Find the one SGPR operand we are allowed to use.
9552Register SIInstrInfo::findUsedSGPR(const MachineInstr &MI,
9553 int OpIndices[3]) const {
9554 const MCInstrDesc &Desc = MI.getDesc();
9555
9556 // Find the one SGPR operand we are allowed to use.
9557 //
9558 // First we need to consider the instruction's operand requirements before
9559 // legalizing. Some operands are required to be SGPRs, such as implicit uses
9560 // of VCC, but we are still bound by the constant bus requirement to only use
9561 // one.
9562 //
9563 // If the operand's class is an SGPR, we can never move it.
9564
9565 Register SGPRReg = findImplicitSGPRRead(MI);
9566 if (SGPRReg)
9567 return SGPRReg;
9568
9569 Register UsedSGPRs[3] = {Register()};
9570 const MachineRegisterInfo &MRI = MI.getMF()->getRegInfo();
9571
9572 for (unsigned i = 0; i < 3; ++i) {
9573 int Idx = OpIndices[i];
9574 if (Idx == -1)
9575 break;
9576
9577 const MachineOperand &MO = MI.getOperand(i: Idx);
9578 if (!MO.isReg())
9579 continue;
9580
9581 // Is this operand statically required to be an SGPR based on the operand
9582 // constraints?
9583 const TargetRegisterClass *OpRC =
9584 RI.getRegClass(i: getOpRegClassID(OpInfo: Desc.operands()[Idx]));
9585 bool IsRequiredSGPR = RI.isSGPRClass(RC: OpRC);
9586 if (IsRequiredSGPR)
9587 return MO.getReg();
9588
9589 // If this could be a VGPR or an SGPR, check the dynamic register class.
9590 Register Reg = MO.getReg();
9591 const TargetRegisterClass *RegRC = MRI.getRegClass(Reg);
9592 if (RI.isSGPRClass(RC: RegRC))
9593 UsedSGPRs[i] = Reg;
9594 }
9595
9596 // We don't have a required SGPR operand, so we have a bit more freedom in
9597 // selecting operands to move.
9598
9599 // Try to select the most used SGPR. If an SGPR is equal to one of the
9600 // others, we choose that.
9601 //
9602 // e.g.
9603 // V_FMA_F32 v0, s0, s0, s0 -> No moves
9604 // V_FMA_F32 v0, s0, s1, s0 -> Move s1
9605
9606 // TODO: If some of the operands are 64-bit SGPRs and some 32, we should
9607 // prefer those.
9608
9609 if (UsedSGPRs[0]) {
9610 if (UsedSGPRs[0] == UsedSGPRs[1] || UsedSGPRs[0] == UsedSGPRs[2])
9611 SGPRReg = UsedSGPRs[0];
9612 }
9613
9614 if (!SGPRReg && UsedSGPRs[1]) {
9615 if (UsedSGPRs[1] == UsedSGPRs[2])
9616 SGPRReg = UsedSGPRs[1];
9617 }
9618
9619 return SGPRReg;
9620}
9621
9622MachineOperand *SIInstrInfo::getNamedOperand(MachineInstr &MI,
9623 AMDGPU::OpName OperandName) const {
9624 if (OperandName == AMDGPU::OpName::NUM_OPERAND_NAMES)
9625 return nullptr;
9626
9627 int Idx = AMDGPU::getNamedOperandIdx(Opcode: MI.getOpcode(), Name: OperandName);
9628 if (Idx == -1)
9629 return nullptr;
9630
9631 return &MI.getOperand(i: Idx);
9632}
9633
9634uint64_t SIInstrInfo::getDefaultRsrcDataFormat() const {
9635 if (ST.getGeneration() >= AMDGPUSubtarget::GFX10) {
9636 int64_t Format = ST.getGeneration() >= AMDGPUSubtarget::GFX11
9637 ? (int64_t)AMDGPU::UfmtGFX11::UFMT_32_FLOAT
9638 : (int64_t)AMDGPU::UfmtGFX10::UFMT_32_FLOAT;
9639 return (Format << 44) |
9640 (1ULL << 56) | // RESOURCE_LEVEL = 1
9641 (3ULL << 60); // OOB_SELECT = 3
9642 }
9643
9644 uint64_t RsrcDataFormat = AMDGPU::RSRC_DATA_FORMAT;
9645 if (ST.isAmdHsaOS()) {
9646 // Set ATC = 1. GFX9 doesn't have this bit.
9647 if (ST.getGeneration() <= AMDGPUSubtarget::VOLCANIC_ISLANDS)
9648 RsrcDataFormat |= (1ULL << 56);
9649
9650 // Set MTYPE = 2 (MTYPE_UC = uncached). GFX9 doesn't have this.
9651 // Note that this disables the TC (L2) cache and therefore decreases performance.
9652 if (ST.getGeneration() == AMDGPUSubtarget::VOLCANIC_ISLANDS)
9653 RsrcDataFormat |= (2ULL << 59);
9654 }
9655
9656 return RsrcDataFormat;
9657}
9658
9659uint64_t SIInstrInfo::getScratchRsrcWords23() const {
9660 uint64_t Rsrc23 = getDefaultRsrcDataFormat() |
9661 AMDGPU::RSRC_TID_ENABLE |
9662 0xffffffff; // Size;
9663
9664 // GFX9 doesn't have ELEMENT_SIZE.
9665 if (ST.getGeneration() <= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
9666 uint64_t EltSizeValue = Log2_32(Value: ST.getMaxPrivateElementSize(ForBufferRSrc: true)) - 1;
9667 Rsrc23 |= EltSizeValue << AMDGPU::RSRC_ELEMENT_SIZE_SHIFT;
9668 }
9669
9670 // IndexStride = 64 for wave64, 32 for wave32 (encoded as 3 and 2).
9671 uint64_t IndexStride = ST.isWave64() ? 3 : 2;
9672 Rsrc23 |= IndexStride << AMDGPU::RSRC_INDEX_STRIDE_SHIFT;
9673
9674 // If TID_ENABLE is set, DATA_FORMAT specifies stride bits [14:17].
9675 // Clear them unless we want a huge stride.
9676 if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS &&
9677 ST.getGeneration() <= AMDGPUSubtarget::GFX9)
9678 Rsrc23 &= ~AMDGPU::RSRC_DATA_FORMAT;
9679
9680 return Rsrc23;
9681}
9682
9683bool SIInstrInfo::isLowLatencyInstruction(const MachineInstr &MI) const {
9684 unsigned Opc = MI.getOpcode();
9685
9686 return isSMRD(Opcode: Opc);
9687}
9688
9689bool SIInstrInfo::isHighLatencyDef(int Opc) const {
9690 return get(Opcode: Opc).mayLoad() &&
9691 (isMUBUF(Opcode: Opc) || isMTBUF(Opcode: Opc) || isMIMG(Opcode: Opc) || isFLAT(Opcode: Opc));
9692}
9693
9694Register SIInstrInfo::isStackAccess(const MachineInstr &MI,
9695 int &FrameIndex) const {
9696 const MachineOperand *Addr = getNamedOperand(MI, OperandName: AMDGPU::OpName::vaddr);
9697 if (!Addr || !Addr->isFI())
9698 return Register();
9699
9700 assert(!MI.memoperands_empty() &&
9701 (*MI.memoperands_begin())->getAddrSpace() == AMDGPUAS::PRIVATE_ADDRESS);
9702
9703 FrameIndex = Addr->getIndex();
9704 return getNamedOperand(MI, OperandName: AMDGPU::OpName::vdata)->getReg();
9705}
9706
9707Register SIInstrInfo::isSGPRStackAccess(const MachineInstr &MI,
9708 int &FrameIndex) const {
9709 const MachineOperand *Addr = getNamedOperand(MI, OperandName: AMDGPU::OpName::addr);
9710 assert(Addr && Addr->isFI());
9711 FrameIndex = Addr->getIndex();
9712 return getNamedOperand(MI, OperandName: AMDGPU::OpName::data)->getReg();
9713}
9714
9715Register SIInstrInfo::isLoadFromStackSlot(const MachineInstr &MI,
9716 int &FrameIndex) const {
9717 if (!MI.mayLoad())
9718 return Register();
9719
9720 if (isMUBUF(MI) || isVGPRSpill(MI))
9721 return isStackAccess(MI, FrameIndex);
9722
9723 if (isSGPRSpill(MI))
9724 return isSGPRStackAccess(MI, FrameIndex);
9725
9726 return Register();
9727}
9728
9729Register SIInstrInfo::isStoreToStackSlot(const MachineInstr &MI,
9730 int &FrameIndex) const {
9731 if (!MI.mayStore())
9732 return Register();
9733
9734 if (isMUBUF(MI) || isVGPRSpill(MI))
9735 return isStackAccess(MI, FrameIndex);
9736
9737 if (isSGPRSpill(MI))
9738 return isSGPRStackAccess(MI, FrameIndex);
9739
9740 return Register();
9741}
9742
9743unsigned SIInstrInfo::getInstBundleSize(const MachineInstr &MI) const {
9744 unsigned Size = 0;
9745 MachineBasicBlock::const_instr_iterator I = MI.getIterator();
9746 MachineBasicBlock::const_instr_iterator E = MI.getParent()->instr_end();
9747 while (++I != E && I->isInsideBundle()) {
9748 assert(!I->isBundle() && "No nested bundle!");
9749 Size += getInstSizeInBytes(MI: *I);
9750 }
9751
9752 return Size;
9753}
9754
9755unsigned SIInstrInfo::getInstSizeInBytes(const MachineInstr &MI) const {
9756 unsigned Opc = MI.getOpcode();
9757 const MCInstrDesc &Desc = getMCOpcodeFromPseudo(Opcode: Opc);
9758 unsigned DescSize = Desc.getSize();
9759
9760 // If we have a definitive size, we can use it. Otherwise we need to inspect
9761 // the operands to know the size.
9762 if (isFixedSize(MI)) {
9763 unsigned Size = DescSize;
9764
9765 // If we hit the buggy offset, an extra nop will be inserted in MC so
9766 // estimate the worst case.
9767 if (MI.isBranch() && ST.hasOffset3fBug())
9768 Size += 4;
9769
9770 return Size;
9771 }
9772
9773 // Instructions may have a 32-bit literal encoded after them. Check
9774 // operands that could ever be literals.
9775 if (isVALU(MI) || isSALU(MI)) {
9776 if (isDPP(MI))
9777 return DescSize;
9778 bool HasLiteral = false;
9779 unsigned LiteralSize = 4;
9780 for (int I = 0, E = MI.getNumExplicitOperands(); I != E; ++I) {
9781 const MachineOperand &Op = MI.getOperand(i: I);
9782 const MCOperandInfo &OpInfo = Desc.operands()[I];
9783 if (!Op.isReg() && !isInlineConstant(MO: Op, OpInfo)) {
9784 HasLiteral = true;
9785 if (ST.has64BitLiterals()) {
9786 switch (OpInfo.OperandType) {
9787 default:
9788 break;
9789 case AMDGPU::OPERAND_REG_IMM_FP64:
9790 if (!AMDGPU::isValid32BitLiteral(Val: Op.getImm(), IsFP64: true))
9791 LiteralSize = 8;
9792 break;
9793 case AMDGPU::OPERAND_REG_IMM_INT64:
9794 if (!Op.isImm() || !AMDGPU::isValid32BitLiteral(Val: Op.getImm(), IsFP64: false))
9795 LiteralSize = 8;
9796 break;
9797 }
9798 }
9799 break;
9800 }
9801 }
9802 return HasLiteral ? DescSize + LiteralSize : DescSize;
9803 }
9804
9805 // Check whether we have extra NSA words.
9806 if (isMIMG(MI)) {
9807 int VAddr0Idx = AMDGPU::getNamedOperandIdx(Opcode: Opc, Name: AMDGPU::OpName::vaddr0);
9808 if (VAddr0Idx < 0)
9809 return 8;
9810
9811 int RSrcIdx = AMDGPU::getNamedOperandIdx(Opcode: Opc, Name: AMDGPU::OpName::srsrc);
9812 return 8 + 4 * ((RSrcIdx - VAddr0Idx + 2) / 4);
9813 }
9814
9815 switch (Opc) {
9816 case TargetOpcode::BUNDLE:
9817 return getInstBundleSize(MI);
9818 case TargetOpcode::INLINEASM:
9819 case TargetOpcode::INLINEASM_BR: {
9820 const MachineFunction *MF = MI.getMF();
9821 const char *AsmStr = MI.getOperand(i: 0).getSymbolName();
9822 return getInlineAsmLength(Str: AsmStr, MAI: *MF->getTarget().getMCAsmInfo(), STI: &ST);
9823 }
9824 default:
9825 if (MI.isMetaInstruction())
9826 return 0;
9827
9828 // If this is a D16 pseudo instruction, get the correct MC code size.
9829 const auto *D16Info = AMDGPU::getT16D16Helper(T16Op: Opc);
9830 if (D16Info) {
9831 // Assume the d16_lo and d16_hi encodings are always the same size.
9832 unsigned LoInstOpcode = D16Info->LoOp;
9833 const MCInstrDesc &Desc = getMCOpcodeFromPseudo(Opcode: LoInstOpcode);
9834 DescSize = Desc.getSize();
9835 }
9836
9837 // If this is an FMA_MIX t16 pseudo, get the correct MC code size.
9838 if (Opc == AMDGPU::V_FMA_MIX_F16_t16 || Opc == AMDGPU::V_FMA_MIX_BF16_t16) {
9839 // All potential lowerings are the same size; arbitrarily pick one.
9840 const MCInstrDesc &Desc = getMCOpcodeFromPseudo(Opcode: AMDGPU::V_FMA_MIXLO_F16);
9841 DescSize = Desc.getSize();
9842 }
9843
9844 return DescSize;
9845 }
9846}
9847
9848bool SIInstrInfo::mayAccessFlatAddressSpace(const MachineInstr &MI) const {
9849 if (!isFLAT(MI))
9850 return false;
9851
9852 if (MI.memoperands_empty())
9853 return true;
9854
9855 for (const MachineMemOperand *MMO : MI.memoperands()) {
9856 if (MMO->getAddrSpace() == AMDGPUAS::FLAT_ADDRESS)
9857 return true;
9858 }
9859 return false;
9860}
9861
9862ArrayRef<std::pair<int, const char *>>
9863SIInstrInfo::getSerializableTargetIndices() const {
9864 static const std::pair<int, const char *> TargetIndices[] = {
9865 {AMDGPU::TI_CONSTDATA_START, "amdgpu-constdata-start"},
9866 {AMDGPU::TI_SCRATCH_RSRC_DWORD0, "amdgpu-scratch-rsrc-dword0"},
9867 {AMDGPU::TI_SCRATCH_RSRC_DWORD1, "amdgpu-scratch-rsrc-dword1"},
9868 {AMDGPU::TI_SCRATCH_RSRC_DWORD2, "amdgpu-scratch-rsrc-dword2"},
9869 {AMDGPU::TI_SCRATCH_RSRC_DWORD3, "amdgpu-scratch-rsrc-dword3"}};
9870 return ArrayRef(TargetIndices);
9871}
9872
9873 /// This is used by the post-RA scheduler (PostRASchedulerList.cpp). The
9874/// post-RA version of misched uses CreateTargetMIHazardRecognizer.
9875ScheduleHazardRecognizer *
9876SIInstrInfo::CreateTargetPostRAHazardRecognizer(const InstrItineraryData *II,
9877 const ScheduleDAG *DAG) const {
9878 return new GCNHazardRecognizer(DAG->MF);
9879}
9880
9881/// This is the hazard recognizer used at -O0 by the PostRAHazardRecognizer
9882/// pass.
9883ScheduleHazardRecognizer *
9884SIInstrInfo::CreateTargetPostRAHazardRecognizer(const MachineFunction &MF) const {
9885 return new GCNHazardRecognizer(MF);
9886}
9887
9888// Called during:
9889// - pre-RA scheduling and post-RA scheduling
9890ScheduleHazardRecognizer *
9891SIInstrInfo::CreateTargetMIHazardRecognizer(const InstrItineraryData *II,
9892 const ScheduleDAGMI *DAG) const {
9893 // Borrowed from the ARM target.
9894 // We would like to restrict this hazard recognizer to only
9895 // post-RA scheduling; we can tell that we're post-RA because we don't
9896 // track VRegLiveness.
9897 if (!DAG->hasVRegLiveness())
9898 return new GCNHazardRecognizer(DAG->MF);
9899 return TargetInstrInfo::CreateTargetMIHazardRecognizer(II, DAG);
9900}
9901
9902std::pair<unsigned, unsigned>
9903SIInstrInfo::decomposeMachineOperandsTargetFlags(unsigned TF) const {
9904 return std::pair(TF & MO_MASK, TF & ~MO_MASK);
9905}
9906
9907ArrayRef<std::pair<unsigned, const char *>>
9908SIInstrInfo::getSerializableDirectMachineOperandTargetFlags() const {
9909 static const std::pair<unsigned, const char *> TargetFlags[] = {
9910 {MO_GOTPCREL, "amdgpu-gotprel"},
9911 {MO_GOTPCREL32_LO, "amdgpu-gotprel32-lo"},
9912 {MO_GOTPCREL32_HI, "amdgpu-gotprel32-hi"},
9913 {MO_GOTPCREL64, "amdgpu-gotprel64"},
9914 {MO_REL32_LO, "amdgpu-rel32-lo"},
9915 {MO_REL32_HI, "amdgpu-rel32-hi"},
9916 {MO_REL64, "amdgpu-rel64"},
9917 {MO_ABS32_LO, "amdgpu-abs32-lo"},
9918 {MO_ABS32_HI, "amdgpu-abs32-hi"},
9919 {MO_ABS64, "amdgpu-abs64"},
9920 };
9921
9922 return ArrayRef(TargetFlags);
9923}
9924
9925ArrayRef<std::pair<MachineMemOperand::Flags, const char *>>
9926SIInstrInfo::getSerializableMachineMemOperandTargetFlags() const {
9927 static const std::pair<MachineMemOperand::Flags, const char *> TargetFlags[] =
9928 {
9929 {MONoClobber, "amdgpu-noclobber"},
9930 {MOLastUse, "amdgpu-last-use"},
9931 {MOCooperative, "amdgpu-cooperative"},
9932 };
9933
9934 return ArrayRef(TargetFlags);
9935}
9936
9937unsigned SIInstrInfo::getLiveRangeSplitOpcode(Register SrcReg,
9938 const MachineFunction &MF) const {
9939 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
9940 assert(SrcReg.isVirtual());
9941 if (MFI->checkFlag(Reg: SrcReg, Flag: AMDGPU::VirtRegFlag::WWM_REG))
9942 return AMDGPU::WWM_COPY;
9943
9944 return AMDGPU::COPY;
9945}
9946
9947bool SIInstrInfo::canAddToBBProlog(const MachineInstr &MI) const {
9948 uint16_t Opcode = MI.getOpcode();
9949 // Check if it is an SGPR spill or a wwm-register spill opcode.
9950 if (isSGPRSpill(Opcode) || isWWMRegSpillOpcode(Opcode))
9951 return true;
9952
9953 const MachineFunction *MF = MI.getMF();
9954 const MachineRegisterInfo &MRI = MF->getRegInfo();
9955 const SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
9956
9957 // See if this is a live-range split instruction inserted for an SGPR or a
9958 // wwm-register. The implicit defs inserted for wwm-registers should also be
9959 // included as they can appear at the beginning of the basic block.
9960 bool IsLRSplitInst = MI.getFlag(Flag: MachineInstr::LRSplit);
9961 if (!IsLRSplitInst && Opcode != AMDGPU::IMPLICIT_DEF)
9962 return false;
9963
9964 Register Reg = MI.getOperand(i: 0).getReg();
9965 if (RI.isSGPRClass(RC: RI.getRegClassForReg(MRI, Reg)))
9966 return IsLRSplitInst;
9967
9968 return MFI->isWWMReg(Reg);
9969}
9970
9971bool SIInstrInfo::isBasicBlockPrologue(const MachineInstr &MI,
9972 Register Reg) const {
9973 // We need to handle instructions which may be inserted during register
9974 // allocation to handle the prolog. The initial prolog instruction may have
9975 // been separated from the start of the block by spills and copies that are
9976 // needed by the prolog. However, the insertions for scalar registers can
9977 // always be placed at the BB top as they are independent of the exec mask
9978 // value.
9979 bool IsNullOrVectorRegister = true;
9980 if (Reg) {
9981 const MachineFunction *MF = MI.getMF();
9982 const MachineRegisterInfo &MRI = MF->getRegInfo();
9983 IsNullOrVectorRegister = !RI.isSGPRClass(RC: RI.getRegClassForReg(MRI, Reg));
9984 }
9985
9986 return IsNullOrVectorRegister &&
9987 (canAddToBBProlog(MI) ||
9988 (!MI.isTerminator() && MI.getOpcode() != AMDGPU::COPY &&
9989 MI.modifiesRegister(Reg: AMDGPU::EXEC, TRI: &RI)));
9990}
9991
9992MachineInstrBuilder
9993SIInstrInfo::getAddNoCarry(MachineBasicBlock &MBB,
9994 MachineBasicBlock::iterator I,
9995 const DebugLoc &DL,
9996 Register DestReg) const {
9997 if (ST.hasAddNoCarryInsts())
9998 return BuildMI(BB&: MBB, I, MIMD: DL, MCID: get(Opcode: AMDGPU::V_ADD_U32_e64), DestReg);
9999
10000 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
10001 Register UnusedCarry = MRI.createVirtualRegister(RegClass: RI.getBoolRC());
10002 MRI.setRegAllocationHint(VReg: UnusedCarry, Type: 0, PrefReg: RI.getVCC());
10003
10004 return BuildMI(BB&: MBB, I, MIMD: DL, MCID: get(Opcode: AMDGPU::V_ADD_CO_U32_e64), DestReg)
10005 .addReg(RegNo: UnusedCarry, Flags: RegState::Define | RegState::Dead);
10006}
10007
10008MachineInstrBuilder SIInstrInfo::getAddNoCarry(MachineBasicBlock &MBB,
10009 MachineBasicBlock::iterator I,
10010 const DebugLoc &DL,
10011 Register DestReg,
10012 RegScavenger &RS) const {
10013 if (ST.hasAddNoCarryInsts())
10014 return BuildMI(BB&: MBB, I, MIMD: DL, MCID: get(Opcode: AMDGPU::V_ADD_U32_e32), DestReg);
10015
10016 // If available, prefer to use vcc.
10017 Register UnusedCarry = !RS.isRegUsed(Reg: AMDGPU::VCC)
10018 ? Register(RI.getVCC())
10019 : RS.scavengeRegisterBackwards(
10020 RC: *RI.getBoolRC(), To: I, /* RestoreAfter */ false,
10021 SPAdj: 0, /* AllowSpill */ false);
10022
10023 // TODO: Users need to deal with this.
10024 if (!UnusedCarry.isValid())
10025 return MachineInstrBuilder();
10026
10027 return BuildMI(BB&: MBB, I, MIMD: DL, MCID: get(Opcode: AMDGPU::V_ADD_CO_U32_e64), DestReg)
10028 .addReg(RegNo: UnusedCarry, Flags: RegState::Define | RegState::Dead);
10029}
10030
10031bool SIInstrInfo::isKillTerminator(unsigned Opcode) {
10032 switch (Opcode) {
10033 case AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR:
10034 case AMDGPU::SI_KILL_I1_TERMINATOR:
10035 return true;
10036 default:
10037 return false;
10038 }
10039}
10040
10041const MCInstrDesc &SIInstrInfo::getKillTerminatorFromPseudo(unsigned Opcode) const {
10042 switch (Opcode) {
10043 case AMDGPU::SI_KILL_F32_COND_IMM_PSEUDO:
10044 return get(Opcode: AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR);
10045 case AMDGPU::SI_KILL_I1_PSEUDO:
10046 return get(Opcode: AMDGPU::SI_KILL_I1_TERMINATOR);
10047 default:
10048 llvm_unreachable("invalid opcode, expected SI_KILL_*_PSEUDO");
10049 }
10050}
10051
10052bool SIInstrInfo::isLegalMUBUFImmOffset(unsigned Imm) const {
10053 return Imm <= getMaxMUBUFImmOffset(ST);
10054}
10055
10056unsigned SIInstrInfo::getMaxMUBUFImmOffset(const GCNSubtarget &ST) {
10057 // The GFX12 field is a 24-bit signed byte offset, but only non-negative values are legal here.
10058 const unsigned OffsetBits =
10059 ST.getGeneration() >= AMDGPUSubtarget::GFX12 ? 23 : 12;
10060 return (1 << OffsetBits) - 1;
10061}
10062
10063void SIInstrInfo::fixImplicitOperands(MachineInstr &MI) const {
10064 if (!ST.isWave32())
10065 return;
10066
10067 if (MI.isInlineAsm())
10068 return;
10069
10070 if (MI.getNumOperands() < MI.getNumExplicitOperands())
10071 return;
10072
10073 for (auto &Op : MI.implicit_operands()) {
10074 if (Op.isReg() && Op.getReg() == AMDGPU::VCC)
10075 Op.setReg(AMDGPU::VCC_LO);
10076 }
10077}
10078
10079bool SIInstrInfo::isBufferSMRD(const MachineInstr &MI) const {
10080 if (!isSMRD(MI))
10081 return false;
10082
10083 // Check that it is using a buffer resource.
10084 int Idx = AMDGPU::getNamedOperandIdx(Opcode: MI.getOpcode(), Name: AMDGPU::OpName::sbase);
10085 if (Idx == -1) // e.g. s_memtime
10086 return false;
10087
10088 const int16_t RCID = getOpRegClassID(OpInfo: MI.getDesc().operands()[Idx]);
10089 return RI.getRegClass(i: RCID)->hasSubClassEq(RC: &AMDGPU::SGPR_128RegClass);
10090}
10091
10092// Given Imm, split it into the values to put into the SOffset and ImmOffset
10093// fields in an MUBUF instruction. Return false if it is not possible (due to a
10094// hardware bug needing a workaround).
10095//
10096// The required alignment ensures that individual address components remain
10097// aligned if they are aligned to begin with. It also ensures that additional
10098// offsets within the given alignment can be added to the resulting ImmOffset.
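// For example, assuming a 12-bit immediate field (maximum 4095) and 4-byte
// alignment, Imm = 4660 does not fit in the immediate alone; the code below
// produces SOffset = 4092 and ImmOffset = 568, which sum back to 4660 and
// are both still 4-byte aligned.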
10099bool SIInstrInfo::splitMUBUFOffset(uint32_t Imm, uint32_t &SOffset,
10100 uint32_t &ImmOffset, Align Alignment) const {
10101 const uint32_t MaxOffset = SIInstrInfo::getMaxMUBUFImmOffset(ST);
10102 const uint32_t MaxImm = alignDown(Value: MaxOffset, Align: Alignment.value());
10103 uint32_t Overflow = 0;
10104
10105 if (Imm > MaxImm) {
10106 if (Imm <= MaxImm + 64) {
10107 // Use an SOffset inline constant for 4..64
10108 Overflow = Imm - MaxImm;
10109 Imm = MaxImm;
10110 } else {
10111 // Try to keep the same value in SOffset for adjacent loads, so that
10112 // the corresponding register contents can be re-used.
10113 //
10114 // Load values with all low-bits (except for alignment bits) set into
10115 // SOffset, so that a larger range of values can be covered using
10116 // s_movk_i32.
10117 //
10118 // Atomic operations fail to work correctly when individual address
10119 // components are unaligned, even if their sum is aligned.
10120 uint32_t High = (Imm + Alignment.value()) & ~MaxOffset;
10121 uint32_t Low = (Imm + Alignment.value()) & MaxOffset;
10122 Imm = Low;
10123 Overflow = High - Alignment.value();
10124 }
10125 }
10126
10127 if (Overflow > 0) {
10128 // There is a hardware bug in SI and CI which prevents address clamping in
10129 // MUBUF instructions from working correctly with SOffsets. The immediate
10130 // offset is unaffected.
10131 if (ST.getGeneration() <= AMDGPUSubtarget::SEA_ISLANDS)
10132 return false;
10133
10134 // It is not possible to set immediate in SOffset field on some targets.
10135 if (ST.hasRestrictedSOffset())
10136 return false;
10137 }
10138
10139 ImmOffset = Imm;
10140 SOffset = Overflow;
10141 return true;
10142}
10143
10144// Depending on the used address space and instructions, some immediate offsets
10145// are allowed and some are not.
10146 // Pre-GFX12, flat instruction offsets can only be non-negative, while global
10147 // and scratch instruction offsets can also be negative. On GFX12, offsets can
10148 // be negative for all variants.
10149//
10150// There are several bugs related to these offsets:
10151// On gfx10.1, flat instructions that go into the global address space cannot
10152// use an offset.
10153//
10154// For scratch instructions, the address can be either an SGPR or a VGPR.
10155// The following offsets can be used, depending on the architecture (x means
10156// cannot be used):
10157// +----------------------------+------+------+
10158// | Address-Mode | SGPR | VGPR |
10159// +----------------------------+------+------+
10160// | gfx9 | | |
10161// | negative, 4-aligned offset | x | ok |
10162// | negative, unaligned offset | x | ok |
10163// +----------------------------+------+------+
10164// | gfx10 | | |
10165// | negative, 4-aligned offset | ok | ok |
10166// | negative, unaligned offset | ok | x |
10167// +----------------------------+------+------+
10168// | gfx10.3 | | |
10169// | negative, 4-aligned offset | ok | ok |
10170// | negative, unaligned offset | ok | ok |
10171// +----------------------------+------+------+
10172//
10173// This function ignores the addressing mode, so if an offset cannot be used in
10174// one addressing mode, it is considered illegal.
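// For instance, if the target provides 13 offset bits and the variant
// allows negative offsets, any offset in [-4096, 4095] is accepted;
// otherwise only [0, 4095] (subject to the restrictions noted above).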
10175bool SIInstrInfo::isLegalFLATOffset(int64_t Offset, unsigned AddrSpace,
10176 uint64_t FlatVariant) const {
10177 // TODO: Should 0 be special cased?
10178 if (!ST.hasFlatInstOffsets())
10179 return false;
10180
10181 if (ST.hasFlatSegmentOffsetBug() && FlatVariant == SIInstrFlags::FLAT &&
10182 (AddrSpace == AMDGPUAS::FLAT_ADDRESS ||
10183 AddrSpace == AMDGPUAS::GLOBAL_ADDRESS))
10184 return false;
10185
10186 if (ST.hasNegativeUnalignedScratchOffsetBug() &&
10187 FlatVariant == SIInstrFlags::FlatScratch && Offset < 0 &&
10188 (Offset % 4) != 0) {
10189 return false;
10190 }
10191
10192 bool AllowNegative = allowNegativeFlatOffset(FlatVariant);
10193 unsigned N = AMDGPU::getNumFlatOffsetBits(ST);
10194 return isIntN(N, x: Offset) && (AllowNegative || Offset >= 0);
10195}
10196
10197// See comment on SIInstrInfo::isLegalFLATOffset for what is legal and what not.
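// For example, assuming 13 offset bits (so NumBits below is 12) and a
// variant that allows negative offsets, COffsetVal = -5000 splits into
// ImmField = -904 and RemainderOffset = -4096, which sum back to -5000
// with ImmField representable in the instruction's offset field.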
10198std::pair<int64_t, int64_t>
10199SIInstrInfo::splitFlatOffset(int64_t COffsetVal, unsigned AddrSpace,
10200 uint64_t FlatVariant) const {
10201 int64_t RemainderOffset = COffsetVal;
10202 int64_t ImmField = 0;
10203
10204 bool AllowNegative = allowNegativeFlatOffset(FlatVariant);
10205 const unsigned NumBits = AMDGPU::getNumFlatOffsetBits(ST) - 1;
10206
10207 if (AllowNegative) {
10208 // Use signed division by a power of two to truncate towards 0.
10209 int64_t D = 1LL << NumBits;
10210 RemainderOffset = (COffsetVal / D) * D;
10211 ImmField = COffsetVal - RemainderOffset;
10212
10213 if (ST.hasNegativeUnalignedScratchOffsetBug() &&
10214 FlatVariant == SIInstrFlags::FlatScratch && ImmField < 0 &&
10215 (ImmField % 4) != 0) {
10216 // Make ImmField a multiple of 4
10217 RemainderOffset += ImmField % 4;
10218 ImmField -= ImmField % 4;
10219 }
10220 } else if (COffsetVal >= 0) {
10221 ImmField = COffsetVal & maskTrailingOnes<uint64_t>(N: NumBits);
10222 RemainderOffset = COffsetVal - ImmField;
10223 }
10224
10225 assert(isLegalFLATOffset(ImmField, AddrSpace, FlatVariant));
10226 assert(RemainderOffset + ImmField == COffsetVal);
10227 return {ImmField, RemainderOffset};
10228}
10229
10230bool SIInstrInfo::allowNegativeFlatOffset(uint64_t FlatVariant) const {
10231 if (ST.hasNegativeScratchOffsetBug() &&
10232 FlatVariant == SIInstrFlags::FlatScratch)
10233 return false;
10234
10235 return FlatVariant != SIInstrFlags::FLAT || AMDGPU::isGFX12Plus(STI: ST);
10236}
10237
10238static unsigned subtargetEncodingFamily(const GCNSubtarget &ST) {
10239 switch (ST.getGeneration()) {
10240 default:
10241 break;
10242 case AMDGPUSubtarget::SOUTHERN_ISLANDS:
10243 case AMDGPUSubtarget::SEA_ISLANDS:
10244 return SIEncodingFamily::SI;
10245 case AMDGPUSubtarget::VOLCANIC_ISLANDS:
10246 case AMDGPUSubtarget::GFX9:
10247 return SIEncodingFamily::VI;
10248 case AMDGPUSubtarget::GFX10:
10249 return SIEncodingFamily::GFX10;
10250 case AMDGPUSubtarget::GFX11:
10251 return SIEncodingFamily::GFX11;
10252 case AMDGPUSubtarget::GFX12:
10253 return ST.hasGFX1250Insts() ? SIEncodingFamily::GFX1250
10254 : SIEncodingFamily::GFX12;
10255 case AMDGPUSubtarget::GFX13:
10256 return SIEncodingFamily::GFX13;
10257 }
10258 llvm_unreachable("Unknown subtarget generation!");
10259}
10260
10261bool SIInstrInfo::isAsmOnlyOpcode(int MCOp) const {
10262 switch (MCOp) {
10263 // These opcodes use indirect register addressing, so
10264 // they need special handling by codegen (currently missing).
10265 // Therefore it is too risky to allow these opcodes
10266 // to be selected by the DPP combiner or the SDWA peephole pass.
10267 case AMDGPU::V_MOVRELS_B32_dpp_gfx10:
10268 case AMDGPU::V_MOVRELS_B32_sdwa_gfx10:
10269 case AMDGPU::V_MOVRELD_B32_dpp_gfx10:
10270 case AMDGPU::V_MOVRELD_B32_sdwa_gfx10:
10271 case AMDGPU::V_MOVRELSD_B32_dpp_gfx10:
10272 case AMDGPU::V_MOVRELSD_B32_sdwa_gfx10:
10273 case AMDGPU::V_MOVRELSD_2_B32_dpp_gfx10:
10274 case AMDGPU::V_MOVRELSD_2_B32_sdwa_gfx10:
10275 return true;
10276 default:
10277 return false;
10278 }
10279}
10280
10281#define GENERATE_RENAMED_GFX9_CASES(OPCODE) \
10282 case OPCODE##_dpp: \
10283 case OPCODE##_e32: \
10284 case OPCODE##_e64: \
10285 case OPCODE##_e64_dpp: \
10286 case OPCODE##_sdwa:
10287
10288static bool isRenamedInGFX9(int Opcode) {
10289 switch (Opcode) {
10290 GENERATE_RENAMED_GFX9_CASES(AMDGPU::V_ADDC_U32)
10291 GENERATE_RENAMED_GFX9_CASES(AMDGPU::V_ADD_CO_U32)
10292 GENERATE_RENAMED_GFX9_CASES(AMDGPU::V_ADD_U32)
10293 GENERATE_RENAMED_GFX9_CASES(AMDGPU::V_SUBBREV_U32)
10294 GENERATE_RENAMED_GFX9_CASES(AMDGPU::V_SUBB_U32)
10295 GENERATE_RENAMED_GFX9_CASES(AMDGPU::V_SUBREV_CO_U32)
10296 GENERATE_RENAMED_GFX9_CASES(AMDGPU::V_SUBREV_U32)
10297 GENERATE_RENAMED_GFX9_CASES(AMDGPU::V_SUB_CO_U32)
10298 GENERATE_RENAMED_GFX9_CASES(AMDGPU::V_SUB_U32)
10299 //
10300 case AMDGPU::V_DIV_FIXUP_F16_gfx9_e64:
10301 case AMDGPU::V_DIV_FIXUP_F16_gfx9_fake16_e64:
10302 case AMDGPU::V_FMA_F16_gfx9_e64:
10303 case AMDGPU::V_FMA_F16_gfx9_fake16_e64:
10304 case AMDGPU::V_INTERP_P2_F16:
10305 case AMDGPU::V_MAD_F16_e64:
10306 case AMDGPU::V_MAD_U16_e64:
10307 case AMDGPU::V_MAD_I16_e64:
10308 return true;
10309 default:
10310 return false;
10311 }
10312}
10313
10314int SIInstrInfo::pseudoToMCOpcode(int Opcode) const {
10315 assert(Opcode == (int)SIInstrInfo::getNonSoftWaitcntOpcode(Opcode) &&
10316 "SIInsertWaitcnts should have promoted soft waitcnt instructions!");
10317
10318 unsigned Gen = subtargetEncodingFamily(ST);
10319
10320 if (ST.getGeneration() == AMDGPUSubtarget::GFX9 && isRenamedInGFX9(Opcode))
10321 Gen = SIEncodingFamily::GFX9;
10322
10323 // Adjust the encoding family to GFX80 for D16 buffer instructions when the
10324 // subtarget has the UnpackedD16VMem feature.
10325 // TODO: remove this when we discard the GFX80 encoding.
10326 if (ST.hasUnpackedD16VMem() && (get(Opcode).TSFlags & SIInstrFlags::D16Buf))
10327 Gen = SIEncodingFamily::GFX80;
10328
10329 if (get(Opcode).TSFlags & SIInstrFlags::SDWA) {
10330 switch (ST.getGeneration()) {
10331 default:
10332 Gen = SIEncodingFamily::SDWA;
10333 break;
10334 case AMDGPUSubtarget::GFX9:
10335 Gen = SIEncodingFamily::SDWA9;
10336 break;
10337 case AMDGPUSubtarget::GFX10:
10338 Gen = SIEncodingFamily::SDWA10;
10339 break;
10340 }
10341 }
10342
10343 if (isMAI(Opcode)) {
10344 int MFMAOp = AMDGPU::getMFMAEarlyClobberOp(Opcode);
10345 if (MFMAOp != -1)
10346 Opcode = MFMAOp;
10347 }
10348
10349 int MCOp = AMDGPU::getMCOpcode(Opcode, Gen);
10350
10351 if (MCOp == (uint16_t)-1 && ST.hasGFX1250Insts())
10352 MCOp = AMDGPU::getMCOpcode(Opcode, Gen: SIEncodingFamily::GFX12);
10353
10354 // -1 means that Opcode is already a native instruction.
10355 if (MCOp == -1)
10356 return Opcode;
10357
10358 if (ST.hasGFX90AInsts()) {
10359 uint16_t NMCOp = (uint16_t)-1;
10360 if (ST.hasGFX940Insts())
10361 NMCOp = AMDGPU::getMCOpcode(Opcode, Gen: SIEncodingFamily::GFX940);
10362 if (NMCOp == (uint16_t)-1)
10363 NMCOp = AMDGPU::getMCOpcode(Opcode, Gen: SIEncodingFamily::GFX90A);
10364 if (NMCOp == (uint16_t)-1)
10365 NMCOp = AMDGPU::getMCOpcode(Opcode, Gen: SIEncodingFamily::GFX9);
10366 if (NMCOp != (uint16_t)-1)
10367 MCOp = NMCOp;
10368 }
10369
10370 // (uint16_t)-1 means that Opcode is a pseudo instruction that has
10371 // no encoding in the given subtarget generation.
10372 if (MCOp == (uint16_t)-1)
10373 return -1;
10374
10375 if (isAsmOnlyOpcode(MCOp))
10376 return -1;
10377
10378 return MCOp;
10379}
10380
10381static
10382TargetInstrInfo::RegSubRegPair getRegOrUndef(const MachineOperand &RegOpnd) {
10383 assert(RegOpnd.isReg());
10384 return RegOpnd.isUndef() ? TargetInstrInfo::RegSubRegPair() :
10385 getRegSubRegPair(O: RegOpnd);
10386}
10387
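// For illustration: given %d = REG_SEQUENCE %a, %subreg.sub0, %b, %subreg.sub1
// (hypothetical vregs), a query for sub1 yields %b, or an empty pair if that
// source operand is undef.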
10388TargetInstrInfo::RegSubRegPair
10389llvm::getRegSequenceSubReg(MachineInstr &MI, unsigned SubReg) {
10390 assert(MI.isRegSequence());
10391 for (unsigned I = 0, E = (MI.getNumOperands() - 1)/ 2; I < E; ++I)
10392 if (MI.getOperand(i: 1 + 2 * I + 1).getImm() == SubReg) {
10393 auto &RegOp = MI.getOperand(i: 1 + 2 * I);
10394 return getRegOrUndef(RegOpnd: RegOp);
10395 }
10396 return TargetInstrInfo::RegSubRegPair();
10397}
10398
10399 // Try to find the definition of reg:subreg in subreg-manipulation pseudos.
10400 // Following a subreg of reg:subreg isn't supported.
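// For illustration: given %d = INSERT_SUBREG %a, %b, %subreg.sub1 (hypothetical
// vregs), a query for sub1 yields %b; any other subreg continues the walk
// through %a, provided %a itself carries no subregister index.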
10401static bool followSubRegDef(MachineInstr &MI,
10402 TargetInstrInfo::RegSubRegPair &RSR) {
10403 if (!RSR.SubReg)
10404 return false;
10405 switch (MI.getOpcode()) {
10406 default: break;
10407 case AMDGPU::REG_SEQUENCE:
10408 RSR = getRegSequenceSubReg(MI, SubReg: RSR.SubReg);
10409 return true;
10410 // EXTRACT_SUBREG isn't supported, as this would follow a subreg of a subreg
10411 case AMDGPU::INSERT_SUBREG:
10412 if (RSR.SubReg == (unsigned)MI.getOperand(i: 3).getImm())
10413 // inserted the subreg we're looking for
10414 RSR = getRegOrUndef(RegOpnd: MI.getOperand(i: 2));
10415 else { // the subreg in the rest of the reg
10416 auto R1 = getRegOrUndef(RegOpnd: MI.getOperand(i: 1));
10417 if (R1.SubReg) // subreg of subreg isn't supported
10418 return false;
10419 RSR.Reg = R1.Reg;
10420 }
10421 return true;
10422 }
10423 return false;
10424}
10425
10426MachineInstr *llvm::getVRegSubRegDef(const TargetInstrInfo::RegSubRegPair &P,
10427 const MachineRegisterInfo &MRI) {
10428 assert(MRI.isSSA());
10429 if (!P.Reg.isVirtual())
10430 return nullptr;
10431
10432 auto RSR = P;
10433 auto *DefInst = MRI.getVRegDef(Reg: RSR.Reg);
10434 while (auto *MI = DefInst) {
10435 DefInst = nullptr;
10436 switch (MI->getOpcode()) {
10437 case AMDGPU::COPY:
10438 case AMDGPU::V_MOV_B32_e32: {
10439 auto &Op1 = MI->getOperand(i: 1);
10440 if (Op1.isReg() && Op1.getReg().isVirtual()) {
10441 if (Op1.isUndef())
10442 return nullptr;
10443 RSR = getRegSubRegPair(O: Op1);
10444 DefInst = MRI.getVRegDef(Reg: RSR.Reg);
10445 }
10446 break;
10447 }
10448 default:
10449 if (followSubRegDef(MI&: *MI, RSR)) {
10450 if (!RSR.Reg)
10451 return nullptr;
10452 DefInst = MRI.getVRegDef(Reg: RSR.Reg);
10453 }
10454 }
10455 if (!DefInst)
10456 return MI;
10457 }
10458 return nullptr;
10459}
10460
10461bool llvm::execMayBeModifiedBeforeUse(const MachineRegisterInfo &MRI,
10462 Register VReg,
10463 const MachineInstr &DefMI,
10464 const MachineInstr &UseMI) {
10465 assert(MRI.isSSA() && "Must be run on SSA");
10466
10467 auto *TRI = MRI.getTargetRegisterInfo();
10468 auto *DefBB = DefMI.getParent();
10469
10470 // Don't bother searching between blocks, although it is possible this block
10471 // doesn't modify exec.
10472 if (UseMI.getParent() != DefBB)
10473 return true;
10474
10475 const int MaxInstScan = 20;
10476 int NumInst = 0;
10477
10478 // Stop scan at the use.
10479 auto E = UseMI.getIterator();
10480 for (auto I = std::next(x: DefMI.getIterator()); I != E; ++I) {
10481 if (I->isDebugInstr())
10482 continue;
10483
10484 if (++NumInst > MaxInstScan)
10485 return true;
10486
10487 if (I->modifiesRegister(Reg: AMDGPU::EXEC, TRI))
10488 return true;
10489 }
10490
10491 return false;
10492}
10493
10494bool llvm::execMayBeModifiedBeforeAnyUse(const MachineRegisterInfo &MRI,
10495 Register VReg,
10496 const MachineInstr &DefMI) {
10497 assert(MRI.isSSA() && "Must be run on SSA");
10498
10499 auto *TRI = MRI.getTargetRegisterInfo();
10500 auto *DefBB = DefMI.getParent();
10501
10502 const int MaxUseScan = 10;
10503 int NumUse = 0;
10504
10505 for (auto &Use : MRI.use_nodbg_operands(Reg: VReg)) {
10506 auto &UseInst = *Use.getParent();
10507 // Don't bother searching between blocks, although it is possible this block
10508 // doesn't modify exec.
10509 if (UseInst.getParent() != DefBB || UseInst.isPHI())
10510 return true;
10511
10512 if (++NumUse > MaxUseScan)
10513 return true;
10514 }
10515
10516 if (NumUse == 0)
10517 return false;
10518
10519 const int MaxInstScan = 20;
10520 int NumInst = 0;
10521
10522 // Stop scan when we have seen all the uses.
10523 for (auto I = std::next(x: DefMI.getIterator()); ; ++I) {
10524 assert(I != DefBB->end());
10525
10526 if (I->isDebugInstr())
10527 continue;
10528
10529 if (++NumInst > MaxInstScan)
10530 return true;
10531
10532 for (const MachineOperand &Op : I->operands()) {
10533 // We don't check reg masks here, as they're used only on calls:
10534 // 1. EXEC is only considered const within one BB.
10535 // 2. A call should be a terminator instruction if present in a BB.
10536
10537 if (!Op.isReg())
10538 continue;
10539
10540 Register Reg = Op.getReg();
10541 if (Op.isUse()) {
10542 if (Reg == VReg && --NumUse == 0)
10543 return false;
10544 } else if (TRI->regsOverlap(RegA: Reg, RegB: AMDGPU::EXEC))
10545 return true;
10546 }
10547 }
10548}
10549
10550MachineInstr *SIInstrInfo::createPHIDestinationCopy(
10551 MachineBasicBlock &MBB, MachineBasicBlock::iterator LastPHIIt,
10552 const DebugLoc &DL, Register Src, Register Dst) const {
10553 auto Cur = MBB.begin();
10554 if (Cur != MBB.end())
10555 do {
10556 if (!Cur->isPHI() && Cur->readsRegister(Reg: Dst, /*TRI=*/nullptr))
10557 return BuildMI(BB&: MBB, I: Cur, MIMD: DL, MCID: get(Opcode: TargetOpcode::COPY), DestReg: Dst).addReg(RegNo: Src);
10558 ++Cur;
10559 } while (Cur != MBB.end() && Cur != LastPHIIt);
10560
10561 return TargetInstrInfo::createPHIDestinationCopy(MBB, InsPt: LastPHIIt, DL, Src,
10562 Dst);
10563}
10564
10565MachineInstr *SIInstrInfo::createPHISourceCopy(
10566 MachineBasicBlock &MBB, MachineBasicBlock::iterator InsPt,
10567 const DebugLoc &DL, Register Src, unsigned SrcSubReg, Register Dst) const {
10568 if (InsPt != MBB.end() &&
10569 (InsPt->getOpcode() == AMDGPU::SI_IF ||
10570 InsPt->getOpcode() == AMDGPU::SI_ELSE ||
10571 InsPt->getOpcode() == AMDGPU::SI_IF_BREAK) &&
10572 InsPt->definesRegister(Reg: Src, /*TRI=*/nullptr)) {
10573 InsPt++;
10574 return BuildMI(BB&: MBB, I: InsPt, MIMD: DL,
10575 MCID: get(Opcode: AMDGPU::LaneMaskConstants::get(ST).MovTermOpc), DestReg: Dst)
10576 .addReg(RegNo: Src, Flags: {}, SubReg: SrcSubReg)
10577 .addReg(RegNo: AMDGPU::EXEC, Flags: RegState::Implicit);
10578 }
10579 return TargetInstrInfo::createPHISourceCopy(MBB, InsPt, DL, Src, SrcSubReg,
10580 Dst);
10581}
10582
10583bool llvm::SIInstrInfo::isWave32() const { return ST.isWave32(); }
10584
10585MachineInstr *SIInstrInfo::foldMemoryOperandImpl(
10586 MachineFunction &MF, MachineInstr &MI, ArrayRef<unsigned> Ops,
10587 MachineBasicBlock::iterator InsertPt, int FrameIndex, LiveIntervals *LIS,
10588 VirtRegMap *VRM) const {
10589 // This is a bit of a hack (copied from AArch64). Consider this instruction:
10590 //
10591 // %0:sreg_32 = COPY $m0
10592 //
10593 // We explicitly chose SReg_32 for the virtual register so such a copy might
10594 // be eliminated by RegisterCoalescer. However, that may not be possible, and
10595 // %0 may even spill. We can't spill $m0 normally (it would require copying to
10596 // a numbered SGPR anyway), and since it is in the SReg_32 register class,
10597 // TargetInstrInfo::foldMemoryOperand() is going to try.
10598 // A similar issue also exists with spilling and reloading $exec registers.
10599 //
10600 // To prevent that, constrain the %0 register class here.
10601 if (isFullCopyInstr(MI)) {
10602 Register DstReg = MI.getOperand(i: 0).getReg();
10603 Register SrcReg = MI.getOperand(i: 1).getReg();
10604 if ((DstReg.isVirtual() || SrcReg.isVirtual()) &&
10605 (DstReg.isVirtual() != SrcReg.isVirtual())) {
10606 MachineRegisterInfo &MRI = MF.getRegInfo();
10607 Register VirtReg = DstReg.isVirtual() ? DstReg : SrcReg;
10608 const TargetRegisterClass *RC = MRI.getRegClass(Reg: VirtReg);
10609 if (RC->hasSuperClassEq(RC: &AMDGPU::SReg_32RegClass)) {
10610 MRI.constrainRegClass(Reg: VirtReg, RC: &AMDGPU::SReg_32_XM0_XEXECRegClass);
10611 return nullptr;
10612 }
10613 if (RC->hasSuperClassEq(RC: &AMDGPU::SReg_64RegClass)) {
10614 MRI.constrainRegClass(Reg: VirtReg, RC: &AMDGPU::SReg_64_XEXECRegClass);
10615 return nullptr;
10616 }
10617 }
10618 }
10619
10620 return nullptr;
10621}
10622
10623unsigned SIInstrInfo::getInstrLatency(const InstrItineraryData *ItinData,
10624 const MachineInstr &MI,
10625 unsigned *PredCost) const {
10626 if (MI.isBundle()) {
10627 MachineBasicBlock::const_instr_iterator I(MI.getIterator());
10628 MachineBasicBlock::const_instr_iterator E(MI.getParent()->instr_end());
10629 unsigned Lat = 0, Count = 0;
10630 for (++I; I != E && I->isBundledWithPred(); ++I) {
10631 ++Count;
10632 Lat = std::max(a: Lat, b: SchedModel.computeInstrLatency(MI: &*I));
10633 }
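    // For illustration: a bundle of three instructions whose maximum latency
    // is 4 yields 4 + 3 - 1 = 6.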
10634 return Lat + Count - 1;
10635 }
10636
10637 return SchedModel.computeInstrLatency(MI: &MI);
10638}
10639
10640const MachineOperand &
10641SIInstrInfo::getCalleeOperand(const MachineInstr &MI) const {
10642 if (const MachineOperand *CallAddrOp =
10643 getNamedOperand(MI, OperandName: AMDGPU::OpName::src0))
10644 return *CallAddrOp;
10645 return TargetInstrInfo::getCalleeOperand(MI);
10646}
10647
10648InstructionUniformity
10649SIInstrInfo::getGenericInstructionUniformity(const MachineInstr &MI) const {
10650 const MachineRegisterInfo &MRI = MI.getMF()->getRegInfo();
10651 unsigned Opcode = MI.getOpcode();
10652
10653 auto HandleAddrSpaceCast = [this, &MRI](const MachineInstr &MI) {
10654 Register Dst = MI.getOperand(i: 0).getReg();
10655 Register Src = isa<GIntrinsic>(Val: MI) ? MI.getOperand(i: 2).getReg()
10656 : MI.getOperand(i: 1).getReg();
10657 LLT DstTy = MRI.getType(Reg: Dst);
10658 LLT SrcTy = MRI.getType(Reg: Src);
10659 unsigned DstAS = DstTy.getAddressSpace();
10660 unsigned SrcAS = SrcTy.getAddressSpace();
10661 return SrcAS == AMDGPUAS::PRIVATE_ADDRESS &&
10662 DstAS == AMDGPUAS::FLAT_ADDRESS &&
10663 ST.hasGloballyAddressableScratch()
10664 ? InstructionUniformity::NeverUniform
10665 : InstructionUniformity::Default;
10666 };
10667
10668 // If the target supports globally addressable scratch, the mapping from
10669 // scratch memory to the flat aperture changes; therefore, an address space
10670 // cast is no longer uniform.
10671 if (Opcode == TargetOpcode::G_ADDRSPACE_CAST)
10672 return HandleAddrSpaceCast(MI);
10673
10674 if (auto *GI = dyn_cast<GIntrinsic>(Val: &MI)) {
10675 auto IID = GI->getIntrinsicID();
10676 if (AMDGPU::isIntrinsicSourceOfDivergence(IntrID: IID))
10677 return InstructionUniformity::NeverUniform;
10678 if (AMDGPU::isIntrinsicAlwaysUniform(IntrID: IID))
10679 return InstructionUniformity::AlwaysUniform;
10680
10681 switch (IID) {
10682 case Intrinsic::amdgcn_addrspacecast_nonnull:
10683 return HandleAddrSpaceCast(MI);
10684 case Intrinsic::amdgcn_if:
10685 case Intrinsic::amdgcn_else:
10686 // FIXME: Uniform if second result
10687 break;
10688 }
10689
10690 return InstructionUniformity::Default;
10691 }
10692
10693 // Loads from the private and flat address spaces are divergent, because
10694 // threads can execute the load instruction with the same inputs and get
10695 // different results.
10696 //
10697 // All other loads are not divergent, because if threads issue loads with the
10698 // same arguments, they will always get the same result.
10699 if (Opcode == AMDGPU::G_LOAD || Opcode == AMDGPU::G_ZEXTLOAD ||
10700 Opcode == AMDGPU::G_SEXTLOAD) {
10701 if (MI.memoperands_empty())
10702 return InstructionUniformity::NeverUniform; // conservative assumption
10703
10704 if (llvm::any_of(Range: MI.memoperands(), P: [](const MachineMemOperand *mmo) {
10705 return mmo->getAddrSpace() == AMDGPUAS::PRIVATE_ADDRESS ||
10706 mmo->getAddrSpace() == AMDGPUAS::FLAT_ADDRESS;
10707 })) {
10708 // At least one MMO in a non-global address space.
10709 return InstructionUniformity::NeverUniform;
10710 }
10711 return InstructionUniformity::Default;
10712 }
10713
10714 if (SIInstrInfo::isGenericAtomicRMWOpcode(Opc: Opcode) ||
10715 Opcode == AMDGPU::G_ATOMIC_CMPXCHG ||
10716 Opcode == AMDGPU::G_ATOMIC_CMPXCHG_WITH_SUCCESS ||
10717 AMDGPU::isGenericAtomic(Opc: Opcode)) {
10718 return InstructionUniformity::NeverUniform;
10719 }
10720 return InstructionUniformity::Default;
10721}
10722
10723const MIRFormatter *SIInstrInfo::getMIRFormatter() const {
10724 if (!Formatter)
10725 Formatter = std::make_unique<AMDGPUMIRFormatter>(args: ST);
10726 return Formatter.get();
10727}
10728
10729InstructionUniformity
10730SIInstrInfo::getInstructionUniformity(const MachineInstr &MI) const {
10731
10732 if (isNeverUniform(MI))
10733 return InstructionUniformity::NeverUniform;
10734
10735 unsigned opcode = MI.getOpcode();
10736 if (opcode == AMDGPU::V_READLANE_B32 ||
10737 opcode == AMDGPU::V_READFIRSTLANE_B32 ||
10738 opcode == AMDGPU::SI_RESTORE_S32_FROM_VGPR)
10739 return InstructionUniformity::AlwaysUniform;
10740
10741 if (isCopyInstr(MI)) {
10742 const MachineOperand &srcOp = MI.getOperand(i: 1);
10743 if (srcOp.isReg() && srcOp.getReg().isPhysical()) {
10744 const TargetRegisterClass *regClass =
10745 RI.getPhysRegBaseClass(Reg: srcOp.getReg());
10746 return RI.isSGPRClass(RC: regClass) ? InstructionUniformity::AlwaysUniform
10747 : InstructionUniformity::NeverUniform;
10748 }
10749 return InstructionUniformity::Default;
10750 }
10751
10752 // GMIR handling
10753 if (MI.isPreISelOpcode())
10754 return SIInstrInfo::getGenericInstructionUniformity(MI);
10755
10756 // Atomics are divergent because they are executed sequentially: when an
10757 // atomic operation refers to the same address in each thread, each thread
10758 // after the first sees the value written by the previous thread as the
10759 // original value.
10760
10761 if (isAtomic(MI))
10762 return InstructionUniformity::NeverUniform;
10763
10764 // Loads from the private and flat address spaces are divergent, because
10765 // threads can execute the load instruction with the same inputs and get
10766 // different results.
10767 if (isFLAT(MI) && MI.mayLoad()) {
10768 if (MI.memoperands_empty())
10769 return InstructionUniformity::NeverUniform; // conservative assumption
10770
10771 if (llvm::any_of(Range: MI.memoperands(), P: [](const MachineMemOperand *mmo) {
10772 return mmo->getAddrSpace() == AMDGPUAS::PRIVATE_ADDRESS ||
10773 mmo->getAddrSpace() == AMDGPUAS::FLAT_ADDRESS;
10774 })) {
10775 // At least one MMO in a non-global address space.
10776 return InstructionUniformity::NeverUniform;
10777 }
10778
10779 return InstructionUniformity::Default;
10780 }
10781
10782 const MachineRegisterInfo &MRI = MI.getMF()->getRegInfo();
10783 const AMDGPURegisterBankInfo *RBI = ST.getRegBankInfo();
10784
10785 // FIXME: It's conceptually broken to report this for an instruction, and not
10786 // a specific def operand. For inline asm in particular, there could be mixed
10787 // uniform and divergent results.
10788 for (unsigned I = 0, E = MI.getNumOperands(); I != E; ++I) {
10789 const MachineOperand &SrcOp = MI.getOperand(i: I);
10790 if (!SrcOp.isReg())
10791 continue;
10792
10793 Register Reg = SrcOp.getReg();
10794 if (!Reg || !SrcOp.readsReg())
10795 continue;
10796
10797 // If RegBank is null, this is unassigned or an unallocatable special
10798 // register, which are all scalars.
10799 const RegisterBank *RegBank = RBI->getRegBank(Reg, MRI, TRI: RI);
10800 if (RegBank && RegBank->getID() != AMDGPU::SGPRRegBankID)
10801 return InstructionUniformity::NeverUniform;
10802 }
10803
10804 // TODO: The uniformity check conditions above can be rearranged for more
10805 // readability.
10806
10807 // TODO: amdgcn.{ballot, [if]cmp} should be AlwaysUniform, but they are
10808 // currently turned into no-op COPYs by SelectionDAG ISel and are
10809 // therefore no longer recognizable.
10810
10811 return InstructionUniformity::Default;
10812}
10813
10814unsigned SIInstrInfo::getDSShaderTypeValue(const MachineFunction &MF) {
10815 switch (MF.getFunction().getCallingConv()) {
10816 case CallingConv::AMDGPU_PS:
10817 return 1;
10818 case CallingConv::AMDGPU_VS:
10819 return 2;
10820 case CallingConv::AMDGPU_GS:
10821 return 3;
10822 case CallingConv::AMDGPU_HS:
10823 case CallingConv::AMDGPU_LS:
10824 case CallingConv::AMDGPU_ES: {
10825 const Function &F = MF.getFunction();
10826 F.getContext().diagnose(DI: DiagnosticInfoUnsupported(
10827 F, "ds_ordered_count unsupported for this calling conv"));
10828 [[fallthrough]];
10829 }
10830 case CallingConv::AMDGPU_CS:
10831 case CallingConv::AMDGPU_KERNEL:
10832 case CallingConv::C:
10833 case CallingConv::Fast:
10834 default:
10835 // Assume other calling conventions are various compute callable functions
10836 return 0;
10837 }
10838}
10839
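// For illustration: S_CMP_EQ_U32 %a, 7 yields SrcReg = %a, SrcReg2 = <none>,
// CmpMask = ~0, CmpValue = 7, while S_CMP_LG_U32 %a, %b yields SrcReg = %a,
// SrcReg2 = %b, CmpValue = 0 (%a and %b are hypothetical vregs).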
10840bool SIInstrInfo::analyzeCompare(const MachineInstr &MI, Register &SrcReg,
10841 Register &SrcReg2, int64_t &CmpMask,
10842 int64_t &CmpValue) const {
10843 if (!MI.getOperand(i: 0).isReg() || MI.getOperand(i: 0).getSubReg())
10844 return false;
10845
10846 switch (MI.getOpcode()) {
10847 default:
10848 break;
10849 case AMDGPU::S_CMP_EQ_U32:
10850 case AMDGPU::S_CMP_EQ_I32:
10851 case AMDGPU::S_CMP_LG_U32:
10852 case AMDGPU::S_CMP_LG_I32:
10853 case AMDGPU::S_CMP_LT_U32:
10854 case AMDGPU::S_CMP_LT_I32:
10855 case AMDGPU::S_CMP_GT_U32:
10856 case AMDGPU::S_CMP_GT_I32:
10857 case AMDGPU::S_CMP_LE_U32:
10858 case AMDGPU::S_CMP_LE_I32:
10859 case AMDGPU::S_CMP_GE_U32:
10860 case AMDGPU::S_CMP_GE_I32:
10861 case AMDGPU::S_CMP_EQ_U64:
10862 case AMDGPU::S_CMP_LG_U64:
10863 SrcReg = MI.getOperand(i: 0).getReg();
10864 if (MI.getOperand(i: 1).isReg()) {
10865 if (MI.getOperand(i: 1).getSubReg())
10866 return false;
10867 SrcReg2 = MI.getOperand(i: 1).getReg();
10868 CmpValue = 0;
10869 } else if (MI.getOperand(i: 1).isImm()) {
10870 SrcReg2 = Register();
10871 CmpValue = MI.getOperand(i: 1).getImm();
10872 } else {
10873 return false;
10874 }
10875 CmpMask = ~0;
10876 return true;
10877 case AMDGPU::S_CMPK_EQ_U32:
10878 case AMDGPU::S_CMPK_EQ_I32:
10879 case AMDGPU::S_CMPK_LG_U32:
10880 case AMDGPU::S_CMPK_LG_I32:
10881 case AMDGPU::S_CMPK_LT_U32:
10882 case AMDGPU::S_CMPK_LT_I32:
10883 case AMDGPU::S_CMPK_GT_U32:
10884 case AMDGPU::S_CMPK_GT_I32:
10885 case AMDGPU::S_CMPK_LE_U32:
10886 case AMDGPU::S_CMPK_LE_I32:
10887 case AMDGPU::S_CMPK_GE_U32:
10888 case AMDGPU::S_CMPK_GE_I32:
10889 SrcReg = MI.getOperand(i: 0).getReg();
10890 SrcReg2 = Register();
10891 CmpValue = MI.getOperand(i: 1).getImm();
10892 CmpMask = ~0;
10893 return true;
10894 }
10895
10896 return false;
10897}
10898
10899static bool isSCCDeadOnExit(MachineBasicBlock *MBB) {
10900 for (MachineBasicBlock *S : MBB->successors()) {
10901 if (S->isLiveIn(Reg: AMDGPU::SCC))
10902 return false;
10903 }
10904 return true;
10905}
10906
10907 // Invert all uses of SCC following SCCDef, because SCCDef may be deleted and
10908 // (incoming SCC) = !(SCC defined by SCCDef).
10909 // Return true if all uses can be rewritten, false otherwise.
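// For illustration: after inversion, an S_CBRANCH_SCC1 becomes S_CBRANCH_SCC0
// (and vice versa), and the two data operands of an S_CSELECT_B32/B64 are
// swapped.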
10910bool SIInstrInfo::invertSCCUse(MachineInstr *SCCDef) const {
10911 MachineBasicBlock *MBB = SCCDef->getParent();
10912 SmallVector<MachineInstr *> InvertInstr;
10913 bool SCCIsDead = false;
10914
10915 // Scan instructions for SCC uses that need to be inverted until SCC is dead.
10916 constexpr unsigned ScanLimit = 12;
10917 unsigned Count = 0;
10918 for (MachineInstr &MI :
10919 make_range(x: std::next(x: MachineBasicBlock::iterator(SCCDef)), y: MBB->end())) {
10920 if (++Count > ScanLimit)
10921 return false;
10922 if (MI.readsRegister(Reg: AMDGPU::SCC, TRI: &RI)) {
10923 if (MI.getOpcode() == AMDGPU::S_CSELECT_B32 ||
10924 MI.getOpcode() == AMDGPU::S_CSELECT_B64 ||
10925 MI.getOpcode() == AMDGPU::S_CBRANCH_SCC0 ||
10926 MI.getOpcode() == AMDGPU::S_CBRANCH_SCC1)
10927 InvertInstr.push_back(Elt: &MI);
10928 else
10929 return false;
10930 }
10931 if (MI.definesRegister(Reg: AMDGPU::SCC, TRI: &RI)) {
10932 SCCIsDead = true;
10933 break;
10934 }
10935 }
10936 if (!SCCIsDead && isSCCDeadOnExit(MBB))
10937 SCCIsDead = true;
10938
10939 // SCC may have more uses. Can't invert all of them.
10940 if (!SCCIsDead)
10941 return false;
10942
10943 // Invert uses
10944 for (MachineInstr *MI : InvertInstr) {
10945 if (MI->getOpcode() == AMDGPU::S_CSELECT_B32 ||
10946 MI->getOpcode() == AMDGPU::S_CSELECT_B64) {
10947 swapOperands(Inst&: *MI);
10948 } else if (MI->getOpcode() == AMDGPU::S_CBRANCH_SCC0 ||
10949 MI->getOpcode() == AMDGPU::S_CBRANCH_SCC1) {
10950 MI->setDesc(get(Opcode: MI->getOpcode() == AMDGPU::S_CBRANCH_SCC0
10951 ? AMDGPU::S_CBRANCH_SCC1
10952 : AMDGPU::S_CBRANCH_SCC0));
10953 } else {
10954 llvm_unreachable("SCC used but no inversion handling");
10955 }
10956 }
10957 return true;
10958}
10959
10960// SCC is already valid after SCCValid.
10961// SCCRedefine will redefine SCC to the same value already available after
10962 // SCCValid. If there are no intervening SCC conflicts, delete SCCRedefine and
10963// update kill/dead flags if necessary.
10964bool SIInstrInfo::optimizeSCC(MachineInstr *SCCValid, MachineInstr *SCCRedefine,
10965 bool NeedInversion) const {
10966 MachineInstr *KillsSCC = nullptr;
10967 if (SCCValid->getParent() != SCCRedefine->getParent())
10968 return false;
10969 for (MachineInstr &MI : make_range(x: std::next(x: SCCValid->getIterator()),
10970 y: SCCRedefine->getIterator())) {
10971 if (MI.modifiesRegister(Reg: AMDGPU::SCC, TRI: &RI))
10972 return false;
10973 if (MI.killsRegister(Reg: AMDGPU::SCC, TRI: &RI))
10974 KillsSCC = &MI;
10975 }
10976 if (NeedInversion && !invertSCCUse(SCCDef: SCCRedefine))
10977 return false;
10978 if (MachineOperand *SccDef =
10979 SCCValid->findRegisterDefOperand(Reg: AMDGPU::SCC, /*TRI=*/nullptr))
10980 SccDef->setIsDead(false);
10981 if (KillsSCC)
10982 KillsSCC->clearRegisterKills(Reg: AMDGPU::SCC, /*TRI=*/RegInfo: nullptr);
10983 SCCRedefine->eraseFromParent();
10984 return true;
10985}
10986
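// A foldable select is, e.g., %x = S_CSELECT_B32 1, 0 (hypothetical vreg %x):
// %x is non-zero exactly when the incoming SCC was set, so a later
// s_cmp_lg_u32 %x, 0 would merely recompute that SCC value.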
10987static bool foldableSelect(const MachineInstr &Def) {
10988 if (Def.getOpcode() != AMDGPU::S_CSELECT_B32 &&
10989 Def.getOpcode() != AMDGPU::S_CSELECT_B64)
10990 return false;
10991 bool Op1IsNonZeroImm =
10992 Def.getOperand(i: 1).isImm() && Def.getOperand(i: 1).getImm() != 0;
10993 bool Op2IsZeroImm =
10994 Def.getOperand(i: 2).isImm() && Def.getOperand(i: 2).getImm() == 0;
10995 if (!Op1IsNonZeroImm || !Op2IsZeroImm)
10996 return false;
10997 return true;
10998}
10999
11000static bool setsSCCIfResultIsZero(const MachineInstr &Def, bool &NeedInversion,
11001 unsigned &NewDefOpc) {
11002 // S_ADD_U32 X, 1 sets SCC on carry-out, which can only happen if the result
11003 // is 0. S_ADD_I32 X, 1 can be converted to S_ADD_U32 X, 1 if SCC is dead.
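  // For illustration: %r = S_ADD_U32 %x, 1 produces a carry-out (SCC = 1)
  // exactly when %x == 0xffffffff, i.e. when %r == 0.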
11004 if (Def.getOpcode() != AMDGPU::S_ADD_I32 &&
11005 Def.getOpcode() != AMDGPU::S_ADD_U32)
11006 return false;
11007 const MachineOperand &AddSrc1 = Def.getOperand(i: 1);
11008 const MachineOperand &AddSrc2 = Def.getOperand(i: 2);
11009 int64_t Addend;
11010
11011 if ((!AddSrc1.isImm() || AddSrc1.getImm() != 1) &&
11012 (!AddSrc2.isImm() || AddSrc2.getImm() != 1) &&
11013 (!getFoldableImm(MO: &AddSrc1, Imm&: Addend) || Addend != 1) &&
11014 (!getFoldableImm(MO: &AddSrc2, Imm&: Addend) || Addend != 1))
11015 return false;
11016
11017 if (Def.getOpcode() == AMDGPU::S_ADD_I32) {
11018 const MachineOperand *SccDef =
11019 Def.findRegisterDefOperand(Reg: AMDGPU::SCC, /*TRI=*/nullptr);
11020 if (!SccDef->isDead())
11021 return false;
11022 NewDefOpc = AMDGPU::S_ADD_U32;
11023 }
11024 NeedInversion = !NeedInversion;
11025 return true;
11026}
11027
11028bool SIInstrInfo::optimizeCompareInstr(MachineInstr &CmpInstr, Register SrcReg,
11029 Register SrcReg2, int64_t CmpMask,
11030 int64_t CmpValue,
11031 const MachineRegisterInfo *MRI) const {
11032 if (!SrcReg || SrcReg.isPhysical())
11033 return false;
11034
11035 if (SrcReg2 && !getFoldableImm(Reg: SrcReg2, MRI: *MRI, Imm&: CmpValue))
11036 return false;
11037
11038 const auto optimizeCmpSelect = [&CmpInstr, SrcReg, CmpValue, MRI,
11039 this](bool NeedInversion) -> bool {
11040 if (CmpValue != 0)
11041 return false;
11042
11043 MachineInstr *Def = MRI->getVRegDef(Reg: SrcReg);
11044 if (!Def)
11045 return false;
11046
11047 // For an S_OP that sets SCC = (DST != 0), do the transformation
11048 //
11049 // s_cmp_[lg|eq]_* (S_OP ...), 0 => (S_OP ...)
11050 //
11051 // For an (S_OP ...) that sets SCC = (DST == 0), invert NeedInversion and
11052 // do the same transformation:
11053 //
11054 // s_cmp_[lg|eq]_* (S_OP ...), 0 => (S_OP ...)
11055 //
11056 // If foldableSelect, s_cmp_lg_* is redundant because the SCC input to
11057 // S_CSELECT* already has the same value that would be recomputed by
11058 // s_cmp_lg_*:
11059 //
11060 // s_cmp_[lg|eq]_* (S_CSELECT* (non-zero imm), 0), 0 => (S_CSELECT*
11061 // (non-zero imm), 0)
11062
11063 unsigned NewDefOpc = Def->getOpcode();
11064 if (!setsSCCIfResultIsNonZero(*Def) &&
11065 !setsSCCIfResultIsZero(Def: *Def, NeedInversion, NewDefOpc) &&
11066 !foldableSelect(Def: *Def))
11067 return false;
11068
11069 if (!optimizeSCC(SCCValid: Def, SCCRedefine: &CmpInstr, NeedInversion))
11070 return false;
11071
11072 if (NewDefOpc != Def->getOpcode())
11073 Def->setDesc(get(Opcode: NewDefOpc));
11074
11075 // If the s_or_b32 result, sY, is unused (i.e. it is effectively a 64-bit
11076 // s_cmp_lg of a register pair) and the inputs are the hi and lo halves of a
11077 // 64-bit foldableSelect, then delete the s_or_b32 in the sequence:
11078 // sX = s_cselect_b64 (non-zero imm), 0
11079 // sLo = copy sX.sub0
11080 // sHi = copy sX.sub1
11081 // sY = s_or_b32 sLo, sHi
11082 if (Def->getOpcode() == AMDGPU::S_OR_B32 &&
11083 MRI->use_nodbg_empty(RegNo: Def->getOperand(i: 0).getReg())) {
11084 const MachineOperand &OrOpnd1 = Def->getOperand(i: 1);
11085 const MachineOperand &OrOpnd2 = Def->getOperand(i: 2);
11086 if (OrOpnd1.isReg() && OrOpnd2.isReg()) {
11087 MachineInstr *Def1 = MRI->getVRegDef(Reg: OrOpnd1.getReg());
11088 MachineInstr *Def2 = MRI->getVRegDef(Reg: OrOpnd2.getReg());
11089 if (Def1 && Def1->getOpcode() == AMDGPU::COPY && Def2 &&
11090 Def2->getOpcode() == AMDGPU::COPY && Def1->getOperand(i: 1).isReg() &&
11091 Def2->getOperand(i: 1).isReg() &&
11092 Def1->getOperand(i: 1).getSubReg() == AMDGPU::sub0 &&
11093 Def2->getOperand(i: 1).getSubReg() == AMDGPU::sub1 &&
11094 Def1->getOperand(i: 1).getReg() == Def2->getOperand(i: 1).getReg()) {
11095 MachineInstr *Select = MRI->getVRegDef(Reg: Def1->getOperand(i: 1).getReg());
11096 if (Select && foldableSelect(Def: *Select))
11097 optimizeSCC(SCCValid: Select, SCCRedefine: Def, /*NeedInversion=*/false);
11098 }
11099 }
11100 }
11101 return true;
11102 };
11103
11104 const auto optimizeCmpAnd = [&CmpInstr, SrcReg, CmpValue, MRI,
11105 this](int64_t ExpectedValue, unsigned SrcSize,
11106 bool IsReversible, bool IsSigned) -> bool {
11107 // s_cmp_eq_u32 (s_and_b32 $src, 1 << n), 1 << n => s_and_b32 $src, 1 << n
11108 // s_cmp_eq_i32 (s_and_b32 $src, 1 << n), 1 << n => s_and_b32 $src, 1 << n
11109 // s_cmp_ge_u32 (s_and_b32 $src, 1 << n), 1 << n => s_and_b32 $src, 1 << n
11110 // s_cmp_ge_i32 (s_and_b32 $src, 1 << n), 1 << n => s_and_b32 $src, 1 << n
11111 // s_cmp_eq_u64 (s_and_b64 $src, 1 << n), 1 << n => s_and_b64 $src, 1 << n
11112 // s_cmp_lg_u32 (s_and_b32 $src, 1 << n), 0 => s_and_b32 $src, 1 << n
11113 // s_cmp_lg_i32 (s_and_b32 $src, 1 << n), 0 => s_and_b32 $src, 1 << n
11114 // s_cmp_gt_u32 (s_and_b32 $src, 1 << n), 0 => s_and_b32 $src, 1 << n
11115 // s_cmp_gt_i32 (s_and_b32 $src, 1 << n), 0 => s_and_b32 $src, 1 << n
11116 // s_cmp_lg_u64 (s_and_b64 $src, 1 << n), 0 => s_and_b64 $src, 1 << n
11117 //
11118 // Signed ge/gt are not used for the sign bit.
11119 //
11120 // If result of the AND is unused except in the compare:
11121 // s_and_b(32|64) $src, 1 << n => s_bitcmp1_b(32|64) $src, n
11122 //
11123 // s_cmp_eq_u32 (s_and_b32 $src, 1 << n), 0 => s_bitcmp0_b32 $src, n
11124 // s_cmp_eq_i32 (s_and_b32 $src, 1 << n), 0 => s_bitcmp0_b32 $src, n
11125 // s_cmp_eq_u64 (s_and_b64 $src, 1 << n), 0 => s_bitcmp0_b64 $src, n
11126 // s_cmp_lg_u32 (s_and_b32 $src, 1 << n), 1 << n => s_bitcmp0_b32 $src, n
11127 // s_cmp_lg_i32 (s_and_b32 $src, 1 << n), 1 << n => s_bitcmp0_b32 $src, n
11128 // s_cmp_lg_u64 (s_and_b64 $src, 1 << n), 1 << n => s_bitcmp0_b64 $src, n
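    // Worked example: s_cmp_eq_u32 (s_and_b32 $src, 4), 4 has BitNo = 2 and is
    // folded into the s_and_b32; if the AND result is otherwise unused it is
    // further replaced by s_bitcmp1_b32 $src, 2.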
11129
11130 MachineInstr *Def = MRI->getVRegDef(Reg: SrcReg);
11131 if (!Def)
11132 return false;
11133
11134 if (Def->getOpcode() != AMDGPU::S_AND_B32 &&
11135 Def->getOpcode() != AMDGPU::S_AND_B64)
11136 return false;
11137
11138 int64_t Mask;
11139 const auto isMask = [&Mask, SrcSize](const MachineOperand *MO) -> bool {
11140 if (MO->isImm())
11141 Mask = MO->getImm();
11142 else if (!getFoldableImm(MO, Imm&: Mask))
11143 return false;
11144 Mask &= maxUIntN(N: SrcSize);
11145 return isPowerOf2_64(Value: Mask);
11146 };
11147
11148 MachineOperand *SrcOp = &Def->getOperand(i: 1);
11149 if (isMask(SrcOp))
11150 SrcOp = &Def->getOperand(i: 2);
11151 else if (isMask(&Def->getOperand(i: 2)))
11152 SrcOp = &Def->getOperand(i: 1);
11153 else
11154 return false;
11155
11156 // A valid Mask is required to have a single bit set, hence a non-zero and
11157 // power-of-two value. This verifies that we will not do a 64-bit shift below.
11158 assert(llvm::has_single_bit<uint64_t>(Mask) && "Invalid mask.");
11159 unsigned BitNo = llvm::countr_zero(Val: (uint64_t)Mask);
11160 if (IsSigned && BitNo == SrcSize - 1)
11161 return false;
11162
11163 ExpectedValue <<= BitNo;
11164
11165 bool IsReversedCC = false;
11166 if (CmpValue != ExpectedValue) {
11167 if (!IsReversible)
11168 return false;
11169 IsReversedCC = CmpValue == (ExpectedValue ^ Mask);
11170 if (!IsReversedCC)
11171 return false;
11172 }
11173
11174 Register DefReg = Def->getOperand(i: 0).getReg();
11175 if (IsReversedCC && !MRI->hasOneNonDBGUse(RegNo: DefReg))
11176 return false;
11177
11178 if (!optimizeSCC(SCCValid: Def, SCCRedefine: &CmpInstr, /*NeedInversion=*/false))
11179 return false;
11180
11181 if (!MRI->use_nodbg_empty(RegNo: DefReg)) {
11182 assert(!IsReversedCC);
11183 return true;
11184 }
11185
11186 // Replace an AND whose result is unused with an S_BITCMP.
11187 MachineBasicBlock *MBB = Def->getParent();
11188
11189 unsigned NewOpc = (SrcSize == 32) ? IsReversedCC ? AMDGPU::S_BITCMP0_B32
11190 : AMDGPU::S_BITCMP1_B32
11191 : IsReversedCC ? AMDGPU::S_BITCMP0_B64
11192 : AMDGPU::S_BITCMP1_B64;
11193
11194 BuildMI(BB&: *MBB, I: Def, MIMD: Def->getDebugLoc(), MCID: get(Opcode: NewOpc))
11195 .add(MO: *SrcOp)
11196 .addImm(Val: BitNo);
11197 Def->eraseFromParent();
11198
11199 return true;
11200 };
11201
11202 switch (CmpInstr.getOpcode()) {
11203 default:
11204 break;
11205 case AMDGPU::S_CMP_EQ_U32:
11206 case AMDGPU::S_CMP_EQ_I32:
11207 case AMDGPU::S_CMPK_EQ_U32:
11208 case AMDGPU::S_CMPK_EQ_I32:
11209 return optimizeCmpAnd(1, 32, true, false) ||
11210 optimizeCmpSelect(/*NeedInversion=*/true);
11211 case AMDGPU::S_CMP_GE_U32:
11212 case AMDGPU::S_CMPK_GE_U32:
11213 return optimizeCmpAnd(1, 32, false, false);
11214 case AMDGPU::S_CMP_GE_I32:
11215 case AMDGPU::S_CMPK_GE_I32:
11216 return optimizeCmpAnd(1, 32, false, true);
11217 case AMDGPU::S_CMP_EQ_U64:
11218 return optimizeCmpAnd(1, 64, true, false);
11219 case AMDGPU::S_CMP_LG_U32:
11220 case AMDGPU::S_CMP_LG_I32:
11221 case AMDGPU::S_CMPK_LG_U32:
11222 case AMDGPU::S_CMPK_LG_I32:
11223 return optimizeCmpAnd(0, 32, true, false) ||
11224 optimizeCmpSelect(/*NeedInversion=*/false);
11225 case AMDGPU::S_CMP_GT_U32:
11226 case AMDGPU::S_CMPK_GT_U32:
11227 return optimizeCmpAnd(0, 32, false, false);
11228 case AMDGPU::S_CMP_GT_I32:
11229 case AMDGPU::S_CMPK_GT_I32:
11230 return optimizeCmpAnd(0, 32, false, true);
11231 case AMDGPU::S_CMP_LG_U64:
11232 return optimizeCmpAnd(0, 64, true, false) ||
11233 optimizeCmpSelect(/*NeedInversion=*/false);
11234 }
11235
11236 return false;
11237}
11238
11239void SIInstrInfo::enforceOperandRCAlignment(MachineInstr &MI,
11240 AMDGPU::OpName OpName) const {
11241 if (!ST.needsAlignedVGPRs())
11242 return;
11243
11244 int OpNo = AMDGPU::getNamedOperandIdx(Opcode: MI.getOpcode(), Name: OpName);
11245 if (OpNo < 0)
11246 return;
11247 MachineOperand &Op = MI.getOperand(i: OpNo);
11248 if (getOpSize(MI, OpNo) > 4)
11249 return;
11250
11251 // Add implicit aligned super-reg to force alignment on the data operand.
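  // For illustration, a 32-bit VGPR data operand %data (hypothetical) becomes
  // %new.sub0, where
  // %new:vreg_64_align2 = REG_SEQUENCE %data, %subreg.sub0, %undef, %subreg.sub1
  // and %new is also added as an implicit use of the instruction.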
11252 const DebugLoc &DL = MI.getDebugLoc();
11253 MachineBasicBlock *BB = MI.getParent();
11254 MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
11255 Register DataReg = Op.getReg();
11256 bool IsAGPR = RI.isAGPR(MRI, Reg: DataReg);
11257 Register Undef = MRI.createVirtualRegister(
11258 RegClass: IsAGPR ? &AMDGPU::AGPR_32RegClass : &AMDGPU::VGPR_32RegClass);
11259 BuildMI(BB&: *BB, I&: MI, MIMD: DL, MCID: get(Opcode: AMDGPU::IMPLICIT_DEF), DestReg: Undef);
11260 Register NewVR =
11261 MRI.createVirtualRegister(RegClass: IsAGPR ? &AMDGPU::AReg_64_Align2RegClass
11262 : &AMDGPU::VReg_64_Align2RegClass);
11263 BuildMI(BB&: *BB, I&: MI, MIMD: DL, MCID: get(Opcode: AMDGPU::REG_SEQUENCE), DestReg: NewVR)
11264 .addReg(RegNo: DataReg, Flags: {}, SubReg: Op.getSubReg())
11265 .addImm(Val: AMDGPU::sub0)
11266 .addReg(RegNo: Undef)
11267 .addImm(Val: AMDGPU::sub1);
11268 Op.setReg(NewVR);
11269 Op.setSubReg(AMDGPU::sub0);
11270 MI.addOperand(Op: MachineOperand::CreateReg(Reg: NewVR, isDef: false, isImp: true));
11271}
11272
11273bool SIInstrInfo::isGlobalMemoryObject(const MachineInstr *MI) const {
11274 if (isIGLP(MI: *MI))
11275 return false;
11276
11277 return TargetInstrInfo::isGlobalMemoryObject(MI);
11278}
11279
11280bool SIInstrInfo::isXDLWMMA(const MachineInstr &MI) const {
11281 if (!isWMMA(MI) && !isSWMMAC(MI))
11282 return false;
11283
11284 if (ST.hasGFX1250Insts())
11285 return AMDGPU::getWMMAIsXDL(Opc: MI.getOpcode());
11286
11287 return true;
11288}
11289
11290bool SIInstrInfo::isXDL(const MachineInstr &MI) const {
11291 unsigned Opcode = MI.getOpcode();
11292
11293 if (AMDGPU::isGFX12Plus(STI: ST))
11294 return isDOT(MI) || isXDLWMMA(MI);
11295
11296 if (!isMAI(MI) || isDGEMM(Opcode) ||
11297 Opcode == AMDGPU::V_ACCVGPR_WRITE_B32_e64 ||
11298 Opcode == AMDGPU::V_ACCVGPR_READ_B32_e64)
11299 return false;
11300
11301 if (!ST.hasGFX940Insts())
11302 return true;
11303
11304 return AMDGPU::getMAIIsGFX940XDL(Opc: Opcode);
11305}
11306