//===- SIInstrInfo.cpp - SI Instruction Information ----------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// SI Implementation of TargetInstrInfo.
//
//===----------------------------------------------------------------------===//

#include "SIInstrInfo.h"
#include "AMDGPU.h"
#include "AMDGPUInstrInfo.h"
#include "AMDGPULaneMaskUtils.h"
#include "GCNHazardRecognizer.h"
#include "GCNSubtarget.h"
#include "SIMachineFunctionInfo.h"
#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/CodeGen/GlobalISel/GenericMachineInstrs.h"
#include "llvm/CodeGen/LiveIntervals.h"
#include "llvm/CodeGen/LiveVariables.h"
#include "llvm/CodeGen/MachineDominators.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineScheduler.h"
#include "llvm/CodeGen/RegisterScavenging.h"
#include "llvm/CodeGen/ScheduleDAG.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/MC/MCContext.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Target/TargetMachine.h"

using namespace llvm;

#define DEBUG_TYPE "si-instr-info"

#define GET_INSTRINFO_CTOR_DTOR
#include "AMDGPUGenInstrInfo.inc"

namespace llvm::AMDGPU {
#define GET_D16ImageDimIntrinsics_IMPL
#define GET_ImageDimIntrinsicTable_IMPL
#define GET_RsrcIntrinsics_IMPL
#include "AMDGPUGenSearchableTables.inc"
} // namespace llvm::AMDGPU
51
// Must be at least 4 to be able to branch over the minimum unconditional
// branch code. This only exists to make it possible to write reasonably small
// tests for long branches.
55static cl::opt<unsigned>
56BranchOffsetBits("amdgpu-s-branch-bits", cl::ReallyHidden, cl::init(Val: 16),
57 cl::desc("Restrict range of branch instructions (DEBUG)"));
58
59static cl::opt<bool> Fix16BitCopies(
60 "amdgpu-fix-16-bit-physreg-copies",
61 cl::desc("Fix copies between 32 and 16 bit registers by extending to 32 bit"),
62 cl::init(Val: true),
63 cl::ReallyHidden);
64
65SIInstrInfo::SIInstrInfo(const GCNSubtarget &ST)
66 : AMDGPUGenInstrInfo(ST, RI, AMDGPU::ADJCALLSTACKUP,
67 AMDGPU::ADJCALLSTACKDOWN),
68 RI(ST), ST(ST) {
69 SchedModel.init(TSInfo: &ST);
70}
71
72//===----------------------------------------------------------------------===//
73// TargetInstrInfo callbacks
74//===----------------------------------------------------------------------===//
75
76static unsigned getNumOperandsNoGlue(SDNode *Node) {
77 unsigned N = Node->getNumOperands();
78 while (N && Node->getOperand(Num: N - 1).getValueType() == MVT::Glue)
79 --N;
80 return N;
81}
82
/// Returns true if both nodes have the same value for the given
/// operand \p OpName, or if neither node has this operand.
85static bool nodesHaveSameOperandValue(SDNode *N0, SDNode *N1,
86 AMDGPU::OpName OpName) {
87 unsigned Opc0 = N0->getMachineOpcode();
88 unsigned Opc1 = N1->getMachineOpcode();
89
90 int Op0Idx = AMDGPU::getNamedOperandIdx(Opcode: Opc0, Name: OpName);
91 int Op1Idx = AMDGPU::getNamedOperandIdx(Opcode: Opc1, Name: OpName);
92
93 if (Op0Idx == -1 && Op1Idx == -1)
94 return true;
95
96
97 if ((Op0Idx == -1 && Op1Idx != -1) ||
98 (Op1Idx == -1 && Op0Idx != -1))
99 return false;
100
101 // getNamedOperandIdx returns the index for the MachineInstr's operands,
102 // which includes the result as the first operand. We are indexing into the
103 // MachineSDNode's operands, so we need to skip the result operand to get
104 // the real index.
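  // Illustrative sketch (hypothetical operand layout, not a specific opcode):
  // if a MachineInstr's operands are [vdst (def), src0, src1, offset], then
  // getNamedOperandIdx(..., offset) returns 3, while the MachineSDNode operand
  // list omits the def and starts at src0, so the matching SDNode index is 2.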
105 --Op0Idx;
106 --Op1Idx;
107
108 return N0->getOperand(Num: Op0Idx) == N1->getOperand(Num: Op1Idx);
109}
110
111static bool canRemat(const MachineInstr &MI) {
112
113 if (SIInstrInfo::isVOP1(MI) || SIInstrInfo::isVOP2(MI) ||
114 SIInstrInfo::isVOP3(MI) || SIInstrInfo::isSDWA(MI) ||
115 SIInstrInfo::isSALU(MI))
116 return true;
117
118 if (SIInstrInfo::isSMRD(MI)) {
119 return !MI.memoperands_empty() &&
120 llvm::all_of(Range: MI.memoperands(), P: [](const MachineMemOperand *MMO) {
121 return MMO->isLoad() && MMO->isInvariant();
122 });
123 }
124
125 return false;
126}
127
128bool SIInstrInfo::isReMaterializableImpl(
129 const MachineInstr &MI) const {
130
131 if (canRemat(MI)) {
    // Normally a VALU use of exec would block rematerialization, but an
    // implicit exec read is fine here since every VALU has one. Apart from
    // that, we want all of the generic logic.

    // Another potential implicit use is the mode register. The core RA logic
    // will not attempt rematerialization if mode is set anywhere in the
    // function; otherwise it is safe, since mode is not changed.

    // This differs from the generic method, which disallows rematerialization
    // when there are virtual register uses. We allow such uses, which is why
    // this method also covers SOP instructions.
143 if (!MI.hasImplicitDef() &&
144 MI.getNumImplicitOperands() == MI.getDesc().implicit_uses().size() &&
145 !MI.mayRaiseFPException())
146 return true;
147 }
148
149 return TargetInstrInfo::isReMaterializableImpl(MI);
150}
151
152// Returns true if the scalar result of a VALU instruction depends on exec.
153bool SIInstrInfo::resultDependsOnExec(const MachineInstr &MI) const {
154 // Ignore comparisons which are only used masked with exec.
155 // This allows some hoisting/sinking of VALU comparisons.
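  // Illustrative MIR sketch (assumed, not taken from a real test): a pattern
  // like
  //   %cmp:sreg_64 = V_CMP_LT_I32_e64 %a, %b, implicit $exec
  //   %masked:sreg_64 = S_AND_B64 %cmp, $exec, implicit-def $scc
  // only consumes %cmp under the current exec mask, so the lanes outside exec
  // do not matter and the compare can still be hoisted or sunk.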
156 if (MI.isCompare()) {
157 const MachineOperand *Dst = getNamedOperand(MI, OperandName: AMDGPU::OpName::sdst);
158 if (!Dst)
159 return true;
160
161 Register DstReg = Dst->getReg();
162 if (!DstReg.isVirtual())
163 return true;
164
165 const MachineRegisterInfo &MRI = MI.getMF()->getRegInfo();
166 for (MachineInstr &Use : MRI.use_nodbg_instructions(Reg: DstReg)) {
167 switch (Use.getOpcode()) {
168 case AMDGPU::S_AND_SAVEEXEC_B32:
169 case AMDGPU::S_AND_SAVEEXEC_B64:
170 break;
171 case AMDGPU::S_AND_B32:
172 case AMDGPU::S_AND_B64:
173 if (!Use.readsRegister(Reg: AMDGPU::EXEC, /*TRI=*/nullptr))
174 return true;
175 break;
176 default:
177 return true;
178 }
179 }
180 return false;
181 }
182
183 // If it is not convergent it does not depend on EXEC.
184 if (!MI.isConvergent())
185 return false;
186
187 switch (MI.getOpcode()) {
188 default:
189 break;
190 case AMDGPU::V_READFIRSTLANE_B32:
191 return true;
192 }
193
194 return false;
195}
196
197bool SIInstrInfo::isIgnorableUse(const MachineOperand &MO) const {
198 // Any implicit use of exec by VALU is not a real register read.
199 return MO.getReg() == AMDGPU::EXEC && MO.isImplicit() &&
200 isVALU(MI: *MO.getParent()) && !resultDependsOnExec(MI: *MO.getParent());
201}
202
203bool SIInstrInfo::isSafeToSink(MachineInstr &MI,
204 MachineBasicBlock *SuccToSinkTo,
205 MachineCycleInfo *CI) const {
206 // Allow sinking if MI edits lane mask (divergent i1 in sgpr).
207 if (MI.getOpcode() == AMDGPU::SI_IF_BREAK)
208 return true;
209
210 MachineRegisterInfo &MRI = MI.getMF()->getRegInfo();
  // Check if sinking MI would create a temporally divergent use.
212 for (auto Op : MI.uses()) {
213 if (Op.isReg() && Op.getReg().isVirtual() &&
214 RI.isSGPRClass(RC: MRI.getRegClass(Reg: Op.getReg()))) {
215 MachineInstr *SgprDef = MRI.getVRegDef(Reg: Op.getReg());
216
217 // SgprDef defined inside cycle
218 MachineCycle *FromCycle = CI->getCycle(Block: SgprDef->getParent());
219 if (FromCycle == nullptr)
220 continue;
221
222 MachineCycle *ToCycle = CI->getCycle(Block: SuccToSinkTo);
      // Check if there is a FromCycle that contains SgprDef's basic block but
      // does not contain SuccToSinkTo and also has a divergent exit condition.
225 while (FromCycle && !FromCycle->contains(C: ToCycle)) {
226 SmallVector<MachineBasicBlock *, 1> ExitingBlocks;
227 FromCycle->getExitingBlocks(TmpStorage&: ExitingBlocks);
228
229 // FromCycle has divergent exit condition.
230 for (MachineBasicBlock *ExitingBlock : ExitingBlocks) {
231 if (hasDivergentBranch(MBB: ExitingBlock))
232 return false;
233 }
234
235 FromCycle = FromCycle->getParentCycle();
236 }
237 }
238 }
239
240 return true;
241}
242
243bool SIInstrInfo::areLoadsFromSameBasePtr(SDNode *Load0, SDNode *Load1,
244 int64_t &Offset0,
245 int64_t &Offset1) const {
246 if (!Load0->isMachineOpcode() || !Load1->isMachineOpcode())
247 return false;
248
249 unsigned Opc0 = Load0->getMachineOpcode();
250 unsigned Opc1 = Load1->getMachineOpcode();
251
252 // Make sure both are actually loads.
253 if (!get(Opcode: Opc0).mayLoad() || !get(Opcode: Opc1).mayLoad())
254 return false;
255
256 // A mayLoad instruction without a def is not a load. Likely a prefetch.
257 if (!get(Opcode: Opc0).getNumDefs() || !get(Opcode: Opc1).getNumDefs())
258 return false;
259
260 if (isDS(Opcode: Opc0) && isDS(Opcode: Opc1)) {
261
262 // FIXME: Handle this case:
263 if (getNumOperandsNoGlue(Node: Load0) != getNumOperandsNoGlue(Node: Load1))
264 return false;
265
266 // Check base reg.
267 if (Load0->getOperand(Num: 0) != Load1->getOperand(Num: 0))
268 return false;
269
    // Skip read2 / write2 variants for simplicity.
    // TODO: We should report true if the used offsets are adjacent (excluding
    // st64 versions).
273 int Offset0Idx = AMDGPU::getNamedOperandIdx(Opcode: Opc0, Name: AMDGPU::OpName::offset);
274 int Offset1Idx = AMDGPU::getNamedOperandIdx(Opcode: Opc1, Name: AMDGPU::OpName::offset);
275 if (Offset0Idx == -1 || Offset1Idx == -1)
276 return false;
277
278 // XXX - be careful of dataless loads
279 // getNamedOperandIdx returns the index for MachineInstrs. Since they
280 // include the output in the operand list, but SDNodes don't, we need to
281 // subtract the index by one.
282 Offset0Idx -= get(Opcode: Opc0).NumDefs;
283 Offset1Idx -= get(Opcode: Opc1).NumDefs;
284 Offset0 = Load0->getConstantOperandVal(Num: Offset0Idx);
285 Offset1 = Load1->getConstantOperandVal(Num: Offset1Idx);
286 return true;
287 }
288
289 if (isSMRD(Opcode: Opc0) && isSMRD(Opcode: Opc1)) {
290 // Skip time and cache invalidation instructions.
291 if (!AMDGPU::hasNamedOperand(Opcode: Opc0, NamedIdx: AMDGPU::OpName::sbase) ||
292 !AMDGPU::hasNamedOperand(Opcode: Opc1, NamedIdx: AMDGPU::OpName::sbase))
293 return false;
294
295 unsigned NumOps = getNumOperandsNoGlue(Node: Load0);
296 if (NumOps != getNumOperandsNoGlue(Node: Load1))
297 return false;
298
299 // Check base reg.
300 if (Load0->getOperand(Num: 0) != Load1->getOperand(Num: 0))
301 return false;
302
    // Match register offsets if both register and immediate offsets are
    // present.
304 assert(NumOps == 4 || NumOps == 5);
305 if (NumOps == 5 && Load0->getOperand(Num: 1) != Load1->getOperand(Num: 1))
306 return false;
307
308 const ConstantSDNode *Load0Offset =
309 dyn_cast<ConstantSDNode>(Val: Load0->getOperand(Num: NumOps - 3));
310 const ConstantSDNode *Load1Offset =
311 dyn_cast<ConstantSDNode>(Val: Load1->getOperand(Num: NumOps - 3));
312
313 if (!Load0Offset || !Load1Offset)
314 return false;
315
316 Offset0 = Load0Offset->getZExtValue();
317 Offset1 = Load1Offset->getZExtValue();
318 return true;
319 }
320
321 // MUBUF and MTBUF can access the same addresses.
322 if ((isMUBUF(Opcode: Opc0) || isMTBUF(Opcode: Opc0)) && (isMUBUF(Opcode: Opc1) || isMTBUF(Opcode: Opc1))) {
323
324 // MUBUF and MTBUF have vaddr at different indices.
325 if (!nodesHaveSameOperandValue(N0: Load0, N1: Load1, OpName: AMDGPU::OpName::soffset) ||
326 !nodesHaveSameOperandValue(N0: Load0, N1: Load1, OpName: AMDGPU::OpName::vaddr) ||
327 !nodesHaveSameOperandValue(N0: Load0, N1: Load1, OpName: AMDGPU::OpName::srsrc))
328 return false;
329
330 int OffIdx0 = AMDGPU::getNamedOperandIdx(Opcode: Opc0, Name: AMDGPU::OpName::offset);
331 int OffIdx1 = AMDGPU::getNamedOperandIdx(Opcode: Opc1, Name: AMDGPU::OpName::offset);
332
333 if (OffIdx0 == -1 || OffIdx1 == -1)
334 return false;
335
336 // getNamedOperandIdx returns the index for MachineInstrs. Since they
337 // include the output in the operand list, but SDNodes don't, we need to
338 // subtract the index by one.
339 OffIdx0 -= get(Opcode: Opc0).NumDefs;
340 OffIdx1 -= get(Opcode: Opc1).NumDefs;
341
342 SDValue Off0 = Load0->getOperand(Num: OffIdx0);
343 SDValue Off1 = Load1->getOperand(Num: OffIdx1);
344
345 // The offset might be a FrameIndexSDNode.
346 if (!isa<ConstantSDNode>(Val: Off0) || !isa<ConstantSDNode>(Val: Off1))
347 return false;
348
349 Offset0 = Off0->getAsZExtVal();
350 Offset1 = Off1->getAsZExtVal();
351 return true;
352 }
353
354 return false;
355}
356
357static bool isStride64(unsigned Opc) {
358 switch (Opc) {
359 case AMDGPU::DS_READ2ST64_B32:
360 case AMDGPU::DS_READ2ST64_B64:
361 case AMDGPU::DS_WRITE2ST64_B32:
362 case AMDGPU::DS_WRITE2ST64_B64:
363 return true;
364 default:
365 return false;
366 }
367}
368
369bool SIInstrInfo::getMemOperandsWithOffsetWidth(
370 const MachineInstr &LdSt, SmallVectorImpl<const MachineOperand *> &BaseOps,
371 int64_t &Offset, bool &OffsetIsScalable, LocationSize &Width,
372 const TargetRegisterInfo *TRI) const {
373 if (!LdSt.mayLoadOrStore())
374 return false;
375
376 unsigned Opc = LdSt.getOpcode();
377 OffsetIsScalable = false;
378 const MachineOperand *BaseOp, *OffsetOp;
379 int DataOpIdx;
380
381 if (isDS(MI: LdSt)) {
382 BaseOp = getNamedOperand(MI: LdSt, OperandName: AMDGPU::OpName::addr);
383 OffsetOp = getNamedOperand(MI: LdSt, OperandName: AMDGPU::OpName::offset);
384 if (OffsetOp) {
385 // Normal, single offset LDS instruction.
386 if (!BaseOp) {
387 // DS_CONSUME/DS_APPEND use M0 for the base address.
388 // TODO: find the implicit use operand for M0 and use that as BaseOp?
389 return false;
390 }
391 BaseOps.push_back(Elt: BaseOp);
392 Offset = OffsetOp->getImm();
393 // Get appropriate operand, and compute width accordingly.
394 DataOpIdx = AMDGPU::getNamedOperandIdx(Opcode: Opc, Name: AMDGPU::OpName::vdst);
395 if (DataOpIdx == -1)
396 DataOpIdx = AMDGPU::getNamedOperandIdx(Opcode: Opc, Name: AMDGPU::OpName::data0);
397 if (Opc == AMDGPU::DS_ATOMIC_ASYNC_BARRIER_ARRIVE_B64)
398 Width = LocationSize::precise(Value: 64);
399 else
400 Width = LocationSize::precise(Value: getOpSize(MI: LdSt, OpNo: DataOpIdx));
401 } else {
402 // The 2 offset instructions use offset0 and offset1 instead. We can treat
403 // these as a load with a single offset if the 2 offsets are consecutive.
404 // We will use this for some partially aligned loads.
405 const MachineOperand *Offset0Op =
406 getNamedOperand(MI: LdSt, OperandName: AMDGPU::OpName::offset0);
407 const MachineOperand *Offset1Op =
408 getNamedOperand(MI: LdSt, OperandName: AMDGPU::OpName::offset1);
409
410 unsigned Offset0 = Offset0Op->getImm() & 0xff;
411 unsigned Offset1 = Offset1Op->getImm() & 0xff;
412 if (Offset0 + 1 != Offset1)
413 return false;
414
415 // Each of these offsets is in element sized units, so we need to convert
416 // to bytes of the individual reads.
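      // Worked example (illustrative): for a DS_READ2_B32 with offset0 = 4 and
      // offset1 = 5, the 64-bit destination gives EltSize = 64 / 16 = 4 bytes,
      // so Offset = 4 * 4 = 16 bytes and the reported width covers the two
      // 4-byte elements.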
417
418 unsigned EltSize;
419 if (LdSt.mayLoad())
420 EltSize = TRI->getRegSizeInBits(RC: *getOpRegClass(MI: LdSt, OpNo: 0)) / 16;
421 else {
422 assert(LdSt.mayStore());
423 int Data0Idx = AMDGPU::getNamedOperandIdx(Opcode: Opc, Name: AMDGPU::OpName::data0);
424 EltSize = TRI->getRegSizeInBits(RC: *getOpRegClass(MI: LdSt, OpNo: Data0Idx)) / 8;
425 }
426
427 if (isStride64(Opc))
428 EltSize *= 64;
429
430 BaseOps.push_back(Elt: BaseOp);
431 Offset = EltSize * Offset0;
432 // Get appropriate operand(s), and compute width accordingly.
433 DataOpIdx = AMDGPU::getNamedOperandIdx(Opcode: Opc, Name: AMDGPU::OpName::vdst);
434 if (DataOpIdx == -1) {
435 DataOpIdx = AMDGPU::getNamedOperandIdx(Opcode: Opc, Name: AMDGPU::OpName::data0);
436 Width = LocationSize::precise(Value: getOpSize(MI: LdSt, OpNo: DataOpIdx));
437 DataOpIdx = AMDGPU::getNamedOperandIdx(Opcode: Opc, Name: AMDGPU::OpName::data1);
438 Width = LocationSize::precise(
439 Value: Width.getValue() + TypeSize::getFixed(ExactSize: getOpSize(MI: LdSt, OpNo: DataOpIdx)));
440 } else {
441 Width = LocationSize::precise(Value: getOpSize(MI: LdSt, OpNo: DataOpIdx));
442 }
443 }
444 return true;
445 }
446
447 if (isMUBUF(MI: LdSt) || isMTBUF(MI: LdSt)) {
448 const MachineOperand *RSrc = getNamedOperand(MI: LdSt, OperandName: AMDGPU::OpName::srsrc);
449 if (!RSrc) // e.g. BUFFER_WBINVL1_VOL
450 return false;
451 BaseOps.push_back(Elt: RSrc);
452 BaseOp = getNamedOperand(MI: LdSt, OperandName: AMDGPU::OpName::vaddr);
453 if (BaseOp && !BaseOp->isFI())
454 BaseOps.push_back(Elt: BaseOp);
455 const MachineOperand *OffsetImm =
456 getNamedOperand(MI: LdSt, OperandName: AMDGPU::OpName::offset);
457 Offset = OffsetImm->getImm();
458 const MachineOperand *SOffset =
459 getNamedOperand(MI: LdSt, OperandName: AMDGPU::OpName::soffset);
460 if (SOffset) {
461 if (SOffset->isReg())
462 BaseOps.push_back(Elt: SOffset);
463 else
464 Offset += SOffset->getImm();
465 }
466 // Get appropriate operand, and compute width accordingly.
467 DataOpIdx = AMDGPU::getNamedOperandIdx(Opcode: Opc, Name: AMDGPU::OpName::vdst);
468 if (DataOpIdx == -1)
469 DataOpIdx = AMDGPU::getNamedOperandIdx(Opcode: Opc, Name: AMDGPU::OpName::vdata);
470 if (DataOpIdx == -1) // LDS DMA
471 return false;
472 Width = LocationSize::precise(Value: getOpSize(MI: LdSt, OpNo: DataOpIdx));
473 return true;
474 }
475
476 if (isImage(MI: LdSt)) {
477 auto RsrcOpName =
478 isMIMG(MI: LdSt) ? AMDGPU::OpName::srsrc : AMDGPU::OpName::rsrc;
479 int SRsrcIdx = AMDGPU::getNamedOperandIdx(Opcode: Opc, Name: RsrcOpName);
480 BaseOps.push_back(Elt: &LdSt.getOperand(i: SRsrcIdx));
481 int VAddr0Idx = AMDGPU::getNamedOperandIdx(Opcode: Opc, Name: AMDGPU::OpName::vaddr0);
482 if (VAddr0Idx >= 0) {
483 // GFX10 possible NSA encoding.
484 for (int I = VAddr0Idx; I < SRsrcIdx; ++I)
485 BaseOps.push_back(Elt: &LdSt.getOperand(i: I));
486 } else {
487 BaseOps.push_back(Elt: getNamedOperand(MI: LdSt, OperandName: AMDGPU::OpName::vaddr));
488 }
489 Offset = 0;
490 // Get appropriate operand, and compute width accordingly.
491 DataOpIdx = AMDGPU::getNamedOperandIdx(Opcode: Opc, Name: AMDGPU::OpName::vdata);
492 if (DataOpIdx == -1)
493 return false; // no return sampler
494 Width = LocationSize::precise(Value: getOpSize(MI: LdSt, OpNo: DataOpIdx));
495 return true;
496 }
497
498 if (isSMRD(MI: LdSt)) {
499 BaseOp = getNamedOperand(MI: LdSt, OperandName: AMDGPU::OpName::sbase);
500 if (!BaseOp) // e.g. S_MEMTIME
501 return false;
502 BaseOps.push_back(Elt: BaseOp);
503 OffsetOp = getNamedOperand(MI: LdSt, OperandName: AMDGPU::OpName::offset);
504 Offset = OffsetOp ? OffsetOp->getImm() : 0;
505 // Get appropriate operand, and compute width accordingly.
506 DataOpIdx = AMDGPU::getNamedOperandIdx(Opcode: Opc, Name: AMDGPU::OpName::sdst);
507 if (DataOpIdx == -1)
508 return false;
509 Width = LocationSize::precise(Value: getOpSize(MI: LdSt, OpNo: DataOpIdx));
510 return true;
511 }
512
513 if (isFLAT(MI: LdSt)) {
514 // Instructions have either vaddr or saddr or both or none.
515 BaseOp = getNamedOperand(MI: LdSt, OperandName: AMDGPU::OpName::vaddr);
516 if (BaseOp)
517 BaseOps.push_back(Elt: BaseOp);
518 BaseOp = getNamedOperand(MI: LdSt, OperandName: AMDGPU::OpName::saddr);
519 if (BaseOp)
520 BaseOps.push_back(Elt: BaseOp);
521 Offset = getNamedOperand(MI: LdSt, OperandName: AMDGPU::OpName::offset)->getImm();
522 // Get appropriate operand, and compute width accordingly.
523 DataOpIdx = AMDGPU::getNamedOperandIdx(Opcode: Opc, Name: AMDGPU::OpName::vdst);
524 if (DataOpIdx == -1)
525 DataOpIdx = AMDGPU::getNamedOperandIdx(Opcode: Opc, Name: AMDGPU::OpName::vdata);
526 if (DataOpIdx == -1) // LDS DMA
527 return false;
528 Width = LocationSize::precise(Value: getOpSize(MI: LdSt, OpNo: DataOpIdx));
529 return true;
530 }
531
532 return false;
533}
534
535static bool memOpsHaveSameBasePtr(const MachineInstr &MI1,
536 ArrayRef<const MachineOperand *> BaseOps1,
537 const MachineInstr &MI2,
538 ArrayRef<const MachineOperand *> BaseOps2) {
539 // Only examine the first "base" operand of each instruction, on the
540 // assumption that it represents the real base address of the memory access.
541 // Other operands are typically offsets or indices from this base address.
542 if (BaseOps1.front()->isIdenticalTo(Other: *BaseOps2.front()))
543 return true;
544
545 if (!MI1.hasOneMemOperand() || !MI2.hasOneMemOperand())
546 return false;
547
548 auto *MO1 = *MI1.memoperands_begin();
549 auto *MO2 = *MI2.memoperands_begin();
550 if (MO1->getAddrSpace() != MO2->getAddrSpace())
551 return false;
552
553 const auto *Base1 = MO1->getValue();
554 const auto *Base2 = MO2->getValue();
555 if (!Base1 || !Base2)
556 return false;
557 Base1 = getUnderlyingObject(V: Base1);
558 Base2 = getUnderlyingObject(V: Base2);
559
560 if (isa<UndefValue>(Val: Base1) || isa<UndefValue>(Val: Base2))
561 return false;
562
563 return Base1 == Base2;
564}
565
566bool SIInstrInfo::shouldClusterMemOps(ArrayRef<const MachineOperand *> BaseOps1,
567 int64_t Offset1, bool OffsetIsScalable1,
568 ArrayRef<const MachineOperand *> BaseOps2,
569 int64_t Offset2, bool OffsetIsScalable2,
570 unsigned ClusterSize,
571 unsigned NumBytes) const {
572 // If the mem ops (to be clustered) do not have the same base ptr, then they
  // should not be clustered.
574 unsigned MaxMemoryClusterDWords = DefaultMemoryClusterDWordsLimit;
575 if (!BaseOps1.empty() && !BaseOps2.empty()) {
576 const MachineInstr &FirstLdSt = *BaseOps1.front()->getParent();
577 const MachineInstr &SecondLdSt = *BaseOps2.front()->getParent();
578 if (!memOpsHaveSameBasePtr(MI1: FirstLdSt, BaseOps1, MI2: SecondLdSt, BaseOps2))
579 return false;
580
581 const SIMachineFunctionInfo *MFI =
582 FirstLdSt.getMF()->getInfo<SIMachineFunctionInfo>();
583 MaxMemoryClusterDWords = MFI->getMaxMemoryClusterDWords();
584 } else if (!BaseOps1.empty() || !BaseOps2.empty()) {
585 // If only one base op is empty, they do not have the same base ptr
586 return false;
587 }
588
  // To avoid register pressure, the number of DWORDs loaded together by all
  // clustered mem ops should not, on average, exceed MaxMemoryClusterDWords.
  // This is an empirical value based on certain observations and
  // performance-related experiments.
  // The benefit of this heuristic is that it avoids clustering too many
  // sub-word loads as well as clustering of wide loads. Below is a brief
  // summary of how the heuristic behaves for various `LoadSize` values when
  // MaxMemoryClusterDWords is 8.
597 //
598 // (1) 1 <= LoadSize <= 4: cluster at max 8 mem ops
599 // (2) 5 <= LoadSize <= 8: cluster at max 4 mem ops
600 // (3) 9 <= LoadSize <= 12: cluster at max 2 mem ops
601 // (4) 13 <= LoadSize <= 16: cluster at max 2 mem ops
602 // (5) LoadSize >= 17: do not cluster
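  // Quick illustration (assumed values): with ClusterSize = 4 and
  // NumBytes = 24, LoadSize = 6 and NumDWords = ((6 + 3) / 4) * 4 = 8, which
  // fits the default limit of 8 DWORDs, so clustering is allowed; with
  // NumBytes = 40, LoadSize = 10 gives NumDWords = 12 and clustering is
  // rejected.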
603 const unsigned LoadSize = NumBytes / ClusterSize;
604 const unsigned NumDWords = ((LoadSize + 3) / 4) * ClusterSize;
605 return NumDWords <= MaxMemoryClusterDWords;
606}
607
// FIXME: This behaves strangely. If, for example, you have 32 loads + stores,
// the first 16 loads will be interleaved with the stores, and the next 16 will
// be clustered as expected. It should really split into two batches of 16
// stores.
611//
612// Loads are clustered until this returns false, rather than trying to schedule
613// groups of stores. This also means we have to deal with saying different
614// address space loads should be clustered, and ones which might cause bank
615// conflicts.
616//
617// This might be deprecated so it might not be worth that much effort to fix.
618bool SIInstrInfo::shouldScheduleLoadsNear(SDNode *Load0, SDNode *Load1,
619 int64_t Offset0, int64_t Offset1,
620 unsigned NumLoads) const {
621 assert(Offset1 > Offset0 &&
622 "Second offset should be larger than first offset!");
  // If we have 16 or fewer loads in a row, and the offsets are within 64
  // bytes, then schedule them together.
625
626 // A cacheline is 64 bytes (for global memory).
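  // For example (illustrative): four loads at offsets 0, 16, 32 and 48 from
  // the same base are scheduled together, while two loads 128 bytes apart are
  // not.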
627 return (NumLoads <= 16 && (Offset1 - Offset0) < 64);
628}
629
630static void reportIllegalCopy(const SIInstrInfo *TII, MachineBasicBlock &MBB,
631 MachineBasicBlock::iterator MI,
632 const DebugLoc &DL, MCRegister DestReg,
633 MCRegister SrcReg, bool KillSrc,
634 const char *Msg = "illegal VGPR to SGPR copy") {
635 MachineFunction *MF = MBB.getParent();
636
637 LLVMContext &C = MF->getFunction().getContext();
638 C.diagnose(DI: DiagnosticInfoUnsupported(MF->getFunction(), Msg, DL, DS_Error));
639
640 BuildMI(BB&: MBB, I: MI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::SI_ILLEGAL_COPY), DestReg)
641 .addReg(RegNo: SrcReg, Flags: getKillRegState(B: KillSrc));
642}
643
644/// Handle copying from SGPR to AGPR, or from AGPR to AGPR on GFX908. It is not
645/// possible to have a direct copy in these cases on GFX908, so an intermediate
646/// VGPR copy is required.
647static void indirectCopyToAGPR(const SIInstrInfo &TII,
648 MachineBasicBlock &MBB,
649 MachineBasicBlock::iterator MI,
650 const DebugLoc &DL, MCRegister DestReg,
651 MCRegister SrcReg, bool KillSrc,
652 RegScavenger &RS, bool RegsOverlap,
653 Register ImpDefSuperReg = Register(),
654 Register ImpUseSuperReg = Register()) {
655 assert((TII.getSubtarget().hasMAIInsts() &&
656 !TII.getSubtarget().hasGFX90AInsts()) &&
657 "Expected GFX908 subtarget.");
658
659 assert((AMDGPU::SReg_32RegClass.contains(SrcReg) ||
660 AMDGPU::AGPR_32RegClass.contains(SrcReg)) &&
661 "Source register of the copy should be either an SGPR or an AGPR.");
662
663 assert(AMDGPU::AGPR_32RegClass.contains(DestReg) &&
664 "Destination register of the copy should be an AGPR.");
665
666 const SIRegisterInfo &RI = TII.getRegisterInfo();
667
668 // First try to find defining accvgpr_write to avoid temporary registers.
669 // In the case of copies of overlapping AGPRs, we conservatively do not
670 // reuse previous accvgpr_writes. Otherwise, we may incorrectly pick up
  // an accvgpr_write used for this same copy due to implicit-defs.
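  // Illustrative MIR sketch (assumed, not from a real test): given
  //   $agpr1 = V_ACCVGPR_WRITE_B32_e64 $vgpr0, implicit $exec
  //   ...
  //   $agpr2 = COPY $agpr1
  // the copy can be lowered to another V_ACCVGPR_WRITE_B32_e64 of $vgpr0,
  // provided $vgpr0 is not clobbered between the two instructions.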
672 if (!RegsOverlap) {
673 for (auto Def = MI, E = MBB.begin(); Def != E; ) {
674 --Def;
675
676 if (!Def->modifiesRegister(Reg: SrcReg, TRI: &RI))
677 continue;
678
679 if (Def->getOpcode() != AMDGPU::V_ACCVGPR_WRITE_B32_e64 ||
680 Def->getOperand(i: 0).getReg() != SrcReg)
681 break;
682
683 MachineOperand &DefOp = Def->getOperand(i: 1);
684 assert(DefOp.isReg() || DefOp.isImm());
685
686 if (DefOp.isReg()) {
687 bool SafeToPropagate = true;
688 // Check that register source operand is not clobbered before MI.
689 // Immediate operands are always safe to propagate.
690 for (auto I = Def; I != MI && SafeToPropagate; ++I)
691 if (I->modifiesRegister(Reg: DefOp.getReg(), TRI: &RI))
692 SafeToPropagate = false;
693
694 if (!SafeToPropagate)
695 break;
696
697 for (auto I = Def; I != MI; ++I)
698 I->clearRegisterKills(Reg: DefOp.getReg(), RegInfo: &RI);
699 }
700
701 MachineInstrBuilder Builder =
702 BuildMI(BB&: MBB, I: MI, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::V_ACCVGPR_WRITE_B32_e64), DestReg)
703 .add(MO: DefOp);
704 if (ImpDefSuperReg)
705 Builder.addReg(RegNo: ImpDefSuperReg, Flags: RegState::Define | RegState::Implicit);
706
707 if (ImpUseSuperReg) {
708 Builder.addReg(RegNo: ImpUseSuperReg,
709 Flags: getKillRegState(B: KillSrc) | RegState::Implicit);
710 }
711
712 return;
713 }
714 }
715
716 RS.enterBasicBlockEnd(MBB);
717 RS.backward(I: std::next(x: MI));
718
719 // Ideally we want to have three registers for a long reg_sequence copy
720 // to hide 2 waitstates between v_mov_b32 and accvgpr_write.
721 unsigned MaxVGPRs = RI.getRegPressureLimit(RC: &AMDGPU::VGPR_32RegClass,
722 MF&: *MBB.getParent());
723
724 // Registers in the sequence are allocated contiguously so we can just
725 // use register number to pick one of three round-robin temps.
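  // For instance (illustrative): when expanding a copy of a 128-bit AGPR
  // tuple, the four 32-bit destination subregisters produce RegNo values that
  // cycle mod 3 (e.g. 0, 1, 2, 0), so consecutive subregister copies prefer
  // different temporary VGPRs, helping hide the wait states between
  // v_mov_b32 and accvgpr_write.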
726 unsigned RegNo = (DestReg - AMDGPU::AGPR0) % 3;
727 Register Tmp =
728 MBB.getParent()->getInfo<SIMachineFunctionInfo>()->getVGPRForAGPRCopy();
729 assert(MBB.getParent()->getRegInfo().isReserved(Tmp) &&
730 "VGPR used for an intermediate copy should have been reserved.");
731
732 // Only loop through if there are any free registers left. We don't want to
733 // spill.
734 while (RegNo--) {
735 Register Tmp2 = RS.scavengeRegisterBackwards(RC: AMDGPU::VGPR_32RegClass, To: MI,
736 /* RestoreAfter */ false, SPAdj: 0,
737 /* AllowSpill */ false);
738 if (!Tmp2 || RI.getHWRegIndex(Reg: Tmp2) >= MaxVGPRs)
739 break;
740 Tmp = Tmp2;
741 RS.setRegUsed(Reg: Tmp);
742 }
743
744 // Insert copy to temporary VGPR.
745 unsigned TmpCopyOp = AMDGPU::V_MOV_B32_e32;
746 if (AMDGPU::AGPR_32RegClass.contains(Reg: SrcReg)) {
747 TmpCopyOp = AMDGPU::V_ACCVGPR_READ_B32_e64;
748 } else {
749 assert(AMDGPU::SReg_32RegClass.contains(SrcReg));
750 }
751
752 MachineInstrBuilder UseBuilder = BuildMI(BB&: MBB, I: MI, MIMD: DL, MCID: TII.get(Opcode: TmpCopyOp), DestReg: Tmp)
753 .addReg(RegNo: SrcReg, Flags: getKillRegState(B: KillSrc));
754 if (ImpUseSuperReg) {
755 UseBuilder.addReg(RegNo: ImpUseSuperReg,
756 Flags: getKillRegState(B: KillSrc) | RegState::Implicit);
757 }
758
759 MachineInstrBuilder DefBuilder
760 = BuildMI(BB&: MBB, I: MI, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::V_ACCVGPR_WRITE_B32_e64), DestReg)
761 .addReg(RegNo: Tmp, Flags: RegState::Kill);
762
763 if (ImpDefSuperReg)
764 DefBuilder.addReg(RegNo: ImpDefSuperReg, Flags: RegState::Define | RegState::Implicit);
765}
766
767static void expandSGPRCopy(const SIInstrInfo &TII, MachineBasicBlock &MBB,
768 MachineBasicBlock::iterator MI, const DebugLoc &DL,
769 MCRegister DestReg, MCRegister SrcReg, bool KillSrc,
770 const TargetRegisterClass *RC, bool Forward) {
771 const SIRegisterInfo &RI = TII.getRegisterInfo();
772 ArrayRef<int16_t> BaseIndices = RI.getRegSplitParts(RC, EltSize: 4);
773 MachineBasicBlock::iterator I = MI;
774 MachineInstr *FirstMI = nullptr, *LastMI = nullptr;
775
776 for (unsigned Idx = 0; Idx < BaseIndices.size(); ++Idx) {
777 int16_t SubIdx = BaseIndices[Idx];
778 Register DestSubReg = RI.getSubReg(Reg: DestReg, Idx: SubIdx);
779 Register SrcSubReg = RI.getSubReg(Reg: SrcReg, Idx: SubIdx);
780 assert(DestSubReg && SrcSubReg && "Failed to find subregs!");
781 unsigned Opcode = AMDGPU::S_MOV_B32;
782
783 // Is SGPR aligned? If so try to combine with next.
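    // Illustrative example (hypothetical registers): copying s[8:11] to
    // s[4:7] becomes
    //   s_mov_b64 s[4:5], s[8:9]
    //   s_mov_b64 s[6:7], s[10:11]
    // since both source and destination subregisters are 64-bit aligned,
    // halving the number of moves compared to four s_mov_b32 instructions.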
784 bool AlignedDest = ((DestSubReg - AMDGPU::SGPR0) % 2) == 0;
785 bool AlignedSrc = ((SrcSubReg - AMDGPU::SGPR0) % 2) == 0;
786 if (AlignedDest && AlignedSrc && (Idx + 1 < BaseIndices.size())) {
787 // Can use SGPR64 copy
788 unsigned Channel = RI.getChannelFromSubReg(SubReg: SubIdx);
789 SubIdx = RI.getSubRegFromChannel(Channel, NumRegs: 2);
790 DestSubReg = RI.getSubReg(Reg: DestReg, Idx: SubIdx);
791 SrcSubReg = RI.getSubReg(Reg: SrcReg, Idx: SubIdx);
792 assert(DestSubReg && SrcSubReg && "Failed to find subregs!");
793 Opcode = AMDGPU::S_MOV_B64;
794 Idx++;
795 }
796
797 LastMI = BuildMI(BB&: MBB, I, MIMD: DL, MCID: TII.get(Opcode), DestReg: DestSubReg)
798 .addReg(RegNo: SrcSubReg)
799 .addReg(RegNo: SrcReg, Flags: RegState::Implicit);
800
801 if (!FirstMI)
802 FirstMI = LastMI;
803
804 if (!Forward)
805 I--;
806 }
807
808 assert(FirstMI && LastMI);
809 if (!Forward)
810 std::swap(a&: FirstMI, b&: LastMI);
811
812 FirstMI->addOperand(
813 Op: MachineOperand::CreateReg(Reg: DestReg, isDef: true /*IsDef*/, isImp: true /*IsImp*/));
814
815 if (KillSrc)
816 LastMI->addRegisterKilled(IncomingReg: SrcReg, RegInfo: &RI);
817}
818
819void SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
820 MachineBasicBlock::iterator MI,
821 const DebugLoc &DL, Register DestReg,
822 Register SrcReg, bool KillSrc, bool RenamableDest,
823 bool RenamableSrc) const {
824 const TargetRegisterClass *RC = RI.getPhysRegBaseClass(Reg: DestReg);
825 unsigned Size = RI.getRegSizeInBits(RC: *RC);
826 const TargetRegisterClass *SrcRC = RI.getPhysRegBaseClass(Reg: SrcReg);
827 unsigned SrcSize = RI.getRegSizeInBits(RC: *SrcRC);
828
  // The rest of copyPhysReg assumes Src and Dst are the same size.
  // TODO-GFX11_16BIT: Once all true 16-bit instruction patterns are complete,
  // can we remove Fix16BitCopies and this code block?
832 if (Fix16BitCopies) {
833 if (((Size == 16) != (SrcSize == 16))) {
834 // Non-VGPR Src and Dst will later be expanded back to 32 bits.
835 assert(ST.useRealTrue16Insts());
836 Register &RegToFix = (Size == 32) ? DestReg : SrcReg;
837 MCRegister SubReg = RI.getSubReg(Reg: RegToFix, Idx: AMDGPU::lo16);
838 RegToFix = SubReg;
839
840 if (DestReg == SrcReg) {
841 // Identity copy. Insert empty bundle since ExpandPostRA expects an
842 // instruction here.
843 BuildMI(BB&: MBB, I: MI, MIMD: DL, MCID: get(Opcode: AMDGPU::BUNDLE));
844 return;
845 }
846 RC = RI.getPhysRegBaseClass(Reg: DestReg);
847 Size = RI.getRegSizeInBits(RC: *RC);
848 SrcRC = RI.getPhysRegBaseClass(Reg: SrcReg);
849 SrcSize = RI.getRegSizeInBits(RC: *SrcRC);
850 }
851 }
852
853 if (RC == &AMDGPU::VGPR_32RegClass) {
854 assert(AMDGPU::VGPR_32RegClass.contains(SrcReg) ||
855 AMDGPU::SReg_32RegClass.contains(SrcReg) ||
856 AMDGPU::AGPR_32RegClass.contains(SrcReg));
857 unsigned Opc = AMDGPU::AGPR_32RegClass.contains(Reg: SrcReg) ?
858 AMDGPU::V_ACCVGPR_READ_B32_e64 : AMDGPU::V_MOV_B32_e32;
859 BuildMI(BB&: MBB, I: MI, MIMD: DL, MCID: get(Opcode: Opc), DestReg)
860 .addReg(RegNo: SrcReg, Flags: getKillRegState(B: KillSrc));
861 return;
862 }
863
864 if (RC == &AMDGPU::SReg_32_XM0RegClass ||
865 RC == &AMDGPU::SReg_32RegClass) {
866 if (SrcReg == AMDGPU::SCC) {
867 BuildMI(BB&: MBB, I: MI, MIMD: DL, MCID: get(Opcode: AMDGPU::S_CSELECT_B32), DestReg)
868 .addImm(Val: 1)
869 .addImm(Val: 0);
870 return;
871 }
872
873 if (!AMDGPU::SReg_32RegClass.contains(Reg: SrcReg)) {
874 if (DestReg == AMDGPU::VCC_LO) {
875 // FIXME: Hack until VReg_1 removed.
876 assert(AMDGPU::VGPR_32RegClass.contains(SrcReg));
877 BuildMI(BB&: MBB, I: MI, MIMD: DL, MCID: get(Opcode: AMDGPU::V_CMP_NE_U32_e32))
878 .addImm(Val: 0)
879 .addReg(RegNo: SrcReg, Flags: getKillRegState(B: KillSrc));
880 return;
881 }
882
883 reportIllegalCopy(TII: this, MBB, MI, DL, DestReg, SrcReg, KillSrc);
884 return;
885 }
886
887 BuildMI(BB&: MBB, I: MI, MIMD: DL, MCID: get(Opcode: AMDGPU::S_MOV_B32), DestReg)
888 .addReg(RegNo: SrcReg, Flags: getKillRegState(B: KillSrc));
889 return;
890 }
891
892 if (RC == &AMDGPU::SReg_64RegClass) {
893 if (SrcReg == AMDGPU::SCC) {
894 BuildMI(BB&: MBB, I: MI, MIMD: DL, MCID: get(Opcode: AMDGPU::S_CSELECT_B64), DestReg)
895 .addImm(Val: 1)
896 .addImm(Val: 0);
897 return;
898 }
899
900 if (!AMDGPU::SReg_64_EncodableRegClass.contains(Reg: SrcReg)) {
901 if (DestReg == AMDGPU::VCC) {
902 // FIXME: Hack until VReg_1 removed.
903 assert(AMDGPU::VGPR_32RegClass.contains(SrcReg));
904 BuildMI(BB&: MBB, I: MI, MIMD: DL, MCID: get(Opcode: AMDGPU::V_CMP_NE_U32_e32))
905 .addImm(Val: 0)
906 .addReg(RegNo: SrcReg, Flags: getKillRegState(B: KillSrc));
907 return;
908 }
909
910 reportIllegalCopy(TII: this, MBB, MI, DL, DestReg, SrcReg, KillSrc);
911 return;
912 }
913
914 BuildMI(BB&: MBB, I: MI, MIMD: DL, MCID: get(Opcode: AMDGPU::S_MOV_B64), DestReg)
915 .addReg(RegNo: SrcReg, Flags: getKillRegState(B: KillSrc));
916 return;
917 }
918
919 if (DestReg == AMDGPU::SCC) {
920 // Copying 64-bit or 32-bit sources to SCC barely makes sense,
921 // but SelectionDAG emits such copies for i1 sources.
922 if (AMDGPU::SReg_64RegClass.contains(Reg: SrcReg)) {
923 // This copy can only be produced by patterns
924 // with explicit SCC, which are known to be enabled
925 // only for subtargets with S_CMP_LG_U64 present.
926 assert(ST.hasScalarCompareEq64());
927 BuildMI(BB&: MBB, I: MI, MIMD: DL, MCID: get(Opcode: AMDGPU::S_CMP_LG_U64))
928 .addReg(RegNo: SrcReg, Flags: getKillRegState(B: KillSrc))
929 .addImm(Val: 0);
930 } else {
931 assert(AMDGPU::SReg_32RegClass.contains(SrcReg));
932 BuildMI(BB&: MBB, I: MI, MIMD: DL, MCID: get(Opcode: AMDGPU::S_CMP_LG_U32))
933 .addReg(RegNo: SrcReg, Flags: getKillRegState(B: KillSrc))
934 .addImm(Val: 0);
935 }
936
937 return;
938 }
939
940 if (RC == &AMDGPU::AGPR_32RegClass) {
941 if (AMDGPU::VGPR_32RegClass.contains(Reg: SrcReg) ||
942 (ST.hasGFX90AInsts() && AMDGPU::SReg_32RegClass.contains(Reg: SrcReg))) {
943 BuildMI(BB&: MBB, I: MI, MIMD: DL, MCID: get(Opcode: AMDGPU::V_ACCVGPR_WRITE_B32_e64), DestReg)
944 .addReg(RegNo: SrcReg, Flags: getKillRegState(B: KillSrc));
945 return;
946 }
947
948 if (AMDGPU::AGPR_32RegClass.contains(Reg: SrcReg) && ST.hasGFX90AInsts()) {
949 BuildMI(BB&: MBB, I: MI, MIMD: DL, MCID: get(Opcode: AMDGPU::V_ACCVGPR_MOV_B32), DestReg)
950 .addReg(RegNo: SrcReg, Flags: getKillRegState(B: KillSrc));
951 return;
952 }
953
    // FIXME: The pass should maintain a scavenger to avoid scanning through
    // the block on every AGPR spill.
956 RegScavenger RS;
957 const bool Overlap = RI.regsOverlap(RegA: SrcReg, RegB: DestReg);
958 indirectCopyToAGPR(TII: *this, MBB, MI, DL, DestReg, SrcReg, KillSrc, RS, RegsOverlap: Overlap);
959 return;
960 }
961
962 if (Size == 16) {
963 assert(AMDGPU::VGPR_16RegClass.contains(SrcReg) ||
964 AMDGPU::SReg_LO16RegClass.contains(SrcReg) ||
965 AMDGPU::AGPR_LO16RegClass.contains(SrcReg));
966
967 bool IsSGPRDst = AMDGPU::SReg_LO16RegClass.contains(Reg: DestReg);
968 bool IsSGPRSrc = AMDGPU::SReg_LO16RegClass.contains(Reg: SrcReg);
969 bool IsAGPRDst = AMDGPU::AGPR_LO16RegClass.contains(Reg: DestReg);
970 bool IsAGPRSrc = AMDGPU::AGPR_LO16RegClass.contains(Reg: SrcReg);
971 bool DstLow = !AMDGPU::isHi16Reg(Reg: DestReg, MRI: RI);
972 bool SrcLow = !AMDGPU::isHi16Reg(Reg: SrcReg, MRI: RI);
973 MCRegister NewDestReg = RI.get32BitRegister(Reg: DestReg);
974 MCRegister NewSrcReg = RI.get32BitRegister(Reg: SrcReg);
975
976 if (IsSGPRDst) {
977 if (!IsSGPRSrc) {
978 reportIllegalCopy(TII: this, MBB, MI, DL, DestReg, SrcReg, KillSrc);
979 return;
980 }
981
982 BuildMI(BB&: MBB, I: MI, MIMD: DL, MCID: get(Opcode: AMDGPU::S_MOV_B32), DestReg: NewDestReg)
983 .addReg(RegNo: NewSrcReg, Flags: getKillRegState(B: KillSrc));
984 return;
985 }
986
987 if (IsAGPRDst || IsAGPRSrc) {
988 if (!DstLow || !SrcLow) {
989 reportIllegalCopy(TII: this, MBB, MI, DL, DestReg, SrcReg, KillSrc,
990 Msg: "Cannot use hi16 subreg with an AGPR!");
991 }
992
993 copyPhysReg(MBB, MI, DL, DestReg: NewDestReg, SrcReg: NewSrcReg, KillSrc);
994 return;
995 }
996
997 if (ST.useRealTrue16Insts()) {
998 if (IsSGPRSrc) {
999 assert(SrcLow);
1000 SrcReg = NewSrcReg;
1001 }
1002 // Use the smaller instruction encoding if possible.
1003 if (AMDGPU::VGPR_16_Lo128RegClass.contains(Reg: DestReg) &&
1004 (IsSGPRSrc || AMDGPU::VGPR_16_Lo128RegClass.contains(Reg: SrcReg))) {
1005 BuildMI(BB&: MBB, I: MI, MIMD: DL, MCID: get(Opcode: AMDGPU::V_MOV_B16_t16_e32), DestReg)
1006 .addReg(RegNo: SrcReg);
1007 } else {
1008 BuildMI(BB&: MBB, I: MI, MIMD: DL, MCID: get(Opcode: AMDGPU::V_MOV_B16_t16_e64), DestReg)
1009 .addImm(Val: 0) // src0_modifiers
1010 .addReg(RegNo: SrcReg)
1011 .addImm(Val: 0); // op_sel
1012 }
1013 return;
1014 }
1015
1016 if (IsSGPRSrc && !ST.hasSDWAScalar()) {
1017 if (!DstLow || !SrcLow) {
1018 reportIllegalCopy(TII: this, MBB, MI, DL, DestReg, SrcReg, KillSrc,
1019 Msg: "Cannot use hi16 subreg on VI!");
1020 }
1021
1022 BuildMI(BB&: MBB, I: MI, MIMD: DL, MCID: get(Opcode: AMDGPU::V_MOV_B32_e32), DestReg: NewDestReg)
1023 .addReg(RegNo: NewSrcReg, Flags: getKillRegState(B: KillSrc));
1024 return;
1025 }
1026
1027 auto MIB = BuildMI(BB&: MBB, I: MI, MIMD: DL, MCID: get(Opcode: AMDGPU::V_MOV_B32_sdwa), DestReg: NewDestReg)
1028 .addImm(Val: 0) // src0_modifiers
1029 .addReg(RegNo: NewSrcReg)
1030 .addImm(Val: 0) // clamp
1031 .addImm(Val: DstLow ? AMDGPU::SDWA::SdwaSel::WORD_0
1032 : AMDGPU::SDWA::SdwaSel::WORD_1)
1033 .addImm(Val: AMDGPU::SDWA::DstUnused::UNUSED_PRESERVE)
1034 .addImm(Val: SrcLow ? AMDGPU::SDWA::SdwaSel::WORD_0
1035 : AMDGPU::SDWA::SdwaSel::WORD_1)
1036 .addReg(RegNo: NewDestReg, Flags: RegState::Implicit | RegState::Undef);
1037 // First implicit operand is $exec.
1038 MIB->tieOperands(DefIdx: 0, UseIdx: MIB->getNumOperands() - 1);
1039 return;
1040 }
1041
1042 if (RC == RI.getVGPR64Class() && (SrcRC == RC || RI.isSGPRClass(RC: SrcRC))) {
1043 if (ST.hasMovB64()) {
1044 BuildMI(BB&: MBB, I: MI, MIMD: DL, MCID: get(Opcode: AMDGPU::V_MOV_B64_e32), DestReg)
1045 .addReg(RegNo: SrcReg, Flags: getKillRegState(B: KillSrc));
1046 return;
1047 }
1048 if (ST.hasPkMovB32()) {
1049 BuildMI(BB&: MBB, I: MI, MIMD: DL, MCID: get(Opcode: AMDGPU::V_PK_MOV_B32), DestReg)
1050 .addImm(Val: SISrcMods::OP_SEL_1)
1051 .addReg(RegNo: SrcReg)
1052 .addImm(Val: SISrcMods::OP_SEL_0 | SISrcMods::OP_SEL_1)
1053 .addReg(RegNo: SrcReg)
1054 .addImm(Val: 0) // op_sel_lo
1055 .addImm(Val: 0) // op_sel_hi
1056 .addImm(Val: 0) // neg_lo
1057 .addImm(Val: 0) // neg_hi
1058 .addImm(Val: 0) // clamp
1059 .addReg(RegNo: SrcReg, Flags: getKillRegState(B: KillSrc) | RegState::Implicit);
1060 return;
1061 }
1062 }
1063
1064 const bool Forward = RI.getHWRegIndex(Reg: DestReg) <= RI.getHWRegIndex(Reg: SrcReg);
1065 if (RI.isSGPRClass(RC)) {
1066 if (!RI.isSGPRClass(RC: SrcRC)) {
1067 reportIllegalCopy(TII: this, MBB, MI, DL, DestReg, SrcReg, KillSrc);
1068 return;
1069 }
1070 const bool CanKillSuperReg = KillSrc && !RI.regsOverlap(RegA: SrcReg, RegB: DestReg);
1071 expandSGPRCopy(TII: *this, MBB, MI, DL, DestReg, SrcReg, KillSrc: CanKillSuperReg, RC,
1072 Forward);
1073 return;
1074 }
1075
1076 unsigned EltSize = 4;
1077 unsigned Opcode = AMDGPU::V_MOV_B32_e32;
1078 if (RI.isAGPRClass(RC)) {
1079 if (ST.hasGFX90AInsts() && RI.isAGPRClass(RC: SrcRC))
1080 Opcode = AMDGPU::V_ACCVGPR_MOV_B32;
1081 else if (RI.hasVGPRs(RC: SrcRC) ||
1082 (ST.hasGFX90AInsts() && RI.isSGPRClass(RC: SrcRC)))
1083 Opcode = AMDGPU::V_ACCVGPR_WRITE_B32_e64;
1084 else
1085 Opcode = AMDGPU::INSTRUCTION_LIST_END;
1086 } else if (RI.hasVGPRs(RC) && RI.isAGPRClass(RC: SrcRC)) {
1087 Opcode = AMDGPU::V_ACCVGPR_READ_B32_e64;
1088 } else if ((Size % 64 == 0) && RI.hasVGPRs(RC) &&
1089 (RI.isProperlyAlignedRC(RC: *RC) &&
1090 (SrcRC == RC || RI.isSGPRClass(RC: SrcRC)))) {
1091 // TODO: In 96-bit case, could do a 64-bit mov and then a 32-bit mov.
1092 if (ST.hasMovB64()) {
1093 Opcode = AMDGPU::V_MOV_B64_e32;
1094 EltSize = 8;
1095 } else if (ST.hasPkMovB32()) {
1096 Opcode = AMDGPU::V_PK_MOV_B32;
1097 EltSize = 8;
1098 }
1099 }
1100
1101 // For the cases where we need an intermediate instruction/temporary register
1102 // (destination is an AGPR), we need a scavenger.
1103 //
1104 // FIXME: The pass should maintain this for us so we don't have to re-scan the
1105 // whole block for every handled copy.
1106 std::unique_ptr<RegScavenger> RS;
1107 if (Opcode == AMDGPU::INSTRUCTION_LIST_END)
1108 RS = std::make_unique<RegScavenger>();
1109
1110 ArrayRef<int16_t> SubIndices = RI.getRegSplitParts(RC, EltSize);
1111
1112 // If there is an overlap, we can't kill the super-register on the last
1113 // instruction, since it will also kill the components made live by this def.
1114 const bool Overlap = RI.regsOverlap(RegA: SrcReg, RegB: DestReg);
1115 const bool CanKillSuperReg = KillSrc && !Overlap;
1116
1117 for (unsigned Idx = 0; Idx < SubIndices.size(); ++Idx) {
1118 unsigned SubIdx;
1119 if (Forward)
1120 SubIdx = SubIndices[Idx];
1121 else
1122 SubIdx = SubIndices[SubIndices.size() - Idx - 1];
1123 Register DestSubReg = RI.getSubReg(Reg: DestReg, Idx: SubIdx);
1124 Register SrcSubReg = RI.getSubReg(Reg: SrcReg, Idx: SubIdx);
1125 assert(DestSubReg && SrcSubReg && "Failed to find subregs!");
1126
1127 bool IsFirstSubreg = Idx == 0;
1128 bool UseKill = CanKillSuperReg && Idx == SubIndices.size() - 1;
1129
1130 if (Opcode == AMDGPU::INSTRUCTION_LIST_END) {
1131 Register ImpDefSuper = IsFirstSubreg ? Register(DestReg) : Register();
1132 Register ImpUseSuper = SrcReg;
1133 indirectCopyToAGPR(TII: *this, MBB, MI, DL, DestReg: DestSubReg, SrcReg: SrcSubReg, KillSrc: UseKill,
1134 RS&: *RS, RegsOverlap: Overlap, ImpDefSuperReg: ImpDefSuper, ImpUseSuperReg: ImpUseSuper);
1135 } else if (Opcode == AMDGPU::V_PK_MOV_B32) {
1136 MachineInstrBuilder MIB =
1137 BuildMI(BB&: MBB, I: MI, MIMD: DL, MCID: get(Opcode: AMDGPU::V_PK_MOV_B32), DestReg: DestSubReg)
1138 .addImm(Val: SISrcMods::OP_SEL_1)
1139 .addReg(RegNo: SrcSubReg)
1140 .addImm(Val: SISrcMods::OP_SEL_0 | SISrcMods::OP_SEL_1)
1141 .addReg(RegNo: SrcSubReg)
1142 .addImm(Val: 0) // op_sel_lo
1143 .addImm(Val: 0) // op_sel_hi
1144 .addImm(Val: 0) // neg_lo
1145 .addImm(Val: 0) // neg_hi
1146 .addImm(Val: 0) // clamp
1147 .addReg(RegNo: SrcReg, Flags: getKillRegState(B: UseKill) | RegState::Implicit);
1148 if (IsFirstSubreg)
1149 MIB.addReg(RegNo: DestReg, Flags: RegState::Define | RegState::Implicit);
1150 } else {
1151 MachineInstrBuilder Builder =
1152 BuildMI(BB&: MBB, I: MI, MIMD: DL, MCID: get(Opcode), DestReg: DestSubReg).addReg(RegNo: SrcSubReg);
1153 if (IsFirstSubreg)
1154 Builder.addReg(RegNo: DestReg, Flags: RegState::Define | RegState::Implicit);
1155
1156 Builder.addReg(RegNo: SrcReg, Flags: getKillRegState(B: UseKill) | RegState::Implicit);
1157 }
1158 }
1159}
1160
1161int SIInstrInfo::commuteOpcode(unsigned Opcode) const {
1162 int32_t NewOpc;
1163
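  // For example (illustrative): V_SUB_F32 and V_SUBREV_F32 form such an
  // original/commuted (REV) pair, so commuting one maps to the other, and the
  // result is only returned if it actually exists on the target
  // (pseudoToMCOpcode(NewOpc) != -1).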
1164 // Try to map original to commuted opcode
1165 NewOpc = AMDGPU::getCommuteRev(Opcode);
1166 if (NewOpc != -1)
1167 // Check if the commuted (REV) opcode exists on the target.
1168 return pseudoToMCOpcode(Opcode: NewOpc) != -1 ? NewOpc : -1;
1169
1170 // Try to map commuted to original opcode
1171 NewOpc = AMDGPU::getCommuteOrig(Opcode);
1172 if (NewOpc != -1)
1173 // Check if the original (non-REV) opcode exists on the target.
1174 return pseudoToMCOpcode(Opcode: NewOpc) != -1 ? NewOpc : -1;
1175
1176 return Opcode;
1177}
1178
1179const TargetRegisterClass *
1180SIInstrInfo::getPreferredSelectRegClass(unsigned Size) const {
1181 return &AMDGPU::VGPR_32RegClass;
1182}
1183
1184void SIInstrInfo::insertVectorSelect(MachineBasicBlock &MBB,
1185 MachineBasicBlock::iterator I,
1186 const DebugLoc &DL, Register DstReg,
1187 ArrayRef<MachineOperand> Cond,
1188 Register TrueReg,
1189 Register FalseReg) const {
1190 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
1191 const TargetRegisterClass *BoolXExecRC = RI.getWaveMaskRegClass();
1192 const AMDGPU::LaneMaskConstants &LMC = AMDGPU::LaneMaskConstants::get(ST);
1193 assert(MRI.getRegClass(DstReg) == &AMDGPU::VGPR_32RegClass &&
1194 "Not a VGPR32 reg");
1195
1196 if (Cond.size() == 1) {
1197 Register SReg = MRI.createVirtualRegister(RegClass: BoolXExecRC);
1198 BuildMI(BB&: MBB, I, MIMD: DL, MCID: get(Opcode: AMDGPU::COPY), DestReg: SReg)
1199 .add(MO: Cond[0]);
1200 BuildMI(BB&: MBB, I, MIMD: DL, MCID: get(Opcode: AMDGPU::V_CNDMASK_B32_e64), DestReg: DstReg)
1201 .addImm(Val: 0)
1202 .addReg(RegNo: FalseReg)
1203 .addImm(Val: 0)
1204 .addReg(RegNo: TrueReg)
1205 .addReg(RegNo: SReg);
1206 } else if (Cond.size() == 2) {
1207 assert(Cond[0].isImm() && "Cond[0] is not an immediate");
1208 switch (Cond[0].getImm()) {
1209 case SIInstrInfo::SCC_TRUE: {
1210 Register SReg = MRI.createVirtualRegister(RegClass: BoolXExecRC);
1211 BuildMI(BB&: MBB, I, MIMD: DL, MCID: get(Opcode: LMC.CSelectOpc), DestReg: SReg).addImm(Val: 1).addImm(Val: 0);
1212 BuildMI(BB&: MBB, I, MIMD: DL, MCID: get(Opcode: AMDGPU::V_CNDMASK_B32_e64), DestReg: DstReg)
1213 .addImm(Val: 0)
1214 .addReg(RegNo: FalseReg)
1215 .addImm(Val: 0)
1216 .addReg(RegNo: TrueReg)
1217 .addReg(RegNo: SReg);
1218 break;
1219 }
1220 case SIInstrInfo::SCC_FALSE: {
1221 Register SReg = MRI.createVirtualRegister(RegClass: BoolXExecRC);
1222 BuildMI(BB&: MBB, I, MIMD: DL, MCID: get(Opcode: LMC.CSelectOpc), DestReg: SReg).addImm(Val: 0).addImm(Val: 1);
1223 BuildMI(BB&: MBB, I, MIMD: DL, MCID: get(Opcode: AMDGPU::V_CNDMASK_B32_e64), DestReg: DstReg)
1224 .addImm(Val: 0)
1225 .addReg(RegNo: FalseReg)
1226 .addImm(Val: 0)
1227 .addReg(RegNo: TrueReg)
1228 .addReg(RegNo: SReg);
1229 break;
1230 }
1231 case SIInstrInfo::VCCNZ: {
1232 MachineOperand RegOp = Cond[1];
1233 RegOp.setImplicit(false);
1234 Register SReg = MRI.createVirtualRegister(RegClass: BoolXExecRC);
1235 BuildMI(BB&: MBB, I, MIMD: DL, MCID: get(Opcode: AMDGPU::COPY), DestReg: SReg)
1236 .add(MO: RegOp);
1237 BuildMI(BB&: MBB, I, MIMD: DL, MCID: get(Opcode: AMDGPU::V_CNDMASK_B32_e64), DestReg: DstReg)
1238 .addImm(Val: 0)
1239 .addReg(RegNo: FalseReg)
1240 .addImm(Val: 0)
1241 .addReg(RegNo: TrueReg)
1242 .addReg(RegNo: SReg);
1243 break;
1244 }
1245 case SIInstrInfo::VCCZ: {
1246 MachineOperand RegOp = Cond[1];
1247 RegOp.setImplicit(false);
1248 Register SReg = MRI.createVirtualRegister(RegClass: BoolXExecRC);
1249 BuildMI(BB&: MBB, I, MIMD: DL, MCID: get(Opcode: AMDGPU::COPY), DestReg: SReg)
1250 .add(MO: RegOp);
1251 BuildMI(BB&: MBB, I, MIMD: DL, MCID: get(Opcode: AMDGPU::V_CNDMASK_B32_e64), DestReg: DstReg)
1252 .addImm(Val: 0)
1253 .addReg(RegNo: TrueReg)
1254 .addImm(Val: 0)
1255 .addReg(RegNo: FalseReg)
1256 .addReg(RegNo: SReg);
1257 break;
1258 }
1259 case SIInstrInfo::EXECNZ: {
1260 Register SReg = MRI.createVirtualRegister(RegClass: BoolXExecRC);
1261 Register SReg2 = MRI.createVirtualRegister(RegClass: RI.getBoolRC());
1262 BuildMI(BB&: MBB, I, MIMD: DL, MCID: get(Opcode: LMC.OrSaveExecOpc), DestReg: SReg2).addImm(Val: 0);
1263 BuildMI(BB&: MBB, I, MIMD: DL, MCID: get(Opcode: LMC.CSelectOpc), DestReg: SReg).addImm(Val: 1).addImm(Val: 0);
1264 BuildMI(BB&: MBB, I, MIMD: DL, MCID: get(Opcode: AMDGPU::V_CNDMASK_B32_e64), DestReg: DstReg)
1265 .addImm(Val: 0)
1266 .addReg(RegNo: FalseReg)
1267 .addImm(Val: 0)
1268 .addReg(RegNo: TrueReg)
1269 .addReg(RegNo: SReg);
1270 break;
1271 }
1272 case SIInstrInfo::EXECZ: {
1273 Register SReg = MRI.createVirtualRegister(RegClass: BoolXExecRC);
1274 Register SReg2 = MRI.createVirtualRegister(RegClass: RI.getBoolRC());
1275 BuildMI(BB&: MBB, I, MIMD: DL, MCID: get(Opcode: LMC.OrSaveExecOpc), DestReg: SReg2).addImm(Val: 0);
1276 BuildMI(BB&: MBB, I, MIMD: DL, MCID: get(Opcode: LMC.CSelectOpc), DestReg: SReg).addImm(Val: 0).addImm(Val: 1);
1277 BuildMI(BB&: MBB, I, MIMD: DL, MCID: get(Opcode: AMDGPU::V_CNDMASK_B32_e64), DestReg: DstReg)
1278 .addImm(Val: 0)
1279 .addReg(RegNo: FalseReg)
1280 .addImm(Val: 0)
1281 .addReg(RegNo: TrueReg)
1282 .addReg(RegNo: SReg);
1283 llvm_unreachable("Unhandled branch predicate EXECZ");
1284 break;
1285 }
1286 default:
1287 llvm_unreachable("invalid branch predicate");
1288 }
1289 } else {
1290 llvm_unreachable("Can only handle Cond size 1 or 2");
1291 }
1292}
1293
1294Register SIInstrInfo::insertEQ(MachineBasicBlock *MBB,
1295 MachineBasicBlock::iterator I,
1296 const DebugLoc &DL,
1297 Register SrcReg, int Value) const {
1298 MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
1299 Register Reg = MRI.createVirtualRegister(RegClass: RI.getBoolRC());
1300 BuildMI(BB&: *MBB, I, MIMD: DL, MCID: get(Opcode: AMDGPU::V_CMP_EQ_I32_e64), DestReg: Reg)
1301 .addImm(Val: Value)
1302 .addReg(RegNo: SrcReg);
1303
1304 return Reg;
1305}
1306
1307Register SIInstrInfo::insertNE(MachineBasicBlock *MBB,
1308 MachineBasicBlock::iterator I,
1309 const DebugLoc &DL,
1310 Register SrcReg, int Value) const {
1311 MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
1312 Register Reg = MRI.createVirtualRegister(RegClass: RI.getBoolRC());
1313 BuildMI(BB&: *MBB, I, MIMD: DL, MCID: get(Opcode: AMDGPU::V_CMP_NE_I32_e64), DestReg: Reg)
1314 .addImm(Val: Value)
1315 .addReg(RegNo: SrcReg);
1316
1317 return Reg;
1318}
1319
1320bool SIInstrInfo::getConstValDefinedInReg(const MachineInstr &MI,
1321 const Register Reg,
1322 int64_t &ImmVal) const {
1323 switch (MI.getOpcode()) {
1324 case AMDGPU::V_MOV_B32_e32:
1325 case AMDGPU::S_MOV_B32:
1326 case AMDGPU::S_MOVK_I32:
1327 case AMDGPU::S_MOV_B64:
1328 case AMDGPU::V_MOV_B64_e32:
1329 case AMDGPU::V_ACCVGPR_WRITE_B32_e64:
1330 case AMDGPU::AV_MOV_B32_IMM_PSEUDO:
1331 case AMDGPU::AV_MOV_B64_IMM_PSEUDO:
1332 case AMDGPU::S_MOV_B64_IMM_PSEUDO:
1333 case AMDGPU::V_MOV_B64_PSEUDO:
1334 case AMDGPU::V_MOV_B16_t16_e32: {
1335 const MachineOperand &Src0 = MI.getOperand(i: 1);
1336 if (Src0.isImm()) {
1337 ImmVal = Src0.getImm();
1338 return MI.getOperand(i: 0).getReg() == Reg;
1339 }
1340
1341 return false;
1342 }
1343 case AMDGPU::V_MOV_B16_t16_e64: {
1344 const MachineOperand &Src0 = MI.getOperand(i: 2);
1345 if (Src0.isImm() && !MI.getOperand(i: 1).getImm()) {
1346 ImmVal = Src0.getImm();
1347 return MI.getOperand(i: 0).getReg() == Reg;
1348 }
1349
1350 return false;
1351 }
1352 case AMDGPU::S_BREV_B32:
1353 case AMDGPU::V_BFREV_B32_e32:
1354 case AMDGPU::V_BFREV_B32_e64: {
1355 const MachineOperand &Src0 = MI.getOperand(i: 1);
1356 if (Src0.isImm()) {
1357 ImmVal = static_cast<int64_t>(reverseBits<int32_t>(Val: Src0.getImm()));
1358 return MI.getOperand(i: 0).getReg() == Reg;
1359 }
1360
1361 return false;
1362 }
1363 case AMDGPU::S_NOT_B32:
1364 case AMDGPU::V_NOT_B32_e32:
1365 case AMDGPU::V_NOT_B32_e64: {
1366 const MachineOperand &Src0 = MI.getOperand(i: 1);
1367 if (Src0.isImm()) {
1368 ImmVal = static_cast<int64_t>(~static_cast<int32_t>(Src0.getImm()));
1369 return MI.getOperand(i: 0).getReg() == Reg;
1370 }
1371
1372 return false;
1373 }
1374 default:
1375 return false;
1376 }
1377}
1378
1379std::optional<int64_t>
1380SIInstrInfo::getImmOrMaterializedImm(MachineOperand &Op) const {
1381 if (Op.isImm())
1382 return Op.getImm();
1383
1384 if (!Op.isReg() || !Op.getReg().isVirtual())
1385 return std::nullopt;
1386 MachineRegisterInfo &MRI = Op.getParent()->getMF()->getRegInfo();
1387 const MachineInstr *Def = MRI.getVRegDef(Reg: Op.getReg());
1388 if (Def && Def->isMoveImmediate()) {
1389 const MachineOperand &ImmSrc = Def->getOperand(i: 1);
1390 if (ImmSrc.isImm())
1391 return extractSubregFromImm(ImmVal: ImmSrc.getImm(), SubRegIndex: Op.getSubReg());
1392 }
1393
1394 return std::nullopt;
1395}
1396
1397unsigned SIInstrInfo::getMovOpcode(const TargetRegisterClass *DstRC) const {
1398
1399 if (RI.isAGPRClass(RC: DstRC))
1400 return AMDGPU::COPY;
1401 if (RI.getRegSizeInBits(RC: *DstRC) == 16) {
1402 // Assume hi bits are unneeded. Only _e64 true16 instructions are legal
1403 // before RA.
1404 return RI.isSGPRClass(RC: DstRC) ? AMDGPU::COPY : AMDGPU::V_MOV_B16_t16_e64;
1405 }
1406 if (RI.getRegSizeInBits(RC: *DstRC) == 32)
1407 return RI.isSGPRClass(RC: DstRC) ? AMDGPU::S_MOV_B32 : AMDGPU::V_MOV_B32_e32;
1408 if (RI.getRegSizeInBits(RC: *DstRC) == 64 && RI.isSGPRClass(RC: DstRC))
1409 return AMDGPU::S_MOV_B64;
1410 if (RI.getRegSizeInBits(RC: *DstRC) == 64 && !RI.isSGPRClass(RC: DstRC))
1411 return AMDGPU::V_MOV_B64_PSEUDO;
1412 return AMDGPU::COPY;
1413}
1414
1415const MCInstrDesc &
1416SIInstrInfo::getIndirectGPRIDXPseudo(unsigned VecSize,
1417 bool IsIndirectSrc) const {
1418 if (IsIndirectSrc) {
1419 if (VecSize <= 32) // 4 bytes
1420 return get(Opcode: AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V1);
1421 if (VecSize <= 64) // 8 bytes
1422 return get(Opcode: AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V2);
1423 if (VecSize <= 96) // 12 bytes
1424 return get(Opcode: AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V3);
1425 if (VecSize <= 128) // 16 bytes
1426 return get(Opcode: AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V4);
1427 if (VecSize <= 160) // 20 bytes
1428 return get(Opcode: AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V5);
1429 if (VecSize <= 192) // 24 bytes
1430 return get(Opcode: AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V6);
1431 if (VecSize <= 224) // 28 bytes
1432 return get(Opcode: AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V7);
1433 if (VecSize <= 256) // 32 bytes
1434 return get(Opcode: AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V8);
1435 if (VecSize <= 288) // 36 bytes
1436 return get(Opcode: AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V9);
1437 if (VecSize <= 320) // 40 bytes
1438 return get(Opcode: AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V10);
1439 if (VecSize <= 352) // 44 bytes
1440 return get(Opcode: AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V11);
1441 if (VecSize <= 384) // 48 bytes
1442 return get(Opcode: AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V12);
1443 if (VecSize <= 512) // 64 bytes
1444 return get(Opcode: AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V16);
1445 if (VecSize <= 1024) // 128 bytes
1446 return get(Opcode: AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V32);
1447
1448 llvm_unreachable("unsupported size for IndirectRegReadGPRIDX pseudos");
1449 }
1450
1451 if (VecSize <= 32) // 4 bytes
1452 return get(Opcode: AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V1);
1453 if (VecSize <= 64) // 8 bytes
1454 return get(Opcode: AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V2);
1455 if (VecSize <= 96) // 12 bytes
1456 return get(Opcode: AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V3);
1457 if (VecSize <= 128) // 16 bytes
1458 return get(Opcode: AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V4);
1459 if (VecSize <= 160) // 20 bytes
1460 return get(Opcode: AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V5);
1461 if (VecSize <= 192) // 24 bytes
1462 return get(Opcode: AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V6);
1463 if (VecSize <= 224) // 28 bytes
1464 return get(Opcode: AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V7);
1465 if (VecSize <= 256) // 32 bytes
1466 return get(Opcode: AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V8);
1467 if (VecSize <= 288) // 36 bytes
1468 return get(Opcode: AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V9);
1469 if (VecSize <= 320) // 40 bytes
1470 return get(Opcode: AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V10);
1471 if (VecSize <= 352) // 44 bytes
1472 return get(Opcode: AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V11);
1473 if (VecSize <= 384) // 48 bytes
1474 return get(Opcode: AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V12);
1475 if (VecSize <= 512) // 64 bytes
1476 return get(Opcode: AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V16);
1477 if (VecSize <= 1024) // 128 bytes
1478 return get(Opcode: AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V32);
1479
1480 llvm_unreachable("unsupported size for IndirectRegWriteGPRIDX pseudos");
1481}
1482
1483static unsigned getIndirectVGPRWriteMovRelPseudoOpc(unsigned VecSize) {
1484 if (VecSize <= 32) // 4 bytes
1485 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V1;
1486 if (VecSize <= 64) // 8 bytes
1487 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V2;
1488 if (VecSize <= 96) // 12 bytes
1489 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V3;
1490 if (VecSize <= 128) // 16 bytes
1491 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V4;
1492 if (VecSize <= 160) // 20 bytes
1493 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V5;
1494 if (VecSize <= 192) // 24 bytes
1495 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V6;
1496 if (VecSize <= 224) // 28 bytes
1497 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V7;
1498 if (VecSize <= 256) // 32 bytes
1499 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V8;
1500 if (VecSize <= 288) // 36 bytes
1501 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V9;
1502 if (VecSize <= 320) // 40 bytes
1503 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V10;
1504 if (VecSize <= 352) // 44 bytes
1505 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V11;
1506 if (VecSize <= 384) // 48 bytes
1507 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V12;
1508 if (VecSize <= 512) // 64 bytes
1509 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V16;
1510 if (VecSize <= 1024) // 128 bytes
1511 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V32;
1512
1513 llvm_unreachable("unsupported size for IndirectRegWrite pseudos");
1514}
1515
1516static unsigned getIndirectSGPRWriteMovRelPseudo32(unsigned VecSize) {
1517 if (VecSize <= 32) // 4 bytes
1518 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V1;
1519 if (VecSize <= 64) // 8 bytes
1520 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V2;
1521 if (VecSize <= 96) // 12 bytes
1522 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V3;
1523 if (VecSize <= 128) // 16 bytes
1524 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V4;
1525 if (VecSize <= 160) // 20 bytes
1526 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V5;
1527 if (VecSize <= 192) // 24 bytes
1528 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V6;
1529 if (VecSize <= 224) // 28 bytes
1530 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V7;
1531 if (VecSize <= 256) // 32 bytes
1532 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V8;
1533 if (VecSize <= 288) // 36 bytes
1534 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V9;
1535 if (VecSize <= 320) // 40 bytes
1536 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V10;
1537 if (VecSize <= 352) // 44 bytes
1538 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V11;
1539 if (VecSize <= 384) // 48 bytes
1540 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V12;
1541 if (VecSize <= 512) // 64 bytes
1542 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V16;
1543 if (VecSize <= 1024) // 128 bytes
1544 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V32;
1545
1546 llvm_unreachable("unsupported size for IndirectRegWrite pseudos");
1547}
1548
1549static unsigned getIndirectSGPRWriteMovRelPseudo64(unsigned VecSize) {
1550 if (VecSize <= 64) // 8 bytes
1551 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V1;
1552 if (VecSize <= 128) // 16 bytes
1553 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V2;
1554 if (VecSize <= 256) // 32 bytes
1555 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V4;
1556 if (VecSize <= 512) // 64 bytes
1557 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V8;
1558 if (VecSize <= 1024) // 128 bytes
1559 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V16;
1560
1561 llvm_unreachable("unsupported size for IndirectRegWrite pseudos");
1562}
1563
1564const MCInstrDesc &
1565SIInstrInfo::getIndirectRegWriteMovRelPseudo(unsigned VecSize, unsigned EltSize,
1566 bool IsSGPR) const {
1567 if (IsSGPR) {
1568 switch (EltSize) {
1569 case 32:
1570 return get(Opcode: getIndirectSGPRWriteMovRelPseudo32(VecSize));
1571 case 64:
1572 return get(Opcode: getIndirectSGPRWriteMovRelPseudo64(VecSize));
1573 default:
1574 llvm_unreachable("invalid reg indexing elt size");
1575 }
1576 }
1577
1578 assert(EltSize == 32 && "invalid reg indexing elt size");
1579 return get(Opcode: getIndirectVGPRWriteMovRelPseudoOpc(VecSize));
1580}
1581
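// The spill helpers below select a save/restore pseudo from the spill size in
// bytes, covering 32-bit through 1024-bit registers.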
1582static unsigned getSGPRSpillSaveOpcode(unsigned Size) {
1583 switch (Size) {
1584 case 4:
1585 return AMDGPU::SI_SPILL_S32_SAVE;
1586 case 8:
1587 return AMDGPU::SI_SPILL_S64_SAVE;
1588 case 12:
1589 return AMDGPU::SI_SPILL_S96_SAVE;
1590 case 16:
1591 return AMDGPU::SI_SPILL_S128_SAVE;
1592 case 20:
1593 return AMDGPU::SI_SPILL_S160_SAVE;
1594 case 24:
1595 return AMDGPU::SI_SPILL_S192_SAVE;
1596 case 28:
1597 return AMDGPU::SI_SPILL_S224_SAVE;
1598 case 32:
1599 return AMDGPU::SI_SPILL_S256_SAVE;
1600 case 36:
1601 return AMDGPU::SI_SPILL_S288_SAVE;
1602 case 40:
1603 return AMDGPU::SI_SPILL_S320_SAVE;
1604 case 44:
1605 return AMDGPU::SI_SPILL_S352_SAVE;
1606 case 48:
1607 return AMDGPU::SI_SPILL_S384_SAVE;
1608 case 64:
1609 return AMDGPU::SI_SPILL_S512_SAVE;
1610 case 128:
1611 return AMDGPU::SI_SPILL_S1024_SAVE;
1612 default:
1613 llvm_unreachable("unknown register size");
1614 }
1615}
1616
1617static unsigned getVGPRSpillSaveOpcode(unsigned Size) {
1618 switch (Size) {
1619 case 2:
1620 return AMDGPU::SI_SPILL_V16_SAVE;
1621 case 4:
1622 return AMDGPU::SI_SPILL_V32_SAVE;
1623 case 8:
1624 return AMDGPU::SI_SPILL_V64_SAVE;
1625 case 12:
1626 return AMDGPU::SI_SPILL_V96_SAVE;
1627 case 16:
1628 return AMDGPU::SI_SPILL_V128_SAVE;
1629 case 20:
1630 return AMDGPU::SI_SPILL_V160_SAVE;
1631 case 24:
1632 return AMDGPU::SI_SPILL_V192_SAVE;
1633 case 28:
1634 return AMDGPU::SI_SPILL_V224_SAVE;
1635 case 32:
1636 return AMDGPU::SI_SPILL_V256_SAVE;
1637 case 36:
1638 return AMDGPU::SI_SPILL_V288_SAVE;
1639 case 40:
1640 return AMDGPU::SI_SPILL_V320_SAVE;
1641 case 44:
1642 return AMDGPU::SI_SPILL_V352_SAVE;
1643 case 48:
1644 return AMDGPU::SI_SPILL_V384_SAVE;
1645 case 64:
1646 return AMDGPU::SI_SPILL_V512_SAVE;
1647 case 128:
1648 return AMDGPU::SI_SPILL_V1024_SAVE;
1649 default:
1650 llvm_unreachable("unknown register size");
1651 }
1652}
1653
1654static unsigned getAVSpillSaveOpcode(unsigned Size) {
1655 switch (Size) {
1656 case 4:
1657 return AMDGPU::SI_SPILL_AV32_SAVE;
1658 case 8:
1659 return AMDGPU::SI_SPILL_AV64_SAVE;
1660 case 12:
1661 return AMDGPU::SI_SPILL_AV96_SAVE;
1662 case 16:
1663 return AMDGPU::SI_SPILL_AV128_SAVE;
1664 case 20:
1665 return AMDGPU::SI_SPILL_AV160_SAVE;
1666 case 24:
1667 return AMDGPU::SI_SPILL_AV192_SAVE;
1668 case 28:
1669 return AMDGPU::SI_SPILL_AV224_SAVE;
1670 case 32:
1671 return AMDGPU::SI_SPILL_AV256_SAVE;
1672 case 36:
1673 return AMDGPU::SI_SPILL_AV288_SAVE;
1674 case 40:
1675 return AMDGPU::SI_SPILL_AV320_SAVE;
1676 case 44:
1677 return AMDGPU::SI_SPILL_AV352_SAVE;
1678 case 48:
1679 return AMDGPU::SI_SPILL_AV384_SAVE;
1680 case 64:
1681 return AMDGPU::SI_SPILL_AV512_SAVE;
1682 case 128:
1683 return AMDGPU::SI_SPILL_AV1024_SAVE;
1684 default:
1685 llvm_unreachable("unknown register size");
1686 }
1687}
1688
1689static unsigned getWWMRegSpillSaveOpcode(unsigned Size,
1690 bool IsVectorSuperClass) {
1691  // Currently, only 32-bit WWM register spills are needed.
1692 if (Size != 4)
1693 llvm_unreachable("unknown wwm register spill size");
1694
1695 if (IsVectorSuperClass)
1696 return AMDGPU::SI_SPILL_WWM_AV32_SAVE;
1697
1698 return AMDGPU::SI_SPILL_WWM_V32_SAVE;
1699}
1700
1701unsigned SIInstrInfo::getVectorRegSpillSaveOpcode(
1702 Register Reg, const TargetRegisterClass *RC, unsigned Size,
1703 const SIMachineFunctionInfo &MFI) const {
1704 bool IsVectorSuperClass = RI.isVectorSuperClass(RC);
1705
1706 // Choose the right opcode if spilling a WWM register.
1707 if (MFI.checkFlag(Reg, Flag: AMDGPU::VirtRegFlag::WWM_REG))
1708 return getWWMRegSpillSaveOpcode(Size, IsVectorSuperClass);
1709
1710 // TODO: Check if AGPRs are available
1711 if (ST.hasMAIInsts())
1712 return getAVSpillSaveOpcode(Size);
1713
1714 return getVGPRSpillSaveOpcode(Size);
1715}
1716
1717void SIInstrInfo::storeRegToStackSlot(
1718 MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, Register SrcReg,
1719 bool isKill, int FrameIndex, const TargetRegisterClass *RC, Register VReg,
1720 MachineInstr::MIFlag Flags) const {
1721 MachineFunction *MF = MBB.getParent();
1722 SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
1723 MachineFrameInfo &FrameInfo = MF->getFrameInfo();
1724 const DebugLoc &DL = MBB.findDebugLoc(MBBI: MI);
1725
1726 MachinePointerInfo PtrInfo
1727 = MachinePointerInfo::getFixedStack(MF&: *MF, FI: FrameIndex);
1728 MachineMemOperand *MMO = MF->getMachineMemOperand(
1729 PtrInfo, F: MachineMemOperand::MOStore, Size: FrameInfo.getObjectSize(ObjectIdx: FrameIndex),
1730 BaseAlignment: FrameInfo.getObjectAlign(ObjectIdx: FrameIndex));
1731 unsigned SpillSize = RI.getSpillSize(RC: *RC);
1732
1733 MachineRegisterInfo &MRI = MF->getRegInfo();
1734 if (RI.isSGPRClass(RC)) {
1735 MFI->setHasSpilledSGPRs();
1736 assert(SrcReg != AMDGPU::M0 && "m0 should not be spilled");
1737 assert(SrcReg != AMDGPU::EXEC_LO && SrcReg != AMDGPU::EXEC_HI &&
1738 SrcReg != AMDGPU::EXEC && "exec should not be spilled");
1739
1740 // We are only allowed to create one new instruction when spilling
1741    // registers, so we need to use a pseudo instruction for spilling SGPRs.
1742 const MCInstrDesc &OpDesc = get(Opcode: getSGPRSpillSaveOpcode(Size: SpillSize));
1743
1744    // The SGPR spill/restore instructions only work on numbered SGPRs, so we
1745    // need to make sure we are using the correct register class.
1746 if (SrcReg.isVirtual() && SpillSize == 4) {
1747 MRI.constrainRegClass(Reg: SrcReg, RC: &AMDGPU::SReg_32_XM0_XEXECRegClass);
1748 }
1749
1750 BuildMI(BB&: MBB, I: MI, MIMD: DL, MCID: OpDesc)
1751 .addReg(RegNo: SrcReg, Flags: getKillRegState(B: isKill)) // data
1752 .addFrameIndex(Idx: FrameIndex) // addr
1753 .addMemOperand(MMO)
1754 .addReg(RegNo: MFI->getStackPtrOffsetReg(), Flags: RegState::Implicit);
1755
1756 if (RI.spillSGPRToVGPR())
1757 FrameInfo.setStackID(ObjectIdx: FrameIndex, ID: TargetStackID::SGPRSpill);
1758 return;
1759 }
1760
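  // Vector spills go through scratch-buffer pseudos; e.g. spilling a single
  // VGPR emits SI_SPILL_V32_SAVE (or an AV/WWM variant) with the data, the
  // frame index, the scratch offset register and a zero immediate offset.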
1761 unsigned Opcode =
1762 getVectorRegSpillSaveOpcode(Reg: VReg ? VReg : SrcReg, RC, Size: SpillSize, MFI: *MFI);
1763 MFI->setHasSpilledVGPRs();
1764
1765 BuildMI(BB&: MBB, I: MI, MIMD: DL, MCID: get(Opcode))
1766 .addReg(RegNo: SrcReg, Flags: getKillRegState(B: isKill)) // data
1767 .addFrameIndex(Idx: FrameIndex) // addr
1768 .addReg(RegNo: MFI->getStackPtrOffsetReg()) // scratch_offset
1769 .addImm(Val: 0) // offset
1770 .addMemOperand(MMO);
1771}
1772
1773static unsigned getSGPRSpillRestoreOpcode(unsigned Size) {
1774 switch (Size) {
1775 case 4:
1776 return AMDGPU::SI_SPILL_S32_RESTORE;
1777 case 8:
1778 return AMDGPU::SI_SPILL_S64_RESTORE;
1779 case 12:
1780 return AMDGPU::SI_SPILL_S96_RESTORE;
1781 case 16:
1782 return AMDGPU::SI_SPILL_S128_RESTORE;
1783 case 20:
1784 return AMDGPU::SI_SPILL_S160_RESTORE;
1785 case 24:
1786 return AMDGPU::SI_SPILL_S192_RESTORE;
1787 case 28:
1788 return AMDGPU::SI_SPILL_S224_RESTORE;
1789 case 32:
1790 return AMDGPU::SI_SPILL_S256_RESTORE;
1791 case 36:
1792 return AMDGPU::SI_SPILL_S288_RESTORE;
1793 case 40:
1794 return AMDGPU::SI_SPILL_S320_RESTORE;
1795 case 44:
1796 return AMDGPU::SI_SPILL_S352_RESTORE;
1797 case 48:
1798 return AMDGPU::SI_SPILL_S384_RESTORE;
1799 case 64:
1800 return AMDGPU::SI_SPILL_S512_RESTORE;
1801 case 128:
1802 return AMDGPU::SI_SPILL_S1024_RESTORE;
1803 default:
1804 llvm_unreachable("unknown register size");
1805 }
1806}
1807
1808static unsigned getVGPRSpillRestoreOpcode(unsigned Size) {
1809 switch (Size) {
1810 case 2:
1811 return AMDGPU::SI_SPILL_V16_RESTORE;
1812 case 4:
1813 return AMDGPU::SI_SPILL_V32_RESTORE;
1814 case 8:
1815 return AMDGPU::SI_SPILL_V64_RESTORE;
1816 case 12:
1817 return AMDGPU::SI_SPILL_V96_RESTORE;
1818 case 16:
1819 return AMDGPU::SI_SPILL_V128_RESTORE;
1820 case 20:
1821 return AMDGPU::SI_SPILL_V160_RESTORE;
1822 case 24:
1823 return AMDGPU::SI_SPILL_V192_RESTORE;
1824 case 28:
1825 return AMDGPU::SI_SPILL_V224_RESTORE;
1826 case 32:
1827 return AMDGPU::SI_SPILL_V256_RESTORE;
1828 case 36:
1829 return AMDGPU::SI_SPILL_V288_RESTORE;
1830 case 40:
1831 return AMDGPU::SI_SPILL_V320_RESTORE;
1832 case 44:
1833 return AMDGPU::SI_SPILL_V352_RESTORE;
1834 case 48:
1835 return AMDGPU::SI_SPILL_V384_RESTORE;
1836 case 64:
1837 return AMDGPU::SI_SPILL_V512_RESTORE;
1838 case 128:
1839 return AMDGPU::SI_SPILL_V1024_RESTORE;
1840 default:
1841 llvm_unreachable("unknown register size");
1842 }
1843}
1844
1845static unsigned getAVSpillRestoreOpcode(unsigned Size) {
1846 switch (Size) {
1847 case 4:
1848 return AMDGPU::SI_SPILL_AV32_RESTORE;
1849 case 8:
1850 return AMDGPU::SI_SPILL_AV64_RESTORE;
1851 case 12:
1852 return AMDGPU::SI_SPILL_AV96_RESTORE;
1853 case 16:
1854 return AMDGPU::SI_SPILL_AV128_RESTORE;
1855 case 20:
1856 return AMDGPU::SI_SPILL_AV160_RESTORE;
1857 case 24:
1858 return AMDGPU::SI_SPILL_AV192_RESTORE;
1859 case 28:
1860 return AMDGPU::SI_SPILL_AV224_RESTORE;
1861 case 32:
1862 return AMDGPU::SI_SPILL_AV256_RESTORE;
1863 case 36:
1864 return AMDGPU::SI_SPILL_AV288_RESTORE;
1865 case 40:
1866 return AMDGPU::SI_SPILL_AV320_RESTORE;
1867 case 44:
1868 return AMDGPU::SI_SPILL_AV352_RESTORE;
1869 case 48:
1870 return AMDGPU::SI_SPILL_AV384_RESTORE;
1871 case 64:
1872 return AMDGPU::SI_SPILL_AV512_RESTORE;
1873 case 128:
1874 return AMDGPU::SI_SPILL_AV1024_RESTORE;
1875 default:
1876 llvm_unreachable("unknown register size");
1877 }
1878}
1879
1880static unsigned getWWMRegSpillRestoreOpcode(unsigned Size,
1881 bool IsVectorSuperClass) {
1882  // Currently, only 32-bit WWM register spills are needed.
1883 if (Size != 4)
1884 llvm_unreachable("unknown wwm register spill size");
1885
1886 if (IsVectorSuperClass) // TODO: Always use this if there are AGPRs
1887 return AMDGPU::SI_SPILL_WWM_AV32_RESTORE;
1888
1889 return AMDGPU::SI_SPILL_WWM_V32_RESTORE;
1890}
1891
1892unsigned SIInstrInfo::getVectorRegSpillRestoreOpcode(
1893 Register Reg, const TargetRegisterClass *RC, unsigned Size,
1894 const SIMachineFunctionInfo &MFI) const {
1895 bool IsVectorSuperClass = RI.isVectorSuperClass(RC);
1896
1897 // Choose the right opcode if restoring a WWM register.
1898 if (MFI.checkFlag(Reg, Flag: AMDGPU::VirtRegFlag::WWM_REG))
1899 return getWWMRegSpillRestoreOpcode(Size, IsVectorSuperClass);
1900
1901 // TODO: Check if AGPRs are available
1902 if (ST.hasMAIInsts())
1903 return getAVSpillRestoreOpcode(Size);
1904
1905 assert(!RI.isAGPRClass(RC));
1906 return getVGPRSpillRestoreOpcode(Size);
1907}
1908
1909void SIInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB,
1910 MachineBasicBlock::iterator MI,
1911 Register DestReg, int FrameIndex,
1912 const TargetRegisterClass *RC,
1913 Register VReg, unsigned SubReg,
1914 MachineInstr::MIFlag Flags) const {
1915 MachineFunction *MF = MBB.getParent();
1916 SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
1917 MachineFrameInfo &FrameInfo = MF->getFrameInfo();
1918 const DebugLoc &DL = MBB.findDebugLoc(MBBI: MI);
1919 unsigned SpillSize = RI.getSpillSize(RC: *RC);
1920
1921 MachinePointerInfo PtrInfo
1922 = MachinePointerInfo::getFixedStack(MF&: *MF, FI: FrameIndex);
1923
1924 MachineMemOperand *MMO = MF->getMachineMemOperand(
1925 PtrInfo, F: MachineMemOperand::MOLoad, Size: FrameInfo.getObjectSize(ObjectIdx: FrameIndex),
1926 BaseAlignment: FrameInfo.getObjectAlign(ObjectIdx: FrameIndex));
1927
1928 if (RI.isSGPRClass(RC)) {
1929 MFI->setHasSpilledSGPRs();
1930 assert(DestReg != AMDGPU::M0 && "m0 should not be reloaded into");
1931 assert(DestReg != AMDGPU::EXEC_LO && DestReg != AMDGPU::EXEC_HI &&
1932 DestReg != AMDGPU::EXEC && "exec should not be spilled");
1933
1934 // FIXME: Maybe this should not include a memoperand because it will be
1935 // lowered to non-memory instructions.
1936 const MCInstrDesc &OpDesc = get(Opcode: getSGPRSpillRestoreOpcode(Size: SpillSize));
1937 if (DestReg.isVirtual() && SpillSize == 4) {
1938 MachineRegisterInfo &MRI = MF->getRegInfo();
1939 MRI.constrainRegClass(Reg: DestReg, RC: &AMDGPU::SReg_32_XM0_XEXECRegClass);
1940 }
1941
1942 if (RI.spillSGPRToVGPR())
1943 FrameInfo.setStackID(ObjectIdx: FrameIndex, ID: TargetStackID::SGPRSpill);
1944 BuildMI(BB&: MBB, I: MI, MIMD: DL, MCID: OpDesc, DestReg)
1945 .addFrameIndex(Idx: FrameIndex) // addr
1946 .addMemOperand(MMO)
1947 .addReg(RegNo: MFI->getStackPtrOffsetReg(), Flags: RegState::Implicit);
1948
1949 return;
1950 }
1951
1952 unsigned Opcode = getVectorRegSpillRestoreOpcode(Reg: VReg ? VReg : DestReg, RC,
1953 Size: SpillSize, MFI: *MFI);
1954 BuildMI(BB&: MBB, I: MI, MIMD: DL, MCID: get(Opcode), DestReg)
1955 .addFrameIndex(Idx: FrameIndex) // vaddr
1956 .addReg(RegNo: MFI->getStackPtrOffsetReg()) // scratch_offset
1957 .addImm(Val: 0) // offset
1958 .addMemOperand(MMO);
1959}
1960
1961void SIInstrInfo::insertNoop(MachineBasicBlock &MBB,
1962 MachineBasicBlock::iterator MI) const {
1963 insertNoops(MBB, MI, Quantity: 1);
1964}
1965
1966void SIInstrInfo::insertNoops(MachineBasicBlock &MBB,
1967 MachineBasicBlock::iterator MI,
1968 unsigned Quantity) const {
1969 DebugLoc DL = MBB.findDebugLoc(MBBI: MI);
1970 unsigned MaxSNopCount = 1u << ST.getSNopBits();
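  // Each S_NOP covers up to MaxSNopCount wait states (its immediate is the
  // count minus one), so emit maximal nops until Quantity is exhausted.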
1971 while (Quantity > 0) {
1972 unsigned Arg = std::min(a: Quantity, b: MaxSNopCount);
1973 Quantity -= Arg;
1974 BuildMI(BB&: MBB, I: MI, MIMD: DL, MCID: get(Opcode: AMDGPU::S_NOP)).addImm(Val: Arg - 1);
1975 }
1976}
1977
1978void SIInstrInfo::insertReturn(MachineBasicBlock &MBB) const {
1979 auto *MF = MBB.getParent();
1980 SIMachineFunctionInfo *Info = MF->getInfo<SIMachineFunctionInfo>();
1981
1982 assert(Info->isEntryFunction());
1983
1984 if (MBB.succ_empty()) {
1985 bool HasNoTerminator = MBB.getFirstTerminator() == MBB.end();
1986 if (HasNoTerminator) {
1987 if (Info->returnsVoid()) {
1988 BuildMI(BB&: MBB, I: MBB.end(), MIMD: DebugLoc(), MCID: get(Opcode: AMDGPU::S_ENDPGM)).addImm(Val: 0);
1989 } else {
1990 BuildMI(BB&: MBB, I: MBB.end(), MIMD: DebugLoc(), MCID: get(Opcode: AMDGPU::SI_RETURN_TO_EPILOG));
1991 }
1992 }
1993 }
1994}
1995
1996MachineBasicBlock *SIInstrInfo::insertSimulatedTrap(MachineRegisterInfo &MRI,
1997 MachineBasicBlock &MBB,
1998 MachineInstr &MI,
1999 const DebugLoc &DL) const {
2000 MachineFunction *MF = MBB.getParent();
2001 constexpr unsigned DoorbellIDMask = 0x3ff;
2002 constexpr unsigned ECQueueWaveAbort = 0x400;
2003
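  // Roughly: while EXEC is non-zero, branch to a trap block that issues
  // s_trap, signals the queue doorbell (with the wave-abort bit set) via
  // s_sendmsg, and then parks the wave in a self-branching s_sethalt loop.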
2004 MachineBasicBlock *TrapBB = &MBB;
2005 MachineBasicBlock *HaltLoopBB = MF->CreateMachineBasicBlock();
2006
2007 if (!MBB.succ_empty() || std::next(x: MI.getIterator()) != MBB.end()) {
2008 MBB.splitAt(SplitInst&: MI, /*UpdateLiveIns=*/false);
2009 TrapBB = MF->CreateMachineBasicBlock();
2010 BuildMI(BB&: MBB, I&: MI, MIMD: DL, MCID: get(Opcode: AMDGPU::S_CBRANCH_EXECNZ)).addMBB(MBB: TrapBB);
2011 MF->push_back(MBB: TrapBB);
2012 MBB.addSuccessor(Succ: TrapBB);
2013 }
2014  // Start with an `s_trap 2`; if we're in PRIV=1 and need the workaround, this
2015  // will be a no-op.
2016 BuildMI(BB&: *TrapBB, I: TrapBB->end(), MIMD: DL, MCID: get(Opcode: AMDGPU::S_TRAP))
2017 .addImm(Val: static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap));
2018 Register DoorbellReg = MRI.createVirtualRegister(RegClass: &AMDGPU::SReg_32RegClass);
2019 BuildMI(BB&: *TrapBB, I: TrapBB->end(), MIMD: DL, MCID: get(Opcode: AMDGPU::S_SENDMSG_RTN_B32),
2020 DestReg: DoorbellReg)
2021 .addImm(Val: AMDGPU::SendMsg::ID_RTN_GET_DOORBELL);
2022 BuildMI(BB&: *TrapBB, I: TrapBB->end(), MIMD: DL, MCID: get(Opcode: AMDGPU::S_MOV_B32), DestReg: AMDGPU::TTMP2)
2023 .addUse(RegNo: AMDGPU::M0);
2024 Register DoorbellRegMasked =
2025 MRI.createVirtualRegister(RegClass: &AMDGPU::SReg_32RegClass);
2026 BuildMI(BB&: *TrapBB, I: TrapBB->end(), MIMD: DL, MCID: get(Opcode: AMDGPU::S_AND_B32), DestReg: DoorbellRegMasked)
2027 .addUse(RegNo: DoorbellReg)
2028 .addImm(Val: DoorbellIDMask);
2029 Register SetWaveAbortBit =
2030 MRI.createVirtualRegister(RegClass: &AMDGPU::SReg_32RegClass);
2031 BuildMI(BB&: *TrapBB, I: TrapBB->end(), MIMD: DL, MCID: get(Opcode: AMDGPU::S_OR_B32), DestReg: SetWaveAbortBit)
2032 .addUse(RegNo: DoorbellRegMasked)
2033 .addImm(Val: ECQueueWaveAbort);
2034 BuildMI(BB&: *TrapBB, I: TrapBB->end(), MIMD: DL, MCID: get(Opcode: AMDGPU::S_MOV_B32), DestReg: AMDGPU::M0)
2035 .addUse(RegNo: SetWaveAbortBit);
2036 BuildMI(BB&: *TrapBB, I: TrapBB->end(), MIMD: DL, MCID: get(Opcode: AMDGPU::S_SENDMSG))
2037 .addImm(Val: AMDGPU::SendMsg::ID_INTERRUPT);
2038 BuildMI(BB&: *TrapBB, I: TrapBB->end(), MIMD: DL, MCID: get(Opcode: AMDGPU::S_MOV_B32), DestReg: AMDGPU::M0)
2039 .addUse(RegNo: AMDGPU::TTMP2);
2040 BuildMI(BB&: *TrapBB, I: TrapBB->end(), MIMD: DL, MCID: get(Opcode: AMDGPU::S_BRANCH)).addMBB(MBB: HaltLoopBB);
2041 TrapBB->addSuccessor(Succ: HaltLoopBB);
2042
2043 BuildMI(BB&: *HaltLoopBB, I: HaltLoopBB->end(), MIMD: DL, MCID: get(Opcode: AMDGPU::S_SETHALT)).addImm(Val: 5);
2044 BuildMI(BB&: *HaltLoopBB, I: HaltLoopBB->end(), MIMD: DL, MCID: get(Opcode: AMDGPU::S_BRANCH))
2045 .addMBB(MBB: HaltLoopBB);
2046 MF->push_back(MBB: HaltLoopBB);
2047 HaltLoopBB->addSuccessor(Succ: HaltLoopBB);
2048
2049 return MBB.getNextNode();
2050}
2051
2052unsigned SIInstrInfo::getNumWaitStates(const MachineInstr &MI) {
2053 switch (MI.getOpcode()) {
2054 default:
2055 if (MI.isMetaInstruction())
2056 return 0;
2057 return 1; // FIXME: Do wait states equal cycles?
2058
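  // An S_NOP with immediate N idles for N + 1 wait states.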
2059 case AMDGPU::S_NOP:
2060 return MI.getOperand(i: 0).getImm() + 1;
2061 // SI_RETURN_TO_EPILOG is a fallthrough to code outside of the function. The
2062  // hazard, even if one exists, won't really be visible. Should we handle it?
2063 }
2064}
2065
2066bool SIInstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
2067 MachineBasicBlock &MBB = *MI.getParent();
2068 DebugLoc DL = MBB.findDebugLoc(MBBI: MI);
2069 const AMDGPU::LaneMaskConstants &LMC = AMDGPU::LaneMaskConstants::get(ST);
2070 switch (MI.getOpcode()) {
2071 default: return TargetInstrInfo::expandPostRAPseudo(MI);
2072 case AMDGPU::S_MOV_B64_term:
2073 // This is only a terminator to get the correct spill code placement during
2074 // register allocation.
2075 MI.setDesc(get(Opcode: AMDGPU::S_MOV_B64));
2076 break;
2077
2078 case AMDGPU::S_MOV_B32_term:
2079 // This is only a terminator to get the correct spill code placement during
2080 // register allocation.
2081 MI.setDesc(get(Opcode: AMDGPU::S_MOV_B32));
2082 break;
2083
2084 case AMDGPU::S_XOR_B64_term:
2085 // This is only a terminator to get the correct spill code placement during
2086 // register allocation.
2087 MI.setDesc(get(Opcode: AMDGPU::S_XOR_B64));
2088 break;
2089
2090 case AMDGPU::S_XOR_B32_term:
2091 // This is only a terminator to get the correct spill code placement during
2092 // register allocation.
2093 MI.setDesc(get(Opcode: AMDGPU::S_XOR_B32));
2094 break;
2095 case AMDGPU::S_OR_B64_term:
2096 // This is only a terminator to get the correct spill code placement during
2097 // register allocation.
2098 MI.setDesc(get(Opcode: AMDGPU::S_OR_B64));
2099 break;
2100 case AMDGPU::S_OR_B32_term:
2101 // This is only a terminator to get the correct spill code placement during
2102 // register allocation.
2103 MI.setDesc(get(Opcode: AMDGPU::S_OR_B32));
2104 break;
2105
2106 case AMDGPU::S_ANDN2_B64_term:
2107 // This is only a terminator to get the correct spill code placement during
2108 // register allocation.
2109 MI.setDesc(get(Opcode: AMDGPU::S_ANDN2_B64));
2110 break;
2111
2112 case AMDGPU::S_ANDN2_B32_term:
2113 // This is only a terminator to get the correct spill code placement during
2114 // register allocation.
2115 MI.setDesc(get(Opcode: AMDGPU::S_ANDN2_B32));
2116 break;
2117
2118 case AMDGPU::S_AND_B64_term:
2119 // This is only a terminator to get the correct spill code placement during
2120 // register allocation.
2121 MI.setDesc(get(Opcode: AMDGPU::S_AND_B64));
2122 break;
2123
2124 case AMDGPU::S_AND_B32_term:
2125 // This is only a terminator to get the correct spill code placement during
2126 // register allocation.
2127 MI.setDesc(get(Opcode: AMDGPU::S_AND_B32));
2128 break;
2129
2130 case AMDGPU::S_AND_SAVEEXEC_B64_term:
2131 // This is only a terminator to get the correct spill code placement during
2132 // register allocation.
2133 MI.setDesc(get(Opcode: AMDGPU::S_AND_SAVEEXEC_B64));
2134 break;
2135
2136 case AMDGPU::S_AND_SAVEEXEC_B32_term:
2137 // This is only a terminator to get the correct spill code placement during
2138 // register allocation.
2139 MI.setDesc(get(Opcode: AMDGPU::S_AND_SAVEEXEC_B32));
2140 break;
2141
2142 case AMDGPU::SI_SPILL_S32_TO_VGPR:
2143 MI.setDesc(get(Opcode: AMDGPU::V_WRITELANE_B32));
2144 break;
2145
2146 case AMDGPU::SI_RESTORE_S32_FROM_VGPR:
2147 MI.setDesc(get(Opcode: AMDGPU::V_READLANE_B32));
2148 break;
2149 case AMDGPU::AV_MOV_B32_IMM_PSEUDO: {
2150 Register Dst = MI.getOperand(i: 0).getReg();
2151 bool IsAGPR = SIRegisterInfo::isAGPRClass(RC: RI.getPhysRegBaseClass(Reg: Dst));
2152 MI.setDesc(
2153 get(Opcode: IsAGPR ? AMDGPU::V_ACCVGPR_WRITE_B32_e64 : AMDGPU::V_MOV_B32_e32));
2154 break;
2155 }
2156 case AMDGPU::AV_MOV_B64_IMM_PSEUDO: {
2157 Register Dst = MI.getOperand(i: 0).getReg();
2158 if (SIRegisterInfo::isAGPRClass(RC: RI.getPhysRegBaseClass(Reg: Dst))) {
2159 int64_t Imm = MI.getOperand(i: 1).getImm();
2160
2161 Register DstLo = RI.getSubReg(Reg: Dst, Idx: AMDGPU::sub0);
2162 Register DstHi = RI.getSubReg(Reg: Dst, Idx: AMDGPU::sub1);
2163 BuildMI(BB&: MBB, I&: MI, MIMD: DL, MCID: get(Opcode: AMDGPU::V_ACCVGPR_WRITE_B32_e64), DestReg: DstLo)
2164 .addImm(Val: SignExtend64<32>(x: Imm))
2165 .addReg(RegNo: Dst, Flags: RegState::Implicit | RegState::Define);
2166 BuildMI(BB&: MBB, I&: MI, MIMD: DL, MCID: get(Opcode: AMDGPU::V_ACCVGPR_WRITE_B32_e64), DestReg: DstHi)
2167 .addImm(Val: SignExtend64<32>(x: Imm >> 32))
2168 .addReg(RegNo: Dst, Flags: RegState::Implicit | RegState::Define);
2169 MI.eraseFromParent();
2170 break;
2171 }
2172
2173 [[fallthrough]];
2174 }
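  // Without a usable v_mov_b64, a 64-bit move is split: targets with
  // v_pk_mov_b32 get a packed move (same inline constant in both halves, or a
  // non-AGPR register source); otherwise two v_mov_b32 write sub0 and sub1.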
2175 case AMDGPU::V_MOV_B64_PSEUDO: {
2176 Register Dst = MI.getOperand(i: 0).getReg();
2177 Register DstLo = RI.getSubReg(Reg: Dst, Idx: AMDGPU::sub0);
2178 Register DstHi = RI.getSubReg(Reg: Dst, Idx: AMDGPU::sub1);
2179
2180 const MCInstrDesc &Mov64Desc = get(Opcode: AMDGPU::V_MOV_B64_e32);
2181 const TargetRegisterClass *Mov64RC = getRegClass(MCID: Mov64Desc, /*OpNum=*/0);
2182
2183 const MachineOperand &SrcOp = MI.getOperand(i: 1);
2184 // FIXME: Will this work for 64-bit floating point immediates?
2185 assert(!SrcOp.isFPImm());
2186 if (ST.hasMovB64() && Mov64RC->contains(Reg: Dst)) {
2187 MI.setDesc(Mov64Desc);
2188 if (SrcOp.isReg() || isInlineConstant(MI, OpIdx: 1) ||
2189 isUInt<32>(x: SrcOp.getImm()) || ST.has64BitLiterals())
2190 break;
2191 }
2192 if (SrcOp.isImm()) {
2193 APInt Imm(64, SrcOp.getImm());
2194 APInt Lo(32, Imm.getLoBits(numBits: 32).getZExtValue());
2195 APInt Hi(32, Imm.getHiBits(numBits: 32).getZExtValue());
2196 const MCInstrDesc &PkMovDesc = get(Opcode: AMDGPU::V_PK_MOV_B32);
2197 const TargetRegisterClass *PkMovRC = getRegClass(MCID: PkMovDesc, /*OpNum=*/0);
2198
2199 if (ST.hasPkMovB32() && Lo == Hi && isInlineConstant(Imm: Lo) &&
2200 PkMovRC->contains(Reg: Dst)) {
2201 BuildMI(BB&: MBB, I&: MI, MIMD: DL, MCID: PkMovDesc, DestReg: Dst)
2202 .addImm(Val: SISrcMods::OP_SEL_1)
2203 .addImm(Val: Lo.getSExtValue())
2204 .addImm(Val: SISrcMods::OP_SEL_1)
2205 .addImm(Val: Lo.getSExtValue())
2206 .addImm(Val: 0) // op_sel_lo
2207 .addImm(Val: 0) // op_sel_hi
2208 .addImm(Val: 0) // neg_lo
2209 .addImm(Val: 0) // neg_hi
2210 .addImm(Val: 0); // clamp
2211 } else {
2212 BuildMI(BB&: MBB, I&: MI, MIMD: DL, MCID: get(Opcode: AMDGPU::V_MOV_B32_e32), DestReg: DstLo)
2213 .addImm(Val: Lo.getSExtValue())
2214 .addReg(RegNo: Dst, Flags: RegState::Implicit | RegState::Define);
2215 BuildMI(BB&: MBB, I&: MI, MIMD: DL, MCID: get(Opcode: AMDGPU::V_MOV_B32_e32), DestReg: DstHi)
2216 .addImm(Val: Hi.getSExtValue())
2217 .addReg(RegNo: Dst, Flags: RegState::Implicit | RegState::Define);
2218 }
2219 } else {
2220 assert(SrcOp.isReg());
2221 if (ST.hasPkMovB32() &&
2222 !RI.isAGPR(MRI: MBB.getParent()->getRegInfo(), Reg: SrcOp.getReg())) {
2223 BuildMI(BB&: MBB, I&: MI, MIMD: DL, MCID: get(Opcode: AMDGPU::V_PK_MOV_B32), DestReg: Dst)
2224 .addImm(Val: SISrcMods::OP_SEL_1) // src0_mod
2225 .addReg(RegNo: SrcOp.getReg())
2226 .addImm(Val: SISrcMods::OP_SEL_0 | SISrcMods::OP_SEL_1) // src1_mod
2227 .addReg(RegNo: SrcOp.getReg())
2228 .addImm(Val: 0) // op_sel_lo
2229 .addImm(Val: 0) // op_sel_hi
2230 .addImm(Val: 0) // neg_lo
2231 .addImm(Val: 0) // neg_hi
2232 .addImm(Val: 0); // clamp
2233 } else {
2234 BuildMI(BB&: MBB, I&: MI, MIMD: DL, MCID: get(Opcode: AMDGPU::V_MOV_B32_e32), DestReg: DstLo)
2235 .addReg(RegNo: RI.getSubReg(Reg: SrcOp.getReg(), Idx: AMDGPU::sub0))
2236 .addReg(RegNo: Dst, Flags: RegState::Implicit | RegState::Define);
2237 BuildMI(BB&: MBB, I&: MI, MIMD: DL, MCID: get(Opcode: AMDGPU::V_MOV_B32_e32), DestReg: DstHi)
2238 .addReg(RegNo: RI.getSubReg(Reg: SrcOp.getReg(), Idx: AMDGPU::sub1))
2239 .addReg(RegNo: Dst, Flags: RegState::Implicit | RegState::Define);
2240 }
2241 }
2242 MI.eraseFromParent();
2243 break;
2244 }
2245 case AMDGPU::V_MOV_B64_DPP_PSEUDO: {
2246 expandMovDPP64(MI);
2247 break;
2248 }
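  // e.g. without 64-bit literal support, 0x123456789 is neither a 32-bit nor
  // an inline value, so it becomes s_mov_b32 dst.sub0, 0x23456789 and
  // s_mov_b32 dst.sub1, 1.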
2249 case AMDGPU::S_MOV_B64_IMM_PSEUDO: {
2250 const MachineOperand &SrcOp = MI.getOperand(i: 1);
2251 assert(!SrcOp.isFPImm());
2252
2253 if (ST.has64BitLiterals()) {
2254 MI.setDesc(get(Opcode: AMDGPU::S_MOV_B64));
2255 break;
2256 }
2257
2258 APInt Imm(64, SrcOp.getImm());
2259 if (Imm.isIntN(N: 32) || isInlineConstant(Imm)) {
2260 MI.setDesc(get(Opcode: AMDGPU::S_MOV_B64));
2261 break;
2262 }
2263
2264 Register Dst = MI.getOperand(i: 0).getReg();
2265 Register DstLo = RI.getSubReg(Reg: Dst, Idx: AMDGPU::sub0);
2266 Register DstHi = RI.getSubReg(Reg: Dst, Idx: AMDGPU::sub1);
2267
2268 APInt Lo(32, Imm.getLoBits(numBits: 32).getZExtValue());
2269 APInt Hi(32, Imm.getHiBits(numBits: 32).getZExtValue());
2270 BuildMI(BB&: MBB, I&: MI, MIMD: DL, MCID: get(Opcode: AMDGPU::S_MOV_B32), DestReg: DstLo)
2271 .addImm(Val: Lo.getSExtValue())
2272 .addReg(RegNo: Dst, Flags: RegState::Implicit | RegState::Define);
2273 BuildMI(BB&: MBB, I&: MI, MIMD: DL, MCID: get(Opcode: AMDGPU::S_MOV_B32), DestReg: DstHi)
2274 .addImm(Val: Hi.getSExtValue())
2275 .addReg(RegNo: Dst, Flags: RegState::Implicit | RegState::Define);
2276 MI.eraseFromParent();
2277 break;
2278 }
2279 case AMDGPU::V_SET_INACTIVE_B32: {
2280 // Lower V_SET_INACTIVE_B32 to V_CNDMASK_B32.
2281 Register DstReg = MI.getOperand(i: 0).getReg();
2282 BuildMI(BB&: MBB, I&: MI, MIMD: DL, MCID: get(Opcode: AMDGPU::V_CNDMASK_B32_e64), DestReg: DstReg)
2283 .add(MO: MI.getOperand(i: 3))
2284 .add(MO: MI.getOperand(i: 4))
2285 .add(MO: MI.getOperand(i: 1))
2286 .add(MO: MI.getOperand(i: 2))
2287 .add(MO: MI.getOperand(i: 5));
2288 MI.eraseFromParent();
2289 break;
2290 }
2291 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V1:
2292 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V2:
2293 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V3:
2294 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V4:
2295 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V5:
2296 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V6:
2297 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V7:
2298 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V8:
2299 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V9:
2300 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V10:
2301 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V11:
2302 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V12:
2303 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V16:
2304 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V32:
2305 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V1:
2306 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V2:
2307 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V3:
2308 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V4:
2309 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V5:
2310 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V6:
2311 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V7:
2312 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V8:
2313 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V9:
2314 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V10:
2315 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V11:
2316 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V12:
2317 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V16:
2318 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V32:
2319 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V1:
2320 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V2:
2321 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V4:
2322 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V8:
2323 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V16: {
2324 const TargetRegisterClass *EltRC = getOpRegClass(MI, OpNo: 2);
2325
2326 unsigned Opc;
2327 if (RI.hasVGPRs(RC: EltRC)) {
2328 Opc = AMDGPU::V_MOVRELD_B32_e32;
2329 } else {
2330 Opc = RI.getRegSizeInBits(RC: *EltRC) == 64 ? AMDGPU::S_MOVRELD_B64
2331 : AMDGPU::S_MOVRELD_B32;
2332 }
2333
2334 const MCInstrDesc &OpDesc = get(Opcode: Opc);
2335 Register VecReg = MI.getOperand(i: 0).getReg();
2336 bool IsUndef = MI.getOperand(i: 1).isUndef();
2337 unsigned SubReg = MI.getOperand(i: 3).getImm();
2338 assert(VecReg == MI.getOperand(1).getReg());
2339
2340 MachineInstrBuilder MIB =
2341 BuildMI(BB&: MBB, I&: MI, MIMD: DL, MCID: OpDesc)
2342 .addReg(RegNo: RI.getSubReg(Reg: VecReg, Idx: SubReg), Flags: RegState::Undef)
2343 .add(MO: MI.getOperand(i: 2))
2344 .addReg(RegNo: VecReg, Flags: RegState::ImplicitDefine)
2345 .addReg(RegNo: VecReg, Flags: RegState::Implicit | getUndefRegState(B: IsUndef));
2346
2347 const int ImpDefIdx =
2348 OpDesc.getNumOperands() + OpDesc.implicit_uses().size();
2349 const int ImpUseIdx = ImpDefIdx + 1;
2350 MIB->tieOperands(DefIdx: ImpDefIdx, UseIdx: ImpUseIdx);
2351 MI.eraseFromParent();
2352 break;
2353 }
2354 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V1:
2355 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V2:
2356 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V3:
2357 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V4:
2358 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V5:
2359 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V6:
2360 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V7:
2361 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V8:
2362 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V9:
2363 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V10:
2364 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V11:
2365 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V12:
2366 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V16:
2367 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V32: {
2368 assert(ST.useVGPRIndexMode());
2369 Register VecReg = MI.getOperand(i: 0).getReg();
2370 bool IsUndef = MI.getOperand(i: 1).isUndef();
2371 MachineOperand &Idx = MI.getOperand(i: 3);
2372 Register SubReg = MI.getOperand(i: 4).getImm();
2373
2374 MachineInstr *SetOn = BuildMI(BB&: MBB, I&: MI, MIMD: DL, MCID: get(Opcode: AMDGPU::S_SET_GPR_IDX_ON))
2375 .add(MO: Idx)
2376 .addImm(Val: AMDGPU::VGPRIndexMode::DST_ENABLE);
2377 SetOn->getOperand(i: 3).setIsUndef();
2378
2379 const MCInstrDesc &OpDesc = get(Opcode: AMDGPU::V_MOV_B32_indirect_write);
2380 MachineInstrBuilder MIB =
2381 BuildMI(BB&: MBB, I&: MI, MIMD: DL, MCID: OpDesc)
2382 .addReg(RegNo: RI.getSubReg(Reg: VecReg, Idx: SubReg), Flags: RegState::Undef)
2383 .add(MO: MI.getOperand(i: 2))
2384 .addReg(RegNo: VecReg, Flags: RegState::ImplicitDefine)
2385 .addReg(RegNo: VecReg, Flags: RegState::Implicit | getUndefRegState(B: IsUndef));
2386
2387 const int ImpDefIdx =
2388 OpDesc.getNumOperands() + OpDesc.implicit_uses().size();
2389 const int ImpUseIdx = ImpDefIdx + 1;
2390 MIB->tieOperands(DefIdx: ImpDefIdx, UseIdx: ImpUseIdx);
2391
2392 MachineInstr *SetOff = BuildMI(BB&: MBB, I&: MI, MIMD: DL, MCID: get(Opcode: AMDGPU::S_SET_GPR_IDX_OFF));
2393
2394 finalizeBundle(MBB, FirstMI: SetOn->getIterator(), LastMI: std::next(x: SetOff->getIterator()));
2395
2396 MI.eraseFromParent();
2397 break;
2398 }
2399 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V1:
2400 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V2:
2401 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V3:
2402 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V4:
2403 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V5:
2404 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V6:
2405 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V7:
2406 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V8:
2407 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V9:
2408 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V10:
2409 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V11:
2410 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V12:
2411 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V16:
2412 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V32: {
2413 assert(ST.useVGPRIndexMode());
2414 Register Dst = MI.getOperand(i: 0).getReg();
2415 Register VecReg = MI.getOperand(i: 1).getReg();
2416 bool IsUndef = MI.getOperand(i: 1).isUndef();
2417 Register SubReg = MI.getOperand(i: 3).getImm();
2418
2419 MachineInstr *SetOn = BuildMI(BB&: MBB, I&: MI, MIMD: DL, MCID: get(Opcode: AMDGPU::S_SET_GPR_IDX_ON))
2420 .add(MO: MI.getOperand(i: 2))
2421 .addImm(Val: AMDGPU::VGPRIndexMode::SRC0_ENABLE);
2422 SetOn->getOperand(i: 3).setIsUndef();
2423
2424 BuildMI(BB&: MBB, I&: MI, MIMD: DL, MCID: get(Opcode: AMDGPU::V_MOV_B32_indirect_read))
2425 .addDef(RegNo: Dst)
2426 .addReg(RegNo: RI.getSubReg(Reg: VecReg, Idx: SubReg), Flags: RegState::Undef)
2427 .addReg(RegNo: VecReg, Flags: RegState::Implicit | getUndefRegState(B: IsUndef));
2428
2429 MachineInstr *SetOff = BuildMI(BB&: MBB, I&: MI, MIMD: DL, MCID: get(Opcode: AMDGPU::S_SET_GPR_IDX_OFF));
2430
2431 finalizeBundle(MBB, FirstMI: SetOn->getIterator(), LastMI: std::next(x: SetOff->getIterator()));
2432
2433 MI.eraseFromParent();
2434 break;
2435 }
2436 case AMDGPU::SI_PC_ADD_REL_OFFSET: {
2437 MachineFunction &MF = *MBB.getParent();
2438 Register Reg = MI.getOperand(i: 0).getReg();
2439 Register RegLo = RI.getSubReg(Reg, Idx: AMDGPU::sub0);
2440 Register RegHi = RI.getSubReg(Reg, Idx: AMDGPU::sub1);
2441 MachineOperand OpLo = MI.getOperand(i: 1);
2442 MachineOperand OpHi = MI.getOperand(i: 2);
2443
2444 // Create a bundle so these instructions won't be re-ordered by the
2445 // post-RA scheduler.
2446 MIBundleBuilder Bundler(MBB, MI);
2447 Bundler.append(MI: BuildMI(MF, MIMD: DL, MCID: get(Opcode: AMDGPU::S_GETPC_B64), DestReg: Reg));
2448
2449 // What we want here is an offset from the value returned by s_getpc (which
2450 // is the address of the s_add_u32 instruction) to the global variable, but
2451 // since the encoding of $symbol starts 4 bytes after the start of the
2452 // s_add_u32 instruction, we end up with an offset that is 4 bytes too
2453 // small. This requires us to add 4 to the global variable offset in order
2454 // to compute the correct address. Similarly for the s_addc_u32 instruction,
2455 // the encoding of $symbol starts 12 bytes after the start of the s_add_u32
2456 // instruction.
2457
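    // The emitted bundle looks roughly like:
    //   s_getpc_b64  s[N:N+1]
    //   s_add_u32    s[N], s[N], sym@rel32@lo+4
    //   s_addc_u32   s[N+1], s[N+1], sym@rel32@hi+12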
2458 int64_t Adjust = 0;
2459 if (ST.hasGetPCZeroExtension()) {
2460 // Fix up hardware that does not sign-extend the 48-bit PC value by
2461 // inserting: s_sext_i32_i16 reghi, reghi
2462 Bundler.append(
2463 MI: BuildMI(MF, MIMD: DL, MCID: get(Opcode: AMDGPU::S_SEXT_I32_I16), DestReg: RegHi).addReg(RegNo: RegHi));
2464 Adjust += 4;
2465 }
2466
2467 if (OpLo.isGlobal())
2468 OpLo.setOffset(OpLo.getOffset() + Adjust + 4);
2469 Bundler.append(
2470 MI: BuildMI(MF, MIMD: DL, MCID: get(Opcode: AMDGPU::S_ADD_U32), DestReg: RegLo).addReg(RegNo: RegLo).add(MO: OpLo));
2471
2472 if (OpHi.isGlobal())
2473 OpHi.setOffset(OpHi.getOffset() + Adjust + 12);
2474 Bundler.append(MI: BuildMI(MF, MIMD: DL, MCID: get(Opcode: AMDGPU::S_ADDC_U32), DestReg: RegHi)
2475 .addReg(RegNo: RegHi)
2476 .add(MO: OpHi));
2477
2478 finalizeBundle(MBB, FirstMI: Bundler.begin());
2479
2480 MI.eraseFromParent();
2481 break;
2482 }
2483 case AMDGPU::SI_PC_ADD_REL_OFFSET64: {
2484 MachineFunction &MF = *MBB.getParent();
2485 Register Reg = MI.getOperand(i: 0).getReg();
2486 MachineOperand Op = MI.getOperand(i: 1);
2487
2488 // Create a bundle so these instructions won't be re-ordered by the
2489 // post-RA scheduler.
2490 MIBundleBuilder Bundler(MBB, MI);
2491 Bundler.append(MI: BuildMI(MF, MIMD: DL, MCID: get(Opcode: AMDGPU::S_GETPC_B64), DestReg: Reg));
2492 if (Op.isGlobal())
2493 Op.setOffset(Op.getOffset() + 4);
2494 Bundler.append(
2495 MI: BuildMI(MF, MIMD: DL, MCID: get(Opcode: AMDGPU::S_ADD_U64), DestReg: Reg).addReg(RegNo: Reg).add(MO: Op));
2496
2497 finalizeBundle(MBB, FirstMI: Bundler.begin());
2498
2499 MI.eraseFromParent();
2500 break;
2501 }
2502 case AMDGPU::ENTER_STRICT_WWM: {
2503 // This only gets its own opcode so that SIPreAllocateWWMRegs can tell when
2504 // Whole Wave Mode is entered.
2505 MI.setDesc(get(Opcode: LMC.OrSaveExecOpc));
2506 break;
2507 }
2508 case AMDGPU::ENTER_STRICT_WQM: {
2509 // This only gets its own opcode so that SIPreAllocateWWMRegs can tell when
2510 // STRICT_WQM is entered.
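    // i.e. save the current EXEC mask into the destination, then switch EXEC
    // to the whole-quad-mode mask with s_wqm.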
2511 BuildMI(BB&: MBB, I&: MI, MIMD: DL, MCID: get(Opcode: LMC.MovOpc), DestReg: MI.getOperand(i: 0).getReg())
2512 .addReg(RegNo: LMC.ExecReg);
2513 BuildMI(BB&: MBB, I&: MI, MIMD: DL, MCID: get(Opcode: LMC.WQMOpc), DestReg: LMC.ExecReg).addReg(RegNo: LMC.ExecReg);
2514
2515 MI.eraseFromParent();
2516 break;
2517 }
2518 case AMDGPU::EXIT_STRICT_WWM:
2519 case AMDGPU::EXIT_STRICT_WQM: {
2520 // This only gets its own opcode so that SIPreAllocateWWMRegs can tell when
2521    // WWM/STRICT_WQM is exited.
2522 MI.setDesc(get(Opcode: LMC.MovOpc));
2523 break;
2524 }
2525 case AMDGPU::SI_RETURN: {
2526 const MachineFunction *MF = MBB.getParent();
2527 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
2528 const SIRegisterInfo *TRI = ST.getRegisterInfo();
2529 // Hiding the return address use with SI_RETURN may lead to extra kills in
2530    // the function and missing live-ins. We are fine in practice because
2531    // callee-saved register handling ensures the register value is restored
2532    // before RET, but we need the undef flag here to appease the MachineVerifier
2533 // liveness checks.
2534 MachineInstrBuilder MIB =
2535 BuildMI(BB&: MBB, I&: MI, MIMD: DL, MCID: get(Opcode: AMDGPU::S_SETPC_B64_return))
2536 .addReg(RegNo: TRI->getReturnAddressReg(MF: *MF), Flags: RegState::Undef);
2537
2538 MIB.copyImplicitOps(OtherMI: MI);
2539 MI.eraseFromParent();
2540 break;
2541 }
2542
2543 case AMDGPU::S_MUL_U64_U32_PSEUDO:
2544 case AMDGPU::S_MUL_I64_I32_PSEUDO:
2545 MI.setDesc(get(Opcode: AMDGPU::S_MUL_U64));
2546 break;
2547
2548 case AMDGPU::S_GETPC_B64_pseudo:
2549 MI.setDesc(get(Opcode: AMDGPU::S_GETPC_B64));
2550 if (ST.hasGetPCZeroExtension()) {
2551 Register Dst = MI.getOperand(i: 0).getReg();
2552 Register DstHi = RI.getSubReg(Reg: Dst, Idx: AMDGPU::sub1);
2553 // Fix up hardware that does not sign-extend the 48-bit PC value by
2554 // inserting: s_sext_i32_i16 dsthi, dsthi
2555 BuildMI(BB&: MBB, I: std::next(x: MI.getIterator()), MIMD: DL, MCID: get(Opcode: AMDGPU::S_SEXT_I32_I16),
2556 DestReg: DstHi)
2557 .addReg(RegNo: DstHi);
2558 }
2559 break;
2560
2561 case AMDGPU::V_MAX_BF16_PSEUDO_e64: {
2562 assert(ST.hasBF16PackedInsts());
2563 MI.setDesc(get(Opcode: AMDGPU::V_PK_MAX_NUM_BF16));
2564 MI.addOperand(Op: MachineOperand::CreateImm(Val: 0)); // op_sel
2565 MI.addOperand(Op: MachineOperand::CreateImm(Val: 0)); // neg_lo
2566 MI.addOperand(Op: MachineOperand::CreateImm(Val: 0)); // neg_hi
2567 auto Op0 = getNamedOperand(MI, OperandName: AMDGPU::OpName::src0_modifiers);
2568 Op0->setImm(Op0->getImm() | SISrcMods::OP_SEL_1);
2569 auto Op1 = getNamedOperand(MI, OperandName: AMDGPU::OpName::src1_modifiers);
2570 Op1->setImm(Op1->getImm() | SISrcMods::OP_SEL_1);
2571 break;
2572 }
2573
2574 case AMDGPU::GET_STACK_BASE:
2575 // The stack starts at offset 0 unless we need to reserve some space at the
2576 // bottom.
2577 if (ST.getFrameLowering()->mayReserveScratchForCWSR(MF: *MBB.getParent())) {
2578 // When CWSR is used in dynamic VGPR mode, the trap handler needs to save
2579 // some of the VGPRs. The size of the required scratch space has already
2580      // been computed by prolog/epilog insertion.
2581 const SIMachineFunctionInfo *MFI =
2582 MBB.getParent()->getInfo<SIMachineFunctionInfo>();
2583 unsigned VGPRSize = MFI->getScratchReservedForDynamicVGPRs();
2584 Register DestReg = MI.getOperand(i: 0).getReg();
2585 BuildMI(BB&: MBB, I&: MI, MIMD: DL, MCID: get(Opcode: AMDGPU::S_GETREG_B32), DestReg)
2586 .addImm(Val: AMDGPU::Hwreg::HwregEncoding::encode(
2587 Values: AMDGPU::Hwreg::ID_HW_ID2, Values: AMDGPU::Hwreg::OFFSET_ME_ID, Values: 2));
2588 // The MicroEngine ID is 0 for the graphics queue, and 1 or 2 for compute
2589 // (3 is unused, so we ignore it). Unfortunately, S_GETREG doesn't set
2590 // SCC, so we need to check for 0 manually.
2591 BuildMI(BB&: MBB, I&: MI, MIMD: DL, MCID: get(Opcode: AMDGPU::S_CMP_LG_U32)).addImm(Val: 0).addReg(RegNo: DestReg);
2592      // Change the implicit def of SCC into a use (but first clear the dead
2593      // flag if present); S_CMOVK_I32 reads SCC.
2594 MI.getOperand(i: MI.getNumExplicitOperands()).setIsDead(false);
2595 MI.getOperand(i: MI.getNumExplicitOperands()).setIsUse();
2596 MI.setDesc(get(Opcode: AMDGPU::S_CMOVK_I32));
2597 MI.addOperand(Op: MachineOperand::CreateImm(Val: VGPRSize));
2598 } else {
2599 MI.setDesc(get(Opcode: AMDGPU::S_MOV_B32));
2600 MI.addOperand(Op: MachineOperand::CreateImm(Val: 0));
2601 MI.removeOperand(
2602 OpNo: MI.getNumExplicitOperands()); // Drop implicit def of SCC.
2603 }
2604 break;
2605 }
2606
2607 return true;
2608}
2609
2610void SIInstrInfo::reMaterialize(MachineBasicBlock &MBB,
2611 MachineBasicBlock::iterator I, Register DestReg,
2612 unsigned SubIdx,
2613 const MachineInstr &Orig) const {
2614
2615  // Try shrinking the instruction to remat only the part needed for the
2616  // current context.
2617 // TODO: Handle more cases.
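  // For example, if the only user reads just a 128-bit subregister of an
  // S_LOAD_DWORDX16 result, a narrower S_LOAD_DWORDX4 of that slice (with the
  // offset bumped by the subreg's byte offset) is rematerialized instead.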
2618 unsigned Opcode = Orig.getOpcode();
2619 switch (Opcode) {
2620 case AMDGPU::S_LOAD_DWORDX16_IMM:
2621 case AMDGPU::S_LOAD_DWORDX8_IMM: {
2622 if (SubIdx != 0)
2623 break;
2624
2625 if (I == MBB.end())
2626 break;
2627
2628 if (I->isBundled())
2629 break;
2630
2631 // Look for a single use of the register that is also a subreg.
2632 Register RegToFind = Orig.getOperand(i: 0).getReg();
2633 MachineOperand *UseMO = nullptr;
2634 for (auto &CandMO : I->operands()) {
2635 if (!CandMO.isReg() || CandMO.getReg() != RegToFind || CandMO.isDef())
2636 continue;
2637 if (UseMO) {
2638 UseMO = nullptr;
2639 break;
2640 }
2641 UseMO = &CandMO;
2642 }
2643 if (!UseMO || UseMO->getSubReg() == AMDGPU::NoSubRegister)
2644 break;
2645
2646 unsigned Offset = RI.getSubRegIdxOffset(Idx: UseMO->getSubReg());
2647 unsigned SubregSize = RI.getSubRegIdxSize(Idx: UseMO->getSubReg());
2648
2649 MachineFunction *MF = MBB.getParent();
2650 MachineRegisterInfo &MRI = MF->getRegInfo();
2651 assert(MRI.use_nodbg_empty(DestReg) && "DestReg should have no users yet.");
2652
2653 unsigned NewOpcode = -1;
2654 if (SubregSize == 256)
2655 NewOpcode = AMDGPU::S_LOAD_DWORDX8_IMM;
2656 else if (SubregSize == 128)
2657 NewOpcode = AMDGPU::S_LOAD_DWORDX4_IMM;
2658 else
2659 break;
2660
2661 const MCInstrDesc &TID = get(Opcode: NewOpcode);
2662 const TargetRegisterClass *NewRC =
2663 RI.getAllocatableClass(RC: getRegClass(MCID: TID, OpNum: 0));
2664 MRI.setRegClass(Reg: DestReg, RC: NewRC);
2665
2666 UseMO->setReg(DestReg);
2667 UseMO->setSubReg(AMDGPU::NoSubRegister);
2668
2669 // Use a smaller load with the desired size, possibly with updated offset.
2670 MachineInstr *MI = MF->CloneMachineInstr(Orig: &Orig);
2671 MI->setDesc(TID);
2672 MI->getOperand(i: 0).setReg(DestReg);
2673 MI->getOperand(i: 0).setSubReg(AMDGPU::NoSubRegister);
2674 if (Offset) {
2675 MachineOperand *OffsetMO = getNamedOperand(MI&: *MI, OperandName: AMDGPU::OpName::offset);
2676 int64_t FinalOffset = OffsetMO->getImm() + Offset / 8;
2677 OffsetMO->setImm(FinalOffset);
2678 }
2679 SmallVector<MachineMemOperand *> NewMMOs;
2680 for (const MachineMemOperand *MemOp : Orig.memoperands())
2681 NewMMOs.push_back(Elt: MF->getMachineMemOperand(MMO: MemOp, PtrInfo: MemOp->getPointerInfo(),
2682 Size: SubregSize / 8));
2683 MI->setMemRefs(MF&: *MF, MemRefs: NewMMOs);
2684
2685 MBB.insert(I, MI);
2686 return;
2687 }
2688
2689 default:
2690 break;
2691 }
2692
2693 TargetInstrInfo::reMaterialize(MBB, MI: I, DestReg, SubIdx, Orig);
2694}
2695
2696std::pair<MachineInstr*, MachineInstr*>
2697SIInstrInfo::expandMovDPP64(MachineInstr &MI) const {
2698 assert (MI.getOpcode() == AMDGPU::V_MOV_B64_DPP_PSEUDO);
2699
2700 if (ST.hasMovB64() && ST.hasFeature(Feature: AMDGPU::FeatureDPALU_DPP) &&
2701 AMDGPU::isLegalDPALU_DPPControl(
2702 ST, DC: getNamedOperand(MI, OperandName: AMDGPU::OpName::dpp_ctrl)->getImm())) {
2703 MI.setDesc(get(Opcode: AMDGPU::V_MOV_B64_dpp));
2704 return std::pair(&MI, nullptr);
2705 }
2706
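  // Otherwise split into two v_mov_b32_dpp on sub0/sub1; for a virtual
  // destination the halves are rejoined with a REG_SEQUENCE.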
2707 MachineBasicBlock &MBB = *MI.getParent();
2708 DebugLoc DL = MBB.findDebugLoc(MBBI: MI);
2709 MachineFunction *MF = MBB.getParent();
2710 MachineRegisterInfo &MRI = MF->getRegInfo();
2711 Register Dst = MI.getOperand(i: 0).getReg();
2712 unsigned Part = 0;
2713 MachineInstr *Split[2];
2714
2715 for (auto Sub : { AMDGPU::sub0, AMDGPU::sub1 }) {
2716 auto MovDPP = BuildMI(BB&: MBB, I&: MI, MIMD: DL, MCID: get(Opcode: AMDGPU::V_MOV_B32_dpp));
2717 if (Dst.isPhysical()) {
2718 MovDPP.addDef(RegNo: RI.getSubReg(Reg: Dst, Idx: Sub));
2719 } else {
2720 assert(MRI.isSSA());
2721 auto Tmp = MRI.createVirtualRegister(RegClass: &AMDGPU::VGPR_32RegClass);
2722 MovDPP.addDef(RegNo: Tmp);
2723 }
2724
2725 for (unsigned I = 1; I <= 2; ++I) { // old and src operands.
2726 const MachineOperand &SrcOp = MI.getOperand(i: I);
2727 assert(!SrcOp.isFPImm());
2728 if (SrcOp.isImm()) {
2729 APInt Imm(64, SrcOp.getImm());
2730 Imm.ashrInPlace(ShiftAmt: Part * 32);
2731 MovDPP.addImm(Val: Imm.getLoBits(numBits: 32).getZExtValue());
2732 } else {
2733 assert(SrcOp.isReg());
2734 Register Src = SrcOp.getReg();
2735 if (Src.isPhysical())
2736 MovDPP.addReg(RegNo: RI.getSubReg(Reg: Src, Idx: Sub));
2737 else
2738 MovDPP.addReg(RegNo: Src, Flags: getUndefRegState(B: SrcOp.isUndef()), SubReg: Sub);
2739 }
2740 }
2741
2742 for (const MachineOperand &MO : llvm::drop_begin(RangeOrContainer: MI.explicit_operands(), N: 3))
2743 MovDPP.addImm(Val: MO.getImm());
2744
2745 Split[Part] = MovDPP;
2746 ++Part;
2747 }
2748
2749 if (Dst.isVirtual())
2750 BuildMI(BB&: MBB, I&: MI, MIMD: DL, MCID: get(Opcode: AMDGPU::REG_SEQUENCE), DestReg: Dst)
2751 .addReg(RegNo: Split[0]->getOperand(i: 0).getReg())
2752 .addImm(Val: AMDGPU::sub0)
2753 .addReg(RegNo: Split[1]->getOperand(i: 0).getReg())
2754 .addImm(Val: AMDGPU::sub1);
2755
2756 MI.eraseFromParent();
2757 return std::pair(Split[0], Split[1]);
2758}
2759
2760std::optional<DestSourcePair>
2761SIInstrInfo::isCopyInstrImpl(const MachineInstr &MI) const {
2762 if (MI.getOpcode() == AMDGPU::WWM_COPY)
2763 return DestSourcePair{MI.getOperand(i: 0), MI.getOperand(i: 1)};
2764
2765 return std::nullopt;
2766}
2767
2768bool SIInstrInfo::swapSourceModifiers(MachineInstr &MI, MachineOperand &Src0,
2769 AMDGPU::OpName Src0OpName,
2770 MachineOperand &Src1,
2771 AMDGPU::OpName Src1OpName) const {
2772 MachineOperand *Src0Mods = getNamedOperand(MI, OperandName: Src0OpName);
2773 if (!Src0Mods)
2774 return false;
2775
2776 MachineOperand *Src1Mods = getNamedOperand(MI, OperandName: Src1OpName);
2777 assert(Src1Mods &&
2778 "All commutable instructions have both src0 and src1 modifiers");
2779
2780 int Src0ModsVal = Src0Mods->getImm();
2781 int Src1ModsVal = Src1Mods->getImm();
2782
2783 Src1Mods->setImm(Src0ModsVal);
2784 Src0Mods->setImm(Src1ModsVal);
2785 return true;
2786}
2787
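// Exchange a register operand with an immediate/frame-index/global operand in
// place, preserving the register's flags and subreg index. Returns nullptr for
// operand kinds that cannot be swapped.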
2788static MachineInstr *swapRegAndNonRegOperand(MachineInstr &MI,
2789 MachineOperand &RegOp,
2790 MachineOperand &NonRegOp) {
2791 Register Reg = RegOp.getReg();
2792 unsigned SubReg = RegOp.getSubReg();
2793 bool IsKill = RegOp.isKill();
2794 bool IsDead = RegOp.isDead();
2795 bool IsUndef = RegOp.isUndef();
2796 bool IsDebug = RegOp.isDebug();
2797
2798 if (NonRegOp.isImm())
2799 RegOp.ChangeToImmediate(ImmVal: NonRegOp.getImm());
2800 else if (NonRegOp.isFI())
2801 RegOp.ChangeToFrameIndex(Idx: NonRegOp.getIndex());
2802 else if (NonRegOp.isGlobal()) {
2803 RegOp.ChangeToGA(GV: NonRegOp.getGlobal(), Offset: NonRegOp.getOffset(),
2804 TargetFlags: NonRegOp.getTargetFlags());
2805 } else
2806 return nullptr;
2807
2808 // Make sure we don't reinterpret a subreg index in the target flags.
2809 RegOp.setTargetFlags(NonRegOp.getTargetFlags());
2810
2811 NonRegOp.ChangeToRegister(Reg, isDef: false, isImp: false, isKill: IsKill, isDead: IsDead, isUndef: IsUndef, isDebug: IsDebug);
2812 NonRegOp.setSubReg(SubReg);
2813
2814 return &MI;
2815}
2816
2817static MachineInstr *swapImmOperands(MachineInstr &MI,
2818 MachineOperand &NonRegOp1,
2819 MachineOperand &NonRegOp2) {
2820 unsigned TargetFlags = NonRegOp1.getTargetFlags();
2821 int64_t NonRegVal = NonRegOp1.getImm();
2822
2823 NonRegOp1.setImm(NonRegOp2.getImm());
2824 NonRegOp2.setImm(NonRegVal);
2825 NonRegOp1.setTargetFlags(NonRegOp2.getTargetFlags());
2826 NonRegOp2.setTargetFlags(TargetFlags);
2827 return &MI;
2828}
2829
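// Return true if the operands at \p OpIdx0 and \p OpIdx1 may be exchanged
// without creating an illegal operand, e.g. a literal outside src0 on
// pre-gfx10 VALU instructions or a register in a slot whose class rejects it.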
2830bool SIInstrInfo::isLegalToSwap(const MachineInstr &MI, unsigned OpIdx0,
2831 unsigned OpIdx1) const {
2832 const MCInstrDesc &InstDesc = MI.getDesc();
2833 const MCOperandInfo &OpInfo0 = InstDesc.operands()[OpIdx0];
2834 const MCOperandInfo &OpInfo1 = InstDesc.operands()[OpIdx1];
2835
2836 unsigned Opc = MI.getOpcode();
2837 int Src0Idx = AMDGPU::getNamedOperandIdx(Opcode: Opc, Name: AMDGPU::OpName::src0);
2838
2839 const MachineOperand &MO0 = MI.getOperand(i: OpIdx0);
2840 const MachineOperand &MO1 = MI.getOperand(i: OpIdx1);
2841
2842  // The swap must not breach constant bus or literal limits.
2843  // It may move a literal to a position other than src0, which is not allowed
2844  // pre-gfx10. However, most test cases need literals in Src0 for VOP.
2845  // FIXME: After gfx9, a literal can be in a position other than Src0.
2846 if (isVALU(MI)) {
2847 if ((int)OpIdx0 == Src0Idx && !MO0.isReg() &&
2848 !isInlineConstant(MO: MO0, OpInfo: OpInfo1))
2849 return false;
2850 if ((int)OpIdx1 == Src0Idx && !MO1.isReg() &&
2851 !isInlineConstant(MO: MO1, OpInfo: OpInfo0))
2852 return false;
2853 }
2854
2855 if ((int)OpIdx1 != Src0Idx && MO0.isReg()) {
2856 if (OpInfo1.RegClass == -1)
2857 return OpInfo1.OperandType == MCOI::OPERAND_UNKNOWN;
2858 return isLegalRegOperand(MI, OpIdx: OpIdx1, MO: MO0) &&
2859 (!MO1.isReg() || isLegalRegOperand(MI, OpIdx: OpIdx0, MO: MO1));
2860 }
2861 if ((int)OpIdx0 != Src0Idx && MO1.isReg()) {
2862 if (OpInfo0.RegClass == -1)
2863 return OpInfo0.OperandType == MCOI::OPERAND_UNKNOWN;
2864 return (!MO0.isReg() || isLegalRegOperand(MI, OpIdx: OpIdx1, MO: MO0)) &&
2865 isLegalRegOperand(MI, OpIdx: OpIdx0, MO: MO1);
2866 }
2867
2868 // No need to check 64-bit literals since swapping does not bring new
2869  // 64-bit literals into the current instruction to fold to 32-bit.
2870
2871 return isImmOperandLegal(MI, OpNo: OpIdx1, MO: MO0);
2872}
2873
2874MachineInstr *SIInstrInfo::commuteInstructionImpl(MachineInstr &MI, bool NewMI,
2875 unsigned Src0Idx,
2876 unsigned Src1Idx) const {
2877 assert(!NewMI && "this should never be used");
2878
2879 unsigned Opc = MI.getOpcode();
2880 int CommutedOpcode = commuteOpcode(Opcode: Opc);
2881 if (CommutedOpcode == -1)
2882 return nullptr;
2883
2884 if (Src0Idx > Src1Idx)
2885 std::swap(a&: Src0Idx, b&: Src1Idx);
2886
2887 assert(AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0) ==
2888 static_cast<int>(Src0Idx) &&
2889 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1) ==
2890 static_cast<int>(Src1Idx) &&
2891 "inconsistency with findCommutedOpIndices");
2892
2893 if (!isLegalToSwap(MI, OpIdx0: Src0Idx, OpIdx1: Src1Idx))
2894 return nullptr;
2895
2896 MachineInstr *CommutedMI = nullptr;
2897 MachineOperand &Src0 = MI.getOperand(i: Src0Idx);
2898 MachineOperand &Src1 = MI.getOperand(i: Src1Idx);
2899 if (Src0.isReg() && Src1.isReg()) {
2900 // Be sure to copy the source modifiers to the right place.
2901 CommutedMI =
2902 TargetInstrInfo::commuteInstructionImpl(MI, NewMI, OpIdx1: Src0Idx, OpIdx2: Src1Idx);
2903 } else if (Src0.isReg() && !Src1.isReg()) {
2904 CommutedMI = swapRegAndNonRegOperand(MI, RegOp&: Src0, NonRegOp&: Src1);
2905 } else if (!Src0.isReg() && Src1.isReg()) {
2906 CommutedMI = swapRegAndNonRegOperand(MI, RegOp&: Src1, NonRegOp&: Src0);
2907 } else if (Src0.isImm() && Src1.isImm()) {
2908 CommutedMI = swapImmOperands(MI, NonRegOp1&: Src0, NonRegOp2&: Src1);
2909 } else {
    // FIXME: Found two non-register operands to commute. This does happen.
2911 return nullptr;
2912 }
2913
2914 if (CommutedMI) {
2915 swapSourceModifiers(MI, Src0, Src0OpName: AMDGPU::OpName::src0_modifiers,
2916 Src1, Src1OpName: AMDGPU::OpName::src1_modifiers);
2917
2918 swapSourceModifiers(MI, Src0, Src0OpName: AMDGPU::OpName::src0_sel, Src1,
2919 Src1OpName: AMDGPU::OpName::src1_sel);
2920
2921 CommutedMI->setDesc(get(Opcode: CommutedOpcode));
2922 }
2923
2924 return CommutedMI;
2925}
2926
// This override is needed because source modifier operands may be inserted
// between the true commutable operands, and the base
// TargetInstrInfo::commuteInstruction relies on this hook.
2930bool SIInstrInfo::findCommutedOpIndices(const MachineInstr &MI,
2931 unsigned &SrcOpIdx0,
2932 unsigned &SrcOpIdx1) const {
2933 return findCommutedOpIndices(Desc: MI.getDesc(), SrcOpIdx0, SrcOpIdx1);
2934}
2935
2936bool SIInstrInfo::findCommutedOpIndices(const MCInstrDesc &Desc,
2937 unsigned &SrcOpIdx0,
2938 unsigned &SrcOpIdx1) const {
2939 if (!Desc.isCommutable())
2940 return false;
2941
2942 unsigned Opc = Desc.getOpcode();
2943 int Src0Idx = AMDGPU::getNamedOperandIdx(Opcode: Opc, Name: AMDGPU::OpName::src0);
2944 if (Src0Idx == -1)
2945 return false;
2946
2947 int Src1Idx = AMDGPU::getNamedOperandIdx(Opcode: Opc, Name: AMDGPU::OpName::src1);
2948 if (Src1Idx == -1)
2949 return false;
2950
2951 return fixCommutedOpIndices(ResultIdx1&: SrcOpIdx0, ResultIdx2&: SrcOpIdx1, CommutableOpIdx1: Src0Idx, CommutableOpIdx2: Src1Idx);
2952}
2953
2954bool SIInstrInfo::isBranchOffsetInRange(unsigned BranchOp,
2955 int64_t BrOffset) const {
  // BranchRelaxation should never have to check s_setpc_b64 or s_add_pc_i64
  // because their destination blocks are unanalyzable.
2958 assert(isSOPP(BranchOp) || isSOPK(BranchOp));
2959
2960 // Convert to dwords.
2961 BrOffset /= 4;
2962
2963 // The branch instructions do PC += signext(SIMM16 * 4) + 4, so the offset is
2964 // from the next instruction.
2965 BrOffset -= 1;
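  // For example, with the default 16-bit SIMM16 this accepts offsets of up to
  // roughly +/-32K dwords (about +/-128 KiB) around the next instruction.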
2966
2967 return isIntN(N: BranchOffsetBits, x: BrOffset);
2968}
2969
2970MachineBasicBlock *
2971SIInstrInfo::getBranchDestBlock(const MachineInstr &MI) const {
2972 return MI.getOperand(i: 0).getMBB();
2973}
2974
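// SI_IF, SI_ELSE and SI_LOOP terminators branch based on per-lane condition
// values, so a block ending in one of them has a divergent branch.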
2975bool SIInstrInfo::hasDivergentBranch(const MachineBasicBlock *MBB) const {
2976 for (const MachineInstr &MI : MBB->terminators()) {
2977 if (MI.getOpcode() == AMDGPU::SI_IF || MI.getOpcode() == AMDGPU::SI_ELSE ||
2978 MI.getOpcode() == AMDGPU::SI_LOOP)
2979 return true;
2980 }
2981 return false;
2982}
2983
2984void SIInstrInfo::insertIndirectBranch(MachineBasicBlock &MBB,
2985 MachineBasicBlock &DestBB,
2986 MachineBasicBlock &RestoreBB,
2987 const DebugLoc &DL, int64_t BrOffset,
2988 RegScavenger *RS) const {
2989 assert(MBB.empty() &&
2990 "new block should be inserted for expanding unconditional branch");
2991 assert(MBB.pred_size() == 1);
2992 assert(RestoreBB.empty() &&
2993 "restore block should be inserted for restoring clobbered registers");
2994
2995 MachineFunction *MF = MBB.getParent();
2996 MachineRegisterInfo &MRI = MF->getRegInfo();
2997 const SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
2998 auto I = MBB.end();
2999 auto &MCCtx = MF->getContext();
3000
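  // On subtargets with s_add_pc_i64 the long branch is a single instruction;
  // the 64-bit offset (DestBB minus the address following s_add_pc_i64) is
  // emitted as a deferred symbol difference.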
3001 if (ST.useAddPC64Inst()) {
3002 MCSymbol *Offset =
3003 MCCtx.createTempSymbol(Name: "offset", /*AlwaysAddSuffix=*/true);
3004 auto AddPC = BuildMI(BB&: MBB, I, MIMD: DL, MCID: get(Opcode: AMDGPU::S_ADD_PC_I64))
3005 .addSym(Sym: Offset, TargetFlags: MO_FAR_BRANCH_OFFSET);
3006 MCSymbol *PostAddPCLabel =
3007 MCCtx.createTempSymbol(Name: "post_addpc", /*AlwaysAddSuffix=*/true);
3008 AddPC->setPostInstrSymbol(MF&: *MF, Symbol: PostAddPCLabel);
3009 auto *OffsetExpr = MCBinaryExpr::createSub(
3010 LHS: MCSymbolRefExpr::create(Symbol: DestBB.getSymbol(), Ctx&: MCCtx),
3011 RHS: MCSymbolRefExpr::create(Symbol: PostAddPCLabel, Ctx&: MCCtx), Ctx&: MCCtx);
3012 Offset->setVariableValue(OffsetExpr);
3013 return;
3014 }
3015
3016 assert(RS && "RegScavenger required for long branching");
3017
3018 // FIXME: Virtual register workaround for RegScavenger not working with empty
3019 // blocks.
3020 Register PCReg = MRI.createVirtualRegister(RegClass: &AMDGPU::SReg_64RegClass);
3021
  // Note: as this runs after the hazard recognizer, we need to apply some
  // hazard workarounds directly.
3024 const bool FlushSGPRWrites = (ST.isWave64() && ST.hasVALUMaskWriteHazard()) ||
3025 ST.hasVALUReadSGPRHazard();
3026 auto ApplyHazardWorkarounds = [this, &MBB, &I, &DL, FlushSGPRWrites]() {
3027 if (FlushSGPRWrites)
3028 BuildMI(BB&: MBB, I, MIMD: DL, MCID: get(Opcode: AMDGPU::S_WAITCNT_DEPCTR))
3029 .addImm(Val: AMDGPU::DepCtr::encodeFieldSaSdst(SaSdst: 0, STI: ST));
3030 };
3031
  // We need to compute the offset relative to the instruction immediately after
  // s_getpc_b64. Insert the PC arithmetic code before the last terminator.
3034 MachineInstr *GetPC = BuildMI(BB&: MBB, I, MIMD: DL, MCID: get(Opcode: AMDGPU::S_GETPC_B64), DestReg: PCReg);
3035 ApplyHazardWorkarounds();
3036
3037 MCSymbol *PostGetPCLabel =
3038 MCCtx.createTempSymbol(Name: "post_getpc", /*AlwaysAddSuffix=*/true);
3039 GetPC->setPostInstrSymbol(MF&: *MF, Symbol: PostGetPCLabel);
3040
3041 MCSymbol *OffsetLo =
3042 MCCtx.createTempSymbol(Name: "offset_lo", /*AlwaysAddSuffix=*/true);
3043 MCSymbol *OffsetHi =
3044 MCCtx.createTempSymbol(Name: "offset_hi", /*AlwaysAddSuffix=*/true);
3045 BuildMI(BB&: MBB, I, MIMD: DL, MCID: get(Opcode: AMDGPU::S_ADD_U32))
3046 .addReg(RegNo: PCReg, Flags: RegState::Define, SubReg: AMDGPU::sub0)
3047 .addReg(RegNo: PCReg, Flags: {}, SubReg: AMDGPU::sub0)
3048 .addSym(Sym: OffsetLo, TargetFlags: MO_FAR_BRANCH_OFFSET);
3049 BuildMI(BB&: MBB, I, MIMD: DL, MCID: get(Opcode: AMDGPU::S_ADDC_U32))
3050 .addReg(RegNo: PCReg, Flags: RegState::Define, SubReg: AMDGPU::sub1)
3051 .addReg(RegNo: PCReg, Flags: {}, SubReg: AMDGPU::sub1)
3052 .addSym(Sym: OffsetHi, TargetFlags: MO_FAR_BRANCH_OFFSET);
3053 ApplyHazardWorkarounds();
3054
3055 // Insert the indirect branch after the other terminator.
3056 BuildMI(BB: &MBB, MIMD: DL, MCID: get(Opcode: AMDGPU::S_SETPC_B64))
3057 .addReg(RegNo: PCReg);
3058
3059 // If a spill is needed for the pc register pair, we need to insert a spill
3060 // restore block right before the destination block, and insert a short branch
3061 // into the old destination block's fallthrough predecessor.
3062 // e.g.:
3063 //
3064 // s_cbranch_scc0 skip_long_branch:
3065 //
3066 // long_branch_bb:
3067 // spill s[8:9]
3068 // s_getpc_b64 s[8:9]
3069 // s_add_u32 s8, s8, restore_bb
3070 // s_addc_u32 s9, s9, 0
3071 // s_setpc_b64 s[8:9]
3072 //
3073 // skip_long_branch:
3074 // foo;
3075 //
3076 // .....
3077 //
3078 // dest_bb_fallthrough_predecessor:
3079 // bar;
3080 // s_branch dest_bb
3081 //
3082 // restore_bb:
3083 // restore s[8:9]
3084 // fallthrough dest_bb
  //
3086 // dest_bb:
3087 // buzz;
3088
3089 Register LongBranchReservedReg = MFI->getLongBranchReservedReg();
3090 Register Scav;
3091
  // If we've previously reserved a register for long branches, avoid running
  // the scavenger and just use that register.
3094 if (LongBranchReservedReg) {
3095 RS->enterBasicBlock(MBB);
3096 Scav = LongBranchReservedReg;
3097 } else {
3098 RS->enterBasicBlockEnd(MBB);
3099 Scav = RS->scavengeRegisterBackwards(
3100 RC: AMDGPU::SReg_64RegClass, To: MachineBasicBlock::iterator(GetPC),
3101 /* RestoreAfter */ false, SPAdj: 0, /* AllowSpill */ false);
3102 }
3103 if (Scav) {
3104 RS->setRegUsed(Reg: Scav);
3105 MRI.replaceRegWith(FromReg: PCReg, ToReg: Scav);
3106 MRI.clearVirtRegs();
3107 } else {
    // Spilling an SGPR requires a VGPR, so we reuse the slot of the temporary
    // VGPR for the SGPR spill.
3110 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
3111 const SIRegisterInfo *TRI = ST.getRegisterInfo();
3112 TRI->spillEmergencySGPR(MI: GetPC, RestoreMBB&: RestoreBB, SGPR: AMDGPU::SGPR0_SGPR1, RS);
3113 MRI.replaceRegWith(FromReg: PCReg, ToReg: AMDGPU::SGPR0_SGPR1);
3114 MRI.clearVirtRegs();
3115 }
3116
3117 MCSymbol *DestLabel = Scav ? DestBB.getSymbol() : RestoreBB.getSymbol();
  // Now the distance can be defined.
3119 auto *Offset = MCBinaryExpr::createSub(
3120 LHS: MCSymbolRefExpr::create(Symbol: DestLabel, Ctx&: MCCtx),
3121 RHS: MCSymbolRefExpr::create(Symbol: PostGetPCLabel, Ctx&: MCCtx), Ctx&: MCCtx);
3122 // Add offset assignments.
3123 auto *Mask = MCConstantExpr::create(Value: 0xFFFFFFFFULL, Ctx&: MCCtx);
3124 OffsetLo->setVariableValue(MCBinaryExpr::createAnd(LHS: Offset, RHS: Mask, Ctx&: MCCtx));
3125 auto *ShAmt = MCConstantExpr::create(Value: 32, Ctx&: MCCtx);
3126 OffsetHi->setVariableValue(MCBinaryExpr::createAShr(LHS: Offset, RHS: ShAmt, Ctx&: MCCtx));
3127}
3128
3129unsigned SIInstrInfo::getBranchOpcode(SIInstrInfo::BranchPredicate Cond) {
3130 switch (Cond) {
3131 case SIInstrInfo::SCC_TRUE:
3132 return AMDGPU::S_CBRANCH_SCC1;
3133 case SIInstrInfo::SCC_FALSE:
3134 return AMDGPU::S_CBRANCH_SCC0;
3135 case SIInstrInfo::VCCNZ:
3136 return AMDGPU::S_CBRANCH_VCCNZ;
3137 case SIInstrInfo::VCCZ:
3138 return AMDGPU::S_CBRANCH_VCCZ;
3139 case SIInstrInfo::EXECNZ:
3140 return AMDGPU::S_CBRANCH_EXECNZ;
3141 case SIInstrInfo::EXECZ:
3142 return AMDGPU::S_CBRANCH_EXECZ;
3143 default:
3144 llvm_unreachable("invalid branch predicate");
3145 }
3146}
3147
3148SIInstrInfo::BranchPredicate SIInstrInfo::getBranchPredicate(unsigned Opcode) {
3149 switch (Opcode) {
3150 case AMDGPU::S_CBRANCH_SCC0:
3151 return SCC_FALSE;
3152 case AMDGPU::S_CBRANCH_SCC1:
3153 return SCC_TRUE;
3154 case AMDGPU::S_CBRANCH_VCCNZ:
3155 return VCCNZ;
3156 case AMDGPU::S_CBRANCH_VCCZ:
3157 return VCCZ;
3158 case AMDGPU::S_CBRANCH_EXECNZ:
3159 return EXECNZ;
3160 case AMDGPU::S_CBRANCH_EXECZ:
3161 return EXECZ;
3162 default:
3163 return INVALID_BR;
3164 }
3165}
3166
3167bool SIInstrInfo::analyzeBranchImpl(MachineBasicBlock &MBB,
3168 MachineBasicBlock::iterator I,
3169 MachineBasicBlock *&TBB,
3170 MachineBasicBlock *&FBB,
3171 SmallVectorImpl<MachineOperand> &Cond,
3172 bool AllowModify) const {
3173 if (I->getOpcode() == AMDGPU::S_BRANCH) {
3174 // Unconditional Branch
3175 TBB = I->getOperand(i: 0).getMBB();
3176 return false;
3177 }
3178
3179 BranchPredicate Pred = getBranchPredicate(Opcode: I->getOpcode());
3180 if (Pred == INVALID_BR)
3181 return true;
3182
3183 MachineBasicBlock *CondBB = I->getOperand(i: 0).getMBB();
3184 Cond.push_back(Elt: MachineOperand::CreateImm(Val: Pred));
3185 Cond.push_back(Elt: I->getOperand(i: 1)); // Save the branch register.
3186
3187 ++I;
3188
3189 if (I == MBB.end()) {
3190 // Conditional branch followed by fall-through.
3191 TBB = CondBB;
3192 return false;
3193 }
3194
3195 if (I->getOpcode() == AMDGPU::S_BRANCH) {
3196 TBB = CondBB;
3197 FBB = I->getOperand(i: 0).getMBB();
3198 return false;
3199 }
3200
3201 return true;
3202}
3203
3204bool SIInstrInfo::analyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB,
3205 MachineBasicBlock *&FBB,
3206 SmallVectorImpl<MachineOperand> &Cond,
3207 bool AllowModify) const {
3208 MachineBasicBlock::iterator I = MBB.getFirstTerminator();
3209 auto E = MBB.end();
3210 if (I == E)
3211 return false;
3212
  // Skip over instructions that are artificial terminators for special exec
  // management.
3215 while (I != E && !I->isBranch() && !I->isReturn()) {
3216 switch (I->getOpcode()) {
3217 case AMDGPU::S_MOV_B64_term:
3218 case AMDGPU::S_XOR_B64_term:
3219 case AMDGPU::S_OR_B64_term:
3220 case AMDGPU::S_ANDN2_B64_term:
3221 case AMDGPU::S_AND_B64_term:
3222 case AMDGPU::S_AND_SAVEEXEC_B64_term:
3223 case AMDGPU::S_MOV_B32_term:
3224 case AMDGPU::S_XOR_B32_term:
3225 case AMDGPU::S_OR_B32_term:
3226 case AMDGPU::S_ANDN2_B32_term:
3227 case AMDGPU::S_AND_B32_term:
3228 case AMDGPU::S_AND_SAVEEXEC_B32_term:
3229 break;
3230 case AMDGPU::SI_IF:
3231 case AMDGPU::SI_ELSE:
3232 case AMDGPU::SI_KILL_I1_TERMINATOR:
3233 case AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR:
3234 // FIXME: It's messy that these need to be considered here at all.
3235 return true;
3236 default:
3237 llvm_unreachable("unexpected non-branch terminator inst");
3238 }
3239
3240 ++I;
3241 }
3242
3243 if (I == E)
3244 return false;
3245
3246 return analyzeBranchImpl(MBB, I, TBB, FBB, Cond, AllowModify);
3247}
3248
3249unsigned SIInstrInfo::removeBranch(MachineBasicBlock &MBB,
3250 int *BytesRemoved) const {
3251 unsigned Count = 0;
3252 unsigned RemovedSize = 0;
3253 for (MachineInstr &MI : llvm::make_early_inc_range(Range: MBB.terminators())) {
3254 // Skip over artificial terminators when removing instructions.
3255 if (MI.isBranch() || MI.isReturn()) {
3256 RemovedSize += getInstSizeInBytes(MI);
3257 MI.eraseFromParent();
3258 ++Count;
3259 }
3260 }
3261
3262 if (BytesRemoved)
3263 *BytesRemoved = RemovedSize;
3264
3265 return Count;
3266}
3267
3268// Copy the flags onto the implicit condition register operand.
3269static void preserveCondRegFlags(MachineOperand &CondReg,
3270 const MachineOperand &OrigCond) {
3271 CondReg.setIsUndef(OrigCond.isUndef());
3272 CondReg.setIsKill(OrigCond.isKill());
3273}
3274
3275unsigned SIInstrInfo::insertBranch(MachineBasicBlock &MBB,
3276 MachineBasicBlock *TBB,
3277 MachineBasicBlock *FBB,
3278 ArrayRef<MachineOperand> Cond,
3279 const DebugLoc &DL,
3280 int *BytesAdded) const {
3281 if (!FBB && Cond.empty()) {
3282 BuildMI(BB: &MBB, MIMD: DL, MCID: get(Opcode: AMDGPU::S_BRANCH))
3283 .addMBB(MBB: TBB);
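  // When the offset-0x3f hardware bug workaround applies, the branch may be
  // padded with an extra instruction, so it is conservatively counted as 8
  // bytes here.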
3284 if (BytesAdded)
3285 *BytesAdded = ST.hasOffset3fBug() ? 8 : 4;
3286 return 1;
3287 }
3288
3289 assert(TBB && Cond[0].isImm());
3290
3291 unsigned Opcode
3292 = getBranchOpcode(Cond: static_cast<BranchPredicate>(Cond[0].getImm()));
3293
3294 if (!FBB) {
3295 MachineInstr *CondBr =
3296 BuildMI(BB: &MBB, MIMD: DL, MCID: get(Opcode))
3297 .addMBB(MBB: TBB);
3298
3299 // Copy the flags onto the implicit condition register operand.
3300 preserveCondRegFlags(CondReg&: CondBr->getOperand(i: 1), OrigCond: Cond[1]);
3301 fixImplicitOperands(MI&: *CondBr);
3302
3303 if (BytesAdded)
3304 *BytesAdded = ST.hasOffset3fBug() ? 8 : 4;
3305 return 1;
3306 }
3307
3308 assert(TBB && FBB);
3309
3310 MachineInstr *CondBr =
3311 BuildMI(BB: &MBB, MIMD: DL, MCID: get(Opcode))
3312 .addMBB(MBB: TBB);
3313 fixImplicitOperands(MI&: *CondBr);
3314 BuildMI(BB: &MBB, MIMD: DL, MCID: get(Opcode: AMDGPU::S_BRANCH))
3315 .addMBB(MBB: FBB);
3316
3317 MachineOperand &CondReg = CondBr->getOperand(i: 1);
3318 CondReg.setIsUndef(Cond[1].isUndef());
3319 CondReg.setIsKill(Cond[1].isKill());
3320
3321 if (BytesAdded)
3322 *BytesAdded = ST.hasOffset3fBug() ? 16 : 8;
3323
3324 return 2;
3325}
3326
3327bool SIInstrInfo::reverseBranchCondition(
3328 SmallVectorImpl<MachineOperand> &Cond) const {
3329 if (Cond.size() != 2) {
3330 return true;
3331 }
3332
3333 if (Cond[0].isImm()) {
3334 Cond[0].setImm(-Cond[0].getImm());
3335 return false;
3336 }
3337
3338 return true;
3339}
3340
3341bool SIInstrInfo::canInsertSelect(const MachineBasicBlock &MBB,
3342 ArrayRef<MachineOperand> Cond,
3343 Register DstReg, Register TrueReg,
3344 Register FalseReg, int &CondCycles,
3345 int &TrueCycles, int &FalseCycles) const {
3346 switch (Cond[0].getImm()) {
3347 case VCCNZ:
3348 case VCCZ: {
3349 const MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
3350 const TargetRegisterClass *RC = MRI.getRegClass(Reg: TrueReg);
3351 if (MRI.getRegClass(Reg: FalseReg) != RC)
3352 return false;
3353
3354 int NumInsts = AMDGPU::getRegBitWidth(RC: *RC) / 32;
3355 CondCycles = TrueCycles = FalseCycles = NumInsts; // ???
3356
3357 // Limit to equal cost for branch vs. N v_cndmask_b32s.
3358 return RI.hasVGPRs(RC) && NumInsts <= 6;
3359 }
3360 case SCC_TRUE:
3361 case SCC_FALSE: {
3362 // FIXME: We could insert for VGPRs if we could replace the original compare
3363 // with a vector one.
3364 const MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
3365 const TargetRegisterClass *RC = MRI.getRegClass(Reg: TrueReg);
3366 if (MRI.getRegClass(Reg: FalseReg) != RC)
3367 return false;
3368
3369 int NumInsts = AMDGPU::getRegBitWidth(RC: *RC) / 32;
3370
    // Sizes that are a multiple of 64 bits can use s_cselect_b64.
3372 if (NumInsts % 2 == 0)
3373 NumInsts /= 2;
3374
3375 CondCycles = TrueCycles = FalseCycles = NumInsts; // ???
3376 return RI.isSGPRClass(RC);
3377 }
3378 default:
3379 return false;
3380 }
3381}
3382
3383void SIInstrInfo::insertSelect(MachineBasicBlock &MBB,
3384 MachineBasicBlock::iterator I, const DebugLoc &DL,
3385 Register DstReg, ArrayRef<MachineOperand> Cond,
3386 Register TrueReg, Register FalseReg) const {
3387 BranchPredicate Pred = static_cast<BranchPredicate>(Cond[0].getImm());
3388 if (Pred == VCCZ || Pred == SCC_FALSE) {
3389 Pred = static_cast<BranchPredicate>(-Pred);
3390 std::swap(a&: TrueReg, b&: FalseReg);
3391 }
3392
3393 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
3394 const TargetRegisterClass *DstRC = MRI.getRegClass(Reg: DstReg);
3395 unsigned DstSize = RI.getRegSizeInBits(RC: *DstRC);
3396
3397 if (DstSize == 32) {
3398 MachineInstr *Select;
3399 if (Pred == SCC_TRUE) {
3400 Select = BuildMI(BB&: MBB, I, MIMD: DL, MCID: get(Opcode: AMDGPU::S_CSELECT_B32), DestReg: DstReg)
3401 .addReg(RegNo: TrueReg)
3402 .addReg(RegNo: FalseReg);
3403 } else {
3404 // Instruction's operands are backwards from what is expected.
3405 Select = BuildMI(BB&: MBB, I, MIMD: DL, MCID: get(Opcode: AMDGPU::V_CNDMASK_B32_e32), DestReg: DstReg)
3406 .addReg(RegNo: FalseReg)
3407 .addReg(RegNo: TrueReg);
3408 }
3409
3410 preserveCondRegFlags(CondReg&: Select->getOperand(i: 3), OrigCond: Cond[1]);
3411 return;
3412 }
3413
3414 if (DstSize == 64 && Pred == SCC_TRUE) {
3415 MachineInstr *Select =
3416 BuildMI(BB&: MBB, I, MIMD: DL, MCID: get(Opcode: AMDGPU::S_CSELECT_B64), DestReg: DstReg)
3417 .addReg(RegNo: TrueReg)
3418 .addReg(RegNo: FalseReg);
3419
3420 preserveCondRegFlags(CondReg&: Select->getOperand(i: 3), OrigCond: Cond[1]);
3421 return;
3422 }
3423
3424 static const int16_t Sub0_15[] = {
3425 AMDGPU::sub0, AMDGPU::sub1, AMDGPU::sub2, AMDGPU::sub3,
3426 AMDGPU::sub4, AMDGPU::sub5, AMDGPU::sub6, AMDGPU::sub7,
3427 AMDGPU::sub8, AMDGPU::sub9, AMDGPU::sub10, AMDGPU::sub11,
3428 AMDGPU::sub12, AMDGPU::sub13, AMDGPU::sub14, AMDGPU::sub15,
3429 };
3430
3431 static const int16_t Sub0_15_64[] = {
3432 AMDGPU::sub0_sub1, AMDGPU::sub2_sub3,
3433 AMDGPU::sub4_sub5, AMDGPU::sub6_sub7,
3434 AMDGPU::sub8_sub9, AMDGPU::sub10_sub11,
3435 AMDGPU::sub12_sub13, AMDGPU::sub14_sub15,
3436 };
3437
3438 unsigned SelOp = AMDGPU::V_CNDMASK_B32_e32;
3439 const TargetRegisterClass *EltRC = &AMDGPU::VGPR_32RegClass;
3440 const int16_t *SubIndices = Sub0_15;
3441 int NElts = DstSize / 32;
3442
3443 // 64-bit select is only available for SALU.
3444 // TODO: Split 96-bit into 64-bit and 32-bit, not 3x 32-bit.
3445 if (Pred == SCC_TRUE) {
3446 if (NElts % 2) {
3447 SelOp = AMDGPU::S_CSELECT_B32;
3448 EltRC = &AMDGPU::SGPR_32RegClass;
3449 } else {
3450 SelOp = AMDGPU::S_CSELECT_B64;
3451 EltRC = &AMDGPU::SGPR_64RegClass;
3452 SubIndices = Sub0_15_64;
3453 NElts /= 2;
3454 }
3455 }
3456
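  // No single select covers the full width, so emit one select per element and
  // reassemble the result with a REG_SEQUENCE.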
3457 MachineInstrBuilder MIB = BuildMI(
3458 BB&: MBB, I, MIMD: DL, MCID: get(Opcode: AMDGPU::REG_SEQUENCE), DestReg: DstReg);
3459
3460 I = MIB->getIterator();
3461
3462 SmallVector<Register, 8> Regs;
3463 for (int Idx = 0; Idx != NElts; ++Idx) {
3464 Register DstElt = MRI.createVirtualRegister(RegClass: EltRC);
3465 Regs.push_back(Elt: DstElt);
3466
3467 unsigned SubIdx = SubIndices[Idx];
3468
3469 MachineInstr *Select;
3470 if (SelOp == AMDGPU::V_CNDMASK_B32_e32) {
3471 Select = BuildMI(BB&: MBB, I, MIMD: DL, MCID: get(Opcode: SelOp), DestReg: DstElt)
3472 .addReg(RegNo: FalseReg, Flags: {}, SubReg: SubIdx)
3473 .addReg(RegNo: TrueReg, Flags: {}, SubReg: SubIdx);
3474 } else {
3475 Select = BuildMI(BB&: MBB, I, MIMD: DL, MCID: get(Opcode: SelOp), DestReg: DstElt)
3476 .addReg(RegNo: TrueReg, Flags: {}, SubReg: SubIdx)
3477 .addReg(RegNo: FalseReg, Flags: {}, SubReg: SubIdx);
3478 }
3479
3480 preserveCondRegFlags(CondReg&: Select->getOperand(i: 3), OrigCond: Cond[1]);
3481 fixImplicitOperands(MI&: *Select);
3482
3483 MIB.addReg(RegNo: DstElt)
3484 .addImm(Val: SubIdx);
3485 }
3486}
3487
3488bool SIInstrInfo::isFoldableCopy(const MachineInstr &MI) {
3489 switch (MI.getOpcode()) {
3490 case AMDGPU::V_MOV_B16_t16_e32:
3491 case AMDGPU::V_MOV_B16_t16_e64:
3492 case AMDGPU::V_MOV_B32_e32:
3493 case AMDGPU::V_MOV_B32_e64:
3494 case AMDGPU::V_MOV_B64_PSEUDO:
3495 case AMDGPU::V_MOV_B64_e32:
3496 case AMDGPU::V_MOV_B64_e64:
3497 case AMDGPU::S_MOV_B32:
3498 case AMDGPU::S_MOV_B64:
3499 case AMDGPU::S_MOV_B64_IMM_PSEUDO:
3500 case AMDGPU::COPY:
3501 case AMDGPU::WWM_COPY:
3502 case AMDGPU::V_ACCVGPR_WRITE_B32_e64:
3503 case AMDGPU::V_ACCVGPR_READ_B32_e64:
3504 case AMDGPU::V_ACCVGPR_MOV_B32:
3505 case AMDGPU::AV_MOV_B32_IMM_PSEUDO:
3506 case AMDGPU::AV_MOV_B64_IMM_PSEUDO:
3507 return true;
3508 default:
3509 return false;
3510 }
3511}
3512
3513unsigned SIInstrInfo::getFoldableCopySrcIdx(const MachineInstr &MI) {
3514 switch (MI.getOpcode()) {
3515 case AMDGPU::V_MOV_B16_t16_e32:
3516 case AMDGPU::V_MOV_B16_t16_e64:
3517 return 2;
3518 case AMDGPU::V_MOV_B32_e32:
3519 case AMDGPU::V_MOV_B32_e64:
3520 case AMDGPU::V_MOV_B64_PSEUDO:
3521 case AMDGPU::V_MOV_B64_e32:
3522 case AMDGPU::V_MOV_B64_e64:
3523 case AMDGPU::S_MOV_B32:
3524 case AMDGPU::S_MOV_B64:
3525 case AMDGPU::S_MOV_B64_IMM_PSEUDO:
3526 case AMDGPU::COPY:
3527 case AMDGPU::WWM_COPY:
3528 case AMDGPU::V_ACCVGPR_WRITE_B32_e64:
3529 case AMDGPU::V_ACCVGPR_READ_B32_e64:
3530 case AMDGPU::V_ACCVGPR_MOV_B32:
3531 case AMDGPU::AV_MOV_B32_IMM_PSEUDO:
3532 case AMDGPU::AV_MOV_B64_IMM_PSEUDO:
3533 return 1;
3534 default:
3535 llvm_unreachable("MI is not a foldable copy");
3536 }
3537}
3538
3539static constexpr AMDGPU::OpName ModifierOpNames[] = {
3540 AMDGPU::OpName::src0_modifiers, AMDGPU::OpName::src1_modifiers,
3541 AMDGPU::OpName::src2_modifiers, AMDGPU::OpName::clamp,
3542 AMDGPU::OpName::omod, AMDGPU::OpName::op_sel};
3543
3544void SIInstrInfo::removeModOperands(MachineInstr &MI) const {
3545 unsigned Opc = MI.getOpcode();
3546 for (AMDGPU::OpName Name : reverse(C: ModifierOpNames)) {
3547 int Idx = AMDGPU::getNamedOperandIdx(Opcode: Opc, Name);
3548 if (Idx >= 0)
3549 MI.removeOperand(OpNo: Idx);
3550 }
3551}
3552
3553void SIInstrInfo::mutateAndCleanupImplicit(MachineInstr &MI,
3554 const MCInstrDesc &NewDesc) const {
3555 MI.setDesc(NewDesc);
3556
  // Remove any leftover implicit operands from mutating the instruction, e.g.
  // if we replace an s_and_b32 with a copy, we don't need the implicit scc def
  // anymore.
3560 const MCInstrDesc &Desc = MI.getDesc();
3561 unsigned NumOps = Desc.getNumOperands() + Desc.implicit_uses().size() +
3562 Desc.implicit_defs().size();
3563
3564 for (unsigned I = MI.getNumOperands() - 1; I >= NumOps; --I)
3565 MI.removeOperand(OpNo: I);
3566}
3567
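// Interpret Imm through SubRegIndex, returning the sign-extended value of the
// selected slice (Imm itself for NoSubRegister), or std::nullopt for an
// unhandled subregister index.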
3568std::optional<int64_t> SIInstrInfo::extractSubregFromImm(int64_t Imm,
3569 unsigned SubRegIndex) {
3570 switch (SubRegIndex) {
3571 case AMDGPU::NoSubRegister:
3572 return Imm;
3573 case AMDGPU::sub0:
3574 return SignExtend64<32>(x: Imm);
3575 case AMDGPU::sub1:
3576 return SignExtend64<32>(x: Imm >> 32);
3577 case AMDGPU::lo16:
3578 return SignExtend64<16>(x: Imm);
3579 case AMDGPU::hi16:
3580 return SignExtend64<16>(x: Imm >> 16);
3581 case AMDGPU::sub1_lo16:
3582 return SignExtend64<16>(x: Imm >> 32);
3583 case AMDGPU::sub1_hi16:
3584 return SignExtend64<16>(x: Imm >> 48);
3585 default:
3586 return std::nullopt;
3587 }
3588
3589 llvm_unreachable("covered subregister switch");
3590}
3591
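// Map a MAC/FMAC/MAD/FMA opcode to the corresponding "AK" form, which carries
// the addend as a trailing literal constant.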
3592static unsigned getNewFMAAKInst(const GCNSubtarget &ST, unsigned Opc) {
3593 switch (Opc) {
3594 case AMDGPU::V_MAC_F16_e32:
3595 case AMDGPU::V_MAC_F16_e64:
3596 case AMDGPU::V_MAD_F16_e64:
3597 return AMDGPU::V_MADAK_F16;
3598 case AMDGPU::V_MAC_F32_e32:
3599 case AMDGPU::V_MAC_F32_e64:
3600 case AMDGPU::V_MAD_F32_e64:
3601 return AMDGPU::V_MADAK_F32;
3602 case AMDGPU::V_FMAC_F32_e32:
3603 case AMDGPU::V_FMAC_F32_e64:
3604 case AMDGPU::V_FMA_F32_e64:
3605 return AMDGPU::V_FMAAK_F32;
3606 case AMDGPU::V_FMAC_F16_e32:
3607 case AMDGPU::V_FMAC_F16_e64:
3608 case AMDGPU::V_FMAC_F16_t16_e64:
3609 case AMDGPU::V_FMAC_F16_fake16_e64:
3610 case AMDGPU::V_FMAC_F16_t16_e32:
3611 case AMDGPU::V_FMAC_F16_fake16_e32:
3612 case AMDGPU::V_FMA_F16_e64:
3613 return ST.hasTrue16BitInsts() ? ST.useRealTrue16Insts()
3614 ? AMDGPU::V_FMAAK_F16_t16
3615 : AMDGPU::V_FMAAK_F16_fake16
3616 : AMDGPU::V_FMAAK_F16;
3617 case AMDGPU::V_FMAC_F64_e32:
3618 case AMDGPU::V_FMAC_F64_e64:
3619 case AMDGPU::V_FMA_F64_e64:
3620 return AMDGPU::V_FMAAK_F64;
3621 default:
3622 llvm_unreachable("invalid instruction");
3623 }
3624}
3625
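// Map a MAC/FMAC/MAD/FMA opcode to the corresponding "MK" form, which carries
// one multiplicand as a literal constant.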
3626static unsigned getNewFMAMKInst(const GCNSubtarget &ST, unsigned Opc) {
3627 switch (Opc) {
3628 case AMDGPU::V_MAC_F16_e32:
3629 case AMDGPU::V_MAC_F16_e64:
3630 case AMDGPU::V_MAD_F16_e64:
3631 return AMDGPU::V_MADMK_F16;
3632 case AMDGPU::V_MAC_F32_e32:
3633 case AMDGPU::V_MAC_F32_e64:
3634 case AMDGPU::V_MAD_F32_e64:
3635 return AMDGPU::V_MADMK_F32;
3636 case AMDGPU::V_FMAC_F32_e32:
3637 case AMDGPU::V_FMAC_F32_e64:
3638 case AMDGPU::V_FMA_F32_e64:
3639 return AMDGPU::V_FMAMK_F32;
3640 case AMDGPU::V_FMAC_F16_e32:
3641 case AMDGPU::V_FMAC_F16_e64:
3642 case AMDGPU::V_FMAC_F16_t16_e64:
3643 case AMDGPU::V_FMAC_F16_fake16_e64:
3644 case AMDGPU::V_FMAC_F16_t16_e32:
3645 case AMDGPU::V_FMAC_F16_fake16_e32:
3646 case AMDGPU::V_FMA_F16_e64:
3647 return ST.hasTrue16BitInsts() ? ST.useRealTrue16Insts()
3648 ? AMDGPU::V_FMAMK_F16_t16
3649 : AMDGPU::V_FMAMK_F16_fake16
3650 : AMDGPU::V_FMAMK_F16;
3651 case AMDGPU::V_FMAC_F64_e32:
3652 case AMDGPU::V_FMAC_F64_e64:
3653 case AMDGPU::V_FMA_F64_e64:
3654 return AMDGPU::V_FMAMK_F64;
3655 default:
3656 llvm_unreachable("invalid instruction");
3657 }
3658}
3659
3660bool SIInstrInfo::foldImmediate(MachineInstr &UseMI, MachineInstr &DefMI,
3661 Register Reg, MachineRegisterInfo *MRI) const {
3662 int64_t Imm;
3663 if (!getConstValDefinedInReg(MI: DefMI, Reg, ImmVal&: Imm))
3664 return false;
3665
3666 const bool HasMultipleUses = !MRI->hasOneNonDBGUse(RegNo: Reg);
3667
3668 assert(!DefMI.getOperand(0).getSubReg() && "Expected SSA form");
3669
3670 unsigned Opc = UseMI.getOpcode();
3671 if (Opc == AMDGPU::COPY) {
3672 assert(!UseMI.getOperand(0).getSubReg() && "Expected SSA form");
3673
3674 Register DstReg = UseMI.getOperand(i: 0).getReg();
3675 Register UseSubReg = UseMI.getOperand(i: 1).getSubReg();
3676
3677 const TargetRegisterClass *DstRC = RI.getRegClassForReg(MRI: *MRI, Reg: DstReg);
3678
3679 if (HasMultipleUses) {
      // TODO: This should fold in more cases with multiple uses, but we need to
      // more carefully consider what those uses are.
3682 unsigned ImmDefSize = RI.getRegSizeInBits(RC: *MRI->getRegClass(Reg));
3683
3684 // Avoid breaking up a 64-bit inline immediate into a subregister extract.
3685 if (UseSubReg != AMDGPU::NoSubRegister && ImmDefSize == 64)
3686 return false;
3687
3688 // Most of the time folding a 32-bit inline constant is free (though this
3689 // might not be true if we can't later fold it into a real user).
3690 //
3691 // FIXME: This isInlineConstant check is imprecise if
3692 // getConstValDefinedInReg handled the tricky non-mov cases.
3693 if (ImmDefSize == 32 &&
3694 !isInlineConstant(ImmVal: Imm, OperandType: AMDGPU::OPERAND_REG_IMM_INT32))
3695 return false;
3696 }
3697
3698 bool Is16Bit = UseSubReg != AMDGPU::NoSubRegister &&
3699 RI.getSubRegIdxSize(Idx: UseSubReg) == 16;
3700
3701 if (Is16Bit) {
3702 if (RI.hasVGPRs(RC: DstRC))
3703 return false; // Do not clobber vgpr_hi16
3704
3705 if (DstReg.isVirtual() && UseSubReg != AMDGPU::lo16)
3706 return false;
3707 }
3708
3709 MachineFunction *MF = UseMI.getMF();
3710
3711 unsigned NewOpc = AMDGPU::INSTRUCTION_LIST_END;
3712 MCRegister MovDstPhysReg =
3713 DstReg.isPhysical() ? DstReg.asMCReg() : MCRegister();
3714
3715 std::optional<int64_t> SubRegImm = extractSubregFromImm(Imm, SubRegIndex: UseSubReg);
3716
3717 // TODO: Try to fold with AMDGPU::V_MOV_B16_t16_e64
3718 for (unsigned MovOp :
3719 {AMDGPU::S_MOV_B32, AMDGPU::V_MOV_B32_e32, AMDGPU::S_MOV_B64,
3720 AMDGPU::V_MOV_B64_PSEUDO, AMDGPU::V_ACCVGPR_WRITE_B32_e64}) {
3721 const MCInstrDesc &MovDesc = get(Opcode: MovOp);
3722
3723 const TargetRegisterClass *MovDstRC = getRegClass(MCID: MovDesc, OpNum: 0);
3724 if (Is16Bit) {
3725 // We just need to find a correctly sized register class, so the
3726 // subregister index compatibility doesn't matter since we're statically
3727 // extracting the immediate value.
3728 MovDstRC = RI.getMatchingSuperRegClass(A: MovDstRC, B: DstRC, Idx: AMDGPU::lo16);
3729 if (!MovDstRC)
3730 continue;
3731
3732 if (MovDstPhysReg) {
3733 // FIXME: We probably should not do this. If there is a live value in
3734 // the high half of the register, it will be corrupted.
3735 MovDstPhysReg =
3736 RI.getMatchingSuperReg(Reg: MovDstPhysReg, SubIdx: AMDGPU::lo16, RC: MovDstRC);
3737 if (!MovDstPhysReg)
3738 continue;
3739 }
3740 }
3741
      // If the result class isn't the right size, try the next instruction.
3743 if (MovDstPhysReg) {
3744 if (!MovDstRC->contains(Reg: MovDstPhysReg))
3745 return false;
3746 } else if (!MRI->constrainRegClass(Reg: DstReg, RC: MovDstRC)) {
3747 // TODO: This will be overly conservative in the case of 16-bit virtual
3748 // SGPRs. We could hack up the virtual register uses to use a compatible
3749 // 32-bit class.
3750 continue;
3751 }
3752
3753 const MCOperandInfo &OpInfo = MovDesc.operands()[1];
3754
3755 // Ensure the interpreted immediate value is a valid operand in the new
3756 // mov.
3757 //
      // FIXME: isImmOperandLegal should have a form that doesn't require an
      // existing MachineInstr or MachineOperand.
3760 if (!RI.opCanUseLiteralConstant(OpType: OpInfo.OperandType) &&
3761 !isInlineConstant(ImmVal: *SubRegImm, OperandType: OpInfo.OperandType))
3762 break;
3763
3764 NewOpc = MovOp;
3765 break;
3766 }
3767
3768 if (NewOpc == AMDGPU::INSTRUCTION_LIST_END)
3769 return false;
3770
3771 if (Is16Bit) {
3772 UseMI.getOperand(i: 0).setSubReg(AMDGPU::NoSubRegister);
3773 if (MovDstPhysReg)
3774 UseMI.getOperand(i: 0).setReg(MovDstPhysReg);
3775 assert(UseMI.getOperand(1).getReg().isVirtual());
3776 }
3777
3778 const MCInstrDesc &NewMCID = get(Opcode: NewOpc);
3779 UseMI.setDesc(NewMCID);
3780 UseMI.getOperand(i: 1).ChangeToImmediate(ImmVal: *SubRegImm);
3781 UseMI.addImplicitDefUseOperands(MF&: *MF);
3782 return true;
3783 }
3784
3785 if (HasMultipleUses)
3786 return false;
3787
3788 if (Opc == AMDGPU::V_MAD_F32_e64 || Opc == AMDGPU::V_MAC_F32_e64 ||
3789 Opc == AMDGPU::V_MAD_F16_e64 || Opc == AMDGPU::V_MAC_F16_e64 ||
3790 Opc == AMDGPU::V_FMA_F32_e64 || Opc == AMDGPU::V_FMAC_F32_e64 ||
3791 Opc == AMDGPU::V_FMA_F16_e64 || Opc == AMDGPU::V_FMAC_F16_e64 ||
3792 Opc == AMDGPU::V_FMAC_F16_t16_e64 ||
3793 Opc == AMDGPU::V_FMAC_F16_fake16_e64 || Opc == AMDGPU::V_FMA_F64_e64 ||
3794 Opc == AMDGPU::V_FMAC_F64_e64) {
3795 // Don't fold if we are using source or output modifiers. The new VOP2
3796 // instructions don't have them.
3797 if (hasAnyModifiersSet(MI: UseMI))
3798 return false;
3799
3800 // If this is a free constant, there's no reason to do this.
3801 // TODO: We could fold this here instead of letting SIFoldOperands do it
3802 // later.
3803 int Src0Idx = getNamedOperandIdx(Opcode: UseMI.getOpcode(), Name: AMDGPU::OpName::src0);
3804
3805 // Any src operand can be used for the legality check.
3806 if (isInlineConstant(MI: UseMI, OpIdx: Src0Idx, ImmVal: Imm))
3807 return false;
3808
3809 MachineOperand *Src0 = &UseMI.getOperand(i: Src0Idx);
3810
3811 MachineOperand *Src1 = getNamedOperand(MI&: UseMI, OperandName: AMDGPU::OpName::src1);
3812 MachineOperand *Src2 = getNamedOperand(MI&: UseMI, OperandName: AMDGPU::OpName::src2);
3813
3814 auto CopyRegOperandToNarrowerRC =
3815 [MRI, this](MachineInstr &MI, unsigned OpNo,
3816 const TargetRegisterClass *NewRC) -> void {
3817 if (!MI.getOperand(i: OpNo).isReg())
3818 return;
3819 Register Reg = MI.getOperand(i: OpNo).getReg();
3820 const TargetRegisterClass *RC = RI.getRegClassForReg(MRI: *MRI, Reg);
3821 if (RI.getCommonSubClass(A: RC, B: NewRC) != NewRC)
3822 return;
3823 Register Tmp = MRI->createVirtualRegister(RegClass: NewRC);
3824 BuildMI(BB&: *MI.getParent(), I: MI.getIterator(), MIMD: MI.getDebugLoc(),
3825 MCID: get(Opcode: AMDGPU::COPY), DestReg: Tmp)
3826 .addReg(RegNo: Reg);
3827 MI.getOperand(i: OpNo).setReg(Tmp);
3828 MI.getOperand(i: OpNo).setIsKill();
3829 };
3830
3831 // Multiplied part is the constant: Use v_madmk_{f16, f32}.
3832 if ((Src0->isReg() && Src0->getReg() == Reg) ||
3833 (Src1->isReg() && Src1->getReg() == Reg)) {
3834 MachineOperand *RegSrc =
3835 Src1->isReg() && Src1->getReg() == Reg ? Src0 : Src1;
3836 if (!RegSrc->isReg())
3837 return false;
3838 if (RI.isSGPRClass(RC: MRI->getRegClass(Reg: RegSrc->getReg())) &&
3839 ST.getConstantBusLimit(Opcode: Opc) < 2)
3840 return false;
3841
3842 if (!Src2->isReg() || RI.isSGPRClass(RC: MRI->getRegClass(Reg: Src2->getReg())))
3843 return false;
3844
3845 // If src2 is also a literal constant then we have to choose which one to
3846 // fold. In general it is better to choose madak so that the other literal
3847 // can be materialized in an sgpr instead of a vgpr:
3848 // s_mov_b32 s0, literal
3849 // v_madak_f32 v0, s0, v0, literal
3850 // Instead of:
3851 // v_mov_b32 v1, literal
3852 // v_madmk_f32 v0, v0, literal, v1
3853 MachineInstr *Def = MRI->getUniqueVRegDef(Reg: Src2->getReg());
3854 if (Def && Def->isMoveImmediate() &&
3855 !isInlineConstant(MO: Def->getOperand(i: 1)))
3856 return false;
3857
3858 unsigned NewOpc = getNewFMAMKInst(ST, Opc);
3859 if (pseudoToMCOpcode(Opcode: NewOpc) == -1)
3860 return false;
3861
3862 const std::optional<int64_t> SubRegImm = extractSubregFromImm(
3863 Imm, SubRegIndex: RegSrc == Src1 ? Src0->getSubReg() : Src1->getSubReg());
3864
3865 // FIXME: This would be a lot easier if we could return a new instruction
3866 // instead of having to modify in place.
3867
3868 Register SrcReg = RegSrc->getReg();
3869 unsigned SrcSubReg = RegSrc->getSubReg();
3870 Src0->setReg(SrcReg);
3871 Src0->setSubReg(SrcSubReg);
3872 Src0->setIsKill(RegSrc->isKill());
3873
3874 if (Opc == AMDGPU::V_MAC_F32_e64 || Opc == AMDGPU::V_MAC_F16_e64 ||
3875 Opc == AMDGPU::V_FMAC_F32_e64 || Opc == AMDGPU::V_FMAC_F16_t16_e64 ||
3876 Opc == AMDGPU::V_FMAC_F16_fake16_e64 ||
3877 Opc == AMDGPU::V_FMAC_F16_e64 || Opc == AMDGPU::V_FMAC_F64_e64)
3878 UseMI.untieRegOperand(
3879 OpIdx: AMDGPU::getNamedOperandIdx(Opcode: Opc, Name: AMDGPU::OpName::src2));
3880
3881 Src1->ChangeToImmediate(ImmVal: *SubRegImm);
3882
3883 removeModOperands(MI&: UseMI);
3884 UseMI.setDesc(get(Opcode: NewOpc));
3885
3886 if (NewOpc == AMDGPU::V_FMAMK_F16_t16 ||
3887 NewOpc == AMDGPU::V_FMAMK_F16_fake16) {
3888 const TargetRegisterClass *NewRC = getRegClass(MCID: get(Opcode: NewOpc), OpNum: 0);
3889 Register Tmp = MRI->createVirtualRegister(RegClass: NewRC);
3890 BuildMI(BB&: *UseMI.getParent(), I: std::next(x: UseMI.getIterator()),
3891 MIMD: UseMI.getDebugLoc(), MCID: get(Opcode: AMDGPU::COPY),
3892 DestReg: UseMI.getOperand(i: 0).getReg())
3893 .addReg(RegNo: Tmp, Flags: RegState::Kill);
3894 UseMI.getOperand(i: 0).setReg(Tmp);
3895 CopyRegOperandToNarrowerRC(UseMI, 1, NewRC);
3896 CopyRegOperandToNarrowerRC(UseMI, 3, NewRC);
3897 }
3898
3899 bool DeleteDef = MRI->use_nodbg_empty(RegNo: Reg);
3900 if (DeleteDef)
3901 DefMI.eraseFromParent();
3902
3903 return true;
3904 }
3905
3906 // Added part is the constant: Use v_madak_{f16, f32}.
3907 if (Src2->isReg() && Src2->getReg() == Reg) {
3908 if (ST.getConstantBusLimit(Opcode: Opc) < 2) {
3909 // Not allowed to use constant bus for another operand.
3910 // We can however allow an inline immediate as src0.
3911 bool Src0Inlined = false;
3912 if (Src0->isReg()) {
          // Try to inline the constant if possible.
          // If the def is a move-immediate and it has a single use, we save a
          // VGPR here.
3916 MachineInstr *Def = MRI->getUniqueVRegDef(Reg: Src0->getReg());
3917 if (Def && Def->isMoveImmediate() &&
3918 isInlineConstant(MO: Def->getOperand(i: 1)) &&
3919 MRI->hasOneNonDBGUse(RegNo: Src0->getReg())) {
3920 Src0->ChangeToImmediate(ImmVal: Def->getOperand(i: 1).getImm());
3921 Src0Inlined = true;
3922 } else if (ST.getConstantBusLimit(Opcode: Opc) <= 1 &&
3923 RI.isSGPRReg(MRI: *MRI, Reg: Src0->getReg())) {
3924 return false;
3925 }
3926 // VGPR is okay as Src0 - fallthrough
3927 }
3928
3929 if (Src1->isReg() && !Src0Inlined) {
3930 // We have one slot for inlinable constant so far - try to fill it
3931 MachineInstr *Def = MRI->getUniqueVRegDef(Reg: Src1->getReg());
3932 if (Def && Def->isMoveImmediate() &&
3933 isInlineConstant(MO: Def->getOperand(i: 1)) &&
3934 MRI->hasOneNonDBGUse(RegNo: Src1->getReg()) && commuteInstruction(MI&: UseMI))
3935 Src0->ChangeToImmediate(ImmVal: Def->getOperand(i: 1).getImm());
3936 else if (RI.isSGPRReg(MRI: *MRI, Reg: Src1->getReg()))
3937 return false;
3938 // VGPR is okay as Src1 - fallthrough
3939 }
3940 }
3941
3942 unsigned NewOpc = getNewFMAAKInst(ST, Opc);
3943 if (pseudoToMCOpcode(Opcode: NewOpc) == -1)
3944 return false;
3945
3946 // FIXME: This would be a lot easier if we could return a new instruction
3947 // instead of having to modify in place.
3948
3949 if (Opc == AMDGPU::V_MAC_F32_e64 || Opc == AMDGPU::V_MAC_F16_e64 ||
3950 Opc == AMDGPU::V_FMAC_F32_e64 || Opc == AMDGPU::V_FMAC_F16_t16_e64 ||
3951 Opc == AMDGPU::V_FMAC_F16_fake16_e64 ||
3952 Opc == AMDGPU::V_FMAC_F16_e64 || Opc == AMDGPU::V_FMAC_F64_e64)
3953 UseMI.untieRegOperand(
3954 OpIdx: AMDGPU::getNamedOperandIdx(Opcode: Opc, Name: AMDGPU::OpName::src2));
3955
3956 const std::optional<int64_t> SubRegImm =
3957 extractSubregFromImm(Imm, SubRegIndex: Src2->getSubReg());
3958
      // ChangeToImmediate adds Src2 back to the instruction.
3960 Src2->ChangeToImmediate(ImmVal: *SubRegImm);
3961
3962 // These come before src2.
3963 removeModOperands(MI&: UseMI);
3964 UseMI.setDesc(get(Opcode: NewOpc));
3965
3966 if (NewOpc == AMDGPU::V_FMAAK_F16_t16 ||
3967 NewOpc == AMDGPU::V_FMAAK_F16_fake16) {
3968 const TargetRegisterClass *NewRC = getRegClass(MCID: get(Opcode: NewOpc), OpNum: 0);
3969 Register Tmp = MRI->createVirtualRegister(RegClass: NewRC);
3970 BuildMI(BB&: *UseMI.getParent(), I: std::next(x: UseMI.getIterator()),
3971 MIMD: UseMI.getDebugLoc(), MCID: get(Opcode: AMDGPU::COPY),
3972 DestReg: UseMI.getOperand(i: 0).getReg())
3973 .addReg(RegNo: Tmp, Flags: RegState::Kill);
3974 UseMI.getOperand(i: 0).setReg(Tmp);
3975 CopyRegOperandToNarrowerRC(UseMI, 1, NewRC);
3976 CopyRegOperandToNarrowerRC(UseMI, 2, NewRC);
3977 }
3978
      // UseMI might have been commuted, leaving an SGPR as src1. If so, an
      // inline constant together with an SGPR is illegal, so legalize the
      // operands.
3982 legalizeOperands(MI&: UseMI);
3983
3984 bool DeleteDef = MRI->use_nodbg_empty(RegNo: Reg);
3985 if (DeleteDef)
3986 DefMI.eraseFromParent();
3987
3988 return true;
3989 }
3990 }
3991
3992 return false;
3993}
3994
3995static bool
3996memOpsHaveSameBaseOperands(ArrayRef<const MachineOperand *> BaseOps1,
3997 ArrayRef<const MachineOperand *> BaseOps2) {
3998 if (BaseOps1.size() != BaseOps2.size())
3999 return false;
4000 for (size_t I = 0, E = BaseOps1.size(); I < E; ++I) {
4001 if (!BaseOps1[I]->isIdenticalTo(Other: *BaseOps2[I]))
4002 return false;
4003 }
4004 return true;
4005}
4006
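// Return true if the access [OffsetA, OffsetA + WidthA) is known not to
// overlap [OffsetB, OffsetB + WidthB); this requires the lower access to have
// a known width.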
4007static bool offsetsDoNotOverlap(LocationSize WidthA, int OffsetA,
4008 LocationSize WidthB, int OffsetB) {
4009 int LowOffset = OffsetA < OffsetB ? OffsetA : OffsetB;
4010 int HighOffset = OffsetA < OffsetB ? OffsetB : OffsetA;
4011 LocationSize LowWidth = (LowOffset == OffsetA) ? WidthA : WidthB;
4012 return LowWidth.hasValue() &&
4013 LowOffset + (int)LowWidth.getValue() <= HighOffset;
4014}
4015
4016bool SIInstrInfo::checkInstOffsetsDoNotOverlap(const MachineInstr &MIa,
4017 const MachineInstr &MIb) const {
4018 SmallVector<const MachineOperand *, 4> BaseOps0, BaseOps1;
4019 int64_t Offset0, Offset1;
4020 LocationSize Dummy0 = LocationSize::precise(Value: 0);
4021 LocationSize Dummy1 = LocationSize::precise(Value: 0);
4022 bool Offset0IsScalable, Offset1IsScalable;
4023 if (!getMemOperandsWithOffsetWidth(LdSt: MIa, BaseOps&: BaseOps0, Offset&: Offset0, OffsetIsScalable&: Offset0IsScalable,
4024 Width&: Dummy0, TRI: &RI) ||
4025 !getMemOperandsWithOffsetWidth(LdSt: MIb, BaseOps&: BaseOps1, Offset&: Offset1, OffsetIsScalable&: Offset1IsScalable,
4026 Width&: Dummy1, TRI: &RI))
4027 return false;
4028
4029 if (!memOpsHaveSameBaseOperands(BaseOps1: BaseOps0, BaseOps2: BaseOps1))
4030 return false;
4031
4032 if (!MIa.hasOneMemOperand() || !MIb.hasOneMemOperand()) {
4033 // FIXME: Handle ds_read2 / ds_write2.
4034 return false;
4035 }
4036 LocationSize Width0 = MIa.memoperands().front()->getSize();
4037 LocationSize Width1 = MIb.memoperands().front()->getSize();
4038 return offsetsDoNotOverlap(WidthA: Width0, OffsetA: Offset0, WidthB: Width1, OffsetB: Offset1);
4039}
4040
4041bool SIInstrInfo::areMemAccessesTriviallyDisjoint(const MachineInstr &MIa,
4042 const MachineInstr &MIb) const {
4043 assert(MIa.mayLoadOrStore() &&
4044 "MIa must load from or modify a memory location");
4045 assert(MIb.mayLoadOrStore() &&
4046 "MIb must load from or modify a memory location");
4047
4048 if (MIa.hasUnmodeledSideEffects() || MIb.hasUnmodeledSideEffects())
4049 return false;
4050
4051 // XXX - Can we relax this between address spaces?
4052 if (MIa.hasOrderedMemoryRef() || MIb.hasOrderedMemoryRef())
4053 return false;
4054
4055 if (isLDSDMA(MI: MIa) || isLDSDMA(MI: MIb))
4056 return false;
4057
4058 if (MIa.isBundle() || MIb.isBundle())
4059 return false;
4060
4061 // TODO: Should we check the address space from the MachineMemOperand? That
4062 // would allow us to distinguish objects we know don't alias based on the
4063 // underlying address space, even if it was lowered to a different one,
4064 // e.g. private accesses lowered to use MUBUF instructions on a scratch
4065 // buffer.
4066 if (isDS(MI: MIa)) {
4067 if (isDS(MI: MIb))
4068 return checkInstOffsetsDoNotOverlap(MIa, MIb);
4069
4070 return !isFLAT(MI: MIb) || isSegmentSpecificFLAT(MI: MIb);
4071 }
4072
4073 if (isMUBUF(MI: MIa) || isMTBUF(MI: MIa)) {
4074 if (isMUBUF(MI: MIb) || isMTBUF(MI: MIb))
4075 return checkInstOffsetsDoNotOverlap(MIa, MIb);
4076
4077 if (isFLAT(MI: MIb))
4078 return isFLATScratch(MI: MIb);
4079
4080 return !isSMRD(MI: MIb);
4081 }
4082
4083 if (isSMRD(MI: MIa)) {
4084 if (isSMRD(MI: MIb))
4085 return checkInstOffsetsDoNotOverlap(MIa, MIb);
4086
4087 if (isFLAT(MI: MIb))
4088 return isFLATScratch(MI: MIb);
4089
4090 return !isMUBUF(MI: MIb) && !isMTBUF(MI: MIb);
4091 }
4092
4093 if (isFLAT(MI: MIa)) {
4094 if (isFLAT(MI: MIb)) {
4095 if ((isFLATScratch(MI: MIa) && isFLATGlobal(MI: MIb)) ||
4096 (isFLATGlobal(MI: MIa) && isFLATScratch(MI: MIb)))
4097 return true;
4098
4099 return checkInstOffsetsDoNotOverlap(MIa, MIb);
4100 }
4101
4102 return false;
4103 }
4104
4105 return false;
4106}
4107
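// If Reg is defined by a foldable immediate-materializing instruction, return
// true, report the immediate in Imm, and optionally return the defining
// instruction in DefMI.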
4108static bool getFoldableImm(Register Reg, const MachineRegisterInfo &MRI,
4109 int64_t &Imm, MachineInstr **DefMI = nullptr) {
4110 if (Reg.isPhysical())
4111 return false;
4112 auto *Def = MRI.getUniqueVRegDef(Reg);
4113 if (Def && SIInstrInfo::isFoldableCopy(MI: *Def) && Def->getOperand(i: 1).isImm()) {
4114 Imm = Def->getOperand(i: 1).getImm();
4115 if (DefMI)
4116 *DefMI = Def;
4117 return true;
4118 }
4119 return false;
4120}
4121
4122static bool getFoldableImm(const MachineOperand *MO, int64_t &Imm,
4123 MachineInstr **DefMI = nullptr) {
4124 if (!MO->isReg())
4125 return false;
4126 const MachineFunction *MF = MO->getParent()->getMF();
4127 const MachineRegisterInfo &MRI = MF->getRegInfo();
4128 return getFoldableImm(Reg: MO->getReg(), MRI, Imm, DefMI);
4129}
4130
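// Transfer LiveVariables kill bookkeeping from the register operands of MI to
// NewMI.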
4131static void updateLiveVariables(LiveVariables *LV, MachineInstr &MI,
4132 MachineInstr &NewMI) {
4133 if (LV) {
4134 unsigned NumOps = MI.getNumOperands();
4135 for (unsigned I = 1; I < NumOps; ++I) {
4136 MachineOperand &Op = MI.getOperand(i: I);
4137 if (Op.isReg() && Op.isKill())
4138 LV->replaceKillInstruction(Reg: Op.getReg(), OldMI&: MI, NewMI);
4139 }
4140 }
4141}
4142
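// Map a two-address MAC/FMAC opcode to the equivalent three-address MAD/FMA
// opcode.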
4143static unsigned getNewFMAInst(const GCNSubtarget &ST, unsigned Opc) {
4144 switch (Opc) {
4145 case AMDGPU::V_MAC_F16_e32:
4146 case AMDGPU::V_MAC_F16_e64:
4147 return AMDGPU::V_MAD_F16_e64;
4148 case AMDGPU::V_MAC_F32_e32:
4149 case AMDGPU::V_MAC_F32_e64:
4150 return AMDGPU::V_MAD_F32_e64;
4151 case AMDGPU::V_MAC_LEGACY_F32_e32:
4152 case AMDGPU::V_MAC_LEGACY_F32_e64:
4153 return AMDGPU::V_MAD_LEGACY_F32_e64;
4154 case AMDGPU::V_FMAC_LEGACY_F32_e32:
4155 case AMDGPU::V_FMAC_LEGACY_F32_e64:
4156 return AMDGPU::V_FMA_LEGACY_F32_e64;
4157 case AMDGPU::V_FMAC_F16_e32:
4158 case AMDGPU::V_FMAC_F16_e64:
4159 case AMDGPU::V_FMAC_F16_t16_e64:
4160 case AMDGPU::V_FMAC_F16_fake16_e64:
4161 return ST.hasTrue16BitInsts() ? ST.useRealTrue16Insts()
4162 ? AMDGPU::V_FMA_F16_gfx9_t16_e64
4163 : AMDGPU::V_FMA_F16_gfx9_fake16_e64
4164 : AMDGPU::V_FMA_F16_gfx9_e64;
4165 case AMDGPU::V_FMAC_F32_e32:
4166 case AMDGPU::V_FMAC_F32_e64:
4167 return AMDGPU::V_FMA_F32_e64;
4168 case AMDGPU::V_FMAC_F64_e32:
4169 case AMDGPU::V_FMAC_F64_e64:
4170 return AMDGPU::V_FMA_F64_e64;
4171 default:
4172 llvm_unreachable("invalid instruction");
4173 }
4174}
4175
4176/// Helper struct for the implementation of 3-address conversion to communicate
4177/// updates made to instruction operands.
4178struct SIInstrInfo::ThreeAddressUpdates {
4179 /// Other instruction whose def is no longer used by the converted
4180 /// instruction.
4181 MachineInstr *RemoveMIUse = nullptr;
4182};
4183
4184MachineInstr *SIInstrInfo::convertToThreeAddress(MachineInstr &MI,
4185 LiveVariables *LV,
4186 LiveIntervals *LIS) const {
4187 MachineBasicBlock &MBB = *MI.getParent();
4188 MachineInstr *CandidateMI = &MI;
4189
4190 if (MI.isBundle()) {
4191 // This is a temporary placeholder for bundle handling that enables us to
4192 // exercise the relevant code paths in the two-address instruction pass.
4193 if (MI.getBundleSize() != 1)
4194 return nullptr;
4195 CandidateMI = MI.getNextNode();
4196 }
4197
4198 ThreeAddressUpdates U;
4199 MachineInstr *NewMI = convertToThreeAddressImpl(MI&: *CandidateMI, Updates&: U);
4200 if (!NewMI)
4201 return nullptr;
4202
4203 if (MI.isBundle()) {
4204 CandidateMI->eraseFromBundle();
4205
4206 for (MachineOperand &MO : MI.all_defs()) {
4207 if (MO.isTied())
4208 MI.untieRegOperand(OpIdx: MO.getOperandNo());
4209 }
4210 } else {
4211 updateLiveVariables(LV, MI, NewMI&: *NewMI);
4212 if (LIS) {
4213 LIS->ReplaceMachineInstrInMaps(MI, NewMI&: *NewMI);
4214 // SlotIndex of defs needs to be updated when converting to early-clobber
4215 MachineOperand &Def = NewMI->getOperand(i: 0);
4216 if (Def.isEarlyClobber() && Def.isReg() &&
4217 LIS->hasInterval(Reg: Def.getReg())) {
4218 SlotIndex OldIndex = LIS->getInstructionIndex(Instr: *NewMI).getRegSlot(EC: false);
4219 SlotIndex NewIndex = LIS->getInstructionIndex(Instr: *NewMI).getRegSlot(EC: true);
4220 auto &LI = LIS->getInterval(Reg: Def.getReg());
4221 auto UpdateDefIndex = [&](LiveRange &LR) {
4222 auto *S = LR.find(Pos: OldIndex);
4223 if (S != LR.end() && S->start == OldIndex) {
4224 assert(S->valno && S->valno->def == OldIndex);
4225 S->start = NewIndex;
4226 S->valno->def = NewIndex;
4227 }
4228 };
4229 UpdateDefIndex(LI);
4230 for (auto &SR : LI.subranges())
4231 UpdateDefIndex(SR);
4232 }
4233 }
4234 }
4235
4236 if (U.RemoveMIUse) {
4237 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
4238 // The only user is the instruction which will be killed.
4239 Register DefReg = U.RemoveMIUse->getOperand(i: 0).getReg();
4240
4241 if (MRI.hasOneNonDBGUse(RegNo: DefReg)) {
4242 // We cannot just remove the DefMI here, calling pass will crash.
4243 U.RemoveMIUse->setDesc(get(Opcode: AMDGPU::IMPLICIT_DEF));
4244 U.RemoveMIUse->getOperand(i: 0).setIsDead(true);
4245 for (unsigned I = U.RemoveMIUse->getNumOperands() - 1; I != 0; --I)
4246 U.RemoveMIUse->removeOperand(OpNo: I);
4247 if (LV)
4248 LV->getVarInfo(Reg: DefReg).AliveBlocks.clear();
4249 }
4250
4251 if (MI.isBundle()) {
4252 VirtRegInfo VRI = AnalyzeVirtRegInBundle(MI, Reg: DefReg);
4253 if (!VRI.Reads && !VRI.Writes) {
4254 for (MachineOperand &MO : MI.all_uses()) {
4255 if (MO.isReg() && MO.getReg() == DefReg) {
4256 assert(MO.getSubReg() == 0 &&
4257 "tied sub-registers in bundles currently not supported");
4258 MI.removeOperand(OpNo: MO.getOperandNo());
4259 break;
4260 }
4261 }
4262
4263 if (LIS)
4264 LIS->shrinkToUses(li: &LIS->getInterval(Reg: DefReg));
4265 }
4266 } else if (LIS) {
4267 LiveInterval &DefLI = LIS->getInterval(Reg: DefReg);
4268
4269 // We cannot delete the original instruction here, so hack out the use
4270 // in the original instruction with a dummy register so we can use
4271 // shrinkToUses to deal with any multi-use edge cases. Other targets do
4272 // not have the complexity of deleting a use to consider here.
4273 Register DummyReg = MRI.cloneVirtualRegister(VReg: DefReg);
4274 for (MachineOperand &MIOp : MI.uses()) {
4275 if (MIOp.isReg() && MIOp.getReg() == DefReg) {
4276 MIOp.setIsUndef(true);
4277 MIOp.setReg(DummyReg);
4278 }
4279 }
4280
4281 if (MI.isBundle()) {
4282 VirtRegInfo VRI = AnalyzeVirtRegInBundle(MI, Reg: DefReg);
4283 if (!VRI.Reads && !VRI.Writes) {
4284 for (MachineOperand &MIOp : MI.uses()) {
4285 if (MIOp.isReg() && MIOp.getReg() == DefReg) {
4286 MIOp.setIsUndef(true);
4287 MIOp.setReg(DummyReg);
4288 }
4289 }
4290 }
4291
4292 MI.addOperand(Op: MachineOperand::CreateReg(Reg: DummyReg, isDef: false, isImp: false, isKill: false,
4293 isDead: false, /*isUndef=*/true));
4294 }
4295
4296 LIS->shrinkToUses(li: &DefLI);
4297 }
4298 }
4299
4300 return MI.isBundle() ? &MI : NewMI;
4301}
4302
4303MachineInstr *
4304SIInstrInfo::convertToThreeAddressImpl(MachineInstr &MI,
4305 ThreeAddressUpdates &U) const {
4306 MachineBasicBlock &MBB = *MI.getParent();
4307 unsigned Opc = MI.getOpcode();
4308
4309 // Handle MFMA.
4310 int NewMFMAOpc = AMDGPU::getMFMAEarlyClobberOp(Opcode: Opc);
4311 if (NewMFMAOpc != -1) {
4312 MachineInstrBuilder MIB =
4313 BuildMI(BB&: MBB, I&: MI, MIMD: MI.getDebugLoc(), MCID: get(Opcode: NewMFMAOpc));
4314 for (unsigned I = 0, E = MI.getNumExplicitOperands(); I != E; ++I)
4315 MIB.add(MO: MI.getOperand(i: I));
4316 return MIB;
4317 }
4318
4319 if (SIInstrInfo::isWMMA(MI)) {
4320 unsigned NewOpc = AMDGPU::mapWMMA2AddrTo3AddrOpcode(Opc: MI.getOpcode());
4321 MachineInstrBuilder MIB = BuildMI(BB&: MBB, I&: MI, MIMD: MI.getDebugLoc(), MCID: get(Opcode: NewOpc))
4322 .setMIFlags(MI.getFlags());
4323 for (unsigned I = 0, E = MI.getNumExplicitOperands(); I != E; ++I)
4324 MIB->addOperand(Op: MI.getOperand(i: I));
4325 return MIB;
4326 }
4327
4328 assert(Opc != AMDGPU::V_FMAC_F16_t16_e32 &&
4329 Opc != AMDGPU::V_FMAC_F16_fake16_e32 &&
4330 "V_FMAC_F16_t16/fake16_e32 is not supported and not expected to be "
4331 "present pre-RA");
4332
4333 // Handle MAC/FMAC.
4334 bool IsF64 = Opc == AMDGPU::V_FMAC_F64_e32 || Opc == AMDGPU::V_FMAC_F64_e64;
4335 bool IsLegacy = Opc == AMDGPU::V_MAC_LEGACY_F32_e32 ||
4336 Opc == AMDGPU::V_MAC_LEGACY_F32_e64 ||
4337 Opc == AMDGPU::V_FMAC_LEGACY_F32_e32 ||
4338 Opc == AMDGPU::V_FMAC_LEGACY_F32_e64;
4339 bool Src0Literal = false;
4340
4341 switch (Opc) {
4342 default:
4343 return nullptr;
4344 case AMDGPU::V_MAC_F16_e64:
4345 case AMDGPU::V_FMAC_F16_e64:
4346 case AMDGPU::V_FMAC_F16_t16_e64:
4347 case AMDGPU::V_FMAC_F16_fake16_e64:
4348 case AMDGPU::V_MAC_F32_e64:
4349 case AMDGPU::V_MAC_LEGACY_F32_e64:
4350 case AMDGPU::V_FMAC_F32_e64:
4351 case AMDGPU::V_FMAC_LEGACY_F32_e64:
4352 case AMDGPU::V_FMAC_F64_e64:
4353 break;
4354 case AMDGPU::V_MAC_F16_e32:
4355 case AMDGPU::V_FMAC_F16_e32:
4356 case AMDGPU::V_MAC_F32_e32:
4357 case AMDGPU::V_MAC_LEGACY_F32_e32:
4358 case AMDGPU::V_FMAC_F32_e32:
4359 case AMDGPU::V_FMAC_LEGACY_F32_e32:
4360 case AMDGPU::V_FMAC_F64_e32: {
4361 int Src0Idx = AMDGPU::getNamedOperandIdx(Opcode: MI.getOpcode(),
4362 Name: AMDGPU::OpName::src0);
4363 const MachineOperand *Src0 = &MI.getOperand(i: Src0Idx);
4364 if (!Src0->isReg() && !Src0->isImm())
4365 return nullptr;
4366
4367 if (Src0->isImm() && !isInlineConstant(MI, OpIdx: Src0Idx, MO: *Src0))
4368 Src0Literal = true;
4369
4370 break;
4371 }
4372 }
4373
4374 MachineInstrBuilder MIB;
4375 const MachineOperand *Dst = getNamedOperand(MI, OperandName: AMDGPU::OpName::vdst);
4376 const MachineOperand *Src0 = getNamedOperand(MI, OperandName: AMDGPU::OpName::src0);
4377 const MachineOperand *Src0Mods =
4378 getNamedOperand(MI, OperandName: AMDGPU::OpName::src0_modifiers);
4379 const MachineOperand *Src1 = getNamedOperand(MI, OperandName: AMDGPU::OpName::src1);
4380 const MachineOperand *Src1Mods =
4381 getNamedOperand(MI, OperandName: AMDGPU::OpName::src1_modifiers);
4382 const MachineOperand *Src2 = getNamedOperand(MI, OperandName: AMDGPU::OpName::src2);
4383 const MachineOperand *Src2Mods =
4384 getNamedOperand(MI, OperandName: AMDGPU::OpName::src2_modifiers);
4385 const MachineOperand *Clamp = getNamedOperand(MI, OperandName: AMDGPU::OpName::clamp);
4386 const MachineOperand *Omod = getNamedOperand(MI, OperandName: AMDGPU::OpName::omod);
4387 const MachineOperand *OpSel = getNamedOperand(MI, OperandName: AMDGPU::OpName::op_sel);
4388
4389 if (!Src0Mods && !Src1Mods && !Src2Mods && !Clamp && !Omod && !IsLegacy &&
4390 (!IsF64 || ST.hasFmaakFmamkF64Insts()) &&
4391 // If we have an SGPR input, we will violate the constant bus restriction.
4392 (ST.getConstantBusLimit(Opcode: Opc) > 1 || !Src0->isReg() ||
4393 !RI.isSGPRReg(MRI: MBB.getParent()->getRegInfo(), Reg: Src0->getReg()))) {
4394 MachineInstr *DefMI;
4395
4396 int64_t Imm;
4397 if (!Src0Literal && getFoldableImm(MO: Src2, Imm, DefMI: &DefMI)) {
4398 unsigned NewOpc = getNewFMAAKInst(ST, Opc);
4399 if (pseudoToMCOpcode(Opcode: NewOpc) != -1) {
4400 MIB = BuildMI(BB&: MBB, I&: MI, MIMD: MI.getDebugLoc(), MCID: get(Opcode: NewOpc))
4401 .add(MO: *Dst)
4402 .add(MO: *Src0)
4403 .add(MO: *Src1)
4404 .addImm(Val: Imm)
4405 .setMIFlags(MI.getFlags());
4406 U.RemoveMIUse = DefMI;
4407 return MIB;
4408 }
4409 }
4410 unsigned NewOpc = getNewFMAMKInst(ST, Opc);
4411 if (!Src0Literal && getFoldableImm(MO: Src1, Imm, DefMI: &DefMI)) {
4412 if (pseudoToMCOpcode(Opcode: NewOpc) != -1) {
4413 MIB = BuildMI(BB&: MBB, I&: MI, MIMD: MI.getDebugLoc(), MCID: get(Opcode: NewOpc))
4414 .add(MO: *Dst)
4415 .add(MO: *Src0)
4416 .addImm(Val: Imm)
4417 .add(MO: *Src2)
4418 .setMIFlags(MI.getFlags());
4419 U.RemoveMIUse = DefMI;
4420 return MIB;
4421 }
4422 }
4423 if (Src0Literal || getFoldableImm(MO: Src0, Imm, DefMI: &DefMI)) {
4424 if (Src0Literal) {
4425 Imm = Src0->getImm();
4426 DefMI = nullptr;
4427 }
4428 if (pseudoToMCOpcode(Opcode: NewOpc) != -1 &&
4429 isOperandLegal(
4430 MI, OpIdx: AMDGPU::getNamedOperandIdx(Opcode: NewOpc, Name: AMDGPU::OpName::src0),
4431 MO: Src1)) {
4432 MIB = BuildMI(BB&: MBB, I&: MI, MIMD: MI.getDebugLoc(), MCID: get(Opcode: NewOpc))
4433 .add(MO: *Dst)
4434 .add(MO: *Src1)
4435 .addImm(Val: Imm)
4436 .add(MO: *Src2)
4437 .setMIFlags(MI.getFlags());
4438 U.RemoveMIUse = DefMI;
4439 return MIB;
4440 }
4441 }
4442 }
4443
4444 // VOP2 mac/fmac with a literal operand cannot be converted to VOP3 mad/fma
4445 // if VOP3 does not allow a literal operand.
4446 if (Src0Literal && !ST.hasVOP3Literal())
4447 return nullptr;
4448
4449 unsigned NewOpc = getNewFMAInst(ST, Opc);
4450
4451 if (pseudoToMCOpcode(Opcode: NewOpc) == -1)
4452 return nullptr;
4453
4454 MIB = BuildMI(BB&: MBB, I&: MI, MIMD: MI.getDebugLoc(), MCID: get(Opcode: NewOpc))
4455 .add(MO: *Dst)
4456 .addImm(Val: Src0Mods ? Src0Mods->getImm() : 0)
4457 .add(MO: *Src0)
4458 .addImm(Val: Src1Mods ? Src1Mods->getImm() : 0)
4459 .add(MO: *Src1)
4460 .addImm(Val: Src2Mods ? Src2Mods->getImm() : 0)
4461 .add(MO: *Src2)
4462 .addImm(Val: Clamp ? Clamp->getImm() : 0)
4463 .addImm(Val: Omod ? Omod->getImm() : 0)
4464 .setMIFlags(MI.getFlags());
4465 if (AMDGPU::hasNamedOperand(Opcode: NewOpc, NamedIdx: AMDGPU::OpName::op_sel))
4466 MIB.addImm(Val: OpSel ? OpSel->getImm() : 0);
4467 return MIB;
4468}
4469
// It's not generally safe to move VALU instructions across these, since they
// would then use the register as a base index rather than directly.
4472// XXX - Why isn't hasSideEffects sufficient for these?
4473static bool changesVGPRIndexingMode(const MachineInstr &MI) {
4474 switch (MI.getOpcode()) {
4475 case AMDGPU::S_SET_GPR_IDX_ON:
4476 case AMDGPU::S_SET_GPR_IDX_MODE:
4477 case AMDGPU::S_SET_GPR_IDX_OFF:
4478 return true;
4479 default:
4480 return false;
4481 }
4482}
4483
4484bool SIInstrInfo::isSchedulingBoundary(const MachineInstr &MI,
4485 const MachineBasicBlock *MBB,
4486 const MachineFunction &MF) const {
4487  // We skip the base implementation's check for SP writes; it was apparently
4488  // added only due to compile-time concerns.
4489 //
4490 // TODO: Do we really want this barrier? It triggers unnecessary hazard nops
4491 // but is probably avoidable.
4492
4493 // Copied from base implementation.
4494 // Terminators and labels can't be scheduled around.
4495 if (MI.isTerminator() || MI.isPosition())
4496 return true;
4497
4498 // INLINEASM_BR can jump to another block
4499 if (MI.getOpcode() == TargetOpcode::INLINEASM_BR)
4500 return true;
4501
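  // A SCHED_BARRIER with a zero mask allows no instructions to be scheduled
  // across it.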
4502 if (MI.getOpcode() == AMDGPU::SCHED_BARRIER && MI.getOperand(i: 0).getImm() == 0)
4503 return true;
4504
4505 // Target-independent instructions do not have an implicit-use of EXEC, even
4506 // when they operate on VGPRs. Treating EXEC modifications as scheduling
4507 // boundaries prevents incorrect movements of such instructions.
4508 return MI.modifiesRegister(Reg: AMDGPU::EXEC, TRI: &RI) ||
4509 MI.getOpcode() == AMDGPU::S_SETREG_IMM32_B32 ||
4510 MI.getOpcode() == AMDGPU::S_SETREG_B32 ||
4511 MI.getOpcode() == AMDGPU::S_SETPRIO ||
4512 MI.getOpcode() == AMDGPU::S_SETPRIO_INC_WG ||
4513 changesVGPRIndexingMode(MI);
4514}
4515
4516bool SIInstrInfo::isAlwaysGDS(uint32_t Opcode) const {
4517 return Opcode == AMDGPU::DS_ORDERED_COUNT ||
4518 Opcode == AMDGPU::DS_ADD_GS_REG_RTN ||
4519 Opcode == AMDGPU::DS_SUB_GS_REG_RTN || isGWS(Opcode);
4520}
4521
4522bool SIInstrInfo::mayAccessScratch(const MachineInstr &MI) const {
4523  // Instructions that access scratch use the FLAT or BUF encodings.
4524 if ((!isFLAT(MI) || isFLATGlobal(MI)) && !isBUF(MI))
4525 return false;
4526
4527 // SCRATCH instructions always access scratch.
4528 if (isFLATScratch(MI))
4529 return true;
4530
4531 // If FLAT_SCRATCH registers are not initialized, we can never access scratch
4532 // via the aperture.
4533 if (MI.getMF()->getFunction().hasFnAttribute(Kind: "amdgpu-no-flat-scratch-init"))
4534 return false;
4535
4536  // If there are no memory operands then conservatively assume the operation
4537  // may access scratch.
4538 if (MI.memoperands_empty())
4539 return true;
4540
4541 // See if any memory operand specifies an address space that involves scratch.
4542 return any_of(Range: MI.memoperands(), P: [](const MachineMemOperand *Memop) {
4543 unsigned AS = Memop->getAddrSpace();
4544 if (AS == AMDGPUAS::FLAT_ADDRESS) {
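      // A generic flat access may still reach scratch unless !noalias.addrspace
      // metadata rules out the private address space.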
4545 const MDNode *MD = Memop->getAAInfo().NoAliasAddrSpace;
4546 return !MD || !AMDGPU::hasValueInRangeLikeMetadata(
4547 MD: *MD, Val: AMDGPUAS::PRIVATE_ADDRESS);
4548 }
4549 return AS == AMDGPUAS::PRIVATE_ADDRESS;
4550 });
4551}
4552
4553bool SIInstrInfo::mayAccessVMEMThroughFlat(const MachineInstr &MI) const {
4554 assert(isFLAT(MI));
4555
4556 // All flat instructions use the VMEM counter except prefetch.
4557 if (!usesVM_CNT(MI))
4558 return false;
4559
4560 // If there are no memory operands then conservatively assume the flat
4561 // operation may access VMEM.
4562 if (MI.memoperands_empty())
4563 return true;
4564
4565 // See if any memory operand specifies an address space that involves VMEM.
4566  // Flat operations only support FLAT, LOCAL (LDS), or address spaces
4567 // involving VMEM such as GLOBAL, CONSTANT, PRIVATE (SCRATCH), etc. The REGION
4568 // (GDS) address space is not supported by flat operations. Therefore, simply
4569 // return true unless only the LDS address space is found.
4570 for (const MachineMemOperand *Memop : MI.memoperands()) {
4571 unsigned AS = Memop->getAddrSpace();
4572 assert(AS != AMDGPUAS::REGION_ADDRESS);
4573 if (AS != AMDGPUAS::LOCAL_ADDRESS)
4574 return true;
4575 }
4576
4577 return false;
4578}
4579
4580bool SIInstrInfo::mayAccessLDSThroughFlat(const MachineInstr &MI) const {
4581 assert(isFLAT(MI));
4582
4583  // Flat instructions such as SCRATCH and GLOBAL do not use the LGKM counter.
4584 if (!usesLGKM_CNT(MI))
4585 return false;
4586
4587 // If in tgsplit mode then there can be no use of LDS.
4588 if (ST.isTgSplitEnabled())
4589 return false;
4590
4591 // If there are no memory operands then conservatively assume the flat
4592 // operation may access LDS.
4593 if (MI.memoperands_empty())
4594 return true;
4595
4596 // See if any memory operand specifies an address space that involves LDS.
4597 for (const MachineMemOperand *Memop : MI.memoperands()) {
4598 unsigned AS = Memop->getAddrSpace();
4599 if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::FLAT_ADDRESS)
4600 return true;
4601 }
4602
4603 return false;
4604}
4605
4606bool SIInstrInfo::modifiesModeRegister(const MachineInstr &MI) {
4607  // Skip the full operand and register-alias search that modifiesRegister does.
4608  // Only a handful of instructions touch MODE, it is only ever an implicit def,
4609  // and it does not alias any other registers.
4610 return is_contained(Range: MI.getDesc().implicit_defs(), Element: AMDGPU::MODE);
4611}
4612
4613bool SIInstrInfo::hasUnwantedEffectsWhenEXECEmpty(const MachineInstr &MI) const {
4614 unsigned Opcode = MI.getOpcode();
4615
4616 if (MI.mayStore() && isSMRD(MI))
4617 return true; // scalar store or atomic
4618
4619 // This will terminate the function when other lanes may need to continue.
4620 if (MI.isReturn())
4621 return true;
4622
4623 // These instructions cause shader I/O that may cause hardware lockups
4624 // when executed with an empty EXEC mask.
4625 //
4626 // Note: exp with VM = DONE = 0 is automatically skipped by hardware when
4627 // EXEC = 0, but checking for that case here seems not worth it
4628 // given the typical code patterns.
4629 if (Opcode == AMDGPU::S_SENDMSG || Opcode == AMDGPU::S_SENDMSGHALT ||
4630 isEXP(Opcode) || Opcode == AMDGPU::DS_ORDERED_COUNT ||
4631 Opcode == AMDGPU::S_TRAP || Opcode == AMDGPU::S_WAIT_EVENT ||
4632 Opcode == AMDGPU::S_SETHALT)
4633 return true;
4634
4635 if (MI.isCall() || MI.isInlineAsm())
4636 return true; // conservative assumption
4637
4638 // Assume that barrier interactions are only intended with active lanes.
4639 if (isBarrier(Opcode))
4640 return true;
4641
4642 // A mode change is a scalar operation that influences vector instructions.
4643 if (modifiesModeRegister(MI))
4644 return true;
4645
4646 // These are like SALU instructions in terms of effects, so it's questionable
4647 // whether we should return true for those.
4648 //
4649 // However, executing them with EXEC = 0 causes them to operate on undefined
4650 // data, which we avoid by returning true here.
4651 if (Opcode == AMDGPU::V_READFIRSTLANE_B32 ||
4652 Opcode == AMDGPU::V_READLANE_B32 || Opcode == AMDGPU::V_WRITELANE_B32 ||
4653 Opcode == AMDGPU::SI_RESTORE_S32_FROM_VGPR ||
4654 Opcode == AMDGPU::SI_SPILL_S32_TO_VGPR)
4655 return true;
4656
4657 return false;
4658}
4659
4660bool SIInstrInfo::mayReadEXEC(const MachineRegisterInfo &MRI,
4661 const MachineInstr &MI) const {
4662 if (MI.isMetaInstruction())
4663 return false;
4664
4665 // This won't read exec if this is an SGPR->SGPR copy.
4666 if (MI.isCopyLike()) {
4667 if (!RI.isSGPRReg(MRI, Reg: MI.getOperand(i: 0).getReg()))
4668 return true;
4669
4670 // Make sure this isn't copying exec as a normal operand
4671 return MI.readsRegister(Reg: AMDGPU::EXEC, TRI: &RI);
4672 }
4673
4674 // Make a conservative assumption about the callee.
4675 if (MI.isCall())
4676 return true;
4677
4678 // Be conservative with any unhandled generic opcodes.
4679 if (!isTargetSpecificOpcode(Opcode: MI.getOpcode()))
4680 return true;
4681
4682 return !isSALU(MI) || MI.readsRegister(Reg: AMDGPU::EXEC, TRI: &RI);
4683}
4684
4685bool SIInstrInfo::isInlineConstant(const APInt &Imm) const {
4686 switch (Imm.getBitWidth()) {
4687 case 1: // This likely will be a condition code mask.
4688 return true;
4689
4690 case 32:
4691 return AMDGPU::isInlinableLiteral32(Literal: Imm.getSExtValue(),
4692 HasInv2Pi: ST.hasInv2PiInlineImm());
4693 case 64:
4694 return AMDGPU::isInlinableLiteral64(Literal: Imm.getSExtValue(),
4695 HasInv2Pi: ST.hasInv2PiInlineImm());
4696 case 16:
4697 return ST.has16BitInsts() &&
4698 AMDGPU::isInlinableLiteralI16(Literal: Imm.getSExtValue(),
4699 HasInv2Pi: ST.hasInv2PiInlineImm());
4700 default:
4701 llvm_unreachable("invalid bitwidth");
4702 }
4703}
4704
4705bool SIInstrInfo::isInlineConstant(const APFloat &Imm) const {
4706 APInt IntImm = Imm.bitcastToAPInt();
4707 int64_t IntImmVal = IntImm.getSExtValue();
4708 bool HasInv2Pi = ST.hasInv2PiInlineImm();
4709 switch (APFloat::SemanticsToEnum(Sem: Imm.getSemantics())) {
4710 default:
4711 llvm_unreachable("invalid fltSemantics");
4712 case APFloatBase::S_IEEEsingle:
4713 case APFloatBase::S_IEEEdouble:
4714 return isInlineConstant(Imm: IntImm);
4715 case APFloatBase::S_BFloat:
4716 return ST.has16BitInsts() &&
4717 AMDGPU::isInlinableLiteralBF16(Literal: IntImmVal, HasInv2Pi);
4718 case APFloatBase::S_IEEEhalf:
4719 return ST.has16BitInsts() &&
4720 AMDGPU::isInlinableLiteralFP16(Literal: IntImmVal, HasInv2Pi);
4721 }
4722}
4723
4724bool SIInstrInfo::isInlineConstant(int64_t Imm, uint8_t OperandType) const {
4725 // MachineOperand provides no way to tell the true operand size, since it only
4726 // records a 64-bit value. We need to know the size to determine if a 32-bit
4727 // floating point immediate bit pattern is legal for an integer immediate. It
4728 // would be for any 32-bit integer operand, but would not be for a 64-bit one.
4729 switch (OperandType) {
4730 case AMDGPU::OPERAND_REG_IMM_INT32:
4731 case AMDGPU::OPERAND_REG_IMM_FP32:
4732 case AMDGPU::OPERAND_REG_INLINE_C_INT32:
4733 case AMDGPU::OPERAND_REG_INLINE_C_FP32:
4734 case AMDGPU::OPERAND_REG_IMM_V2FP32:
4735 case AMDGPU::OPERAND_REG_IMM_V2INT32:
4736 case AMDGPU::OPERAND_REG_INLINE_AC_INT32:
4737 case AMDGPU::OPERAND_REG_INLINE_AC_FP32:
4738 case AMDGPU::OPERAND_INLINE_SPLIT_BARRIER_INT32: {
4739 int32_t Trunc = static_cast<int32_t>(Imm);
4740 return AMDGPU::isInlinableLiteral32(Literal: Trunc, HasInv2Pi: ST.hasInv2PiInlineImm());
4741 }
4742 case AMDGPU::OPERAND_REG_IMM_INT64:
4743 case AMDGPU::OPERAND_REG_IMM_FP64:
4744 case AMDGPU::OPERAND_REG_INLINE_C_INT64:
4745 case AMDGPU::OPERAND_REG_INLINE_C_FP64:
4746 case AMDGPU::OPERAND_REG_INLINE_AC_FP64:
4747 return AMDGPU::isInlinableLiteral64(Literal: Imm, HasInv2Pi: ST.hasInv2PiInlineImm());
4748 case AMDGPU::OPERAND_REG_IMM_INT16:
4749 case AMDGPU::OPERAND_REG_INLINE_C_INT16:
4750    // We would expect inline immediates to not be concerned with an integer/fp
4751    // distinction. However, in the case of 16-bit integer operations, the
4752    // "floating point" values appear to not work. The hardware seems to read the
4753    // low 16 bits of 32-bit immediates, which happens to always work for the
4754    // integer values.
4755 //
4756 // See llvm bugzilla 46302.
4757 //
4758 // TODO: Theoretically we could use op-sel to use the high bits of the
4759 // 32-bit FP values.
4760 return AMDGPU::isInlinableIntLiteral(Literal: Imm);
4761 case AMDGPU::OPERAND_REG_IMM_V2INT16:
4762 case AMDGPU::OPERAND_REG_INLINE_C_V2INT16:
4763 return AMDGPU::isInlinableLiteralV2I16(Literal: Imm);
4764 case AMDGPU::OPERAND_REG_IMM_V2FP16:
4765 case AMDGPU::OPERAND_REG_INLINE_C_V2FP16:
4766 return AMDGPU::isInlinableLiteralV2F16(Literal: Imm);
4767 case AMDGPU::OPERAND_REG_IMM_V2FP16_SPLAT:
4768 return AMDGPU::isPKFMACF16InlineConstant(Literal: Imm, IsGFX11Plus: ST.isGFX11Plus());
4769 case AMDGPU::OPERAND_REG_IMM_V2BF16:
4770 case AMDGPU::OPERAND_REG_INLINE_C_V2BF16:
4771 return AMDGPU::isInlinableLiteralV2BF16(Literal: Imm);
4772 case AMDGPU::OPERAND_REG_IMM_NOINLINE_V2FP16:
4773 return false;
4774 case AMDGPU::OPERAND_REG_IMM_FP16:
4775 case AMDGPU::OPERAND_REG_INLINE_C_FP16: {
4776 if (isInt<16>(x: Imm) || isUInt<16>(x: Imm)) {
4777 // A few special case instructions have 16-bit operands on subtargets
4778 // where 16-bit instructions are not legal.
4779 // TODO: Do the 32-bit immediates work? We shouldn't really need to handle
4780 // constants in these cases
4781 int16_t Trunc = static_cast<int16_t>(Imm);
4782 return ST.has16BitInsts() &&
4783 AMDGPU::isInlinableLiteralFP16(Literal: Trunc, HasInv2Pi: ST.hasInv2PiInlineImm());
4784 }
4785
4786 return false;
4787 }
4788 case AMDGPU::OPERAND_REG_IMM_BF16:
4789 case AMDGPU::OPERAND_REG_INLINE_C_BF16: {
4790 if (isInt<16>(x: Imm) || isUInt<16>(x: Imm)) {
4791 int16_t Trunc = static_cast<int16_t>(Imm);
4792 return ST.has16BitInsts() &&
4793 AMDGPU::isInlinableLiteralBF16(Literal: Trunc, HasInv2Pi: ST.hasInv2PiInlineImm());
4794 }
4795 return false;
4796 }
4797 case AMDGPU::OPERAND_KIMM32:
4798 case AMDGPU::OPERAND_KIMM16:
4799 case AMDGPU::OPERAND_KIMM64:
4800 return false;
4801 case AMDGPU::OPERAND_INLINE_C_AV64_PSEUDO:
4802 return isLegalAV64PseudoImm(Imm);
4803 case AMDGPU::OPERAND_INPUT_MODS:
4804 case MCOI::OPERAND_IMMEDIATE:
4805 // Always embedded in the instruction for free.
4806 return true;
4807 case MCOI::OPERAND_UNKNOWN:
4808 case MCOI::OPERAND_REGISTER:
4809 case MCOI::OPERAND_PCREL:
4810 case MCOI::OPERAND_GENERIC_0:
4811 case MCOI::OPERAND_GENERIC_1:
4812 case MCOI::OPERAND_GENERIC_2:
4813 case MCOI::OPERAND_GENERIC_3:
4814 case MCOI::OPERAND_GENERIC_4:
4815 case MCOI::OPERAND_GENERIC_5:
4816 // Just ignore anything else.
4817 return true;
4818 default:
4819 llvm_unreachable("invalid operand type");
4820 }
4821}
4822
4823static bool compareMachineOp(const MachineOperand &Op0,
4824 const MachineOperand &Op1) {
4825 if (Op0.getType() != Op1.getType())
4826 return false;
4827
4828 switch (Op0.getType()) {
4829 case MachineOperand::MO_Register:
4830 return Op0.getReg() == Op1.getReg();
4831 case MachineOperand::MO_Immediate:
4832 return Op0.getImm() == Op1.getImm();
4833 default:
4834 llvm_unreachable("Didn't expect to be comparing these operand types");
4835 }
4836}
4837
4838bool SIInstrInfo::isLiteralOperandLegal(const MCInstrDesc &InstDesc,
4839 const MCOperandInfo &OpInfo) const {
4840 if (OpInfo.OperandType == MCOI::OPERAND_IMMEDIATE)
4841 return true;
4842
4843 if (!RI.opCanUseLiteralConstant(OpType: OpInfo.OperandType))
4844 return false;
4845
4846 if (!isVOP3(Desc: InstDesc) || !AMDGPU::isSISrcOperand(OpInfo))
4847 return true;
4848
4849 return ST.hasVOP3Literal();
4850}
4851
4852bool SIInstrInfo::isImmOperandLegal(const MCInstrDesc &InstDesc, unsigned OpNo,
4853 int64_t ImmVal) const {
4854 const MCOperandInfo &OpInfo = InstDesc.operands()[OpNo];
4855 if (isInlineConstant(Imm: ImmVal, OperandType: OpInfo.OperandType)) {
4856 if (isMAI(Desc: InstDesc) && ST.hasMFMAInlineLiteralBug() &&
4857 OpNo == (unsigned)AMDGPU::getNamedOperandIdx(Opcode: InstDesc.getOpcode(),
4858 Name: AMDGPU::OpName::src2))
4859 return false;
4860 return RI.opCanUseInlineConstant(OpType: OpInfo.OperandType);
4861 }
4862
4863 return isLiteralOperandLegal(InstDesc, OpInfo);
4864}
4865
4866bool SIInstrInfo::isImmOperandLegal(const MCInstrDesc &InstDesc, unsigned OpNo,
4867 const MachineOperand &MO) const {
4868 if (MO.isImm())
4869 return isImmOperandLegal(InstDesc, OpNo, ImmVal: MO.getImm());
4870
4871 assert((MO.isTargetIndex() || MO.isFI() || MO.isGlobal()) &&
4872 "unexpected imm-like operand kind");
4873 const MCOperandInfo &OpInfo = InstDesc.operands()[OpNo];
4874 return isLiteralOperandLegal(InstDesc, OpInfo);
4875}
4876
4877bool SIInstrInfo::isLegalAV64PseudoImm(uint64_t Imm) const {
4878  // Two 32-bit inline constants packed into one 64-bit immediate.
4879 return AMDGPU::isInlinableLiteral32(Literal: Lo_32(Value: Imm), HasInv2Pi: ST.hasInv2PiInlineImm()) &&
4880 AMDGPU::isInlinableLiteral32(Literal: Hi_32(Value: Imm), HasInv2Pi: ST.hasInv2PiInlineImm());
4881}
4882
4883bool SIInstrInfo::hasVALU32BitEncoding(unsigned Opcode) const {
4884 // GFX90A does not have V_MUL_LEGACY_F32_e32.
4885 if (Opcode == AMDGPU::V_MUL_LEGACY_F32_e64 && ST.hasGFX90AInsts())
4886 return false;
4887
4888 int Op32 = AMDGPU::getVOPe32(Opcode);
4889 if (Op32 == -1)
4890 return false;
4891
4892 return pseudoToMCOpcode(Opcode: Op32) != -1;
4893}
4894
4895bool SIInstrInfo::hasModifiers(unsigned Opcode) const {
4896  // The src0_modifiers operand is present on all instructions that have
4897  // modifiers.
4898
4899 return AMDGPU::hasNamedOperand(Opcode, NamedIdx: AMDGPU::OpName::src0_modifiers);
4900}
4901
4902bool SIInstrInfo::hasModifiersSet(const MachineInstr &MI,
4903 AMDGPU::OpName OpName) const {
4904 const MachineOperand *Mods = getNamedOperand(MI, OperandName: OpName);
4905 return Mods && Mods->getImm();
4906}
4907
4908bool SIInstrInfo::hasAnyModifiersSet(const MachineInstr &MI) const {
4909 return any_of(Range: ModifierOpNames,
4910 P: [&](AMDGPU::OpName Name) { return hasModifiersSet(MI, OpName: Name); });
4911}
4912
4913bool SIInstrInfo::canShrink(const MachineInstr &MI,
4914 const MachineRegisterInfo &MRI) const {
4915 const MachineOperand *Src2 = getNamedOperand(MI, OperandName: AMDGPU::OpName::src2);
4916  // Can't shrink instructions with three operands.
4917 if (Src2) {
4918 switch (MI.getOpcode()) {
4919 default: return false;
4920
4921 case AMDGPU::V_ADDC_U32_e64:
4922 case AMDGPU::V_SUBB_U32_e64:
4923 case AMDGPU::V_SUBBREV_U32_e64: {
4924 const MachineOperand *Src1
4925 = getNamedOperand(MI, OperandName: AMDGPU::OpName::src1);
4926 if (!Src1->isReg() || !RI.isVGPR(MRI, Reg: Src1->getReg()))
4927 return false;
4928 // Additional verification is needed for sdst/src2.
4929 return true;
4930 }
4931 case AMDGPU::V_MAC_F16_e64:
4932 case AMDGPU::V_MAC_F32_e64:
4933 case AMDGPU::V_MAC_LEGACY_F32_e64:
4934 case AMDGPU::V_FMAC_F16_e64:
4935 case AMDGPU::V_FMAC_F16_t16_e64:
4936 case AMDGPU::V_FMAC_F16_fake16_e64:
4937 case AMDGPU::V_FMAC_F32_e64:
4938 case AMDGPU::V_FMAC_F64_e64:
4939 case AMDGPU::V_FMAC_LEGACY_F32_e64:
4940 if (!Src2->isReg() || !RI.isVGPR(MRI, Reg: Src2->getReg()) ||
4941 hasModifiersSet(MI, OpName: AMDGPU::OpName::src2_modifiers))
4942 return false;
4943 break;
4944
4945 case AMDGPU::V_CNDMASK_B32_e64:
4946 break;
4947 }
4948 }
4949
4950 const MachineOperand *Src1 = getNamedOperand(MI, OperandName: AMDGPU::OpName::src1);
4951 if (Src1 && (!Src1->isReg() || !RI.isVGPR(MRI, Reg: Src1->getReg()) ||
4952 hasModifiersSet(MI, OpName: AMDGPU::OpName::src1_modifiers)))
4953 return false;
4954
4955  // We don't need to check src0; all input types are legal there, so just make
4956  // sure src0 isn't using any modifiers.
4957 if (hasModifiersSet(MI, OpName: AMDGPU::OpName::src0_modifiers))
4958 return false;
4959
4960  // Can it be shrunk to a valid 32-bit opcode?
4961 if (!hasVALU32BitEncoding(Opcode: MI.getOpcode()))
4962 return false;
4963
4964 // Check output modifiers
4965 return !hasModifiersSet(MI, OpName: AMDGPU::OpName::omod) &&
4966 !hasModifiersSet(MI, OpName: AMDGPU::OpName::clamp) &&
4967 !hasModifiersSet(MI, OpName: AMDGPU::OpName::byte_sel) &&
4968 // TODO: Can we avoid checking bound_ctrl/fi here?
4969 // They are only used by permlane*_swap special case.
4970 !hasModifiersSet(MI, OpName: AMDGPU::OpName::bound_ctrl) &&
4971 !hasModifiersSet(MI, OpName: AMDGPU::OpName::fi);
4972}
4973
4974// Copy the undef/kill flags from \p Orig onto the implicit VCC use operand,
4975// keeping it marked as implicit.
4976static void copyFlagsToImplicitVCC(MachineInstr &MI,
4977 const MachineOperand &Orig) {
4978
4979 for (MachineOperand &Use : MI.implicit_operands()) {
4980 if (Use.isUse() &&
4981 (Use.getReg() == AMDGPU::VCC || Use.getReg() == AMDGPU::VCC_LO)) {
4982 Use.setIsUndef(Orig.isUndef());
4983 Use.setIsKill(Orig.isKill());
4984 return;
4985 }
4986 }
4987}
4988
4989MachineInstr *SIInstrInfo::buildShrunkInst(MachineInstr &MI,
4990 unsigned Op32) const {
4991 MachineBasicBlock *MBB = MI.getParent();
4992
4993 const MCInstrDesc &Op32Desc = get(Opcode: Op32);
4994 MachineInstrBuilder Inst32 =
4995 BuildMI(BB&: *MBB, I&: MI, MIMD: MI.getDebugLoc(), MCID: Op32Desc)
4996 .setMIFlags(MI.getFlags());
4997
4998 // Add the dst operand if the 32-bit encoding also has an explicit $vdst.
4999 // For VOPC instructions, this is replaced by an implicit def of vcc.
5000
5001 // We assume the defs of the shrunk opcode are in the same order, and the
5002 // shrunk opcode loses the last def (SGPR def, in the VOP3->VOPC case).
5003 for (int I = 0, E = Op32Desc.getNumDefs(); I != E; ++I)
5004 Inst32.add(MO: MI.getOperand(i: I));
5005
5006 const MachineOperand *Src2 = getNamedOperand(MI, OperandName: AMDGPU::OpName::src2);
5007
5008 int Idx = MI.getNumExplicitDefs();
5009 for (const MachineOperand &Use : MI.explicit_uses()) {
5010 int OpTy = MI.getDesc().operands()[Idx++].OperandType;
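    // Source-modifier and plain immediate operands have no slot in the 32-bit
    // encoding, so drop them.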
5011 if (OpTy == AMDGPU::OPERAND_INPUT_MODS || OpTy == MCOI::OPERAND_IMMEDIATE)
5012 continue;
5013
5014 if (&Use == Src2) {
5015 if (AMDGPU::getNamedOperandIdx(Opcode: Op32, Name: AMDGPU::OpName::src2) == -1) {
5016 // In the case of V_CNDMASK_B32_e32, the explicit operand src2 is
5017 // replaced with an implicit read of vcc or vcc_lo. The implicit read
5018 // of vcc was already added during the initial BuildMI, but we
5019 // 1) may need to change vcc to vcc_lo to preserve the original register
5020 // 2) have to preserve the original flags.
5021 copyFlagsToImplicitVCC(MI&: *Inst32, Orig: *Src2);
5022 continue;
5023 }
5024 }
5025
5026 Inst32.add(MO: Use);
5027 }
5028
5029 // FIXME: Losing implicit operands
5030 fixImplicitOperands(MI&: *Inst32);
5031 return Inst32;
5032}
5033
5034bool SIInstrInfo::physRegUsesConstantBus(const MachineOperand &RegOp) const {
5035 // Null is free
5036 Register Reg = RegOp.getReg();
5037 if (Reg == AMDGPU::SGPR_NULL || Reg == AMDGPU::SGPR_NULL64)
5038 return false;
5039
5040 // SGPRs use the constant bus
5041
5042 // FIXME: implicit registers that are not part of the MCInstrDesc's implicit
5043 // physical register operands should also count, except for exec.
5044 if (RegOp.isImplicit())
5045 return Reg == AMDGPU::VCC || Reg == AMDGPU::VCC_LO || Reg == AMDGPU::M0;
5046
5047 // SGPRs use the constant bus
5048 return AMDGPU::SReg_32RegClass.contains(Reg) ||
5049 AMDGPU::SReg_64RegClass.contains(Reg);
5050}
5051
5052bool SIInstrInfo::regUsesConstantBus(const MachineOperand &RegOp,
5053 const MachineRegisterInfo &MRI) const {
5054 Register Reg = RegOp.getReg();
5055 return Reg.isVirtual() ? RI.isSGPRClass(RC: MRI.getRegClass(Reg))
5056 : physRegUsesConstantBus(RegOp);
5057}
5058
5059bool SIInstrInfo::usesConstantBus(const MachineRegisterInfo &MRI,
5060 const MachineOperand &MO,
5061 const MCOperandInfo &OpInfo) const {
5062 // Literal constants use the constant bus.
5063 if (!MO.isReg())
5064 return !isInlineConstant(MO, OpInfo);
5065
5066 Register Reg = MO.getReg();
5067 return Reg.isVirtual() ? RI.isSGPRClass(RC: MRI.getRegClass(Reg))
5068 : physRegUsesConstantBus(RegOp: MO);
5069}
5070
5071static Register findImplicitSGPRRead(const MachineInstr &MI) {
5072 for (const MachineOperand &MO : MI.implicit_operands()) {
5073 // We only care about reads.
5074 if (MO.isDef())
5075 continue;
5076
5077 switch (MO.getReg()) {
5078 case AMDGPU::VCC:
5079 case AMDGPU::VCC_LO:
5080 case AMDGPU::VCC_HI:
5081 case AMDGPU::M0:
5082 case AMDGPU::FLAT_SCR:
5083 return MO.getReg();
5084
5085 default:
5086 break;
5087 }
5088 }
5089
5090 return Register();
5091}
5092
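// Whether \p MI is expected to carry an implicit use of EXEC. Most VALU and
// memory instructions read EXEC; lane-access opcodes and scalar instructions
// do not.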
5093static bool shouldReadExec(const MachineInstr &MI) {
5094 if (SIInstrInfo::isVALU(MI)) {
5095 switch (MI.getOpcode()) {
5096 case AMDGPU::V_READLANE_B32:
5097 case AMDGPU::SI_RESTORE_S32_FROM_VGPR:
5098 case AMDGPU::V_WRITELANE_B32:
5099 case AMDGPU::SI_SPILL_S32_TO_VGPR:
5100 return false;
5101 }
5102
5103 return true;
5104 }
5105
5106 if (MI.isPreISelOpcode() ||
5107 SIInstrInfo::isGenericOpcode(Opc: MI.getOpcode()) ||
5108 SIInstrInfo::isSALU(MI) ||
5109 SIInstrInfo::isSMRD(MI))
5110 return false;
5111
5112 return true;
5113}
5114
5115static bool isRegOrFI(const MachineOperand &MO) {
5116 return MO.isReg() || MO.isFI();
5117}
5118
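// Returns true if \p SubReg refers to a sub-register of \p SuperVec: a physical
// sub-register for physical registers, otherwise a subregister-indexed use of
// the same virtual register.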
5119static bool isSubRegOf(const SIRegisterInfo &TRI,
5120 const MachineOperand &SuperVec,
5121 const MachineOperand &SubReg) {
5122 if (SubReg.getReg().isPhysical())
5123 return TRI.isSubRegister(RegA: SuperVec.getReg(), RegB: SubReg.getReg());
5124
5125 return SubReg.getSubReg() != AMDGPU::NoSubRegister &&
5126 SubReg.getReg() == SuperVec.getReg();
5127}
5128
5129// Verify that a generic COPY does not illegally copy a vector register to an SGPR.
5130bool SIInstrInfo::verifyCopy(const MachineInstr &MI,
5131 const MachineRegisterInfo &MRI,
5132 StringRef &ErrInfo) const {
5133 Register DstReg = MI.getOperand(i: 0).getReg();
5134 Register SrcReg = MI.getOperand(i: 1).getReg();
5135  // Check for a copy from a vector register to an SGPR.
5136 if (RI.isVectorRegister(MRI, Reg: SrcReg) && RI.isSGPRReg(MRI, Reg: DstReg)) {
5137 ErrInfo = "illegal copy from vector register to SGPR";
5138 return false;
5139 }
5140 return true;
5141}
5142
5143bool SIInstrInfo::verifyInstruction(const MachineInstr &MI,
5144 StringRef &ErrInfo) const {
5145 uint32_t Opcode = MI.getOpcode();
5146 const MachineFunction *MF = MI.getMF();
5147 const MachineRegisterInfo &MRI = MF->getRegInfo();
5148
5149  // FIXME: The COPY check is currently done only for non-SSA forms. Find a
5150  // better property to recognize the point where instruction selection has
5151  // finished.
5152  // We can only enforce this check after the SIFixSGPRCopies pass, once the
5153  // illegal copies have been legalized and no later pass is expected to insert
5154  // similar copies.
5155 if (!MRI.isSSA() && MI.isCopy())
5156 return verifyCopy(MI, MRI, ErrInfo);
5157
5158 if (SIInstrInfo::isGenericOpcode(Opc: Opcode))
5159 return true;
5160
5161 int Src0Idx = AMDGPU::getNamedOperandIdx(Opcode, Name: AMDGPU::OpName::src0);
5162 int Src1Idx = AMDGPU::getNamedOperandIdx(Opcode, Name: AMDGPU::OpName::src1);
5163 int Src2Idx = AMDGPU::getNamedOperandIdx(Opcode, Name: AMDGPU::OpName::src2);
5164 int Src3Idx = -1;
5165 if (Src0Idx == -1) {
5166 // VOPD V_DUAL_* instructions use different operand names.
5167 Src0Idx = AMDGPU::getNamedOperandIdx(Opcode, Name: AMDGPU::OpName::src0X);
5168 Src1Idx = AMDGPU::getNamedOperandIdx(Opcode, Name: AMDGPU::OpName::vsrc1X);
5169 Src2Idx = AMDGPU::getNamedOperandIdx(Opcode, Name: AMDGPU::OpName::src0Y);
5170 Src3Idx = AMDGPU::getNamedOperandIdx(Opcode, Name: AMDGPU::OpName::vsrc1Y);
5171 }
5172
5173 // Make sure the number of operands is correct.
5174 const MCInstrDesc &Desc = get(Opcode);
5175 if (!Desc.isVariadic() &&
5176 Desc.getNumOperands() != MI.getNumExplicitOperands()) {
5177 ErrInfo = "Instruction has wrong number of operands.";
5178 return false;
5179 }
5180
5181 if (MI.isInlineAsm()) {
5182 // Verify register classes for inlineasm constraints.
5183 for (unsigned I = InlineAsm::MIOp_FirstOperand, E = MI.getNumOperands();
5184 I != E; ++I) {
5185 const TargetRegisterClass *RC = MI.getRegClassConstraint(OpIdx: I, TII: this, TRI: &RI);
5186 if (!RC)
5187 continue;
5188
5189 const MachineOperand &Op = MI.getOperand(i: I);
5190 if (!Op.isReg())
5191 continue;
5192
5193 Register Reg = Op.getReg();
5194 if (!Reg.isVirtual() && !RC->contains(Reg)) {
5195 ErrInfo = "inlineasm operand has incorrect register class.";
5196 return false;
5197 }
5198 }
5199
5200 return true;
5201 }
5202
5203 if (isImage(MI) && MI.memoperands_empty() && MI.mayLoadOrStore()) {
5204 ErrInfo = "missing memory operand from image instruction.";
5205 return false;
5206 }
5207
5208 // Make sure the register classes are correct.
5209 for (int i = 0, e = Desc.getNumOperands(); i != e; ++i) {
5210 const MachineOperand &MO = MI.getOperand(i);
5211 if (MO.isFPImm()) {
5212 ErrInfo = "FPImm Machine Operands are not supported. ISel should bitcast "
5213 "all fp values to integers.";
5214 return false;
5215 }
5216
5217 const MCOperandInfo &OpInfo = Desc.operands()[i];
5218 int16_t RegClass = getOpRegClassID(OpInfo);
5219
5220 switch (OpInfo.OperandType) {
5221 case MCOI::OPERAND_REGISTER:
5222 if (MI.getOperand(i).isImm() || MI.getOperand(i).isGlobal()) {
5223 ErrInfo = "Illegal immediate value for operand.";
5224 return false;
5225 }
5226 break;
5227 case AMDGPU::OPERAND_REG_IMM_INT32:
5228 case AMDGPU::OPERAND_REG_IMM_INT64:
5229 case AMDGPU::OPERAND_REG_IMM_INT16:
5230 case AMDGPU::OPERAND_REG_IMM_FP32:
5231 case AMDGPU::OPERAND_REG_IMM_V2FP32:
5232 case AMDGPU::OPERAND_REG_IMM_BF16:
5233 case AMDGPU::OPERAND_REG_IMM_FP16:
5234 case AMDGPU::OPERAND_REG_IMM_FP64:
5235 case AMDGPU::OPERAND_REG_IMM_V2FP16:
5236 case AMDGPU::OPERAND_REG_IMM_V2FP16_SPLAT:
5237 case AMDGPU::OPERAND_REG_IMM_V2INT16:
5238 case AMDGPU::OPERAND_REG_IMM_V2INT32:
5239 case AMDGPU::OPERAND_REG_IMM_V2BF16:
5240 break;
5241 case AMDGPU::OPERAND_REG_IMM_NOINLINE_V2FP16:
5242 break;
5244 case AMDGPU::OPERAND_REG_INLINE_C_INT16:
5245 case AMDGPU::OPERAND_REG_INLINE_C_INT32:
5246 case AMDGPU::OPERAND_REG_INLINE_C_INT64:
5247 case AMDGPU::OPERAND_REG_INLINE_C_FP32:
5248 case AMDGPU::OPERAND_REG_INLINE_C_FP64:
5249 case AMDGPU::OPERAND_REG_INLINE_C_BF16:
5250 case AMDGPU::OPERAND_REG_INLINE_C_FP16:
5251 case AMDGPU::OPERAND_REG_INLINE_C_V2INT16:
5252 case AMDGPU::OPERAND_REG_INLINE_C_V2BF16:
5253 case AMDGPU::OPERAND_REG_INLINE_C_V2FP16:
5254 case AMDGPU::OPERAND_REG_INLINE_AC_INT32:
5255 case AMDGPU::OPERAND_REG_INLINE_AC_FP32:
5256 case AMDGPU::OPERAND_REG_INLINE_AC_FP64: {
5257 if (!MO.isReg() && (!MO.isImm() || !isInlineConstant(MI, OpIdx: i))) {
5258 ErrInfo = "Illegal immediate value for operand.";
5259 return false;
5260 }
5261 break;
5262 }
5263 case AMDGPU::OPERAND_INLINE_SPLIT_BARRIER_INT32:
5264 if (!MI.getOperand(i).isImm() || !isInlineConstant(MI, OpIdx: i)) {
5265 ErrInfo = "Expected inline constant for operand.";
5266 return false;
5267 }
5268 break;
5269 case AMDGPU::OPERAND_INPUT_MODS:
5270 case AMDGPU::OPERAND_SDWA_VOPC_DST:
5271 case AMDGPU::OPERAND_KIMM16:
5272 break;
5273 case MCOI::OPERAND_IMMEDIATE:
5274 case AMDGPU::OPERAND_KIMM32:
5275 case AMDGPU::OPERAND_KIMM64:
5276 case AMDGPU::OPERAND_INLINE_C_AV64_PSEUDO:
5277 // Check if this operand is an immediate.
5278 // FrameIndex operands will be replaced by immediates, so they are
5279 // allowed.
5280 if (!MI.getOperand(i).isImm() && !MI.getOperand(i).isFI()) {
5281 ErrInfo = "Expected immediate, but got non-immediate";
5282 return false;
5283 }
5284 break;
5285 case MCOI::OPERAND_UNKNOWN:
5286 case MCOI::OPERAND_MEMORY:
5287 case MCOI::OPERAND_PCREL:
5288 break;
5289 default:
5290 if (OpInfo.isGenericType())
5291 continue;
5292 break;
5293 }
5294
5295 if (!MO.isReg())
5296 continue;
5297 Register Reg = MO.getReg();
5298 if (!Reg)
5299 continue;
5300
5301 // FIXME: Ideally we would have separate instruction definitions with the
5302 // aligned register constraint.
5303 // FIXME: We do not verify inline asm operands, but custom inline asm
5304 // verification is broken anyway
5305 if (ST.needsAlignedVGPRs() && Opcode != AMDGPU::AV_MOV_B64_IMM_PSEUDO &&
5306 Opcode != AMDGPU::V_MOV_B64_PSEUDO && !isSpill(MI)) {
5307 const TargetRegisterClass *RC = RI.getRegClassForReg(MRI, Reg);
5308 if (RI.hasVectorRegisters(RC) && MO.getSubReg()) {
5309 if (const TargetRegisterClass *SubRC =
5310 RI.getSubRegisterClass(RC, MO.getSubReg())) {
5311 RC = RI.getCompatibleSubRegClass(SuperRC: RC, SubRC, SubIdx: MO.getSubReg());
5312 if (RC)
5313 RC = SubRC;
5314 }
5315 }
5316
5317 // Check that this is the aligned version of the class.
5318 if (!RC || !RI.isProperlyAlignedRC(RC: *RC)) {
5319 ErrInfo = "Subtarget requires even aligned vector registers";
5320 return false;
5321 }
5322 }
5323
5324 if (RegClass != -1) {
5325 if (Reg.isVirtual())
5326 continue;
5327
5328 const TargetRegisterClass *RC = RI.getRegClass(i: RegClass);
5329 if (!RC->contains(Reg)) {
5330 ErrInfo = "Operand has incorrect register class.";
5331 return false;
5332 }
5333 }
5334 }
5335
5336 // Verify SDWA
5337 if (isSDWA(MI)) {
5338 if (!ST.hasSDWA()) {
5339 ErrInfo = "SDWA is not supported on this target";
5340 return false;
5341 }
5342
5343 for (auto Op : {AMDGPU::OpName::src0_sel, AMDGPU::OpName::src1_sel,
5344 AMDGPU::OpName::dst_sel}) {
5345 const MachineOperand *MO = getNamedOperand(MI, OperandName: Op);
5346 if (!MO)
5347 continue;
5348 int64_t Imm = MO->getImm();
5349 if (Imm < 0 || Imm > AMDGPU::SDWA::SdwaSel::DWORD) {
5350 ErrInfo = "Invalid SDWA selection";
5351 return false;
5352 }
5353 }
5354
5355 int DstIdx = AMDGPU::getNamedOperandIdx(Opcode, Name: AMDGPU::OpName::vdst);
5356
5357 for (int OpIdx : {DstIdx, Src0Idx, Src1Idx, Src2Idx}) {
5358 if (OpIdx == -1)
5359 continue;
5360 const MachineOperand &MO = MI.getOperand(i: OpIdx);
5361
5362 if (!ST.hasSDWAScalar()) {
5363          // Only VGPRs on VI
5364 if (!MO.isReg() || !RI.hasVGPRs(RC: RI.getRegClassForReg(MRI, Reg: MO.getReg()))) {
5365 ErrInfo = "Only VGPRs allowed as operands in SDWA instructions on VI";
5366 return false;
5367 }
5368 } else {
5369 // No immediates on GFX9
5370 if (!MO.isReg()) {
5371 ErrInfo =
5372 "Only reg allowed as operands in SDWA instructions on GFX9+";
5373 return false;
5374 }
5375 }
5376 }
5377
5378 if (!ST.hasSDWAOmod()) {
5379 // No omod allowed on VI
5380 const MachineOperand *OMod = getNamedOperand(MI, OperandName: AMDGPU::OpName::omod);
5381 if (OMod != nullptr &&
5382 (!OMod->isImm() || OMod->getImm() != 0)) {
5383 ErrInfo = "OMod not allowed in SDWA instructions on VI";
5384 return false;
5385 }
5386 }
5387
5388 if (Opcode == AMDGPU::V_CVT_F32_FP8_sdwa ||
5389 Opcode == AMDGPU::V_CVT_F32_BF8_sdwa ||
5390 Opcode == AMDGPU::V_CVT_PK_F32_FP8_sdwa ||
5391 Opcode == AMDGPU::V_CVT_PK_F32_BF8_sdwa) {
5392 const MachineOperand *Src0ModsMO =
5393 getNamedOperand(MI, OperandName: AMDGPU::OpName::src0_modifiers);
5394 unsigned Mods = Src0ModsMO->getImm();
5395 if (Mods & SISrcMods::ABS || Mods & SISrcMods::NEG ||
5396 Mods & SISrcMods::SEXT) {
5397 ErrInfo = "sext, abs and neg are not allowed on this instruction";
5398 return false;
5399 }
5400 }
5401
5402 uint32_t BasicOpcode = AMDGPU::getBasicFromSDWAOp(Opcode);
5403 if (isVOPC(Opcode: BasicOpcode)) {
5404 if (!ST.hasSDWASdst() && DstIdx != -1) {
5405 // Only vcc allowed as dst on VI for VOPC
5406 const MachineOperand &Dst = MI.getOperand(i: DstIdx);
5407 if (!Dst.isReg() || Dst.getReg() != AMDGPU::VCC) {
5408 ErrInfo = "Only VCC allowed as dst in SDWA instructions on VI";
5409 return false;
5410 }
5411 } else if (!ST.hasSDWAOutModsVOPC()) {
5412 // No clamp allowed on GFX9 for VOPC
5413 const MachineOperand *Clamp = getNamedOperand(MI, OperandName: AMDGPU::OpName::clamp);
5414 if (Clamp && (!Clamp->isImm() || Clamp->getImm() != 0)) {
5415 ErrInfo = "Clamp not allowed in VOPC SDWA instructions on VI";
5416 return false;
5417 }
5418
5419 // No omod allowed on GFX9 for VOPC
5420 const MachineOperand *OMod = getNamedOperand(MI, OperandName: AMDGPU::OpName::omod);
5421 if (OMod && (!OMod->isImm() || OMod->getImm() != 0)) {
5422 ErrInfo = "OMod not allowed in VOPC SDWA instructions on VI";
5423 return false;
5424 }
5425 }
5426 }
5427
5428 const MachineOperand *DstUnused = getNamedOperand(MI, OperandName: AMDGPU::OpName::dst_unused);
5429 if (DstUnused && DstUnused->isImm() &&
5430 DstUnused->getImm() == AMDGPU::SDWA::UNUSED_PRESERVE) {
5431 const MachineOperand &Dst = MI.getOperand(i: DstIdx);
5432 if (!Dst.isReg() || !Dst.isTied()) {
5433 ErrInfo = "Dst register should have tied register";
5434 return false;
5435 }
5436
5437 const MachineOperand &TiedMO =
5438 MI.getOperand(i: MI.findTiedOperandIdx(OpIdx: DstIdx));
5439 if (!TiedMO.isReg() || !TiedMO.isImplicit() || !TiedMO.isUse()) {
5440 ErrInfo =
5441 "Dst register should be tied to implicit use of preserved register";
5442 return false;
5443 }
5444 if (TiedMO.getReg().isPhysical() && Dst.getReg() != TiedMO.getReg()) {
5445 ErrInfo = "Dst register should use same physical register as preserved";
5446 return false;
5447 }
5448 }
5449 }
5450
5451 // Verify MIMG / VIMAGE / VSAMPLE
5452 if (isImage(Opcode) && !MI.mayStore()) {
5453    // Ensure that the return type used is large enough for all the options
5454    // being used. TFE/LWE require an extra result register.
5455 const MachineOperand *DMask = getNamedOperand(MI, OperandName: AMDGPU::OpName::dmask);
5456 if (DMask) {
5457 uint64_t DMaskImm = DMask->getImm();
5458 uint32_t RegCount = isGather4(Opcode) ? 4 : llvm::popcount(Value: DMaskImm);
5459 const MachineOperand *TFE = getNamedOperand(MI, OperandName: AMDGPU::OpName::tfe);
5460 const MachineOperand *LWE = getNamedOperand(MI, OperandName: AMDGPU::OpName::lwe);
5461 const MachineOperand *D16 = getNamedOperand(MI, OperandName: AMDGPU::OpName::d16);
5462
5463 // Adjust for packed 16 bit values
5464 if (D16 && D16->getImm() && !ST.hasUnpackedD16VMem())
5465 RegCount = divideCeil(Numerator: RegCount, Denominator: 2);
5466
5467 // Adjust if using LWE or TFE
5468 if ((LWE && LWE->getImm()) || (TFE && TFE->getImm()))
5469 RegCount += 1;
5470
5471 const uint32_t DstIdx =
5472 AMDGPU::getNamedOperandIdx(Opcode, Name: AMDGPU::OpName::vdata);
5473 const MachineOperand &Dst = MI.getOperand(i: DstIdx);
5474 if (Dst.isReg()) {
5475 const TargetRegisterClass *DstRC = getOpRegClass(MI, OpNo: DstIdx);
5476 uint32_t DstSize = RI.getRegSizeInBits(RC: *DstRC) / 32;
5477 if (RegCount > DstSize) {
5478 ErrInfo = "Image instruction returns too many registers for dst "
5479 "register class";
5480 return false;
5481 }
5482 }
5483 }
5484 }
5485
5486 // Verify VOP*. Ignore multiple sgpr operands on writelane.
5487 if (isVALU(MI) && Desc.getOpcode() != AMDGPU::V_WRITELANE_B32) {
5488 unsigned ConstantBusCount = 0;
5489 bool UsesLiteral = false;
5490 const MachineOperand *LiteralVal = nullptr;
5491
5492 int ImmIdx = AMDGPU::getNamedOperandIdx(Opcode, Name: AMDGPU::OpName::imm);
5493 if (ImmIdx != -1) {
5494 ++ConstantBusCount;
5495 UsesLiteral = true;
5496 LiteralVal = &MI.getOperand(i: ImmIdx);
5497 }
5498
5499 SmallVector<Register, 2> SGPRsUsed;
5500 Register SGPRUsed;
5501
5502 // Only look at the true operands. Only a real operand can use the constant
5503 // bus, and we don't want to check pseudo-operands like the source modifier
5504 // flags.
5505 for (int OpIdx : {Src0Idx, Src1Idx, Src2Idx, Src3Idx}) {
5506 if (OpIdx == -1)
5507 continue;
5508 const MachineOperand &MO = MI.getOperand(i: OpIdx);
5509 if (usesConstantBus(MRI, MO, OpInfo: MI.getDesc().operands()[OpIdx])) {
5510 if (MO.isReg()) {
5511 SGPRUsed = MO.getReg();
5512 if (!llvm::is_contained(Range&: SGPRsUsed, Element: SGPRUsed)) {
5513 ++ConstantBusCount;
5514 SGPRsUsed.push_back(Elt: SGPRUsed);
5515 }
5516 } else if (!MO.isFI()) { // Treat FI like a register.
5517 if (!UsesLiteral) {
5518 ++ConstantBusCount;
5519 UsesLiteral = true;
5520 LiteralVal = &MO;
5521 } else if (!MO.isIdenticalTo(Other: *LiteralVal)) {
5522 assert(isVOP2(MI) || isVOP3(MI));
5523 ErrInfo = "VOP2/VOP3 instruction uses more than one literal";
5524 return false;
5525 }
5526 }
5527 }
5528 }
5529
5530 SGPRUsed = findImplicitSGPRRead(MI);
5531 if (SGPRUsed) {
5532 // Implicit uses may safely overlap true operands
5533 if (llvm::all_of(Range&: SGPRsUsed, P: [this, SGPRUsed](unsigned SGPR) {
5534 return !RI.regsOverlap(RegA: SGPRUsed, RegB: SGPR);
5535 })) {
5536 ++ConstantBusCount;
5537 SGPRsUsed.push_back(Elt: SGPRUsed);
5538 }
5539 }
5540
5541    // v_writelane_b32 is an exception to the constant bus restriction: vsrc0 can
5542    // be an SGPR, a constant or m0, and the lane select an SGPR, m0 or an inline constant.
5543 if (ConstantBusCount > ST.getConstantBusLimit(Opcode) &&
5544 Opcode != AMDGPU::V_WRITELANE_B32) {
5545 ErrInfo = "VOP* instruction violates constant bus restriction";
5546 return false;
5547 }
5548
5549 if (isVOP3(MI) && UsesLiteral && !ST.hasVOP3Literal()) {
5550 ErrInfo = "VOP3 instruction uses literal";
5551 return false;
5552 }
5553 }
5554
5555  // Special case for writelane: it may exceed the constant bus limit, but it
5556  // still can't use more than one SGPR register.
5557 if (Desc.getOpcode() == AMDGPU::V_WRITELANE_B32) {
5558 unsigned SGPRCount = 0;
5559 Register SGPRUsed;
5560
5561 for (int OpIdx : {Src0Idx, Src1Idx}) {
5562 if (OpIdx == -1)
5563 break;
5564
5565 const MachineOperand &MO = MI.getOperand(i: OpIdx);
5566
5567 if (usesConstantBus(MRI, MO, OpInfo: MI.getDesc().operands()[OpIdx])) {
5568 if (MO.isReg() && MO.getReg() != AMDGPU::M0) {
5569 if (MO.getReg() != SGPRUsed)
5570 ++SGPRCount;
5571 SGPRUsed = MO.getReg();
5572 }
5573 }
5574 if (SGPRCount > ST.getConstantBusLimit(Opcode)) {
5575 ErrInfo = "WRITELANE instruction violates constant bus restriction";
5576 return false;
5577 }
5578 }
5579 }
5580
5581 // Verify misc. restrictions on specific instructions.
5582 if (Desc.getOpcode() == AMDGPU::V_DIV_SCALE_F32_e64 ||
5583 Desc.getOpcode() == AMDGPU::V_DIV_SCALE_F64_e64) {
5584 const MachineOperand &Src0 = MI.getOperand(i: Src0Idx);
5585 const MachineOperand &Src1 = MI.getOperand(i: Src1Idx);
5586 const MachineOperand &Src2 = MI.getOperand(i: Src2Idx);
5587 if (Src0.isReg() && Src1.isReg() && Src2.isReg()) {
5588 if (!compareMachineOp(Op0: Src0, Op1: Src1) &&
5589 !compareMachineOp(Op0: Src0, Op1: Src2)) {
5590 ErrInfo = "v_div_scale_{f32|f64} require src0 = src1 or src2";
5591 return false;
5592 }
5593 }
5594 if ((getNamedOperand(MI, OperandName: AMDGPU::OpName::src0_modifiers)->getImm() &
5595 SISrcMods::ABS) ||
5596 (getNamedOperand(MI, OperandName: AMDGPU::OpName::src1_modifiers)->getImm() &
5597 SISrcMods::ABS) ||
5598 (getNamedOperand(MI, OperandName: AMDGPU::OpName::src2_modifiers)->getImm() &
5599 SISrcMods::ABS)) {
5600 ErrInfo = "ABS not allowed in VOP3B instructions";
5601 return false;
5602 }
5603 }
5604
5605 if (isSOP2(MI) || isSOPC(MI)) {
5606 const MachineOperand &Src0 = MI.getOperand(i: Src0Idx);
5607 const MachineOperand &Src1 = MI.getOperand(i: Src1Idx);
5608
5609 if (!isRegOrFI(MO: Src0) && !isRegOrFI(MO: Src1) &&
5610 !isInlineConstant(MO: Src0, OpInfo: Desc.operands()[Src0Idx]) &&
5611 !isInlineConstant(MO: Src1, OpInfo: Desc.operands()[Src1Idx]) &&
5612 !Src0.isIdenticalTo(Other: Src1)) {
5613 ErrInfo = "SOP2/SOPC instruction requires too many immediate constants";
5614 return false;
5615 }
5616 }
5617
5618 if (isSOPK(MI)) {
5619 const auto *Op = getNamedOperand(MI, OperandName: AMDGPU::OpName::simm16);
5620 if (Desc.isBranch()) {
5621 if (!Op->isMBB()) {
5622 ErrInfo = "invalid branch target for SOPK instruction";
5623 return false;
5624 }
5625 } else {
5626 uint64_t Imm = Op->getImm();
5627 if (sopkIsZext(Opcode)) {
5628 if (!isUInt<16>(x: Imm)) {
5629 ErrInfo = "invalid immediate for SOPK instruction";
5630 return false;
5631 }
5632 } else {
5633 if (!isInt<16>(x: Imm)) {
5634 ErrInfo = "invalid immediate for SOPK instruction";
5635 return false;
5636 }
5637 }
5638 }
5639 }
5640
5641 if (Desc.getOpcode() == AMDGPU::V_MOVRELS_B32_e32 ||
5642 Desc.getOpcode() == AMDGPU::V_MOVRELS_B32_e64 ||
5643 Desc.getOpcode() == AMDGPU::V_MOVRELD_B32_e32 ||
5644 Desc.getOpcode() == AMDGPU::V_MOVRELD_B32_e64) {
5645 const bool IsDst = Desc.getOpcode() == AMDGPU::V_MOVRELD_B32_e32 ||
5646 Desc.getOpcode() == AMDGPU::V_MOVRELD_B32_e64;
5647
5648 const unsigned StaticNumOps =
5649 Desc.getNumOperands() + Desc.implicit_uses().size();
5650 const unsigned NumImplicitOps = IsDst ? 2 : 1;
5651
5652 // Require additional implicit operands. This allows a fixup done by the
5653 // post RA scheduler where the main implicit operand is killed and
5654 // implicit-defs are added for sub-registers that remain live after this
5655 // instruction.
5656 if (MI.getNumOperands() < StaticNumOps + NumImplicitOps) {
5657 ErrInfo = "missing implicit register operands";
5658 return false;
5659 }
5660
5661 const MachineOperand *Dst = getNamedOperand(MI, OperandName: AMDGPU::OpName::vdst);
5662 if (IsDst) {
5663 if (!Dst->isUse()) {
5664 ErrInfo = "v_movreld_b32 vdst should be a use operand";
5665 return false;
5666 }
5667
5668 unsigned UseOpIdx;
5669 if (!MI.isRegTiedToUseOperand(DefOpIdx: StaticNumOps, UseOpIdx: &UseOpIdx) ||
5670 UseOpIdx != StaticNumOps + 1) {
5671 ErrInfo = "movrel implicit operands should be tied";
5672 return false;
5673 }
5674 }
5675
5676 const MachineOperand &Src0 = MI.getOperand(i: Src0Idx);
5677 const MachineOperand &ImpUse
5678 = MI.getOperand(i: StaticNumOps + NumImplicitOps - 1);
5679 if (!ImpUse.isReg() || !ImpUse.isUse() ||
5680 !isSubRegOf(TRI: RI, SuperVec: ImpUse, SubReg: IsDst ? *Dst : Src0)) {
5681 ErrInfo = "src0 should be subreg of implicit vector use";
5682 return false;
5683 }
5684 }
5685
5686  // Make sure we aren't losing exec uses in the .td files. This mostly means
5687  // being careful when using 'let Uses' to add other use registers.
5688 if (shouldReadExec(MI)) {
5689 if (!MI.hasRegisterImplicitUseOperand(Reg: AMDGPU::EXEC)) {
5690 ErrInfo = "VALU instruction does not implicitly read exec mask";
5691 return false;
5692 }
5693 }
5694
5695 if (isSMRD(MI)) {
5696 if (MI.mayStore() &&
5697 ST.getGeneration() == AMDGPUSubtarget::VOLCANIC_ISLANDS) {
5698 // The register offset form of scalar stores may only use m0 as the
5699 // soffset register.
5700 const MachineOperand *Soff = getNamedOperand(MI, OperandName: AMDGPU::OpName::soffset);
5701 if (Soff && Soff->getReg() != AMDGPU::M0) {
5702 ErrInfo = "scalar stores must use m0 as offset register";
5703 return false;
5704 }
5705 }
5706 }
5707
5708 if (isFLAT(MI) && !ST.hasFlatInstOffsets()) {
5709 const MachineOperand *Offset = getNamedOperand(MI, OperandName: AMDGPU::OpName::offset);
5710 if (Offset->getImm() != 0) {
5711 ErrInfo = "subtarget does not support offsets in flat instructions";
5712 return false;
5713 }
5714 }
5715
5716 if (isDS(MI) && !ST.hasGDS()) {
5717 const MachineOperand *GDSOp = getNamedOperand(MI, OperandName: AMDGPU::OpName::gds);
5718 if (GDSOp && GDSOp->getImm() != 0) {
5719 ErrInfo = "GDS is not supported on this subtarget";
5720 return false;
5721 }
5722 }
5723
5724 if (isImage(MI)) {
5725 const MachineOperand *DimOp = getNamedOperand(MI, OperandName: AMDGPU::OpName::dim);
5726 if (DimOp) {
5727 int VAddr0Idx = AMDGPU::getNamedOperandIdx(Opcode,
5728 Name: AMDGPU::OpName::vaddr0);
5729 AMDGPU::OpName RSrcOpName =
5730 isMIMG(MI) ? AMDGPU::OpName::srsrc : AMDGPU::OpName::rsrc;
5731 int RsrcIdx = AMDGPU::getNamedOperandIdx(Opcode, Name: RSrcOpName);
5732 const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(Opc: Opcode);
5733 const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
5734 AMDGPU::getMIMGBaseOpcodeInfo(BaseOpcode: Info->BaseOpcode);
5735 const AMDGPU::MIMGDimInfo *Dim =
5736 AMDGPU::getMIMGDimInfoByEncoding(DimEnc: DimOp->getImm());
5737
5738 if (!Dim) {
5739 ErrInfo = "dim is out of range";
5740 return false;
5741 }
5742
5743 bool IsA16 = false;
5744 if (ST.hasR128A16()) {
5745 const MachineOperand *R128A16 = getNamedOperand(MI, OperandName: AMDGPU::OpName::r128);
5746 IsA16 = R128A16->getImm() != 0;
5747 } else if (ST.hasA16()) {
5748 const MachineOperand *A16 = getNamedOperand(MI, OperandName: AMDGPU::OpName::a16);
5749 IsA16 = A16->getImm() != 0;
5750 }
5751
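      // With an NSA encoding each address component occupies its own vaddr
      // operand, so the operand distance between vaddr0 and the resource
      // operand gives the number of address registers.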
5752 bool IsNSA = RsrcIdx - VAddr0Idx > 1;
5753
5754 unsigned AddrWords =
5755 AMDGPU::getAddrSizeMIMGOp(BaseOpcode, Dim, IsA16, IsG16Supported: ST.hasG16());
5756
5757 unsigned VAddrWords;
5758 if (IsNSA) {
5759 VAddrWords = RsrcIdx - VAddr0Idx;
5760 if (ST.hasPartialNSAEncoding() &&
5761 AddrWords > ST.getNSAMaxSize(HasSampler: isVSAMPLE(MI))) {
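        // With a partial NSA encoding, the final vaddr operand is a register
        // tuple holding all remaining address words.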
5762 unsigned LastVAddrIdx = RsrcIdx - 1;
5763 VAddrWords += getOpSize(MI, OpNo: LastVAddrIdx) / 4 - 1;
5764 }
5765 } else {
5766 VAddrWords = getOpSize(MI, OpNo: VAddr0Idx) / 4;
5767 if (AddrWords > 12)
5768 AddrWords = 16;
5769 }
5770
5771 if (VAddrWords != AddrWords) {
5772 LLVM_DEBUG(dbgs() << "bad vaddr size, expected " << AddrWords
5773 << " but got " << VAddrWords << "\n");
5774 ErrInfo = "bad vaddr size";
5775 return false;
5776 }
5777 }
5778 }
5779
5780 const MachineOperand *DppCt = getNamedOperand(MI, OperandName: AMDGPU::OpName::dpp_ctrl);
5781 if (DppCt) {
5782 using namespace AMDGPU::DPP;
5783
5784 unsigned DC = DppCt->getImm();
5785 if (DC == DppCtrl::DPP_UNUSED1 || DC == DppCtrl::DPP_UNUSED2 ||
5786 DC == DppCtrl::DPP_UNUSED3 || DC > DppCtrl::DPP_LAST ||
5787 (DC >= DppCtrl::DPP_UNUSED4_FIRST && DC <= DppCtrl::DPP_UNUSED4_LAST) ||
5788 (DC >= DppCtrl::DPP_UNUSED5_FIRST && DC <= DppCtrl::DPP_UNUSED5_LAST) ||
5789 (DC >= DppCtrl::DPP_UNUSED6_FIRST && DC <= DppCtrl::DPP_UNUSED6_LAST) ||
5790 (DC >= DppCtrl::DPP_UNUSED7_FIRST && DC <= DppCtrl::DPP_UNUSED7_LAST) ||
5791 (DC >= DppCtrl::DPP_UNUSED8_FIRST && DC <= DppCtrl::DPP_UNUSED8_LAST)) {
5792 ErrInfo = "Invalid dpp_ctrl value";
5793 return false;
5794 }
5795 if (DC >= DppCtrl::WAVE_SHL1 && DC <= DppCtrl::WAVE_ROR1 &&
5796 !ST.hasDPPWavefrontShifts()) {
5797 ErrInfo = "Invalid dpp_ctrl value: "
5798 "wavefront shifts are not supported on GFX10+";
5799 return false;
5800 }
5801 if (DC >= DppCtrl::BCAST15 && DC <= DppCtrl::BCAST31 &&
5802 !ST.hasDPPBroadcasts()) {
5803 ErrInfo = "Invalid dpp_ctrl value: "
5804 "broadcasts are not supported on GFX10+";
5805 return false;
5806 }
5807 if (DC >= DppCtrl::ROW_SHARE_FIRST && DC <= DppCtrl::ROW_XMASK_LAST &&
5808 ST.getGeneration() < AMDGPUSubtarget::GFX10) {
5809 if (DC >= DppCtrl::ROW_NEWBCAST_FIRST &&
5810 DC <= DppCtrl::ROW_NEWBCAST_LAST &&
5811 !ST.hasGFX90AInsts()) {
5812 ErrInfo = "Invalid dpp_ctrl value: "
5813 "row_newbroadcast/row_share is not supported before "
5814 "GFX90A/GFX10";
5815 return false;
5816 }
5817 if (DC > DppCtrl::ROW_NEWBCAST_LAST || !ST.hasGFX90AInsts()) {
5818 ErrInfo = "Invalid dpp_ctrl value: "
5819 "row_share and row_xmask are not supported before GFX10";
5820 return false;
5821 }
5822 }
5823
5824 if (Opcode != AMDGPU::V_MOV_B64_DPP_PSEUDO &&
5825 !AMDGPU::isLegalDPALU_DPPControl(ST, DC) &&
5826 AMDGPU::isDPALU_DPP(OpDesc: Desc, MII: *this, ST)) {
5827 ErrInfo = "Invalid dpp_ctrl value: "
5828 "DP ALU dpp only support row_newbcast";
5829 return false;
5830 }
5831 }
5832
5833 if ((MI.mayStore() || MI.mayLoad()) && !isVGPRSpill(MI)) {
5834 const MachineOperand *Dst = getNamedOperand(MI, OperandName: AMDGPU::OpName::vdst);
5835 AMDGPU::OpName DataName =
5836 isDS(Opcode) ? AMDGPU::OpName::data0 : AMDGPU::OpName::vdata;
5837 const MachineOperand *Data = getNamedOperand(MI, OperandName: DataName);
5838 const MachineOperand *Data2 = getNamedOperand(MI, OperandName: AMDGPU::OpName::data1);
5839 if (Data && !Data->isReg())
5840 Data = nullptr;
5841
5842 if (ST.hasGFX90AInsts()) {
5843 if (Dst && Data && !Dst->isTied() && !Data->isTied() &&
5844 (RI.isAGPR(MRI, Reg: Dst->getReg()) != RI.isAGPR(MRI, Reg: Data->getReg()))) {
5845 ErrInfo = "Invalid register class: "
5846 "vdata and vdst should be both VGPR or AGPR";
5847 return false;
5848 }
5849 if (Data && Data2 &&
5850 (RI.isAGPR(MRI, Reg: Data->getReg()) != RI.isAGPR(MRI, Reg: Data2->getReg()))) {
5851 ErrInfo = "Invalid register class: "
5852 "both data operands should be VGPR or AGPR";
5853 return false;
5854 }
5855 } else {
5856 if ((Dst && RI.isAGPR(MRI, Reg: Dst->getReg())) ||
5857 (Data && RI.isAGPR(MRI, Reg: Data->getReg())) ||
5858 (Data2 && RI.isAGPR(MRI, Reg: Data2->getReg()))) {
5859 ErrInfo = "Invalid register class: "
5860 "agpr loads and stores not supported on this GPU";
5861 return false;
5862 }
5863 }
5864 }
5865
5866 if (ST.needsAlignedVGPRs()) {
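    // Physical registers are checked via their hardware register index; virtual
    // registers via their register class alignment and the channel of any
    // sub-register.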
5867 const auto isAlignedReg = [&MI, &MRI, this](AMDGPU::OpName OpName) -> bool {
5868 const MachineOperand *Op = getNamedOperand(MI, OperandName: OpName);
5869 if (!Op)
5870 return true;
5871 Register Reg = Op->getReg();
5872 if (Reg.isPhysical())
5873 return !(RI.getHWRegIndex(Reg) & 1);
5874 const TargetRegisterClass &RC = *MRI.getRegClass(Reg);
5875 return RI.getRegSizeInBits(RC) > 32 && RI.isProperlyAlignedRC(RC) &&
5876 !(RI.getChannelFromSubReg(SubReg: Op->getSubReg()) & 1);
5877 };
5878
5879 if (Opcode == AMDGPU::DS_GWS_INIT || Opcode == AMDGPU::DS_GWS_SEMA_BR ||
5880 Opcode == AMDGPU::DS_GWS_BARRIER) {
5881
5882 if (!isAlignedReg(AMDGPU::OpName::data0)) {
5883 ErrInfo = "Subtarget requires even aligned vector registers "
5884 "for DS_GWS instructions";
5885 return false;
5886 }
5887 }
5888
5889 if (isMIMG(MI)) {
5890 if (!isAlignedReg(AMDGPU::OpName::vaddr)) {
5891 ErrInfo = "Subtarget requires even aligned vector registers "
5892 "for vaddr operand of image instructions";
5893 return false;
5894 }
5895 }
5896 }
5897
5898 if (Opcode == AMDGPU::V_ACCVGPR_WRITE_B32_e64 && !ST.hasGFX90AInsts()) {
5899 const MachineOperand *Src = getNamedOperand(MI, OperandName: AMDGPU::OpName::src0);
5900 if (Src->isReg() && RI.isSGPRReg(MRI, Reg: Src->getReg())) {
5901 ErrInfo = "Invalid register class: "
5902 "v_accvgpr_write with an SGPR is not supported on this GPU";
5903 return false;
5904 }
5905 }
5906
5907 if (Desc.getOpcode() == AMDGPU::G_AMDGPU_WAVE_ADDRESS) {
5908 const MachineOperand &SrcOp = MI.getOperand(i: 1);
5909 if (!SrcOp.isReg() || SrcOp.getReg().isVirtual()) {
5910 ErrInfo = "pseudo expects only physical SGPRs";
5911 return false;
5912 }
5913 }
5914
5915 if (const MachineOperand *CPol = getNamedOperand(MI, OperandName: AMDGPU::OpName::cpol)) {
5916 if (CPol->getImm() & AMDGPU::CPol::SCAL) {
5917 if (!ST.hasScaleOffset()) {
5918 ErrInfo = "Subtarget does not support offset scaling";
5919 return false;
5920 }
5921 if (!AMDGPU::supportsScaleOffset(MII: *this, Opcode: MI.getOpcode())) {
5922 ErrInfo = "Instruction does not support offset scaling";
5923 return false;
5924 }
5925 }
5926 }
5927
5928 // See SIInstrInfo::isLegalGFX12PlusPackedMathFP32Operand for more
5929 // information.
5930 if (AMDGPU::isPackedFP32Inst(Opc: Opcode) && AMDGPU::isGFX12Plus(STI: ST)) {
5931 for (unsigned I = 0; I < 3; ++I) {
5932 if (!isLegalGFX12PlusPackedMathFP32Operand(MRI, MI, SrcN: I))
5933 return false;
5934 }
5935 }
5936
5937 if (ST.hasFlatScratchHiInB64InstHazard() && isSALU(MI) &&
5938 MI.readsRegister(Reg: AMDGPU::SRC_FLAT_SCRATCH_BASE_HI, TRI: nullptr)) {
5939 const MachineOperand *Dst = getNamedOperand(MI, OperandName: AMDGPU::OpName::sdst);
5940 if ((Dst && RI.getRegClassForReg(MRI, Reg: Dst->getReg()) ==
5941 &AMDGPU::SReg_64RegClass) ||
5942 Opcode == AMDGPU::S_BITCMP0_B64 || Opcode == AMDGPU::S_BITCMP1_B64) {
5943 ErrInfo = "Instruction cannot read flat_scratch_base_hi";
5944 return false;
5945 }
5946 }
5947
5948 return true;
5949}
5950
5951// It is more readable to list mapped opcodes on the same line.
5952// clang-format off
5953
5954unsigned SIInstrInfo::getVALUOp(const MachineInstr &MI) const {
5955 switch (MI.getOpcode()) {
5956 default: return AMDGPU::INSTRUCTION_LIST_END;
5957 case AMDGPU::REG_SEQUENCE: return AMDGPU::REG_SEQUENCE;
5958 case AMDGPU::COPY: return AMDGPU::COPY;
5959 case AMDGPU::PHI: return AMDGPU::PHI;
5960 case AMDGPU::INSERT_SUBREG: return AMDGPU::INSERT_SUBREG;
5961 case AMDGPU::WQM: return AMDGPU::WQM;
5962 case AMDGPU::SOFT_WQM: return AMDGPU::SOFT_WQM;
5963 case AMDGPU::STRICT_WWM: return AMDGPU::STRICT_WWM;
5964 case AMDGPU::STRICT_WQM: return AMDGPU::STRICT_WQM;
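  // A register source, or an AGPR destination that v_mov_b32 cannot write, is
  // lowered as a plain COPY instead.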
5965 case AMDGPU::S_MOV_B32: {
5966 const MachineRegisterInfo &MRI = MI.getMF()->getRegInfo();
5967 return MI.getOperand(i: 1).isReg() ||
5968 RI.isAGPR(MRI, Reg: MI.getOperand(i: 0).getReg()) ?
5969 AMDGPU::COPY : AMDGPU::V_MOV_B32_e32;
5970 }
5971 case AMDGPU::S_ADD_I32:
5972 return ST.hasAddNoCarryInsts() ? AMDGPU::V_ADD_U32_e64 : AMDGPU::V_ADD_CO_U32_e32;
5973 case AMDGPU::S_ADDC_U32:
5974 return AMDGPU::V_ADDC_U32_e32;
5975 case AMDGPU::S_SUB_I32:
5976 return ST.hasAddNoCarryInsts() ? AMDGPU::V_SUB_U32_e64 : AMDGPU::V_SUB_CO_U32_e32;
5977 // FIXME: These are not consistently handled, and selected when the carry is
5978 // used.
5979 case AMDGPU::S_ADD_U32:
5980 return AMDGPU::V_ADD_CO_U32_e32;
5981 case AMDGPU::S_SUB_U32:
5982 return AMDGPU::V_SUB_CO_U32_e32;
5983 case AMDGPU::S_ADD_U64_PSEUDO:
5984 return AMDGPU::V_ADD_U64_PSEUDO;
5985 case AMDGPU::S_SUB_U64_PSEUDO:
5986 return AMDGPU::V_SUB_U64_PSEUDO;
5987 case AMDGPU::S_SUBB_U32: return AMDGPU::V_SUBB_U32_e32;
5988 case AMDGPU::S_MUL_I32: return AMDGPU::V_MUL_LO_U32_e64;
5989 case AMDGPU::S_MUL_HI_U32: return AMDGPU::V_MUL_HI_U32_e64;
5990 case AMDGPU::S_MUL_HI_I32: return AMDGPU::V_MUL_HI_I32_e64;
5991 case AMDGPU::S_AND_B32: return AMDGPU::V_AND_B32_e64;
5992 case AMDGPU::S_OR_B32: return AMDGPU::V_OR_B32_e64;
5993 case AMDGPU::S_XOR_B32: return AMDGPU::V_XOR_B32_e64;
5994 case AMDGPU::S_XNOR_B32:
5995 return ST.hasDLInsts() ? AMDGPU::V_XNOR_B32_e64 : AMDGPU::INSTRUCTION_LIST_END;
5996 case AMDGPU::S_MIN_I32: return AMDGPU::V_MIN_I32_e64;
5997 case AMDGPU::S_MIN_U32: return AMDGPU::V_MIN_U32_e64;
5998 case AMDGPU::S_MAX_I32: return AMDGPU::V_MAX_I32_e64;
5999 case AMDGPU::S_MAX_U32: return AMDGPU::V_MAX_U32_e64;
6000 case AMDGPU::S_ASHR_I32: return AMDGPU::V_ASHR_I32_e32;
6001 case AMDGPU::S_ASHR_I64: return AMDGPU::V_ASHR_I64_e64;
6002 case AMDGPU::S_LSHL_B32: return AMDGPU::V_LSHL_B32_e32;
6003 case AMDGPU::S_LSHL_B64: return AMDGPU::V_LSHL_B64_e64;
6004 case AMDGPU::S_LSHR_B32: return AMDGPU::V_LSHR_B32_e32;
6005 case AMDGPU::S_LSHR_B64: return AMDGPU::V_LSHR_B64_e64;
6006 case AMDGPU::S_SEXT_I32_I8: return AMDGPU::V_BFE_I32_e64;
6007 case AMDGPU::S_SEXT_I32_I16: return AMDGPU::V_BFE_I32_e64;
6008 case AMDGPU::S_BFE_U32: return AMDGPU::V_BFE_U32_e64;
6009 case AMDGPU::S_BFE_I32: return AMDGPU::V_BFE_I32_e64;
6010 case AMDGPU::S_BFM_B32: return AMDGPU::V_BFM_B32_e64;
6011 case AMDGPU::S_BREV_B32: return AMDGPU::V_BFREV_B32_e32;
6012 case AMDGPU::S_NOT_B32: return AMDGPU::V_NOT_B32_e32;
6013 case AMDGPU::S_NOT_B64: return AMDGPU::V_NOT_B32_e32;
6014 case AMDGPU::S_CMP_EQ_I32: return AMDGPU::V_CMP_EQ_I32_e64;
6015 case AMDGPU::S_CMP_LG_I32: return AMDGPU::V_CMP_NE_I32_e64;
6016 case AMDGPU::S_CMP_GT_I32: return AMDGPU::V_CMP_GT_I32_e64;
6017 case AMDGPU::S_CMP_GE_I32: return AMDGPU::V_CMP_GE_I32_e64;
6018 case AMDGPU::S_CMP_LT_I32: return AMDGPU::V_CMP_LT_I32_e64;
6019 case AMDGPU::S_CMP_LE_I32: return AMDGPU::V_CMP_LE_I32_e64;
6020 case AMDGPU::S_CMP_EQ_U32: return AMDGPU::V_CMP_EQ_U32_e64;
6021 case AMDGPU::S_CMP_LG_U32: return AMDGPU::V_CMP_NE_U32_e64;
6022 case AMDGPU::S_CMP_GT_U32: return AMDGPU::V_CMP_GT_U32_e64;
6023 case AMDGPU::S_CMP_GE_U32: return AMDGPU::V_CMP_GE_U32_e64;
6024 case AMDGPU::S_CMP_LT_U32: return AMDGPU::V_CMP_LT_U32_e64;
6025 case AMDGPU::S_CMP_LE_U32: return AMDGPU::V_CMP_LE_U32_e64;
6026 case AMDGPU::S_CMP_EQ_U64: return AMDGPU::V_CMP_EQ_U64_e64;
6027 case AMDGPU::S_CMP_LG_U64: return AMDGPU::V_CMP_NE_U64_e64;
6028 case AMDGPU::S_BCNT1_I32_B32: return AMDGPU::V_BCNT_U32_B32_e64;
6029 case AMDGPU::S_FF1_I32_B32: return AMDGPU::V_FFBL_B32_e32;
6030 case AMDGPU::S_FLBIT_I32_B32: return AMDGPU::V_FFBH_U32_e32;
6031 case AMDGPU::S_FLBIT_I32: return AMDGPU::V_FFBH_I32_e64;
6032 case AMDGPU::S_CBRANCH_SCC0: return AMDGPU::S_CBRANCH_VCCZ;
6033 case AMDGPU::S_CBRANCH_SCC1: return AMDGPU::S_CBRANCH_VCCNZ;
6034 case AMDGPU::S_CVT_F32_I32: return AMDGPU::V_CVT_F32_I32_e64;
6035 case AMDGPU::S_CVT_F32_U32: return AMDGPU::V_CVT_F32_U32_e64;
6036 case AMDGPU::S_CVT_I32_F32: return AMDGPU::V_CVT_I32_F32_e64;
6037 case AMDGPU::S_CVT_U32_F32: return AMDGPU::V_CVT_U32_F32_e64;
6038 case AMDGPU::S_CVT_F32_F16:
6039 case AMDGPU::S_CVT_HI_F32_F16:
6040 return ST.useRealTrue16Insts() ? AMDGPU::V_CVT_F32_F16_t16_e64
6041 : AMDGPU::V_CVT_F32_F16_fake16_e64;
6042 case AMDGPU::S_CVT_F16_F32:
6043 return ST.useRealTrue16Insts() ? AMDGPU::V_CVT_F16_F32_t16_e64
6044 : AMDGPU::V_CVT_F16_F32_fake16_e64;
6045 case AMDGPU::S_CEIL_F32: return AMDGPU::V_CEIL_F32_e64;
6046 case AMDGPU::S_FLOOR_F32: return AMDGPU::V_FLOOR_F32_e64;
6047 case AMDGPU::S_TRUNC_F32: return AMDGPU::V_TRUNC_F32_e64;
6048 case AMDGPU::S_RNDNE_F32: return AMDGPU::V_RNDNE_F32_e64;
6049 case AMDGPU::S_CEIL_F16:
6050 return ST.useRealTrue16Insts() ? AMDGPU::V_CEIL_F16_t16_e64
6051 : AMDGPU::V_CEIL_F16_fake16_e64;
6052 case AMDGPU::S_FLOOR_F16:
6053 return ST.useRealTrue16Insts() ? AMDGPU::V_FLOOR_F16_t16_e64
6054 : AMDGPU::V_FLOOR_F16_fake16_e64;
6055 case AMDGPU::S_TRUNC_F16:
6056 return ST.useRealTrue16Insts() ? AMDGPU::V_TRUNC_F16_t16_e64
6057 : AMDGPU::V_TRUNC_F16_fake16_e64;
6058 case AMDGPU::S_RNDNE_F16:
6059 return ST.useRealTrue16Insts() ? AMDGPU::V_RNDNE_F16_t16_e64
6060 : AMDGPU::V_RNDNE_F16_fake16_e64;
6061 case AMDGPU::S_ADD_F32: return AMDGPU::V_ADD_F32_e64;
6062 case AMDGPU::S_SUB_F32: return AMDGPU::V_SUB_F32_e64;
6063 case AMDGPU::S_MIN_F32: return AMDGPU::V_MIN_F32_e64;
6064 case AMDGPU::S_MAX_F32: return AMDGPU::V_MAX_F32_e64;
6065 case AMDGPU::S_MINIMUM_F32: return AMDGPU::V_MINIMUM_F32_e64;
6066 case AMDGPU::S_MAXIMUM_F32: return AMDGPU::V_MAXIMUM_F32_e64;
6067 case AMDGPU::S_MUL_F32: return AMDGPU::V_MUL_F32_e64;
6068 case AMDGPU::S_ADD_F16:
6069 return ST.useRealTrue16Insts() ? AMDGPU::V_ADD_F16_t16_e64
6070 : AMDGPU::V_ADD_F16_fake16_e64;
6071 case AMDGPU::S_SUB_F16:
6072 return ST.useRealTrue16Insts() ? AMDGPU::V_SUB_F16_t16_e64
6073 : AMDGPU::V_SUB_F16_fake16_e64;
6074 case AMDGPU::S_MIN_F16:
6075 return ST.useRealTrue16Insts() ? AMDGPU::V_MIN_F16_t16_e64
6076 : AMDGPU::V_MIN_F16_fake16_e64;
6077 case AMDGPU::S_MAX_F16:
6078 return ST.useRealTrue16Insts() ? AMDGPU::V_MAX_F16_t16_e64
6079 : AMDGPU::V_MAX_F16_fake16_e64;
6080 case AMDGPU::S_MINIMUM_F16:
6081 return ST.useRealTrue16Insts() ? AMDGPU::V_MINIMUM_F16_t16_e64
6082 : AMDGPU::V_MINIMUM_F16_fake16_e64;
6083 case AMDGPU::S_MAXIMUM_F16:
6084 return ST.useRealTrue16Insts() ? AMDGPU::V_MAXIMUM_F16_t16_e64
6085 : AMDGPU::V_MAXIMUM_F16_fake16_e64;
6086 case AMDGPU::S_MUL_F16:
6087 return ST.useRealTrue16Insts() ? AMDGPU::V_MUL_F16_t16_e64
6088 : AMDGPU::V_MUL_F16_fake16_e64;
6089 case AMDGPU::S_CVT_PK_RTZ_F16_F32: return AMDGPU::V_CVT_PKRTZ_F16_F32_e64;
6090 case AMDGPU::S_FMAC_F32: return AMDGPU::V_FMAC_F32_e64;
6091 case AMDGPU::S_FMAC_F16:
6092 return ST.useRealTrue16Insts() ? AMDGPU::V_FMAC_F16_t16_e64
6093 : AMDGPU::V_FMAC_F16_fake16_e64;
6094 case AMDGPU::S_FMAMK_F32: return AMDGPU::V_FMAMK_F32;
6095 case AMDGPU::S_FMAAK_F32: return AMDGPU::V_FMAAK_F32;
6096 case AMDGPU::S_CMP_LT_F32: return AMDGPU::V_CMP_LT_F32_e64;
6097 case AMDGPU::S_CMP_EQ_F32: return AMDGPU::V_CMP_EQ_F32_e64;
6098 case AMDGPU::S_CMP_LE_F32: return AMDGPU::V_CMP_LE_F32_e64;
6099 case AMDGPU::S_CMP_GT_F32: return AMDGPU::V_CMP_GT_F32_e64;
6100 case AMDGPU::S_CMP_LG_F32: return AMDGPU::V_CMP_LG_F32_e64;
6101 case AMDGPU::S_CMP_GE_F32: return AMDGPU::V_CMP_GE_F32_e64;
6102 case AMDGPU::S_CMP_O_F32: return AMDGPU::V_CMP_O_F32_e64;
6103 case AMDGPU::S_CMP_U_F32: return AMDGPU::V_CMP_U_F32_e64;
6104 case AMDGPU::S_CMP_NGE_F32: return AMDGPU::V_CMP_NGE_F32_e64;
6105 case AMDGPU::S_CMP_NLG_F32: return AMDGPU::V_CMP_NLG_F32_e64;
6106 case AMDGPU::S_CMP_NGT_F32: return AMDGPU::V_CMP_NGT_F32_e64;
6107 case AMDGPU::S_CMP_NLE_F32: return AMDGPU::V_CMP_NLE_F32_e64;
6108 case AMDGPU::S_CMP_NEQ_F32: return AMDGPU::V_CMP_NEQ_F32_e64;
6109 case AMDGPU::S_CMP_NLT_F32: return AMDGPU::V_CMP_NLT_F32_e64;
6110 case AMDGPU::S_CMP_LT_F16:
6111 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_LT_F16_t16_e64
6112 : AMDGPU::V_CMP_LT_F16_fake16_e64;
6113 case AMDGPU::S_CMP_EQ_F16:
6114 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_EQ_F16_t16_e64
6115 : AMDGPU::V_CMP_EQ_F16_fake16_e64;
6116 case AMDGPU::S_CMP_LE_F16:
6117 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_LE_F16_t16_e64
6118 : AMDGPU::V_CMP_LE_F16_fake16_e64;
6119 case AMDGPU::S_CMP_GT_F16:
6120 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_GT_F16_t16_e64
6121 : AMDGPU::V_CMP_GT_F16_fake16_e64;
6122 case AMDGPU::S_CMP_LG_F16:
6123 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_LG_F16_t16_e64
6124 : AMDGPU::V_CMP_LG_F16_fake16_e64;
6125 case AMDGPU::S_CMP_GE_F16:
6126 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_GE_F16_t16_e64
6127 : AMDGPU::V_CMP_GE_F16_fake16_e64;
6128 case AMDGPU::S_CMP_O_F16:
6129 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_O_F16_t16_e64
6130 : AMDGPU::V_CMP_O_F16_fake16_e64;
6131 case AMDGPU::S_CMP_U_F16:
6132 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_U_F16_t16_e64
6133 : AMDGPU::V_CMP_U_F16_fake16_e64;
6134 case AMDGPU::S_CMP_NGE_F16:
6135 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_NGE_F16_t16_e64
6136 : AMDGPU::V_CMP_NGE_F16_fake16_e64;
6137 case AMDGPU::S_CMP_NLG_F16:
6138 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_NLG_F16_t16_e64
6139 : AMDGPU::V_CMP_NLG_F16_fake16_e64;
6140 case AMDGPU::S_CMP_NGT_F16:
6141 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_NGT_F16_t16_e64
6142 : AMDGPU::V_CMP_NGT_F16_fake16_e64;
6143 case AMDGPU::S_CMP_NLE_F16:
6144 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_NLE_F16_t16_e64
6145 : AMDGPU::V_CMP_NLE_F16_fake16_e64;
6146 case AMDGPU::S_CMP_NEQ_F16:
6147 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_NEQ_F16_t16_e64
6148 : AMDGPU::V_CMP_NEQ_F16_fake16_e64;
6149 case AMDGPU::S_CMP_NLT_F16:
6150 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_NLT_F16_t16_e64
6151 : AMDGPU::V_CMP_NLT_F16_fake16_e64;
6152 case AMDGPU::V_S_EXP_F32_e64: return AMDGPU::V_EXP_F32_e64;
6153 case AMDGPU::V_S_EXP_F16_e64:
6154 return ST.useRealTrue16Insts() ? AMDGPU::V_EXP_F16_t16_e64
6155 : AMDGPU::V_EXP_F16_fake16_e64;
6156 case AMDGPU::V_S_LOG_F32_e64: return AMDGPU::V_LOG_F32_e64;
6157 case AMDGPU::V_S_LOG_F16_e64:
6158 return ST.useRealTrue16Insts() ? AMDGPU::V_LOG_F16_t16_e64
6159 : AMDGPU::V_LOG_F16_fake16_e64;
6160 case AMDGPU::V_S_RCP_F32_e64: return AMDGPU::V_RCP_F32_e64;
6161 case AMDGPU::V_S_RCP_F16_e64:
6162 return ST.useRealTrue16Insts() ? AMDGPU::V_RCP_F16_t16_e64
6163 : AMDGPU::V_RCP_F16_fake16_e64;
6164 case AMDGPU::V_S_RSQ_F32_e64: return AMDGPU::V_RSQ_F32_e64;
6165 case AMDGPU::V_S_RSQ_F16_e64:
6166 return ST.useRealTrue16Insts() ? AMDGPU::V_RSQ_F16_t16_e64
6167 : AMDGPU::V_RSQ_F16_fake16_e64;
6168 case AMDGPU::V_S_SQRT_F32_e64: return AMDGPU::V_SQRT_F32_e64;
6169 case AMDGPU::V_S_SQRT_F16_e64:
6170 return ST.useRealTrue16Insts() ? AMDGPU::V_SQRT_F16_t16_e64
6171 : AMDGPU::V_SQRT_F16_fake16_e64;
6172 }
6173 llvm_unreachable(
6174 "Unexpected scalar opcode without corresponding vector one!");
6175}
6176
6177// clang-format on
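// A rough illustration of how the mapping above is consumed (the caller, e.g.
// moveToVALU, supplies the actual operands): a scalar add such as
//   %r:sgpr_32 = S_ADD_I32 %a, %b, implicit-def $scc
// is rewritten using getVALUOp()'s result, giving on add-no-carry subtargets
// something like
//   %r:vgpr_32 = V_ADD_U32_e64 %a, %b, 0
// and on older subtargets V_ADD_CO_U32_e32 with an implicit VCC carry def.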
6178
6179void SIInstrInfo::insertScratchExecCopy(MachineFunction &MF,
6180 MachineBasicBlock &MBB,
6181 MachineBasicBlock::iterator MBBI,
6182 const DebugLoc &DL, Register Reg,
6183 bool IsSCCLive,
6184 SlotIndexes *Indexes) const {
6185 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
6186 const SIInstrInfo *TII = ST.getInstrInfo();
6187 const AMDGPU::LaneMaskConstants &LMC = AMDGPU::LaneMaskConstants::get(ST);
6188 if (IsSCCLive) {
6189 // Insert two move instructions, one to save the original value of EXEC and
6190 // the other to turn on all bits in EXEC. This is required because the single
6191 // S_OR_SAVEEXEC instruction would clobber SCC, which is still live here.
6192 auto StoreExecMI = BuildMI(BB&: MBB, I: MBBI, MIMD: DL, MCID: TII->get(Opcode: LMC.MovOpc), DestReg: Reg)
6193 .addReg(RegNo: LMC.ExecReg, Flags: RegState::Kill);
6194 auto FlipExecMI =
6195 BuildMI(BB&: MBB, I: MBBI, MIMD: DL, MCID: TII->get(Opcode: LMC.MovOpc), DestReg: LMC.ExecReg).addImm(Val: -1);
6196 if (Indexes) {
6197 Indexes->insertMachineInstrInMaps(MI&: *StoreExecMI);
6198 Indexes->insertMachineInstrInMaps(MI&: *FlipExecMI);
6199 }
6200 } else {
6201 auto SaveExec =
6202 BuildMI(BB&: MBB, I: MBBI, MIMD: DL, MCID: TII->get(Opcode: LMC.OrSaveExecOpc), DestReg: Reg).addImm(Val: -1);
6203 SaveExec->getOperand(i: 3).setIsDead(); // Mark SCC as dead.
6204 if (Indexes)
6205 Indexes->insertMachineInstrInMaps(MI&: *SaveExec);
6206 }
6207}
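// For illustration, on a wave64 subtarget the two paths above roughly expand to
//   SCC live:  s_mov_b64 <save>, exec
//              s_mov_b64 exec, -1
//   SCC dead:  s_or_saveexec_b64 <save>, -1
// (wave32 subtargets use the corresponding *_b32 opcodes from LaneMaskConstants).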
6208
6209void SIInstrInfo::restoreExec(MachineFunction &MF, MachineBasicBlock &MBB,
6210 MachineBasicBlock::iterator MBBI,
6211 const DebugLoc &DL, Register Reg,
6212 SlotIndexes *Indexes) const {
6213 const AMDGPU::LaneMaskConstants &LMC = AMDGPU::LaneMaskConstants::get(ST);
6214 auto ExecRestoreMI = BuildMI(BB&: MBB, I: MBBI, MIMD: DL, MCID: get(Opcode: LMC.MovOpc), DestReg: LMC.ExecReg)
6215 .addReg(RegNo: Reg, Flags: RegState::Kill);
6216 if (Indexes)
6217 Indexes->insertMachineInstrInMaps(MI&: *ExecRestoreMI);
6218}
6219
6220MachineInstr *
6221SIInstrInfo::getWholeWaveFunctionSetup(MachineFunction &MF) const {
6222 assert(MF.getInfo<SIMachineFunctionInfo>()->isWholeWaveFunction() &&
6223 "Not a whole wave func");
6224 MachineBasicBlock &MBB = *MF.begin();
6225 for (MachineInstr &MI : MBB)
6226 if (MI.getOpcode() == AMDGPU::SI_WHOLE_WAVE_FUNC_SETUP ||
6227 MI.getOpcode() == AMDGPU::G_AMDGPU_WHOLE_WAVE_FUNC_SETUP)
6228 return &MI;
6229
6230 llvm_unreachable("Couldn't find SI_WHOLE_WAVE_FUNC_SETUP instruction");
6231}
6232
6233const TargetRegisterClass *SIInstrInfo::getOpRegClass(const MachineInstr &MI,
6234 unsigned OpNo) const {
6235 const MCInstrDesc &Desc = get(Opcode: MI.getOpcode());
6236 if (MI.isVariadic() || OpNo >= Desc.getNumOperands() ||
6237 Desc.operands()[OpNo].RegClass == -1) {
6238 Register Reg = MI.getOperand(i: OpNo).getReg();
6239
6240 if (Reg.isVirtual()) {
6241 const MachineRegisterInfo &MRI = MI.getMF()->getRegInfo();
6242 return MRI.getRegClass(Reg);
6243 }
6244 return RI.getPhysRegBaseClass(Reg);
6245 }
6246
6247 int16_t RegClass = getOpRegClassID(OpInfo: Desc.operands()[OpNo]);
6248 return RegClass < 0 ? nullptr : RI.getRegClass(i: RegClass);
6249}
6250
6251void SIInstrInfo::legalizeOpWithMove(MachineInstr &MI, unsigned OpIdx) const {
6252 MachineBasicBlock::iterator I = MI;
6253 MachineBasicBlock *MBB = MI.getParent();
6254 MachineOperand &MO = MI.getOperand(i: OpIdx);
6255 MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
6256 unsigned RCID = getOpRegClassID(OpInfo: get(Opcode: MI.getOpcode()).operands()[OpIdx]);
6257 const TargetRegisterClass *RC = RI.getRegClass(i: RCID);
6258 unsigned Size = RI.getRegSizeInBits(RC: *RC);
6259 unsigned Opcode = (Size == 64) ? AMDGPU::V_MOV_B64_PSEUDO
6260 : Size == 16 ? AMDGPU::V_MOV_B16_t16_e64
6261 : AMDGPU::V_MOV_B32_e32;
6262 if (MO.isReg())
6263 Opcode = AMDGPU::COPY;
6264 else if (RI.isSGPRClass(RC))
6265 Opcode = (Size == 64) ? AMDGPU::S_MOV_B64 : AMDGPU::S_MOV_B32;
6266
6267 const TargetRegisterClass *VRC = RI.getEquivalentVGPRClass(SRC: RC);
6268 Register Reg = MRI.createVirtualRegister(RegClass: VRC);
6269 DebugLoc DL = MBB->findDebugLoc(MBBI: I);
6270 BuildMI(BB&: *MI.getParent(), I, MIMD: DL, MCID: get(Opcode), DestReg: Reg).add(MO);
6271 MO.ChangeToRegister(Reg, isDef: false);
6272}
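// Sketch of the effect (hypothetical registers): if OpIdx names an operand that
// must be a VGPR but currently holds an SGPR, e.g. src1 of
//   %v = V_ADD_U32_e32 %a, %sgpr
// the operand is rewritten through a fresh VGPR:
//   %tmp:vgpr_32 = COPY %sgpr
//   %v = V_ADD_U32_e32 %a, %tmp
// Immediate operands are typically materialized with V_MOV_B32_e32 (or a
// 64-bit/16-bit move variant) instead of a COPY.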
6273
6274unsigned SIInstrInfo::buildExtractSubReg(
6275 MachineBasicBlock::iterator MI, MachineRegisterInfo &MRI,
6276 const MachineOperand &SuperReg, const TargetRegisterClass *SuperRC,
6277 unsigned SubIdx, const TargetRegisterClass *SubRC) const {
6278 if (!SuperReg.getReg().isVirtual())
6279 return RI.getSubReg(Reg: SuperReg.getReg(), Idx: SubIdx);
6280
6281 MachineBasicBlock *MBB = MI->getParent();
6282 const DebugLoc &DL = MI->getDebugLoc();
6283 Register SubReg = MRI.createVirtualRegister(RegClass: SubRC);
6284
6285 unsigned NewSubIdx = RI.composeSubRegIndices(a: SuperReg.getSubReg(), b: SubIdx);
6286 BuildMI(BB&: *MBB, I: MI, MIMD: DL, MCID: get(Opcode: TargetOpcode::COPY), DestReg: SubReg)
6287 .addReg(RegNo: SuperReg.getReg(), Flags: {}, SubReg: NewSubIdx);
6288 return SubReg;
6289}
6290
6291MachineOperand SIInstrInfo::buildExtractSubRegOrImm(
6292 MachineBasicBlock::iterator MII, MachineRegisterInfo &MRI,
6293 const MachineOperand &Op, const TargetRegisterClass *SuperRC,
6294 unsigned SubIdx, const TargetRegisterClass *SubRC) const {
6295 if (Op.isImm()) {
6296 if (SubIdx == AMDGPU::sub0)
6297 return MachineOperand::CreateImm(Val: static_cast<int32_t>(Op.getImm()));
6298 if (SubIdx == AMDGPU::sub1)
6299 return MachineOperand::CreateImm(Val: static_cast<int32_t>(Op.getImm() >> 32));
6300
6301 llvm_unreachable("Unhandled register index for immediate");
6302 }
6303
6304 unsigned SubReg = buildExtractSubReg(MI: MII, MRI, SuperReg: Op, SuperRC,
6305 SubIdx, SubRC);
6306 return MachineOperand::CreateReg(Reg: SubReg, isDef: false);
6307}
6308
6309// Change the order of operands from (0, 1, 2) to (0, 2, 1)
6310void SIInstrInfo::swapOperands(MachineInstr &Inst) const {
6311 assert(Inst.getNumExplicitOperands() == 3);
6312 MachineOperand Op1 = Inst.getOperand(i: 1);
6313 Inst.removeOperand(OpNo: 1);
6314 Inst.addOperand(Op: Op1);
6315}
6316
6317bool SIInstrInfo::isLegalRegOperand(const MachineRegisterInfo &MRI,
6318 const MCOperandInfo &OpInfo,
6319 const MachineOperand &MO) const {
6320 if (!MO.isReg())
6321 return false;
6322
6323 Register Reg = MO.getReg();
6324
6325 const TargetRegisterClass *DRC = RI.getRegClass(i: getOpRegClassID(OpInfo));
6326 if (Reg.isPhysical())
6327 return DRC->contains(Reg);
6328
6329 const TargetRegisterClass *RC = MRI.getRegClass(Reg);
6330
6331 if (MO.getSubReg()) {
6332 const MachineFunction *MF = MO.getParent()->getMF();
6333 const TargetRegisterClass *SuperRC = RI.getLargestLegalSuperClass(RC, MF: *MF);
6334 if (!SuperRC)
6335 return false;
6336 return RI.getMatchingSuperRegClass(A: SuperRC, B: DRC, Idx: MO.getSubReg()) != nullptr;
6337 }
6338
6339 return RI.getCommonSubClass(A: DRC, B: RC) != nullptr;
6340}
6341
6342bool SIInstrInfo::isLegalRegOperand(const MachineInstr &MI, unsigned OpIdx,
6343 const MachineOperand &MO) const {
6344 const MachineRegisterInfo &MRI = MI.getMF()->getRegInfo();
6345 const MCOperandInfo OpInfo = MI.getDesc().operands()[OpIdx];
6346 unsigned Opc = MI.getOpcode();
6347
6348 // See SIInstrInfo::isLegalGFX12PlusPackedMathFP32Operand for more
6349 // information.
6350 if (AMDGPU::isPackedFP32Inst(Opc: MI.getOpcode()) && AMDGPU::isGFX12Plus(STI: ST) &&
6351 MO.isReg() && RI.isSGPRReg(MRI, Reg: MO.getReg())) {
6352 constexpr AMDGPU::OpName OpNames[] = {
6353 AMDGPU::OpName::src0, AMDGPU::OpName::src1, AMDGPU::OpName::src2};
6354
6355 for (auto [I, OpName] : enumerate(First: OpNames)) {
6356 int SrcIdx = AMDGPU::getNamedOperandIdx(Opcode: MI.getOpcode(), Name: OpNames[I]);
6357 if (static_cast<unsigned>(SrcIdx) == OpIdx &&
6358 !isLegalGFX12PlusPackedMathFP32Operand(MRI, MI, SrcN: I, MO: &MO))
6359 return false;
6360 }
6361 }
6362
6363 if (!isLegalRegOperand(MRI, OpInfo, MO))
6364 return false;
6365
6366 // Check the Accumulate GPR (AGPR) operand.
6367 bool IsAGPR = RI.isAGPR(MRI, Reg: MO.getReg());
6368 if (IsAGPR && !ST.hasMAIInsts())
6369 return false;
6370 if (IsAGPR && (!ST.hasGFX90AInsts() || !MRI.reservedRegsFrozen()) &&
6371 (MI.mayLoad() || MI.mayStore() || isDS(Opcode: Opc) || isMIMG(Opcode: Opc)))
6372 return false;
6373 // Atomics should have both vdst and vdata either vgpr or agpr.
6374 const int VDstIdx = AMDGPU::getNamedOperandIdx(Opcode: Opc, Name: AMDGPU::OpName::vdst);
6375 const int DataIdx = AMDGPU::getNamedOperandIdx(
6376 Opcode: Opc, Name: isDS(Opcode: Opc) ? AMDGPU::OpName::data0 : AMDGPU::OpName::vdata);
6377 if ((int)OpIdx == VDstIdx && DataIdx != -1 &&
6378 MI.getOperand(i: DataIdx).isReg() &&
6379 RI.isAGPR(MRI, Reg: MI.getOperand(i: DataIdx).getReg()) != IsAGPR)
6380 return false;
6381 if ((int)OpIdx == DataIdx) {
6382 if (VDstIdx != -1 &&
6383 RI.isAGPR(MRI, Reg: MI.getOperand(i: VDstIdx).getReg()) != IsAGPR)
6384 return false;
6385 // DS instructions with 2 src operands also must have tied RC.
6386 const int Data1Idx = AMDGPU::getNamedOperandIdx(Opcode: Opc, Name: AMDGPU::OpName::data1);
6387 if (Data1Idx != -1 && MI.getOperand(i: Data1Idx).isReg() &&
6388 RI.isAGPR(MRI, Reg: MI.getOperand(i: Data1Idx).getReg()) != IsAGPR)
6389 return false;
6390 }
6391
6392 // Check V_ACCVGPR_WRITE_B32_e64
6393 if (Opc == AMDGPU::V_ACCVGPR_WRITE_B32_e64 && !ST.hasGFX90AInsts() &&
6394 (int)OpIdx == AMDGPU::getNamedOperandIdx(Opcode: Opc, Name: AMDGPU::OpName::src0) &&
6395 RI.isSGPRReg(MRI, Reg: MO.getReg()))
6396 return false;
6397
6398 if (ST.hasFlatScratchHiInB64InstHazard() &&
6399 MO.getReg() == AMDGPU::SRC_FLAT_SCRATCH_BASE_HI && isSALU(MI)) {
6400 if (const MachineOperand *Dst = getNamedOperand(MI, OperandName: AMDGPU::OpName::sdst)) {
6401 if (AMDGPU::getRegBitWidth(RC: *RI.getRegClassForReg(MRI, Reg: Dst->getReg())) ==
6402 64)
6403 return false;
6404 }
6405 if (Opc == AMDGPU::S_BITCMP0_B64 || Opc == AMDGPU::S_BITCMP1_B64)
6406 return false;
6407 }
6408
6409 return true;
6410}
6411
6412bool SIInstrInfo::isLegalVSrcOperand(const MachineRegisterInfo &MRI,
6413 const MCOperandInfo &OpInfo,
6414 const MachineOperand &MO) const {
6415 if (MO.isReg())
6416 return isLegalRegOperand(MRI, OpInfo, MO);
6417
6418 // Handle non-register types that are treated like immediates.
6419 assert(MO.isImm() || MO.isTargetIndex() || MO.isFI() || MO.isGlobal());
6420 return true;
6421}
6422
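// Note on the helper below: on gfx12+ the packed FP32 instructions (V_PK_ADD_F32
// and friends) are treated as unable to combine a scalar source with the
// OP_SEL_0 / OP_SEL_1 source modifiers. Accordingly, the helper only reports an
// SGPR source as illegal when one of those op_sel bits is set on it; all other
// operands are accepted.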
6423bool SIInstrInfo::isLegalGFX12PlusPackedMathFP32Operand(
6424 const MachineRegisterInfo &MRI, const MachineInstr &MI, unsigned SrcN,
6425 const MachineOperand *MO) const {
6426 constexpr unsigned NumOps = 3;
6427 constexpr AMDGPU::OpName OpNames[NumOps * 2] = {
6428 AMDGPU::OpName::src0, AMDGPU::OpName::src1,
6429 AMDGPU::OpName::src2, AMDGPU::OpName::src0_modifiers,
6430 AMDGPU::OpName::src1_modifiers, AMDGPU::OpName::src2_modifiers};
6431
6432 assert(SrcN < NumOps);
6433
6434 if (!MO) {
6435 int SrcIdx = AMDGPU::getNamedOperandIdx(Opcode: MI.getOpcode(), Name: OpNames[SrcN]);
6436 if (SrcIdx == -1)
6437 return true;
6438 MO = &MI.getOperand(i: SrcIdx);
6439 }
6440
6441 if (!MO->isReg() || !RI.isSGPRReg(MRI, Reg: MO->getReg()))
6442 return true;
6443
6444 int ModsIdx =
6445 AMDGPU::getNamedOperandIdx(Opcode: MI.getOpcode(), Name: OpNames[NumOps + SrcN]);
6446 if (ModsIdx == -1)
6447 return true;
6448
6449 unsigned Mods = MI.getOperand(i: ModsIdx).getImm();
6450 bool OpSel = Mods & SISrcMods::OP_SEL_0;
6451 bool OpSelHi = Mods & SISrcMods::OP_SEL_1;
6452
6453 return !OpSel && !OpSelHi;
6454}
6455
6456bool SIInstrInfo::isOperandLegal(const MachineInstr &MI, unsigned OpIdx,
6457 const MachineOperand *MO) const {
6458 const MachineFunction &MF = *MI.getMF();
6459 const MachineRegisterInfo &MRI = MF.getRegInfo();
6460 const MCInstrDesc &InstDesc = MI.getDesc();
6461 const MCOperandInfo &OpInfo = InstDesc.operands()[OpIdx];
6462 int64_t RegClass = getOpRegClassID(OpInfo);
6463 const TargetRegisterClass *DefinedRC =
6464 RegClass != -1 ? RI.getRegClass(i: RegClass) : nullptr;
6465 if (!MO)
6466 MO = &MI.getOperand(i: OpIdx);
6467
6468 const bool IsInlineConst = !MO->isReg() && isInlineConstant(MO: *MO, OpInfo);
6469
6470 if (isVALU(MI) && !IsInlineConst && usesConstantBus(MRI, MO: *MO, OpInfo)) {
6471 const MachineOperand *UsedLiteral = nullptr;
6472
6473 int ConstantBusLimit = ST.getConstantBusLimit(Opcode: MI.getOpcode());
6474 int LiteralLimit = !isVOP3(MI) || ST.hasVOP3Literal() ? 1 : 0;
6475
6476 // TODO: Be more permissive with frame indexes.
6477 if (!MO->isReg() && !isInlineConstant(MO: *MO, OpInfo)) {
6478 if (!LiteralLimit--)
6479 return false;
6480
6481 UsedLiteral = MO;
6482 }
6483
6484 SmallDenseSet<RegSubRegPair> SGPRsUsed;
6485 if (MO->isReg())
6486 SGPRsUsed.insert(V: RegSubRegPair(MO->getReg(), MO->getSubReg()));
6487
6488 for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
6489 if (i == OpIdx)
6490 continue;
6491 const MachineOperand &Op = MI.getOperand(i);
6492 if (Op.isReg()) {
6493 if (Op.isUse()) {
6494 RegSubRegPair SGPR(Op.getReg(), Op.getSubReg());
6495 if (regUsesConstantBus(RegOp: Op, MRI) && SGPRsUsed.insert(V: SGPR).second) {
6496 if (--ConstantBusLimit <= 0)
6497 return false;
6498 }
6499 }
6500 } else if (AMDGPU::isSISrcOperand(OpInfo: InstDesc.operands()[i]) &&
6501 !isInlineConstant(MO: Op, OpInfo: InstDesc.operands()[i])) {
6502 // The same literal may be used multiple times.
6503 if (!UsedLiteral)
6504 UsedLiteral = &Op;
6505 else if (UsedLiteral->isIdenticalTo(Other: Op))
6506 continue;
6507
6508 if (!LiteralLimit--)
6509 return false;
6510 if (--ConstantBusLimit <= 0)
6511 return false;
6512 }
6513 }
6514 } else if (!IsInlineConst && !MO->isReg() && isSALU(MI)) {
6515 // There can be at most one literal operand, but it can be repeated.
6516 for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
6517 if (i == OpIdx)
6518 continue;
6519 const MachineOperand &Op = MI.getOperand(i);
6520 if (!Op.isReg() && !Op.isFI() && !Op.isRegMask() &&
6521 !isInlineConstant(MO: Op, OpInfo: InstDesc.operands()[i]) &&
6522 !Op.isIdenticalTo(Other: *MO))
6523 return false;
6524
6525 // Do not fold a non-inlineable and non-register operand into an
6526 // instruction that already has a frame index. The frame index handling
6527 // code does not cope with a frame index co-existing with another
6528 // non-register operand, unless that operand is an inlineable immediate.
6529 if (Op.isFI())
6530 return false;
6531 }
6532 } else if (IsInlineConst && ST.hasNoF16PseudoScalarTransInlineConstants() &&
6533 isF16PseudoScalarTrans(Opcode: MI.getOpcode())) {
6534 return false;
6535 }
6536
6537 if (MO->isReg()) {
6538 if (!DefinedRC)
6539 return OpInfo.OperandType == MCOI::OPERAND_UNKNOWN;
6540 return isLegalRegOperand(MI, OpIdx, MO: *MO);
6541 }
6542
6543 if (MO->isImm()) {
6544 uint64_t Imm = MO->getImm();
6545 bool Is64BitFPOp = OpInfo.OperandType == AMDGPU::OPERAND_REG_IMM_FP64;
6546 bool Is64BitOp = Is64BitFPOp ||
6547 OpInfo.OperandType == AMDGPU::OPERAND_REG_IMM_INT64 ||
6548 OpInfo.OperandType == AMDGPU::OPERAND_REG_IMM_V2INT32 ||
6549 OpInfo.OperandType == AMDGPU::OPERAND_REG_IMM_V2FP32;
6550 if (Is64BitOp &&
6551 !AMDGPU::isInlinableLiteral64(Literal: Imm, HasInv2Pi: ST.hasInv2PiInlineImm())) {
6552 if (!AMDGPU::isValid32BitLiteral(Val: Imm, IsFP64: Is64BitFPOp) &&
6553 (!ST.has64BitLiterals() || InstDesc.getSize() != 4))
6554 return false;
6555
6556 // FIXME: We can use sign extended 64-bit literals, but only for signed
6557 // operands. At the moment we do not know if an operand is signed.
6558 // Such an operand will be encoded as its low 32 bits and then either
6559 // correctly sign extended or incorrectly zero extended by HW.
6560 // If 64-bit literals are supported and the literal will be encoded
6561 // as a full 64 bits, we can still use it.
6562 if (!Is64BitFPOp && (int32_t)Imm < 0 &&
6563 (!ST.has64BitLiterals() || AMDGPU::isValid32BitLiteral(Val: Imm, IsFP64: false)))
6564 return false;
6565 }
6566 }
6567
6568 // Handle non-register types that are treated like immediates.
6569 assert(MO->isImm() || MO->isTargetIndex() || MO->isFI() || MO->isGlobal());
6570
6571 if (!DefinedRC) {
6572 // This operand expects an immediate.
6573 return true;
6574 }
6575
6576 return isImmOperandLegal(MI, OpNo: OpIdx, MO: *MO);
6577}
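// Informal example of the constant-bus accounting above (the exact numbers come
// from getConstantBusLimit() and hasVOP3Literal()): a VOP3 such as
//   V_FMA_F32 %d, %sgpr0, %sgpr1, 1077936128
// needs two SGPR reads plus a literal, which is rejected on subtargets whose
// constant-bus limit is two, while the same instruction with only one SGPR
// source and the literal would be accepted.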
6578
6579bool SIInstrInfo::isNeverCoissue(MachineInstr &MI) const {
6580 bool IsGFX950Only = ST.hasGFX950Insts();
6581 bool IsGFX940Only = ST.hasGFX940Insts();
6582
6583 if (!IsGFX950Only && !IsGFX940Only)
6584 return false;
6585
6586 if (!isVALU(MI))
6587 return false;
6588
6589 // V_COS, V_EXP, V_RCP, etc.
6590 if (isTRANS(MI))
6591 return true;
6592
6593 // DOT2, DOT2C, DOT4, etc.
6594 if (isDOT(MI))
6595 return true;
6596
6597 // MFMA, SMFMA
6598 if (isMFMA(MI))
6599 return true;
6600
6601 unsigned Opcode = MI.getOpcode();
6602 switch (Opcode) {
6603 case AMDGPU::V_CVT_PK_BF8_F32_e64:
6604 case AMDGPU::V_CVT_PK_FP8_F32_e64:
6605 case AMDGPU::V_MQSAD_PK_U16_U8_e64:
6606 case AMDGPU::V_MQSAD_U32_U8_e64:
6607 case AMDGPU::V_PK_ADD_F16:
6608 case AMDGPU::V_PK_ADD_F32:
6609 case AMDGPU::V_PK_ADD_I16:
6610 case AMDGPU::V_PK_ADD_U16:
6611 case AMDGPU::V_PK_ASHRREV_I16:
6612 case AMDGPU::V_PK_FMA_F16:
6613 case AMDGPU::V_PK_FMA_F32:
6614 case AMDGPU::V_PK_FMAC_F16_e32:
6615 case AMDGPU::V_PK_FMAC_F16_e64:
6616 case AMDGPU::V_PK_LSHLREV_B16:
6617 case AMDGPU::V_PK_LSHRREV_B16:
6618 case AMDGPU::V_PK_MAD_I16:
6619 case AMDGPU::V_PK_MAD_U16:
6620 case AMDGPU::V_PK_MAX_F16:
6621 case AMDGPU::V_PK_MAX_I16:
6622 case AMDGPU::V_PK_MAX_U16:
6623 case AMDGPU::V_PK_MIN_F16:
6624 case AMDGPU::V_PK_MIN_I16:
6625 case AMDGPU::V_PK_MIN_U16:
6626 case AMDGPU::V_PK_MOV_B32:
6627 case AMDGPU::V_PK_MUL_F16:
6628 case AMDGPU::V_PK_MUL_F32:
6629 case AMDGPU::V_PK_MUL_LO_U16:
6630 case AMDGPU::V_PK_SUB_I16:
6631 case AMDGPU::V_PK_SUB_U16:
6632 case AMDGPU::V_QSAD_PK_U16_U8_e64:
6633 return true;
6634 default:
6635 return false;
6636 }
6637}
6638
6639void SIInstrInfo::legalizeOperandsVOP2(MachineRegisterInfo &MRI,
6640 MachineInstr &MI) const {
6641 unsigned Opc = MI.getOpcode();
6642 const MCInstrDesc &InstrDesc = get(Opcode: Opc);
6643
6644 int Src0Idx = AMDGPU::getNamedOperandIdx(Opcode: Opc, Name: AMDGPU::OpName::src0);
6645 MachineOperand &Src0 = MI.getOperand(i: Src0Idx);
6646
6647 int Src1Idx = AMDGPU::getNamedOperandIdx(Opcode: Opc, Name: AMDGPU::OpName::src1);
6648 MachineOperand &Src1 = MI.getOperand(i: Src1Idx);
6649
6650 // If there is an implicit SGPR use such as the VCC use for v_addc_u32/v_subb_u32,
6651 // we may only use the constant bus once before GFX10, so legalize an SGPR src0.
6652 bool HasImplicitSGPR = findImplicitSGPRRead(MI);
6653 if (HasImplicitSGPR && ST.getConstantBusLimit(Opcode: Opc) <= 1 && Src0.isReg() &&
6654 RI.isSGPRReg(MRI, Reg: Src0.getReg()))
6655 legalizeOpWithMove(MI, OpIdx: Src0Idx);
6656
6657 // Special case: V_WRITELANE_B32 accepts only immediate or SGPR operands for
6658 // both the value to write (src0) and lane select (src1). Fix up non-SGPR
6659 // src0/src1 with V_READFIRSTLANE.
6660 if (Opc == AMDGPU::V_WRITELANE_B32) {
6661 const DebugLoc &DL = MI.getDebugLoc();
6662 if (Src0.isReg() && RI.isVGPR(MRI, Reg: Src0.getReg())) {
6663 Register Reg = MRI.createVirtualRegister(RegClass: &AMDGPU::SReg_32_XM0RegClass);
6664 BuildMI(BB&: *MI.getParent(), I&: MI, MIMD: DL, MCID: get(Opcode: AMDGPU::V_READFIRSTLANE_B32), DestReg: Reg)
6665 .add(MO: Src0);
6666 Src0.ChangeToRegister(Reg, isDef: false);
6667 }
6668 if (Src1.isReg() && RI.isVGPR(MRI, Reg: Src1.getReg())) {
6669 Register Reg = MRI.createVirtualRegister(RegClass: &AMDGPU::SReg_32_XM0RegClass);
6670 const DebugLoc &DL = MI.getDebugLoc();
6671 BuildMI(BB&: *MI.getParent(), I&: MI, MIMD: DL, MCID: get(Opcode: AMDGPU::V_READFIRSTLANE_B32), DestReg: Reg)
6672 .add(MO: Src1);
6673 Src1.ChangeToRegister(Reg, isDef: false);
6674 }
6675 return;
6676 }
6677
6678 // Special case: V_FMAC_F32 and V_FMAC_F16 have src2, which must be a VGPR.
6679 if (Opc == AMDGPU::V_FMAC_F32_e32 || Opc == AMDGPU::V_FMAC_F16_e32) {
6680 int Src2Idx = AMDGPU::getNamedOperandIdx(Opcode: Opc, Name: AMDGPU::OpName::src2);
6681 if (!RI.isVGPR(MRI, Reg: MI.getOperand(i: Src2Idx).getReg()))
6682 legalizeOpWithMove(MI, OpIdx: Src2Idx);
6683 }
6684
6685 // VOP2 instructions accept all operand types in src0, so we don't need to
6686 // check its legality. If src1 is already legal, we don't need to do anything.
6687 if (isLegalRegOperand(MRI, OpInfo: InstrDesc.operands()[Src1Idx], MO: Src1))
6688 return;
6689
6690 // Special case: V_READLANE_B32 accepts only immediate or SGPR operands for
6691 // lane select. Fix up using V_READFIRSTLANE, since we assume that the lane
6692 // select is uniform.
6693 if (Opc == AMDGPU::V_READLANE_B32 && Src1.isReg() &&
6694 RI.isVGPR(MRI, Reg: Src1.getReg())) {
6695 Register Reg = MRI.createVirtualRegister(RegClass: &AMDGPU::SReg_32_XM0RegClass);
6696 const DebugLoc &DL = MI.getDebugLoc();
6697 BuildMI(BB&: *MI.getParent(), I&: MI, MIMD: DL, MCID: get(Opcode: AMDGPU::V_READFIRSTLANE_B32), DestReg: Reg)
6698 .add(MO: Src1);
6699 Src1.ChangeToRegister(Reg, isDef: false);
6700 return;
6701 }
6702
6703 // We do not use commuteInstruction here because it is too aggressive and will
6704 // commute if it is possible. We only want to commute here if it improves
6705 // legality. This can be called a fairly large number of times so don't waste
6706 // compile time pointlessly swapping and checking legality again.
6707 if (HasImplicitSGPR || !MI.isCommutable()) {
6708 legalizeOpWithMove(MI, OpIdx: Src1Idx);
6709 return;
6710 }
6711
6712 // If src0 can be used as src1, commuting will make the operands legal.
6713 // Otherwise we have to give up and insert a move.
6714 //
6715 // TODO: Other immediate-like operand kinds could be commuted if there was a
6716 // MachineOperand::ChangeTo* for them.
6717 if ((!Src1.isImm() && !Src1.isReg()) ||
6718 !isLegalRegOperand(MRI, OpInfo: InstrDesc.operands()[Src1Idx], MO: Src0)) {
6719 legalizeOpWithMove(MI, OpIdx: Src1Idx);
6720 return;
6721 }
6722
6723 int CommutedOpc = commuteOpcode(MI);
6724 if (CommutedOpc == -1) {
6725 legalizeOpWithMove(MI, OpIdx: Src1Idx);
6726 return;
6727 }
6728
6729 MI.setDesc(get(Opcode: CommutedOpc));
6730
6731 Register Src0Reg = Src0.getReg();
6732 unsigned Src0SubReg = Src0.getSubReg();
6733 bool Src0Kill = Src0.isKill();
6734
6735 if (Src1.isImm())
6736 Src0.ChangeToImmediate(ImmVal: Src1.getImm());
6737 else if (Src1.isReg()) {
6738 Src0.ChangeToRegister(Reg: Src1.getReg(), isDef: false, isImp: false, isKill: Src1.isKill());
6739 Src0.setSubReg(Src1.getSubReg());
6740 } else
6741 llvm_unreachable("Should only have register or immediate operands");
6742
6743 Src1.ChangeToRegister(Reg: Src0Reg, isDef: false, isImp: false, isKill: Src0Kill);
6744 Src1.setSubReg(Src0SubReg);
6745 fixImplicitOperands(MI);
6746}
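// Illustrative example (hypothetical registers): for
//   %d = V_ADD_U32_e32 %vgpr, %sgpr
// src1 is not a legal VOP2 operand, but src0 would be, so the operands are
// swapped via the commuted opcode instead of inserting a copy:
//   %d = V_ADD_U32_e32 %sgpr, %vgpr
// Only when commuting cannot help do we fall back to legalizeOpWithMove above.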
6747
6748// Legalize VOP3 operands. All operand types are supported for any operand,
6749// but only one literal constant is allowed, and only starting from GFX10.
6750void SIInstrInfo::legalizeOperandsVOP3(MachineRegisterInfo &MRI,
6751 MachineInstr &MI) const {
6752 unsigned Opc = MI.getOpcode();
6753
6754 int VOP3Idx[3] = {
6755 AMDGPU::getNamedOperandIdx(Opcode: Opc, Name: AMDGPU::OpName::src0),
6756 AMDGPU::getNamedOperandIdx(Opcode: Opc, Name: AMDGPU::OpName::src1),
6757 AMDGPU::getNamedOperandIdx(Opcode: Opc, Name: AMDGPU::OpName::src2)
6758 };
6759
6760 if (Opc == AMDGPU::V_PERMLANE16_B32_e64 ||
6761 Opc == AMDGPU::V_PERMLANEX16_B32_e64 ||
6762 Opc == AMDGPU::V_PERMLANE_BCAST_B32_e64 ||
6763 Opc == AMDGPU::V_PERMLANE_UP_B32_e64 ||
6764 Opc == AMDGPU::V_PERMLANE_DOWN_B32_e64 ||
6765 Opc == AMDGPU::V_PERMLANE_XOR_B32_e64 ||
6766 Opc == AMDGPU::V_PERMLANE_IDX_GEN_B32_e64) {
6767 // src1 and src2 must be scalar
6768 MachineOperand &Src1 = MI.getOperand(i: VOP3Idx[1]);
6769 const DebugLoc &DL = MI.getDebugLoc();
6770 if (Src1.isReg() && !RI.isSGPRClass(RC: MRI.getRegClass(Reg: Src1.getReg()))) {
6771 Register Reg = MRI.createVirtualRegister(RegClass: &AMDGPU::SReg_32_XM0RegClass);
6772 BuildMI(BB&: *MI.getParent(), I&: MI, MIMD: DL, MCID: get(Opcode: AMDGPU::V_READFIRSTLANE_B32), DestReg: Reg)
6773 .add(MO: Src1);
6774 Src1.ChangeToRegister(Reg, isDef: false);
6775 }
6776 if (VOP3Idx[2] != -1) {
6777 MachineOperand &Src2 = MI.getOperand(i: VOP3Idx[2]);
6778 if (Src2.isReg() && !RI.isSGPRClass(RC: MRI.getRegClass(Reg: Src2.getReg()))) {
6779 Register Reg = MRI.createVirtualRegister(RegClass: &AMDGPU::SReg_32_XM0RegClass);
6780 BuildMI(BB&: *MI.getParent(), I&: MI, MIMD: DL, MCID: get(Opcode: AMDGPU::V_READFIRSTLANE_B32), DestReg: Reg)
6781 .add(MO: Src2);
6782 Src2.ChangeToRegister(Reg, isDef: false);
6783 }
6784 }
6785 }
6786
6787 // Find the one SGPR operand we are allowed to use.
6788 int ConstantBusLimit = ST.getConstantBusLimit(Opcode: Opc);
6789 int LiteralLimit = ST.hasVOP3Literal() ? 1 : 0;
6790 SmallDenseSet<unsigned> SGPRsUsed;
6791 Register SGPRReg = findUsedSGPR(MI, OpIndices: VOP3Idx);
6792 if (SGPRReg) {
6793 SGPRsUsed.insert(V: SGPRReg);
6794 --ConstantBusLimit;
6795 }
6796
6797 for (int Idx : VOP3Idx) {
6798 if (Idx == -1)
6799 break;
6800 MachineOperand &MO = MI.getOperand(i: Idx);
6801
6802 if (!MO.isReg()) {
6803 if (isInlineConstant(MO, OpInfo: get(Opcode: Opc).operands()[Idx]))
6804 continue;
6805
6806 if (LiteralLimit > 0 && ConstantBusLimit > 0) {
6807 --LiteralLimit;
6808 --ConstantBusLimit;
6809 continue;
6810 }
6811
6812 --LiteralLimit;
6813 --ConstantBusLimit;
6814 legalizeOpWithMove(MI, OpIdx: Idx);
6815 continue;
6816 }
6817
6818 if (!RI.isSGPRClass(RC: RI.getRegClassForReg(MRI, Reg: MO.getReg())))
6819 continue; // VGPRs are legal
6820
6821 // We can use one SGPR in each VOP3 instruction prior to GFX10
6822 // and two starting from GFX10.
6823 if (SGPRsUsed.count(V: MO.getReg()))
6824 continue;
6825 if (ConstantBusLimit > 0) {
6826 SGPRsUsed.insert(V: MO.getReg());
6827 --ConstantBusLimit;
6828 continue;
6829 }
6830
6831 // If we make it this far, then the operand is not legal and we must
6832 // legalize it.
6833 legalizeOpWithMove(MI, OpIdx: Idx);
6834 }
6835
6836 // Special case: V_FMAC_F32 and V_FMAC_F16 have src2 tied to vdst.
6837 if ((Opc == AMDGPU::V_FMAC_F32_e64 || Opc == AMDGPU::V_FMAC_F16_e64) &&
6838 !RI.isVGPR(MRI, Reg: MI.getOperand(i: VOP3Idx[2]).getReg()))
6839 legalizeOpWithMove(MI, OpIdx: VOP3Idx[2]);
6840
6841 // Fix the register class of packed FP32 instructions on gfx12+. See
6842 // SIInstrInfo::isLegalGFX12PlusPackedMathFP32Operand for more information.
6843 if (AMDGPU::isPackedFP32Inst(Opc) && AMDGPU::isGFX12Plus(STI: ST)) {
6844 for (unsigned I = 0; I < 3; ++I) {
6845 if (!isLegalGFX12PlusPackedMathFP32Operand(MRI, MI, /*SrcN=*/I))
6846 legalizeOpWithMove(MI, OpIdx: VOP3Idx[I]);
6847 }
6848 }
6849}
6850
6851Register SIInstrInfo::readlaneVGPRToSGPR(
6852 Register SrcReg, MachineInstr &UseMI, MachineRegisterInfo &MRI,
6853 const TargetRegisterClass *DstRC /*=nullptr*/) const {
6854 const TargetRegisterClass *VRC = MRI.getRegClass(Reg: SrcReg);
6855 const TargetRegisterClass *SRC = RI.getEquivalentSGPRClass(VRC);
6856 if (DstRC)
6857 SRC = RI.getCommonSubClass(A: SRC, B: DstRC);
6858
6859 Register DstReg = MRI.createVirtualRegister(RegClass: SRC);
6860 unsigned SubRegs = RI.getRegSizeInBits(RC: *VRC) / 32;
6861
6862 if (RI.hasAGPRs(RC: VRC)) {
6863 VRC = RI.getEquivalentVGPRClass(SRC: VRC);
6864 Register NewSrcReg = MRI.createVirtualRegister(RegClass: VRC);
6865 BuildMI(BB&: *UseMI.getParent(), I&: UseMI, MIMD: UseMI.getDebugLoc(),
6866 MCID: get(Opcode: TargetOpcode::COPY), DestReg: NewSrcReg)
6867 .addReg(RegNo: SrcReg);
6868 SrcReg = NewSrcReg;
6869 }
6870
6871 if (SubRegs == 1) {
6872 BuildMI(BB&: *UseMI.getParent(), I&: UseMI, MIMD: UseMI.getDebugLoc(),
6873 MCID: get(Opcode: AMDGPU::V_READFIRSTLANE_B32), DestReg: DstReg)
6874 .addReg(RegNo: SrcReg);
6875 return DstReg;
6876 }
6877
6878 SmallVector<Register, 8> SRegs;
6879 for (unsigned i = 0; i < SubRegs; ++i) {
6880 Register SGPR = MRI.createVirtualRegister(RegClass: &AMDGPU::SGPR_32RegClass);
6881 BuildMI(BB&: *UseMI.getParent(), I&: UseMI, MIMD: UseMI.getDebugLoc(),
6882 MCID: get(Opcode: AMDGPU::V_READFIRSTLANE_B32), DestReg: SGPR)
6883 .addReg(RegNo: SrcReg, Flags: {}, SubReg: RI.getSubRegFromChannel(Channel: i));
6884 SRegs.push_back(Elt: SGPR);
6885 }
6886
6887 MachineInstrBuilder MIB =
6888 BuildMI(BB&: *UseMI.getParent(), I&: UseMI, MIMD: UseMI.getDebugLoc(),
6889 MCID: get(Opcode: AMDGPU::REG_SEQUENCE), DestReg: DstReg);
6890 for (unsigned i = 0; i < SubRegs; ++i) {
6891 MIB.addReg(RegNo: SRegs[i]);
6892 MIB.addImm(Val: RI.getSubRegFromChannel(Channel: i));
6893 }
6894 return DstReg;
6895}
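// For a multi-word source this emits one V_READFIRSTLANE_B32 per 32-bit piece
// followed by a REG_SEQUENCE; e.g. (sketch) for a 64-bit VGPR pair:
//   %lo:sgpr_32 = V_READFIRSTLANE_B32 %src.sub0
//   %hi:sgpr_32 = V_READFIRSTLANE_B32 %src.sub1
//   %dst:sreg_64 = REG_SEQUENCE %lo, %subreg.sub0, %hi, %subreg.sub1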
6896
6897void SIInstrInfo::legalizeOperandsSMRD(MachineRegisterInfo &MRI,
6898 MachineInstr &MI) const {
6899
6900 // If the pointer is stored in VGPRs, then we need to move it to
6901 // SGPRs using v_readfirstlane. This is safe because we only select
6902 // loads with uniform pointers to SMRD instructions, so we know the
6903 // pointer value is uniform.
6904 MachineOperand *SBase = getNamedOperand(MI, OperandName: AMDGPU::OpName::sbase);
6905 if (SBase && !RI.isSGPRClass(RC: MRI.getRegClass(Reg: SBase->getReg()))) {
6906 Register SGPR = readlaneVGPRToSGPR(SrcReg: SBase->getReg(), UseMI&: MI, MRI);
6907 SBase->setReg(SGPR);
6908 }
6909 MachineOperand *SOff = getNamedOperand(MI, OperandName: AMDGPU::OpName::soffset);
6910 if (SOff && !RI.isSGPRReg(MRI, Reg: SOff->getReg())) {
6911 Register SGPR = readlaneVGPRToSGPR(SrcReg: SOff->getReg(), UseMI&: MI, MRI);
6912 SOff->setReg(SGPR);
6913 }
6914}
6915
6916bool SIInstrInfo::moveFlatAddrToVGPR(MachineInstr &Inst) const {
6917 unsigned Opc = Inst.getOpcode();
6918 int OldSAddrIdx = AMDGPU::getNamedOperandIdx(Opcode: Opc, Name: AMDGPU::OpName::saddr);
6919 if (OldSAddrIdx < 0)
6920 return false;
6921
6922 assert(isSegmentSpecificFLAT(Inst) || (isFLAT(Inst) && ST.hasFlatGVSMode()));
6923
6924 int NewOpc = AMDGPU::getGlobalVaddrOp(Opcode: Opc);
6925 if (NewOpc < 0)
6926 NewOpc = AMDGPU::getFlatScratchInstSVfromSS(Opcode: Opc);
6927 if (NewOpc < 0)
6928 return false;
6929
6930 MachineRegisterInfo &MRI = Inst.getMF()->getRegInfo();
6931 MachineOperand &SAddr = Inst.getOperand(i: OldSAddrIdx);
6932 if (RI.isSGPRReg(MRI, Reg: SAddr.getReg()))
6933 return false;
6934
6935 int NewVAddrIdx = AMDGPU::getNamedOperandIdx(Opcode: NewOpc, Name: AMDGPU::OpName::vaddr);
6936 if (NewVAddrIdx < 0)
6937 return false;
6938
6939 int OldVAddrIdx = AMDGPU::getNamedOperandIdx(Opcode: Opc, Name: AMDGPU::OpName::vaddr);
6940
6941 // Check vaddr; it must be zero or absent.
6942 MachineInstr *VAddrDef = nullptr;
6943 if (OldVAddrIdx >= 0) {
6944 MachineOperand &VAddr = Inst.getOperand(i: OldVAddrIdx);
6945 VAddrDef = MRI.getUniqueVRegDef(Reg: VAddr.getReg());
6946 if (!VAddrDef || !VAddrDef->isMoveImmediate() ||
6947 !VAddrDef->getOperand(i: 1).isImm() ||
6948 VAddrDef->getOperand(i: 1).getImm() != 0)
6949 return false;
6950 }
6951
6952 const MCInstrDesc &NewDesc = get(Opcode: NewOpc);
6953 Inst.setDesc(NewDesc);
6954
6955 // Callers expect iterator to be valid after this call, so modify the
6956 // instruction in place.
6957 if (OldVAddrIdx == NewVAddrIdx) {
6958 MachineOperand &NewVAddr = Inst.getOperand(i: NewVAddrIdx);
6959 // Clear use list from the old vaddr holding a zero register.
6960 MRI.removeRegOperandFromUseList(MO: &NewVAddr);
6961 MRI.moveOperands(Dst: &NewVAddr, Src: &SAddr, NumOps: 1);
6962 Inst.removeOperand(OpNo: OldSAddrIdx);
6963 // Update the use list with the pointer we have just moved from saddr to the
6964 // vaddr position. Otherwise the new vaddr will be missing from the use list.
6965 MRI.removeRegOperandFromUseList(MO: &NewVAddr);
6966 MRI.addRegOperandToUseList(MO: &NewVAddr);
6967 } else {
6968 assert(OldSAddrIdx == NewVAddrIdx);
6969
6970 if (OldVAddrIdx >= 0) {
6971 int NewVDstIn = AMDGPU::getNamedOperandIdx(Opcode: NewOpc,
6972 Name: AMDGPU::OpName::vdst_in);
6973
6974 // removeOperand doesn't try to fix up tied operand indexes as it goes, so
6975 // it asserts. Untie the operands for now and retie them afterwards.
6976 if (NewVDstIn != -1) {
6977 int OldVDstIn = AMDGPU::getNamedOperandIdx(Opcode: Opc, Name: AMDGPU::OpName::vdst_in);
6978 Inst.untieRegOperand(OpIdx: OldVDstIn);
6979 }
6980
6981 Inst.removeOperand(OpNo: OldVAddrIdx);
6982
6983 if (NewVDstIn != -1) {
6984 int NewVDst = AMDGPU::getNamedOperandIdx(Opcode: NewOpc, Name: AMDGPU::OpName::vdst);
6985 Inst.tieOperands(DefIdx: NewVDst, UseIdx: NewVDstIn);
6986 }
6987 }
6988 }
6989
6990 if (VAddrDef && MRI.use_nodbg_empty(RegNo: VAddrDef->getOperand(i: 0).getReg()))
6991 VAddrDef->eraseFromParent();
6992
6993 return true;
6994}
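// Sketch of the rewrite performed above (hypothetical operands): a saddr-form
// instruction whose "scalar" address actually lives in a VGPR, with a known-zero
// vaddr, such as
//   GLOBAL_LOAD_DWORD_SADDR %vdst, %zero_voffset, %vgpr_ptr, 0, 0
// becomes the plain vaddr form
//   GLOBAL_LOAD_DWORD %vdst, %vgpr_ptr, 0, 0
// and the now-unused zero materialization is erased if it has no other users.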
6995
6996// FIXME: Remove this when SelectionDAG is obsoleted.
6997void SIInstrInfo::legalizeOperandsFLAT(MachineRegisterInfo &MRI,
6998 MachineInstr &MI) const {
6999 if (!isSegmentSpecificFLAT(MI) && !ST.hasFlatGVSMode())
7000 return;
7001
7002 // Fix up SGPR operands that live in VGPRs. We only select these when the DAG
7003 // divergence analysis thinks they are uniform, so a readfirstlane should be valid.
7004 MachineOperand *SAddr = getNamedOperand(MI, OperandName: AMDGPU::OpName::saddr);
7005 if (!SAddr || RI.isSGPRClass(RC: MRI.getRegClass(Reg: SAddr->getReg())))
7006 return;
7007
7008 if (moveFlatAddrToVGPR(Inst&: MI))
7009 return;
7010
7011 const TargetRegisterClass *DeclaredRC =
7012 getRegClass(MCID: MI.getDesc(), OpNum: SAddr->getOperandNo());
7013
7014 Register ToSGPR = readlaneVGPRToSGPR(SrcReg: SAddr->getReg(), UseMI&: MI, MRI, DstRC: DeclaredRC);
7015 SAddr->setReg(ToSGPR);
7016}
7017
7018void SIInstrInfo::legalizeGenericOperand(MachineBasicBlock &InsertMBB,
7019 MachineBasicBlock::iterator I,
7020 const TargetRegisterClass *DstRC,
7021 MachineOperand &Op,
7022 MachineRegisterInfo &MRI,
7023 const DebugLoc &DL) const {
7024 Register OpReg = Op.getReg();
7025 unsigned OpSubReg = Op.getSubReg();
7026
7027 const TargetRegisterClass *OpRC = RI.getSubClassWithSubReg(
7028 RI.getRegClassForReg(MRI, Reg: OpReg), OpSubReg);
7029
7030 // Check if operand is already the correct register class.
7031 if (DstRC == OpRC)
7032 return;
7033
7034 Register DstReg = MRI.createVirtualRegister(RegClass: DstRC);
7035 auto Copy =
7036 BuildMI(BB&: InsertMBB, I, MIMD: DL, MCID: get(Opcode: AMDGPU::COPY), DestReg: DstReg).addReg(RegNo: OpReg);
7037 Op.setReg(DstReg);
7038
7039 MachineInstr *Def = MRI.getVRegDef(Reg: OpReg);
7040 if (!Def)
7041 return;
7042
7043 // Try to eliminate the copy if it is copying an immediate value.
7044 if (Def->isMoveImmediate() && DstRC != &AMDGPU::VReg_1RegClass)
7045 foldImmediate(UseMI&: *Copy, DefMI&: *Def, Reg: OpReg, MRI: &MRI);
7046
7047 bool ImpDef = Def->isImplicitDef();
7048 while (!ImpDef && Def && Def->isCopy()) {
7049 if (Def->getOperand(i: 1).getReg().isPhysical())
7050 break;
7051 Def = MRI.getUniqueVRegDef(Reg: Def->getOperand(i: 1).getReg());
7052 ImpDef = Def && Def->isImplicitDef();
7053 }
7054 if (!RI.isSGPRClass(RC: DstRC) && !Copy->readsRegister(Reg: AMDGPU::EXEC, TRI: &RI) &&
7055 !ImpDef)
7056 Copy.addReg(RegNo: AMDGPU::EXEC, Flags: RegState::Implicit);
7057}
7058
7059// Emit the actual waterfall loop, executing the wrapped instruction for each
7060// unique value of \p ScalarOps across all lanes. In the best case we execute
7061// one iteration, and in the worst case once per lane (up to the wave size).
7062static void
7063emitLoadScalarOpsFromVGPRLoop(const SIInstrInfo &TII,
7064 MachineRegisterInfo &MRI,
7065 MachineBasicBlock &LoopBB,
7066 MachineBasicBlock &BodyBB,
7067 const DebugLoc &DL,
7068 ArrayRef<MachineOperand *> ScalarOps) {
7069 MachineFunction &MF = *LoopBB.getParent();
7070 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
7071 const SIRegisterInfo *TRI = ST.getRegisterInfo();
7072 const AMDGPU::LaneMaskConstants &LMC = AMDGPU::LaneMaskConstants::get(ST);
7073 const auto *BoolXExecRC = TRI->getWaveMaskRegClass();
7074
7075 MachineBasicBlock::iterator I = LoopBB.begin();
7076 Register CondReg;
7077
7078 for (MachineOperand *ScalarOp : ScalarOps) {
7079 unsigned RegSize = TRI->getRegSizeInBits(Reg: ScalarOp->getReg(), MRI);
7080 unsigned NumSubRegs = RegSize / 32;
7081 Register VScalarOp = ScalarOp->getReg();
7082
7083 if (NumSubRegs == 1) {
7084 Register CurReg = MRI.createVirtualRegister(RegClass: &AMDGPU::SReg_32_XM0RegClass);
7085
7086 BuildMI(BB&: LoopBB, I, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::V_READFIRSTLANE_B32), DestReg: CurReg)
7087 .addReg(RegNo: VScalarOp);
7088
7089 Register NewCondReg = MRI.createVirtualRegister(RegClass: BoolXExecRC);
7090
7091 BuildMI(BB&: LoopBB, I, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::V_CMP_EQ_U32_e64), DestReg: NewCondReg)
7092 .addReg(RegNo: CurReg)
7093 .addReg(RegNo: VScalarOp);
7094
7095 // Combine the comparison results with AND.
7096 if (!CondReg) // First.
7097 CondReg = NewCondReg;
7098 else { // If not the first, we create an AND.
7099 Register AndReg = MRI.createVirtualRegister(RegClass: BoolXExecRC);
7100 BuildMI(BB&: LoopBB, I, MIMD: DL, MCID: TII.get(Opcode: LMC.AndOpc), DestReg: AndReg)
7101 .addReg(RegNo: CondReg)
7102 .addReg(RegNo: NewCondReg);
7103 CondReg = AndReg;
7104 }
7105
7106 // Update ScalarOp operand to use the SGPR ScalarOp.
7107 ScalarOp->setReg(CurReg);
7108 ScalarOp->setIsKill();
7109 } else {
7110 SmallVector<Register, 8> ReadlanePieces;
7111 RegState VScalarOpUndef = getUndefRegState(B: ScalarOp->isUndef());
7112 assert(NumSubRegs % 2 == 0 && NumSubRegs <= 32 &&
7113 "Unhandled register size");
7114
7115 for (unsigned Idx = 0; Idx < NumSubRegs; Idx += 2) {
7116 Register CurRegLo =
7117 MRI.createVirtualRegister(RegClass: &AMDGPU::SReg_32_XM0RegClass);
7118 Register CurRegHi =
7119 MRI.createVirtualRegister(RegClass: &AMDGPU::SReg_32_XM0RegClass);
7120
7121 // Read the next variant <- also loop target.
7122 BuildMI(BB&: LoopBB, I, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::V_READFIRSTLANE_B32), DestReg: CurRegLo)
7123 .addReg(RegNo: VScalarOp, Flags: VScalarOpUndef, SubReg: TRI->getSubRegFromChannel(Channel: Idx));
7124
7125 // Read the high half of the same variant.
7126 BuildMI(BB&: LoopBB, I, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::V_READFIRSTLANE_B32), DestReg: CurRegHi)
7127 .addReg(RegNo: VScalarOp, Flags: VScalarOpUndef,
7128 SubReg: TRI->getSubRegFromChannel(Channel: Idx + 1));
7129
7130 ReadlanePieces.push_back(Elt: CurRegLo);
7131 ReadlanePieces.push_back(Elt: CurRegHi);
7132
7133 // Comparison is to be done as 64-bit.
7134 Register CurReg = MRI.createVirtualRegister(RegClass: &AMDGPU::SGPR_64RegClass);
7135 BuildMI(BB&: LoopBB, I, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::REG_SEQUENCE), DestReg: CurReg)
7136 .addReg(RegNo: CurRegLo)
7137 .addImm(Val: AMDGPU::sub0)
7138 .addReg(RegNo: CurRegHi)
7139 .addImm(Val: AMDGPU::sub1);
7140
7141 Register NewCondReg = MRI.createVirtualRegister(RegClass: BoolXExecRC);
7142 auto Cmp = BuildMI(BB&: LoopBB, I, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::V_CMP_EQ_U64_e64),
7143 DestReg: NewCondReg)
7144 .addReg(RegNo: CurReg);
7145 if (NumSubRegs <= 2)
7146 Cmp.addReg(RegNo: VScalarOp);
7147 else
7148 Cmp.addReg(RegNo: VScalarOp, Flags: VScalarOpUndef,
7149 SubReg: TRI->getSubRegFromChannel(Channel: Idx, NumRegs: 2));
7150
7151 // Combine the comparison results with AND.
7152 if (!CondReg) // First.
7153 CondReg = NewCondReg;
7154 else { // If not the first, we create an AND.
7155 Register AndReg = MRI.createVirtualRegister(RegClass: BoolXExecRC);
7156 BuildMI(BB&: LoopBB, I, MIMD: DL, MCID: TII.get(Opcode: LMC.AndOpc), DestReg: AndReg)
7157 .addReg(RegNo: CondReg)
7158 .addReg(RegNo: NewCondReg);
7159 CondReg = AndReg;
7160 }
7161 } // End for loop.
7162
7163 const auto *SScalarOpRC =
7164 TRI->getEquivalentSGPRClass(VRC: MRI.getRegClass(Reg: VScalarOp));
7165 Register SScalarOp = MRI.createVirtualRegister(RegClass: SScalarOpRC);
7166
7167 // Build scalar ScalarOp.
7168 auto Merge =
7169 BuildMI(BB&: LoopBB, I, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::REG_SEQUENCE), DestReg: SScalarOp);
7170 unsigned Channel = 0;
7171 for (Register Piece : ReadlanePieces) {
7172 Merge.addReg(RegNo: Piece).addImm(Val: TRI->getSubRegFromChannel(Channel: Channel++));
7173 }
7174
7175 // Update ScalarOp operand to use the SGPR ScalarOp.
7176 ScalarOp->setReg(SScalarOp);
7177 ScalarOp->setIsKill();
7178 }
7179 }
7180
7181 Register SaveExec = MRI.createVirtualRegister(RegClass: BoolXExecRC);
7182 MRI.setSimpleHint(VReg: SaveExec, PrefReg: CondReg);
7183
7184 // Update EXEC to matching lanes, saving original to SaveExec.
7185 BuildMI(BB&: LoopBB, I, MIMD: DL, MCID: TII.get(Opcode: LMC.AndSaveExecOpc), DestReg: SaveExec)
7186 .addReg(RegNo: CondReg, Flags: RegState::Kill);
7187
7188 // The original instruction is here; we insert the terminators after it.
7189 I = BodyBB.end();
7190
7191 // Update EXEC, switch all done bits to 0 and all todo bits to 1.
7192 BuildMI(BB&: BodyBB, I, MIMD: DL, MCID: TII.get(Opcode: LMC.XorTermOpc), DestReg: LMC.ExecReg)
7193 .addReg(RegNo: LMC.ExecReg)
7194 .addReg(RegNo: SaveExec);
7195
7196 BuildMI(BB&: BodyBB, I, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::SI_WATERFALL_LOOP)).addMBB(MBB: &LoopBB);
7197}
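// Roughly, for a single 32-bit scalar operand on wave64, the code built above
// looks like this (illustrative; LoopBB holds the readfirstlane/compare, BodyBB
// holds the original instruction and the terminators):
//   loop:
//     s_val = v_readfirstlane_b32 v_op
//     cond  = v_cmp_eq_u32 s_val, v_op
//     saved = s_and_saveexec_b64 cond        ; saved = exec, exec &= cond
//     ... original instruction, now using s_val ...
//     exec  = s_xor_b64 exec, saved          ; remove the handled lanes, restore the rest
//     SI_WATERFALL_LOOP loop                 ; later lowered to the back branch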
7198
7199// Build a waterfall loop around \p MI, replacing the VGPR \p ScalarOp register
7200// with SGPRs by iterating over all unique values across all lanes.
7201// Returns the loop basic block that now contains \p MI.
7202static MachineBasicBlock *
7203loadScalarOperandsFromVGPR(const SIInstrInfo &TII, MachineInstr &MI,
7204 ArrayRef<MachineOperand *> ScalarOps,
7205 MachineDominatorTree *MDT,
7206 MachineBasicBlock::iterator Begin = nullptr,
7207 MachineBasicBlock::iterator End = nullptr) {
7208 MachineBasicBlock &MBB = *MI.getParent();
7209 MachineFunction &MF = *MBB.getParent();
7210 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
7211 const SIRegisterInfo *TRI = ST.getRegisterInfo();
7212 MachineRegisterInfo &MRI = MF.getRegInfo();
7213 if (!Begin.isValid())
7214 Begin = &MI;
7215 if (!End.isValid()) {
7216 End = &MI;
7217 ++End;
7218 }
7219 const DebugLoc &DL = MI.getDebugLoc();
7220 const AMDGPU::LaneMaskConstants &LMC = AMDGPU::LaneMaskConstants::get(ST);
7221 const auto *BoolXExecRC = TRI->getWaveMaskRegClass();
7222
7223 // Save SCC. Waterfall Loop may overwrite SCC.
7224 Register SaveSCCReg;
7225
7226 // FIXME: We should maintain SCC liveness while doing the FixSGPRCopies walk
7227 // rather than scanning an unbounded number of instructions here.
7228 bool SCCNotDead =
7229 MBB.computeRegisterLiveness(TRI, Reg: AMDGPU::SCC, Before: MI,
7230 Neighborhood: std::numeric_limits<unsigned>::max()) !=
7231 MachineBasicBlock::LQR_Dead;
7232 if (SCCNotDead) {
7233 SaveSCCReg = MRI.createVirtualRegister(RegClass: &AMDGPU::SReg_32RegClass);
7234 BuildMI(BB&: MBB, I: Begin, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::S_CSELECT_B32), DestReg: SaveSCCReg)
7235 .addImm(Val: 1)
7236 .addImm(Val: 0);
7237 }
7238
7239 Register SaveExec = MRI.createVirtualRegister(RegClass: BoolXExecRC);
7240
7241 // Save the EXEC mask
7242 BuildMI(BB&: MBB, I: Begin, MIMD: DL, MCID: TII.get(Opcode: LMC.MovOpc), DestReg: SaveExec).addReg(RegNo: LMC.ExecReg);
7243
7244 // Killed uses in the instruction we are waterfalling around will be
7245 // incorrect due to the added control-flow.
7246 MachineBasicBlock::iterator AfterMI = MI;
7247 ++AfterMI;
7248 for (auto I = Begin; I != AfterMI; I++) {
7249 for (auto &MO : I->all_uses())
7250 MRI.clearKillFlags(Reg: MO.getReg());
7251 }
7252
7253 // To insert the loop we need to split the block. Move everything after this
7254 // point to a new block, and insert a new empty block between the two.
7255 MachineBasicBlock *LoopBB = MF.CreateMachineBasicBlock();
7256 MachineBasicBlock *BodyBB = MF.CreateMachineBasicBlock();
7257 MachineBasicBlock *RemainderBB = MF.CreateMachineBasicBlock();
7258 MachineFunction::iterator MBBI(MBB);
7259 ++MBBI;
7260
7261 MF.insert(MBBI, MBB: LoopBB);
7262 MF.insert(MBBI, MBB: BodyBB);
7263 MF.insert(MBBI, MBB: RemainderBB);
7264
7265 LoopBB->addSuccessor(Succ: BodyBB);
7266 BodyBB->addSuccessor(Succ: LoopBB);
7267 BodyBB->addSuccessor(Succ: RemainderBB);
7268
7269 // Move Begin to MI to the BodyBB, and the remainder of the block to
7270 // RemainderBB.
7271 RemainderBB->transferSuccessorsAndUpdatePHIs(FromMBB: &MBB);
7272 RemainderBB->splice(Where: RemainderBB->begin(), Other: &MBB, From: End, To: MBB.end());
7273 BodyBB->splice(Where: BodyBB->begin(), Other: &MBB, From: Begin, To: MBB.end());
7274
7275 MBB.addSuccessor(Succ: LoopBB);
7276
7277 // Update dominators. We know that MBB immediately dominates LoopBB, that
7278 // LoopBB immediately dominates BodyBB, and BodyBB immediately dominates
7279 // RemainderBB. RemainderBB immediately dominates all of the successors
7280 // transferred to it from MBB that MBB used to properly dominate.
7281 if (MDT) {
7282 MDT->addNewBlock(BB: LoopBB, DomBB: &MBB);
7283 MDT->addNewBlock(BB: BodyBB, DomBB: LoopBB);
7284 MDT->addNewBlock(BB: RemainderBB, DomBB: BodyBB);
7285 for (auto &Succ : RemainderBB->successors()) {
7286 if (MDT->properlyDominates(A: &MBB, B: Succ)) {
7287 MDT->changeImmediateDominator(BB: Succ, NewBB: RemainderBB);
7288 }
7289 }
7290 }
7291
7292 emitLoadScalarOpsFromVGPRLoop(TII, MRI, LoopBB&: *LoopBB, BodyBB&: *BodyBB, DL, ScalarOps);
7293
7294 MachineBasicBlock::iterator First = RemainderBB->begin();
7295 // Restore SCC
7296 if (SCCNotDead) {
7297 BuildMI(BB&: *RemainderBB, I: First, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::S_CMP_LG_U32))
7298 .addReg(RegNo: SaveSCCReg, Flags: RegState::Kill)
7299 .addImm(Val: 0);
7300 }
7301
7302 // Restore the EXEC mask
7303 BuildMI(BB&: *RemainderBB, I: First, MIMD: DL, MCID: TII.get(Opcode: LMC.MovOpc), DestReg: LMC.ExecReg)
7304 .addReg(RegNo: SaveExec);
7305 return BodyBB;
7306}
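// The resulting control flow is (sketch):
//   MBB -> LoopBB -> BodyBB -> RemainderBB
//             ^----------'
// LoopBB re-reads the scalar operands for the next unique value, BodyBB contains
// the original instruction(s) plus the terminators that shrink EXEC, and
// RemainderBB restores SCC (if it was live) and the saved EXEC mask.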
7307
7308// Extract pointer from Rsrc and return a zero-value Rsrc replacement.
7309static std::tuple<unsigned, unsigned>
7310extractRsrcPtr(const SIInstrInfo &TII, MachineInstr &MI, MachineOperand &Rsrc) {
7311 MachineBasicBlock &MBB = *MI.getParent();
7312 MachineFunction &MF = *MBB.getParent();
7313 MachineRegisterInfo &MRI = MF.getRegInfo();
7314
7315 // Extract the ptr from the resource descriptor.
7316 unsigned RsrcPtr =
7317 TII.buildExtractSubReg(MI, MRI, SuperReg: Rsrc, SuperRC: &AMDGPU::VReg_128RegClass,
7318 SubIdx: AMDGPU::sub0_sub1, SubRC: &AMDGPU::VReg_64RegClass);
7319
7320 // Create an empty resource descriptor
7321 Register Zero64 = MRI.createVirtualRegister(RegClass: &AMDGPU::SReg_64RegClass);
7322 Register SRsrcFormatLo = MRI.createVirtualRegister(RegClass: &AMDGPU::SGPR_32RegClass);
7323 Register SRsrcFormatHi = MRI.createVirtualRegister(RegClass: &AMDGPU::SGPR_32RegClass);
7324 Register NewSRsrc = MRI.createVirtualRegister(RegClass: &AMDGPU::SGPR_128RegClass);
7325 uint64_t RsrcDataFormat = TII.getDefaultRsrcDataFormat();
7326
7327 // Zero64 = 0
7328 BuildMI(BB&: MBB, I&: MI, MIMD: MI.getDebugLoc(), MCID: TII.get(Opcode: AMDGPU::S_MOV_B64), DestReg: Zero64)
7329 .addImm(Val: 0);
7330
7331 // SRsrcFormatLo = RSRC_DATA_FORMAT{31-0}
7332 BuildMI(BB&: MBB, I&: MI, MIMD: MI.getDebugLoc(), MCID: TII.get(Opcode: AMDGPU::S_MOV_B32), DestReg: SRsrcFormatLo)
7333 .addImm(Val: Lo_32(Value: RsrcDataFormat));
7334
7335 // SRsrcFormatHi = RSRC_DATA_FORMAT{63-32}
7336 BuildMI(BB&: MBB, I&: MI, MIMD: MI.getDebugLoc(), MCID: TII.get(Opcode: AMDGPU::S_MOV_B32), DestReg: SRsrcFormatHi)
7337 .addImm(Val: Hi_32(Value: RsrcDataFormat));
7338
7339 // NewSRsrc = {Zero64, SRsrcFormat}
7340 BuildMI(BB&: MBB, I&: MI, MIMD: MI.getDebugLoc(), MCID: TII.get(Opcode: AMDGPU::REG_SEQUENCE), DestReg: NewSRsrc)
7341 .addReg(RegNo: Zero64)
7342 .addImm(Val: AMDGPU::sub0_sub1)
7343 .addReg(RegNo: SRsrcFormatLo)
7344 .addImm(Val: AMDGPU::sub2)
7345 .addReg(RegNo: SRsrcFormatHi)
7346 .addImm(Val: AMDGPU::sub3);
7347
7348 return std::tuple(RsrcPtr, NewSRsrc);
7349}
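// The replacement descriptor built above therefore has the shape
//   NewSRsrc = { 0 (base pointer, sub0_sub1),
//                RSRC_DATA_FORMAT[31:0]  (sub2),
//                RSRC_DATA_FORMAT[63:32] (sub3) }
// while the original 64-bit pointer is handed back in RsrcPtr so the caller can
// fold it into the VGPR address computation instead.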
7350
7351MachineBasicBlock *
7352SIInstrInfo::legalizeOperands(MachineInstr &MI,
7353 MachineDominatorTree *MDT) const {
7354 MachineFunction &MF = *MI.getMF();
7355 MachineRegisterInfo &MRI = MF.getRegInfo();
7356 MachineBasicBlock *CreatedBB = nullptr;
7357
7358 // Legalize VOP2
7359 if (isVOP2(MI) || isVOPC(MI)) {
7360 legalizeOperandsVOP2(MRI, MI);
7361 return CreatedBB;
7362 }
7363
7364 // Legalize VOP3
7365 if (isVOP3(MI)) {
7366 legalizeOperandsVOP3(MRI, MI);
7367 return CreatedBB;
7368 }
7369
7370 // Legalize SMRD
7371 if (isSMRD(MI)) {
7372 legalizeOperandsSMRD(MRI, MI);
7373 return CreatedBB;
7374 }
7375
7376 // Legalize FLAT
7377 if (isFLAT(MI)) {
7378 legalizeOperandsFLAT(MRI, MI);
7379 return CreatedBB;
7380 }
7381
7382 // Legalize PHI
7383 // The register class of the operands must be the same type as the register
7384 // class of the output.
7385 if (MI.getOpcode() == AMDGPU::PHI) {
7386 const TargetRegisterClass *VRC = getOpRegClass(MI, OpNo: 0);
7387 assert(!RI.isSGPRClass(VRC));
7388
7389 // Update all the operands so they have the same type.
7390 for (unsigned I = 1, E = MI.getNumOperands(); I != E; I += 2) {
7391 MachineOperand &Op = MI.getOperand(i: I);
7392 if (!Op.isReg() || !Op.getReg().isVirtual())
7393 continue;
7394
7395 // MI is a PHI instruction.
7396 MachineBasicBlock *InsertBB = MI.getOperand(i: I + 1).getMBB();
7397 MachineBasicBlock::iterator Insert = InsertBB->getFirstTerminator();
7398
7399 // Avoid creating no-op copies with the same src and dst reg class. These
7400 // confuse some of the machine passes.
7401 legalizeGenericOperand(InsertMBB&: *InsertBB, I: Insert, DstRC: VRC, Op, MRI, DL: MI.getDebugLoc());
7402 }
7403 }
7404
7405 // REG_SEQUENCE doesn't really require operand legalization, but if one has a
7406 // VGPR dest type and SGPR sources, insert copies so all operands are
7407 // VGPRs. This seems to help operand folding / the register coalescer.
7408 if (MI.getOpcode() == AMDGPU::REG_SEQUENCE) {
7409 MachineBasicBlock *MBB = MI.getParent();
7410 const TargetRegisterClass *DstRC = getOpRegClass(MI, OpNo: 0);
7411 if (RI.hasVGPRs(RC: DstRC)) {
7412 // Update all the operands so they are VGPR register classes. These may
7413 // not be the same register class because REG_SEQUENCE supports mixing
7414 // subregister index types e.g. sub0_sub1 + sub2 + sub3
7415 for (unsigned I = 1, E = MI.getNumOperands(); I != E; I += 2) {
7416 MachineOperand &Op = MI.getOperand(i: I);
7417 if (!Op.isReg() || !Op.getReg().isVirtual())
7418 continue;
7419
7420 const TargetRegisterClass *OpRC = MRI.getRegClass(Reg: Op.getReg());
7421 const TargetRegisterClass *VRC = RI.getEquivalentVGPRClass(SRC: OpRC);
7422 if (VRC == OpRC)
7423 continue;
7424
7425 legalizeGenericOperand(InsertMBB&: *MBB, I: MI, DstRC: VRC, Op, MRI, DL: MI.getDebugLoc());
7426 Op.setIsKill();
7427 }
7428 }
7429
7430 return CreatedBB;
7431 }
7432
7433 // Legalize INSERT_SUBREG
7434 // src0 must have the same register class as dst
7435 if (MI.getOpcode() == AMDGPU::INSERT_SUBREG) {
7436 Register Dst = MI.getOperand(i: 0).getReg();
7437 Register Src0 = MI.getOperand(i: 1).getReg();
7438 const TargetRegisterClass *DstRC = MRI.getRegClass(Reg: Dst);
7439 const TargetRegisterClass *Src0RC = MRI.getRegClass(Reg: Src0);
7440 if (DstRC != Src0RC) {
7441 MachineBasicBlock *MBB = MI.getParent();
7442 MachineOperand &Op = MI.getOperand(i: 1);
7443 legalizeGenericOperand(InsertMBB&: *MBB, I: MI, DstRC, Op, MRI, DL: MI.getDebugLoc());
7444 }
7445 return CreatedBB;
7446 }
7447
7448 // Legalize SI_INIT_M0
7449 if (MI.getOpcode() == AMDGPU::SI_INIT_M0) {
7450 MachineOperand &Src = MI.getOperand(i: 0);
7451 if (Src.isReg() && RI.hasVectorRegisters(RC: MRI.getRegClass(Reg: Src.getReg())))
7452 Src.setReg(readlaneVGPRToSGPR(SrcReg: Src.getReg(), UseMI&: MI, MRI));
7453 return CreatedBB;
7454 }
7455
7456 // Legalize S_BITREPLICATE, S_QUADMASK and S_WQM
7457 if (MI.getOpcode() == AMDGPU::S_BITREPLICATE_B64_B32 ||
7458 MI.getOpcode() == AMDGPU::S_QUADMASK_B32 ||
7459 MI.getOpcode() == AMDGPU::S_QUADMASK_B64 ||
7460 MI.getOpcode() == AMDGPU::S_WQM_B32 ||
7461 MI.getOpcode() == AMDGPU::S_WQM_B64 ||
7462 MI.getOpcode() == AMDGPU::S_INVERSE_BALLOT_U32 ||
7463 MI.getOpcode() == AMDGPU::S_INVERSE_BALLOT_U64) {
7464 MachineOperand &Src = MI.getOperand(i: 1);
7465 if (Src.isReg() && RI.hasVectorRegisters(RC: MRI.getRegClass(Reg: Src.getReg())))
7466 Src.setReg(readlaneVGPRToSGPR(SrcReg: Src.getReg(), UseMI&: MI, MRI));
7467 return CreatedBB;
7468 }
7469
7470 // Legalize MIMG/VIMAGE/VSAMPLE and MUBUF/MTBUF for shaders.
7471 //
7472 // Shaders only generate MUBUF/MTBUF instructions via intrinsics or via
7473 // scratch memory access. In both cases, the legalization never involves
7474 // conversion to the addr64 form.
7475 if (isImage(MI) || (AMDGPU::isGraphics(CC: MF.getFunction().getCallingConv()) &&
7476 (isMUBUF(MI) || isMTBUF(MI)))) {
7477 AMDGPU::OpName RSrcOpName = (isVIMAGE(MI) || isVSAMPLE(MI))
7478 ? AMDGPU::OpName::rsrc
7479 : AMDGPU::OpName::srsrc;
7480 MachineOperand *SRsrc = getNamedOperand(MI, OperandName: RSrcOpName);
7481 if (SRsrc && !RI.isSGPRClass(RC: MRI.getRegClass(Reg: SRsrc->getReg())))
7482 CreatedBB = loadScalarOperandsFromVGPR(TII: *this, MI, ScalarOps: {SRsrc}, MDT);
7483
7484 AMDGPU::OpName SampOpName =
7485 isMIMG(MI) ? AMDGPU::OpName::ssamp : AMDGPU::OpName::samp;
7486 MachineOperand *SSamp = getNamedOperand(MI, OperandName: SampOpName);
7487 if (SSamp && !RI.isSGPRClass(RC: MRI.getRegClass(Reg: SSamp->getReg())))
7488 CreatedBB = loadScalarOperandsFromVGPR(TII: *this, MI, ScalarOps: {SSamp}, MDT);
7489
7490 return CreatedBB;
7491 }
7492
7493 // Legalize SI_CALL
7494 if (MI.getOpcode() == AMDGPU::SI_CALL_ISEL) {
7495 MachineOperand *Dest = &MI.getOperand(i: 0);
7496 if (!RI.isSGPRClass(RC: MRI.getRegClass(Reg: Dest->getReg()))) {
7497 // Move everything between ADJCALLSTACKUP and ADJCALLSTACKDOWN and the
7498 // following copies; we also need to move copies from and to physical
7499 // registers into the loop block.
7500 unsigned FrameSetupOpcode = getCallFrameSetupOpcode();
7501 unsigned FrameDestroyOpcode = getCallFrameDestroyOpcode();
7502
7503 // Also move the copies to physical registers into the loop block
7504 MachineBasicBlock &MBB = *MI.getParent();
7505 MachineBasicBlock::iterator Start(&MI);
7506 while (Start->getOpcode() != FrameSetupOpcode)
7507 --Start;
7508 MachineBasicBlock::iterator End(&MI);
7509 while (End->getOpcode() != FrameDestroyOpcode)
7510 ++End;
7511 // Also include following copies of the return value
7512 ++End;
7513 while (End != MBB.end() && End->isCopy() && End->getOperand(i: 1).isReg() &&
7514 MI.definesRegister(Reg: End->getOperand(i: 1).getReg(), /*TRI=*/nullptr))
7515 ++End;
7516 CreatedBB =
7517 loadScalarOperandsFromVGPR(TII: *this, MI, ScalarOps: {Dest}, MDT, Begin: Start, End);
7518 }
7519 }
7520
7521 // Legalize s_sleep_var.
7522 if (MI.getOpcode() == AMDGPU::S_SLEEP_VAR) {
7523 const DebugLoc &DL = MI.getDebugLoc();
7524 Register Reg = MRI.createVirtualRegister(RegClass: &AMDGPU::SReg_32_XM0RegClass);
7525 int Src0Idx =
7526 AMDGPU::getNamedOperandIdx(Opcode: MI.getOpcode(), Name: AMDGPU::OpName::src0);
7527 MachineOperand &Src0 = MI.getOperand(i: Src0Idx);
7528 BuildMI(BB&: *MI.getParent(), I&: MI, MIMD: DL, MCID: get(Opcode: AMDGPU::V_READFIRSTLANE_B32), DestReg: Reg)
7529 .add(MO: Src0);
7530 Src0.ChangeToRegister(Reg, isDef: false);
7531 return nullptr;
7532 }
7533
7534 // Legalize TENSOR_LOAD_TO_LDS_d2/_d4, TENSOR_STORE_FROM_LDS_d2/_d4. All their
7535 // operands are scalar.
7536 if (MI.getOpcode() == AMDGPU::TENSOR_LOAD_TO_LDS_d2 ||
7537 MI.getOpcode() == AMDGPU::TENSOR_LOAD_TO_LDS_d4 ||
7538 MI.getOpcode() == AMDGPU::TENSOR_STORE_FROM_LDS_d2 ||
7539 MI.getOpcode() == AMDGPU::TENSOR_STORE_FROM_LDS_d4) {
7540 for (MachineOperand &Src : MI.explicit_operands()) {
7541 if (Src.isReg() && RI.hasVectorRegisters(RC: MRI.getRegClass(Reg: Src.getReg())))
7542 Src.setReg(readlaneVGPRToSGPR(SrcReg: Src.getReg(), UseMI&: MI, MRI));
7543 }
7544 return CreatedBB;
7545 }
7546
7547 // Legalize MUBUF instructions.
7548 bool isSoffsetLegal = true;
7549 int SoffsetIdx =
7550 AMDGPU::getNamedOperandIdx(Opcode: MI.getOpcode(), Name: AMDGPU::OpName::soffset);
7551 if (SoffsetIdx != -1) {
7552 MachineOperand *Soffset = &MI.getOperand(i: SoffsetIdx);
7553 if (Soffset->isReg() && Soffset->getReg().isVirtual() &&
7554 !RI.isSGPRClass(RC: MRI.getRegClass(Reg: Soffset->getReg()))) {
7555 isSoffsetLegal = false;
7556 }
7557 }
7558
7559 bool isRsrcLegal = true;
7560 int RsrcIdx =
7561 AMDGPU::getNamedOperandIdx(Opcode: MI.getOpcode(), Name: AMDGPU::OpName::srsrc);
7562 if (RsrcIdx != -1) {
7563 MachineOperand *Rsrc = &MI.getOperand(i: RsrcIdx);
7564 if (Rsrc->isReg() && !RI.isSGPRReg(MRI, Reg: Rsrc->getReg()))
7565 isRsrcLegal = false;
7566 }
7567
7568 // The operands are legal.
7569 if (isRsrcLegal && isSoffsetLegal)
7570 return CreatedBB;
7571
7572 if (!isRsrcLegal) {
7573 // Legalize a VGPR Rsrc
7574 //
7575 // If the instruction is _ADDR64, we can avoid a waterfall by extracting
7576 // the base pointer from the VGPR Rsrc, adding it to the VAddr, then using
7577 // a zero-value SRsrc.
7578 //
7579 // If the instruction is _OFFSET (both idxen and offen disabled), and we
7580 // support ADDR64 instructions, we can convert to ADDR64 and do the same as
7581 // above.
7582 //
7583 // Otherwise we are on non-ADDR64 hardware and/or the instruction uses
7584 // idxen/offen/bothen, so we fall back to a waterfall loop.
7585
7586 MachineOperand *Rsrc = &MI.getOperand(i: RsrcIdx);
7587 MachineBasicBlock &MBB = *MI.getParent();
7588
7589 MachineOperand *VAddr = getNamedOperand(MI, OperandName: AMDGPU::OpName::vaddr);
7590 if (VAddr && AMDGPU::getIfAddr64Inst(Opcode: MI.getOpcode()) != -1) {
7591 // This is already an ADDR64 instruction so we need to add the pointer
7592 // extracted from the resource descriptor to the current value of VAddr.
7593 Register NewVAddrLo = MRI.createVirtualRegister(RegClass: &AMDGPU::VGPR_32RegClass);
7594 Register NewVAddrHi = MRI.createVirtualRegister(RegClass: &AMDGPU::VGPR_32RegClass);
7595 Register NewVAddr = MRI.createVirtualRegister(RegClass: &AMDGPU::VReg_64RegClass);
7596
7597 const auto *BoolXExecRC = RI.getWaveMaskRegClass();
7598 Register CondReg0 = MRI.createVirtualRegister(RegClass: BoolXExecRC);
7599 Register CondReg1 = MRI.createVirtualRegister(RegClass: BoolXExecRC);
7600
7601 unsigned RsrcPtr, NewSRsrc;
7602 std::tie(args&: RsrcPtr, args&: NewSRsrc) = extractRsrcPtr(TII: *this, MI, Rsrc&: *Rsrc);
7603
7604 // NewVaddrLo = RsrcPtr:sub0 + VAddr:sub0
7605 const DebugLoc &DL = MI.getDebugLoc();
7606 BuildMI(BB&: MBB, I&: MI, MIMD: DL, MCID: get(Opcode: AMDGPU::V_ADD_CO_U32_e64), DestReg: NewVAddrLo)
7607 .addDef(RegNo: CondReg0)
7608 .addReg(RegNo: RsrcPtr, Flags: {}, SubReg: AMDGPU::sub0)
7609 .addReg(RegNo: VAddr->getReg(), Flags: {}, SubReg: AMDGPU::sub0)
7610 .addImm(Val: 0);
7611
7612 // NewVaddrHi = RsrcPtr:sub1 + VAddr:sub1
7613 BuildMI(BB&: MBB, I&: MI, MIMD: DL, MCID: get(Opcode: AMDGPU::V_ADDC_U32_e64), DestReg: NewVAddrHi)
7614 .addDef(RegNo: CondReg1, Flags: RegState::Dead)
7615 .addReg(RegNo: RsrcPtr, Flags: {}, SubReg: AMDGPU::sub1)
7616 .addReg(RegNo: VAddr->getReg(), Flags: {}, SubReg: AMDGPU::sub1)
7617 .addReg(RegNo: CondReg0, Flags: RegState::Kill)
7618 .addImm(Val: 0);
7619
7620 // NewVaddr = {NewVaddrHi, NewVaddrLo}
7621 BuildMI(BB&: MBB, I&: MI, MIMD: MI.getDebugLoc(), MCID: get(Opcode: AMDGPU::REG_SEQUENCE), DestReg: NewVAddr)
7622 .addReg(RegNo: NewVAddrLo)
7623 .addImm(Val: AMDGPU::sub0)
7624 .addReg(RegNo: NewVAddrHi)
7625 .addImm(Val: AMDGPU::sub1);
7626
7627 VAddr->setReg(NewVAddr);
7628 Rsrc->setReg(NewSRsrc);
7629 } else if (!VAddr && ST.hasAddr64()) {
7630 // This instruction is the _OFFSET variant, so we need to convert it to
7631 // ADDR64.
7632 assert(ST.getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS &&
7633 "FIXME: Need to emit flat atomics here");
7634
7635 unsigned RsrcPtr, NewSRsrc;
7636 std::tie(args&: RsrcPtr, args&: NewSRsrc) = extractRsrcPtr(TII: *this, MI, Rsrc&: *Rsrc);
7637
7638 Register NewVAddr = MRI.createVirtualRegister(RegClass: &AMDGPU::VReg_64RegClass);
7639 MachineOperand *VData = getNamedOperand(MI, OperandName: AMDGPU::OpName::vdata);
7640 MachineOperand *Offset = getNamedOperand(MI, OperandName: AMDGPU::OpName::offset);
7641 MachineOperand *SOffset = getNamedOperand(MI, OperandName: AMDGPU::OpName::soffset);
7642 unsigned Addr64Opcode = AMDGPU::getAddr64Inst(Opcode: MI.getOpcode());
7643
7644 // Atomics with return have an additional tied operand and are
7645 // missing some of the special bits.
7646 MachineOperand *VDataIn = getNamedOperand(MI, OperandName: AMDGPU::OpName::vdata_in);
7647 MachineInstr *Addr64;
7648
7649 if (!VDataIn) {
7650 // Regular buffer load / store.
7651 MachineInstrBuilder MIB =
7652 BuildMI(BB&: MBB, I&: MI, MIMD: MI.getDebugLoc(), MCID: get(Opcode: Addr64Opcode))
7653 .add(MO: *VData)
7654 .addReg(RegNo: NewVAddr)
7655 .addReg(RegNo: NewSRsrc)
7656 .add(MO: *SOffset)
7657 .add(MO: *Offset);
7658
7659 if (const MachineOperand *CPol =
7660 getNamedOperand(MI, OperandName: AMDGPU::OpName::cpol)) {
7661 MIB.addImm(Val: CPol->getImm());
7662 }
7663
7664 if (const MachineOperand *TFE =
7665 getNamedOperand(MI, OperandName: AMDGPU::OpName::tfe)) {
7666 MIB.addImm(Val: TFE->getImm());
7667 }
7668
7669 MIB.addImm(Val: getNamedImmOperand(MI, OperandName: AMDGPU::OpName::swz));
7670
7671 MIB.cloneMemRefs(OtherMI: MI);
7672 Addr64 = MIB;
7673 } else {
7674 // Atomics with return.
7675 Addr64 = BuildMI(BB&: MBB, I&: MI, MIMD: MI.getDebugLoc(), MCID: get(Opcode: Addr64Opcode))
7676 .add(MO: *VData)
7677 .add(MO: *VDataIn)
7678 .addReg(RegNo: NewVAddr)
7679 .addReg(RegNo: NewSRsrc)
7680 .add(MO: *SOffset)
7681 .add(MO: *Offset)
7682 .addImm(Val: getNamedImmOperand(MI, OperandName: AMDGPU::OpName::cpol))
7683 .cloneMemRefs(OtherMI: MI);
7684 }
7685
7686 MI.removeFromParent();
7687
7688 // NewVAddr = {RsrcPtr:sub0, RsrcPtr:sub1}
7689 BuildMI(BB&: MBB, I: Addr64, MIMD: Addr64->getDebugLoc(), MCID: get(Opcode: AMDGPU::REG_SEQUENCE),
7690 DestReg: NewVAddr)
7691 .addReg(RegNo: RsrcPtr, Flags: {}, SubReg: AMDGPU::sub0)
7692 .addImm(Val: AMDGPU::sub0)
7693 .addReg(RegNo: RsrcPtr, Flags: {}, SubReg: AMDGPU::sub1)
7694 .addImm(Val: AMDGPU::sub1);
7695 } else {
7696 // Legalize a VGPR Rsrc and soffset together.
7697 if (!isSoffsetLegal) {
7698 MachineOperand *Soffset = getNamedOperand(MI, OperandName: AMDGPU::OpName::soffset);
7699 CreatedBB = loadScalarOperandsFromVGPR(TII: *this, MI, ScalarOps: {Rsrc, Soffset}, MDT);
7700 return CreatedBB;
7701 }
7702 CreatedBB = loadScalarOperandsFromVGPR(TII: *this, MI, ScalarOps: {Rsrc}, MDT);
7703 return CreatedBB;
7704 }
7705 }
7706
7707 // Legalize a VGPR soffset.
7708 if (!isSoffsetLegal) {
7709 MachineOperand *Soffset = getNamedOperand(MI, OperandName: AMDGPU::OpName::soffset);
7710 CreatedBB = loadScalarOperandsFromVGPR(TII: *this, MI, ScalarOps: {Soffset}, MDT);
7711 return CreatedBB;
7712 }
7713 return CreatedBB;
7714}
7715
7716void SIInstrWorklist::insert(MachineInstr *MI) {
7717 InstrList.insert(X: MI);
7718 // Add MUBUF instructions to the deferred list.
7719 int RsrcIdx =
7720 AMDGPU::getNamedOperandIdx(Opcode: MI->getOpcode(), Name: AMDGPU::OpName::srsrc);
7721 if (RsrcIdx != -1) {
7722 DeferredList.insert(X: MI);
7723 }
7724}
7725
7726bool SIInstrWorklist::isDeferred(MachineInstr *MI) {
7727 return DeferredList.contains(key: MI);
7728}
7729
7730 // Legalize size mismatches between 16-bit and 32-bit registers in v2s copy
7731 // lowering (changing sgpr to vgpr).
7732 // This is mainly caused by 16-bit SALU and 16-bit VALU instructions using
7733 // registers of different sizes; the operand sizes must be legalized during
7734 // the vgpr lowering chain. This can be removed once sgpr16 is in place.
7735void SIInstrInfo::legalizeOperandsVALUt16(MachineInstr &MI, unsigned OpIdx,
7736 MachineRegisterInfo &MRI) const {
7737 if (!ST.useRealTrue16Insts())
7738 return;
7739
7740 unsigned Opcode = MI.getOpcode();
7741 MachineBasicBlock *MBB = MI.getParent();
7742 // Legalize operands and check for size mismatch
7743 if (!OpIdx || OpIdx >= MI.getNumExplicitOperands() ||
7744 OpIdx >= get(Opcode).getNumOperands() ||
7745 get(Opcode).operands()[OpIdx].RegClass == -1)
7746 return;
7747
7748 MachineOperand &Op = MI.getOperand(i: OpIdx);
7749 if (!Op.isReg() || !Op.getReg().isVirtual())
7750 return;
7751
7752 const TargetRegisterClass *CurrRC = MRI.getRegClass(Reg: Op.getReg());
7753 if (!RI.isVGPRClass(RC: CurrRC))
7754 return;
7755
7756 int16_t RCID = getOpRegClassID(OpInfo: get(Opcode).operands()[OpIdx]);
7757 const TargetRegisterClass *ExpectedRC = RI.getRegClass(i: RCID);
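  // If the operand is a 32-bit VGPR but a 16-bit class is expected, refer to
  // its lo16 half; if it is a 16-bit VGPR but a 32-bit class is expected,
  // widen it with an undefined hi16 half via a REG_SEQUENCE.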
7758 if (RI.getMatchingSuperRegClass(A: CurrRC, B: ExpectedRC, Idx: AMDGPU::lo16)) {
7759 Op.setSubReg(AMDGPU::lo16);
7760 } else if (RI.getMatchingSuperRegClass(A: ExpectedRC, B: CurrRC, Idx: AMDGPU::lo16)) {
7761 const DebugLoc &DL = MI.getDebugLoc();
7762 Register NewDstReg = MRI.createVirtualRegister(RegClass: &AMDGPU::VGPR_32RegClass);
7763 Register Undef = MRI.createVirtualRegister(RegClass: &AMDGPU::VGPR_16RegClass);
7764 BuildMI(BB&: *MBB, I&: MI, MIMD: DL, MCID: get(Opcode: AMDGPU::IMPLICIT_DEF), DestReg: Undef);
7765 BuildMI(BB&: *MBB, I&: MI, MIMD: DL, MCID: get(Opcode: AMDGPU::REG_SEQUENCE), DestReg: NewDstReg)
7766 .addReg(RegNo: Op.getReg())
7767 .addImm(Val: AMDGPU::lo16)
7768 .addReg(RegNo: Undef)
7769 .addImm(Val: AMDGPU::hi16);
7770 Op.setReg(NewDstReg);
7771 }
7772}
7773void SIInstrInfo::legalizeOperandsVALUt16(MachineInstr &MI,
7774 MachineRegisterInfo &MRI) const {
7775 for (unsigned OpIdx = 1; OpIdx < MI.getNumExplicitOperands(); OpIdx++)
7776 legalizeOperandsVALUt16(MI, OpIdx, MRI);
7777}
7778
7779void SIInstrInfo::moveToVALU(SIInstrWorklist &Worklist,
7780 MachineDominatorTree *MDT) const {
7781
7782 while (!Worklist.empty()) {
7783 MachineInstr &Inst = *Worklist.top();
7784 Worklist.erase_top();
7785 // Skip MachineInstrs in the deferred list.
7786 if (Worklist.isDeferred(MI: &Inst))
7787 continue;
7788 moveToVALUImpl(Worklist, MDT, Inst);
7789 }
7790
7791 // The deferred list of instructions is processed once all the
7792 // MachineInstrs in the worklist are done.
7793 for (MachineInstr *Inst : Worklist.getDeferredList()) {
7794 moveToVALUImpl(Worklist, MDT, Inst&: *Inst);
7795 assert(Worklist.empty() &&
7796 "Deferred MachineInstr are not supposed to re-populate worklist");
7797 }
7798}
7799
7800void SIInstrInfo::moveToVALUImpl(SIInstrWorklist &Worklist,
7801 MachineDominatorTree *MDT,
7802 MachineInstr &Inst) const {
7803
7804 MachineBasicBlock *MBB = Inst.getParent();
7805 if (!MBB)
7806 return;
7807 MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
7808 unsigned Opcode = Inst.getOpcode();
7809 unsigned NewOpcode = getVALUOp(MI: Inst);
7810 const DebugLoc &DL = Inst.getDebugLoc();
7811
7812 // Handle some special cases
7813 switch (Opcode) {
7814 default:
7815 break;
7816 case AMDGPU::S_ADD_I32:
7817 case AMDGPU::S_SUB_I32: {
7818 // FIXME: The u32 versions currently selected use the carry.
7819 bool Changed;
7820 MachineBasicBlock *CreatedBBTmp = nullptr;
7821 std::tie(args&: Changed, args&: CreatedBBTmp) = moveScalarAddSub(Worklist, Inst, MDT);
7822 if (Changed)
7823 return;
7824
7825 // Default handling
7826 break;
7827 }
7828
7829 case AMDGPU::S_MUL_U64:
7830 if (ST.hasVectorMulU64()) {
7831 NewOpcode = AMDGPU::V_MUL_U64_e64;
7832 break;
7833 }
7834 // Split s_mul_u64 into 32-bit vector multiplications.
7835 splitScalarSMulU64(Worklist, Inst, MDT);
7836 Inst.eraseFromParent();
7837 return;
7838
7839 case AMDGPU::S_MUL_U64_U32_PSEUDO:
7840 case AMDGPU::S_MUL_I64_I32_PSEUDO:
7841 // This is a special case of s_mul_u64 where all the operands are either
7842 // zero extended or sign extended.
7843 splitScalarSMulPseudo(Worklist, Inst, MDT);
7844 Inst.eraseFromParent();
7845 return;
7846
7847 case AMDGPU::S_AND_B64:
7848 splitScalar64BitBinaryOp(Worklist, Inst, Opcode: AMDGPU::S_AND_B32, MDT);
7849 Inst.eraseFromParent();
7850 return;
7851
7852 case AMDGPU::S_OR_B64:
7853 splitScalar64BitBinaryOp(Worklist, Inst, Opcode: AMDGPU::S_OR_B32, MDT);
7854 Inst.eraseFromParent();
7855 return;
7856
7857 case AMDGPU::S_XOR_B64:
7858 splitScalar64BitBinaryOp(Worklist, Inst, Opcode: AMDGPU::S_XOR_B32, MDT);
7859 Inst.eraseFromParent();
7860 return;
7861
7862 case AMDGPU::S_NAND_B64:
7863 splitScalar64BitBinaryOp(Worklist, Inst, Opcode: AMDGPU::S_NAND_B32, MDT);
7864 Inst.eraseFromParent();
7865 return;
7866
7867 case AMDGPU::S_NOR_B64:
7868 splitScalar64BitBinaryOp(Worklist, Inst, Opcode: AMDGPU::S_NOR_B32, MDT);
7869 Inst.eraseFromParent();
7870 return;
7871
7872 case AMDGPU::S_XNOR_B64:
7873 if (ST.hasDLInsts())
7874 splitScalar64BitBinaryOp(Worklist, Inst, Opcode: AMDGPU::S_XNOR_B32, MDT);
7875 else
7876 splitScalar64BitXnor(Worklist, Inst, MDT);
7877 Inst.eraseFromParent();
7878 return;
7879
7880 case AMDGPU::S_ANDN2_B64:
7881 splitScalar64BitBinaryOp(Worklist, Inst, Opcode: AMDGPU::S_ANDN2_B32, MDT);
7882 Inst.eraseFromParent();
7883 return;
7884
7885 case AMDGPU::S_ORN2_B64:
7886 splitScalar64BitBinaryOp(Worklist, Inst, Opcode: AMDGPU::S_ORN2_B32, MDT);
7887 Inst.eraseFromParent();
7888 return;
7889
7890 case AMDGPU::S_BREV_B64:
7891 splitScalar64BitUnaryOp(Worklist, Inst, Opcode: AMDGPU::S_BREV_B32, Swap: true);
7892 Inst.eraseFromParent();
7893 return;
7894
7895 case AMDGPU::S_NOT_B64:
7896 splitScalar64BitUnaryOp(Worklist, Inst, Opcode: AMDGPU::S_NOT_B32);
7897 Inst.eraseFromParent();
7898 return;
7899
7900 case AMDGPU::S_BCNT1_I32_B64:
7901 splitScalar64BitBCNT(Worklist, Inst);
7902 Inst.eraseFromParent();
7903 return;
7904
7905 case AMDGPU::S_BFE_I64:
7906 splitScalar64BitBFE(Worklist, Inst);
7907 Inst.eraseFromParent();
7908 return;
7909
7910 case AMDGPU::S_FLBIT_I32_B64:
7911 splitScalar64BitCountOp(Worklist, Inst, Opcode: AMDGPU::V_FFBH_U32_e32);
7912 Inst.eraseFromParent();
7913 return;
7914 case AMDGPU::S_FF1_I32_B64:
7915 splitScalar64BitCountOp(Worklist, Inst, Opcode: AMDGPU::V_FFBL_B32_e32);
7916 Inst.eraseFromParent();
7917 return;
7918
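  // On targets with only the *REV VALU shifts, the shift amount is src0, so
  // the operands must be swapped when converting from the SALU form.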
7919 case AMDGPU::S_LSHL_B32:
7920 if (ST.hasOnlyRevVALUShifts()) {
7921 NewOpcode = AMDGPU::V_LSHLREV_B32_e64;
7922 swapOperands(Inst);
7923 }
7924 break;
7925 case AMDGPU::S_ASHR_I32:
7926 if (ST.hasOnlyRevVALUShifts()) {
7927 NewOpcode = AMDGPU::V_ASHRREV_I32_e64;
7928 swapOperands(Inst);
7929 }
7930 break;
7931 case AMDGPU::S_LSHR_B32:
7932 if (ST.hasOnlyRevVALUShifts()) {
7933 NewOpcode = AMDGPU::V_LSHRREV_B32_e64;
7934 swapOperands(Inst);
7935 }
7936 break;
7937 case AMDGPU::S_LSHL_B64:
7938 if (ST.hasOnlyRevVALUShifts()) {
7939 NewOpcode = ST.getGeneration() >= AMDGPUSubtarget::GFX12
7940 ? AMDGPU::V_LSHLREV_B64_pseudo_e64
7941 : AMDGPU::V_LSHLREV_B64_e64;
7942 swapOperands(Inst);
7943 }
7944 break;
7945 case AMDGPU::S_ASHR_I64:
7946 if (ST.hasOnlyRevVALUShifts()) {
7947 NewOpcode = AMDGPU::V_ASHRREV_I64_e64;
7948 swapOperands(Inst);
7949 }
7950 break;
7951 case AMDGPU::S_LSHR_B64:
7952 if (ST.hasOnlyRevVALUShifts()) {
7953 NewOpcode = AMDGPU::V_LSHRREV_B64_e64;
7954 swapOperands(Inst);
7955 }
7956 break;
7957
7958 case AMDGPU::S_ABS_I32:
7959 lowerScalarAbs(Worklist, Inst);
7960 Inst.eraseFromParent();
7961 return;
7962
7963 case AMDGPU::S_ABSDIFF_I32:
7964 lowerScalarAbsDiff(Worklist, Inst);
7965 Inst.eraseFromParent();
7966 return;
7967
7968 case AMDGPU::S_CBRANCH_SCC0:
7969 case AMDGPU::S_CBRANCH_SCC1: {
7970 // Clear unused bits of vcc
7971 Register CondReg = Inst.getOperand(i: 1).getReg();
7972 bool IsSCC = CondReg == AMDGPU::SCC;
7973 const AMDGPU::LaneMaskConstants &LMC = AMDGPU::LaneMaskConstants::get(ST);
7974 BuildMI(BB&: *MBB, I&: Inst, MIMD: Inst.getDebugLoc(), MCID: get(Opcode: LMC.AndOpc), DestReg: LMC.VccReg)
7975 .addReg(RegNo: LMC.ExecReg)
7976 .addReg(RegNo: IsSCC ? LMC.VccReg : CondReg);
7977 Inst.removeOperand(OpNo: 1);
7978 } break;
7979
7980 case AMDGPU::S_BFE_U64:
7981 case AMDGPU::S_BFM_B64:
7982 llvm_unreachable("Moving this op to VALU not implemented");
7983
7984 case AMDGPU::S_PACK_LL_B32_B16:
7985 case AMDGPU::S_PACK_LH_B32_B16:
7986 case AMDGPU::S_PACK_HL_B32_B16:
7987 case AMDGPU::S_PACK_HH_B32_B16:
7988 movePackToVALU(Worklist, MRI, Inst);
7989 Inst.eraseFromParent();
7990 return;
7991
7992 case AMDGPU::S_XNOR_B32:
7993 lowerScalarXnor(Worklist, Inst);
7994 Inst.eraseFromParent();
7995 return;
7996
7997 case AMDGPU::S_NAND_B32:
7998 splitScalarNotBinop(Worklist, Inst, Opcode: AMDGPU::S_AND_B32);
7999 Inst.eraseFromParent();
8000 return;
8001
8002 case AMDGPU::S_NOR_B32:
8003 splitScalarNotBinop(Worklist, Inst, Opcode: AMDGPU::S_OR_B32);
8004 Inst.eraseFromParent();
8005 return;
8006
8007 case AMDGPU::S_ANDN2_B32:
8008 splitScalarBinOpN2(Worklist, Inst, Opcode: AMDGPU::S_AND_B32);
8009 Inst.eraseFromParent();
8010 return;
8011
8012 case AMDGPU::S_ORN2_B32:
8013 splitScalarBinOpN2(Worklist, Inst, Opcode: AMDGPU::S_OR_B32);
8014 Inst.eraseFromParent();
8015 return;
8016
8017 // TODO: Remove this as soon as everything is ready to replace VGPR-to-SGPR
8018 // copies with V_READFIRSTLANEs.
8019 // S_ADD/SUB_CO_PSEUDO as well as S_UADDO/USUBO_PSEUDO
8020 // can only be selected from the uniform SDNode.
8021 case AMDGPU::S_ADD_CO_PSEUDO:
8022 case AMDGPU::S_SUB_CO_PSEUDO: {
8023 unsigned Opc = (Inst.getOpcode() == AMDGPU::S_ADD_CO_PSEUDO)
8024 ? AMDGPU::V_ADDC_U32_e64
8025 : AMDGPU::V_SUBB_U32_e64;
8026 const auto *CarryRC = RI.getWaveMaskRegClass();
8027
8028 Register CarryInReg = Inst.getOperand(i: 4).getReg();
8029 if (!MRI.constrainRegClass(Reg: CarryInReg, RC: CarryRC)) {
8030 Register NewCarryReg = MRI.createVirtualRegister(RegClass: CarryRC);
8031 BuildMI(BB&: *MBB, I&: Inst, MIMD: Inst.getDebugLoc(), MCID: get(Opcode: AMDGPU::COPY), DestReg: NewCarryReg)
8032 .addReg(RegNo: CarryInReg);
8033 }
8034
8035 Register CarryOutReg = Inst.getOperand(i: 1).getReg();
8036
8037 Register DestReg = MRI.createVirtualRegister(RegClass: RI.getEquivalentVGPRClass(
8038 SRC: MRI.getRegClass(Reg: Inst.getOperand(i: 0).getReg())));
8039 MachineInstr *CarryOp =
8040 BuildMI(BB&: *MBB, I: &Inst, MIMD: Inst.getDebugLoc(), MCID: get(Opcode: Opc), DestReg)
8041 .addReg(RegNo: CarryOutReg, Flags: RegState::Define)
8042 .add(MO: Inst.getOperand(i: 2))
8043 .add(MO: Inst.getOperand(i: 3))
8044 .addReg(RegNo: CarryInReg)
8045 .addImm(Val: 0);
8046 legalizeOperands(MI&: *CarryOp);
8047 MRI.replaceRegWith(FromReg: Inst.getOperand(i: 0).getReg(), ToReg: DestReg);
8048 addUsersToMoveToVALUWorklist(Reg: DestReg, MRI, Worklist);
8049 Inst.eraseFromParent();
8050 }
8051 return;
8052 case AMDGPU::S_UADDO_PSEUDO:
8053 case AMDGPU::S_USUBO_PSEUDO: {
8054 MachineOperand &Dest0 = Inst.getOperand(i: 0);
8055 MachineOperand &Dest1 = Inst.getOperand(i: 1);
8056 MachineOperand &Src0 = Inst.getOperand(i: 2);
8057 MachineOperand &Src1 = Inst.getOperand(i: 3);
8058
8059 unsigned Opc = (Inst.getOpcode() == AMDGPU::S_UADDO_PSEUDO)
8060 ? AMDGPU::V_ADD_CO_U32_e64
8061 : AMDGPU::V_SUB_CO_U32_e64;
8062 const TargetRegisterClass *NewRC =
8063 RI.getEquivalentVGPRClass(SRC: MRI.getRegClass(Reg: Dest0.getReg()));
8064 Register DestReg = MRI.createVirtualRegister(RegClass: NewRC);
8065 MachineInstr *NewInstr = BuildMI(BB&: *MBB, I: &Inst, MIMD: DL, MCID: get(Opcode: Opc), DestReg)
8066 .addReg(RegNo: Dest1.getReg(), Flags: RegState::Define)
8067 .add(MO: Src0)
8068 .add(MO: Src1)
8069 .addImm(Val: 0); // clamp bit
8070
8071 legalizeOperands(MI&: *NewInstr, MDT);
8072 MRI.replaceRegWith(FromReg: Dest0.getReg(), ToReg: DestReg);
8073 addUsersToMoveToVALUWorklist(Reg: DestReg, MRI, Worklist);
8074 Inst.eraseFromParent();
8075 }
8076 return;
8077 case AMDGPU::S_LSHL1_ADD_U32:
8078 case AMDGPU::S_LSHL2_ADD_U32:
8079 case AMDGPU::S_LSHL3_ADD_U32:
8080 case AMDGPU::S_LSHL4_ADD_U32: {
8081 MachineOperand &Dest = Inst.getOperand(i: 0);
8082 MachineOperand &Src0 = Inst.getOperand(i: 1);
8083 MachineOperand &Src1 = Inst.getOperand(i: 2);
8084 unsigned ShiftAmt = (Opcode == AMDGPU::S_LSHL1_ADD_U32 ? 1
8085 : Opcode == AMDGPU::S_LSHL2_ADD_U32 ? 2
8086 : Opcode == AMDGPU::S_LSHL3_ADD_U32 ? 3
8087 : 4);
8088
8089 const TargetRegisterClass *NewRC =
8090 RI.getEquivalentVGPRClass(SRC: MRI.getRegClass(Reg: Dest.getReg()));
8091 Register DestReg = MRI.createVirtualRegister(RegClass: NewRC);
8092 MachineInstr *NewInstr =
8093 BuildMI(BB&: *MBB, I: &Inst, MIMD: DL, MCID: get(Opcode: AMDGPU::V_LSHL_ADD_U32_e64), DestReg)
8094 .add(MO: Src0)
8095 .addImm(Val: ShiftAmt)
8096 .add(MO: Src1);
8097
8098 legalizeOperands(MI&: *NewInstr, MDT);
8099 MRI.replaceRegWith(FromReg: Dest.getReg(), ToReg: DestReg);
8100 addUsersToMoveToVALUWorklist(Reg: DestReg, MRI, Worklist);
8101 Inst.eraseFromParent();
8102 }
8103 return;
8104 case AMDGPU::S_CSELECT_B32:
8105 case AMDGPU::S_CSELECT_B64:
8106 lowerSelect(Worklist, Inst, MDT);
8107 Inst.eraseFromParent();
8108 return;
8109 case AMDGPU::S_CMP_EQ_I32:
8110 case AMDGPU::S_CMP_LG_I32:
8111 case AMDGPU::S_CMP_GT_I32:
8112 case AMDGPU::S_CMP_GE_I32:
8113 case AMDGPU::S_CMP_LT_I32:
8114 case AMDGPU::S_CMP_LE_I32:
8115 case AMDGPU::S_CMP_EQ_U32:
8116 case AMDGPU::S_CMP_LG_U32:
8117 case AMDGPU::S_CMP_GT_U32:
8118 case AMDGPU::S_CMP_GE_U32:
8119 case AMDGPU::S_CMP_LT_U32:
8120 case AMDGPU::S_CMP_LE_U32:
8121 case AMDGPU::S_CMP_EQ_U64:
8122 case AMDGPU::S_CMP_LG_U64:
8123 case AMDGPU::S_CMP_LT_F32:
8124 case AMDGPU::S_CMP_EQ_F32:
8125 case AMDGPU::S_CMP_LE_F32:
8126 case AMDGPU::S_CMP_GT_F32:
8127 case AMDGPU::S_CMP_LG_F32:
8128 case AMDGPU::S_CMP_GE_F32:
8129 case AMDGPU::S_CMP_O_F32:
8130 case AMDGPU::S_CMP_U_F32:
8131 case AMDGPU::S_CMP_NGE_F32:
8132 case AMDGPU::S_CMP_NLG_F32:
8133 case AMDGPU::S_CMP_NGT_F32:
8134 case AMDGPU::S_CMP_NLE_F32:
8135 case AMDGPU::S_CMP_NEQ_F32:
8136 case AMDGPU::S_CMP_NLT_F32: {
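  // Replace the SCC-producing scalar compare with the corresponding VALU
  // compare writing a lane-mask register, then queue the SCC users so they
  // are rewritten against the new condition register.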
8137 Register CondReg = MRI.createVirtualRegister(RegClass: RI.getWaveMaskRegClass());
8138 auto NewInstr =
8139 BuildMI(BB&: *MBB, I&: Inst, MIMD: Inst.getDebugLoc(), MCID: get(Opcode: NewOpcode), DestReg: CondReg)
8140 .setMIFlags(Inst.getFlags());
8141 if (AMDGPU::getNamedOperandIdx(Opcode: NewOpcode, Name: AMDGPU::OpName::src0_modifiers) >=
8142 0) {
8143 NewInstr
8144 .addImm(Val: 0) // src0_modifiers
8145 .add(MO: Inst.getOperand(i: 0)) // src0
8146 .addImm(Val: 0) // src1_modifiers
8147 .add(MO: Inst.getOperand(i: 1)) // src1
8148 .addImm(Val: 0); // clamp
8149 } else {
8150 NewInstr.add(MO: Inst.getOperand(i: 0)).add(MO: Inst.getOperand(i: 1));
8151 }
8152 legalizeOperands(MI&: *NewInstr, MDT);
8153 int SCCIdx = Inst.findRegisterDefOperandIdx(Reg: AMDGPU::SCC, /*TRI=*/nullptr);
8154 const MachineOperand &SCCOp = Inst.getOperand(i: SCCIdx);
8155 addSCCDefUsersToVALUWorklist(Op: SCCOp, SCCDefInst&: Inst, Worklist, NewCond: CondReg);
8156 Inst.eraseFromParent();
8157 return;
8158 }
8159 case AMDGPU::S_CMP_LT_F16:
8160 case AMDGPU::S_CMP_EQ_F16:
8161 case AMDGPU::S_CMP_LE_F16:
8162 case AMDGPU::S_CMP_GT_F16:
8163 case AMDGPU::S_CMP_LG_F16:
8164 case AMDGPU::S_CMP_GE_F16:
8165 case AMDGPU::S_CMP_O_F16:
8166 case AMDGPU::S_CMP_U_F16:
8167 case AMDGPU::S_CMP_NGE_F16:
8168 case AMDGPU::S_CMP_NLG_F16:
8169 case AMDGPU::S_CMP_NGT_F16:
8170 case AMDGPU::S_CMP_NLE_F16:
8171 case AMDGPU::S_CMP_NEQ_F16:
8172 case AMDGPU::S_CMP_NLT_F16: {
8173 Register CondReg = MRI.createVirtualRegister(RegClass: RI.getWaveMaskRegClass());
8174 auto NewInstr =
8175 BuildMI(BB&: *MBB, I&: Inst, MIMD: Inst.getDebugLoc(), MCID: get(Opcode: NewOpcode), DestReg: CondReg)
8176 .setMIFlags(Inst.getFlags());
8177 if (AMDGPU::hasNamedOperand(Opcode: NewOpcode, NamedIdx: AMDGPU::OpName::src0_modifiers)) {
8178 NewInstr
8179 .addImm(Val: 0) // src0_modifiers
8180 .add(MO: Inst.getOperand(i: 0)) // src0
8181 .addImm(Val: 0) // src1_modifiers
8182 .add(MO: Inst.getOperand(i: 1)) // src1
8183 .addImm(Val: 0); // clamp
8184 if (AMDGPU::hasNamedOperand(Opcode: NewOpcode, NamedIdx: AMDGPU::OpName::op_sel))
8185 NewInstr.addImm(Val: 0); // op_sel0
8186 } else {
8187 NewInstr
8188 .add(MO: Inst.getOperand(i: 0))
8189 .add(MO: Inst.getOperand(i: 1));
8190 }
8191 legalizeOperandsVALUt16(MI&: *NewInstr, MRI);
8192 legalizeOperands(MI&: *NewInstr, MDT);
8193 int SCCIdx = Inst.findRegisterDefOperandIdx(Reg: AMDGPU::SCC, /*TRI=*/nullptr);
8194 const MachineOperand &SCCOp = Inst.getOperand(i: SCCIdx);
8195 addSCCDefUsersToVALUWorklist(Op: SCCOp, SCCDefInst&: Inst, Worklist, NewCond: CondReg);
8196 Inst.eraseFromParent();
8197 return;
8198 }
8199 case AMDGPU::S_CVT_HI_F32_F16: {
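  // S_CVT_HI_F32_F16 converts the high 16-bit half of the source. With
  // true16 instructions the hi16 subregister can be read directly;
  // otherwise shift the source right by 16 and convert the low half.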
8200 Register TmpReg = MRI.createVirtualRegister(RegClass: &AMDGPU::VGPR_32RegClass);
8201 Register NewDst = MRI.createVirtualRegister(RegClass: &AMDGPU::VGPR_32RegClass);
8202 if (ST.useRealTrue16Insts()) {
8203 BuildMI(BB&: *MBB, I&: Inst, MIMD: DL, MCID: get(Opcode: AMDGPU::COPY), DestReg: TmpReg)
8204 .add(MO: Inst.getOperand(i: 1));
8205 BuildMI(BB&: *MBB, I&: Inst, MIMD: DL, MCID: get(Opcode: NewOpcode), DestReg: NewDst)
8206 .addImm(Val: 0) // src0_modifiers
8207 .addReg(RegNo: TmpReg, Flags: {}, SubReg: AMDGPU::hi16)
8208 .addImm(Val: 0) // clamp
8209 .addImm(Val: 0) // omod
8210 .addImm(Val: 0); // op_sel0
8211 } else {
8212 BuildMI(BB&: *MBB, I&: Inst, MIMD: DL, MCID: get(Opcode: AMDGPU::V_LSHRREV_B32_e64), DestReg: TmpReg)
8213 .addImm(Val: 16)
8214 .add(MO: Inst.getOperand(i: 1));
8215 BuildMI(BB&: *MBB, I&: Inst, MIMD: DL, MCID: get(Opcode: NewOpcode), DestReg: NewDst)
8216 .addImm(Val: 0) // src0_modifiers
8217 .addReg(RegNo: TmpReg)
8218 .addImm(Val: 0) // clamp
8219 .addImm(Val: 0); // omod
8220 }
8221
8222 MRI.replaceRegWith(FromReg: Inst.getOperand(i: 0).getReg(), ToReg: NewDst);
8223 addUsersToMoveToVALUWorklist(Reg: NewDst, MRI, Worklist);
8224 Inst.eraseFromParent();
8225 return;
8226 }
8227 case AMDGPU::S_MINIMUM_F32:
8228 case AMDGPU::S_MAXIMUM_F32: {
8229 Register NewDst = MRI.createVirtualRegister(RegClass: &AMDGPU::VGPR_32RegClass);
8230 MachineInstr *NewInstr = BuildMI(BB&: *MBB, I&: Inst, MIMD: DL, MCID: get(Opcode: NewOpcode), DestReg: NewDst)
8231 .addImm(Val: 0) // src0_modifiers
8232 .add(MO: Inst.getOperand(i: 1))
8233 .addImm(Val: 0) // src1_modifiers
8234 .add(MO: Inst.getOperand(i: 2))
8235 .addImm(Val: 0) // clamp
8236 .addImm(Val: 0); // omod
8237 MRI.replaceRegWith(FromReg: Inst.getOperand(i: 0).getReg(), ToReg: NewDst);
8238
8239 legalizeOperands(MI&: *NewInstr, MDT);
8240 addUsersToMoveToVALUWorklist(Reg: NewDst, MRI, Worklist);
8241 Inst.eraseFromParent();
8242 return;
8243 }
8244 case AMDGPU::S_MINIMUM_F16:
8245 case AMDGPU::S_MAXIMUM_F16: {
8246 Register NewDst = MRI.createVirtualRegister(RegClass: ST.useRealTrue16Insts()
8247 ? &AMDGPU::VGPR_16RegClass
8248 : &AMDGPU::VGPR_32RegClass);
8249 MachineInstr *NewInstr = BuildMI(BB&: *MBB, I&: Inst, MIMD: DL, MCID: get(Opcode: NewOpcode), DestReg: NewDst)
8250 .addImm(Val: 0) // src0_modifiers
8251 .add(MO: Inst.getOperand(i: 1))
8252 .addImm(Val: 0) // src1_modifiers
8253 .add(MO: Inst.getOperand(i: 2))
8254 .addImm(Val: 0) // clamp
8255 .addImm(Val: 0) // omod
8256 .addImm(Val: 0); // opsel0
8257 MRI.replaceRegWith(FromReg: Inst.getOperand(i: 0).getReg(), ToReg: NewDst);
8258 legalizeOperandsVALUt16(MI&: *NewInstr, MRI);
8259 legalizeOperands(MI&: *NewInstr, MDT);
8260 addUsersToMoveToVALUWorklist(Reg: NewDst, MRI, Worklist);
8261 Inst.eraseFromParent();
8262 return;
8263 }
8264 case AMDGPU::V_S_EXP_F16_e64:
8265 case AMDGPU::V_S_LOG_F16_e64:
8266 case AMDGPU::V_S_RCP_F16_e64:
8267 case AMDGPU::V_S_RSQ_F16_e64:
8268 case AMDGPU::V_S_SQRT_F16_e64: {
8269 Register NewDst = MRI.createVirtualRegister(RegClass: ST.useRealTrue16Insts()
8270 ? &AMDGPU::VGPR_16RegClass
8271 : &AMDGPU::VGPR_32RegClass);
8272 auto NewInstr = BuildMI(BB&: *MBB, I&: Inst, MIMD: DL, MCID: get(Opcode: NewOpcode), DestReg: NewDst)
8273 .add(MO: Inst.getOperand(i: 1)) // src0_modifiers
8274 .add(MO: Inst.getOperand(i: 2))
8275 .add(MO: Inst.getOperand(i: 3)) // clamp
8276 .add(MO: Inst.getOperand(i: 4)) // omod
8277 .setMIFlags(Inst.getFlags());
8278 if (AMDGPU::hasNamedOperand(Opcode: NewOpcode, NamedIdx: AMDGPU::OpName::op_sel))
8279 NewInstr.addImm(Val: 0); // opsel0
8280 MRI.replaceRegWith(FromReg: Inst.getOperand(i: 0).getReg(), ToReg: NewDst);
8281 legalizeOperandsVALUt16(MI&: *NewInstr, MRI);
8282 legalizeOperands(MI&: *NewInstr, MDT);
8283 addUsersToMoveToVALUWorklist(Reg: NewDst, MRI, Worklist);
8284 Inst.eraseFromParent();
8285 return;
8286 }
8287 }
8288
8289 if (NewOpcode == AMDGPU::INSTRUCTION_LIST_END) {
8290 // We cannot move this instruction to the VALU, so we should try to
8291 // legalize its operands instead.
8292 legalizeOperands(MI&: Inst, MDT);
8293 return;
8294 }
8295 // Handle converting generic instructions like COPY-to-SGPR into
8296 // COPY-to-VGPR.
8297 if (NewOpcode == Opcode) {
8298 Register DstReg = Inst.getOperand(i: 0).getReg();
8299 const TargetRegisterClass *NewDstRC = getDestEquivalentVGPRClass(Inst);
8300
8301 // If it's a copy of a VGPR to a physical SGPR, insert a V_READFIRSTLANE and
8302 // hope for the best.
8303 if (Inst.isCopy() && DstReg.isPhysical() &&
8304 RI.isVGPR(MRI, Reg: Inst.getOperand(i: 1).getReg())) {
8305 Register NewDst = MRI.createVirtualRegister(RegClass: &AMDGPU::SReg_32_XM0RegClass);
8306 BuildMI(BB&: *Inst.getParent(), I: &Inst, MIMD: Inst.getDebugLoc(),
8307 MCID: get(Opcode: AMDGPU::V_READFIRSTLANE_B32), DestReg: NewDst)
8308 .add(MO: Inst.getOperand(i: 1));
8309 BuildMI(BB&: *Inst.getParent(), I: &Inst, MIMD: Inst.getDebugLoc(), MCID: get(Opcode: AMDGPU::COPY),
8310 DestReg: DstReg)
8311 .addReg(RegNo: NewDst);
8312
8313 Inst.eraseFromParent();
8314 return;
8315 }
8316
8317 if (Inst.isCopy() && Inst.getOperand(i: 1).getReg().isVirtual()) {
8318 Register NewDstReg = Inst.getOperand(i: 1).getReg();
8319 const TargetRegisterClass *SrcRC = RI.getRegClassForReg(MRI, Reg: NewDstReg);
8320 if (const TargetRegisterClass *CommonRC =
8321 RI.getCommonSubClass(A: NewDstRC, B: SrcRC)) {
8322 // Instead of creating a copy where src and dst are the same register
8323 // class, we just replace all uses of dst with src. These kinds of
8324 // copies interfere with the heuristics MachineSink uses to decide
8325 // whether or not to split a critical edge, since the pass assumes
8326 // that copies will end up as machine instructions and not be
8327 // eliminated.
8328 addUsersToMoveToVALUWorklist(Reg: DstReg, MRI, Worklist);
8329 MRI.replaceRegWith(FromReg: DstReg, ToReg: NewDstReg);
8330 MRI.clearKillFlags(Reg: NewDstReg);
8331 Inst.getOperand(i: 0).setReg(DstReg);
8332
8333 if (!MRI.constrainRegClass(Reg: NewDstReg, RC: CommonRC))
8334 llvm_unreachable("failed to constrain register");
8335
8336 Inst.eraseFromParent();
8337
8338 for (MachineOperand &UseMO :
8339 make_early_inc_range(Range: MRI.use_operands(Reg: NewDstReg))) {
8340 MachineInstr &UseMI = *UseMO.getParent();
8341
8342 // Legalize t16 operands since replaceReg is called after
8343 // addUsersToVALU.
8344 legalizeOperandsVALUt16(MI&: UseMI, MRI);
8345
8346 unsigned OpIdx = UseMI.getOperandNo(I: &UseMO);
8347 if (const TargetRegisterClass *OpRC =
8348 getRegClass(MCID: UseMI.getDesc(), OpNum: OpIdx))
8349 MRI.constrainRegClass(Reg: NewDstReg, RC: OpRC);
8350 }
8351
8352 return;
8353 }
8354 }
8355
8356 // If this is a v2s copy between a 16-bit and a 32-bit register,
8357 // replace the vgpr copy with a reg_sequence/extract_subreg.
8358 // This can be removed once sgpr16 is in place.
8359 if (ST.useRealTrue16Insts() && Inst.isCopy() &&
8360 Inst.getOperand(i: 1).getReg().isVirtual() &&
8361 RI.isVGPR(MRI, Reg: Inst.getOperand(i: 1).getReg())) {
8362 const TargetRegisterClass *SrcRegRC = getOpRegClass(MI: Inst, OpNo: 1);
8363 if (RI.getMatchingSuperRegClass(A: NewDstRC, B: SrcRegRC, Idx: AMDGPU::lo16)) {
8364 Register NewDstReg = MRI.createVirtualRegister(RegClass: NewDstRC);
8365 Register Undef = MRI.createVirtualRegister(RegClass: &AMDGPU::VGPR_16RegClass);
8366 BuildMI(BB&: *Inst.getParent(), I: &Inst, MIMD: Inst.getDebugLoc(),
8367 MCID: get(Opcode: AMDGPU::IMPLICIT_DEF), DestReg: Undef);
8368 BuildMI(BB&: *Inst.getParent(), I: &Inst, MIMD: Inst.getDebugLoc(),
8369 MCID: get(Opcode: AMDGPU::REG_SEQUENCE), DestReg: NewDstReg)
8370 .addReg(RegNo: Inst.getOperand(i: 1).getReg())
8371 .addImm(Val: AMDGPU::lo16)
8372 .addReg(RegNo: Undef)
8373 .addImm(Val: AMDGPU::hi16);
8374 Inst.eraseFromParent();
8375 MRI.replaceRegWith(FromReg: DstReg, ToReg: NewDstReg);
8376 addUsersToMoveToVALUWorklist(Reg: NewDstReg, MRI, Worklist);
8377 return;
8378 } else if (RI.getMatchingSuperRegClass(A: SrcRegRC, B: NewDstRC,
8379 Idx: AMDGPU::lo16)) {
8380 Inst.getOperand(i: 1).setSubReg(AMDGPU::lo16);
8381 Register NewDstReg = MRI.createVirtualRegister(RegClass: NewDstRC);
8382 MRI.replaceRegWith(FromReg: DstReg, ToReg: NewDstReg);
8383 addUsersToMoveToVALUWorklist(Reg: NewDstReg, MRI, Worklist);
8384 return;
8385 }
8386 }
8387
8388 Register NewDstReg = MRI.createVirtualRegister(RegClass: NewDstRC);
8389 MRI.replaceRegWith(FromReg: DstReg, ToReg: NewDstReg);
8390 legalizeOperands(MI&: Inst, MDT);
8391 addUsersToMoveToVALUWorklist(Reg: NewDstReg, MRI, Worklist);
8392 return;
8393 }
8394
8395 // Use the new VALU Opcode.
8396 auto NewInstr = BuildMI(BB&: *MBB, I&: Inst, MIMD: Inst.getDebugLoc(), MCID: get(Opcode: NewOpcode))
8397 .setMIFlags(Inst.getFlags());
8398 if (isVOP3(Opcode: NewOpcode) && !isVOP3(Opcode)) {
8399 // Intersperse VOP3 modifiers among the SALU operands.
8400 NewInstr->addOperand(Op: Inst.getOperand(i: 0));
8401 if (AMDGPU::getNamedOperandIdx(Opcode: NewOpcode,
8402 Name: AMDGPU::OpName::src0_modifiers) >= 0)
8403 NewInstr.addImm(Val: 0);
8404 if (AMDGPU::hasNamedOperand(Opcode: NewOpcode, NamedIdx: AMDGPU::OpName::src0)) {
8405 const MachineOperand &Src = Inst.getOperand(i: 1);
8406 NewInstr->addOperand(Op: Src);
8407 }
8408
8409 if (Opcode == AMDGPU::S_SEXT_I32_I8 || Opcode == AMDGPU::S_SEXT_I32_I16) {
8410 // We are converting these to a BFE, so we need to add the missing
8411 // operands for the size and offset.
8412 unsigned Size = (Opcode == AMDGPU::S_SEXT_I32_I8) ? 8 : 16;
8413 NewInstr.addImm(Val: 0);
8414 NewInstr.addImm(Val: Size);
8415 } else if (Opcode == AMDGPU::S_BCNT1_I32_B32) {
8416 // The VALU version adds the second operand to the result, so insert an
8417 // extra 0 operand.
8418 NewInstr.addImm(Val: 0);
8419 } else if (Opcode == AMDGPU::S_BFE_I32 || Opcode == AMDGPU::S_BFE_U32) {
8420 const MachineOperand &OffsetWidthOp = Inst.getOperand(i: 2);
8421 // If we need to move this to VGPRs, we need to unpack the second
8422 // operand back into the 2 separate ones for bit offset and width.
8423 assert(OffsetWidthOp.isImm() &&
8424 "Scalar BFE is only implemented for constant width and offset");
8425 uint32_t Imm = OffsetWidthOp.getImm();
8426
8427 uint32_t Offset = Imm & 0x3f; // Extract bits [5:0].
8428 uint32_t BitWidth = (Imm & 0x7f0000) >> 16; // Extract bits [22:16].
8429 NewInstr.addImm(Val: Offset);
8430 NewInstr.addImm(Val: BitWidth);
8431 } else {
8432 if (AMDGPU::getNamedOperandIdx(Opcode: NewOpcode,
8433 Name: AMDGPU::OpName::src1_modifiers) >= 0)
8434 NewInstr.addImm(Val: 0);
8435 if (AMDGPU::getNamedOperandIdx(Opcode: NewOpcode, Name: AMDGPU::OpName::src1) >= 0)
8436 NewInstr->addOperand(Op: Inst.getOperand(i: 2));
8437 if (AMDGPU::getNamedOperandIdx(Opcode: NewOpcode,
8438 Name: AMDGPU::OpName::src2_modifiers) >= 0)
8439 NewInstr.addImm(Val: 0);
8440 if (AMDGPU::getNamedOperandIdx(Opcode: NewOpcode, Name: AMDGPU::OpName::src2) >= 0)
8441 NewInstr->addOperand(Op: Inst.getOperand(i: 3));
8442 if (AMDGPU::getNamedOperandIdx(Opcode: NewOpcode, Name: AMDGPU::OpName::clamp) >= 0)
8443 NewInstr.addImm(Val: 0);
8444 if (AMDGPU::getNamedOperandIdx(Opcode: NewOpcode, Name: AMDGPU::OpName::omod) >= 0)
8445 NewInstr.addImm(Val: 0);
8446 if (AMDGPU::getNamedOperandIdx(Opcode: NewOpcode, Name: AMDGPU::OpName::op_sel) >= 0)
8447 NewInstr.addImm(Val: 0);
8448 }
8449 } else {
8450 // Just copy the SALU operands.
8451 for (const MachineOperand &Op : Inst.explicit_operands())
8452 NewInstr->addOperand(Op);
8453 }
8454
8455 // Remove any references to SCC. Vector instructions can't read from it, and
8456 // we're just about to add the implicit use / defs of VCC, and we don't want
8457 // both.
8458 for (MachineOperand &Op : Inst.implicit_operands()) {
8459 if (Op.getReg() == AMDGPU::SCC) {
8460 // Only propagate through live-def of SCC.
8461 if (Op.isDef() && !Op.isDead())
8462 addSCCDefUsersToVALUWorklist(Op, SCCDefInst&: Inst, Worklist);
8463 if (Op.isUse())
8464 addSCCDefsToVALUWorklist(SCCUseInst: NewInstr, Worklist);
8465 }
8466 }
8467 Inst.eraseFromParent();
8468 Register NewDstReg;
8469 if (NewInstr->getOperand(i: 0).isReg() && NewInstr->getOperand(i: 0).isDef()) {
8470 Register DstReg = NewInstr->getOperand(i: 0).getReg();
8471 assert(DstReg.isVirtual());
8472 // Update the destination register class.
8473 const TargetRegisterClass *NewDstRC = getDestEquivalentVGPRClass(Inst: *NewInstr);
8474 assert(NewDstRC);
8475 NewDstReg = MRI.createVirtualRegister(RegClass: NewDstRC);
8476 MRI.replaceRegWith(FromReg: DstReg, ToReg: NewDstReg);
8477 }
8478 fixImplicitOperands(MI&: *NewInstr);
8479
8480 legalizeOperandsVALUt16(MI&: *NewInstr, MRI);
8481
8482 // Legalize the operands
8483 legalizeOperands(MI&: *NewInstr, MDT);
8484 if (NewDstReg)
8485 addUsersToMoveToVALUWorklist(Reg: NewDstReg, MRI, Worklist);
8486}
8487
8488// Add/sub require special handling to deal with carry outs.
8489std::pair<bool, MachineBasicBlock *>
8490SIInstrInfo::moveScalarAddSub(SIInstrWorklist &Worklist, MachineInstr &Inst,
8491 MachineDominatorTree *MDT) const {
8492 if (ST.hasAddNoCarryInsts()) {
8493 // Assume there is no user of scc since we don't select this in that case.
8494 // Since scc isn't used, it doesn't really matter if the i32 or u32 variant
8495 // is used.
8496
8497 MachineBasicBlock &MBB = *Inst.getParent();
8498 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
8499
8500 Register OldDstReg = Inst.getOperand(i: 0).getReg();
8501 Register ResultReg = MRI.createVirtualRegister(RegClass: &AMDGPU::VGPR_32RegClass);
8502
8503 unsigned Opc = Inst.getOpcode();
8504 assert(Opc == AMDGPU::S_ADD_I32 || Opc == AMDGPU::S_SUB_I32);
8505
8506 unsigned NewOpc = Opc == AMDGPU::S_ADD_I32 ?
8507 AMDGPU::V_ADD_U32_e64 : AMDGPU::V_SUB_U32_e64;
8508
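    // Drop the SCC def (operand 3), switch to the no-carry VALU opcode, and
    // append the clamp bit operand that the VALU encoding expects.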
8509 assert(Inst.getOperand(3).getReg() == AMDGPU::SCC);
8510 Inst.removeOperand(OpNo: 3);
8511
8512 Inst.setDesc(get(Opcode: NewOpc));
8513 Inst.addOperand(Op: MachineOperand::CreateImm(Val: 0)); // clamp bit
8514 Inst.addImplicitDefUseOperands(MF&: *MBB.getParent());
8515 MRI.replaceRegWith(FromReg: OldDstReg, ToReg: ResultReg);
8516 MachineBasicBlock *NewBB = legalizeOperands(MI&: Inst, MDT);
8517
8518 addUsersToMoveToVALUWorklist(Reg: ResultReg, MRI, Worklist);
8519 return std::pair(true, NewBB);
8520 }
8521
8522 return std::pair(false, nullptr);
8523}
8524
8525void SIInstrInfo::lowerSelect(SIInstrWorklist &Worklist, MachineInstr &Inst,
8526 MachineDominatorTree *MDT) const {
8527
8528 MachineBasicBlock &MBB = *Inst.getParent();
8529 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
8530 MachineBasicBlock::iterator MII = Inst;
8531 const DebugLoc &DL = Inst.getDebugLoc();
8532
8533 MachineOperand &Dest = Inst.getOperand(i: 0);
8534 MachineOperand &Src0 = Inst.getOperand(i: 1);
8535 MachineOperand &Src1 = Inst.getOperand(i: 2);
8536 MachineOperand &Cond = Inst.getOperand(i: 3);
8537
8538 Register CondReg = Cond.getReg();
8539 bool IsSCC = (CondReg == AMDGPU::SCC);
8540
8541 // If this is a trivial select where the condition is effectively not SCC
8542 // (CondReg is a source of a copy to SCC), then the select is semantically
8543 // equivalent to copying CondReg. Hence, there is no need to create a
8544 // V_CNDMASK; we can just use CondReg and bail out.
8545 if (!IsSCC && Src0.isImm() && (Src0.getImm() == -1) && Src1.isImm() &&
8546 (Src1.getImm() == 0)) {
8547 MRI.replaceRegWith(FromReg: Dest.getReg(), ToReg: CondReg);
8548 return;
8549 }
8550
8551 Register NewCondReg = CondReg;
8552 if (IsSCC) {
8553 const TargetRegisterClass *TC = RI.getWaveMaskRegClass();
8554 NewCondReg = MRI.createVirtualRegister(RegClass: TC);
8555
8556 // Now look for the closest SCC def; if it is a copy, replace CondReg
8557 // with the COPY's source register.
8558 bool CopyFound = false;
8559 for (MachineInstr &CandI :
8560 make_range(x: std::next(x: MachineBasicBlock::reverse_iterator(Inst)),
8561 y: Inst.getParent()->rend())) {
8562 if (CandI.findRegisterDefOperandIdx(Reg: AMDGPU::SCC, TRI: &RI, isDead: false, Overlap: false) !=
8563 -1) {
8564 if (CandI.isCopy() && CandI.getOperand(i: 0).getReg() == AMDGPU::SCC) {
8565 BuildMI(BB&: MBB, I: MII, MIMD: DL, MCID: get(Opcode: AMDGPU::COPY), DestReg: NewCondReg)
8566 .addReg(RegNo: CandI.getOperand(i: 1).getReg());
8567 CopyFound = true;
8568 }
8569 break;
8570 }
8571 }
8572 if (!CopyFound) {
8573 // SCC def is not a copy
8574 // Insert a trivial select instead of creating a copy, because a copy from
8575 // SCC would semantically mean just copying a single bit, but we may need
8576 // the result to be a vector condition mask that needs preserving.
8577 unsigned Opcode =
8578 ST.isWave64() ? AMDGPU::S_CSELECT_B64 : AMDGPU::S_CSELECT_B32;
8579 auto NewSelect =
8580 BuildMI(BB&: MBB, I: MII, MIMD: DL, MCID: get(Opcode), DestReg: NewCondReg).addImm(Val: -1).addImm(Val: 0);
8581 NewSelect->getOperand(i: 3).setIsUndef(Cond.isUndef());
8582 }
8583 }
8584
8585 Register NewDestReg = MRI.createVirtualRegister(
8586 RegClass: RI.getEquivalentVGPRClass(SRC: MRI.getRegClass(Reg: Dest.getReg())));
8587 MachineInstr *NewInst;
8588 if (Inst.getOpcode() == AMDGPU::S_CSELECT_B32) {
8589 NewInst = BuildMI(BB&: MBB, I: MII, MIMD: DL, MCID: get(Opcode: AMDGPU::V_CNDMASK_B32_e64), DestReg: NewDestReg)
8590 .addImm(Val: 0)
8591 .add(MO: Src1) // False
8592 .addImm(Val: 0)
8593 .add(MO: Src0) // True
8594 .addReg(RegNo: NewCondReg);
8595 } else {
8596 NewInst =
8597 BuildMI(BB&: MBB, I: MII, MIMD: DL, MCID: get(Opcode: AMDGPU::V_CNDMASK_B64_PSEUDO), DestReg: NewDestReg)
8598 .add(MO: Src1) // False
8599 .add(MO: Src0) // True
8600 .addReg(RegNo: NewCondReg);
8601 }
8602 MRI.replaceRegWith(FromReg: Dest.getReg(), ToReg: NewDestReg);
8603 legalizeOperands(MI&: *NewInst, MDT);
8604 addUsersToMoveToVALUWorklist(Reg: NewDestReg, MRI, Worklist);
8605}
8606
8607void SIInstrInfo::lowerScalarAbs(SIInstrWorklist &Worklist,
8608 MachineInstr &Inst) const {
8609 MachineBasicBlock &MBB = *Inst.getParent();
8610 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
8611 MachineBasicBlock::iterator MII = Inst;
8612 const DebugLoc &DL = Inst.getDebugLoc();
8613
8614 MachineOperand &Dest = Inst.getOperand(i: 0);
8615 MachineOperand &Src = Inst.getOperand(i: 1);
8616 Register TmpReg = MRI.createVirtualRegister(RegClass: &AMDGPU::VGPR_32RegClass);
8617 Register ResultReg = MRI.createVirtualRegister(RegClass: &AMDGPU::VGPR_32RegClass);
8618
8619 unsigned SubOp = ST.hasAddNoCarryInsts() ? AMDGPU::V_SUB_U32_e32
8620 : AMDGPU::V_SUB_CO_U32_e32;
8621
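  // abs(x) is computed as max(x, 0 - x).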
8622 BuildMI(BB&: MBB, I: MII, MIMD: DL, MCID: get(Opcode: SubOp), DestReg: TmpReg)
8623 .addImm(Val: 0)
8624 .addReg(RegNo: Src.getReg());
8625
8626 BuildMI(BB&: MBB, I: MII, MIMD: DL, MCID: get(Opcode: AMDGPU::V_MAX_I32_e64), DestReg: ResultReg)
8627 .addReg(RegNo: Src.getReg())
8628 .addReg(RegNo: TmpReg);
8629
8630 MRI.replaceRegWith(FromReg: Dest.getReg(), ToReg: ResultReg);
8631 addUsersToMoveToVALUWorklist(Reg: ResultReg, MRI, Worklist);
8632}
8633
8634void SIInstrInfo::lowerScalarAbsDiff(SIInstrWorklist &Worklist,
8635 MachineInstr &Inst) const {
8636 MachineBasicBlock &MBB = *Inst.getParent();
8637 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
8638 MachineBasicBlock::iterator MII = Inst;
8639 const DebugLoc &DL = Inst.getDebugLoc();
8640
8641 MachineOperand &Dest = Inst.getOperand(i: 0);
8642 MachineOperand &Src1 = Inst.getOperand(i: 1);
8643 MachineOperand &Src2 = Inst.getOperand(i: 2);
8644 Register SubResultReg = MRI.createVirtualRegister(RegClass: &AMDGPU::VGPR_32RegClass);
8645 Register TmpReg = MRI.createVirtualRegister(RegClass: &AMDGPU::VGPR_32RegClass);
8646 Register ResultReg = MRI.createVirtualRegister(RegClass: &AMDGPU::VGPR_32RegClass);
8647
8648 unsigned SubOp = ST.hasAddNoCarryInsts() ? AMDGPU::V_SUB_U32_e32
8649 : AMDGPU::V_SUB_CO_U32_e32;
8650
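  // absdiff(x, y) is computed as max(x - y, 0 - (x - y)).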
8651 BuildMI(BB&: MBB, I: MII, MIMD: DL, MCID: get(Opcode: SubOp), DestReg: SubResultReg)
8652 .addReg(RegNo: Src1.getReg())
8653 .addReg(RegNo: Src2.getReg());
8654
8655 BuildMI(BB&: MBB, I: MII, MIMD: DL, MCID: get(Opcode: SubOp), DestReg: TmpReg).addImm(Val: 0).addReg(RegNo: SubResultReg);
8656
8657 BuildMI(BB&: MBB, I: MII, MIMD: DL, MCID: get(Opcode: AMDGPU::V_MAX_I32_e64), DestReg: ResultReg)
8658 .addReg(RegNo: SubResultReg)
8659 .addReg(RegNo: TmpReg);
8660
8661 MRI.replaceRegWith(FromReg: Dest.getReg(), ToReg: ResultReg);
8662 addUsersToMoveToVALUWorklist(Reg: ResultReg, MRI, Worklist);
8663}
8664
8665void SIInstrInfo::lowerScalarXnor(SIInstrWorklist &Worklist,
8666 MachineInstr &Inst) const {
8667 MachineBasicBlock &MBB = *Inst.getParent();
8668 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
8669 MachineBasicBlock::iterator MII = Inst;
8670 const DebugLoc &DL = Inst.getDebugLoc();
8671
8672 MachineOperand &Dest = Inst.getOperand(i: 0);
8673 MachineOperand &Src0 = Inst.getOperand(i: 1);
8674 MachineOperand &Src1 = Inst.getOperand(i: 2);
8675
8676 if (ST.hasDLInsts()) {
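    // The target has a native V_XNOR; move both sources into VGPRs and emit
    // it directly.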
8677 Register NewDest = MRI.createVirtualRegister(RegClass: &AMDGPU::VGPR_32RegClass);
8678 legalizeGenericOperand(InsertMBB&: MBB, I: MII, DstRC: &AMDGPU::VGPR_32RegClass, Op&: Src0, MRI, DL);
8679 legalizeGenericOperand(InsertMBB&: MBB, I: MII, DstRC: &AMDGPU::VGPR_32RegClass, Op&: Src1, MRI, DL);
8680
8681 BuildMI(BB&: MBB, I: MII, MIMD: DL, MCID: get(Opcode: AMDGPU::V_XNOR_B32_e64), DestReg: NewDest)
8682 .add(MO: Src0)
8683 .add(MO: Src1);
8684
8685 MRI.replaceRegWith(FromReg: Dest.getReg(), ToReg: NewDest);
8686 addUsersToMoveToVALUWorklist(Reg: NewDest, MRI, Worklist);
8687 } else {
8688 // Using the identity !(x ^ y) == (!x ^ y) == (x ^ !y), we can
8689 // invert either source and then perform the XOR. If either source is a
8690 // scalar register, then we can leave the inversion on the scalar unit to
8691 // achieve a better distribution of scalar and vector instructions.
8692 bool Src0IsSGPR = Src0.isReg() &&
8693 RI.isSGPRClass(RC: MRI.getRegClass(Reg: Src0.getReg()));
8694 bool Src1IsSGPR = Src1.isReg() &&
8695 RI.isSGPRClass(RC: MRI.getRegClass(Reg: Src1.getReg()));
8696 MachineInstr *Xor;
8697 Register Temp = MRI.createVirtualRegister(RegClass: &AMDGPU::SReg_32RegClass);
8698 Register NewDest = MRI.createVirtualRegister(RegClass: &AMDGPU::SReg_32RegClass);
8699
8700 // Build a pair of scalar instructions and add them to the work list.
8701 // The next iteration over the work list will lower these to the vector
8702 // unit as necessary.
8703 if (Src0IsSGPR) {
8704 BuildMI(BB&: MBB, I: MII, MIMD: DL, MCID: get(Opcode: AMDGPU::S_NOT_B32), DestReg: Temp).add(MO: Src0);
8705 Xor = BuildMI(BB&: MBB, I: MII, MIMD: DL, MCID: get(Opcode: AMDGPU::S_XOR_B32), DestReg: NewDest)
8706 .addReg(RegNo: Temp)
8707 .add(MO: Src1);
8708 } else if (Src1IsSGPR) {
8709 BuildMI(BB&: MBB, I: MII, MIMD: DL, MCID: get(Opcode: AMDGPU::S_NOT_B32), DestReg: Temp).add(MO: Src1);
8710 Xor = BuildMI(BB&: MBB, I: MII, MIMD: DL, MCID: get(Opcode: AMDGPU::S_XOR_B32), DestReg: NewDest)
8711 .add(MO: Src0)
8712 .addReg(RegNo: Temp);
8713 } else {
8714 Xor = BuildMI(BB&: MBB, I: MII, MIMD: DL, MCID: get(Opcode: AMDGPU::S_XOR_B32), DestReg: Temp)
8715 .add(MO: Src0)
8716 .add(MO: Src1);
8717 MachineInstr *Not =
8718 BuildMI(BB&: MBB, I: MII, MIMD: DL, MCID: get(Opcode: AMDGPU::S_NOT_B32), DestReg: NewDest).addReg(RegNo: Temp);
8719 Worklist.insert(MI: Not);
8720 }
8721
8722 MRI.replaceRegWith(FromReg: Dest.getReg(), ToReg: NewDest);
8723
8724 Worklist.insert(MI: Xor);
8725
8726 addUsersToMoveToVALUWorklist(Reg: NewDest, MRI, Worklist);
8727 }
8728}
8729
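// Lower a scalar NAND/NOR: emit the base binary op (S_AND_B32 or S_OR_B32)
// into an intermediate register, then invert it with S_NOT_B32. Both new
// scalar instructions are queued on the worklist so they can be moved to the
// VALU if necessary.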
8730void SIInstrInfo::splitScalarNotBinop(SIInstrWorklist &Worklist,
8731 MachineInstr &Inst,
8732 unsigned Opcode) const {
8733 MachineBasicBlock &MBB = *Inst.getParent();
8734 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
8735 MachineBasicBlock::iterator MII = Inst;
8736 const DebugLoc &DL = Inst.getDebugLoc();
8737
8738 MachineOperand &Dest = Inst.getOperand(i: 0);
8739 MachineOperand &Src0 = Inst.getOperand(i: 1);
8740 MachineOperand &Src1 = Inst.getOperand(i: 2);
8741
8742 Register NewDest = MRI.createVirtualRegister(RegClass: &AMDGPU::SReg_32RegClass);
8743 Register Interm = MRI.createVirtualRegister(RegClass: &AMDGPU::SReg_32RegClass);
8744
8745 MachineInstr &Op = *BuildMI(BB&: MBB, I: MII, MIMD: DL, MCID: get(Opcode), DestReg: Interm)
8746 .add(MO: Src0)
8747 .add(MO: Src1);
8748
8749 MachineInstr &Not = *BuildMI(BB&: MBB, I: MII, MIMD: DL, MCID: get(Opcode: AMDGPU::S_NOT_B32), DestReg: NewDest)
8750 .addReg(RegNo: Interm);
8751
8752 Worklist.insert(MI: &Op);
8753 Worklist.insert(MI: &Not);
8754
8755 MRI.replaceRegWith(FromReg: Dest.getReg(), ToReg: NewDest);
8756 addUsersToMoveToVALUWorklist(Reg: NewDest, MRI, Worklist);
8757}
8758
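// Lower a scalar ANDN2/ORN2: invert the second source with S_NOT_B32, then
// apply the base binary op (S_AND_B32 or S_OR_B32). Both new scalar
// instructions are queued on the worklist.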
8759void SIInstrInfo::splitScalarBinOpN2(SIInstrWorklist &Worklist,
8760 MachineInstr &Inst,
8761 unsigned Opcode) const {
8762 MachineBasicBlock &MBB = *Inst.getParent();
8763 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
8764 MachineBasicBlock::iterator MII = Inst;
8765 const DebugLoc &DL = Inst.getDebugLoc();
8766
8767 MachineOperand &Dest = Inst.getOperand(i: 0);
8768 MachineOperand &Src0 = Inst.getOperand(i: 1);
8769 MachineOperand &Src1 = Inst.getOperand(i: 2);
8770
8771 Register NewDest = MRI.createVirtualRegister(RegClass: &AMDGPU::SReg_32_XM0RegClass);
8772 Register Interm = MRI.createVirtualRegister(RegClass: &AMDGPU::SReg_32_XM0RegClass);
8773
8774 MachineInstr &Not = *BuildMI(BB&: MBB, I: MII, MIMD: DL, MCID: get(Opcode: AMDGPU::S_NOT_B32), DestReg: Interm)
8775 .add(MO: Src1);
8776
8777 MachineInstr &Op = *BuildMI(BB&: MBB, I: MII, MIMD: DL, MCID: get(Opcode), DestReg: NewDest)
8778 .add(MO: Src0)
8779 .addReg(RegNo: Interm);
8780
8781 Worklist.insert(MI: &Not);
8782 Worklist.insert(MI: &Op);
8783
8784 MRI.replaceRegWith(FromReg: Dest.getReg(), ToReg: NewDest);
8785 addUsersToMoveToVALUWorklist(Reg: NewDest, MRI, Worklist);
8786}
8787
8788void SIInstrInfo::splitScalar64BitUnaryOp(SIInstrWorklist &Worklist,
8789 MachineInstr &Inst, unsigned Opcode,
8790 bool Swap) const {
8791 MachineBasicBlock &MBB = *Inst.getParent();
8792 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
8793
8794 MachineOperand &Dest = Inst.getOperand(i: 0);
8795 MachineOperand &Src0 = Inst.getOperand(i: 1);
8796 const DebugLoc &DL = Inst.getDebugLoc();
8797
8798 MachineBasicBlock::iterator MII = Inst;
8799
8800 const MCInstrDesc &InstDesc = get(Opcode);
8801 const TargetRegisterClass *Src0RC = Src0.isReg() ?
8802 MRI.getRegClass(Reg: Src0.getReg()) :
8803 &AMDGPU::SGPR_32RegClass;
8804
8805 const TargetRegisterClass *Src0SubRC =
8806 RI.getSubRegisterClass(Src0RC, AMDGPU::sub0);
8807
8808 MachineOperand SrcReg0Sub0 = buildExtractSubRegOrImm(MII, MRI, Op: Src0, SuperRC: Src0RC,
8809 SubIdx: AMDGPU::sub0, SubRC: Src0SubRC);
8810
8811 const TargetRegisterClass *DestRC = MRI.getRegClass(Reg: Dest.getReg());
8812 const TargetRegisterClass *NewDestRC = RI.getEquivalentVGPRClass(SRC: DestRC);
8813 const TargetRegisterClass *NewDestSubRC =
8814 RI.getSubRegisterClass(NewDestRC, AMDGPU::sub0);
8815
8816 Register DestSub0 = MRI.createVirtualRegister(RegClass: NewDestSubRC);
8817 MachineInstr &LoHalf = *BuildMI(BB&: MBB, I: MII, MIMD: DL, MCID: InstDesc, DestReg: DestSub0).add(MO: SrcReg0Sub0);
8818
8819 MachineOperand SrcReg0Sub1 = buildExtractSubRegOrImm(MII, MRI, Op: Src0, SuperRC: Src0RC,
8820 SubIdx: AMDGPU::sub1, SubRC: Src0SubRC);
8821
8822 Register DestSub1 = MRI.createVirtualRegister(RegClass: NewDestSubRC);
8823 MachineInstr &HiHalf = *BuildMI(BB&: MBB, I: MII, MIMD: DL, MCID: InstDesc, DestReg: DestSub1).add(MO: SrcReg0Sub1);
8824
8825 if (Swap)
8826 std::swap(a&: DestSub0, b&: DestSub1);
8827
8828 Register FullDestReg = MRI.createVirtualRegister(RegClass: NewDestRC);
8829 BuildMI(BB&: MBB, I: MII, MIMD: DL, MCID: get(Opcode: TargetOpcode::REG_SEQUENCE), DestReg: FullDestReg)
8830 .addReg(RegNo: DestSub0)
8831 .addImm(Val: AMDGPU::sub0)
8832 .addReg(RegNo: DestSub1)
8833 .addImm(Val: AMDGPU::sub1);
8834
8835 MRI.replaceRegWith(FromReg: Dest.getReg(), ToReg: FullDestReg);
8836
8837 Worklist.insert(MI: &LoHalf);
8838 Worklist.insert(MI: &HiHalf);
8839
8840 // We don't need to legalizeOperands here because for a single operand, src0
8841 // will support any kind of input.
8842
8843 // Move all users of this moved value.
8844 addUsersToMoveToVALUWorklist(Reg: FullDestReg, MRI, Worklist);
8845}
8846
8847 // There is no vector equivalent of s_mul_u64. For this reason, we need to
8848 // split the s_mul_u64 into 32-bit vector multiplications.
8849void SIInstrInfo::splitScalarSMulU64(SIInstrWorklist &Worklist,
8850 MachineInstr &Inst,
8851 MachineDominatorTree *MDT) const {
8852 MachineBasicBlock &MBB = *Inst.getParent();
8853 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
8854
8855 Register FullDestReg = MRI.createVirtualRegister(RegClass: &AMDGPU::VReg_64RegClass);
8856 Register DestSub0 = MRI.createVirtualRegister(RegClass: &AMDGPU::VGPR_32RegClass);
8857 Register DestSub1 = MRI.createVirtualRegister(RegClass: &AMDGPU::VGPR_32RegClass);
8858
8859 MachineOperand &Dest = Inst.getOperand(i: 0);
8860 MachineOperand &Src0 = Inst.getOperand(i: 1);
8861 MachineOperand &Src1 = Inst.getOperand(i: 2);
8862 const DebugLoc &DL = Inst.getDebugLoc();
8863 MachineBasicBlock::iterator MII = Inst;
8864
8865 const TargetRegisterClass *Src0RC = MRI.getRegClass(Reg: Src0.getReg());
8866 const TargetRegisterClass *Src1RC = MRI.getRegClass(Reg: Src1.getReg());
8867 const TargetRegisterClass *Src0SubRC =
8868 RI.getSubRegisterClass(Src0RC, AMDGPU::sub0);
8869 if (RI.isSGPRClass(RC: Src0SubRC))
8870 Src0SubRC = RI.getEquivalentVGPRClass(SRC: Src0SubRC);
8871 const TargetRegisterClass *Src1SubRC =
8872 RI.getSubRegisterClass(Src1RC, AMDGPU::sub0);
8873 if (RI.isSGPRClass(RC: Src1SubRC))
8874 Src1SubRC = RI.getEquivalentVGPRClass(SRC: Src1SubRC);
8875
8876 // First, we extract the low 32-bit and high 32-bit values from each of the
8877 // operands.
8878 MachineOperand Op0L =
8879 buildExtractSubRegOrImm(MII, MRI, Op: Src0, SuperRC: Src0RC, SubIdx: AMDGPU::sub0, SubRC: Src0SubRC);
8880 MachineOperand Op1L =
8881 buildExtractSubRegOrImm(MII, MRI, Op: Src1, SuperRC: Src1RC, SubIdx: AMDGPU::sub0, SubRC: Src1SubRC);
8882 MachineOperand Op0H =
8883 buildExtractSubRegOrImm(MII, MRI, Op: Src0, SuperRC: Src0RC, SubIdx: AMDGPU::sub1, SubRC: Src0SubRC);
8884 MachineOperand Op1H =
8885 buildExtractSubRegOrImm(MII, MRI, Op: Src1, SuperRC: Src1RC, SubIdx: AMDGPU::sub1, SubRC: Src1SubRC);
8886
8887 // The multiplication is done as follows:
8888 //
8889 // Op1H Op1L
8890 // * Op0H Op0L
8891 // --------------------
8892 // Op1H*Op0L Op1L*Op0L
8893 // + Op1H*Op0H Op1L*Op0H
8894 // -----------------------------------------
8895 // (Op1H*Op0L + Op1L*Op0H + carry) Op1L*Op0L
8896 //
8897 // We drop Op1H*Op0H because the result of the multiplication is a 64-bit
8898 // value and that would overflow.
8899 // The low 32-bit value is Op1L*Op0L.
8900 // The high 32-bit value is Op1H*Op0L + Op1L*Op0H + carry (from Op1L*Op0L).
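  // A worked example with illustrative values (not taken from the code): for
  // Op0 = 0x0000'0001'0000'0003 and Op1 = 0x0000'0002'0000'0005:
  //   low 32 bits  = Op1L*Op0L = 5*3 = 15
  //   carry        = mul_hi(Op1L, Op0L) = mul_hi(5, 3) = 0
  //   high 32 bits = Op1H*Op0L + Op1L*Op0H + carry = 2*3 + 5*1 + 0 = 11
  // giving 0x0000'000B'0000'000F, the low 64 bits of the full product.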
8901
8902 Register Op1L_Op0H_Reg = MRI.createVirtualRegister(RegClass: &AMDGPU::VGPR_32RegClass);
8903 MachineInstr *Op1L_Op0H =
8904 BuildMI(BB&: MBB, I: MII, MIMD: DL, MCID: get(Opcode: AMDGPU::V_MUL_LO_U32_e64), DestReg: Op1L_Op0H_Reg)
8905 .add(MO: Op1L)
8906 .add(MO: Op0H);
8907
8908 Register Op1H_Op0L_Reg = MRI.createVirtualRegister(RegClass: &AMDGPU::VGPR_32RegClass);
8909 MachineInstr *Op1H_Op0L =
8910 BuildMI(BB&: MBB, I: MII, MIMD: DL, MCID: get(Opcode: AMDGPU::V_MUL_LO_U32_e64), DestReg: Op1H_Op0L_Reg)
8911 .add(MO: Op1H)
8912 .add(MO: Op0L);
8913
8914 Register CarryReg = MRI.createVirtualRegister(RegClass: &AMDGPU::VGPR_32RegClass);
8915 MachineInstr *Carry =
8916 BuildMI(BB&: MBB, I: MII, MIMD: DL, MCID: get(Opcode: AMDGPU::V_MUL_HI_U32_e64), DestReg: CarryReg)
8917 .add(MO: Op1L)
8918 .add(MO: Op0L);
8919
8920 MachineInstr *LoHalf =
8921 BuildMI(BB&: MBB, I: MII, MIMD: DL, MCID: get(Opcode: AMDGPU::V_MUL_LO_U32_e64), DestReg: DestSub0)
8922 .add(MO: Op1L)
8923 .add(MO: Op0L);
8924
8925 Register AddReg = MRI.createVirtualRegister(RegClass: &AMDGPU::VGPR_32RegClass);
8926 MachineInstr *Add = BuildMI(BB&: MBB, I: MII, MIMD: DL, MCID: get(Opcode: AMDGPU::V_ADD_U32_e32), DestReg: AddReg)
8927 .addReg(RegNo: Op1L_Op0H_Reg)
8928 .addReg(RegNo: Op1H_Op0L_Reg);
8929
8930 MachineInstr *HiHalf =
8931 BuildMI(BB&: MBB, I: MII, MIMD: DL, MCID: get(Opcode: AMDGPU::V_ADD_U32_e32), DestReg: DestSub1)
8932 .addReg(RegNo: AddReg)
8933 .addReg(RegNo: CarryReg);
8934
8935 BuildMI(BB&: MBB, I: MII, MIMD: DL, MCID: get(Opcode: TargetOpcode::REG_SEQUENCE), DestReg: FullDestReg)
8936 .addReg(RegNo: DestSub0)
8937 .addImm(Val: AMDGPU::sub0)
8938 .addReg(RegNo: DestSub1)
8939 .addImm(Val: AMDGPU::sub1);
8940
8941 MRI.replaceRegWith(FromReg: Dest.getReg(), ToReg: FullDestReg);
8942
8943 // Try to legalize the operands in case we need to swap the order to keep it
8944 // valid.
8945 legalizeOperands(MI&: *Op1L_Op0H, MDT);
8946 legalizeOperands(MI&: *Op1H_Op0L, MDT);
8947 legalizeOperands(MI&: *Carry, MDT);
8948 legalizeOperands(MI&: *LoHalf, MDT);
8949 legalizeOperands(MI&: *Add, MDT);
8950 legalizeOperands(MI&: *HiHalf, MDT);
8951
8952 // Move all users of this moved value.
8953 addUsersToMoveToVALUWorklist(Reg: FullDestReg, MRI, Worklist);
8954}
8955
8956 // Lower S_MUL_U64_U32_PSEUDO/S_MUL_I64_I32_PSEUDO into two 32-bit vector
8957 // multiplications.
8958void SIInstrInfo::splitScalarSMulPseudo(SIInstrWorklist &Worklist,
8959 MachineInstr &Inst,
8960 MachineDominatorTree *MDT) const {
8961 MachineBasicBlock &MBB = *Inst.getParent();
8962 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
8963
8964 Register FullDestReg = MRI.createVirtualRegister(RegClass: &AMDGPU::VReg_64RegClass);
8965 Register DestSub0 = MRI.createVirtualRegister(RegClass: &AMDGPU::VGPR_32RegClass);
8966 Register DestSub1 = MRI.createVirtualRegister(RegClass: &AMDGPU::VGPR_32RegClass);
8967
8968 MachineOperand &Dest = Inst.getOperand(i: 0);
8969 MachineOperand &Src0 = Inst.getOperand(i: 1);
8970 MachineOperand &Src1 = Inst.getOperand(i: 2);
8971 const DebugLoc &DL = Inst.getDebugLoc();
8972 MachineBasicBlock::iterator MII = Inst;
8973
8974 const TargetRegisterClass *Src0RC = MRI.getRegClass(Reg: Src0.getReg());
8975 const TargetRegisterClass *Src1RC = MRI.getRegClass(Reg: Src1.getReg());
8976 const TargetRegisterClass *Src0SubRC =
8977 RI.getSubRegisterClass(Src0RC, AMDGPU::sub0);
8978 if (RI.isSGPRClass(RC: Src0SubRC))
8979 Src0SubRC = RI.getEquivalentVGPRClass(SRC: Src0SubRC);
8980 const TargetRegisterClass *Src1SubRC =
8981 RI.getSubRegisterClass(Src1RC, AMDGPU::sub0);
8982 if (RI.isSGPRClass(RC: Src1SubRC))
8983 Src1SubRC = RI.getEquivalentVGPRClass(SRC: Src1SubRC);
8984
8985 // First, we extract the low 32-bit and high 32-bit values from each of the
8986 // operands.
8987 MachineOperand Op0L =
8988 buildExtractSubRegOrImm(MII, MRI, Op: Src0, SuperRC: Src0RC, SubIdx: AMDGPU::sub0, SubRC: Src0SubRC);
8989 MachineOperand Op1L =
8990 buildExtractSubRegOrImm(MII, MRI, Op: Src1, SuperRC: Src1RC, SubIdx: AMDGPU::sub0, SubRC: Src1SubRC);
8991
8992 unsigned Opc = Inst.getOpcode();
8993 unsigned NewOpc = Opc == AMDGPU::S_MUL_U64_U32_PSEUDO
8994 ? AMDGPU::V_MUL_HI_U32_e64
8995 : AMDGPU::V_MUL_HI_I32_e64;
8996 MachineInstr *HiHalf =
8997 BuildMI(BB&: MBB, I: MII, MIMD: DL, MCID: get(Opcode: NewOpc), DestReg: DestSub1).add(MO: Op1L).add(MO: Op0L);
8998
8999 MachineInstr *LoHalf =
9000 BuildMI(BB&: MBB, I: MII, MIMD: DL, MCID: get(Opcode: AMDGPU::V_MUL_LO_U32_e64), DestReg: DestSub0)
9001 .add(MO: Op1L)
9002 .add(MO: Op0L);
9003
9004 BuildMI(BB&: MBB, I: MII, MIMD: DL, MCID: get(Opcode: TargetOpcode::REG_SEQUENCE), DestReg: FullDestReg)
9005 .addReg(RegNo: DestSub0)
9006 .addImm(Val: AMDGPU::sub0)
9007 .addReg(RegNo: DestSub1)
9008 .addImm(Val: AMDGPU::sub1);
9009
9010 MRI.replaceRegWith(FromReg: Dest.getReg(), ToReg: FullDestReg);
9011
9012 // Try to legalize the operands in case we need to swap the order to keep it
9013 // valid.
9014 legalizeOperands(MI&: *HiHalf, MDT);
9015 legalizeOperands(MI&: *LoHalf, MDT);
9016
9017 // Move all users of this moved value.
9018 addUsersToMoveToVALUWorklist(Reg: FullDestReg, MRI, Worklist);
9019}
9020
9021void SIInstrInfo::splitScalar64BitBinaryOp(SIInstrWorklist &Worklist,
9022 MachineInstr &Inst, unsigned Opcode,
9023 MachineDominatorTree *MDT) const {
9024 MachineBasicBlock &MBB = *Inst.getParent();
9025 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
9026
9027 MachineOperand &Dest = Inst.getOperand(i: 0);
9028 MachineOperand &Src0 = Inst.getOperand(i: 1);
9029 MachineOperand &Src1 = Inst.getOperand(i: 2);
9030 const DebugLoc &DL = Inst.getDebugLoc();
9031
9032 MachineBasicBlock::iterator MII = Inst;
9033
9034 const MCInstrDesc &InstDesc = get(Opcode);
9035 const TargetRegisterClass *Src0RC = Src0.isReg() ?
9036 MRI.getRegClass(Reg: Src0.getReg()) :
9037 &AMDGPU::SGPR_32RegClass;
9038
9039 const TargetRegisterClass *Src0SubRC =
9040 RI.getSubRegisterClass(Src0RC, AMDGPU::sub0);
9041 const TargetRegisterClass *Src1RC = Src1.isReg() ?
9042 MRI.getRegClass(Reg: Src1.getReg()) :
9043 &AMDGPU::SGPR_32RegClass;
9044
9045 const TargetRegisterClass *Src1SubRC =
9046 RI.getSubRegisterClass(Src1RC, AMDGPU::sub0);
9047
9048 MachineOperand SrcReg0Sub0 = buildExtractSubRegOrImm(MII, MRI, Op: Src0, SuperRC: Src0RC,
9049 SubIdx: AMDGPU::sub0, SubRC: Src0SubRC);
9050 MachineOperand SrcReg1Sub0 = buildExtractSubRegOrImm(MII, MRI, Op: Src1, SuperRC: Src1RC,
9051 SubIdx: AMDGPU::sub0, SubRC: Src1SubRC);
9052 MachineOperand SrcReg0Sub1 = buildExtractSubRegOrImm(MII, MRI, Op: Src0, SuperRC: Src0RC,
9053 SubIdx: AMDGPU::sub1, SubRC: Src0SubRC);
9054 MachineOperand SrcReg1Sub1 = buildExtractSubRegOrImm(MII, MRI, Op: Src1, SuperRC: Src1RC,
9055 SubIdx: AMDGPU::sub1, SubRC: Src1SubRC);
9056
9057 const TargetRegisterClass *DestRC = MRI.getRegClass(Reg: Dest.getReg());
9058 const TargetRegisterClass *NewDestRC = RI.getEquivalentVGPRClass(SRC: DestRC);
9059 const TargetRegisterClass *NewDestSubRC =
9060 RI.getSubRegisterClass(NewDestRC, AMDGPU::sub0);
9061
9062 Register DestSub0 = MRI.createVirtualRegister(RegClass: NewDestSubRC);
9063 MachineInstr &LoHalf = *BuildMI(BB&: MBB, I: MII, MIMD: DL, MCID: InstDesc, DestReg: DestSub0)
9064 .add(MO: SrcReg0Sub0)
9065 .add(MO: SrcReg1Sub0);
9066
9067 Register DestSub1 = MRI.createVirtualRegister(RegClass: NewDestSubRC);
9068 MachineInstr &HiHalf = *BuildMI(BB&: MBB, I: MII, MIMD: DL, MCID: InstDesc, DestReg: DestSub1)
9069 .add(MO: SrcReg0Sub1)
9070 .add(MO: SrcReg1Sub1);
9071
9072 Register FullDestReg = MRI.createVirtualRegister(RegClass: NewDestRC);
9073 BuildMI(BB&: MBB, I: MII, MIMD: DL, MCID: get(Opcode: TargetOpcode::REG_SEQUENCE), DestReg: FullDestReg)
9074 .addReg(RegNo: DestSub0)
9075 .addImm(Val: AMDGPU::sub0)
9076 .addReg(RegNo: DestSub1)
9077 .addImm(Val: AMDGPU::sub1);
9078
9079 MRI.replaceRegWith(FromReg: Dest.getReg(), ToReg: FullDestReg);
9080
9081 Worklist.insert(MI: &LoHalf);
9082 Worklist.insert(MI: &HiHalf);
9083
9084 // Move all users of this moved value.
9085 addUsersToMoveToVALUWorklist(Reg: FullDestReg, MRI, Worklist);
9086}
9087
9088void SIInstrInfo::splitScalar64BitXnor(SIInstrWorklist &Worklist,
9089 MachineInstr &Inst,
9090 MachineDominatorTree *MDT) const {
9091 MachineBasicBlock &MBB = *Inst.getParent();
9092 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
9093
9094 MachineOperand &Dest = Inst.getOperand(i: 0);
9095 MachineOperand &Src0 = Inst.getOperand(i: 1);
9096 MachineOperand &Src1 = Inst.getOperand(i: 2);
9097 const DebugLoc &DL = Inst.getDebugLoc();
9098
9099 MachineBasicBlock::iterator MII = Inst;
9100
9101 const TargetRegisterClass *DestRC = MRI.getRegClass(Reg: Dest.getReg());
9102
9103 Register Interm = MRI.createVirtualRegister(RegClass: &AMDGPU::SReg_64RegClass);
9104
9105 MachineOperand* Op0;
9106 MachineOperand* Op1;
9107
9108 if (Src0.isReg() && RI.isSGPRReg(MRI, Reg: Src0.getReg())) {
9109 Op0 = &Src0;
9110 Op1 = &Src1;
9111 } else {
9112 Op0 = &Src1;
9113 Op1 = &Src0;
9114 }
9115
9116 BuildMI(BB&: MBB, I: MII, MIMD: DL, MCID: get(Opcode: AMDGPU::S_NOT_B64), DestReg: Interm)
9117 .add(MO: *Op0);
9118
9119 Register NewDest = MRI.createVirtualRegister(RegClass: DestRC);
9120
9121 MachineInstr &Xor = *BuildMI(BB&: MBB, I: MII, MIMD: DL, MCID: get(Opcode: AMDGPU::S_XOR_B64), DestReg: NewDest)
9122 .addReg(RegNo: Interm)
9123 .add(MO: *Op1);
9124
9125 MRI.replaceRegWith(FromReg: Dest.getReg(), ToReg: NewDest);
9126
9127 Worklist.insert(MI: &Xor);
9128}
9129
9130void SIInstrInfo::splitScalar64BitBCNT(SIInstrWorklist &Worklist,
9131 MachineInstr &Inst) const {
9132 MachineBasicBlock &MBB = *Inst.getParent();
9133 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
9134
9135 MachineBasicBlock::iterator MII = Inst;
9136 const DebugLoc &DL = Inst.getDebugLoc();
9137
9138 MachineOperand &Dest = Inst.getOperand(i: 0);
9139 MachineOperand &Src = Inst.getOperand(i: 1);
9140
9141 const MCInstrDesc &InstDesc = get(Opcode: AMDGPU::V_BCNT_U32_B32_e64);
9142 const TargetRegisterClass *SrcRC = Src.isReg() ?
9143 MRI.getRegClass(Reg: Src.getReg()) :
9144 &AMDGPU::SGPR_32RegClass;
9145
9146 Register MidReg = MRI.createVirtualRegister(RegClass: &AMDGPU::VGPR_32RegClass);
9147 Register ResultReg = MRI.createVirtualRegister(RegClass: &AMDGPU::VGPR_32RegClass);
9148
9149 const TargetRegisterClass *SrcSubRC =
9150 RI.getSubRegisterClass(SrcRC, AMDGPU::sub0);
9151
9152 MachineOperand SrcRegSub0 = buildExtractSubRegOrImm(MII, MRI, Op: Src, SuperRC: SrcRC,
9153 SubIdx: AMDGPU::sub0, SubRC: SrcSubRC);
9154 MachineOperand SrcRegSub1 = buildExtractSubRegOrImm(MII, MRI, Op: Src, SuperRC: SrcRC,
9155 SubIdx: AMDGPU::sub1, SubRC: SrcSubRC);
9156
9157 BuildMI(BB&: MBB, I: MII, MIMD: DL, MCID: InstDesc, DestReg: MidReg).add(MO: SrcRegSub0).addImm(Val: 0);
9158
9159 BuildMI(BB&: MBB, I: MII, MIMD: DL, MCID: InstDesc, DestReg: ResultReg).add(MO: SrcRegSub1).addReg(RegNo: MidReg);
9160
9161 MRI.replaceRegWith(FromReg: Dest.getReg(), ToReg: ResultReg);
9162
9163 // We don't need to legalize operands here. src0 for either instruction can be
9164 // an SGPR, and the second input is unused or determined here.
9165 addUsersToMoveToVALUWorklist(Reg: ResultReg, MRI, Worklist);
9166}
9167
9168void SIInstrInfo::splitScalar64BitBFE(SIInstrWorklist &Worklist,
9169 MachineInstr &Inst) const {
9170 MachineBasicBlock &MBB = *Inst.getParent();
9171 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
9172 MachineBasicBlock::iterator MII = Inst;
9173 const DebugLoc &DL = Inst.getDebugLoc();
9174
9175 MachineOperand &Dest = Inst.getOperand(i: 0);
9176 uint32_t Imm = Inst.getOperand(i: 2).getImm();
9177 uint32_t Offset = Imm & 0x3f; // Extract bits [5:0].
9178 uint32_t BitWidth = (Imm & 0x7f0000) >> 16; // Extract bits [22:16].
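  // For example (an illustrative immediate, not from this code): Imm = 0x100000
  // decodes to Offset = 0 and BitWidth = 16, i.e. a sign extension from i16.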
9179
9180 (void) Offset;
9181
9182 // Only sext_inreg cases handled.
9183 assert(Inst.getOpcode() == AMDGPU::S_BFE_I64 && BitWidth <= 32 &&
9184 Offset == 0 && "Not implemented");
9185
9186 if (BitWidth < 32) {
9187 Register MidRegLo = MRI.createVirtualRegister(RegClass: &AMDGPU::VGPR_32RegClass);
9188 Register MidRegHi = MRI.createVirtualRegister(RegClass: &AMDGPU::VGPR_32RegClass);
9189 Register ResultReg = MRI.createVirtualRegister(RegClass: &AMDGPU::VReg_64RegClass);
9190
9191 BuildMI(BB&: MBB, I: MII, MIMD: DL, MCID: get(Opcode: AMDGPU::V_BFE_I32_e64), DestReg: MidRegLo)
9192 .addReg(RegNo: Inst.getOperand(i: 1).getReg(), Flags: {}, SubReg: AMDGPU::sub0)
9193 .addImm(Val: 0)
9194 .addImm(Val: BitWidth);
9195
9196 BuildMI(BB&: MBB, I: MII, MIMD: DL, MCID: get(Opcode: AMDGPU::V_ASHRREV_I32_e32), DestReg: MidRegHi)
9197 .addImm(Val: 31)
9198 .addReg(RegNo: MidRegLo);
9199
9200 BuildMI(BB&: MBB, I: MII, MIMD: DL, MCID: get(Opcode: TargetOpcode::REG_SEQUENCE), DestReg: ResultReg)
9201 .addReg(RegNo: MidRegLo)
9202 .addImm(Val: AMDGPU::sub0)
9203 .addReg(RegNo: MidRegHi)
9204 .addImm(Val: AMDGPU::sub1);
9205
9206 MRI.replaceRegWith(FromReg: Dest.getReg(), ToReg: ResultReg);
9207 addUsersToMoveToVALUWorklist(Reg: ResultReg, MRI, Worklist);
9208 return;
9209 }
9210
9211 MachineOperand &Src = Inst.getOperand(i: 1);
9212 Register TmpReg = MRI.createVirtualRegister(RegClass: &AMDGPU::VGPR_32RegClass);
9213 Register ResultReg = MRI.createVirtualRegister(RegClass: &AMDGPU::VReg_64RegClass);
9214
9215 BuildMI(BB&: MBB, I: MII, MIMD: DL, MCID: get(Opcode: AMDGPU::V_ASHRREV_I32_e64), DestReg: TmpReg)
9216 .addImm(Val: 31)
9217 .addReg(RegNo: Src.getReg(), Flags: {}, SubReg: AMDGPU::sub0);
9218
9219 BuildMI(BB&: MBB, I: MII, MIMD: DL, MCID: get(Opcode: TargetOpcode::REG_SEQUENCE), DestReg: ResultReg)
9220 .addReg(RegNo: Src.getReg(), Flags: {}, SubReg: AMDGPU::sub0)
9221 .addImm(Val: AMDGPU::sub0)
9222 .addReg(RegNo: TmpReg)
9223 .addImm(Val: AMDGPU::sub1);
9224
9225 MRI.replaceRegWith(FromReg: Dest.getReg(), ToReg: ResultReg);
9226 addUsersToMoveToVALUWorklist(Reg: ResultReg, MRI, Worklist);
9227}
9228
9229void SIInstrInfo::splitScalar64BitCountOp(SIInstrWorklist &Worklist,
9230 MachineInstr &Inst, unsigned Opcode,
9231 MachineDominatorTree *MDT) const {
9232 // (S_FLBIT_I32_B64 hi:lo) ->
9233 //   (umin (V_FFBH_U32_e32 hi), (uaddsat (V_FFBH_U32_e32 lo), 32))
9234 // (S_FF1_I32_B64 hi:lo) ->
9235 //   (umin (uaddsat (V_FFBL_B32_e32 hi), 32), (V_FFBL_B32_e32 lo))
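  // A worked example for the ctlz pattern (illustrative value): for the 64-bit
  // input hi:lo = 0x00000000:0x00000010, V_FFBH_U32(hi) = 0xffffffff (the
  // input is zero), V_FFBH_U32(lo) = 27, uaddsat(27, 32) = 59, and
  // umin(0xffffffff, 59) = 59, the correct 64-bit leading-zero count.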
9236
9237 MachineBasicBlock &MBB = *Inst.getParent();
9238 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
9239 MachineBasicBlock::iterator MII = Inst;
9240 const DebugLoc &DL = Inst.getDebugLoc();
9241
9242 MachineOperand &Dest = Inst.getOperand(i: 0);
9243 MachineOperand &Src = Inst.getOperand(i: 1);
9244
9245 const MCInstrDesc &InstDesc = get(Opcode);
9246
9247 bool IsCtlz = Opcode == AMDGPU::V_FFBH_U32_e32;
9248 unsigned OpcodeAdd = ST.hasAddNoCarryInsts() ? AMDGPU::V_ADD_U32_e64
9249 : AMDGPU::V_ADD_CO_U32_e32;
9250
9251 const TargetRegisterClass *SrcRC =
9252 Src.isReg() ? MRI.getRegClass(Reg: Src.getReg()) : &AMDGPU::SGPR_32RegClass;
9253 const TargetRegisterClass *SrcSubRC =
9254 RI.getSubRegisterClass(SrcRC, AMDGPU::sub0);
9255
9256 MachineOperand SrcRegSub0 =
9257 buildExtractSubRegOrImm(MII, MRI, Op: Src, SuperRC: SrcRC, SubIdx: AMDGPU::sub0, SubRC: SrcSubRC);
9258 MachineOperand SrcRegSub1 =
9259 buildExtractSubRegOrImm(MII, MRI, Op: Src, SuperRC: SrcRC, SubIdx: AMDGPU::sub1, SubRC: SrcSubRC);
9260
9261 Register MidReg1 = MRI.createVirtualRegister(RegClass: &AMDGPU::VGPR_32RegClass);
9262 Register MidReg2 = MRI.createVirtualRegister(RegClass: &AMDGPU::VGPR_32RegClass);
9263 Register MidReg3 = MRI.createVirtualRegister(RegClass: &AMDGPU::VGPR_32RegClass);
9264 Register MidReg4 = MRI.createVirtualRegister(RegClass: &AMDGPU::VGPR_32RegClass);
9265
9266 BuildMI(BB&: MBB, I: MII, MIMD: DL, MCID: InstDesc, DestReg: MidReg1).add(MO: SrcRegSub0);
9267
9268 BuildMI(BB&: MBB, I: MII, MIMD: DL, MCID: InstDesc, DestReg: MidReg2).add(MO: SrcRegSub1);
9269
9270 BuildMI(BB&: MBB, I: MII, MIMD: DL, MCID: get(Opcode: OpcodeAdd), DestReg: MidReg3)
9271 .addReg(RegNo: IsCtlz ? MidReg1 : MidReg2)
9272 .addImm(Val: 32)
9273 .addImm(Val: 1); // enable clamp
9274
9275 BuildMI(BB&: MBB, I: MII, MIMD: DL, MCID: get(Opcode: AMDGPU::V_MIN_U32_e64), DestReg: MidReg4)
9276 .addReg(RegNo: MidReg3)
9277 .addReg(RegNo: IsCtlz ? MidReg2 : MidReg1);
9278
9279 MRI.replaceRegWith(FromReg: Dest.getReg(), ToReg: MidReg4);
9280
9281 addUsersToMoveToVALUWorklist(Reg: MidReg4, MRI, Worklist);
9282}
9283
9284void SIInstrInfo::addUsersToMoveToVALUWorklist(
9285 Register DstReg, MachineRegisterInfo &MRI,
9286 SIInstrWorklist &Worklist) const {
9287 for (MachineOperand &MO : make_early_inc_range(Range: MRI.use_operands(Reg: DstReg))) {
9288 MachineInstr &UseMI = *MO.getParent();
9289
9290 unsigned OpNo = 0;
9291
9292 switch (UseMI.getOpcode()) {
9293 case AMDGPU::COPY:
9294 case AMDGPU::WQM:
9295 case AMDGPU::SOFT_WQM:
9296 case AMDGPU::STRICT_WWM:
9297 case AMDGPU::STRICT_WQM:
9298 case AMDGPU::REG_SEQUENCE:
9299 case AMDGPU::PHI:
9300 case AMDGPU::INSERT_SUBREG:
9301 break;
9302 default:
9303 OpNo = MO.getOperandNo();
9304 break;
9305 }
9306
9307 const TargetRegisterClass *OpRC = getOpRegClass(MI: UseMI, OpNo);
9308 MRI.constrainRegClass(Reg: DstReg, RC: OpRC);
9309
9310 if (!RI.hasVectorRegisters(RC: OpRC))
9311 Worklist.insert(MI: &UseMI);
9312 else
9313 // Legalization could change user list.
9314 legalizeOperandsVALUt16(MI&: UseMI, OpIdx: OpNo, MRI);
9315 }
9316}
9317
9318void SIInstrInfo::movePackToVALU(SIInstrWorklist &Worklist,
9319 MachineRegisterInfo &MRI,
9320 MachineInstr &Inst) const {
9321 Register ResultReg = MRI.createVirtualRegister(RegClass: &AMDGPU::VGPR_32RegClass);
9322 MachineBasicBlock *MBB = Inst.getParent();
9323 MachineOperand &Src0 = Inst.getOperand(i: 1);
9324 MachineOperand &Src1 = Inst.getOperand(i: 2);
9325 const DebugLoc &DL = Inst.getDebugLoc();
9326
9327 if (ST.useRealTrue16Insts()) {
9328 Register SrcReg0, SrcReg1;
9329 if (!Src0.isReg() || !RI.isVGPR(MRI, Reg: Src0.getReg())) {
9330 SrcReg0 = MRI.createVirtualRegister(RegClass: &AMDGPU::VGPR_32RegClass);
9331 BuildMI(BB&: *MBB, I&: Inst, MIMD: DL,
9332 MCID: get(Opcode: Src0.isImm() ? AMDGPU::V_MOV_B32_e32 : AMDGPU::COPY), DestReg: SrcReg0)
9333 .add(MO: Src0);
9334 } else {
9335 SrcReg0 = Src0.getReg();
9336 }
9337
9338 if (!Src1.isReg() || !RI.isVGPR(MRI, Reg: Src1.getReg())) {
9339 SrcReg1 = MRI.createVirtualRegister(RegClass: &AMDGPU::VGPR_32RegClass);
9340 BuildMI(BB&: *MBB, I&: Inst, MIMD: DL,
9341 MCID: get(Opcode: Src1.isImm() ? AMDGPU::V_MOV_B32_e32 : AMDGPU::COPY), DestReg: SrcReg1)
9342 .add(MO: Src1);
9343 } else {
9344 SrcReg1 = Src1.getReg();
9345 }
9346
9347 bool isSrc0Reg16 = MRI.constrainRegClass(Reg: SrcReg0, RC: &AMDGPU::VGPR_16RegClass);
9348 bool isSrc1Reg16 = MRI.constrainRegClass(Reg: SrcReg1, RC: &AMDGPU::VGPR_16RegClass);
9349
9350 auto NewMI = BuildMI(BB&: *MBB, I&: Inst, MIMD: DL, MCID: get(Opcode: AMDGPU::REG_SEQUENCE), DestReg: ResultReg);
9351 switch (Inst.getOpcode()) {
9352 case AMDGPU::S_PACK_LL_B32_B16:
9353 NewMI
9354 .addReg(RegNo: SrcReg0, Flags: {},
9355 SubReg: isSrc0Reg16 ? AMDGPU::NoSubRegister : AMDGPU::lo16)
9356 .addImm(Val: AMDGPU::lo16)
9357 .addReg(RegNo: SrcReg1, Flags: {},
9358 SubReg: isSrc1Reg16 ? AMDGPU::NoSubRegister : AMDGPU::lo16)
9359 .addImm(Val: AMDGPU::hi16);
9360 break;
9361 case AMDGPU::S_PACK_LH_B32_B16:
9362 NewMI
9363 .addReg(RegNo: SrcReg0, Flags: {},
9364 SubReg: isSrc0Reg16 ? AMDGPU::NoSubRegister : AMDGPU::lo16)
9365 .addImm(Val: AMDGPU::lo16)
9366 .addReg(RegNo: SrcReg1, Flags: {}, SubReg: AMDGPU::hi16)
9367 .addImm(Val: AMDGPU::hi16);
9368 break;
9369 case AMDGPU::S_PACK_HL_B32_B16:
9370 NewMI.addReg(RegNo: SrcReg0, Flags: {}, SubReg: AMDGPU::hi16)
9371 .addImm(Val: AMDGPU::lo16)
9372 .addReg(RegNo: SrcReg1, Flags: {},
9373 SubReg: isSrc1Reg16 ? AMDGPU::NoSubRegister : AMDGPU::lo16)
9374 .addImm(Val: AMDGPU::hi16);
9375 break;
9376 case AMDGPU::S_PACK_HH_B32_B16:
9377 NewMI.addReg(RegNo: SrcReg0, Flags: {}, SubReg: AMDGPU::hi16)
9378 .addImm(Val: AMDGPU::lo16)
9379 .addReg(RegNo: SrcReg1, Flags: {}, SubReg: AMDGPU::hi16)
9380 .addImm(Val: AMDGPU::hi16);
9381 break;
9382 default:
9383 llvm_unreachable("unhandled s_pack_* instruction");
9384 }
9385
9386 MachineOperand &Dest = Inst.getOperand(i: 0);
9387 MRI.replaceRegWith(FromReg: Dest.getReg(), ToReg: ResultReg);
9388 addUsersToMoveToVALUWorklist(DstReg: ResultReg, MRI, Worklist);
9389 return;
9390 }
9391
9392 switch (Inst.getOpcode()) {
9393 case AMDGPU::S_PACK_LL_B32_B16: {
9394 Register ImmReg = MRI.createVirtualRegister(RegClass: &AMDGPU::VGPR_32RegClass);
9395 Register TmpReg = MRI.createVirtualRegister(RegClass: &AMDGPU::VGPR_32RegClass);
9396
9397 // FIXME: Can do a lot better if we know the high bits of src0 or src1 are
9398 // 0.
9399 BuildMI(BB&: *MBB, I&: Inst, MIMD: DL, MCID: get(Opcode: AMDGPU::V_MOV_B32_e32), DestReg: ImmReg)
9400 .addImm(Val: 0xffff);
9401
9402 BuildMI(BB&: *MBB, I&: Inst, MIMD: DL, MCID: get(Opcode: AMDGPU::V_AND_B32_e64), DestReg: TmpReg)
9403 .addReg(RegNo: ImmReg, Flags: RegState::Kill)
9404 .add(MO: Src0);
9405
9406 BuildMI(BB&: *MBB, I&: Inst, MIMD: DL, MCID: get(Opcode: AMDGPU::V_LSHL_OR_B32_e64), DestReg: ResultReg)
9407 .add(MO: Src1)
9408 .addImm(Val: 16)
9409 .addReg(RegNo: TmpReg, Flags: RegState::Kill);
9410 break;
9411 }
9412 case AMDGPU::S_PACK_LH_B32_B16: {
9413 Register ImmReg = MRI.createVirtualRegister(RegClass: &AMDGPU::VGPR_32RegClass);
9414 BuildMI(BB&: *MBB, I&: Inst, MIMD: DL, MCID: get(Opcode: AMDGPU::V_MOV_B32_e32), DestReg: ImmReg)
9415 .addImm(Val: 0xffff);
9416 BuildMI(BB&: *MBB, I&: Inst, MIMD: DL, MCID: get(Opcode: AMDGPU::V_BFI_B32_e64), DestReg: ResultReg)
9417 .addReg(RegNo: ImmReg, Flags: RegState::Kill)
9418 .add(MO: Src0)
9419 .add(MO: Src1);
9420 break;
9421 }
9422 case AMDGPU::S_PACK_HL_B32_B16: {
9423 Register TmpReg = MRI.createVirtualRegister(RegClass: &AMDGPU::VGPR_32RegClass);
9424 BuildMI(BB&: *MBB, I&: Inst, MIMD: DL, MCID: get(Opcode: AMDGPU::V_LSHRREV_B32_e64), DestReg: TmpReg)
9425 .addImm(Val: 16)
9426 .add(MO: Src0);
9427 BuildMI(BB&: *MBB, I&: Inst, MIMD: DL, MCID: get(Opcode: AMDGPU::V_LSHL_OR_B32_e64), DestReg: ResultReg)
9428 .add(MO: Src1)
9429 .addImm(Val: 16)
9430 .addReg(RegNo: TmpReg, Flags: RegState::Kill);
9431 break;
9432 }
9433 case AMDGPU::S_PACK_HH_B32_B16: {
9434 Register ImmReg = MRI.createVirtualRegister(RegClass: &AMDGPU::VGPR_32RegClass);
9435 Register TmpReg = MRI.createVirtualRegister(RegClass: &AMDGPU::VGPR_32RegClass);
9436 BuildMI(BB&: *MBB, I&: Inst, MIMD: DL, MCID: get(Opcode: AMDGPU::V_LSHRREV_B32_e64), DestReg: TmpReg)
9437 .addImm(Val: 16)
9438 .add(MO: Src0);
9439 BuildMI(BB&: *MBB, I&: Inst, MIMD: DL, MCID: get(Opcode: AMDGPU::V_MOV_B32_e32), DestReg: ImmReg)
9440 .addImm(Val: 0xffff0000);
9441 BuildMI(BB&: *MBB, I&: Inst, MIMD: DL, MCID: get(Opcode: AMDGPU::V_AND_OR_B32_e64), DestReg: ResultReg)
9442 .add(MO: Src1)
9443 .addReg(RegNo: ImmReg, Flags: RegState::Kill)
9444 .addReg(RegNo: TmpReg, Flags: RegState::Kill);
9445 break;
9446 }
9447 default:
9448 llvm_unreachable("unhandled s_pack_* instruction");
9449 }
9450
9451 MachineOperand &Dest = Inst.getOperand(i: 0);
9452 MRI.replaceRegWith(FromReg: Dest.getReg(), ToReg: ResultReg);
9453 addUsersToMoveToVALUWorklist(DstReg: ResultReg, MRI, Worklist);
9454}
9455
9456void SIInstrInfo::addSCCDefUsersToVALUWorklist(const MachineOperand &Op,
9457 MachineInstr &SCCDefInst,
9458 SIInstrWorklist &Worklist,
9459 Register NewCond) const {
9460
9461 // Ensure that def inst defines SCC, which is still live.
9462 assert(Op.isReg() && Op.getReg() == AMDGPU::SCC && Op.isDef() &&
9463 !Op.isDead() && Op.getParent() == &SCCDefInst);
9464 SmallVector<MachineInstr *, 4> CopyToDelete;
9465 // This assumes that all the users of SCC are in the same block
9466 // as the SCC def.
9467 for (MachineInstr &MI : // Skip the def inst itself.
9468 make_range(x: std::next(x: MachineBasicBlock::iterator(SCCDefInst)),
9469 y: SCCDefInst.getParent()->end())) {
9470 // Check if SCC is used first.
9471 int SCCIdx = MI.findRegisterUseOperandIdx(Reg: AMDGPU::SCC, TRI: &RI, isKill: false);
9472 if (SCCIdx != -1) {
9473 if (MI.isCopy()) {
9474 MachineRegisterInfo &MRI = MI.getMF()->getRegInfo();
9475 Register DestReg = MI.getOperand(i: 0).getReg();
9476
9477 MRI.replaceRegWith(FromReg: DestReg, ToReg: NewCond);
9478 CopyToDelete.push_back(Elt: &MI);
9479 } else {
9480
9481 if (NewCond.isValid())
9482 MI.getOperand(i: SCCIdx).setReg(NewCond);
9483
9484 Worklist.insert(MI: &MI);
9485 }
9486 }
9487 // Exit if we find another SCC def.
9488 if (MI.findRegisterDefOperandIdx(Reg: AMDGPU::SCC, TRI: &RI, isDead: false, Overlap: false) != -1)
9489 break;
9490 }
9491 for (auto &Copy : CopyToDelete)
9492 Copy->eraseFromParent();
9493}
9494
9495// Instructions that use SCC may be converted to VALU instructions. When that
9496// happens, the SCC register is changed to VCC_LO. The instruction that defines
9497// SCC must be changed to an instruction that defines VCC. This function makes
9498// sure that the instruction that defines SCC is added to the moveToVALU
9499// worklist.
9500void SIInstrInfo::addSCCDefsToVALUWorklist(MachineInstr *SCCUseInst,
9501 SIInstrWorklist &Worklist) const {
9502 // Look for a preceding instruction that either defines VCC or SCC. If VCC
9503 // then there is nothing to do because the defining instruction has been
9504 // converted to a VALU already. If SCC then that instruction needs to be
9505 // converted to a VALU.
9506 for (MachineInstr &MI :
9507 make_range(x: std::next(x: MachineBasicBlock::reverse_iterator(SCCUseInst)),
9508 y: SCCUseInst->getParent()->rend())) {
9509 if (MI.modifiesRegister(Reg: AMDGPU::VCC, TRI: &RI))
9510 break;
9511 if (MI.definesRegister(Reg: AMDGPU::SCC, TRI: &RI)) {
9512 Worklist.insert(MI: &MI);
9513 break;
9514 }
9515 }
9516}
9517
9518const TargetRegisterClass *SIInstrInfo::getDestEquivalentVGPRClass(
9519 const MachineInstr &Inst) const {
9520 const TargetRegisterClass *NewDstRC = getOpRegClass(MI: Inst, OpNo: 0);
9521
9522 switch (Inst.getOpcode()) {
9523 // For target instructions, getOpRegClass just returns the virtual register
9524 // class associated with the operand, so we need to find an equivalent VGPR
9525 // register class in order to move the instruction to the VALU.
9526 case AMDGPU::COPY:
9527 case AMDGPU::PHI:
9528 case AMDGPU::REG_SEQUENCE:
9529 case AMDGPU::INSERT_SUBREG:
9530 case AMDGPU::WQM:
9531 case AMDGPU::SOFT_WQM:
9532 case AMDGPU::STRICT_WWM:
9533 case AMDGPU::STRICT_WQM: {
9534 const TargetRegisterClass *SrcRC = getOpRegClass(MI: Inst, OpNo: 1);
9535 if (RI.isAGPRClass(RC: SrcRC)) {
9536 if (RI.isAGPRClass(RC: NewDstRC))
9537 return nullptr;
9538
9539 switch (Inst.getOpcode()) {
9540 case AMDGPU::PHI:
9541 case AMDGPU::REG_SEQUENCE:
9542 case AMDGPU::INSERT_SUBREG:
9543 NewDstRC = RI.getEquivalentAGPRClass(SRC: NewDstRC);
9544 break;
9545 default:
9546 NewDstRC = RI.getEquivalentVGPRClass(SRC: NewDstRC);
9547 }
9548
9549 if (!NewDstRC)
9550 return nullptr;
9551 } else {
9552 if (RI.isVGPRClass(RC: NewDstRC) || NewDstRC == &AMDGPU::VReg_1RegClass)
9553 return nullptr;
9554
9555 NewDstRC = RI.getEquivalentVGPRClass(SRC: NewDstRC);
9556 if (!NewDstRC)
9557 return nullptr;
9558 }
9559
9560 return NewDstRC;
9561 }
9562 default:
9563 return NewDstRC;
9564 }
9565}
9566
9567// Find the one SGPR operand we are allowed to use.
9568Register SIInstrInfo::findUsedSGPR(const MachineInstr &MI,
9569 int OpIndices[3]) const {
9570 const MCInstrDesc &Desc = MI.getDesc();
9571
9572 // Find the one SGPR operand we are allowed to use.
9573 //
9574 // First we need to consider the instruction's operand requirements before
9575 // legalizing. Some operands are required to be SGPRs, such as implicit uses
9576 // of VCC, but we are still bound by the constant bus requirement to only use
9577 // one.
9578 //
9579 // If the operand's class is an SGPR, we can never move it.
9580
9581 Register SGPRReg = findImplicitSGPRRead(MI);
9582 if (SGPRReg)
9583 return SGPRReg;
9584
9585 Register UsedSGPRs[3] = {Register()};
9586 const MachineRegisterInfo &MRI = MI.getMF()->getRegInfo();
9587
9588 for (unsigned i = 0; i < 3; ++i) {
9589 int Idx = OpIndices[i];
9590 if (Idx == -1)
9591 break;
9592
9593 const MachineOperand &MO = MI.getOperand(i: Idx);
9594 if (!MO.isReg())
9595 continue;
9596
9597 // Is this operand statically required to be an SGPR based on the operand
9598 // constraints?
9599 const TargetRegisterClass *OpRC =
9600 RI.getRegClass(i: getOpRegClassID(OpInfo: Desc.operands()[Idx]));
9601 bool IsRequiredSGPR = RI.isSGPRClass(RC: OpRC);
9602 if (IsRequiredSGPR)
9603 return MO.getReg();
9604
9605 // If this could be a VGPR or an SGPR, check the dynamic register class.
9606 Register Reg = MO.getReg();
9607 const TargetRegisterClass *RegRC = MRI.getRegClass(Reg);
9608 if (RI.isSGPRClass(RC: RegRC))
9609 UsedSGPRs[i] = Reg;
9610 }
9611
9612 // We don't have a required SGPR operand, so we have a bit more freedom in
9613 // selecting operands to move.
9614
9615 // Try to select the most used SGPR. If an SGPR is equal to one of the
9616 // others, we choose that.
9617 //
9618 // e.g.
9619 // V_FMA_F32 v0, s0, s0, s0 -> No moves
9620 // V_FMA_F32 v0, s0, s1, s0 -> Move s1
9621
9622 // TODO: If some of the operands are 64-bit SGPRs and some 32, we should
9623 // prefer those.
9624
9625 if (UsedSGPRs[0]) {
9626 if (UsedSGPRs[0] == UsedSGPRs[1] || UsedSGPRs[0] == UsedSGPRs[2])
9627 SGPRReg = UsedSGPRs[0];
9628 }
9629
9630 if (!SGPRReg && UsedSGPRs[1]) {
9631 if (UsedSGPRs[1] == UsedSGPRs[2])
9632 SGPRReg = UsedSGPRs[1];
9633 }
9634
9635 return SGPRReg;
9636}
9637
9638MachineOperand *SIInstrInfo::getNamedOperand(MachineInstr &MI,
9639 AMDGPU::OpName OperandName) const {
9640 if (OperandName == AMDGPU::OpName::NUM_OPERAND_NAMES)
9641 return nullptr;
9642
9643 int Idx = AMDGPU::getNamedOperandIdx(Opcode: MI.getOpcode(), Name: OperandName);
9644 if (Idx == -1)
9645 return nullptr;
9646
9647 return &MI.getOperand(i: Idx);
9648}
9649
9650uint64_t SIInstrInfo::getDefaultRsrcDataFormat() const {
9651 if (ST.getGeneration() >= AMDGPUSubtarget::GFX10) {
9652 int64_t Format = ST.getGeneration() >= AMDGPUSubtarget::GFX11
9653 ? (int64_t)AMDGPU::UfmtGFX11::UFMT_32_FLOAT
9654 : (int64_t)AMDGPU::UfmtGFX10::UFMT_32_FLOAT;
9655 return (Format << 44) |
9656 (1ULL << 56) | // RESOURCE_LEVEL = 1
9657 (3ULL << 60); // OOB_SELECT = 3
9658 }
9659
9660 uint64_t RsrcDataFormat = AMDGPU::RSRC_DATA_FORMAT;
9661 if (ST.isAmdHsaOS()) {
9662 // Set ATC = 1. GFX9 doesn't have this bit.
9663 if (ST.getGeneration() <= AMDGPUSubtarget::VOLCANIC_ISLANDS)
9664 RsrcDataFormat |= (1ULL << 56);
9665
9666 // Set MTYPE = 2 (MTYPE_UC = uncached). GFX9 doesn't have this.
9667 // BTW, it disables TC L2 and therefore decreases performance.
9668 if (ST.getGeneration() == AMDGPUSubtarget::VOLCANIC_ISLANDS)
9669 RsrcDataFormat |= (2ULL << 59);
9670 }
9671
9672 return RsrcDataFormat;
9673}
9674
9675uint64_t SIInstrInfo::getScratchRsrcWords23() const {
9676 uint64_t Rsrc23 = getDefaultRsrcDataFormat() |
9677 AMDGPU::RSRC_TID_ENABLE |
9678 0xffffffff; // Size;
9679
9680 // GFX9 doesn't have ELEMENT_SIZE.
9681 if (ST.getGeneration() <= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
9682 uint64_t EltSizeValue = Log2_32(Value: ST.getMaxPrivateElementSize(ForBufferRSrc: true)) - 1;
9683 Rsrc23 |= EltSizeValue << AMDGPU::RSRC_ELEMENT_SIZE_SHIFT;
9684 }
9685
9686 // IndexStride = 64 for wave64 or 32 for wave32, encoded as 3 or 2.
9687 uint64_t IndexStride = ST.isWave64() ? 3 : 2;
9688 Rsrc23 |= IndexStride << AMDGPU::RSRC_INDEX_STRIDE_SHIFT;
9689
9690 // If TID_ENABLE is set, DATA_FORMAT specifies stride bits [14:17].
9691 // Clear them unless we want a huge stride.
9692 if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS &&
9693 ST.getGeneration() <= AMDGPUSubtarget::GFX9)
9694 Rsrc23 &= ~AMDGPU::RSRC_DATA_FORMAT;
9695
9696 return Rsrc23;
9697}
9698
9699bool SIInstrInfo::isLowLatencyInstruction(const MachineInstr &MI) const {
9700 unsigned Opc = MI.getOpcode();
9701
9702 return isSMRD(Opcode: Opc);
9703}
9704
9705bool SIInstrInfo::isHighLatencyDef(int Opc) const {
9706 return get(Opcode: Opc).mayLoad() &&
9707 (isMUBUF(Opcode: Opc) || isMTBUF(Opcode: Opc) || isMIMG(Opcode: Opc) || isFLAT(Opcode: Opc));
9708}
9709
9710Register SIInstrInfo::isStackAccess(const MachineInstr &MI,
9711 int &FrameIndex) const {
9712 const MachineOperand *Addr = getNamedOperand(MI, OperandName: AMDGPU::OpName::vaddr);
9713 if (!Addr || !Addr->isFI())
9714 return Register();
9715
9716 assert(!MI.memoperands_empty() &&
9717 (*MI.memoperands_begin())->getAddrSpace() == AMDGPUAS::PRIVATE_ADDRESS);
9718
9719 FrameIndex = Addr->getIndex();
9720 return getNamedOperand(MI, OperandName: AMDGPU::OpName::vdata)->getReg();
9721}
9722
9723Register SIInstrInfo::isSGPRStackAccess(const MachineInstr &MI,
9724 int &FrameIndex) const {
9725 const MachineOperand *Addr = getNamedOperand(MI, OperandName: AMDGPU::OpName::addr);
9726 assert(Addr && Addr->isFI());
9727 FrameIndex = Addr->getIndex();
9728 return getNamedOperand(MI, OperandName: AMDGPU::OpName::data)->getReg();
9729}
9730
9731Register SIInstrInfo::isLoadFromStackSlot(const MachineInstr &MI,
9732 int &FrameIndex) const {
9733 if (!MI.mayLoad())
9734 return Register();
9735
9736 if (isMUBUF(MI) || isVGPRSpill(MI))
9737 return isStackAccess(MI, FrameIndex);
9738
9739 if (isSGPRSpill(MI))
9740 return isSGPRStackAccess(MI, FrameIndex);
9741
9742 return Register();
9743}
9744
9745Register SIInstrInfo::isStoreToStackSlot(const MachineInstr &MI,
9746 int &FrameIndex) const {
9747 if (!MI.mayStore())
9748 return Register();
9749
9750 if (isMUBUF(MI) || isVGPRSpill(MI))
9751 return isStackAccess(MI, FrameIndex);
9752
9753 if (isSGPRSpill(MI))
9754 return isSGPRStackAccess(MI, FrameIndex);
9755
9756 return Register();
9757}
9758
9759unsigned SIInstrInfo::getInstBundleSize(const MachineInstr &MI) const {
9760 unsigned Size = 0;
9761 MachineBasicBlock::const_instr_iterator I = MI.getIterator();
9762 MachineBasicBlock::const_instr_iterator E = MI.getParent()->instr_end();
9763 while (++I != E && I->isInsideBundle()) {
9764 assert(!I->isBundle() && "No nested bundle!");
9765 Size += getInstSizeInBytes(MI: *I);
9766 }
9767
9768 return Size;
9769}
9770
9771unsigned SIInstrInfo::getInstSizeInBytes(const MachineInstr &MI) const {
9772 unsigned Opc = MI.getOpcode();
9773 const MCInstrDesc &Desc = getMCOpcodeFromPseudo(Opcode: Opc);
9774 unsigned DescSize = Desc.getSize();
9775
9776 // If we have a definitive size, we can use it. Otherwise we need to inspect
9777 // the operands to know the size.
9778 if (isFixedSize(MI)) {
9779 unsigned Size = DescSize;
9780
9781 // If we hit the buggy offset, an extra nop will be inserted in MC so
9782 // estimate the worst case.
9783 if (MI.isBranch() && ST.hasOffset3fBug())
9784 Size += 4;
9785
9786 return Size;
9787 }
9788
9789 // Instructions may have a 32-bit literal encoded after them. Check
9790 // operands that could ever be literals.
9791 if (isVALU(MI) || isSALU(MI)) {
9792 if (isDPP(MI))
9793 return DescSize;
9794 bool HasLiteral = false;
9795 unsigned LiteralSize = 4;
9796 for (int I = 0, E = MI.getNumExplicitOperands(); I != E; ++I) {
9797 const MachineOperand &Op = MI.getOperand(i: I);
9798 const MCOperandInfo &OpInfo = Desc.operands()[I];
9799 if (!Op.isReg() && !isInlineConstant(MO: Op, OpInfo)) {
9800 HasLiteral = true;
9801 if (ST.has64BitLiterals()) {
9802 switch (OpInfo.OperandType) {
9803 default:
9804 break;
9805 case AMDGPU::OPERAND_REG_IMM_FP64:
9806 if (!AMDGPU::isValid32BitLiteral(Val: Op.getImm(), IsFP64: true))
9807 LiteralSize = 8;
9808 break;
9809 case AMDGPU::OPERAND_REG_IMM_INT64:
9810 // A 32-bit literal is only valid when the value fits in BOTH the signed
9811 // and unsigned 32-bit ranges, i.e. [0, 2^31-1], matching the MC code
9812 // emitter's getLit64Encoding logic. Because there is no way to tell
9813 // the signedness of the literal, we need to be conservative and
9814 // assume that values outside this range require a 64-bit literal
9815 // encoding (8 bytes).
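          // For example, 0xffffffff (4294967295) fits the unsigned but not
          // the signed 32-bit range, so it is conservatively assumed to need
          // the 8-byte encoding.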
9816 if (!Op.isImm() || !isInt<32>(x: Op.getImm()) ||
9817 !isUInt<32>(x: Op.getImm()))
9818 LiteralSize = 8;
9819 break;
9820 }
9821 }
9822 break;
9823 }
9824 }
9825 return HasLiteral ? DescSize + LiteralSize : DescSize;
9826 }
9827
9828 // Check whether we have extra NSA words.
9829 if (isMIMG(MI)) {
9830 int VAddr0Idx = AMDGPU::getNamedOperandIdx(Opcode: Opc, Name: AMDGPU::OpName::vaddr0);
9831 if (VAddr0Idx < 0)
9832 return 8;
9833
9834 int RSrcIdx = AMDGPU::getNamedOperandIdx(Opcode: Opc, Name: AMDGPU::OpName::srsrc);
9835 return 8 + 4 * ((RSrcIdx - VAddr0Idx + 2) / 4);
9836 }
9837
9838 switch (Opc) {
9839 case TargetOpcode::BUNDLE:
9840 return getInstBundleSize(MI);
9841 case TargetOpcode::INLINEASM:
9842 case TargetOpcode::INLINEASM_BR: {
9843 const MachineFunction *MF = MI.getMF();
9844 const char *AsmStr = MI.getOperand(i: 0).getSymbolName();
9845 return getInlineAsmLength(Str: AsmStr, MAI: *MF->getTarget().getMCAsmInfo(), STI: &ST);
9846 }
9847 default:
9848 if (MI.isMetaInstruction())
9849 return 0;
9850
9851 // If this is a D16 pseudo instruction, get the correct MC code size.
9852 const auto *D16Info = AMDGPU::getT16D16Helper(T16Op: Opc);
9853 if (D16Info) {
9854 // Assume the d16_lo/hi instructions are always the same size.
9855 unsigned LoInstOpcode = D16Info->LoOp;
9856 const MCInstrDesc &Desc = getMCOpcodeFromPseudo(Opcode: LoInstOpcode);
9857 DescSize = Desc.getSize();
9858 }
9859
9860 // If this is an FMA mix pseudo instruction, get the correct MC code size.
9861 if (Opc == AMDGPU::V_FMA_MIX_F16_t16 || Opc == AMDGPU::V_FMA_MIX_BF16_t16) {
9862 // All potential lowerings are the same size; arbitrarily pick one.
9863 const MCInstrDesc &Desc = getMCOpcodeFromPseudo(Opcode: AMDGPU::V_FMA_MIXLO_F16);
9864 DescSize = Desc.getSize();
9865 }
9866
9867 return DescSize;
9868 }
9869}
9870
9871bool SIInstrInfo::mayAccessFlatAddressSpace(const MachineInstr &MI) const {
9872 if (!isFLAT(MI))
9873 return false;
9874
9875 if (MI.memoperands_empty())
9876 return true;
9877
9878 for (const MachineMemOperand *MMO : MI.memoperands()) {
9879 if (MMO->getAddrSpace() == AMDGPUAS::FLAT_ADDRESS)
9880 return true;
9881 }
9882 return false;
9883}
9884
9885ArrayRef<std::pair<int, const char *>>
9886SIInstrInfo::getSerializableTargetIndices() const {
9887 static const std::pair<int, const char *> TargetIndices[] = {
9888 {AMDGPU::TI_CONSTDATA_START, "amdgpu-constdata-start"},
9889 {AMDGPU::TI_SCRATCH_RSRC_DWORD0, "amdgpu-scratch-rsrc-dword0"},
9890 {AMDGPU::TI_SCRATCH_RSRC_DWORD1, "amdgpu-scratch-rsrc-dword1"},
9891 {AMDGPU::TI_SCRATCH_RSRC_DWORD2, "amdgpu-scratch-rsrc-dword2"},
9892 {AMDGPU::TI_SCRATCH_RSRC_DWORD3, "amdgpu-scratch-rsrc-dword3"}};
9893 return ArrayRef(TargetIndices);
9894}
9895
9896/// This is used by the post-RA scheduler (PostRASchedulerList.cpp). The
9897/// post-RA version of misched uses CreateTargetMIHazardRecognizer.
9898ScheduleHazardRecognizer *
9899SIInstrInfo::CreateTargetPostRAHazardRecognizer(const InstrItineraryData *II,
9900 const ScheduleDAG *DAG) const {
9901 return new GCNHazardRecognizer(DAG->MF);
9902}
9903
9904/// This is the hazard recognizer used at -O0 by the PostRAHazardRecognizer
9905/// pass.
9906ScheduleHazardRecognizer *
9907SIInstrInfo::CreateTargetPostRAHazardRecognizer(const MachineFunction &MF,
9908 MachineLoopInfo *MLI) const {
9909 return new GCNHazardRecognizer(MF, MLI);
9910}
9911
9912// Called during:
9913// - pre-RA scheduling and post-RA scheduling
9914ScheduleHazardRecognizer *
9915SIInstrInfo::CreateTargetMIHazardRecognizer(const InstrItineraryData *II,
9916 const ScheduleDAGMI *DAG) const {
9917 // Borrowed from Arm Target
9918 // We would like to restrict this hazard recognizer to only
9919 // post-RA scheduling; we can tell that we're post-RA because we don't
9920 // track VRegLiveness.
9921 if (!DAG->hasVRegLiveness())
9922 return new GCNHazardRecognizer(DAG->MF);
9923 return TargetInstrInfo::CreateTargetMIHazardRecognizer(II, DAG);
9924}
9925
9926std::pair<unsigned, unsigned>
9927SIInstrInfo::decomposeMachineOperandsTargetFlags(unsigned TF) const {
9928 return std::pair(TF & MO_MASK, TF & ~MO_MASK);
9929}
9930
9931ArrayRef<std::pair<unsigned, const char *>>
9932SIInstrInfo::getSerializableDirectMachineOperandTargetFlags() const {
9933 static const std::pair<unsigned, const char *> TargetFlags[] = {
9934 {MO_GOTPCREL, "amdgpu-gotprel"},
9935 {MO_GOTPCREL32_LO, "amdgpu-gotprel32-lo"},
9936 {MO_GOTPCREL32_HI, "amdgpu-gotprel32-hi"},
9937 {MO_GOTPCREL64, "amdgpu-gotprel64"},
9938 {MO_REL32_LO, "amdgpu-rel32-lo"},
9939 {MO_REL32_HI, "amdgpu-rel32-hi"},
9940 {MO_REL64, "amdgpu-rel64"},
9941 {MO_ABS32_LO, "amdgpu-abs32-lo"},
9942 {MO_ABS32_HI, "amdgpu-abs32-hi"},
9943 {MO_ABS64, "amdgpu-abs64"},
9944 };
9945
9946 return ArrayRef(TargetFlags);
9947}
9948
9949ArrayRef<std::pair<MachineMemOperand::Flags, const char *>>
9950SIInstrInfo::getSerializableMachineMemOperandTargetFlags() const {
9951 static const std::pair<MachineMemOperand::Flags, const char *> TargetFlags[] =
9952 {
9953 {MONoClobber, "amdgpu-noclobber"},
9954 {MOLastUse, "amdgpu-last-use"},
9955 {MOCooperative, "amdgpu-cooperative"},
9956 {MOThreadPrivate, "amdgpu-thread-private"},
9957 };
9958
9959 return ArrayRef(TargetFlags);
9960}
9961
9962unsigned SIInstrInfo::getLiveRangeSplitOpcode(Register SrcReg,
9963 const MachineFunction &MF) const {
9964 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
9965 assert(SrcReg.isVirtual());
9966 if (MFI->checkFlag(Reg: SrcReg, Flag: AMDGPU::VirtRegFlag::WWM_REG))
9967 return AMDGPU::WWM_COPY;
9968
9969 return AMDGPU::COPY;
9970}
9971
9972bool SIInstrInfo::canAddToBBProlog(const MachineInstr &MI) const {
9973 uint32_t Opcode = MI.getOpcode();
9974 // Check if it is SGPR spill or wwm-register spill Opcode.
9975 if (isSGPRSpill(Opcode) || isWWMRegSpillOpcode(Opcode))
9976 return true;
9977
9978 const MachineFunction *MF = MI.getMF();
9979 const MachineRegisterInfo &MRI = MF->getRegInfo();
9980 const SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
9981
9982 // See if this is a live-range split instruction inserted for an SGPR or
9983 // wwm-register. The implicit defs inserted for wwm-registers should also be
9984 // included as they can appear at the beginning of the BB.
9985 bool IsLRSplitInst = MI.getFlag(Flag: MachineInstr::LRSplit);
9986 if (!IsLRSplitInst && Opcode != AMDGPU::IMPLICIT_DEF)
9987 return false;
9988
9989 Register Reg = MI.getOperand(i: 0).getReg();
9990 if (RI.isSGPRClass(RC: RI.getRegClassForReg(MRI, Reg)))
9991 return IsLRSplitInst;
9992
9993 return MFI->isWWMReg(Reg);
9994}
9995
9996bool SIInstrInfo::isBasicBlockPrologue(const MachineInstr &MI,
9997 Register Reg) const {
9998 // We need to handle instructions which may be inserted during register
9999 // allocation to handle the prolog. The initial prolog instruction may have
10000 // been separated from the start of the block by spills and copies needed
10001 // by the prolog. However, the insertions for scalar registers can always
10002 // be placed at the top of the BB as they are independent of the exec mask
10003 // value.
10004 bool IsNullOrVectorRegister = true;
10005 if (Reg) {
10006 const MachineFunction *MF = MI.getMF();
10007 const MachineRegisterInfo &MRI = MF->getRegInfo();
10008 IsNullOrVectorRegister = !RI.isSGPRClass(RC: RI.getRegClassForReg(MRI, Reg));
10009 }
10010
10011 return IsNullOrVectorRegister &&
10012 (canAddToBBProlog(MI) ||
10013 (!MI.isTerminator() && MI.getOpcode() != AMDGPU::COPY &&
10014 MI.modifiesRegister(Reg: AMDGPU::EXEC, TRI: &RI)));
10015}
10016
10017MachineInstrBuilder
10018SIInstrInfo::getAddNoCarry(MachineBasicBlock &MBB,
10019 MachineBasicBlock::iterator I,
10020 const DebugLoc &DL,
10021 Register DestReg) const {
10022 if (ST.hasAddNoCarryInsts())
10023 return BuildMI(BB&: MBB, I, MIMD: DL, MCID: get(Opcode: AMDGPU::V_ADD_U32_e64), DestReg);
10024
10025 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
10026 Register UnusedCarry = MRI.createVirtualRegister(RegClass: RI.getBoolRC());
10027 MRI.setRegAllocationHint(VReg: UnusedCarry, Type: 0, PrefReg: RI.getVCC());
10028
10029 return BuildMI(BB&: MBB, I, MIMD: DL, MCID: get(Opcode: AMDGPU::V_ADD_CO_U32_e64), DestReg)
10030 .addReg(RegNo: UnusedCarry, Flags: RegState::Define | RegState::Dead);
10031}
10032
10033MachineInstrBuilder SIInstrInfo::getAddNoCarry(MachineBasicBlock &MBB,
10034 MachineBasicBlock::iterator I,
10035 const DebugLoc &DL,
10036 Register DestReg,
10037 RegScavenger &RS) const {
10038 if (ST.hasAddNoCarryInsts())
10039 return BuildMI(BB&: MBB, I, MIMD: DL, MCID: get(Opcode: AMDGPU::V_ADD_U32_e32), DestReg);
10040
10041 // If available, prefer to use vcc.
10042 Register UnusedCarry = !RS.isRegUsed(Reg: AMDGPU::VCC)
10043 ? Register(RI.getVCC())
10044 : RS.scavengeRegisterBackwards(
10045 RC: *RI.getBoolRC(), To: I, /* RestoreAfter */ false,
10046 SPAdj: 0, /* AllowSpill */ false);
10047
10048 // TODO: Users need to deal with this.
10049 if (!UnusedCarry.isValid())
10050 return MachineInstrBuilder();
10051
10052 return BuildMI(BB&: MBB, I, MIMD: DL, MCID: get(Opcode: AMDGPU::V_ADD_CO_U32_e64), DestReg)
10053 .addReg(RegNo: UnusedCarry, Flags: RegState::Define | RegState::Dead);
10054}
10055
10056bool SIInstrInfo::isKillTerminator(unsigned Opcode) {
10057 switch (Opcode) {
10058 case AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR:
10059 case AMDGPU::SI_KILL_I1_TERMINATOR:
10060 return true;
10061 default:
10062 return false;
10063 }
10064}
10065
10066const MCInstrDesc &SIInstrInfo::getKillTerminatorFromPseudo(unsigned Opcode) const {
10067 switch (Opcode) {
10068 case AMDGPU::SI_KILL_F32_COND_IMM_PSEUDO:
10069 return get(Opcode: AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR);
10070 case AMDGPU::SI_KILL_I1_PSEUDO:
10071 return get(Opcode: AMDGPU::SI_KILL_I1_TERMINATOR);
10072 default:
10073 llvm_unreachable("invalid opcode, expected SI_KILL_*_PSEUDO");
10074 }
10075}
10076
10077bool SIInstrInfo::isLegalMUBUFImmOffset(unsigned Imm) const {
10078 return Imm <= getMaxMUBUFImmOffset(ST);
10079}
10080
10081unsigned SIInstrInfo::getMaxMUBUFImmOffset(const GCNSubtarget &ST) {
10082 // The GFX12 field is a 24-bit signed byte offset, but the immediate must be non-negative (23 usable bits).
10083 const unsigned OffsetBits =
10084 ST.getGeneration() >= AMDGPUSubtarget::GFX12 ? 23 : 12;
10085 return (1 << OffsetBits) - 1;
10086}
10087
10088void SIInstrInfo::fixImplicitOperands(MachineInstr &MI) const {
10089 if (!ST.isWave32())
10090 return;
10091
10092 if (MI.isInlineAsm())
10093 return;
10094
10095 if (MI.getNumOperands() < MI.getNumExplicitOperands())
10096 return;
10097
10098 for (auto &Op : MI.implicit_operands()) {
10099 if (Op.isReg() && Op.getReg() == AMDGPU::VCC)
10100 Op.setReg(AMDGPU::VCC_LO);
10101 }
10102}
10103
10104bool SIInstrInfo::isBufferSMRD(const MachineInstr &MI) const {
10105 if (!isSMRD(MI))
10106 return false;
10107
10108 // Check that it is using a buffer resource.
10109 int Idx = AMDGPU::getNamedOperandIdx(Opcode: MI.getOpcode(), Name: AMDGPU::OpName::sbase);
10110 if (Idx == -1) // e.g. s_memtime
10111 return false;
10112
10113 const int16_t RCID = getOpRegClassID(OpInfo: MI.getDesc().operands()[Idx]);
10114 return RI.getRegClass(i: RCID)->hasSubClassEq(RC: &AMDGPU::SGPR_128RegClass);
10115}
10116
10117// Given Imm, split it into the values to put into the SOffset and ImmOffset
10118// fields in an MUBUF instruction. Return false if it is not possible (due to a
10119// hardware bug needing a workaround).
10120//
10121// The required alignment ensures that individual address components remain
10122// aligned if they are aligned to begin with. It also ensures that additional
10123// offsets within the given alignment can be added to the resulting ImmOffset.
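// A worked example (a sketch assuming a pre-GFX12 target, where the maximum
// immediate offset is 4095): splitting Imm = 8192 with Alignment = 4 yields
// ImmOffset = 4 and SOffset = 8188; 8188 has all low bits set apart from the
// alignment bits, and 8188 + 4 = 8192.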
10124bool SIInstrInfo::splitMUBUFOffset(uint32_t Imm, uint32_t &SOffset,
10125 uint32_t &ImmOffset, Align Alignment) const {
10126 const uint32_t MaxOffset = SIInstrInfo::getMaxMUBUFImmOffset(ST);
10127 const uint32_t MaxImm = alignDown(Value: MaxOffset, Align: Alignment.value());
10128 uint32_t Overflow = 0;
10129
10130 if (Imm > MaxImm) {
10131 if (Imm <= MaxImm + 64) {
10132 // Use an SOffset inline constant for 4..64
10133 Overflow = Imm - MaxImm;
10134 Imm = MaxImm;
10135 } else {
10136 // Try to keep the same value in SOffset for adjacent loads, so that
10137 // the corresponding register contents can be re-used.
10138 //
10139 // Load values with all low-bits (except for alignment bits) set into
10140 // SOffset, so that a larger range of values can be covered using
10141 // s_movk_i32.
10142 //
10143 // Atomic operations fail to work correctly when individual address
10144 // components are unaligned, even if their sum is aligned.
10145 uint32_t High = (Imm + Alignment.value()) & ~MaxOffset;
10146 uint32_t Low = (Imm + Alignment.value()) & MaxOffset;
10147 Imm = Low;
10148 Overflow = High - Alignment.value();
10149 }
10150 }
10151
10152 if (Overflow > 0) {
10153 // There is a hardware bug in SI and CI which prevents address clamping in
10154 // MUBUF instructions from working correctly with SOffsets. The immediate
10155 // offset is unaffected.
10156 if (ST.getGeneration() <= AMDGPUSubtarget::SEA_ISLANDS)
10157 return false;
10158
10159 // It is not possible to set an immediate in the SOffset field on some targets.
10160 if (ST.hasRestrictedSOffset())
10161 return false;
10162 }
10163
10164 ImmOffset = Imm;
10165 SOffset = Overflow;
10166 return true;
10167}
10168
10169// Depending on the used address space and instructions, some immediate offsets
10170// are allowed and some are not.
10171 // Pre-GFX12, flat instruction offsets can only be non-negative; global and
10172 // scratch instruction offsets can also be negative. On GFX12, offsets can be
10173 // negative for all variants.
10174//
10175// There are several bugs related to these offsets:
10176// On gfx10.1, flat instructions that go into the global address space cannot
10177// use an offset.
10178//
10179// For scratch instructions, the address can be either an SGPR or a VGPR.
10180// The following offsets can be used, depending on the architecture (x means
10181// cannot be used):
10182// +----------------------------+------+------+
10183// | Address-Mode | SGPR | VGPR |
10184// +----------------------------+------+------+
10185// | gfx9 | | |
10186// | negative, 4-aligned offset | x | ok |
10187// | negative, unaligned offset | x | ok |
10188// +----------------------------+------+------+
10189// | gfx10 | | |
10190// | negative, 4-aligned offset | ok | ok |
10191// | negative, unaligned offset | ok | x |
10192// +----------------------------+------+------+
10193// | gfx10.3 | | |
10194// | negative, 4-aligned offset | ok | ok |
10195// | negative, unaligned offset | ok | ok |
10196// +----------------------------+------+------+
10197//
10198// This function ignores the addressing mode, so if an offset cannot be used in
10199// one addressing mode, it is considered illegal.
10200bool SIInstrInfo::isLegalFLATOffset(int64_t Offset, unsigned AddrSpace,
10201 uint64_t FlatVariant) const {
10202 // TODO: Should 0 be special cased?
10203 if (!ST.hasFlatInstOffsets())
10204 return false;
10205
10206 if (ST.hasFlatSegmentOffsetBug() && FlatVariant == SIInstrFlags::FLAT &&
10207 (AddrSpace == AMDGPUAS::FLAT_ADDRESS ||
10208 AddrSpace == AMDGPUAS::GLOBAL_ADDRESS))
10209 return false;
10210
10211 if (ST.hasNegativeUnalignedScratchOffsetBug() &&
10212 FlatVariant == SIInstrFlags::FlatScratch && Offset < 0 &&
10213 (Offset % 4) != 0) {
10214 return false;
10215 }
10216
10217 bool AllowNegative = allowNegativeFlatOffset(FlatVariant);
10218 unsigned N = AMDGPU::getNumFlatOffsetBits(ST);
10219 return isIntN(N, x: Offset) && (AllowNegative || Offset >= 0);
10220}
10221
10222// See comment on SIInstrInfo::isLegalFLATOffset for what is and is not legal.
10223std::pair<int64_t, int64_t>
10224SIInstrInfo::splitFlatOffset(int64_t COffsetVal, unsigned AddrSpace,
10225 uint64_t FlatVariant) const {
10226 int64_t RemainderOffset = COffsetVal;
10227 int64_t ImmField = 0;
10228
10229 bool AllowNegative = allowNegativeFlatOffset(FlatVariant);
10230 const unsigned NumBits = AMDGPU::getNumFlatOffsetBits(ST) - 1;
10231
10232 if (AllowNegative) {
10233 // Use signed division by a power of two to truncate towards 0.
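    //
    // For illustration only (assuming NumBits == 12, i.e. D == 4096):
    // COffsetVal == -5000 gives RemainderOffset == -4096 and ImmField == -904;
    // the two parts sum back to -5000 and ImmField stays within the signed
    // immediate range.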
10234 int64_t D = 1LL << NumBits;
10235 RemainderOffset = (COffsetVal / D) * D;
10236 ImmField = COffsetVal - RemainderOffset;
10237
10238 if (ST.hasNegativeUnalignedScratchOffsetBug() &&
10239 FlatVariant == SIInstrFlags::FlatScratch && ImmField < 0 &&
10240 (ImmField % 4) != 0) {
10241 // Make ImmField a multiple of 4
10242 RemainderOffset += ImmField % 4;
10243 ImmField -= ImmField % 4;
10244 }
10245 } else if (COffsetVal >= 0) {
10246 ImmField = COffsetVal & maskTrailingOnes<uint64_t>(N: NumBits);
10247 RemainderOffset = COffsetVal - ImmField;
10248 }
10249
10250 assert(isLegalFLATOffset(ImmField, AddrSpace, FlatVariant));
10251 assert(RemainderOffset + ImmField == COffsetVal);
10252 return {ImmField, RemainderOffset};
10253}
10254
10255bool SIInstrInfo::allowNegativeFlatOffset(uint64_t FlatVariant) const {
10256 if (ST.hasNegativeScratchOffsetBug() &&
10257 FlatVariant == SIInstrFlags::FlatScratch)
10258 return false;
10259
10260 return FlatVariant != SIInstrFlags::FLAT || AMDGPU::isGFX12Plus(STI: ST);
10261}
10262
10263static unsigned subtargetEncodingFamily(const GCNSubtarget &ST) {
10264 switch (ST.getGeneration()) {
10265 default:
10266 break;
10267 case AMDGPUSubtarget::SOUTHERN_ISLANDS:
10268 case AMDGPUSubtarget::SEA_ISLANDS:
10269 return SIEncodingFamily::SI;
10270 case AMDGPUSubtarget::VOLCANIC_ISLANDS:
10271 case AMDGPUSubtarget::GFX9:
10272 return SIEncodingFamily::VI;
10273 case AMDGPUSubtarget::GFX10:
10274 return SIEncodingFamily::GFX10;
10275 case AMDGPUSubtarget::GFX11:
10276 return ST.hasGFX11_7Insts() ? SIEncodingFamily::GFX1170
10277 : SIEncodingFamily::GFX11;
10278 case AMDGPUSubtarget::GFX12:
10279 return ST.hasGFX1250Insts() ? SIEncodingFamily::GFX1250
10280 : SIEncodingFamily::GFX12;
10281 case AMDGPUSubtarget::GFX13:
10282 return SIEncodingFamily::GFX13;
10283 }
10284 llvm_unreachable("Unknown subtarget generation!");
10285}
10286
10287bool SIInstrInfo::isAsmOnlyOpcode(int MCOp) const {
10288 switch(MCOp) {
10289  // These opcodes use indirect register addressing, so
10290  // they need special handling by codegen (currently missing).
10291  // Therefore it is too risky to allow these opcodes
10292  // to be selected by the DPP combiner or the SDWA peephole pass.
10293 case AMDGPU::V_MOVRELS_B32_dpp_gfx10:
10294 case AMDGPU::V_MOVRELS_B32_sdwa_gfx10:
10295 case AMDGPU::V_MOVRELD_B32_dpp_gfx10:
10296 case AMDGPU::V_MOVRELD_B32_sdwa_gfx10:
10297 case AMDGPU::V_MOVRELSD_B32_dpp_gfx10:
10298 case AMDGPU::V_MOVRELSD_B32_sdwa_gfx10:
10299 case AMDGPU::V_MOVRELSD_2_B32_dpp_gfx10:
10300 case AMDGPU::V_MOVRELSD_2_B32_sdwa_gfx10:
10301 return true;
10302 default:
10303 return false;
10304 }
10305}
10306
10307#define GENERATE_RENAMED_GFX9_CASES(OPCODE) \
10308 case OPCODE##_dpp: \
10309 case OPCODE##_e32: \
10310 case OPCODE##_e64: \
10311 case OPCODE##_e64_dpp: \
10312 case OPCODE##_sdwa:
10313
10314static bool isRenamedInGFX9(int Opcode) {
10315 switch (Opcode) {
10316 GENERATE_RENAMED_GFX9_CASES(AMDGPU::V_ADDC_U32)
10317 GENERATE_RENAMED_GFX9_CASES(AMDGPU::V_ADD_CO_U32)
10318 GENERATE_RENAMED_GFX9_CASES(AMDGPU::V_ADD_U32)
10319 GENERATE_RENAMED_GFX9_CASES(AMDGPU::V_SUBBREV_U32)
10320 GENERATE_RENAMED_GFX9_CASES(AMDGPU::V_SUBB_U32)
10321 GENERATE_RENAMED_GFX9_CASES(AMDGPU::V_SUBREV_CO_U32)
10322 GENERATE_RENAMED_GFX9_CASES(AMDGPU::V_SUBREV_U32)
10323 GENERATE_RENAMED_GFX9_CASES(AMDGPU::V_SUB_CO_U32)
10324 GENERATE_RENAMED_GFX9_CASES(AMDGPU::V_SUB_U32)
10325 //
10326 case AMDGPU::V_DIV_FIXUP_F16_gfx9_e64:
10327 case AMDGPU::V_DIV_FIXUP_F16_gfx9_fake16_e64:
10328 case AMDGPU::V_FMA_F16_gfx9_e64:
10329 case AMDGPU::V_FMA_F16_gfx9_fake16_e64:
10330 case AMDGPU::V_INTERP_P2_F16:
10331 case AMDGPU::V_MAD_F16_e64:
10332 case AMDGPU::V_MAD_U16_e64:
10333 case AMDGPU::V_MAD_I16_e64:
10334 return true;
10335 default:
10336 return false;
10337 }
10338}
10339
10340int SIInstrInfo::pseudoToMCOpcode(int Opcode) const {
10341 assert(Opcode == (int)SIInstrInfo::getNonSoftWaitcntOpcode(Opcode) &&
10342 "SIInsertWaitcnts should have promoted soft waitcnt instructions!");
10343
10344 unsigned Gen = subtargetEncodingFamily(ST);
10345
10346 if (ST.getGeneration() == AMDGPUSubtarget::GFX9 && isRenamedInGFX9(Opcode))
10347 Gen = SIEncodingFamily::GFX9;
10348
10349  // Adjust the encoding family to GFX80 for D16 buffer instructions when the
10350  // subtarget has the UnpackedD16VMem feature.
10351  // TODO: remove this when we discard the GFX80 encoding.
10352 if (ST.hasUnpackedD16VMem() && (get(Opcode).TSFlags & SIInstrFlags::D16Buf))
10353 Gen = SIEncodingFamily::GFX80;
10354
10355 if (get(Opcode).TSFlags & SIInstrFlags::SDWA) {
10356 switch (ST.getGeneration()) {
10357 default:
10358 Gen = SIEncodingFamily::SDWA;
10359 break;
10360 case AMDGPUSubtarget::GFX9:
10361 Gen = SIEncodingFamily::SDWA9;
10362 break;
10363 case AMDGPUSubtarget::GFX10:
10364 Gen = SIEncodingFamily::SDWA10;
10365 break;
10366 }
10367 }
10368
10369 if (isMAI(Opcode)) {
10370 int MFMAOp = AMDGPU::getMFMAEarlyClobberOp(Opcode);
10371 if (MFMAOp != -1)
10372 Opcode = MFMAOp;
10373 }
10374
10375 int32_t MCOp = AMDGPU::getMCOpcode(Opcode, Gen);
10376
10377 if (MCOp == AMDGPU::INSTRUCTION_LIST_END && ST.hasGFX11_7Insts())
10378 MCOp = AMDGPU::getMCOpcode(Opcode, Gen: SIEncodingFamily::GFX11);
10379
10380 if (MCOp == AMDGPU::INSTRUCTION_LIST_END && ST.hasGFX1250Insts())
10381 MCOp = AMDGPU::getMCOpcode(Opcode, Gen: SIEncodingFamily::GFX12);
10382
10383 // -1 means that Opcode is already a native instruction.
10384 if (MCOp == -1)
10385 return Opcode;
10386
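  // On GFX90A-based subtargets, prefer the GFX940 encoding when available,
  // then fall back to GFX90A and finally to the plain GFX9 encoding.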
10387 if (ST.hasGFX90AInsts()) {
10388 uint32_t NMCOp = AMDGPU::INSTRUCTION_LIST_END;
10389 if (ST.hasGFX940Insts())
10390 NMCOp = AMDGPU::getMCOpcode(Opcode, Gen: SIEncodingFamily::GFX940);
10391 if (NMCOp == AMDGPU::INSTRUCTION_LIST_END)
10392 NMCOp = AMDGPU::getMCOpcode(Opcode, Gen: SIEncodingFamily::GFX90A);
10393 if (NMCOp == AMDGPU::INSTRUCTION_LIST_END)
10394 NMCOp = AMDGPU::getMCOpcode(Opcode, Gen: SIEncodingFamily::GFX9);
10395 if (NMCOp != AMDGPU::INSTRUCTION_LIST_END)
10396 MCOp = NMCOp;
10397 }
10398
10399 // INSTRUCTION_LIST_END means that Opcode is a pseudo instruction that has no
10400 // encoding in the given subtarget generation.
10401 if (MCOp == AMDGPU::INSTRUCTION_LIST_END)
10402 return -1;
10403
10404 if (isAsmOnlyOpcode(MCOp))
10405 return -1;
10406
10407 return MCOp;
10408}
10409
10410static
10411TargetInstrInfo::RegSubRegPair getRegOrUndef(const MachineOperand &RegOpnd) {
10412 assert(RegOpnd.isReg());
10413 return RegOpnd.isUndef() ? TargetInstrInfo::RegSubRegPair() :
10414 getRegSubRegPair(O: RegOpnd);
10415}
10416
10417TargetInstrInfo::RegSubRegPair
10418llvm::getRegSequenceSubReg(MachineInstr &MI, unsigned SubReg) {
10419 assert(MI.isRegSequence());
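  // REG_SEQUENCE operands come in (register, subreg-index) pairs after the
  // destination operand, hence the index arithmetic below.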
10420 for (unsigned I = 0, E = (MI.getNumOperands() - 1)/ 2; I < E; ++I)
10421 if (MI.getOperand(i: 1 + 2 * I + 1).getImm() == SubReg) {
10422 auto &RegOp = MI.getOperand(i: 1 + 2 * I);
10423 return getRegOrUndef(RegOpnd: RegOp);
10424 }
10425 return TargetInstrInfo::RegSubRegPair();
10426}
10427
10428// Try to find the definition of reg:subreg in subreg-manipulation pseudos
10429// Following a subreg of reg:subreg isn't supported
10430static bool followSubRegDef(MachineInstr &MI,
10431 TargetInstrInfo::RegSubRegPair &RSR) {
10432 if (!RSR.SubReg)
10433 return false;
10434 switch (MI.getOpcode()) {
10435 default: break;
10436 case AMDGPU::REG_SEQUENCE:
10437 RSR = getRegSequenceSubReg(MI, SubReg: RSR.SubReg);
10438 return true;
10439  // EXTRACT_SUBREG isn't supported as this would follow a subreg of a subreg
10440 case AMDGPU::INSERT_SUBREG:
10441 if (RSR.SubReg == (unsigned)MI.getOperand(i: 3).getImm())
10442 // inserted the subreg we're looking for
10443 RSR = getRegOrUndef(RegOpnd: MI.getOperand(i: 2));
10444 else { // the subreg in the rest of the reg
10445 auto R1 = getRegOrUndef(RegOpnd: MI.getOperand(i: 1));
10446 if (R1.SubReg) // subreg of subreg isn't supported
10447 return false;
10448 RSR.Reg = R1.Reg;
10449 }
10450 return true;
10451 }
10452 return false;
10453}
10454
10455MachineInstr *llvm::getVRegSubRegDef(const TargetInstrInfo::RegSubRegPair &P,
10456 const MachineRegisterInfo &MRI) {
10457 assert(MRI.isSSA());
10458 if (!P.Reg.isVirtual())
10459 return nullptr;
10460
10461 auto RSR = P;
10462 auto *DefInst = MRI.getVRegDef(Reg: RSR.Reg);
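  // Walk the def chain through COPY/V_MOV and the subreg-manipulation pseudos
  // handled by followSubRegDef until the real defining instruction is found.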
10463 while (auto *MI = DefInst) {
10464 DefInst = nullptr;
10465 switch (MI->getOpcode()) {
10466 case AMDGPU::COPY:
10467 case AMDGPU::V_MOV_B32_e32: {
10468 auto &Op1 = MI->getOperand(i: 1);
10469 if (Op1.isReg() && Op1.getReg().isVirtual()) {
10470 if (Op1.isUndef())
10471 return nullptr;
10472 RSR = getRegSubRegPair(O: Op1);
10473 DefInst = MRI.getVRegDef(Reg: RSR.Reg);
10474 }
10475 break;
10476 }
10477 default:
10478 if (followSubRegDef(MI&: *MI, RSR)) {
10479 if (!RSR.Reg)
10480 return nullptr;
10481 DefInst = MRI.getVRegDef(Reg: RSR.Reg);
10482 }
10483 }
10484 if (!DefInst)
10485 return MI;
10486 }
10487 return nullptr;
10488}
10489
10490bool llvm::execMayBeModifiedBeforeUse(const MachineRegisterInfo &MRI,
10491 Register VReg,
10492 const MachineInstr &DefMI,
10493 const MachineInstr &UseMI) {
10494 assert(MRI.isSSA() && "Must be run on SSA");
10495
10496 auto *TRI = MRI.getTargetRegisterInfo();
10497 auto *DefBB = DefMI.getParent();
10498
10499 // Don't bother searching between blocks, although it is possible this block
10500 // doesn't modify exec.
10501 if (UseMI.getParent() != DefBB)
10502 return true;
10503
10504 const int MaxInstScan = 20;
10505 int NumInst = 0;
10506
10507 // Stop scan at the use.
10508 auto E = UseMI.getIterator();
10509 for (auto I = std::next(x: DefMI.getIterator()); I != E; ++I) {
10510 if (I->isDebugInstr())
10511 continue;
10512
10513 if (++NumInst > MaxInstScan)
10514 return true;
10515
10516 if (I->modifiesRegister(Reg: AMDGPU::EXEC, TRI))
10517 return true;
10518 }
10519
10520 return false;
10521}
10522
10523bool llvm::execMayBeModifiedBeforeAnyUse(const MachineRegisterInfo &MRI,
10524 Register VReg,
10525 const MachineInstr &DefMI) {
10526 assert(MRI.isSSA() && "Must be run on SSA");
10527
10528 auto *TRI = MRI.getTargetRegisterInfo();
10529 auto *DefBB = DefMI.getParent();
10530
10531 const int MaxUseScan = 10;
10532 int NumUse = 0;
10533
10534 for (auto &Use : MRI.use_nodbg_operands(Reg: VReg)) {
10535 auto &UseInst = *Use.getParent();
10536 // Don't bother searching between blocks, although it is possible this block
10537 // doesn't modify exec.
10538 if (UseInst.getParent() != DefBB || UseInst.isPHI())
10539 return true;
10540
10541 if (++NumUse > MaxUseScan)
10542 return true;
10543 }
10544
10545 if (NumUse == 0)
10546 return false;
10547
10548 const int MaxInstScan = 20;
10549 int NumInst = 0;
10550
10551 // Stop scan when we have seen all the uses.
10552 for (auto I = std::next(x: DefMI.getIterator()); ; ++I) {
10553 assert(I != DefBB->end());
10554
10555 if (I->isDebugInstr())
10556 continue;
10557
10558 if (++NumInst > MaxInstScan)
10559 return true;
10560
10561 for (const MachineOperand &Op : I->operands()) {
10562 // We don't check reg masks here as they're used only on calls:
10563 // 1. EXEC is only considered const within one BB
10564 // 2. Call should be a terminator instruction if present in a BB
10565
10566 if (!Op.isReg())
10567 continue;
10568
10569 Register Reg = Op.getReg();
10570 if (Op.isUse()) {
10571 if (Reg == VReg && --NumUse == 0)
10572 return false;
10573 } else if (TRI->regsOverlap(RegA: Reg, RegB: AMDGPU::EXEC))
10574 return true;
10575 }
10576 }
10577}
10578
10579MachineInstr *SIInstrInfo::createPHIDestinationCopy(
10580 MachineBasicBlock &MBB, MachineBasicBlock::iterator LastPHIIt,
10581 const DebugLoc &DL, Register Src, Register Dst) const {
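  // If some non-PHI instruction reads Dst before LastPHIIt, place the copy
  // right before that reader rather than after all the PHIs.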
10582 auto Cur = MBB.begin();
10583 if (Cur != MBB.end())
10584 do {
10585 if (!Cur->isPHI() && Cur->readsRegister(Reg: Dst, /*TRI=*/nullptr))
10586 return BuildMI(BB&: MBB, I: Cur, MIMD: DL, MCID: get(Opcode: TargetOpcode::COPY), DestReg: Dst).addReg(RegNo: Src);
10587 ++Cur;
10588 } while (Cur != MBB.end() && Cur != LastPHIIt);
10589
10590 return TargetInstrInfo::createPHIDestinationCopy(MBB, InsPt: LastPHIIt, DL, Src,
10591 Dst);
10592}
10593
10594MachineInstr *SIInstrInfo::createPHISourceCopy(
10595 MachineBasicBlock &MBB, MachineBasicBlock::iterator InsPt,
10596 const DebugLoc &DL, Register Src, unsigned SrcSubReg, Register Dst) const {
10597 if (InsPt != MBB.end() &&
10598 (InsPt->getOpcode() == AMDGPU::SI_IF ||
10599 InsPt->getOpcode() == AMDGPU::SI_ELSE ||
10600 InsPt->getOpcode() == AMDGPU::SI_IF_BREAK) &&
10601 InsPt->definesRegister(Reg: Src, /*TRI=*/nullptr)) {
10602 InsPt++;
10603 return BuildMI(BB&: MBB, I: InsPt, MIMD: DL,
10604 MCID: get(Opcode: AMDGPU::LaneMaskConstants::get(ST).MovTermOpc), DestReg: Dst)
10605 .addReg(RegNo: Src, Flags: {}, SubReg: SrcSubReg)
10606 .addReg(RegNo: AMDGPU::EXEC, Flags: RegState::Implicit);
10607 }
10608 return TargetInstrInfo::createPHISourceCopy(MBB, InsPt, DL, Src, SrcSubReg,
10609 Dst);
10610}
10611
10612bool llvm::SIInstrInfo::isWave32() const { return ST.isWave32(); }
10613
10614MachineInstr *SIInstrInfo::foldMemoryOperandImpl(
10615 MachineFunction &MF, MachineInstr &MI, ArrayRef<unsigned> Ops,
10616 MachineBasicBlock::iterator InsertPt, int FrameIndex, LiveIntervals *LIS,
10617 VirtRegMap *VRM) const {
10618 // This is a bit of a hack (copied from AArch64). Consider this instruction:
10619 //
10620 // %0:sreg_32 = COPY $m0
10621 //
10622 // We explicitly chose SReg_32 for the virtual register so such a copy might
10623 // be eliminated by RegisterCoalescer. However, that may not be possible, and
10624 // %0 may even spill. We can't spill $m0 normally (it would require copying to
10625 // a numbered SGPR anyway), and since it is in the SReg_32 register class,
10626 // TargetInstrInfo::foldMemoryOperand() is going to try.
10627 // A similar issue also exists with spilling and reloading $exec registers.
10628 //
10629 // To prevent that, constrain the %0 register class here.
10630 if (isFullCopyInstr(MI)) {
10631 Register DstReg = MI.getOperand(i: 0).getReg();
10632 Register SrcReg = MI.getOperand(i: 1).getReg();
10633 if ((DstReg.isVirtual() || SrcReg.isVirtual()) &&
10634 (DstReg.isVirtual() != SrcReg.isVirtual())) {
10635 MachineRegisterInfo &MRI = MF.getRegInfo();
10636 Register VirtReg = DstReg.isVirtual() ? DstReg : SrcReg;
10637 const TargetRegisterClass *RC = MRI.getRegClass(Reg: VirtReg);
10638 if (RC->hasSuperClassEq(RC: &AMDGPU::SReg_32RegClass)) {
10639 MRI.constrainRegClass(Reg: VirtReg, RC: &AMDGPU::SReg_32_XM0_XEXECRegClass);
10640 return nullptr;
10641 }
10642 if (RC->hasSuperClassEq(RC: &AMDGPU::SReg_64RegClass)) {
10643 MRI.constrainRegClass(Reg: VirtReg, RC: &AMDGPU::SReg_64_XEXECRegClass);
10644 return nullptr;
10645 }
10646 }
10647 }
10648
10649 return nullptr;
10650}
10651
10652unsigned SIInstrInfo::getInstrLatency(const InstrItineraryData *ItinData,
10653 const MachineInstr &MI,
10654 unsigned *PredCost) const {
10655 if (MI.isBundle()) {
10656 MachineBasicBlock::const_instr_iterator I(MI.getIterator());
10657 MachineBasicBlock::const_instr_iterator E(MI.getParent()->instr_end());
10658 unsigned Lat = 0, Count = 0;
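    // A bundle's latency is estimated as the latency of its longest member
    // plus one cycle for each additional bundled instruction.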
10659 for (++I; I != E && I->isBundledWithPred(); ++I) {
10660 ++Count;
10661 Lat = std::max(a: Lat, b: SchedModel.computeInstrLatency(MI: &*I));
10662 }
10663 return Lat + Count - 1;
10664 }
10665
10666 return SchedModel.computeInstrLatency(MI: &MI);
10667}
10668
10669const MachineOperand &
10670SIInstrInfo::getCalleeOperand(const MachineInstr &MI) const {
10671 if (const MachineOperand *CallAddrOp =
10672 getNamedOperand(MI, OperandName: AMDGPU::OpName::src0))
10673 return *CallAddrOp;
10674 return TargetInstrInfo::getCalleeOperand(MI);
10675}
10676
10677InstructionUniformity
10678SIInstrInfo::getGenericInstructionUniformity(const MachineInstr &MI) const {
10679 const MachineRegisterInfo &MRI = MI.getMF()->getRegInfo();
10680 unsigned Opcode = MI.getOpcode();
10681
10682 auto HandleAddrSpaceCast = [this, &MRI](const MachineInstr &MI) {
10683 Register Dst = MI.getOperand(i: 0).getReg();
10684 Register Src = isa<GIntrinsic>(Val: MI) ? MI.getOperand(i: 2).getReg()
10685 : MI.getOperand(i: 1).getReg();
10686 LLT DstTy = MRI.getType(Reg: Dst);
10687 LLT SrcTy = MRI.getType(Reg: Src);
10688 unsigned DstAS = DstTy.getAddressSpace();
10689 unsigned SrcAS = SrcTy.getAddressSpace();
10690 return SrcAS == AMDGPUAS::PRIVATE_ADDRESS &&
10691 DstAS == AMDGPUAS::FLAT_ADDRESS &&
10692 ST.hasGloballyAddressableScratch()
10693 ? InstructionUniformity::NeverUniform
10694 : InstructionUniformity::Default;
10695 };
10696
10697  // If the target supports globally addressable scratch, the mapping from
10698  // scratch memory to the flat aperture changes, and therefore an address
10699  // space cast is no longer uniform.
10700 if (Opcode == TargetOpcode::G_ADDRSPACE_CAST)
10701 return HandleAddrSpaceCast(MI);
10702
10703 if (auto *GI = dyn_cast<GIntrinsic>(Val: &MI)) {
10704 auto IID = GI->getIntrinsicID();
10705 if (AMDGPU::isIntrinsicSourceOfDivergence(IntrID: IID))
10706 return InstructionUniformity::NeverUniform;
10707 if (AMDGPU::isIntrinsicAlwaysUniform(IntrID: IID))
10708 return InstructionUniformity::AlwaysUniform;
10709
10710 switch (IID) {
10711 case Intrinsic::amdgcn_addrspacecast_nonnull:
10712 return HandleAddrSpaceCast(MI);
10713 case Intrinsic::amdgcn_if:
10714 case Intrinsic::amdgcn_else:
10715 // FIXME: Uniform if second result
10716 break;
10717 }
10718
10719 return InstructionUniformity::Default;
10720 }
10721
10722 // Loads from the private and flat address spaces are divergent, because
10723 // threads can execute the load instruction with the same inputs and get
10724 // different results.
10725 //
10726 // All other loads are not divergent, because if threads issue loads with the
10727 // same arguments, they will always get the same result.
10728 if (Opcode == AMDGPU::G_LOAD || Opcode == AMDGPU::G_ZEXTLOAD ||
10729 Opcode == AMDGPU::G_SEXTLOAD) {
10730 if (MI.memoperands_empty())
10731 return InstructionUniformity::NeverUniform; // conservative assumption
10732
10733 if (llvm::any_of(Range: MI.memoperands(), P: [](const MachineMemOperand *mmo) {
10734 return mmo->getAddrSpace() == AMDGPUAS::PRIVATE_ADDRESS ||
10735 mmo->getAddrSpace() == AMDGPUAS::FLAT_ADDRESS;
10736 })) {
10737 // At least one MMO in a non-global address space.
10738 return InstructionUniformity::NeverUniform;
10739 }
10740 return InstructionUniformity::Default;
10741 }
10742
10743 if (SIInstrInfo::isGenericAtomicRMWOpcode(Opc: Opcode) ||
10744 Opcode == AMDGPU::G_ATOMIC_CMPXCHG ||
10745 Opcode == AMDGPU::G_ATOMIC_CMPXCHG_WITH_SUCCESS ||
10746 AMDGPU::isGenericAtomic(Opc: Opcode)) {
10747 return InstructionUniformity::NeverUniform;
10748 }
10749 return InstructionUniformity::Default;
10750}
10751
10752const MIRFormatter *SIInstrInfo::getMIRFormatter() const {
10753 if (!Formatter)
10754 Formatter = std::make_unique<AMDGPUMIRFormatter>(args: ST);
10755 return Formatter.get();
10756}
10757
10758InstructionUniformity
10759SIInstrInfo::getInstructionUniformity(const MachineInstr &MI) const {
10760
10761 if (isNeverUniform(MI))
10762 return InstructionUniformity::NeverUniform;
10763
10764 unsigned opcode = MI.getOpcode();
10765 if (opcode == AMDGPU::V_READLANE_B32 ||
10766 opcode == AMDGPU::V_READFIRSTLANE_B32 ||
10767 opcode == AMDGPU::SI_RESTORE_S32_FROM_VGPR)
10768 return InstructionUniformity::AlwaysUniform;
10769
10770 if (isCopyInstr(MI)) {
10771 const MachineOperand &srcOp = MI.getOperand(i: 1);
10772 if (srcOp.isReg() && srcOp.getReg().isPhysical()) {
10773 const TargetRegisterClass *regClass =
10774 RI.getPhysRegBaseClass(Reg: srcOp.getReg());
10775 return RI.isSGPRClass(RC: regClass) ? InstructionUniformity::AlwaysUniform
10776 : InstructionUniformity::NeverUniform;
10777 }
10778 return InstructionUniformity::Default;
10779 }
10780
10781 // GMIR handling
10782 if (MI.isPreISelOpcode())
10783 return SIInstrInfo::getGenericInstructionUniformity(MI);
10784
10785  // Atomics are divergent because they are executed sequentially: when an
10786  // atomic operation refers to the same address in each thread, each thread
10787  // after the first sees the value written by the previous thread as its
10788  // original value.
10789
10790 if (isAtomic(MI))
10791 return InstructionUniformity::NeverUniform;
10792
10793 // Loads from the private and flat address spaces are divergent, because
10794 // threads can execute the load instruction with the same inputs and get
10795 // different results.
10796 if (isFLAT(MI) && MI.mayLoad()) {
10797 if (MI.memoperands_empty())
10798 return InstructionUniformity::NeverUniform; // conservative assumption
10799
10800 if (llvm::any_of(Range: MI.memoperands(), P: [](const MachineMemOperand *mmo) {
10801 return mmo->getAddrSpace() == AMDGPUAS::PRIVATE_ADDRESS ||
10802 mmo->getAddrSpace() == AMDGPUAS::FLAT_ADDRESS;
10803 })) {
10804 // At least one MMO in a non-global address space.
10805 return InstructionUniformity::NeverUniform;
10806 }
10807
10808 return InstructionUniformity::Default;
10809 }
10810
10811 const MachineRegisterInfo &MRI = MI.getMF()->getRegInfo();
10812 const AMDGPURegisterBankInfo *RBI = ST.getRegBankInfo();
10813
10814 // FIXME: It's conceptually broken to report this for an instruction, and not
10815 // a specific def operand. For inline asm in particular, there could be mixed
10816 // uniform and divergent results.
10817 for (unsigned I = 0, E = MI.getNumOperands(); I != E; ++I) {
10818 const MachineOperand &SrcOp = MI.getOperand(i: I);
10819 if (!SrcOp.isReg())
10820 continue;
10821
10822 Register Reg = SrcOp.getReg();
10823 if (!Reg || !SrcOp.readsReg())
10824 continue;
10825
10826 // If RegBank is null, this is unassigned or an unallocatable special
10827 // register, which are all scalars.
10828 const RegisterBank *RegBank = RBI->getRegBank(Reg, MRI, TRI: RI);
10829 if (RegBank && RegBank->getID() != AMDGPU::SGPRRegBankID)
10830 return InstructionUniformity::NeverUniform;
10831 }
10832
10833  // TODO: Uniformity check conditions above can be rearranged for more
10834  // readability.
10835
10836 // TODO: amdgcn.{ballot, [if]cmp} should be AlwaysUniform, but they are
10837 // currently turned into no-op COPYs by SelectionDAG ISel and are
10838 // therefore no longer recognizable.
10839
10840 return InstructionUniformity::Default;
10841}
10842
10843unsigned SIInstrInfo::getDSShaderTypeValue(const MachineFunction &MF) {
10844 switch (MF.getFunction().getCallingConv()) {
10845 case CallingConv::AMDGPU_PS:
10846 return 1;
10847 case CallingConv::AMDGPU_VS:
10848 return 2;
10849 case CallingConv::AMDGPU_GS:
10850 return 3;
10851 case CallingConv::AMDGPU_HS:
10852 case CallingConv::AMDGPU_LS:
10853 case CallingConv::AMDGPU_ES: {
10854 const Function &F = MF.getFunction();
10855 F.getContext().diagnose(DI: DiagnosticInfoUnsupported(
10856 F, "ds_ordered_count unsupported for this calling conv"));
10857 [[fallthrough]];
10858 }
10859 case CallingConv::AMDGPU_CS:
10860 case CallingConv::AMDGPU_KERNEL:
10861 case CallingConv::C:
10862 case CallingConv::Fast:
10863 default:
10864 // Assume other calling conventions are various compute callable functions
10865 return 0;
10866 }
10867}
10868
10869bool SIInstrInfo::analyzeCompare(const MachineInstr &MI, Register &SrcReg,
10870 Register &SrcReg2, int64_t &CmpMask,
10871 int64_t &CmpValue) const {
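  // S_CMP* / S_CMPK* have no explicit def: operands 0 and 1 are the two
  // sources and SCC is written implicitly.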
10872 if (!MI.getOperand(i: 0).isReg() || MI.getOperand(i: 0).getSubReg())
10873 return false;
10874
10875 switch (MI.getOpcode()) {
10876 default:
10877 break;
10878 case AMDGPU::S_CMP_EQ_U32:
10879 case AMDGPU::S_CMP_EQ_I32:
10880 case AMDGPU::S_CMP_LG_U32:
10881 case AMDGPU::S_CMP_LG_I32:
10882 case AMDGPU::S_CMP_LT_U32:
10883 case AMDGPU::S_CMP_LT_I32:
10884 case AMDGPU::S_CMP_GT_U32:
10885 case AMDGPU::S_CMP_GT_I32:
10886 case AMDGPU::S_CMP_LE_U32:
10887 case AMDGPU::S_CMP_LE_I32:
10888 case AMDGPU::S_CMP_GE_U32:
10889 case AMDGPU::S_CMP_GE_I32:
10890 case AMDGPU::S_CMP_EQ_U64:
10891 case AMDGPU::S_CMP_LG_U64:
10892 SrcReg = MI.getOperand(i: 0).getReg();
10893 if (MI.getOperand(i: 1).isReg()) {
10894 if (MI.getOperand(i: 1).getSubReg())
10895 return false;
10896 SrcReg2 = MI.getOperand(i: 1).getReg();
10897 CmpValue = 0;
10898 } else if (MI.getOperand(i: 1).isImm()) {
10899 SrcReg2 = Register();
10900 CmpValue = MI.getOperand(i: 1).getImm();
10901 } else {
10902 return false;
10903 }
10904 CmpMask = ~0;
10905 return true;
10906 case AMDGPU::S_CMPK_EQ_U32:
10907 case AMDGPU::S_CMPK_EQ_I32:
10908 case AMDGPU::S_CMPK_LG_U32:
10909 case AMDGPU::S_CMPK_LG_I32:
10910 case AMDGPU::S_CMPK_LT_U32:
10911 case AMDGPU::S_CMPK_LT_I32:
10912 case AMDGPU::S_CMPK_GT_U32:
10913 case AMDGPU::S_CMPK_GT_I32:
10914 case AMDGPU::S_CMPK_LE_U32:
10915 case AMDGPU::S_CMPK_LE_I32:
10916 case AMDGPU::S_CMPK_GE_U32:
10917 case AMDGPU::S_CMPK_GE_I32:
10918 SrcReg = MI.getOperand(i: 0).getReg();
10919 SrcReg2 = Register();
10920 CmpValue = MI.getOperand(i: 1).getImm();
10921 CmpMask = ~0;
10922 return true;
10923 }
10924
10925 return false;
10926}
10927
10928static bool isSCCDeadOnExit(MachineBasicBlock *MBB) {
10929 for (MachineBasicBlock *S : MBB->successors()) {
10930 if (S->isLiveIn(Reg: AMDGPU::SCC))
10931 return false;
10932 }
10933 return true;
10934}
10935
10936// Invert all uses of SCC following SCCDef, because SCCDef may be deleted and
10937// (incoming SCC) = !(SCC defined by SCCDef).
10938// Return true if all uses can be rewritten, false otherwise.
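// For example, an S_CBRANCH_SCC0 use becomes S_CBRANCH_SCC1 (and vice versa),
// and the two data operands of an S_CSELECT_B32/B64 use are swapped.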
10939bool SIInstrInfo::invertSCCUse(MachineInstr *SCCDef) const {
10940 MachineBasicBlock *MBB = SCCDef->getParent();
10941 SmallVector<MachineInstr *> InvertInstr;
10942 bool SCCIsDead = false;
10943
10944 // Scan instructions for SCC uses that need to be inverted until SCC is dead.
10945 constexpr unsigned ScanLimit = 12;
10946 unsigned Count = 0;
10947 for (MachineInstr &MI :
10948 make_range(x: std::next(x: MachineBasicBlock::iterator(SCCDef)), y: MBB->end())) {
10949 if (++Count > ScanLimit)
10950 return false;
10951 if (MI.readsRegister(Reg: AMDGPU::SCC, TRI: &RI)) {
10952 if (MI.getOpcode() == AMDGPU::S_CSELECT_B32 ||
10953 MI.getOpcode() == AMDGPU::S_CSELECT_B64 ||
10954 MI.getOpcode() == AMDGPU::S_CBRANCH_SCC0 ||
10955 MI.getOpcode() == AMDGPU::S_CBRANCH_SCC1)
10956 InvertInstr.push_back(Elt: &MI);
10957 else
10958 return false;
10959 }
10960 if (MI.definesRegister(Reg: AMDGPU::SCC, TRI: &RI)) {
10961 SCCIsDead = true;
10962 break;
10963 }
10964 }
10965 if (!SCCIsDead && isSCCDeadOnExit(MBB))
10966 SCCIsDead = true;
10967
10968 // SCC may have more uses. Can't invert all of them.
10969 if (!SCCIsDead)
10970 return false;
10971
10972 // Invert uses
10973 for (MachineInstr *MI : InvertInstr) {
10974 if (MI->getOpcode() == AMDGPU::S_CSELECT_B32 ||
10975 MI->getOpcode() == AMDGPU::S_CSELECT_B64) {
10976 swapOperands(Inst&: *MI);
10977 } else if (MI->getOpcode() == AMDGPU::S_CBRANCH_SCC0 ||
10978 MI->getOpcode() == AMDGPU::S_CBRANCH_SCC1) {
10979 MI->setDesc(get(Opcode: MI->getOpcode() == AMDGPU::S_CBRANCH_SCC0
10980 ? AMDGPU::S_CBRANCH_SCC1
10981 : AMDGPU::S_CBRANCH_SCC0));
10982 } else {
10983 llvm_unreachable("SCC used but no inversion handling");
10984 }
10985 }
10986 return true;
10987}
10988
10989// SCC is already valid after SCCValid.
10990// SCCRedefine will redefine SCC to the same value already available after
10991// SCCValid. If there are no intervening SCC conflicts, delete SCCRedefine and
10992// update kill/dead flags if necessary.
10993bool SIInstrInfo::optimizeSCC(MachineInstr *SCCValid, MachineInstr *SCCRedefine,
10994 bool NeedInversion) const {
10995 MachineInstr *KillsSCC = nullptr;
10996 if (SCCValid->getParent() != SCCRedefine->getParent())
10997 return false;
10998 for (MachineInstr &MI : make_range(x: std::next(x: SCCValid->getIterator()),
10999 y: SCCRedefine->getIterator())) {
11000 if (MI.modifiesRegister(Reg: AMDGPU::SCC, TRI: &RI))
11001 return false;
11002 if (MI.killsRegister(Reg: AMDGPU::SCC, TRI: &RI))
11003 KillsSCC = &MI;
11004 }
11005 if (NeedInversion && !invertSCCUse(SCCDef: SCCRedefine))
11006 return false;
11007 if (MachineOperand *SccDef =
11008 SCCValid->findRegisterDefOperand(Reg: AMDGPU::SCC, /*TRI=*/nullptr))
11009 SccDef->setIsDead(false);
11010 if (KillsSCC)
11011 KillsSCC->clearRegisterKills(Reg: AMDGPU::SCC, /*TRI=*/RegInfo: nullptr);
11012 SCCRedefine->eraseFromParent();
11013 return true;
11014}
11015
11016static bool foldableSelect(const MachineInstr &Def) {
11017 if (Def.getOpcode() != AMDGPU::S_CSELECT_B32 &&
11018 Def.getOpcode() != AMDGPU::S_CSELECT_B64)
11019 return false;
11020 bool Op1IsNonZeroImm =
11021 Def.getOperand(i: 1).isImm() && Def.getOperand(i: 1).getImm() != 0;
11022 bool Op2IsZeroImm =
11023 Def.getOperand(i: 2).isImm() && Def.getOperand(i: 2).getImm() == 0;
11024 if (!Op1IsNonZeroImm || !Op2IsZeroImm)
11025 return false;
11026 return true;
11027}
11028
11029static bool setsSCCIfResultIsZero(const MachineInstr &Def, bool &NeedInversion,
11030 unsigned &NewDefOpc) {
11031  // S_ADD_U32 X, 1 sets SCC on carry-out, which can only happen if result==0.
11032 // S_ADD_I32 X, 1 can be converted to S_ADD_U32 X, 1 if SCC is dead.
11033 if (Def.getOpcode() != AMDGPU::S_ADD_I32 &&
11034 Def.getOpcode() != AMDGPU::S_ADD_U32)
11035 return false;
11036 const MachineOperand &AddSrc1 = Def.getOperand(i: 1);
11037 const MachineOperand &AddSrc2 = Def.getOperand(i: 2);
11038 int64_t addend;
11039
11040 if ((!AddSrc1.isImm() || AddSrc1.getImm() != 1) &&
11041 (!AddSrc2.isImm() || AddSrc2.getImm() != 1) &&
11042 (!getFoldableImm(MO: &AddSrc1, Imm&: addend) || addend != 1) &&
11043 (!getFoldableImm(MO: &AddSrc2, Imm&: addend) || addend != 1))
11044 return false;
11045
11046 if (Def.getOpcode() == AMDGPU::S_ADD_I32) {
11047 const MachineOperand *SccDef =
11048 Def.findRegisterDefOperand(Reg: AMDGPU::SCC, /*TRI=*/nullptr);
11049 if (!SccDef->isDead())
11050 return false;
11051 NewDefOpc = AMDGPU::S_ADD_U32;
11052 }
11053 NeedInversion = !NeedInversion;
11054 return true;
11055}
11056
11057bool SIInstrInfo::optimizeCompareInstr(MachineInstr &CmpInstr, Register SrcReg,
11058 Register SrcReg2, int64_t CmpMask,
11059 int64_t CmpValue,
11060 const MachineRegisterInfo *MRI) const {
11061 if (!SrcReg || SrcReg.isPhysical())
11062 return false;
11063
11064 if (SrcReg2 && !getFoldableImm(Reg: SrcReg2, MRI: *MRI, Imm&: CmpValue))
11065 return false;
11066
11067 const auto optimizeCmpSelect = [&CmpInstr, SrcReg, CmpValue, MRI,
11068 this](bool NeedInversion) -> bool {
11069 if (CmpValue != 0)
11070 return false;
11071
11072 MachineInstr *Def = MRI->getVRegDef(Reg: SrcReg);
11073 if (!Def)
11074 return false;
11075
11076 // For S_OP that set SCC = DST!=0, do the transformation
11077 //
11078 // s_cmp_[lg|eq]_* (S_OP ...), 0 => (S_OP ...)
11079 //
11080 // For (S_OP ...) that set SCC = DST==0, invert NeedInversion and
11081 // do the transformation:
11082 //
11083 // s_cmp_[lg|eq]_* (S_OP ...), 0 => (S_OP ...)
11084 //
11085 // If foldableSelect, s_cmp_lg_* is redundant because the SCC input value
11086 // for S_CSELECT* already has the same value that will be calculated by
11087 // s_cmp_lg_*
11088 //
11089 // s_cmp_[lg|eq]_* (S_CSELECT* (non-zero imm), 0), 0 => (S_CSELECT*
11090 // (non-zero imm), 0)
11091
11092 unsigned NewDefOpc = Def->getOpcode();
11093 if (!setsSCCIfResultIsNonZero(*Def) &&
11094 !setsSCCIfResultIsZero(Def: *Def, NeedInversion, NewDefOpc) &&
11095 !foldableSelect(Def: *Def))
11096 return false;
11097
11098 if (!optimizeSCC(SCCValid: Def, SCCRedefine: &CmpInstr, NeedInversion))
11099 return false;
11100
11101 if (NewDefOpc != Def->getOpcode())
11102 Def->setDesc(get(Opcode: NewDefOpc));
11103
11104    // If the s_or_b32 result, sY, is unused (i.e. it is effectively a 64-bit
11105    // s_cmp_lg of a register pair) and the inputs are the hi and lo halves of
11106    // a 64-bit foldableSelect, then delete the s_or_b32 in the sequence:
11107 // sX = s_cselect_b64 (non-zero imm), 0
11108 // sLo = copy sX.sub0
11109 // sHi = copy sX.sub1
11110 // sY = s_or_b32 sLo, sHi
11111 if (Def->getOpcode() == AMDGPU::S_OR_B32 &&
11112 MRI->use_nodbg_empty(RegNo: Def->getOperand(i: 0).getReg())) {
11113 const MachineOperand &OrOpnd1 = Def->getOperand(i: 1);
11114 const MachineOperand &OrOpnd2 = Def->getOperand(i: 2);
11115 if (OrOpnd1.isReg() && OrOpnd2.isReg()) {
11116 MachineInstr *Def1 = MRI->getVRegDef(Reg: OrOpnd1.getReg());
11117 MachineInstr *Def2 = MRI->getVRegDef(Reg: OrOpnd2.getReg());
11118 if (Def1 && Def1->getOpcode() == AMDGPU::COPY && Def2 &&
11119 Def2->getOpcode() == AMDGPU::COPY && Def1->getOperand(i: 1).isReg() &&
11120 Def2->getOperand(i: 1).isReg() &&
11121 Def1->getOperand(i: 1).getSubReg() == AMDGPU::sub0 &&
11122 Def2->getOperand(i: 1).getSubReg() == AMDGPU::sub1 &&
11123 Def1->getOperand(i: 1).getReg() == Def2->getOperand(i: 1).getReg()) {
11124 MachineInstr *Select = MRI->getVRegDef(Reg: Def1->getOperand(i: 1).getReg());
11125 if (Select && foldableSelect(Def: *Select))
11126 optimizeSCC(SCCValid: Select, SCCRedefine: Def, /*NeedInversion=*/false);
11127 }
11128 }
11129 }
11130 return true;
11131 };
11132
11133 const auto optimizeCmpAnd = [&CmpInstr, SrcReg, CmpValue, MRI,
11134 this](int64_t ExpectedValue, unsigned SrcSize,
11135 bool IsReversible, bool IsSigned) -> bool {
11136 // s_cmp_eq_u32 (s_and_b32 $src, 1 << n), 1 << n => s_and_b32 $src, 1 << n
11137 // s_cmp_eq_i32 (s_and_b32 $src, 1 << n), 1 << n => s_and_b32 $src, 1 << n
11138 // s_cmp_ge_u32 (s_and_b32 $src, 1 << n), 1 << n => s_and_b32 $src, 1 << n
11139 // s_cmp_ge_i32 (s_and_b32 $src, 1 << n), 1 << n => s_and_b32 $src, 1 << n
11140 // s_cmp_eq_u64 (s_and_b64 $src, 1 << n), 1 << n => s_and_b64 $src, 1 << n
11141 // s_cmp_lg_u32 (s_and_b32 $src, 1 << n), 0 => s_and_b32 $src, 1 << n
11142 // s_cmp_lg_i32 (s_and_b32 $src, 1 << n), 0 => s_and_b32 $src, 1 << n
11143 // s_cmp_gt_u32 (s_and_b32 $src, 1 << n), 0 => s_and_b32 $src, 1 << n
11144 // s_cmp_gt_i32 (s_and_b32 $src, 1 << n), 0 => s_and_b32 $src, 1 << n
11145 // s_cmp_lg_u64 (s_and_b64 $src, 1 << n), 0 => s_and_b64 $src, 1 << n
11146 //
11147 // Signed ge/gt are not used for the sign bit.
11148 //
11149    // If the result of the AND is unused except in the compare:
11150 // s_and_b(32|64) $src, 1 << n => s_bitcmp1_b(32|64) $src, n
11151 //
11152 // s_cmp_eq_u32 (s_and_b32 $src, 1 << n), 0 => s_bitcmp0_b32 $src, n
11153 // s_cmp_eq_i32 (s_and_b32 $src, 1 << n), 0 => s_bitcmp0_b32 $src, n
11154 // s_cmp_eq_u64 (s_and_b64 $src, 1 << n), 0 => s_bitcmp0_b64 $src, n
11155 // s_cmp_lg_u32 (s_and_b32 $src, 1 << n), 1 << n => s_bitcmp0_b32 $src, n
11156 // s_cmp_lg_i32 (s_and_b32 $src, 1 << n), 1 << n => s_bitcmp0_b32 $src, n
11157 // s_cmp_lg_u64 (s_and_b64 $src, 1 << n), 1 << n => s_bitcmp0_b64 $src, n
11158
11159 MachineInstr *Def = MRI->getVRegDef(Reg: SrcReg);
11160 if (!Def)
11161 return false;
11162
11163 if (Def->getOpcode() != AMDGPU::S_AND_B32 &&
11164 Def->getOpcode() != AMDGPU::S_AND_B64)
11165 return false;
11166
11167 int64_t Mask;
11168 const auto isMask = [&Mask, SrcSize](const MachineOperand *MO) -> bool {
11169 if (MO->isImm())
11170 Mask = MO->getImm();
11171 else if (!getFoldableImm(MO, Imm&: Mask))
11172 return false;
11173 Mask &= maxUIntN(N: SrcSize);
11174 return isPowerOf2_64(Value: Mask);
11175 };
11176
11177 MachineOperand *SrcOp = &Def->getOperand(i: 1);
11178 if (isMask(SrcOp))
11179 SrcOp = &Def->getOperand(i: 2);
11180 else if (isMask(&Def->getOperand(i: 2)))
11181 SrcOp = &Def->getOperand(i: 1);
11182 else
11183 return false;
11184
11185    // A valid Mask is required to have a single bit set, hence a non-zero and
11186    // power-of-two value. This verifies we will not do a 64-bit shift below.
11187 assert(llvm::has_single_bit<uint64_t>(Mask) && "Invalid mask.");
11188 unsigned BitNo = llvm::countr_zero(Val: (uint64_t)Mask);
11189 if (IsSigned && BitNo == SrcSize - 1)
11190 return false;
11191
11192 ExpectedValue <<= BitNo;
11193
11194 bool IsReversedCC = false;
11195 if (CmpValue != ExpectedValue) {
11196 if (!IsReversible)
11197 return false;
11198 IsReversedCC = CmpValue == (ExpectedValue ^ Mask);
11199 if (!IsReversedCC)
11200 return false;
11201 }
11202
11203 Register DefReg = Def->getOperand(i: 0).getReg();
11204 if (IsReversedCC && !MRI->hasOneNonDBGUse(RegNo: DefReg))
11205 return false;
11206
11207 if (!optimizeSCC(SCCValid: Def, SCCRedefine: &CmpInstr, /*NeedInversion=*/false))
11208 return false;
11209
11210 if (!MRI->use_nodbg_empty(RegNo: DefReg)) {
11211 assert(!IsReversedCC);
11212 return true;
11213 }
11214
11215    // Replace the AND, whose result is unused, with an S_BITCMP.
11216 MachineBasicBlock *MBB = Def->getParent();
11217
11218 unsigned NewOpc = (SrcSize == 32) ? IsReversedCC ? AMDGPU::S_BITCMP0_B32
11219 : AMDGPU::S_BITCMP1_B32
11220 : IsReversedCC ? AMDGPU::S_BITCMP0_B64
11221 : AMDGPU::S_BITCMP1_B64;
11222
11223 BuildMI(BB&: *MBB, I: Def, MIMD: Def->getDebugLoc(), MCID: get(Opcode: NewOpc))
11224 .add(MO: *SrcOp)
11225 .addImm(Val: BitNo);
11226 Def->eraseFromParent();
11227
11228 return true;
11229 };
11230
11231 switch (CmpInstr.getOpcode()) {
11232 default:
11233 break;
11234 case AMDGPU::S_CMP_EQ_U32:
11235 case AMDGPU::S_CMP_EQ_I32:
11236 case AMDGPU::S_CMPK_EQ_U32:
11237 case AMDGPU::S_CMPK_EQ_I32:
11238 return optimizeCmpAnd(1, 32, true, false) ||
11239 optimizeCmpSelect(/*NeedInversion=*/true);
11240 case AMDGPU::S_CMP_GE_U32:
11241 case AMDGPU::S_CMPK_GE_U32:
11242 return optimizeCmpAnd(1, 32, false, false);
11243 case AMDGPU::S_CMP_GE_I32:
11244 case AMDGPU::S_CMPK_GE_I32:
11245 return optimizeCmpAnd(1, 32, false, true);
11246 case AMDGPU::S_CMP_EQ_U64:
11247 return optimizeCmpAnd(1, 64, true, false);
11248 case AMDGPU::S_CMP_LG_U32:
11249 case AMDGPU::S_CMP_LG_I32:
11250 case AMDGPU::S_CMPK_LG_U32:
11251 case AMDGPU::S_CMPK_LG_I32:
11252 return optimizeCmpAnd(0, 32, true, false) ||
11253 optimizeCmpSelect(/*NeedInversion=*/false);
11254 case AMDGPU::S_CMP_GT_U32:
11255 case AMDGPU::S_CMPK_GT_U32:
11256 return optimizeCmpAnd(0, 32, false, false);
11257 case AMDGPU::S_CMP_GT_I32:
11258 case AMDGPU::S_CMPK_GT_I32:
11259 return optimizeCmpAnd(0, 32, false, true);
11260 case AMDGPU::S_CMP_LG_U64:
11261 return optimizeCmpAnd(0, 64, true, false) ||
11262 optimizeCmpSelect(/*NeedInversion=*/false);
11263 }
11264
11265 return false;
11266}
11267
11268void SIInstrInfo::enforceOperandRCAlignment(MachineInstr &MI,
11269 AMDGPU::OpName OpName) const {
11270 if (!ST.needsAlignedVGPRs())
11271 return;
11272
11273 int OpNo = AMDGPU::getNamedOperandIdx(Opcode: MI.getOpcode(), Name: OpName);
11274 if (OpNo < 0)
11275 return;
11276 MachineOperand &Op = MI.getOperand(i: OpNo);
11277 if (getOpSize(MI, OpNo) > 4)
11278 return;
11279
11280  // Add an implicit aligned super-reg to force alignment on the data operand.
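  //
  // Schematically, the rewrite below is (register names are illustrative):
  //   %undef:vgpr_32 = IMPLICIT_DEF
  //   %new:vreg_64_align2 = REG_SEQUENCE %data, %subreg.sub0, %undef, %subreg.sub1
  // and the operand is changed to %new.sub0, with %new also added as an
  // implicit use of MI.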
11281 const DebugLoc &DL = MI.getDebugLoc();
11282 MachineBasicBlock *BB = MI.getParent();
11283 MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
11284 Register DataReg = Op.getReg();
11285 bool IsAGPR = RI.isAGPR(MRI, Reg: DataReg);
11286 Register Undef = MRI.createVirtualRegister(
11287 RegClass: IsAGPR ? &AMDGPU::AGPR_32RegClass : &AMDGPU::VGPR_32RegClass);
11288 BuildMI(BB&: *BB, I&: MI, MIMD: DL, MCID: get(Opcode: AMDGPU::IMPLICIT_DEF), DestReg: Undef);
11289 Register NewVR =
11290 MRI.createVirtualRegister(RegClass: IsAGPR ? &AMDGPU::AReg_64_Align2RegClass
11291 : &AMDGPU::VReg_64_Align2RegClass);
11292 BuildMI(BB&: *BB, I&: MI, MIMD: DL, MCID: get(Opcode: AMDGPU::REG_SEQUENCE), DestReg: NewVR)
11293 .addReg(RegNo: DataReg, Flags: {}, SubReg: Op.getSubReg())
11294 .addImm(Val: AMDGPU::sub0)
11295 .addReg(RegNo: Undef)
11296 .addImm(Val: AMDGPU::sub1);
11297 Op.setReg(NewVR);
11298 Op.setSubReg(AMDGPU::sub0);
11299 MI.addOperand(Op: MachineOperand::CreateReg(Reg: NewVR, isDef: false, isImp: true));
11300}
11301
11302bool SIInstrInfo::isGlobalMemoryObject(const MachineInstr *MI) const {
11303 if (isIGLP(MI: *MI))
11304 return false;
11305
11306 return TargetInstrInfo::isGlobalMemoryObject(MI);
11307}
11308
11309bool SIInstrInfo::isXDLWMMA(const MachineInstr &MI) const {
11310 if (!isWMMA(MI) && !isSWMMAC(MI))
11311 return false;
11312
11313 if (ST.hasGFX1250Insts())
11314 return AMDGPU::getWMMAIsXDL(Opc: MI.getOpcode());
11315
11316 return true;
11317}
11318
11319bool SIInstrInfo::isXDL(const MachineInstr &MI) const {
11320 unsigned Opcode = MI.getOpcode();
11321
11322 if (AMDGPU::isGFX12Plus(STI: ST))
11323 return isDOT(MI) || isXDLWMMA(MI);
11324
11325 if (!isMAI(MI) || isDGEMM(Opcode) ||
11326 Opcode == AMDGPU::V_ACCVGPR_WRITE_B32_e64 ||
11327 Opcode == AMDGPU::V_ACCVGPR_READ_B32_e64)
11328 return false;
11329
11330 if (!ST.hasGFX940Insts())
11331 return true;
11332
11333 return AMDGPU::getMAIIsGFX940XDL(Opc: Opcode);
11334}
11335