//===- SIInstrInfo.cpp - SI Instruction Information ----------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// SI Implementation of TargetInstrInfo.
//
//===----------------------------------------------------------------------===//

#include "SIInstrInfo.h"
#include "AMDGPU.h"
#include "AMDGPUInstrInfo.h"
#include "AMDGPULaneMaskUtils.h"
#include "GCNHazardRecognizer.h"
#include "GCNSubtarget.h"
#include "SIMachineFunctionInfo.h"
#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/CodeGen/GlobalISel/GenericMachineInstrs.h"
#include "llvm/CodeGen/LiveIntervals.h"
#include "llvm/CodeGen/LiveVariables.h"
#include "llvm/CodeGen/MachineDominators.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineScheduler.h"
#include "llvm/CodeGen/RegisterScavenging.h"
#include "llvm/CodeGen/ScheduleDAG.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/MC/MCContext.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Target/TargetMachine.h"

using namespace llvm;

#define DEBUG_TYPE "si-instr-info"

#define GET_INSTRINFO_CTOR_DTOR
#include "AMDGPUGenInstrInfo.inc"

namespace llvm::AMDGPU {
#define GET_D16ImageDimIntrinsics_IMPL
#define GET_ImageDimIntrinsicTable_IMPL
#define GET_RsrcIntrinsics_IMPL
#include "AMDGPUGenSearchableTables.inc"
} // namespace llvm::AMDGPU

// Must be at least 4 to be able to branch over minimum unconditional branch
// code. This is only for making it possible to write reasonably small tests for
// long branches.
static cl::opt<unsigned>
    BranchOffsetBits("amdgpu-s-branch-bits", cl::ReallyHidden, cl::init(16),
                     cl::desc("Restrict range of branch instructions (DEBUG)"));

static cl::opt<bool> Fix16BitCopies(
    "amdgpu-fix-16-bit-physreg-copies",
    cl::desc("Fix copies between 32 and 16 bit registers by extending to 32 bit"),
    cl::init(true),
    cl::ReallyHidden);

SIInstrInfo::SIInstrInfo(const GCNSubtarget &ST)
    : AMDGPUGenInstrInfo(ST, RI, AMDGPU::ADJCALLSTACKUP,
                         AMDGPU::ADJCALLSTACKDOWN),
      RI(ST), ST(ST) {
  SchedModel.init(&ST);
}

//===----------------------------------------------------------------------===//
// TargetInstrInfo callbacks
//===----------------------------------------------------------------------===//

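/// Returns the number of operands of \p Node, excluding any trailing glue
/// operands.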
static unsigned getNumOperandsNoGlue(SDNode *Node) {
  unsigned N = Node->getNumOperands();
  while (N && Node->getOperand(N - 1).getValueType() == MVT::Glue)
    --N;
  return N;
}

/// Returns true if both nodes have the same value for the given
/// operand \p OpName, or if both nodes do not have this operand.
static bool nodesHaveSameOperandValue(SDNode *N0, SDNode *N1,
                                      AMDGPU::OpName OpName) {
  unsigned Opc0 = N0->getMachineOpcode();
  unsigned Opc1 = N1->getMachineOpcode();

  int Op0Idx = AMDGPU::getNamedOperandIdx(Opc0, OpName);
  int Op1Idx = AMDGPU::getNamedOperandIdx(Opc1, OpName);

  if (Op0Idx == -1 && Op1Idx == -1)
    return true;

  if ((Op0Idx == -1 && Op1Idx != -1) ||
      (Op1Idx == -1 && Op0Idx != -1))
    return false;

  // getNamedOperandIdx returns the index for the MachineInstr's operands,
  // which includes the result as the first operand. We are indexing into the
  // MachineSDNode's operands, so we need to skip the result operand to get
  // the real index.
  --Op0Idx;
  --Op1Idx;

  return N0->getOperand(Op0Idx) == N1->getOperand(Op1Idx);
}

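/// Returns true if \p MI is in an instruction class that is generally safe to
/// rematerialize: VOP1/VOP2/VOP3, SDWA, and SALU instructions, plus SMRD
/// loads whose memory operands are all invariant loads.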
static bool canRemat(const MachineInstr &MI) {
  if (SIInstrInfo::isVOP1(MI) || SIInstrInfo::isVOP2(MI) ||
      SIInstrInfo::isVOP3(MI) || SIInstrInfo::isSDWA(MI) ||
      SIInstrInfo::isSALU(MI))
    return true;

  if (SIInstrInfo::isSMRD(MI)) {
    return !MI.memoperands_empty() &&
           llvm::all_of(MI.memoperands(), [](const MachineMemOperand *MMO) {
             return MMO->isLoad() && MMO->isInvariant();
           });
  }

  return false;
}

bool SIInstrInfo::isReMaterializableImpl(const MachineInstr &MI) const {
  if (canRemat(MI)) {
    // Normally a VALU use of exec would block rematerialization, but an
    // implicit exec read is OK in this case since all VALU instructions have
    // one. We really want all of the generic logic for this except for this.

    // Another potential implicit use is the mode register. The core logic of
    // the RA will not attempt rematerialization if mode is set anywhere in
    // the function; otherwise it is safe since mode is not changed.

    // This differs from the generic method, which does not allow
    // rematerialization if there are virtual register uses. We allow this,
    // so this method covers SOP instructions as well.
    if (!MI.hasImplicitDef() &&
        MI.getNumImplicitOperands() == MI.getDesc().implicit_uses().size() &&
        !MI.mayRaiseFPException())
      return true;
  }

  return TargetInstrInfo::isReMaterializableImpl(MI);
}

// Returns true if the scalar result of a VALU instruction depends on exec.
bool SIInstrInfo::resultDependsOnExec(const MachineInstr &MI) const {
  // Ignore comparisons which are only used masked with exec.
  // This allows some hoisting/sinking of VALU comparisons.
  if (MI.isCompare()) {
    const MachineOperand *Dst = getNamedOperand(MI, AMDGPU::OpName::sdst);
    if (!Dst)
      return true;

    Register DstReg = Dst->getReg();
    if (!DstReg.isVirtual())
      return true;

    const MachineRegisterInfo &MRI = MI.getMF()->getRegInfo();
    for (MachineInstr &Use : MRI.use_nodbg_instructions(DstReg)) {
      switch (Use.getOpcode()) {
      case AMDGPU::S_AND_SAVEEXEC_B32:
      case AMDGPU::S_AND_SAVEEXEC_B64:
        break;
      case AMDGPU::S_AND_B32:
      case AMDGPU::S_AND_B64:
        if (!Use.readsRegister(AMDGPU::EXEC, /*TRI=*/nullptr))
          return true;
        break;
      default:
        return true;
      }
    }
    return false;
  }

  // If it is not convergent it does not depend on EXEC.
  if (!MI.isConvergent())
    return false;

  switch (MI.getOpcode()) {
  default:
    break;
  case AMDGPU::V_READFIRSTLANE_B32:
    return true;
  }

  return false;
}

bool SIInstrInfo::isIgnorableUse(const MachineOperand &MO) const {
  // Any implicit use of exec by VALU is not a real register read.
  return MO.getReg() == AMDGPU::EXEC && MO.isImplicit() &&
         isVALU(*MO.getParent()) && !resultDependsOnExec(*MO.getParent());
}

bool SIInstrInfo::isSafeToSink(MachineInstr &MI,
                               MachineBasicBlock *SuccToSinkTo,
                               MachineCycleInfo *CI) const {
  // Allow sinking if MI edits a lane mask (divergent i1 in an SGPR).
  if (MI.getOpcode() == AMDGPU::SI_IF_BREAK)
    return true;

  MachineRegisterInfo &MRI = MI.getMF()->getRegInfo();
  // Check if sinking MI would create a temporally divergent use.
  for (auto Op : MI.uses()) {
    if (Op.isReg() && Op.getReg().isVirtual() &&
        RI.isSGPRClass(MRI.getRegClass(Op.getReg()))) {
      MachineInstr *SgprDef = MRI.getVRegDef(Op.getReg());

      // SgprDef is defined inside a cycle.
      MachineCycle *FromCycle = CI->getCycle(SgprDef->getParent());
      if (FromCycle == nullptr)
        continue;

      MachineCycle *ToCycle = CI->getCycle(SuccToSinkTo);
      // Check if there is a FromCycle that contains SgprDef's basic block but
      // does not contain SuccToSinkTo and also has a divergent exit condition.
      while (FromCycle && !FromCycle->contains(ToCycle)) {
        SmallVector<MachineBasicBlock *, 1> ExitingBlocks;
        FromCycle->getExitingBlocks(ExitingBlocks);

        // FromCycle has a divergent exit condition.
        for (MachineBasicBlock *ExitingBlock : ExitingBlocks) {
          if (hasDivergentBranch(ExitingBlock))
            return false;
        }

        FromCycle = FromCycle->getParentCycle();
      }
    }
  }

  return true;
}

bool SIInstrInfo::areLoadsFromSameBasePtr(SDNode *Load0, SDNode *Load1,
                                          int64_t &Offset0,
                                          int64_t &Offset1) const {
  if (!Load0->isMachineOpcode() || !Load1->isMachineOpcode())
    return false;

  unsigned Opc0 = Load0->getMachineOpcode();
  unsigned Opc1 = Load1->getMachineOpcode();

  // Make sure both are actually loads.
  if (!get(Opc0).mayLoad() || !get(Opc1).mayLoad())
    return false;

  // A mayLoad instruction without a def is not a load. Likely a prefetch.
  if (!get(Opc0).getNumDefs() || !get(Opc1).getNumDefs())
    return false;

  if (isDS(Opc0) && isDS(Opc1)) {

    // FIXME: Handle this case:
    if (getNumOperandsNoGlue(Load0) != getNumOperandsNoGlue(Load1))
      return false;

    // Check base reg.
    if (Load0->getOperand(0) != Load1->getOperand(0))
      return false;

    // Skip read2 / write2 variants for simplicity.
    // TODO: We should report true if the used offsets are adjacent (excluding
    // st64 versions).
    int Offset0Idx = AMDGPU::getNamedOperandIdx(Opc0, AMDGPU::OpName::offset);
    int Offset1Idx = AMDGPU::getNamedOperandIdx(Opc1, AMDGPU::OpName::offset);
    if (Offset0Idx == -1 || Offset1Idx == -1)
      return false;

    // XXX - be careful of dataless loads
    // getNamedOperandIdx returns the index for MachineInstrs. Since they
    // include the output in the operand list, but SDNodes don't, we need to
    // subtract the index by one.
    Offset0Idx -= get(Opc0).NumDefs;
    Offset1Idx -= get(Opc1).NumDefs;
    Offset0 = Load0->getConstantOperandVal(Offset0Idx);
    Offset1 = Load1->getConstantOperandVal(Offset1Idx);
    return true;
  }

  if (isSMRD(Opc0) && isSMRD(Opc1)) {
    // Skip time and cache invalidation instructions.
    if (!AMDGPU::hasNamedOperand(Opc0, AMDGPU::OpName::sbase) ||
        !AMDGPU::hasNamedOperand(Opc1, AMDGPU::OpName::sbase))
      return false;

    unsigned NumOps = getNumOperandsNoGlue(Load0);
    if (NumOps != getNumOperandsNoGlue(Load1))
      return false;

    // Check base reg.
    if (Load0->getOperand(0) != Load1->getOperand(0))
      return false;

    // Match register offsets, if both register and immediate offsets are
    // present.
    assert(NumOps == 4 || NumOps == 5);
    if (NumOps == 5 && Load0->getOperand(1) != Load1->getOperand(1))
      return false;

    const ConstantSDNode *Load0Offset =
        dyn_cast<ConstantSDNode>(Load0->getOperand(NumOps - 3));
    const ConstantSDNode *Load1Offset =
        dyn_cast<ConstantSDNode>(Load1->getOperand(NumOps - 3));

    if (!Load0Offset || !Load1Offset)
      return false;

    Offset0 = Load0Offset->getZExtValue();
    Offset1 = Load1Offset->getZExtValue();
    return true;
  }

  // MUBUF and MTBUF can access the same addresses.
  if ((isMUBUF(Opc0) || isMTBUF(Opc0)) && (isMUBUF(Opc1) || isMTBUF(Opc1))) {

    // MUBUF and MTBUF have vaddr at different indices.
    if (!nodesHaveSameOperandValue(Load0, Load1, AMDGPU::OpName::soffset) ||
        !nodesHaveSameOperandValue(Load0, Load1, AMDGPU::OpName::vaddr) ||
        !nodesHaveSameOperandValue(Load0, Load1, AMDGPU::OpName::srsrc))
      return false;

    int OffIdx0 = AMDGPU::getNamedOperandIdx(Opc0, AMDGPU::OpName::offset);
    int OffIdx1 = AMDGPU::getNamedOperandIdx(Opc1, AMDGPU::OpName::offset);

    if (OffIdx0 == -1 || OffIdx1 == -1)
      return false;

    // getNamedOperandIdx returns the index for MachineInstrs. Since they
    // include the output in the operand list, but SDNodes don't, we need to
    // subtract the index by one.
    OffIdx0 -= get(Opc0).NumDefs;
    OffIdx1 -= get(Opc1).NumDefs;

    SDValue Off0 = Load0->getOperand(OffIdx0);
    SDValue Off1 = Load1->getOperand(OffIdx1);

    // The offset might be a FrameIndexSDNode.
    if (!isa<ConstantSDNode>(Off0) || !isa<ConstantSDNode>(Off1))
      return false;

    Offset0 = Off0->getAsZExtVal();
    Offset1 = Off1->getAsZExtVal();
    return true;
  }

  return false;
}

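/// Returns true for the DS read2st64/write2st64 opcodes, whose paired offsets
/// are scaled by a stride of 64 elements rather than one.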
static bool isStride64(unsigned Opc) {
  switch (Opc) {
  case AMDGPU::DS_READ2ST64_B32:
  case AMDGPU::DS_READ2ST64_B64:
  case AMDGPU::DS_WRITE2ST64_B32:
  case AMDGPU::DS_WRITE2ST64_B64:
    return true;
  default:
    return false;
  }
}

bool SIInstrInfo::getMemOperandsWithOffsetWidth(
    const MachineInstr &LdSt, SmallVectorImpl<const MachineOperand *> &BaseOps,
    int64_t &Offset, bool &OffsetIsScalable, LocationSize &Width,
    const TargetRegisterInfo *TRI) const {
  if (!LdSt.mayLoadOrStore())
    return false;

  unsigned Opc = LdSt.getOpcode();
  OffsetIsScalable = false;
  const MachineOperand *BaseOp, *OffsetOp;
  int DataOpIdx;

  if (isDS(LdSt)) {
    BaseOp = getNamedOperand(LdSt, AMDGPU::OpName::addr);
    OffsetOp = getNamedOperand(LdSt, AMDGPU::OpName::offset);
    if (OffsetOp) {
      // Normal, single offset LDS instruction.
      if (!BaseOp) {
        // DS_CONSUME/DS_APPEND use M0 for the base address.
        // TODO: find the implicit use operand for M0 and use that as BaseOp?
        return false;
      }
      BaseOps.push_back(BaseOp);
      Offset = OffsetOp->getImm();
      // Get appropriate operand, and compute width accordingly.
      DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdst);
      if (DataOpIdx == -1)
        DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::data0);
      if (Opc == AMDGPU::DS_ATOMIC_ASYNC_BARRIER_ARRIVE_B64)
        Width = LocationSize::precise(64);
      else
        Width = LocationSize::precise(getOpSize(LdSt, DataOpIdx));
    } else {
      // The 2 offset instructions use offset0 and offset1 instead. We can treat
      // these as a load with a single offset if the 2 offsets are consecutive.
      // We will use this for some partially aligned loads.
      const MachineOperand *Offset0Op =
          getNamedOperand(LdSt, AMDGPU::OpName::offset0);
      const MachineOperand *Offset1Op =
          getNamedOperand(LdSt, AMDGPU::OpName::offset1);

      unsigned Offset0 = Offset0Op->getImm() & 0xff;
      unsigned Offset1 = Offset1Op->getImm() & 0xff;
      if (Offset0 + 1 != Offset1)
        return false;

      // Each of these offsets is in element sized units, so we need to convert
      // to bytes of the individual reads.

      unsigned EltSize;
      if (LdSt.mayLoad())
        EltSize = TRI->getRegSizeInBits(*getOpRegClass(LdSt, 0)) / 16;
      else {
        assert(LdSt.mayStore());
        int Data0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::data0);
        EltSize = TRI->getRegSizeInBits(*getOpRegClass(LdSt, Data0Idx)) / 8;
      }

      if (isStride64(Opc))
        EltSize *= 64;

      BaseOps.push_back(BaseOp);
      Offset = EltSize * Offset0;
      // Get appropriate operand(s), and compute width accordingly.
      DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdst);
      if (DataOpIdx == -1) {
        DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::data0);
        Width = LocationSize::precise(getOpSize(LdSt, DataOpIdx));
        DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::data1);
        Width = LocationSize::precise(
            Width.getValue() + TypeSize::getFixed(getOpSize(LdSt, DataOpIdx)));
      } else {
        Width = LocationSize::precise(getOpSize(LdSt, DataOpIdx));
      }
    }
    return true;
  }

  if (isMUBUF(LdSt) || isMTBUF(LdSt)) {
    const MachineOperand *RSrc = getNamedOperand(LdSt, AMDGPU::OpName::srsrc);
    if (!RSrc) // e.g. BUFFER_WBINVL1_VOL
      return false;
    BaseOps.push_back(RSrc);
    BaseOp = getNamedOperand(LdSt, AMDGPU::OpName::vaddr);
    if (BaseOp && !BaseOp->isFI())
      BaseOps.push_back(BaseOp);
    const MachineOperand *OffsetImm =
        getNamedOperand(LdSt, AMDGPU::OpName::offset);
    Offset = OffsetImm->getImm();
    const MachineOperand *SOffset =
        getNamedOperand(LdSt, AMDGPU::OpName::soffset);
    if (SOffset) {
      if (SOffset->isReg())
        BaseOps.push_back(SOffset);
      else
        Offset += SOffset->getImm();
    }
    // Get appropriate operand, and compute width accordingly.
    DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdst);
    if (DataOpIdx == -1)
      DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdata);
    if (DataOpIdx == -1) // LDS DMA
      return false;
    Width = LocationSize::precise(getOpSize(LdSt, DataOpIdx));
    return true;
  }

  if (isImage(LdSt)) {
    auto RsrcOpName =
        isMIMG(LdSt) ? AMDGPU::OpName::srsrc : AMDGPU::OpName::rsrc;
    int SRsrcIdx = AMDGPU::getNamedOperandIdx(Opc, RsrcOpName);
    BaseOps.push_back(&LdSt.getOperand(SRsrcIdx));
    int VAddr0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr0);
    if (VAddr0Idx >= 0) {
      // GFX10 possible NSA encoding.
      for (int I = VAddr0Idx; I < SRsrcIdx; ++I)
        BaseOps.push_back(&LdSt.getOperand(I));
    } else {
      BaseOps.push_back(getNamedOperand(LdSt, AMDGPU::OpName::vaddr));
    }
    Offset = 0;
    // Get appropriate operand, and compute width accordingly.
    DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdata);
    if (DataOpIdx == -1)
      return false; // no return sampler
    Width = LocationSize::precise(getOpSize(LdSt, DataOpIdx));
    return true;
  }

  if (isSMRD(LdSt)) {
    BaseOp = getNamedOperand(LdSt, AMDGPU::OpName::sbase);
    if (!BaseOp) // e.g. S_MEMTIME
      return false;
    BaseOps.push_back(BaseOp);
    OffsetOp = getNamedOperand(LdSt, AMDGPU::OpName::offset);
    Offset = OffsetOp ? OffsetOp->getImm() : 0;
    // Get appropriate operand, and compute width accordingly.
    DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::sdst);
    if (DataOpIdx == -1)
      return false;
    Width = LocationSize::precise(getOpSize(LdSt, DataOpIdx));
    return true;
  }

  if (isFLAT(LdSt)) {
    // Instructions have either vaddr or saddr or both or none.
    BaseOp = getNamedOperand(LdSt, AMDGPU::OpName::vaddr);
    if (BaseOp)
      BaseOps.push_back(BaseOp);
    BaseOp = getNamedOperand(LdSt, AMDGPU::OpName::saddr);
    if (BaseOp)
      BaseOps.push_back(BaseOp);
    Offset = getNamedOperand(LdSt, AMDGPU::OpName::offset)->getImm();
    // Get appropriate operand, and compute width accordingly.
    DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdst);
    if (DataOpIdx == -1)
      DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdata);
    if (DataOpIdx == -1) // LDS DMA
      return false;
    Width = LocationSize::precise(getOpSize(LdSt, DataOpIdx));
    return true;
  }

  return false;
}

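/// Conservatively returns true if \p MI1 and \p MI2 can be proven to access
/// the same base address, either because their first base operands are
/// identical or because their memory operands share the same underlying IR
/// object.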
static bool memOpsHaveSameBasePtr(const MachineInstr &MI1,
                                  ArrayRef<const MachineOperand *> BaseOps1,
                                  const MachineInstr &MI2,
                                  ArrayRef<const MachineOperand *> BaseOps2) {
  // Only examine the first "base" operand of each instruction, on the
  // assumption that it represents the real base address of the memory access.
  // Other operands are typically offsets or indices from this base address.
  if (BaseOps1.front()->isIdenticalTo(*BaseOps2.front()))
    return true;

  if (!MI1.hasOneMemOperand() || !MI2.hasOneMemOperand())
    return false;

  auto *MO1 = *MI1.memoperands_begin();
  auto *MO2 = *MI2.memoperands_begin();
  if (MO1->getAddrSpace() != MO2->getAddrSpace())
    return false;

  const auto *Base1 = MO1->getValue();
  const auto *Base2 = MO2->getValue();
  if (!Base1 || !Base2)
    return false;
  Base1 = getUnderlyingObject(Base1);
  Base2 = getUnderlyingObject(Base2);

  if (isa<UndefValue>(Base1) || isa<UndefValue>(Base2))
    return false;

  return Base1 == Base2;
}

bool SIInstrInfo::shouldClusterMemOps(ArrayRef<const MachineOperand *> BaseOps1,
                                      int64_t Offset1, bool OffsetIsScalable1,
                                      ArrayRef<const MachineOperand *> BaseOps2,
                                      int64_t Offset2, bool OffsetIsScalable2,
                                      unsigned ClusterSize,
                                      unsigned NumBytes) const {
  // If the mem ops (to be clustered) do not have the same base ptr, then they
  // should not be clustered.
  unsigned MaxMemoryClusterDWords = DefaultMemoryClusterDWordsLimit;
  if (!BaseOps1.empty() && !BaseOps2.empty()) {
    const MachineInstr &FirstLdSt = *BaseOps1.front()->getParent();
    const MachineInstr &SecondLdSt = *BaseOps2.front()->getParent();
    if (!memOpsHaveSameBasePtr(FirstLdSt, BaseOps1, SecondLdSt, BaseOps2))
      return false;

    const SIMachineFunctionInfo *MFI =
        FirstLdSt.getMF()->getInfo<SIMachineFunctionInfo>();
    MaxMemoryClusterDWords = MFI->getMaxMemoryClusterDWords();
  } else if (!BaseOps1.empty() || !BaseOps2.empty()) {
    // If only one base op is empty, they do not have the same base ptr.
    return false;
  }

  // In order to avoid register pressure, on average, the number of DWORDs
  // loaded together by all clustered mem ops should not exceed
  // MaxMemoryClusterDWords. This is an empirical value based on certain
  // observations and performance related experiments.
  // The good thing about this heuristic is that it avoids clustering too many
  // sub-word loads, and also avoids clustering wide loads. Below is a brief
  // summary of how the heuristic behaves for various `LoadSize` when
  // MaxMemoryClusterDWords is 8.
  //
  // (1) 1 <= LoadSize <= 4: cluster at max 8 mem ops
  // (2) 5 <= LoadSize <= 8: cluster at max 4 mem ops
  // (3) 9 <= LoadSize <= 12: cluster at max 2 mem ops
  // (4) 13 <= LoadSize <= 16: cluster at max 2 mem ops
  // (5) LoadSize >= 17: do not cluster
  const unsigned LoadSize = NumBytes / ClusterSize;
  const unsigned NumDWords = ((LoadSize + 3) / 4) * ClusterSize;
  return NumDWords <= MaxMemoryClusterDWords;
}

// FIXME: This behaves strangely. If, for example, you have 32 load + stores,
// the first 16 loads will be interleaved with the stores, and the next 16 will
// be clustered as expected. It should really split into two batches of 16
// stores.
//
// Loads are clustered until this returns false, rather than trying to schedule
// groups of stores. This also means we have to deal with saying different
// address space loads should be clustered, and ones which might cause bank
// conflicts.
//
// This might be deprecated so it might not be worth that much effort to fix.
bool SIInstrInfo::shouldScheduleLoadsNear(SDNode *Load0, SDNode *Load1,
                                          int64_t Offset0, int64_t Offset1,
                                          unsigned NumLoads) const {
  assert(Offset1 > Offset0 &&
         "Second offset should be larger than first offset!");
  // If we have fewer than 16 loads in a row, and the offsets are within 64
  // bytes, then schedule together.

  // A cacheline is 64 bytes (for global memory).
  return (NumLoads <= 16 && (Offset1 - Offset0) < 64);
}

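/// Report a copy with no legal lowering (e.g. VGPR to SGPR) as a
/// DiagnosticInfoUnsupported error and emit an SI_ILLEGAL_COPY pseudo in its
/// place so that code generation can continue.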
static void reportIllegalCopy(const SIInstrInfo *TII, MachineBasicBlock &MBB,
                              MachineBasicBlock::iterator MI,
                              const DebugLoc &DL, MCRegister DestReg,
                              MCRegister SrcReg, bool KillSrc,
                              const char *Msg = "illegal VGPR to SGPR copy") {
  MachineFunction *MF = MBB.getParent();

  LLVMContext &C = MF->getFunction().getContext();
  C.diagnose(DiagnosticInfoUnsupported(MF->getFunction(), Msg, DL, DS_Error));

  BuildMI(MBB, MI, DL, TII->get(AMDGPU::SI_ILLEGAL_COPY), DestReg)
      .addReg(SrcReg, getKillRegState(KillSrc));
}

/// Handle copying from SGPR to AGPR, or from AGPR to AGPR on GFX908. It is not
/// possible to have a direct copy in these cases on GFX908, so an intermediate
/// VGPR copy is required.
static void indirectCopyToAGPR(const SIInstrInfo &TII,
                               MachineBasicBlock &MBB,
                               MachineBasicBlock::iterator MI,
                               const DebugLoc &DL, MCRegister DestReg,
                               MCRegister SrcReg, bool KillSrc,
                               RegScavenger &RS, bool RegsOverlap,
                               Register ImpDefSuperReg = Register(),
                               Register ImpUseSuperReg = Register()) {
  assert((TII.getSubtarget().hasMAIInsts() &&
          !TII.getSubtarget().hasGFX90AInsts()) &&
         "Expected GFX908 subtarget.");

  assert((AMDGPU::SReg_32RegClass.contains(SrcReg) ||
          AMDGPU::AGPR_32RegClass.contains(SrcReg)) &&
         "Source register of the copy should be either an SGPR or an AGPR.");

  assert(AMDGPU::AGPR_32RegClass.contains(DestReg) &&
         "Destination register of the copy should be an AGPR.");

  const SIRegisterInfo &RI = TII.getRegisterInfo();

  // First try to find a defining accvgpr_write to avoid temporary registers.
  // In the case of copies of overlapping AGPRs, we conservatively do not
  // reuse previous accvgpr_writes. Otherwise, we may incorrectly pick up
  // an accvgpr_write used for this same copy due to implicit-defs.
  if (!RegsOverlap) {
    for (auto Def = MI, E = MBB.begin(); Def != E; ) {
      --Def;

      if (!Def->modifiesRegister(SrcReg, &RI))
        continue;

      if (Def->getOpcode() != AMDGPU::V_ACCVGPR_WRITE_B32_e64 ||
          Def->getOperand(0).getReg() != SrcReg)
        break;

      MachineOperand &DefOp = Def->getOperand(1);
      assert(DefOp.isReg() || DefOp.isImm());

      if (DefOp.isReg()) {
        bool SafeToPropagate = true;
        // Check that the register source operand is not clobbered before MI.
        // Immediate operands are always safe to propagate.
        for (auto I = Def; I != MI && SafeToPropagate; ++I)
          if (I->modifiesRegister(DefOp.getReg(), &RI))
            SafeToPropagate = false;

        if (!SafeToPropagate)
          break;

        for (auto I = Def; I != MI; ++I)
          I->clearRegisterKills(DefOp.getReg(), &RI);
      }

      MachineInstrBuilder Builder =
          BuildMI(MBB, MI, DL, TII.get(AMDGPU::V_ACCVGPR_WRITE_B32_e64), DestReg)
              .add(DefOp);
      if (ImpDefSuperReg)
        Builder.addReg(ImpDefSuperReg, RegState::Define | RegState::Implicit);

      if (ImpUseSuperReg) {
        Builder.addReg(ImpUseSuperReg,
                       getKillRegState(KillSrc) | RegState::Implicit);
      }

      return;
    }
  }

  RS.enterBasicBlockEnd(MBB);
  RS.backward(std::next(MI));

  // Ideally we want to have three registers for a long reg_sequence copy
  // to hide 2 waitstates between v_mov_b32 and accvgpr_write.
  unsigned MaxVGPRs = RI.getRegPressureLimit(&AMDGPU::VGPR_32RegClass,
                                             *MBB.getParent());

  // Registers in the sequence are allocated contiguously so we can just
  // use register number to pick one of three round-robin temps.
  unsigned RegNo = (DestReg - AMDGPU::AGPR0) % 3;
  Register Tmp =
      MBB.getParent()->getInfo<SIMachineFunctionInfo>()->getVGPRForAGPRCopy();
  assert(MBB.getParent()->getRegInfo().isReserved(Tmp) &&
         "VGPR used for an intermediate copy should have been reserved.");

  // Only loop through if there are any free registers left. We don't want to
  // spill.
  while (RegNo--) {
    Register Tmp2 = RS.scavengeRegisterBackwards(AMDGPU::VGPR_32RegClass, MI,
                                                 /* RestoreAfter */ false, 0,
                                                 /* AllowSpill */ false);
    if (!Tmp2 || RI.getHWRegIndex(Tmp2) >= MaxVGPRs)
      break;
    Tmp = Tmp2;
    RS.setRegUsed(Tmp);
  }

  // Insert copy to temporary VGPR.
  unsigned TmpCopyOp = AMDGPU::V_MOV_B32_e32;
  if (AMDGPU::AGPR_32RegClass.contains(SrcReg)) {
    TmpCopyOp = AMDGPU::V_ACCVGPR_READ_B32_e64;
  } else {
    assert(AMDGPU::SReg_32RegClass.contains(SrcReg));
  }

  MachineInstrBuilder UseBuilder =
      BuildMI(MBB, MI, DL, TII.get(TmpCopyOp), Tmp)
          .addReg(SrcReg, getKillRegState(KillSrc));
  if (ImpUseSuperReg) {
    UseBuilder.addReg(ImpUseSuperReg,
                      getKillRegState(KillSrc) | RegState::Implicit);
  }

  MachineInstrBuilder DefBuilder
    = BuildMI(MBB, MI, DL, TII.get(AMDGPU::V_ACCVGPR_WRITE_B32_e64), DestReg)
          .addReg(Tmp, RegState::Kill);

  if (ImpDefSuperReg)
    DefBuilder.addReg(ImpDefSuperReg, RegState::Define | RegState::Implicit);
}

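/// Expand a copy between (possibly wide) SGPR tuples into a sequence of
/// S_MOV_B32s over the 32-bit subregisters, merging adjacent even-aligned
/// subregisters into S_MOV_B64 where possible. \p Forward controls the order
/// in which the per-subregister moves are emitted, so that an overlapping
/// source is not clobbered before it is read.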
static void expandSGPRCopy(const SIInstrInfo &TII, MachineBasicBlock &MBB,
                           MachineBasicBlock::iterator MI, const DebugLoc &DL,
                           MCRegister DestReg, MCRegister SrcReg, bool KillSrc,
                           const TargetRegisterClass *RC, bool Forward) {
  const SIRegisterInfo &RI = TII.getRegisterInfo();
  ArrayRef<int16_t> BaseIndices = RI.getRegSplitParts(RC, 4);
  MachineBasicBlock::iterator I = MI;
  MachineInstr *FirstMI = nullptr, *LastMI = nullptr;

  for (unsigned Idx = 0; Idx < BaseIndices.size(); ++Idx) {
    int16_t SubIdx = BaseIndices[Idx];
    Register DestSubReg = RI.getSubReg(DestReg, SubIdx);
    Register SrcSubReg = RI.getSubReg(SrcReg, SubIdx);
    assert(DestSubReg && SrcSubReg && "Failed to find subregs!");
    unsigned Opcode = AMDGPU::S_MOV_B32;

    // Is the SGPR aligned? If so, try to combine with the next one.
    bool AlignedDest = ((DestSubReg - AMDGPU::SGPR0) % 2) == 0;
    bool AlignedSrc = ((SrcSubReg - AMDGPU::SGPR0) % 2) == 0;
    if (AlignedDest && AlignedSrc && (Idx + 1 < BaseIndices.size())) {
      // Can use an SGPR64 copy.
      unsigned Channel = RI.getChannelFromSubReg(SubIdx);
      SubIdx = RI.getSubRegFromChannel(Channel, 2);
      DestSubReg = RI.getSubReg(DestReg, SubIdx);
      SrcSubReg = RI.getSubReg(SrcReg, SubIdx);
      assert(DestSubReg && SrcSubReg && "Failed to find subregs!");
      Opcode = AMDGPU::S_MOV_B64;
      Idx++;
    }

    LastMI = BuildMI(MBB, I, DL, TII.get(Opcode), DestSubReg)
                 .addReg(SrcSubReg)
                 .addReg(SrcReg, RegState::Implicit);

    if (!FirstMI)
      FirstMI = LastMI;

    if (!Forward)
      I--;
  }

  assert(FirstMI && LastMI);
  if (!Forward)
    std::swap(FirstMI, LastMI);

  FirstMI->addOperand(
      MachineOperand::CreateReg(DestReg, true /*IsDef*/, true /*IsImp*/));

  if (KillSrc)
    LastMI->addRegisterKilled(SrcReg, &RI);
}

void SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
                              MachineBasicBlock::iterator MI,
                              const DebugLoc &DL, Register DestReg,
                              Register SrcReg, bool KillSrc, bool RenamableDest,
                              bool RenamableSrc) const {
  const TargetRegisterClass *RC = RI.getPhysRegBaseClass(DestReg);
  unsigned Size = RI.getRegSizeInBits(*RC);
  const TargetRegisterClass *SrcRC = RI.getPhysRegBaseClass(SrcReg);
  unsigned SrcSize = RI.getRegSizeInBits(*SrcRC);

  // The rest of copyPhysReg assumes Src and Dst are the same size.
  // TODO-GFX11_16BIT: once all true 16-bit instruction patterns are complete,
  // can we remove Fix16BitCopies and this code block?
  if (Fix16BitCopies) {
    if (((Size == 16) != (SrcSize == 16))) {
      // Non-VGPR Src and Dst will later be expanded back to 32 bits.
      assert(ST.useRealTrue16Insts());
      Register &RegToFix = (Size == 32) ? DestReg : SrcReg;
      MCRegister SubReg = RI.getSubReg(RegToFix, AMDGPU::lo16);
      RegToFix = SubReg;

      if (DestReg == SrcReg) {
        // Identity copy. Insert an empty bundle since ExpandPostRA expects an
        // instruction here.
        BuildMI(MBB, MI, DL, get(AMDGPU::BUNDLE));
        return;
      }
      RC = RI.getPhysRegBaseClass(DestReg);
      Size = RI.getRegSizeInBits(*RC);
      SrcRC = RI.getPhysRegBaseClass(SrcReg);
      SrcSize = RI.getRegSizeInBits(*SrcRC);
    }
  }

  if (RC == &AMDGPU::VGPR_32RegClass) {
    assert(AMDGPU::VGPR_32RegClass.contains(SrcReg) ||
           AMDGPU::SReg_32RegClass.contains(SrcReg) ||
           AMDGPU::AGPR_32RegClass.contains(SrcReg));
    unsigned Opc = AMDGPU::AGPR_32RegClass.contains(SrcReg) ?
                       AMDGPU::V_ACCVGPR_READ_B32_e64 : AMDGPU::V_MOV_B32_e32;
    BuildMI(MBB, MI, DL, get(Opc), DestReg)
        .addReg(SrcReg, getKillRegState(KillSrc));
    return;
  }

  if (RC == &AMDGPU::SReg_32_XM0RegClass ||
      RC == &AMDGPU::SReg_32RegClass) {
    if (SrcReg == AMDGPU::SCC) {
      BuildMI(MBB, MI, DL, get(AMDGPU::S_CSELECT_B32), DestReg)
          .addImm(1)
          .addImm(0);
      return;
    }

    if (!AMDGPU::SReg_32RegClass.contains(SrcReg)) {
      if (DestReg == AMDGPU::VCC_LO) {
        // FIXME: Hack until VReg_1 removed.
        assert(AMDGPU::VGPR_32RegClass.contains(SrcReg));
        BuildMI(MBB, MI, DL, get(AMDGPU::V_CMP_NE_U32_e32))
            .addImm(0)
            .addReg(SrcReg, getKillRegState(KillSrc));
        return;
      }

      reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc);
      return;
    }

    BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), DestReg)
        .addReg(SrcReg, getKillRegState(KillSrc));
    return;
  }

  if (RC == &AMDGPU::SReg_64RegClass) {
    if (SrcReg == AMDGPU::SCC) {
      BuildMI(MBB, MI, DL, get(AMDGPU::S_CSELECT_B64), DestReg)
          .addImm(1)
          .addImm(0);
      return;
    }

    if (!AMDGPU::SReg_64_EncodableRegClass.contains(SrcReg)) {
      if (DestReg == AMDGPU::VCC) {
        // FIXME: Hack until VReg_1 removed.
        assert(AMDGPU::VGPR_32RegClass.contains(SrcReg));
        BuildMI(MBB, MI, DL, get(AMDGPU::V_CMP_NE_U32_e32))
            .addImm(0)
            .addReg(SrcReg, getKillRegState(KillSrc));
        return;
      }

      reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc);
      return;
    }

    BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B64), DestReg)
        .addReg(SrcReg, getKillRegState(KillSrc));
    return;
  }

  if (DestReg == AMDGPU::SCC) {
    // Copying 64-bit or 32-bit sources to SCC barely makes sense,
    // but SelectionDAG emits such copies for i1 sources.
    if (AMDGPU::SReg_64RegClass.contains(SrcReg)) {
      // This copy can only be produced by patterns
      // with explicit SCC, which are known to be enabled
      // only for subtargets with S_CMP_LG_U64 present.
      assert(ST.hasScalarCompareEq64());
      BuildMI(MBB, MI, DL, get(AMDGPU::S_CMP_LG_U64))
          .addReg(SrcReg, getKillRegState(KillSrc))
          .addImm(0);
    } else {
      assert(AMDGPU::SReg_32RegClass.contains(SrcReg));
      BuildMI(MBB, MI, DL, get(AMDGPU::S_CMP_LG_U32))
          .addReg(SrcReg, getKillRegState(KillSrc))
          .addImm(0);
    }

    return;
  }

  if (RC == &AMDGPU::AGPR_32RegClass) {
    if (AMDGPU::VGPR_32RegClass.contains(SrcReg) ||
        (ST.hasGFX90AInsts() && AMDGPU::SReg_32RegClass.contains(SrcReg))) {
      BuildMI(MBB, MI, DL, get(AMDGPU::V_ACCVGPR_WRITE_B32_e64), DestReg)
          .addReg(SrcReg, getKillRegState(KillSrc));
      return;
    }

    if (AMDGPU::AGPR_32RegClass.contains(SrcReg) && ST.hasGFX90AInsts()) {
      BuildMI(MBB, MI, DL, get(AMDGPU::V_ACCVGPR_MOV_B32), DestReg)
          .addReg(SrcReg, getKillRegState(KillSrc));
      return;
    }

    // FIXME: Pass should maintain scavenger to avoid scan through the block on
    // every AGPR spill.
    RegScavenger RS;
    const bool Overlap = RI.regsOverlap(SrcReg, DestReg);
    indirectCopyToAGPR(*this, MBB, MI, DL, DestReg, SrcReg, KillSrc, RS,
                       Overlap);
    return;
  }

  if (Size == 16) {
    assert(AMDGPU::VGPR_16RegClass.contains(SrcReg) ||
           AMDGPU::SReg_LO16RegClass.contains(SrcReg) ||
           AMDGPU::AGPR_LO16RegClass.contains(SrcReg));

    bool IsSGPRDst = AMDGPU::SReg_LO16RegClass.contains(DestReg);
    bool IsSGPRSrc = AMDGPU::SReg_LO16RegClass.contains(SrcReg);
    bool IsAGPRDst = AMDGPU::AGPR_LO16RegClass.contains(DestReg);
    bool IsAGPRSrc = AMDGPU::AGPR_LO16RegClass.contains(SrcReg);
    bool DstLow = !AMDGPU::isHi16Reg(DestReg, RI);
    bool SrcLow = !AMDGPU::isHi16Reg(SrcReg, RI);
    MCRegister NewDestReg = RI.get32BitRegister(DestReg);
    MCRegister NewSrcReg = RI.get32BitRegister(SrcReg);

    if (IsSGPRDst) {
      if (!IsSGPRSrc) {
        reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc);
        return;
      }

      BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), NewDestReg)
          .addReg(NewSrcReg, getKillRegState(KillSrc));
      return;
    }

    if (IsAGPRDst || IsAGPRSrc) {
      if (!DstLow || !SrcLow) {
        reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc,
                          "Cannot use hi16 subreg with an AGPR!");
      }

      copyPhysReg(MBB, MI, DL, NewDestReg, NewSrcReg, KillSrc);
      return;
    }

    if (ST.useRealTrue16Insts()) {
      if (IsSGPRSrc) {
        assert(SrcLow);
        SrcReg = NewSrcReg;
      }
      // Use the smaller instruction encoding if possible.
      if (AMDGPU::VGPR_16_Lo128RegClass.contains(DestReg) &&
          (IsSGPRSrc || AMDGPU::VGPR_16_Lo128RegClass.contains(SrcReg))) {
        BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B16_t16_e32), DestReg)
            .addReg(SrcReg);
      } else {
        BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B16_t16_e64), DestReg)
            .addImm(0) // src0_modifiers
            .addReg(SrcReg)
            .addImm(0); // op_sel
      }
      return;
    }

    if (IsSGPRSrc && !ST.hasSDWAScalar()) {
      if (!DstLow || !SrcLow) {
        reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc,
                          "Cannot use hi16 subreg on VI!");
      }

      BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), NewDestReg)
          .addReg(NewSrcReg, getKillRegState(KillSrc));
      return;
    }

    auto MIB = BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_sdwa), NewDestReg)
                   .addImm(0) // src0_modifiers
                   .addReg(NewSrcReg)
                   .addImm(0) // clamp
                   .addImm(DstLow ? AMDGPU::SDWA::SdwaSel::WORD_0
                                  : AMDGPU::SDWA::SdwaSel::WORD_1)
                   .addImm(AMDGPU::SDWA::DstUnused::UNUSED_PRESERVE)
                   .addImm(SrcLow ? AMDGPU::SDWA::SdwaSel::WORD_0
                                  : AMDGPU::SDWA::SdwaSel::WORD_1)
                   .addReg(NewDestReg, RegState::Implicit | RegState::Undef);
    // First implicit operand is $exec.
    MIB->tieOperands(0, MIB->getNumOperands() - 1);
    return;
  }

  if (RC == RI.getVGPR64Class() && (SrcRC == RC || RI.isSGPRClass(SrcRC))) {
    if (ST.hasMovB64()) {
      BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B64_e32), DestReg)
          .addReg(SrcReg, getKillRegState(KillSrc));
      return;
    }
    if (ST.hasPkMovB32()) {
      BuildMI(MBB, MI, DL, get(AMDGPU::V_PK_MOV_B32), DestReg)
          .addImm(SISrcMods::OP_SEL_1)
          .addReg(SrcReg)
          .addImm(SISrcMods::OP_SEL_0 | SISrcMods::OP_SEL_1)
          .addReg(SrcReg)
          .addImm(0) // op_sel_lo
          .addImm(0) // op_sel_hi
          .addImm(0) // neg_lo
          .addImm(0) // neg_hi
          .addImm(0) // clamp
          .addReg(SrcReg, getKillRegState(KillSrc) | RegState::Implicit);
      return;
    }
  }

  const bool Forward = RI.getHWRegIndex(DestReg) <= RI.getHWRegIndex(SrcReg);
  if (RI.isSGPRClass(RC)) {
    if (!RI.isSGPRClass(SrcRC)) {
      reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc);
      return;
    }
    const bool CanKillSuperReg = KillSrc && !RI.regsOverlap(SrcReg, DestReg);
    expandSGPRCopy(*this, MBB, MI, DL, DestReg, SrcReg, CanKillSuperReg, RC,
                   Forward);
    return;
  }

  unsigned EltSize = 4;
  unsigned Opcode = AMDGPU::V_MOV_B32_e32;
  if (RI.isAGPRClass(RC)) {
    if (ST.hasGFX90AInsts() && RI.isAGPRClass(SrcRC))
      Opcode = AMDGPU::V_ACCVGPR_MOV_B32;
    else if (RI.hasVGPRs(SrcRC) ||
             (ST.hasGFX90AInsts() && RI.isSGPRClass(SrcRC)))
      Opcode = AMDGPU::V_ACCVGPR_WRITE_B32_e64;
    else
      Opcode = AMDGPU::INSTRUCTION_LIST_END;
  } else if (RI.hasVGPRs(RC) && RI.isAGPRClass(SrcRC)) {
    Opcode = AMDGPU::V_ACCVGPR_READ_B32_e64;
  } else if ((Size % 64 == 0) && RI.hasVGPRs(RC) &&
             (RI.isProperlyAlignedRC(*RC) &&
              (SrcRC == RC || RI.isSGPRClass(SrcRC)))) {
    // TODO: In 96-bit case, could do a 64-bit mov and then a 32-bit mov.
    if (ST.hasMovB64()) {
      Opcode = AMDGPU::V_MOV_B64_e32;
      EltSize = 8;
    } else if (ST.hasPkMovB32()) {
      Opcode = AMDGPU::V_PK_MOV_B32;
      EltSize = 8;
    }
  }

  // For the cases where we need an intermediate instruction/temporary register
  // (destination is an AGPR), we need a scavenger.
  //
  // FIXME: The pass should maintain this for us so we don't have to re-scan the
  // whole block for every handled copy.
  std::unique_ptr<RegScavenger> RS;
  if (Opcode == AMDGPU::INSTRUCTION_LIST_END)
    RS = std::make_unique<RegScavenger>();

  ArrayRef<int16_t> SubIndices = RI.getRegSplitParts(RC, EltSize);

  // If there is an overlap, we can't kill the super-register on the last
  // instruction, since it will also kill the components made live by this def.
  const bool Overlap = RI.regsOverlap(SrcReg, DestReg);
  const bool CanKillSuperReg = KillSrc && !Overlap;

  for (unsigned Idx = 0; Idx < SubIndices.size(); ++Idx) {
    unsigned SubIdx;
    if (Forward)
      SubIdx = SubIndices[Idx];
    else
      SubIdx = SubIndices[SubIndices.size() - Idx - 1];
    Register DestSubReg = RI.getSubReg(DestReg, SubIdx);
    Register SrcSubReg = RI.getSubReg(SrcReg, SubIdx);
    assert(DestSubReg && SrcSubReg && "Failed to find subregs!");

    bool IsFirstSubreg = Idx == 0;
    bool UseKill = CanKillSuperReg && Idx == SubIndices.size() - 1;

    if (Opcode == AMDGPU::INSTRUCTION_LIST_END) {
      Register ImpDefSuper = IsFirstSubreg ? Register(DestReg) : Register();
      Register ImpUseSuper = SrcReg;
      indirectCopyToAGPR(*this, MBB, MI, DL, DestSubReg, SrcSubReg, UseKill,
                         *RS, Overlap, ImpDefSuper, ImpUseSuper);
    } else if (Opcode == AMDGPU::V_PK_MOV_B32) {
      MachineInstrBuilder MIB =
          BuildMI(MBB, MI, DL, get(AMDGPU::V_PK_MOV_B32), DestSubReg)
              .addImm(SISrcMods::OP_SEL_1)
              .addReg(SrcSubReg)
              .addImm(SISrcMods::OP_SEL_0 | SISrcMods::OP_SEL_1)
              .addReg(SrcSubReg)
              .addImm(0) // op_sel_lo
              .addImm(0) // op_sel_hi
              .addImm(0) // neg_lo
              .addImm(0) // neg_hi
              .addImm(0) // clamp
              .addReg(SrcReg, getKillRegState(UseKill) | RegState::Implicit);
      if (IsFirstSubreg)
        MIB.addReg(DestReg, RegState::Define | RegState::Implicit);
    } else {
      MachineInstrBuilder Builder =
          BuildMI(MBB, MI, DL, get(Opcode), DestSubReg).addReg(SrcSubReg);
      if (IsFirstSubreg)
        Builder.addReg(DestReg, RegState::Define | RegState::Implicit);

      Builder.addReg(SrcReg, getKillRegState(UseKill) | RegState::Implicit);
    }
  }
}

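// Map an opcode to its commuted ("REV") form, or from a REV form back to the
// original. Returns -1 if the mapped opcode has no valid encoding on this
// subtarget, and the opcode unchanged if no commute mapping exists at all.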
int SIInstrInfo::commuteOpcode(unsigned Opcode) const {
  int32_t NewOpc;

  // Try to map original to commuted opcode
  NewOpc = AMDGPU::getCommuteRev(Opcode);
  if (NewOpc != -1)
    // Check if the commuted (REV) opcode exists on the target.
    return pseudoToMCOpcode(NewOpc) != -1 ? NewOpc : -1;

  // Try to map commuted to original opcode
  NewOpc = AMDGPU::getCommuteOrig(Opcode);
  if (NewOpc != -1)
    // Check if the original (non-REV) opcode exists on the target.
    return pseudoToMCOpcode(NewOpc) != -1 ? NewOpc : -1;

  return Opcode;
}

const TargetRegisterClass *
SIInstrInfo::getPreferredSelectRegClass(unsigned Size) const {
  return &AMDGPU::VGPR_32RegClass;
}

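// Lower a select into V_CNDMASK_B32. A one-operand condition is a lane mask
// used directly; a two-operand condition encodes a predicate (SCC, VCC, or
// EXEC based) that is first materialized into a lane-mask register.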
void SIInstrInfo::insertVectorSelect(MachineBasicBlock &MBB,
                                     MachineBasicBlock::iterator I,
                                     const DebugLoc &DL, Register DstReg,
                                     ArrayRef<MachineOperand> Cond,
                                     Register TrueReg,
                                     Register FalseReg) const {
  MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
  const TargetRegisterClass *BoolXExecRC = RI.getWaveMaskRegClass();
  const AMDGPU::LaneMaskConstants &LMC = AMDGPU::LaneMaskConstants::get(ST);
  assert(MRI.getRegClass(DstReg) == &AMDGPU::VGPR_32RegClass &&
         "Not a VGPR32 reg");

  if (Cond.size() == 1) {
    Register SReg = MRI.createVirtualRegister(BoolXExecRC);
    BuildMI(MBB, I, DL, get(AMDGPU::COPY), SReg)
        .add(Cond[0]);
    BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
        .addImm(0)
        .addReg(FalseReg)
        .addImm(0)
        .addReg(TrueReg)
        .addReg(SReg);
  } else if (Cond.size() == 2) {
    assert(Cond[0].isImm() && "Cond[0] is not an immediate");
    switch (Cond[0].getImm()) {
    case SIInstrInfo::SCC_TRUE: {
      Register SReg = MRI.createVirtualRegister(BoolXExecRC);
      BuildMI(MBB, I, DL, get(LMC.CSelectOpc), SReg).addImm(1).addImm(0);
      BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
          .addImm(0)
          .addReg(FalseReg)
          .addImm(0)
          .addReg(TrueReg)
          .addReg(SReg);
      break;
    }
    case SIInstrInfo::SCC_FALSE: {
      Register SReg = MRI.createVirtualRegister(BoolXExecRC);
      BuildMI(MBB, I, DL, get(LMC.CSelectOpc), SReg).addImm(0).addImm(1);
      BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
          .addImm(0)
          .addReg(FalseReg)
          .addImm(0)
          .addReg(TrueReg)
          .addReg(SReg);
      break;
    }
    case SIInstrInfo::VCCNZ: {
      MachineOperand RegOp = Cond[1];
      RegOp.setImplicit(false);
      Register SReg = MRI.createVirtualRegister(BoolXExecRC);
      BuildMI(MBB, I, DL, get(AMDGPU::COPY), SReg)
          .add(RegOp);
      BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
          .addImm(0)
          .addReg(FalseReg)
          .addImm(0)
          .addReg(TrueReg)
          .addReg(SReg);
      break;
    }
    case SIInstrInfo::VCCZ: {
      MachineOperand RegOp = Cond[1];
      RegOp.setImplicit(false);
      Register SReg = MRI.createVirtualRegister(BoolXExecRC);
      BuildMI(MBB, I, DL, get(AMDGPU::COPY), SReg)
          .add(RegOp);
      BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
          .addImm(0)
          .addReg(TrueReg)
          .addImm(0)
          .addReg(FalseReg)
          .addReg(SReg);
      break;
    }
    case SIInstrInfo::EXECNZ: {
      Register SReg = MRI.createVirtualRegister(BoolXExecRC);
      Register SReg2 = MRI.createVirtualRegister(RI.getBoolRC());
      BuildMI(MBB, I, DL, get(LMC.OrSaveExecOpc), SReg2).addImm(0);
      BuildMI(MBB, I, DL, get(LMC.CSelectOpc), SReg).addImm(1).addImm(0);
      BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
          .addImm(0)
          .addReg(FalseReg)
          .addImm(0)
          .addReg(TrueReg)
          .addReg(SReg);
      break;
    }
    case SIInstrInfo::EXECZ: {
      Register SReg = MRI.createVirtualRegister(BoolXExecRC);
      Register SReg2 = MRI.createVirtualRegister(RI.getBoolRC());
      BuildMI(MBB, I, DL, get(LMC.OrSaveExecOpc), SReg2).addImm(0);
      BuildMI(MBB, I, DL, get(LMC.CSelectOpc), SReg).addImm(0).addImm(1);
      BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
          .addImm(0)
          .addReg(FalseReg)
          .addImm(0)
          .addReg(TrueReg)
          .addReg(SReg);
      llvm_unreachable("Unhandled branch predicate EXECZ");
      break;
    }
    default:
      llvm_unreachable("invalid branch predicate");
    }
  } else {
    llvm_unreachable("Can only handle Cond size 1 or 2");
  }
}

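/// Emit a V_CMP_EQ_I32 comparing \p SrcReg against the immediate \p Value and
/// return the virtual lane-mask register holding the result; insertNE below
/// is the analogous V_CMP_NE_I32 helper.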
Register SIInstrInfo::insertEQ(MachineBasicBlock *MBB,
                               MachineBasicBlock::iterator I,
                               const DebugLoc &DL,
                               Register SrcReg, int Value) const {
  MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
  Register Reg = MRI.createVirtualRegister(RI.getBoolRC());
  BuildMI(*MBB, I, DL, get(AMDGPU::V_CMP_EQ_I32_e64), Reg)
      .addImm(Value)
      .addReg(SrcReg);

  return Reg;
}

Register SIInstrInfo::insertNE(MachineBasicBlock *MBB,
                               MachineBasicBlock::iterator I,
                               const DebugLoc &DL,
                               Register SrcReg, int Value) const {
  MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
  Register Reg = MRI.createVirtualRegister(RI.getBoolRC());
  BuildMI(*MBB, I, DL, get(AMDGPU::V_CMP_NE_I32_e64), Reg)
      .addImm(Value)
      .addReg(SrcReg);

  return Reg;
}

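// Recognize instructions that materialize a compile-time constant into
// \p Reg: plain immediate moves, plus bit-reverse and bitwise-not of an
// immediate source, returning the resulting constant through \p ImmVal.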
bool SIInstrInfo::getConstValDefinedInReg(const MachineInstr &MI,
                                          const Register Reg,
                                          int64_t &ImmVal) const {
  switch (MI.getOpcode()) {
  case AMDGPU::V_MOV_B32_e32:
  case AMDGPU::S_MOV_B32:
  case AMDGPU::S_MOVK_I32:
  case AMDGPU::S_MOV_B64:
  case AMDGPU::V_MOV_B64_e32:
  case AMDGPU::V_ACCVGPR_WRITE_B32_e64:
  case AMDGPU::AV_MOV_B32_IMM_PSEUDO:
  case AMDGPU::AV_MOV_B64_IMM_PSEUDO:
  case AMDGPU::S_MOV_B64_IMM_PSEUDO:
  case AMDGPU::V_MOV_B64_PSEUDO:
  case AMDGPU::V_MOV_B16_t16_e32: {
    const MachineOperand &Src0 = MI.getOperand(1);
    if (Src0.isImm()) {
      ImmVal = Src0.getImm();
      return MI.getOperand(0).getReg() == Reg;
    }

    return false;
  }
  case AMDGPU::V_MOV_B16_t16_e64: {
    const MachineOperand &Src0 = MI.getOperand(2);
    if (Src0.isImm() && !MI.getOperand(1).getImm()) {
      ImmVal = Src0.getImm();
      return MI.getOperand(0).getReg() == Reg;
    }

    return false;
  }
  case AMDGPU::S_BREV_B32:
  case AMDGPU::V_BFREV_B32_e32:
  case AMDGPU::V_BFREV_B32_e64: {
    const MachineOperand &Src0 = MI.getOperand(1);
    if (Src0.isImm()) {
      ImmVal = static_cast<int64_t>(reverseBits<int32_t>(Src0.getImm()));
      return MI.getOperand(0).getReg() == Reg;
    }

    return false;
  }
  case AMDGPU::S_NOT_B32:
  case AMDGPU::V_NOT_B32_e32:
  case AMDGPU::V_NOT_B32_e64: {
    const MachineOperand &Src0 = MI.getOperand(1);
    if (Src0.isImm()) {
      ImmVal = static_cast<int64_t>(~static_cast<int32_t>(Src0.getImm()));
      return MI.getOperand(0).getReg() == Reg;
    }

    return false;
  }
  default:
    return false;
  }
}

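// If \p Op is an immediate, return it directly; if it is a virtual register
// defined by a move-immediate, return the immediate that reaches it (adjusted
// for any subregister index on the use).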
std::optional<int64_t>
SIInstrInfo::getImmOrMaterializedImm(MachineOperand &Op) const {
  if (Op.isImm())
    return Op.getImm();

  if (!Op.isReg() || !Op.getReg().isVirtual())
    return std::nullopt;
  MachineRegisterInfo &MRI = Op.getParent()->getMF()->getRegInfo();
  const MachineInstr *Def = MRI.getVRegDef(Op.getReg());
  if (Def && Def->isMoveImmediate()) {
    const MachineOperand &ImmSrc = Def->getOperand(1);
    if (ImmSrc.isImm())
      return extractSubregFromImm(ImmSrc.getImm(), Op.getSubReg());
  }

  return std::nullopt;
}

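// Pick a move opcode suited to \p DstRC: scalar vs. vector forms in 16-, 32-,
// and 64-bit widths, falling back to COPY where no single mov applies (e.g.
// AGPR destinations).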
unsigned SIInstrInfo::getMovOpcode(const TargetRegisterClass *DstRC) const {
  if (RI.isAGPRClass(DstRC))
    return AMDGPU::COPY;
  if (RI.getRegSizeInBits(*DstRC) == 16) {
    // Assume hi bits are unneeded. Only _e64 true16 instructions are legal
    // before RA.
    return RI.isSGPRClass(DstRC) ? AMDGPU::COPY : AMDGPU::V_MOV_B16_t16_e64;
  }
  if (RI.getRegSizeInBits(*DstRC) == 32)
    return RI.isSGPRClass(DstRC) ? AMDGPU::S_MOV_B32 : AMDGPU::V_MOV_B32_e32;
  if (RI.getRegSizeInBits(*DstRC) == 64 && RI.isSGPRClass(DstRC))
    return AMDGPU::S_MOV_B64;
  if (RI.getRegSizeInBits(*DstRC) == 64 && !RI.isSGPRClass(DstRC))
    return AMDGPU::V_MOV_B64_PSEUDO;
  return AMDGPU::COPY;
}

const MCInstrDesc &
SIInstrInfo::getIndirectGPRIDXPseudo(unsigned VecSize,
                                     bool IsIndirectSrc) const {
  if (IsIndirectSrc) {
    if (VecSize <= 32) // 4 bytes
      return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V1);
    if (VecSize <= 64) // 8 bytes
      return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V2);
    if (VecSize <= 96) // 12 bytes
      return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V3);
    if (VecSize <= 128) // 16 bytes
      return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V4);
    if (VecSize <= 160) // 20 bytes
      return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V5);
    if (VecSize <= 192) // 24 bytes
      return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V6);
    if (VecSize <= 224) // 28 bytes
      return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V7);
    if (VecSize <= 256) // 32 bytes
      return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V8);
    if (VecSize <= 288) // 36 bytes
      return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V9);
    if (VecSize <= 320) // 40 bytes
      return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V10);
    if (VecSize <= 352) // 44 bytes
      return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V11);
    if (VecSize <= 384) // 48 bytes
      return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V12);
    if (VecSize <= 512) // 64 bytes
      return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V16);
    if (VecSize <= 1024) // 128 bytes
      return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V32);

    llvm_unreachable("unsupported size for IndirectRegReadGPRIDX pseudos");
  }

  if (VecSize <= 32) // 4 bytes
    return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V1);
  if (VecSize <= 64) // 8 bytes
    return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V2);
  if (VecSize <= 96) // 12 bytes
    return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V3);
  if (VecSize <= 128) // 16 bytes
    return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V4);
  if (VecSize <= 160) // 20 bytes
    return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V5);
  if (VecSize <= 192) // 24 bytes
    return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V6);
  if (VecSize <= 224) // 28 bytes
    return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V7);
  if (VecSize <= 256) // 32 bytes
    return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V8);
  if (VecSize <= 288) // 36 bytes
    return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V9);
  if (VecSize <= 320) // 40 bytes
    return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V10);
  if (VecSize <= 352) // 44 bytes
    return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V11);
  if (VecSize <= 384) // 48 bytes
    return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V12);
  if (VecSize <= 512) // 64 bytes
    return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V16);
  if (VecSize <= 1024) // 128 bytes
    return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V32);

  llvm_unreachable("unsupported size for IndirectRegWriteGPRIDX pseudos");
}

static unsigned getIndirectVGPRWriteMovRelPseudoOpc(unsigned VecSize) {
  if (VecSize <= 32) // 4 bytes
    return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V1;
  if (VecSize <= 64) // 8 bytes
    return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V2;
  if (VecSize <= 96) // 12 bytes
    return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V3;
  if (VecSize <= 128) // 16 bytes
    return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V4;
  if (VecSize <= 160) // 20 bytes
    return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V5;
  if (VecSize <= 192) // 24 bytes
    return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V6;
  if (VecSize <= 224) // 28 bytes
    return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V7;
  if (VecSize <= 256) // 32 bytes
    return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V8;
  if (VecSize <= 288) // 36 bytes
    return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V9;
  if (VecSize <= 320) // 40 bytes
    return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V10;
  if (VecSize <= 352) // 44 bytes
    return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V11;
  if (VecSize <= 384) // 48 bytes
    return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V12;
  if (VecSize <= 512) // 64 bytes
    return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V16;
  if (VecSize <= 1024) // 128 bytes
    return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V32;

  llvm_unreachable("unsupported size for IndirectRegWrite pseudos");
}

static unsigned getIndirectSGPRWriteMovRelPseudo32(unsigned VecSize) {
  if (VecSize <= 32) // 4 bytes
    return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V1;
  if (VecSize <= 64) // 8 bytes
    return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V2;
  if (VecSize <= 96) // 12 bytes
    return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V3;
  if (VecSize <= 128) // 16 bytes
    return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V4;
  if (VecSize <= 160) // 20 bytes
    return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V5;
  if (VecSize <= 192) // 24 bytes
    return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V6;
  if (VecSize <= 224) // 28 bytes
    return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V7;
  if (VecSize <= 256) // 32 bytes
    return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V8;
  if (VecSize <= 288) // 36 bytes
    return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V9;
  if (VecSize <= 320) // 40 bytes
    return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V10;
  if (VecSize <= 352) // 44 bytes
    return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V11;
  if (VecSize <= 384) // 48 bytes
    return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V12;
  if (VecSize <= 512) // 64 bytes
    return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V16;
1543 if (VecSize <= 1024) // 128 bytes
1544 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V32;
1545
1546 llvm_unreachable("unsupported size for IndirectRegWrite pseudos");
1547}
1548
1549static unsigned getIndirectSGPRWriteMovRelPseudo64(unsigned VecSize) {
1550 if (VecSize <= 64) // 8 bytes
1551 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V1;
1552 if (VecSize <= 128) // 16 bytes
1553 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V2;
1554 if (VecSize <= 256) // 32 bytes
1555 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V4;
1556 if (VecSize <= 512) // 64 bytes
1557 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V8;
1558 if (VecSize <= 1024) // 128 bytes
1559 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V16;
1560
1561 llvm_unreachable("unsupported size for IndirectRegWrite pseudos");
1562}
1563
1564const MCInstrDesc &
1565SIInstrInfo::getIndirectRegWriteMovRelPseudo(unsigned VecSize, unsigned EltSize,
1566 bool IsSGPR) const {
1567 if (IsSGPR) {
1568 switch (EltSize) {
1569 case 32:
1570 return get(Opcode: getIndirectSGPRWriteMovRelPseudo32(VecSize));
1571 case 64:
1572 return get(Opcode: getIndirectSGPRWriteMovRelPseudo64(VecSize));
1573 default:
1574 llvm_unreachable("invalid reg indexing elt size");
1575 }
1576 }
1577
1578 assert(EltSize == 32 && "invalid reg indexing elt size");
1579 return get(Opcode: getIndirectVGPRWriteMovRelPseudoOpc(VecSize));
1580}
1581
1582static unsigned getSGPRSpillSaveOpcode(unsigned Size) {
1583 switch (Size) {
1584 case 4:
1585 return AMDGPU::SI_SPILL_S32_SAVE;
1586 case 8:
1587 return AMDGPU::SI_SPILL_S64_SAVE;
1588 case 12:
1589 return AMDGPU::SI_SPILL_S96_SAVE;
1590 case 16:
1591 return AMDGPU::SI_SPILL_S128_SAVE;
1592 case 20:
1593 return AMDGPU::SI_SPILL_S160_SAVE;
1594 case 24:
1595 return AMDGPU::SI_SPILL_S192_SAVE;
1596 case 28:
1597 return AMDGPU::SI_SPILL_S224_SAVE;
1598 case 32:
1599 return AMDGPU::SI_SPILL_S256_SAVE;
1600 case 36:
1601 return AMDGPU::SI_SPILL_S288_SAVE;
1602 case 40:
1603 return AMDGPU::SI_SPILL_S320_SAVE;
1604 case 44:
1605 return AMDGPU::SI_SPILL_S352_SAVE;
1606 case 48:
1607 return AMDGPU::SI_SPILL_S384_SAVE;
1608 case 64:
1609 return AMDGPU::SI_SPILL_S512_SAVE;
1610 case 128:
1611 return AMDGPU::SI_SPILL_S1024_SAVE;
1612 default:
1613 llvm_unreachable("unknown register size");
1614 }
1615}
1616
1617static unsigned getVGPRSpillSaveOpcode(unsigned Size) {
1618 switch (Size) {
1619 case 2:
1620 return AMDGPU::SI_SPILL_V16_SAVE;
1621 case 4:
1622 return AMDGPU::SI_SPILL_V32_SAVE;
1623 case 8:
1624 return AMDGPU::SI_SPILL_V64_SAVE;
1625 case 12:
1626 return AMDGPU::SI_SPILL_V96_SAVE;
1627 case 16:
1628 return AMDGPU::SI_SPILL_V128_SAVE;
1629 case 20:
1630 return AMDGPU::SI_SPILL_V160_SAVE;
1631 case 24:
1632 return AMDGPU::SI_SPILL_V192_SAVE;
1633 case 28:
1634 return AMDGPU::SI_SPILL_V224_SAVE;
1635 case 32:
1636 return AMDGPU::SI_SPILL_V256_SAVE;
1637 case 36:
1638 return AMDGPU::SI_SPILL_V288_SAVE;
1639 case 40:
1640 return AMDGPU::SI_SPILL_V320_SAVE;
1641 case 44:
1642 return AMDGPU::SI_SPILL_V352_SAVE;
1643 case 48:
1644 return AMDGPU::SI_SPILL_V384_SAVE;
1645 case 64:
1646 return AMDGPU::SI_SPILL_V512_SAVE;
1647 case 128:
1648 return AMDGPU::SI_SPILL_V1024_SAVE;
1649 default:
1650 llvm_unreachable("unknown register size");
1651 }
1652}
1653
1654static unsigned getAVSpillSaveOpcode(unsigned Size) {
1655 switch (Size) {
1656 case 4:
1657 return AMDGPU::SI_SPILL_AV32_SAVE;
1658 case 8:
1659 return AMDGPU::SI_SPILL_AV64_SAVE;
1660 case 12:
1661 return AMDGPU::SI_SPILL_AV96_SAVE;
1662 case 16:
1663 return AMDGPU::SI_SPILL_AV128_SAVE;
1664 case 20:
1665 return AMDGPU::SI_SPILL_AV160_SAVE;
1666 case 24:
1667 return AMDGPU::SI_SPILL_AV192_SAVE;
1668 case 28:
1669 return AMDGPU::SI_SPILL_AV224_SAVE;
1670 case 32:
1671 return AMDGPU::SI_SPILL_AV256_SAVE;
1672 case 36:
1673 return AMDGPU::SI_SPILL_AV288_SAVE;
1674 case 40:
1675 return AMDGPU::SI_SPILL_AV320_SAVE;
1676 case 44:
1677 return AMDGPU::SI_SPILL_AV352_SAVE;
1678 case 48:
1679 return AMDGPU::SI_SPILL_AV384_SAVE;
1680 case 64:
1681 return AMDGPU::SI_SPILL_AV512_SAVE;
1682 case 128:
1683 return AMDGPU::SI_SPILL_AV1024_SAVE;
1684 default:
1685 llvm_unreachable("unknown register size");
1686 }
1687}
1688
1689static unsigned getWWMRegSpillSaveOpcode(unsigned Size,
1690 bool IsVectorSuperClass) {
1691 // Currently, only 32-bit WWM register spills are needed.
1692 if (Size != 4)
1693 llvm_unreachable("unknown wwm register spill size");
1694
1695 if (IsVectorSuperClass)
1696 return AMDGPU::SI_SPILL_WWM_AV32_SAVE;
1697
1698 return AMDGPU::SI_SPILL_WWM_V32_SAVE;
1699}
1700
1701unsigned SIInstrInfo::getVectorRegSpillSaveOpcode(
1702 Register Reg, const TargetRegisterClass *RC, unsigned Size,
1703 const SIMachineFunctionInfo &MFI) const {
1704 bool IsVectorSuperClass = RI.isVectorSuperClass(RC);
1705
1706 // Choose the right opcode if spilling a WWM register.
1707 if (MFI.checkFlag(Reg, Flag: AMDGPU::VirtRegFlag::WWM_REG))
1708 return getWWMRegSpillSaveOpcode(Size, IsVectorSuperClass);
1709
1710 // TODO: Check if AGPRs are available
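 // With MAI instructions present, the value may end up in either an AGPR or
 // a VGPR, so use the AV spill pseudos, which accept both register files.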
1711 if (ST.hasMAIInsts())
1712 return getAVSpillSaveOpcode(Size);
1713
1714 return getVGPRSpillSaveOpcode(Size);
1715}
1716
1717void SIInstrInfo::storeRegToStackSlot(
1718 MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, Register SrcReg,
1719 bool isKill, int FrameIndex, const TargetRegisterClass *RC, Register VReg,
1720 MachineInstr::MIFlag Flags) const {
1721 MachineFunction *MF = MBB.getParent();
1722 SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
1723 MachineFrameInfo &FrameInfo = MF->getFrameInfo();
1724 const DebugLoc &DL = MBB.findDebugLoc(MBBI: MI);
1725
1726 MachinePointerInfo PtrInfo
1727 = MachinePointerInfo::getFixedStack(MF&: *MF, FI: FrameIndex);
1728 MachineMemOperand *MMO = MF->getMachineMemOperand(
1729 PtrInfo, F: MachineMemOperand::MOStore, Size: FrameInfo.getObjectSize(ObjectIdx: FrameIndex),
1730 BaseAlignment: FrameInfo.getObjectAlign(ObjectIdx: FrameIndex));
1731 unsigned SpillSize = RI.getSpillSize(RC: *RC);
1732
1733 MachineRegisterInfo &MRI = MF->getRegInfo();
1734 if (RI.isSGPRClass(RC)) {
1735 MFI->setHasSpilledSGPRs();
1736 assert(SrcReg != AMDGPU::M0 && "m0 should not be spilled");
1737 assert(SrcReg != AMDGPU::EXEC_LO && SrcReg != AMDGPU::EXEC_HI &&
1738 SrcReg != AMDGPU::EXEC && "exec should not be spilled");
1739
1740 // We are only allowed to create one new instruction when spilling
1741 // registers, so we need to use a pseudo instruction for spilling SGPRs.
1742 const MCInstrDesc &OpDesc = get(Opcode: getSGPRSpillSaveOpcode(Size: SpillSize));
1743
1744 // The SGPR spill/restore instructions only work on numbered SGPRs (not M0
1745 // or EXEC), so we need to make sure we are using the correct register class.
1746 if (SrcReg.isVirtual() && SpillSize == 4) {
1747 MRI.constrainRegClass(Reg: SrcReg, RC: &AMDGPU::SReg_32_XM0_XEXECRegClass);
1748 }
1749
1750 BuildMI(BB&: MBB, I: MI, MIMD: DL, MCID: OpDesc)
1751 .addReg(RegNo: SrcReg, Flags: getKillRegState(B: isKill)) // data
1752 .addFrameIndex(Idx: FrameIndex) // addr
1753 .addMemOperand(MMO)
1754 .addReg(RegNo: MFI->getStackPtrOffsetReg(), Flags: RegState::Implicit);
1755
1756 if (RI.spillSGPRToVGPR())
1757 FrameInfo.setStackID(ObjectIdx: FrameIndex, ID: TargetStackID::SGPRSpill);
1758 return;
1759 }
1760
1761 unsigned Opcode =
1762 getVectorRegSpillSaveOpcode(Reg: VReg ? VReg : SrcReg, RC, Size: SpillSize, MFI: *MFI);
1763 MFI->setHasSpilledVGPRs();
1764
1765 BuildMI(BB&: MBB, I: MI, MIMD: DL, MCID: get(Opcode))
1766 .addReg(RegNo: SrcReg, Flags: getKillRegState(B: isKill)) // data
1767 .addFrameIndex(Idx: FrameIndex) // addr
1768 .addReg(RegNo: MFI->getStackPtrOffsetReg()) // scratch_offset
1769 .addImm(Val: 0) // offset
1770 .addMemOperand(MMO);
1771}
1772
1773static unsigned getSGPRSpillRestoreOpcode(unsigned Size) {
1774 switch (Size) {
1775 case 4:
1776 return AMDGPU::SI_SPILL_S32_RESTORE;
1777 case 8:
1778 return AMDGPU::SI_SPILL_S64_RESTORE;
1779 case 12:
1780 return AMDGPU::SI_SPILL_S96_RESTORE;
1781 case 16:
1782 return AMDGPU::SI_SPILL_S128_RESTORE;
1783 case 20:
1784 return AMDGPU::SI_SPILL_S160_RESTORE;
1785 case 24:
1786 return AMDGPU::SI_SPILL_S192_RESTORE;
1787 case 28:
1788 return AMDGPU::SI_SPILL_S224_RESTORE;
1789 case 32:
1790 return AMDGPU::SI_SPILL_S256_RESTORE;
1791 case 36:
1792 return AMDGPU::SI_SPILL_S288_RESTORE;
1793 case 40:
1794 return AMDGPU::SI_SPILL_S320_RESTORE;
1795 case 44:
1796 return AMDGPU::SI_SPILL_S352_RESTORE;
1797 case 48:
1798 return AMDGPU::SI_SPILL_S384_RESTORE;
1799 case 64:
1800 return AMDGPU::SI_SPILL_S512_RESTORE;
1801 case 128:
1802 return AMDGPU::SI_SPILL_S1024_RESTORE;
1803 default:
1804 llvm_unreachable("unknown register size");
1805 }
1806}
1807
1808static unsigned getVGPRSpillRestoreOpcode(unsigned Size) {
1809 switch (Size) {
1810 case 2:
1811 return AMDGPU::SI_SPILL_V16_RESTORE;
1812 case 4:
1813 return AMDGPU::SI_SPILL_V32_RESTORE;
1814 case 8:
1815 return AMDGPU::SI_SPILL_V64_RESTORE;
1816 case 12:
1817 return AMDGPU::SI_SPILL_V96_RESTORE;
1818 case 16:
1819 return AMDGPU::SI_SPILL_V128_RESTORE;
1820 case 20:
1821 return AMDGPU::SI_SPILL_V160_RESTORE;
1822 case 24:
1823 return AMDGPU::SI_SPILL_V192_RESTORE;
1824 case 28:
1825 return AMDGPU::SI_SPILL_V224_RESTORE;
1826 case 32:
1827 return AMDGPU::SI_SPILL_V256_RESTORE;
1828 case 36:
1829 return AMDGPU::SI_SPILL_V288_RESTORE;
1830 case 40:
1831 return AMDGPU::SI_SPILL_V320_RESTORE;
1832 case 44:
1833 return AMDGPU::SI_SPILL_V352_RESTORE;
1834 case 48:
1835 return AMDGPU::SI_SPILL_V384_RESTORE;
1836 case 64:
1837 return AMDGPU::SI_SPILL_V512_RESTORE;
1838 case 128:
1839 return AMDGPU::SI_SPILL_V1024_RESTORE;
1840 default:
1841 llvm_unreachable("unknown register size");
1842 }
1843}
1844
1845static unsigned getAVSpillRestoreOpcode(unsigned Size) {
1846 switch (Size) {
1847 case 4:
1848 return AMDGPU::SI_SPILL_AV32_RESTORE;
1849 case 8:
1850 return AMDGPU::SI_SPILL_AV64_RESTORE;
1851 case 12:
1852 return AMDGPU::SI_SPILL_AV96_RESTORE;
1853 case 16:
1854 return AMDGPU::SI_SPILL_AV128_RESTORE;
1855 case 20:
1856 return AMDGPU::SI_SPILL_AV160_RESTORE;
1857 case 24:
1858 return AMDGPU::SI_SPILL_AV192_RESTORE;
1859 case 28:
1860 return AMDGPU::SI_SPILL_AV224_RESTORE;
1861 case 32:
1862 return AMDGPU::SI_SPILL_AV256_RESTORE;
1863 case 36:
1864 return AMDGPU::SI_SPILL_AV288_RESTORE;
1865 case 40:
1866 return AMDGPU::SI_SPILL_AV320_RESTORE;
1867 case 44:
1868 return AMDGPU::SI_SPILL_AV352_RESTORE;
1869 case 48:
1870 return AMDGPU::SI_SPILL_AV384_RESTORE;
1871 case 64:
1872 return AMDGPU::SI_SPILL_AV512_RESTORE;
1873 case 128:
1874 return AMDGPU::SI_SPILL_AV1024_RESTORE;
1875 default:
1876 llvm_unreachable("unknown register size");
1877 }
1878}
1879
1880static unsigned getWWMRegSpillRestoreOpcode(unsigned Size,
1881 bool IsVectorSuperClass) {
1882 // Currently, only 32-bit WWM register spills are needed.
1883 if (Size != 4)
1884 llvm_unreachable("unknown wwm register spill size");
1885
1886 if (IsVectorSuperClass) // TODO: Always use this if there are AGPRs
1887 return AMDGPU::SI_SPILL_WWM_AV32_RESTORE;
1888
1889 return AMDGPU::SI_SPILL_WWM_V32_RESTORE;
1890}
1891
1892unsigned SIInstrInfo::getVectorRegSpillRestoreOpcode(
1893 Register Reg, const TargetRegisterClass *RC, unsigned Size,
1894 const SIMachineFunctionInfo &MFI) const {
1895 bool IsVectorSuperClass = RI.isVectorSuperClass(RC);
1896
1897 // Choose the right opcode if restoring a WWM register.
1898 if (MFI.checkFlag(Reg, Flag: AMDGPU::VirtRegFlag::WWM_REG))
1899 return getWWMRegSpillRestoreOpcode(Size, IsVectorSuperClass);
1900
1901 // TODO: Check if AGPRs are available
1902 if (ST.hasMAIInsts())
1903 return getAVSpillRestoreOpcode(Size);
1904
1905 assert(!RI.isAGPRClass(RC));
1906 return getVGPRSpillRestoreOpcode(Size);
1907}
1908
1909void SIInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB,
1910 MachineBasicBlock::iterator MI,
1911 Register DestReg, int FrameIndex,
1912 const TargetRegisterClass *RC,
1913 Register VReg, unsigned SubReg,
1914 MachineInstr::MIFlag Flags) const {
1915 MachineFunction *MF = MBB.getParent();
1916 SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
1917 MachineFrameInfo &FrameInfo = MF->getFrameInfo();
1918 const DebugLoc &DL = MBB.findDebugLoc(MBBI: MI);
1919 unsigned SpillSize = RI.getSpillSize(RC: *RC);
1920
1921 MachinePointerInfo PtrInfo
1922 = MachinePointerInfo::getFixedStack(MF&: *MF, FI: FrameIndex);
1923
1924 MachineMemOperand *MMO = MF->getMachineMemOperand(
1925 PtrInfo, F: MachineMemOperand::MOLoad, Size: FrameInfo.getObjectSize(ObjectIdx: FrameIndex),
1926 BaseAlignment: FrameInfo.getObjectAlign(ObjectIdx: FrameIndex));
1927
1928 if (RI.isSGPRClass(RC)) {
1929 MFI->setHasSpilledSGPRs();
1930 assert(DestReg != AMDGPU::M0 && "m0 should not be reloaded into");
1931 assert(DestReg != AMDGPU::EXEC_LO && DestReg != AMDGPU::EXEC_HI &&
1932 DestReg != AMDGPU::EXEC && "exec should not be spilled");
1933
1934 // FIXME: Maybe this should not include a memoperand because it will be
1935 // lowered to non-memory instructions.
1936 const MCInstrDesc &OpDesc = get(Opcode: getSGPRSpillRestoreOpcode(Size: SpillSize));
1937 if (DestReg.isVirtual() && SpillSize == 4) {
1938 MachineRegisterInfo &MRI = MF->getRegInfo();
1939 MRI.constrainRegClass(Reg: DestReg, RC: &AMDGPU::SReg_32_XM0_XEXECRegClass);
1940 }
1941
1942 if (RI.spillSGPRToVGPR())
1943 FrameInfo.setStackID(ObjectIdx: FrameIndex, ID: TargetStackID::SGPRSpill);
1944 BuildMI(BB&: MBB, I: MI, MIMD: DL, MCID: OpDesc, DestReg)
1945 .addFrameIndex(Idx: FrameIndex) // addr
1946 .addMemOperand(MMO)
1947 .addReg(RegNo: MFI->getStackPtrOffsetReg(), Flags: RegState::Implicit);
1948
1949 return;
1950 }
1951
1952 unsigned Opcode = getVectorRegSpillRestoreOpcode(Reg: VReg ? VReg : DestReg, RC,
1953 Size: SpillSize, MFI: *MFI);
1954 BuildMI(BB&: MBB, I: MI, MIMD: DL, MCID: get(Opcode), DestReg)
1955 .addFrameIndex(Idx: FrameIndex) // vaddr
1956 .addReg(RegNo: MFI->getStackPtrOffsetReg()) // scratch_offset
1957 .addImm(Val: 0) // offset
1958 .addMemOperand(MMO);
1959}
1960
1961void SIInstrInfo::insertNoop(MachineBasicBlock &MBB,
1962 MachineBasicBlock::iterator MI) const {
1963 insertNoops(MBB, MI, Quantity: 1);
1964}
1965
1966void SIInstrInfo::insertNoops(MachineBasicBlock &MBB,
1967 MachineBasicBlock::iterator MI,
1968 unsigned Quantity) const {
1969 DebugLoc DL = MBB.findDebugLoc(MBBI: MI);
1970 unsigned MaxSNopCount = 1u << ST.getSNopBits();
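 // The S_NOP immediate encodes (count - 1) wait states, so a single S_NOP
 // covers at most 2^SNopBits nops; emit maximal S_NOPs until the requested
 // quantity is exhausted.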
1971 while (Quantity > 0) {
1972 unsigned Arg = std::min(a: Quantity, b: MaxSNopCount);
1973 Quantity -= Arg;
1974 BuildMI(BB&: MBB, I: MI, MIMD: DL, MCID: get(Opcode: AMDGPU::S_NOP)).addImm(Val: Arg - 1);
1975 }
1976}
1977
1978void SIInstrInfo::insertReturn(MachineBasicBlock &MBB) const {
1979 auto *MF = MBB.getParent();
1980 SIMachineFunctionInfo *Info = MF->getInfo<SIMachineFunctionInfo>();
1981
1982 assert(Info->isEntryFunction());
1983
1984 if (MBB.succ_empty()) {
1985 bool HasNoTerminator = MBB.getFirstTerminator() == MBB.end();
1986 if (HasNoTerminator) {
1987 if (Info->returnsVoid()) {
1988 BuildMI(BB&: MBB, I: MBB.end(), MIMD: DebugLoc(), MCID: get(Opcode: AMDGPU::S_ENDPGM)).addImm(Val: 0);
1989 } else {
1990 BuildMI(BB&: MBB, I: MBB.end(), MIMD: DebugLoc(), MCID: get(Opcode: AMDGPU::SI_RETURN_TO_EPILOG));
1991 }
1992 }
1993 }
1994}
1995
1996MachineBasicBlock *SIInstrInfo::insertSimulatedTrap(MachineRegisterInfo &MRI,
1997 MachineBasicBlock &MBB,
1998 MachineInstr &MI,
1999 const DebugLoc &DL) const {
2000 MachineFunction *MF = MBB.getParent();
2001 constexpr unsigned DoorbellIDMask = 0x3ff;
2002 constexpr unsigned ECQueueWaveAbort = 0x400;
2003
2004 MachineBasicBlock *TrapBB = &MBB;
2005 MachineBasicBlock *HaltLoopBB = MF->CreateMachineBasicBlock();
2006
2007 if (!MBB.succ_empty() || std::next(x: MI.getIterator()) != MBB.end()) {
2008 MBB.splitAt(SplitInst&: MI, /*UpdateLiveIns=*/false);
2009 TrapBB = MF->CreateMachineBasicBlock();
2010 BuildMI(BB&: MBB, I&: MI, MIMD: DL, MCID: get(Opcode: AMDGPU::S_CBRANCH_EXECNZ)).addMBB(MBB: TrapBB);
2011 MF->push_back(MBB: TrapBB);
2012 MBB.addSuccessor(Succ: TrapBB);
2013 }
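 // The trap block tells the queue that this wave aborted: read the doorbell
 // ID, stash M0 in TTMP2, write (doorbell | ECQueueWaveAbort) to M0, raise an
 // interrupt, restore M0, and then spin in a halt loop.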
2014 // Start with an `s_trap 2`; if we're in PRIV=1 and we need the workaround,
2015 // this will be a nop.
2016 BuildMI(BB&: *TrapBB, I: TrapBB->end(), MIMD: DL, MCID: get(Opcode: AMDGPU::S_TRAP))
2017 .addImm(Val: static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap));
2018 Register DoorbellReg = MRI.createVirtualRegister(RegClass: &AMDGPU::SReg_32RegClass);
2019 BuildMI(BB&: *TrapBB, I: TrapBB->end(), MIMD: DL, MCID: get(Opcode: AMDGPU::S_SENDMSG_RTN_B32),
2020 DestReg: DoorbellReg)
2021 .addImm(Val: AMDGPU::SendMsg::ID_RTN_GET_DOORBELL);
2022 BuildMI(BB&: *TrapBB, I: TrapBB->end(), MIMD: DL, MCID: get(Opcode: AMDGPU::S_MOV_B32), DestReg: AMDGPU::TTMP2)
2023 .addUse(RegNo: AMDGPU::M0);
2024 Register DoorbellRegMasked =
2025 MRI.createVirtualRegister(RegClass: &AMDGPU::SReg_32RegClass);
2026 BuildMI(BB&: *TrapBB, I: TrapBB->end(), MIMD: DL, MCID: get(Opcode: AMDGPU::S_AND_B32), DestReg: DoorbellRegMasked)
2027 .addUse(RegNo: DoorbellReg)
2028 .addImm(Val: DoorbellIDMask);
2029 Register SetWaveAbortBit =
2030 MRI.createVirtualRegister(RegClass: &AMDGPU::SReg_32RegClass);
2031 BuildMI(BB&: *TrapBB, I: TrapBB->end(), MIMD: DL, MCID: get(Opcode: AMDGPU::S_OR_B32), DestReg: SetWaveAbortBit)
2032 .addUse(RegNo: DoorbellRegMasked)
2033 .addImm(Val: ECQueueWaveAbort);
2034 BuildMI(BB&: *TrapBB, I: TrapBB->end(), MIMD: DL, MCID: get(Opcode: AMDGPU::S_MOV_B32), DestReg: AMDGPU::M0)
2035 .addUse(RegNo: SetWaveAbortBit);
2036 BuildMI(BB&: *TrapBB, I: TrapBB->end(), MIMD: DL, MCID: get(Opcode: AMDGPU::S_SENDMSG))
2037 .addImm(Val: AMDGPU::SendMsg::ID_INTERRUPT);
2038 BuildMI(BB&: *TrapBB, I: TrapBB->end(), MIMD: DL, MCID: get(Opcode: AMDGPU::S_MOV_B32), DestReg: AMDGPU::M0)
2039 .addUse(RegNo: AMDGPU::TTMP2);
2040 BuildMI(BB&: *TrapBB, I: TrapBB->end(), MIMD: DL, MCID: get(Opcode: AMDGPU::S_BRANCH)).addMBB(MBB: HaltLoopBB);
2041 TrapBB->addSuccessor(Succ: HaltLoopBB);
2042
2043 BuildMI(BB&: *HaltLoopBB, I: HaltLoopBB->end(), MIMD: DL, MCID: get(Opcode: AMDGPU::S_SETHALT)).addImm(Val: 5);
2044 BuildMI(BB&: *HaltLoopBB, I: HaltLoopBB->end(), MIMD: DL, MCID: get(Opcode: AMDGPU::S_BRANCH))
2045 .addMBB(MBB: HaltLoopBB);
2046 MF->push_back(MBB: HaltLoopBB);
2047 HaltLoopBB->addSuccessor(Succ: HaltLoopBB);
2048
2049 return MBB.getNextNode();
2050}
2051
2052unsigned SIInstrInfo::getNumWaitStates(const MachineInstr &MI) {
2053 switch (MI.getOpcode()) {
2054 default:
2055 if (MI.isMetaInstruction())
2056 return 0;
2057 return 1; // FIXME: Do wait states equal cycles?
2058
2059 case AMDGPU::S_NOP:
2060 return MI.getOperand(i: 0).getImm() + 1;
2061 // SI_RETURN_TO_EPILOG is a fallthrough to code outside of the function. The
2062 // hazard, even if one exists, won't really be visible. Should we handle it?
2063 }
2064}
2065
2066bool SIInstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
2067 MachineBasicBlock &MBB = *MI.getParent();
2068 DebugLoc DL = MBB.findDebugLoc(MBBI: MI);
2069 const AMDGPU::LaneMaskConstants &LMC = AMDGPU::LaneMaskConstants::get(ST);
2070 switch (MI.getOpcode()) {
2071 default: return TargetInstrInfo::expandPostRAPseudo(MI);
2072 case AMDGPU::S_MOV_B64_term:
2073 // This is only a terminator to get the correct spill code placement during
2074 // register allocation.
2075 MI.setDesc(get(Opcode: AMDGPU::S_MOV_B64));
2076 break;
2077
2078 case AMDGPU::S_MOV_B32_term:
2079 // This is only a terminator to get the correct spill code placement during
2080 // register allocation.
2081 MI.setDesc(get(Opcode: AMDGPU::S_MOV_B32));
2082 break;
2083
2084 case AMDGPU::S_XOR_B64_term:
2085 // This is only a terminator to get the correct spill code placement during
2086 // register allocation.
2087 MI.setDesc(get(Opcode: AMDGPU::S_XOR_B64));
2088 break;
2089
2090 case AMDGPU::S_XOR_B32_term:
2091 // This is only a terminator to get the correct spill code placement during
2092 // register allocation.
2093 MI.setDesc(get(Opcode: AMDGPU::S_XOR_B32));
2094 break;
2095 case AMDGPU::S_OR_B64_term:
2096 // This is only a terminator to get the correct spill code placement during
2097 // register allocation.
2098 MI.setDesc(get(Opcode: AMDGPU::S_OR_B64));
2099 break;
2100 case AMDGPU::S_OR_B32_term:
2101 // This is only a terminator to get the correct spill code placement during
2102 // register allocation.
2103 MI.setDesc(get(Opcode: AMDGPU::S_OR_B32));
2104 break;
2105
2106 case AMDGPU::S_ANDN2_B64_term:
2107 // This is only a terminator to get the correct spill code placement during
2108 // register allocation.
2109 MI.setDesc(get(Opcode: AMDGPU::S_ANDN2_B64));
2110 break;
2111
2112 case AMDGPU::S_ANDN2_B32_term:
2113 // This is only a terminator to get the correct spill code placement during
2114 // register allocation.
2115 MI.setDesc(get(Opcode: AMDGPU::S_ANDN2_B32));
2116 break;
2117
2118 case AMDGPU::S_AND_B64_term:
2119 // This is only a terminator to get the correct spill code placement during
2120 // register allocation.
2121 MI.setDesc(get(Opcode: AMDGPU::S_AND_B64));
2122 break;
2123
2124 case AMDGPU::S_AND_B32_term:
2125 // This is only a terminator to get the correct spill code placement during
2126 // register allocation.
2127 MI.setDesc(get(Opcode: AMDGPU::S_AND_B32));
2128 break;
2129
2130 case AMDGPU::S_AND_SAVEEXEC_B64_term:
2131 // This is only a terminator to get the correct spill code placement during
2132 // register allocation.
2133 MI.setDesc(get(Opcode: AMDGPU::S_AND_SAVEEXEC_B64));
2134 break;
2135
2136 case AMDGPU::S_AND_SAVEEXEC_B32_term:
2137 // This is only a terminator to get the correct spill code placement during
2138 // register allocation.
2139 MI.setDesc(get(Opcode: AMDGPU::S_AND_SAVEEXEC_B32));
2140 break;
2141
2142 case AMDGPU::SI_SPILL_S32_TO_VGPR:
2143 MI.setDesc(get(Opcode: AMDGPU::V_WRITELANE_B32));
2144 break;
2145
2146 case AMDGPU::SI_RESTORE_S32_FROM_VGPR:
2147 MI.setDesc(get(Opcode: AMDGPU::V_READLANE_B32));
2148 break;
2149 case AMDGPU::AV_MOV_B32_IMM_PSEUDO: {
2150 Register Dst = MI.getOperand(i: 0).getReg();
2151 bool IsAGPR = SIRegisterInfo::isAGPRClass(RC: RI.getPhysRegBaseClass(Reg: Dst));
2152 MI.setDesc(
2153 get(Opcode: IsAGPR ? AMDGPU::V_ACCVGPR_WRITE_B32_e64 : AMDGPU::V_MOV_B32_e32));
2154 break;
2155 }
2156 case AMDGPU::AV_MOV_B64_IMM_PSEUDO: {
2157 Register Dst = MI.getOperand(i: 0).getReg();
2158 if (SIRegisterInfo::isAGPRClass(RC: RI.getPhysRegBaseClass(Reg: Dst))) {
2159 int64_t Imm = MI.getOperand(i: 1).getImm();
2160
2161 Register DstLo = RI.getSubReg(Reg: Dst, Idx: AMDGPU::sub0);
2162 Register DstHi = RI.getSubReg(Reg: Dst, Idx: AMDGPU::sub1);
2163 BuildMI(BB&: MBB, I&: MI, MIMD: DL, MCID: get(Opcode: AMDGPU::V_ACCVGPR_WRITE_B32_e64), DestReg: DstLo)
2164 .addImm(Val: SignExtend64<32>(x: Imm))
2165 .addReg(RegNo: Dst, Flags: RegState::Implicit | RegState::Define);
2166 BuildMI(BB&: MBB, I&: MI, MIMD: DL, MCID: get(Opcode: AMDGPU::V_ACCVGPR_WRITE_B32_e64), DestReg: DstHi)
2167 .addImm(Val: SignExtend64<32>(x: Imm >> 32))
2168 .addReg(RegNo: Dst, Flags: RegState::Implicit | RegState::Define);
2169 MI.eraseFromParent();
2170 break;
2171 }
2172
2173 [[fallthrough]];
2174 }
2175 case AMDGPU::V_MOV_B64_PSEUDO: {
2176 Register Dst = MI.getOperand(i: 0).getReg();
2177 Register DstLo = RI.getSubReg(Reg: Dst, Idx: AMDGPU::sub0);
2178 Register DstHi = RI.getSubReg(Reg: Dst, Idx: AMDGPU::sub1);
2179
2180 const MCInstrDesc &Mov64Desc = get(Opcode: AMDGPU::V_MOV_B64_e32);
2181 const TargetRegisterClass *Mov64RC = getRegClass(MCID: Mov64Desc, /*OpNum=*/0);
2182
2183 const MachineOperand &SrcOp = MI.getOperand(i: 1);
2184 // FIXME: Will this work for 64-bit floating point immediates?
2185 assert(!SrcOp.isFPImm());
2186 if (ST.hasMovB64() && Mov64RC->contains(Reg: Dst)) {
2187 MI.setDesc(Mov64Desc);
2188 if (SrcOp.isReg() || isInlineConstant(MI, OpIdx: 1) ||
2189 isUInt<32>(x: SrcOp.getImm()) || ST.has64BitLiterals())
2190 break;
2191 }
2192 if (SrcOp.isImm()) {
2193 APInt Imm(64, SrcOp.getImm());
2194 APInt Lo(32, Imm.getLoBits(numBits: 32).getZExtValue());
2195 APInt Hi(32, Imm.getHiBits(numBits: 32).getZExtValue());
2196 const MCInstrDesc &PkMovDesc = get(Opcode: AMDGPU::V_PK_MOV_B32);
2197 const TargetRegisterClass *PkMovRC = getRegClass(MCID: PkMovDesc, /*OpNum=*/0);
2198
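 // If both halves are equal and fit in an inline constant, materialize the
 // 64-bit value with a single packed move instead of two 32-bit moves.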
2199 if (ST.hasPkMovB32() && Lo == Hi && isInlineConstant(Imm: Lo) &&
2200 PkMovRC->contains(Reg: Dst)) {
2201 BuildMI(BB&: MBB, I&: MI, MIMD: DL, MCID: PkMovDesc, DestReg: Dst)
2202 .addImm(Val: SISrcMods::OP_SEL_1)
2203 .addImm(Val: Lo.getSExtValue())
2204 .addImm(Val: SISrcMods::OP_SEL_1)
2205 .addImm(Val: Lo.getSExtValue())
2206 .addImm(Val: 0) // op_sel_lo
2207 .addImm(Val: 0) // op_sel_hi
2208 .addImm(Val: 0) // neg_lo
2209 .addImm(Val: 0) // neg_hi
2210 .addImm(Val: 0); // clamp
2211 } else {
2212 BuildMI(BB&: MBB, I&: MI, MIMD: DL, MCID: get(Opcode: AMDGPU::V_MOV_B32_e32), DestReg: DstLo)
2213 .addImm(Val: Lo.getSExtValue())
2214 .addReg(RegNo: Dst, Flags: RegState::Implicit | RegState::Define);
2215 BuildMI(BB&: MBB, I&: MI, MIMD: DL, MCID: get(Opcode: AMDGPU::V_MOV_B32_e32), DestReg: DstHi)
2216 .addImm(Val: Hi.getSExtValue())
2217 .addReg(RegNo: Dst, Flags: RegState::Implicit | RegState::Define);
2218 }
2219 } else {
2220 assert(SrcOp.isReg());
2221 if (ST.hasPkMovB32() &&
2222 !RI.isAGPR(MRI: MBB.getParent()->getRegInfo(), Reg: SrcOp.getReg())) {
2223 BuildMI(BB&: MBB, I&: MI, MIMD: DL, MCID: get(Opcode: AMDGPU::V_PK_MOV_B32), DestReg: Dst)
2224 .addImm(Val: SISrcMods::OP_SEL_1) // src0_mod
2225 .addReg(RegNo: SrcOp.getReg())
2226 .addImm(Val: SISrcMods::OP_SEL_0 | SISrcMods::OP_SEL_1) // src1_mod
2227 .addReg(RegNo: SrcOp.getReg())
2228 .addImm(Val: 0) // op_sel_lo
2229 .addImm(Val: 0) // op_sel_hi
2230 .addImm(Val: 0) // neg_lo
2231 .addImm(Val: 0) // neg_hi
2232 .addImm(Val: 0); // clamp
2233 } else {
2234 BuildMI(BB&: MBB, I&: MI, MIMD: DL, MCID: get(Opcode: AMDGPU::V_MOV_B32_e32), DestReg: DstLo)
2235 .addReg(RegNo: RI.getSubReg(Reg: SrcOp.getReg(), Idx: AMDGPU::sub0))
2236 .addReg(RegNo: Dst, Flags: RegState::Implicit | RegState::Define);
2237 BuildMI(BB&: MBB, I&: MI, MIMD: DL, MCID: get(Opcode: AMDGPU::V_MOV_B32_e32), DestReg: DstHi)
2238 .addReg(RegNo: RI.getSubReg(Reg: SrcOp.getReg(), Idx: AMDGPU::sub1))
2239 .addReg(RegNo: Dst, Flags: RegState::Implicit | RegState::Define);
2240 }
2241 }
2242 MI.eraseFromParent();
2243 break;
2244 }
2245 case AMDGPU::V_MOV_B64_DPP_PSEUDO: {
2246 expandMovDPP64(MI);
2247 break;
2248 }
2249 case AMDGPU::S_MOV_B64_IMM_PSEUDO: {
2250 const MachineOperand &SrcOp = MI.getOperand(i: 1);
2251 assert(!SrcOp.isFPImm());
2252
2253 if (ST.has64BitLiterals()) {
2254 MI.setDesc(get(Opcode: AMDGPU::S_MOV_B64));
2255 break;
2256 }
2257
2258 APInt Imm(64, SrcOp.getImm());
2259 if (Imm.isIntN(N: 32) || isInlineConstant(Imm)) {
2260 MI.setDesc(get(Opcode: AMDGPU::S_MOV_B64));
2261 break;
2262 }
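 // Neither a 32-bit value nor an inline constant: split the 64-bit literal
 // into two S_MOV_B32s of the low and high halves.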
2263
2264 Register Dst = MI.getOperand(i: 0).getReg();
2265 Register DstLo = RI.getSubReg(Reg: Dst, Idx: AMDGPU::sub0);
2266 Register DstHi = RI.getSubReg(Reg: Dst, Idx: AMDGPU::sub1);
2267
2268 APInt Lo(32, Imm.getLoBits(numBits: 32).getZExtValue());
2269 APInt Hi(32, Imm.getHiBits(numBits: 32).getZExtValue());
2270 BuildMI(BB&: MBB, I&: MI, MIMD: DL, MCID: get(Opcode: AMDGPU::S_MOV_B32), DestReg: DstLo)
2271 .addImm(Val: Lo.getSExtValue())
2272 .addReg(RegNo: Dst, Flags: RegState::Implicit | RegState::Define);
2273 BuildMI(BB&: MBB, I&: MI, MIMD: DL, MCID: get(Opcode: AMDGPU::S_MOV_B32), DestReg: DstHi)
2274 .addImm(Val: Hi.getSExtValue())
2275 .addReg(RegNo: Dst, Flags: RegState::Implicit | RegState::Define);
2276 MI.eraseFromParent();
2277 break;
2278 }
2279 case AMDGPU::V_SET_INACTIVE_B32: {
2280 // Lower V_SET_INACTIVE_B32 to V_CNDMASK_B32.
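 // V_CNDMASK selects src1 where the mask bit is set and src0 where it is
 // clear; operands (3,4) of the pseudo become src0 and (1,2) become src1,
 // with the mask taken from operand 5.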
2281 Register DstReg = MI.getOperand(i: 0).getReg();
2282 BuildMI(BB&: MBB, I&: MI, MIMD: DL, MCID: get(Opcode: AMDGPU::V_CNDMASK_B32_e64), DestReg: DstReg)
2283 .add(MO: MI.getOperand(i: 3))
2284 .add(MO: MI.getOperand(i: 4))
2285 .add(MO: MI.getOperand(i: 1))
2286 .add(MO: MI.getOperand(i: 2))
2287 .add(MO: MI.getOperand(i: 5));
2288 MI.eraseFromParent();
2289 break;
2290 }
2291 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V1:
2292 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V2:
2293 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V3:
2294 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V4:
2295 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V5:
2296 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V6:
2297 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V7:
2298 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V8:
2299 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V9:
2300 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V10:
2301 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V11:
2302 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V12:
2303 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V16:
2304 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V32:
2305 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V1:
2306 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V2:
2307 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V3:
2308 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V4:
2309 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V5:
2310 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V6:
2311 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V7:
2312 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V8:
2313 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V9:
2314 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V10:
2315 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V11:
2316 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V12:
2317 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V16:
2318 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V32:
2319 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V1:
2320 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V2:
2321 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V4:
2322 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V8:
2323 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V16: {
2324 const TargetRegisterClass *EltRC = getOpRegClass(MI, OpNo: 2);
2325
2326 unsigned Opc;
2327 if (RI.hasVGPRs(RC: EltRC)) {
2328 Opc = AMDGPU::V_MOVRELD_B32_e32;
2329 } else {
2330 Opc = RI.getRegSizeInBits(RC: *EltRC) == 64 ? AMDGPU::S_MOVRELD_B64
2331 : AMDGPU::S_MOVRELD_B32;
2332 }
2333
2334 const MCInstrDesc &OpDesc = get(Opcode: Opc);
2335 Register VecReg = MI.getOperand(i: 0).getReg();
2336 bool IsUndef = MI.getOperand(i: 1).isUndef();
2337 unsigned SubReg = MI.getOperand(i: 3).getImm();
2338 assert(VecReg == MI.getOperand(1).getReg());
2339
2340 MachineInstrBuilder MIB =
2341 BuildMI(BB&: MBB, I&: MI, MIMD: DL, MCID: OpDesc)
2342 .addReg(RegNo: RI.getSubReg(Reg: VecReg, Idx: SubReg), Flags: RegState::Undef)
2343 .add(MO: MI.getOperand(i: 2))
2344 .addReg(RegNo: VecReg, Flags: RegState::ImplicitDefine)
2345 .addReg(RegNo: VecReg, Flags: RegState::Implicit | getUndefRegState(B: IsUndef));
2346
2347 const int ImpDefIdx =
2348 OpDesc.getNumOperands() + OpDesc.implicit_uses().size();
2349 const int ImpUseIdx = ImpDefIdx + 1;
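 // Tie the implicit vector def to the implicit use so the untouched lanes of
 // the vector are treated as live through this partial write.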
2350 MIB->tieOperands(DefIdx: ImpDefIdx, UseIdx: ImpUseIdx);
2351 MI.eraseFromParent();
2352 break;
2353 }
2354 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V1:
2355 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V2:
2356 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V3:
2357 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V4:
2358 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V5:
2359 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V6:
2360 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V7:
2361 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V8:
2362 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V9:
2363 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V10:
2364 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V11:
2365 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V12:
2366 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V16:
2367 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V32: {
2368 assert(ST.useVGPRIndexMode());
2369 Register VecReg = MI.getOperand(i: 0).getReg();
2370 bool IsUndef = MI.getOperand(i: 1).isUndef();
2371 MachineOperand &Idx = MI.getOperand(i: 3);
2372 Register SubReg = MI.getOperand(i: 4).getImm();
2373
2374 MachineInstr *SetOn = BuildMI(BB&: MBB, I&: MI, MIMD: DL, MCID: get(Opcode: AMDGPU::S_SET_GPR_IDX_ON))
2375 .add(MO: Idx)
2376 .addImm(Val: AMDGPU::VGPRIndexMode::DST_ENABLE);
2377 SetOn->getOperand(i: 3).setIsUndef();
2378
2379 const MCInstrDesc &OpDesc = get(Opcode: AMDGPU::V_MOV_B32_indirect_write);
2380 MachineInstrBuilder MIB =
2381 BuildMI(BB&: MBB, I&: MI, MIMD: DL, MCID: OpDesc)
2382 .addReg(RegNo: RI.getSubReg(Reg: VecReg, Idx: SubReg), Flags: RegState::Undef)
2383 .add(MO: MI.getOperand(i: 2))
2384 .addReg(RegNo: VecReg, Flags: RegState::ImplicitDefine)
2385 .addReg(RegNo: VecReg, Flags: RegState::Implicit | getUndefRegState(B: IsUndef));
2386
2387 const int ImpDefIdx =
2388 OpDesc.getNumOperands() + OpDesc.implicit_uses().size();
2389 const int ImpUseIdx = ImpDefIdx + 1;
2390 MIB->tieOperands(DefIdx: ImpDefIdx, UseIdx: ImpUseIdx);
2391
2392 MachineInstr *SetOff = BuildMI(BB&: MBB, I&: MI, MIMD: DL, MCID: get(Opcode: AMDGPU::S_SET_GPR_IDX_OFF));
2393
2394 finalizeBundle(MBB, FirstMI: SetOn->getIterator(), LastMI: std::next(x: SetOff->getIterator()));
2395
2396 MI.eraseFromParent();
2397 break;
2398 }
2399 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V1:
2400 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V2:
2401 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V3:
2402 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V4:
2403 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V5:
2404 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V6:
2405 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V7:
2406 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V8:
2407 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V9:
2408 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V10:
2409 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V11:
2410 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V12:
2411 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V16:
2412 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V32: {
2413 assert(ST.useVGPRIndexMode());
2414 Register Dst = MI.getOperand(i: 0).getReg();
2415 Register VecReg = MI.getOperand(i: 1).getReg();
2416 bool IsUndef = MI.getOperand(i: 1).isUndef();
2417 Register SubReg = MI.getOperand(i: 3).getImm();
2418
2419 MachineInstr *SetOn = BuildMI(BB&: MBB, I&: MI, MIMD: DL, MCID: get(Opcode: AMDGPU::S_SET_GPR_IDX_ON))
2420 .add(MO: MI.getOperand(i: 2))
2421 .addImm(Val: AMDGPU::VGPRIndexMode::SRC0_ENABLE);
2422 SetOn->getOperand(i: 3).setIsUndef();
2423
2424 BuildMI(BB&: MBB, I&: MI, MIMD: DL, MCID: get(Opcode: AMDGPU::V_MOV_B32_indirect_read))
2425 .addDef(RegNo: Dst)
2426 .addReg(RegNo: RI.getSubReg(Reg: VecReg, Idx: SubReg), Flags: RegState::Undef)
2427 .addReg(RegNo: VecReg, Flags: RegState::Implicit | getUndefRegState(B: IsUndef));
2428
2429 MachineInstr *SetOff = BuildMI(BB&: MBB, I&: MI, MIMD: DL, MCID: get(Opcode: AMDGPU::S_SET_GPR_IDX_OFF));
2430
2431 finalizeBundle(MBB, FirstMI: SetOn->getIterator(), LastMI: std::next(x: SetOff->getIterator()));
2432
2433 MI.eraseFromParent();
2434 break;
2435 }
2436 case AMDGPU::SI_PC_ADD_REL_OFFSET: {
2437 MachineFunction &MF = *MBB.getParent();
2438 Register Reg = MI.getOperand(i: 0).getReg();
2439 Register RegLo = RI.getSubReg(Reg, Idx: AMDGPU::sub0);
2440 Register RegHi = RI.getSubReg(Reg, Idx: AMDGPU::sub1);
2441 MachineOperand OpLo = MI.getOperand(i: 1);
2442 MachineOperand OpHi = MI.getOperand(i: 2);
2443
2444 // Create a bundle so these instructions won't be re-ordered by the
2445 // post-RA scheduler.
2446 MIBundleBuilder Bundler(MBB, MI);
2447 Bundler.append(MI: BuildMI(MF, MIMD: DL, MCID: get(Opcode: AMDGPU::S_GETPC_B64), DestReg: Reg));
2448
2449 // What we want here is an offset from the value returned by s_getpc (which
2450 // is the address of the s_add_u32 instruction) to the global variable, but
2451 // since the encoding of $symbol starts 4 bytes after the start of the
2452 // s_add_u32 instruction, we end up with an offset that is 4 bytes too
2453 // small. This requires us to add 4 to the global variable offset in order
2454 // to compute the correct address. Similarly for the s_addc_u32 instruction,
2455 // the encoding of $symbol starts 12 bytes after the start of the s_add_u32
2456 // instruction.
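 // Illustrative shape of the emitted bundle (register numbers arbitrary):
 //   s_getpc_b64  s[0:1]
 //   s_add_u32    s0, s0, $symbol@rel32@lo+4
 //   s_addc_u32   s1, s1, $symbol@rel32@hi+12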
2457
2458 int64_t Adjust = 0;
2459 if (ST.hasGetPCZeroExtension()) {
2460 // Fix up hardware that does not sign-extend the 48-bit PC value by
2461 // inserting: s_sext_i32_i16 reghi, reghi
2462 Bundler.append(
2463 MI: BuildMI(MF, MIMD: DL, MCID: get(Opcode: AMDGPU::S_SEXT_I32_I16), DestReg: RegHi).addReg(RegNo: RegHi));
2464 Adjust += 4;
2465 }
2466
2467 if (OpLo.isGlobal())
2468 OpLo.setOffset(OpLo.getOffset() + Adjust + 4);
2469 Bundler.append(
2470 MI: BuildMI(MF, MIMD: DL, MCID: get(Opcode: AMDGPU::S_ADD_U32), DestReg: RegLo).addReg(RegNo: RegLo).add(MO: OpLo));
2471
2472 if (OpHi.isGlobal())
2473 OpHi.setOffset(OpHi.getOffset() + Adjust + 12);
2474 Bundler.append(MI: BuildMI(MF, MIMD: DL, MCID: get(Opcode: AMDGPU::S_ADDC_U32), DestReg: RegHi)
2475 .addReg(RegNo: RegHi)
2476 .add(MO: OpHi));
2477
2478 finalizeBundle(MBB, FirstMI: Bundler.begin());
2479
2480 MI.eraseFromParent();
2481 break;
2482 }
2483 case AMDGPU::SI_PC_ADD_REL_OFFSET64: {
2484 MachineFunction &MF = *MBB.getParent();
2485 Register Reg = MI.getOperand(i: 0).getReg();
2486 MachineOperand Op = MI.getOperand(i: 1);
2487
2488 // Create a bundle so these instructions won't be re-ordered by the
2489 // post-RA scheduler.
2490 MIBundleBuilder Bundler(MBB, MI);
2491 Bundler.append(MI: BuildMI(MF, MIMD: DL, MCID: get(Opcode: AMDGPU::S_GETPC_B64), DestReg: Reg));
2492 if (Op.isGlobal())
2493 Op.setOffset(Op.getOffset() + 4);
2494 Bundler.append(
2495 MI: BuildMI(MF, MIMD: DL, MCID: get(Opcode: AMDGPU::S_ADD_U64), DestReg: Reg).addReg(RegNo: Reg).add(MO: Op));
2496
2497 finalizeBundle(MBB, FirstMI: Bundler.begin());
2498
2499 MI.eraseFromParent();
2500 break;
2501 }
2502 case AMDGPU::ENTER_STRICT_WWM: {
2503 // This only gets its own opcode so that SIPreAllocateWWMRegs can tell when
2504 // Whole Wave Mode is entered.
2505 MI.setDesc(get(Opcode: LMC.OrSaveExecOpc));
2506 break;
2507 }
2508 case AMDGPU::ENTER_STRICT_WQM: {
2509 // This only gets its own opcode so that SIPreAllocateWWMRegs can tell when
2510 // STRICT_WQM is entered.
2511 BuildMI(BB&: MBB, I&: MI, MIMD: DL, MCID: get(Opcode: LMC.MovOpc), DestReg: MI.getOperand(i: 0).getReg())
2512 .addReg(RegNo: LMC.ExecReg);
2513 BuildMI(BB&: MBB, I&: MI, MIMD: DL, MCID: get(Opcode: LMC.WQMOpc), DestReg: LMC.ExecReg).addReg(RegNo: LMC.ExecReg);
2514
2515 MI.eraseFromParent();
2516 break;
2517 }
2518 case AMDGPU::EXIT_STRICT_WWM:
2519 case AMDGPU::EXIT_STRICT_WQM: {
2520 // This only gets its own opcode so that SIPreAllocateWWMRegs can tell when
2521 // WWM/STRICT_WQM is exited.
2522 MI.setDesc(get(Opcode: LMC.MovOpc));
2523 break;
2524 }
2525 case AMDGPU::SI_RETURN: {
2526 const MachineFunction *MF = MBB.getParent();
2527 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
2528 const SIRegisterInfo *TRI = ST.getRegisterInfo();
2529 // Hiding the return address use with SI_RETURN may lead to extra kills in
2530 // the function and missing live-ins. We are fine in practice because callee
2531 // saved register handling ensures the register value is restored before
2532 // RET, but we need the undef flag here to appease the MachineVerifier
2533 // liveness checks.
2534 MachineInstrBuilder MIB =
2535 BuildMI(BB&: MBB, I&: MI, MIMD: DL, MCID: get(Opcode: AMDGPU::S_SETPC_B64_return))
2536 .addReg(RegNo: TRI->getReturnAddressReg(MF: *MF), Flags: RegState::Undef);
2537
2538 MIB.copyImplicitOps(OtherMI: MI);
2539 MI.eraseFromParent();
2540 break;
2541 }
2542
2543 case AMDGPU::S_MUL_U64_U32_PSEUDO:
2544 case AMDGPU::S_MUL_I64_I32_PSEUDO:
2545 MI.setDesc(get(Opcode: AMDGPU::S_MUL_U64));
2546 break;
2547
2548 case AMDGPU::S_GETPC_B64_pseudo:
2549 MI.setDesc(get(Opcode: AMDGPU::S_GETPC_B64));
2550 if (ST.hasGetPCZeroExtension()) {
2551 Register Dst = MI.getOperand(i: 0).getReg();
2552 Register DstHi = RI.getSubReg(Reg: Dst, Idx: AMDGPU::sub1);
2553 // Fix up hardware that does not sign-extend the 48-bit PC value by
2554 // inserting: s_sext_i32_i16 dsthi, dsthi
2555 BuildMI(BB&: MBB, I: std::next(x: MI.getIterator()), MIMD: DL, MCID: get(Opcode: AMDGPU::S_SEXT_I32_I16),
2556 DestReg: DstHi)
2557 .addReg(RegNo: DstHi);
2558 }
2559 break;
2560
2561 case AMDGPU::V_MAX_BF16_PSEUDO_e64: {
2562 assert(ST.hasBF16PackedInsts());
2563 MI.setDesc(get(Opcode: AMDGPU::V_PK_MAX_NUM_BF16));
2564 MI.addOperand(Op: MachineOperand::CreateImm(Val: 0)); // op_sel
2565 MI.addOperand(Op: MachineOperand::CreateImm(Val: 0)); // neg_lo
2566 MI.addOperand(Op: MachineOperand::CreateImm(Val: 0)); // neg_hi
2567 auto Op0 = getNamedOperand(MI, OperandName: AMDGPU::OpName::src0_modifiers);
2568 Op0->setImm(Op0->getImm() | SISrcMods::OP_SEL_1);
2569 auto Op1 = getNamedOperand(MI, OperandName: AMDGPU::OpName::src1_modifiers);
2570 Op1->setImm(Op1->getImm() | SISrcMods::OP_SEL_1);
2571 break;
2572 }
2573
2574 case AMDGPU::GET_STACK_BASE:
2575 // The stack starts at offset 0 unless we need to reserve some space at the
2576 // bottom.
2577 if (ST.getFrameLowering()->mayReserveScratchForCWSR(MF: *MBB.getParent())) {
2578 // When CWSR is used in dynamic VGPR mode, the trap handler needs to save
2579 // some of the VGPRs. The size of the required scratch space has already
2580 // been computed by prolog epilog insertion.
2581 const SIMachineFunctionInfo *MFI =
2582 MBB.getParent()->getInfo<SIMachineFunctionInfo>();
2583 unsigned VGPRSize = MFI->getScratchReservedForDynamicVGPRs();
2584 Register DestReg = MI.getOperand(i: 0).getReg();
2585 BuildMI(BB&: MBB, I&: MI, MIMD: DL, MCID: get(Opcode: AMDGPU::S_GETREG_B32), DestReg)
2586 .addImm(Val: AMDGPU::Hwreg::HwregEncoding::encode(
2587 Values: AMDGPU::Hwreg::ID_HW_ID2, Values: AMDGPU::Hwreg::OFFSET_ME_ID, Values: 2));
2588 // The MicroEngine ID is 0 for the graphics queue, and 1 or 2 for compute
2589 // (3 is unused, so we ignore it). Unfortunately, S_GETREG doesn't set
2590 // SCC, so we need to check for 0 manually.
2591 BuildMI(BB&: MBB, I&: MI, MIMD: DL, MCID: get(Opcode: AMDGPU::S_CMP_LG_U32)).addImm(Val: 0).addReg(RegNo: DestReg);
2592 // Change the implicit-def of SCC to an explicit use (but first remove
2593 // the dead flag if present).
2594 MI.getOperand(i: MI.getNumExplicitOperands()).setIsDead(false);
2595 MI.getOperand(i: MI.getNumExplicitOperands()).setIsUse();
2596 MI.setDesc(get(Opcode: AMDGPU::S_CMOVK_I32));
2597 MI.addOperand(Op: MachineOperand::CreateImm(Val: VGPRSize));
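 // On the graphics queue SCC stays clear and DestReg keeps the ME_ID value
 // read above, which is 0 there, so the stack base remains 0.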
2598 } else {
2599 MI.setDesc(get(Opcode: AMDGPU::S_MOV_B32));
2600 MI.addOperand(Op: MachineOperand::CreateImm(Val: 0));
2601 MI.removeOperand(
2602 OpNo: MI.getNumExplicitOperands()); // Drop implicit def of SCC.
2603 }
2604 break;
2605 }
2606
2607 return true;
2608}
2609
2610void SIInstrInfo::reMaterialize(MachineBasicBlock &MBB,
2611 MachineBasicBlock::iterator I, Register DestReg,
2612 unsigned SubIdx,
2613 const MachineInstr &Orig) const {
2614
2615 // Try shrinking the instruction to remat only the part needed for the
2616 // current context.
2617 // TODO: Handle more cases.
2618 unsigned Opcode = Orig.getOpcode();
2619 switch (Opcode) {
2620 case AMDGPU::S_LOAD_DWORDX16_IMM:
2621 case AMDGPU::S_LOAD_DWORDX8_IMM: {
2622 if (SubIdx != 0)
2623 break;
2624
2625 if (I == MBB.end())
2626 break;
2627
2628 if (I->isBundled())
2629 break;
2630
2631 // Look for a single use of the register that is also a subreg.
2632 Register RegToFind = Orig.getOperand(i: 0).getReg();
2633 MachineOperand *UseMO = nullptr;
2634 for (auto &CandMO : I->operands()) {
2635 if (!CandMO.isReg() || CandMO.getReg() != RegToFind || CandMO.isDef())
2636 continue;
2637 if (UseMO) {
2638 UseMO = nullptr;
2639 break;
2640 }
2641 UseMO = &CandMO;
2642 }
2643 if (!UseMO || UseMO->getSubReg() == AMDGPU::NoSubRegister)
2644 break;
2645
2646 unsigned Offset = RI.getSubRegIdxOffset(Idx: UseMO->getSubReg());
2647 unsigned SubregSize = RI.getSubRegIdxSize(Idx: UseMO->getSubReg());
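 // Note: these values are in bits; the scalar load's immediate offset and
 // memory-operand size are in bytes, hence the divisions by 8 below.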
2648
2649 MachineFunction *MF = MBB.getParent();
2650 MachineRegisterInfo &MRI = MF->getRegInfo();
2651 assert(MRI.use_nodbg_empty(DestReg) && "DestReg should have no users yet.");
2652
2653 unsigned NewOpcode = -1;
2654 if (SubregSize == 256)
2655 NewOpcode = AMDGPU::S_LOAD_DWORDX8_IMM;
2656 else if (SubregSize == 128)
2657 NewOpcode = AMDGPU::S_LOAD_DWORDX4_IMM;
2658 else
2659 break;
2660
2661 const MCInstrDesc &TID = get(Opcode: NewOpcode);
2662 const TargetRegisterClass *NewRC =
2663 RI.getAllocatableClass(RC: getRegClass(MCID: TID, OpNum: 0));
2664 MRI.setRegClass(Reg: DestReg, RC: NewRC);
2665
2666 UseMO->setReg(DestReg);
2667 UseMO->setSubReg(AMDGPU::NoSubRegister);
2668
2669 // Use a smaller load with the desired size, possibly with updated offset.
2670 MachineInstr *MI = MF->CloneMachineInstr(Orig: &Orig);
2671 MI->setDesc(TID);
2672 MI->getOperand(i: 0).setReg(DestReg);
2673 MI->getOperand(i: 0).setSubReg(AMDGPU::NoSubRegister);
2674 if (Offset) {
2675 MachineOperand *OffsetMO = getNamedOperand(MI&: *MI, OperandName: AMDGPU::OpName::offset);
2676 int64_t FinalOffset = OffsetMO->getImm() + Offset / 8;
2677 OffsetMO->setImm(FinalOffset);
2678 }
2679 SmallVector<MachineMemOperand *> NewMMOs;
2680 for (const MachineMemOperand *MemOp : Orig.memoperands())
2681 NewMMOs.push_back(Elt: MF->getMachineMemOperand(MMO: MemOp, PtrInfo: MemOp->getPointerInfo(),
2682 Size: SubregSize / 8));
2683 MI->setMemRefs(MF&: *MF, MemRefs: NewMMOs);
2684
2685 MBB.insert(I, MI);
2686 return;
2687 }
2688
2689 default:
2690 break;
2691 }
2692
2693 TargetInstrInfo::reMaterialize(MBB, MI: I, DestReg, SubIdx, Orig);
2694}
2695
2696std::pair<MachineInstr*, MachineInstr*>
2697SIInstrInfo::expandMovDPP64(MachineInstr &MI) const {
2698 assert(MI.getOpcode() == AMDGPU::V_MOV_B64_DPP_PSEUDO);
2699
2700 if (ST.hasMovB64() && ST.hasFeature(Feature: AMDGPU::FeatureDPALU_DPP) &&
2701 AMDGPU::isLegalDPALU_DPPControl(
2702 ST, DC: getNamedOperand(MI, OperandName: AMDGPU::OpName::dpp_ctrl)->getImm())) {
2703 MI.setDesc(get(Opcode: AMDGPU::V_MOV_B64_dpp));
2704 return std::pair(&MI, nullptr);
2705 }
2706
2707 MachineBasicBlock &MBB = *MI.getParent();
2708 DebugLoc DL = MBB.findDebugLoc(MBBI: MI);
2709 MachineFunction *MF = MBB.getParent();
2710 MachineRegisterInfo &MRI = MF->getRegInfo();
2711 Register Dst = MI.getOperand(i: 0).getReg();
2712 unsigned Part = 0;
2713 MachineInstr *Split[2];
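 // Split the 64-bit DPP move into two 32-bit V_MOV_B32_dpp, one per half;
 // for a virtual destination the halves are recombined with a REG_SEQUENCE.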
2714
2715 for (auto Sub : { AMDGPU::sub0, AMDGPU::sub1 }) {
2716 auto MovDPP = BuildMI(BB&: MBB, I&: MI, MIMD: DL, MCID: get(Opcode: AMDGPU::V_MOV_B32_dpp));
2717 if (Dst.isPhysical()) {
2718 MovDPP.addDef(RegNo: RI.getSubReg(Reg: Dst, Idx: Sub));
2719 } else {
2720 assert(MRI.isSSA());
2721 auto Tmp = MRI.createVirtualRegister(RegClass: &AMDGPU::VGPR_32RegClass);
2722 MovDPP.addDef(RegNo: Tmp);
2723 }
2724
2725 for (unsigned I = 1; I <= 2; ++I) { // old and src operands.
2726 const MachineOperand &SrcOp = MI.getOperand(i: I);
2727 assert(!SrcOp.isFPImm());
2728 if (SrcOp.isImm()) {
2729 APInt Imm(64, SrcOp.getImm());
2730 Imm.ashrInPlace(ShiftAmt: Part * 32);
2731 MovDPP.addImm(Val: Imm.getLoBits(numBits: 32).getZExtValue());
2732 } else {
2733 assert(SrcOp.isReg());
2734 Register Src = SrcOp.getReg();
2735 if (Src.isPhysical())
2736 MovDPP.addReg(RegNo: RI.getSubReg(Reg: Src, Idx: Sub));
2737 else
2738 MovDPP.addReg(RegNo: Src, Flags: getUndefRegState(B: SrcOp.isUndef()), SubReg: Sub);
2739 }
2740 }
2741
2742 for (const MachineOperand &MO : llvm::drop_begin(RangeOrContainer: MI.explicit_operands(), N: 3))
2743 MovDPP.addImm(Val: MO.getImm());
2744
2745 Split[Part] = MovDPP;
2746 ++Part;
2747 }
2748
2749 if (Dst.isVirtual())
2750 BuildMI(BB&: MBB, I&: MI, MIMD: DL, MCID: get(Opcode: AMDGPU::REG_SEQUENCE), DestReg: Dst)
2751 .addReg(RegNo: Split[0]->getOperand(i: 0).getReg())
2752 .addImm(Val: AMDGPU::sub0)
2753 .addReg(RegNo: Split[1]->getOperand(i: 0).getReg())
2754 .addImm(Val: AMDGPU::sub1);
2755
2756 MI.eraseFromParent();
2757 return std::pair(Split[0], Split[1]);
2758}
2759
2760std::optional<DestSourcePair>
2761SIInstrInfo::isCopyInstrImpl(const MachineInstr &MI) const {
2762 if (MI.getOpcode() == AMDGPU::WWM_COPY)
2763 return DestSourcePair{MI.getOperand(i: 0), MI.getOperand(i: 1)};
2764
2765 return std::nullopt;
2766}
2767
2768bool SIInstrInfo::swapSourceModifiers(MachineInstr &MI, MachineOperand &Src0,
2769 AMDGPU::OpName Src0OpName,
2770 MachineOperand &Src1,
2771 AMDGPU::OpName Src1OpName) const {
2772 MachineOperand *Src0Mods = getNamedOperand(MI, OperandName: Src0OpName);
2773 if (!Src0Mods)
2774 return false;
2775
2776 MachineOperand *Src1Mods = getNamedOperand(MI, OperandName: Src1OpName);
2777 assert(Src1Mods &&
2778 "All commutable instructions have both src0 and src1 modifiers");
2779
2780 int Src0ModsVal = Src0Mods->getImm();
2781 int Src1ModsVal = Src1Mods->getImm();
2782
2783 Src1Mods->setImm(Src0ModsVal);
2784 Src0Mods->setImm(Src1ModsVal);
2785 return true;
2786}
2787
2788static MachineInstr *swapRegAndNonRegOperand(MachineInstr &MI,
2789 MachineOperand &RegOp,
2790 MachineOperand &NonRegOp) {
2791 Register Reg = RegOp.getReg();
2792 unsigned SubReg = RegOp.getSubReg();
2793 bool IsKill = RegOp.isKill();
2794 bool IsDead = RegOp.isDead();
2795 bool IsUndef = RegOp.isUndef();
2796 bool IsDebug = RegOp.isDebug();
2797
2798 if (NonRegOp.isImm())
2799 RegOp.ChangeToImmediate(ImmVal: NonRegOp.getImm());
2800 else if (NonRegOp.isFI())
2801 RegOp.ChangeToFrameIndex(Idx: NonRegOp.getIndex());
2802 else if (NonRegOp.isGlobal()) {
2803 RegOp.ChangeToGA(GV: NonRegOp.getGlobal(), Offset: NonRegOp.getOffset(),
2804 TargetFlags: NonRegOp.getTargetFlags());
2805 } else
2806 return nullptr;
2807
2808 // Make sure we don't reinterpret a subreg index in the target flags.
2809 RegOp.setTargetFlags(NonRegOp.getTargetFlags());
2810
2811 NonRegOp.ChangeToRegister(Reg, isDef: false, isImp: false, isKill: IsKill, isDead: IsDead, isUndef: IsUndef, isDebug: IsDebug);
2812 NonRegOp.setSubReg(SubReg);
2813
2814 return &MI;
2815}
2816
2817static MachineInstr *swapImmOperands(MachineInstr &MI,
2818 MachineOperand &NonRegOp1,
2819 MachineOperand &NonRegOp2) {
2820 unsigned TargetFlags = NonRegOp1.getTargetFlags();
2821 int64_t NonRegVal = NonRegOp1.getImm();
2822
2823 NonRegOp1.setImm(NonRegOp2.getImm());
2824 NonRegOp2.setImm(NonRegVal);
2825 NonRegOp1.setTargetFlags(NonRegOp2.getTargetFlags());
2826 NonRegOp2.setTargetFlags(TargetFlags);
2827 return &MI;
2828}
2829
2830bool SIInstrInfo::isLegalToSwap(const MachineInstr &MI, unsigned OpIdx0,
2831 unsigned OpIdx1) const {
2832 const MCInstrDesc &InstDesc = MI.getDesc();
2833 const MCOperandInfo &OpInfo0 = InstDesc.operands()[OpIdx0];
2834 const MCOperandInfo &OpInfo1 = InstDesc.operands()[OpIdx1];
2835
2836 unsigned Opc = MI.getOpcode();
2837 int Src0Idx = AMDGPU::getNamedOperandIdx(Opcode: Opc, Name: AMDGPU::OpName::src0);
2838
2839 const MachineOperand &MO0 = MI.getOperand(i: OpIdx0);
2840 const MachineOperand &MO1 = MI.getOperand(i: OpIdx1);
2841
2842 // Check that the swap doesn't breach constant bus or literal limits.
2843 // It may move a literal to a position other than src0, which is not allowed
2844 // pre-gfx10. However, most test cases need literals in Src0 for VOP.
2845 // FIXME: After gfx9, a literal can be in a place other than Src0.
2846 if (isVALU(MI)) {
2847 if ((int)OpIdx0 == Src0Idx && !MO0.isReg() &&
2848 !isInlineConstant(MO: MO0, OpInfo: OpInfo1))
2849 return false;
2850 if ((int)OpIdx1 == Src0Idx && !MO1.isReg() &&
2851 !isInlineConstant(MO: MO1, OpInfo: OpInfo0))
2852 return false;
2853 }
2854
2855 if ((int)OpIdx1 != Src0Idx && MO0.isReg()) {
2856 if (OpInfo1.RegClass == -1)
2857 return OpInfo1.OperandType == MCOI::OPERAND_UNKNOWN;
2858 return isLegalRegOperand(MI, OpIdx: OpIdx1, MO: MO0) &&
2859 (!MO1.isReg() || isLegalRegOperand(MI, OpIdx: OpIdx0, MO: MO1));
2860 }
2861 if ((int)OpIdx0 != Src0Idx && MO1.isReg()) {
2862 if (OpInfo0.RegClass == -1)
2863 return OpInfo0.OperandType == MCOI::OPERAND_UNKNOWN;
2864 return (!MO0.isReg() || isLegalRegOperand(MI, OpIdx: OpIdx1, MO: MO0)) &&
2865 isLegalRegOperand(MI, OpIdx: OpIdx0, MO: MO1);
2866 }
2867
2868 // No need to check 64-bit literals, since swapping does not bring new
2869 // 64-bit literals into the current instruction to fold to 32-bit.
2870
2871 return isImmOperandLegal(MI, OpNo: OpIdx1, MO: MO0);
2872}
2873
2874MachineInstr *SIInstrInfo::commuteInstructionImpl(MachineInstr &MI, bool NewMI,
2875 unsigned Src0Idx,
2876 unsigned Src1Idx) const {
2877 assert(!NewMI && "this should never be used");
2878
2879 unsigned Opc = MI.getOpcode();
2880 int CommutedOpcode = commuteOpcode(Opcode: Opc);
2881 if (CommutedOpcode == -1)
2882 return nullptr;
2883
2884 if (Src0Idx > Src1Idx)
2885 std::swap(a&: Src0Idx, b&: Src1Idx);
2886
2887 assert(AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0) ==
2888 static_cast<int>(Src0Idx) &&
2889 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1) ==
2890 static_cast<int>(Src1Idx) &&
2891 "inconsistency with findCommutedOpIndices");
2892
2893 if (!isLegalToSwap(MI, OpIdx0: Src0Idx, OpIdx1: Src1Idx))
2894 return nullptr;
2895
2896 MachineInstr *CommutedMI = nullptr;
2897 MachineOperand &Src0 = MI.getOperand(i: Src0Idx);
2898 MachineOperand &Src1 = MI.getOperand(i: Src1Idx);
2899 if (Src0.isReg() && Src1.isReg()) {
2900 // Be sure to copy the source modifiers to the right place.
2901 CommutedMI =
2902 TargetInstrInfo::commuteInstructionImpl(MI, NewMI, OpIdx1: Src0Idx, OpIdx2: Src1Idx);
2903 } else if (Src0.isReg() && !Src1.isReg()) {
2904 CommutedMI = swapRegAndNonRegOperand(MI, RegOp&: Src0, NonRegOp&: Src1);
2905 } else if (!Src0.isReg() && Src1.isReg()) {
2906 CommutedMI = swapRegAndNonRegOperand(MI, RegOp&: Src1, NonRegOp&: Src0);
2907 } else if (Src0.isImm() && Src1.isImm()) {
2908 CommutedMI = swapImmOperands(MI, NonRegOp1&: Src0, NonRegOp2&: Src1);
2909 } else {
2910 // FIXME: Found two non registers to commute. This does happen.
2911 return nullptr;
2912 }
2913
2914 if (CommutedMI) {
2915 swapSourceModifiers(MI, Src0, Src0OpName: AMDGPU::OpName::src0_modifiers,
2916 Src1, Src1OpName: AMDGPU::OpName::src1_modifiers);
2917
2918 swapSourceModifiers(MI, Src0, Src0OpName: AMDGPU::OpName::src0_sel, Src1,
2919 Src1OpName: AMDGPU::OpName::src1_sel);
2920
2921 CommutedMI->setDesc(get(Opcode: CommutedOpcode));
2922 }
2923
2924 return CommutedMI;
2925}
2926
2927// This needs to be implemented because the source modifiers may be inserted
2928// between the true commutable operands, and the base
2929// TargetInstrInfo::commuteInstruction uses it.
2930bool SIInstrInfo::findCommutedOpIndices(const MachineInstr &MI,
2931 unsigned &SrcOpIdx0,
2932 unsigned &SrcOpIdx1) const {
2933 return findCommutedOpIndices(Desc: MI.getDesc(), SrcOpIdx0, SrcOpIdx1);
2934}
2935
2936bool SIInstrInfo::findCommutedOpIndices(const MCInstrDesc &Desc,
2937 unsigned &SrcOpIdx0,
2938 unsigned &SrcOpIdx1) const {
2939 if (!Desc.isCommutable())
2940 return false;
2941
2942 unsigned Opc = Desc.getOpcode();
2943 int Src0Idx = AMDGPU::getNamedOperandIdx(Opcode: Opc, Name: AMDGPU::OpName::src0);
2944 if (Src0Idx == -1)
2945 return false;
2946
2947 int Src1Idx = AMDGPU::getNamedOperandIdx(Opcode: Opc, Name: AMDGPU::OpName::src1);
2948 if (Src1Idx == -1)
2949 return false;
2950
2951 return fixCommutedOpIndices(ResultIdx1&: SrcOpIdx0, ResultIdx2&: SrcOpIdx1, CommutableOpIdx1: Src0Idx, CommutableOpIdx2: Src1Idx);
2952}
2953
2954bool SIInstrInfo::isBranchOffsetInRange(unsigned BranchOp,
2955 int64_t BrOffset) const {
  // BranchRelaxation should never have to check s_setpc_b64 or s_add_pc_i64
  // because their destination blocks are unanalyzable.
2958 assert(isSOPP(BranchOp) || isSOPK(BranchOp));
2959
2960 // Convert to dwords.
2961 BrOffset /= 4;
2962
2963 // The branch instructions do PC += signext(SIMM16 * 4) + 4, so the offset is
2964 // from the next instruction.
2965 BrOffset -= 1;
2966
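  // e.g. a forward branch of 16 bytes encodes as SIMM16 = 16/4 - 1 = 3, which
  // must fit in the signed BranchOffsetBits range.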
2967 return isIntN(N: BranchOffsetBits, x: BrOffset);
2968}
2969
2970MachineBasicBlock *
2971SIInstrInfo::getBranchDestBlock(const MachineInstr &MI) const {
2972 return MI.getOperand(i: 0).getMBB();
2973}
2974
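// Return true if \p MBB is terminated by one of the divergent control flow
// pseudos (SI_IF, SI_ELSE or SI_LOOP).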
2975bool SIInstrInfo::hasDivergentBranch(const MachineBasicBlock *MBB) const {
2976 for (const MachineInstr &MI : MBB->terminators()) {
2977 if (MI.getOpcode() == AMDGPU::SI_IF || MI.getOpcode() == AMDGPU::SI_ELSE ||
2978 MI.getOpcode() == AMDGPU::SI_LOOP)
2979 return true;
2980 }
2981 return false;
2982}
2983
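// Expand an out-of-range unconditional branch into a sequence that computes
// the destination address relative to the PC, possibly spilling a pair of
// SGPRs when no scavengeable registers are available.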
2984void SIInstrInfo::insertIndirectBranch(MachineBasicBlock &MBB,
2985 MachineBasicBlock &DestBB,
2986 MachineBasicBlock &RestoreBB,
2987 const DebugLoc &DL, int64_t BrOffset,
2988 RegScavenger *RS) const {
2989 assert(MBB.empty() &&
2990 "new block should be inserted for expanding unconditional branch");
2991 assert(MBB.pred_size() == 1);
2992 assert(RestoreBB.empty() &&
2993 "restore block should be inserted for restoring clobbered registers");
2994
2995 MachineFunction *MF = MBB.getParent();
2996 MachineRegisterInfo &MRI = MF->getRegInfo();
2997 const SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
2998 auto I = MBB.end();
2999 auto &MCCtx = MF->getContext();
3000
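  // With s_add_pc_i64 the long branch is a single PC-relative add. The 64-bit
  // offset operand is a symbol whose value is later resolved as the distance
  // from the end of the add (post_addpc) to the destination block.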
3001 if (ST.useAddPC64Inst()) {
3002 MCSymbol *Offset =
3003 MCCtx.createTempSymbol(Name: "offset", /*AlwaysAddSuffix=*/true);
3004 auto AddPC = BuildMI(BB&: MBB, I, MIMD: DL, MCID: get(Opcode: AMDGPU::S_ADD_PC_I64))
3005 .addSym(Sym: Offset, TargetFlags: MO_FAR_BRANCH_OFFSET);
3006 MCSymbol *PostAddPCLabel =
3007 MCCtx.createTempSymbol(Name: "post_addpc", /*AlwaysAddSuffix=*/true);
3008 AddPC->setPostInstrSymbol(MF&: *MF, Symbol: PostAddPCLabel);
3009 auto *OffsetExpr = MCBinaryExpr::createSub(
3010 LHS: MCSymbolRefExpr::create(Symbol: DestBB.getSymbol(), Ctx&: MCCtx),
3011 RHS: MCSymbolRefExpr::create(Symbol: PostAddPCLabel, Ctx&: MCCtx), Ctx&: MCCtx);
3012 Offset->setVariableValue(OffsetExpr);
3013 return;
3014 }
3015
3016 assert(RS && "RegScavenger required for long branching");
3017
3018 // FIXME: Virtual register workaround for RegScavenger not working with empty
3019 // blocks.
3020 Register PCReg = MRI.createVirtualRegister(RegClass: &AMDGPU::SReg_64RegClass);
3021
  // Note: as this code runs after the hazard recognizer, we need to apply
  // some hazard workarounds directly.
3024 const bool FlushSGPRWrites = (ST.isWave64() && ST.hasVALUMaskWriteHazard()) ||
3025 ST.hasVALUReadSGPRHazard();
3026 auto ApplyHazardWorkarounds = [this, &MBB, &I, &DL, FlushSGPRWrites]() {
3027 if (FlushSGPRWrites)
3028 BuildMI(BB&: MBB, I, MIMD: DL, MCID: get(Opcode: AMDGPU::S_WAITCNT_DEPCTR))
3029 .addImm(Val: AMDGPU::DepCtr::encodeFieldSaSdst(SaSdst: 0, STI: ST));
3030 };
3031
  // We need to compute the offset relative to the instruction immediately
  // after s_getpc_b64. Insert the pc arithmetic code before the last
  // terminator.
3034 MachineInstr *GetPC = BuildMI(BB&: MBB, I, MIMD: DL, MCID: get(Opcode: AMDGPU::S_GETPC_B64), DestReg: PCReg);
3035 ApplyHazardWorkarounds();
3036
3037 MCSymbol *PostGetPCLabel =
3038 MCCtx.createTempSymbol(Name: "post_getpc", /*AlwaysAddSuffix=*/true);
3039 GetPC->setPostInstrSymbol(MF&: *MF, Symbol: PostGetPCLabel);
3040
3041 MCSymbol *OffsetLo =
3042 MCCtx.createTempSymbol(Name: "offset_lo", /*AlwaysAddSuffix=*/true);
3043 MCSymbol *OffsetHi =
3044 MCCtx.createTempSymbol(Name: "offset_hi", /*AlwaysAddSuffix=*/true);
3045 BuildMI(BB&: MBB, I, MIMD: DL, MCID: get(Opcode: AMDGPU::S_ADD_U32))
3046 .addReg(RegNo: PCReg, Flags: RegState::Define, SubReg: AMDGPU::sub0)
3047 .addReg(RegNo: PCReg, Flags: {}, SubReg: AMDGPU::sub0)
3048 .addSym(Sym: OffsetLo, TargetFlags: MO_FAR_BRANCH_OFFSET);
3049 BuildMI(BB&: MBB, I, MIMD: DL, MCID: get(Opcode: AMDGPU::S_ADDC_U32))
3050 .addReg(RegNo: PCReg, Flags: RegState::Define, SubReg: AMDGPU::sub1)
3051 .addReg(RegNo: PCReg, Flags: {}, SubReg: AMDGPU::sub1)
3052 .addSym(Sym: OffsetHi, TargetFlags: MO_FAR_BRANCH_OFFSET);
3053 ApplyHazardWorkarounds();
3054
3055 // Insert the indirect branch after the other terminator.
3056 BuildMI(BB: &MBB, MIMD: DL, MCID: get(Opcode: AMDGPU::S_SETPC_B64))
3057 .addReg(RegNo: PCReg);
3058
3059 // If a spill is needed for the pc register pair, we need to insert a spill
3060 // restore block right before the destination block, and insert a short branch
3061 // into the old destination block's fallthrough predecessor.
3062 // e.g.:
3063 //
3064 // s_cbranch_scc0 skip_long_branch:
3065 //
3066 // long_branch_bb:
3067 // spill s[8:9]
3068 // s_getpc_b64 s[8:9]
3069 // s_add_u32 s8, s8, restore_bb
3070 // s_addc_u32 s9, s9, 0
3071 // s_setpc_b64 s[8:9]
3072 //
3073 // skip_long_branch:
3074 // foo;
3075 //
3076 // .....
3077 //
3078 // dest_bb_fallthrough_predecessor:
3079 // bar;
3080 // s_branch dest_bb
3081 //
3082 // restore_bb:
3083 // restore s[8:9]
3084 // fallthrough dest_bb
  //
3086 // dest_bb:
3087 // buzz;
3088
3089 Register LongBranchReservedReg = MFI->getLongBranchReservedReg();
3090 Register Scav;
3091
  // If we've previously reserved a register for long branches, avoid running
  // the scavenger and just use that register.
3094 if (LongBranchReservedReg) {
3095 RS->enterBasicBlock(MBB);
3096 Scav = LongBranchReservedReg;
3097 } else {
3098 RS->enterBasicBlockEnd(MBB);
3099 Scav = RS->scavengeRegisterBackwards(
3100 RC: AMDGPU::SReg_64RegClass, To: MachineBasicBlock::iterator(GetPC),
3101 /* RestoreAfter */ false, SPAdj: 0, /* AllowSpill */ false);
3102 }
3103 if (Scav) {
3104 RS->setRegUsed(Reg: Scav);
3105 MRI.replaceRegWith(FromReg: PCReg, ToReg: Scav);
3106 MRI.clearVirtRegs();
3107 } else {
    // Spilling an SGPR requires a VGPR, so reuse the slot of the temporary
    // VGPR for the SGPR spill.
3110 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
3111 const SIRegisterInfo *TRI = ST.getRegisterInfo();
3112 TRI->spillEmergencySGPR(MI: GetPC, RestoreMBB&: RestoreBB, SGPR: AMDGPU::SGPR0_SGPR1, RS);
3113 MRI.replaceRegWith(FromReg: PCReg, ToReg: AMDGPU::SGPR0_SGPR1);
3114 MRI.clearVirtRegs();
3115 }
3116
3117 MCSymbol *DestLabel = Scav ? DestBB.getSymbol() : RestoreBB.getSymbol();
  // Now that the destination label is known, the distance can be defined.
3119 auto *Offset = MCBinaryExpr::createSub(
3120 LHS: MCSymbolRefExpr::create(Symbol: DestLabel, Ctx&: MCCtx),
3121 RHS: MCSymbolRefExpr::create(Symbol: PostGetPCLabel, Ctx&: MCCtx), Ctx&: MCCtx);
3122 // Add offset assignments.
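  //   offset_lo = (DestLabel - post_getpc) & 0xffffffff
  //   offset_hi = (DestLabel - post_getpc) >> 32  (arithmetic shift)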
3123 auto *Mask = MCConstantExpr::create(Value: 0xFFFFFFFFULL, Ctx&: MCCtx);
3124 OffsetLo->setVariableValue(MCBinaryExpr::createAnd(LHS: Offset, RHS: Mask, Ctx&: MCCtx));
3125 auto *ShAmt = MCConstantExpr::create(Value: 32, Ctx&: MCCtx);
3126 OffsetHi->setVariableValue(MCBinaryExpr::createAShr(LHS: Offset, RHS: ShAmt, Ctx&: MCCtx));
3127}
3128
3129unsigned SIInstrInfo::getBranchOpcode(SIInstrInfo::BranchPredicate Cond) {
3130 switch (Cond) {
3131 case SIInstrInfo::SCC_TRUE:
3132 return AMDGPU::S_CBRANCH_SCC1;
3133 case SIInstrInfo::SCC_FALSE:
3134 return AMDGPU::S_CBRANCH_SCC0;
3135 case SIInstrInfo::VCCNZ:
3136 return AMDGPU::S_CBRANCH_VCCNZ;
3137 case SIInstrInfo::VCCZ:
3138 return AMDGPU::S_CBRANCH_VCCZ;
3139 case SIInstrInfo::EXECNZ:
3140 return AMDGPU::S_CBRANCH_EXECNZ;
3141 case SIInstrInfo::EXECZ:
3142 return AMDGPU::S_CBRANCH_EXECZ;
3143 default:
3144 llvm_unreachable("invalid branch predicate");
3145 }
3146}
3147
3148SIInstrInfo::BranchPredicate SIInstrInfo::getBranchPredicate(unsigned Opcode) {
3149 switch (Opcode) {
3150 case AMDGPU::S_CBRANCH_SCC0:
3151 return SCC_FALSE;
3152 case AMDGPU::S_CBRANCH_SCC1:
3153 return SCC_TRUE;
3154 case AMDGPU::S_CBRANCH_VCCNZ:
3155 return VCCNZ;
3156 case AMDGPU::S_CBRANCH_VCCZ:
3157 return VCCZ;
3158 case AMDGPU::S_CBRANCH_EXECNZ:
3159 return EXECNZ;
3160 case AMDGPU::S_CBRANCH_EXECZ:
3161 return EXECZ;
3162 default:
3163 return INVALID_BR;
3164 }
3165}
3166
3167bool SIInstrInfo::analyzeBranchImpl(MachineBasicBlock &MBB,
3168 MachineBasicBlock::iterator I,
3169 MachineBasicBlock *&TBB,
3170 MachineBasicBlock *&FBB,
3171 SmallVectorImpl<MachineOperand> &Cond,
3172 bool AllowModify) const {
3173 if (I->getOpcode() == AMDGPU::S_BRANCH) {
3174 // Unconditional Branch
3175 TBB = I->getOperand(i: 0).getMBB();
3176 return false;
3177 }
3178
3179 BranchPredicate Pred = getBranchPredicate(Opcode: I->getOpcode());
3180 if (Pred == INVALID_BR)
3181 return true;
3182
3183 MachineBasicBlock *CondBB = I->getOperand(i: 0).getMBB();
3184 Cond.push_back(Elt: MachineOperand::CreateImm(Val: Pred));
3185 Cond.push_back(Elt: I->getOperand(i: 1)); // Save the branch register.
3186
3187 ++I;
3188
3189 if (I == MBB.end()) {
3190 // Conditional branch followed by fall-through.
3191 TBB = CondBB;
3192 return false;
3193 }
3194
3195 if (I->getOpcode() == AMDGPU::S_BRANCH) {
3196 TBB = CondBB;
3197 FBB = I->getOperand(i: 0).getMBB();
3198 return false;
3199 }
3200
3201 return true;
3202}
3203
3204bool SIInstrInfo::analyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB,
3205 MachineBasicBlock *&FBB,
3206 SmallVectorImpl<MachineOperand> &Cond,
3207 bool AllowModify) const {
3208 MachineBasicBlock::iterator I = MBB.getFirstTerminator();
3209 auto E = MBB.end();
3210 if (I == E)
3211 return false;
3212
  // Skip over the instructions that are artificial terminators for special
  // exec management.
3215 while (I != E && !I->isBranch() && !I->isReturn()) {
3216 switch (I->getOpcode()) {
3217 case AMDGPU::S_MOV_B64_term:
3218 case AMDGPU::S_XOR_B64_term:
3219 case AMDGPU::S_OR_B64_term:
3220 case AMDGPU::S_ANDN2_B64_term:
3221 case AMDGPU::S_AND_B64_term:
3222 case AMDGPU::S_AND_SAVEEXEC_B64_term:
3223 case AMDGPU::S_MOV_B32_term:
3224 case AMDGPU::S_XOR_B32_term:
3225 case AMDGPU::S_OR_B32_term:
3226 case AMDGPU::S_ANDN2_B32_term:
3227 case AMDGPU::S_AND_B32_term:
3228 case AMDGPU::S_AND_SAVEEXEC_B32_term:
3229 break;
3230 case AMDGPU::SI_IF:
3231 case AMDGPU::SI_ELSE:
3232 case AMDGPU::SI_KILL_I1_TERMINATOR:
3233 case AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR:
3234 // FIXME: It's messy that these need to be considered here at all.
3235 return true;
3236 default:
3237 llvm_unreachable("unexpected non-branch terminator inst");
3238 }
3239
3240 ++I;
3241 }
3242
3243 if (I == E)
3244 return false;
3245
3246 return analyzeBranchImpl(MBB, I, TBB, FBB, Cond, AllowModify);
3247}
3248
3249unsigned SIInstrInfo::removeBranch(MachineBasicBlock &MBB,
3250 int *BytesRemoved) const {
3251 unsigned Count = 0;
3252 unsigned RemovedSize = 0;
3253 for (MachineInstr &MI : llvm::make_early_inc_range(Range: MBB.terminators())) {
3254 // Skip over artificial terminators when removing instructions.
3255 if (MI.isBranch() || MI.isReturn()) {
3256 RemovedSize += getInstSizeInBytes(MI);
3257 MI.eraseFromParent();
3258 ++Count;
3259 }
3260 }
3261
3262 if (BytesRemoved)
3263 *BytesRemoved = RemovedSize;
3264
3265 return Count;
3266}
3267
3268// Copy the flags onto the implicit condition register operand.
3269static void preserveCondRegFlags(MachineOperand &CondReg,
3270 const MachineOperand &OrigCond) {
3271 CondReg.setIsUndef(OrigCond.isUndef());
3272 CondReg.setIsKill(OrigCond.isKill());
3273}
3274
3275unsigned SIInstrInfo::insertBranch(MachineBasicBlock &MBB,
3276 MachineBasicBlock *TBB,
3277 MachineBasicBlock *FBB,
3278 ArrayRef<MachineOperand> Cond,
3279 const DebugLoc &DL,
3280 int *BytesAdded) const {
3281 if (!FBB && Cond.empty()) {
3282 BuildMI(BB: &MBB, MIMD: DL, MCID: get(Opcode: AMDGPU::S_BRANCH))
3283 .addMBB(MBB: TBB);
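    // On subtargets with the offset-0x3f hardware bug, the branch may need a
    // workaround nop, so count 8 bytes instead of 4.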
3284 if (BytesAdded)
3285 *BytesAdded = ST.hasOffset3fBug() ? 8 : 4;
3286 return 1;
3287 }
3288
3289 assert(TBB && Cond[0].isImm());
3290
3291 unsigned Opcode
3292 = getBranchOpcode(Cond: static_cast<BranchPredicate>(Cond[0].getImm()));
3293
3294 if (!FBB) {
3295 MachineInstr *CondBr =
3296 BuildMI(BB: &MBB, MIMD: DL, MCID: get(Opcode))
3297 .addMBB(MBB: TBB);
3298
3299 // Copy the flags onto the implicit condition register operand.
3300 preserveCondRegFlags(CondReg&: CondBr->getOperand(i: 1), OrigCond: Cond[1]);
3301 fixImplicitOperands(MI&: *CondBr);
3302
3303 if (BytesAdded)
3304 *BytesAdded = ST.hasOffset3fBug() ? 8 : 4;
3305 return 1;
3306 }
3307
3308 assert(TBB && FBB);
3309
3310 MachineInstr *CondBr =
3311 BuildMI(BB: &MBB, MIMD: DL, MCID: get(Opcode))
3312 .addMBB(MBB: TBB);
3313 fixImplicitOperands(MI&: *CondBr);
3314 BuildMI(BB: &MBB, MIMD: DL, MCID: get(Opcode: AMDGPU::S_BRANCH))
3315 .addMBB(MBB: FBB);
3316
3317 MachineOperand &CondReg = CondBr->getOperand(i: 1);
3318 CondReg.setIsUndef(Cond[1].isUndef());
3319 CondReg.setIsKill(Cond[1].isKill());
3320
3321 if (BytesAdded)
3322 *BytesAdded = ST.hasOffset3fBug() ? 16 : 8;
3323
3324 return 2;
3325}
3326
3327bool SIInstrInfo::reverseBranchCondition(
3328 SmallVectorImpl<MachineOperand> &Cond) const {
3329 if (Cond.size() != 2) {
3330 return true;
3331 }
3332
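  // BranchPredicate values are arranged so that the inverse of each predicate
  // is its negation (see also insertSelect), so reversing the condition is
  // just a matter of negating the immediate.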
3333 if (Cond[0].isImm()) {
3334 Cond[0].setImm(-Cond[0].getImm());
3335 return false;
3336 }
3337
3338 return true;
3339}
3340
3341bool SIInstrInfo::canInsertSelect(const MachineBasicBlock &MBB,
3342 ArrayRef<MachineOperand> Cond,
3343 Register DstReg, Register TrueReg,
3344 Register FalseReg, int &CondCycles,
3345 int &TrueCycles, int &FalseCycles) const {
3346 switch (Cond[0].getImm()) {
3347 case VCCNZ:
3348 case VCCZ: {
3349 const MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
3350 const TargetRegisterClass *RC = MRI.getRegClass(Reg: TrueReg);
3351 if (MRI.getRegClass(Reg: FalseReg) != RC)
3352 return false;
3353
3354 int NumInsts = AMDGPU::getRegBitWidth(RC: *RC) / 32;
3355 CondCycles = TrueCycles = FalseCycles = NumInsts; // ???
3356
3357 // Limit to equal cost for branch vs. N v_cndmask_b32s.
3358 return RI.hasVGPRs(RC) && NumInsts <= 6;
3359 }
3360 case SCC_TRUE:
3361 case SCC_FALSE: {
3362 // FIXME: We could insert for VGPRs if we could replace the original compare
3363 // with a vector one.
3364 const MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
3365 const TargetRegisterClass *RC = MRI.getRegClass(Reg: TrueReg);
3366 if (MRI.getRegClass(Reg: FalseReg) != RC)
3367 return false;
3368
3369 int NumInsts = AMDGPU::getRegBitWidth(RC: *RC) / 32;
3370
    // Registers whose size is a multiple of 64 bits can use s_cselect_b64,
    // halving the instruction count.
3372 if (NumInsts % 2 == 0)
3373 NumInsts /= 2;
3374
3375 CondCycles = TrueCycles = FalseCycles = NumInsts; // ???
3376 return RI.isSGPRClass(RC);
3377 }
3378 default:
3379 return false;
3380 }
3381}
3382
3383void SIInstrInfo::insertSelect(MachineBasicBlock &MBB,
3384 MachineBasicBlock::iterator I, const DebugLoc &DL,
3385 Register DstReg, ArrayRef<MachineOperand> Cond,
3386 Register TrueReg, Register FalseReg) const {
3387 BranchPredicate Pred = static_cast<BranchPredicate>(Cond[0].getImm());
3388 if (Pred == VCCZ || Pred == SCC_FALSE) {
3389 Pred = static_cast<BranchPredicate>(-Pred);
3390 std::swap(a&: TrueReg, b&: FalseReg);
3391 }
3392
3393 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
3394 const TargetRegisterClass *DstRC = MRI.getRegClass(Reg: DstReg);
3395 unsigned DstSize = RI.getRegSizeInBits(RC: *DstRC);
3396
3397 if (DstSize == 32) {
3398 MachineInstr *Select;
3399 if (Pred == SCC_TRUE) {
3400 Select = BuildMI(BB&: MBB, I, MIMD: DL, MCID: get(Opcode: AMDGPU::S_CSELECT_B32), DestReg: DstReg)
3401 .addReg(RegNo: TrueReg)
3402 .addReg(RegNo: FalseReg);
3403 } else {
3404 // Instruction's operands are backwards from what is expected.
3405 Select = BuildMI(BB&: MBB, I, MIMD: DL, MCID: get(Opcode: AMDGPU::V_CNDMASK_B32_e32), DestReg: DstReg)
3406 .addReg(RegNo: FalseReg)
3407 .addReg(RegNo: TrueReg);
3408 }
3409
3410 preserveCondRegFlags(CondReg&: Select->getOperand(i: 3), OrigCond: Cond[1]);
3411 return;
3412 }
3413
3414 if (DstSize == 64 && Pred == SCC_TRUE) {
3415 MachineInstr *Select =
3416 BuildMI(BB&: MBB, I, MIMD: DL, MCID: get(Opcode: AMDGPU::S_CSELECT_B64), DestReg: DstReg)
3417 .addReg(RegNo: TrueReg)
3418 .addReg(RegNo: FalseReg);
3419
3420 preserveCondRegFlags(CondReg&: Select->getOperand(i: 3), OrigCond: Cond[1]);
3421 return;
3422 }
3423
3424 static const int16_t Sub0_15[] = {
3425 AMDGPU::sub0, AMDGPU::sub1, AMDGPU::sub2, AMDGPU::sub3,
3426 AMDGPU::sub4, AMDGPU::sub5, AMDGPU::sub6, AMDGPU::sub7,
3427 AMDGPU::sub8, AMDGPU::sub9, AMDGPU::sub10, AMDGPU::sub11,
3428 AMDGPU::sub12, AMDGPU::sub13, AMDGPU::sub14, AMDGPU::sub15,
3429 };
3430
3431 static const int16_t Sub0_15_64[] = {
3432 AMDGPU::sub0_sub1, AMDGPU::sub2_sub3,
3433 AMDGPU::sub4_sub5, AMDGPU::sub6_sub7,
3434 AMDGPU::sub8_sub9, AMDGPU::sub10_sub11,
3435 AMDGPU::sub12_sub13, AMDGPU::sub14_sub15,
3436 };
3437
3438 unsigned SelOp = AMDGPU::V_CNDMASK_B32_e32;
3439 const TargetRegisterClass *EltRC = &AMDGPU::VGPR_32RegClass;
3440 const int16_t *SubIndices = Sub0_15;
3441 int NElts = DstSize / 32;
3442
3443 // 64-bit select is only available for SALU.
3444 // TODO: Split 96-bit into 64-bit and 32-bit, not 3x 32-bit.
3445 if (Pred == SCC_TRUE) {
3446 if (NElts % 2) {
3447 SelOp = AMDGPU::S_CSELECT_B32;
3448 EltRC = &AMDGPU::SGPR_32RegClass;
3449 } else {
3450 SelOp = AMDGPU::S_CSELECT_B64;
3451 EltRC = &AMDGPU::SGPR_64RegClass;
3452 SubIndices = Sub0_15_64;
3453 NElts /= 2;
3454 }
3455 }
3456
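  // Build the result as a REG_SEQUENCE of per-element selects; e.g. a 128-bit
  // VGPR select expands to four v_cndmask_b32 instructions feeding a single
  // REG_SEQUENCE.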
3457 MachineInstrBuilder MIB = BuildMI(
3458 BB&: MBB, I, MIMD: DL, MCID: get(Opcode: AMDGPU::REG_SEQUENCE), DestReg: DstReg);
3459
3460 I = MIB->getIterator();
3461
3462 SmallVector<Register, 8> Regs;
3463 for (int Idx = 0; Idx != NElts; ++Idx) {
3464 Register DstElt = MRI.createVirtualRegister(RegClass: EltRC);
3465 Regs.push_back(Elt: DstElt);
3466
3467 unsigned SubIdx = SubIndices[Idx];
3468
3469 MachineInstr *Select;
3470 if (SelOp == AMDGPU::V_CNDMASK_B32_e32) {
3471 Select = BuildMI(BB&: MBB, I, MIMD: DL, MCID: get(Opcode: SelOp), DestReg: DstElt)
3472 .addReg(RegNo: FalseReg, Flags: {}, SubReg: SubIdx)
3473 .addReg(RegNo: TrueReg, Flags: {}, SubReg: SubIdx);
3474 } else {
3475 Select = BuildMI(BB&: MBB, I, MIMD: DL, MCID: get(Opcode: SelOp), DestReg: DstElt)
3476 .addReg(RegNo: TrueReg, Flags: {}, SubReg: SubIdx)
3477 .addReg(RegNo: FalseReg, Flags: {}, SubReg: SubIdx);
3478 }
3479
3480 preserveCondRegFlags(CondReg&: Select->getOperand(i: 3), OrigCond: Cond[1]);
3481 fixImplicitOperands(MI&: *Select);
3482
3483 MIB.addReg(RegNo: DstElt)
3484 .addImm(Val: SubIdx);
3485 }
3486}
3487
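// Return true if \p MI is a move or copy whose source operand may be folded
// directly into its users.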
3488bool SIInstrInfo::isFoldableCopy(const MachineInstr &MI) {
3489 switch (MI.getOpcode()) {
3490 case AMDGPU::V_MOV_B16_t16_e32:
3491 case AMDGPU::V_MOV_B16_t16_e64:
3492 case AMDGPU::V_MOV_B32_e32:
3493 case AMDGPU::V_MOV_B32_e64:
3494 case AMDGPU::V_MOV_B64_PSEUDO:
3495 case AMDGPU::V_MOV_B64_e32:
3496 case AMDGPU::V_MOV_B64_e64:
3497 case AMDGPU::S_MOV_B32:
3498 case AMDGPU::S_MOV_B64:
3499 case AMDGPU::S_MOV_B64_IMM_PSEUDO:
3500 case AMDGPU::COPY:
3501 case AMDGPU::WWM_COPY:
3502 case AMDGPU::V_ACCVGPR_WRITE_B32_e64:
3503 case AMDGPU::V_ACCVGPR_READ_B32_e64:
3504 case AMDGPU::V_ACCVGPR_MOV_B32:
3505 case AMDGPU::AV_MOV_B32_IMM_PSEUDO:
3506 case AMDGPU::AV_MOV_B64_IMM_PSEUDO:
3507 return true;
3508 default:
3509 return false;
3510 }
3511}
3512
3513unsigned SIInstrInfo::getFoldableCopySrcIdx(const MachineInstr &MI) {
3514 switch (MI.getOpcode()) {
3515 case AMDGPU::V_MOV_B16_t16_e32:
3516 case AMDGPU::V_MOV_B16_t16_e64:
3517 return 2;
3518 case AMDGPU::V_MOV_B32_e32:
3519 case AMDGPU::V_MOV_B32_e64:
3520 case AMDGPU::V_MOV_B64_PSEUDO:
3521 case AMDGPU::V_MOV_B64_e32:
3522 case AMDGPU::V_MOV_B64_e64:
3523 case AMDGPU::S_MOV_B32:
3524 case AMDGPU::S_MOV_B64:
3525 case AMDGPU::S_MOV_B64_IMM_PSEUDO:
3526 case AMDGPU::COPY:
3527 case AMDGPU::WWM_COPY:
3528 case AMDGPU::V_ACCVGPR_WRITE_B32_e64:
3529 case AMDGPU::V_ACCVGPR_READ_B32_e64:
3530 case AMDGPU::V_ACCVGPR_MOV_B32:
3531 case AMDGPU::AV_MOV_B32_IMM_PSEUDO:
3532 case AMDGPU::AV_MOV_B64_IMM_PSEUDO:
3533 return 1;
3534 default:
3535 llvm_unreachable("MI is not a foldable copy");
3536 }
3537}
3538
3539static constexpr AMDGPU::OpName ModifierOpNames[] = {
3540 AMDGPU::OpName::src0_modifiers, AMDGPU::OpName::src1_modifiers,
3541 AMDGPU::OpName::src2_modifiers, AMDGPU::OpName::clamp,
3542 AMDGPU::OpName::omod, AMDGPU::OpName::op_sel};
3543
3544void SIInstrInfo::removeModOperands(MachineInstr &MI) const {
3545 unsigned Opc = MI.getOpcode();
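  // Remove in reverse operand order so that the indices computed from the
  // static instruction description remain valid while operands are deleted.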
3546 for (AMDGPU::OpName Name : reverse(C: ModifierOpNames)) {
3547 int Idx = AMDGPU::getNamedOperandIdx(Opcode: Opc, Name);
3548 if (Idx >= 0)
3549 MI.removeOperand(OpNo: Idx);
3550 }
3551}
3552
3553void SIInstrInfo::mutateAndCleanupImplicit(MachineInstr &MI,
3554 const MCInstrDesc &NewDesc) const {
3555 MI.setDesc(NewDesc);
3556
3557 // Remove any leftover implicit operands from mutating the instruction. e.g.
3558 // if we replace an s_and_b32 with a copy, we don't need the implicit scc def
3559 // anymore.
3560 const MCInstrDesc &Desc = MI.getDesc();
3561 unsigned NumOps = Desc.getNumOperands() + Desc.implicit_uses().size() +
3562 Desc.implicit_defs().size();
3563
3564 for (unsigned I = MI.getNumOperands() - 1; I >= NumOps; --I)
3565 MI.removeOperand(OpNo: I);
3566}
3567
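// Extract the bits of \p Imm selected by the subregister index \p SubRegIndex
// and sign-extend them to 64 bits, e.g. for Imm = 0x123456789ABCDEF0, sub0
// yields 0xFFFFFFFF9ABCDEF0 and sub1 yields 0x12345678.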
3568std::optional<int64_t> SIInstrInfo::extractSubregFromImm(int64_t Imm,
3569 unsigned SubRegIndex) {
3570 switch (SubRegIndex) {
3571 case AMDGPU::NoSubRegister:
3572 return Imm;
3573 case AMDGPU::sub0:
3574 return SignExtend64<32>(x: Imm);
3575 case AMDGPU::sub1:
3576 return SignExtend64<32>(x: Imm >> 32);
3577 case AMDGPU::lo16:
3578 return SignExtend64<16>(x: Imm);
3579 case AMDGPU::hi16:
3580 return SignExtend64<16>(x: Imm >> 16);
3581 case AMDGPU::sub1_lo16:
3582 return SignExtend64<16>(x: Imm >> 32);
3583 case AMDGPU::sub1_hi16:
3584 return SignExtend64<16>(x: Imm >> 48);
3585 default:
3586 return std::nullopt;
3587 }
3588
3589 llvm_unreachable("covered subregister switch");
3590}
3591
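// Map a MAC/MAD/FMAC/FMA opcode to the corresponding madak/fmaak form, which
// takes the addend as a literal constant: d = s0 * s1 + K.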
3592static unsigned getNewFMAAKInst(const GCNSubtarget &ST, unsigned Opc) {
3593 switch (Opc) {
3594 case AMDGPU::V_MAC_F16_e32:
3595 case AMDGPU::V_MAC_F16_e64:
3596 case AMDGPU::V_MAD_F16_e64:
3597 return AMDGPU::V_MADAK_F16;
3598 case AMDGPU::V_MAC_F32_e32:
3599 case AMDGPU::V_MAC_F32_e64:
3600 case AMDGPU::V_MAD_F32_e64:
3601 return AMDGPU::V_MADAK_F32;
3602 case AMDGPU::V_FMAC_F32_e32:
3603 case AMDGPU::V_FMAC_F32_e64:
3604 case AMDGPU::V_FMA_F32_e64:
3605 return AMDGPU::V_FMAAK_F32;
3606 case AMDGPU::V_FMAC_F16_e32:
3607 case AMDGPU::V_FMAC_F16_e64:
3608 case AMDGPU::V_FMAC_F16_t16_e64:
3609 case AMDGPU::V_FMAC_F16_fake16_e64:
3610 case AMDGPU::V_FMAC_F16_t16_e32:
3611 case AMDGPU::V_FMAC_F16_fake16_e32:
3612 case AMDGPU::V_FMA_F16_e64:
3613 return ST.hasTrue16BitInsts() ? ST.useRealTrue16Insts()
3614 ? AMDGPU::V_FMAAK_F16_t16
3615 : AMDGPU::V_FMAAK_F16_fake16
3616 : AMDGPU::V_FMAAK_F16;
3617 case AMDGPU::V_FMAC_F64_e32:
3618 case AMDGPU::V_FMAC_F64_e64:
3619 case AMDGPU::V_FMA_F64_e64:
3620 return AMDGPU::V_FMAAK_F64;
3621 default:
3622 llvm_unreachable("invalid instruction");
3623 }
3624}
3625
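// Map a MAC/MAD/FMAC/FMA opcode to the corresponding madmk/fmamk form, which
// takes one multiplicand as a literal constant: d = s0 * K + s1.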
3626static unsigned getNewFMAMKInst(const GCNSubtarget &ST, unsigned Opc) {
3627 switch (Opc) {
3628 case AMDGPU::V_MAC_F16_e32:
3629 case AMDGPU::V_MAC_F16_e64:
3630 case AMDGPU::V_MAD_F16_e64:
3631 return AMDGPU::V_MADMK_F16;
3632 case AMDGPU::V_MAC_F32_e32:
3633 case AMDGPU::V_MAC_F32_e64:
3634 case AMDGPU::V_MAD_F32_e64:
3635 return AMDGPU::V_MADMK_F32;
3636 case AMDGPU::V_FMAC_F32_e32:
3637 case AMDGPU::V_FMAC_F32_e64:
3638 case AMDGPU::V_FMA_F32_e64:
3639 return AMDGPU::V_FMAMK_F32;
3640 case AMDGPU::V_FMAC_F16_e32:
3641 case AMDGPU::V_FMAC_F16_e64:
3642 case AMDGPU::V_FMAC_F16_t16_e64:
3643 case AMDGPU::V_FMAC_F16_fake16_e64:
3644 case AMDGPU::V_FMAC_F16_t16_e32:
3645 case AMDGPU::V_FMAC_F16_fake16_e32:
3646 case AMDGPU::V_FMA_F16_e64:
3647 return ST.hasTrue16BitInsts() ? ST.useRealTrue16Insts()
3648 ? AMDGPU::V_FMAMK_F16_t16
3649 : AMDGPU::V_FMAMK_F16_fake16
3650 : AMDGPU::V_FMAMK_F16;
3651 case AMDGPU::V_FMAC_F64_e32:
3652 case AMDGPU::V_FMAC_F64_e64:
3653 case AMDGPU::V_FMA_F64_e64:
3654 return AMDGPU::V_FMAMK_F64;
3655 default:
3656 llvm_unreachable("invalid instruction");
3657 }
3658}
3659
3660bool SIInstrInfo::foldImmediate(MachineInstr &UseMI, MachineInstr &DefMI,
3661 Register Reg, MachineRegisterInfo *MRI) const {
3662 int64_t Imm;
3663 if (!getConstValDefinedInReg(MI: DefMI, Reg, ImmVal&: Imm))
3664 return false;
3665
3666 const bool HasMultipleUses = !MRI->hasOneNonDBGUse(RegNo: Reg);
3667
3668 assert(!DefMI.getOperand(0).getSubReg() && "Expected SSA form");
3669
3670 unsigned Opc = UseMI.getOpcode();
3671 if (Opc == AMDGPU::COPY) {
3672 assert(!UseMI.getOperand(0).getSubReg() && "Expected SSA form");
3673
3674 Register DstReg = UseMI.getOperand(i: 0).getReg();
3675 Register UseSubReg = UseMI.getOperand(i: 1).getSubReg();
3676
3677 const TargetRegisterClass *DstRC = RI.getRegClassForReg(MRI: *MRI, Reg: DstReg);
3678
3679 if (HasMultipleUses) {
      // TODO: This should fold in more cases with multiple uses, but we need
      // to consider more carefully what those uses are.
3682 unsigned ImmDefSize = RI.getRegSizeInBits(RC: *MRI->getRegClass(Reg));
3683
3684 // Avoid breaking up a 64-bit inline immediate into a subregister extract.
3685 if (UseSubReg != AMDGPU::NoSubRegister && ImmDefSize == 64)
3686 return false;
3687
3688 // Most of the time folding a 32-bit inline constant is free (though this
3689 // might not be true if we can't later fold it into a real user).
3690 //
3691 // FIXME: This isInlineConstant check is imprecise if
3692 // getConstValDefinedInReg handled the tricky non-mov cases.
3693 if (ImmDefSize == 32 &&
3694 !isInlineConstant(ImmVal: Imm, OperandType: AMDGPU::OPERAND_REG_IMM_INT32))
3695 return false;
3696 }
3697
3698 bool Is16Bit = UseSubReg != AMDGPU::NoSubRegister &&
3699 RI.getSubRegIdxSize(Idx: UseSubReg) == 16;
3700
3701 if (Is16Bit) {
3702 if (RI.hasVGPRs(RC: DstRC))
3703 return false; // Do not clobber vgpr_hi16
3704
3705 if (DstReg.isVirtual() && UseSubReg != AMDGPU::lo16)
3706 return false;
3707 }
3708
3709 MachineFunction *MF = UseMI.getMF();
3710
3711 unsigned NewOpc = AMDGPU::INSTRUCTION_LIST_END;
3712 MCRegister MovDstPhysReg =
3713 DstReg.isPhysical() ? DstReg.asMCReg() : MCRegister();
3714
3715 std::optional<int64_t> SubRegImm = extractSubregFromImm(Imm, SubRegIndex: UseSubReg);
3716
3717 // TODO: Try to fold with AMDGPU::V_MOV_B16_t16_e64
3718 for (unsigned MovOp :
3719 {AMDGPU::S_MOV_B32, AMDGPU::V_MOV_B32_e32, AMDGPU::S_MOV_B64,
3720 AMDGPU::V_MOV_B64_PSEUDO, AMDGPU::V_ACCVGPR_WRITE_B32_e64}) {
3721 const MCInstrDesc &MovDesc = get(Opcode: MovOp);
3722
3723 const TargetRegisterClass *MovDstRC = getRegClass(MCID: MovDesc, OpNum: 0);
3724 if (Is16Bit) {
3725 // We just need to find a correctly sized register class, so the
3726 // subregister index compatibility doesn't matter since we're statically
3727 // extracting the immediate value.
3728 MovDstRC = RI.getMatchingSuperRegClass(A: MovDstRC, B: DstRC, Idx: AMDGPU::lo16);
3729 if (!MovDstRC)
3730 continue;
3731
3732 if (MovDstPhysReg) {
3733 // FIXME: We probably should not do this. If there is a live value in
3734 // the high half of the register, it will be corrupted.
3735 MovDstPhysReg =
3736 RI.getMatchingSuperReg(Reg: MovDstPhysReg, SubIdx: AMDGPU::lo16, RC: MovDstRC);
3737 if (!MovDstPhysReg)
3738 continue;
3739 }
3740 }
3741
3742 // Result class isn't the right size, try the next instruction.
3743 if (MovDstPhysReg) {
3744 if (!MovDstRC->contains(Reg: MovDstPhysReg))
3745 return false;
3746 } else if (!MRI->constrainRegClass(Reg: DstReg, RC: MovDstRC)) {
3747 // TODO: This will be overly conservative in the case of 16-bit virtual
3748 // SGPRs. We could hack up the virtual register uses to use a compatible
3749 // 32-bit class.
3750 continue;
3751 }
3752
3753 const MCOperandInfo &OpInfo = MovDesc.operands()[1];
3754
3755 // Ensure the interpreted immediate value is a valid operand in the new
3756 // mov.
3757 //
3758 // FIXME: isImmOperandLegal should have form that doesn't require existing
3759 // MachineInstr or MachineOperand
3760 if (!RI.opCanUseLiteralConstant(OpType: OpInfo.OperandType) &&
3761 !isInlineConstant(ImmVal: *SubRegImm, OperandType: OpInfo.OperandType))
3762 break;
3763
3764 NewOpc = MovOp;
3765 break;
3766 }
3767
3768 if (NewOpc == AMDGPU::INSTRUCTION_LIST_END)
3769 return false;
3770
3771 if (Is16Bit) {
3772 UseMI.getOperand(i: 0).setSubReg(AMDGPU::NoSubRegister);
3773 if (MovDstPhysReg)
3774 UseMI.getOperand(i: 0).setReg(MovDstPhysReg);
3775 assert(UseMI.getOperand(1).getReg().isVirtual());
3776 }
3777
3778 const MCInstrDesc &NewMCID = get(Opcode: NewOpc);
3779 UseMI.setDesc(NewMCID);
3780 UseMI.getOperand(i: 1).ChangeToImmediate(ImmVal: *SubRegImm);
3781 UseMI.addImplicitDefUseOperands(MF&: *MF);
3782 return true;
3783 }
3784
3785 if (HasMultipleUses)
3786 return false;
3787
3788 if (Opc == AMDGPU::V_MAD_F32_e64 || Opc == AMDGPU::V_MAC_F32_e64 ||
3789 Opc == AMDGPU::V_MAD_F16_e64 || Opc == AMDGPU::V_MAC_F16_e64 ||
3790 Opc == AMDGPU::V_FMA_F32_e64 || Opc == AMDGPU::V_FMAC_F32_e64 ||
3791 Opc == AMDGPU::V_FMA_F16_e64 || Opc == AMDGPU::V_FMAC_F16_e64 ||
3792 Opc == AMDGPU::V_FMAC_F16_t16_e64 ||
3793 Opc == AMDGPU::V_FMAC_F16_fake16_e64 || Opc == AMDGPU::V_FMA_F64_e64 ||
3794 Opc == AMDGPU::V_FMAC_F64_e64) {
3795 // Don't fold if we are using source or output modifiers. The new VOP2
3796 // instructions don't have them.
3797 if (hasAnyModifiersSet(MI: UseMI))
3798 return false;
3799
3800 // If this is a free constant, there's no reason to do this.
3801 // TODO: We could fold this here instead of letting SIFoldOperands do it
3802 // later.
3803 int Src0Idx = getNamedOperandIdx(Opcode: UseMI.getOpcode(), Name: AMDGPU::OpName::src0);
3804
3805 // Any src operand can be used for the legality check.
3806 if (isInlineConstant(MI: UseMI, OpIdx: Src0Idx, ImmVal: Imm))
3807 return false;
3808
3809 MachineOperand *Src0 = &UseMI.getOperand(i: Src0Idx);
3810
3811 MachineOperand *Src1 = getNamedOperand(MI&: UseMI, OperandName: AMDGPU::OpName::src1);
3812 MachineOperand *Src2 = getNamedOperand(MI&: UseMI, OperandName: AMDGPU::OpName::src2);
3813
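    // If \p NewRC is a subclass of the operand's current register class,
    // insert a COPY into a fresh register of \p NewRC and rewrite the operand
    // to use it. This is used below when rewriting to the t16/fake16 forms,
    // whose operands require 16-bit register classes.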
3814 auto CopyRegOperandToNarrowerRC =
3815 [MRI, this](MachineInstr &MI, unsigned OpNo,
3816 const TargetRegisterClass *NewRC) -> void {
3817 if (!MI.getOperand(i: OpNo).isReg())
3818 return;
3819 Register Reg = MI.getOperand(i: OpNo).getReg();
3820 const TargetRegisterClass *RC = RI.getRegClassForReg(MRI: *MRI, Reg);
3821 if (RI.getCommonSubClass(A: RC, B: NewRC) != NewRC)
3822 return;
3823 Register Tmp = MRI->createVirtualRegister(RegClass: NewRC);
3824 BuildMI(BB&: *MI.getParent(), I: MI.getIterator(), MIMD: MI.getDebugLoc(),
3825 MCID: get(Opcode: AMDGPU::COPY), DestReg: Tmp)
3826 .addReg(RegNo: Reg);
3827 MI.getOperand(i: OpNo).setReg(Tmp);
3828 MI.getOperand(i: OpNo).setIsKill();
3829 };
3830
3831 // Multiplied part is the constant: Use v_madmk_{f16, f32}.
3832 if ((Src0->isReg() && Src0->getReg() == Reg) ||
3833 (Src1->isReg() && Src1->getReg() == Reg)) {
3834 MachineOperand *RegSrc =
3835 Src1->isReg() && Src1->getReg() == Reg ? Src0 : Src1;
3836 if (!RegSrc->isReg())
3837 return false;
3838 if (RI.isSGPRClass(RC: MRI->getRegClass(Reg: RegSrc->getReg())) &&
3839 ST.getConstantBusLimit(Opcode: Opc) < 2)
3840 return false;
3841
3842 if (!Src2->isReg() || RI.isSGPRClass(RC: MRI->getRegClass(Reg: Src2->getReg())))
3843 return false;
3844
3845 // If src2 is also a literal constant then we have to choose which one to
3846 // fold. In general it is better to choose madak so that the other literal
3847 // can be materialized in an sgpr instead of a vgpr:
3848 // s_mov_b32 s0, literal
3849 // v_madak_f32 v0, s0, v0, literal
3850 // Instead of:
3851 // v_mov_b32 v1, literal
3852 // v_madmk_f32 v0, v0, literal, v1
3853 MachineInstr *Def = MRI->getUniqueVRegDef(Reg: Src2->getReg());
3854 if (Def && Def->isMoveImmediate() &&
3855 !isInlineConstant(MO: Def->getOperand(i: 1)))
3856 return false;
3857
3858 unsigned NewOpc = getNewFMAMKInst(ST, Opc);
3859 if (pseudoToMCOpcode(Opcode: NewOpc) == -1)
3860 return false;
3861
3862 const std::optional<int64_t> SubRegImm = extractSubregFromImm(
3863 Imm, SubRegIndex: RegSrc == Src1 ? Src0->getSubReg() : Src1->getSubReg());
3864
3865 // FIXME: This would be a lot easier if we could return a new instruction
3866 // instead of having to modify in place.
3867
3868 Register SrcReg = RegSrc->getReg();
3869 unsigned SrcSubReg = RegSrc->getSubReg();
3870 Src0->setReg(SrcReg);
3871 Src0->setSubReg(SrcSubReg);
3872 Src0->setIsKill(RegSrc->isKill());
3873
3874 if (Opc == AMDGPU::V_MAC_F32_e64 || Opc == AMDGPU::V_MAC_F16_e64 ||
3875 Opc == AMDGPU::V_FMAC_F32_e64 || Opc == AMDGPU::V_FMAC_F16_t16_e64 ||
3876 Opc == AMDGPU::V_FMAC_F16_fake16_e64 ||
3877 Opc == AMDGPU::V_FMAC_F16_e64 || Opc == AMDGPU::V_FMAC_F64_e64)
3878 UseMI.untieRegOperand(
3879 OpIdx: AMDGPU::getNamedOperandIdx(Opcode: Opc, Name: AMDGPU::OpName::src2));
3880
3881 Src1->ChangeToImmediate(ImmVal: *SubRegImm);
3882
3883 removeModOperands(MI&: UseMI);
3884 UseMI.setDesc(get(Opcode: NewOpc));
3885
3886 if (NewOpc == AMDGPU::V_FMAMK_F16_t16 ||
3887 NewOpc == AMDGPU::V_FMAMK_F16_fake16) {
3888 const TargetRegisterClass *NewRC = getRegClass(MCID: get(Opcode: NewOpc), OpNum: 0);
3889 Register Tmp = MRI->createVirtualRegister(RegClass: NewRC);
3890 BuildMI(BB&: *UseMI.getParent(), I: std::next(x: UseMI.getIterator()),
3891 MIMD: UseMI.getDebugLoc(), MCID: get(Opcode: AMDGPU::COPY),
3892 DestReg: UseMI.getOperand(i: 0).getReg())
3893 .addReg(RegNo: Tmp, Flags: RegState::Kill);
3894 UseMI.getOperand(i: 0).setReg(Tmp);
3895 CopyRegOperandToNarrowerRC(UseMI, 1, NewRC);
3896 CopyRegOperandToNarrowerRC(UseMI, 3, NewRC);
3897 }
3898
3899 bool DeleteDef = MRI->use_nodbg_empty(RegNo: Reg);
3900 if (DeleteDef)
3901 DefMI.eraseFromParent();
3902
3903 return true;
3904 }
3905
3906 // Added part is the constant: Use v_madak_{f16, f32}.
3907 if (Src2->isReg() && Src2->getReg() == Reg) {
3908 if (ST.getConstantBusLimit(Opcode: Opc) < 2) {
3909 // Not allowed to use constant bus for another operand.
3910 // We can however allow an inline immediate as src0.
3911 bool Src0Inlined = false;
3912 if (Src0->isReg()) {
          // Try to inline the constant if possible.
          // If the def is a move-immediate and this is its only use, we save
          // a VGPR here.
3916 MachineInstr *Def = MRI->getUniqueVRegDef(Reg: Src0->getReg());
3917 if (Def && Def->isMoveImmediate() &&
3918 isInlineConstant(MO: Def->getOperand(i: 1)) &&
3919 MRI->hasOneNonDBGUse(RegNo: Src0->getReg())) {
3920 Src0->ChangeToImmediate(ImmVal: Def->getOperand(i: 1).getImm());
3921 Src0Inlined = true;
3922 } else if (ST.getConstantBusLimit(Opcode: Opc) <= 1 &&
3923 RI.isSGPRReg(MRI: *MRI, Reg: Src0->getReg())) {
3924 return false;
3925 }
3926 // VGPR is okay as Src0 - fallthrough
3927 }
3928
3929 if (Src1->isReg() && !Src0Inlined) {
          // We have one slot for an inlinable constant so far; try to fill it.
3931 MachineInstr *Def = MRI->getUniqueVRegDef(Reg: Src1->getReg());
3932 if (Def && Def->isMoveImmediate() &&
3933 isInlineConstant(MO: Def->getOperand(i: 1)) &&
3934 MRI->hasOneNonDBGUse(RegNo: Src1->getReg()) && commuteInstruction(MI&: UseMI))
3935 Src0->ChangeToImmediate(ImmVal: Def->getOperand(i: 1).getImm());
3936 else if (RI.isSGPRReg(MRI: *MRI, Reg: Src1->getReg()))
3937 return false;
3938 // VGPR is okay as Src1 - fallthrough
3939 }
3940 }
3941
3942 unsigned NewOpc = getNewFMAAKInst(ST, Opc);
3943 if (pseudoToMCOpcode(Opcode: NewOpc) == -1)
3944 return false;
3945
3946 // FIXME: This would be a lot easier if we could return a new instruction
3947 // instead of having to modify in place.
3948
3949 if (Opc == AMDGPU::V_MAC_F32_e64 || Opc == AMDGPU::V_MAC_F16_e64 ||
3950 Opc == AMDGPU::V_FMAC_F32_e64 || Opc == AMDGPU::V_FMAC_F16_t16_e64 ||
3951 Opc == AMDGPU::V_FMAC_F16_fake16_e64 ||
3952 Opc == AMDGPU::V_FMAC_F16_e64 || Opc == AMDGPU::V_FMAC_F64_e64)
3953 UseMI.untieRegOperand(
3954 OpIdx: AMDGPU::getNamedOperandIdx(Opcode: Opc, Name: AMDGPU::OpName::src2));
3955
3956 const std::optional<int64_t> SubRegImm =
3957 extractSubregFromImm(Imm, SubRegIndex: Src2->getSubReg());
3958
      // ChangeToImmediate adds Src2 back to the instruction.
3960 Src2->ChangeToImmediate(ImmVal: *SubRegImm);
3961
3962 // These come before src2.
3963 removeModOperands(MI&: UseMI);
3964 UseMI.setDesc(get(Opcode: NewOpc));
3965
3966 if (NewOpc == AMDGPU::V_FMAAK_F16_t16 ||
3967 NewOpc == AMDGPU::V_FMAAK_F16_fake16) {
3968 const TargetRegisterClass *NewRC = getRegClass(MCID: get(Opcode: NewOpc), OpNum: 0);
3969 Register Tmp = MRI->createVirtualRegister(RegClass: NewRC);
3970 BuildMI(BB&: *UseMI.getParent(), I: std::next(x: UseMI.getIterator()),
3971 MIMD: UseMI.getDebugLoc(), MCID: get(Opcode: AMDGPU::COPY),
3972 DestReg: UseMI.getOperand(i: 0).getReg())
3973 .addReg(RegNo: Tmp, Flags: RegState::Kill);
3974 UseMI.getOperand(i: 0).setReg(Tmp);
3975 CopyRegOperandToNarrowerRC(UseMI, 1, NewRC);
3976 CopyRegOperandToNarrowerRC(UseMI, 2, NewRC);
3977 }
3978
      // UseMI might have been commuted, in which case src1 is now an SGPR.
      // An inline constant together with an SGPR would violate the constant
      // bus restriction, so legalize the operands.
3982 legalizeOperands(MI&: UseMI);
3983
3984 bool DeleteDef = MRI->use_nodbg_empty(RegNo: Reg);
3985 if (DeleteDef)
3986 DefMI.eraseFromParent();
3987
3988 return true;
3989 }
3990 }
3991
3992 return false;
3993}
3994
3995static bool
3996memOpsHaveSameBaseOperands(ArrayRef<const MachineOperand *> BaseOps1,
3997 ArrayRef<const MachineOperand *> BaseOps2) {
3998 if (BaseOps1.size() != BaseOps2.size())
3999 return false;
4000 for (size_t I = 0, E = BaseOps1.size(); I < E; ++I) {
4001 if (!BaseOps1[I]->isIdenticalTo(Other: *BaseOps2[I]))
4002 return false;
4003 }
4004 return true;
4005}
4006
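// Return true if the accesses [OffsetA, OffsetA + WidthA) and
// [OffsetB, OffsetB + WidthB) provably do not overlap; conservatively return
// false if the size of the lower access is unknown.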
4007static bool offsetsDoNotOverlap(LocationSize WidthA, int OffsetA,
4008 LocationSize WidthB, int OffsetB) {
4009 int LowOffset = OffsetA < OffsetB ? OffsetA : OffsetB;
4010 int HighOffset = OffsetA < OffsetB ? OffsetB : OffsetA;
4011 LocationSize LowWidth = (LowOffset == OffsetA) ? WidthA : WidthB;
4012 return LowWidth.hasValue() &&
4013 LowOffset + (int)LowWidth.getValue() <= HighOffset;
4014}
4015
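// Return true if \p MIa and \p MIb access memory through the same base
// operands with offset ranges that provably do not overlap.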
4016bool SIInstrInfo::checkInstOffsetsDoNotOverlap(const MachineInstr &MIa,
4017 const MachineInstr &MIb) const {
4018 SmallVector<const MachineOperand *, 4> BaseOps0, BaseOps1;
4019 int64_t Offset0, Offset1;
4020 LocationSize Dummy0 = LocationSize::precise(Value: 0);
4021 LocationSize Dummy1 = LocationSize::precise(Value: 0);
4022 bool Offset0IsScalable, Offset1IsScalable;
4023 if (!getMemOperandsWithOffsetWidth(LdSt: MIa, BaseOps&: BaseOps0, Offset&: Offset0, OffsetIsScalable&: Offset0IsScalable,
4024 Width&: Dummy0, TRI: &RI) ||
4025 !getMemOperandsWithOffsetWidth(LdSt: MIb, BaseOps&: BaseOps1, Offset&: Offset1, OffsetIsScalable&: Offset1IsScalable,
4026 Width&: Dummy1, TRI: &RI))
4027 return false;
4028
4029 if (!memOpsHaveSameBaseOperands(BaseOps1: BaseOps0, BaseOps2: BaseOps1))
4030 return false;
4031
4032 if (!MIa.hasOneMemOperand() || !MIb.hasOneMemOperand()) {
4033 // FIXME: Handle ds_read2 / ds_write2.
4034 return false;
4035 }
4036 LocationSize Width0 = MIa.memoperands().front()->getSize();
4037 LocationSize Width1 = MIb.memoperands().front()->getSize();
4038 return offsetsDoNotOverlap(WidthA: Width0, OffsetA: Offset0, WidthB: Width1, OffsetB: Offset1);
4039}
4040
4041bool SIInstrInfo::areMemAccessesTriviallyDisjoint(const MachineInstr &MIa,
4042 const MachineInstr &MIb) const {
4043 assert(MIa.mayLoadOrStore() &&
4044 "MIa must load from or modify a memory location");
4045 assert(MIb.mayLoadOrStore() &&
4046 "MIb must load from or modify a memory location");
4047
4048 if (MIa.hasUnmodeledSideEffects() || MIb.hasUnmodeledSideEffects())
4049 return false;
4050
4051 // XXX - Can we relax this between address spaces?
4052 if (MIa.hasOrderedMemoryRef() || MIb.hasOrderedMemoryRef())
4053 return false;
4054
4055 if (isLDSDMA(MI: MIa) || isLDSDMA(MI: MIb))
4056 return false;
4057
4058 if (MIa.isBundle() || MIb.isBundle())
4059 return false;
4060
4061 // TODO: Should we check the address space from the MachineMemOperand? That
4062 // would allow us to distinguish objects we know don't alias based on the
4063 // underlying address space, even if it was lowered to a different one,
4064 // e.g. private accesses lowered to use MUBUF instructions on a scratch
4065 // buffer.
4066 if (isDS(MI: MIa)) {
4067 if (isDS(MI: MIb))
4068 return checkInstOffsetsDoNotOverlap(MIa, MIb);
4069
4070 return !isFLAT(MI: MIb) || isSegmentSpecificFLAT(MI: MIb);
4071 }
4072
4073 if (isMUBUF(MI: MIa) || isMTBUF(MI: MIa)) {
4074 if (isMUBUF(MI: MIb) || isMTBUF(MI: MIb))
4075 return checkInstOffsetsDoNotOverlap(MIa, MIb);
4076
4077 if (isFLAT(MI: MIb))
4078 return isFLATScratch(MI: MIb);
4079
4080 return !isSMRD(MI: MIb);
4081 }
4082
4083 if (isSMRD(MI: MIa)) {
4084 if (isSMRD(MI: MIb))
4085 return checkInstOffsetsDoNotOverlap(MIa, MIb);
4086
4087 if (isFLAT(MI: MIb))
4088 return isFLATScratch(MI: MIb);
4089
4090 return !isMUBUF(MI: MIb) && !isMTBUF(MI: MIb);
4091 }
4092
4093 if (isFLAT(MI: MIa)) {
4094 if (isFLAT(MI: MIb)) {
4095 if ((isFLATScratch(MI: MIa) && isFLATGlobal(MI: MIb)) ||
4096 (isFLATGlobal(MI: MIa) && isFLATScratch(MI: MIb)))
4097 return true;
4098
4099 return checkInstOffsetsDoNotOverlap(MIa, MIb);
4100 }
4101
4102 return false;
4103 }
4104
4105 return false;
4106}
4107
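// If \p Reg is a virtual register whose unique definition is a foldable copy
// of an immediate, return true, set \p Imm to that value, and optionally
// return the defining instruction through \p DefMI.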
4108static bool getFoldableImm(Register Reg, const MachineRegisterInfo &MRI,
4109 int64_t &Imm, MachineInstr **DefMI = nullptr) {
4110 if (Reg.isPhysical())
4111 return false;
4112 auto *Def = MRI.getUniqueVRegDef(Reg);
4113 if (Def && SIInstrInfo::isFoldableCopy(MI: *Def) && Def->getOperand(i: 1).isImm()) {
4114 Imm = Def->getOperand(i: 1).getImm();
4115 if (DefMI)
4116 *DefMI = Def;
4117 return true;
4118 }
4119 return false;
4120}
4121
4122static bool getFoldableImm(const MachineOperand *MO, int64_t &Imm,
4123 MachineInstr **DefMI = nullptr) {
4124 if (!MO->isReg())
4125 return false;
4126 const MachineFunction *MF = MO->getParent()->getMF();
4127 const MachineRegisterInfo &MRI = MF->getRegInfo();
4128 return getFoldableImm(Reg: MO->getReg(), MRI, Imm, DefMI);
4129}
4130
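// When replacing \p MI with \p NewMI, transfer the kill flags tracked by
// LiveVariables from the old instruction to the new one.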
4131static void updateLiveVariables(LiveVariables *LV, MachineInstr &MI,
4132 MachineInstr &NewMI) {
4133 if (LV) {
4134 unsigned NumOps = MI.getNumOperands();
4135 for (unsigned I = 1; I < NumOps; ++I) {
4136 MachineOperand &Op = MI.getOperand(i: I);
4137 if (Op.isReg() && Op.isKill())
4138 LV->replaceKillInstruction(Reg: Op.getReg(), OldMI&: MI, NewMI);
4139 }
4140 }
4141}
4142
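// Map a two-address MAC/FMAC opcode to its three-address MAD/FMA equivalent,
// which does not tie the accumulator operand to the destination.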
4143static unsigned getNewFMAInst(const GCNSubtarget &ST, unsigned Opc) {
4144 switch (Opc) {
4145 case AMDGPU::V_MAC_F16_e32:
4146 case AMDGPU::V_MAC_F16_e64:
4147 return AMDGPU::V_MAD_F16_e64;
4148 case AMDGPU::V_MAC_F32_e32:
4149 case AMDGPU::V_MAC_F32_e64:
4150 return AMDGPU::V_MAD_F32_e64;
4151 case AMDGPU::V_MAC_LEGACY_F32_e32:
4152 case AMDGPU::V_MAC_LEGACY_F32_e64:
4153 return AMDGPU::V_MAD_LEGACY_F32_e64;
4154 case AMDGPU::V_FMAC_LEGACY_F32_e32:
4155 case AMDGPU::V_FMAC_LEGACY_F32_e64:
4156 return AMDGPU::V_FMA_LEGACY_F32_e64;
4157 case AMDGPU::V_FMAC_F16_e32:
4158 case AMDGPU::V_FMAC_F16_e64:
4159 case AMDGPU::V_FMAC_F16_t16_e64:
4160 case AMDGPU::V_FMAC_F16_fake16_e64:
4161 return ST.hasTrue16BitInsts() ? ST.useRealTrue16Insts()
4162 ? AMDGPU::V_FMA_F16_gfx9_t16_e64
4163 : AMDGPU::V_FMA_F16_gfx9_fake16_e64
4164 : AMDGPU::V_FMA_F16_gfx9_e64;
4165 case AMDGPU::V_FMAC_F32_e32:
4166 case AMDGPU::V_FMAC_F32_e64:
4167 return AMDGPU::V_FMA_F32_e64;
4168 case AMDGPU::V_FMAC_F64_e32:
4169 case AMDGPU::V_FMAC_F64_e64:
4170 return AMDGPU::V_FMA_F64_e64;
4171 default:
4172 llvm_unreachable("invalid instruction");
4173 }
4174}
4175
4176/// Helper struct for the implementation of 3-address conversion to communicate
4177/// updates made to instruction operands.
4178struct SIInstrInfo::ThreeAddressUpdates {
4179 /// Other instruction whose def is no longer used by the converted
4180 /// instruction.
4181 MachineInstr *RemoveMIUse = nullptr;
4182};
4183
4184MachineInstr *SIInstrInfo::convertToThreeAddress(MachineInstr &MI,
4185 LiveVariables *LV,
4186 LiveIntervals *LIS) const {
4187 MachineBasicBlock &MBB = *MI.getParent();
4188 MachineInstr *CandidateMI = &MI;
4189
4190 if (MI.isBundle()) {
4191 // This is a temporary placeholder for bundle handling that enables us to
4192 // exercise the relevant code paths in the two-address instruction pass.
4193 if (MI.getBundleSize() != 1)
4194 return nullptr;
4195 CandidateMI = MI.getNextNode();
4196 }
4197
4198 ThreeAddressUpdates U;
4199 MachineInstr *NewMI = convertToThreeAddressImpl(MI&: *CandidateMI, Updates&: U);
4200 if (!NewMI)
4201 return nullptr;
4202
4203 if (MI.isBundle()) {
4204 CandidateMI->eraseFromBundle();
4205
4206 for (MachineOperand &MO : MI.all_defs()) {
4207 if (MO.isTied())
4208 MI.untieRegOperand(OpIdx: MO.getOperandNo());
4209 }
4210 } else {
4211 updateLiveVariables(LV, MI, NewMI&: *NewMI);
4212 if (LIS) {
4213 LIS->ReplaceMachineInstrInMaps(MI, NewMI&: *NewMI);
4214 // SlotIndex of defs needs to be updated when converting to early-clobber
4215 MachineOperand &Def = NewMI->getOperand(i: 0);
4216 if (Def.isEarlyClobber() && Def.isReg() &&
4217 LIS->hasInterval(Reg: Def.getReg())) {
4218 SlotIndex OldIndex = LIS->getInstructionIndex(Instr: *NewMI).getRegSlot(EC: false);
4219 SlotIndex NewIndex = LIS->getInstructionIndex(Instr: *NewMI).getRegSlot(EC: true);
4220 auto &LI = LIS->getInterval(Reg: Def.getReg());
4221 auto UpdateDefIndex = [&](LiveRange &LR) {
4222 auto *S = LR.find(Pos: OldIndex);
4223 if (S != LR.end() && S->start == OldIndex) {
4224 assert(S->valno && S->valno->def == OldIndex);
4225 S->start = NewIndex;
4226 S->valno->def = NewIndex;
4227 }
4228 };
4229 UpdateDefIndex(LI);
4230 for (auto &SR : LI.subranges())
4231 UpdateDefIndex(SR);
4232 }
4233 }
4234 }
4235
4236 if (U.RemoveMIUse) {
4237 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
4238 // The only user is the instruction which will be killed.
4239 Register DefReg = U.RemoveMIUse->getOperand(i: 0).getReg();
4240
4241 if (MRI.hasOneNonDBGUse(RegNo: DefReg)) {
      // We cannot just remove the DefMI here; the calling pass will crash.
4243 U.RemoveMIUse->setDesc(get(Opcode: AMDGPU::IMPLICIT_DEF));
4244 U.RemoveMIUse->getOperand(i: 0).setIsDead(true);
4245 for (unsigned I = U.RemoveMIUse->getNumOperands() - 1; I != 0; --I)
4246 U.RemoveMIUse->removeOperand(OpNo: I);
4247 if (LV)
4248 LV->getVarInfo(Reg: DefReg).AliveBlocks.clear();
4249 }
4250
4251 if (MI.isBundle()) {
4252 VirtRegInfo VRI = AnalyzeVirtRegInBundle(MI, Reg: DefReg);
4253 if (!VRI.Reads && !VRI.Writes) {
4254 for (MachineOperand &MO : MI.all_uses()) {
4255 if (MO.isReg() && MO.getReg() == DefReg) {
4256 assert(MO.getSubReg() == 0 &&
4257 "tied sub-registers in bundles currently not supported");
4258 MI.removeOperand(OpNo: MO.getOperandNo());
4259 break;
4260 }
4261 }
4262
4263 if (LIS)
4264 LIS->shrinkToUses(li: &LIS->getInterval(Reg: DefReg));
4265 }
4266 } else if (LIS) {
4267 LiveInterval &DefLI = LIS->getInterval(Reg: DefReg);
4268
4269 // We cannot delete the original instruction here, so hack out the use
4270 // in the original instruction with a dummy register so we can use
4271 // shrinkToUses to deal with any multi-use edge cases. Other targets do
4272 // not have the complexity of deleting a use to consider here.
4273 Register DummyReg = MRI.cloneVirtualRegister(VReg: DefReg);
4274 for (MachineOperand &MIOp : MI.uses()) {
4275 if (MIOp.isReg() && MIOp.getReg() == DefReg) {
4276 MIOp.setIsUndef(true);
4277 MIOp.setReg(DummyReg);
4278 }
4279 }
4280
4296 LIS->shrinkToUses(li: &DefLI);
4297 }
4298 }
4299
4300 return MI.isBundle() ? &MI : NewMI;
4301}
4302
4303MachineInstr *
4304SIInstrInfo::convertToThreeAddressImpl(MachineInstr &MI,
4305 ThreeAddressUpdates &U) const {
4306 MachineBasicBlock &MBB = *MI.getParent();
4307 unsigned Opc = MI.getOpcode();
4308
4309 // Handle MFMA.
4310 int NewMFMAOpc = AMDGPU::getMFMAEarlyClobberOp(Opcode: Opc);
4311 if (NewMFMAOpc != -1) {
4312 MachineInstrBuilder MIB =
4313 BuildMI(BB&: MBB, I&: MI, MIMD: MI.getDebugLoc(), MCID: get(Opcode: NewMFMAOpc));
4314 for (unsigned I = 0, E = MI.getNumExplicitOperands(); I != E; ++I)
4315 MIB.add(MO: MI.getOperand(i: I));
4316 return MIB;
4317 }
4318
4319 if (SIInstrInfo::isWMMA(MI)) {
4320 unsigned NewOpc = AMDGPU::mapWMMA2AddrTo3AddrOpcode(Opc: MI.getOpcode());
4321 MachineInstrBuilder MIB = BuildMI(BB&: MBB, I&: MI, MIMD: MI.getDebugLoc(), MCID: get(Opcode: NewOpc))
4322 .setMIFlags(MI.getFlags());
4323 for (unsigned I = 0, E = MI.getNumExplicitOperands(); I != E; ++I)
4324 MIB->addOperand(Op: MI.getOperand(i: I));
4325 return MIB;
4326 }
4327
4328 assert(Opc != AMDGPU::V_FMAC_F16_t16_e32 &&
4329 Opc != AMDGPU::V_FMAC_F16_fake16_e32 &&
4330 "V_FMAC_F16_t16/fake16_e32 is not supported and not expected to be "
4331 "present pre-RA");
4332
4333 // Handle MAC/FMAC.
4334 bool IsF64 = Opc == AMDGPU::V_FMAC_F64_e32 || Opc == AMDGPU::V_FMAC_F64_e64;
4335 bool IsLegacy = Opc == AMDGPU::V_MAC_LEGACY_F32_e32 ||
4336 Opc == AMDGPU::V_MAC_LEGACY_F32_e64 ||
4337 Opc == AMDGPU::V_FMAC_LEGACY_F32_e32 ||
4338 Opc == AMDGPU::V_FMAC_LEGACY_F32_e64;
4339 bool Src0Literal = false;
4340
4341 switch (Opc) {
4342 default:
4343 return nullptr;
4344 case AMDGPU::V_MAC_F16_e64:
4345 case AMDGPU::V_FMAC_F16_e64:
4346 case AMDGPU::V_FMAC_F16_t16_e64:
4347 case AMDGPU::V_FMAC_F16_fake16_e64:
4348 case AMDGPU::V_MAC_F32_e64:
4349 case AMDGPU::V_MAC_LEGACY_F32_e64:
4350 case AMDGPU::V_FMAC_F32_e64:
4351 case AMDGPU::V_FMAC_LEGACY_F32_e64:
4352 case AMDGPU::V_FMAC_F64_e64:
4353 break;
4354 case AMDGPU::V_MAC_F16_e32:
4355 case AMDGPU::V_FMAC_F16_e32:
4356 case AMDGPU::V_MAC_F32_e32:
4357 case AMDGPU::V_MAC_LEGACY_F32_e32:
4358 case AMDGPU::V_FMAC_F32_e32:
4359 case AMDGPU::V_FMAC_LEGACY_F32_e32:
4360 case AMDGPU::V_FMAC_F64_e32: {
4361 int Src0Idx = AMDGPU::getNamedOperandIdx(Opcode: MI.getOpcode(),
4362 Name: AMDGPU::OpName::src0);
4363 const MachineOperand *Src0 = &MI.getOperand(i: Src0Idx);
4364 if (!Src0->isReg() && !Src0->isImm())
4365 return nullptr;
4366
4367 if (Src0->isImm() && !isInlineConstant(MI, OpIdx: Src0Idx, MO: *Src0))
4368 Src0Literal = true;
4369
4370 break;
4371 }
4372 }
4373
4374 MachineInstrBuilder MIB;
4375 const MachineOperand *Dst = getNamedOperand(MI, OperandName: AMDGPU::OpName::vdst);
4376 const MachineOperand *Src0 = getNamedOperand(MI, OperandName: AMDGPU::OpName::src0);
4377 const MachineOperand *Src0Mods =
4378 getNamedOperand(MI, OperandName: AMDGPU::OpName::src0_modifiers);
4379 const MachineOperand *Src1 = getNamedOperand(MI, OperandName: AMDGPU::OpName::src1);
4380 const MachineOperand *Src1Mods =
4381 getNamedOperand(MI, OperandName: AMDGPU::OpName::src1_modifiers);
4382 const MachineOperand *Src2 = getNamedOperand(MI, OperandName: AMDGPU::OpName::src2);
4383 const MachineOperand *Src2Mods =
4384 getNamedOperand(MI, OperandName: AMDGPU::OpName::src2_modifiers);
4385 const MachineOperand *Clamp = getNamedOperand(MI, OperandName: AMDGPU::OpName::clamp);
4386 const MachineOperand *Omod = getNamedOperand(MI, OperandName: AMDGPU::OpName::omod);
4387 const MachineOperand *OpSel = getNamedOperand(MI, OperandName: AMDGPU::OpName::op_sel);
4388
4389 if (!Src0Mods && !Src1Mods && !Src2Mods && !Clamp && !Omod && !IsLegacy &&
4390 (!IsF64 || ST.hasFmaakFmamkF64Insts()) &&
4391 // If we have an SGPR input, we will violate the constant bus restriction.
4392 (ST.getConstantBusLimit(Opcode: Opc) > 1 || !Src0->isReg() ||
4393 !RI.isSGPRReg(MRI: MBB.getParent()->getRegInfo(), Reg: Src0->getReg()))) {
4394 MachineInstr *DefMI;
4395
4396 int64_t Imm;
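    // If the addend (src2) is a foldable immediate, prefer the fmaak/madak
    // form, which takes the addend as a trailing literal.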
4397 if (!Src0Literal && getFoldableImm(MO: Src2, Imm, DefMI: &DefMI)) {
4398 unsigned NewOpc = getNewFMAAKInst(ST, Opc);
4399 if (pseudoToMCOpcode(Opcode: NewOpc) != -1) {
4400 MIB = BuildMI(BB&: MBB, I&: MI, MIMD: MI.getDebugLoc(), MCID: get(Opcode: NewOpc))
4401 .add(MO: *Dst)
4402 .add(MO: *Src0)
4403 .add(MO: *Src1)
4404 .addImm(Val: Imm)
4405 .setMIFlags(MI.getFlags());
4406 U.RemoveMIUse = DefMI;
4407 return MIB;
4408 }
4409 }
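    // Otherwise try the fmamk/madmk form, which takes one multiplicand as a
    // literal: first with a foldable immediate in src1, then in src0.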
4410 unsigned NewOpc = getNewFMAMKInst(ST, Opc);
4411 if (!Src0Literal && getFoldableImm(MO: Src1, Imm, DefMI: &DefMI)) {
4412 if (pseudoToMCOpcode(Opcode: NewOpc) != -1) {
4413 MIB = BuildMI(BB&: MBB, I&: MI, MIMD: MI.getDebugLoc(), MCID: get(Opcode: NewOpc))
4414 .add(MO: *Dst)
4415 .add(MO: *Src0)
4416 .addImm(Val: Imm)
4417 .add(MO: *Src2)
4418 .setMIFlags(MI.getFlags());
4419 U.RemoveMIUse = DefMI;
4420 return MIB;
4421 }
4422 }
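    // A literal in src0 can also use the fmamk/madmk form: the remaining
    // register multiplicand (src1) moves into the src0 slot and the literal
    // becomes the K operand.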
4423 if (Src0Literal || getFoldableImm(MO: Src0, Imm, DefMI: &DefMI)) {
4424 if (Src0Literal) {
4425 Imm = Src0->getImm();
4426 DefMI = nullptr;
4427 }
4428 if (pseudoToMCOpcode(Opcode: NewOpc) != -1 &&
4429 isOperandLegal(
4430 MI, OpIdx: AMDGPU::getNamedOperandIdx(Opcode: NewOpc, Name: AMDGPU::OpName::src0),
4431 MO: Src1)) {
4432 MIB = BuildMI(BB&: MBB, I&: MI, MIMD: MI.getDebugLoc(), MCID: get(Opcode: NewOpc))
4433 .add(MO: *Dst)
4434 .add(MO: *Src1)
4435 .addImm(Val: Imm)
4436 .add(MO: *Src2)
4437 .setMIFlags(MI.getFlags());
4438 U.RemoveMIUse = DefMI;
4439 return MIB;
4440 }
4441 }
4442 }
4443
4444 // VOP2 mac/fmac with a literal operand cannot be converted to VOP3 mad/fma
4445 // if VOP3 does not allow a literal operand.
4446 if (Src0Literal && !ST.hasVOP3Literal())
4447 return nullptr;
4448
4449 unsigned NewOpc = getNewFMAInst(ST, Opc);
4450
4451 if (pseudoToMCOpcode(Opcode: NewOpc) == -1)
4452 return nullptr;
4453
4454 MIB = BuildMI(BB&: MBB, I&: MI, MIMD: MI.getDebugLoc(), MCID: get(Opcode: NewOpc))
4455 .add(MO: *Dst)
4456 .addImm(Val: Src0Mods ? Src0Mods->getImm() : 0)
4457 .add(MO: *Src0)
4458 .addImm(Val: Src1Mods ? Src1Mods->getImm() : 0)
4459 .add(MO: *Src1)
4460 .addImm(Val: Src2Mods ? Src2Mods->getImm() : 0)
4461 .add(MO: *Src2)
4462 .addImm(Val: Clamp ? Clamp->getImm() : 0)
4463 .addImm(Val: Omod ? Omod->getImm() : 0)
4464 .setMIFlags(MI.getFlags());
4465 if (AMDGPU::hasNamedOperand(Opcode: NewOpc, NamedIdx: AMDGPU::OpName::op_sel))
4466 MIB.addImm(Val: OpSel ? OpSel->getImm() : 0);
4467 return MIB;
4468}
4469
// It's not generally safe to move VALU instructions across these, since doing
// so changes whether a VGPR operand is accessed directly or through the index
// register.
// XXX - Why isn't hasSideEffects sufficient for these?
4473static bool changesVGPRIndexingMode(const MachineInstr &MI) {
4474 switch (MI.getOpcode()) {
4475 case AMDGPU::S_SET_GPR_IDX_ON:
4476 case AMDGPU::S_SET_GPR_IDX_MODE:
4477 case AMDGPU::S_SET_GPR_IDX_OFF:
4478 return true;
4479 default:
4480 return false;
4481 }
4482}
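
// As a sketch of the hazard: while GPR indexing is enabled, VALU operands are
// read relative to the index in M0, so moving a VALU instruction across the
// mode switch changes which register it actually accesses:
//   s_set_gpr_idx_on s0, gpr_idx(SRC0)
//   v_mov_b32 v1, v5    ; reads v[5 + M0] while SRC0 indexing is active
//   s_set_gpr_idx_off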
4483
4484bool SIInstrInfo::isSchedulingBoundary(const MachineInstr &MI,
4485 const MachineBasicBlock *MBB,
4486 const MachineFunction &MF) const {
  // Unlike the base implementation, we skip the check for SP writes; that
  // check was apparently added only due to compile-time concerns.
  //
  // TODO: Do we really want this barrier? It triggers unnecessary hazard nops
  // but is probably avoidable.
4492
4493 // Copied from base implementation.
4494 // Terminators and labels can't be scheduled around.
4495 if (MI.isTerminator() || MI.isPosition())
4496 return true;
4497
4498 // INLINEASM_BR can jump to another block
4499 if (MI.getOpcode() == TargetOpcode::INLINEASM_BR)
4500 return true;
4501
4502 if (MI.getOpcode() == AMDGPU::SCHED_BARRIER && MI.getOperand(i: 0).getImm() == 0)
4503 return true;
4504
4505 // Target-independent instructions do not have an implicit-use of EXEC, even
4506 // when they operate on VGPRs. Treating EXEC modifications as scheduling
4507 // boundaries prevents incorrect movements of such instructions.
4508 return MI.modifiesRegister(Reg: AMDGPU::EXEC, TRI: &RI) ||
4509 MI.getOpcode() == AMDGPU::S_SETREG_IMM32_B32 ||
4510 MI.getOpcode() == AMDGPU::S_SETREG_B32 ||
4511 MI.getOpcode() == AMDGPU::S_SETPRIO ||
4512 MI.getOpcode() == AMDGPU::S_SETPRIO_INC_WG ||
4513 changesVGPRIndexingMode(MI);
4514}
4515
4516bool SIInstrInfo::isAlwaysGDS(uint32_t Opcode) const {
4517 return Opcode == AMDGPU::DS_ORDERED_COUNT ||
4518 Opcode == AMDGPU::DS_ADD_GS_REG_RTN ||
4519 Opcode == AMDGPU::DS_SUB_GS_REG_RTN || isGWS(Opcode);
4520}
4521
4522bool SIInstrInfo::mayAccessScratch(const MachineInstr &MI) const {
  // Instructions that access scratch use the FLAT or BUF encodings; FLAT
  // global instructions never access scratch.
  if ((!isFLAT(MI) || isFLATGlobal(MI)) && !isBUF(MI))
4525 return false;
4526
4527 // SCRATCH instructions always access scratch.
4528 if (isFLATScratch(MI))
4529 return true;
4530
4531 // If FLAT_SCRATCH registers are not initialized, we can never access scratch
4532 // via the aperture.
4533 if (MI.getMF()->getFunction().hasFnAttribute(Kind: "amdgpu-no-flat-scratch-init"))
4534 return false;
4535
  // If there are no memory operands then conservatively assume the
  // instruction may access scratch.
4538 if (MI.memoperands_empty())
4539 return true;
4540
4541 // See if any memory operand specifies an address space that involves scratch.
4542 return any_of(Range: MI.memoperands(), P: [](const MachineMemOperand *Memop) {
4543 unsigned AS = Memop->getAddrSpace();
4544 if (AS == AMDGPUAS::FLAT_ADDRESS) {
4545 const MDNode *MD = Memop->getAAInfo().NoAliasAddrSpace;
4546 return !MD || !AMDGPU::hasValueInRangeLikeMetadata(
4547 MD: *MD, Val: AMDGPUAS::PRIVATE_ADDRESS);
4548 }
4549 return AS == AMDGPUAS::PRIVATE_ADDRESS;
4550 });
4551}
4552
4553bool SIInstrInfo::mayAccessVMEMThroughFlat(const MachineInstr &MI) const {
4554 assert(isFLAT(MI));
4555
4556 // All flat instructions use the VMEM counter except prefetch.
4557 if (!usesVM_CNT(MI))
4558 return false;
4559
4560 // If there are no memory operands then conservatively assume the flat
4561 // operation may access VMEM.
4562 if (MI.memoperands_empty())
4563 return true;
4564
4565 // See if any memory operand specifies an address space that involves VMEM.
  // See if any memory operand specifies an address space that involves VMEM.
  // Flat operations only support FLAT, LOCAL (LDS), or address spaces
  // involving VMEM such as GLOBAL, CONSTANT, PRIVATE (SCRATCH), etc. The
  // REGION (GDS) address space is not supported by flat operations, so simply
  // return true unless only the LDS address space is found.
4570 for (const MachineMemOperand *Memop : MI.memoperands()) {
4571 unsigned AS = Memop->getAddrSpace();
4572 assert(AS != AMDGPUAS::REGION_ADDRESS);
4573 if (AS != AMDGPUAS::LOCAL_ADDRESS)
4574 return true;
4575 }
4576
4577 return false;
4578}
4579
4580bool SIInstrInfo::mayAccessLDSThroughFlat(const MachineInstr &MI) const {
4581 assert(isFLAT(MI));
4582
  // Flat instructions such as SCRATCH and GLOBAL do not use the LGKM counter.
4584 if (!usesLGKM_CNT(MI))
4585 return false;
4586
4587 // If in tgsplit mode then there can be no use of LDS.
4588 if (ST.isTgSplitEnabled())
4589 return false;
4590
4591 // If there are no memory operands then conservatively assume the flat
4592 // operation may access LDS.
4593 if (MI.memoperands_empty())
4594 return true;
4595
4596 // See if any memory operand specifies an address space that involves LDS.
4597 for (const MachineMemOperand *Memop : MI.memoperands()) {
4598 unsigned AS = Memop->getAddrSpace();
4599 if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::FLAT_ADDRESS)
4600 return true;
4601 }
4602
4603 return false;
4604}
4605
4606bool SIInstrInfo::modifiesModeRegister(const MachineInstr &MI) {
  // Skip the full operand and register-alias search that modifiesRegister
  // does. Only a handful of instructions touch MODE, it is only ever an
  // implicit def, and it does not alias any other registers.
4610 return is_contained(Range: MI.getDesc().implicit_defs(), Element: AMDGPU::MODE);
4611}
4612
4613bool SIInstrInfo::hasUnwantedEffectsWhenEXECEmpty(const MachineInstr &MI) const {
4614 unsigned Opcode = MI.getOpcode();
4615
4616 if (MI.mayStore() && isSMRD(MI))
4617 return true; // scalar store or atomic
4618
4619 // This will terminate the function when other lanes may need to continue.
4620 if (MI.isReturn())
4621 return true;
4622
4623 // These instructions cause shader I/O that may cause hardware lockups
4624 // when executed with an empty EXEC mask.
4625 //
4626 // Note: exp with VM = DONE = 0 is automatically skipped by hardware when
4627 // EXEC = 0, but checking for that case here seems not worth it
4628 // given the typical code patterns.
4629 if (Opcode == AMDGPU::S_SENDMSG || Opcode == AMDGPU::S_SENDMSGHALT ||
4630 isEXP(Opcode) || Opcode == AMDGPU::DS_ORDERED_COUNT ||
4631 Opcode == AMDGPU::S_TRAP || Opcode == AMDGPU::S_WAIT_EVENT)
4632 return true;
4633
4634 if (MI.isCall() || MI.isInlineAsm())
4635 return true; // conservative assumption
4636
4637 // Assume that barrier interactions are only intended with active lanes.
4638 if (isBarrier(Opcode))
4639 return true;
4640
4641 // A mode change is a scalar operation that influences vector instructions.
4642 if (modifiesModeRegister(MI))
4643 return true;
4644
4645 // These are like SALU instructions in terms of effects, so it's questionable
4646 // whether we should return true for those.
4647 //
4648 // However, executing them with EXEC = 0 causes them to operate on undefined
4649 // data, which we avoid by returning true here.
4650 if (Opcode == AMDGPU::V_READFIRSTLANE_B32 ||
4651 Opcode == AMDGPU::V_READLANE_B32 || Opcode == AMDGPU::V_WRITELANE_B32 ||
4652 Opcode == AMDGPU::SI_RESTORE_S32_FROM_VGPR ||
4653 Opcode == AMDGPU::SI_SPILL_S32_TO_VGPR)
4654 return true;
4655
4656 return false;
4657}
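
// For example, hoisting an export or s_sendmsg above the divergent branch
// that guards it could let it execute in a block reached with EXEC = 0;
// callers use this hook to avoid such movement.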
4658
4659bool SIInstrInfo::mayReadEXEC(const MachineRegisterInfo &MRI,
4660 const MachineInstr &MI) const {
4661 if (MI.isMetaInstruction())
4662 return false;
4663
4664 // This won't read exec if this is an SGPR->SGPR copy.
4665 if (MI.isCopyLike()) {
4666 if (!RI.isSGPRReg(MRI, Reg: MI.getOperand(i: 0).getReg()))
4667 return true;
4668
4669 // Make sure this isn't copying exec as a normal operand
4670 return MI.readsRegister(Reg: AMDGPU::EXEC, TRI: &RI);
4671 }
4672
4673 // Make a conservative assumption about the callee.
4674 if (MI.isCall())
4675 return true;
4676
4677 // Be conservative with any unhandled generic opcodes.
4678 if (!isTargetSpecificOpcode(Opcode: MI.getOpcode()))
4679 return true;
4680
4681 return !isSALU(MI) || MI.readsRegister(Reg: AMDGPU::EXEC, TRI: &RI);
4682}
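
// For example, "$vgpr0 = COPY $vgpr1" is lane-masked and therefore implicitly
// reads EXEC, while "$sgpr0 = COPY $sgpr1" executes independently of the lane
// mask.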
4683
4684bool SIInstrInfo::isInlineConstant(const APInt &Imm) const {
4685 switch (Imm.getBitWidth()) {
4686 case 1: // This likely will be a condition code mask.
4687 return true;
4688
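  // For reference, the 32-bit inline constants are the integers -16..64 and
  // the floats +/-0.5, +/-1.0, +/-2.0, +/-4.0, plus 1/(2*pi) when the
  // subtarget has the inv2pi inline immediate.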
4689 case 32:
4690 return AMDGPU::isInlinableLiteral32(Literal: Imm.getSExtValue(),
4691 HasInv2Pi: ST.hasInv2PiInlineImm());
4692 case 64:
4693 return AMDGPU::isInlinableLiteral64(Literal: Imm.getSExtValue(),
4694 HasInv2Pi: ST.hasInv2PiInlineImm());
4695 case 16:
4696 return ST.has16BitInsts() &&
4697 AMDGPU::isInlinableLiteralI16(Literal: Imm.getSExtValue(),
4698 HasInv2Pi: ST.hasInv2PiInlineImm());
4699 default:
4700 llvm_unreachable("invalid bitwidth");
4701 }
4702}
4703
4704bool SIInstrInfo::isInlineConstant(const APFloat &Imm) const {
4705 APInt IntImm = Imm.bitcastToAPInt();
4706 int64_t IntImmVal = IntImm.getSExtValue();
4707 bool HasInv2Pi = ST.hasInv2PiInlineImm();
4708 switch (APFloat::SemanticsToEnum(Sem: Imm.getSemantics())) {
4709 default:
4710 llvm_unreachable("invalid fltSemantics");
4711 case APFloatBase::S_IEEEsingle:
4712 case APFloatBase::S_IEEEdouble:
4713 return isInlineConstant(Imm: IntImm);
4714 case APFloatBase::S_BFloat:
4715 return ST.has16BitInsts() &&
4716 AMDGPU::isInlinableLiteralBF16(Literal: IntImmVal, HasInv2Pi);
4717 case APFloatBase::S_IEEEhalf:
4718 return ST.has16BitInsts() &&
4719 AMDGPU::isInlinableLiteralFP16(Literal: IntImmVal, HasInv2Pi);
4720 }
4721}
4722
4723bool SIInstrInfo::isInlineConstant(int64_t Imm, uint8_t OperandType) const {
4724 // MachineOperand provides no way to tell the true operand size, since it only
4725 // records a 64-bit value. We need to know the size to determine if a 32-bit
4726 // floating point immediate bit pattern is legal for an integer immediate. It
4727 // would be for any 32-bit integer operand, but would not be for a 64-bit one.
4728 switch (OperandType) {
4729 case AMDGPU::OPERAND_REG_IMM_INT32:
4730 case AMDGPU::OPERAND_REG_IMM_FP32:
4731 case AMDGPU::OPERAND_REG_INLINE_C_INT32:
4732 case AMDGPU::OPERAND_REG_INLINE_C_FP32:
4733 case AMDGPU::OPERAND_REG_IMM_V2FP32:
4734 case AMDGPU::OPERAND_REG_IMM_V2INT32:
4735 case AMDGPU::OPERAND_REG_INLINE_AC_INT32:
4736 case AMDGPU::OPERAND_REG_INLINE_AC_FP32:
4737 case AMDGPU::OPERAND_INLINE_SPLIT_BARRIER_INT32: {
4738 int32_t Trunc = static_cast<int32_t>(Imm);
4739 return AMDGPU::isInlinableLiteral32(Literal: Trunc, HasInv2Pi: ST.hasInv2PiInlineImm());
4740 }
4741 case AMDGPU::OPERAND_REG_IMM_INT64:
4742 case AMDGPU::OPERAND_REG_IMM_FP64:
4743 case AMDGPU::OPERAND_REG_INLINE_C_INT64:
4744 case AMDGPU::OPERAND_REG_INLINE_C_FP64:
4745 case AMDGPU::OPERAND_REG_INLINE_AC_FP64:
4746 return AMDGPU::isInlinableLiteral64(Literal: Imm, HasInv2Pi: ST.hasInv2PiInlineImm());
4747 case AMDGPU::OPERAND_REG_IMM_INT16:
4748 case AMDGPU::OPERAND_REG_INLINE_C_INT16:
    // We would expect inline immediates to not be concerned with an
    // integer/fp distinction. However, in the case of 16-bit integer
    // operations, the "floating point" values appear to not work. The
    // hardware seems to read only the low 16 bits of the 32-bit immediates,
    // which happens to always work for the integer values.
    //
    // See llvm bugzilla 46302.
4756 //
4757 // TODO: Theoretically we could use op-sel to use the high bits of the
4758 // 32-bit FP values.
4759 return AMDGPU::isInlinableIntLiteral(Literal: Imm);
4760 case AMDGPU::OPERAND_REG_IMM_V2INT16:
4761 case AMDGPU::OPERAND_REG_INLINE_C_V2INT16:
4762 return AMDGPU::isInlinableLiteralV2I16(Literal: Imm);
4763 case AMDGPU::OPERAND_REG_IMM_V2FP16:
4764 case AMDGPU::OPERAND_REG_INLINE_C_V2FP16:
4765 return AMDGPU::isInlinableLiteralV2F16(Literal: Imm);
4766 case AMDGPU::OPERAND_REG_IMM_V2FP16_SPLAT:
4767 return AMDGPU::isPKFMACF16InlineConstant(Literal: Imm, IsGFX11Plus: ST.isGFX11Plus());
4768 case AMDGPU::OPERAND_REG_IMM_V2BF16:
4769 case AMDGPU::OPERAND_REG_INLINE_C_V2BF16:
4770 return AMDGPU::isInlinableLiteralV2BF16(Literal: Imm);
4771 case AMDGPU::OPERAND_REG_IMM_NOINLINE_V2FP16:
4772 return false;
4773 case AMDGPU::OPERAND_REG_IMM_FP16:
4774 case AMDGPU::OPERAND_REG_INLINE_C_FP16: {
4775 if (isInt<16>(x: Imm) || isUInt<16>(x: Imm)) {
4776 // A few special case instructions have 16-bit operands on subtargets
4777 // where 16-bit instructions are not legal.
      // TODO: Do the 32-bit immediates work? We shouldn't really need to
      // handle constants in these cases.
4780 int16_t Trunc = static_cast<int16_t>(Imm);
4781 return ST.has16BitInsts() &&
4782 AMDGPU::isInlinableLiteralFP16(Literal: Trunc, HasInv2Pi: ST.hasInv2PiInlineImm());
4783 }
4784
4785 return false;
4786 }
4787 case AMDGPU::OPERAND_REG_IMM_BF16:
4788 case AMDGPU::OPERAND_REG_INLINE_C_BF16: {
4789 if (isInt<16>(x: Imm) || isUInt<16>(x: Imm)) {
4790 int16_t Trunc = static_cast<int16_t>(Imm);
4791 return ST.has16BitInsts() &&
4792 AMDGPU::isInlinableLiteralBF16(Literal: Trunc, HasInv2Pi: ST.hasInv2PiInlineImm());
4793 }
4794 return false;
4795 }
4796 case AMDGPU::OPERAND_KIMM32:
4797 case AMDGPU::OPERAND_KIMM16:
4798 case AMDGPU::OPERAND_KIMM64:
4799 return false;
4800 case AMDGPU::OPERAND_INLINE_C_AV64_PSEUDO:
4801 return isLegalAV64PseudoImm(Imm);
4802 case AMDGPU::OPERAND_INPUT_MODS:
4803 case MCOI::OPERAND_IMMEDIATE:
4804 // Always embedded in the instruction for free.
4805 return true;
4806 case MCOI::OPERAND_UNKNOWN:
4807 case MCOI::OPERAND_REGISTER:
4808 case MCOI::OPERAND_PCREL:
4809 case MCOI::OPERAND_GENERIC_0:
4810 case MCOI::OPERAND_GENERIC_1:
4811 case MCOI::OPERAND_GENERIC_2:
4812 case MCOI::OPERAND_GENERIC_3:
4813 case MCOI::OPERAND_GENERIC_4:
4814 case MCOI::OPERAND_GENERIC_5:
4815 // Just ignore anything else.
4816 return true;
4817 default:
4818 llvm_unreachable("invalid operand type");
4819 }
4820}
4821
4822static bool compareMachineOp(const MachineOperand &Op0,
4823 const MachineOperand &Op1) {
4824 if (Op0.getType() != Op1.getType())
4825 return false;
4826
4827 switch (Op0.getType()) {
4828 case MachineOperand::MO_Register:
4829 return Op0.getReg() == Op1.getReg();
4830 case MachineOperand::MO_Immediate:
4831 return Op0.getImm() == Op1.getImm();
4832 default:
4833 llvm_unreachable("Didn't expect to be comparing these operand types");
4834 }
4835}
4836
4837bool SIInstrInfo::isLiteralOperandLegal(const MCInstrDesc &InstDesc,
4838 const MCOperandInfo &OpInfo) const {
4839 if (OpInfo.OperandType == MCOI::OPERAND_IMMEDIATE)
4840 return true;
4841
4842 if (!RI.opCanUseLiteralConstant(OpType: OpInfo.OperandType))
4843 return false;
4844
4845 if (!isVOP3(Desc: InstDesc) || !AMDGPU::isSISrcOperand(OpInfo))
4846 return true;
4847
4848 return ST.hasVOP3Literal();
4849}
4850
4851bool SIInstrInfo::isImmOperandLegal(const MCInstrDesc &InstDesc, unsigned OpNo,
4852 int64_t ImmVal) const {
4853 const MCOperandInfo &OpInfo = InstDesc.operands()[OpNo];
4854 if (isInlineConstant(Imm: ImmVal, OperandType: OpInfo.OperandType)) {
4855 if (isMAI(Desc: InstDesc) && ST.hasMFMAInlineLiteralBug() &&
4856 OpNo == (unsigned)AMDGPU::getNamedOperandIdx(Opcode: InstDesc.getOpcode(),
4857 Name: AMDGPU::OpName::src2))
4858 return false;
4859 return RI.opCanUseInlineConstant(OpType: OpInfo.OperandType);
4860 }
4861
4862 return isLiteralOperandLegal(InstDesc, OpInfo);
4863}
4864
4865bool SIInstrInfo::isImmOperandLegal(const MCInstrDesc &InstDesc, unsigned OpNo,
4866 const MachineOperand &MO) const {
4867 if (MO.isImm())
4868 return isImmOperandLegal(InstDesc, OpNo, ImmVal: MO.getImm());
4869
4870 assert((MO.isTargetIndex() || MO.isFI() || MO.isGlobal()) &&
4871 "unexpected imm-like operand kind");
4872 const MCOperandInfo &OpInfo = InstDesc.operands()[OpNo];
4873 return isLiteralOperandLegal(InstDesc, OpInfo);
4874}
4875
4876bool SIInstrInfo::isLegalAV64PseudoImm(uint64_t Imm) const {
4877 // 2 32-bit inline constants packed into one.
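  // For example, 0x400000003F800000 (2.0f in the high half, 1.0f in the low
  // half) is legal, since each half is itself an inline constant.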
4878 return AMDGPU::isInlinableLiteral32(Literal: Lo_32(Value: Imm), HasInv2Pi: ST.hasInv2PiInlineImm()) &&
4879 AMDGPU::isInlinableLiteral32(Literal: Hi_32(Value: Imm), HasInv2Pi: ST.hasInv2PiInlineImm());
4880}
4881
4882bool SIInstrInfo::hasVALU32BitEncoding(unsigned Opcode) const {
4883 // GFX90A does not have V_MUL_LEGACY_F32_e32.
4884 if (Opcode == AMDGPU::V_MUL_LEGACY_F32_e64 && ST.hasGFX90AInsts())
4885 return false;
4886
4887 int Op32 = AMDGPU::getVOPe32(Opcode);
4888 if (Op32 == -1)
4889 return false;
4890
4891 return pseudoToMCOpcode(Opcode: Op32) != -1;
4892}
4893
4894bool SIInstrInfo::hasModifiers(unsigned Opcode) const {
  // The src0_modifiers operand is present on all instructions that have
  // modifiers.
4897
4898 return AMDGPU::hasNamedOperand(Opcode, NamedIdx: AMDGPU::OpName::src0_modifiers);
4899}
4900
4901bool SIInstrInfo::hasModifiersSet(const MachineInstr &MI,
4902 AMDGPU::OpName OpName) const {
4903 const MachineOperand *Mods = getNamedOperand(MI, OperandName: OpName);
4904 return Mods && Mods->getImm();
4905}
4906
4907bool SIInstrInfo::hasAnyModifiersSet(const MachineInstr &MI) const {
4908 return any_of(Range: ModifierOpNames,
4909 P: [&](AMDGPU::OpName Name) { return hasModifiersSet(MI, OpName: Name); });
4910}
4911
4912bool SIInstrInfo::canShrink(const MachineInstr &MI,
4913 const MachineRegisterInfo &MRI) const {
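  // A typical candidate, as a sketch: "v_add_f32_e64 v0, v1, v2" with no
  // modifiers set shrinks to "v_add_f32_e32 v0, v1, v2", while any clamp/omod
  // use or a non-VGPR src1 forces the e64 form.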
4914 const MachineOperand *Src2 = getNamedOperand(MI, OperandName: AMDGPU::OpName::src2);
  // We can't shrink an instruction with three operands.
4916 if (Src2) {
4917 switch (MI.getOpcode()) {
4918 default: return false;
4919
4920 case AMDGPU::V_ADDC_U32_e64:
4921 case AMDGPU::V_SUBB_U32_e64:
4922 case AMDGPU::V_SUBBREV_U32_e64: {
4923 const MachineOperand *Src1
4924 = getNamedOperand(MI, OperandName: AMDGPU::OpName::src1);
4925 if (!Src1->isReg() || !RI.isVGPR(MRI, Reg: Src1->getReg()))
4926 return false;
4927 // Additional verification is needed for sdst/src2.
4928 return true;
4929 }
4930 case AMDGPU::V_MAC_F16_e64:
4931 case AMDGPU::V_MAC_F32_e64:
4932 case AMDGPU::V_MAC_LEGACY_F32_e64:
4933 case AMDGPU::V_FMAC_F16_e64:
4934 case AMDGPU::V_FMAC_F16_t16_e64:
4935 case AMDGPU::V_FMAC_F16_fake16_e64:
4936 case AMDGPU::V_FMAC_F32_e64:
4937 case AMDGPU::V_FMAC_F64_e64:
4938 case AMDGPU::V_FMAC_LEGACY_F32_e64:
4939 if (!Src2->isReg() || !RI.isVGPR(MRI, Reg: Src2->getReg()) ||
4940 hasModifiersSet(MI, OpName: AMDGPU::OpName::src2_modifiers))
4941 return false;
4942 break;
4943
4944 case AMDGPU::V_CNDMASK_B32_e64:
4945 break;
4946 }
4947 }
4948
4949 const MachineOperand *Src1 = getNamedOperand(MI, OperandName: AMDGPU::OpName::src1);
4950 if (Src1 && (!Src1->isReg() || !RI.isVGPR(MRI, Reg: Src1->getReg()) ||
4951 hasModifiersSet(MI, OpName: AMDGPU::OpName::src1_modifiers)))
4952 return false;
4953
4954 // We don't need to check src0, all input types are legal, so just make sure
4955 // src0 isn't using any modifiers.
4956 if (hasModifiersSet(MI, OpName: AMDGPU::OpName::src0_modifiers))
4957 return false;
4958
  // Can it be shrunk to a valid 32-bit opcode?
4960 if (!hasVALU32BitEncoding(Opcode: MI.getOpcode()))
4961 return false;
4962
4963 // Check output modifiers
4964 return !hasModifiersSet(MI, OpName: AMDGPU::OpName::omod) &&
4965 !hasModifiersSet(MI, OpName: AMDGPU::OpName::clamp) &&
4966 !hasModifiersSet(MI, OpName: AMDGPU::OpName::byte_sel) &&
4967 // TODO: Can we avoid checking bound_ctrl/fi here?
4968 // They are only used by permlane*_swap special case.
4969 !hasModifiersSet(MI, OpName: AMDGPU::OpName::bound_ctrl) &&
4970 !hasModifiersSet(MI, OpName: AMDGPU::OpName::fi);
4971}
4972
// Copy the undef/kill flags from \p Orig onto \p MI's implicit VCC use
// operand, keeping the operand marked as implicit.
4975static void copyFlagsToImplicitVCC(MachineInstr &MI,
4976 const MachineOperand &Orig) {
4977
4978 for (MachineOperand &Use : MI.implicit_operands()) {
4979 if (Use.isUse() &&
4980 (Use.getReg() == AMDGPU::VCC || Use.getReg() == AMDGPU::VCC_LO)) {
4981 Use.setIsUndef(Orig.isUndef());
4982 Use.setIsKill(Orig.isKill());
4983 return;
4984 }
4985 }
4986}
4987
4988MachineInstr *SIInstrInfo::buildShrunkInst(MachineInstr &MI,
4989 unsigned Op32) const {
4990 MachineBasicBlock *MBB = MI.getParent();
4991
4992 const MCInstrDesc &Op32Desc = get(Opcode: Op32);
4993 MachineInstrBuilder Inst32 =
4994 BuildMI(BB&: *MBB, I&: MI, MIMD: MI.getDebugLoc(), MCID: Op32Desc)
4995 .setMIFlags(MI.getFlags());
4996
4997 // Add the dst operand if the 32-bit encoding also has an explicit $vdst.
4998 // For VOPC instructions, this is replaced by an implicit def of vcc.
4999
5000 // We assume the defs of the shrunk opcode are in the same order, and the
5001 // shrunk opcode loses the last def (SGPR def, in the VOP3->VOPC case).
5002 for (int I = 0, E = Op32Desc.getNumDefs(); I != E; ++I)
5003 Inst32.add(MO: MI.getOperand(i: I));
5004
5005 const MachineOperand *Src2 = getNamedOperand(MI, OperandName: AMDGPU::OpName::src2);
5006
5007 int Idx = MI.getNumExplicitDefs();
5008 for (const MachineOperand &Use : MI.explicit_uses()) {
5009 int OpTy = MI.getDesc().operands()[Idx++].OperandType;
5010 if (OpTy == AMDGPU::OPERAND_INPUT_MODS || OpTy == MCOI::OPERAND_IMMEDIATE)
5011 continue;
5012
5013 if (&Use == Src2) {
5014 if (AMDGPU::getNamedOperandIdx(Opcode: Op32, Name: AMDGPU::OpName::src2) == -1) {
5015 // In the case of V_CNDMASK_B32_e32, the explicit operand src2 is
5016 // replaced with an implicit read of vcc or vcc_lo. The implicit read
5017 // of vcc was already added during the initial BuildMI, but we
5018 // 1) may need to change vcc to vcc_lo to preserve the original register
5019 // 2) have to preserve the original flags.
5020 copyFlagsToImplicitVCC(MI&: *Inst32, Orig: *Src2);
5021 continue;
5022 }
5023 }
5024
5025 Inst32.add(MO: Use);
5026 }
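
  // For example, shrinking "v_cndmask_b32_e64 v0, v1, v2, vcc" drops the
  // explicit carry-in operand in the loop above; the e32 form reads it
  // through the implicit vcc (or vcc_lo) use instead.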
5027
5028 // FIXME: Losing implicit operands
5029 fixImplicitOperands(MI&: *Inst32);
5030 return Inst32;
5031}
5032
5033bool SIInstrInfo::physRegUsesConstantBus(const MachineOperand &RegOp) const {
5034 // Null is free
5035 Register Reg = RegOp.getReg();
5036 if (Reg == AMDGPU::SGPR_NULL || Reg == AMDGPU::SGPR_NULL64)
5037 return false;
5038
5041 // FIXME: implicit registers that are not part of the MCInstrDesc's implicit
5042 // physical register operands should also count, except for exec.
5043 if (RegOp.isImplicit())
5044 return Reg == AMDGPU::VCC || Reg == AMDGPU::VCC_LO || Reg == AMDGPU::M0;
5045
5046 // SGPRs use the constant bus
5047 return AMDGPU::SReg_32RegClass.contains(Reg) ||
5048 AMDGPU::SReg_64RegClass.contains(Reg);
5049}
5050
5051bool SIInstrInfo::regUsesConstantBus(const MachineOperand &RegOp,
5052 const MachineRegisterInfo &MRI) const {
5053 Register Reg = RegOp.getReg();
5054 return Reg.isVirtual() ? RI.isSGPRClass(RC: MRI.getRegClass(Reg))
5055 : physRegUsesConstantBus(RegOp);
5056}
5057
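// As an example of what counts: with a constant bus limit of one,
// "v_add_f32_e64 v0, s0, s1" is illegal because two unique SGPRs would both
// need the bus, while "v_add_f32_e64 v0, s0, v1" is fine.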
5058bool SIInstrInfo::usesConstantBus(const MachineRegisterInfo &MRI,
5059 const MachineOperand &MO,
5060 const MCOperandInfo &OpInfo) const {
5061 // Literal constants use the constant bus.
5062 if (!MO.isReg())
5063 return !isInlineConstant(MO, OpInfo);
5064
5065 Register Reg = MO.getReg();
5066 return Reg.isVirtual() ? RI.isSGPRClass(RC: MRI.getRegClass(Reg))
5067 : physRegUsesConstantBus(RegOp: MO);
5068}
5069
5070static Register findImplicitSGPRRead(const MachineInstr &MI) {
5071 for (const MachineOperand &MO : MI.implicit_operands()) {
5072 // We only care about reads.
5073 if (MO.isDef())
5074 continue;
5075
5076 switch (MO.getReg()) {
5077 case AMDGPU::VCC:
5078 case AMDGPU::VCC_LO:
5079 case AMDGPU::VCC_HI:
5080 case AMDGPU::M0:
5081 case AMDGPU::FLAT_SCR:
5082 return MO.getReg();
5083
5084 default:
5085 break;
5086 }
5087 }
5088
5089 return Register();
5090}
5091
5092static bool shouldReadExec(const MachineInstr &MI) {
5093 if (SIInstrInfo::isVALU(MI)) {
5094 switch (MI.getOpcode()) {
5095 case AMDGPU::V_READLANE_B32:
5096 case AMDGPU::SI_RESTORE_S32_FROM_VGPR:
5097 case AMDGPU::V_WRITELANE_B32:
5098 case AMDGPU::SI_SPILL_S32_TO_VGPR:
5099 return false;
5100 }
5101
5102 return true;
5103 }
5104
5105 if (MI.isPreISelOpcode() ||
5106 SIInstrInfo::isGenericOpcode(Opc: MI.getOpcode()) ||
5107 SIInstrInfo::isSALU(MI) ||
5108 SIInstrInfo::isSMRD(MI))
5109 return false;
5110
5111 return true;
5112}
5113
5114static bool isRegOrFI(const MachineOperand &MO) {
5115 return MO.isReg() || MO.isFI();
5116}
5117
5118static bool isSubRegOf(const SIRegisterInfo &TRI,
5119 const MachineOperand &SuperVec,
5120 const MachineOperand &SubReg) {
5121 if (SubReg.getReg().isPhysical())
5122 return TRI.isSubRegister(RegA: SuperVec.getReg(), RegB: SubReg.getReg());
5123
5124 return SubReg.getSubReg() != AMDGPU::NoSubRegister &&
5125 SubReg.getReg() == SuperVec.getReg();
5126}
5127
// Verify that a generic COPY opcode does not illegally copy a vector register
// to an SGPR.
5129bool SIInstrInfo::verifyCopy(const MachineInstr &MI,
5130 const MachineRegisterInfo &MRI,
5131 StringRef &ErrInfo) const {
5132 Register DstReg = MI.getOperand(i: 0).getReg();
5133 Register SrcReg = MI.getOperand(i: 1).getReg();
5134 // This is a check for copy from vector register to SGPR
5135 if (RI.isVectorRegister(MRI, Reg: SrcReg) && RI.isSGPRReg(MRI, Reg: DstReg)) {
5136 ErrInfo = "illegal copy from vector register to SGPR";
5137 return false;
5138 }
5139 return true;
5140}
5141
5142bool SIInstrInfo::verifyInstruction(const MachineInstr &MI,
5143 StringRef &ErrInfo) const {
5144 uint32_t Opcode = MI.getOpcode();
5145 const MachineFunction *MF = MI.getMF();
5146 const MachineRegisterInfo &MRI = MF->getRegInfo();
5147
  // FIXME: At this point the COPY verify is done only for non-SSA forms.
  // Find a better property to recognize the point where instruction selection
  // is just done.
5151 // We can only enforce this check after SIFixSGPRCopies pass so that the
5152 // illegal copies are legalized and thereafter we don't expect a pass
5153 // inserting similar copies.
5154 if (!MRI.isSSA() && MI.isCopy())
5155 return verifyCopy(MI, MRI, ErrInfo);
5156
5157 if (SIInstrInfo::isGenericOpcode(Opc: Opcode))
5158 return true;
5159
5160 int Src0Idx = AMDGPU::getNamedOperandIdx(Opcode, Name: AMDGPU::OpName::src0);
5161 int Src1Idx = AMDGPU::getNamedOperandIdx(Opcode, Name: AMDGPU::OpName::src1);
5162 int Src2Idx = AMDGPU::getNamedOperandIdx(Opcode, Name: AMDGPU::OpName::src2);
5163 int Src3Idx = -1;
5164 if (Src0Idx == -1) {
5165 // VOPD V_DUAL_* instructions use different operand names.
5166 Src0Idx = AMDGPU::getNamedOperandIdx(Opcode, Name: AMDGPU::OpName::src0X);
5167 Src1Idx = AMDGPU::getNamedOperandIdx(Opcode, Name: AMDGPU::OpName::vsrc1X);
5168 Src2Idx = AMDGPU::getNamedOperandIdx(Opcode, Name: AMDGPU::OpName::src0Y);
5169 Src3Idx = AMDGPU::getNamedOperandIdx(Opcode, Name: AMDGPU::OpName::vsrc1Y);
5170 }
5171
5172 // Make sure the number of operands is correct.
5173 const MCInstrDesc &Desc = get(Opcode);
5174 if (!Desc.isVariadic() &&
5175 Desc.getNumOperands() != MI.getNumExplicitOperands()) {
5176 ErrInfo = "Instruction has wrong number of operands.";
5177 return false;
5178 }
5179
5180 if (MI.isInlineAsm()) {
5181 // Verify register classes for inlineasm constraints.
5182 for (unsigned I = InlineAsm::MIOp_FirstOperand, E = MI.getNumOperands();
5183 I != E; ++I) {
5184 const TargetRegisterClass *RC = MI.getRegClassConstraint(OpIdx: I, TII: this, TRI: &RI);
5185 if (!RC)
5186 continue;
5187
5188 const MachineOperand &Op = MI.getOperand(i: I);
5189 if (!Op.isReg())
5190 continue;
5191
5192 Register Reg = Op.getReg();
5193 if (!Reg.isVirtual() && !RC->contains(Reg)) {
5194 ErrInfo = "inlineasm operand has incorrect register class.";
5195 return false;
5196 }
5197 }
5198
5199 return true;
5200 }
5201
5202 if (isImage(MI) && MI.memoperands_empty() && MI.mayLoadOrStore()) {
5203 ErrInfo = "missing memory operand from image instruction.";
5204 return false;
5205 }
5206
5207 // Make sure the register classes are correct.
5208 for (int i = 0, e = Desc.getNumOperands(); i != e; ++i) {
5209 const MachineOperand &MO = MI.getOperand(i);
5210 if (MO.isFPImm()) {
5211 ErrInfo = "FPImm Machine Operands are not supported. ISel should bitcast "
5212 "all fp values to integers.";
5213 return false;
5214 }
5215
5216 const MCOperandInfo &OpInfo = Desc.operands()[i];
5217 int16_t RegClass = getOpRegClassID(OpInfo);
5218
5219 switch (OpInfo.OperandType) {
5220 case MCOI::OPERAND_REGISTER:
5221 if (MI.getOperand(i).isImm() || MI.getOperand(i).isGlobal()) {
5222 ErrInfo = "Illegal immediate value for operand.";
5223 return false;
5224 }
5225 break;
5226 case AMDGPU::OPERAND_REG_IMM_INT32:
5227 case AMDGPU::OPERAND_REG_IMM_INT64:
5228 case AMDGPU::OPERAND_REG_IMM_INT16:
5229 case AMDGPU::OPERAND_REG_IMM_FP32:
5230 case AMDGPU::OPERAND_REG_IMM_V2FP32:
5231 case AMDGPU::OPERAND_REG_IMM_BF16:
5232 case AMDGPU::OPERAND_REG_IMM_FP16:
5233 case AMDGPU::OPERAND_REG_IMM_FP64:
5234 case AMDGPU::OPERAND_REG_IMM_V2FP16:
5235 case AMDGPU::OPERAND_REG_IMM_V2FP16_SPLAT:
5236 case AMDGPU::OPERAND_REG_IMM_V2INT16:
5237 case AMDGPU::OPERAND_REG_IMM_V2INT32:
5238 case AMDGPU::OPERAND_REG_IMM_V2BF16:
5239 break;
    case AMDGPU::OPERAND_REG_IMM_NOINLINE_V2FP16:
      break;
5243 case AMDGPU::OPERAND_REG_INLINE_C_INT16:
5244 case AMDGPU::OPERAND_REG_INLINE_C_INT32:
5245 case AMDGPU::OPERAND_REG_INLINE_C_INT64:
5246 case AMDGPU::OPERAND_REG_INLINE_C_FP32:
5247 case AMDGPU::OPERAND_REG_INLINE_C_FP64:
5248 case AMDGPU::OPERAND_REG_INLINE_C_BF16:
5249 case AMDGPU::OPERAND_REG_INLINE_C_FP16:
5250 case AMDGPU::OPERAND_REG_INLINE_C_V2INT16:
5251 case AMDGPU::OPERAND_REG_INLINE_C_V2BF16:
5252 case AMDGPU::OPERAND_REG_INLINE_C_V2FP16:
5253 case AMDGPU::OPERAND_REG_INLINE_AC_INT32:
5254 case AMDGPU::OPERAND_REG_INLINE_AC_FP32:
5255 case AMDGPU::OPERAND_REG_INLINE_AC_FP64: {
5256 if (!MO.isReg() && (!MO.isImm() || !isInlineConstant(MI, OpIdx: i))) {
5257 ErrInfo = "Illegal immediate value for operand.";
5258 return false;
5259 }
5260 break;
5261 }
5262 case AMDGPU::OPERAND_INLINE_SPLIT_BARRIER_INT32:
5263 if (!MI.getOperand(i).isImm() || !isInlineConstant(MI, OpIdx: i)) {
5264 ErrInfo = "Expected inline constant for operand.";
5265 return false;
5266 }
5267 break;
5268 case AMDGPU::OPERAND_INPUT_MODS:
5269 case AMDGPU::OPERAND_SDWA_VOPC_DST:
5270 case AMDGPU::OPERAND_KIMM16:
5271 break;
5272 case MCOI::OPERAND_IMMEDIATE:
5273 case AMDGPU::OPERAND_KIMM32:
5274 case AMDGPU::OPERAND_KIMM64:
5275 case AMDGPU::OPERAND_INLINE_C_AV64_PSEUDO:
5276 // Check if this operand is an immediate.
5277 // FrameIndex operands will be replaced by immediates, so they are
5278 // allowed.
5279 if (!MI.getOperand(i).isImm() && !MI.getOperand(i).isFI()) {
5280 ErrInfo = "Expected immediate, but got non-immediate";
5281 return false;
5282 }
5283 break;
5284 case MCOI::OPERAND_UNKNOWN:
5285 case MCOI::OPERAND_MEMORY:
5286 case MCOI::OPERAND_PCREL:
5287 break;
5288 default:
5289 if (OpInfo.isGenericType())
5290 continue;
5291 break;
5292 }
5293
5294 if (!MO.isReg())
5295 continue;
5296 Register Reg = MO.getReg();
5297 if (!Reg)
5298 continue;
5299
5300 // FIXME: Ideally we would have separate instruction definitions with the
5301 // aligned register constraint.
5302 // FIXME: We do not verify inline asm operands, but custom inline asm
5303 // verification is broken anyway
5304 if (ST.needsAlignedVGPRs() && Opcode != AMDGPU::AV_MOV_B64_IMM_PSEUDO &&
5305 Opcode != AMDGPU::V_MOV_B64_PSEUDO) {
5306 const TargetRegisterClass *RC = RI.getRegClassForReg(MRI, Reg);
5307 if (RI.hasVectorRegisters(RC) && MO.getSubReg()) {
5308 if (const TargetRegisterClass *SubRC =
5309 RI.getSubRegisterClass(RC, MO.getSubReg())) {
5310 RC = RI.getCompatibleSubRegClass(SuperRC: RC, SubRC, SubIdx: MO.getSubReg());
5311 if (RC)
5312 RC = SubRC;
5313 }
5314 }
5315
5316 // Check that this is the aligned version of the class.
5317 if (!RC || !RI.isProperlyAlignedRC(RC: *RC)) {
5318 ErrInfo = "Subtarget requires even aligned vector registers";
5319 return false;
5320 }
5321 }
5322
5323 if (RegClass != -1) {
5324 if (Reg.isVirtual())
5325 continue;
5326
5327 const TargetRegisterClass *RC = RI.getRegClass(i: RegClass);
5328 if (!RC->contains(Reg)) {
5329 ErrInfo = "Operand has incorrect register class.";
5330 return false;
5331 }
5332 }
5333 }
5334
5335 // Verify SDWA
5336 if (isSDWA(MI)) {
5337 if (!ST.hasSDWA()) {
5338 ErrInfo = "SDWA is not supported on this target";
5339 return false;
5340 }
5341
5342 for (auto Op : {AMDGPU::OpName::src0_sel, AMDGPU::OpName::src1_sel,
5343 AMDGPU::OpName::dst_sel}) {
5344 const MachineOperand *MO = getNamedOperand(MI, OperandName: Op);
5345 if (!MO)
5346 continue;
5347 int64_t Imm = MO->getImm();
5348 if (Imm < 0 || Imm > AMDGPU::SDWA::SdwaSel::DWORD) {
5349 ErrInfo = "Invalid SDWA selection";
5350 return false;
5351 }
5352 }
5353
5354 int DstIdx = AMDGPU::getNamedOperandIdx(Opcode, Name: AMDGPU::OpName::vdst);
5355
5356 for (int OpIdx : {DstIdx, Src0Idx, Src1Idx, Src2Idx}) {
5357 if (OpIdx == -1)
5358 continue;
5359 const MachineOperand &MO = MI.getOperand(i: OpIdx);
5360
5361 if (!ST.hasSDWAScalar()) {
        // Only VGPRs on VI
5363 if (!MO.isReg() || !RI.hasVGPRs(RC: RI.getRegClassForReg(MRI, Reg: MO.getReg()))) {
5364 ErrInfo = "Only VGPRs allowed as operands in SDWA instructions on VI";
5365 return false;
5366 }
5367 } else {
5368 // No immediates on GFX9
5369 if (!MO.isReg()) {
5370 ErrInfo =
5371 "Only reg allowed as operands in SDWA instructions on GFX9+";
5372 return false;
5373 }
5374 }
5375 }
5376
5377 if (!ST.hasSDWAOmod()) {
5378 // No omod allowed on VI
5379 const MachineOperand *OMod = getNamedOperand(MI, OperandName: AMDGPU::OpName::omod);
5380 if (OMod != nullptr &&
5381 (!OMod->isImm() || OMod->getImm() != 0)) {
5382 ErrInfo = "OMod not allowed in SDWA instructions on VI";
5383 return false;
5384 }
5385 }
5386
5387 if (Opcode == AMDGPU::V_CVT_F32_FP8_sdwa ||
5388 Opcode == AMDGPU::V_CVT_F32_BF8_sdwa ||
5389 Opcode == AMDGPU::V_CVT_PK_F32_FP8_sdwa ||
5390 Opcode == AMDGPU::V_CVT_PK_F32_BF8_sdwa) {
5391 const MachineOperand *Src0ModsMO =
5392 getNamedOperand(MI, OperandName: AMDGPU::OpName::src0_modifiers);
5393 unsigned Mods = Src0ModsMO->getImm();
5394 if (Mods & SISrcMods::ABS || Mods & SISrcMods::NEG ||
5395 Mods & SISrcMods::SEXT) {
5396 ErrInfo = "sext, abs and neg are not allowed on this instruction";
5397 return false;
5398 }
5399 }
5400
5401 uint32_t BasicOpcode = AMDGPU::getBasicFromSDWAOp(Opcode);
5402 if (isVOPC(Opcode: BasicOpcode)) {
5403 if (!ST.hasSDWASdst() && DstIdx != -1) {
5404 // Only vcc allowed as dst on VI for VOPC
5405 const MachineOperand &Dst = MI.getOperand(i: DstIdx);
5406 if (!Dst.isReg() || Dst.getReg() != AMDGPU::VCC) {
5407 ErrInfo = "Only VCC allowed as dst in SDWA instructions on VI";
5408 return false;
5409 }
5410 } else if (!ST.hasSDWAOutModsVOPC()) {
5411 // No clamp allowed on GFX9 for VOPC
5412 const MachineOperand *Clamp = getNamedOperand(MI, OperandName: AMDGPU::OpName::clamp);
5413 if (Clamp && (!Clamp->isImm() || Clamp->getImm() != 0)) {
5414 ErrInfo = "Clamp not allowed in VOPC SDWA instructions on VI";
5415 return false;
5416 }
5417
5418 // No omod allowed on GFX9 for VOPC
5419 const MachineOperand *OMod = getNamedOperand(MI, OperandName: AMDGPU::OpName::omod);
5420 if (OMod && (!OMod->isImm() || OMod->getImm() != 0)) {
5421 ErrInfo = "OMod not allowed in VOPC SDWA instructions on VI";
5422 return false;
5423 }
5424 }
5425 }
5426
5427 const MachineOperand *DstUnused = getNamedOperand(MI, OperandName: AMDGPU::OpName::dst_unused);
5428 if (DstUnused && DstUnused->isImm() &&
5429 DstUnused->getImm() == AMDGPU::SDWA::UNUSED_PRESERVE) {
5430 const MachineOperand &Dst = MI.getOperand(i: DstIdx);
5431 if (!Dst.isReg() || !Dst.isTied()) {
5432 ErrInfo = "Dst register should have tied register";
5433 return false;
5434 }
5435
5436 const MachineOperand &TiedMO =
5437 MI.getOperand(i: MI.findTiedOperandIdx(OpIdx: DstIdx));
5438 if (!TiedMO.isReg() || !TiedMO.isImplicit() || !TiedMO.isUse()) {
5439 ErrInfo =
5440 "Dst register should be tied to implicit use of preserved register";
5441 return false;
5442 }
5443 if (TiedMO.getReg().isPhysical() && Dst.getReg() != TiedMO.getReg()) {
5444 ErrInfo = "Dst register should use same physical register as preserved";
5445 return false;
5446 }
5447 }
5448 }
5449
5450 // Verify MIMG / VIMAGE / VSAMPLE
5451 if (isImage(Opcode) && !MI.mayStore()) {
    // Ensure that the return type used is large enough for all the options
    // being used. TFE/LWE require an extra result register.
5454 const MachineOperand *DMask = getNamedOperand(MI, OperandName: AMDGPU::OpName::dmask);
5455 if (DMask) {
5456 uint64_t DMaskImm = DMask->getImm();
5457 uint32_t RegCount = isGather4(Opcode) ? 4 : llvm::popcount(Value: DMaskImm);
5458 const MachineOperand *TFE = getNamedOperand(MI, OperandName: AMDGPU::OpName::tfe);
5459 const MachineOperand *LWE = getNamedOperand(MI, OperandName: AMDGPU::OpName::lwe);
5460 const MachineOperand *D16 = getNamedOperand(MI, OperandName: AMDGPU::OpName::d16);
5461
      // Adjust for packed 16-bit values
5463 if (D16 && D16->getImm() && !ST.hasUnpackedD16VMem())
5464 RegCount = divideCeil(Numerator: RegCount, Denominator: 2);
5465
5466 // Adjust if using LWE or TFE
5467 if ((LWE && LWE->getImm()) || (TFE && TFE->getImm()))
5468 RegCount += 1;
5469
5470 const uint32_t DstIdx =
5471 AMDGPU::getNamedOperandIdx(Opcode, Name: AMDGPU::OpName::vdata);
5472 const MachineOperand &Dst = MI.getOperand(i: DstIdx);
5473 if (Dst.isReg()) {
5474 const TargetRegisterClass *DstRC = getOpRegClass(MI, OpNo: DstIdx);
5475 uint32_t DstSize = RI.getRegSizeInBits(RC: *DstRC) / 32;
5476 if (RegCount > DstSize) {
5477 ErrInfo = "Image instruction returns too many registers for dst "
5478 "register class";
5479 return false;
5480 }
5481 }
5482 }
5483 }
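
  // As a worked example of the count above: an image load with dmask=0b1011
  // returns three data dwords; d16 on a packed-d16 subtarget halves that to
  // ceil(3/2) = 2, and tfe or lwe then adds one more dword for the status.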
5484
5485 // Verify VOP*. Ignore multiple sgpr operands on writelane.
5486 if (isVALU(MI) && Desc.getOpcode() != AMDGPU::V_WRITELANE_B32) {
5487 unsigned ConstantBusCount = 0;
5488 bool UsesLiteral = false;
5489 const MachineOperand *LiteralVal = nullptr;
5490
5491 int ImmIdx = AMDGPU::getNamedOperandIdx(Opcode, Name: AMDGPU::OpName::imm);
5492 if (ImmIdx != -1) {
5493 ++ConstantBusCount;
5494 UsesLiteral = true;
5495 LiteralVal = &MI.getOperand(i: ImmIdx);
5496 }
5497
5498 SmallVector<Register, 2> SGPRsUsed;
5499 Register SGPRUsed;
5500
5501 // Only look at the true operands. Only a real operand can use the constant
5502 // bus, and we don't want to check pseudo-operands like the source modifier
5503 // flags.
5504 for (int OpIdx : {Src0Idx, Src1Idx, Src2Idx, Src3Idx}) {
5505 if (OpIdx == -1)
5506 continue;
5507 const MachineOperand &MO = MI.getOperand(i: OpIdx);
5508 if (usesConstantBus(MRI, MO, OpInfo: MI.getDesc().operands()[OpIdx])) {
5509 if (MO.isReg()) {
5510 SGPRUsed = MO.getReg();
5511 if (!llvm::is_contained(Range&: SGPRsUsed, Element: SGPRUsed)) {
5512 ++ConstantBusCount;
5513 SGPRsUsed.push_back(Elt: SGPRUsed);
5514 }
5515 } else if (!MO.isFI()) { // Treat FI like a register.
5516 if (!UsesLiteral) {
5517 ++ConstantBusCount;
5518 UsesLiteral = true;
5519 LiteralVal = &MO;
5520 } else if (!MO.isIdenticalTo(Other: *LiteralVal)) {
5521 assert(isVOP2(MI) || isVOP3(MI));
5522 ErrInfo = "VOP2/VOP3 instruction uses more than one literal";
5523 return false;
5524 }
5525 }
5526 }
5527 }
5528
5529 SGPRUsed = findImplicitSGPRRead(MI);
5530 if (SGPRUsed) {
5531 // Implicit uses may safely overlap true operands
5532 if (llvm::all_of(Range&: SGPRsUsed, P: [this, SGPRUsed](unsigned SGPR) {
5533 return !RI.regsOverlap(RegA: SGPRUsed, RegB: SGPR);
5534 })) {
5535 ++ConstantBusCount;
5536 SGPRsUsed.push_back(Elt: SGPRUsed);
5537 }
5538 }
5539
    // v_writelane_b32 is an exception to the constant bus restriction: vsrc0
    // can be an SGPR, constant, or m0, and the lane select can be an SGPR,
    // m0, or an inline constant.
5542 if (ConstantBusCount > ST.getConstantBusLimit(Opcode) &&
5543 Opcode != AMDGPU::V_WRITELANE_B32) {
5544 ErrInfo = "VOP* instruction violates constant bus restriction";
5545 return false;
5546 }
5547
5548 if (isVOP3(MI) && UsesLiteral && !ST.hasVOP3Literal()) {
5549 ErrInfo = "VOP3 instruction uses literal";
5550 return false;
5551 }
5552 }
5553
  // Special case for writelane: it can break the multiple-constant-bus rule,
  // but still can't use more than one SGPR register.
5556 if (Desc.getOpcode() == AMDGPU::V_WRITELANE_B32) {
5557 unsigned SGPRCount = 0;
5558 Register SGPRUsed;
5559
5560 for (int OpIdx : {Src0Idx, Src1Idx}) {
5561 if (OpIdx == -1)
5562 break;
5563
5564 const MachineOperand &MO = MI.getOperand(i: OpIdx);
5565
5566 if (usesConstantBus(MRI, MO, OpInfo: MI.getDesc().operands()[OpIdx])) {
5567 if (MO.isReg() && MO.getReg() != AMDGPU::M0) {
5568 if (MO.getReg() != SGPRUsed)
5569 ++SGPRCount;
5570 SGPRUsed = MO.getReg();
5571 }
5572 }
5573 if (SGPRCount > ST.getConstantBusLimit(Opcode)) {
5574 ErrInfo = "WRITELANE instruction violates constant bus restriction";
5575 return false;
5576 }
5577 }
5578 }
5579
5580 // Verify misc. restrictions on specific instructions.
5581 if (Desc.getOpcode() == AMDGPU::V_DIV_SCALE_F32_e64 ||
5582 Desc.getOpcode() == AMDGPU::V_DIV_SCALE_F64_e64) {
5583 const MachineOperand &Src0 = MI.getOperand(i: Src0Idx);
5584 const MachineOperand &Src1 = MI.getOperand(i: Src1Idx);
5585 const MachineOperand &Src2 = MI.getOperand(i: Src2Idx);
5586 if (Src0.isReg() && Src1.isReg() && Src2.isReg()) {
5587 if (!compareMachineOp(Op0: Src0, Op1: Src1) &&
5588 !compareMachineOp(Op0: Src0, Op1: Src2)) {
5589 ErrInfo = "v_div_scale_{f32|f64} require src0 = src1 or src2";
5590 return false;
5591 }
5592 }
5593 if ((getNamedOperand(MI, OperandName: AMDGPU::OpName::src0_modifiers)->getImm() &
5594 SISrcMods::ABS) ||
5595 (getNamedOperand(MI, OperandName: AMDGPU::OpName::src1_modifiers)->getImm() &
5596 SISrcMods::ABS) ||
5597 (getNamedOperand(MI, OperandName: AMDGPU::OpName::src2_modifiers)->getImm() &
5598 SISrcMods::ABS)) {
5599 ErrInfo = "ABS not allowed in VOP3B instructions";
5600 return false;
5601 }
5602 }
5603
5604 if (isSOP2(MI) || isSOPC(MI)) {
5605 const MachineOperand &Src0 = MI.getOperand(i: Src0Idx);
5606 const MachineOperand &Src1 = MI.getOperand(i: Src1Idx);
5607
5608 if (!isRegOrFI(MO: Src0) && !isRegOrFI(MO: Src1) &&
5609 !isInlineConstant(MO: Src0, OpInfo: Desc.operands()[Src0Idx]) &&
5610 !isInlineConstant(MO: Src1, OpInfo: Desc.operands()[Src1Idx]) &&
5611 !Src0.isIdenticalTo(Other: Src1)) {
5612 ErrInfo = "SOP2/SOPC instruction requires too many immediate constants";
5613 return false;
5614 }
5615 }
5616
5617 if (isSOPK(MI)) {
5618 const auto *Op = getNamedOperand(MI, OperandName: AMDGPU::OpName::simm16);
5619 if (Desc.isBranch()) {
5620 if (!Op->isMBB()) {
5621 ErrInfo = "invalid branch target for SOPK instruction";
5622 return false;
5623 }
5624 } else {
5625 uint64_t Imm = Op->getImm();
5626 if (sopkIsZext(Opcode)) {
5627 if (!isUInt<16>(x: Imm)) {
5628 ErrInfo = "invalid immediate for SOPK instruction";
5629 return false;
5630 }
5631 } else {
5632 if (!isInt<16>(x: Imm)) {
5633 ErrInfo = "invalid immediate for SOPK instruction";
5634 return false;
5635 }
5636 }
5637 }
5638 }
5639
5640 if (Desc.getOpcode() == AMDGPU::V_MOVRELS_B32_e32 ||
5641 Desc.getOpcode() == AMDGPU::V_MOVRELS_B32_e64 ||
5642 Desc.getOpcode() == AMDGPU::V_MOVRELD_B32_e32 ||
5643 Desc.getOpcode() == AMDGPU::V_MOVRELD_B32_e64) {
5644 const bool IsDst = Desc.getOpcode() == AMDGPU::V_MOVRELD_B32_e32 ||
5645 Desc.getOpcode() == AMDGPU::V_MOVRELD_B32_e64;
5646
5647 const unsigned StaticNumOps =
5648 Desc.getNumOperands() + Desc.implicit_uses().size();
5649 const unsigned NumImplicitOps = IsDst ? 2 : 1;
5650
5651 // Require additional implicit operands. This allows a fixup done by the
5652 // post RA scheduler where the main implicit operand is killed and
5653 // implicit-defs are added for sub-registers that remain live after this
5654 // instruction.
5655 if (MI.getNumOperands() < StaticNumOps + NumImplicitOps) {
5656 ErrInfo = "missing implicit register operands";
5657 return false;
5658 }
5659
5660 const MachineOperand *Dst = getNamedOperand(MI, OperandName: AMDGPU::OpName::vdst);
5661 if (IsDst) {
5662 if (!Dst->isUse()) {
5663 ErrInfo = "v_movreld_b32 vdst should be a use operand";
5664 return false;
5665 }
5666
5667 unsigned UseOpIdx;
5668 if (!MI.isRegTiedToUseOperand(DefOpIdx: StaticNumOps, UseOpIdx: &UseOpIdx) ||
5669 UseOpIdx != StaticNumOps + 1) {
5670 ErrInfo = "movrel implicit operands should be tied";
5671 return false;
5672 }
5673 }
5674
5675 const MachineOperand &Src0 = MI.getOperand(i: Src0Idx);
5676 const MachineOperand &ImpUse
5677 = MI.getOperand(i: StaticNumOps + NumImplicitOps - 1);
5678 if (!ImpUse.isReg() || !ImpUse.isUse() ||
5679 !isSubRegOf(TRI: RI, SuperVec: ImpUse, SubReg: IsDst ? *Dst : Src0)) {
5680 ErrInfo = "src0 should be subreg of implicit vector use";
5681 return false;
5682 }
5683 }
5684
  // Make sure we aren't losing exec uses in the td files. This mostly
  // requires being careful when using 'let Uses' to add other use registers.
5687 if (shouldReadExec(MI)) {
5688 if (!MI.hasRegisterImplicitUseOperand(Reg: AMDGPU::EXEC)) {
5689 ErrInfo = "VALU instruction does not implicitly read exec mask";
5690 return false;
5691 }
5692 }
5693
5694 if (isSMRD(MI)) {
5695 if (MI.mayStore() &&
5696 ST.getGeneration() == AMDGPUSubtarget::VOLCANIC_ISLANDS) {
5697 // The register offset form of scalar stores may only use m0 as the
5698 // soffset register.
5699 const MachineOperand *Soff = getNamedOperand(MI, OperandName: AMDGPU::OpName::soffset);
5700 if (Soff && Soff->getReg() != AMDGPU::M0) {
5701 ErrInfo = "scalar stores must use m0 as offset register";
5702 return false;
5703 }
5704 }
5705 }
5706
5707 if (isFLAT(MI) && !ST.hasFlatInstOffsets()) {
5708 const MachineOperand *Offset = getNamedOperand(MI, OperandName: AMDGPU::OpName::offset);
5709 if (Offset->getImm() != 0) {
5710 ErrInfo = "subtarget does not support offsets in flat instructions";
5711 return false;
5712 }
5713 }
5714
5715 if (isDS(MI) && !ST.hasGDS()) {
5716 const MachineOperand *GDSOp = getNamedOperand(MI, OperandName: AMDGPU::OpName::gds);
5717 if (GDSOp && GDSOp->getImm() != 0) {
5718 ErrInfo = "GDS is not supported on this subtarget";
5719 return false;
5720 }
5721 }
5722
5723 if (isImage(MI)) {
5724 const MachineOperand *DimOp = getNamedOperand(MI, OperandName: AMDGPU::OpName::dim);
5725 if (DimOp) {
5726 int VAddr0Idx = AMDGPU::getNamedOperandIdx(Opcode,
5727 Name: AMDGPU::OpName::vaddr0);
5728 AMDGPU::OpName RSrcOpName =
5729 isMIMG(MI) ? AMDGPU::OpName::srsrc : AMDGPU::OpName::rsrc;
5730 int RsrcIdx = AMDGPU::getNamedOperandIdx(Opcode, Name: RSrcOpName);
5731 const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(Opc: Opcode);
5732 const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
5733 AMDGPU::getMIMGBaseOpcodeInfo(BaseOpcode: Info->BaseOpcode);
5734 const AMDGPU::MIMGDimInfo *Dim =
5735 AMDGPU::getMIMGDimInfoByEncoding(DimEnc: DimOp->getImm());
5736
5737 if (!Dim) {
5738 ErrInfo = "dim is out of range";
5739 return false;
5740 }
5741
5742 bool IsA16 = false;
5743 if (ST.hasR128A16()) {
5744 const MachineOperand *R128A16 = getNamedOperand(MI, OperandName: AMDGPU::OpName::r128);
5745 IsA16 = R128A16->getImm() != 0;
5746 } else if (ST.hasA16()) {
5747 const MachineOperand *A16 = getNamedOperand(MI, OperandName: AMDGPU::OpName::a16);
5748 IsA16 = A16->getImm() != 0;
5749 }
5750
5751 bool IsNSA = RsrcIdx - VAddr0Idx > 1;
5752
5753 unsigned AddrWords =
5754 AMDGPU::getAddrSizeMIMGOp(BaseOpcode, Dim, IsA16, IsG16Supported: ST.hasG16());
5755
5756 unsigned VAddrWords;
5757 if (IsNSA) {
5758 VAddrWords = RsrcIdx - VAddr0Idx;
5759 if (ST.hasPartialNSAEncoding() &&
5760 AddrWords > ST.getNSAMaxSize(HasSampler: isVSAMPLE(MI))) {
5761 unsigned LastVAddrIdx = RsrcIdx - 1;
5762 VAddrWords += getOpSize(MI, OpNo: LastVAddrIdx) / 4 - 1;
5763 }
5764 } else {
5765 VAddrWords = getOpSize(MI, OpNo: VAddr0Idx) / 4;
5766 if (AddrWords > 12)
5767 AddrWords = 16;
5768 }
5769
5770 if (VAddrWords != AddrWords) {
5771 LLVM_DEBUG(dbgs() << "bad vaddr size, expected " << AddrWords
5772 << " but got " << VAddrWords << "\n");
5773 ErrInfo = "bad vaddr size";
5774 return false;
5775 }
5776 }
5777 }
5778
5779 const MachineOperand *DppCt = getNamedOperand(MI, OperandName: AMDGPU::OpName::dpp_ctrl);
5780 if (DppCt) {
5781 using namespace AMDGPU::DPP;
5782
5783 unsigned DC = DppCt->getImm();
5784 if (DC == DppCtrl::DPP_UNUSED1 || DC == DppCtrl::DPP_UNUSED2 ||
5785 DC == DppCtrl::DPP_UNUSED3 || DC > DppCtrl::DPP_LAST ||
5786 (DC >= DppCtrl::DPP_UNUSED4_FIRST && DC <= DppCtrl::DPP_UNUSED4_LAST) ||
5787 (DC >= DppCtrl::DPP_UNUSED5_FIRST && DC <= DppCtrl::DPP_UNUSED5_LAST) ||
5788 (DC >= DppCtrl::DPP_UNUSED6_FIRST && DC <= DppCtrl::DPP_UNUSED6_LAST) ||
5789 (DC >= DppCtrl::DPP_UNUSED7_FIRST && DC <= DppCtrl::DPP_UNUSED7_LAST) ||
5790 (DC >= DppCtrl::DPP_UNUSED8_FIRST && DC <= DppCtrl::DPP_UNUSED8_LAST)) {
5791 ErrInfo = "Invalid dpp_ctrl value";
5792 return false;
5793 }
5794 if (DC >= DppCtrl::WAVE_SHL1 && DC <= DppCtrl::WAVE_ROR1 &&
5795 !ST.hasDPPWavefrontShifts()) {
5796 ErrInfo = "Invalid dpp_ctrl value: "
5797 "wavefront shifts are not supported on GFX10+";
5798 return false;
5799 }
5800 if (DC >= DppCtrl::BCAST15 && DC <= DppCtrl::BCAST31 &&
5801 !ST.hasDPPBroadcasts()) {
5802 ErrInfo = "Invalid dpp_ctrl value: "
5803 "broadcasts are not supported on GFX10+";
5804 return false;
5805 }
5806 if (DC >= DppCtrl::ROW_SHARE_FIRST && DC <= DppCtrl::ROW_XMASK_LAST &&
5807 ST.getGeneration() < AMDGPUSubtarget::GFX10) {
5808 if (DC >= DppCtrl::ROW_NEWBCAST_FIRST &&
5809 DC <= DppCtrl::ROW_NEWBCAST_LAST &&
5810 !ST.hasGFX90AInsts()) {
5811 ErrInfo = "Invalid dpp_ctrl value: "
5812 "row_newbroadcast/row_share is not supported before "
5813 "GFX90A/GFX10";
5814 return false;
5815 }
5816 if (DC > DppCtrl::ROW_NEWBCAST_LAST || !ST.hasGFX90AInsts()) {
5817 ErrInfo = "Invalid dpp_ctrl value: "
5818 "row_share and row_xmask are not supported before GFX10";
5819 return false;
5820 }
5821 }
5822
5823 if (Opcode != AMDGPU::V_MOV_B64_DPP_PSEUDO &&
5824 !AMDGPU::isLegalDPALU_DPPControl(ST, DC) &&
5825 AMDGPU::isDPALU_DPP(OpDesc: Desc, MII: *this, ST)) {
5826 ErrInfo = "Invalid dpp_ctrl value: "
5827 "DP ALU dpp only support row_newbcast";
5828 return false;
5829 }
5830 }
5831
5832 if ((MI.mayStore() || MI.mayLoad()) && !isVGPRSpill(MI)) {
5833 const MachineOperand *Dst = getNamedOperand(MI, OperandName: AMDGPU::OpName::vdst);
5834 AMDGPU::OpName DataName =
5835 isDS(Opcode) ? AMDGPU::OpName::data0 : AMDGPU::OpName::vdata;
5836 const MachineOperand *Data = getNamedOperand(MI, OperandName: DataName);
5837 const MachineOperand *Data2 = getNamedOperand(MI, OperandName: AMDGPU::OpName::data1);
5838 if (Data && !Data->isReg())
5839 Data = nullptr;
5840
5841 if (ST.hasGFX90AInsts()) {
5842 if (Dst && Data && !Dst->isTied() && !Data->isTied() &&
5843 (RI.isAGPR(MRI, Reg: Dst->getReg()) != RI.isAGPR(MRI, Reg: Data->getReg()))) {
5844 ErrInfo = "Invalid register class: "
5845 "vdata and vdst should be both VGPR or AGPR";
5846 return false;
5847 }
5848 if (Data && Data2 &&
5849 (RI.isAGPR(MRI, Reg: Data->getReg()) != RI.isAGPR(MRI, Reg: Data2->getReg()))) {
5850 ErrInfo = "Invalid register class: "
5851 "both data operands should be VGPR or AGPR";
5852 return false;
5853 }
5854 } else {
5855 if ((Dst && RI.isAGPR(MRI, Reg: Dst->getReg())) ||
5856 (Data && RI.isAGPR(MRI, Reg: Data->getReg())) ||
5857 (Data2 && RI.isAGPR(MRI, Reg: Data2->getReg()))) {
5858 ErrInfo = "Invalid register class: "
5859 "agpr loads and stores not supported on this GPU";
5860 return false;
5861 }
5862 }
5863 }
5864
5865 if (ST.needsAlignedVGPRs()) {
5866 const auto isAlignedReg = [&MI, &MRI, this](AMDGPU::OpName OpName) -> bool {
5867 const MachineOperand *Op = getNamedOperand(MI, OperandName: OpName);
5868 if (!Op)
5869 return true;
5870 Register Reg = Op->getReg();
5871 if (Reg.isPhysical())
5872 return !(RI.getHWRegIndex(Reg) & 1);
5873 const TargetRegisterClass &RC = *MRI.getRegClass(Reg);
5874 return RI.getRegSizeInBits(RC) > 32 && RI.isProperlyAlignedRC(RC) &&
5875 !(RI.getChannelFromSubReg(SubReg: Op->getSubReg()) & 1);
5876 };
5877
5878 if (Opcode == AMDGPU::DS_GWS_INIT || Opcode == AMDGPU::DS_GWS_SEMA_BR ||
5879 Opcode == AMDGPU::DS_GWS_BARRIER) {
5880
5881 if (!isAlignedReg(AMDGPU::OpName::data0)) {
5882 ErrInfo = "Subtarget requires even aligned vector registers "
5883 "for DS_GWS instructions";
5884 return false;
5885 }
5886 }
5887
5888 if (isMIMG(MI)) {
5889 if (!isAlignedReg(AMDGPU::OpName::vaddr)) {
5890 ErrInfo = "Subtarget requires even aligned vector registers "
5891 "for vaddr operand of image instructions";
5892 return false;
5893 }
5894 }
5895 }
5896
5897 if (Opcode == AMDGPU::V_ACCVGPR_WRITE_B32_e64 && !ST.hasGFX90AInsts()) {
5898 const MachineOperand *Src = getNamedOperand(MI, OperandName: AMDGPU::OpName::src0);
5899 if (Src->isReg() && RI.isSGPRReg(MRI, Reg: Src->getReg())) {
5900 ErrInfo = "Invalid register class: "
5901 "v_accvgpr_write with an SGPR is not supported on this GPU";
5902 return false;
5903 }
5904 }
5905
5906 if (Desc.getOpcode() == AMDGPU::G_AMDGPU_WAVE_ADDRESS) {
5907 const MachineOperand &SrcOp = MI.getOperand(i: 1);
5908 if (!SrcOp.isReg() || SrcOp.getReg().isVirtual()) {
5909 ErrInfo = "pseudo expects only physical SGPRs";
5910 return false;
5911 }
5912 }
5913
5914 if (const MachineOperand *CPol = getNamedOperand(MI, OperandName: AMDGPU::OpName::cpol)) {
5915 if (CPol->getImm() & AMDGPU::CPol::SCAL) {
5916 if (!ST.hasScaleOffset()) {
5917 ErrInfo = "Subtarget does not support offset scaling";
5918 return false;
5919 }
5920 if (!AMDGPU::supportsScaleOffset(MII: *this, Opcode: MI.getOpcode())) {
5921 ErrInfo = "Instruction does not support offset scaling";
5922 return false;
5923 }
5924 }
5925 }
5926
5927 // See SIInstrInfo::isLegalGFX12PlusPackedMathFP32Operand for more
5928 // information.
5929 if (AMDGPU::isPackedFP32Inst(Opc: Opcode) && AMDGPU::isGFX12Plus(STI: ST)) {
5930 for (unsigned I = 0; I < 3; ++I) {
5931 if (!isLegalGFX12PlusPackedMathFP32Operand(MRI, MI, SrcN: I))
5932 return false;
5933 }
5934 }
5935
5936 if (ST.hasFlatScratchHiInB64InstHazard() && isSALU(MI) &&
5937 MI.readsRegister(Reg: AMDGPU::SRC_FLAT_SCRATCH_BASE_HI, TRI: nullptr)) {
5938 const MachineOperand *Dst = getNamedOperand(MI, OperandName: AMDGPU::OpName::sdst);
5939 if ((Dst && RI.getRegClassForReg(MRI, Reg: Dst->getReg()) ==
5940 &AMDGPU::SReg_64RegClass) ||
5941 Opcode == AMDGPU::S_BITCMP0_B64 || Opcode == AMDGPU::S_BITCMP1_B64) {
5942 ErrInfo = "Instruction cannot read flat_scratch_base_hi";
5943 return false;
5944 }
5945 }
5946
5947 return true;
5948}
5949
5950// It is more readable to list mapped opcodes on the same line.
5951// clang-format off
5952
5953unsigned SIInstrInfo::getVALUOp(const MachineInstr &MI) const {
5954 switch (MI.getOpcode()) {
5955 default: return AMDGPU::INSTRUCTION_LIST_END;
5956 case AMDGPU::REG_SEQUENCE: return AMDGPU::REG_SEQUENCE;
5957 case AMDGPU::COPY: return AMDGPU::COPY;
5958 case AMDGPU::PHI: return AMDGPU::PHI;
5959 case AMDGPU::INSERT_SUBREG: return AMDGPU::INSERT_SUBREG;
5960 case AMDGPU::WQM: return AMDGPU::WQM;
5961 case AMDGPU::SOFT_WQM: return AMDGPU::SOFT_WQM;
5962 case AMDGPU::STRICT_WWM: return AMDGPU::STRICT_WWM;
5963 case AMDGPU::STRICT_WQM: return AMDGPU::STRICT_WQM;
5964 case AMDGPU::S_MOV_B32: {
    const MachineRegisterInfo &MRI = MI.getMF()->getRegInfo();
    return MI.getOperand(1).isReg() ||
           RI.isAGPR(MRI, MI.getOperand(0).getReg()) ?
           AMDGPU::COPY : AMDGPU::V_MOV_B32_e32;
  }
  case AMDGPU::S_ADD_I32:
    return ST.hasAddNoCarryInsts() ? AMDGPU::V_ADD_U32_e64 : AMDGPU::V_ADD_CO_U32_e32;
  case AMDGPU::S_ADDC_U32:
    return AMDGPU::V_ADDC_U32_e32;
  case AMDGPU::S_SUB_I32:
    return ST.hasAddNoCarryInsts() ? AMDGPU::V_SUB_U32_e64 : AMDGPU::V_SUB_CO_U32_e32;
  // FIXME: These are not consistently handled, and selected when the carry is
  // used.
  case AMDGPU::S_ADD_U32:
    return AMDGPU::V_ADD_CO_U32_e32;
  case AMDGPU::S_SUB_U32:
    return AMDGPU::V_SUB_CO_U32_e32;
  case AMDGPU::S_ADD_U64_PSEUDO:
    return AMDGPU::V_ADD_U64_PSEUDO;
  case AMDGPU::S_SUB_U64_PSEUDO:
    return AMDGPU::V_SUB_U64_PSEUDO;
  case AMDGPU::S_SUBB_U32: return AMDGPU::V_SUBB_U32_e32;
  case AMDGPU::S_MUL_I32: return AMDGPU::V_MUL_LO_U32_e64;
  case AMDGPU::S_MUL_HI_U32: return AMDGPU::V_MUL_HI_U32_e64;
  case AMDGPU::S_MUL_HI_I32: return AMDGPU::V_MUL_HI_I32_e64;
  case AMDGPU::S_AND_B32: return AMDGPU::V_AND_B32_e64;
  case AMDGPU::S_OR_B32: return AMDGPU::V_OR_B32_e64;
  case AMDGPU::S_XOR_B32: return AMDGPU::V_XOR_B32_e64;
  case AMDGPU::S_XNOR_B32:
    return ST.hasDLInsts() ? AMDGPU::V_XNOR_B32_e64 : AMDGPU::INSTRUCTION_LIST_END;
  case AMDGPU::S_MIN_I32: return AMDGPU::V_MIN_I32_e64;
  case AMDGPU::S_MIN_U32: return AMDGPU::V_MIN_U32_e64;
  case AMDGPU::S_MAX_I32: return AMDGPU::V_MAX_I32_e64;
  case AMDGPU::S_MAX_U32: return AMDGPU::V_MAX_U32_e64;
  case AMDGPU::S_ASHR_I32: return AMDGPU::V_ASHR_I32_e32;
  case AMDGPU::S_ASHR_I64: return AMDGPU::V_ASHR_I64_e64;
  case AMDGPU::S_LSHL_B32: return AMDGPU::V_LSHL_B32_e32;
  case AMDGPU::S_LSHL_B64: return AMDGPU::V_LSHL_B64_e64;
  case AMDGPU::S_LSHR_B32: return AMDGPU::V_LSHR_B32_e32;
  case AMDGPU::S_LSHR_B64: return AMDGPU::V_LSHR_B64_e64;
  case AMDGPU::S_SEXT_I32_I8: return AMDGPU::V_BFE_I32_e64;
  case AMDGPU::S_SEXT_I32_I16: return AMDGPU::V_BFE_I32_e64;
  case AMDGPU::S_BFE_U32: return AMDGPU::V_BFE_U32_e64;
  case AMDGPU::S_BFE_I32: return AMDGPU::V_BFE_I32_e64;
  case AMDGPU::S_BFM_B32: return AMDGPU::V_BFM_B32_e64;
  case AMDGPU::S_BREV_B32: return AMDGPU::V_BFREV_B32_e32;
  case AMDGPU::S_NOT_B32: return AMDGPU::V_NOT_B32_e32;
  case AMDGPU::S_NOT_B64: return AMDGPU::V_NOT_B32_e32;
  case AMDGPU::S_CMP_EQ_I32: return AMDGPU::V_CMP_EQ_I32_e64;
  case AMDGPU::S_CMP_LG_I32: return AMDGPU::V_CMP_NE_I32_e64;
  case AMDGPU::S_CMP_GT_I32: return AMDGPU::V_CMP_GT_I32_e64;
  case AMDGPU::S_CMP_GE_I32: return AMDGPU::V_CMP_GE_I32_e64;
  case AMDGPU::S_CMP_LT_I32: return AMDGPU::V_CMP_LT_I32_e64;
  case AMDGPU::S_CMP_LE_I32: return AMDGPU::V_CMP_LE_I32_e64;
  case AMDGPU::S_CMP_EQ_U32: return AMDGPU::V_CMP_EQ_U32_e64;
  case AMDGPU::S_CMP_LG_U32: return AMDGPU::V_CMP_NE_U32_e64;
  case AMDGPU::S_CMP_GT_U32: return AMDGPU::V_CMP_GT_U32_e64;
  case AMDGPU::S_CMP_GE_U32: return AMDGPU::V_CMP_GE_U32_e64;
  case AMDGPU::S_CMP_LT_U32: return AMDGPU::V_CMP_LT_U32_e64;
  case AMDGPU::S_CMP_LE_U32: return AMDGPU::V_CMP_LE_U32_e64;
  case AMDGPU::S_CMP_EQ_U64: return AMDGPU::V_CMP_EQ_U64_e64;
  case AMDGPU::S_CMP_LG_U64: return AMDGPU::V_CMP_NE_U64_e64;
  case AMDGPU::S_BCNT1_I32_B32: return AMDGPU::V_BCNT_U32_B32_e64;
  case AMDGPU::S_FF1_I32_B32: return AMDGPU::V_FFBL_B32_e32;
  case AMDGPU::S_FLBIT_I32_B32: return AMDGPU::V_FFBH_U32_e32;
  case AMDGPU::S_FLBIT_I32: return AMDGPU::V_FFBH_I32_e64;
  case AMDGPU::S_CBRANCH_SCC0: return AMDGPU::S_CBRANCH_VCCZ;
  case AMDGPU::S_CBRANCH_SCC1: return AMDGPU::S_CBRANCH_VCCNZ;
  case AMDGPU::S_CVT_F32_I32: return AMDGPU::V_CVT_F32_I32_e64;
  case AMDGPU::S_CVT_F32_U32: return AMDGPU::V_CVT_F32_U32_e64;
  case AMDGPU::S_CVT_I32_F32: return AMDGPU::V_CVT_I32_F32_e64;
  case AMDGPU::S_CVT_U32_F32: return AMDGPU::V_CVT_U32_F32_e64;
  case AMDGPU::S_CVT_F32_F16:
  case AMDGPU::S_CVT_HI_F32_F16:
    return ST.useRealTrue16Insts() ? AMDGPU::V_CVT_F32_F16_t16_e64
                                   : AMDGPU::V_CVT_F32_F16_fake16_e64;
  case AMDGPU::S_CVT_F16_F32:
    return ST.useRealTrue16Insts() ? AMDGPU::V_CVT_F16_F32_t16_e64
                                   : AMDGPU::V_CVT_F16_F32_fake16_e64;
  case AMDGPU::S_CEIL_F32: return AMDGPU::V_CEIL_F32_e64;
  case AMDGPU::S_FLOOR_F32: return AMDGPU::V_FLOOR_F32_e64;
  case AMDGPU::S_TRUNC_F32: return AMDGPU::V_TRUNC_F32_e64;
  case AMDGPU::S_RNDNE_F32: return AMDGPU::V_RNDNE_F32_e64;
  case AMDGPU::S_CEIL_F16:
    return ST.useRealTrue16Insts() ? AMDGPU::V_CEIL_F16_t16_e64
                                   : AMDGPU::V_CEIL_F16_fake16_e64;
  case AMDGPU::S_FLOOR_F16:
    return ST.useRealTrue16Insts() ? AMDGPU::V_FLOOR_F16_t16_e64
                                   : AMDGPU::V_FLOOR_F16_fake16_e64;
  case AMDGPU::S_TRUNC_F16:
    return ST.useRealTrue16Insts() ? AMDGPU::V_TRUNC_F16_t16_e64
                                   : AMDGPU::V_TRUNC_F16_fake16_e64;
  case AMDGPU::S_RNDNE_F16:
    return ST.useRealTrue16Insts() ? AMDGPU::V_RNDNE_F16_t16_e64
                                   : AMDGPU::V_RNDNE_F16_fake16_e64;
  case AMDGPU::S_ADD_F32: return AMDGPU::V_ADD_F32_e64;
  case AMDGPU::S_SUB_F32: return AMDGPU::V_SUB_F32_e64;
  case AMDGPU::S_MIN_F32: return AMDGPU::V_MIN_F32_e64;
  case AMDGPU::S_MAX_F32: return AMDGPU::V_MAX_F32_e64;
  case AMDGPU::S_MINIMUM_F32: return AMDGPU::V_MINIMUM_F32_e64;
  case AMDGPU::S_MAXIMUM_F32: return AMDGPU::V_MAXIMUM_F32_e64;
  case AMDGPU::S_MUL_F32: return AMDGPU::V_MUL_F32_e64;
  case AMDGPU::S_ADD_F16:
    return ST.useRealTrue16Insts() ? AMDGPU::V_ADD_F16_t16_e64
                                   : AMDGPU::V_ADD_F16_fake16_e64;
  case AMDGPU::S_SUB_F16:
    return ST.useRealTrue16Insts() ? AMDGPU::V_SUB_F16_t16_e64
                                   : AMDGPU::V_SUB_F16_fake16_e64;
  case AMDGPU::S_MIN_F16:
    return ST.useRealTrue16Insts() ? AMDGPU::V_MIN_F16_t16_e64
                                   : AMDGPU::V_MIN_F16_fake16_e64;
  case AMDGPU::S_MAX_F16:
    return ST.useRealTrue16Insts() ? AMDGPU::V_MAX_F16_t16_e64
                                   : AMDGPU::V_MAX_F16_fake16_e64;
  case AMDGPU::S_MINIMUM_F16:
    return ST.useRealTrue16Insts() ? AMDGPU::V_MINIMUM_F16_t16_e64
                                   : AMDGPU::V_MINIMUM_F16_fake16_e64;
  case AMDGPU::S_MAXIMUM_F16:
    return ST.useRealTrue16Insts() ? AMDGPU::V_MAXIMUM_F16_t16_e64
                                   : AMDGPU::V_MAXIMUM_F16_fake16_e64;
  case AMDGPU::S_MUL_F16:
    return ST.useRealTrue16Insts() ? AMDGPU::V_MUL_F16_t16_e64
                                   : AMDGPU::V_MUL_F16_fake16_e64;
  case AMDGPU::S_CVT_PK_RTZ_F16_F32: return AMDGPU::V_CVT_PKRTZ_F16_F32_e64;
  case AMDGPU::S_FMAC_F32: return AMDGPU::V_FMAC_F32_e64;
  case AMDGPU::S_FMAC_F16:
    return ST.useRealTrue16Insts() ? AMDGPU::V_FMAC_F16_t16_e64
                                   : AMDGPU::V_FMAC_F16_fake16_e64;
  case AMDGPU::S_FMAMK_F32: return AMDGPU::V_FMAMK_F32;
  case AMDGPU::S_FMAAK_F32: return AMDGPU::V_FMAAK_F32;
  case AMDGPU::S_CMP_LT_F32: return AMDGPU::V_CMP_LT_F32_e64;
  case AMDGPU::S_CMP_EQ_F32: return AMDGPU::V_CMP_EQ_F32_e64;
  case AMDGPU::S_CMP_LE_F32: return AMDGPU::V_CMP_LE_F32_e64;
  case AMDGPU::S_CMP_GT_F32: return AMDGPU::V_CMP_GT_F32_e64;
  case AMDGPU::S_CMP_LG_F32: return AMDGPU::V_CMP_LG_F32_e64;
  case AMDGPU::S_CMP_GE_F32: return AMDGPU::V_CMP_GE_F32_e64;
  case AMDGPU::S_CMP_O_F32: return AMDGPU::V_CMP_O_F32_e64;
  case AMDGPU::S_CMP_U_F32: return AMDGPU::V_CMP_U_F32_e64;
  case AMDGPU::S_CMP_NGE_F32: return AMDGPU::V_CMP_NGE_F32_e64;
  case AMDGPU::S_CMP_NLG_F32: return AMDGPU::V_CMP_NLG_F32_e64;
  case AMDGPU::S_CMP_NGT_F32: return AMDGPU::V_CMP_NGT_F32_e64;
  case AMDGPU::S_CMP_NLE_F32: return AMDGPU::V_CMP_NLE_F32_e64;
  case AMDGPU::S_CMP_NEQ_F32: return AMDGPU::V_CMP_NEQ_F32_e64;
  case AMDGPU::S_CMP_NLT_F32: return AMDGPU::V_CMP_NLT_F32_e64;
  case AMDGPU::S_CMP_LT_F16:
    return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_LT_F16_t16_e64
                                   : AMDGPU::V_CMP_LT_F16_fake16_e64;
  case AMDGPU::S_CMP_EQ_F16:
    return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_EQ_F16_t16_e64
                                   : AMDGPU::V_CMP_EQ_F16_fake16_e64;
  case AMDGPU::S_CMP_LE_F16:
    return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_LE_F16_t16_e64
                                   : AMDGPU::V_CMP_LE_F16_fake16_e64;
  case AMDGPU::S_CMP_GT_F16:
    return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_GT_F16_t16_e64
                                   : AMDGPU::V_CMP_GT_F16_fake16_e64;
  case AMDGPU::S_CMP_LG_F16:
    return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_LG_F16_t16_e64
                                   : AMDGPU::V_CMP_LG_F16_fake16_e64;
  case AMDGPU::S_CMP_GE_F16:
    return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_GE_F16_t16_e64
                                   : AMDGPU::V_CMP_GE_F16_fake16_e64;
  case AMDGPU::S_CMP_O_F16:
    return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_O_F16_t16_e64
                                   : AMDGPU::V_CMP_O_F16_fake16_e64;
  case AMDGPU::S_CMP_U_F16:
    return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_U_F16_t16_e64
                                   : AMDGPU::V_CMP_U_F16_fake16_e64;
  case AMDGPU::S_CMP_NGE_F16:
    return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_NGE_F16_t16_e64
                                   : AMDGPU::V_CMP_NGE_F16_fake16_e64;
  case AMDGPU::S_CMP_NLG_F16:
    return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_NLG_F16_t16_e64
                                   : AMDGPU::V_CMP_NLG_F16_fake16_e64;
  case AMDGPU::S_CMP_NGT_F16:
    return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_NGT_F16_t16_e64
                                   : AMDGPU::V_CMP_NGT_F16_fake16_e64;
  case AMDGPU::S_CMP_NLE_F16:
    return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_NLE_F16_t16_e64
                                   : AMDGPU::V_CMP_NLE_F16_fake16_e64;
  case AMDGPU::S_CMP_NEQ_F16:
    return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_NEQ_F16_t16_e64
                                   : AMDGPU::V_CMP_NEQ_F16_fake16_e64;
  case AMDGPU::S_CMP_NLT_F16:
    return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_NLT_F16_t16_e64
                                   : AMDGPU::V_CMP_NLT_F16_fake16_e64;
  case AMDGPU::V_S_EXP_F32_e64: return AMDGPU::V_EXP_F32_e64;
  case AMDGPU::V_S_EXP_F16_e64:
    return ST.useRealTrue16Insts() ? AMDGPU::V_EXP_F16_t16_e64
                                   : AMDGPU::V_EXP_F16_fake16_e64;
  case AMDGPU::V_S_LOG_F32_e64: return AMDGPU::V_LOG_F32_e64;
  case AMDGPU::V_S_LOG_F16_e64:
    return ST.useRealTrue16Insts() ? AMDGPU::V_LOG_F16_t16_e64
                                   : AMDGPU::V_LOG_F16_fake16_e64;
  case AMDGPU::V_S_RCP_F32_e64: return AMDGPU::V_RCP_F32_e64;
  case AMDGPU::V_S_RCP_F16_e64:
    return ST.useRealTrue16Insts() ? AMDGPU::V_RCP_F16_t16_e64
                                   : AMDGPU::V_RCP_F16_fake16_e64;
  case AMDGPU::V_S_RSQ_F32_e64: return AMDGPU::V_RSQ_F32_e64;
  case AMDGPU::V_S_RSQ_F16_e64:
    return ST.useRealTrue16Insts() ? AMDGPU::V_RSQ_F16_t16_e64
                                   : AMDGPU::V_RSQ_F16_fake16_e64;
  case AMDGPU::V_S_SQRT_F32_e64: return AMDGPU::V_SQRT_F32_e64;
  case AMDGPU::V_S_SQRT_F16_e64:
    return ST.useRealTrue16Insts() ? AMDGPU::V_SQRT_F16_t16_e64
                                   : AMDGPU::V_SQRT_F16_fake16_e64;
  }
  llvm_unreachable(
      "Unexpected scalar opcode without corresponding vector one!");
}

// clang-format on

void SIInstrInfo::insertScratchExecCopy(MachineFunction &MF,
                                        MachineBasicBlock &MBB,
                                        MachineBasicBlock::iterator MBBI,
                                        const DebugLoc &DL, Register Reg,
                                        bool IsSCCLive,
                                        SlotIndexes *Indexes) const {
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const SIInstrInfo *TII = ST.getInstrInfo();
  const AMDGPU::LaneMaskConstants &LMC = AMDGPU::LaneMaskConstants::get(ST);
  if (IsSCCLive) {
    // Insert two move instructions, one to save the original value of EXEC
    // and the other to turn on all bits in EXEC. This is required because we
    // can't use the single-instruction S_OR_SAVEEXEC, which clobbers SCC.
    auto StoreExecMI = BuildMI(MBB, MBBI, DL, TII->get(LMC.MovOpc), Reg)
                           .addReg(LMC.ExecReg, RegState::Kill);
    auto FlipExecMI =
        BuildMI(MBB, MBBI, DL, TII->get(LMC.MovOpc), LMC.ExecReg).addImm(-1);
    if (Indexes) {
      Indexes->insertMachineInstrInMaps(*StoreExecMI);
      Indexes->insertMachineInstrInMaps(*FlipExecMI);
    }
  } else {
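    // SCC is dead here, so a single S_OR_SAVEEXEC with an all-ones source
    // both saves the original EXEC and turns on all bits in it.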
    auto SaveExec =
        BuildMI(MBB, MBBI, DL, TII->get(LMC.OrSaveExecOpc), Reg).addImm(-1);
    SaveExec->getOperand(3).setIsDead(); // Mark SCC as dead.
    if (Indexes)
      Indexes->insertMachineInstrInMaps(*SaveExec);
  }
}

void SIInstrInfo::restoreExec(MachineFunction &MF, MachineBasicBlock &MBB,
                              MachineBasicBlock::iterator MBBI,
                              const DebugLoc &DL, Register Reg,
                              SlotIndexes *Indexes) const {
  const AMDGPU::LaneMaskConstants &LMC = AMDGPU::LaneMaskConstants::get(ST);
  auto ExecRestoreMI = BuildMI(MBB, MBBI, DL, get(LMC.MovOpc), LMC.ExecReg)
                           .addReg(Reg, RegState::Kill);
  if (Indexes)
    Indexes->insertMachineInstrInMaps(*ExecRestoreMI);
}

MachineInstr *
SIInstrInfo::getWholeWaveFunctionSetup(MachineFunction &MF) const {
  assert(MF.getInfo<SIMachineFunctionInfo>()->isWholeWaveFunction() &&
         "Not a whole wave func");
  MachineBasicBlock &MBB = *MF.begin();
  for (MachineInstr &MI : MBB)
    if (MI.getOpcode() == AMDGPU::SI_WHOLE_WAVE_FUNC_SETUP ||
        MI.getOpcode() == AMDGPU::G_AMDGPU_WHOLE_WAVE_FUNC_SETUP)
      return &MI;

  llvm_unreachable("Couldn't find SI_WHOLE_WAVE_FUNC_SETUP instruction");
}

const TargetRegisterClass *SIInstrInfo::getOpRegClass(const MachineInstr &MI,
                                                      unsigned OpNo) const {
  const MCInstrDesc &Desc = get(MI.getOpcode());
  if (MI.isVariadic() || OpNo >= Desc.getNumOperands() ||
      Desc.operands()[OpNo].RegClass == -1) {
    Register Reg = MI.getOperand(OpNo).getReg();

    if (Reg.isVirtual()) {
      const MachineRegisterInfo &MRI = MI.getMF()->getRegInfo();
      return MRI.getRegClass(Reg);
    }
    return RI.getPhysRegBaseClass(Reg);
  }

  int16_t RegClass = getOpRegClassID(Desc.operands()[OpNo]);
  return RegClass < 0 ? nullptr : RI.getRegClass(RegClass);
}

void SIInstrInfo::legalizeOpWithMove(MachineInstr &MI, unsigned OpIdx) const {
  MachineBasicBlock::iterator I = MI;
  MachineBasicBlock *MBB = MI.getParent();
  MachineOperand &MO = MI.getOperand(OpIdx);
  MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
  unsigned RCID = getOpRegClassID(get(MI.getOpcode()).operands()[OpIdx]);
  const TargetRegisterClass *RC = RI.getRegClass(RCID);
  unsigned Size = RI.getRegSizeInBits(*RC);
  unsigned Opcode = (Size == 64) ? AMDGPU::V_MOV_B64_PSEUDO
                    : Size == 16 ? AMDGPU::V_MOV_B16_t16_e64
                                 : AMDGPU::V_MOV_B32_e32;
  if (MO.isReg())
    Opcode = AMDGPU::COPY;
  else if (RI.isSGPRClass(RC))
    Opcode = (Size == 64) ? AMDGPU::S_MOV_B64 : AMDGPU::S_MOV_B32;

  const TargetRegisterClass *VRC = RI.getEquivalentVGPRClass(RC);
  Register Reg = MRI.createVirtualRegister(VRC);
  DebugLoc DL = MBB->findDebugLoc(I);
  BuildMI(*MI.getParent(), I, DL, get(Opcode), Reg).add(MO);
  MO.ChangeToRegister(Reg, false);
}

unsigned SIInstrInfo::buildExtractSubReg(
    MachineBasicBlock::iterator MI, MachineRegisterInfo &MRI,
    const MachineOperand &SuperReg, const TargetRegisterClass *SuperRC,
    unsigned SubIdx, const TargetRegisterClass *SubRC) const {
  if (!SuperReg.getReg().isVirtual())
    return RI.getSubReg(SuperReg.getReg(), SubIdx);

  MachineBasicBlock *MBB = MI->getParent();
  const DebugLoc &DL = MI->getDebugLoc();
  Register SubReg = MRI.createVirtualRegister(SubRC);

  unsigned NewSubIdx = RI.composeSubRegIndices(SuperReg.getSubReg(), SubIdx);
  BuildMI(*MBB, MI, DL, get(TargetOpcode::COPY), SubReg)
      .addReg(SuperReg.getReg(), {}, NewSubIdx);
  return SubReg;
}

MachineOperand SIInstrInfo::buildExtractSubRegOrImm(
    MachineBasicBlock::iterator MII, MachineRegisterInfo &MRI,
    const MachineOperand &Op, const TargetRegisterClass *SuperRC,
    unsigned SubIdx, const TargetRegisterClass *SubRC) const {
  if (Op.isImm()) {
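    // A 64-bit immediate is split into 32-bit halves: sub0 selects the low
    // 32 bits of the value and sub1 the high 32 bits.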
    if (SubIdx == AMDGPU::sub0)
      return MachineOperand::CreateImm(static_cast<int32_t>(Op.getImm()));
    if (SubIdx == AMDGPU::sub1)
      return MachineOperand::CreateImm(static_cast<int32_t>(Op.getImm() >> 32));

    llvm_unreachable("Unhandled register index for immediate");
  }

  unsigned SubReg = buildExtractSubReg(MII, MRI, Op, SuperRC,
                                       SubIdx, SubRC);
  return MachineOperand::CreateReg(SubReg, false);
}

// Change the order of operands from (0, 1, 2) to (0, 2, 1)
void SIInstrInfo::swapOperands(MachineInstr &Inst) const {
  assert(Inst.getNumExplicitOperands() == 3);
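  // Removing operand 1 and re-appending it at the end reorders (0, 1, 2)
  // into (0, 2, 1).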
  MachineOperand Op1 = Inst.getOperand(1);
  Inst.removeOperand(1);
  Inst.addOperand(Op1);
}

bool SIInstrInfo::isLegalRegOperand(const MachineRegisterInfo &MRI,
                                    const MCOperandInfo &OpInfo,
                                    const MachineOperand &MO) const {
  if (!MO.isReg())
    return false;

  Register Reg = MO.getReg();

  const TargetRegisterClass *DRC = RI.getRegClass(getOpRegClassID(OpInfo));
  if (Reg.isPhysical())
    return DRC->contains(Reg);

  const TargetRegisterClass *RC = MRI.getRegClass(Reg);

  if (MO.getSubReg()) {
    const MachineFunction *MF = MO.getParent()->getMF();
    const TargetRegisterClass *SuperRC = RI.getLargestLegalSuperClass(RC, *MF);
    if (!SuperRC)
      return false;
    return RI.getMatchingSuperRegClass(SuperRC, DRC, MO.getSubReg()) != nullptr;
  }

  return RI.getCommonSubClass(DRC, RC) != nullptr;
}

bool SIInstrInfo::isLegalRegOperand(const MachineInstr &MI, unsigned OpIdx,
                                    const MachineOperand &MO) const {
  const MachineRegisterInfo &MRI = MI.getMF()->getRegInfo();
  const MCOperandInfo OpInfo = MI.getDesc().operands()[OpIdx];
  unsigned Opc = MI.getOpcode();

  // See SIInstrInfo::isLegalGFX12PlusPackedMathFP32Operand for more
  // information.
  if (AMDGPU::isPackedFP32Inst(MI.getOpcode()) && AMDGPU::isGFX12Plus(ST) &&
      MO.isReg() && RI.isSGPRReg(MRI, MO.getReg())) {
    constexpr AMDGPU::OpName OpNames[] = {
        AMDGPU::OpName::src0, AMDGPU::OpName::src1, AMDGPU::OpName::src2};

    for (auto [I, OpName] : enumerate(OpNames)) {
      int SrcIdx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), OpNames[I]);
      if (static_cast<unsigned>(SrcIdx) == OpIdx &&
          !isLegalGFX12PlusPackedMathFP32Operand(MRI, MI, I, &MO))
        return false;
    }
  }

  if (!isLegalRegOperand(MRI, OpInfo, MO))
    return false;

  // Check legality of an Accumulate GPR (AGPR) operand.
  bool IsAGPR = RI.isAGPR(MRI, MO.getReg());
  if (IsAGPR && !ST.hasMAIInsts())
    return false;
  if (IsAGPR && (!ST.hasGFX90AInsts() || !MRI.reservedRegsFrozen()) &&
      (MI.mayLoad() || MI.mayStore() || isDS(Opc) || isMIMG(Opc)))
    return false;
  // Atomics should have both vdst and vdata either vgpr or agpr.
  const int VDstIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdst);
  const int DataIdx = AMDGPU::getNamedOperandIdx(
      Opc, isDS(Opc) ? AMDGPU::OpName::data0 : AMDGPU::OpName::vdata);
  if ((int)OpIdx == VDstIdx && DataIdx != -1 &&
      MI.getOperand(DataIdx).isReg() &&
      RI.isAGPR(MRI, MI.getOperand(DataIdx).getReg()) != IsAGPR)
    return false;
  if ((int)OpIdx == DataIdx) {
    if (VDstIdx != -1 &&
        RI.isAGPR(MRI, MI.getOperand(VDstIdx).getReg()) != IsAGPR)
      return false;
    // DS instructions with 2 src operands also must have tied RC.
    const int Data1Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::data1);
    if (Data1Idx != -1 && MI.getOperand(Data1Idx).isReg() &&
        RI.isAGPR(MRI, MI.getOperand(Data1Idx).getReg()) != IsAGPR)
      return false;
  }

  // Check V_ACCVGPR_WRITE_B32_e64
  if (Opc == AMDGPU::V_ACCVGPR_WRITE_B32_e64 && !ST.hasGFX90AInsts() &&
      (int)OpIdx == AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0) &&
      RI.isSGPRReg(MRI, MO.getReg()))
    return false;

  if (ST.hasFlatScratchHiInB64InstHazard() &&
      MO.getReg() == AMDGPU::SRC_FLAT_SCRATCH_BASE_HI && isSALU(MI)) {
    if (const MachineOperand *Dst = getNamedOperand(MI, AMDGPU::OpName::sdst)) {
      if (AMDGPU::getRegBitWidth(*RI.getRegClassForReg(MRI, Dst->getReg())) ==
          64)
        return false;
    }
    if (Opc == AMDGPU::S_BITCMP0_B64 || Opc == AMDGPU::S_BITCMP1_B64)
      return false;
  }

  return true;
}

bool SIInstrInfo::isLegalVSrcOperand(const MachineRegisterInfo &MRI,
                                     const MCOperandInfo &OpInfo,
                                     const MachineOperand &MO) const {
  if (MO.isReg())
    return isLegalRegOperand(MRI, OpInfo, MO);

  // Handle non-register types that are treated like immediates.
  assert(MO.isImm() || MO.isTargetIndex() || MO.isFI() || MO.isGlobal());
  return true;
}

bool SIInstrInfo::isLegalGFX12PlusPackedMathFP32Operand(
    const MachineRegisterInfo &MRI, const MachineInstr &MI, unsigned SrcN,
    const MachineOperand *MO) const {
  constexpr unsigned NumOps = 3;
  constexpr AMDGPU::OpName OpNames[NumOps * 2] = {
      AMDGPU::OpName::src0,           AMDGPU::OpName::src1,
      AMDGPU::OpName::src2,           AMDGPU::OpName::src0_modifiers,
      AMDGPU::OpName::src1_modifiers, AMDGPU::OpName::src2_modifiers};

  assert(SrcN < NumOps);

  if (!MO) {
    int SrcIdx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), OpNames[SrcN]);
    if (SrcIdx == -1)
      return true;
    MO = &MI.getOperand(SrcIdx);
  }

  if (!MO->isReg() || !RI.isSGPRReg(MRI, MO->getReg()))
    return true;

  int ModsIdx =
      AMDGPU::getNamedOperandIdx(MI.getOpcode(), OpNames[NumOps + SrcN]);
  if (ModsIdx == -1)
    return true;

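  // On GFX12+, an SGPR source of a packed FP32 instruction is only legal
  // when neither op_sel nor op_sel_hi is set for that source.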
  unsigned Mods = MI.getOperand(ModsIdx).getImm();
  bool OpSel = Mods & SISrcMods::OP_SEL_0;
  bool OpSelHi = Mods & SISrcMods::OP_SEL_1;

  return !OpSel && !OpSelHi;
}

bool SIInstrInfo::isOperandLegal(const MachineInstr &MI, unsigned OpIdx,
                                 const MachineOperand *MO) const {
  const MachineFunction &MF = *MI.getMF();
  const MachineRegisterInfo &MRI = MF.getRegInfo();
  const MCInstrDesc &InstDesc = MI.getDesc();
  const MCOperandInfo &OpInfo = InstDesc.operands()[OpIdx];
  int64_t RegClass = getOpRegClassID(OpInfo);
  const TargetRegisterClass *DefinedRC =
      RegClass != -1 ? RI.getRegClass(RegClass) : nullptr;
  if (!MO)
    MO = &MI.getOperand(OpIdx);

  const bool IsInlineConst = !MO->isReg() && isInlineConstant(*MO, OpInfo);

  if (isVALU(MI) && !IsInlineConst && usesConstantBus(MRI, *MO, OpInfo)) {
    const MachineOperand *UsedLiteral = nullptr;

    int ConstantBusLimit = ST.getConstantBusLimit(MI.getOpcode());
    int LiteralLimit = !isVOP3(MI) || ST.hasVOP3Literal() ? 1 : 0;
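    // Each unique SGPR and each literal consumes constant-bus bandwidth;
    // older subtargets allow a single constant-bus use per instruction,
    // newer ones allow more.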

    // TODO: Be more permissive with frame indexes.
    if (!MO->isReg() && !isInlineConstant(*MO, OpInfo)) {
      if (!LiteralLimit--)
        return false;

      UsedLiteral = MO;
    }

    SmallDenseSet<RegSubRegPair> SGPRsUsed;
    if (MO->isReg())
      SGPRsUsed.insert(RegSubRegPair(MO->getReg(), MO->getSubReg()));

    for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
      if (i == OpIdx)
        continue;
      const MachineOperand &Op = MI.getOperand(i);
      if (Op.isReg()) {
        if (Op.isUse()) {
          RegSubRegPair SGPR(Op.getReg(), Op.getSubReg());
          if (regUsesConstantBus(Op, MRI) && SGPRsUsed.insert(SGPR).second) {
            if (--ConstantBusLimit <= 0)
              return false;
          }
        }
      } else if (AMDGPU::isSISrcOperand(InstDesc.operands()[i]) &&
                 !isInlineConstant(Op, InstDesc.operands()[i])) {
        // The same literal may be used multiple times.
        if (!UsedLiteral)
          UsedLiteral = &Op;
        else if (UsedLiteral->isIdenticalTo(Op))
          continue;

        if (!LiteralLimit--)
          return false;
        if (--ConstantBusLimit <= 0)
          return false;
      }
    }
  } else if (!IsInlineConst && !MO->isReg() && isSALU(MI)) {
    // There can be at most one literal operand, but it can be repeated.
    for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
      if (i == OpIdx)
        continue;
      const MachineOperand &Op = MI.getOperand(i);
      if (!Op.isReg() && !Op.isFI() && !Op.isRegMask() &&
          !isInlineConstant(Op, InstDesc.operands()[i]) &&
          !Op.isIdenticalTo(*MO))
        return false;

      // Do not fold a non-inlineable, non-register operand into an
      // instruction that already has a frame index. The frame index handling
      // code cannot cope with a frame index coexisting with another
      // non-register operand, unless that operand is an inlineable immediate.
      if (Op.isFI())
        return false;
    }
  } else if (IsInlineConst && ST.hasNoF16PseudoScalarTransInlineConstants() &&
             isF16PseudoScalarTrans(MI.getOpcode())) {
    return false;
  }

  if (MO->isReg()) {
    if (!DefinedRC)
      return OpInfo.OperandType == MCOI::OPERAND_UNKNOWN;
    return isLegalRegOperand(MI, OpIdx, *MO);
  }

  if (MO->isImm()) {
    uint64_t Imm = MO->getImm();
    bool Is64BitFPOp = OpInfo.OperandType == AMDGPU::OPERAND_REG_IMM_FP64;
    bool Is64BitOp = Is64BitFPOp ||
                     OpInfo.OperandType == AMDGPU::OPERAND_REG_IMM_INT64 ||
                     OpInfo.OperandType == AMDGPU::OPERAND_REG_IMM_V2INT32 ||
                     OpInfo.OperandType == AMDGPU::OPERAND_REG_IMM_V2FP32;
    if (Is64BitOp &&
        !AMDGPU::isInlinableLiteral64(Imm, ST.hasInv2PiInlineImm())) {
      if (!AMDGPU::isValid32BitLiteral(Imm, Is64BitFPOp) &&
          (!ST.has64BitLiterals() || InstDesc.getSize() != 4))
        return false;

      // FIXME: We can use sign extended 64-bit literals, but only for signed
      // operands. At the moment we do not know if an operand is signed.
      // Such an operand will be encoded as its low 32 bits and then either
      // correctly sign extended or incorrectly zero extended by HW.
      // If 64-bit literals are supported and the literal will be encoded
      // as full 64 bits we can still use it.
      if (!Is64BitFPOp && (int32_t)Imm < 0 &&
          (!ST.has64BitLiterals() || AMDGPU::isValid32BitLiteral(Imm, false)))
        return false;
    }
  }

  // Handle non-register types that are treated like immediates.
  assert(MO->isImm() || MO->isTargetIndex() || MO->isFI() || MO->isGlobal());

  if (!DefinedRC) {
    // This operand expects an immediate.
    return true;
  }

  return isImmOperandLegal(MI, OpIdx, *MO);
}

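// Conservatively identify VALU instructions that must never be co-issued on
// gfx940/gfx950-class targets: transcendentals, dot products, MFMAs, and the
// packed-math opcodes listed below.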
bool SIInstrInfo::isNeverCoissue(MachineInstr &MI) const {
  bool IsGFX950Only = ST.hasGFX950Insts();
  bool IsGFX940Only = ST.hasGFX940Insts();

  if (!IsGFX950Only && !IsGFX940Only)
    return false;

  if (!isVALU(MI))
    return false;

  // V_COS, V_EXP, V_RCP, etc.
  if (isTRANS(MI))
    return true;

  // DOT2, DOT2C, DOT4, etc.
  if (isDOT(MI))
    return true;

  // MFMA, SMFMA
  if (isMFMA(MI))
    return true;

  unsigned Opcode = MI.getOpcode();
  switch (Opcode) {
  case AMDGPU::V_CVT_PK_BF8_F32_e64:
  case AMDGPU::V_CVT_PK_FP8_F32_e64:
  case AMDGPU::V_MQSAD_PK_U16_U8_e64:
  case AMDGPU::V_MQSAD_U32_U8_e64:
  case AMDGPU::V_PK_ADD_F16:
  case AMDGPU::V_PK_ADD_F32:
  case AMDGPU::V_PK_ADD_I16:
  case AMDGPU::V_PK_ADD_U16:
  case AMDGPU::V_PK_ASHRREV_I16:
  case AMDGPU::V_PK_FMA_F16:
  case AMDGPU::V_PK_FMA_F32:
  case AMDGPU::V_PK_FMAC_F16_e32:
  case AMDGPU::V_PK_FMAC_F16_e64:
  case AMDGPU::V_PK_LSHLREV_B16:
  case AMDGPU::V_PK_LSHRREV_B16:
  case AMDGPU::V_PK_MAD_I16:
  case AMDGPU::V_PK_MAD_U16:
  case AMDGPU::V_PK_MAX_F16:
  case AMDGPU::V_PK_MAX_I16:
  case AMDGPU::V_PK_MAX_U16:
  case AMDGPU::V_PK_MIN_F16:
  case AMDGPU::V_PK_MIN_I16:
  case AMDGPU::V_PK_MIN_U16:
  case AMDGPU::V_PK_MOV_B32:
  case AMDGPU::V_PK_MUL_F16:
  case AMDGPU::V_PK_MUL_F32:
  case AMDGPU::V_PK_MUL_LO_U16:
  case AMDGPU::V_PK_SUB_I16:
  case AMDGPU::V_PK_SUB_U16:
  case AMDGPU::V_QSAD_PK_U16_U8_e64:
    return true;
  default:
    return false;
  }
}

void SIInstrInfo::legalizeOperandsVOP2(MachineRegisterInfo &MRI,
                                       MachineInstr &MI) const {
  unsigned Opc = MI.getOpcode();
  const MCInstrDesc &InstrDesc = get(Opc);

  int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0);
  MachineOperand &Src0 = MI.getOperand(Src0Idx);

  int Src1Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1);
  MachineOperand &Src1 = MI.getOperand(Src1Idx);

  // If there is an implicit SGPR use such as VCC use for v_addc_u32/v_subb_u32
  // we need to only have one constant bus use before GFX10.
  bool HasImplicitSGPR = findImplicitSGPRRead(MI);
  if (HasImplicitSGPR && ST.getConstantBusLimit(Opc) <= 1 && Src0.isReg() &&
      RI.isSGPRReg(MRI, Src0.getReg()))
    legalizeOpWithMove(MI, Src0Idx);

  // Special case: V_WRITELANE_B32 accepts only immediate or SGPR operands for
  // both the value to write (src0) and lane select (src1). Fix up non-SGPR
  // src0/src1 with V_READFIRSTLANE.
  if (Opc == AMDGPU::V_WRITELANE_B32) {
    const DebugLoc &DL = MI.getDebugLoc();
    if (Src0.isReg() && RI.isVGPR(MRI, Src0.getReg())) {
      Register Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
      BuildMI(*MI.getParent(), MI, DL, get(AMDGPU::V_READFIRSTLANE_B32), Reg)
          .add(Src0);
      Src0.ChangeToRegister(Reg, false);
    }
    if (Src1.isReg() && RI.isVGPR(MRI, Src1.getReg())) {
      Register Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
      const DebugLoc &DL = MI.getDebugLoc();
      BuildMI(*MI.getParent(), MI, DL, get(AMDGPU::V_READFIRSTLANE_B32), Reg)
          .add(Src1);
      Src1.ChangeToRegister(Reg, false);
    }
    return;
  }

  // Special case: V_FMAC_F32 and V_FMAC_F16 have src2.
  if (Opc == AMDGPU::V_FMAC_F32_e32 || Opc == AMDGPU::V_FMAC_F16_e32) {
    int Src2Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2);
    if (!RI.isVGPR(MRI, MI.getOperand(Src2Idx).getReg()))
      legalizeOpWithMove(MI, Src2Idx);
  }

  // src0 of a VOP2 instruction accepts all operand types, so we don't need to
  // check its legality. If src1 is already legal, we don't need to do anything.
  if (isLegalRegOperand(MRI, InstrDesc.operands()[Src1Idx], Src1))
    return;

  // Special case: V_READLANE_B32 accepts only immediate or SGPR operands for
  // lane select. Fix up using V_READFIRSTLANE, since we assume that the lane
  // select is uniform.
  if (Opc == AMDGPU::V_READLANE_B32 && Src1.isReg() &&
      RI.isVGPR(MRI, Src1.getReg())) {
    Register Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
    const DebugLoc &DL = MI.getDebugLoc();
    BuildMI(*MI.getParent(), MI, DL, get(AMDGPU::V_READFIRSTLANE_B32), Reg)
        .add(Src1);
    Src1.ChangeToRegister(Reg, false);
    return;
  }

  // We do not use commuteInstruction here because it is too aggressive and
  // will commute whenever possible. We only want to commute here if it
  // improves legality. This can be called a fairly large number of times, so
  // don't waste compile time pointlessly swapping and checking legality again.
  if (HasImplicitSGPR || !MI.isCommutable()) {
    legalizeOpWithMove(MI, Src1Idx);
    return;
  }

  // If src0 can be used as src1, commuting will make the operands legal.
  // Otherwise we have to give up and insert a move.
  //
  // TODO: Other immediate-like operand kinds could be commuted if there was a
  // MachineOperand::ChangeTo* for them.
  if ((!Src1.isImm() && !Src1.isReg()) ||
      !isLegalRegOperand(MRI, InstrDesc.operands()[Src1Idx], Src0)) {
    legalizeOpWithMove(MI, Src1Idx);
    return;
  }

  int CommutedOpc = commuteOpcode(MI);
  if (CommutedOpc == -1) {
    legalizeOpWithMove(MI, Src1Idx);
    return;
  }

  MI.setDesc(get(CommutedOpc));

  Register Src0Reg = Src0.getReg();
  unsigned Src0SubReg = Src0.getSubReg();
  bool Src0Kill = Src0.isKill();

  if (Src1.isImm())
    Src0.ChangeToImmediate(Src1.getImm());
  else if (Src1.isReg()) {
    Src0.ChangeToRegister(Src1.getReg(), false, false, Src1.isKill());
    Src0.setSubReg(Src1.getSubReg());
  } else
    llvm_unreachable("Should only have register or immediate operands");

  Src1.ChangeToRegister(Src0Reg, false, false, Src0Kill);
  Src1.setSubReg(Src0SubReg);
  fixImplicitOperands(MI);
}

// Legalize VOP3 operands. All operand types are supported for any operand,
// but only one literal constant may be used, and only starting from GFX10.
void SIInstrInfo::legalizeOperandsVOP3(MachineRegisterInfo &MRI,
                                       MachineInstr &MI) const {
  unsigned Opc = MI.getOpcode();

  int VOP3Idx[3] = {
    AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0),
    AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1),
    AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2)
  };

  if (Opc == AMDGPU::V_PERMLANE16_B32_e64 ||
      Opc == AMDGPU::V_PERMLANEX16_B32_e64 ||
      Opc == AMDGPU::V_PERMLANE_BCAST_B32_e64 ||
      Opc == AMDGPU::V_PERMLANE_UP_B32_e64 ||
      Opc == AMDGPU::V_PERMLANE_DOWN_B32_e64 ||
      Opc == AMDGPU::V_PERMLANE_XOR_B32_e64 ||
      Opc == AMDGPU::V_PERMLANE_IDX_GEN_B32_e64) {
    // src1 and src2 must be scalar
    MachineOperand &Src1 = MI.getOperand(VOP3Idx[1]);
    const DebugLoc &DL = MI.getDebugLoc();
    if (Src1.isReg() && !RI.isSGPRClass(MRI.getRegClass(Src1.getReg()))) {
      Register Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
      BuildMI(*MI.getParent(), MI, DL, get(AMDGPU::V_READFIRSTLANE_B32), Reg)
          .add(Src1);
      Src1.ChangeToRegister(Reg, false);
    }
    if (VOP3Idx[2] != -1) {
      MachineOperand &Src2 = MI.getOperand(VOP3Idx[2]);
      if (Src2.isReg() && !RI.isSGPRClass(MRI.getRegClass(Src2.getReg()))) {
        Register Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
        BuildMI(*MI.getParent(), MI, DL, get(AMDGPU::V_READFIRSTLANE_B32), Reg)
            .add(Src2);
        Src2.ChangeToRegister(Reg, false);
      }
    }
  }

  // Find the one SGPR operand we are allowed to use.
  int ConstantBusLimit = ST.getConstantBusLimit(Opc);
  int LiteralLimit = ST.hasVOP3Literal() ? 1 : 0;
  SmallDenseSet<unsigned> SGPRsUsed;
  Register SGPRReg = findUsedSGPR(MI, VOP3Idx);
  if (SGPRReg) {
    SGPRsUsed.insert(SGPRReg);
    --ConstantBusLimit;
  }

  for (int Idx : VOP3Idx) {
    if (Idx == -1)
      break;
    MachineOperand &MO = MI.getOperand(Idx);

    if (!MO.isReg()) {
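      // Non-register operand: an inline constant is always free; otherwise
      // spend one literal and one constant-bus slot if the budget allows,
      // and materialize the value in a VGPR if it does not.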
      if (isInlineConstant(MO, get(Opc).operands()[Idx]))
        continue;

      if (LiteralLimit > 0 && ConstantBusLimit > 0) {
        --LiteralLimit;
        --ConstantBusLimit;
        continue;
      }

      --LiteralLimit;
      --ConstantBusLimit;
      legalizeOpWithMove(MI, Idx);
      continue;
    }

    if (!RI.isSGPRClass(RI.getRegClassForReg(MRI, MO.getReg())))
      continue; // VGPRs are legal

    // We can use one SGPR in each VOP3 instruction prior to GFX10
    // and two starting from GFX10.
    if (SGPRsUsed.count(MO.getReg()))
      continue;
    if (ConstantBusLimit > 0) {
      SGPRsUsed.insert(MO.getReg());
      --ConstantBusLimit;
      continue;
    }

    // If we make it this far, then the operand is not legal and we must
    // legalize it.
    legalizeOpWithMove(MI, Idx);
  }

  // Special case: V_FMAC_F32 and V_FMAC_F16 have src2 tied to vdst.
  if ((Opc == AMDGPU::V_FMAC_F32_e64 || Opc == AMDGPU::V_FMAC_F16_e64) &&
      !RI.isVGPR(MRI, MI.getOperand(VOP3Idx[2]).getReg()))
    legalizeOpWithMove(MI, VOP3Idx[2]);

  // Fix the register class of packed FP32 instructions on gfx12+. See
  // SIInstrInfo::isLegalGFX12PlusPackedMathFP32Operand for more information.
  if (AMDGPU::isPackedFP32Inst(Opc) && AMDGPU::isGFX12Plus(ST)) {
    for (unsigned I = 0; I < 3; ++I) {
      if (!isLegalGFX12PlusPackedMathFP32Operand(MRI, MI, /*SrcN=*/I))
        legalizeOpWithMove(MI, VOP3Idx[I]);
    }
  }
}

Register SIInstrInfo::readlaneVGPRToSGPR(
    Register SrcReg, MachineInstr &UseMI, MachineRegisterInfo &MRI,
    const TargetRegisterClass *DstRC /*=nullptr*/) const {
  const TargetRegisterClass *VRC = MRI.getRegClass(SrcReg);
  const TargetRegisterClass *SRC = RI.getEquivalentSGPRClass(VRC);
  if (DstRC)
    SRC = RI.getCommonSubClass(SRC, DstRC);

  Register DstReg = MRI.createVirtualRegister(SRC);
  unsigned SubRegs = RI.getRegSizeInBits(*VRC) / 32;

  if (RI.hasAGPRs(VRC)) {
    VRC = RI.getEquivalentVGPRClass(VRC);
    Register NewSrcReg = MRI.createVirtualRegister(VRC);
    BuildMI(*UseMI.getParent(), UseMI, UseMI.getDebugLoc(),
            get(TargetOpcode::COPY), NewSrcReg)
        .addReg(SrcReg);
    SrcReg = NewSrcReg;
  }

  if (SubRegs == 1) {
    BuildMI(*UseMI.getParent(), UseMI, UseMI.getDebugLoc(),
            get(AMDGPU::V_READFIRSTLANE_B32), DstReg)
        .addReg(SrcReg);
    return DstReg;
  }

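  // Wider sources are read back 32 bits at a time with V_READFIRSTLANE and
  // reassembled into the SGPR tuple with a REG_SEQUENCE.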
  SmallVector<Register, 8> SRegs;
  for (unsigned i = 0; i < SubRegs; ++i) {
    Register SGPR = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
    BuildMI(*UseMI.getParent(), UseMI, UseMI.getDebugLoc(),
            get(AMDGPU::V_READFIRSTLANE_B32), SGPR)
        .addReg(SrcReg, {}, RI.getSubRegFromChannel(i));
    SRegs.push_back(SGPR);
  }

  MachineInstrBuilder MIB =
      BuildMI(*UseMI.getParent(), UseMI, UseMI.getDebugLoc(),
              get(AMDGPU::REG_SEQUENCE), DstReg);
  for (unsigned i = 0; i < SubRegs; ++i) {
    MIB.addReg(SRegs[i]);
    MIB.addImm(RI.getSubRegFromChannel(i));
  }
  return DstReg;
}

void SIInstrInfo::legalizeOperandsSMRD(MachineRegisterInfo &MRI,
                                       MachineInstr &MI) const {

  // If the pointer is stored in VGPRs, then we need to move it to
  // SGPRs using v_readfirstlane. This is safe because we only select
  // loads with uniform pointers to SMRD instructions, so we know the
  // pointer value is uniform.
  MachineOperand *SBase = getNamedOperand(MI, AMDGPU::OpName::sbase);
  if (SBase && !RI.isSGPRClass(MRI.getRegClass(SBase->getReg()))) {
    Register SGPR = readlaneVGPRToSGPR(SBase->getReg(), MI, MRI);
    SBase->setReg(SGPR);
  }
  MachineOperand *SOff = getNamedOperand(MI, AMDGPU::OpName::soffset);
  if (SOff && !RI.isSGPRReg(MRI, SOff->getReg())) {
    Register SGPR = readlaneVGPRToSGPR(SOff->getReg(), MI, MRI);
    SOff->setReg(SGPR);
  }
}

bool SIInstrInfo::moveFlatAddrToVGPR(MachineInstr &Inst) const {
  unsigned Opc = Inst.getOpcode();
  int OldSAddrIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::saddr);
  if (OldSAddrIdx < 0)
    return false;

  assert(isSegmentSpecificFLAT(Inst) || (isFLAT(Inst) && ST.hasFlatGVSMode()));

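  // For example, a GLOBAL_LOAD_DWORD_SADDR whose saddr operand turned out to
  // live in a VGPR can be rewritten to the plain vaddr form GLOBAL_LOAD_DWORD.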
  int NewOpc = AMDGPU::getGlobalVaddrOp(Opc);
  if (NewOpc < 0)
    NewOpc = AMDGPU::getFlatScratchInstSVfromSS(Opc);
  if (NewOpc < 0)
    return false;

  MachineRegisterInfo &MRI = Inst.getMF()->getRegInfo();
  MachineOperand &SAddr = Inst.getOperand(OldSAddrIdx);
  if (RI.isSGPRReg(MRI, SAddr.getReg()))
    return false;

  int NewVAddrIdx = AMDGPU::getNamedOperandIdx(NewOpc, AMDGPU::OpName::vaddr);
  if (NewVAddrIdx < 0)
    return false;

  int OldVAddrIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr);

  // Check vaddr, it shall be zero or absent.
  MachineInstr *VAddrDef = nullptr;
  if (OldVAddrIdx >= 0) {
    MachineOperand &VAddr = Inst.getOperand(OldVAddrIdx);
    VAddrDef = MRI.getUniqueVRegDef(VAddr.getReg());
    if (!VAddrDef || !VAddrDef->isMoveImmediate() ||
        !VAddrDef->getOperand(1).isImm() ||
        VAddrDef->getOperand(1).getImm() != 0)
      return false;
  }

  const MCInstrDesc &NewDesc = get(NewOpc);
  Inst.setDesc(NewDesc);

  // Callers expect iterator to be valid after this call, so modify the
  // instruction in place.
  if (OldVAddrIdx == NewVAddrIdx) {
    MachineOperand &NewVAddr = Inst.getOperand(NewVAddrIdx);
    // Clear use list from the old vaddr holding a zero register.
    MRI.removeRegOperandFromUseList(&NewVAddr);
    MRI.moveOperands(&NewVAddr, &SAddr, 1);
    Inst.removeOperand(OldSAddrIdx);
    // Update the use list with the pointer we have just moved from vaddr to
    // saddr position. Otherwise new vaddr will be missing from the use list.
    MRI.removeRegOperandFromUseList(&NewVAddr);
    MRI.addRegOperandToUseList(&NewVAddr);
  } else {
    assert(OldSAddrIdx == NewVAddrIdx);

    if (OldVAddrIdx >= 0) {
      int NewVDstIn = AMDGPU::getNamedOperandIdx(NewOpc,
                                                 AMDGPU::OpName::vdst_in);

      // removeOperand doesn't try to fix up tied operand indexes as it goes,
      // so it asserts. Untie the operands for now and retie them afterwards.
      if (NewVDstIn != -1) {
        int OldVDstIn = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdst_in);
        Inst.untieRegOperand(OldVDstIn);
      }

      Inst.removeOperand(OldVAddrIdx);

      if (NewVDstIn != -1) {
        int NewVDst = AMDGPU::getNamedOperandIdx(NewOpc, AMDGPU::OpName::vdst);
        Inst.tieOperands(NewVDst, NewVDstIn);
      }
    }
  }

  if (VAddrDef && MRI.use_nodbg_empty(VAddrDef->getOperand(0).getReg()))
    VAddrDef->eraseFromParent();

  return true;
}

// FIXME: Remove this when SelectionDAG is obsoleted.
void SIInstrInfo::legalizeOperandsFLAT(MachineRegisterInfo &MRI,
                                       MachineInstr &MI) const {
  if (!isSegmentSpecificFLAT(MI) && !ST.hasFlatGVSMode())
    return;

  // Fixup SGPR operands in VGPRs. We only select these when the DAG divergence
  // thinks they are uniform, so a readfirstlane should be valid.
  MachineOperand *SAddr = getNamedOperand(MI, AMDGPU::OpName::saddr);
  if (!SAddr || RI.isSGPRClass(MRI.getRegClass(SAddr->getReg())))
    return;

  if (moveFlatAddrToVGPR(MI))
    return;

  const TargetRegisterClass *DeclaredRC =
      getRegClass(MI.getDesc(), SAddr->getOperandNo());

  Register ToSGPR = readlaneVGPRToSGPR(SAddr->getReg(), MI, MRI, DeclaredRC);
  SAddr->setReg(ToSGPR);
}

void SIInstrInfo::legalizeGenericOperand(MachineBasicBlock &InsertMBB,
                                         MachineBasicBlock::iterator I,
                                         const TargetRegisterClass *DstRC,
                                         MachineOperand &Op,
                                         MachineRegisterInfo &MRI,
                                         const DebugLoc &DL) const {
  Register OpReg = Op.getReg();
  unsigned OpSubReg = Op.getSubReg();

  const TargetRegisterClass *OpRC = RI.getSubClassWithSubReg(
      RI.getRegClassForReg(MRI, OpReg), OpSubReg);

  // Check if operand is already the correct register class.
  if (DstRC == OpRC)
    return;

  Register DstReg = MRI.createVirtualRegister(DstRC);
  auto Copy =
      BuildMI(InsertMBB, I, DL, get(AMDGPU::COPY), DstReg).addReg(OpReg);
  Op.setReg(DstReg);

  MachineInstr *Def = MRI.getVRegDef(OpReg);
  if (!Def)
    return;

  // Try to eliminate the copy if it is copying an immediate value.
  if (Def->isMoveImmediate() && DstRC != &AMDGPU::VReg_1RegClass)
    foldImmediate(*Copy, *Def, OpReg, &MRI);

  bool ImpDef = Def->isImplicitDef();
  while (!ImpDef && Def && Def->isCopy()) {
    if (Def->getOperand(1).getReg().isPhysical())
      break;
    Def = MRI.getUniqueVRegDef(Def->getOperand(1).getReg());
    ImpDef = Def && Def->isImplicitDef();
  }
  if (!RI.isSGPRClass(DstRC) && !Copy->readsRegister(AMDGPU::EXEC, &RI) &&
      !ImpDef)
    Copy.addReg(AMDGPU::EXEC, RegState::Implicit);
}

// Emit the actual waterfall loop, executing the wrapped instruction for each
// unique value of \p ScalarOps across all lanes. In the best case we execute
// one iteration; in the worst case we execute once per lane (e.g. 64 times
// for a wave64).
static void
emitLoadScalarOpsFromVGPRLoop(const SIInstrInfo &TII,
                              MachineRegisterInfo &MRI,
                              MachineBasicBlock &LoopBB,
                              MachineBasicBlock &BodyBB,
                              const DebugLoc &DL,
                              ArrayRef<MachineOperand *> ScalarOps) {
  MachineFunction &MF = *LoopBB.getParent();
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const SIRegisterInfo *TRI = ST.getRegisterInfo();
  const AMDGPU::LaneMaskConstants &LMC = AMDGPU::LaneMaskConstants::get(ST);
  const auto *BoolXExecRC = TRI->getWaveMaskRegClass();

  MachineBasicBlock::iterator I = LoopBB.begin();
  Register CondReg;

  for (MachineOperand *ScalarOp : ScalarOps) {
    unsigned RegSize = TRI->getRegSizeInBits(ScalarOp->getReg(), MRI);
    unsigned NumSubRegs = RegSize / 32;
    Register VScalarOp = ScalarOp->getReg();

    if (NumSubRegs == 1) {
      Register CurReg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);

      BuildMI(LoopBB, I, DL, TII.get(AMDGPU::V_READFIRSTLANE_B32), CurReg)
          .addReg(VScalarOp);

      Register NewCondReg = MRI.createVirtualRegister(BoolXExecRC);

      BuildMI(LoopBB, I, DL, TII.get(AMDGPU::V_CMP_EQ_U32_e64), NewCondReg)
          .addReg(CurReg)
          .addReg(VScalarOp);

      // Combine the comparison results with AND.
      if (!CondReg) // First.
        CondReg = NewCondReg;
      else { // If not the first, we create an AND.
        Register AndReg = MRI.createVirtualRegister(BoolXExecRC);
        BuildMI(LoopBB, I, DL, TII.get(LMC.AndOpc), AndReg)
            .addReg(CondReg)
            .addReg(NewCondReg);
        CondReg = AndReg;
      }

      // Update ScalarOp operand to use the SGPR ScalarOp.
      ScalarOp->setReg(CurReg);
      ScalarOp->setIsKill();
    } else {
      SmallVector<Register, 8> ReadlanePieces;
      RegState VScalarOpUndef = getUndefRegState(ScalarOp->isUndef());
      assert(NumSubRegs % 2 == 0 && NumSubRegs <= 32 &&
             "Unhandled register size");

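      // Process wider sources 64 bits at a time: readfirstlane two 32-bit
      // halves per iteration, then compare the pair against the corresponding
      // 64-bit slice of the VGPR value with a single compare.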
      for (unsigned Idx = 0; Idx < NumSubRegs; Idx += 2) {
        Register CurRegLo =
            MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
        Register CurRegHi =
            MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);

        // Read the next variant <- also loop target.
        BuildMI(LoopBB, I, DL, TII.get(AMDGPU::V_READFIRSTLANE_B32), CurRegLo)
            .addReg(VScalarOp, VScalarOpUndef, TRI->getSubRegFromChannel(Idx));

        // Read the next variant <- also loop target.
        BuildMI(LoopBB, I, DL, TII.get(AMDGPU::V_READFIRSTLANE_B32), CurRegHi)
            .addReg(VScalarOp, VScalarOpUndef,
                    TRI->getSubRegFromChannel(Idx + 1));

        ReadlanePieces.push_back(CurRegLo);
        ReadlanePieces.push_back(CurRegHi);

        // Comparison is to be done as 64-bit.
        Register CurReg = MRI.createVirtualRegister(&AMDGPU::SGPR_64RegClass);
        BuildMI(LoopBB, I, DL, TII.get(AMDGPU::REG_SEQUENCE), CurReg)
            .addReg(CurRegLo)
            .addImm(AMDGPU::sub0)
            .addReg(CurRegHi)
            .addImm(AMDGPU::sub1);

        Register NewCondReg = MRI.createVirtualRegister(BoolXExecRC);
        auto Cmp = BuildMI(LoopBB, I, DL, TII.get(AMDGPU::V_CMP_EQ_U64_e64),
                           NewCondReg)
                       .addReg(CurReg);
        if (NumSubRegs <= 2)
          Cmp.addReg(VScalarOp);
        else
          Cmp.addReg(VScalarOp, VScalarOpUndef,
                     TRI->getSubRegFromChannel(Idx, 2));

        // Combine the comparison results with AND.
        if (!CondReg) // First.
          CondReg = NewCondReg;
        else { // If not the first, we create an AND.
          Register AndReg = MRI.createVirtualRegister(BoolXExecRC);
          BuildMI(LoopBB, I, DL, TII.get(LMC.AndOpc), AndReg)
              .addReg(CondReg)
              .addReg(NewCondReg);
          CondReg = AndReg;
        }
      } // End for loop.

      const auto *SScalarOpRC =
          TRI->getEquivalentSGPRClass(MRI.getRegClass(VScalarOp));
      Register SScalarOp = MRI.createVirtualRegister(SScalarOpRC);

      // Build scalar ScalarOp.
      auto Merge =
          BuildMI(LoopBB, I, DL, TII.get(AMDGPU::REG_SEQUENCE), SScalarOp);
      unsigned Channel = 0;
      for (Register Piece : ReadlanePieces) {
        Merge.addReg(Piece).addImm(TRI->getSubRegFromChannel(Channel++));
      }

      // Update ScalarOp operand to use the SGPR ScalarOp.
      ScalarOp->setReg(SScalarOp);
      ScalarOp->setIsKill();
    }
  }

  Register SaveExec = MRI.createVirtualRegister(BoolXExecRC);
  MRI.setSimpleHint(SaveExec, CondReg);

  // Update EXEC to matching lanes, saving original to SaveExec.
  BuildMI(LoopBB, I, DL, TII.get(LMC.AndSaveExecOpc), SaveExec)
      .addReg(CondReg, RegState::Kill);

  // The original instruction is here; we insert the terminators after it.
  I = BodyBB.end();

  // Update EXEC, switch all done bits to 0 and all todo bits to 1.
  BuildMI(BodyBB, I, DL, TII.get(LMC.XorTermOpc), LMC.ExecReg)
      .addReg(LMC.ExecReg)
      .addReg(SaveExec);

  BuildMI(BodyBB, I, DL, TII.get(AMDGPU::SI_WATERFALL_LOOP)).addMBB(&LoopBB);
}

// Build a waterfall loop around \p MI, replacing the VGPR registers in
// \p ScalarOps with SGPRs by iterating over all unique values across all
// lanes. Returns the basic block that now contains \p MI.
static MachineBasicBlock *
loadMBUFScalarOperandsFromVGPR(const SIInstrInfo &TII, MachineInstr &MI,
                               ArrayRef<MachineOperand *> ScalarOps,
                               MachineDominatorTree *MDT,
                               MachineBasicBlock::iterator Begin = nullptr,
                               MachineBasicBlock::iterator End = nullptr) {
  MachineBasicBlock &MBB = *MI.getParent();
  MachineFunction &MF = *MBB.getParent();
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const SIRegisterInfo *TRI = ST.getRegisterInfo();
  MachineRegisterInfo &MRI = MF.getRegInfo();
  if (!Begin.isValid())
    Begin = &MI;
  if (!End.isValid()) {
    End = &MI;
    ++End;
  }
  const DebugLoc &DL = MI.getDebugLoc();
  const AMDGPU::LaneMaskConstants &LMC = AMDGPU::LaneMaskConstants::get(ST);
  const auto *BoolXExecRC = TRI->getWaveMaskRegClass();

  // Save SCC: the waterfall loop may overwrite it.
  Register SaveSCCReg;

  // FIXME: We should maintain SCC liveness while doing the FixSGPRCopies walk
  // rather than doing an unlimited scan everywhere.
  bool SCCNotDead =
      MBB.computeRegisterLiveness(TRI, AMDGPU::SCC, MI,
                                  std::numeric_limits<unsigned>::max()) !=
      MachineBasicBlock::LQR_Dead;
  if (SCCNotDead) {
    SaveSCCReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
    BuildMI(MBB, Begin, DL, TII.get(AMDGPU::S_CSELECT_B32), SaveSCCReg)
        .addImm(1)
        .addImm(0);
  }

  Register SaveExec = MRI.createVirtualRegister(BoolXExecRC);

  // Save the EXEC mask
  BuildMI(MBB, Begin, DL, TII.get(LMC.MovOpc), SaveExec).addReg(LMC.ExecReg);

  // Killed uses in the instruction we are waterfalling around will be
  // incorrect due to the added control-flow.
  MachineBasicBlock::iterator AfterMI = MI;
  ++AfterMI;
  for (auto I = Begin; I != AfterMI; I++) {
    for (auto &MO : I->all_uses())
      MRI.clearKillFlags(MO.getReg());
  }

  // To insert the loop we need to split the block. Move everything after this
  // point to a new block, and insert a new empty block between the two.
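  // The resulting control flow is:
  //
  //   MBB -> LoopBB -> BodyBB -> RemainderBB
  //            ^          |
  //            +----------+
  //
  // BodyBB branches back to LoopBB while any lane still holds a non-matching
  // scalar value, and otherwise falls through to RemainderBB.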
  MachineBasicBlock *LoopBB = MF.CreateMachineBasicBlock();
  MachineBasicBlock *BodyBB = MF.CreateMachineBasicBlock();
  MachineBasicBlock *RemainderBB = MF.CreateMachineBasicBlock();
  MachineFunction::iterator MBBI(MBB);
  ++MBBI;

  MF.insert(MBBI, LoopBB);
  MF.insert(MBBI, BodyBB);
  MF.insert(MBBI, RemainderBB);

  LoopBB->addSuccessor(BodyBB);
  BodyBB->addSuccessor(LoopBB);
  BodyBB->addSuccessor(RemainderBB);

  // Move the instructions from Begin up to and including MI into BodyBB, and
  // the remainder of the block into RemainderBB.
7270 RemainderBB->transferSuccessorsAndUpdatePHIs(FromMBB: &MBB);
7271 RemainderBB->splice(Where: RemainderBB->begin(), Other: &MBB, From: End, To: MBB.end());
7272 BodyBB->splice(Where: BodyBB->begin(), Other: &MBB, From: Begin, To: MBB.end());
7273
7274 MBB.addSuccessor(Succ: LoopBB);
7275
7276 // Update dominators. We know that MBB immediately dominates LoopBB, that
7277 // LoopBB immediately dominates BodyBB, and BodyBB immediately dominates
7278 // RemainderBB. RemainderBB immediately dominates all of the successors
7279 // transferred to it from MBB that MBB used to properly dominate.
7280 if (MDT) {
7281 MDT->addNewBlock(BB: LoopBB, DomBB: &MBB);
7282 MDT->addNewBlock(BB: BodyBB, DomBB: LoopBB);
7283 MDT->addNewBlock(BB: RemainderBB, DomBB: BodyBB);
7284 for (auto &Succ : RemainderBB->successors()) {
7285 if (MDT->properlyDominates(A: &MBB, B: Succ)) {
7286 MDT->changeImmediateDominator(BB: Succ, NewBB: RemainderBB);
7287 }
7288 }
7289 }
7290
7291 emitLoadScalarOpsFromVGPRLoop(TII, MRI, LoopBB&: *LoopBB, BodyBB&: *BodyBB, DL, ScalarOps);
7292
7293 MachineBasicBlock::iterator First = RemainderBB->begin();
7294 // Restore SCC
7295 if (SCCNotDead) {
7296 BuildMI(BB&: *RemainderBB, I: First, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::S_CMP_LG_U32))
7297 .addReg(RegNo: SaveSCCReg, Flags: RegState::Kill)
7298 .addImm(Val: 0);
7299 }
7300
7301 // Restore the EXEC mask
7302 BuildMI(BB&: *RemainderBB, I: First, MIMD: DL, MCID: TII.get(Opcode: LMC.MovOpc), DestReg: LMC.ExecReg)
7303 .addReg(RegNo: SaveExec);
7304 return BodyBB;
7305}
7306
7307// Extract pointer from Rsrc and return a zero-value Rsrc replacement.
7308static std::tuple<unsigned, unsigned>
7309extractRsrcPtr(const SIInstrInfo &TII, MachineInstr &MI, MachineOperand &Rsrc) {
7310 MachineBasicBlock &MBB = *MI.getParent();
7311 MachineFunction &MF = *MBB.getParent();
7312 MachineRegisterInfo &MRI = MF.getRegInfo();
7313
7314 // Extract the ptr from the resource descriptor.
7315 unsigned RsrcPtr =
7316 TII.buildExtractSubReg(MI, MRI, SuperReg: Rsrc, SuperRC: &AMDGPU::VReg_128RegClass,
7317 SubIdx: AMDGPU::sub0_sub1, SubRC: &AMDGPU::VReg_64RegClass);
7318
7319 // Create an empty resource descriptor
7320 Register Zero64 = MRI.createVirtualRegister(RegClass: &AMDGPU::SReg_64RegClass);
7321 Register SRsrcFormatLo = MRI.createVirtualRegister(RegClass: &AMDGPU::SGPR_32RegClass);
7322 Register SRsrcFormatHi = MRI.createVirtualRegister(RegClass: &AMDGPU::SGPR_32RegClass);
7323 Register NewSRsrc = MRI.createVirtualRegister(RegClass: &AMDGPU::SGPR_128RegClass);
7324 uint64_t RsrcDataFormat = TII.getDefaultRsrcDataFormat();
7325
7326 // Zero64 = 0
7327 BuildMI(BB&: MBB, I&: MI, MIMD: MI.getDebugLoc(), MCID: TII.get(Opcode: AMDGPU::S_MOV_B64), DestReg: Zero64)
7328 .addImm(Val: 0);
7329
7330 // SRsrcFormatLo = RSRC_DATA_FORMAT{31-0}
7331 BuildMI(BB&: MBB, I&: MI, MIMD: MI.getDebugLoc(), MCID: TII.get(Opcode: AMDGPU::S_MOV_B32), DestReg: SRsrcFormatLo)
7332 .addImm(Val: Lo_32(Value: RsrcDataFormat));
7333
7334 // SRsrcFormatHi = RSRC_DATA_FORMAT{63-32}
7335 BuildMI(BB&: MBB, I&: MI, MIMD: MI.getDebugLoc(), MCID: TII.get(Opcode: AMDGPU::S_MOV_B32), DestReg: SRsrcFormatHi)
7336 .addImm(Val: Hi_32(Value: RsrcDataFormat));
7337
7338 // NewSRsrc = {Zero64, SRsrcFormat}
7339 BuildMI(BB&: MBB, I&: MI, MIMD: MI.getDebugLoc(), MCID: TII.get(Opcode: AMDGPU::REG_SEQUENCE), DestReg: NewSRsrc)
7340 .addReg(RegNo: Zero64)
7341 .addImm(Val: AMDGPU::sub0_sub1)
7342 .addReg(RegNo: SRsrcFormatLo)
7343 .addImm(Val: AMDGPU::sub2)
7344 .addReg(RegNo: SRsrcFormatHi)
7345 .addImm(Val: AMDGPU::sub3);
7346
7347 return std::tuple(RsrcPtr, NewSRsrc);
7348}
7349
7350MachineBasicBlock *
7351SIInstrInfo::legalizeOperands(MachineInstr &MI,
7352 MachineDominatorTree *MDT) const {
7353 MachineFunction &MF = *MI.getMF();
7354 MachineRegisterInfo &MRI = MF.getRegInfo();
7355 MachineBasicBlock *CreatedBB = nullptr;
7356
7357 // Legalize VOP2
7358 if (isVOP2(MI) || isVOPC(MI)) {
7359 legalizeOperandsVOP2(MRI, MI);
7360 return CreatedBB;
7361 }
7362
7363 // Legalize VOP3
7364 if (isVOP3(MI)) {
7365 legalizeOperandsVOP3(MRI, MI);
7366 return CreatedBB;
7367 }
7368
7369 // Legalize SMRD
7370 if (isSMRD(MI)) {
7371 legalizeOperandsSMRD(MRI, MI);
7372 return CreatedBB;
7373 }
7374
7375 // Legalize FLAT
7376 if (isFLAT(MI)) {
7377 legalizeOperandsFLAT(MRI, MI);
7378 return CreatedBB;
7379 }
7380
7381 // Legalize PHI
7382 // The register class of the operands must be the same type as the register
7383 // class of the output.
7384 if (MI.getOpcode() == AMDGPU::PHI) {
7385 const TargetRegisterClass *VRC = getOpRegClass(MI, OpNo: 0);
7386 assert(!RI.isSGPRClass(VRC));
7387
7388 // Update all the operands so they have the same type.
7389 for (unsigned I = 1, E = MI.getNumOperands(); I != E; I += 2) {
7390 MachineOperand &Op = MI.getOperand(i: I);
7391 if (!Op.isReg() || !Op.getReg().isVirtual())
7392 continue;
7393
7394 // MI is a PHI instruction.
7395 MachineBasicBlock *InsertBB = MI.getOperand(i: I + 1).getMBB();
7396 MachineBasicBlock::iterator Insert = InsertBB->getFirstTerminator();
7397
7398 // Avoid creating no-op copies with the same src and dst reg class. These
7399 // confuse some of the machine passes.
7400 legalizeGenericOperand(InsertMBB&: *InsertBB, I: Insert, DstRC: VRC, Op, MRI, DL: MI.getDebugLoc());
7401 }
7402 }
7403
7404 // REG_SEQUENCE doesn't really require operand legalization, but if one has a
7405 // VGPR dest type and SGPR sources, insert copies so all operands are
7406 // VGPRs. This seems to help operand folding / the register coalescer.
7407 if (MI.getOpcode() == AMDGPU::REG_SEQUENCE) {
7408 MachineBasicBlock *MBB = MI.getParent();
7409 const TargetRegisterClass *DstRC = getOpRegClass(MI, OpNo: 0);
7410 if (RI.hasVGPRs(RC: DstRC)) {
7411 // Update all the operands so they are VGPR register classes. These may
7412 // not be the same register class because REG_SEQUENCE supports mixing
7413 // subregister index types, e.g. sub0_sub1 + sub2 + sub3.
7414 for (unsigned I = 1, E = MI.getNumOperands(); I != E; I += 2) {
7415 MachineOperand &Op = MI.getOperand(i: I);
7416 if (!Op.isReg() || !Op.getReg().isVirtual())
7417 continue;
7418
7419 const TargetRegisterClass *OpRC = MRI.getRegClass(Reg: Op.getReg());
7420 const TargetRegisterClass *VRC = RI.getEquivalentVGPRClass(SRC: OpRC);
7421 if (VRC == OpRC)
7422 continue;
7423
7424 legalizeGenericOperand(InsertMBB&: *MBB, I: MI, DstRC: VRC, Op, MRI, DL: MI.getDebugLoc());
7425 Op.setIsKill();
7426 }
7427 }
7428
7429 return CreatedBB;
7430 }
7431
7432 // Legalize INSERT_SUBREG
7433 // src0 must have the same register class as dst
7434 if (MI.getOpcode() == AMDGPU::INSERT_SUBREG) {
7435 Register Dst = MI.getOperand(i: 0).getReg();
7436 Register Src0 = MI.getOperand(i: 1).getReg();
7437 const TargetRegisterClass *DstRC = MRI.getRegClass(Reg: Dst);
7438 const TargetRegisterClass *Src0RC = MRI.getRegClass(Reg: Src0);
7439 if (DstRC != Src0RC) {
7440 MachineBasicBlock *MBB = MI.getParent();
7441 MachineOperand &Op = MI.getOperand(i: 1);
7442 legalizeGenericOperand(InsertMBB&: *MBB, I: MI, DstRC, Op, MRI, DL: MI.getDebugLoc());
7443 }
7444 return CreatedBB;
7445 }
7446
7447 // Legalize SI_INIT_M0
7448 if (MI.getOpcode() == AMDGPU::SI_INIT_M0) {
7449 MachineOperand &Src = MI.getOperand(i: 0);
7450 if (Src.isReg() && RI.hasVectorRegisters(RC: MRI.getRegClass(Reg: Src.getReg())))
7451 Src.setReg(readlaneVGPRToSGPR(SrcReg: Src.getReg(), UseMI&: MI, MRI));
7452 return CreatedBB;
7453 }
7454
7455 // Legalize S_BITREPLICATE, S_QUADMASK and S_WQM
7456 if (MI.getOpcode() == AMDGPU::S_BITREPLICATE_B64_B32 ||
7457 MI.getOpcode() == AMDGPU::S_QUADMASK_B32 ||
7458 MI.getOpcode() == AMDGPU::S_QUADMASK_B64 ||
7459 MI.getOpcode() == AMDGPU::S_WQM_B32 ||
7460 MI.getOpcode() == AMDGPU::S_WQM_B64 ||
7461 MI.getOpcode() == AMDGPU::S_INVERSE_BALLOT_U32 ||
7462 MI.getOpcode() == AMDGPU::S_INVERSE_BALLOT_U64) {
7463 MachineOperand &Src = MI.getOperand(i: 1);
7464 if (Src.isReg() && RI.hasVectorRegisters(RC: MRI.getRegClass(Reg: Src.getReg())))
7465 Src.setReg(readlaneVGPRToSGPR(SrcReg: Src.getReg(), UseMI&: MI, MRI));
7466 return CreatedBB;
7467 }
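// In each of these cases a VGPR source is made uniform via a readfirstlane;
// e.g. (a sketch, with made-up names): %ssrc:sreg_32 = V_READFIRSTLANE_B32 %vsrc:vgpr_32.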
7468
7469 // Legalize MIMG/VIMAGE/VSAMPLE and MUBUF/MTBUF for shaders.
7470 //
7471 // Shaders only generate MUBUF/MTBUF instructions via intrinsics or via
7472 // scratch memory access. In both cases, the legalization never involves
7473 // conversion to the addr64 form.
7474 if (isImage(MI) || (AMDGPU::isGraphics(CC: MF.getFunction().getCallingConv()) &&
7475 (isMUBUF(MI) || isMTBUF(MI)))) {
7476 AMDGPU::OpName RSrcOpName = (isVIMAGE(MI) || isVSAMPLE(MI))
7477 ? AMDGPU::OpName::rsrc
7478 : AMDGPU::OpName::srsrc;
7479 MachineOperand *SRsrc = getNamedOperand(MI, OperandName: RSrcOpName);
7480 if (SRsrc && !RI.isSGPRClass(RC: MRI.getRegClass(Reg: SRsrc->getReg())))
7481 CreatedBB = loadMBUFScalarOperandsFromVGPR(TII: *this, MI, ScalarOps: {SRsrc}, MDT);
7482
7483 AMDGPU::OpName SampOpName =
7484 isMIMG(MI) ? AMDGPU::OpName::ssamp : AMDGPU::OpName::samp;
7485 MachineOperand *SSamp = getNamedOperand(MI, OperandName: SampOpName);
7486 if (SSamp && !RI.isSGPRClass(RC: MRI.getRegClass(Reg: SSamp->getReg())))
7487 CreatedBB = loadMBUFScalarOperandsFromVGPR(TII: *this, MI, ScalarOps: {SSamp}, MDT);
7488
7489 return CreatedBB;
7490 }
7491
7492 // Legalize SI_CALL
7493 if (MI.getOpcode() == AMDGPU::SI_CALL_ISEL) {
7494 MachineOperand *Dest = &MI.getOperand(i: 0);
7495 if (!RI.isSGPRClass(RC: MRI.getRegClass(Reg: Dest->getReg()))) {
7496 // Move everything between ADJCALLSTACKUP and ADJCALLSTACKDOWN, together
7497 // with the copies that follow; copies from and to physical registers
7498 // also need to be moved into the loop block.
7499 unsigned FrameSetupOpcode = getCallFrameSetupOpcode();
7500 unsigned FrameDestroyOpcode = getCallFrameDestroyOpcode();
7501
7502 // Also move the copies to physical registers into the loop block
7503 MachineBasicBlock &MBB = *MI.getParent();
7504 MachineBasicBlock::iterator Start(&MI);
7505 while (Start->getOpcode() != FrameSetupOpcode)
7506 --Start;
7507 MachineBasicBlock::iterator End(&MI);
7508 while (End->getOpcode() != FrameDestroyOpcode)
7509 ++End;
7510 // Also include following copies of the return value
7511 ++End;
7512 while (End != MBB.end() && End->isCopy() && End->getOperand(i: 1).isReg() &&
7513 MI.definesRegister(Reg: End->getOperand(i: 1).getReg(), /*TRI=*/nullptr))
7514 ++End;
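// At this point [Start, End) covers, roughly:
//   ADJCALLSTACKUP ... SI_CALL_ISEL ... ADJCALLSTACKDOWN, plus trailing
// COPYs from the physical return-value registers the call defines, so the
// whole call sequence ends up inside the waterfall loop.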
7515 CreatedBB =
7516 loadMBUFScalarOperandsFromVGPR(TII: *this, MI, ScalarOps: {Dest}, MDT, Begin: Start, End);
7517 }
7518 }
7519
7520 // Legalize s_sleep_var.
7521 if (MI.getOpcode() == AMDGPU::S_SLEEP_VAR) {
7522 const DebugLoc &DL = MI.getDebugLoc();
7523 Register Reg = MRI.createVirtualRegister(RegClass: &AMDGPU::SReg_32_XM0RegClass);
7524 int Src0Idx =
7525 AMDGPU::getNamedOperandIdx(Opcode: MI.getOpcode(), Name: AMDGPU::OpName::src0);
7526 MachineOperand &Src0 = MI.getOperand(i: Src0Idx);
7527 BuildMI(BB&: *MI.getParent(), I&: MI, MIMD: DL, MCID: get(Opcode: AMDGPU::V_READFIRSTLANE_B32), DestReg: Reg)
7528 .add(MO: Src0);
7529 Src0.ChangeToRegister(Reg, isDef: false);
7530 return nullptr;
7531 }
7532
7533 // Legalize TENSOR_LOAD_TO_LDS, TENSOR_LOAD_TO_LDS_D2, TENSOR_STORE_FROM_LDS,
7534 // TENSOR_STORE_FROM_LDS_D2. All their operands are scalar.
7535 if (MI.getOpcode() == AMDGPU::TENSOR_LOAD_TO_LDS ||
7536 MI.getOpcode() == AMDGPU::TENSOR_LOAD_TO_LDS_D2 ||
7537 MI.getOpcode() == AMDGPU::TENSOR_STORE_FROM_LDS ||
7538 MI.getOpcode() == AMDGPU::TENSOR_STORE_FROM_LDS_D2) {
7539 for (MachineOperand &Src : MI.explicit_operands()) {
7540 if (Src.isReg() && RI.hasVectorRegisters(RC: MRI.getRegClass(Reg: Src.getReg())))
7541 Src.setReg(readlaneVGPRToSGPR(SrcReg: Src.getReg(), UseMI&: MI, MRI));
7542 }
7543 return CreatedBB;
7544 }
7545
7546 // Legalize MUBUF instructions.
7547 bool isSoffsetLegal = true;
7548 int SoffsetIdx =
7549 AMDGPU::getNamedOperandIdx(Opcode: MI.getOpcode(), Name: AMDGPU::OpName::soffset);
7550 if (SoffsetIdx != -1) {
7551 MachineOperand *Soffset = &MI.getOperand(i: SoffsetIdx);
7552 if (Soffset->isReg() && Soffset->getReg().isVirtual() &&
7553 !RI.isSGPRClass(RC: MRI.getRegClass(Reg: Soffset->getReg()))) {
7554 isSoffsetLegal = false;
7555 }
7556 }
7557
7558 bool isRsrcLegal = true;
7559 int RsrcIdx =
7560 AMDGPU::getNamedOperandIdx(Opcode: MI.getOpcode(), Name: AMDGPU::OpName::srsrc);
7561 if (RsrcIdx != -1) {
7562 MachineOperand *Rsrc = &MI.getOperand(i: RsrcIdx);
7563 if (Rsrc->isReg() && !RI.isSGPRReg(MRI, Reg: Rsrc->getReg()))
7564 isRsrcLegal = false;
7565 }
7566
7567 // The operands are legal.
7568 if (isRsrcLegal && isSoffsetLegal)
7569 return CreatedBB;
7570
7571 if (!isRsrcLegal) {
7572 // Legalize a VGPR Rsrc
7573 //
7574 // If the instruction is _ADDR64, we can avoid a waterfall by extracting
7575 // the base pointer from the VGPR Rsrc, adding it to the VAddr, then using
7576 // a zero-value SRsrc.
7577 //
7578 // If the instruction is _OFFSET (both idxen and offen disabled), and we
7579 // support ADDR64 instructions, we can convert to ADDR64 and do the same as
7580 // above.
7581 //
7582 // Otherwise we are on non-ADDR64 hardware, and/or we have
7583 // idxen/offen/bothen, and we fall back to a waterfall loop.
7584
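// For instance (a sketch), on ADDR64-capable hardware an _OFFSET form
//   BUFFER_LOAD_DWORD_OFFSET %vdata, %vgpr_rsrc, %soffset, <offset>
// becomes
//   BUFFER_LOAD_DWORD_ADDR64 %vdata, %vaddr, %zeroed_rsrc, %soffset, <offset>
// where %vaddr is the 64-bit pointer extracted from %vgpr_rsrc.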
7585 MachineOperand *Rsrc = &MI.getOperand(i: RsrcIdx);
7586 MachineBasicBlock &MBB = *MI.getParent();
7587
7588 MachineOperand *VAddr = getNamedOperand(MI, OperandName: AMDGPU::OpName::vaddr);
7589 if (VAddr && AMDGPU::getIfAddr64Inst(Opcode: MI.getOpcode()) != -1) {
7590 // This is already an ADDR64 instruction so we need to add the pointer
7591 // extracted from the resource descriptor to the current value of VAddr.
7592 Register NewVAddrLo = MRI.createVirtualRegister(RegClass: &AMDGPU::VGPR_32RegClass);
7593 Register NewVAddrHi = MRI.createVirtualRegister(RegClass: &AMDGPU::VGPR_32RegClass);
7594 Register NewVAddr = MRI.createVirtualRegister(RegClass: &AMDGPU::VReg_64RegClass);
7595
7596 const auto *BoolXExecRC = RI.getWaveMaskRegClass();
7597 Register CondReg0 = MRI.createVirtualRegister(RegClass: BoolXExecRC);
7598 Register CondReg1 = MRI.createVirtualRegister(RegClass: BoolXExecRC);
7599
7600 unsigned RsrcPtr, NewSRsrc;
7601 std::tie(args&: RsrcPtr, args&: NewSRsrc) = extractRsrcPtr(TII: *this, MI, Rsrc&: *Rsrc);
7602
7603 // NewVaddrLo = RsrcPtr:sub0 + VAddr:sub0
7604 const DebugLoc &DL = MI.getDebugLoc();
7605 BuildMI(BB&: MBB, I&: MI, MIMD: DL, MCID: get(Opcode: AMDGPU::V_ADD_CO_U32_e64), DestReg: NewVAddrLo)
7606 .addDef(RegNo: CondReg0)
7607 .addReg(RegNo: RsrcPtr, Flags: {}, SubReg: AMDGPU::sub0)
7608 .addReg(RegNo: VAddr->getReg(), Flags: {}, SubReg: AMDGPU::sub0)
7609 .addImm(Val: 0);
7610
7611 // NewVaddrHi = RsrcPtr:sub1 + VAddr:sub1
7612 BuildMI(BB&: MBB, I&: MI, MIMD: DL, MCID: get(Opcode: AMDGPU::V_ADDC_U32_e64), DestReg: NewVAddrHi)
7613 .addDef(RegNo: CondReg1, Flags: RegState::Dead)
7614 .addReg(RegNo: RsrcPtr, Flags: {}, SubReg: AMDGPU::sub1)
7615 .addReg(RegNo: VAddr->getReg(), Flags: {}, SubReg: AMDGPU::sub1)
7616 .addReg(RegNo: CondReg0, Flags: RegState::Kill)
7617 .addImm(Val: 0);
7618
7619 // NewVaddr = {NewVaddrHi, NewVaddrLo}
7620 BuildMI(BB&: MBB, I&: MI, MIMD: MI.getDebugLoc(), MCID: get(Opcode: AMDGPU::REG_SEQUENCE), DestReg: NewVAddr)
7621 .addReg(RegNo: NewVAddrLo)
7622 .addImm(Val: AMDGPU::sub0)
7623 .addReg(RegNo: NewVAddrHi)
7624 .addImm(Val: AMDGPU::sub1);
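// Net effect: NewVAddr = VAddr + RsrcPtr as a 64-bit add, with CondReg0
// carrying the low half's carry-out into the V_ADDC of the high half.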
7625
7626 VAddr->setReg(NewVAddr);
7627 Rsrc->setReg(NewSRsrc);
7628 } else if (!VAddr && ST.hasAddr64()) {
7629 // This instruction is the _OFFSET variant, so we need to convert it to
7630 // ADDR64.
7631 assert(ST.getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS &&
7632 "FIXME: Need to emit flat atomics here");
7633
7634 unsigned RsrcPtr, NewSRsrc;
7635 std::tie(args&: RsrcPtr, args&: NewSRsrc) = extractRsrcPtr(TII: *this, MI, Rsrc&: *Rsrc);
7636
7637 Register NewVAddr = MRI.createVirtualRegister(RegClass: &AMDGPU::VReg_64RegClass);
7638 MachineOperand *VData = getNamedOperand(MI, OperandName: AMDGPU::OpName::vdata);
7639 MachineOperand *Offset = getNamedOperand(MI, OperandName: AMDGPU::OpName::offset);
7640 MachineOperand *SOffset = getNamedOperand(MI, OperandName: AMDGPU::OpName::soffset);
7641 unsigned Addr64Opcode = AMDGPU::getAddr64Inst(Opcode: MI.getOpcode());
7642
7643 // Atomics with return have an additional tied operand and are
7644 // missing some of the special bits.
7645 MachineOperand *VDataIn = getNamedOperand(MI, OperandName: AMDGPU::OpName::vdata_in);
7646 MachineInstr *Addr64;
7647
7648 if (!VDataIn) {
7649 // Regular buffer load / store.
7650 MachineInstrBuilder MIB =
7651 BuildMI(BB&: MBB, I&: MI, MIMD: MI.getDebugLoc(), MCID: get(Opcode: Addr64Opcode))
7652 .add(MO: *VData)
7653 .addReg(RegNo: NewVAddr)
7654 .addReg(RegNo: NewSRsrc)
7655 .add(MO: *SOffset)
7656 .add(MO: *Offset);
7657
7658 if (const MachineOperand *CPol =
7659 getNamedOperand(MI, OperandName: AMDGPU::OpName::cpol)) {
7660 MIB.addImm(Val: CPol->getImm());
7661 }
7662
7663 if (const MachineOperand *TFE =
7664 getNamedOperand(MI, OperandName: AMDGPU::OpName::tfe)) {
7665 MIB.addImm(Val: TFE->getImm());
7666 }
7667
7668 MIB.addImm(Val: getNamedImmOperand(MI, OperandName: AMDGPU::OpName::swz));
7669
7670 MIB.cloneMemRefs(OtherMI: MI);
7671 Addr64 = MIB;
7672 } else {
7673 // Atomics with return.
7674 Addr64 = BuildMI(BB&: MBB, I&: MI, MIMD: MI.getDebugLoc(), MCID: get(Opcode: Addr64Opcode))
7675 .add(MO: *VData)
7676 .add(MO: *VDataIn)
7677 .addReg(RegNo: NewVAddr)
7678 .addReg(RegNo: NewSRsrc)
7679 .add(MO: *SOffset)
7680 .add(MO: *Offset)
7681 .addImm(Val: getNamedImmOperand(MI, OperandName: AMDGPU::OpName::cpol))
7682 .cloneMemRefs(OtherMI: MI);
7683 }
7684
7685 MI.removeFromParent();
7686
7687 // NewVaddr = {RsrcPtr:sub1, RsrcPtr:sub0}, i.e. the extracted pointer.
7688 BuildMI(BB&: MBB, I: Addr64, MIMD: Addr64->getDebugLoc(), MCID: get(Opcode: AMDGPU::REG_SEQUENCE),
7689 DestReg: NewVAddr)
7690 .addReg(RegNo: RsrcPtr, Flags: {}, SubReg: AMDGPU::sub0)
7691 .addImm(Val: AMDGPU::sub0)
7692 .addReg(RegNo: RsrcPtr, Flags: {}, SubReg: AMDGPU::sub1)
7693 .addImm(Val: AMDGPU::sub1);
7694 } else {
7695 // Legalize a VGPR Rsrc and soffset together.
7696 if (!isSoffsetLegal) {
7697 MachineOperand *Soffset = getNamedOperand(MI, OperandName: AMDGPU::OpName::soffset);
7698 CreatedBB =
7699 loadMBUFScalarOperandsFromVGPR(TII: *this, MI, ScalarOps: {Rsrc, Soffset}, MDT);
7700 return CreatedBB;
7701 }
7702 CreatedBB = loadMBUFScalarOperandsFromVGPR(TII: *this, MI, ScalarOps: {Rsrc}, MDT);
7703 return CreatedBB;
7704 }
7705 }
7706
7707 // Legalize a VGPR soffset.
7708 if (!isSoffsetLegal) {
7709 MachineOperand *Soffset = getNamedOperand(MI, OperandName: AMDGPU::OpName::soffset);
7710 CreatedBB = loadMBUFScalarOperandsFromVGPR(TII: *this, MI, ScalarOps: {Soffset}, MDT);
7711 return CreatedBB;
7712 }
7713 return CreatedBB;
7714}
7715
7716void SIInstrWorklist::insert(MachineInstr *MI) {
7717 InstrList.insert(X: MI);
7718 // Add MBUF instructions to the deferred list.
7719 int RsrcIdx =
7720 AMDGPU::getNamedOperandIdx(Opcode: MI->getOpcode(), Name: AMDGPU::OpName::srsrc);
7721 if (RsrcIdx != -1) {
7722 DeferredList.insert(X: MI);
7723 }
7724}
7725
7726bool SIInstrWorklist::isDeferred(MachineInstr *MI) {
7727 return DeferredList.contains(key: MI);
7728}
7729
7730 // Legalize size mismatches between 16-bit and 32-bit registers in v2s copy
7731 // lowering (changing sgpr to vgpr).
7732 // This is mainly caused by 16-bit SALU and 16-bit VALU instructions using
7733 // registers of different sizes. The operand sizes need to be legalized in
7734 // the vgpr lowering chain. This can be removed once sgpr16 is in place.
7735void SIInstrInfo::legalizeOperandsVALUt16(MachineInstr &MI, unsigned OpIdx,
7736 MachineRegisterInfo &MRI) const {
7737 if (!ST.useRealTrue16Insts())
7738 return;
7739
7740 unsigned Opcode = MI.getOpcode();
7741 MachineBasicBlock *MBB = MI.getParent();
7742 // Legalize operands and check for size mismatch
7743 if (!OpIdx || OpIdx >= MI.getNumExplicitOperands() ||
7744 OpIdx >= get(Opcode).getNumOperands() ||
7745 get(Opcode).operands()[OpIdx].RegClass == -1)
7746 return;
7747
7748 MachineOperand &Op = MI.getOperand(i: OpIdx);
7749 if (!Op.isReg() || !Op.getReg().isVirtual())
7750 return;
7751
7752 const TargetRegisterClass *CurrRC = MRI.getRegClass(Reg: Op.getReg());
7753 if (!RI.isVGPRClass(RC: CurrRC))
7754 return;
7755
7756 int16_t RCID = getOpRegClassID(OpInfo: get(Opcode).operands()[OpIdx]);
7757 const TargetRegisterClass *ExpectedRC = RI.getRegClass(i: RCID);
7758 if (RI.getMatchingSuperRegClass(A: CurrRC, B: ExpectedRC, Idx: AMDGPU::lo16)) {
7759 Op.setSubReg(AMDGPU::lo16);
7760 } else if (RI.getMatchingSuperRegClass(A: ExpectedRC, B: CurrRC, Idx: AMDGPU::lo16)) {
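// Widen the 16-bit value by placing it in the lo16 half of a fresh 32-bit
// VGPR whose hi16 half is undefined (a sketch with made-up names):
//   %undef:vgpr_16 = IMPLICIT_DEF
//   %wide:vgpr_32 = REG_SEQUENCE %op, lo16, %undef, hi16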
7761 const DebugLoc &DL = MI.getDebugLoc();
7762 Register NewDstReg = MRI.createVirtualRegister(RegClass: &AMDGPU::VGPR_32RegClass);
7763 Register Undef = MRI.createVirtualRegister(RegClass: &AMDGPU::VGPR_16RegClass);
7764 BuildMI(BB&: *MBB, I&: MI, MIMD: DL, MCID: get(Opcode: AMDGPU::IMPLICIT_DEF), DestReg: Undef);
7765 BuildMI(BB&: *MBB, I&: MI, MIMD: DL, MCID: get(Opcode: AMDGPU::REG_SEQUENCE), DestReg: NewDstReg)
7766 .addReg(RegNo: Op.getReg())
7767 .addImm(Val: AMDGPU::lo16)
7768 .addReg(RegNo: Undef)
7769 .addImm(Val: AMDGPU::hi16);
7770 Op.setReg(NewDstReg);
7771 }
7772}

7773void SIInstrInfo::legalizeOperandsVALUt16(MachineInstr &MI,
7774 MachineRegisterInfo &MRI) const {
7775 for (unsigned OpIdx = 1; OpIdx < MI.getNumExplicitOperands(); OpIdx++)
7776 legalizeOperandsVALUt16(MI, OpIdx, MRI);
7777}
7778
7779void SIInstrInfo::moveToVALU(SIInstrWorklist &Worklist,
7780 MachineDominatorTree *MDT) const {
7781
7782 while (!Worklist.empty()) {
7783 MachineInstr &Inst = *Worklist.top();
7784 Worklist.erase_top();
7785 // Skip MachineInstrs in the deferred list.
7786 if (Worklist.isDeferred(MI: &Inst))
7787 continue;
7788 moveToVALUImpl(Worklist, MDT, Inst);
7789 }
7790
7791 // The deferred list of instructions is processed once all the
7792 // MachineInstrs in the worklist are done.
7793 for (MachineInstr *Inst : Worklist.getDeferredList()) {
7794 moveToVALUImpl(Worklist, MDT, Inst&: *Inst);
7795 assert(Worklist.empty() &&
7796 "Deferred MachineInstr are not supposed to re-populate worklist");
7797 }
7798}
7799
7800void SIInstrInfo::moveToVALUImpl(SIInstrWorklist &Worklist,
7801 MachineDominatorTree *MDT,
7802 MachineInstr &Inst) const {
7803
7804 MachineBasicBlock *MBB = Inst.getParent();
7805 if (!MBB)
7806 return;
7807 MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
7808 unsigned Opcode = Inst.getOpcode();
7809 unsigned NewOpcode = getVALUOp(MI: Inst);
7810 const DebugLoc &DL = Inst.getDebugLoc();
7811
7812 // Handle some special cases
7813 switch (Opcode) {
7814 default:
7815 break;
7816 case AMDGPU::S_ADD_I32:
7817 case AMDGPU::S_SUB_I32: {
7818 // FIXME: The u32 versions currently selected use the carry.
7819 bool Changed;
7820 MachineBasicBlock *CreatedBBTmp = nullptr;
7821 std::tie(args&: Changed, args&: CreatedBBTmp) = moveScalarAddSub(Worklist, Inst, MDT);
7822 if (Changed)
7823 return;
7824
7825 // Default handling
7826 break;
7827 }
7828
7829 case AMDGPU::S_MUL_U64:
7830 if (ST.hasVectorMulU64()) {
7831 NewOpcode = AMDGPU::V_MUL_U64_e64;
7832 break;
7833 }
7834 // Split s_mul_u64 in 32-bit vector multiplications.
7835 splitScalarSMulU64(Worklist, Inst, MDT);
7836 Inst.eraseFromParent();
7837 return;
7838
7839 case AMDGPU::S_MUL_U64_U32_PSEUDO:
7840 case AMDGPU::S_MUL_I64_I32_PSEUDO:
7841 // This is a special case of s_mul_u64 where all the operands are either
7842 // zero extended or sign extended.
7843 splitScalarSMulPseudo(Worklist, Inst, MDT);
7844 Inst.eraseFromParent();
7845 return;
7846
7847 case AMDGPU::S_AND_B64:
7848 splitScalar64BitBinaryOp(Worklist, Inst, Opcode: AMDGPU::S_AND_B32, MDT);
7849 Inst.eraseFromParent();
7850 return;
7851
7852 case AMDGPU::S_OR_B64:
7853 splitScalar64BitBinaryOp(Worklist, Inst, Opcode: AMDGPU::S_OR_B32, MDT);
7854 Inst.eraseFromParent();
7855 return;
7856
7857 case AMDGPU::S_XOR_B64:
7858 splitScalar64BitBinaryOp(Worklist, Inst, Opcode: AMDGPU::S_XOR_B32, MDT);
7859 Inst.eraseFromParent();
7860 return;
7861
7862 case AMDGPU::S_NAND_B64:
7863 splitScalar64BitBinaryOp(Worklist, Inst, Opcode: AMDGPU::S_NAND_B32, MDT);
7864 Inst.eraseFromParent();
7865 return;
7866
7867 case AMDGPU::S_NOR_B64:
7868 splitScalar64BitBinaryOp(Worklist, Inst, Opcode: AMDGPU::S_NOR_B32, MDT);
7869 Inst.eraseFromParent();
7870 return;
7871
7872 case AMDGPU::S_XNOR_B64:
7873 if (ST.hasDLInsts())
7874 splitScalar64BitBinaryOp(Worklist, Inst, Opcode: AMDGPU::S_XNOR_B32, MDT);
7875 else
7876 splitScalar64BitXnor(Worklist, Inst, MDT);
7877 Inst.eraseFromParent();
7878 return;
7879
7880 case AMDGPU::S_ANDN2_B64:
7881 splitScalar64BitBinaryOp(Worklist, Inst, Opcode: AMDGPU::S_ANDN2_B32, MDT);
7882 Inst.eraseFromParent();
7883 return;
7884
7885 case AMDGPU::S_ORN2_B64:
7886 splitScalar64BitBinaryOp(Worklist, Inst, Opcode: AMDGPU::S_ORN2_B32, MDT);
7887 Inst.eraseFromParent();
7888 return;
7889
7890 case AMDGPU::S_BREV_B64:
7891 splitScalar64BitUnaryOp(Worklist, Inst, Opcode: AMDGPU::S_BREV_B32, Swap: true);
7892 Inst.eraseFromParent();
7893 return;
7894
7895 case AMDGPU::S_NOT_B64:
7896 splitScalar64BitUnaryOp(Worklist, Inst, Opcode: AMDGPU::S_NOT_B32);
7897 Inst.eraseFromParent();
7898 return;
7899
7900 case AMDGPU::S_BCNT1_I32_B64:
7901 splitScalar64BitBCNT(Worklist, Inst);
7902 Inst.eraseFromParent();
7903 return;
7904
7905 case AMDGPU::S_BFE_I64:
7906 splitScalar64BitBFE(Worklist, Inst);
7907 Inst.eraseFromParent();
7908 return;
7909
7910 case AMDGPU::S_FLBIT_I32_B64:
7911 splitScalar64BitCountOp(Worklist, Inst, Opcode: AMDGPU::V_FFBH_U32_e32);
7912 Inst.eraseFromParent();
7913 return;
7914 case AMDGPU::S_FF1_I32_B64:
7915 splitScalar64BitCountOp(Worklist, Inst, Opcode: AMDGPU::V_FFBL_B32_e32);
7916 Inst.eraseFromParent();
7917 return;
7918
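// On subtargets with only the *REV* forms of the VALU shifts, the shift
// amount is src0 rather than src1, so the scalar shift's operands must be
// swapped; e.g. (a sketch):
//   s_lshl_b32 %d, %x, %n  -->  v_lshlrev_b32_e64 %d, %n, %x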
7919 case AMDGPU::S_LSHL_B32:
7920 if (ST.hasOnlyRevVALUShifts()) {
7921 NewOpcode = AMDGPU::V_LSHLREV_B32_e64;
7922 swapOperands(Inst);
7923 }
7924 break;
7925 case AMDGPU::S_ASHR_I32:
7926 if (ST.hasOnlyRevVALUShifts()) {
7927 NewOpcode = AMDGPU::V_ASHRREV_I32_e64;
7928 swapOperands(Inst);
7929 }
7930 break;
7931 case AMDGPU::S_LSHR_B32:
7932 if (ST.hasOnlyRevVALUShifts()) {
7933 NewOpcode = AMDGPU::V_LSHRREV_B32_e64;
7934 swapOperands(Inst);
7935 }
7936 break;
7937 case AMDGPU::S_LSHL_B64:
7938 if (ST.hasOnlyRevVALUShifts()) {
7939 NewOpcode = ST.getGeneration() >= AMDGPUSubtarget::GFX12
7940 ? AMDGPU::V_LSHLREV_B64_pseudo_e64
7941 : AMDGPU::V_LSHLREV_B64_e64;
7942 swapOperands(Inst);
7943 }
7944 break;
7945 case AMDGPU::S_ASHR_I64:
7946 if (ST.hasOnlyRevVALUShifts()) {
7947 NewOpcode = AMDGPU::V_ASHRREV_I64_e64;
7948 swapOperands(Inst);
7949 }
7950 break;
7951 case AMDGPU::S_LSHR_B64:
7952 if (ST.hasOnlyRevVALUShifts()) {
7953 NewOpcode = AMDGPU::V_LSHRREV_B64_e64;
7954 swapOperands(Inst);
7955 }
7956 break;
7957
7958 case AMDGPU::S_ABS_I32:
7959 lowerScalarAbs(Worklist, Inst);
7960 Inst.eraseFromParent();
7961 return;
7962
7963 case AMDGPU::S_ABSDIFF_I32:
7964 lowerScalarAbsDiff(Worklist, Inst);
7965 Inst.eraseFromParent();
7966 return;
7967
7968 case AMDGPU::S_CBRANCH_SCC0:
7969 case AMDGPU::S_CBRANCH_SCC1: {
7970 // Clear unused bits of vcc
7971 Register CondReg = Inst.getOperand(i: 1).getReg();
7972 bool IsSCC = CondReg == AMDGPU::SCC;
7973 const AMDGPU::LaneMaskConstants &LMC = AMDGPU::LaneMaskConstants::get(ST);
7974 BuildMI(BB&: *MBB, I&: Inst, MIMD: Inst.getDebugLoc(), MCID: get(Opcode: LMC.AndOpc), DestReg: LMC.VccReg)
7975 .addReg(RegNo: LMC.ExecReg)
7976 .addReg(RegNo: IsSCC ? LMC.VccReg : CondReg);
7977 Inst.removeOperand(OpNo: 1);
7978 } break;
7979
7980 case AMDGPU::S_BFE_U64:
7981 case AMDGPU::S_BFM_B64:
7982 llvm_unreachable("Moving this op to VALU not implemented");
7983
7984 case AMDGPU::S_PACK_LL_B32_B16:
7985 case AMDGPU::S_PACK_LH_B32_B16:
7986 case AMDGPU::S_PACK_HL_B32_B16:
7987 case AMDGPU::S_PACK_HH_B32_B16:
7988 movePackToVALU(Worklist, MRI, Inst);
7989 Inst.eraseFromParent();
7990 return;
7991
7992 case AMDGPU::S_XNOR_B32:
7993 lowerScalarXnor(Worklist, Inst);
7994 Inst.eraseFromParent();
7995 return;
7996
7997 case AMDGPU::S_NAND_B32:
7998 splitScalarNotBinop(Worklist, Inst, Opcode: AMDGPU::S_AND_B32);
7999 Inst.eraseFromParent();
8000 return;
8001
8002 case AMDGPU::S_NOR_B32:
8003 splitScalarNotBinop(Worklist, Inst, Opcode: AMDGPU::S_OR_B32);
8004 Inst.eraseFromParent();
8005 return;
8006
8007 case AMDGPU::S_ANDN2_B32:
8008 splitScalarBinOpN2(Worklist, Inst, Opcode: AMDGPU::S_AND_B32);
8009 Inst.eraseFromParent();
8010 return;
8011
8012 case AMDGPU::S_ORN2_B32:
8013 splitScalarBinOpN2(Worklist, Inst, Opcode: AMDGPU::S_OR_B32);
8014 Inst.eraseFromParent();
8015 return;
8016
8017 // TODO: Remove this as soon as everything is ready to replace VGPR-to-SGPR
8018 // copies with V_READFIRSTLANEs.
8019 // S_ADD/SUB_CO_PSEUDO as well as S_UADDO/USUBO_PSEUDO can only be
8020 // selected from a uniform SDNode.
8021 case AMDGPU::S_ADD_CO_PSEUDO:
8022 case AMDGPU::S_SUB_CO_PSEUDO: {
8023 unsigned Opc = (Inst.getOpcode() == AMDGPU::S_ADD_CO_PSEUDO)
8024 ? AMDGPU::V_ADDC_U32_e64
8025 : AMDGPU::V_SUBB_U32_e64;
8026 const auto *CarryRC = RI.getWaveMaskRegClass();
8027
8028 Register CarryInReg = Inst.getOperand(i: 4).getReg();
8029 if (!MRI.constrainRegClass(Reg: CarryInReg, RC: CarryRC)) {
8030 Register NewCarryReg = MRI.createVirtualRegister(RegClass: CarryRC);
8031 BuildMI(BB&: *MBB, I&: Inst, MIMD: Inst.getDebugLoc(), MCID: get(Opcode: AMDGPU::COPY), DestReg: NewCarryReg)
8032 .addReg(RegNo: CarryInReg);
// Use the constrained copy as the carry-in; without this the COPY just
// built would be dead and the carry-in would keep its illegal class.
CarryInReg = NewCarryReg;
8033 }
8034
8035 Register CarryOutReg = Inst.getOperand(i: 1).getReg();
8036
8037 Register DestReg = MRI.createVirtualRegister(RegClass: RI.getEquivalentVGPRClass(
8038 SRC: MRI.getRegClass(Reg: Inst.getOperand(i: 0).getReg())));
8039 MachineInstr *CarryOp =
8040 BuildMI(BB&: *MBB, I: &Inst, MIMD: Inst.getDebugLoc(), MCID: get(Opcode: Opc), DestReg)
8041 .addReg(RegNo: CarryOutReg, Flags: RegState::Define)
8042 .add(MO: Inst.getOperand(i: 2))
8043 .add(MO: Inst.getOperand(i: 3))
8044 .addReg(RegNo: CarryInReg)
8045 .addImm(Val: 0);
8046 legalizeOperands(MI&: *CarryOp);
8047 MRI.replaceRegWith(FromReg: Inst.getOperand(i: 0).getReg(), ToReg: DestReg);
8048 addUsersToMoveToVALUWorklist(Reg: DestReg, MRI, Worklist);
8049 Inst.eraseFromParent();
8050 }
8051 return;
8052 case AMDGPU::S_UADDO_PSEUDO:
8053 case AMDGPU::S_USUBO_PSEUDO: {
8054 MachineOperand &Dest0 = Inst.getOperand(i: 0);
8055 MachineOperand &Dest1 = Inst.getOperand(i: 1);
8056 MachineOperand &Src0 = Inst.getOperand(i: 2);
8057 MachineOperand &Src1 = Inst.getOperand(i: 3);
8058
8059 unsigned Opc = (Inst.getOpcode() == AMDGPU::S_UADDO_PSEUDO)
8060 ? AMDGPU::V_ADD_CO_U32_e64
8061 : AMDGPU::V_SUB_CO_U32_e64;
8062 const TargetRegisterClass *NewRC =
8063 RI.getEquivalentVGPRClass(SRC: MRI.getRegClass(Reg: Dest0.getReg()));
8064 Register DestReg = MRI.createVirtualRegister(RegClass: NewRC);
8065 MachineInstr *NewInstr = BuildMI(BB&: *MBB, I: &Inst, MIMD: DL, MCID: get(Opcode: Opc), DestReg)
8066 .addReg(RegNo: Dest1.getReg(), Flags: RegState::Define)
8067 .add(MO: Src0)
8068 .add(MO: Src1)
8069 .addImm(Val: 0); // clamp bit
8070
8071 legalizeOperands(MI&: *NewInstr, MDT);
8072 MRI.replaceRegWith(FromReg: Dest0.getReg(), ToReg: DestReg);
8073 addUsersToMoveToVALUWorklist(Reg: DestReg, MRI, Worklist);
8074 Inst.eraseFromParent();
8075 }
8076 return;
8077 case AMDGPU::S_LSHL1_ADD_U32:
8078 case AMDGPU::S_LSHL2_ADD_U32:
8079 case AMDGPU::S_LSHL3_ADD_U32:
8080 case AMDGPU::S_LSHL4_ADD_U32: {
8081 MachineOperand &Dest = Inst.getOperand(i: 0);
8082 MachineOperand &Src0 = Inst.getOperand(i: 1);
8083 MachineOperand &Src1 = Inst.getOperand(i: 2);
8084 unsigned ShiftAmt = (Opcode == AMDGPU::S_LSHL1_ADD_U32 ? 1
8085 : Opcode == AMDGPU::S_LSHL2_ADD_U32 ? 2
8086 : Opcode == AMDGPU::S_LSHL3_ADD_U32 ? 3
8087 : 4);
8088
8089 const TargetRegisterClass *NewRC =
8090 RI.getEquivalentVGPRClass(SRC: MRI.getRegClass(Reg: Dest.getReg()));
8091 Register DestReg = MRI.createVirtualRegister(RegClass: NewRC);
8092 MachineInstr *NewInstr =
8093 BuildMI(BB&: *MBB, I: &Inst, MIMD: DL, MCID: get(Opcode: AMDGPU::V_LSHL_ADD_U32_e64), DestReg)
8094 .add(MO: Src0)
8095 .addImm(Val: ShiftAmt)
8096 .add(MO: Src1);
8097
8098 legalizeOperands(MI&: *NewInstr, MDT);
8099 MRI.replaceRegWith(FromReg: Dest.getReg(), ToReg: DestReg);
8100 addUsersToMoveToVALUWorklist(Reg: DestReg, MRI, Worklist);
8101 Inst.eraseFromParent();
8102 }
8103 return;
8104 case AMDGPU::S_CSELECT_B32:
8105 case AMDGPU::S_CSELECT_B64:
8106 lowerSelect(Worklist, Inst, MDT);
8107 Inst.eraseFromParent();
8108 return;
8109 case AMDGPU::S_CMP_EQ_I32:
8110 case AMDGPU::S_CMP_LG_I32:
8111 case AMDGPU::S_CMP_GT_I32:
8112 case AMDGPU::S_CMP_GE_I32:
8113 case AMDGPU::S_CMP_LT_I32:
8114 case AMDGPU::S_CMP_LE_I32:
8115 case AMDGPU::S_CMP_EQ_U32:
8116 case AMDGPU::S_CMP_LG_U32:
8117 case AMDGPU::S_CMP_GT_U32:
8118 case AMDGPU::S_CMP_GE_U32:
8119 case AMDGPU::S_CMP_LT_U32:
8120 case AMDGPU::S_CMP_LE_U32:
8121 case AMDGPU::S_CMP_EQ_U64:
8122 case AMDGPU::S_CMP_LG_U64:
8123 case AMDGPU::S_CMP_LT_F32:
8124 case AMDGPU::S_CMP_EQ_F32:
8125 case AMDGPU::S_CMP_LE_F32:
8126 case AMDGPU::S_CMP_GT_F32:
8127 case AMDGPU::S_CMP_LG_F32:
8128 case AMDGPU::S_CMP_GE_F32:
8129 case AMDGPU::S_CMP_O_F32:
8130 case AMDGPU::S_CMP_U_F32:
8131 case AMDGPU::S_CMP_NGE_F32:
8132 case AMDGPU::S_CMP_NLG_F32:
8133 case AMDGPU::S_CMP_NGT_F32:
8134 case AMDGPU::S_CMP_NLE_F32:
8135 case AMDGPU::S_CMP_NEQ_F32:
8136 case AMDGPU::S_CMP_NLT_F32: {
8137 Register CondReg = MRI.createVirtualRegister(RegClass: RI.getWaveMaskRegClass());
8138 auto NewInstr =
8139 BuildMI(BB&: *MBB, I&: Inst, MIMD: Inst.getDebugLoc(), MCID: get(Opcode: NewOpcode), DestReg: CondReg)
8140 .setMIFlags(Inst.getFlags());
8141 if (AMDGPU::getNamedOperandIdx(Opcode: NewOpcode, Name: AMDGPU::OpName::src0_modifiers) >=
8142 0) {
8143 NewInstr
8144 .addImm(Val: 0) // src0_modifiers
8145 .add(MO: Inst.getOperand(i: 0)) // src0
8146 .addImm(Val: 0) // src1_modifiers
8147 .add(MO: Inst.getOperand(i: 1)) // src1
8148 .addImm(Val: 0); // clamp
8149 } else {
8150 NewInstr.add(MO: Inst.getOperand(i: 0)).add(MO: Inst.getOperand(i: 1));
8151 }
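// E.g. (a sketch) s_cmp_lt_f32 %a, %b, implicit-def $scc becomes
//   %cond:lanemask = V_CMP_LT_F32_e64 0, %a, 0, %b, 0
// and the SCC users are rewired below to read %cond instead.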
8152 legalizeOperands(MI&: *NewInstr, MDT);
8153 int SCCIdx = Inst.findRegisterDefOperandIdx(Reg: AMDGPU::SCC, /*TRI=*/nullptr);
8154 const MachineOperand &SCCOp = Inst.getOperand(i: SCCIdx);
8155 addSCCDefUsersToVALUWorklist(Op: SCCOp, SCCDefInst&: Inst, Worklist, NewCond: CondReg);
8156 Inst.eraseFromParent();
8157 return;
8158 }
8159 case AMDGPU::S_CMP_LT_F16:
8160 case AMDGPU::S_CMP_EQ_F16:
8161 case AMDGPU::S_CMP_LE_F16:
8162 case AMDGPU::S_CMP_GT_F16:
8163 case AMDGPU::S_CMP_LG_F16:
8164 case AMDGPU::S_CMP_GE_F16:
8165 case AMDGPU::S_CMP_O_F16:
8166 case AMDGPU::S_CMP_U_F16:
8167 case AMDGPU::S_CMP_NGE_F16:
8168 case AMDGPU::S_CMP_NLG_F16:
8169 case AMDGPU::S_CMP_NGT_F16:
8170 case AMDGPU::S_CMP_NLE_F16:
8171 case AMDGPU::S_CMP_NEQ_F16:
8172 case AMDGPU::S_CMP_NLT_F16: {
8173 Register CondReg = MRI.createVirtualRegister(RegClass: RI.getWaveMaskRegClass());
8174 auto NewInstr =
8175 BuildMI(BB&: *MBB, I&: Inst, MIMD: Inst.getDebugLoc(), MCID: get(Opcode: NewOpcode), DestReg: CondReg)
8176 .setMIFlags(Inst.getFlags());
8177 if (AMDGPU::hasNamedOperand(Opcode: NewOpcode, NamedIdx: AMDGPU::OpName::src0_modifiers)) {
8178 NewInstr
8179 .addImm(Val: 0) // src0_modifiers
8180 .add(MO: Inst.getOperand(i: 0)) // src0
8181 .addImm(Val: 0) // src1_modifiers
8182 .add(MO: Inst.getOperand(i: 1)) // src1
8183 .addImm(Val: 0); // clamp
8184 if (AMDGPU::hasNamedOperand(Opcode: NewOpcode, NamedIdx: AMDGPU::OpName::op_sel))
8185 NewInstr.addImm(Val: 0); // op_sel0
8186 } else {
8187 NewInstr
8188 .add(MO: Inst.getOperand(i: 0))
8189 .add(MO: Inst.getOperand(i: 1));
8190 }
8191 legalizeOperandsVALUt16(MI&: *NewInstr, MRI);
8192 legalizeOperands(MI&: *NewInstr, MDT);
8193 int SCCIdx = Inst.findRegisterDefOperandIdx(Reg: AMDGPU::SCC, /*TRI=*/nullptr);
8194 const MachineOperand &SCCOp = Inst.getOperand(i: SCCIdx);
8195 addSCCDefUsersToVALUWorklist(Op: SCCOp, SCCDefInst&: Inst, Worklist, NewCond: CondReg);
8196 Inst.eraseFromParent();
8197 return;
8198 }
8199 case AMDGPU::S_CVT_HI_F32_F16: {
8200 Register TmpReg = MRI.createVirtualRegister(RegClass: &AMDGPU::VGPR_32RegClass);
8201 Register NewDst = MRI.createVirtualRegister(RegClass: &AMDGPU::VGPR_32RegClass);
8202 if (ST.useRealTrue16Insts()) {
8203 BuildMI(BB&: *MBB, I&: Inst, MIMD: DL, MCID: get(Opcode: AMDGPU::COPY), DestReg: TmpReg)
8204 .add(MO: Inst.getOperand(i: 1));
8205 BuildMI(BB&: *MBB, I&: Inst, MIMD: DL, MCID: get(Opcode: NewOpcode), DestReg: NewDst)
8206 .addImm(Val: 0) // src0_modifiers
8207 .addReg(RegNo: TmpReg, Flags: {}, SubReg: AMDGPU::hi16)
8208 .addImm(Val: 0) // clamp
8209 .addImm(Val: 0) // omod
8210 .addImm(Val: 0); // op_sel0
8211 } else {
8212 BuildMI(BB&: *MBB, I&: Inst, MIMD: DL, MCID: get(Opcode: AMDGPU::V_LSHRREV_B32_e64), DestReg: TmpReg)
8213 .addImm(Val: 16)
8214 .add(MO: Inst.getOperand(i: 1));
8215 BuildMI(BB&: *MBB, I&: Inst, MIMD: DL, MCID: get(Opcode: NewOpcode), DestReg: NewDst)
8216 .addImm(Val: 0) // src0_modifiers
8217 .addReg(RegNo: TmpReg)
8218 .addImm(Val: 0) // clamp
8219 .addImm(Val: 0); // omod
8220 }
8221
8222 MRI.replaceRegWith(FromReg: Inst.getOperand(i: 0).getReg(), ToReg: NewDst);
8223 addUsersToMoveToVALUWorklist(Reg: NewDst, MRI, Worklist);
8224 Inst.eraseFromParent();
8225 return;
8226 }
8227 case AMDGPU::S_MINIMUM_F32:
8228 case AMDGPU::S_MAXIMUM_F32: {
8229 Register NewDst = MRI.createVirtualRegister(RegClass: &AMDGPU::VGPR_32RegClass);
8230 MachineInstr *NewInstr = BuildMI(BB&: *MBB, I&: Inst, MIMD: DL, MCID: get(Opcode: NewOpcode), DestReg: NewDst)
8231 .addImm(Val: 0) // src0_modifiers
8232 .add(MO: Inst.getOperand(i: 1))
8233 .addImm(Val: 0) // src1_modifiers
8234 .add(MO: Inst.getOperand(i: 2))
8235 .addImm(Val: 0) // clamp
8236 .addImm(Val: 0); // omod
8237 MRI.replaceRegWith(FromReg: Inst.getOperand(i: 0).getReg(), ToReg: NewDst);
8238
8239 legalizeOperands(MI&: *NewInstr, MDT);
8240 addUsersToMoveToVALUWorklist(Reg: NewDst, MRI, Worklist);
8241 Inst.eraseFromParent();
8242 return;
8243 }
8244 case AMDGPU::S_MINIMUM_F16:
8245 case AMDGPU::S_MAXIMUM_F16: {
8246 Register NewDst = MRI.createVirtualRegister(RegClass: ST.useRealTrue16Insts()
8247 ? &AMDGPU::VGPR_16RegClass
8248 : &AMDGPU::VGPR_32RegClass);
8249 MachineInstr *NewInstr = BuildMI(BB&: *MBB, I&: Inst, MIMD: DL, MCID: get(Opcode: NewOpcode), DestReg: NewDst)
8250 .addImm(Val: 0) // src0_modifiers
8251 .add(MO: Inst.getOperand(i: 1))
8252 .addImm(Val: 0) // src1_modifiers
8253 .add(MO: Inst.getOperand(i: 2))
8254 .addImm(Val: 0) // clamp
8255 .addImm(Val: 0) // omod
8256 .addImm(Val: 0); // opsel0
8257 MRI.replaceRegWith(FromReg: Inst.getOperand(i: 0).getReg(), ToReg: NewDst);
8258 legalizeOperandsVALUt16(MI&: *NewInstr, MRI);
8259 legalizeOperands(MI&: *NewInstr, MDT);
8260 addUsersToMoveToVALUWorklist(Reg: NewDst, MRI, Worklist);
8261 Inst.eraseFromParent();
8262 return;
8263 }
8264 case AMDGPU::V_S_EXP_F16_e64:
8265 case AMDGPU::V_S_LOG_F16_e64:
8266 case AMDGPU::V_S_RCP_F16_e64:
8267 case AMDGPU::V_S_RSQ_F16_e64:
8268 case AMDGPU::V_S_SQRT_F16_e64: {
8269 Register NewDst = MRI.createVirtualRegister(RegClass: ST.useRealTrue16Insts()
8270 ? &AMDGPU::VGPR_16RegClass
8271 : &AMDGPU::VGPR_32RegClass);
8272 auto NewInstr = BuildMI(BB&: *MBB, I&: Inst, MIMD: DL, MCID: get(Opcode: NewOpcode), DestReg: NewDst)
8273 .add(MO: Inst.getOperand(i: 1)) // src0_modifiers
8274 .add(MO: Inst.getOperand(i: 2))
8275 .add(MO: Inst.getOperand(i: 3)) // clamp
8276 .add(MO: Inst.getOperand(i: 4)) // omod
8277 .setMIFlags(Inst.getFlags());
8278 if (AMDGPU::hasNamedOperand(Opcode: NewOpcode, NamedIdx: AMDGPU::OpName::op_sel))
8279 NewInstr.addImm(Val: 0); // opsel0
8280 MRI.replaceRegWith(FromReg: Inst.getOperand(i: 0).getReg(), ToReg: NewDst);
8281 legalizeOperandsVALUt16(MI&: *NewInstr, MRI);
8282 legalizeOperands(MI&: *NewInstr, MDT);
8283 addUsersToMoveToVALUWorklist(Reg: NewDst, MRI, Worklist);
8284 Inst.eraseFromParent();
8285 return;
8286 }
8287 }
8288
8289 if (NewOpcode == AMDGPU::INSTRUCTION_LIST_END) {
8290 // We cannot move this instruction to the VALU, so we should try to
8291 // legalize its operands instead.
8292 legalizeOperands(MI&: Inst, MDT);
8293 return;
8294 }
8295 // Handle converting generic instructions like COPY-to-SGPR into
8296 // COPY-to-VGPR.
8297 if (NewOpcode == Opcode) {
8298 Register DstReg = Inst.getOperand(i: 0).getReg();
8299 const TargetRegisterClass *NewDstRC = getDestEquivalentVGPRClass(Inst);
8300
8301 // If it's a copy of a VGPR to a physical SGPR, insert a V_READFIRSTLANE and
8302 // hope for the best.
8303 if (Inst.isCopy() && DstReg.isPhysical() &&
8304 RI.isVGPR(MRI, Reg: Inst.getOperand(i: 1).getReg())) {
8305 Register NewDst = MRI.createVirtualRegister(RegClass: &AMDGPU::SReg_32_XM0RegClass);
8306 BuildMI(BB&: *Inst.getParent(), I: &Inst, MIMD: Inst.getDebugLoc(),
8307 MCID: get(Opcode: AMDGPU::V_READFIRSTLANE_B32), DestReg: NewDst)
8308 .add(MO: Inst.getOperand(i: 1));
8309 BuildMI(BB&: *Inst.getParent(), I: &Inst, MIMD: Inst.getDebugLoc(), MCID: get(Opcode: AMDGPU::COPY),
8310 DestReg: DstReg)
8311 .addReg(RegNo: NewDst);
8312
8313 Inst.eraseFromParent();
8314 return;
8315 }
8316
8317 if (Inst.isCopy() && Inst.getOperand(i: 1).getReg().isVirtual()) {
8318 Register NewDstReg = Inst.getOperand(i: 1).getReg();
8319 const TargetRegisterClass *SrcRC = RI.getRegClassForReg(MRI, Reg: NewDstReg);
8320 if (const TargetRegisterClass *CommonRC =
8321 RI.getCommonSubClass(A: NewDstRC, B: SrcRC)) {
8322 // Also intersect with VGPR-compatible operand register class
8323 // constraints from user instructions. This preserves restricted
8324 // register classes (e.g., VGPR_32_Lo256 for WMMA scale operands) that
8325 // would otherwise be lost when an SGPR is replaced with a VGPR.
8326 // Constraints incompatible with VGPRs (e.g., SALU instructions
8327 // requiring SReg_32) are skipped because those users will be converted
8328 // to VALU by the worklist.
8329 for (const MachineOperand &UseMO : MRI.use_operands(Reg: DstReg)) {
8330 const MachineInstr *UseMI = UseMO.getParent();
8331 if (UseMI == &Inst)
8332 continue;
8333 unsigned OpIdx = UseMI->getOperandNo(I: &UseMO);
8334 if (const TargetRegisterClass *OpRC =
8335 getRegClass(MCID: UseMI->getDesc(), OpNum: OpIdx)) {
8336 if (const TargetRegisterClass *Narrowed =
8337 RI.getCommonSubClass(A: CommonRC, B: OpRC))
8338 CommonRC = Narrowed;
8339 }
8340 }
8341
8342 // Instead of creating a copy where src and dst are the same register
8343 // class, we just replace all uses of dst with src. These kinds of
8344 // copies interfere with the heuristics MachineSink uses to decide
8345 // whether or not to split a critical edge, since the pass assumes
8346 // that copies will end up as machine instructions and not be
8347 // eliminated.
8348 addUsersToMoveToVALUWorklist(Reg: DstReg, MRI, Worklist);
8349 MRI.replaceRegWith(FromReg: DstReg, ToReg: NewDstReg);
8350 MRI.clearKillFlags(Reg: NewDstReg);
8351 Inst.getOperand(i: 0).setReg(DstReg);
8352
8353 if (!MRI.constrainRegClass(Reg: NewDstReg, RC: CommonRC))
8354 llvm_unreachable("failed to constrain register");
8355
8356 Inst.eraseFromParent();
8357 // Legalize t16 operands since replaceRegWith is called after addUsersToVALU.
8358 for (MachineOperand &MO :
8359 make_early_inc_range(Range: MRI.use_operands(Reg: NewDstReg))) {
8360 legalizeOperandsVALUt16(MI&: *MO.getParent(), MRI);
8361 }
8362
8363 return;
8364 }
8365 }
8366
8367 // If this is a v2s copy between a 16-bit and a 32-bit reg, replace the
8368 // vgpr copy with a reg_sequence/extract_subreg.
8369 // This can be removed once we have sgpr16 in place.
8370 if (ST.useRealTrue16Insts() && Inst.isCopy() &&
8371 Inst.getOperand(i: 1).getReg().isVirtual() &&
8372 RI.isVGPR(MRI, Reg: Inst.getOperand(i: 1).getReg())) {
8373 const TargetRegisterClass *SrcRegRC = getOpRegClass(MI: Inst, OpNo: 1);
8374 if (RI.getMatchingSuperRegClass(A: NewDstRC, B: SrcRegRC, Idx: AMDGPU::lo16)) {
8375 Register NewDstReg = MRI.createVirtualRegister(RegClass: NewDstRC);
8376 Register Undef = MRI.createVirtualRegister(RegClass: &AMDGPU::VGPR_16RegClass);
8377 BuildMI(BB&: *Inst.getParent(), I: &Inst, MIMD: Inst.getDebugLoc(),
8378 MCID: get(Opcode: AMDGPU::IMPLICIT_DEF), DestReg: Undef);
8379 BuildMI(BB&: *Inst.getParent(), I: &Inst, MIMD: Inst.getDebugLoc(),
8380 MCID: get(Opcode: AMDGPU::REG_SEQUENCE), DestReg: NewDstReg)
8381 .addReg(RegNo: Inst.getOperand(i: 1).getReg())
8382 .addImm(Val: AMDGPU::lo16)
8383 .addReg(RegNo: Undef)
8384 .addImm(Val: AMDGPU::hi16);
8385 Inst.eraseFromParent();
8386 MRI.replaceRegWith(FromReg: DstReg, ToReg: NewDstReg);
8387 addUsersToMoveToVALUWorklist(Reg: NewDstReg, MRI, Worklist);
8388 return;
8389 } else if (RI.getMatchingSuperRegClass(A: SrcRegRC, B: NewDstRC,
8390 Idx: AMDGPU::lo16)) {
8391 Inst.getOperand(i: 1).setSubReg(AMDGPU::lo16);
8392 Register NewDstReg = MRI.createVirtualRegister(RegClass: NewDstRC);
8393 MRI.replaceRegWith(FromReg: DstReg, ToReg: NewDstReg);
8394 addUsersToMoveToVALUWorklist(Reg: NewDstReg, MRI, Worklist);
8395 return;
8396 }
8397 }
8398
8399 Register NewDstReg = MRI.createVirtualRegister(RegClass: NewDstRC);
8400 MRI.replaceRegWith(FromReg: DstReg, ToReg: NewDstReg);
8401 legalizeOperands(MI&: Inst, MDT);
8402 addUsersToMoveToVALUWorklist(Reg: NewDstReg, MRI, Worklist);
8403 return;
8404 }
8405
8406 // Use the new VALU Opcode.
8407 auto NewInstr = BuildMI(BB&: *MBB, I&: Inst, MIMD: Inst.getDebugLoc(), MCID: get(Opcode: NewOpcode))
8408 .setMIFlags(Inst.getFlags());
8409 if (isVOP3(Opcode: NewOpcode) && !isVOP3(Opcode)) {
8410 // Intersperse VOP3 modifiers among the SALU operands.
8411 NewInstr->addOperand(Op: Inst.getOperand(i: 0));
8412 if (AMDGPU::getNamedOperandIdx(Opcode: NewOpcode,
8413 Name: AMDGPU::OpName::src0_modifiers) >= 0)
8414 NewInstr.addImm(Val: 0);
8415 if (AMDGPU::hasNamedOperand(Opcode: NewOpcode, NamedIdx: AMDGPU::OpName::src0)) {
8416 const MachineOperand &Src = Inst.getOperand(i: 1);
8417 NewInstr->addOperand(Op: Src);
8418 }
8419
8420 if (Opcode == AMDGPU::S_SEXT_I32_I8 || Opcode == AMDGPU::S_SEXT_I32_I16) {
8421 // We are converting these to a BFE, so we need to add the missing
8422 // operands for the size and offset.
8423 unsigned Size = (Opcode == AMDGPU::S_SEXT_I32_I8) ? 8 : 16;
8424 NewInstr.addImm(Val: 0);
8425 NewInstr.addImm(Val: Size);
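// E.g. (a sketch) s_sext_i32_i8 %d, %x becomes a BFE extracting 8 bits
// from bit offset 0: v_bfe_i32 %d, %x, 0, 8.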
8426 } else if (Opcode == AMDGPU::S_BCNT1_I32_B32) {
8427 // The VALU version adds the second operand to the result, so insert an
8428 // extra 0 operand.
8429 NewInstr.addImm(Val: 0);
8430 } else if (Opcode == AMDGPU::S_BFE_I32 || Opcode == AMDGPU::S_BFE_U32) {
8431 const MachineOperand &OffsetWidthOp = Inst.getOperand(i: 2);
8432 // If we need to move this to VGPRs, we need to unpack the second
8433 // operand back into the 2 separate ones for bit offset and width.
8434 assert(OffsetWidthOp.isImm() &&
8435 "Scalar BFE is only implemented for constant width and offset");
8436 uint32_t Imm = OffsetWidthOp.getImm();
8437
8438 uint32_t Offset = Imm & 0x3f; // Extract bits [5:0].
8439 uint32_t BitWidth = (Imm & 0x7f0000) >> 16; // Extract bits [22:16].
8440 NewInstr.addImm(Val: Offset);
8441 NewInstr.addImm(Val: BitWidth);
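// For example, Imm = 0x00100008 encodes Offset = 8 (bits [5:0]) and
// BitWidth = 16 (bits [22:16]), so the packed scalar operand becomes the
// two separate VALU BFE operands 8 and 16.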
8442 } else {
8443 if (AMDGPU::getNamedOperandIdx(Opcode: NewOpcode,
8444 Name: AMDGPU::OpName::src1_modifiers) >= 0)
8445 NewInstr.addImm(Val: 0);
8446 if (AMDGPU::getNamedOperandIdx(Opcode: NewOpcode, Name: AMDGPU::OpName::src1) >= 0)
8447 NewInstr->addOperand(Op: Inst.getOperand(i: 2));
8448 if (AMDGPU::getNamedOperandIdx(Opcode: NewOpcode,
8449 Name: AMDGPU::OpName::src2_modifiers) >= 0)
8450 NewInstr.addImm(Val: 0);
8451 if (AMDGPU::getNamedOperandIdx(Opcode: NewOpcode, Name: AMDGPU::OpName::src2) >= 0)
8452 NewInstr->addOperand(Op: Inst.getOperand(i: 3));
8453 if (AMDGPU::getNamedOperandIdx(Opcode: NewOpcode, Name: AMDGPU::OpName::clamp) >= 0)
8454 NewInstr.addImm(Val: 0);
8455 if (AMDGPU::getNamedOperandIdx(Opcode: NewOpcode, Name: AMDGPU::OpName::omod) >= 0)
8456 NewInstr.addImm(Val: 0);
8457 if (AMDGPU::getNamedOperandIdx(Opcode: NewOpcode, Name: AMDGPU::OpName::op_sel) >= 0)
8458 NewInstr.addImm(Val: 0);
8459 }
8460 } else {
8461 // Just copy the SALU operands.
8462 for (const MachineOperand &Op : Inst.explicit_operands())
8463 NewInstr->addOperand(Op);
8464 }
8465
8466 // Remove any references to SCC. Vector instructions can't read from it, and
8467 // we're just about to add the implicit use/defs of VCC, and we don't want
8468 // both.
8469 for (MachineOperand &Op : Inst.implicit_operands()) {
8470 if (Op.getReg() == AMDGPU::SCC) {
8471 // Only propagate through live-def of SCC.
8472 if (Op.isDef() && !Op.isDead())
8473 addSCCDefUsersToVALUWorklist(Op, SCCDefInst&: Inst, Worklist);
8474 if (Op.isUse())
8475 addSCCDefsToVALUWorklist(SCCUseInst: NewInstr, Worklist);
8476 }
8477 }
8478 Inst.eraseFromParent();
8479 Register NewDstReg;
8480 if (NewInstr->getOperand(i: 0).isReg() && NewInstr->getOperand(i: 0).isDef()) {
8481 Register DstReg = NewInstr->getOperand(i: 0).getReg();
8482 assert(DstReg.isVirtual());
8483 // Update the destination register class.
8484 const TargetRegisterClass *NewDstRC = getDestEquivalentVGPRClass(Inst: *NewInstr);
8485 assert(NewDstRC);
8486 NewDstReg = MRI.createVirtualRegister(RegClass: NewDstRC);
8487 MRI.replaceRegWith(FromReg: DstReg, ToReg: NewDstReg);
8488 }
8489 fixImplicitOperands(MI&: *NewInstr);
8490
8491 legalizeOperandsVALUt16(MI&: *NewInstr, MRI);
8492
8493 // Legalize the operands
8494 legalizeOperands(MI&: *NewInstr, MDT);
8495 if (NewDstReg)
8496 addUsersToMoveToVALUWorklist(Reg: NewDstReg, MRI, Worklist);
8497}
8498
8499// Add/sub require special handling to deal with carry outs.
8500std::pair<bool, MachineBasicBlock *>
8501SIInstrInfo::moveScalarAddSub(SIInstrWorklist &Worklist, MachineInstr &Inst,
8502 MachineDominatorTree *MDT) const {
8503 if (ST.hasAddNoCarryInsts()) {
8504 // Assume there is no user of scc since we don't select this in that case.
8505 // Since scc isn't used, it doesn't really matter if the i32 or u32 variant
8506 // is used.
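// A sketch of the rewrite (register names are illustrative):
//   %d:sgpr_32 = S_ADD_I32 %a, %b, implicit-def dead $scc
// becomes
//   %d:vgpr_32 = V_ADD_U32_e64 %a, %b, 0 /*clamp*/
// with the SCC def dropped.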
8507
8508 MachineBasicBlock &MBB = *Inst.getParent();
8509 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
8510
8511 Register OldDstReg = Inst.getOperand(i: 0).getReg();
8512 Register ResultReg = MRI.createVirtualRegister(RegClass: &AMDGPU::VGPR_32RegClass);
8513
8514 unsigned Opc = Inst.getOpcode();
8515 assert(Opc == AMDGPU::S_ADD_I32 || Opc == AMDGPU::S_SUB_I32);
8516
8517 unsigned NewOpc = Opc == AMDGPU::S_ADD_I32 ?
8518 AMDGPU::V_ADD_U32_e64 : AMDGPU::V_SUB_U32_e64;
8519
8520 assert(Inst.getOperand(3).getReg() == AMDGPU::SCC);
8521 Inst.removeOperand(OpNo: 3);
8522
8523 Inst.setDesc(get(Opcode: NewOpc));
8524 Inst.addOperand(Op: MachineOperand::CreateImm(Val: 0)); // clamp bit
8525 Inst.addImplicitDefUseOperands(MF&: *MBB.getParent());
8526 MRI.replaceRegWith(FromReg: OldDstReg, ToReg: ResultReg);
8527 MachineBasicBlock *NewBB = legalizeOperands(MI&: Inst, MDT);
8528
8529 addUsersToMoveToVALUWorklist(Reg: ResultReg, MRI, Worklist);
8530 return std::pair(true, NewBB);
8531 }
8532
8533 return std::pair(false, nullptr);
8534}
8535
8536void SIInstrInfo::lowerSelect(SIInstrWorklist &Worklist, MachineInstr &Inst,
8537 MachineDominatorTree *MDT) const {
8538
8539 MachineBasicBlock &MBB = *Inst.getParent();
8540 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
8541 MachineBasicBlock::iterator MII = Inst;
8542 const DebugLoc &DL = Inst.getDebugLoc();
8543
8544 MachineOperand &Dest = Inst.getOperand(i: 0);
8545 MachineOperand &Src0 = Inst.getOperand(i: 1);
8546 MachineOperand &Src1 = Inst.getOperand(i: 2);
8547 MachineOperand &Cond = Inst.getOperand(i: 3);
8548
8549 Register CondReg = Cond.getReg();
8550 bool IsSCC = (CondReg == AMDGPU::SCC);
8551
8552 // If this is a trivial select where the condition is effectively not SCC
8553 // (CondReg is a source of a copy to SCC), then the select is semantically
8554 // equivalent to copying CondReg. Hence, there is no need to create a
8555 // V_CNDMASK; we can just use CondReg directly and bail out.
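// E.g. %d = S_CSELECT_B32 -1, 0 (with SCC fed by a copy of %cond) is
// effectively just %d = %cond when %cond is a lane mask.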
8556 if (!IsSCC && Src0.isImm() && (Src0.getImm() == -1) && Src1.isImm() &&
8557 (Src1.getImm() == 0)) {
8558 MRI.replaceRegWith(FromReg: Dest.getReg(), ToReg: CondReg);
8559 return;
8560 }
8561
8562 Register NewCondReg = CondReg;
8563 if (IsSCC) {
8564 const TargetRegisterClass *TC = RI.getWaveMaskRegClass();
8565 NewCondReg = MRI.createVirtualRegister(RegClass: TC);
8566
8567 // Now look for the closest SCC def; if it is a copy, replace CondReg
8568 // with the COPY's source register.
8569 bool CopyFound = false;
8570 for (MachineInstr &CandI :
8571 make_range(x: std::next(x: MachineBasicBlock::reverse_iterator(Inst)),
8572 y: Inst.getParent()->rend())) {
8573 if (CandI.findRegisterDefOperandIdx(Reg: AMDGPU::SCC, TRI: &RI, isDead: false, Overlap: false) !=
8574 -1) {
8575 if (CandI.isCopy() && CandI.getOperand(i: 0).getReg() == AMDGPU::SCC) {
8576 BuildMI(BB&: MBB, I: MII, MIMD: DL, MCID: get(Opcode: AMDGPU::COPY), DestReg: NewCondReg)
8577 .addReg(RegNo: CandI.getOperand(i: 1).getReg());
8578 CopyFound = true;
8579 }
8580 break;
8581 }
8582 }
8583 if (!CopyFound) {
8584 // The SCC def is not a copy.
8585 // Insert a trivial select instead of creating a copy, because a copy from
8586 // SCC would semantically mean just copying a single bit, but we may need
8587 // the result to be a vector condition mask that needs preserving.
8588 unsigned Opcode =
8589 ST.isWave64() ? AMDGPU::S_CSELECT_B64 : AMDGPU::S_CSELECT_B32;
8590 auto NewSelect =
8591 BuildMI(BB&: MBB, I: MII, MIMD: DL, MCID: get(Opcode), DestReg: NewCondReg).addImm(Val: -1).addImm(Val: 0);
8592 NewSelect->getOperand(i: 3).setIsUndef(Cond.isUndef());
8593 }
8594 }
8595
8596 Register NewDestReg = MRI.createVirtualRegister(
8597 RegClass: RI.getEquivalentVGPRClass(SRC: MRI.getRegClass(Reg: Dest.getReg())));
8598 MachineInstr *NewInst;
8599 if (Inst.getOpcode() == AMDGPU::S_CSELECT_B32) {
8600 NewInst = BuildMI(BB&: MBB, I: MII, MIMD: DL, MCID: get(Opcode: AMDGPU::V_CNDMASK_B32_e64), DestReg: NewDestReg)
8601 .addImm(Val: 0)
8602 .add(MO: Src1) // False
8603 .addImm(Val: 0)
8604 .add(MO: Src0) // True
8605 .addReg(RegNo: NewCondReg);
8606 } else {
8607 NewInst =
8608 BuildMI(BB&: MBB, I: MII, MIMD: DL, MCID: get(Opcode: AMDGPU::V_CNDMASK_B64_PSEUDO), DestReg: NewDestReg)
8609 .add(MO: Src1) // False
8610 .add(MO: Src0) // True
8611 .addReg(RegNo: NewCondReg);
8612 }
8613 MRI.replaceRegWith(FromReg: Dest.getReg(), ToReg: NewDestReg);
8614 legalizeOperands(MI&: *NewInst, MDT);
8615 addUsersToMoveToVALUWorklist(Reg: NewDestReg, MRI, Worklist);
8616}
8617
8618void SIInstrInfo::lowerScalarAbs(SIInstrWorklist &Worklist,
8619 MachineInstr &Inst) const {
8620 MachineBasicBlock &MBB = *Inst.getParent();
8621 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
8622 MachineBasicBlock::iterator MII = Inst;
8623 const DebugLoc &DL = Inst.getDebugLoc();
8624
8625 MachineOperand &Dest = Inst.getOperand(i: 0);
8626 MachineOperand &Src = Inst.getOperand(i: 1);
8627 Register TmpReg = MRI.createVirtualRegister(RegClass: &AMDGPU::VGPR_32RegClass);
8628 Register ResultReg = MRI.createVirtualRegister(RegClass: &AMDGPU::VGPR_32RegClass);
8629
8630 unsigned SubOp = ST.hasAddNoCarryInsts() ? AMDGPU::V_SUB_U32_e32
8631 : AMDGPU::V_SUB_CO_U32_e32;
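// abs(x) is computed as max(x, 0 - x) with a signed max; e.g. for x = -5
// the subtraction yields 5 and max(-5, 5) = 5.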
8632
8633 BuildMI(BB&: MBB, I: MII, MIMD: DL, MCID: get(Opcode: SubOp), DestReg: TmpReg)
8634 .addImm(Val: 0)
8635 .addReg(RegNo: Src.getReg());
8636
8637 BuildMI(BB&: MBB, I: MII, MIMD: DL, MCID: get(Opcode: AMDGPU::V_MAX_I32_e64), DestReg: ResultReg)
8638 .addReg(RegNo: Src.getReg())
8639 .addReg(RegNo: TmpReg);
8640
8641 MRI.replaceRegWith(FromReg: Dest.getReg(), ToReg: ResultReg);
8642 addUsersToMoveToVALUWorklist(Reg: ResultReg, MRI, Worklist);
8643}
8644
8645void SIInstrInfo::lowerScalarAbsDiff(SIInstrWorklist &Worklist,
8646 MachineInstr &Inst) const {
8647 MachineBasicBlock &MBB = *Inst.getParent();
8648 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
8649 MachineBasicBlock::iterator MII = Inst;
8650 const DebugLoc &DL = Inst.getDebugLoc();
8651
8652 MachineOperand &Dest = Inst.getOperand(i: 0);
8653 MachineOperand &Src1 = Inst.getOperand(i: 1);
8654 MachineOperand &Src2 = Inst.getOperand(i: 2);
8655 Register SubResultReg = MRI.createVirtualRegister(RegClass: &AMDGPU::VGPR_32RegClass);
8656 Register TmpReg = MRI.createVirtualRegister(RegClass: &AMDGPU::VGPR_32RegClass);
8657 Register ResultReg = MRI.createVirtualRegister(RegClass: &AMDGPU::VGPR_32RegClass);
8658
8659 unsigned SubOp = ST.hasAddNoCarryInsts() ? AMDGPU::V_SUB_U32_e32
8660 : AMDGPU::V_SUB_CO_U32_e32;
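// absdiff(a, b) is computed as max(a - b, -(a - b)) with a signed max;
// e.g. for a = 3, b = 7 the subtraction gives -4 and max(-4, 4) = 4.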
8661
8662 BuildMI(BB&: MBB, I: MII, MIMD: DL, MCID: get(Opcode: SubOp), DestReg: SubResultReg)
8663 .addReg(RegNo: Src1.getReg())
8664 .addReg(RegNo: Src2.getReg());
8665
8666 BuildMI(BB&: MBB, I: MII, MIMD: DL, MCID: get(Opcode: SubOp), DestReg: TmpReg).addImm(Val: 0).addReg(RegNo: SubResultReg);
8667
8668 BuildMI(BB&: MBB, I: MII, MIMD: DL, MCID: get(Opcode: AMDGPU::V_MAX_I32_e64), DestReg: ResultReg)
8669 .addReg(RegNo: SubResultReg)
8670 .addReg(RegNo: TmpReg);
8671
8672 MRI.replaceRegWith(FromReg: Dest.getReg(), ToReg: ResultReg);
8673 addUsersToMoveToVALUWorklist(Reg: ResultReg, MRI, Worklist);
8674}
8675
8676void SIInstrInfo::lowerScalarXnor(SIInstrWorklist &Worklist,
8677 MachineInstr &Inst) const {
8678 MachineBasicBlock &MBB = *Inst.getParent();
8679 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
8680 MachineBasicBlock::iterator MII = Inst;
8681 const DebugLoc &DL = Inst.getDebugLoc();
8682
8683 MachineOperand &Dest = Inst.getOperand(i: 0);
8684 MachineOperand &Src0 = Inst.getOperand(i: 1);
8685 MachineOperand &Src1 = Inst.getOperand(i: 2);
8686
8687 if (ST.hasDLInsts()) {
8688 Register NewDest = MRI.createVirtualRegister(RegClass: &AMDGPU::VGPR_32RegClass);
8689 legalizeGenericOperand(InsertMBB&: MBB, I: MII, DstRC: &AMDGPU::VGPR_32RegClass, Op&: Src0, MRI, DL);
8690 legalizeGenericOperand(InsertMBB&: MBB, I: MII, DstRC: &AMDGPU::VGPR_32RegClass, Op&: Src1, MRI, DL);
8691
8692 BuildMI(BB&: MBB, I: MII, MIMD: DL, MCID: get(Opcode: AMDGPU::V_XNOR_B32_e64), DestReg: NewDest)
8693 .add(MO: Src0)
8694 .add(MO: Src1);
8695
8696 MRI.replaceRegWith(FromReg: Dest.getReg(), ToReg: NewDest);
8697 addUsersToMoveToVALUWorklist(Reg: NewDest, MRI, Worklist);
8698 } else {
8699 // Using the identity !(x ^ y) == (!x ^ y) == (x ^ !y), we can
8700 // invert either source and then perform the XOR. If either source is a
8701 // scalar register, then we can leave the inversion on the scalar unit to
8702 // achieve a better distribution of scalar and vector instructions.
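// E.g. for 32-bit x = 1, y = 0: ~(1 ^ 0) = 0xFFFFFFFE, and likewise
// (~1) ^ 0 = 0xFFFFFFFE and 1 ^ (~0) = 0xFFFFFFFE.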
8703 bool Src0IsSGPR = Src0.isReg() &&
8704 RI.isSGPRClass(RC: MRI.getRegClass(Reg: Src0.getReg()));
8705 bool Src1IsSGPR = Src1.isReg() &&
8706 RI.isSGPRClass(RC: MRI.getRegClass(Reg: Src1.getReg()));
8707 MachineInstr *Xor;
8708 Register Temp = MRI.createVirtualRegister(RegClass: &AMDGPU::SReg_32RegClass);
8709 Register NewDest = MRI.createVirtualRegister(RegClass: &AMDGPU::SReg_32RegClass);
8710
8711 // Build a pair of scalar instructions and add them to the work list.
8712 // The next iteration over the work list will lower these to the vector
8713 // unit as necessary.
8714 if (Src0IsSGPR) {
8715 BuildMI(BB&: MBB, I: MII, MIMD: DL, MCID: get(Opcode: AMDGPU::S_NOT_B32), DestReg: Temp).add(MO: Src0);
8716 Xor = BuildMI(BB&: MBB, I: MII, MIMD: DL, MCID: get(Opcode: AMDGPU::S_XOR_B32), DestReg: NewDest)
8717 .addReg(RegNo: Temp)
8718 .add(MO: Src1);
8719 } else if (Src1IsSGPR) {
8720 BuildMI(BB&: MBB, I: MII, MIMD: DL, MCID: get(Opcode: AMDGPU::S_NOT_B32), DestReg: Temp).add(MO: Src1);
8721 Xor = BuildMI(BB&: MBB, I: MII, MIMD: DL, MCID: get(Opcode: AMDGPU::S_XOR_B32), DestReg: NewDest)
8722 .add(MO: Src0)
8723 .addReg(RegNo: Temp);
8724 } else {
8725 Xor = BuildMI(BB&: MBB, I: MII, MIMD: DL, MCID: get(Opcode: AMDGPU::S_XOR_B32), DestReg: Temp)
8726 .add(MO: Src0)
8727 .add(MO: Src1);
8728 MachineInstr *Not =
8729 BuildMI(BB&: MBB, I: MII, MIMD: DL, MCID: get(Opcode: AMDGPU::S_NOT_B32), DestReg: NewDest).addReg(RegNo: Temp);
8730 Worklist.insert(MI: Not);
8731 }
8732
8733 MRI.replaceRegWith(FromReg: Dest.getReg(), ToReg: NewDest);
8734
8735 Worklist.insert(MI: Xor);
8736
8737 addUsersToMoveToVALUWorklist(Reg: NewDest, MRI, Worklist);
8738 }
8739}
8740
8741void SIInstrInfo::splitScalarNotBinop(SIInstrWorklist &Worklist,
8742 MachineInstr &Inst,
8743 unsigned Opcode) const {
8744 MachineBasicBlock &MBB = *Inst.getParent();
8745 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
8746 MachineBasicBlock::iterator MII = Inst;
8747 const DebugLoc &DL = Inst.getDebugLoc();
8748
8749 MachineOperand &Dest = Inst.getOperand(i: 0);
8750 MachineOperand &Src0 = Inst.getOperand(i: 1);
8751 MachineOperand &Src1 = Inst.getOperand(i: 2);
8752
8753 Register NewDest = MRI.createVirtualRegister(RegClass: &AMDGPU::SReg_32RegClass);
8754 Register Interm = MRI.createVirtualRegister(RegClass: &AMDGPU::SReg_32RegClass);
8755
8756 MachineInstr &Op = *BuildMI(BB&: MBB, I: MII, MIMD: DL, MCID: get(Opcode), DestReg: Interm)
8757 .add(MO: Src0)
8758 .add(MO: Src1);
8759
8760 MachineInstr &Not = *BuildMI(BB&: MBB, I: MII, MIMD: DL, MCID: get(Opcode: AMDGPU::S_NOT_B32), DestReg: NewDest)
8761 .addReg(RegNo: Interm);
8762
8763 Worklist.insert(MI: &Op);
8764 Worklist.insert(MI: &Not);
8765
8766 MRI.replaceRegWith(FromReg: Dest.getReg(), ToReg: NewDest);
8767 addUsersToMoveToVALUWorklist(Reg: NewDest, MRI, Worklist);
8768}
8769
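// Lower a scalar binop whose second operand is inverted (e.g. S_ANDN2_B32,
// S_ORN2_B32) by inverting src1 with S_NOT_B32 and emitting the base binop.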
8770void SIInstrInfo::splitScalarBinOpN2(SIInstrWorklist &Worklist,
8771 MachineInstr &Inst,
8772 unsigned Opcode) const {
8773 MachineBasicBlock &MBB = *Inst.getParent();
8774 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
8775 MachineBasicBlock::iterator MII = Inst;
8776 const DebugLoc &DL = Inst.getDebugLoc();
8777
8778 MachineOperand &Dest = Inst.getOperand(i: 0);
8779 MachineOperand &Src0 = Inst.getOperand(i: 1);
8780 MachineOperand &Src1 = Inst.getOperand(i: 2);
8781
8782 Register NewDest = MRI.createVirtualRegister(RegClass: &AMDGPU::SReg_32_XM0RegClass);
8783 Register Interm = MRI.createVirtualRegister(RegClass: &AMDGPU::SReg_32_XM0RegClass);
8784
8785 MachineInstr &Not = *BuildMI(BB&: MBB, I: MII, MIMD: DL, MCID: get(Opcode: AMDGPU::S_NOT_B32), DestReg: Interm)
8786 .add(MO: Src1);
8787
8788 MachineInstr &Op = *BuildMI(BB&: MBB, I: MII, MIMD: DL, MCID: get(Opcode), DestReg: NewDest)
8789 .add(MO: Src0)
8790 .addReg(RegNo: Interm);
8791
8792 Worklist.insert(MI: &Not);
8793 Worklist.insert(MI: &Op);
8794
8795 MRI.replaceRegWith(FromReg: Dest.getReg(), ToReg: NewDest);
8796 addUsersToMoveToVALUWorklist(Reg: NewDest, MRI, Worklist);
8797}
8798
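// Split a 64-bit scalar unary operation into two 32-bit operations on the
// sub0/sub1 halves and recombine the results with a REG_SEQUENCE.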
8799void SIInstrInfo::splitScalar64BitUnaryOp(SIInstrWorklist &Worklist,
8800 MachineInstr &Inst, unsigned Opcode,
8801 bool Swap) const {
8802 MachineBasicBlock &MBB = *Inst.getParent();
8803 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
8804
8805 MachineOperand &Dest = Inst.getOperand(i: 0);
8806 MachineOperand &Src0 = Inst.getOperand(i: 1);
8807 const DebugLoc &DL = Inst.getDebugLoc();
8808
8809 MachineBasicBlock::iterator MII = Inst;
8810
8811 const MCInstrDesc &InstDesc = get(Opcode);
8812 const TargetRegisterClass *Src0RC = Src0.isReg() ?
8813 MRI.getRegClass(Reg: Src0.getReg()) :
8814 &AMDGPU::SGPR_32RegClass;
8815
8816 const TargetRegisterClass *Src0SubRC =
8817 RI.getSubRegisterClass(Src0RC, AMDGPU::sub0);
8818
8819 MachineOperand SrcReg0Sub0 = buildExtractSubRegOrImm(MII, MRI, Op: Src0, SuperRC: Src0RC,
8820 SubIdx: AMDGPU::sub0, SubRC: Src0SubRC);
8821
8822 const TargetRegisterClass *DestRC = MRI.getRegClass(Reg: Dest.getReg());
8823 const TargetRegisterClass *NewDestRC = RI.getEquivalentVGPRClass(SRC: DestRC);
8824 const TargetRegisterClass *NewDestSubRC =
8825 RI.getSubRegisterClass(NewDestRC, AMDGPU::sub0);
8826
8827 Register DestSub0 = MRI.createVirtualRegister(RegClass: NewDestSubRC);
8828 MachineInstr &LoHalf = *BuildMI(BB&: MBB, I: MII, MIMD: DL, MCID: InstDesc, DestReg: DestSub0).add(MO: SrcReg0Sub0);
8829
8830 MachineOperand SrcReg0Sub1 = buildExtractSubRegOrImm(MII, MRI, Op: Src0, SuperRC: Src0RC,
8831 SubIdx: AMDGPU::sub1, SubRC: Src0SubRC);
8832
8833 Register DestSub1 = MRI.createVirtualRegister(RegClass: NewDestSubRC);
8834 MachineInstr &HiHalf = *BuildMI(BB&: MBB, I: MII, MIMD: DL, MCID: InstDesc, DestReg: DestSub1).add(MO: SrcReg0Sub1);
8835
8836 if (Swap)
8837 std::swap(a&: DestSub0, b&: DestSub1);
8838
8839 Register FullDestReg = MRI.createVirtualRegister(RegClass: NewDestRC);
8840 BuildMI(BB&: MBB, I: MII, MIMD: DL, MCID: get(Opcode: TargetOpcode::REG_SEQUENCE), DestReg: FullDestReg)
8841 .addReg(RegNo: DestSub0)
8842 .addImm(Val: AMDGPU::sub0)
8843 .addReg(RegNo: DestSub1)
8844 .addImm(Val: AMDGPU::sub1);
8845
8846 MRI.replaceRegWith(FromReg: Dest.getReg(), ToReg: FullDestReg);
8847
8848 Worklist.insert(MI: &LoHalf);
8849 Worklist.insert(MI: &HiHalf);
8850
8851 // We don't need to legalizeOperands here because for a single operand, src0
8852 // will support any kind of input.
8853
8854 // Move all users of this moved value.
8855 addUsersToMoveToVALUWorklist(Reg: FullDestReg, MRI, Worklist);
8856}
8857
8858 // There is no vector equivalent of s_mul_u64. For this reason, we need to
8859 // split s_mul_u64 into 32-bit vector multiplications.
8860void SIInstrInfo::splitScalarSMulU64(SIInstrWorklist &Worklist,
8861 MachineInstr &Inst,
8862 MachineDominatorTree *MDT) const {
8863 MachineBasicBlock &MBB = *Inst.getParent();
8864 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
8865
8866 Register FullDestReg = MRI.createVirtualRegister(RegClass: &AMDGPU::VReg_64RegClass);
8867 Register DestSub0 = MRI.createVirtualRegister(RegClass: &AMDGPU::VGPR_32RegClass);
8868 Register DestSub1 = MRI.createVirtualRegister(RegClass: &AMDGPU::VGPR_32RegClass);
8869
8870 MachineOperand &Dest = Inst.getOperand(i: 0);
8871 MachineOperand &Src0 = Inst.getOperand(i: 1);
8872 MachineOperand &Src1 = Inst.getOperand(i: 2);
8873 const DebugLoc &DL = Inst.getDebugLoc();
8874 MachineBasicBlock::iterator MII = Inst;
8875
8876 const TargetRegisterClass *Src0RC = MRI.getRegClass(Reg: Src0.getReg());
8877 const TargetRegisterClass *Src1RC = MRI.getRegClass(Reg: Src1.getReg());
8878 const TargetRegisterClass *Src0SubRC =
8879 RI.getSubRegisterClass(Src0RC, AMDGPU::sub0);
8880 if (RI.isSGPRClass(RC: Src0SubRC))
8881 Src0SubRC = RI.getEquivalentVGPRClass(SRC: Src0SubRC);
8882 const TargetRegisterClass *Src1SubRC =
8883 RI.getSubRegisterClass(Src1RC, AMDGPU::sub0);
8884 if (RI.isSGPRClass(RC: Src1SubRC))
8885 Src1SubRC = RI.getEquivalentVGPRClass(SRC: Src1SubRC);
8886
8887 // First, we extract the low 32-bit and high 32-bit values from each of the
8888 // operands.
8889 MachineOperand Op0L =
8890 buildExtractSubRegOrImm(MII, MRI, Op: Src0, SuperRC: Src0RC, SubIdx: AMDGPU::sub0, SubRC: Src0SubRC);
8891 MachineOperand Op1L =
8892 buildExtractSubRegOrImm(MII, MRI, Op: Src1, SuperRC: Src1RC, SubIdx: AMDGPU::sub0, SubRC: Src1SubRC);
8893 MachineOperand Op0H =
8894 buildExtractSubRegOrImm(MII, MRI, Op: Src0, SuperRC: Src0RC, SubIdx: AMDGPU::sub1, SubRC: Src0SubRC);
8895 MachineOperand Op1H =
8896 buildExtractSubRegOrImm(MII, MRI, Op: Src1, SuperRC: Src1RC, SubIdx: AMDGPU::sub1, SubRC: Src1SubRC);
8897
8898 // The multiplication is done as follows:
8899 //
8900 // Op1H Op1L
8901 // * Op0H Op0L
8902 // --------------------
8903 // Op1H*Op0L Op1L*Op0L
8904 // + Op1H*Op0H Op1L*Op0H
8905 // -----------------------------------------
8906 // (Op1H*Op0L + Op1L*Op0H + carry) Op1L*Op0L
8907 //
8908 // We drop Op1H*Op0H because its contribution, (Op1H*Op0H) << 64, lies
8909 // entirely above bit 63 of the 64-bit result.
8910 // The low 32-bit value is Op1L*Op0L.
8911 // The high 32-bit value is Op1H*Op0L + Op1L*Op0H + carry (from Op1L*Op0L).
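  // Worked example: Op0 = 0x0000000100000002, Op1 = 0x0000000300000004.
  // Then Op1L*Op0L = 8 (carry 0), Op1H*Op0L = 6, Op1L*Op0H = 4, so the
  // high half is 6 + 4 + 0 = 10 and the result is 0x0000000A00000008.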
8912
8913 Register Op1L_Op0H_Reg = MRI.createVirtualRegister(RegClass: &AMDGPU::VGPR_32RegClass);
8914 MachineInstr *Op1L_Op0H =
8915 BuildMI(BB&: MBB, I: MII, MIMD: DL, MCID: get(Opcode: AMDGPU::V_MUL_LO_U32_e64), DestReg: Op1L_Op0H_Reg)
8916 .add(MO: Op1L)
8917 .add(MO: Op0H);
8918
8919 Register Op1H_Op0L_Reg = MRI.createVirtualRegister(RegClass: &AMDGPU::VGPR_32RegClass);
8920 MachineInstr *Op1H_Op0L =
8921 BuildMI(BB&: MBB, I: MII, MIMD: DL, MCID: get(Opcode: AMDGPU::V_MUL_LO_U32_e64), DestReg: Op1H_Op0L_Reg)
8922 .add(MO: Op1H)
8923 .add(MO: Op0L);
8924
8925 Register CarryReg = MRI.createVirtualRegister(RegClass: &AMDGPU::VGPR_32RegClass);
8926 MachineInstr *Carry =
8927 BuildMI(BB&: MBB, I: MII, MIMD: DL, MCID: get(Opcode: AMDGPU::V_MUL_HI_U32_e64), DestReg: CarryReg)
8928 .add(MO: Op1L)
8929 .add(MO: Op0L);
8930
8931 MachineInstr *LoHalf =
8932 BuildMI(BB&: MBB, I: MII, MIMD: DL, MCID: get(Opcode: AMDGPU::V_MUL_LO_U32_e64), DestReg: DestSub0)
8933 .add(MO: Op1L)
8934 .add(MO: Op0L);
8935
8936 Register AddReg = MRI.createVirtualRegister(RegClass: &AMDGPU::VGPR_32RegClass);
8937 MachineInstr *Add = BuildMI(BB&: MBB, I: MII, MIMD: DL, MCID: get(Opcode: AMDGPU::V_ADD_U32_e32), DestReg: AddReg)
8938 .addReg(RegNo: Op1L_Op0H_Reg)
8939 .addReg(RegNo: Op1H_Op0L_Reg);
8940
8941 MachineInstr *HiHalf =
8942 BuildMI(BB&: MBB, I: MII, MIMD: DL, MCID: get(Opcode: AMDGPU::V_ADD_U32_e32), DestReg: DestSub1)
8943 .addReg(RegNo: AddReg)
8944 .addReg(RegNo: CarryReg);
8945
8946 BuildMI(BB&: MBB, I: MII, MIMD: DL, MCID: get(Opcode: TargetOpcode::REG_SEQUENCE), DestReg: FullDestReg)
8947 .addReg(RegNo: DestSub0)
8948 .addImm(Val: AMDGPU::sub0)
8949 .addReg(RegNo: DestSub1)
8950 .addImm(Val: AMDGPU::sub1);
8951
8952 MRI.replaceRegWith(FromReg: Dest.getReg(), ToReg: FullDestReg);
8953
8954 // Try to legalize the operands in case we need to swap the order to keep it
8955 // valid.
8956 legalizeOperands(MI&: *Op1L_Op0H, MDT);
8957 legalizeOperands(MI&: *Op1H_Op0L, MDT);
8958 legalizeOperands(MI&: *Carry, MDT);
8959 legalizeOperands(MI&: *LoHalf, MDT);
8960 legalizeOperands(MI&: *Add, MDT);
8961 legalizeOperands(MI&: *HiHalf, MDT);
8962
8963 // Move all users of this moved value.
8964 addUsersToMoveToVALUWorklist(Reg: FullDestReg, MRI, Worklist);
8965}
8966
8967 // Lower S_MUL_U64_U32_PSEUDO/S_MUL_I64_I32_PSEUDO into two 32-bit vector
8968 // multiplications.
8969void SIInstrInfo::splitScalarSMulPseudo(SIInstrWorklist &Worklist,
8970 MachineInstr &Inst,
8971 MachineDominatorTree *MDT) const {
8972 MachineBasicBlock &MBB = *Inst.getParent();
8973 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
8974
8975 Register FullDestReg = MRI.createVirtualRegister(RegClass: &AMDGPU::VReg_64RegClass);
8976 Register DestSub0 = MRI.createVirtualRegister(RegClass: &AMDGPU::VGPR_32RegClass);
8977 Register DestSub1 = MRI.createVirtualRegister(RegClass: &AMDGPU::VGPR_32RegClass);
8978
8979 MachineOperand &Dest = Inst.getOperand(i: 0);
8980 MachineOperand &Src0 = Inst.getOperand(i: 1);
8981 MachineOperand &Src1 = Inst.getOperand(i: 2);
8982 const DebugLoc &DL = Inst.getDebugLoc();
8983 MachineBasicBlock::iterator MII = Inst;
8984
8985 const TargetRegisterClass *Src0RC = MRI.getRegClass(Reg: Src0.getReg());
8986 const TargetRegisterClass *Src1RC = MRI.getRegClass(Reg: Src1.getReg());
8987 const TargetRegisterClass *Src0SubRC =
8988 RI.getSubRegisterClass(Src0RC, AMDGPU::sub0);
8989 if (RI.isSGPRClass(RC: Src0SubRC))
8990 Src0SubRC = RI.getEquivalentVGPRClass(SRC: Src0SubRC);
8991 const TargetRegisterClass *Src1SubRC =
8992 RI.getSubRegisterClass(Src1RC, AMDGPU::sub0);
8993 if (RI.isSGPRClass(RC: Src1SubRC))
8994 Src1SubRC = RI.getEquivalentVGPRClass(SRC: Src1SubRC);
8995
8996 // First, we extract the low 32-bit and high 32-bit values from each of the
8997 // operands.
8998 MachineOperand Op0L =
8999 buildExtractSubRegOrImm(MII, MRI, Op: Src0, SuperRC: Src0RC, SubIdx: AMDGPU::sub0, SubRC: Src0SubRC);
9000 MachineOperand Op1L =
9001 buildExtractSubRegOrImm(MII, MRI, Op: Src1, SuperRC: Src1RC, SubIdx: AMDGPU::sub0, SubRC: Src1SubRC);
9002
9003 unsigned Opc = Inst.getOpcode();
9004 unsigned NewOpc = Opc == AMDGPU::S_MUL_U64_U32_PSEUDO
9005 ? AMDGPU::V_MUL_HI_U32_e64
9006 : AMDGPU::V_MUL_HI_I32_e64;
9007 MachineInstr *HiHalf =
9008 BuildMI(BB&: MBB, I: MII, MIMD: DL, MCID: get(Opcode: NewOpc), DestReg: DestSub1).add(MO: Op1L).add(MO: Op0L);
9009
9010 MachineInstr *LoHalf =
9011 BuildMI(BB&: MBB, I: MII, MIMD: DL, MCID: get(Opcode: AMDGPU::V_MUL_LO_U32_e64), DestReg: DestSub0)
9012 .add(MO: Op1L)
9013 .add(MO: Op0L);
9014
9015 BuildMI(BB&: MBB, I: MII, MIMD: DL, MCID: get(Opcode: TargetOpcode::REG_SEQUENCE), DestReg: FullDestReg)
9016 .addReg(RegNo: DestSub0)
9017 .addImm(Val: AMDGPU::sub0)
9018 .addReg(RegNo: DestSub1)
9019 .addImm(Val: AMDGPU::sub1);
9020
9021 MRI.replaceRegWith(FromReg: Dest.getReg(), ToReg: FullDestReg);
9022
9023 // Try to legalize the operands in case we need to swap the order to keep it
9024 // valid.
9025 legalizeOperands(MI&: *HiHalf, MDT);
9026 legalizeOperands(MI&: *LoHalf, MDT);
9027
9028 // Move all users of this moved value.
9029 addUsersToMoveToVALUWorklist(Reg: FullDestReg, MRI, Worklist);
9030}
9031
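// Split a 64-bit scalar binary operation into two 32-bit operations on the
// sub0/sub1 halves of each source and recombine with a REG_SEQUENCE.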
9032void SIInstrInfo::splitScalar64BitBinaryOp(SIInstrWorklist &Worklist,
9033 MachineInstr &Inst, unsigned Opcode,
9034 MachineDominatorTree *MDT) const {
9035 MachineBasicBlock &MBB = *Inst.getParent();
9036 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
9037
9038 MachineOperand &Dest = Inst.getOperand(i: 0);
9039 MachineOperand &Src0 = Inst.getOperand(i: 1);
9040 MachineOperand &Src1 = Inst.getOperand(i: 2);
9041 const DebugLoc &DL = Inst.getDebugLoc();
9042
9043 MachineBasicBlock::iterator MII = Inst;
9044
9045 const MCInstrDesc &InstDesc = get(Opcode);
9046 const TargetRegisterClass *Src0RC = Src0.isReg() ?
9047 MRI.getRegClass(Reg: Src0.getReg()) :
9048 &AMDGPU::SGPR_32RegClass;
9049
9050 const TargetRegisterClass *Src0SubRC =
9051 RI.getSubRegisterClass(Src0RC, AMDGPU::sub0);
9052 const TargetRegisterClass *Src1RC = Src1.isReg() ?
9053 MRI.getRegClass(Reg: Src1.getReg()) :
9054 &AMDGPU::SGPR_32RegClass;
9055
9056 const TargetRegisterClass *Src1SubRC =
9057 RI.getSubRegisterClass(Src1RC, AMDGPU::sub0);
9058
9059 MachineOperand SrcReg0Sub0 = buildExtractSubRegOrImm(MII, MRI, Op: Src0, SuperRC: Src0RC,
9060 SubIdx: AMDGPU::sub0, SubRC: Src0SubRC);
9061 MachineOperand SrcReg1Sub0 = buildExtractSubRegOrImm(MII, MRI, Op: Src1, SuperRC: Src1RC,
9062 SubIdx: AMDGPU::sub0, SubRC: Src1SubRC);
9063 MachineOperand SrcReg0Sub1 = buildExtractSubRegOrImm(MII, MRI, Op: Src0, SuperRC: Src0RC,
9064 SubIdx: AMDGPU::sub1, SubRC: Src0SubRC);
9065 MachineOperand SrcReg1Sub1 = buildExtractSubRegOrImm(MII, MRI, Op: Src1, SuperRC: Src1RC,
9066 SubIdx: AMDGPU::sub1, SubRC: Src1SubRC);
9067
9068 const TargetRegisterClass *DestRC = MRI.getRegClass(Reg: Dest.getReg());
9069 const TargetRegisterClass *NewDestRC = RI.getEquivalentVGPRClass(SRC: DestRC);
9070 const TargetRegisterClass *NewDestSubRC =
9071 RI.getSubRegisterClass(NewDestRC, AMDGPU::sub0);
9072
9073 Register DestSub0 = MRI.createVirtualRegister(RegClass: NewDestSubRC);
9074 MachineInstr &LoHalf = *BuildMI(BB&: MBB, I: MII, MIMD: DL, MCID: InstDesc, DestReg: DestSub0)
9075 .add(MO: SrcReg0Sub0)
9076 .add(MO: SrcReg1Sub0);
9077
9078 Register DestSub1 = MRI.createVirtualRegister(RegClass: NewDestSubRC);
9079 MachineInstr &HiHalf = *BuildMI(BB&: MBB, I: MII, MIMD: DL, MCID: InstDesc, DestReg: DestSub1)
9080 .add(MO: SrcReg0Sub1)
9081 .add(MO: SrcReg1Sub1);
9082
9083 Register FullDestReg = MRI.createVirtualRegister(RegClass: NewDestRC);
9084 BuildMI(BB&: MBB, I: MII, MIMD: DL, MCID: get(Opcode: TargetOpcode::REG_SEQUENCE), DestReg: FullDestReg)
9085 .addReg(RegNo: DestSub0)
9086 .addImm(Val: AMDGPU::sub0)
9087 .addReg(RegNo: DestSub1)
9088 .addImm(Val: AMDGPU::sub1);
9089
9090 MRI.replaceRegWith(FromReg: Dest.getReg(), ToReg: FullDestReg);
9091
9092 Worklist.insert(MI: &LoHalf);
9093 Worklist.insert(MI: &HiHalf);
9094
9095 // Move all users of this moved value.
9096 addUsersToMoveToVALUWorklist(Reg: FullDestReg, MRI, Worklist);
9097}
9098
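// Lower 64-bit S_XNOR using the identity xnor(x, y) == xor(not(x), y),
// preferring to invert whichever source is known to be an SGPR.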
9099void SIInstrInfo::splitScalar64BitXnor(SIInstrWorklist &Worklist,
9100 MachineInstr &Inst,
9101 MachineDominatorTree *MDT) const {
9102 MachineBasicBlock &MBB = *Inst.getParent();
9103 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
9104
9105 MachineOperand &Dest = Inst.getOperand(i: 0);
9106 MachineOperand &Src0 = Inst.getOperand(i: 1);
9107 MachineOperand &Src1 = Inst.getOperand(i: 2);
9108 const DebugLoc &DL = Inst.getDebugLoc();
9109
9110 MachineBasicBlock::iterator MII = Inst;
9111
9112 const TargetRegisterClass *DestRC = MRI.getRegClass(Reg: Dest.getReg());
9113
9114 Register Interm = MRI.createVirtualRegister(RegClass: &AMDGPU::SReg_64RegClass);
9115
9116 MachineOperand* Op0;
9117 MachineOperand* Op1;
9118
9119 if (Src0.isReg() && RI.isSGPRReg(MRI, Reg: Src0.getReg())) {
9120 Op0 = &Src0;
9121 Op1 = &Src1;
9122 } else {
9123 Op0 = &Src1;
9124 Op1 = &Src0;
9125 }
9126
9127 BuildMI(BB&: MBB, I: MII, MIMD: DL, MCID: get(Opcode: AMDGPU::S_NOT_B64), DestReg: Interm)
9128 .add(MO: *Op0);
9129
9130 Register NewDest = MRI.createVirtualRegister(RegClass: DestRC);
9131
9132 MachineInstr &Xor = *BuildMI(BB&: MBB, I: MII, MIMD: DL, MCID: get(Opcode: AMDGPU::S_XOR_B64), DestReg: NewDest)
9133 .addReg(RegNo: Interm)
9134 .add(MO: *Op1);
9135
9136 MRI.replaceRegWith(FromReg: Dest.getReg(), ToReg: NewDest);
9137
9138 Worklist.insert(MI: &Xor);
9139}
9140
9141void SIInstrInfo::splitScalar64BitBCNT(SIInstrWorklist &Worklist,
9142 MachineInstr &Inst) const {
9143 MachineBasicBlock &MBB = *Inst.getParent();
9144 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
9145
9146 MachineBasicBlock::iterator MII = Inst;
9147 const DebugLoc &DL = Inst.getDebugLoc();
9148
9149 MachineOperand &Dest = Inst.getOperand(i: 0);
9150 MachineOperand &Src = Inst.getOperand(i: 1);
9151
9152 const MCInstrDesc &InstDesc = get(Opcode: AMDGPU::V_BCNT_U32_B32_e64);
9153 const TargetRegisterClass *SrcRC = Src.isReg() ?
9154 MRI.getRegClass(Reg: Src.getReg()) :
9155 &AMDGPU::SGPR_32RegClass;
9156
9157 Register MidReg = MRI.createVirtualRegister(RegClass: &AMDGPU::VGPR_32RegClass);
9158 Register ResultReg = MRI.createVirtualRegister(RegClass: &AMDGPU::VGPR_32RegClass);
9159
9160 const TargetRegisterClass *SrcSubRC =
9161 RI.getSubRegisterClass(SrcRC, AMDGPU::sub0);
9162
9163 MachineOperand SrcRegSub0 = buildExtractSubRegOrImm(MII, MRI, Op: Src, SuperRC: SrcRC,
9164 SubIdx: AMDGPU::sub0, SubRC: SrcSubRC);
9165 MachineOperand SrcRegSub1 = buildExtractSubRegOrImm(MII, MRI, Op: Src, SuperRC: SrcRC,
9166 SubIdx: AMDGPU::sub1, SubRC: SrcSubRC);
9167
9168 BuildMI(BB&: MBB, I: MII, MIMD: DL, MCID: InstDesc, DestReg: MidReg).add(MO: SrcRegSub0).addImm(Val: 0);
9169
9170 BuildMI(BB&: MBB, I: MII, MIMD: DL, MCID: InstDesc, DestReg: ResultReg).add(MO: SrcRegSub1).addReg(RegNo: MidReg);
9171
9172 MRI.replaceRegWith(FromReg: Dest.getReg(), ToReg: ResultReg);
9173
9174 // We don't need to legalize operands here. src0 for either instruction can be
9175 // an SGPR, and the second input is unused or determined here.
9176 addUsersToMoveToVALUWorklist(Reg: ResultReg, MRI, Worklist);
9177}
9178
9179void SIInstrInfo::splitScalar64BitBFE(SIInstrWorklist &Worklist,
9180 MachineInstr &Inst) const {
9181 MachineBasicBlock &MBB = *Inst.getParent();
9182 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
9183 MachineBasicBlock::iterator MII = Inst;
9184 const DebugLoc &DL = Inst.getDebugLoc();
9185
9186 MachineOperand &Dest = Inst.getOperand(i: 0);
9187 uint32_t Imm = Inst.getOperand(i: 2).getImm();
9188 uint32_t Offset = Imm & 0x3f; // Extract bits [5:0].
9189 uint32_t BitWidth = (Imm & 0x7f0000) >> 16; // Extract bits [22:16].
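  // For example, Imm == 0x100000 encodes Offset == 0 and BitWidth == 16,
  // i.e. a sign extension from bit 15 (sext_inreg from i16).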
9190
9191 (void) Offset;
9192
9193 // Only sext_inreg cases handled.
9194 assert(Inst.getOpcode() == AMDGPU::S_BFE_I64 && BitWidth <= 32 &&
9195 Offset == 0 && "Not implemented");
9196
9197 if (BitWidth < 32) {
9198 Register MidRegLo = MRI.createVirtualRegister(RegClass: &AMDGPU::VGPR_32RegClass);
9199 Register MidRegHi = MRI.createVirtualRegister(RegClass: &AMDGPU::VGPR_32RegClass);
9200 Register ResultReg = MRI.createVirtualRegister(RegClass: &AMDGPU::VReg_64RegClass);
9201
9202 BuildMI(BB&: MBB, I: MII, MIMD: DL, MCID: get(Opcode: AMDGPU::V_BFE_I32_e64), DestReg: MidRegLo)
9203 .addReg(RegNo: Inst.getOperand(i: 1).getReg(), Flags: {}, SubReg: AMDGPU::sub0)
9204 .addImm(Val: 0)
9205 .addImm(Val: BitWidth);
9206
9207 BuildMI(BB&: MBB, I: MII, MIMD: DL, MCID: get(Opcode: AMDGPU::V_ASHRREV_I32_e32), DestReg: MidRegHi)
9208 .addImm(Val: 31)
9209 .addReg(RegNo: MidRegLo);
9210
9211 BuildMI(BB&: MBB, I: MII, MIMD: DL, MCID: get(Opcode: TargetOpcode::REG_SEQUENCE), DestReg: ResultReg)
9212 .addReg(RegNo: MidRegLo)
9213 .addImm(Val: AMDGPU::sub0)
9214 .addReg(RegNo: MidRegHi)
9215 .addImm(Val: AMDGPU::sub1);
9216
9217 MRI.replaceRegWith(FromReg: Dest.getReg(), ToReg: ResultReg);
9218 addUsersToMoveToVALUWorklist(Reg: ResultReg, MRI, Worklist);
9219 return;
9220 }
9221
9222 MachineOperand &Src = Inst.getOperand(i: 1);
9223 Register TmpReg = MRI.createVirtualRegister(RegClass: &AMDGPU::VGPR_32RegClass);
9224 Register ResultReg = MRI.createVirtualRegister(RegClass: &AMDGPU::VReg_64RegClass);
9225
9226 BuildMI(BB&: MBB, I: MII, MIMD: DL, MCID: get(Opcode: AMDGPU::V_ASHRREV_I32_e64), DestReg: TmpReg)
9227 .addImm(Val: 31)
9228 .addReg(RegNo: Src.getReg(), Flags: {}, SubReg: AMDGPU::sub0);
9229
9230 BuildMI(BB&: MBB, I: MII, MIMD: DL, MCID: get(Opcode: TargetOpcode::REG_SEQUENCE), DestReg: ResultReg)
9231 .addReg(RegNo: Src.getReg(), Flags: {}, SubReg: AMDGPU::sub0)
9232 .addImm(Val: AMDGPU::sub0)
9233 .addReg(RegNo: TmpReg)
9234 .addImm(Val: AMDGPU::sub1);
9235
9236 MRI.replaceRegWith(FromReg: Dest.getReg(), ToReg: ResultReg);
9237 addUsersToMoveToVALUWorklist(Reg: ResultReg, MRI, Worklist);
9238}
9239
9240void SIInstrInfo::splitScalar64BitCountOp(SIInstrWorklist &Worklist,
9241 MachineInstr &Inst, unsigned Opcode,
9242 MachineDominatorTree *MDT) const {
9243 // (S_FLBIT_I32_B64 hi:lo) ->
9244 //   (umin (V_FFBH_U32_e32 hi), (uaddsat (V_FFBH_U32_e32 lo), 32))
9245 // (S_FF1_I32_B64 hi:lo) ->
9246 //   (umin (uaddsat (V_FFBL_B32_e32 hi), 32), (V_FFBL_B32_e32 lo))
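  // For example (ctlz of 0x0000000000000001): hi == 0, so V_FFBH_U32(hi)
  // yields all ones (no bit found), while the lo path yields
  // uaddsat(V_FFBH_U32(1), 32) == 31 + 32 == 63; umin selects 63, the
  // correct 64-bit leading-zero count.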
9247
9248 MachineBasicBlock &MBB = *Inst.getParent();
9249 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
9250 MachineBasicBlock::iterator MII = Inst;
9251 const DebugLoc &DL = Inst.getDebugLoc();
9252
9253 MachineOperand &Dest = Inst.getOperand(i: 0);
9254 MachineOperand &Src = Inst.getOperand(i: 1);
9255
9256 const MCInstrDesc &InstDesc = get(Opcode);
9257
9258 bool IsCtlz = Opcode == AMDGPU::V_FFBH_U32_e32;
9259 unsigned OpcodeAdd = ST.hasAddNoCarryInsts() ? AMDGPU::V_ADD_U32_e64
9260 : AMDGPU::V_ADD_CO_U32_e32;
9261
9262 const TargetRegisterClass *SrcRC =
9263 Src.isReg() ? MRI.getRegClass(Reg: Src.getReg()) : &AMDGPU::SGPR_32RegClass;
9264 const TargetRegisterClass *SrcSubRC =
9265 RI.getSubRegisterClass(SrcRC, AMDGPU::sub0);
9266
9267 MachineOperand SrcRegSub0 =
9268 buildExtractSubRegOrImm(MII, MRI, Op: Src, SuperRC: SrcRC, SubIdx: AMDGPU::sub0, SubRC: SrcSubRC);
9269 MachineOperand SrcRegSub1 =
9270 buildExtractSubRegOrImm(MII, MRI, Op: Src, SuperRC: SrcRC, SubIdx: AMDGPU::sub1, SubRC: SrcSubRC);
9271
9272 Register MidReg1 = MRI.createVirtualRegister(RegClass: &AMDGPU::VGPR_32RegClass);
9273 Register MidReg2 = MRI.createVirtualRegister(RegClass: &AMDGPU::VGPR_32RegClass);
9274 Register MidReg3 = MRI.createVirtualRegister(RegClass: &AMDGPU::VGPR_32RegClass);
9275 Register MidReg4 = MRI.createVirtualRegister(RegClass: &AMDGPU::VGPR_32RegClass);
9276
9277 BuildMI(BB&: MBB, I: MII, MIMD: DL, MCID: InstDesc, DestReg: MidReg1).add(MO: SrcRegSub0);
9278
9279 BuildMI(BB&: MBB, I: MII, MIMD: DL, MCID: InstDesc, DestReg: MidReg2).add(MO: SrcRegSub1);
9280
9281 BuildMI(BB&: MBB, I: MII, MIMD: DL, MCID: get(Opcode: OpcodeAdd), DestReg: MidReg3)
9282 .addReg(RegNo: IsCtlz ? MidReg1 : MidReg2)
9283 .addImm(Val: 32)
9284 .addImm(Val: 1); // enable clamp
9285
9286 BuildMI(BB&: MBB, I: MII, MIMD: DL, MCID: get(Opcode: AMDGPU::V_MIN_U32_e64), DestReg: MidReg4)
9287 .addReg(RegNo: MidReg3)
9288 .addReg(RegNo: IsCtlz ? MidReg2 : MidReg1);
9289
9290 MRI.replaceRegWith(FromReg: Dest.getReg(), ToReg: MidReg4);
9291
9292 addUsersToMoveToVALUWorklist(Reg: MidReg4, MRI, Worklist);
9293}
9294
9295void SIInstrInfo::addUsersToMoveToVALUWorklist(
9296 Register DstReg, MachineRegisterInfo &MRI,
9297 SIInstrWorklist &Worklist) const {
9298 for (MachineOperand &MO : make_early_inc_range(Range: MRI.use_operands(Reg: DstReg))) {
9299 MachineInstr &UseMI = *MO.getParent();
9300
9301 unsigned OpNo = 0;
9302
9303 switch (UseMI.getOpcode()) {
9304 case AMDGPU::COPY:
9305 case AMDGPU::WQM:
9306 case AMDGPU::SOFT_WQM:
9307 case AMDGPU::STRICT_WWM:
9308 case AMDGPU::STRICT_WQM:
9309 case AMDGPU::REG_SEQUENCE:
9310 case AMDGPU::PHI:
9311 case AMDGPU::INSERT_SUBREG:
9312 break;
9313 default:
9314 OpNo = MO.getOperandNo();
9315 break;
9316 }
9317
9318 const TargetRegisterClass *OpRC = getOpRegClass(MI: UseMI, OpNo);
9319 MRI.constrainRegClass(Reg: DstReg, RC: OpRC);
9320
9321 if (!RI.hasVectorRegisters(RC: OpRC))
9322 Worklist.insert(MI: &UseMI);
9323 else
9324 // Legalization could change user list.
9325 legalizeOperandsVALUt16(MI&: UseMI, OpIdx: OpNo, MRI);
9326 }
9327}
9328
9329void SIInstrInfo::movePackToVALU(SIInstrWorklist &Worklist,
9330 MachineRegisterInfo &MRI,
9331 MachineInstr &Inst) const {
9332 Register ResultReg = MRI.createVirtualRegister(RegClass: &AMDGPU::VGPR_32RegClass);
9333 MachineBasicBlock *MBB = Inst.getParent();
9334 MachineOperand &Src0 = Inst.getOperand(i: 1);
9335 MachineOperand &Src1 = Inst.getOperand(i: 2);
9336 const DebugLoc &DL = Inst.getDebugLoc();
9337
9338 if (ST.useRealTrue16Insts()) {
9339 Register SrcReg0, SrcReg1;
9340 if (!Src0.isReg() || !RI.isVGPR(MRI, Reg: Src0.getReg())) {
9341 SrcReg0 = MRI.createVirtualRegister(RegClass: &AMDGPU::VGPR_32RegClass);
9342 BuildMI(BB&: *MBB, I&: Inst, MIMD: DL, MCID: get(Opcode: AMDGPU::V_MOV_B32_e32), DestReg: SrcReg0).add(MO: Src0);
9343 } else {
9344 SrcReg0 = Src0.getReg();
9345 }
9346
9347 if (!Src1.isReg() || !RI.isVGPR(MRI, Reg: Src1.getReg())) {
9348 SrcReg1 = MRI.createVirtualRegister(RegClass: &AMDGPU::VGPR_32RegClass);
9349 BuildMI(BB&: *MBB, I&: Inst, MIMD: DL, MCID: get(Opcode: AMDGPU::V_MOV_B32_e32), DestReg: SrcReg1).add(MO: Src1);
9350 } else {
9351 SrcReg1 = Src1.getReg();
9352 }
9353
9354 bool isSrc0Reg16 = MRI.constrainRegClass(Reg: SrcReg0, RC: &AMDGPU::VGPR_16RegClass);
9355 bool isSrc1Reg16 = MRI.constrainRegClass(Reg: SrcReg1, RC: &AMDGPU::VGPR_16RegClass);
9356
9357 auto NewMI = BuildMI(BB&: *MBB, I&: Inst, MIMD: DL, MCID: get(Opcode: AMDGPU::REG_SEQUENCE), DestReg: ResultReg);
9358 switch (Inst.getOpcode()) {
9359 case AMDGPU::S_PACK_LL_B32_B16:
9360 NewMI
9361 .addReg(RegNo: SrcReg0, Flags: {},
9362 SubReg: isSrc0Reg16 ? AMDGPU::NoSubRegister : AMDGPU::lo16)
9363 .addImm(Val: AMDGPU::lo16)
9364 .addReg(RegNo: SrcReg1, Flags: {},
9365 SubReg: isSrc1Reg16 ? AMDGPU::NoSubRegister : AMDGPU::lo16)
9366 .addImm(Val: AMDGPU::hi16);
9367 break;
9368 case AMDGPU::S_PACK_LH_B32_B16:
9369 NewMI
9370 .addReg(RegNo: SrcReg0, Flags: {},
9371 SubReg: isSrc0Reg16 ? AMDGPU::NoSubRegister : AMDGPU::lo16)
9372 .addImm(Val: AMDGPU::lo16)
9373 .addReg(RegNo: SrcReg1, Flags: {}, SubReg: AMDGPU::hi16)
9374 .addImm(Val: AMDGPU::hi16);
9375 break;
9376 case AMDGPU::S_PACK_HL_B32_B16:
9377 NewMI.addReg(RegNo: SrcReg0, Flags: {}, SubReg: AMDGPU::hi16)
9378 .addImm(Val: AMDGPU::lo16)
9379 .addReg(RegNo: SrcReg1, Flags: {},
9380 SubReg: isSrc1Reg16 ? AMDGPU::NoSubRegister : AMDGPU::lo16)
9381 .addImm(Val: AMDGPU::hi16);
9382 break;
9383 case AMDGPU::S_PACK_HH_B32_B16:
9384 NewMI.addReg(RegNo: SrcReg0, Flags: {}, SubReg: AMDGPU::hi16)
9385 .addImm(Val: AMDGPU::lo16)
9386 .addReg(RegNo: SrcReg1, Flags: {}, SubReg: AMDGPU::hi16)
9387 .addImm(Val: AMDGPU::hi16);
9388 break;
9389 default:
9390 llvm_unreachable("unhandled s_pack_* instruction");
9391 }
9392
9393 MachineOperand &Dest = Inst.getOperand(i: 0);
9394 MRI.replaceRegWith(FromReg: Dest.getReg(), ToReg: ResultReg);
9395 addUsersToMoveToVALUWorklist(DstReg: ResultReg, MRI, Worklist);
9396 return;
9397 }
9398
9399 switch (Inst.getOpcode()) {
9400 case AMDGPU::S_PACK_LL_B32_B16: {
9401 Register ImmReg = MRI.createVirtualRegister(RegClass: &AMDGPU::VGPR_32RegClass);
9402 Register TmpReg = MRI.createVirtualRegister(RegClass: &AMDGPU::VGPR_32RegClass);
9403
9404 // FIXME: Can do a lot better if we know the high bits of src0 or src1 are
9405 // 0.
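    // This computes (Src1 << 16) | (Src0 & 0xffff): mask the low half of
    // Src0, then V_LSHL_OR_B32 shifts Src1 into the high half and ORs in
    // the masked value.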
9406 BuildMI(BB&: *MBB, I&: Inst, MIMD: DL, MCID: get(Opcode: AMDGPU::V_MOV_B32_e32), DestReg: ImmReg)
9407 .addImm(Val: 0xffff);
9408
9409 BuildMI(BB&: *MBB, I&: Inst, MIMD: DL, MCID: get(Opcode: AMDGPU::V_AND_B32_e64), DestReg: TmpReg)
9410 .addReg(RegNo: ImmReg, Flags: RegState::Kill)
9411 .add(MO: Src0);
9412
9413 BuildMI(BB&: *MBB, I&: Inst, MIMD: DL, MCID: get(Opcode: AMDGPU::V_LSHL_OR_B32_e64), DestReg: ResultReg)
9414 .add(MO: Src1)
9415 .addImm(Val: 16)
9416 .addReg(RegNo: TmpReg, Flags: RegState::Kill);
9417 break;
9418 }
9419 case AMDGPU::S_PACK_LH_B32_B16: {
9420 Register ImmReg = MRI.createVirtualRegister(RegClass: &AMDGPU::VGPR_32RegClass);
9421 BuildMI(BB&: *MBB, I&: Inst, MIMD: DL, MCID: get(Opcode: AMDGPU::V_MOV_B32_e32), DestReg: ImmReg)
9422 .addImm(Val: 0xffff);
9423 BuildMI(BB&: *MBB, I&: Inst, MIMD: DL, MCID: get(Opcode: AMDGPU::V_BFI_B32_e64), DestReg: ResultReg)
9424 .addReg(RegNo: ImmReg, Flags: RegState::Kill)
9425 .add(MO: Src0)
9426 .add(MO: Src1);
9427 break;
9428 }
9429 case AMDGPU::S_PACK_HL_B32_B16: {
9430 Register TmpReg = MRI.createVirtualRegister(RegClass: &AMDGPU::VGPR_32RegClass);
9431 BuildMI(BB&: *MBB, I&: Inst, MIMD: DL, MCID: get(Opcode: AMDGPU::V_LSHRREV_B32_e64), DestReg: TmpReg)
9432 .addImm(Val: 16)
9433 .add(MO: Src0);
9434 BuildMI(BB&: *MBB, I&: Inst, MIMD: DL, MCID: get(Opcode: AMDGPU::V_LSHL_OR_B32_e64), DestReg: ResultReg)
9435 .add(MO: Src1)
9436 .addImm(Val: 16)
9437 .addReg(RegNo: TmpReg, Flags: RegState::Kill);
9438 break;
9439 }
9440 case AMDGPU::S_PACK_HH_B32_B16: {
9441 Register ImmReg = MRI.createVirtualRegister(RegClass: &AMDGPU::VGPR_32RegClass);
9442 Register TmpReg = MRI.createVirtualRegister(RegClass: &AMDGPU::VGPR_32RegClass);
9443 BuildMI(BB&: *MBB, I&: Inst, MIMD: DL, MCID: get(Opcode: AMDGPU::V_LSHRREV_B32_e64), DestReg: TmpReg)
9444 .addImm(Val: 16)
9445 .add(MO: Src0);
9446 BuildMI(BB&: *MBB, I&: Inst, MIMD: DL, MCID: get(Opcode: AMDGPU::V_MOV_B32_e32), DestReg: ImmReg)
9447 .addImm(Val: 0xffff0000);
9448 BuildMI(BB&: *MBB, I&: Inst, MIMD: DL, MCID: get(Opcode: AMDGPU::V_AND_OR_B32_e64), DestReg: ResultReg)
9449 .add(MO: Src1)
9450 .addReg(RegNo: ImmReg, Flags: RegState::Kill)
9451 .addReg(RegNo: TmpReg, Flags: RegState::Kill);
9452 break;
9453 }
9454 default:
9455 llvm_unreachable("unhandled s_pack_* instruction");
9456 }
9457
9458 MachineOperand &Dest = Inst.getOperand(i: 0);
9459 MRI.replaceRegWith(FromReg: Dest.getReg(), ToReg: ResultReg);
9460 addUsersToMoveToVALUWorklist(DstReg: ResultReg, MRI, Worklist);
9461}
9462
9463void SIInstrInfo::addSCCDefUsersToVALUWorklist(const MachineOperand &Op,
9464 MachineInstr &SCCDefInst,
9465 SIInstrWorklist &Worklist,
9466 Register NewCond) const {
9467
9468 // Ensure that def inst defines SCC, which is still live.
9469 assert(Op.isReg() && Op.getReg() == AMDGPU::SCC && Op.isDef() &&
9470 !Op.isDead() && Op.getParent() == &SCCDefInst);
9471 SmallVector<MachineInstr *, 4> CopyToDelete;
9472 // This assumes that all the users of SCC are in the same block
9473 // as the SCC def.
9474 for (MachineInstr &MI : // Skip the def inst itself.
9475 make_range(x: std::next(x: MachineBasicBlock::iterator(SCCDefInst)),
9476 y: SCCDefInst.getParent()->end())) {
9477 // Check if SCC is used first.
9478 int SCCIdx = MI.findRegisterUseOperandIdx(Reg: AMDGPU::SCC, TRI: &RI, isKill: false);
9479 if (SCCIdx != -1) {
9480 if (MI.isCopy()) {
9481 MachineRegisterInfo &MRI = MI.getMF()->getRegInfo();
9482 Register DestReg = MI.getOperand(i: 0).getReg();
9483
9484 MRI.replaceRegWith(FromReg: DestReg, ToReg: NewCond);
9485 CopyToDelete.push_back(Elt: &MI);
9486 } else {
9487
9488 if (NewCond.isValid())
9489 MI.getOperand(i: SCCIdx).setReg(NewCond);
9490
9491 Worklist.insert(MI: &MI);
9492 }
9493 }
9494 // Exit if we find another SCC def.
9495 if (MI.findRegisterDefOperandIdx(Reg: AMDGPU::SCC, TRI: &RI, isDead: false, Overlap: false) != -1)
9496 break;
9497 }
9498 for (auto &Copy : CopyToDelete)
9499 Copy->eraseFromParent();
9500}
9501
9502// Instructions that use SCC may be converted to VALU instructions. When that
9503// happens, the SCC register is changed to VCC_LO. The instruction that defines
9504// SCC must be changed to an instruction that defines VCC. This function makes
9505// sure that the instruction that defines SCC is added to the moveToVALU
9506// worklist.
9507void SIInstrInfo::addSCCDefsToVALUWorklist(MachineInstr *SCCUseInst,
9508 SIInstrWorklist &Worklist) const {
9509 // Look for a preceding instruction that either defines VCC or SCC. If VCC
9510 // then there is nothing to do because the defining instruction has been
9511 // converted to a VALU already. If SCC then that instruction needs to be
9512 // converted to a VALU.
9513 for (MachineInstr &MI :
9514 make_range(x: std::next(x: MachineBasicBlock::reverse_iterator(SCCUseInst)),
9515 y: SCCUseInst->getParent()->rend())) {
9516 if (MI.modifiesRegister(Reg: AMDGPU::VCC, TRI: &RI))
9517 break;
9518 if (MI.definesRegister(Reg: AMDGPU::SCC, TRI: &RI)) {
9519 Worklist.insert(MI: &MI);
9520 break;
9521 }
9522 }
9523}
9524
9525const TargetRegisterClass *SIInstrInfo::getDestEquivalentVGPRClass(
9526 const MachineInstr &Inst) const {
9527 const TargetRegisterClass *NewDstRC = getOpRegClass(MI: Inst, OpNo: 0);
9528
9529 switch (Inst.getOpcode()) {
9530 // For target instructions, getOpRegClass just returns the virtual register
9531 // class associated with the operand, so we need to find an equivalent VGPR
9532 // register class in order to move the instruction to the VALU.
9533 case AMDGPU::COPY:
9534 case AMDGPU::PHI:
9535 case AMDGPU::REG_SEQUENCE:
9536 case AMDGPU::INSERT_SUBREG:
9537 case AMDGPU::WQM:
9538 case AMDGPU::SOFT_WQM:
9539 case AMDGPU::STRICT_WWM:
9540 case AMDGPU::STRICT_WQM: {
9541 const TargetRegisterClass *SrcRC = getOpRegClass(MI: Inst, OpNo: 1);
9542 if (RI.isAGPRClass(RC: SrcRC)) {
9543 if (RI.isAGPRClass(RC: NewDstRC))
9544 return nullptr;
9545
9546 switch (Inst.getOpcode()) {
9547 case AMDGPU::PHI:
9548 case AMDGPU::REG_SEQUENCE:
9549 case AMDGPU::INSERT_SUBREG:
9550 NewDstRC = RI.getEquivalentAGPRClass(SRC: NewDstRC);
9551 break;
9552 default:
9553 NewDstRC = RI.getEquivalentVGPRClass(SRC: NewDstRC);
9554 }
9555
9556 if (!NewDstRC)
9557 return nullptr;
9558 } else {
9559 if (RI.isVGPRClass(RC: NewDstRC) || NewDstRC == &AMDGPU::VReg_1RegClass)
9560 return nullptr;
9561
9562 NewDstRC = RI.getEquivalentVGPRClass(SRC: NewDstRC);
9563 if (!NewDstRC)
9564 return nullptr;
9565 }
9566
9567 return NewDstRC;
9568 }
9569 default:
9570 return NewDstRC;
9571 }
9572}
9573
9574// Find the one SGPR operand we are allowed to use.
9575Register SIInstrInfo::findUsedSGPR(const MachineInstr &MI,
9576 int OpIndices[3]) const {
9577 const MCInstrDesc &Desc = MI.getDesc();
9578
9579 // Find the one SGPR operand we are allowed to use.
9580 //
9581 // First we need to consider the instruction's operand requirements before
9582 // legalizing. Some operands are required to be SGPRs, such as implicit uses
9583 // of VCC, but we are still bound by the constant bus requirement to only use
9584 // one.
9585 //
9586 // If the operand's class is an SGPR, we can never move it.
9587
9588 Register SGPRReg = findImplicitSGPRRead(MI);
9589 if (SGPRReg)
9590 return SGPRReg;
9591
9592 Register UsedSGPRs[3] = {Register()};
9593 const MachineRegisterInfo &MRI = MI.getMF()->getRegInfo();
9594
9595 for (unsigned i = 0; i < 3; ++i) {
9596 int Idx = OpIndices[i];
9597 if (Idx == -1)
9598 break;
9599
9600 const MachineOperand &MO = MI.getOperand(i: Idx);
9601 if (!MO.isReg())
9602 continue;
9603
9604 // Is this operand statically required to be an SGPR based on the operand
9605 // constraints?
9606 const TargetRegisterClass *OpRC =
9607 RI.getRegClass(i: getOpRegClassID(OpInfo: Desc.operands()[Idx]));
9608 bool IsRequiredSGPR = RI.isSGPRClass(RC: OpRC);
9609 if (IsRequiredSGPR)
9610 return MO.getReg();
9611
9612 // If this could be a VGPR or an SGPR, check the dynamic register class.
9613 Register Reg = MO.getReg();
9614 const TargetRegisterClass *RegRC = MRI.getRegClass(Reg);
9615 if (RI.isSGPRClass(RC: RegRC))
9616 UsedSGPRs[i] = Reg;
9617 }
9618
9619 // We don't have a required SGPR operand, so we have a bit more freedom in
9620 // selecting operands to move.
9621
9622 // Try to select the most used SGPR. If an SGPR is equal to one of the
9623 // others, we choose that.
9624 //
9625 // e.g.
9626 // V_FMA_F32 v0, s0, s0, s0 -> No moves
9627 // V_FMA_F32 v0, s0, s1, s0 -> Move s1
9628
9629 // TODO: If some of the operands are 64-bit SGPRs and some 32, we should
9630 // prefer those.
9631
9632 if (UsedSGPRs[0]) {
9633 if (UsedSGPRs[0] == UsedSGPRs[1] || UsedSGPRs[0] == UsedSGPRs[2])
9634 SGPRReg = UsedSGPRs[0];
9635 }
9636
9637 if (!SGPRReg && UsedSGPRs[1]) {
9638 if (UsedSGPRs[1] == UsedSGPRs[2])
9639 SGPRReg = UsedSGPRs[1];
9640 }
9641
9642 return SGPRReg;
9643}
9644
9645MachineOperand *SIInstrInfo::getNamedOperand(MachineInstr &MI,
9646 AMDGPU::OpName OperandName) const {
9647 if (OperandName == AMDGPU::OpName::NUM_OPERAND_NAMES)
9648 return nullptr;
9649
9650 int Idx = AMDGPU::getNamedOperandIdx(Opcode: MI.getOpcode(), Name: OperandName);
9651 if (Idx == -1)
9652 return nullptr;
9653
9654 return &MI.getOperand(i: Idx);
9655}
9656
9657uint64_t SIInstrInfo::getDefaultRsrcDataFormat() const {
9658 if (ST.getGeneration() >= AMDGPUSubtarget::GFX10) {
9659 int64_t Format = ST.getGeneration() >= AMDGPUSubtarget::GFX11
9660 ? (int64_t)AMDGPU::UfmtGFX11::UFMT_32_FLOAT
9661 : (int64_t)AMDGPU::UfmtGFX10::UFMT_32_FLOAT;
9662 return (Format << 44) |
9663 (1ULL << 56) | // RESOURCE_LEVEL = 1
9664 (3ULL << 60); // OOB_SELECT = 3
9665 }
9666
9667 uint64_t RsrcDataFormat = AMDGPU::RSRC_DATA_FORMAT;
9668 if (ST.isAmdHsaOS()) {
9669 // Set ATC = 1. GFX9 doesn't have this bit.
9670 if (ST.getGeneration() <= AMDGPUSubtarget::VOLCANIC_ISLANDS)
9671 RsrcDataFormat |= (1ULL << 56);
9672
9673 // Set MTYPE = 2 (MTYPE_UC = uncached). GFX9 doesn't have this.
9674 // Note that this disables the TC L2 cache and therefore decreases performance.
9675 if (ST.getGeneration() == AMDGPUSubtarget::VOLCANIC_ISLANDS)
9676 RsrcDataFormat |= (2ULL << 59);
9677 }
9678
9679 return RsrcDataFormat;
9680}
9681
9682uint64_t SIInstrInfo::getScratchRsrcWords23() const {
9683 uint64_t Rsrc23 = getDefaultRsrcDataFormat() |
9684 AMDGPU::RSRC_TID_ENABLE |
9685 0xffffffff; // Size
9686
9687 // GFX9 doesn't have ELEMENT_SIZE.
9688 if (ST.getGeneration() <= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
9689 uint64_t EltSizeValue = Log2_32(Value: ST.getMaxPrivateElementSize(ForBufferRSrc: true)) - 1;
9690 Rsrc23 |= EltSizeValue << AMDGPU::RSRC_ELEMENT_SIZE_SHIFT;
9691 }
9692
9693 // IndexStride = 64 for wave64, 32 for wave32.
9694 uint64_t IndexStride = ST.isWave64() ? 3 : 2;
9695 Rsrc23 |= IndexStride << AMDGPU::RSRC_INDEX_STRIDE_SHIFT;
9696
9697 // If TID_ENABLE is set, DATA_FORMAT specifies stride bits [14:17].
9698 // Clear them unless we want a huge stride.
9699 if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS &&
9700 ST.getGeneration() <= AMDGPUSubtarget::GFX9)
9701 Rsrc23 &= ~AMDGPU::RSRC_DATA_FORMAT;
9702
9703 return Rsrc23;
9704}
9705
9706bool SIInstrInfo::isLowLatencyInstruction(const MachineInstr &MI) const {
9707 unsigned Opc = MI.getOpcode();
9708
9709 return isSMRD(Opcode: Opc);
9710}
9711
9712bool SIInstrInfo::isHighLatencyDef(int Opc) const {
9713 return get(Opcode: Opc).mayLoad() &&
9714 (isMUBUF(Opcode: Opc) || isMTBUF(Opcode: Opc) || isMIMG(Opcode: Opc) || isFLAT(Opcode: Opc));
9715}
9716
9717Register SIInstrInfo::isStackAccess(const MachineInstr &MI,
9718 int &FrameIndex) const {
9719 const MachineOperand *Addr = getNamedOperand(MI, OperandName: AMDGPU::OpName::vaddr);
9720 if (!Addr || !Addr->isFI())
9721 return Register();
9722
9723 assert(!MI.memoperands_empty() &&
9724 (*MI.memoperands_begin())->getAddrSpace() == AMDGPUAS::PRIVATE_ADDRESS);
9725
9726 FrameIndex = Addr->getIndex();
9727 return getNamedOperand(MI, OperandName: AMDGPU::OpName::vdata)->getReg();
9728}
9729
9730Register SIInstrInfo::isSGPRStackAccess(const MachineInstr &MI,
9731 int &FrameIndex) const {
9732 const MachineOperand *Addr = getNamedOperand(MI, OperandName: AMDGPU::OpName::addr);
9733 assert(Addr && Addr->isFI());
9734 FrameIndex = Addr->getIndex();
9735 return getNamedOperand(MI, OperandName: AMDGPU::OpName::data)->getReg();
9736}
9737
9738Register SIInstrInfo::isLoadFromStackSlot(const MachineInstr &MI,
9739 int &FrameIndex) const {
9740 if (!MI.mayLoad())
9741 return Register();
9742
9743 if (isMUBUF(MI) || isVGPRSpill(MI))
9744 return isStackAccess(MI, FrameIndex);
9745
9746 if (isSGPRSpill(MI))
9747 return isSGPRStackAccess(MI, FrameIndex);
9748
9749 return Register();
9750}
9751
9752Register SIInstrInfo::isStoreToStackSlot(const MachineInstr &MI,
9753 int &FrameIndex) const {
9754 if (!MI.mayStore())
9755 return Register();
9756
9757 if (isMUBUF(MI) || isVGPRSpill(MI))
9758 return isStackAccess(MI, FrameIndex);
9759
9760 if (isSGPRSpill(MI))
9761 return isSGPRStackAccess(MI, FrameIndex);
9762
9763 return Register();
9764}
9765
9766unsigned SIInstrInfo::getInstBundleSize(const MachineInstr &MI) const {
9767 unsigned Size = 0;
9768 MachineBasicBlock::const_instr_iterator I = MI.getIterator();
9769 MachineBasicBlock::const_instr_iterator E = MI.getParent()->instr_end();
9770 while (++I != E && I->isInsideBundle()) {
9771 assert(!I->isBundle() && "No nested bundle!");
9772 Size += getInstSizeInBytes(MI: *I);
9773 }
9774
9775 return Size;
9776}
9777
9778unsigned SIInstrInfo::getInstSizeInBytes(const MachineInstr &MI) const {
9779 unsigned Opc = MI.getOpcode();
9780 const MCInstrDesc &Desc = getMCOpcodeFromPseudo(Opcode: Opc);
9781 unsigned DescSize = Desc.getSize();
9782
9783 // If we have a definitive size, we can use it. Otherwise we need to inspect
9784 // the operands to know the size.
9785 if (isFixedSize(MI)) {
9786 unsigned Size = DescSize;
9787
9788 // If we hit the buggy offset, an extra nop will be inserted in MC so
9789 // estimate the worst case.
9790 if (MI.isBranch() && ST.hasOffset3fBug())
9791 Size += 4;
9792
9793 return Size;
9794 }
9795
9796 // Instructions may have a 32-bit literal encoded after them. Check
9797 // operands that could ever be literals.
9798 if (isVALU(MI) || isSALU(MI)) {
9799 if (isDPP(MI))
9800 return DescSize;
9801 bool HasLiteral = false;
9802 unsigned LiteralSize = 4;
9803 for (int I = 0, E = MI.getNumExplicitOperands(); I != E; ++I) {
9804 const MachineOperand &Op = MI.getOperand(i: I);
9805 const MCOperandInfo &OpInfo = Desc.operands()[I];
9806 if (!Op.isReg() && !isInlineConstant(MO: Op, OpInfo)) {
9807 HasLiteral = true;
9808 if (ST.has64BitLiterals()) {
9809 switch (OpInfo.OperandType) {
9810 default:
9811 break;
9812 case AMDGPU::OPERAND_REG_IMM_FP64:
9813 if (!AMDGPU::isValid32BitLiteral(Val: Op.getImm(), IsFP64: true))
9814 LiteralSize = 8;
9815 break;
9816 case AMDGPU::OPERAND_REG_IMM_INT64:
9817 // A 32-bit literal is only valid when the value fits in BOTH the signed
9818 // and unsigned 32-bit ranges, i.e. in [0, 2^31-1], matching the MC code
9819 // emitter's getLit64Encoding logic. Because we cannot tell the
9820 // signedness of the literal, we must be conservative and assume that
9821 // values outside this range require a 64-bit literal encoding (8
9822 // bytes).
9823 if (!Op.isImm() || !isInt<32>(x: Op.getImm()) ||
9824 !isUInt<32>(x: Op.getImm()))
9825 LiteralSize = 8;
9826 break;
9827 }
9828 }
9829 break;
9830 }
9831 }
9832 return HasLiteral ? DescSize + LiteralSize : DescSize;
9833 }
9834
9835 // Check whether we have extra NSA words.
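  // The first address dword is part of the base 8-byte encoding; each group
  // of up to four remaining address operands adds one more dword. E.g. five
  // vaddr operands (RSrcIdx - VAddr0Idx == 5) give 8 + 4 == 12 bytes.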
9836 if (isMIMG(MI)) {
9837 int VAddr0Idx = AMDGPU::getNamedOperandIdx(Opcode: Opc, Name: AMDGPU::OpName::vaddr0);
9838 if (VAddr0Idx < 0)
9839 return 8;
9840
9841 int RSrcIdx = AMDGPU::getNamedOperandIdx(Opcode: Opc, Name: AMDGPU::OpName::srsrc);
9842 return 8 + 4 * ((RSrcIdx - VAddr0Idx + 2) / 4);
9843 }
9844
9845 switch (Opc) {
9846 case TargetOpcode::BUNDLE:
9847 return getInstBundleSize(MI);
9848 case TargetOpcode::INLINEASM:
9849 case TargetOpcode::INLINEASM_BR: {
9850 const MachineFunction *MF = MI.getMF();
9851 const char *AsmStr = MI.getOperand(i: 0).getSymbolName();
9852 return getInlineAsmLength(Str: AsmStr, MAI: *MF->getTarget().getMCAsmInfo(), STI: &ST);
9853 }
9854 default:
9855 if (MI.isMetaInstruction())
9856 return 0;
9857
9858 // If this is a D16 pseudo instruction, use the real opcode's MC size.
9859 const auto *D16Info = AMDGPU::getT16D16Helper(T16Op: Opc);
9860 if (D16Info) {
9861 // Assume the d16 lo/hi opcodes are always the same size.
9862 unsigned LoInstOpcode = D16Info->LoOp;
9863 const MCInstrDesc &Desc = getMCOpcodeFromPseudo(Opcode: LoInstOpcode);
9864 DescSize = Desc.getSize();
9865 }
9866
9867 // If this is an FMA_MIX pseudo instruction, use the real opcode's MC size.
9868 if (Opc == AMDGPU::V_FMA_MIX_F16_t16 || Opc == AMDGPU::V_FMA_MIX_BF16_t16) {
9869 // All potential lowerings are the same size; arbitrarily pick one.
9870 const MCInstrDesc &Desc = getMCOpcodeFromPseudo(Opcode: AMDGPU::V_FMA_MIXLO_F16);
9871 DescSize = Desc.getSize();
9872 }
9873
9874 return DescSize;
9875 }
9876}
9877
9878bool SIInstrInfo::mayAccessFlatAddressSpace(const MachineInstr &MI) const {
9879 if (!isFLAT(MI))
9880 return false;
9881
9882 if (MI.memoperands_empty())
9883 return true;
9884
9885 for (const MachineMemOperand *MMO : MI.memoperands()) {
9886 if (MMO->getAddrSpace() == AMDGPUAS::FLAT_ADDRESS)
9887 return true;
9888 }
9889 return false;
9890}
9891
9892ArrayRef<std::pair<int, const char *>>
9893SIInstrInfo::getSerializableTargetIndices() const {
9894 static const std::pair<int, const char *> TargetIndices[] = {
9895 {AMDGPU::TI_CONSTDATA_START, "amdgpu-constdata-start"},
9896 {AMDGPU::TI_SCRATCH_RSRC_DWORD0, "amdgpu-scratch-rsrc-dword0"},
9897 {AMDGPU::TI_SCRATCH_RSRC_DWORD1, "amdgpu-scratch-rsrc-dword1"},
9898 {AMDGPU::TI_SCRATCH_RSRC_DWORD2, "amdgpu-scratch-rsrc-dword2"},
9899 {AMDGPU::TI_SCRATCH_RSRC_DWORD3, "amdgpu-scratch-rsrc-dword3"}};
9900 return ArrayRef(TargetIndices);
9901}
9902
9903/// This is used by the post-RA scheduler (PostRASchedulerList.cpp). The
9904/// post-RA version of misched uses CreateTargetMIHazardRecognizer.
9905ScheduleHazardRecognizer *
9906SIInstrInfo::CreateTargetPostRAHazardRecognizer(const InstrItineraryData *II,
9907 const ScheduleDAG *DAG) const {
9908 return new GCNHazardRecognizer(DAG->MF);
9909}
9910
9911/// This is the hazard recognizer used at -O0 by the PostRAHazardRecognizer
9912/// pass.
9913ScheduleHazardRecognizer *
9914SIInstrInfo::CreateTargetPostRAHazardRecognizer(const MachineFunction &MF) const {
9915 return new GCNHazardRecognizer(MF);
9916}
9917
9918// Called during:
9919// - pre-RA scheduling and post-RA scheduling
9920ScheduleHazardRecognizer *
9921SIInstrInfo::CreateTargetMIHazardRecognizer(const InstrItineraryData *II,
9922 const ScheduleDAGMI *DAG) const {
9923 // Borrowed from Arm Target
9924 // We would like to restrict this hazard recognizer to only
9925 // post-RA scheduling; we can tell that we're post-RA because we don't
9926 // track VRegLiveness.
9927 if (!DAG->hasVRegLiveness())
9928 return new GCNHazardRecognizer(DAG->MF);
9929 return TargetInstrInfo::CreateTargetMIHazardRecognizer(II, DAG);
9930}
9931
9932std::pair<unsigned, unsigned>
9933SIInstrInfo::decomposeMachineOperandsTargetFlags(unsigned TF) const {
9934 return std::pair(TF & MO_MASK, TF & ~MO_MASK);
9935}
9936
9937ArrayRef<std::pair<unsigned, const char *>>
9938SIInstrInfo::getSerializableDirectMachineOperandTargetFlags() const {
9939 static const std::pair<unsigned, const char *> TargetFlags[] = {
9940 {MO_GOTPCREL, "amdgpu-gotprel"},
9941 {MO_GOTPCREL32_LO, "amdgpu-gotprel32-lo"},
9942 {MO_GOTPCREL32_HI, "amdgpu-gotprel32-hi"},
9943 {MO_GOTPCREL64, "amdgpu-gotprel64"},
9944 {MO_REL32_LO, "amdgpu-rel32-lo"},
9945 {MO_REL32_HI, "amdgpu-rel32-hi"},
9946 {MO_REL64, "amdgpu-rel64"},
9947 {MO_ABS32_LO, "amdgpu-abs32-lo"},
9948 {MO_ABS32_HI, "amdgpu-abs32-hi"},
9949 {MO_ABS64, "amdgpu-abs64"},
9950 };
9951
9952 return ArrayRef(TargetFlags);
9953}
9954
9955ArrayRef<std::pair<MachineMemOperand::Flags, const char *>>
9956SIInstrInfo::getSerializableMachineMemOperandTargetFlags() const {
9957 static const std::pair<MachineMemOperand::Flags, const char *> TargetFlags[] =
9958 {
9959 {MONoClobber, "amdgpu-noclobber"},
9960 {MOLastUse, "amdgpu-last-use"},
9961 {MOCooperative, "amdgpu-cooperative"},
9962 {MOThreadPrivate, "amdgpu-thread-private"},
9963 };
9964
9965 return ArrayRef(TargetFlags);
9966}
9967
9968unsigned SIInstrInfo::getLiveRangeSplitOpcode(Register SrcReg,
9969 const MachineFunction &MF) const {
9970 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
9971 assert(SrcReg.isVirtual());
9972 if (MFI->checkFlag(Reg: SrcReg, Flag: AMDGPU::VirtRegFlag::WWM_REG))
9973 return AMDGPU::WWM_COPY;
9974
9975 return AMDGPU::COPY;
9976}
9977
9978bool SIInstrInfo::canAddToBBProlog(const MachineInstr &MI) const {
9979 uint32_t Opcode = MI.getOpcode();
9980 // Check if it is an SGPR spill or a wwm-register spill opcode.
9981 if (isSGPRSpill(Opcode) || isWWMRegSpillOpcode(Opcode))
9982 return true;
9983
9984 const MachineFunction *MF = MI.getMF();
9985 const MachineRegisterInfo &MRI = MF->getRegInfo();
9986 const SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
9987
9988 // See if this is a live-range split instruction inserted for an SGPR or
9989 // wwm-register. The implicit defs inserted for wwm-registers should also
9990 // be included, as they can appear at the beginning of a basic block.
9991 bool IsLRSplitInst = MI.getFlag(Flag: MachineInstr::LRSplit);
9992 if (!IsLRSplitInst && Opcode != AMDGPU::IMPLICIT_DEF)
9993 return false;
9994
9995 Register Reg = MI.getOperand(i: 0).getReg();
9996 if (RI.isSGPRClass(RC: RI.getRegClassForReg(MRI, Reg)))
9997 return IsLRSplitInst;
9998
9999 return MFI->isWWMReg(Reg);
10000}
10001
10002bool SIInstrInfo::isBasicBlockPrologue(const MachineInstr &MI,
10003 Register Reg) const {
10004 // We need to handle instructions which may be inserted during register
10005 // allocation to handle the prolog. The initial prolog instruction may have
10006 // been separated from the start of the block by spills and copies inserted
10007 // for the prolog. However, the insertions for scalar registers can
10008 // always be placed at the BB top as they are independent of the exec mask
10009 // value.
10010 bool IsNullOrVectorRegister = true;
10011 if (Reg) {
10012 const MachineFunction *MF = MI.getMF();
10013 const MachineRegisterInfo &MRI = MF->getRegInfo();
10014 IsNullOrVectorRegister = !RI.isSGPRClass(RC: RI.getRegClassForReg(MRI, Reg));
10015 }
10016
10017 return IsNullOrVectorRegister &&
10018 (canAddToBBProlog(MI) ||
10019 (!MI.isTerminator() && MI.getOpcode() != AMDGPU::COPY &&
10020 MI.modifiesRegister(Reg: AMDGPU::EXEC, TRI: &RI)));
10021}
10022
10023MachineInstrBuilder
10024SIInstrInfo::getAddNoCarry(MachineBasicBlock &MBB,
10025 MachineBasicBlock::iterator I,
10026 const DebugLoc &DL,
10027 Register DestReg) const {
10028 if (ST.hasAddNoCarryInsts())
10029 return BuildMI(BB&: MBB, I, MIMD: DL, MCID: get(Opcode: AMDGPU::V_ADD_U32_e64), DestReg);
10030
10031 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
10032 Register UnusedCarry = MRI.createVirtualRegister(RegClass: RI.getBoolRC());
10033 MRI.setRegAllocationHint(VReg: UnusedCarry, Type: 0, PrefReg: RI.getVCC());
10034
10035 return BuildMI(BB&: MBB, I, MIMD: DL, MCID: get(Opcode: AMDGPU::V_ADD_CO_U32_e64), DestReg)
10036 .addReg(RegNo: UnusedCarry, Flags: RegState::Define | RegState::Dead);
10037}
10038
10039MachineInstrBuilder SIInstrInfo::getAddNoCarry(MachineBasicBlock &MBB,
10040 MachineBasicBlock::iterator I,
10041 const DebugLoc &DL,
10042 Register DestReg,
10043 RegScavenger &RS) const {
10044 if (ST.hasAddNoCarryInsts())
10045 return BuildMI(BB&: MBB, I, MIMD: DL, MCID: get(Opcode: AMDGPU::V_ADD_U32_e32), DestReg);
10046
10047 // If available, prefer to use vcc.
10048 Register UnusedCarry = !RS.isRegUsed(Reg: AMDGPU::VCC)
10049 ? Register(RI.getVCC())
10050 : RS.scavengeRegisterBackwards(
10051 RC: *RI.getBoolRC(), To: I, /* RestoreAfter */ false,
10052 SPAdj: 0, /* AllowSpill */ false);
10053
10054 // TODO: Users need to deal with this.
10055 if (!UnusedCarry.isValid())
10056 return MachineInstrBuilder();
10057
10058 return BuildMI(BB&: MBB, I, MIMD: DL, MCID: get(Opcode: AMDGPU::V_ADD_CO_U32_e64), DestReg)
10059 .addReg(RegNo: UnusedCarry, Flags: RegState::Define | RegState::Dead);
10060}
10061
10062bool SIInstrInfo::isKillTerminator(unsigned Opcode) {
10063 switch (Opcode) {
10064 case AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR:
10065 case AMDGPU::SI_KILL_I1_TERMINATOR:
10066 return true;
10067 default:
10068 return false;
10069 }
10070}
10071
10072const MCInstrDesc &SIInstrInfo::getKillTerminatorFromPseudo(unsigned Opcode) const {
10073 switch (Opcode) {
10074 case AMDGPU::SI_KILL_F32_COND_IMM_PSEUDO:
10075 return get(Opcode: AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR);
10076 case AMDGPU::SI_KILL_I1_PSEUDO:
10077 return get(Opcode: AMDGPU::SI_KILL_I1_TERMINATOR);
10078 default:
10079 llvm_unreachable("invalid opcode, expected SI_KILL_*_PSEUDO");
10080 }
10081}
10082
10083bool SIInstrInfo::isLegalMUBUFImmOffset(unsigned Imm) const {
10084 return Imm <= getMaxMUBUFImmOffset(ST);
10085}
10086
10087unsigned SIInstrInfo::getMaxMUBUFImmOffset(const GCNSubtarget &ST) {
10088 // GFX12: the field is a 24-bit signed byte offset; only its non-negative half (23 bits) is usable here.
10089 const unsigned OffsetBits =
10090 ST.getGeneration() >= AMDGPUSubtarget::GFX12 ? 23 : 12;
10091 return (1 << OffsetBits) - 1;
10092}
10093
10094void SIInstrInfo::fixImplicitOperands(MachineInstr &MI) const {
10095 if (!ST.isWave32())
10096 return;
10097
10098 if (MI.isInlineAsm())
10099 return;
10100
10101 if (MI.getNumOperands() < MI.getNumExplicitOperands())
10102 return;
10103
10104 for (auto &Op : MI.implicit_operands()) {
10105 if (Op.isReg() && Op.getReg() == AMDGPU::VCC)
10106 Op.setReg(AMDGPU::VCC_LO);
10107 }
10108}
10109
10110bool SIInstrInfo::isBufferSMRD(const MachineInstr &MI) const {
10111 if (!isSMRD(MI))
10112 return false;
10113
10114 // Check that it is using a buffer resource.
10115 int Idx = AMDGPU::getNamedOperandIdx(Opcode: MI.getOpcode(), Name: AMDGPU::OpName::sbase);
10116 if (Idx == -1) // e.g. s_memtime
10117 return false;
10118
10119 const int16_t RCID = getOpRegClassID(OpInfo: MI.getDesc().operands()[Idx]);
10120 return RI.getRegClass(i: RCID)->hasSubClassEq(RC: &AMDGPU::SGPR_128RegClass);
10121}
10122
10123// Given Imm, split it into the values to put into the SOffset and ImmOffset
10124// fields in an MUBUF instruction. Return false if it is not possible (due to a
10125// hardware bug needing a workaround).
10126//
10127// The required alignment ensures that individual address components remain
10128// aligned if they are aligned to begin with. It also ensures that additional
10129// offsets within the given alignment can be added to the resulting ImmOffset.
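// For example, on targets with a 4095-byte maximum immediate offset
// (pre-GFX12) and 4-byte alignment, Imm == 4100 splits into
// ImmOffset == 4092 and SOffset == 8 (a valid inline constant).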
10130bool SIInstrInfo::splitMUBUFOffset(uint32_t Imm, uint32_t &SOffset,
10131 uint32_t &ImmOffset, Align Alignment) const {
10132 const uint32_t MaxOffset = SIInstrInfo::getMaxMUBUFImmOffset(ST);
10133 const uint32_t MaxImm = alignDown(Value: MaxOffset, Align: Alignment.value());
10134 uint32_t Overflow = 0;
10135
10136 if (Imm > MaxImm) {
10137 if (Imm <= MaxImm + 64) {
10138 // Use an SOffset inline constant for 4..64
10139 Overflow = Imm - MaxImm;
10140 Imm = MaxImm;
10141 } else {
10142 // Try to keep the same value in SOffset for adjacent loads, so that
10143 // the corresponding register contents can be re-used.
10144 //
10145 // Load values with all low-bits (except for alignment bits) set into
10146 // SOffset, so that a larger range of values can be covered using
10147 // s_movk_i32.
10148 //
10149 // Atomic operations fail to work correctly when individual address
10150 // components are unaligned, even if their sum is aligned.
10151 uint32_t High = (Imm + Alignment.value()) & ~MaxOffset;
10152 uint32_t Low = (Imm + Alignment.value()) & MaxOffset;
10153 Imm = Low;
10154 Overflow = High - Alignment.value();
10155 }
10156 }
10157
10158 if (Overflow > 0) {
10159 // There is a hardware bug in SI and CI which prevents address clamping in
10160 // MUBUF instructions from working correctly with SOffsets. The immediate
10161 // offset is unaffected.
10162 if (ST.getGeneration() <= AMDGPUSubtarget::SEA_ISLANDS)
10163 return false;
10164
10165 // It is not possible to set an immediate in the SOffset field on some targets.
10166 if (ST.hasRestrictedSOffset())
10167 return false;
10168 }
10169
10170 ImmOffset = Imm;
10171 SOffset = Overflow;
10172 return true;
10173}
10174
10175// Depending on the used address space and instructions, some immediate offsets
10176// are allowed and some are not.
10177 // Pre-GFX12, flat instruction offsets can only be non-negative; global and
10178 // scratch instruction offsets can also be negative. On GFX12, offsets can be
10179 // negative for all variants.
//
// There are several bugs related to these offsets:
// On gfx10.1, flat instructions that go into the global address space cannot
// use an offset.
//
// For scratch instructions, the address can be either an SGPR or a VGPR.
// The following offsets can be used, depending on the architecture (x means
// cannot be used):
// +----------------------------+------+------+
// | Address-Mode               | SGPR | VGPR |
// +----------------------------+------+------+
// | gfx9                       |      |      |
// | negative, 4-aligned offset | x    | ok   |
// | negative, unaligned offset | x    | ok   |
// +----------------------------+------+------+
// | gfx10                      |      |      |
// | negative, 4-aligned offset | ok   | ok   |
// | negative, unaligned offset | ok   | x    |
// +----------------------------+------+------+
// | gfx10.3                    |      |      |
// | negative, 4-aligned offset | ok   | ok   |
// | negative, unaligned offset | ok   | ok   |
// +----------------------------+------+------+
//
// This function ignores the addressing mode, so if an offset cannot be used in
// one addressing mode, it is considered illegal.
bool SIInstrInfo::isLegalFLATOffset(int64_t Offset, unsigned AddrSpace,
                                    uint64_t FlatVariant) const {
  // TODO: Should 0 be special cased?
  if (!ST.hasFlatInstOffsets())
    return false;

  if (ST.hasFlatSegmentOffsetBug() && FlatVariant == SIInstrFlags::FLAT &&
      (AddrSpace == AMDGPUAS::FLAT_ADDRESS ||
       AddrSpace == AMDGPUAS::GLOBAL_ADDRESS))
    return false;

  if (ST.hasNegativeUnalignedScratchOffsetBug() &&
      FlatVariant == SIInstrFlags::FlatScratch && Offset < 0 &&
      (Offset % 4) != 0) {
    return false;
  }

  bool AllowNegative = allowNegativeFlatOffset(FlatVariant);
  unsigned N = AMDGPU::getNumFlatOffsetBits(ST);
  return isIntN(N, Offset) && (AllowNegative || Offset >= 0);
}

// See comment on SIInstrInfo::isLegalFLATOffset for what is legal and what is
// not.
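//
// For example (illustrative, assuming a subtarget with a 13-bit signed flat
// offset field, i.e. NumBits = 12 below): COffsetVal = -5000 splits into
// RemainderOffset = (-5000 / 4096) * 4096 = -4096 and
// ImmField = -5000 - (-4096) = -904.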
std::pair<int64_t, int64_t>
SIInstrInfo::splitFlatOffset(int64_t COffsetVal, unsigned AddrSpace,
                             uint64_t FlatVariant) const {
  int64_t RemainderOffset = COffsetVal;
  int64_t ImmField = 0;

  bool AllowNegative = allowNegativeFlatOffset(FlatVariant);
  const unsigned NumBits = AMDGPU::getNumFlatOffsetBits(ST) - 1;

  if (AllowNegative) {
    // Use signed division by a power of two to truncate towards 0.
    int64_t D = 1LL << NumBits;
    RemainderOffset = (COffsetVal / D) * D;
    ImmField = COffsetVal - RemainderOffset;

    if (ST.hasNegativeUnalignedScratchOffsetBug() &&
        FlatVariant == SIInstrFlags::FlatScratch && ImmField < 0 &&
        (ImmField % 4) != 0) {
      // Make ImmField a multiple of 4
      RemainderOffset += ImmField % 4;
      ImmField -= ImmField % 4;
    }
  } else if (COffsetVal >= 0) {
    ImmField = COffsetVal & maskTrailingOnes<uint64_t>(NumBits);
    RemainderOffset = COffsetVal - ImmField;
  }

  assert(isLegalFLATOffset(ImmField, AddrSpace, FlatVariant));
  assert(RemainderOffset + ImmField == COffsetVal);
  return {ImmField, RemainderOffset};
}

bool SIInstrInfo::allowNegativeFlatOffset(uint64_t FlatVariant) const {
  if (ST.hasNegativeScratchOffsetBug() &&
      FlatVariant == SIInstrFlags::FlatScratch)
    return false;

  return FlatVariant != SIInstrFlags::FLAT || AMDGPU::isGFX12Plus(ST);
}

static unsigned subtargetEncodingFamily(const GCNSubtarget &ST) {
  switch (ST.getGeneration()) {
  default:
    break;
  case AMDGPUSubtarget::SOUTHERN_ISLANDS:
  case AMDGPUSubtarget::SEA_ISLANDS:
    return SIEncodingFamily::SI;
  case AMDGPUSubtarget::VOLCANIC_ISLANDS:
  case AMDGPUSubtarget::GFX9:
    return SIEncodingFamily::VI;
  case AMDGPUSubtarget::GFX10:
    return SIEncodingFamily::GFX10;
  case AMDGPUSubtarget::GFX11:
    return SIEncodingFamily::GFX11;
  case AMDGPUSubtarget::GFX12:
    return ST.hasGFX1250Insts() ? SIEncodingFamily::GFX1250
                                : SIEncodingFamily::GFX12;
  case AMDGPUSubtarget::GFX13:
    return SIEncodingFamily::GFX13;
  }
  llvm_unreachable("Unknown subtarget generation!");
}

bool SIInstrInfo::isAsmOnlyOpcode(int MCOp) const {
  switch (MCOp) {
  // These opcodes use indirect register addressing, so they need special
  // handling by codegen (currently missing). Therefore it is too risky to
  // allow these opcodes to be selected by the DPP combiner or the SDWA
  // peephole pass.
  case AMDGPU::V_MOVRELS_B32_dpp_gfx10:
  case AMDGPU::V_MOVRELS_B32_sdwa_gfx10:
  case AMDGPU::V_MOVRELD_B32_dpp_gfx10:
  case AMDGPU::V_MOVRELD_B32_sdwa_gfx10:
  case AMDGPU::V_MOVRELSD_B32_dpp_gfx10:
  case AMDGPU::V_MOVRELSD_B32_sdwa_gfx10:
  case AMDGPU::V_MOVRELSD_2_B32_dpp_gfx10:
  case AMDGPU::V_MOVRELSD_2_B32_sdwa_gfx10:
    return true;
  default:
    return false;
  }
}

#define GENERATE_RENAMED_GFX9_CASES(OPCODE)                                    \
  case OPCODE##_dpp:                                                           \
  case OPCODE##_e32:                                                           \
  case OPCODE##_e64:                                                           \
  case OPCODE##_e64_dpp:                                                       \
  case OPCODE##_sdwa:

static bool isRenamedInGFX9(int Opcode) {
  switch (Opcode) {
  GENERATE_RENAMED_GFX9_CASES(AMDGPU::V_ADDC_U32)
  GENERATE_RENAMED_GFX9_CASES(AMDGPU::V_ADD_CO_U32)
  GENERATE_RENAMED_GFX9_CASES(AMDGPU::V_ADD_U32)
  GENERATE_RENAMED_GFX9_CASES(AMDGPU::V_SUBBREV_U32)
  GENERATE_RENAMED_GFX9_CASES(AMDGPU::V_SUBB_U32)
  GENERATE_RENAMED_GFX9_CASES(AMDGPU::V_SUBREV_CO_U32)
  GENERATE_RENAMED_GFX9_CASES(AMDGPU::V_SUBREV_U32)
  GENERATE_RENAMED_GFX9_CASES(AMDGPU::V_SUB_CO_U32)
  GENERATE_RENAMED_GFX9_CASES(AMDGPU::V_SUB_U32)
  //
  case AMDGPU::V_DIV_FIXUP_F16_gfx9_e64:
  case AMDGPU::V_DIV_FIXUP_F16_gfx9_fake16_e64:
  case AMDGPU::V_FMA_F16_gfx9_e64:
  case AMDGPU::V_FMA_F16_gfx9_fake16_e64:
  case AMDGPU::V_INTERP_P2_F16:
  case AMDGPU::V_MAD_F16_e64:
  case AMDGPU::V_MAD_U16_e64:
  case AMDGPU::V_MAD_I16_e64:
    return true;
  default:
    return false;
  }
}

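// Map a pseudo opcode to a concrete MC opcode for the subtarget's encoding
// family. For example (illustrative), on a GFX10 subtarget a pseudo such as
// AMDGPU::V_ADD_F32_e32 resolves via SIEncodingFamily::GFX10 to its _gfx10
// MCInst variant, while a pseudo with no encoding on the subtarget yields -1.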
int SIInstrInfo::pseudoToMCOpcode(int Opcode) const {
  assert(Opcode == (int)SIInstrInfo::getNonSoftWaitcntOpcode(Opcode) &&
         "SIInsertWaitcnts should have promoted soft waitcnt instructions!");

  unsigned Gen = subtargetEncodingFamily(ST);

  if (ST.getGeneration() == AMDGPUSubtarget::GFX9 && isRenamedInGFX9(Opcode))
    Gen = SIEncodingFamily::GFX9;

  // Adjust the encoding family to GFX80 for D16 buffer instructions when the
  // subtarget has the UnpackedD16VMem feature.
  // TODO: remove this when we discard GFX80 encoding.
  if (ST.hasUnpackedD16VMem() && (get(Opcode).TSFlags & SIInstrFlags::D16Buf))
    Gen = SIEncodingFamily::GFX80;

  if (get(Opcode).TSFlags & SIInstrFlags::SDWA) {
    switch (ST.getGeneration()) {
    default:
      Gen = SIEncodingFamily::SDWA;
      break;
    case AMDGPUSubtarget::GFX9:
      Gen = SIEncodingFamily::SDWA9;
      break;
    case AMDGPUSubtarget::GFX10:
      Gen = SIEncodingFamily::SDWA10;
      break;
    }
  }

  if (isMAI(Opcode)) {
    int MFMAOp = AMDGPU::getMFMAEarlyClobberOp(Opcode);
    if (MFMAOp != -1)
      Opcode = MFMAOp;
  }

  int32_t MCOp = AMDGPU::getMCOpcode(Opcode, Gen);

  if (MCOp == AMDGPU::INSTRUCTION_LIST_END && ST.hasGFX1250Insts())
    MCOp = AMDGPU::getMCOpcode(Opcode, SIEncodingFamily::GFX12);

  // -1 means that Opcode is already a native instruction.
  if (MCOp == -1)
    return Opcode;

  if (ST.hasGFX90AInsts()) {
    uint32_t NMCOp = AMDGPU::INSTRUCTION_LIST_END;
    if (ST.hasGFX940Insts())
      NMCOp = AMDGPU::getMCOpcode(Opcode, SIEncodingFamily::GFX940);
    if (NMCOp == AMDGPU::INSTRUCTION_LIST_END)
      NMCOp = AMDGPU::getMCOpcode(Opcode, SIEncodingFamily::GFX90A);
    if (NMCOp == AMDGPU::INSTRUCTION_LIST_END)
      NMCOp = AMDGPU::getMCOpcode(Opcode, SIEncodingFamily::GFX9);
    if (NMCOp != AMDGPU::INSTRUCTION_LIST_END)
      MCOp = NMCOp;
  }

  // INSTRUCTION_LIST_END means that Opcode is a pseudo instruction that has no
  // encoding in the given subtarget generation.
  if (MCOp == AMDGPU::INSTRUCTION_LIST_END)
    return -1;

  if (isAsmOnlyOpcode(MCOp))
    return -1;

  return MCOp;
}

static TargetInstrInfo::RegSubRegPair
getRegOrUndef(const MachineOperand &RegOpnd) {
  assert(RegOpnd.isReg());
  return RegOpnd.isUndef() ? TargetInstrInfo::RegSubRegPair()
                           : getRegSubRegPair(RegOpnd);
}

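// For getRegSequenceSubReg below, an example (illustrative MIR):
//   %2:vreg_64 = REG_SEQUENCE %0:vgpr_32, %subreg.sub0, %1:vgpr_32, %subreg.sub1
// Querying sub1 yields %1 (with no subregister), while a subreg index that no
// operand writes yields an empty RegSubRegPair.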
TargetInstrInfo::RegSubRegPair
llvm::getRegSequenceSubReg(MachineInstr &MI, unsigned SubReg) {
  assert(MI.isRegSequence());
  for (unsigned I = 0, E = (MI.getNumOperands() - 1) / 2; I < E; ++I)
    if (MI.getOperand(1 + 2 * I + 1).getImm() == SubReg) {
      auto &RegOp = MI.getOperand(1 + 2 * I);
      return getRegOrUndef(RegOp);
    }
  return TargetInstrInfo::RegSubRegPair();
}

// Try to find the definition of reg:subreg in subreg-manipulation pseudos.
// Following a subreg of reg:subreg isn't supported.
static bool followSubRegDef(MachineInstr &MI,
                            TargetInstrInfo::RegSubRegPair &RSR) {
  if (!RSR.SubReg)
    return false;
  switch (MI.getOpcode()) {
  default: break;
  case AMDGPU::REG_SEQUENCE:
    RSR = getRegSequenceSubReg(MI, RSR.SubReg);
    return true;
  // EXTRACT_SUBREG isn't supported as this would follow a subreg of subreg.
  case AMDGPU::INSERT_SUBREG:
    if (RSR.SubReg == (unsigned)MI.getOperand(3).getImm())
      // inserted the subreg we're looking for
      RSR = getRegOrUndef(MI.getOperand(2));
    else { // the subreg in the rest of the reg
      auto R1 = getRegOrUndef(MI.getOperand(1));
      if (R1.SubReg) // subreg of subreg isn't supported
        return false;
      RSR.Reg = R1.Reg;
    }
    return true;
  }
  return false;
}

MachineInstr *llvm::getVRegSubRegDef(const TargetInstrInfo::RegSubRegPair &P,
                                     const MachineRegisterInfo &MRI) {
  assert(MRI.isSSA());
  if (!P.Reg.isVirtual())
    return nullptr;

  auto RSR = P;
  auto *DefInst = MRI.getVRegDef(RSR.Reg);
  while (auto *MI = DefInst) {
    DefInst = nullptr;
    switch (MI->getOpcode()) {
    case AMDGPU::COPY:
    case AMDGPU::V_MOV_B32_e32: {
      auto &Op1 = MI->getOperand(1);
      if (Op1.isReg() && Op1.getReg().isVirtual()) {
        if (Op1.isUndef())
          return nullptr;
        RSR = getRegSubRegPair(Op1);
        DefInst = MRI.getVRegDef(RSR.Reg);
      }
      break;
    }
    default:
      if (followSubRegDef(*MI, RSR)) {
        if (!RSR.Reg)
          return nullptr;
        DefInst = MRI.getVRegDef(RSR.Reg);
      }
    }
    if (!DefInst)
      return MI;
  }
  return nullptr;
}

bool llvm::execMayBeModifiedBeforeUse(const MachineRegisterInfo &MRI,
                                      Register VReg,
                                      const MachineInstr &DefMI,
                                      const MachineInstr &UseMI) {
  assert(MRI.isSSA() && "Must be run on SSA");

  auto *TRI = MRI.getTargetRegisterInfo();
  auto *DefBB = DefMI.getParent();

  // Don't bother searching between blocks, although it is possible this block
  // doesn't modify exec.
  if (UseMI.getParent() != DefBB)
    return true;

  const int MaxInstScan = 20;
  int NumInst = 0;

  // Stop scan at the use.
  auto E = UseMI.getIterator();
  for (auto I = std::next(DefMI.getIterator()); I != E; ++I) {
    if (I->isDebugInstr())
      continue;

    if (++NumInst > MaxInstScan)
      return true;

    if (I->modifiesRegister(AMDGPU::EXEC, TRI))
      return true;
  }

  return false;
}

bool llvm::execMayBeModifiedBeforeAnyUse(const MachineRegisterInfo &MRI,
                                         Register VReg,
                                         const MachineInstr &DefMI) {
  assert(MRI.isSSA() && "Must be run on SSA");

  auto *TRI = MRI.getTargetRegisterInfo();
  auto *DefBB = DefMI.getParent();

  const int MaxUseScan = 10;
  int NumUse = 0;

  for (auto &Use : MRI.use_nodbg_operands(VReg)) {
    auto &UseInst = *Use.getParent();
    // Don't bother searching between blocks, although it is possible this
    // block doesn't modify exec.
    if (UseInst.getParent() != DefBB || UseInst.isPHI())
      return true;

    if (++NumUse > MaxUseScan)
      return true;
  }

  if (NumUse == 0)
    return false;

  const int MaxInstScan = 20;
  int NumInst = 0;

  // Stop scan when we have seen all the uses.
  for (auto I = std::next(DefMI.getIterator());; ++I) {
    assert(I != DefBB->end());

    if (I->isDebugInstr())
      continue;

    if (++NumInst > MaxInstScan)
      return true;

    for (const MachineOperand &Op : I->operands()) {
      // We don't check reg masks here as they're used only on calls:
      // 1. EXEC is only considered const within one BB
      // 2. A call should be a terminator instruction if present in a BB

      if (!Op.isReg())
        continue;

      Register Reg = Op.getReg();
      if (Op.isUse()) {
        if (Reg == VReg && --NumUse == 0)
          return false;
      } else if (TRI->regsOverlap(Reg, AMDGPU::EXEC))
        return true;
    }
  }
}

MachineInstr *SIInstrInfo::createPHIDestinationCopy(
    MachineBasicBlock &MBB, MachineBasicBlock::iterator LastPHIIt,
    const DebugLoc &DL, Register Src, Register Dst) const {
  auto Cur = MBB.begin();
  if (Cur != MBB.end())
    do {
      if (!Cur->isPHI() && Cur->readsRegister(Dst, /*TRI=*/nullptr))
        return BuildMI(MBB, Cur, DL, get(TargetOpcode::COPY), Dst).addReg(Src);
      ++Cur;
    } while (Cur != MBB.end() && Cur != LastPHIIt);

  return TargetInstrInfo::createPHIDestinationCopy(MBB, LastPHIIt, DL, Src,
                                                   Dst);
}

MachineInstr *SIInstrInfo::createPHISourceCopy(
    MachineBasicBlock &MBB, MachineBasicBlock::iterator InsPt,
    const DebugLoc &DL, Register Src, unsigned SrcSubReg, Register Dst) const {
  if (InsPt != MBB.end() &&
      (InsPt->getOpcode() == AMDGPU::SI_IF ||
       InsPt->getOpcode() == AMDGPU::SI_ELSE ||
       InsPt->getOpcode() == AMDGPU::SI_IF_BREAK) &&
      InsPt->definesRegister(Src, /*TRI=*/nullptr)) {
    InsPt++;
    return BuildMI(MBB, InsPt, DL,
                   get(AMDGPU::LaneMaskConstants::get(ST).MovTermOpc), Dst)
        .addReg(Src, {}, SrcSubReg)
        .addReg(AMDGPU::EXEC, RegState::Implicit);
  }
  return TargetInstrInfo::createPHISourceCopy(MBB, InsPt, DL, Src, SrcSubReg,
                                              Dst);
}

bool llvm::SIInstrInfo::isWave32() const { return ST.isWave32(); }

MachineInstr *SIInstrInfo::foldMemoryOperandImpl(
    MachineFunction &MF, MachineInstr &MI, ArrayRef<unsigned> Ops,
    MachineBasicBlock::iterator InsertPt, int FrameIndex, LiveIntervals *LIS,
    VirtRegMap *VRM) const {
  // This is a bit of a hack (copied from AArch64). Consider this instruction:
  //
  // %0:sreg_32 = COPY $m0
  //
  // We explicitly chose SReg_32 for the virtual register so such a copy might
  // be eliminated by RegisterCoalescer. However, that may not be possible, and
  // %0 may even spill. We can't spill $m0 normally (it would require copying
  // to a numbered SGPR anyway), and since it is in the SReg_32 register class,
  // TargetInstrInfo::foldMemoryOperand() is going to try.
  // A similar issue also exists with spilling and reloading $exec registers.
  //
  // To prevent that, constrain the %0 register class here.
  if (isFullCopyInstr(MI)) {
    Register DstReg = MI.getOperand(0).getReg();
    Register SrcReg = MI.getOperand(1).getReg();
    if ((DstReg.isVirtual() || SrcReg.isVirtual()) &&
        (DstReg.isVirtual() != SrcReg.isVirtual())) {
      MachineRegisterInfo &MRI = MF.getRegInfo();
      Register VirtReg = DstReg.isVirtual() ? DstReg : SrcReg;
      const TargetRegisterClass *RC = MRI.getRegClass(VirtReg);
      if (RC->hasSuperClassEq(&AMDGPU::SReg_32RegClass)) {
        MRI.constrainRegClass(VirtReg, &AMDGPU::SReg_32_XM0_XEXECRegClass);
        return nullptr;
      }
      if (RC->hasSuperClassEq(&AMDGPU::SReg_64RegClass)) {
        MRI.constrainRegClass(VirtReg, &AMDGPU::SReg_64_XEXECRegClass);
        return nullptr;
      }
    }
  }

  return nullptr;
}

unsigned SIInstrInfo::getInstrLatency(const InstrItineraryData *ItinData,
                                      const MachineInstr &MI,
                                      unsigned *PredCost) const {
  if (MI.isBundle()) {
    MachineBasicBlock::const_instr_iterator I(MI.getIterator());
    MachineBasicBlock::const_instr_iterator E(MI.getParent()->instr_end());
    unsigned Lat = 0, Count = 0;
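    // Approximate the bundle latency as the longest sub-instruction latency
    // plus one issue cycle for each additional bundled instruction.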
    for (++I; I != E && I->isBundledWithPred(); ++I) {
      ++Count;
      Lat = std::max(Lat, SchedModel.computeInstrLatency(&*I));
    }
    return Lat + Count - 1;
  }

  return SchedModel.computeInstrLatency(&MI);
}

const MachineOperand &
SIInstrInfo::getCalleeOperand(const MachineInstr &MI) const {
  if (const MachineOperand *CallAddrOp =
          getNamedOperand(MI, AMDGPU::OpName::src0))
    return *CallAddrOp;
  return TargetInstrInfo::getCalleeOperand(MI);
}

InstructionUniformity
SIInstrInfo::getGenericInstructionUniformity(const MachineInstr &MI) const {
  const MachineRegisterInfo &MRI = MI.getMF()->getRegInfo();
  unsigned Opcode = MI.getOpcode();

  auto HandleAddrSpaceCast = [this, &MRI](const MachineInstr &MI) {
    Register Dst = MI.getOperand(0).getReg();
    Register Src = isa<GIntrinsic>(MI) ? MI.getOperand(2).getReg()
                                       : MI.getOperand(1).getReg();
    LLT DstTy = MRI.getType(Dst);
    LLT SrcTy = MRI.getType(Src);
    unsigned DstAS = DstTy.getAddressSpace();
    unsigned SrcAS = SrcTy.getAddressSpace();
    return SrcAS == AMDGPUAS::PRIVATE_ADDRESS &&
                   DstAS == AMDGPUAS::FLAT_ADDRESS &&
                   ST.hasGloballyAddressableScratch()
               ? InstructionUniformity::NeverUniform
               : InstructionUniformity::Default;
  };

  // If the target supports globally addressable scratch, the mapping from
  // scratch memory to the flat aperture changes; therefore an address space
  // cast is no longer uniform.
  if (Opcode == TargetOpcode::G_ADDRSPACE_CAST)
    return HandleAddrSpaceCast(MI);

  if (auto *GI = dyn_cast<GIntrinsic>(&MI)) {
    auto IID = GI->getIntrinsicID();
    if (AMDGPU::isIntrinsicSourceOfDivergence(IID))
      return InstructionUniformity::NeverUniform;
    if (AMDGPU::isIntrinsicAlwaysUniform(IID))
      return InstructionUniformity::AlwaysUniform;

    switch (IID) {
    case Intrinsic::amdgcn_addrspacecast_nonnull:
      return HandleAddrSpaceCast(MI);
    case Intrinsic::amdgcn_if:
    case Intrinsic::amdgcn_else:
      // FIXME: Uniform if second result
      break;
    }

    return InstructionUniformity::Default;
  }

  // Loads from the private and flat address spaces are divergent, because
  // threads can execute the load instruction with the same inputs and get
  // different results.
  //
  // All other loads are not divergent, because if threads issue loads with the
  // same arguments, they will always get the same result.
  if (Opcode == AMDGPU::G_LOAD || Opcode == AMDGPU::G_ZEXTLOAD ||
      Opcode == AMDGPU::G_SEXTLOAD) {
    if (MI.memoperands_empty())
      return InstructionUniformity::NeverUniform; // conservative assumption

    if (llvm::any_of(MI.memoperands(), [](const MachineMemOperand *mmo) {
          return mmo->getAddrSpace() == AMDGPUAS::PRIVATE_ADDRESS ||
                 mmo->getAddrSpace() == AMDGPUAS::FLAT_ADDRESS;
        })) {
      // At least one MMO in a non-global address space.
      return InstructionUniformity::NeverUniform;
    }
    return InstructionUniformity::Default;
  }

  if (SIInstrInfo::isGenericAtomicRMWOpcode(Opcode) ||
      Opcode == AMDGPU::G_ATOMIC_CMPXCHG ||
      Opcode == AMDGPU::G_ATOMIC_CMPXCHG_WITH_SUCCESS ||
      AMDGPU::isGenericAtomic(Opcode)) {
    return InstructionUniformity::NeverUniform;
  }
  return InstructionUniformity::Default;
}

const MIRFormatter *SIInstrInfo::getMIRFormatter() const {
  if (!Formatter)
    Formatter = std::make_unique<AMDGPUMIRFormatter>(ST);
  return Formatter.get();
}

InstructionUniformity
SIInstrInfo::getInstructionUniformity(const MachineInstr &MI) const {

  if (isNeverUniform(MI))
    return InstructionUniformity::NeverUniform;

  unsigned opcode = MI.getOpcode();
  if (opcode == AMDGPU::V_READLANE_B32 ||
      opcode == AMDGPU::V_READFIRSTLANE_B32 ||
      opcode == AMDGPU::SI_RESTORE_S32_FROM_VGPR)
    return InstructionUniformity::AlwaysUniform;

  if (isCopyInstr(MI)) {
    const MachineOperand &srcOp = MI.getOperand(1);
    if (srcOp.isReg() && srcOp.getReg().isPhysical()) {
      const TargetRegisterClass *regClass =
          RI.getPhysRegBaseClass(srcOp.getReg());
      return RI.isSGPRClass(regClass) ? InstructionUniformity::AlwaysUniform
                                      : InstructionUniformity::NeverUniform;
    }
    return InstructionUniformity::Default;
  }

  // GMIR handling
  if (MI.isPreISelOpcode())
    return SIInstrInfo::getGenericInstructionUniformity(MI);

  // Atomics are divergent because they are executed sequentially: when an
  // atomic operation refers to the same address in each thread, then each
  // thread after the first sees the value written by the previous thread as
  // the original value.

  if (isAtomic(MI))
    return InstructionUniformity::NeverUniform;

  // Loads from the private and flat address spaces are divergent, because
  // threads can execute the load instruction with the same inputs and get
  // different results.
  if (isFLAT(MI) && MI.mayLoad()) {
    if (MI.memoperands_empty())
      return InstructionUniformity::NeverUniform; // conservative assumption

    if (llvm::any_of(MI.memoperands(), [](const MachineMemOperand *mmo) {
          return mmo->getAddrSpace() == AMDGPUAS::PRIVATE_ADDRESS ||
                 mmo->getAddrSpace() == AMDGPUAS::FLAT_ADDRESS;
        })) {
      // At least one MMO in a non-global address space.
      return InstructionUniformity::NeverUniform;
    }

    return InstructionUniformity::Default;
  }

  const MachineRegisterInfo &MRI = MI.getMF()->getRegInfo();
  const AMDGPURegisterBankInfo *RBI = ST.getRegBankInfo();

  // FIXME: It's conceptually broken to report this for an instruction, and not
  // a specific def operand. For inline asm in particular, there could be mixed
  // uniform and divergent results.
  for (unsigned I = 0, E = MI.getNumOperands(); I != E; ++I) {
    const MachineOperand &SrcOp = MI.getOperand(I);
    if (!SrcOp.isReg())
      continue;

    Register Reg = SrcOp.getReg();
    if (!Reg || !SrcOp.readsReg())
      continue;

    // If RegBank is null, this is unassigned or an unallocatable special
    // register, which are all scalars.
    const RegisterBank *RegBank = RBI->getRegBank(Reg, MRI, RI);
    if (RegBank && RegBank->getID() != AMDGPU::SGPRRegBankID)
      return InstructionUniformity::NeverUniform;
  }

  // TODO: Uniformity check conditions above can be rearranged for more
  // readability.

  // TODO: amdgcn.{ballot, [if]cmp} should be AlwaysUniform, but they are
  // currently turned into no-op COPYs by SelectionDAG ISel and are
  // therefore no longer recognizable.

  return InstructionUniformity::Default;
}

unsigned SIInstrInfo::getDSShaderTypeValue(const MachineFunction &MF) {
  switch (MF.getFunction().getCallingConv()) {
  case CallingConv::AMDGPU_PS:
    return 1;
  case CallingConv::AMDGPU_VS:
    return 2;
  case CallingConv::AMDGPU_GS:
    return 3;
  case CallingConv::AMDGPU_HS:
  case CallingConv::AMDGPU_LS:
  case CallingConv::AMDGPU_ES: {
    const Function &F = MF.getFunction();
    F.getContext().diagnose(DiagnosticInfoUnsupported(
        F, "ds_ordered_count unsupported for this calling conv"));
    [[fallthrough]];
  }
  case CallingConv::AMDGPU_CS:
  case CallingConv::AMDGPU_KERNEL:
  case CallingConv::C:
  case CallingConv::Fast:
  default:
    // Assume other calling conventions are various compute callable functions
    return 0;
  }
}

bool SIInstrInfo::analyzeCompare(const MachineInstr &MI, Register &SrcReg,
                                 Register &SrcReg2, int64_t &CmpMask,
                                 int64_t &CmpValue) const {
  if (!MI.getOperand(0).isReg() || MI.getOperand(0).getSubReg())
    return false;

  switch (MI.getOpcode()) {
  default:
    break;
  case AMDGPU::S_CMP_EQ_U32:
  case AMDGPU::S_CMP_EQ_I32:
  case AMDGPU::S_CMP_LG_U32:
  case AMDGPU::S_CMP_LG_I32:
  case AMDGPU::S_CMP_LT_U32:
  case AMDGPU::S_CMP_LT_I32:
  case AMDGPU::S_CMP_GT_U32:
  case AMDGPU::S_CMP_GT_I32:
  case AMDGPU::S_CMP_LE_U32:
  case AMDGPU::S_CMP_LE_I32:
  case AMDGPU::S_CMP_GE_U32:
  case AMDGPU::S_CMP_GE_I32:
  case AMDGPU::S_CMP_EQ_U64:
  case AMDGPU::S_CMP_LG_U64:
    SrcReg = MI.getOperand(0).getReg();
    if (MI.getOperand(1).isReg()) {
      if (MI.getOperand(1).getSubReg())
        return false;
      SrcReg2 = MI.getOperand(1).getReg();
      CmpValue = 0;
    } else if (MI.getOperand(1).isImm()) {
      SrcReg2 = Register();
      CmpValue = MI.getOperand(1).getImm();
    } else {
      return false;
    }
    CmpMask = ~0;
    return true;
  case AMDGPU::S_CMPK_EQ_U32:
  case AMDGPU::S_CMPK_EQ_I32:
  case AMDGPU::S_CMPK_LG_U32:
  case AMDGPU::S_CMPK_LG_I32:
  case AMDGPU::S_CMPK_LT_U32:
  case AMDGPU::S_CMPK_LT_I32:
  case AMDGPU::S_CMPK_GT_U32:
  case AMDGPU::S_CMPK_GT_I32:
  case AMDGPU::S_CMPK_LE_U32:
  case AMDGPU::S_CMPK_LE_I32:
  case AMDGPU::S_CMPK_GE_U32:
  case AMDGPU::S_CMPK_GE_I32:
    SrcReg = MI.getOperand(0).getReg();
    SrcReg2 = Register();
    CmpValue = MI.getOperand(1).getImm();
    CmpMask = ~0;
    return true;
  }

  return false;
}

static bool isSCCDeadOnExit(MachineBasicBlock *MBB) {
  for (MachineBasicBlock *S : MBB->successors()) {
    if (S->isLiveIn(AMDGPU::SCC))
      return false;
  }
  return true;
}

// Invert all uses of SCC following SCCDef because SCCDef may be deleted and
// (incoming SCC) = !(SCC defined by SCCDef).
// Return true if all uses can be rewritten, false otherwise.
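//
// For example (illustrative): if the surviving def computes SCC as the
// negation of what SCCDef computed, a following S_CBRANCH_SCC1 must become
// S_CBRANCH_SCC0, and the operands of a following S_CSELECT must be swapped.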
bool SIInstrInfo::invertSCCUse(MachineInstr *SCCDef) const {
  MachineBasicBlock *MBB = SCCDef->getParent();
  SmallVector<MachineInstr *> InvertInstr;
  bool SCCIsDead = false;

  // Scan instructions for SCC uses that need to be inverted until SCC is dead.
  constexpr unsigned ScanLimit = 12;
  unsigned Count = 0;
  for (MachineInstr &MI : make_range(
           std::next(MachineBasicBlock::iterator(SCCDef)), MBB->end())) {
    if (++Count > ScanLimit)
      return false;
    if (MI.readsRegister(AMDGPU::SCC, &RI)) {
      if (MI.getOpcode() == AMDGPU::S_CSELECT_B32 ||
          MI.getOpcode() == AMDGPU::S_CSELECT_B64 ||
          MI.getOpcode() == AMDGPU::S_CBRANCH_SCC0 ||
          MI.getOpcode() == AMDGPU::S_CBRANCH_SCC1)
        InvertInstr.push_back(&MI);
      else
        return false;
    }
    if (MI.definesRegister(AMDGPU::SCC, &RI)) {
      SCCIsDead = true;
      break;
    }
  }
  if (!SCCIsDead && isSCCDeadOnExit(MBB))
    SCCIsDead = true;

  // SCC may have more uses. Can't invert all of them.
  if (!SCCIsDead)
    return false;

  // Invert uses
  for (MachineInstr *MI : InvertInstr) {
    if (MI->getOpcode() == AMDGPU::S_CSELECT_B32 ||
        MI->getOpcode() == AMDGPU::S_CSELECT_B64) {
      swapOperands(*MI);
    } else if (MI->getOpcode() == AMDGPU::S_CBRANCH_SCC0 ||
               MI->getOpcode() == AMDGPU::S_CBRANCH_SCC1) {
      MI->setDesc(get(MI->getOpcode() == AMDGPU::S_CBRANCH_SCC0
                          ? AMDGPU::S_CBRANCH_SCC1
                          : AMDGPU::S_CBRANCH_SCC0));
    } else {
      llvm_unreachable("SCC used but no inversion handling");
    }
  }
  return true;
}

// SCC is already valid after SCCValid.
// SCCRedefine will redefine SCC to the same value already available after
// SCCValid. If there are no intervening SCC conflicts, delete SCCRedefine and
// update kill/dead flags if necessary.
bool SIInstrInfo::optimizeSCC(MachineInstr *SCCValid, MachineInstr *SCCRedefine,
                              bool NeedInversion) const {
  MachineInstr *KillsSCC = nullptr;
  if (SCCValid->getParent() != SCCRedefine->getParent())
    return false;
  for (MachineInstr &MI : make_range(std::next(SCCValid->getIterator()),
                                     SCCRedefine->getIterator())) {
    if (MI.modifiesRegister(AMDGPU::SCC, &RI))
      return false;
    if (MI.killsRegister(AMDGPU::SCC, &RI))
      KillsSCC = &MI;
  }
  if (NeedInversion && !invertSCCUse(SCCRedefine))
    return false;
  if (MachineOperand *SccDef =
          SCCValid->findRegisterDefOperand(AMDGPU::SCC, /*TRI=*/nullptr))
    SccDef->setIsDead(false);
  if (KillsSCC)
    KillsSCC->clearRegisterKills(AMDGPU::SCC, /*TRI=*/nullptr);
  SCCRedefine->eraseFromParent();
  return true;
}

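// Check for a select whose SCC input already equals "result != 0". For
// example (illustrative): %r = S_CSELECT_B32 1, 0 yields a nonzero %r exactly
// when the incoming SCC was set, so a later s_cmp_lg %r, 0 merely recomputes
// the SCC value the select consumed.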
static bool foldableSelect(const MachineInstr &Def) {
  if (Def.getOpcode() != AMDGPU::S_CSELECT_B32 &&
      Def.getOpcode() != AMDGPU::S_CSELECT_B64)
    return false;
  bool Op1IsNonZeroImm =
      Def.getOperand(1).isImm() && Def.getOperand(1).getImm() != 0;
  bool Op2IsZeroImm =
      Def.getOperand(2).isImm() && Def.getOperand(2).getImm() == 0;
  if (!Op1IsNonZeroImm || !Op2IsZeroImm)
    return false;
  return true;
}

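// For example (illustrative): S_ADD_U32 %x, 1 produces a carry-out, and thus
// sets SCC, only when %x == 0xffffffff, i.e. exactly when the result is zero.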
static bool setsSCCIfResultIsZero(const MachineInstr &Def, bool &NeedInversion,
                                  unsigned &NewDefOpc) {
  // S_ADD_U32 X, 1 sets SCC on carry-out, which can only happen if result==0.
  // S_ADD_I32 X, 1 can be converted to S_ADD_U32 X, 1 if SCC is dead.
  if (Def.getOpcode() != AMDGPU::S_ADD_I32 &&
      Def.getOpcode() != AMDGPU::S_ADD_U32)
    return false;
  const MachineOperand &AddSrc1 = Def.getOperand(1);
  const MachineOperand &AddSrc2 = Def.getOperand(2);
  int64_t addend;

  if ((!AddSrc1.isImm() || AddSrc1.getImm() != 1) &&
      (!AddSrc2.isImm() || AddSrc2.getImm() != 1) &&
      (!getFoldableImm(&AddSrc1, addend) || addend != 1) &&
      (!getFoldableImm(&AddSrc2, addend) || addend != 1))
    return false;

  if (Def.getOpcode() == AMDGPU::S_ADD_I32) {
    const MachineOperand *SccDef =
        Def.findRegisterDefOperand(AMDGPU::SCC, /*TRI=*/nullptr);
    if (!SccDef->isDead())
      return false;
    NewDefOpc = AMDGPU::S_ADD_U32;
  }
  NeedInversion = !NeedInversion;
  return true;
}

bool SIInstrInfo::optimizeCompareInstr(MachineInstr &CmpInstr, Register SrcReg,
                                       Register SrcReg2, int64_t CmpMask,
                                       int64_t CmpValue,
                                       const MachineRegisterInfo *MRI) const {
  if (!SrcReg || SrcReg.isPhysical())
    return false;

  if (SrcReg2 && !getFoldableImm(SrcReg2, *MRI, CmpValue))
    return false;

  const auto optimizeCmpSelect = [&CmpInstr, SrcReg, CmpValue, MRI,
                                  this](bool NeedInversion) -> bool {
    if (CmpValue != 0)
      return false;

    MachineInstr *Def = MRI->getVRegDef(SrcReg);
    if (!Def)
      return false;

    // For S_OP that set SCC = DST!=0, do the transformation
    //
    // s_cmp_[lg|eq]_* (S_OP ...), 0 => (S_OP ...)
    //
    // For (S_OP ...) that set SCC = DST==0, invert NeedInversion and
    // do the transformation:
    //
    // s_cmp_[lg|eq]_* (S_OP ...), 0 => (S_OP ...)
    //
    // If foldableSelect, s_cmp_lg_* is redundant because the SCC input value
    // for S_CSELECT* already has the same value that will be calculated by
    // s_cmp_lg_*.
    //
    // s_cmp_[lg|eq]_* (S_CSELECT* (non-zero imm), 0), 0 => (S_CSELECT*
    // (non-zero imm), 0)

    unsigned NewDefOpc = Def->getOpcode();
    if (!setsSCCIfResultIsNonZero(*Def) &&
        !setsSCCIfResultIsZero(*Def, NeedInversion, NewDefOpc) &&
        !foldableSelect(*Def))
      return false;

    if (!optimizeSCC(Def, &CmpInstr, NeedInversion))
      return false;

    if (NewDefOpc != Def->getOpcode())
      Def->setDesc(get(NewDefOpc));

    // If the s_or_b32 result, sY, is unused (i.e. it is effectively a 64-bit
    // s_cmp_lg of a register pair) and the inputs are the hi and lo halves of
    // a 64-bit foldableSelect, then delete the s_or_b32 in the sequence:
    //   sX = s_cselect_b64 (non-zero imm), 0
    //   sLo = copy sX.sub0
    //   sHi = copy sX.sub1
    //   sY = s_or_b32 sLo, sHi
    if (Def->getOpcode() == AMDGPU::S_OR_B32 &&
        MRI->use_nodbg_empty(Def->getOperand(0).getReg())) {
      const MachineOperand &OrOpnd1 = Def->getOperand(1);
      const MachineOperand &OrOpnd2 = Def->getOperand(2);
      if (OrOpnd1.isReg() && OrOpnd2.isReg()) {
        MachineInstr *Def1 = MRI->getVRegDef(OrOpnd1.getReg());
        MachineInstr *Def2 = MRI->getVRegDef(OrOpnd2.getReg());
        if (Def1 && Def1->getOpcode() == AMDGPU::COPY && Def2 &&
            Def2->getOpcode() == AMDGPU::COPY && Def1->getOperand(1).isReg() &&
            Def2->getOperand(1).isReg() &&
            Def1->getOperand(1).getSubReg() == AMDGPU::sub0 &&
            Def2->getOperand(1).getSubReg() == AMDGPU::sub1 &&
            Def1->getOperand(1).getReg() == Def2->getOperand(1).getReg()) {
          MachineInstr *Select = MRI->getVRegDef(Def1->getOperand(1).getReg());
          if (Select && foldableSelect(*Select))
            optimizeSCC(Select, Def, /*NeedInversion=*/false);
        }
      }
    }
    return true;
  };

  const auto optimizeCmpAnd = [&CmpInstr, SrcReg, CmpValue, MRI,
                               this](int64_t ExpectedValue, unsigned SrcSize,
                                     bool IsReversible, bool IsSigned) -> bool {
    // s_cmp_eq_u32 (s_and_b32 $src, 1 << n), 1 << n => s_and_b32 $src, 1 << n
    // s_cmp_eq_i32 (s_and_b32 $src, 1 << n), 1 << n => s_and_b32 $src, 1 << n
    // s_cmp_ge_u32 (s_and_b32 $src, 1 << n), 1 << n => s_and_b32 $src, 1 << n
    // s_cmp_ge_i32 (s_and_b32 $src, 1 << n), 1 << n => s_and_b32 $src, 1 << n
    // s_cmp_eq_u64 (s_and_b64 $src, 1 << n), 1 << n => s_and_b64 $src, 1 << n
    // s_cmp_lg_u32 (s_and_b32 $src, 1 << n), 0 => s_and_b32 $src, 1 << n
    // s_cmp_lg_i32 (s_and_b32 $src, 1 << n), 0 => s_and_b32 $src, 1 << n
    // s_cmp_gt_u32 (s_and_b32 $src, 1 << n), 0 => s_and_b32 $src, 1 << n
    // s_cmp_gt_i32 (s_and_b32 $src, 1 << n), 0 => s_and_b32 $src, 1 << n
    // s_cmp_lg_u64 (s_and_b64 $src, 1 << n), 0 => s_and_b64 $src, 1 << n
    //
    // Signed ge/gt are not used for the sign bit.
    //
    // If the result of the AND is unused except in the compare:
    // s_and_b(32|64) $src, 1 << n => s_bitcmp1_b(32|64) $src, n
    //
    // s_cmp_eq_u32 (s_and_b32 $src, 1 << n), 0 => s_bitcmp0_b32 $src, n
    // s_cmp_eq_i32 (s_and_b32 $src, 1 << n), 0 => s_bitcmp0_b32 $src, n
    // s_cmp_eq_u64 (s_and_b64 $src, 1 << n), 0 => s_bitcmp0_b64 $src, n
    // s_cmp_lg_u32 (s_and_b32 $src, 1 << n), 1 << n => s_bitcmp0_b32 $src, n
    // s_cmp_lg_i32 (s_and_b32 $src, 1 << n), 1 << n => s_bitcmp0_b32 $src, n
    // s_cmp_lg_u64 (s_and_b64 $src, 1 << n), 1 << n => s_bitcmp0_b64 $src, n

    MachineInstr *Def = MRI->getVRegDef(SrcReg);
    if (!Def)
      return false;

    if (Def->getOpcode() != AMDGPU::S_AND_B32 &&
        Def->getOpcode() != AMDGPU::S_AND_B64)
      return false;

    int64_t Mask;
    const auto isMask = [&Mask, SrcSize](const MachineOperand *MO) -> bool {
      if (MO->isImm())
        Mask = MO->getImm();
      else if (!getFoldableImm(MO, Mask))
        return false;
      Mask &= maxUIntN(SrcSize);
      return isPowerOf2_64(Mask);
    };

    MachineOperand *SrcOp = &Def->getOperand(1);
    if (isMask(SrcOp))
      SrcOp = &Def->getOperand(2);
    else if (isMask(&Def->getOperand(2)))
      SrcOp = &Def->getOperand(1);
    else
      return false;

    // A valid Mask is required to have a single bit set, hence a non-zero and
    // power-of-two value. This guarantees that the shift below is by fewer
    // than 64 bits.
    assert(llvm::has_single_bit<uint64_t>(Mask) && "Invalid mask.");
    unsigned BitNo = llvm::countr_zero((uint64_t)Mask);
    if (IsSigned && BitNo == SrcSize - 1)
      return false;

    ExpectedValue <<= BitNo;

    bool IsReversedCC = false;
    if (CmpValue != ExpectedValue) {
      if (!IsReversible)
        return false;
      IsReversedCC = CmpValue == (ExpectedValue ^ Mask);
      if (!IsReversedCC)
        return false;
    }

    Register DefReg = Def->getOperand(0).getReg();
    if (IsReversedCC && !MRI->hasOneNonDBGUse(DefReg))
      return false;

    if (!optimizeSCC(Def, &CmpInstr, /*NeedInversion=*/false))
      return false;

    if (!MRI->use_nodbg_empty(DefReg)) {
      assert(!IsReversedCC);
      return true;
    }

    // Replace an AND whose result is unused with an S_BITCMP.
    MachineBasicBlock *MBB = Def->getParent();

    unsigned NewOpc = (SrcSize == 32) ? IsReversedCC ? AMDGPU::S_BITCMP0_B32
                                                     : AMDGPU::S_BITCMP1_B32
                                      : IsReversedCC ? AMDGPU::S_BITCMP0_B64
                                                     : AMDGPU::S_BITCMP1_B64;

    BuildMI(*MBB, Def, Def->getDebugLoc(), get(NewOpc))
        .add(*SrcOp)
        .addImm(BitNo);
    Def->eraseFromParent();

    return true;
  };

  switch (CmpInstr.getOpcode()) {
  default:
    break;
  case AMDGPU::S_CMP_EQ_U32:
  case AMDGPU::S_CMP_EQ_I32:
  case AMDGPU::S_CMPK_EQ_U32:
  case AMDGPU::S_CMPK_EQ_I32:
    return optimizeCmpAnd(1, 32, true, false) ||
           optimizeCmpSelect(/*NeedInversion=*/true);
  case AMDGPU::S_CMP_GE_U32:
  case AMDGPU::S_CMPK_GE_U32:
    return optimizeCmpAnd(1, 32, false, false);
  case AMDGPU::S_CMP_GE_I32:
  case AMDGPU::S_CMPK_GE_I32:
    return optimizeCmpAnd(1, 32, false, true);
  case AMDGPU::S_CMP_EQ_U64:
    return optimizeCmpAnd(1, 64, true, false);
  case AMDGPU::S_CMP_LG_U32:
  case AMDGPU::S_CMP_LG_I32:
  case AMDGPU::S_CMPK_LG_U32:
  case AMDGPU::S_CMPK_LG_I32:
    return optimizeCmpAnd(0, 32, true, false) ||
           optimizeCmpSelect(/*NeedInversion=*/false);
  case AMDGPU::S_CMP_GT_U32:
  case AMDGPU::S_CMPK_GT_U32:
    return optimizeCmpAnd(0, 32, false, false);
  case AMDGPU::S_CMP_GT_I32:
  case AMDGPU::S_CMPK_GT_I32:
    return optimizeCmpAnd(0, 32, false, true);
  case AMDGPU::S_CMP_LG_U64:
    return optimizeCmpAnd(0, 64, true, false) ||
           optimizeCmpSelect(/*NeedInversion=*/false);
  }

  return false;
}

void SIInstrInfo::enforceOperandRCAlignment(MachineInstr &MI,
                                            AMDGPU::OpName OpName) const {
  if (!ST.needsAlignedVGPRs())
    return;

  int OpNo = AMDGPU::getNamedOperandIdx(MI.getOpcode(), OpName);
  if (OpNo < 0)
    return;
  MachineOperand &Op = MI.getOperand(OpNo);
  if (getOpSize(MI, OpNo) > 4)
    return;

  // Add implicit aligned super-reg to force alignment on the data operand.
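  // The rewrite produces (illustrative MIR):
  //   %undef:vgpr_32 = IMPLICIT_DEF
  //   %new:vreg_64_align2 = REG_SEQUENCE %data, %subreg.sub0, %undef, %subreg.sub1
  // and the operand is rewritten to read %new.sub0.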
  const DebugLoc &DL = MI.getDebugLoc();
  MachineBasicBlock *BB = MI.getParent();
  MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
  Register DataReg = Op.getReg();
  bool IsAGPR = RI.isAGPR(MRI, DataReg);
  Register Undef = MRI.createVirtualRegister(
      IsAGPR ? &AMDGPU::AGPR_32RegClass : &AMDGPU::VGPR_32RegClass);
  BuildMI(*BB, MI, DL, get(AMDGPU::IMPLICIT_DEF), Undef);
  Register NewVR =
      MRI.createVirtualRegister(IsAGPR ? &AMDGPU::AReg_64_Align2RegClass
                                       : &AMDGPU::VReg_64_Align2RegClass);
  BuildMI(*BB, MI, DL, get(AMDGPU::REG_SEQUENCE), NewVR)
      .addReg(DataReg, {}, Op.getSubReg())
      .addImm(AMDGPU::sub0)
      .addReg(Undef)
      .addImm(AMDGPU::sub1);
  Op.setReg(NewVR);
  Op.setSubReg(AMDGPU::sub0);
  MI.addOperand(MachineOperand::CreateReg(NewVR, false, true));
}

bool SIInstrInfo::isGlobalMemoryObject(const MachineInstr *MI) const {
  if (isIGLP(*MI))
    return false;

  return TargetInstrInfo::isGlobalMemoryObject(MI);
}

bool SIInstrInfo::isXDLWMMA(const MachineInstr &MI) const {
  if (!isWMMA(MI) && !isSWMMAC(MI))
    return false;

  if (ST.hasGFX1250Insts())
    return AMDGPU::getWMMAIsXDL(MI.getOpcode());

  return true;
}

bool SIInstrInfo::isXDL(const MachineInstr &MI) const {
  unsigned Opcode = MI.getOpcode();

  if (AMDGPU::isGFX12Plus(ST))
    return isDOT(MI) || isXDLWMMA(MI);

  if (!isMAI(MI) || isDGEMM(Opcode) ||
      Opcode == AMDGPU::V_ACCVGPR_WRITE_B32_e64 ||
      Opcode == AMDGPU::V_ACCVGPR_READ_B32_e64)
    return false;

  if (!ST.hasGFX940Insts())
    return true;

  return AMDGPU::getMAIIsGFX940XDL(Opcode);
}