1//===- SIInstrInfo.cpp - SI Instruction Information ----------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9/// \file
10/// SI Implementation of TargetInstrInfo.
11//
12//===----------------------------------------------------------------------===//
13
14#include "SIInstrInfo.h"
15#include "AMDGPU.h"
16#include "AMDGPUInstrInfo.h"
17#include "AMDGPULaneMaskUtils.h"
18#include "GCNHazardRecognizer.h"
19#include "GCNSubtarget.h"
20#include "SIMachineFunctionInfo.h"
21#include "Utils/AMDGPUBaseInfo.h"
22#include "llvm/ADT/STLExtras.h"
23#include "llvm/Analysis/ValueTracking.h"
24#include "llvm/CodeGen/GlobalISel/GenericMachineInstrs.h"
25#include "llvm/CodeGen/LiveIntervals.h"
26#include "llvm/CodeGen/LiveVariables.h"
27#include "llvm/CodeGen/MachineCycleAnalysis.h"
28#include "llvm/CodeGen/MachineDominators.h"
29#include "llvm/CodeGen/MachineFrameInfo.h"
30#include "llvm/CodeGen/MachineScheduler.h"
31#include "llvm/CodeGen/RegisterScavenging.h"
32#include "llvm/CodeGen/ScheduleDAG.h"
33#include "llvm/IR/DiagnosticInfo.h"
34#include "llvm/IR/IntrinsicsAMDGPU.h"
35#include "llvm/MC/MCContext.h"
36#include "llvm/Support/CommandLine.h"
37#include "llvm/Target/TargetMachine.h"
38
39using namespace llvm;
40
41#define DEBUG_TYPE "si-instr-info"
42
43#define GET_INSTRINFO_CTOR_DTOR
44#include "AMDGPUGenInstrInfo.inc"
45
46namespace llvm::AMDGPU {
47#define GET_D16ImageDimIntrinsics_IMPL
48#define GET_ImageDimIntrinsicTable_IMPL
49#define GET_RsrcIntrinsics_IMPL
50#include "AMDGPUGenSearchableTables.inc"
51} // namespace llvm::AMDGPU
52
53// Must be at least 4 to be able to branch over minimum unconditional branch
54// code. This is only for making it possible to write reasonably small tests for
55// long branches.
56static cl::opt<unsigned>
57BranchOffsetBits("amdgpu-s-branch-bits", cl::ReallyHidden, cl::init(Val: 16),
58 cl::desc("Restrict range of branch instructions (DEBUG)"));
59
60static cl::opt<bool> Fix16BitCopies(
61 "amdgpu-fix-16-bit-physreg-copies",
62 cl::desc("Fix copies between 32 and 16 bit registers by extending to 32 bit"),
63 cl::init(Val: true),
64 cl::ReallyHidden);
65
66SIInstrInfo::SIInstrInfo(const GCNSubtarget &ST)
67 : AMDGPUGenInstrInfo(ST, RI, AMDGPU::ADJCALLSTACKUP,
68 AMDGPU::ADJCALLSTACKDOWN),
69 RI(ST), ST(ST) {
70 SchedModel.init(TSInfo: &ST);
71}
72
73//===----------------------------------------------------------------------===//
74// TargetInstrInfo callbacks
75//===----------------------------------------------------------------------===//
76
77static unsigned getNumOperandsNoGlue(SDNode *Node) {
78 unsigned N = Node->getNumOperands();
79 while (N && Node->getOperand(Num: N - 1).getValueType() == MVT::Glue)
80 --N;
81 return N;
82}
83
84/// Returns true if both nodes have the same value for the given
85/// operand \p Op, or if both nodes do not have this operand.
86static bool nodesHaveSameOperandValue(SDNode *N0, SDNode *N1,
87 AMDGPU::OpName OpName) {
88 unsigned Opc0 = N0->getMachineOpcode();
89 unsigned Opc1 = N1->getMachineOpcode();
90
91 int Op0Idx = AMDGPU::getNamedOperandIdx(Opcode: Opc0, Name: OpName);
92 int Op1Idx = AMDGPU::getNamedOperandIdx(Opcode: Opc1, Name: OpName);
93
94 if (Op0Idx == -1 && Op1Idx == -1)
95 return true;
96
97
98 if ((Op0Idx == -1 && Op1Idx != -1) ||
99 (Op1Idx == -1 && Op0Idx != -1))
100 return false;
101
102 // getNamedOperandIdx returns the index for the MachineInstr's operands,
103 // which includes the result as the first operand. We are indexing into the
104 // MachineSDNode's operands, so we need to skip the result operand to get
105 // the real index.
106 --Op0Idx;
107 --Op1Idx;
108
109 return N0->getOperand(Num: Op0Idx) == N1->getOperand(Num: Op1Idx);
110}
111
112static bool canRemat(const MachineInstr &MI) {
113
114 if (SIInstrInfo::isVOP1(MI) || SIInstrInfo::isVOP2(MI) ||
115 SIInstrInfo::isVOP3(MI) || SIInstrInfo::isSDWA(MI) ||
116 SIInstrInfo::isSALU(MI))
117 return true;
118
119 if (SIInstrInfo::isSMRD(MI)) {
120 return !MI.memoperands_empty() &&
121 llvm::all_of(Range: MI.memoperands(), P: [](const MachineMemOperand *MMO) {
122 return MMO->isLoad() && MMO->isInvariant();
123 });
124 }
125
126 return false;
127}
128
129bool SIInstrInfo::isReMaterializableImpl(
130 const MachineInstr &MI) const {
131
132 if (canRemat(MI)) {
133 // Normally VALU use of exec would block the rematerialization, but that
134 // is OK in this case to have an implicit exec read as all VALU do.
135 // We really want all of the generic logic for this except for this.
136
137 // Another potential implicit use is mode register. The core logic of
138 // the RA will not attempt rematerialization if mode is set anywhere
139 // in the function, otherwise it is safe since mode is not changed.
140
141 // There is difference to generic method which does not allow
142 // rematerialization if there are virtual register uses. We allow this,
143 // therefore this method includes SOP instructions as well.
144 if (!MI.hasImplicitDef() &&
145 MI.getNumImplicitOperands() == MI.getDesc().implicit_uses().size() &&
146 !MI.mayRaiseFPException())
147 return true;
148 }
149
150 return TargetInstrInfo::isReMaterializableImpl(MI);
151}
152
153// Returns true if the result of a VALU instruction depends on exec.
154bool SIInstrInfo::resultDependsOnExec(const MachineInstr &MI) const {
155 assert(isVALU(MI, /*AllowLDSDMA=*/true));
156
157 // If it is convergent it depends on EXEC.
158 if (MI.isConvergent())
159 return true;
160
161 // If it defines SGPR it depends on EXEC
162 const MachineRegisterInfo &MRI = MI.getMF()->getRegInfo();
163 for (const MachineOperand &Def : MI.defs()) {
164 if (!Def.isReg())
165 continue;
166
167 Register Reg = Def.getReg();
168 if (Reg && RI.isSGPRReg(MRI, Reg))
169 return true;
170 }
171
172 return false;
173}
174
175bool SIInstrInfo::isIgnorableUse(const MachineOperand &MO) const {
176 // Any implicit use of exec by VALU is not a real register read.
177 return MO.getReg() == AMDGPU::EXEC && MO.isImplicit() &&
178 isVALU(MI: *MO.getParent(), /*AllowLDSDMA=*/true) &&
179 !resultDependsOnExec(MI: *MO.getParent());
180}
181
182bool SIInstrInfo::isSafeToSink(MachineInstr &MI,
183 MachineBasicBlock *SuccToSinkTo,
184 MachineCycleInfo *CI) const {
185 // Allow sinking if MI edits lane mask (divergent i1 in sgpr).
186 if (MI.getOpcode() == AMDGPU::SI_IF_BREAK)
187 return true;
188
189 MachineRegisterInfo &MRI = MI.getMF()->getRegInfo();
190 // Check if sinking of MI would create temporal divergent use.
191 for (auto Op : MI.uses()) {
192 if (Op.isReg() && Op.getReg().isVirtual() &&
193 RI.isSGPRClass(RC: MRI.getRegClass(Reg: Op.getReg()))) {
194 MachineInstr *SgprDef = MRI.getVRegDef(Reg: Op.getReg());
195
196 // SgprDef defined inside cycle
197 MachineCycle *FromCycle = CI->getCycle(Block: SgprDef->getParent());
198 if (FromCycle == nullptr)
199 continue;
200
201 MachineCycle *ToCycle = CI->getCycle(Block: SuccToSinkTo);
202 // Check if there is a FromCycle that contains SgprDef's basic block but
203 // does not contain SuccToSinkTo and also has divergent exit condition.
204 while (FromCycle && !FromCycle->contains(C: ToCycle)) {
205 SmallVector<MachineBasicBlock *, 1> ExitingBlocks;
206 FromCycle->getExitingBlocks(TmpStorage&: ExitingBlocks);
207
208 // FromCycle has divergent exit condition.
209 for (MachineBasicBlock *ExitingBlock : ExitingBlocks) {
210 if (hasDivergentBranch(MBB: ExitingBlock))
211 return false;
212 }
213
214 FromCycle = FromCycle->getParentCycle();
215 }
216 }
217 }
218
219 return true;
220}
221
222bool SIInstrInfo::areLoadsFromSameBasePtr(SDNode *Load0, SDNode *Load1,
223 int64_t &Offset0,
224 int64_t &Offset1) const {
225 if (!Load0->isMachineOpcode() || !Load1->isMachineOpcode())
226 return false;
227
228 unsigned Opc0 = Load0->getMachineOpcode();
229 unsigned Opc1 = Load1->getMachineOpcode();
230
231 // Make sure both are actually loads.
232 if (!get(Opcode: Opc0).mayLoad() || !get(Opcode: Opc1).mayLoad())
233 return false;
234
235 // A mayLoad instruction without a def is not a load. Likely a prefetch.
236 if (!get(Opcode: Opc0).getNumDefs() || !get(Opcode: Opc1).getNumDefs())
237 return false;
238
239 if (isDS(Opcode: Opc0) && isDS(Opcode: Opc1)) {
240
241 // FIXME: Handle this case:
242 if (getNumOperandsNoGlue(Node: Load0) != getNumOperandsNoGlue(Node: Load1))
243 return false;
244
245 // Check base reg.
246 if (Load0->getOperand(Num: 0) != Load1->getOperand(Num: 0))
247 return false;
248
249 // Skip read2 / write2 variants for simplicity.
250 // TODO: We should report true if the used offsets are adjacent (excluded
251 // st64 versions).
252 int Offset0Idx = AMDGPU::getNamedOperandIdx(Opcode: Opc0, Name: AMDGPU::OpName::offset);
253 int Offset1Idx = AMDGPU::getNamedOperandIdx(Opcode: Opc1, Name: AMDGPU::OpName::offset);
254 if (Offset0Idx == -1 || Offset1Idx == -1)
255 return false;
256
257 // XXX - be careful of dataless loads
258 // getNamedOperandIdx returns the index for MachineInstrs. Since they
259 // include the output in the operand list, but SDNodes don't, we need to
260 // subtract the index by one.
261 Offset0Idx -= get(Opcode: Opc0).NumDefs;
262 Offset1Idx -= get(Opcode: Opc1).NumDefs;
263 Offset0 = Load0->getConstantOperandVal(Num: Offset0Idx);
264 Offset1 = Load1->getConstantOperandVal(Num: Offset1Idx);
265 return true;
266 }
267
268 if (isSMRD(Opcode: Opc0) && isSMRD(Opcode: Opc1)) {
269 // Skip time and cache invalidation instructions.
270 if (!AMDGPU::hasNamedOperand(Opcode: Opc0, NamedIdx: AMDGPU::OpName::sbase) ||
271 !AMDGPU::hasNamedOperand(Opcode: Opc1, NamedIdx: AMDGPU::OpName::sbase))
272 return false;
273
274 unsigned NumOps = getNumOperandsNoGlue(Node: Load0);
275 if (NumOps != getNumOperandsNoGlue(Node: Load1))
276 return false;
277
278 // Check base reg.
279 if (Load0->getOperand(Num: 0) != Load1->getOperand(Num: 0))
280 return false;
281
282 // Match register offsets, if both register and immediate offsets present.
283 assert(NumOps == 4 || NumOps == 5);
284 if (NumOps == 5 && Load0->getOperand(Num: 1) != Load1->getOperand(Num: 1))
285 return false;
286
287 const ConstantSDNode *Load0Offset =
288 dyn_cast<ConstantSDNode>(Val: Load0->getOperand(Num: NumOps - 3));
289 const ConstantSDNode *Load1Offset =
290 dyn_cast<ConstantSDNode>(Val: Load1->getOperand(Num: NumOps - 3));
291
292 if (!Load0Offset || !Load1Offset)
293 return false;
294
295 Offset0 = Load0Offset->getZExtValue();
296 Offset1 = Load1Offset->getZExtValue();
297 return true;
298 }
299
300 // MUBUF and MTBUF can access the same addresses.
301 if ((isMUBUF(Opcode: Opc0) || isMTBUF(Opcode: Opc0)) && (isMUBUF(Opcode: Opc1) || isMTBUF(Opcode: Opc1))) {
302
303 // MUBUF and MTBUF have vaddr at different indices.
304 if (!nodesHaveSameOperandValue(N0: Load0, N1: Load1, OpName: AMDGPU::OpName::soffset) ||
305 !nodesHaveSameOperandValue(N0: Load0, N1: Load1, OpName: AMDGPU::OpName::vaddr) ||
306 !nodesHaveSameOperandValue(N0: Load0, N1: Load1, OpName: AMDGPU::OpName::srsrc))
307 return false;
308
309 int OffIdx0 = AMDGPU::getNamedOperandIdx(Opcode: Opc0, Name: AMDGPU::OpName::offset);
310 int OffIdx1 = AMDGPU::getNamedOperandIdx(Opcode: Opc1, Name: AMDGPU::OpName::offset);
311
312 if (OffIdx0 == -1 || OffIdx1 == -1)
313 return false;
314
315 // getNamedOperandIdx returns the index for MachineInstrs. Since they
316 // include the output in the operand list, but SDNodes don't, we need to
317 // subtract the index by one.
318 OffIdx0 -= get(Opcode: Opc0).NumDefs;
319 OffIdx1 -= get(Opcode: Opc1).NumDefs;
320
321 SDValue Off0 = Load0->getOperand(Num: OffIdx0);
322 SDValue Off1 = Load1->getOperand(Num: OffIdx1);
323
324 // The offset might be a FrameIndexSDNode.
325 if (!isa<ConstantSDNode>(Val: Off0) || !isa<ConstantSDNode>(Val: Off1))
326 return false;
327
328 Offset0 = Off0->getAsZExtVal();
329 Offset1 = Off1->getAsZExtVal();
330 return true;
331 }
332
333 return false;
334}
335
336static bool isStride64(unsigned Opc) {
337 switch (Opc) {
338 case AMDGPU::DS_READ2ST64_B32:
339 case AMDGPU::DS_READ2ST64_B64:
340 case AMDGPU::DS_WRITE2ST64_B32:
341 case AMDGPU::DS_WRITE2ST64_B64:
342 return true;
343 default:
344 return false;
345 }
346}
347
348bool SIInstrInfo::getMemOperandsWithOffsetWidth(
349 const MachineInstr &LdSt, SmallVectorImpl<const MachineOperand *> &BaseOps,
350 int64_t &Offset, bool &OffsetIsScalable, LocationSize &Width,
351 const TargetRegisterInfo *TRI) const {
352 if (!LdSt.mayLoadOrStore())
353 return false;
354
355 unsigned Opc = LdSt.getOpcode();
356 OffsetIsScalable = false;
357 const MachineOperand *BaseOp, *OffsetOp;
358 int DataOpIdx;
359
360 if (isDS(MI: LdSt)) {
361 BaseOp = getNamedOperand(MI: LdSt, OperandName: AMDGPU::OpName::addr);
362 OffsetOp = getNamedOperand(MI: LdSt, OperandName: AMDGPU::OpName::offset);
363 if (OffsetOp) {
364 // Normal, single offset LDS instruction.
365 if (!BaseOp) {
366 // DS_CONSUME/DS_APPEND use M0 for the base address.
367 // TODO: find the implicit use operand for M0 and use that as BaseOp?
368 return false;
369 }
370 BaseOps.push_back(Elt: BaseOp);
371 Offset = OffsetOp->getImm();
372 // Get appropriate operand, and compute width accordingly.
373 DataOpIdx = AMDGPU::getNamedOperandIdx(Opcode: Opc, Name: AMDGPU::OpName::vdst);
374 if (DataOpIdx == -1)
375 DataOpIdx = AMDGPU::getNamedOperandIdx(Opcode: Opc, Name: AMDGPU::OpName::data0);
376 if (Opc == AMDGPU::DS_ATOMIC_ASYNC_BARRIER_ARRIVE_B64)
377 Width = LocationSize::precise(Value: 64);
378 else
379 Width = LocationSize::precise(Value: getOpSize(MI: LdSt, OpNo: DataOpIdx));
380 } else {
381 // The 2 offset instructions use offset0 and offset1 instead. We can treat
382 // these as a load with a single offset if the 2 offsets are consecutive.
383 // We will use this for some partially aligned loads.
384 const MachineOperand *Offset0Op =
385 getNamedOperand(MI: LdSt, OperandName: AMDGPU::OpName::offset0);
386 const MachineOperand *Offset1Op =
387 getNamedOperand(MI: LdSt, OperandName: AMDGPU::OpName::offset1);
388
389 unsigned Offset0 = Offset0Op->getImm() & 0xff;
390 unsigned Offset1 = Offset1Op->getImm() & 0xff;
391 if (Offset0 + 1 != Offset1)
392 return false;
393
394 // Each of these offsets is in element sized units, so we need to convert
395 // to bytes of the individual reads.
396
397 unsigned EltSize;
398 if (LdSt.mayLoad())
399 EltSize = TRI->getRegSizeInBits(RC: *getOpRegClass(MI: LdSt, OpNo: 0)) / 16;
400 else {
401 assert(LdSt.mayStore());
402 int Data0Idx = AMDGPU::getNamedOperandIdx(Opcode: Opc, Name: AMDGPU::OpName::data0);
403 EltSize = TRI->getRegSizeInBits(RC: *getOpRegClass(MI: LdSt, OpNo: Data0Idx)) / 8;
404 }
405
406 if (isStride64(Opc))
407 EltSize *= 64;
408
409 BaseOps.push_back(Elt: BaseOp);
410 Offset = EltSize * Offset0;
411 // Get appropriate operand(s), and compute width accordingly.
412 DataOpIdx = AMDGPU::getNamedOperandIdx(Opcode: Opc, Name: AMDGPU::OpName::vdst);
413 if (DataOpIdx == -1) {
414 DataOpIdx = AMDGPU::getNamedOperandIdx(Opcode: Opc, Name: AMDGPU::OpName::data0);
415 Width = LocationSize::precise(Value: getOpSize(MI: LdSt, OpNo: DataOpIdx));
416 DataOpIdx = AMDGPU::getNamedOperandIdx(Opcode: Opc, Name: AMDGPU::OpName::data1);
417 Width = LocationSize::precise(
418 Value: Width.getValue() + TypeSize::getFixed(ExactSize: getOpSize(MI: LdSt, OpNo: DataOpIdx)));
419 } else {
420 Width = LocationSize::precise(Value: getOpSize(MI: LdSt, OpNo: DataOpIdx));
421 }
422 }
423 return true;
424 }
425
426 if (isMUBUF(MI: LdSt) || isMTBUF(MI: LdSt)) {
427 const MachineOperand *RSrc = getNamedOperand(MI: LdSt, OperandName: AMDGPU::OpName::srsrc);
428 if (!RSrc) // e.g. BUFFER_WBINVL1_VOL
429 return false;
430 BaseOps.push_back(Elt: RSrc);
431 BaseOp = getNamedOperand(MI: LdSt, OperandName: AMDGPU::OpName::vaddr);
432 if (BaseOp && !BaseOp->isFI())
433 BaseOps.push_back(Elt: BaseOp);
434 const MachineOperand *OffsetImm =
435 getNamedOperand(MI: LdSt, OperandName: AMDGPU::OpName::offset);
436 Offset = OffsetImm->getImm();
437 const MachineOperand *SOffset =
438 getNamedOperand(MI: LdSt, OperandName: AMDGPU::OpName::soffset);
439 if (SOffset) {
440 if (SOffset->isReg())
441 BaseOps.push_back(Elt: SOffset);
442 else
443 Offset += SOffset->getImm();
444 }
445 // Get appropriate operand, and compute width accordingly.
446 DataOpIdx = AMDGPU::getNamedOperandIdx(Opcode: Opc, Name: AMDGPU::OpName::vdst);
447 if (DataOpIdx == -1)
448 DataOpIdx = AMDGPU::getNamedOperandIdx(Opcode: Opc, Name: AMDGPU::OpName::vdata);
449 if (DataOpIdx == -1) // LDS DMA
450 return false;
451 Width = LocationSize::precise(Value: getOpSize(MI: LdSt, OpNo: DataOpIdx));
452 return true;
453 }
454
455 if (isImage(MI: LdSt)) {
456 auto RsrcOpName =
457 isMIMG(MI: LdSt) ? AMDGPU::OpName::srsrc : AMDGPU::OpName::rsrc;
458 int SRsrcIdx = AMDGPU::getNamedOperandIdx(Opcode: Opc, Name: RsrcOpName);
459 BaseOps.push_back(Elt: &LdSt.getOperand(i: SRsrcIdx));
460 int VAddr0Idx = AMDGPU::getNamedOperandIdx(Opcode: Opc, Name: AMDGPU::OpName::vaddr0);
461 if (VAddr0Idx >= 0) {
462 // GFX10 possible NSA encoding.
463 for (int I = VAddr0Idx; I < SRsrcIdx; ++I)
464 BaseOps.push_back(Elt: &LdSt.getOperand(i: I));
465 } else {
466 BaseOps.push_back(Elt: getNamedOperand(MI: LdSt, OperandName: AMDGPU::OpName::vaddr));
467 }
468 Offset = 0;
469 // Get appropriate operand, and compute width accordingly.
470 DataOpIdx = AMDGPU::getNamedOperandIdx(Opcode: Opc, Name: AMDGPU::OpName::vdata);
471 if (DataOpIdx == -1)
472 return false; // no return sampler
473 Width = LocationSize::precise(Value: getOpSize(MI: LdSt, OpNo: DataOpIdx));
474 return true;
475 }
476
477 if (isSMRD(MI: LdSt)) {
478 BaseOp = getNamedOperand(MI: LdSt, OperandName: AMDGPU::OpName::sbase);
479 if (!BaseOp) // e.g. S_MEMTIME
480 return false;
481 BaseOps.push_back(Elt: BaseOp);
482 OffsetOp = getNamedOperand(MI: LdSt, OperandName: AMDGPU::OpName::offset);
483 Offset = OffsetOp ? OffsetOp->getImm() : 0;
484 // Get appropriate operand, and compute width accordingly.
485 DataOpIdx = AMDGPU::getNamedOperandIdx(Opcode: Opc, Name: AMDGPU::OpName::sdst);
486 if (DataOpIdx == -1)
487 return false;
488 Width = LocationSize::precise(Value: getOpSize(MI: LdSt, OpNo: DataOpIdx));
489 return true;
490 }
491
492 if (isFLAT(MI: LdSt)) {
493 // Instructions have either vaddr or saddr or both or none.
494 BaseOp = getNamedOperand(MI: LdSt, OperandName: AMDGPU::OpName::vaddr);
495 if (BaseOp)
496 BaseOps.push_back(Elt: BaseOp);
497 BaseOp = getNamedOperand(MI: LdSt, OperandName: AMDGPU::OpName::saddr);
498 if (BaseOp)
499 BaseOps.push_back(Elt: BaseOp);
500 Offset = getNamedOperand(MI: LdSt, OperandName: AMDGPU::OpName::offset)->getImm();
501 // Get appropriate operand, and compute width accordingly.
502 DataOpIdx = AMDGPU::getNamedOperandIdx(Opcode: Opc, Name: AMDGPU::OpName::vdst);
503 if (DataOpIdx == -1)
504 DataOpIdx = AMDGPU::getNamedOperandIdx(Opcode: Opc, Name: AMDGPU::OpName::vdata);
505 if (DataOpIdx == -1) // LDS DMA
506 return false;
507 Width = LocationSize::precise(Value: getOpSize(MI: LdSt, OpNo: DataOpIdx));
508 return true;
509 }
510
511 return false;
512}
513
514static bool memOpsHaveSameBasePtr(const MachineInstr &MI1,
515 ArrayRef<const MachineOperand *> BaseOps1,
516 const MachineInstr &MI2,
517 ArrayRef<const MachineOperand *> BaseOps2) {
518 // Only examine the first "base" operand of each instruction, on the
519 // assumption that it represents the real base address of the memory access.
520 // Other operands are typically offsets or indices from this base address.
521 if (BaseOps1.front()->isIdenticalTo(Other: *BaseOps2.front()))
522 return true;
523
524 if (!MI1.hasOneMemOperand() || !MI2.hasOneMemOperand())
525 return false;
526
527 auto *MO1 = *MI1.memoperands_begin();
528 auto *MO2 = *MI2.memoperands_begin();
529 if (MO1->getAddrSpace() != MO2->getAddrSpace())
530 return false;
531
532 const auto *Base1 = MO1->getValue();
533 const auto *Base2 = MO2->getValue();
534 if (!Base1 || !Base2)
535 return false;
536 Base1 = getUnderlyingObject(V: Base1);
537 Base2 = getUnderlyingObject(V: Base2);
538
539 if (isa<UndefValue>(Val: Base1) || isa<UndefValue>(Val: Base2))
540 return false;
541
542 return Base1 == Base2;
543}
544
545bool SIInstrInfo::shouldClusterMemOps(ArrayRef<const MachineOperand *> BaseOps1,
546 int64_t Offset1, bool OffsetIsScalable1,
547 ArrayRef<const MachineOperand *> BaseOps2,
548 int64_t Offset2, bool OffsetIsScalable2,
549 unsigned ClusterSize,
550 unsigned NumBytes) const {
551 // If the mem ops (to be clustered) do not have the same base ptr, then they
552 // should not be clustered
553 unsigned MaxMemoryClusterDWords = DefaultMemoryClusterDWordsLimit;
554 if (!BaseOps1.empty() && !BaseOps2.empty()) {
555 const MachineInstr &FirstLdSt = *BaseOps1.front()->getParent();
556 const MachineInstr &SecondLdSt = *BaseOps2.front()->getParent();
557 if (!memOpsHaveSameBasePtr(MI1: FirstLdSt, BaseOps1, MI2: SecondLdSt, BaseOps2))
558 return false;
559
560 const SIMachineFunctionInfo *MFI =
561 FirstLdSt.getMF()->getInfo<SIMachineFunctionInfo>();
562 MaxMemoryClusterDWords = MFI->getMaxMemoryClusterDWords();
563 } else if (!BaseOps1.empty() || !BaseOps2.empty()) {
564 // If only one base op is empty, they do not have the same base ptr
565 return false;
566 }
567
568 // In order to avoid register pressure, on an average, the number of DWORDS
569 // loaded together by all clustered mem ops should not exceed
570 // MaxMemoryClusterDWords. This is an empirical value based on certain
571 // observations and performance related experiments.
572 // The good thing about this heuristic is - it avoids clustering of too many
573 // sub-word loads, and also avoids clustering of wide loads. Below is the
574 // brief summary of how the heuristic behaves for various `LoadSize` when
575 // MaxMemoryClusterDWords is 8.
576 //
577 // (1) 1 <= LoadSize <= 4: cluster at max 8 mem ops
578 // (2) 5 <= LoadSize <= 8: cluster at max 4 mem ops
579 // (3) 9 <= LoadSize <= 12: cluster at max 2 mem ops
580 // (4) 13 <= LoadSize <= 16: cluster at max 2 mem ops
581 // (5) LoadSize >= 17: do not cluster
582 const unsigned LoadSize = NumBytes / ClusterSize;
583 const unsigned NumDWords = ((LoadSize + 3) / 4) * ClusterSize;
584 return NumDWords <= MaxMemoryClusterDWords;
585}
586
587// FIXME: This behaves strangely. If, for example, you have 32 load + stores,
588// the first 16 loads will be interleaved with the stores, and the next 16 will
589// be clustered as expected. It should really split into 2 16 store batches.
590//
591// Loads are clustered until this returns false, rather than trying to schedule
592// groups of stores. This also means we have to deal with saying different
593// address space loads should be clustered, and ones which might cause bank
594// conflicts.
595//
596// This might be deprecated so it might not be worth that much effort to fix.
597bool SIInstrInfo::shouldScheduleLoadsNear(SDNode *Load0, SDNode *Load1,
598 int64_t Offset0, int64_t Offset1,
599 unsigned NumLoads) const {
600 assert(Offset1 > Offset0 &&
601 "Second offset should be larger than first offset!");
602 // If we have less than 16 loads in a row, and the offsets are within 64
603 // bytes, then schedule together.
604
605 // A cacheline is 64 bytes (for global memory).
606 return (NumLoads <= 16 && (Offset1 - Offset0) < 64);
607}
608
609static void reportIllegalCopy(const SIInstrInfo *TII, MachineBasicBlock &MBB,
610 MachineBasicBlock::iterator MI,
611 const DebugLoc &DL, MCRegister DestReg,
612 MCRegister SrcReg, bool KillSrc,
613 const char *Msg = "illegal VGPR to SGPR copy") {
614 MachineFunction *MF = MBB.getParent();
615
616 LLVMContext &C = MF->getFunction().getContext();
617 C.diagnose(DI: DiagnosticInfoUnsupported(MF->getFunction(), Msg, DL, DS_Error));
618
619 BuildMI(BB&: MBB, I: MI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::SI_ILLEGAL_COPY), DestReg)
620 .addReg(RegNo: SrcReg, Flags: getKillRegState(B: KillSrc));
621}
622
623/// Handle copying from SGPR to AGPR, or from AGPR to AGPR on GFX908. It is not
624/// possible to have a direct copy in these cases on GFX908, so an intermediate
625/// VGPR copy is required.
626static void indirectCopyToAGPR(const SIInstrInfo &TII,
627 MachineBasicBlock &MBB,
628 MachineBasicBlock::iterator MI,
629 const DebugLoc &DL, MCRegister DestReg,
630 MCRegister SrcReg, bool KillSrc,
631 RegScavenger &RS, bool RegsOverlap,
632 Register ImpDefSuperReg = Register(),
633 Register ImpUseSuperReg = Register()) {
634 assert((TII.getSubtarget().hasMAIInsts() &&
635 !TII.getSubtarget().hasGFX90AInsts()) &&
636 "Expected GFX908 subtarget.");
637
638 assert((AMDGPU::SReg_32RegClass.contains(SrcReg) ||
639 AMDGPU::AGPR_32RegClass.contains(SrcReg)) &&
640 "Source register of the copy should be either an SGPR or an AGPR.");
641
642 assert(AMDGPU::AGPR_32RegClass.contains(DestReg) &&
643 "Destination register of the copy should be an AGPR.");
644
645 const SIRegisterInfo &RI = TII.getRegisterInfo();
646
647 // First try to find defining accvgpr_write to avoid temporary registers.
648 // In the case of copies of overlapping AGPRs, we conservatively do not
649 // reuse previous accvgpr_writes. Otherwise, we may incorrectly pick up
650 // an accvgpr_write used for this same copy due to implicit-defs
651 if (!RegsOverlap) {
652 for (auto Def = MI, E = MBB.begin(); Def != E; ) {
653 --Def;
654
655 if (!Def->modifiesRegister(Reg: SrcReg, TRI: &RI))
656 continue;
657
658 if (Def->getOpcode() != AMDGPU::V_ACCVGPR_WRITE_B32_e64 ||
659 Def->getOperand(i: 0).getReg() != SrcReg)
660 break;
661
662 MachineOperand &DefOp = Def->getOperand(i: 1);
663 assert(DefOp.isReg() || DefOp.isImm());
664
665 if (DefOp.isReg()) {
666 bool SafeToPropagate = true;
667 // Check that register source operand is not clobbered before MI.
668 // Immediate operands are always safe to propagate.
669 for (auto I = Def; I != MI && SafeToPropagate; ++I)
670 if (I->modifiesRegister(Reg: DefOp.getReg(), TRI: &RI))
671 SafeToPropagate = false;
672
673 if (!SafeToPropagate)
674 break;
675
676 for (auto I = Def; I != MI; ++I)
677 I->clearRegisterKills(Reg: DefOp.getReg(), RegInfo: &RI);
678 }
679
680 MachineInstrBuilder Builder =
681 BuildMI(BB&: MBB, I: MI, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::V_ACCVGPR_WRITE_B32_e64), DestReg)
682 .add(MO: DefOp);
683 if (ImpDefSuperReg)
684 Builder.addReg(RegNo: ImpDefSuperReg, Flags: RegState::Define | RegState::Implicit);
685
686 if (ImpUseSuperReg) {
687 Builder.addReg(RegNo: ImpUseSuperReg,
688 Flags: getKillRegState(B: KillSrc) | RegState::Implicit);
689 }
690
691 return;
692 }
693 }
694
695 RS.enterBasicBlockEnd(MBB);
696 RS.backward(I: std::next(x: MI));
697
698 // Ideally we want to have three registers for a long reg_sequence copy
699 // to hide 2 waitstates between v_mov_b32 and accvgpr_write.
700 unsigned MaxVGPRs = RI.getRegPressureLimit(RC: &AMDGPU::VGPR_32RegClass,
701 MF&: *MBB.getParent());
702
703 // Registers in the sequence are allocated contiguously so we can just
704 // use register number to pick one of three round-robin temps.
705 unsigned RegNo = (DestReg - AMDGPU::AGPR0) % 3;
706 Register Tmp =
707 MBB.getParent()->getInfo<SIMachineFunctionInfo>()->getVGPRForAGPRCopy();
708 assert(MBB.getParent()->getRegInfo().isReserved(Tmp) &&
709 "VGPR used for an intermediate copy should have been reserved.");
710
711 // Only loop through if there are any free registers left. We don't want to
712 // spill.
713 while (RegNo--) {
714 Register Tmp2 = RS.scavengeRegisterBackwards(RC: AMDGPU::VGPR_32RegClass, To: MI,
715 /* RestoreAfter */ false, SPAdj: 0,
716 /* AllowSpill */ false);
717 if (!Tmp2 || RI.getHWRegIndex(Reg: Tmp2) >= MaxVGPRs)
718 break;
719 Tmp = Tmp2;
720 RS.setRegUsed(Reg: Tmp);
721 }
722
723 // Insert copy to temporary VGPR.
724 unsigned TmpCopyOp = AMDGPU::V_MOV_B32_e32;
725 if (AMDGPU::AGPR_32RegClass.contains(Reg: SrcReg)) {
726 TmpCopyOp = AMDGPU::V_ACCVGPR_READ_B32_e64;
727 } else {
728 assert(AMDGPU::SReg_32RegClass.contains(SrcReg));
729 }
730
731 MachineInstrBuilder UseBuilder = BuildMI(BB&: MBB, I: MI, MIMD: DL, MCID: TII.get(Opcode: TmpCopyOp), DestReg: Tmp)
732 .addReg(RegNo: SrcReg, Flags: getKillRegState(B: KillSrc));
733 if (ImpUseSuperReg) {
734 UseBuilder.addReg(RegNo: ImpUseSuperReg,
735 Flags: getKillRegState(B: KillSrc) | RegState::Implicit);
736 }
737
738 MachineInstrBuilder DefBuilder
739 = BuildMI(BB&: MBB, I: MI, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::V_ACCVGPR_WRITE_B32_e64), DestReg)
740 .addReg(RegNo: Tmp, Flags: RegState::Kill);
741
742 if (ImpDefSuperReg)
743 DefBuilder.addReg(RegNo: ImpDefSuperReg, Flags: RegState::Define | RegState::Implicit);
744}
745
746static void expandSGPRCopy(const SIInstrInfo &TII, MachineBasicBlock &MBB,
747 MachineBasicBlock::iterator MI, const DebugLoc &DL,
748 MCRegister DestReg, MCRegister SrcReg, bool KillSrc,
749 const TargetRegisterClass *RC, bool Forward) {
750 const SIRegisterInfo &RI = TII.getRegisterInfo();
751 ArrayRef<int16_t> BaseIndices = RI.getRegSplitParts(RC, EltSize: 4);
752 MachineBasicBlock::iterator I = MI;
753 MachineInstr *FirstMI = nullptr, *LastMI = nullptr;
754
755 for (unsigned Idx = 0; Idx < BaseIndices.size(); ++Idx) {
756 int16_t SubIdx = BaseIndices[Idx];
757 Register DestSubReg = RI.getSubReg(Reg: DestReg, Idx: SubIdx);
758 Register SrcSubReg = RI.getSubReg(Reg: SrcReg, Idx: SubIdx);
759 assert(DestSubReg && SrcSubReg && "Failed to find subregs!");
760 unsigned Opcode = AMDGPU::S_MOV_B32;
761
762 // Is SGPR aligned? If so try to combine with next.
763 bool AlignedDest = ((DestSubReg - AMDGPU::SGPR0) % 2) == 0;
764 bool AlignedSrc = ((SrcSubReg - AMDGPU::SGPR0) % 2) == 0;
765 if (AlignedDest && AlignedSrc && (Idx + 1 < BaseIndices.size())) {
766 // Can use SGPR64 copy
767 unsigned Channel = RI.getChannelFromSubReg(SubReg: SubIdx);
768 SubIdx = RI.getSubRegFromChannel(Channel, NumRegs: 2);
769 DestSubReg = RI.getSubReg(Reg: DestReg, Idx: SubIdx);
770 SrcSubReg = RI.getSubReg(Reg: SrcReg, Idx: SubIdx);
771 assert(DestSubReg && SrcSubReg && "Failed to find subregs!");
772 Opcode = AMDGPU::S_MOV_B64;
773 Idx++;
774 }
775
776 LastMI = BuildMI(BB&: MBB, I, MIMD: DL, MCID: TII.get(Opcode), DestReg: DestSubReg)
777 .addReg(RegNo: SrcSubReg)
778 .addReg(RegNo: SrcReg, Flags: RegState::Implicit);
779
780 if (!FirstMI)
781 FirstMI = LastMI;
782
783 if (!Forward)
784 I--;
785 }
786
787 assert(FirstMI && LastMI);
788 if (!Forward)
789 std::swap(a&: FirstMI, b&: LastMI);
790
791 FirstMI->addOperand(
792 Op: MachineOperand::CreateReg(Reg: DestReg, isDef: true /*IsDef*/, isImp: true /*IsImp*/));
793
794 if (KillSrc)
795 LastMI->addRegisterKilled(IncomingReg: SrcReg, RegInfo: &RI);
796}
797
/// Emit the instruction(s) that copy physical register \p SrcReg into
/// \p DestReg, inserted before \p MI in \p MBB with debug location \p DL.
///
/// The opcode is chosen from the base register classes of the two registers.
/// Single-instruction cases are handled first (32-bit VGPR/SGPR/AGPR, 64-bit
/// SGPR, SCC, 16-bit registers, 64-bit VGPR); everything else is split into
/// sub-register copies at the end of the function. Unsupported combinations
/// are passed to reportIllegalCopy().
///
/// \p KillSrc requests a kill flag on the source; when the copy is split it is
/// only applied to the final sub-copy, and only if source and destination do
/// not overlap. \p RenamableDest and \p RenamableSrc are not read here.
798void SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
799 MachineBasicBlock::iterator MI,
800 const DebugLoc &DL, Register DestReg,
801 Register SrcReg, bool KillSrc, bool RenamableDest,
802 bool RenamableSrc) const {
803 const TargetRegisterClass *RC = RI.getPhysRegBaseClass(Reg: DestReg);
804 unsigned Size = RI.getRegSizeInBits(RC: *RC);
805 const TargetRegisterClass *SrcRC = RI.getPhysRegBaseClass(Reg: SrcReg);
806 unsigned SrcSize = RI.getRegSizeInBits(RC: *SrcRC);
807
808 // The rest of copyPhysReg assumes Src and Dst size are the same size.
809 // TODO-GFX11_16BIT If all true 16 bit instruction patterns are completed can
810 // we remove Fix16BitCopies and this code block?
811 if (Fix16BitCopies) {
812 if (((Size == 16) != (SrcSize == 16))) {
813 // Non-VGPR Src and Dst will later be expanded back to 32 bits.
814 assert(ST.useRealTrue16Insts());
 // Narrow the non-16-bit side to its lo16 sub-register. RegToFix is a
 // reference, so this rewrites DestReg or SrcReg in place.
815 Register &RegToFix = (Size == 32) ? DestReg : SrcReg;
816 MCRegister SubReg = RI.getSubReg(Reg: RegToFix, Idx: AMDGPU::lo16);
817 RegToFix = SubReg;
818
819 if (DestReg == SrcReg) {
820 // Identity copy. Insert empty bundle since ExpandPostRA expects an
821 // instruction here.
822 BuildMI(BB&: MBB, I: MI, MIMD: DL, MCID: get(Opcode: AMDGPU::BUNDLE));
823 return;
824 }
 // One of the registers changed; refresh the cached classes and sizes.
825 RC = RI.getPhysRegBaseClass(Reg: DestReg);
826 Size = RI.getRegSizeInBits(RC: *RC);
827 SrcRC = RI.getPhysRegBaseClass(Reg: SrcReg);
828 SrcSize = RI.getRegSizeInBits(RC: *SrcRC);
829 }
830 }
831
 // 32-bit VGPR destination: V_ACCVGPR_READ_B32 when the source is an AGPR,
 // V_MOV_B32 otherwise.
832 if (RC == &AMDGPU::VGPR_32RegClass) {
833 assert(AMDGPU::VGPR_32RegClass.contains(SrcReg) ||
834 AMDGPU::SReg_32RegClass.contains(SrcReg) ||
835 AMDGPU::AGPR_32RegClass.contains(SrcReg));
836 unsigned Opc = AMDGPU::AGPR_32RegClass.contains(Reg: SrcReg) ?
837 AMDGPU::V_ACCVGPR_READ_B32_e64 : AMDGPU::V_MOV_B32_e32;
838 BuildMI(BB&: MBB, I: MI, MIMD: DL, MCID: get(Opcode: Opc), DestReg)
839 .addReg(RegNo: SrcReg, Flags: getKillRegState(B: KillSrc));
840 return;
841 }
842
 // 32-bit SGPR destination.
843 if (RC == &AMDGPU::SReg_32_XM0RegClass ||
844 RC == &AMDGPU::SReg_32RegClass) {
 // An SCC source is turned into S_CSELECT_B32 with immediates 1 and 0.
845 if (SrcReg == AMDGPU::SCC) {
846 BuildMI(BB&: MBB, I: MI, MIMD: DL, MCID: get(Opcode: AMDGPU::S_CSELECT_B32), DestReg)
847 .addImm(Val: 1)
848 .addImm(Val: 0);
849 return;
850 }
851
852 if (!AMDGPU::SReg_32RegClass.contains(Reg: SrcReg)) {
853 if (DestReg == AMDGPU::VCC_LO) {
854 // FIXME: Hack until VReg_1 removed.
855 assert(AMDGPU::VGPR_32RegClass.contains(SrcReg));
856 BuildMI(BB&: MBB, I: MI, MIMD: DL, MCID: get(Opcode: AMDGPU::V_CMP_NE_U32_e32))
857 .addImm(Val: 0)
858 .addReg(RegNo: SrcReg, Flags: getKillRegState(B: KillSrc));
859 return;
860 }
861
862 reportIllegalCopy(TII: this, MBB, MI, DL, DestReg, SrcReg, KillSrc);
863 return;
864 }
865
866 BuildMI(BB&: MBB, I: MI, MIMD: DL, MCID: get(Opcode: AMDGPU::S_MOV_B32), DestReg)
867 .addReg(RegNo: SrcReg, Flags: getKillRegState(B: KillSrc));
868 return;
869 }
870
 // 64-bit SGPR destination; same structure as the 32-bit SGPR case above.
871 if (RC == &AMDGPU::SReg_64RegClass) {
872 if (SrcReg == AMDGPU::SCC) {
873 BuildMI(BB&: MBB, I: MI, MIMD: DL, MCID: get(Opcode: AMDGPU::S_CSELECT_B64), DestReg)
874 .addImm(Val: 1)
875 .addImm(Val: 0);
876 return;
877 }
878
879 if (!AMDGPU::SReg_64_EncodableRegClass.contains(Reg: SrcReg)) {
880 if (DestReg == AMDGPU::VCC) {
881 // FIXME: Hack until VReg_1 removed.
882 assert(AMDGPU::VGPR_32RegClass.contains(SrcReg));
883 BuildMI(BB&: MBB, I: MI, MIMD: DL, MCID: get(Opcode: AMDGPU::V_CMP_NE_U32_e32))
884 .addImm(Val: 0)
885 .addReg(RegNo: SrcReg, Flags: getKillRegState(B: KillSrc));
886 return;
887 }
888
889 reportIllegalCopy(TII: this, MBB, MI, DL, DestReg, SrcReg, KillSrc);
890 return;
891 }
892
893 BuildMI(BB&: MBB, I: MI, MIMD: DL, MCID: get(Opcode: AMDGPU::S_MOV_B64), DestReg)
894 .addReg(RegNo: SrcReg, Flags: getKillRegState(B: KillSrc));
895 return;
896 }
897
 // SCC destination: emit a compare of the source against 0 (no explicit
 // destination operand is added).
898 if (DestReg == AMDGPU::SCC) {
899 // Copying 64-bit or 32-bit sources to SCC barely makes sense,
900 // but SelectionDAG emits such copies for i1 sources.
901 if (AMDGPU::SReg_64RegClass.contains(Reg: SrcReg)) {
902 // This copy can only be produced by patterns
903 // with explicit SCC, which are known to be enabled
904 // only for subtargets with S_CMP_LG_U64 present.
905 assert(ST.hasScalarCompareEq64());
906 BuildMI(BB&: MBB, I: MI, MIMD: DL, MCID: get(Opcode: AMDGPU::S_CMP_LG_U64))
907 .addReg(RegNo: SrcReg, Flags: getKillRegState(B: KillSrc))
908 .addImm(Val: 0);
909 } else {
910 assert(AMDGPU::SReg_32RegClass.contains(SrcReg));
911 BuildMI(BB&: MBB, I: MI, MIMD: DL, MCID: get(Opcode: AMDGPU::S_CMP_LG_U32))
912 .addReg(RegNo: SrcReg, Flags: getKillRegState(B: KillSrc))
913 .addImm(Val: 0);
914 }
915
916 return;
917 }
918
 // 32-bit AGPR destination: a direct write/move where one applies, otherwise
 // fall back to indirectCopyToAGPR() with a fresh scavenger.
919 if (RC == &AMDGPU::AGPR_32RegClass) {
920 if (AMDGPU::VGPR_32RegClass.contains(Reg: SrcReg) ||
921 (ST.hasGFX90AInsts() && AMDGPU::SReg_32RegClass.contains(Reg: SrcReg))) {
922 BuildMI(BB&: MBB, I: MI, MIMD: DL, MCID: get(Opcode: AMDGPU::V_ACCVGPR_WRITE_B32_e64), DestReg)
923 .addReg(RegNo: SrcReg, Flags: getKillRegState(B: KillSrc));
924 return;
925 }
926
927 if (AMDGPU::AGPR_32RegClass.contains(Reg: SrcReg) && ST.hasGFX90AInsts()) {
928 BuildMI(BB&: MBB, I: MI, MIMD: DL, MCID: get(Opcode: AMDGPU::V_ACCVGPR_MOV_B32), DestReg)
929 .addReg(RegNo: SrcReg, Flags: getKillRegState(B: KillSrc));
930 return;
931 }
932
933 // FIXME: Pass should maintain scavenger to avoid scan through the block on
934 // every AGPR spill.
935 RegScavenger RS;
936 const bool Overlap = RI.regsOverlap(RegA: SrcReg, RegB: DestReg);
937 indirectCopyToAGPR(TII: *this, MBB, MI, DL, DestReg, SrcReg, KillSrc, RS, RegsOverlap: Overlap);
938 return;
939 }
940
 // 16-bit copies.
941 if (Size == 16) {
942 assert(AMDGPU::VGPR_16RegClass.contains(SrcReg) ||
943 AMDGPU::SReg_LO16RegClass.contains(SrcReg) ||
944 AMDGPU::AGPR_LO16RegClass.contains(SrcReg));
945
946 bool IsSGPRDst = AMDGPU::SReg_LO16RegClass.contains(Reg: DestReg);
947 bool IsSGPRSrc = AMDGPU::SReg_LO16RegClass.contains(Reg: SrcReg);
948 bool IsAGPRDst = AMDGPU::AGPR_LO16RegClass.contains(Reg: DestReg);
949 bool IsAGPRSrc = AMDGPU::AGPR_LO16RegClass.contains(Reg: SrcReg);
950 bool DstLow = !AMDGPU::isHi16Reg(Reg: DestReg, MRI: RI);
951 bool SrcLow = !AMDGPU::isHi16Reg(Reg: SrcReg, MRI: RI);
952 MCRegister NewDestReg = RI.get32BitRegister(Reg: DestReg);
953 MCRegister NewSrcReg = RI.get32BitRegister(Reg: SrcReg);
954
 // SGPR destination: copy the whole 32-bit register with S_MOV_B32.
955 if (IsSGPRDst) {
956 if (!IsSGPRSrc) {
957 reportIllegalCopy(TII: this, MBB, MI, DL, DestReg, SrcReg, KillSrc);
958 return;
959 }
960
961 BuildMI(BB&: MBB, I: MI, MIMD: DL, MCID: get(Opcode: AMDGPU::S_MOV_B32), DestReg: NewDestReg)
962 .addReg(RegNo: NewSrcReg, Flags: getKillRegState(B: KillSrc));
963 return;
964 }
965
 // AGPR on either side: recurse on the 32-bit registers.
 // NOTE(review): there is no return after reportIllegalCopy() here, so the
 // recursive 32-bit copy below is still emitted - confirm this is intended.
966 if (IsAGPRDst || IsAGPRSrc) {
967 if (!DstLow || !SrcLow) {
968 reportIllegalCopy(TII: this, MBB, MI, DL, DestReg, SrcReg, KillSrc,
969 Msg: "Cannot use hi16 subreg with an AGPR!");
970 }
971
972 copyPhysReg(MBB, MI, DL, DestReg: NewDestReg, SrcReg: NewSrcReg, KillSrc);
973 return;
974 }
975
 // NOTE(review): KillSrc is not forwarded to the source operand of either
 // V_MOV_B16 form below - confirm whether that is deliberate.
976 if (ST.useRealTrue16Insts()) {
977 if (IsSGPRSrc) {
978 assert(SrcLow);
979 SrcReg = NewSrcReg;
980 }
981 // Use the smaller instruction encoding if possible.
982 if (AMDGPU::VGPR_16_Lo128RegClass.contains(Reg: DestReg) &&
983 (IsSGPRSrc || AMDGPU::VGPR_16_Lo128RegClass.contains(Reg: SrcReg))) {
984 BuildMI(BB&: MBB, I: MI, MIMD: DL, MCID: get(Opcode: AMDGPU::V_MOV_B16_t16_e32), DestReg)
985 .addReg(RegNo: SrcReg);
986 } else {
987 BuildMI(BB&: MBB, I: MI, MIMD: DL, MCID: get(Opcode: AMDGPU::V_MOV_B16_t16_e64), DestReg)
988 .addImm(Val: 0) // src0_modifiers
989 .addReg(RegNo: SrcReg)
990 .addImm(Val: 0); // op_sel
991 }
992 return;
993 }
994
 // SGPR source without SDWA scalar support: copy the full 32-bit register.
 // NOTE(review): as above, reportIllegalCopy() is not followed by a return.
995 if (IsSGPRSrc && !ST.hasSDWAScalar()) {
996 if (!DstLow || !SrcLow) {
997 reportIllegalCopy(TII: this, MBB, MI, DL, DestReg, SrcReg, KillSrc,
998 Msg: "Cannot use hi16 subreg on VI!");
999 }
1000
1001 BuildMI(BB&: MBB, I: MI, MIMD: DL, MCID: get(Opcode: AMDGPU::V_MOV_B32_e32), DestReg: NewDestReg)
1002 .addReg(RegNo: NewSrcReg, Flags: getKillRegState(B: KillSrc));
1003 return;
1004 }
1005
 // Remaining case: SDWA move that selects WORD_0/WORD_1 on each side and
 // uses UNUSED_PRESERVE for the destination, which is why the destination
 // is also added as a tied implicit use.
1006 auto MIB = BuildMI(BB&: MBB, I: MI, MIMD: DL, MCID: get(Opcode: AMDGPU::V_MOV_B32_sdwa), DestReg: NewDestReg)
1007 .addImm(Val: 0) // src0_modifiers
1008 .addReg(RegNo: NewSrcReg)
1009 .addImm(Val: 0) // clamp
1010 .addImm(Val: DstLow ? AMDGPU::SDWA::SdwaSel::WORD_0
1011 : AMDGPU::SDWA::SdwaSel::WORD_1)
1012 .addImm(Val: AMDGPU::SDWA::DstUnused::UNUSED_PRESERVE)
1013 .addImm(Val: SrcLow ? AMDGPU::SDWA::SdwaSel::WORD_0
1014 : AMDGPU::SDWA::SdwaSel::WORD_1)
1015 .addReg(RegNo: NewDestReg, Flags: RegState::Implicit | RegState::Undef);
1016 // First implicit operand is $exec.
1017 MIB->tieOperands(DefIdx: 0, UseIdx: MIB->getNumOperands() - 1);
1018 return;
1019 }
1020
 // 64-bit VGPR destination from the same class or an SGPR class: use a single
 // 64-bit move when the subtarget has one. Otherwise fall through to the
 // split copy below.
1021 if (RC == RI.getVGPR64Class() && (SrcRC == RC || RI.isSGPRClass(RC: SrcRC))) {
1022 if (ST.hasVMovB64Inst()) {
1023 BuildMI(BB&: MBB, I: MI, MIMD: DL, MCID: get(Opcode: AMDGPU::V_MOV_B64_e32), DestReg)
1024 .addReg(RegNo: SrcReg, Flags: getKillRegState(B: KillSrc));
1025 return;
1026 }
1027 if (ST.hasPkMovB32()) {
1028 BuildMI(BB&: MBB, I: MI, MIMD: DL, MCID: get(Opcode: AMDGPU::V_PK_MOV_B32), DestReg)
1029 .addImm(Val: SISrcMods::OP_SEL_1)
1030 .addReg(RegNo: SrcReg)
1031 .addImm(Val: SISrcMods::OP_SEL_0 | SISrcMods::OP_SEL_1)
1032 .addReg(RegNo: SrcReg)
1033 .addImm(Val: 0) // op_sel_lo
1034 .addImm(Val: 0) // op_sel_hi
1035 .addImm(Val: 0) // neg_lo
1036 .addImm(Val: 0) // neg_hi
1037 .addImm(Val: 0) // clamp
1038 .addReg(RegNo: SrcReg, Flags: getKillRegState(B: KillSrc) | RegState::Implicit);
1039 return;
1040 }
1041 }
1042
 // Everything below splits the copy into sub-register copies. Forward selects
 // whether the sub-register indices are visited in SubIndices order or in
 // reverse, based on the hardware indices of the two registers.
1043 const bool Forward = RI.getHWRegIndex(Reg: DestReg) <= RI.getHWRegIndex(Reg: SrcReg);
1044 if (RI.isSGPRClass(RC)) {
1045 if (!RI.isSGPRClass(RC: SrcRC)) {
1046 reportIllegalCopy(TII: this, MBB, MI, DL, DestReg, SrcReg, KillSrc);
1047 return;
1048 }
1049 const bool CanKillSuperReg = KillSrc && !RI.regsOverlap(RegA: SrcReg, RegB: DestReg);
1050 expandSGPRCopy(TII: *this, MBB, MI, DL, DestReg, SrcReg, KillSrc: CanKillSuperReg, RC,
1051 Forward);
1052 return;
1053 }
1054
 // Pick the per-element opcode and element size in bytes.
 // INSTRUCTION_LIST_END is used as a marker meaning "no direct opcode; go
 // through indirectCopyToAGPR()".
1055 unsigned EltSize = 4;
1056 unsigned Opcode = AMDGPU::V_MOV_B32_e32;
1057 if (RI.isAGPRClass(RC)) {
1058 if (ST.hasGFX90AInsts() && RI.isAGPRClass(RC: SrcRC))
1059 Opcode = AMDGPU::V_ACCVGPR_MOV_B32;
1060 else if (RI.hasVGPRs(RC: SrcRC) ||
1061 (ST.hasGFX90AInsts() && RI.isSGPRClass(RC: SrcRC)))
1062 Opcode = AMDGPU::V_ACCVGPR_WRITE_B32_e64;
1063 else
1064 Opcode = AMDGPU::INSTRUCTION_LIST_END;
1065 } else if (RI.hasVGPRs(RC) && RI.isAGPRClass(RC: SrcRC)) {
1066 Opcode = AMDGPU::V_ACCVGPR_READ_B32_e64;
1067 } else if ((Size % 64 == 0) && RI.hasVGPRs(RC) &&
1068 (RI.isProperlyAlignedRC(RC: *RC) &&
1069 (SrcRC == RC || RI.isSGPRClass(RC: SrcRC)))) {
1070 // TODO: In 96-bit case, could do a 64-bit mov and then a 32-bit mov.
1071 if (ST.hasVMovB64Inst()) {
1072 Opcode = AMDGPU::V_MOV_B64_e32;
1073 EltSize = 8;
1074 } else if (ST.hasPkMovB32()) {
1075 Opcode = AMDGPU::V_PK_MOV_B32;
1076 EltSize = 8;
1077 }
1078 }
1079
1080 // For the cases where we need an intermediate instruction/temporary register
1081 // (destination is an AGPR), we need a scavenger.
1082 //
1083 // FIXME: The pass should maintain this for us so we don't have to re-scan the
1084 // whole block for every handled copy.
1085 std::unique_ptr<RegScavenger> RS;
1086 if (Opcode == AMDGPU::INSTRUCTION_LIST_END)
1087 RS = std::make_unique<RegScavenger>();
1088
1089 ArrayRef<int16_t> SubIndices = RI.getRegSplitParts(RC, EltSize);
1090
1091 // If there is an overlap, we can't kill the super-register on the last
1092 // instruction, since it will also kill the components made live by this def.
1093 const bool Overlap = RI.regsOverlap(RegA: SrcReg, RegB: DestReg);
1094 const bool CanKillSuperReg = KillSrc && !Overlap;
1095
 // Emit one copy per sub-register. The first emitted copy carries an implicit
 // def of the full DestReg; every copy carries an implicit use of the full
 // SrcReg, killed only on the last one when CanKillSuperReg is set.
1096 for (unsigned Idx = 0; Idx < SubIndices.size(); ++Idx) {
1097 unsigned SubIdx;
1098 if (Forward)
1099 SubIdx = SubIndices[Idx];
1100 else
1101 SubIdx = SubIndices[SubIndices.size() - Idx - 1];
1102 Register DestSubReg = RI.getSubReg(Reg: DestReg, Idx: SubIdx);
1103 Register SrcSubReg = RI.getSubReg(Reg: SrcReg, Idx: SubIdx);
1104 assert(DestSubReg && SrcSubReg && "Failed to find subregs!");
1105
1106 bool IsFirstSubreg = Idx == 0;
1107 bool UseKill = CanKillSuperReg && Idx == SubIndices.size() - 1;
1108
1109 if (Opcode == AMDGPU::INSTRUCTION_LIST_END) {
1110 Register ImpDefSuper = IsFirstSubreg ? Register(DestReg) : Register();
1111 Register ImpUseSuper = SrcReg;
1112 indirectCopyToAGPR(TII: *this, MBB, MI, DL, DestReg: DestSubReg, SrcReg: SrcSubReg, KillSrc: UseKill,
1113 RS&: *RS, RegsOverlap: Overlap, ImpDefSuperReg: ImpDefSuper, ImpUseSuperReg: ImpUseSuper);
1114 } else if (Opcode == AMDGPU::V_PK_MOV_B32) {
1115 MachineInstrBuilder MIB =
1116 BuildMI(BB&: MBB, I: MI, MIMD: DL, MCID: get(Opcode: AMDGPU::V_PK_MOV_B32), DestReg: DestSubReg)
1117 .addImm(Val: SISrcMods::OP_SEL_1)
1118 .addReg(RegNo: SrcSubReg)
1119 .addImm(Val: SISrcMods::OP_SEL_0 | SISrcMods::OP_SEL_1)
1120 .addReg(RegNo: SrcSubReg)
1121 .addImm(Val: 0) // op_sel_lo
1122 .addImm(Val: 0) // op_sel_hi
1123 .addImm(Val: 0) // neg_lo
1124 .addImm(Val: 0) // neg_hi
1125 .addImm(Val: 0) // clamp
1126 .addReg(RegNo: SrcReg, Flags: getKillRegState(B: UseKill) | RegState::Implicit);
1127 if (IsFirstSubreg)
1128 MIB.addReg(RegNo: DestReg, Flags: RegState::Define | RegState::Implicit);
1129 } else {
1130 MachineInstrBuilder Builder =
1131 BuildMI(BB&: MBB, I: MI, MIMD: DL, MCID: get(Opcode), DestReg: DestSubReg).addReg(RegNo: SrcSubReg);
1132 if (IsFirstSubreg)
1133 Builder.addReg(RegNo: DestReg, Flags: RegState::Define | RegState::Implicit);
1134
1135 Builder.addReg(RegNo: SrcReg, Flags: getKillRegState(B: UseKill) | RegState::Implicit);
1136 }
1137 }
1138}
1139
1140int SIInstrInfo::commuteOpcode(unsigned Opcode) const {
1141 int32_t NewOpc;
1142
1143 // Try to map original to commuted opcode
1144 NewOpc = AMDGPU::getCommuteRev(Opcode);
1145 if (NewOpc != -1)
1146 // Check if the commuted (REV) opcode exists on the target.
1147 return pseudoToMCOpcode(Opcode: NewOpc) != -1 ? NewOpc : -1;
1148
1149 // Try to map commuted to original opcode
1150 NewOpc = AMDGPU::getCommuteOrig(Opcode);
1151 if (NewOpc != -1)
1152 // Check if the original (non-REV) opcode exists on the target.
1153 return pseudoToMCOpcode(Opcode: NewOpc) != -1 ? NewOpc : -1;
1154
1155 return Opcode;
1156}
1157
1158bool SIInstrInfo::getConstValDefinedInReg(const MachineInstr &MI,
1159 const Register Reg,
1160 int64_t &ImmVal) const {
1161 switch (MI.getOpcode()) {
1162 case AMDGPU::V_MOV_B32_e32:
1163 case AMDGPU::S_MOV_B32:
1164 case AMDGPU::S_MOVK_I32:
1165 case AMDGPU::S_MOV_B64:
1166 case AMDGPU::V_MOV_B64_e32:
1167 case AMDGPU::V_ACCVGPR_WRITE_B32_e64:
1168 case AMDGPU::AV_MOV_B32_IMM_PSEUDO:
1169 case AMDGPU::AV_MOV_B64_IMM_PSEUDO:
1170 case AMDGPU::S_MOV_B64_IMM_PSEUDO:
1171 case AMDGPU::V_MOV_B64_PSEUDO:
1172 case AMDGPU::V_MOV_B16_t16_e32: {
1173 const MachineOperand &Src0 = MI.getOperand(i: 1);
1174 if (Src0.isImm()) {
1175 ImmVal = Src0.getImm();
1176 return MI.getOperand(i: 0).getReg() == Reg;
1177 }
1178
1179 return false;
1180 }
1181 case AMDGPU::V_MOV_B16_t16_e64: {
1182 const MachineOperand &Src0 = MI.getOperand(i: 2);
1183 if (Src0.isImm() && !MI.getOperand(i: 1).getImm()) {
1184 ImmVal = Src0.getImm();
1185 return MI.getOperand(i: 0).getReg() == Reg;
1186 }
1187
1188 return false;
1189 }
1190 case AMDGPU::S_BREV_B32:
1191 case AMDGPU::V_BFREV_B32_e32:
1192 case AMDGPU::V_BFREV_B32_e64: {
1193 const MachineOperand &Src0 = MI.getOperand(i: 1);
1194 if (Src0.isImm()) {
1195 ImmVal = static_cast<int64_t>(reverseBits<int32_t>(Val: Src0.getImm()));
1196 return MI.getOperand(i: 0).getReg() == Reg;
1197 }
1198
1199 return false;
1200 }
1201 case AMDGPU::S_NOT_B32:
1202 case AMDGPU::V_NOT_B32_e32:
1203 case AMDGPU::V_NOT_B32_e64: {
1204 const MachineOperand &Src0 = MI.getOperand(i: 1);
1205 if (Src0.isImm()) {
1206 ImmVal = static_cast<int64_t>(~static_cast<int32_t>(Src0.getImm()));
1207 return MI.getOperand(i: 0).getReg() == Reg;
1208 }
1209
1210 return false;
1211 }
1212 default:
1213 return false;
1214 }
1215}
1216
1217std::optional<int64_t>
1218SIInstrInfo::getImmOrMaterializedImm(MachineOperand &Op) const {
1219 if (Op.isImm())
1220 return Op.getImm();
1221
1222 if (!Op.isReg() || !Op.getReg().isVirtual())
1223 return std::nullopt;
1224 MachineRegisterInfo &MRI = Op.getParent()->getMF()->getRegInfo();
1225 const MachineInstr *Def = MRI.getVRegDef(Reg: Op.getReg());
1226 if (Def && Def->isMoveImmediate()) {
1227 const MachineOperand &ImmSrc = Def->getOperand(i: 1);
1228 if (ImmSrc.isImm())
1229 return extractSubregFromImm(ImmVal: ImmSrc.getImm(), SubRegIndex: Op.getSubReg());
1230 }
1231
1232 return std::nullopt;
1233}
1234
1235unsigned SIInstrInfo::getMovOpcode(const TargetRegisterClass *DstRC) const {
1236
1237 if (RI.isAGPRClass(RC: DstRC))
1238 return AMDGPU::COPY;
1239 if (RI.getRegSizeInBits(RC: *DstRC) == 16) {
1240 // Assume hi bits are unneeded. Only _e64 true16 instructions are legal
1241 // before RA.
1242 return RI.isSGPRClass(RC: DstRC) ? AMDGPU::COPY : AMDGPU::V_MOV_B16_t16_e64;
1243 }
1244 if (RI.getRegSizeInBits(RC: *DstRC) == 32)
1245 return RI.isSGPRClass(RC: DstRC) ? AMDGPU::S_MOV_B32 : AMDGPU::V_MOV_B32_e32;
1246 if (RI.getRegSizeInBits(RC: *DstRC) == 64 && RI.isSGPRClass(RC: DstRC))
1247 return AMDGPU::S_MOV_B64;
1248 if (RI.getRegSizeInBits(RC: *DstRC) == 64 && !RI.isSGPRClass(RC: DstRC))
1249 return AMDGPU::V_MOV_B64_PSEUDO;
1250 return AMDGPU::COPY;
1251}
1252
1253const MCInstrDesc &
1254SIInstrInfo::getIndirectGPRIDXPseudo(unsigned VecSize,
1255 bool IsIndirectSrc) const {
1256 if (IsIndirectSrc) {
1257 if (VecSize <= 32) // 4 bytes
1258 return get(Opcode: AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V1);
1259 if (VecSize <= 64) // 8 bytes
1260 return get(Opcode: AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V2);
1261 if (VecSize <= 96) // 12 bytes
1262 return get(Opcode: AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V3);
1263 if (VecSize <= 128) // 16 bytes
1264 return get(Opcode: AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V4);
1265 if (VecSize <= 160) // 20 bytes
1266 return get(Opcode: AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V5);
1267 if (VecSize <= 192) // 24 bytes
1268 return get(Opcode: AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V6);
1269 if (VecSize <= 224) // 28 bytes
1270 return get(Opcode: AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V7);
1271 if (VecSize <= 256) // 32 bytes
1272 return get(Opcode: AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V8);
1273 if (VecSize <= 288) // 36 bytes
1274 return get(Opcode: AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V9);
1275 if (VecSize <= 320) // 40 bytes
1276 return get(Opcode: AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V10);
1277 if (VecSize <= 352) // 44 bytes
1278 return get(Opcode: AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V11);
1279 if (VecSize <= 384) // 48 bytes
1280 return get(Opcode: AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V12);
1281 if (VecSize <= 512) // 64 bytes
1282 return get(Opcode: AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V16);
1283 if (VecSize <= 1024) // 128 bytes
1284 return get(Opcode: AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V32);
1285
1286 llvm_unreachable("unsupported size for IndirectRegReadGPRIDX pseudos");
1287 }
1288
1289 if (VecSize <= 32) // 4 bytes
1290 return get(Opcode: AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V1);
1291 if (VecSize <= 64) // 8 bytes
1292 return get(Opcode: AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V2);
1293 if (VecSize <= 96) // 12 bytes
1294 return get(Opcode: AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V3);
1295 if (VecSize <= 128) // 16 bytes
1296 return get(Opcode: AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V4);
1297 if (VecSize <= 160) // 20 bytes
1298 return get(Opcode: AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V5);
1299 if (VecSize <= 192) // 24 bytes
1300 return get(Opcode: AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V6);
1301 if (VecSize <= 224) // 28 bytes
1302 return get(Opcode: AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V7);
1303 if (VecSize <= 256) // 32 bytes
1304 return get(Opcode: AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V8);
1305 if (VecSize <= 288) // 36 bytes
1306 return get(Opcode: AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V9);
1307 if (VecSize <= 320) // 40 bytes
1308 return get(Opcode: AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V10);
1309 if (VecSize <= 352) // 44 bytes
1310 return get(Opcode: AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V11);
1311 if (VecSize <= 384) // 48 bytes
1312 return get(Opcode: AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V12);
1313 if (VecSize <= 512) // 64 bytes
1314 return get(Opcode: AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V16);
1315 if (VecSize <= 1024) // 128 bytes
1316 return get(Opcode: AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V32);
1317
1318 llvm_unreachable("unsupported size for IndirectRegWriteGPRIDX pseudos");
1319}
1320
1321static unsigned getIndirectVGPRWriteMovRelPseudoOpc(unsigned VecSize) {
1322 if (VecSize <= 32) // 4 bytes
1323 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V1;
1324 if (VecSize <= 64) // 8 bytes
1325 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V2;
1326 if (VecSize <= 96) // 12 bytes
1327 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V3;
1328 if (VecSize <= 128) // 16 bytes
1329 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V4;
1330 if (VecSize <= 160) // 20 bytes
1331 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V5;
1332 if (VecSize <= 192) // 24 bytes
1333 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V6;
1334 if (VecSize <= 224) // 28 bytes
1335 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V7;
1336 if (VecSize <= 256) // 32 bytes
1337 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V8;
1338 if (VecSize <= 288) // 36 bytes
1339 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V9;
1340 if (VecSize <= 320) // 40 bytes
1341 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V10;
1342 if (VecSize <= 352) // 44 bytes
1343 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V11;
1344 if (VecSize <= 384) // 48 bytes
1345 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V12;
1346 if (VecSize <= 512) // 64 bytes
1347 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V16;
1348 if (VecSize <= 1024) // 128 bytes
1349 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V32;
1350
1351 llvm_unreachable("unsupported size for IndirectRegWrite pseudos");
1352}
1353
1354static unsigned getIndirectSGPRWriteMovRelPseudo32(unsigned VecSize) {
1355 if (VecSize <= 32) // 4 bytes
1356 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V1;
1357 if (VecSize <= 64) // 8 bytes
1358 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V2;
1359 if (VecSize <= 96) // 12 bytes
1360 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V3;
1361 if (VecSize <= 128) // 16 bytes
1362 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V4;
1363 if (VecSize <= 160) // 20 bytes
1364 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V5;
1365 if (VecSize <= 192) // 24 bytes
1366 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V6;
1367 if (VecSize <= 224) // 28 bytes
1368 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V7;
1369 if (VecSize <= 256) // 32 bytes
1370 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V8;
1371 if (VecSize <= 288) // 36 bytes
1372 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V9;
1373 if (VecSize <= 320) // 40 bytes
1374 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V10;
1375 if (VecSize <= 352) // 44 bytes
1376 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V11;
1377 if (VecSize <= 384) // 48 bytes
1378 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V12;
1379 if (VecSize <= 512) // 64 bytes
1380 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V16;
1381 if (VecSize <= 1024) // 128 bytes
1382 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V32;
1383
1384 llvm_unreachable("unsupported size for IndirectRegWrite pseudos");
1385}
1386
1387static unsigned getIndirectSGPRWriteMovRelPseudo64(unsigned VecSize) {
1388 if (VecSize <= 64) // 8 bytes
1389 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V1;
1390 if (VecSize <= 128) // 16 bytes
1391 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V2;
1392 if (VecSize <= 256) // 32 bytes
1393 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V4;
1394 if (VecSize <= 512) // 64 bytes
1395 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V8;
1396 if (VecSize <= 1024) // 128 bytes
1397 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V16;
1398
1399 llvm_unreachable("unsupported size for IndirectRegWrite pseudos");
1400}
1401
1402const MCInstrDesc &
1403SIInstrInfo::getIndirectRegWriteMovRelPseudo(unsigned VecSize, unsigned EltSize,
1404 bool IsSGPR) const {
1405 if (IsSGPR) {
1406 switch (EltSize) {
1407 case 32:
1408 return get(Opcode: getIndirectSGPRWriteMovRelPseudo32(VecSize));
1409 case 64:
1410 return get(Opcode: getIndirectSGPRWriteMovRelPseudo64(VecSize));
1411 default:
1412 llvm_unreachable("invalid reg indexing elt size");
1413 }
1414 }
1415
1416 assert(EltSize == 32 && "invalid reg indexing elt size");
1417 return get(Opcode: getIndirectVGPRWriteMovRelPseudoOpc(VecSize));
1418}
1419
1420static unsigned getSGPRSpillSaveOpcode(unsigned Size, bool NeedsCFI) {
1421 switch (Size) {
1422 case 4:
1423 return NeedsCFI ? AMDGPU::SI_SPILL_S32_CFI_SAVE : AMDGPU::SI_SPILL_S32_SAVE;
1424 case 8:
1425 return NeedsCFI ? AMDGPU::SI_SPILL_S64_CFI_SAVE : AMDGPU::SI_SPILL_S64_SAVE;
1426 case 12:
1427 return NeedsCFI ? AMDGPU::SI_SPILL_S96_CFI_SAVE : AMDGPU::SI_SPILL_S96_SAVE;
1428 case 16:
1429 return NeedsCFI ? AMDGPU::SI_SPILL_S128_CFI_SAVE
1430 : AMDGPU::SI_SPILL_S128_SAVE;
1431 case 20:
1432 return NeedsCFI ? AMDGPU::SI_SPILL_S160_CFI_SAVE
1433 : AMDGPU::SI_SPILL_S160_SAVE;
1434 case 24:
1435 return NeedsCFI ? AMDGPU::SI_SPILL_S192_CFI_SAVE
1436 : AMDGPU::SI_SPILL_S192_SAVE;
1437 case 28:
1438 return NeedsCFI ? AMDGPU::SI_SPILL_S224_CFI_SAVE
1439 : AMDGPU::SI_SPILL_S224_SAVE;
1440 case 32:
1441 return AMDGPU::SI_SPILL_S256_SAVE;
1442 case 36:
1443 return AMDGPU::SI_SPILL_S288_SAVE;
1444 case 40:
1445 return AMDGPU::SI_SPILL_S320_SAVE;
1446 case 44:
1447 return AMDGPU::SI_SPILL_S352_SAVE;
1448 case 48:
1449 return AMDGPU::SI_SPILL_S384_SAVE;
1450 case 64:
1451 return NeedsCFI ? AMDGPU::SI_SPILL_S512_CFI_SAVE
1452 : AMDGPU::SI_SPILL_S512_SAVE;
1453 case 128:
1454 return NeedsCFI ? AMDGPU::SI_SPILL_S1024_CFI_SAVE
1455 : AMDGPU::SI_SPILL_S1024_SAVE;
1456 default:
1457 llvm_unreachable("unknown register size");
1458 }
1459}
1460
1461static unsigned getVGPRSpillSaveOpcode(unsigned Size, bool NeedsCFI) {
1462 switch (Size) {
1463 case 2:
1464 return AMDGPU::SI_SPILL_V16_SAVE;
1465 case 4:
1466 return NeedsCFI ? AMDGPU::SI_SPILL_V32_CFI_SAVE : AMDGPU::SI_SPILL_V32_SAVE;
1467 case 8:
1468 return NeedsCFI ? AMDGPU::SI_SPILL_V64_CFI_SAVE : AMDGPU::SI_SPILL_V64_SAVE;
1469 case 12:
1470 return NeedsCFI ? AMDGPU::SI_SPILL_V96_CFI_SAVE : AMDGPU::SI_SPILL_V96_SAVE;
1471 case 16:
1472 return NeedsCFI ? AMDGPU::SI_SPILL_V128_CFI_SAVE
1473 : AMDGPU::SI_SPILL_V128_SAVE;
1474 case 20:
1475 return NeedsCFI ? AMDGPU::SI_SPILL_V160_CFI_SAVE
1476 : AMDGPU::SI_SPILL_V160_SAVE;
1477 case 24:
1478 return NeedsCFI ? AMDGPU::SI_SPILL_V192_CFI_SAVE
1479 : AMDGPU::SI_SPILL_V192_SAVE;
1480 case 28:
1481 return NeedsCFI ? AMDGPU::SI_SPILL_V224_CFI_SAVE
1482 : AMDGPU::SI_SPILL_V224_SAVE;
1483 case 32:
1484 return NeedsCFI ? AMDGPU::SI_SPILL_V256_CFI_SAVE
1485 : AMDGPU::SI_SPILL_V256_SAVE;
1486 case 36:
1487 return NeedsCFI ? AMDGPU::SI_SPILL_V288_CFI_SAVE
1488 : AMDGPU::SI_SPILL_V288_SAVE;
1489 case 40:
1490 return NeedsCFI ? AMDGPU::SI_SPILL_V320_CFI_SAVE
1491 : AMDGPU::SI_SPILL_V320_SAVE;
1492 case 44:
1493 return NeedsCFI ? AMDGPU::SI_SPILL_V352_CFI_SAVE
1494 : AMDGPU::SI_SPILL_V352_SAVE;
1495 case 48:
1496 return NeedsCFI ? AMDGPU::SI_SPILL_V384_CFI_SAVE
1497 : AMDGPU::SI_SPILL_V384_SAVE;
1498 case 64:
1499 return NeedsCFI ? AMDGPU::SI_SPILL_V512_CFI_SAVE
1500 : AMDGPU::SI_SPILL_V512_SAVE;
1501 case 128:
1502 return NeedsCFI ? AMDGPU::SI_SPILL_V1024_CFI_SAVE
1503 : AMDGPU::SI_SPILL_V1024_SAVE;
1504 default:
1505 llvm_unreachable("unknown register size");
1506 }
1507}
1508
1509static unsigned getAVSpillSaveOpcode(unsigned Size, bool NeedsCFI) {
1510 switch (Size) {
1511 case 4:
1512 return NeedsCFI ? AMDGPU::SI_SPILL_AV32_CFI_SAVE
1513 : AMDGPU::SI_SPILL_AV32_SAVE;
1514 case 8:
1515 return NeedsCFI ? AMDGPU::SI_SPILL_AV64_CFI_SAVE
1516 : AMDGPU::SI_SPILL_AV64_SAVE;
1517 case 12:
1518 return NeedsCFI ? AMDGPU::SI_SPILL_AV96_CFI_SAVE
1519 : AMDGPU::SI_SPILL_AV96_SAVE;
1520 case 16:
1521 return NeedsCFI ? AMDGPU::SI_SPILL_AV128_CFI_SAVE
1522 : AMDGPU::SI_SPILL_AV128_SAVE;
1523 case 20:
1524 return NeedsCFI ? AMDGPU::SI_SPILL_AV160_CFI_SAVE
1525 : AMDGPU::SI_SPILL_AV160_SAVE;
1526 case 24:
1527 return NeedsCFI ? AMDGPU::SI_SPILL_AV192_CFI_SAVE
1528 : AMDGPU::SI_SPILL_AV192_SAVE;
1529 case 28:
1530 return NeedsCFI ? AMDGPU::SI_SPILL_AV224_CFI_SAVE
1531 : AMDGPU::SI_SPILL_AV224_SAVE;
1532 case 32:
1533 return NeedsCFI ? AMDGPU::SI_SPILL_AV256_CFI_SAVE
1534 : AMDGPU::SI_SPILL_AV256_SAVE;
1535 case 36:
1536 return AMDGPU::SI_SPILL_AV288_SAVE;
1537 case 40:
1538 return AMDGPU::SI_SPILL_AV320_SAVE;
1539 case 44:
1540 return AMDGPU::SI_SPILL_AV352_SAVE;
1541 case 48:
1542 return AMDGPU::SI_SPILL_AV384_SAVE;
1543 case 64:
1544 return NeedsCFI ? AMDGPU::SI_SPILL_AV512_CFI_SAVE
1545 : AMDGPU::SI_SPILL_AV512_SAVE;
1546 case 128:
1547 return NeedsCFI ? AMDGPU::SI_SPILL_AV1024_CFI_SAVE
1548 : AMDGPU::SI_SPILL_AV1024_SAVE;
1549 default:
1550 llvm_unreachable("unknown register size");
1551 }
1552}
1553
1554static unsigned getWWMRegSpillSaveOpcode(unsigned Size,
1555 bool IsVectorSuperClass) {
1556 // Currently, there is only 32-bit WWM register spills needed.
1557 if (Size != 4)
1558 llvm_unreachable("unknown wwm register spill size");
1559
1560 if (IsVectorSuperClass)
1561 return AMDGPU::SI_SPILL_WWM_AV32_SAVE;
1562
1563 return AMDGPU::SI_SPILL_WWM_V32_SAVE;
1564}
1565
1566unsigned SIInstrInfo::getVectorRegSpillSaveOpcode(
1567 Register Reg, const TargetRegisterClass *RC, unsigned Size,
1568 const SIMachineFunctionInfo &MFI, bool NeedsCFI) const {
1569 bool IsVectorSuperClass = RI.isVectorSuperClass(RC);
1570
1571 // Choose the right opcode if spilling a WWM register.
1572 if (MFI.checkFlag(Reg, Flag: AMDGPU::VirtRegFlag::WWM_REG))
1573 return getWWMRegSpillSaveOpcode(Size, IsVectorSuperClass);
1574
1575 // TODO: Check if AGPRs are available
1576 if (ST.hasMAIInsts())
1577 return getAVSpillSaveOpcode(Size, NeedsCFI);
1578
1579 return getVGPRSpillSaveOpcode(Size, NeedsCFI);
1580}
1581
1582void SIInstrInfo::storeRegToStackSlotImpl(
1583 MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, Register SrcReg,
1584 bool isKill, int FrameIndex, const TargetRegisterClass *RC, Register VReg,
1585 MachineInstr::MIFlag Flags, bool NeedsCFI) const {
1586 MachineFunction *MF = MBB.getParent();
1587 SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
1588 MachineFrameInfo &FrameInfo = MF->getFrameInfo();
1589 const DebugLoc &DL = MBB.findDebugLoc(MBBI: MI);
1590
1591 MachinePointerInfo PtrInfo
1592 = MachinePointerInfo::getFixedStack(MF&: *MF, FI: FrameIndex);
1593 MachineMemOperand *MMO = MF->getMachineMemOperand(
1594 PtrInfo, F: MachineMemOperand::MOStore, Size: FrameInfo.getObjectSize(ObjectIdx: FrameIndex),
1595 BaseAlignment: FrameInfo.getObjectAlign(ObjectIdx: FrameIndex));
1596 unsigned SpillSize = RI.getSpillSize(RC: *RC);
1597
1598 MachineRegisterInfo &MRI = MF->getRegInfo();
1599 if (RI.isSGPRClass(RC)) {
1600 if (FrameInfo.getStackID(ObjectIdx: FrameIndex) == TargetStackID::SGPRSpill)
1601 MFI->setHasSpilledSGPRs();
1602 assert(SrcReg != AMDGPU::M0 && "m0 should not be spilled");
1603 assert(SrcReg != AMDGPU::EXEC_LO && SrcReg != AMDGPU::EXEC_HI &&
1604 SrcReg != AMDGPU::EXEC && "exec should not be spilled");
1605
1606 // We are only allowed to create one new instruction when spilling
1607 // registers, so we need to use pseudo instruction for spilling SGPRs.
1608 const MCInstrDesc &OpDesc =
1609 get(Opcode: getSGPRSpillSaveOpcode(Size: SpillSize, NeedsCFI));
1610
1611 // The SGPR spill/restore instructions only work on number sgprs, so we need
1612 // to make sure we are using the correct register class.
1613 if (SrcReg.isVirtual() && SpillSize == 4) {
1614 MRI.constrainRegClass(Reg: SrcReg, RC: &AMDGPU::SReg_32_XM0_XEXECRegClass);
1615 }
1616
1617 BuildMI(BB&: MBB, I: MI, MIMD: DL, MCID: OpDesc)
1618 .addReg(RegNo: SrcReg, Flags: getKillRegState(B: isKill)) // data
1619 .addFrameIndex(Idx: FrameIndex) // addr
1620 .addMemOperand(MMO)
1621 .addReg(RegNo: MFI->getStackPtrOffsetReg(), Flags: RegState::Implicit);
1622
1623 return;
1624 }
1625
1626 unsigned Opcode = getVectorRegSpillSaveOpcode(Reg: VReg ? VReg : SrcReg, RC,
1627 Size: SpillSize, MFI: *MFI, NeedsCFI);
1628 MFI->setHasSpilledVGPRs();
1629
1630 BuildMI(BB&: MBB, I: MI, MIMD: DL, MCID: get(Opcode))
1631 .addReg(RegNo: SrcReg, Flags: getKillRegState(B: isKill)) // data
1632 .addFrameIndex(Idx: FrameIndex) // addr
1633 .addReg(RegNo: MFI->getStackPtrOffsetReg()) // scratch_offset
1634 .addImm(Val: 0) // offset
1635 .addMemOperand(MMO);
1636}
1637
1638void SIInstrInfo::storeRegToStackSlot(
1639 MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, Register SrcReg,
1640 bool isKill, int FrameIndex, const TargetRegisterClass *RC, Register VReg,
1641 MachineInstr::MIFlag Flags) const {
1642 storeRegToStackSlotImpl(MBB, MI, SrcReg, isKill, FrameIndex, RC, VReg, Flags,
1643 NeedsCFI: false);
1644}
1645
1646void SIInstrInfo::storeRegToStackSlotCFI(MachineBasicBlock &MBB,
1647 MachineBasicBlock::iterator MI,
1648 Register SrcReg, bool isKill,
1649 int FrameIndex,
1650 const TargetRegisterClass *RC) const {
1651 storeRegToStackSlotImpl(MBB, MI, SrcReg, isKill, FrameIndex, RC, VReg: Register(),
1652 Flags: MachineInstr::NoFlags, NeedsCFI: true);
1653}
1654
1655static unsigned getSGPRSpillRestoreOpcode(unsigned Size) {
1656 switch (Size) {
1657 case 4:
1658 return AMDGPU::SI_SPILL_S32_RESTORE;
1659 case 8:
1660 return AMDGPU::SI_SPILL_S64_RESTORE;
1661 case 12:
1662 return AMDGPU::SI_SPILL_S96_RESTORE;
1663 case 16:
1664 return AMDGPU::SI_SPILL_S128_RESTORE;
1665 case 20:
1666 return AMDGPU::SI_SPILL_S160_RESTORE;
1667 case 24:
1668 return AMDGPU::SI_SPILL_S192_RESTORE;
1669 case 28:
1670 return AMDGPU::SI_SPILL_S224_RESTORE;
1671 case 32:
1672 return AMDGPU::SI_SPILL_S256_RESTORE;
1673 case 36:
1674 return AMDGPU::SI_SPILL_S288_RESTORE;
1675 case 40:
1676 return AMDGPU::SI_SPILL_S320_RESTORE;
1677 case 44:
1678 return AMDGPU::SI_SPILL_S352_RESTORE;
1679 case 48:
1680 return AMDGPU::SI_SPILL_S384_RESTORE;
1681 case 64:
1682 return AMDGPU::SI_SPILL_S512_RESTORE;
1683 case 128:
1684 return AMDGPU::SI_SPILL_S1024_RESTORE;
1685 default:
1686 llvm_unreachable("unknown register size");
1687 }
1688}
1689
1690static unsigned getVGPRSpillRestoreOpcode(unsigned Size) {
1691 switch (Size) {
1692 case 2:
1693 return AMDGPU::SI_SPILL_V16_RESTORE;
1694 case 4:
1695 return AMDGPU::SI_SPILL_V32_RESTORE;
1696 case 8:
1697 return AMDGPU::SI_SPILL_V64_RESTORE;
1698 case 12:
1699 return AMDGPU::SI_SPILL_V96_RESTORE;
1700 case 16:
1701 return AMDGPU::SI_SPILL_V128_RESTORE;
1702 case 20:
1703 return AMDGPU::SI_SPILL_V160_RESTORE;
1704 case 24:
1705 return AMDGPU::SI_SPILL_V192_RESTORE;
1706 case 28:
1707 return AMDGPU::SI_SPILL_V224_RESTORE;
1708 case 32:
1709 return AMDGPU::SI_SPILL_V256_RESTORE;
1710 case 36:
1711 return AMDGPU::SI_SPILL_V288_RESTORE;
1712 case 40:
1713 return AMDGPU::SI_SPILL_V320_RESTORE;
1714 case 44:
1715 return AMDGPU::SI_SPILL_V352_RESTORE;
1716 case 48:
1717 return AMDGPU::SI_SPILL_V384_RESTORE;
1718 case 64:
1719 return AMDGPU::SI_SPILL_V512_RESTORE;
1720 case 128:
1721 return AMDGPU::SI_SPILL_V1024_RESTORE;
1722 default:
1723 llvm_unreachable("unknown register size");
1724 }
1725}
1726
1727static unsigned getAVSpillRestoreOpcode(unsigned Size) {
1728 switch (Size) {
1729 case 4:
1730 return AMDGPU::SI_SPILL_AV32_RESTORE;
1731 case 8:
1732 return AMDGPU::SI_SPILL_AV64_RESTORE;
1733 case 12:
1734 return AMDGPU::SI_SPILL_AV96_RESTORE;
1735 case 16:
1736 return AMDGPU::SI_SPILL_AV128_RESTORE;
1737 case 20:
1738 return AMDGPU::SI_SPILL_AV160_RESTORE;
1739 case 24:
1740 return AMDGPU::SI_SPILL_AV192_RESTORE;
1741 case 28:
1742 return AMDGPU::SI_SPILL_AV224_RESTORE;
1743 case 32:
1744 return AMDGPU::SI_SPILL_AV256_RESTORE;
1745 case 36:
1746 return AMDGPU::SI_SPILL_AV288_RESTORE;
1747 case 40:
1748 return AMDGPU::SI_SPILL_AV320_RESTORE;
1749 case 44:
1750 return AMDGPU::SI_SPILL_AV352_RESTORE;
1751 case 48:
1752 return AMDGPU::SI_SPILL_AV384_RESTORE;
1753 case 64:
1754 return AMDGPU::SI_SPILL_AV512_RESTORE;
1755 case 128:
1756 return AMDGPU::SI_SPILL_AV1024_RESTORE;
1757 default:
1758 llvm_unreachable("unknown register size");
1759 }
1760}
1761
1762static unsigned getWWMRegSpillRestoreOpcode(unsigned Size,
1763 bool IsVectorSuperClass) {
1764 // Currently, there is only 32-bit WWM register spills needed.
1765 if (Size != 4)
1766 llvm_unreachable("unknown wwm register spill size");
1767
1768 if (IsVectorSuperClass) // TODO: Always use this if there are AGPRs
1769 return AMDGPU::SI_SPILL_WWM_AV32_RESTORE;
1770
1771 return AMDGPU::SI_SPILL_WWM_V32_RESTORE;
1772}
1773
1774unsigned SIInstrInfo::getVectorRegSpillRestoreOpcode(
1775 Register Reg, const TargetRegisterClass *RC, unsigned Size,
1776 const SIMachineFunctionInfo &MFI) const {
1777 bool IsVectorSuperClass = RI.isVectorSuperClass(RC);
1778
1779 // Choose the right opcode if restoring a WWM register.
1780 if (MFI.checkFlag(Reg, Flag: AMDGPU::VirtRegFlag::WWM_REG))
1781 return getWWMRegSpillRestoreOpcode(Size, IsVectorSuperClass);
1782
1783 // TODO: Check if AGPRs are available
1784 if (ST.hasMAIInsts())
1785 return getAVSpillRestoreOpcode(Size);
1786
1787 assert(!RI.isAGPRClass(RC));
1788 return getVGPRSpillRestoreOpcode(Size);
1789}
1790
1791void SIInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB,
1792 MachineBasicBlock::iterator MI,
1793 Register DestReg, int FrameIndex,
1794 const TargetRegisterClass *RC,
1795 Register VReg, unsigned SubReg,
1796 MachineInstr::MIFlag Flags) const {
1797 MachineFunction *MF = MBB.getParent();
1798 SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
1799 MachineFrameInfo &FrameInfo = MF->getFrameInfo();
1800 const DebugLoc &DL = MBB.findDebugLoc(MBBI: MI);
1801 unsigned SpillSize = RI.getSpillSize(RC: *RC);
1802
1803 MachinePointerInfo PtrInfo
1804 = MachinePointerInfo::getFixedStack(MF&: *MF, FI: FrameIndex);
1805
1806 MachineMemOperand *MMO = MF->getMachineMemOperand(
1807 PtrInfo, F: MachineMemOperand::MOLoad, Size: FrameInfo.getObjectSize(ObjectIdx: FrameIndex),
1808 BaseAlignment: FrameInfo.getObjectAlign(ObjectIdx: FrameIndex));
1809
1810 if (RI.isSGPRClass(RC)) {
1811 if (FrameInfo.getStackID(ObjectIdx: FrameIndex) == TargetStackID::SGPRSpill)
1812 MFI->setHasSpilledSGPRs();
1813 assert(DestReg != AMDGPU::M0 && "m0 should not be reloaded into");
1814 assert(DestReg != AMDGPU::EXEC_LO && DestReg != AMDGPU::EXEC_HI &&
1815 DestReg != AMDGPU::EXEC && "exec should not be spilled");
1816
1817 // FIXME: Maybe this should not include a memoperand because it will be
1818 // lowered to non-memory instructions.
1819 const MCInstrDesc &OpDesc = get(Opcode: getSGPRSpillRestoreOpcode(Size: SpillSize));
1820 if (DestReg.isVirtual() && SpillSize == 4) {
1821 MachineRegisterInfo &MRI = MF->getRegInfo();
1822 MRI.constrainRegClass(Reg: DestReg, RC: &AMDGPU::SReg_32_XM0_XEXECRegClass);
1823 }
1824
1825 BuildMI(BB&: MBB, I: MI, MIMD: DL, MCID: OpDesc, DestReg)
1826 .addFrameIndex(Idx: FrameIndex) // addr
1827 .addMemOperand(MMO)
1828 .addReg(RegNo: MFI->getStackPtrOffsetReg(), Flags: RegState::Implicit);
1829
1830 return;
1831 }
1832
1833 unsigned Opcode = getVectorRegSpillRestoreOpcode(Reg: VReg ? VReg : DestReg, RC,
1834 Size: SpillSize, MFI: *MFI);
1835 BuildMI(BB&: MBB, I: MI, MIMD: DL, MCID: get(Opcode), DestReg)
1836 .addFrameIndex(Idx: FrameIndex) // vaddr
1837 .addReg(RegNo: MFI->getStackPtrOffsetReg()) // scratch_offset
1838 .addImm(Val: 0) // offset
1839 .addMemOperand(MMO);
1840}
1841
1842void SIInstrInfo::insertNoop(MachineBasicBlock &MBB,
1843 MachineBasicBlock::iterator MI) const {
1844 insertNoops(MBB, MI, Quantity: 1);
1845}
1846
1847void SIInstrInfo::insertNoops(MachineBasicBlock &MBB,
1848 MachineBasicBlock::iterator MI,
1849 unsigned Quantity) const {
1850 DebugLoc DL = MBB.findDebugLoc(MBBI: MI);
1851 unsigned MaxSNopCount = 1u << ST.getSNopBits();
1852 while (Quantity > 0) {
1853 unsigned Arg = std::min(a: Quantity, b: MaxSNopCount);
1854 Quantity -= Arg;
1855 BuildMI(BB&: MBB, I: MI, MIMD: DL, MCID: get(Opcode: AMDGPU::S_NOP)).addImm(Val: Arg - 1);
1856 }
1857}
1858
1859MachineBasicBlock *SIInstrInfo::insertSimulatedTrap(MachineRegisterInfo &MRI,
1860 MachineBasicBlock &MBB,
1861 MachineInstr &MI,
1862 const DebugLoc &DL) const {
1863 MachineFunction *MF = MBB.getParent();
1864 constexpr unsigned DoorbellIDMask = 0x3ff;
1865 constexpr unsigned ECQueueWaveAbort = 0x400;
1866
1867 MachineBasicBlock *TrapBB = &MBB;
1868 MachineBasicBlock *HaltLoopBB = MF->CreateMachineBasicBlock();
1869
1870 if (!MBB.succ_empty() || std::next(x: MI.getIterator()) != MBB.end()) {
1871 MBB.splitAt(SplitInst&: MI, /*UpdateLiveIns=*/false);
1872 TrapBB = MF->CreateMachineBasicBlock();
1873 BuildMI(BB&: MBB, I&: MI, MIMD: DL, MCID: get(Opcode: AMDGPU::S_CBRANCH_EXECNZ)).addMBB(MBB: TrapBB);
1874 MF->push_back(MBB: TrapBB);
1875 MBB.addSuccessor(Succ: TrapBB);
1876 }
1877 // Start with a `s_trap 2`, if we're in PRIV=1 and we need the workaround this
1878 // will be a nop.
1879 BuildMI(BB&: *TrapBB, I: TrapBB->end(), MIMD: DL, MCID: get(Opcode: AMDGPU::S_TRAP))
1880 .addImm(Val: static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap));
1881 Register DoorbellReg = MRI.createVirtualRegister(RegClass: &AMDGPU::SReg_32RegClass);
1882 BuildMI(BB&: *TrapBB, I: TrapBB->end(), MIMD: DL, MCID: get(Opcode: AMDGPU::S_SENDMSG_RTN_B32),
1883 DestReg: DoorbellReg)
1884 .addImm(Val: AMDGPU::SendMsg::ID_RTN_GET_DOORBELL);
1885 BuildMI(BB&: *TrapBB, I: TrapBB->end(), MIMD: DL, MCID: get(Opcode: AMDGPU::S_MOV_B32), DestReg: AMDGPU::TTMP2)
1886 .addUse(RegNo: AMDGPU::M0);
1887 Register DoorbellRegMasked =
1888 MRI.createVirtualRegister(RegClass: &AMDGPU::SReg_32RegClass);
1889 BuildMI(BB&: *TrapBB, I: TrapBB->end(), MIMD: DL, MCID: get(Opcode: AMDGPU::S_AND_B32), DestReg: DoorbellRegMasked)
1890 .addUse(RegNo: DoorbellReg)
1891 .addImm(Val: DoorbellIDMask);
1892 Register SetWaveAbortBit =
1893 MRI.createVirtualRegister(RegClass: &AMDGPU::SReg_32RegClass);
1894 BuildMI(BB&: *TrapBB, I: TrapBB->end(), MIMD: DL, MCID: get(Opcode: AMDGPU::S_OR_B32), DestReg: SetWaveAbortBit)
1895 .addUse(RegNo: DoorbellRegMasked)
1896 .addImm(Val: ECQueueWaveAbort);
1897 BuildMI(BB&: *TrapBB, I: TrapBB->end(), MIMD: DL, MCID: get(Opcode: AMDGPU::S_MOV_B32), DestReg: AMDGPU::M0)
1898 .addUse(RegNo: SetWaveAbortBit);
1899 BuildMI(BB&: *TrapBB, I: TrapBB->end(), MIMD: DL, MCID: get(Opcode: AMDGPU::S_SENDMSG))
1900 .addImm(Val: AMDGPU::SendMsg::ID_INTERRUPT);
1901 BuildMI(BB&: *TrapBB, I: TrapBB->end(), MIMD: DL, MCID: get(Opcode: AMDGPU::S_MOV_B32), DestReg: AMDGPU::M0)
1902 .addUse(RegNo: AMDGPU::TTMP2);
1903 BuildMI(BB&: *TrapBB, I: TrapBB->end(), MIMD: DL, MCID: get(Opcode: AMDGPU::S_BRANCH)).addMBB(MBB: HaltLoopBB);
1904 TrapBB->addSuccessor(Succ: HaltLoopBB);
1905
1906 BuildMI(BB&: *HaltLoopBB, I: HaltLoopBB->end(), MIMD: DL, MCID: get(Opcode: AMDGPU::S_SETHALT)).addImm(Val: 5);
1907 BuildMI(BB&: *HaltLoopBB, I: HaltLoopBB->end(), MIMD: DL, MCID: get(Opcode: AMDGPU::S_BRANCH))
1908 .addMBB(MBB: HaltLoopBB);
1909 MF->push_back(MBB: HaltLoopBB);
1910 HaltLoopBB->addSuccessor(Succ: HaltLoopBB);
1911
1912 return MBB.getNextNode();
1913}
1914
1915unsigned SIInstrInfo::getNumWaitStates(const MachineInstr &MI) {
1916 switch (MI.getOpcode()) {
1917 default:
1918 if (MI.isMetaInstruction())
1919 return 0;
1920 return 1; // FIXME: Do wait states equal cycles?
1921
1922 case AMDGPU::S_NOP:
1923 return MI.getOperand(i: 0).getImm() + 1;
1924 // SI_RETURN_TO_EPILOG is a fallthrough to code outside of the function. The
1925 // hazard, even if one exist, won't really be visible. Should we handle it?
1926 }
1927}
1928
1929bool SIInstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
1930 MachineBasicBlock &MBB = *MI.getParent();
1931 DebugLoc DL = MBB.findDebugLoc(MBBI: MI);
1932 const AMDGPU::LaneMaskConstants &LMC = AMDGPU::LaneMaskConstants::get(ST);
1933 switch (MI.getOpcode()) {
1934 default: return TargetInstrInfo::expandPostRAPseudo(MI);
1935 case AMDGPU::S_MOV_B64_term:
1936 // This is only a terminator to get the correct spill code placement during
1937 // register allocation.
1938 MI.setDesc(get(Opcode: AMDGPU::S_MOV_B64));
1939 break;
1940
1941 case AMDGPU::S_MOV_B32_term:
1942 // This is only a terminator to get the correct spill code placement during
1943 // register allocation.
1944 MI.setDesc(get(Opcode: AMDGPU::S_MOV_B32));
1945 break;
1946
1947 case AMDGPU::S_XOR_B64_term:
1948 // This is only a terminator to get the correct spill code placement during
1949 // register allocation.
1950 MI.setDesc(get(Opcode: AMDGPU::S_XOR_B64));
1951 break;
1952
1953 case AMDGPU::S_XOR_B32_term:
1954 // This is only a terminator to get the correct spill code placement during
1955 // register allocation.
1956 MI.setDesc(get(Opcode: AMDGPU::S_XOR_B32));
1957 break;
1958 case AMDGPU::S_OR_B64_term:
1959 // This is only a terminator to get the correct spill code placement during
1960 // register allocation.
1961 MI.setDesc(get(Opcode: AMDGPU::S_OR_B64));
1962 break;
1963 case AMDGPU::S_OR_B32_term:
1964 // This is only a terminator to get the correct spill code placement during
1965 // register allocation.
1966 MI.setDesc(get(Opcode: AMDGPU::S_OR_B32));
1967 break;
1968
1969 case AMDGPU::S_ANDN2_B64_term:
1970 // This is only a terminator to get the correct spill code placement during
1971 // register allocation.
1972 MI.setDesc(get(Opcode: AMDGPU::S_ANDN2_B64));
1973 break;
1974
1975 case AMDGPU::S_ANDN2_B32_term:
1976 // This is only a terminator to get the correct spill code placement during
1977 // register allocation.
1978 MI.setDesc(get(Opcode: AMDGPU::S_ANDN2_B32));
1979 break;
1980
1981 case AMDGPU::S_AND_B64_term:
1982 // This is only a terminator to get the correct spill code placement during
1983 // register allocation.
1984 MI.setDesc(get(Opcode: AMDGPU::S_AND_B64));
1985 break;
1986
1987 case AMDGPU::S_AND_B32_term:
1988 // This is only a terminator to get the correct spill code placement during
1989 // register allocation.
1990 MI.setDesc(get(Opcode: AMDGPU::S_AND_B32));
1991 break;
1992
1993 case AMDGPU::S_AND_SAVEEXEC_B64_term:
1994 // This is only a terminator to get the correct spill code placement during
1995 // register allocation.
1996 MI.setDesc(get(Opcode: AMDGPU::S_AND_SAVEEXEC_B64));
1997 break;
1998
1999 case AMDGPU::S_AND_SAVEEXEC_B32_term:
2000 // This is only a terminator to get the correct spill code placement during
2001 // register allocation.
2002 MI.setDesc(get(Opcode: AMDGPU::S_AND_SAVEEXEC_B32));
2003 break;
2004
2005 case AMDGPU::V_CMPX_EQ_U32_nosdst_e32_term:
2006 MI.setDesc(get(Opcode: AMDGPU::V_CMPX_EQ_U32_nosdst_e32));
2007 break;
2008 case AMDGPU::V_CMPX_EQ_U64_nosdst_e32_term:
2009 MI.setDesc(get(Opcode: AMDGPU::V_CMPX_EQ_U64_nosdst_e32));
2010 break;
2011
2012 case AMDGPU::SI_SPILL_S32_TO_VGPR:
2013 MI.setDesc(get(Opcode: AMDGPU::V_WRITELANE_B32));
2014 break;
2015
2016 case AMDGPU::SI_RESTORE_S32_FROM_VGPR:
2017 MI.setDesc(get(Opcode: AMDGPU::V_READLANE_B32));
2018 break;
2019 case AMDGPU::AV_MOV_B32_IMM_PSEUDO: {
2020 Register Dst = MI.getOperand(i: 0).getReg();
2021 bool IsAGPR = SIRegisterInfo::isAGPRClass(RC: RI.getPhysRegBaseClass(Reg: Dst));
2022 MI.setDesc(
2023 get(Opcode: IsAGPR ? AMDGPU::V_ACCVGPR_WRITE_B32_e64 : AMDGPU::V_MOV_B32_e32));
2024 break;
2025 }
2026 case AMDGPU::AV_MOV_B64_IMM_PSEUDO: {
2027 Register Dst = MI.getOperand(i: 0).getReg();
2028 if (SIRegisterInfo::isAGPRClass(RC: RI.getPhysRegBaseClass(Reg: Dst))) {
2029 int64_t Imm = MI.getOperand(i: 1).getImm();
2030
2031 Register DstLo = RI.getSubReg(Reg: Dst, Idx: AMDGPU::sub0);
2032 Register DstHi = RI.getSubReg(Reg: Dst, Idx: AMDGPU::sub1);
2033 BuildMI(BB&: MBB, I&: MI, MIMD: DL, MCID: get(Opcode: AMDGPU::V_ACCVGPR_WRITE_B32_e64), DestReg: DstLo)
2034 .addImm(Val: SignExtend64<32>(x: Imm));
2035 BuildMI(BB&: MBB, I&: MI, MIMD: DL, MCID: get(Opcode: AMDGPU::V_ACCVGPR_WRITE_B32_e64), DestReg: DstHi)
2036 .addImm(Val: SignExtend64<32>(x: Imm >> 32));
2037 MI.eraseFromParent();
2038 break;
2039 }
2040
2041 [[fallthrough]];
2042 }
2043 case AMDGPU::V_MOV_B64_PSEUDO: {
2044 Register Dst = MI.getOperand(i: 0).getReg();
2045 Register DstLo = RI.getSubReg(Reg: Dst, Idx: AMDGPU::sub0);
2046 Register DstHi = RI.getSubReg(Reg: Dst, Idx: AMDGPU::sub1);
2047
2048 const MCInstrDesc &Mov64Desc = get(Opcode: AMDGPU::V_MOV_B64_e32);
2049 const TargetRegisterClass *Mov64RC = getRegClass(MCID: Mov64Desc, /*OpNum=*/0);
2050
2051 const MachineOperand &SrcOp = MI.getOperand(i: 1);
2052 // FIXME: Will this work for 64-bit floating point immediates?
2053 assert(!SrcOp.isFPImm());
2054 if (ST.hasVMovB64Inst() && Mov64RC->contains(Reg: Dst)) {
2055 MI.setDesc(Mov64Desc);
2056 if (SrcOp.isReg() || isInlineConstant(MI, OpIdx: 1) ||
2057 isUInt<32>(x: SrcOp.getImm()) || ST.has64BitLiterals())
2058 break;
2059 }
2060 if (SrcOp.isImm()) {
2061 APInt Imm(64, SrcOp.getImm());
2062 APInt Lo(32, Imm.getLoBits(numBits: 32).getZExtValue());
2063 APInt Hi(32, Imm.getHiBits(numBits: 32).getZExtValue());
2064 const MCInstrDesc &PkMovDesc = get(Opcode: AMDGPU::V_PK_MOV_B32);
2065 const TargetRegisterClass *PkMovRC = getRegClass(MCID: PkMovDesc, /*OpNum=*/0);
2066
2067 if (ST.hasPkMovB32() && Lo == Hi && isInlineConstant(Imm: Lo) &&
2068 PkMovRC->contains(Reg: Dst)) {
2069 BuildMI(BB&: MBB, I&: MI, MIMD: DL, MCID: PkMovDesc, DestReg: Dst)
2070 .addImm(Val: SISrcMods::OP_SEL_1)
2071 .addImm(Val: Lo.getSExtValue())
2072 .addImm(Val: SISrcMods::OP_SEL_1)
2073 .addImm(Val: Lo.getSExtValue())
2074 .addImm(Val: 0) // op_sel_lo
2075 .addImm(Val: 0) // op_sel_hi
2076 .addImm(Val: 0) // neg_lo
2077 .addImm(Val: 0) // neg_hi
2078 .addImm(Val: 0); // clamp
2079 } else {
2080 BuildMI(BB&: MBB, I&: MI, MIMD: DL, MCID: get(Opcode: AMDGPU::V_MOV_B32_e32), DestReg: DstLo)
2081 .addImm(Val: Lo.getSExtValue());
2082 BuildMI(BB&: MBB, I&: MI, MIMD: DL, MCID: get(Opcode: AMDGPU::V_MOV_B32_e32), DestReg: DstHi)
2083 .addImm(Val: Hi.getSExtValue());
2084 }
2085 } else {
2086 assert(SrcOp.isReg());
2087 if (ST.hasPkMovB32() &&
2088 !RI.isAGPR(MRI: MBB.getParent()->getRegInfo(), Reg: SrcOp.getReg())) {
2089 BuildMI(BB&: MBB, I&: MI, MIMD: DL, MCID: get(Opcode: AMDGPU::V_PK_MOV_B32), DestReg: Dst)
2090 .addImm(Val: SISrcMods::OP_SEL_1) // src0_mod
2091 .addReg(RegNo: SrcOp.getReg())
2092 .addImm(Val: SISrcMods::OP_SEL_0 | SISrcMods::OP_SEL_1) // src1_mod
2093 .addReg(RegNo: SrcOp.getReg())
2094 .addImm(Val: 0) // op_sel_lo
2095 .addImm(Val: 0) // op_sel_hi
2096 .addImm(Val: 0) // neg_lo
2097 .addImm(Val: 0) // neg_hi
2098 .addImm(Val: 0); // clamp
2099 } else {
2100 BuildMI(BB&: MBB, I&: MI, MIMD: DL, MCID: get(Opcode: AMDGPU::V_MOV_B32_e32), DestReg: DstLo)
2101 .addReg(RegNo: RI.getSubReg(Reg: SrcOp.getReg(), Idx: AMDGPU::sub0));
2102 BuildMI(BB&: MBB, I&: MI, MIMD: DL, MCID: get(Opcode: AMDGPU::V_MOV_B32_e32), DestReg: DstHi)
2103 .addReg(RegNo: RI.getSubReg(Reg: SrcOp.getReg(), Idx: AMDGPU::sub1));
2104 }
2105 }
2106 MI.eraseFromParent();
2107 break;
2108 }
2109 case AMDGPU::V_MOV_B64_DPP_PSEUDO: {
2110 expandMovDPP64(MI);
2111 break;
2112 }
2113 case AMDGPU::S_MOV_B64_IMM_PSEUDO: {
2114 const MachineOperand &SrcOp = MI.getOperand(i: 1);
2115 assert(!SrcOp.isFPImm());
2116
2117 if (ST.has64BitLiterals()) {
2118 MI.setDesc(get(Opcode: AMDGPU::S_MOV_B64));
2119 break;
2120 }
2121
2122 APInt Imm(64, SrcOp.getImm());
2123 if (Imm.isIntN(N: 32) || isInlineConstant(Imm)) {
2124 MI.setDesc(get(Opcode: AMDGPU::S_MOV_B64));
2125 break;
2126 }
2127
2128 Register Dst = MI.getOperand(i: 0).getReg();
2129 Register DstLo = RI.getSubReg(Reg: Dst, Idx: AMDGPU::sub0);
2130 Register DstHi = RI.getSubReg(Reg: Dst, Idx: AMDGPU::sub1);
2131
2132 APInt Lo(32, Imm.getLoBits(numBits: 32).getZExtValue());
2133 APInt Hi(32, Imm.getHiBits(numBits: 32).getZExtValue());
2134 BuildMI(BB&: MBB, I&: MI, MIMD: DL, MCID: get(Opcode: AMDGPU::S_MOV_B32), DestReg: DstLo)
2135 .addImm(Val: Lo.getSExtValue());
2136 BuildMI(BB&: MBB, I&: MI, MIMD: DL, MCID: get(Opcode: AMDGPU::S_MOV_B32), DestReg: DstHi)
2137 .addImm(Val: Hi.getSExtValue());
2138 MI.eraseFromParent();
2139 break;
2140 }
2141 case AMDGPU::V_SET_INACTIVE_B32: {
2142 // Lower V_SET_INACTIVE_B32 to V_CNDMASK_B32.
2143 Register DstReg = MI.getOperand(i: 0).getReg();
2144 BuildMI(BB&: MBB, I&: MI, MIMD: DL, MCID: get(Opcode: AMDGPU::V_CNDMASK_B32_e64), DestReg: DstReg)
2145 .add(MO: MI.getOperand(i: 3))
2146 .add(MO: MI.getOperand(i: 4))
2147 .add(MO: MI.getOperand(i: 1))
2148 .add(MO: MI.getOperand(i: 2))
2149 .add(MO: MI.getOperand(i: 5));
2150 MI.eraseFromParent();
2151 break;
2152 }
2153 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V1:
2154 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V2:
2155 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V3:
2156 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V4:
2157 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V5:
2158 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V6:
2159 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V7:
2160 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V8:
2161 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V9:
2162 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V10:
2163 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V11:
2164 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V12:
2165 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V16:
2166 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V32:
2167 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V1:
2168 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V2:
2169 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V3:
2170 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V4:
2171 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V5:
2172 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V6:
2173 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V7:
2174 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V8:
2175 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V9:
2176 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V10:
2177 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V11:
2178 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V12:
2179 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V16:
2180 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V32:
2181 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V1:
2182 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V2:
2183 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V4:
2184 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V8:
2185 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V16: {
2186 const TargetRegisterClass *EltRC = getOpRegClass(MI, OpNo: 2);
2187
2188 unsigned Opc;
2189 if (RI.hasVGPRs(RC: EltRC)) {
2190 Opc = AMDGPU::V_MOVRELD_B32_e32;
2191 } else {
2192 Opc = RI.getRegSizeInBits(RC: *EltRC) == 64 ? AMDGPU::S_MOVRELD_B64
2193 : AMDGPU::S_MOVRELD_B32;
2194 }
2195
2196 const MCInstrDesc &OpDesc = get(Opcode: Opc);
2197 Register VecReg = MI.getOperand(i: 0).getReg();
2198 bool IsUndef = MI.getOperand(i: 1).isUndef();
2199 unsigned SubReg = MI.getOperand(i: 3).getImm();
2200 assert(VecReg == MI.getOperand(1).getReg());
2201
2202 MachineInstrBuilder MIB =
2203 BuildMI(BB&: MBB, I&: MI, MIMD: DL, MCID: OpDesc)
2204 .addReg(RegNo: RI.getSubReg(Reg: VecReg, Idx: SubReg), Flags: RegState::Undef)
2205 .add(MO: MI.getOperand(i: 2))
2206 .addReg(RegNo: VecReg, Flags: RegState::ImplicitDefine)
2207 .addReg(RegNo: VecReg, Flags: RegState::Implicit | getUndefRegState(B: IsUndef));
2208
2209 const int ImpDefIdx =
2210 OpDesc.getNumOperands() + OpDesc.implicit_uses().size();
2211 const int ImpUseIdx = ImpDefIdx + 1;
2212 MIB->tieOperands(DefIdx: ImpDefIdx, UseIdx: ImpUseIdx);
2213 MI.eraseFromParent();
2214 break;
2215 }
2216 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V1:
2217 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V2:
2218 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V3:
2219 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V4:
2220 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V5:
2221 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V6:
2222 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V7:
2223 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V8:
2224 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V9:
2225 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V10:
2226 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V11:
2227 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V12:
2228 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V16:
2229 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V32: {
2230 assert(ST.useVGPRIndexMode());
2231 Register VecReg = MI.getOperand(i: 0).getReg();
2232 bool IsUndef = MI.getOperand(i: 1).isUndef();
2233 MachineOperand &Idx = MI.getOperand(i: 3);
2234 Register SubReg = MI.getOperand(i: 4).getImm();
2235
2236 MachineInstr *SetOn = BuildMI(BB&: MBB, I&: MI, MIMD: DL, MCID: get(Opcode: AMDGPU::S_SET_GPR_IDX_ON))
2237 .add(MO: Idx)
2238 .addImm(Val: AMDGPU::VGPRIndexMode::DST_ENABLE);
2239 SetOn->getOperand(i: 3).setIsUndef();
2240
2241 const MCInstrDesc &OpDesc = get(Opcode: AMDGPU::V_MOV_B32_indirect_write);
2242 MachineInstrBuilder MIB =
2243 BuildMI(BB&: MBB, I&: MI, MIMD: DL, MCID: OpDesc)
2244 .addReg(RegNo: RI.getSubReg(Reg: VecReg, Idx: SubReg), Flags: RegState::Undef)
2245 .add(MO: MI.getOperand(i: 2))
2246 .addReg(RegNo: VecReg, Flags: RegState::ImplicitDefine)
2247 .addReg(RegNo: VecReg, Flags: RegState::Implicit | getUndefRegState(B: IsUndef));
2248
2249 const int ImpDefIdx =
2250 OpDesc.getNumOperands() + OpDesc.implicit_uses().size();
2251 const int ImpUseIdx = ImpDefIdx + 1;
2252 MIB->tieOperands(DefIdx: ImpDefIdx, UseIdx: ImpUseIdx);
2253
2254 MachineInstr *SetOff = BuildMI(BB&: MBB, I&: MI, MIMD: DL, MCID: get(Opcode: AMDGPU::S_SET_GPR_IDX_OFF));
2255
2256 finalizeBundle(MBB, FirstMI: SetOn->getIterator(), LastMI: std::next(x: SetOff->getIterator()));
2257
2258 MI.eraseFromParent();
2259 break;
2260 }
2261 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V1:
2262 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V2:
2263 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V3:
2264 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V4:
2265 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V5:
2266 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V6:
2267 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V7:
2268 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V8:
2269 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V9:
2270 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V10:
2271 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V11:
2272 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V12:
2273 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V16:
2274 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V32: {
2275 assert(ST.useVGPRIndexMode());
2276 Register Dst = MI.getOperand(i: 0).getReg();
2277 Register VecReg = MI.getOperand(i: 1).getReg();
2278 bool IsUndef = MI.getOperand(i: 1).isUndef();
2279 Register SubReg = MI.getOperand(i: 3).getImm();
2280
2281 MachineInstr *SetOn = BuildMI(BB&: MBB, I&: MI, MIMD: DL, MCID: get(Opcode: AMDGPU::S_SET_GPR_IDX_ON))
2282 .add(MO: MI.getOperand(i: 2))
2283 .addImm(Val: AMDGPU::VGPRIndexMode::SRC0_ENABLE);
2284 SetOn->getOperand(i: 3).setIsUndef();
2285
2286 BuildMI(BB&: MBB, I&: MI, MIMD: DL, MCID: get(Opcode: AMDGPU::V_MOV_B32_indirect_read))
2287 .addDef(RegNo: Dst)
2288 .addReg(RegNo: RI.getSubReg(Reg: VecReg, Idx: SubReg), Flags: RegState::Undef)
2289 .addReg(RegNo: VecReg, Flags: RegState::Implicit | getUndefRegState(B: IsUndef));
2290
2291 MachineInstr *SetOff = BuildMI(BB&: MBB, I&: MI, MIMD: DL, MCID: get(Opcode: AMDGPU::S_SET_GPR_IDX_OFF));
2292
2293 finalizeBundle(MBB, FirstMI: SetOn->getIterator(), LastMI: std::next(x: SetOff->getIterator()));
2294
2295 MI.eraseFromParent();
2296 break;
2297 }
2298 case AMDGPU::SI_PC_ADD_REL_OFFSET: {
2299 MachineFunction &MF = *MBB.getParent();
2300 Register Reg = MI.getOperand(i: 0).getReg();
2301 Register RegLo = RI.getSubReg(Reg, Idx: AMDGPU::sub0);
2302 Register RegHi = RI.getSubReg(Reg, Idx: AMDGPU::sub1);
2303 MachineOperand OpLo = MI.getOperand(i: 1);
2304 MachineOperand OpHi = MI.getOperand(i: 2);
2305
2306 // Create a bundle so these instructions won't be re-ordered by the
2307 // post-RA scheduler.
2308 MIBundleBuilder Bundler(MBB, MI);
2309 Bundler.append(MI: BuildMI(MF, MIMD: DL, MCID: get(Opcode: AMDGPU::S_GETPC_B64), DestReg: Reg));
2310
2311 // What we want here is an offset from the value returned by s_getpc (which
2312 // is the address of the s_add_u32 instruction) to the global variable, but
2313 // since the encoding of $symbol starts 4 bytes after the start of the
2314 // s_add_u32 instruction, we end up with an offset that is 4 bytes too
2315 // small. This requires us to add 4 to the global variable offset in order
2316 // to compute the correct address. Similarly for the s_addc_u32 instruction,
2317 // the encoding of $symbol starts 12 bytes after the start of the s_add_u32
2318 // instruction.
2319
2320 int64_t Adjust = 0;
2321 if (ST.hasGetPCZeroExtension()) {
2322 // Fix up hardware that does not sign-extend the 48-bit PC value by
2323 // inserting: s_sext_i32_i16 reghi, reghi
2324 Bundler.append(
2325 MI: BuildMI(MF, MIMD: DL, MCID: get(Opcode: AMDGPU::S_SEXT_I32_I16), DestReg: RegHi).addReg(RegNo: RegHi));
2326 Adjust += 4;
2327 }
2328
2329 if (OpLo.isGlobal())
2330 OpLo.setOffset(OpLo.getOffset() + Adjust + 4);
2331 Bundler.append(
2332 MI: BuildMI(MF, MIMD: DL, MCID: get(Opcode: AMDGPU::S_ADD_U32), DestReg: RegLo).addReg(RegNo: RegLo).add(MO: OpLo));
2333
2334 if (OpHi.isGlobal())
2335 OpHi.setOffset(OpHi.getOffset() + Adjust + 12);
2336 Bundler.append(MI: BuildMI(MF, MIMD: DL, MCID: get(Opcode: AMDGPU::S_ADDC_U32), DestReg: RegHi)
2337 .addReg(RegNo: RegHi)
2338 .add(MO: OpHi));
2339
2340 finalizeBundle(MBB, FirstMI: Bundler.begin());
2341
2342 MI.eraseFromParent();
2343 break;
2344 }
2345 case AMDGPU::SI_PC_ADD_REL_OFFSET64: {
2346 MachineFunction &MF = *MBB.getParent();
2347 Register Reg = MI.getOperand(i: 0).getReg();
2348 MachineOperand Op = MI.getOperand(i: 1);
2349
2350 // Create a bundle so these instructions won't be re-ordered by the
2351 // post-RA scheduler.
2352 MIBundleBuilder Bundler(MBB, MI);
2353 Bundler.append(MI: BuildMI(MF, MIMD: DL, MCID: get(Opcode: AMDGPU::S_GETPC_B64), DestReg: Reg));
2354 if (Op.isGlobal())
2355 Op.setOffset(Op.getOffset() + 4);
2356 Bundler.append(
2357 MI: BuildMI(MF, MIMD: DL, MCID: get(Opcode: AMDGPU::S_ADD_U64), DestReg: Reg).addReg(RegNo: Reg).add(MO: Op));
2358
2359 finalizeBundle(MBB, FirstMI: Bundler.begin());
2360
2361 MI.eraseFromParent();
2362 break;
2363 }
2364 case AMDGPU::ENTER_STRICT_WWM: {
2365 // This only gets its own opcode so that SIPreAllocateWWMRegs can tell when
2366 // Whole Wave Mode is entered.
2367 MI.setDesc(get(Opcode: LMC.OrSaveExecOpc));
2368 break;
2369 }
2370 case AMDGPU::ENTER_STRICT_WQM: {
2371 // This only gets its own opcode so that SIPreAllocateWWMRegs can tell when
2372 // STRICT_WQM is entered.
2373 BuildMI(BB&: MBB, I&: MI, MIMD: DL, MCID: get(Opcode: LMC.MovOpc), DestReg: MI.getOperand(i: 0).getReg())
2374 .addReg(RegNo: LMC.ExecReg);
2375 BuildMI(BB&: MBB, I&: MI, MIMD: DL, MCID: get(Opcode: LMC.WQMOpc), DestReg: LMC.ExecReg).addReg(RegNo: LMC.ExecReg);
2376
2377 MI.eraseFromParent();
2378 break;
2379 }
2380 case AMDGPU::EXIT_STRICT_WWM:
2381 case AMDGPU::EXIT_STRICT_WQM: {
2382 // This only gets its own opcode so that SIPreAllocateWWMRegs can tell when
    // WWM/STRICT_WQM is exited.
2384 MI.setDesc(get(Opcode: LMC.MovOpc));
2385 break;
2386 }
2387 case AMDGPU::SI_RETURN: {
2388 const MachineFunction *MF = MBB.getParent();
2389 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
2390 const SIRegisterInfo *TRI = ST.getRegisterInfo();
2391 // Hiding the return address use with SI_RETURN may lead to extra kills in
2392 // the function and missing live-ins. We are fine in practice because callee
2393 // saved register handling ensures the register value is restored before
2394 // RET, but we need the undef flag here to appease the MachineVerifier
2395 // liveness checks.
2396 MachineInstrBuilder MIB =
2397 BuildMI(BB&: MBB, I&: MI, MIMD: DL, MCID: get(Opcode: AMDGPU::S_SETPC_B64_return))
2398 .addReg(RegNo: TRI->getReturnAddressReg(MF: *MF), Flags: RegState::Undef);
2399
2400 MIB.copyImplicitOps(OtherMI: MI);
2401 MI.eraseFromParent();
2402 break;
2403 }
2404
2405 case AMDGPU::S_MUL_U64_U32_PSEUDO:
2406 case AMDGPU::S_MUL_I64_I32_PSEUDO:
2407 MI.setDesc(get(Opcode: AMDGPU::S_MUL_U64));
2408 break;
2409
2410 case AMDGPU::S_GETPC_B64_pseudo:
2411 MI.setDesc(get(Opcode: AMDGPU::S_GETPC_B64));
2412 if (ST.hasGetPCZeroExtension()) {
2413 Register Dst = MI.getOperand(i: 0).getReg();
2414 Register DstHi = RI.getSubReg(Reg: Dst, Idx: AMDGPU::sub1);
2415 // Fix up hardware that does not sign-extend the 48-bit PC value by
2416 // inserting: s_sext_i32_i16 dsthi, dsthi
2417 BuildMI(BB&: MBB, I: std::next(x: MI.getIterator()), MIMD: DL, MCID: get(Opcode: AMDGPU::S_SEXT_I32_I16),
2418 DestReg: DstHi)
2419 .addReg(RegNo: DstHi);
2420 }
2421 break;
2422
2423 case AMDGPU::V_MAX_BF16_PSEUDO_e64: {
2424 assert(ST.hasBF16PackedInsts());
2425 MI.setDesc(get(Opcode: AMDGPU::V_PK_MAX_NUM_BF16));
2426 MI.addOperand(Op: MachineOperand::CreateImm(Val: 0)); // op_sel
2427 MI.addOperand(Op: MachineOperand::CreateImm(Val: 0)); // neg_lo
2428 MI.addOperand(Op: MachineOperand::CreateImm(Val: 0)); // neg_hi
2429 auto Op0 = getNamedOperand(MI, OperandName: AMDGPU::OpName::src0_modifiers);
2430 Op0->setImm(Op0->getImm() | SISrcMods::OP_SEL_1);
2431 auto Op1 = getNamedOperand(MI, OperandName: AMDGPU::OpName::src1_modifiers);
2432 Op1->setImm(Op1->getImm() | SISrcMods::OP_SEL_1);
2433 break;
2434 }
2435
2436 case AMDGPU::GET_STACK_BASE:
2437 // The stack starts at offset 0 unless we need to reserve some space at the
2438 // bottom.
2439 if (ST.getFrameLowering()->mayReserveScratchForCWSR(MF: *MBB.getParent())) {
2440 // When CWSR is used in dynamic VGPR mode, the trap handler needs to save
2441 // some of the VGPRs. The size of the required scratch space has already
2442 // been computed by prolog epilog insertion.
2443 const SIMachineFunctionInfo *MFI =
2444 MBB.getParent()->getInfo<SIMachineFunctionInfo>();
2445 unsigned VGPRSize = MFI->getScratchReservedForDynamicVGPRs();
2446 Register DestReg = MI.getOperand(i: 0).getReg();
2447 BuildMI(BB&: MBB, I&: MI, MIMD: DL, MCID: get(Opcode: AMDGPU::S_GETREG_B32), DestReg)
2448 .addImm(Val: AMDGPU::Hwreg::HwregEncoding::encode(
2449 Values: AMDGPU::Hwreg::ID_HW_ID2, Values: AMDGPU::Hwreg::OFFSET_ME_ID, Values: 2));
2450 // The MicroEngine ID is 0 for the graphics queue, and 1 or 2 for compute
2451 // (3 is unused, so we ignore it). Unfortunately, S_GETREG doesn't set
2452 // SCC, so we need to check for 0 manually.
2453 BuildMI(BB&: MBB, I&: MI, MIMD: DL, MCID: get(Opcode: AMDGPU::S_CMP_LG_U32)).addImm(Val: 0).addReg(RegNo: DestReg);
2454 // Change the implicit-def of SCC to an explicit use (but first remove
2455 // the dead flag if present).
2456 MI.getOperand(i: MI.getNumExplicitOperands()).setIsDead(false);
2457 MI.getOperand(i: MI.getNumExplicitOperands()).setIsUse();
2458 MI.setDesc(get(Opcode: AMDGPU::S_CMOVK_I32));
2459 MI.addOperand(Op: MachineOperand::CreateImm(Val: VGPRSize));
2460 } else {
2461 MI.setDesc(get(Opcode: AMDGPU::S_MOV_B32));
2462 MI.addOperand(Op: MachineOperand::CreateImm(Val: 0));
2463 MI.removeOperand(
2464 OpNo: MI.getNumExplicitOperands()); // Drop implicit def of SCC.
2465 }
2466 break;
2467 }
2468
2469 return true;
2470}
2471
2472void SIInstrInfo::reMaterialize(MachineBasicBlock &MBB,
2473 MachineBasicBlock::iterator I, Register DestReg,
2474 unsigned SubIdx, const MachineInstr &Orig,
2475 LaneBitmask UsedLanes) const {
2476
2477 // Try shrinking the instruction to remat only the part needed for current
2478 // context.
2479 // TODO: Handle more cases.
2480 unsigned Opcode = Orig.getOpcode();
2481 switch (Opcode) {
2482 case AMDGPU::S_MOV_B64:
2483 case AMDGPU::S_MOV_B64_IMM_PSEUDO: {
2484 if (SubIdx != 0)
2485 break;
2486
2487 if (!Orig.getOperand(i: 1).isImm())
2488 break;
2489
2490 // Shrink S_MOV_B64 to S_MOV_B32 when UsedLanes indicates only a single
2491 // 32-bit lane of the 64-bit value is live at the rematerialization point.
2492 if (UsedLanes.all())
2493 break;
2494
2495 // Determine which half of the 64-bit immediate corresponds to the use.
2496 unsigned OrigSubReg = Orig.getOperand(i: 0).getSubReg();
2497 unsigned LoSubReg = RI.composeSubRegIndices(a: OrigSubReg, b: AMDGPU::sub0);
2498 unsigned HiSubReg = RI.composeSubRegIndices(a: OrigSubReg, b: AMDGPU::sub1);
2499
2500 bool NeedLo = (UsedLanes & RI.getSubRegIndexLaneMask(SubIdx: LoSubReg)).any();
2501 bool NeedHi = (UsedLanes & RI.getSubRegIndexLaneMask(SubIdx: HiSubReg)).any();
2502
2503 if (NeedLo && NeedHi)
2504 break;
2505
2506 int64_t Imm64 = Orig.getOperand(i: 1).getImm();
2507 int32_t Imm32 = NeedLo ? Lo_32(Value: Imm64) : Hi_32(Value: Imm64);
2508
2509 unsigned UseSubReg = NeedLo ? LoSubReg : HiSubReg;
2510
2511 // Emit S_MOV_B32 defining just the needed 32-bit subreg of DestReg.
2512 BuildMI(BB&: MBB, I, MIMD: Orig.getDebugLoc(), MCID: get(Opcode: AMDGPU::S_MOV_B32))
2513 .addReg(RegNo: DestReg, Flags: RegState::Define | RegState::Undef, SubReg: UseSubReg)
2514 .addImm(Val: Imm32);
2515 return;
2516 }
2517
2518 case AMDGPU::S_LOAD_DWORDX16_IMM:
2519 case AMDGPU::S_LOAD_DWORDX8_IMM: {
2520 if (SubIdx != 0)
2521 break;
2522
2523 if (I == MBB.end())
2524 break;
2525
2526 if (I->isBundled())
2527 break;
2528
2529 // Look for a single use of the register that is also a subreg.
2530 Register RegToFind = Orig.getOperand(i: 0).getReg();
2531 MachineOperand *UseMO = nullptr;
2532 for (auto &CandMO : I->operands()) {
2533 if (!CandMO.isReg() || CandMO.getReg() != RegToFind || CandMO.isDef())
2534 continue;
2535 if (UseMO) {
2536 UseMO = nullptr;
2537 break;
2538 }
2539 UseMO = &CandMO;
2540 }
2541 if (!UseMO || UseMO->getSubReg() == AMDGPU::NoSubRegister)
2542 break;
2543
2544 unsigned Offset = RI.getSubRegIdxOffset(Idx: UseMO->getSubReg());
2545 unsigned SubregSize = RI.getSubRegIdxSize(Idx: UseMO->getSubReg());
2546
2547 MachineFunction *MF = MBB.getParent();
2548 MachineRegisterInfo &MRI = MF->getRegInfo();
2549 assert(MRI.use_nodbg_empty(DestReg) && "DestReg should have no users yet.");
2550
2551 unsigned NewOpcode = -1;
2552 if (SubregSize == 256)
2553 NewOpcode = AMDGPU::S_LOAD_DWORDX8_IMM;
2554 else if (SubregSize == 128)
2555 NewOpcode = AMDGPU::S_LOAD_DWORDX4_IMM;
2556 else
2557 break;
2558
2559 const MCInstrDesc &TID = get(Opcode: NewOpcode);
2560 const TargetRegisterClass *NewRC =
2561 RI.getAllocatableClass(RC: getRegClass(MCID: TID, OpNum: 0));
2562 MRI.setRegClass(Reg: DestReg, RC: NewRC);
2563
2564 UseMO->setReg(DestReg);
2565 UseMO->setSubReg(AMDGPU::NoSubRegister);
2566
2567 // Use a smaller load with the desired size, possibly with updated offset.
2568 MachineInstr *MI = MF->CloneMachineInstr(Orig: &Orig);
2569 MI->setDesc(TID);
2570 MI->getOperand(i: 0).setReg(DestReg);
2571 MI->getOperand(i: 0).setSubReg(AMDGPU::NoSubRegister);
2572 if (Offset) {
2573 MachineOperand *OffsetMO = getNamedOperand(MI&: *MI, OperandName: AMDGPU::OpName::offset);
2574 int64_t FinalOffset = OffsetMO->getImm() + Offset / 8;
2575 OffsetMO->setImm(FinalOffset);
2576 }
2577 SmallVector<MachineMemOperand *> NewMMOs;
2578 for (const MachineMemOperand *MemOp : Orig.memoperands())
2579 NewMMOs.push_back(Elt: MF->getMachineMemOperand(MMO: MemOp, PtrInfo: MemOp->getPointerInfo(),
2580 Size: SubregSize / 8));
2581 MI->setMemRefs(MF&: *MF, MemRefs: NewMMOs);
2582
2583 MBB.insert(I, MI);
2584 return;
2585 }
2586
2587 default:
2588 break;
2589 }
2590
2591 TargetInstrInfo::reMaterialize(MBB, MI: I, DestReg, SubIdx, Orig, UsedLanes);
2592}
2593
2594std::pair<MachineInstr*, MachineInstr*>
2595SIInstrInfo::expandMovDPP64(MachineInstr &MI) const {
2596 assert (MI.getOpcode() == AMDGPU::V_MOV_B64_DPP_PSEUDO);
2597
2598 if (ST.hasVMovB64Inst() && ST.hasFeature(Feature: AMDGPU::FeatureDPALU_DPP) &&
2599 AMDGPU::isLegalDPALU_DPPControl(
2600 ST, DC: getNamedOperand(MI, OperandName: AMDGPU::OpName::dpp_ctrl)->getImm())) {
2601 MI.setDesc(get(Opcode: AMDGPU::V_MOV_B64_dpp));
2602 return std::pair(&MI, nullptr);
2603 }
2604
2605 MachineBasicBlock &MBB = *MI.getParent();
2606 DebugLoc DL = MBB.findDebugLoc(MBBI: MI);
2607 MachineFunction *MF = MBB.getParent();
2608 MachineRegisterInfo &MRI = MF->getRegInfo();
2609 Register Dst = MI.getOperand(i: 0).getReg();
2610 unsigned Part = 0;
2611 MachineInstr *Split[2];
2612
2613 for (auto Sub : { AMDGPU::sub0, AMDGPU::sub1 }) {
2614 auto MovDPP = BuildMI(BB&: MBB, I&: MI, MIMD: DL, MCID: get(Opcode: AMDGPU::V_MOV_B32_dpp));
2615 if (Dst.isPhysical()) {
2616 MovDPP.addDef(RegNo: RI.getSubReg(Reg: Dst, Idx: Sub));
2617 } else {
2618 assert(MRI.isSSA());
2619 auto Tmp = MRI.createVirtualRegister(RegClass: &AMDGPU::VGPR_32RegClass);
2620 MovDPP.addDef(RegNo: Tmp);
2621 }
2622
2623 for (unsigned I = 1; I <= 2; ++I) { // old and src operands.
2624 const MachineOperand &SrcOp = MI.getOperand(i: I);
2625 assert(!SrcOp.isFPImm());
2626 if (SrcOp.isImm()) {
2627 APInt Imm(64, SrcOp.getImm());
2628 Imm.ashrInPlace(ShiftAmt: Part * 32);
2629 MovDPP.addImm(Val: Imm.getLoBits(numBits: 32).getZExtValue());
2630 } else {
2631 assert(SrcOp.isReg());
2632 Register Src = SrcOp.getReg();
2633 if (Src.isPhysical())
2634 MovDPP.addReg(RegNo: RI.getSubReg(Reg: Src, Idx: Sub));
2635 else
2636 MovDPP.addReg(RegNo: Src, Flags: getUndefRegState(B: SrcOp.isUndef()), SubReg: Sub);
2637 }
2638 }
2639
2640 for (const MachineOperand &MO : llvm::drop_begin(RangeOrContainer: MI.explicit_operands(), N: 3))
2641 MovDPP.addImm(Val: MO.getImm());
2642
2643 Split[Part] = MovDPP;
2644 ++Part;
2645 }
2646
2647 if (Dst.isVirtual())
2648 BuildMI(BB&: MBB, I&: MI, MIMD: DL, MCID: get(Opcode: AMDGPU::REG_SEQUENCE), DestReg: Dst)
2649 .addReg(RegNo: Split[0]->getOperand(i: 0).getReg())
2650 .addImm(Val: AMDGPU::sub0)
2651 .addReg(RegNo: Split[1]->getOperand(i: 0).getReg())
2652 .addImm(Val: AMDGPU::sub1);
2653
2654 MI.eraseFromParent();
2655 return std::pair(Split[0], Split[1]);
2656}
2657
2658std::optional<DestSourcePair>
2659SIInstrInfo::isCopyInstrImpl(const MachineInstr &MI) const {
2660 if (MI.getOpcode() == AMDGPU::WWM_COPY)
2661 return DestSourcePair{MI.getOperand(i: 0), MI.getOperand(i: 1)};
2662
2663 return std::nullopt;
2664}
2665
2666bool SIInstrInfo::swapSourceModifiers(MachineInstr &MI, MachineOperand &Src0,
2667 AMDGPU::OpName Src0OpName,
2668 MachineOperand &Src1,
2669 AMDGPU::OpName Src1OpName) const {
2670 MachineOperand *Src0Mods = getNamedOperand(MI, OperandName: Src0OpName);
2671 if (!Src0Mods)
2672 return false;
2673
2674 MachineOperand *Src1Mods = getNamedOperand(MI, OperandName: Src1OpName);
2675 assert(Src1Mods &&
2676 "All commutable instructions have both src0 and src1 modifiers");
2677
2678 int Src0ModsVal = Src0Mods->getImm();
2679 int Src1ModsVal = Src1Mods->getImm();
2680
2681 Src1Mods->setImm(Src0ModsVal);
2682 Src0Mods->setImm(Src1ModsVal);
2683 return true;
2684}
2685
2686static MachineInstr *swapRegAndNonRegOperand(MachineInstr &MI,
2687 MachineOperand &RegOp,
2688 MachineOperand &NonRegOp) {
2689 Register Reg = RegOp.getReg();
2690 unsigned SubReg = RegOp.getSubReg();
2691 bool IsKill = RegOp.isKill();
2692 bool IsDead = RegOp.isDead();
2693 bool IsUndef = RegOp.isUndef();
2694 bool IsDebug = RegOp.isDebug();
2695
2696 if (NonRegOp.isImm())
2697 RegOp.ChangeToImmediate(ImmVal: NonRegOp.getImm());
2698 else if (NonRegOp.isFI())
2699 RegOp.ChangeToFrameIndex(Idx: NonRegOp.getIndex());
2700 else if (NonRegOp.isGlobal()) {
2701 RegOp.ChangeToGA(GV: NonRegOp.getGlobal(), Offset: NonRegOp.getOffset(),
2702 TargetFlags: NonRegOp.getTargetFlags());
2703 } else
2704 return nullptr;
2705
2706 // Make sure we don't reinterpret a subreg index in the target flags.
2707 RegOp.setTargetFlags(NonRegOp.getTargetFlags());
2708
2709 NonRegOp.ChangeToRegister(Reg, isDef: false, isImp: false, isKill: IsKill, isDead: IsDead, isUndef: IsUndef, isDebug: IsDebug);
2710 NonRegOp.setSubReg(SubReg);
2711
2712 return &MI;
2713}
2714
2715static MachineInstr *swapImmOperands(MachineInstr &MI,
2716 MachineOperand &NonRegOp1,
2717 MachineOperand &NonRegOp2) {
2718 unsigned TargetFlags = NonRegOp1.getTargetFlags();
2719 int64_t NonRegVal = NonRegOp1.getImm();
2720
2721 NonRegOp1.setImm(NonRegOp2.getImm());
2722 NonRegOp2.setImm(NonRegVal);
2723 NonRegOp1.setTargetFlags(NonRegOp2.getTargetFlags());
2724 NonRegOp2.setTargetFlags(TargetFlags);
2725 return &MI;
2726}
2727
2728bool SIInstrInfo::isLegalToSwap(const MachineInstr &MI, unsigned OpIdx0,
2729 unsigned OpIdx1) const {
2730 const MCInstrDesc &InstDesc = MI.getDesc();
2731 const MCOperandInfo &OpInfo0 = InstDesc.operands()[OpIdx0];
2732 const MCOperandInfo &OpInfo1 = InstDesc.operands()[OpIdx1];
2733
2734 unsigned Opc = MI.getOpcode();
2735 int Src0Idx = AMDGPU::getNamedOperandIdx(Opcode: Opc, Name: AMDGPU::OpName::src0);
2736
2737 const MachineOperand &MO0 = MI.getOperand(i: OpIdx0);
2738 const MachineOperand &MO1 = MI.getOperand(i: OpIdx1);
2739
2740 // Swap doesn't breach constant bus or literal limits
2741 // It may move literal to position other than src0, this is not allowed
2742 // pre-gfx10 However, most test cases need literals in Src0 for VOP
2743 // FIXME: After gfx9, literal can be in place other than Src0
2744 if (isVALU(MI, /*AllowLDSDMA=*/true)) {
2745 if ((int)OpIdx0 == Src0Idx && !MO0.isReg() &&
2746 !isInlineConstant(MO: MO0, OpInfo: OpInfo1))
2747 return false;
2748 if ((int)OpIdx1 == Src0Idx && !MO1.isReg() &&
2749 !isInlineConstant(MO: MO1, OpInfo: OpInfo0))
2750 return false;
2751 }
2752
2753 if ((int)OpIdx1 != Src0Idx && MO0.isReg()) {
2754 if (OpInfo1.RegClass == -1)
2755 return OpInfo1.OperandType == MCOI::OPERAND_UNKNOWN;
2756 return isLegalRegOperand(MI, OpIdx: OpIdx1, MO: MO0) &&
2757 (!MO1.isReg() || isLegalRegOperand(MI, OpIdx: OpIdx0, MO: MO1));
2758 }
2759 if ((int)OpIdx0 != Src0Idx && MO1.isReg()) {
2760 if (OpInfo0.RegClass == -1)
2761 return OpInfo0.OperandType == MCOI::OPERAND_UNKNOWN;
2762 return (!MO0.isReg() || isLegalRegOperand(MI, OpIdx: OpIdx1, MO: MO0)) &&
2763 isLegalRegOperand(MI, OpIdx: OpIdx0, MO: MO1);
2764 }
2765
2766 // No need to check 64-bit literals since swapping does not bring new
2767 // 64-bit literals into current instruction to fold to 32-bit
2768
2769 return isImmOperandLegal(MI, OpNo: OpIdx1, MO: MO0);
2770}
2771
2772MachineInstr *SIInstrInfo::commuteInstructionImpl(MachineInstr &MI, bool NewMI,
2773 unsigned Src0Idx,
2774 unsigned Src1Idx) const {
2775 assert(!NewMI && "this should never be used");
2776
2777 unsigned Opc = MI.getOpcode();
2778 int CommutedOpcode = commuteOpcode(Opcode: Opc);
2779 if (CommutedOpcode == -1)
2780 return nullptr;
2781
2782 if (Src0Idx > Src1Idx)
2783 std::swap(a&: Src0Idx, b&: Src1Idx);
2784
2785 assert(AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0) ==
2786 static_cast<int>(Src0Idx) &&
2787 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1) ==
2788 static_cast<int>(Src1Idx) &&
2789 "inconsistency with findCommutedOpIndices");
2790
2791 if (!isLegalToSwap(MI, OpIdx0: Src0Idx, OpIdx1: Src1Idx))
2792 return nullptr;
2793
2794 MachineInstr *CommutedMI = nullptr;
2795 MachineOperand &Src0 = MI.getOperand(i: Src0Idx);
2796 MachineOperand &Src1 = MI.getOperand(i: Src1Idx);
2797 if (Src0.isReg() && Src1.isReg()) {
2798 // Be sure to copy the source modifiers to the right place.
2799 CommutedMI =
2800 TargetInstrInfo::commuteInstructionImpl(MI, NewMI, OpIdx1: Src0Idx, OpIdx2: Src1Idx);
2801 } else if (Src0.isReg() && !Src1.isReg()) {
2802 CommutedMI = swapRegAndNonRegOperand(MI, RegOp&: Src0, NonRegOp&: Src1);
2803 } else if (!Src0.isReg() && Src1.isReg()) {
2804 CommutedMI = swapRegAndNonRegOperand(MI, RegOp&: Src1, NonRegOp&: Src0);
2805 } else if (Src0.isImm() && Src1.isImm()) {
2806 CommutedMI = swapImmOperands(MI, NonRegOp1&: Src0, NonRegOp2&: Src1);
2807 } else {
2808 // FIXME: Found two non registers to commute. This does happen.
2809 return nullptr;
2810 }
2811
2812 if (CommutedMI) {
2813 swapSourceModifiers(MI, Src0, Src0OpName: AMDGPU::OpName::src0_modifiers,
2814 Src1, Src1OpName: AMDGPU::OpName::src1_modifiers);
2815
2816 swapSourceModifiers(MI, Src0, Src0OpName: AMDGPU::OpName::src0_sel, Src1,
2817 Src1OpName: AMDGPU::OpName::src1_sel);
2818
2819 CommutedMI->setDesc(get(Opcode: CommutedOpcode));
2820 }
2821
2822 return CommutedMI;
2823}
2824
2825// This needs to be implemented because the source modifiers may be inserted
2826// between the true commutable operands, and the base
2827// TargetInstrInfo::commuteInstruction uses it.
2828bool SIInstrInfo::findCommutedOpIndices(const MachineInstr &MI,
2829 unsigned &SrcOpIdx0,
2830 unsigned &SrcOpIdx1) const {
2831 return findCommutedOpIndices(Desc: MI.getDesc(), SrcOpIdx0, SrcOpIdx1);
2832}
2833
2834bool SIInstrInfo::findCommutedOpIndices(const MCInstrDesc &Desc,
2835 unsigned &SrcOpIdx0,
2836 unsigned &SrcOpIdx1) const {
2837 if (!Desc.isCommutable())
2838 return false;
2839
2840 unsigned Opc = Desc.getOpcode();
2841 int Src0Idx = AMDGPU::getNamedOperandIdx(Opcode: Opc, Name: AMDGPU::OpName::src0);
2842 if (Src0Idx == -1)
2843 return false;
2844
2845 int Src1Idx = AMDGPU::getNamedOperandIdx(Opcode: Opc, Name: AMDGPU::OpName::src1);
2846 if (Src1Idx == -1)
2847 return false;
2848
2849 return fixCommutedOpIndices(ResultIdx1&: SrcOpIdx0, ResultIdx2&: SrcOpIdx1, CommutableOpIdx1: Src0Idx, CommutableOpIdx2: Src1Idx);
2850}
2851
2852bool SIInstrInfo::isBranchOffsetInRange(unsigned BranchOp,
2853 int64_t BrOffset) const {
2854 // BranchRelaxation should never have to check s_setpc_b64 or s_add_pc_i64
2855 // because its dest block is unanalyzable.
2856 assert(isSOPP(BranchOp) || isSOPK(BranchOp));
2857
2858 // Convert to dwords.
2859 BrOffset /= 4;
2860
2861 // The branch instructions do PC += signext(SIMM16 * 4) + 4, so the offset is
2862 // from the next instruction.
2863 BrOffset -= 1;
2864
2865 return isIntN(N: BranchOffsetBits, x: BrOffset);
2866}
2867
2868MachineBasicBlock *
2869SIInstrInfo::getBranchDestBlock(const MachineInstr &MI) const {
2870 return MI.getOperand(i: 0).getMBB();
2871}
2872
2873bool SIInstrInfo::hasDivergentBranch(const MachineBasicBlock *MBB) const {
2874 for (const MachineInstr &MI : MBB->terminators()) {
2875 if (MI.getOpcode() == AMDGPU::SI_IF || MI.getOpcode() == AMDGPU::SI_ELSE ||
2876 MI.getOpcode() == AMDGPU::SI_LOOP)
2877 return true;
2878 }
2879 return false;
2880}
2881
/// Fill the empty block \p MBB with a PC-relative long branch to \p DestBB.
///
/// \p MBB must be empty and have exactly one predecessor, and \p RestoreBB
/// must be empty (all three are asserted).
///
/// When ST.useAddPC64Inst() is true, a single S_ADD_PC_I64 is emitted whose
/// symbol operand is defined as (DestBB symbol - label placed after the
/// instruction), and the function returns.
///
/// Otherwise the sequence S_GETPC_B64 / S_ADD_U32 / S_ADDC_U32 / S_SETPC_B64
/// is emitted on a temporary virtual 64-bit SGPR, which is then replaced by
///  - the function's reserved long-branch register, if one was reserved, or
///  - a register obtained from \p RS by backward scavenging, or
///  - SGPR0_SGPR1 after an emergency spill through
///    SIRegisterInfo::spillEmergencySGPR, in which case the branch targets
///    \p RestoreBB instead of \p DestBB.
/// The low/high add operands are temp symbols whose values are set at the end
/// to the low 32 bits and the arithmetically shifted high 32 bits of
/// (target symbol - label placed after S_GETPC_B64).
///
/// \p RS is required (asserted) on the non-S_ADD_PC_I64 path. \p BrOffset is
/// not referenced in this function body.
2882 void SIInstrInfo::insertIndirectBranch(MachineBasicBlock &MBB,
2883 MachineBasicBlock &DestBB,
2884 MachineBasicBlock &RestoreBB,
2885 const DebugLoc &DL, int64_t BrOffset,
2886 RegScavenger *RS) const {
2887 assert(MBB.empty() &&
2888 "new block should be inserted for expanding unconditional branch");
2889 assert(MBB.pred_size() == 1);
2890 assert(RestoreBB.empty() &&
2891 "restore block should be inserted for restoring clobbered registers");
2892
2893 MachineFunction *MF = MBB.getParent();
2894 MachineRegisterInfo &MRI = MF->getRegInfo();
2895 const SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
2896 auto I = MBB.end();
2897 auto &MCCtx = MF->getContext();
2898
// Single-instruction form: the offset symbol is defined relative to a label
// attached right after the S_ADD_PC_I64, and no PC register is allocated.
2899 if (ST.useAddPC64Inst()) {
2900 MCSymbol *Offset =
2901 MCCtx.createTempSymbol(Name: "offset", /*AlwaysAddSuffix=*/true);
2902 auto AddPC = BuildMI(BB&: MBB, I, MIMD: DL, MCID: get(Opcode: AMDGPU::S_ADD_PC_I64))
2903 .addSym(Sym: Offset, TargetFlags: MO_FAR_BRANCH_OFFSET);
2904 MCSymbol *PostAddPCLabel =
2905 MCCtx.createTempSymbol(Name: "post_addpc", /*AlwaysAddSuffix=*/true);
2906 AddPC->setPostInstrSymbol(MF&: *MF, Symbol: PostAddPCLabel);
2907 auto *OffsetExpr = MCBinaryExpr::createSub(
2908 LHS: MCSymbolRefExpr::create(Symbol: DestBB.getSymbol(), Ctx&: MCCtx),
2909 RHS: MCSymbolRefExpr::create(Symbol: PostAddPCLabel, Ctx&: MCCtx), Ctx&: MCCtx);
2910 Offset->setVariableValue(OffsetExpr);
2911 return;
2912 }
2913
2914 assert(RS && "RegScavenger required for long branching");
2915
2916 // FIXME: Virtual register workaround for RegScavenger not working with empty
2917 // blocks.
2918 Register PCReg = MRI.createVirtualRegister(RegClass: &AMDGPU::SReg_64RegClass);
2919
2920 // Note: as this is used after hazard recognizer we need to apply some hazard
2921 // workarounds directly.
2922 const bool FlushSGPRWrites = (ST.isWave64() && ST.hasVALUMaskWriteHazard()) ||
2923 ST.hasVALUReadSGPRHazard();
// Emits an S_WAITCNT_DEPCTR with the sa_sdst field encoded as 0 at the current
// insertion point, but only on subtargets selected by FlushSGPRWrites.
2924 auto ApplyHazardWorkarounds = [this, &MBB, &I, &DL, FlushSGPRWrites]() {
2925 if (FlushSGPRWrites)
2926 BuildMI(BB&: MBB, I, MIMD: DL, MCID: get(Opcode: AMDGPU::S_WAITCNT_DEPCTR))
2927 .addImm(Val: AMDGPU::DepCtr::encodeFieldSaSdst(SaSdst: 0, STI: ST));
2928 };
2929
2930 // We need to compute the offset relative to the instruction immediately after
2931 // s_getpc_b64. Insert pc arithmetic code before last terminator.
2932 MachineInstr *GetPC = BuildMI(BB&: MBB, I, MIMD: DL, MCID: get(Opcode: AMDGPU::S_GETPC_B64), DestReg: PCReg);
2933 ApplyHazardWorkarounds();
2934
2935 MCSymbol *PostGetPCLabel =
2936 MCCtx.createTempSymbol(Name: "post_getpc", /*AlwaysAddSuffix=*/true);
2937 GetPC->setPostInstrSymbol(MF&: *MF, Symbol: PostGetPCLabel);
2938
// The two add operands are placeholder symbols; their values are assigned at
// the end of this function, once the final branch target is known.
2939 MCSymbol *OffsetLo =
2940 MCCtx.createTempSymbol(Name: "offset_lo", /*AlwaysAddSuffix=*/true);
2941 MCSymbol *OffsetHi =
2942 MCCtx.createTempSymbol(Name: "offset_hi", /*AlwaysAddSuffix=*/true);
2943 BuildMI(BB&: MBB, I, MIMD: DL, MCID: get(Opcode: AMDGPU::S_ADD_U32))
2944 .addReg(RegNo: PCReg, Flags: RegState::Define, SubReg: AMDGPU::sub0)
2945 .addReg(RegNo: PCReg, Flags: {}, SubReg: AMDGPU::sub0)
2946 .addSym(Sym: OffsetLo, TargetFlags: MO_FAR_BRANCH_OFFSET);
2947 BuildMI(BB&: MBB, I, MIMD: DL, MCID: get(Opcode: AMDGPU::S_ADDC_U32))
2948 .addReg(RegNo: PCReg, Flags: RegState::Define, SubReg: AMDGPU::sub1)
2949 .addReg(RegNo: PCReg, Flags: {}, SubReg: AMDGPU::sub1)
2950 .addSym(Sym: OffsetHi, TargetFlags: MO_FAR_BRANCH_OFFSET);
2951 ApplyHazardWorkarounds();
2952
2953 // Insert the indirect branch after the other terminator.
2954 BuildMI(BB: &MBB, MIMD: DL, MCID: get(Opcode: AMDGPU::S_SETPC_B64))
2955 .addReg(RegNo: PCReg);
2956
2957 // If a spill is needed for the pc register pair, we need to insert a spill
2958 // restore block right before the destination block, and insert a short branch
2959 // into the old destination block's fallthrough predecessor.
2960 // e.g.:
2961 //
2962 // s_cbranch_scc0 skip_long_branch:
2963 //
2964 // long_branch_bb:
2965 // spill s[8:9]
2966 // s_getpc_b64 s[8:9]
2967 // s_add_u32 s8, s8, restore_bb
2968 // s_addc_u32 s9, s9, 0
2969 // s_setpc_b64 s[8:9]
2970 //
2971 // skip_long_branch:
2972 // foo;
2973 //
2974 // .....
2975 //
2976 // dest_bb_fallthrough_predecessor:
2977 // bar;
2978 // s_branch dest_bb
2979 //
2980 // restore_bb:
2981 // restore s[8:9]
2982 // fallthrough dest_bb
2983 //
2984 // dest_bb:
2985 // buzz;
2986
2987 Register LongBranchReservedReg = MFI->getLongBranchReservedReg();
2988 Register Scav;
2989
2990 // If we've previously reserved a register for long branches
2991 // avoid running the scavenger and just use those registers
2992 if (LongBranchReservedReg) {
2993 RS->enterBasicBlock(MBB);
2994 Scav = LongBranchReservedReg;
2995 } else {
// Scavenge backwards from the end of the block up to S_GETPC_B64, with
// spilling disallowed; Scav stays invalid if no register is free.
2996 RS->enterBasicBlockEnd(MBB);
2997 Scav = RS->scavengeRegisterBackwards(
2998 RC: AMDGPU::SReg_64RegClass, To: MachineBasicBlock::iterator(GetPC),
2999 /* RestoreAfter */ false, SPAdj: 0, /* AllowSpill */ false);
3000 }
3001 if (Scav) {
3002 RS->setRegUsed(Reg: Scav);
3003 MRI.replaceRegWith(FromReg: PCReg, ToReg: Scav);
3004 MRI.clearVirtRegs();
3005 } else {
3006 // As SGPR needs VGPR to be spilled, we reuse the slot of temporary VGPR for
3007 // SGPR spill.
3008 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
3009 const SIRegisterInfo *TRI = ST.getRegisterInfo();
3010 TRI->spillEmergencySGPR(MI: GetPC, RestoreMBB&: RestoreBB, SGPR: AMDGPU::SGPR0_SGPR1, RS);
3011 MRI.replaceRegWith(FromReg: PCReg, ToReg: AMDGPU::SGPR0_SGPR1);
3012 MRI.clearVirtRegs();
3013 }
3014
// With a free register the branch goes straight to DestBB; after an emergency
// spill it goes to RestoreBB instead.
3015 MCSymbol *DestLabel = Scav ? DestBB.getSymbol() : RestoreBB.getSymbol();
3016 // Now, the distance could be defined.
3017 auto *Offset = MCBinaryExpr::createSub(
3018 LHS: MCSymbolRefExpr::create(Symbol: DestLabel, Ctx&: MCCtx),
3019 RHS: MCSymbolRefExpr::create(Symbol: PostGetPCLabel, Ctx&: MCCtx), Ctx&: MCCtx);
3020 // Add offset assignments.
// offset_lo = Offset & 0xFFFFFFFF, offset_hi = Offset >> 32 (arithmetic shift).
3021 auto *Mask = MCConstantExpr::create(Value: 0xFFFFFFFFULL, Ctx&: MCCtx);
3022 OffsetLo->setVariableValue(MCBinaryExpr::createAnd(LHS: Offset, RHS: Mask, Ctx&: MCCtx));
3023 auto *ShAmt = MCConstantExpr::create(Value: 32, Ctx&: MCCtx);
3024 OffsetHi->setVariableValue(MCBinaryExpr::createAShr(LHS: Offset, RHS: ShAmt, Ctx&: MCCtx));
3025 }
3026
3027unsigned SIInstrInfo::getBranchOpcode(SIInstrInfo::BranchPredicate Cond) {
3028 switch (Cond) {
3029 case SIInstrInfo::SCC_TRUE:
3030 return AMDGPU::S_CBRANCH_SCC1;
3031 case SIInstrInfo::SCC_FALSE:
3032 return AMDGPU::S_CBRANCH_SCC0;
3033 case SIInstrInfo::VCCNZ:
3034 return AMDGPU::S_CBRANCH_VCCNZ;
3035 case SIInstrInfo::VCCZ:
3036 return AMDGPU::S_CBRANCH_VCCZ;
3037 case SIInstrInfo::EXECNZ:
3038 return AMDGPU::S_CBRANCH_EXECNZ;
3039 case SIInstrInfo::EXECZ:
3040 return AMDGPU::S_CBRANCH_EXECZ;
3041 default:
3042 llvm_unreachable("invalid branch predicate");
3043 }
3044}
3045
3046SIInstrInfo::BranchPredicate SIInstrInfo::getBranchPredicate(unsigned Opcode) {
3047 switch (Opcode) {
3048 case AMDGPU::S_CBRANCH_SCC0:
3049 return SCC_FALSE;
3050 case AMDGPU::S_CBRANCH_SCC1:
3051 return SCC_TRUE;
3052 case AMDGPU::S_CBRANCH_VCCNZ:
3053 return VCCNZ;
3054 case AMDGPU::S_CBRANCH_VCCZ:
3055 return VCCZ;
3056 case AMDGPU::S_CBRANCH_EXECNZ:
3057 return EXECNZ;
3058 case AMDGPU::S_CBRANCH_EXECZ:
3059 return EXECZ;
3060 default:
3061 return INVALID_BR;
3062 }
3063}
3064
3065bool SIInstrInfo::analyzeBranchImpl(MachineBasicBlock &MBB,
3066 MachineBasicBlock::iterator I,
3067 MachineBasicBlock *&TBB,
3068 MachineBasicBlock *&FBB,
3069 SmallVectorImpl<MachineOperand> &Cond,
3070 bool AllowModify) const {
3071 if (I->getOpcode() == AMDGPU::S_BRANCH) {
3072 // Unconditional Branch
3073 TBB = I->getOperand(i: 0).getMBB();
3074 return false;
3075 }
3076
3077 BranchPredicate Pred = getBranchPredicate(Opcode: I->getOpcode());
3078 if (Pred == INVALID_BR)
3079 return true;
3080
3081 MachineBasicBlock *CondBB = I->getOperand(i: 0).getMBB();
3082 Cond.push_back(Elt: MachineOperand::CreateImm(Val: Pred));
3083 Cond.push_back(Elt: I->getOperand(i: 1)); // Save the branch register.
3084
3085 ++I;
3086
3087 if (I == MBB.end()) {
3088 // Conditional branch followed by fall-through.
3089 TBB = CondBB;
3090 return false;
3091 }
3092
3093 if (I->getOpcode() == AMDGPU::S_BRANCH) {
3094 TBB = CondBB;
3095 FBB = I->getOperand(i: 0).getMBB();
3096 return false;
3097 }
3098
3099 return true;
3100}
3101
3102bool SIInstrInfo::analyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB,
3103 MachineBasicBlock *&FBB,
3104 SmallVectorImpl<MachineOperand> &Cond,
3105 bool AllowModify) const {
3106 MachineBasicBlock::iterator I = MBB.getFirstTerminator();
3107 auto E = MBB.end();
3108 if (I == E)
3109 return false;
3110
3111 // Skip over the instructions that are artificially terminators for special
3112 // exec management.
3113 while (I != E && !I->isBranch() && !I->isReturn()) {
3114 switch (I->getOpcode()) {
3115 case AMDGPU::S_MOV_B64_term:
3116 case AMDGPU::S_XOR_B64_term:
3117 case AMDGPU::S_OR_B64_term:
3118 case AMDGPU::S_ANDN2_B64_term:
3119 case AMDGPU::S_AND_B64_term:
3120 case AMDGPU::S_AND_SAVEEXEC_B64_term:
3121 case AMDGPU::S_MOV_B32_term:
3122 case AMDGPU::S_XOR_B32_term:
3123 case AMDGPU::S_OR_B32_term:
3124 case AMDGPU::S_ANDN2_B32_term:
3125 case AMDGPU::S_AND_B32_term:
3126 case AMDGPU::S_AND_SAVEEXEC_B32_term:
3127 case AMDGPU::V_CMPX_EQ_U32_nosdst_e32_term:
3128 case AMDGPU::V_CMPX_EQ_U64_nosdst_e32_term:
3129 break;
3130 case AMDGPU::SI_IF:
3131 case AMDGPU::SI_ELSE:
3132 case AMDGPU::SI_KILL_I1_TERMINATOR:
3133 case AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR:
3134 // FIXME: It's messy that these need to be considered here at all.
3135 return true;
3136 default:
3137 llvm_unreachable("unexpected non-branch terminator inst");
3138 }
3139
3140 ++I;
3141 }
3142
3143 if (I == E)
3144 return false;
3145
3146 return analyzeBranchImpl(MBB, I, TBB, FBB, Cond, AllowModify);
3147}
3148
3149unsigned SIInstrInfo::removeBranch(MachineBasicBlock &MBB,
3150 int *BytesRemoved) const {
3151 unsigned Count = 0;
3152 unsigned RemovedSize = 0;
3153 for (MachineInstr &MI : llvm::make_early_inc_range(Range: MBB.terminators())) {
3154 // Skip over artificial terminators when removing instructions.
3155 if (MI.isBranch() || MI.isReturn()) {
3156 RemovedSize += getInstSizeInBytes(MI);
3157 MI.eraseFromParent();
3158 ++Count;
3159 }
3160 }
3161
3162 if (BytesRemoved)
3163 *BytesRemoved = RemovedSize;
3164
3165 return Count;
3166}
3167
3168// Copy the flags onto the implicit condition register operand.
3169static void preserveCondRegFlags(MachineOperand &CondReg,
3170 const MachineOperand &OrigCond) {
3171 CondReg.setIsUndef(OrigCond.isUndef());
3172 CondReg.setIsKill(OrigCond.isKill());
3173}
3174
3175unsigned SIInstrInfo::insertBranch(MachineBasicBlock &MBB,
3176 MachineBasicBlock *TBB,
3177 MachineBasicBlock *FBB,
3178 ArrayRef<MachineOperand> Cond,
3179 const DebugLoc &DL,
3180 int *BytesAdded) const {
3181 if (!FBB && Cond.empty()) {
3182 BuildMI(BB: &MBB, MIMD: DL, MCID: get(Opcode: AMDGPU::S_BRANCH))
3183 .addMBB(MBB: TBB);
3184 if (BytesAdded)
3185 *BytesAdded = ST.hasOffset3fBug() ? 8 : 4;
3186 return 1;
3187 }
3188
3189 assert(TBB && Cond[0].isImm());
3190
3191 unsigned Opcode
3192 = getBranchOpcode(Cond: static_cast<BranchPredicate>(Cond[0].getImm()));
3193
3194 if (!FBB) {
3195 MachineInstr *CondBr =
3196 BuildMI(BB: &MBB, MIMD: DL, MCID: get(Opcode))
3197 .addMBB(MBB: TBB);
3198
3199 // Copy the flags onto the implicit condition register operand.
3200 preserveCondRegFlags(CondReg&: CondBr->getOperand(i: 1), OrigCond: Cond[1]);
3201 fixImplicitOperands(MI&: *CondBr);
3202
3203 if (BytesAdded)
3204 *BytesAdded = ST.hasOffset3fBug() ? 8 : 4;
3205 return 1;
3206 }
3207
3208 assert(TBB && FBB);
3209
3210 MachineInstr *CondBr =
3211 BuildMI(BB: &MBB, MIMD: DL, MCID: get(Opcode))
3212 .addMBB(MBB: TBB);
3213 fixImplicitOperands(MI&: *CondBr);
3214 BuildMI(BB: &MBB, MIMD: DL, MCID: get(Opcode: AMDGPU::S_BRANCH))
3215 .addMBB(MBB: FBB);
3216
3217 MachineOperand &CondReg = CondBr->getOperand(i: 1);
3218 CondReg.setIsUndef(Cond[1].isUndef());
3219 CondReg.setIsKill(Cond[1].isKill());
3220
3221 if (BytesAdded)
3222 *BytesAdded = ST.hasOffset3fBug() ? 16 : 8;
3223
3224 return 2;
3225}
3226
3227bool SIInstrInfo::reverseBranchCondition(
3228 SmallVectorImpl<MachineOperand> &Cond) const {
3229 if (Cond.size() != 2) {
3230 return true;
3231 }
3232
3233 if (Cond[0].isImm()) {
3234 Cond[0].setImm(-Cond[0].getImm());
3235 return false;
3236 }
3237
3238 return true;
3239}
3240
3241bool SIInstrInfo::canInsertSelect(const MachineBasicBlock &MBB,
3242 ArrayRef<MachineOperand> Cond,
3243 Register DstReg, Register TrueReg,
3244 Register FalseReg, int &CondCycles,
3245 int &TrueCycles, int &FalseCycles) const {
3246 switch (Cond[0].getImm()) {
3247 case VCCNZ:
3248 case VCCZ: {
3249 const MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
3250 const TargetRegisterClass *RC = MRI.getRegClass(Reg: TrueReg);
3251 if (MRI.getRegClass(Reg: FalseReg) != RC)
3252 return false;
3253
3254 int NumInsts = AMDGPU::getRegBitWidth(RC: *RC) / 32;
3255 CondCycles = TrueCycles = FalseCycles = NumInsts; // ???
3256
3257 // Limit to equal cost for branch vs. N v_cndmask_b32s.
3258 return RI.hasVGPRs(RC) && NumInsts <= 6;
3259 }
3260 case SCC_TRUE:
3261 case SCC_FALSE: {
3262 // FIXME: We could insert for VGPRs if we could replace the original compare
3263 // with a vector one.
3264 const MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
3265 const TargetRegisterClass *RC = MRI.getRegClass(Reg: TrueReg);
3266 if (MRI.getRegClass(Reg: FalseReg) != RC)
3267 return false;
3268
3269 int NumInsts = AMDGPU::getRegBitWidth(RC: *RC) / 32;
3270
3271 // Multiples of 8 can do s_cselect_b64
3272 if (NumInsts % 2 == 0)
3273 NumInsts /= 2;
3274
3275 CondCycles = TrueCycles = FalseCycles = NumInsts; // ???
3276 return RI.isSGPRClass(RC);
3277 }
3278 default:
3279 return false;
3280 }
3281}
3282
3283void SIInstrInfo::insertSelect(MachineBasicBlock &MBB,
3284 MachineBasicBlock::iterator I, const DebugLoc &DL,
3285 Register DstReg, ArrayRef<MachineOperand> Cond,
3286 Register TrueReg, Register FalseReg) const {
3287 BranchPredicate Pred = static_cast<BranchPredicate>(Cond[0].getImm());
3288 if (Pred == VCCZ || Pred == SCC_FALSE) {
3289 Pred = static_cast<BranchPredicate>(-Pred);
3290 std::swap(a&: TrueReg, b&: FalseReg);
3291 }
3292
3293 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
3294 const TargetRegisterClass *DstRC = MRI.getRegClass(Reg: DstReg);
3295 unsigned DstSize = RI.getRegSizeInBits(RC: *DstRC);
3296
3297 if (DstSize == 32) {
3298 MachineInstr *Select;
3299 if (Pred == SCC_TRUE) {
3300 Select = BuildMI(BB&: MBB, I, MIMD: DL, MCID: get(Opcode: AMDGPU::S_CSELECT_B32), DestReg: DstReg)
3301 .addReg(RegNo: TrueReg)
3302 .addReg(RegNo: FalseReg);
3303 } else {
3304 // Instruction's operands are backwards from what is expected.
3305 Select = BuildMI(BB&: MBB, I, MIMD: DL, MCID: get(Opcode: AMDGPU::V_CNDMASK_B32_e32), DestReg: DstReg)
3306 .addReg(RegNo: FalseReg)
3307 .addReg(RegNo: TrueReg);
3308 }
3309
3310 preserveCondRegFlags(CondReg&: Select->getOperand(i: 3), OrigCond: Cond[1]);
3311 return;
3312 }
3313
3314 if (DstSize == 64 && Pred == SCC_TRUE) {
3315 MachineInstr *Select =
3316 BuildMI(BB&: MBB, I, MIMD: DL, MCID: get(Opcode: AMDGPU::S_CSELECT_B64), DestReg: DstReg)
3317 .addReg(RegNo: TrueReg)
3318 .addReg(RegNo: FalseReg);
3319
3320 preserveCondRegFlags(CondReg&: Select->getOperand(i: 3), OrigCond: Cond[1]);
3321 return;
3322 }
3323
3324 static const int16_t Sub0_15[] = {
3325 AMDGPU::sub0, AMDGPU::sub1, AMDGPU::sub2, AMDGPU::sub3,
3326 AMDGPU::sub4, AMDGPU::sub5, AMDGPU::sub6, AMDGPU::sub7,
3327 AMDGPU::sub8, AMDGPU::sub9, AMDGPU::sub10, AMDGPU::sub11,
3328 AMDGPU::sub12, AMDGPU::sub13, AMDGPU::sub14, AMDGPU::sub15,
3329 };
3330
3331 static const int16_t Sub0_15_64[] = {
3332 AMDGPU::sub0_sub1, AMDGPU::sub2_sub3,
3333 AMDGPU::sub4_sub5, AMDGPU::sub6_sub7,
3334 AMDGPU::sub8_sub9, AMDGPU::sub10_sub11,
3335 AMDGPU::sub12_sub13, AMDGPU::sub14_sub15,
3336 };
3337
3338 unsigned SelOp = AMDGPU::V_CNDMASK_B32_e32;
3339 const TargetRegisterClass *EltRC = &AMDGPU::VGPR_32RegClass;
3340 const int16_t *SubIndices = Sub0_15;
3341 int NElts = DstSize / 32;
3342
3343 // 64-bit select is only available for SALU.
3344 // TODO: Split 96-bit into 64-bit and 32-bit, not 3x 32-bit.
3345 if (Pred == SCC_TRUE) {
3346 if (NElts % 2) {
3347 SelOp = AMDGPU::S_CSELECT_B32;
3348 EltRC = &AMDGPU::SGPR_32RegClass;
3349 } else {
3350 SelOp = AMDGPU::S_CSELECT_B64;
3351 EltRC = &AMDGPU::SGPR_64RegClass;
3352 SubIndices = Sub0_15_64;
3353 NElts /= 2;
3354 }
3355 }
3356
3357 MachineInstrBuilder MIB = BuildMI(
3358 BB&: MBB, I, MIMD: DL, MCID: get(Opcode: AMDGPU::REG_SEQUENCE), DestReg: DstReg);
3359
3360 I = MIB->getIterator();
3361
3362 SmallVector<Register, 8> Regs;
3363 for (int Idx = 0; Idx != NElts; ++Idx) {
3364 Register DstElt = MRI.createVirtualRegister(RegClass: EltRC);
3365 Regs.push_back(Elt: DstElt);
3366
3367 unsigned SubIdx = SubIndices[Idx];
3368
3369 MachineInstr *Select;
3370 if (SelOp == AMDGPU::V_CNDMASK_B32_e32) {
3371 Select = BuildMI(BB&: MBB, I, MIMD: DL, MCID: get(Opcode: SelOp), DestReg: DstElt)
3372 .addReg(RegNo: FalseReg, Flags: {}, SubReg: SubIdx)
3373 .addReg(RegNo: TrueReg, Flags: {}, SubReg: SubIdx);
3374 } else {
3375 Select = BuildMI(BB&: MBB, I, MIMD: DL, MCID: get(Opcode: SelOp), DestReg: DstElt)
3376 .addReg(RegNo: TrueReg, Flags: {}, SubReg: SubIdx)
3377 .addReg(RegNo: FalseReg, Flags: {}, SubReg: SubIdx);
3378 }
3379
3380 preserveCondRegFlags(CondReg&: Select->getOperand(i: 3), OrigCond: Cond[1]);
3381 fixImplicitOperands(MI&: *Select);
3382
3383 MIB.addReg(RegNo: DstElt)
3384 .addImm(Val: SubIdx);
3385 }
3386}
3387
3388bool SIInstrInfo::isXcntDrain(const MachineInstr &MI) {
3389
3390 if (MI.isBranch() || MI.isCall() || MI.isReturn() || MI.isIndirectBranch())
3391 return true;
3392
3393 switch (MI.getOpcode()) {
3394 case AMDGPU::S_ENDPGM:
3395 case AMDGPU::S_ENDPGM_SAVED:
3396 case AMDGPU::S_TRAP:
3397 case AMDGPU::S_GETREG_B32:
3398 case AMDGPU::S_SETREG_B32:
3399 case AMDGPU::S_SETREG_B32_mode:
3400 case AMDGPU::S_SETREG_IMM32_B32:
3401 case AMDGPU::S_SETREG_IMM32_B32_mode:
3402 case AMDGPU::S_SENDMSG:
3403 case AMDGPU::S_SENDMSGHALT:
3404 case AMDGPU::S_SENDMSG_RTN_B32:
3405 case AMDGPU::S_SENDMSG_RTN_B64:
3406 case AMDGPU::S_BARRIER_WAIT:
3407 case AMDGPU::S_BARRIER_SIGNAL_M0:
3408 case AMDGPU::S_BARRIER_SIGNAL_IMM:
3409 case AMDGPU::S_BARRIER_SIGNAL_ISFIRST_M0:
3410 case AMDGPU::S_BARRIER_SIGNAL_ISFIRST_IMM:
3411 return true;
3412 default:
3413 return false;
3414 }
3415}
3416
3417bool SIInstrInfo::isFoldableCopy(const MachineInstr &MI) {
3418 switch (MI.getOpcode()) {
3419 case AMDGPU::V_MOV_B16_t16_e32:
3420 case AMDGPU::V_MOV_B16_t16_e64:
3421 case AMDGPU::V_MOV_B32_e32:
3422 case AMDGPU::V_MOV_B32_e64:
3423 case AMDGPU::V_MOV_B64_PSEUDO:
3424 case AMDGPU::V_MOV_B64_e32:
3425 case AMDGPU::V_MOV_B64_e64:
3426 case AMDGPU::S_MOV_B32:
3427 case AMDGPU::S_MOV_B64:
3428 case AMDGPU::S_MOV_B64_IMM_PSEUDO:
3429 case AMDGPU::COPY:
3430 case AMDGPU::WWM_COPY:
3431 case AMDGPU::V_ACCVGPR_WRITE_B32_e64:
3432 case AMDGPU::V_ACCVGPR_READ_B32_e64:
3433 case AMDGPU::V_ACCVGPR_MOV_B32:
3434 case AMDGPU::AV_MOV_B32_IMM_PSEUDO:
3435 case AMDGPU::AV_MOV_B64_IMM_PSEUDO:
3436 return true;
3437 default:
3438 return false;
3439 }
3440}
3441
3442unsigned SIInstrInfo::getFoldableCopySrcIdx(const MachineInstr &MI) {
3443 switch (MI.getOpcode()) {
3444 case AMDGPU::V_MOV_B16_t16_e32:
3445 case AMDGPU::V_MOV_B16_t16_e64:
3446 return 2;
3447 case AMDGPU::V_MOV_B32_e32:
3448 case AMDGPU::V_MOV_B32_e64:
3449 case AMDGPU::V_MOV_B64_PSEUDO:
3450 case AMDGPU::V_MOV_B64_e32:
3451 case AMDGPU::V_MOV_B64_e64:
3452 case AMDGPU::S_MOV_B32:
3453 case AMDGPU::S_MOV_B64:
3454 case AMDGPU::S_MOV_B64_IMM_PSEUDO:
3455 case AMDGPU::COPY:
3456 case AMDGPU::WWM_COPY:
3457 case AMDGPU::V_ACCVGPR_WRITE_B32_e64:
3458 case AMDGPU::V_ACCVGPR_READ_B32_e64:
3459 case AMDGPU::V_ACCVGPR_MOV_B32:
3460 case AMDGPU::AV_MOV_B32_IMM_PSEUDO:
3461 case AMDGPU::AV_MOV_B64_IMM_PSEUDO:
3462 return 1;
3463 default:
3464 llvm_unreachable("MI is not a foldable copy");
3465 }
3466}
3467
// Named operands that removeModOperands() strips from an instruction.
// removeModOperands() walks this table back to front, looking each name up in
// the instruction's opcode and removing the operand if it exists.
// NOTE(review): the entries appear to be listed in ascending operand-index
// order so that the reverse walk removes higher indices first -- confirm
// before reordering or appending.
static constexpr AMDGPU::OpName ModifierOpNames[] = {
    AMDGPU::OpName::src0_modifiers, AMDGPU::OpName::src1_modifiers,
    AMDGPU::OpName::src2_modifiers, AMDGPU::OpName::clamp,
    AMDGPU::OpName::omod, AMDGPU::OpName::op_sel};
3472
3473void SIInstrInfo::removeModOperands(MachineInstr &MI) const {
3474 unsigned Opc = MI.getOpcode();
3475 for (AMDGPU::OpName Name : reverse(C: ModifierOpNames)) {
3476 int Idx = AMDGPU::getNamedOperandIdx(Opcode: Opc, Name);
3477 if (Idx >= 0)
3478 MI.removeOperand(OpNo: Idx);
3479 }
3480}
3481
3482void SIInstrInfo::mutateAndCleanupImplicit(MachineInstr &MI,
3483 const MCInstrDesc &NewDesc) const {
3484 MI.setDesc(NewDesc);
3485
3486 // Remove any leftover implicit operands from mutating the instruction. e.g.
3487 // if we replace an s_and_b32 with a copy, we don't need the implicit scc def
3488 // anymore.
3489 const MCInstrDesc &Desc = MI.getDesc();
3490 unsigned NumOps = Desc.getNumOperands() + Desc.implicit_uses().size() +
3491 Desc.implicit_defs().size();
3492
3493 for (unsigned I = MI.getNumOperands() - 1; I >= NumOps; --I)
3494 MI.removeOperand(OpNo: I);
3495}
3496
3497std::optional<int64_t> SIInstrInfo::extractSubregFromImm(int64_t Imm,
3498 unsigned SubRegIndex) {
3499 switch (SubRegIndex) {
3500 case AMDGPU::NoSubRegister:
3501 return Imm;
3502 case AMDGPU::sub0:
3503 return SignExtend64<32>(x: Imm);
3504 case AMDGPU::sub1:
3505 return SignExtend64<32>(x: Imm >> 32);
3506 case AMDGPU::lo16:
3507 return SignExtend64<16>(x: Imm);
3508 case AMDGPU::hi16:
3509 return SignExtend64<16>(x: Imm >> 16);
3510 case AMDGPU::sub1_lo16:
3511 return SignExtend64<16>(x: Imm >> 32);
3512 case AMDGPU::sub1_hi16:
3513 return SignExtend64<16>(x: Imm >> 48);
3514 default:
3515 return std::nullopt;
3516 }
3517
3518 llvm_unreachable("covered subregister switch");
3519}
3520
3521static unsigned getNewFMAAKInst(const GCNSubtarget &ST, unsigned Opc) {
3522 switch (Opc) {
3523 case AMDGPU::V_MAC_F16_e32:
3524 case AMDGPU::V_MAC_F16_e64:
3525 case AMDGPU::V_MAD_F16_e64:
3526 return AMDGPU::V_MADAK_F16;
3527 case AMDGPU::V_MAC_F32_e32:
3528 case AMDGPU::V_MAC_F32_e64:
3529 case AMDGPU::V_MAD_F32_e64:
3530 return AMDGPU::V_MADAK_F32;
3531 case AMDGPU::V_FMAC_F32_e32:
3532 case AMDGPU::V_FMAC_F32_e64:
3533 case AMDGPU::V_FMA_F32_e64:
3534 return AMDGPU::V_FMAAK_F32;
3535 case AMDGPU::V_FMAC_F16_e32:
3536 case AMDGPU::V_FMAC_F16_e64:
3537 case AMDGPU::V_FMAC_F16_t16_e64:
3538 case AMDGPU::V_FMAC_F16_fake16_e64:
3539 case AMDGPU::V_FMAC_F16_t16_e32:
3540 case AMDGPU::V_FMAC_F16_fake16_e32:
3541 case AMDGPU::V_FMA_F16_e64:
3542 return ST.hasTrue16BitInsts() ? ST.useRealTrue16Insts()
3543 ? AMDGPU::V_FMAAK_F16_t16
3544 : AMDGPU::V_FMAAK_F16_fake16
3545 : AMDGPU::V_FMAAK_F16;
3546 case AMDGPU::V_FMAC_F64_e32:
3547 case AMDGPU::V_FMAC_F64_e64:
3548 case AMDGPU::V_FMA_F64_e64:
3549 return AMDGPU::V_FMAAK_F64;
3550 default:
3551 llvm_unreachable("invalid instruction");
3552 }
3553}
3554
3555static unsigned getNewFMAMKInst(const GCNSubtarget &ST, unsigned Opc) {
3556 switch (Opc) {
3557 case AMDGPU::V_MAC_F16_e32:
3558 case AMDGPU::V_MAC_F16_e64:
3559 case AMDGPU::V_MAD_F16_e64:
3560 return AMDGPU::V_MADMK_F16;
3561 case AMDGPU::V_MAC_F32_e32:
3562 case AMDGPU::V_MAC_F32_e64:
3563 case AMDGPU::V_MAD_F32_e64:
3564 return AMDGPU::V_MADMK_F32;
3565 case AMDGPU::V_FMAC_F32_e32:
3566 case AMDGPU::V_FMAC_F32_e64:
3567 case AMDGPU::V_FMA_F32_e64:
3568 return AMDGPU::V_FMAMK_F32;
3569 case AMDGPU::V_FMAC_F16_e32:
3570 case AMDGPU::V_FMAC_F16_e64:
3571 case AMDGPU::V_FMAC_F16_t16_e64:
3572 case AMDGPU::V_FMAC_F16_fake16_e64:
3573 case AMDGPU::V_FMAC_F16_t16_e32:
3574 case AMDGPU::V_FMAC_F16_fake16_e32:
3575 case AMDGPU::V_FMA_F16_e64:
3576 return ST.hasTrue16BitInsts() ? ST.useRealTrue16Insts()
3577 ? AMDGPU::V_FMAMK_F16_t16
3578 : AMDGPU::V_FMAMK_F16_fake16
3579 : AMDGPU::V_FMAMK_F16;
3580 case AMDGPU::V_FMAC_F64_e32:
3581 case AMDGPU::V_FMAC_F64_e64:
3582 case AMDGPU::V_FMA_F64_e64:
3583 return AMDGPU::V_FMAMK_F64;
3584 default:
3585 llvm_unreachable("invalid instruction");
3586 }
3587}
3588
3589bool SIInstrInfo::foldImmediate(MachineInstr &UseMI, MachineInstr &DefMI,
3590 Register Reg, MachineRegisterInfo *MRI) const {
3591 int64_t Imm;
3592 if (!getConstValDefinedInReg(MI: DefMI, Reg, ImmVal&: Imm))
3593 return false;
3594
3595 const bool HasMultipleUses = !MRI->hasOneNonDBGUse(RegNo: Reg);
3596
3597 assert(!DefMI.getOperand(0).getSubReg() && "Expected SSA form");
3598
3599 unsigned Opc = UseMI.getOpcode();
3600 if (Opc == AMDGPU::COPY) {
3601 assert(!UseMI.getOperand(0).getSubReg() && "Expected SSA form");
3602
3603 Register DstReg = UseMI.getOperand(i: 0).getReg();
3604 Register UseSubReg = UseMI.getOperand(i: 1).getSubReg();
3605
3606 const TargetRegisterClass *DstRC = RI.getRegClassForReg(MRI: *MRI, Reg: DstReg);
3607
3608 if (HasMultipleUses) {
3609 // TODO: This should fold in more cases with multiple use, but we need to
3610 // more carefully consider what those uses are.
3611 unsigned ImmDefSize = RI.getRegSizeInBits(RC: *MRI->getRegClass(Reg));
3612
3613 // Avoid breaking up a 64-bit inline immediate into a subregister extract.
3614 if (UseSubReg != AMDGPU::NoSubRegister && ImmDefSize == 64)
3615 return false;
3616
3617 // Most of the time folding a 32-bit inline constant is free (though this
3618 // might not be true if we can't later fold it into a real user).
3619 //
3620 // FIXME: This isInlineConstant check is imprecise if
3621 // getConstValDefinedInReg handled the tricky non-mov cases.
3622 if (ImmDefSize == 32 &&
3623 !isInlineConstant(ImmVal: Imm, OperandType: AMDGPU::OPERAND_REG_IMM_INT32))
3624 return false;
3625 }
3626
3627 bool Is16Bit = UseSubReg != AMDGPU::NoSubRegister &&
3628 RI.getSubRegIdxSize(Idx: UseSubReg) == 16;
3629
3630 if (Is16Bit) {
3631 if (RI.hasVGPRs(RC: DstRC))
3632 return false; // Do not clobber vgpr_hi16
3633
3634 if (DstReg.isVirtual() && UseSubReg != AMDGPU::lo16)
3635 return false;
3636 }
3637
3638 MachineFunction *MF = UseMI.getMF();
3639
3640 unsigned NewOpc = AMDGPU::INSTRUCTION_LIST_END;
3641 MCRegister MovDstPhysReg =
3642 DstReg.isPhysical() ? DstReg.asMCReg() : MCRegister();
3643
3644 std::optional<int64_t> SubRegImm = extractSubregFromImm(Imm, SubRegIndex: UseSubReg);
3645
3646 // TODO: Try to fold with AMDGPU::V_MOV_B16_t16_e64
3647 for (unsigned MovOp :
3648 {AMDGPU::S_MOV_B32, AMDGPU::V_MOV_B32_e32, AMDGPU::S_MOV_B64,
3649 AMDGPU::V_MOV_B64_PSEUDO, AMDGPU::V_ACCVGPR_WRITE_B32_e64}) {
3650 const MCInstrDesc &MovDesc = get(Opcode: MovOp);
3651
3652 const TargetRegisterClass *MovDstRC = getRegClass(MCID: MovDesc, OpNum: 0);
3653 if (Is16Bit) {
3654 // We just need to find a correctly sized register class, so the
3655 // subregister index compatibility doesn't matter since we're statically
3656 // extracting the immediate value.
3657 MovDstRC = RI.getMatchingSuperRegClass(A: MovDstRC, B: DstRC, Idx: AMDGPU::lo16);
3658 if (!MovDstRC)
3659 continue;
3660
3661 if (MovDstPhysReg) {
3662 // FIXME: We probably should not do this. If there is a live value in
3663 // the high half of the register, it will be corrupted.
3664 MovDstPhysReg =
3665 RI.getMatchingSuperReg(Reg: MovDstPhysReg, SubIdx: AMDGPU::lo16, RC: MovDstRC);
3666 if (!MovDstPhysReg)
3667 continue;
3668 }
3669 }
3670
3671 // Result class isn't the right size, try the next instruction.
3672 if (MovDstPhysReg) {
3673 if (!MovDstRC->contains(Reg: MovDstPhysReg))
3674 return false;
3675 } else if (!MRI->constrainRegClass(Reg: DstReg, RC: MovDstRC)) {
3676 // TODO: This will be overly conservative in the case of 16-bit virtual
3677 // SGPRs. We could hack up the virtual register uses to use a compatible
3678 // 32-bit class.
3679 continue;
3680 }
3681
3682 const MCOperandInfo &OpInfo = MovDesc.operands()[1];
3683
3684 // Ensure the interpreted immediate value is a valid operand in the new
3685 // mov.
3686 //
3687 // FIXME: isImmOperandLegal should have form that doesn't require existing
3688 // MachineInstr or MachineOperand
3689 if (!RI.opCanUseLiteralConstant(OpType: OpInfo.OperandType) &&
3690 !isInlineConstant(ImmVal: *SubRegImm, OperandType: OpInfo.OperandType))
3691 break;
3692
3693 NewOpc = MovOp;
3694 break;
3695 }
3696
3697 if (NewOpc == AMDGPU::INSTRUCTION_LIST_END)
3698 return false;
3699
3700 if (Is16Bit) {
3701 UseMI.getOperand(i: 0).setSubReg(AMDGPU::NoSubRegister);
3702 if (MovDstPhysReg)
3703 UseMI.getOperand(i: 0).setReg(MovDstPhysReg);
3704 assert(UseMI.getOperand(1).getReg().isVirtual());
3705 }
3706
3707 const MCInstrDesc &NewMCID = get(Opcode: NewOpc);
3708 UseMI.setDesc(NewMCID);
3709 UseMI.getOperand(i: 1).ChangeToImmediate(ImmVal: *SubRegImm);
3710 UseMI.addImplicitDefUseOperands(MF&: *MF);
3711 return true;
3712 }
3713
3714 if (HasMultipleUses)
3715 return false;
3716
3717 if (Opc == AMDGPU::V_MAD_F32_e64 || Opc == AMDGPU::V_MAC_F32_e64 ||
3718 Opc == AMDGPU::V_MAD_F16_e64 || Opc == AMDGPU::V_MAC_F16_e64 ||
3719 Opc == AMDGPU::V_FMA_F32_e64 || Opc == AMDGPU::V_FMAC_F32_e64 ||
3720 Opc == AMDGPU::V_FMA_F16_e64 || Opc == AMDGPU::V_FMAC_F16_e64 ||
3721 Opc == AMDGPU::V_FMAC_F16_t16_e64 ||
3722 Opc == AMDGPU::V_FMAC_F16_fake16_e64 || Opc == AMDGPU::V_FMA_F64_e64 ||
3723 Opc == AMDGPU::V_FMAC_F64_e64) {
3724 // Don't fold if we are using source or output modifiers. The new VOP2
3725 // instructions don't have them.
3726 if (hasAnyModifiersSet(MI: UseMI))
3727 return false;
3728
3729 // If this is a free constant, there's no reason to do this.
3730 // TODO: We could fold this here instead of letting SIFoldOperands do it
3731 // later.
3732 int Src0Idx = getNamedOperandIdx(Opcode: UseMI.getOpcode(), Name: AMDGPU::OpName::src0);
3733
3734 // Any src operand can be used for the legality check.
3735 if (isInlineConstant(MI: UseMI, OpIdx: Src0Idx, ImmVal: Imm))
3736 return false;
3737
3738 MachineOperand *Src0 = &UseMI.getOperand(i: Src0Idx);
3739
3740 MachineOperand *Src1 = getNamedOperand(MI&: UseMI, OperandName: AMDGPU::OpName::src1);
3741 MachineOperand *Src2 = getNamedOperand(MI&: UseMI, OperandName: AMDGPU::OpName::src2);
3742
3743 auto CopyRegOperandToNarrowerRC =
3744 [MRI, this](MachineInstr &MI, unsigned OpNo,
3745 const TargetRegisterClass *NewRC) -> void {
3746 if (!MI.getOperand(i: OpNo).isReg())
3747 return;
3748 Register Reg = MI.getOperand(i: OpNo).getReg();
3749 const TargetRegisterClass *RC = RI.getRegClassForReg(MRI: *MRI, Reg);
3750 if (RI.getCommonSubClass(A: RC, B: NewRC) != NewRC)
3751 return;
3752 Register Tmp = MRI->createVirtualRegister(RegClass: NewRC);
3753 BuildMI(BB&: *MI.getParent(), I: MI.getIterator(), MIMD: MI.getDebugLoc(),
3754 MCID: get(Opcode: AMDGPU::COPY), DestReg: Tmp)
3755 .addReg(RegNo: Reg);
3756 MI.getOperand(i: OpNo).setReg(Tmp);
3757 MI.getOperand(i: OpNo).setIsKill();
3758 };
3759
3760 // Multiplied part is the constant: Use v_madmk_{f16, f32}.
3761 if ((Src0->isReg() && Src0->getReg() == Reg) ||
3762 (Src1->isReg() && Src1->getReg() == Reg)) {
3763 MachineOperand *RegSrc =
3764 Src1->isReg() && Src1->getReg() == Reg ? Src0 : Src1;
3765 if (!RegSrc->isReg())
3766 return false;
3767 if (RI.isSGPRClass(RC: MRI->getRegClass(Reg: RegSrc->getReg())) &&
3768 ST.getConstantBusLimit(Opcode: Opc) < 2)
3769 return false;
3770
3771 if (!Src2->isReg() || RI.isSGPRClass(RC: MRI->getRegClass(Reg: Src2->getReg())))
3772 return false;
3773
3774 // If src2 is also a literal constant then we have to choose which one to
3775 // fold. In general it is better to choose madak so that the other literal
3776 // can be materialized in an sgpr instead of a vgpr:
3777 // s_mov_b32 s0, literal
3778 // v_madak_f32 v0, s0, v0, literal
3779 // Instead of:
3780 // v_mov_b32 v1, literal
3781 // v_madmk_f32 v0, v0, literal, v1
3782 MachineInstr *Def = MRI->getUniqueVRegDef(Reg: Src2->getReg());
3783 if (Def && Def->isMoveImmediate() &&
3784 !isInlineConstant(MO: Def->getOperand(i: 1)))
3785 return false;
3786
3787 unsigned NewOpc = getNewFMAMKInst(ST, Opc);
3788 if (pseudoToMCOpcode(Opcode: NewOpc) == -1)
3789 return false;
3790
3791 const std::optional<int64_t> SubRegImm = extractSubregFromImm(
3792 Imm, SubRegIndex: RegSrc == Src1 ? Src0->getSubReg() : Src1->getSubReg());
3793
3794 // FIXME: This would be a lot easier if we could return a new instruction
3795 // instead of having to modify in place.
3796
3797 Register SrcReg = RegSrc->getReg();
3798 unsigned SrcSubReg = RegSrc->getSubReg();
3799 Src0->setReg(SrcReg);
3800 Src0->setSubReg(SrcSubReg);
3801 Src0->setIsKill(RegSrc->isKill());
3802
3803 if (Opc == AMDGPU::V_MAC_F32_e64 || Opc == AMDGPU::V_MAC_F16_e64 ||
3804 Opc == AMDGPU::V_FMAC_F32_e64 || Opc == AMDGPU::V_FMAC_F16_t16_e64 ||
3805 Opc == AMDGPU::V_FMAC_F16_fake16_e64 ||
3806 Opc == AMDGPU::V_FMAC_F16_e64 || Opc == AMDGPU::V_FMAC_F64_e64)
3807 UseMI.untieRegOperand(
3808 OpIdx: AMDGPU::getNamedOperandIdx(Opcode: Opc, Name: AMDGPU::OpName::src2));
3809
3810 Src1->ChangeToImmediate(ImmVal: *SubRegImm);
3811
3812 removeModOperands(MI&: UseMI);
3813 UseMI.setDesc(get(Opcode: NewOpc));
3814
3815 if (NewOpc == AMDGPU::V_FMAMK_F16_t16 ||
3816 NewOpc == AMDGPU::V_FMAMK_F16_fake16) {
3817 const TargetRegisterClass *NewRC = getRegClass(MCID: get(Opcode: NewOpc), OpNum: 0);
3818 Register Tmp = MRI->createVirtualRegister(RegClass: NewRC);
3819 BuildMI(BB&: *UseMI.getParent(), I: std::next(x: UseMI.getIterator()),
3820 MIMD: UseMI.getDebugLoc(), MCID: get(Opcode: AMDGPU::COPY),
3821 DestReg: UseMI.getOperand(i: 0).getReg())
3822 .addReg(RegNo: Tmp, Flags: RegState::Kill);
3823 UseMI.getOperand(i: 0).setReg(Tmp);
3824 CopyRegOperandToNarrowerRC(UseMI, 1, NewRC);
3825 CopyRegOperandToNarrowerRC(UseMI, 3, NewRC);
3826 }
3827
3828 bool DeleteDef = MRI->use_nodbg_empty(RegNo: Reg);
3829 if (DeleteDef)
3830 DefMI.eraseFromParent();
3831
3832 return true;
3833 }
3834
3835 // Added part is the constant: Use v_madak_{f16, f32}.
3836 if (Src2->isReg() && Src2->getReg() == Reg) {
3837 if (ST.getConstantBusLimit(Opcode: Opc) < 2) {
3838 // Not allowed to use constant bus for another operand.
3839 // We can however allow an inline immediate as src0.
3840 bool Src0Inlined = false;
3841 if (Src0->isReg()) {
3842 // Try to inline constant if possible.
3843 // If the Def moves immediate and the use is single
3844 // We are saving VGPR here.
3845 MachineInstr *Def = MRI->getUniqueVRegDef(Reg: Src0->getReg());
3846 if (Def && Def->isMoveImmediate() &&
3847 isInlineConstant(MO: Def->getOperand(i: 1)) &&
3848 MRI->hasOneNonDBGUse(RegNo: Src0->getReg())) {
3849 Src0->ChangeToImmediate(ImmVal: Def->getOperand(i: 1).getImm());
3850 Src0Inlined = true;
3851 } else if (ST.getConstantBusLimit(Opcode: Opc) <= 1 &&
3852 RI.isSGPRReg(MRI: *MRI, Reg: Src0->getReg())) {
3853 return false;
3854 }
3855 // VGPR is okay as Src0 - fallthrough
3856 }
3857
3858 if (Src1->isReg() && !Src0Inlined) {
3859 // We have one slot for inlinable constant so far - try to fill it
3860 MachineInstr *Def = MRI->getUniqueVRegDef(Reg: Src1->getReg());
3861 if (Def && Def->isMoveImmediate() &&
3862 isInlineConstant(MO: Def->getOperand(i: 1)) &&
3863 MRI->hasOneNonDBGUse(RegNo: Src1->getReg()) && commuteInstruction(MI&: UseMI))
3864 Src0->ChangeToImmediate(ImmVal: Def->getOperand(i: 1).getImm());
3865 else if (RI.isSGPRReg(MRI: *MRI, Reg: Src1->getReg()))
3866 return false;
3867 // VGPR is okay as Src1 - fallthrough
3868 }
3869 }
3870
3871 unsigned NewOpc = getNewFMAAKInst(ST, Opc);
3872 if (pseudoToMCOpcode(Opcode: NewOpc) == -1)
3873 return false;
3874
3875 // FIXME: This would be a lot easier if we could return a new instruction
3876 // instead of having to modify in place.
3877
3878 if (Opc == AMDGPU::V_MAC_F32_e64 || Opc == AMDGPU::V_MAC_F16_e64 ||
3879 Opc == AMDGPU::V_FMAC_F32_e64 || Opc == AMDGPU::V_FMAC_F16_t16_e64 ||
3880 Opc == AMDGPU::V_FMAC_F16_fake16_e64 ||
3881 Opc == AMDGPU::V_FMAC_F16_e64 || Opc == AMDGPU::V_FMAC_F64_e64)
3882 UseMI.untieRegOperand(
3883 OpIdx: AMDGPU::getNamedOperandIdx(Opcode: Opc, Name: AMDGPU::OpName::src2));
3884
3885 const std::optional<int64_t> SubRegImm =
3886 extractSubregFromImm(Imm, SubRegIndex: Src2->getSubReg());
3887
3888 // ChangingToImmediate adds Src2 back to the instruction.
3889 Src2->ChangeToImmediate(ImmVal: *SubRegImm);
3890
3891 // These come before src2.
3892 removeModOperands(MI&: UseMI);
3893 UseMI.setDesc(get(Opcode: NewOpc));
3894
3895 if (NewOpc == AMDGPU::V_FMAAK_F16_t16 ||
3896 NewOpc == AMDGPU::V_FMAAK_F16_fake16) {
3897 const TargetRegisterClass *NewRC = getRegClass(MCID: get(Opcode: NewOpc), OpNum: 0);
3898 Register Tmp = MRI->createVirtualRegister(RegClass: NewRC);
3899 BuildMI(BB&: *UseMI.getParent(), I: std::next(x: UseMI.getIterator()),
3900 MIMD: UseMI.getDebugLoc(), MCID: get(Opcode: AMDGPU::COPY),
3901 DestReg: UseMI.getOperand(i: 0).getReg())
3902 .addReg(RegNo: Tmp, Flags: RegState::Kill);
3903 UseMI.getOperand(i: 0).setReg(Tmp);
3904 CopyRegOperandToNarrowerRC(UseMI, 1, NewRC);
3905 CopyRegOperandToNarrowerRC(UseMI, 2, NewRC);
3906 }
3907
3908 // It might happen that UseMI was commuted
3909 // and we now have SGPR as SRC1. If so 2 inlined
3910 // constant and SGPR are illegal.
3911 legalizeOperands(MI&: UseMI);
3912
3913 bool DeleteDef = MRI->use_nodbg_empty(RegNo: Reg);
3914 if (DeleteDef)
3915 DefMI.eraseFromParent();
3916
3917 return true;
3918 }
3919 }
3920
3921 return false;
3922}
3923
3924static bool
3925memOpsHaveSameBaseOperands(ArrayRef<const MachineOperand *> BaseOps1,
3926 ArrayRef<const MachineOperand *> BaseOps2) {
3927 if (BaseOps1.size() != BaseOps2.size())
3928 return false;
3929 for (size_t I = 0, E = BaseOps1.size(); I < E; ++I) {
3930 if (!BaseOps1[I]->isIdenticalTo(Other: *BaseOps2[I]))
3931 return false;
3932 }
3933 return true;
3934}
3935
3936static bool offsetsDoNotOverlap(LocationSize WidthA, int OffsetA,
3937 LocationSize WidthB, int OffsetB) {
3938 int LowOffset = OffsetA < OffsetB ? OffsetA : OffsetB;
3939 int HighOffset = OffsetA < OffsetB ? OffsetB : OffsetA;
3940 LocationSize LowWidth = (LowOffset == OffsetA) ? WidthA : WidthB;
3941 return LowWidth.hasValue() &&
3942 LowOffset + (int)LowWidth.getValue() <= HighOffset;
3943}
3944
3945bool SIInstrInfo::checkInstOffsetsDoNotOverlap(const MachineInstr &MIa,
3946 const MachineInstr &MIb) const {
3947 SmallVector<const MachineOperand *, 4> BaseOps0, BaseOps1;
3948 int64_t Offset0, Offset1;
3949 LocationSize Dummy0 = LocationSize::precise(Value: 0);
3950 LocationSize Dummy1 = LocationSize::precise(Value: 0);
3951 bool Offset0IsScalable, Offset1IsScalable;
3952 if (!getMemOperandsWithOffsetWidth(LdSt: MIa, BaseOps&: BaseOps0, Offset&: Offset0, OffsetIsScalable&: Offset0IsScalable,
3953 Width&: Dummy0, TRI: &RI) ||
3954 !getMemOperandsWithOffsetWidth(LdSt: MIb, BaseOps&: BaseOps1, Offset&: Offset1, OffsetIsScalable&: Offset1IsScalable,
3955 Width&: Dummy1, TRI: &RI))
3956 return false;
3957
3958 if (!memOpsHaveSameBaseOperands(BaseOps1: BaseOps0, BaseOps2: BaseOps1))
3959 return false;
3960
3961 if (!MIa.hasOneMemOperand() || !MIb.hasOneMemOperand()) {
3962 // FIXME: Handle ds_read2 / ds_write2.
3963 return false;
3964 }
3965 LocationSize Width0 = MIa.memoperands().front()->getSize();
3966 LocationSize Width1 = MIb.memoperands().front()->getSize();
3967 return offsetsDoNotOverlap(WidthA: Width0, OffsetA: Offset0, WidthB: Width1, OffsetB: Offset1);
3968}
3969
3970bool SIInstrInfo::areMemAccessesTriviallyDisjoint(const MachineInstr &MIa,
3971 const MachineInstr &MIb) const {
3972 assert(MIa.mayLoadOrStore() &&
3973 "MIa must load from or modify a memory location");
3974 assert(MIb.mayLoadOrStore() &&
3975 "MIb must load from or modify a memory location");
3976
3977 if (MIa.hasUnmodeledSideEffects() || MIb.hasUnmodeledSideEffects())
3978 return false;
3979
3980 // XXX - Can we relax this between address spaces?
3981 if (MIa.hasOrderedMemoryRef() || MIb.hasOrderedMemoryRef())
3982 return false;
3983
3984 if (isLDSDMA(MI: MIa) || isLDSDMA(MI: MIb))
3985 return false;
3986
3987 if (MIa.isBundle() || MIb.isBundle())
3988 return false;
3989
3990 // TODO: Should we check the address space from the MachineMemOperand? That
3991 // would allow us to distinguish objects we know don't alias based on the
3992 // underlying address space, even if it was lowered to a different one,
3993 // e.g. private accesses lowered to use MUBUF instructions on a scratch
3994 // buffer.
3995 if (isDS(MI: MIa)) {
3996 if (isDS(MI: MIb))
3997 return checkInstOffsetsDoNotOverlap(MIa, MIb);
3998
3999 return !isFLAT(MI: MIb) || isSegmentSpecificFLAT(MI: MIb);
4000 }
4001
4002 if (isMUBUF(MI: MIa) || isMTBUF(MI: MIa)) {
4003 if (isMUBUF(MI: MIb) || isMTBUF(MI: MIb))
4004 return checkInstOffsetsDoNotOverlap(MIa, MIb);
4005
4006 if (isFLAT(MI: MIb))
4007 return isFLATScratch(MI: MIb);
4008
4009 return !isSMRD(MI: MIb);
4010 }
4011
4012 if (isSMRD(MI: MIa)) {
4013 if (isSMRD(MI: MIb))
4014 return checkInstOffsetsDoNotOverlap(MIa, MIb);
4015
4016 if (isFLAT(MI: MIb))
4017 return isFLATScratch(MI: MIb);
4018
4019 return !isMUBUF(MI: MIb) && !isMTBUF(MI: MIb);
4020 }
4021
4022 if (isFLAT(MI: MIa)) {
4023 if (isFLAT(MI: MIb)) {
4024 if ((isFLATScratch(MI: MIa) && isFLATGlobal(MI: MIb)) ||
4025 (isFLATGlobal(MI: MIa) && isFLATScratch(MI: MIb)))
4026 return true;
4027
4028 return checkInstOffsetsDoNotOverlap(MIa, MIb);
4029 }
4030
4031 return false;
4032 }
4033
4034 return false;
4035}
4036
4037static bool getFoldableImm(Register Reg, const MachineRegisterInfo &MRI,
4038 int64_t &Imm, MachineInstr **DefMI = nullptr) {
4039 if (Reg.isPhysical())
4040 return false;
4041 auto *Def = MRI.getUniqueVRegDef(Reg);
4042 if (Def && SIInstrInfo::isFoldableCopy(MI: *Def) && Def->getOperand(i: 1).isImm()) {
4043 Imm = Def->getOperand(i: 1).getImm();
4044 if (DefMI)
4045 *DefMI = Def;
4046 return true;
4047 }
4048 return false;
4049}
4050
4051static bool getFoldableImm(const MachineOperand *MO, int64_t &Imm,
4052 MachineInstr **DefMI = nullptr) {
4053 if (!MO->isReg())
4054 return false;
4055 const MachineFunction *MF = MO->getParent()->getMF();
4056 const MachineRegisterInfo &MRI = MF->getRegInfo();
4057 return getFoldableImm(Reg: MO->getReg(), MRI, Imm, DefMI);
4058}
4059
4060static void updateLiveVariables(LiveVariables *LV, MachineInstr &MI,
4061 MachineInstr &NewMI) {
4062 if (LV) {
4063 unsigned NumOps = MI.getNumOperands();
4064 for (unsigned I = 1; I < NumOps; ++I) {
4065 MachineOperand &Op = MI.getOperand(i: I);
4066 if (Op.isReg() && Op.isKill())
4067 LV->replaceKillInstruction(Reg: Op.getReg(), OldMI&: MI, NewMI);
4068 }
4069 }
4070}
4071
4072static unsigned getNewFMAInst(const GCNSubtarget &ST, unsigned Opc) {
4073 switch (Opc) {
4074 case AMDGPU::V_MAC_F16_e32:
4075 case AMDGPU::V_MAC_F16_e64:
4076 return AMDGPU::V_MAD_F16_e64;
4077 case AMDGPU::V_MAC_F32_e32:
4078 case AMDGPU::V_MAC_F32_e64:
4079 return AMDGPU::V_MAD_F32_e64;
4080 case AMDGPU::V_MAC_LEGACY_F32_e32:
4081 case AMDGPU::V_MAC_LEGACY_F32_e64:
4082 return AMDGPU::V_MAD_LEGACY_F32_e64;
4083 case AMDGPU::V_FMAC_LEGACY_F32_e32:
4084 case AMDGPU::V_FMAC_LEGACY_F32_e64:
4085 return AMDGPU::V_FMA_LEGACY_F32_e64;
4086 case AMDGPU::V_FMAC_F16_e32:
4087 case AMDGPU::V_FMAC_F16_e64:
4088 case AMDGPU::V_FMAC_F16_t16_e64:
4089 case AMDGPU::V_FMAC_F16_fake16_e64:
4090 return ST.hasTrue16BitInsts() ? ST.useRealTrue16Insts()
4091 ? AMDGPU::V_FMA_F16_gfx9_t16_e64
4092 : AMDGPU::V_FMA_F16_gfx9_fake16_e64
4093 : AMDGPU::V_FMA_F16_gfx9_e64;
4094 case AMDGPU::V_FMAC_F32_e32:
4095 case AMDGPU::V_FMAC_F32_e64:
4096 return AMDGPU::V_FMA_F32_e64;
4097 case AMDGPU::V_FMAC_F64_e32:
4098 case AMDGPU::V_FMAC_F64_e64:
4099 return AMDGPU::V_FMA_F64_e64;
4100 default:
4101 llvm_unreachable("invalid instruction");
4102 }
4103}
4104
/// Helper struct for the implementation of 3-address conversion to communicate
/// updates made to instruction operands.
///
/// convertToThreeAddress() passes an instance to convertToThreeAddressImpl()
/// and inspects it afterwards to fix up the instruction named here.
struct SIInstrInfo::ThreeAddressUpdates {
  /// Other instruction whose def is no longer used by the converted
  /// instruction. Null (the default) when there is no such instruction.
  MachineInstr *RemoveMIUse = nullptr;
};
4112
4113MachineInstr *SIInstrInfo::convertToThreeAddress(MachineInstr &MI,
4114 LiveVariables *LV,
4115 LiveIntervals *LIS) const {
4116 MachineBasicBlock &MBB = *MI.getParent();
4117 MachineInstr *CandidateMI = &MI;
4118
4119 if (MI.isBundle()) {
4120 // This is a temporary placeholder for bundle handling that enables us to
4121 // exercise the relevant code paths in the two-address instruction pass.
4122 if (MI.getBundleSize() != 1)
4123 return nullptr;
4124 CandidateMI = MI.getNextNode();
4125 }
4126
4127 ThreeAddressUpdates U;
4128 MachineInstr *NewMI = convertToThreeAddressImpl(MI&: *CandidateMI, Updates&: U);
4129 if (!NewMI)
4130 return nullptr;
4131
4132 if (MI.isBundle()) {
4133 CandidateMI->eraseFromBundle();
4134
4135 for (MachineOperand &MO : MI.all_defs()) {
4136 if (MO.isTied())
4137 MI.untieRegOperand(OpIdx: MO.getOperandNo());
4138 }
4139 } else {
4140 updateLiveVariables(LV, MI, NewMI&: *NewMI);
4141 if (LIS) {
4142 LIS->ReplaceMachineInstrInMaps(MI, NewMI&: *NewMI);
4143 // SlotIndex of defs needs to be updated when converting to early-clobber
4144 MachineOperand &Def = NewMI->getOperand(i: 0);
4145 if (Def.isEarlyClobber() && Def.isReg() &&
4146 LIS->hasInterval(Reg: Def.getReg())) {
4147 SlotIndex OldIndex = LIS->getInstructionIndex(Instr: *NewMI).getRegSlot(EC: false);
4148 SlotIndex NewIndex = LIS->getInstructionIndex(Instr: *NewMI).getRegSlot(EC: true);
4149 auto &LI = LIS->getInterval(Reg: Def.getReg());
4150 auto UpdateDefIndex = [&](LiveRange &LR) {
4151 auto *S = LR.find(Pos: OldIndex);
4152 if (S != LR.end() && S->start == OldIndex) {
4153 assert(S->valno && S->valno->def == OldIndex);
4154 S->start = NewIndex;
4155 S->valno->def = NewIndex;
4156 }
4157 };
4158 UpdateDefIndex(LI);
4159 for (auto &SR : LI.subranges())
4160 UpdateDefIndex(SR);
4161 }
4162 }
4163 }
4164
4165 if (U.RemoveMIUse) {
4166 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
4167 // The only user is the instruction which will be killed.
4168 Register DefReg = U.RemoveMIUse->getOperand(i: 0).getReg();
4169
4170 if (MRI.hasOneNonDBGUse(RegNo: DefReg)) {
4171 // We cannot just remove the DefMI here, calling pass will crash.
4172 U.RemoveMIUse->setDesc(get(Opcode: AMDGPU::IMPLICIT_DEF));
4173 U.RemoveMIUse->getOperand(i: 0).setIsDead(true);
4174 for (unsigned I = U.RemoveMIUse->getNumOperands() - 1; I != 0; --I)
4175 U.RemoveMIUse->removeOperand(OpNo: I);
4176 if (LV)
4177 LV->getVarInfo(Reg: DefReg).AliveBlocks.clear();
4178 }
4179
4180 if (MI.isBundle()) {
4181 VirtRegInfo VRI = AnalyzeVirtRegInBundle(MI, Reg: DefReg);
4182 if (!VRI.Reads && !VRI.Writes) {
4183 for (MachineOperand &MO : MI.all_uses()) {
4184 if (MO.isReg() && MO.getReg() == DefReg) {
4185 assert(MO.getSubReg() == 0 &&
4186 "tied sub-registers in bundles currently not supported");
4187 MI.removeOperand(OpNo: MO.getOperandNo());
4188 break;
4189 }
4190 }
4191
4192 if (LIS)
4193 LIS->shrinkToUses(li: &LIS->getInterval(Reg: DefReg));
4194 }
4195 } else if (LIS) {
4196 LiveInterval &DefLI = LIS->getInterval(Reg: DefReg);
4197
4198 // We cannot delete the original instruction here, so hack out the use
4199 // in the original instruction with a dummy register so we can use
4200 // shrinkToUses to deal with any multi-use edge cases. Other targets do
4201 // not have the complexity of deleting a use to consider here.
4202 Register DummyReg = MRI.cloneVirtualRegister(VReg: DefReg);
4203 for (MachineOperand &MIOp : MI.uses()) {
4204 if (MIOp.isReg() && MIOp.getReg() == DefReg) {
4205 MIOp.setIsUndef(true);
4206 MIOp.setReg(DummyReg);
4207 }
4208 }
4209
4210 if (MI.isBundle()) {
4211 VirtRegInfo VRI = AnalyzeVirtRegInBundle(MI, Reg: DefReg);
4212 if (!VRI.Reads && !VRI.Writes) {
4213 for (MachineOperand &MIOp : MI.uses()) {
4214 if (MIOp.isReg() && MIOp.getReg() == DefReg) {
4215 MIOp.setIsUndef(true);
4216 MIOp.setReg(DummyReg);
4217 }
4218 }
4219 }
4220
4221 MI.addOperand(Op: MachineOperand::CreateReg(Reg: DummyReg, isDef: false, isImp: false, isKill: false,
4222 isDead: false, /*isUndef=*/true));
4223 }
4224
4225 LIS->shrinkToUses(li: &DefLI);
4226 }
4227 }
4228
4229 return MI.isBundle() ? &MI : NewMI;
4230}
4231
4232MachineInstr *
4233SIInstrInfo::convertToThreeAddressImpl(MachineInstr &MI,
4234 ThreeAddressUpdates &U) const {
4235 MachineBasicBlock &MBB = *MI.getParent();
4236 unsigned Opc = MI.getOpcode();
4237
4238 // Handle MFMA.
4239 int NewMFMAOpc = AMDGPU::getMFMAEarlyClobberOp(Opcode: Opc);
4240 if (NewMFMAOpc != -1) {
4241 MachineInstrBuilder MIB =
4242 BuildMI(BB&: MBB, I&: MI, MIMD: MI.getDebugLoc(), MCID: get(Opcode: NewMFMAOpc));
4243 for (unsigned I = 0, E = MI.getNumExplicitOperands(); I != E; ++I)
4244 MIB.add(MO: MI.getOperand(i: I));
4245 return MIB;
4246 }
4247
4248 if (SIInstrInfo::isWMMA(MI)) {
4249 unsigned NewOpc = AMDGPU::mapWMMA2AddrTo3AddrOpcode(Opc: MI.getOpcode());
4250 MachineInstrBuilder MIB = BuildMI(BB&: MBB, I&: MI, MIMD: MI.getDebugLoc(), MCID: get(Opcode: NewOpc))
4251 .setMIFlags(MI.getFlags());
4252 for (unsigned I = 0, E = MI.getNumExplicitOperands(); I != E; ++I)
4253 MIB->addOperand(Op: MI.getOperand(i: I));
4254 return MIB;
4255 }
4256
4257 assert(Opc != AMDGPU::V_FMAC_F16_t16_e32 &&
4258 Opc != AMDGPU::V_FMAC_F16_fake16_e32 &&
4259 "V_FMAC_F16_t16/fake16_e32 is not supported and not expected to be "
4260 "present pre-RA");
4261
4262 // Handle MAC/FMAC.
4263 bool IsF64 = Opc == AMDGPU::V_FMAC_F64_e32 || Opc == AMDGPU::V_FMAC_F64_e64;
4264 bool IsLegacy = Opc == AMDGPU::V_MAC_LEGACY_F32_e32 ||
4265 Opc == AMDGPU::V_MAC_LEGACY_F32_e64 ||
4266 Opc == AMDGPU::V_FMAC_LEGACY_F32_e32 ||
4267 Opc == AMDGPU::V_FMAC_LEGACY_F32_e64;
4268 bool Src0Literal = false;
4269
4270 switch (Opc) {
4271 default:
4272 return nullptr;
4273 case AMDGPU::V_MAC_F16_e64:
4274 case AMDGPU::V_FMAC_F16_e64:
4275 case AMDGPU::V_FMAC_F16_t16_e64:
4276 case AMDGPU::V_FMAC_F16_fake16_e64:
4277 case AMDGPU::V_MAC_F32_e64:
4278 case AMDGPU::V_MAC_LEGACY_F32_e64:
4279 case AMDGPU::V_FMAC_F32_e64:
4280 case AMDGPU::V_FMAC_LEGACY_F32_e64:
4281 case AMDGPU::V_FMAC_F64_e64:
4282 break;
4283 case AMDGPU::V_MAC_F16_e32:
4284 case AMDGPU::V_FMAC_F16_e32:
4285 case AMDGPU::V_MAC_F32_e32:
4286 case AMDGPU::V_MAC_LEGACY_F32_e32:
4287 case AMDGPU::V_FMAC_F32_e32:
4288 case AMDGPU::V_FMAC_LEGACY_F32_e32:
4289 case AMDGPU::V_FMAC_F64_e32: {
4290 int Src0Idx = AMDGPU::getNamedOperandIdx(Opcode: MI.getOpcode(),
4291 Name: AMDGPU::OpName::src0);
4292 const MachineOperand *Src0 = &MI.getOperand(i: Src0Idx);
4293 if (!Src0->isReg() && !Src0->isImm())
4294 return nullptr;
4295
4296 if (Src0->isImm() && !isInlineConstant(MI, OpIdx: Src0Idx, MO: *Src0))
4297 Src0Literal = true;
4298
4299 break;
4300 }
4301 }
4302
4303 MachineInstrBuilder MIB;
4304 const MachineOperand *Dst = getNamedOperand(MI, OperandName: AMDGPU::OpName::vdst);
4305 const MachineOperand *Src0 = getNamedOperand(MI, OperandName: AMDGPU::OpName::src0);
4306 const MachineOperand *Src0Mods =
4307 getNamedOperand(MI, OperandName: AMDGPU::OpName::src0_modifiers);
4308 const MachineOperand *Src1 = getNamedOperand(MI, OperandName: AMDGPU::OpName::src1);
4309 const MachineOperand *Src1Mods =
4310 getNamedOperand(MI, OperandName: AMDGPU::OpName::src1_modifiers);
4311 const MachineOperand *Src2 = getNamedOperand(MI, OperandName: AMDGPU::OpName::src2);
4312 const MachineOperand *Src2Mods =
4313 getNamedOperand(MI, OperandName: AMDGPU::OpName::src2_modifiers);
4314 const MachineOperand *Clamp = getNamedOperand(MI, OperandName: AMDGPU::OpName::clamp);
4315 const MachineOperand *Omod = getNamedOperand(MI, OperandName: AMDGPU::OpName::omod);
4316 const MachineOperand *OpSel = getNamedOperand(MI, OperandName: AMDGPU::OpName::op_sel);
4317
4318 if (!Src0Mods && !Src1Mods && !Src2Mods && !Clamp && !Omod && !IsLegacy &&
4319 (!IsF64 || ST.hasFmaakFmamkF64Insts()) &&
4320 // If we have an SGPR input, we will violate the constant bus restriction.
4321 (ST.getConstantBusLimit(Opcode: Opc) > 1 || !Src0->isReg() ||
4322 !RI.isSGPRReg(MRI: MBB.getParent()->getRegInfo(), Reg: Src0->getReg()))) {
4323 MachineInstr *DefMI;
4324
4325 int64_t Imm;
4326 if (!Src0Literal && getFoldableImm(MO: Src2, Imm, DefMI: &DefMI)) {
4327 unsigned NewOpc = getNewFMAAKInst(ST, Opc);
4328 if (pseudoToMCOpcode(Opcode: NewOpc) != -1) {
4329 MIB = BuildMI(BB&: MBB, I&: MI, MIMD: MI.getDebugLoc(), MCID: get(Opcode: NewOpc))
4330 .add(MO: *Dst)
4331 .add(MO: *Src0)
4332 .add(MO: *Src1)
4333 .addImm(Val: Imm)
4334 .setMIFlags(MI.getFlags());
4335 U.RemoveMIUse = DefMI;
4336 return MIB;
4337 }
4338 }
4339 unsigned NewOpc = getNewFMAMKInst(ST, Opc);
4340 if (!Src0Literal && getFoldableImm(MO: Src1, Imm, DefMI: &DefMI)) {
4341 if (pseudoToMCOpcode(Opcode: NewOpc) != -1) {
4342 MIB = BuildMI(BB&: MBB, I&: MI, MIMD: MI.getDebugLoc(), MCID: get(Opcode: NewOpc))
4343 .add(MO: *Dst)
4344 .add(MO: *Src0)
4345 .addImm(Val: Imm)
4346 .add(MO: *Src2)
4347 .setMIFlags(MI.getFlags());
4348 U.RemoveMIUse = DefMI;
4349 return MIB;
4350 }
4351 }
4352 if (Src0Literal || getFoldableImm(MO: Src0, Imm, DefMI: &DefMI)) {
4353 if (Src0Literal) {
4354 Imm = Src0->getImm();
4355 DefMI = nullptr;
4356 }
4357 if (pseudoToMCOpcode(Opcode: NewOpc) != -1 &&
4358 isOperandLegal(
4359 MI, OpIdx: AMDGPU::getNamedOperandIdx(Opcode: NewOpc, Name: AMDGPU::OpName::src0),
4360 MO: Src1)) {
4361 MIB = BuildMI(BB&: MBB, I&: MI, MIMD: MI.getDebugLoc(), MCID: get(Opcode: NewOpc))
4362 .add(MO: *Dst)
4363 .add(MO: *Src1)
4364 .addImm(Val: Imm)
4365 .add(MO: *Src2)
4366 .setMIFlags(MI.getFlags());
4367 U.RemoveMIUse = DefMI;
4368 return MIB;
4369 }
4370 }
4371 }
4372
4373 // VOP2 mac/fmac with a literal operand cannot be converted to VOP3 mad/fma
4374 // if VOP3 does not allow a literal operand.
4375 if (Src0Literal && !ST.hasVOP3Literal())
4376 return nullptr;
4377
4378 unsigned NewOpc = getNewFMAInst(ST, Opc);
4379
4380 if (pseudoToMCOpcode(Opcode: NewOpc) == -1)
4381 return nullptr;
4382
4383 MIB = BuildMI(BB&: MBB, I&: MI, MIMD: MI.getDebugLoc(), MCID: get(Opcode: NewOpc))
4384 .add(MO: *Dst)
4385 .addImm(Val: Src0Mods ? Src0Mods->getImm() : 0)
4386 .add(MO: *Src0)
4387 .addImm(Val: Src1Mods ? Src1Mods->getImm() : 0)
4388 .add(MO: *Src1)
4389 .addImm(Val: Src2Mods ? Src2Mods->getImm() : 0)
4390 .add(MO: *Src2)
4391 .addImm(Val: Clamp ? Clamp->getImm() : 0)
4392 .addImm(Val: Omod ? Omod->getImm() : 0)
4393 .setMIFlags(MI.getFlags());
4394 if (AMDGPU::hasNamedOperand(Opcode: NewOpc, NamedIdx: AMDGPU::OpName::op_sel))
4395 MIB.addImm(Val: OpSel ? OpSel->getImm() : 0);
4396 return MIB;
4397}
4398
4399// It's not generally safe to move VALU instructions across these since it will
4400// start using the register as a base index rather than directly.
4401// XXX - Why isn't hasSideEffects sufficient for these?
4402static bool changesVGPRIndexingMode(const MachineInstr &MI) {
4403 switch (MI.getOpcode()) {
4404 case AMDGPU::S_SET_GPR_IDX_ON:
4405 case AMDGPU::S_SET_GPR_IDX_MODE:
4406 case AMDGPU::S_SET_GPR_IDX_OFF:
4407 return true;
4408 default:
4409 return false;
4410 }
4411}
4412
4413bool SIInstrInfo::isSchedulingBoundary(const MachineInstr &MI,
4414 const MachineBasicBlock *MBB,
4415 const MachineFunction &MF) const {
4416 // Skipping the check for SP writes in the base implementation. The reason it
4417 // was added was apparently due to compile time concerns.
4418 //
4419 // TODO: Do we really want this barrier? It triggers unnecessary hazard nops
4420 // but is probably avoidable.
4421
4422 // Copied from base implementation.
4423 // Terminators and labels can't be scheduled around.
4424 if (MI.isTerminator() || MI.isPosition())
4425 return true;
4426
4427 // INLINEASM_BR can jump to another block
4428 if (MI.getOpcode() == TargetOpcode::INLINEASM_BR)
4429 return true;
4430
4431 if (MI.getOpcode() == AMDGPU::SCHED_BARRIER && MI.getOperand(i: 0).getImm() == 0)
4432 return true;
4433
4434 // Target-independent instructions do not have an implicit-use of EXEC, even
4435 // when they operate on VGPRs. Treating EXEC modifications as scheduling
4436 // boundaries prevents incorrect movements of such instructions.
4437 return MI.modifiesRegister(Reg: AMDGPU::EXEC, TRI: &RI) ||
4438 MI.getOpcode() == AMDGPU::S_SETREG_IMM32_B32 ||
4439 MI.getOpcode() == AMDGPU::S_SETREG_B32 ||
4440 MI.getOpcode() == AMDGPU::S_SETPRIO ||
4441 MI.getOpcode() == AMDGPU::S_SETPRIO_INC_WG ||
4442 changesVGPRIndexingMode(MI);
4443}
4444
4445bool SIInstrInfo::isAlwaysGDS(uint32_t Opcode) const {
4446 return Opcode == AMDGPU::DS_ORDERED_COUNT ||
4447 Opcode == AMDGPU::DS_ADD_GS_REG_RTN ||
4448 Opcode == AMDGPU::DS_SUB_GS_REG_RTN || isGWS(Opcode);
4449}
4450
4451bool SIInstrInfo::mayAccessScratch(const MachineInstr &MI) const {
4452 // Instructions that access scratch use FLAT encoding or BUF encodings.
4453 if ((!isFLAT(MI) || isFLATGlobal(MI)) && !isBUF(MI))
4454 return false;
4455
4456 // SCRATCH instructions always access scratch.
4457 if (isFLATScratch(MI))
4458 return true;
4459
4460 // If FLAT_SCRATCH registers are not initialized, we can never access scratch
4461 // via the aperture.
4462 if (MI.getMF()->getFunction().hasFnAttribute(Kind: "amdgpu-no-flat-scratch-init"))
4463 return false;
4464
4465 // If there are no memory operands then conservatively assume the flat
4466 // operation may access scratch.
4467 if (MI.memoperands_empty())
4468 return true;
4469
4470 // See if any memory operand specifies an address space that involves scratch.
4471 return any_of(Range: MI.memoperands(), P: [](const MachineMemOperand *Memop) {
4472 unsigned AS = Memop->getAddrSpace();
4473 if (AS == AMDGPUAS::FLAT_ADDRESS) {
4474 const MDNode *MD = Memop->getAAInfo().NoAliasAddrSpace;
4475 return !MD || !AMDGPU::hasValueInRangeLikeMetadata(
4476 MD: *MD, Val: AMDGPUAS::PRIVATE_ADDRESS);
4477 }
4478 return AS == AMDGPUAS::PRIVATE_ADDRESS;
4479 });
4480}
4481
4482bool SIInstrInfo::mayAccessVMEMThroughFlat(const MachineInstr &MI) const {
4483 assert(isFLAT(MI));
4484
4485 // All flat instructions use the VMEM counter except prefetch.
4486 if (!usesVM_CNT(MI))
4487 return false;
4488
4489 // If there are no memory operands then conservatively assume the flat
4490 // operation may access VMEM.
4491 if (MI.memoperands_empty())
4492 return true;
4493
4494 // See if any memory operand specifies an address space that involves VMEM.
4495 // Flat operations only supported FLAT, LOCAL (LDS), or address spaces
4496 // involving VMEM such as GLOBAL, CONSTANT, PRIVATE (SCRATCH), etc. The REGION
4497 // (GDS) address space is not supported by flat operations. Therefore, simply
4498 // return true unless only the LDS address space is found.
4499 for (const MachineMemOperand *Memop : MI.memoperands()) {
4500 unsigned AS = Memop->getAddrSpace();
4501 assert(AS != AMDGPUAS::REGION_ADDRESS);
4502 if (AS != AMDGPUAS::LOCAL_ADDRESS)
4503 return true;
4504 }
4505
4506 return false;
4507}
4508
4509bool SIInstrInfo::mayAccessLDSThroughFlat(const MachineInstr &MI) const {
4510 assert(isFLAT(MI));
4511
4512 // Flat instruction such as SCRATCH and GLOBAL do not use the lgkm counter.
4513 if (!usesLGKM_CNT(MI))
4514 return false;
4515
4516 // If in tgsplit mode then there can be no use of LDS.
4517 if (ST.isTgSplitEnabled())
4518 return false;
4519
4520 // If there are no memory operands then conservatively assume the flat
4521 // operation may access LDS.
4522 if (MI.memoperands_empty())
4523 return true;
4524
4525 // See if any memory operand specifies an address space that involves LDS.
4526 for (const MachineMemOperand *Memop : MI.memoperands()) {
4527 unsigned AS = Memop->getAddrSpace();
4528 if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::FLAT_ADDRESS)
4529 return true;
4530 }
4531
4532 return false;
4533}
4534
4535bool SIInstrInfo::modifiesModeRegister(const MachineInstr &MI) {
4536 // Skip the full operand and register alias search modifiesRegister
4537 // does. There's only a handful of instructions that touch this, it's only an
4538 // implicit def, and doesn't alias any other registers.
4539 return is_contained(Range: MI.getDesc().implicit_defs(), Element: AMDGPU::MODE);
4540}
4541
4542bool SIInstrInfo::hasUnwantedEffectsWhenEXECEmpty(const MachineInstr &MI) const {
4543 unsigned Opcode = MI.getOpcode();
4544
4545 if (MI.mayStore() && isSMRD(MI))
4546 return true; // scalar store or atomic
4547
4548 // This will terminate the function when other lanes may need to continue.
4549 if (MI.isReturn())
4550 return true;
4551
4552 // These instructions cause shader I/O that may cause hardware lockups
4553 // when executed with an empty EXEC mask.
4554 //
4555 // Note: exp with VM = DONE = 0 is automatically skipped by hardware when
4556 // EXEC = 0, but checking for that case here seems not worth it
4557 // given the typical code patterns.
4558 if (Opcode == AMDGPU::S_SENDMSG || Opcode == AMDGPU::S_SENDMSGHALT ||
4559 isEXP(Opcode) || Opcode == AMDGPU::DS_ORDERED_COUNT ||
4560 Opcode == AMDGPU::S_TRAP || Opcode == AMDGPU::S_WAIT_EVENT ||
4561 Opcode == AMDGPU::S_SETHALT)
4562 return true;
4563
4564 if (MI.isCall() || MI.isInlineAsm())
4565 return true; // conservative assumption
4566
4567 // Assume that barrier interactions are only intended with active lanes.
4568 if (isBarrier(Opcode))
4569 return true;
4570
4571 // A mode change is a scalar operation that influences vector instructions.
4572 if (modifiesModeRegister(MI))
4573 return true;
4574
4575 // These are like SALU instructions in terms of effects, so it's questionable
4576 // whether we should return true for those.
4577 //
4578 // However, executing them with EXEC = 0 causes them to operate on undefined
4579 // data, which we avoid by returning true here.
4580 if (Opcode == AMDGPU::V_READFIRSTLANE_B32 ||
4581 Opcode == AMDGPU::V_READLANE_B32 || Opcode == AMDGPU::V_WRITELANE_B32 ||
4582 Opcode == AMDGPU::SI_RESTORE_S32_FROM_VGPR ||
4583 Opcode == AMDGPU::SI_SPILL_S32_TO_VGPR)
4584 return true;
4585
4586 return false;
4587}
4588
4589bool SIInstrInfo::mayReadEXEC(const MachineRegisterInfo &MRI,
4590 const MachineInstr &MI) const {
4591 if (MI.isMetaInstruction())
4592 return false;
4593
4594 // This won't read exec if this is an SGPR->SGPR copy.
4595 if (MI.isCopyLike()) {
4596 if (!RI.isSGPRReg(MRI, Reg: MI.getOperand(i: 0).getReg()))
4597 return true;
4598
4599 // Make sure this isn't copying exec as a normal operand
4600 return MI.readsRegister(Reg: AMDGPU::EXEC, TRI: &RI);
4601 }
4602
4603 // Make a conservative assumption about the callee.
4604 if (MI.isCall())
4605 return true;
4606
4607 // Be conservative with any unhandled generic opcodes.
4608 if (!isTargetSpecificOpcode(Opcode: MI.getOpcode()))
4609 return true;
4610
4611 return !isSALU(MI) || MI.readsRegister(Reg: AMDGPU::EXEC, TRI: &RI);
4612}
4613
4614bool SIInstrInfo::isInlineConstant(const APInt &Imm) const {
4615 switch (Imm.getBitWidth()) {
4616 case 1: // This likely will be a condition code mask.
4617 return true;
4618
4619 case 32:
4620 return AMDGPU::isInlinableLiteral32(Literal: Imm.getSExtValue(),
4621 HasInv2Pi: ST.hasInv2PiInlineImm());
4622 case 64:
4623 return AMDGPU::isInlinableLiteral64(Literal: Imm.getSExtValue(),
4624 HasInv2Pi: ST.hasInv2PiInlineImm());
4625 case 16:
4626 return ST.has16BitInsts() &&
4627 AMDGPU::isInlinableLiteralI16(Literal: Imm.getSExtValue(),
4628 HasInv2Pi: ST.hasInv2PiInlineImm());
4629 default:
4630 llvm_unreachable("invalid bitwidth");
4631 }
4632}
4633
4634bool SIInstrInfo::isInlineConstant(const APFloat &Imm) const {
4635 APInt IntImm = Imm.bitcastToAPInt();
4636 int64_t IntImmVal = IntImm.getSExtValue();
4637 bool HasInv2Pi = ST.hasInv2PiInlineImm();
4638 switch (APFloat::SemanticsToEnum(Sem: Imm.getSemantics())) {
4639 default:
4640 llvm_unreachable("invalid fltSemantics");
4641 case APFloatBase::S_IEEEsingle:
4642 case APFloatBase::S_IEEEdouble:
4643 return isInlineConstant(Imm: IntImm);
4644 case APFloatBase::S_BFloat:
4645 return ST.has16BitInsts() &&
4646 AMDGPU::isInlinableLiteralBF16(Literal: IntImmVal, HasInv2Pi);
4647 case APFloatBase::S_IEEEhalf:
4648 return ST.has16BitInsts() &&
4649 AMDGPU::isInlinableLiteralFP16(Literal: IntImmVal, HasInv2Pi);
4650 }
4651}
4652
4653bool SIInstrInfo::isInlineConstant(int64_t Imm, uint8_t OperandType) const {
4654 // MachineOperand provides no way to tell the true operand size, since it only
4655 // records a 64-bit value. We need to know the size to determine if a 32-bit
4656 // floating point immediate bit pattern is legal for an integer immediate. It
4657 // would be for any 32-bit integer operand, but would not be for a 64-bit one.
4658 switch (OperandType) {
4659 case AMDGPU::OPERAND_REG_IMM_INT32:
4660 case AMDGPU::OPERAND_REG_IMM_FP32:
4661 case AMDGPU::OPERAND_REG_INLINE_C_INT32:
4662 case AMDGPU::OPERAND_REG_INLINE_C_FP32:
4663 case AMDGPU::OPERAND_REG_IMM_V2FP32:
4664 case AMDGPU::OPERAND_REG_IMM_V2INT32:
4665 case AMDGPU::OPERAND_REG_INLINE_AC_INT32:
4666 case AMDGPU::OPERAND_REG_INLINE_AC_FP32:
4667 case AMDGPU::OPERAND_INLINE_SPLIT_BARRIER_INT32: {
4668 int32_t Trunc = static_cast<int32_t>(Imm);
4669 return AMDGPU::isInlinableLiteral32(Literal: Trunc, HasInv2Pi: ST.hasInv2PiInlineImm());
4670 }
4671 case AMDGPU::OPERAND_REG_IMM_INT64:
4672 case AMDGPU::OPERAND_REG_IMM_FP64:
4673 case AMDGPU::OPERAND_REG_INLINE_C_INT64:
4674 case AMDGPU::OPERAND_REG_INLINE_C_FP64:
4675 case AMDGPU::OPERAND_REG_INLINE_AC_FP64:
4676 case AMDGPU::OPERAND_REG_IMM_V2FP64:
4677 case AMDGPU::OPERAND_REG_IMM_V2INT64:
4678 return AMDGPU::isInlinableLiteral64(Literal: Imm, HasInv2Pi: ST.hasInv2PiInlineImm());
4679 case AMDGPU::OPERAND_REG_IMM_INT16:
4680 case AMDGPU::OPERAND_REG_INLINE_C_INT16:
4681 // We would expect inline immediates to not be concerned with an integer/fp
4682 // distinction. However, in the case of 16-bit integer operations, the
4683 // "floating point" values appear to not work. It seems read the low 16-bits
4684 // of 32-bit immediates, which happens to always work for the integer
4685 // values.
4686 //
4687 // See llvm bugzilla 46302.
4688 //
4689 // TODO: Theoretically we could use op-sel to use the high bits of the
4690 // 32-bit FP values.
4691 return AMDGPU::isInlinableIntLiteral(Literal: Imm);
4692 case AMDGPU::OPERAND_REG_IMM_V2INT16:
4693 case AMDGPU::OPERAND_REG_INLINE_C_V2INT16:
4694 return AMDGPU::isInlinableLiteralV2I16(Literal: Imm);
4695 case AMDGPU::OPERAND_REG_IMM_V2FP16:
4696 case AMDGPU::OPERAND_REG_INLINE_C_V2FP16:
4697 return AMDGPU::isInlinableLiteralV2F16(Literal: Imm);
4698 case AMDGPU::OPERAND_REG_IMM_V2FP16_SPLAT:
4699 return AMDGPU::isPKFMACF16InlineConstant(Literal: Imm, IsGFX11Plus: ST.isGFX11Plus());
4700 case AMDGPU::OPERAND_REG_IMM_V2BF16:
4701 case AMDGPU::OPERAND_REG_INLINE_C_V2BF16:
4702 return AMDGPU::isInlinableLiteralV2BF16(Literal: Imm);
4703 case AMDGPU::OPERAND_REG_IMM_NOINLINE_V2FP16:
4704 return false;
4705 case AMDGPU::OPERAND_REG_IMM_FP16:
4706 case AMDGPU::OPERAND_REG_INLINE_C_FP16: {
4707 if (isInt<16>(x: Imm) || isUInt<16>(x: Imm)) {
4708 // A few special case instructions have 16-bit operands on subtargets
4709 // where 16-bit instructions are not legal.
4710 // TODO: Do the 32-bit immediates work? We shouldn't really need to handle
4711 // constants in these cases
4712 int16_t Trunc = static_cast<int16_t>(Imm);
4713 return ST.has16BitInsts() &&
4714 AMDGPU::isInlinableLiteralFP16(Literal: Trunc, HasInv2Pi: ST.hasInv2PiInlineImm());
4715 }
4716
4717 return false;
4718 }
4719 case AMDGPU::OPERAND_REG_IMM_BF16:
4720 case AMDGPU::OPERAND_REG_INLINE_C_BF16: {
4721 if (isInt<16>(x: Imm) || isUInt<16>(x: Imm)) {
4722 int16_t Trunc = static_cast<int16_t>(Imm);
4723 return ST.has16BitInsts() &&
4724 AMDGPU::isInlinableLiteralBF16(Literal: Trunc, HasInv2Pi: ST.hasInv2PiInlineImm());
4725 }
4726 return false;
4727 }
4728 case AMDGPU::OPERAND_KIMM32:
4729 case AMDGPU::OPERAND_KIMM16:
4730 case AMDGPU::OPERAND_KIMM64:
4731 return false;
4732 case AMDGPU::OPERAND_INLINE_C_AV64_PSEUDO:
4733 return isLegalAV64PseudoImm(Imm);
4734 case AMDGPU::OPERAND_INPUT_MODS:
4735 case MCOI::OPERAND_IMMEDIATE:
4736 // Always embedded in the instruction for free.
4737 return true;
4738 case MCOI::OPERAND_UNKNOWN:
4739 case MCOI::OPERAND_REGISTER:
4740 case MCOI::OPERAND_PCREL:
4741 case MCOI::OPERAND_GENERIC_0:
4742 case MCOI::OPERAND_GENERIC_1:
4743 case MCOI::OPERAND_GENERIC_2:
4744 case MCOI::OPERAND_GENERIC_3:
4745 case MCOI::OPERAND_GENERIC_4:
4746 case MCOI::OPERAND_GENERIC_5:
4747 // Just ignore anything else.
4748 return true;
4749 default:
4750 llvm_unreachable("invalid operand type");
4751 }
4752}
4753
4754static bool compareMachineOp(const MachineOperand &Op0,
4755 const MachineOperand &Op1) {
4756 if (Op0.getType() != Op1.getType())
4757 return false;
4758
4759 switch (Op0.getType()) {
4760 case MachineOperand::MO_Register:
4761 return Op0.getReg() == Op1.getReg();
4762 case MachineOperand::MO_Immediate:
4763 return Op0.getImm() == Op1.getImm();
4764 default:
4765 llvm_unreachable("Didn't expect to be comparing these operand types");
4766 }
4767}
4768
4769bool SIInstrInfo::isLiteralOperandLegal(const MCInstrDesc &InstDesc,
4770 const MCOperandInfo &OpInfo) const {
4771 if (OpInfo.OperandType == MCOI::OPERAND_IMMEDIATE)
4772 return true;
4773
4774 if (!RI.opCanUseLiteralConstant(OpType: OpInfo.OperandType))
4775 return false;
4776
4777 if (!isVOP3(Desc: InstDesc) || !AMDGPU::isSISrcOperand(OpInfo))
4778 return true;
4779
4780 return ST.hasVOP3Literal();
4781}
4782
4783bool SIInstrInfo::isImmOperandLegal(const MCInstrDesc &InstDesc, unsigned OpNo,
4784 int64_t ImmVal) const {
4785 const unsigned Opc = InstDesc.getOpcode();
4786 int Src1Idx = AMDGPU::getNamedOperandIdx(Opcode: Opc, Name: AMDGPU::OpName::src1);
4787 if (Src1Idx != -1 && isDPP(Opcode: Opc) && !ST.hasDPPSrc1SGPR() &&
4788 OpNo == static_cast<unsigned>(Src1Idx))
4789 return false;
4790
4791 const MCOperandInfo &OpInfo = InstDesc.operands()[OpNo];
4792 if (isInlineConstant(Imm: ImmVal, OperandType: OpInfo.OperandType)) {
4793 if (isMAI(Desc: InstDesc) && ST.hasMFMAInlineLiteralBug() &&
4794 OpNo == (unsigned)AMDGPU::getNamedOperandIdx(Opcode: InstDesc.getOpcode(),
4795 Name: AMDGPU::OpName::src2))
4796 return false;
4797 return RI.opCanUseInlineConstant(OpType: OpInfo.OperandType);
4798 }
4799
4800 return isLiteralOperandLegal(InstDesc, OpInfo);
4801}
4802
4803bool SIInstrInfo::isImmOperandLegal(const MCInstrDesc &InstDesc, unsigned OpNo,
4804 const MachineOperand &MO) const {
4805 if (MO.isImm())
4806 return isImmOperandLegal(InstDesc, OpNo, ImmVal: MO.getImm());
4807
4808 assert((MO.isTargetIndex() || MO.isFI() || MO.isGlobal()) &&
4809 "unexpected imm-like operand kind");
4810 const MCOperandInfo &OpInfo = InstDesc.operands()[OpNo];
4811 return isLiteralOperandLegal(InstDesc, OpInfo);
4812}
4813
4814bool SIInstrInfo::isLegalAV64PseudoImm(uint64_t Imm) const {
4815 // 2 32-bit inline constants packed into one.
4816 return AMDGPU::isInlinableLiteral32(Literal: Lo_32(Value: Imm), HasInv2Pi: ST.hasInv2PiInlineImm()) &&
4817 AMDGPU::isInlinableLiteral32(Literal: Hi_32(Value: Imm), HasInv2Pi: ST.hasInv2PiInlineImm());
4818}
4819
4820bool SIInstrInfo::hasVALU32BitEncoding(unsigned Opcode) const {
4821 // GFX90A does not have V_MUL_LEGACY_F32_e32.
4822 if (Opcode == AMDGPU::V_MUL_LEGACY_F32_e64 && ST.hasGFX90AInsts())
4823 return false;
4824
4825 int Op32 = AMDGPU::getVOPe32(Opcode);
4826 if (Op32 == -1)
4827 return false;
4828
4829 return pseudoToMCOpcode(Opcode: Op32) != -1;
4830}
4831
4832bool SIInstrInfo::hasModifiers(unsigned Opcode) const {
4833 // The src0_modifier operand is present on all instructions
4834 // that have modifiers.
4835
4836 return AMDGPU::hasNamedOperand(Opcode, NamedIdx: AMDGPU::OpName::src0_modifiers);
4837}
4838
4839bool SIInstrInfo::hasModifiersSet(const MachineInstr &MI,
4840 AMDGPU::OpName OpName) const {
4841 const MachineOperand *Mods = getNamedOperand(MI, OperandName: OpName);
4842 return Mods && Mods->getImm();
4843}
4844
4845bool SIInstrInfo::hasAnyModifiersSet(const MachineInstr &MI) const {
4846 return any_of(Range: ModifierOpNames,
4847 P: [&](AMDGPU::OpName Name) { return hasModifiersSet(MI, OpName: Name); });
4848}
4849
4850bool SIInstrInfo::canShrink(const MachineInstr &MI,
4851 const MachineRegisterInfo &MRI) const {
4852 const MachineOperand *Src2 = getNamedOperand(MI, OperandName: AMDGPU::OpName::src2);
4853 // Can't shrink instruction with three operands.
4854 if (Src2) {
4855 switch (MI.getOpcode()) {
4856 default: return false;
4857
4858 case AMDGPU::V_ADDC_U32_e64:
4859 case AMDGPU::V_SUBB_U32_e64:
4860 case AMDGPU::V_SUBBREV_U32_e64: {
4861 const MachineOperand *Src1
4862 = getNamedOperand(MI, OperandName: AMDGPU::OpName::src1);
4863 if (!Src1->isReg() || !RI.isVGPR(MRI, Reg: Src1->getReg()))
4864 return false;
4865 // Additional verification is needed for sdst/src2.
4866 return true;
4867 }
4868 case AMDGPU::V_MAC_F16_e64:
4869 case AMDGPU::V_MAC_F32_e64:
4870 case AMDGPU::V_MAC_LEGACY_F32_e64:
4871 case AMDGPU::V_FMAC_F16_e64:
4872 case AMDGPU::V_FMAC_F16_t16_e64:
4873 case AMDGPU::V_FMAC_F16_fake16_e64:
4874 case AMDGPU::V_FMAC_F32_e64:
4875 case AMDGPU::V_FMAC_F64_e64:
4876 case AMDGPU::V_FMAC_LEGACY_F32_e64:
4877 if (!Src2->isReg() || !RI.isVGPR(MRI, Reg: Src2->getReg()) ||
4878 hasModifiersSet(MI, OpName: AMDGPU::OpName::src2_modifiers))
4879 return false;
4880 break;
4881
4882 case AMDGPU::V_CNDMASK_B32_e64:
4883 break;
4884 }
4885 }
4886
4887 const MachineOperand *Src1 = getNamedOperand(MI, OperandName: AMDGPU::OpName::src1);
4888 if (Src1 && (!Src1->isReg() || !RI.isVGPR(MRI, Reg: Src1->getReg()) ||
4889 hasModifiersSet(MI, OpName: AMDGPU::OpName::src1_modifiers)))
4890 return false;
4891
4892 // We don't need to check src0, all input types are legal, so just make sure
4893 // src0 isn't using any modifiers.
4894 if (hasModifiersSet(MI, OpName: AMDGPU::OpName::src0_modifiers))
4895 return false;
4896
4897 // Can it be shrunk to a valid 32 bit opcode?
4898 if (!hasVALU32BitEncoding(Opcode: MI.getOpcode()))
4899 return false;
4900
4901 // Check output modifiers
4902 return !hasModifiersSet(MI, OpName: AMDGPU::OpName::omod) &&
4903 !hasModifiersSet(MI, OpName: AMDGPU::OpName::clamp) &&
4904 !hasModifiersSet(MI, OpName: AMDGPU::OpName::byte_sel) &&
4905 // TODO: Can we avoid checking bound_ctrl/fi here?
4906 // They are only used by permlane*_swap special case.
4907 !hasModifiersSet(MI, OpName: AMDGPU::OpName::bound_ctrl) &&
4908 !hasModifiersSet(MI, OpName: AMDGPU::OpName::fi);
4909}
4910
4911// Set VCC operand with all flags from \p Orig, except for setting it as
4912// implicit.
4913static void copyFlagsToImplicitVCC(MachineInstr &MI,
4914 const MachineOperand &Orig) {
4915
4916 for (MachineOperand &Use : MI.implicit_operands()) {
4917 if (Use.isUse() &&
4918 (Use.getReg() == AMDGPU::VCC || Use.getReg() == AMDGPU::VCC_LO)) {
4919 Use.setIsUndef(Orig.isUndef());
4920 Use.setIsKill(Orig.isKill());
4921 return;
4922 }
4923 }
4924}
4925
4926MachineInstr *SIInstrInfo::buildShrunkInst(MachineInstr &MI,
4927 unsigned Op32) const {
4928 MachineBasicBlock *MBB = MI.getParent();
4929
4930 const MCInstrDesc &Op32Desc = get(Opcode: Op32);
4931 MachineInstrBuilder Inst32 =
4932 BuildMI(BB&: *MBB, I&: MI, MIMD: MI.getDebugLoc(), MCID: Op32Desc)
4933 .setMIFlags(MI.getFlags());
4934
4935 // Add the dst operand if the 32-bit encoding also has an explicit $vdst.
4936 // For VOPC instructions, this is replaced by an implicit def of vcc.
4937
4938 // We assume the defs of the shrunk opcode are in the same order, and the
4939 // shrunk opcode loses the last def (SGPR def, in the VOP3->VOPC case).
4940 for (int I = 0, E = Op32Desc.getNumDefs(); I != E; ++I)
4941 Inst32.add(MO: MI.getOperand(i: I));
4942
4943 const MachineOperand *Src2 = getNamedOperand(MI, OperandName: AMDGPU::OpName::src2);
4944
4945 int Idx = MI.getNumExplicitDefs();
4946 for (const MachineOperand &Use : MI.explicit_uses()) {
4947 int OpTy = MI.getDesc().operands()[Idx++].OperandType;
4948 if (OpTy == AMDGPU::OPERAND_INPUT_MODS || OpTy == MCOI::OPERAND_IMMEDIATE)
4949 continue;
4950
4951 if (&Use == Src2) {
4952 if (AMDGPU::getNamedOperandIdx(Opcode: Op32, Name: AMDGPU::OpName::src2) == -1) {
4953 // In the case of V_CNDMASK_B32_e32, the explicit operand src2 is
4954 // replaced with an implicit read of vcc or vcc_lo. The implicit read
4955 // of vcc was already added during the initial BuildMI, but we
4956 // 1) may need to change vcc to vcc_lo to preserve the original register
4957 // 2) have to preserve the original flags.
4958 copyFlagsToImplicitVCC(MI&: *Inst32, Orig: *Src2);
4959 continue;
4960 }
4961 }
4962
4963 Inst32.add(MO: Use);
4964 }
4965
4966 // FIXME: Losing implicit operands
4967 fixImplicitOperands(MI&: *Inst32);
4968 return Inst32;
4969}
4970
4971bool SIInstrInfo::physRegUsesConstantBus(const MachineOperand &RegOp) const {
4972 // Null is free
4973 Register Reg = RegOp.getReg();
4974 if (Reg == AMDGPU::SGPR_NULL || Reg == AMDGPU::SGPR_NULL64)
4975 return false;
4976
4977 // SGPRs use the constant bus
4978
4979 // FIXME: implicit registers that are not part of the MCInstrDesc's implicit
4980 // physical register operands should also count, except for exec.
4981 if (RegOp.isImplicit())
4982 return Reg == AMDGPU::VCC || Reg == AMDGPU::VCC_LO || Reg == AMDGPU::M0;
4983
4984 // SGPRs use the constant bus
4985 return AMDGPU::SReg_32RegClass.contains(Reg) ||
4986 AMDGPU::SReg_64RegClass.contains(Reg);
4987}
4988
4989bool SIInstrInfo::regUsesConstantBus(const MachineOperand &RegOp,
4990 const MachineRegisterInfo &MRI) const {
4991 Register Reg = RegOp.getReg();
4992 return Reg.isVirtual() ? RI.isSGPRClass(RC: MRI.getRegClass(Reg))
4993 : physRegUsesConstantBus(RegOp);
4994}
4995
4996bool SIInstrInfo::usesConstantBus(const MachineRegisterInfo &MRI,
4997 const MachineOperand &MO,
4998 const MCOperandInfo &OpInfo) const {
4999 // Literal constants use the constant bus.
5000 if (!MO.isReg())
5001 return !isInlineConstant(MO, OpInfo);
5002
5003 Register Reg = MO.getReg();
5004 return Reg.isVirtual() ? RI.isSGPRClass(RC: MRI.getRegClass(Reg))
5005 : physRegUsesConstantBus(RegOp: MO);
5006}
5007
5008static Register findImplicitSGPRRead(const MachineInstr &MI) {
5009 for (const MachineOperand &MO : MI.implicit_operands()) {
5010 // We only care about reads.
5011 if (MO.isDef())
5012 continue;
5013
5014 switch (MO.getReg()) {
5015 case AMDGPU::VCC:
5016 case AMDGPU::VCC_LO:
5017 case AMDGPU::VCC_HI:
5018 case AMDGPU::M0:
5019 case AMDGPU::FLAT_SCR:
5020 return MO.getReg();
5021
5022 default:
5023 break;
5024 }
5025 }
5026
5027 return Register();
5028}
5029
5030static bool shouldReadExec(const MachineInstr &MI) {
5031 if (SIInstrInfo::isVALU(MI, /*AllowLDSDMA=*/true)) {
5032 switch (MI.getOpcode()) {
5033 case AMDGPU::V_READLANE_B32:
5034 case AMDGPU::SI_RESTORE_S32_FROM_VGPR:
5035 case AMDGPU::V_WRITELANE_B32:
5036 case AMDGPU::SI_SPILL_S32_TO_VGPR:
5037 return false;
5038 }
5039
5040 return true;
5041 }
5042
5043 if (MI.isPreISelOpcode() ||
5044 SIInstrInfo::isGenericOpcode(Opc: MI.getOpcode()) ||
5045 SIInstrInfo::isSALU(MI) ||
5046 SIInstrInfo::isSMRD(MI))
5047 return false;
5048
5049 return true;
5050}
5051
5052static bool isRegOrFI(const MachineOperand &MO) {
5053 return MO.isReg() || MO.isFI();
5054}
5055
5056static bool isSubRegOf(const SIRegisterInfo &TRI,
5057 const MachineOperand &SuperVec,
5058 const MachineOperand &SubReg) {
5059 if (SubReg.getReg().isPhysical())
5060 return TRI.isSubRegister(RegA: SuperVec.getReg(), RegB: SubReg.getReg());
5061
5062 return SubReg.getSubReg() != AMDGPU::NoSubRegister &&
5063 SubReg.getReg() == SuperVec.getReg();
5064}
5065
5066// Verify the illegal copy from vector register to SGPR for generic opcode COPY
5067bool SIInstrInfo::verifyCopy(const MachineInstr &MI,
5068 const MachineRegisterInfo &MRI,
5069 StringRef &ErrInfo) const {
5070 Register DstReg = MI.getOperand(i: 0).getReg();
5071 Register SrcReg = MI.getOperand(i: 1).getReg();
5072 // This is a check for copy from vector register to SGPR
5073 if (RI.isVectorRegister(MRI, Reg: SrcReg) && RI.isSGPRReg(MRI, Reg: DstReg)) {
5074 ErrInfo = "illegal copy from vector register to SGPR";
5075 return false;
5076 }
5077 return true;
5078}
5079
5080bool SIInstrInfo::verifyInstruction(const MachineInstr &MI,
5081 StringRef &ErrInfo) const {
5082 uint32_t Opcode = MI.getOpcode();
5083 const MachineFunction *MF = MI.getMF();
5084 const MachineRegisterInfo &MRI = MF->getRegInfo();
5085
5086 // FIXME: At this point the COPY verify is done only for non-ssa forms.
5087 // Find a better property to recognize the point where instruction selection
5088 // is just done.
5089 // We can only enforce this check after SIFixSGPRCopies pass so that the
5090 // illegal copies are legalized and thereafter we don't expect a pass
5091 // inserting similar copies.
5092 if (!MRI.isSSA() && MI.isCopy())
5093 return verifyCopy(MI, MRI, ErrInfo);
5094
5095 if (SIInstrInfo::isGenericOpcode(Opc: Opcode))
5096 return true;
5097
5098 int Src0Idx = AMDGPU::getNamedOperandIdx(Opcode, Name: AMDGPU::OpName::src0);
5099 int Src1Idx = AMDGPU::getNamedOperandIdx(Opcode, Name: AMDGPU::OpName::src1);
5100 int Src2Idx = AMDGPU::getNamedOperandIdx(Opcode, Name: AMDGPU::OpName::src2);
5101 int Src3Idx = -1;
5102 if (Src0Idx == -1) {
5103 // VOPD V_DUAL_* instructions use different operand names.
5104 Src0Idx = AMDGPU::getNamedOperandIdx(Opcode, Name: AMDGPU::OpName::src0X);
5105 Src1Idx = AMDGPU::getNamedOperandIdx(Opcode, Name: AMDGPU::OpName::vsrc1X);
5106 Src2Idx = AMDGPU::getNamedOperandIdx(Opcode, Name: AMDGPU::OpName::src0Y);
5107 Src3Idx = AMDGPU::getNamedOperandIdx(Opcode, Name: AMDGPU::OpName::vsrc1Y);
5108 }
5109
5110 // Make sure the number of operands is correct.
5111 const MCInstrDesc &Desc = get(Opcode);
5112 if (!Desc.isVariadic() &&
5113 Desc.getNumOperands() != MI.getNumExplicitOperands()) {
5114 ErrInfo = "Instruction has wrong number of operands.";
5115 return false;
5116 }
5117
5118 if (MI.isInlineAsm()) {
5119 // Verify register classes for inlineasm constraints.
5120 for (unsigned I = InlineAsm::MIOp_FirstOperand, E = MI.getNumOperands();
5121 I != E; ++I) {
5122 const TargetRegisterClass *RC = MI.getRegClassConstraint(OpIdx: I, TII: this, TRI: &RI);
5123 if (!RC)
5124 continue;
5125
5126 const MachineOperand &Op = MI.getOperand(i: I);
5127 if (!Op.isReg())
5128 continue;
5129
5130 Register Reg = Op.getReg();
5131 if (!Reg.isVirtual() && !RC->contains(Reg)) {
5132 ErrInfo = "inlineasm operand has incorrect register class.";
5133 return false;
5134 }
5135 }
5136
5137 return true;
5138 }
5139
5140 if (isImage(MI) && MI.memoperands_empty() && MI.mayLoadOrStore()) {
5141 ErrInfo = "missing memory operand from image instruction.";
5142 return false;
5143 }
5144
5145 // Make sure the register classes are correct.
5146 for (int i = 0, e = Desc.getNumOperands(); i != e; ++i) {
5147 const MachineOperand &MO = MI.getOperand(i);
5148 if (MO.isFPImm()) {
5149 ErrInfo = "FPImm Machine Operands are not supported. ISel should bitcast "
5150 "all fp values to integers.";
5151 return false;
5152 }
5153
5154 const MCOperandInfo &OpInfo = Desc.operands()[i];
5155 int16_t RegClass = getOpRegClassID(OpInfo);
5156
5157 switch (OpInfo.OperandType) {
5158 case MCOI::OPERAND_REGISTER:
5159 if (MI.getOperand(i).isImm() || MI.getOperand(i).isGlobal()) {
5160 ErrInfo = "Illegal immediate value for operand.";
5161 return false;
5162 }
5163 break;
5164 case AMDGPU::OPERAND_REG_IMM_INT32:
5165 case AMDGPU::OPERAND_REG_IMM_INT16:
5166 case AMDGPU::OPERAND_REG_IMM_FP32:
5167 case AMDGPU::OPERAND_REG_IMM_BF16:
5168 case AMDGPU::OPERAND_REG_IMM_FP16:
5169 case AMDGPU::OPERAND_REG_IMM_V2FP16:
5170 case AMDGPU::OPERAND_REG_IMM_V2FP16_SPLAT:
5171 case AMDGPU::OPERAND_REG_IMM_V2INT16:
5172 case AMDGPU::OPERAND_REG_IMM_V2BF16:
5173 case AMDGPU::OPERAND_REG_IMM_V2FP64:
5174 case AMDGPU::OPERAND_REG_IMM_V2INT64:
5175 break;
5176 case AMDGPU::OPERAND_REG_IMM_NOINLINE_V2FP16:
5177 break;
5178 break;
5179 case AMDGPU::OPERAND_REG_INLINE_C_INT16:
5180 case AMDGPU::OPERAND_REG_INLINE_C_INT32:
5181 case AMDGPU::OPERAND_REG_INLINE_C_INT64:
5182 case AMDGPU::OPERAND_REG_INLINE_C_FP32:
5183 case AMDGPU::OPERAND_REG_INLINE_C_FP64:
5184 case AMDGPU::OPERAND_REG_INLINE_C_BF16:
5185 case AMDGPU::OPERAND_REG_INLINE_C_FP16:
5186 case AMDGPU::OPERAND_REG_INLINE_C_V2INT16:
5187 case AMDGPU::OPERAND_REG_INLINE_C_V2BF16:
5188 case AMDGPU::OPERAND_REG_INLINE_C_V2FP16:
5189 case AMDGPU::OPERAND_REG_INLINE_AC_INT32:
5190 case AMDGPU::OPERAND_REG_INLINE_AC_FP32:
5191 case AMDGPU::OPERAND_REG_INLINE_AC_FP64: {
5192 if (!MO.isReg() && (!MO.isImm() || !isInlineConstant(MI, OpIdx: i))) {
5193 ErrInfo = "Illegal immediate value for operand.";
5194 return false;
5195 }
5196 break;
5197 }
5198 case AMDGPU::OPERAND_REG_IMM_FP64:
5199 case AMDGPU::OPERAND_REG_IMM_INT64:
5200 case AMDGPU::OPERAND_REG_IMM_V2INT32:
5201 case AMDGPU::OPERAND_REG_IMM_V2FP32:
5202 if (ST.has64BitLiterals() && Desc.getSize() != 4 && MO.isImm() &&
5203 !isInlineConstant(MI, OpIdx: i) &&
5204 !AMDGPU::isValid32BitLiteral(Val: MO.getImm(),
5205 IsFP64: OpInfo.OperandType ==
5206 AMDGPU::OPERAND_REG_IMM_FP64)) {
5207 ErrInfo = "illegal 64-bit immediate value for operand.";
5208 return false;
5209 }
5210 break;
5211 case AMDGPU::OPERAND_INLINE_SPLIT_BARRIER_INT32:
5212 case AMDGPU::OPERAND_INPUT_MODS:
5213 if (!MI.getOperand(i).isImm() || !isInlineConstant(MI, OpIdx: i)) {
5214 ErrInfo = "Expected inline constant for operand.";
5215 return false;
5216 }
5217 break;
5218 case AMDGPU::OPERAND_SDWA_VOPC_DST:
5219 case AMDGPU::OPERAND_KIMM16:
5220 break;
5221 case MCOI::OPERAND_IMMEDIATE:
5222 case AMDGPU::OPERAND_KIMM32:
5223 case AMDGPU::OPERAND_KIMM64:
5224 case AMDGPU::OPERAND_INLINE_C_AV64_PSEUDO:
5225 // Check if this operand is an immediate.
5226 // FrameIndex operands will be replaced by immediates, so they are
5227 // allowed.
5228 if (!MI.getOperand(i).isImm() && !MI.getOperand(i).isFI()) {
5229 ErrInfo = "Expected immediate, but got non-immediate";
5230 return false;
5231 }
5232 break;
5233 case MCOI::OPERAND_UNKNOWN:
5234 case MCOI::OPERAND_MEMORY:
5235 case MCOI::OPERAND_PCREL:
5236 break;
5237 default:
5238 if (OpInfo.isGenericType())
5239 continue;
5240 break;
5241 }
5242
5243 if (!MO.isReg())
5244 continue;
5245 Register Reg = MO.getReg();
5246 if (!Reg)
5247 continue;
5248
5249 // FIXME: Ideally we would have separate instruction definitions with the
5250 // aligned register constraint.
5251 // FIXME: We do not verify inline asm operands, but custom inline asm
5252 // verification is broken anyway
5253 if (ST.needsAlignedVGPRs() && Opcode != AMDGPU::AV_MOV_B64_IMM_PSEUDO &&
5254 Opcode != AMDGPU::V_MOV_B64_PSEUDO && !isSpill(MI)) {
5255 const TargetRegisterClass *RC = RI.getRegClassForReg(MRI, Reg);
5256 if (RI.hasVectorRegisters(RC) && MO.getSubReg()) {
5257 if (const TargetRegisterClass *SubRC =
5258 RI.getSubRegisterClass(RC, MO.getSubReg())) {
5259 RC = RI.getCompatibleSubRegClass(SuperRC: RC, SubRC, SubIdx: MO.getSubReg());
5260 if (RC)
5261 RC = SubRC;
5262 }
5263 }
5264
5265 // Check that this is the aligned version of the class.
5266 if (!RC || !RI.isProperlyAlignedRC(RC: *RC)) {
5267 ErrInfo = "Subtarget requires even aligned vector registers";
5268 return false;
5269 }
5270 }
5271
5272 if (RegClass != -1) {
5273 if (Reg.isVirtual())
5274 continue;
5275
5276 const TargetRegisterClass *RC = RI.getRegClass(i: RegClass);
5277 if (!RC->contains(Reg)) {
5278 ErrInfo = "Operand has incorrect register class.";
5279 return false;
5280 }
5281 }
5282 }
5283
5284 // Verify SDWA
5285 if (isSDWA(MI)) {
5286 if (!ST.hasSDWA()) {
5287 ErrInfo = "SDWA is not supported on this target";
5288 return false;
5289 }
5290
5291 for (auto Op : {AMDGPU::OpName::src0_sel, AMDGPU::OpName::src1_sel,
5292 AMDGPU::OpName::dst_sel}) {
5293 const MachineOperand *MO = getNamedOperand(MI, OperandName: Op);
5294 if (!MO)
5295 continue;
5296 int64_t Imm = MO->getImm();
5297 if (Imm < 0 || Imm > AMDGPU::SDWA::SdwaSel::DWORD) {
5298 ErrInfo = "Invalid SDWA selection";
5299 return false;
5300 }
5301 }
5302
5303 int DstIdx = AMDGPU::getNamedOperandIdx(Opcode, Name: AMDGPU::OpName::vdst);
5304
5305 for (int OpIdx : {DstIdx, Src0Idx, Src1Idx, Src2Idx}) {
5306 if (OpIdx == -1)
5307 continue;
5308 const MachineOperand &MO = MI.getOperand(i: OpIdx);
5309
5310 if (!ST.hasSDWAScalar()) {
5311 // Only VGPRS on VI
5312 if (!MO.isReg() || !RI.hasVGPRs(RC: RI.getRegClassForReg(MRI, Reg: MO.getReg()))) {
5313 ErrInfo = "Only VGPRs allowed as operands in SDWA instructions on VI";
5314 return false;
5315 }
5316 } else {
5317 // No immediates on GFX9
5318 if (!MO.isReg()) {
5319 ErrInfo =
5320 "Only reg allowed as operands in SDWA instructions on GFX9+";
5321 return false;
5322 }
5323 }
5324 }
5325
5326 if (!ST.hasSDWAOmod()) {
5327 // No omod allowed on VI
5328 const MachineOperand *OMod = getNamedOperand(MI, OperandName: AMDGPU::OpName::omod);
5329 if (OMod != nullptr &&
5330 (!OMod->isImm() || OMod->getImm() != 0)) {
5331 ErrInfo = "OMod not allowed in SDWA instructions on VI";
5332 return false;
5333 }
5334 }
5335
5336 if (Opcode == AMDGPU::V_CVT_F32_FP8_sdwa ||
5337 Opcode == AMDGPU::V_CVT_F32_BF8_sdwa ||
5338 Opcode == AMDGPU::V_CVT_PK_F32_FP8_sdwa ||
5339 Opcode == AMDGPU::V_CVT_PK_F32_BF8_sdwa) {
5340 const MachineOperand *Src0ModsMO =
5341 getNamedOperand(MI, OperandName: AMDGPU::OpName::src0_modifiers);
5342 unsigned Mods = Src0ModsMO->getImm();
5343 if (Mods & SISrcMods::ABS || Mods & SISrcMods::NEG ||
5344 Mods & SISrcMods::SEXT) {
5345 ErrInfo = "sext, abs and neg are not allowed on this instruction";
5346 return false;
5347 }
5348 }
5349
5350 uint32_t BasicOpcode = AMDGPU::getBasicFromSDWAOp(Opcode);
5351 if (isVOPC(Opcode: BasicOpcode)) {
5352 if (!ST.hasSDWASdst() && DstIdx != -1) {
5353 // Only vcc allowed as dst on VI for VOPC
5354 const MachineOperand &Dst = MI.getOperand(i: DstIdx);
5355 if (!Dst.isReg() || Dst.getReg() != AMDGPU::VCC) {
5356 ErrInfo = "Only VCC allowed as dst in SDWA instructions on VI";
5357 return false;
5358 }
5359 } else if (!ST.hasSDWAOutModsVOPC()) {
5360 // No clamp allowed on GFX9 for VOPC
5361 const MachineOperand *Clamp = getNamedOperand(MI, OperandName: AMDGPU::OpName::clamp);
5362 if (Clamp && (!Clamp->isImm() || Clamp->getImm() != 0)) {
5363 ErrInfo = "Clamp not allowed in VOPC SDWA instructions on VI";
5364 return false;
5365 }
5366
5367 // No omod allowed on GFX9 for VOPC
5368 const MachineOperand *OMod = getNamedOperand(MI, OperandName: AMDGPU::OpName::omod);
5369 if (OMod && (!OMod->isImm() || OMod->getImm() != 0)) {
5370 ErrInfo = "OMod not allowed in VOPC SDWA instructions on VI";
5371 return false;
5372 }
5373 }
5374 }
5375
5376 const MachineOperand *DstUnused = getNamedOperand(MI, OperandName: AMDGPU::OpName::dst_unused);
5377 if (DstUnused && DstUnused->isImm() &&
5378 DstUnused->getImm() == AMDGPU::SDWA::UNUSED_PRESERVE) {
5379 const MachineOperand &Dst = MI.getOperand(i: DstIdx);
5380 if (!Dst.isReg() || !Dst.isTied()) {
5381 ErrInfo = "Dst register should have tied register";
5382 return false;
5383 }
5384
5385 const MachineOperand &TiedMO =
5386 MI.getOperand(i: MI.findTiedOperandIdx(OpIdx: DstIdx));
5387 if (!TiedMO.isReg() || !TiedMO.isImplicit() || !TiedMO.isUse()) {
5388 ErrInfo =
5389 "Dst register should be tied to implicit use of preserved register";
5390 return false;
5391 }
5392 if (TiedMO.getReg().isPhysical() && Dst.getReg() != TiedMO.getReg()) {
5393 ErrInfo = "Dst register should use same physical register as preserved";
5394 return false;
5395 }
5396 }
5397 }
5398
5399 if (isDPP(MI) && !ST.hasDPPSrc1SGPR() && Src1Idx != -1) {
5400 const MachineOperand &Src1MO = MI.getOperand(i: Src1Idx);
5401 if (Src1MO.isReg() && RI.isSGPRReg(MRI, Reg: Src1MO.getReg())) {
5402 ErrInfo = "DPP src1 cannot be SGPR on this subtarget";
5403 return false;
5404 }
5405 if (Src1MO.isImm()) {
5406 ErrInfo = "DPP src1 cannot be an immediate on this subtarget";
5407 return false;
5408 }
5409 }
5410
5411 // Verify MIMG / VIMAGE / VSAMPLE
5412 if (isImage(Opcode) && !MI.mayStore()) {
5413 // Ensure that the return type used is large enough for all the options
5414 // being used TFE/LWE require an extra result register.
5415 const MachineOperand *DMask = getNamedOperand(MI, OperandName: AMDGPU::OpName::dmask);
5416 if (DMask) {
5417 uint64_t DMaskImm = DMask->getImm();
5418 uint32_t RegCount = isGather4(Opcode) ? 4 : llvm::popcount(Value: DMaskImm);
5419 const MachineOperand *TFE = getNamedOperand(MI, OperandName: AMDGPU::OpName::tfe);
5420 const MachineOperand *LWE = getNamedOperand(MI, OperandName: AMDGPU::OpName::lwe);
5421 const MachineOperand *D16 = getNamedOperand(MI, OperandName: AMDGPU::OpName::d16);
5422
5423 // Adjust for packed 16 bit values
5424 if (D16 && D16->getImm() && !ST.hasUnpackedD16VMem())
5425 RegCount = divideCeil(Numerator: RegCount, Denominator: 2);
5426
5427 // Adjust if using LWE or TFE
5428 if ((LWE && LWE->getImm()) || (TFE && TFE->getImm()))
5429 RegCount += 1;
5430
5431 const uint32_t DstIdx =
5432 AMDGPU::getNamedOperandIdx(Opcode, Name: AMDGPU::OpName::vdata);
5433 const MachineOperand &Dst = MI.getOperand(i: DstIdx);
5434 if (Dst.isReg()) {
5435 const TargetRegisterClass *DstRC = getOpRegClass(MI, OpNo: DstIdx);
5436 uint32_t DstSize = RI.getRegSizeInBits(RC: *DstRC) / 32;
5437 if (RegCount > DstSize) {
5438 ErrInfo = "Image instruction returns too many registers for dst "
5439 "register class";
5440 return false;
5441 }
5442 }
5443 }
5444 }
5445
5446 // Verify VOP*. Ignore multiple sgpr operands on writelane.
5447 if (isVALU(MI, /*AllowLDSDMA=*/true) &&
5448 Desc.getOpcode() != AMDGPU::V_WRITELANE_B32) {
5449 unsigned ConstantBusCount = 0;
5450 bool UsesLiteral = false;
5451 const MachineOperand *LiteralVal = nullptr;
5452
5453 int ImmIdx = AMDGPU::getNamedOperandIdx(Opcode, Name: AMDGPU::OpName::imm);
5454 if (ImmIdx != -1) {
5455 ++ConstantBusCount;
5456 UsesLiteral = true;
5457 LiteralVal = &MI.getOperand(i: ImmIdx);
5458 }
5459
5460 SmallVector<Register, 2> SGPRsUsed;
5461 Register SGPRUsed;
5462
5463 // Only look at the true operands. Only a real operand can use the constant
5464 // bus, and we don't want to check pseudo-operands like the source modifier
5465 // flags.
5466 for (int OpIdx : {Src0Idx, Src1Idx, Src2Idx, Src3Idx}) {
5467 if (OpIdx == -1)
5468 continue;
5469 const MachineOperand &MO = MI.getOperand(i: OpIdx);
5470 if (usesConstantBus(MRI, MO, OpInfo: MI.getDesc().operands()[OpIdx])) {
5471 if (MO.isReg()) {
5472 SGPRUsed = MO.getReg();
5473 if (!llvm::is_contained(Range&: SGPRsUsed, Element: SGPRUsed)) {
5474 ++ConstantBusCount;
5475 SGPRsUsed.push_back(Elt: SGPRUsed);
5476 }
5477 } else if (!MO.isFI()) { // Treat FI like a register.
5478 if (!UsesLiteral) {
5479 ++ConstantBusCount;
5480 UsesLiteral = true;
5481 LiteralVal = &MO;
5482 } else if (!MO.isIdenticalTo(Other: *LiteralVal)) {
5483 assert(isVOP2(MI) || isVOP3(MI));
5484 ErrInfo = "VOP2/VOP3 instruction uses more than one literal";
5485 return false;
5486 }
5487 }
5488 }
5489 }
5490
5491 SGPRUsed = findImplicitSGPRRead(MI);
5492 if (SGPRUsed) {
5493 // Implicit uses may safely overlap true operands
5494 if (llvm::all_of(Range&: SGPRsUsed, P: [this, SGPRUsed](unsigned SGPR) {
5495 return !RI.regsOverlap(RegA: SGPRUsed, RegB: SGPR);
5496 })) {
5497 ++ConstantBusCount;
5498 SGPRsUsed.push_back(Elt: SGPRUsed);
5499 }
5500 }
5501
5502 // v_writelane_b32 is an exception from constant bus restriction:
5503 // vsrc0 can be sgpr, const or m0 and lane select sgpr, m0 or inline-const
5504 if (ConstantBusCount > ST.getConstantBusLimit(Opcode) &&
5505 Opcode != AMDGPU::V_WRITELANE_B32) {
5506 ErrInfo = "VOP* instruction violates constant bus restriction";
5507 return false;
5508 }
5509
5510 if (isVOP3(MI) && UsesLiteral && !ST.hasVOP3Literal()) {
5511 ErrInfo = "VOP3 instruction uses literal";
5512 return false;
5513 }
5514 }
5515
5516 // Special case for writelane - this can break the multiple constant bus rule,
5517 // but still can't use more than one SGPR register
5518 if (Desc.getOpcode() == AMDGPU::V_WRITELANE_B32) {
5519 unsigned SGPRCount = 0;
5520 Register SGPRUsed;
5521
5522 for (int OpIdx : {Src0Idx, Src1Idx}) {
5523 if (OpIdx == -1)
5524 break;
5525
5526 const MachineOperand &MO = MI.getOperand(i: OpIdx);
5527
5528 if (usesConstantBus(MRI, MO, OpInfo: MI.getDesc().operands()[OpIdx])) {
5529 if (MO.isReg() && MO.getReg() != AMDGPU::M0) {
5530 if (MO.getReg() != SGPRUsed)
5531 ++SGPRCount;
5532 SGPRUsed = MO.getReg();
5533 }
5534 }
5535 if (SGPRCount > ST.getConstantBusLimit(Opcode)) {
5536 ErrInfo = "WRITELANE instruction violates constant bus restriction";
5537 return false;
5538 }
5539 }
5540 }
5541
5542 // Verify misc. restrictions on specific instructions.
5543 if (Desc.getOpcode() == AMDGPU::V_DIV_SCALE_F32_e64 ||
5544 Desc.getOpcode() == AMDGPU::V_DIV_SCALE_F64_e64) {
5545 const MachineOperand &Src0 = MI.getOperand(i: Src0Idx);
5546 const MachineOperand &Src1 = MI.getOperand(i: Src1Idx);
5547 const MachineOperand &Src2 = MI.getOperand(i: Src2Idx);
5548 if (Src0.isReg() && Src1.isReg() && Src2.isReg()) {
5549 if (!compareMachineOp(Op0: Src0, Op1: Src1) &&
5550 !compareMachineOp(Op0: Src0, Op1: Src2)) {
5551 ErrInfo = "v_div_scale_{f32|f64} require src0 = src1 or src2";
5552 return false;
5553 }
5554 }
5555 if ((getNamedOperand(MI, OperandName: AMDGPU::OpName::src0_modifiers)->getImm() &
5556 SISrcMods::ABS) ||
5557 (getNamedOperand(MI, OperandName: AMDGPU::OpName::src1_modifiers)->getImm() &
5558 SISrcMods::ABS) ||
5559 (getNamedOperand(MI, OperandName: AMDGPU::OpName::src2_modifiers)->getImm() &
5560 SISrcMods::ABS)) {
5561 ErrInfo = "ABS not allowed in VOP3B instructions";
5562 return false;
5563 }
5564 }
5565
5566 if (isSOP2(MI) || isSOPC(MI)) {
5567 const MachineOperand &Src0 = MI.getOperand(i: Src0Idx);
5568 const MachineOperand &Src1 = MI.getOperand(i: Src1Idx);
5569
5570 if (!isRegOrFI(MO: Src0) && !isRegOrFI(MO: Src1) &&
5571 !isInlineConstant(MO: Src0, OpInfo: Desc.operands()[Src0Idx]) &&
5572 !isInlineConstant(MO: Src1, OpInfo: Desc.operands()[Src1Idx]) &&
5573 !Src0.isIdenticalTo(Other: Src1)) {
5574 ErrInfo = "SOP2/SOPC instruction requires too many immediate constants";
5575 return false;
5576 }
5577 }
5578
5579 if (isSOPK(MI)) {
5580 const auto *Op = getNamedOperand(MI, OperandName: AMDGPU::OpName::simm16);
5581 if (Desc.isBranch()) {
5582 if (!Op->isMBB()) {
5583 ErrInfo = "invalid branch target for SOPK instruction";
5584 return false;
5585 }
5586 } else {
5587 uint64_t Imm = Op->getImm();
5588 if (sopkIsZext(Opcode)) {
5589 if (!isUInt<16>(x: Imm)) {
5590 ErrInfo = "invalid immediate for SOPK instruction";
5591 return false;
5592 }
5593 } else {
5594 if (!isInt<16>(x: Imm)) {
5595 ErrInfo = "invalid immediate for SOPK instruction";
5596 return false;
5597 }
5598 }
5599 }
5600 }
5601
5602 if (Desc.getOpcode() == AMDGPU::V_MOVRELS_B32_e32 ||
5603 Desc.getOpcode() == AMDGPU::V_MOVRELS_B32_e64 ||
5604 Desc.getOpcode() == AMDGPU::V_MOVRELD_B32_e32 ||
5605 Desc.getOpcode() == AMDGPU::V_MOVRELD_B32_e64) {
5606 const bool IsDst = Desc.getOpcode() == AMDGPU::V_MOVRELD_B32_e32 ||
5607 Desc.getOpcode() == AMDGPU::V_MOVRELD_B32_e64;
5608
5609 const unsigned StaticNumOps =
5610 Desc.getNumOperands() + Desc.implicit_uses().size();
5611 const unsigned NumImplicitOps = IsDst ? 2 : 1;
5612
5613 // Require additional implicit operands. This allows a fixup done by the
5614 // post RA scheduler where the main implicit operand is killed and
5615 // implicit-defs are added for sub-registers that remain live after this
5616 // instruction.
5617 if (MI.getNumOperands() < StaticNumOps + NumImplicitOps) {
5618 ErrInfo = "missing implicit register operands";
5619 return false;
5620 }
5621
5622 const MachineOperand *Dst = getNamedOperand(MI, OperandName: AMDGPU::OpName::vdst);
5623 if (IsDst) {
5624 if (!Dst->isUse()) {
5625 ErrInfo = "v_movreld_b32 vdst should be a use operand";
5626 return false;
5627 }
5628
5629 unsigned UseOpIdx;
5630 if (!MI.isRegTiedToUseOperand(DefOpIdx: StaticNumOps, UseOpIdx: &UseOpIdx) ||
5631 UseOpIdx != StaticNumOps + 1) {
5632 ErrInfo = "movrel implicit operands should be tied";
5633 return false;
5634 }
5635 }
5636
5637 const MachineOperand &Src0 = MI.getOperand(i: Src0Idx);
5638 const MachineOperand &ImpUse
5639 = MI.getOperand(i: StaticNumOps + NumImplicitOps - 1);
5640 if (!ImpUse.isReg() || !ImpUse.isUse() ||
5641 !isSubRegOf(TRI: RI, SuperVec: ImpUse, SubReg: IsDst ? *Dst : Src0)) {
5642 ErrInfo = "src0 should be subreg of implicit vector use";
5643 return false;
5644 }
5645 }
5646
5647 // Make sure we aren't losing exec uses in the td files. This mostly requires
5648 // being careful when using let Uses to try to add other use registers.
5649 if (shouldReadExec(MI)) {
5650 if (!MI.hasRegisterImplicitUseOperand(Reg: AMDGPU::EXEC)) {
5651 ErrInfo = "VALU instruction does not implicitly read exec mask";
5652 return false;
5653 }
5654 }
5655
5656 if (isSMRD(MI)) {
5657 if (MI.mayStore() &&
5658 ST.getGeneration() == AMDGPUSubtarget::VOLCANIC_ISLANDS) {
5659 // The register offset form of scalar stores may only use m0 as the
5660 // soffset register.
5661 const MachineOperand *Soff = getNamedOperand(MI, OperandName: AMDGPU::OpName::soffset);
5662 if (Soff && Soff->getReg() != AMDGPU::M0) {
5663 ErrInfo = "scalar stores must use m0 as offset register";
5664 return false;
5665 }
5666 }
5667 }
5668
5669 if (isFLAT(MI) && !ST.hasFlatInstOffsets()) {
5670 const MachineOperand *Offset = getNamedOperand(MI, OperandName: AMDGPU::OpName::offset);
5671 if (Offset->getImm() != 0) {
5672 ErrInfo = "subtarget does not support offsets in flat instructions";
5673 return false;
5674 }
5675 }
5676
5677 if (isDS(MI) && !ST.hasGDS()) {
5678 const MachineOperand *GDSOp = getNamedOperand(MI, OperandName: AMDGPU::OpName::gds);
5679 if (GDSOp && GDSOp->getImm() != 0) {
5680 ErrInfo = "GDS is not supported on this subtarget";
5681 return false;
5682 }
5683 }
5684
5685 if (isImage(MI)) {
5686 const MachineOperand *DimOp = getNamedOperand(MI, OperandName: AMDGPU::OpName::dim);
5687 if (DimOp) {
5688 int VAddr0Idx = AMDGPU::getNamedOperandIdx(Opcode,
5689 Name: AMDGPU::OpName::vaddr0);
5690 AMDGPU::OpName RSrcOpName =
5691 isMIMG(MI) ? AMDGPU::OpName::srsrc : AMDGPU::OpName::rsrc;
5692 int RsrcIdx = AMDGPU::getNamedOperandIdx(Opcode, Name: RSrcOpName);
5693 const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(Opc: Opcode);
5694 const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
5695 AMDGPU::getMIMGBaseOpcodeInfo(BaseOpcode: Info->BaseOpcode);
5696 const AMDGPU::MIMGDimInfo *Dim =
5697 AMDGPU::getMIMGDimInfoByEncoding(DimEnc: DimOp->getImm());
5698
5699 if (!Dim) {
5700 ErrInfo = "dim is out of range";
5701 return false;
5702 }
5703
5704 bool IsA16 = false;
5705 if (ST.hasR128A16()) {
5706 const MachineOperand *R128A16 = getNamedOperand(MI, OperandName: AMDGPU::OpName::r128);
5707 IsA16 = R128A16->getImm() != 0;
5708 } else if (ST.hasA16()) {
5709 const MachineOperand *A16 = getNamedOperand(MI, OperandName: AMDGPU::OpName::a16);
5710 IsA16 = A16->getImm() != 0;
5711 }
5712
5713 bool IsNSA = RsrcIdx - VAddr0Idx > 1;
5714
5715 unsigned AddrWords =
5716 AMDGPU::getAddrSizeMIMGOp(BaseOpcode, Dim, IsA16, IsG16Supported: ST.hasG16());
5717
5718 unsigned VAddrWords;
5719 if (IsNSA) {
5720 VAddrWords = RsrcIdx - VAddr0Idx;
5721 if (ST.hasPartialNSAEncoding() &&
5722 AddrWords > ST.getNSAMaxSize(HasSampler: isVSAMPLE(MI))) {
5723 unsigned LastVAddrIdx = RsrcIdx - 1;
5724 VAddrWords += getOpSize(MI, OpNo: LastVAddrIdx) / 4 - 1;
5725 }
5726 } else {
5727 VAddrWords = getOpSize(MI, OpNo: VAddr0Idx) / 4;
5728 if (AddrWords > 12)
5729 AddrWords = 16;
5730 }
5731
5732 if (VAddrWords != AddrWords) {
5733 LLVM_DEBUG(dbgs() << "bad vaddr size, expected " << AddrWords
5734 << " but got " << VAddrWords << "\n");
5735 ErrInfo = "bad vaddr size";
5736 return false;
5737 }
5738 }
5739 }
5740
5741 const MachineOperand *DppCt = getNamedOperand(MI, OperandName: AMDGPU::OpName::dpp_ctrl);
5742 if (DppCt) {
5743 using namespace AMDGPU::DPP;
5744
5745 unsigned DC = DppCt->getImm();
5746 if (DC == DppCtrl::DPP_UNUSED1 || DC == DppCtrl::DPP_UNUSED2 ||
5747 DC == DppCtrl::DPP_UNUSED3 || DC > DppCtrl::DPP_LAST ||
5748 (DC >= DppCtrl::DPP_UNUSED4_FIRST && DC <= DppCtrl::DPP_UNUSED4_LAST) ||
5749 (DC >= DppCtrl::DPP_UNUSED5_FIRST && DC <= DppCtrl::DPP_UNUSED5_LAST) ||
5750 (DC >= DppCtrl::DPP_UNUSED6_FIRST && DC <= DppCtrl::DPP_UNUSED6_LAST) ||
5751 (DC >= DppCtrl::DPP_UNUSED7_FIRST && DC <= DppCtrl::DPP_UNUSED7_LAST) ||
5752 (DC >= DppCtrl::DPP_UNUSED8_FIRST && DC <= DppCtrl::DPP_UNUSED8_LAST)) {
5753 ErrInfo = "Invalid dpp_ctrl value";
5754 return false;
5755 }
5756 if (DC >= DppCtrl::WAVE_SHL1 && DC <= DppCtrl::WAVE_ROR1 &&
5757 !ST.hasDPPWavefrontShifts()) {
5758 ErrInfo = "Invalid dpp_ctrl value: "
5759 "wavefront shifts are not supported on GFX10+";
5760 return false;
5761 }
5762 if (DC >= DppCtrl::BCAST15 && DC <= DppCtrl::BCAST31 &&
5763 !ST.hasDPPBroadcasts()) {
5764 ErrInfo = "Invalid dpp_ctrl value: "
5765 "broadcasts are not supported on GFX10+";
5766 return false;
5767 }
5768 if (DC >= DppCtrl::ROW_SHARE_FIRST && DC <= DppCtrl::ROW_XMASK_LAST &&
5769 ST.getGeneration() < AMDGPUSubtarget::GFX10) {
5770 if (DC >= DppCtrl::ROW_NEWBCAST_FIRST &&
5771 DC <= DppCtrl::ROW_NEWBCAST_LAST &&
5772 !ST.hasGFX90AInsts()) {
5773 ErrInfo = "Invalid dpp_ctrl value: "
5774 "row_newbroadcast/row_share is not supported before "
5775 "GFX90A/GFX10";
5776 return false;
5777 }
5778 if (DC > DppCtrl::ROW_NEWBCAST_LAST || !ST.hasGFX90AInsts()) {
5779 ErrInfo = "Invalid dpp_ctrl value: "
5780 "row_share and row_xmask are not supported before GFX10";
5781 return false;
5782 }
5783 }
5784
5785 if (Opcode != AMDGPU::V_MOV_B64_DPP_PSEUDO &&
5786 !AMDGPU::isLegalDPALU_DPPControl(ST, DC) &&
5787 AMDGPU::isDPALU_DPP(OpDesc: Desc, MII: *this, ST)) {
5788 ErrInfo = "Invalid dpp_ctrl value: "
5789 "DP ALU dpp only support row_newbcast";
5790 return false;
5791 }
5792 }
5793
5794 if ((MI.mayStore() || MI.mayLoad()) && !isVGPRSpill(MI)) {
5795 const MachineOperand *Dst = getNamedOperand(MI, OperandName: AMDGPU::OpName::vdst);
5796 AMDGPU::OpName DataName =
5797 isDS(Opcode) ? AMDGPU::OpName::data0 : AMDGPU::OpName::vdata;
5798 const MachineOperand *Data = getNamedOperand(MI, OperandName: DataName);
5799 const MachineOperand *Data2 = getNamedOperand(MI, OperandName: AMDGPU::OpName::data1);
5800 if (Data && !Data->isReg())
5801 Data = nullptr;
5802
5803 if (ST.hasGFX90AInsts()) {
5804 if (Dst && Data && !Dst->isTied() && !Data->isTied() &&
5805 (RI.isAGPR(MRI, Reg: Dst->getReg()) != RI.isAGPR(MRI, Reg: Data->getReg()))) {
5806 ErrInfo = "Invalid register class: "
5807 "vdata and vdst should be both VGPR or AGPR";
5808 return false;
5809 }
5810 if (Data && Data2 &&
5811 (RI.isAGPR(MRI, Reg: Data->getReg()) != RI.isAGPR(MRI, Reg: Data2->getReg()))) {
5812 ErrInfo = "Invalid register class: "
5813 "both data operands should be VGPR or AGPR";
5814 return false;
5815 }
5816 } else {
5817 if ((Dst && RI.isAGPR(MRI, Reg: Dst->getReg())) ||
5818 (Data && RI.isAGPR(MRI, Reg: Data->getReg())) ||
5819 (Data2 && RI.isAGPR(MRI, Reg: Data2->getReg()))) {
5820 ErrInfo = "Invalid register class: "
5821 "agpr loads and stores not supported on this GPU";
5822 return false;
5823 }
5824 }
5825 }
5826
5827 if (ST.needsAlignedVGPRs()) {
5828 const auto isAlignedReg = [&MI, &MRI, this](AMDGPU::OpName OpName) -> bool {
5829 const MachineOperand *Op = getNamedOperand(MI, OperandName: OpName);
5830 if (!Op)
5831 return true;
5832 Register Reg = Op->getReg();
5833 if (Reg.isPhysical())
5834 return !(RI.getHWRegIndex(Reg) & 1);
5835 const TargetRegisterClass &RC = *MRI.getRegClass(Reg);
5836 return RI.getRegSizeInBits(RC) > 32 && RI.isProperlyAlignedRC(RC) &&
5837 !(RI.getChannelFromSubReg(SubReg: Op->getSubReg()) & 1);
5838 };
5839
5840 if (Opcode == AMDGPU::DS_GWS_INIT || Opcode == AMDGPU::DS_GWS_SEMA_BR ||
5841 Opcode == AMDGPU::DS_GWS_BARRIER) {
5842
5843 if (!isAlignedReg(AMDGPU::OpName::data0)) {
5844 ErrInfo = "Subtarget requires even aligned vector registers "
5845 "for DS_GWS instructions";
5846 return false;
5847 }
5848 }
5849
5850 if (isMIMG(MI)) {
5851 if (!isAlignedReg(AMDGPU::OpName::vaddr)) {
5852 ErrInfo = "Subtarget requires even aligned vector registers "
5853 "for vaddr operand of image instructions";
5854 return false;
5855 }
5856 }
5857 }
5858
5859 if (Opcode == AMDGPU::V_ACCVGPR_WRITE_B32_e64 && !ST.hasGFX90AInsts()) {
5860 const MachineOperand *Src = getNamedOperand(MI, OperandName: AMDGPU::OpName::src0);
5861 if (Src->isReg() && RI.isSGPRReg(MRI, Reg: Src->getReg())) {
5862 ErrInfo = "Invalid register class: "
5863 "v_accvgpr_write with an SGPR is not supported on this GPU";
5864 return false;
5865 }
5866 }
5867
5868 if (Desc.getOpcode() == AMDGPU::G_AMDGPU_WAVE_ADDRESS) {
5869 const MachineOperand &SrcOp = MI.getOperand(i: 1);
5870 if (!SrcOp.isReg() || SrcOp.getReg().isVirtual()) {
5871 ErrInfo = "pseudo expects only physical SGPRs";
5872 return false;
5873 }
5874 }
5875
5876 if (const MachineOperand *CPol = getNamedOperand(MI, OperandName: AMDGPU::OpName::cpol)) {
5877 if (CPol->getImm() & AMDGPU::CPol::SCAL) {
5878 if (!ST.hasScaleOffset()) {
5879 ErrInfo = "Subtarget does not support offset scaling";
5880 return false;
5881 }
5882 if (!AMDGPU::supportsScaleOffset(MII: *this, Opcode: MI.getOpcode())) {
5883 ErrInfo = "Instruction does not support offset scaling";
5884 return false;
5885 }
5886 }
5887 }
5888
5889 // See SIInstrInfo::isLegalGFX12PlusPackedMathFP32or64BitOperand for more
5890 // information.
5891 if (AMDGPU::isPackedFP32or64BitInst(Opc: Opcode) && AMDGPU::isGFX12Plus(STI: ST)) {
5892 for (unsigned I = 0; I < 3; ++I) {
5893 if (!isLegalGFX12PlusPackedMathFP32or64BitOperand(MRI, MI, SrcN: I))
5894 return false;
5895 }
5896 }
5897
5898 if (ST.hasFlatScratchHiInB64InstHazard() && isSALU(MI) &&
5899 MI.readsRegister(Reg: AMDGPU::SRC_FLAT_SCRATCH_BASE_HI, TRI: nullptr)) {
5900 const MachineOperand *Dst = getNamedOperand(MI, OperandName: AMDGPU::OpName::sdst);
5901 if ((Dst && RI.getRegClassForReg(MRI, Reg: Dst->getReg()) ==
5902 &AMDGPU::SReg_64RegClass) ||
5903 Opcode == AMDGPU::S_BITCMP0_B64 || Opcode == AMDGPU::S_BITCMP1_B64) {
5904 ErrInfo = "Instruction cannot read flat_scratch_base_hi";
5905 return false;
5906 }
5907 }
5908
5909 return true;
5910}
5911
5912unsigned SIInstrInfo::getVALUOp(const MachineInstr &MI) const {
5913 if (MI.getOpcode() == AMDGPU::S_MOV_B32) {
5914 const MachineRegisterInfo &MRI = MI.getMF()->getRegInfo();
5915 return MI.getOperand(i: 1).isReg() || RI.isAGPR(MRI, Reg: MI.getOperand(i: 0).getReg())
5916 ? AMDGPU::COPY
5917 : AMDGPU::V_MOV_B32_e32;
5918 }
5919 return getVALUOp(Opc: MI.getOpcode());
5920}
5921
5922// It is more readable to list mapped opcodes on the same line.
5923// clang-format off
5924
5925unsigned SIInstrInfo::getVALUOp(unsigned Opc) const {
5926 switch (Opc) {
5927 default: return AMDGPU::INSTRUCTION_LIST_END;
5928 case AMDGPU::REG_SEQUENCE: return AMDGPU::REG_SEQUENCE;
5929 case AMDGPU::COPY: return AMDGPU::COPY;
5930 case AMDGPU::PHI: return AMDGPU::PHI;
5931 case AMDGPU::INSERT_SUBREG: return AMDGPU::INSERT_SUBREG;
5932 case AMDGPU::WQM: return AMDGPU::WQM;
5933 case AMDGPU::SOFT_WQM: return AMDGPU::SOFT_WQM;
5934 case AMDGPU::STRICT_WWM: return AMDGPU::STRICT_WWM;
5935 case AMDGPU::STRICT_WQM: return AMDGPU::STRICT_WQM;
5936 case AMDGPU::S_ADD_I32:
5937 return ST.hasAddNoCarryInsts() ? AMDGPU::V_ADD_U32_e64 : AMDGPU::V_ADD_CO_U32_e32;
5938 case AMDGPU::S_ADDC_U32:
5939 return AMDGPU::V_ADDC_U32_e32;
5940 case AMDGPU::S_SUB_I32:
5941 return ST.hasAddNoCarryInsts() ? AMDGPU::V_SUB_U32_e64 : AMDGPU::V_SUB_CO_U32_e32;
5942 // FIXME: These are not consistently handled, and selected when the carry is
5943 // used.
5944 case AMDGPU::S_ADD_U32:
5945 return AMDGPU::V_ADD_CO_U32_e32;
5946 case AMDGPU::S_SUB_U32:
5947 return AMDGPU::V_SUB_CO_U32_e32;
5948 case AMDGPU::S_ADD_U64_PSEUDO:
5949 return AMDGPU::V_ADD_U64_PSEUDO;
5950 case AMDGPU::S_SUB_U64_PSEUDO:
5951 return AMDGPU::V_SUB_U64_PSEUDO;
5952 case AMDGPU::S_SUBB_U32: return AMDGPU::V_SUBB_U32_e32;
5953 case AMDGPU::S_MUL_I32: return AMDGPU::V_MUL_LO_U32_e64;
5954 case AMDGPU::S_MUL_HI_U32: return AMDGPU::V_MUL_HI_U32_e64;
5955 case AMDGPU::S_MUL_HI_I32: return AMDGPU::V_MUL_HI_I32_e64;
5956 case AMDGPU::S_AND_B32: return AMDGPU::V_AND_B32_e64;
5957 case AMDGPU::S_OR_B32: return AMDGPU::V_OR_B32_e64;
5958 case AMDGPU::S_XOR_B32: return AMDGPU::V_XOR_B32_e64;
5959 case AMDGPU::S_XNOR_B32:
5960 return ST.hasDLInsts() ? AMDGPU::V_XNOR_B32_e64 : AMDGPU::INSTRUCTION_LIST_END;
5961 case AMDGPU::S_MIN_I32: return AMDGPU::V_MIN_I32_e64;
5962 case AMDGPU::S_MIN_U32: return AMDGPU::V_MIN_U32_e64;
5963 case AMDGPU::S_MAX_I32: return AMDGPU::V_MAX_I32_e64;
5964 case AMDGPU::S_MAX_U32: return AMDGPU::V_MAX_U32_e64;
5965 case AMDGPU::S_ASHR_I32: return AMDGPU::V_ASHR_I32_e32;
5966 case AMDGPU::S_ASHR_I64: return AMDGPU::V_ASHR_I64_e64;
5967 case AMDGPU::S_LSHL_B32: return AMDGPU::V_LSHL_B32_e32;
5968 case AMDGPU::S_LSHL_B64: return AMDGPU::V_LSHL_B64_e64;
5969 case AMDGPU::S_LSHR_B32: return AMDGPU::V_LSHR_B32_e32;
5970 case AMDGPU::S_LSHR_B64: return AMDGPU::V_LSHR_B64_e64;
5971 case AMDGPU::S_SEXT_I32_I8: return AMDGPU::V_BFE_I32_e64;
5972 case AMDGPU::S_SEXT_I32_I16: return AMDGPU::V_BFE_I32_e64;
5973 case AMDGPU::S_BFE_U32: return AMDGPU::V_BFE_U32_e64;
5974 case AMDGPU::S_BFE_I32: return AMDGPU::V_BFE_I32_e64;
5975 case AMDGPU::S_BFM_B32: return AMDGPU::V_BFM_B32_e64;
5976 case AMDGPU::S_BREV_B32: return AMDGPU::V_BFREV_B32_e32;
5977 case AMDGPU::S_NOT_B32: return AMDGPU::V_NOT_B32_e32;
5978 case AMDGPU::S_NOT_B64: return AMDGPU::V_NOT_B32_e32;
5979 case AMDGPU::S_CMP_EQ_I32: return AMDGPU::V_CMP_EQ_I32_e64;
5980 case AMDGPU::S_CMP_LG_I32: return AMDGPU::V_CMP_NE_I32_e64;
5981 case AMDGPU::S_CMP_GT_I32: return AMDGPU::V_CMP_GT_I32_e64;
5982 case AMDGPU::S_CMP_GE_I32: return AMDGPU::V_CMP_GE_I32_e64;
5983 case AMDGPU::S_CMP_LT_I32: return AMDGPU::V_CMP_LT_I32_e64;
5984 case AMDGPU::S_CMP_LE_I32: return AMDGPU::V_CMP_LE_I32_e64;
5985 case AMDGPU::S_CMP_EQ_U32: return AMDGPU::V_CMP_EQ_U32_e64;
5986 case AMDGPU::S_CMP_LG_U32: return AMDGPU::V_CMP_NE_U32_e64;
5987 case AMDGPU::S_CMP_GT_U32: return AMDGPU::V_CMP_GT_U32_e64;
5988 case AMDGPU::S_CMP_GE_U32: return AMDGPU::V_CMP_GE_U32_e64;
5989 case AMDGPU::S_CMP_LT_U32: return AMDGPU::V_CMP_LT_U32_e64;
5990 case AMDGPU::S_CMP_LE_U32: return AMDGPU::V_CMP_LE_U32_e64;
5991 case AMDGPU::S_CMP_EQ_U64: return AMDGPU::V_CMP_EQ_U64_e64;
5992 case AMDGPU::S_CMP_LG_U64: return AMDGPU::V_CMP_NE_U64_e64;
5993 case AMDGPU::S_BCNT1_I32_B32: return AMDGPU::V_BCNT_U32_B32_e64;
5994 case AMDGPU::S_FF1_I32_B32: return AMDGPU::V_FFBL_B32_e32;
5995 case AMDGPU::S_FLBIT_I32_B32: return AMDGPU::V_FFBH_U32_e32;
5996 case AMDGPU::S_FLBIT_I32: return AMDGPU::V_FFBH_I32_e64;
5997 case AMDGPU::S_CBRANCH_SCC0: return AMDGPU::S_CBRANCH_VCCZ;
5998 case AMDGPU::S_CBRANCH_SCC1: return AMDGPU::S_CBRANCH_VCCNZ;
5999 case AMDGPU::S_CVT_F32_I32: return AMDGPU::V_CVT_F32_I32_e64;
6000 case AMDGPU::S_CVT_F32_U32: return AMDGPU::V_CVT_F32_U32_e64;
6001 case AMDGPU::S_CVT_I32_F32: return AMDGPU::V_CVT_I32_F32_e64;
6002 case AMDGPU::S_CVT_U32_F32: return AMDGPU::V_CVT_U32_F32_e64;
6003 case AMDGPU::S_CVT_F32_F16:
6004 case AMDGPU::S_CVT_HI_F32_F16:
6005 return ST.useRealTrue16Insts() ? AMDGPU::V_CVT_F32_F16_t16_e64
6006 : AMDGPU::V_CVT_F32_F16_fake16_e64;
6007 case AMDGPU::S_CVT_F16_F32:
6008 return ST.useRealTrue16Insts() ? AMDGPU::V_CVT_F16_F32_t16_e64
6009 : AMDGPU::V_CVT_F16_F32_fake16_e64;
6010 case AMDGPU::S_CEIL_F32: return AMDGPU::V_CEIL_F32_e64;
6011 case AMDGPU::S_FLOOR_F32: return AMDGPU::V_FLOOR_F32_e64;
6012 case AMDGPU::S_TRUNC_F32: return AMDGPU::V_TRUNC_F32_e64;
6013 case AMDGPU::S_RNDNE_F32: return AMDGPU::V_RNDNE_F32_e64;
6014 case AMDGPU::S_CEIL_F16:
6015 return ST.useRealTrue16Insts() ? AMDGPU::V_CEIL_F16_t16_e64
6016 : AMDGPU::V_CEIL_F16_fake16_e64;
6017 case AMDGPU::S_FLOOR_F16:
6018 return ST.useRealTrue16Insts() ? AMDGPU::V_FLOOR_F16_t16_e64
6019 : AMDGPU::V_FLOOR_F16_fake16_e64;
6020 case AMDGPU::S_TRUNC_F16:
6021 return ST.useRealTrue16Insts() ? AMDGPU::V_TRUNC_F16_t16_e64
6022 : AMDGPU::V_TRUNC_F16_fake16_e64;
6023 case AMDGPU::S_RNDNE_F16:
6024 return ST.useRealTrue16Insts() ? AMDGPU::V_RNDNE_F16_t16_e64
6025 : AMDGPU::V_RNDNE_F16_fake16_e64;
6026 case AMDGPU::S_ADD_F32: return AMDGPU::V_ADD_F32_e64;
6027 case AMDGPU::S_SUB_F32: return AMDGPU::V_SUB_F32_e64;
6028 case AMDGPU::S_MIN_F32: return AMDGPU::V_MIN_F32_e64;
6029 case AMDGPU::S_MAX_F32: return AMDGPU::V_MAX_F32_e64;
6030 case AMDGPU::S_MINIMUM_F32: return AMDGPU::V_MINIMUM_F32_e64;
6031 case AMDGPU::S_MAXIMUM_F32: return AMDGPU::V_MAXIMUM_F32_e64;
6032 case AMDGPU::S_MUL_F32: return AMDGPU::V_MUL_F32_e64;
6033 case AMDGPU::S_ADD_F16:
6034 return ST.useRealTrue16Insts() ? AMDGPU::V_ADD_F16_t16_e64
6035 : AMDGPU::V_ADD_F16_fake16_e64;
6036 case AMDGPU::S_SUB_F16:
6037 return ST.useRealTrue16Insts() ? AMDGPU::V_SUB_F16_t16_e64
6038 : AMDGPU::V_SUB_F16_fake16_e64;
6039 case AMDGPU::S_MIN_F16:
6040 return ST.useRealTrue16Insts() ? AMDGPU::V_MIN_F16_t16_e64
6041 : AMDGPU::V_MIN_F16_fake16_e64;
6042 case AMDGPU::S_MAX_F16:
6043 return ST.useRealTrue16Insts() ? AMDGPU::V_MAX_F16_t16_e64
6044 : AMDGPU::V_MAX_F16_fake16_e64;
6045 case AMDGPU::S_MINIMUM_F16:
6046 return ST.useRealTrue16Insts() ? AMDGPU::V_MINIMUM_F16_t16_e64
6047 : AMDGPU::V_MINIMUM_F16_fake16_e64;
6048 case AMDGPU::S_MAXIMUM_F16:
6049 return ST.useRealTrue16Insts() ? AMDGPU::V_MAXIMUM_F16_t16_e64
6050 : AMDGPU::V_MAXIMUM_F16_fake16_e64;
6051 case AMDGPU::S_MUL_F16:
6052 return ST.useRealTrue16Insts() ? AMDGPU::V_MUL_F16_t16_e64
6053 : AMDGPU::V_MUL_F16_fake16_e64;
6054 case AMDGPU::S_CVT_PK_RTZ_F16_F32: return AMDGPU::V_CVT_PKRTZ_F16_F32_e64;
6055 case AMDGPU::S_FMAC_F32: return AMDGPU::V_FMAC_F32_e64;
6056 case AMDGPU::S_FMAC_F16:
6057 return ST.useRealTrue16Insts() ? AMDGPU::V_FMAC_F16_t16_e64
6058 : AMDGPU::V_FMAC_F16_fake16_e64;
6059 case AMDGPU::S_FMAMK_F32: return AMDGPU::V_FMAMK_F32;
6060 case AMDGPU::S_FMAAK_F32: return AMDGPU::V_FMAAK_F32;
6061 case AMDGPU::S_CMP_LT_F32: return AMDGPU::V_CMP_LT_F32_e64;
6062 case AMDGPU::S_CMP_EQ_F32: return AMDGPU::V_CMP_EQ_F32_e64;
6063 case AMDGPU::S_CMP_LE_F32: return AMDGPU::V_CMP_LE_F32_e64;
6064 case AMDGPU::S_CMP_GT_F32: return AMDGPU::V_CMP_GT_F32_e64;
6065 case AMDGPU::S_CMP_LG_F32: return AMDGPU::V_CMP_LG_F32_e64;
6066 case AMDGPU::S_CMP_GE_F32: return AMDGPU::V_CMP_GE_F32_e64;
6067 case AMDGPU::S_CMP_O_F32: return AMDGPU::V_CMP_O_F32_e64;
6068 case AMDGPU::S_CMP_U_F32: return AMDGPU::V_CMP_U_F32_e64;
6069 case AMDGPU::S_CMP_NGE_F32: return AMDGPU::V_CMP_NGE_F32_e64;
6070 case AMDGPU::S_CMP_NLG_F32: return AMDGPU::V_CMP_NLG_F32_e64;
6071 case AMDGPU::S_CMP_NGT_F32: return AMDGPU::V_CMP_NGT_F32_e64;
6072 case AMDGPU::S_CMP_NLE_F32: return AMDGPU::V_CMP_NLE_F32_e64;
6073 case AMDGPU::S_CMP_NEQ_F32: return AMDGPU::V_CMP_NEQ_F32_e64;
6074 case AMDGPU::S_CMP_NLT_F32: return AMDGPU::V_CMP_NLT_F32_e64;
6075 case AMDGPU::S_CMP_LT_F16:
6076 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_LT_F16_t16_e64
6077 : AMDGPU::V_CMP_LT_F16_fake16_e64;
6078 case AMDGPU::S_CMP_EQ_F16:
6079 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_EQ_F16_t16_e64
6080 : AMDGPU::V_CMP_EQ_F16_fake16_e64;
6081 case AMDGPU::S_CMP_LE_F16:
6082 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_LE_F16_t16_e64
6083 : AMDGPU::V_CMP_LE_F16_fake16_e64;
6084 case AMDGPU::S_CMP_GT_F16:
6085 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_GT_F16_t16_e64
6086 : AMDGPU::V_CMP_GT_F16_fake16_e64;
6087 case AMDGPU::S_CMP_LG_F16:
6088 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_LG_F16_t16_e64
6089 : AMDGPU::V_CMP_LG_F16_fake16_e64;
6090 case AMDGPU::S_CMP_GE_F16:
6091 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_GE_F16_t16_e64
6092 : AMDGPU::V_CMP_GE_F16_fake16_e64;
6093 case AMDGPU::S_CMP_O_F16:
6094 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_O_F16_t16_e64
6095 : AMDGPU::V_CMP_O_F16_fake16_e64;
6096 case AMDGPU::S_CMP_U_F16:
6097 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_U_F16_t16_e64
6098 : AMDGPU::V_CMP_U_F16_fake16_e64;
6099 case AMDGPU::S_CMP_NGE_F16:
6100 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_NGE_F16_t16_e64
6101 : AMDGPU::V_CMP_NGE_F16_fake16_e64;
6102 case AMDGPU::S_CMP_NLG_F16:
6103 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_NLG_F16_t16_e64
6104 : AMDGPU::V_CMP_NLG_F16_fake16_e64;
6105 case AMDGPU::S_CMP_NGT_F16:
6106 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_NGT_F16_t16_e64
6107 : AMDGPU::V_CMP_NGT_F16_fake16_e64;
6108 case AMDGPU::S_CMP_NLE_F16:
6109 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_NLE_F16_t16_e64
6110 : AMDGPU::V_CMP_NLE_F16_fake16_e64;
6111 case AMDGPU::S_CMP_NEQ_F16:
6112 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_NEQ_F16_t16_e64
6113 : AMDGPU::V_CMP_NEQ_F16_fake16_e64;
6114 case AMDGPU::S_CMP_NLT_F16:
6115 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_NLT_F16_t16_e64
6116 : AMDGPU::V_CMP_NLT_F16_fake16_e64;
6117 case AMDGPU::V_S_EXP_F32_e64: return AMDGPU::V_EXP_F32_e64;
6118 case AMDGPU::V_S_EXP_F16_e64:
6119 return ST.useRealTrue16Insts() ? AMDGPU::V_EXP_F16_t16_e64
6120 : AMDGPU::V_EXP_F16_fake16_e64;
6121 case AMDGPU::V_S_LOG_F32_e64: return AMDGPU::V_LOG_F32_e64;
6122 case AMDGPU::V_S_LOG_F16_e64:
6123 return ST.useRealTrue16Insts() ? AMDGPU::V_LOG_F16_t16_e64
6124 : AMDGPU::V_LOG_F16_fake16_e64;
6125 case AMDGPU::V_S_RCP_F32_e64: return AMDGPU::V_RCP_F32_e64;
6126 case AMDGPU::V_S_RCP_F16_e64:
6127 return ST.useRealTrue16Insts() ? AMDGPU::V_RCP_F16_t16_e64
6128 : AMDGPU::V_RCP_F16_fake16_e64;
6129 case AMDGPU::V_S_RSQ_F32_e64: return AMDGPU::V_RSQ_F32_e64;
6130 case AMDGPU::V_S_RSQ_F16_e64:
6131 return ST.useRealTrue16Insts() ? AMDGPU::V_RSQ_F16_t16_e64
6132 : AMDGPU::V_RSQ_F16_fake16_e64;
6133 case AMDGPU::V_S_SQRT_F32_e64: return AMDGPU::V_SQRT_F32_e64;
6134 case AMDGPU::V_S_SQRT_F16_e64:
6135 return ST.useRealTrue16Insts() ? AMDGPU::V_SQRT_F16_t16_e64
6136 : AMDGPU::V_SQRT_F16_fake16_e64;
6137 }
6138 llvm_unreachable(
6139 "Unexpected scalar opcode without corresponding vector one!");
6140}
6141
6142// clang-format on
6143
6144void SIInstrInfo::insertScratchExecCopy(MachineFunction &MF,
6145 MachineBasicBlock &MBB,
6146 MachineBasicBlock::iterator MBBI,
6147 const DebugLoc &DL, Register Reg,
6148 bool IsSCCLive,
6149 SlotIndexes *Indexes) const {
6150 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
6151 const SIInstrInfo *TII = ST.getInstrInfo();
6152 const AMDGPU::LaneMaskConstants &LMC = AMDGPU::LaneMaskConstants::get(ST);
6153 if (IsSCCLive) {
6154 // Insert two move instructions, one to save the original value of EXEC and
6155 // the other to turn on all bits in EXEC. This is required as we can't use
6156 // the single instruction S_OR_SAVEEXEC that clobbers SCC.
6157 auto StoreExecMI = BuildMI(BB&: MBB, I: MBBI, MIMD: DL, MCID: TII->get(Opcode: LMC.MovOpc), DestReg: Reg)
6158 .addReg(RegNo: LMC.ExecReg, Flags: RegState::Kill);
6159 auto FlipExecMI =
6160 BuildMI(BB&: MBB, I: MBBI, MIMD: DL, MCID: TII->get(Opcode: LMC.MovOpc), DestReg: LMC.ExecReg).addImm(Val: -1);
6161 if (Indexes) {
6162 Indexes->insertMachineInstrInMaps(MI&: *StoreExecMI);
6163 Indexes->insertMachineInstrInMaps(MI&: *FlipExecMI);
6164 }
6165 } else {
6166 auto SaveExec =
6167 BuildMI(BB&: MBB, I: MBBI, MIMD: DL, MCID: TII->get(Opcode: LMC.OrSaveExecOpc), DestReg: Reg).addImm(Val: -1);
6168 SaveExec->getOperand(i: 3).setIsDead(); // Mark SCC as dead.
6169 if (Indexes)
6170 Indexes->insertMachineInstrInMaps(MI&: *SaveExec);
6171 }
6172}
6173
6174void SIInstrInfo::restoreExec(MachineFunction &MF, MachineBasicBlock &MBB,
6175 MachineBasicBlock::iterator MBBI,
6176 const DebugLoc &DL, Register Reg,
6177 SlotIndexes *Indexes) const {
6178 const AMDGPU::LaneMaskConstants &LMC = AMDGPU::LaneMaskConstants::get(ST);
6179 auto ExecRestoreMI = BuildMI(BB&: MBB, I: MBBI, MIMD: DL, MCID: get(Opcode: LMC.MovOpc), DestReg: LMC.ExecReg)
6180 .addReg(RegNo: Reg, Flags: RegState::Kill);
6181 if (Indexes)
6182 Indexes->insertMachineInstrInMaps(MI&: *ExecRestoreMI);
6183}
6184
6185MachineInstr *
6186SIInstrInfo::getWholeWaveFunctionSetup(MachineFunction &MF) const {
6187 assert(MF.getInfo<SIMachineFunctionInfo>()->isWholeWaveFunction() &&
6188 "Not a whole wave func");
6189 MachineBasicBlock &MBB = *MF.begin();
6190 for (MachineInstr &MI : MBB)
6191 if (MI.getOpcode() == AMDGPU::SI_WHOLE_WAVE_FUNC_SETUP ||
6192 MI.getOpcode() == AMDGPU::G_AMDGPU_WHOLE_WAVE_FUNC_SETUP)
6193 return &MI;
6194
6195 llvm_unreachable("Couldn't find SI_SETUP_WHOLE_WAVE_FUNC instruction");
6196}
6197
6198const TargetRegisterClass *SIInstrInfo::getOpRegClass(const MachineInstr &MI,
6199 unsigned OpNo) const {
6200 const MCInstrDesc &Desc = get(Opcode: MI.getOpcode());
6201 if (MI.isVariadic() || OpNo >= Desc.getNumOperands() ||
6202 Desc.operands()[OpNo].RegClass == -1) {
6203 Register Reg = MI.getOperand(i: OpNo).getReg();
6204
6205 if (Reg.isVirtual()) {
6206 const MachineRegisterInfo &MRI = MI.getMF()->getRegInfo();
6207 return MRI.getRegClass(Reg);
6208 }
6209 return RI.getPhysRegBaseClass(Reg);
6210 }
6211
6212 int16_t RegClass = getOpRegClassID(OpInfo: Desc.operands()[OpNo]);
6213 return RegClass < 0 ? nullptr : RI.getRegClass(i: RegClass);
6214}
6215
6216void SIInstrInfo::legalizeOpWithMove(MachineInstr &MI, unsigned OpIdx) const {
6217 MachineBasicBlock::iterator I = MI;
6218 MachineBasicBlock *MBB = MI.getParent();
6219 MachineOperand &MO = MI.getOperand(i: OpIdx);
6220 MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
6221 unsigned RCID = getOpRegClassID(OpInfo: get(Opcode: MI.getOpcode()).operands()[OpIdx]);
6222 const TargetRegisterClass *RC = RI.getRegClass(i: RCID);
6223 unsigned Size = RI.getRegSizeInBits(RC: *RC);
6224 unsigned Opcode = (Size == 64) ? AMDGPU::V_MOV_B64_PSEUDO
6225 : Size == 16 ? AMDGPU::V_MOV_B16_t16_e64
6226 : AMDGPU::V_MOV_B32_e32;
6227 if (MO.isReg())
6228 Opcode = AMDGPU::COPY;
6229 else if (RI.isSGPRClass(RC))
6230 Opcode = (Size == 64) ? AMDGPU::S_MOV_B64 : AMDGPU::S_MOV_B32;
6231
6232 const TargetRegisterClass *VRC = RI.getEquivalentVGPRClass(SRC: RC);
6233 Register Reg = MRI.createVirtualRegister(RegClass: VRC);
6234 DebugLoc DL = MBB->findDebugLoc(MBBI: I);
6235 BuildMI(BB&: *MI.getParent(), I, MIMD: DL, MCID: get(Opcode), DestReg: Reg).add(MO);
6236 MO.ChangeToRegister(Reg, isDef: false);
6237}
6238
6239unsigned SIInstrInfo::buildExtractSubReg(
6240 MachineBasicBlock::iterator MI, MachineRegisterInfo &MRI,
6241 const MachineOperand &SuperReg, const TargetRegisterClass *SuperRC,
6242 unsigned SubIdx, const TargetRegisterClass *SubRC) const {
6243 if (!SuperReg.getReg().isVirtual())
6244 return RI.getSubReg(Reg: SuperReg.getReg(), Idx: SubIdx);
6245
6246 MachineBasicBlock *MBB = MI->getParent();
6247 const DebugLoc &DL = MI->getDebugLoc();
6248 Register SubReg = MRI.createVirtualRegister(RegClass: SubRC);
6249
6250 unsigned NewSubIdx = RI.composeSubRegIndices(a: SuperReg.getSubReg(), b: SubIdx);
6251 BuildMI(BB&: *MBB, I: MI, MIMD: DL, MCID: get(Opcode: TargetOpcode::COPY), DestReg: SubReg)
6252 .addReg(RegNo: SuperReg.getReg(), Flags: {}, SubReg: NewSubIdx);
6253 return SubReg;
6254}
6255
6256MachineOperand SIInstrInfo::buildExtractSubRegOrImm(
6257 MachineBasicBlock::iterator MII, MachineRegisterInfo &MRI,
6258 const MachineOperand &Op, const TargetRegisterClass *SuperRC,
6259 unsigned SubIdx, const TargetRegisterClass *SubRC) const {
6260 if (Op.isImm()) {
6261 if (SubIdx == AMDGPU::sub0)
6262 return MachineOperand::CreateImm(Val: static_cast<int32_t>(Op.getImm()));
6263 if (SubIdx == AMDGPU::sub1)
6264 return MachineOperand::CreateImm(Val: static_cast<int32_t>(Op.getImm() >> 32));
6265
6266 llvm_unreachable("Unhandled register index for immediate");
6267 }
6268
6269 unsigned SubReg = buildExtractSubReg(MI: MII, MRI, SuperReg: Op, SuperRC,
6270 SubIdx, SubRC);
6271 return MachineOperand::CreateReg(Reg: SubReg, isDef: false);
6272}
6273
6274// Change the order of operands from (0, 1, 2) to (0, 2, 1)
6275void SIInstrInfo::swapOperands(MachineInstr &Inst) const {
6276 assert(Inst.getNumExplicitOperands() == 3);
6277 MachineOperand Op1 = Inst.getOperand(i: 1);
6278 Inst.removeOperand(OpNo: 1);
6279 Inst.addOperand(Op: Op1);
6280}
6281
6282bool SIInstrInfo::isLegalRegOperand(const MachineRegisterInfo &MRI,
6283 const MCOperandInfo &OpInfo,
6284 const MachineOperand &MO) const {
6285 if (!MO.isReg())
6286 return false;
6287
6288 Register Reg = MO.getReg();
6289
6290 const TargetRegisterClass *DRC = RI.getRegClass(i: getOpRegClassID(OpInfo));
6291 if (Reg.isPhysical())
6292 return DRC->contains(Reg);
6293
6294 const TargetRegisterClass *RC = MRI.getRegClass(Reg);
6295
6296 if (MO.getSubReg()) {
6297 const MachineFunction *MF = MO.getParent()->getMF();
6298 const TargetRegisterClass *SuperRC = RI.getLargestLegalSuperClass(RC, MF: *MF);
6299 if (!SuperRC)
6300 return false;
6301 return RI.getMatchingSuperRegClass(A: SuperRC, B: DRC, Idx: MO.getSubReg()) != nullptr;
6302 }
6303
6304 return RI.getCommonSubClass(A: DRC, B: RC) != nullptr;
6305}
6306
6307bool SIInstrInfo::isLegalRegOperand(const MachineInstr &MI, unsigned OpIdx,
6308 const MachineOperand &MO) const {
6309 const MachineRegisterInfo &MRI = MI.getMF()->getRegInfo();
6310 const MCOperandInfo OpInfo = MI.getDesc().operands()[OpIdx];
6311 unsigned Opc = MI.getOpcode();
6312
6313 // See SIInstrInfo::isLegalGFX12PlusPackedMathFP32or64BitOperand for more
6314 // information.
6315 if (AMDGPU::isPackedFP32or64BitInst(Opc: MI.getOpcode()) &&
6316 AMDGPU::isGFX12Plus(STI: ST) && MO.isReg() && RI.isSGPRReg(MRI, Reg: MO.getReg())) {
6317 constexpr AMDGPU::OpName OpNames[] = {
6318 AMDGPU::OpName::src0, AMDGPU::OpName::src1, AMDGPU::OpName::src2};
6319
6320 for (auto [I, OpName] : enumerate(First: OpNames)) {
6321 int SrcIdx = AMDGPU::getNamedOperandIdx(Opcode: MI.getOpcode(), Name: OpNames[I]);
6322 if (static_cast<unsigned>(SrcIdx) == OpIdx &&
6323 !isLegalGFX12PlusPackedMathFP32or64BitOperand(MRI, MI, SrcN: I, MO: &MO))
6324 return false;
6325 }
6326 }
6327
6328 if (!isLegalRegOperand(MRI, OpInfo, MO))
6329 return false;
6330
6331 // check Accumulate GPR operand
6332 bool IsAGPR = RI.isAGPR(MRI, Reg: MO.getReg());
6333 if (IsAGPR && !ST.hasMAIInsts())
6334 return false;
6335 if (IsAGPR && (!ST.hasGFX90AInsts() || !MRI.reservedRegsFrozen()) &&
6336 (MI.mayLoad() || MI.mayStore() || isDS(Opcode: Opc) || isMIMG(Opcode: Opc)))
6337 return false;
6338 // Atomics should have both vdst and vdata either vgpr or agpr.
6339 const int VDstIdx = AMDGPU::getNamedOperandIdx(Opcode: Opc, Name: AMDGPU::OpName::vdst);
6340 const int DataIdx = AMDGPU::getNamedOperandIdx(
6341 Opcode: Opc, Name: isDS(Opcode: Opc) ? AMDGPU::OpName::data0 : AMDGPU::OpName::vdata);
6342 if ((int)OpIdx == VDstIdx && DataIdx != -1 &&
6343 MI.getOperand(i: DataIdx).isReg() &&
6344 RI.isAGPR(MRI, Reg: MI.getOperand(i: DataIdx).getReg()) != IsAGPR)
6345 return false;
6346 if ((int)OpIdx == DataIdx) {
6347 if (VDstIdx != -1 &&
6348 RI.isAGPR(MRI, Reg: MI.getOperand(i: VDstIdx).getReg()) != IsAGPR)
6349 return false;
6350 // DS instructions with 2 src operands also must have tied RC.
6351 const int Data1Idx = AMDGPU::getNamedOperandIdx(Opcode: Opc, Name: AMDGPU::OpName::data1);
6352 if (Data1Idx != -1 && MI.getOperand(i: Data1Idx).isReg() &&
6353 RI.isAGPR(MRI, Reg: MI.getOperand(i: Data1Idx).getReg()) != IsAGPR)
6354 return false;
6355 }
6356
6357 // Check V_ACCVGPR_WRITE_B32_e64
6358 if (Opc == AMDGPU::V_ACCVGPR_WRITE_B32_e64 && !ST.hasGFX90AInsts() &&
6359 (int)OpIdx == AMDGPU::getNamedOperandIdx(Opcode: Opc, Name: AMDGPU::OpName::src0) &&
6360 RI.isSGPRReg(MRI, Reg: MO.getReg()))
6361 return false;
6362
6363 if (ST.hasFlatScratchHiInB64InstHazard() &&
6364 MO.getReg() == AMDGPU::SRC_FLAT_SCRATCH_BASE_HI && isSALU(MI)) {
6365 if (const MachineOperand *Dst = getNamedOperand(MI, OperandName: AMDGPU::OpName::sdst)) {
6366 if (AMDGPU::getRegBitWidth(RC: *RI.getRegClassForReg(MRI, Reg: Dst->getReg())) ==
6367 64)
6368 return false;
6369 }
6370 if (Opc == AMDGPU::S_BITCMP0_B64 || Opc == AMDGPU::S_BITCMP1_B64)
6371 return false;
6372 }
6373 if (!ST.hasDPPSrc1SGPR() && isDPP(MI) && RI.isSGPRReg(MRI, Reg: MO.getReg()) &&
6374 (int)OpIdx == AMDGPU::getNamedOperandIdx(Opcode: Opc, Name: AMDGPU::OpName::src1))
6375 return false;
6376
6377 return true;
6378}
6379
6380bool SIInstrInfo::isLegalVSrcOperand(const MachineRegisterInfo &MRI,
6381 const MCOperandInfo &OpInfo,
6382 const MachineOperand &MO) const {
6383 if (MO.isReg())
6384 return isLegalRegOperand(MRI, OpInfo, MO);
6385
6386 // Handle non-register types that are treated like immediates.
6387 assert(MO.isImm() || MO.isTargetIndex() || MO.isFI() || MO.isGlobal());
6388 return true;
6389}
6390
6391bool SIInstrInfo::isLegalGFX12PlusPackedMathFP32or64BitOperand(
6392 const MachineRegisterInfo &MRI, const MachineInstr &MI, unsigned SrcN,
6393 const MachineOperand *MO) const {
6394 constexpr unsigned NumOps = 3;
6395 constexpr AMDGPU::OpName OpNames[NumOps * 2] = {
6396 AMDGPU::OpName::src0, AMDGPU::OpName::src1,
6397 AMDGPU::OpName::src2, AMDGPU::OpName::src0_modifiers,
6398 AMDGPU::OpName::src1_modifiers, AMDGPU::OpName::src2_modifiers};
6399
6400 assert(SrcN < NumOps);
6401
6402 if (!MO) {
6403 int SrcIdx = AMDGPU::getNamedOperandIdx(Opcode: MI.getOpcode(), Name: OpNames[SrcN]);
6404 if (SrcIdx == -1)
6405 return true;
6406 MO = &MI.getOperand(i: SrcIdx);
6407 }
6408
6409 if (!MO->isReg() || !RI.isSGPRReg(MRI, Reg: MO->getReg()))
6410 return true;
6411
6412 int ModsIdx =
6413 AMDGPU::getNamedOperandIdx(Opcode: MI.getOpcode(), Name: OpNames[NumOps + SrcN]);
6414 if (ModsIdx == -1)
6415 return false;
6416
6417 unsigned Mods = MI.getOperand(i: ModsIdx).getImm();
6418 bool OpSel = Mods & SISrcMods::OP_SEL_0;
6419 bool OpSelHi = Mods & SISrcMods::OP_SEL_1;
6420
6421 return !OpSel && !OpSelHi;
6422}
6423
6424bool SIInstrInfo::isOperandLegal(const MachineInstr &MI, unsigned OpIdx,
6425 const MachineOperand *MO) const {
6426 const MachineFunction &MF = *MI.getMF();
6427 const MachineRegisterInfo &MRI = MF.getRegInfo();
6428 const MCInstrDesc &InstDesc = MI.getDesc();
6429 const MCOperandInfo &OpInfo = InstDesc.operands()[OpIdx];
6430 int64_t RegClass = getOpRegClassID(OpInfo);
6431 const TargetRegisterClass *DefinedRC =
6432 RegClass != -1 ? RI.getRegClass(i: RegClass) : nullptr;
6433 if (!MO)
6434 MO = &MI.getOperand(i: OpIdx);
6435
6436 const bool IsInlineConst = !MO->isReg() && isInlineConstant(MO: *MO, OpInfo);
6437
6438 if (isVALU(MI, /*AllowLDSDMA=*/true) && !IsInlineConst &&
6439 usesConstantBus(MRI, MO: *MO, OpInfo)) {
6440 const MachineOperand *UsedLiteral = nullptr;
6441
6442 int ConstantBusLimit = ST.getConstantBusLimit(Opcode: MI.getOpcode());
6443 int LiteralLimit = !isVOP3(MI) || ST.hasVOP3Literal() ? 1 : 0;
6444
6445 // TODO: Be more permissive with frame indexes.
6446 if (!MO->isReg() && !isInlineConstant(MO: *MO, OpInfo)) {
6447 if (!LiteralLimit--)
6448 return false;
6449
6450 UsedLiteral = MO;
6451 }
6452
6453 SmallDenseSet<RegSubRegPair> SGPRsUsed;
6454 if (MO->isReg())
6455 SGPRsUsed.insert(V: RegSubRegPair(MO->getReg(), MO->getSubReg()));
6456
6457 for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
6458 if (i == OpIdx)
6459 continue;
6460 const MachineOperand &Op = MI.getOperand(i);
6461 if (Op.isReg()) {
6462 if (Op.isUse()) {
6463 RegSubRegPair SGPR(Op.getReg(), Op.getSubReg());
6464 if (regUsesConstantBus(RegOp: Op, MRI) && SGPRsUsed.insert(V: SGPR).second) {
6465 if (--ConstantBusLimit <= 0)
6466 return false;
6467 }
6468 }
6469 } else if (AMDGPU::isSISrcOperand(OpInfo: InstDesc.operands()[i]) &&
6470 !isInlineConstant(MO: Op, OpInfo: InstDesc.operands()[i])) {
6471 // The same literal may be used multiple times.
6472 if (!UsedLiteral)
6473 UsedLiteral = &Op;
6474 else if (UsedLiteral->isIdenticalTo(Other: Op))
6475 continue;
6476
6477 if (!LiteralLimit--)
6478 return false;
6479 if (--ConstantBusLimit <= 0)
6480 return false;
6481 }
6482 }
6483 } else if (!IsInlineConst && !MO->isReg() && isSALU(MI)) {
6484 // There can be at most one literal operand, but it can be repeated.
6485 for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
6486 if (i == OpIdx)
6487 continue;
6488 const MachineOperand &Op = MI.getOperand(i);
6489 if (!Op.isReg() && !Op.isFI() && !Op.isRegMask() &&
6490 !isInlineConstant(MO: Op, OpInfo: InstDesc.operands()[i]) &&
6491 !Op.isIdenticalTo(Other: *MO))
6492 return false;
6493
6494 // Do not fold a non-inlineable and non-register operand into an
6495 // instruction that already has a frame index. The frame index handling
6496 // code could not handle well when a frame index co-exists with another
6497 // non-register operand, unless that operand is an inlineable immediate.
6498 if (Op.isFI())
6499 return false;
6500 }
6501 } else if (IsInlineConst && ST.hasNoF16PseudoScalarTransInlineConstants() &&
6502 isF16PseudoScalarTrans(Opcode: MI.getOpcode())) {
6503 return false;
6504 }
6505
6506 if (MO->isReg()) {
6507 if (!DefinedRC)
6508 return OpInfo.OperandType == MCOI::OPERAND_UNKNOWN;
6509 return isLegalRegOperand(MI, OpIdx, MO: *MO);
6510 }
6511
6512 if (MO->isImm()) {
6513 uint64_t Imm = MO->getImm();
6514 bool Is64BitFPOp = OpInfo.OperandType == AMDGPU::OPERAND_REG_IMM_FP64 ||
6515 OpInfo.OperandType == AMDGPU::OPERAND_REG_IMM_V2FP64;
6516 bool Is64BitOp = Is64BitFPOp ||
6517 OpInfo.OperandType == AMDGPU::OPERAND_REG_IMM_INT64 ||
6518 OpInfo.OperandType == AMDGPU::OPERAND_REG_IMM_V2INT32 ||
6519 OpInfo.OperandType == AMDGPU::OPERAND_REG_IMM_V2FP32 ||
6520 OpInfo.OperandType == AMDGPU::OPERAND_REG_IMM_V2INT64;
6521 if (Is64BitOp &&
6522 !AMDGPU::isInlinableLiteral64(Literal: Imm, HasInv2Pi: ST.hasInv2PiInlineImm())) {
6523 if (!AMDGPU::isValid32BitLiteral(Val: Imm, IsFP64: Is64BitFPOp) &&
6524 (!ST.has64BitLiterals() || InstDesc.getSize() != 4))
6525 return false;
6526
6527 // FIXME: We can use sign extended 64-bit literals, but only for signed
6528 // operands. At the moment we do not know if an operand is signed.
6529 // Such operand will be encoded as its low 32 bits and then either
6530 // correctly sign extended or incorrectly zero extended by HW.
6531 // If 64-bit literals are supported and the literal will be encoded
6532 // as full 64 bit we still can use it.
6533 if (!Is64BitFPOp && (int32_t)Imm < 0 &&
6534 (!ST.has64BitLiterals() || AMDGPU::isValid32BitLiteral(Val: Imm, IsFP64: false)))
6535 return false;
6536 }
6537 }
6538
6539 // Handle non-register types that are treated like immediates.
6540 assert(MO->isImm() || MO->isTargetIndex() || MO->isFI() || MO->isGlobal());
6541
6542 if (!DefinedRC) {
6543 // This operand expects an immediate.
6544 return true;
6545 }
6546
6547 return isImmOperandLegal(MI, OpNo: OpIdx, MO: *MO);
6548}
6549
6550bool SIInstrInfo::isNeverCoissue(MachineInstr &MI) const {
6551 bool IsGFX950Only = ST.hasGFX950Insts();
6552 bool IsGFX940Only = ST.hasGFX940Insts();
6553
6554 if (!IsGFX950Only && !IsGFX940Only)
6555 return false;
6556
6557 if (!isVALU(MI, /*AllowLDSDMA=*/true))
6558 return false;
6559
6560 // V_COS, V_EXP, V_RCP, etc.
6561 if (isTRANS(MI))
6562 return true;
6563
6564 // DOT2, DOT2C, DOT4, etc.
6565 if (isDOT(MI))
6566 return true;
6567
6568 // MFMA, SMFMA
6569 if (isMFMA(MI))
6570 return true;
6571
6572 unsigned Opcode = MI.getOpcode();
6573 switch (Opcode) {
6574 case AMDGPU::V_CVT_PK_BF8_F32_e64:
6575 case AMDGPU::V_CVT_PK_FP8_F32_e64:
6576 case AMDGPU::V_MQSAD_PK_U16_U8_e64:
6577 case AMDGPU::V_MQSAD_U32_U8_e64:
6578 case AMDGPU::V_PK_ADD_F16:
6579 case AMDGPU::V_PK_ADD_F32:
6580 case AMDGPU::V_PK_ADD_I16:
6581 case AMDGPU::V_PK_ADD_U16:
6582 case AMDGPU::V_PK_ASHRREV_I16:
6583 case AMDGPU::V_PK_FMA_F16:
6584 case AMDGPU::V_PK_FMA_F32:
6585 case AMDGPU::V_PK_FMAC_F16_e32:
6586 case AMDGPU::V_PK_FMAC_F16_e64:
6587 case AMDGPU::V_PK_LSHLREV_B16:
6588 case AMDGPU::V_PK_LSHRREV_B16:
6589 case AMDGPU::V_PK_MAD_I16:
6590 case AMDGPU::V_PK_MAD_U16:
6591 case AMDGPU::V_PK_MAX_F16:
6592 case AMDGPU::V_PK_MAX_I16:
6593 case AMDGPU::V_PK_MAX_U16:
6594 case AMDGPU::V_PK_MIN_F16:
6595 case AMDGPU::V_PK_MIN_I16:
6596 case AMDGPU::V_PK_MIN_U16:
6597 case AMDGPU::V_PK_MOV_B32:
6598 case AMDGPU::V_PK_MUL_F16:
6599 case AMDGPU::V_PK_MUL_F32:
6600 case AMDGPU::V_PK_MUL_LO_U16:
6601 case AMDGPU::V_PK_SUB_I16:
6602 case AMDGPU::V_PK_SUB_U16:
6603 case AMDGPU::V_QSAD_PK_U16_U8_e64:
6604 return true;
6605 default:
6606 return false;
6607 }
6608}
6609
6610void SIInstrInfo::legalizeOperandsVOP2(MachineRegisterInfo &MRI,
6611 MachineInstr &MI) const {
6612 unsigned Opc = MI.getOpcode();
6613 const MCInstrDesc &InstrDesc = get(Opcode: Opc);
6614
6615 int Src0Idx = AMDGPU::getNamedOperandIdx(Opcode: Opc, Name: AMDGPU::OpName::src0);
6616 MachineOperand &Src0 = MI.getOperand(i: Src0Idx);
6617
6618 int Src1Idx = AMDGPU::getNamedOperandIdx(Opcode: Opc, Name: AMDGPU::OpName::src1);
6619 MachineOperand &Src1 = MI.getOperand(i: Src1Idx);
6620
6621 // If there is an implicit SGPR use such as VCC use for v_addc_u32/v_subb_u32
6622 // we need to only have one constant bus use before GFX10.
6623 bool HasImplicitSGPR = findImplicitSGPRRead(MI);
6624 if (HasImplicitSGPR && ST.getConstantBusLimit(Opcode: Opc) <= 1 && Src0.isReg() &&
6625 RI.isSGPRReg(MRI, Reg: Src0.getReg()))
6626 legalizeOpWithMove(MI, OpIdx: Src0Idx);
6627
6628 // Special case: V_WRITELANE_B32 accepts only immediate or SGPR operands for
6629 // both the value to write (src0) and lane select (src1). Fix up non-SGPR
6630 // src0/src1 with V_READFIRSTLANE.
6631 if (Opc == AMDGPU::V_WRITELANE_B32) {
6632 const DebugLoc &DL = MI.getDebugLoc();
6633 if (Src0.isReg() && RI.isVGPR(MRI, Reg: Src0.getReg())) {
6634 Register Reg = MRI.createVirtualRegister(RegClass: &AMDGPU::SReg_32_XM0RegClass);
6635 BuildMI(BB&: *MI.getParent(), I&: MI, MIMD: DL, MCID: get(Opcode: AMDGPU::V_READFIRSTLANE_B32), DestReg: Reg)
6636 .add(MO: Src0);
6637 Src0.ChangeToRegister(Reg, isDef: false);
6638 }
6639 if (Src1.isReg() && RI.isVGPR(MRI, Reg: Src1.getReg())) {
6640 Register Reg = MRI.createVirtualRegister(RegClass: &AMDGPU::SReg_32_XM0RegClass);
6641 const DebugLoc &DL = MI.getDebugLoc();
6642 BuildMI(BB&: *MI.getParent(), I&: MI, MIMD: DL, MCID: get(Opcode: AMDGPU::V_READFIRSTLANE_B32), DestReg: Reg)
6643 .add(MO: Src1);
6644 Src1.ChangeToRegister(Reg, isDef: false);
6645 }
6646 return;
6647 }
6648
6649 // Special case: V_FMAC_F32 and V_FMAC_F16 have src2.
6650 if (Opc == AMDGPU::V_FMAC_F32_e32 || Opc == AMDGPU::V_FMAC_F16_e32) {
6651 int Src2Idx = AMDGPU::getNamedOperandIdx(Opcode: Opc, Name: AMDGPU::OpName::src2);
6652 if (!RI.isVGPR(MRI, Reg: MI.getOperand(i: Src2Idx).getReg()))
6653 legalizeOpWithMove(MI, OpIdx: Src2Idx);
6654 }
6655
6656 // VOP2 src0 instructions support all operand types, so we don't need to check
6657 // their legality. If src1 is already legal, we don't need to do anything.
6658 if (isLegalRegOperand(MRI, OpInfo: InstrDesc.operands()[Src1Idx], MO: Src1))
6659 return;
6660
6661 // Special case: V_READLANE_B32 accepts only immediate or SGPR operands for
6662 // lane select. Fix up using V_READFIRSTLANE, since we assume that the lane
6663 // select is uniform.
6664 if (Opc == AMDGPU::V_READLANE_B32 && Src1.isReg() &&
6665 RI.isVGPR(MRI, Reg: Src1.getReg())) {
6666 Register Reg = MRI.createVirtualRegister(RegClass: &AMDGPU::SReg_32_XM0RegClass);
6667 const DebugLoc &DL = MI.getDebugLoc();
6668 BuildMI(BB&: *MI.getParent(), I&: MI, MIMD: DL, MCID: get(Opcode: AMDGPU::V_READFIRSTLANE_B32), DestReg: Reg)
6669 .add(MO: Src1);
6670 Src1.ChangeToRegister(Reg, isDef: false);
6671 return;
6672 }
6673
6674 // We do not use commuteInstruction here because it is too aggressive and will
6675 // commute if it is possible. We only want to commute here if it improves
6676 // legality. This can be called a fairly large number of times so don't waste
6677 // compile time pointlessly swapping and checking legality again.
6678 if (HasImplicitSGPR || !MI.isCommutable()) {
6679 legalizeOpWithMove(MI, OpIdx: Src1Idx);
6680 return;
6681 }
6682
6683 // If src0 can be used as src1, commuting will make the operands legal.
6684 // Otherwise we have to give up and insert a move.
6685 //
6686 // TODO: Other immediate-like operand kinds could be commuted if there was a
6687 // MachineOperand::ChangeTo* for them.
6688 if ((!Src1.isImm() && !Src1.isReg()) ||
6689 !isLegalRegOperand(MRI, OpInfo: InstrDesc.operands()[Src1Idx], MO: Src0)) {
6690 legalizeOpWithMove(MI, OpIdx: Src1Idx);
6691 return;
6692 }
6693
6694 int CommutedOpc = commuteOpcode(MI);
6695 if (CommutedOpc == -1) {
6696 legalizeOpWithMove(MI, OpIdx: Src1Idx);
6697 return;
6698 }
6699
6700 MI.setDesc(get(Opcode: CommutedOpc));
6701
6702 Register Src0Reg = Src0.getReg();
6703 unsigned Src0SubReg = Src0.getSubReg();
6704 bool Src0Kill = Src0.isKill();
6705
6706 if (Src1.isImm())
6707 Src0.ChangeToImmediate(ImmVal: Src1.getImm());
6708 else if (Src1.isReg()) {
6709 Src0.ChangeToRegister(Reg: Src1.getReg(), isDef: false, isImp: false, isKill: Src1.isKill());
6710 Src0.setSubReg(Src1.getSubReg());
6711 } else
6712 llvm_unreachable("Should only have register or immediate operands");
6713
6714 Src1.ChangeToRegister(Reg: Src0Reg, isDef: false, isImp: false, isKill: Src0Kill);
6715 Src1.setSubReg(Src0SubReg);
6716 fixImplicitOperands(MI);
6717}
6718
6719// Legalize VOP3 operands. All operand types are supported for any operand
6720// but only one literal constant and only starting from GFX10.
6721void SIInstrInfo::legalizeOperandsVOP3(MachineRegisterInfo &MRI,
6722 MachineInstr &MI) const {
6723 unsigned Opc = MI.getOpcode();
6724
6725 int VOP3Idx[3] = {
6726 AMDGPU::getNamedOperandIdx(Opcode: Opc, Name: AMDGPU::OpName::src0),
6727 AMDGPU::getNamedOperandIdx(Opcode: Opc, Name: AMDGPU::OpName::src1),
6728 AMDGPU::getNamedOperandIdx(Opcode: Opc, Name: AMDGPU::OpName::src2)
6729 };
6730
6731 if (Opc == AMDGPU::V_PERMLANE16_B32_e64 ||
6732 Opc == AMDGPU::V_PERMLANEX16_B32_e64 ||
6733 Opc == AMDGPU::V_PERMLANE_BCAST_B32_e64 ||
6734 Opc == AMDGPU::V_PERMLANE_UP_B32_e64 ||
6735 Opc == AMDGPU::V_PERMLANE_DOWN_B32_e64 ||
6736 Opc == AMDGPU::V_PERMLANE_XOR_B32_e64 ||
6737 Opc == AMDGPU::V_PERMLANE_IDX_GEN_B32_e64) {
6738 // src1 and src2 must be scalar
6739 MachineOperand &Src1 = MI.getOperand(i: VOP3Idx[1]);
6740 const DebugLoc &DL = MI.getDebugLoc();
6741 if (Src1.isReg() && !RI.isSGPRClass(RC: MRI.getRegClass(Reg: Src1.getReg()))) {
6742 Register Reg = MRI.createVirtualRegister(RegClass: &AMDGPU::SReg_32_XM0RegClass);
6743 BuildMI(BB&: *MI.getParent(), I&: MI, MIMD: DL, MCID: get(Opcode: AMDGPU::V_READFIRSTLANE_B32), DestReg: Reg)
6744 .add(MO: Src1);
6745 Src1.ChangeToRegister(Reg, isDef: false);
6746 }
6747 if (VOP3Idx[2] != -1) {
6748 MachineOperand &Src2 = MI.getOperand(i: VOP3Idx[2]);
6749 if (Src2.isReg() && !RI.isSGPRClass(RC: MRI.getRegClass(Reg: Src2.getReg()))) {
6750 Register Reg = MRI.createVirtualRegister(RegClass: &AMDGPU::SReg_32_XM0RegClass);
6751 BuildMI(BB&: *MI.getParent(), I&: MI, MIMD: DL, MCID: get(Opcode: AMDGPU::V_READFIRSTLANE_B32), DestReg: Reg)
6752 .add(MO: Src2);
6753 Src2.ChangeToRegister(Reg, isDef: false);
6754 }
6755 }
6756 }
6757
6758 // Find the one SGPR operand we are allowed to use.
6759 int ConstantBusLimit = ST.getConstantBusLimit(Opcode: Opc);
6760 int LiteralLimit = ST.hasVOP3Literal() ? 1 : 0;
6761 SmallDenseSet<unsigned> SGPRsUsed;
6762 Register SGPRReg = findUsedSGPR(MI, OpIndices: VOP3Idx);
6763 if (SGPRReg) {
6764 SGPRsUsed.insert(V: SGPRReg);
6765 --ConstantBusLimit;
6766 }
6767
6768 for (int Idx : VOP3Idx) {
6769 if (Idx == -1)
6770 break;
6771 MachineOperand &MO = MI.getOperand(i: Idx);
6772
6773 if (!MO.isReg()) {
6774 if (isInlineConstant(MO, OpInfo: get(Opcode: Opc).operands()[Idx]))
6775 continue;
6776
6777 if (LiteralLimit > 0 && ConstantBusLimit > 0) {
6778 --LiteralLimit;
6779 --ConstantBusLimit;
6780 continue;
6781 }
6782
6783 --LiteralLimit;
6784 --ConstantBusLimit;
6785 legalizeOpWithMove(MI, OpIdx: Idx);
6786 continue;
6787 }
6788
6789 if (!RI.isSGPRClass(RC: RI.getRegClassForReg(MRI, Reg: MO.getReg())))
6790 continue; // VGPRs are legal
6791
6792 // We can use one SGPR in each VOP3 instruction prior to GFX10
6793 // and two starting from GFX10.
6794 if (SGPRsUsed.count(V: MO.getReg()))
6795 continue;
6796 if (ConstantBusLimit > 0) {
6797 SGPRsUsed.insert(V: MO.getReg());
6798 --ConstantBusLimit;
6799 continue;
6800 }
6801
6802 // If we make it this far, then the operand is not legal and we must
6803 // legalize it.
6804 legalizeOpWithMove(MI, OpIdx: Idx);
6805 }
6806
6807 // Special case: V_FMAC_F32 and V_FMAC_F16 have src2 tied to vdst.
6808 if ((Opc == AMDGPU::V_FMAC_F32_e64 || Opc == AMDGPU::V_FMAC_F16_e64) &&
6809 !RI.isVGPR(MRI, Reg: MI.getOperand(i: VOP3Idx[2]).getReg()))
6810 legalizeOpWithMove(MI, OpIdx: VOP3Idx[2]);
6811
6812 // Fix the register class of packed FP32 instructions on gfx12+. See
6813 // SIInstrInfo::isLegalGFX12PlusPackedMathFP32or64BitOperand for more
6814 // information.
6815 if (AMDGPU::isPackedFP32or64BitInst(Opc) && AMDGPU::isGFX12Plus(STI: ST)) {
6816 for (unsigned I = 0; I < 3; ++I) {
6817 if (!isLegalGFX12PlusPackedMathFP32or64BitOperand(MRI, MI, /*SrcN=*/I))
6818 legalizeOpWithMove(MI, OpIdx: VOP3Idx[I]);
6819 }
6820 }
6821}
6822
6823Register SIInstrInfo::readlaneVGPRToSGPR(
6824 Register SrcReg, MachineInstr &UseMI, MachineRegisterInfo &MRI,
6825 const TargetRegisterClass *DstRC /*=nullptr*/) const {
6826 const TargetRegisterClass *VRC = MRI.getRegClass(Reg: SrcReg);
6827 const TargetRegisterClass *SRC = RI.getEquivalentSGPRClass(VRC);
6828 if (DstRC)
6829 SRC = RI.getCommonSubClass(A: SRC, B: DstRC);
6830
6831 Register DstReg = MRI.createVirtualRegister(RegClass: SRC);
6832 unsigned SubRegs = RI.getRegSizeInBits(RC: *VRC) / 32;
6833
6834 if (RI.hasAGPRs(RC: VRC)) {
6835 VRC = RI.getEquivalentVGPRClass(SRC: VRC);
6836 Register NewSrcReg = MRI.createVirtualRegister(RegClass: VRC);
6837 BuildMI(BB&: *UseMI.getParent(), I&: UseMI, MIMD: UseMI.getDebugLoc(),
6838 MCID: get(Opcode: TargetOpcode::COPY), DestReg: NewSrcReg)
6839 .addReg(RegNo: SrcReg);
6840 SrcReg = NewSrcReg;
6841 }
6842
6843 if (SubRegs == 1) {
6844 BuildMI(BB&: *UseMI.getParent(), I&: UseMI, MIMD: UseMI.getDebugLoc(),
6845 MCID: get(Opcode: AMDGPU::V_READFIRSTLANE_B32), DestReg: DstReg)
6846 .addReg(RegNo: SrcReg);
6847 return DstReg;
6848 }
6849
6850 SmallVector<Register, 8> SRegs;
6851 for (unsigned i = 0; i < SubRegs; ++i) {
6852 Register SGPR = MRI.createVirtualRegister(RegClass: &AMDGPU::SGPR_32RegClass);
6853 BuildMI(BB&: *UseMI.getParent(), I&: UseMI, MIMD: UseMI.getDebugLoc(),
6854 MCID: get(Opcode: AMDGPU::V_READFIRSTLANE_B32), DestReg: SGPR)
6855 .addReg(RegNo: SrcReg, Flags: {}, SubReg: RI.getSubRegFromChannel(Channel: i));
6856 SRegs.push_back(Elt: SGPR);
6857 }
6858
6859 MachineInstrBuilder MIB =
6860 BuildMI(BB&: *UseMI.getParent(), I&: UseMI, MIMD: UseMI.getDebugLoc(),
6861 MCID: get(Opcode: AMDGPU::REG_SEQUENCE), DestReg: DstReg);
6862 for (unsigned i = 0; i < SubRegs; ++i) {
6863 MIB.addReg(RegNo: SRegs[i]);
6864 MIB.addImm(Val: RI.getSubRegFromChannel(Channel: i));
6865 }
6866 return DstReg;
6867}
6868
6869void SIInstrInfo::legalizeOperandsSMRD(MachineRegisterInfo &MRI,
6870 MachineInstr &MI) const {
6871
6872 // If the pointer is store in VGPRs, then we need to move them to
6873 // SGPRs using v_readfirstlane. This is safe because we only select
6874 // loads with uniform pointers to SMRD instruction so we know the
6875 // pointer value is uniform.
6876 MachineOperand *SBase = getNamedOperand(MI, OperandName: AMDGPU::OpName::sbase);
6877 if (SBase && !RI.isSGPRClass(RC: MRI.getRegClass(Reg: SBase->getReg()))) {
6878 Register SGPR = readlaneVGPRToSGPR(SrcReg: SBase->getReg(), UseMI&: MI, MRI);
6879 SBase->setReg(SGPR);
6880 }
6881 MachineOperand *SOff = getNamedOperand(MI, OperandName: AMDGPU::OpName::soffset);
6882 if (SOff && !RI.isSGPRReg(MRI, Reg: SOff->getReg())) {
6883 Register SGPR = readlaneVGPRToSGPR(SrcReg: SOff->getReg(), UseMI&: MI, MRI);
6884 SOff->setReg(SGPR);
6885 }
6886}
6887
6888bool SIInstrInfo::moveFlatAddrToVGPR(MachineInstr &Inst) const {
6889 unsigned Opc = Inst.getOpcode();
6890 int OldSAddrIdx = AMDGPU::getNamedOperandIdx(Opcode: Opc, Name: AMDGPU::OpName::saddr);
6891 if (OldSAddrIdx < 0)
6892 return false;
6893
6894 assert(isSegmentSpecificFLAT(Inst) || (isFLAT(Inst) && ST.hasFlatGVSMode()));
6895
6896 int NewOpc = AMDGPU::getGlobalVaddrOp(Opcode: Opc);
6897 if (NewOpc < 0)
6898 NewOpc = AMDGPU::getFlatScratchInstSVfromSS(Opcode: Opc);
6899 if (NewOpc < 0)
6900 return false;
6901
6902 MachineRegisterInfo &MRI = Inst.getMF()->getRegInfo();
6903 MachineOperand &SAddr = Inst.getOperand(i: OldSAddrIdx);
6904 if (RI.isSGPRReg(MRI, Reg: SAddr.getReg()))
6905 return false;
6906
6907 int NewVAddrIdx = AMDGPU::getNamedOperandIdx(Opcode: NewOpc, Name: AMDGPU::OpName::vaddr);
6908 if (NewVAddrIdx < 0)
6909 return false;
6910
6911 int OldVAddrIdx = AMDGPU::getNamedOperandIdx(Opcode: Opc, Name: AMDGPU::OpName::vaddr);
6912
6913 // Check vaddr, it shall be zero or absent.
6914 MachineInstr *VAddrDef = nullptr;
6915 if (OldVAddrIdx >= 0) {
6916 MachineOperand &VAddr = Inst.getOperand(i: OldVAddrIdx);
6917 VAddrDef = MRI.getUniqueVRegDef(Reg: VAddr.getReg());
6918 if (!VAddrDef || !VAddrDef->isMoveImmediate() ||
6919 !VAddrDef->getOperand(i: 1).isImm() ||
6920 VAddrDef->getOperand(i: 1).getImm() != 0)
6921 return false;
6922 }
6923
6924 const MCInstrDesc &NewDesc = get(Opcode: NewOpc);
6925 Inst.setDesc(NewDesc);
6926
6927 // Callers expect iterator to be valid after this call, so modify the
6928 // instruction in place.
6929 if (OldVAddrIdx == NewVAddrIdx) {
6930 MachineOperand &NewVAddr = Inst.getOperand(i: NewVAddrIdx);
6931 // Clear use list from the old vaddr holding a zero register.
6932 MRI.removeRegOperandFromUseList(MO: &NewVAddr);
6933 MRI.moveOperands(Dst: &NewVAddr, Src: &SAddr, NumOps: 1);
6934 Inst.removeOperand(OpNo: OldSAddrIdx);
6935 // Update the use list with the pointer we have just moved from vaddr to
6936 // saddr position. Otherwise new vaddr will be missing from the use list.
6937 MRI.removeRegOperandFromUseList(MO: &NewVAddr);
6938 MRI.addRegOperandToUseList(MO: &NewVAddr);
6939 } else {
6940 assert(OldSAddrIdx == NewVAddrIdx);
6941
6942 if (OldVAddrIdx >= 0) {
6943 int NewVDstIn = AMDGPU::getNamedOperandIdx(Opcode: NewOpc,
6944 Name: AMDGPU::OpName::vdst_in);
6945
6946 // removeOperand doesn't try to fixup tied operand indexes at it goes, so
6947 // it asserts. Untie the operands for now and retie them afterwards.
6948 if (NewVDstIn != -1) {
6949 int OldVDstIn = AMDGPU::getNamedOperandIdx(Opcode: Opc, Name: AMDGPU::OpName::vdst_in);
6950 Inst.untieRegOperand(OpIdx: OldVDstIn);
6951 }
6952
6953 Inst.removeOperand(OpNo: OldVAddrIdx);
6954
6955 if (NewVDstIn != -1) {
6956 int NewVDst = AMDGPU::getNamedOperandIdx(Opcode: NewOpc, Name: AMDGPU::OpName::vdst);
6957 Inst.tieOperands(DefIdx: NewVDst, UseIdx: NewVDstIn);
6958 }
6959 }
6960 }
6961
6962 if (VAddrDef && MRI.use_nodbg_empty(RegNo: VAddrDef->getOperand(i: 0).getReg()))
6963 VAddrDef->eraseFromParent();
6964
6965 return true;
6966}
6967
6968// FIXME: Remove this when SelectionDAG is obsoleted.
6969void SIInstrInfo::legalizeOperandsFLAT(MachineRegisterInfo &MRI,
6970 MachineInstr &MI) const {
6971 if (!isSegmentSpecificFLAT(MI) && !ST.hasFlatGVSMode())
6972 return;
6973
6974 // Fixup SGPR operands in VGPRs. We only select these when the DAG divergence
6975 // thinks they are uniform, so a readfirstlane should be valid.
6976 MachineOperand *SAddr = getNamedOperand(MI, OperandName: AMDGPU::OpName::saddr);
6977 if (!SAddr || RI.isSGPRClass(RC: MRI.getRegClass(Reg: SAddr->getReg())))
6978 return;
6979
6980 if (moveFlatAddrToVGPR(Inst&: MI))
6981 return;
6982
6983 const TargetRegisterClass *DeclaredRC =
6984 getRegClass(MCID: MI.getDesc(), OpNum: SAddr->getOperandNo());
6985
6986 Register ToSGPR = readlaneVGPRToSGPR(SrcReg: SAddr->getReg(), UseMI&: MI, MRI, DstRC: DeclaredRC);
6987 SAddr->setReg(ToSGPR);
6988}
6989
6990void SIInstrInfo::legalizeGenericOperand(MachineBasicBlock &InsertMBB,
6991 MachineBasicBlock::iterator I,
6992 const TargetRegisterClass *DstRC,
6993 MachineOperand &Op,
6994 MachineRegisterInfo &MRI,
6995 const DebugLoc &DL) const {
6996 Register OpReg = Op.getReg();
6997 unsigned OpSubReg = Op.getSubReg();
6998
6999 const TargetRegisterClass *OpRC = RI.getSubClassWithSubReg(
7000 RI.getRegClassForReg(MRI, Reg: OpReg), OpSubReg);
7001
7002 // Check if operand is already the correct register class.
7003 if (DstRC == OpRC)
7004 return;
7005
7006 Register DstReg = MRI.createVirtualRegister(RegClass: DstRC);
7007 auto Copy =
7008 BuildMI(BB&: InsertMBB, I, MIMD: DL, MCID: get(Opcode: AMDGPU::COPY), DestReg: DstReg).addReg(RegNo: OpReg);
7009 Op.setReg(DstReg);
7010
7011 MachineInstr *Def = MRI.getVRegDef(Reg: OpReg);
7012 if (!Def)
7013 return;
7014
7015 // Try to eliminate the copy if it is copying an immediate value.
7016 if (Def->isMoveImmediate() && DstRC != &AMDGPU::VReg_1RegClass)
7017 foldImmediate(UseMI&: *Copy, DefMI&: *Def, Reg: OpReg, MRI: &MRI);
7018
7019 bool ImpDef = Def->isImplicitDef();
7020 while (!ImpDef && Def && Def->isCopy()) {
7021 if (Def->getOperand(i: 1).getReg().isPhysical())
7022 break;
7023 Def = MRI.getUniqueVRegDef(Reg: Def->getOperand(i: 1).getReg());
7024 ImpDef = Def && Def->isImplicitDef();
7025 }
7026 if (!RI.isSGPRClass(RC: DstRC) && !Copy->readsRegister(Reg: AMDGPU::EXEC, TRI: &RI) &&
7027 !ImpDef)
7028 Copy.addReg(RegNo: AMDGPU::EXEC, Flags: RegState::Implicit);
7029}
7030
7031// Emit the actual waterfall loop, executing the wrapped instruction for each
7032// unique value of \p ScalarOps across all lanes. In the best case we execute 1
7033// iteration, in the worst case we execute 64 (once per lane).
7034static void emitLoadScalarOpsFromVGPRLoop(
7035 const SIInstrInfo &TII, MachineRegisterInfo &MRI, MachineBasicBlock &PredBB,
7036 MachineBasicBlock &LoopBB, MachineBasicBlock &BodyBB, const DebugLoc &DL,
7037 ArrayRef<MachineOperand *> ScalarOps, ArrayRef<Register> PhySGPRs = {}) {
7038 MachineFunction &MF = *LoopBB.getParent();
7039 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
7040 const SIRegisterInfo *TRI = ST.getRegisterInfo();
7041 const AMDGPU::LaneMaskConstants &LMC = AMDGPU::LaneMaskConstants::get(ST);
7042 const auto *BoolXExecRC = TRI->getWaveMaskRegClass();
7043
7044 // Emit v_cmpx_eq and s_andn2_wrexec when both instructions are
7045 // available. Otherwise, use the previous pattern of v_cmp_eq,
7046 // s_and_saveexec, and s_xor.
7047 bool UseNewExecInstructions =
7048 ST.hasNoSdstCMPX() && TII.pseudoToMCOpcode(Opcode: LMC.AndN2WrExecOpc) != -1;
7049
7050 MachineBasicBlock::iterator I = LoopBB.begin();
7051 Register CondReg;
7052
7053 Register PhiExec;
7054 Register NewExec;
7055
7056 if (UseNewExecInstructions) {
7057 PhiExec = MRI.createVirtualRegister(RegClass: BoolXExecRC);
7058 NewExec = MRI.createVirtualRegister(RegClass: BoolXExecRC);
7059 Register InitExec = MRI.createVirtualRegister(RegClass: BoolXExecRC);
7060 BuildMI(BB&: PredBB, I: PredBB.end(), MIMD: DL, MCID: TII.get(Opcode: LMC.MovOpc), DestReg: InitExec)
7061 .addReg(RegNo: LMC.ExecReg);
7062
7063 BuildMI(BB&: LoopBB, I, MIMD: DL, MCID: TII.get(Opcode: TargetOpcode::PHI), DestReg: PhiExec)
7064 .addReg(RegNo: InitExec)
7065 .addMBB(MBB: &PredBB)
7066 .addReg(RegNo: NewExec)
7067 .addMBB(MBB: &BodyBB);
7068 }
7069
7070 // Placement of v_cmpx instructions (when index is longer than 64 bit)
7071 // involves a trade-off between register pressure and latency:
7072 // (a) Defering all v_cmpx after all v_readfirstlane may increase
7073 // register pressure because arguments and results of all
7074 // v_readfirstlane instructions must stay live until deferred v_cmpx use them.
7075 // (b) Interleaving v_cmpx with v_readfirstlanes may reduce live ranges and
7076 // increase latency by placing v_readfirstlane instructions
7077 // immediately before v_cmpx instruction that directly depend on it.
7078 ///
7079 // Emitting interleaved v_cmpx and v_readfirstlane requires
7080 // block splitting because v_cmpx changes EXEC mask and therefore for safety
7081 // v_cmpx needs to be treated as terminator until after register allocation
7082 // (spill placement) and instruction reordering.
7083 //
7084 // Current implementation defers v_cmpx and leaves other instruction
7085 // scheduling decisions to later passes, where register pressure is known or
7086 // easier to approximate.
7087 // Non-terminators (V_READFIRSTLANE and REG_SEQUENCE) are inserted before I;
7088 // v_cmpx instructions are inserted at the end of LoopBB.
7089 // After the first v_cmpx is emitted, I is updated to point to it
7090 // so subsequent non-terminators are inserted before all v_cmpx instructions.
7091 for (auto [Idx, ScalarOp] : enumerate(First&: ScalarOps)) {
7092 unsigned RegSize = TRI->getRegSizeInBits(Reg: ScalarOp->getReg(), MRI);
7093 unsigned NumSubRegs = RegSize / 32;
7094 Register VScalarOp = ScalarOp->getReg();
7095
7096 const TargetRegisterClass *RFLSrcRC =
7097 TII.getRegClass(MCID: TII.get(Opcode: AMDGPU::V_READFIRSTLANE_B32), OpNum: 1);
7098
7099 if (NumSubRegs == 1) {
7100 const TargetRegisterClass *VScalarOpRC = MRI.getRegClass(Reg: VScalarOp);
7101 if (const TargetRegisterClass *Common =
7102 TRI->getCommonSubClass(A: VScalarOpRC, B: RFLSrcRC);
7103 Common != VScalarOpRC) {
7104 Register VRReg = MRI.createVirtualRegister(RegClass: Common);
7105 BuildMI(BB&: LoopBB, I, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::COPY), DestReg: VRReg).addReg(RegNo: VScalarOp);
7106 VScalarOp = VRReg;
7107 }
7108 Register CurReg = MRI.createVirtualRegister(RegClass: &AMDGPU::SReg_32_XM0RegClass);
7109
7110 BuildMI(BB&: LoopBB, I, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::V_READFIRSTLANE_B32), DestReg: CurReg)
7111 .addReg(RegNo: VScalarOp);
7112
7113 if (UseNewExecInstructions) {
7114 auto CmpxMI = BuildMI(BB&: LoopBB, I: LoopBB.end(), MIMD: DL,
7115 MCID: TII.get(Opcode: AMDGPU::V_CMPX_EQ_U32_nosdst_e32_term))
7116 .addReg(RegNo: CurReg)
7117 .addReg(RegNo: VScalarOp);
7118 if (I == LoopBB.end())
7119 I = CmpxMI.getInstr()->getIterator();
7120 } else {
7121 Register NewCondReg = MRI.createVirtualRegister(RegClass: BoolXExecRC);
7122
7123 BuildMI(BB&: LoopBB, I, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::V_CMP_EQ_U32_e64), DestReg: NewCondReg)
7124 .addReg(RegNo: CurReg)
7125 .addReg(RegNo: VScalarOp);
7126
7127 // Combine the comparison results with AND.
7128 if (!CondReg) { // First.
7129 CondReg = NewCondReg;
7130 } else { // If not the first, we create an AND.
7131 Register AndReg = MRI.createVirtualRegister(RegClass: BoolXExecRC);
7132 BuildMI(BB&: LoopBB, I, MIMD: DL, MCID: TII.get(Opcode: LMC.AndOpc), DestReg: AndReg)
7133 .addReg(RegNo: CondReg)
7134 .addReg(RegNo: NewCondReg);
7135 CondReg = AndReg;
7136 }
7137 }
7138
7139 // Update ScalarOp operand to use the SGPR ScalarOp.
7140 if (PhySGPRs.empty() || !PhySGPRs[Idx].isValid())
7141 ScalarOp->setReg(CurReg);
7142 else {
7143 // Insert into the same block of use
7144 BuildMI(BB&: *ScalarOp->getParent()->getParent(), I: ScalarOp->getParent(), MIMD: DL,
7145 MCID: TII.get(Opcode: AMDGPU::COPY), DestReg: PhySGPRs[Idx])
7146 .addReg(RegNo: CurReg);
7147 ScalarOp->setReg(PhySGPRs[Idx]);
7148 }
7149 ScalarOp->setIsKill();
7150 } else {
7151 SmallVector<Register, 8> ReadlanePieces;
7152 RegState VScalarOpUndef = getUndefRegState(B: ScalarOp->isUndef());
7153 assert(NumSubRegs % 2 == 0 && NumSubRegs <= 32 &&
7154 "Unhandled register size");
7155
7156 for (unsigned Idx = 0; Idx < NumSubRegs; Idx += 2) {
7157 Register CurRegLo =
7158 MRI.createVirtualRegister(RegClass: &AMDGPU::SReg_32_XM0RegClass);
7159 Register CurRegHi =
7160 MRI.createVirtualRegister(RegClass: &AMDGPU::SReg_32_XM0RegClass);
7161
7162 // Read the next variant <- also loop target.
7163 BuildMI(BB&: LoopBB, I, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::V_READFIRSTLANE_B32), DestReg: CurRegLo)
7164 .addReg(RegNo: VScalarOp, Flags: VScalarOpUndef, SubReg: TRI->getSubRegFromChannel(Channel: Idx));
7165
7166 // Read the next variant <- also loop target.
7167 BuildMI(BB&: LoopBB, I, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::V_READFIRSTLANE_B32), DestReg: CurRegHi)
7168 .addReg(RegNo: VScalarOp, Flags: VScalarOpUndef,
7169 SubReg: TRI->getSubRegFromChannel(Channel: Idx + 1));
7170
7171 ReadlanePieces.push_back(Elt: CurRegLo);
7172 ReadlanePieces.push_back(Elt: CurRegHi);
7173
7174 // Comparison is to be done as 64-bit.
7175 Register CurReg = MRI.createVirtualRegister(RegClass: &AMDGPU::SGPR_64RegClass);
7176 BuildMI(BB&: LoopBB, I, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::REG_SEQUENCE), DestReg: CurReg)
7177 .addReg(RegNo: CurRegLo)
7178 .addImm(Val: AMDGPU::sub0)
7179 .addReg(RegNo: CurRegHi)
7180 .addImm(Val: AMDGPU::sub1);
7181
7182 unsigned SubReg =
7183 NumSubRegs <= 2 ? 0 : TRI->getSubRegFromChannel(Channel: Idx, NumRegs: 2);
7184
7185 if (UseNewExecInstructions) {
7186 auto CmpxMI = BuildMI(BB&: LoopBB, I: LoopBB.end(), MIMD: DL,
7187 MCID: TII.get(Opcode: AMDGPU::V_CMPX_EQ_U64_nosdst_e32_term))
7188 .addReg(RegNo: CurReg)
7189 .addReg(RegNo: VScalarOp, Flags: VScalarOpUndef, SubReg);
7190 if (I == LoopBB.end())
7191 I = CmpxMI.getInstr()->getIterator();
7192 } else {
7193 Register NewCondReg = MRI.createVirtualRegister(RegClass: BoolXExecRC);
7194 BuildMI(BB&: LoopBB, I, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::V_CMP_EQ_U64_e64), DestReg: NewCondReg)
7195 .addReg(RegNo: CurReg)
7196 .addReg(RegNo: VScalarOp, Flags: VScalarOpUndef, SubReg);
7197
7198 // Combine the comparison results with AND.
7199 if (!CondReg) { // First.
7200 CondReg = NewCondReg;
7201 } else { // If not the first, we create an AND.
7202 Register AndReg = MRI.createVirtualRegister(RegClass: BoolXExecRC);
7203 BuildMI(BB&: LoopBB, I, MIMD: DL, MCID: TII.get(Opcode: LMC.AndOpc), DestReg: AndReg)
7204 .addReg(RegNo: CondReg)
7205 .addReg(RegNo: NewCondReg);
7206 CondReg = AndReg;
7207 }
7208 }
7209 } // End for loop.
7210
7211 const auto *SScalarOpRC =
7212 TRI->getEquivalentSGPRClass(VRC: MRI.getRegClass(Reg: VScalarOp));
7213 Register SScalarOp = MRI.createVirtualRegister(RegClass: SScalarOpRC);
7214
7215 // Build scalar ScalarOp.
7216 auto Merge =
7217 BuildMI(BB&: LoopBB, I, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::REG_SEQUENCE), DestReg: SScalarOp);
7218 unsigned Channel = 0;
7219 for (Register Piece : ReadlanePieces) {
7220 Merge.addReg(RegNo: Piece).addImm(Val: TRI->getSubRegFromChannel(Channel: Channel++));
7221 }
7222
7223 // Update ScalarOp operand to use the SGPR ScalarOp.
7224 if (PhySGPRs.empty() || !PhySGPRs[Idx].isValid())
7225 ScalarOp->setReg(SScalarOp);
7226 else {
7227 BuildMI(BB&: *ScalarOp->getParent()->getParent(), I: ScalarOp->getParent(), MIMD: DL,
7228 MCID: TII.get(Opcode: AMDGPU::COPY), DestReg: PhySGPRs[Idx])
7229 .addReg(RegNo: SScalarOp);
7230 ScalarOp->setReg(PhySGPRs[Idx]);
7231 }
7232 ScalarOp->setIsKill();
7233 }
7234 }
7235
7236 // Instructions AndSaveExecOpc and AndN2WrExecOpc that modify EXEC mask
7237 // should have isTerminator=1 but terminators that define
7238 // virtual registers are not supported.
7239 Register SaveExec;
7240 if (!UseNewExecInstructions) {
7241 SaveExec = MRI.createVirtualRegister(RegClass: BoolXExecRC);
7242 MRI.setSimpleHint(VReg: SaveExec, PrefReg: CondReg);
7243
7244 // Update EXEC to matching lanes, saving original to SaveExec.
7245 BuildMI(BB&: LoopBB, I, MIMD: DL, MCID: TII.get(Opcode: LMC.AndSaveExecOpc), DestReg: SaveExec)
7246 .addReg(RegNo: CondReg, Flags: RegState::Kill);
7247 }
7248
7249 // The original instruction is here; we insert the terminators after it.
7250 I = BodyBB.end();
7251
7252 if (UseNewExecInstructions) {
7253 MRI.setSimpleHint(VReg: NewExec, PrefReg: PhiExec);
7254 BuildMI(BB&: BodyBB, I, MIMD: DL, MCID: TII.get(Opcode: LMC.AndN2WrExecOpc), DestReg: NewExec)
7255 .addReg(RegNo: PhiExec);
7256 } else {
7257 // Update EXEC, switch all done bits to 0 and all todo bits to 1.
7258 BuildMI(BB&: BodyBB, I, MIMD: DL, MCID: TII.get(Opcode: LMC.XorTermOpc), DestReg: LMC.ExecReg)
7259 .addReg(RegNo: LMC.ExecReg)
7260 .addReg(RegNo: SaveExec);
7261 }
7262
7263 BuildMI(BB&: BodyBB, I, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::SI_WATERFALL_LOOP)).addMBB(MBB: &LoopBB);
7264}
7265
7266// Build a waterfall loop around \p MI, replacing the VGPR \p ScalarOp register
7267// with SGPRs by iterating over all unique values across all lanes.
7268// Returns the loop basic block that now contains \p MI.
7269static MachineBasicBlock *
7270generateWaterFallLoop(const SIInstrInfo &TII, MachineInstr &MI,
7271 ArrayRef<MachineOperand *> ScalarOps,
7272 MachineDominatorTree *MDT,
7273 MachineBasicBlock::iterator Begin = nullptr,
7274 MachineBasicBlock::iterator End = nullptr,
7275 ArrayRef<Register> PhySGPRs = {}) {
7276 assert((PhySGPRs.empty() || PhySGPRs.size() == ScalarOps.size()) &&
7277 "Physical SGPRs must be empty or match the number of scalar operands");
7278 MachineBasicBlock &MBB = *MI.getParent();
7279 MachineFunction &MF = *MBB.getParent();
7280 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
7281 const SIRegisterInfo *TRI = ST.getRegisterInfo();
7282 MachineRegisterInfo &MRI = MF.getRegInfo();
7283 if (!Begin.isValid())
7284 Begin = &MI;
7285 if (!End.isValid()) {
7286 End = &MI;
7287 ++End;
7288 }
7289 const DebugLoc &DL = MI.getDebugLoc();
7290 const AMDGPU::LaneMaskConstants &LMC = AMDGPU::LaneMaskConstants::get(ST);
7291 const auto *BoolXExecRC = TRI->getWaveMaskRegClass();
7292
7293 // Save SCC. Waterfall Loop may overwrite SCC.
7294 Register SaveSCCReg;
7295
7296 // FIXME: We should maintain SCC liveness while doing the FixSGPRCopies walk
7297 // rather than unlimited scan everywhere
7298 bool SCCNotDead =
7299 MBB.computeRegisterLiveness(TRI, Reg: AMDGPU::SCC, Before: MI,
7300 Neighborhood: std::numeric_limits<unsigned>::max()) !=
7301 MachineBasicBlock::LQR_Dead;
7302 if (SCCNotDead) {
7303 SaveSCCReg = MRI.createVirtualRegister(RegClass: &AMDGPU::SReg_32RegClass);
7304 BuildMI(BB&: MBB, I: Begin, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::S_CSELECT_B32), DestReg: SaveSCCReg)
7305 .addImm(Val: 1)
7306 .addImm(Val: 0);
7307 }
7308
7309 Register SaveExec = MRI.createVirtualRegister(RegClass: BoolXExecRC);
7310
7311 // Save the EXEC mask
7312 BuildMI(BB&: MBB, I: Begin, MIMD: DL, MCID: TII.get(Opcode: LMC.MovOpc), DestReg: SaveExec).addReg(RegNo: LMC.ExecReg);
7313
7314 // Killed uses in the instruction we are waterfalling around will be
7315 // incorrect due to the added control-flow.
7316 MachineBasicBlock::iterator AfterMI = MI;
7317 ++AfterMI;
7318 for (auto I = Begin; I != AfterMI; I++) {
7319 for (auto &MO : I->all_uses())
7320 MRI.clearKillFlags(Reg: MO.getReg());
7321 }
7322
7323 // To insert the loop we need to split the block. Move everything after this
7324 // point to a new block, and insert a new empty block between the two.
7325 MachineBasicBlock *LoopBB = MF.CreateMachineBasicBlock();
7326 MachineBasicBlock *BodyBB = MF.CreateMachineBasicBlock();
7327 MachineBasicBlock *RemainderBB = MF.CreateMachineBasicBlock();
7328 MachineFunction::iterator MBBI(MBB);
7329 ++MBBI;
7330
7331 MF.insert(MBBI, MBB: LoopBB);
7332 MF.insert(MBBI, MBB: BodyBB);
7333 MF.insert(MBBI, MBB: RemainderBB);
7334
7335 LoopBB->addSuccessor(Succ: BodyBB);
7336 BodyBB->addSuccessor(Succ: LoopBB);
7337 BodyBB->addSuccessor(Succ: RemainderBB);
7338
7339 // Move Begin to MI to the BodyBB, and the remainder of the block to
7340 // RemainderBB.
7341 RemainderBB->transferSuccessorsAndUpdatePHIs(FromMBB: &MBB);
7342 RemainderBB->splice(Where: RemainderBB->begin(), Other: &MBB, From: End, To: MBB.end());
7343 BodyBB->splice(Where: BodyBB->begin(), Other: &MBB, From: Begin, To: MBB.end());
7344
7345 MBB.addSuccessor(Succ: LoopBB);
7346
7347 // Update dominators. We know that MBB immediately dominates LoopBB, that
7348 // LoopBB immediately dominates BodyBB, and BodyBB immediately dominates
7349 // RemainderBB. RemainderBB immediately dominates all of the successors
7350 // transferred to it from MBB that MBB used to properly dominate.
7351 if (MDT) {
7352 MDT->addNewBlock(BB: LoopBB, DomBB: &MBB);
7353 MDT->addNewBlock(BB: BodyBB, DomBB: LoopBB);
7354 MDT->addNewBlock(BB: RemainderBB, DomBB: BodyBB);
7355 for (auto &Succ : RemainderBB->successors()) {
7356 if (MDT->properlyDominates(A: &MBB, B: Succ)) {
7357 MDT->changeImmediateDominator(BB: Succ, NewBB: RemainderBB);
7358 }
7359 }
7360 }
7361
7362 emitLoadScalarOpsFromVGPRLoop(TII, MRI, PredBB&: MBB, LoopBB&: *LoopBB, BodyBB&: *BodyBB, DL, ScalarOps,
7363 PhySGPRs);
7364
7365 MachineBasicBlock::iterator First = RemainderBB->begin();
7366 // Restore SCC
7367 if (SCCNotDead) {
7368 BuildMI(BB&: *RemainderBB, I: First, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::S_CMP_LG_U32))
7369 .addReg(RegNo: SaveSCCReg, Flags: RegState::Kill)
7370 .addImm(Val: 0);
7371 }
7372
7373 // Restore the EXEC mask
7374 BuildMI(BB&: *RemainderBB, I: First, MIMD: DL, MCID: TII.get(Opcode: LMC.MovOpc), DestReg: LMC.ExecReg)
7375 .addReg(RegNo: SaveExec);
7376 return BodyBB;
7377}
7378
7379// Extract pointer from Rsrc and return a zero-value Rsrc replacement.
7380static std::tuple<unsigned, unsigned>
7381extractRsrcPtr(const SIInstrInfo &TII, MachineInstr &MI, MachineOperand &Rsrc) {
7382 MachineBasicBlock &MBB = *MI.getParent();
7383 MachineFunction &MF = *MBB.getParent();
7384 MachineRegisterInfo &MRI = MF.getRegInfo();
7385
7386 // Extract the ptr from the resource descriptor.
7387 unsigned RsrcPtr =
7388 TII.buildExtractSubReg(MI, MRI, SuperReg: Rsrc, SuperRC: &AMDGPU::VReg_128RegClass,
7389 SubIdx: AMDGPU::sub0_sub1, SubRC: &AMDGPU::VReg_64RegClass);
7390
7391 // Create an empty resource descriptor
7392 Register Zero64 = MRI.createVirtualRegister(RegClass: &AMDGPU::SReg_64RegClass);
7393 Register SRsrcFormatLo = MRI.createVirtualRegister(RegClass: &AMDGPU::SGPR_32RegClass);
7394 Register SRsrcFormatHi = MRI.createVirtualRegister(RegClass: &AMDGPU::SGPR_32RegClass);
7395 Register NewSRsrc = MRI.createVirtualRegister(RegClass: &AMDGPU::SGPR_128RegClass);
7396 uint64_t RsrcDataFormat = TII.getDefaultRsrcDataFormat();
7397
7398 // Zero64 = 0
7399 BuildMI(BB&: MBB, I&: MI, MIMD: MI.getDebugLoc(), MCID: TII.get(Opcode: AMDGPU::S_MOV_B64), DestReg: Zero64)
7400 .addImm(Val: 0);
7401
7402 // SRsrcFormatLo = RSRC_DATA_FORMAT{31-0}
7403 BuildMI(BB&: MBB, I&: MI, MIMD: MI.getDebugLoc(), MCID: TII.get(Opcode: AMDGPU::S_MOV_B32), DestReg: SRsrcFormatLo)
7404 .addImm(Val: Lo_32(Value: RsrcDataFormat));
7405
7406 // SRsrcFormatHi = RSRC_DATA_FORMAT{63-32}
7407 BuildMI(BB&: MBB, I&: MI, MIMD: MI.getDebugLoc(), MCID: TII.get(Opcode: AMDGPU::S_MOV_B32), DestReg: SRsrcFormatHi)
7408 .addImm(Val: Hi_32(Value: RsrcDataFormat));
7409
7410 // NewSRsrc = {Zero64, SRsrcFormat}
7411 BuildMI(BB&: MBB, I&: MI, MIMD: MI.getDebugLoc(), MCID: TII.get(Opcode: AMDGPU::REG_SEQUENCE), DestReg: NewSRsrc)
7412 .addReg(RegNo: Zero64)
7413 .addImm(Val: AMDGPU::sub0_sub1)
7414 .addReg(RegNo: SRsrcFormatLo)
7415 .addImm(Val: AMDGPU::sub2)
7416 .addReg(RegNo: SRsrcFormatHi)
7417 .addImm(Val: AMDGPU::sub3);
7418
7419 return std::tuple(RsrcPtr, NewSRsrc);
7420}
7421
7422MachineBasicBlock *
7423SIInstrInfo::legalizeOperands(MachineInstr &MI,
7424 MachineDominatorTree *MDT) const {
7425 MachineFunction &MF = *MI.getMF();
7426 MachineRegisterInfo &MRI = MF.getRegInfo();
7427 MachineBasicBlock *CreatedBB = nullptr;
7428
7429 // Legalize VOP2
7430 if (isVOP2(MI) || isVOPC(MI)) {
7431 legalizeOperandsVOP2(MRI, MI);
7432 return CreatedBB;
7433 }
7434
7435 // Legalize VOP3
7436 if (isVOP3(MI)) {
7437 legalizeOperandsVOP3(MRI, MI);
7438 return CreatedBB;
7439 }
7440
7441 // Legalize SMRD
7442 if (isSMRD(MI)) {
7443 legalizeOperandsSMRD(MRI, MI);
7444 return CreatedBB;
7445 }
7446
7447 // Legalize FLAT
7448 if (isFLAT(MI)) {
7449 legalizeOperandsFLAT(MRI, MI);
7450 return CreatedBB;
7451 }
7452
7453 // Legalize PHI
7454 // The register class of the operands must be the same type as the register
7455 // class of the output.
7456 if (MI.getOpcode() == AMDGPU::PHI) {
7457 const TargetRegisterClass *VRC = getOpRegClass(MI, OpNo: 0);
7458 assert(!RI.isSGPRClass(VRC));
7459
7460 // Update all the operands so they have the same type.
7461 for (unsigned I = 1, E = MI.getNumOperands(); I != E; I += 2) {
7462 MachineOperand &Op = MI.getOperand(i: I);
7463 if (!Op.isReg() || !Op.getReg().isVirtual())
7464 continue;
7465
7466 // MI is a PHI instruction.
7467 MachineBasicBlock *InsertBB = MI.getOperand(i: I + 1).getMBB();
7468 MachineBasicBlock::iterator Insert = InsertBB->getFirstTerminator();
7469
7470 // Avoid creating no-op copies with the same src and dst reg class. These
7471 // confuse some of the machine passes.
7472 legalizeGenericOperand(InsertMBB&: *InsertBB, I: Insert, DstRC: VRC, Op, MRI, DL: MI.getDebugLoc());
7473 }
7474 }
7475
7476 // REG_SEQUENCE doesn't really require operand legalization, but if one has a
7477 // VGPR dest type and SGPR sources, insert copies so all operands are
7478 // VGPRs. This seems to help operand folding / the register coalescer.
7479 if (MI.getOpcode() == AMDGPU::REG_SEQUENCE) {
7480 MachineBasicBlock *MBB = MI.getParent();
7481 const TargetRegisterClass *DstRC = getOpRegClass(MI, OpNo: 0);
7482 if (RI.hasVGPRs(RC: DstRC)) {
7483 // Update all the operands so they are VGPR register classes. These may
7484 // not be the same register class because REG_SEQUENCE supports mixing
7485 // subregister index types e.g. sub0_sub1 + sub2 + sub3
7486 for (unsigned I = 1, E = MI.getNumOperands(); I != E; I += 2) {
7487 MachineOperand &Op = MI.getOperand(i: I);
7488 if (!Op.isReg() || !Op.getReg().isVirtual())
7489 continue;
7490
7491 const TargetRegisterClass *OpRC = MRI.getRegClass(Reg: Op.getReg());
7492 const TargetRegisterClass *VRC = RI.getEquivalentVGPRClass(SRC: OpRC);
7493 if (VRC == OpRC)
7494 continue;
7495
7496 legalizeGenericOperand(InsertMBB&: *MBB, I: MI, DstRC: VRC, Op, MRI, DL: MI.getDebugLoc());
7497 Op.setIsKill();
7498 }
7499 }
7500
7501 return CreatedBB;
7502 }
7503
7504 // Legalize INSERT_SUBREG
7505 // src0 must have the same register class as dst
7506 if (MI.getOpcode() == AMDGPU::INSERT_SUBREG) {
7507 Register Dst = MI.getOperand(i: 0).getReg();
7508 Register Src0 = MI.getOperand(i: 1).getReg();
7509 const TargetRegisterClass *DstRC = MRI.getRegClass(Reg: Dst);
7510 const TargetRegisterClass *Src0RC = MRI.getRegClass(Reg: Src0);
7511 if (DstRC != Src0RC) {
7512 MachineBasicBlock *MBB = MI.getParent();
7513 MachineOperand &Op = MI.getOperand(i: 1);
7514 legalizeGenericOperand(InsertMBB&: *MBB, I: MI, DstRC, Op, MRI, DL: MI.getDebugLoc());
7515 }
7516 return CreatedBB;
7517 }
7518
7519 // Legalize SI_INIT_M0
7520 if (MI.getOpcode() == AMDGPU::SI_INIT_M0) {
7521 MachineOperand &Src = MI.getOperand(i: 0);
7522 if (Src.isReg() && RI.hasVectorRegisters(RC: MRI.getRegClass(Reg: Src.getReg())))
7523 Src.setReg(readlaneVGPRToSGPR(SrcReg: Src.getReg(), UseMI&: MI, MRI));
7524 return CreatedBB;
7525 }
7526
7527 // Legalize S_BITREPLICATE, S_QUADMASK and S_WQM
7528 if (MI.getOpcode() == AMDGPU::S_BITREPLICATE_B64_B32 ||
7529 MI.getOpcode() == AMDGPU::S_QUADMASK_B32 ||
7530 MI.getOpcode() == AMDGPU::S_QUADMASK_B64 ||
7531 MI.getOpcode() == AMDGPU::S_WQM_B32 ||
7532 MI.getOpcode() == AMDGPU::S_WQM_B64 ||
7533 MI.getOpcode() == AMDGPU::S_INVERSE_BALLOT_U32 ||
7534 MI.getOpcode() == AMDGPU::S_INVERSE_BALLOT_U64) {
7535 MachineOperand &Src = MI.getOperand(i: 1);
7536 if (Src.isReg() && RI.hasVectorRegisters(RC: MRI.getRegClass(Reg: Src.getReg())))
7537 Src.setReg(readlaneVGPRToSGPR(SrcReg: Src.getReg(), UseMI&: MI, MRI));
7538 return CreatedBB;
7539 }
7540
7541 // Legalize MIMG/VIMAGE/VSAMPLE and MUBUF/MTBUF for shaders.
7542 //
7543 // Shaders only generate MUBUF/MTBUF instructions via intrinsics or via
7544 // scratch memory access. In both cases, the legalization never involves
7545 // conversion to the addr64 form.
7546 if (isImage(MI) || (AMDGPU::isGraphics(CC: MF.getFunction().getCallingConv()) &&
7547 (isMUBUF(MI) || isMTBUF(MI)))) {
7548 AMDGPU::OpName RSrcOpName = (isVIMAGE(MI) || isVSAMPLE(MI))
7549 ? AMDGPU::OpName::rsrc
7550 : AMDGPU::OpName::srsrc;
7551 MachineOperand *SRsrc = getNamedOperand(MI, OperandName: RSrcOpName);
7552 if (SRsrc && !RI.isSGPRClass(RC: MRI.getRegClass(Reg: SRsrc->getReg())))
7553 CreatedBB = generateWaterFallLoop(TII: *this, MI, ScalarOps: {SRsrc}, MDT);
7554
7555 AMDGPU::OpName SampOpName =
7556 isMIMG(MI) ? AMDGPU::OpName::ssamp : AMDGPU::OpName::samp;
7557 MachineOperand *SSamp = getNamedOperand(MI, OperandName: SampOpName);
7558 if (SSamp && !RI.isSGPRClass(RC: MRI.getRegClass(Reg: SSamp->getReg())))
7559 CreatedBB = generateWaterFallLoop(TII: *this, MI, ScalarOps: {SSamp}, MDT);
7560
7561 return CreatedBB;
7562 }
7563
7564 // Legalize SI_CALL
7565 if (MI.getOpcode() == AMDGPU::SI_CALL_ISEL) {
7566 MachineOperand *Dest = &MI.getOperand(i: 0);
7567 if (!RI.isSGPRClass(RC: MRI.getRegClass(Reg: Dest->getReg()))) {
7568 createWaterFallForSiCall(MI: &MI, MDT, ScalarOps: {Dest});
7569 }
7570 }
7571
7572 // Legalize s_sleep_var.
7573 if (MI.getOpcode() == AMDGPU::S_SLEEP_VAR) {
7574 const DebugLoc &DL = MI.getDebugLoc();
7575 Register Reg = MRI.createVirtualRegister(RegClass: &AMDGPU::SReg_32_XM0RegClass);
7576 int Src0Idx =
7577 AMDGPU::getNamedOperandIdx(Opcode: MI.getOpcode(), Name: AMDGPU::OpName::src0);
7578 MachineOperand &Src0 = MI.getOperand(i: Src0Idx);
7579 BuildMI(BB&: *MI.getParent(), I&: MI, MIMD: DL, MCID: get(Opcode: AMDGPU::V_READFIRSTLANE_B32), DestReg: Reg)
7580 .add(MO: Src0);
7581 Src0.ChangeToRegister(Reg, isDef: false);
7582 return nullptr;
7583 }
7584
7585 // Legalize TENSOR_LOAD_TO_LDS_d2/_d4, TENSOR_STORE_FROM_LDS_d2/_d4. All their
7586 // operands are scalar.
7587 if (MI.getOpcode() == AMDGPU::TENSOR_LOAD_TO_LDS_d2 ||
7588 MI.getOpcode() == AMDGPU::TENSOR_LOAD_TO_LDS_d4 ||
7589 MI.getOpcode() == AMDGPU::TENSOR_STORE_FROM_LDS_d2 ||
7590 MI.getOpcode() == AMDGPU::TENSOR_STORE_FROM_LDS_d4) {
7591 for (MachineOperand &Src : MI.explicit_operands()) {
7592 if (Src.isReg() && RI.hasVectorRegisters(RC: MRI.getRegClass(Reg: Src.getReg())))
7593 Src.setReg(readlaneVGPRToSGPR(SrcReg: Src.getReg(), UseMI&: MI, MRI));
7594 }
7595 return CreatedBB;
7596 }
7597
7598 // Legalize MUBUF instructions.
7599 bool isSoffsetLegal = true;
7600 int SoffsetIdx =
7601 AMDGPU::getNamedOperandIdx(Opcode: MI.getOpcode(), Name: AMDGPU::OpName::soffset);
7602 if (SoffsetIdx != -1) {
7603 MachineOperand *Soffset = &MI.getOperand(i: SoffsetIdx);
7604 if (Soffset->isReg() && Soffset->getReg().isVirtual() &&
7605 !RI.isSGPRClass(RC: MRI.getRegClass(Reg: Soffset->getReg()))) {
7606 isSoffsetLegal = false;
7607 }
7608 }
7609
7610 bool isRsrcLegal = true;
7611 int RsrcIdx =
7612 AMDGPU::getNamedOperandIdx(Opcode: MI.getOpcode(), Name: AMDGPU::OpName::srsrc);
7613 if (RsrcIdx != -1) {
7614 MachineOperand *Rsrc = &MI.getOperand(i: RsrcIdx);
7615 if (Rsrc->isReg() && !RI.isSGPRReg(MRI, Reg: Rsrc->getReg()))
7616 isRsrcLegal = false;
7617 }
7618
7619 // The operands are legal.
7620 if (isRsrcLegal && isSoffsetLegal)
7621 return CreatedBB;
7622
7623 if (!isRsrcLegal) {
7624 // Legalize a VGPR Rsrc
7625 //
7626 // If the instruction is _ADDR64, we can avoid a waterfall by extracting
7627 // the base pointer from the VGPR Rsrc, adding it to the VAddr, then using
7628 // a zero-value SRsrc.
7629 //
7630 // If the instruction is _OFFSET (both idxen and offen disabled), and we
7631 // support ADDR64 instructions, we can convert to ADDR64 and do the same as
7632 // above.
7633 //
7634 // Otherwise we are on non-ADDR64 hardware, and/or we have
7635 // idxen/offen/bothen and we fall back to a waterfall loop.
7636
7637 MachineOperand *Rsrc = &MI.getOperand(i: RsrcIdx);
7638 MachineBasicBlock &MBB = *MI.getParent();
7639
7640 MachineOperand *VAddr = getNamedOperand(MI, OperandName: AMDGPU::OpName::vaddr);
7641 if (VAddr && AMDGPU::getIfAddr64Inst(Opcode: MI.getOpcode()) != -1) {
7642 // This is already an ADDR64 instruction so we need to add the pointer
7643 // extracted from the resource descriptor to the current value of VAddr.
7644 Register NewVAddrLo = MRI.createVirtualRegister(RegClass: &AMDGPU::VGPR_32RegClass);
7645 Register NewVAddrHi = MRI.createVirtualRegister(RegClass: &AMDGPU::VGPR_32RegClass);
7646 Register NewVAddr = MRI.createVirtualRegister(RegClass: &AMDGPU::VReg_64RegClass);
7647
7648 const auto *BoolXExecRC = RI.getWaveMaskRegClass();
7649 Register CondReg0 = MRI.createVirtualRegister(RegClass: BoolXExecRC);
7650 Register CondReg1 = MRI.createVirtualRegister(RegClass: BoolXExecRC);
7651
7652 unsigned RsrcPtr, NewSRsrc;
7653 std::tie(args&: RsrcPtr, args&: NewSRsrc) = extractRsrcPtr(TII: *this, MI, Rsrc&: *Rsrc);
7654
7655 // NewVaddrLo = RsrcPtr:sub0 + VAddr:sub0
7656 const DebugLoc &DL = MI.getDebugLoc();
7657 BuildMI(BB&: MBB, I&: MI, MIMD: DL, MCID: get(Opcode: AMDGPU::V_ADD_CO_U32_e64), DestReg: NewVAddrLo)
7658 .addDef(RegNo: CondReg0)
7659 .addReg(RegNo: RsrcPtr, Flags: {}, SubReg: AMDGPU::sub0)
7660 .addReg(RegNo: VAddr->getReg(), Flags: {}, SubReg: AMDGPU::sub0)
7661 .addImm(Val: 0);
7662
7663 // NewVaddrHi = RsrcPtr:sub1 + VAddr:sub1
7664 BuildMI(BB&: MBB, I&: MI, MIMD: DL, MCID: get(Opcode: AMDGPU::V_ADDC_U32_e64), DestReg: NewVAddrHi)
7665 .addDef(RegNo: CondReg1, Flags: RegState::Dead)
7666 .addReg(RegNo: RsrcPtr, Flags: {}, SubReg: AMDGPU::sub1)
7667 .addReg(RegNo: VAddr->getReg(), Flags: {}, SubReg: AMDGPU::sub1)
7668 .addReg(RegNo: CondReg0, Flags: RegState::Kill)
7669 .addImm(Val: 0);
7670
7671 // NewVaddr = {NewVaddrHi, NewVaddrLo}
7672 BuildMI(BB&: MBB, I&: MI, MIMD: MI.getDebugLoc(), MCID: get(Opcode: AMDGPU::REG_SEQUENCE), DestReg: NewVAddr)
7673 .addReg(RegNo: NewVAddrLo)
7674 .addImm(Val: AMDGPU::sub0)
7675 .addReg(RegNo: NewVAddrHi)
7676 .addImm(Val: AMDGPU::sub1);
7677
7678 VAddr->setReg(NewVAddr);
7679 Rsrc->setReg(NewSRsrc);
7680 } else if (!VAddr && ST.hasAddr64()) {
7681 // This instructions is the _OFFSET variant, so we need to convert it to
7682 // ADDR64.
7683 assert(ST.getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS &&
7684 "FIXME: Need to emit flat atomics here");
7685
7686 unsigned RsrcPtr, NewSRsrc;
7687 std::tie(args&: RsrcPtr, args&: NewSRsrc) = extractRsrcPtr(TII: *this, MI, Rsrc&: *Rsrc);
7688
7689 Register NewVAddr = MRI.createVirtualRegister(RegClass: &AMDGPU::VReg_64RegClass);
7690 MachineOperand *VData = getNamedOperand(MI, OperandName: AMDGPU::OpName::vdata);
7691 MachineOperand *Offset = getNamedOperand(MI, OperandName: AMDGPU::OpName::offset);
7692 MachineOperand *SOffset = getNamedOperand(MI, OperandName: AMDGPU::OpName::soffset);
7693 unsigned Addr64Opcode = AMDGPU::getAddr64Inst(Opcode: MI.getOpcode());
7694
7695 // Atomics with return have an additional tied operand and are
7696 // missing some of the special bits.
7697 MachineOperand *VDataIn = getNamedOperand(MI, OperandName: AMDGPU::OpName::vdata_in);
7698 MachineInstr *Addr64;
7699
7700 if (!VDataIn) {
7701 // Regular buffer load / store.
7702 MachineInstrBuilder MIB =
7703 BuildMI(BB&: MBB, I&: MI, MIMD: MI.getDebugLoc(), MCID: get(Opcode: Addr64Opcode))
7704 .add(MO: *VData)
7705 .addReg(RegNo: NewVAddr)
7706 .addReg(RegNo: NewSRsrc)
7707 .add(MO: *SOffset)
7708 .add(MO: *Offset);
7709
7710 if (const MachineOperand *CPol =
7711 getNamedOperand(MI, OperandName: AMDGPU::OpName::cpol)) {
7712 MIB.addImm(Val: CPol->getImm());
7713 }
7714
7715 if (const MachineOperand *TFE =
7716 getNamedOperand(MI, OperandName: AMDGPU::OpName::tfe)) {
7717 MIB.addImm(Val: TFE->getImm());
7718 }
7719
7720 MIB.addImm(Val: getNamedImmOperand(MI, OperandName: AMDGPU::OpName::swz));
7721
7722 MIB.cloneMemRefs(OtherMI: MI);
7723 Addr64 = MIB;
7724 } else {
7725 // Atomics with return.
7726 Addr64 = BuildMI(BB&: MBB, I&: MI, MIMD: MI.getDebugLoc(), MCID: get(Opcode: Addr64Opcode))
7727 .add(MO: *VData)
7728 .add(MO: *VDataIn)
7729 .addReg(RegNo: NewVAddr)
7730 .addReg(RegNo: NewSRsrc)
7731 .add(MO: *SOffset)
7732 .add(MO: *Offset)
7733 .addImm(Val: getNamedImmOperand(MI, OperandName: AMDGPU::OpName::cpol))
7734 .cloneMemRefs(OtherMI: MI);
7735 }
7736
7737 MI.removeFromParent();
7738
7739 // NewVaddr = {NewVaddrHi, NewVaddrLo}
7740 BuildMI(BB&: MBB, I: Addr64, MIMD: Addr64->getDebugLoc(), MCID: get(Opcode: AMDGPU::REG_SEQUENCE),
7741 DestReg: NewVAddr)
7742 .addReg(RegNo: RsrcPtr, Flags: {}, SubReg: AMDGPU::sub0)
7743 .addImm(Val: AMDGPU::sub0)
7744 .addReg(RegNo: RsrcPtr, Flags: {}, SubReg: AMDGPU::sub1)
7745 .addImm(Val: AMDGPU::sub1);
7746 } else {
7747 // Legalize a VGPR Rsrc and soffset together.
7748 if (!isSoffsetLegal) {
7749 MachineOperand *Soffset = getNamedOperand(MI, OperandName: AMDGPU::OpName::soffset);
7750 CreatedBB = generateWaterFallLoop(TII: *this, MI, ScalarOps: {Rsrc, Soffset}, MDT);
7751 return CreatedBB;
7752 }
7753 CreatedBB = generateWaterFallLoop(TII: *this, MI, ScalarOps: {Rsrc}, MDT);
7754 return CreatedBB;
7755 }
7756 }
7757
7758 // Legalize a VGPR soffset.
7759 if (!isSoffsetLegal) {
7760 MachineOperand *Soffset = getNamedOperand(MI, OperandName: AMDGPU::OpName::soffset);
7761 CreatedBB = generateWaterFallLoop(TII: *this, MI, ScalarOps: {Soffset}, MDT);
7762 return CreatedBB;
7763 }
7764 return CreatedBB;
7765}
7766
7767void SIInstrWorklist::insert(MachineInstr *MI) {
7768 InstrList.insert(X: MI);
7769 // Add MBUF instructiosn to deferred list.
7770 int RsrcIdx =
7771 AMDGPU::getNamedOperandIdx(Opcode: MI->getOpcode(), Name: AMDGPU::OpName::srsrc);
7772 if (RsrcIdx != -1) {
7773 DeferredList.insert(X: MI);
7774 }
7775}
7776
7777bool SIInstrWorklist::isDeferred(MachineInstr *MI) {
7778 return DeferredList.contains(key: MI);
7779}
7780
7781// Legalize size mismatches between 16bit and 32bit registers in v2s copy
7782// lowering (change sgpr to vgpr).
7783// This is mainly caused by 16bit SALU and 16bit VALU using reg with different
7784// size. Need to legalize the size of the operands during the vgpr lowering
7785// chain. This can be removed after we have sgpr16 in place
7786void SIInstrInfo::legalizeOperandsVALUt16(MachineInstr &MI, unsigned OpIdx,
7787 MachineRegisterInfo &MRI) const {
7788 if (!ST.useRealTrue16Insts())
7789 return;
7790
7791 unsigned Opcode = MI.getOpcode();
7792 MachineBasicBlock *MBB = MI.getParent();
7793 // Legalize operands and check for size mismatch
7794 if (!OpIdx || OpIdx >= MI.getNumExplicitOperands() ||
7795 OpIdx >= get(Opcode).getNumOperands() ||
7796 get(Opcode).operands()[OpIdx].RegClass == -1)
7797 return;
7798
7799 MachineOperand &Op = MI.getOperand(i: OpIdx);
7800 if (!Op.isReg() || !Op.getReg().isVirtual())
7801 return;
7802
7803 const TargetRegisterClass *CurrRC = MRI.getRegClass(Reg: Op.getReg());
7804 if (!RI.isVGPRClass(RC: CurrRC))
7805 return;
7806
7807 int16_t RCID = getOpRegClassID(OpInfo: get(Opcode).operands()[OpIdx]);
7808 const TargetRegisterClass *ExpectedRC = RI.getRegClass(i: RCID);
7809 if (RI.getMatchingSuperRegClass(A: CurrRC, B: ExpectedRC, Idx: AMDGPU::lo16)) {
7810 Op.setSubReg(AMDGPU::lo16);
7811 } else if (RI.getMatchingSuperRegClass(A: ExpectedRC, B: CurrRC, Idx: AMDGPU::lo16)) {
7812 const DebugLoc &DL = MI.getDebugLoc();
7813 Register NewDstReg = MRI.createVirtualRegister(RegClass: &AMDGPU::VGPR_32RegClass);
7814 Register Undef = MRI.createVirtualRegister(RegClass: &AMDGPU::VGPR_16RegClass);
7815 BuildMI(BB&: *MBB, I&: MI, MIMD: DL, MCID: get(Opcode: AMDGPU::IMPLICIT_DEF), DestReg: Undef);
7816 BuildMI(BB&: *MBB, I&: MI, MIMD: DL, MCID: get(Opcode: AMDGPU::REG_SEQUENCE), DestReg: NewDstReg)
7817 .addReg(RegNo: Op.getReg())
7818 .addImm(Val: AMDGPU::lo16)
7819 .addReg(RegNo: Undef)
7820 .addImm(Val: AMDGPU::hi16);
7821 Op.setReg(NewDstReg);
7822 }
7823}
7824void SIInstrInfo::legalizeOperandsVALUt16(MachineInstr &MI,
7825 MachineRegisterInfo &MRI) const {
7826 for (unsigned OpIdx = 1; OpIdx < MI.getNumExplicitOperands(); OpIdx++)
7827 legalizeOperandsVALUt16(MI, OpIdx, MRI);
7828}
7829
7830void SIInstrInfo::createWaterFallForSiCall(MachineInstr *MI,
7831 MachineDominatorTree *MDT,
7832 ArrayRef<MachineOperand *> ScalarOps,
7833 ArrayRef<Register> PhySGPRs) const {
7834 assert(MI->getOpcode() == AMDGPU::SI_CALL_ISEL &&
7835 "This only handle waterfall for SI_CALL_ISEL");
7836 // Move everything between ADJCALLSTACKUP and ADJCALLSTACKDOWN and
7837 // following copies, we also need to move copies from and to physical
7838 // registers into the loop block.
7839 // Also move the copies to physical registers into the loop block
7840 MachineBasicBlock &MBB = *MI->getParent();
7841 MachineBasicBlock::iterator Start(MI);
7842 while (Start->getOpcode() != AMDGPU::ADJCALLSTACKUP)
7843 --Start;
7844 MachineBasicBlock::iterator End(MI);
7845 while (End->getOpcode() != AMDGPU::ADJCALLSTACKDOWN)
7846 ++End;
7847
7848 // Also include following copies of the return value
7849 ++End;
7850 while (End != MBB.end() && End->isCopy() &&
7851 MI->definesRegister(Reg: End->getOperand(i: 1).getReg(), TRI: &RI))
7852 ++End;
7853
7854 generateWaterFallLoop(TII: *this, MI&: *MI, ScalarOps, MDT, Begin: Start, End, PhySGPRs);
7855}
7856
7857void SIInstrInfo::moveToVALU(SIInstrWorklist &Worklist,
7858 MachineDominatorTree *MDT) const {
7859 DenseMap<MachineInstr *, V2PhysSCopyInfo> WaterFalls;
7860 DenseMap<MachineInstr *, bool> V2SPhyCopiesToErase;
7861 while (!Worklist.empty()) {
7862 MachineInstr &Inst = *Worklist.top();
7863 Worklist.erase_top();
7864 // Skip MachineInstr in the deferred list.
7865 if (Worklist.isDeferred(MI: &Inst))
7866 continue;
7867 moveToVALUImpl(Worklist, MDT, Inst, WaterFalls, V2SPhyCopiesToErase);
7868 }
7869
7870 // Deferred list of instructions will be processed once
7871 // all the MachineInstr in the worklist are done.
7872 for (MachineInstr *Inst : Worklist.getDeferredList()) {
7873 moveToVALUImpl(Worklist, MDT, Inst&: *Inst, WaterFalls, V2SPhyCopiesToErase);
7874 assert(Worklist.empty() &&
7875 "Deferred MachineInstr are not supposed to re-populate worklist");
7876 }
7877
7878 for (std::pair<MachineInstr *, V2PhysSCopyInfo> &Entry : WaterFalls) {
7879 if (Entry.first->getOpcode() == AMDGPU::SI_CALL_ISEL)
7880 createWaterFallForSiCall(MI: Entry.first, MDT, ScalarOps: Entry.second.MOs,
7881 PhySGPRs: Entry.second.SGPRs);
7882 }
7883
7884 for (std::pair<MachineInstr *, bool> Entry : V2SPhyCopiesToErase)
7885 if (Entry.second)
7886 Entry.first->eraseFromParent();
7887}
7888void SIInstrInfo::createReadFirstLaneFromCopyToPhysReg(
7889 MachineRegisterInfo &MRI, Register DstReg, MachineInstr &Inst) const {
7890 // If it's a copy of a VGPR to a physical SGPR, insert a V_READFIRSTLANE and
7891 // hope for the best.
7892 const TargetRegisterClass *DstRC = RI.getRegClassForReg(MRI, Reg: DstReg);
7893 ArrayRef<int16_t> SubRegIndices = RI.getRegSplitParts(RC: DstRC, EltSize: 4);
7894 if (SubRegIndices.size() <= 1) {
7895 Register NewDst = MRI.createVirtualRegister(RegClass: &AMDGPU::SReg_32_XM0RegClass);
7896 BuildMI(BB&: *Inst.getParent(), I: &Inst, MIMD: Inst.getDebugLoc(),
7897 MCID: get(Opcode: AMDGPU::V_READFIRSTLANE_B32), DestReg: NewDst)
7898 .add(MO: Inst.getOperand(i: 1));
7899 BuildMI(BB&: *Inst.getParent(), I: &Inst, MIMD: Inst.getDebugLoc(), MCID: get(Opcode: AMDGPU::COPY),
7900 DestReg: DstReg)
7901 .addReg(RegNo: NewDst);
7902 } else {
7903 SmallVector<Register, 8> DstRegs;
7904 for (int16_t Indice : SubRegIndices) {
7905 Register NewDst = MRI.createVirtualRegister(RegClass: &AMDGPU::SReg_32_XM0RegClass);
7906 BuildMI(BB&: *Inst.getParent(), I: &Inst, MIMD: Inst.getDebugLoc(),
7907 MCID: get(Opcode: AMDGPU::V_READFIRSTLANE_B32), DestReg: NewDst)
7908 .addReg(RegNo: Inst.getOperand(i: 1).getReg(), Flags: {}, SubReg: Indice);
7909
7910 DstRegs.push_back(Elt: NewDst);
7911 }
7912 MachineInstrBuilder MIB =
7913 BuildMI(BB&: *Inst.getParent(), I: &Inst, MIMD: Inst.getDebugLoc(),
7914 MCID: get(Opcode: AMDGPU::REG_SEQUENCE), DestReg: DstReg);
7915 for (unsigned i = 0; i < SubRegIndices.size(); ++i) {
7916 MIB.addReg(RegNo: DstRegs[i]);
7917 MIB.addImm(Val: RI.getSubRegFromChannel(Channel: i));
7918 }
7919 }
7920}
7921
7922void SIInstrInfo::handleCopyToPhysHelper(
7923 SIInstrWorklist &Worklist, Register DstReg, MachineInstr &Inst,
7924 MachineRegisterInfo &MRI,
7925 DenseMap<MachineInstr *, V2PhysSCopyInfo> &WaterFalls,
7926 DenseMap<MachineInstr *, bool> &V2SPhyCopiesToErase) const {
7927 if (DstReg == AMDGPU::M0) {
7928 createReadFirstLaneFromCopyToPhysReg(MRI, DstReg, Inst);
7929 V2SPhyCopiesToErase.try_emplace(Key: &Inst, Args: true);
7930 return;
7931 }
7932 Register SrcReg = Inst.getOperand(i: 1).getReg();
7933 MachineBasicBlock::iterator I = Inst.getIterator();
7934 MachineBasicBlock::iterator E = Inst.getParent()->end();
7935 // Only search current block since phyreg's def & use cannot cross
7936 // blocks when MF.NoPhi = false.
7937 while (++I != E) {
7938 // For SI_CALL_ISEL users, replace the phys SGPR with the VGPR source
7939 // and record the operand for later waterfall loop generation.
7940 if (I->getOpcode() == AMDGPU::SI_CALL_ISEL) {
7941 MachineInstr *UseMI = &*I;
7942 for (unsigned i = 0; i < UseMI->getNumOperands(); ++i) {
7943 if (UseMI->getOperand(i).isReg() &&
7944 UseMI->getOperand(i).getReg() == DstReg) {
7945 MachineOperand *MO = &UseMI->getOperand(i);
7946 MO->setReg(SrcReg);
7947 V2PhysSCopyInfo &V2SCopyInfo = WaterFalls[UseMI];
7948 V2SCopyInfo.MOs.push_back(Elt: MO);
7949 V2SCopyInfo.SGPRs.push_back(Elt: DstReg);
7950 V2SPhyCopiesToErase.try_emplace(Key: &Inst, Args: true);
7951 }
7952 }
7953 } else if (I->getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG &&
7954 I->getOperand(i: 0).isReg() &&
7955 I->getOperand(i: 0).getReg() == DstReg) {
7956 createReadFirstLaneFromCopyToPhysReg(MRI, DstReg, Inst);
7957 V2SPhyCopiesToErase.try_emplace(Key: &Inst, Args: true);
7958 } else if (I->readsRegister(Reg: DstReg, TRI: &RI)) {
7959 // COPY cannot be erased if other type of inst uses it.
7960 V2SPhyCopiesToErase[&Inst] = false;
7961 }
7962 if (I->findRegisterDefOperand(Reg: DstReg, TRI: &RI))
7963 break;
7964 }
7965}
7966
7967void SIInstrInfo::moveToVALUImpl(
7968 SIInstrWorklist &Worklist, MachineDominatorTree *MDT, MachineInstr &Inst,
7969 DenseMap<MachineInstr *, V2PhysSCopyInfo> &WaterFalls,
7970 DenseMap<MachineInstr *, bool> &V2SPhyCopiesToErase) const {
7971
7972 MachineBasicBlock *MBB = Inst.getParent();
7973 if (!MBB)
7974 return;
7975 MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
7976 unsigned Opcode = Inst.getOpcode();
7977 unsigned NewOpcode = getVALUOp(MI: Inst);
7978 const DebugLoc &DL = Inst.getDebugLoc();
7979
7980 // Handle some special cases
7981 switch (Opcode) {
7982 default:
7983 break;
7984 case AMDGPU::S_ADD_I32:
7985 case AMDGPU::S_SUB_I32: {
7986 // FIXME: The u32 versions currently selected use the carry.
7987 bool Changed;
7988 MachineBasicBlock *CreatedBBTmp = nullptr;
7989 std::tie(args&: Changed, args&: CreatedBBTmp) = moveScalarAddSub(Worklist, Inst, MDT);
7990 if (Changed)
7991 return;
7992
7993 // Default handling
7994 break;
7995 }
7996
7997 case AMDGPU::S_MUL_U64:
7998 if (ST.hasVMulU64Inst()) {
7999 NewOpcode = AMDGPU::V_MUL_U64_e64;
8000 break;
8001 }
8002 // Split s_mul_u64 in 32-bit vector multiplications.
8003 splitScalarSMulU64(Worklist, Inst, MDT);
8004 Inst.eraseFromParent();
8005 return;
8006
8007 case AMDGPU::S_MUL_U64_U32_PSEUDO:
8008 case AMDGPU::S_MUL_I64_I32_PSEUDO:
8009 // This is a special case of s_mul_u64 where all the operands are either
8010 // zero extended or sign extended.
8011 splitScalarSMulPseudo(Worklist, Inst, MDT);
8012 Inst.eraseFromParent();
8013 return;
8014
8015 case AMDGPU::S_AND_B64:
8016 splitScalar64BitBinaryOp(Worklist, Inst, Opcode: AMDGPU::S_AND_B32, MDT);
8017 Inst.eraseFromParent();
8018 return;
8019
8020 case AMDGPU::S_OR_B64:
8021 splitScalar64BitBinaryOp(Worklist, Inst, Opcode: AMDGPU::S_OR_B32, MDT);
8022 Inst.eraseFromParent();
8023 return;
8024
8025 case AMDGPU::S_XOR_B64:
8026 splitScalar64BitBinaryOp(Worklist, Inst, Opcode: AMDGPU::S_XOR_B32, MDT);
8027 Inst.eraseFromParent();
8028 return;
8029
8030 case AMDGPU::S_NAND_B64:
8031 splitScalar64BitBinaryOp(Worklist, Inst, Opcode: AMDGPU::S_NAND_B32, MDT);
8032 Inst.eraseFromParent();
8033 return;
8034
8035 case AMDGPU::S_NOR_B64:
8036 splitScalar64BitBinaryOp(Worklist, Inst, Opcode: AMDGPU::S_NOR_B32, MDT);
8037 Inst.eraseFromParent();
8038 return;
8039
8040 case AMDGPU::S_XNOR_B64:
8041 if (ST.hasDLInsts())
8042 splitScalar64BitBinaryOp(Worklist, Inst, Opcode: AMDGPU::S_XNOR_B32, MDT);
8043 else
8044 splitScalar64BitXnor(Worklist, Inst, MDT);
8045 Inst.eraseFromParent();
8046 return;
8047
8048 case AMDGPU::S_ANDN2_B64:
8049 splitScalar64BitBinaryOp(Worklist, Inst, Opcode: AMDGPU::S_ANDN2_B32, MDT);
8050 Inst.eraseFromParent();
8051 return;
8052
8053 case AMDGPU::S_ORN2_B64:
8054 splitScalar64BitBinaryOp(Worklist, Inst, Opcode: AMDGPU::S_ORN2_B32, MDT);
8055 Inst.eraseFromParent();
8056 return;
8057
8058 case AMDGPU::S_BREV_B64:
8059 splitScalar64BitUnaryOp(Worklist, Inst, Opcode: AMDGPU::S_BREV_B32, Swap: true);
8060 Inst.eraseFromParent();
8061 return;
8062
8063 case AMDGPU::S_NOT_B64:
8064 splitScalar64BitUnaryOp(Worklist, Inst, Opcode: AMDGPU::S_NOT_B32);
8065 Inst.eraseFromParent();
8066 return;
8067
8068 case AMDGPU::S_BCNT1_I32_B64:
8069 splitScalar64BitBCNT(Worklist, Inst);
8070 Inst.eraseFromParent();
8071 return;
8072
8073 case AMDGPU::S_BFE_I64:
8074 splitScalar64BitBFE(Worklist, Inst);
8075 Inst.eraseFromParent();
8076 return;
8077
8078 case AMDGPU::S_FLBIT_I32_B64:
8079 splitScalar64BitCountOp(Worklist, Inst, Opcode: AMDGPU::V_FFBH_U32_e32);
8080 Inst.eraseFromParent();
8081 return;
8082 case AMDGPU::S_FF1_I32_B64:
8083 splitScalar64BitCountOp(Worklist, Inst, Opcode: AMDGPU::V_FFBL_B32_e32);
8084 Inst.eraseFromParent();
8085 return;
8086
8087 case AMDGPU::S_LSHL_B32:
8088 if (ST.hasOnlyRevVALUShifts()) {
8089 NewOpcode = AMDGPU::V_LSHLREV_B32_e64;
8090 swapOperands(Inst);
8091 }
8092 break;
8093 case AMDGPU::S_ASHR_I32:
8094 if (ST.hasOnlyRevVALUShifts()) {
8095 NewOpcode = AMDGPU::V_ASHRREV_I32_e64;
8096 swapOperands(Inst);
8097 }
8098 break;
8099 case AMDGPU::S_LSHR_B32:
8100 if (ST.hasOnlyRevVALUShifts()) {
8101 NewOpcode = AMDGPU::V_LSHRREV_B32_e64;
8102 swapOperands(Inst);
8103 }
8104 break;
8105 case AMDGPU::S_LSHL_B64:
8106 if (ST.hasOnlyRevVALUShifts()) {
8107 NewOpcode = ST.getGeneration() >= AMDGPUSubtarget::GFX12
8108 ? AMDGPU::V_LSHLREV_B64_pseudo_e64
8109 : AMDGPU::V_LSHLREV_B64_e64;
8110 swapOperands(Inst);
8111 }
8112 break;
8113 case AMDGPU::S_ASHR_I64:
8114 if (ST.hasOnlyRevVALUShifts()) {
8115 NewOpcode = AMDGPU::V_ASHRREV_I64_e64;
8116 swapOperands(Inst);
8117 }
8118 break;
8119 case AMDGPU::S_LSHR_B64:
8120 if (ST.hasOnlyRevVALUShifts()) {
8121 NewOpcode = AMDGPU::V_LSHRREV_B64_e64;
8122 swapOperands(Inst);
8123 }
8124 break;
8125
8126 case AMDGPU::S_ABS_I32:
8127 lowerScalarAbs(Worklist, Inst);
8128 Inst.eraseFromParent();
8129 return;
8130
8131 case AMDGPU::S_ABSDIFF_I32:
8132 lowerScalarAbsDiff(Worklist, Inst);
8133 Inst.eraseFromParent();
8134 return;
8135
8136 case AMDGPU::S_CBRANCH_SCC0:
8137 case AMDGPU::S_CBRANCH_SCC1: {
8138 // Clear unused bits of vcc
8139 Register CondReg = Inst.getOperand(i: 1).getReg();
8140 bool IsSCC = CondReg == AMDGPU::SCC;
8141 const AMDGPU::LaneMaskConstants &LMC = AMDGPU::LaneMaskConstants::get(ST);
8142 BuildMI(BB&: *MBB, I&: Inst, MIMD: Inst.getDebugLoc(), MCID: get(Opcode: LMC.AndOpc), DestReg: LMC.VccReg)
8143 .addReg(RegNo: LMC.ExecReg)
8144 .addReg(RegNo: IsSCC ? LMC.VccReg : CondReg);
8145 Inst.removeOperand(OpNo: 1);
8146 } break;
8147
8148 case AMDGPU::S_BFE_U64:
8149 case AMDGPU::S_BFM_B64:
8150 llvm_unreachable("Moving this op to VALU not implemented");
8151
8152 case AMDGPU::S_PACK_LL_B32_B16:
8153 case AMDGPU::S_PACK_LH_B32_B16:
8154 case AMDGPU::S_PACK_HL_B32_B16:
8155 case AMDGPU::S_PACK_HH_B32_B16:
8156 movePackToVALU(Worklist, MRI, Inst);
8157 Inst.eraseFromParent();
8158 return;
8159
8160 case AMDGPU::S_XNOR_B32:
8161 lowerScalarXnor(Worklist, Inst);
8162 Inst.eraseFromParent();
8163 return;
8164
8165 case AMDGPU::S_NAND_B32:
8166 splitScalarNotBinop(Worklist, Inst, Opcode: AMDGPU::S_AND_B32);
8167 Inst.eraseFromParent();
8168 return;
8169
8170 case AMDGPU::S_NOR_B32:
8171 splitScalarNotBinop(Worklist, Inst, Opcode: AMDGPU::S_OR_B32);
8172 Inst.eraseFromParent();
8173 return;
8174
8175 case AMDGPU::S_ANDN2_B32:
8176 splitScalarBinOpN2(Worklist, Inst, Opcode: AMDGPU::S_AND_B32);
8177 Inst.eraseFromParent();
8178 return;
8179
8180 case AMDGPU::S_ORN2_B32:
8181 splitScalarBinOpN2(Worklist, Inst, Opcode: AMDGPU::S_OR_B32);
8182 Inst.eraseFromParent();
8183 return;
8184
8185 // TODO: remove as soon as everything is ready
8186 // to replace VGPR to SGPR copy with V_READFIRSTLANEs.
8187 // S_ADD/SUB_CO_PSEUDO as well as S_UADDO/USUBO_PSEUDO
8188 // can only be selected from the uniform SDNode.
8189 case AMDGPU::S_ADD_CO_PSEUDO:
8190 case AMDGPU::S_SUB_CO_PSEUDO: {
8191 unsigned Opc = (Inst.getOpcode() == AMDGPU::S_ADD_CO_PSEUDO)
8192 ? AMDGPU::V_ADDC_U32_e64
8193 : AMDGPU::V_SUBB_U32_e64;
8194 const auto *CarryRC = RI.getWaveMaskRegClass();
8195
8196 Register CarryInReg = Inst.getOperand(i: 4).getReg();
8197 if (!MRI.constrainRegClass(Reg: CarryInReg, RC: CarryRC)) {
8198 Register NewCarryReg = MRI.createVirtualRegister(RegClass: CarryRC);
8199 BuildMI(BB&: *MBB, I&: Inst, MIMD: Inst.getDebugLoc(), MCID: get(Opcode: AMDGPU::COPY), DestReg: NewCarryReg)
8200 .addReg(RegNo: CarryInReg);
8201 }
8202
8203 Register CarryOutReg = Inst.getOperand(i: 1).getReg();
8204
8205 Register DestReg = MRI.createVirtualRegister(RegClass: RI.getEquivalentVGPRClass(
8206 SRC: MRI.getRegClass(Reg: Inst.getOperand(i: 0).getReg())));
8207 MachineInstr *CarryOp =
8208 BuildMI(BB&: *MBB, I: &Inst, MIMD: Inst.getDebugLoc(), MCID: get(Opcode: Opc), DestReg)
8209 .addReg(RegNo: CarryOutReg, Flags: RegState::Define)
8210 .add(MO: Inst.getOperand(i: 2))
8211 .add(MO: Inst.getOperand(i: 3))
8212 .addReg(RegNo: CarryInReg)
8213 .addImm(Val: 0);
8214 legalizeOperands(MI&: *CarryOp);
8215 MRI.replaceRegWith(FromReg: Inst.getOperand(i: 0).getReg(), ToReg: DestReg);
8216 addUsersToMoveToVALUWorklist(Reg: DestReg, MRI, Worklist);
8217 Inst.eraseFromParent();
8218 }
8219 return;
8220 case AMDGPU::S_UADDO_PSEUDO:
8221 case AMDGPU::S_USUBO_PSEUDO: {
8222 MachineOperand &Dest0 = Inst.getOperand(i: 0);
8223 MachineOperand &Dest1 = Inst.getOperand(i: 1);
8224 MachineOperand &Src0 = Inst.getOperand(i: 2);
8225 MachineOperand &Src1 = Inst.getOperand(i: 3);
8226
8227 unsigned Opc = (Inst.getOpcode() == AMDGPU::S_UADDO_PSEUDO)
8228 ? AMDGPU::V_ADD_CO_U32_e64
8229 : AMDGPU::V_SUB_CO_U32_e64;
8230 const TargetRegisterClass *NewRC =
8231 RI.getEquivalentVGPRClass(SRC: MRI.getRegClass(Reg: Dest0.getReg()));
8232 Register DestReg = MRI.createVirtualRegister(RegClass: NewRC);
8233 MachineInstr *NewInstr = BuildMI(BB&: *MBB, I: &Inst, MIMD: DL, MCID: get(Opcode: Opc), DestReg)
8234 .addReg(RegNo: Dest1.getReg(), Flags: RegState::Define)
8235 .add(MO: Src0)
8236 .add(MO: Src1)
8237 .addImm(Val: 0); // clamp bit
8238
8239 legalizeOperands(MI&: *NewInstr, MDT);
8240 MRI.replaceRegWith(FromReg: Dest0.getReg(), ToReg: DestReg);
8241 addUsersToMoveToVALUWorklist(Reg: DestReg, MRI, Worklist);
8242 Inst.eraseFromParent();
8243 }
8244 return;
8245 case AMDGPU::S_LSHL1_ADD_U32:
8246 case AMDGPU::S_LSHL2_ADD_U32:
8247 case AMDGPU::S_LSHL3_ADD_U32:
8248 case AMDGPU::S_LSHL4_ADD_U32: {
8249 MachineOperand &Dest = Inst.getOperand(i: 0);
8250 MachineOperand &Src0 = Inst.getOperand(i: 1);
8251 MachineOperand &Src1 = Inst.getOperand(i: 2);
8252 unsigned ShiftAmt = (Opcode == AMDGPU::S_LSHL1_ADD_U32 ? 1
8253 : Opcode == AMDGPU::S_LSHL2_ADD_U32 ? 2
8254 : Opcode == AMDGPU::S_LSHL3_ADD_U32 ? 3
8255 : 4);
8256
8257 const TargetRegisterClass *NewRC =
8258 RI.getEquivalentVGPRClass(SRC: MRI.getRegClass(Reg: Dest.getReg()));
8259 Register DestReg = MRI.createVirtualRegister(RegClass: NewRC);
8260 MachineInstr *NewInstr =
8261 BuildMI(BB&: *MBB, I: &Inst, MIMD: DL, MCID: get(Opcode: AMDGPU::V_LSHL_ADD_U32_e64), DestReg)
8262 .add(MO: Src0)
8263 .addImm(Val: ShiftAmt)
8264 .add(MO: Src1);
8265
8266 legalizeOperands(MI&: *NewInstr, MDT);
8267 MRI.replaceRegWith(FromReg: Dest.getReg(), ToReg: DestReg);
8268 addUsersToMoveToVALUWorklist(Reg: DestReg, MRI, Worklist);
8269 Inst.eraseFromParent();
8270 }
8271 return;
8272 case AMDGPU::S_CSELECT_B32:
8273 case AMDGPU::S_CSELECT_B64:
8274 lowerSelect(Worklist, Inst, MDT);
8275 Inst.eraseFromParent();
8276 return;
8277 case AMDGPU::S_CMP_EQ_I32:
8278 case AMDGPU::S_CMP_LG_I32:
8279 case AMDGPU::S_CMP_GT_I32:
8280 case AMDGPU::S_CMP_GE_I32:
8281 case AMDGPU::S_CMP_LT_I32:
8282 case AMDGPU::S_CMP_LE_I32:
8283 case AMDGPU::S_CMP_EQ_U32:
8284 case AMDGPU::S_CMP_LG_U32:
8285 case AMDGPU::S_CMP_GT_U32:
8286 case AMDGPU::S_CMP_GE_U32:
8287 case AMDGPU::S_CMP_LT_U32:
8288 case AMDGPU::S_CMP_LE_U32:
8289 case AMDGPU::S_CMP_EQ_U64:
8290 case AMDGPU::S_CMP_LG_U64:
8291 case AMDGPU::S_CMP_LT_F32:
8292 case AMDGPU::S_CMP_EQ_F32:
8293 case AMDGPU::S_CMP_LE_F32:
8294 case AMDGPU::S_CMP_GT_F32:
8295 case AMDGPU::S_CMP_LG_F32:
8296 case AMDGPU::S_CMP_GE_F32:
8297 case AMDGPU::S_CMP_O_F32:
8298 case AMDGPU::S_CMP_U_F32:
8299 case AMDGPU::S_CMP_NGE_F32:
8300 case AMDGPU::S_CMP_NLG_F32:
8301 case AMDGPU::S_CMP_NGT_F32:
8302 case AMDGPU::S_CMP_NLE_F32:
8303 case AMDGPU::S_CMP_NEQ_F32:
8304 case AMDGPU::S_CMP_NLT_F32: {
8305 Register CondReg = MRI.createVirtualRegister(RegClass: RI.getWaveMaskRegClass());
8306 auto NewInstr =
8307 BuildMI(BB&: *MBB, I&: Inst, MIMD: Inst.getDebugLoc(), MCID: get(Opcode: NewOpcode), DestReg: CondReg)
8308 .setMIFlags(Inst.getFlags());
8309 if (AMDGPU::getNamedOperandIdx(Opcode: NewOpcode, Name: AMDGPU::OpName::src0_modifiers) >=
8310 0) {
8311 NewInstr
8312 .addImm(Val: 0) // src0_modifiers
8313 .add(MO: Inst.getOperand(i: 0)) // src0
8314 .addImm(Val: 0) // src1_modifiers
8315 .add(MO: Inst.getOperand(i: 1)) // src1
8316 .addImm(Val: 0); // clamp
8317 } else {
8318 NewInstr.add(MO: Inst.getOperand(i: 0)).add(MO: Inst.getOperand(i: 1));
8319 }
8320 legalizeOperands(MI&: *NewInstr, MDT);
8321 int SCCIdx = Inst.findRegisterDefOperandIdx(Reg: AMDGPU::SCC, /*TRI=*/nullptr);
8322 const MachineOperand &SCCOp = Inst.getOperand(i: SCCIdx);
8323 addSCCDefUsersToVALUWorklist(Op: SCCOp, SCCDefInst&: Inst, Worklist, NewCond: CondReg);
8324 Inst.eraseFromParent();
8325 return;
8326 }
8327 case AMDGPU::S_CMP_LT_F16:
8328 case AMDGPU::S_CMP_EQ_F16:
8329 case AMDGPU::S_CMP_LE_F16:
8330 case AMDGPU::S_CMP_GT_F16:
8331 case AMDGPU::S_CMP_LG_F16:
8332 case AMDGPU::S_CMP_GE_F16:
8333 case AMDGPU::S_CMP_O_F16:
8334 case AMDGPU::S_CMP_U_F16:
8335 case AMDGPU::S_CMP_NGE_F16:
8336 case AMDGPU::S_CMP_NLG_F16:
8337 case AMDGPU::S_CMP_NGT_F16:
8338 case AMDGPU::S_CMP_NLE_F16:
8339 case AMDGPU::S_CMP_NEQ_F16:
8340 case AMDGPU::S_CMP_NLT_F16: {
8341 Register CondReg = MRI.createVirtualRegister(RegClass: RI.getWaveMaskRegClass());
8342 auto NewInstr =
8343 BuildMI(BB&: *MBB, I&: Inst, MIMD: Inst.getDebugLoc(), MCID: get(Opcode: NewOpcode), DestReg: CondReg)
8344 .setMIFlags(Inst.getFlags());
8345 if (AMDGPU::hasNamedOperand(Opcode: NewOpcode, NamedIdx: AMDGPU::OpName::src0_modifiers)) {
8346 NewInstr
8347 .addImm(Val: 0) // src0_modifiers
8348 .add(MO: Inst.getOperand(i: 0)) // src0
8349 .addImm(Val: 0) // src1_modifiers
8350 .add(MO: Inst.getOperand(i: 1)) // src1
8351 .addImm(Val: 0); // clamp
8352 if (AMDGPU::hasNamedOperand(Opcode: NewOpcode, NamedIdx: AMDGPU::OpName::op_sel))
8353 NewInstr.addImm(Val: 0); // op_sel0
8354 } else {
8355 NewInstr
8356 .add(MO: Inst.getOperand(i: 0))
8357 .add(MO: Inst.getOperand(i: 1));
8358 }
8359 legalizeOperandsVALUt16(MI&: *NewInstr, MRI);
8360 legalizeOperands(MI&: *NewInstr, MDT);
8361 int SCCIdx = Inst.findRegisterDefOperandIdx(Reg: AMDGPU::SCC, /*TRI=*/nullptr);
8362 const MachineOperand &SCCOp = Inst.getOperand(i: SCCIdx);
8363 addSCCDefUsersToVALUWorklist(Op: SCCOp, SCCDefInst&: Inst, Worklist, NewCond: CondReg);
8364 Inst.eraseFromParent();
8365 return;
8366 }
8367 case AMDGPU::S_CVT_HI_F32_F16: {
8368 Register TmpReg = MRI.createVirtualRegister(RegClass: &AMDGPU::VGPR_32RegClass);
8369 Register NewDst = MRI.createVirtualRegister(RegClass: &AMDGPU::VGPR_32RegClass);
8370 if (ST.useRealTrue16Insts()) {
8371 BuildMI(BB&: *MBB, I&: Inst, MIMD: DL, MCID: get(Opcode: AMDGPU::COPY), DestReg: TmpReg)
8372 .add(MO: Inst.getOperand(i: 1));
8373 BuildMI(BB&: *MBB, I&: Inst, MIMD: DL, MCID: get(Opcode: NewOpcode), DestReg: NewDst)
8374 .addImm(Val: 0) // src0_modifiers
8375 .addReg(RegNo: TmpReg, Flags: {}, SubReg: AMDGPU::hi16)
8376 .addImm(Val: 0) // clamp
8377 .addImm(Val: 0) // omod
8378 .addImm(Val: 0); // op_sel0
8379 } else {
8380 BuildMI(BB&: *MBB, I&: Inst, MIMD: DL, MCID: get(Opcode: AMDGPU::V_LSHRREV_B32_e64), DestReg: TmpReg)
8381 .addImm(Val: 16)
8382 .add(MO: Inst.getOperand(i: 1));
8383 BuildMI(BB&: *MBB, I&: Inst, MIMD: DL, MCID: get(Opcode: NewOpcode), DestReg: NewDst)
8384 .addImm(Val: 0) // src0_modifiers
8385 .addReg(RegNo: TmpReg)
8386 .addImm(Val: 0) // clamp
8387 .addImm(Val: 0); // omod
8388 }
8389
8390 MRI.replaceRegWith(FromReg: Inst.getOperand(i: 0).getReg(), ToReg: NewDst);
8391 addUsersToMoveToVALUWorklist(Reg: NewDst, MRI, Worklist);
8392 Inst.eraseFromParent();
8393 return;
8394 }
8395 case AMDGPU::S_MINIMUM_F32:
8396 case AMDGPU::S_MAXIMUM_F32: {
8397 Register NewDst = MRI.createVirtualRegister(RegClass: &AMDGPU::VGPR_32RegClass);
8398 MachineInstr *NewInstr = BuildMI(BB&: *MBB, I&: Inst, MIMD: DL, MCID: get(Opcode: NewOpcode), DestReg: NewDst)
8399 .addImm(Val: 0) // src0_modifiers
8400 .add(MO: Inst.getOperand(i: 1))
8401 .addImm(Val: 0) // src1_modifiers
8402 .add(MO: Inst.getOperand(i: 2))
8403 .addImm(Val: 0) // clamp
8404 .addImm(Val: 0); // omod
8405 MRI.replaceRegWith(FromReg: Inst.getOperand(i: 0).getReg(), ToReg: NewDst);
8406
8407 legalizeOperands(MI&: *NewInstr, MDT);
8408 addUsersToMoveToVALUWorklist(Reg: NewDst, MRI, Worklist);
8409 Inst.eraseFromParent();
8410 return;
8411 }
8412 case AMDGPU::S_MINIMUM_F16:
8413 case AMDGPU::S_MAXIMUM_F16: {
8414 Register NewDst = MRI.createVirtualRegister(RegClass: ST.useRealTrue16Insts()
8415 ? &AMDGPU::VGPR_16RegClass
8416 : &AMDGPU::VGPR_32RegClass);
8417 MachineInstr *NewInstr = BuildMI(BB&: *MBB, I&: Inst, MIMD: DL, MCID: get(Opcode: NewOpcode), DestReg: NewDst)
8418 .addImm(Val: 0) // src0_modifiers
8419 .add(MO: Inst.getOperand(i: 1))
8420 .addImm(Val: 0) // src1_modifiers
8421 .add(MO: Inst.getOperand(i: 2))
8422 .addImm(Val: 0) // clamp
8423 .addImm(Val: 0) // omod
8424 .addImm(Val: 0); // opsel0
8425 MRI.replaceRegWith(FromReg: Inst.getOperand(i: 0).getReg(), ToReg: NewDst);
8426 legalizeOperandsVALUt16(MI&: *NewInstr, MRI);
8427 legalizeOperands(MI&: *NewInstr, MDT);
8428 addUsersToMoveToVALUWorklist(Reg: NewDst, MRI, Worklist);
8429 Inst.eraseFromParent();
8430 return;
8431 }
8432 case AMDGPU::V_S_EXP_F16_e64:
8433 case AMDGPU::V_S_LOG_F16_e64:
8434 case AMDGPU::V_S_RCP_F16_e64:
8435 case AMDGPU::V_S_RSQ_F16_e64:
8436 case AMDGPU::V_S_SQRT_F16_e64: {
8437 Register NewDst = MRI.createVirtualRegister(RegClass: ST.useRealTrue16Insts()
8438 ? &AMDGPU::VGPR_16RegClass
8439 : &AMDGPU::VGPR_32RegClass);
8440 auto NewInstr = BuildMI(BB&: *MBB, I&: Inst, MIMD: DL, MCID: get(Opcode: NewOpcode), DestReg: NewDst)
8441 .add(MO: Inst.getOperand(i: 1)) // src0_modifiers
8442 .add(MO: Inst.getOperand(i: 2))
8443 .add(MO: Inst.getOperand(i: 3)) // clamp
8444 .add(MO: Inst.getOperand(i: 4)) // omod
8445 .setMIFlags(Inst.getFlags());
8446 if (AMDGPU::hasNamedOperand(Opcode: NewOpcode, NamedIdx: AMDGPU::OpName::op_sel))
8447 NewInstr.addImm(Val: 0); // opsel0
8448 MRI.replaceRegWith(FromReg: Inst.getOperand(i: 0).getReg(), ToReg: NewDst);
8449 legalizeOperandsVALUt16(MI&: *NewInstr, MRI);
8450 legalizeOperands(MI&: *NewInstr, MDT);
8451 addUsersToMoveToVALUWorklist(Reg: NewDst, MRI, Worklist);
8452 Inst.eraseFromParent();
8453 return;
8454 }
8455 }
8456
8457 if (NewOpcode == AMDGPU::INSTRUCTION_LIST_END) {
8458 // We cannot move this instruction to the VALU, so we should try to
8459 // legalize its operands instead.
8460 legalizeOperands(MI&: Inst, MDT);
8461 return;
8462 }
8463 // Handle converting generic instructions like COPY-to-SGPR into
8464 // COPY-to-VGPR.
8465 if (NewOpcode == Opcode) {
8466 Register DstReg = Inst.getOperand(i: 0).getReg();
8467 const TargetRegisterClass *NewDstRC = getDestEquivalentVGPRClass(Inst);
8468
8469 if (Inst.isCopy() && DstReg.isPhysical() &&
8470 Inst.getOperand(i: 1).getReg().isVirtual()) {
8471 handleCopyToPhysHelper(Worklist, DstReg, Inst, MRI, WaterFalls,
8472 V2SPhyCopiesToErase);
8473 return;
8474 }
8475
8476 if (Inst.isCopy() && Inst.getOperand(i: 1).getReg().isVirtual()) {
8477 Register NewDstReg = Inst.getOperand(i: 1).getReg();
8478 const TargetRegisterClass *SrcRC = RI.getRegClassForReg(MRI, Reg: NewDstReg);
8479 if (const TargetRegisterClass *CommonRC =
8480 RI.getCommonSubClass(A: NewDstRC, B: SrcRC)) {
8481 // Instead of creating a copy where src and dst are the same register
8482 // class, we just replace all uses of dst with src. These kinds of
8483 // copies interfere with the heuristics MachineSink uses to decide
8484 // whether or not to split a critical edge. Since the pass assumes
8485 // that copies will end up as machine instructions and not be
8486 // eliminated.
8487 addUsersToMoveToVALUWorklist(Reg: DstReg, MRI, Worklist);
8488 MRI.replaceRegWith(FromReg: DstReg, ToReg: NewDstReg);
8489 MRI.clearKillFlags(Reg: NewDstReg);
8490 Inst.getOperand(i: 0).setReg(DstReg);
8491
8492 if (!MRI.constrainRegClass(Reg: NewDstReg, RC: CommonRC))
8493 llvm_unreachable("failed to constrain register");
8494
8495 Inst.eraseFromParent();
8496
8497 for (MachineOperand &UseMO :
8498 make_early_inc_range(Range: MRI.use_operands(Reg: NewDstReg))) {
8499 MachineInstr &UseMI = *UseMO.getParent();
8500
8501 // Legalize t16 operands since replaceReg is called after
8502 // addUsersToVALU.
8503 legalizeOperandsVALUt16(MI&: UseMI, MRI);
8504
8505 unsigned OpIdx = UseMI.getOperandNo(I: &UseMO);
8506 if (const TargetRegisterClass *OpRC =
8507 getRegClass(MCID: UseMI.getDesc(), OpNum: OpIdx))
8508 MRI.constrainRegClass(Reg: NewDstReg, RC: OpRC);
8509 }
8510
8511 return;
8512 }
8513 }
8514
8515 // If this is a v2s copy between 16bit and 32bit reg,
8516 // replace vgpr copy to reg_sequence/extract_subreg
8517 // This can be remove after we have sgpr16 in place
8518 if (ST.useRealTrue16Insts() && Inst.isCopy() &&
8519 Inst.getOperand(i: 1).getReg().isVirtual() &&
8520 RI.isVGPR(MRI, Reg: Inst.getOperand(i: 1).getReg())) {
8521 const TargetRegisterClass *SrcRegRC = getOpRegClass(MI: Inst, OpNo: 1);
8522 if (RI.getMatchingSuperRegClass(A: NewDstRC, B: SrcRegRC, Idx: AMDGPU::lo16)) {
8523 Register NewDstReg = MRI.createVirtualRegister(RegClass: NewDstRC);
8524 Register Undef = MRI.createVirtualRegister(RegClass: &AMDGPU::VGPR_16RegClass);
8525 BuildMI(BB&: *Inst.getParent(), I: &Inst, MIMD: Inst.getDebugLoc(),
8526 MCID: get(Opcode: AMDGPU::IMPLICIT_DEF), DestReg: Undef);
8527 BuildMI(BB&: *Inst.getParent(), I: &Inst, MIMD: Inst.getDebugLoc(),
8528 MCID: get(Opcode: AMDGPU::REG_SEQUENCE), DestReg: NewDstReg)
8529 .addReg(RegNo: Inst.getOperand(i: 1).getReg())
8530 .addImm(Val: AMDGPU::lo16)
8531 .addReg(RegNo: Undef)
8532 .addImm(Val: AMDGPU::hi16);
8533 Inst.eraseFromParent();
8534 MRI.replaceRegWith(FromReg: DstReg, ToReg: NewDstReg);
8535 addUsersToMoveToVALUWorklist(Reg: NewDstReg, MRI, Worklist);
8536 return;
8537 } else if (RI.getMatchingSuperRegClass(A: SrcRegRC, B: NewDstRC,
8538 Idx: AMDGPU::lo16)) {
8539 Inst.getOperand(i: 1).setSubReg(AMDGPU::lo16);
8540 Register NewDstReg = MRI.createVirtualRegister(RegClass: NewDstRC);
8541 MRI.replaceRegWith(FromReg: DstReg, ToReg: NewDstReg);
8542 addUsersToMoveToVALUWorklist(Reg: NewDstReg, MRI, Worklist);
8543 return;
8544 }
8545 }
8546
8547 Register NewDstReg = MRI.createVirtualRegister(RegClass: NewDstRC);
8548 MRI.replaceRegWith(FromReg: DstReg, ToReg: NewDstReg);
8549 legalizeOperands(MI&: Inst, MDT);
8550 addUsersToMoveToVALUWorklist(Reg: NewDstReg, MRI, Worklist);
8551 return;
8552 }
8553
8554 // Use the new VALU Opcode.
8555 auto NewInstr = BuildMI(BB&: *MBB, I&: Inst, MIMD: Inst.getDebugLoc(), MCID: get(Opcode: NewOpcode))
8556 .setMIFlags(Inst.getFlags());
8557 if (isVOP3(Opcode: NewOpcode) && !isVOP3(Opcode)) {
8558 // Intersperse VOP3 modifiers among the SALU operands.
8559 NewInstr->addOperand(Op: Inst.getOperand(i: 0));
8560 if (AMDGPU::getNamedOperandIdx(Opcode: NewOpcode,
8561 Name: AMDGPU::OpName::src0_modifiers) >= 0)
8562 NewInstr.addImm(Val: 0);
8563 if (AMDGPU::hasNamedOperand(Opcode: NewOpcode, NamedIdx: AMDGPU::OpName::src0)) {
8564 const MachineOperand &Src = Inst.getOperand(i: 1);
8565 NewInstr->addOperand(Op: Src);
8566 }
8567
8568 if (Opcode == AMDGPU::S_SEXT_I32_I8 || Opcode == AMDGPU::S_SEXT_I32_I16) {
8569 // We are converting these to a BFE, so we need to add the missing
8570 // operands for the size and offset.
8571 unsigned Size = (Opcode == AMDGPU::S_SEXT_I32_I8) ? 8 : 16;
8572 NewInstr.addImm(Val: 0);
8573 NewInstr.addImm(Val: Size);
8574 } else if (Opcode == AMDGPU::S_BCNT1_I32_B32) {
8575 // The VALU version adds the second operand to the result, so insert an
8576 // extra 0 operand.
8577 NewInstr.addImm(Val: 0);
8578 } else if (Opcode == AMDGPU::S_BFE_I32 || Opcode == AMDGPU::S_BFE_U32) {
8579 const MachineOperand &OffsetWidthOp = Inst.getOperand(i: 2);
8580 // If we need to move this to VGPRs, we need to unpack the second
8581 // operand back into the 2 separate ones for bit offset and width.
8582 assert(OffsetWidthOp.isImm() &&
8583 "Scalar BFE is only implemented for constant width and offset");
8584 uint32_t Imm = OffsetWidthOp.getImm();
8585
8586 uint32_t Offset = Imm & 0x3f; // Extract bits [5:0].
8587 uint32_t BitWidth = (Imm & 0x7f0000) >> 16; // Extract bits [22:16].
8588 NewInstr.addImm(Val: Offset);
8589 NewInstr.addImm(Val: BitWidth);
8590 } else {
8591 if (AMDGPU::getNamedOperandIdx(Opcode: NewOpcode,
8592 Name: AMDGPU::OpName::src1_modifiers) >= 0)
8593 NewInstr.addImm(Val: 0);
8594 if (AMDGPU::getNamedOperandIdx(Opcode: NewOpcode, Name: AMDGPU::OpName::src1) >= 0)
8595 NewInstr->addOperand(Op: Inst.getOperand(i: 2));
8596 if (AMDGPU::getNamedOperandIdx(Opcode: NewOpcode,
8597 Name: AMDGPU::OpName::src2_modifiers) >= 0)
8598 NewInstr.addImm(Val: 0);
8599 if (AMDGPU::getNamedOperandIdx(Opcode: NewOpcode, Name: AMDGPU::OpName::src2) >= 0)
8600 NewInstr->addOperand(Op: Inst.getOperand(i: 3));
8601 if (AMDGPU::getNamedOperandIdx(Opcode: NewOpcode, Name: AMDGPU::OpName::clamp) >= 0)
8602 NewInstr.addImm(Val: 0);
8603 if (AMDGPU::getNamedOperandIdx(Opcode: NewOpcode, Name: AMDGPU::OpName::omod) >= 0)
8604 NewInstr.addImm(Val: 0);
8605 if (AMDGPU::getNamedOperandIdx(Opcode: NewOpcode, Name: AMDGPU::OpName::op_sel) >= 0)
8606 NewInstr.addImm(Val: 0);
8607 }
8608 } else {
8609 // Just copy the SALU operands.
8610 for (const MachineOperand &Op : Inst.explicit_operands())
8611 NewInstr->addOperand(Op);
8612 }
8613
8614 // Remove any references to SCC. Vector instructions can't read from it, and
8615 // We're just about to add the implicit use / defs of VCC, and we don't want
8616 // both.
8617 for (MachineOperand &Op : Inst.implicit_operands()) {
8618 if (Op.getReg() == AMDGPU::SCC) {
8619 // Only propagate through live-def of SCC.
8620 if (Op.isDef() && !Op.isDead())
8621 addSCCDefUsersToVALUWorklist(Op, SCCDefInst&: Inst, Worklist);
8622 if (Op.isUse())
8623 addSCCDefsToVALUWorklist(SCCUseInst: NewInstr, Worklist);
8624 }
8625 }
8626 Inst.eraseFromParent();
8627 Register NewDstReg;
8628 if (NewInstr->getOperand(i: 0).isReg() && NewInstr->getOperand(i: 0).isDef()) {
8629 Register DstReg = NewInstr->getOperand(i: 0).getReg();
8630 assert(DstReg.isVirtual());
8631 // Update the destination register class.
8632 const TargetRegisterClass *NewDstRC = getDestEquivalentVGPRClass(Inst: *NewInstr);
8633 assert(NewDstRC);
8634 NewDstReg = MRI.createVirtualRegister(RegClass: NewDstRC);
8635 MRI.replaceRegWith(FromReg: DstReg, ToReg: NewDstReg);
8636 }
8637 fixImplicitOperands(MI&: *NewInstr);
8638
8639 legalizeOperandsVALUt16(MI&: *NewInstr, MRI);
8640
8641 // Legalize the operands
8642 legalizeOperands(MI&: *NewInstr, MDT);
8643 if (NewDstReg)
8644 addUsersToMoveToVALUWorklist(Reg: NewDstReg, MRI, Worklist);
8645}
8646
8647// Add/sub require special handling to deal with carry outs.
8648std::pair<bool, MachineBasicBlock *>
8649SIInstrInfo::moveScalarAddSub(SIInstrWorklist &Worklist, MachineInstr &Inst,
8650 MachineDominatorTree *MDT) const {
8651 if (ST.hasAddNoCarryInsts()) {
8652 // Assume there is no user of scc since we don't select this in that case.
8653 // Since scc isn't used, it doesn't really matter if the i32 or u32 variant
8654 // is used.
8655
8656 MachineBasicBlock &MBB = *Inst.getParent();
8657 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
8658
8659 Register OldDstReg = Inst.getOperand(i: 0).getReg();
8660 Register ResultReg = MRI.createVirtualRegister(RegClass: &AMDGPU::VGPR_32RegClass);
8661
8662 unsigned Opc = Inst.getOpcode();
8663 assert(Opc == AMDGPU::S_ADD_I32 || Opc == AMDGPU::S_SUB_I32);
8664
8665 unsigned NewOpc = Opc == AMDGPU::S_ADD_I32 ?
8666 AMDGPU::V_ADD_U32_e64 : AMDGPU::V_SUB_U32_e64;
8667
8668 assert(Inst.getOperand(3).getReg() == AMDGPU::SCC);
8669 Inst.removeOperand(OpNo: 3);
8670
8671 Inst.setDesc(get(Opcode: NewOpc));
8672 Inst.addOperand(Op: MachineOperand::CreateImm(Val: 0)); // clamp bit
8673 Inst.addImplicitDefUseOperands(MF&: *MBB.getParent());
8674 MRI.replaceRegWith(FromReg: OldDstReg, ToReg: ResultReg);
8675 MachineBasicBlock *NewBB = legalizeOperands(MI&: Inst, MDT);
8676
8677 addUsersToMoveToVALUWorklist(Reg: ResultReg, MRI, Worklist);
8678 return std::pair(true, NewBB);
8679 }
8680
8681 return std::pair(false, nullptr);
8682}
8683
8684void SIInstrInfo::lowerSelect(SIInstrWorklist &Worklist, MachineInstr &Inst,
8685 MachineDominatorTree *MDT) const {
8686
8687 MachineBasicBlock &MBB = *Inst.getParent();
8688 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
8689 MachineBasicBlock::iterator MII = Inst;
8690 const DebugLoc &DL = Inst.getDebugLoc();
8691
8692 MachineOperand &Dest = Inst.getOperand(i: 0);
8693 MachineOperand &Src0 = Inst.getOperand(i: 1);
8694 MachineOperand &Src1 = Inst.getOperand(i: 2);
8695 MachineOperand &Cond = Inst.getOperand(i: 3);
8696
8697 Register CondReg = Cond.getReg();
8698 bool IsSCC = (CondReg == AMDGPU::SCC);
8699
8700 // If this is a trivial select where the condition is effectively not SCC
8701 // (CondReg is a source of copy to SCC), then the select is semantically
8702 // equivalent to copying CondReg. Hence, there is no need to create
8703 // V_CNDMASK, we can just use that and bail out.
8704 if (!IsSCC && Src0.isImm() && (Src0.getImm() == -1) && Src1.isImm() &&
8705 (Src1.getImm() == 0)) {
8706 MRI.replaceRegWith(FromReg: Dest.getReg(), ToReg: CondReg);
8707 return;
8708 }
8709
8710 Register NewCondReg = CondReg;
8711 if (IsSCC) {
8712 const TargetRegisterClass *TC = RI.getWaveMaskRegClass();
8713 NewCondReg = MRI.createVirtualRegister(RegClass: TC);
8714
8715 // Now look for the closest SCC def if it is a copy
8716 // replacing the CondReg with the COPY source register
8717 bool CopyFound = false;
8718 for (MachineInstr &CandI :
8719 make_range(x: std::next(x: MachineBasicBlock::reverse_iterator(Inst)),
8720 y: Inst.getParent()->rend())) {
8721 if (CandI.findRegisterDefOperandIdx(Reg: AMDGPU::SCC, TRI: &RI, isDead: false, Overlap: false) !=
8722 -1) {
8723 if (CandI.isCopy() && CandI.getOperand(i: 0).getReg() == AMDGPU::SCC) {
8724 BuildMI(BB&: MBB, I: MII, MIMD: DL, MCID: get(Opcode: AMDGPU::COPY), DestReg: NewCondReg)
8725 .addReg(RegNo: CandI.getOperand(i: 1).getReg());
8726 CopyFound = true;
8727 }
8728 break;
8729 }
8730 }
8731 if (!CopyFound) {
8732 // SCC def is not a copy
8733 // Insert a trivial select instead of creating a copy, because a copy from
8734 // SCC would semantically mean just copying a single bit, but we may need
8735 // the result to be a vector condition mask that needs preserving.
8736 unsigned Opcode =
8737 ST.isWave64() ? AMDGPU::S_CSELECT_B64 : AMDGPU::S_CSELECT_B32;
8738 auto NewSelect =
8739 BuildMI(BB&: MBB, I: MII, MIMD: DL, MCID: get(Opcode), DestReg: NewCondReg).addImm(Val: -1).addImm(Val: 0);
8740 NewSelect->getOperand(i: 3).setIsUndef(Cond.isUndef());
8741 }
8742 }
8743
8744 Register NewDestReg = MRI.createVirtualRegister(
8745 RegClass: RI.getEquivalentVGPRClass(SRC: MRI.getRegClass(Reg: Dest.getReg())));
8746 MachineInstr *NewInst;
8747 if (Inst.getOpcode() == AMDGPU::S_CSELECT_B32) {
8748 NewInst = BuildMI(BB&: MBB, I: MII, MIMD: DL, MCID: get(Opcode: AMDGPU::V_CNDMASK_B32_e64), DestReg: NewDestReg)
8749 .addImm(Val: 0)
8750 .add(MO: Src1) // False
8751 .addImm(Val: 0)
8752 .add(MO: Src0) // True
8753 .addReg(RegNo: NewCondReg);
8754 } else {
8755 NewInst =
8756 BuildMI(BB&: MBB, I: MII, MIMD: DL, MCID: get(Opcode: AMDGPU::V_CNDMASK_B64_PSEUDO), DestReg: NewDestReg)
8757 .add(MO: Src1) // False
8758 .add(MO: Src0) // True
8759 .addReg(RegNo: NewCondReg);
8760 }
8761 MRI.replaceRegWith(FromReg: Dest.getReg(), ToReg: NewDestReg);
8762 legalizeOperands(MI&: *NewInst, MDT);
8763 addUsersToMoveToVALUWorklist(Reg: NewDestReg, MRI, Worklist);
8764}
8765
8766void SIInstrInfo::lowerScalarAbs(SIInstrWorklist &Worklist,
8767 MachineInstr &Inst) const {
8768 MachineBasicBlock &MBB = *Inst.getParent();
8769 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
8770 MachineBasicBlock::iterator MII = Inst;
8771 const DebugLoc &DL = Inst.getDebugLoc();
8772
8773 MachineOperand &Dest = Inst.getOperand(i: 0);
8774 MachineOperand &Src = Inst.getOperand(i: 1);
8775 Register TmpReg = MRI.createVirtualRegister(RegClass: &AMDGPU::VGPR_32RegClass);
8776 Register ResultReg = MRI.createVirtualRegister(RegClass: &AMDGPU::VGPR_32RegClass);
8777
8778 unsigned SubOp = ST.hasAddNoCarryInsts() ? AMDGPU::V_SUB_U32_e32
8779 : AMDGPU::V_SUB_CO_U32_e32;
8780
8781 BuildMI(BB&: MBB, I: MII, MIMD: DL, MCID: get(Opcode: SubOp), DestReg: TmpReg)
8782 .addImm(Val: 0)
8783 .addReg(RegNo: Src.getReg());
8784
8785 BuildMI(BB&: MBB, I: MII, MIMD: DL, MCID: get(Opcode: AMDGPU::V_MAX_I32_e64), DestReg: ResultReg)
8786 .addReg(RegNo: Src.getReg())
8787 .addReg(RegNo: TmpReg);
8788
8789 MRI.replaceRegWith(FromReg: Dest.getReg(), ToReg: ResultReg);
8790 addUsersToMoveToVALUWorklist(Reg: ResultReg, MRI, Worklist);
8791}
8792
8793void SIInstrInfo::lowerScalarAbsDiff(SIInstrWorklist &Worklist,
8794 MachineInstr &Inst) const {
8795 MachineBasicBlock &MBB = *Inst.getParent();
8796 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
8797 MachineBasicBlock::iterator MII = Inst;
8798 const DebugLoc &DL = Inst.getDebugLoc();
8799
8800 MachineOperand &Dest = Inst.getOperand(i: 0);
8801 MachineOperand &Src1 = Inst.getOperand(i: 1);
8802 MachineOperand &Src2 = Inst.getOperand(i: 2);
8803 Register SubResultReg = MRI.createVirtualRegister(RegClass: &AMDGPU::VGPR_32RegClass);
8804 Register TmpReg = MRI.createVirtualRegister(RegClass: &AMDGPU::VGPR_32RegClass);
8805 Register ResultReg = MRI.createVirtualRegister(RegClass: &AMDGPU::VGPR_32RegClass);
8806
8807 unsigned SubOp = ST.hasAddNoCarryInsts() ? AMDGPU::V_SUB_U32_e32
8808 : AMDGPU::V_SUB_CO_U32_e32;
8809
8810 BuildMI(BB&: MBB, I: MII, MIMD: DL, MCID: get(Opcode: SubOp), DestReg: SubResultReg)
8811 .addReg(RegNo: Src1.getReg())
8812 .addReg(RegNo: Src2.getReg());
8813
8814 BuildMI(BB&: MBB, I: MII, MIMD: DL, MCID: get(Opcode: SubOp), DestReg: TmpReg).addImm(Val: 0).addReg(RegNo: SubResultReg);
8815
8816 BuildMI(BB&: MBB, I: MII, MIMD: DL, MCID: get(Opcode: AMDGPU::V_MAX_I32_e64), DestReg: ResultReg)
8817 .addReg(RegNo: SubResultReg)
8818 .addReg(RegNo: TmpReg);
8819
8820 MRI.replaceRegWith(FromReg: Dest.getReg(), ToReg: ResultReg);
8821 addUsersToMoveToVALUWorklist(Reg: ResultReg, MRI, Worklist);
8822}
8823
8824void SIInstrInfo::lowerScalarXnor(SIInstrWorklist &Worklist,
8825 MachineInstr &Inst) const {
8826 MachineBasicBlock &MBB = *Inst.getParent();
8827 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
8828 MachineBasicBlock::iterator MII = Inst;
8829 const DebugLoc &DL = Inst.getDebugLoc();
8830
8831 MachineOperand &Dest = Inst.getOperand(i: 0);
8832 MachineOperand &Src0 = Inst.getOperand(i: 1);
8833 MachineOperand &Src1 = Inst.getOperand(i: 2);
8834
8835 if (ST.hasDLInsts()) {
8836 Register NewDest = MRI.createVirtualRegister(RegClass: &AMDGPU::VGPR_32RegClass);
8837 legalizeGenericOperand(InsertMBB&: MBB, I: MII, DstRC: &AMDGPU::VGPR_32RegClass, Op&: Src0, MRI, DL);
8838 legalizeGenericOperand(InsertMBB&: MBB, I: MII, DstRC: &AMDGPU::VGPR_32RegClass, Op&: Src1, MRI, DL);
8839
8840 BuildMI(BB&: MBB, I: MII, MIMD: DL, MCID: get(Opcode: AMDGPU::V_XNOR_B32_e64), DestReg: NewDest)
8841 .add(MO: Src0)
8842 .add(MO: Src1);
8843
8844 MRI.replaceRegWith(FromReg: Dest.getReg(), ToReg: NewDest);
8845 addUsersToMoveToVALUWorklist(Reg: NewDest, MRI, Worklist);
8846 } else {
8847 // Using the identity !(x ^ y) == (!x ^ y) == (x ^ !y), we can
8848 // invert either source and then perform the XOR. If either source is a
8849 // scalar register, then we can leave the inversion on the scalar unit to
8850 // achieve a better distribution of scalar and vector instructions.
8851 bool Src0IsSGPR = Src0.isReg() &&
8852 RI.isSGPRClass(RC: MRI.getRegClass(Reg: Src0.getReg()));
8853 bool Src1IsSGPR = Src1.isReg() &&
8854 RI.isSGPRClass(RC: MRI.getRegClass(Reg: Src1.getReg()));
8855 MachineInstr *Xor;
8856 Register Temp = MRI.createVirtualRegister(RegClass: &AMDGPU::SReg_32RegClass);
8857 Register NewDest = MRI.createVirtualRegister(RegClass: &AMDGPU::SReg_32RegClass);
8858
8859 // Build a pair of scalar instructions and add them to the work list.
8860 // The next iteration over the work list will lower these to the vector
8861 // unit as necessary.
8862 if (Src0IsSGPR) {
8863 BuildMI(BB&: MBB, I: MII, MIMD: DL, MCID: get(Opcode: AMDGPU::S_NOT_B32), DestReg: Temp).add(MO: Src0);
8864 Xor = BuildMI(BB&: MBB, I: MII, MIMD: DL, MCID: get(Opcode: AMDGPU::S_XOR_B32), DestReg: NewDest)
8865 .addReg(RegNo: Temp)
8866 .add(MO: Src1);
8867 } else if (Src1IsSGPR) {
8868 BuildMI(BB&: MBB, I: MII, MIMD: DL, MCID: get(Opcode: AMDGPU::S_NOT_B32), DestReg: Temp).add(MO: Src1);
8869 Xor = BuildMI(BB&: MBB, I: MII, MIMD: DL, MCID: get(Opcode: AMDGPU::S_XOR_B32), DestReg: NewDest)
8870 .add(MO: Src0)
8871 .addReg(RegNo: Temp);
8872 } else {
8873 Xor = BuildMI(BB&: MBB, I: MII, MIMD: DL, MCID: get(Opcode: AMDGPU::S_XOR_B32), DestReg: Temp)
8874 .add(MO: Src0)
8875 .add(MO: Src1);
8876 MachineInstr *Not =
8877 BuildMI(BB&: MBB, I: MII, MIMD: DL, MCID: get(Opcode: AMDGPU::S_NOT_B32), DestReg: NewDest).addReg(RegNo: Temp);
8878 Worklist.insert(MI: Not);
8879 }
8880
8881 MRI.replaceRegWith(FromReg: Dest.getReg(), ToReg: NewDest);
8882
8883 Worklist.insert(MI: Xor);
8884
8885 addUsersToMoveToVALUWorklist(Reg: NewDest, MRI, Worklist);
8886 }
8887}
8888
8889void SIInstrInfo::splitScalarNotBinop(SIInstrWorklist &Worklist,
8890 MachineInstr &Inst,
8891 unsigned Opcode) const {
8892 MachineBasicBlock &MBB = *Inst.getParent();
8893 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
8894 MachineBasicBlock::iterator MII = Inst;
8895 const DebugLoc &DL = Inst.getDebugLoc();
8896
8897 MachineOperand &Dest = Inst.getOperand(i: 0);
8898 MachineOperand &Src0 = Inst.getOperand(i: 1);
8899 MachineOperand &Src1 = Inst.getOperand(i: 2);
8900
8901 Register NewDest = MRI.createVirtualRegister(RegClass: &AMDGPU::SReg_32RegClass);
8902 Register Interm = MRI.createVirtualRegister(RegClass: &AMDGPU::SReg_32RegClass);
8903
8904 MachineInstr &Op = *BuildMI(BB&: MBB, I: MII, MIMD: DL, MCID: get(Opcode), DestReg: Interm)
8905 .add(MO: Src0)
8906 .add(MO: Src1);
8907
8908 MachineInstr &Not = *BuildMI(BB&: MBB, I: MII, MIMD: DL, MCID: get(Opcode: AMDGPU::S_NOT_B32), DestReg: NewDest)
8909 .addReg(RegNo: Interm);
8910
8911 Worklist.insert(MI: &Op);
8912 Worklist.insert(MI: &Not);
8913
8914 MRI.replaceRegWith(FromReg: Dest.getReg(), ToReg: NewDest);
8915 addUsersToMoveToVALUWorklist(Reg: NewDest, MRI, Worklist);
8916}
8917
8918void SIInstrInfo::splitScalarBinOpN2(SIInstrWorklist &Worklist,
8919 MachineInstr &Inst,
8920 unsigned Opcode) const {
8921 MachineBasicBlock &MBB = *Inst.getParent();
8922 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
8923 MachineBasicBlock::iterator MII = Inst;
8924 const DebugLoc &DL = Inst.getDebugLoc();
8925
8926 MachineOperand &Dest = Inst.getOperand(i: 0);
8927 MachineOperand &Src0 = Inst.getOperand(i: 1);
8928 MachineOperand &Src1 = Inst.getOperand(i: 2);
8929
8930 Register NewDest = MRI.createVirtualRegister(RegClass: &AMDGPU::SReg_32_XM0RegClass);
8931 Register Interm = MRI.createVirtualRegister(RegClass: &AMDGPU::SReg_32_XM0RegClass);
8932
8933 MachineInstr &Not = *BuildMI(BB&: MBB, I: MII, MIMD: DL, MCID: get(Opcode: AMDGPU::S_NOT_B32), DestReg: Interm)
8934 .add(MO: Src1);
8935
8936 MachineInstr &Op = *BuildMI(BB&: MBB, I: MII, MIMD: DL, MCID: get(Opcode), DestReg: NewDest)
8937 .add(MO: Src0)
8938 .addReg(RegNo: Interm);
8939
8940 Worklist.insert(MI: &Not);
8941 Worklist.insert(MI: &Op);
8942
8943 MRI.replaceRegWith(FromReg: Dest.getReg(), ToReg: NewDest);
8944 addUsersToMoveToVALUWorklist(Reg: NewDest, MRI, Worklist);
8945}
8946
8947void SIInstrInfo::splitScalar64BitUnaryOp(SIInstrWorklist &Worklist,
8948 MachineInstr &Inst, unsigned Opcode,
8949 bool Swap) const {
8950 MachineBasicBlock &MBB = *Inst.getParent();
8951 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
8952
8953 MachineOperand &Dest = Inst.getOperand(i: 0);
8954 MachineOperand &Src0 = Inst.getOperand(i: 1);
8955 const DebugLoc &DL = Inst.getDebugLoc();
8956
8957 MachineBasicBlock::iterator MII = Inst;
8958
8959 const MCInstrDesc &InstDesc = get(Opcode);
8960 const TargetRegisterClass *Src0RC = Src0.isReg() ?
8961 MRI.getRegClass(Reg: Src0.getReg()) :
8962 &AMDGPU::SGPR_32RegClass;
8963
8964 const TargetRegisterClass *Src0SubRC =
8965 RI.getSubRegisterClass(Src0RC, AMDGPU::sub0);
8966
8967 MachineOperand SrcReg0Sub0 = buildExtractSubRegOrImm(MII, MRI, Op: Src0, SuperRC: Src0RC,
8968 SubIdx: AMDGPU::sub0, SubRC: Src0SubRC);
8969
8970 const TargetRegisterClass *DestRC = MRI.getRegClass(Reg: Dest.getReg());
8971 const TargetRegisterClass *NewDestRC = RI.getEquivalentVGPRClass(SRC: DestRC);
8972 const TargetRegisterClass *NewDestSubRC =
8973 RI.getSubRegisterClass(NewDestRC, AMDGPU::sub0);
8974
8975 Register DestSub0 = MRI.createVirtualRegister(RegClass: NewDestSubRC);
8976 MachineInstr &LoHalf = *BuildMI(BB&: MBB, I: MII, MIMD: DL, MCID: InstDesc, DestReg: DestSub0).add(MO: SrcReg0Sub0);
8977
8978 MachineOperand SrcReg0Sub1 = buildExtractSubRegOrImm(MII, MRI, Op: Src0, SuperRC: Src0RC,
8979 SubIdx: AMDGPU::sub1, SubRC: Src0SubRC);
8980
8981 Register DestSub1 = MRI.createVirtualRegister(RegClass: NewDestSubRC);
8982 MachineInstr &HiHalf = *BuildMI(BB&: MBB, I: MII, MIMD: DL, MCID: InstDesc, DestReg: DestSub1).add(MO: SrcReg0Sub1);
8983
8984 if (Swap)
8985 std::swap(a&: DestSub0, b&: DestSub1);
8986
8987 Register FullDestReg = MRI.createVirtualRegister(RegClass: NewDestRC);
8988 BuildMI(BB&: MBB, I: MII, MIMD: DL, MCID: get(Opcode: TargetOpcode::REG_SEQUENCE), DestReg: FullDestReg)
8989 .addReg(RegNo: DestSub0)
8990 .addImm(Val: AMDGPU::sub0)
8991 .addReg(RegNo: DestSub1)
8992 .addImm(Val: AMDGPU::sub1);
8993
8994 MRI.replaceRegWith(FromReg: Dest.getReg(), ToReg: FullDestReg);
8995
8996 Worklist.insert(MI: &LoHalf);
8997 Worklist.insert(MI: &HiHalf);
8998
8999 // We don't need to legalizeOperands here because for a single operand, src0
9000 // will support any kind of input.
9001
9002 // Move all users of this moved value.
9003 addUsersToMoveToVALUWorklist(Reg: FullDestReg, MRI, Worklist);
9004}
9005
9006// There is not a vector equivalent of s_mul_u64. For this reason, we need to
9007// split the s_mul_u64 in 32-bit vector multiplications.
9008void SIInstrInfo::splitScalarSMulU64(SIInstrWorklist &Worklist,
9009 MachineInstr &Inst,
9010 MachineDominatorTree *MDT) const {
9011 MachineBasicBlock &MBB = *Inst.getParent();
9012 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
9013
9014 Register FullDestReg = MRI.createVirtualRegister(RegClass: &AMDGPU::VReg_64RegClass);
9015 Register DestSub0 = MRI.createVirtualRegister(RegClass: &AMDGPU::VGPR_32RegClass);
9016 Register DestSub1 = MRI.createVirtualRegister(RegClass: &AMDGPU::VGPR_32RegClass);
9017
9018 MachineOperand &Dest = Inst.getOperand(i: 0);
9019 MachineOperand &Src0 = Inst.getOperand(i: 1);
9020 MachineOperand &Src1 = Inst.getOperand(i: 2);
9021 const DebugLoc &DL = Inst.getDebugLoc();
9022 MachineBasicBlock::iterator MII = Inst;
9023
9024 const TargetRegisterClass *Src0RC = MRI.getRegClass(Reg: Src0.getReg());
9025 const TargetRegisterClass *Src1RC = MRI.getRegClass(Reg: Src1.getReg());
9026 const TargetRegisterClass *Src0SubRC =
9027 RI.getSubRegisterClass(Src0RC, AMDGPU::sub0);
9028 if (RI.isSGPRClass(RC: Src0SubRC))
9029 Src0SubRC = RI.getEquivalentVGPRClass(SRC: Src0SubRC);
9030 const TargetRegisterClass *Src1SubRC =
9031 RI.getSubRegisterClass(Src1RC, AMDGPU::sub0);
9032 if (RI.isSGPRClass(RC: Src1SubRC))
9033 Src1SubRC = RI.getEquivalentVGPRClass(SRC: Src1SubRC);
9034
9035 // First, we extract the low 32-bit and high 32-bit values from each of the
9036 // operands.
9037 MachineOperand Op0L =
9038 buildExtractSubRegOrImm(MII, MRI, Op: Src0, SuperRC: Src0RC, SubIdx: AMDGPU::sub0, SubRC: Src0SubRC);
9039 MachineOperand Op1L =
9040 buildExtractSubRegOrImm(MII, MRI, Op: Src1, SuperRC: Src1RC, SubIdx: AMDGPU::sub0, SubRC: Src1SubRC);
9041 MachineOperand Op0H =
9042 buildExtractSubRegOrImm(MII, MRI, Op: Src0, SuperRC: Src0RC, SubIdx: AMDGPU::sub1, SubRC: Src0SubRC);
9043 MachineOperand Op1H =
9044 buildExtractSubRegOrImm(MII, MRI, Op: Src1, SuperRC: Src1RC, SubIdx: AMDGPU::sub1, SubRC: Src1SubRC);
9045
9046 // The multilication is done as follows:
9047 //
9048 // Op1H Op1L
9049 // * Op0H Op0L
9050 // --------------------
9051 // Op1H*Op0L Op1L*Op0L
9052 // + Op1H*Op0H Op1L*Op0H
9053 // -----------------------------------------
9054 // (Op1H*Op0L + Op1L*Op0H + carry) Op1L*Op0L
9055 //
9056 // We drop Op1H*Op0H because the result of the multiplication is a 64-bit
9057 // value and that would overflow.
9058 // The low 32-bit value is Op1L*Op0L.
9059 // The high 32-bit value is Op1H*Op0L + Op1L*Op0H + carry (from Op1L*Op0L).
9060
9061 Register Op1L_Op0H_Reg = MRI.createVirtualRegister(RegClass: &AMDGPU::VGPR_32RegClass);
9062 MachineInstr *Op1L_Op0H =
9063 BuildMI(BB&: MBB, I: MII, MIMD: DL, MCID: get(Opcode: AMDGPU::V_MUL_LO_U32_e64), DestReg: Op1L_Op0H_Reg)
9064 .add(MO: Op1L)
9065 .add(MO: Op0H);
9066
9067 Register Op1H_Op0L_Reg = MRI.createVirtualRegister(RegClass: &AMDGPU::VGPR_32RegClass);
9068 MachineInstr *Op1H_Op0L =
9069 BuildMI(BB&: MBB, I: MII, MIMD: DL, MCID: get(Opcode: AMDGPU::V_MUL_LO_U32_e64), DestReg: Op1H_Op0L_Reg)
9070 .add(MO: Op1H)
9071 .add(MO: Op0L);
9072
9073 Register CarryReg = MRI.createVirtualRegister(RegClass: &AMDGPU::VGPR_32RegClass);
9074 MachineInstr *Carry =
9075 BuildMI(BB&: MBB, I: MII, MIMD: DL, MCID: get(Opcode: AMDGPU::V_MUL_HI_U32_e64), DestReg: CarryReg)
9076 .add(MO: Op1L)
9077 .add(MO: Op0L);
9078
9079 MachineInstr *LoHalf =
9080 BuildMI(BB&: MBB, I: MII, MIMD: DL, MCID: get(Opcode: AMDGPU::V_MUL_LO_U32_e64), DestReg: DestSub0)
9081 .add(MO: Op1L)
9082 .add(MO: Op0L);
9083
9084 Register AddReg = MRI.createVirtualRegister(RegClass: &AMDGPU::VGPR_32RegClass);
9085 MachineInstr *Add = BuildMI(BB&: MBB, I: MII, MIMD: DL, MCID: get(Opcode: AMDGPU::V_ADD_U32_e32), DestReg: AddReg)
9086 .addReg(RegNo: Op1L_Op0H_Reg)
9087 .addReg(RegNo: Op1H_Op0L_Reg);
9088
9089 MachineInstr *HiHalf =
9090 BuildMI(BB&: MBB, I: MII, MIMD: DL, MCID: get(Opcode: AMDGPU::V_ADD_U32_e32), DestReg: DestSub1)
9091 .addReg(RegNo: AddReg)
9092 .addReg(RegNo: CarryReg);
9093
9094 BuildMI(BB&: MBB, I: MII, MIMD: DL, MCID: get(Opcode: TargetOpcode::REG_SEQUENCE), DestReg: FullDestReg)
9095 .addReg(RegNo: DestSub0)
9096 .addImm(Val: AMDGPU::sub0)
9097 .addReg(RegNo: DestSub1)
9098 .addImm(Val: AMDGPU::sub1);
9099
9100 MRI.replaceRegWith(FromReg: Dest.getReg(), ToReg: FullDestReg);
9101
9102 // Try to legalize the operands in case we need to swap the order to keep it
9103 // valid.
9104 legalizeOperands(MI&: *Op1L_Op0H, MDT);
9105 legalizeOperands(MI&: *Op1H_Op0L, MDT);
9106 legalizeOperands(MI&: *Carry, MDT);
9107 legalizeOperands(MI&: *LoHalf, MDT);
9108 legalizeOperands(MI&: *Add, MDT);
9109 legalizeOperands(MI&: *HiHalf, MDT);
9110
9111 // Move all users of this moved value.
9112 addUsersToMoveToVALUWorklist(Reg: FullDestReg, MRI, Worklist);
9113}
9114
9115// Lower S_MUL_U64_U32_PSEUDO/S_MUL_I64_I32_PSEUDO in two 32-bit vector
9116// multiplications.
9117void SIInstrInfo::splitScalarSMulPseudo(SIInstrWorklist &Worklist,
9118 MachineInstr &Inst,
9119 MachineDominatorTree *MDT) const {
9120 MachineBasicBlock &MBB = *Inst.getParent();
9121 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
9122
9123 Register FullDestReg = MRI.createVirtualRegister(RegClass: &AMDGPU::VReg_64RegClass);
9124 Register DestSub0 = MRI.createVirtualRegister(RegClass: &AMDGPU::VGPR_32RegClass);
9125 Register DestSub1 = MRI.createVirtualRegister(RegClass: &AMDGPU::VGPR_32RegClass);
9126
9127 MachineOperand &Dest = Inst.getOperand(i: 0);
9128 MachineOperand &Src0 = Inst.getOperand(i: 1);
9129 MachineOperand &Src1 = Inst.getOperand(i: 2);
9130 const DebugLoc &DL = Inst.getDebugLoc();
9131 MachineBasicBlock::iterator MII = Inst;
9132
9133 const TargetRegisterClass *Src0RC = MRI.getRegClass(Reg: Src0.getReg());
9134 const TargetRegisterClass *Src1RC = MRI.getRegClass(Reg: Src1.getReg());
9135 const TargetRegisterClass *Src0SubRC =
9136 RI.getSubRegisterClass(Src0RC, AMDGPU::sub0);
9137 if (RI.isSGPRClass(RC: Src0SubRC))
9138 Src0SubRC = RI.getEquivalentVGPRClass(SRC: Src0SubRC);
9139 const TargetRegisterClass *Src1SubRC =
9140 RI.getSubRegisterClass(Src1RC, AMDGPU::sub0);
9141 if (RI.isSGPRClass(RC: Src1SubRC))
9142 Src1SubRC = RI.getEquivalentVGPRClass(SRC: Src1SubRC);
9143
9144 // First, we extract the low 32-bit and high 32-bit values from each of the
9145 // operands.
9146 MachineOperand Op0L =
9147 buildExtractSubRegOrImm(MII, MRI, Op: Src0, SuperRC: Src0RC, SubIdx: AMDGPU::sub0, SubRC: Src0SubRC);
9148 MachineOperand Op1L =
9149 buildExtractSubRegOrImm(MII, MRI, Op: Src1, SuperRC: Src1RC, SubIdx: AMDGPU::sub0, SubRC: Src1SubRC);
9150
9151 unsigned Opc = Inst.getOpcode();
9152 unsigned NewOpc = Opc == AMDGPU::S_MUL_U64_U32_PSEUDO
9153 ? AMDGPU::V_MUL_HI_U32_e64
9154 : AMDGPU::V_MUL_HI_I32_e64;
9155 MachineInstr *HiHalf =
9156 BuildMI(BB&: MBB, I: MII, MIMD: DL, MCID: get(Opcode: NewOpc), DestReg: DestSub1).add(MO: Op1L).add(MO: Op0L);
9157
9158 MachineInstr *LoHalf =
9159 BuildMI(BB&: MBB, I: MII, MIMD: DL, MCID: get(Opcode: AMDGPU::V_MUL_LO_U32_e64), DestReg: DestSub0)
9160 .add(MO: Op1L)
9161 .add(MO: Op0L);
9162
9163 BuildMI(BB&: MBB, I: MII, MIMD: DL, MCID: get(Opcode: TargetOpcode::REG_SEQUENCE), DestReg: FullDestReg)
9164 .addReg(RegNo: DestSub0)
9165 .addImm(Val: AMDGPU::sub0)
9166 .addReg(RegNo: DestSub1)
9167 .addImm(Val: AMDGPU::sub1);
9168
9169 MRI.replaceRegWith(FromReg: Dest.getReg(), ToReg: FullDestReg);
9170
9171 // Try to legalize the operands in case we need to swap the order to keep it
9172 // valid.
9173 legalizeOperands(MI&: *HiHalf, MDT);
9174 legalizeOperands(MI&: *LoHalf, MDT);
9175
9176 // Move all users of this moved value.
9177 addUsersToMoveToVALUWorklist(Reg: FullDestReg, MRI, Worklist);
9178}
9179
9180void SIInstrInfo::splitScalar64BitBinaryOp(SIInstrWorklist &Worklist,
9181 MachineInstr &Inst, unsigned Opcode,
9182 MachineDominatorTree *MDT) const {
9183 MachineBasicBlock &MBB = *Inst.getParent();
9184 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
9185
9186 MachineOperand &Dest = Inst.getOperand(i: 0);
9187 MachineOperand &Src0 = Inst.getOperand(i: 1);
9188 MachineOperand &Src1 = Inst.getOperand(i: 2);
9189 const DebugLoc &DL = Inst.getDebugLoc();
9190
9191 MachineBasicBlock::iterator MII = Inst;
9192
9193 const MCInstrDesc &InstDesc = get(Opcode);
9194 const TargetRegisterClass *Src0RC = Src0.isReg() ?
9195 MRI.getRegClass(Reg: Src0.getReg()) :
9196 &AMDGPU::SGPR_32RegClass;
9197
9198 const TargetRegisterClass *Src0SubRC =
9199 RI.getSubRegisterClass(Src0RC, AMDGPU::sub0);
9200 const TargetRegisterClass *Src1RC = Src1.isReg() ?
9201 MRI.getRegClass(Reg: Src1.getReg()) :
9202 &AMDGPU::SGPR_32RegClass;
9203
9204 const TargetRegisterClass *Src1SubRC =
9205 RI.getSubRegisterClass(Src1RC, AMDGPU::sub0);
9206
9207 MachineOperand SrcReg0Sub0 = buildExtractSubRegOrImm(MII, MRI, Op: Src0, SuperRC: Src0RC,
9208 SubIdx: AMDGPU::sub0, SubRC: Src0SubRC);
9209 MachineOperand SrcReg1Sub0 = buildExtractSubRegOrImm(MII, MRI, Op: Src1, SuperRC: Src1RC,
9210 SubIdx: AMDGPU::sub0, SubRC: Src1SubRC);
9211 MachineOperand SrcReg0Sub1 = buildExtractSubRegOrImm(MII, MRI, Op: Src0, SuperRC: Src0RC,
9212 SubIdx: AMDGPU::sub1, SubRC: Src0SubRC);
9213 MachineOperand SrcReg1Sub1 = buildExtractSubRegOrImm(MII, MRI, Op: Src1, SuperRC: Src1RC,
9214 SubIdx: AMDGPU::sub1, SubRC: Src1SubRC);
9215
9216 const TargetRegisterClass *DestRC = MRI.getRegClass(Reg: Dest.getReg());
9217 const TargetRegisterClass *NewDestRC = RI.getEquivalentVGPRClass(SRC: DestRC);
9218 const TargetRegisterClass *NewDestSubRC =
9219 RI.getSubRegisterClass(NewDestRC, AMDGPU::sub0);
9220
9221 Register DestSub0 = MRI.createVirtualRegister(RegClass: NewDestSubRC);
9222 MachineInstr &LoHalf = *BuildMI(BB&: MBB, I: MII, MIMD: DL, MCID: InstDesc, DestReg: DestSub0)
9223 .add(MO: SrcReg0Sub0)
9224 .add(MO: SrcReg1Sub0);
9225
9226 Register DestSub1 = MRI.createVirtualRegister(RegClass: NewDestSubRC);
9227 MachineInstr &HiHalf = *BuildMI(BB&: MBB, I: MII, MIMD: DL, MCID: InstDesc, DestReg: DestSub1)
9228 .add(MO: SrcReg0Sub1)
9229 .add(MO: SrcReg1Sub1);
9230
9231 Register FullDestReg = MRI.createVirtualRegister(RegClass: NewDestRC);
9232 BuildMI(BB&: MBB, I: MII, MIMD: DL, MCID: get(Opcode: TargetOpcode::REG_SEQUENCE), DestReg: FullDestReg)
9233 .addReg(RegNo: DestSub0)
9234 .addImm(Val: AMDGPU::sub0)
9235 .addReg(RegNo: DestSub1)
9236 .addImm(Val: AMDGPU::sub1);
9237
9238 MRI.replaceRegWith(FromReg: Dest.getReg(), ToReg: FullDestReg);
9239
9240 Worklist.insert(MI: &LoHalf);
9241 Worklist.insert(MI: &HiHalf);
9242
9243 // Move all users of this moved value.
9244 addUsersToMoveToVALUWorklist(Reg: FullDestReg, MRI, Worklist);
9245}
9246
9247void SIInstrInfo::splitScalar64BitXnor(SIInstrWorklist &Worklist,
9248 MachineInstr &Inst,
9249 MachineDominatorTree *MDT) const {
9250 MachineBasicBlock &MBB = *Inst.getParent();
9251 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
9252
9253 MachineOperand &Dest = Inst.getOperand(i: 0);
9254 MachineOperand &Src0 = Inst.getOperand(i: 1);
9255 MachineOperand &Src1 = Inst.getOperand(i: 2);
9256 const DebugLoc &DL = Inst.getDebugLoc();
9257
9258 MachineBasicBlock::iterator MII = Inst;
9259
9260 const TargetRegisterClass *DestRC = MRI.getRegClass(Reg: Dest.getReg());
9261
9262 Register Interm = MRI.createVirtualRegister(RegClass: &AMDGPU::SReg_64RegClass);
9263
9264 MachineOperand* Op0;
9265 MachineOperand* Op1;
9266
9267 if (Src0.isReg() && RI.isSGPRReg(MRI, Reg: Src0.getReg())) {
9268 Op0 = &Src0;
9269 Op1 = &Src1;
9270 } else {
9271 Op0 = &Src1;
9272 Op1 = &Src0;
9273 }
9274
9275 BuildMI(BB&: MBB, I: MII, MIMD: DL, MCID: get(Opcode: AMDGPU::S_NOT_B64), DestReg: Interm)
9276 .add(MO: *Op0);
9277
9278 Register NewDest = MRI.createVirtualRegister(RegClass: DestRC);
9279
9280 MachineInstr &Xor = *BuildMI(BB&: MBB, I: MII, MIMD: DL, MCID: get(Opcode: AMDGPU::S_XOR_B64), DestReg: NewDest)
9281 .addReg(RegNo: Interm)
9282 .add(MO: *Op1);
9283
9284 MRI.replaceRegWith(FromReg: Dest.getReg(), ToReg: NewDest);
9285
9286 Worklist.insert(MI: &Xor);
9287}
9288
9289void SIInstrInfo::splitScalar64BitBCNT(SIInstrWorklist &Worklist,
9290 MachineInstr &Inst) const {
9291 MachineBasicBlock &MBB = *Inst.getParent();
9292 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
9293
9294 MachineBasicBlock::iterator MII = Inst;
9295 const DebugLoc &DL = Inst.getDebugLoc();
9296
9297 MachineOperand &Dest = Inst.getOperand(i: 0);
9298 MachineOperand &Src = Inst.getOperand(i: 1);
9299
9300 const MCInstrDesc &InstDesc = get(Opcode: AMDGPU::V_BCNT_U32_B32_e64);
9301 const TargetRegisterClass *SrcRC = Src.isReg() ?
9302 MRI.getRegClass(Reg: Src.getReg()) :
9303 &AMDGPU::SGPR_32RegClass;
9304
9305 Register MidReg = MRI.createVirtualRegister(RegClass: &AMDGPU::VGPR_32RegClass);
9306 Register ResultReg = MRI.createVirtualRegister(RegClass: &AMDGPU::VGPR_32RegClass);
9307
9308 const TargetRegisterClass *SrcSubRC =
9309 RI.getSubRegisterClass(SrcRC, AMDGPU::sub0);
9310
9311 MachineOperand SrcRegSub0 = buildExtractSubRegOrImm(MII, MRI, Op: Src, SuperRC: SrcRC,
9312 SubIdx: AMDGPU::sub0, SubRC: SrcSubRC);
9313 MachineOperand SrcRegSub1 = buildExtractSubRegOrImm(MII, MRI, Op: Src, SuperRC: SrcRC,
9314 SubIdx: AMDGPU::sub1, SubRC: SrcSubRC);
9315
9316 BuildMI(BB&: MBB, I: MII, MIMD: DL, MCID: InstDesc, DestReg: MidReg).add(MO: SrcRegSub0).addImm(Val: 0);
9317
9318 BuildMI(BB&: MBB, I: MII, MIMD: DL, MCID: InstDesc, DestReg: ResultReg).add(MO: SrcRegSub1).addReg(RegNo: MidReg);
9319
9320 MRI.replaceRegWith(FromReg: Dest.getReg(), ToReg: ResultReg);
9321
9322 // We don't need to legalize operands here. src0 for either instruction can be
9323 // an SGPR, and the second input is unused or determined here.
9324 addUsersToMoveToVALUWorklist(Reg: ResultReg, MRI, Worklist);
9325}
9326
9327void SIInstrInfo::splitScalar64BitBFE(SIInstrWorklist &Worklist,
9328 MachineInstr &Inst) const {
9329 MachineBasicBlock &MBB = *Inst.getParent();
9330 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
9331 MachineBasicBlock::iterator MII = Inst;
9332 const DebugLoc &DL = Inst.getDebugLoc();
9333
9334 MachineOperand &Dest = Inst.getOperand(i: 0);
9335 uint32_t Imm = Inst.getOperand(i: 2).getImm();
9336 uint32_t Offset = Imm & 0x3f; // Extract bits [5:0].
9337 uint32_t BitWidth = (Imm & 0x7f0000) >> 16; // Extract bits [22:16].
9338
9339 (void) Offset;
9340
9341 // Only sext_inreg cases handled.
9342 assert(Inst.getOpcode() == AMDGPU::S_BFE_I64 && BitWidth <= 32 &&
9343 Offset == 0 && "Not implemented");
9344
9345 if (BitWidth < 32) {
9346 Register MidRegLo = MRI.createVirtualRegister(RegClass: &AMDGPU::VGPR_32RegClass);
9347 Register MidRegHi = MRI.createVirtualRegister(RegClass: &AMDGPU::VGPR_32RegClass);
9348 Register ResultReg = MRI.createVirtualRegister(RegClass: &AMDGPU::VReg_64RegClass);
9349
9350 BuildMI(BB&: MBB, I: MII, MIMD: DL, MCID: get(Opcode: AMDGPU::V_BFE_I32_e64), DestReg: MidRegLo)
9351 .addReg(RegNo: Inst.getOperand(i: 1).getReg(), Flags: {}, SubReg: AMDGPU::sub0)
9352 .addImm(Val: 0)
9353 .addImm(Val: BitWidth);
9354
9355 BuildMI(BB&: MBB, I: MII, MIMD: DL, MCID: get(Opcode: AMDGPU::V_ASHRREV_I32_e32), DestReg: MidRegHi)
9356 .addImm(Val: 31)
9357 .addReg(RegNo: MidRegLo);
9358
9359 BuildMI(BB&: MBB, I: MII, MIMD: DL, MCID: get(Opcode: TargetOpcode::REG_SEQUENCE), DestReg: ResultReg)
9360 .addReg(RegNo: MidRegLo)
9361 .addImm(Val: AMDGPU::sub0)
9362 .addReg(RegNo: MidRegHi)
9363 .addImm(Val: AMDGPU::sub1);
9364
9365 MRI.replaceRegWith(FromReg: Dest.getReg(), ToReg: ResultReg);
9366 addUsersToMoveToVALUWorklist(Reg: ResultReg, MRI, Worklist);
9367 return;
9368 }
9369
9370 MachineOperand &Src = Inst.getOperand(i: 1);
9371 Register TmpReg = MRI.createVirtualRegister(RegClass: &AMDGPU::VGPR_32RegClass);
9372 Register ResultReg = MRI.createVirtualRegister(RegClass: &AMDGPU::VReg_64RegClass);
9373
9374 BuildMI(BB&: MBB, I: MII, MIMD: DL, MCID: get(Opcode: AMDGPU::V_ASHRREV_I32_e64), DestReg: TmpReg)
9375 .addImm(Val: 31)
9376 .addReg(RegNo: Src.getReg(), Flags: {}, SubReg: AMDGPU::sub0);
9377
9378 BuildMI(BB&: MBB, I: MII, MIMD: DL, MCID: get(Opcode: TargetOpcode::REG_SEQUENCE), DestReg: ResultReg)
9379 .addReg(RegNo: Src.getReg(), Flags: {}, SubReg: AMDGPU::sub0)
9380 .addImm(Val: AMDGPU::sub0)
9381 .addReg(RegNo: TmpReg)
9382 .addImm(Val: AMDGPU::sub1);
9383
9384 MRI.replaceRegWith(FromReg: Dest.getReg(), ToReg: ResultReg);
9385 addUsersToMoveToVALUWorklist(Reg: ResultReg, MRI, Worklist);
9386}
9387
9388void SIInstrInfo::splitScalar64BitCountOp(SIInstrWorklist &Worklist,
9389 MachineInstr &Inst, unsigned Opcode,
9390 MachineDominatorTree *MDT) const {
9391 // (S_FLBIT_I32_B64 hi:lo) ->
9392 // -> (umin (V_FFBH_U32_e32 hi), (uaddsat (V_FFBH_U32_e32 lo), 32))
9393 // (S_FF1_I32_B64 hi:lo) ->
9394 // ->(umin (uaddsat (V_FFBL_B32_e32 hi), 32) (V_FFBL_B32_e32 lo))
9395
9396 MachineBasicBlock &MBB = *Inst.getParent();
9397 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
9398 MachineBasicBlock::iterator MII = Inst;
9399 const DebugLoc &DL = Inst.getDebugLoc();
9400
9401 MachineOperand &Dest = Inst.getOperand(i: 0);
9402 MachineOperand &Src = Inst.getOperand(i: 1);
9403
9404 const MCInstrDesc &InstDesc = get(Opcode);
9405
9406 bool IsCtlz = Opcode == AMDGPU::V_FFBH_U32_e32;
9407 unsigned OpcodeAdd = ST.hasAddNoCarryInsts() ? AMDGPU::V_ADD_U32_e64
9408 : AMDGPU::V_ADD_CO_U32_e32;
9409
9410 const TargetRegisterClass *SrcRC =
9411 Src.isReg() ? MRI.getRegClass(Reg: Src.getReg()) : &AMDGPU::SGPR_32RegClass;
9412 const TargetRegisterClass *SrcSubRC =
9413 RI.getSubRegisterClass(SrcRC, AMDGPU::sub0);
9414
9415 MachineOperand SrcRegSub0 =
9416 buildExtractSubRegOrImm(MII, MRI, Op: Src, SuperRC: SrcRC, SubIdx: AMDGPU::sub0, SubRC: SrcSubRC);
9417 MachineOperand SrcRegSub1 =
9418 buildExtractSubRegOrImm(MII, MRI, Op: Src, SuperRC: SrcRC, SubIdx: AMDGPU::sub1, SubRC: SrcSubRC);
9419
9420 Register MidReg1 = MRI.createVirtualRegister(RegClass: &AMDGPU::VGPR_32RegClass);
9421 Register MidReg2 = MRI.createVirtualRegister(RegClass: &AMDGPU::VGPR_32RegClass);
9422 Register MidReg3 = MRI.createVirtualRegister(RegClass: &AMDGPU::VGPR_32RegClass);
9423 Register MidReg4 = MRI.createVirtualRegister(RegClass: &AMDGPU::VGPR_32RegClass);
9424
9425 BuildMI(BB&: MBB, I: MII, MIMD: DL, MCID: InstDesc, DestReg: MidReg1).add(MO: SrcRegSub0);
9426
9427 BuildMI(BB&: MBB, I: MII, MIMD: DL, MCID: InstDesc, DestReg: MidReg2).add(MO: SrcRegSub1);
9428
9429 BuildMI(BB&: MBB, I: MII, MIMD: DL, MCID: get(Opcode: OpcodeAdd), DestReg: MidReg3)
9430 .addReg(RegNo: IsCtlz ? MidReg1 : MidReg2)
9431 .addImm(Val: 32)
9432 .addImm(Val: 1); // enable clamp
9433
9434 BuildMI(BB&: MBB, I: MII, MIMD: DL, MCID: get(Opcode: AMDGPU::V_MIN_U32_e64), DestReg: MidReg4)
9435 .addReg(RegNo: MidReg3)
9436 .addReg(RegNo: IsCtlz ? MidReg2 : MidReg1);
9437
9438 MRI.replaceRegWith(FromReg: Dest.getReg(), ToReg: MidReg4);
9439
9440 addUsersToMoveToVALUWorklist(Reg: MidReg4, MRI, Worklist);
9441}
9442
9443void SIInstrInfo::addUsersToMoveToVALUWorklist(
9444 Register DstReg, MachineRegisterInfo &MRI,
9445 SIInstrWorklist &Worklist) const {
9446 for (MachineOperand &MO : make_early_inc_range(Range: MRI.use_operands(Reg: DstReg))) {
9447 MachineInstr &UseMI = *MO.getParent();
9448
9449 unsigned OpNo = 0;
9450
9451 switch (UseMI.getOpcode()) {
9452 case AMDGPU::COPY:
9453 case AMDGPU::WQM:
9454 case AMDGPU::SOFT_WQM:
9455 case AMDGPU::STRICT_WWM:
9456 case AMDGPU::STRICT_WQM:
9457 case AMDGPU::REG_SEQUENCE:
9458 case AMDGPU::PHI:
9459 case AMDGPU::INSERT_SUBREG:
9460 break;
9461 default:
9462 OpNo = MO.getOperandNo();
9463 break;
9464 }
9465
9466 const TargetRegisterClass *OpRC = getOpRegClass(MI: UseMI, OpNo);
9467 MRI.constrainRegClass(Reg: DstReg, RC: OpRC);
9468
9469 if (!RI.hasVectorRegisters(RC: OpRC))
9470 Worklist.insert(MI: &UseMI);
9471 else
9472 // Legalization could change user list.
9473 legalizeOperandsVALUt16(MI&: UseMI, OpIdx: OpNo, MRI);
9474 }
9475}
9476
9477void SIInstrInfo::movePackToVALU(SIInstrWorklist &Worklist,
9478 MachineRegisterInfo &MRI,
9479 MachineInstr &Inst) const {
9480 Register ResultReg = MRI.createVirtualRegister(RegClass: &AMDGPU::VGPR_32RegClass);
9481 MachineBasicBlock *MBB = Inst.getParent();
9482 MachineOperand &Src0 = Inst.getOperand(i: 1);
9483 MachineOperand &Src1 = Inst.getOperand(i: 2);
9484 const DebugLoc &DL = Inst.getDebugLoc();
9485
9486 if (ST.useRealTrue16Insts()) {
9487 Register SrcReg0, SrcReg1;
9488 if (!Src0.isReg() || !RI.isVGPR(MRI, Reg: Src0.getReg())) {
9489 SrcReg0 = MRI.createVirtualRegister(RegClass: &AMDGPU::VGPR_32RegClass);
9490 BuildMI(BB&: *MBB, I&: Inst, MIMD: DL,
9491 MCID: get(Opcode: Src0.isImm() ? AMDGPU::V_MOV_B32_e32 : AMDGPU::COPY), DestReg: SrcReg0)
9492 .add(MO: Src0);
9493 } else {
9494 SrcReg0 = Src0.getReg();
9495 }
9496
9497 if (!Src1.isReg() || !RI.isVGPR(MRI, Reg: Src1.getReg())) {
9498 SrcReg1 = MRI.createVirtualRegister(RegClass: &AMDGPU::VGPR_32RegClass);
9499 BuildMI(BB&: *MBB, I&: Inst, MIMD: DL,
9500 MCID: get(Opcode: Src1.isImm() ? AMDGPU::V_MOV_B32_e32 : AMDGPU::COPY), DestReg: SrcReg1)
9501 .add(MO: Src1);
9502 } else {
9503 SrcReg1 = Src1.getReg();
9504 }
9505
9506 bool isSrc0Reg16 = MRI.constrainRegClass(Reg: SrcReg0, RC: &AMDGPU::VGPR_16RegClass);
9507 bool isSrc1Reg16 = MRI.constrainRegClass(Reg: SrcReg1, RC: &AMDGPU::VGPR_16RegClass);
9508
9509 auto NewMI = BuildMI(BB&: *MBB, I&: Inst, MIMD: DL, MCID: get(Opcode: AMDGPU::REG_SEQUENCE), DestReg: ResultReg);
9510 switch (Inst.getOpcode()) {
9511 case AMDGPU::S_PACK_LL_B32_B16:
9512 NewMI
9513 .addReg(RegNo: SrcReg0, Flags: {},
9514 SubReg: isSrc0Reg16 ? AMDGPU::NoSubRegister : AMDGPU::lo16)
9515 .addImm(Val: AMDGPU::lo16)
9516 .addReg(RegNo: SrcReg1, Flags: {},
9517 SubReg: isSrc1Reg16 ? AMDGPU::NoSubRegister : AMDGPU::lo16)
9518 .addImm(Val: AMDGPU::hi16);
9519 break;
9520 case AMDGPU::S_PACK_LH_B32_B16:
9521 NewMI
9522 .addReg(RegNo: SrcReg0, Flags: {},
9523 SubReg: isSrc0Reg16 ? AMDGPU::NoSubRegister : AMDGPU::lo16)
9524 .addImm(Val: AMDGPU::lo16)
9525 .addReg(RegNo: SrcReg1, Flags: {}, SubReg: AMDGPU::hi16)
9526 .addImm(Val: AMDGPU::hi16);
9527 break;
9528 case AMDGPU::S_PACK_HL_B32_B16:
9529 NewMI.addReg(RegNo: SrcReg0, Flags: {}, SubReg: AMDGPU::hi16)
9530 .addImm(Val: AMDGPU::lo16)
9531 .addReg(RegNo: SrcReg1, Flags: {},
9532 SubReg: isSrc1Reg16 ? AMDGPU::NoSubRegister : AMDGPU::lo16)
9533 .addImm(Val: AMDGPU::hi16);
9534 break;
9535 case AMDGPU::S_PACK_HH_B32_B16:
9536 NewMI.addReg(RegNo: SrcReg0, Flags: {}, SubReg: AMDGPU::hi16)
9537 .addImm(Val: AMDGPU::lo16)
9538 .addReg(RegNo: SrcReg1, Flags: {}, SubReg: AMDGPU::hi16)
9539 .addImm(Val: AMDGPU::hi16);
9540 break;
9541 default:
9542 llvm_unreachable("unhandled s_pack_* instruction");
9543 }
9544
9545 MachineOperand &Dest = Inst.getOperand(i: 0);
9546 MRI.replaceRegWith(FromReg: Dest.getReg(), ToReg: ResultReg);
9547 addUsersToMoveToVALUWorklist(DstReg: ResultReg, MRI, Worklist);
9548 return;
9549 }
9550
9551 switch (Inst.getOpcode()) {
9552 case AMDGPU::S_PACK_LL_B32_B16: {
9553 Register ImmReg = MRI.createVirtualRegister(RegClass: &AMDGPU::VGPR_32RegClass);
9554 Register TmpReg = MRI.createVirtualRegister(RegClass: &AMDGPU::VGPR_32RegClass);
9555
9556 // FIXME: Can do a lot better if we know the high bits of src0 or src1 are
9557 // 0.
9558 BuildMI(BB&: *MBB, I&: Inst, MIMD: DL, MCID: get(Opcode: AMDGPU::V_MOV_B32_e32), DestReg: ImmReg)
9559 .addImm(Val: 0xffff);
9560
9561 BuildMI(BB&: *MBB, I&: Inst, MIMD: DL, MCID: get(Opcode: AMDGPU::V_AND_B32_e64), DestReg: TmpReg)
9562 .addReg(RegNo: ImmReg, Flags: RegState::Kill)
9563 .add(MO: Src0);
9564
9565 BuildMI(BB&: *MBB, I&: Inst, MIMD: DL, MCID: get(Opcode: AMDGPU::V_LSHL_OR_B32_e64), DestReg: ResultReg)
9566 .add(MO: Src1)
9567 .addImm(Val: 16)
9568 .addReg(RegNo: TmpReg, Flags: RegState::Kill);
9569 break;
9570 }
9571 case AMDGPU::S_PACK_LH_B32_B16: {
9572 Register ImmReg = MRI.createVirtualRegister(RegClass: &AMDGPU::VGPR_32RegClass);
9573 BuildMI(BB&: *MBB, I&: Inst, MIMD: DL, MCID: get(Opcode: AMDGPU::V_MOV_B32_e32), DestReg: ImmReg)
9574 .addImm(Val: 0xffff);
9575 BuildMI(BB&: *MBB, I&: Inst, MIMD: DL, MCID: get(Opcode: AMDGPU::V_BFI_B32_e64), DestReg: ResultReg)
9576 .addReg(RegNo: ImmReg, Flags: RegState::Kill)
9577 .add(MO: Src0)
9578 .add(MO: Src1);
9579 break;
9580 }
9581 case AMDGPU::S_PACK_HL_B32_B16: {
9582 Register TmpReg = MRI.createVirtualRegister(RegClass: &AMDGPU::VGPR_32RegClass);
9583 BuildMI(BB&: *MBB, I&: Inst, MIMD: DL, MCID: get(Opcode: AMDGPU::V_LSHRREV_B32_e64), DestReg: TmpReg)
9584 .addImm(Val: 16)
9585 .add(MO: Src0);
9586 BuildMI(BB&: *MBB, I&: Inst, MIMD: DL, MCID: get(Opcode: AMDGPU::V_LSHL_OR_B32_e64), DestReg: ResultReg)
9587 .add(MO: Src1)
9588 .addImm(Val: 16)
9589 .addReg(RegNo: TmpReg, Flags: RegState::Kill);
9590 break;
9591 }
9592 case AMDGPU::S_PACK_HH_B32_B16: {
9593 Register ImmReg = MRI.createVirtualRegister(RegClass: &AMDGPU::VGPR_32RegClass);
9594 Register TmpReg = MRI.createVirtualRegister(RegClass: &AMDGPU::VGPR_32RegClass);
9595 BuildMI(BB&: *MBB, I&: Inst, MIMD: DL, MCID: get(Opcode: AMDGPU::V_LSHRREV_B32_e64), DestReg: TmpReg)
9596 .addImm(Val: 16)
9597 .add(MO: Src0);
9598 BuildMI(BB&: *MBB, I&: Inst, MIMD: DL, MCID: get(Opcode: AMDGPU::V_MOV_B32_e32), DestReg: ImmReg)
9599 .addImm(Val: 0xffff0000);
9600 BuildMI(BB&: *MBB, I&: Inst, MIMD: DL, MCID: get(Opcode: AMDGPU::V_AND_OR_B32_e64), DestReg: ResultReg)
9601 .add(MO: Src1)
9602 .addReg(RegNo: ImmReg, Flags: RegState::Kill)
9603 .addReg(RegNo: TmpReg, Flags: RegState::Kill);
9604 break;
9605 }
9606 default:
9607 llvm_unreachable("unhandled s_pack_* instruction");
9608 }
9609
9610 MachineOperand &Dest = Inst.getOperand(i: 0);
9611 MRI.replaceRegWith(FromReg: Dest.getReg(), ToReg: ResultReg);
9612 addUsersToMoveToVALUWorklist(DstReg: ResultReg, MRI, Worklist);
9613}
9614
9615void SIInstrInfo::addSCCDefUsersToVALUWorklist(const MachineOperand &Op,
9616 MachineInstr &SCCDefInst,
9617 SIInstrWorklist &Worklist,
9618 Register NewCond) const {
9619
9620 // Ensure that def inst defines SCC, which is still live.
9621 assert(Op.isReg() && Op.getReg() == AMDGPU::SCC && Op.isDef() &&
9622 !Op.isDead() && Op.getParent() == &SCCDefInst);
9623 SmallVector<MachineInstr *, 4> CopyToDelete;
9624 // This assumes that all the users of SCC are in the same block
9625 // as the SCC def.
9626 for (MachineInstr &MI : // Skip the def inst itself.
9627 make_range(x: std::next(x: MachineBasicBlock::iterator(SCCDefInst)),
9628 y: SCCDefInst.getParent()->end())) {
9629 // Check if SCC is used first.
9630 int SCCIdx = MI.findRegisterUseOperandIdx(Reg: AMDGPU::SCC, TRI: &RI, isKill: false);
9631 if (SCCIdx != -1) {
9632 if (MI.isCopy()) {
9633 MachineRegisterInfo &MRI = MI.getMF()->getRegInfo();
9634 Register DestReg = MI.getOperand(i: 0).getReg();
9635
9636 MRI.replaceRegWith(FromReg: DestReg, ToReg: NewCond);
9637 CopyToDelete.push_back(Elt: &MI);
9638 } else {
9639
9640 if (NewCond.isValid())
9641 MI.getOperand(i: SCCIdx).setReg(NewCond);
9642
9643 Worklist.insert(MI: &MI);
9644 }
9645 }
9646 // Exit if we find another SCC def.
9647 if (MI.findRegisterDefOperandIdx(Reg: AMDGPU::SCC, TRI: &RI, isDead: false, Overlap: false) != -1)
9648 break;
9649 }
9650 for (auto &Copy : CopyToDelete)
9651 Copy->eraseFromParent();
9652}
9653
9654// Instructions that use SCC may be converted to VALU instructions. When that
9655// happens, the SCC register is changed to VCC_LO. The instruction that defines
9656// SCC must be changed to an instruction that defines VCC. This function makes
9657// sure that the instruction that defines SCC is added to the moveToVALU
9658// worklist.
9659void SIInstrInfo::addSCCDefsToVALUWorklist(MachineInstr *SCCUseInst,
9660 SIInstrWorklist &Worklist) const {
9661 // Look for a preceding instruction that either defines VCC or SCC. If VCC
9662 // then there is nothing to do because the defining instruction has been
9663 // converted to a VALU already. If SCC then that instruction needs to be
9664 // converted to a VALU.
9665 for (MachineInstr &MI :
9666 make_range(x: std::next(x: MachineBasicBlock::reverse_iterator(SCCUseInst)),
9667 y: SCCUseInst->getParent()->rend())) {
9668 if (MI.modifiesRegister(Reg: AMDGPU::VCC, TRI: &RI))
9669 break;
9670 if (MI.definesRegister(Reg: AMDGPU::SCC, TRI: &RI)) {
9671 Worklist.insert(MI: &MI);
9672 break;
9673 }
9674 }
9675}
9676
9677const TargetRegisterClass *SIInstrInfo::getDestEquivalentVGPRClass(
9678 const MachineInstr &Inst) const {
9679 const TargetRegisterClass *NewDstRC = getOpRegClass(MI: Inst, OpNo: 0);
9680
9681 switch (Inst.getOpcode()) {
9682 // For target instructions, getOpRegClass just returns the virtual register
9683 // class associated with the operand, so we need to find an equivalent VGPR
9684 // register class in order to move the instruction to the VALU.
9685 case AMDGPU::COPY:
9686 case AMDGPU::PHI:
9687 case AMDGPU::REG_SEQUENCE:
9688 case AMDGPU::INSERT_SUBREG:
9689 case AMDGPU::WQM:
9690 case AMDGPU::SOFT_WQM:
9691 case AMDGPU::STRICT_WWM:
9692 case AMDGPU::STRICT_WQM: {
9693 const TargetRegisterClass *SrcRC = getOpRegClass(MI: Inst, OpNo: 1);
9694 if (RI.isAGPRClass(RC: SrcRC)) {
9695 if (RI.isAGPRClass(RC: NewDstRC))
9696 return nullptr;
9697
9698 switch (Inst.getOpcode()) {
9699 case AMDGPU::PHI:
9700 case AMDGPU::REG_SEQUENCE:
9701 case AMDGPU::INSERT_SUBREG:
9702 NewDstRC = RI.getEquivalentAGPRClass(SRC: NewDstRC);
9703 break;
9704 default:
9705 NewDstRC = RI.getEquivalentVGPRClass(SRC: NewDstRC);
9706 }
9707
9708 if (!NewDstRC)
9709 return nullptr;
9710 } else {
9711 if (RI.isVGPRClass(RC: NewDstRC) || NewDstRC == &AMDGPU::VReg_1RegClass)
9712 return nullptr;
9713
9714 NewDstRC = RI.getEquivalentVGPRClass(SRC: NewDstRC);
9715 if (!NewDstRC)
9716 return nullptr;
9717 }
9718
9719 return NewDstRC;
9720 }
9721 default:
9722 return NewDstRC;
9723 }
9724}
9725
9726// Find the one SGPR operand we are allowed to use.
9727Register SIInstrInfo::findUsedSGPR(const MachineInstr &MI,
9728 int OpIndices[3]) const {
9729 const MCInstrDesc &Desc = MI.getDesc();
9730
9731 // Find the one SGPR operand we are allowed to use.
9732 //
9733 // First we need to consider the instruction's operand requirements before
9734 // legalizing. Some operands are required to be SGPRs, such as implicit uses
9735 // of VCC, but we are still bound by the constant bus requirement to only use
9736 // one.
9737 //
9738 // If the operand's class is an SGPR, we can never move it.
9739
9740 Register SGPRReg = findImplicitSGPRRead(MI);
9741 if (SGPRReg)
9742 return SGPRReg;
9743
9744 Register UsedSGPRs[3] = {Register()};
9745 const MachineRegisterInfo &MRI = MI.getMF()->getRegInfo();
9746
9747 for (unsigned i = 0; i < 3; ++i) {
9748 int Idx = OpIndices[i];
9749 if (Idx == -1)
9750 break;
9751
9752 const MachineOperand &MO = MI.getOperand(i: Idx);
9753 if (!MO.isReg())
9754 continue;
9755
9756 // Is this operand statically required to be an SGPR based on the operand
9757 // constraints?
9758 const TargetRegisterClass *OpRC =
9759 RI.getRegClass(i: getOpRegClassID(OpInfo: Desc.operands()[Idx]));
9760 bool IsRequiredSGPR = RI.isSGPRClass(RC: OpRC);
9761 if (IsRequiredSGPR)
9762 return MO.getReg();
9763
9764 // If this could be a VGPR or an SGPR, Check the dynamic register class.
9765 Register Reg = MO.getReg();
9766 const TargetRegisterClass *RegRC = MRI.getRegClass(Reg);
9767 if (RI.isSGPRClass(RC: RegRC))
9768 UsedSGPRs[i] = Reg;
9769 }
9770
9771 // We don't have a required SGPR operand, so we have a bit more freedom in
9772 // selecting operands to move.
9773
9774 // Try to select the most used SGPR. If an SGPR is equal to one of the
9775 // others, we choose that.
9776 //
9777 // e.g.
9778 // V_FMA_F32 v0, s0, s0, s0 -> No moves
9779 // V_FMA_F32 v0, s0, s1, s0 -> Move s1
9780
9781 // TODO: If some of the operands are 64-bit SGPRs and some 32, we should
9782 // prefer those.
9783
9784 if (UsedSGPRs[0]) {
9785 if (UsedSGPRs[0] == UsedSGPRs[1] || UsedSGPRs[0] == UsedSGPRs[2])
9786 SGPRReg = UsedSGPRs[0];
9787 }
9788
9789 if (!SGPRReg && UsedSGPRs[1]) {
9790 if (UsedSGPRs[1] == UsedSGPRs[2])
9791 SGPRReg = UsedSGPRs[1];
9792 }
9793
9794 return SGPRReg;
9795}
9796
9797MachineOperand *SIInstrInfo::getNamedOperand(MachineInstr &MI,
9798 AMDGPU::OpName OperandName) const {
9799 if (OperandName == AMDGPU::OpName::NUM_OPERAND_NAMES)
9800 return nullptr;
9801
9802 int Idx = AMDGPU::getNamedOperandIdx(Opcode: MI.getOpcode(), Name: OperandName);
9803 if (Idx == -1)
9804 return nullptr;
9805
9806 return &MI.getOperand(i: Idx);
9807}
9808
9809uint64_t SIInstrInfo::getDefaultRsrcDataFormat() const {
9810 if (ST.getGeneration() >= AMDGPUSubtarget::GFX10) {
9811 int64_t Format = ST.getGeneration() >= AMDGPUSubtarget::GFX11
9812 ? (int64_t)AMDGPU::UfmtGFX11::UFMT_32_FLOAT
9813 : (int64_t)AMDGPU::UfmtGFX10::UFMT_32_FLOAT;
9814 return (Format << 44) |
9815 (1ULL << 56) | // RESOURCE_LEVEL = 1
9816 (3ULL << 60); // OOB_SELECT = 3
9817 }
9818
9819 uint64_t RsrcDataFormat = AMDGPU::RSRC_DATA_FORMAT;
9820 if (ST.isAmdHsaOS()) {
9821 // Set ATC = 1. GFX9 doesn't have this bit.
9822 if (ST.getGeneration() <= AMDGPUSubtarget::VOLCANIC_ISLANDS)
9823 RsrcDataFormat |= (1ULL << 56);
9824
9825 // Set MTYPE = 2 (MTYPE_UC = uncached). GFX9 doesn't have this.
9826 // BTW, it disables TC L2 and therefore decreases performance.
9827 if (ST.getGeneration() == AMDGPUSubtarget::VOLCANIC_ISLANDS)
9828 RsrcDataFormat |= (2ULL << 59);
9829 }
9830
9831 return RsrcDataFormat;
9832}
9833
9834uint64_t SIInstrInfo::getScratchRsrcWords23() const {
9835 uint64_t Rsrc23 = getDefaultRsrcDataFormat() |
9836 AMDGPU::RSRC_TID_ENABLE |
9837 0xffffffff; // Size;
9838
9839 // GFX9 doesn't have ELEMENT_SIZE.
9840 if (ST.getGeneration() <= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
9841 uint64_t EltSizeValue = Log2_32(Value: ST.getMaxPrivateElementSize(ForBufferRSrc: true)) - 1;
9842 Rsrc23 |= EltSizeValue << AMDGPU::RSRC_ELEMENT_SIZE_SHIFT;
9843 }
9844
9845 // IndexStride = 64 / 32.
9846 uint64_t IndexStride = ST.isWave64() ? 3 : 2;
9847 Rsrc23 |= IndexStride << AMDGPU::RSRC_INDEX_STRIDE_SHIFT;
9848
9849 // If TID_ENABLE is set, DATA_FORMAT specifies stride bits [14:17].
9850 // Clear them unless we want a huge stride.
9851 if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS &&
9852 ST.getGeneration() <= AMDGPUSubtarget::GFX9)
9853 Rsrc23 &= ~AMDGPU::RSRC_DATA_FORMAT;
9854
9855 return Rsrc23;
9856}
9857
9858bool SIInstrInfo::isLowLatencyInstruction(const MachineInstr &MI) const {
9859 unsigned Opc = MI.getOpcode();
9860
9861 return isSMRD(Opcode: Opc);
9862}
9863
9864bool SIInstrInfo::isHighLatencyDef(int Opc) const {
9865 return get(Opcode: Opc).mayLoad() &&
9866 (isMUBUF(Opcode: Opc) || isMTBUF(Opcode: Opc) || isMIMG(Opcode: Opc) || isFLAT(Opcode: Opc));
9867}
9868
9869Register SIInstrInfo::isStackAccess(const MachineInstr &MI, int &FrameIndex,
9870 TypeSize &MemBytes) const {
9871 const MachineOperand *Addr = getNamedOperand(MI, OperandName: AMDGPU::OpName::vaddr);
9872 if (!Addr || !Addr->isFI())
9873 return Register();
9874
9875 assert(!MI.memoperands_empty() &&
9876 (*MI.memoperands_begin())->getAddrSpace() == AMDGPUAS::PRIVATE_ADDRESS);
9877
9878 FrameIndex = Addr->getIndex();
9879
9880 int VDataIdx =
9881 AMDGPU::getNamedOperandIdx(Opcode: MI.getOpcode(), Name: AMDGPU::OpName::vdata);
9882 MemBytes = TypeSize::getFixed(ExactSize: getOpSize(Opcode: MI.getOpcode(), OpNo: VDataIdx));
9883 return MI.getOperand(i: VDataIdx).getReg();
9884}
9885
9886Register SIInstrInfo::isSGPRStackAccess(const MachineInstr &MI, int &FrameIndex,
9887 TypeSize &MemBytes) const {
9888 const MachineOperand *Addr = getNamedOperand(MI, OperandName: AMDGPU::OpName::addr);
9889 assert(Addr && Addr->isFI());
9890 FrameIndex = Addr->getIndex();
9891
9892 int DataIdx =
9893 AMDGPU::getNamedOperandIdx(Opcode: MI.getOpcode(), Name: AMDGPU::OpName::data);
9894 MemBytes = TypeSize::getFixed(ExactSize: getOpSize(Opcode: MI.getOpcode(), OpNo: DataIdx));
9895 return MI.getOperand(i: DataIdx).getReg();
9896}
9897
9898Register SIInstrInfo::isLoadFromStackSlot(const MachineInstr &MI,
9899 int &FrameIndex,
9900 TypeSize &MemBytes) const {
9901 if (!MI.mayLoad())
9902 return Register();
9903
9904 if (isMUBUF(MI) || isVGPRSpill(MI))
9905 return isStackAccess(MI, FrameIndex, MemBytes);
9906
9907 if (isSGPRSpill(MI))
9908 return isSGPRStackAccess(MI, FrameIndex, MemBytes);
9909
9910 return Register();
9911}
9912
9913Register SIInstrInfo::isStoreToStackSlot(const MachineInstr &MI,
9914 int &FrameIndex,
9915 TypeSize &MemBytes) const {
9916 if (!MI.mayStore())
9917 return Register();
9918
9919 if (isMUBUF(MI) || isVGPRSpill(MI))
9920 return isStackAccess(MI, FrameIndex, MemBytes);
9921
9922 if (isSGPRSpill(MI))
9923 return isSGPRStackAccess(MI, FrameIndex, MemBytes);
9924
9925 return Register();
9926}
9927
9928unsigned SIInstrInfo::getInstSizeInBytes(const MachineInstr &MI) const {
9929 unsigned Opc = MI.getOpcode();
9930 const MCInstrDesc &Desc = getMCOpcodeFromPseudo(Opcode: Opc);
9931 unsigned DescSize = Desc.getSize();
9932
9933 // If we have a definitive size, we can use it. Otherwise we need to inspect
9934 // the operands to know the size.
9935 if (isFixedSize(MI)) {
9936 unsigned Size = DescSize;
9937
9938 // If we hit the buggy offset, an extra nop will be inserted in MC so
9939 // estimate the worst case.
9940 if (MI.isBranch() && ST.hasOffset3fBug())
9941 Size += 4;
9942
9943 return Size;
9944 }
9945
9946 // Instructions may have a 32-bit literal encoded after them. Check
9947 // operands that could ever be literals.
9948 if (isVALU(MI, /*AllowLDSDMA=*/true) || isSALU(MI)) {
9949 if (isDPP(MI))
9950 return DescSize;
9951 bool HasLiteral = false;
9952 unsigned LiteralSize = 4;
9953 for (int I = 0, E = MI.getNumExplicitOperands(); I != E; ++I) {
9954 const MachineOperand &Op = MI.getOperand(i: I);
9955 const MCOperandInfo &OpInfo = Desc.operands()[I];
9956 if (!Op.isReg() && !isInlineConstant(MO: Op, OpInfo)) {
9957 HasLiteral = true;
9958 if (ST.has64BitLiterals()) {
9959 switch (OpInfo.OperandType) {
9960 default:
9961 break;
9962 case AMDGPU::OPERAND_REG_IMM_FP64:
9963 case AMDGPU::OPERAND_REG_IMM_V2FP64:
9964 if (!AMDGPU::isValid32BitLiteral(Val: Op.getImm(), IsFP64: true))
9965 LiteralSize = 8;
9966 break;
9967 case AMDGPU::OPERAND_REG_IMM_INT64:
9968 case AMDGPU::OPERAND_REG_IMM_V2INT64:
9969 // A 32-bit literal is only valid when the value fits in BOTH signed
9970 // and unsigned 32-bit ranges [0, 2^31-1], matching the MC code
9971 // emitter's getLit64Encoding logic. This is because of the lack of
9972 // abilility to tell signedness of the literal, therefore we need to
9973 // be conservative and assume values outside this range require a
9974 // 64-bit literal encoding (8 bytes).
9975 if (!Op.isImm() || !isInt<32>(x: Op.getImm()) ||
9976 !isUInt<32>(x: Op.getImm()))
9977 LiteralSize = 8;
9978 break;
9979 }
9980 }
9981 break;
9982 }
9983 }
9984 return HasLiteral ? DescSize + LiteralSize : DescSize;
9985 }
9986
9987 // Check whether we have extra NSA words.
9988 if (isMIMG(MI)) {
9989 int VAddr0Idx = AMDGPU::getNamedOperandIdx(Opcode: Opc, Name: AMDGPU::OpName::vaddr0);
9990 if (VAddr0Idx < 0)
9991 return 8;
9992
9993 int RSrcIdx = AMDGPU::getNamedOperandIdx(Opcode: Opc, Name: AMDGPU::OpName::srsrc);
9994 return 8 + 4 * ((RSrcIdx - VAddr0Idx + 2) / 4);
9995 }
9996
9997 switch (Opc) {
9998 case TargetOpcode::BUNDLE:
9999 return getInstBundleSize(MI);
10000 case TargetOpcode::INLINEASM:
10001 case TargetOpcode::INLINEASM_BR: {
10002 const MachineFunction *MF = MI.getMF();
10003 const char *AsmStr = MI.getOperand(i: 0).getSymbolName();
10004 return getInlineAsmLength(Str: AsmStr, MAI: MF->getTarget().getMCAsmInfo(), STI: &ST);
10005 }
10006 default:
10007 if (MI.isMetaInstruction())
10008 return 0;
10009
10010 // If D16 Pseudo inst, get correct MC code size
10011 const auto *D16Info = AMDGPU::getT16D16Helper(T16Op: Opc);
10012 if (D16Info) {
10013 // Assume d16_lo/hi inst are always in same size
10014 unsigned LoInstOpcode = D16Info->LoOp;
10015 const MCInstrDesc &Desc = getMCOpcodeFromPseudo(Opcode: LoInstOpcode);
10016 DescSize = Desc.getSize();
10017 }
10018
10019 // If FMA Pseudo inst, get correct MC code size
10020 if (Opc == AMDGPU::V_FMA_MIX_F16_t16 || Opc == AMDGPU::V_FMA_MIX_BF16_t16) {
10021 // All potential lowerings are the same size; arbitrarily pick one.
10022 const MCInstrDesc &Desc = getMCOpcodeFromPseudo(Opcode: AMDGPU::V_FMA_MIXLO_F16);
10023 DescSize = Desc.getSize();
10024 }
10025
10026 return DescSize;
10027 }
10028}
10029
10030TargetInstrInfo::InstSizeVerifyMode
10031SIInstrInfo::getInstSizeVerifyMode(const MachineInstr &MI) const {
10032 if (MI.isBranch() && ST.hasOffset3fBug())
10033 return InstSizeVerifyMode::NoVerify;
10034 return InstSizeVerifyMode::ExactSize;
10035}
10036
10037bool SIInstrInfo::mayAccessFlatAddressSpace(const MachineInstr &MI) const {
10038 if (!isFLAT(MI))
10039 return false;
10040
10041 if (MI.memoperands_empty())
10042 return true;
10043
10044 for (const MachineMemOperand *MMO : MI.memoperands()) {
10045 if (MMO->getAddrSpace() == AMDGPUAS::FLAT_ADDRESS)
10046 return true;
10047 }
10048 return false;
10049}
10050
10051ArrayRef<std::pair<int, const char *>>
10052SIInstrInfo::getSerializableTargetIndices() const {
10053 static const std::pair<int, const char *> TargetIndices[] = {
10054 {AMDGPU::TI_CONSTDATA_START, "amdgpu-constdata-start"},
10055 {AMDGPU::TI_SCRATCH_RSRC_DWORD0, "amdgpu-scratch-rsrc-dword0"},
10056 {AMDGPU::TI_SCRATCH_RSRC_DWORD1, "amdgpu-scratch-rsrc-dword1"},
10057 {AMDGPU::TI_SCRATCH_RSRC_DWORD2, "amdgpu-scratch-rsrc-dword2"},
10058 {AMDGPU::TI_SCRATCH_RSRC_DWORD3, "amdgpu-scratch-rsrc-dword3"}};
10059 return ArrayRef(TargetIndices);
10060}
10061
10062/// This is used by the post-RA scheduler (SchedulePostRAList.cpp). The
10063/// post-RA version of misched uses CreateTargetMIHazardRecognizer.
10064ScheduleHazardRecognizer *
10065SIInstrInfo::CreateTargetPostRAHazardRecognizer(const InstrItineraryData *II,
10066 const ScheduleDAG *DAG) const {
10067 return new GCNHazardRecognizer(DAG->MF);
10068}
10069
10070/// This is the hazard recognizer used at -O0 by the PostRAHazardRecognizer
10071/// pass.
10072ScheduleHazardRecognizer *
10073SIInstrInfo::CreateTargetPostRAHazardRecognizer(const MachineFunction &MF,
10074 MachineLoopInfo *MLI) const {
10075 return new GCNHazardRecognizer(MF, MLI);
10076}
10077
10078// Called during:
10079// - pre-RA scheduling and post-RA scheduling
10080ScheduleHazardRecognizer *
10081SIInstrInfo::CreateTargetMIHazardRecognizer(const InstrItineraryData *II,
10082 const ScheduleDAGMI *DAG) const {
10083 // Borrowed from Arm Target
10084 // We would like to restrict this hazard recognizer to only
10085 // post-RA scheduling; we can tell that we're post-RA because we don't
10086 // track VRegLiveness.
10087 if (!DAG->hasVRegLiveness())
10088 return new GCNHazardRecognizer(DAG->MF);
10089 return TargetInstrInfo::CreateTargetMIHazardRecognizer(II, DAG);
10090}
10091
10092std::pair<unsigned, unsigned>
10093SIInstrInfo::decomposeMachineOperandsTargetFlags(unsigned TF) const {
10094 return std::pair(TF & MO_MASK, TF & ~MO_MASK);
10095}
10096
10097ArrayRef<std::pair<unsigned, const char *>>
10098SIInstrInfo::getSerializableDirectMachineOperandTargetFlags() const {
10099 static const std::pair<unsigned, const char *> TargetFlags[] = {
10100 {MO_GOTPCREL, "amdgpu-gotprel"},
10101 {MO_GOTPCREL32_LO, "amdgpu-gotprel32-lo"},
10102 {MO_GOTPCREL32_HI, "amdgpu-gotprel32-hi"},
10103 {MO_GOTPCREL64, "amdgpu-gotprel64"},
10104 {MO_REL32_LO, "amdgpu-rel32-lo"},
10105 {MO_REL32_HI, "amdgpu-rel32-hi"},
10106 {MO_REL64, "amdgpu-rel64"},
10107 {MO_ABS32_LO, "amdgpu-abs32-lo"},
10108 {MO_ABS32_HI, "amdgpu-abs32-hi"},
10109 {MO_ABS64, "amdgpu-abs64"},
10110 };
10111
10112 return ArrayRef(TargetFlags);
10113}
10114
10115ArrayRef<std::pair<MachineMemOperand::Flags, const char *>>
10116SIInstrInfo::getSerializableMachineMemOperandTargetFlags() const {
10117 static const std::pair<MachineMemOperand::Flags, const char *> TargetFlags[] =
10118 {
10119 {MONoClobber, "amdgpu-noclobber"},
10120 {MOLastUse, "amdgpu-last-use"},
10121 {MOCooperative, "amdgpu-cooperative"},
10122 {MOThreadPrivate, "amdgpu-thread-private"},
10123 };
10124
10125 return ArrayRef(TargetFlags);
10126}
10127
10128unsigned SIInstrInfo::getLiveRangeSplitOpcode(Register SrcReg,
10129 const MachineFunction &MF) const {
10130 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
10131 assert(SrcReg.isVirtual());
10132 if (MFI->checkFlag(Reg: SrcReg, Flag: AMDGPU::VirtRegFlag::WWM_REG))
10133 return AMDGPU::WWM_COPY;
10134
10135 return AMDGPU::COPY;
10136}
10137
10138bool SIInstrInfo::canAddToBBProlog(const MachineInstr &MI) const {
10139 uint32_t Opcode = MI.getOpcode();
10140 // Check if it is SGPR spill or wwm-register spill Opcode.
10141 if (isSGPRSpill(Opcode) || isWWMRegSpillOpcode(Opcode))
10142 return true;
10143
10144 const MachineFunction *MF = MI.getMF();
10145 const MachineRegisterInfo &MRI = MF->getRegInfo();
10146 const SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
10147
10148 // See if this is Liverange split instruction inserted for SGPR or
10149 // wwm-register. The implicit def inserted for wwm-registers should also be
10150 // included as they can appear at the bb begin.
10151 bool IsLRSplitInst = MI.getFlag(Flag: MachineInstr::LRSplit);
10152 if (!IsLRSplitInst && Opcode != AMDGPU::IMPLICIT_DEF)
10153 return false;
10154
10155 Register Reg = MI.getOperand(i: 0).getReg();
10156 if (RI.isSGPRClass(RC: RI.getRegClassForReg(MRI, Reg)))
10157 return IsLRSplitInst;
10158
10159 return MFI->isWWMReg(Reg);
10160}
10161
10162bool SIInstrInfo::isBasicBlockPrologue(const MachineInstr &MI,
10163 Register Reg) const {
10164 // We need to handle instructions which may be inserted during register
10165 // allocation to handle the prolog. The initial prolog instruction may have
10166 // been separated from the start of the block by spills and copies inserted
10167 // needed by the prolog. However, the insertions for scalar registers can
10168 // always be placed at the BB top as they are independent of the exec mask
10169 // value.
10170 bool IsNullOrVectorRegister = true;
10171 if (Reg) {
10172 const MachineFunction *MF = MI.getMF();
10173 const MachineRegisterInfo &MRI = MF->getRegInfo();
10174 IsNullOrVectorRegister = !RI.isSGPRClass(RC: RI.getRegClassForReg(MRI, Reg));
10175 }
10176
10177 return IsNullOrVectorRegister &&
10178 (canAddToBBProlog(MI) ||
10179 (!MI.isTerminator() && MI.getOpcode() != AMDGPU::COPY &&
10180 MI.modifiesRegister(Reg: AMDGPU::EXEC, TRI: &RI)));
10181}
10182
10183MachineInstrBuilder
10184SIInstrInfo::getAddNoCarry(MachineBasicBlock &MBB,
10185 MachineBasicBlock::iterator I,
10186 const DebugLoc &DL,
10187 Register DestReg) const {
10188 if (ST.hasAddNoCarryInsts())
10189 return BuildMI(BB&: MBB, I, MIMD: DL, MCID: get(Opcode: AMDGPU::V_ADD_U32_e64), DestReg);
10190
10191 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
10192 Register UnusedCarry = MRI.createVirtualRegister(RegClass: RI.getBoolRC());
10193 MRI.setRegAllocationHint(VReg: UnusedCarry, Type: 0, PrefReg: RI.getVCC());
10194
10195 return BuildMI(BB&: MBB, I, MIMD: DL, MCID: get(Opcode: AMDGPU::V_ADD_CO_U32_e64), DestReg)
10196 .addReg(RegNo: UnusedCarry, Flags: RegState::Define | RegState::Dead);
10197}
10198
10199MachineInstrBuilder SIInstrInfo::getAddNoCarry(MachineBasicBlock &MBB,
10200 MachineBasicBlock::iterator I,
10201 const DebugLoc &DL,
10202 Register DestReg,
10203 RegScavenger &RS) const {
10204 if (ST.hasAddNoCarryInsts())
10205 return BuildMI(BB&: MBB, I, MIMD: DL, MCID: get(Opcode: AMDGPU::V_ADD_U32_e32), DestReg);
10206
10207 // If available, prefer to use vcc.
10208 Register UnusedCarry = !RS.isRegUsed(Reg: AMDGPU::VCC)
10209 ? Register(RI.getVCC())
10210 : RS.scavengeRegisterBackwards(
10211 RC: *RI.getBoolRC(), To: I, /* RestoreAfter */ false,
10212 SPAdj: 0, /* AllowSpill */ false);
10213
10214 // TODO: Users need to deal with this.
10215 if (!UnusedCarry.isValid())
10216 return MachineInstrBuilder();
10217
10218 return BuildMI(BB&: MBB, I, MIMD: DL, MCID: get(Opcode: AMDGPU::V_ADD_CO_U32_e64), DestReg)
10219 .addReg(RegNo: UnusedCarry, Flags: RegState::Define | RegState::Dead);
10220}
10221
10222bool SIInstrInfo::isKillTerminator(unsigned Opcode) {
10223 switch (Opcode) {
10224 case AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR:
10225 case AMDGPU::SI_KILL_I1_TERMINATOR:
10226 return true;
10227 default:
10228 return false;
10229 }
10230}
10231
10232const MCInstrDesc &SIInstrInfo::getKillTerminatorFromPseudo(unsigned Opcode) const {
10233 switch (Opcode) {
10234 case AMDGPU::SI_KILL_F32_COND_IMM_PSEUDO:
10235 return get(Opcode: AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR);
10236 case AMDGPU::SI_KILL_I1_PSEUDO:
10237 return get(Opcode: AMDGPU::SI_KILL_I1_TERMINATOR);
10238 default:
10239 llvm_unreachable("invalid opcode, expected SI_KILL_*_PSEUDO");
10240 }
10241}
10242
10243bool SIInstrInfo::isLegalMUBUFImmOffset(unsigned Imm) const {
10244 return Imm <= getMaxMUBUFImmOffset(ST);
10245}
10246
10247unsigned SIInstrInfo::getMaxMUBUFImmOffset(const GCNSubtarget &ST) {
10248 // GFX12 field is non-negative 24-bit signed byte offset.
10249 const unsigned OffsetBits =
10250 ST.getGeneration() >= AMDGPUSubtarget::GFX12 ? 23 : 12;
10251 return (1 << OffsetBits) - 1;
10252}
10253
10254void SIInstrInfo::fixImplicitOperands(MachineInstr &MI) const {
10255 if (!ST.isWave32())
10256 return;
10257
10258 if (MI.isInlineAsm())
10259 return;
10260
10261 if (MI.getNumOperands() < MI.getNumExplicitOperands())
10262 return;
10263
10264 for (auto &Op : MI.implicit_operands()) {
10265 if (Op.isReg() && Op.getReg() == AMDGPU::VCC)
10266 Op.setReg(AMDGPU::VCC_LO);
10267 }
10268}
10269
10270bool SIInstrInfo::isBufferSMRD(const MachineInstr &MI) const {
10271 if (!isSMRD(MI))
10272 return false;
10273
10274 // Check that it is using a buffer resource.
10275 int Idx = AMDGPU::getNamedOperandIdx(Opcode: MI.getOpcode(), Name: AMDGPU::OpName::sbase);
10276 if (Idx == -1) // e.g. s_memtime
10277 return false;
10278
10279 const int16_t RCID = getOpRegClassID(OpInfo: MI.getDesc().operands()[Idx]);
10280 return RI.getRegClass(i: RCID)->hasSubClassEq(RC: &AMDGPU::SGPR_128RegClass);
10281}
10282
10283// Given Imm, split it into the values to put into the SOffset and ImmOffset
10284// fields in an MUBUF instruction. Return false if it is not possible (due to a
10285// hardware bug needing a workaround).
10286//
10287// The required alignment ensures that individual address components remain
10288// aligned if they are aligned to begin with. It also ensures that additional
10289// offsets within the given alignment can be added to the resulting ImmOffset.
10290bool SIInstrInfo::splitMUBUFOffset(uint32_t Imm, uint32_t &SOffset,
10291 uint32_t &ImmOffset, Align Alignment) const {
10292 const uint32_t MaxOffset = SIInstrInfo::getMaxMUBUFImmOffset(ST);
10293 const uint32_t MaxImm = alignDown(Value: MaxOffset, Align: Alignment.value());
10294 uint32_t Overflow = 0;
10295
10296 if (Imm > MaxImm) {
10297 if (Imm <= MaxImm + 64) {
10298 // Use an SOffset inline constant for 4..64
10299 Overflow = Imm - MaxImm;
10300 Imm = MaxImm;
10301 } else {
10302 // Try to keep the same value in SOffset for adjacent loads, so that
10303 // the corresponding register contents can be re-used.
10304 //
10305 // Load values with all low-bits (except for alignment bits) set into
10306 // SOffset, so that a larger range of values can be covered using
10307 // s_movk_i32.
10308 //
10309 // Atomic operations fail to work correctly when individual address
10310 // components are unaligned, even if their sum is aligned.
10311 uint32_t High = (Imm + Alignment.value()) & ~MaxOffset;
10312 uint32_t Low = (Imm + Alignment.value()) & MaxOffset;
10313 Imm = Low;
10314 Overflow = High - Alignment.value();
10315 }
10316 }
10317
10318 if (Overflow > 0) {
10319 // There is a hardware bug in SI and CI which prevents address clamping in
10320 // MUBUF instructions from working correctly with SOffsets. The immediate
10321 // offset is unaffected.
10322 if (ST.getGeneration() <= AMDGPUSubtarget::SEA_ISLANDS)
10323 return false;
10324
10325 // It is not possible to set immediate in SOffset field on some targets.
10326 if (ST.hasRestrictedSOffset())
10327 return false;
10328 }
10329
10330 ImmOffset = Imm;
10331 SOffset = Overflow;
10332 return true;
10333}
10334
10335// Depending on the used address space and instructions, some immediate offsets
10336// are allowed and some are not.
10337// Pre-GFX12, flat instruction offsets can only be non-negative, global and
10338// scratch instruction offsets can also be negative. On GFX12, offsets can be
10339// negative for all variants.
10340//
10341// There are several bugs related to these offsets:
10342// On gfx10.1, flat instructions that go into the global address space cannot
10343// use an offset.
10344//
10345// For scratch instructions, the address can be either an SGPR or a VGPR.
10346// The following offsets can be used, depending on the architecture (x means
10347// cannot be used):
10348// +----------------------------+------+------+
10349// | Address-Mode | SGPR | VGPR |
10350// +----------------------------+------+------+
10351// | gfx9 | | |
10352// | negative, 4-aligned offset | x | ok |
10353// | negative, unaligned offset | x | ok |
10354// +----------------------------+------+------+
10355// | gfx10 | | |
10356// | negative, 4-aligned offset | ok | ok |
10357// | negative, unaligned offset | ok | x |
10358// +----------------------------+------+------+
10359// | gfx10.3 | | |
10360// | negative, 4-aligned offset | ok | ok |
10361// | negative, unaligned offset | ok | ok |
10362// +----------------------------+------+------+
10363//
10364// This function ignores the addressing mode, so if an offset cannot be used in
10365// one addressing mode, it is considered illegal.
10366bool SIInstrInfo::isLegalFLATOffset(int64_t Offset, unsigned AddrSpace,
10367 AMDGPU::FlatAddrSpace FlatVariant) const {
10368 // TODO: Should 0 be special cased?
10369 if (!ST.hasFlatInstOffsets())
10370 return false;
10371
10372 using AMDGPU::FlatAddrSpace;
10373 if (ST.hasFlatSegmentOffsetBug() && FlatVariant == FlatAddrSpace::FLAT &&
10374 (AddrSpace == AMDGPUAS::FLAT_ADDRESS ||
10375 AddrSpace == AMDGPUAS::GLOBAL_ADDRESS))
10376 return false;
10377
10378 if (ST.hasNegativeUnalignedScratchOffsetBug() &&
10379 FlatVariant == FlatAddrSpace::FlatScratch && Offset < 0 &&
10380 (Offset % 4) != 0) {
10381 return false;
10382 }
10383
10384 bool AllowNegative = allowNegativeFlatOffset(FlatVariant);
10385 unsigned N = AMDGPU::getNumFlatOffsetBits(ST);
10386 return isIntN(N, x: Offset) && (AllowNegative || Offset >= 0);
10387}
10388
10389// See comment on SIInstrInfo::isLegalFLATOffset for what is legal and what not.
10390std::pair<int64_t, int64_t>
10391SIInstrInfo::splitFlatOffset(int64_t COffsetVal, unsigned AddrSpace,
10392 AMDGPU::FlatAddrSpace FlatVariant) const {
10393 int64_t RemainderOffset = COffsetVal;
10394 int64_t ImmField = 0;
10395
10396 bool AllowNegative = allowNegativeFlatOffset(FlatVariant);
10397 const unsigned NumBits = AMDGPU::getNumFlatOffsetBits(ST) - 1;
10398
10399 if (AllowNegative) {
10400 // Use signed division by a power of two to truncate towards 0.
10401 int64_t D = 1LL << NumBits;
10402 RemainderOffset = (COffsetVal / D) * D;
10403 ImmField = COffsetVal - RemainderOffset;
10404
10405 if (ST.hasNegativeUnalignedScratchOffsetBug() &&
10406 FlatVariant == AMDGPU::FlatAddrSpace::FlatScratch && ImmField < 0 &&
10407 (ImmField % 4) != 0) {
10408 // Make ImmField a multiple of 4
10409 RemainderOffset += ImmField % 4;
10410 ImmField -= ImmField % 4;
10411 }
10412 } else if (COffsetVal >= 0) {
10413 ImmField = COffsetVal & maskTrailingOnes<uint64_t>(N: NumBits);
10414 RemainderOffset = COffsetVal - ImmField;
10415 }
10416
10417 assert(isLegalFLATOffset(ImmField, AddrSpace, FlatVariant));
10418 assert(RemainderOffset + ImmField == COffsetVal);
10419 return {ImmField, RemainderOffset};
10420}
10421
10422bool SIInstrInfo::allowNegativeFlatOffset(
10423 AMDGPU::FlatAddrSpace FlatVariant) const {
10424 if (ST.hasNegativeScratchOffsetBug() &&
10425 FlatVariant == AMDGPU::FlatAddrSpace::FlatScratch)
10426 return false;
10427
10428 return FlatVariant != AMDGPU::FlatAddrSpace::FLAT || AMDGPU::isGFX12Plus(STI: ST);
10429}
10430
10431static unsigned subtargetEncodingFamily(const GCNSubtarget &ST) {
10432 switch (ST.getGeneration()) {
10433 default:
10434 break;
10435 case AMDGPUSubtarget::SOUTHERN_ISLANDS:
10436 case AMDGPUSubtarget::SEA_ISLANDS:
10437 return SIEncodingFamily::SI;
10438 case AMDGPUSubtarget::VOLCANIC_ISLANDS:
10439 case AMDGPUSubtarget::GFX9:
10440 return SIEncodingFamily::VI;
10441 case AMDGPUSubtarget::GFX10:
10442 return SIEncodingFamily::GFX10;
10443 case AMDGPUSubtarget::GFX11:
10444 return ST.hasGFX11_7Insts() ? SIEncodingFamily::GFX1170
10445 : SIEncodingFamily::GFX11;
10446 case AMDGPUSubtarget::GFX12:
10447 return ST.hasGFX1250Insts() ? SIEncodingFamily::GFX1250
10448 : SIEncodingFamily::GFX12;
10449 case AMDGPUSubtarget::GFX13:
10450 return SIEncodingFamily::GFX13;
10451 }
10452 llvm_unreachable("Unknown subtarget generation!");
10453}
10454
10455bool SIInstrInfo::isAsmOnlyOpcode(int MCOp) const {
10456 switch(MCOp) {
10457 // These opcodes use indirect register addressing so
10458 // they need special handling by codegen (currently missing).
10459 // Therefore it is too risky to allow these opcodes
10460 // to be selected by dpp combiner or sdwa peepholer.
10461 case AMDGPU::V_MOVRELS_B32_dpp_gfx10:
10462 case AMDGPU::V_MOVRELS_B32_sdwa_gfx10:
10463 case AMDGPU::V_MOVRELD_B32_dpp_gfx10:
10464 case AMDGPU::V_MOVRELD_B32_sdwa_gfx10:
10465 case AMDGPU::V_MOVRELSD_B32_dpp_gfx10:
10466 case AMDGPU::V_MOVRELSD_B32_sdwa_gfx10:
10467 case AMDGPU::V_MOVRELSD_2_B32_dpp_gfx10:
10468 case AMDGPU::V_MOVRELSD_2_B32_sdwa_gfx10:
10469 return true;
10470 default:
10471 return false;
10472 }
10473}
10474
10475#define GENERATE_RENAMED_GFX9_CASES(OPCODE) \
10476 case OPCODE##_dpp: \
10477 case OPCODE##_e32: \
10478 case OPCODE##_e64: \
10479 case OPCODE##_e64_dpp: \
10480 case OPCODE##_sdwa:
10481
10482static bool isRenamedInGFX9(int Opcode) {
10483 switch (Opcode) {
10484 GENERATE_RENAMED_GFX9_CASES(AMDGPU::V_ADDC_U32)
10485 GENERATE_RENAMED_GFX9_CASES(AMDGPU::V_ADD_CO_U32)
10486 GENERATE_RENAMED_GFX9_CASES(AMDGPU::V_ADD_U32)
10487 GENERATE_RENAMED_GFX9_CASES(AMDGPU::V_SUBBREV_U32)
10488 GENERATE_RENAMED_GFX9_CASES(AMDGPU::V_SUBB_U32)
10489 GENERATE_RENAMED_GFX9_CASES(AMDGPU::V_SUBREV_CO_U32)
10490 GENERATE_RENAMED_GFX9_CASES(AMDGPU::V_SUBREV_U32)
10491 GENERATE_RENAMED_GFX9_CASES(AMDGPU::V_SUB_CO_U32)
10492 GENERATE_RENAMED_GFX9_CASES(AMDGPU::V_SUB_U32)
10493 //
10494 case AMDGPU::V_DIV_FIXUP_F16_gfx9_e64:
10495 case AMDGPU::V_DIV_FIXUP_F16_gfx9_fake16_e64:
10496 case AMDGPU::V_FMA_F16_gfx9_e64:
10497 case AMDGPU::V_FMA_F16_gfx9_fake16_e64:
10498 case AMDGPU::V_INTERP_P2_F16:
10499 case AMDGPU::V_MAD_F16_e64:
10500 case AMDGPU::V_MAD_U16_e64:
10501 case AMDGPU::V_MAD_I16_e64:
10502 return true;
10503 default:
10504 return false;
10505 }
10506}
10507
10508int SIInstrInfo::pseudoToMCOpcode(int Opcode) const {
10509 assert(Opcode == (int)SIInstrInfo::getNonSoftWaitcntOpcode(Opcode) &&
10510 "SIInsertWaitcnts should have promoted soft waitcnt instructions!");
10511
10512 unsigned Gen = subtargetEncodingFamily(ST);
10513
10514 if (ST.getGeneration() == AMDGPUSubtarget::GFX9 && isRenamedInGFX9(Opcode))
10515 Gen = SIEncodingFamily::GFX9;
10516
10517 // Adjust the encoding family to GFX80 for D16 buffer instructions when the
10518 // subtarget has UnpackedD16VMem feature.
10519 // TODO: remove this when we discard GFX80 encoding.
10520 if (ST.hasUnpackedD16VMem() && SIInstrFlags::isD16Buf(O: get(Opcode)))
10521 Gen = SIEncodingFamily::GFX80;
10522
10523 if (SIInstrFlags::isSDWA(O: get(Opcode))) {
10524 switch (ST.getGeneration()) {
10525 default:
10526 Gen = SIEncodingFamily::SDWA;
10527 break;
10528 case AMDGPUSubtarget::GFX9:
10529 Gen = SIEncodingFamily::SDWA9;
10530 break;
10531 case AMDGPUSubtarget::GFX10:
10532 Gen = SIEncodingFamily::SDWA10;
10533 break;
10534 }
10535 }
10536
10537 if (isMAI(Opcode)) {
10538 int MFMAOp = AMDGPU::getMFMAEarlyClobberOp(Opcode);
10539 if (MFMAOp != -1)
10540 Opcode = MFMAOp;
10541 }
10542
10543 int32_t MCOp = AMDGPU::getMCOpcode(Opcode, Gen);
10544
10545 if (MCOp == AMDGPU::INSTRUCTION_LIST_END && ST.hasGFX11_7Insts())
10546 MCOp = AMDGPU::getMCOpcode(Opcode, Gen: SIEncodingFamily::GFX11);
10547
10548 if (MCOp == AMDGPU::INSTRUCTION_LIST_END && ST.hasGFX1250Insts())
10549 MCOp = AMDGPU::getMCOpcode(Opcode, Gen: SIEncodingFamily::GFX12);
10550
10551 // -1 means that Opcode is already a native instruction.
10552 if (MCOp == -1)
10553 return Opcode;
10554
10555 if (ST.hasGFX90AInsts()) {
10556 uint32_t NMCOp = AMDGPU::INSTRUCTION_LIST_END;
10557 if (ST.hasGFX940Insts())
10558 NMCOp = AMDGPU::getMCOpcode(Opcode, Gen: SIEncodingFamily::GFX940);
10559 if (NMCOp == AMDGPU::INSTRUCTION_LIST_END)
10560 NMCOp = AMDGPU::getMCOpcode(Opcode, Gen: SIEncodingFamily::GFX90A);
10561 if (NMCOp == AMDGPU::INSTRUCTION_LIST_END)
10562 NMCOp = AMDGPU::getMCOpcode(Opcode, Gen: SIEncodingFamily::GFX9);
10563 if (NMCOp != AMDGPU::INSTRUCTION_LIST_END)
10564 MCOp = NMCOp;
10565 }
10566
10567 // INSTRUCTION_LIST_END means that Opcode is a pseudo instruction that has no
10568 // encoding in the given subtarget generation.
10569 if (MCOp == AMDGPU::INSTRUCTION_LIST_END)
10570 return -1;
10571
10572 if (isAsmOnlyOpcode(MCOp))
10573 return -1;
10574
10575 return MCOp;
10576}
10577
10578static
10579TargetInstrInfo::RegSubRegPair getRegOrUndef(const MachineOperand &RegOpnd) {
10580 assert(RegOpnd.isReg());
10581 return RegOpnd.isUndef() ? TargetInstrInfo::RegSubRegPair() :
10582 getRegSubRegPair(O: RegOpnd);
10583}
10584
10585TargetInstrInfo::RegSubRegPair
10586llvm::getRegSequenceSubReg(MachineInstr &MI, unsigned SubReg) {
10587 assert(MI.isRegSequence());
10588 for (unsigned I = 0, E = (MI.getNumOperands() - 1)/ 2; I < E; ++I)
10589 if (MI.getOperand(i: 1 + 2 * I + 1).getImm() == SubReg) {
10590 auto &RegOp = MI.getOperand(i: 1 + 2 * I);
10591 return getRegOrUndef(RegOpnd: RegOp);
10592 }
10593 return TargetInstrInfo::RegSubRegPair();
10594}
10595
10596// Try to find the definition of reg:subreg in subreg-manipulation pseudos
10597// Following a subreg of reg:subreg isn't supported
10598static bool followSubRegDef(MachineInstr &MI,
10599 TargetInstrInfo::RegSubRegPair &RSR) {
10600 if (!RSR.SubReg)
10601 return false;
10602 switch (MI.getOpcode()) {
10603 default: break;
10604 case AMDGPU::REG_SEQUENCE:
10605 RSR = getRegSequenceSubReg(MI, SubReg: RSR.SubReg);
10606 return true;
10607 // EXTRACT_SUBREG ins't supported as this would follow a subreg of subreg
10608 case AMDGPU::INSERT_SUBREG:
10609 if (RSR.SubReg == (unsigned)MI.getOperand(i: 3).getImm())
10610 // inserted the subreg we're looking for
10611 RSR = getRegOrUndef(RegOpnd: MI.getOperand(i: 2));
10612 else { // the subreg in the rest of the reg
10613 auto R1 = getRegOrUndef(RegOpnd: MI.getOperand(i: 1));
10614 if (R1.SubReg) // subreg of subreg isn't supported
10615 return false;
10616 RSR.Reg = R1.Reg;
10617 }
10618 return true;
10619 }
10620 return false;
10621}
10622
10623MachineInstr *llvm::getVRegSubRegDef(const TargetInstrInfo::RegSubRegPair &P,
10624 const MachineRegisterInfo &MRI) {
10625 assert(MRI.isSSA());
10626 if (!P.Reg.isVirtual())
10627 return nullptr;
10628
10629 auto RSR = P;
10630 auto *DefInst = MRI.getVRegDef(Reg: RSR.Reg);
10631 while (auto *MI = DefInst) {
10632 DefInst = nullptr;
10633 switch (MI->getOpcode()) {
10634 case AMDGPU::COPY:
10635 case AMDGPU::V_MOV_B32_e32: {
10636 auto &Op1 = MI->getOperand(i: 1);
10637 if (Op1.isReg() && Op1.getReg().isVirtual()) {
10638 if (Op1.isUndef())
10639 return nullptr;
10640 RSR = getRegSubRegPair(O: Op1);
10641 DefInst = MRI.getVRegDef(Reg: RSR.Reg);
10642 }
10643 break;
10644 }
10645 default:
10646 if (followSubRegDef(MI&: *MI, RSR)) {
10647 if (!RSR.Reg)
10648 return nullptr;
10649 DefInst = MRI.getVRegDef(Reg: RSR.Reg);
10650 }
10651 }
10652 if (!DefInst)
10653 return MI;
10654 }
10655 return nullptr;
10656}
10657
10658bool llvm::execMayBeModifiedBeforeUse(const MachineRegisterInfo &MRI,
10659 Register VReg,
10660 const MachineInstr &DefMI,
10661 const MachineInstr &UseMI) {
10662 assert(MRI.isSSA() && "Must be run on SSA");
10663
10664 auto *TRI = MRI.getTargetRegisterInfo();
10665 auto *DefBB = DefMI.getParent();
10666
10667 // Don't bother searching between blocks, although it is possible this block
10668 // doesn't modify exec.
10669 if (UseMI.getParent() != DefBB)
10670 return true;
10671
10672 const int MaxInstScan = 20;
10673 int NumInst = 0;
10674
10675 // Stop scan at the use.
10676 auto E = UseMI.getIterator();
10677 for (auto I = std::next(x: DefMI.getIterator()); I != E; ++I) {
10678 if (I->isDebugInstr())
10679 continue;
10680
10681 if (++NumInst > MaxInstScan)
10682 return true;
10683
10684 if (I->modifiesRegister(Reg: AMDGPU::EXEC, TRI))
10685 return true;
10686 }
10687
10688 return false;
10689}
10690
10691bool llvm::execMayBeModifiedBeforeAnyUse(const MachineRegisterInfo &MRI,
10692 Register VReg,
10693 const MachineInstr &DefMI) {
10694 assert(MRI.isSSA() && "Must be run on SSA");
10695
10696 auto *TRI = MRI.getTargetRegisterInfo();
10697 auto *DefBB = DefMI.getParent();
10698
10699 const int MaxUseScan = 10;
10700 int NumUse = 0;
10701
10702 for (auto &Use : MRI.use_nodbg_operands(Reg: VReg)) {
10703 auto &UseInst = *Use.getParent();
10704 // Don't bother searching between blocks, although it is possible this block
10705 // doesn't modify exec.
10706 if (UseInst.getParent() != DefBB || UseInst.isPHI())
10707 return true;
10708
10709 if (++NumUse > MaxUseScan)
10710 return true;
10711 }
10712
10713 if (NumUse == 0)
10714 return false;
10715
10716 const int MaxInstScan = 20;
10717 int NumInst = 0;
10718
10719 // Stop scan when we have seen all the uses.
10720 for (auto I = std::next(x: DefMI.getIterator()); ; ++I) {
10721 assert(I != DefBB->end());
10722
10723 if (I->isDebugInstr())
10724 continue;
10725
10726 if (++NumInst > MaxInstScan)
10727 return true;
10728
10729 for (const MachineOperand &Op : I->operands()) {
10730 // We don't check reg masks here as they're used only on calls:
10731 // 1. EXEC is only considered const within one BB
10732 // 2. Call should be a terminator instruction if present in a BB
10733
10734 if (!Op.isReg())
10735 continue;
10736
10737 Register Reg = Op.getReg();
10738 if (Op.isUse()) {
10739 if (Reg == VReg && --NumUse == 0)
10740 return false;
10741 } else if (TRI->regsOverlap(RegA: Reg, RegB: AMDGPU::EXEC))
10742 return true;
10743 }
10744 }
10745}
10746
10747MachineInstr *SIInstrInfo::createPHIDestinationCopy(
10748 MachineBasicBlock &MBB, MachineBasicBlock::iterator LastPHIIt,
10749 const DebugLoc &DL, Register Src, Register Dst) const {
10750 auto Cur = MBB.begin();
10751 if (Cur != MBB.end())
10752 do {
10753 if (!Cur->isPHI() && Cur->readsRegister(Reg: Dst, /*TRI=*/nullptr))
10754 return BuildMI(BB&: MBB, I: Cur, MIMD: DL, MCID: get(Opcode: TargetOpcode::COPY), DestReg: Dst).addReg(RegNo: Src);
10755 ++Cur;
10756 } while (Cur != MBB.end() && Cur != LastPHIIt);
10757
10758 return TargetInstrInfo::createPHIDestinationCopy(MBB, InsPt: LastPHIIt, DL, Src,
10759 Dst);
10760}
10761
10762MachineInstr *SIInstrInfo::createPHISourceCopy(
10763 MachineBasicBlock &MBB, MachineBasicBlock::iterator InsPt,
10764 const DebugLoc &DL, Register Src, unsigned SrcSubReg, Register Dst) const {
10765 if (InsPt != MBB.end() &&
10766 (InsPt->getOpcode() == AMDGPU::SI_IF ||
10767 InsPt->getOpcode() == AMDGPU::SI_ELSE ||
10768 InsPt->getOpcode() == AMDGPU::SI_IF_BREAK) &&
10769 InsPt->definesRegister(Reg: Src, /*TRI=*/nullptr)) {
10770 InsPt++;
10771 return BuildMI(BB&: MBB, I: InsPt, MIMD: DL,
10772 MCID: get(Opcode: AMDGPU::LaneMaskConstants::get(ST).MovTermOpc), DestReg: Dst)
10773 .addReg(RegNo: Src, Flags: {}, SubReg: SrcSubReg)
10774 .addReg(RegNo: AMDGPU::EXEC, Flags: RegState::Implicit);
10775 }
10776 return TargetInstrInfo::createPHISourceCopy(MBB, InsPt, DL, Src, SrcSubReg,
10777 Dst);
10778}
10779
10780bool llvm::SIInstrInfo::isWave32() const { return ST.isWave32(); }
10781
10782bool SIInstrInfo::hasRAWDependency(const MachineInstr &FirstMI,
10783 const MachineInstr &SecondMI) const {
10784 for (const auto &Use : SecondMI.all_uses()) {
10785 if (Use.isReg() && FirstMI.modifiesRegister(Reg: Use.getReg(), TRI: &RI))
10786 return true;
10787 }
10788 return false;
10789}
10790
10791/// If OpX is multicycle, anti-dependencies are not allowed.
10792/// isDPMACCInstruction was not designed for VOPD, but it is fit for the
10793/// purpose.
10794bool llvm::SIInstrInfo::isVOPDAntidependencyAllowed(
10795 const MachineInstr &OpX) const {
10796 return !AMDGPU::isDPMACCInstruction(Opc: OpX.getOpcode());
10797}
10798
10799MachineInstr *
10800SIInstrInfo::foldMemoryOperandImpl(MachineFunction &MF, MachineInstr &MI,
10801 ArrayRef<unsigned> Ops, int FrameIndex,
10802 MachineInstr *&CopyMI, LiveIntervals *LIS,
10803 VirtRegMap *VRM) const {
10804 // This is a bit of a hack (copied from AArch64). Consider this instruction:
10805 //
10806 // %0:sreg_32 = COPY $m0
10807 //
10808 // We explicitly chose SReg_32 for the virtual register so such a copy might
10809 // be eliminated by RegisterCoalescer. However, that may not be possible, and
10810 // %0 may even spill. We can't spill $m0 normally (it would require copying to
10811 // a numbered SGPR anyway), and since it is in the SReg_32 register class,
10812 // TargetInstrInfo::foldMemoryOperand() is going to try.
10813 // A similar issue also exists with spilling and reloading $exec registers.
10814 //
10815 // To prevent that, constrain the %0 register class here.
10816 if (isFullCopyInstr(MI)) {
10817 Register DstReg = MI.getOperand(i: 0).getReg();
10818 Register SrcReg = MI.getOperand(i: 1).getReg();
10819 if ((DstReg.isVirtual() || SrcReg.isVirtual()) &&
10820 (DstReg.isVirtual() != SrcReg.isVirtual())) {
10821 MachineRegisterInfo &MRI = MF.getRegInfo();
10822 Register VirtReg = DstReg.isVirtual() ? DstReg : SrcReg;
10823 const TargetRegisterClass *RC = MRI.getRegClass(Reg: VirtReg);
10824 if (RC->hasSuperClassEq(RC: &AMDGPU::SReg_32RegClass)) {
10825 MRI.constrainRegClass(Reg: VirtReg, RC: &AMDGPU::SReg_32_XM0_XEXECRegClass);
10826 return nullptr;
10827 }
10828 if (RC->hasSuperClassEq(RC: &AMDGPU::SReg_64RegClass)) {
10829 MRI.constrainRegClass(Reg: VirtReg, RC: &AMDGPU::SReg_64_XEXECRegClass);
10830 return nullptr;
10831 }
10832 }
10833 }
10834
10835 return nullptr;
10836}
10837
10838unsigned SIInstrInfo::getInstrLatency(const InstrItineraryData *ItinData,
10839 const MachineInstr &MI,
10840 unsigned *PredCost) const {
10841 if (MI.isBundle()) {
10842 MachineBasicBlock::const_instr_iterator I(MI.getIterator());
10843 MachineBasicBlock::const_instr_iterator E(MI.getParent()->instr_end());
10844 unsigned Lat = 0, Count = 0;
10845 for (++I; I != E && I->isBundledWithPred(); ++I) {
10846 ++Count;
10847 Lat = std::max(a: Lat, b: SchedModel.computeInstrLatency(MI: &*I));
10848 }
10849 return Lat + Count - 1;
10850 }
10851
10852 return SchedModel.computeInstrLatency(MI: &MI);
10853}
10854
10855const MachineOperand &
10856SIInstrInfo::getCalleeOperand(const MachineInstr &MI) const {
10857 if (const MachineOperand *CallAddrOp =
10858 getNamedOperand(MI, OperandName: AMDGPU::OpName::src0))
10859 return *CallAddrOp;
10860 return TargetInstrInfo::getCalleeOperand(MI);
10861}
10862
10863ValueUniformity
10864SIInstrInfo::getGenericValueUniformity(const MachineInstr &MI) const {
10865 const MachineRegisterInfo &MRI = MI.getMF()->getRegInfo();
10866 unsigned Opcode = MI.getOpcode();
10867
10868 auto HandleAddrSpaceCast = [this, &MRI](const MachineInstr &MI) {
10869 Register Dst = MI.getOperand(i: 0).getReg();
10870 Register Src = isa<GIntrinsic>(Val: MI) ? MI.getOperand(i: 2).getReg()
10871 : MI.getOperand(i: 1).getReg();
10872 LLT DstTy = MRI.getType(Reg: Dst);
10873 LLT SrcTy = MRI.getType(Reg: Src);
10874 unsigned DstAS = DstTy.getAddressSpace();
10875 unsigned SrcAS = SrcTy.getAddressSpace();
10876 return SrcAS == AMDGPUAS::PRIVATE_ADDRESS &&
10877 DstAS == AMDGPUAS::FLAT_ADDRESS &&
10878 ST.hasGloballyAddressableScratch()
10879 ? ValueUniformity::NeverUniform
10880 : ValueUniformity::Default;
10881 };
10882
10883 // If the target supports globally addressable scratch, the mapping from
10884 // scratch memory to the flat aperture changes therefore an address space cast
10885 // is no longer uniform.
10886 if (Opcode == TargetOpcode::G_ADDRSPACE_CAST)
10887 return HandleAddrSpaceCast(MI);
10888
10889 if (auto *GI = dyn_cast<GIntrinsic>(Val: &MI)) {
10890 auto IID = GI->getIntrinsicID();
10891 if (AMDGPU::isIntrinsicSourceOfDivergence(IntrID: IID))
10892 return ValueUniformity::NeverUniform;
10893 if (AMDGPU::isIntrinsicAlwaysUniform(IntrID: IID))
10894 return ValueUniformity::AlwaysUniform;
10895
10896 switch (IID) {
10897 case Intrinsic::amdgcn_addrspacecast_nonnull:
10898 return HandleAddrSpaceCast(MI);
10899 case Intrinsic::amdgcn_if:
10900 case Intrinsic::amdgcn_else:
10901 // FIXME: Uniform if second result
10902 break;
10903 }
10904
10905 return ValueUniformity::Default;
10906 }
10907
10908 // Loads from the private and flat address spaces are divergent, because
10909 // threads can execute the load instruction with the same inputs and get
10910 // different results.
10911 //
10912 // All other loads are not divergent, because if threads issue loads with the
10913 // same arguments, they will always get the same result.
10914 if (Opcode == AMDGPU::G_LOAD || Opcode == AMDGPU::G_ZEXTLOAD ||
10915 Opcode == AMDGPU::G_SEXTLOAD) {
10916 if (MI.memoperands_empty())
10917 return ValueUniformity::NeverUniform; // conservative assumption
10918
10919 if (llvm::any_of(Range: MI.memoperands(), P: [](const MachineMemOperand *mmo) {
10920 return mmo->getAddrSpace() == AMDGPUAS::PRIVATE_ADDRESS ||
10921 mmo->getAddrSpace() == AMDGPUAS::FLAT_ADDRESS;
10922 })) {
10923 // At least one MMO in a non-global address space.
10924 return ValueUniformity::NeverUniform;
10925 }
10926 return ValueUniformity::Default;
10927 }
10928
10929 if (SIInstrInfo::isGenericAtomicRMWOpcode(Opc: Opcode) ||
10930 Opcode == AMDGPU::G_ATOMIC_CMPXCHG ||
10931 Opcode == AMDGPU::G_ATOMIC_CMPXCHG_WITH_SUCCESS ||
10932 AMDGPU::isGenericAtomic(Opc: Opcode)) {
10933 return ValueUniformity::NeverUniform;
10934 }
10935
10936 // Result is computed from uniform SP and uniform wave-wide max size.
10937 if (Opcode == TargetOpcode::G_DYN_STACKALLOC)
10938 return ValueUniformity::AlwaysUniform;
10939
10940 if (Opcode == AMDGPU::G_AMDGPU_WHOLE_WAVE_FUNC_SETUP)
10941 return ValueUniformity::NeverUniform;
10942
10943 return ValueUniformity::Default;
10944}
10945
10946const MIRFormatter *SIInstrInfo::getMIRFormatter() const {
10947 if (!Formatter)
10948 Formatter = std::make_unique<AMDGPUMIRFormatter>(args: ST);
10949 return Formatter.get();
10950}
10951
10952ValueUniformity SIInstrInfo::getValueUniformity(const MachineInstr &MI) const {
10953
10954 if (isNeverUniform(MI))
10955 return ValueUniformity::NeverUniform;
10956
10957 unsigned opcode = MI.getOpcode();
10958 if (opcode == AMDGPU::V_READLANE_B32 ||
10959 opcode == AMDGPU::V_READFIRSTLANE_B32 ||
10960 opcode == AMDGPU::SI_RESTORE_S32_FROM_VGPR)
10961 return ValueUniformity::AlwaysUniform;
10962
10963 // If any of defs is divergent, report as NeverUniform. isUniformReg will
10964 // calculate in more detail for each def from its reg class, if available.
10965 if (MI.isInlineAsm()) {
10966 for (const MachineOperand &MO : MI.operands()) {
10967 if (!MO.isReg() || !MO.isDef())
10968 continue;
10969 const TargetRegisterClass *RC =
10970 MI.getRegClassConstraint(OpIdx: MO.getOperandNo(), TII: this, TRI: &RI);
10971 if (!RC || !RI.isSGPRClass(RC))
10972 return ValueUniformity::NeverUniform;
10973 }
10974 }
10975
10976 if (isCopyInstr(MI)) {
10977 const MachineOperand &srcOp = MI.getOperand(i: 1);
10978 if (srcOp.isReg() && srcOp.getReg().isPhysical()) {
10979 const TargetRegisterClass *regClass =
10980 RI.getPhysRegBaseClass(Reg: srcOp.getReg());
10981 return RI.isSGPRClass(RC: regClass) ? ValueUniformity::AlwaysUniform
10982 : ValueUniformity::NeverUniform;
10983 }
10984 return ValueUniformity::Default;
10985 }
10986
10987 // GMIR handling
10988 if (MI.isPreISelOpcode())
10989 return SIInstrInfo::getGenericValueUniformity(MI);
10990
10991 // Atomics are divergent because they are executed sequentially: when an
10992 // atomic operation refers to the same address in each thread, then each
10993 // thread after the first sees the value written by the previous thread as
10994 // original value.
10995
10996 if (isAtomic(MI))
10997 return ValueUniformity::NeverUniform;
10998
10999 // Loads from the private and flat address spaces are divergent, because
11000 // threads can execute the load instruction with the same inputs and get
11001 // different results.
11002 if (isFLAT(MI) && MI.mayLoad()) {
11003 if (MI.memoperands_empty())
11004 return ValueUniformity::NeverUniform; // conservative assumption
11005
11006 if (llvm::any_of(Range: MI.memoperands(), P: [](const MachineMemOperand *mmo) {
11007 return mmo->getAddrSpace() == AMDGPUAS::PRIVATE_ADDRESS ||
11008 mmo->getAddrSpace() == AMDGPUAS::FLAT_ADDRESS;
11009 })) {
11010 // At least one MMO in a non-global address space.
11011 return ValueUniformity::NeverUniform;
11012 }
11013
11014 return ValueUniformity::Default;
11015 }
11016
11017 const MachineRegisterInfo &MRI = MI.getMF()->getRegInfo();
11018 const AMDGPURegisterBankInfo *RBI = ST.getRegBankInfo();
11019
11020 // FIXME: It's conceptually broken to report this for an instruction, and not
11021 // a specific def operand. For inline asm in particular, there could be mixed
11022 // uniform and divergent results.
11023 for (unsigned I = 0, E = MI.getNumOperands(); I != E; ++I) {
11024 const MachineOperand &SrcOp = MI.getOperand(i: I);
11025 if (!SrcOp.isReg())
11026 continue;
11027
11028 Register Reg = SrcOp.getReg();
11029 if (!Reg || !SrcOp.readsReg())
11030 continue;
11031
11032 // If RegBank is null, this is unassigned or an unallocatable special
11033 // register, which are all scalars.
11034 const RegisterBank *RegBank = RBI->getRegBank(Reg, MRI, TRI: RI);
11035 if (RegBank && RegBank->getID() != AMDGPU::SGPRRegBankID)
11036 return ValueUniformity::NeverUniform;
11037 }
11038
11039 // TODO: Uniformity check condtions above can be rearranged for more
11040 // redability
11041
11042 // TODO: amdgcn.{ballot, [if]cmp} should be AlwaysUniform, but they are
11043 // currently turned into no-op COPYs by SelectionDAG ISel and are
11044 // therefore no longer recognizable.
11045
11046 return ValueUniformity::Default;
11047}
11048
11049unsigned SIInstrInfo::getDSShaderTypeValue(const MachineFunction &MF) {
11050 switch (MF.getFunction().getCallingConv()) {
11051 case CallingConv::AMDGPU_PS:
11052 return 1;
11053 case CallingConv::AMDGPU_VS:
11054 return 2;
11055 case CallingConv::AMDGPU_GS:
11056 return 3;
11057 case CallingConv::AMDGPU_HS:
11058 case CallingConv::AMDGPU_LS:
11059 case CallingConv::AMDGPU_ES: {
11060 const Function &F = MF.getFunction();
11061 F.getContext().diagnose(DI: DiagnosticInfoUnsupported(
11062 F, "ds_ordered_count unsupported for this calling conv"));
11063 [[fallthrough]];
11064 }
11065 case CallingConv::AMDGPU_CS:
11066 case CallingConv::AMDGPU_KERNEL:
11067 case CallingConv::C:
11068 case CallingConv::Fast:
11069 default:
11070 // Assume other calling conventions are various compute callable functions
11071 return 0;
11072 }
11073}
11074
11075bool SIInstrInfo::analyzeCompare(const MachineInstr &MI, Register &SrcReg,
11076 Register &SrcReg2, int64_t &CmpMask,
11077 int64_t &CmpValue) const {
11078 if (!MI.getOperand(i: 0).isReg() || MI.getOperand(i: 0).getSubReg())
11079 return false;
11080
11081 switch (MI.getOpcode()) {
11082 default:
11083 break;
11084 case AMDGPU::S_CMP_EQ_U32:
11085 case AMDGPU::S_CMP_EQ_I32:
11086 case AMDGPU::S_CMP_LG_U32:
11087 case AMDGPU::S_CMP_LG_I32:
11088 case AMDGPU::S_CMP_LT_U32:
11089 case AMDGPU::S_CMP_LT_I32:
11090 case AMDGPU::S_CMP_GT_U32:
11091 case AMDGPU::S_CMP_GT_I32:
11092 case AMDGPU::S_CMP_LE_U32:
11093 case AMDGPU::S_CMP_LE_I32:
11094 case AMDGPU::S_CMP_GE_U32:
11095 case AMDGPU::S_CMP_GE_I32:
11096 case AMDGPU::S_CMP_EQ_U64:
11097 case AMDGPU::S_CMP_LG_U64:
11098 SrcReg = MI.getOperand(i: 0).getReg();
11099 if (MI.getOperand(i: 1).isReg()) {
11100 if (MI.getOperand(i: 1).getSubReg())
11101 return false;
11102 SrcReg2 = MI.getOperand(i: 1).getReg();
11103 CmpValue = 0;
11104 } else if (MI.getOperand(i: 1).isImm()) {
11105 SrcReg2 = Register();
11106 CmpValue = MI.getOperand(i: 1).getImm();
11107 } else {
11108 return false;
11109 }
11110 CmpMask = ~0;
11111 return true;
11112 case AMDGPU::S_CMPK_EQ_U32:
11113 case AMDGPU::S_CMPK_EQ_I32:
11114 case AMDGPU::S_CMPK_LG_U32:
11115 case AMDGPU::S_CMPK_LG_I32:
11116 case AMDGPU::S_CMPK_LT_U32:
11117 case AMDGPU::S_CMPK_LT_I32:
11118 case AMDGPU::S_CMPK_GT_U32:
11119 case AMDGPU::S_CMPK_GT_I32:
11120 case AMDGPU::S_CMPK_LE_U32:
11121 case AMDGPU::S_CMPK_LE_I32:
11122 case AMDGPU::S_CMPK_GE_U32:
11123 case AMDGPU::S_CMPK_GE_I32:
11124 SrcReg = MI.getOperand(i: 0).getReg();
11125 SrcReg2 = Register();
11126 CmpValue = MI.getOperand(i: 1).getImm();
11127 CmpMask = ~0;
11128 return true;
11129 }
11130
11131 return false;
11132}
11133
11134static bool isSCCDeadOnExit(MachineBasicBlock *MBB) {
11135 for (MachineBasicBlock *S : MBB->successors()) {
11136 if (S->isLiveIn(Reg: AMDGPU::SCC))
11137 return false;
11138 }
11139 return true;
11140}
11141
11142// Invert all uses of SCC following SCCDef because SCCDef may be deleted and
11143// (incoming SCC) = !(SCC defined by SCCDef).
11144// Return true if all uses can be re-written, false otherwise.
11145bool SIInstrInfo::invertSCCUse(MachineInstr *SCCDef) const {
11146 MachineBasicBlock *MBB = SCCDef->getParent();
11147 SmallVector<MachineInstr *> InvertInstr;
11148 bool SCCIsDead = false;
11149
11150 // Scan instructions for SCC uses that need to be inverted until SCC is dead.
11151 constexpr unsigned ScanLimit = 12;
11152 unsigned Count = 0;
11153 for (MachineInstr &MI :
11154 make_range(x: std::next(x: MachineBasicBlock::iterator(SCCDef)), y: MBB->end())) {
11155 if (++Count > ScanLimit)
11156 return false;
11157 if (MI.readsRegister(Reg: AMDGPU::SCC, TRI: &RI)) {
11158 if (MI.getOpcode() == AMDGPU::S_CSELECT_B32 ||
11159 MI.getOpcode() == AMDGPU::S_CSELECT_B64 ||
11160 MI.getOpcode() == AMDGPU::S_CBRANCH_SCC0 ||
11161 MI.getOpcode() == AMDGPU::S_CBRANCH_SCC1)
11162 InvertInstr.push_back(Elt: &MI);
11163 else
11164 return false;
11165 }
11166 if (MI.definesRegister(Reg: AMDGPU::SCC, TRI: &RI)) {
11167 SCCIsDead = true;
11168 break;
11169 }
11170 }
11171 if (!SCCIsDead && isSCCDeadOnExit(MBB))
11172 SCCIsDead = true;
11173
11174 // SCC may have more uses. Can't invert all of them.
11175 if (!SCCIsDead)
11176 return false;
11177
11178 // Invert uses
11179 for (MachineInstr *MI : InvertInstr) {
11180 if (MI->getOpcode() == AMDGPU::S_CSELECT_B32 ||
11181 MI->getOpcode() == AMDGPU::S_CSELECT_B64) {
11182 swapOperands(Inst&: *MI);
11183 } else if (MI->getOpcode() == AMDGPU::S_CBRANCH_SCC0 ||
11184 MI->getOpcode() == AMDGPU::S_CBRANCH_SCC1) {
11185 MI->setDesc(get(Opcode: MI->getOpcode() == AMDGPU::S_CBRANCH_SCC0
11186 ? AMDGPU::S_CBRANCH_SCC1
11187 : AMDGPU::S_CBRANCH_SCC0));
11188 } else {
11189 llvm_unreachable("SCC used but no inversion handling");
11190 }
11191 }
11192 return true;
11193}
11194
11195// SCC is already valid after SCCValid.
11196// SCCRedefine will redefine SCC to the same value already available after
11197// SCCValid. If there are no intervening SCC conflicts delete SCCRedefine and
11198// update kill/dead flags if necessary.
11199bool SIInstrInfo::optimizeSCC(MachineInstr *SCCValid, MachineInstr *SCCRedefine,
11200 bool NeedInversion) const {
11201 MachineInstr *KillsSCC = nullptr;
11202 if (SCCValid->getParent() != SCCRedefine->getParent())
11203 return false;
11204 for (MachineInstr &MI : make_range(x: std::next(x: SCCValid->getIterator()),
11205 y: SCCRedefine->getIterator())) {
11206 if (MI.modifiesRegister(Reg: AMDGPU::SCC, TRI: &RI))
11207 return false;
11208 if (MI.killsRegister(Reg: AMDGPU::SCC, TRI: &RI))
11209 KillsSCC = &MI;
11210 }
11211 if (NeedInversion && !invertSCCUse(SCCDef: SCCRedefine))
11212 return false;
11213 if (MachineOperand *SccDef =
11214 SCCValid->findRegisterDefOperand(Reg: AMDGPU::SCC, /*TRI=*/nullptr))
11215 SccDef->setIsDead(false);
11216 if (KillsSCC)
11217 KillsSCC->clearRegisterKills(Reg: AMDGPU::SCC, /*TRI=*/RegInfo: nullptr);
11218 SCCRedefine->eraseFromParent();
11219 return true;
11220}
11221
11222static bool foldableSelect(const MachineInstr &Def) {
11223 if (Def.getOpcode() != AMDGPU::S_CSELECT_B32 &&
11224 Def.getOpcode() != AMDGPU::S_CSELECT_B64)
11225 return false;
11226 bool Op1IsNonZeroImm =
11227 Def.getOperand(i: 1).isImm() && Def.getOperand(i: 1).getImm() != 0;
11228 bool Op2IsZeroImm =
11229 Def.getOperand(i: 2).isImm() && Def.getOperand(i: 2).getImm() == 0;
11230 if (!Op1IsNonZeroImm || !Op2IsZeroImm)
11231 return false;
11232 return true;
11233}
11234
11235static bool setsSCCIfResultIsZero(const MachineInstr &Def, bool &NeedInversion,
11236 unsigned &NewDefOpc) {
11237 // S_ADD_U32 X, 1 sets SCC on carryout which can only happen if result==0.
11238 // S_ADD_I32 X, 1 can be converted to S_ADD_U32 X, 1 if SCC is dead.
11239 if (Def.getOpcode() != AMDGPU::S_ADD_I32 &&
11240 Def.getOpcode() != AMDGPU::S_ADD_U32)
11241 return false;
11242 const MachineOperand &AddSrc1 = Def.getOperand(i: 1);
11243 const MachineOperand &AddSrc2 = Def.getOperand(i: 2);
11244 int64_t addend;
11245
11246 if ((!AddSrc1.isImm() || AddSrc1.getImm() != 1) &&
11247 (!AddSrc2.isImm() || AddSrc2.getImm() != 1) &&
11248 (!getFoldableImm(MO: &AddSrc1, Imm&: addend) || addend != 1) &&
11249 (!getFoldableImm(MO: &AddSrc2, Imm&: addend) || addend != 1))
11250 return false;
11251
11252 if (Def.getOpcode() == AMDGPU::S_ADD_I32) {
11253 const MachineOperand *SccDef =
11254 Def.findRegisterDefOperand(Reg: AMDGPU::SCC, /*TRI=*/nullptr);
11255 if (!SccDef->isDead())
11256 return false;
11257 NewDefOpc = AMDGPU::S_ADD_U32;
11258 }
11259 NeedInversion = !NeedInversion;
11260 return true;
11261}
11262
11263bool SIInstrInfo::optimizeCompareInstr(MachineInstr &CmpInstr, Register SrcReg,
11264 Register SrcReg2, int64_t CmpMask,
11265 int64_t CmpValue,
11266 const MachineRegisterInfo *MRI) const {
11267 if (!SrcReg || SrcReg.isPhysical())
11268 return false;
11269
11270 if (SrcReg2 && !getFoldableImm(Reg: SrcReg2, MRI: *MRI, Imm&: CmpValue))
11271 return false;
11272
11273 const auto optimizeCmpSelect = [&CmpInstr, SrcReg, CmpValue, MRI,
11274 this](bool NeedInversion) -> bool {
11275 if (CmpValue != 0)
11276 return false;
11277
11278 MachineInstr *Def = MRI->getVRegDef(Reg: SrcReg);
11279 if (!Def)
11280 return false;
11281
11282 // For S_OP that set SCC = DST!=0, do the transformation
11283 //
11284 // s_cmp_[lg|eq]_* (S_OP ...), 0 => (S_OP ...)
11285 //
11286 // For (S_OP ...) that set SCC = DST==0, invert NeedInversion and
11287 // do the transformation:
11288 //
11289 // s_cmp_[lg|eq]_* (S_OP ...), 0 => (S_OP ...)
11290 //
11291 // If foldableSelect, s_cmp_lg_* is redundant because the SCC input value
11292 // for S_CSELECT* already has the same value that will be calculated by
11293 // s_cmp_lg_*
11294 //
11295 // s_cmp_[lg|eq]_* (S_CSELECT* (non-zero imm), 0), 0 => (S_CSELECT*
11296 // (non-zero imm), 0)
11297
11298 unsigned NewDefOpc = Def->getOpcode();
11299 if (!setsSCCIfResultIsNonZero(*Def) &&
11300 !setsSCCIfResultIsZero(Def: *Def, NeedInversion, NewDefOpc) &&
11301 !foldableSelect(Def: *Def))
11302 return false;
11303
11304 if (!optimizeSCC(SCCValid: Def, SCCRedefine: &CmpInstr, NeedInversion))
11305 return false;
11306
11307 if (NewDefOpc != Def->getOpcode())
11308 Def->setDesc(get(Opcode: NewDefOpc));
11309
11310 // If s_or_b32 result, sY, is unused (i.e. it is effectively a 64-bit
11311 // s_cmp_lg of a register pair) and the inputs are the hi and lo-halves of a
11312 // 64-bit foldableSelect then delete s_or_b32 in the sequence:
11313 // sX = s_cselect_b64 (non-zero imm), 0
11314 // sLo = copy sX.sub0
11315 // sHi = copy sX.sub1
11316 // sY = s_or_b32 sLo, sHi
11317 if (Def->getOpcode() == AMDGPU::S_OR_B32 &&
11318 MRI->use_nodbg_empty(RegNo: Def->getOperand(i: 0).getReg())) {
11319 const MachineOperand &OrOpnd1 = Def->getOperand(i: 1);
11320 const MachineOperand &OrOpnd2 = Def->getOperand(i: 2);
11321 if (OrOpnd1.isReg() && OrOpnd2.isReg()) {
11322 MachineInstr *Def1 = MRI->getVRegDef(Reg: OrOpnd1.getReg());
11323 MachineInstr *Def2 = MRI->getVRegDef(Reg: OrOpnd2.getReg());
11324 if (Def1 && Def1->getOpcode() == AMDGPU::COPY && Def2 &&
11325 Def2->getOpcode() == AMDGPU::COPY && Def1->getOperand(i: 1).isReg() &&
11326 Def2->getOperand(i: 1).isReg() &&
11327 Def1->getOperand(i: 1).getSubReg() == AMDGPU::sub0 &&
11328 Def2->getOperand(i: 1).getSubReg() == AMDGPU::sub1 &&
11329 Def1->getOperand(i: 1).getReg() == Def2->getOperand(i: 1).getReg()) {
11330 MachineInstr *Select = MRI->getVRegDef(Reg: Def1->getOperand(i: 1).getReg());
11331 if (Select && foldableSelect(Def: *Select))
11332 optimizeSCC(SCCValid: Select, SCCRedefine: Def, /*NeedInversion=*/false);
11333 }
11334 }
11335 }
11336 return true;
11337 };
11338
11339 const auto optimizeCmpAnd = [&CmpInstr, SrcReg, CmpValue, MRI,
11340 this](int64_t ExpectedValue, unsigned SrcSize,
11341 bool IsReversible, bool IsSigned) -> bool {
11342 // s_cmp_eq_u32 (s_and_b32 $src, 1 << n), 1 << n => s_and_b32 $src, 1 << n
11343 // s_cmp_eq_i32 (s_and_b32 $src, 1 << n), 1 << n => s_and_b32 $src, 1 << n
11344 // s_cmp_ge_u32 (s_and_b32 $src, 1 << n), 1 << n => s_and_b32 $src, 1 << n
11345 // s_cmp_ge_i32 (s_and_b32 $src, 1 << n), 1 << n => s_and_b32 $src, 1 << n
11346 // s_cmp_eq_u64 (s_and_b64 $src, 1 << n), 1 << n => s_and_b64 $src, 1 << n
11347 // s_cmp_lg_u32 (s_and_b32 $src, 1 << n), 0 => s_and_b32 $src, 1 << n
11348 // s_cmp_lg_i32 (s_and_b32 $src, 1 << n), 0 => s_and_b32 $src, 1 << n
11349 // s_cmp_gt_u32 (s_and_b32 $src, 1 << n), 0 => s_and_b32 $src, 1 << n
11350 // s_cmp_gt_i32 (s_and_b32 $src, 1 << n), 0 => s_and_b32 $src, 1 << n
11351 // s_cmp_lg_u64 (s_and_b64 $src, 1 << n), 0 => s_and_b64 $src, 1 << n
11352 //
11353 // Signed ge/gt are not used for the sign bit.
11354 //
11355 // If result of the AND is unused except in the compare:
11356 // s_and_b(32|64) $src, 1 << n => s_bitcmp1_b(32|64) $src, n
11357 //
11358 // s_cmp_eq_u32 (s_and_b32 $src, 1 << n), 0 => s_bitcmp0_b32 $src, n
11359 // s_cmp_eq_i32 (s_and_b32 $src, 1 << n), 0 => s_bitcmp0_b32 $src, n
11360 // s_cmp_eq_u64 (s_and_b64 $src, 1 << n), 0 => s_bitcmp0_b64 $src, n
11361 // s_cmp_lg_u32 (s_and_b32 $src, 1 << n), 1 << n => s_bitcmp0_b32 $src, n
11362 // s_cmp_lg_i32 (s_and_b32 $src, 1 << n), 1 << n => s_bitcmp0_b32 $src, n
11363 // s_cmp_lg_u64 (s_and_b64 $src, 1 << n), 1 << n => s_bitcmp0_b64 $src, n
11364
11365 MachineInstr *Def = MRI->getVRegDef(Reg: SrcReg);
11366 if (!Def)
11367 return false;
11368
11369 if (Def->getOpcode() != AMDGPU::S_AND_B32 &&
11370 Def->getOpcode() != AMDGPU::S_AND_B64)
11371 return false;
11372
11373 int64_t Mask;
11374 const auto isMask = [&Mask, SrcSize](const MachineOperand *MO) -> bool {
11375 if (MO->isImm())
11376 Mask = MO->getImm();
11377 else if (!getFoldableImm(MO, Imm&: Mask))
11378 return false;
11379 Mask &= maxUIntN(N: SrcSize);
11380 return isPowerOf2_64(Value: Mask);
11381 };
11382
11383 MachineOperand *SrcOp = &Def->getOperand(i: 1);
11384 if (isMask(SrcOp))
11385 SrcOp = &Def->getOperand(i: 2);
11386 else if (isMask(&Def->getOperand(i: 2)))
11387 SrcOp = &Def->getOperand(i: 1);
11388 else
11389 return false;
11390
11391 // A valid Mask is required to have a single bit set, hence a non-zero and
11392 // power-of-two value. This verifies that we will not do 64-bit shift below.
11393 assert(llvm::has_single_bit<uint64_t>(Mask) && "Invalid mask.");
11394 unsigned BitNo = llvm::countr_zero(Val: (uint64_t)Mask);
11395 if (IsSigned && BitNo == SrcSize - 1)
11396 return false;
11397
11398 ExpectedValue <<= BitNo;
11399
11400 bool IsReversedCC = false;
11401 if (CmpValue != ExpectedValue) {
11402 if (!IsReversible)
11403 return false;
11404 IsReversedCC = CmpValue == (ExpectedValue ^ Mask);
11405 if (!IsReversedCC)
11406 return false;
11407 }
11408
11409 Register DefReg = Def->getOperand(i: 0).getReg();
11410 if (IsReversedCC && !MRI->hasOneNonDBGUse(RegNo: DefReg))
11411 return false;
11412
11413 if (!optimizeSCC(SCCValid: Def, SCCRedefine: &CmpInstr, /*NeedInversion=*/false))
11414 return false;
11415
11416 if (!MRI->use_nodbg_empty(RegNo: DefReg)) {
11417 assert(!IsReversedCC);
11418 return true;
11419 }
11420
11421 // Replace AND with unused result with a S_BITCMP.
11422 MachineBasicBlock *MBB = Def->getParent();
11423
11424 unsigned NewOpc = (SrcSize == 32) ? IsReversedCC ? AMDGPU::S_BITCMP0_B32
11425 : AMDGPU::S_BITCMP1_B32
11426 : IsReversedCC ? AMDGPU::S_BITCMP0_B64
11427 : AMDGPU::S_BITCMP1_B64;
11428
11429 BuildMI(BB&: *MBB, I: Def, MIMD: Def->getDebugLoc(), MCID: get(Opcode: NewOpc))
11430 .add(MO: *SrcOp)
11431 .addImm(Val: BitNo);
11432 Def->eraseFromParent();
11433
11434 return true;
11435 };
11436
11437 switch (CmpInstr.getOpcode()) {
11438 default:
11439 break;
11440 case AMDGPU::S_CMP_EQ_U32:
11441 case AMDGPU::S_CMP_EQ_I32:
11442 case AMDGPU::S_CMPK_EQ_U32:
11443 case AMDGPU::S_CMPK_EQ_I32:
11444 return optimizeCmpAnd(1, 32, true, false) ||
11445 optimizeCmpSelect(/*NeedInversion=*/true);
11446 case AMDGPU::S_CMP_GE_U32:
11447 case AMDGPU::S_CMPK_GE_U32:
11448 return optimizeCmpAnd(1, 32, false, false);
11449 case AMDGPU::S_CMP_GE_I32:
11450 case AMDGPU::S_CMPK_GE_I32:
11451 return optimizeCmpAnd(1, 32, false, true);
11452 case AMDGPU::S_CMP_EQ_U64:
11453 return optimizeCmpAnd(1, 64, true, false);
11454 case AMDGPU::S_CMP_LG_U32:
11455 case AMDGPU::S_CMP_LG_I32:
11456 case AMDGPU::S_CMPK_LG_U32:
11457 case AMDGPU::S_CMPK_LG_I32:
11458 return optimizeCmpAnd(0, 32, true, false) ||
11459 optimizeCmpSelect(/*NeedInversion=*/false);
11460 case AMDGPU::S_CMP_GT_U32:
11461 case AMDGPU::S_CMPK_GT_U32:
11462 return optimizeCmpAnd(0, 32, false, false);
11463 case AMDGPU::S_CMP_GT_I32:
11464 case AMDGPU::S_CMPK_GT_I32:
11465 return optimizeCmpAnd(0, 32, false, true);
11466 case AMDGPU::S_CMP_LG_U64:
11467 return optimizeCmpAnd(0, 64, true, false) ||
11468 optimizeCmpSelect(/*NeedInversion=*/false);
11469 }
11470
11471 return false;
11472}
11473
11474void SIInstrInfo::enforceOperandRCAlignment(MachineInstr &MI,
11475 AMDGPU::OpName OpName) const {
11476 if (!ST.needsAlignedVGPRs())
11477 return;
11478
11479 int OpNo = AMDGPU::getNamedOperandIdx(Opcode: MI.getOpcode(), Name: OpName);
11480 if (OpNo < 0)
11481 return;
11482 MachineOperand &Op = MI.getOperand(i: OpNo);
11483 if (getOpSize(MI, OpNo) > 4)
11484 return;
11485
11486 // Add implicit aligned super-reg to force alignment on the data operand.
11487 const DebugLoc &DL = MI.getDebugLoc();
11488 MachineBasicBlock *BB = MI.getParent();
11489 MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
11490 Register DataReg = Op.getReg();
11491 bool IsAGPR = RI.isAGPR(MRI, Reg: DataReg);
11492 Register Undef = MRI.createVirtualRegister(
11493 RegClass: IsAGPR ? &AMDGPU::AGPR_32RegClass : &AMDGPU::VGPR_32RegClass);
11494 BuildMI(BB&: *BB, I&: MI, MIMD: DL, MCID: get(Opcode: AMDGPU::IMPLICIT_DEF), DestReg: Undef);
11495 Register NewVR =
11496 MRI.createVirtualRegister(RegClass: IsAGPR ? &AMDGPU::AReg_64_Align2RegClass
11497 : &AMDGPU::VReg_64_Align2RegClass);
11498 BuildMI(BB&: *BB, I&: MI, MIMD: DL, MCID: get(Opcode: AMDGPU::REG_SEQUENCE), DestReg: NewVR)
11499 .addReg(RegNo: DataReg, Flags: {}, SubReg: Op.getSubReg())
11500 .addImm(Val: AMDGPU::sub0)
11501 .addReg(RegNo: Undef)
11502 .addImm(Val: AMDGPU::sub1);
11503 Op.setReg(NewVR);
11504 Op.setSubReg(AMDGPU::sub0);
11505 MI.addOperand(Op: MachineOperand::CreateReg(Reg: NewVR, isDef: false, isImp: true));
11506}
11507
11508bool SIInstrInfo::isGlobalMemoryObject(const MachineInstr *MI) const {
11509 if (isIGLP(MI: *MI))
11510 return false;
11511
11512 return TargetInstrInfo::isGlobalMemoryObject(MI);
11513}
11514
11515bool SIInstrInfo::isXDLWMMA(const MachineInstr &MI) const {
11516 if (!isWMMA(MI) && !isSWMMAC(MI))
11517 return false;
11518
11519 if (ST.hasGFX1250Insts())
11520 return AMDGPU::getWMMAIsXDL(Opc: MI.getOpcode());
11521
11522 return true;
11523}
11524
11525bool SIInstrInfo::isXDL(const MachineInstr &MI) const {
11526 unsigned Opcode = MI.getOpcode();
11527
11528 if (AMDGPU::isGFX12Plus(STI: ST))
11529 return isDOT(MI) || isXDLWMMA(MI);
11530
11531 if (!isMAI(MI) || isDGEMM(Opcode) ||
11532 Opcode == AMDGPU::V_ACCVGPR_WRITE_B32_e64 ||
11533 Opcode == AMDGPU::V_ACCVGPR_READ_B32_e64)
11534 return false;
11535
11536 if (!ST.hasGFX940Insts())
11537 return true;
11538
11539 return AMDGPU::getMAIIsGFX940XDL(Opc: Opcode);
11540}
11541