//===- SIInstrInfo.cpp - SI Instruction Information ----------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// SI Implementation of TargetInstrInfo.
//
//===----------------------------------------------------------------------===//

#include "SIInstrInfo.h"
#include "AMDGPU.h"
#include "AMDGPUInstrInfo.h"
#include "AMDGPULaneMaskUtils.h"
#include "GCNHazardRecognizer.h"
#include "GCNSubtarget.h"
#include "SIMachineFunctionInfo.h"
#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/CodeGen/GlobalISel/GenericMachineInstrs.h"
#include "llvm/CodeGen/LiveIntervals.h"
#include "llvm/CodeGen/LiveVariables.h"
#include "llvm/CodeGen/MachineCycleAnalysis.h"
#include "llvm/CodeGen/MachineDominators.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineScheduler.h"
#include "llvm/CodeGen/RegisterScavenging.h"
#include "llvm/CodeGen/ScheduleDAG.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/MC/MCContext.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Target/TargetMachine.h"

using namespace llvm;

#define DEBUG_TYPE "si-instr-info"

#define GET_INSTRINFO_CTOR_DTOR
#include "AMDGPUGenInstrInfo.inc"

namespace llvm::AMDGPU {
#define GET_D16ImageDimIntrinsics_IMPL
#define GET_ImageDimIntrinsicTable_IMPL
#define GET_RsrcIntrinsics_IMPL
#include "AMDGPUGenSearchableTables.inc"
} // namespace llvm::AMDGPU

// Must be at least 4 to be able to branch over minimum unconditional branch
// code. This is only for making it possible to write reasonably small tests for
// long branches.
static cl::opt<unsigned>
BranchOffsetBits("amdgpu-s-branch-bits", cl::ReallyHidden, cl::init(16),
                 cl::desc("Restrict range of branch instructions (DEBUG)"));

static cl::opt<bool> Fix16BitCopies(
    "amdgpu-fix-16-bit-physreg-copies",
    cl::desc("Fix copies between 32 and 16 bit registers by extending to 32 bit"),
    cl::init(true),
    cl::ReallyHidden);

SIInstrInfo::SIInstrInfo(const GCNSubtarget &ST)
    : AMDGPUGenInstrInfo(ST, RI, AMDGPU::ADJCALLSTACKUP,
                         AMDGPU::ADJCALLSTACKDOWN),
      RI(ST), ST(ST) {
  SchedModel.init(&ST);
}

//===----------------------------------------------------------------------===//
// TargetInstrInfo callbacks
//===----------------------------------------------------------------------===//

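/// Returns the number of operands of \p Node, excluding any trailing glue
/// operands.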
static unsigned getNumOperandsNoGlue(SDNode *Node) {
  unsigned N = Node->getNumOperands();
  while (N && Node->getOperand(N - 1).getValueType() == MVT::Glue)
    --N;
  return N;
}

/// Returns true if both nodes have the same value for the given
/// operand \p Op, or if both nodes do not have this operand.
static bool nodesHaveSameOperandValue(SDNode *N0, SDNode *N1,
                                      AMDGPU::OpName OpName) {
  unsigned Opc0 = N0->getMachineOpcode();
  unsigned Opc1 = N1->getMachineOpcode();

  int Op0Idx = AMDGPU::getNamedOperandIdx(Opc0, OpName);
  int Op1Idx = AMDGPU::getNamedOperandIdx(Opc1, OpName);

  if (Op0Idx == -1 && Op1Idx == -1)
    return true;

  if ((Op0Idx == -1 && Op1Idx != -1) ||
      (Op1Idx == -1 && Op0Idx != -1))
    return false;

  // getNamedOperandIdx returns the index for the MachineInstr's operands,
  // which includes the result as the first operand. We are indexing into the
  // MachineSDNode's operands, so we need to skip the result operand to get
  // the real index.
  --Op0Idx;
  --Op1Idx;

  return N0->getOperand(Op0Idx) == N1->getOperand(Op1Idx);
}

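/// Returns true if \p MI belongs to an opcode class that is generally safe to
/// rematerialize: VOP1/VOP2/VOP3/SDWA/SALU instructions, and SMRD instructions
/// whose memory operands are all invariant loads.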
static bool canRemat(const MachineInstr &MI) {
  if (SIInstrInfo::isVOP1(MI) || SIInstrInfo::isVOP2(MI) ||
      SIInstrInfo::isVOP3(MI) || SIInstrInfo::isSDWA(MI) ||
      SIInstrInfo::isSALU(MI))
    return true;

  if (SIInstrInfo::isSMRD(MI)) {
    return !MI.memoperands_empty() &&
           llvm::all_of(MI.memoperands(), [](const MachineMemOperand *MMO) {
             return MMO->isLoad() && MMO->isInvariant();
           });
  }

  return false;
}

bool SIInstrInfo::isReMaterializableImpl(
    const MachineInstr &MI) const {
  if (canRemat(MI)) {
    // Normally a VALU use of exec would block rematerialization, but an
    // implicit exec read is OK here since every VALU instruction has one.
    // We want all of the generic rematerialization logic except for that
    // restriction.

    // Another potential implicit use is the mode register. The core RA logic
    // will not attempt rematerialization if mode is set anywhere in the
    // function; otherwise it is safe, since mode is not changed.

    // This differs from the generic method, which does not allow
    // rematerialization if there are virtual register uses. We allow those,
    // which is why this method covers SOP instructions as well.
    if (!MI.hasImplicitDef() &&
        MI.getNumImplicitOperands() == MI.getDesc().implicit_uses().size() &&
        !MI.mayRaiseFPException())
      return true;
  }

  return TargetInstrInfo::isReMaterializableImpl(MI);
}

// Returns true if the scalar result of a VALU instruction depends on exec.
bool SIInstrInfo::resultDependsOnExec(const MachineInstr &MI) const {
  // Ignore comparisons which are only used masked with exec.
  // This allows some hoisting/sinking of VALU comparisons.
  if (MI.isCompare()) {
    const MachineOperand *Dst = getNamedOperand(MI, AMDGPU::OpName::sdst);
    if (!Dst)
      return true;

    Register DstReg = Dst->getReg();
    if (!DstReg.isVirtual())
      return true;

    const MachineRegisterInfo &MRI = MI.getMF()->getRegInfo();
    for (MachineInstr &Use : MRI.use_nodbg_instructions(DstReg)) {
      switch (Use.getOpcode()) {
      case AMDGPU::S_AND_SAVEEXEC_B32:
      case AMDGPU::S_AND_SAVEEXEC_B64:
        break;
      case AMDGPU::S_AND_B32:
      case AMDGPU::S_AND_B64:
        if (!Use.readsRegister(AMDGPU::EXEC, /*TRI=*/nullptr))
          return true;
        break;
      default:
        return true;
      }
    }
    return false;
  }

  // If it is not convergent it does not depend on EXEC.
  if (!MI.isConvergent())
    return false;

  switch (MI.getOpcode()) {
  default:
    break;
  case AMDGPU::V_READFIRSTLANE_B32:
    return true;
  }

  return false;
}

bool SIInstrInfo::isIgnorableUse(const MachineOperand &MO) const {
  // Any implicit use of exec by VALU is not a real register read.
  return MO.getReg() == AMDGPU::EXEC && MO.isImplicit() &&
         isVALU(*MO.getParent()) && !resultDependsOnExec(*MO.getParent());
}

bool SIInstrInfo::isSafeToSink(MachineInstr &MI,
                               MachineBasicBlock *SuccToSinkTo,
                               MachineCycleInfo *CI) const {
  // Allow sinking if MI edits lane mask (divergent i1 in sgpr).
  if (MI.getOpcode() == AMDGPU::SI_IF_BREAK)
    return true;

  MachineRegisterInfo &MRI = MI.getMF()->getRegInfo();
  // Check whether sinking MI would create a temporally divergent use.
  for (auto Op : MI.uses()) {
    if (Op.isReg() && Op.getReg().isVirtual() &&
        RI.isSGPRClass(MRI.getRegClass(Op.getReg()))) {
      MachineInstr *SgprDef = MRI.getVRegDef(Op.getReg());

      // SgprDef defined inside cycle
      MachineCycle *FromCycle = CI->getCycle(SgprDef->getParent());
      if (FromCycle == nullptr)
        continue;

      MachineCycle *ToCycle = CI->getCycle(SuccToSinkTo);
      // Check if there is a FromCycle that contains SgprDef's basic block but
      // does not contain SuccToSinkTo and also has a divergent exit condition.
      while (FromCycle && !FromCycle->contains(ToCycle)) {
        SmallVector<MachineBasicBlock *, 1> ExitingBlocks;
        FromCycle->getExitingBlocks(ExitingBlocks);

        // FromCycle has divergent exit condition.
        for (MachineBasicBlock *ExitingBlock : ExitingBlocks) {
          if (hasDivergentBranch(ExitingBlock))
            return false;
        }

        FromCycle = FromCycle->getParentCycle();
      }
    }
  }

  return true;
}

bool SIInstrInfo::areLoadsFromSameBasePtr(SDNode *Load0, SDNode *Load1,
                                          int64_t &Offset0,
                                          int64_t &Offset1) const {
  if (!Load0->isMachineOpcode() || !Load1->isMachineOpcode())
    return false;

  unsigned Opc0 = Load0->getMachineOpcode();
  unsigned Opc1 = Load1->getMachineOpcode();

  // Make sure both are actually loads.
  if (!get(Opc0).mayLoad() || !get(Opc1).mayLoad())
    return false;

  // A mayLoad instruction without a def is not a load. Likely a prefetch.
  if (!get(Opc0).getNumDefs() || !get(Opc1).getNumDefs())
    return false;

  if (isDS(Opc0) && isDS(Opc1)) {

    // FIXME: Handle this case:
    if (getNumOperandsNoGlue(Load0) != getNumOperandsNoGlue(Load1))
      return false;

    // Check base reg.
    if (Load0->getOperand(0) != Load1->getOperand(0))
      return false;

    // Skip read2 / write2 variants for simplicity.
    // TODO: We should report true if the used offsets are adjacent (excluding
    // st64 versions).
    int Offset0Idx = AMDGPU::getNamedOperandIdx(Opc0, AMDGPU::OpName::offset);
    int Offset1Idx = AMDGPU::getNamedOperandIdx(Opc1, AMDGPU::OpName::offset);
    if (Offset0Idx == -1 || Offset1Idx == -1)
      return false;

    // XXX - be careful of dataless loads
    // getNamedOperandIdx returns the index for MachineInstrs. Since they
    // include the output in the operand list, but SDNodes don't, we need to
    // subtract the index by one.
    Offset0Idx -= get(Opc0).NumDefs;
    Offset1Idx -= get(Opc1).NumDefs;
    Offset0 = Load0->getConstantOperandVal(Offset0Idx);
    Offset1 = Load1->getConstantOperandVal(Offset1Idx);
    return true;
  }

  if (isSMRD(Opc0) && isSMRD(Opc1)) {
    // Skip time and cache invalidation instructions.
    if (!AMDGPU::hasNamedOperand(Opc0, AMDGPU::OpName::sbase) ||
        !AMDGPU::hasNamedOperand(Opc1, AMDGPU::OpName::sbase))
      return false;

    unsigned NumOps = getNumOperandsNoGlue(Load0);
    if (NumOps != getNumOperandsNoGlue(Load1))
      return false;

    // Check base reg.
    if (Load0->getOperand(0) != Load1->getOperand(0))
      return false;

    // Match register offsets, if both register and immediate offsets present.
    assert(NumOps == 4 || NumOps == 5);
    if (NumOps == 5 && Load0->getOperand(1) != Load1->getOperand(1))
      return false;

    const ConstantSDNode *Load0Offset =
        dyn_cast<ConstantSDNode>(Load0->getOperand(NumOps - 3));
    const ConstantSDNode *Load1Offset =
        dyn_cast<ConstantSDNode>(Load1->getOperand(NumOps - 3));

    if (!Load0Offset || !Load1Offset)
      return false;

    Offset0 = Load0Offset->getZExtValue();
    Offset1 = Load1Offset->getZExtValue();
    return true;
  }

  // MUBUF and MTBUF can access the same addresses.
  if ((isMUBUF(Opc0) || isMTBUF(Opc0)) && (isMUBUF(Opc1) || isMTBUF(Opc1))) {

    // MUBUF and MTBUF have vaddr at different indices.
    if (!nodesHaveSameOperandValue(Load0, Load1, AMDGPU::OpName::soffset) ||
        !nodesHaveSameOperandValue(Load0, Load1, AMDGPU::OpName::vaddr) ||
        !nodesHaveSameOperandValue(Load0, Load1, AMDGPU::OpName::srsrc))
      return false;

    int OffIdx0 = AMDGPU::getNamedOperandIdx(Opc0, AMDGPU::OpName::offset);
    int OffIdx1 = AMDGPU::getNamedOperandIdx(Opc1, AMDGPU::OpName::offset);

    if (OffIdx0 == -1 || OffIdx1 == -1)
      return false;

    // getNamedOperandIdx returns the index for MachineInstrs. Since they
    // include the output in the operand list, but SDNodes don't, we need to
    // subtract the index by one.
    OffIdx0 -= get(Opc0).NumDefs;
    OffIdx1 -= get(Opc1).NumDefs;

    SDValue Off0 = Load0->getOperand(OffIdx0);
    SDValue Off1 = Load1->getOperand(OffIdx1);

    // The offset might be a FrameIndexSDNode.
    if (!isa<ConstantSDNode>(Off0) || !isa<ConstantSDNode>(Off1))
      return false;

    Offset0 = Off0->getAsZExtVal();
    Offset1 = Off1->getAsZExtVal();
    return true;
  }

  return false;
}

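/// Returns true for the DS read2/write2 "st64" variants, whose offset0 and
/// offset1 operands are scaled by an additional factor of 64 elements.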
static bool isStride64(unsigned Opc) {
  switch (Opc) {
  case AMDGPU::DS_READ2ST64_B32:
  case AMDGPU::DS_READ2ST64_B64:
  case AMDGPU::DS_WRITE2ST64_B32:
  case AMDGPU::DS_WRITE2ST64_B64:
    return true;
  default:
    return false;
  }
}

bool SIInstrInfo::getMemOperandsWithOffsetWidth(
    const MachineInstr &LdSt, SmallVectorImpl<const MachineOperand *> &BaseOps,
    int64_t &Offset, bool &OffsetIsScalable, LocationSize &Width,
    const TargetRegisterInfo *TRI) const {
  if (!LdSt.mayLoadOrStore())
    return false;

  unsigned Opc = LdSt.getOpcode();
  OffsetIsScalable = false;
  const MachineOperand *BaseOp, *OffsetOp;
  int DataOpIdx;

  if (isDS(LdSt)) {
    BaseOp = getNamedOperand(LdSt, AMDGPU::OpName::addr);
    OffsetOp = getNamedOperand(LdSt, AMDGPU::OpName::offset);
    if (OffsetOp) {
      // Normal, single offset LDS instruction.
      if (!BaseOp) {
        // DS_CONSUME/DS_APPEND use M0 for the base address.
        // TODO: find the implicit use operand for M0 and use that as BaseOp?
        return false;
      }
      BaseOps.push_back(BaseOp);
      Offset = OffsetOp->getImm();
      // Get appropriate operand, and compute width accordingly.
      DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdst);
      if (DataOpIdx == -1)
        DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::data0);
      if (Opc == AMDGPU::DS_ATOMIC_ASYNC_BARRIER_ARRIVE_B64)
        Width = LocationSize::precise(64);
      else
        Width = LocationSize::precise(getOpSize(LdSt, DataOpIdx));
    } else {
      // The 2 offset instructions use offset0 and offset1 instead. We can treat
      // these as a load with a single offset if the 2 offsets are consecutive.
      // We will use this for some partially aligned loads.
      const MachineOperand *Offset0Op =
          getNamedOperand(LdSt, AMDGPU::OpName::offset0);
      const MachineOperand *Offset1Op =
          getNamedOperand(LdSt, AMDGPU::OpName::offset1);

      unsigned Offset0 = Offset0Op->getImm() & 0xff;
      unsigned Offset1 = Offset1Op->getImm() & 0xff;
      if (Offset0 + 1 != Offset1)
        return false;

      // Each of these offsets is in element sized units, so we need to convert
      // to bytes of the individual reads.

      unsigned EltSize;
      if (LdSt.mayLoad())
        EltSize = TRI->getRegSizeInBits(*getOpRegClass(LdSt, 0)) / 16;
      else {
        assert(LdSt.mayStore());
        int Data0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::data0);
        EltSize = TRI->getRegSizeInBits(*getOpRegClass(LdSt, Data0Idx)) / 8;
      }

      if (isStride64(Opc))
        EltSize *= 64;

      BaseOps.push_back(BaseOp);
      Offset = EltSize * Offset0;
      // Get appropriate operand(s), and compute width accordingly.
      DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdst);
      if (DataOpIdx == -1) {
        DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::data0);
        Width = LocationSize::precise(getOpSize(LdSt, DataOpIdx));
        DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::data1);
        Width = LocationSize::precise(
            Width.getValue() + TypeSize::getFixed(getOpSize(LdSt, DataOpIdx)));
      } else {
        Width = LocationSize::precise(getOpSize(LdSt, DataOpIdx));
      }
    }
    return true;
  }

  if (isMUBUF(LdSt) || isMTBUF(LdSt)) {
    const MachineOperand *RSrc = getNamedOperand(LdSt, AMDGPU::OpName::srsrc);
    if (!RSrc) // e.g. BUFFER_WBINVL1_VOL
      return false;
    BaseOps.push_back(RSrc);
    BaseOp = getNamedOperand(LdSt, AMDGPU::OpName::vaddr);
    if (BaseOp && !BaseOp->isFI())
      BaseOps.push_back(BaseOp);
    const MachineOperand *OffsetImm =
        getNamedOperand(LdSt, AMDGPU::OpName::offset);
    Offset = OffsetImm->getImm();
    const MachineOperand *SOffset =
        getNamedOperand(LdSt, AMDGPU::OpName::soffset);
    if (SOffset) {
      if (SOffset->isReg())
        BaseOps.push_back(SOffset);
      else
        Offset += SOffset->getImm();
    }
    // Get appropriate operand, and compute width accordingly.
    DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdst);
    if (DataOpIdx == -1)
      DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdata);
    if (DataOpIdx == -1) // LDS DMA
      return false;
    Width = LocationSize::precise(getOpSize(LdSt, DataOpIdx));
    return true;
  }

  if (isImage(LdSt)) {
    auto RsrcOpName =
        isMIMG(LdSt) ? AMDGPU::OpName::srsrc : AMDGPU::OpName::rsrc;
    int SRsrcIdx = AMDGPU::getNamedOperandIdx(Opc, RsrcOpName);
    BaseOps.push_back(&LdSt.getOperand(SRsrcIdx));
    int VAddr0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr0);
    if (VAddr0Idx >= 0) {
      // GFX10 possible NSA encoding.
      for (int I = VAddr0Idx; I < SRsrcIdx; ++I)
        BaseOps.push_back(&LdSt.getOperand(I));
    } else {
      BaseOps.push_back(getNamedOperand(LdSt, AMDGPU::OpName::vaddr));
    }
    Offset = 0;
    // Get appropriate operand, and compute width accordingly.
    DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdata);
    if (DataOpIdx == -1)
      return false; // no return sampler
    Width = LocationSize::precise(getOpSize(LdSt, DataOpIdx));
    return true;
  }

  if (isSMRD(LdSt)) {
    BaseOp = getNamedOperand(LdSt, AMDGPU::OpName::sbase);
    if (!BaseOp) // e.g. S_MEMTIME
      return false;
    BaseOps.push_back(BaseOp);
    OffsetOp = getNamedOperand(LdSt, AMDGPU::OpName::offset);
    Offset = OffsetOp ? OffsetOp->getImm() : 0;
    // Get appropriate operand, and compute width accordingly.
    DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::sdst);
    if (DataOpIdx == -1)
      return false;
    Width = LocationSize::precise(getOpSize(LdSt, DataOpIdx));
    return true;
  }

  if (isFLAT(LdSt)) {
    // Instructions have either vaddr or saddr or both or none.
    BaseOp = getNamedOperand(LdSt, AMDGPU::OpName::vaddr);
    if (BaseOp)
      BaseOps.push_back(BaseOp);
    BaseOp = getNamedOperand(LdSt, AMDGPU::OpName::saddr);
    if (BaseOp)
      BaseOps.push_back(BaseOp);
    Offset = getNamedOperand(LdSt, AMDGPU::OpName::offset)->getImm();
    // Get appropriate operand, and compute width accordingly.
    DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdst);
    if (DataOpIdx == -1)
      DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdata);
    if (DataOpIdx == -1) // LDS DMA
      return false;
    Width = LocationSize::precise(getOpSize(LdSt, DataOpIdx));
    return true;
  }

  return false;
}

static bool memOpsHaveSameBasePtr(const MachineInstr &MI1,
                                  ArrayRef<const MachineOperand *> BaseOps1,
                                  const MachineInstr &MI2,
                                  ArrayRef<const MachineOperand *> BaseOps2) {
  // Only examine the first "base" operand of each instruction, on the
  // assumption that it represents the real base address of the memory access.
  // Other operands are typically offsets or indices from this base address.
  if (BaseOps1.front()->isIdenticalTo(*BaseOps2.front()))
    return true;

  if (!MI1.hasOneMemOperand() || !MI2.hasOneMemOperand())
    return false;

  auto *MO1 = *MI1.memoperands_begin();
  auto *MO2 = *MI2.memoperands_begin();
  if (MO1->getAddrSpace() != MO2->getAddrSpace())
    return false;

  const auto *Base1 = MO1->getValue();
  const auto *Base2 = MO2->getValue();
  if (!Base1 || !Base2)
    return false;
  Base1 = getUnderlyingObject(Base1);
  Base2 = getUnderlyingObject(Base2);

  if (isa<UndefValue>(Base1) || isa<UndefValue>(Base2))
    return false;

  return Base1 == Base2;
}

bool SIInstrInfo::shouldClusterMemOps(ArrayRef<const MachineOperand *> BaseOps1,
                                      int64_t Offset1, bool OffsetIsScalable1,
                                      ArrayRef<const MachineOperand *> BaseOps2,
                                      int64_t Offset2, bool OffsetIsScalable2,
                                      unsigned ClusterSize,
                                      unsigned NumBytes) const {
  // If the mem ops (to be clustered) do not have the same base ptr, then they
  // should not be clustered.
  unsigned MaxMemoryClusterDWords = DefaultMemoryClusterDWordsLimit;
  if (!BaseOps1.empty() && !BaseOps2.empty()) {
    const MachineInstr &FirstLdSt = *BaseOps1.front()->getParent();
    const MachineInstr &SecondLdSt = *BaseOps2.front()->getParent();
    if (!memOpsHaveSameBasePtr(FirstLdSt, BaseOps1, SecondLdSt, BaseOps2))
      return false;

    const SIMachineFunctionInfo *MFI =
        FirstLdSt.getMF()->getInfo<SIMachineFunctionInfo>();
    MaxMemoryClusterDWords = MFI->getMaxMemoryClusterDWords();
  } else if (!BaseOps1.empty() || !BaseOps2.empty()) {
    // If only one base op is empty, they do not have the same base ptr.
    return false;
  }

  // To avoid excessive register pressure, the number of DWORDs loaded together
  // by all clustered mem ops should, on average, not exceed
  // MaxMemoryClusterDWords. This is an empirical value based on certain
  // observations and performance-related experiments.
  // The benefit of this heuristic is that it avoids clustering too many
  // sub-word loads while also avoiding clustering of wide loads. Below is a
  // brief summary of how the heuristic behaves for various `LoadSize` values
  // when MaxMemoryClusterDWords is 8.
  //
  // (1) 1 <= LoadSize <= 4: cluster at max 8 mem ops
  // (2) 5 <= LoadSize <= 8: cluster at max 4 mem ops
  // (3) 9 <= LoadSize <= 12: cluster at max 2 mem ops
  // (4) 13 <= LoadSize <= 16: cluster at max 2 mem ops
  // (5) LoadSize >= 17: do not cluster
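  //
  // As a concrete check of the arithmetic below, with MaxMemoryClusterDWords
  // of 8: ClusterSize = 4 and NumBytes = 32 give LoadSize = 8 and
  // NumDWords = 8, so clustering is allowed; ClusterSize = 2 and
  // NumBytes = 40 give LoadSize = 20 and NumDWords = 10, so it is rejected.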
  const unsigned LoadSize = NumBytes / ClusterSize;
  const unsigned NumDWords = ((LoadSize + 3) / 4) * ClusterSize;
  return NumDWords <= MaxMemoryClusterDWords;
}

// FIXME: This behaves strangely. If, for example, you have 32 load + stores,
// the first 16 loads will be interleaved with the stores, and the next 16 will
// be clustered as expected. It should really split into 2 batches of 16
// stores.
//
// Loads are clustered until this returns false, rather than trying to schedule
// groups of stores. This also means we have to decide whether loads from
// different address spaces should be clustered, and whether loads which might
// cause bank conflicts should be.
//
// This might be deprecated so it might not be worth that much effort to fix.
bool SIInstrInfo::shouldScheduleLoadsNear(SDNode *Load0, SDNode *Load1,
                                          int64_t Offset0, int64_t Offset1,
                                          unsigned NumLoads) const {
  assert(Offset1 > Offset0 &&
         "Second offset should be larger than first offset!");
  // If we have fewer than 16 loads in a row, and the offsets are within 64
  // bytes, then schedule together.

  // A cacheline is 64 bytes (for global memory).
  return (NumLoads <= 16 && (Offset1 - Offset0) < 64);
}

static void reportIllegalCopy(const SIInstrInfo *TII, MachineBasicBlock &MBB,
                              MachineBasicBlock::iterator MI,
                              const DebugLoc &DL, MCRegister DestReg,
                              MCRegister SrcReg, bool KillSrc,
                              const char *Msg = "illegal VGPR to SGPR copy") {
  MachineFunction *MF = MBB.getParent();

  LLVMContext &C = MF->getFunction().getContext();
  C.diagnose(DiagnosticInfoUnsupported(MF->getFunction(), Msg, DL, DS_Error));

  BuildMI(MBB, MI, DL, TII->get(AMDGPU::SI_ILLEGAL_COPY), DestReg)
      .addReg(SrcReg, getKillRegState(KillSrc));
}

/// Handle copying from SGPR to AGPR, or from AGPR to AGPR on GFX908. It is not
/// possible to have a direct copy in these cases on GFX908, so an intermediate
/// VGPR copy is required.
static void indirectCopyToAGPR(const SIInstrInfo &TII,
                               MachineBasicBlock &MBB,
                               MachineBasicBlock::iterator MI,
                               const DebugLoc &DL, MCRegister DestReg,
                               MCRegister SrcReg, bool KillSrc,
                               RegScavenger &RS, bool RegsOverlap,
                               Register ImpDefSuperReg = Register(),
                               Register ImpUseSuperReg = Register()) {
  assert((TII.getSubtarget().hasMAIInsts() &&
          !TII.getSubtarget().hasGFX90AInsts()) &&
         "Expected GFX908 subtarget.");

  assert((AMDGPU::SReg_32RegClass.contains(SrcReg) ||
          AMDGPU::AGPR_32RegClass.contains(SrcReg)) &&
         "Source register of the copy should be either an SGPR or an AGPR.");

  assert(AMDGPU::AGPR_32RegClass.contains(DestReg) &&
         "Destination register of the copy should be an AGPR.");

  const SIRegisterInfo &RI = TII.getRegisterInfo();

  // First try to find a defining accvgpr_write to avoid temporary registers.
  // In the case of copies of overlapping AGPRs, we conservatively do not
  // reuse previous accvgpr_writes. Otherwise, we may incorrectly pick up
  // an accvgpr_write used for this same copy due to implicit-defs.
  if (!RegsOverlap) {
    for (auto Def = MI, E = MBB.begin(); Def != E; ) {
      --Def;

      if (!Def->modifiesRegister(SrcReg, &RI))
        continue;

      if (Def->getOpcode() != AMDGPU::V_ACCVGPR_WRITE_B32_e64 ||
          Def->getOperand(0).getReg() != SrcReg)
        break;

      MachineOperand &DefOp = Def->getOperand(1);
      assert(DefOp.isReg() || DefOp.isImm());

      if (DefOp.isReg()) {
        bool SafeToPropagate = true;
        // Check that register source operand is not clobbered before MI.
        // Immediate operands are always safe to propagate.
        for (auto I = Def; I != MI && SafeToPropagate; ++I)
          if (I->modifiesRegister(DefOp.getReg(), &RI))
            SafeToPropagate = false;

        if (!SafeToPropagate)
          break;

        for (auto I = Def; I != MI; ++I)
          I->clearRegisterKills(DefOp.getReg(), &RI);
      }

      MachineInstrBuilder Builder =
          BuildMI(MBB, MI, DL, TII.get(AMDGPU::V_ACCVGPR_WRITE_B32_e64), DestReg)
              .add(DefOp);
      if (ImpDefSuperReg)
        Builder.addReg(ImpDefSuperReg, RegState::Define | RegState::Implicit);

      if (ImpUseSuperReg) {
        Builder.addReg(ImpUseSuperReg,
                       getKillRegState(KillSrc) | RegState::Implicit);
      }

      return;
    }
  }

  RS.enterBasicBlockEnd(MBB);
  RS.backward(std::next(MI));

  // Ideally we want to have three registers for a long reg_sequence copy
  // to hide 2 waitstates between v_mov_b32 and accvgpr_write.
  unsigned MaxVGPRs = RI.getRegPressureLimit(&AMDGPU::VGPR_32RegClass,
                                             *MBB.getParent());

  // Registers in the sequence are allocated contiguously so we can just
  // use register number to pick one of three round-robin temps.
  unsigned RegNo = (DestReg - AMDGPU::AGPR0) % 3;
  Register Tmp =
      MBB.getParent()->getInfo<SIMachineFunctionInfo>()->getVGPRForAGPRCopy();
  assert(MBB.getParent()->getRegInfo().isReserved(Tmp) &&
         "VGPR used for an intermediate copy should have been reserved.");

  // Only loop through if there are any free registers left. We don't want to
  // spill.
  while (RegNo--) {
    Register Tmp2 = RS.scavengeRegisterBackwards(AMDGPU::VGPR_32RegClass, MI,
                                                 /* RestoreAfter */ false, 0,
                                                 /* AllowSpill */ false);
    if (!Tmp2 || RI.getHWRegIndex(Tmp2) >= MaxVGPRs)
      break;
    Tmp = Tmp2;
    RS.setRegUsed(Tmp);
  }

  // Insert copy to temporary VGPR.
  unsigned TmpCopyOp = AMDGPU::V_MOV_B32_e32;
  if (AMDGPU::AGPR_32RegClass.contains(SrcReg)) {
    TmpCopyOp = AMDGPU::V_ACCVGPR_READ_B32_e64;
  } else {
    assert(AMDGPU::SReg_32RegClass.contains(SrcReg));
  }

  MachineInstrBuilder UseBuilder =
      BuildMI(MBB, MI, DL, TII.get(TmpCopyOp), Tmp)
          .addReg(SrcReg, getKillRegState(KillSrc));
  if (ImpUseSuperReg) {
    UseBuilder.addReg(ImpUseSuperReg,
                      getKillRegState(KillSrc) | RegState::Implicit);
  }

  MachineInstrBuilder DefBuilder
    = BuildMI(MBB, MI, DL, TII.get(AMDGPU::V_ACCVGPR_WRITE_B32_e64), DestReg)
          .addReg(Tmp, RegState::Kill);

  if (ImpDefSuperReg)
    DefBuilder.addReg(ImpDefSuperReg, RegState::Define | RegState::Implicit);
}

static void expandSGPRCopy(const SIInstrInfo &TII, MachineBasicBlock &MBB,
                           MachineBasicBlock::iterator MI, const DebugLoc &DL,
                           MCRegister DestReg, MCRegister SrcReg, bool KillSrc,
                           const TargetRegisterClass *RC, bool Forward) {
  const SIRegisterInfo &RI = TII.getRegisterInfo();
  ArrayRef<int16_t> BaseIndices = RI.getRegSplitParts(RC, 4);
  MachineBasicBlock::iterator I = MI;
  MachineInstr *FirstMI = nullptr, *LastMI = nullptr;

  for (unsigned Idx = 0; Idx < BaseIndices.size(); ++Idx) {
    int16_t SubIdx = BaseIndices[Idx];
    Register DestSubReg = RI.getSubReg(DestReg, SubIdx);
    Register SrcSubReg = RI.getSubReg(SrcReg, SubIdx);
    assert(DestSubReg && SrcSubReg && "Failed to find subregs!");
    unsigned Opcode = AMDGPU::S_MOV_B32;

    // Is SGPR aligned? If so try to combine with next.
    bool AlignedDest = ((DestSubReg - AMDGPU::SGPR0) % 2) == 0;
    bool AlignedSrc = ((SrcSubReg - AMDGPU::SGPR0) % 2) == 0;
    if (AlignedDest && AlignedSrc && (Idx + 1 < BaseIndices.size())) {
      // Can use SGPR64 copy
      unsigned Channel = RI.getChannelFromSubReg(SubIdx);
      SubIdx = RI.getSubRegFromChannel(Channel, 2);
      DestSubReg = RI.getSubReg(DestReg, SubIdx);
      SrcSubReg = RI.getSubReg(SrcReg, SubIdx);
      assert(DestSubReg && SrcSubReg && "Failed to find subregs!");
      Opcode = AMDGPU::S_MOV_B64;
      Idx++;
    }

    LastMI = BuildMI(MBB, I, DL, TII.get(Opcode), DestSubReg)
                 .addReg(SrcSubReg)
                 .addReg(SrcReg, RegState::Implicit);

    if (!FirstMI)
      FirstMI = LastMI;

    if (!Forward)
      I--;
  }

  assert(FirstMI && LastMI);
  if (!Forward)
    std::swap(FirstMI, LastMI);

  FirstMI->addOperand(
      MachineOperand::CreateReg(DestReg, true /*IsDef*/, true /*IsImp*/));

  if (KillSrc)
    LastMI->addRegisterKilled(SrcReg, &RI);
}

void SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
                              MachineBasicBlock::iterator MI,
                              const DebugLoc &DL, Register DestReg,
                              Register SrcReg, bool KillSrc, bool RenamableDest,
                              bool RenamableSrc) const {
  const TargetRegisterClass *RC = RI.getPhysRegBaseClass(DestReg);
  unsigned Size = RI.getRegSizeInBits(*RC);
  const TargetRegisterClass *SrcRC = RI.getPhysRegBaseClass(SrcReg);
  unsigned SrcSize = RI.getRegSizeInBits(*SrcRC);

  // The rest of copyPhysReg assumes Src and Dst are the same size.
  // TODO-GFX11_16BIT If all true 16 bit instruction patterns are completed can
  // we remove Fix16BitCopies and this code block?
  if (Fix16BitCopies) {
    if (((Size == 16) != (SrcSize == 16))) {
      // Non-VGPR Src and Dst will later be expanded back to 32 bits.
      assert(ST.useRealTrue16Insts());
      Register &RegToFix = (Size == 32) ? DestReg : SrcReg;
      MCRegister SubReg = RI.getSubReg(RegToFix, AMDGPU::lo16);
      RegToFix = SubReg;

      if (DestReg == SrcReg) {
        // Identity copy. Insert empty bundle since ExpandPostRA expects an
        // instruction here.
        BuildMI(MBB, MI, DL, get(AMDGPU::BUNDLE));
        return;
      }
      RC = RI.getPhysRegBaseClass(DestReg);
      Size = RI.getRegSizeInBits(*RC);
      SrcRC = RI.getPhysRegBaseClass(SrcReg);
      SrcSize = RI.getRegSizeInBits(*SrcRC);
    }
  }

  if (RC == &AMDGPU::VGPR_32RegClass) {
    assert(AMDGPU::VGPR_32RegClass.contains(SrcReg) ||
           AMDGPU::SReg_32RegClass.contains(SrcReg) ||
           AMDGPU::AGPR_32RegClass.contains(SrcReg));
    unsigned Opc = AMDGPU::AGPR_32RegClass.contains(SrcReg) ?
                     AMDGPU::V_ACCVGPR_READ_B32_e64 : AMDGPU::V_MOV_B32_e32;
    BuildMI(MBB, MI, DL, get(Opc), DestReg)
        .addReg(SrcReg, getKillRegState(KillSrc));
    return;
  }

  if (RC == &AMDGPU::SReg_32_XM0RegClass ||
      RC == &AMDGPU::SReg_32RegClass) {
    if (SrcReg == AMDGPU::SCC) {
      BuildMI(MBB, MI, DL, get(AMDGPU::S_CSELECT_B32), DestReg)
          .addImm(1)
          .addImm(0);
      return;
    }

    if (!AMDGPU::SReg_32RegClass.contains(SrcReg)) {
      if (DestReg == AMDGPU::VCC_LO) {
        // FIXME: Hack until VReg_1 removed.
        assert(AMDGPU::VGPR_32RegClass.contains(SrcReg));
        BuildMI(MBB, MI, DL, get(AMDGPU::V_CMP_NE_U32_e32))
            .addImm(0)
            .addReg(SrcReg, getKillRegState(KillSrc));
        return;
      }

      reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc);
      return;
    }

    BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), DestReg)
        .addReg(SrcReg, getKillRegState(KillSrc));
    return;
  }

  if (RC == &AMDGPU::SReg_64RegClass) {
    if (SrcReg == AMDGPU::SCC) {
      BuildMI(MBB, MI, DL, get(AMDGPU::S_CSELECT_B64), DestReg)
          .addImm(1)
          .addImm(0);
      return;
    }

    if (!AMDGPU::SReg_64_EncodableRegClass.contains(SrcReg)) {
      if (DestReg == AMDGPU::VCC) {
        // FIXME: Hack until VReg_1 removed.
        assert(AMDGPU::VGPR_32RegClass.contains(SrcReg));
        BuildMI(MBB, MI, DL, get(AMDGPU::V_CMP_NE_U32_e32))
            .addImm(0)
            .addReg(SrcReg, getKillRegState(KillSrc));
        return;
      }

      reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc);
      return;
    }

    BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B64), DestReg)
        .addReg(SrcReg, getKillRegState(KillSrc));
    return;
  }

  if (DestReg == AMDGPU::SCC) {
    // Copying 64-bit or 32-bit sources to SCC barely makes sense,
    // but SelectionDAG emits such copies for i1 sources.
    if (AMDGPU::SReg_64RegClass.contains(SrcReg)) {
      // This copy can only be produced by patterns
      // with explicit SCC, which are known to be enabled
      // only for subtargets with S_CMP_LG_U64 present.
      assert(ST.hasScalarCompareEq64());
      BuildMI(MBB, MI, DL, get(AMDGPU::S_CMP_LG_U64))
          .addReg(SrcReg, getKillRegState(KillSrc))
          .addImm(0);
    } else {
      assert(AMDGPU::SReg_32RegClass.contains(SrcReg));
      BuildMI(MBB, MI, DL, get(AMDGPU::S_CMP_LG_U32))
          .addReg(SrcReg, getKillRegState(KillSrc))
          .addImm(0);
    }

    return;
  }

  if (RC == &AMDGPU::AGPR_32RegClass) {
    if (AMDGPU::VGPR_32RegClass.contains(SrcReg) ||
        (ST.hasGFX90AInsts() && AMDGPU::SReg_32RegClass.contains(SrcReg))) {
      BuildMI(MBB, MI, DL, get(AMDGPU::V_ACCVGPR_WRITE_B32_e64), DestReg)
          .addReg(SrcReg, getKillRegState(KillSrc));
      return;
    }

    if (AMDGPU::AGPR_32RegClass.contains(SrcReg) && ST.hasGFX90AInsts()) {
      BuildMI(MBB, MI, DL, get(AMDGPU::V_ACCVGPR_MOV_B32), DestReg)
          .addReg(SrcReg, getKillRegState(KillSrc));
      return;
    }

    // FIXME: Pass should maintain scavenger to avoid scan through the block on
    // every AGPR spill.
    RegScavenger RS;
    const bool Overlap = RI.regsOverlap(SrcReg, DestReg);
    indirectCopyToAGPR(*this, MBB, MI, DL, DestReg, SrcReg, KillSrc, RS, Overlap);
    return;
  }

  if (Size == 16) {
    assert(AMDGPU::VGPR_16RegClass.contains(SrcReg) ||
           AMDGPU::SReg_LO16RegClass.contains(SrcReg) ||
           AMDGPU::AGPR_LO16RegClass.contains(SrcReg));

    bool IsSGPRDst = AMDGPU::SReg_LO16RegClass.contains(DestReg);
    bool IsSGPRSrc = AMDGPU::SReg_LO16RegClass.contains(SrcReg);
    bool IsAGPRDst = AMDGPU::AGPR_LO16RegClass.contains(DestReg);
    bool IsAGPRSrc = AMDGPU::AGPR_LO16RegClass.contains(SrcReg);
    bool DstLow = !AMDGPU::isHi16Reg(DestReg, RI);
    bool SrcLow = !AMDGPU::isHi16Reg(SrcReg, RI);
    MCRegister NewDestReg = RI.get32BitRegister(DestReg);
    MCRegister NewSrcReg = RI.get32BitRegister(SrcReg);

    if (IsSGPRDst) {
      if (!IsSGPRSrc) {
        reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc);
        return;
      }

      BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), NewDestReg)
          .addReg(NewSrcReg, getKillRegState(KillSrc));
      return;
    }

    if (IsAGPRDst || IsAGPRSrc) {
      if (!DstLow || !SrcLow) {
        reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc,
                          "Cannot use hi16 subreg with an AGPR!");
      }

      copyPhysReg(MBB, MI, DL, NewDestReg, NewSrcReg, KillSrc);
      return;
    }

    if (ST.useRealTrue16Insts()) {
      if (IsSGPRSrc) {
        assert(SrcLow);
        SrcReg = NewSrcReg;
      }
      // Use the smaller instruction encoding if possible.
      if (AMDGPU::VGPR_16_Lo128RegClass.contains(DestReg) &&
          (IsSGPRSrc || AMDGPU::VGPR_16_Lo128RegClass.contains(SrcReg))) {
        BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B16_t16_e32), DestReg)
            .addReg(SrcReg);
      } else {
        BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B16_t16_e64), DestReg)
            .addImm(0) // src0_modifiers
            .addReg(SrcReg)
            .addImm(0); // op_sel
      }
      return;
    }

    if (IsSGPRSrc && !ST.hasSDWAScalar()) {
      if (!DstLow || !SrcLow) {
        reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc,
                          "Cannot use hi16 subreg on VI!");
      }

      BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), NewDestReg)
          .addReg(NewSrcReg, getKillRegState(KillSrc));
      return;
    }

    auto MIB = BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_sdwa), NewDestReg)
                   .addImm(0) // src0_modifiers
                   .addReg(NewSrcReg)
                   .addImm(0) // clamp
                   .addImm(DstLow ? AMDGPU::SDWA::SdwaSel::WORD_0
                                  : AMDGPU::SDWA::SdwaSel::WORD_1)
                   .addImm(AMDGPU::SDWA::DstUnused::UNUSED_PRESERVE)
                   .addImm(SrcLow ? AMDGPU::SDWA::SdwaSel::WORD_0
                                  : AMDGPU::SDWA::SdwaSel::WORD_1)
                   .addReg(NewDestReg, RegState::Implicit | RegState::Undef);
    // First implicit operand is $exec.
    MIB->tieOperands(0, MIB->getNumOperands() - 1);
    return;
  }

  if (RC == RI.getVGPR64Class() && (SrcRC == RC || RI.isSGPRClass(SrcRC))) {
    if (ST.hasMovB64()) {
      BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B64_e32), DestReg)
          .addReg(SrcReg, getKillRegState(KillSrc));
      return;
    }
    if (ST.hasPkMovB32()) {
      BuildMI(MBB, MI, DL, get(AMDGPU::V_PK_MOV_B32), DestReg)
          .addImm(SISrcMods::OP_SEL_1)
          .addReg(SrcReg)
          .addImm(SISrcMods::OP_SEL_0 | SISrcMods::OP_SEL_1)
          .addReg(SrcReg)
          .addImm(0) // op_sel_lo
          .addImm(0) // op_sel_hi
          .addImm(0) // neg_lo
          .addImm(0) // neg_hi
          .addImm(0) // clamp
          .addReg(SrcReg, getKillRegState(KillSrc) | RegState::Implicit);
      return;
    }
  }

  const bool Forward = RI.getHWRegIndex(DestReg) <= RI.getHWRegIndex(SrcReg);
  if (RI.isSGPRClass(RC)) {
    if (!RI.isSGPRClass(SrcRC)) {
      reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc);
      return;
    }
    const bool CanKillSuperReg = KillSrc && !RI.regsOverlap(SrcReg, DestReg);
    expandSGPRCopy(*this, MBB, MI, DL, DestReg, SrcReg, CanKillSuperReg, RC,
                   Forward);
    return;
  }

  unsigned EltSize = 4;
  unsigned Opcode = AMDGPU::V_MOV_B32_e32;
  if (RI.isAGPRClass(RC)) {
    if (ST.hasGFX90AInsts() && RI.isAGPRClass(SrcRC))
      Opcode = AMDGPU::V_ACCVGPR_MOV_B32;
    else if (RI.hasVGPRs(SrcRC) ||
             (ST.hasGFX90AInsts() && RI.isSGPRClass(SrcRC)))
      Opcode = AMDGPU::V_ACCVGPR_WRITE_B32_e64;
    else
      Opcode = AMDGPU::INSTRUCTION_LIST_END;
  } else if (RI.hasVGPRs(RC) && RI.isAGPRClass(SrcRC)) {
    Opcode = AMDGPU::V_ACCVGPR_READ_B32_e64;
  } else if ((Size % 64 == 0) && RI.hasVGPRs(RC) &&
             (RI.isProperlyAlignedRC(*RC) &&
              (SrcRC == RC || RI.isSGPRClass(SrcRC)))) {
    // TODO: In 96-bit case, could do a 64-bit mov and then a 32-bit mov.
    if (ST.hasMovB64()) {
      Opcode = AMDGPU::V_MOV_B64_e32;
      EltSize = 8;
    } else if (ST.hasPkMovB32()) {
      Opcode = AMDGPU::V_PK_MOV_B32;
      EltSize = 8;
    }
  }

  // For the cases where we need an intermediate instruction/temporary register
  // (destination is an AGPR), we need a scavenger.
  //
  // FIXME: The pass should maintain this for us so we don't have to re-scan the
  // whole block for every handled copy.
  std::unique_ptr<RegScavenger> RS;
  if (Opcode == AMDGPU::INSTRUCTION_LIST_END)
    RS = std::make_unique<RegScavenger>();

  ArrayRef<int16_t> SubIndices = RI.getRegSplitParts(RC, EltSize);

  // If there is an overlap, we can't kill the super-register on the last
  // instruction, since it will also kill the components made live by this def.
  const bool Overlap = RI.regsOverlap(SrcReg, DestReg);
  const bool CanKillSuperReg = KillSrc && !Overlap;

  for (unsigned Idx = 0; Idx < SubIndices.size(); ++Idx) {
    unsigned SubIdx;
    if (Forward)
      SubIdx = SubIndices[Idx];
    else
      SubIdx = SubIndices[SubIndices.size() - Idx - 1];
    Register DestSubReg = RI.getSubReg(DestReg, SubIdx);
    Register SrcSubReg = RI.getSubReg(SrcReg, SubIdx);
    assert(DestSubReg && SrcSubReg && "Failed to find subregs!");

    bool IsFirstSubreg = Idx == 0;
    bool UseKill = CanKillSuperReg && Idx == SubIndices.size() - 1;

    if (Opcode == AMDGPU::INSTRUCTION_LIST_END) {
      Register ImpDefSuper = IsFirstSubreg ? Register(DestReg) : Register();
      Register ImpUseSuper = SrcReg;
      indirectCopyToAGPR(*this, MBB, MI, DL, DestSubReg, SrcSubReg, UseKill,
                         *RS, Overlap, ImpDefSuper, ImpUseSuper);
    } else if (Opcode == AMDGPU::V_PK_MOV_B32) {
      MachineInstrBuilder MIB =
          BuildMI(MBB, MI, DL, get(AMDGPU::V_PK_MOV_B32), DestSubReg)
              .addImm(SISrcMods::OP_SEL_1)
              .addReg(SrcSubReg)
              .addImm(SISrcMods::OP_SEL_0 | SISrcMods::OP_SEL_1)
              .addReg(SrcSubReg)
              .addImm(0) // op_sel_lo
              .addImm(0) // op_sel_hi
              .addImm(0) // neg_lo
              .addImm(0) // neg_hi
              .addImm(0) // clamp
              .addReg(SrcReg, getKillRegState(UseKill) | RegState::Implicit);
      if (IsFirstSubreg)
        MIB.addReg(DestReg, RegState::Define | RegState::Implicit);
    } else {
      MachineInstrBuilder Builder =
          BuildMI(MBB, MI, DL, get(Opcode), DestSubReg).addReg(SrcSubReg);
      if (IsFirstSubreg)
        Builder.addReg(DestReg, RegState::Define | RegState::Implicit);

      Builder.addReg(SrcReg, getKillRegState(UseKill) | RegState::Implicit);
    }
  }
}

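/// Returns the commuted form of \p Opcode if one exists and is encodable on
/// this subtarget, -1 if a commuted form exists but cannot be encoded, or
/// \p Opcode itself if no original<->commuted (REV) mapping exists.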
int SIInstrInfo::commuteOpcode(unsigned Opcode) const {
  int32_t NewOpc;

  // Try to map original to commuted opcode
  NewOpc = AMDGPU::getCommuteRev(Opcode);
  if (NewOpc != -1)
    // Check if the commuted (REV) opcode exists on the target.
    return pseudoToMCOpcode(NewOpc) != -1 ? NewOpc : -1;

  // Try to map commuted to original opcode
  NewOpc = AMDGPU::getCommuteOrig(Opcode);
  if (NewOpc != -1)
    // Check if the original (non-REV) opcode exists on the target.
    return pseudoToMCOpcode(NewOpc) != -1 ? NewOpc : -1;

  return Opcode;
}

const TargetRegisterClass *
SIInstrInfo::getPreferredSelectRegClass(unsigned Size) const {
  return &AMDGPU::VGPR_32RegClass;
}

void SIInstrInfo::insertVectorSelect(MachineBasicBlock &MBB,
                                     MachineBasicBlock::iterator I,
                                     const DebugLoc &DL, Register DstReg,
                                     ArrayRef<MachineOperand> Cond,
                                     Register TrueReg,
                                     Register FalseReg) const {
  MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
  const TargetRegisterClass *BoolXExecRC = RI.getWaveMaskRegClass();
  const AMDGPU::LaneMaskConstants &LMC = AMDGPU::LaneMaskConstants::get(ST);
  assert(MRI.getRegClass(DstReg) == &AMDGPU::VGPR_32RegClass &&
         "Not a VGPR32 reg");

  if (Cond.size() == 1) {
    Register SReg = MRI.createVirtualRegister(BoolXExecRC);
    BuildMI(MBB, I, DL, get(AMDGPU::COPY), SReg)
        .add(Cond[0]);
    BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
        .addImm(0)
        .addReg(FalseReg)
        .addImm(0)
        .addReg(TrueReg)
        .addReg(SReg);
  } else if (Cond.size() == 2) {
    assert(Cond[0].isImm() && "Cond[0] is not an immediate");
    switch (Cond[0].getImm()) {
    case SIInstrInfo::SCC_TRUE: {
      Register SReg = MRI.createVirtualRegister(BoolXExecRC);
      BuildMI(MBB, I, DL, get(LMC.CSelectOpc), SReg).addImm(1).addImm(0);
      BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
          .addImm(0)
          .addReg(FalseReg)
          .addImm(0)
          .addReg(TrueReg)
          .addReg(SReg);
      break;
    }
    case SIInstrInfo::SCC_FALSE: {
      Register SReg = MRI.createVirtualRegister(BoolXExecRC);
      BuildMI(MBB, I, DL, get(LMC.CSelectOpc), SReg).addImm(0).addImm(1);
      BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
          .addImm(0)
          .addReg(FalseReg)
          .addImm(0)
          .addReg(TrueReg)
          .addReg(SReg);
      break;
    }
    case SIInstrInfo::VCCNZ: {
      MachineOperand RegOp = Cond[1];
      RegOp.setImplicit(false);
      Register SReg = MRI.createVirtualRegister(BoolXExecRC);
      BuildMI(MBB, I, DL, get(AMDGPU::COPY), SReg)
          .add(RegOp);
      BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
          .addImm(0)
          .addReg(FalseReg)
          .addImm(0)
          .addReg(TrueReg)
          .addReg(SReg);
      break;
    }
    case SIInstrInfo::VCCZ: {
      MachineOperand RegOp = Cond[1];
      RegOp.setImplicit(false);
      Register SReg = MRI.createVirtualRegister(BoolXExecRC);
      BuildMI(MBB, I, DL, get(AMDGPU::COPY), SReg)
          .add(RegOp);
      BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
          .addImm(0)
          .addReg(TrueReg)
          .addImm(0)
          .addReg(FalseReg)
          .addReg(SReg);
      break;
    }
    case SIInstrInfo::EXECNZ: {
      Register SReg = MRI.createVirtualRegister(BoolXExecRC);
      Register SReg2 = MRI.createVirtualRegister(RI.getBoolRC());
      BuildMI(MBB, I, DL, get(LMC.OrSaveExecOpc), SReg2).addImm(0);
      BuildMI(MBB, I, DL, get(LMC.CSelectOpc), SReg).addImm(1).addImm(0);
      BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
          .addImm(0)
          .addReg(FalseReg)
          .addImm(0)
          .addReg(TrueReg)
          .addReg(SReg);
      break;
    }
    case SIInstrInfo::EXECZ: {
      Register SReg = MRI.createVirtualRegister(BoolXExecRC);
      Register SReg2 = MRI.createVirtualRegister(RI.getBoolRC());
      BuildMI(MBB, I, DL, get(LMC.OrSaveExecOpc), SReg2).addImm(0);
      BuildMI(MBB, I, DL, get(LMC.CSelectOpc), SReg).addImm(0).addImm(1);
      BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
          .addImm(0)
          .addReg(FalseReg)
          .addImm(0)
          .addReg(TrueReg)
          .addReg(SReg);
      llvm_unreachable("Unhandled branch predicate EXECZ");
      break;
    }
    default:
      llvm_unreachable("invalid branch predicate");
    }
  } else {
    llvm_unreachable("Can only handle Cond size 1 or 2");
  }
}

Register SIInstrInfo::insertEQ(MachineBasicBlock *MBB,
                               MachineBasicBlock::iterator I,
                               const DebugLoc &DL,
                               Register SrcReg, int Value) const {
  MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
  Register Reg = MRI.createVirtualRegister(RI.getBoolRC());
  BuildMI(*MBB, I, DL, get(AMDGPU::V_CMP_EQ_I32_e64), Reg)
      .addImm(Value)
      .addReg(SrcReg);

  return Reg;
}

Register SIInstrInfo::insertNE(MachineBasicBlock *MBB,
                               MachineBasicBlock::iterator I,
                               const DebugLoc &DL,
                               Register SrcReg, int Value) const {
  MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
  Register Reg = MRI.createVirtualRegister(RI.getBoolRC());
  BuildMI(*MBB, I, DL, get(AMDGPU::V_CMP_NE_I32_e64), Reg)
      .addImm(Value)
      .addReg(SrcReg);

  return Reg;
}

bool SIInstrInfo::getConstValDefinedInReg(const MachineInstr &MI,
                                          const Register Reg,
                                          int64_t &ImmVal) const {
  switch (MI.getOpcode()) {
  case AMDGPU::V_MOV_B32_e32:
  case AMDGPU::S_MOV_B32:
  case AMDGPU::S_MOVK_I32:
  case AMDGPU::S_MOV_B64:
  case AMDGPU::V_MOV_B64_e32:
  case AMDGPU::V_ACCVGPR_WRITE_B32_e64:
  case AMDGPU::AV_MOV_B32_IMM_PSEUDO:
  case AMDGPU::AV_MOV_B64_IMM_PSEUDO:
  case AMDGPU::S_MOV_B64_IMM_PSEUDO:
  case AMDGPU::V_MOV_B64_PSEUDO:
  case AMDGPU::V_MOV_B16_t16_e32: {
    const MachineOperand &Src0 = MI.getOperand(1);
    if (Src0.isImm()) {
      ImmVal = Src0.getImm();
      return MI.getOperand(0).getReg() == Reg;
    }

    return false;
  }
  case AMDGPU::V_MOV_B16_t16_e64: {
    const MachineOperand &Src0 = MI.getOperand(2);
    if (Src0.isImm() && !MI.getOperand(1).getImm()) {
      ImmVal = Src0.getImm();
      return MI.getOperand(0).getReg() == Reg;
    }

    return false;
  }
  case AMDGPU::S_BREV_B32:
  case AMDGPU::V_BFREV_B32_e32:
  case AMDGPU::V_BFREV_B32_e64: {
    const MachineOperand &Src0 = MI.getOperand(1);
    if (Src0.isImm()) {
      ImmVal = static_cast<int64_t>(reverseBits<int32_t>(Src0.getImm()));
      return MI.getOperand(0).getReg() == Reg;
    }

    return false;
  }
  case AMDGPU::S_NOT_B32:
  case AMDGPU::V_NOT_B32_e32:
  case AMDGPU::V_NOT_B32_e64: {
    const MachineOperand &Src0 = MI.getOperand(1);
    if (Src0.isImm()) {
      ImmVal = static_cast<int64_t>(~static_cast<int32_t>(Src0.getImm()));
      return MI.getOperand(0).getReg() == Reg;
    }

    return false;
  }
  default:
    return false;
  }
}

std::optional<int64_t>
SIInstrInfo::getImmOrMaterializedImm(MachineOperand &Op) const {
  if (Op.isImm())
    return Op.getImm();

  if (!Op.isReg() || !Op.getReg().isVirtual())
    return std::nullopt;
  MachineRegisterInfo &MRI = Op.getParent()->getMF()->getRegInfo();
  const MachineInstr *Def = MRI.getVRegDef(Op.getReg());
  if (Def && Def->isMoveImmediate()) {
    const MachineOperand &ImmSrc = Def->getOperand(1);
    if (ImmSrc.isImm())
      return extractSubregFromImm(ImmSrc.getImm(), Op.getSubReg());
  }

  return std::nullopt;
}

unsigned SIInstrInfo::getMovOpcode(const TargetRegisterClass *DstRC) const {
  if (RI.isAGPRClass(DstRC))
    return AMDGPU::COPY;
  if (RI.getRegSizeInBits(*DstRC) == 16) {
    // Assume hi bits are unneeded. Only _e64 true16 instructions are legal
    // before RA.
    return RI.isSGPRClass(DstRC) ? AMDGPU::COPY : AMDGPU::V_MOV_B16_t16_e64;
  }
  if (RI.getRegSizeInBits(*DstRC) == 32)
    return RI.isSGPRClass(DstRC) ? AMDGPU::S_MOV_B32 : AMDGPU::V_MOV_B32_e32;
  if (RI.getRegSizeInBits(*DstRC) == 64 && RI.isSGPRClass(DstRC))
    return AMDGPU::S_MOV_B64;
  if (RI.getRegSizeInBits(*DstRC) == 64 && !RI.isSGPRClass(DstRC))
    return AMDGPU::V_MOV_B64_PSEUDO;
  return AMDGPU::COPY;
}

const MCInstrDesc &
SIInstrInfo::getIndirectGPRIDXPseudo(unsigned VecSize,
                                     bool IsIndirectSrc) const {
  if (IsIndirectSrc) {
    if (VecSize <= 32) // 4 bytes
      return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V1);
    if (VecSize <= 64) // 8 bytes
      return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V2);
    if (VecSize <= 96) // 12 bytes
      return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V3);
    if (VecSize <= 128) // 16 bytes
      return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V4);
    if (VecSize <= 160) // 20 bytes
      return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V5);
    if (VecSize <= 192) // 24 bytes
      return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V6);
    if (VecSize <= 224) // 28 bytes
      return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V7);
    if (VecSize <= 256) // 32 bytes
      return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V8);
    if (VecSize <= 288) // 36 bytes
      return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V9);
    if (VecSize <= 320) // 40 bytes
      return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V10);
    if (VecSize <= 352) // 44 bytes
      return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V11);
    if (VecSize <= 384) // 48 bytes
      return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V12);
    if (VecSize <= 512) // 64 bytes
      return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V16);
    if (VecSize <= 1024) // 128 bytes
      return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V32);

    llvm_unreachable("unsupported size for IndirectRegReadGPRIDX pseudos");
  }

  if (VecSize <= 32) // 4 bytes
    return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V1);
  if (VecSize <= 64) // 8 bytes
    return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V2);
  if (VecSize <= 96) // 12 bytes
    return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V3);
  if (VecSize <= 128) // 16 bytes
    return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V4);
  if (VecSize <= 160) // 20 bytes
    return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V5);
  if (VecSize <= 192) // 24 bytes
    return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V6);
  if (VecSize <= 224) // 28 bytes
    return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V7);
  if (VecSize <= 256) // 32 bytes
    return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V8);
  if (VecSize <= 288) // 36 bytes
    return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V9);
  if (VecSize <= 320) // 40 bytes
    return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V10);
  if (VecSize <= 352) // 44 bytes
    return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V11);
  if (VecSize <= 384) // 48 bytes
    return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V12);
  if (VecSize <= 512) // 64 bytes
    return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V16);
  if (VecSize <= 1024) // 128 bytes
    return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V32);

  llvm_unreachable("unsupported size for IndirectRegWriteGPRIDX pseudos");
}

1484static unsigned getIndirectVGPRWriteMovRelPseudoOpc(unsigned VecSize) {
1485 if (VecSize <= 32) // 4 bytes
1486 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V1;
1487 if (VecSize <= 64) // 8 bytes
1488 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V2;
1489 if (VecSize <= 96) // 12 bytes
1490 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V3;
1491 if (VecSize <= 128) // 16 bytes
1492 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V4;
1493 if (VecSize <= 160) // 20 bytes
1494 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V5;
1495 if (VecSize <= 192) // 24 bytes
1496 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V6;
1497 if (VecSize <= 224) // 28 bytes
1498 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V7;
1499 if (VecSize <= 256) // 32 bytes
1500 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V8;
1501 if (VecSize <= 288) // 36 bytes
1502 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V9;
1503 if (VecSize <= 320) // 40 bytes
1504 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V10;
1505 if (VecSize <= 352) // 44 bytes
1506 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V11;
1507 if (VecSize <= 384) // 48 bytes
1508 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V12;
1509 if (VecSize <= 512) // 64 bytes
1510 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V16;
1511 if (VecSize <= 1024) // 128 bytes
1512 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V32;
1513
1514 llvm_unreachable("unsupported size for IndirectRegWrite pseudos");
1515}
1516
1517static unsigned getIndirectSGPRWriteMovRelPseudo32(unsigned VecSize) {
1518 if (VecSize <= 32) // 4 bytes
1519 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V1;
1520 if (VecSize <= 64) // 8 bytes
1521 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V2;
1522 if (VecSize <= 96) // 12 bytes
1523 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V3;
1524 if (VecSize <= 128) // 16 bytes
1525 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V4;
1526 if (VecSize <= 160) // 20 bytes
1527 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V5;
1528 if (VecSize <= 192) // 24 bytes
1529 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V6;
1530 if (VecSize <= 224) // 28 bytes
1531 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V7;
1532 if (VecSize <= 256) // 32 bytes
1533 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V8;
1534 if (VecSize <= 288) // 36 bytes
1535 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V9;
1536 if (VecSize <= 320) // 40 bytes
1537 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V10;
1538 if (VecSize <= 352) // 44 bytes
1539 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V11;
1540 if (VecSize <= 384) // 48 bytes
1541 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V12;
1542 if (VecSize <= 512) // 64 bytes
1543 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V16;
1544 if (VecSize <= 1024) // 128 bytes
1545 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V32;
1546
1547 llvm_unreachable("unsupported size for IndirectRegWrite pseudos");
1548}
1549
1550static unsigned getIndirectSGPRWriteMovRelPseudo64(unsigned VecSize) {
1551 if (VecSize <= 64) // 8 bytes
1552 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V1;
1553 if (VecSize <= 128) // 16 bytes
1554 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V2;
1555 if (VecSize <= 256) // 32 bytes
1556 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V4;
1557 if (VecSize <= 512) // 64 bytes
1558 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V8;
1559 if (VecSize <= 1024) // 128 bytes
1560 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V16;
1561
1562 llvm_unreachable("unsupported size for IndirectRegWrite pseudos");
1563}
1564
1565const MCInstrDesc &
1566SIInstrInfo::getIndirectRegWriteMovRelPseudo(unsigned VecSize, unsigned EltSize,
1567 bool IsSGPR) const {
1568 if (IsSGPR) {
1569 switch (EltSize) {
1570 case 32:
1571 return get(Opcode: getIndirectSGPRWriteMovRelPseudo32(VecSize));
1572 case 64:
1573 return get(Opcode: getIndirectSGPRWriteMovRelPseudo64(VecSize));
1574 default:
1575 llvm_unreachable("invalid reg indexing elt size");
1576 }
1577 }
1578
1579 assert(EltSize == 32 && "invalid reg indexing elt size");
1580 return get(Opcode: getIndirectVGPRWriteMovRelPseudoOpc(VecSize));
1581}
1582
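// Map a spill size in bytes to the matching SGPR spill pseudo. Note there are
// no cases for 52, 56, or 60 bytes: no 13-15 dword register classes exist, so
// those sizes should never reach this function.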
1583static unsigned getSGPRSpillSaveOpcode(unsigned Size) {
1584 switch (Size) {
1585 case 4:
1586 return AMDGPU::SI_SPILL_S32_SAVE;
1587 case 8:
1588 return AMDGPU::SI_SPILL_S64_SAVE;
1589 case 12:
1590 return AMDGPU::SI_SPILL_S96_SAVE;
1591 case 16:
1592 return AMDGPU::SI_SPILL_S128_SAVE;
1593 case 20:
1594 return AMDGPU::SI_SPILL_S160_SAVE;
1595 case 24:
1596 return AMDGPU::SI_SPILL_S192_SAVE;
1597 case 28:
1598 return AMDGPU::SI_SPILL_S224_SAVE;
1599 case 32:
1600 return AMDGPU::SI_SPILL_S256_SAVE;
1601 case 36:
1602 return AMDGPU::SI_SPILL_S288_SAVE;
1603 case 40:
1604 return AMDGPU::SI_SPILL_S320_SAVE;
1605 case 44:
1606 return AMDGPU::SI_SPILL_S352_SAVE;
1607 case 48:
1608 return AMDGPU::SI_SPILL_S384_SAVE;
1609 case 64:
1610 return AMDGPU::SI_SPILL_S512_SAVE;
1611 case 128:
1612 return AMDGPU::SI_SPILL_S1024_SAVE;
1613 default:
1614 llvm_unreachable("unknown register size");
1615 }
1616}
1617
1618static unsigned getVGPRSpillSaveOpcode(unsigned Size) {
1619 switch (Size) {
1620 case 2:
1621 return AMDGPU::SI_SPILL_V16_SAVE;
1622 case 4:
1623 return AMDGPU::SI_SPILL_V32_SAVE;
1624 case 8:
1625 return AMDGPU::SI_SPILL_V64_SAVE;
1626 case 12:
1627 return AMDGPU::SI_SPILL_V96_SAVE;
1628 case 16:
1629 return AMDGPU::SI_SPILL_V128_SAVE;
1630 case 20:
1631 return AMDGPU::SI_SPILL_V160_SAVE;
1632 case 24:
1633 return AMDGPU::SI_SPILL_V192_SAVE;
1634 case 28:
1635 return AMDGPU::SI_SPILL_V224_SAVE;
1636 case 32:
1637 return AMDGPU::SI_SPILL_V256_SAVE;
1638 case 36:
1639 return AMDGPU::SI_SPILL_V288_SAVE;
1640 case 40:
1641 return AMDGPU::SI_SPILL_V320_SAVE;
1642 case 44:
1643 return AMDGPU::SI_SPILL_V352_SAVE;
1644 case 48:
1645 return AMDGPU::SI_SPILL_V384_SAVE;
1646 case 64:
1647 return AMDGPU::SI_SPILL_V512_SAVE;
1648 case 128:
1649 return AMDGPU::SI_SPILL_V1024_SAVE;
1650 default:
1651 llvm_unreachable("unknown register size");
1652 }
1653}
1654
1655static unsigned getAVSpillSaveOpcode(unsigned Size) {
1656 switch (Size) {
1657 case 4:
1658 return AMDGPU::SI_SPILL_AV32_SAVE;
1659 case 8:
1660 return AMDGPU::SI_SPILL_AV64_SAVE;
1661 case 12:
1662 return AMDGPU::SI_SPILL_AV96_SAVE;
1663 case 16:
1664 return AMDGPU::SI_SPILL_AV128_SAVE;
1665 case 20:
1666 return AMDGPU::SI_SPILL_AV160_SAVE;
1667 case 24:
1668 return AMDGPU::SI_SPILL_AV192_SAVE;
1669 case 28:
1670 return AMDGPU::SI_SPILL_AV224_SAVE;
1671 case 32:
1672 return AMDGPU::SI_SPILL_AV256_SAVE;
1673 case 36:
1674 return AMDGPU::SI_SPILL_AV288_SAVE;
1675 case 40:
1676 return AMDGPU::SI_SPILL_AV320_SAVE;
1677 case 44:
1678 return AMDGPU::SI_SPILL_AV352_SAVE;
1679 case 48:
1680 return AMDGPU::SI_SPILL_AV384_SAVE;
1681 case 64:
1682 return AMDGPU::SI_SPILL_AV512_SAVE;
1683 case 128:
1684 return AMDGPU::SI_SPILL_AV1024_SAVE;
1685 default:
1686 llvm_unreachable("unknown register size");
1687 }
1688}
1689
1690static unsigned getWWMRegSpillSaveOpcode(unsigned Size,
1691 bool IsVectorSuperClass) {
  // Currently, only 32-bit WWM register spills are needed.
1693 if (Size != 4)
1694 llvm_unreachable("unknown wwm register spill size");
1695
1696 if (IsVectorSuperClass)
1697 return AMDGPU::SI_SPILL_WWM_AV32_SAVE;
1698
1699 return AMDGPU::SI_SPILL_WWM_V32_SAVE;
1700}
1701
1702unsigned SIInstrInfo::getVectorRegSpillSaveOpcode(
1703 Register Reg, const TargetRegisterClass *RC, unsigned Size,
1704 const SIMachineFunctionInfo &MFI) const {
1705 bool IsVectorSuperClass = RI.isVectorSuperClass(RC);
1706
1707 // Choose the right opcode if spilling a WWM register.
1708 if (MFI.checkFlag(Reg, Flag: AMDGPU::VirtRegFlag::WWM_REG))
1709 return getWWMRegSpillSaveOpcode(Size, IsVectorSuperClass);
1710
1711 // TODO: Check if AGPRs are available
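  // On subtargets with MAI instructions the value may end up in either an
  // AGPR or a VGPR, so use the AV spill pseudos, which tolerate both banks.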
1712 if (ST.hasMAIInsts())
1713 return getAVSpillSaveOpcode(Size);
1714
1715 return getVGPRSpillSaveOpcode(Size);
1716}
1717
1718void SIInstrInfo::storeRegToStackSlot(
1719 MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, Register SrcReg,
1720 bool isKill, int FrameIndex, const TargetRegisterClass *RC, Register VReg,
1721 MachineInstr::MIFlag Flags) const {
1722 MachineFunction *MF = MBB.getParent();
1723 SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
1724 MachineFrameInfo &FrameInfo = MF->getFrameInfo();
1725 const DebugLoc &DL = MBB.findDebugLoc(MBBI: MI);
1726
1727 MachinePointerInfo PtrInfo
1728 = MachinePointerInfo::getFixedStack(MF&: *MF, FI: FrameIndex);
1729 MachineMemOperand *MMO = MF->getMachineMemOperand(
1730 PtrInfo, F: MachineMemOperand::MOStore, Size: FrameInfo.getObjectSize(ObjectIdx: FrameIndex),
1731 BaseAlignment: FrameInfo.getObjectAlign(ObjectIdx: FrameIndex));
1732 unsigned SpillSize = RI.getSpillSize(RC: *RC);
1733
1734 MachineRegisterInfo &MRI = MF->getRegInfo();
1735 if (RI.isSGPRClass(RC)) {
1736 MFI->setHasSpilledSGPRs();
1737 assert(SrcReg != AMDGPU::M0 && "m0 should not be spilled");
1738 assert(SrcReg != AMDGPU::EXEC_LO && SrcReg != AMDGPU::EXEC_HI &&
1739 SrcReg != AMDGPU::EXEC && "exec should not be spilled");
1740
    // We are only allowed to create one new instruction when spilling
    // registers, so we need to use a pseudo instruction for spilling SGPRs.
1743 const MCInstrDesc &OpDesc = get(Opcode: getSGPRSpillSaveOpcode(Size: SpillSize));
1744
    // The SGPR spill/restore instructions only work on numbered SGPRs, so we
    // need to make sure we are using the correct register class.
1747 if (SrcReg.isVirtual() && SpillSize == 4) {
1748 MRI.constrainRegClass(Reg: SrcReg, RC: &AMDGPU::SReg_32_XM0_XEXECRegClass);
1749 }
1750
1751 BuildMI(BB&: MBB, I: MI, MIMD: DL, MCID: OpDesc)
1752 .addReg(RegNo: SrcReg, Flags: getKillRegState(B: isKill)) // data
1753 .addFrameIndex(Idx: FrameIndex) // addr
1754 .addMemOperand(MMO)
1755 .addReg(RegNo: MFI->getStackPtrOffsetReg(), Flags: RegState::Implicit);
1756
1757 if (RI.spillSGPRToVGPR())
1758 FrameInfo.setStackID(ObjectIdx: FrameIndex, ID: TargetStackID::SGPRSpill);
1759 return;
1760 }
1761
1762 unsigned Opcode =
1763 getVectorRegSpillSaveOpcode(Reg: VReg ? VReg : SrcReg, RC, Size: SpillSize, MFI: *MFI);
1764 MFI->setHasSpilledVGPRs();
1765
1766 BuildMI(BB&: MBB, I: MI, MIMD: DL, MCID: get(Opcode))
1767 .addReg(RegNo: SrcReg, Flags: getKillRegState(B: isKill)) // data
1768 .addFrameIndex(Idx: FrameIndex) // addr
1769 .addReg(RegNo: MFI->getStackPtrOffsetReg()) // scratch_offset
1770 .addImm(Val: 0) // offset
1771 .addMemOperand(MMO);
1772}
1773
1774static unsigned getSGPRSpillRestoreOpcode(unsigned Size) {
1775 switch (Size) {
1776 case 4:
1777 return AMDGPU::SI_SPILL_S32_RESTORE;
1778 case 8:
1779 return AMDGPU::SI_SPILL_S64_RESTORE;
1780 case 12:
1781 return AMDGPU::SI_SPILL_S96_RESTORE;
1782 case 16:
1783 return AMDGPU::SI_SPILL_S128_RESTORE;
1784 case 20:
1785 return AMDGPU::SI_SPILL_S160_RESTORE;
1786 case 24:
1787 return AMDGPU::SI_SPILL_S192_RESTORE;
1788 case 28:
1789 return AMDGPU::SI_SPILL_S224_RESTORE;
1790 case 32:
1791 return AMDGPU::SI_SPILL_S256_RESTORE;
1792 case 36:
1793 return AMDGPU::SI_SPILL_S288_RESTORE;
1794 case 40:
1795 return AMDGPU::SI_SPILL_S320_RESTORE;
1796 case 44:
1797 return AMDGPU::SI_SPILL_S352_RESTORE;
1798 case 48:
1799 return AMDGPU::SI_SPILL_S384_RESTORE;
1800 case 64:
1801 return AMDGPU::SI_SPILL_S512_RESTORE;
1802 case 128:
1803 return AMDGPU::SI_SPILL_S1024_RESTORE;
1804 default:
1805 llvm_unreachable("unknown register size");
1806 }
1807}
1808
1809static unsigned getVGPRSpillRestoreOpcode(unsigned Size) {
1810 switch (Size) {
1811 case 2:
1812 return AMDGPU::SI_SPILL_V16_RESTORE;
1813 case 4:
1814 return AMDGPU::SI_SPILL_V32_RESTORE;
1815 case 8:
1816 return AMDGPU::SI_SPILL_V64_RESTORE;
1817 case 12:
1818 return AMDGPU::SI_SPILL_V96_RESTORE;
1819 case 16:
1820 return AMDGPU::SI_SPILL_V128_RESTORE;
1821 case 20:
1822 return AMDGPU::SI_SPILL_V160_RESTORE;
1823 case 24:
1824 return AMDGPU::SI_SPILL_V192_RESTORE;
1825 case 28:
1826 return AMDGPU::SI_SPILL_V224_RESTORE;
1827 case 32:
1828 return AMDGPU::SI_SPILL_V256_RESTORE;
1829 case 36:
1830 return AMDGPU::SI_SPILL_V288_RESTORE;
1831 case 40:
1832 return AMDGPU::SI_SPILL_V320_RESTORE;
1833 case 44:
1834 return AMDGPU::SI_SPILL_V352_RESTORE;
1835 case 48:
1836 return AMDGPU::SI_SPILL_V384_RESTORE;
1837 case 64:
1838 return AMDGPU::SI_SPILL_V512_RESTORE;
1839 case 128:
1840 return AMDGPU::SI_SPILL_V1024_RESTORE;
1841 default:
1842 llvm_unreachable("unknown register size");
1843 }
1844}
1845
1846static unsigned getAVSpillRestoreOpcode(unsigned Size) {
1847 switch (Size) {
1848 case 4:
1849 return AMDGPU::SI_SPILL_AV32_RESTORE;
1850 case 8:
1851 return AMDGPU::SI_SPILL_AV64_RESTORE;
1852 case 12:
1853 return AMDGPU::SI_SPILL_AV96_RESTORE;
1854 case 16:
1855 return AMDGPU::SI_SPILL_AV128_RESTORE;
1856 case 20:
1857 return AMDGPU::SI_SPILL_AV160_RESTORE;
1858 case 24:
1859 return AMDGPU::SI_SPILL_AV192_RESTORE;
1860 case 28:
1861 return AMDGPU::SI_SPILL_AV224_RESTORE;
1862 case 32:
1863 return AMDGPU::SI_SPILL_AV256_RESTORE;
1864 case 36:
1865 return AMDGPU::SI_SPILL_AV288_RESTORE;
1866 case 40:
1867 return AMDGPU::SI_SPILL_AV320_RESTORE;
1868 case 44:
1869 return AMDGPU::SI_SPILL_AV352_RESTORE;
1870 case 48:
1871 return AMDGPU::SI_SPILL_AV384_RESTORE;
1872 case 64:
1873 return AMDGPU::SI_SPILL_AV512_RESTORE;
1874 case 128:
1875 return AMDGPU::SI_SPILL_AV1024_RESTORE;
1876 default:
1877 llvm_unreachable("unknown register size");
1878 }
1879}
1880
1881static unsigned getWWMRegSpillRestoreOpcode(unsigned Size,
1882 bool IsVectorSuperClass) {
  // Currently, only 32-bit WWM register spills are needed.
1884 if (Size != 4)
1885 llvm_unreachable("unknown wwm register spill size");
1886
1887 if (IsVectorSuperClass) // TODO: Always use this if there are AGPRs
1888 return AMDGPU::SI_SPILL_WWM_AV32_RESTORE;
1889
1890 return AMDGPU::SI_SPILL_WWM_V32_RESTORE;
1891}
1892
1893unsigned SIInstrInfo::getVectorRegSpillRestoreOpcode(
1894 Register Reg, const TargetRegisterClass *RC, unsigned Size,
1895 const SIMachineFunctionInfo &MFI) const {
1896 bool IsVectorSuperClass = RI.isVectorSuperClass(RC);
1897
1898 // Choose the right opcode if restoring a WWM register.
1899 if (MFI.checkFlag(Reg, Flag: AMDGPU::VirtRegFlag::WWM_REG))
1900 return getWWMRegSpillRestoreOpcode(Size, IsVectorSuperClass);
1901
1902 // TODO: Check if AGPRs are available
1903 if (ST.hasMAIInsts())
1904 return getAVSpillRestoreOpcode(Size);
1905
1906 assert(!RI.isAGPRClass(RC));
1907 return getVGPRSpillRestoreOpcode(Size);
1908}
1909
1910void SIInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB,
1911 MachineBasicBlock::iterator MI,
1912 Register DestReg, int FrameIndex,
1913 const TargetRegisterClass *RC,
1914 Register VReg, unsigned SubReg,
1915 MachineInstr::MIFlag Flags) const {
1916 MachineFunction *MF = MBB.getParent();
1917 SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
1918 MachineFrameInfo &FrameInfo = MF->getFrameInfo();
1919 const DebugLoc &DL = MBB.findDebugLoc(MBBI: MI);
1920 unsigned SpillSize = RI.getSpillSize(RC: *RC);
1921
1922 MachinePointerInfo PtrInfo
1923 = MachinePointerInfo::getFixedStack(MF&: *MF, FI: FrameIndex);
1924
1925 MachineMemOperand *MMO = MF->getMachineMemOperand(
1926 PtrInfo, F: MachineMemOperand::MOLoad, Size: FrameInfo.getObjectSize(ObjectIdx: FrameIndex),
1927 BaseAlignment: FrameInfo.getObjectAlign(ObjectIdx: FrameIndex));
1928
1929 if (RI.isSGPRClass(RC)) {
1930 MFI->setHasSpilledSGPRs();
1931 assert(DestReg != AMDGPU::M0 && "m0 should not be reloaded into");
1932 assert(DestReg != AMDGPU::EXEC_LO && DestReg != AMDGPU::EXEC_HI &&
1933 DestReg != AMDGPU::EXEC && "exec should not be spilled");
1934
1935 // FIXME: Maybe this should not include a memoperand because it will be
1936 // lowered to non-memory instructions.
1937 const MCInstrDesc &OpDesc = get(Opcode: getSGPRSpillRestoreOpcode(Size: SpillSize));
1938 if (DestReg.isVirtual() && SpillSize == 4) {
1939 MachineRegisterInfo &MRI = MF->getRegInfo();
1940 MRI.constrainRegClass(Reg: DestReg, RC: &AMDGPU::SReg_32_XM0_XEXECRegClass);
1941 }
1942
1943 if (RI.spillSGPRToVGPR())
1944 FrameInfo.setStackID(ObjectIdx: FrameIndex, ID: TargetStackID::SGPRSpill);
1945 BuildMI(BB&: MBB, I: MI, MIMD: DL, MCID: OpDesc, DestReg)
1946 .addFrameIndex(Idx: FrameIndex) // addr
1947 .addMemOperand(MMO)
1948 .addReg(RegNo: MFI->getStackPtrOffsetReg(), Flags: RegState::Implicit);
1949
1950 return;
1951 }
1952
1953 unsigned Opcode = getVectorRegSpillRestoreOpcode(Reg: VReg ? VReg : DestReg, RC,
1954 Size: SpillSize, MFI: *MFI);
1955 BuildMI(BB&: MBB, I: MI, MIMD: DL, MCID: get(Opcode), DestReg)
1956 .addFrameIndex(Idx: FrameIndex) // vaddr
1957 .addReg(RegNo: MFI->getStackPtrOffsetReg()) // scratch_offset
1958 .addImm(Val: 0) // offset
1959 .addMemOperand(MMO);
1960}
1961
1962void SIInstrInfo::insertNoop(MachineBasicBlock &MBB,
1963 MachineBasicBlock::iterator MI) const {
1964 insertNoops(MBB, MI, Quantity: 1);
1965}
1966
1967void SIInstrInfo::insertNoops(MachineBasicBlock &MBB,
1968 MachineBasicBlock::iterator MI,
1969 unsigned Quantity) const {
1970 DebugLoc DL = MBB.findDebugLoc(MBBI: MI);
1971 unsigned MaxSNopCount = 1u << ST.getSNopBits();
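  // S_NOP's immediate encodes (number of wait states - 1), so a single S_NOP
  // covers at most MaxSNopCount wait states. E.g. with a 3-bit field,
  // Quantity == 10 expands to s_nop 7 followed by s_nop 1.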
1972 while (Quantity > 0) {
1973 unsigned Arg = std::min(a: Quantity, b: MaxSNopCount);
1974 Quantity -= Arg;
1975 BuildMI(BB&: MBB, I: MI, MIMD: DL, MCID: get(Opcode: AMDGPU::S_NOP)).addImm(Val: Arg - 1);
1976 }
1977}
1978
1979void SIInstrInfo::insertReturn(MachineBasicBlock &MBB) const {
1980 auto *MF = MBB.getParent();
1981 SIMachineFunctionInfo *Info = MF->getInfo<SIMachineFunctionInfo>();
1982
1983 assert(Info->isEntryFunction());
1984
1985 if (MBB.succ_empty()) {
1986 bool HasNoTerminator = MBB.getFirstTerminator() == MBB.end();
1987 if (HasNoTerminator) {
1988 if (Info->returnsVoid()) {
1989 BuildMI(BB&: MBB, I: MBB.end(), MIMD: DebugLoc(), MCID: get(Opcode: AMDGPU::S_ENDPGM)).addImm(Val: 0);
1990 } else {
1991 BuildMI(BB&: MBB, I: MBB.end(), MIMD: DebugLoc(), MCID: get(Opcode: AMDGPU::SI_RETURN_TO_EPILOG));
1992 }
1993 }
1994 }
1995}
1996
1997MachineBasicBlock *SIInstrInfo::insertSimulatedTrap(MachineRegisterInfo &MRI,
1998 MachineBasicBlock &MBB,
1999 MachineInstr &MI,
2000 const DebugLoc &DL) const {
2001 MachineFunction *MF = MBB.getParent();
2002 constexpr unsigned DoorbellIDMask = 0x3ff;
2003 constexpr unsigned ECQueueWaveAbort = 0x400;
2004
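  // The emitted pattern is roughly:
  //   <entry>:    s_cbranch_execnz <TrapBB>  (only if MBB needs splitting)
  //   <TrapBB>:   s_trap 2; fetch the doorbell ID; set the queue-wave-abort
  //               bit; s_sendmsg MSG_INTERRUPT; branch to the halt loop
  //   <HaltLoop>: s_sethalt 5; s_branch <HaltLoop>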
2005 MachineBasicBlock *TrapBB = &MBB;
2006 MachineBasicBlock *HaltLoopBB = MF->CreateMachineBasicBlock();
2007
2008 if (!MBB.succ_empty() || std::next(x: MI.getIterator()) != MBB.end()) {
2009 MBB.splitAt(SplitInst&: MI, /*UpdateLiveIns=*/false);
2010 TrapBB = MF->CreateMachineBasicBlock();
2011 BuildMI(BB&: MBB, I&: MI, MIMD: DL, MCID: get(Opcode: AMDGPU::S_CBRANCH_EXECNZ)).addMBB(MBB: TrapBB);
2012 MF->push_back(MBB: TrapBB);
2013 MBB.addSuccessor(Succ: TrapBB);
2014 }
  // Start with a `s_trap 2`; if we're in PRIV=1 and need the workaround, this
  // will be a nop.
2017 BuildMI(BB&: *TrapBB, I: TrapBB->end(), MIMD: DL, MCID: get(Opcode: AMDGPU::S_TRAP))
2018 .addImm(Val: static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap));
2019 Register DoorbellReg = MRI.createVirtualRegister(RegClass: &AMDGPU::SReg_32RegClass);
2020 BuildMI(BB&: *TrapBB, I: TrapBB->end(), MIMD: DL, MCID: get(Opcode: AMDGPU::S_SENDMSG_RTN_B32),
2021 DestReg: DoorbellReg)
2022 .addImm(Val: AMDGPU::SendMsg::ID_RTN_GET_DOORBELL);
2023 BuildMI(BB&: *TrapBB, I: TrapBB->end(), MIMD: DL, MCID: get(Opcode: AMDGPU::S_MOV_B32), DestReg: AMDGPU::TTMP2)
2024 .addUse(RegNo: AMDGPU::M0);
2025 Register DoorbellRegMasked =
2026 MRI.createVirtualRegister(RegClass: &AMDGPU::SReg_32RegClass);
2027 BuildMI(BB&: *TrapBB, I: TrapBB->end(), MIMD: DL, MCID: get(Opcode: AMDGPU::S_AND_B32), DestReg: DoorbellRegMasked)
2028 .addUse(RegNo: DoorbellReg)
2029 .addImm(Val: DoorbellIDMask);
2030 Register SetWaveAbortBit =
2031 MRI.createVirtualRegister(RegClass: &AMDGPU::SReg_32RegClass);
2032 BuildMI(BB&: *TrapBB, I: TrapBB->end(), MIMD: DL, MCID: get(Opcode: AMDGPU::S_OR_B32), DestReg: SetWaveAbortBit)
2033 .addUse(RegNo: DoorbellRegMasked)
2034 .addImm(Val: ECQueueWaveAbort);
2035 BuildMI(BB&: *TrapBB, I: TrapBB->end(), MIMD: DL, MCID: get(Opcode: AMDGPU::S_MOV_B32), DestReg: AMDGPU::M0)
2036 .addUse(RegNo: SetWaveAbortBit);
2037 BuildMI(BB&: *TrapBB, I: TrapBB->end(), MIMD: DL, MCID: get(Opcode: AMDGPU::S_SENDMSG))
2038 .addImm(Val: AMDGPU::SendMsg::ID_INTERRUPT);
2039 BuildMI(BB&: *TrapBB, I: TrapBB->end(), MIMD: DL, MCID: get(Opcode: AMDGPU::S_MOV_B32), DestReg: AMDGPU::M0)
2040 .addUse(RegNo: AMDGPU::TTMP2);
2041 BuildMI(BB&: *TrapBB, I: TrapBB->end(), MIMD: DL, MCID: get(Opcode: AMDGPU::S_BRANCH)).addMBB(MBB: HaltLoopBB);
2042 TrapBB->addSuccessor(Succ: HaltLoopBB);
2043
2044 BuildMI(BB&: *HaltLoopBB, I: HaltLoopBB->end(), MIMD: DL, MCID: get(Opcode: AMDGPU::S_SETHALT)).addImm(Val: 5);
2045 BuildMI(BB&: *HaltLoopBB, I: HaltLoopBB->end(), MIMD: DL, MCID: get(Opcode: AMDGPU::S_BRANCH))
2046 .addMBB(MBB: HaltLoopBB);
2047 MF->push_back(MBB: HaltLoopBB);
2048 HaltLoopBB->addSuccessor(Succ: HaltLoopBB);
2049
2050 return MBB.getNextNode();
2051}
2052
2053unsigned SIInstrInfo::getNumWaitStates(const MachineInstr &MI) {
2054 switch (MI.getOpcode()) {
2055 default:
2056 if (MI.isMetaInstruction())
2057 return 0;
2058 return 1; // FIXME: Do wait states equal cycles?
2059
2060 case AMDGPU::S_NOP:
2061 return MI.getOperand(i: 0).getImm() + 1;
  // SI_RETURN_TO_EPILOG is a fallthrough to code outside of the function. The
  // hazard, even if one exists, won't really be visible. Should we handle it?
2064 }
2065}
2066
2067bool SIInstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
2068 MachineBasicBlock &MBB = *MI.getParent();
2069 DebugLoc DL = MBB.findDebugLoc(MBBI: MI);
2070 const AMDGPU::LaneMaskConstants &LMC = AMDGPU::LaneMaskConstants::get(ST);
2071 switch (MI.getOpcode()) {
2072 default: return TargetInstrInfo::expandPostRAPseudo(MI);
2073 case AMDGPU::S_MOV_B64_term:
2074 // This is only a terminator to get the correct spill code placement during
2075 // register allocation.
2076 MI.setDesc(get(Opcode: AMDGPU::S_MOV_B64));
2077 break;
2078
2079 case AMDGPU::S_MOV_B32_term:
2080 // This is only a terminator to get the correct spill code placement during
2081 // register allocation.
2082 MI.setDesc(get(Opcode: AMDGPU::S_MOV_B32));
2083 break;
2084
2085 case AMDGPU::S_XOR_B64_term:
2086 // This is only a terminator to get the correct spill code placement during
2087 // register allocation.
2088 MI.setDesc(get(Opcode: AMDGPU::S_XOR_B64));
2089 break;
2090
2091 case AMDGPU::S_XOR_B32_term:
2092 // This is only a terminator to get the correct spill code placement during
2093 // register allocation.
2094 MI.setDesc(get(Opcode: AMDGPU::S_XOR_B32));
2095 break;
2096 case AMDGPU::S_OR_B64_term:
2097 // This is only a terminator to get the correct spill code placement during
2098 // register allocation.
2099 MI.setDesc(get(Opcode: AMDGPU::S_OR_B64));
2100 break;
2101 case AMDGPU::S_OR_B32_term:
2102 // This is only a terminator to get the correct spill code placement during
2103 // register allocation.
2104 MI.setDesc(get(Opcode: AMDGPU::S_OR_B32));
2105 break;
2106
2107 case AMDGPU::S_ANDN2_B64_term:
2108 // This is only a terminator to get the correct spill code placement during
2109 // register allocation.
2110 MI.setDesc(get(Opcode: AMDGPU::S_ANDN2_B64));
2111 break;
2112
2113 case AMDGPU::S_ANDN2_B32_term:
2114 // This is only a terminator to get the correct spill code placement during
2115 // register allocation.
2116 MI.setDesc(get(Opcode: AMDGPU::S_ANDN2_B32));
2117 break;
2118
2119 case AMDGPU::S_AND_B64_term:
2120 // This is only a terminator to get the correct spill code placement during
2121 // register allocation.
2122 MI.setDesc(get(Opcode: AMDGPU::S_AND_B64));
2123 break;
2124
2125 case AMDGPU::S_AND_B32_term:
2126 // This is only a terminator to get the correct spill code placement during
2127 // register allocation.
2128 MI.setDesc(get(Opcode: AMDGPU::S_AND_B32));
2129 break;
2130
2131 case AMDGPU::S_AND_SAVEEXEC_B64_term:
2132 // This is only a terminator to get the correct spill code placement during
2133 // register allocation.
2134 MI.setDesc(get(Opcode: AMDGPU::S_AND_SAVEEXEC_B64));
2135 break;
2136
2137 case AMDGPU::S_AND_SAVEEXEC_B32_term:
2138 // This is only a terminator to get the correct spill code placement during
2139 // register allocation.
2140 MI.setDesc(get(Opcode: AMDGPU::S_AND_SAVEEXEC_B32));
2141 break;
2142
2143 case AMDGPU::SI_SPILL_S32_TO_VGPR:
2144 MI.setDesc(get(Opcode: AMDGPU::V_WRITELANE_B32));
2145 break;
2146
2147 case AMDGPU::SI_RESTORE_S32_FROM_VGPR:
2148 MI.setDesc(get(Opcode: AMDGPU::V_READLANE_B32));
2149 break;
2150 case AMDGPU::AV_MOV_B32_IMM_PSEUDO: {
2151 Register Dst = MI.getOperand(i: 0).getReg();
2152 bool IsAGPR = SIRegisterInfo::isAGPRClass(RC: RI.getPhysRegBaseClass(Reg: Dst));
2153 MI.setDesc(
2154 get(Opcode: IsAGPR ? AMDGPU::V_ACCVGPR_WRITE_B32_e64 : AMDGPU::V_MOV_B32_e32));
2155 break;
2156 }
2157 case AMDGPU::AV_MOV_B64_IMM_PSEUDO: {
2158 Register Dst = MI.getOperand(i: 0).getReg();
2159 if (SIRegisterInfo::isAGPRClass(RC: RI.getPhysRegBaseClass(Reg: Dst))) {
2160 int64_t Imm = MI.getOperand(i: 1).getImm();
2161
2162 Register DstLo = RI.getSubReg(Reg: Dst, Idx: AMDGPU::sub0);
2163 Register DstHi = RI.getSubReg(Reg: Dst, Idx: AMDGPU::sub1);
2164 BuildMI(BB&: MBB, I&: MI, MIMD: DL, MCID: get(Opcode: AMDGPU::V_ACCVGPR_WRITE_B32_e64), DestReg: DstLo)
2165 .addImm(Val: SignExtend64<32>(x: Imm))
2166 .addReg(RegNo: Dst, Flags: RegState::Implicit | RegState::Define);
2167 BuildMI(BB&: MBB, I&: MI, MIMD: DL, MCID: get(Opcode: AMDGPU::V_ACCVGPR_WRITE_B32_e64), DestReg: DstHi)
2168 .addImm(Val: SignExtend64<32>(x: Imm >> 32))
2169 .addReg(RegNo: Dst, Flags: RegState::Implicit | RegState::Define);
2170 MI.eraseFromParent();
2171 break;
2172 }
2173
2174 [[fallthrough]];
2175 }
2176 case AMDGPU::V_MOV_B64_PSEUDO: {
2177 Register Dst = MI.getOperand(i: 0).getReg();
2178 Register DstLo = RI.getSubReg(Reg: Dst, Idx: AMDGPU::sub0);
2179 Register DstHi = RI.getSubReg(Reg: Dst, Idx: AMDGPU::sub1);
2180
2181 const MCInstrDesc &Mov64Desc = get(Opcode: AMDGPU::V_MOV_B64_e32);
2182 const TargetRegisterClass *Mov64RC = getRegClass(MCID: Mov64Desc, /*OpNum=*/0);
2183
2184 const MachineOperand &SrcOp = MI.getOperand(i: 1);
2185 // FIXME: Will this work for 64-bit floating point immediates?
2186 assert(!SrcOp.isFPImm());
2187 if (ST.hasMovB64() && Mov64RC->contains(Reg: Dst)) {
2188 MI.setDesc(Mov64Desc);
2189 if (SrcOp.isReg() || isInlineConstant(MI, OpIdx: 1) ||
2190 isUInt<32>(x: SrcOp.getImm()) || ST.has64BitLiterals())
2191 break;
2192 }
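    // No usable 64-bit mov: either emit a single v_pk_mov_b32 when both
    // halves are the same inline constant, or write the two 32-bit subregs
    // separately.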
2193 if (SrcOp.isImm()) {
2194 APInt Imm(64, SrcOp.getImm());
2195 APInt Lo(32, Imm.getLoBits(numBits: 32).getZExtValue());
2196 APInt Hi(32, Imm.getHiBits(numBits: 32).getZExtValue());
2197 const MCInstrDesc &PkMovDesc = get(Opcode: AMDGPU::V_PK_MOV_B32);
2198 const TargetRegisterClass *PkMovRC = getRegClass(MCID: PkMovDesc, /*OpNum=*/0);
2199
2200 if (ST.hasPkMovB32() && Lo == Hi && isInlineConstant(Imm: Lo) &&
2201 PkMovRC->contains(Reg: Dst)) {
2202 BuildMI(BB&: MBB, I&: MI, MIMD: DL, MCID: PkMovDesc, DestReg: Dst)
2203 .addImm(Val: SISrcMods::OP_SEL_1)
2204 .addImm(Val: Lo.getSExtValue())
2205 .addImm(Val: SISrcMods::OP_SEL_1)
2206 .addImm(Val: Lo.getSExtValue())
2207 .addImm(Val: 0) // op_sel_lo
2208 .addImm(Val: 0) // op_sel_hi
2209 .addImm(Val: 0) // neg_lo
2210 .addImm(Val: 0) // neg_hi
2211 .addImm(Val: 0); // clamp
2212 } else {
2213 BuildMI(BB&: MBB, I&: MI, MIMD: DL, MCID: get(Opcode: AMDGPU::V_MOV_B32_e32), DestReg: DstLo)
2214 .addImm(Val: Lo.getSExtValue())
2215 .addReg(RegNo: Dst, Flags: RegState::Implicit | RegState::Define);
2216 BuildMI(BB&: MBB, I&: MI, MIMD: DL, MCID: get(Opcode: AMDGPU::V_MOV_B32_e32), DestReg: DstHi)
2217 .addImm(Val: Hi.getSExtValue())
2218 .addReg(RegNo: Dst, Flags: RegState::Implicit | RegState::Define);
2219 }
2220 } else {
2221 assert(SrcOp.isReg());
2222 if (ST.hasPkMovB32() &&
2223 !RI.isAGPR(MRI: MBB.getParent()->getRegInfo(), Reg: SrcOp.getReg())) {
2224 BuildMI(BB&: MBB, I&: MI, MIMD: DL, MCID: get(Opcode: AMDGPU::V_PK_MOV_B32), DestReg: Dst)
2225 .addImm(Val: SISrcMods::OP_SEL_1) // src0_mod
2226 .addReg(RegNo: SrcOp.getReg())
2227 .addImm(Val: SISrcMods::OP_SEL_0 | SISrcMods::OP_SEL_1) // src1_mod
2228 .addReg(RegNo: SrcOp.getReg())
2229 .addImm(Val: 0) // op_sel_lo
2230 .addImm(Val: 0) // op_sel_hi
2231 .addImm(Val: 0) // neg_lo
2232 .addImm(Val: 0) // neg_hi
2233 .addImm(Val: 0); // clamp
2234 } else {
2235 BuildMI(BB&: MBB, I&: MI, MIMD: DL, MCID: get(Opcode: AMDGPU::V_MOV_B32_e32), DestReg: DstLo)
2236 .addReg(RegNo: RI.getSubReg(Reg: SrcOp.getReg(), Idx: AMDGPU::sub0))
2237 .addReg(RegNo: Dst, Flags: RegState::Implicit | RegState::Define);
2238 BuildMI(BB&: MBB, I&: MI, MIMD: DL, MCID: get(Opcode: AMDGPU::V_MOV_B32_e32), DestReg: DstHi)
2239 .addReg(RegNo: RI.getSubReg(Reg: SrcOp.getReg(), Idx: AMDGPU::sub1))
2240 .addReg(RegNo: Dst, Flags: RegState::Implicit | RegState::Define);
2241 }
2242 }
2243 MI.eraseFromParent();
2244 break;
2245 }
2246 case AMDGPU::V_MOV_B64_DPP_PSEUDO: {
2247 expandMovDPP64(MI);
2248 break;
2249 }
2250 case AMDGPU::S_MOV_B64_IMM_PSEUDO: {
2251 const MachineOperand &SrcOp = MI.getOperand(i: 1);
2252 assert(!SrcOp.isFPImm());
2253
2254 if (ST.has64BitLiterals()) {
2255 MI.setDesc(get(Opcode: AMDGPU::S_MOV_B64));
2256 break;
2257 }
2258
2259 APInt Imm(64, SrcOp.getImm());
2260 if (Imm.isIntN(N: 32) || isInlineConstant(Imm)) {
2261 MI.setDesc(get(Opcode: AMDGPU::S_MOV_B64));
2262 break;
2263 }
2264
2265 Register Dst = MI.getOperand(i: 0).getReg();
2266 Register DstLo = RI.getSubReg(Reg: Dst, Idx: AMDGPU::sub0);
2267 Register DstHi = RI.getSubReg(Reg: Dst, Idx: AMDGPU::sub1);
2268
2269 APInt Lo(32, Imm.getLoBits(numBits: 32).getZExtValue());
2270 APInt Hi(32, Imm.getHiBits(numBits: 32).getZExtValue());
2271 BuildMI(BB&: MBB, I&: MI, MIMD: DL, MCID: get(Opcode: AMDGPU::S_MOV_B32), DestReg: DstLo)
2272 .addImm(Val: Lo.getSExtValue())
2273 .addReg(RegNo: Dst, Flags: RegState::Implicit | RegState::Define);
2274 BuildMI(BB&: MBB, I&: MI, MIMD: DL, MCID: get(Opcode: AMDGPU::S_MOV_B32), DestReg: DstHi)
2275 .addImm(Val: Hi.getSExtValue())
2276 .addReg(RegNo: Dst, Flags: RegState::Implicit | RegState::Define);
2277 MI.eraseFromParent();
2278 break;
2279 }
2280 case AMDGPU::V_SET_INACTIVE_B32: {
2281 // Lower V_SET_INACTIVE_B32 to V_CNDMASK_B32.
2282 Register DstReg = MI.getOperand(i: 0).getReg();
2283 BuildMI(BB&: MBB, I&: MI, MIMD: DL, MCID: get(Opcode: AMDGPU::V_CNDMASK_B32_e64), DestReg: DstReg)
2284 .add(MO: MI.getOperand(i: 3))
2285 .add(MO: MI.getOperand(i: 4))
2286 .add(MO: MI.getOperand(i: 1))
2287 .add(MO: MI.getOperand(i: 2))
2288 .add(MO: MI.getOperand(i: 5));
2289 MI.eraseFromParent();
2290 break;
2291 }
2292 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V1:
2293 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V2:
2294 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V3:
2295 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V4:
2296 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V5:
2297 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V6:
2298 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V7:
2299 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V8:
2300 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V9:
2301 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V10:
2302 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V11:
2303 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V12:
2304 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V16:
2305 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V32:
2306 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V1:
2307 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V2:
2308 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V3:
2309 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V4:
2310 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V5:
2311 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V6:
2312 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V7:
2313 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V8:
2314 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V9:
2315 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V10:
2316 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V11:
2317 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V12:
2318 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V16:
2319 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V32:
2320 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V1:
2321 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V2:
2322 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V4:
2323 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V8:
2324 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V16: {
2325 const TargetRegisterClass *EltRC = getOpRegClass(MI, OpNo: 2);
2326
2327 unsigned Opc;
2328 if (RI.hasVGPRs(RC: EltRC)) {
2329 Opc = AMDGPU::V_MOVRELD_B32_e32;
2330 } else {
2331 Opc = RI.getRegSizeInBits(RC: *EltRC) == 64 ? AMDGPU::S_MOVRELD_B64
2332 : AMDGPU::S_MOVRELD_B32;
2333 }
2334
2335 const MCInstrDesc &OpDesc = get(Opcode: Opc);
2336 Register VecReg = MI.getOperand(i: 0).getReg();
2337 bool IsUndef = MI.getOperand(i: 1).isUndef();
2338 unsigned SubReg = MI.getOperand(i: 3).getImm();
2339 assert(VecReg == MI.getOperand(1).getReg());
2340
2341 MachineInstrBuilder MIB =
2342 BuildMI(BB&: MBB, I&: MI, MIMD: DL, MCID: OpDesc)
2343 .addReg(RegNo: RI.getSubReg(Reg: VecReg, Idx: SubReg), Flags: RegState::Undef)
2344 .add(MO: MI.getOperand(i: 2))
2345 .addReg(RegNo: VecReg, Flags: RegState::ImplicitDefine)
2346 .addReg(RegNo: VecReg, Flags: RegState::Implicit | getUndefRegState(B: IsUndef));
2347
2348 const int ImpDefIdx =
2349 OpDesc.getNumOperands() + OpDesc.implicit_uses().size();
2350 const int ImpUseIdx = ImpDefIdx + 1;
2351 MIB->tieOperands(DefIdx: ImpDefIdx, UseIdx: ImpUseIdx);
2352 MI.eraseFromParent();
2353 break;
2354 }
2355 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V1:
2356 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V2:
2357 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V3:
2358 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V4:
2359 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V5:
2360 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V6:
2361 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V7:
2362 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V8:
2363 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V9:
2364 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V10:
2365 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V11:
2366 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V12:
2367 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V16:
2368 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V32: {
2369 assert(ST.useVGPRIndexMode());
2370 Register VecReg = MI.getOperand(i: 0).getReg();
2371 bool IsUndef = MI.getOperand(i: 1).isUndef();
2372 MachineOperand &Idx = MI.getOperand(i: 3);
2373 Register SubReg = MI.getOperand(i: 4).getImm();
2374
2375 MachineInstr *SetOn = BuildMI(BB&: MBB, I&: MI, MIMD: DL, MCID: get(Opcode: AMDGPU::S_SET_GPR_IDX_ON))
2376 .add(MO: Idx)
2377 .addImm(Val: AMDGPU::VGPRIndexMode::DST_ENABLE);
2378 SetOn->getOperand(i: 3).setIsUndef();
2379
2380 const MCInstrDesc &OpDesc = get(Opcode: AMDGPU::V_MOV_B32_indirect_write);
2381 MachineInstrBuilder MIB =
2382 BuildMI(BB&: MBB, I&: MI, MIMD: DL, MCID: OpDesc)
2383 .addReg(RegNo: RI.getSubReg(Reg: VecReg, Idx: SubReg), Flags: RegState::Undef)
2384 .add(MO: MI.getOperand(i: 2))
2385 .addReg(RegNo: VecReg, Flags: RegState::ImplicitDefine)
2386 .addReg(RegNo: VecReg, Flags: RegState::Implicit | getUndefRegState(B: IsUndef));
2387
2388 const int ImpDefIdx =
2389 OpDesc.getNumOperands() + OpDesc.implicit_uses().size();
2390 const int ImpUseIdx = ImpDefIdx + 1;
2391 MIB->tieOperands(DefIdx: ImpDefIdx, UseIdx: ImpUseIdx);
2392
2393 MachineInstr *SetOff = BuildMI(BB&: MBB, I&: MI, MIMD: DL, MCID: get(Opcode: AMDGPU::S_SET_GPR_IDX_OFF));
2394
2395 finalizeBundle(MBB, FirstMI: SetOn->getIterator(), LastMI: std::next(x: SetOff->getIterator()));
2396
2397 MI.eraseFromParent();
2398 break;
2399 }
2400 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V1:
2401 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V2:
2402 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V3:
2403 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V4:
2404 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V5:
2405 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V6:
2406 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V7:
2407 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V8:
2408 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V9:
2409 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V10:
2410 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V11:
2411 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V12:
2412 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V16:
2413 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V32: {
2414 assert(ST.useVGPRIndexMode());
2415 Register Dst = MI.getOperand(i: 0).getReg();
2416 Register VecReg = MI.getOperand(i: 1).getReg();
2417 bool IsUndef = MI.getOperand(i: 1).isUndef();
2418 Register SubReg = MI.getOperand(i: 3).getImm();
2419
2420 MachineInstr *SetOn = BuildMI(BB&: MBB, I&: MI, MIMD: DL, MCID: get(Opcode: AMDGPU::S_SET_GPR_IDX_ON))
2421 .add(MO: MI.getOperand(i: 2))
2422 .addImm(Val: AMDGPU::VGPRIndexMode::SRC0_ENABLE);
2423 SetOn->getOperand(i: 3).setIsUndef();
2424
2425 BuildMI(BB&: MBB, I&: MI, MIMD: DL, MCID: get(Opcode: AMDGPU::V_MOV_B32_indirect_read))
2426 .addDef(RegNo: Dst)
2427 .addReg(RegNo: RI.getSubReg(Reg: VecReg, Idx: SubReg), Flags: RegState::Undef)
2428 .addReg(RegNo: VecReg, Flags: RegState::Implicit | getUndefRegState(B: IsUndef));
2429
2430 MachineInstr *SetOff = BuildMI(BB&: MBB, I&: MI, MIMD: DL, MCID: get(Opcode: AMDGPU::S_SET_GPR_IDX_OFF));
2431
2432 finalizeBundle(MBB, FirstMI: SetOn->getIterator(), LastMI: std::next(x: SetOff->getIterator()));
2433
2434 MI.eraseFromParent();
2435 break;
2436 }
2437 case AMDGPU::SI_PC_ADD_REL_OFFSET: {
2438 MachineFunction &MF = *MBB.getParent();
2439 Register Reg = MI.getOperand(i: 0).getReg();
2440 Register RegLo = RI.getSubReg(Reg, Idx: AMDGPU::sub0);
2441 Register RegHi = RI.getSubReg(Reg, Idx: AMDGPU::sub1);
2442 MachineOperand OpLo = MI.getOperand(i: 1);
2443 MachineOperand OpHi = MI.getOperand(i: 2);
2444
2445 // Create a bundle so these instructions won't be re-ordered by the
2446 // post-RA scheduler.
2447 MIBundleBuilder Bundler(MBB, MI);
2448 Bundler.append(MI: BuildMI(MF, MIMD: DL, MCID: get(Opcode: AMDGPU::S_GETPC_B64), DestReg: Reg));
2449
2450 // What we want here is an offset from the value returned by s_getpc (which
2451 // is the address of the s_add_u32 instruction) to the global variable, but
2452 // since the encoding of $symbol starts 4 bytes after the start of the
2453 // s_add_u32 instruction, we end up with an offset that is 4 bytes too
2454 // small. This requires us to add 4 to the global variable offset in order
2455 // to compute the correct address. Similarly for the s_addc_u32 instruction,
2456 // the encoding of $symbol starts 12 bytes after the start of the s_add_u32
2457 // instruction.
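    //
    // The bundle therefore ends up looking like, e.g.:
    //   s_getpc_b64 s[4:5]
    //   s_add_u32   s4, s4, sym@rel32@lo+4
    //   s_addc_u32  s5, s5, sym@rel32@hi+12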
2458
2459 int64_t Adjust = 0;
2460 if (ST.hasGetPCZeroExtension()) {
2461 // Fix up hardware that does not sign-extend the 48-bit PC value by
2462 // inserting: s_sext_i32_i16 reghi, reghi
2463 Bundler.append(
2464 MI: BuildMI(MF, MIMD: DL, MCID: get(Opcode: AMDGPU::S_SEXT_I32_I16), DestReg: RegHi).addReg(RegNo: RegHi));
2465 Adjust += 4;
2466 }
2467
2468 if (OpLo.isGlobal())
2469 OpLo.setOffset(OpLo.getOffset() + Adjust + 4);
2470 Bundler.append(
2471 MI: BuildMI(MF, MIMD: DL, MCID: get(Opcode: AMDGPU::S_ADD_U32), DestReg: RegLo).addReg(RegNo: RegLo).add(MO: OpLo));
2472
2473 if (OpHi.isGlobal())
2474 OpHi.setOffset(OpHi.getOffset() + Adjust + 12);
2475 Bundler.append(MI: BuildMI(MF, MIMD: DL, MCID: get(Opcode: AMDGPU::S_ADDC_U32), DestReg: RegHi)
2476 .addReg(RegNo: RegHi)
2477 .add(MO: OpHi));
2478
2479 finalizeBundle(MBB, FirstMI: Bundler.begin());
2480
2481 MI.eraseFromParent();
2482 break;
2483 }
2484 case AMDGPU::SI_PC_ADD_REL_OFFSET64: {
2485 MachineFunction &MF = *MBB.getParent();
2486 Register Reg = MI.getOperand(i: 0).getReg();
2487 MachineOperand Op = MI.getOperand(i: 1);
2488
2489 // Create a bundle so these instructions won't be re-ordered by the
2490 // post-RA scheduler.
2491 MIBundleBuilder Bundler(MBB, MI);
2492 Bundler.append(MI: BuildMI(MF, MIMD: DL, MCID: get(Opcode: AMDGPU::S_GETPC_B64), DestReg: Reg));
2493 if (Op.isGlobal())
2494 Op.setOffset(Op.getOffset() + 4);
2495 Bundler.append(
2496 MI: BuildMI(MF, MIMD: DL, MCID: get(Opcode: AMDGPU::S_ADD_U64), DestReg: Reg).addReg(RegNo: Reg).add(MO: Op));
2497
2498 finalizeBundle(MBB, FirstMI: Bundler.begin());
2499
2500 MI.eraseFromParent();
2501 break;
2502 }
2503 case AMDGPU::ENTER_STRICT_WWM: {
2504 // This only gets its own opcode so that SIPreAllocateWWMRegs can tell when
2505 // Whole Wave Mode is entered.
2506 MI.setDesc(get(Opcode: LMC.OrSaveExecOpc));
2507 break;
2508 }
2509 case AMDGPU::ENTER_STRICT_WQM: {
2510 // This only gets its own opcode so that SIPreAllocateWWMRegs can tell when
2511 // STRICT_WQM is entered.
2512 BuildMI(BB&: MBB, I&: MI, MIMD: DL, MCID: get(Opcode: LMC.MovOpc), DestReg: MI.getOperand(i: 0).getReg())
2513 .addReg(RegNo: LMC.ExecReg);
2514 BuildMI(BB&: MBB, I&: MI, MIMD: DL, MCID: get(Opcode: LMC.WQMOpc), DestReg: LMC.ExecReg).addReg(RegNo: LMC.ExecReg);
2515
2516 MI.eraseFromParent();
2517 break;
2518 }
2519 case AMDGPU::EXIT_STRICT_WWM:
2520 case AMDGPU::EXIT_STRICT_WQM: {
2521 // This only gets its own opcode so that SIPreAllocateWWMRegs can tell when
    // WWM/STRICT_WQM is exited.
2523 MI.setDesc(get(Opcode: LMC.MovOpc));
2524 break;
2525 }
2526 case AMDGPU::SI_RETURN: {
2527 const MachineFunction *MF = MBB.getParent();
2528 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
2529 const SIRegisterInfo *TRI = ST.getRegisterInfo();
2530 // Hiding the return address use with SI_RETURN may lead to extra kills in
2531 // the function and missing live-ins. We are fine in practice because callee
2532 // saved register handling ensures the register value is restored before
2533 // RET, but we need the undef flag here to appease the MachineVerifier
2534 // liveness checks.
2535 MachineInstrBuilder MIB =
2536 BuildMI(BB&: MBB, I&: MI, MIMD: DL, MCID: get(Opcode: AMDGPU::S_SETPC_B64_return))
2537 .addReg(RegNo: TRI->getReturnAddressReg(MF: *MF), Flags: RegState::Undef);
2538
2539 MIB.copyImplicitOps(OtherMI: MI);
2540 MI.eraseFromParent();
2541 break;
2542 }
2543
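  // The low 64 bits of the product are the same for signed and unsigned
  // 64 x 64 -> 64 multiplication, so both pseudos lower to S_MUL_U64.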
2544 case AMDGPU::S_MUL_U64_U32_PSEUDO:
2545 case AMDGPU::S_MUL_I64_I32_PSEUDO:
2546 MI.setDesc(get(Opcode: AMDGPU::S_MUL_U64));
2547 break;
2548
2549 case AMDGPU::S_GETPC_B64_pseudo:
2550 MI.setDesc(get(Opcode: AMDGPU::S_GETPC_B64));
2551 if (ST.hasGetPCZeroExtension()) {
2552 Register Dst = MI.getOperand(i: 0).getReg();
2553 Register DstHi = RI.getSubReg(Reg: Dst, Idx: AMDGPU::sub1);
2554 // Fix up hardware that does not sign-extend the 48-bit PC value by
2555 // inserting: s_sext_i32_i16 dsthi, dsthi
2556 BuildMI(BB&: MBB, I: std::next(x: MI.getIterator()), MIMD: DL, MCID: get(Opcode: AMDGPU::S_SEXT_I32_I16),
2557 DestReg: DstHi)
2558 .addReg(RegNo: DstHi);
2559 }
2560 break;
2561
2562 case AMDGPU::V_MAX_BF16_PSEUDO_e64: {
2563 assert(ST.hasBF16PackedInsts());
2564 MI.setDesc(get(Opcode: AMDGPU::V_PK_MAX_NUM_BF16));
2565 MI.addOperand(Op: MachineOperand::CreateImm(Val: 0)); // op_sel
2566 MI.addOperand(Op: MachineOperand::CreateImm(Val: 0)); // neg_lo
2567 MI.addOperand(Op: MachineOperand::CreateImm(Val: 0)); // neg_hi
2568 auto Op0 = getNamedOperand(MI, OperandName: AMDGPU::OpName::src0_modifiers);
2569 Op0->setImm(Op0->getImm() | SISrcMods::OP_SEL_1);
2570 auto Op1 = getNamedOperand(MI, OperandName: AMDGPU::OpName::src1_modifiers);
2571 Op1->setImm(Op1->getImm() | SISrcMods::OP_SEL_1);
2572 break;
2573 }
2574
2575 case AMDGPU::GET_STACK_BASE:
2576 // The stack starts at offset 0 unless we need to reserve some space at the
2577 // bottom.
2578 if (ST.getFrameLowering()->mayReserveScratchForCWSR(MF: *MBB.getParent())) {
2579 // When CWSR is used in dynamic VGPR mode, the trap handler needs to save
2580 // some of the VGPRs. The size of the required scratch space has already
2581 // been computed by prolog epilog insertion.
2582 const SIMachineFunctionInfo *MFI =
2583 MBB.getParent()->getInfo<SIMachineFunctionInfo>();
2584 unsigned VGPRSize = MFI->getScratchReservedForDynamicVGPRs();
2585 Register DestReg = MI.getOperand(i: 0).getReg();
2586 BuildMI(BB&: MBB, I&: MI, MIMD: DL, MCID: get(Opcode: AMDGPU::S_GETREG_B32), DestReg)
2587 .addImm(Val: AMDGPU::Hwreg::HwregEncoding::encode(
2588 Values: AMDGPU::Hwreg::ID_HW_ID2, Values: AMDGPU::Hwreg::OFFSET_ME_ID, Values: 2));
2589 // The MicroEngine ID is 0 for the graphics queue, and 1 or 2 for compute
2590 // (3 is unused, so we ignore it). Unfortunately, S_GETREG doesn't set
2591 // SCC, so we need to check for 0 manually.
2592 BuildMI(BB&: MBB, I&: MI, MIMD: DL, MCID: get(Opcode: AMDGPU::S_CMP_LG_U32)).addImm(Val: 0).addReg(RegNo: DestReg);
      // Change the implicit-def of SCC to an explicit use (but first remove
      // the dead flag if present).
2595 MI.getOperand(i: MI.getNumExplicitOperands()).setIsDead(false);
2596 MI.getOperand(i: MI.getNumExplicitOperands()).setIsUse();
2597 MI.setDesc(get(Opcode: AMDGPU::S_CMOVK_I32));
2598 MI.addOperand(Op: MachineOperand::CreateImm(Val: VGPRSize));
2599 } else {
2600 MI.setDesc(get(Opcode: AMDGPU::S_MOV_B32));
2601 MI.addOperand(Op: MachineOperand::CreateImm(Val: 0));
2602 MI.removeOperand(
2603 OpNo: MI.getNumExplicitOperands()); // Drop implicit def of SCC.
2604 }
2605 break;
2606 }
2607
2608 return true;
2609}
2610
2611void SIInstrInfo::reMaterialize(MachineBasicBlock &MBB,
2612 MachineBasicBlock::iterator I, Register DestReg,
2613 unsigned SubIdx, const MachineInstr &Orig,
2614 LaneBitmask UsedLanes) const {
2615
  // Try shrinking the instruction to rematerialize only the part needed for
  // the current context.
2618 // TODO: Handle more cases.
2619 unsigned Opcode = Orig.getOpcode();
2620 switch (Opcode) {
2621 case AMDGPU::S_MOV_B64:
2622 case AMDGPU::S_MOV_B64_IMM_PSEUDO: {
2623 if (SubIdx != 0)
2624 break;
2625
2626 if (!Orig.getOperand(i: 1).isImm())
2627 break;
2628
2629 // Shrink S_MOV_B64 to S_MOV_B32 when UsedLanes indicates only a single
2630 // 32-bit lane of the 64-bit value is live at the rematerialization point.
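    // E.g. if only %reg.sub1 of "S_MOV_B64 %reg, 0x100000002" is read at this
    // point, it suffices to rematerialize "S_MOV_B32 %reg.sub1, 1".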
2631 if (UsedLanes.all())
2632 break;
2633
2634 // Determine which half of the 64-bit immediate corresponds to the use.
2635 unsigned OrigSubReg = Orig.getOperand(i: 0).getSubReg();
2636 unsigned LoSubReg = RI.composeSubRegIndices(a: OrigSubReg, b: AMDGPU::sub0);
2637 unsigned HiSubReg = RI.composeSubRegIndices(a: OrigSubReg, b: AMDGPU::sub1);
2638
2639 bool NeedLo = (UsedLanes & RI.getSubRegIndexLaneMask(SubIdx: LoSubReg)).any();
2640 bool NeedHi = (UsedLanes & RI.getSubRegIndexLaneMask(SubIdx: HiSubReg)).any();
2641
2642 if (NeedLo && NeedHi)
2643 break;
2644
2645 int64_t Imm64 = Orig.getOperand(i: 1).getImm();
2646 int32_t Imm32 = NeedLo ? Lo_32(Value: Imm64) : Hi_32(Value: Imm64);
2647
2648 unsigned UseSubReg = NeedLo ? LoSubReg : HiSubReg;
2649
2650 // Emit S_MOV_B32 defining just the needed 32-bit subreg of DestReg.
2651 BuildMI(BB&: MBB, I, MIMD: Orig.getDebugLoc(), MCID: get(Opcode: AMDGPU::S_MOV_B32))
2652 .addReg(RegNo: DestReg, Flags: RegState::Define | RegState::Undef, SubReg: UseSubReg)
2653 .addImm(Val: Imm32);
2654 return;
2655 }
2656
2657 case AMDGPU::S_LOAD_DWORDX16_IMM:
2658 case AMDGPU::S_LOAD_DWORDX8_IMM: {
2659 if (SubIdx != 0)
2660 break;
2661
2662 if (I == MBB.end())
2663 break;
2664
2665 if (I->isBundled())
2666 break;
2667
2668 // Look for a single use of the register that is also a subreg.
2669 Register RegToFind = Orig.getOperand(i: 0).getReg();
2670 MachineOperand *UseMO = nullptr;
2671 for (auto &CandMO : I->operands()) {
2672 if (!CandMO.isReg() || CandMO.getReg() != RegToFind || CandMO.isDef())
2673 continue;
2674 if (UseMO) {
2675 UseMO = nullptr;
2676 break;
2677 }
2678 UseMO = &CandMO;
2679 }
2680 if (!UseMO || UseMO->getSubReg() == AMDGPU::NoSubRegister)
2681 break;
2682
2683 unsigned Offset = RI.getSubRegIdxOffset(Idx: UseMO->getSubReg());
2684 unsigned SubregSize = RI.getSubRegIdxSize(Idx: UseMO->getSubReg());
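    // Note Offset and SubregSize are in bits; the SMEM offset operand below
    // is adjusted in bytes, hence the division by 8.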
2685
2686 MachineFunction *MF = MBB.getParent();
2687 MachineRegisterInfo &MRI = MF->getRegInfo();
2688 assert(MRI.use_nodbg_empty(DestReg) && "DestReg should have no users yet.");
2689
2690 unsigned NewOpcode = -1;
2691 if (SubregSize == 256)
2692 NewOpcode = AMDGPU::S_LOAD_DWORDX8_IMM;
2693 else if (SubregSize == 128)
2694 NewOpcode = AMDGPU::S_LOAD_DWORDX4_IMM;
2695 else
2696 break;
2697
2698 const MCInstrDesc &TID = get(Opcode: NewOpcode);
2699 const TargetRegisterClass *NewRC =
2700 RI.getAllocatableClass(RC: getRegClass(MCID: TID, OpNum: 0));
2701 MRI.setRegClass(Reg: DestReg, RC: NewRC);
2702
2703 UseMO->setReg(DestReg);
2704 UseMO->setSubReg(AMDGPU::NoSubRegister);
2705
2706 // Use a smaller load with the desired size, possibly with updated offset.
2707 MachineInstr *MI = MF->CloneMachineInstr(Orig: &Orig);
2708 MI->setDesc(TID);
2709 MI->getOperand(i: 0).setReg(DestReg);
2710 MI->getOperand(i: 0).setSubReg(AMDGPU::NoSubRegister);
2711 if (Offset) {
2712 MachineOperand *OffsetMO = getNamedOperand(MI&: *MI, OperandName: AMDGPU::OpName::offset);
2713 int64_t FinalOffset = OffsetMO->getImm() + Offset / 8;
2714 OffsetMO->setImm(FinalOffset);
2715 }
2716 SmallVector<MachineMemOperand *> NewMMOs;
2717 for (const MachineMemOperand *MemOp : Orig.memoperands())
2718 NewMMOs.push_back(Elt: MF->getMachineMemOperand(MMO: MemOp, PtrInfo: MemOp->getPointerInfo(),
2719 Size: SubregSize / 8));
2720 MI->setMemRefs(MF&: *MF, MemRefs: NewMMOs);
2721
2722 MBB.insert(I, MI);
2723 return;
2724 }
2725
2726 default:
2727 break;
2728 }
2729
2730 TargetInstrInfo::reMaterialize(MBB, MI: I, DestReg, SubIdx, Orig, UsedLanes);
2731}
2732
2733std::pair<MachineInstr*, MachineInstr*>
2734SIInstrInfo::expandMovDPP64(MachineInstr &MI) const {
  assert(MI.getOpcode() == AMDGPU::V_MOV_B64_DPP_PSEUDO);
2736
2737 if (ST.hasMovB64() && ST.hasFeature(Feature: AMDGPU::FeatureDPALU_DPP) &&
2738 AMDGPU::isLegalDPALU_DPPControl(
2739 ST, DC: getNamedOperand(MI, OperandName: AMDGPU::OpName::dpp_ctrl)->getImm())) {
2740 MI.setDesc(get(Opcode: AMDGPU::V_MOV_B64_dpp));
2741 return std::pair(&MI, nullptr);
2742 }
2743
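  // No single-instruction expansion is possible: split into two
  // V_MOV_B32_dpp on sub0/sub1 and, for a virtual destination, recombine the
  // halves with a REG_SEQUENCE.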
2744 MachineBasicBlock &MBB = *MI.getParent();
2745 DebugLoc DL = MBB.findDebugLoc(MBBI: MI);
2746 MachineFunction *MF = MBB.getParent();
2747 MachineRegisterInfo &MRI = MF->getRegInfo();
2748 Register Dst = MI.getOperand(i: 0).getReg();
2749 unsigned Part = 0;
2750 MachineInstr *Split[2];
2751
2752 for (auto Sub : { AMDGPU::sub0, AMDGPU::sub1 }) {
2753 auto MovDPP = BuildMI(BB&: MBB, I&: MI, MIMD: DL, MCID: get(Opcode: AMDGPU::V_MOV_B32_dpp));
2754 if (Dst.isPhysical()) {
2755 MovDPP.addDef(RegNo: RI.getSubReg(Reg: Dst, Idx: Sub));
2756 } else {
2757 assert(MRI.isSSA());
2758 auto Tmp = MRI.createVirtualRegister(RegClass: &AMDGPU::VGPR_32RegClass);
2759 MovDPP.addDef(RegNo: Tmp);
2760 }
2761
2762 for (unsigned I = 1; I <= 2; ++I) { // old and src operands.
2763 const MachineOperand &SrcOp = MI.getOperand(i: I);
2764 assert(!SrcOp.isFPImm());
2765 if (SrcOp.isImm()) {
2766 APInt Imm(64, SrcOp.getImm());
2767 Imm.ashrInPlace(ShiftAmt: Part * 32);
2768 MovDPP.addImm(Val: Imm.getLoBits(numBits: 32).getZExtValue());
2769 } else {
2770 assert(SrcOp.isReg());
2771 Register Src = SrcOp.getReg();
2772 if (Src.isPhysical())
2773 MovDPP.addReg(RegNo: RI.getSubReg(Reg: Src, Idx: Sub));
2774 else
2775 MovDPP.addReg(RegNo: Src, Flags: getUndefRegState(B: SrcOp.isUndef()), SubReg: Sub);
2776 }
2777 }
2778
2779 for (const MachineOperand &MO : llvm::drop_begin(RangeOrContainer: MI.explicit_operands(), N: 3))
2780 MovDPP.addImm(Val: MO.getImm());
2781
2782 Split[Part] = MovDPP;
2783 ++Part;
2784 }
2785
2786 if (Dst.isVirtual())
2787 BuildMI(BB&: MBB, I&: MI, MIMD: DL, MCID: get(Opcode: AMDGPU::REG_SEQUENCE), DestReg: Dst)
2788 .addReg(RegNo: Split[0]->getOperand(i: 0).getReg())
2789 .addImm(Val: AMDGPU::sub0)
2790 .addReg(RegNo: Split[1]->getOperand(i: 0).getReg())
2791 .addImm(Val: AMDGPU::sub1);
2792
2793 MI.eraseFromParent();
2794 return std::pair(Split[0], Split[1]);
2795}
2796
2797std::optional<DestSourcePair>
2798SIInstrInfo::isCopyInstrImpl(const MachineInstr &MI) const {
2799 if (MI.getOpcode() == AMDGPU::WWM_COPY)
2800 return DestSourcePair{MI.getOperand(i: 0), MI.getOperand(i: 1)};
2801
2802 return std::nullopt;
2803}
2804
2805bool SIInstrInfo::swapSourceModifiers(MachineInstr &MI, MachineOperand &Src0,
2806 AMDGPU::OpName Src0OpName,
2807 MachineOperand &Src1,
2808 AMDGPU::OpName Src1OpName) const {
2809 MachineOperand *Src0Mods = getNamedOperand(MI, OperandName: Src0OpName);
2810 if (!Src0Mods)
2811 return false;
2812
2813 MachineOperand *Src1Mods = getNamedOperand(MI, OperandName: Src1OpName);
2814 assert(Src1Mods &&
2815 "All commutable instructions have both src0 and src1 modifiers");
2816
2817 int Src0ModsVal = Src0Mods->getImm();
2818 int Src1ModsVal = Src1Mods->getImm();
2819
2820 Src1Mods->setImm(Src0ModsVal);
2821 Src0Mods->setImm(Src1ModsVal);
2822 return true;
2823}
2824
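// Commute a register operand with an immediate, frame-index, or global
// operand in place. Returns nullptr (commute failed) for any other kind of
// non-register operand.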
2825static MachineInstr *swapRegAndNonRegOperand(MachineInstr &MI,
2826 MachineOperand &RegOp,
2827 MachineOperand &NonRegOp) {
2828 Register Reg = RegOp.getReg();
2829 unsigned SubReg = RegOp.getSubReg();
2830 bool IsKill = RegOp.isKill();
2831 bool IsDead = RegOp.isDead();
2832 bool IsUndef = RegOp.isUndef();
2833 bool IsDebug = RegOp.isDebug();
2834
2835 if (NonRegOp.isImm())
2836 RegOp.ChangeToImmediate(ImmVal: NonRegOp.getImm());
2837 else if (NonRegOp.isFI())
2838 RegOp.ChangeToFrameIndex(Idx: NonRegOp.getIndex());
2839 else if (NonRegOp.isGlobal()) {
2840 RegOp.ChangeToGA(GV: NonRegOp.getGlobal(), Offset: NonRegOp.getOffset(),
2841 TargetFlags: NonRegOp.getTargetFlags());
2842 } else
2843 return nullptr;
2844
2845 // Make sure we don't reinterpret a subreg index in the target flags.
2846 RegOp.setTargetFlags(NonRegOp.getTargetFlags());
2847
2848 NonRegOp.ChangeToRegister(Reg, isDef: false, isImp: false, isKill: IsKill, isDead: IsDead, isUndef: IsUndef, isDebug: IsDebug);
2849 NonRegOp.setSubReg(SubReg);
2850
2851 return &MI;
2852}
2853
2854static MachineInstr *swapImmOperands(MachineInstr &MI,
2855 MachineOperand &NonRegOp1,
2856 MachineOperand &NonRegOp2) {
2857 unsigned TargetFlags = NonRegOp1.getTargetFlags();
2858 int64_t NonRegVal = NonRegOp1.getImm();
2859
2860 NonRegOp1.setImm(NonRegOp2.getImm());
2861 NonRegOp2.setImm(NonRegVal);
2862 NonRegOp1.setTargetFlags(NonRegOp2.getTargetFlags());
2863 NonRegOp2.setTargetFlags(TargetFlags);
2864 return &MI;
2865}
2866
2867bool SIInstrInfo::isLegalToSwap(const MachineInstr &MI, unsigned OpIdx0,
2868 unsigned OpIdx1) const {
2869 const MCInstrDesc &InstDesc = MI.getDesc();
2870 const MCOperandInfo &OpInfo0 = InstDesc.operands()[OpIdx0];
2871 const MCOperandInfo &OpInfo1 = InstDesc.operands()[OpIdx1];
2872
2873 unsigned Opc = MI.getOpcode();
2874 int Src0Idx = AMDGPU::getNamedOperandIdx(Opcode: Opc, Name: AMDGPU::OpName::src0);
2875
2876 const MachineOperand &MO0 = MI.getOperand(i: OpIdx0);
2877 const MachineOperand &MO1 = MI.getOperand(i: OpIdx1);
2878
  // Check that the swap does not breach the constant bus or literal limits.
  // It may move a literal to a position other than src0, which is not allowed
  // pre-gfx10. However, most test cases need literals in src0 for VOP.
  // FIXME: After gfx9, a literal may be placed somewhere other than src0.
2883 if (isVALU(MI)) {
2884 if ((int)OpIdx0 == Src0Idx && !MO0.isReg() &&
2885 !isInlineConstant(MO: MO0, OpInfo: OpInfo1))
2886 return false;
2887 if ((int)OpIdx1 == Src0Idx && !MO1.isReg() &&
2888 !isInlineConstant(MO: MO1, OpInfo: OpInfo0))
2889 return false;
2890 }
2891
2892 if ((int)OpIdx1 != Src0Idx && MO0.isReg()) {
2893 if (OpInfo1.RegClass == -1)
2894 return OpInfo1.OperandType == MCOI::OPERAND_UNKNOWN;
2895 return isLegalRegOperand(MI, OpIdx: OpIdx1, MO: MO0) &&
2896 (!MO1.isReg() || isLegalRegOperand(MI, OpIdx: OpIdx0, MO: MO1));
2897 }
2898 if ((int)OpIdx0 != Src0Idx && MO1.isReg()) {
2899 if (OpInfo0.RegClass == -1)
2900 return OpInfo0.OperandType == MCOI::OPERAND_UNKNOWN;
2901 return (!MO0.isReg() || isLegalRegOperand(MI, OpIdx: OpIdx1, MO: MO0)) &&
2902 isLegalRegOperand(MI, OpIdx: OpIdx0, MO: MO1);
2903 }
2904
2905  // No need to check 64-bit literals, since swapping does not bring any new
2906  // 64-bit literals into the instruction that could be folded to 32 bits.
2907
2908 return isImmOperandLegal(MI, OpNo: OpIdx1, MO: MO0);
2909}
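
// Illustrative example (not from the original source): pre-gfx10, a VOP2 such
// as
//   v_add_f32_e32 v0, 0x40490fdb, v1
// keeps its single literal in src0. Commuting it would move the literal to
// src1, which those encodings cannot represent, so the swap above is rejected.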
2910
2911MachineInstr *SIInstrInfo::commuteInstructionImpl(MachineInstr &MI, bool NewMI,
2912 unsigned Src0Idx,
2913 unsigned Src1Idx) const {
2914 assert(!NewMI && "this should never be used");
2915
2916 unsigned Opc = MI.getOpcode();
2917 int CommutedOpcode = commuteOpcode(Opcode: Opc);
2918 if (CommutedOpcode == -1)
2919 return nullptr;
2920
2921 if (Src0Idx > Src1Idx)
2922 std::swap(a&: Src0Idx, b&: Src1Idx);
2923
2924 assert(AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0) ==
2925 static_cast<int>(Src0Idx) &&
2926 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1) ==
2927 static_cast<int>(Src1Idx) &&
2928 "inconsistency with findCommutedOpIndices");
2929
2930 if (!isLegalToSwap(MI, OpIdx0: Src0Idx, OpIdx1: Src1Idx))
2931 return nullptr;
2932
2933 MachineInstr *CommutedMI = nullptr;
2934 MachineOperand &Src0 = MI.getOperand(i: Src0Idx);
2935 MachineOperand &Src1 = MI.getOperand(i: Src1Idx);
2936 if (Src0.isReg() && Src1.isReg()) {
2937 // Be sure to copy the source modifiers to the right place.
2938 CommutedMI =
2939 TargetInstrInfo::commuteInstructionImpl(MI, NewMI, OpIdx1: Src0Idx, OpIdx2: Src1Idx);
2940 } else if (Src0.isReg() && !Src1.isReg()) {
2941 CommutedMI = swapRegAndNonRegOperand(MI, RegOp&: Src0, NonRegOp&: Src1);
2942 } else if (!Src0.isReg() && Src1.isReg()) {
2943 CommutedMI = swapRegAndNonRegOperand(MI, RegOp&: Src1, NonRegOp&: Src0);
2944 } else if (Src0.isImm() && Src1.isImm()) {
2945 CommutedMI = swapImmOperands(MI, NonRegOp1&: Src0, NonRegOp2&: Src1);
2946 } else {
2947    // FIXME: Found two non-register operands to commute. This does happen.
2948 return nullptr;
2949 }
2950
2951 if (CommutedMI) {
2952 swapSourceModifiers(MI, Src0, Src0OpName: AMDGPU::OpName::src0_modifiers,
2953 Src1, Src1OpName: AMDGPU::OpName::src1_modifiers);
2954
2955 swapSourceModifiers(MI, Src0, Src0OpName: AMDGPU::OpName::src0_sel, Src1,
2956 Src1OpName: AMDGPU::OpName::src1_sel);
2957
2958 CommutedMI->setDesc(get(Opcode: CommutedOpcode));
2959 }
2960
2961 return CommutedMI;
2962}
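
// Roughly, for an opcode with a *_rev form the whole commute looks like:
//   v_sub_f32_e32 v0, v1, v2  -->  v_subrev_f32_e32 v0, v2, v1
// commuteOpcode supplies the reversed opcode, while the swaps above exchange
// the two sources and their modifiers.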
2963
2964// This needs to be implemented because the source modifiers may be inserted
2965// between the true commutable operands, and the base
2966// TargetInstrInfo::commuteInstruction uses it.
2967bool SIInstrInfo::findCommutedOpIndices(const MachineInstr &MI,
2968 unsigned &SrcOpIdx0,
2969 unsigned &SrcOpIdx1) const {
2970 return findCommutedOpIndices(Desc: MI.getDesc(), SrcOpIdx0, SrcOpIdx1);
2971}
2972
2973bool SIInstrInfo::findCommutedOpIndices(const MCInstrDesc &Desc,
2974 unsigned &SrcOpIdx0,
2975 unsigned &SrcOpIdx1) const {
2976 if (!Desc.isCommutable())
2977 return false;
2978
2979 unsigned Opc = Desc.getOpcode();
2980 int Src0Idx = AMDGPU::getNamedOperandIdx(Opcode: Opc, Name: AMDGPU::OpName::src0);
2981 if (Src0Idx == -1)
2982 return false;
2983
2984 int Src1Idx = AMDGPU::getNamedOperandIdx(Opcode: Opc, Name: AMDGPU::OpName::src1);
2985 if (Src1Idx == -1)
2986 return false;
2987
2988 return fixCommutedOpIndices(ResultIdx1&: SrcOpIdx0, ResultIdx2&: SrcOpIdx1, CommutableOpIdx1: Src0Idx, CommutableOpIdx2: Src1Idx);
2989}
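
// e.g. in a VOP3 instruction the operand order is typically
//   vdst, src0_modifiers, src0, src1_modifiers, src1, ...
// so src0 and src1 sit at operand indices 2 and 4 rather than 1 and 2, which
// is why the named-operand lookups above are needed.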
2990
2991bool SIInstrInfo::isBranchOffsetInRange(unsigned BranchOp,
2992 int64_t BrOffset) const {
2993 // BranchRelaxation should never have to check s_setpc_b64 or s_add_pc_i64
2994 // because its dest block is unanalyzable.
2995 assert(isSOPP(BranchOp) || isSOPK(BranchOp));
2996
2997 // Convert to dwords.
2998 BrOffset /= 4;
2999
3000 // The branch instructions do PC += signext(SIMM16 * 4) + 4, so the offset is
3001 // from the next instruction.
3002 BrOffset -= 1;
3003
3004 return isIntN(N: BranchOffsetBits, x: BrOffset);
3005}
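
// Worked example for the check above, with the default 16-bit offset field: a
// forward branch of 131072 bytes becomes 131072/4 - 1 = 32767 dwords, which
// still fits in a signed 16-bit immediate, while 131076 bytes gives 32768 and
// does not.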
3006
3007MachineBasicBlock *
3008SIInstrInfo::getBranchDestBlock(const MachineInstr &MI) const {
3009 return MI.getOperand(i: 0).getMBB();
3010}
3011
3012bool SIInstrInfo::hasDivergentBranch(const MachineBasicBlock *MBB) const {
3013 for (const MachineInstr &MI : MBB->terminators()) {
3014 if (MI.getOpcode() == AMDGPU::SI_IF || MI.getOpcode() == AMDGPU::SI_ELSE ||
3015 MI.getOpcode() == AMDGPU::SI_LOOP)
3016 return true;
3017 }
3018 return false;
3019}
3020
3021void SIInstrInfo::insertIndirectBranch(MachineBasicBlock &MBB,
3022 MachineBasicBlock &DestBB,
3023 MachineBasicBlock &RestoreBB,
3024 const DebugLoc &DL, int64_t BrOffset,
3025 RegScavenger *RS) const {
3026 assert(MBB.empty() &&
3027 "new block should be inserted for expanding unconditional branch");
3028 assert(MBB.pred_size() == 1);
3029 assert(RestoreBB.empty() &&
3030 "restore block should be inserted for restoring clobbered registers");
3031
3032 MachineFunction *MF = MBB.getParent();
3033 MachineRegisterInfo &MRI = MF->getRegInfo();
3034 const SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
3035 auto I = MBB.end();
3036 auto &MCCtx = MF->getContext();
3037
3038 if (ST.useAddPC64Inst()) {
3039 MCSymbol *Offset =
3040 MCCtx.createTempSymbol(Name: "offset", /*AlwaysAddSuffix=*/true);
3041 auto AddPC = BuildMI(BB&: MBB, I, MIMD: DL, MCID: get(Opcode: AMDGPU::S_ADD_PC_I64))
3042 .addSym(Sym: Offset, TargetFlags: MO_FAR_BRANCH_OFFSET);
3043 MCSymbol *PostAddPCLabel =
3044 MCCtx.createTempSymbol(Name: "post_addpc", /*AlwaysAddSuffix=*/true);
3045 AddPC->setPostInstrSymbol(MF&: *MF, Symbol: PostAddPCLabel);
3046 auto *OffsetExpr = MCBinaryExpr::createSub(
3047 LHS: MCSymbolRefExpr::create(Symbol: DestBB.getSymbol(), Ctx&: MCCtx),
3048 RHS: MCSymbolRefExpr::create(Symbol: PostAddPCLabel, Ctx&: MCCtx), Ctx&: MCCtx);
3049 Offset->setVariableValue(OffsetExpr);
3050 return;
3051 }
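
  // For illustration, the s_add_pc_i64 path above expands to just:
  //   s_add_pc_i64 (dest_bb - post_addpc)
  // post_addpc:
  // with the 64-bit offset resolved when the MC expression is evaluated.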
3052
3053 assert(RS && "RegScavenger required for long branching");
3054
3055 // FIXME: Virtual register workaround for RegScavenger not working with empty
3056 // blocks.
3057 Register PCReg = MRI.createVirtualRegister(RegClass: &AMDGPU::SReg_64RegClass);
3058
3059  // Note: as this runs after the hazard recognizer, we need to apply some
3060  // hazard workarounds directly.
3061 const bool FlushSGPRWrites = (ST.isWave64() && ST.hasVALUMaskWriteHazard()) ||
3062 ST.hasVALUReadSGPRHazard();
3063 auto ApplyHazardWorkarounds = [this, &MBB, &I, &DL, FlushSGPRWrites]() {
3064 if (FlushSGPRWrites)
3065 BuildMI(BB&: MBB, I, MIMD: DL, MCID: get(Opcode: AMDGPU::S_WAITCNT_DEPCTR))
3066 .addImm(Val: AMDGPU::DepCtr::encodeFieldSaSdst(SaSdst: 0, STI: ST));
3067 };
3068
3069  // We need to compute the offset relative to the instruction immediately
3070  // after s_getpc_b64. Insert the PC arithmetic code before the last terminator.
3071 MachineInstr *GetPC = BuildMI(BB&: MBB, I, MIMD: DL, MCID: get(Opcode: AMDGPU::S_GETPC_B64), DestReg: PCReg);
3072 ApplyHazardWorkarounds();
3073
3074 MCSymbol *PostGetPCLabel =
3075 MCCtx.createTempSymbol(Name: "post_getpc", /*AlwaysAddSuffix=*/true);
3076 GetPC->setPostInstrSymbol(MF&: *MF, Symbol: PostGetPCLabel);
3077
3078 MCSymbol *OffsetLo =
3079 MCCtx.createTempSymbol(Name: "offset_lo", /*AlwaysAddSuffix=*/true);
3080 MCSymbol *OffsetHi =
3081 MCCtx.createTempSymbol(Name: "offset_hi", /*AlwaysAddSuffix=*/true);
3082 BuildMI(BB&: MBB, I, MIMD: DL, MCID: get(Opcode: AMDGPU::S_ADD_U32))
3083 .addReg(RegNo: PCReg, Flags: RegState::Define, SubReg: AMDGPU::sub0)
3084 .addReg(RegNo: PCReg, Flags: {}, SubReg: AMDGPU::sub0)
3085 .addSym(Sym: OffsetLo, TargetFlags: MO_FAR_BRANCH_OFFSET);
3086 BuildMI(BB&: MBB, I, MIMD: DL, MCID: get(Opcode: AMDGPU::S_ADDC_U32))
3087 .addReg(RegNo: PCReg, Flags: RegState::Define, SubReg: AMDGPU::sub1)
3088 .addReg(RegNo: PCReg, Flags: {}, SubReg: AMDGPU::sub1)
3089 .addSym(Sym: OffsetHi, TargetFlags: MO_FAR_BRANCH_OFFSET);
3090 ApplyHazardWorkarounds();
3091
3092 // Insert the indirect branch after the other terminator.
3093 BuildMI(BB: &MBB, MIMD: DL, MCID: get(Opcode: AMDGPU::S_SETPC_B64))
3094 .addReg(RegNo: PCReg);
3095
3096 // If a spill is needed for the pc register pair, we need to insert a spill
3097 // restore block right before the destination block, and insert a short branch
3098 // into the old destination block's fallthrough predecessor.
3099 // e.g.:
3100 //
3101 // s_cbranch_scc0 skip_long_branch:
3102 //
3103 // long_branch_bb:
3104 // spill s[8:9]
3105 // s_getpc_b64 s[8:9]
3106 // s_add_u32 s8, s8, restore_bb
3107 // s_addc_u32 s9, s9, 0
3108 // s_setpc_b64 s[8:9]
3109 //
3110 // skip_long_branch:
3111 // foo;
3112 //
3113 // .....
3114 //
3115 // dest_bb_fallthrough_predecessor:
3116 // bar;
3117 // s_branch dest_bb
3118 //
3119 // restore_bb:
3120 // restore s[8:9]
3121 // fallthrough dest_bb
3122  //
3123 // dest_bb:
3124 // buzz;
3125
3126 Register LongBranchReservedReg = MFI->getLongBranchReservedReg();
3127 Register Scav;
3128
3129  // If we've previously reserved a register for long branches, avoid running
3130  // the scavenger and just use that register.
3131 if (LongBranchReservedReg) {
3132 RS->enterBasicBlock(MBB);
3133 Scav = LongBranchReservedReg;
3134 } else {
3135 RS->enterBasicBlockEnd(MBB);
3136 Scav = RS->scavengeRegisterBackwards(
3137 RC: AMDGPU::SReg_64RegClass, To: MachineBasicBlock::iterator(GetPC),
3138 /* RestoreAfter */ false, SPAdj: 0, /* AllowSpill */ false);
3139 }
3140 if (Scav) {
3141 RS->setRegUsed(Reg: Scav);
3142 MRI.replaceRegWith(FromReg: PCReg, ToReg: Scav);
3143 MRI.clearVirtRegs();
3144 } else {
3145    // As an SGPR spill requires a VGPR, we reuse the slot of the temporary
3146    // VGPR for the SGPR spill.
3147 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
3148 const SIRegisterInfo *TRI = ST.getRegisterInfo();
3149 TRI->spillEmergencySGPR(MI: GetPC, RestoreMBB&: RestoreBB, SGPR: AMDGPU::SGPR0_SGPR1, RS);
3150 MRI.replaceRegWith(FromReg: PCReg, ToReg: AMDGPU::SGPR0_SGPR1);
3151 MRI.clearVirtRegs();
3152 }
3153
3154 MCSymbol *DestLabel = Scav ? DestBB.getSymbol() : RestoreBB.getSymbol();
3155  // Now the offset expression can be defined.
3156 auto *Offset = MCBinaryExpr::createSub(
3157 LHS: MCSymbolRefExpr::create(Symbol: DestLabel, Ctx&: MCCtx),
3158 RHS: MCSymbolRefExpr::create(Symbol: PostGetPCLabel, Ctx&: MCCtx), Ctx&: MCCtx);
3159 // Add offset assignments.
3160 auto *Mask = MCConstantExpr::create(Value: 0xFFFFFFFFULL, Ctx&: MCCtx);
3161 OffsetLo->setVariableValue(MCBinaryExpr::createAnd(LHS: Offset, RHS: Mask, Ctx&: MCCtx));
3162 auto *ShAmt = MCConstantExpr::create(Value: 32, Ctx&: MCCtx);
3163 OffsetHi->setVariableValue(MCBinaryExpr::createAShr(LHS: Offset, RHS: ShAmt, Ctx&: MCCtx));
3164}
3165
3166unsigned SIInstrInfo::getBranchOpcode(SIInstrInfo::BranchPredicate Cond) {
3167 switch (Cond) {
3168 case SIInstrInfo::SCC_TRUE:
3169 return AMDGPU::S_CBRANCH_SCC1;
3170 case SIInstrInfo::SCC_FALSE:
3171 return AMDGPU::S_CBRANCH_SCC0;
3172 case SIInstrInfo::VCCNZ:
3173 return AMDGPU::S_CBRANCH_VCCNZ;
3174 case SIInstrInfo::VCCZ:
3175 return AMDGPU::S_CBRANCH_VCCZ;
3176 case SIInstrInfo::EXECNZ:
3177 return AMDGPU::S_CBRANCH_EXECNZ;
3178 case SIInstrInfo::EXECZ:
3179 return AMDGPU::S_CBRANCH_EXECZ;
3180 default:
3181 llvm_unreachable("invalid branch predicate");
3182 }
3183}
3184
3185SIInstrInfo::BranchPredicate SIInstrInfo::getBranchPredicate(unsigned Opcode) {
3186 switch (Opcode) {
3187 case AMDGPU::S_CBRANCH_SCC0:
3188 return SCC_FALSE;
3189 case AMDGPU::S_CBRANCH_SCC1:
3190 return SCC_TRUE;
3191 case AMDGPU::S_CBRANCH_VCCNZ:
3192 return VCCNZ;
3193 case AMDGPU::S_CBRANCH_VCCZ:
3194 return VCCZ;
3195 case AMDGPU::S_CBRANCH_EXECNZ:
3196 return EXECNZ;
3197 case AMDGPU::S_CBRANCH_EXECZ:
3198 return EXECZ;
3199 default:
3200 return INVALID_BR;
3201 }
3202}
3203
3204bool SIInstrInfo::analyzeBranchImpl(MachineBasicBlock &MBB,
3205 MachineBasicBlock::iterator I,
3206 MachineBasicBlock *&TBB,
3207 MachineBasicBlock *&FBB,
3208 SmallVectorImpl<MachineOperand> &Cond,
3209 bool AllowModify) const {
3210 if (I->getOpcode() == AMDGPU::S_BRANCH) {
3211 // Unconditional Branch
3212 TBB = I->getOperand(i: 0).getMBB();
3213 return false;
3214 }
3215
3216 BranchPredicate Pred = getBranchPredicate(Opcode: I->getOpcode());
3217 if (Pred == INVALID_BR)
3218 return true;
3219
3220 MachineBasicBlock *CondBB = I->getOperand(i: 0).getMBB();
3221 Cond.push_back(Elt: MachineOperand::CreateImm(Val: Pred));
3222 Cond.push_back(Elt: I->getOperand(i: 1)); // Save the branch register.
3223
3224 ++I;
3225
3226 if (I == MBB.end()) {
3227 // Conditional branch followed by fall-through.
3228 TBB = CondBB;
3229 return false;
3230 }
3231
3232 if (I->getOpcode() == AMDGPU::S_BRANCH) {
3233 TBB = CondBB;
3234 FBB = I->getOperand(i: 0).getMBB();
3235 return false;
3236 }
3237
3238 return true;
3239}
3240
3241bool SIInstrInfo::analyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB,
3242 MachineBasicBlock *&FBB,
3243 SmallVectorImpl<MachineOperand> &Cond,
3244 bool AllowModify) const {
3245 MachineBasicBlock::iterator I = MBB.getFirstTerminator();
3246 auto E = MBB.end();
3247 if (I == E)
3248 return false;
3249
3250  // Skip over instructions that are artificial terminators for special exec
3251  // management.
3252 while (I != E && !I->isBranch() && !I->isReturn()) {
3253 switch (I->getOpcode()) {
3254 case AMDGPU::S_MOV_B64_term:
3255 case AMDGPU::S_XOR_B64_term:
3256 case AMDGPU::S_OR_B64_term:
3257 case AMDGPU::S_ANDN2_B64_term:
3258 case AMDGPU::S_AND_B64_term:
3259 case AMDGPU::S_AND_SAVEEXEC_B64_term:
3260 case AMDGPU::S_MOV_B32_term:
3261 case AMDGPU::S_XOR_B32_term:
3262 case AMDGPU::S_OR_B32_term:
3263 case AMDGPU::S_ANDN2_B32_term:
3264 case AMDGPU::S_AND_B32_term:
3265 case AMDGPU::S_AND_SAVEEXEC_B32_term:
3266 break;
3267 case AMDGPU::SI_IF:
3268 case AMDGPU::SI_ELSE:
3269 case AMDGPU::SI_KILL_I1_TERMINATOR:
3270 case AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR:
3271 // FIXME: It's messy that these need to be considered here at all.
3272 return true;
3273 default:
3274 llvm_unreachable("unexpected non-branch terminator inst");
3275 }
3276
3277 ++I;
3278 }
3279
3280 if (I == E)
3281 return false;
3282
3283 return analyzeBranchImpl(MBB, I, TBB, FBB, Cond, AllowModify);
3284}
3285
3286unsigned SIInstrInfo::removeBranch(MachineBasicBlock &MBB,
3287 int *BytesRemoved) const {
3288 unsigned Count = 0;
3289 unsigned RemovedSize = 0;
3290 for (MachineInstr &MI : llvm::make_early_inc_range(Range: MBB.terminators())) {
3291 // Skip over artificial terminators when removing instructions.
3292 if (MI.isBranch() || MI.isReturn()) {
3293 RemovedSize += getInstSizeInBytes(MI);
3294 MI.eraseFromParent();
3295 ++Count;
3296 }
3297 }
3298
3299 if (BytesRemoved)
3300 *BytesRemoved = RemovedSize;
3301
3302 return Count;
3303}
3304
3305// Copy the flags onto the implicit condition register operand.
3306static void preserveCondRegFlags(MachineOperand &CondReg,
3307 const MachineOperand &OrigCond) {
3308 CondReg.setIsUndef(OrigCond.isUndef());
3309 CondReg.setIsKill(OrigCond.isKill());
3310}
3311
3312unsigned SIInstrInfo::insertBranch(MachineBasicBlock &MBB,
3313 MachineBasicBlock *TBB,
3314 MachineBasicBlock *FBB,
3315 ArrayRef<MachineOperand> Cond,
3316 const DebugLoc &DL,
3317 int *BytesAdded) const {
3318 if (!FBB && Cond.empty()) {
3319 BuildMI(BB: &MBB, MIMD: DL, MCID: get(Opcode: AMDGPU::S_BRANCH))
3320 .addMBB(MBB: TBB);
3321 if (BytesAdded)
3322 *BytesAdded = ST.hasOffset3fBug() ? 8 : 4;
3323 return 1;
3324 }
3325
3326 assert(TBB && Cond[0].isImm());
3327
3328 unsigned Opcode
3329 = getBranchOpcode(Cond: static_cast<BranchPredicate>(Cond[0].getImm()));
3330
3331 if (!FBB) {
3332 MachineInstr *CondBr =
3333 BuildMI(BB: &MBB, MIMD: DL, MCID: get(Opcode))
3334 .addMBB(MBB: TBB);
3335
3336 // Copy the flags onto the implicit condition register operand.
3337 preserveCondRegFlags(CondReg&: CondBr->getOperand(i: 1), OrigCond: Cond[1]);
3338 fixImplicitOperands(MI&: *CondBr);
3339
3340 if (BytesAdded)
3341 *BytesAdded = ST.hasOffset3fBug() ? 8 : 4;
3342 return 1;
3343 }
3344
3345 assert(TBB && FBB);
3346
3347 MachineInstr *CondBr =
3348 BuildMI(BB: &MBB, MIMD: DL, MCID: get(Opcode))
3349 .addMBB(MBB: TBB);
3350 fixImplicitOperands(MI&: *CondBr);
3351 BuildMI(BB: &MBB, MIMD: DL, MCID: get(Opcode: AMDGPU::S_BRANCH))
3352 .addMBB(MBB: FBB);
3353
3354 MachineOperand &CondReg = CondBr->getOperand(i: 1);
3355 CondReg.setIsUndef(Cond[1].isUndef());
3356 CondReg.setIsKill(Cond[1].isKill());
3357
3358 if (BytesAdded)
3359 *BytesAdded = ST.hasOffset3fBug() ? 16 : 8;
3360
3361 return 2;
3362}
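
// For illustration, the two-branch case above emits:
//   s_cbranch_scc1 TBB
//   s_branch FBB
// i.e. two 4-byte instructions, counted as 8 bytes each when the offset-0x3f
// hardware bug workaround applies.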
3363
3364bool SIInstrInfo::reverseBranchCondition(
3365 SmallVectorImpl<MachineOperand> &Cond) const {
3366 if (Cond.size() != 2) {
3367 return true;
3368 }
3369
3370 if (Cond[0].isImm()) {
3371 Cond[0].setImm(-Cond[0].getImm());
3372 return false;
3373 }
3374
3375 return true;
3376}
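
// Negating the immediate above works because BranchPredicate encodes each
// predicate and its inverse as a +/- pair (e.g. SCC_TRUE and SCC_FALSE negate
// to each other), which insertSelect below also relies on.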
3377
3378bool SIInstrInfo::canInsertSelect(const MachineBasicBlock &MBB,
3379 ArrayRef<MachineOperand> Cond,
3380 Register DstReg, Register TrueReg,
3381 Register FalseReg, int &CondCycles,
3382 int &TrueCycles, int &FalseCycles) const {
3383 switch (Cond[0].getImm()) {
3384 case VCCNZ:
3385 case VCCZ: {
3386 const MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
3387 const TargetRegisterClass *RC = MRI.getRegClass(Reg: TrueReg);
3388 if (MRI.getRegClass(Reg: FalseReg) != RC)
3389 return false;
3390
3391 int NumInsts = AMDGPU::getRegBitWidth(RC: *RC) / 32;
3392 CondCycles = TrueCycles = FalseCycles = NumInsts; // ???
3393
3394 // Limit to equal cost for branch vs. N v_cndmask_b32s.
3395 return RI.hasVGPRs(RC) && NumInsts <= 6;
3396 }
3397 case SCC_TRUE:
3398 case SCC_FALSE: {
3399 // FIXME: We could insert for VGPRs if we could replace the original compare
3400 // with a vector one.
3401 const MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
3402 const TargetRegisterClass *RC = MRI.getRegClass(Reg: TrueReg);
3403 if (MRI.getRegClass(Reg: FalseReg) != RC)
3404 return false;
3405
3406 int NumInsts = AMDGPU::getRegBitWidth(RC: *RC) / 32;
3407
3408    // Sizes that are a multiple of 64 bits can use s_cselect_b64.
3409 if (NumInsts % 2 == 0)
3410 NumInsts /= 2;
3411
3412 CondCycles = TrueCycles = FalseCycles = NumInsts; // ???
3413 return RI.isSGPRClass(RC);
3414 }
3415 default:
3416 return false;
3417 }
3418}
3419
3420void SIInstrInfo::insertSelect(MachineBasicBlock &MBB,
3421 MachineBasicBlock::iterator I, const DebugLoc &DL,
3422 Register DstReg, ArrayRef<MachineOperand> Cond,
3423 Register TrueReg, Register FalseReg) const {
3424 BranchPredicate Pred = static_cast<BranchPredicate>(Cond[0].getImm());
3425 if (Pred == VCCZ || Pred == SCC_FALSE) {
3426 Pred = static_cast<BranchPredicate>(-Pred);
3427 std::swap(a&: TrueReg, b&: FalseReg);
3428 }
3429
3430 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
3431 const TargetRegisterClass *DstRC = MRI.getRegClass(Reg: DstReg);
3432 unsigned DstSize = RI.getRegSizeInBits(RC: *DstRC);
3433
3434 if (DstSize == 32) {
3435 MachineInstr *Select;
3436 if (Pred == SCC_TRUE) {
3437 Select = BuildMI(BB&: MBB, I, MIMD: DL, MCID: get(Opcode: AMDGPU::S_CSELECT_B32), DestReg: DstReg)
3438 .addReg(RegNo: TrueReg)
3439 .addReg(RegNo: FalseReg);
3440 } else {
3441 // Instruction's operands are backwards from what is expected.
3442 Select = BuildMI(BB&: MBB, I, MIMD: DL, MCID: get(Opcode: AMDGPU::V_CNDMASK_B32_e32), DestReg: DstReg)
3443 .addReg(RegNo: FalseReg)
3444 .addReg(RegNo: TrueReg);
3445 }
3446
3447 preserveCondRegFlags(CondReg&: Select->getOperand(i: 3), OrigCond: Cond[1]);
3448 return;
3449 }
3450
3451 if (DstSize == 64 && Pred == SCC_TRUE) {
3452 MachineInstr *Select =
3453 BuildMI(BB&: MBB, I, MIMD: DL, MCID: get(Opcode: AMDGPU::S_CSELECT_B64), DestReg: DstReg)
3454 .addReg(RegNo: TrueReg)
3455 .addReg(RegNo: FalseReg);
3456
3457 preserveCondRegFlags(CondReg&: Select->getOperand(i: 3), OrigCond: Cond[1]);
3458 return;
3459 }
3460
3461 static const int16_t Sub0_15[] = {
3462 AMDGPU::sub0, AMDGPU::sub1, AMDGPU::sub2, AMDGPU::sub3,
3463 AMDGPU::sub4, AMDGPU::sub5, AMDGPU::sub6, AMDGPU::sub7,
3464 AMDGPU::sub8, AMDGPU::sub9, AMDGPU::sub10, AMDGPU::sub11,
3465 AMDGPU::sub12, AMDGPU::sub13, AMDGPU::sub14, AMDGPU::sub15,
3466 };
3467
3468 static const int16_t Sub0_15_64[] = {
3469 AMDGPU::sub0_sub1, AMDGPU::sub2_sub3,
3470 AMDGPU::sub4_sub5, AMDGPU::sub6_sub7,
3471 AMDGPU::sub8_sub9, AMDGPU::sub10_sub11,
3472 AMDGPU::sub12_sub13, AMDGPU::sub14_sub15,
3473 };
3474
3475 unsigned SelOp = AMDGPU::V_CNDMASK_B32_e32;
3476 const TargetRegisterClass *EltRC = &AMDGPU::VGPR_32RegClass;
3477 const int16_t *SubIndices = Sub0_15;
3478 int NElts = DstSize / 32;
3479
3480 // 64-bit select is only available for SALU.
3481 // TODO: Split 96-bit into 64-bit and 32-bit, not 3x 32-bit.
3482 if (Pred == SCC_TRUE) {
3483 if (NElts % 2) {
3484 SelOp = AMDGPU::S_CSELECT_B32;
3485 EltRC = &AMDGPU::SGPR_32RegClass;
3486 } else {
3487 SelOp = AMDGPU::S_CSELECT_B64;
3488 EltRC = &AMDGPU::SGPR_64RegClass;
3489 SubIndices = Sub0_15_64;
3490 NElts /= 2;
3491 }
3492 }
3493
3494 MachineInstrBuilder MIB = BuildMI(
3495 BB&: MBB, I, MIMD: DL, MCID: get(Opcode: AMDGPU::REG_SEQUENCE), DestReg: DstReg);
3496
3497 I = MIB->getIterator();
3498
3499 SmallVector<Register, 8> Regs;
3500 for (int Idx = 0; Idx != NElts; ++Idx) {
3501 Register DstElt = MRI.createVirtualRegister(RegClass: EltRC);
3502 Regs.push_back(Elt: DstElt);
3503
3504 unsigned SubIdx = SubIndices[Idx];
3505
3506 MachineInstr *Select;
3507 if (SelOp == AMDGPU::V_CNDMASK_B32_e32) {
3508 Select = BuildMI(BB&: MBB, I, MIMD: DL, MCID: get(Opcode: SelOp), DestReg: DstElt)
3509 .addReg(RegNo: FalseReg, Flags: {}, SubReg: SubIdx)
3510 .addReg(RegNo: TrueReg, Flags: {}, SubReg: SubIdx);
3511 } else {
3512 Select = BuildMI(BB&: MBB, I, MIMD: DL, MCID: get(Opcode: SelOp), DestReg: DstElt)
3513 .addReg(RegNo: TrueReg, Flags: {}, SubReg: SubIdx)
3514 .addReg(RegNo: FalseReg, Flags: {}, SubReg: SubIdx);
3515 }
3516
3517 preserveCondRegFlags(CondReg&: Select->getOperand(i: 3), OrigCond: Cond[1]);
3518 fixImplicitOperands(MI&: *Select);
3519
3520 MIB.addReg(RegNo: DstElt)
3521 .addImm(Val: SubIdx);
3522 }
3523}
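
// For illustration, a 64-bit VGPR select on VCC expands to roughly:
//   v_cndmask_b32_e32 v{DstLo}, v{FalseLo}, v{TrueLo}, vcc
//   v_cndmask_b32_e32 v{DstHi}, v{FalseHi}, v{TrueHi}, vcc
// with the two halves glued back together by the REG_SEQUENCE built above.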
3524
3525bool SIInstrInfo::isFoldableCopy(const MachineInstr &MI) {
3526 switch (MI.getOpcode()) {
3527 case AMDGPU::V_MOV_B16_t16_e32:
3528 case AMDGPU::V_MOV_B16_t16_e64:
3529 case AMDGPU::V_MOV_B32_e32:
3530 case AMDGPU::V_MOV_B32_e64:
3531 case AMDGPU::V_MOV_B64_PSEUDO:
3532 case AMDGPU::V_MOV_B64_e32:
3533 case AMDGPU::V_MOV_B64_e64:
3534 case AMDGPU::S_MOV_B32:
3535 case AMDGPU::S_MOV_B64:
3536 case AMDGPU::S_MOV_B64_IMM_PSEUDO:
3537 case AMDGPU::COPY:
3538 case AMDGPU::WWM_COPY:
3539 case AMDGPU::V_ACCVGPR_WRITE_B32_e64:
3540 case AMDGPU::V_ACCVGPR_READ_B32_e64:
3541 case AMDGPU::V_ACCVGPR_MOV_B32:
3542 case AMDGPU::AV_MOV_B32_IMM_PSEUDO:
3543 case AMDGPU::AV_MOV_B64_IMM_PSEUDO:
3544 return true;
3545 default:
3546 return false;
3547 }
3548}
3549
3550unsigned SIInstrInfo::getFoldableCopySrcIdx(const MachineInstr &MI) {
3551 switch (MI.getOpcode()) {
3552 case AMDGPU::V_MOV_B16_t16_e32:
3553 case AMDGPU::V_MOV_B16_t16_e64:
3554 return 2;
3555 case AMDGPU::V_MOV_B32_e32:
3556 case AMDGPU::V_MOV_B32_e64:
3557 case AMDGPU::V_MOV_B64_PSEUDO:
3558 case AMDGPU::V_MOV_B64_e32:
3559 case AMDGPU::V_MOV_B64_e64:
3560 case AMDGPU::S_MOV_B32:
3561 case AMDGPU::S_MOV_B64:
3562 case AMDGPU::S_MOV_B64_IMM_PSEUDO:
3563 case AMDGPU::COPY:
3564 case AMDGPU::WWM_COPY:
3565 case AMDGPU::V_ACCVGPR_WRITE_B32_e64:
3566 case AMDGPU::V_ACCVGPR_READ_B32_e64:
3567 case AMDGPU::V_ACCVGPR_MOV_B32:
3568 case AMDGPU::AV_MOV_B32_IMM_PSEUDO:
3569 case AMDGPU::AV_MOV_B64_IMM_PSEUDO:
3570 return 1;
3571 default:
3572 llvm_unreachable("MI is not a foldable copy");
3573 }
3574}
3575
3576static constexpr AMDGPU::OpName ModifierOpNames[] = {
3577 AMDGPU::OpName::src0_modifiers, AMDGPU::OpName::src1_modifiers,
3578 AMDGPU::OpName::src2_modifiers, AMDGPU::OpName::clamp,
3579 AMDGPU::OpName::omod, AMDGPU::OpName::op_sel};
3580
3581void SIInstrInfo::removeModOperands(MachineInstr &MI) const {
3582 unsigned Opc = MI.getOpcode();
3583 for (AMDGPU::OpName Name : reverse(C: ModifierOpNames)) {
3584 int Idx = AMDGPU::getNamedOperandIdx(Opcode: Opc, Name);
3585 if (Idx >= 0)
3586 MI.removeOperand(OpNo: Idx);
3587 }
3588}
3589
3590void SIInstrInfo::mutateAndCleanupImplicit(MachineInstr &MI,
3591 const MCInstrDesc &NewDesc) const {
3592 MI.setDesc(NewDesc);
3593
3594 // Remove any leftover implicit operands from mutating the instruction. e.g.
3595 // if we replace an s_and_b32 with a copy, we don't need the implicit scc def
3596 // anymore.
3597 const MCInstrDesc &Desc = MI.getDesc();
3598 unsigned NumOps = Desc.getNumOperands() + Desc.implicit_uses().size() +
3599 Desc.implicit_defs().size();
3600
3601 for (unsigned I = MI.getNumOperands() - 1; I >= NumOps; --I)
3602 MI.removeOperand(OpNo: I);
3603}
3604
3605std::optional<int64_t> SIInstrInfo::extractSubregFromImm(int64_t Imm,
3606 unsigned SubRegIndex) {
3607 switch (SubRegIndex) {
3608 case AMDGPU::NoSubRegister:
3609 return Imm;
3610 case AMDGPU::sub0:
3611 return SignExtend64<32>(x: Imm);
3612 case AMDGPU::sub1:
3613 return SignExtend64<32>(x: Imm >> 32);
3614 case AMDGPU::lo16:
3615 return SignExtend64<16>(x: Imm);
3616 case AMDGPU::hi16:
3617 return SignExtend64<16>(x: Imm >> 16);
3618 case AMDGPU::sub1_lo16:
3619 return SignExtend64<16>(x: Imm >> 32);
3620 case AMDGPU::sub1_hi16:
3621 return SignExtend64<16>(x: Imm >> 48);
3622 default:
3623 return std::nullopt;
3624 }
3625
3626 llvm_unreachable("covered subregister switch");
3627}
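
// Worked example for the extraction above: for Imm = 0x1234567880000001,
//   sub0 -> 0xffffffff80000001 (sign-extended low half)
//   sub1 -> 0x0000000012345678
//   lo16 -> 1, and hi16 -> 0xffffffffffff8000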
3628
3629static unsigned getNewFMAAKInst(const GCNSubtarget &ST, unsigned Opc) {
3630 switch (Opc) {
3631 case AMDGPU::V_MAC_F16_e32:
3632 case AMDGPU::V_MAC_F16_e64:
3633 case AMDGPU::V_MAD_F16_e64:
3634 return AMDGPU::V_MADAK_F16;
3635 case AMDGPU::V_MAC_F32_e32:
3636 case AMDGPU::V_MAC_F32_e64:
3637 case AMDGPU::V_MAD_F32_e64:
3638 return AMDGPU::V_MADAK_F32;
3639 case AMDGPU::V_FMAC_F32_e32:
3640 case AMDGPU::V_FMAC_F32_e64:
3641 case AMDGPU::V_FMA_F32_e64:
3642 return AMDGPU::V_FMAAK_F32;
3643 case AMDGPU::V_FMAC_F16_e32:
3644 case AMDGPU::V_FMAC_F16_e64:
3645 case AMDGPU::V_FMAC_F16_t16_e64:
3646 case AMDGPU::V_FMAC_F16_fake16_e64:
3647 case AMDGPU::V_FMAC_F16_t16_e32:
3648 case AMDGPU::V_FMAC_F16_fake16_e32:
3649 case AMDGPU::V_FMA_F16_e64:
3650 return ST.hasTrue16BitInsts() ? ST.useRealTrue16Insts()
3651 ? AMDGPU::V_FMAAK_F16_t16
3652 : AMDGPU::V_FMAAK_F16_fake16
3653 : AMDGPU::V_FMAAK_F16;
3654 case AMDGPU::V_FMAC_F64_e32:
3655 case AMDGPU::V_FMAC_F64_e64:
3656 case AMDGPU::V_FMA_F64_e64:
3657 return AMDGPU::V_FMAAK_F64;
3658 default:
3659 llvm_unreachable("invalid instruction");
3660 }
3661}
3662
3663static unsigned getNewFMAMKInst(const GCNSubtarget &ST, unsigned Opc) {
3664 switch (Opc) {
3665 case AMDGPU::V_MAC_F16_e32:
3666 case AMDGPU::V_MAC_F16_e64:
3667 case AMDGPU::V_MAD_F16_e64:
3668 return AMDGPU::V_MADMK_F16;
3669 case AMDGPU::V_MAC_F32_e32:
3670 case AMDGPU::V_MAC_F32_e64:
3671 case AMDGPU::V_MAD_F32_e64:
3672 return AMDGPU::V_MADMK_F32;
3673 case AMDGPU::V_FMAC_F32_e32:
3674 case AMDGPU::V_FMAC_F32_e64:
3675 case AMDGPU::V_FMA_F32_e64:
3676 return AMDGPU::V_FMAMK_F32;
3677 case AMDGPU::V_FMAC_F16_e32:
3678 case AMDGPU::V_FMAC_F16_e64:
3679 case AMDGPU::V_FMAC_F16_t16_e64:
3680 case AMDGPU::V_FMAC_F16_fake16_e64:
3681 case AMDGPU::V_FMAC_F16_t16_e32:
3682 case AMDGPU::V_FMAC_F16_fake16_e32:
3683 case AMDGPU::V_FMA_F16_e64:
3684 return ST.hasTrue16BitInsts() ? ST.useRealTrue16Insts()
3685 ? AMDGPU::V_FMAMK_F16_t16
3686 : AMDGPU::V_FMAMK_F16_fake16
3687 : AMDGPU::V_FMAMK_F16;
3688 case AMDGPU::V_FMAC_F64_e32:
3689 case AMDGPU::V_FMAC_F64_e64:
3690 case AMDGPU::V_FMA_F64_e64:
3691 return AMDGPU::V_FMAMK_F64;
3692 default:
3693 llvm_unreachable("invalid instruction");
3694 }
3695}
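
// The two mappings above pick between the fused-literal forms, roughly:
//   v_fmaak_f32 vdst, src0, src1, K   ; vdst = src0 * src1 + K
//   v_fmamk_f32 vdst, src0, K, src1   ; vdst = src0 * K + src1
// depending on whether the constant is the added or a multiplied operand.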
3696
3697bool SIInstrInfo::foldImmediate(MachineInstr &UseMI, MachineInstr &DefMI,
3698 Register Reg, MachineRegisterInfo *MRI) const {
3699 int64_t Imm;
3700 if (!getConstValDefinedInReg(MI: DefMI, Reg, ImmVal&: Imm))
3701 return false;
3702
3703 const bool HasMultipleUses = !MRI->hasOneNonDBGUse(RegNo: Reg);
3704
3705 assert(!DefMI.getOperand(0).getSubReg() && "Expected SSA form");
3706
3707 unsigned Opc = UseMI.getOpcode();
3708 if (Opc == AMDGPU::COPY) {
3709 assert(!UseMI.getOperand(0).getSubReg() && "Expected SSA form");
3710
3711 Register DstReg = UseMI.getOperand(i: 0).getReg();
3712 Register UseSubReg = UseMI.getOperand(i: 1).getSubReg();
3713
3714 const TargetRegisterClass *DstRC = RI.getRegClassForReg(MRI: *MRI, Reg: DstReg);
3715
3716 if (HasMultipleUses) {
3717      // TODO: This should fold in more cases with multiple uses, but we need
3718      // to consider those uses more carefully.
3719 unsigned ImmDefSize = RI.getRegSizeInBits(RC: *MRI->getRegClass(Reg));
3720
3721 // Avoid breaking up a 64-bit inline immediate into a subregister extract.
3722 if (UseSubReg != AMDGPU::NoSubRegister && ImmDefSize == 64)
3723 return false;
3724
3725 // Most of the time folding a 32-bit inline constant is free (though this
3726 // might not be true if we can't later fold it into a real user).
3727 //
3728 // FIXME: This isInlineConstant check is imprecise if
3729 // getConstValDefinedInReg handled the tricky non-mov cases.
3730 if (ImmDefSize == 32 &&
3731 !isInlineConstant(ImmVal: Imm, OperandType: AMDGPU::OPERAND_REG_IMM_INT32))
3732 return false;
3733 }
3734
3735 bool Is16Bit = UseSubReg != AMDGPU::NoSubRegister &&
3736 RI.getSubRegIdxSize(Idx: UseSubReg) == 16;
3737
3738 if (Is16Bit) {
3739 if (RI.hasVGPRs(RC: DstRC))
3740 return false; // Do not clobber vgpr_hi16
3741
3742 if (DstReg.isVirtual() && UseSubReg != AMDGPU::lo16)
3743 return false;
3744 }
3745
3746 MachineFunction *MF = UseMI.getMF();
3747
3748 unsigned NewOpc = AMDGPU::INSTRUCTION_LIST_END;
3749 MCRegister MovDstPhysReg =
3750 DstReg.isPhysical() ? DstReg.asMCReg() : MCRegister();
3751
3752 std::optional<int64_t> SubRegImm = extractSubregFromImm(Imm, SubRegIndex: UseSubReg);
3753
3754 // TODO: Try to fold with AMDGPU::V_MOV_B16_t16_e64
3755 for (unsigned MovOp :
3756 {AMDGPU::S_MOV_B32, AMDGPU::V_MOV_B32_e32, AMDGPU::S_MOV_B64,
3757 AMDGPU::V_MOV_B64_PSEUDO, AMDGPU::V_ACCVGPR_WRITE_B32_e64}) {
3758 const MCInstrDesc &MovDesc = get(Opcode: MovOp);
3759
3760 const TargetRegisterClass *MovDstRC = getRegClass(MCID: MovDesc, OpNum: 0);
3761 if (Is16Bit) {
3762 // We just need to find a correctly sized register class, so the
3763 // subregister index compatibility doesn't matter since we're statically
3764 // extracting the immediate value.
3765 MovDstRC = RI.getMatchingSuperRegClass(A: MovDstRC, B: DstRC, Idx: AMDGPU::lo16);
3766 if (!MovDstRC)
3767 continue;
3768
3769 if (MovDstPhysReg) {
3770 // FIXME: We probably should not do this. If there is a live value in
3771 // the high half of the register, it will be corrupted.
3772 MovDstPhysReg =
3773 RI.getMatchingSuperReg(Reg: MovDstPhysReg, SubIdx: AMDGPU::lo16, RC: MovDstRC);
3774 if (!MovDstPhysReg)
3775 continue;
3776 }
3777 }
3778
3779      // Result class isn't the right size; try the next instruction.
3780 if (MovDstPhysReg) {
3781 if (!MovDstRC->contains(Reg: MovDstPhysReg))
3782 return false;
3783 } else if (!MRI->constrainRegClass(Reg: DstReg, RC: MovDstRC)) {
3784 // TODO: This will be overly conservative in the case of 16-bit virtual
3785 // SGPRs. We could hack up the virtual register uses to use a compatible
3786 // 32-bit class.
3787 continue;
3788 }
3789
3790 const MCOperandInfo &OpInfo = MovDesc.operands()[1];
3791
3792 // Ensure the interpreted immediate value is a valid operand in the new
3793 // mov.
3794 //
3795 // FIXME: isImmOperandLegal should have form that doesn't require existing
3796 // MachineInstr or MachineOperand
3797 if (!RI.opCanUseLiteralConstant(OpType: OpInfo.OperandType) &&
3798 !isInlineConstant(ImmVal: *SubRegImm, OperandType: OpInfo.OperandType))
3799 break;
3800
3801 NewOpc = MovOp;
3802 break;
3803 }
3804
3805 if (NewOpc == AMDGPU::INSTRUCTION_LIST_END)
3806 return false;
3807
3808 if (Is16Bit) {
3809 UseMI.getOperand(i: 0).setSubReg(AMDGPU::NoSubRegister);
3810 if (MovDstPhysReg)
3811 UseMI.getOperand(i: 0).setReg(MovDstPhysReg);
3812 assert(UseMI.getOperand(1).getReg().isVirtual());
3813 }
3814
3815 const MCInstrDesc &NewMCID = get(Opcode: NewOpc);
3816 UseMI.setDesc(NewMCID);
3817 UseMI.getOperand(i: 1).ChangeToImmediate(ImmVal: *SubRegImm);
3818 UseMI.addImplicitDefUseOperands(MF&: *MF);
3819 return true;
3820 }
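
  // For illustration (register names hypothetical), the COPY path above turns
  //   %imm:sgpr_32 = S_MOV_B32 42
  //   %use:vgpr_32 = COPY %imm
  // into
  //   %use:vgpr_32 = V_MOV_B32_e32 42
  // choosing whichever mov opcode suits the destination register class.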
3821
3822 if (HasMultipleUses)
3823 return false;
3824
3825 if (Opc == AMDGPU::V_MAD_F32_e64 || Opc == AMDGPU::V_MAC_F32_e64 ||
3826 Opc == AMDGPU::V_MAD_F16_e64 || Opc == AMDGPU::V_MAC_F16_e64 ||
3827 Opc == AMDGPU::V_FMA_F32_e64 || Opc == AMDGPU::V_FMAC_F32_e64 ||
3828 Opc == AMDGPU::V_FMA_F16_e64 || Opc == AMDGPU::V_FMAC_F16_e64 ||
3829 Opc == AMDGPU::V_FMAC_F16_t16_e64 ||
3830 Opc == AMDGPU::V_FMAC_F16_fake16_e64 || Opc == AMDGPU::V_FMA_F64_e64 ||
3831 Opc == AMDGPU::V_FMAC_F64_e64) {
3832 // Don't fold if we are using source or output modifiers. The new VOP2
3833 // instructions don't have them.
3834 if (hasAnyModifiersSet(MI: UseMI))
3835 return false;
3836
3837 // If this is a free constant, there's no reason to do this.
3838 // TODO: We could fold this here instead of letting SIFoldOperands do it
3839 // later.
3840 int Src0Idx = getNamedOperandIdx(Opcode: UseMI.getOpcode(), Name: AMDGPU::OpName::src0);
3841
3842 // Any src operand can be used for the legality check.
3843 if (isInlineConstant(MI: UseMI, OpIdx: Src0Idx, ImmVal: Imm))
3844 return false;
3845
3846 MachineOperand *Src0 = &UseMI.getOperand(i: Src0Idx);
3847
3848 MachineOperand *Src1 = getNamedOperand(MI&: UseMI, OperandName: AMDGPU::OpName::src1);
3849 MachineOperand *Src2 = getNamedOperand(MI&: UseMI, OperandName: AMDGPU::OpName::src2);
3850
3851 auto CopyRegOperandToNarrowerRC =
3852 [MRI, this](MachineInstr &MI, unsigned OpNo,
3853 const TargetRegisterClass *NewRC) -> void {
3854 if (!MI.getOperand(i: OpNo).isReg())
3855 return;
3856 Register Reg = MI.getOperand(i: OpNo).getReg();
3857 const TargetRegisterClass *RC = RI.getRegClassForReg(MRI: *MRI, Reg);
3858 if (RI.getCommonSubClass(A: RC, B: NewRC) != NewRC)
3859 return;
3860 Register Tmp = MRI->createVirtualRegister(RegClass: NewRC);
3861 BuildMI(BB&: *MI.getParent(), I: MI.getIterator(), MIMD: MI.getDebugLoc(),
3862 MCID: get(Opcode: AMDGPU::COPY), DestReg: Tmp)
3863 .addReg(RegNo: Reg);
3864 MI.getOperand(i: OpNo).setReg(Tmp);
3865 MI.getOperand(i: OpNo).setIsKill();
3866 };
3867
3868    // The multiplied part is the constant: use v_madmk_{f16, f32}.
3869 if ((Src0->isReg() && Src0->getReg() == Reg) ||
3870 (Src1->isReg() && Src1->getReg() == Reg)) {
3871 MachineOperand *RegSrc =
3872 Src1->isReg() && Src1->getReg() == Reg ? Src0 : Src1;
3873 if (!RegSrc->isReg())
3874 return false;
3875 if (RI.isSGPRClass(RC: MRI->getRegClass(Reg: RegSrc->getReg())) &&
3876 ST.getConstantBusLimit(Opcode: Opc) < 2)
3877 return false;
3878
3879 if (!Src2->isReg() || RI.isSGPRClass(RC: MRI->getRegClass(Reg: Src2->getReg())))
3880 return false;
3881
3882 // If src2 is also a literal constant then we have to choose which one to
3883 // fold. In general it is better to choose madak so that the other literal
3884 // can be materialized in an sgpr instead of a vgpr:
3885 // s_mov_b32 s0, literal
3886 // v_madak_f32 v0, s0, v0, literal
3887 // Instead of:
3888 // v_mov_b32 v1, literal
3889 // v_madmk_f32 v0, v0, literal, v1
3890 MachineInstr *Def = MRI->getUniqueVRegDef(Reg: Src2->getReg());
3891 if (Def && Def->isMoveImmediate() &&
3892 !isInlineConstant(MO: Def->getOperand(i: 1)))
3893 return false;
3894
3895 unsigned NewOpc = getNewFMAMKInst(ST, Opc);
3896 if (pseudoToMCOpcode(Opcode: NewOpc) == -1)
3897 return false;
3898
3899 const std::optional<int64_t> SubRegImm = extractSubregFromImm(
3900 Imm, SubRegIndex: RegSrc == Src1 ? Src0->getSubReg() : Src1->getSubReg());
3901
3902 // FIXME: This would be a lot easier if we could return a new instruction
3903 // instead of having to modify in place.
3904
3905 Register SrcReg = RegSrc->getReg();
3906 unsigned SrcSubReg = RegSrc->getSubReg();
3907 Src0->setReg(SrcReg);
3908 Src0->setSubReg(SrcSubReg);
3909 Src0->setIsKill(RegSrc->isKill());
3910
3911 if (Opc == AMDGPU::V_MAC_F32_e64 || Opc == AMDGPU::V_MAC_F16_e64 ||
3912 Opc == AMDGPU::V_FMAC_F32_e64 || Opc == AMDGPU::V_FMAC_F16_t16_e64 ||
3913 Opc == AMDGPU::V_FMAC_F16_fake16_e64 ||
3914 Opc == AMDGPU::V_FMAC_F16_e64 || Opc == AMDGPU::V_FMAC_F64_e64)
3915 UseMI.untieRegOperand(
3916 OpIdx: AMDGPU::getNamedOperandIdx(Opcode: Opc, Name: AMDGPU::OpName::src2));
3917
3918 Src1->ChangeToImmediate(ImmVal: *SubRegImm);
3919
3920 removeModOperands(MI&: UseMI);
3921 UseMI.setDesc(get(Opcode: NewOpc));
3922
3923 if (NewOpc == AMDGPU::V_FMAMK_F16_t16 ||
3924 NewOpc == AMDGPU::V_FMAMK_F16_fake16) {
3925 const TargetRegisterClass *NewRC = getRegClass(MCID: get(Opcode: NewOpc), OpNum: 0);
3926 Register Tmp = MRI->createVirtualRegister(RegClass: NewRC);
3927 BuildMI(BB&: *UseMI.getParent(), I: std::next(x: UseMI.getIterator()),
3928 MIMD: UseMI.getDebugLoc(), MCID: get(Opcode: AMDGPU::COPY),
3929 DestReg: UseMI.getOperand(i: 0).getReg())
3930 .addReg(RegNo: Tmp, Flags: RegState::Kill);
3931 UseMI.getOperand(i: 0).setReg(Tmp);
3932 CopyRegOperandToNarrowerRC(UseMI, 1, NewRC);
3933 CopyRegOperandToNarrowerRC(UseMI, 3, NewRC);
3934 }
3935
3936 bool DeleteDef = MRI->use_nodbg_empty(RegNo: Reg);
3937 if (DeleteDef)
3938 DefMI.eraseFromParent();
3939
3940 return true;
3941 }
3942
3943    // The added part is the constant: use v_madak_{f16, f32}.
3944 if (Src2->isReg() && Src2->getReg() == Reg) {
3945 if (ST.getConstantBusLimit(Opcode: Opc) < 2) {
3946 // Not allowed to use constant bus for another operand.
3947 // We can however allow an inline immediate as src0.
3948 bool Src0Inlined = false;
3949 if (Src0->isReg()) {
3950          // Try to inline the constant if possible.
3951          // If the def is a move-immediate and this is its only use, we save
3952          // a VGPR here.
3953 MachineInstr *Def = MRI->getUniqueVRegDef(Reg: Src0->getReg());
3954 if (Def && Def->isMoveImmediate() &&
3955 isInlineConstant(MO: Def->getOperand(i: 1)) &&
3956 MRI->hasOneNonDBGUse(RegNo: Src0->getReg())) {
3957 Src0->ChangeToImmediate(ImmVal: Def->getOperand(i: 1).getImm());
3958 Src0Inlined = true;
3959 } else if (ST.getConstantBusLimit(Opcode: Opc) <= 1 &&
3960 RI.isSGPRReg(MRI: *MRI, Reg: Src0->getReg())) {
3961 return false;
3962 }
3963 // VGPR is okay as Src0 - fallthrough
3964 }
3965
3966 if (Src1->isReg() && !Src0Inlined) {
3967          // We have one slot for an inlinable constant so far - try to fill it.
3968 MachineInstr *Def = MRI->getUniqueVRegDef(Reg: Src1->getReg());
3969 if (Def && Def->isMoveImmediate() &&
3970 isInlineConstant(MO: Def->getOperand(i: 1)) &&
3971 MRI->hasOneNonDBGUse(RegNo: Src1->getReg()) && commuteInstruction(MI&: UseMI))
3972 Src0->ChangeToImmediate(ImmVal: Def->getOperand(i: 1).getImm());
3973 else if (RI.isSGPRReg(MRI: *MRI, Reg: Src1->getReg()))
3974 return false;
3975 // VGPR is okay as Src1 - fallthrough
3976 }
3977 }
3978
3979 unsigned NewOpc = getNewFMAAKInst(ST, Opc);
3980 if (pseudoToMCOpcode(Opcode: NewOpc) == -1)
3981 return false;
3982
3983 // FIXME: This would be a lot easier if we could return a new instruction
3984 // instead of having to modify in place.
3985
3986 if (Opc == AMDGPU::V_MAC_F32_e64 || Opc == AMDGPU::V_MAC_F16_e64 ||
3987 Opc == AMDGPU::V_FMAC_F32_e64 || Opc == AMDGPU::V_FMAC_F16_t16_e64 ||
3988 Opc == AMDGPU::V_FMAC_F16_fake16_e64 ||
3989 Opc == AMDGPU::V_FMAC_F16_e64 || Opc == AMDGPU::V_FMAC_F64_e64)
3990 UseMI.untieRegOperand(
3991 OpIdx: AMDGPU::getNamedOperandIdx(Opcode: Opc, Name: AMDGPU::OpName::src2));
3992
3993 const std::optional<int64_t> SubRegImm =
3994 extractSubregFromImm(Imm, SubRegIndex: Src2->getSubReg());
3995
3996      // ChangeToImmediate adds Src2 back to the instruction.
3997 Src2->ChangeToImmediate(ImmVal: *SubRegImm);
3998
3999 // These come before src2.
4000 removeModOperands(MI&: UseMI);
4001 UseMI.setDesc(get(Opcode: NewOpc));
4002
4003 if (NewOpc == AMDGPU::V_FMAAK_F16_t16 ||
4004 NewOpc == AMDGPU::V_FMAAK_F16_fake16) {
4005 const TargetRegisterClass *NewRC = getRegClass(MCID: get(Opcode: NewOpc), OpNum: 0);
4006 Register Tmp = MRI->createVirtualRegister(RegClass: NewRC);
4007 BuildMI(BB&: *UseMI.getParent(), I: std::next(x: UseMI.getIterator()),
4008 MIMD: UseMI.getDebugLoc(), MCID: get(Opcode: AMDGPU::COPY),
4009 DestReg: UseMI.getOperand(i: 0).getReg())
4010 .addReg(RegNo: Tmp, Flags: RegState::Kill);
4011 UseMI.getOperand(i: 0).setReg(Tmp);
4012 CopyRegOperandToNarrowerRC(UseMI, 1, NewRC);
4013 CopyRegOperandToNarrowerRC(UseMI, 2, NewRC);
4014 }
4015
4016      // UseMI may have been commuted, in which case an SGPR is now src1. An
4017      // SGPR plus an inline constant would violate the constant bus limit, so
4018      // legalize the operands if needed.
4019 legalizeOperands(MI&: UseMI);
4020
4021 bool DeleteDef = MRI->use_nodbg_empty(RegNo: Reg);
4022 if (DeleteDef)
4023 DefMI.eraseFromParent();
4024
4025 return true;
4026 }
4027 }
4028
4029 return false;
4030}
4031
4032static bool
4033memOpsHaveSameBaseOperands(ArrayRef<const MachineOperand *> BaseOps1,
4034 ArrayRef<const MachineOperand *> BaseOps2) {
4035 if (BaseOps1.size() != BaseOps2.size())
4036 return false;
4037 for (size_t I = 0, E = BaseOps1.size(); I < E; ++I) {
4038 if (!BaseOps1[I]->isIdenticalTo(Other: *BaseOps2[I]))
4039 return false;
4040 }
4041 return true;
4042}
4043
4044static bool offsetsDoNotOverlap(LocationSize WidthA, int OffsetA,
4045 LocationSize WidthB, int OffsetB) {
4046 int LowOffset = OffsetA < OffsetB ? OffsetA : OffsetB;
4047 int HighOffset = OffsetA < OffsetB ? OffsetB : OffsetA;
4048 LocationSize LowWidth = (LowOffset == OffsetA) ? WidthA : WidthB;
4049 return LowWidth.hasValue() &&
4050 LowOffset + (int)LowWidth.getValue() <= HighOffset;
4051}
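
// e.g. WidthA = 4 at OffsetA = 0 and WidthB = 8 at OffsetB = 4 do not overlap
// (0 + 4 <= 4), whereas OffsetB = 2 would (0 + 4 > 2).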
4052
4053bool SIInstrInfo::checkInstOffsetsDoNotOverlap(const MachineInstr &MIa,
4054 const MachineInstr &MIb) const {
4055 SmallVector<const MachineOperand *, 4> BaseOps0, BaseOps1;
4056 int64_t Offset0, Offset1;
4057 LocationSize Dummy0 = LocationSize::precise(Value: 0);
4058 LocationSize Dummy1 = LocationSize::precise(Value: 0);
4059 bool Offset0IsScalable, Offset1IsScalable;
4060 if (!getMemOperandsWithOffsetWidth(LdSt: MIa, BaseOps&: BaseOps0, Offset&: Offset0, OffsetIsScalable&: Offset0IsScalable,
4061 Width&: Dummy0, TRI: &RI) ||
4062 !getMemOperandsWithOffsetWidth(LdSt: MIb, BaseOps&: BaseOps1, Offset&: Offset1, OffsetIsScalable&: Offset1IsScalable,
4063 Width&: Dummy1, TRI: &RI))
4064 return false;
4065
4066 if (!memOpsHaveSameBaseOperands(BaseOps1: BaseOps0, BaseOps2: BaseOps1))
4067 return false;
4068
4069 if (!MIa.hasOneMemOperand() || !MIb.hasOneMemOperand()) {
4070 // FIXME: Handle ds_read2 / ds_write2.
4071 return false;
4072 }
4073 LocationSize Width0 = MIa.memoperands().front()->getSize();
4074 LocationSize Width1 = MIb.memoperands().front()->getSize();
4075 return offsetsDoNotOverlap(WidthA: Width0, OffsetA: Offset0, WidthB: Width1, OffsetB: Offset1);
4076}
4077
4078bool SIInstrInfo::areMemAccessesTriviallyDisjoint(const MachineInstr &MIa,
4079 const MachineInstr &MIb) const {
4080 assert(MIa.mayLoadOrStore() &&
4081 "MIa must load from or modify a memory location");
4082 assert(MIb.mayLoadOrStore() &&
4083 "MIb must load from or modify a memory location");
4084
4085 if (MIa.hasUnmodeledSideEffects() || MIb.hasUnmodeledSideEffects())
4086 return false;
4087
4088 // XXX - Can we relax this between address spaces?
4089 if (MIa.hasOrderedMemoryRef() || MIb.hasOrderedMemoryRef())
4090 return false;
4091
4092 if (isLDSDMA(MI: MIa) || isLDSDMA(MI: MIb))
4093 return false;
4094
4095 if (MIa.isBundle() || MIb.isBundle())
4096 return false;
4097
4098 // TODO: Should we check the address space from the MachineMemOperand? That
4099 // would allow us to distinguish objects we know don't alias based on the
4100 // underlying address space, even if it was lowered to a different one,
4101 // e.g. private accesses lowered to use MUBUF instructions on a scratch
4102 // buffer.
4103 if (isDS(MI: MIa)) {
4104 if (isDS(MI: MIb))
4105 return checkInstOffsetsDoNotOverlap(MIa, MIb);
4106
4107 return !isFLAT(MI: MIb) || isSegmentSpecificFLAT(MI: MIb);
4108 }
4109
4110 if (isMUBUF(MI: MIa) || isMTBUF(MI: MIa)) {
4111 if (isMUBUF(MI: MIb) || isMTBUF(MI: MIb))
4112 return checkInstOffsetsDoNotOverlap(MIa, MIb);
4113
4114 if (isFLAT(MI: MIb))
4115 return isFLATScratch(MI: MIb);
4116
4117 return !isSMRD(MI: MIb);
4118 }
4119
4120 if (isSMRD(MI: MIa)) {
4121 if (isSMRD(MI: MIb))
4122 return checkInstOffsetsDoNotOverlap(MIa, MIb);
4123
4124 if (isFLAT(MI: MIb))
4125 return isFLATScratch(MI: MIb);
4126
4127 return !isMUBUF(MI: MIb) && !isMTBUF(MI: MIb);
4128 }
4129
4130 if (isFLAT(MI: MIa)) {
4131 if (isFLAT(MI: MIb)) {
4132 if ((isFLATScratch(MI: MIa) && isFLATGlobal(MI: MIb)) ||
4133 (isFLATGlobal(MI: MIa) && isFLATScratch(MI: MIb)))
4134 return true;
4135
4136 return checkInstOffsetsDoNotOverlap(MIa, MIb);
4137 }
4138
4139 return false;
4140 }
4141
4142 return false;
4143}
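
// For example, under the logic above a DS (LDS) access is trivially disjoint
// from a global_* or scratch_* access, but not from a plain flat access, since
// a flat address may point into LDS.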
4144
4145static bool getFoldableImm(Register Reg, const MachineRegisterInfo &MRI,
4146 int64_t &Imm, MachineInstr **DefMI = nullptr) {
4147 if (Reg.isPhysical())
4148 return false;
4149 auto *Def = MRI.getUniqueVRegDef(Reg);
4150 if (Def && SIInstrInfo::isFoldableCopy(MI: *Def) && Def->getOperand(i: 1).isImm()) {
4151 Imm = Def->getOperand(i: 1).getImm();
4152 if (DefMI)
4153 *DefMI = Def;
4154 return true;
4155 }
4156 return false;
4157}
4158
4159static bool getFoldableImm(const MachineOperand *MO, int64_t &Imm,
4160 MachineInstr **DefMI = nullptr) {
4161 if (!MO->isReg())
4162 return false;
4163 const MachineFunction *MF = MO->getParent()->getMF();
4164 const MachineRegisterInfo &MRI = MF->getRegInfo();
4165 return getFoldableImm(Reg: MO->getReg(), MRI, Imm, DefMI);
4166}
4167
4168static void updateLiveVariables(LiveVariables *LV, MachineInstr &MI,
4169 MachineInstr &NewMI) {
4170 if (LV) {
4171 unsigned NumOps = MI.getNumOperands();
4172 for (unsigned I = 1; I < NumOps; ++I) {
4173 MachineOperand &Op = MI.getOperand(i: I);
4174 if (Op.isReg() && Op.isKill())
4175 LV->replaceKillInstruction(Reg: Op.getReg(), OldMI&: MI, NewMI);
4176 }
4177 }
4178}
4179
4180static unsigned getNewFMAInst(const GCNSubtarget &ST, unsigned Opc) {
4181 switch (Opc) {
4182 case AMDGPU::V_MAC_F16_e32:
4183 case AMDGPU::V_MAC_F16_e64:
4184 return AMDGPU::V_MAD_F16_e64;
4185 case AMDGPU::V_MAC_F32_e32:
4186 case AMDGPU::V_MAC_F32_e64:
4187 return AMDGPU::V_MAD_F32_e64;
4188 case AMDGPU::V_MAC_LEGACY_F32_e32:
4189 case AMDGPU::V_MAC_LEGACY_F32_e64:
4190 return AMDGPU::V_MAD_LEGACY_F32_e64;
4191 case AMDGPU::V_FMAC_LEGACY_F32_e32:
4192 case AMDGPU::V_FMAC_LEGACY_F32_e64:
4193 return AMDGPU::V_FMA_LEGACY_F32_e64;
4194 case AMDGPU::V_FMAC_F16_e32:
4195 case AMDGPU::V_FMAC_F16_e64:
4196 case AMDGPU::V_FMAC_F16_t16_e64:
4197 case AMDGPU::V_FMAC_F16_fake16_e64:
4198 return ST.hasTrue16BitInsts() ? ST.useRealTrue16Insts()
4199 ? AMDGPU::V_FMA_F16_gfx9_t16_e64
4200 : AMDGPU::V_FMA_F16_gfx9_fake16_e64
4201 : AMDGPU::V_FMA_F16_gfx9_e64;
4202 case AMDGPU::V_FMAC_F32_e32:
4203 case AMDGPU::V_FMAC_F32_e64:
4204 return AMDGPU::V_FMA_F32_e64;
4205 case AMDGPU::V_FMAC_F64_e32:
4206 case AMDGPU::V_FMAC_F64_e64:
4207 return AMDGPU::V_FMA_F64_e64;
4208 default:
4209 llvm_unreachable("invalid instruction");
4210 }
4211}
4212
4213/// Helper struct for the implementation of 3-address conversion to communicate
4214/// updates made to instruction operands.
4215struct SIInstrInfo::ThreeAddressUpdates {
4216 /// Other instruction whose def is no longer used by the converted
4217 /// instruction.
4218 MachineInstr *RemoveMIUse = nullptr;
4219};
4220
4221MachineInstr *SIInstrInfo::convertToThreeAddress(MachineInstr &MI,
4222 LiveVariables *LV,
4223 LiveIntervals *LIS) const {
4224 MachineBasicBlock &MBB = *MI.getParent();
4225 MachineInstr *CandidateMI = &MI;
4226
4227 if (MI.isBundle()) {
4228 // This is a temporary placeholder for bundle handling that enables us to
4229 // exercise the relevant code paths in the two-address instruction pass.
4230 if (MI.getBundleSize() != 1)
4231 return nullptr;
4232 CandidateMI = MI.getNextNode();
4233 }
4234
4235 ThreeAddressUpdates U;
4236 MachineInstr *NewMI = convertToThreeAddressImpl(MI&: *CandidateMI, Updates&: U);
4237 if (!NewMI)
4238 return nullptr;
4239
4240 if (MI.isBundle()) {
4241 CandidateMI->eraseFromBundle();
4242
4243 for (MachineOperand &MO : MI.all_defs()) {
4244 if (MO.isTied())
4245 MI.untieRegOperand(OpIdx: MO.getOperandNo());
4246 }
4247 } else {
4248 updateLiveVariables(LV, MI, NewMI&: *NewMI);
4249 if (LIS) {
4250 LIS->ReplaceMachineInstrInMaps(MI, NewMI&: *NewMI);
4251      // The SlotIndex of the def needs updating when converting to early-clobber.
4252 MachineOperand &Def = NewMI->getOperand(i: 0);
4253 if (Def.isEarlyClobber() && Def.isReg() &&
4254 LIS->hasInterval(Reg: Def.getReg())) {
4255 SlotIndex OldIndex = LIS->getInstructionIndex(Instr: *NewMI).getRegSlot(EC: false);
4256 SlotIndex NewIndex = LIS->getInstructionIndex(Instr: *NewMI).getRegSlot(EC: true);
4257 auto &LI = LIS->getInterval(Reg: Def.getReg());
4258 auto UpdateDefIndex = [&](LiveRange &LR) {
4259 auto *S = LR.find(Pos: OldIndex);
4260 if (S != LR.end() && S->start == OldIndex) {
4261 assert(S->valno && S->valno->def == OldIndex);
4262 S->start = NewIndex;
4263 S->valno->def = NewIndex;
4264 }
4265 };
4266 UpdateDefIndex(LI);
4267 for (auto &SR : LI.subranges())
4268 UpdateDefIndex(SR);
4269 }
4270 }
4271 }
4272
4273 if (U.RemoveMIUse) {
4274 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
4275 // The only user is the instruction which will be killed.
4276 Register DefReg = U.RemoveMIUse->getOperand(i: 0).getReg();
4277
4278 if (MRI.hasOneNonDBGUse(RegNo: DefReg)) {
4279      // We cannot just remove DefMI here; the calling pass would crash.
4280 U.RemoveMIUse->setDesc(get(Opcode: AMDGPU::IMPLICIT_DEF));
4281 U.RemoveMIUse->getOperand(i: 0).setIsDead(true);
4282 for (unsigned I = U.RemoveMIUse->getNumOperands() - 1; I != 0; --I)
4283 U.RemoveMIUse->removeOperand(OpNo: I);
4284 if (LV)
4285 LV->getVarInfo(Reg: DefReg).AliveBlocks.clear();
4286 }
4287
4288 if (MI.isBundle()) {
4289 VirtRegInfo VRI = AnalyzeVirtRegInBundle(MI, Reg: DefReg);
4290 if (!VRI.Reads && !VRI.Writes) {
4291 for (MachineOperand &MO : MI.all_uses()) {
4292 if (MO.isReg() && MO.getReg() == DefReg) {
4293 assert(MO.getSubReg() == 0 &&
4294 "tied sub-registers in bundles currently not supported");
4295 MI.removeOperand(OpNo: MO.getOperandNo());
4296 break;
4297 }
4298 }
4299
4300 if (LIS)
4301 LIS->shrinkToUses(li: &LIS->getInterval(Reg: DefReg));
4302 }
4303 } else if (LIS) {
4304 LiveInterval &DefLI = LIS->getInterval(Reg: DefReg);
4305
4306 // We cannot delete the original instruction here, so hack out the use
4307 // in the original instruction with a dummy register so we can use
4308 // shrinkToUses to deal with any multi-use edge cases. Other targets do
4309 // not have the complexity of deleting a use to consider here.
4310 Register DummyReg = MRI.cloneVirtualRegister(VReg: DefReg);
4311 for (MachineOperand &MIOp : MI.uses()) {
4312 if (MIOp.isReg() && MIOp.getReg() == DefReg) {
4313 MIOp.setIsUndef(true);
4314 MIOp.setReg(DummyReg);
4315 }
4316 }
4317
4318 if (MI.isBundle()) {
4319 VirtRegInfo VRI = AnalyzeVirtRegInBundle(MI, Reg: DefReg);
4320 if (!VRI.Reads && !VRI.Writes) {
4321 for (MachineOperand &MIOp : MI.uses()) {
4322 if (MIOp.isReg() && MIOp.getReg() == DefReg) {
4323 MIOp.setIsUndef(true);
4324 MIOp.setReg(DummyReg);
4325 }
4326 }
4327 }
4328
4329 MI.addOperand(Op: MachineOperand::CreateReg(Reg: DummyReg, isDef: false, isImp: false, isKill: false,
4330 isDead: false, /*isUndef=*/true));
4331 }
4332
4333 LIS->shrinkToUses(li: &DefLI);
4334 }
4335 }
4336
4337 return MI.isBundle() ? &MI : NewMI;
4338}
4339
4340MachineInstr *
4341SIInstrInfo::convertToThreeAddressImpl(MachineInstr &MI,
4342 ThreeAddressUpdates &U) const {
4343 MachineBasicBlock &MBB = *MI.getParent();
4344 unsigned Opc = MI.getOpcode();
4345
4346 // Handle MFMA.
4347 int NewMFMAOpc = AMDGPU::getMFMAEarlyClobberOp(Opcode: Opc);
4348 if (NewMFMAOpc != -1) {
4349 MachineInstrBuilder MIB =
4350 BuildMI(BB&: MBB, I&: MI, MIMD: MI.getDebugLoc(), MCID: get(Opcode: NewMFMAOpc));
4351 for (unsigned I = 0, E = MI.getNumExplicitOperands(); I != E; ++I)
4352 MIB.add(MO: MI.getOperand(i: I));
4353 return MIB;
4354 }
4355
4356 if (SIInstrInfo::isWMMA(MI)) {
4357 unsigned NewOpc = AMDGPU::mapWMMA2AddrTo3AddrOpcode(Opc: MI.getOpcode());
4358 MachineInstrBuilder MIB = BuildMI(BB&: MBB, I&: MI, MIMD: MI.getDebugLoc(), MCID: get(Opcode: NewOpc))
4359 .setMIFlags(MI.getFlags());
4360 for (unsigned I = 0, E = MI.getNumExplicitOperands(); I != E; ++I)
4361 MIB->addOperand(Op: MI.getOperand(i: I));
4362 return MIB;
4363 }
4364
4365 assert(Opc != AMDGPU::V_FMAC_F16_t16_e32 &&
4366 Opc != AMDGPU::V_FMAC_F16_fake16_e32 &&
4367 "V_FMAC_F16_t16/fake16_e32 is not supported and not expected to be "
4368 "present pre-RA");
4369
4370 // Handle MAC/FMAC.
4371 bool IsF64 = Opc == AMDGPU::V_FMAC_F64_e32 || Opc == AMDGPU::V_FMAC_F64_e64;
4372 bool IsLegacy = Opc == AMDGPU::V_MAC_LEGACY_F32_e32 ||
4373 Opc == AMDGPU::V_MAC_LEGACY_F32_e64 ||
4374 Opc == AMDGPU::V_FMAC_LEGACY_F32_e32 ||
4375 Opc == AMDGPU::V_FMAC_LEGACY_F32_e64;
4376 bool Src0Literal = false;
4377
4378 switch (Opc) {
4379 default:
4380 return nullptr;
4381 case AMDGPU::V_MAC_F16_e64:
4382 case AMDGPU::V_FMAC_F16_e64:
4383 case AMDGPU::V_FMAC_F16_t16_e64:
4384 case AMDGPU::V_FMAC_F16_fake16_e64:
4385 case AMDGPU::V_MAC_F32_e64:
4386 case AMDGPU::V_MAC_LEGACY_F32_e64:
4387 case AMDGPU::V_FMAC_F32_e64:
4388 case AMDGPU::V_FMAC_LEGACY_F32_e64:
4389 case AMDGPU::V_FMAC_F64_e64:
4390 break;
4391 case AMDGPU::V_MAC_F16_e32:
4392 case AMDGPU::V_FMAC_F16_e32:
4393 case AMDGPU::V_MAC_F32_e32:
4394 case AMDGPU::V_MAC_LEGACY_F32_e32:
4395 case AMDGPU::V_FMAC_F32_e32:
4396 case AMDGPU::V_FMAC_LEGACY_F32_e32:
4397 case AMDGPU::V_FMAC_F64_e32: {
4398 int Src0Idx = AMDGPU::getNamedOperandIdx(Opcode: MI.getOpcode(),
4399 Name: AMDGPU::OpName::src0);
4400 const MachineOperand *Src0 = &MI.getOperand(i: Src0Idx);
4401 if (!Src0->isReg() && !Src0->isImm())
4402 return nullptr;
4403
4404 if (Src0->isImm() && !isInlineConstant(MI, OpIdx: Src0Idx, MO: *Src0))
4405 Src0Literal = true;
4406
4407 break;
4408 }
4409 }
4410
4411 MachineInstrBuilder MIB;
4412 const MachineOperand *Dst = getNamedOperand(MI, OperandName: AMDGPU::OpName::vdst);
4413 const MachineOperand *Src0 = getNamedOperand(MI, OperandName: AMDGPU::OpName::src0);
4414 const MachineOperand *Src0Mods =
4415 getNamedOperand(MI, OperandName: AMDGPU::OpName::src0_modifiers);
4416 const MachineOperand *Src1 = getNamedOperand(MI, OperandName: AMDGPU::OpName::src1);
4417 const MachineOperand *Src1Mods =
4418 getNamedOperand(MI, OperandName: AMDGPU::OpName::src1_modifiers);
4419 const MachineOperand *Src2 = getNamedOperand(MI, OperandName: AMDGPU::OpName::src2);
4420 const MachineOperand *Src2Mods =
4421 getNamedOperand(MI, OperandName: AMDGPU::OpName::src2_modifiers);
4422 const MachineOperand *Clamp = getNamedOperand(MI, OperandName: AMDGPU::OpName::clamp);
4423 const MachineOperand *Omod = getNamedOperand(MI, OperandName: AMDGPU::OpName::omod);
4424 const MachineOperand *OpSel = getNamedOperand(MI, OperandName: AMDGPU::OpName::op_sel);
4425
4426 if (!Src0Mods && !Src1Mods && !Src2Mods && !Clamp && !Omod && !IsLegacy &&
4427 (!IsF64 || ST.hasFmaakFmamkF64Insts()) &&
4428 // If we have an SGPR input, we will violate the constant bus restriction.
4429 (ST.getConstantBusLimit(Opcode: Opc) > 1 || !Src0->isReg() ||
4430 !RI.isSGPRReg(MRI: MBB.getParent()->getRegInfo(), Reg: Src0->getReg()))) {
4431 MachineInstr *DefMI;
4432
4433 int64_t Imm;
4434 if (!Src0Literal && getFoldableImm(MO: Src2, Imm, DefMI: &DefMI)) {
4435 unsigned NewOpc = getNewFMAAKInst(ST, Opc);
4436 if (pseudoToMCOpcode(Opcode: NewOpc) != -1) {
4437 MIB = BuildMI(BB&: MBB, I&: MI, MIMD: MI.getDebugLoc(), MCID: get(Opcode: NewOpc))
4438 .add(MO: *Dst)
4439 .add(MO: *Src0)
4440 .add(MO: *Src1)
4441 .addImm(Val: Imm)
4442 .setMIFlags(MI.getFlags());
4443 U.RemoveMIUse = DefMI;
4444 return MIB;
4445 }
4446 }
4447 unsigned NewOpc = getNewFMAMKInst(ST, Opc);
4448 if (!Src0Literal && getFoldableImm(MO: Src1, Imm, DefMI: &DefMI)) {
4449 if (pseudoToMCOpcode(Opcode: NewOpc) != -1) {
4450 MIB = BuildMI(BB&: MBB, I&: MI, MIMD: MI.getDebugLoc(), MCID: get(Opcode: NewOpc))
4451 .add(MO: *Dst)
4452 .add(MO: *Src0)
4453 .addImm(Val: Imm)
4454 .add(MO: *Src2)
4455 .setMIFlags(MI.getFlags());
4456 U.RemoveMIUse = DefMI;
4457 return MIB;
4458 }
4459 }
4460 if (Src0Literal || getFoldableImm(MO: Src0, Imm, DefMI: &DefMI)) {
4461 if (Src0Literal) {
4462 Imm = Src0->getImm();
4463 DefMI = nullptr;
4464 }
4465 if (pseudoToMCOpcode(Opcode: NewOpc) != -1 &&
4466 isOperandLegal(
4467 MI, OpIdx: AMDGPU::getNamedOperandIdx(Opcode: NewOpc, Name: AMDGPU::OpName::src0),
4468 MO: Src1)) {
4469 MIB = BuildMI(BB&: MBB, I&: MI, MIMD: MI.getDebugLoc(), MCID: get(Opcode: NewOpc))
4470 .add(MO: *Dst)
4471 .add(MO: *Src1)
4472 .addImm(Val: Imm)
4473 .add(MO: *Src2)
4474 .setMIFlags(MI.getFlags());
4475 U.RemoveMIUse = DefMI;
4476 return MIB;
4477 }
4478 }
4479 }
4480
4481 // VOP2 mac/fmac with a literal operand cannot be converted to VOP3 mad/fma
4482 // if VOP3 does not allow a literal operand.
4483 if (Src0Literal && !ST.hasVOP3Literal())
4484 return nullptr;
4485
4486 unsigned NewOpc = getNewFMAInst(ST, Opc);
4487
4488 if (pseudoToMCOpcode(Opcode: NewOpc) == -1)
4489 return nullptr;
4490
4491 MIB = BuildMI(BB&: MBB, I&: MI, MIMD: MI.getDebugLoc(), MCID: get(Opcode: NewOpc))
4492 .add(MO: *Dst)
4493 .addImm(Val: Src0Mods ? Src0Mods->getImm() : 0)
4494 .add(MO: *Src0)
4495 .addImm(Val: Src1Mods ? Src1Mods->getImm() : 0)
4496 .add(MO: *Src1)
4497 .addImm(Val: Src2Mods ? Src2Mods->getImm() : 0)
4498 .add(MO: *Src2)
4499 .addImm(Val: Clamp ? Clamp->getImm() : 0)
4500 .addImm(Val: Omod ? Omod->getImm() : 0)
4501 .setMIFlags(MI.getFlags());
4502 if (AMDGPU::hasNamedOperand(Opcode: NewOpc, NamedIdx: AMDGPU::OpName::op_sel))
4503 MIB.addImm(Val: OpSel ? OpSel->getImm() : 0);
4504 return MIB;
4505}
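
// A rough sketch of the MAC/FMAC folds performed above, assuming f32 operands
// and a foldable literal K (operand order illustrative only):
//
//   v_fmac_f32 v0, v1, v2   ; v0 previously defined by K
//     --> v_fmaak_f32 v0, v1, v2, K   ; v0 = v1 * v2 + K
//   v_fmac_f32 v0, v1, v2   ; v2 previously defined by K
//     --> v_fmamk_f32 v0, v1, K, v0   ; v0 = v1 * K + v0
//   otherwise, with modifiers or no foldable literal:
//     --> the full VOP3 fma/mad form.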
4506
// It's not generally safe to move VALU instructions across these, since doing
// so can change whether a VGPR operand is accessed directly or through the
// current index register.
// XXX - Why isn't hasSideEffects sufficient for these?
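//
// For example (an illustrative ISA sketch), hoisting the v_mov above the
// s_set_gpr_idx_on would change which VGPR it reads:
//
//   s_set_gpr_idx_on s0, gpr_idx(SRC0)
//   v_mov_b32_e32 v1, v0   ; reads v[0 + index], not v0
//   s_set_gpr_idx_off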
4510static bool changesVGPRIndexingMode(const MachineInstr &MI) {
4511 switch (MI.getOpcode()) {
4512 case AMDGPU::S_SET_GPR_IDX_ON:
4513 case AMDGPU::S_SET_GPR_IDX_MODE:
4514 case AMDGPU::S_SET_GPR_IDX_OFF:
4515 return true;
4516 default:
4517 return false;
4518 }
4519}
4520
4521bool SIInstrInfo::isSchedulingBoundary(const MachineInstr &MI,
4522 const MachineBasicBlock *MBB,
4523 const MachineFunction &MF) const {
  // We skip the check for SP writes that the base implementation performs. The
  // reason it was added there was apparently due to compile-time concerns.
4526 //
4527 // TODO: Do we really want this barrier? It triggers unnecessary hazard nops
4528 // but is probably avoidable.
4529
4530 // Copied from base implementation.
4531 // Terminators and labels can't be scheduled around.
4532 if (MI.isTerminator() || MI.isPosition())
4533 return true;
4534
4535 // INLINEASM_BR can jump to another block
4536 if (MI.getOpcode() == TargetOpcode::INLINEASM_BR)
4537 return true;
4538
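  // Note: a SCHED_BARRIER with mask 0 (e.g. produced from
  // llvm.amdgcn.sched.barrier(0)) prevents any instruction from being
  // scheduled across it.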
4539 if (MI.getOpcode() == AMDGPU::SCHED_BARRIER && MI.getOperand(i: 0).getImm() == 0)
4540 return true;
4541
4542 // Target-independent instructions do not have an implicit-use of EXEC, even
4543 // when they operate on VGPRs. Treating EXEC modifications as scheduling
4544 // boundaries prevents incorrect movements of such instructions.
4545 return MI.modifiesRegister(Reg: AMDGPU::EXEC, TRI: &RI) ||
4546 MI.getOpcode() == AMDGPU::S_SETREG_IMM32_B32 ||
4547 MI.getOpcode() == AMDGPU::S_SETREG_B32 ||
4548 MI.getOpcode() == AMDGPU::S_SETPRIO ||
4549 MI.getOpcode() == AMDGPU::S_SETPRIO_INC_WG ||
4550 changesVGPRIndexingMode(MI);
4551}
4552
4553bool SIInstrInfo::isAlwaysGDS(uint32_t Opcode) const {
4554 return Opcode == AMDGPU::DS_ORDERED_COUNT ||
4555 Opcode == AMDGPU::DS_ADD_GS_REG_RTN ||
4556 Opcode == AMDGPU::DS_SUB_GS_REG_RTN || isGWS(Opcode);
4557}
4558
4559bool SIInstrInfo::mayAccessScratch(const MachineInstr &MI) const {
  // Instructions that access scratch use the FLAT or BUF encodings.
4561 if ((!isFLAT(MI) || isFLATGlobal(MI)) && !isBUF(MI))
4562 return false;
4563
4564 // SCRATCH instructions always access scratch.
4565 if (isFLATScratch(MI))
4566 return true;
4567
4568 // If FLAT_SCRATCH registers are not initialized, we can never access scratch
4569 // via the aperture.
4570 if (MI.getMF()->getFunction().hasFnAttribute(Kind: "amdgpu-no-flat-scratch-init"))
4571 return false;
4572
4573 // If there are no memory operands then conservatively assume the flat
4574 // operation may access scratch.
4575 if (MI.memoperands_empty())
4576 return true;
4577
4578 // See if any memory operand specifies an address space that involves scratch.
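  // For example (an IR-level sketch): a flat access whose memory operand
  // carries !noalias.addrspace metadata excluding the private address space
  // (5) is known not to touch scratch:
  //
  //   %v = load i32, ptr %p, !noalias.addrspace !0
  //   !0 = !{i32 5, i32 6}   ; address spaces in [5, 6) are excluded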
4579 return any_of(Range: MI.memoperands(), P: [](const MachineMemOperand *Memop) {
4580 unsigned AS = Memop->getAddrSpace();
4581 if (AS == AMDGPUAS::FLAT_ADDRESS) {
4582 const MDNode *MD = Memop->getAAInfo().NoAliasAddrSpace;
4583 return !MD || !AMDGPU::hasValueInRangeLikeMetadata(
4584 MD: *MD, Val: AMDGPUAS::PRIVATE_ADDRESS);
4585 }
4586 return AS == AMDGPUAS::PRIVATE_ADDRESS;
4587 });
4588}
4589
4590bool SIInstrInfo::mayAccessVMEMThroughFlat(const MachineInstr &MI) const {
4591 assert(isFLAT(MI));
4592
4593 // All flat instructions use the VMEM counter except prefetch.
4594 if (!usesVM_CNT(MI))
4595 return false;
4596
4597 // If there are no memory operands then conservatively assume the flat
4598 // operation may access VMEM.
4599 if (MI.memoperands_empty())
4600 return true;
4601
4602 // See if any memory operand specifies an address space that involves VMEM.
  // Flat operations support only FLAT, LOCAL (LDS), or address spaces
4604 // involving VMEM such as GLOBAL, CONSTANT, PRIVATE (SCRATCH), etc. The REGION
4605 // (GDS) address space is not supported by flat operations. Therefore, simply
4606 // return true unless only the LDS address space is found.
4607 for (const MachineMemOperand *Memop : MI.memoperands()) {
4608 unsigned AS = Memop->getAddrSpace();
4609 assert(AS != AMDGPUAS::REGION_ADDRESS);
4610 if (AS != AMDGPUAS::LOCAL_ADDRESS)
4611 return true;
4612 }
4613
4614 return false;
4615}
4616
4617bool SIInstrInfo::mayAccessLDSThroughFlat(const MachineInstr &MI) const {
4618 assert(isFLAT(MI));
4619
  // Flat instructions such as SCRATCH and GLOBAL do not use the LGKM counter.
4621 if (!usesLGKM_CNT(MI))
4622 return false;
4623
4624 // If in tgsplit mode then there can be no use of LDS.
4625 if (ST.isTgSplitEnabled())
4626 return false;
4627
4628 // If there are no memory operands then conservatively assume the flat
4629 // operation may access LDS.
4630 if (MI.memoperands_empty())
4631 return true;
4632
4633 // See if any memory operand specifies an address space that involves LDS.
4634 for (const MachineMemOperand *Memop : MI.memoperands()) {
4635 unsigned AS = Memop->getAddrSpace();
4636 if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::FLAT_ADDRESS)
4637 return true;
4638 }
4639
4640 return false;
4641}
4642
4643bool SIInstrInfo::modifiesModeRegister(const MachineInstr &MI) {
  // Skip the full operand and register alias search that modifiesRegister
  // does. Only a handful of instructions touch MODE, it is only ever an
  // implicit def, and it doesn't alias any other registers.
4647 return is_contained(Range: MI.getDesc().implicit_defs(), Element: AMDGPU::MODE);
4648}
4649
4650bool SIInstrInfo::hasUnwantedEffectsWhenEXECEmpty(const MachineInstr &MI) const {
4651 unsigned Opcode = MI.getOpcode();
4652
4653 if (MI.mayStore() && isSMRD(MI))
4654 return true; // scalar store or atomic
4655
4656 // This will terminate the function when other lanes may need to continue.
4657 if (MI.isReturn())
4658 return true;
4659
4660 // These instructions cause shader I/O that may cause hardware lockups
4661 // when executed with an empty EXEC mask.
4662 //
4663 // Note: exp with VM = DONE = 0 is automatically skipped by hardware when
4664 // EXEC = 0, but checking for that case here seems not worth it
4665 // given the typical code patterns.
4666 if (Opcode == AMDGPU::S_SENDMSG || Opcode == AMDGPU::S_SENDMSGHALT ||
4667 isEXP(Opcode) || Opcode == AMDGPU::DS_ORDERED_COUNT ||
4668 Opcode == AMDGPU::S_TRAP || Opcode == AMDGPU::S_WAIT_EVENT ||
4669 Opcode == AMDGPU::S_SETHALT)
4670 return true;
4671
4672 if (MI.isCall() || MI.isInlineAsm())
4673 return true; // conservative assumption
4674
4675 // Assume that barrier interactions are only intended with active lanes.
4676 if (isBarrier(Opcode))
4677 return true;
4678
4679 // A mode change is a scalar operation that influences vector instructions.
4680 if (modifiesModeRegister(MI))
4681 return true;
4682
4683 // These are like SALU instructions in terms of effects, so it's questionable
4684 // whether we should return true for those.
4685 //
4686 // However, executing them with EXEC = 0 causes them to operate on undefined
4687 // data, which we avoid by returning true here.
4688 if (Opcode == AMDGPU::V_READFIRSTLANE_B32 ||
4689 Opcode == AMDGPU::V_READLANE_B32 || Opcode == AMDGPU::V_WRITELANE_B32 ||
4690 Opcode == AMDGPU::SI_RESTORE_S32_FROM_VGPR ||
4691 Opcode == AMDGPU::SI_SPILL_S32_TO_VGPR)
4692 return true;
4693
4694 return false;
4695}
4696
4697bool SIInstrInfo::mayReadEXEC(const MachineRegisterInfo &MRI,
4698 const MachineInstr &MI) const {
4699 if (MI.isMetaInstruction())
4700 return false;
4701
4702 // This won't read exec if this is an SGPR->SGPR copy.
4703 if (MI.isCopyLike()) {
4704 if (!RI.isSGPRReg(MRI, Reg: MI.getOperand(i: 0).getReg()))
4705 return true;
4706
4707 // Make sure this isn't copying exec as a normal operand
4708 return MI.readsRegister(Reg: AMDGPU::EXEC, TRI: &RI);
4709 }
4710
4711 // Make a conservative assumption about the callee.
4712 if (MI.isCall())
4713 return true;
4714
4715 // Be conservative with any unhandled generic opcodes.
4716 if (!isTargetSpecificOpcode(Opcode: MI.getOpcode()))
4717 return true;
4718
4719 return !isSALU(MI) || MI.readsRegister(Reg: AMDGPU::EXEC, TRI: &RI);
4720}
4721
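// For reference (a summary, not a new rule): the 32-bit inline operands are
// the integers -16..64 and the floats +/-0.5, +/-1.0, +/-2.0, +/-4.0, plus
// 1/(2*pi) on subtargets with hasInv2PiInlineImm(). E.g. 0x3F800000 (1.0f)
// is inline, while 0x3F000001 requires a literal.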
4722bool SIInstrInfo::isInlineConstant(const APInt &Imm) const {
4723 switch (Imm.getBitWidth()) {
4724 case 1: // This likely will be a condition code mask.
4725 return true;
4726
4727 case 32:
4728 return AMDGPU::isInlinableLiteral32(Literal: Imm.getSExtValue(),
4729 HasInv2Pi: ST.hasInv2PiInlineImm());
4730 case 64:
4731 return AMDGPU::isInlinableLiteral64(Literal: Imm.getSExtValue(),
4732 HasInv2Pi: ST.hasInv2PiInlineImm());
4733 case 16:
4734 return ST.has16BitInsts() &&
4735 AMDGPU::isInlinableLiteralI16(Literal: Imm.getSExtValue(),
4736 HasInv2Pi: ST.hasInv2PiInlineImm());
4737 default:
4738 llvm_unreachable("invalid bitwidth");
4739 }
4740}
4741
4742bool SIInstrInfo::isInlineConstant(const APFloat &Imm) const {
4743 APInt IntImm = Imm.bitcastToAPInt();
4744 int64_t IntImmVal = IntImm.getSExtValue();
4745 bool HasInv2Pi = ST.hasInv2PiInlineImm();
4746 switch (APFloat::SemanticsToEnum(Sem: Imm.getSemantics())) {
4747 default:
4748 llvm_unreachable("invalid fltSemantics");
4749 case APFloatBase::S_IEEEsingle:
4750 case APFloatBase::S_IEEEdouble:
4751 return isInlineConstant(Imm: IntImm);
4752 case APFloatBase::S_BFloat:
4753 return ST.has16BitInsts() &&
4754 AMDGPU::isInlinableLiteralBF16(Literal: IntImmVal, HasInv2Pi);
4755 case APFloatBase::S_IEEEhalf:
4756 return ST.has16BitInsts() &&
4757 AMDGPU::isInlinableLiteralFP16(Literal: IntImmVal, HasInv2Pi);
4758 }
4759}
4760
4761bool SIInstrInfo::isInlineConstant(int64_t Imm, uint8_t OperandType) const {
4762 // MachineOperand provides no way to tell the true operand size, since it only
4763 // records a 64-bit value. We need to know the size to determine if a 32-bit
4764 // floating point immediate bit pattern is legal for an integer immediate. It
4765 // would be for any 32-bit integer operand, but would not be for a 64-bit one.
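  // E.g. 0x3F800000 (1.0f) is inline for a 32-bit operand, since it matches
  // an inline float pattern, but not for a 64-bit operand, where neither the
  // integer value 1065353216 nor any inline f64 bit pattern matches.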
4766 switch (OperandType) {
4767 case AMDGPU::OPERAND_REG_IMM_INT32:
4768 case AMDGPU::OPERAND_REG_IMM_FP32:
4769 case AMDGPU::OPERAND_REG_INLINE_C_INT32:
4770 case AMDGPU::OPERAND_REG_INLINE_C_FP32:
4771 case AMDGPU::OPERAND_REG_IMM_V2FP32:
4772 case AMDGPU::OPERAND_REG_IMM_V2INT32:
4773 case AMDGPU::OPERAND_REG_INLINE_AC_INT32:
4774 case AMDGPU::OPERAND_REG_INLINE_AC_FP32:
4775 case AMDGPU::OPERAND_INLINE_SPLIT_BARRIER_INT32: {
4776 int32_t Trunc = static_cast<int32_t>(Imm);
4777 return AMDGPU::isInlinableLiteral32(Literal: Trunc, HasInv2Pi: ST.hasInv2PiInlineImm());
4778 }
4779 case AMDGPU::OPERAND_REG_IMM_INT64:
4780 case AMDGPU::OPERAND_REG_IMM_FP64:
4781 case AMDGPU::OPERAND_REG_INLINE_C_INT64:
4782 case AMDGPU::OPERAND_REG_INLINE_C_FP64:
4783 case AMDGPU::OPERAND_REG_INLINE_AC_FP64:
4784 return AMDGPU::isInlinableLiteral64(Literal: Imm, HasInv2Pi: ST.hasInv2PiInlineImm());
4785 case AMDGPU::OPERAND_REG_IMM_INT16:
4786 case AMDGPU::OPERAND_REG_INLINE_C_INT16:
4787 // We would expect inline immediates to not be concerned with an integer/fp
4788 // distinction. However, in the case of 16-bit integer operations, the
    // "floating point" values appear to not work. The hardware seems to read
    // only the low 16 bits of 32-bit immediates, which happens to always work
    // for the integer values.
4792 //
4793 // See llvm bugzilla 46302.
4794 //
4795 // TODO: Theoretically we could use op-sel to use the high bits of the
4796 // 32-bit FP values.
4797 return AMDGPU::isInlinableIntLiteral(Literal: Imm);
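  // A packed v2i16 immediate is inline only when it is a splat of an
  // inlinable 16-bit integer: e.g. 0x00010001 (a splat of 1) is inline,
  // while 0x00020001 is not.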
4798 case AMDGPU::OPERAND_REG_IMM_V2INT16:
4799 case AMDGPU::OPERAND_REG_INLINE_C_V2INT16:
4800 return AMDGPU::isInlinableLiteralV2I16(Literal: Imm);
4801 case AMDGPU::OPERAND_REG_IMM_V2FP16:
4802 case AMDGPU::OPERAND_REG_INLINE_C_V2FP16:
4803 return AMDGPU::isInlinableLiteralV2F16(Literal: Imm);
4804 case AMDGPU::OPERAND_REG_IMM_V2FP16_SPLAT:
4805 return AMDGPU::isPKFMACF16InlineConstant(Literal: Imm, IsGFX11Plus: ST.isGFX11Plus());
4806 case AMDGPU::OPERAND_REG_IMM_V2BF16:
4807 case AMDGPU::OPERAND_REG_INLINE_C_V2BF16:
4808 return AMDGPU::isInlinableLiteralV2BF16(Literal: Imm);
4809 case AMDGPU::OPERAND_REG_IMM_NOINLINE_V2FP16:
4810 return false;
4811 case AMDGPU::OPERAND_REG_IMM_FP16:
4812 case AMDGPU::OPERAND_REG_INLINE_C_FP16: {
4813 if (isInt<16>(x: Imm) || isUInt<16>(x: Imm)) {
      // A few special-case instructions have 16-bit operands on subtargets
      // where 16-bit instructions are not legal.
      // TODO: Do the 32-bit immediates work? We shouldn't really need to
      // handle constants in these cases.
4818 int16_t Trunc = static_cast<int16_t>(Imm);
4819 return ST.has16BitInsts() &&
4820 AMDGPU::isInlinableLiteralFP16(Literal: Trunc, HasInv2Pi: ST.hasInv2PiInlineImm());
4821 }
4822
4823 return false;
4824 }
4825 case AMDGPU::OPERAND_REG_IMM_BF16:
4826 case AMDGPU::OPERAND_REG_INLINE_C_BF16: {
4827 if (isInt<16>(x: Imm) || isUInt<16>(x: Imm)) {
4828 int16_t Trunc = static_cast<int16_t>(Imm);
4829 return ST.has16BitInsts() &&
4830 AMDGPU::isInlinableLiteralBF16(Literal: Trunc, HasInv2Pi: ST.hasInv2PiInlineImm());
4831 }
4832 return false;
4833 }
4834 case AMDGPU::OPERAND_KIMM32:
4835 case AMDGPU::OPERAND_KIMM16:
4836 case AMDGPU::OPERAND_KIMM64:
4837 return false;
4838 case AMDGPU::OPERAND_INLINE_C_AV64_PSEUDO:
4839 return isLegalAV64PseudoImm(Imm);
4840 case AMDGPU::OPERAND_INPUT_MODS:
4841 case MCOI::OPERAND_IMMEDIATE:
4842 // Always embedded in the instruction for free.
4843 return true;
4844 case MCOI::OPERAND_UNKNOWN:
4845 case MCOI::OPERAND_REGISTER:
4846 case MCOI::OPERAND_PCREL:
4847 case MCOI::OPERAND_GENERIC_0:
4848 case MCOI::OPERAND_GENERIC_1:
4849 case MCOI::OPERAND_GENERIC_2:
4850 case MCOI::OPERAND_GENERIC_3:
4851 case MCOI::OPERAND_GENERIC_4:
4852 case MCOI::OPERAND_GENERIC_5:
4853 // Just ignore anything else.
4854 return true;
4855 default:
4856 llvm_unreachable("invalid operand type");
4857 }
4858}
4859
4860static bool compareMachineOp(const MachineOperand &Op0,
4861 const MachineOperand &Op1) {
4862 if (Op0.getType() != Op1.getType())
4863 return false;
4864
4865 switch (Op0.getType()) {
4866 case MachineOperand::MO_Register:
4867 return Op0.getReg() == Op1.getReg();
4868 case MachineOperand::MO_Immediate:
4869 return Op0.getImm() == Op1.getImm();
4870 default:
4871 llvm_unreachable("Didn't expect to be comparing these operand types");
4872 }
4873}
4874
4875bool SIInstrInfo::isLiteralOperandLegal(const MCInstrDesc &InstDesc,
4876 const MCOperandInfo &OpInfo) const {
4877 if (OpInfo.OperandType == MCOI::OPERAND_IMMEDIATE)
4878 return true;
4879
4880 if (!RI.opCanUseLiteralConstant(OpType: OpInfo.OperandType))
4881 return false;
4882
4883 if (!isVOP3(Desc: InstDesc) || !AMDGPU::isSISrcOperand(OpInfo))
4884 return true;
4885
4886 return ST.hasVOP3Literal();
4887}
4888
4889bool SIInstrInfo::isImmOperandLegal(const MCInstrDesc &InstDesc, unsigned OpNo,
4890 int64_t ImmVal) const {
4891 const MCOperandInfo &OpInfo = InstDesc.operands()[OpNo];
4892 if (isInlineConstant(Imm: ImmVal, OperandType: OpInfo.OperandType)) {
4893 if (isMAI(Desc: InstDesc) && ST.hasMFMAInlineLiteralBug() &&
4894 OpNo == (unsigned)AMDGPU::getNamedOperandIdx(Opcode: InstDesc.getOpcode(),
4895 Name: AMDGPU::OpName::src2))
4896 return false;
4897 return RI.opCanUseInlineConstant(OpType: OpInfo.OperandType);
4898 }
4899
4900 return isLiteralOperandLegal(InstDesc, OpInfo);
4901}
4902
4903bool SIInstrInfo::isImmOperandLegal(const MCInstrDesc &InstDesc, unsigned OpNo,
4904 const MachineOperand &MO) const {
4905 if (MO.isImm())
4906 return isImmOperandLegal(InstDesc, OpNo, ImmVal: MO.getImm());
4907
4908 assert((MO.isTargetIndex() || MO.isFI() || MO.isGlobal()) &&
4909 "unexpected imm-like operand kind");
4910 const MCOperandInfo &OpInfo = InstDesc.operands()[OpNo];
4911 return isLiteralOperandLegal(InstDesc, OpInfo);
4912}
4913
4914bool SIInstrInfo::isLegalAV64PseudoImm(uint64_t Imm) const {
  // Two 32-bit inline constants packed into one 64-bit value.
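  // For example, 0x3F80000000000000 (hi = 1.0f, lo = 0) is legal, while
  // 0x0000004100000000 (hi = 65, outside the inline range) is not.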
4916 return AMDGPU::isInlinableLiteral32(Literal: Lo_32(Value: Imm), HasInv2Pi: ST.hasInv2PiInlineImm()) &&
4917 AMDGPU::isInlinableLiteral32(Literal: Hi_32(Value: Imm), HasInv2Pi: ST.hasInv2PiInlineImm());
4918}
4919
4920bool SIInstrInfo::hasVALU32BitEncoding(unsigned Opcode) const {
4921 // GFX90A does not have V_MUL_LEGACY_F32_e32.
4922 if (Opcode == AMDGPU::V_MUL_LEGACY_F32_e64 && ST.hasGFX90AInsts())
4923 return false;
4924
4925 int Op32 = AMDGPU::getVOPe32(Opcode);
4926 if (Op32 == -1)
4927 return false;
4928
4929 return pseudoToMCOpcode(Opcode: Op32) != -1;
4930}
4931
4932bool SIInstrInfo::hasModifiers(unsigned Opcode) const {
  // The src0_modifiers operand is present on all instructions
  // that have modifiers.
4935
4936 return AMDGPU::hasNamedOperand(Opcode, NamedIdx: AMDGPU::OpName::src0_modifiers);
4937}
4938
4939bool SIInstrInfo::hasModifiersSet(const MachineInstr &MI,
4940 AMDGPU::OpName OpName) const {
4941 const MachineOperand *Mods = getNamedOperand(MI, OperandName: OpName);
4942 return Mods && Mods->getImm();
4943}
4944
4945bool SIInstrInfo::hasAnyModifiersSet(const MachineInstr &MI) const {
4946 return any_of(Range: ModifierOpNames,
4947 P: [&](AMDGPU::OpName Name) { return hasModifiersSet(MI, OpName: Name); });
4948}
4949
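// A rough sketch of what canShrink accepts (illustrative operands):
//
//   v_add_f32_e64 v0, v1, v2          ; no modifiers -> can shrink to e32
//   v_add_f32_e64 v0, v1, s2          ; SGPR src1    -> cannot (e32 needs a
//                                     ;                 VGPR src1)
//   v_add_f32_e64 v0, |v1|, v2 clamp  ; modifiers    -> cannot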
4950bool SIInstrInfo::canShrink(const MachineInstr &MI,
4951 const MachineRegisterInfo &MRI) const {
4952 const MachineOperand *Src2 = getNamedOperand(MI, OperandName: AMDGPU::OpName::src2);
  // Can't shrink instructions with three operands.
4954 if (Src2) {
4955 switch (MI.getOpcode()) {
4956 default: return false;
4957
4958 case AMDGPU::V_ADDC_U32_e64:
4959 case AMDGPU::V_SUBB_U32_e64:
4960 case AMDGPU::V_SUBBREV_U32_e64: {
4961 const MachineOperand *Src1
4962 = getNamedOperand(MI, OperandName: AMDGPU::OpName::src1);
4963 if (!Src1->isReg() || !RI.isVGPR(MRI, Reg: Src1->getReg()))
4964 return false;
4965 // Additional verification is needed for sdst/src2.
4966 return true;
4967 }
4968 case AMDGPU::V_MAC_F16_e64:
4969 case AMDGPU::V_MAC_F32_e64:
4970 case AMDGPU::V_MAC_LEGACY_F32_e64:
4971 case AMDGPU::V_FMAC_F16_e64:
4972 case AMDGPU::V_FMAC_F16_t16_e64:
4973 case AMDGPU::V_FMAC_F16_fake16_e64:
4974 case AMDGPU::V_FMAC_F32_e64:
4975 case AMDGPU::V_FMAC_F64_e64:
4976 case AMDGPU::V_FMAC_LEGACY_F32_e64:
4977 if (!Src2->isReg() || !RI.isVGPR(MRI, Reg: Src2->getReg()) ||
4978 hasModifiersSet(MI, OpName: AMDGPU::OpName::src2_modifiers))
4979 return false;
4980 break;
4981
4982 case AMDGPU::V_CNDMASK_B32_e64:
4983 break;
4984 }
4985 }
4986
4987 const MachineOperand *Src1 = getNamedOperand(MI, OperandName: AMDGPU::OpName::src1);
4988 if (Src1 && (!Src1->isReg() || !RI.isVGPR(MRI, Reg: Src1->getReg()) ||
4989 hasModifiersSet(MI, OpName: AMDGPU::OpName::src1_modifiers)))
4990 return false;
4991
  // We don't need to check src0 since all input types are legal there; just
  // make sure src0 isn't using any modifiers.
4994 if (hasModifiersSet(MI, OpName: AMDGPU::OpName::src0_modifiers))
4995 return false;
4996
  // Can it be shrunk to a valid 32-bit opcode?
4998 if (!hasVALU32BitEncoding(Opcode: MI.getOpcode()))
4999 return false;
5000
5001 // Check output modifiers
5002 return !hasModifiersSet(MI, OpName: AMDGPU::OpName::omod) &&
5003 !hasModifiersSet(MI, OpName: AMDGPU::OpName::clamp) &&
5004 !hasModifiersSet(MI, OpName: AMDGPU::OpName::byte_sel) &&
5005 // TODO: Can we avoid checking bound_ctrl/fi here?
5006 // They are only used by permlane*_swap special case.
5007 !hasModifiersSet(MI, OpName: AMDGPU::OpName::bound_ctrl) &&
5008 !hasModifiersSet(MI, OpName: AMDGPU::OpName::fi);
5009}
5010
// Copy the undef/kill flags from \p Orig onto the implicit VCC use operand,
// keeping the operand implicit.
5013static void copyFlagsToImplicitVCC(MachineInstr &MI,
5014 const MachineOperand &Orig) {
5016 for (MachineOperand &Use : MI.implicit_operands()) {
5017 if (Use.isUse() &&
5018 (Use.getReg() == AMDGPU::VCC || Use.getReg() == AMDGPU::VCC_LO)) {
5019 Use.setIsUndef(Orig.isUndef());
5020 Use.setIsKill(Orig.isKill());
5021 return;
5022 }
5023 }
5024}
5025
5026MachineInstr *SIInstrInfo::buildShrunkInst(MachineInstr &MI,
5027 unsigned Op32) const {
5028 MachineBasicBlock *MBB = MI.getParent();
5029
5030 const MCInstrDesc &Op32Desc = get(Opcode: Op32);
5031 MachineInstrBuilder Inst32 =
5032 BuildMI(BB&: *MBB, I&: MI, MIMD: MI.getDebugLoc(), MCID: Op32Desc)
5033 .setMIFlags(MI.getFlags());
5034
5035 // Add the dst operand if the 32-bit encoding also has an explicit $vdst.
5036 // For VOPC instructions, this is replaced by an implicit def of vcc.
5037
5038 // We assume the defs of the shrunk opcode are in the same order, and the
5039 // shrunk opcode loses the last def (SGPR def, in the VOP3->VOPC case).
5040 for (int I = 0, E = Op32Desc.getNumDefs(); I != E; ++I)
5041 Inst32.add(MO: MI.getOperand(i: I));
5042
5043 const MachineOperand *Src2 = getNamedOperand(MI, OperandName: AMDGPU::OpName::src2);
5044
5045 int Idx = MI.getNumExplicitDefs();
5046 for (const MachineOperand &Use : MI.explicit_uses()) {
5047 int OpTy = MI.getDesc().operands()[Idx++].OperandType;
5048 if (OpTy == AMDGPU::OPERAND_INPUT_MODS || OpTy == MCOI::OPERAND_IMMEDIATE)
5049 continue;
5050
5051 if (&Use == Src2) {
5052 if (AMDGPU::getNamedOperandIdx(Opcode: Op32, Name: AMDGPU::OpName::src2) == -1) {
5053 // In the case of V_CNDMASK_B32_e32, the explicit operand src2 is
5054 // replaced with an implicit read of vcc or vcc_lo. The implicit read
5055 // of vcc was already added during the initial BuildMI, but we
5056 // 1) may need to change vcc to vcc_lo to preserve the original register
5057 // 2) have to preserve the original flags.
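        // For example (a sketch):
        //   v_cndmask_b32_e64 v0, v1, v2, vcc
        //     --> v_cndmask_b32_e32 v0, v1, v2  ; vcc becomes an implicit use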
5058 copyFlagsToImplicitVCC(MI&: *Inst32, Orig: *Src2);
5059 continue;
5060 }
5061 }
5062
5063 Inst32.add(MO: Use);
5064 }
5065
5066 // FIXME: Losing implicit operands
5067 fixImplicitOperands(MI&: *Inst32);
5068 return Inst32;
5069}
5070
5071bool SIInstrInfo::physRegUsesConstantBus(const MachineOperand &RegOp) const {
5072 // Null is free
5073 Register Reg = RegOp.getReg();
5074 if (Reg == AMDGPU::SGPR_NULL || Reg == AMDGPU::SGPR_NULL64)
5075 return false;
5076
5079 // FIXME: implicit registers that are not part of the MCInstrDesc's implicit
5080 // physical register operands should also count, except for exec.
5081 if (RegOp.isImplicit())
5082 return Reg == AMDGPU::VCC || Reg == AMDGPU::VCC_LO || Reg == AMDGPU::M0;
5083
5084 // SGPRs use the constant bus
5085 return AMDGPU::SReg_32RegClass.contains(Reg) ||
5086 AMDGPU::SReg_64RegClass.contains(Reg);
5087}
5088
5089bool SIInstrInfo::regUsesConstantBus(const MachineOperand &RegOp,
5090 const MachineRegisterInfo &MRI) const {
5091 Register Reg = RegOp.getReg();
5092 return Reg.isVirtual() ? RI.isSGPRClass(RC: MRI.getRegClass(Reg))
5093 : physRegUsesConstantBus(RegOp);
5094}
5095
5096bool SIInstrInfo::usesConstantBus(const MachineRegisterInfo &MRI,
5097 const MachineOperand &MO,
5098 const MCOperandInfo &OpInfo) const {
5099 // Literal constants use the constant bus.
5100 if (!MO.isReg())
5101 return !isInlineConstant(MO, OpInfo);
5102
5103 Register Reg = MO.getReg();
5104 return Reg.isVirtual() ? RI.isSGPRClass(RC: MRI.getRegClass(Reg))
5105 : physRegUsesConstantBus(RegOp: MO);
5106}
5107
5108static Register findImplicitSGPRRead(const MachineInstr &MI) {
5109 for (const MachineOperand &MO : MI.implicit_operands()) {
5110 // We only care about reads.
5111 if (MO.isDef())
5112 continue;
5113
5114 switch (MO.getReg()) {
5115 case AMDGPU::VCC:
5116 case AMDGPU::VCC_LO:
5117 case AMDGPU::VCC_HI:
5118 case AMDGPU::M0:
5119 case AMDGPU::FLAT_SCR:
5120 return MO.getReg();
5121
5122 default:
5123 break;
5124 }
5125 }
5126
5127 return Register();
5128}
5129
5130static bool shouldReadExec(const MachineInstr &MI) {
5131 if (SIInstrInfo::isVALU(MI)) {
5132 switch (MI.getOpcode()) {
5133 case AMDGPU::V_READLANE_B32:
5134 case AMDGPU::SI_RESTORE_S32_FROM_VGPR:
5135 case AMDGPU::V_WRITELANE_B32:
5136 case AMDGPU::SI_SPILL_S32_TO_VGPR:
5137 return false;
5138 }
5139
5140 return true;
5141 }
5142
5143 if (MI.isPreISelOpcode() ||
5144 SIInstrInfo::isGenericOpcode(Opc: MI.getOpcode()) ||
5145 SIInstrInfo::isSALU(MI) ||
5146 SIInstrInfo::isSMRD(MI))
5147 return false;
5148
5149 return true;
5150}
5151
5152static bool isRegOrFI(const MachineOperand &MO) {
5153 return MO.isReg() || MO.isFI();
5154}
5155
5156static bool isSubRegOf(const SIRegisterInfo &TRI,
5157 const MachineOperand &SuperVec,
5158 const MachineOperand &SubReg) {
5159 if (SubReg.getReg().isPhysical())
5160 return TRI.isSubRegister(RegA: SuperVec.getReg(), RegB: SubReg.getReg());
5161
5162 return SubReg.getSubReg() != AMDGPU::NoSubRegister &&
5163 SubReg.getReg() == SuperVec.getReg();
5164}
5165
// Verify that a generic COPY opcode does not illegally copy a vector register
// to an SGPR.
5167bool SIInstrInfo::verifyCopy(const MachineInstr &MI,
5168 const MachineRegisterInfo &MRI,
5169 StringRef &ErrInfo) const {
5170 Register DstReg = MI.getOperand(i: 0).getReg();
5171 Register SrcReg = MI.getOperand(i: 1).getReg();
  // Check for a copy from a vector register to an SGPR.
5173 if (RI.isVectorRegister(MRI, Reg: SrcReg) && RI.isSGPRReg(MRI, Reg: DstReg)) {
5174 ErrInfo = "illegal copy from vector register to SGPR";
5175 return false;
5176 }
5177 return true;
5178}
5179
5180bool SIInstrInfo::verifyInstruction(const MachineInstr &MI,
5181 StringRef &ErrInfo) const {
5182 uint32_t Opcode = MI.getOpcode();
5183 const MachineFunction *MF = MI.getMF();
5184 const MachineRegisterInfo &MRI = MF->getRegInfo();
5185
  // FIXME: At this point the COPY verification is done only for non-SSA forms.
  // Find a better property to recognize the point where instruction selection
  // has just finished.
  // We can only enforce this check after the SIFixSGPRCopies pass, so that the
  // illegal copies have been legalized and we don't expect any later pass to
  // insert similar copies.
5192 if (!MRI.isSSA() && MI.isCopy())
5193 return verifyCopy(MI, MRI, ErrInfo);
5194
5195 if (SIInstrInfo::isGenericOpcode(Opc: Opcode))
5196 return true;
5197
5198 int Src0Idx = AMDGPU::getNamedOperandIdx(Opcode, Name: AMDGPU::OpName::src0);
5199 int Src1Idx = AMDGPU::getNamedOperandIdx(Opcode, Name: AMDGPU::OpName::src1);
5200 int Src2Idx = AMDGPU::getNamedOperandIdx(Opcode, Name: AMDGPU::OpName::src2);
5201 int Src3Idx = -1;
5202 if (Src0Idx == -1) {
5203 // VOPD V_DUAL_* instructions use different operand names.
5204 Src0Idx = AMDGPU::getNamedOperandIdx(Opcode, Name: AMDGPU::OpName::src0X);
5205 Src1Idx = AMDGPU::getNamedOperandIdx(Opcode, Name: AMDGPU::OpName::vsrc1X);
5206 Src2Idx = AMDGPU::getNamedOperandIdx(Opcode, Name: AMDGPU::OpName::src0Y);
5207 Src3Idx = AMDGPU::getNamedOperandIdx(Opcode, Name: AMDGPU::OpName::vsrc1Y);
5208 }
5209
5210 // Make sure the number of operands is correct.
5211 const MCInstrDesc &Desc = get(Opcode);
5212 if (!Desc.isVariadic() &&
5213 Desc.getNumOperands() != MI.getNumExplicitOperands()) {
5214 ErrInfo = "Instruction has wrong number of operands.";
5215 return false;
5216 }
5217
5218 if (MI.isInlineAsm()) {
5219 // Verify register classes for inlineasm constraints.
5220 for (unsigned I = InlineAsm::MIOp_FirstOperand, E = MI.getNumOperands();
5221 I != E; ++I) {
5222 const TargetRegisterClass *RC = MI.getRegClassConstraint(OpIdx: I, TII: this, TRI: &RI);
5223 if (!RC)
5224 continue;
5225
5226 const MachineOperand &Op = MI.getOperand(i: I);
5227 if (!Op.isReg())
5228 continue;
5229
5230 Register Reg = Op.getReg();
5231 if (!Reg.isVirtual() && !RC->contains(Reg)) {
5232 ErrInfo = "inlineasm operand has incorrect register class.";
5233 return false;
5234 }
5235 }
5236
5237 return true;
5238 }
5239
5240 if (isImage(MI) && MI.memoperands_empty() && MI.mayLoadOrStore()) {
5241 ErrInfo = "missing memory operand from image instruction.";
5242 return false;
5243 }
5244
5245 // Make sure the register classes are correct.
5246 for (int i = 0, e = Desc.getNumOperands(); i != e; ++i) {
5247 const MachineOperand &MO = MI.getOperand(i);
5248 if (MO.isFPImm()) {
5249 ErrInfo = "FPImm Machine Operands are not supported. ISel should bitcast "
5250 "all fp values to integers.";
5251 return false;
5252 }
5253
5254 const MCOperandInfo &OpInfo = Desc.operands()[i];
5255 int16_t RegClass = getOpRegClassID(OpInfo);
5256
5257 switch (OpInfo.OperandType) {
5258 case MCOI::OPERAND_REGISTER:
5259 if (MI.getOperand(i).isImm() || MI.getOperand(i).isGlobal()) {
5260 ErrInfo = "Illegal immediate value for operand.";
5261 return false;
5262 }
5263 break;
5264 case AMDGPU::OPERAND_REG_IMM_INT32:
5265 case AMDGPU::OPERAND_REG_IMM_INT64:
5266 case AMDGPU::OPERAND_REG_IMM_INT16:
5267 case AMDGPU::OPERAND_REG_IMM_FP32:
5268 case AMDGPU::OPERAND_REG_IMM_V2FP32:
5269 case AMDGPU::OPERAND_REG_IMM_BF16:
5270 case AMDGPU::OPERAND_REG_IMM_FP16:
5271 case AMDGPU::OPERAND_REG_IMM_FP64:
5272 case AMDGPU::OPERAND_REG_IMM_V2FP16:
5273 case AMDGPU::OPERAND_REG_IMM_V2FP16_SPLAT:
5274 case AMDGPU::OPERAND_REG_IMM_V2INT16:
5275 case AMDGPU::OPERAND_REG_IMM_V2INT32:
5276 case AMDGPU::OPERAND_REG_IMM_V2BF16:
5277 break;
5278 case AMDGPU::OPERAND_REG_IMM_NOINLINE_V2FP16:
      break;
5281 case AMDGPU::OPERAND_REG_INLINE_C_INT16:
5282 case AMDGPU::OPERAND_REG_INLINE_C_INT32:
5283 case AMDGPU::OPERAND_REG_INLINE_C_INT64:
5284 case AMDGPU::OPERAND_REG_INLINE_C_FP32:
5285 case AMDGPU::OPERAND_REG_INLINE_C_FP64:
5286 case AMDGPU::OPERAND_REG_INLINE_C_BF16:
5287 case AMDGPU::OPERAND_REG_INLINE_C_FP16:
5288 case AMDGPU::OPERAND_REG_INLINE_C_V2INT16:
5289 case AMDGPU::OPERAND_REG_INLINE_C_V2BF16:
5290 case AMDGPU::OPERAND_REG_INLINE_C_V2FP16:
5291 case AMDGPU::OPERAND_REG_INLINE_AC_INT32:
5292 case AMDGPU::OPERAND_REG_INLINE_AC_FP32:
5293 case AMDGPU::OPERAND_REG_INLINE_AC_FP64: {
5294 if (!MO.isReg() && (!MO.isImm() || !isInlineConstant(MI, OpIdx: i))) {
5295 ErrInfo = "Illegal immediate value for operand.";
5296 return false;
5297 }
5298 break;
5299 }
5300 case AMDGPU::OPERAND_INLINE_SPLIT_BARRIER_INT32:
5301 case AMDGPU::OPERAND_INPUT_MODS:
5302 if (!MI.getOperand(i).isImm() || !isInlineConstant(MI, OpIdx: i)) {
5303 ErrInfo = "Expected inline constant for operand.";
5304 return false;
5305 }
5306 break;
5307 case AMDGPU::OPERAND_SDWA_VOPC_DST:
5308 case AMDGPU::OPERAND_KIMM16:
5309 break;
5310 case MCOI::OPERAND_IMMEDIATE:
5311 case AMDGPU::OPERAND_KIMM32:
5312 case AMDGPU::OPERAND_KIMM64:
5313 case AMDGPU::OPERAND_INLINE_C_AV64_PSEUDO:
5314 // Check if this operand is an immediate.
5315 // FrameIndex operands will be replaced by immediates, so they are
5316 // allowed.
5317 if (!MI.getOperand(i).isImm() && !MI.getOperand(i).isFI()) {
5318 ErrInfo = "Expected immediate, but got non-immediate";
5319 return false;
5320 }
5321 break;
5322 case MCOI::OPERAND_UNKNOWN:
5323 case MCOI::OPERAND_MEMORY:
5324 case MCOI::OPERAND_PCREL:
5325 break;
5326 default:
5327 if (OpInfo.isGenericType())
5328 continue;
5329 break;
5330 }
5331
5332 if (!MO.isReg())
5333 continue;
5334 Register Reg = MO.getReg();
5335 if (!Reg)
5336 continue;
5337
5338 // FIXME: Ideally we would have separate instruction definitions with the
5339 // aligned register constraint.
5340 // FIXME: We do not verify inline asm operands, but custom inline asm
5341 // verification is broken anyway
5342 if (ST.needsAlignedVGPRs() && Opcode != AMDGPU::AV_MOV_B64_IMM_PSEUDO &&
5343 Opcode != AMDGPU::V_MOV_B64_PSEUDO && !isSpill(MI)) {
5344 const TargetRegisterClass *RC = RI.getRegClassForReg(MRI, Reg);
5345 if (RI.hasVectorRegisters(RC) && MO.getSubReg()) {
5346 if (const TargetRegisterClass *SubRC =
5347 RI.getSubRegisterClass(RC, MO.getSubReg())) {
5348 RC = RI.getCompatibleSubRegClass(SuperRC: RC, SubRC, SubIdx: MO.getSubReg());
5349 if (RC)
5350 RC = SubRC;
5351 }
5352 }
5353
5354 // Check that this is the aligned version of the class.
5355 if (!RC || !RI.isProperlyAlignedRC(RC: *RC)) {
5356 ErrInfo = "Subtarget requires even aligned vector registers";
5357 return false;
5358 }
5359 }
5360
5361 if (RegClass != -1) {
5362 if (Reg.isVirtual())
5363 continue;
5364
5365 const TargetRegisterClass *RC = RI.getRegClass(i: RegClass);
5366 if (!RC->contains(Reg)) {
5367 ErrInfo = "Operand has incorrect register class.";
5368 return false;
5369 }
5370 }
5371 }
5372
5373 // Verify SDWA
5374 if (isSDWA(MI)) {
5375 if (!ST.hasSDWA()) {
5376 ErrInfo = "SDWA is not supported on this target";
5377 return false;
5378 }
5379
5380 for (auto Op : {AMDGPU::OpName::src0_sel, AMDGPU::OpName::src1_sel,
5381 AMDGPU::OpName::dst_sel}) {
5382 const MachineOperand *MO = getNamedOperand(MI, OperandName: Op);
5383 if (!MO)
5384 continue;
5385 int64_t Imm = MO->getImm();
5386 if (Imm < 0 || Imm > AMDGPU::SDWA::SdwaSel::DWORD) {
5387 ErrInfo = "Invalid SDWA selection";
5388 return false;
5389 }
5390 }
5391
5392 int DstIdx = AMDGPU::getNamedOperandIdx(Opcode, Name: AMDGPU::OpName::vdst);
5393
5394 for (int OpIdx : {DstIdx, Src0Idx, Src1Idx, Src2Idx}) {
5395 if (OpIdx == -1)
5396 continue;
5397 const MachineOperand &MO = MI.getOperand(i: OpIdx);
5398
5399 if (!ST.hasSDWAScalar()) {
        // Only VGPRs on VI
5401 if (!MO.isReg() || !RI.hasVGPRs(RC: RI.getRegClassForReg(MRI, Reg: MO.getReg()))) {
5402 ErrInfo = "Only VGPRs allowed as operands in SDWA instructions on VI";
5403 return false;
5404 }
5405 } else {
5406 // No immediates on GFX9
5407 if (!MO.isReg()) {
5408 ErrInfo =
5409 "Only reg allowed as operands in SDWA instructions on GFX9+";
5410 return false;
5411 }
5412 }
5413 }
5414
5415 if (!ST.hasSDWAOmod()) {
5416 // No omod allowed on VI
5417 const MachineOperand *OMod = getNamedOperand(MI, OperandName: AMDGPU::OpName::omod);
5418 if (OMod != nullptr &&
5419 (!OMod->isImm() || OMod->getImm() != 0)) {
5420 ErrInfo = "OMod not allowed in SDWA instructions on VI";
5421 return false;
5422 }
5423 }
5424
5425 if (Opcode == AMDGPU::V_CVT_F32_FP8_sdwa ||
5426 Opcode == AMDGPU::V_CVT_F32_BF8_sdwa ||
5427 Opcode == AMDGPU::V_CVT_PK_F32_FP8_sdwa ||
5428 Opcode == AMDGPU::V_CVT_PK_F32_BF8_sdwa) {
5429 const MachineOperand *Src0ModsMO =
5430 getNamedOperand(MI, OperandName: AMDGPU::OpName::src0_modifiers);
5431 unsigned Mods = Src0ModsMO->getImm();
5432 if (Mods & SISrcMods::ABS || Mods & SISrcMods::NEG ||
5433 Mods & SISrcMods::SEXT) {
5434 ErrInfo = "sext, abs and neg are not allowed on this instruction";
5435 return false;
5436 }
5437 }
5438
5439 uint32_t BasicOpcode = AMDGPU::getBasicFromSDWAOp(Opcode);
5440 if (isVOPC(Opcode: BasicOpcode)) {
5441 if (!ST.hasSDWASdst() && DstIdx != -1) {
5442 // Only vcc allowed as dst on VI for VOPC
5443 const MachineOperand &Dst = MI.getOperand(i: DstIdx);
5444 if (!Dst.isReg() || Dst.getReg() != AMDGPU::VCC) {
5445 ErrInfo = "Only VCC allowed as dst in SDWA instructions on VI";
5446 return false;
5447 }
5448 } else if (!ST.hasSDWAOutModsVOPC()) {
5449 // No clamp allowed on GFX9 for VOPC
5450 const MachineOperand *Clamp = getNamedOperand(MI, OperandName: AMDGPU::OpName::clamp);
5451 if (Clamp && (!Clamp->isImm() || Clamp->getImm() != 0)) {
5452 ErrInfo = "Clamp not allowed in VOPC SDWA instructions on VI";
5453 return false;
5454 }
5455
5456 // No omod allowed on GFX9 for VOPC
5457 const MachineOperand *OMod = getNamedOperand(MI, OperandName: AMDGPU::OpName::omod);
5458 if (OMod && (!OMod->isImm() || OMod->getImm() != 0)) {
5459 ErrInfo = "OMod not allowed in VOPC SDWA instructions on VI";
5460 return false;
5461 }
5462 }
5463 }
5464
5465 const MachineOperand *DstUnused = getNamedOperand(MI, OperandName: AMDGPU::OpName::dst_unused);
5466 if (DstUnused && DstUnused->isImm() &&
5467 DstUnused->getImm() == AMDGPU::SDWA::UNUSED_PRESERVE) {
5468 const MachineOperand &Dst = MI.getOperand(i: DstIdx);
5469 if (!Dst.isReg() || !Dst.isTied()) {
5470 ErrInfo = "Dst register should have tied register";
5471 return false;
5472 }
5473
5474 const MachineOperand &TiedMO =
5475 MI.getOperand(i: MI.findTiedOperandIdx(OpIdx: DstIdx));
5476 if (!TiedMO.isReg() || !TiedMO.isImplicit() || !TiedMO.isUse()) {
5477 ErrInfo =
5478 "Dst register should be tied to implicit use of preserved register";
5479 return false;
5480 }
5481 if (TiedMO.getReg().isPhysical() && Dst.getReg() != TiedMO.getReg()) {
5482 ErrInfo = "Dst register should use same physical register as preserved";
5483 return false;
5484 }
5485 }
5486 }
5487
5488 // Verify MIMG / VIMAGE / VSAMPLE
5489 if (isImage(Opcode) && !MI.mayStore()) {
    // Ensure that the return type used is large enough for all the options
    // being used; TFE/LWE require an extra result register.
5492 const MachineOperand *DMask = getNamedOperand(MI, OperandName: AMDGPU::OpName::dmask);
5493 if (DMask) {
5494 uint64_t DMaskImm = DMask->getImm();
5495 uint32_t RegCount = isGather4(Opcode) ? 4 : llvm::popcount(Value: DMaskImm);
5496 const MachineOperand *TFE = getNamedOperand(MI, OperandName: AMDGPU::OpName::tfe);
5497 const MachineOperand *LWE = getNamedOperand(MI, OperandName: AMDGPU::OpName::lwe);
5498 const MachineOperand *D16 = getNamedOperand(MI, OperandName: AMDGPU::OpName::d16);
5499
      // Adjust for packed 16-bit values
5501 if (D16 && D16->getImm() && !ST.hasUnpackedD16VMem())
5502 RegCount = divideCeil(Numerator: RegCount, Denominator: 2);
5503
5504 // Adjust if using LWE or TFE
5505 if ((LWE && LWE->getImm()) || (TFE && TFE->getImm()))
5506 RegCount += 1;
5507
5508 const uint32_t DstIdx =
5509 AMDGPU::getNamedOperandIdx(Opcode, Name: AMDGPU::OpName::vdata);
5510 const MachineOperand &Dst = MI.getOperand(i: DstIdx);
5511 if (Dst.isReg()) {
5512 const TargetRegisterClass *DstRC = getOpRegClass(MI, OpNo: DstIdx);
5513 uint32_t DstSize = RI.getRegSizeInBits(RC: *DstRC) / 32;
5514 if (RegCount > DstSize) {
5515 ErrInfo = "Image instruction returns too many registers for dst "
5516 "register class";
5517 return false;
5518 }
5519 }
5520 }
5521 }
5522
5523 // Verify VOP*. Ignore multiple sgpr operands on writelane.
5524 if (isVALU(MI) && Desc.getOpcode() != AMDGPU::V_WRITELANE_B32) {
5525 unsigned ConstantBusCount = 0;
5526 bool UsesLiteral = false;
5527 const MachineOperand *LiteralVal = nullptr;
5528
5529 int ImmIdx = AMDGPU::getNamedOperandIdx(Opcode, Name: AMDGPU::OpName::imm);
5530 if (ImmIdx != -1) {
5531 ++ConstantBusCount;
5532 UsesLiteral = true;
5533 LiteralVal = &MI.getOperand(i: ImmIdx);
5534 }
5535
5536 SmallVector<Register, 2> SGPRsUsed;
5537 Register SGPRUsed;
5538
5539 // Only look at the true operands. Only a real operand can use the constant
5540 // bus, and we don't want to check pseudo-operands like the source modifier
5541 // flags.
5542 for (int OpIdx : {Src0Idx, Src1Idx, Src2Idx, Src3Idx}) {
5543 if (OpIdx == -1)
5544 continue;
5545 const MachineOperand &MO = MI.getOperand(i: OpIdx);
5546 if (usesConstantBus(MRI, MO, OpInfo: MI.getDesc().operands()[OpIdx])) {
5547 if (MO.isReg()) {
5548 SGPRUsed = MO.getReg();
5549 if (!llvm::is_contained(Range&: SGPRsUsed, Element: SGPRUsed)) {
5550 ++ConstantBusCount;
5551 SGPRsUsed.push_back(Elt: SGPRUsed);
5552 }
5553 } else if (!MO.isFI()) { // Treat FI like a register.
5554 if (!UsesLiteral) {
5555 ++ConstantBusCount;
5556 UsesLiteral = true;
5557 LiteralVal = &MO;
5558 } else if (!MO.isIdenticalTo(Other: *LiteralVal)) {
5559 assert(isVOP2(MI) || isVOP3(MI));
5560 ErrInfo = "VOP2/VOP3 instruction uses more than one literal";
5561 return false;
5562 }
5563 }
5564 }
5565 }
5566
5567 SGPRUsed = findImplicitSGPRRead(MI);
5568 if (SGPRUsed) {
5569 // Implicit uses may safely overlap true operands
5570 if (llvm::all_of(Range&: SGPRsUsed, P: [this, SGPRUsed](unsigned SGPR) {
5571 return !RI.regsOverlap(RegA: SGPRUsed, RegB: SGPR);
5572 })) {
5573 ++ConstantBusCount;
5574 SGPRsUsed.push_back(Elt: SGPRUsed);
5575 }
5576 }
5577
    // v_writelane_b32 is an exception to the constant bus restriction: vsrc0
    // can be an SGPR, a constant, or m0, and the lane select can be an SGPR,
    // m0, or an inline constant.
5580 if (ConstantBusCount > ST.getConstantBusLimit(Opcode) &&
5581 Opcode != AMDGPU::V_WRITELANE_B32) {
5582 ErrInfo = "VOP* instruction violates constant bus restriction";
5583 return false;
5584 }
5585
5586 if (isVOP3(MI) && UsesLiteral && !ST.hasVOP3Literal()) {
5587 ErrInfo = "VOP3 instruction uses literal";
5588 return false;
5589 }
5590 }
5591
  // Special case for writelane - it is exempt from the constant bus
  // verification above, but still can't use more than one unique SGPR
  // register.
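  // For example, with a constant bus limit of 1:
  //   v_writelane_b32 v0, s1, s1   ; one unique SGPR  -> accepted
  //   v_writelane_b32 v0, s1, s2   ; two unique SGPRs -> rejected below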
5594 if (Desc.getOpcode() == AMDGPU::V_WRITELANE_B32) {
5595 unsigned SGPRCount = 0;
5596 Register SGPRUsed;
5597
5598 for (int OpIdx : {Src0Idx, Src1Idx}) {
5599 if (OpIdx == -1)
5600 break;
5601
5602 const MachineOperand &MO = MI.getOperand(i: OpIdx);
5603
5604 if (usesConstantBus(MRI, MO, OpInfo: MI.getDesc().operands()[OpIdx])) {
5605 if (MO.isReg() && MO.getReg() != AMDGPU::M0) {
5606 if (MO.getReg() != SGPRUsed)
5607 ++SGPRCount;
5608 SGPRUsed = MO.getReg();
5609 }
5610 }
5611 if (SGPRCount > ST.getConstantBusLimit(Opcode)) {
5612 ErrInfo = "WRITELANE instruction violates constant bus restriction";
5613 return false;
5614 }
5615 }
5616 }
5617
5618 // Verify misc. restrictions on specific instructions.
5619 if (Desc.getOpcode() == AMDGPU::V_DIV_SCALE_F32_e64 ||
5620 Desc.getOpcode() == AMDGPU::V_DIV_SCALE_F64_e64) {
5621 const MachineOperand &Src0 = MI.getOperand(i: Src0Idx);
5622 const MachineOperand &Src1 = MI.getOperand(i: Src1Idx);
5623 const MachineOperand &Src2 = MI.getOperand(i: Src2Idx);
5624 if (Src0.isReg() && Src1.isReg() && Src2.isReg()) {
5625 if (!compareMachineOp(Op0: Src0, Op1: Src1) &&
5626 !compareMachineOp(Op0: Src0, Op1: Src2)) {
5627 ErrInfo = "v_div_scale_{f32|f64} require src0 = src1 or src2";
5628 return false;
5629 }
5630 }
5631 if ((getNamedOperand(MI, OperandName: AMDGPU::OpName::src0_modifiers)->getImm() &
5632 SISrcMods::ABS) ||
5633 (getNamedOperand(MI, OperandName: AMDGPU::OpName::src1_modifiers)->getImm() &
5634 SISrcMods::ABS) ||
5635 (getNamedOperand(MI, OperandName: AMDGPU::OpName::src2_modifiers)->getImm() &
5636 SISrcMods::ABS)) {
5637 ErrInfo = "ABS not allowed in VOP3B instructions";
5638 return false;
5639 }
5640 }
5641
5642 if (isSOP2(MI) || isSOPC(MI)) {
5643 const MachineOperand &Src0 = MI.getOperand(i: Src0Idx);
5644 const MachineOperand &Src1 = MI.getOperand(i: Src1Idx);
5645
5646 if (!isRegOrFI(MO: Src0) && !isRegOrFI(MO: Src1) &&
5647 !isInlineConstant(MO: Src0, OpInfo: Desc.operands()[Src0Idx]) &&
5648 !isInlineConstant(MO: Src1, OpInfo: Desc.operands()[Src1Idx]) &&
5649 !Src0.isIdenticalTo(Other: Src1)) {
5650 ErrInfo = "SOP2/SOPC instruction requires too many immediate constants";
5651 return false;
5652 }
5653 }
5654
5655 if (isSOPK(MI)) {
5656 const auto *Op = getNamedOperand(MI, OperandName: AMDGPU::OpName::simm16);
5657 if (Desc.isBranch()) {
5658 if (!Op->isMBB()) {
5659 ErrInfo = "invalid branch target for SOPK instruction";
5660 return false;
5661 }
5662 } else {
5663 uint64_t Imm = Op->getImm();
5664 if (sopkIsZext(Opcode)) {
5665 if (!isUInt<16>(x: Imm)) {
5666 ErrInfo = "invalid immediate for SOPK instruction";
5667 return false;
5668 }
5669 } else {
5670 if (!isInt<16>(x: Imm)) {
5671 ErrInfo = "invalid immediate for SOPK instruction";
5672 return false;
5673 }
5674 }
5675 }
5676 }
5677
5678 if (Desc.getOpcode() == AMDGPU::V_MOVRELS_B32_e32 ||
5679 Desc.getOpcode() == AMDGPU::V_MOVRELS_B32_e64 ||
5680 Desc.getOpcode() == AMDGPU::V_MOVRELD_B32_e32 ||
5681 Desc.getOpcode() == AMDGPU::V_MOVRELD_B32_e64) {
5682 const bool IsDst = Desc.getOpcode() == AMDGPU::V_MOVRELD_B32_e32 ||
5683 Desc.getOpcode() == AMDGPU::V_MOVRELD_B32_e64;
5684
5685 const unsigned StaticNumOps =
5686 Desc.getNumOperands() + Desc.implicit_uses().size();
5687 const unsigned NumImplicitOps = IsDst ? 2 : 1;
5688
5689 // Require additional implicit operands. This allows a fixup done by the
5690 // post RA scheduler where the main implicit operand is killed and
5691 // implicit-defs are added for sub-registers that remain live after this
5692 // instruction.
5693 if (MI.getNumOperands() < StaticNumOps + NumImplicitOps) {
5694 ErrInfo = "missing implicit register operands";
5695 return false;
5696 }
5697
5698 const MachineOperand *Dst = getNamedOperand(MI, OperandName: AMDGPU::OpName::vdst);
5699 if (IsDst) {
5700 if (!Dst->isUse()) {
5701 ErrInfo = "v_movreld_b32 vdst should be a use operand";
5702 return false;
5703 }
5704
5705 unsigned UseOpIdx;
5706 if (!MI.isRegTiedToUseOperand(DefOpIdx: StaticNumOps, UseOpIdx: &UseOpIdx) ||
5707 UseOpIdx != StaticNumOps + 1) {
5708 ErrInfo = "movrel implicit operands should be tied";
5709 return false;
5710 }
5711 }
5712
5713 const MachineOperand &Src0 = MI.getOperand(i: Src0Idx);
5714 const MachineOperand &ImpUse
5715 = MI.getOperand(i: StaticNumOps + NumImplicitOps - 1);
5716 if (!ImpUse.isReg() || !ImpUse.isUse() ||
5717 !isSubRegOf(TRI: RI, SuperVec: ImpUse, SubReg: IsDst ? *Dst : Src0)) {
5718 ErrInfo = "src0 should be subreg of implicit vector use";
5719 return false;
5720 }
5721 }
5722
  // Make sure we aren't losing exec uses in the .td files. This mostly
  // requires being careful when using 'let Uses' to add other use registers.
5725 if (shouldReadExec(MI)) {
5726 if (!MI.hasRegisterImplicitUseOperand(Reg: AMDGPU::EXEC)) {
5727 ErrInfo = "VALU instruction does not implicitly read exec mask";
5728 return false;
5729 }
5730 }
5731
5732 if (isSMRD(MI)) {
5733 if (MI.mayStore() &&
5734 ST.getGeneration() == AMDGPUSubtarget::VOLCANIC_ISLANDS) {
5735 // The register offset form of scalar stores may only use m0 as the
5736 // soffset register.
5737 const MachineOperand *Soff = getNamedOperand(MI, OperandName: AMDGPU::OpName::soffset);
5738 if (Soff && Soff->getReg() != AMDGPU::M0) {
5739 ErrInfo = "scalar stores must use m0 as offset register";
5740 return false;
5741 }
5742 }
5743 }
5744
5745 if (isFLAT(MI) && !ST.hasFlatInstOffsets()) {
5746 const MachineOperand *Offset = getNamedOperand(MI, OperandName: AMDGPU::OpName::offset);
5747 if (Offset->getImm() != 0) {
5748 ErrInfo = "subtarget does not support offsets in flat instructions";
5749 return false;
5750 }
5751 }
5752
5753 if (isDS(MI) && !ST.hasGDS()) {
5754 const MachineOperand *GDSOp = getNamedOperand(MI, OperandName: AMDGPU::OpName::gds);
5755 if (GDSOp && GDSOp->getImm() != 0) {
5756 ErrInfo = "GDS is not supported on this subtarget";
5757 return false;
5758 }
5759 }
5760
5761 if (isImage(MI)) {
5762 const MachineOperand *DimOp = getNamedOperand(MI, OperandName: AMDGPU::OpName::dim);
5763 if (DimOp) {
5764 int VAddr0Idx = AMDGPU::getNamedOperandIdx(Opcode,
5765 Name: AMDGPU::OpName::vaddr0);
5766 AMDGPU::OpName RSrcOpName =
5767 isMIMG(MI) ? AMDGPU::OpName::srsrc : AMDGPU::OpName::rsrc;
5768 int RsrcIdx = AMDGPU::getNamedOperandIdx(Opcode, Name: RSrcOpName);
5769 const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(Opc: Opcode);
5770 const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
5771 AMDGPU::getMIMGBaseOpcodeInfo(BaseOpcode: Info->BaseOpcode);
5772 const AMDGPU::MIMGDimInfo *Dim =
5773 AMDGPU::getMIMGDimInfoByEncoding(DimEnc: DimOp->getImm());
5774
5775 if (!Dim) {
5776 ErrInfo = "dim is out of range";
5777 return false;
5778 }
5779
5780 bool IsA16 = false;
5781 if (ST.hasR128A16()) {
5782 const MachineOperand *R128A16 = getNamedOperand(MI, OperandName: AMDGPU::OpName::r128);
5783 IsA16 = R128A16->getImm() != 0;
5784 } else if (ST.hasA16()) {
5785 const MachineOperand *A16 = getNamedOperand(MI, OperandName: AMDGPU::OpName::a16);
5786 IsA16 = A16->getImm() != 0;
5787 }
5788
5789 bool IsNSA = RsrcIdx - VAddr0Idx > 1;
5790
5791 unsigned AddrWords =
5792 AMDGPU::getAddrSizeMIMGOp(BaseOpcode, Dim, IsA16, IsG16Supported: ST.hasG16());
5793
5794 unsigned VAddrWords;
5795 if (IsNSA) {
5796 VAddrWords = RsrcIdx - VAddr0Idx;
5797 if (ST.hasPartialNSAEncoding() &&
5798 AddrWords > ST.getNSAMaxSize(HasSampler: isVSAMPLE(MI))) {
5799 unsigned LastVAddrIdx = RsrcIdx - 1;
5800 VAddrWords += getOpSize(MI, OpNo: LastVAddrIdx) / 4 - 1;
5801 }
5802 } else {
5803 VAddrWords = getOpSize(MI, OpNo: VAddr0Idx) / 4;
5804 if (AddrWords > 12)
5805 AddrWords = 16;
5806 }
5807
5808 if (VAddrWords != AddrWords) {
5809 LLVM_DEBUG(dbgs() << "bad vaddr size, expected " << AddrWords
5810 << " but got " << VAddrWords << "\n");
5811 ErrInfo = "bad vaddr size";
5812 return false;
5813 }
5814 }
5815 }
5816
5817 const MachineOperand *DppCt = getNamedOperand(MI, OperandName: AMDGPU::OpName::dpp_ctrl);
5818 if (DppCt) {
5819 using namespace AMDGPU::DPP;
5820
5821 unsigned DC = DppCt->getImm();
5822 if (DC == DppCtrl::DPP_UNUSED1 || DC == DppCtrl::DPP_UNUSED2 ||
5823 DC == DppCtrl::DPP_UNUSED3 || DC > DppCtrl::DPP_LAST ||
5824 (DC >= DppCtrl::DPP_UNUSED4_FIRST && DC <= DppCtrl::DPP_UNUSED4_LAST) ||
5825 (DC >= DppCtrl::DPP_UNUSED5_FIRST && DC <= DppCtrl::DPP_UNUSED5_LAST) ||
5826 (DC >= DppCtrl::DPP_UNUSED6_FIRST && DC <= DppCtrl::DPP_UNUSED6_LAST) ||
5827 (DC >= DppCtrl::DPP_UNUSED7_FIRST && DC <= DppCtrl::DPP_UNUSED7_LAST) ||
5828 (DC >= DppCtrl::DPP_UNUSED8_FIRST && DC <= DppCtrl::DPP_UNUSED8_LAST)) {
5829 ErrInfo = "Invalid dpp_ctrl value";
5830 return false;
5831 }
5832 if (DC >= DppCtrl::WAVE_SHL1 && DC <= DppCtrl::WAVE_ROR1 &&
5833 !ST.hasDPPWavefrontShifts()) {
5834 ErrInfo = "Invalid dpp_ctrl value: "
5835 "wavefront shifts are not supported on GFX10+";
5836 return false;
5837 }
5838 if (DC >= DppCtrl::BCAST15 && DC <= DppCtrl::BCAST31 &&
5839 !ST.hasDPPBroadcasts()) {
5840 ErrInfo = "Invalid dpp_ctrl value: "
5841 "broadcasts are not supported on GFX10+";
5842 return false;
5843 }
5844 if (DC >= DppCtrl::ROW_SHARE_FIRST && DC <= DppCtrl::ROW_XMASK_LAST &&
5845 ST.getGeneration() < AMDGPUSubtarget::GFX10) {
5846 if (DC >= DppCtrl::ROW_NEWBCAST_FIRST &&
5847 DC <= DppCtrl::ROW_NEWBCAST_LAST &&
5848 !ST.hasGFX90AInsts()) {
5849 ErrInfo = "Invalid dpp_ctrl value: "
5850 "row_newbroadcast/row_share is not supported before "
5851 "GFX90A/GFX10";
5852 return false;
5853 }
5854 if (DC > DppCtrl::ROW_NEWBCAST_LAST || !ST.hasGFX90AInsts()) {
5855 ErrInfo = "Invalid dpp_ctrl value: "
5856 "row_share and row_xmask are not supported before GFX10";
5857 return false;
5858 }
5859 }
5860
5861 if (Opcode != AMDGPU::V_MOV_B64_DPP_PSEUDO &&
5862 !AMDGPU::isLegalDPALU_DPPControl(ST, DC) &&
5863 AMDGPU::isDPALU_DPP(OpDesc: Desc, MII: *this, ST)) {
5864 ErrInfo = "Invalid dpp_ctrl value: "
5865 "DP ALU dpp only support row_newbcast";
5866 return false;
5867 }
5868 }
5869
5870 if ((MI.mayStore() || MI.mayLoad()) && !isVGPRSpill(MI)) {
5871 const MachineOperand *Dst = getNamedOperand(MI, OperandName: AMDGPU::OpName::vdst);
5872 AMDGPU::OpName DataName =
5873 isDS(Opcode) ? AMDGPU::OpName::data0 : AMDGPU::OpName::vdata;
5874 const MachineOperand *Data = getNamedOperand(MI, OperandName: DataName);
5875 const MachineOperand *Data2 = getNamedOperand(MI, OperandName: AMDGPU::OpName::data1);
5876 if (Data && !Data->isReg())
5877 Data = nullptr;
5878
5879 if (ST.hasGFX90AInsts()) {
5880 if (Dst && Data && !Dst->isTied() && !Data->isTied() &&
5881 (RI.isAGPR(MRI, Reg: Dst->getReg()) != RI.isAGPR(MRI, Reg: Data->getReg()))) {
5882 ErrInfo = "Invalid register class: "
5883 "vdata and vdst should be both VGPR or AGPR";
5884 return false;
5885 }
5886 if (Data && Data2 &&
5887 (RI.isAGPR(MRI, Reg: Data->getReg()) != RI.isAGPR(MRI, Reg: Data2->getReg()))) {
5888 ErrInfo = "Invalid register class: "
5889 "both data operands should be VGPR or AGPR";
5890 return false;
5891 }
5892 } else {
5893 if ((Dst && RI.isAGPR(MRI, Reg: Dst->getReg())) ||
5894 (Data && RI.isAGPR(MRI, Reg: Data->getReg())) ||
5895 (Data2 && RI.isAGPR(MRI, Reg: Data2->getReg()))) {
5896 ErrInfo = "Invalid register class: "
5897 "agpr loads and stores not supported on this GPU";
5898 return false;
5899 }
5900 }
5901 }
5902
5903 if (ST.needsAlignedVGPRs()) {
5904 const auto isAlignedReg = [&MI, &MRI, this](AMDGPU::OpName OpName) -> bool {
5905 const MachineOperand *Op = getNamedOperand(MI, OperandName: OpName);
5906 if (!Op)
5907 return true;
5908 Register Reg = Op->getReg();
5909 if (Reg.isPhysical())
5910 return !(RI.getHWRegIndex(Reg) & 1);
5911 const TargetRegisterClass &RC = *MRI.getRegClass(Reg);
5912 return RI.getRegSizeInBits(RC) > 32 && RI.isProperlyAlignedRC(RC) &&
5913 !(RI.getChannelFromSubReg(SubReg: Op->getSubReg()) & 1);
5914 };
5915
5916 if (Opcode == AMDGPU::DS_GWS_INIT || Opcode == AMDGPU::DS_GWS_SEMA_BR ||
5917 Opcode == AMDGPU::DS_GWS_BARRIER) {
5918
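      // For example, data0 in v2 passes this check, while data0 in v3 (an
      // odd-numbered VGPR) is rejected on subtargets that require even
      // alignment.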
5919 if (!isAlignedReg(AMDGPU::OpName::data0)) {
5920 ErrInfo = "Subtarget requires even aligned vector registers "
5921 "for DS_GWS instructions";
5922 return false;
5923 }
5924 }
5925
5926 if (isMIMG(MI)) {
5927 if (!isAlignedReg(AMDGPU::OpName::vaddr)) {
5928 ErrInfo = "Subtarget requires even aligned vector registers "
5929 "for vaddr operand of image instructions";
5930 return false;
5931 }
5932 }
5933 }
5934
5935 if (Opcode == AMDGPU::V_ACCVGPR_WRITE_B32_e64 && !ST.hasGFX90AInsts()) {
5936 const MachineOperand *Src = getNamedOperand(MI, OperandName: AMDGPU::OpName::src0);
5937 if (Src->isReg() && RI.isSGPRReg(MRI, Reg: Src->getReg())) {
5938 ErrInfo = "Invalid register class: "
5939 "v_accvgpr_write with an SGPR is not supported on this GPU";
5940 return false;
5941 }
5942 }
5943
5944 if (Desc.getOpcode() == AMDGPU::G_AMDGPU_WAVE_ADDRESS) {
5945 const MachineOperand &SrcOp = MI.getOperand(i: 1);
5946 if (!SrcOp.isReg() || SrcOp.getReg().isVirtual()) {
5947 ErrInfo = "pseudo expects only physical SGPRs";
5948 return false;
5949 }
5950 }
5951
5952 if (const MachineOperand *CPol = getNamedOperand(MI, OperandName: AMDGPU::OpName::cpol)) {
5953 if (CPol->getImm() & AMDGPU::CPol::SCAL) {
5954 if (!ST.hasScaleOffset()) {
5955 ErrInfo = "Subtarget does not support offset scaling";
5956 return false;
5957 }
5958 if (!AMDGPU::supportsScaleOffset(MII: *this, Opcode: MI.getOpcode())) {
5959 ErrInfo = "Instruction does not support offset scaling";
5960 return false;
5961 }
5962 }
5963 }
5964
5965 // See SIInstrInfo::isLegalGFX12PlusPackedMathFP32Operand for more
5966 // information.
5967 if (AMDGPU::isPackedFP32Inst(Opc: Opcode) && AMDGPU::isGFX12Plus(STI: ST)) {
5968 for (unsigned I = 0; I < 3; ++I) {
5969 if (!isLegalGFX12PlusPackedMathFP32Operand(MRI, MI, SrcN: I))
5970 return false;
5971 }
5972 }
5973
5974 if (ST.hasFlatScratchHiInB64InstHazard() && isSALU(MI) &&
5975 MI.readsRegister(Reg: AMDGPU::SRC_FLAT_SCRATCH_BASE_HI, TRI: nullptr)) {
5976 const MachineOperand *Dst = getNamedOperand(MI, OperandName: AMDGPU::OpName::sdst);
5977 if ((Dst && RI.getRegClassForReg(MRI, Reg: Dst->getReg()) ==
5978 &AMDGPU::SReg_64RegClass) ||
5979 Opcode == AMDGPU::S_BITCMP0_B64 || Opcode == AMDGPU::S_BITCMP1_B64) {
5980 ErrInfo = "Instruction cannot read flat_scratch_base_hi";
5981 return false;
5982 }
5983 }
5984
5985 return true;
5986}
5987
5988unsigned SIInstrInfo::getVALUOp(const MachineInstr &MI) const {
5989 if (MI.getOpcode() == AMDGPU::S_MOV_B32) {
5990 const MachineRegisterInfo &MRI = MI.getMF()->getRegInfo();
5991 return MI.getOperand(i: 1).isReg() || RI.isAGPR(MRI, Reg: MI.getOperand(i: 0).getReg())
5992 ? AMDGPU::COPY
5993 : AMDGPU::V_MOV_B32_e32;
5994 }
5995 return getVALUOp(Opc: MI.getOpcode());
5996}
5997
5998// It is more readable to list mapped opcodes on the same line.
5999// clang-format off
6000
6001unsigned SIInstrInfo::getVALUOp(unsigned Opc) const {
6002 switch (Opc) {
6003 default: return AMDGPU::INSTRUCTION_LIST_END;
6004 case AMDGPU::REG_SEQUENCE: return AMDGPU::REG_SEQUENCE;
6005 case AMDGPU::COPY: return AMDGPU::COPY;
6006 case AMDGPU::PHI: return AMDGPU::PHI;
6007 case AMDGPU::INSERT_SUBREG: return AMDGPU::INSERT_SUBREG;
6008 case AMDGPU::WQM: return AMDGPU::WQM;
6009 case AMDGPU::SOFT_WQM: return AMDGPU::SOFT_WQM;
6010 case AMDGPU::STRICT_WWM: return AMDGPU::STRICT_WWM;
6011 case AMDGPU::STRICT_WQM: return AMDGPU::STRICT_WQM;
6012 case AMDGPU::S_ADD_I32:
6013 return ST.hasAddNoCarryInsts() ? AMDGPU::V_ADD_U32_e64 : AMDGPU::V_ADD_CO_U32_e32;
6014 case AMDGPU::S_ADDC_U32:
6015 return AMDGPU::V_ADDC_U32_e32;
6016 case AMDGPU::S_SUB_I32:
6017 return ST.hasAddNoCarryInsts() ? AMDGPU::V_SUB_U32_e64 : AMDGPU::V_SUB_CO_U32_e32;
6018 // FIXME: These are not consistently handled, and selected when the carry is
6019 // used.
6020 case AMDGPU::S_ADD_U32:
6021 return AMDGPU::V_ADD_CO_U32_e32;
6022 case AMDGPU::S_SUB_U32:
6023 return AMDGPU::V_SUB_CO_U32_e32;
6024 case AMDGPU::S_ADD_U64_PSEUDO:
6025 return AMDGPU::V_ADD_U64_PSEUDO;
6026 case AMDGPU::S_SUB_U64_PSEUDO:
6027 return AMDGPU::V_SUB_U64_PSEUDO;
6028 case AMDGPU::S_SUBB_U32: return AMDGPU::V_SUBB_U32_e32;
6029 case AMDGPU::S_MUL_I32: return AMDGPU::V_MUL_LO_U32_e64;
6030 case AMDGPU::S_MUL_HI_U32: return AMDGPU::V_MUL_HI_U32_e64;
6031 case AMDGPU::S_MUL_HI_I32: return AMDGPU::V_MUL_HI_I32_e64;
6032 case AMDGPU::S_AND_B32: return AMDGPU::V_AND_B32_e64;
6033 case AMDGPU::S_OR_B32: return AMDGPU::V_OR_B32_e64;
6034 case AMDGPU::S_XOR_B32: return AMDGPU::V_XOR_B32_e64;
6035 case AMDGPU::S_XNOR_B32:
6036 return ST.hasDLInsts() ? AMDGPU::V_XNOR_B32_e64 : AMDGPU::INSTRUCTION_LIST_END;
6037 case AMDGPU::S_MIN_I32: return AMDGPU::V_MIN_I32_e64;
6038 case AMDGPU::S_MIN_U32: return AMDGPU::V_MIN_U32_e64;
6039 case AMDGPU::S_MAX_I32: return AMDGPU::V_MAX_I32_e64;
6040 case AMDGPU::S_MAX_U32: return AMDGPU::V_MAX_U32_e64;
6041 case AMDGPU::S_ASHR_I32: return AMDGPU::V_ASHR_I32_e32;
6042 case AMDGPU::S_ASHR_I64: return AMDGPU::V_ASHR_I64_e64;
6043 case AMDGPU::S_LSHL_B32: return AMDGPU::V_LSHL_B32_e32;
6044 case AMDGPU::S_LSHL_B64: return AMDGPU::V_LSHL_B64_e64;
6045 case AMDGPU::S_LSHR_B32: return AMDGPU::V_LSHR_B32_e32;
6046 case AMDGPU::S_LSHR_B64: return AMDGPU::V_LSHR_B64_e64;
6047 case AMDGPU::S_SEXT_I32_I8: return AMDGPU::V_BFE_I32_e64;
6048 case AMDGPU::S_SEXT_I32_I16: return AMDGPU::V_BFE_I32_e64;
6049 case AMDGPU::S_BFE_U32: return AMDGPU::V_BFE_U32_e64;
6050 case AMDGPU::S_BFE_I32: return AMDGPU::V_BFE_I32_e64;
6051 case AMDGPU::S_BFM_B32: return AMDGPU::V_BFM_B32_e64;
6052 case AMDGPU::S_BREV_B32: return AMDGPU::V_BFREV_B32_e32;
6053 case AMDGPU::S_NOT_B32: return AMDGPU::V_NOT_B32_e32;
6054 case AMDGPU::S_NOT_B64: return AMDGPU::V_NOT_B32_e32;
6055 case AMDGPU::S_CMP_EQ_I32: return AMDGPU::V_CMP_EQ_I32_e64;
6056 case AMDGPU::S_CMP_LG_I32: return AMDGPU::V_CMP_NE_I32_e64;
6057 case AMDGPU::S_CMP_GT_I32: return AMDGPU::V_CMP_GT_I32_e64;
6058 case AMDGPU::S_CMP_GE_I32: return AMDGPU::V_CMP_GE_I32_e64;
6059 case AMDGPU::S_CMP_LT_I32: return AMDGPU::V_CMP_LT_I32_e64;
6060 case AMDGPU::S_CMP_LE_I32: return AMDGPU::V_CMP_LE_I32_e64;
6061 case AMDGPU::S_CMP_EQ_U32: return AMDGPU::V_CMP_EQ_U32_e64;
6062 case AMDGPU::S_CMP_LG_U32: return AMDGPU::V_CMP_NE_U32_e64;
6063 case AMDGPU::S_CMP_GT_U32: return AMDGPU::V_CMP_GT_U32_e64;
6064 case AMDGPU::S_CMP_GE_U32: return AMDGPU::V_CMP_GE_U32_e64;
6065 case AMDGPU::S_CMP_LT_U32: return AMDGPU::V_CMP_LT_U32_e64;
6066 case AMDGPU::S_CMP_LE_U32: return AMDGPU::V_CMP_LE_U32_e64;
6067 case AMDGPU::S_CMP_EQ_U64: return AMDGPU::V_CMP_EQ_U64_e64;
6068 case AMDGPU::S_CMP_LG_U64: return AMDGPU::V_CMP_NE_U64_e64;
6069 case AMDGPU::S_BCNT1_I32_B32: return AMDGPU::V_BCNT_U32_B32_e64;
6070 case AMDGPU::S_FF1_I32_B32: return AMDGPU::V_FFBL_B32_e32;
6071 case AMDGPU::S_FLBIT_I32_B32: return AMDGPU::V_FFBH_U32_e32;
6072 case AMDGPU::S_FLBIT_I32: return AMDGPU::V_FFBH_I32_e64;
6073 case AMDGPU::S_CBRANCH_SCC0: return AMDGPU::S_CBRANCH_VCCZ;
6074 case AMDGPU::S_CBRANCH_SCC1: return AMDGPU::S_CBRANCH_VCCNZ;
6075 case AMDGPU::S_CVT_F32_I32: return AMDGPU::V_CVT_F32_I32_e64;
6076 case AMDGPU::S_CVT_F32_U32: return AMDGPU::V_CVT_F32_U32_e64;
6077 case AMDGPU::S_CVT_I32_F32: return AMDGPU::V_CVT_I32_F32_e64;
6078 case AMDGPU::S_CVT_U32_F32: return AMDGPU::V_CVT_U32_F32_e64;
6079 case AMDGPU::S_CVT_F32_F16:
6080 case AMDGPU::S_CVT_HI_F32_F16:
6081 return ST.useRealTrue16Insts() ? AMDGPU::V_CVT_F32_F16_t16_e64
6082 : AMDGPU::V_CVT_F32_F16_fake16_e64;
6083 case AMDGPU::S_CVT_F16_F32:
6084 return ST.useRealTrue16Insts() ? AMDGPU::V_CVT_F16_F32_t16_e64
6085 : AMDGPU::V_CVT_F16_F32_fake16_e64;
6086 case AMDGPU::S_CEIL_F32: return AMDGPU::V_CEIL_F32_e64;
6087 case AMDGPU::S_FLOOR_F32: return AMDGPU::V_FLOOR_F32_e64;
6088 case AMDGPU::S_TRUNC_F32: return AMDGPU::V_TRUNC_F32_e64;
6089 case AMDGPU::S_RNDNE_F32: return AMDGPU::V_RNDNE_F32_e64;
6090 case AMDGPU::S_CEIL_F16:
6091 return ST.useRealTrue16Insts() ? AMDGPU::V_CEIL_F16_t16_e64
6092 : AMDGPU::V_CEIL_F16_fake16_e64;
6093 case AMDGPU::S_FLOOR_F16:
6094 return ST.useRealTrue16Insts() ? AMDGPU::V_FLOOR_F16_t16_e64
6095 : AMDGPU::V_FLOOR_F16_fake16_e64;
6096 case AMDGPU::S_TRUNC_F16:
6097 return ST.useRealTrue16Insts() ? AMDGPU::V_TRUNC_F16_t16_e64
6098 : AMDGPU::V_TRUNC_F16_fake16_e64;
6099 case AMDGPU::S_RNDNE_F16:
6100 return ST.useRealTrue16Insts() ? AMDGPU::V_RNDNE_F16_t16_e64
6101 : AMDGPU::V_RNDNE_F16_fake16_e64;
6102 case AMDGPU::S_ADD_F32: return AMDGPU::V_ADD_F32_e64;
6103 case AMDGPU::S_SUB_F32: return AMDGPU::V_SUB_F32_e64;
6104 case AMDGPU::S_MIN_F32: return AMDGPU::V_MIN_F32_e64;
6105 case AMDGPU::S_MAX_F32: return AMDGPU::V_MAX_F32_e64;
6106 case AMDGPU::S_MINIMUM_F32: return AMDGPU::V_MINIMUM_F32_e64;
6107 case AMDGPU::S_MAXIMUM_F32: return AMDGPU::V_MAXIMUM_F32_e64;
6108 case AMDGPU::S_MUL_F32: return AMDGPU::V_MUL_F32_e64;
6109 case AMDGPU::S_ADD_F16:
6110 return ST.useRealTrue16Insts() ? AMDGPU::V_ADD_F16_t16_e64
6111 : AMDGPU::V_ADD_F16_fake16_e64;
6112 case AMDGPU::S_SUB_F16:
6113 return ST.useRealTrue16Insts() ? AMDGPU::V_SUB_F16_t16_e64
6114 : AMDGPU::V_SUB_F16_fake16_e64;
6115 case AMDGPU::S_MIN_F16:
6116 return ST.useRealTrue16Insts() ? AMDGPU::V_MIN_F16_t16_e64
6117 : AMDGPU::V_MIN_F16_fake16_e64;
6118 case AMDGPU::S_MAX_F16:
6119 return ST.useRealTrue16Insts() ? AMDGPU::V_MAX_F16_t16_e64
6120 : AMDGPU::V_MAX_F16_fake16_e64;
6121 case AMDGPU::S_MINIMUM_F16:
6122 return ST.useRealTrue16Insts() ? AMDGPU::V_MINIMUM_F16_t16_e64
6123 : AMDGPU::V_MINIMUM_F16_fake16_e64;
6124 case AMDGPU::S_MAXIMUM_F16:
6125 return ST.useRealTrue16Insts() ? AMDGPU::V_MAXIMUM_F16_t16_e64
6126 : AMDGPU::V_MAXIMUM_F16_fake16_e64;
6127 case AMDGPU::S_MUL_F16:
6128 return ST.useRealTrue16Insts() ? AMDGPU::V_MUL_F16_t16_e64
6129 : AMDGPU::V_MUL_F16_fake16_e64;
6130 case AMDGPU::S_CVT_PK_RTZ_F16_F32: return AMDGPU::V_CVT_PKRTZ_F16_F32_e64;
6131 case AMDGPU::S_FMAC_F32: return AMDGPU::V_FMAC_F32_e64;
6132 case AMDGPU::S_FMAC_F16:
6133 return ST.useRealTrue16Insts() ? AMDGPU::V_FMAC_F16_t16_e64
6134 : AMDGPU::V_FMAC_F16_fake16_e64;
6135 case AMDGPU::S_FMAMK_F32: return AMDGPU::V_FMAMK_F32;
6136 case AMDGPU::S_FMAAK_F32: return AMDGPU::V_FMAAK_F32;
6137 case AMDGPU::S_CMP_LT_F32: return AMDGPU::V_CMP_LT_F32_e64;
6138 case AMDGPU::S_CMP_EQ_F32: return AMDGPU::V_CMP_EQ_F32_e64;
6139 case AMDGPU::S_CMP_LE_F32: return AMDGPU::V_CMP_LE_F32_e64;
6140 case AMDGPU::S_CMP_GT_F32: return AMDGPU::V_CMP_GT_F32_e64;
6141 case AMDGPU::S_CMP_LG_F32: return AMDGPU::V_CMP_LG_F32_e64;
6142 case AMDGPU::S_CMP_GE_F32: return AMDGPU::V_CMP_GE_F32_e64;
6143 case AMDGPU::S_CMP_O_F32: return AMDGPU::V_CMP_O_F32_e64;
6144 case AMDGPU::S_CMP_U_F32: return AMDGPU::V_CMP_U_F32_e64;
6145 case AMDGPU::S_CMP_NGE_F32: return AMDGPU::V_CMP_NGE_F32_e64;
6146 case AMDGPU::S_CMP_NLG_F32: return AMDGPU::V_CMP_NLG_F32_e64;
6147 case AMDGPU::S_CMP_NGT_F32: return AMDGPU::V_CMP_NGT_F32_e64;
6148 case AMDGPU::S_CMP_NLE_F32: return AMDGPU::V_CMP_NLE_F32_e64;
6149 case AMDGPU::S_CMP_NEQ_F32: return AMDGPU::V_CMP_NEQ_F32_e64;
6150 case AMDGPU::S_CMP_NLT_F32: return AMDGPU::V_CMP_NLT_F32_e64;
6151 case AMDGPU::S_CMP_LT_F16:
6152 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_LT_F16_t16_e64
6153 : AMDGPU::V_CMP_LT_F16_fake16_e64;
6154 case AMDGPU::S_CMP_EQ_F16:
6155 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_EQ_F16_t16_e64
6156 : AMDGPU::V_CMP_EQ_F16_fake16_e64;
6157 case AMDGPU::S_CMP_LE_F16:
6158 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_LE_F16_t16_e64
6159 : AMDGPU::V_CMP_LE_F16_fake16_e64;
6160 case AMDGPU::S_CMP_GT_F16:
6161 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_GT_F16_t16_e64
6162 : AMDGPU::V_CMP_GT_F16_fake16_e64;
6163 case AMDGPU::S_CMP_LG_F16:
6164 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_LG_F16_t16_e64
6165 : AMDGPU::V_CMP_LG_F16_fake16_e64;
6166 case AMDGPU::S_CMP_GE_F16:
6167 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_GE_F16_t16_e64
6168 : AMDGPU::V_CMP_GE_F16_fake16_e64;
6169 case AMDGPU::S_CMP_O_F16:
6170 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_O_F16_t16_e64
6171 : AMDGPU::V_CMP_O_F16_fake16_e64;
6172 case AMDGPU::S_CMP_U_F16:
6173 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_U_F16_t16_e64
6174 : AMDGPU::V_CMP_U_F16_fake16_e64;
6175 case AMDGPU::S_CMP_NGE_F16:
6176 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_NGE_F16_t16_e64
6177 : AMDGPU::V_CMP_NGE_F16_fake16_e64;
6178 case AMDGPU::S_CMP_NLG_F16:
6179 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_NLG_F16_t16_e64
6180 : AMDGPU::V_CMP_NLG_F16_fake16_e64;
6181 case AMDGPU::S_CMP_NGT_F16:
6182 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_NGT_F16_t16_e64
6183 : AMDGPU::V_CMP_NGT_F16_fake16_e64;
6184 case AMDGPU::S_CMP_NLE_F16:
6185 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_NLE_F16_t16_e64
6186 : AMDGPU::V_CMP_NLE_F16_fake16_e64;
6187 case AMDGPU::S_CMP_NEQ_F16:
6188 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_NEQ_F16_t16_e64
6189 : AMDGPU::V_CMP_NEQ_F16_fake16_e64;
6190 case AMDGPU::S_CMP_NLT_F16:
6191 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_NLT_F16_t16_e64
6192 : AMDGPU::V_CMP_NLT_F16_fake16_e64;
6193 case AMDGPU::V_S_EXP_F32_e64: return AMDGPU::V_EXP_F32_e64;
6194 case AMDGPU::V_S_EXP_F16_e64:
6195 return ST.useRealTrue16Insts() ? AMDGPU::V_EXP_F16_t16_e64
6196 : AMDGPU::V_EXP_F16_fake16_e64;
6197 case AMDGPU::V_S_LOG_F32_e64: return AMDGPU::V_LOG_F32_e64;
6198 case AMDGPU::V_S_LOG_F16_e64:
6199 return ST.useRealTrue16Insts() ? AMDGPU::V_LOG_F16_t16_e64
6200 : AMDGPU::V_LOG_F16_fake16_e64;
6201 case AMDGPU::V_S_RCP_F32_e64: return AMDGPU::V_RCP_F32_e64;
6202 case AMDGPU::V_S_RCP_F16_e64:
6203 return ST.useRealTrue16Insts() ? AMDGPU::V_RCP_F16_t16_e64
6204 : AMDGPU::V_RCP_F16_fake16_e64;
6205 case AMDGPU::V_S_RSQ_F32_e64: return AMDGPU::V_RSQ_F32_e64;
6206 case AMDGPU::V_S_RSQ_F16_e64:
6207 return ST.useRealTrue16Insts() ? AMDGPU::V_RSQ_F16_t16_e64
6208 : AMDGPU::V_RSQ_F16_fake16_e64;
6209 case AMDGPU::V_S_SQRT_F32_e64: return AMDGPU::V_SQRT_F32_e64;
6210 case AMDGPU::V_S_SQRT_F16_e64:
6211 return ST.useRealTrue16Insts() ? AMDGPU::V_SQRT_F16_t16_e64
6212 : AMDGPU::V_SQRT_F16_fake16_e64;
6213 }
6214 llvm_unreachable(
6215 "Unexpected scalar opcode without corresponding vector one!");
6216}
6217
6218// clang-format on
6219
6220void SIInstrInfo::insertScratchExecCopy(MachineFunction &MF,
6221 MachineBasicBlock &MBB,
6222 MachineBasicBlock::iterator MBBI,
6223 const DebugLoc &DL, Register Reg,
6224 bool IsSCCLive,
6225 SlotIndexes *Indexes) const {
6226 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
6227 const SIInstrInfo *TII = ST.getInstrInfo();
6228 const AMDGPU::LaneMaskConstants &LMC = AMDGPU::LaneMaskConstants::get(ST);
6229 if (IsSCCLive) {
    // Insert two move instructions, one to save the original value of EXEC and
    // the other to turn on all bits in EXEC. This is required because the
    // single-instruction alternative, S_OR_SAVEEXEC, clobbers SCC.
6233 auto StoreExecMI = BuildMI(BB&: MBB, I: MBBI, MIMD: DL, MCID: TII->get(Opcode: LMC.MovOpc), DestReg: Reg)
6234 .addReg(RegNo: LMC.ExecReg, Flags: RegState::Kill);
6235 auto FlipExecMI =
6236 BuildMI(BB&: MBB, I: MBBI, MIMD: DL, MCID: TII->get(Opcode: LMC.MovOpc), DestReg: LMC.ExecReg).addImm(Val: -1);
6237 if (Indexes) {
6238 Indexes->insertMachineInstrInMaps(MI&: *StoreExecMI);
6239 Indexes->insertMachineInstrInMaps(MI&: *FlipExecMI);
6240 }
6241 } else {
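    // SCC is dead here, so the single S_OR_SAVEEXEC is usable: it saves the
    // original EXEC into Reg and turns on all bits in EXEC (clobbering SCC).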
6242 auto SaveExec =
6243 BuildMI(BB&: MBB, I: MBBI, MIMD: DL, MCID: TII->get(Opcode: LMC.OrSaveExecOpc), DestReg: Reg).addImm(Val: -1);
6244 SaveExec->getOperand(i: 3).setIsDead(); // Mark SCC as dead.
6245 if (Indexes)
6246 Indexes->insertMachineInstrInMaps(MI&: *SaveExec);
6247 }
6248}
6249
6250void SIInstrInfo::restoreExec(MachineFunction &MF, MachineBasicBlock &MBB,
6251 MachineBasicBlock::iterator MBBI,
6252 const DebugLoc &DL, Register Reg,
6253 SlotIndexes *Indexes) const {
6254 const AMDGPU::LaneMaskConstants &LMC = AMDGPU::LaneMaskConstants::get(ST);
6255 auto ExecRestoreMI = BuildMI(BB&: MBB, I: MBBI, MIMD: DL, MCID: get(Opcode: LMC.MovOpc), DestReg: LMC.ExecReg)
6256 .addReg(RegNo: Reg, Flags: RegState::Kill);
6257 if (Indexes)
6258 Indexes->insertMachineInstrInMaps(MI&: *ExecRestoreMI);
6259}
6260
6261MachineInstr *
6262SIInstrInfo::getWholeWaveFunctionSetup(MachineFunction &MF) const {
6263 assert(MF.getInfo<SIMachineFunctionInfo>()->isWholeWaveFunction() &&
6264 "Not a whole wave func");
6265 MachineBasicBlock &MBB = *MF.begin();
6266 for (MachineInstr &MI : MBB)
6267 if (MI.getOpcode() == AMDGPU::SI_WHOLE_WAVE_FUNC_SETUP ||
6268 MI.getOpcode() == AMDGPU::G_AMDGPU_WHOLE_WAVE_FUNC_SETUP)
6269 return &MI;
6270
  llvm_unreachable("Couldn't find SI_WHOLE_WAVE_FUNC_SETUP instruction");
6272}
6273
6274const TargetRegisterClass *SIInstrInfo::getOpRegClass(const MachineInstr &MI,
6275 unsigned OpNo) const {
6276 const MCInstrDesc &Desc = get(Opcode: MI.getOpcode());
6277 if (MI.isVariadic() || OpNo >= Desc.getNumOperands() ||
6278 Desc.operands()[OpNo].RegClass == -1) {
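    // The operand has no register class constraint in the instruction
    // description, so fall back to the class of the register itself.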
6279 Register Reg = MI.getOperand(i: OpNo).getReg();
6280
6281 if (Reg.isVirtual()) {
6282 const MachineRegisterInfo &MRI = MI.getMF()->getRegInfo();
6283 return MRI.getRegClass(Reg);
6284 }
6285 return RI.getPhysRegBaseClass(Reg);
6286 }
6287
6288 int16_t RegClass = getOpRegClassID(OpInfo: Desc.operands()[OpNo]);
6289 return RegClass < 0 ? nullptr : RI.getRegClass(i: RegClass);
6290}
6291
6292void SIInstrInfo::legalizeOpWithMove(MachineInstr &MI, unsigned OpIdx) const {
6293 MachineBasicBlock::iterator I = MI;
6294 MachineBasicBlock *MBB = MI.getParent();
6295 MachineOperand &MO = MI.getOperand(i: OpIdx);
6296 MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
6297 unsigned RCID = getOpRegClassID(OpInfo: get(Opcode: MI.getOpcode()).operands()[OpIdx]);
6298 const TargetRegisterClass *RC = RI.getRegClass(i: RCID);
6299 unsigned Size = RI.getRegSizeInBits(RC: *RC);
6300 unsigned Opcode = (Size == 64) ? AMDGPU::V_MOV_B64_PSEUDO
6301 : Size == 16 ? AMDGPU::V_MOV_B16_t16_e64
6302 : AMDGPU::V_MOV_B32_e32;
6303 if (MO.isReg())
6304 Opcode = AMDGPU::COPY;
6305 else if (RI.isSGPRClass(RC))
6306 Opcode = (Size == 64) ? AMDGPU::S_MOV_B64 : AMDGPU::S_MOV_B32;
6307
6308 const TargetRegisterClass *VRC = RI.getEquivalentVGPRClass(SRC: RC);
6309 Register Reg = MRI.createVirtualRegister(RegClass: VRC);
6310 DebugLoc DL = MBB->findDebugLoc(MBBI: I);
6311 BuildMI(BB&: *MI.getParent(), I, MIMD: DL, MCID: get(Opcode), DestReg: Reg).add(MO);
6312 MO.ChangeToRegister(Reg, isDef: false);
6313}
6314
6315unsigned SIInstrInfo::buildExtractSubReg(
6316 MachineBasicBlock::iterator MI, MachineRegisterInfo &MRI,
6317 const MachineOperand &SuperReg, const TargetRegisterClass *SuperRC,
6318 unsigned SubIdx, const TargetRegisterClass *SubRC) const {
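  // A physical register's subregister can be referenced directly; no copy is
  // needed.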
6319 if (!SuperReg.getReg().isVirtual())
6320 return RI.getSubReg(Reg: SuperReg.getReg(), Idx: SubIdx);
6321
6322 MachineBasicBlock *MBB = MI->getParent();
6323 const DebugLoc &DL = MI->getDebugLoc();
6324 Register SubReg = MRI.createVirtualRegister(RegClass: SubRC);
6325
6326 unsigned NewSubIdx = RI.composeSubRegIndices(a: SuperReg.getSubReg(), b: SubIdx);
6327 BuildMI(BB&: *MBB, I: MI, MIMD: DL, MCID: get(Opcode: TargetOpcode::COPY), DestReg: SubReg)
6328 .addReg(RegNo: SuperReg.getReg(), Flags: {}, SubReg: NewSubIdx);
6329 return SubReg;
6330}
6331
6332MachineOperand SIInstrInfo::buildExtractSubRegOrImm(
6333 MachineBasicBlock::iterator MII, MachineRegisterInfo &MRI,
6334 const MachineOperand &Op, const TargetRegisterClass *SuperRC,
6335 unsigned SubIdx, const TargetRegisterClass *SubRC) const {
6336 if (Op.isImm()) {
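    // Split a 64-bit immediate into its low and high 32-bit halves.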
6337 if (SubIdx == AMDGPU::sub0)
6338 return MachineOperand::CreateImm(Val: static_cast<int32_t>(Op.getImm()));
6339 if (SubIdx == AMDGPU::sub1)
6340 return MachineOperand::CreateImm(Val: static_cast<int32_t>(Op.getImm() >> 32));
6341
6342 llvm_unreachable("Unhandled register index for immediate");
6343 }
6344
6345 unsigned SubReg = buildExtractSubReg(MI: MII, MRI, SuperReg: Op, SuperRC,
6346 SubIdx, SubRC);
6347 return MachineOperand::CreateReg(Reg: SubReg, isDef: false);
6348}
6349
6350// Change the order of operands from (0, 1, 2) to (0, 2, 1)
6351void SIInstrInfo::swapOperands(MachineInstr &Inst) const {
6352 assert(Inst.getNumExplicitOperands() == 3);
6353 MachineOperand Op1 = Inst.getOperand(i: 1);
6354 Inst.removeOperand(OpNo: 1);
6355 Inst.addOperand(Op: Op1);
6356}
6357
6358bool SIInstrInfo::isLegalRegOperand(const MachineRegisterInfo &MRI,
6359 const MCOperandInfo &OpInfo,
6360 const MachineOperand &MO) const {
6361 if (!MO.isReg())
6362 return false;
6363
6364 Register Reg = MO.getReg();
6365
6366 const TargetRegisterClass *DRC = RI.getRegClass(i: getOpRegClassID(OpInfo));
6367 if (Reg.isPhysical())
6368 return DRC->contains(Reg);
6369
6370 const TargetRegisterClass *RC = MRI.getRegClass(Reg);
6371
6372 if (MO.getSubReg()) {
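    // For a subregister use, the value read through SubReg must fit in DRC:
    // check that some legal superclass of RC has this subregister index
    // mapping into DRC.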
6373 const MachineFunction *MF = MO.getParent()->getMF();
6374 const TargetRegisterClass *SuperRC = RI.getLargestLegalSuperClass(RC, MF: *MF);
6375 if (!SuperRC)
6376 return false;
6377 return RI.getMatchingSuperRegClass(A: SuperRC, B: DRC, Idx: MO.getSubReg()) != nullptr;
6378 }
6379
6380 return RI.getCommonSubClass(A: DRC, B: RC) != nullptr;
6381}
6382
6383bool SIInstrInfo::isLegalRegOperand(const MachineInstr &MI, unsigned OpIdx,
6384 const MachineOperand &MO) const {
6385 const MachineRegisterInfo &MRI = MI.getMF()->getRegInfo();
6386 const MCOperandInfo OpInfo = MI.getDesc().operands()[OpIdx];
6387 unsigned Opc = MI.getOpcode();
6388
6389 // See SIInstrInfo::isLegalGFX12PlusPackedMathFP32Operand for more
6390 // information.
6391 if (AMDGPU::isPackedFP32Inst(Opc: MI.getOpcode()) && AMDGPU::isGFX12Plus(STI: ST) &&
6392 MO.isReg() && RI.isSGPRReg(MRI, Reg: MO.getReg())) {
6393 constexpr AMDGPU::OpName OpNames[] = {
6394 AMDGPU::OpName::src0, AMDGPU::OpName::src1, AMDGPU::OpName::src2};
6395
6396 for (auto [I, OpName] : enumerate(First: OpNames)) {
      int SrcIdx = AMDGPU::getNamedOperandIdx(Opcode: MI.getOpcode(), Name: OpName);
6398 if (static_cast<unsigned>(SrcIdx) == OpIdx &&
6399 !isLegalGFX12PlusPackedMathFP32Operand(MRI, MI, SrcN: I, MO: &MO))
6400 return false;
6401 }
6402 }
6403
6404 if (!isLegalRegOperand(MRI, OpInfo, MO))
6405 return false;
6406
  // Check accumulator (AGPR) operands.
6408 bool IsAGPR = RI.isAGPR(MRI, Reg: MO.getReg());
6409 if (IsAGPR && !ST.hasMAIInsts())
6410 return false;
6411 if (IsAGPR && (!ST.hasGFX90AInsts() || !MRI.reservedRegsFrozen()) &&
6412 (MI.mayLoad() || MI.mayStore() || isDS(Opcode: Opc) || isMIMG(Opcode: Opc)))
6413 return false;
  // Atomics should have both vdst and vdata in either VGPRs or AGPRs.
6415 const int VDstIdx = AMDGPU::getNamedOperandIdx(Opcode: Opc, Name: AMDGPU::OpName::vdst);
6416 const int DataIdx = AMDGPU::getNamedOperandIdx(
6417 Opcode: Opc, Name: isDS(Opcode: Opc) ? AMDGPU::OpName::data0 : AMDGPU::OpName::vdata);
6418 if ((int)OpIdx == VDstIdx && DataIdx != -1 &&
6419 MI.getOperand(i: DataIdx).isReg() &&
6420 RI.isAGPR(MRI, Reg: MI.getOperand(i: DataIdx).getReg()) != IsAGPR)
6421 return false;
6422 if ((int)OpIdx == DataIdx) {
6423 if (VDstIdx != -1 &&
6424 RI.isAGPR(MRI, Reg: MI.getOperand(i: VDstIdx).getReg()) != IsAGPR)
6425 return false;
    // DS instructions with two data operands must also use the same register
    // bank for both.
6427 const int Data1Idx = AMDGPU::getNamedOperandIdx(Opcode: Opc, Name: AMDGPU::OpName::data1);
6428 if (Data1Idx != -1 && MI.getOperand(i: Data1Idx).isReg() &&
6429 RI.isAGPR(MRI, Reg: MI.getOperand(i: Data1Idx).getReg()) != IsAGPR)
6430 return false;
6431 }
6432
6433 // Check V_ACCVGPR_WRITE_B32_e64
6434 if (Opc == AMDGPU::V_ACCVGPR_WRITE_B32_e64 && !ST.hasGFX90AInsts() &&
6435 (int)OpIdx == AMDGPU::getNamedOperandIdx(Opcode: Opc, Name: AMDGPU::OpName::src0) &&
6436 RI.isSGPRReg(MRI, Reg: MO.getReg()))
6437 return false;
6438
6439 if (ST.hasFlatScratchHiInB64InstHazard() &&
6440 MO.getReg() == AMDGPU::SRC_FLAT_SCRATCH_BASE_HI && isSALU(MI)) {
6441 if (const MachineOperand *Dst = getNamedOperand(MI, OperandName: AMDGPU::OpName::sdst)) {
6442 if (AMDGPU::getRegBitWidth(RC: *RI.getRegClassForReg(MRI, Reg: Dst->getReg())) ==
6443 64)
6444 return false;
6445 }
6446 if (Opc == AMDGPU::S_BITCMP0_B64 || Opc == AMDGPU::S_BITCMP1_B64)
6447 return false;
6448 }
6449
6450 return true;
6451}
6452
6453bool SIInstrInfo::isLegalVSrcOperand(const MachineRegisterInfo &MRI,
6454 const MCOperandInfo &OpInfo,
6455 const MachineOperand &MO) const {
6456 if (MO.isReg())
6457 return isLegalRegOperand(MRI, OpInfo, MO);
6458
6459 // Handle non-register types that are treated like immediates.
6460 assert(MO.isImm() || MO.isTargetIndex() || MO.isFI() || MO.isGlobal());
6461 return true;
6462}
6463
6464bool SIInstrInfo::isLegalGFX12PlusPackedMathFP32Operand(
6465 const MachineRegisterInfo &MRI, const MachineInstr &MI, unsigned SrcN,
6466 const MachineOperand *MO) const {
6467 constexpr unsigned NumOps = 3;
6468 constexpr AMDGPU::OpName OpNames[NumOps * 2] = {
6469 AMDGPU::OpName::src0, AMDGPU::OpName::src1,
6470 AMDGPU::OpName::src2, AMDGPU::OpName::src0_modifiers,
6471 AMDGPU::OpName::src1_modifiers, AMDGPU::OpName::src2_modifiers};
6472
6473 assert(SrcN < NumOps);
6474
6475 if (!MO) {
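    // No operand was supplied; look it up by name. A missing source operand is
    // trivially legal.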
6476 int SrcIdx = AMDGPU::getNamedOperandIdx(Opcode: MI.getOpcode(), Name: OpNames[SrcN]);
6477 if (SrcIdx == -1)
6478 return true;
6479 MO = &MI.getOperand(i: SrcIdx);
6480 }
6481
6482 if (!MO->isReg() || !RI.isSGPRReg(MRI, Reg: MO->getReg()))
6483 return true;
6484
6485 int ModsIdx =
6486 AMDGPU::getNamedOperandIdx(Opcode: MI.getOpcode(), Name: OpNames[NumOps + SrcN]);
6487 if (ModsIdx == -1)
6488 return true;
6489
6490 unsigned Mods = MI.getOperand(i: ModsIdx).getImm();
6491 bool OpSel = Mods & SISrcMods::OP_SEL_0;
6492 bool OpSelHi = Mods & SISrcMods::OP_SEL_1;
6493
6494 return !OpSel && !OpSelHi;
6495}
6496
6497bool SIInstrInfo::isOperandLegal(const MachineInstr &MI, unsigned OpIdx,
6498 const MachineOperand *MO) const {
6499 const MachineFunction &MF = *MI.getMF();
6500 const MachineRegisterInfo &MRI = MF.getRegInfo();
6501 const MCInstrDesc &InstDesc = MI.getDesc();
6502 const MCOperandInfo &OpInfo = InstDesc.operands()[OpIdx];
6503 int64_t RegClass = getOpRegClassID(OpInfo);
6504 const TargetRegisterClass *DefinedRC =
6505 RegClass != -1 ? RI.getRegClass(i: RegClass) : nullptr;
6506 if (!MO)
6507 MO = &MI.getOperand(i: OpIdx);
6508
6509 const bool IsInlineConst = !MO->isReg() && isInlineConstant(MO: *MO, OpInfo);
6510
6511 if (isVALU(MI) && !IsInlineConst && usesConstantBus(MRI, MO: *MO, OpInfo)) {
6512 const MachineOperand *UsedLiteral = nullptr;
6513
6514 int ConstantBusLimit = ST.getConstantBusLimit(Opcode: MI.getOpcode());
6515 int LiteralLimit = !isVOP3(MI) || ST.hasVOP3Literal() ? 1 : 0;
6516
6517 // TODO: Be more permissive with frame indexes.
6518 if (!MO->isReg() && !isInlineConstant(MO: *MO, OpInfo)) {
6519 if (!LiteralLimit--)
6520 return false;
6521
6522 UsedLiteral = MO;
6523 }
6524
6525 SmallDenseSet<RegSubRegPair> SGPRsUsed;
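    // Seed the set with the operand being checked so a repeated read of the
    // same SGPR counts only once against the constant bus.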
6526 if (MO->isReg())
6527 SGPRsUsed.insert(V: RegSubRegPair(MO->getReg(), MO->getSubReg()));
6528
6529 for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
6530 if (i == OpIdx)
6531 continue;
6532 const MachineOperand &Op = MI.getOperand(i);
6533 if (Op.isReg()) {
6534 if (Op.isUse()) {
6535 RegSubRegPair SGPR(Op.getReg(), Op.getSubReg());
6536 if (regUsesConstantBus(RegOp: Op, MRI) && SGPRsUsed.insert(V: SGPR).second) {
6537 if (--ConstantBusLimit <= 0)
6538 return false;
6539 }
6540 }
6541 } else if (AMDGPU::isSISrcOperand(OpInfo: InstDesc.operands()[i]) &&
6542 !isInlineConstant(MO: Op, OpInfo: InstDesc.operands()[i])) {
6543 // The same literal may be used multiple times.
6544 if (!UsedLiteral)
6545 UsedLiteral = &Op;
6546 else if (UsedLiteral->isIdenticalTo(Other: Op))
6547 continue;
6548
6549 if (!LiteralLimit--)
6550 return false;
6551 if (--ConstantBusLimit <= 0)
6552 return false;
6553 }
6554 }
6555 } else if (!IsInlineConst && !MO->isReg() && isSALU(MI)) {
6556 // There can be at most one literal operand, but it can be repeated.
6557 for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
6558 if (i == OpIdx)
6559 continue;
6560 const MachineOperand &Op = MI.getOperand(i);
6561 if (!Op.isReg() && !Op.isFI() && !Op.isRegMask() &&
6562 !isInlineConstant(MO: Op, OpInfo: InstDesc.operands()[i]) &&
6563 !Op.isIdenticalTo(Other: *MO))
6564 return false;
6565
      // Do not fold a non-inlineable, non-register operand into an
      // instruction that already has a frame index. The frame index handling
      // code cannot cope with a frame index coexisting with another
      // non-register operand, unless that operand is an inlineable immediate.
6570 if (Op.isFI())
6571 return false;
6572 }
6573 } else if (IsInlineConst && ST.hasNoF16PseudoScalarTransInlineConstants() &&
6574 isF16PseudoScalarTrans(Opcode: MI.getOpcode())) {
6575 return false;
6576 }
6577
6578 if (MO->isReg()) {
6579 if (!DefinedRC)
6580 return OpInfo.OperandType == MCOI::OPERAND_UNKNOWN;
6581 return isLegalRegOperand(MI, OpIdx, MO: *MO);
6582 }
6583
6584 if (MO->isImm()) {
6585 uint64_t Imm = MO->getImm();
6586 bool Is64BitFPOp = OpInfo.OperandType == AMDGPU::OPERAND_REG_IMM_FP64;
6587 bool Is64BitOp = Is64BitFPOp ||
6588 OpInfo.OperandType == AMDGPU::OPERAND_REG_IMM_INT64 ||
6589 OpInfo.OperandType == AMDGPU::OPERAND_REG_IMM_V2INT32 ||
6590 OpInfo.OperandType == AMDGPU::OPERAND_REG_IMM_V2FP32;
6591 if (Is64BitOp &&
6592 !AMDGPU::isInlinableLiteral64(Literal: Imm, HasInv2Pi: ST.hasInv2PiInlineImm())) {
6593 if (!AMDGPU::isValid32BitLiteral(Val: Imm, IsFP64: Is64BitFPOp) &&
6594 (!ST.has64BitLiterals() || InstDesc.getSize() != 4))
6595 return false;
6596
      // FIXME: We could use sign-extended 64-bit literals, but only for signed
      // operands. At the moment we do not know if an operand is signed.
      // Such an operand will be encoded as its low 32 bits and then either
      // correctly sign-extended or incorrectly zero-extended by HW.
      // If 64-bit literals are supported and the literal will be encoded
      // as a full 64-bit value, we can still use it.
6603 if (!Is64BitFPOp && (int32_t)Imm < 0 &&
6604 (!ST.has64BitLiterals() || AMDGPU::isValid32BitLiteral(Val: Imm, IsFP64: false)))
6605 return false;
6606 }
6607 }
6608
6609 // Handle non-register types that are treated like immediates.
6610 assert(MO->isImm() || MO->isTargetIndex() || MO->isFI() || MO->isGlobal());
6611
6612 if (!DefinedRC) {
6613 // This operand expects an immediate.
6614 return true;
6615 }
6616
6617 return isImmOperandLegal(MI, OpNo: OpIdx, MO: *MO);
6618}
6619
6620bool SIInstrInfo::isNeverCoissue(MachineInstr &MI) const {
6621 bool IsGFX950Only = ST.hasGFX950Insts();
6622 bool IsGFX940Only = ST.hasGFX940Insts();
6623
6624 if (!IsGFX950Only && !IsGFX940Only)
6625 return false;
6626
6627 if (!isVALU(MI))
6628 return false;
6629
6630 // V_COS, V_EXP, V_RCP, etc.
6631 if (isTRANS(MI))
6632 return true;
6633
6634 // DOT2, DOT2C, DOT4, etc.
6635 if (isDOT(MI))
6636 return true;
6637
6638 // MFMA, SMFMA
6639 if (isMFMA(MI))
6640 return true;
6641
6642 unsigned Opcode = MI.getOpcode();
6643 switch (Opcode) {
6644 case AMDGPU::V_CVT_PK_BF8_F32_e64:
6645 case AMDGPU::V_CVT_PK_FP8_F32_e64:
6646 case AMDGPU::V_MQSAD_PK_U16_U8_e64:
6647 case AMDGPU::V_MQSAD_U32_U8_e64:
6648 case AMDGPU::V_PK_ADD_F16:
6649 case AMDGPU::V_PK_ADD_F32:
6650 case AMDGPU::V_PK_ADD_I16:
6651 case AMDGPU::V_PK_ADD_U16:
6652 case AMDGPU::V_PK_ASHRREV_I16:
6653 case AMDGPU::V_PK_FMA_F16:
6654 case AMDGPU::V_PK_FMA_F32:
6655 case AMDGPU::V_PK_FMAC_F16_e32:
6656 case AMDGPU::V_PK_FMAC_F16_e64:
6657 case AMDGPU::V_PK_LSHLREV_B16:
6658 case AMDGPU::V_PK_LSHRREV_B16:
6659 case AMDGPU::V_PK_MAD_I16:
6660 case AMDGPU::V_PK_MAD_U16:
6661 case AMDGPU::V_PK_MAX_F16:
6662 case AMDGPU::V_PK_MAX_I16:
6663 case AMDGPU::V_PK_MAX_U16:
6664 case AMDGPU::V_PK_MIN_F16:
6665 case AMDGPU::V_PK_MIN_I16:
6666 case AMDGPU::V_PK_MIN_U16:
6667 case AMDGPU::V_PK_MOV_B32:
6668 case AMDGPU::V_PK_MUL_F16:
6669 case AMDGPU::V_PK_MUL_F32:
6670 case AMDGPU::V_PK_MUL_LO_U16:
6671 case AMDGPU::V_PK_SUB_I16:
6672 case AMDGPU::V_PK_SUB_U16:
6673 case AMDGPU::V_QSAD_PK_U16_U8_e64:
6674 return true;
6675 default:
6676 return false;
6677 }
6678}
6679
6680void SIInstrInfo::legalizeOperandsVOP2(MachineRegisterInfo &MRI,
6681 MachineInstr &MI) const {
6682 unsigned Opc = MI.getOpcode();
6683 const MCInstrDesc &InstrDesc = get(Opcode: Opc);
6684
6685 int Src0Idx = AMDGPU::getNamedOperandIdx(Opcode: Opc, Name: AMDGPU::OpName::src0);
6686 MachineOperand &Src0 = MI.getOperand(i: Src0Idx);
6687
6688 int Src1Idx = AMDGPU::getNamedOperandIdx(Opcode: Opc, Name: AMDGPU::OpName::src1);
6689 MachineOperand &Src1 = MI.getOperand(i: Src1Idx);
6690
  // If there is an implicit SGPR use such as the VCC use of v_addc_u32/
  // v_subb_u32, we may only have one constant bus use before GFX10.
6693 bool HasImplicitSGPR = findImplicitSGPRRead(MI);
6694 if (HasImplicitSGPR && ST.getConstantBusLimit(Opcode: Opc) <= 1 && Src0.isReg() &&
6695 RI.isSGPRReg(MRI, Reg: Src0.getReg()))
6696 legalizeOpWithMove(MI, OpIdx: Src0Idx);
6697
6698 // Special case: V_WRITELANE_B32 accepts only immediate or SGPR operands for
6699 // both the value to write (src0) and lane select (src1). Fix up non-SGPR
6700 // src0/src1 with V_READFIRSTLANE.
6701 if (Opc == AMDGPU::V_WRITELANE_B32) {
6702 const DebugLoc &DL = MI.getDebugLoc();
6703 if (Src0.isReg() && RI.isVGPR(MRI, Reg: Src0.getReg())) {
6704 Register Reg = MRI.createVirtualRegister(RegClass: &AMDGPU::SReg_32_XM0RegClass);
6705 BuildMI(BB&: *MI.getParent(), I&: MI, MIMD: DL, MCID: get(Opcode: AMDGPU::V_READFIRSTLANE_B32), DestReg: Reg)
6706 .add(MO: Src0);
6707 Src0.ChangeToRegister(Reg, isDef: false);
6708 }
6709 if (Src1.isReg() && RI.isVGPR(MRI, Reg: Src1.getReg())) {
6710 Register Reg = MRI.createVirtualRegister(RegClass: &AMDGPU::SReg_32_XM0RegClass);
6711 const DebugLoc &DL = MI.getDebugLoc();
6712 BuildMI(BB&: *MI.getParent(), I&: MI, MIMD: DL, MCID: get(Opcode: AMDGPU::V_READFIRSTLANE_B32), DestReg: Reg)
6713 .add(MO: Src1);
6714 Src1.ChangeToRegister(Reg, isDef: false);
6715 }
6716 return;
6717 }
6718
6719 // Special case: V_FMAC_F32 and V_FMAC_F16 have src2.
6720 if (Opc == AMDGPU::V_FMAC_F32_e32 || Opc == AMDGPU::V_FMAC_F16_e32) {
6721 int Src2Idx = AMDGPU::getNamedOperandIdx(Opcode: Opc, Name: AMDGPU::OpName::src2);
6722 if (!RI.isVGPR(MRI, Reg: MI.getOperand(i: Src2Idx).getReg()))
6723 legalizeOpWithMove(MI, OpIdx: Src2Idx);
6724 }
6725
  // The src0 operand of VOP2 instructions supports all operand types, so we
  // don't need to check its legality. If src1 is already legal, we don't need
  // to do anything.
6728 if (isLegalRegOperand(MRI, OpInfo: InstrDesc.operands()[Src1Idx], MO: Src1))
6729 return;
6730
6731 // Special case: V_READLANE_B32 accepts only immediate or SGPR operands for
6732 // lane select. Fix up using V_READFIRSTLANE, since we assume that the lane
6733 // select is uniform.
6734 if (Opc == AMDGPU::V_READLANE_B32 && Src1.isReg() &&
6735 RI.isVGPR(MRI, Reg: Src1.getReg())) {
6736 Register Reg = MRI.createVirtualRegister(RegClass: &AMDGPU::SReg_32_XM0RegClass);
6737 const DebugLoc &DL = MI.getDebugLoc();
6738 BuildMI(BB&: *MI.getParent(), I&: MI, MIMD: DL, MCID: get(Opcode: AMDGPU::V_READFIRSTLANE_B32), DestReg: Reg)
6739 .add(MO: Src1);
6740 Src1.ChangeToRegister(Reg, isDef: false);
6741 return;
6742 }
6743
  // We do not use commuteInstruction here because it is too aggressive: it
  // will commute whenever possible. We only want to commute here if it
  // improves legality. This can be called a fairly large number of times, so
  // don't waste compile time pointlessly swapping and checking legality again.
6748 if (HasImplicitSGPR || !MI.isCommutable()) {
6749 legalizeOpWithMove(MI, OpIdx: Src1Idx);
6750 return;
6751 }
6752
6753 // If src0 can be used as src1, commuting will make the operands legal.
6754 // Otherwise we have to give up and insert a move.
6755 //
6756 // TODO: Other immediate-like operand kinds could be commuted if there was a
6757 // MachineOperand::ChangeTo* for them.
6758 if ((!Src1.isImm() && !Src1.isReg()) ||
6759 !isLegalRegOperand(MRI, OpInfo: InstrDesc.operands()[Src1Idx], MO: Src0)) {
6760 legalizeOpWithMove(MI, OpIdx: Src1Idx);
6761 return;
6762 }
6763
6764 int CommutedOpc = commuteOpcode(MI);
6765 if (CommutedOpc == -1) {
6766 legalizeOpWithMove(MI, OpIdx: Src1Idx);
6767 return;
6768 }
6769
6770 MI.setDesc(get(Opcode: CommutedOpc));
6771
6772 Register Src0Reg = Src0.getReg();
6773 unsigned Src0SubReg = Src0.getSubReg();
6774 bool Src0Kill = Src0.isKill();
6775
6776 if (Src1.isImm())
6777 Src0.ChangeToImmediate(ImmVal: Src1.getImm());
6778 else if (Src1.isReg()) {
6779 Src0.ChangeToRegister(Reg: Src1.getReg(), isDef: false, isImp: false, isKill: Src1.isKill());
6780 Src0.setSubReg(Src1.getSubReg());
6781 } else
6782 llvm_unreachable("Should only have register or immediate operands");
6783
6784 Src1.ChangeToRegister(Reg: Src0Reg, isDef: false, isImp: false, isKill: Src0Kill);
6785 Src1.setSubReg(Src0SubReg);
6786 fixImplicitOperands(MI);
6787}
6788
// Legalize VOP3 operands. All operand types are supported for any operand,
// but only one literal constant may be used, and only starting from GFX10.
6791void SIInstrInfo::legalizeOperandsVOP3(MachineRegisterInfo &MRI,
6792 MachineInstr &MI) const {
6793 unsigned Opc = MI.getOpcode();
6794
6795 int VOP3Idx[3] = {
6796 AMDGPU::getNamedOperandIdx(Opcode: Opc, Name: AMDGPU::OpName::src0),
6797 AMDGPU::getNamedOperandIdx(Opcode: Opc, Name: AMDGPU::OpName::src1),
6798 AMDGPU::getNamedOperandIdx(Opcode: Opc, Name: AMDGPU::OpName::src2)
6799 };
6800
6801 if (Opc == AMDGPU::V_PERMLANE16_B32_e64 ||
6802 Opc == AMDGPU::V_PERMLANEX16_B32_e64 ||
6803 Opc == AMDGPU::V_PERMLANE_BCAST_B32_e64 ||
6804 Opc == AMDGPU::V_PERMLANE_UP_B32_e64 ||
6805 Opc == AMDGPU::V_PERMLANE_DOWN_B32_e64 ||
6806 Opc == AMDGPU::V_PERMLANE_XOR_B32_e64 ||
6807 Opc == AMDGPU::V_PERMLANE_IDX_GEN_B32_e64) {
6808 // src1 and src2 must be scalar
6809 MachineOperand &Src1 = MI.getOperand(i: VOP3Idx[1]);
6810 const DebugLoc &DL = MI.getDebugLoc();
6811 if (Src1.isReg() && !RI.isSGPRClass(RC: MRI.getRegClass(Reg: Src1.getReg()))) {
6812 Register Reg = MRI.createVirtualRegister(RegClass: &AMDGPU::SReg_32_XM0RegClass);
6813 BuildMI(BB&: *MI.getParent(), I&: MI, MIMD: DL, MCID: get(Opcode: AMDGPU::V_READFIRSTLANE_B32), DestReg: Reg)
6814 .add(MO: Src1);
6815 Src1.ChangeToRegister(Reg, isDef: false);
6816 }
6817 if (VOP3Idx[2] != -1) {
6818 MachineOperand &Src2 = MI.getOperand(i: VOP3Idx[2]);
6819 if (Src2.isReg() && !RI.isSGPRClass(RC: MRI.getRegClass(Reg: Src2.getReg()))) {
6820 Register Reg = MRI.createVirtualRegister(RegClass: &AMDGPU::SReg_32_XM0RegClass);
6821 BuildMI(BB&: *MI.getParent(), I&: MI, MIMD: DL, MCID: get(Opcode: AMDGPU::V_READFIRSTLANE_B32), DestReg: Reg)
6822 .add(MO: Src2);
6823 Src2.ChangeToRegister(Reg, isDef: false);
6824 }
6825 }
6826 }
6827
6828 // Find the one SGPR operand we are allowed to use.
6829 int ConstantBusLimit = ST.getConstantBusLimit(Opcode: Opc);
6830 int LiteralLimit = ST.hasVOP3Literal() ? 1 : 0;
6831 SmallDenseSet<unsigned> SGPRsUsed;
6832 Register SGPRReg = findUsedSGPR(MI, OpIndices: VOP3Idx);
6833 if (SGPRReg) {
6834 SGPRsUsed.insert(V: SGPRReg);
6835 --ConstantBusLimit;
6836 }
6837
6838 for (int Idx : VOP3Idx) {
6839 if (Idx == -1)
6840 break;
6841 MachineOperand &MO = MI.getOperand(i: Idx);
6842
6843 if (!MO.isReg()) {
6844 if (isInlineConstant(MO, OpInfo: get(Opcode: Opc).operands()[Idx]))
6845 continue;
6846
6847 if (LiteralLimit > 0 && ConstantBusLimit > 0) {
6848 --LiteralLimit;
6849 --ConstantBusLimit;
6850 continue;
6851 }
6852
6853 --LiteralLimit;
6854 --ConstantBusLimit;
6855 legalizeOpWithMove(MI, OpIdx: Idx);
6856 continue;
6857 }
6858
6859 if (!RI.isSGPRClass(RC: RI.getRegClassForReg(MRI, Reg: MO.getReg())))
6860 continue; // VGPRs are legal
6861
6862 // We can use one SGPR in each VOP3 instruction prior to GFX10
6863 // and two starting from GFX10.
6864 if (SGPRsUsed.count(V: MO.getReg()))
6865 continue;
6866 if (ConstantBusLimit > 0) {
6867 SGPRsUsed.insert(V: MO.getReg());
6868 --ConstantBusLimit;
6869 continue;
6870 }
6871
6872 // If we make it this far, then the operand is not legal and we must
6873 // legalize it.
6874 legalizeOpWithMove(MI, OpIdx: Idx);
6875 }
6876
6877 // Special case: V_FMAC_F32 and V_FMAC_F16 have src2 tied to vdst.
6878 if ((Opc == AMDGPU::V_FMAC_F32_e64 || Opc == AMDGPU::V_FMAC_F16_e64) &&
6879 !RI.isVGPR(MRI, Reg: MI.getOperand(i: VOP3Idx[2]).getReg()))
6880 legalizeOpWithMove(MI, OpIdx: VOP3Idx[2]);
6881
6882 // Fix the register class of packed FP32 instructions on gfx12+. See
6883 // SIInstrInfo::isLegalGFX12PlusPackedMathFP32Operand for more information.
6884 if (AMDGPU::isPackedFP32Inst(Opc) && AMDGPU::isGFX12Plus(STI: ST)) {
6885 for (unsigned I = 0; I < 3; ++I) {
6886 if (!isLegalGFX12PlusPackedMathFP32Operand(MRI, MI, /*SrcN=*/I))
6887 legalizeOpWithMove(MI, OpIdx: VOP3Idx[I]);
6888 }
6889 }
6890}
6891
6892Register SIInstrInfo::readlaneVGPRToSGPR(
6893 Register SrcReg, MachineInstr &UseMI, MachineRegisterInfo &MRI,
6894 const TargetRegisterClass *DstRC /*=nullptr*/) const {
6895 const TargetRegisterClass *VRC = MRI.getRegClass(Reg: SrcReg);
6896 const TargetRegisterClass *SRC = RI.getEquivalentSGPRClass(VRC);
6897 if (DstRC)
6898 SRC = RI.getCommonSubClass(A: SRC, B: DstRC);
6899
6900 Register DstReg = MRI.createVirtualRegister(RegClass: SRC);
6901 unsigned SubRegs = RI.getRegSizeInBits(RC: *VRC) / 32;
6902
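  // The readfirstlane below operates on VGPR operands, so copy an AGPR value
  // into VGPRs first.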
6903 if (RI.hasAGPRs(RC: VRC)) {
6904 VRC = RI.getEquivalentVGPRClass(SRC: VRC);
6905 Register NewSrcReg = MRI.createVirtualRegister(RegClass: VRC);
6906 BuildMI(BB&: *UseMI.getParent(), I&: UseMI, MIMD: UseMI.getDebugLoc(),
6907 MCID: get(Opcode: TargetOpcode::COPY), DestReg: NewSrcReg)
6908 .addReg(RegNo: SrcReg);
6909 SrcReg = NewSrcReg;
6910 }
6911
6912 if (SubRegs == 1) {
6913 BuildMI(BB&: *UseMI.getParent(), I&: UseMI, MIMD: UseMI.getDebugLoc(),
6914 MCID: get(Opcode: AMDGPU::V_READFIRSTLANE_B32), DestReg: DstReg)
6915 .addReg(RegNo: SrcReg);
6916 return DstReg;
6917 }
6918
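  // Wider values are read one 32-bit channel at a time and reassembled with a
  // REG_SEQUENCE.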
6919 SmallVector<Register, 8> SRegs;
6920 for (unsigned i = 0; i < SubRegs; ++i) {
6921 Register SGPR = MRI.createVirtualRegister(RegClass: &AMDGPU::SGPR_32RegClass);
6922 BuildMI(BB&: *UseMI.getParent(), I&: UseMI, MIMD: UseMI.getDebugLoc(),
6923 MCID: get(Opcode: AMDGPU::V_READFIRSTLANE_B32), DestReg: SGPR)
6924 .addReg(RegNo: SrcReg, Flags: {}, SubReg: RI.getSubRegFromChannel(Channel: i));
6925 SRegs.push_back(Elt: SGPR);
6926 }
6927
6928 MachineInstrBuilder MIB =
6929 BuildMI(BB&: *UseMI.getParent(), I&: UseMI, MIMD: UseMI.getDebugLoc(),
6930 MCID: get(Opcode: AMDGPU::REG_SEQUENCE), DestReg: DstReg);
6931 for (unsigned i = 0; i < SubRegs; ++i) {
6932 MIB.addReg(RegNo: SRegs[i]);
6933 MIB.addImm(Val: RI.getSubRegFromChannel(Channel: i));
6934 }
6935 return DstReg;
6936}
6937
6938void SIInstrInfo::legalizeOperandsSMRD(MachineRegisterInfo &MRI,
6939 MachineInstr &MI) const {
6940
  // If the pointer is stored in VGPRs, then we need to move it to
  // SGPRs using v_readfirstlane. This is safe because we only select
  // loads with uniform pointers to SMRD instructions, so we know the
  // pointer value is uniform.
6945 MachineOperand *SBase = getNamedOperand(MI, OperandName: AMDGPU::OpName::sbase);
6946 if (SBase && !RI.isSGPRClass(RC: MRI.getRegClass(Reg: SBase->getReg()))) {
6947 Register SGPR = readlaneVGPRToSGPR(SrcReg: SBase->getReg(), UseMI&: MI, MRI);
6948 SBase->setReg(SGPR);
6949 }
6950 MachineOperand *SOff = getNamedOperand(MI, OperandName: AMDGPU::OpName::soffset);
6951 if (SOff && !RI.isSGPRReg(MRI, Reg: SOff->getReg())) {
6952 Register SGPR = readlaneVGPRToSGPR(SrcReg: SOff->getReg(), UseMI&: MI, MRI);
6953 SOff->setReg(SGPR);
6954 }
6955}
6956
6957bool SIInstrInfo::moveFlatAddrToVGPR(MachineInstr &Inst) const {
6958 unsigned Opc = Inst.getOpcode();
6959 int OldSAddrIdx = AMDGPU::getNamedOperandIdx(Opcode: Opc, Name: AMDGPU::OpName::saddr);
6960 if (OldSAddrIdx < 0)
6961 return false;
6962
6963 assert(isSegmentSpecificFLAT(Inst) || (isFLAT(Inst) && ST.hasFlatGVSMode()));
6964
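  // Prefer the global form taking a VGPR address; failing that, try the
  // scratch form that takes a VGPR address (SV).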
6965 int NewOpc = AMDGPU::getGlobalVaddrOp(Opcode: Opc);
6966 if (NewOpc < 0)
6967 NewOpc = AMDGPU::getFlatScratchInstSVfromSS(Opcode: Opc);
6968 if (NewOpc < 0)
6969 return false;
6970
6971 MachineRegisterInfo &MRI = Inst.getMF()->getRegInfo();
6972 MachineOperand &SAddr = Inst.getOperand(i: OldSAddrIdx);
6973 if (RI.isSGPRReg(MRI, Reg: SAddr.getReg()))
6974 return false;
6975
6976 int NewVAddrIdx = AMDGPU::getNamedOperandIdx(Opcode: NewOpc, Name: AMDGPU::OpName::vaddr);
6977 if (NewVAddrIdx < 0)
6978 return false;
6979
6980 int OldVAddrIdx = AMDGPU::getNamedOperandIdx(Opcode: Opc, Name: AMDGPU::OpName::vaddr);
6981
  // Check vaddr; it must be zero or absent.
6983 MachineInstr *VAddrDef = nullptr;
6984 if (OldVAddrIdx >= 0) {
6985 MachineOperand &VAddr = Inst.getOperand(i: OldVAddrIdx);
6986 VAddrDef = MRI.getUniqueVRegDef(Reg: VAddr.getReg());
6987 if (!VAddrDef || !VAddrDef->isMoveImmediate() ||
6988 !VAddrDef->getOperand(i: 1).isImm() ||
6989 VAddrDef->getOperand(i: 1).getImm() != 0)
6990 return false;
6991 }
6992
6993 const MCInstrDesc &NewDesc = get(Opcode: NewOpc);
6994 Inst.setDesc(NewDesc);
6995
6996 // Callers expect iterator to be valid after this call, so modify the
6997 // instruction in place.
6998 if (OldVAddrIdx == NewVAddrIdx) {
6999 MachineOperand &NewVAddr = Inst.getOperand(i: NewVAddrIdx);
7000 // Clear use list from the old vaddr holding a zero register.
7001 MRI.removeRegOperandFromUseList(MO: &NewVAddr);
7002 MRI.moveOperands(Dst: &NewVAddr, Src: &SAddr, NumOps: 1);
7003 Inst.removeOperand(OpNo: OldSAddrIdx);
    // Update the use list with the pointer we have just moved from vaddr to
    // saddr position. Otherwise the new vaddr will be missing from the use
    // list.
7006 MRI.removeRegOperandFromUseList(MO: &NewVAddr);
7007 MRI.addRegOperandToUseList(MO: &NewVAddr);
7008 } else {
7009 assert(OldSAddrIdx == NewVAddrIdx);
7010
7011 if (OldVAddrIdx >= 0) {
7012 int NewVDstIn = AMDGPU::getNamedOperandIdx(Opcode: NewOpc,
7013 Name: AMDGPU::OpName::vdst_in);
7014
      // removeOperand doesn't try to fixup tied operand indexes as it goes, so
      // it asserts. Untie the operands for now and retie them afterwards.
7017 if (NewVDstIn != -1) {
7018 int OldVDstIn = AMDGPU::getNamedOperandIdx(Opcode: Opc, Name: AMDGPU::OpName::vdst_in);
7019 Inst.untieRegOperand(OpIdx: OldVDstIn);
7020 }
7021
7022 Inst.removeOperand(OpNo: OldVAddrIdx);
7023
7024 if (NewVDstIn != -1) {
7025 int NewVDst = AMDGPU::getNamedOperandIdx(Opcode: NewOpc, Name: AMDGPU::OpName::vdst);
7026 Inst.tieOperands(DefIdx: NewVDst, UseIdx: NewVDstIn);
7027 }
7028 }
7029 }
7030
7031 if (VAddrDef && MRI.use_nodbg_empty(RegNo: VAddrDef->getOperand(i: 0).getReg()))
7032 VAddrDef->eraseFromParent();
7033
7034 return true;
7035}
7036
7037// FIXME: Remove this when SelectionDAG is obsoleted.
7038void SIInstrInfo::legalizeOperandsFLAT(MachineRegisterInfo &MRI,
7039 MachineInstr &MI) const {
7040 if (!isSegmentSpecificFLAT(MI) && !ST.hasFlatGVSMode())
7041 return;
7042
  // Fix up SGPR operands that ended up in VGPRs. We only select these when the
  // DAG divergence analysis thinks they are uniform, so a readfirstlane should
  // be valid.
7045 MachineOperand *SAddr = getNamedOperand(MI, OperandName: AMDGPU::OpName::saddr);
7046 if (!SAddr || RI.isSGPRClass(RC: MRI.getRegClass(Reg: SAddr->getReg())))
7047 return;
7048
7049 if (moveFlatAddrToVGPR(Inst&: MI))
7050 return;
7051
7052 const TargetRegisterClass *DeclaredRC =
7053 getRegClass(MCID: MI.getDesc(), OpNum: SAddr->getOperandNo());
7054
7055 Register ToSGPR = readlaneVGPRToSGPR(SrcReg: SAddr->getReg(), UseMI&: MI, MRI, DstRC: DeclaredRC);
7056 SAddr->setReg(ToSGPR);
7057}
7058
7059void SIInstrInfo::legalizeGenericOperand(MachineBasicBlock &InsertMBB,
7060 MachineBasicBlock::iterator I,
7061 const TargetRegisterClass *DstRC,
7062 MachineOperand &Op,
7063 MachineRegisterInfo &MRI,
7064 const DebugLoc &DL) const {
7065 Register OpReg = Op.getReg();
7066 unsigned OpSubReg = Op.getSubReg();
7067
7068 const TargetRegisterClass *OpRC = RI.getSubClassWithSubReg(
7069 RI.getRegClassForReg(MRI, Reg: OpReg), OpSubReg);
7070
7071 // Check if operand is already the correct register class.
7072 if (DstRC == OpRC)
7073 return;
7074
7075 Register DstReg = MRI.createVirtualRegister(RegClass: DstRC);
7076 auto Copy =
7077 BuildMI(BB&: InsertMBB, I, MIMD: DL, MCID: get(Opcode: AMDGPU::COPY), DestReg: DstReg).addReg(RegNo: OpReg);
7078 Op.setReg(DstReg);
7079
7080 MachineInstr *Def = MRI.getVRegDef(Reg: OpReg);
7081 if (!Def)
7082 return;
7083
7084 // Try to eliminate the copy if it is copying an immediate value.
7085 if (Def->isMoveImmediate() && DstRC != &AMDGPU::VReg_1RegClass)
7086 foldImmediate(UseMI&: *Copy, DefMI&: *Def, Reg: OpReg, MRI: &MRI);
7087
7088 bool ImpDef = Def->isImplicitDef();
7089 while (!ImpDef && Def && Def->isCopy()) {
7090 if (Def->getOperand(i: 1).getReg().isPhysical())
7091 break;
7092 Def = MRI.getUniqueVRegDef(Reg: Def->getOperand(i: 1).getReg());
7093 ImpDef = Def && Def->isImplicitDef();
7094 }
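  // A VALU copy implicitly depends on EXEC, so record an implicit EXEC read on
  // the copy unless its source is ultimately an IMPLICIT_DEF.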
7095 if (!RI.isSGPRClass(RC: DstRC) && !Copy->readsRegister(Reg: AMDGPU::EXEC, TRI: &RI) &&
7096 !ImpDef)
7097 Copy.addReg(RegNo: AMDGPU::EXEC, Flags: RegState::Implicit);
7098}
7099
// Emit the actual waterfall loop, executing the wrapped instruction for each
// unique value of \p ScalarOps across all lanes. In the best case we execute
// one iteration, in the worst case we execute one per lane (64 for wave64).
7103static void
7104emitLoadScalarOpsFromVGPRLoop(const SIInstrInfo &TII,
7105 MachineRegisterInfo &MRI,
7106 MachineBasicBlock &LoopBB,
7107 MachineBasicBlock &BodyBB,
7108 const DebugLoc &DL,
7109 ArrayRef<MachineOperand *> ScalarOps) {
7110 MachineFunction &MF = *LoopBB.getParent();
7111 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
7112 const SIRegisterInfo *TRI = ST.getRegisterInfo();
7113 const AMDGPU::LaneMaskConstants &LMC = AMDGPU::LaneMaskConstants::get(ST);
7114 const auto *BoolXExecRC = TRI->getWaveMaskRegClass();
7115
7116 MachineBasicBlock::iterator I = LoopBB.begin();
7117 Register CondReg;
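  // CondReg accumulates, across all scalar operands, the mask of lanes whose
  // values match the ones read from the first active lane.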
7118
7119 for (MachineOperand *ScalarOp : ScalarOps) {
7120 unsigned RegSize = TRI->getRegSizeInBits(Reg: ScalarOp->getReg(), MRI);
7121 unsigned NumSubRegs = RegSize / 32;
7122 Register VScalarOp = ScalarOp->getReg();
7123
7124 if (NumSubRegs == 1) {
7125 Register CurReg = MRI.createVirtualRegister(RegClass: &AMDGPU::SReg_32_XM0RegClass);
7126
7127 BuildMI(BB&: LoopBB, I, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::V_READFIRSTLANE_B32), DestReg: CurReg)
7128 .addReg(RegNo: VScalarOp);
7129
7130 Register NewCondReg = MRI.createVirtualRegister(RegClass: BoolXExecRC);
7131
7132 BuildMI(BB&: LoopBB, I, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::V_CMP_EQ_U32_e64), DestReg: NewCondReg)
7133 .addReg(RegNo: CurReg)
7134 .addReg(RegNo: VScalarOp);
7135
7136 // Combine the comparison results with AND.
7137 if (!CondReg) // First.
7138 CondReg = NewCondReg;
7139 else { // If not the first, we create an AND.
7140 Register AndReg = MRI.createVirtualRegister(RegClass: BoolXExecRC);
7141 BuildMI(BB&: LoopBB, I, MIMD: DL, MCID: TII.get(Opcode: LMC.AndOpc), DestReg: AndReg)
7142 .addReg(RegNo: CondReg)
7143 .addReg(RegNo: NewCondReg);
7144 CondReg = AndReg;
7145 }
7146
7147 // Update ScalarOp operand to use the SGPR ScalarOp.
7148 ScalarOp->setReg(CurReg);
7149 ScalarOp->setIsKill();
7150 } else {
7151 SmallVector<Register, 8> ReadlanePieces;
7152 RegState VScalarOpUndef = getUndefRegState(B: ScalarOp->isUndef());
7153 assert(NumSubRegs % 2 == 0 && NumSubRegs <= 32 &&
7154 "Unhandled register size");
7155
7156 for (unsigned Idx = 0; Idx < NumSubRegs; Idx += 2) {
7157 Register CurRegLo =
7158 MRI.createVirtualRegister(RegClass: &AMDGPU::SReg_32_XM0RegClass);
7159 Register CurRegHi =
7160 MRI.createVirtualRegister(RegClass: &AMDGPU::SReg_32_XM0RegClass);
7161
7162 // Read the next variant <- also loop target.
7163 BuildMI(BB&: LoopBB, I, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::V_READFIRSTLANE_B32), DestReg: CurRegLo)
7164 .addReg(RegNo: VScalarOp, Flags: VScalarOpUndef, SubReg: TRI->getSubRegFromChannel(Channel: Idx));
7165
        // Read the high 32 bits of the next variant.
7167 BuildMI(BB&: LoopBB, I, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::V_READFIRSTLANE_B32), DestReg: CurRegHi)
7168 .addReg(RegNo: VScalarOp, Flags: VScalarOpUndef,
7169 SubReg: TRI->getSubRegFromChannel(Channel: Idx + 1));
7170
7171 ReadlanePieces.push_back(Elt: CurRegLo);
7172 ReadlanePieces.push_back(Elt: CurRegHi);
7173
7174 // Comparison is to be done as 64-bit.
7175 Register CurReg = MRI.createVirtualRegister(RegClass: &AMDGPU::SGPR_64RegClass);
7176 BuildMI(BB&: LoopBB, I, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::REG_SEQUENCE), DestReg: CurReg)
7177 .addReg(RegNo: CurRegLo)
7178 .addImm(Val: AMDGPU::sub0)
7179 .addReg(RegNo: CurRegHi)
7180 .addImm(Val: AMDGPU::sub1);
7181
7182 Register NewCondReg = MRI.createVirtualRegister(RegClass: BoolXExecRC);
7183 auto Cmp = BuildMI(BB&: LoopBB, I, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::V_CMP_EQ_U64_e64),
7184 DestReg: NewCondReg)
7185 .addReg(RegNo: CurReg);
7186 if (NumSubRegs <= 2)
7187 Cmp.addReg(RegNo: VScalarOp);
7188 else
7189 Cmp.addReg(RegNo: VScalarOp, Flags: VScalarOpUndef,
7190 SubReg: TRI->getSubRegFromChannel(Channel: Idx, NumRegs: 2));
7191
7192 // Combine the comparison results with AND.
7193 if (!CondReg) // First.
7194 CondReg = NewCondReg;
7195 else { // If not the first, we create an AND.
7196 Register AndReg = MRI.createVirtualRegister(RegClass: BoolXExecRC);
7197 BuildMI(BB&: LoopBB, I, MIMD: DL, MCID: TII.get(Opcode: LMC.AndOpc), DestReg: AndReg)
7198 .addReg(RegNo: CondReg)
7199 .addReg(RegNo: NewCondReg);
7200 CondReg = AndReg;
7201 }
7202 } // End for loop.
7203
7204 const auto *SScalarOpRC =
7205 TRI->getEquivalentSGPRClass(VRC: MRI.getRegClass(Reg: VScalarOp));
7206 Register SScalarOp = MRI.createVirtualRegister(RegClass: SScalarOpRC);
7207
7208 // Build scalar ScalarOp.
7209 auto Merge =
7210 BuildMI(BB&: LoopBB, I, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::REG_SEQUENCE), DestReg: SScalarOp);
7211 unsigned Channel = 0;
7212 for (Register Piece : ReadlanePieces) {
7213 Merge.addReg(RegNo: Piece).addImm(Val: TRI->getSubRegFromChannel(Channel: Channel++));
7214 }
7215
7216 // Update ScalarOp operand to use the SGPR ScalarOp.
7217 ScalarOp->setReg(SScalarOp);
7218 ScalarOp->setIsKill();
7219 }
7220 }
7221
7222 Register SaveExec = MRI.createVirtualRegister(RegClass: BoolXExecRC);
7223 MRI.setSimpleHint(VReg: SaveExec, PrefReg: CondReg);
7224
7225 // Update EXEC to matching lanes, saving original to SaveExec.
7226 BuildMI(BB&: LoopBB, I, MIMD: DL, MCID: TII.get(Opcode: LMC.AndSaveExecOpc), DestReg: SaveExec)
7227 .addReg(RegNo: CondReg, Flags: RegState::Kill);
7228
7229 // The original instruction is here; we insert the terminators after it.
7230 I = BodyBB.end();
7231
7232 // Update EXEC, switch all done bits to 0 and all todo bits to 1.
7233 BuildMI(BB&: BodyBB, I, MIMD: DL, MCID: TII.get(Opcode: LMC.XorTermOpc), DestReg: LMC.ExecReg)
7234 .addReg(RegNo: LMC.ExecReg)
7235 .addReg(RegNo: SaveExec);
7236
7237 BuildMI(BB&: BodyBB, I, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::SI_WATERFALL_LOOP)).addMBB(MBB: &LoopBB);
7238}
7239
// Build a waterfall loop around \p MI, replacing the VGPR \p ScalarOps
// registers with SGPRs by iterating over all unique values across all lanes.
// Returns the loop body block that now contains \p MI.
7243static MachineBasicBlock *
7244loadScalarOperandsFromVGPR(const SIInstrInfo &TII, MachineInstr &MI,
7245 ArrayRef<MachineOperand *> ScalarOps,
7246 MachineDominatorTree *MDT,
7247 MachineBasicBlock::iterator Begin = nullptr,
7248 MachineBasicBlock::iterator End = nullptr) {
7249 MachineBasicBlock &MBB = *MI.getParent();
7250 MachineFunction &MF = *MBB.getParent();
7251 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
7252 const SIRegisterInfo *TRI = ST.getRegisterInfo();
7253 MachineRegisterInfo &MRI = MF.getRegInfo();
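  // If no explicit range was given, wrap only MI itself in the loop.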
7254 if (!Begin.isValid())
7255 Begin = &MI;
7256 if (!End.isValid()) {
7257 End = &MI;
7258 ++End;
7259 }
7260 const DebugLoc &DL = MI.getDebugLoc();
7261 const AMDGPU::LaneMaskConstants &LMC = AMDGPU::LaneMaskConstants::get(ST);
7262 const auto *BoolXExecRC = TRI->getWaveMaskRegClass();
7263
7264 // Save SCC. Waterfall Loop may overwrite SCC.
7265 Register SaveSCCReg;
7266
  // FIXME: We should maintain SCC liveness while doing the FixSGPRCopies walk
  // rather than doing an unbounded scan here.
7269 bool SCCNotDead =
7270 MBB.computeRegisterLiveness(TRI, Reg: AMDGPU::SCC, Before: MI,
7271 Neighborhood: std::numeric_limits<unsigned>::max()) !=
7272 MachineBasicBlock::LQR_Dead;
7273 if (SCCNotDead) {
7274 SaveSCCReg = MRI.createVirtualRegister(RegClass: &AMDGPU::SReg_32RegClass);
7275 BuildMI(BB&: MBB, I: Begin, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::S_CSELECT_B32), DestReg: SaveSCCReg)
7276 .addImm(Val: 1)
7277 .addImm(Val: 0);
7278 }
7279
7280 Register SaveExec = MRI.createVirtualRegister(RegClass: BoolXExecRC);
7281
7282 // Save the EXEC mask
7283 BuildMI(BB&: MBB, I: Begin, MIMD: DL, MCID: TII.get(Opcode: LMC.MovOpc), DestReg: SaveExec).addReg(RegNo: LMC.ExecReg);
7284
7285 // Killed uses in the instruction we are waterfalling around will be
7286 // incorrect due to the added control-flow.
7287 MachineBasicBlock::iterator AfterMI = MI;
7288 ++AfterMI;
7289 for (auto I = Begin; I != AfterMI; I++) {
7290 for (auto &MO : I->all_uses())
7291 MRI.clearKillFlags(Reg: MO.getReg());
7292 }
7293
7294 // To insert the loop we need to split the block. Move everything after this
7295 // point to a new block, and insert a new empty block between the two.
7296 MachineBasicBlock *LoopBB = MF.CreateMachineBasicBlock();
7297 MachineBasicBlock *BodyBB = MF.CreateMachineBasicBlock();
7298 MachineBasicBlock *RemainderBB = MF.CreateMachineBasicBlock();
7299 MachineFunction::iterator MBBI(MBB);
7300 ++MBBI;
7301
7302 MF.insert(MBBI, MBB: LoopBB);
7303 MF.insert(MBBI, MBB: BodyBB);
7304 MF.insert(MBBI, MBB: RemainderBB);
7305
7306 LoopBB->addSuccessor(Succ: BodyBB);
7307 BodyBB->addSuccessor(Succ: LoopBB);
7308 BodyBB->addSuccessor(Succ: RemainderBB);
7309
7310 // Move Begin to MI to the BodyBB, and the remainder of the block to
7311 // RemainderBB.
7312 RemainderBB->transferSuccessorsAndUpdatePHIs(FromMBB: &MBB);
7313 RemainderBB->splice(Where: RemainderBB->begin(), Other: &MBB, From: End, To: MBB.end());
7314 BodyBB->splice(Where: BodyBB->begin(), Other: &MBB, From: Begin, To: MBB.end());
7315
7316 MBB.addSuccessor(Succ: LoopBB);
7317
7318 // Update dominators. We know that MBB immediately dominates LoopBB, that
7319 // LoopBB immediately dominates BodyBB, and BodyBB immediately dominates
7320 // RemainderBB. RemainderBB immediately dominates all of the successors
7321 // transferred to it from MBB that MBB used to properly dominate.
7322 if (MDT) {
7323 MDT->addNewBlock(BB: LoopBB, DomBB: &MBB);
7324 MDT->addNewBlock(BB: BodyBB, DomBB: LoopBB);
7325 MDT->addNewBlock(BB: RemainderBB, DomBB: BodyBB);
7326 for (auto &Succ : RemainderBB->successors()) {
7327 if (MDT->properlyDominates(A: &MBB, B: Succ)) {
7328 MDT->changeImmediateDominator(BB: Succ, NewBB: RemainderBB);
7329 }
7330 }
7331 }
7332
7333 emitLoadScalarOpsFromVGPRLoop(TII, MRI, LoopBB&: *LoopBB, BodyBB&: *BodyBB, DL, ScalarOps);
7334
7335 MachineBasicBlock::iterator First = RemainderBB->begin();
7336 // Restore SCC
7337 if (SCCNotDead) {
7338 BuildMI(BB&: *RemainderBB, I: First, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::S_CMP_LG_U32))
7339 .addReg(RegNo: SaveSCCReg, Flags: RegState::Kill)
7340 .addImm(Val: 0);
7341 }
7342
7343 // Restore the EXEC mask
7344 BuildMI(BB&: *RemainderBB, I: First, MIMD: DL, MCID: TII.get(Opcode: LMC.MovOpc), DestReg: LMC.ExecReg)
7345 .addReg(RegNo: SaveExec);
7346 return BodyBB;
7347}
7348
// Extract the 64-bit pointer from \p Rsrc and return it along with a
// replacement descriptor that has a null base and the default data format.
7350static std::tuple<unsigned, unsigned>
7351extractRsrcPtr(const SIInstrInfo &TII, MachineInstr &MI, MachineOperand &Rsrc) {
7352 MachineBasicBlock &MBB = *MI.getParent();
7353 MachineFunction &MF = *MBB.getParent();
7354 MachineRegisterInfo &MRI = MF.getRegInfo();
7355
7356 // Extract the ptr from the resource descriptor.
7357 unsigned RsrcPtr =
7358 TII.buildExtractSubReg(MI, MRI, SuperReg: Rsrc, SuperRC: &AMDGPU::VReg_128RegClass,
7359 SubIdx: AMDGPU::sub0_sub1, SubRC: &AMDGPU::VReg_64RegClass);
7360
7361 // Create an empty resource descriptor
7362 Register Zero64 = MRI.createVirtualRegister(RegClass: &AMDGPU::SReg_64RegClass);
7363 Register SRsrcFormatLo = MRI.createVirtualRegister(RegClass: &AMDGPU::SGPR_32RegClass);
7364 Register SRsrcFormatHi = MRI.createVirtualRegister(RegClass: &AMDGPU::SGPR_32RegClass);
7365 Register NewSRsrc = MRI.createVirtualRegister(RegClass: &AMDGPU::SGPR_128RegClass);
7366 uint64_t RsrcDataFormat = TII.getDefaultRsrcDataFormat();
7367
7368 // Zero64 = 0
7369 BuildMI(BB&: MBB, I&: MI, MIMD: MI.getDebugLoc(), MCID: TII.get(Opcode: AMDGPU::S_MOV_B64), DestReg: Zero64)
7370 .addImm(Val: 0);
7371
7372 // SRsrcFormatLo = RSRC_DATA_FORMAT{31-0}
7373 BuildMI(BB&: MBB, I&: MI, MIMD: MI.getDebugLoc(), MCID: TII.get(Opcode: AMDGPU::S_MOV_B32), DestReg: SRsrcFormatLo)
7374 .addImm(Val: Lo_32(Value: RsrcDataFormat));
7375
7376 // SRsrcFormatHi = RSRC_DATA_FORMAT{63-32}
7377 BuildMI(BB&: MBB, I&: MI, MIMD: MI.getDebugLoc(), MCID: TII.get(Opcode: AMDGPU::S_MOV_B32), DestReg: SRsrcFormatHi)
7378 .addImm(Val: Hi_32(Value: RsrcDataFormat));
7379
7380 // NewSRsrc = {Zero64, SRsrcFormat}
7381 BuildMI(BB&: MBB, I&: MI, MIMD: MI.getDebugLoc(), MCID: TII.get(Opcode: AMDGPU::REG_SEQUENCE), DestReg: NewSRsrc)
7382 .addReg(RegNo: Zero64)
7383 .addImm(Val: AMDGPU::sub0_sub1)
7384 .addReg(RegNo: SRsrcFormatLo)
7385 .addImm(Val: AMDGPU::sub2)
7386 .addReg(RegNo: SRsrcFormatHi)
7387 .addImm(Val: AMDGPU::sub3);
7388
7389 return std::tuple(RsrcPtr, NewSRsrc);
7390}
7391
7392MachineBasicBlock *
7393SIInstrInfo::legalizeOperands(MachineInstr &MI,
7394 MachineDominatorTree *MDT) const {
7395 MachineFunction &MF = *MI.getMF();
7396 MachineRegisterInfo &MRI = MF.getRegInfo();
7397 MachineBasicBlock *CreatedBB = nullptr;
7398
7399 // Legalize VOP2
7400 if (isVOP2(MI) || isVOPC(MI)) {
7401 legalizeOperandsVOP2(MRI, MI);
7402 return CreatedBB;
7403 }
7404
7405 // Legalize VOP3
7406 if (isVOP3(MI)) {
7407 legalizeOperandsVOP3(MRI, MI);
7408 return CreatedBB;
7409 }
7410
7411 // Legalize SMRD
7412 if (isSMRD(MI)) {
7413 legalizeOperandsSMRD(MRI, MI);
7414 return CreatedBB;
7415 }
7416
7417 // Legalize FLAT
7418 if (isFLAT(MI)) {
7419 legalizeOperandsFLAT(MRI, MI);
7420 return CreatedBB;
7421 }
7422
7423 // Legalize PHI
7424 // The register class of the operands must be the same type as the register
7425 // class of the output.
7426 if (MI.getOpcode() == AMDGPU::PHI) {
7427 const TargetRegisterClass *VRC = getOpRegClass(MI, OpNo: 0);
7428 assert(!RI.isSGPRClass(VRC));
7429
7430 // Update all the operands so they have the same type.
7431 for (unsigned I = 1, E = MI.getNumOperands(); I != E; I += 2) {
7432 MachineOperand &Op = MI.getOperand(i: I);
7433 if (!Op.isReg() || !Op.getReg().isVirtual())
7434 continue;
7435
7436 // MI is a PHI instruction.
7437 MachineBasicBlock *InsertBB = MI.getOperand(i: I + 1).getMBB();
7438 MachineBasicBlock::iterator Insert = InsertBB->getFirstTerminator();
7439
7440 // Avoid creating no-op copies with the same src and dst reg class. These
7441 // confuse some of the machine passes.
7442 legalizeGenericOperand(InsertMBB&: *InsertBB, I: Insert, DstRC: VRC, Op, MRI, DL: MI.getDebugLoc());
7443 }
7444 }
7445
7446 // REG_SEQUENCE doesn't really require operand legalization, but if one has a
7447 // VGPR dest type and SGPR sources, insert copies so all operands are
7448 // VGPRs. This seems to help operand folding / the register coalescer.
7449 if (MI.getOpcode() == AMDGPU::REG_SEQUENCE) {
7450 MachineBasicBlock *MBB = MI.getParent();
7451 const TargetRegisterClass *DstRC = getOpRegClass(MI, OpNo: 0);
7452 if (RI.hasVGPRs(RC: DstRC)) {
7453 // Update all the operands so they have VGPR register classes. These may
7454 // not be the same register class because REG_SEQUENCE supports mixing
7455 // subregister index types, e.g. sub0_sub1 + sub2 + sub3.
7456 for (unsigned I = 1, E = MI.getNumOperands(); I != E; I += 2) {
7457 MachineOperand &Op = MI.getOperand(i: I);
7458 if (!Op.isReg() || !Op.getReg().isVirtual())
7459 continue;
7460
7461 const TargetRegisterClass *OpRC = MRI.getRegClass(Reg: Op.getReg());
7462 const TargetRegisterClass *VRC = RI.getEquivalentVGPRClass(SRC: OpRC);
7463 if (VRC == OpRC)
7464 continue;
7465
7466 legalizeGenericOperand(InsertMBB&: *MBB, I: MI, DstRC: VRC, Op, MRI, DL: MI.getDebugLoc());
7467 Op.setIsKill();
7468 }
7469 }
7470
7471 return CreatedBB;
7472 }
7473
7474 // Legalize INSERT_SUBREG
7475 // src0 must have the same register class as dst
7476 if (MI.getOpcode() == AMDGPU::INSERT_SUBREG) {
7477 Register Dst = MI.getOperand(i: 0).getReg();
7478 Register Src0 = MI.getOperand(i: 1).getReg();
7479 const TargetRegisterClass *DstRC = MRI.getRegClass(Reg: Dst);
7480 const TargetRegisterClass *Src0RC = MRI.getRegClass(Reg: Src0);
7481 if (DstRC != Src0RC) {
7482 MachineBasicBlock *MBB = MI.getParent();
7483 MachineOperand &Op = MI.getOperand(i: 1);
7484 legalizeGenericOperand(InsertMBB&: *MBB, I: MI, DstRC, Op, MRI, DL: MI.getDebugLoc());
7485 }
7486 return CreatedBB;
7487 }
7488
7489 // Legalize SI_INIT_M0
7490 if (MI.getOpcode() == AMDGPU::SI_INIT_M0) {
7491 MachineOperand &Src = MI.getOperand(i: 0);
7492 if (Src.isReg() && RI.hasVectorRegisters(RC: MRI.getRegClass(Reg: Src.getReg())))
7493 Src.setReg(readlaneVGPRToSGPR(SrcReg: Src.getReg(), UseMI&: MI, MRI));
7494 return CreatedBB;
7495 }
7496
7497 // Legalize S_BITREPLICATE, S_QUADMASK, S_WQM and S_INVERSE_BALLOT
7498 if (MI.getOpcode() == AMDGPU::S_BITREPLICATE_B64_B32 ||
7499 MI.getOpcode() == AMDGPU::S_QUADMASK_B32 ||
7500 MI.getOpcode() == AMDGPU::S_QUADMASK_B64 ||
7501 MI.getOpcode() == AMDGPU::S_WQM_B32 ||
7502 MI.getOpcode() == AMDGPU::S_WQM_B64 ||
7503 MI.getOpcode() == AMDGPU::S_INVERSE_BALLOT_U32 ||
7504 MI.getOpcode() == AMDGPU::S_INVERSE_BALLOT_U64) {
7505 MachineOperand &Src = MI.getOperand(i: 1);
7506 if (Src.isReg() && RI.hasVectorRegisters(RC: MRI.getRegClass(Reg: Src.getReg())))
7507 Src.setReg(readlaneVGPRToSGPR(SrcReg: Src.getReg(), UseMI&: MI, MRI));
7508 return CreatedBB;
7509 }
7510
7511 // Legalize MIMG/VIMAGE/VSAMPLE and MUBUF/MTBUF for shaders.
7512 //
7513 // Shaders only generate MUBUF/MTBUF instructions via intrinsics or via
7514 // scratch memory access. In both cases, the legalization never involves
7515 // conversion to the addr64 form.
7516 if (isImage(MI) || (AMDGPU::isGraphics(CC: MF.getFunction().getCallingConv()) &&
7517 (isMUBUF(MI) || isMTBUF(MI)))) {
7518 AMDGPU::OpName RSrcOpName = (isVIMAGE(MI) || isVSAMPLE(MI))
7519 ? AMDGPU::OpName::rsrc
7520 : AMDGPU::OpName::srsrc;
7521 MachineOperand *SRsrc = getNamedOperand(MI, OperandName: RSrcOpName);
7522 if (SRsrc && !RI.isSGPRClass(RC: MRI.getRegClass(Reg: SRsrc->getReg())))
7523 CreatedBB = loadScalarOperandsFromVGPR(TII: *this, MI, ScalarOps: {SRsrc}, MDT);
7524
7525 AMDGPU::OpName SampOpName =
7526 isMIMG(MI) ? AMDGPU::OpName::ssamp : AMDGPU::OpName::samp;
7527 MachineOperand *SSamp = getNamedOperand(MI, OperandName: SampOpName);
7528 if (SSamp && !RI.isSGPRClass(RC: MRI.getRegClass(Reg: SSamp->getReg())))
7529 CreatedBB = loadScalarOperandsFromVGPR(TII: *this, MI, ScalarOps: {SSamp}, MDT);
7530
7531 return CreatedBB;
7532 }
7533
7534 // Legalize SI_CALL
7535 if (MI.getOpcode() == AMDGPU::SI_CALL_ISEL) {
7536 MachineOperand *Dest = &MI.getOperand(i: 0);
7537 if (!RI.isSGPRClass(RC: MRI.getRegClass(Reg: Dest->getReg()))) {
7538 // Move everything between ADJCALLSTACKUP and ADJCALLSTACKDOWN, as well
7539 // as the following copies, into the loop block; copies from and to
7540 // physical registers must move with the call.
7541 unsigned FrameSetupOpcode = getCallFrameSetupOpcode();
7542 unsigned FrameDestroyOpcode = getCallFrameDestroyOpcode();
7543
7544 // Also move the copies to physical registers into the loop block
7545 MachineBasicBlock &MBB = *MI.getParent();
7546 MachineBasicBlock::iterator Start(&MI);
7547 while (Start->getOpcode() != FrameSetupOpcode)
7548 --Start;
7549 MachineBasicBlock::iterator End(&MI);
7550 while (End->getOpcode() != FrameDestroyOpcode)
7551 ++End;
7552 // Also include following copies of the return value
7553 ++End;
7554 while (End != MBB.end() && End->isCopy() && End->getOperand(i: 1).isReg() &&
7555 MI.definesRegister(Reg: End->getOperand(i: 1).getReg(), /*TRI=*/nullptr))
7556 ++End;
7557 CreatedBB =
7558 loadScalarOperandsFromVGPR(TII: *this, MI, ScalarOps: {Dest}, MDT, Begin: Start, End);
7559 }
7560 }
7561
7562 // Legalize s_sleep_var.
7563 if (MI.getOpcode() == AMDGPU::S_SLEEP_VAR) {
7564 const DebugLoc &DL = MI.getDebugLoc();
7565 Register Reg = MRI.createVirtualRegister(RegClass: &AMDGPU::SReg_32_XM0RegClass);
7566 int Src0Idx =
7567 AMDGPU::getNamedOperandIdx(Opcode: MI.getOpcode(), Name: AMDGPU::OpName::src0);
7568 MachineOperand &Src0 = MI.getOperand(i: Src0Idx);
7569 BuildMI(BB&: *MI.getParent(), I&: MI, MIMD: DL, MCID: get(Opcode: AMDGPU::V_READFIRSTLANE_B32), DestReg: Reg)
7570 .add(MO: Src0);
7571 Src0.ChangeToRegister(Reg, isDef: false);
7572 return nullptr;
7573 }
7574
7575 // Legalize TENSOR_LOAD_TO_LDS_d2/_d4 and TENSOR_STORE_FROM_LDS_d2/_d4. All
7576 // of their operands must be scalar.
7577 if (MI.getOpcode() == AMDGPU::TENSOR_LOAD_TO_LDS_d2 ||
7578 MI.getOpcode() == AMDGPU::TENSOR_LOAD_TO_LDS_d4 ||
7579 MI.getOpcode() == AMDGPU::TENSOR_STORE_FROM_LDS_d2 ||
7580 MI.getOpcode() == AMDGPU::TENSOR_STORE_FROM_LDS_d4) {
7581 for (MachineOperand &Src : MI.explicit_operands()) {
7582 if (Src.isReg() && RI.hasVectorRegisters(RC: MRI.getRegClass(Reg: Src.getReg())))
7583 Src.setReg(readlaneVGPRToSGPR(SrcReg: Src.getReg(), UseMI&: MI, MRI));
7584 }
7585 return CreatedBB;
7586 }
7587
7588 // Legalize MUBUF instructions.
7589 bool isSoffsetLegal = true;
7590 int SoffsetIdx =
7591 AMDGPU::getNamedOperandIdx(Opcode: MI.getOpcode(), Name: AMDGPU::OpName::soffset);
7592 if (SoffsetIdx != -1) {
7593 MachineOperand *Soffset = &MI.getOperand(i: SoffsetIdx);
7594 if (Soffset->isReg() && Soffset->getReg().isVirtual() &&
7595 !RI.isSGPRClass(RC: MRI.getRegClass(Reg: Soffset->getReg()))) {
7596 isSoffsetLegal = false;
7597 }
7598 }
7599
7600 bool isRsrcLegal = true;
7601 int RsrcIdx =
7602 AMDGPU::getNamedOperandIdx(Opcode: MI.getOpcode(), Name: AMDGPU::OpName::srsrc);
7603 if (RsrcIdx != -1) {
7604 MachineOperand *Rsrc = &MI.getOperand(i: RsrcIdx);
7605 if (Rsrc->isReg() && !RI.isSGPRReg(MRI, Reg: Rsrc->getReg()))
7606 isRsrcLegal = false;
7607 }
7608
7609 // The operands are legal.
7610 if (isRsrcLegal && isSoffsetLegal)
7611 return CreatedBB;
7612
7613 if (!isRsrcLegal) {
7614 // Legalize a VGPR Rsrc
7615 //
7616 // If the instruction is _ADDR64, we can avoid a waterfall by extracting
7617 // the base pointer from the VGPR Rsrc, adding it to the VAddr, then using
7618 // a zero-value SRsrc.
7619 //
7620 // If the instruction is _OFFSET (both idxen and offen disabled), and we
7621 // support ADDR64 instructions, we can convert to ADDR64 and do the same as
7622 // above.
7623 //
7624 // Otherwise we are on non-ADDR64 hardware and/or we have
7625 // idxen/offen/bothen, and we fall back to a waterfall loop.
7626
7627 MachineOperand *Rsrc = &MI.getOperand(i: RsrcIdx);
7628 MachineBasicBlock &MBB = *MI.getParent();
7629
7630 MachineOperand *VAddr = getNamedOperand(MI, OperandName: AMDGPU::OpName::vaddr);
7631 if (VAddr && AMDGPU::getIfAddr64Inst(Opcode: MI.getOpcode()) != -1) {
7632 // This is already an ADDR64 instruction so we need to add the pointer
7633 // extracted from the resource descriptor to the current value of VAddr.
7634 Register NewVAddrLo = MRI.createVirtualRegister(RegClass: &AMDGPU::VGPR_32RegClass);
7635 Register NewVAddrHi = MRI.createVirtualRegister(RegClass: &AMDGPU::VGPR_32RegClass);
7636 Register NewVAddr = MRI.createVirtualRegister(RegClass: &AMDGPU::VReg_64RegClass);
7637
7638 const auto *BoolXExecRC = RI.getWaveMaskRegClass();
7639 Register CondReg0 = MRI.createVirtualRegister(RegClass: BoolXExecRC);
7640 Register CondReg1 = MRI.createVirtualRegister(RegClass: BoolXExecRC);
7641
7642 unsigned RsrcPtr, NewSRsrc;
7643 std::tie(args&: RsrcPtr, args&: NewSRsrc) = extractRsrcPtr(TII: *this, MI, Rsrc&: *Rsrc);
7644
7645 // NewVaddrLo = RsrcPtr:sub0 + VAddr:sub0
7646 const DebugLoc &DL = MI.getDebugLoc();
7647 BuildMI(BB&: MBB, I&: MI, MIMD: DL, MCID: get(Opcode: AMDGPU::V_ADD_CO_U32_e64), DestReg: NewVAddrLo)
7648 .addDef(RegNo: CondReg0)
7649 .addReg(RegNo: RsrcPtr, Flags: {}, SubReg: AMDGPU::sub0)
7650 .addReg(RegNo: VAddr->getReg(), Flags: {}, SubReg: AMDGPU::sub0)
7651 .addImm(Val: 0);
7652
7653 // NewVaddrHi = RsrcPtr:sub1 + VAddr:sub1
7654 BuildMI(BB&: MBB, I&: MI, MIMD: DL, MCID: get(Opcode: AMDGPU::V_ADDC_U32_e64), DestReg: NewVAddrHi)
7655 .addDef(RegNo: CondReg1, Flags: RegState::Dead)
7656 .addReg(RegNo: RsrcPtr, Flags: {}, SubReg: AMDGPU::sub1)
7657 .addReg(RegNo: VAddr->getReg(), Flags: {}, SubReg: AMDGPU::sub1)
7658 .addReg(RegNo: CondReg0, Flags: RegState::Kill)
7659 .addImm(Val: 0);
7660
7661 // NewVaddr = {NewVaddrHi, NewVaddrLo}
7662 BuildMI(BB&: MBB, I&: MI, MIMD: MI.getDebugLoc(), MCID: get(Opcode: AMDGPU::REG_SEQUENCE), DestReg: NewVAddr)
7663 .addReg(RegNo: NewVAddrLo)
7664 .addImm(Val: AMDGPU::sub0)
7665 .addReg(RegNo: NewVAddrHi)
7666 .addImm(Val: AMDGPU::sub1);
7667
7668 VAddr->setReg(NewVAddr);
7669 Rsrc->setReg(NewSRsrc);
7670 } else if (!VAddr && ST.hasAddr64()) {
7671 // This instruction is the _OFFSET variant, so we need to convert it to
7672 // ADDR64.
7673 assert(ST.getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS &&
7674 "FIXME: Need to emit flat atomics here");
7675
7676 unsigned RsrcPtr, NewSRsrc;
7677 std::tie(args&: RsrcPtr, args&: NewSRsrc) = extractRsrcPtr(TII: *this, MI, Rsrc&: *Rsrc);
7678
7679 Register NewVAddr = MRI.createVirtualRegister(RegClass: &AMDGPU::VReg_64RegClass);
7680 MachineOperand *VData = getNamedOperand(MI, OperandName: AMDGPU::OpName::vdata);
7681 MachineOperand *Offset = getNamedOperand(MI, OperandName: AMDGPU::OpName::offset);
7682 MachineOperand *SOffset = getNamedOperand(MI, OperandName: AMDGPU::OpName::soffset);
7683 unsigned Addr64Opcode = AMDGPU::getAddr64Inst(Opcode: MI.getOpcode());
7684
7685 // Atomics with return have an additional tied operand and are
7686 // missing some of the special bits.
7687 MachineOperand *VDataIn = getNamedOperand(MI, OperandName: AMDGPU::OpName::vdata_in);
7688 MachineInstr *Addr64;
7689
7690 if (!VDataIn) {
7691 // Regular buffer load / store.
7692 MachineInstrBuilder MIB =
7693 BuildMI(BB&: MBB, I&: MI, MIMD: MI.getDebugLoc(), MCID: get(Opcode: Addr64Opcode))
7694 .add(MO: *VData)
7695 .addReg(RegNo: NewVAddr)
7696 .addReg(RegNo: NewSRsrc)
7697 .add(MO: *SOffset)
7698 .add(MO: *Offset);
7699
7700 if (const MachineOperand *CPol =
7701 getNamedOperand(MI, OperandName: AMDGPU::OpName::cpol)) {
7702 MIB.addImm(Val: CPol->getImm());
7703 }
7704
7705 if (const MachineOperand *TFE =
7706 getNamedOperand(MI, OperandName: AMDGPU::OpName::tfe)) {
7707 MIB.addImm(Val: TFE->getImm());
7708 }
7709
7710 MIB.addImm(Val: getNamedImmOperand(MI, OperandName: AMDGPU::OpName::swz));
7711
7712 MIB.cloneMemRefs(OtherMI: MI);
7713 Addr64 = MIB;
7714 } else {
7715 // Atomics with return.
7716 Addr64 = BuildMI(BB&: MBB, I&: MI, MIMD: MI.getDebugLoc(), MCID: get(Opcode: Addr64Opcode))
7717 .add(MO: *VData)
7718 .add(MO: *VDataIn)
7719 .addReg(RegNo: NewVAddr)
7720 .addReg(RegNo: NewSRsrc)
7721 .add(MO: *SOffset)
7722 .add(MO: *Offset)
7723 .addImm(Val: getNamedImmOperand(MI, OperandName: AMDGPU::OpName::cpol))
7724 .cloneMemRefs(OtherMI: MI);
7725 }
7726
7727 MI.removeFromParent();
7728
7729 // NewVAddr = {RsrcPtr:sub1, RsrcPtr:sub0}, i.e. the extracted base pointer.
7730 BuildMI(BB&: MBB, I: Addr64, MIMD: Addr64->getDebugLoc(), MCID: get(Opcode: AMDGPU::REG_SEQUENCE),
7731 DestReg: NewVAddr)
7732 .addReg(RegNo: RsrcPtr, Flags: {}, SubReg: AMDGPU::sub0)
7733 .addImm(Val: AMDGPU::sub0)
7734 .addReg(RegNo: RsrcPtr, Flags: {}, SubReg: AMDGPU::sub1)
7735 .addImm(Val: AMDGPU::sub1);
7736 } else {
7737 // Legalize a VGPR Rsrc and soffset together.
7738 if (!isSoffsetLegal) {
7739 MachineOperand *Soffset = getNamedOperand(MI, OperandName: AMDGPU::OpName::soffset);
7740 CreatedBB = loadScalarOperandsFromVGPR(TII: *this, MI, ScalarOps: {Rsrc, Soffset}, MDT);
7741 return CreatedBB;
7742 }
7743 CreatedBB = loadScalarOperandsFromVGPR(TII: *this, MI, ScalarOps: {Rsrc}, MDT);
7744 return CreatedBB;
7745 }
7746 }
7747
7748 // Legalize a VGPR soffset.
7749 if (!isSoffsetLegal) {
7750 MachineOperand *Soffset = getNamedOperand(MI, OperandName: AMDGPU::OpName::soffset);
7751 CreatedBB = loadScalarOperandsFromVGPR(TII: *this, MI, ScalarOps: {Soffset}, MDT);
7752 return CreatedBB;
7753 }
7754 return CreatedBB;
7755}
7756
7757void SIInstrWorklist::insert(MachineInstr *MI) {
7758 InstrList.insert(X: MI);
7759 // Add MBUF instructions to the deferred list.
7760 int RsrcIdx =
7761 AMDGPU::getNamedOperandIdx(Opcode: MI->getOpcode(), Name: AMDGPU::OpName::srsrc);
7762 if (RsrcIdx != -1) {
7763 DeferredList.insert(X: MI);
7764 }
7765}
7766
7767bool SIInstrWorklist::isDeferred(MachineInstr *MI) {
7768 return DeferredList.contains(key: MI);
7769}
7770
7771 // Legalize size mismatches between 16-bit and 32-bit registers in v2s copy
7772 // lowering (changing sgpr to vgpr).
7773 // These are mainly caused by 16-bit SALU and 16-bit VALU instructions using
7774 // registers of different sizes, so operand sizes must be legalized along the
7775 // vgpr lowering chain. This can be removed once we have sgpr16 in place.
7776void SIInstrInfo::legalizeOperandsVALUt16(MachineInstr &MI, unsigned OpIdx,
7777 MachineRegisterInfo &MRI) const {
7778 if (!ST.useRealTrue16Insts())
7779 return;
7780
7781 unsigned Opcode = MI.getOpcode();
7782 MachineBasicBlock *MBB = MI.getParent();
7783 // Legalize operands and check for size mismatch
7784 if (!OpIdx || OpIdx >= MI.getNumExplicitOperands() ||
7785 OpIdx >= get(Opcode).getNumOperands() ||
7786 get(Opcode).operands()[OpIdx].RegClass == -1)
7787 return;
7788
7789 MachineOperand &Op = MI.getOperand(i: OpIdx);
7790 if (!Op.isReg() || !Op.getReg().isVirtual())
7791 return;
7792
7793 const TargetRegisterClass *CurrRC = MRI.getRegClass(Reg: Op.getReg());
7794 if (!RI.isVGPRClass(RC: CurrRC))
7795 return;
7796
7797 int16_t RCID = getOpRegClassID(OpInfo: get(Opcode).operands()[OpIdx]);
7798 const TargetRegisterClass *ExpectedRC = RI.getRegClass(i: RCID);
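// If the 32-bit VGPR value feeds an operand expecting 16 bits, refer to its
// low half through the lo16 subregister; if a 16-bit value feeds a 32-bit
// operand, widen it with a REG_SEQUENCE whose high half is undef.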
7799 if (RI.getMatchingSuperRegClass(A: CurrRC, B: ExpectedRC, Idx: AMDGPU::lo16)) {
7800 Op.setSubReg(AMDGPU::lo16);
7801 } else if (RI.getMatchingSuperRegClass(A: ExpectedRC, B: CurrRC, Idx: AMDGPU::lo16)) {
7802 const DebugLoc &DL = MI.getDebugLoc();
7803 Register NewDstReg = MRI.createVirtualRegister(RegClass: &AMDGPU::VGPR_32RegClass);
7804 Register Undef = MRI.createVirtualRegister(RegClass: &AMDGPU::VGPR_16RegClass);
7805 BuildMI(BB&: *MBB, I&: MI, MIMD: DL, MCID: get(Opcode: AMDGPU::IMPLICIT_DEF), DestReg: Undef);
7806 BuildMI(BB&: *MBB, I&: MI, MIMD: DL, MCID: get(Opcode: AMDGPU::REG_SEQUENCE), DestReg: NewDstReg)
7807 .addReg(RegNo: Op.getReg())
7808 .addImm(Val: AMDGPU::lo16)
7809 .addReg(RegNo: Undef)
7810 .addImm(Val: AMDGPU::hi16);
7811 Op.setReg(NewDstReg);
7812 }
7813}
7814void SIInstrInfo::legalizeOperandsVALUt16(MachineInstr &MI,
7815 MachineRegisterInfo &MRI) const {
7816 for (unsigned OpIdx = 1; OpIdx < MI.getNumExplicitOperands(); OpIdx++)
7817 legalizeOperandsVALUt16(MI, OpIdx, MRI);
7818}
7819
7820void SIInstrInfo::moveToVALU(SIInstrWorklist &Worklist,
7821 MachineDominatorTree *MDT) const {
7822
7823 while (!Worklist.empty()) {
7824 MachineInstr &Inst = *Worklist.top();
7825 Worklist.erase_top();
7826 // Skip MachineInstr in the deferred list.
7827 if (Worklist.isDeferred(MI: &Inst))
7828 continue;
7829 moveToVALUImpl(Worklist, MDT, Inst);
7830 }
7831
7832 // The deferred instructions are processed once all the other
7833 // MachineInstrs in the worklist are done.
7834 for (MachineInstr *Inst : Worklist.getDeferredList()) {
7835 moveToVALUImpl(Worklist, MDT, Inst&: *Inst);
7836 assert(Worklist.empty() &&
7837 "Deferred MachineInstr are not supposed to re-populate worklist");
7838 }
7839}
7840
7841void SIInstrInfo::moveToVALUImpl(SIInstrWorklist &Worklist,
7842 MachineDominatorTree *MDT,
7843 MachineInstr &Inst) const {
7844
7845 MachineBasicBlock *MBB = Inst.getParent();
7846 if (!MBB)
7847 return;
7848 MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
7849 unsigned Opcode = Inst.getOpcode();
7850 unsigned NewOpcode = getVALUOp(MI: Inst);
7851 const DebugLoc &DL = Inst.getDebugLoc();
7852
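// getVALUOp returns the VALU equivalent of the SALU opcode, or
// INSTRUCTION_LIST_END if there is none; the switch below handles opcodes
// that need custom expansion or an adjusted NewOpcode instead.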
7853 // Handle some special cases
7854 switch (Opcode) {
7855 default:
7856 break;
7857 case AMDGPU::S_ADD_I32:
7858 case AMDGPU::S_SUB_I32: {
7859 // FIXME: The u32 versions currently selected use the carry.
7860 bool Changed;
7861 MachineBasicBlock *CreatedBBTmp = nullptr;
7862 std::tie(args&: Changed, args&: CreatedBBTmp) = moveScalarAddSub(Worklist, Inst, MDT);
7863 if (Changed)
7864 return;
7865
7866 // Default handling
7867 break;
7868 }
7869
7870 case AMDGPU::S_MUL_U64:
7871 if (ST.hasVectorMulU64()) {
7872 NewOpcode = AMDGPU::V_MUL_U64_e64;
7873 break;
7874 }
7875 // Split s_mul_u64 into 32-bit vector multiplications.
7876 splitScalarSMulU64(Worklist, Inst, MDT);
7877 Inst.eraseFromParent();
7878 return;
7879
7880 case AMDGPU::S_MUL_U64_U32_PSEUDO:
7881 case AMDGPU::S_MUL_I64_I32_PSEUDO:
7882 // This is a special case of s_mul_u64 where all the operands are either
7883 // zero-extended or sign-extended.
7884 splitScalarSMulPseudo(Worklist, Inst, MDT);
7885 Inst.eraseFromParent();
7886 return;
7887
7888 case AMDGPU::S_AND_B64:
7889 splitScalar64BitBinaryOp(Worklist, Inst, Opcode: AMDGPU::S_AND_B32, MDT);
7890 Inst.eraseFromParent();
7891 return;
7892
7893 case AMDGPU::S_OR_B64:
7894 splitScalar64BitBinaryOp(Worklist, Inst, Opcode: AMDGPU::S_OR_B32, MDT);
7895 Inst.eraseFromParent();
7896 return;
7897
7898 case AMDGPU::S_XOR_B64:
7899 splitScalar64BitBinaryOp(Worklist, Inst, Opcode: AMDGPU::S_XOR_B32, MDT);
7900 Inst.eraseFromParent();
7901 return;
7902
7903 case AMDGPU::S_NAND_B64:
7904 splitScalar64BitBinaryOp(Worklist, Inst, Opcode: AMDGPU::S_NAND_B32, MDT);
7905 Inst.eraseFromParent();
7906 return;
7907
7908 case AMDGPU::S_NOR_B64:
7909 splitScalar64BitBinaryOp(Worklist, Inst, Opcode: AMDGPU::S_NOR_B32, MDT);
7910 Inst.eraseFromParent();
7911 return;
7912
7913 case AMDGPU::S_XNOR_B64:
7914 if (ST.hasDLInsts())
7915 splitScalar64BitBinaryOp(Worklist, Inst, Opcode: AMDGPU::S_XNOR_B32, MDT);
7916 else
7917 splitScalar64BitXnor(Worklist, Inst, MDT);
7918 Inst.eraseFromParent();
7919 return;
7920
7921 case AMDGPU::S_ANDN2_B64:
7922 splitScalar64BitBinaryOp(Worklist, Inst, Opcode: AMDGPU::S_ANDN2_B32, MDT);
7923 Inst.eraseFromParent();
7924 return;
7925
7926 case AMDGPU::S_ORN2_B64:
7927 splitScalar64BitBinaryOp(Worklist, Inst, Opcode: AMDGPU::S_ORN2_B32, MDT);
7928 Inst.eraseFromParent();
7929 return;
7930
7931 case AMDGPU::S_BREV_B64:
7932 splitScalar64BitUnaryOp(Worklist, Inst, Opcode: AMDGPU::S_BREV_B32, Swap: true);
7933 Inst.eraseFromParent();
7934 return;
7935
7936 case AMDGPU::S_NOT_B64:
7937 splitScalar64BitUnaryOp(Worklist, Inst, Opcode: AMDGPU::S_NOT_B32);
7938 Inst.eraseFromParent();
7939 return;
7940
7941 case AMDGPU::S_BCNT1_I32_B64:
7942 splitScalar64BitBCNT(Worklist, Inst);
7943 Inst.eraseFromParent();
7944 return;
7945
7946 case AMDGPU::S_BFE_I64:
7947 splitScalar64BitBFE(Worklist, Inst);
7948 Inst.eraseFromParent();
7949 return;
7950
7951 case AMDGPU::S_FLBIT_I32_B64:
7952 splitScalar64BitCountOp(Worklist, Inst, Opcode: AMDGPU::V_FFBH_U32_e32);
7953 Inst.eraseFromParent();
7954 return;
7955 case AMDGPU::S_FF1_I32_B64:
7956 splitScalar64BitCountOp(Worklist, Inst, Opcode: AMDGPU::V_FFBL_B32_e32);
7957 Inst.eraseFromParent();
7958 return;
7959
7960 case AMDGPU::S_LSHL_B32:
7961 if (ST.hasOnlyRevVALUShifts()) {
7962 NewOpcode = AMDGPU::V_LSHLREV_B32_e64;
7963 swapOperands(Inst);
7964 }
7965 break;
7966 case AMDGPU::S_ASHR_I32:
7967 if (ST.hasOnlyRevVALUShifts()) {
7968 NewOpcode = AMDGPU::V_ASHRREV_I32_e64;
7969 swapOperands(Inst);
7970 }
7971 break;
7972 case AMDGPU::S_LSHR_B32:
7973 if (ST.hasOnlyRevVALUShifts()) {
7974 NewOpcode = AMDGPU::V_LSHRREV_B32_e64;
7975 swapOperands(Inst);
7976 }
7977 break;
7978 case AMDGPU::S_LSHL_B64:
7979 if (ST.hasOnlyRevVALUShifts()) {
7980 NewOpcode = ST.getGeneration() >= AMDGPUSubtarget::GFX12
7981 ? AMDGPU::V_LSHLREV_B64_pseudo_e64
7982 : AMDGPU::V_LSHLREV_B64_e64;
7983 swapOperands(Inst);
7984 }
7985 break;
7986 case AMDGPU::S_ASHR_I64:
7987 if (ST.hasOnlyRevVALUShifts()) {
7988 NewOpcode = AMDGPU::V_ASHRREV_I64_e64;
7989 swapOperands(Inst);
7990 }
7991 break;
7992 case AMDGPU::S_LSHR_B64:
7993 if (ST.hasOnlyRevVALUShifts()) {
7994 NewOpcode = AMDGPU::V_LSHRREV_B64_e64;
7995 swapOperands(Inst);
7996 }
7997 break;
7998
7999 case AMDGPU::S_ABS_I32:
8000 lowerScalarAbs(Worklist, Inst);
8001 Inst.eraseFromParent();
8002 return;
8003
8004 case AMDGPU::S_ABSDIFF_I32:
8005 lowerScalarAbsDiff(Worklist, Inst);
8006 Inst.eraseFromParent();
8007 return;
8008
8009 case AMDGPU::S_CBRANCH_SCC0:
8010 case AMDGPU::S_CBRANCH_SCC1: {
8011 // Clear unused bits of vcc
8012 Register CondReg = Inst.getOperand(i: 1).getReg();
8013 bool IsSCC = CondReg == AMDGPU::SCC;
8014 const AMDGPU::LaneMaskConstants &LMC = AMDGPU::LaneMaskConstants::get(ST);
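// vcc = exec & cond, so lanes that are inactive in exec cannot influence
// the VCC-based branch that replaces the SCC branch.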
8015 BuildMI(BB&: *MBB, I&: Inst, MIMD: Inst.getDebugLoc(), MCID: get(Opcode: LMC.AndOpc), DestReg: LMC.VccReg)
8016 .addReg(RegNo: LMC.ExecReg)
8017 .addReg(RegNo: IsSCC ? LMC.VccReg : CondReg);
8018 Inst.removeOperand(OpNo: 1);
8019 } break;
8020
8021 case AMDGPU::S_BFE_U64:
8022 case AMDGPU::S_BFM_B64:
8023 llvm_unreachable("Moving this op to VALU not implemented");
8024
8025 case AMDGPU::S_PACK_LL_B32_B16:
8026 case AMDGPU::S_PACK_LH_B32_B16:
8027 case AMDGPU::S_PACK_HL_B32_B16:
8028 case AMDGPU::S_PACK_HH_B32_B16:
8029 movePackToVALU(Worklist, MRI, Inst);
8030 Inst.eraseFromParent();
8031 return;
8032
8033 case AMDGPU::S_XNOR_B32:
8034 lowerScalarXnor(Worklist, Inst);
8035 Inst.eraseFromParent();
8036 return;
8037
8038 case AMDGPU::S_NAND_B32:
8039 splitScalarNotBinop(Worklist, Inst, Opcode: AMDGPU::S_AND_B32);
8040 Inst.eraseFromParent();
8041 return;
8042
8043 case AMDGPU::S_NOR_B32:
8044 splitScalarNotBinop(Worklist, Inst, Opcode: AMDGPU::S_OR_B32);
8045 Inst.eraseFromParent();
8046 return;
8047
8048 case AMDGPU::S_ANDN2_B32:
8049 splitScalarBinOpN2(Worklist, Inst, Opcode: AMDGPU::S_AND_B32);
8050 Inst.eraseFromParent();
8051 return;
8052
8053 case AMDGPU::S_ORN2_B32:
8054 splitScalarBinOpN2(Worklist, Inst, Opcode: AMDGPU::S_OR_B32);
8055 Inst.eraseFromParent();
8056 return;
8057
8058 // TODO: remove as soon as everything is ready
8059 // to replace VGPR to SGPR copy with V_READFIRSTLANEs.
8060 // S_ADD/SUB_CO_PSEUDO as well as S_UADDO/USUBO_PSEUDO
8061 // can only be selected from the uniform SDNode.
8062 case AMDGPU::S_ADD_CO_PSEUDO:
8063 case AMDGPU::S_SUB_CO_PSEUDO: {
8064 unsigned Opc = (Inst.getOpcode() == AMDGPU::S_ADD_CO_PSEUDO)
8065 ? AMDGPU::V_ADDC_U32_e64
8066 : AMDGPU::V_SUBB_U32_e64;
8067 const auto *CarryRC = RI.getWaveMaskRegClass();
8068
8069 Register CarryInReg = Inst.getOperand(i: 4).getReg();
8070 if (!MRI.constrainRegClass(Reg: CarryInReg, RC: CarryRC)) {
8071 Register NewCarryReg = MRI.createVirtualRegister(RegClass: CarryRC);
8072 BuildMI(BB&: *MBB, I&: Inst, MIMD: Inst.getDebugLoc(), MCID: get(Opcode: AMDGPU::COPY), DestReg: NewCarryReg)
8073 .addReg(RegNo: CarryInReg);
8074 }
8075
8076 Register CarryOutReg = Inst.getOperand(i: 1).getReg();
8077
8078 Register DestReg = MRI.createVirtualRegister(RegClass: RI.getEquivalentVGPRClass(
8079 SRC: MRI.getRegClass(Reg: Inst.getOperand(i: 0).getReg())));
8080 MachineInstr *CarryOp =
8081 BuildMI(BB&: *MBB, I: &Inst, MIMD: Inst.getDebugLoc(), MCID: get(Opcode: Opc), DestReg)
8082 .addReg(RegNo: CarryOutReg, Flags: RegState::Define)
8083 .add(MO: Inst.getOperand(i: 2))
8084 .add(MO: Inst.getOperand(i: 3))
8085 .addReg(RegNo: CarryInReg)
8086 .addImm(Val: 0);
8087 legalizeOperands(MI&: *CarryOp);
8088 MRI.replaceRegWith(FromReg: Inst.getOperand(i: 0).getReg(), ToReg: DestReg);
8089 addUsersToMoveToVALUWorklist(Reg: DestReg, MRI, Worklist);
8090 Inst.eraseFromParent();
8091 }
8092 return;
8093 case AMDGPU::S_UADDO_PSEUDO:
8094 case AMDGPU::S_USUBO_PSEUDO: {
8095 MachineOperand &Dest0 = Inst.getOperand(i: 0);
8096 MachineOperand &Dest1 = Inst.getOperand(i: 1);
8097 MachineOperand &Src0 = Inst.getOperand(i: 2);
8098 MachineOperand &Src1 = Inst.getOperand(i: 3);
8099
8100 unsigned Opc = (Inst.getOpcode() == AMDGPU::S_UADDO_PSEUDO)
8101 ? AMDGPU::V_ADD_CO_U32_e64
8102 : AMDGPU::V_SUB_CO_U32_e64;
8103 const TargetRegisterClass *NewRC =
8104 RI.getEquivalentVGPRClass(SRC: MRI.getRegClass(Reg: Dest0.getReg()));
8105 Register DestReg = MRI.createVirtualRegister(RegClass: NewRC);
8106 MachineInstr *NewInstr = BuildMI(BB&: *MBB, I: &Inst, MIMD: DL, MCID: get(Opcode: Opc), DestReg)
8107 .addReg(RegNo: Dest1.getReg(), Flags: RegState::Define)
8108 .add(MO: Src0)
8109 .add(MO: Src1)
8110 .addImm(Val: 0); // clamp bit
8111
8112 legalizeOperands(MI&: *NewInstr, MDT);
8113 MRI.replaceRegWith(FromReg: Dest0.getReg(), ToReg: DestReg);
8114 addUsersToMoveToVALUWorklist(Reg: DestReg, MRI, Worklist);
8115 Inst.eraseFromParent();
8116 }
8117 return;
8118 case AMDGPU::S_LSHL1_ADD_U32:
8119 case AMDGPU::S_LSHL2_ADD_U32:
8120 case AMDGPU::S_LSHL3_ADD_U32:
8121 case AMDGPU::S_LSHL4_ADD_U32: {
8122 MachineOperand &Dest = Inst.getOperand(i: 0);
8123 MachineOperand &Src0 = Inst.getOperand(i: 1);
8124 MachineOperand &Src1 = Inst.getOperand(i: 2);
8125 unsigned ShiftAmt = (Opcode == AMDGPU::S_LSHL1_ADD_U32 ? 1
8126 : Opcode == AMDGPU::S_LSHL2_ADD_U32 ? 2
8127 : Opcode == AMDGPU::S_LSHL3_ADD_U32 ? 3
8128 : 4);
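// S_LSHLn_ADD_U32 computes (src0 << n) + src1; the VALU form takes the
// shift amount as an explicit operand.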
8129
8130 const TargetRegisterClass *NewRC =
8131 RI.getEquivalentVGPRClass(SRC: MRI.getRegClass(Reg: Dest.getReg()));
8132 Register DestReg = MRI.createVirtualRegister(RegClass: NewRC);
8133 MachineInstr *NewInstr =
8134 BuildMI(BB&: *MBB, I: &Inst, MIMD: DL, MCID: get(Opcode: AMDGPU::V_LSHL_ADD_U32_e64), DestReg)
8135 .add(MO: Src0)
8136 .addImm(Val: ShiftAmt)
8137 .add(MO: Src1);
8138
8139 legalizeOperands(MI&: *NewInstr, MDT);
8140 MRI.replaceRegWith(FromReg: Dest.getReg(), ToReg: DestReg);
8141 addUsersToMoveToVALUWorklist(Reg: DestReg, MRI, Worklist);
8142 Inst.eraseFromParent();
8143 }
8144 return;
8145 case AMDGPU::S_CSELECT_B32:
8146 case AMDGPU::S_CSELECT_B64:
8147 lowerSelect(Worklist, Inst, MDT);
8148 Inst.eraseFromParent();
8149 return;
8150 case AMDGPU::S_CMP_EQ_I32:
8151 case AMDGPU::S_CMP_LG_I32:
8152 case AMDGPU::S_CMP_GT_I32:
8153 case AMDGPU::S_CMP_GE_I32:
8154 case AMDGPU::S_CMP_LT_I32:
8155 case AMDGPU::S_CMP_LE_I32:
8156 case AMDGPU::S_CMP_EQ_U32:
8157 case AMDGPU::S_CMP_LG_U32:
8158 case AMDGPU::S_CMP_GT_U32:
8159 case AMDGPU::S_CMP_GE_U32:
8160 case AMDGPU::S_CMP_LT_U32:
8161 case AMDGPU::S_CMP_LE_U32:
8162 case AMDGPU::S_CMP_EQ_U64:
8163 case AMDGPU::S_CMP_LG_U64:
8164 case AMDGPU::S_CMP_LT_F32:
8165 case AMDGPU::S_CMP_EQ_F32:
8166 case AMDGPU::S_CMP_LE_F32:
8167 case AMDGPU::S_CMP_GT_F32:
8168 case AMDGPU::S_CMP_LG_F32:
8169 case AMDGPU::S_CMP_GE_F32:
8170 case AMDGPU::S_CMP_O_F32:
8171 case AMDGPU::S_CMP_U_F32:
8172 case AMDGPU::S_CMP_NGE_F32:
8173 case AMDGPU::S_CMP_NLG_F32:
8174 case AMDGPU::S_CMP_NGT_F32:
8175 case AMDGPU::S_CMP_NLE_F32:
8176 case AMDGPU::S_CMP_NEQ_F32:
8177 case AMDGPU::S_CMP_NLT_F32: {
8178 Register CondReg = MRI.createVirtualRegister(RegClass: RI.getWaveMaskRegClass());
8179 auto NewInstr =
8180 BuildMI(BB&: *MBB, I&: Inst, MIMD: Inst.getDebugLoc(), MCID: get(Opcode: NewOpcode), DestReg: CondReg)
8181 .setMIFlags(Inst.getFlags());
8182 if (AMDGPU::getNamedOperandIdx(Opcode: NewOpcode, Name: AMDGPU::OpName::src0_modifiers) >=
8183 0) {
8184 NewInstr
8185 .addImm(Val: 0) // src0_modifiers
8186 .add(MO: Inst.getOperand(i: 0)) // src0
8187 .addImm(Val: 0) // src1_modifiers
8188 .add(MO: Inst.getOperand(i: 1)) // src1
8189 .addImm(Val: 0); // clamp
8190 } else {
8191 NewInstr.add(MO: Inst.getOperand(i: 0)).add(MO: Inst.getOperand(i: 1));
8192 }
8193 legalizeOperands(MI&: *NewInstr, MDT);
8194 int SCCIdx = Inst.findRegisterDefOperandIdx(Reg: AMDGPU::SCC, /*TRI=*/nullptr);
8195 const MachineOperand &SCCOp = Inst.getOperand(i: SCCIdx);
8196 addSCCDefUsersToVALUWorklist(Op: SCCOp, SCCDefInst&: Inst, Worklist, NewCond: CondReg);
8197 Inst.eraseFromParent();
8198 return;
8199 }
8200 case AMDGPU::S_CMP_LT_F16:
8201 case AMDGPU::S_CMP_EQ_F16:
8202 case AMDGPU::S_CMP_LE_F16:
8203 case AMDGPU::S_CMP_GT_F16:
8204 case AMDGPU::S_CMP_LG_F16:
8205 case AMDGPU::S_CMP_GE_F16:
8206 case AMDGPU::S_CMP_O_F16:
8207 case AMDGPU::S_CMP_U_F16:
8208 case AMDGPU::S_CMP_NGE_F16:
8209 case AMDGPU::S_CMP_NLG_F16:
8210 case AMDGPU::S_CMP_NGT_F16:
8211 case AMDGPU::S_CMP_NLE_F16:
8212 case AMDGPU::S_CMP_NEQ_F16:
8213 case AMDGPU::S_CMP_NLT_F16: {
8214 Register CondReg = MRI.createVirtualRegister(RegClass: RI.getWaveMaskRegClass());
8215 auto NewInstr =
8216 BuildMI(BB&: *MBB, I&: Inst, MIMD: Inst.getDebugLoc(), MCID: get(Opcode: NewOpcode), DestReg: CondReg)
8217 .setMIFlags(Inst.getFlags());
8218 if (AMDGPU::hasNamedOperand(Opcode: NewOpcode, NamedIdx: AMDGPU::OpName::src0_modifiers)) {
8219 NewInstr
8220 .addImm(Val: 0) // src0_modifiers
8221 .add(MO: Inst.getOperand(i: 0)) // src0
8222 .addImm(Val: 0) // src1_modifiers
8223 .add(MO: Inst.getOperand(i: 1)) // src1
8224 .addImm(Val: 0); // clamp
8225 if (AMDGPU::hasNamedOperand(Opcode: NewOpcode, NamedIdx: AMDGPU::OpName::op_sel))
8226 NewInstr.addImm(Val: 0); // op_sel0
8227 } else {
8228 NewInstr
8229 .add(MO: Inst.getOperand(i: 0))
8230 .add(MO: Inst.getOperand(i: 1));
8231 }
8232 legalizeOperandsVALUt16(MI&: *NewInstr, MRI);
8233 legalizeOperands(MI&: *NewInstr, MDT);
8234 int SCCIdx = Inst.findRegisterDefOperandIdx(Reg: AMDGPU::SCC, /*TRI=*/nullptr);
8235 const MachineOperand &SCCOp = Inst.getOperand(i: SCCIdx);
8236 addSCCDefUsersToVALUWorklist(Op: SCCOp, SCCDefInst&: Inst, Worklist, NewCond: CondReg);
8237 Inst.eraseFromParent();
8238 return;
8239 }
8240 case AMDGPU::S_CVT_HI_F32_F16: {
8241 Register TmpReg = MRI.createVirtualRegister(RegClass: &AMDGPU::VGPR_32RegClass);
8242 Register NewDst = MRI.createVirtualRegister(RegClass: &AMDGPU::VGPR_32RegClass);
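// The source is the high half of a 32-bit register. With true16 it can be
// read directly through the hi16 subregister; otherwise shift it down into
// the low half first.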
8243 if (ST.useRealTrue16Insts()) {
8244 BuildMI(BB&: *MBB, I&: Inst, MIMD: DL, MCID: get(Opcode: AMDGPU::COPY), DestReg: TmpReg)
8245 .add(MO: Inst.getOperand(i: 1));
8246 BuildMI(BB&: *MBB, I&: Inst, MIMD: DL, MCID: get(Opcode: NewOpcode), DestReg: NewDst)
8247 .addImm(Val: 0) // src0_modifiers
8248 .addReg(RegNo: TmpReg, Flags: {}, SubReg: AMDGPU::hi16)
8249 .addImm(Val: 0) // clamp
8250 .addImm(Val: 0) // omod
8251 .addImm(Val: 0); // op_sel0
8252 } else {
8253 BuildMI(BB&: *MBB, I&: Inst, MIMD: DL, MCID: get(Opcode: AMDGPU::V_LSHRREV_B32_e64), DestReg: TmpReg)
8254 .addImm(Val: 16)
8255 .add(MO: Inst.getOperand(i: 1));
8256 BuildMI(BB&: *MBB, I&: Inst, MIMD: DL, MCID: get(Opcode: NewOpcode), DestReg: NewDst)
8257 .addImm(Val: 0) // src0_modifiers
8258 .addReg(RegNo: TmpReg)
8259 .addImm(Val: 0) // clamp
8260 .addImm(Val: 0); // omod
8261 }
8262
8263 MRI.replaceRegWith(FromReg: Inst.getOperand(i: 0).getReg(), ToReg: NewDst);
8264 addUsersToMoveToVALUWorklist(Reg: NewDst, MRI, Worklist);
8265 Inst.eraseFromParent();
8266 return;
8267 }
8268 case AMDGPU::S_MINIMUM_F32:
8269 case AMDGPU::S_MAXIMUM_F32: {
8270 Register NewDst = MRI.createVirtualRegister(RegClass: &AMDGPU::VGPR_32RegClass);
8271 MachineInstr *NewInstr = BuildMI(BB&: *MBB, I&: Inst, MIMD: DL, MCID: get(Opcode: NewOpcode), DestReg: NewDst)
8272 .addImm(Val: 0) // src0_modifiers
8273 .add(MO: Inst.getOperand(i: 1))
8274 .addImm(Val: 0) // src1_modifiers
8275 .add(MO: Inst.getOperand(i: 2))
8276 .addImm(Val: 0) // clamp
8277 .addImm(Val: 0); // omod
8278 MRI.replaceRegWith(FromReg: Inst.getOperand(i: 0).getReg(), ToReg: NewDst);
8279
8280 legalizeOperands(MI&: *NewInstr, MDT);
8281 addUsersToMoveToVALUWorklist(Reg: NewDst, MRI, Worklist);
8282 Inst.eraseFromParent();
8283 return;
8284 }
8285 case AMDGPU::S_MINIMUM_F16:
8286 case AMDGPU::S_MAXIMUM_F16: {
8287 Register NewDst = MRI.createVirtualRegister(RegClass: ST.useRealTrue16Insts()
8288 ? &AMDGPU::VGPR_16RegClass
8289 : &AMDGPU::VGPR_32RegClass);
8290 MachineInstr *NewInstr = BuildMI(BB&: *MBB, I&: Inst, MIMD: DL, MCID: get(Opcode: NewOpcode), DestReg: NewDst)
8291 .addImm(Val: 0) // src0_modifiers
8292 .add(MO: Inst.getOperand(i: 1))
8293 .addImm(Val: 0) // src1_modifiers
8294 .add(MO: Inst.getOperand(i: 2))
8295 .addImm(Val: 0) // clamp
8296 .addImm(Val: 0) // omod
8297 .addImm(Val: 0); // opsel0
8298 MRI.replaceRegWith(FromReg: Inst.getOperand(i: 0).getReg(), ToReg: NewDst);
8299 legalizeOperandsVALUt16(MI&: *NewInstr, MRI);
8300 legalizeOperands(MI&: *NewInstr, MDT);
8301 addUsersToMoveToVALUWorklist(Reg: NewDst, MRI, Worklist);
8302 Inst.eraseFromParent();
8303 return;
8304 }
8305 case AMDGPU::V_S_EXP_F16_e64:
8306 case AMDGPU::V_S_LOG_F16_e64:
8307 case AMDGPU::V_S_RCP_F16_e64:
8308 case AMDGPU::V_S_RSQ_F16_e64:
8309 case AMDGPU::V_S_SQRT_F16_e64: {
8310 Register NewDst = MRI.createVirtualRegister(RegClass: ST.useRealTrue16Insts()
8311 ? &AMDGPU::VGPR_16RegClass
8312 : &AMDGPU::VGPR_32RegClass);
8313 auto NewInstr = BuildMI(BB&: *MBB, I&: Inst, MIMD: DL, MCID: get(Opcode: NewOpcode), DestReg: NewDst)
8314 .add(MO: Inst.getOperand(i: 1)) // src0_modifiers
8315 .add(MO: Inst.getOperand(i: 2))
8316 .add(MO: Inst.getOperand(i: 3)) // clamp
8317 .add(MO: Inst.getOperand(i: 4)) // omod
8318 .setMIFlags(Inst.getFlags());
8319 if (AMDGPU::hasNamedOperand(Opcode: NewOpcode, NamedIdx: AMDGPU::OpName::op_sel))
8320 NewInstr.addImm(Val: 0); // opsel0
8321 MRI.replaceRegWith(FromReg: Inst.getOperand(i: 0).getReg(), ToReg: NewDst);
8322 legalizeOperandsVALUt16(MI&: *NewInstr, MRI);
8323 legalizeOperands(MI&: *NewInstr, MDT);
8324 addUsersToMoveToVALUWorklist(Reg: NewDst, MRI, Worklist);
8325 Inst.eraseFromParent();
8326 return;
8327 }
8328 }
8329
8330 if (NewOpcode == AMDGPU::INSTRUCTION_LIST_END) {
8331 // We cannot move this instruction to the VALU, so we should try to
8332 // legalize its operands instead.
8333 legalizeOperands(MI&: Inst, MDT);
8334 return;
8335 }
8336 // Handle converting generic instructions like COPY-to-SGPR into
8337 // COPY-to-VGPR.
8338 if (NewOpcode == Opcode) {
8339 Register DstReg = Inst.getOperand(i: 0).getReg();
8340 const TargetRegisterClass *NewDstRC = getDestEquivalentVGPRClass(Inst);
8341
8342 // If it's a copy of a VGPR to a physical SGPR, insert a V_READFIRSTLANE and
8343 // hope for the best.
8344 if (Inst.isCopy() && DstReg.isPhysical() &&
8345 RI.isVGPR(MRI, Reg: Inst.getOperand(i: 1).getReg())) {
8346 Register NewDst = MRI.createVirtualRegister(RegClass: &AMDGPU::SReg_32_XM0RegClass);
8347 BuildMI(BB&: *Inst.getParent(), I: &Inst, MIMD: Inst.getDebugLoc(),
8348 MCID: get(Opcode: AMDGPU::V_READFIRSTLANE_B32), DestReg: NewDst)
8349 .add(MO: Inst.getOperand(i: 1));
8350 BuildMI(BB&: *Inst.getParent(), I: &Inst, MIMD: Inst.getDebugLoc(), MCID: get(Opcode: AMDGPU::COPY),
8351 DestReg: DstReg)
8352 .addReg(RegNo: NewDst);
8353
8354 Inst.eraseFromParent();
8355 return;
8356 }
8357
8358 if (Inst.isCopy() && Inst.getOperand(i: 1).getReg().isVirtual()) {
8359 Register NewDstReg = Inst.getOperand(i: 1).getReg();
8360 const TargetRegisterClass *SrcRC = RI.getRegClassForReg(MRI, Reg: NewDstReg);
8361 if (const TargetRegisterClass *CommonRC =
8362 RI.getCommonSubClass(A: NewDstRC, B: SrcRC)) {
8363 // Instead of creating a copy where src and dst are the same register
8364 // class, we just replace all uses of dst with src. These kinds of
8365 // copies interfere with the heuristics MachineSink uses to decide
8366 // whether or not to split a critical edge, since the pass assumes
8367 // that copies will end up as machine instructions and not be
8368 // eliminated.
8369 addUsersToMoveToVALUWorklist(Reg: DstReg, MRI, Worklist);
8370 MRI.replaceRegWith(FromReg: DstReg, ToReg: NewDstReg);
8371 MRI.clearKillFlags(Reg: NewDstReg);
8372 Inst.getOperand(i: 0).setReg(DstReg);
8373
8374 if (!MRI.constrainRegClass(Reg: NewDstReg, RC: CommonRC))
8375 llvm_unreachable("failed to constrain register");
8376
8377 Inst.eraseFromParent();
8378
8379 for (MachineOperand &UseMO :
8380 make_early_inc_range(Range: MRI.use_operands(Reg: NewDstReg))) {
8381 MachineInstr &UseMI = *UseMO.getParent();
8382
8383 // Legalize t16 operands since replaceReg is called after
8384 // addUsersToVALU.
8385 legalizeOperandsVALUt16(MI&: UseMI, MRI);
8386
8387 unsigned OpIdx = UseMI.getOperandNo(I: &UseMO);
8388 if (const TargetRegisterClass *OpRC =
8389 getRegClass(MCID: UseMI.getDesc(), OpNum: OpIdx))
8390 MRI.constrainRegClass(Reg: NewDstReg, RC: OpRC);
8391 }
8392
8393 return;
8394 }
8395 }
8396
8397 // If this is a v2s copy between a 16-bit and a 32-bit register, replace
8398 // the vgpr copy with a reg_sequence/extract_subreg.
8399 // This can be removed once we have sgpr16 in place.
8400 if (ST.useRealTrue16Insts() && Inst.isCopy() &&
8401 Inst.getOperand(i: 1).getReg().isVirtual() &&
8402 RI.isVGPR(MRI, Reg: Inst.getOperand(i: 1).getReg())) {
8403 const TargetRegisterClass *SrcRegRC = getOpRegClass(MI: Inst, OpNo: 1);
8404 if (RI.getMatchingSuperRegClass(A: NewDstRC, B: SrcRegRC, Idx: AMDGPU::lo16)) {
8405 Register NewDstReg = MRI.createVirtualRegister(RegClass: NewDstRC);
8406 Register Undef = MRI.createVirtualRegister(RegClass: &AMDGPU::VGPR_16RegClass);
8407 BuildMI(BB&: *Inst.getParent(), I: &Inst, MIMD: Inst.getDebugLoc(),
8408 MCID: get(Opcode: AMDGPU::IMPLICIT_DEF), DestReg: Undef);
8409 BuildMI(BB&: *Inst.getParent(), I: &Inst, MIMD: Inst.getDebugLoc(),
8410 MCID: get(Opcode: AMDGPU::REG_SEQUENCE), DestReg: NewDstReg)
8411 .addReg(RegNo: Inst.getOperand(i: 1).getReg())
8412 .addImm(Val: AMDGPU::lo16)
8413 .addReg(RegNo: Undef)
8414 .addImm(Val: AMDGPU::hi16);
8415 Inst.eraseFromParent();
8416 MRI.replaceRegWith(FromReg: DstReg, ToReg: NewDstReg);
8417 addUsersToMoveToVALUWorklist(Reg: NewDstReg, MRI, Worklist);
8418 return;
8419 } else if (RI.getMatchingSuperRegClass(A: SrcRegRC, B: NewDstRC,
8420 Idx: AMDGPU::lo16)) {
8421 Inst.getOperand(i: 1).setSubReg(AMDGPU::lo16);
8422 Register NewDstReg = MRI.createVirtualRegister(RegClass: NewDstRC);
8423 MRI.replaceRegWith(FromReg: DstReg, ToReg: NewDstReg);
8424 addUsersToMoveToVALUWorklist(Reg: NewDstReg, MRI, Worklist);
8425 return;
8426 }
8427 }
8428
8429 Register NewDstReg = MRI.createVirtualRegister(RegClass: NewDstRC);
8430 MRI.replaceRegWith(FromReg: DstReg, ToReg: NewDstReg);
8431 legalizeOperands(MI&: Inst, MDT);
8432 addUsersToMoveToVALUWorklist(Reg: NewDstReg, MRI, Worklist);
8433 return;
8434 }
8435
8436 // Use the new VALU Opcode.
8437 auto NewInstr = BuildMI(BB&: *MBB, I&: Inst, MIMD: Inst.getDebugLoc(), MCID: get(Opcode: NewOpcode))
8438 .setMIFlags(Inst.getFlags());
8439 if (isVOP3(Opcode: NewOpcode) && !isVOP3(Opcode)) {
8440 // Intersperse VOP3 modifiers among the SALU operands.
8441 NewInstr->addOperand(Op: Inst.getOperand(i: 0));
8442 if (AMDGPU::getNamedOperandIdx(Opcode: NewOpcode,
8443 Name: AMDGPU::OpName::src0_modifiers) >= 0)
8444 NewInstr.addImm(Val: 0);
8445 if (AMDGPU::hasNamedOperand(Opcode: NewOpcode, NamedIdx: AMDGPU::OpName::src0)) {
8446 const MachineOperand &Src = Inst.getOperand(i: 1);
8447 NewInstr->addOperand(Op: Src);
8448 }
8449
8450 if (Opcode == AMDGPU::S_SEXT_I32_I8 || Opcode == AMDGPU::S_SEXT_I32_I16) {
8451 // We are converting these to a BFE, so we need to add the missing
8452 // operands for the size and offset.
8453 unsigned Size = (Opcode == AMDGPU::S_SEXT_I32_I8) ? 8 : 16;
8454 NewInstr.addImm(Val: 0);
8455 NewInstr.addImm(Val: Size);
8456 } else if (Opcode == AMDGPU::S_BCNT1_I32_B32) {
8457 // The VALU version adds the second operand to the result, so insert an
8458 // extra 0 operand.
8459 NewInstr.addImm(Val: 0);
8460 } else if (Opcode == AMDGPU::S_BFE_I32 || Opcode == AMDGPU::S_BFE_U32) {
8461 const MachineOperand &OffsetWidthOp = Inst.getOperand(i: 2);
8462 // If we need to move this to VGPRs, we need to unpack the second
8463 // operand back into the 2 separate ones for bit offset and width.
8464 assert(OffsetWidthOp.isImm() &&
8465 "Scalar BFE is only implemented for constant width and offset");
8466 uint32_t Imm = OffsetWidthOp.getImm();
8467
8468 uint32_t Offset = Imm & 0x3f; // Extract bits [5:0].
8469 uint32_t BitWidth = (Imm & 0x7f0000) >> 16; // Extract bits [22:16].
8470 NewInstr.addImm(Val: Offset);
8471 NewInstr.addImm(Val: BitWidth);
8472 } else {
8473 if (AMDGPU::getNamedOperandIdx(Opcode: NewOpcode,
8474 Name: AMDGPU::OpName::src1_modifiers) >= 0)
8475 NewInstr.addImm(Val: 0);
8476 if (AMDGPU::getNamedOperandIdx(Opcode: NewOpcode, Name: AMDGPU::OpName::src1) >= 0)
8477 NewInstr->addOperand(Op: Inst.getOperand(i: 2));
8478 if (AMDGPU::getNamedOperandIdx(Opcode: NewOpcode,
8479 Name: AMDGPU::OpName::src2_modifiers) >= 0)
8480 NewInstr.addImm(Val: 0);
8481 if (AMDGPU::getNamedOperandIdx(Opcode: NewOpcode, Name: AMDGPU::OpName::src2) >= 0)
8482 NewInstr->addOperand(Op: Inst.getOperand(i: 3));
8483 if (AMDGPU::getNamedOperandIdx(Opcode: NewOpcode, Name: AMDGPU::OpName::clamp) >= 0)
8484 NewInstr.addImm(Val: 0);
8485 if (AMDGPU::getNamedOperandIdx(Opcode: NewOpcode, Name: AMDGPU::OpName::omod) >= 0)
8486 NewInstr.addImm(Val: 0);
8487 if (AMDGPU::getNamedOperandIdx(Opcode: NewOpcode, Name: AMDGPU::OpName::op_sel) >= 0)
8488 NewInstr.addImm(Val: 0);
8489 }
8490 } else {
8491 // Just copy the SALU operands.
8492 for (const MachineOperand &Op : Inst.explicit_operands())
8493 NewInstr->addOperand(Op);
8494 }
8495
8496 // Remove any references to SCC. Vector instructions can't read from it, and
8497 // we're just about to add the implicit use / defs of VCC; we don't want
8498 // both.
8499 for (MachineOperand &Op : Inst.implicit_operands()) {
8500 if (Op.getReg() == AMDGPU::SCC) {
8501 // Only propagate through live-def of SCC.
8502 if (Op.isDef() && !Op.isDead())
8503 addSCCDefUsersToVALUWorklist(Op, SCCDefInst&: Inst, Worklist);
8504 if (Op.isUse())
8505 addSCCDefsToVALUWorklist(SCCUseInst: NewInstr, Worklist);
8506 }
8507 }
8508 Inst.eraseFromParent();
8509 Register NewDstReg;
8510 if (NewInstr->getOperand(i: 0).isReg() && NewInstr->getOperand(i: 0).isDef()) {
8511 Register DstReg = NewInstr->getOperand(i: 0).getReg();
8512 assert(DstReg.isVirtual());
8513 // Update the destination register class.
8514 const TargetRegisterClass *NewDstRC = getDestEquivalentVGPRClass(Inst: *NewInstr);
8515 assert(NewDstRC);
8516 NewDstReg = MRI.createVirtualRegister(RegClass: NewDstRC);
8517 MRI.replaceRegWith(FromReg: DstReg, ToReg: NewDstReg);
8518 }
8519 fixImplicitOperands(MI&: *NewInstr);
8520
8521 legalizeOperandsVALUt16(MI&: *NewInstr, MRI);
8522
8523 // Legalize the operands
8524 legalizeOperands(MI&: *NewInstr, MDT);
8525 if (NewDstReg)
8526 addUsersToMoveToVALUWorklist(Reg: NewDstReg, MRI, Worklist);
8527}
8528
8529// Add/sub require special handling to deal with carry outs.
8530std::pair<bool, MachineBasicBlock *>
8531SIInstrInfo::moveScalarAddSub(SIInstrWorklist &Worklist, MachineInstr &Inst,
8532 MachineDominatorTree *MDT) const {
8533 if (ST.hasAddNoCarryInsts()) {
8534 // Assume there is no user of scc since we don't select this in that case.
8535 // Since scc isn't used, it doesn't really matter if the i32 or u32 variant
8536 // is used.
8537
8538 MachineBasicBlock &MBB = *Inst.getParent();
8539 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
8540
8541 Register OldDstReg = Inst.getOperand(i: 0).getReg();
8542 Register ResultReg = MRI.createVirtualRegister(RegClass: &AMDGPU::VGPR_32RegClass);
8543
8544 unsigned Opc = Inst.getOpcode();
8545 assert(Opc == AMDGPU::S_ADD_I32 || Opc == AMDGPU::S_SUB_I32);
8546
8547 unsigned NewOpc = Opc == AMDGPU::S_ADD_I32 ?
8548 AMDGPU::V_ADD_U32_e64 : AMDGPU::V_SUB_U32_e64;
8549
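// Rewrite the instruction in place: drop the SCC def (unused, per the
// assumption above), switch to the no-carry VALU opcode, and append the
// clamp bit.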
8550 assert(Inst.getOperand(3).getReg() == AMDGPU::SCC);
8551 Inst.removeOperand(OpNo: 3);
8552
8553 Inst.setDesc(get(Opcode: NewOpc));
8554 Inst.addOperand(Op: MachineOperand::CreateImm(Val: 0)); // clamp bit
8555 Inst.addImplicitDefUseOperands(MF&: *MBB.getParent());
8556 MRI.replaceRegWith(FromReg: OldDstReg, ToReg: ResultReg);
8557 MachineBasicBlock *NewBB = legalizeOperands(MI&: Inst, MDT);
8558
8559 addUsersToMoveToVALUWorklist(Reg: ResultReg, MRI, Worklist);
8560 return std::pair(true, NewBB);
8561 }
8562
8563 return std::pair(false, nullptr);
8564}
8565
8566void SIInstrInfo::lowerSelect(SIInstrWorklist &Worklist, MachineInstr &Inst,
8567 MachineDominatorTree *MDT) const {
8568
8569 MachineBasicBlock &MBB = *Inst.getParent();
8570 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
8571 MachineBasicBlock::iterator MII = Inst;
8572 const DebugLoc &DL = Inst.getDebugLoc();
8573
8574 MachineOperand &Dest = Inst.getOperand(i: 0);
8575 MachineOperand &Src0 = Inst.getOperand(i: 1);
8576 MachineOperand &Src1 = Inst.getOperand(i: 2);
8577 MachineOperand &Cond = Inst.getOperand(i: 3);
8578
8579 Register CondReg = Cond.getReg();
8580 bool IsSCC = (CondReg == AMDGPU::SCC);
8581
8582 // If this is a trivial select where the condition is effectively not SCC
8583 // (CondReg is a source of a copy to SCC), then the select is semantically
8584 // equivalent to copying CondReg. Hence, there is no need to create a
8585 // V_CNDMASK; we can just use CondReg and bail out.
8586 if (!IsSCC && Src0.isImm() && (Src0.getImm() == -1) && Src1.isImm() &&
8587 (Src1.getImm() == 0)) {
8588 MRI.replaceRegWith(FromReg: Dest.getReg(), ToReg: CondReg);
8589 return;
8590 }
8591
8592 Register NewCondReg = CondReg;
8593 if (IsSCC) {
8594 const TargetRegisterClass *TC = RI.getWaveMaskRegClass();
8595 NewCondReg = MRI.createVirtualRegister(RegClass: TC);
8596
8597 // Now look for the closest SCC def; if it is a copy, replace CondReg
8598 // with the COPY's source register.
8599 bool CopyFound = false;
8600 for (MachineInstr &CandI :
8601 make_range(x: std::next(x: MachineBasicBlock::reverse_iterator(Inst)),
8602 y: Inst.getParent()->rend())) {
8603 if (CandI.findRegisterDefOperandIdx(Reg: AMDGPU::SCC, TRI: &RI, isDead: false, Overlap: false) !=
8604 -1) {
8605 if (CandI.isCopy() && CandI.getOperand(i: 0).getReg() == AMDGPU::SCC) {
8606 BuildMI(BB&: MBB, I: MII, MIMD: DL, MCID: get(Opcode: AMDGPU::COPY), DestReg: NewCondReg)
8607 .addReg(RegNo: CandI.getOperand(i: 1).getReg());
8608 CopyFound = true;
8609 }
8610 break;
8611 }
8612 }
8613 if (!CopyFound) {
8614 // The SCC def is not a copy.
8615 // Insert a trivial select instead of creating a copy, because a copy from
8616 // SCC would semantically mean just copying a single bit, but we may need
8617 // the result to be a vector condition mask, which must be preserved.
8618 unsigned Opcode =
8619 ST.isWave64() ? AMDGPU::S_CSELECT_B64 : AMDGPU::S_CSELECT_B32;
8620 auto NewSelect =
8621 BuildMI(BB&: MBB, I: MII, MIMD: DL, MCID: get(Opcode), DestReg: NewCondReg).addImm(Val: -1).addImm(Val: 0);
8622 NewSelect->getOperand(i: 3).setIsUndef(Cond.isUndef());
8623 }
8624 }
8625
8626 Register NewDestReg = MRI.createVirtualRegister(
8627 RegClass: RI.getEquivalentVGPRClass(SRC: MRI.getRegClass(Reg: Dest.getReg())));
8628 MachineInstr *NewInst;
8629 if (Inst.getOpcode() == AMDGPU::S_CSELECT_B32) {
8630 NewInst = BuildMI(BB&: MBB, I: MII, MIMD: DL, MCID: get(Opcode: AMDGPU::V_CNDMASK_B32_e64), DestReg: NewDestReg)
8631 .addImm(Val: 0)
8632 .add(MO: Src1) // False
8633 .addImm(Val: 0)
8634 .add(MO: Src0) // True
8635 .addReg(RegNo: NewCondReg);
8636 } else {
8637 NewInst =
8638 BuildMI(BB&: MBB, I: MII, MIMD: DL, MCID: get(Opcode: AMDGPU::V_CNDMASK_B64_PSEUDO), DestReg: NewDestReg)
8639 .add(MO: Src1) // False
8640 .add(MO: Src0) // True
8641 .addReg(RegNo: NewCondReg);
8642 }
8643 MRI.replaceRegWith(FromReg: Dest.getReg(), ToReg: NewDestReg);
8644 legalizeOperands(MI&: *NewInst, MDT);
8645 addUsersToMoveToVALUWorklist(Reg: NewDestReg, MRI, Worklist);
8646}
8647
8648void SIInstrInfo::lowerScalarAbs(SIInstrWorklist &Worklist,
8649 MachineInstr &Inst) const {
8650 MachineBasicBlock &MBB = *Inst.getParent();
8651 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
8652 MachineBasicBlock::iterator MII = Inst;
8653 const DebugLoc &DL = Inst.getDebugLoc();
8654
8655 MachineOperand &Dest = Inst.getOperand(i: 0);
8656 MachineOperand &Src = Inst.getOperand(i: 1);
8657 Register TmpReg = MRI.createVirtualRegister(RegClass: &AMDGPU::VGPR_32RegClass);
8658 Register ResultReg = MRI.createVirtualRegister(RegClass: &AMDGPU::VGPR_32RegClass);
8659
8660 unsigned SubOp = ST.hasAddNoCarryInsts() ? AMDGPU::V_SUB_U32_e32
8661 : AMDGPU::V_SUB_CO_U32_e32;
8662
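// Expand as abs(x) = max(x, 0 - x):
//   v_sub_{co_}u32 tmp, 0, x
//   v_max_i32      dst, x, tmp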
8663 BuildMI(BB&: MBB, I: MII, MIMD: DL, MCID: get(Opcode: SubOp), DestReg: TmpReg)
8664 .addImm(Val: 0)
8665 .addReg(RegNo: Src.getReg());
8666
8667 BuildMI(BB&: MBB, I: MII, MIMD: DL, MCID: get(Opcode: AMDGPU::V_MAX_I32_e64), DestReg: ResultReg)
8668 .addReg(RegNo: Src.getReg())
8669 .addReg(RegNo: TmpReg);
8670
8671 MRI.replaceRegWith(FromReg: Dest.getReg(), ToReg: ResultReg);
8672 addUsersToMoveToVALUWorklist(Reg: ResultReg, MRI, Worklist);
8673}
8674
8675void SIInstrInfo::lowerScalarAbsDiff(SIInstrWorklist &Worklist,
8676 MachineInstr &Inst) const {
8677 MachineBasicBlock &MBB = *Inst.getParent();
8678 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
8679 MachineBasicBlock::iterator MII = Inst;
8680 const DebugLoc &DL = Inst.getDebugLoc();
8681
8682 MachineOperand &Dest = Inst.getOperand(i: 0);
8683 MachineOperand &Src1 = Inst.getOperand(i: 1);
8684 MachineOperand &Src2 = Inst.getOperand(i: 2);
8685 Register SubResultReg = MRI.createVirtualRegister(RegClass: &AMDGPU::VGPR_32RegClass);
8686 Register TmpReg = MRI.createVirtualRegister(RegClass: &AMDGPU::VGPR_32RegClass);
8687 Register ResultReg = MRI.createVirtualRegister(RegClass: &AMDGPU::VGPR_32RegClass);
8688
8689 unsigned SubOp = ST.hasAddNoCarryInsts() ? AMDGPU::V_SUB_U32_e32
8690 : AMDGPU::V_SUB_CO_U32_e32;
8691
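// Expand as absdiff(x, y) = max(x - y, 0 - (x - y)).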
8692 BuildMI(BB&: MBB, I: MII, MIMD: DL, MCID: get(Opcode: SubOp), DestReg: SubResultReg)
8693 .addReg(RegNo: Src1.getReg())
8694 .addReg(RegNo: Src2.getReg());
8695
8696 BuildMI(BB&: MBB, I: MII, MIMD: DL, MCID: get(Opcode: SubOp), DestReg: TmpReg).addImm(Val: 0).addReg(RegNo: SubResultReg);
8697
8698 BuildMI(BB&: MBB, I: MII, MIMD: DL, MCID: get(Opcode: AMDGPU::V_MAX_I32_e64), DestReg: ResultReg)
8699 .addReg(RegNo: SubResultReg)
8700 .addReg(RegNo: TmpReg);
8701
8702 MRI.replaceRegWith(FromReg: Dest.getReg(), ToReg: ResultReg);
8703 addUsersToMoveToVALUWorklist(Reg: ResultReg, MRI, Worklist);
8704}
8705
8706void SIInstrInfo::lowerScalarXnor(SIInstrWorklist &Worklist,
8707 MachineInstr &Inst) const {
8708 MachineBasicBlock &MBB = *Inst.getParent();
8709 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
8710 MachineBasicBlock::iterator MII = Inst;
8711 const DebugLoc &DL = Inst.getDebugLoc();
8712
8713 MachineOperand &Dest = Inst.getOperand(i: 0);
8714 MachineOperand &Src0 = Inst.getOperand(i: 1);
8715 MachineOperand &Src1 = Inst.getOperand(i: 2);
8716
8717 if (ST.hasDLInsts()) {
8718 Register NewDest = MRI.createVirtualRegister(RegClass: &AMDGPU::VGPR_32RegClass);
8719 legalizeGenericOperand(InsertMBB&: MBB, I: MII, DstRC: &AMDGPU::VGPR_32RegClass, Op&: Src0, MRI, DL);
8720 legalizeGenericOperand(InsertMBB&: MBB, I: MII, DstRC: &AMDGPU::VGPR_32RegClass, Op&: Src1, MRI, DL);
8721
8722 BuildMI(BB&: MBB, I: MII, MIMD: DL, MCID: get(Opcode: AMDGPU::V_XNOR_B32_e64), DestReg: NewDest)
8723 .add(MO: Src0)
8724 .add(MO: Src1);
8725
8726 MRI.replaceRegWith(FromReg: Dest.getReg(), ToReg: NewDest);
8727 addUsersToMoveToVALUWorklist(Reg: NewDest, MRI, Worklist);
8728 } else {
8729 // Using the identity !(x ^ y) == (!x ^ y) == (x ^ !y), we can
8730 // invert either source and then perform the XOR. If either source is a
8731 // scalar register, then we can leave the inversion on the scalar unit to
8732 // achieve a better distribution of scalar and vector instructions.
8733 bool Src0IsSGPR = Src0.isReg() &&
8734 RI.isSGPRClass(RC: MRI.getRegClass(Reg: Src0.getReg()));
8735 bool Src1IsSGPR = Src1.isReg() &&
8736 RI.isSGPRClass(RC: MRI.getRegClass(Reg: Src1.getReg()));
8737 MachineInstr *Xor;
8738 Register Temp = MRI.createVirtualRegister(RegClass: &AMDGPU::SReg_32RegClass);
8739 Register NewDest = MRI.createVirtualRegister(RegClass: &AMDGPU::SReg_32RegClass);
8740
8741 // Build a pair of scalar instructions and add them to the work list.
8742 // The next iteration over the work list will lower these to the vector
8743 // unit as necessary.
8744 if (Src0IsSGPR) {
8745 BuildMI(BB&: MBB, I: MII, MIMD: DL, MCID: get(Opcode: AMDGPU::S_NOT_B32), DestReg: Temp).add(MO: Src0);
8746 Xor = BuildMI(BB&: MBB, I: MII, MIMD: DL, MCID: get(Opcode: AMDGPU::S_XOR_B32), DestReg: NewDest)
8747 .addReg(RegNo: Temp)
8748 .add(MO: Src1);
8749 } else if (Src1IsSGPR) {
8750 BuildMI(BB&: MBB, I: MII, MIMD: DL, MCID: get(Opcode: AMDGPU::S_NOT_B32), DestReg: Temp).add(MO: Src1);
8751 Xor = BuildMI(BB&: MBB, I: MII, MIMD: DL, MCID: get(Opcode: AMDGPU::S_XOR_B32), DestReg: NewDest)
8752 .add(MO: Src0)
8753 .addReg(RegNo: Temp);
8754 } else {
8755 Xor = BuildMI(BB&: MBB, I: MII, MIMD: DL, MCID: get(Opcode: AMDGPU::S_XOR_B32), DestReg: Temp)
8756 .add(MO: Src0)
8757 .add(MO: Src1);
8758 MachineInstr *Not =
8759 BuildMI(BB&: MBB, I: MII, MIMD: DL, MCID: get(Opcode: AMDGPU::S_NOT_B32), DestReg: NewDest).addReg(RegNo: Temp);
8760 Worklist.insert(MI: Not);
8761 }
8762
8763 MRI.replaceRegWith(FromReg: Dest.getReg(), ToReg: NewDest);
8764
8765 Worklist.insert(MI: Xor);
8766
8767 addUsersToMoveToVALUWorklist(Reg: NewDest, MRI, Worklist);
8768 }
8769}
8770
8771void SIInstrInfo::splitScalarNotBinop(SIInstrWorklist &Worklist,
8772 MachineInstr &Inst,
8773 unsigned Opcode) const {
8774 MachineBasicBlock &MBB = *Inst.getParent();
8775 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
8776 MachineBasicBlock::iterator MII = Inst;
8777 const DebugLoc &DL = Inst.getDebugLoc();
8778
8779 MachineOperand &Dest = Inst.getOperand(i: 0);
8780 MachineOperand &Src0 = Inst.getOperand(i: 1);
8781 MachineOperand &Src1 = Inst.getOperand(i: 2);
8782
8783 Register NewDest = MRI.createVirtualRegister(RegClass: &AMDGPU::SReg_32RegClass);
8784 Register Interm = MRI.createVirtualRegister(RegClass: &AMDGPU::SReg_32RegClass);
8785
8786 MachineInstr &Op = *BuildMI(BB&: MBB, I: MII, MIMD: DL, MCID: get(Opcode), DestReg: Interm)
8787 .add(MO: Src0)
8788 .add(MO: Src1);
8789
8790 MachineInstr &Not = *BuildMI(BB&: MBB, I: MII, MIMD: DL, MCID: get(Opcode: AMDGPU::S_NOT_B32), DestReg: NewDest)
8791 .addReg(RegNo: Interm);
8792
8793 Worklist.insert(MI: &Op);
8794 Worklist.insert(MI: &Not);
8795
8796 MRI.replaceRegWith(FromReg: Dest.getReg(), ToReg: NewDest);
8797 addUsersToMoveToVALUWorklist(Reg: NewDest, MRI, Worklist);
8798}
8799
8800void SIInstrInfo::splitScalarBinOpN2(SIInstrWorklist &Worklist,
8801 MachineInstr &Inst,
8802 unsigned Opcode) const {
8803 MachineBasicBlock &MBB = *Inst.getParent();
8804 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
8805 MachineBasicBlock::iterator MII = Inst;
8806 const DebugLoc &DL = Inst.getDebugLoc();
8807
8808 MachineOperand &Dest = Inst.getOperand(i: 0);
8809 MachineOperand &Src0 = Inst.getOperand(i: 1);
8810 MachineOperand &Src1 = Inst.getOperand(i: 2);
8811
8812 Register NewDest = MRI.createVirtualRegister(RegClass: &AMDGPU::SReg_32_XM0RegClass);
8813 Register Interm = MRI.createVirtualRegister(RegClass: &AMDGPU::SReg_32_XM0RegClass);
8814
8815 MachineInstr &Not = *BuildMI(BB&: MBB, I: MII, MIMD: DL, MCID: get(Opcode: AMDGPU::S_NOT_B32), DestReg: Interm)
8816 .add(MO: Src1);
8817
8818 MachineInstr &Op = *BuildMI(BB&: MBB, I: MII, MIMD: DL, MCID: get(Opcode), DestReg: NewDest)
8819 .add(MO: Src0)
8820 .addReg(RegNo: Interm);
8821
8822 Worklist.insert(MI: &Not);
8823 Worklist.insert(MI: &Op);
8824
8825 MRI.replaceRegWith(FromReg: Dest.getReg(), ToReg: NewDest);
8826 addUsersToMoveToVALUWorklist(Reg: NewDest, MRI, Worklist);
8827}
8828
8829void SIInstrInfo::splitScalar64BitUnaryOp(SIInstrWorklist &Worklist,
8830 MachineInstr &Inst, unsigned Opcode,
8831 bool Swap) const {
8832 MachineBasicBlock &MBB = *Inst.getParent();
8833 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
8834
8835 MachineOperand &Dest = Inst.getOperand(i: 0);
8836 MachineOperand &Src0 = Inst.getOperand(i: 1);
8837 const DebugLoc &DL = Inst.getDebugLoc();
8838
8839 MachineBasicBlock::iterator MII = Inst;
8840
8841 const MCInstrDesc &InstDesc = get(Opcode);
8842 const TargetRegisterClass *Src0RC = Src0.isReg() ?
8843 MRI.getRegClass(Reg: Src0.getReg()) :
8844 &AMDGPU::SGPR_32RegClass;
8845
8846 const TargetRegisterClass *Src0SubRC =
8847 RI.getSubRegisterClass(Src0RC, AMDGPU::sub0);
8848
8849 MachineOperand SrcReg0Sub0 = buildExtractSubRegOrImm(MII, MRI, Op: Src0, SuperRC: Src0RC,
8850 SubIdx: AMDGPU::sub0, SubRC: Src0SubRC);
8851
8852 const TargetRegisterClass *DestRC = MRI.getRegClass(Reg: Dest.getReg());
8853 const TargetRegisterClass *NewDestRC = RI.getEquivalentVGPRClass(SRC: DestRC);
8854 const TargetRegisterClass *NewDestSubRC =
8855 RI.getSubRegisterClass(NewDestRC, AMDGPU::sub0);
8856
8857 Register DestSub0 = MRI.createVirtualRegister(RegClass: NewDestSubRC);
8858 MachineInstr &LoHalf = *BuildMI(BB&: MBB, I: MII, MIMD: DL, MCID: InstDesc, DestReg: DestSub0).add(MO: SrcReg0Sub0);
8859
8860 MachineOperand SrcReg0Sub1 = buildExtractSubRegOrImm(MII, MRI, Op: Src0, SuperRC: Src0RC,
8861 SubIdx: AMDGPU::sub1, SubRC: Src0SubRC);
8862
8863 Register DestSub1 = MRI.createVirtualRegister(RegClass: NewDestSubRC);
8864 MachineInstr &HiHalf = *BuildMI(BB&: MBB, I: MII, MIMD: DL, MCID: InstDesc, DestReg: DestSub1).add(MO: SrcReg0Sub1);
8865
8866 if (Swap)
8867 std::swap(a&: DestSub0, b&: DestSub1);
8868
8869 Register FullDestReg = MRI.createVirtualRegister(RegClass: NewDestRC);
8870 BuildMI(BB&: MBB, I: MII, MIMD: DL, MCID: get(Opcode: TargetOpcode::REG_SEQUENCE), DestReg: FullDestReg)
8871 .addReg(RegNo: DestSub0)
8872 .addImm(Val: AMDGPU::sub0)
8873 .addReg(RegNo: DestSub1)
8874 .addImm(Val: AMDGPU::sub1);
8875
8876 MRI.replaceRegWith(FromReg: Dest.getReg(), ToReg: FullDestReg);
8877
8878 Worklist.insert(MI: &LoHalf);
8879 Worklist.insert(MI: &HiHalf);
8880
  // We don't need to call legalizeOperands here because with a single
  // operand, src0 will accept any kind of input.
8883
8884 // Move all users of this moved value.
8885 addUsersToMoveToVALUWorklist(Reg: FullDestReg, MRI, Worklist);
8886}
8887
// There is no vector equivalent of s_mul_u64. For this reason, we need to
// split the s_mul_u64 into 32-bit vector multiplications.
8890void SIInstrInfo::splitScalarSMulU64(SIInstrWorklist &Worklist,
8891 MachineInstr &Inst,
8892 MachineDominatorTree *MDT) const {
8893 MachineBasicBlock &MBB = *Inst.getParent();
8894 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
8895
8896 Register FullDestReg = MRI.createVirtualRegister(RegClass: &AMDGPU::VReg_64RegClass);
8897 Register DestSub0 = MRI.createVirtualRegister(RegClass: &AMDGPU::VGPR_32RegClass);
8898 Register DestSub1 = MRI.createVirtualRegister(RegClass: &AMDGPU::VGPR_32RegClass);
8899
8900 MachineOperand &Dest = Inst.getOperand(i: 0);
8901 MachineOperand &Src0 = Inst.getOperand(i: 1);
8902 MachineOperand &Src1 = Inst.getOperand(i: 2);
8903 const DebugLoc &DL = Inst.getDebugLoc();
8904 MachineBasicBlock::iterator MII = Inst;
8905
8906 const TargetRegisterClass *Src0RC = MRI.getRegClass(Reg: Src0.getReg());
8907 const TargetRegisterClass *Src1RC = MRI.getRegClass(Reg: Src1.getReg());
8908 const TargetRegisterClass *Src0SubRC =
8909 RI.getSubRegisterClass(Src0RC, AMDGPU::sub0);
8910 if (RI.isSGPRClass(RC: Src0SubRC))
8911 Src0SubRC = RI.getEquivalentVGPRClass(SRC: Src0SubRC);
8912 const TargetRegisterClass *Src1SubRC =
8913 RI.getSubRegisterClass(Src1RC, AMDGPU::sub0);
8914 if (RI.isSGPRClass(RC: Src1SubRC))
8915 Src1SubRC = RI.getEquivalentVGPRClass(SRC: Src1SubRC);
8916
8917 // First, we extract the low 32-bit and high 32-bit values from each of the
8918 // operands.
8919 MachineOperand Op0L =
8920 buildExtractSubRegOrImm(MII, MRI, Op: Src0, SuperRC: Src0RC, SubIdx: AMDGPU::sub0, SubRC: Src0SubRC);
8921 MachineOperand Op1L =
8922 buildExtractSubRegOrImm(MII, MRI, Op: Src1, SuperRC: Src1RC, SubIdx: AMDGPU::sub0, SubRC: Src1SubRC);
8923 MachineOperand Op0H =
8924 buildExtractSubRegOrImm(MII, MRI, Op: Src0, SuperRC: Src0RC, SubIdx: AMDGPU::sub1, SubRC: Src0SubRC);
8925 MachineOperand Op1H =
8926 buildExtractSubRegOrImm(MII, MRI, Op: Src1, SuperRC: Src1RC, SubIdx: AMDGPU::sub1, SubRC: Src1SubRC);
8927
  // The multiplication is done as follows:
8929 //
8930 // Op1H Op1L
8931 // * Op0H Op0L
8932 // --------------------
8933 // Op1H*Op0L Op1L*Op0L
8934 // + Op1H*Op0H Op1L*Op0H
8935 // -----------------------------------------
8936 // (Op1H*Op0L + Op1L*Op0H + carry) Op1L*Op0L
8937 //
8938 // We drop Op1H*Op0H because the result of the multiplication is a 64-bit
8939 // value and that would overflow.
8940 // The low 32-bit value is Op1L*Op0L.
8941 // The high 32-bit value is Op1H*Op0L + Op1L*Op0H + carry (from Op1L*Op0L).
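  //
  // Schematically (register names are illustrative), the emitted sequence is:
  //   v_mul_lo_u32 v_lo,    Op1L, Op0L      ; low 32 bits
  //   v_mul_hi_u32 v_carry, Op1L, Op0L      ; carry into the high half
  //   v_mul_lo_u32 v_a,     Op1L, Op0H
  //   v_mul_lo_u32 v_b,     Op1H, Op0L
  //   v_add_u32    v_add,   v_a,  v_b
  //   v_add_u32    v_hi,    v_add, v_carry  ; high 32 bits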
8942
8943 Register Op1L_Op0H_Reg = MRI.createVirtualRegister(RegClass: &AMDGPU::VGPR_32RegClass);
8944 MachineInstr *Op1L_Op0H =
8945 BuildMI(BB&: MBB, I: MII, MIMD: DL, MCID: get(Opcode: AMDGPU::V_MUL_LO_U32_e64), DestReg: Op1L_Op0H_Reg)
8946 .add(MO: Op1L)
8947 .add(MO: Op0H);
8948
8949 Register Op1H_Op0L_Reg = MRI.createVirtualRegister(RegClass: &AMDGPU::VGPR_32RegClass);
8950 MachineInstr *Op1H_Op0L =
8951 BuildMI(BB&: MBB, I: MII, MIMD: DL, MCID: get(Opcode: AMDGPU::V_MUL_LO_U32_e64), DestReg: Op1H_Op0L_Reg)
8952 .add(MO: Op1H)
8953 .add(MO: Op0L);
8954
8955 Register CarryReg = MRI.createVirtualRegister(RegClass: &AMDGPU::VGPR_32RegClass);
8956 MachineInstr *Carry =
8957 BuildMI(BB&: MBB, I: MII, MIMD: DL, MCID: get(Opcode: AMDGPU::V_MUL_HI_U32_e64), DestReg: CarryReg)
8958 .add(MO: Op1L)
8959 .add(MO: Op0L);
8960
8961 MachineInstr *LoHalf =
8962 BuildMI(BB&: MBB, I: MII, MIMD: DL, MCID: get(Opcode: AMDGPU::V_MUL_LO_U32_e64), DestReg: DestSub0)
8963 .add(MO: Op1L)
8964 .add(MO: Op0L);
8965
8966 Register AddReg = MRI.createVirtualRegister(RegClass: &AMDGPU::VGPR_32RegClass);
8967 MachineInstr *Add = BuildMI(BB&: MBB, I: MII, MIMD: DL, MCID: get(Opcode: AMDGPU::V_ADD_U32_e32), DestReg: AddReg)
8968 .addReg(RegNo: Op1L_Op0H_Reg)
8969 .addReg(RegNo: Op1H_Op0L_Reg);
8970
8971 MachineInstr *HiHalf =
8972 BuildMI(BB&: MBB, I: MII, MIMD: DL, MCID: get(Opcode: AMDGPU::V_ADD_U32_e32), DestReg: DestSub1)
8973 .addReg(RegNo: AddReg)
8974 .addReg(RegNo: CarryReg);
8975
8976 BuildMI(BB&: MBB, I: MII, MIMD: DL, MCID: get(Opcode: TargetOpcode::REG_SEQUENCE), DestReg: FullDestReg)
8977 .addReg(RegNo: DestSub0)
8978 .addImm(Val: AMDGPU::sub0)
8979 .addReg(RegNo: DestSub1)
8980 .addImm(Val: AMDGPU::sub1);
8981
8982 MRI.replaceRegWith(FromReg: Dest.getReg(), ToReg: FullDestReg);
8983
8984 // Try to legalize the operands in case we need to swap the order to keep it
8985 // valid.
8986 legalizeOperands(MI&: *Op1L_Op0H, MDT);
8987 legalizeOperands(MI&: *Op1H_Op0L, MDT);
8988 legalizeOperands(MI&: *Carry, MDT);
8989 legalizeOperands(MI&: *LoHalf, MDT);
8990 legalizeOperands(MI&: *Add, MDT);
8991 legalizeOperands(MI&: *HiHalf, MDT);
8992
8993 // Move all users of this moved value.
8994 addUsersToMoveToVALUWorklist(Reg: FullDestReg, MRI, Worklist);
8995}
8996
// Lower S_MUL_U64_U32_PSEUDO/S_MUL_I64_I32_PSEUDO into two 32-bit vector
// multiplications.
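// Since both operands of these pseudos are 32-bit values extended to 64 bits,
// the full 64-bit product reduces to (schematically):
//   v_mul_hi_{u32|i32} dst.hi, src1.lo, src0.lo
//   v_mul_lo_u32       dst.lo, src1.lo, src0.lo
// where the hi opcode is chosen by the signedness of the pseudo.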
8999void SIInstrInfo::splitScalarSMulPseudo(SIInstrWorklist &Worklist,
9000 MachineInstr &Inst,
9001 MachineDominatorTree *MDT) const {
9002 MachineBasicBlock &MBB = *Inst.getParent();
9003 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
9004
9005 Register FullDestReg = MRI.createVirtualRegister(RegClass: &AMDGPU::VReg_64RegClass);
9006 Register DestSub0 = MRI.createVirtualRegister(RegClass: &AMDGPU::VGPR_32RegClass);
9007 Register DestSub1 = MRI.createVirtualRegister(RegClass: &AMDGPU::VGPR_32RegClass);
9008
9009 MachineOperand &Dest = Inst.getOperand(i: 0);
9010 MachineOperand &Src0 = Inst.getOperand(i: 1);
9011 MachineOperand &Src1 = Inst.getOperand(i: 2);
9012 const DebugLoc &DL = Inst.getDebugLoc();
9013 MachineBasicBlock::iterator MII = Inst;
9014
9015 const TargetRegisterClass *Src0RC = MRI.getRegClass(Reg: Src0.getReg());
9016 const TargetRegisterClass *Src1RC = MRI.getRegClass(Reg: Src1.getReg());
9017 const TargetRegisterClass *Src0SubRC =
9018 RI.getSubRegisterClass(Src0RC, AMDGPU::sub0);
9019 if (RI.isSGPRClass(RC: Src0SubRC))
9020 Src0SubRC = RI.getEquivalentVGPRClass(SRC: Src0SubRC);
9021 const TargetRegisterClass *Src1SubRC =
9022 RI.getSubRegisterClass(Src1RC, AMDGPU::sub0);
9023 if (RI.isSGPRClass(RC: Src1SubRC))
9024 Src1SubRC = RI.getEquivalentVGPRClass(SRC: Src1SubRC);
9025
9026 // First, we extract the low 32-bit and high 32-bit values from each of the
9027 // operands.
9028 MachineOperand Op0L =
9029 buildExtractSubRegOrImm(MII, MRI, Op: Src0, SuperRC: Src0RC, SubIdx: AMDGPU::sub0, SubRC: Src0SubRC);
9030 MachineOperand Op1L =
9031 buildExtractSubRegOrImm(MII, MRI, Op: Src1, SuperRC: Src1RC, SubIdx: AMDGPU::sub0, SubRC: Src1SubRC);
9032
9033 unsigned Opc = Inst.getOpcode();
9034 unsigned NewOpc = Opc == AMDGPU::S_MUL_U64_U32_PSEUDO
9035 ? AMDGPU::V_MUL_HI_U32_e64
9036 : AMDGPU::V_MUL_HI_I32_e64;
9037 MachineInstr *HiHalf =
9038 BuildMI(BB&: MBB, I: MII, MIMD: DL, MCID: get(Opcode: NewOpc), DestReg: DestSub1).add(MO: Op1L).add(MO: Op0L);
9039
9040 MachineInstr *LoHalf =
9041 BuildMI(BB&: MBB, I: MII, MIMD: DL, MCID: get(Opcode: AMDGPU::V_MUL_LO_U32_e64), DestReg: DestSub0)
9042 .add(MO: Op1L)
9043 .add(MO: Op0L);
9044
9045 BuildMI(BB&: MBB, I: MII, MIMD: DL, MCID: get(Opcode: TargetOpcode::REG_SEQUENCE), DestReg: FullDestReg)
9046 .addReg(RegNo: DestSub0)
9047 .addImm(Val: AMDGPU::sub0)
9048 .addReg(RegNo: DestSub1)
9049 .addImm(Val: AMDGPU::sub1);
9050
9051 MRI.replaceRegWith(FromReg: Dest.getReg(), ToReg: FullDestReg);
9052
9053 // Try to legalize the operands in case we need to swap the order to keep it
9054 // valid.
9055 legalizeOperands(MI&: *HiHalf, MDT);
9056 legalizeOperands(MI&: *LoHalf, MDT);
9057
9058 // Move all users of this moved value.
9059 addUsersToMoveToVALUWorklist(Reg: FullDestReg, MRI, Worklist);
9060}
9061
9062void SIInstrInfo::splitScalar64BitBinaryOp(SIInstrWorklist &Worklist,
9063 MachineInstr &Inst, unsigned Opcode,
9064 MachineDominatorTree *MDT) const {
9065 MachineBasicBlock &MBB = *Inst.getParent();
9066 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
9067
9068 MachineOperand &Dest = Inst.getOperand(i: 0);
9069 MachineOperand &Src0 = Inst.getOperand(i: 1);
9070 MachineOperand &Src1 = Inst.getOperand(i: 2);
9071 const DebugLoc &DL = Inst.getDebugLoc();
9072
9073 MachineBasicBlock::iterator MII = Inst;
9074
9075 const MCInstrDesc &InstDesc = get(Opcode);
9076 const TargetRegisterClass *Src0RC = Src0.isReg() ?
9077 MRI.getRegClass(Reg: Src0.getReg()) :
9078 &AMDGPU::SGPR_32RegClass;
9079
9080 const TargetRegisterClass *Src0SubRC =
9081 RI.getSubRegisterClass(Src0RC, AMDGPU::sub0);
9082 const TargetRegisterClass *Src1RC = Src1.isReg() ?
9083 MRI.getRegClass(Reg: Src1.getReg()) :
9084 &AMDGPU::SGPR_32RegClass;
9085
9086 const TargetRegisterClass *Src1SubRC =
9087 RI.getSubRegisterClass(Src1RC, AMDGPU::sub0);
9088
9089 MachineOperand SrcReg0Sub0 = buildExtractSubRegOrImm(MII, MRI, Op: Src0, SuperRC: Src0RC,
9090 SubIdx: AMDGPU::sub0, SubRC: Src0SubRC);
9091 MachineOperand SrcReg1Sub0 = buildExtractSubRegOrImm(MII, MRI, Op: Src1, SuperRC: Src1RC,
9092 SubIdx: AMDGPU::sub0, SubRC: Src1SubRC);
9093 MachineOperand SrcReg0Sub1 = buildExtractSubRegOrImm(MII, MRI, Op: Src0, SuperRC: Src0RC,
9094 SubIdx: AMDGPU::sub1, SubRC: Src0SubRC);
9095 MachineOperand SrcReg1Sub1 = buildExtractSubRegOrImm(MII, MRI, Op: Src1, SuperRC: Src1RC,
9096 SubIdx: AMDGPU::sub1, SubRC: Src1SubRC);
9097
9098 const TargetRegisterClass *DestRC = MRI.getRegClass(Reg: Dest.getReg());
9099 const TargetRegisterClass *NewDestRC = RI.getEquivalentVGPRClass(SRC: DestRC);
9100 const TargetRegisterClass *NewDestSubRC =
9101 RI.getSubRegisterClass(NewDestRC, AMDGPU::sub0);
9102
9103 Register DestSub0 = MRI.createVirtualRegister(RegClass: NewDestSubRC);
9104 MachineInstr &LoHalf = *BuildMI(BB&: MBB, I: MII, MIMD: DL, MCID: InstDesc, DestReg: DestSub0)
9105 .add(MO: SrcReg0Sub0)
9106 .add(MO: SrcReg1Sub0);
9107
9108 Register DestSub1 = MRI.createVirtualRegister(RegClass: NewDestSubRC);
9109 MachineInstr &HiHalf = *BuildMI(BB&: MBB, I: MII, MIMD: DL, MCID: InstDesc, DestReg: DestSub1)
9110 .add(MO: SrcReg0Sub1)
9111 .add(MO: SrcReg1Sub1);
9112
9113 Register FullDestReg = MRI.createVirtualRegister(RegClass: NewDestRC);
9114 BuildMI(BB&: MBB, I: MII, MIMD: DL, MCID: get(Opcode: TargetOpcode::REG_SEQUENCE), DestReg: FullDestReg)
9115 .addReg(RegNo: DestSub0)
9116 .addImm(Val: AMDGPU::sub0)
9117 .addReg(RegNo: DestSub1)
9118 .addImm(Val: AMDGPU::sub1);
9119
9120 MRI.replaceRegWith(FromReg: Dest.getReg(), ToReg: FullDestReg);
9121
9122 Worklist.insert(MI: &LoHalf);
9123 Worklist.insert(MI: &HiHalf);
9124
9125 // Move all users of this moved value.
9126 addUsersToMoveToVALUWorklist(Reg: FullDestReg, MRI, Worklist);
9127}
9128
9129void SIInstrInfo::splitScalar64BitXnor(SIInstrWorklist &Worklist,
9130 MachineInstr &Inst,
9131 MachineDominatorTree *MDT) const {
9132 MachineBasicBlock &MBB = *Inst.getParent();
9133 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
9134
9135 MachineOperand &Dest = Inst.getOperand(i: 0);
9136 MachineOperand &Src0 = Inst.getOperand(i: 1);
9137 MachineOperand &Src1 = Inst.getOperand(i: 2);
9138 const DebugLoc &DL = Inst.getDebugLoc();
9139
9140 MachineBasicBlock::iterator MII = Inst;
9141
9142 const TargetRegisterClass *DestRC = MRI.getRegClass(Reg: Dest.getReg());
9143
9144 Register Interm = MRI.createVirtualRegister(RegClass: &AMDGPU::SReg_64RegClass);
9145
9146 MachineOperand* Op0;
9147 MachineOperand* Op1;
9148
9149 if (Src0.isReg() && RI.isSGPRReg(MRI, Reg: Src0.getReg())) {
9150 Op0 = &Src0;
9151 Op1 = &Src1;
9152 } else {
9153 Op0 = &Src1;
9154 Op1 = &Src0;
9155 }
9156
9157 BuildMI(BB&: MBB, I: MII, MIMD: DL, MCID: get(Opcode: AMDGPU::S_NOT_B64), DestReg: Interm)
9158 .add(MO: *Op0);
9159
9160 Register NewDest = MRI.createVirtualRegister(RegClass: DestRC);
9161
9162 MachineInstr &Xor = *BuildMI(BB&: MBB, I: MII, MIMD: DL, MCID: get(Opcode: AMDGPU::S_XOR_B64), DestReg: NewDest)
9163 .addReg(RegNo: Interm)
9164 .add(MO: *Op1);
9165
9166 MRI.replaceRegWith(FromReg: Dest.getReg(), ToReg: NewDest);
9167
9168 Worklist.insert(MI: &Xor);
9169}
9170
9171void SIInstrInfo::splitScalar64BitBCNT(SIInstrWorklist &Worklist,
9172 MachineInstr &Inst) const {
9173 MachineBasicBlock &MBB = *Inst.getParent();
9174 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
9175
9176 MachineBasicBlock::iterator MII = Inst;
9177 const DebugLoc &DL = Inst.getDebugLoc();
9178
9179 MachineOperand &Dest = Inst.getOperand(i: 0);
9180 MachineOperand &Src = Inst.getOperand(i: 1);
9181
9182 const MCInstrDesc &InstDesc = get(Opcode: AMDGPU::V_BCNT_U32_B32_e64);
9183 const TargetRegisterClass *SrcRC = Src.isReg() ?
9184 MRI.getRegClass(Reg: Src.getReg()) :
9185 &AMDGPU::SGPR_32RegClass;
9186
9187 Register MidReg = MRI.createVirtualRegister(RegClass: &AMDGPU::VGPR_32RegClass);
9188 Register ResultReg = MRI.createVirtualRegister(RegClass: &AMDGPU::VGPR_32RegClass);
9189
9190 const TargetRegisterClass *SrcSubRC =
9191 RI.getSubRegisterClass(SrcRC, AMDGPU::sub0);
9192
9193 MachineOperand SrcRegSub0 = buildExtractSubRegOrImm(MII, MRI, Op: Src, SuperRC: SrcRC,
9194 SubIdx: AMDGPU::sub0, SubRC: SrcSubRC);
9195 MachineOperand SrcRegSub1 = buildExtractSubRegOrImm(MII, MRI, Op: Src, SuperRC: SrcRC,
9196 SubIdx: AMDGPU::sub1, SubRC: SrcSubRC);
9197
9198 BuildMI(BB&: MBB, I: MII, MIMD: DL, MCID: InstDesc, DestReg: MidReg).add(MO: SrcRegSub0).addImm(Val: 0);
9199
9200 BuildMI(BB&: MBB, I: MII, MIMD: DL, MCID: InstDesc, DestReg: ResultReg).add(MO: SrcRegSub1).addReg(RegNo: MidReg);
9201
9202 MRI.replaceRegWith(FromReg: Dest.getReg(), ToReg: ResultReg);
9203
9204 // We don't need to legalize operands here. src0 for either instruction can be
9205 // an SGPR, and the second input is unused or determined here.
9206 addUsersToMoveToVALUWorklist(Reg: ResultReg, MRI, Worklist);
9207}
9208
9209void SIInstrInfo::splitScalar64BitBFE(SIInstrWorklist &Worklist,
9210 MachineInstr &Inst) const {
9211 MachineBasicBlock &MBB = *Inst.getParent();
9212 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
9213 MachineBasicBlock::iterator MII = Inst;
9214 const DebugLoc &DL = Inst.getDebugLoc();
9215
9216 MachineOperand &Dest = Inst.getOperand(i: 0);
9217 uint32_t Imm = Inst.getOperand(i: 2).getImm();
9218 uint32_t Offset = Imm & 0x3f; // Extract bits [5:0].
9219 uint32_t BitWidth = (Imm & 0x7f0000) >> 16; // Extract bits [22:16].
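  // For example, a sext_inreg from i16 is encoded as Imm = 16 << 16
  // (0x100000): offset 0, width 16.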
9220
9221 (void) Offset;
9222
9223 // Only sext_inreg cases handled.
9224 assert(Inst.getOpcode() == AMDGPU::S_BFE_I64 && BitWidth <= 32 &&
9225 Offset == 0 && "Not implemented");
9226
9227 if (BitWidth < 32) {
9228 Register MidRegLo = MRI.createVirtualRegister(RegClass: &AMDGPU::VGPR_32RegClass);
9229 Register MidRegHi = MRI.createVirtualRegister(RegClass: &AMDGPU::VGPR_32RegClass);
9230 Register ResultReg = MRI.createVirtualRegister(RegClass: &AMDGPU::VReg_64RegClass);
9231
9232 BuildMI(BB&: MBB, I: MII, MIMD: DL, MCID: get(Opcode: AMDGPU::V_BFE_I32_e64), DestReg: MidRegLo)
9233 .addReg(RegNo: Inst.getOperand(i: 1).getReg(), Flags: {}, SubReg: AMDGPU::sub0)
9234 .addImm(Val: 0)
9235 .addImm(Val: BitWidth);
9236
9237 BuildMI(BB&: MBB, I: MII, MIMD: DL, MCID: get(Opcode: AMDGPU::V_ASHRREV_I32_e32), DestReg: MidRegHi)
9238 .addImm(Val: 31)
9239 .addReg(RegNo: MidRegLo);
9240
9241 BuildMI(BB&: MBB, I: MII, MIMD: DL, MCID: get(Opcode: TargetOpcode::REG_SEQUENCE), DestReg: ResultReg)
9242 .addReg(RegNo: MidRegLo)
9243 .addImm(Val: AMDGPU::sub0)
9244 .addReg(RegNo: MidRegHi)
9245 .addImm(Val: AMDGPU::sub1);
9246
9247 MRI.replaceRegWith(FromReg: Dest.getReg(), ToReg: ResultReg);
9248 addUsersToMoveToVALUWorklist(Reg: ResultReg, MRI, Worklist);
9249 return;
9250 }
9251
9252 MachineOperand &Src = Inst.getOperand(i: 1);
9253 Register TmpReg = MRI.createVirtualRegister(RegClass: &AMDGPU::VGPR_32RegClass);
9254 Register ResultReg = MRI.createVirtualRegister(RegClass: &AMDGPU::VReg_64RegClass);
9255
9256 BuildMI(BB&: MBB, I: MII, MIMD: DL, MCID: get(Opcode: AMDGPU::V_ASHRREV_I32_e64), DestReg: TmpReg)
9257 .addImm(Val: 31)
9258 .addReg(RegNo: Src.getReg(), Flags: {}, SubReg: AMDGPU::sub0);
9259
9260 BuildMI(BB&: MBB, I: MII, MIMD: DL, MCID: get(Opcode: TargetOpcode::REG_SEQUENCE), DestReg: ResultReg)
9261 .addReg(RegNo: Src.getReg(), Flags: {}, SubReg: AMDGPU::sub0)
9262 .addImm(Val: AMDGPU::sub0)
9263 .addReg(RegNo: TmpReg)
9264 .addImm(Val: AMDGPU::sub1);
9265
9266 MRI.replaceRegWith(FromReg: Dest.getReg(), ToReg: ResultReg);
9267 addUsersToMoveToVALUWorklist(Reg: ResultReg, MRI, Worklist);
9268}
9269
9270void SIInstrInfo::splitScalar64BitCountOp(SIInstrWorklist &Worklist,
9271 MachineInstr &Inst, unsigned Opcode,
9272 MachineDominatorTree *MDT) const {
  // (S_FLBIT_I32_B64 hi:lo) ->
  //   (umin (V_FFBH_U32_e32 hi), (uaddsat (V_FFBH_U32_e32 lo), 32))
  // (S_FF1_I32_B64 hi:lo) ->
  //   (umin (uaddsat (V_FFBL_B32_e32 hi), 32), (V_FFBL_B32_e32 lo))
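  //
  // This works because V_FFBH/V_FFBL return ~0u for a zero input and the
  // unsigned-saturating add clamps at 0xffffffff: when the half that decides
  // the result is nonzero, its count wins the umin; when it is zero, the
  // other half's count plus 32 is chosen; and when both halves are zero the
  // result is 0xffffffff (-1), matching the scalar instructions.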
9277
9278 MachineBasicBlock &MBB = *Inst.getParent();
9279 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
9280 MachineBasicBlock::iterator MII = Inst;
9281 const DebugLoc &DL = Inst.getDebugLoc();
9282
9283 MachineOperand &Dest = Inst.getOperand(i: 0);
9284 MachineOperand &Src = Inst.getOperand(i: 1);
9285
9286 const MCInstrDesc &InstDesc = get(Opcode);
9287
9288 bool IsCtlz = Opcode == AMDGPU::V_FFBH_U32_e32;
9289 unsigned OpcodeAdd = ST.hasAddNoCarryInsts() ? AMDGPU::V_ADD_U32_e64
9290 : AMDGPU::V_ADD_CO_U32_e32;
9291
9292 const TargetRegisterClass *SrcRC =
9293 Src.isReg() ? MRI.getRegClass(Reg: Src.getReg()) : &AMDGPU::SGPR_32RegClass;
9294 const TargetRegisterClass *SrcSubRC =
9295 RI.getSubRegisterClass(SrcRC, AMDGPU::sub0);
9296
9297 MachineOperand SrcRegSub0 =
9298 buildExtractSubRegOrImm(MII, MRI, Op: Src, SuperRC: SrcRC, SubIdx: AMDGPU::sub0, SubRC: SrcSubRC);
9299 MachineOperand SrcRegSub1 =
9300 buildExtractSubRegOrImm(MII, MRI, Op: Src, SuperRC: SrcRC, SubIdx: AMDGPU::sub1, SubRC: SrcSubRC);
9301
9302 Register MidReg1 = MRI.createVirtualRegister(RegClass: &AMDGPU::VGPR_32RegClass);
9303 Register MidReg2 = MRI.createVirtualRegister(RegClass: &AMDGPU::VGPR_32RegClass);
9304 Register MidReg3 = MRI.createVirtualRegister(RegClass: &AMDGPU::VGPR_32RegClass);
9305 Register MidReg4 = MRI.createVirtualRegister(RegClass: &AMDGPU::VGPR_32RegClass);
9306
9307 BuildMI(BB&: MBB, I: MII, MIMD: DL, MCID: InstDesc, DestReg: MidReg1).add(MO: SrcRegSub0);
9308
9309 BuildMI(BB&: MBB, I: MII, MIMD: DL, MCID: InstDesc, DestReg: MidReg2).add(MO: SrcRegSub1);
9310
9311 BuildMI(BB&: MBB, I: MII, MIMD: DL, MCID: get(Opcode: OpcodeAdd), DestReg: MidReg3)
9312 .addReg(RegNo: IsCtlz ? MidReg1 : MidReg2)
9313 .addImm(Val: 32)
9314 .addImm(Val: 1); // enable clamp
9315
9316 BuildMI(BB&: MBB, I: MII, MIMD: DL, MCID: get(Opcode: AMDGPU::V_MIN_U32_e64), DestReg: MidReg4)
9317 .addReg(RegNo: MidReg3)
9318 .addReg(RegNo: IsCtlz ? MidReg2 : MidReg1);
9319
9320 MRI.replaceRegWith(FromReg: Dest.getReg(), ToReg: MidReg4);
9321
9322 addUsersToMoveToVALUWorklist(Reg: MidReg4, MRI, Worklist);
9323}
9324
9325void SIInstrInfo::addUsersToMoveToVALUWorklist(
9326 Register DstReg, MachineRegisterInfo &MRI,
9327 SIInstrWorklist &Worklist) const {
9328 for (MachineOperand &MO : make_early_inc_range(Range: MRI.use_operands(Reg: DstReg))) {
9329 MachineInstr &UseMI = *MO.getParent();
9330
9331 unsigned OpNo = 0;
9332
9333 switch (UseMI.getOpcode()) {
9334 case AMDGPU::COPY:
9335 case AMDGPU::WQM:
9336 case AMDGPU::SOFT_WQM:
9337 case AMDGPU::STRICT_WWM:
9338 case AMDGPU::STRICT_WQM:
9339 case AMDGPU::REG_SEQUENCE:
9340 case AMDGPU::PHI:
9341 case AMDGPU::INSERT_SUBREG:
9342 break;
9343 default:
9344 OpNo = MO.getOperandNo();
9345 break;
9346 }
9347
9348 const TargetRegisterClass *OpRC = getOpRegClass(MI: UseMI, OpNo);
9349 MRI.constrainRegClass(Reg: DstReg, RC: OpRC);
9350
9351 if (!RI.hasVectorRegisters(RC: OpRC))
9352 Worklist.insert(MI: &UseMI);
9353 else
9354 // Legalization could change user list.
9355 legalizeOperandsVALUt16(MI&: UseMI, OpIdx: OpNo, MRI);
9356 }
9357}
9358
9359void SIInstrInfo::movePackToVALU(SIInstrWorklist &Worklist,
9360 MachineRegisterInfo &MRI,
9361 MachineInstr &Inst) const {
9362 Register ResultReg = MRI.createVirtualRegister(RegClass: &AMDGPU::VGPR_32RegClass);
9363 MachineBasicBlock *MBB = Inst.getParent();
9364 MachineOperand &Src0 = Inst.getOperand(i: 1);
9365 MachineOperand &Src1 = Inst.getOperand(i: 2);
9366 const DebugLoc &DL = Inst.getDebugLoc();
9367
9368 if (ST.useRealTrue16Insts()) {
9369 Register SrcReg0, SrcReg1;
9370 if (!Src0.isReg() || !RI.isVGPR(MRI, Reg: Src0.getReg())) {
9371 SrcReg0 = MRI.createVirtualRegister(RegClass: &AMDGPU::VGPR_32RegClass);
9372 BuildMI(BB&: *MBB, I&: Inst, MIMD: DL,
9373 MCID: get(Opcode: Src0.isImm() ? AMDGPU::V_MOV_B32_e32 : AMDGPU::COPY), DestReg: SrcReg0)
9374 .add(MO: Src0);
9375 } else {
9376 SrcReg0 = Src0.getReg();
9377 }
9378
9379 if (!Src1.isReg() || !RI.isVGPR(MRI, Reg: Src1.getReg())) {
9380 SrcReg1 = MRI.createVirtualRegister(RegClass: &AMDGPU::VGPR_32RegClass);
9381 BuildMI(BB&: *MBB, I&: Inst, MIMD: DL,
9382 MCID: get(Opcode: Src1.isImm() ? AMDGPU::V_MOV_B32_e32 : AMDGPU::COPY), DestReg: SrcReg1)
9383 .add(MO: Src1);
9384 } else {
9385 SrcReg1 = Src1.getReg();
9386 }
9387
9388 bool isSrc0Reg16 = MRI.constrainRegClass(Reg: SrcReg0, RC: &AMDGPU::VGPR_16RegClass);
9389 bool isSrc1Reg16 = MRI.constrainRegClass(Reg: SrcReg1, RC: &AMDGPU::VGPR_16RegClass);
9390
9391 auto NewMI = BuildMI(BB&: *MBB, I&: Inst, MIMD: DL, MCID: get(Opcode: AMDGPU::REG_SEQUENCE), DestReg: ResultReg);
9392 switch (Inst.getOpcode()) {
9393 case AMDGPU::S_PACK_LL_B32_B16:
9394 NewMI
9395 .addReg(RegNo: SrcReg0, Flags: {},
9396 SubReg: isSrc0Reg16 ? AMDGPU::NoSubRegister : AMDGPU::lo16)
9397 .addImm(Val: AMDGPU::lo16)
9398 .addReg(RegNo: SrcReg1, Flags: {},
9399 SubReg: isSrc1Reg16 ? AMDGPU::NoSubRegister : AMDGPU::lo16)
9400 .addImm(Val: AMDGPU::hi16);
9401 break;
9402 case AMDGPU::S_PACK_LH_B32_B16:
9403 NewMI
9404 .addReg(RegNo: SrcReg0, Flags: {},
9405 SubReg: isSrc0Reg16 ? AMDGPU::NoSubRegister : AMDGPU::lo16)
9406 .addImm(Val: AMDGPU::lo16)
9407 .addReg(RegNo: SrcReg1, Flags: {}, SubReg: AMDGPU::hi16)
9408 .addImm(Val: AMDGPU::hi16);
9409 break;
9410 case AMDGPU::S_PACK_HL_B32_B16:
9411 NewMI.addReg(RegNo: SrcReg0, Flags: {}, SubReg: AMDGPU::hi16)
9412 .addImm(Val: AMDGPU::lo16)
9413 .addReg(RegNo: SrcReg1, Flags: {},
9414 SubReg: isSrc1Reg16 ? AMDGPU::NoSubRegister : AMDGPU::lo16)
9415 .addImm(Val: AMDGPU::hi16);
9416 break;
9417 case AMDGPU::S_PACK_HH_B32_B16:
9418 NewMI.addReg(RegNo: SrcReg0, Flags: {}, SubReg: AMDGPU::hi16)
9419 .addImm(Val: AMDGPU::lo16)
9420 .addReg(RegNo: SrcReg1, Flags: {}, SubReg: AMDGPU::hi16)
9421 .addImm(Val: AMDGPU::hi16);
9422 break;
9423 default:
9424 llvm_unreachable("unhandled s_pack_* instruction");
9425 }
9426
9427 MachineOperand &Dest = Inst.getOperand(i: 0);
9428 MRI.replaceRegWith(FromReg: Dest.getReg(), ToReg: ResultReg);
9429 addUsersToMoveToVALUWorklist(DstReg: ResultReg, MRI, Worklist);
9430 return;
9431 }
9432
9433 switch (Inst.getOpcode()) {
9434 case AMDGPU::S_PACK_LL_B32_B16: {
9435 Register ImmReg = MRI.createVirtualRegister(RegClass: &AMDGPU::VGPR_32RegClass);
9436 Register TmpReg = MRI.createVirtualRegister(RegClass: &AMDGPU::VGPR_32RegClass);
9437
9438 // FIXME: Can do a lot better if we know the high bits of src0 or src1 are
9439 // 0.
9440 BuildMI(BB&: *MBB, I&: Inst, MIMD: DL, MCID: get(Opcode: AMDGPU::V_MOV_B32_e32), DestReg: ImmReg)
9441 .addImm(Val: 0xffff);
9442
9443 BuildMI(BB&: *MBB, I&: Inst, MIMD: DL, MCID: get(Opcode: AMDGPU::V_AND_B32_e64), DestReg: TmpReg)
9444 .addReg(RegNo: ImmReg, Flags: RegState::Kill)
9445 .add(MO: Src0);
9446
9447 BuildMI(BB&: *MBB, I&: Inst, MIMD: DL, MCID: get(Opcode: AMDGPU::V_LSHL_OR_B32_e64), DestReg: ResultReg)
9448 .add(MO: Src1)
9449 .addImm(Val: 16)
9450 .addReg(RegNo: TmpReg, Flags: RegState::Kill);
9451 break;
9452 }
9453 case AMDGPU::S_PACK_LH_B32_B16: {
9454 Register ImmReg = MRI.createVirtualRegister(RegClass: &AMDGPU::VGPR_32RegClass);
9455 BuildMI(BB&: *MBB, I&: Inst, MIMD: DL, MCID: get(Opcode: AMDGPU::V_MOV_B32_e32), DestReg: ImmReg)
9456 .addImm(Val: 0xffff);
9457 BuildMI(BB&: *MBB, I&: Inst, MIMD: DL, MCID: get(Opcode: AMDGPU::V_BFI_B32_e64), DestReg: ResultReg)
9458 .addReg(RegNo: ImmReg, Flags: RegState::Kill)
9459 .add(MO: Src0)
9460 .add(MO: Src1);
9461 break;
9462 }
9463 case AMDGPU::S_PACK_HL_B32_B16: {
9464 Register TmpReg = MRI.createVirtualRegister(RegClass: &AMDGPU::VGPR_32RegClass);
9465 BuildMI(BB&: *MBB, I&: Inst, MIMD: DL, MCID: get(Opcode: AMDGPU::V_LSHRREV_B32_e64), DestReg: TmpReg)
9466 .addImm(Val: 16)
9467 .add(MO: Src0);
9468 BuildMI(BB&: *MBB, I&: Inst, MIMD: DL, MCID: get(Opcode: AMDGPU::V_LSHL_OR_B32_e64), DestReg: ResultReg)
9469 .add(MO: Src1)
9470 .addImm(Val: 16)
9471 .addReg(RegNo: TmpReg, Flags: RegState::Kill);
9472 break;
9473 }
9474 case AMDGPU::S_PACK_HH_B32_B16: {
9475 Register ImmReg = MRI.createVirtualRegister(RegClass: &AMDGPU::VGPR_32RegClass);
9476 Register TmpReg = MRI.createVirtualRegister(RegClass: &AMDGPU::VGPR_32RegClass);
9477 BuildMI(BB&: *MBB, I&: Inst, MIMD: DL, MCID: get(Opcode: AMDGPU::V_LSHRREV_B32_e64), DestReg: TmpReg)
9478 .addImm(Val: 16)
9479 .add(MO: Src0);
9480 BuildMI(BB&: *MBB, I&: Inst, MIMD: DL, MCID: get(Opcode: AMDGPU::V_MOV_B32_e32), DestReg: ImmReg)
9481 .addImm(Val: 0xffff0000);
9482 BuildMI(BB&: *MBB, I&: Inst, MIMD: DL, MCID: get(Opcode: AMDGPU::V_AND_OR_B32_e64), DestReg: ResultReg)
9483 .add(MO: Src1)
9484 .addReg(RegNo: ImmReg, Flags: RegState::Kill)
9485 .addReg(RegNo: TmpReg, Flags: RegState::Kill);
9486 break;
9487 }
9488 default:
9489 llvm_unreachable("unhandled s_pack_* instruction");
9490 }
9491
9492 MachineOperand &Dest = Inst.getOperand(i: 0);
9493 MRI.replaceRegWith(FromReg: Dest.getReg(), ToReg: ResultReg);
9494 addUsersToMoveToVALUWorklist(DstReg: ResultReg, MRI, Worklist);
9495}
9496
9497void SIInstrInfo::addSCCDefUsersToVALUWorklist(const MachineOperand &Op,
9498 MachineInstr &SCCDefInst,
9499 SIInstrWorklist &Worklist,
9500 Register NewCond) const {
9501
  // Ensure that the def instruction defines SCC, which is still live.
9503 assert(Op.isReg() && Op.getReg() == AMDGPU::SCC && Op.isDef() &&
9504 !Op.isDead() && Op.getParent() == &SCCDefInst);
9505 SmallVector<MachineInstr *, 4> CopyToDelete;
9506 // This assumes that all the users of SCC are in the same block
9507 // as the SCC def.
9508 for (MachineInstr &MI : // Skip the def inst itself.
9509 make_range(x: std::next(x: MachineBasicBlock::iterator(SCCDefInst)),
9510 y: SCCDefInst.getParent()->end())) {
9511 // Check if SCC is used first.
9512 int SCCIdx = MI.findRegisterUseOperandIdx(Reg: AMDGPU::SCC, TRI: &RI, isKill: false);
9513 if (SCCIdx != -1) {
9514 if (MI.isCopy()) {
9515 MachineRegisterInfo &MRI = MI.getMF()->getRegInfo();
9516 Register DestReg = MI.getOperand(i: 0).getReg();
9517
9518 MRI.replaceRegWith(FromReg: DestReg, ToReg: NewCond);
9519 CopyToDelete.push_back(Elt: &MI);
9520 } else {
9521
9522 if (NewCond.isValid())
9523 MI.getOperand(i: SCCIdx).setReg(NewCond);
9524
9525 Worklist.insert(MI: &MI);
9526 }
9527 }
9528 // Exit if we find another SCC def.
9529 if (MI.findRegisterDefOperandIdx(Reg: AMDGPU::SCC, TRI: &RI, isDead: false, Overlap: false) != -1)
9530 break;
9531 }
9532 for (auto &Copy : CopyToDelete)
9533 Copy->eraseFromParent();
9534}
9535
9536// Instructions that use SCC may be converted to VALU instructions. When that
9537// happens, the SCC register is changed to VCC_LO. The instruction that defines
9538// SCC must be changed to an instruction that defines VCC. This function makes
9539// sure that the instruction that defines SCC is added to the moveToVALU
9540// worklist.
9541void SIInstrInfo::addSCCDefsToVALUWorklist(MachineInstr *SCCUseInst,
9542 SIInstrWorklist &Worklist) const {
9543 // Look for a preceding instruction that either defines VCC or SCC. If VCC
9544 // then there is nothing to do because the defining instruction has been
9545 // converted to a VALU already. If SCC then that instruction needs to be
9546 // converted to a VALU.
9547 for (MachineInstr &MI :
9548 make_range(x: std::next(x: MachineBasicBlock::reverse_iterator(SCCUseInst)),
9549 y: SCCUseInst->getParent()->rend())) {
9550 if (MI.modifiesRegister(Reg: AMDGPU::VCC, TRI: &RI))
9551 break;
9552 if (MI.definesRegister(Reg: AMDGPU::SCC, TRI: &RI)) {
9553 Worklist.insert(MI: &MI);
9554 break;
9555 }
9556 }
9557}
9558
9559const TargetRegisterClass *SIInstrInfo::getDestEquivalentVGPRClass(
9560 const MachineInstr &Inst) const {
9561 const TargetRegisterClass *NewDstRC = getOpRegClass(MI: Inst, OpNo: 0);
9562
9563 switch (Inst.getOpcode()) {
9564 // For target instructions, getOpRegClass just returns the virtual register
9565 // class associated with the operand, so we need to find an equivalent VGPR
9566 // register class in order to move the instruction to the VALU.
9567 case AMDGPU::COPY:
9568 case AMDGPU::PHI:
9569 case AMDGPU::REG_SEQUENCE:
9570 case AMDGPU::INSERT_SUBREG:
9571 case AMDGPU::WQM:
9572 case AMDGPU::SOFT_WQM:
9573 case AMDGPU::STRICT_WWM:
9574 case AMDGPU::STRICT_WQM: {
9575 const TargetRegisterClass *SrcRC = getOpRegClass(MI: Inst, OpNo: 1);
9576 if (RI.isAGPRClass(RC: SrcRC)) {
9577 if (RI.isAGPRClass(RC: NewDstRC))
9578 return nullptr;
9579
9580 switch (Inst.getOpcode()) {
9581 case AMDGPU::PHI:
9582 case AMDGPU::REG_SEQUENCE:
9583 case AMDGPU::INSERT_SUBREG:
9584 NewDstRC = RI.getEquivalentAGPRClass(SRC: NewDstRC);
9585 break;
9586 default:
9587 NewDstRC = RI.getEquivalentVGPRClass(SRC: NewDstRC);
9588 }
9589
9590 if (!NewDstRC)
9591 return nullptr;
9592 } else {
9593 if (RI.isVGPRClass(RC: NewDstRC) || NewDstRC == &AMDGPU::VReg_1RegClass)
9594 return nullptr;
9595
9596 NewDstRC = RI.getEquivalentVGPRClass(SRC: NewDstRC);
9597 if (!NewDstRC)
9598 return nullptr;
9599 }
9600
9601 return NewDstRC;
9602 }
9603 default:
9604 return NewDstRC;
9605 }
9606}
9607
9608// Find the one SGPR operand we are allowed to use.
9609Register SIInstrInfo::findUsedSGPR(const MachineInstr &MI,
9610 int OpIndices[3]) const {
9611 const MCInstrDesc &Desc = MI.getDesc();
9612
9613 // Find the one SGPR operand we are allowed to use.
9614 //
9615 // First we need to consider the instruction's operand requirements before
9616 // legalizing. Some operands are required to be SGPRs, such as implicit uses
9617 // of VCC, but we are still bound by the constant bus requirement to only use
9618 // one.
9619 //
9620 // If the operand's class is an SGPR, we can never move it.
9621
9622 Register SGPRReg = findImplicitSGPRRead(MI);
9623 if (SGPRReg)
9624 return SGPRReg;
9625
9626 Register UsedSGPRs[3] = {Register()};
9627 const MachineRegisterInfo &MRI = MI.getMF()->getRegInfo();
9628
9629 for (unsigned i = 0; i < 3; ++i) {
9630 int Idx = OpIndices[i];
9631 if (Idx == -1)
9632 break;
9633
9634 const MachineOperand &MO = MI.getOperand(i: Idx);
9635 if (!MO.isReg())
9636 continue;
9637
9638 // Is this operand statically required to be an SGPR based on the operand
9639 // constraints?
9640 const TargetRegisterClass *OpRC =
9641 RI.getRegClass(i: getOpRegClassID(OpInfo: Desc.operands()[Idx]));
9642 bool IsRequiredSGPR = RI.isSGPRClass(RC: OpRC);
9643 if (IsRequiredSGPR)
9644 return MO.getReg();
9645
    // If this could be a VGPR or an SGPR, check the dynamic register class.
9647 Register Reg = MO.getReg();
9648 const TargetRegisterClass *RegRC = MRI.getRegClass(Reg);
9649 if (RI.isSGPRClass(RC: RegRC))
9650 UsedSGPRs[i] = Reg;
9651 }
9652
9653 // We don't have a required SGPR operand, so we have a bit more freedom in
9654 // selecting operands to move.
9655
9656 // Try to select the most used SGPR. If an SGPR is equal to one of the
9657 // others, we choose that.
9658 //
9659 // e.g.
9660 // V_FMA_F32 v0, s0, s0, s0 -> No moves
9661 // V_FMA_F32 v0, s0, s1, s0 -> Move s1
9662
9663 // TODO: If some of the operands are 64-bit SGPRs and some 32, we should
9664 // prefer those.
9665
9666 if (UsedSGPRs[0]) {
9667 if (UsedSGPRs[0] == UsedSGPRs[1] || UsedSGPRs[0] == UsedSGPRs[2])
9668 SGPRReg = UsedSGPRs[0];
9669 }
9670
9671 if (!SGPRReg && UsedSGPRs[1]) {
9672 if (UsedSGPRs[1] == UsedSGPRs[2])
9673 SGPRReg = UsedSGPRs[1];
9674 }
9675
9676 return SGPRReg;
9677}
9678
9679MachineOperand *SIInstrInfo::getNamedOperand(MachineInstr &MI,
9680 AMDGPU::OpName OperandName) const {
9681 if (OperandName == AMDGPU::OpName::NUM_OPERAND_NAMES)
9682 return nullptr;
9683
9684 int Idx = AMDGPU::getNamedOperandIdx(Opcode: MI.getOpcode(), Name: OperandName);
9685 if (Idx == -1)
9686 return nullptr;
9687
9688 return &MI.getOperand(i: Idx);
9689}
9690
9691uint64_t SIInstrInfo::getDefaultRsrcDataFormat() const {
9692 if (ST.getGeneration() >= AMDGPUSubtarget::GFX10) {
9693 int64_t Format = ST.getGeneration() >= AMDGPUSubtarget::GFX11
9694 ? (int64_t)AMDGPU::UfmtGFX11::UFMT_32_FLOAT
9695 : (int64_t)AMDGPU::UfmtGFX10::UFMT_32_FLOAT;
9696 return (Format << 44) |
9697 (1ULL << 56) | // RESOURCE_LEVEL = 1
9698 (3ULL << 60); // OOB_SELECT = 3
9699 }
9700
9701 uint64_t RsrcDataFormat = AMDGPU::RSRC_DATA_FORMAT;
9702 if (ST.isAmdHsaOS()) {
9703 // Set ATC = 1. GFX9 doesn't have this bit.
9704 if (ST.getGeneration() <= AMDGPUSubtarget::VOLCANIC_ISLANDS)
9705 RsrcDataFormat |= (1ULL << 56);
9706
    // Set MTYPE = 2 (MTYPE_UC = uncached). GFX9 doesn't have this.
    // Note that it disables TC L2 and therefore decreases performance.
9709 if (ST.getGeneration() == AMDGPUSubtarget::VOLCANIC_ISLANDS)
9710 RsrcDataFormat |= (2ULL << 59);
9711 }
9712
9713 return RsrcDataFormat;
9714}
9715
9716uint64_t SIInstrInfo::getScratchRsrcWords23() const {
9717 uint64_t Rsrc23 = getDefaultRsrcDataFormat() |
9718 AMDGPU::RSRC_TID_ENABLE |
9719 0xffffffff; // Size;
9720
9721 // GFX9 doesn't have ELEMENT_SIZE.
9722 if (ST.getGeneration() <= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
9723 uint64_t EltSizeValue = Log2_32(Value: ST.getMaxPrivateElementSize(ForBufferRSrc: true)) - 1;
9724 Rsrc23 |= EltSizeValue << AMDGPU::RSRC_ELEMENT_SIZE_SHIFT;
9725 }
9726
  // IndexStride = 64 for wave64 or 32 for wave32, encoded as 3 or 2.
9728 uint64_t IndexStride = ST.isWave64() ? 3 : 2;
9729 Rsrc23 |= IndexStride << AMDGPU::RSRC_INDEX_STRIDE_SHIFT;
9730
9731 // If TID_ENABLE is set, DATA_FORMAT specifies stride bits [14:17].
9732 // Clear them unless we want a huge stride.
9733 if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS &&
9734 ST.getGeneration() <= AMDGPUSubtarget::GFX9)
9735 Rsrc23 &= ~AMDGPU::RSRC_DATA_FORMAT;
9736
9737 return Rsrc23;
9738}
9739
9740bool SIInstrInfo::isLowLatencyInstruction(const MachineInstr &MI) const {
9741 unsigned Opc = MI.getOpcode();
9742
9743 return isSMRD(Opcode: Opc);
9744}
9745
9746bool SIInstrInfo::isHighLatencyDef(int Opc) const {
9747 return get(Opcode: Opc).mayLoad() &&
9748 (isMUBUF(Opcode: Opc) || isMTBUF(Opcode: Opc) || isMIMG(Opcode: Opc) || isFLAT(Opcode: Opc));
9749}
9750
9751Register SIInstrInfo::isStackAccess(const MachineInstr &MI, int &FrameIndex,
9752 TypeSize &MemBytes) const {
9753 const MachineOperand *Addr = getNamedOperand(MI, OperandName: AMDGPU::OpName::vaddr);
9754 if (!Addr || !Addr->isFI())
9755 return Register();
9756
9757 assert(!MI.memoperands_empty() &&
9758 (*MI.memoperands_begin())->getAddrSpace() == AMDGPUAS::PRIVATE_ADDRESS);
9759
9760 FrameIndex = Addr->getIndex();
9761
9762 int VDataIdx =
9763 AMDGPU::getNamedOperandIdx(Opcode: MI.getOpcode(), Name: AMDGPU::OpName::vdata);
9764 MemBytes = TypeSize::getFixed(ExactSize: getOpSize(Opcode: MI.getOpcode(), OpNo: VDataIdx));
9765 return MI.getOperand(i: VDataIdx).getReg();
9766}
9767
9768Register SIInstrInfo::isSGPRStackAccess(const MachineInstr &MI, int &FrameIndex,
9769 TypeSize &MemBytes) const {
9770 const MachineOperand *Addr = getNamedOperand(MI, OperandName: AMDGPU::OpName::addr);
9771 assert(Addr && Addr->isFI());
9772 FrameIndex = Addr->getIndex();
9773
9774 int DataIdx =
9775 AMDGPU::getNamedOperandIdx(Opcode: MI.getOpcode(), Name: AMDGPU::OpName::data);
9776 MemBytes = TypeSize::getFixed(ExactSize: getOpSize(Opcode: MI.getOpcode(), OpNo: DataIdx));
9777 return MI.getOperand(i: DataIdx).getReg();
9778}
9779
9780Register SIInstrInfo::isLoadFromStackSlot(const MachineInstr &MI,
9781 int &FrameIndex,
9782 TypeSize &MemBytes) const {
9783 if (!MI.mayLoad())
9784 return Register();
9785
9786 if (isMUBUF(MI) || isVGPRSpill(MI))
9787 return isStackAccess(MI, FrameIndex, MemBytes);
9788
9789 if (isSGPRSpill(MI))
9790 return isSGPRStackAccess(MI, FrameIndex, MemBytes);
9791
9792 return Register();
9793}
9794
9795Register SIInstrInfo::isStoreToStackSlot(const MachineInstr &MI,
9796 int &FrameIndex,
9797 TypeSize &MemBytes) const {
9798 if (!MI.mayStore())
9799 return Register();
9800
9801 if (isMUBUF(MI) || isVGPRSpill(MI))
9802 return isStackAccess(MI, FrameIndex, MemBytes);
9803
9804 if (isSGPRSpill(MI))
9805 return isSGPRStackAccess(MI, FrameIndex, MemBytes);
9806
9807 return Register();
9808}
9809
9810unsigned SIInstrInfo::getInstBundleSize(const MachineInstr &MI) const {
9811 unsigned Size = 0;
9812 MachineBasicBlock::const_instr_iterator I = MI.getIterator();
9813 MachineBasicBlock::const_instr_iterator E = MI.getParent()->instr_end();
9814 while (++I != E && I->isInsideBundle()) {
9815 assert(!I->isBundle() && "No nested bundle!");
9816 Size += getInstSizeInBytes(MI: *I);
9817 }
9818
9819 return Size;
9820}
9821
9822unsigned SIInstrInfo::getInstSizeInBytes(const MachineInstr &MI) const {
9823 unsigned Opc = MI.getOpcode();
9824 const MCInstrDesc &Desc = getMCOpcodeFromPseudo(Opcode: Opc);
9825 unsigned DescSize = Desc.getSize();
9826
9827 // If we have a definitive size, we can use it. Otherwise we need to inspect
9828 // the operands to know the size.
9829 if (isFixedSize(MI)) {
9830 unsigned Size = DescSize;
9831
    // If we hit the buggy offset, an extra nop will be inserted in MC, so
    // estimate the worst case.
9834 if (MI.isBranch() && ST.hasOffset3fBug())
9835 Size += 4;
9836
9837 return Size;
9838 }
9839
9840 // Instructions may have a 32-bit literal encoded after them. Check
9841 // operands that could ever be literals.
9842 if (isVALU(MI) || isSALU(MI)) {
9843 if (isDPP(MI))
9844 return DescSize;
9845 bool HasLiteral = false;
9846 unsigned LiteralSize = 4;
9847 for (int I = 0, E = MI.getNumExplicitOperands(); I != E; ++I) {
9848 const MachineOperand &Op = MI.getOperand(i: I);
9849 const MCOperandInfo &OpInfo = Desc.operands()[I];
9850 if (!Op.isReg() && !isInlineConstant(MO: Op, OpInfo)) {
9851 HasLiteral = true;
9852 if (ST.has64BitLiterals()) {
9853 switch (OpInfo.OperandType) {
9854 default:
9855 break;
9856 case AMDGPU::OPERAND_REG_IMM_FP64:
9857 if (!AMDGPU::isValid32BitLiteral(Val: Op.getImm(), IsFP64: true))
9858 LiteralSize = 8;
9859 break;
9860 case AMDGPU::OPERAND_REG_IMM_INT64:
        // A 32-bit literal is only valid when the value fits in BOTH the
        // signed and unsigned 32-bit ranges, i.e. [0, 2^31-1], matching the
        // MC code emitter's getLit64Encoding logic. Because we cannot tell
        // the signedness of the literal here, we must be conservative and
        // assume that values outside this range require a 64-bit literal
        // encoding (8 bytes).
9867 if (!Op.isImm() || !isInt<32>(x: Op.getImm()) ||
9868 !isUInt<32>(x: Op.getImm()))
9869 LiteralSize = 8;
9870 break;
9871 }
9872 }
9873 break;
9874 }
9875 }
9876 return HasLiteral ? DescSize + LiteralSize : DescSize;
9877 }
9878
9879 // Check whether we have extra NSA words.
9880 if (isMIMG(MI)) {
9881 int VAddr0Idx = AMDGPU::getNamedOperandIdx(Opcode: Opc, Name: AMDGPU::OpName::vaddr0);
9882 if (VAddr0Idx < 0)
9883 return 8;
9884
9885 int RSrcIdx = AMDGPU::getNamedOperandIdx(Opcode: Opc, Name: AMDGPU::OpName::srsrc);
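    // RSrcIdx - VAddr0Idx gives the number of vaddr operands. The base
    // encoding is 8 bytes, and each extra NSA dword packs up to four
    // additional addresses beyond vaddr0; e.g. six vaddr operands give
    // 8 + 4 * ((6 + 2) / 4) = 16 bytes.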
9886 return 8 + 4 * ((RSrcIdx - VAddr0Idx + 2) / 4);
9887 }
9888
9889 switch (Opc) {
9890 case TargetOpcode::BUNDLE:
9891 return getInstBundleSize(MI);
9892 case TargetOpcode::INLINEASM:
9893 case TargetOpcode::INLINEASM_BR: {
9894 const MachineFunction *MF = MI.getMF();
9895 const char *AsmStr = MI.getOperand(i: 0).getSymbolName();
9896 return getInlineAsmLength(Str: AsmStr, MAI: *MF->getTarget().getMCAsmInfo(), STI: &ST);
9897 }
9898 default:
9899 if (MI.isMetaInstruction())
9900 return 0;
9901
    // If this is a D16 pseudo instruction, get the correct MC code size.
9903 const auto *D16Info = AMDGPU::getT16D16Helper(T16Op: Opc);
9904 if (D16Info) {
      // Assume the d16_lo and d16_hi instructions are always the same size.
9906 unsigned LoInstOpcode = D16Info->LoOp;
9907 const MCInstrDesc &Desc = getMCOpcodeFromPseudo(Opcode: LoInstOpcode);
9908 DescSize = Desc.getSize();
9909 }
9910
    // If this is an FMA mix pseudo instruction, get the correct MC code size.
9912 if (Opc == AMDGPU::V_FMA_MIX_F16_t16 || Opc == AMDGPU::V_FMA_MIX_BF16_t16) {
9913 // All potential lowerings are the same size; arbitrarily pick one.
9914 const MCInstrDesc &Desc = getMCOpcodeFromPseudo(Opcode: AMDGPU::V_FMA_MIXLO_F16);
9915 DescSize = Desc.getSize();
9916 }
9917
9918 return DescSize;
9919 }
9920}
9921
9922bool SIInstrInfo::mayAccessFlatAddressSpace(const MachineInstr &MI) const {
9923 if (!isFLAT(MI))
9924 return false;
9925
9926 if (MI.memoperands_empty())
9927 return true;
9928
9929 for (const MachineMemOperand *MMO : MI.memoperands()) {
9930 if (MMO->getAddrSpace() == AMDGPUAS::FLAT_ADDRESS)
9931 return true;
9932 }
9933 return false;
9934}
9935
9936ArrayRef<std::pair<int, const char *>>
9937SIInstrInfo::getSerializableTargetIndices() const {
9938 static const std::pair<int, const char *> TargetIndices[] = {
9939 {AMDGPU::TI_CONSTDATA_START, "amdgpu-constdata-start"},
9940 {AMDGPU::TI_SCRATCH_RSRC_DWORD0, "amdgpu-scratch-rsrc-dword0"},
9941 {AMDGPU::TI_SCRATCH_RSRC_DWORD1, "amdgpu-scratch-rsrc-dword1"},
9942 {AMDGPU::TI_SCRATCH_RSRC_DWORD2, "amdgpu-scratch-rsrc-dword2"},
9943 {AMDGPU::TI_SCRATCH_RSRC_DWORD3, "amdgpu-scratch-rsrc-dword3"}};
9944 return ArrayRef(TargetIndices);
9945}
9946
/// This is used by the post-RA scheduler (PostRASchedulerList.cpp). The
/// post-RA version of misched uses CreateTargetMIHazardRecognizer.
9949ScheduleHazardRecognizer *
9950SIInstrInfo::CreateTargetPostRAHazardRecognizer(const InstrItineraryData *II,
9951 const ScheduleDAG *DAG) const {
9952 return new GCNHazardRecognizer(DAG->MF);
9953}
9954
9955/// This is the hazard recognizer used at -O0 by the PostRAHazardRecognizer
9956/// pass.
9957ScheduleHazardRecognizer *
9958SIInstrInfo::CreateTargetPostRAHazardRecognizer(const MachineFunction &MF,
9959 MachineLoopInfo *MLI) const {
9960 return new GCNHazardRecognizer(MF, MLI);
9961}
9962
9963// Called during:
9964// - pre-RA scheduling and post-RA scheduling
9965ScheduleHazardRecognizer *
9966SIInstrInfo::CreateTargetMIHazardRecognizer(const InstrItineraryData *II,
9967 const ScheduleDAGMI *DAG) const {
9968 // Borrowed from Arm Target
9969 // We would like to restrict this hazard recognizer to only
9970 // post-RA scheduling; we can tell that we're post-RA because we don't
9971 // track VRegLiveness.
9972 if (!DAG->hasVRegLiveness())
9973 return new GCNHazardRecognizer(DAG->MF);
9974 return TargetInstrInfo::CreateTargetMIHazardRecognizer(II, DAG);
9975}
9976
9977std::pair<unsigned, unsigned>
9978SIInstrInfo::decomposeMachineOperandsTargetFlags(unsigned TF) const {
9979 return std::pair(TF & MO_MASK, TF & ~MO_MASK);
9980}
9981
9982ArrayRef<std::pair<unsigned, const char *>>
9983SIInstrInfo::getSerializableDirectMachineOperandTargetFlags() const {
9984 static const std::pair<unsigned, const char *> TargetFlags[] = {
9985 {MO_GOTPCREL, "amdgpu-gotprel"},
9986 {MO_GOTPCREL32_LO, "amdgpu-gotprel32-lo"},
9987 {MO_GOTPCREL32_HI, "amdgpu-gotprel32-hi"},
9988 {MO_GOTPCREL64, "amdgpu-gotprel64"},
9989 {MO_REL32_LO, "amdgpu-rel32-lo"},
9990 {MO_REL32_HI, "amdgpu-rel32-hi"},
9991 {MO_REL64, "amdgpu-rel64"},
9992 {MO_ABS32_LO, "amdgpu-abs32-lo"},
9993 {MO_ABS32_HI, "amdgpu-abs32-hi"},
9994 {MO_ABS64, "amdgpu-abs64"},
9995 };
9996
9997 return ArrayRef(TargetFlags);
9998}
9999
10000ArrayRef<std::pair<MachineMemOperand::Flags, const char *>>
10001SIInstrInfo::getSerializableMachineMemOperandTargetFlags() const {
10002 static const std::pair<MachineMemOperand::Flags, const char *> TargetFlags[] =
10003 {
10004 {MONoClobber, "amdgpu-noclobber"},
10005 {MOLastUse, "amdgpu-last-use"},
10006 {MOCooperative, "amdgpu-cooperative"},
10007 {MOThreadPrivate, "amdgpu-thread-private"},
10008 };
10009
10010 return ArrayRef(TargetFlags);
10011}
10012
10013unsigned SIInstrInfo::getLiveRangeSplitOpcode(Register SrcReg,
10014 const MachineFunction &MF) const {
10015 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
10016 assert(SrcReg.isVirtual());
10017 if (MFI->checkFlag(Reg: SrcReg, Flag: AMDGPU::VirtRegFlag::WWM_REG))
10018 return AMDGPU::WWM_COPY;
10019
10020 return AMDGPU::COPY;
10021}
10022
10023bool SIInstrInfo::canAddToBBProlog(const MachineInstr &MI) const {
10024 uint32_t Opcode = MI.getOpcode();
10025 // Check if it is SGPR spill or wwm-register spill Opcode.
10026 if (isSGPRSpill(Opcode) || isWWMRegSpillOpcode(Opcode))
10027 return true;
10028
10029 const MachineFunction *MF = MI.getMF();
10030 const MachineRegisterInfo &MRI = MF->getRegInfo();
10031 const SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
10032
  // See if this is a live-range split instruction inserted for an SGPR or
  // wwm-register. The implicit defs inserted for wwm-registers should also be
  // included as they can appear at the beginning of a basic block.
10036 bool IsLRSplitInst = MI.getFlag(Flag: MachineInstr::LRSplit);
10037 if (!IsLRSplitInst && Opcode != AMDGPU::IMPLICIT_DEF)
10038 return false;
10039
10040 Register Reg = MI.getOperand(i: 0).getReg();
10041 if (RI.isSGPRClass(RC: RI.getRegClassForReg(MRI, Reg)))
10042 return IsLRSplitInst;
10043
10044 return MFI->isWWMReg(Reg);
10045}
10046
10047bool SIInstrInfo::isBasicBlockPrologue(const MachineInstr &MI,
10048 Register Reg) const {
10049 // We need to handle instructions which may be inserted during register
10050 // allocation to handle the prolog. The initial prolog instruction may have
  // been separated from the start of the block by spills and copies needed
  // by the prolog. However, the insertions for scalar registers can
10053 // always be placed at the BB top as they are independent of the exec mask
10054 // value.
10055 bool IsNullOrVectorRegister = true;
10056 if (Reg) {
10057 const MachineFunction *MF = MI.getMF();
10058 const MachineRegisterInfo &MRI = MF->getRegInfo();
10059 IsNullOrVectorRegister = !RI.isSGPRClass(RC: RI.getRegClassForReg(MRI, Reg));
10060 }
10061
10062 return IsNullOrVectorRegister &&
10063 (canAddToBBProlog(MI) ||
10064 (!MI.isTerminator() && MI.getOpcode() != AMDGPU::COPY &&
10065 MI.modifiesRegister(Reg: AMDGPU::EXEC, TRI: &RI)));
10066}
10067
10068MachineInstrBuilder
10069SIInstrInfo::getAddNoCarry(MachineBasicBlock &MBB,
10070 MachineBasicBlock::iterator I,
10071 const DebugLoc &DL,
10072 Register DestReg) const {
10073 if (ST.hasAddNoCarryInsts())
10074 return BuildMI(BB&: MBB, I, MIMD: DL, MCID: get(Opcode: AMDGPU::V_ADD_U32_e64), DestReg);
10075
10076 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
10077 Register UnusedCarry = MRI.createVirtualRegister(RegClass: RI.getBoolRC());
10078 MRI.setRegAllocationHint(VReg: UnusedCarry, Type: 0, PrefReg: RI.getVCC());
10079
10080 return BuildMI(BB&: MBB, I, MIMD: DL, MCID: get(Opcode: AMDGPU::V_ADD_CO_U32_e64), DestReg)
10081 .addReg(RegNo: UnusedCarry, Flags: RegState::Define | RegState::Dead);
10082}
10083
10084MachineInstrBuilder SIInstrInfo::getAddNoCarry(MachineBasicBlock &MBB,
10085 MachineBasicBlock::iterator I,
10086 const DebugLoc &DL,
10087 Register DestReg,
10088 RegScavenger &RS) const {
10089 if (ST.hasAddNoCarryInsts())
10090 return BuildMI(BB&: MBB, I, MIMD: DL, MCID: get(Opcode: AMDGPU::V_ADD_U32_e32), DestReg);
10091
10092 // If available, prefer to use vcc.
10093 Register UnusedCarry = !RS.isRegUsed(Reg: AMDGPU::VCC)
10094 ? Register(RI.getVCC())
10095 : RS.scavengeRegisterBackwards(
10096 RC: *RI.getBoolRC(), To: I, /* RestoreAfter */ false,
10097 SPAdj: 0, /* AllowSpill */ false);
10098
10099 // TODO: Users need to deal with this.
10100 if (!UnusedCarry.isValid())
10101 return MachineInstrBuilder();
10102
10103 return BuildMI(BB&: MBB, I, MIMD: DL, MCID: get(Opcode: AMDGPU::V_ADD_CO_U32_e64), DestReg)
10104 .addReg(RegNo: UnusedCarry, Flags: RegState::Define | RegState::Dead);
10105}
10106
10107bool SIInstrInfo::isKillTerminator(unsigned Opcode) {
10108 switch (Opcode) {
10109 case AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR:
10110 case AMDGPU::SI_KILL_I1_TERMINATOR:
10111 return true;
10112 default:
10113 return false;
10114 }
10115}
10116
10117const MCInstrDesc &SIInstrInfo::getKillTerminatorFromPseudo(unsigned Opcode) const {
10118 switch (Opcode) {
10119 case AMDGPU::SI_KILL_F32_COND_IMM_PSEUDO:
10120 return get(Opcode: AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR);
10121 case AMDGPU::SI_KILL_I1_PSEUDO:
10122 return get(Opcode: AMDGPU::SI_KILL_I1_TERMINATOR);
10123 default:
10124 llvm_unreachable("invalid opcode, expected SI_KILL_*_PSEUDO");
10125 }
10126}
10127
10128bool SIInstrInfo::isLegalMUBUFImmOffset(unsigned Imm) const {
10129 return Imm <= getMaxMUBUFImmOffset(ST);
10130}
10131
10132unsigned SIInstrInfo::getMaxMUBUFImmOffset(const GCNSubtarget &ST) {
  // On GFX12 the field is a 24-bit signed byte offset, but only non-negative
  // values are legal, so only 23 bits are usable.
10134 const unsigned OffsetBits =
10135 ST.getGeneration() >= AMDGPUSubtarget::GFX12 ? 23 : 12;
10136 return (1 << OffsetBits) - 1;
10137}
10138
10139void SIInstrInfo::fixImplicitOperands(MachineInstr &MI) const {
10140 if (!ST.isWave32())
10141 return;
10142
10143 if (MI.isInlineAsm())
10144 return;
10145
10146 if (MI.getNumOperands() < MI.getNumExplicitOperands())
10147 return;
10148
10149 for (auto &Op : MI.implicit_operands()) {
10150 if (Op.isReg() && Op.getReg() == AMDGPU::VCC)
10151 Op.setReg(AMDGPU::VCC_LO);
10152 }
10153}
10154
10155bool SIInstrInfo::isBufferSMRD(const MachineInstr &MI) const {
10156 if (!isSMRD(MI))
10157 return false;
10158
10159 // Check that it is using a buffer resource.
10160 int Idx = AMDGPU::getNamedOperandIdx(Opcode: MI.getOpcode(), Name: AMDGPU::OpName::sbase);
10161 if (Idx == -1) // e.g. s_memtime
10162 return false;
10163
10164 const int16_t RCID = getOpRegClassID(OpInfo: MI.getDesc().operands()[Idx]);
10165 return RI.getRegClass(i: RCID)->hasSubClassEq(RC: &AMDGPU::SGPR_128RegClass);
10166}
10167
10168// Given Imm, split it into the values to put into the SOffset and ImmOffset
10169// fields in an MUBUF instruction. Return false if it is not possible (due to a
10170// hardware bug needing a workaround).
10171//
10172// The required alignment ensures that individual address components remain
10173// aligned if they are aligned to begin with. It also ensures that additional
10174// offsets within the given alignment can be added to the resulting ImmOffset.
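//
// Worked example (illustrative, assuming a 12-bit immediate field so that
// MaxOffset = 4095): for Imm = 5000 with Alignment = 4, MaxImm = 4092 and
// 5000 > 4092 + 64, so High = (5004 & ~4095) = 4096 and Low = (5004 & 4095)
// = 908, giving SOffset = 4096 - 4 = 4092 and ImmOffset = 908. Both values
// stay 4-aligned and sum back to 5000.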
bool SIInstrInfo::splitMUBUFOffset(uint32_t Imm, uint32_t &SOffset,
                                   uint32_t &ImmOffset, Align Alignment) const {
  const uint32_t MaxOffset = SIInstrInfo::getMaxMUBUFImmOffset(ST);
  const uint32_t MaxImm = alignDown(MaxOffset, Alignment.value());
  uint32_t Overflow = 0;

  if (Imm > MaxImm) {
    if (Imm <= MaxImm + 64) {
      // Use an SOffset inline constant for 4..64
      Overflow = Imm - MaxImm;
      Imm = MaxImm;
    } else {
      // Try to keep the same value in SOffset for adjacent loads, so that
      // the corresponding register contents can be re-used.
      //
      // Load values with all low-bits (except for alignment bits) set into
      // SOffset, so that a larger range of values can be covered using
      // s_movk_i32.
      //
      // Atomic operations fail to work correctly when individual address
      // components are unaligned, even if their sum is aligned.
      uint32_t High = (Imm + Alignment.value()) & ~MaxOffset;
      uint32_t Low = (Imm + Alignment.value()) & MaxOffset;
      Imm = Low;
      Overflow = High - Alignment.value();
    }
  }

  if (Overflow > 0) {
    // There is a hardware bug in SI and CI which prevents address clamping in
    // MUBUF instructions from working correctly with SOffsets. The immediate
    // offset is unaffected.
    if (ST.getGeneration() <= AMDGPUSubtarget::SEA_ISLANDS)
      return false;

    // It is not possible to set the immediate in the SOffset field on some
    // targets.
    if (ST.hasRestrictedSOffset())
      return false;
  }

  ImmOffset = Imm;
  SOffset = Overflow;
  return true;
}

// Depending on the used address space and instructions, some immediate offsets
// are allowed and some are not.
// Pre-GFX12, flat instruction offsets can only be non-negative; global and
// scratch instruction offsets can also be negative. On GFX12, offsets can be
// negative for all variants.
//
// There are several bugs related to these offsets:
// On gfx10.1, flat instructions that go into the global address space cannot
// use an offset.
//
// For scratch instructions, the address can be either an SGPR or a VGPR.
// The following offsets can be used, depending on the architecture (x means
// cannot be used):
// +----------------------------+------+------+
// | Address-Mode               | SGPR | VGPR |
// +----------------------------+------+------+
// | gfx9                       |      |      |
// | negative, 4-aligned offset | x    | ok   |
// | negative, unaligned offset | x    | ok   |
// +----------------------------+------+------+
// | gfx10                      |      |      |
// | negative, 4-aligned offset | ok   | ok   |
// | negative, unaligned offset | ok   | x    |
// +----------------------------+------+------+
// | gfx10.3                    |      |      |
// | negative, 4-aligned offset | ok   | ok   |
// | negative, unaligned offset | ok   | ok   |
// +----------------------------+------+------+
//
// This function ignores the addressing mode, so if an offset cannot be used in
// one addressing mode, it is considered illegal.
bool SIInstrInfo::isLegalFLATOffset(int64_t Offset, unsigned AddrSpace,
                                    uint64_t FlatVariant) const {
  // TODO: Should 0 be special cased?
  if (!ST.hasFlatInstOffsets())
    return false;

  if (ST.hasFlatSegmentOffsetBug() && FlatVariant == SIInstrFlags::FLAT &&
      (AddrSpace == AMDGPUAS::FLAT_ADDRESS ||
       AddrSpace == AMDGPUAS::GLOBAL_ADDRESS))
    return false;

  if (ST.hasNegativeUnalignedScratchOffsetBug() &&
      FlatVariant == SIInstrFlags::FlatScratch && Offset < 0 &&
      (Offset % 4) != 0) {
    return false;
  }

  bool AllowNegative = allowNegativeFlatOffset(FlatVariant);
  unsigned N = AMDGPU::getNumFlatOffsetBits(ST);
  return isIntN(N, Offset) && (AllowNegative || Offset >= 0);
}

// See comment on SIInstrInfo::isLegalFLATOffset for what is legal and what not.
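//
// A worked example (a sketch; the 13-bit signed immediate assumed here stands
// in for whatever AMDGPU::getNumFlatOffsetBits(ST) reports):
//   COffsetVal = -5000 with NumBits = 12 gives D = 4096, so
//     RemainderOffset = (-5000 / 4096) * 4096 = -4096 (truncates toward zero)
//     ImmField        = -5000 - (-4096)       = -904
//   The caller materializes -4096 in the address computation and folds -904
//   into the instruction's immediate field; their sum is the original offset.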
std::pair<int64_t, int64_t>
SIInstrInfo::splitFlatOffset(int64_t COffsetVal, unsigned AddrSpace,
                             uint64_t FlatVariant) const {
  int64_t RemainderOffset = COffsetVal;
  int64_t ImmField = 0;

  bool AllowNegative = allowNegativeFlatOffset(FlatVariant);
  const unsigned NumBits = AMDGPU::getNumFlatOffsetBits(ST) - 1;

  if (AllowNegative) {
    // Use signed division by a power of two to truncate towards 0.
    int64_t D = 1LL << NumBits;
    RemainderOffset = (COffsetVal / D) * D;
    ImmField = COffsetVal - RemainderOffset;

    if (ST.hasNegativeUnalignedScratchOffsetBug() &&
        FlatVariant == SIInstrFlags::FlatScratch && ImmField < 0 &&
        (ImmField % 4) != 0) {
      // Make ImmField a multiple of 4.
      RemainderOffset += ImmField % 4;
      ImmField -= ImmField % 4;
    }
  } else if (COffsetVal >= 0) {
    ImmField = COffsetVal & maskTrailingOnes<uint64_t>(NumBits);
    RemainderOffset = COffsetVal - ImmField;
  }

  assert(isLegalFLATOffset(ImmField, AddrSpace, FlatVariant));
  assert(RemainderOffset + ImmField == COffsetVal);
  return {ImmField, RemainderOffset};
}

bool SIInstrInfo::allowNegativeFlatOffset(uint64_t FlatVariant) const {
  if (ST.hasNegativeScratchOffsetBug() &&
      FlatVariant == SIInstrFlags::FlatScratch)
    return false;

  return FlatVariant != SIInstrFlags::FLAT || AMDGPU::isGFX12Plus(ST);
}

static unsigned subtargetEncodingFamily(const GCNSubtarget &ST) {
  switch (ST.getGeneration()) {
  default:
    break;
  case AMDGPUSubtarget::SOUTHERN_ISLANDS:
  case AMDGPUSubtarget::SEA_ISLANDS:
    return SIEncodingFamily::SI;
  case AMDGPUSubtarget::VOLCANIC_ISLANDS:
  case AMDGPUSubtarget::GFX9:
    return SIEncodingFamily::VI;
  case AMDGPUSubtarget::GFX10:
    return SIEncodingFamily::GFX10;
  case AMDGPUSubtarget::GFX11:
    return ST.hasGFX11_7Insts() ? SIEncodingFamily::GFX1170
                                : SIEncodingFamily::GFX11;
  case AMDGPUSubtarget::GFX12:
    return ST.hasGFX1250Insts() ? SIEncodingFamily::GFX1250
                                : SIEncodingFamily::GFX12;
  case AMDGPUSubtarget::GFX13:
    return SIEncodingFamily::GFX13;
  }
  llvm_unreachable("Unknown subtarget generation!");
}

bool SIInstrInfo::isAsmOnlyOpcode(int MCOp) const {
  switch (MCOp) {
  // These opcodes use indirect register addressing, so they need special
  // handling by codegen (currently missing). It is therefore too risky to
  // allow them to be selected by the DPP combiner or the SDWA peephole pass.
  case AMDGPU::V_MOVRELS_B32_dpp_gfx10:
  case AMDGPU::V_MOVRELS_B32_sdwa_gfx10:
  case AMDGPU::V_MOVRELD_B32_dpp_gfx10:
  case AMDGPU::V_MOVRELD_B32_sdwa_gfx10:
  case AMDGPU::V_MOVRELSD_B32_dpp_gfx10:
  case AMDGPU::V_MOVRELSD_B32_sdwa_gfx10:
  case AMDGPU::V_MOVRELSD_2_B32_dpp_gfx10:
  case AMDGPU::V_MOVRELSD_2_B32_sdwa_gfx10:
    return true;
  default:
    return false;
  }
}

#define GENERATE_RENAMED_GFX9_CASES(OPCODE)                                    \
  case OPCODE##_dpp:                                                           \
  case OPCODE##_e32:                                                           \
  case OPCODE##_e64:                                                           \
  case OPCODE##_e64_dpp:                                                       \
  case OPCODE##_sdwa:

static bool isRenamedInGFX9(int Opcode) {
  switch (Opcode) {
    GENERATE_RENAMED_GFX9_CASES(AMDGPU::V_ADDC_U32)
    GENERATE_RENAMED_GFX9_CASES(AMDGPU::V_ADD_CO_U32)
    GENERATE_RENAMED_GFX9_CASES(AMDGPU::V_ADD_U32)
    GENERATE_RENAMED_GFX9_CASES(AMDGPU::V_SUBBREV_U32)
    GENERATE_RENAMED_GFX9_CASES(AMDGPU::V_SUBB_U32)
    GENERATE_RENAMED_GFX9_CASES(AMDGPU::V_SUBREV_CO_U32)
    GENERATE_RENAMED_GFX9_CASES(AMDGPU::V_SUBREV_U32)
    GENERATE_RENAMED_GFX9_CASES(AMDGPU::V_SUB_CO_U32)
    GENERATE_RENAMED_GFX9_CASES(AMDGPU::V_SUB_U32)
    //
  case AMDGPU::V_DIV_FIXUP_F16_gfx9_e64:
  case AMDGPU::V_DIV_FIXUP_F16_gfx9_fake16_e64:
  case AMDGPU::V_FMA_F16_gfx9_e64:
  case AMDGPU::V_FMA_F16_gfx9_fake16_e64:
  case AMDGPU::V_INTERP_P2_F16:
  case AMDGPU::V_MAD_F16_e64:
  case AMDGPU::V_MAD_U16_e64:
  case AMDGPU::V_MAD_I16_e64:
    return true;
  default:
    return false;
  }
}

int SIInstrInfo::pseudoToMCOpcode(int Opcode) const {
  assert(Opcode == (int)SIInstrInfo::getNonSoftWaitcntOpcode(Opcode) &&
         "SIInsertWaitcnts should have promoted soft waitcnt instructions!");

  unsigned Gen = subtargetEncodingFamily(ST);

  if (ST.getGeneration() == AMDGPUSubtarget::GFX9 && isRenamedInGFX9(Opcode))
    Gen = SIEncodingFamily::GFX9;

  // Adjust the encoding family to GFX80 for D16 buffer instructions when the
  // subtarget has the UnpackedD16VMem feature.
  // TODO: remove this when we discard GFX80 encoding.
  if (ST.hasUnpackedD16VMem() && (get(Opcode).TSFlags & SIInstrFlags::D16Buf))
    Gen = SIEncodingFamily::GFX80;

  if (get(Opcode).TSFlags & SIInstrFlags::SDWA) {
    switch (ST.getGeneration()) {
    default:
      Gen = SIEncodingFamily::SDWA;
      break;
    case AMDGPUSubtarget::GFX9:
      Gen = SIEncodingFamily::SDWA9;
      break;
    case AMDGPUSubtarget::GFX10:
      Gen = SIEncodingFamily::SDWA10;
      break;
    }
  }

  if (isMAI(Opcode)) {
    int MFMAOp = AMDGPU::getMFMAEarlyClobberOp(Opcode);
    if (MFMAOp != -1)
      Opcode = MFMAOp;
  }

  int32_t MCOp = AMDGPU::getMCOpcode(Opcode, Gen);

  if (MCOp == AMDGPU::INSTRUCTION_LIST_END && ST.hasGFX11_7Insts())
    MCOp = AMDGPU::getMCOpcode(Opcode, SIEncodingFamily::GFX11);

  if (MCOp == AMDGPU::INSTRUCTION_LIST_END && ST.hasGFX1250Insts())
    MCOp = AMDGPU::getMCOpcode(Opcode, SIEncodingFamily::GFX12);

  // -1 means that Opcode is already a native instruction.
  if (MCOp == -1)
    return Opcode;

  if (ST.hasGFX90AInsts()) {
    uint32_t NMCOp = AMDGPU::INSTRUCTION_LIST_END;
    if (ST.hasGFX940Insts())
      NMCOp = AMDGPU::getMCOpcode(Opcode, SIEncodingFamily::GFX940);
    if (NMCOp == AMDGPU::INSTRUCTION_LIST_END)
      NMCOp = AMDGPU::getMCOpcode(Opcode, SIEncodingFamily::GFX90A);
    if (NMCOp == AMDGPU::INSTRUCTION_LIST_END)
      NMCOp = AMDGPU::getMCOpcode(Opcode, SIEncodingFamily::GFX9);
    if (NMCOp != AMDGPU::INSTRUCTION_LIST_END)
      MCOp = NMCOp;
  }

  // INSTRUCTION_LIST_END means that Opcode is a pseudo instruction that has no
  // encoding in the given subtarget generation.
  if (MCOp == AMDGPU::INSTRUCTION_LIST_END)
    return -1;

  if (isAsmOnlyOpcode(MCOp))
    return -1;

  return MCOp;
}

static TargetInstrInfo::RegSubRegPair
getRegOrUndef(const MachineOperand &RegOpnd) {
  assert(RegOpnd.isReg());
  return RegOpnd.isUndef() ? TargetInstrInfo::RegSubRegPair()
                           : getRegSubRegPair(RegOpnd);
}

TargetInstrInfo::RegSubRegPair
llvm::getRegSequenceSubReg(MachineInstr &MI, unsigned SubReg) {
  assert(MI.isRegSequence());
  for (unsigned I = 0, E = (MI.getNumOperands() - 1) / 2; I < E; ++I)
    if (MI.getOperand(1 + 2 * I + 1).getImm() == SubReg) {
      auto &RegOp = MI.getOperand(1 + 2 * I);
      return getRegOrUndef(RegOp);
    }
  return TargetInstrInfo::RegSubRegPair();
}

// Try to find the definition of reg:subreg in subreg-manipulation pseudos.
// Following a subreg of reg:subreg isn't supported.
static bool followSubRegDef(MachineInstr &MI,
                            TargetInstrInfo::RegSubRegPair &RSR) {
  if (!RSR.SubReg)
    return false;
  switch (MI.getOpcode()) {
  default: break;
  case AMDGPU::REG_SEQUENCE:
    RSR = getRegSequenceSubReg(MI, RSR.SubReg);
    return true;
  // EXTRACT_SUBREG isn't supported as this would follow a subreg of a subreg.
  case AMDGPU::INSERT_SUBREG:
    if (RSR.SubReg == (unsigned)MI.getOperand(3).getImm())
      // Inserted the subreg we're looking for.
      RSR = getRegOrUndef(MI.getOperand(2));
    else { // The subreg is in the rest of the reg.
      auto R1 = getRegOrUndef(MI.getOperand(1));
      if (R1.SubReg) // A subreg of a subreg isn't supported.
        return false;
      RSR.Reg = R1.Reg;
    }
    return true;
  }
  return false;
}

MachineInstr *llvm::getVRegSubRegDef(const TargetInstrInfo::RegSubRegPair &P,
                                     const MachineRegisterInfo &MRI) {
  assert(MRI.isSSA());
  if (!P.Reg.isVirtual())
    return nullptr;

  auto RSR = P;
  auto *DefInst = MRI.getVRegDef(RSR.Reg);
  while (auto *MI = DefInst) {
    DefInst = nullptr;
    switch (MI->getOpcode()) {
    case AMDGPU::COPY:
    case AMDGPU::V_MOV_B32_e32: {
      auto &Op1 = MI->getOperand(1);
      if (Op1.isReg() && Op1.getReg().isVirtual()) {
        if (Op1.isUndef())
          return nullptr;
        RSR = getRegSubRegPair(Op1);
        DefInst = MRI.getVRegDef(RSR.Reg);
      }
      break;
    }
    default:
      if (followSubRegDef(*MI, RSR)) {
        if (!RSR.Reg)
          return nullptr;
        DefInst = MRI.getVRegDef(RSR.Reg);
      }
    }
    if (!DefInst)
      return MI;
  }
  return nullptr;
}

bool llvm::execMayBeModifiedBeforeUse(const MachineRegisterInfo &MRI,
                                      Register VReg,
                                      const MachineInstr &DefMI,
                                      const MachineInstr &UseMI) {
  assert(MRI.isSSA() && "Must be run on SSA");

  auto *TRI = MRI.getTargetRegisterInfo();
  auto *DefBB = DefMI.getParent();

  // Don't bother searching between blocks, although it is possible this block
  // doesn't modify exec.
  if (UseMI.getParent() != DefBB)
    return true;

  const int MaxInstScan = 20;
  int NumInst = 0;

  // Stop scan at the use.
  auto E = UseMI.getIterator();
  for (auto I = std::next(DefMI.getIterator()); I != E; ++I) {
    if (I->isDebugInstr())
      continue;

    if (++NumInst > MaxInstScan)
      return true;

    if (I->modifiesRegister(AMDGPU::EXEC, TRI))
      return true;
  }

  return false;
}

bool llvm::execMayBeModifiedBeforeAnyUse(const MachineRegisterInfo &MRI,
                                         Register VReg,
                                         const MachineInstr &DefMI) {
  assert(MRI.isSSA() && "Must be run on SSA");

  auto *TRI = MRI.getTargetRegisterInfo();
  auto *DefBB = DefMI.getParent();

  const int MaxUseScan = 10;
  int NumUse = 0;

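  // First make sure every use of VReg is in the def block, counting the uses
  // (and giving up after MaxUseScan); then walk forward from the def until
  // all counted uses have been seen, failing if anything writes EXEC first.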
  for (auto &Use : MRI.use_nodbg_operands(VReg)) {
    auto &UseInst = *Use.getParent();
    // Don't bother searching between blocks, although it is possible this
    // block doesn't modify exec.
    if (UseInst.getParent() != DefBB || UseInst.isPHI())
      return true;

    if (++NumUse > MaxUseScan)
      return true;
  }

  if (NumUse == 0)
    return false;

  const int MaxInstScan = 20;
  int NumInst = 0;

  // Stop scan when we have seen all the uses.
  for (auto I = std::next(DefMI.getIterator()); ; ++I) {
    assert(I != DefBB->end());

    if (I->isDebugInstr())
      continue;

    if (++NumInst > MaxInstScan)
      return true;

    for (const MachineOperand &Op : I->operands()) {
      // We don't check reg masks here as they're used only on calls:
      // 1. EXEC is only considered const within one BB
      // 2. Call should be a terminator instruction if present in a BB

      if (!Op.isReg())
        continue;

      Register Reg = Op.getReg();
      if (Op.isUse()) {
        if (Reg == VReg && --NumUse == 0)
          return false;
      } else if (TRI->regsOverlap(Reg, AMDGPU::EXEC))
        return true;
    }
  }
}

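// Scan from the top of the block for a non-PHI instruction that reads Dst
// before LastPHIIt (e.g. a copy emitted while lowering an earlier PHI); if
// one exists, insert the new copy right before it so the definition of Dst
// still precedes that read.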
MachineInstr *SIInstrInfo::createPHIDestinationCopy(
    MachineBasicBlock &MBB, MachineBasicBlock::iterator LastPHIIt,
    const DebugLoc &DL, Register Src, Register Dst) const {
  auto Cur = MBB.begin();
  if (Cur != MBB.end())
    do {
      if (!Cur->isPHI() && Cur->readsRegister(Dst, /*TRI=*/nullptr))
        return BuildMI(MBB, Cur, DL, get(TargetOpcode::COPY), Dst).addReg(Src);
      ++Cur;
    } while (Cur != MBB.end() && Cur != LastPHIIt);

  return TargetInstrInfo::createPHIDestinationCopy(MBB, LastPHIIt, DL, Src,
                                                   Dst);
}

MachineInstr *SIInstrInfo::createPHISourceCopy(
    MachineBasicBlock &MBB, MachineBasicBlock::iterator InsPt,
    const DebugLoc &DL, Register Src, unsigned SrcSubReg, Register Dst) const {
  if (InsPt != MBB.end() &&
      (InsPt->getOpcode() == AMDGPU::SI_IF ||
       InsPt->getOpcode() == AMDGPU::SI_ELSE ||
       InsPt->getOpcode() == AMDGPU::SI_IF_BREAK) &&
      InsPt->definesRegister(Src, /*TRI=*/nullptr)) {
    InsPt++;
    return BuildMI(MBB, InsPt, DL,
                   get(AMDGPU::LaneMaskConstants::get(ST).MovTermOpc), Dst)
        .addReg(Src, {}, SrcSubReg)
        .addReg(AMDGPU::EXEC, RegState::Implicit);
  }
  return TargetInstrInfo::createPHISourceCopy(MBB, InsPt, DL, Src, SrcSubReg,
                                              Dst);
}

bool llvm::SIInstrInfo::isWave32() const { return ST.isWave32(); }

MachineInstr *SIInstrInfo::foldMemoryOperandImpl(
    MachineFunction &MF, MachineInstr &MI, ArrayRef<unsigned> Ops,
    MachineBasicBlock::iterator InsertPt, int FrameIndex, LiveIntervals *LIS,
    VirtRegMap *VRM) const {
  // This is a bit of a hack (copied from AArch64). Consider this instruction:
  //
  //   %0:sreg_32 = COPY $m0
  //
  // We explicitly chose SReg_32 for the virtual register so such a copy might
  // be eliminated by RegisterCoalescer. However, that may not be possible, and
  // %0 may even spill. We can't spill $m0 normally (it would require copying
  // to a numbered SGPR anyway), and since it is in the SReg_32 register class,
  // TargetInstrInfo::foldMemoryOperand() is going to try.
  // A similar issue also exists with spilling and reloading $exec registers.
  //
  // To prevent that, constrain the %0 register class here.
  if (isFullCopyInstr(MI)) {
    Register DstReg = MI.getOperand(0).getReg();
    Register SrcReg = MI.getOperand(1).getReg();
    if ((DstReg.isVirtual() || SrcReg.isVirtual()) &&
        (DstReg.isVirtual() != SrcReg.isVirtual())) {
      MachineRegisterInfo &MRI = MF.getRegInfo();
      Register VirtReg = DstReg.isVirtual() ? DstReg : SrcReg;
      const TargetRegisterClass *RC = MRI.getRegClass(VirtReg);
      if (RC->hasSuperClassEq(&AMDGPU::SReg_32RegClass)) {
        MRI.constrainRegClass(VirtReg, &AMDGPU::SReg_32_XM0_XEXECRegClass);
        return nullptr;
      }
      if (RC->hasSuperClassEq(&AMDGPU::SReg_64RegClass)) {
        MRI.constrainRegClass(VirtReg, &AMDGPU::SReg_64_XEXECRegClass);
        return nullptr;
      }
    }
  }

  return nullptr;
}

unsigned SIInstrInfo::getInstrLatency(const InstrItineraryData *ItinData,
                                      const MachineInstr &MI,
                                      unsigned *PredCost) const {
  if (MI.isBundle()) {
    MachineBasicBlock::const_instr_iterator I(MI.getIterator());
    MachineBasicBlock::const_instr_iterator E(MI.getParent()->instr_end());
    unsigned Lat = 0, Count = 0;
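    // Estimate the bundle's latency as the longest member latency plus one
    // issue cycle for each additional bundled instruction (a heuristic).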
    for (++I; I != E && I->isBundledWithPred(); ++I) {
      ++Count;
      Lat = std::max(Lat, SchedModel.computeInstrLatency(&*I));
    }
    return Lat + Count - 1;
  }

  return SchedModel.computeInstrLatency(&MI);
}

const MachineOperand &
SIInstrInfo::getCalleeOperand(const MachineInstr &MI) const {
  if (const MachineOperand *CallAddrOp =
          getNamedOperand(MI, AMDGPU::OpName::src0))
    return *CallAddrOp;
  return TargetInstrInfo::getCalleeOperand(MI);
}

ValueUniformity
SIInstrInfo::getGenericValueUniformity(const MachineInstr &MI) const {
  const MachineRegisterInfo &MRI = MI.getMF()->getRegInfo();
  unsigned Opcode = MI.getOpcode();

  auto HandleAddrSpaceCast = [this, &MRI](const MachineInstr &MI) {
    Register Dst = MI.getOperand(0).getReg();
    Register Src = isa<GIntrinsic>(MI) ? MI.getOperand(2).getReg()
                                       : MI.getOperand(1).getReg();
    LLT DstTy = MRI.getType(Dst);
    LLT SrcTy = MRI.getType(Src);
    unsigned DstAS = DstTy.getAddressSpace();
    unsigned SrcAS = SrcTy.getAddressSpace();
    return SrcAS == AMDGPUAS::PRIVATE_ADDRESS &&
                   DstAS == AMDGPUAS::FLAT_ADDRESS &&
                   ST.hasGloballyAddressableScratch()
               ? ValueUniformity::NeverUniform
               : ValueUniformity::Default;
  };

  // If the target supports globally addressable scratch, the mapping from
  // scratch memory to the flat aperture changes, and therefore an address
  // space cast from private to flat is no longer uniform.
  if (Opcode == TargetOpcode::G_ADDRSPACE_CAST)
    return HandleAddrSpaceCast(MI);

  if (auto *GI = dyn_cast<GIntrinsic>(&MI)) {
    auto IID = GI->getIntrinsicID();
    if (AMDGPU::isIntrinsicSourceOfDivergence(IID))
      return ValueUniformity::NeverUniform;
    if (AMDGPU::isIntrinsicAlwaysUniform(IID))
      return ValueUniformity::AlwaysUniform;

    switch (IID) {
    case Intrinsic::amdgcn_addrspacecast_nonnull:
      return HandleAddrSpaceCast(MI);
    case Intrinsic::amdgcn_if:
    case Intrinsic::amdgcn_else:
      // FIXME: Uniform if second result
      break;
    default:
      break;
    }

    return ValueUniformity::Default;
  }

  // Loads from the private and flat address spaces are divergent, because
  // threads can execute the load instruction with the same inputs and get
  // different results.
  //
  // All other loads are not divergent, because if threads issue loads with the
  // same arguments, they will always get the same result.
  if (Opcode == AMDGPU::G_LOAD || Opcode == AMDGPU::G_ZEXTLOAD ||
      Opcode == AMDGPU::G_SEXTLOAD) {
    if (MI.memoperands_empty())
      return ValueUniformity::NeverUniform; // conservative assumption

    if (llvm::any_of(MI.memoperands(), [](const MachineMemOperand *MMO) {
          return MMO->getAddrSpace() == AMDGPUAS::PRIVATE_ADDRESS ||
                 MMO->getAddrSpace() == AMDGPUAS::FLAT_ADDRESS;
        })) {
      // At least one MMO in a non-global address space.
      return ValueUniformity::NeverUniform;
    }
    return ValueUniformity::Default;
  }

  if (SIInstrInfo::isGenericAtomicRMWOpcode(Opcode) ||
      Opcode == AMDGPU::G_ATOMIC_CMPXCHG ||
      Opcode == AMDGPU::G_ATOMIC_CMPXCHG_WITH_SUCCESS ||
      AMDGPU::isGenericAtomic(Opcode)) {
    return ValueUniformity::NeverUniform;
  }
  return ValueUniformity::Default;
}

const MIRFormatter *SIInstrInfo::getMIRFormatter() const {
  if (!Formatter)
    Formatter = std::make_unique<AMDGPUMIRFormatter>(ST);
  return Formatter.get();
}

ValueUniformity SIInstrInfo::getValueUniformity(const MachineInstr &MI) const {
  if (isNeverUniform(MI))
    return ValueUniformity::NeverUniform;

  unsigned Opcode = MI.getOpcode();
  if (Opcode == AMDGPU::V_READLANE_B32 ||
      Opcode == AMDGPU::V_READFIRSTLANE_B32 ||
      Opcode == AMDGPU::SI_RESTORE_S32_FROM_VGPR)
    return ValueUniformity::AlwaysUniform;

  if (isCopyInstr(MI)) {
    const MachineOperand &SrcOp = MI.getOperand(1);
    if (SrcOp.isReg() && SrcOp.getReg().isPhysical()) {
      const TargetRegisterClass *RegClass =
          RI.getPhysRegBaseClass(SrcOp.getReg());
      return RI.isSGPRClass(RegClass) ? ValueUniformity::AlwaysUniform
                                      : ValueUniformity::NeverUniform;
    }
    return ValueUniformity::Default;
  }

  // GMIR handling
  if (MI.isPreISelOpcode())
    return SIInstrInfo::getGenericValueUniformity(MI);

  // Atomics are divergent because they are executed sequentially: when an
  // atomic operation refers to the same address in each thread, then each
  // thread after the first sees the value written by the previous thread as
  // the original value.
  if (isAtomic(MI))
    return ValueUniformity::NeverUniform;

  // Loads from the private and flat address spaces are divergent, because
  // threads can execute the load instruction with the same inputs and get
  // different results.
  if (isFLAT(MI) && MI.mayLoad()) {
    if (MI.memoperands_empty())
      return ValueUniformity::NeverUniform; // conservative assumption

    if (llvm::any_of(MI.memoperands(), [](const MachineMemOperand *MMO) {
          return MMO->getAddrSpace() == AMDGPUAS::PRIVATE_ADDRESS ||
                 MMO->getAddrSpace() == AMDGPUAS::FLAT_ADDRESS;
        })) {
      // At least one MMO in a non-global address space.
      return ValueUniformity::NeverUniform;
    }

    return ValueUniformity::Default;
  }

  const MachineRegisterInfo &MRI = MI.getMF()->getRegInfo();
  const AMDGPURegisterBankInfo *RBI = ST.getRegBankInfo();

  // FIXME: It's conceptually broken to report this for an instruction, and not
  // a specific def operand. For inline asm in particular, there could be mixed
  // uniform and divergent results.
  for (unsigned I = 0, E = MI.getNumOperands(); I != E; ++I) {
    const MachineOperand &SrcOp = MI.getOperand(I);
    if (!SrcOp.isReg())
      continue;

    Register Reg = SrcOp.getReg();
    if (!Reg || !SrcOp.readsReg())
      continue;

    // If RegBank is null, this is unassigned or an unallocatable special
    // register, which are all scalars.
    const RegisterBank *RegBank = RBI->getRegBank(Reg, MRI, RI);
    if (RegBank && RegBank->getID() != AMDGPU::SGPRRegBankID)
      return ValueUniformity::NeverUniform;
  }

  // TODO: The uniformity checks above could be rearranged for readability.

  // TODO: amdgcn.{ballot, [if]cmp} should be AlwaysUniform, but they are
  // currently turned into no-op COPYs by SelectionDAG ISel and are
  // therefore no longer recognizable.

  return ValueUniformity::Default;
}

unsigned SIInstrInfo::getDSShaderTypeValue(const MachineFunction &MF) {
  switch (MF.getFunction().getCallingConv()) {
  case CallingConv::AMDGPU_PS:
    return 1;
  case CallingConv::AMDGPU_VS:
    return 2;
  case CallingConv::AMDGPU_GS:
    return 3;
  case CallingConv::AMDGPU_HS:
  case CallingConv::AMDGPU_LS:
  case CallingConv::AMDGPU_ES: {
    const Function &F = MF.getFunction();
    F.getContext().diagnose(DiagnosticInfoUnsupported(
        F, "ds_ordered_count unsupported for this calling conv"));
    [[fallthrough]];
  }
  case CallingConv::AMDGPU_CS:
  case CallingConv::AMDGPU_KERNEL:
  case CallingConv::C:
  case CallingConv::Fast:
  default:
    // Assume other calling conventions are various compute callable functions.
    return 0;
  }
}

bool SIInstrInfo::analyzeCompare(const MachineInstr &MI, Register &SrcReg,
                                 Register &SrcReg2, int64_t &CmpMask,
                                 int64_t &CmpValue) const {
  if (!MI.getOperand(0).isReg() || MI.getOperand(0).getSubReg())
    return false;

  switch (MI.getOpcode()) {
  default:
    break;
  case AMDGPU::S_CMP_EQ_U32:
  case AMDGPU::S_CMP_EQ_I32:
  case AMDGPU::S_CMP_LG_U32:
  case AMDGPU::S_CMP_LG_I32:
  case AMDGPU::S_CMP_LT_U32:
  case AMDGPU::S_CMP_LT_I32:
  case AMDGPU::S_CMP_GT_U32:
  case AMDGPU::S_CMP_GT_I32:
  case AMDGPU::S_CMP_LE_U32:
  case AMDGPU::S_CMP_LE_I32:
  case AMDGPU::S_CMP_GE_U32:
  case AMDGPU::S_CMP_GE_I32:
  case AMDGPU::S_CMP_EQ_U64:
  case AMDGPU::S_CMP_LG_U64:
    SrcReg = MI.getOperand(0).getReg();
    if (MI.getOperand(1).isReg()) {
      if (MI.getOperand(1).getSubReg())
        return false;
      SrcReg2 = MI.getOperand(1).getReg();
      CmpValue = 0;
    } else if (MI.getOperand(1).isImm()) {
      SrcReg2 = Register();
      CmpValue = MI.getOperand(1).getImm();
    } else {
      return false;
    }
    CmpMask = ~0;
    return true;
  case AMDGPU::S_CMPK_EQ_U32:
  case AMDGPU::S_CMPK_EQ_I32:
  case AMDGPU::S_CMPK_LG_U32:
  case AMDGPU::S_CMPK_LG_I32:
  case AMDGPU::S_CMPK_LT_U32:
  case AMDGPU::S_CMPK_LT_I32:
  case AMDGPU::S_CMPK_GT_U32:
  case AMDGPU::S_CMPK_GT_I32:
  case AMDGPU::S_CMPK_LE_U32:
  case AMDGPU::S_CMPK_LE_I32:
  case AMDGPU::S_CMPK_GE_U32:
  case AMDGPU::S_CMPK_GE_I32:
    SrcReg = MI.getOperand(0).getReg();
    SrcReg2 = Register();
    CmpValue = MI.getOperand(1).getImm();
    CmpMask = ~0;
    return true;
  }

  return false;
}

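// SCC is dead at the end of \p MBB if no successor block has it live-in.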
static bool isSCCDeadOnExit(MachineBasicBlock *MBB) {
  for (MachineBasicBlock *S : MBB->successors()) {
    if (S->isLiveIn(AMDGPU::SCC))
      return false;
  }
  return true;
}

// Invert all uses of SCC following SCCDef because SCCDef may be deleted and
// (incoming SCC) = !(SCC defined by SCCDef).
// Return true if all uses can be re-written, false otherwise.
bool SIInstrInfo::invertSCCUse(MachineInstr *SCCDef) const {
  MachineBasicBlock *MBB = SCCDef->getParent();
  SmallVector<MachineInstr *> InvertInstr;
  bool SCCIsDead = false;

  // Scan instructions for SCC uses that need to be inverted until SCC is dead.
  constexpr unsigned ScanLimit = 12;
  unsigned Count = 0;
  for (MachineInstr &MI : make_range(
           std::next(MachineBasicBlock::iterator(SCCDef)), MBB->end())) {
    if (++Count > ScanLimit)
      return false;
    if (MI.readsRegister(AMDGPU::SCC, &RI)) {
      if (MI.getOpcode() == AMDGPU::S_CSELECT_B32 ||
          MI.getOpcode() == AMDGPU::S_CSELECT_B64 ||
          MI.getOpcode() == AMDGPU::S_CBRANCH_SCC0 ||
          MI.getOpcode() == AMDGPU::S_CBRANCH_SCC1)
        InvertInstr.push_back(&MI);
      else
        return false;
    }
    if (MI.definesRegister(AMDGPU::SCC, &RI)) {
      SCCIsDead = true;
      break;
    }
  }
  if (!SCCIsDead && isSCCDeadOnExit(MBB))
    SCCIsDead = true;

  // SCC may have further uses that we have not seen; we can't invert those.
  if (!SCCIsDead)
    return false;

  // Invert the collected uses: swap the operands of each S_CSELECT and flip
  // the condition of each S_CBRANCH.
  for (MachineInstr *MI : InvertInstr) {
    if (MI->getOpcode() == AMDGPU::S_CSELECT_B32 ||
        MI->getOpcode() == AMDGPU::S_CSELECT_B64) {
      swapOperands(*MI);
    } else if (MI->getOpcode() == AMDGPU::S_CBRANCH_SCC0 ||
               MI->getOpcode() == AMDGPU::S_CBRANCH_SCC1) {
      MI->setDesc(get(MI->getOpcode() == AMDGPU::S_CBRANCH_SCC0
                          ? AMDGPU::S_CBRANCH_SCC1
                          : AMDGPU::S_CBRANCH_SCC0));
    } else {
      llvm_unreachable("SCC used but no inversion handling");
    }
  }
  return true;
}

// SCC is already valid after SCCValid.
// SCCRedefine will redefine SCC to the same value already available after
// SCCValid. If there are no intervening SCC conflicts, delete SCCRedefine and
// update kill/dead flags if necessary.
bool SIInstrInfo::optimizeSCC(MachineInstr *SCCValid, MachineInstr *SCCRedefine,
                              bool NeedInversion) const {
  MachineInstr *KillsSCC = nullptr;
  if (SCCValid->getParent() != SCCRedefine->getParent())
    return false;
  for (MachineInstr &MI : make_range(std::next(SCCValid->getIterator()),
                                     SCCRedefine->getIterator())) {
    if (MI.modifiesRegister(AMDGPU::SCC, &RI))
      return false;
    if (MI.killsRegister(AMDGPU::SCC, &RI))
      KillsSCC = &MI;
  }
  if (NeedInversion && !invertSCCUse(SCCRedefine))
    return false;
  if (MachineOperand *SccDef =
          SCCValid->findRegisterDefOperand(AMDGPU::SCC, /*TRI=*/nullptr))
    SccDef->setIsDead(false);
  if (KillsSCC)
    KillsSCC->clearRegisterKills(AMDGPU::SCC, /*TRI=*/nullptr);
  SCCRedefine->eraseFromParent();
  return true;
}

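// Matches S_CSELECT_B32/B64 of the form (non-zero imm), 0. Such a select
// produces a non-zero result exactly when the incoming SCC is set, so a
// following s_cmp_lg_* of the result against 0 recomputes a value that SCC
// already holds.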
static bool foldableSelect(const MachineInstr &Def) {
  if (Def.getOpcode() != AMDGPU::S_CSELECT_B32 &&
      Def.getOpcode() != AMDGPU::S_CSELECT_B64)
    return false;
  bool Op1IsNonZeroImm =
      Def.getOperand(1).isImm() && Def.getOperand(1).getImm() != 0;
  bool Op2IsZeroImm =
      Def.getOperand(2).isImm() && Def.getOperand(2).getImm() == 0;
  return Op1IsNonZeroImm && Op2IsZeroImm;
}

static bool setsSCCIfResultIsZero(const MachineInstr &Def, bool &NeedInversion,
                                  unsigned &NewDefOpc) {
  // S_ADD_U32 X, 1 sets SCC on carry-out, which can only happen if the result
  // is 0. S_ADD_I32 X, 1 can be converted to S_ADD_U32 X, 1 if SCC is dead.
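  //
  // E.g. (a sketch):
  //   %x = S_ADD_U32 %y, 1   ; SCC = carry-out, i.e. SCC = (%x == 0)
  //   S_CMP_EQ_U32 %x, 0     ; SCC = (%x == 0)
  // The compare recomputes a value SCC already holds, so it can be deleted;
  // NeedInversion is flipped because the add's SCC has the ==0 sense while
  // callers pass it assuming the !=0 sense.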
  if (Def.getOpcode() != AMDGPU::S_ADD_I32 &&
      Def.getOpcode() != AMDGPU::S_ADD_U32)
    return false;
  const MachineOperand &AddSrc1 = Def.getOperand(1);
  const MachineOperand &AddSrc2 = Def.getOperand(2);
  int64_t Addend;

  if ((!AddSrc1.isImm() || AddSrc1.getImm() != 1) &&
      (!AddSrc2.isImm() || AddSrc2.getImm() != 1) &&
      (!getFoldableImm(&AddSrc1, Addend) || Addend != 1) &&
      (!getFoldableImm(&AddSrc2, Addend) || Addend != 1))
    return false;

  if (Def.getOpcode() == AMDGPU::S_ADD_I32) {
    const MachineOperand *SccDef =
        Def.findRegisterDefOperand(AMDGPU::SCC, /*TRI=*/nullptr);
    if (!SccDef->isDead())
      return false;
    NewDefOpc = AMDGPU::S_ADD_U32;
  }
  NeedInversion = !NeedInversion;
  return true;
}

bool SIInstrInfo::optimizeCompareInstr(MachineInstr &CmpInstr, Register SrcReg,
                                       Register SrcReg2, int64_t CmpMask,
                                       int64_t CmpValue,
                                       const MachineRegisterInfo *MRI) const {
  if (!SrcReg || SrcReg.isPhysical())
    return false;

  if (SrcReg2 && !getFoldableImm(SrcReg2, *MRI, CmpValue))
    return false;

  const auto optimizeCmpSelect = [&CmpInstr, SrcReg, CmpValue, MRI,
                                  this](bool NeedInversion) -> bool {
    if (CmpValue != 0)
      return false;

    MachineInstr *Def = MRI->getVRegDef(SrcReg);
    if (!Def)
      return false;

    // For an S_OP that sets SCC = DST != 0, do the transformation:
    //
    //   s_cmp_[lg|eq]_* (S_OP ...), 0 => (S_OP ...)
    //
    // For an S_OP that sets SCC = DST == 0, invert NeedInversion and do the
    // same transformation.
    //
    // If foldableSelect, s_cmp_lg_* is redundant because the SCC input value
    // for S_CSELECT* already has the same value that would be calculated by
    // s_cmp_lg_*:
    //
    //   s_cmp_[lg|eq]_* (S_CSELECT* (non-zero imm), 0), 0 =>
    //     (S_CSELECT* (non-zero imm), 0)

    unsigned NewDefOpc = Def->getOpcode();
    if (!setsSCCIfResultIsNonZero(*Def) &&
        !setsSCCIfResultIsZero(*Def, NeedInversion, NewDefOpc) &&
        !foldableSelect(*Def))
      return false;

    if (!optimizeSCC(Def, &CmpInstr, NeedInversion))
      return false;

    if (NewDefOpc != Def->getOpcode())
      Def->setDesc(get(NewDefOpc));

    // If the s_or_b32 result, sY, is unused (i.e. it is effectively a 64-bit
    // s_cmp_lg of a register pair) and the inputs are the hi and lo halves of
    // a 64-bit foldableSelect, then delete the s_or_b32 in the sequence:
    //   sX = s_cselect_b64 (non-zero imm), 0
    //   sLo = copy sX.sub0
    //   sHi = copy sX.sub1
    //   sY = s_or_b32 sLo, sHi
    if (Def->getOpcode() == AMDGPU::S_OR_B32 &&
        MRI->use_nodbg_empty(Def->getOperand(0).getReg())) {
      const MachineOperand &OrOpnd1 = Def->getOperand(1);
      const MachineOperand &OrOpnd2 = Def->getOperand(2);
      if (OrOpnd1.isReg() && OrOpnd2.isReg()) {
        MachineInstr *Def1 = MRI->getVRegDef(OrOpnd1.getReg());
        MachineInstr *Def2 = MRI->getVRegDef(OrOpnd2.getReg());
        if (Def1 && Def1->getOpcode() == AMDGPU::COPY && Def2 &&
            Def2->getOpcode() == AMDGPU::COPY && Def1->getOperand(1).isReg() &&
            Def2->getOperand(1).isReg() &&
            Def1->getOperand(1).getSubReg() == AMDGPU::sub0 &&
            Def2->getOperand(1).getSubReg() == AMDGPU::sub1 &&
            Def1->getOperand(1).getReg() == Def2->getOperand(1).getReg()) {
          MachineInstr *Select = MRI->getVRegDef(Def1->getOperand(1).getReg());
          if (Select && foldableSelect(*Select))
            optimizeSCC(Select, Def, /*NeedInversion=*/false);
        }
      }
    }
    return true;
  };

  const auto optimizeCmpAnd = [&CmpInstr, SrcReg, CmpValue, MRI,
                               this](int64_t ExpectedValue, unsigned SrcSize,
                                     bool IsReversible, bool IsSigned) -> bool {
    // s_cmp_eq_u32 (s_and_b32 $src, 1 << n), 1 << n => s_and_b32 $src, 1 << n
    // s_cmp_eq_i32 (s_and_b32 $src, 1 << n), 1 << n => s_and_b32 $src, 1 << n
    // s_cmp_ge_u32 (s_and_b32 $src, 1 << n), 1 << n => s_and_b32 $src, 1 << n
    // s_cmp_ge_i32 (s_and_b32 $src, 1 << n), 1 << n => s_and_b32 $src, 1 << n
    // s_cmp_eq_u64 (s_and_b64 $src, 1 << n), 1 << n => s_and_b64 $src, 1 << n
    // s_cmp_lg_u32 (s_and_b32 $src, 1 << n), 0 => s_and_b32 $src, 1 << n
    // s_cmp_lg_i32 (s_and_b32 $src, 1 << n), 0 => s_and_b32 $src, 1 << n
    // s_cmp_gt_u32 (s_and_b32 $src, 1 << n), 0 => s_and_b32 $src, 1 << n
    // s_cmp_gt_i32 (s_and_b32 $src, 1 << n), 0 => s_and_b32 $src, 1 << n
    // s_cmp_lg_u64 (s_and_b64 $src, 1 << n), 0 => s_and_b64 $src, 1 << n
    //
    // Signed ge/gt are not used for the sign bit.
    //
    // If the result of the AND is unused except in the compare:
    // s_and_b(32|64) $src, 1 << n => s_bitcmp1_b(32|64) $src, n
    //
    // s_cmp_eq_u32 (s_and_b32 $src, 1 << n), 0 => s_bitcmp0_b32 $src, n
    // s_cmp_eq_i32 (s_and_b32 $src, 1 << n), 0 => s_bitcmp0_b32 $src, n
    // s_cmp_eq_u64 (s_and_b64 $src, 1 << n), 0 => s_bitcmp0_b64 $src, n
    // s_cmp_lg_u32 (s_and_b32 $src, 1 << n), 1 << n => s_bitcmp0_b32 $src, n
    // s_cmp_lg_i32 (s_and_b32 $src, 1 << n), 1 << n => s_bitcmp0_b32 $src, n
    // s_cmp_lg_u64 (s_and_b64 $src, 1 << n), 1 << n => s_bitcmp0_b64 $src, n
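    //
    // A worked example (a sketch):
    //   %a = S_AND_B32 %src, 4
    //   S_CMP_LG_U32 %a, 4
    // gives Mask = 4, BitNo = 2, ExpectedValue = 0 << 2 = 0 and CmpValue = 4,
    // so CmpValue == (ExpectedValue ^ Mask) and the condition reverses; if %a
    // has no other users this becomes S_BITCMP0_B32 %src, 2.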

    MachineInstr *Def = MRI->getVRegDef(SrcReg);
    if (!Def)
      return false;

    if (Def->getOpcode() != AMDGPU::S_AND_B32 &&
        Def->getOpcode() != AMDGPU::S_AND_B64)
      return false;

    int64_t Mask;
    const auto isMask = [&Mask, SrcSize](const MachineOperand *MO) -> bool {
      if (MO->isImm())
        Mask = MO->getImm();
      else if (!getFoldableImm(MO, Mask))
        return false;
      Mask &= maxUIntN(SrcSize);
      return isPowerOf2_64(Mask);
    };

    MachineOperand *SrcOp = &Def->getOperand(1);
    if (isMask(SrcOp))
      SrcOp = &Def->getOperand(2);
    else if (isMask(&Def->getOperand(2)))
      SrcOp = &Def->getOperand(1);
    else
      return false;

    // A valid Mask is required to have a single bit set, hence a non-zero and
    // power-of-two value. This verifies that we will not do a 64-bit shift
    // below.
    assert(llvm::has_single_bit<uint64_t>(Mask) && "Invalid mask.");
    unsigned BitNo = llvm::countr_zero((uint64_t)Mask);
    if (IsSigned && BitNo == SrcSize - 1)
      return false;

    ExpectedValue <<= BitNo;

    bool IsReversedCC = false;
    if (CmpValue != ExpectedValue) {
      if (!IsReversible)
        return false;
      IsReversedCC = CmpValue == (ExpectedValue ^ Mask);
      if (!IsReversedCC)
        return false;
    }

    Register DefReg = Def->getOperand(0).getReg();
    if (IsReversedCC && !MRI->hasOneNonDBGUse(DefReg))
      return false;

    if (!optimizeSCC(Def, &CmpInstr, /*NeedInversion=*/false))
      return false;

    if (!MRI->use_nodbg_empty(DefReg)) {
      assert(!IsReversedCC);
      return true;
    }

    // Replace the AND whose result is unused with an S_BITCMP.
    MachineBasicBlock *MBB = Def->getParent();

    unsigned NewOpc = (SrcSize == 32) ? IsReversedCC ? AMDGPU::S_BITCMP0_B32
                                                     : AMDGPU::S_BITCMP1_B32
                                      : IsReversedCC ? AMDGPU::S_BITCMP0_B64
                                                     : AMDGPU::S_BITCMP1_B64;

    BuildMI(*MBB, Def, Def->getDebugLoc(), get(NewOpc))
        .add(*SrcOp)
        .addImm(BitNo);
    Def->eraseFromParent();

    return true;
  };

  switch (CmpInstr.getOpcode()) {
  default:
    break;
  case AMDGPU::S_CMP_EQ_U32:
  case AMDGPU::S_CMP_EQ_I32:
  case AMDGPU::S_CMPK_EQ_U32:
  case AMDGPU::S_CMPK_EQ_I32:
    return optimizeCmpAnd(1, 32, true, false) ||
           optimizeCmpSelect(/*NeedInversion=*/true);
  case AMDGPU::S_CMP_GE_U32:
  case AMDGPU::S_CMPK_GE_U32:
    return optimizeCmpAnd(1, 32, false, false);
  case AMDGPU::S_CMP_GE_I32:
  case AMDGPU::S_CMPK_GE_I32:
    return optimizeCmpAnd(1, 32, false, true);
  case AMDGPU::S_CMP_EQ_U64:
    return optimizeCmpAnd(1, 64, true, false);
  case AMDGPU::S_CMP_LG_U32:
  case AMDGPU::S_CMP_LG_I32:
  case AMDGPU::S_CMPK_LG_U32:
  case AMDGPU::S_CMPK_LG_I32:
    return optimizeCmpAnd(0, 32, true, false) ||
           optimizeCmpSelect(/*NeedInversion=*/false);
  case AMDGPU::S_CMP_GT_U32:
  case AMDGPU::S_CMPK_GT_U32:
    return optimizeCmpAnd(0, 32, false, false);
  case AMDGPU::S_CMP_GT_I32:
  case AMDGPU::S_CMPK_GT_I32:
    return optimizeCmpAnd(0, 32, false, true);
  case AMDGPU::S_CMP_LG_U64:
    return optimizeCmpAnd(0, 64, true, false) ||
           optimizeCmpSelect(/*NeedInversion=*/false);
  }

  return false;
}

void SIInstrInfo::enforceOperandRCAlignment(MachineInstr &MI,
                                            AMDGPU::OpName OpName) const {
  if (!ST.needsAlignedVGPRs())
    return;

  int OpNo = AMDGPU::getNamedOperandIdx(MI.getOpcode(), OpName);
  if (OpNo < 0)
    return;
  MachineOperand &Op = MI.getOperand(OpNo);
  if (getOpSize(MI, OpNo) > 4)
    return;

  // Add an implicit aligned super-reg to force alignment on the data operand.
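  // E.g. a 32-bit VGPR (or AGPR) data operand %d is rewritten to:
  //   %u = IMPLICIT_DEF
  //   %p = REG_SEQUENCE %d, sub0, %u, sub1   ; 64-bit, even-aligned class
  // and the operand becomes %p.sub0, with %p added as an implicit use.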
  const DebugLoc &DL = MI.getDebugLoc();
  MachineBasicBlock *BB = MI.getParent();
  MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
  Register DataReg = Op.getReg();
  bool IsAGPR = RI.isAGPR(MRI, DataReg);
  Register Undef = MRI.createVirtualRegister(
      IsAGPR ? &AMDGPU::AGPR_32RegClass : &AMDGPU::VGPR_32RegClass);
  BuildMI(*BB, MI, DL, get(AMDGPU::IMPLICIT_DEF), Undef);
  Register NewVR =
      MRI.createVirtualRegister(IsAGPR ? &AMDGPU::AReg_64_Align2RegClass
                                       : &AMDGPU::VReg_64_Align2RegClass);
  BuildMI(*BB, MI, DL, get(AMDGPU::REG_SEQUENCE), NewVR)
      .addReg(DataReg, {}, Op.getSubReg())
      .addImm(AMDGPU::sub0)
      .addReg(Undef)
      .addImm(AMDGPU::sub1);
  Op.setReg(NewVR);
  Op.setSubReg(AMDGPU::sub0);
  MI.addOperand(MachineOperand::CreateReg(NewVR, false, true));
}

bool SIInstrInfo::isGlobalMemoryObject(const MachineInstr *MI) const {
  if (isIGLP(*MI))
    return false;

  return TargetInstrInfo::isGlobalMemoryObject(MI);
}

bool SIInstrInfo::isXDLWMMA(const MachineInstr &MI) const {
  if (!isWMMA(MI) && !isSWMMAC(MI))
    return false;

  if (ST.hasGFX1250Insts())
    return AMDGPU::getWMMAIsXDL(MI.getOpcode());

  return true;
}

bool SIInstrInfo::isXDL(const MachineInstr &MI) const {
  unsigned Opcode = MI.getOpcode();

  if (AMDGPU::isGFX12Plus(ST))
    return isDOT(MI) || isXDLWMMA(MI);

  if (!isMAI(MI) || isDGEMM(Opcode) ||
      Opcode == AMDGPU::V_ACCVGPR_WRITE_B32_e64 ||
      Opcode == AMDGPU::V_ACCVGPR_READ_B32_e64)
    return false;

  if (!ST.hasGFX940Insts())
    return true;

  return AMDGPU::getMAIIsGFX940XDL(Opcode);
}