1//===- SIInstrInfo.cpp - SI Instruction Information ----------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9/// \file
10/// SI Implementation of TargetInstrInfo.
11//
12//===----------------------------------------------------------------------===//
13
14#include "SIInstrInfo.h"
15#include "AMDGPU.h"
16#include "AMDGPUInstrInfo.h"
17#include "GCNHazardRecognizer.h"
18#include "GCNSubtarget.h"
19#include "SIMachineFunctionInfo.h"
20#include "Utils/AMDGPUBaseInfo.h"
21#include "llvm/Analysis/ValueTracking.h"
22#include "llvm/CodeGen/GlobalISel/GenericMachineInstrs.h"
23#include "llvm/CodeGen/LiveIntervals.h"
24#include "llvm/CodeGen/LiveVariables.h"
25#include "llvm/CodeGen/MachineDominators.h"
26#include "llvm/CodeGen/MachineFrameInfo.h"
27#include "llvm/CodeGen/MachineScheduler.h"
28#include "llvm/CodeGen/RegisterScavenging.h"
29#include "llvm/CodeGen/ScheduleDAG.h"
30#include "llvm/IR/DiagnosticInfo.h"
31#include "llvm/IR/IntrinsicsAMDGPU.h"
32#include "llvm/MC/MCContext.h"
33#include "llvm/Support/CommandLine.h"
34#include "llvm/Target/TargetMachine.h"
35
36using namespace llvm;
37
38#define DEBUG_TYPE "si-instr-info"
39
40#define GET_INSTRINFO_CTOR_DTOR
41#include "AMDGPUGenInstrInfo.inc"
42
43namespace llvm::AMDGPU {
44#define GET_D16ImageDimIntrinsics_IMPL
45#define GET_ImageDimIntrinsicTable_IMPL
46#define GET_RsrcIntrinsics_IMPL
47#include "AMDGPUGenSearchableTables.inc"
48} // namespace llvm::AMDGPU
49
50// Must be at least 4 to be able to branch over minimum unconditional branch
51// code. This is only for making it possible to write reasonably small tests for
52// long branches.
53static cl::opt<unsigned>
54BranchOffsetBits("amdgpu-s-branch-bits", cl::ReallyHidden, cl::init(Val: 16),
55 cl::desc("Restrict range of branch instructions (DEBUG)"));
56
57static cl::opt<bool> Fix16BitCopies(
58 "amdgpu-fix-16-bit-physreg-copies",
59 cl::desc("Fix copies between 32 and 16 bit registers by extending to 32 bit"),
60 cl::init(Val: true),
61 cl::ReallyHidden);
62
63SIInstrInfo::SIInstrInfo(const GCNSubtarget &ST)
64 : AMDGPUGenInstrInfo(AMDGPU::ADJCALLSTACKUP, AMDGPU::ADJCALLSTACKDOWN),
65 RI(ST), ST(ST) {
66 SchedModel.init(TSInfo: &ST);
67}
68
69//===----------------------------------------------------------------------===//
70// TargetInstrInfo callbacks
71//===----------------------------------------------------------------------===//
72
73static unsigned getNumOperandsNoGlue(SDNode *Node) {
74 unsigned N = Node->getNumOperands();
75 while (N && Node->getOperand(Num: N - 1).getValueType() == MVT::Glue)
76 --N;
77 return N;
78}
79
80/// Returns true if both nodes have the same value for the given
81/// operand \p Op, or if both nodes do not have this operand.
82static bool nodesHaveSameOperandValue(SDNode *N0, SDNode *N1,
83 AMDGPU::OpName OpName) {
84 unsigned Opc0 = N0->getMachineOpcode();
85 unsigned Opc1 = N1->getMachineOpcode();
86
87 int Op0Idx = AMDGPU::getNamedOperandIdx(Opcode: Opc0, Name: OpName);
88 int Op1Idx = AMDGPU::getNamedOperandIdx(Opcode: Opc1, Name: OpName);
89
90 if (Op0Idx == -1 && Op1Idx == -1)
91 return true;
92
93
94 if ((Op0Idx == -1 && Op1Idx != -1) ||
95 (Op1Idx == -1 && Op0Idx != -1))
96 return false;
97
98 // getNamedOperandIdx returns the index for the MachineInstr's operands,
99 // which includes the result as the first operand. We are indexing into the
100 // MachineSDNode's operands, so we need to skip the result operand to get
101 // the real index.
102 --Op0Idx;
103 --Op1Idx;
104
105 return N0->getOperand(Num: Op0Idx) == N1->getOperand(Num: Op1Idx);
106}
107
108static bool canRemat(const MachineInstr &MI) {
109
110 if (SIInstrInfo::isVOP1(MI) || SIInstrInfo::isVOP2(MI) ||
111 SIInstrInfo::isVOP3(MI) || SIInstrInfo::isSDWA(MI) ||
112 SIInstrInfo::isSALU(MI))
113 return true;
114
115 if (SIInstrInfo::isSMRD(MI)) {
116 return !MI.memoperands_empty() &&
117 llvm::all_of(Range: MI.memoperands(), P: [](const MachineMemOperand *MMO) {
118 return MMO->isLoad() && MMO->isInvariant();
119 });
120 }
121
122 return false;
123}
124
125bool SIInstrInfo::isReallyTriviallyReMaterializable(
126 const MachineInstr &MI) const {
127
128 if (canRemat(MI)) {
129 // Normally VALU use of exec would block the rematerialization, but that
130 // is OK in this case to have an implicit exec read as all VALU do.
131 // We really want all of the generic logic for this except for this.
132
133 // Another potential implicit use is mode register. The core logic of
134 // the RA will not attempt rematerialization if mode is set anywhere
135 // in the function, otherwise it is safe since mode is not changed.
136
137 // There is difference to generic method which does not allow
138 // rematerialization if there are virtual register uses. We allow this,
139 // therefore this method includes SOP instructions as well.
140 if (!MI.hasImplicitDef() &&
141 MI.getNumImplicitOperands() == MI.getDesc().implicit_uses().size() &&
142 !MI.mayRaiseFPException())
143 return true;
144 }
145
146 return TargetInstrInfo::isReallyTriviallyReMaterializable(MI);
147}
148
149// Returns true if the scalar result of a VALU instruction depends on exec.
150bool SIInstrInfo::resultDependsOnExec(const MachineInstr &MI) const {
151 // Ignore comparisons which are only used masked with exec.
152 // This allows some hoisting/sinking of VALU comparisons.
153 if (MI.isCompare()) {
154 const MachineOperand *Dst = getNamedOperand(MI, OperandName: AMDGPU::OpName::sdst);
155 if (!Dst)
156 return true;
157
158 Register DstReg = Dst->getReg();
159 if (!DstReg.isVirtual())
160 return true;
161
162 const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
163 for (MachineInstr &Use : MRI.use_nodbg_instructions(Reg: DstReg)) {
164 switch (Use.getOpcode()) {
165 case AMDGPU::S_AND_SAVEEXEC_B32:
166 case AMDGPU::S_AND_SAVEEXEC_B64:
167 break;
168 case AMDGPU::S_AND_B32:
169 case AMDGPU::S_AND_B64:
170 if (!Use.readsRegister(Reg: AMDGPU::EXEC, /*TRI=*/nullptr))
171 return true;
172 break;
173 default:
174 return true;
175 }
176 }
177 return false;
178 }
179
180 switch (MI.getOpcode()) {
181 default:
182 break;
183 case AMDGPU::V_READFIRSTLANE_B32:
184 return true;
185 }
186
187 return false;
188}
189
190bool SIInstrInfo::isIgnorableUse(const MachineOperand &MO) const {
191 // Any implicit use of exec by VALU is not a real register read.
192 return MO.getReg() == AMDGPU::EXEC && MO.isImplicit() &&
193 isVALU(MI: *MO.getParent()) && !resultDependsOnExec(MI: *MO.getParent());
194}
195
196bool SIInstrInfo::isSafeToSink(MachineInstr &MI,
197 MachineBasicBlock *SuccToSinkTo,
198 MachineCycleInfo *CI) const {
199 // Allow sinking if MI edits lane mask (divergent i1 in sgpr).
200 if (MI.getOpcode() == AMDGPU::SI_IF_BREAK)
201 return true;
202
203 MachineRegisterInfo &MRI = MI.getMF()->getRegInfo();
204 // Check if sinking of MI would create temporal divergent use.
205 for (auto Op : MI.uses()) {
206 if (Op.isReg() && Op.getReg().isVirtual() &&
207 RI.isSGPRClass(RC: MRI.getRegClass(Reg: Op.getReg()))) {
208 MachineInstr *SgprDef = MRI.getVRegDef(Reg: Op.getReg());
209
210 // SgprDef defined inside cycle
211 MachineCycle *FromCycle = CI->getCycle(Block: SgprDef->getParent());
212 if (FromCycle == nullptr)
213 continue;
214
215 MachineCycle *ToCycle = CI->getCycle(Block: SuccToSinkTo);
216 // Check if there is a FromCycle that contains SgprDef's basic block but
217 // does not contain SuccToSinkTo and also has divergent exit condition.
218 while (FromCycle && !FromCycle->contains(C: ToCycle)) {
219 SmallVector<MachineBasicBlock *, 1> ExitingBlocks;
220 FromCycle->getExitingBlocks(TmpStorage&: ExitingBlocks);
221
222 // FromCycle has divergent exit condition.
223 for (MachineBasicBlock *ExitingBlock : ExitingBlocks) {
224 if (hasDivergentBranch(MBB: ExitingBlock))
225 return false;
226 }
227
228 FromCycle = FromCycle->getParentCycle();
229 }
230 }
231 }
232
233 return true;
234}
235
236bool SIInstrInfo::areLoadsFromSameBasePtr(SDNode *Load0, SDNode *Load1,
237 int64_t &Offset0,
238 int64_t &Offset1) const {
239 if (!Load0->isMachineOpcode() || !Load1->isMachineOpcode())
240 return false;
241
242 unsigned Opc0 = Load0->getMachineOpcode();
243 unsigned Opc1 = Load1->getMachineOpcode();
244
245 // Make sure both are actually loads.
246 if (!get(Opcode: Opc0).mayLoad() || !get(Opcode: Opc1).mayLoad())
247 return false;
248
249 // A mayLoad instruction without a def is not a load. Likely a prefetch.
250 if (!get(Opcode: Opc0).getNumDefs() || !get(Opcode: Opc1).getNumDefs())
251 return false;
252
253 if (isDS(Opcode: Opc0) && isDS(Opcode: Opc1)) {
254
255 // FIXME: Handle this case:
256 if (getNumOperandsNoGlue(Node: Load0) != getNumOperandsNoGlue(Node: Load1))
257 return false;
258
259 // Check base reg.
260 if (Load0->getOperand(Num: 0) != Load1->getOperand(Num: 0))
261 return false;
262
263 // Skip read2 / write2 variants for simplicity.
264 // TODO: We should report true if the used offsets are adjacent (excluded
265 // st64 versions).
266 int Offset0Idx = AMDGPU::getNamedOperandIdx(Opcode: Opc0, Name: AMDGPU::OpName::offset);
267 int Offset1Idx = AMDGPU::getNamedOperandIdx(Opcode: Opc1, Name: AMDGPU::OpName::offset);
268 if (Offset0Idx == -1 || Offset1Idx == -1)
269 return false;
270
271 // XXX - be careful of dataless loads
272 // getNamedOperandIdx returns the index for MachineInstrs. Since they
273 // include the output in the operand list, but SDNodes don't, we need to
274 // subtract the index by one.
275 Offset0Idx -= get(Opcode: Opc0).NumDefs;
276 Offset1Idx -= get(Opcode: Opc1).NumDefs;
277 Offset0 = Load0->getConstantOperandVal(Num: Offset0Idx);
278 Offset1 = Load1->getConstantOperandVal(Num: Offset1Idx);
279 return true;
280 }
281
282 if (isSMRD(Opcode: Opc0) && isSMRD(Opcode: Opc1)) {
283 // Skip time and cache invalidation instructions.
284 if (!AMDGPU::hasNamedOperand(Opcode: Opc0, NamedIdx: AMDGPU::OpName::sbase) ||
285 !AMDGPU::hasNamedOperand(Opcode: Opc1, NamedIdx: AMDGPU::OpName::sbase))
286 return false;
287
288 unsigned NumOps = getNumOperandsNoGlue(Node: Load0);
289 if (NumOps != getNumOperandsNoGlue(Node: Load1))
290 return false;
291
292 // Check base reg.
293 if (Load0->getOperand(Num: 0) != Load1->getOperand(Num: 0))
294 return false;
295
296 // Match register offsets, if both register and immediate offsets present.
297 assert(NumOps == 4 || NumOps == 5);
298 if (NumOps == 5 && Load0->getOperand(Num: 1) != Load1->getOperand(Num: 1))
299 return false;
300
301 const ConstantSDNode *Load0Offset =
302 dyn_cast<ConstantSDNode>(Val: Load0->getOperand(Num: NumOps - 3));
303 const ConstantSDNode *Load1Offset =
304 dyn_cast<ConstantSDNode>(Val: Load1->getOperand(Num: NumOps - 3));
305
306 if (!Load0Offset || !Load1Offset)
307 return false;
308
309 Offset0 = Load0Offset->getZExtValue();
310 Offset1 = Load1Offset->getZExtValue();
311 return true;
312 }
313
314 // MUBUF and MTBUF can access the same addresses.
315 if ((isMUBUF(Opcode: Opc0) || isMTBUF(Opcode: Opc0)) && (isMUBUF(Opcode: Opc1) || isMTBUF(Opcode: Opc1))) {
316
317 // MUBUF and MTBUF have vaddr at different indices.
318 if (!nodesHaveSameOperandValue(N0: Load0, N1: Load1, OpName: AMDGPU::OpName::soffset) ||
319 !nodesHaveSameOperandValue(N0: Load0, N1: Load1, OpName: AMDGPU::OpName::vaddr) ||
320 !nodesHaveSameOperandValue(N0: Load0, N1: Load1, OpName: AMDGPU::OpName::srsrc))
321 return false;
322
323 int OffIdx0 = AMDGPU::getNamedOperandIdx(Opcode: Opc0, Name: AMDGPU::OpName::offset);
324 int OffIdx1 = AMDGPU::getNamedOperandIdx(Opcode: Opc1, Name: AMDGPU::OpName::offset);
325
326 if (OffIdx0 == -1 || OffIdx1 == -1)
327 return false;
328
329 // getNamedOperandIdx returns the index for MachineInstrs. Since they
330 // include the output in the operand list, but SDNodes don't, we need to
331 // subtract the index by one.
332 OffIdx0 -= get(Opcode: Opc0).NumDefs;
333 OffIdx1 -= get(Opcode: Opc1).NumDefs;
334
335 SDValue Off0 = Load0->getOperand(Num: OffIdx0);
336 SDValue Off1 = Load1->getOperand(Num: OffIdx1);
337
338 // The offset might be a FrameIndexSDNode.
339 if (!isa<ConstantSDNode>(Val: Off0) || !isa<ConstantSDNode>(Val: Off1))
340 return false;
341
342 Offset0 = Off0->getAsZExtVal();
343 Offset1 = Off1->getAsZExtVal();
344 return true;
345 }
346
347 return false;
348}
349
350static bool isStride64(unsigned Opc) {
351 switch (Opc) {
352 case AMDGPU::DS_READ2ST64_B32:
353 case AMDGPU::DS_READ2ST64_B64:
354 case AMDGPU::DS_WRITE2ST64_B32:
355 case AMDGPU::DS_WRITE2ST64_B64:
356 return true;
357 default:
358 return false;
359 }
360}
361
362bool SIInstrInfo::getMemOperandsWithOffsetWidth(
363 const MachineInstr &LdSt, SmallVectorImpl<const MachineOperand *> &BaseOps,
364 int64_t &Offset, bool &OffsetIsScalable, LocationSize &Width,
365 const TargetRegisterInfo *TRI) const {
366 if (!LdSt.mayLoadOrStore())
367 return false;
368
369 unsigned Opc = LdSt.getOpcode();
370 OffsetIsScalable = false;
371 const MachineOperand *BaseOp, *OffsetOp;
372 int DataOpIdx;
373
374 if (isDS(MI: LdSt)) {
375 BaseOp = getNamedOperand(MI: LdSt, OperandName: AMDGPU::OpName::addr);
376 OffsetOp = getNamedOperand(MI: LdSt, OperandName: AMDGPU::OpName::offset);
377 if (OffsetOp) {
378 // Normal, single offset LDS instruction.
379 if (!BaseOp) {
380 // DS_CONSUME/DS_APPEND use M0 for the base address.
381 // TODO: find the implicit use operand for M0 and use that as BaseOp?
382 return false;
383 }
384 BaseOps.push_back(Elt: BaseOp);
385 Offset = OffsetOp->getImm();
386 // Get appropriate operand, and compute width accordingly.
387 DataOpIdx = AMDGPU::getNamedOperandIdx(Opcode: Opc, Name: AMDGPU::OpName::vdst);
388 if (DataOpIdx == -1)
389 DataOpIdx = AMDGPU::getNamedOperandIdx(Opcode: Opc, Name: AMDGPU::OpName::data0);
390 if (Opc == AMDGPU::DS_ATOMIC_ASYNC_BARRIER_ARRIVE_B64)
391 Width = LocationSize::precise(Value: 64);
392 else
393 Width = LocationSize::precise(Value: getOpSize(MI: LdSt, OpNo: DataOpIdx));
394 } else {
395 // The 2 offset instructions use offset0 and offset1 instead. We can treat
396 // these as a load with a single offset if the 2 offsets are consecutive.
397 // We will use this for some partially aligned loads.
398 const MachineOperand *Offset0Op =
399 getNamedOperand(MI: LdSt, OperandName: AMDGPU::OpName::offset0);
400 const MachineOperand *Offset1Op =
401 getNamedOperand(MI: LdSt, OperandName: AMDGPU::OpName::offset1);
402
403 unsigned Offset0 = Offset0Op->getImm() & 0xff;
404 unsigned Offset1 = Offset1Op->getImm() & 0xff;
405 if (Offset0 + 1 != Offset1)
406 return false;
407
408 // Each of these offsets is in element sized units, so we need to convert
409 // to bytes of the individual reads.
410
411 unsigned EltSize;
412 if (LdSt.mayLoad())
413 EltSize = TRI->getRegSizeInBits(RC: *getOpRegClass(MI: LdSt, OpNo: 0)) / 16;
414 else {
415 assert(LdSt.mayStore());
416 int Data0Idx = AMDGPU::getNamedOperandIdx(Opcode: Opc, Name: AMDGPU::OpName::data0);
417 EltSize = TRI->getRegSizeInBits(RC: *getOpRegClass(MI: LdSt, OpNo: Data0Idx)) / 8;
418 }
419
420 if (isStride64(Opc))
421 EltSize *= 64;
422
423 BaseOps.push_back(Elt: BaseOp);
424 Offset = EltSize * Offset0;
425 // Get appropriate operand(s), and compute width accordingly.
426 DataOpIdx = AMDGPU::getNamedOperandIdx(Opcode: Opc, Name: AMDGPU::OpName::vdst);
427 if (DataOpIdx == -1) {
428 DataOpIdx = AMDGPU::getNamedOperandIdx(Opcode: Opc, Name: AMDGPU::OpName::data0);
429 Width = LocationSize::precise(Value: getOpSize(MI: LdSt, OpNo: DataOpIdx));
430 DataOpIdx = AMDGPU::getNamedOperandIdx(Opcode: Opc, Name: AMDGPU::OpName::data1);
431 Width = LocationSize::precise(
432 Value: Width.getValue() + TypeSize::getFixed(ExactSize: getOpSize(MI: LdSt, OpNo: DataOpIdx)));
433 } else {
434 Width = LocationSize::precise(Value: getOpSize(MI: LdSt, OpNo: DataOpIdx));
435 }
436 }
437 return true;
438 }
439
440 if (isMUBUF(MI: LdSt) || isMTBUF(MI: LdSt)) {
441 const MachineOperand *RSrc = getNamedOperand(MI: LdSt, OperandName: AMDGPU::OpName::srsrc);
442 if (!RSrc) // e.g. BUFFER_WBINVL1_VOL
443 return false;
444 BaseOps.push_back(Elt: RSrc);
445 BaseOp = getNamedOperand(MI: LdSt, OperandName: AMDGPU::OpName::vaddr);
446 if (BaseOp && !BaseOp->isFI())
447 BaseOps.push_back(Elt: BaseOp);
448 const MachineOperand *OffsetImm =
449 getNamedOperand(MI: LdSt, OperandName: AMDGPU::OpName::offset);
450 Offset = OffsetImm->getImm();
451 const MachineOperand *SOffset =
452 getNamedOperand(MI: LdSt, OperandName: AMDGPU::OpName::soffset);
453 if (SOffset) {
454 if (SOffset->isReg())
455 BaseOps.push_back(Elt: SOffset);
456 else
457 Offset += SOffset->getImm();
458 }
459 // Get appropriate operand, and compute width accordingly.
460 DataOpIdx = AMDGPU::getNamedOperandIdx(Opcode: Opc, Name: AMDGPU::OpName::vdst);
461 if (DataOpIdx == -1)
462 DataOpIdx = AMDGPU::getNamedOperandIdx(Opcode: Opc, Name: AMDGPU::OpName::vdata);
463 if (DataOpIdx == -1) // LDS DMA
464 return false;
465 Width = LocationSize::precise(Value: getOpSize(MI: LdSt, OpNo: DataOpIdx));
466 return true;
467 }
468
469 if (isImage(MI: LdSt)) {
470 auto RsrcOpName =
471 isMIMG(MI: LdSt) ? AMDGPU::OpName::srsrc : AMDGPU::OpName::rsrc;
472 int SRsrcIdx = AMDGPU::getNamedOperandIdx(Opcode: Opc, Name: RsrcOpName);
473 BaseOps.push_back(Elt: &LdSt.getOperand(i: SRsrcIdx));
474 int VAddr0Idx = AMDGPU::getNamedOperandIdx(Opcode: Opc, Name: AMDGPU::OpName::vaddr0);
475 if (VAddr0Idx >= 0) {
476 // GFX10 possible NSA encoding.
477 for (int I = VAddr0Idx; I < SRsrcIdx; ++I)
478 BaseOps.push_back(Elt: &LdSt.getOperand(i: I));
479 } else {
480 BaseOps.push_back(Elt: getNamedOperand(MI: LdSt, OperandName: AMDGPU::OpName::vaddr));
481 }
482 Offset = 0;
483 // Get appropriate operand, and compute width accordingly.
484 DataOpIdx = AMDGPU::getNamedOperandIdx(Opcode: Opc, Name: AMDGPU::OpName::vdata);
485 if (DataOpIdx == -1)
486 return false; // no return sampler
487 Width = LocationSize::precise(Value: getOpSize(MI: LdSt, OpNo: DataOpIdx));
488 return true;
489 }
490
491 if (isSMRD(MI: LdSt)) {
492 BaseOp = getNamedOperand(MI: LdSt, OperandName: AMDGPU::OpName::sbase);
493 if (!BaseOp) // e.g. S_MEMTIME
494 return false;
495 BaseOps.push_back(Elt: BaseOp);
496 OffsetOp = getNamedOperand(MI: LdSt, OperandName: AMDGPU::OpName::offset);
497 Offset = OffsetOp ? OffsetOp->getImm() : 0;
498 // Get appropriate operand, and compute width accordingly.
499 DataOpIdx = AMDGPU::getNamedOperandIdx(Opcode: Opc, Name: AMDGPU::OpName::sdst);
500 if (DataOpIdx == -1)
501 return false;
502 Width = LocationSize::precise(Value: getOpSize(MI: LdSt, OpNo: DataOpIdx));
503 return true;
504 }
505
506 if (isFLAT(MI: LdSt)) {
507 // Instructions have either vaddr or saddr or both or none.
508 BaseOp = getNamedOperand(MI: LdSt, OperandName: AMDGPU::OpName::vaddr);
509 if (BaseOp)
510 BaseOps.push_back(Elt: BaseOp);
511 BaseOp = getNamedOperand(MI: LdSt, OperandName: AMDGPU::OpName::saddr);
512 if (BaseOp)
513 BaseOps.push_back(Elt: BaseOp);
514 Offset = getNamedOperand(MI: LdSt, OperandName: AMDGPU::OpName::offset)->getImm();
515 // Get appropriate operand, and compute width accordingly.
516 DataOpIdx = AMDGPU::getNamedOperandIdx(Opcode: Opc, Name: AMDGPU::OpName::vdst);
517 if (DataOpIdx == -1)
518 DataOpIdx = AMDGPU::getNamedOperandIdx(Opcode: Opc, Name: AMDGPU::OpName::vdata);
519 if (DataOpIdx == -1) // LDS DMA
520 return false;
521 Width = LocationSize::precise(Value: getOpSize(MI: LdSt, OpNo: DataOpIdx));
522 return true;
523 }
524
525 return false;
526}
527
528static bool memOpsHaveSameBasePtr(const MachineInstr &MI1,
529 ArrayRef<const MachineOperand *> BaseOps1,
530 const MachineInstr &MI2,
531 ArrayRef<const MachineOperand *> BaseOps2) {
532 // Only examine the first "base" operand of each instruction, on the
533 // assumption that it represents the real base address of the memory access.
534 // Other operands are typically offsets or indices from this base address.
535 if (BaseOps1.front()->isIdenticalTo(Other: *BaseOps2.front()))
536 return true;
537
538 if (!MI1.hasOneMemOperand() || !MI2.hasOneMemOperand())
539 return false;
540
541 auto *MO1 = *MI1.memoperands_begin();
542 auto *MO2 = *MI2.memoperands_begin();
543 if (MO1->getAddrSpace() != MO2->getAddrSpace())
544 return false;
545
546 const auto *Base1 = MO1->getValue();
547 const auto *Base2 = MO2->getValue();
548 if (!Base1 || !Base2)
549 return false;
550 Base1 = getUnderlyingObject(V: Base1);
551 Base2 = getUnderlyingObject(V: Base2);
552
553 if (isa<UndefValue>(Val: Base1) || isa<UndefValue>(Val: Base2))
554 return false;
555
556 return Base1 == Base2;
557}
558
559bool SIInstrInfo::shouldClusterMemOps(ArrayRef<const MachineOperand *> BaseOps1,
560 int64_t Offset1, bool OffsetIsScalable1,
561 ArrayRef<const MachineOperand *> BaseOps2,
562 int64_t Offset2, bool OffsetIsScalable2,
563 unsigned ClusterSize,
564 unsigned NumBytes) const {
565 // If the mem ops (to be clustered) do not have the same base ptr, then they
566 // should not be clustered
567 unsigned MaxMemoryClusterDWords = DefaultMemoryClusterDWordsLimit;
568 if (!BaseOps1.empty() && !BaseOps2.empty()) {
569 const MachineInstr &FirstLdSt = *BaseOps1.front()->getParent();
570 const MachineInstr &SecondLdSt = *BaseOps2.front()->getParent();
571 if (!memOpsHaveSameBasePtr(MI1: FirstLdSt, BaseOps1, MI2: SecondLdSt, BaseOps2))
572 return false;
573
574 const SIMachineFunctionInfo *MFI =
575 FirstLdSt.getMF()->getInfo<SIMachineFunctionInfo>();
576 MaxMemoryClusterDWords = MFI->getMaxMemoryClusterDWords();
577 } else if (!BaseOps1.empty() || !BaseOps2.empty()) {
578 // If only one base op is empty, they do not have the same base ptr
579 return false;
580 }
581
582 // In order to avoid register pressure, on an average, the number of DWORDS
583 // loaded together by all clustered mem ops should not exceed
584 // MaxMemoryClusterDWords. This is an empirical value based on certain
585 // observations and performance related experiments.
586 // The good thing about this heuristic is - it avoids clustering of too many
587 // sub-word loads, and also avoids clustering of wide loads. Below is the
588 // brief summary of how the heuristic behaves for various `LoadSize` when
589 // MaxMemoryClusterDWords is 8.
590 //
591 // (1) 1 <= LoadSize <= 4: cluster at max 8 mem ops
592 // (2) 5 <= LoadSize <= 8: cluster at max 4 mem ops
593 // (3) 9 <= LoadSize <= 12: cluster at max 2 mem ops
594 // (4) 13 <= LoadSize <= 16: cluster at max 2 mem ops
595 // (5) LoadSize >= 17: do not cluster
596 const unsigned LoadSize = NumBytes / ClusterSize;
597 const unsigned NumDWords = ((LoadSize + 3) / 4) * ClusterSize;
598 return NumDWords <= MaxMemoryClusterDWords;
599}
600
601// FIXME: This behaves strangely. If, for example, you have 32 load + stores,
602// the first 16 loads will be interleaved with the stores, and the next 16 will
603// be clustered as expected. It should really split into 2 16 store batches.
604//
605// Loads are clustered until this returns false, rather than trying to schedule
606// groups of stores. This also means we have to deal with saying different
607// address space loads should be clustered, and ones which might cause bank
608// conflicts.
609//
610// This might be deprecated so it might not be worth that much effort to fix.
611bool SIInstrInfo::shouldScheduleLoadsNear(SDNode *Load0, SDNode *Load1,
612 int64_t Offset0, int64_t Offset1,
613 unsigned NumLoads) const {
614 assert(Offset1 > Offset0 &&
615 "Second offset should be larger than first offset!");
616 // If we have less than 16 loads in a row, and the offsets are within 64
617 // bytes, then schedule together.
618
619 // A cacheline is 64 bytes (for global memory).
620 return (NumLoads <= 16 && (Offset1 - Offset0) < 64);
621}
622
623static void reportIllegalCopy(const SIInstrInfo *TII, MachineBasicBlock &MBB,
624 MachineBasicBlock::iterator MI,
625 const DebugLoc &DL, MCRegister DestReg,
626 MCRegister SrcReg, bool KillSrc,
627 const char *Msg = "illegal VGPR to SGPR copy") {
628 MachineFunction *MF = MBB.getParent();
629
630 LLVMContext &C = MF->getFunction().getContext();
631 C.diagnose(DI: DiagnosticInfoUnsupported(MF->getFunction(), Msg, DL, DS_Error));
632
633 BuildMI(BB&: MBB, I: MI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::SI_ILLEGAL_COPY), DestReg)
634 .addReg(RegNo: SrcReg, flags: getKillRegState(B: KillSrc));
635}
636
637/// Handle copying from SGPR to AGPR, or from AGPR to AGPR on GFX908. It is not
638/// possible to have a direct copy in these cases on GFX908, so an intermediate
639/// VGPR copy is required.
640static void indirectCopyToAGPR(const SIInstrInfo &TII,
641 MachineBasicBlock &MBB,
642 MachineBasicBlock::iterator MI,
643 const DebugLoc &DL, MCRegister DestReg,
644 MCRegister SrcReg, bool KillSrc,
645 RegScavenger &RS, bool RegsOverlap,
646 Register ImpDefSuperReg = Register(),
647 Register ImpUseSuperReg = Register()) {
648 assert((TII.getSubtarget().hasMAIInsts() &&
649 !TII.getSubtarget().hasGFX90AInsts()) &&
650 "Expected GFX908 subtarget.");
651
652 assert((AMDGPU::SReg_32RegClass.contains(SrcReg) ||
653 AMDGPU::AGPR_32RegClass.contains(SrcReg)) &&
654 "Source register of the copy should be either an SGPR or an AGPR.");
655
656 assert(AMDGPU::AGPR_32RegClass.contains(DestReg) &&
657 "Destination register of the copy should be an AGPR.");
658
659 const SIRegisterInfo &RI = TII.getRegisterInfo();
660
661 // First try to find defining accvgpr_write to avoid temporary registers.
662 // In the case of copies of overlapping AGPRs, we conservatively do not
663 // reuse previous accvgpr_writes. Otherwise, we may incorrectly pick up
664 // an accvgpr_write used for this same copy due to implicit-defs
665 if (!RegsOverlap) {
666 for (auto Def = MI, E = MBB.begin(); Def != E; ) {
667 --Def;
668
669 if (!Def->modifiesRegister(Reg: SrcReg, TRI: &RI))
670 continue;
671
672 if (Def->getOpcode() != AMDGPU::V_ACCVGPR_WRITE_B32_e64 ||
673 Def->getOperand(i: 0).getReg() != SrcReg)
674 break;
675
676 MachineOperand &DefOp = Def->getOperand(i: 1);
677 assert(DefOp.isReg() || DefOp.isImm());
678
679 if (DefOp.isReg()) {
680 bool SafeToPropagate = true;
681 // Check that register source operand is not clobbered before MI.
682 // Immediate operands are always safe to propagate.
683 for (auto I = Def; I != MI && SafeToPropagate; ++I)
684 if (I->modifiesRegister(Reg: DefOp.getReg(), TRI: &RI))
685 SafeToPropagate = false;
686
687 if (!SafeToPropagate)
688 break;
689
690 DefOp.setIsKill(false);
691 }
692
693 MachineInstrBuilder Builder =
694 BuildMI(BB&: MBB, I: MI, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::V_ACCVGPR_WRITE_B32_e64), DestReg)
695 .add(MO: DefOp);
696 if (ImpDefSuperReg)
697 Builder.addReg(RegNo: ImpDefSuperReg, flags: RegState::Define | RegState::Implicit);
698
699 if (ImpUseSuperReg) {
700 Builder.addReg(RegNo: ImpUseSuperReg,
701 flags: getKillRegState(B: KillSrc) | RegState::Implicit);
702 }
703
704 return;
705 }
706 }
707
708 RS.enterBasicBlockEnd(MBB);
709 RS.backward(I: std::next(x: MI));
710
711 // Ideally we want to have three registers for a long reg_sequence copy
712 // to hide 2 waitstates between v_mov_b32 and accvgpr_write.
713 unsigned MaxVGPRs = RI.getRegPressureLimit(RC: &AMDGPU::VGPR_32RegClass,
714 MF&: *MBB.getParent());
715
716 // Registers in the sequence are allocated contiguously so we can just
717 // use register number to pick one of three round-robin temps.
718 unsigned RegNo = (DestReg - AMDGPU::AGPR0) % 3;
719 Register Tmp =
720 MBB.getParent()->getInfo<SIMachineFunctionInfo>()->getVGPRForAGPRCopy();
721 assert(MBB.getParent()->getRegInfo().isReserved(Tmp) &&
722 "VGPR used for an intermediate copy should have been reserved.");
723
724 // Only loop through if there are any free registers left. We don't want to
725 // spill.
726 while (RegNo--) {
727 Register Tmp2 = RS.scavengeRegisterBackwards(RC: AMDGPU::VGPR_32RegClass, To: MI,
728 /* RestoreAfter */ false, SPAdj: 0,
729 /* AllowSpill */ false);
730 if (!Tmp2 || RI.getHWRegIndex(Reg: Tmp2) >= MaxVGPRs)
731 break;
732 Tmp = Tmp2;
733 RS.setRegUsed(Reg: Tmp);
734 }
735
736 // Insert copy to temporary VGPR.
737 unsigned TmpCopyOp = AMDGPU::V_MOV_B32_e32;
738 if (AMDGPU::AGPR_32RegClass.contains(Reg: SrcReg)) {
739 TmpCopyOp = AMDGPU::V_ACCVGPR_READ_B32_e64;
740 } else {
741 assert(AMDGPU::SReg_32RegClass.contains(SrcReg));
742 }
743
744 MachineInstrBuilder UseBuilder = BuildMI(BB&: MBB, I: MI, MIMD: DL, MCID: TII.get(Opcode: TmpCopyOp), DestReg: Tmp)
745 .addReg(RegNo: SrcReg, flags: getKillRegState(B: KillSrc));
746 if (ImpUseSuperReg) {
747 UseBuilder.addReg(RegNo: ImpUseSuperReg,
748 flags: getKillRegState(B: KillSrc) | RegState::Implicit);
749 }
750
751 MachineInstrBuilder DefBuilder
752 = BuildMI(BB&: MBB, I: MI, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::V_ACCVGPR_WRITE_B32_e64), DestReg)
753 .addReg(RegNo: Tmp, flags: RegState::Kill);
754
755 if (ImpDefSuperReg)
756 DefBuilder.addReg(RegNo: ImpDefSuperReg, flags: RegState::Define | RegState::Implicit);
757}
758
759static void expandSGPRCopy(const SIInstrInfo &TII, MachineBasicBlock &MBB,
760 MachineBasicBlock::iterator MI, const DebugLoc &DL,
761 MCRegister DestReg, MCRegister SrcReg, bool KillSrc,
762 const TargetRegisterClass *RC, bool Forward) {
763 const SIRegisterInfo &RI = TII.getRegisterInfo();
764 ArrayRef<int16_t> BaseIndices = RI.getRegSplitParts(RC, EltSize: 4);
765 MachineBasicBlock::iterator I = MI;
766 MachineInstr *FirstMI = nullptr, *LastMI = nullptr;
767
768 for (unsigned Idx = 0; Idx < BaseIndices.size(); ++Idx) {
769 int16_t SubIdx = BaseIndices[Idx];
770 Register DestSubReg = RI.getSubReg(Reg: DestReg, Idx: SubIdx);
771 Register SrcSubReg = RI.getSubReg(Reg: SrcReg, Idx: SubIdx);
772 assert(DestSubReg && SrcSubReg && "Failed to find subregs!");
773 unsigned Opcode = AMDGPU::S_MOV_B32;
774
775 // Is SGPR aligned? If so try to combine with next.
776 bool AlignedDest = ((DestSubReg - AMDGPU::SGPR0) % 2) == 0;
777 bool AlignedSrc = ((SrcSubReg - AMDGPU::SGPR0) % 2) == 0;
778 if (AlignedDest && AlignedSrc && (Idx + 1 < BaseIndices.size())) {
779 // Can use SGPR64 copy
780 unsigned Channel = RI.getChannelFromSubReg(SubReg: SubIdx);
781 SubIdx = RI.getSubRegFromChannel(Channel, NumRegs: 2);
782 DestSubReg = RI.getSubReg(Reg: DestReg, Idx: SubIdx);
783 SrcSubReg = RI.getSubReg(Reg: SrcReg, Idx: SubIdx);
784 assert(DestSubReg && SrcSubReg && "Failed to find subregs!");
785 Opcode = AMDGPU::S_MOV_B64;
786 Idx++;
787 }
788
789 LastMI = BuildMI(BB&: MBB, I, MIMD: DL, MCID: TII.get(Opcode), DestReg: DestSubReg)
790 .addReg(RegNo: SrcSubReg)
791 .addReg(RegNo: SrcReg, flags: RegState::Implicit);
792
793 if (!FirstMI)
794 FirstMI = LastMI;
795
796 if (!Forward)
797 I--;
798 }
799
800 assert(FirstMI && LastMI);
801 if (!Forward)
802 std::swap(a&: FirstMI, b&: LastMI);
803
804 FirstMI->addOperand(
805 Op: MachineOperand::CreateReg(Reg: DestReg, isDef: true /*IsDef*/, isImp: true /*IsImp*/));
806
807 if (KillSrc)
808 LastMI->addRegisterKilled(IncomingReg: SrcReg, RegInfo: &RI);
809}
810
811void SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
812 MachineBasicBlock::iterator MI,
813 const DebugLoc &DL, Register DestReg,
814 Register SrcReg, bool KillSrc, bool RenamableDest,
815 bool RenamableSrc) const {
816 const TargetRegisterClass *RC = RI.getPhysRegBaseClass(Reg: DestReg);
817 unsigned Size = RI.getRegSizeInBits(RC: *RC);
818 const TargetRegisterClass *SrcRC = RI.getPhysRegBaseClass(Reg: SrcReg);
819 unsigned SrcSize = RI.getRegSizeInBits(RC: *SrcRC);
820
821 // The rest of copyPhysReg assumes Src and Dst size are the same size.
822 // TODO-GFX11_16BIT If all true 16 bit instruction patterns are completed can
823 // we remove Fix16BitCopies and this code block?
824 if (Fix16BitCopies) {
825 if (((Size == 16) != (SrcSize == 16))) {
826 // Non-VGPR Src and Dst will later be expanded back to 32 bits.
827 assert(ST.useRealTrue16Insts());
828 Register &RegToFix = (Size == 32) ? DestReg : SrcReg;
829 MCRegister SubReg = RI.getSubReg(Reg: RegToFix, Idx: AMDGPU::lo16);
830 RegToFix = SubReg;
831
832 if (DestReg == SrcReg) {
833 // Identity copy. Insert empty bundle since ExpandPostRA expects an
834 // instruction here.
835 BuildMI(BB&: MBB, I: MI, MIMD: DL, MCID: get(Opcode: AMDGPU::BUNDLE));
836 return;
837 }
838 RC = RI.getPhysRegBaseClass(Reg: DestReg);
839 Size = RI.getRegSizeInBits(RC: *RC);
840 SrcRC = RI.getPhysRegBaseClass(Reg: SrcReg);
841 SrcSize = RI.getRegSizeInBits(RC: *SrcRC);
842 }
843 }
844
845 if (RC == &AMDGPU::VGPR_32RegClass) {
846 assert(AMDGPU::VGPR_32RegClass.contains(SrcReg) ||
847 AMDGPU::SReg_32RegClass.contains(SrcReg) ||
848 AMDGPU::AGPR_32RegClass.contains(SrcReg));
849 unsigned Opc = AMDGPU::AGPR_32RegClass.contains(Reg: SrcReg) ?
850 AMDGPU::V_ACCVGPR_READ_B32_e64 : AMDGPU::V_MOV_B32_e32;
851 BuildMI(BB&: MBB, I: MI, MIMD: DL, MCID: get(Opcode: Opc), DestReg)
852 .addReg(RegNo: SrcReg, flags: getKillRegState(B: KillSrc));
853 return;
854 }
855
856 if (RC == &AMDGPU::SReg_32_XM0RegClass ||
857 RC == &AMDGPU::SReg_32RegClass) {
858 if (SrcReg == AMDGPU::SCC) {
859 BuildMI(BB&: MBB, I: MI, MIMD: DL, MCID: get(Opcode: AMDGPU::S_CSELECT_B32), DestReg)
860 .addImm(Val: 1)
861 .addImm(Val: 0);
862 return;
863 }
864
865 if (DestReg == AMDGPU::VCC_LO) {
866 if (AMDGPU::SReg_32RegClass.contains(Reg: SrcReg)) {
867 BuildMI(BB&: MBB, I: MI, MIMD: DL, MCID: get(Opcode: AMDGPU::S_MOV_B32), DestReg: AMDGPU::VCC_LO)
868 .addReg(RegNo: SrcReg, flags: getKillRegState(B: KillSrc));
869 } else {
870 // FIXME: Hack until VReg_1 removed.
871 assert(AMDGPU::VGPR_32RegClass.contains(SrcReg));
872 BuildMI(BB&: MBB, I: MI, MIMD: DL, MCID: get(Opcode: AMDGPU::V_CMP_NE_U32_e32))
873 .addImm(Val: 0)
874 .addReg(RegNo: SrcReg, flags: getKillRegState(B: KillSrc));
875 }
876
877 return;
878 }
879
880 if (!AMDGPU::SReg_32RegClass.contains(Reg: SrcReg)) {
881 reportIllegalCopy(TII: this, MBB, MI, DL, DestReg, SrcReg, KillSrc);
882 return;
883 }
884
885 BuildMI(BB&: MBB, I: MI, MIMD: DL, MCID: get(Opcode: AMDGPU::S_MOV_B32), DestReg)
886 .addReg(RegNo: SrcReg, flags: getKillRegState(B: KillSrc));
887 return;
888 }
889
890 if (RC == &AMDGPU::SReg_64RegClass) {
891 if (SrcReg == AMDGPU::SCC) {
892 BuildMI(BB&: MBB, I: MI, MIMD: DL, MCID: get(Opcode: AMDGPU::S_CSELECT_B64), DestReg)
893 .addImm(Val: 1)
894 .addImm(Val: 0);
895 return;
896 }
897
898 if (DestReg == AMDGPU::VCC) {
899 if (AMDGPU::SReg_64RegClass.contains(Reg: SrcReg)) {
900 BuildMI(BB&: MBB, I: MI, MIMD: DL, MCID: get(Opcode: AMDGPU::S_MOV_B64), DestReg: AMDGPU::VCC)
901 .addReg(RegNo: SrcReg, flags: getKillRegState(B: KillSrc));
902 } else {
903 // FIXME: Hack until VReg_1 removed.
904 assert(AMDGPU::VGPR_32RegClass.contains(SrcReg));
905 BuildMI(BB&: MBB, I: MI, MIMD: DL, MCID: get(Opcode: AMDGPU::V_CMP_NE_U32_e32))
906 .addImm(Val: 0)
907 .addReg(RegNo: SrcReg, flags: getKillRegState(B: KillSrc));
908 }
909
910 return;
911 }
912
913 if (!AMDGPU::SReg_64RegClass.contains(Reg: SrcReg)) {
914 reportIllegalCopy(TII: this, MBB, MI, DL, DestReg, SrcReg, KillSrc);
915 return;
916 }
917
918 BuildMI(BB&: MBB, I: MI, MIMD: DL, MCID: get(Opcode: AMDGPU::S_MOV_B64), DestReg)
919 .addReg(RegNo: SrcReg, flags: getKillRegState(B: KillSrc));
920 return;
921 }
922
923 if (DestReg == AMDGPU::SCC) {
924 // Copying 64-bit or 32-bit sources to SCC barely makes sense,
925 // but SelectionDAG emits such copies for i1 sources.
926 if (AMDGPU::SReg_64RegClass.contains(Reg: SrcReg)) {
927 // This copy can only be produced by patterns
928 // with explicit SCC, which are known to be enabled
929 // only for subtargets with S_CMP_LG_U64 present.
930 assert(ST.hasScalarCompareEq64());
931 BuildMI(BB&: MBB, I: MI, MIMD: DL, MCID: get(Opcode: AMDGPU::S_CMP_LG_U64))
932 .addReg(RegNo: SrcReg, flags: getKillRegState(B: KillSrc))
933 .addImm(Val: 0);
934 } else {
935 assert(AMDGPU::SReg_32RegClass.contains(SrcReg));
936 BuildMI(BB&: MBB, I: MI, MIMD: DL, MCID: get(Opcode: AMDGPU::S_CMP_LG_U32))
937 .addReg(RegNo: SrcReg, flags: getKillRegState(B: KillSrc))
938 .addImm(Val: 0);
939 }
940
941 return;
942 }
943
944 if (RC == &AMDGPU::AGPR_32RegClass) {
945 if (AMDGPU::VGPR_32RegClass.contains(Reg: SrcReg) ||
946 (ST.hasGFX90AInsts() && AMDGPU::SReg_32RegClass.contains(Reg: SrcReg))) {
947 BuildMI(BB&: MBB, I: MI, MIMD: DL, MCID: get(Opcode: AMDGPU::V_ACCVGPR_WRITE_B32_e64), DestReg)
948 .addReg(RegNo: SrcReg, flags: getKillRegState(B: KillSrc));
949 return;
950 }
951
952 if (AMDGPU::AGPR_32RegClass.contains(Reg: SrcReg) && ST.hasGFX90AInsts()) {
953 BuildMI(BB&: MBB, I: MI, MIMD: DL, MCID: get(Opcode: AMDGPU::V_ACCVGPR_MOV_B32), DestReg)
954 .addReg(RegNo: SrcReg, flags: getKillRegState(B: KillSrc));
955 return;
956 }
957
958 // FIXME: Pass should maintain scavenger to avoid scan through the block on
959 // every AGPR spill.
960 RegScavenger RS;
961 const bool Overlap = RI.regsOverlap(RegA: SrcReg, RegB: DestReg);
962 indirectCopyToAGPR(TII: *this, MBB, MI, DL, DestReg, SrcReg, KillSrc, RS, RegsOverlap: Overlap);
963 return;
964 }
965
966 if (Size == 16) {
967 assert(AMDGPU::VGPR_16RegClass.contains(SrcReg) ||
968 AMDGPU::SReg_LO16RegClass.contains(SrcReg) ||
969 AMDGPU::AGPR_LO16RegClass.contains(SrcReg));
970
971 bool IsSGPRDst = AMDGPU::SReg_LO16RegClass.contains(Reg: DestReg);
972 bool IsSGPRSrc = AMDGPU::SReg_LO16RegClass.contains(Reg: SrcReg);
973 bool IsAGPRDst = AMDGPU::AGPR_LO16RegClass.contains(Reg: DestReg);
974 bool IsAGPRSrc = AMDGPU::AGPR_LO16RegClass.contains(Reg: SrcReg);
975 bool DstLow = !AMDGPU::isHi16Reg(Reg: DestReg, MRI: RI);
976 bool SrcLow = !AMDGPU::isHi16Reg(Reg: SrcReg, MRI: RI);
977 MCRegister NewDestReg = RI.get32BitRegister(Reg: DestReg);
978 MCRegister NewSrcReg = RI.get32BitRegister(Reg: SrcReg);
979
980 if (IsSGPRDst) {
981 if (!IsSGPRSrc) {
982 reportIllegalCopy(TII: this, MBB, MI, DL, DestReg, SrcReg, KillSrc);
983 return;
984 }
985
986 BuildMI(BB&: MBB, I: MI, MIMD: DL, MCID: get(Opcode: AMDGPU::S_MOV_B32), DestReg: NewDestReg)
987 .addReg(RegNo: NewSrcReg, flags: getKillRegState(B: KillSrc));
988 return;
989 }
990
991 if (IsAGPRDst || IsAGPRSrc) {
992 if (!DstLow || !SrcLow) {
993 reportIllegalCopy(TII: this, MBB, MI, DL, DestReg, SrcReg, KillSrc,
994 Msg: "Cannot use hi16 subreg with an AGPR!");
995 }
996
997 copyPhysReg(MBB, MI, DL, DestReg: NewDestReg, SrcReg: NewSrcReg, KillSrc);
998 return;
999 }
1000
1001 if (ST.useRealTrue16Insts()) {
1002 if (IsSGPRSrc) {
1003 assert(SrcLow);
1004 SrcReg = NewSrcReg;
1005 }
1006 // Use the smaller instruction encoding if possible.
1007 if (AMDGPU::VGPR_16_Lo128RegClass.contains(Reg: DestReg) &&
1008 (IsSGPRSrc || AMDGPU::VGPR_16_Lo128RegClass.contains(Reg: SrcReg))) {
1009 BuildMI(BB&: MBB, I: MI, MIMD: DL, MCID: get(Opcode: AMDGPU::V_MOV_B16_t16_e32), DestReg)
1010 .addReg(RegNo: SrcReg);
1011 } else {
1012 BuildMI(BB&: MBB, I: MI, MIMD: DL, MCID: get(Opcode: AMDGPU::V_MOV_B16_t16_e64), DestReg)
1013 .addImm(Val: 0) // src0_modifiers
1014 .addReg(RegNo: SrcReg)
1015 .addImm(Val: 0); // op_sel
1016 }
1017 return;
1018 }
1019
1020 if (IsSGPRSrc && !ST.hasSDWAScalar()) {
1021 if (!DstLow || !SrcLow) {
1022 reportIllegalCopy(TII: this, MBB, MI, DL, DestReg, SrcReg, KillSrc,
1023 Msg: "Cannot use hi16 subreg on VI!");
1024 }
1025
1026 BuildMI(BB&: MBB, I: MI, MIMD: DL, MCID: get(Opcode: AMDGPU::V_MOV_B32_e32), DestReg: NewDestReg)
1027 .addReg(RegNo: NewSrcReg, flags: getKillRegState(B: KillSrc));
1028 return;
1029 }
1030
1031 auto MIB = BuildMI(BB&: MBB, I: MI, MIMD: DL, MCID: get(Opcode: AMDGPU::V_MOV_B32_sdwa), DestReg: NewDestReg)
1032 .addImm(Val: 0) // src0_modifiers
1033 .addReg(RegNo: NewSrcReg)
1034 .addImm(Val: 0) // clamp
1035 .addImm(Val: DstLow ? AMDGPU::SDWA::SdwaSel::WORD_0
1036 : AMDGPU::SDWA::SdwaSel::WORD_1)
1037 .addImm(Val: AMDGPU::SDWA::DstUnused::UNUSED_PRESERVE)
1038 .addImm(Val: SrcLow ? AMDGPU::SDWA::SdwaSel::WORD_0
1039 : AMDGPU::SDWA::SdwaSel::WORD_1)
1040 .addReg(RegNo: NewDestReg, flags: RegState::Implicit | RegState::Undef);
1041 // First implicit operand is $exec.
1042 MIB->tieOperands(DefIdx: 0, UseIdx: MIB->getNumOperands() - 1);
1043 return;
1044 }
1045
1046 if (RC == RI.getVGPR64Class() && (SrcRC == RC || RI.isSGPRClass(RC: SrcRC))) {
1047 if (ST.hasMovB64()) {
1048 BuildMI(BB&: MBB, I: MI, MIMD: DL, MCID: get(Opcode: AMDGPU::V_MOV_B64_e32), DestReg)
1049 .addReg(RegNo: SrcReg, flags: getKillRegState(B: KillSrc));
1050 return;
1051 }
1052 if (ST.hasPkMovB32()) {
1053 BuildMI(BB&: MBB, I: MI, MIMD: DL, MCID: get(Opcode: AMDGPU::V_PK_MOV_B32), DestReg)
1054 .addImm(Val: SISrcMods::OP_SEL_1)
1055 .addReg(RegNo: SrcReg)
1056 .addImm(Val: SISrcMods::OP_SEL_0 | SISrcMods::OP_SEL_1)
1057 .addReg(RegNo: SrcReg)
1058 .addImm(Val: 0) // op_sel_lo
1059 .addImm(Val: 0) // op_sel_hi
1060 .addImm(Val: 0) // neg_lo
1061 .addImm(Val: 0) // neg_hi
1062 .addImm(Val: 0) // clamp
1063 .addReg(RegNo: SrcReg, flags: getKillRegState(B: KillSrc) | RegState::Implicit);
1064 return;
1065 }
1066 }
1067
1068 const bool Forward = RI.getHWRegIndex(Reg: DestReg) <= RI.getHWRegIndex(Reg: SrcReg);
1069 if (RI.isSGPRClass(RC)) {
1070 if (!RI.isSGPRClass(RC: SrcRC)) {
1071 reportIllegalCopy(TII: this, MBB, MI, DL, DestReg, SrcReg, KillSrc);
1072 return;
1073 }
1074 const bool CanKillSuperReg = KillSrc && !RI.regsOverlap(RegA: SrcReg, RegB: DestReg);
1075 expandSGPRCopy(TII: *this, MBB, MI, DL, DestReg, SrcReg, KillSrc: CanKillSuperReg, RC,
1076 Forward);
1077 return;
1078 }
1079
1080 unsigned EltSize = 4;
1081 unsigned Opcode = AMDGPU::V_MOV_B32_e32;
1082 if (RI.isAGPRClass(RC)) {
1083 if (ST.hasGFX90AInsts() && RI.isAGPRClass(RC: SrcRC))
1084 Opcode = AMDGPU::V_ACCVGPR_MOV_B32;
1085 else if (RI.hasVGPRs(RC: SrcRC) ||
1086 (ST.hasGFX90AInsts() && RI.isSGPRClass(RC: SrcRC)))
1087 Opcode = AMDGPU::V_ACCVGPR_WRITE_B32_e64;
1088 else
1089 Opcode = AMDGPU::INSTRUCTION_LIST_END;
1090 } else if (RI.hasVGPRs(RC) && RI.isAGPRClass(RC: SrcRC)) {
1091 Opcode = AMDGPU::V_ACCVGPR_READ_B32_e64;
1092 } else if ((Size % 64 == 0) && RI.hasVGPRs(RC) &&
1093 (RI.isProperlyAlignedRC(RC: *RC) &&
1094 (SrcRC == RC || RI.isSGPRClass(RC: SrcRC)))) {
1095 // TODO: In 96-bit case, could do a 64-bit mov and then a 32-bit mov.
1096 if (ST.hasMovB64()) {
1097 Opcode = AMDGPU::V_MOV_B64_e32;
1098 EltSize = 8;
1099 } else if (ST.hasPkMovB32()) {
1100 Opcode = AMDGPU::V_PK_MOV_B32;
1101 EltSize = 8;
1102 }
1103 }
1104
1105 // For the cases where we need an intermediate instruction/temporary register
1106 // (destination is an AGPR), we need a scavenger.
1107 //
1108 // FIXME: The pass should maintain this for us so we don't have to re-scan the
1109 // whole block for every handled copy.
1110 std::unique_ptr<RegScavenger> RS;
1111 if (Opcode == AMDGPU::INSTRUCTION_LIST_END)
1112 RS = std::make_unique<RegScavenger>();
1113
1114 ArrayRef<int16_t> SubIndices = RI.getRegSplitParts(RC, EltSize);
1115
1116 // If there is an overlap, we can't kill the super-register on the last
1117 // instruction, since it will also kill the components made live by this def.
1118 const bool Overlap = RI.regsOverlap(RegA: SrcReg, RegB: DestReg);
1119 const bool CanKillSuperReg = KillSrc && !Overlap;
1120
1121 for (unsigned Idx = 0; Idx < SubIndices.size(); ++Idx) {
1122 unsigned SubIdx;
1123 if (Forward)
1124 SubIdx = SubIndices[Idx];
1125 else
1126 SubIdx = SubIndices[SubIndices.size() - Idx - 1];
1127 Register DestSubReg = RI.getSubReg(Reg: DestReg, Idx: SubIdx);
1128 Register SrcSubReg = RI.getSubReg(Reg: SrcReg, Idx: SubIdx);
1129 assert(DestSubReg && SrcSubReg && "Failed to find subregs!");
1130
1131 bool IsFirstSubreg = Idx == 0;
1132 bool UseKill = CanKillSuperReg && Idx == SubIndices.size() - 1;
1133
1134 if (Opcode == AMDGPU::INSTRUCTION_LIST_END) {
1135 Register ImpDefSuper = IsFirstSubreg ? Register(DestReg) : Register();
1136 Register ImpUseSuper = SrcReg;
1137 indirectCopyToAGPR(TII: *this, MBB, MI, DL, DestReg: DestSubReg, SrcReg: SrcSubReg, KillSrc: UseKill,
1138 RS&: *RS, RegsOverlap: Overlap, ImpDefSuperReg: ImpDefSuper, ImpUseSuperReg: ImpUseSuper);
1139 } else if (Opcode == AMDGPU::V_PK_MOV_B32) {
1140 MachineInstrBuilder MIB =
1141 BuildMI(BB&: MBB, I: MI, MIMD: DL, MCID: get(Opcode: AMDGPU::V_PK_MOV_B32), DestReg: DestSubReg)
1142 .addImm(Val: SISrcMods::OP_SEL_1)
1143 .addReg(RegNo: SrcSubReg)
1144 .addImm(Val: SISrcMods::OP_SEL_0 | SISrcMods::OP_SEL_1)
1145 .addReg(RegNo: SrcSubReg)
1146 .addImm(Val: 0) // op_sel_lo
1147 .addImm(Val: 0) // op_sel_hi
1148 .addImm(Val: 0) // neg_lo
1149 .addImm(Val: 0) // neg_hi
1150 .addImm(Val: 0) // clamp
1151 .addReg(RegNo: SrcReg, flags: getKillRegState(B: UseKill) | RegState::Implicit);
1152 if (IsFirstSubreg)
1153 MIB.addReg(RegNo: DestReg, flags: RegState::Define | RegState::Implicit);
1154 } else {
1155 MachineInstrBuilder Builder =
1156 BuildMI(BB&: MBB, I: MI, MIMD: DL, MCID: get(Opcode), DestReg: DestSubReg).addReg(RegNo: SrcSubReg);
1157 if (IsFirstSubreg)
1158 Builder.addReg(RegNo: DestReg, flags: RegState::Define | RegState::Implicit);
1159
1160 Builder.addReg(RegNo: SrcReg, flags: getKillRegState(B: UseKill) | RegState::Implicit);
1161 }
1162 }
1163}
1164
1165int SIInstrInfo::commuteOpcode(unsigned Opcode) const {
1166 int NewOpc;
1167
1168 // Try to map original to commuted opcode
1169 NewOpc = AMDGPU::getCommuteRev(Opcode);
1170 if (NewOpc != -1)
1171 // Check if the commuted (REV) opcode exists on the target.
1172 return pseudoToMCOpcode(Opcode: NewOpc) != -1 ? NewOpc : -1;
1173
1174 // Try to map commuted to original opcode
1175 NewOpc = AMDGPU::getCommuteOrig(Opcode);
1176 if (NewOpc != -1)
1177 // Check if the original (non-REV) opcode exists on the target.
1178 return pseudoToMCOpcode(Opcode: NewOpc) != -1 ? NewOpc : -1;
1179
1180 return Opcode;
1181}
1182
1183const TargetRegisterClass *
1184SIInstrInfo::getPreferredSelectRegClass(unsigned Size) const {
1185 return &AMDGPU::VGPR_32RegClass;
1186}
1187
1188void SIInstrInfo::insertVectorSelect(MachineBasicBlock &MBB,
1189 MachineBasicBlock::iterator I,
1190 const DebugLoc &DL, Register DstReg,
1191 ArrayRef<MachineOperand> Cond,
1192 Register TrueReg,
1193 Register FalseReg) const {
1194 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
1195 const TargetRegisterClass *BoolXExecRC = RI.getWaveMaskRegClass();
1196 assert(MRI.getRegClass(DstReg) == &AMDGPU::VGPR_32RegClass &&
1197 "Not a VGPR32 reg");
1198
1199 if (Cond.size() == 1) {
1200 Register SReg = MRI.createVirtualRegister(RegClass: BoolXExecRC);
1201 BuildMI(BB&: MBB, I, MIMD: DL, MCID: get(Opcode: AMDGPU::COPY), DestReg: SReg)
1202 .add(MO: Cond[0]);
1203 BuildMI(BB&: MBB, I, MIMD: DL, MCID: get(Opcode: AMDGPU::V_CNDMASK_B32_e64), DestReg: DstReg)
1204 .addImm(Val: 0)
1205 .addReg(RegNo: FalseReg)
1206 .addImm(Val: 0)
1207 .addReg(RegNo: TrueReg)
1208 .addReg(RegNo: SReg);
1209 } else if (Cond.size() == 2) {
1210 assert(Cond[0].isImm() && "Cond[0] is not an immediate");
1211 switch (Cond[0].getImm()) {
1212 case SIInstrInfo::SCC_TRUE: {
1213 Register SReg = MRI.createVirtualRegister(RegClass: BoolXExecRC);
1214 BuildMI(BB&: MBB, I, MIMD: DL, MCID: get(Opcode: ST.isWave32() ? AMDGPU::S_CSELECT_B32
1215 : AMDGPU::S_CSELECT_B64), DestReg: SReg)
1216 .addImm(Val: 1)
1217 .addImm(Val: 0);
1218 BuildMI(BB&: MBB, I, MIMD: DL, MCID: get(Opcode: AMDGPU::V_CNDMASK_B32_e64), DestReg: DstReg)
1219 .addImm(Val: 0)
1220 .addReg(RegNo: FalseReg)
1221 .addImm(Val: 0)
1222 .addReg(RegNo: TrueReg)
1223 .addReg(RegNo: SReg);
1224 break;
1225 }
1226 case SIInstrInfo::SCC_FALSE: {
1227 Register SReg = MRI.createVirtualRegister(RegClass: BoolXExecRC);
1228 BuildMI(BB&: MBB, I, MIMD: DL, MCID: get(Opcode: ST.isWave32() ? AMDGPU::S_CSELECT_B32
1229 : AMDGPU::S_CSELECT_B64), DestReg: SReg)
1230 .addImm(Val: 0)
1231 .addImm(Val: 1);
1232 BuildMI(BB&: MBB, I, MIMD: DL, MCID: get(Opcode: AMDGPU::V_CNDMASK_B32_e64), DestReg: DstReg)
1233 .addImm(Val: 0)
1234 .addReg(RegNo: FalseReg)
1235 .addImm(Val: 0)
1236 .addReg(RegNo: TrueReg)
1237 .addReg(RegNo: SReg);
1238 break;
1239 }
1240 case SIInstrInfo::VCCNZ: {
1241 MachineOperand RegOp = Cond[1];
1242 RegOp.setImplicit(false);
1243 Register SReg = MRI.createVirtualRegister(RegClass: BoolXExecRC);
1244 BuildMI(BB&: MBB, I, MIMD: DL, MCID: get(Opcode: AMDGPU::COPY), DestReg: SReg)
1245 .add(MO: RegOp);
1246 BuildMI(BB&: MBB, I, MIMD: DL, MCID: get(Opcode: AMDGPU::V_CNDMASK_B32_e64), DestReg: DstReg)
1247 .addImm(Val: 0)
1248 .addReg(RegNo: FalseReg)
1249 .addImm(Val: 0)
1250 .addReg(RegNo: TrueReg)
1251 .addReg(RegNo: SReg);
1252 break;
1253 }
1254 case SIInstrInfo::VCCZ: {
1255 MachineOperand RegOp = Cond[1];
1256 RegOp.setImplicit(false);
1257 Register SReg = MRI.createVirtualRegister(RegClass: BoolXExecRC);
1258 BuildMI(BB&: MBB, I, MIMD: DL, MCID: get(Opcode: AMDGPU::COPY), DestReg: SReg)
1259 .add(MO: RegOp);
1260 BuildMI(BB&: MBB, I, MIMD: DL, MCID: get(Opcode: AMDGPU::V_CNDMASK_B32_e64), DestReg: DstReg)
1261 .addImm(Val: 0)
1262 .addReg(RegNo: TrueReg)
1263 .addImm(Val: 0)
1264 .addReg(RegNo: FalseReg)
1265 .addReg(RegNo: SReg);
1266 break;
1267 }
1268 case SIInstrInfo::EXECNZ: {
1269 Register SReg = MRI.createVirtualRegister(RegClass: BoolXExecRC);
1270 Register SReg2 = MRI.createVirtualRegister(RegClass: RI.getBoolRC());
1271 BuildMI(BB&: MBB, I, MIMD: DL, MCID: get(Opcode: ST.isWave32() ? AMDGPU::S_OR_SAVEEXEC_B32
1272 : AMDGPU::S_OR_SAVEEXEC_B64), DestReg: SReg2)
1273 .addImm(Val: 0);
1274 BuildMI(BB&: MBB, I, MIMD: DL, MCID: get(Opcode: ST.isWave32() ? AMDGPU::S_CSELECT_B32
1275 : AMDGPU::S_CSELECT_B64), DestReg: SReg)
1276 .addImm(Val: 1)
1277 .addImm(Val: 0);
1278 BuildMI(BB&: MBB, I, MIMD: DL, MCID: get(Opcode: AMDGPU::V_CNDMASK_B32_e64), DestReg: DstReg)
1279 .addImm(Val: 0)
1280 .addReg(RegNo: FalseReg)
1281 .addImm(Val: 0)
1282 .addReg(RegNo: TrueReg)
1283 .addReg(RegNo: SReg);
1284 break;
1285 }
1286 case SIInstrInfo::EXECZ: {
1287 Register SReg = MRI.createVirtualRegister(RegClass: BoolXExecRC);
1288 Register SReg2 = MRI.createVirtualRegister(RegClass: RI.getBoolRC());
1289 BuildMI(BB&: MBB, I, MIMD: DL, MCID: get(Opcode: ST.isWave32() ? AMDGPU::S_OR_SAVEEXEC_B32
1290 : AMDGPU::S_OR_SAVEEXEC_B64), DestReg: SReg2)
1291 .addImm(Val: 0);
1292 BuildMI(BB&: MBB, I, MIMD: DL, MCID: get(Opcode: ST.isWave32() ? AMDGPU::S_CSELECT_B32
1293 : AMDGPU::S_CSELECT_B64), DestReg: SReg)
1294 .addImm(Val: 0)
1295 .addImm(Val: 1);
1296 BuildMI(BB&: MBB, I, MIMD: DL, MCID: get(Opcode: AMDGPU::V_CNDMASK_B32_e64), DestReg: DstReg)
1297 .addImm(Val: 0)
1298 .addReg(RegNo: FalseReg)
1299 .addImm(Val: 0)
1300 .addReg(RegNo: TrueReg)
1301 .addReg(RegNo: SReg);
1302 llvm_unreachable("Unhandled branch predicate EXECZ");
1303 break;
1304 }
1305 default:
1306 llvm_unreachable("invalid branch predicate");
1307 }
1308 } else {
1309 llvm_unreachable("Can only handle Cond size 1 or 2");
1310 }
1311}
1312
1313Register SIInstrInfo::insertEQ(MachineBasicBlock *MBB,
1314 MachineBasicBlock::iterator I,
1315 const DebugLoc &DL,
1316 Register SrcReg, int Value) const {
1317 MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
1318 Register Reg = MRI.createVirtualRegister(RegClass: RI.getBoolRC());
1319 BuildMI(BB&: *MBB, I, MIMD: DL, MCID: get(Opcode: AMDGPU::V_CMP_EQ_I32_e64), DestReg: Reg)
1320 .addImm(Val: Value)
1321 .addReg(RegNo: SrcReg);
1322
1323 return Reg;
1324}
1325
1326Register SIInstrInfo::insertNE(MachineBasicBlock *MBB,
1327 MachineBasicBlock::iterator I,
1328 const DebugLoc &DL,
1329 Register SrcReg, int Value) const {
1330 MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
1331 Register Reg = MRI.createVirtualRegister(RegClass: RI.getBoolRC());
1332 BuildMI(BB&: *MBB, I, MIMD: DL, MCID: get(Opcode: AMDGPU::V_CMP_NE_I32_e64), DestReg: Reg)
1333 .addImm(Val: Value)
1334 .addReg(RegNo: SrcReg);
1335
1336 return Reg;
1337}
1338
1339bool SIInstrInfo::getConstValDefinedInReg(const MachineInstr &MI,
1340 const Register Reg,
1341 int64_t &ImmVal) const {
1342 switch (MI.getOpcode()) {
1343 case AMDGPU::V_MOV_B32_e32:
1344 case AMDGPU::S_MOV_B32:
1345 case AMDGPU::S_MOVK_I32:
1346 case AMDGPU::S_MOV_B64:
1347 case AMDGPU::V_MOV_B64_e32:
1348 case AMDGPU::V_ACCVGPR_WRITE_B32_e64:
1349 case AMDGPU::AV_MOV_B32_IMM_PSEUDO:
1350 case AMDGPU::S_MOV_B64_IMM_PSEUDO:
1351 case AMDGPU::V_MOV_B64_PSEUDO: {
1352 const MachineOperand &Src0 = MI.getOperand(i: 1);
1353 if (Src0.isImm()) {
1354 ImmVal = Src0.getImm();
1355 return MI.getOperand(i: 0).getReg() == Reg;
1356 }
1357
1358 return false;
1359 }
1360 case AMDGPU::S_BREV_B32:
1361 case AMDGPU::V_BFREV_B32_e32:
1362 case AMDGPU::V_BFREV_B32_e64: {
1363 const MachineOperand &Src0 = MI.getOperand(i: 1);
1364 if (Src0.isImm()) {
1365 ImmVal = static_cast<int64_t>(reverseBits<int32_t>(Val: Src0.getImm()));
1366 return MI.getOperand(i: 0).getReg() == Reg;
1367 }
1368
1369 return false;
1370 }
1371 case AMDGPU::S_NOT_B32:
1372 case AMDGPU::V_NOT_B32_e32:
1373 case AMDGPU::V_NOT_B32_e64: {
1374 const MachineOperand &Src0 = MI.getOperand(i: 1);
1375 if (Src0.isImm()) {
1376 ImmVal = static_cast<int64_t>(~static_cast<int32_t>(Src0.getImm()));
1377 return MI.getOperand(i: 0).getReg() == Reg;
1378 }
1379
1380 return false;
1381 }
1382 default:
1383 return false;
1384 }
1385}
1386
1387unsigned SIInstrInfo::getMovOpcode(const TargetRegisterClass *DstRC) const {
1388
1389 if (RI.isAGPRClass(RC: DstRC))
1390 return AMDGPU::COPY;
1391 if (RI.getRegSizeInBits(RC: *DstRC) == 16) {
1392 // Assume hi bits are unneeded. Only _e64 true16 instructions are legal
1393 // before RA.
1394 return RI.isSGPRClass(RC: DstRC) ? AMDGPU::COPY : AMDGPU::V_MOV_B16_t16_e64;
1395 }
1396 if (RI.getRegSizeInBits(RC: *DstRC) == 32)
1397 return RI.isSGPRClass(RC: DstRC) ? AMDGPU::S_MOV_B32 : AMDGPU::V_MOV_B32_e32;
1398 if (RI.getRegSizeInBits(RC: *DstRC) == 64 && RI.isSGPRClass(RC: DstRC))
1399 return AMDGPU::S_MOV_B64;
1400 if (RI.getRegSizeInBits(RC: *DstRC) == 64 && !RI.isSGPRClass(RC: DstRC))
1401 return AMDGPU::V_MOV_B64_PSEUDO;
1402 return AMDGPU::COPY;
1403}
1404
1405const MCInstrDesc &
1406SIInstrInfo::getIndirectGPRIDXPseudo(unsigned VecSize,
1407 bool IsIndirectSrc) const {
1408 if (IsIndirectSrc) {
1409 if (VecSize <= 32) // 4 bytes
1410 return get(Opcode: AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V1);
1411 if (VecSize <= 64) // 8 bytes
1412 return get(Opcode: AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V2);
1413 if (VecSize <= 96) // 12 bytes
1414 return get(Opcode: AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V3);
1415 if (VecSize <= 128) // 16 bytes
1416 return get(Opcode: AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V4);
1417 if (VecSize <= 160) // 20 bytes
1418 return get(Opcode: AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V5);
1419 if (VecSize <= 256) // 32 bytes
1420 return get(Opcode: AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V8);
1421 if (VecSize <= 288) // 36 bytes
1422 return get(Opcode: AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V9);
1423 if (VecSize <= 320) // 40 bytes
1424 return get(Opcode: AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V10);
1425 if (VecSize <= 352) // 44 bytes
1426 return get(Opcode: AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V11);
1427 if (VecSize <= 384) // 48 bytes
1428 return get(Opcode: AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V12);
1429 if (VecSize <= 512) // 64 bytes
1430 return get(Opcode: AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V16);
1431 if (VecSize <= 1024) // 128 bytes
1432 return get(Opcode: AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V32);
1433
1434 llvm_unreachable("unsupported size for IndirectRegReadGPRIDX pseudos");
1435 }
1436
1437 if (VecSize <= 32) // 4 bytes
1438 return get(Opcode: AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V1);
1439 if (VecSize <= 64) // 8 bytes
1440 return get(Opcode: AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V2);
1441 if (VecSize <= 96) // 12 bytes
1442 return get(Opcode: AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V3);
1443 if (VecSize <= 128) // 16 bytes
1444 return get(Opcode: AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V4);
1445 if (VecSize <= 160) // 20 bytes
1446 return get(Opcode: AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V5);
1447 if (VecSize <= 256) // 32 bytes
1448 return get(Opcode: AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V8);
1449 if (VecSize <= 288) // 36 bytes
1450 return get(Opcode: AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V9);
1451 if (VecSize <= 320) // 40 bytes
1452 return get(Opcode: AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V10);
1453 if (VecSize <= 352) // 44 bytes
1454 return get(Opcode: AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V11);
1455 if (VecSize <= 384) // 48 bytes
1456 return get(Opcode: AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V12);
1457 if (VecSize <= 512) // 64 bytes
1458 return get(Opcode: AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V16);
1459 if (VecSize <= 1024) // 128 bytes
1460 return get(Opcode: AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V32);
1461
1462 llvm_unreachable("unsupported size for IndirectRegWriteGPRIDX pseudos");
1463}
1464
1465static unsigned getIndirectVGPRWriteMovRelPseudoOpc(unsigned VecSize) {
1466 if (VecSize <= 32) // 4 bytes
1467 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V1;
1468 if (VecSize <= 64) // 8 bytes
1469 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V2;
1470 if (VecSize <= 96) // 12 bytes
1471 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V3;
1472 if (VecSize <= 128) // 16 bytes
1473 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V4;
1474 if (VecSize <= 160) // 20 bytes
1475 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V5;
1476 if (VecSize <= 256) // 32 bytes
1477 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V8;
1478 if (VecSize <= 288) // 36 bytes
1479 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V9;
1480 if (VecSize <= 320) // 40 bytes
1481 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V10;
1482 if (VecSize <= 352) // 44 bytes
1483 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V11;
1484 if (VecSize <= 384) // 48 bytes
1485 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V12;
1486 if (VecSize <= 512) // 64 bytes
1487 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V16;
1488 if (VecSize <= 1024) // 128 bytes
1489 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V32;
1490
1491 llvm_unreachable("unsupported size for IndirectRegWrite pseudos");
1492}
1493
1494static unsigned getIndirectSGPRWriteMovRelPseudo32(unsigned VecSize) {
1495 if (VecSize <= 32) // 4 bytes
1496 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V1;
1497 if (VecSize <= 64) // 8 bytes
1498 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V2;
1499 if (VecSize <= 96) // 12 bytes
1500 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V3;
1501 if (VecSize <= 128) // 16 bytes
1502 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V4;
1503 if (VecSize <= 160) // 20 bytes
1504 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V5;
1505 if (VecSize <= 256) // 32 bytes
1506 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V8;
1507 if (VecSize <= 288) // 36 bytes
1508 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V9;
1509 if (VecSize <= 320) // 40 bytes
1510 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V10;
1511 if (VecSize <= 352) // 44 bytes
1512 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V11;
1513 if (VecSize <= 384) // 48 bytes
1514 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V12;
1515 if (VecSize <= 512) // 64 bytes
1516 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V16;
1517 if (VecSize <= 1024) // 128 bytes
1518 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V32;
1519
1520 llvm_unreachable("unsupported size for IndirectRegWrite pseudos");
1521}
1522
1523static unsigned getIndirectSGPRWriteMovRelPseudo64(unsigned VecSize) {
1524 if (VecSize <= 64) // 8 bytes
1525 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V1;
1526 if (VecSize <= 128) // 16 bytes
1527 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V2;
1528 if (VecSize <= 256) // 32 bytes
1529 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V4;
1530 if (VecSize <= 512) // 64 bytes
1531 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V8;
1532 if (VecSize <= 1024) // 128 bytes
1533 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V16;
1534
1535 llvm_unreachable("unsupported size for IndirectRegWrite pseudos");
1536}
1537
1538const MCInstrDesc &
1539SIInstrInfo::getIndirectRegWriteMovRelPseudo(unsigned VecSize, unsigned EltSize,
1540 bool IsSGPR) const {
1541 if (IsSGPR) {
1542 switch (EltSize) {
1543 case 32:
1544 return get(Opcode: getIndirectSGPRWriteMovRelPseudo32(VecSize));
1545 case 64:
1546 return get(Opcode: getIndirectSGPRWriteMovRelPseudo64(VecSize));
1547 default:
1548 llvm_unreachable("invalid reg indexing elt size");
1549 }
1550 }
1551
1552 assert(EltSize == 32 && "invalid reg indexing elt size");
1553 return get(Opcode: getIndirectVGPRWriteMovRelPseudoOpc(VecSize));
1554}
1555
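// The spill helpers below select a spill pseudo from the spill size in bytes.
// Only the sizes of legal register tuples appear; any other size is a bug in
// the caller and hits the unreachable.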
1556static unsigned getSGPRSpillSaveOpcode(unsigned Size) {
1557 switch (Size) {
1558 case 4:
1559 return AMDGPU::SI_SPILL_S32_SAVE;
1560 case 8:
1561 return AMDGPU::SI_SPILL_S64_SAVE;
1562 case 12:
1563 return AMDGPU::SI_SPILL_S96_SAVE;
1564 case 16:
1565 return AMDGPU::SI_SPILL_S128_SAVE;
1566 case 20:
1567 return AMDGPU::SI_SPILL_S160_SAVE;
1568 case 24:
1569 return AMDGPU::SI_SPILL_S192_SAVE;
1570 case 28:
1571 return AMDGPU::SI_SPILL_S224_SAVE;
1572 case 32:
1573 return AMDGPU::SI_SPILL_S256_SAVE;
1574 case 36:
1575 return AMDGPU::SI_SPILL_S288_SAVE;
1576 case 40:
1577 return AMDGPU::SI_SPILL_S320_SAVE;
1578 case 44:
1579 return AMDGPU::SI_SPILL_S352_SAVE;
1580 case 48:
1581 return AMDGPU::SI_SPILL_S384_SAVE;
1582 case 64:
1583 return AMDGPU::SI_SPILL_S512_SAVE;
1584 case 128:
1585 return AMDGPU::SI_SPILL_S1024_SAVE;
1586 default:
1587 llvm_unreachable("unknown register size");
1588 }
1589}
1590
1591static unsigned getVGPRSpillSaveOpcode(unsigned Size) {
1592 switch (Size) {
1593 case 2:
1594 return AMDGPU::SI_SPILL_V16_SAVE;
1595 case 4:
1596 return AMDGPU::SI_SPILL_V32_SAVE;
1597 case 8:
1598 return AMDGPU::SI_SPILL_V64_SAVE;
1599 case 12:
1600 return AMDGPU::SI_SPILL_V96_SAVE;
1601 case 16:
1602 return AMDGPU::SI_SPILL_V128_SAVE;
1603 case 20:
1604 return AMDGPU::SI_SPILL_V160_SAVE;
1605 case 24:
1606 return AMDGPU::SI_SPILL_V192_SAVE;
1607 case 28:
1608 return AMDGPU::SI_SPILL_V224_SAVE;
1609 case 32:
1610 return AMDGPU::SI_SPILL_V256_SAVE;
1611 case 36:
1612 return AMDGPU::SI_SPILL_V288_SAVE;
1613 case 40:
1614 return AMDGPU::SI_SPILL_V320_SAVE;
1615 case 44:
1616 return AMDGPU::SI_SPILL_V352_SAVE;
1617 case 48:
1618 return AMDGPU::SI_SPILL_V384_SAVE;
1619 case 64:
1620 return AMDGPU::SI_SPILL_V512_SAVE;
1621 case 128:
1622 return AMDGPU::SI_SPILL_V1024_SAVE;
1623 default:
1624 llvm_unreachable("unknown register size");
1625 }
1626}
1627
1628static unsigned getAGPRSpillSaveOpcode(unsigned Size) {
1629 switch (Size) {
1630 case 4:
1631 return AMDGPU::SI_SPILL_A32_SAVE;
1632 case 8:
1633 return AMDGPU::SI_SPILL_A64_SAVE;
1634 case 12:
1635 return AMDGPU::SI_SPILL_A96_SAVE;
1636 case 16:
1637 return AMDGPU::SI_SPILL_A128_SAVE;
1638 case 20:
1639 return AMDGPU::SI_SPILL_A160_SAVE;
1640 case 24:
1641 return AMDGPU::SI_SPILL_A192_SAVE;
1642 case 28:
1643 return AMDGPU::SI_SPILL_A224_SAVE;
1644 case 32:
1645 return AMDGPU::SI_SPILL_A256_SAVE;
1646 case 36:
1647 return AMDGPU::SI_SPILL_A288_SAVE;
1648 case 40:
1649 return AMDGPU::SI_SPILL_A320_SAVE;
1650 case 44:
1651 return AMDGPU::SI_SPILL_A352_SAVE;
1652 case 48:
1653 return AMDGPU::SI_SPILL_A384_SAVE;
1654 case 64:
1655 return AMDGPU::SI_SPILL_A512_SAVE;
1656 case 128:
1657 return AMDGPU::SI_SPILL_A1024_SAVE;
1658 default:
1659 llvm_unreachable("unknown register size");
1660 }
1661}
1662
1663static unsigned getAVSpillSaveOpcode(unsigned Size) {
1664 switch (Size) {
1665 case 4:
1666 return AMDGPU::SI_SPILL_AV32_SAVE;
1667 case 8:
1668 return AMDGPU::SI_SPILL_AV64_SAVE;
1669 case 12:
1670 return AMDGPU::SI_SPILL_AV96_SAVE;
1671 case 16:
1672 return AMDGPU::SI_SPILL_AV128_SAVE;
1673 case 20:
1674 return AMDGPU::SI_SPILL_AV160_SAVE;
1675 case 24:
1676 return AMDGPU::SI_SPILL_AV192_SAVE;
1677 case 28:
1678 return AMDGPU::SI_SPILL_AV224_SAVE;
1679 case 32:
1680 return AMDGPU::SI_SPILL_AV256_SAVE;
1681 case 36:
1682 return AMDGPU::SI_SPILL_AV288_SAVE;
1683 case 40:
1684 return AMDGPU::SI_SPILL_AV320_SAVE;
1685 case 44:
1686 return AMDGPU::SI_SPILL_AV352_SAVE;
1687 case 48:
1688 return AMDGPU::SI_SPILL_AV384_SAVE;
1689 case 64:
1690 return AMDGPU::SI_SPILL_AV512_SAVE;
1691 case 128:
1692 return AMDGPU::SI_SPILL_AV1024_SAVE;
1693 default:
1694 llvm_unreachable("unknown register size");
1695 }
1696}
1697
1698static unsigned getWWMRegSpillSaveOpcode(unsigned Size,
1699 bool IsVectorSuperClass) {
1700 // Currently, only 32-bit WWM register spills are needed.
1701 if (Size != 4)
1702 llvm_unreachable("unknown wwm register spill size");
1703
1704 if (IsVectorSuperClass)
1705 return AMDGPU::SI_SPILL_WWM_AV32_SAVE;
1706
1707 return AMDGPU::SI_SPILL_WWM_V32_SAVE;
1708}
1709
1710static unsigned getVectorRegSpillSaveOpcode(Register Reg,
1711 const TargetRegisterClass *RC,
1712 unsigned Size,
1713 const SIRegisterInfo &TRI,
1714 const SIMachineFunctionInfo &MFI) {
1715 bool IsVectorSuperClass = TRI.isVectorSuperClass(RC);
1716
1717 // Choose the right opcode if spilling a WWM register.
1718 if (MFI.checkFlag(Reg, Flag: AMDGPU::VirtRegFlag::WWM_REG))
1719 return getWWMRegSpillSaveOpcode(Size, IsVectorSuperClass);
1720
1721 if (IsVectorSuperClass)
1722 return getAVSpillSaveOpcode(Size);
1723
1724 return TRI.isAGPRClass(RC) ? getAGPRSpillSaveOpcode(Size)
1725 : getVGPRSpillSaveOpcode(Size);
1726}
1727
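// Note that SGPR spills are emitted as dedicated pseudos that may later be
// lowered to VGPR lanes (see the SGPRSpill stack ID below), while VGPR/AGPR
// spills expand to scratch memory accesses addressed through the stack
// pointer offset register.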
1728void SIInstrInfo::storeRegToStackSlot(
1729 MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, Register SrcReg,
1730 bool isKill, int FrameIndex, const TargetRegisterClass *RC,
1731 const TargetRegisterInfo *TRI, Register VReg,
1732 MachineInstr::MIFlag Flags) const {
1733 MachineFunction *MF = MBB.getParent();
1734 SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
1735 MachineFrameInfo &FrameInfo = MF->getFrameInfo();
1736 const DebugLoc &DL = MBB.findDebugLoc(MBBI: MI);
1737
1738 MachinePointerInfo PtrInfo
1739 = MachinePointerInfo::getFixedStack(MF&: *MF, FI: FrameIndex);
1740 MachineMemOperand *MMO = MF->getMachineMemOperand(
1741 PtrInfo, F: MachineMemOperand::MOStore, Size: FrameInfo.getObjectSize(ObjectIdx: FrameIndex),
1742 BaseAlignment: FrameInfo.getObjectAlign(ObjectIdx: FrameIndex));
1743 unsigned SpillSize = TRI->getSpillSize(RC: *RC);
1744
1745 MachineRegisterInfo &MRI = MF->getRegInfo();
1746 if (RI.isSGPRClass(RC)) {
1747 MFI->setHasSpilledSGPRs();
1748 assert(SrcReg != AMDGPU::M0 && "m0 should not be spilled");
1749 assert(SrcReg != AMDGPU::EXEC_LO && SrcReg != AMDGPU::EXEC_HI &&
1750 SrcReg != AMDGPU::EXEC && "exec should not be spilled");
1751
1752 // We are only allowed to create one new instruction when spilling
1753 // registers, so we need to use a pseudo instruction for spilling SGPRs.
1754 const MCInstrDesc &OpDesc = get(Opcode: getSGPRSpillSaveOpcode(Size: SpillSize));
1755
1756 // The SGPR spill/restore instructions only work on numbered SGPRs (not M0
1757 // or EXEC), so we need to make sure we are using the correct register class.
1758 if (SrcReg.isVirtual() && SpillSize == 4) {
1759 MRI.constrainRegClass(Reg: SrcReg, RC: &AMDGPU::SReg_32_XM0_XEXECRegClass);
1760 }
1761
1762 BuildMI(BB&: MBB, I: MI, MIMD: DL, MCID: OpDesc)
1763 .addReg(RegNo: SrcReg, flags: getKillRegState(B: isKill)) // data
1764 .addFrameIndex(Idx: FrameIndex) // addr
1765 .addMemOperand(MMO)
1766 .addReg(RegNo: MFI->getStackPtrOffsetReg(), flags: RegState::Implicit);
1767
1768 if (RI.spillSGPRToVGPR())
1769 FrameInfo.setStackID(ObjectIdx: FrameIndex, ID: TargetStackID::SGPRSpill);
1770 return;
1771 }
1772
1773 unsigned Opcode = getVectorRegSpillSaveOpcode(Reg: VReg ? VReg : SrcReg, RC,
1774 Size: SpillSize, TRI: RI, MFI: *MFI);
1775 MFI->setHasSpilledVGPRs();
1776
1777 BuildMI(BB&: MBB, I: MI, MIMD: DL, MCID: get(Opcode))
1778 .addReg(RegNo: SrcReg, flags: getKillRegState(B: isKill)) // data
1779 .addFrameIndex(Idx: FrameIndex) // addr
1780 .addReg(RegNo: MFI->getStackPtrOffsetReg()) // scratch_offset
1781 .addImm(Val: 0) // offset
1782 .addMemOperand(MMO);
1783}
1784
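// The restore helpers mirror the save tables above: the same size-in-bytes
// keys select the matching *_RESTORE pseudos.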
1785static unsigned getSGPRSpillRestoreOpcode(unsigned Size) {
1786 switch (Size) {
1787 case 4:
1788 return AMDGPU::SI_SPILL_S32_RESTORE;
1789 case 8:
1790 return AMDGPU::SI_SPILL_S64_RESTORE;
1791 case 12:
1792 return AMDGPU::SI_SPILL_S96_RESTORE;
1793 case 16:
1794 return AMDGPU::SI_SPILL_S128_RESTORE;
1795 case 20:
1796 return AMDGPU::SI_SPILL_S160_RESTORE;
1797 case 24:
1798 return AMDGPU::SI_SPILL_S192_RESTORE;
1799 case 28:
1800 return AMDGPU::SI_SPILL_S224_RESTORE;
1801 case 32:
1802 return AMDGPU::SI_SPILL_S256_RESTORE;
1803 case 36:
1804 return AMDGPU::SI_SPILL_S288_RESTORE;
1805 case 40:
1806 return AMDGPU::SI_SPILL_S320_RESTORE;
1807 case 44:
1808 return AMDGPU::SI_SPILL_S352_RESTORE;
1809 case 48:
1810 return AMDGPU::SI_SPILL_S384_RESTORE;
1811 case 64:
1812 return AMDGPU::SI_SPILL_S512_RESTORE;
1813 case 128:
1814 return AMDGPU::SI_SPILL_S1024_RESTORE;
1815 default:
1816 llvm_unreachable("unknown register size");
1817 }
1818}
1819
1820static unsigned getVGPRSpillRestoreOpcode(unsigned Size) {
1821 switch (Size) {
1822 case 2:
1823 return AMDGPU::SI_SPILL_V16_RESTORE;
1824 case 4:
1825 return AMDGPU::SI_SPILL_V32_RESTORE;
1826 case 8:
1827 return AMDGPU::SI_SPILL_V64_RESTORE;
1828 case 12:
1829 return AMDGPU::SI_SPILL_V96_RESTORE;
1830 case 16:
1831 return AMDGPU::SI_SPILL_V128_RESTORE;
1832 case 20:
1833 return AMDGPU::SI_SPILL_V160_RESTORE;
1834 case 24:
1835 return AMDGPU::SI_SPILL_V192_RESTORE;
1836 case 28:
1837 return AMDGPU::SI_SPILL_V224_RESTORE;
1838 case 32:
1839 return AMDGPU::SI_SPILL_V256_RESTORE;
1840 case 36:
1841 return AMDGPU::SI_SPILL_V288_RESTORE;
1842 case 40:
1843 return AMDGPU::SI_SPILL_V320_RESTORE;
1844 case 44:
1845 return AMDGPU::SI_SPILL_V352_RESTORE;
1846 case 48:
1847 return AMDGPU::SI_SPILL_V384_RESTORE;
1848 case 64:
1849 return AMDGPU::SI_SPILL_V512_RESTORE;
1850 case 128:
1851 return AMDGPU::SI_SPILL_V1024_RESTORE;
1852 default:
1853 llvm_unreachable("unknown register size");
1854 }
1855}
1856
1857static unsigned getAGPRSpillRestoreOpcode(unsigned Size) {
1858 switch (Size) {
1859 case 4:
1860 return AMDGPU::SI_SPILL_A32_RESTORE;
1861 case 8:
1862 return AMDGPU::SI_SPILL_A64_RESTORE;
1863 case 12:
1864 return AMDGPU::SI_SPILL_A96_RESTORE;
1865 case 16:
1866 return AMDGPU::SI_SPILL_A128_RESTORE;
1867 case 20:
1868 return AMDGPU::SI_SPILL_A160_RESTORE;
1869 case 24:
1870 return AMDGPU::SI_SPILL_A192_RESTORE;
1871 case 28:
1872 return AMDGPU::SI_SPILL_A224_RESTORE;
1873 case 32:
1874 return AMDGPU::SI_SPILL_A256_RESTORE;
1875 case 36:
1876 return AMDGPU::SI_SPILL_A288_RESTORE;
1877 case 40:
1878 return AMDGPU::SI_SPILL_A320_RESTORE;
1879 case 44:
1880 return AMDGPU::SI_SPILL_A352_RESTORE;
1881 case 48:
1882 return AMDGPU::SI_SPILL_A384_RESTORE;
1883 case 64:
1884 return AMDGPU::SI_SPILL_A512_RESTORE;
1885 case 128:
1886 return AMDGPU::SI_SPILL_A1024_RESTORE;
1887 default:
1888 llvm_unreachable("unknown register size");
1889 }
1890}
1891
1892static unsigned getAVSpillRestoreOpcode(unsigned Size) {
1893 switch (Size) {
1894 case 4:
1895 return AMDGPU::SI_SPILL_AV32_RESTORE;
1896 case 8:
1897 return AMDGPU::SI_SPILL_AV64_RESTORE;
1898 case 12:
1899 return AMDGPU::SI_SPILL_AV96_RESTORE;
1900 case 16:
1901 return AMDGPU::SI_SPILL_AV128_RESTORE;
1902 case 20:
1903 return AMDGPU::SI_SPILL_AV160_RESTORE;
1904 case 24:
1905 return AMDGPU::SI_SPILL_AV192_RESTORE;
1906 case 28:
1907 return AMDGPU::SI_SPILL_AV224_RESTORE;
1908 case 32:
1909 return AMDGPU::SI_SPILL_AV256_RESTORE;
1910 case 36:
1911 return AMDGPU::SI_SPILL_AV288_RESTORE;
1912 case 40:
1913 return AMDGPU::SI_SPILL_AV320_RESTORE;
1914 case 44:
1915 return AMDGPU::SI_SPILL_AV352_RESTORE;
1916 case 48:
1917 return AMDGPU::SI_SPILL_AV384_RESTORE;
1918 case 64:
1919 return AMDGPU::SI_SPILL_AV512_RESTORE;
1920 case 128:
1921 return AMDGPU::SI_SPILL_AV1024_RESTORE;
1922 default:
1923 llvm_unreachable("unknown register size");
1924 }
1925}
1926
1927static unsigned getWWMRegSpillRestoreOpcode(unsigned Size,
1928 bool IsVectorSuperClass) {
1929 // Currently, only 32-bit WWM register spills are needed.
1930 if (Size != 4)
1931 llvm_unreachable("unknown wwm register spill size");
1932
1933 if (IsVectorSuperClass)
1934 return AMDGPU::SI_SPILL_WWM_AV32_RESTORE;
1935
1936 return AMDGPU::SI_SPILL_WWM_V32_RESTORE;
1937}
1938
1939static unsigned
1940getVectorRegSpillRestoreOpcode(Register Reg, const TargetRegisterClass *RC,
1941 unsigned Size, const SIRegisterInfo &TRI,
1942 const SIMachineFunctionInfo &MFI) {
1943 bool IsVectorSuperClass = TRI.isVectorSuperClass(RC);
1944
1945 // Choose the right opcode if restoring a WWM register.
1946 if (MFI.checkFlag(Reg, Flag: AMDGPU::VirtRegFlag::WWM_REG))
1947 return getWWMRegSpillRestoreOpcode(Size, IsVectorSuperClass);
1948
1949 if (IsVectorSuperClass)
1950 return getAVSpillRestoreOpcode(Size);
1951
1952 return TRI.isAGPRClass(RC) ? getAGPRSpillRestoreOpcode(Size)
1953 : getVGPRSpillRestoreOpcode(Size);
1954}
1955
1956void SIInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB,
1957 MachineBasicBlock::iterator MI,
1958 Register DestReg, int FrameIndex,
1959 const TargetRegisterClass *RC,
1960 const TargetRegisterInfo *TRI,
1961 Register VReg,
1962 MachineInstr::MIFlag Flags) const {
1963 MachineFunction *MF = MBB.getParent();
1964 SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
1965 MachineFrameInfo &FrameInfo = MF->getFrameInfo();
1966 const DebugLoc &DL = MBB.findDebugLoc(MBBI: MI);
1967 unsigned SpillSize = TRI->getSpillSize(RC: *RC);
1968
1969 MachinePointerInfo PtrInfo
1970 = MachinePointerInfo::getFixedStack(MF&: *MF, FI: FrameIndex);
1971
1972 MachineMemOperand *MMO = MF->getMachineMemOperand(
1973 PtrInfo, F: MachineMemOperand::MOLoad, Size: FrameInfo.getObjectSize(ObjectIdx: FrameIndex),
1974 BaseAlignment: FrameInfo.getObjectAlign(ObjectIdx: FrameIndex));
1975
1976 if (RI.isSGPRClass(RC)) {
1977 MFI->setHasSpilledSGPRs();
1978 assert(DestReg != AMDGPU::M0 && "m0 should not be reloaded into");
1979 assert(DestReg != AMDGPU::EXEC_LO && DestReg != AMDGPU::EXEC_HI &&
1980 DestReg != AMDGPU::EXEC && "exec should not be spilled");
1981
1982 // FIXME: Maybe this should not include a memoperand because it will be
1983 // lowered to non-memory instructions.
1984 const MCInstrDesc &OpDesc = get(Opcode: getSGPRSpillRestoreOpcode(Size: SpillSize));
1985 if (DestReg.isVirtual() && SpillSize == 4) {
1986 MachineRegisterInfo &MRI = MF->getRegInfo();
1987 MRI.constrainRegClass(Reg: DestReg, RC: &AMDGPU::SReg_32_XM0_XEXECRegClass);
1988 }
1989
1990 if (RI.spillSGPRToVGPR())
1991 FrameInfo.setStackID(ObjectIdx: FrameIndex, ID: TargetStackID::SGPRSpill);
1992 BuildMI(BB&: MBB, I: MI, MIMD: DL, MCID: OpDesc, DestReg)
1993 .addFrameIndex(Idx: FrameIndex) // addr
1994 .addMemOperand(MMO)
1995 .addReg(RegNo: MFI->getStackPtrOffsetReg(), flags: RegState::Implicit);
1996
1997 return;
1998 }
1999
2000 unsigned Opcode = getVectorRegSpillRestoreOpcode(Reg: VReg ? VReg : DestReg, RC,
2001 Size: SpillSize, TRI: RI, MFI: *MFI);
2002 BuildMI(BB&: MBB, I: MI, MIMD: DL, MCID: get(Opcode), DestReg)
2003 .addFrameIndex(Idx: FrameIndex) // vaddr
2004 .addReg(RegNo: MFI->getStackPtrOffsetReg()) // scratch_offset
2005 .addImm(Val: 0) // offset
2006 .addMemOperand(MMO);
2007}
2008
2009void SIInstrInfo::insertNoop(MachineBasicBlock &MBB,
2010 MachineBasicBlock::iterator MI) const {
2011 insertNoops(MBB, MI, Quantity: 1);
2012}
2013
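// S_NOP's immediate encodes (number of wait states - 1) and covers at most 8
// nops per instruction, so larger requests are emitted in chunks of 8.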
2014void SIInstrInfo::insertNoops(MachineBasicBlock &MBB,
2015 MachineBasicBlock::iterator MI,
2016 unsigned Quantity) const {
2017 DebugLoc DL = MBB.findDebugLoc(MBBI: MI);
2018 while (Quantity > 0) {
2019 unsigned Arg = std::min(a: Quantity, b: 8u);
2020 Quantity -= Arg;
2021 BuildMI(BB&: MBB, I: MI, MIMD: DL, MCID: get(Opcode: AMDGPU::S_NOP)).addImm(Val: Arg - 1);
2022 }
2023}
2024
2025void SIInstrInfo::insertReturn(MachineBasicBlock &MBB) const {
2026 auto *MF = MBB.getParent();
2027 SIMachineFunctionInfo *Info = MF->getInfo<SIMachineFunctionInfo>();
2028
2029 assert(Info->isEntryFunction());
2030
2031 if (MBB.succ_empty()) {
2032 bool HasNoTerminator = MBB.getFirstTerminator() == MBB.end();
2033 if (HasNoTerminator) {
2034 if (Info->returnsVoid()) {
2035 BuildMI(BB&: MBB, I: MBB.end(), MIMD: DebugLoc(), MCID: get(Opcode: AMDGPU::S_ENDPGM)).addImm(Val: 0);
2036 } else {
2037 BuildMI(BB&: MBB, I: MBB.end(), MIMD: DebugLoc(), MCID: get(Opcode: AMDGPU::SI_RETURN_TO_EPILOG));
2038 }
2039 }
2040 }
2041}
2042
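// The simulated trap expands to: s_trap, read the doorbell ID with
// s_sendmsg_rtn, save m0 to ttmp2, mask the doorbell ID and set the queue
// wave abort bit, signal the interrupt, restore m0, and spin in an s_sethalt
// loop. If the block has live code after MI, it is split first and the trap
// body is only entered when EXEC is non-zero.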
2043MachineBasicBlock *SIInstrInfo::insertSimulatedTrap(MachineRegisterInfo &MRI,
2044 MachineBasicBlock &MBB,
2045 MachineInstr &MI,
2046 const DebugLoc &DL) const {
2047 MachineFunction *MF = MBB.getParent();
2048 constexpr unsigned DoorbellIDMask = 0x3ff;
2049 constexpr unsigned ECQueueWaveAbort = 0x400;
2050
2051 MachineBasicBlock *TrapBB = &MBB;
2052 MachineBasicBlock *ContBB = &MBB;
2053 MachineBasicBlock *HaltLoopBB = MF->CreateMachineBasicBlock();
2054
2055 if (!MBB.succ_empty() || std::next(x: MI.getIterator()) != MBB.end()) {
2056 ContBB = MBB.splitAt(SplitInst&: MI, /*UpdateLiveIns=*/false);
2057 TrapBB = MF->CreateMachineBasicBlock();
2058 BuildMI(BB&: MBB, I&: MI, MIMD: DL, MCID: get(Opcode: AMDGPU::S_CBRANCH_EXECNZ)).addMBB(MBB: TrapBB);
2059 MF->push_back(MBB: TrapBB);
2060 MBB.addSuccessor(Succ: TrapBB);
2061 }
2062
2063 // Start with an `s_trap 2`; if we're in PRIV=1 and need the workaround, this
2064 // will be a nop.
2065 BuildMI(BB&: *TrapBB, I: TrapBB->end(), MIMD: DL, MCID: get(Opcode: AMDGPU::S_TRAP))
2066 .addImm(Val: static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap));
2067 Register DoorbellReg = MRI.createVirtualRegister(RegClass: &AMDGPU::SReg_32RegClass);
2068 BuildMI(BB&: *TrapBB, I: TrapBB->end(), MIMD: DL, MCID: get(Opcode: AMDGPU::S_SENDMSG_RTN_B32),
2069 DestReg: DoorbellReg)
2070 .addImm(Val: AMDGPU::SendMsg::ID_RTN_GET_DOORBELL);
2071 BuildMI(BB&: *TrapBB, I: TrapBB->end(), MIMD: DL, MCID: get(Opcode: AMDGPU::S_MOV_B32), DestReg: AMDGPU::TTMP2)
2072 .addUse(RegNo: AMDGPU::M0);
2073 Register DoorbellRegMasked =
2074 MRI.createVirtualRegister(RegClass: &AMDGPU::SReg_32RegClass);
2075 BuildMI(BB&: *TrapBB, I: TrapBB->end(), MIMD: DL, MCID: get(Opcode: AMDGPU::S_AND_B32), DestReg: DoorbellRegMasked)
2076 .addUse(RegNo: DoorbellReg)
2077 .addImm(Val: DoorbellIDMask);
2078 Register SetWaveAbortBit =
2079 MRI.createVirtualRegister(RegClass: &AMDGPU::SReg_32RegClass);
2080 BuildMI(BB&: *TrapBB, I: TrapBB->end(), MIMD: DL, MCID: get(Opcode: AMDGPU::S_OR_B32), DestReg: SetWaveAbortBit)
2081 .addUse(RegNo: DoorbellRegMasked)
2082 .addImm(Val: ECQueueWaveAbort);
2083 BuildMI(BB&: *TrapBB, I: TrapBB->end(), MIMD: DL, MCID: get(Opcode: AMDGPU::S_MOV_B32), DestReg: AMDGPU::M0)
2084 .addUse(RegNo: SetWaveAbortBit);
2085 BuildMI(BB&: *TrapBB, I: TrapBB->end(), MIMD: DL, MCID: get(Opcode: AMDGPU::S_SENDMSG))
2086 .addImm(Val: AMDGPU::SendMsg::ID_INTERRUPT);
2087 BuildMI(BB&: *TrapBB, I: TrapBB->end(), MIMD: DL, MCID: get(Opcode: AMDGPU::S_MOV_B32), DestReg: AMDGPU::M0)
2088 .addUse(RegNo: AMDGPU::TTMP2);
2089 BuildMI(BB&: *TrapBB, I: TrapBB->end(), MIMD: DL, MCID: get(Opcode: AMDGPU::S_BRANCH)).addMBB(MBB: HaltLoopBB);
2090 TrapBB->addSuccessor(Succ: HaltLoopBB);
2091
2092 BuildMI(BB&: *HaltLoopBB, I: HaltLoopBB->end(), MIMD: DL, MCID: get(Opcode: AMDGPU::S_SETHALT)).addImm(Val: 5);
2093 BuildMI(BB&: *HaltLoopBB, I: HaltLoopBB->end(), MIMD: DL, MCID: get(Opcode: AMDGPU::S_BRANCH))
2094 .addMBB(MBB: HaltLoopBB);
2095 MF->push_back(MBB: HaltLoopBB);
2096 HaltLoopBB->addSuccessor(Succ: HaltLoopBB);
2097
2098 return ContBB;
2099}
2100
2101unsigned SIInstrInfo::getNumWaitStates(const MachineInstr &MI) {
2102 switch (MI.getOpcode()) {
2103 default:
2104 if (MI.isMetaInstruction())
2105 return 0;
2106 return 1; // FIXME: Do wait states equal cycles?
2107
2108 case AMDGPU::S_NOP:
2109 return MI.getOperand(i: 0).getImm() + 1;
2110 // SI_RETURN_TO_EPILOG is a fallthrough to code outside of the function. The
2111 // hazard, even if one exists, won't really be visible. Should we handle it?
2112 }
2113}
2114
2115bool SIInstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
2116 MachineBasicBlock &MBB = *MI.getParent();
2117 DebugLoc DL = MBB.findDebugLoc(MBBI: MI);
2118 switch (MI.getOpcode()) {
2119 default: return TargetInstrInfo::expandPostRAPseudo(MI);
2120 case AMDGPU::S_MOV_B64_term:
2121 // This is only a terminator to get the correct spill code placement during
2122 // register allocation.
2123 MI.setDesc(get(Opcode: AMDGPU::S_MOV_B64));
2124 break;
2125
2126 case AMDGPU::S_MOV_B32_term:
2127 // This is only a terminator to get the correct spill code placement during
2128 // register allocation.
2129 MI.setDesc(get(Opcode: AMDGPU::S_MOV_B32));
2130 break;
2131
2132 case AMDGPU::S_XOR_B64_term:
2133 // This is only a terminator to get the correct spill code placement during
2134 // register allocation.
2135 MI.setDesc(get(Opcode: AMDGPU::S_XOR_B64));
2136 break;
2137
2138 case AMDGPU::S_XOR_B32_term:
2139 // This is only a terminator to get the correct spill code placement during
2140 // register allocation.
2141 MI.setDesc(get(Opcode: AMDGPU::S_XOR_B32));
2142 break;
2143 case AMDGPU::S_OR_B64_term:
2144 // This is only a terminator to get the correct spill code placement during
2145 // register allocation.
2146 MI.setDesc(get(Opcode: AMDGPU::S_OR_B64));
2147 break;
2148 case AMDGPU::S_OR_B32_term:
2149 // This is only a terminator to get the correct spill code placement during
2150 // register allocation.
2151 MI.setDesc(get(Opcode: AMDGPU::S_OR_B32));
2152 break;
2153
2154 case AMDGPU::S_ANDN2_B64_term:
2155 // This is only a terminator to get the correct spill code placement during
2156 // register allocation.
2157 MI.setDesc(get(Opcode: AMDGPU::S_ANDN2_B64));
2158 break;
2159
2160 case AMDGPU::S_ANDN2_B32_term:
2161 // This is only a terminator to get the correct spill code placement during
2162 // register allocation.
2163 MI.setDesc(get(Opcode: AMDGPU::S_ANDN2_B32));
2164 break;
2165
2166 case AMDGPU::S_AND_B64_term:
2167 // This is only a terminator to get the correct spill code placement during
2168 // register allocation.
2169 MI.setDesc(get(Opcode: AMDGPU::S_AND_B64));
2170 break;
2171
2172 case AMDGPU::S_AND_B32_term:
2173 // This is only a terminator to get the correct spill code placement during
2174 // register allocation.
2175 MI.setDesc(get(Opcode: AMDGPU::S_AND_B32));
2176 break;
2177
2178 case AMDGPU::S_AND_SAVEEXEC_B64_term:
2179 // This is only a terminator to get the correct spill code placement during
2180 // register allocation.
2181 MI.setDesc(get(Opcode: AMDGPU::S_AND_SAVEEXEC_B64));
2182 break;
2183
2184 case AMDGPU::S_AND_SAVEEXEC_B32_term:
2185 // This is only a terminator to get the correct spill code placement during
2186 // register allocation.
2187 MI.setDesc(get(Opcode: AMDGPU::S_AND_SAVEEXEC_B32));
2188 break;
2189
2190 case AMDGPU::SI_SPILL_S32_TO_VGPR:
2191 MI.setDesc(get(Opcode: AMDGPU::V_WRITELANE_B32));
2192 break;
2193
2194 case AMDGPU::SI_RESTORE_S32_FROM_VGPR:
2195 MI.setDesc(get(Opcode: AMDGPU::V_READLANE_B32));
2196 MI.getMF()->getRegInfo().constrainRegClass(Reg: MI.getOperand(i: 0).getReg(),
2197 RC: &AMDGPU::SReg_32_XM0RegClass);
2198 break;
2199 case AMDGPU::AV_MOV_B32_IMM_PSEUDO: {
2200 Register Dst = MI.getOperand(i: 0).getReg();
2201 bool IsAGPR = SIRegisterInfo::isAGPRClass(RC: RI.getPhysRegBaseClass(Reg: Dst));
2202 MI.setDesc(
2203 get(Opcode: IsAGPR ? AMDGPU::V_ACCVGPR_WRITE_B32_e64 : AMDGPU::V_MOV_B32_e32));
2204 break;
2205 }
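  // V_MOV_B64_PSEUDO becomes a single v_mov_b64 or v_pk_mov_b32 where the
  // subtarget supports it; otherwise it is split into two v_mov_b32 of the
  // low and high halves, with both halves implicitly defining the full
  // 64-bit register.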
2206 case AMDGPU::V_MOV_B64_PSEUDO: {
2207 Register Dst = MI.getOperand(i: 0).getReg();
2208 Register DstLo = RI.getSubReg(Reg: Dst, Idx: AMDGPU::sub0);
2209 Register DstHi = RI.getSubReg(Reg: Dst, Idx: AMDGPU::sub1);
2210
2211 const MachineOperand &SrcOp = MI.getOperand(i: 1);
2212 // FIXME: Will this work for 64-bit floating point immediates?
2213 assert(!SrcOp.isFPImm());
2214 if (ST.hasMovB64()) {
2215 MI.setDesc(get(Opcode: AMDGPU::V_MOV_B64_e32));
2216 if (SrcOp.isReg() || isInlineConstant(MI, OpIdx: 1) ||
2217 isUInt<32>(x: SrcOp.getImm()))
2218 break;
2219 }
2220 if (SrcOp.isImm()) {
2221 APInt Imm(64, SrcOp.getImm());
2222 APInt Lo(32, Imm.getLoBits(numBits: 32).getZExtValue());
2223 APInt Hi(32, Imm.getHiBits(numBits: 32).getZExtValue());
2224 if (ST.hasPkMovB32() && Lo == Hi && isInlineConstant(Imm: Lo)) {
2225 BuildMI(BB&: MBB, I&: MI, MIMD: DL, MCID: get(Opcode: AMDGPU::V_PK_MOV_B32), DestReg: Dst)
2226 .addImm(Val: SISrcMods::OP_SEL_1)
2227 .addImm(Val: Lo.getSExtValue())
2228 .addImm(Val: SISrcMods::OP_SEL_1)
2229 .addImm(Val: Lo.getSExtValue())
2230 .addImm(Val: 0) // op_sel_lo
2231 .addImm(Val: 0) // op_sel_hi
2232 .addImm(Val: 0) // neg_lo
2233 .addImm(Val: 0) // neg_hi
2234 .addImm(Val: 0); // clamp
2235 } else {
2236 BuildMI(BB&: MBB, I&: MI, MIMD: DL, MCID: get(Opcode: AMDGPU::V_MOV_B32_e32), DestReg: DstLo)
2237 .addImm(Val: Lo.getSExtValue())
2238 .addReg(RegNo: Dst, flags: RegState::Implicit | RegState::Define);
2239 BuildMI(BB&: MBB, I&: MI, MIMD: DL, MCID: get(Opcode: AMDGPU::V_MOV_B32_e32), DestReg: DstHi)
2240 .addImm(Val: Hi.getSExtValue())
2241 .addReg(RegNo: Dst, flags: RegState::Implicit | RegState::Define);
2242 }
2243 } else {
2244 assert(SrcOp.isReg());
2245 if (ST.hasPkMovB32() &&
2246 !RI.isAGPR(MRI: MBB.getParent()->getRegInfo(), Reg: SrcOp.getReg())) {
2247 BuildMI(BB&: MBB, I&: MI, MIMD: DL, MCID: get(Opcode: AMDGPU::V_PK_MOV_B32), DestReg: Dst)
2248 .addImm(Val: SISrcMods::OP_SEL_1) // src0_mod
2249 .addReg(RegNo: SrcOp.getReg())
2250 .addImm(Val: SISrcMods::OP_SEL_0 | SISrcMods::OP_SEL_1) // src1_mod
2251 .addReg(RegNo: SrcOp.getReg())
2252 .addImm(Val: 0) // op_sel_lo
2253 .addImm(Val: 0) // op_sel_hi
2254 .addImm(Val: 0) // neg_lo
2255 .addImm(Val: 0) // neg_hi
2256 .addImm(Val: 0); // clamp
2257 } else {
2258 BuildMI(BB&: MBB, I&: MI, MIMD: DL, MCID: get(Opcode: AMDGPU::V_MOV_B32_e32), DestReg: DstLo)
2259 .addReg(RegNo: RI.getSubReg(Reg: SrcOp.getReg(), Idx: AMDGPU::sub0))
2260 .addReg(RegNo: Dst, flags: RegState::Implicit | RegState::Define);
2261 BuildMI(BB&: MBB, I&: MI, MIMD: DL, MCID: get(Opcode: AMDGPU::V_MOV_B32_e32), DestReg: DstHi)
2262 .addReg(RegNo: RI.getSubReg(Reg: SrcOp.getReg(), Idx: AMDGPU::sub1))
2263 .addReg(RegNo: Dst, flags: RegState::Implicit | RegState::Define);
2264 }
2265 }
2266 MI.eraseFromParent();
2267 break;
2268 }
2269 case AMDGPU::V_MOV_B64_DPP_PSEUDO: {
2270 expandMovDPP64(MI);
2271 break;
2272 }
2273 case AMDGPU::S_MOV_B64_IMM_PSEUDO: {
2274 const MachineOperand &SrcOp = MI.getOperand(i: 1);
2275 assert(!SrcOp.isFPImm());
2276 APInt Imm(64, SrcOp.getImm());
2277 if (Imm.isIntN(N: 32) || isInlineConstant(Imm)) {
2278 MI.setDesc(get(Opcode: AMDGPU::S_MOV_B64));
2279 break;
2280 }
2281
2282 Register Dst = MI.getOperand(i: 0).getReg();
2283 Register DstLo = RI.getSubReg(Reg: Dst, Idx: AMDGPU::sub0);
2284 Register DstHi = RI.getSubReg(Reg: Dst, Idx: AMDGPU::sub1);
2285
2286 APInt Lo(32, Imm.getLoBits(numBits: 32).getZExtValue());
2287 APInt Hi(32, Imm.getHiBits(numBits: 32).getZExtValue());
2288 BuildMI(BB&: MBB, I&: MI, MIMD: DL, MCID: get(Opcode: AMDGPU::S_MOV_B32), DestReg: DstLo)
2289 .addImm(Val: Lo.getSExtValue())
2290 .addReg(RegNo: Dst, flags: RegState::Implicit | RegState::Define);
2291 BuildMI(BB&: MBB, I&: MI, MIMD: DL, MCID: get(Opcode: AMDGPU::S_MOV_B32), DestReg: DstHi)
2292 .addImm(Val: Hi.getSExtValue())
2293 .addReg(RegNo: Dst, flags: RegState::Implicit | RegState::Define);
2294 MI.eraseFromParent();
2295 break;
2296 }
2297 case AMDGPU::V_SET_INACTIVE_B32: {
2298 // Lower V_SET_INACTIVE_B32 to V_CNDMASK_B32.
2299 Register DstReg = MI.getOperand(i: 0).getReg();
2300 BuildMI(BB&: MBB, I&: MI, MIMD: DL, MCID: get(Opcode: AMDGPU::V_CNDMASK_B32_e64), DestReg: DstReg)
2301 .add(MO: MI.getOperand(i: 3))
2302 .add(MO: MI.getOperand(i: 4))
2303 .add(MO: MI.getOperand(i: 1))
2304 .add(MO: MI.getOperand(i: 2))
2305 .add(MO: MI.getOperand(i: 5));
2306 MI.eraseFromParent();
2307 break;
2308 }
2309 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V1:
2310 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V2:
2311 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V3:
2312 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V4:
2313 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V5:
2314 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V8:
2315 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V9:
2316 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V10:
2317 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V11:
2318 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V12:
2319 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V16:
2320 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V32:
2321 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V1:
2322 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V2:
2323 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V3:
2324 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V4:
2325 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V5:
2326 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V8:
2327 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V9:
2328 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V10:
2329 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V11:
2330 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V12:
2331 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V16:
2332 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V32:
2333 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V1:
2334 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V2:
2335 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V4:
2336 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V8:
2337 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V16: {
2338 const TargetRegisterClass *EltRC = getOpRegClass(MI, OpNo: 2);
2339
2340 unsigned Opc;
2341 if (RI.hasVGPRs(RC: EltRC)) {
2342 Opc = AMDGPU::V_MOVRELD_B32_e32;
2343 } else {
2344 Opc = RI.getRegSizeInBits(RC: *EltRC) == 64 ? AMDGPU::S_MOVRELD_B64
2345 : AMDGPU::S_MOVRELD_B32;
2346 }
2347
2348 const MCInstrDesc &OpDesc = get(Opcode: Opc);
2349 Register VecReg = MI.getOperand(i: 0).getReg();
2350 bool IsUndef = MI.getOperand(i: 1).isUndef();
2351 unsigned SubReg = MI.getOperand(i: 3).getImm();
2352 assert(VecReg == MI.getOperand(1).getReg());
2353
2354 MachineInstrBuilder MIB =
2355 BuildMI(BB&: MBB, I&: MI, MIMD: DL, MCID: OpDesc)
2356 .addReg(RegNo: RI.getSubReg(Reg: VecReg, Idx: SubReg), flags: RegState::Undef)
2357 .add(MO: MI.getOperand(i: 2))
2358 .addReg(RegNo: VecReg, flags: RegState::ImplicitDefine)
2359 .addReg(RegNo: VecReg, flags: RegState::Implicit | (IsUndef ? RegState::Undef : 0));
2360
2361 const int ImpDefIdx =
2362 OpDesc.getNumOperands() + OpDesc.implicit_uses().size();
2363 const int ImpUseIdx = ImpDefIdx + 1;
2364 MIB->tieOperands(DefIdx: ImpDefIdx, UseIdx: ImpUseIdx);
2365 MI.eraseFromParent();
2366 break;
2367 }
2368 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V1:
2369 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V2:
2370 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V3:
2371 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V4:
2372 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V5:
2373 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V8:
2374 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V9:
2375 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V10:
2376 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V11:
2377 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V12:
2378 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V16:
2379 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V32: {
2380 assert(ST.useVGPRIndexMode());
2381 Register VecReg = MI.getOperand(i: 0).getReg();
2382 bool IsUndef = MI.getOperand(i: 1).isUndef();
2383 MachineOperand &Idx = MI.getOperand(i: 3);
2384 Register SubReg = MI.getOperand(i: 4).getImm();
2385
2386 MachineInstr *SetOn = BuildMI(BB&: MBB, I&: MI, MIMD: DL, MCID: get(Opcode: AMDGPU::S_SET_GPR_IDX_ON))
2387 .add(MO: Idx)
2388 .addImm(Val: AMDGPU::VGPRIndexMode::DST_ENABLE);
2389 SetOn->getOperand(i: 3).setIsUndef();
2390
2391 const MCInstrDesc &OpDesc = get(Opcode: AMDGPU::V_MOV_B32_indirect_write);
2392 MachineInstrBuilder MIB =
2393 BuildMI(BB&: MBB, I&: MI, MIMD: DL, MCID: OpDesc)
2394 .addReg(RegNo: RI.getSubReg(Reg: VecReg, Idx: SubReg), flags: RegState::Undef)
2395 .add(MO: MI.getOperand(i: 2))
2396 .addReg(RegNo: VecReg, flags: RegState::ImplicitDefine)
2397 .addReg(RegNo: VecReg,
2398 flags: RegState::Implicit | (IsUndef ? RegState::Undef : 0));
2399
2400 const int ImpDefIdx =
2401 OpDesc.getNumOperands() + OpDesc.implicit_uses().size();
2402 const int ImpUseIdx = ImpDefIdx + 1;
2403 MIB->tieOperands(DefIdx: ImpDefIdx, UseIdx: ImpUseIdx);
2404
2405 MachineInstr *SetOff = BuildMI(BB&: MBB, I&: MI, MIMD: DL, MCID: get(Opcode: AMDGPU::S_SET_GPR_IDX_OFF));
2406
2407 finalizeBundle(MBB, FirstMI: SetOn->getIterator(), LastMI: std::next(x: SetOff->getIterator()));
2408
2409 MI.eraseFromParent();
2410 break;
2411 }
2412 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V1:
2413 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V2:
2414 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V3:
2415 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V4:
2416 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V5:
2417 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V8:
2418 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V9:
2419 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V10:
2420 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V11:
2421 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V12:
2422 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V16:
2423 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V32: {
2424 assert(ST.useVGPRIndexMode());
2425 Register Dst = MI.getOperand(i: 0).getReg();
2426 Register VecReg = MI.getOperand(i: 1).getReg();
2427 bool IsUndef = MI.getOperand(i: 1).isUndef();
2428 Register Idx = MI.getOperand(i: 2).getReg();
2429 Register SubReg = MI.getOperand(i: 3).getImm();
2430
2431 MachineInstr *SetOn = BuildMI(BB&: MBB, I&: MI, MIMD: DL, MCID: get(Opcode: AMDGPU::S_SET_GPR_IDX_ON))
2432 .addReg(RegNo: Idx)
2433 .addImm(Val: AMDGPU::VGPRIndexMode::SRC0_ENABLE);
2434 SetOn->getOperand(i: 3).setIsUndef();
2435
2436 BuildMI(BB&: MBB, I&: MI, MIMD: DL, MCID: get(Opcode: AMDGPU::V_MOV_B32_indirect_read))
2437 .addDef(RegNo: Dst)
2438 .addReg(RegNo: RI.getSubReg(Reg: VecReg, Idx: SubReg), flags: RegState::Undef)
2439 .addReg(RegNo: VecReg, flags: RegState::Implicit | (IsUndef ? RegState::Undef : 0));
2440
2441 MachineInstr *SetOff = BuildMI(BB&: MBB, I&: MI, MIMD: DL, MCID: get(Opcode: AMDGPU::S_SET_GPR_IDX_OFF));
2442
2443 finalizeBundle(MBB, FirstMI: SetOn->getIterator(), LastMI: std::next(x: SetOff->getIterator()));
2444
2445 MI.eraseFromParent();
2446 break;
2447 }
2448 case AMDGPU::SI_PC_ADD_REL_OFFSET: {
2449 MachineFunction &MF = *MBB.getParent();
2450 Register Reg = MI.getOperand(i: 0).getReg();
2451 Register RegLo = RI.getSubReg(Reg, Idx: AMDGPU::sub0);
2452 Register RegHi = RI.getSubReg(Reg, Idx: AMDGPU::sub1);
2453 MachineOperand OpLo = MI.getOperand(i: 1);
2454 MachineOperand OpHi = MI.getOperand(i: 2);
2455
2456 // Create a bundle so these instructions won't be re-ordered by the
2457 // post-RA scheduler.
2458 MIBundleBuilder Bundler(MBB, MI);
2459 Bundler.append(MI: BuildMI(MF, MIMD: DL, MCID: get(Opcode: AMDGPU::S_GETPC_B64), DestReg: Reg));
2460
2461 // What we want here is an offset from the value returned by s_getpc (which
2462 // is the address of the s_add_u32 instruction) to the global variable, but
2463 // since the encoding of $symbol starts 4 bytes after the start of the
2464 // s_add_u32 instruction, we end up with an offset that is 4 bytes too
2465 // small. This requires us to add 4 to the global variable offset in order
2466 // to compute the correct address. Similarly for the s_addc_u32 instruction,
2467 // the encoding of $symbol starts 12 bytes after the start of the s_add_u32
2468 // instruction.
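    // As a rough sketch (register numbers are illustrative), the bundle
    // lowers to something like:
    //   s_getpc_b64 s[0:1]
    //   s_add_u32   s0, s0, sym@rel32@lo+4
    //   s_addc_u32  s1, s1, sym@rel32@hi+12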
2469
2470 int64_t Adjust = 0;
2471 if (ST.hasGetPCZeroExtension()) {
2472 // Fix up hardware that does not sign-extend the 48-bit PC value by
2473 // inserting: s_sext_i32_i16 reghi, reghi
2474 Bundler.append(
2475 MI: BuildMI(MF, MIMD: DL, MCID: get(Opcode: AMDGPU::S_SEXT_I32_I16), DestReg: RegHi).addReg(RegNo: RegHi));
2476 Adjust += 4;
2477 }
2478
2479 if (OpLo.isGlobal())
2480 OpLo.setOffset(OpLo.getOffset() + Adjust + 4);
2481 Bundler.append(
2482 MI: BuildMI(MF, MIMD: DL, MCID: get(Opcode: AMDGPU::S_ADD_U32), DestReg: RegLo).addReg(RegNo: RegLo).add(MO: OpLo));
2483
2484 if (OpHi.isGlobal())
2485 OpHi.setOffset(OpHi.getOffset() + Adjust + 12);
2486 Bundler.append(MI: BuildMI(MF, MIMD: DL, MCID: get(Opcode: AMDGPU::S_ADDC_U32), DestReg: RegHi)
2487 .addReg(RegNo: RegHi)
2488 .add(MO: OpHi));
2489
2490 finalizeBundle(MBB, FirstMI: Bundler.begin());
2491
2492 MI.eraseFromParent();
2493 break;
2494 }
2495 case AMDGPU::ENTER_STRICT_WWM: {
2496 // This only gets its own opcode so that SIPreAllocateWWMRegs can tell when
2497 // Whole Wave Mode is entered.
2498 MI.setDesc(get(Opcode: ST.isWave32() ? AMDGPU::S_OR_SAVEEXEC_B32
2499 : AMDGPU::S_OR_SAVEEXEC_B64));
2500 break;
2501 }
2502 case AMDGPU::ENTER_STRICT_WQM: {
2503 // This only gets its own opcode so that SIPreAllocateWWMRegs can tell when
2504 // STRICT_WQM is entered.
2505 const unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
2506 const unsigned WQMOp = ST.isWave32() ? AMDGPU::S_WQM_B32 : AMDGPU::S_WQM_B64;
2507 const unsigned MovOp = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
2508 BuildMI(BB&: MBB, I&: MI, MIMD: DL, MCID: get(Opcode: MovOp), DestReg: MI.getOperand(i: 0).getReg()).addReg(RegNo: Exec);
2509 BuildMI(BB&: MBB, I&: MI, MIMD: DL, MCID: get(Opcode: WQMOp), DestReg: Exec).addReg(RegNo: Exec);
2510
2511 MI.eraseFromParent();
2512 break;
2513 }
2514 case AMDGPU::EXIT_STRICT_WWM:
2515 case AMDGPU::EXIT_STRICT_WQM: {
2516 // This only gets its own opcode so that SIPreAllocateWWMRegs can tell when
2517 // WWM/STRICT_WQM is exited.
2518 MI.setDesc(get(Opcode: ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64));
2519 break;
2520 }
2521 case AMDGPU::SI_RETURN: {
2522 const MachineFunction *MF = MBB.getParent();
2523 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
2524 const SIRegisterInfo *TRI = ST.getRegisterInfo();
2525 // Hiding the return address use with SI_RETURN may lead to extra kills in
2526 // the function and missing live-ins. We are fine in practice because callee
2527 // saved register handling ensures the register value is restored before
2528 // RET, but we need the undef flag here to appease the MachineVerifier
2529 // liveness checks.
2530 MachineInstrBuilder MIB =
2531 BuildMI(BB&: MBB, I&: MI, MIMD: DL, MCID: get(Opcode: AMDGPU::S_SETPC_B64_return))
2532 .addReg(RegNo: TRI->getReturnAddressReg(MF: *MF), flags: RegState::Undef);
2533
2534 MIB.copyImplicitOps(OtherMI: MI);
2535 MI.eraseFromParent();
2536 break;
2537 }
2538
2539 case AMDGPU::S_MUL_U64_U32_PSEUDO:
2540 case AMDGPU::S_MUL_I64_I32_PSEUDO:
2541 MI.setDesc(get(Opcode: AMDGPU::S_MUL_U64));
2542 break;
2543
2544 case AMDGPU::S_GETPC_B64_pseudo:
2545 MI.setDesc(get(Opcode: AMDGPU::S_GETPC_B64));
2546 if (ST.hasGetPCZeroExtension()) {
2547 Register Dst = MI.getOperand(i: 0).getReg();
2548 Register DstHi = RI.getSubReg(Reg: Dst, Idx: AMDGPU::sub1);
2549 // Fix up hardware that does not sign-extend the 48-bit PC value by
2550 // inserting: s_sext_i32_i16 dsthi, dsthi
2551 BuildMI(BB&: MBB, I: std::next(x: MI.getIterator()), MIMD: DL, MCID: get(Opcode: AMDGPU::S_SEXT_I32_I16),
2552 DestReg: DstHi)
2553 .addReg(RegNo: DstHi);
2554 }
2555 break;
2556 }
2557 return true;
2558}
2559
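// When rematerializing a wide scalar load whose result is only consumed
// through a 128-bit or 256-bit subregister, this emits a correspondingly
// narrower S_LOAD with an adjusted byte offset instead of cloning the full
// DWORDX8/DWORDX16 load.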
2560void SIInstrInfo::reMaterialize(MachineBasicBlock &MBB,
2561 MachineBasicBlock::iterator I, Register DestReg,
2562 unsigned SubIdx, const MachineInstr &Orig,
2563 const TargetRegisterInfo &RI) const {
2564
2565 // Try shrinking the instruction to remat only the part needed for current
2566 // context.
2567 // TODO: Handle more cases.
2568 unsigned Opcode = Orig.getOpcode();
2569 switch (Opcode) {
2570 case AMDGPU::S_LOAD_DWORDX16_IMM:
2571 case AMDGPU::S_LOAD_DWORDX8_IMM: {
2572 if (SubIdx != 0)
2573 break;
2574
2575 if (I == MBB.end())
2576 break;
2577
2578 if (I->isBundled())
2579 break;
2580
2581 // Look for a single use of the register that is also a subregister access.
2582 Register RegToFind = Orig.getOperand(i: 0).getReg();
2583 MachineOperand *UseMO = nullptr;
2584 for (auto &CandMO : I->operands()) {
2585 if (!CandMO.isReg() || CandMO.getReg() != RegToFind || CandMO.isDef())
2586 continue;
2587 if (UseMO) {
2588 UseMO = nullptr;
2589 break;
2590 }
2591 UseMO = &CandMO;
2592 }
2593 if (!UseMO || UseMO->getSubReg() == AMDGPU::NoSubRegister)
2594 break;
2595
2596 unsigned Offset = RI.getSubRegIdxOffset(Idx: UseMO->getSubReg());
2597 unsigned SubregSize = RI.getSubRegIdxSize(Idx: UseMO->getSubReg());
2598
2599 MachineFunction *MF = MBB.getParent();
2600 MachineRegisterInfo &MRI = MF->getRegInfo();
2601 assert(MRI.use_nodbg_empty(DestReg) && "DestReg should have no users yet.");
2602
2603 unsigned NewOpcode = -1;
2604 if (SubregSize == 256)
2605 NewOpcode = AMDGPU::S_LOAD_DWORDX8_IMM;
2606 else if (SubregSize == 128)
2607 NewOpcode = AMDGPU::S_LOAD_DWORDX4_IMM;
2608 else
2609 break;
2610
2611 const MCInstrDesc &TID = get(Opcode: NewOpcode);
2612 const TargetRegisterClass *NewRC =
2613 RI.getAllocatableClass(RC: getRegClass(TID, OpNum: 0, TRI: &RI, MF: *MF));
2614 MRI.setRegClass(Reg: DestReg, RC: NewRC);
2615
2616 UseMO->setReg(DestReg);
2617 UseMO->setSubReg(AMDGPU::NoSubRegister);
2618
2619 // Use a smaller load with the desired size, possibly with updated offset.
2620 MachineInstr *MI = MF->CloneMachineInstr(Orig: &Orig);
2621 MI->setDesc(TID);
2622 MI->getOperand(i: 0).setReg(DestReg);
2623 MI->getOperand(i: 0).setSubReg(AMDGPU::NoSubRegister);
2624 if (Offset) {
2625 MachineOperand *OffsetMO = getNamedOperand(MI&: *MI, OperandName: AMDGPU::OpName::offset);
2626 int64_t FinalOffset = OffsetMO->getImm() + Offset / 8;
2627 OffsetMO->setImm(FinalOffset);
2628 }
2629 SmallVector<MachineMemOperand *> NewMMOs;
2630 for (const MachineMemOperand *MemOp : Orig.memoperands())
2631 NewMMOs.push_back(Elt: MF->getMachineMemOperand(MMO: MemOp, PtrInfo: MemOp->getPointerInfo(),
2632 Size: SubregSize / 8));
2633 MI->setMemRefs(MF&: *MF, MemRefs: NewMMOs);
2634
2635 MBB.insert(I, MI);
2636 return;
2637 }
2638
2639 default:
2640 break;
2641 }
2642
2643 TargetInstrInfo::reMaterialize(MBB, MI: I, DestReg, SubIdx, Orig, TRI: RI);
2644}
2645
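// Splits a 64-bit DPP move into two 32-bit V_MOV_B32_dpp on sub0/sub1 when
// the subtarget has no suitable v_mov_b64_dpp (or the DPP control is not
// legal for 64-bit DPALU), then rebuilds the result with a REG_SEQUENCE for
// virtual destinations.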
2646std::pair<MachineInstr*, MachineInstr*>
2647SIInstrInfo::expandMovDPP64(MachineInstr &MI) const {
2648 assert (MI.getOpcode() == AMDGPU::V_MOV_B64_DPP_PSEUDO);
2649
2650 if (ST.hasMovB64() &&
2651 AMDGPU::isLegalDPALU_DPPControl(
2652 DC: getNamedOperand(MI, OperandName: AMDGPU::OpName::dpp_ctrl)->getImm())) {
2653 MI.setDesc(get(Opcode: AMDGPU::V_MOV_B64_dpp));
2654 return std::pair(&MI, nullptr);
2655 }
2656
2657 MachineBasicBlock &MBB = *MI.getParent();
2658 DebugLoc DL = MBB.findDebugLoc(MBBI: MI);
2659 MachineFunction *MF = MBB.getParent();
2660 MachineRegisterInfo &MRI = MF->getRegInfo();
2661 Register Dst = MI.getOperand(i: 0).getReg();
2662 unsigned Part = 0;
2663 MachineInstr *Split[2];
2664
2665 for (auto Sub : { AMDGPU::sub0, AMDGPU::sub1 }) {
2666 auto MovDPP = BuildMI(BB&: MBB, I&: MI, MIMD: DL, MCID: get(Opcode: AMDGPU::V_MOV_B32_dpp));
2667 if (Dst.isPhysical()) {
2668 MovDPP.addDef(RegNo: RI.getSubReg(Reg: Dst, Idx: Sub));
2669 } else {
2670 assert(MRI.isSSA());
2671 auto Tmp = MRI.createVirtualRegister(RegClass: &AMDGPU::VGPR_32RegClass);
2672 MovDPP.addDef(RegNo: Tmp);
2673 }
2674
2675 for (unsigned I = 1; I <= 2; ++I) { // old and src operands.
2676 const MachineOperand &SrcOp = MI.getOperand(i: I);
2677 assert(!SrcOp.isFPImm());
2678 if (SrcOp.isImm()) {
2679 APInt Imm(64, SrcOp.getImm());
2680 Imm.ashrInPlace(ShiftAmt: Part * 32);
2681 MovDPP.addImm(Val: Imm.getLoBits(numBits: 32).getZExtValue());
2682 } else {
2683 assert(SrcOp.isReg());
2684 Register Src = SrcOp.getReg();
2685 if (Src.isPhysical())
2686 MovDPP.addReg(RegNo: RI.getSubReg(Reg: Src, Idx: Sub));
2687 else
2688 MovDPP.addReg(RegNo: Src, flags: SrcOp.isUndef() ? RegState::Undef : 0, SubReg: Sub);
2689 }
2690 }
2691
2692 for (const MachineOperand &MO : llvm::drop_begin(RangeOrContainer: MI.explicit_operands(), N: 3))
2693 MovDPP.addImm(Val: MO.getImm());
2694
2695 Split[Part] = MovDPP;
2696 ++Part;
2697 }
2698
2699 if (Dst.isVirtual())
2700 BuildMI(BB&: MBB, I&: MI, MIMD: DL, MCID: get(Opcode: AMDGPU::REG_SEQUENCE), DestReg: Dst)
2701 .addReg(RegNo: Split[0]->getOperand(i: 0).getReg())
2702 .addImm(Val: AMDGPU::sub0)
2703 .addReg(RegNo: Split[1]->getOperand(i: 0).getReg())
2704 .addImm(Val: AMDGPU::sub1);
2705
2706 MI.eraseFromParent();
2707 return std::pair(Split[0], Split[1]);
2708}
2709
2710std::optional<DestSourcePair>
2711SIInstrInfo::isCopyInstrImpl(const MachineInstr &MI) const {
2712 if (MI.getOpcode() == AMDGPU::WWM_COPY)
2713 return DestSourcePair{MI.getOperand(i: 0), MI.getOperand(i: 1)};
2714
2715 return std::nullopt;
2716}
2717
2718bool SIInstrInfo::swapSourceModifiers(MachineInstr &MI, MachineOperand &Src0,
2719 AMDGPU::OpName Src0OpName,
2720 MachineOperand &Src1,
2721 AMDGPU::OpName Src1OpName) const {
2722 MachineOperand *Src0Mods = getNamedOperand(MI, OperandName: Src0OpName);
2723 if (!Src0Mods)
2724 return false;
2725
2726 MachineOperand *Src1Mods = getNamedOperand(MI, OperandName: Src1OpName);
2727 assert(Src1Mods &&
2728 "All commutable instructions have both src0 and src1 modifiers");
2729
2730 int Src0ModsVal = Src0Mods->getImm();
2731 int Src1ModsVal = Src1Mods->getImm();
2732
2733 Src1Mods->setImm(Src0ModsVal);
2734 Src0Mods->setImm(Src1ModsVal);
2735 return true;
2736}
2737
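// Commutes a register operand with an immediate, frame index, or global by
// rewriting the register operand in place and moving the register (with its
// kill/dead/undef/debug flags and subregister) into the non-register slot.
// Returns nullptr if the non-register operand kind is unsupported.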
2738static MachineInstr *swapRegAndNonRegOperand(MachineInstr &MI,
2739 MachineOperand &RegOp,
2740 MachineOperand &NonRegOp) {
2741 Register Reg = RegOp.getReg();
2742 unsigned SubReg = RegOp.getSubReg();
2743 bool IsKill = RegOp.isKill();
2744 bool IsDead = RegOp.isDead();
2745 bool IsUndef = RegOp.isUndef();
2746 bool IsDebug = RegOp.isDebug();
2747
2748 if (NonRegOp.isImm())
2749 RegOp.ChangeToImmediate(ImmVal: NonRegOp.getImm());
2750 else if (NonRegOp.isFI())
2751 RegOp.ChangeToFrameIndex(Idx: NonRegOp.getIndex());
2752 else if (NonRegOp.isGlobal()) {
2753 RegOp.ChangeToGA(GV: NonRegOp.getGlobal(), Offset: NonRegOp.getOffset(),
2754 TargetFlags: NonRegOp.getTargetFlags());
2755 } else
2756 return nullptr;
2757
2758 // Make sure we don't reinterpret a subreg index in the target flags.
2759 RegOp.setTargetFlags(NonRegOp.getTargetFlags());
2760
2761 NonRegOp.ChangeToRegister(Reg, isDef: false, isImp: false, isKill: IsKill, isDead: IsDead, isUndef: IsUndef, isDebug: IsDebug);
2762 NonRegOp.setSubReg(SubReg);
2763
2764 return &MI;
2765}
2766
2767static MachineInstr *swapImmOperands(MachineInstr &MI,
2768 MachineOperand &NonRegOp1,
2769 MachineOperand &NonRegOp2) {
2770 unsigned TargetFlags = NonRegOp1.getTargetFlags();
2771 int64_t NonRegVal = NonRegOp1.getImm();
2772
2773 NonRegOp1.setImm(NonRegOp2.getImm());
2774 NonRegOp2.setImm(NonRegVal);
2775 NonRegOp1.setTargetFlags(NonRegOp2.getTargetFlags());
2776 NonRegOp2.setTargetFlags(TargetFlags);
2777 return &MI;
2778}
2779
2780bool SIInstrInfo::isLegalToSwap(const MachineInstr &MI, unsigned OpIdx0,
2781 const MachineOperand *MO0, unsigned OpIdx1,
2782 const MachineOperand *MO1) const {
2783 const MCInstrDesc &InstDesc = MI.getDesc();
2784 const MCOperandInfo &OpInfo0 = InstDesc.operands()[OpIdx0];
2785 const MCOperandInfo &OpInfo1 = InstDesc.operands()[OpIdx1];
2786 const TargetRegisterClass *DefinedRC1 =
2787 OpInfo1.RegClass != -1 ? RI.getRegClass(RCID: OpInfo1.RegClass) : nullptr;
2788 const TargetRegisterClass *DefinedRC0 =
2789 OpInfo0.RegClass != -1 ? RI.getRegClass(RCID: OpInfo0.RegClass) : nullptr;
2790
2791 unsigned Opc = MI.getOpcode();
2792 int Src0Idx = AMDGPU::getNamedOperandIdx(Opcode: Opc, Name: AMDGPU::OpName::src0);
2793
2794 // Make sure the swap doesn't breach the constant bus or literal limits.
2795 // It may move a literal to a position other than src0, which is not allowed
2796 // pre-gfx10. However, most test cases need literals in src0 for VOP.
2797 // FIXME: After gfx9, a literal can be placed somewhere other than src0.
2798 if (isVALU(MI)) {
2799 if ((int)OpIdx0 == Src0Idx && !MO0->isReg() &&
2800 !isInlineConstant(MO: *MO0, OpInfo: OpInfo1))
2801 return false;
2802 if ((int)OpIdx1 == Src0Idx && !MO1->isReg() &&
2803 !isInlineConstant(MO: *MO1, OpInfo: OpInfo0))
2804 return false;
2805 }
2806
2807 if ((int)OpIdx1 != Src0Idx && MO0->isReg()) {
2808 if (!DefinedRC1)
2809 return OpInfo1.OperandType == MCOI::OPERAND_UNKNOWN;
2810 return isLegalRegOperand(MI, OpIdx: OpIdx1, MO: *MO0);
2811 }
2812 if ((int)OpIdx0 != Src0Idx && MO1->isReg()) {
2813 if (!DefinedRC0)
2814 return OpInfo0.OperandType == MCOI::OPERAND_UNKNOWN;
2815 return isLegalRegOperand(MI, OpIdx: OpIdx0, MO: *MO1);
2816 }
2817
2818 // No need to check 64-bit literals, since swapping does not bring new
2819 // 64-bit literals into the current instruction to fold to 32 bits.
2820
2821 return isImmOperandLegal(MI, OpNo: OpIdx1, MO: *MO0);
2822}
2823
2824MachineInstr *SIInstrInfo::commuteInstructionImpl(MachineInstr &MI, bool NewMI,
2825 unsigned Src0Idx,
2826 unsigned Src1Idx) const {
2827 assert(!NewMI && "this should never be used");
2828
2829 unsigned Opc = MI.getOpcode();
2830 int CommutedOpcode = commuteOpcode(Opcode: Opc);
2831 if (CommutedOpcode == -1)
2832 return nullptr;
2833
2834 if (Src0Idx > Src1Idx)
2835 std::swap(a&: Src0Idx, b&: Src1Idx);
2836
2837 assert(AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0) ==
2838 static_cast<int>(Src0Idx) &&
2839 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1) ==
2840 static_cast<int>(Src1Idx) &&
2841 "inconsistency with findCommutedOpIndices");
2842
2843 MachineOperand &Src0 = MI.getOperand(i: Src0Idx);
2844 MachineOperand &Src1 = MI.getOperand(i: Src1Idx);
2845 if (!isLegalToSwap(MI, OpIdx0: Src0Idx, MO0: &Src0, OpIdx1: Src1Idx, MO1: &Src1)) {
2846 return nullptr;
2847 }
2848 MachineInstr *CommutedMI = nullptr;
2849 if (Src0.isReg() && Src1.isReg()) {
2850 // Be sure to copy the source modifiers to the right place.
2851 CommutedMI =
2852 TargetInstrInfo::commuteInstructionImpl(MI, NewMI, OpIdx1: Src0Idx, OpIdx2: Src1Idx);
2853 } else if (Src0.isReg() && !Src1.isReg()) {
2854 CommutedMI = swapRegAndNonRegOperand(MI, RegOp&: Src0, NonRegOp&: Src1);
2855 } else if (!Src0.isReg() && Src1.isReg()) {
2856 CommutedMI = swapRegAndNonRegOperand(MI, RegOp&: Src1, NonRegOp&: Src0);
2857 } else if (Src0.isImm() && Src1.isImm()) {
2858 CommutedMI = swapImmOperands(MI, NonRegOp1&: Src0, NonRegOp2&: Src1);
2859 } else {
2860 // FIXME: Found two non-register operands to commute. This does happen.
2861 return nullptr;
2862 }
2863
2864 if (CommutedMI) {
2865 swapSourceModifiers(MI, Src0, Src0OpName: AMDGPU::OpName::src0_modifiers,
2866 Src1, Src1OpName: AMDGPU::OpName::src1_modifiers);
2867
2868 swapSourceModifiers(MI, Src0, Src0OpName: AMDGPU::OpName::src0_sel, Src1,
2869 Src1OpName: AMDGPU::OpName::src1_sel);
2870
2871 CommutedMI->setDesc(get(Opcode: CommutedOpcode));
2872 }
2873
2874 return CommutedMI;
2875}
2876
2877// This needs to be implemented because the source modifiers may be inserted
2878// between the true commutable operands, and the base
2879// TargetInstrInfo::commuteInstruction uses it.
2880bool SIInstrInfo::findCommutedOpIndices(const MachineInstr &MI,
2881 unsigned &SrcOpIdx0,
2882 unsigned &SrcOpIdx1) const {
2883 return findCommutedOpIndices(Desc: MI.getDesc(), SrcOpIdx0, SrcOpIdx1);
2884}
2885
2886bool SIInstrInfo::findCommutedOpIndices(const MCInstrDesc &Desc,
2887 unsigned &SrcOpIdx0,
2888 unsigned &SrcOpIdx1) const {
2889 if (!Desc.isCommutable())
2890 return false;
2891
2892 unsigned Opc = Desc.getOpcode();
2893 int Src0Idx = AMDGPU::getNamedOperandIdx(Opcode: Opc, Name: AMDGPU::OpName::src0);
2894 if (Src0Idx == -1)
2895 return false;
2896
2897 int Src1Idx = AMDGPU::getNamedOperandIdx(Opcode: Opc, Name: AMDGPU::OpName::src1);
2898 if (Src1Idx == -1)
2899 return false;
2900
2901 return fixCommutedOpIndices(ResultIdx1&: SrcOpIdx0, ResultIdx2&: SrcOpIdx1, CommutableOpIdx1: Src0Idx, CommutableOpIdx2: Src1Idx);
2902}
2903
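// With the hardware's 16-bit branch immediate, the reachable range is roughly
// +/-2^15 dwords (about +/-128 KiB) measured from the instruction that
// follows the branch.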
2904bool SIInstrInfo::isBranchOffsetInRange(unsigned BranchOp,
2905 int64_t BrOffset) const {
2906 // BranchRelaxation should never have to check s_setpc_b64 because its dest
2907 // block is unanalyzable.
2908 assert(BranchOp != AMDGPU::S_SETPC_B64);
2909
2910 // Convert to dwords.
2911 BrOffset /= 4;
2912
2913 // The branch instructions do PC += signext(SIMM16 * 4) + 4, so the offset is
2914 // from the next instruction.
2915 BrOffset -= 1;
2916
2917 return isIntN(N: BranchOffsetBits, x: BrOffset);
2918}
2919
2920MachineBasicBlock *
2921SIInstrInfo::getBranchDestBlock(const MachineInstr &MI) const {
2922 return MI.getOperand(i: 0).getMBB();
2923}
2924
2925bool SIInstrInfo::hasDivergentBranch(const MachineBasicBlock *MBB) const {
2926 for (const MachineInstr &MI : MBB->terminators()) {
2927 if (MI.getOpcode() == AMDGPU::SI_IF || MI.getOpcode() == AMDGPU::SI_ELSE ||
2928 MI.getOpcode() == AMDGPU::SI_LOOP)
2929 return true;
2930 }
2931 return false;
2932}
2933
2934void SIInstrInfo::insertIndirectBranch(MachineBasicBlock &MBB,
2935 MachineBasicBlock &DestBB,
2936 MachineBasicBlock &RestoreBB,
2937 const DebugLoc &DL, int64_t BrOffset,
2938 RegScavenger *RS) const {
2939 assert(RS && "RegScavenger required for long branching");
2940 assert(MBB.empty() &&
2941 "new block should be inserted for expanding unconditional branch");
2942 assert(MBB.pred_size() == 1);
2943 assert(RestoreBB.empty() &&
2944 "restore block should be inserted for restoring clobbered registers");
2945
2946 MachineFunction *MF = MBB.getParent();
2947 MachineRegisterInfo &MRI = MF->getRegInfo();
2948 const SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
2949
2950 // FIXME: Virtual register workaround for RegScavenger not working with empty
2951 // blocks.
2952 Register PCReg = MRI.createVirtualRegister(RegClass: &AMDGPU::SReg_64RegClass);
2953
2954 auto I = MBB.end();
2955
2956 // Note: since this runs after the hazard recognizer, some hazard workarounds
2957 // must be applied directly here.
2958 const bool FlushSGPRWrites = (ST.isWave64() && ST.hasVALUMaskWriteHazard()) ||
2959 ST.hasVALUReadSGPRHazard();
2960 auto ApplyHazardWorkarounds = [this, &MBB, &I, &DL, FlushSGPRWrites]() {
2961 if (FlushSGPRWrites)
2962 BuildMI(BB&: MBB, I, MIMD: DL, MCID: get(Opcode: AMDGPU::S_WAITCNT_DEPCTR))
2963 .addImm(Val: AMDGPU::DepCtr::encodeFieldSaSdst(SaSdst: 0));
2964 };
2965
2966 // We need to compute the offset relative to the instruction immediately after
2967 // s_getpc_b64. Insert the PC arithmetic before the last terminator.
2968 MachineInstr *GetPC = BuildMI(BB&: MBB, I, MIMD: DL, MCID: get(Opcode: AMDGPU::S_GETPC_B64), DestReg: PCReg);
2969 ApplyHazardWorkarounds();
2970
2971 auto &MCCtx = MF->getContext();
2972 MCSymbol *PostGetPCLabel =
2973 MCCtx.createTempSymbol(Name: "post_getpc", /*AlwaysAddSuffix=*/true);
2974 GetPC->setPostInstrSymbol(MF&: *MF, Symbol: PostGetPCLabel);
2975
2976 MCSymbol *OffsetLo =
2977 MCCtx.createTempSymbol(Name: "offset_lo", /*AlwaysAddSuffix=*/true);
2978 MCSymbol *OffsetHi =
2979 MCCtx.createTempSymbol(Name: "offset_hi", /*AlwaysAddSuffix=*/true);
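// Materialize PCReg = (PC after s_getpc_b64) + (DestLabel - PostGetPCLabel) as a
// 64-bit add-with-carry over the lo/hi halves of the offset.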
2980 BuildMI(BB&: MBB, I, MIMD: DL, MCID: get(Opcode: AMDGPU::S_ADD_U32))
2981 .addReg(RegNo: PCReg, flags: RegState::Define, SubReg: AMDGPU::sub0)
2982 .addReg(RegNo: PCReg, flags: 0, SubReg: AMDGPU::sub0)
2983 .addSym(Sym: OffsetLo, TargetFlags: MO_FAR_BRANCH_OFFSET);
2984 BuildMI(BB&: MBB, I, MIMD: DL, MCID: get(Opcode: AMDGPU::S_ADDC_U32))
2985 .addReg(RegNo: PCReg, flags: RegState::Define, SubReg: AMDGPU::sub1)
2986 .addReg(RegNo: PCReg, flags: 0, SubReg: AMDGPU::sub1)
2987 .addSym(Sym: OffsetHi, TargetFlags: MO_FAR_BRANCH_OFFSET);
2988 ApplyHazardWorkarounds();
2989
2990 // Insert the indirect branch after the other terminator.
2991 BuildMI(BB: &MBB, MIMD: DL, MCID: get(Opcode: AMDGPU::S_SETPC_B64))
2992 .addReg(RegNo: PCReg);
2993
2994 // If a spill is needed for the pc register pair, we need to insert a spill
2995 // restore block right before the destination block, and insert a short branch
2996 // into the old destination block's fallthrough predecessor.
2997 // e.g.:
2998 //
2999 // s_cbranch_scc0 skip_long_branch:
3000 //
3001 // long_branch_bb:
3002 // spill s[8:9]
3003 // s_getpc_b64 s[8:9]
3004 // s_add_u32 s8, s8, restore_bb
3005 // s_addc_u32 s9, s9, 0
3006 // s_setpc_b64 s[8:9]
3007 //
3008 // skip_long_branch:
3009 // foo;
3010 //
3011 // .....
3012 //
3013 // dest_bb_fallthrough_predecessor:
3014 // bar;
3015 // s_branch dest_bb
3016 //
3017 // restore_bb:
3018 // restore s[8:9]
3019 // fallthrough dest_bb
3020 //
3021 // dest_bb:
3022 // buzz;
3023
3024 Register LongBranchReservedReg = MFI->getLongBranchReservedReg();
3025 Register Scav;
3026
3027 // If we've previously reserved a register for long branches, avoid running
3028 // the scavenger and just use that register.
3029 if (LongBranchReservedReg) {
3030 RS->enterBasicBlock(MBB);
3031 Scav = LongBranchReservedReg;
3032 } else {
3033 RS->enterBasicBlockEnd(MBB);
3034 Scav = RS->scavengeRegisterBackwards(
3035 RC: AMDGPU::SReg_64RegClass, To: MachineBasicBlock::iterator(GetPC),
3036 /* RestoreAfter */ false, SPAdj: 0, /* AllowSpill */ false);
3037 }
3038 if (Scav) {
3039 RS->setRegUsed(Reg: Scav);
3040 MRI.replaceRegWith(FromReg: PCReg, ToReg: Scav);
3041 MRI.clearVirtRegs();
3042 } else {
3043 // Since spilling an SGPR requires a VGPR, reuse the temporary VGPR's spill
3044 // slot for the SGPR spill.
3045 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
3046 const SIRegisterInfo *TRI = ST.getRegisterInfo();
3047 TRI->spillEmergencySGPR(MI: GetPC, RestoreMBB&: RestoreBB, SGPR: AMDGPU::SGPR0_SGPR1, RS);
3048 MRI.replaceRegWith(FromReg: PCReg, ToReg: AMDGPU::SGPR0_SGPR1);
3049 MRI.clearVirtRegs();
3050 }
3051
3052 MCSymbol *DestLabel = Scav ? DestBB.getSymbol() : RestoreBB.getSymbol();
3053 // Now that the destination label is known, the offset expressions can be defined.
3054 auto *Offset = MCBinaryExpr::createSub(
3055 LHS: MCSymbolRefExpr::create(Symbol: DestLabel, Ctx&: MCCtx),
3056 RHS: MCSymbolRefExpr::create(Symbol: PostGetPCLabel, Ctx&: MCCtx), Ctx&: MCCtx);
3057 // Add offset assignments.
3058 auto *Mask = MCConstantExpr::create(Value: 0xFFFFFFFFULL, Ctx&: MCCtx);
3059 OffsetLo->setVariableValue(MCBinaryExpr::createAnd(LHS: Offset, RHS: Mask, Ctx&: MCCtx));
3060 auto *ShAmt = MCConstantExpr::create(Value: 32, Ctx&: MCCtx);
3061 OffsetHi->setVariableValue(MCBinaryExpr::createAShr(LHS: Offset, RHS: ShAmt, Ctx&: MCCtx));
3062}
3063
3064unsigned SIInstrInfo::getBranchOpcode(SIInstrInfo::BranchPredicate Cond) {
3065 switch (Cond) {
3066 case SIInstrInfo::SCC_TRUE:
3067 return AMDGPU::S_CBRANCH_SCC1;
3068 case SIInstrInfo::SCC_FALSE:
3069 return AMDGPU::S_CBRANCH_SCC0;
3070 case SIInstrInfo::VCCNZ:
3071 return AMDGPU::S_CBRANCH_VCCNZ;
3072 case SIInstrInfo::VCCZ:
3073 return AMDGPU::S_CBRANCH_VCCZ;
3074 case SIInstrInfo::EXECNZ:
3075 return AMDGPU::S_CBRANCH_EXECNZ;
3076 case SIInstrInfo::EXECZ:
3077 return AMDGPU::S_CBRANCH_EXECZ;
3078 default:
3079 llvm_unreachable("invalid branch predicate");
3080 }
3081}
3082
3083SIInstrInfo::BranchPredicate SIInstrInfo::getBranchPredicate(unsigned Opcode) {
3084 switch (Opcode) {
3085 case AMDGPU::S_CBRANCH_SCC0:
3086 return SCC_FALSE;
3087 case AMDGPU::S_CBRANCH_SCC1:
3088 return SCC_TRUE;
3089 case AMDGPU::S_CBRANCH_VCCNZ:
3090 return VCCNZ;
3091 case AMDGPU::S_CBRANCH_VCCZ:
3092 return VCCZ;
3093 case AMDGPU::S_CBRANCH_EXECNZ:
3094 return EXECNZ;
3095 case AMDGPU::S_CBRANCH_EXECZ:
3096 return EXECZ;
3097 default:
3098 return INVALID_BR;
3099 }
3100}
3101
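// Analyze a terminator sequence made of an optional conditional branch followed by
// an optional unconditional branch. Returns false on success with TBB/FBB/Cond
// filled in per the usual analyzeBranch contract, true if it cannot be analyzed.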
3102bool SIInstrInfo::analyzeBranchImpl(MachineBasicBlock &MBB,
3103 MachineBasicBlock::iterator I,
3104 MachineBasicBlock *&TBB,
3105 MachineBasicBlock *&FBB,
3106 SmallVectorImpl<MachineOperand> &Cond,
3107 bool AllowModify) const {
3108 if (I->getOpcode() == AMDGPU::S_BRANCH) {
3109 // Unconditional Branch
3110 TBB = I->getOperand(i: 0).getMBB();
3111 return false;
3112 }
3113
3114 BranchPredicate Pred = getBranchPredicate(Opcode: I->getOpcode());
3115 if (Pred == INVALID_BR)
3116 return true;
3117
3118 MachineBasicBlock *CondBB = I->getOperand(i: 0).getMBB();
3119 Cond.push_back(Elt: MachineOperand::CreateImm(Val: Pred));
3120 Cond.push_back(Elt: I->getOperand(i: 1)); // Save the branch register.
3121
3122 ++I;
3123
3124 if (I == MBB.end()) {
3125 // Conditional branch followed by fall-through.
3126 TBB = CondBB;
3127 return false;
3128 }
3129
3130 if (I->getOpcode() == AMDGPU::S_BRANCH) {
3131 TBB = CondBB;
3132 FBB = I->getOperand(i: 0).getMBB();
3133 return false;
3134 }
3135
3136 return true;
3137}
3138
3139bool SIInstrInfo::analyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB,
3140 MachineBasicBlock *&FBB,
3141 SmallVectorImpl<MachineOperand> &Cond,
3142 bool AllowModify) const {
3143 MachineBasicBlock::iterator I = MBB.getFirstTerminator();
3144 auto E = MBB.end();
3145 if (I == E)
3146 return false;
3147
3148 // Skip over instructions that are artificial terminators for special
3149 // exec management.
3150 while (I != E && !I->isBranch() && !I->isReturn()) {
3151 switch (I->getOpcode()) {
3152 case AMDGPU::S_MOV_B64_term:
3153 case AMDGPU::S_XOR_B64_term:
3154 case AMDGPU::S_OR_B64_term:
3155 case AMDGPU::S_ANDN2_B64_term:
3156 case AMDGPU::S_AND_B64_term:
3157 case AMDGPU::S_AND_SAVEEXEC_B64_term:
3158 case AMDGPU::S_MOV_B32_term:
3159 case AMDGPU::S_XOR_B32_term:
3160 case AMDGPU::S_OR_B32_term:
3161 case AMDGPU::S_ANDN2_B32_term:
3162 case AMDGPU::S_AND_B32_term:
3163 case AMDGPU::S_AND_SAVEEXEC_B32_term:
3164 break;
3165 case AMDGPU::SI_IF:
3166 case AMDGPU::SI_ELSE:
3167 case AMDGPU::SI_KILL_I1_TERMINATOR:
3168 case AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR:
3169 // FIXME: It's messy that these need to be considered here at all.
3170 return true;
3171 default:
3172 llvm_unreachable("unexpected non-branch terminator inst");
3173 }
3174
3175 ++I;
3176 }
3177
3178 if (I == E)
3179 return false;
3180
3181 return analyzeBranchImpl(MBB, I, TBB, FBB, Cond, AllowModify);
3182}
3183
3184unsigned SIInstrInfo::removeBranch(MachineBasicBlock &MBB,
3185 int *BytesRemoved) const {
3186 unsigned Count = 0;
3187 unsigned RemovedSize = 0;
3188 for (MachineInstr &MI : llvm::make_early_inc_range(Range: MBB.terminators())) {
3189 // Skip over artificial terminators when removing instructions.
3190 if (MI.isBranch() || MI.isReturn()) {
3191 RemovedSize += getInstSizeInBytes(MI);
3192 MI.eraseFromParent();
3193 ++Count;
3194 }
3195 }
3196
3197 if (BytesRemoved)
3198 *BytesRemoved = RemovedSize;
3199
3200 return Count;
3201}
3202
3203// Copy the flags onto the implicit condition register operand.
3204static void preserveCondRegFlags(MachineOperand &CondReg,
3205 const MachineOperand &OrigCond) {
3206 CondReg.setIsUndef(OrigCond.isUndef());
3207 CondReg.setIsKill(OrigCond.isKill());
3208}
3209
3210unsigned SIInstrInfo::insertBranch(MachineBasicBlock &MBB,
3211 MachineBasicBlock *TBB,
3212 MachineBasicBlock *FBB,
3213 ArrayRef<MachineOperand> Cond,
3214 const DebugLoc &DL,
3215 int *BytesAdded) const {
3216 if (!FBB && Cond.empty()) {
3217 BuildMI(BB: &MBB, MIMD: DL, MCID: get(Opcode: AMDGPU::S_BRANCH))
3218 .addMBB(MBB: TBB);
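// An s_branch is 4 bytes; subtargets with the 0x3f branch-offset hardware bug may
// need a trailing s_nop workaround, hence the 8-byte estimate.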
3219 if (BytesAdded)
3220 *BytesAdded = ST.hasOffset3fBug() ? 8 : 4;
3221 return 1;
3222 }
3223
3224 assert(TBB && Cond[0].isImm());
3225
3226 unsigned Opcode
3227 = getBranchOpcode(Cond: static_cast<BranchPredicate>(Cond[0].getImm()));
3228
3229 if (!FBB) {
3230 MachineInstr *CondBr =
3231 BuildMI(BB: &MBB, MIMD: DL, MCID: get(Opcode))
3232 .addMBB(MBB: TBB);
3233
3234 // Copy the flags onto the implicit condition register operand.
3235 preserveCondRegFlags(CondReg&: CondBr->getOperand(i: 1), OrigCond: Cond[1]);
3236 fixImplicitOperands(MI&: *CondBr);
3237
3238 if (BytesAdded)
3239 *BytesAdded = ST.hasOffset3fBug() ? 8 : 4;
3240 return 1;
3241 }
3242
3243 assert(TBB && FBB);
3244
3245 MachineInstr *CondBr =
3246 BuildMI(BB: &MBB, MIMD: DL, MCID: get(Opcode))
3247 .addMBB(MBB: TBB);
3248 fixImplicitOperands(MI&: *CondBr);
3249 BuildMI(BB: &MBB, MIMD: DL, MCID: get(Opcode: AMDGPU::S_BRANCH))
3250 .addMBB(MBB: FBB);
3251
3252 MachineOperand &CondReg = CondBr->getOperand(i: 1);
3253 CondReg.setIsUndef(Cond[1].isUndef());
3254 CondReg.setIsKill(Cond[1].isKill());
3255
3256 if (BytesAdded)
3257 *BytesAdded = ST.hasOffset3fBug() ? 16 : 8;
3258
3259 return 2;
3260}
3261
3262bool SIInstrInfo::reverseBranchCondition(
3263 SmallVectorImpl<MachineOperand> &Cond) const {
3264 if (Cond.size() != 2) {
3265 return true;
3266 }
3267
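// BranchPredicate values are arranged in +/- pairs (e.g. SCC_TRUE/SCC_FALSE), so
// negating the immediate inverts the predicate.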
3268 if (Cond[0].isImm()) {
3269 Cond[0].setImm(-Cond[0].getImm());
3270 return false;
3271 }
3272
3273 return true;
3274}
3275
3276bool SIInstrInfo::canInsertSelect(const MachineBasicBlock &MBB,
3277 ArrayRef<MachineOperand> Cond,
3278 Register DstReg, Register TrueReg,
3279 Register FalseReg, int &CondCycles,
3280 int &TrueCycles, int &FalseCycles) const {
3281 switch (Cond[0].getImm()) {
3282 case VCCNZ:
3283 case VCCZ: {
3284 const MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
3285 const TargetRegisterClass *RC = MRI.getRegClass(Reg: TrueReg);
3286 if (MRI.getRegClass(Reg: FalseReg) != RC)
3287 return false;
3288
3289 int NumInsts = AMDGPU::getRegBitWidth(RC: *RC) / 32;
3290 CondCycles = TrueCycles = FalseCycles = NumInsts; // ???
3291
3292 // Limit to equal cost for branch vs. N v_cndmask_b32s.
3293 return RI.hasVGPRs(RC) && NumInsts <= 6;
3294 }
3295 case SCC_TRUE:
3296 case SCC_FALSE: {
3297 // FIXME: We could insert for VGPRs if we could replace the original compare
3298 // with a vector one.
3299 const MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
3300 const TargetRegisterClass *RC = MRI.getRegClass(Reg: TrueReg);
3301 if (MRI.getRegClass(Reg: FalseReg) != RC)
3302 return false;
3303
3304 int NumInsts = AMDGPU::getRegBitWidth(RC: *RC) / 32;
3305
3306 // Sizes that are a multiple of 64 bits (8 bytes) can use s_cselect_b64.
3307 if (NumInsts % 2 == 0)
3308 NumInsts /= 2;
3309
3310 CondCycles = TrueCycles = FalseCycles = NumInsts; // ???
3311 return RI.isSGPRClass(RC);
3312 }
3313 default:
3314 return false;
3315 }
3316}
3317
3318void SIInstrInfo::insertSelect(MachineBasicBlock &MBB,
3319 MachineBasicBlock::iterator I, const DebugLoc &DL,
3320 Register DstReg, ArrayRef<MachineOperand> Cond,
3321 Register TrueReg, Register FalseReg) const {
3322 BranchPredicate Pred = static_cast<BranchPredicate>(Cond[0].getImm());
3323 if (Pred == VCCZ || Pred == SCC_FALSE) {
3324 Pred = static_cast<BranchPredicate>(-Pred);
3325 std::swap(a&: TrueReg, b&: FalseReg);
3326 }
3327
3328 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
3329 const TargetRegisterClass *DstRC = MRI.getRegClass(Reg: DstReg);
3330 unsigned DstSize = RI.getRegSizeInBits(RC: *DstRC);
3331
3332 if (DstSize == 32) {
3333 MachineInstr *Select;
3334 if (Pred == SCC_TRUE) {
3335 Select = BuildMI(BB&: MBB, I, MIMD: DL, MCID: get(Opcode: AMDGPU::S_CSELECT_B32), DestReg: DstReg)
3336 .addReg(RegNo: TrueReg)
3337 .addReg(RegNo: FalseReg);
3338 } else {
3339 // V_CNDMASK_B32's operands are backwards from what is expected (false value first).
3340 Select = BuildMI(BB&: MBB, I, MIMD: DL, MCID: get(Opcode: AMDGPU::V_CNDMASK_B32_e32), DestReg: DstReg)
3341 .addReg(RegNo: FalseReg)
3342 .addReg(RegNo: TrueReg);
3343 }
3344
3345 preserveCondRegFlags(CondReg&: Select->getOperand(i: 3), OrigCond: Cond[1]);
3346 return;
3347 }
3348
3349 if (DstSize == 64 && Pred == SCC_TRUE) {
3350 MachineInstr *Select =
3351 BuildMI(BB&: MBB, I, MIMD: DL, MCID: get(Opcode: AMDGPU::S_CSELECT_B64), DestReg: DstReg)
3352 .addReg(RegNo: TrueReg)
3353 .addReg(RegNo: FalseReg);
3354
3355 preserveCondRegFlags(CondReg&: Select->getOperand(i: 3), OrigCond: Cond[1]);
3356 return;
3357 }
3358
3359 static const int16_t Sub0_15[] = {
3360 AMDGPU::sub0, AMDGPU::sub1, AMDGPU::sub2, AMDGPU::sub3,
3361 AMDGPU::sub4, AMDGPU::sub5, AMDGPU::sub6, AMDGPU::sub7,
3362 AMDGPU::sub8, AMDGPU::sub9, AMDGPU::sub10, AMDGPU::sub11,
3363 AMDGPU::sub12, AMDGPU::sub13, AMDGPU::sub14, AMDGPU::sub15,
3364 };
3365
3366 static const int16_t Sub0_15_64[] = {
3367 AMDGPU::sub0_sub1, AMDGPU::sub2_sub3,
3368 AMDGPU::sub4_sub5, AMDGPU::sub6_sub7,
3369 AMDGPU::sub8_sub9, AMDGPU::sub10_sub11,
3370 AMDGPU::sub12_sub13, AMDGPU::sub14_sub15,
3371 };
3372
3373 unsigned SelOp = AMDGPU::V_CNDMASK_B32_e32;
3374 const TargetRegisterClass *EltRC = &AMDGPU::VGPR_32RegClass;
3375 const int16_t *SubIndices = Sub0_15;
3376 int NElts = DstSize / 32;
3377
3378 // 64-bit select is only available for SALU.
3379 // TODO: Split 96-bit into 64-bit and 32-bit, not 3x 32-bit.
3380 if (Pred == SCC_TRUE) {
3381 if (NElts % 2) {
3382 SelOp = AMDGPU::S_CSELECT_B32;
3383 EltRC = &AMDGPU::SGPR_32RegClass;
3384 } else {
3385 SelOp = AMDGPU::S_CSELECT_B64;
3386 EltRC = &AMDGPU::SGPR_64RegClass;
3387 SubIndices = Sub0_15_64;
3388 NElts /= 2;
3389 }
3390 }
3391
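// Emit one select per element and reassemble the pieces with a REG_SEQUENCE.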
3392 MachineInstrBuilder MIB = BuildMI(
3393 BB&: MBB, I, MIMD: DL, MCID: get(Opcode: AMDGPU::REG_SEQUENCE), DestReg: DstReg);
3394
3395 I = MIB->getIterator();
3396
3397 SmallVector<Register, 8> Regs;
3398 for (int Idx = 0; Idx != NElts; ++Idx) {
3399 Register DstElt = MRI.createVirtualRegister(RegClass: EltRC);
3400 Regs.push_back(Elt: DstElt);
3401
3402 unsigned SubIdx = SubIndices[Idx];
3403
3404 MachineInstr *Select;
3405 if (SelOp == AMDGPU::V_CNDMASK_B32_e32) {
3406 Select =
3407 BuildMI(BB&: MBB, I, MIMD: DL, MCID: get(Opcode: SelOp), DestReg: DstElt)
3408 .addReg(RegNo: FalseReg, flags: 0, SubReg: SubIdx)
3409 .addReg(RegNo: TrueReg, flags: 0, SubReg: SubIdx);
3410 } else {
3411 Select =
3412 BuildMI(BB&: MBB, I, MIMD: DL, MCID: get(Opcode: SelOp), DestReg: DstElt)
3413 .addReg(RegNo: TrueReg, flags: 0, SubReg: SubIdx)
3414 .addReg(RegNo: FalseReg, flags: 0, SubReg: SubIdx);
3415 }
3416
3417 preserveCondRegFlags(CondReg&: Select->getOperand(i: 3), OrigCond: Cond[1]);
3418 fixImplicitOperands(MI&: *Select);
3419
3420 MIB.addReg(RegNo: DstElt)
3421 .addImm(Val: SubIdx);
3422 }
3423}
3424
3425bool SIInstrInfo::isFoldableCopy(const MachineInstr &MI) {
3426 switch (MI.getOpcode()) {
3427 case AMDGPU::V_MOV_B16_t16_e32:
3428 case AMDGPU::V_MOV_B16_t16_e64:
3429 case AMDGPU::V_MOV_B32_e32:
3430 case AMDGPU::V_MOV_B32_e64:
3431 case AMDGPU::V_MOV_B64_PSEUDO:
3432 case AMDGPU::V_MOV_B64_e32:
3433 case AMDGPU::V_MOV_B64_e64:
3434 case AMDGPU::S_MOV_B32:
3435 case AMDGPU::S_MOV_B64:
3436 case AMDGPU::S_MOV_B64_IMM_PSEUDO:
3437 case AMDGPU::COPY:
3438 case AMDGPU::WWM_COPY:
3439 case AMDGPU::V_ACCVGPR_WRITE_B32_e64:
3440 case AMDGPU::V_ACCVGPR_READ_B32_e64:
3441 case AMDGPU::V_ACCVGPR_MOV_B32:
3442 case AMDGPU::AV_MOV_B32_IMM_PSEUDO:
3443 return true;
3444 default:
3445 return false;
3446 }
3447}
3448
3449static constexpr AMDGPU::OpName ModifierOpNames[] = {
3450 AMDGPU::OpName::src0_modifiers, AMDGPU::OpName::src1_modifiers,
3451 AMDGPU::OpName::src2_modifiers, AMDGPU::OpName::clamp,
3452 AMDGPU::OpName::omod, AMDGPU::OpName::op_sel};
3453
3454void SIInstrInfo::removeModOperands(MachineInstr &MI) const {
3455 unsigned Opc = MI.getOpcode();
3456 for (AMDGPU::OpName Name : reverse(C: ModifierOpNames)) {
3457 int Idx = AMDGPU::getNamedOperandIdx(Opcode: Opc, Name);
3458 if (Idx >= 0)
3459 MI.removeOperand(OpNo: Idx);
3460 }
3461}
3462
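// Extract the portion of \p Imm selected by subregister index \p SubRegIndex,
// e.g. sub1 yields the high 32 bits of a 64-bit immediate. Returns std::nullopt
// for unhandled subregister indexes.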
3463std::optional<int64_t> SIInstrInfo::extractSubregFromImm(int64_t Imm,
3464 unsigned SubRegIndex) {
3465 switch (SubRegIndex) {
3466 case AMDGPU::NoSubRegister:
3467 return Imm;
3468 case AMDGPU::sub0:
3469 return Lo_32(Value: Imm);
3470 case AMDGPU::sub1:
3471 return Hi_32(Value: Imm);
3472 case AMDGPU::lo16:
3473 return SignExtend64<16>(x: Imm);
3474 case AMDGPU::hi16:
3475 return SignExtend64<16>(x: Imm >> 16);
3476 case AMDGPU::sub1_lo16:
3477 return SignExtend64<16>(x: Imm >> 32);
3478 case AMDGPU::sub1_hi16:
3479 return SignExtend64<16>(x: Imm >> 48);
3480 default:
3481 return std::nullopt;
3482 }
3483
3484 llvm_unreachable("covered subregister switch");
3485}
3486
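// Map a MAC/MAD/FMA opcode to the corresponding *AK form that takes the addend as
// a trailing literal constant, e.g. v_fmaak_f32 v0, v1, v2, K  ; v0 = v1 * v2 + K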
3487static unsigned getNewFMAAKInst(const GCNSubtarget &ST, unsigned Opc) {
3488 switch (Opc) {
3489 case AMDGPU::V_MAC_F16_e32:
3490 case AMDGPU::V_MAC_F16_e64:
3491 case AMDGPU::V_MAD_F16_e64:
3492 return AMDGPU::V_MADAK_F16;
3493 case AMDGPU::V_MAC_F32_e32:
3494 case AMDGPU::V_MAC_F32_e64:
3495 case AMDGPU::V_MAD_F32_e64:
3496 return AMDGPU::V_MADAK_F32;
3497 case AMDGPU::V_FMAC_F32_e32:
3498 case AMDGPU::V_FMAC_F32_e64:
3499 case AMDGPU::V_FMA_F32_e64:
3500 return AMDGPU::V_FMAAK_F32;
3501 case AMDGPU::V_FMAC_F16_e32:
3502 case AMDGPU::V_FMAC_F16_e64:
3503 case AMDGPU::V_FMAC_F16_t16_e64:
3504 case AMDGPU::V_FMAC_F16_fake16_e64:
3505 case AMDGPU::V_FMA_F16_e64:
3506 return ST.hasTrue16BitInsts() ? ST.useRealTrue16Insts()
3507 ? AMDGPU::V_FMAAK_F16_t16
3508 : AMDGPU::V_FMAAK_F16_fake16
3509 : AMDGPU::V_FMAAK_F16;
3510 default:
3511 llvm_unreachable("invalid instruction");
3512 }
3513}
3514
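// Map a MAC/MAD/FMA opcode to the corresponding *MK form that takes one
// multiplicand as a literal constant, e.g. v_fmamk_f32 v0, v1, K, v2  ; v0 = v1 * K + v2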
3515static unsigned getNewFMAMKInst(const GCNSubtarget &ST, unsigned Opc) {
3516 switch (Opc) {
3517 case AMDGPU::V_MAC_F16_e32:
3518 case AMDGPU::V_MAC_F16_e64:
3519 case AMDGPU::V_MAD_F16_e64:
3520 return AMDGPU::V_MADMK_F16;
3521 case AMDGPU::V_MAC_F32_e32:
3522 case AMDGPU::V_MAC_F32_e64:
3523 case AMDGPU::V_MAD_F32_e64:
3524 return AMDGPU::V_MADMK_F32;
3525 case AMDGPU::V_FMAC_F32_e32:
3526 case AMDGPU::V_FMAC_F32_e64:
3527 case AMDGPU::V_FMA_F32_e64:
3528 return AMDGPU::V_FMAMK_F32;
3529 case AMDGPU::V_FMAC_F16_e32:
3530 case AMDGPU::V_FMAC_F16_e64:
3531 case AMDGPU::V_FMAC_F16_t16_e64:
3532 case AMDGPU::V_FMAC_F16_fake16_e64:
3533 case AMDGPU::V_FMA_F16_e64:
3534 return ST.hasTrue16BitInsts() ? ST.useRealTrue16Insts()
3535 ? AMDGPU::V_FMAMK_F16_t16
3536 : AMDGPU::V_FMAMK_F16_fake16
3537 : AMDGPU::V_FMAMK_F16;
3538 default:
3539 llvm_unreachable("invalid instruction");
3540 }
3541}
3542
3543bool SIInstrInfo::foldImmediate(MachineInstr &UseMI, MachineInstr &DefMI,
3544 Register Reg, MachineRegisterInfo *MRI) const {
3545 if (!MRI->hasOneNonDBGUse(RegNo: Reg))
3546 return false;
3547
3548 int64_t Imm;
3549 if (!getConstValDefinedInReg(MI: DefMI, Reg, ImmVal&: Imm))
3550 return false;
3551
3552 assert(!DefMI.getOperand(0).getSubReg() && "Expected SSA form");
3553
3554 unsigned Opc = UseMI.getOpcode();
3555 if (Opc == AMDGPU::COPY) {
3556 assert(!UseMI.getOperand(0).getSubReg() && "Expected SSA form");
3557
3558 Register DstReg = UseMI.getOperand(i: 0).getReg();
3559 unsigned OpSize = getOpSize(MI: UseMI, OpNo: 0);
3560 bool Is16Bit = OpSize == 2;
3561 bool Is64Bit = OpSize == 8;
3562 bool isVGPRCopy = RI.isVGPR(MRI: *MRI, Reg: DstReg);
3563 unsigned NewOpc = isVGPRCopy ? Is64Bit ? AMDGPU::V_MOV_B64_PSEUDO
3564 : AMDGPU::V_MOV_B32_e32
3565 : Is64Bit ? AMDGPU::S_MOV_B64_IMM_PSEUDO
3566 : AMDGPU::S_MOV_B32;
3567
3568 std::optional<int64_t> SubRegImm =
3569 extractSubregFromImm(Imm, SubRegIndex: UseMI.getOperand(i: 1).getSubReg());
3570
3571 APInt Imm(Is64Bit ? 64 : 32, *SubRegImm,
3572 /*isSigned=*/true, /*implicitTrunc=*/true);
3573
3574 if (RI.isAGPR(MRI: *MRI, Reg: DstReg)) {
3575 if (Is64Bit || !isInlineConstant(Imm))
3576 return false;
3577 NewOpc = AMDGPU::V_ACCVGPR_WRITE_B32_e64;
3578 }
3579
3580 if (Is16Bit) {
3581 if (isVGPRCopy)
3582 return false; // Do not clobber vgpr_hi16
3583
3584 if (DstReg.isVirtual() && UseMI.getOperand(i: 0).getSubReg() != AMDGPU::lo16)
3585 return false;
3586
3587 UseMI.getOperand(i: 0).setSubReg(0);
3588 if (DstReg.isPhysical()) {
3589 DstReg = RI.get32BitRegister(Reg: DstReg);
3590 UseMI.getOperand(i: 0).setReg(DstReg);
3591 }
3592 assert(UseMI.getOperand(1).getReg().isVirtual());
3593 }
3594
3595 MachineFunction *MF = UseMI.getMF();
3596 const MCInstrDesc &NewMCID = get(Opcode: NewOpc);
3597 const TargetRegisterClass *NewDefRC = getRegClass(TID: NewMCID, OpNum: 0, TRI: &RI, MF: *MF);
3598
3599 if (DstReg.isPhysical()) {
3600 if (!NewDefRC->contains(Reg: DstReg))
3601 return false;
3602 } else if (!MRI->constrainRegClass(Reg: DstReg, RC: NewDefRC))
3603 return false;
3604
3605 UseMI.setDesc(NewMCID);
3606 UseMI.getOperand(i: 1).ChangeToImmediate(ImmVal: Imm.getSExtValue());
3607 UseMI.addImplicitDefUseOperands(MF&: *MF);
3608 return true;
3609 }
3610
3611 if (Opc == AMDGPU::V_MAD_F32_e64 || Opc == AMDGPU::V_MAC_F32_e64 ||
3612 Opc == AMDGPU::V_MAD_F16_e64 || Opc == AMDGPU::V_MAC_F16_e64 ||
3613 Opc == AMDGPU::V_FMA_F32_e64 || Opc == AMDGPU::V_FMAC_F32_e64 ||
3614 Opc == AMDGPU::V_FMA_F16_e64 || Opc == AMDGPU::V_FMAC_F16_e64 ||
3615 Opc == AMDGPU::V_FMAC_F16_t16_e64 ||
3616 Opc == AMDGPU::V_FMAC_F16_fake16_e64) {
3617 // Don't fold if we are using source or output modifiers. The new VOP2
3618 // instructions don't have them.
3619 if (hasAnyModifiersSet(MI: UseMI))
3620 return false;
3621
3622 // If this is a free constant, there's no reason to do this.
3623 // TODO: We could fold this here instead of letting SIFoldOperands do it
3624 // later.
3625 int Src0Idx = getNamedOperandIdx(Opcode: UseMI.getOpcode(), Name: AMDGPU::OpName::src0);
3626
3627 // Any src operand can be used for the legality check.
3628 if (isInlineConstant(MI: UseMI, OpIdx: Src0Idx, ImmVal: Imm))
3629 return false;
3630
3631 MachineOperand *Src0 = &UseMI.getOperand(i: Src0Idx);
3632
3633 MachineOperand *Src1 = getNamedOperand(MI&: UseMI, OperandName: AMDGPU::OpName::src1);
3634 MachineOperand *Src2 = getNamedOperand(MI&: UseMI, OperandName: AMDGPU::OpName::src2);
3635
3636 // Multiplied part is the constant: Use v_madmk_{f16, f32}.
3637 if ((Src0->isReg() && Src0->getReg() == Reg) ||
3638 (Src1->isReg() && Src1->getReg() == Reg)) {
3639 MachineOperand *RegSrc =
3640 Src1->isReg() && Src1->getReg() == Reg ? Src0 : Src1;
3641 if (!RegSrc->isReg())
3642 return false;
3643 if (RI.isSGPRClass(RC: MRI->getRegClass(Reg: RegSrc->getReg())) &&
3644 ST.getConstantBusLimit(Opcode: Opc) < 2)
3645 return false;
3646
3647 if (!Src2->isReg() || RI.isSGPRClass(RC: MRI->getRegClass(Reg: Src2->getReg())))
3648 return false;
3649
3650 // If src2 is also a literal constant then we have to choose which one to
3651 // fold. In general it is better to choose madak so that the other literal
3652 // can be materialized in an sgpr instead of a vgpr:
3653 // s_mov_b32 s0, literal
3654 // v_madak_f32 v0, s0, v0, literal
3655 // Instead of:
3656 // v_mov_b32 v1, literal
3657 // v_madmk_f32 v0, v0, literal, v1
3658 MachineInstr *Def = MRI->getUniqueVRegDef(Reg: Src2->getReg());
3659 if (Def && Def->isMoveImmediate() &&
3660 !isInlineConstant(MO: Def->getOperand(i: 1)))
3661 return false;
3662
3663 unsigned NewOpc = getNewFMAMKInst(ST, Opc);
3664 if (pseudoToMCOpcode(Opcode: NewOpc) == -1)
3665 return false;
3666
3667 // V_FMAMK_F16_t16 takes VGPR_16_Lo128 operands while V_FMAMK_F16_fake16
3668 // takes VGPR_32_Lo128 operands, so the rewrite would also require
3669 // restricting their register classes. For now just bail out.
3670 if (NewOpc == AMDGPU::V_FMAMK_F16_t16 ||
3671 NewOpc == AMDGPU::V_FMAMK_F16_fake16)
3672 return false;
3673
3674 const std::optional<int64_t> SubRegImm = extractSubregFromImm(
3675 Imm, SubRegIndex: RegSrc == Src1 ? Src0->getSubReg() : Src1->getSubReg());
3676
3677 // FIXME: This would be a lot easier if we could return a new instruction
3678 // instead of having to modify in place.
3679
3680 Register SrcReg = RegSrc->getReg();
3681 unsigned SrcSubReg = RegSrc->getSubReg();
3682 Src0->setReg(SrcReg);
3683 Src0->setSubReg(SrcSubReg);
3684 Src0->setIsKill(RegSrc->isKill());
3685
3686 if (Opc == AMDGPU::V_MAC_F32_e64 || Opc == AMDGPU::V_MAC_F16_e64 ||
3687 Opc == AMDGPU::V_FMAC_F32_e64 || Opc == AMDGPU::V_FMAC_F16_t16_e64 ||
3688 Opc == AMDGPU::V_FMAC_F16_fake16_e64 || Opc == AMDGPU::V_FMAC_F16_e64)
3689 UseMI.untieRegOperand(
3690 OpIdx: AMDGPU::getNamedOperandIdx(Opcode: Opc, Name: AMDGPU::OpName::src2));
3691
3692 Src1->ChangeToImmediate(ImmVal: *SubRegImm);
3693
3694 removeModOperands(MI&: UseMI);
3695 UseMI.setDesc(get(Opcode: NewOpc));
3696
3697 bool DeleteDef = MRI->use_nodbg_empty(RegNo: Reg);
3698 if (DeleteDef)
3699 DefMI.eraseFromParent();
3700
3701 return true;
3702 }
3703
3704 // Added part is the constant: Use v_madak_{f16, f32}.
3705 if (Src2->isReg() && Src2->getReg() == Reg) {
3706 if (ST.getConstantBusLimit(Opcode: Opc) < 2) {
3707 // Not allowed to use constant bus for another operand.
3708 // We can however allow an inline immediate as src0.
3709 bool Src0Inlined = false;
3710 if (Src0->isReg()) {
3711 // Try to inline constant if possible.
3712 // If the def is a move-immediate with a single use, folding it here
3713 // saves a VGPR.
3714 MachineInstr *Def = MRI->getUniqueVRegDef(Reg: Src0->getReg());
3715 if (Def && Def->isMoveImmediate() &&
3716 isInlineConstant(MO: Def->getOperand(i: 1)) &&
3717 MRI->hasOneUse(RegNo: Src0->getReg())) {
3718 Src0->ChangeToImmediate(ImmVal: Def->getOperand(i: 1).getImm());
3719 Src0Inlined = true;
3720 } else if (ST.getConstantBusLimit(Opcode: Opc) <= 1 &&
3721 RI.isSGPRReg(MRI: *MRI, Reg: Src0->getReg())) {
3722 return false;
3723 }
3724 // VGPR is okay as Src0 - fallthrough
3725 }
3726
3727 if (Src1->isReg() && !Src0Inlined) {
3728 // We still have one slot for an inlinable constant - try to fill it.
3729 MachineInstr *Def = MRI->getUniqueVRegDef(Reg: Src1->getReg());
3730 if (Def && Def->isMoveImmediate() &&
3731 isInlineConstant(MO: Def->getOperand(i: 1)) &&
3732 MRI->hasOneUse(RegNo: Src1->getReg()) && commuteInstruction(MI&: UseMI))
3733 Src0->ChangeToImmediate(ImmVal: Def->getOperand(i: 1).getImm());
3734 else if (RI.isSGPRReg(MRI: *MRI, Reg: Src1->getReg()))
3735 return false;
3736 // VGPR is okay as Src1 - fallthrough
3737 }
3738 }
3739
3740 unsigned NewOpc = getNewFMAAKInst(ST, Opc);
3741 if (pseudoToMCOpcode(Opcode: NewOpc) == -1)
3742 return false;
3743
3744 // V_FMAAK_F16_t16 takes VGPR_16_Lo128 operands while V_FMAAK_F16_fake16
3745 // takes VGPR_32_Lo128 operands, so the rewrite would also require
3746 // restricting their register classes. For now just bail out.
3747 if (NewOpc == AMDGPU::V_FMAAK_F16_t16 ||
3748 NewOpc == AMDGPU::V_FMAAK_F16_fake16)
3749 return false;
3750
3751 // FIXME: This would be a lot easier if we could return a new instruction
3752 // instead of having to modify in place.
3753
3754 if (Opc == AMDGPU::V_MAC_F32_e64 || Opc == AMDGPU::V_MAC_F16_e64 ||
3755 Opc == AMDGPU::V_FMAC_F32_e64 || Opc == AMDGPU::V_FMAC_F16_t16_e64 ||
3756 Opc == AMDGPU::V_FMAC_F16_fake16_e64 || Opc == AMDGPU::V_FMAC_F16_e64)
3757 UseMI.untieRegOperand(
3758 OpIdx: AMDGPU::getNamedOperandIdx(Opcode: Opc, Name: AMDGPU::OpName::src2));
3759
3760 const std::optional<int64_t> SubRegImm =
3761 extractSubregFromImm(Imm, SubRegIndex: Src2->getSubReg());
3762
3763 // ChangeToImmediate adds Src2 back to the instruction.
3764 Src2->ChangeToImmediate(ImmVal: *SubRegImm);
3765
3766 // These come before src2.
3767 removeModOperands(MI&: UseMI);
3768 UseMI.setDesc(get(Opcode: NewOpc));
3769 // It might happen that UseMI was commuted, leaving an SGPR as src1. In that
3770 // case the inlined constant and the SGPR together are illegal, so
3771 // relegalize the operands.
3772 legalizeOperands(MI&: UseMI);
3773
3774 bool DeleteDef = MRI->use_nodbg_empty(RegNo: Reg);
3775 if (DeleteDef)
3776 DefMI.eraseFromParent();
3777
3778 return true;
3779 }
3780 }
3781
3782 return false;
3783}
3784
3785static bool
3786memOpsHaveSameBaseOperands(ArrayRef<const MachineOperand *> BaseOps1,
3787 ArrayRef<const MachineOperand *> BaseOps2) {
3788 if (BaseOps1.size() != BaseOps2.size())
3789 return false;
3790 for (size_t I = 0, E = BaseOps1.size(); I < E; ++I) {
3791 if (!BaseOps1[I]->isIdenticalTo(Other: *BaseOps2[I]))
3792 return false;
3793 }
3794 return true;
3795}
3796
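// Return true if the access [OffsetA, OffsetA + WidthA) is known not to overlap
// [OffsetB, OffsetB + WidthB).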
3797static bool offsetsDoNotOverlap(LocationSize WidthA, int OffsetA,
3798 LocationSize WidthB, int OffsetB) {
3799 int LowOffset = OffsetA < OffsetB ? OffsetA : OffsetB;
3800 int HighOffset = OffsetA < OffsetB ? OffsetB : OffsetA;
3801 LocationSize LowWidth = (LowOffset == OffsetA) ? WidthA : WidthB;
3802 return LowWidth.hasValue() &&
3803 LowOffset + (int)LowWidth.getValue() <= HighOffset;
3804}
3805
3806bool SIInstrInfo::checkInstOffsetsDoNotOverlap(const MachineInstr &MIa,
3807 const MachineInstr &MIb) const {
3808 SmallVector<const MachineOperand *, 4> BaseOps0, BaseOps1;
3809 int64_t Offset0, Offset1;
3810 LocationSize Dummy0 = LocationSize::precise(Value: 0);
3811 LocationSize Dummy1 = LocationSize::precise(Value: 0);
3812 bool Offset0IsScalable, Offset1IsScalable;
3813 if (!getMemOperandsWithOffsetWidth(LdSt: MIa, BaseOps&: BaseOps0, Offset&: Offset0, OffsetIsScalable&: Offset0IsScalable,
3814 Width&: Dummy0, TRI: &RI) ||
3815 !getMemOperandsWithOffsetWidth(LdSt: MIb, BaseOps&: BaseOps1, Offset&: Offset1, OffsetIsScalable&: Offset1IsScalable,
3816 Width&: Dummy1, TRI: &RI))
3817 return false;
3818
3819 if (!memOpsHaveSameBaseOperands(BaseOps1: BaseOps0, BaseOps2: BaseOps1))
3820 return false;
3821
3822 if (!MIa.hasOneMemOperand() || !MIb.hasOneMemOperand()) {
3823 // FIXME: Handle ds_read2 / ds_write2.
3824 return false;
3825 }
3826 LocationSize Width0 = MIa.memoperands().front()->getSize();
3827 LocationSize Width1 = MIb.memoperands().front()->getSize();
3828 return offsetsDoNotOverlap(WidthA: Width0, OffsetA: Offset0, WidthB: Width1, OffsetB: Offset1);
3829}
3830
3831bool SIInstrInfo::areMemAccessesTriviallyDisjoint(const MachineInstr &MIa,
3832 const MachineInstr &MIb) const {
3833 assert(MIa.mayLoadOrStore() &&
3834 "MIa must load from or modify a memory location");
3835 assert(MIb.mayLoadOrStore() &&
3836 "MIb must load from or modify a memory location");
3837
3838 if (MIa.hasUnmodeledSideEffects() || MIb.hasUnmodeledSideEffects())
3839 return false;
3840
3841 // XXX - Can we relax this between address spaces?
3842 if (MIa.hasOrderedMemoryRef() || MIb.hasOrderedMemoryRef())
3843 return false;
3844
3845 if (isLDSDMA(MI: MIa) || isLDSDMA(MI: MIb))
3846 return false;
3847
3848 // TODO: Should we check the address space from the MachineMemOperand? That
3849 // would allow us to distinguish objects we know don't alias based on the
3850 // underlying address space, even if it was lowered to a different one,
3851 // e.g. private accesses lowered to use MUBUF instructions on a scratch
3852 // buffer.
3853 if (isDS(MI: MIa)) {
3854 if (isDS(MI: MIb))
3855 return checkInstOffsetsDoNotOverlap(MIa, MIb);
3856
3857 return !isFLAT(MI: MIb) || isSegmentSpecificFLAT(MI: MIb);
3858 }
3859
3860 if (isMUBUF(MI: MIa) || isMTBUF(MI: MIa)) {
3861 if (isMUBUF(MI: MIb) || isMTBUF(MI: MIb))
3862 return checkInstOffsetsDoNotOverlap(MIa, MIb);
3863
3864 if (isFLAT(MI: MIb))
3865 return isFLATScratch(MI: MIb);
3866
3867 return !isSMRD(MI: MIb);
3868 }
3869
3870 if (isSMRD(MI: MIa)) {
3871 if (isSMRD(MI: MIb))
3872 return checkInstOffsetsDoNotOverlap(MIa, MIb);
3873
3874 if (isFLAT(MI: MIb))
3875 return isFLATScratch(MI: MIb);
3876
3877 return !isMUBUF(MI: MIb) && !isMTBUF(MI: MIb);
3878 }
3879
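// Generic FLAT accesses may touch LDS, scratch or global memory, so only compare
// them against other FLAT accesses; the scratch and global segments never alias.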
3880 if (isFLAT(MI: MIa)) {
3881 if (isFLAT(MI: MIb)) {
3882 if ((isFLATScratch(MI: MIa) && isFLATGlobal(MI: MIb)) ||
3883 (isFLATGlobal(MI: MIa) && isFLATScratch(MI: MIb)))
3884 return true;
3885
3886 return checkInstOffsetsDoNotOverlap(MIa, MIb);
3887 }
3888
3889 return false;
3890 }
3891
3892 return false;
3893}
3894
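// If \p Reg is defined by a foldable immediate-materializing copy, return true and
// report the immediate in \p Imm (and the defining instruction in \p DefMI if
// requested).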
3895static bool getFoldableImm(Register Reg, const MachineRegisterInfo &MRI,
3896 int64_t &Imm, MachineInstr **DefMI = nullptr) {
3897 if (Reg.isPhysical())
3898 return false;
3899 auto *Def = MRI.getUniqueVRegDef(Reg);
3900 if (Def && SIInstrInfo::isFoldableCopy(MI: *Def) && Def->getOperand(i: 1).isImm()) {
3901 Imm = Def->getOperand(i: 1).getImm();
3902 if (DefMI)
3903 *DefMI = Def;
3904 return true;
3905 }
3906 return false;
3907}
3908
3909static bool getFoldableImm(const MachineOperand *MO, int64_t &Imm,
3910 MachineInstr **DefMI = nullptr) {
3911 if (!MO->isReg())
3912 return false;
3913 const MachineFunction *MF = MO->getParent()->getParent()->getParent();
3914 const MachineRegisterInfo &MRI = MF->getRegInfo();
3915 return getFoldableImm(Reg: MO->getReg(), MRI, Imm, DefMI);
3916}
3917
3918static void updateLiveVariables(LiveVariables *LV, MachineInstr &MI,
3919 MachineInstr &NewMI) {
3920 if (LV) {
3921 unsigned NumOps = MI.getNumOperands();
3922 for (unsigned I = 1; I < NumOps; ++I) {
3923 MachineOperand &Op = MI.getOperand(i: I);
3924 if (Op.isReg() && Op.isKill())
3925 LV->replaceKillInstruction(Reg: Op.getReg(), OldMI&: MI, NewMI);
3926 }
3927 }
3928}
3929
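// Map a two-address MAC/FMAC opcode to its three-address MAD/FMA equivalent.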
3930static unsigned getNewFMAInst(const GCNSubtarget &ST, unsigned Opc) {
3931 switch (Opc) {
3932 case AMDGPU::V_MAC_F16_e32:
3933 case AMDGPU::V_MAC_F16_e64:
3934 return AMDGPU::V_MAD_F16_e64;
3935 case AMDGPU::V_MAC_F32_e32:
3936 case AMDGPU::V_MAC_F32_e64:
3937 return AMDGPU::V_MAD_F32_e64;
3938 case AMDGPU::V_MAC_LEGACY_F32_e32:
3939 case AMDGPU::V_MAC_LEGACY_F32_e64:
3940 return AMDGPU::V_MAD_LEGACY_F32_e64;
3941 case AMDGPU::V_FMAC_LEGACY_F32_e32:
3942 case AMDGPU::V_FMAC_LEGACY_F32_e64:
3943 return AMDGPU::V_FMA_LEGACY_F32_e64;
3944 case AMDGPU::V_FMAC_F16_e32:
3945 case AMDGPU::V_FMAC_F16_e64:
3946 case AMDGPU::V_FMAC_F16_t16_e64:
3947 case AMDGPU::V_FMAC_F16_fake16_e64:
3948 return ST.hasTrue16BitInsts() ? ST.useRealTrue16Insts()
3949 ? AMDGPU::V_FMA_F16_gfx9_t16_e64
3950 : AMDGPU::V_FMA_F16_gfx9_fake16_e64
3951 : AMDGPU::V_FMA_F16_gfx9_e64;
3952 case AMDGPU::V_FMAC_F32_e32:
3953 case AMDGPU::V_FMAC_F32_e64:
3954 return AMDGPU::V_FMA_F32_e64;
3955 case AMDGPU::V_FMAC_F64_e32:
3956 case AMDGPU::V_FMAC_F64_e64:
3957 return AMDGPU::V_FMA_F64_e64;
3958 default:
3959 llvm_unreachable("invalid instruction");
3960 }
3961}
3962
3963MachineInstr *SIInstrInfo::convertToThreeAddress(MachineInstr &MI,
3964 LiveVariables *LV,
3965 LiveIntervals *LIS) const {
3966 MachineBasicBlock &MBB = *MI.getParent();
3967 unsigned Opc = MI.getOpcode();
3968
3969 // Handle MFMA.
3970 int NewMFMAOpc = AMDGPU::getMFMAEarlyClobberOp(Opcode: Opc);
3971 if (NewMFMAOpc != -1) {
3972 MachineInstrBuilder MIB =
3973 BuildMI(BB&: MBB, I&: MI, MIMD: MI.getDebugLoc(), MCID: get(Opcode: NewMFMAOpc));
3974 for (unsigned I = 0, E = MI.getNumOperands(); I != E; ++I)
3975 MIB.add(MO: MI.getOperand(i: I));
3976 updateLiveVariables(LV, MI, NewMI&: *MIB);
3977 if (LIS) {
3978 LIS->ReplaceMachineInstrInMaps(MI, NewMI&: *MIB);
3979 // SlotIndex of defs needs to be updated when converting to early-clobber
3980 MachineOperand &Def = MIB->getOperand(i: 0);
3981 if (Def.isEarlyClobber() && Def.isReg() &&
3982 LIS->hasInterval(Reg: Def.getReg())) {
3983 SlotIndex OldIndex = LIS->getInstructionIndex(Instr: *MIB).getRegSlot(EC: false);
3984 SlotIndex NewIndex = LIS->getInstructionIndex(Instr: *MIB).getRegSlot(EC: true);
3985 auto &LI = LIS->getInterval(Reg: Def.getReg());
3986 auto UpdateDefIndex = [&](LiveRange &LR) {
3987 auto *S = LR.find(Pos: OldIndex);
3988 if (S != LR.end() && S->start == OldIndex) {
3989 assert(S->valno && S->valno->def == OldIndex);
3990 S->start = NewIndex;
3991 S->valno->def = NewIndex;
3992 }
3993 };
3994 UpdateDefIndex(LI);
3995 for (auto &SR : LI.subranges())
3996 UpdateDefIndex(SR);
3997 }
3998 }
3999 return MIB;
4000 }
4001
4002 if (SIInstrInfo::isWMMA(MI)) {
4003 unsigned NewOpc = AMDGPU::mapWMMA2AddrTo3AddrOpcode(Opc: MI.getOpcode());
4004 MachineInstrBuilder MIB = BuildMI(BB&: MBB, I&: MI, MIMD: MI.getDebugLoc(), MCID: get(Opcode: NewOpc))
4005 .setMIFlags(MI.getFlags());
4006 for (unsigned I = 0, E = MI.getNumOperands(); I != E; ++I)
4007 MIB->addOperand(Op: MI.getOperand(i: I));
4008
4009 updateLiveVariables(LV, MI, NewMI&: *MIB);
4010 if (LIS)
4011 LIS->ReplaceMachineInstrInMaps(MI, NewMI&: *MIB);
4012
4013 return MIB;
4014 }
4015
4016 assert(Opc != AMDGPU::V_FMAC_F16_t16_e32 &&
4017 Opc != AMDGPU::V_FMAC_F16_fake16_e32 &&
4018 "V_FMAC_F16_t16/fake16_e32 is not supported and not expected to be "
4019 "present pre-RA");
4020
4021 // Handle MAC/FMAC.
4022 bool IsF64 = Opc == AMDGPU::V_FMAC_F64_e32 || Opc == AMDGPU::V_FMAC_F64_e64;
4023 bool IsLegacy = Opc == AMDGPU::V_MAC_LEGACY_F32_e32 ||
4024 Opc == AMDGPU::V_MAC_LEGACY_F32_e64 ||
4025 Opc == AMDGPU::V_FMAC_LEGACY_F32_e32 ||
4026 Opc == AMDGPU::V_FMAC_LEGACY_F32_e64;
4027 bool Src0Literal = false;
4028
4029 switch (Opc) {
4030 default:
4031 return nullptr;
4032 case AMDGPU::V_MAC_F16_e64:
4033 case AMDGPU::V_FMAC_F16_e64:
4034 case AMDGPU::V_FMAC_F16_t16_e64:
4035 case AMDGPU::V_FMAC_F16_fake16_e64:
4036 case AMDGPU::V_MAC_F32_e64:
4037 case AMDGPU::V_MAC_LEGACY_F32_e64:
4038 case AMDGPU::V_FMAC_F32_e64:
4039 case AMDGPU::V_FMAC_LEGACY_F32_e64:
4040 case AMDGPU::V_FMAC_F64_e64:
4041 break;
4042 case AMDGPU::V_MAC_F16_e32:
4043 case AMDGPU::V_FMAC_F16_e32:
4044 case AMDGPU::V_MAC_F32_e32:
4045 case AMDGPU::V_MAC_LEGACY_F32_e32:
4046 case AMDGPU::V_FMAC_F32_e32:
4047 case AMDGPU::V_FMAC_LEGACY_F32_e32:
4048 case AMDGPU::V_FMAC_F64_e32: {
4049 int Src0Idx = AMDGPU::getNamedOperandIdx(Opcode: MI.getOpcode(),
4050 Name: AMDGPU::OpName::src0);
4051 const MachineOperand *Src0 = &MI.getOperand(i: Src0Idx);
4052 if (!Src0->isReg() && !Src0->isImm())
4053 return nullptr;
4054
4055 if (Src0->isImm() && !isInlineConstant(MI, OpIdx: Src0Idx, MO: *Src0))
4056 Src0Literal = true;
4057
4058 break;
4059 }
4060 }
4061
4062 MachineInstrBuilder MIB;
4063 const MachineOperand *Dst = getNamedOperand(MI, OperandName: AMDGPU::OpName::vdst);
4064 const MachineOperand *Src0 = getNamedOperand(MI, OperandName: AMDGPU::OpName::src0);
4065 const MachineOperand *Src0Mods =
4066 getNamedOperand(MI, OperandName: AMDGPU::OpName::src0_modifiers);
4067 const MachineOperand *Src1 = getNamedOperand(MI, OperandName: AMDGPU::OpName::src1);
4068 const MachineOperand *Src1Mods =
4069 getNamedOperand(MI, OperandName: AMDGPU::OpName::src1_modifiers);
4070 const MachineOperand *Src2 = getNamedOperand(MI, OperandName: AMDGPU::OpName::src2);
4071 const MachineOperand *Src2Mods =
4072 getNamedOperand(MI, OperandName: AMDGPU::OpName::src2_modifiers);
4073 const MachineOperand *Clamp = getNamedOperand(MI, OperandName: AMDGPU::OpName::clamp);
4074 const MachineOperand *Omod = getNamedOperand(MI, OperandName: AMDGPU::OpName::omod);
4075 const MachineOperand *OpSel = getNamedOperand(MI, OperandName: AMDGPU::OpName::op_sel);
4076
4077 if (!Src0Mods && !Src1Mods && !Src2Mods && !Clamp && !Omod && !IsF64 &&
4078 !IsLegacy &&
4079 // If we have an SGPR input, we will violate the constant bus restriction.
4080 (ST.getConstantBusLimit(Opcode: Opc) > 1 || !Src0->isReg() ||
4081 !RI.isSGPRReg(MRI: MBB.getParent()->getRegInfo(), Reg: Src0->getReg()))) {
4082 MachineInstr *DefMI;
4083 const auto killDef = [&]() -> void {
4084 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
4085 // The only user is the instruction which will be killed.
4086 Register DefReg = DefMI->getOperand(i: 0).getReg();
4087
4088 if (MRI.hasOneNonDBGUse(RegNo: DefReg)) {
4089 // We cannot just remove DefMI here; the calling pass would crash.
4090 DefMI->setDesc(get(Opcode: AMDGPU::IMPLICIT_DEF));
4091 DefMI->getOperand(i: 0).setIsDead(true);
4092 for (unsigned I = DefMI->getNumOperands() - 1; I != 0; --I)
4093 DefMI->removeOperand(OpNo: I);
4094 if (LV)
4095 LV->getVarInfo(Reg: DefReg).AliveBlocks.clear();
4096 }
4097
4098 if (LIS) {
4099 LiveInterval &DefLI = LIS->getInterval(Reg: DefReg);
4100
4101 // We cannot delete the original instruction here, so hack out the use
4102 // in the original instruction with a dummy register so we can use
4103 // shrinkToUses to deal with any multi-use edge cases. Other targets do
4104 // not have the complexity of deleting a use to consider here.
4105 Register DummyReg = MRI.cloneVirtualRegister(VReg: DefReg);
4106 for (MachineOperand &MIOp : MI.uses()) {
4107 if (MIOp.isReg() && MIOp.getReg() == DefReg) {
4108 MIOp.setIsUndef(true);
4109 MIOp.setReg(DummyReg);
4110 }
4111 }
4112
4113 LIS->shrinkToUses(li: &DefLI);
4114 }
4115 };
4116
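// Try the literal-carrying VOP2 forms first: an immediate addend (src2) becomes
// fmaak/madak, an immediate multiplicand (src0 or src1) becomes fmamk/madmk.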
4117 int64_t Imm;
4118 if (!Src0Literal && getFoldableImm(MO: Src2, Imm, DefMI: &DefMI)) {
4119 unsigned NewOpc = getNewFMAAKInst(ST, Opc);
4120 if (pseudoToMCOpcode(Opcode: NewOpc) != -1) {
4121 MIB = BuildMI(BB&: MBB, I&: MI, MIMD: MI.getDebugLoc(), MCID: get(Opcode: NewOpc))
4122 .add(MO: *Dst)
4123 .add(MO: *Src0)
4124 .add(MO: *Src1)
4125 .addImm(Val: Imm)
4126 .setMIFlags(MI.getFlags());
4127 updateLiveVariables(LV, MI, NewMI&: *MIB);
4128 if (LIS)
4129 LIS->ReplaceMachineInstrInMaps(MI, NewMI&: *MIB);
4130 killDef();
4131 return MIB;
4132 }
4133 }
4134 unsigned NewOpc = getNewFMAMKInst(ST, Opc);
4135 if (!Src0Literal && getFoldableImm(MO: Src1, Imm, DefMI: &DefMI)) {
4136 if (pseudoToMCOpcode(Opcode: NewOpc) != -1) {
4137 MIB = BuildMI(BB&: MBB, I&: MI, MIMD: MI.getDebugLoc(), MCID: get(Opcode: NewOpc))
4138 .add(MO: *Dst)
4139 .add(MO: *Src0)
4140 .addImm(Val: Imm)
4141 .add(MO: *Src2)
4142 .setMIFlags(MI.getFlags());
4143 updateLiveVariables(LV, MI, NewMI&: *MIB);
4144
4145 if (LIS)
4146 LIS->ReplaceMachineInstrInMaps(MI, NewMI&: *MIB);
4147 killDef();
4148 return MIB;
4149 }
4150 }
4151 if (Src0Literal || getFoldableImm(MO: Src0, Imm, DefMI: &DefMI)) {
4152 if (Src0Literal) {
4153 Imm = Src0->getImm();
4154 DefMI = nullptr;
4155 }
4156 if (pseudoToMCOpcode(Opcode: NewOpc) != -1 &&
4157 isOperandLegal(
4158 MI, OpIdx: AMDGPU::getNamedOperandIdx(Opcode: NewOpc, Name: AMDGPU::OpName::src0),
4159 MO: Src1)) {
4160 MIB = BuildMI(BB&: MBB, I&: MI, MIMD: MI.getDebugLoc(), MCID: get(Opcode: NewOpc))
4161 .add(MO: *Dst)
4162 .add(MO: *Src1)
4163 .addImm(Val: Imm)
4164 .add(MO: *Src2)
4165 .setMIFlags(MI.getFlags());
4166 updateLiveVariables(LV, MI, NewMI&: *MIB);
4167
4168 if (LIS)
4169 LIS->ReplaceMachineInstrInMaps(MI, NewMI&: *MIB);
4170 if (DefMI)
4171 killDef();
4172 return MIB;
4173 }
4174 }
4175 }
4176
4177 // VOP2 mac/fmac with a literal operand cannot be converted to VOP3 mad/fma
4178 // if VOP3 does not allow a literal operand.
4179 if (Src0Literal && !ST.hasVOP3Literal())
4180 return nullptr;
4181
4182 unsigned NewOpc = getNewFMAInst(ST, Opc);
4183
4184 if (pseudoToMCOpcode(Opcode: NewOpc) == -1)
4185 return nullptr;
4186
4187 MIB = BuildMI(BB&: MBB, I&: MI, MIMD: MI.getDebugLoc(), MCID: get(Opcode: NewOpc))
4188 .add(MO: *Dst)
4189 .addImm(Val: Src0Mods ? Src0Mods->getImm() : 0)
4190 .add(MO: *Src0)
4191 .addImm(Val: Src1Mods ? Src1Mods->getImm() : 0)
4192 .add(MO: *Src1)
4193 .addImm(Val: Src2Mods ? Src2Mods->getImm() : 0)
4194 .add(MO: *Src2)
4195 .addImm(Val: Clamp ? Clamp->getImm() : 0)
4196 .addImm(Val: Omod ? Omod->getImm() : 0)
4197 .setMIFlags(MI.getFlags());
4198 if (AMDGPU::hasNamedOperand(Opcode: NewOpc, NamedIdx: AMDGPU::OpName::op_sel))
4199 MIB.addImm(Val: OpSel ? OpSel->getImm() : 0);
4200 updateLiveVariables(LV, MI, NewMI&: *MIB);
4201 if (LIS)
4202 LIS->ReplaceMachineInstrInMaps(MI, NewMI&: *MIB);
4203 return MIB;
4204}
4205
4206// It's not generally safe to move VALU instructions across these since it will
4207// start using the register as a base index rather than directly.
4208// XXX - Why isn't hasSideEffects sufficient for these?
4209static bool changesVGPRIndexingMode(const MachineInstr &MI) {
4210 switch (MI.getOpcode()) {
4211 case AMDGPU::S_SET_GPR_IDX_ON:
4212 case AMDGPU::S_SET_GPR_IDX_MODE:
4213 case AMDGPU::S_SET_GPR_IDX_OFF:
4214 return true;
4215 default:
4216 return false;
4217 }
4218}
4219
4220bool SIInstrInfo::isSchedulingBoundary(const MachineInstr &MI,
4221 const MachineBasicBlock *MBB,
4222 const MachineFunction &MF) const {
4223 // Skipping the check for SP writes in the base implementation. The reason it
4224 // was added was apparently due to compile time concerns.
4225 //
4226 // TODO: Do we really want this barrier? It triggers unnecessary hazard nops
4227 // but is probably avoidable.
4228
4229 // Copied from base implementation.
4230 // Terminators and labels can't be scheduled around.
4231 if (MI.isTerminator() || MI.isPosition())
4232 return true;
4233
4234 // INLINEASM_BR can jump to another block
4235 if (MI.getOpcode() == TargetOpcode::INLINEASM_BR)
4236 return true;
4237
4238 if (MI.getOpcode() == AMDGPU::SCHED_BARRIER && MI.getOperand(i: 0).getImm() == 0)
4239 return true;
4240
4241 // Target-independent instructions do not have an implicit-use of EXEC, even
4242 // when they operate on VGPRs. Treating EXEC modifications as scheduling
4243 // boundaries prevents incorrect movements of such instructions.
4244 return MI.modifiesRegister(Reg: AMDGPU::EXEC, TRI: &RI) ||
4245 MI.getOpcode() == AMDGPU::S_SETREG_IMM32_B32 ||
4246 MI.getOpcode() == AMDGPU::S_SETREG_B32 ||
4247 MI.getOpcode() == AMDGPU::S_SETPRIO ||
4248 changesVGPRIndexingMode(MI);
4249}
4250
4251bool SIInstrInfo::isAlwaysGDS(uint16_t Opcode) const {
4252 return Opcode == AMDGPU::DS_ORDERED_COUNT ||
4253 Opcode == AMDGPU::DS_ADD_GS_REG_RTN ||
4254 Opcode == AMDGPU::DS_SUB_GS_REG_RTN || isGWS(Opcode);
4255}
4256
4257bool SIInstrInfo::modifiesModeRegister(const MachineInstr &MI) {
4258 // Skip the full operand and register-alias search that modifiesRegister
4259 // does. Only a handful of instructions touch MODE, it is only ever an
4260 // implicit def, and it does not alias any other registers.
4261 return is_contained(Range: MI.getDesc().implicit_defs(), Element: AMDGPU::MODE);
4262}
4263
4264bool SIInstrInfo::hasUnwantedEffectsWhenEXECEmpty(const MachineInstr &MI) const {
4265 unsigned Opcode = MI.getOpcode();
4266
4267 if (MI.mayStore() && isSMRD(MI))
4268 return true; // scalar store or atomic
4269
4270 // This will terminate the function when other lanes may need to continue.
4271 if (MI.isReturn())
4272 return true;
4273
4274 // These instructions cause shader I/O that may cause hardware lockups
4275 // when executed with an empty EXEC mask.
4276 //
4277 // Note: exp with VM = DONE = 0 is automatically skipped by hardware when
4278 // EXEC = 0, but checking for that case here seems not worth it
4279 // given the typical code patterns.
4280 if (Opcode == AMDGPU::S_SENDMSG || Opcode == AMDGPU::S_SENDMSGHALT ||
4281 isEXP(Opcode) || Opcode == AMDGPU::DS_ORDERED_COUNT ||
4282 Opcode == AMDGPU::S_TRAP || Opcode == AMDGPU::S_WAIT_EVENT)
4283 return true;
4284
4285 if (MI.isCall() || MI.isInlineAsm())
4286 return true; // conservative assumption
4287
4288 // Assume that barrier interactions are only intended with active lanes.
4289 if (isBarrier(Opcode))
4290 return true;
4291
4292 // A mode change is a scalar operation that influences vector instructions.
4293 if (modifiesModeRegister(MI))
4294 return true;
4295
4296 // These are like SALU instructions in terms of effects, so it's questionable
4297 // whether we should return true for those.
4298 //
4299 // However, executing them with EXEC = 0 causes them to operate on undefined
4300 // data, which we avoid by returning true here.
4301 if (Opcode == AMDGPU::V_READFIRSTLANE_B32 ||
4302 Opcode == AMDGPU::V_READLANE_B32 || Opcode == AMDGPU::V_WRITELANE_B32 ||
4303 Opcode == AMDGPU::SI_RESTORE_S32_FROM_VGPR ||
4304 Opcode == AMDGPU::SI_SPILL_S32_TO_VGPR)
4305 return true;
4306
4307 return false;
4308}
4309
4310bool SIInstrInfo::mayReadEXEC(const MachineRegisterInfo &MRI,
4311 const MachineInstr &MI) const {
4312 if (MI.isMetaInstruction())
4313 return false;
4314
4315 // This won't read exec if this is an SGPR->SGPR copy.
4316 if (MI.isCopyLike()) {
4317 if (!RI.isSGPRReg(MRI, Reg: MI.getOperand(i: 0).getReg()))
4318 return true;
4319
4320 // Make sure this isn't copying exec as a normal operand
4321 return MI.readsRegister(Reg: AMDGPU::EXEC, TRI: &RI);
4322 }
4323
4324 // Make a conservative assumption about the callee.
4325 if (MI.isCall())
4326 return true;
4327
4328 // Be conservative with any unhandled generic opcodes.
4329 if (!isTargetSpecificOpcode(Opcode: MI.getOpcode()))
4330 return true;
4331
4332 return !isSALU(MI) || MI.readsRegister(Reg: AMDGPU::EXEC, TRI: &RI);
4333}
4334
4335bool SIInstrInfo::isInlineConstant(const APInt &Imm) const {
4336 switch (Imm.getBitWidth()) {
4337 case 1: // This likely will be a condition code mask.
4338 return true;
4339
4340 case 32:
4341 return AMDGPU::isInlinableLiteral32(Literal: Imm.getSExtValue(),
4342 HasInv2Pi: ST.hasInv2PiInlineImm());
4343 case 64:
4344 return AMDGPU::isInlinableLiteral64(Literal: Imm.getSExtValue(),
4345 HasInv2Pi: ST.hasInv2PiInlineImm());
4346 case 16:
4347 return ST.has16BitInsts() &&
4348 AMDGPU::isInlinableLiteralI16(Literal: Imm.getSExtValue(),
4349 HasInv2Pi: ST.hasInv2PiInlineImm());
4350 default:
4351 llvm_unreachable("invalid bitwidth");
4352 }
4353}
4354
4355bool SIInstrInfo::isInlineConstant(const APFloat &Imm) const {
4356 APInt IntImm = Imm.bitcastToAPInt();
4357 int64_t IntImmVal = IntImm.getSExtValue();
4358 bool HasInv2Pi = ST.hasInv2PiInlineImm();
4359 switch (APFloat::SemanticsToEnum(Sem: Imm.getSemantics())) {
4360 default:
4361 llvm_unreachable("invalid fltSemantics");
4362 case APFloatBase::S_IEEEsingle:
4363 case APFloatBase::S_IEEEdouble:
4364 return isInlineConstant(Imm: IntImm);
4365 case APFloatBase::S_BFloat:
4366 return ST.has16BitInsts() &&
4367 AMDGPU::isInlinableLiteralBF16(Literal: IntImmVal, HasInv2Pi);
4368 case APFloatBase::S_IEEEhalf:
4369 return ST.has16BitInsts() &&
4370 AMDGPU::isInlinableLiteralFP16(Literal: IntImmVal, HasInv2Pi);
4371 }
4372}
4373
4374bool SIInstrInfo::isInlineConstant(int64_t Imm, uint8_t OperandType) const {
4375 // MachineOperand provides no way to tell the true operand size, since it only
4376 // records a 64-bit value. We need to know the size to determine if a 32-bit
4377 // floating point immediate bit pattern is legal for an integer immediate. It
4378 // would be for any 32-bit integer operand, but would not be for a 64-bit one.
4379 switch (OperandType) {
4380 case AMDGPU::OPERAND_REG_IMM_INT32:
4381 case AMDGPU::OPERAND_REG_IMM_FP32:
4382 case AMDGPU::OPERAND_REG_INLINE_C_INT32:
4383 case AMDGPU::OPERAND_REG_INLINE_C_FP32:
4384 case AMDGPU::OPERAND_REG_IMM_V2FP32:
4385 case AMDGPU::OPERAND_REG_IMM_V2INT32:
4386 case AMDGPU::OPERAND_REG_INLINE_AC_INT32:
4387 case AMDGPU::OPERAND_REG_INLINE_AC_FP32:
4388 case AMDGPU::OPERAND_INLINE_SPLIT_BARRIER_INT32: {
4389 int32_t Trunc = static_cast<int32_t>(Imm);
4390 return AMDGPU::isInlinableLiteral32(Literal: Trunc, HasInv2Pi: ST.hasInv2PiInlineImm());
4391 }
4392 case AMDGPU::OPERAND_REG_IMM_INT64:
4393 case AMDGPU::OPERAND_REG_IMM_FP64:
4394 case AMDGPU::OPERAND_REG_INLINE_C_INT64:
4395 case AMDGPU::OPERAND_REG_INLINE_C_FP64:
4396 case AMDGPU::OPERAND_REG_INLINE_AC_FP64:
4397 return AMDGPU::isInlinableLiteral64(Literal: Imm, HasInv2Pi: ST.hasInv2PiInlineImm());
4398 case AMDGPU::OPERAND_REG_IMM_INT16:
4399 case AMDGPU::OPERAND_REG_INLINE_C_INT16:
4400 // We would expect inline immediates to not be concerned with an integer/fp
4401 // distinction. However, in the case of 16-bit integer operations, the
4402 // "floating point" values appear to not work. The hardware seems to read the
4403 // low 16 bits of the 32-bit immediate encoding, which happens to always work
4404 // for the integer values.
4405 //
4406 // See llvm bugzilla 46302.
4407 //
4408 // TODO: Theoretically we could use op-sel to use the high bits of the
4409 // 32-bit FP values.
4410 return AMDGPU::isInlinableIntLiteral(Literal: Imm);
4411 case AMDGPU::OPERAND_REG_IMM_V2INT16:
4412 case AMDGPU::OPERAND_REG_INLINE_C_V2INT16:
4413 return AMDGPU::isInlinableLiteralV2I16(Literal: Imm);
4414 case AMDGPU::OPERAND_REG_IMM_V2FP16:
4415 case AMDGPU::OPERAND_REG_INLINE_C_V2FP16:
4416 return AMDGPU::isInlinableLiteralV2F16(Literal: Imm);
4417 case AMDGPU::OPERAND_REG_IMM_V2BF16:
4418 case AMDGPU::OPERAND_REG_INLINE_C_V2BF16:
4419 return AMDGPU::isInlinableLiteralV2BF16(Literal: Imm);
4420 case AMDGPU::OPERAND_REG_IMM_FP16:
4421 case AMDGPU::OPERAND_REG_INLINE_C_FP16: {
4422 if (isInt<16>(x: Imm) || isUInt<16>(x: Imm)) {
4423 // A few special case instructions have 16-bit operands on subtargets
4424 // where 16-bit instructions are not legal.
4425 // TODO: Do the 32-bit immediates work? We shouldn't really need to handle
4426 // constants in these cases
4427 int16_t Trunc = static_cast<int16_t>(Imm);
4428 return ST.has16BitInsts() &&
4429 AMDGPU::isInlinableLiteralFP16(Literal: Trunc, HasInv2Pi: ST.hasInv2PiInlineImm());
4430 }
4431
4432 return false;
4433 }
4434 case AMDGPU::OPERAND_REG_IMM_BF16:
4435 case AMDGPU::OPERAND_REG_INLINE_C_BF16: {
4436 if (isInt<16>(x: Imm) || isUInt<16>(x: Imm)) {
4437 int16_t Trunc = static_cast<int16_t>(Imm);
4438 return ST.has16BitInsts() &&
4439 AMDGPU::isInlinableLiteralBF16(Literal: Trunc, HasInv2Pi: ST.hasInv2PiInlineImm());
4440 }
4441 return false;
4442 }
4443 case AMDGPU::OPERAND_KIMM32:
4444 case AMDGPU::OPERAND_KIMM16:
4445 return false;
4446 case AMDGPU::OPERAND_INPUT_MODS:
4447 case MCOI::OPERAND_IMMEDIATE:
4448 // Always embedded in the instruction for free.
4449 return true;
4450 case MCOI::OPERAND_UNKNOWN:
4451 case MCOI::OPERAND_REGISTER:
4452 case MCOI::OPERAND_PCREL:
4453 case MCOI::OPERAND_GENERIC_0:
4454 case MCOI::OPERAND_GENERIC_1:
4455 case MCOI::OPERAND_GENERIC_2:
4456 case MCOI::OPERAND_GENERIC_3:
4457 case MCOI::OPERAND_GENERIC_4:
4458 case MCOI::OPERAND_GENERIC_5:
4459 // Just ignore anything else.
4460 return true;
4461 default:
4462 llvm_unreachable("invalid operand type");
4463 }
4464}
4465
4466static bool compareMachineOp(const MachineOperand &Op0,
4467 const MachineOperand &Op1) {
4468 if (Op0.getType() != Op1.getType())
4469 return false;
4470
4471 switch (Op0.getType()) {
4472 case MachineOperand::MO_Register:
4473 return Op0.getReg() == Op1.getReg();
4474 case MachineOperand::MO_Immediate:
4475 return Op0.getImm() == Op1.getImm();
4476 default:
4477 llvm_unreachable("Didn't expect to be comparing these operand types");
4478 }
4479}
4480
4481bool SIInstrInfo::isImmOperandLegal(const MachineInstr &MI, unsigned OpNo,
4482 const MachineOperand &MO) const {
4483 const MCInstrDesc &InstDesc = MI.getDesc();
4484 const MCOperandInfo &OpInfo = InstDesc.operands()[OpNo];
4485
4486 assert(MO.isImm() || MO.isTargetIndex() || MO.isFI() || MO.isGlobal());
4487
4488 if (OpInfo.OperandType == MCOI::OPERAND_IMMEDIATE)
4489 return true;
4490
4491 if (OpInfo.RegClass < 0)
4492 return false;
4493
4494 if (MO.isImm() && isInlineConstant(MO, OpInfo)) {
4495 if (isMAI(MI) && ST.hasMFMAInlineLiteralBug() &&
4496 OpNo == (unsigned)AMDGPU::getNamedOperandIdx(Opcode: MI.getOpcode(),
4497 Name: AMDGPU::OpName::src2))
4498 return false;
4499 return RI.opCanUseInlineConstant(OpType: OpInfo.OperandType);
4500 }
4501
4502 if (!RI.opCanUseLiteralConstant(OpType: OpInfo.OperandType))
4503 return false;
4504
4505 if (!isVOP3(MI) || !AMDGPU::isSISrcOperand(Desc: InstDesc, OpNo))
4506 return true;
4507
4508 return ST.hasVOP3Literal();
4509}
4510
4511bool SIInstrInfo::hasVALU32BitEncoding(unsigned Opcode) const {
4512 // GFX90A does not have V_MUL_LEGACY_F32_e32.
4513 if (Opcode == AMDGPU::V_MUL_LEGACY_F32_e64 && ST.hasGFX90AInsts())
4514 return false;
4515
4516 int Op32 = AMDGPU::getVOPe32(Opcode);
4517 if (Op32 == -1)
4518 return false;
4519
4520 return pseudoToMCOpcode(Opcode: Op32) != -1;
4521}
4522
4523bool SIInstrInfo::hasModifiers(unsigned Opcode) const {
4524 // The src0_modifiers operand is present on all instructions
4525 // that have modifiers.
4526
4527 return AMDGPU::hasNamedOperand(Opcode, NamedIdx: AMDGPU::OpName::src0_modifiers);
4528}
4529
4530bool SIInstrInfo::hasModifiersSet(const MachineInstr &MI,
4531 AMDGPU::OpName OpName) const {
4532 const MachineOperand *Mods = getNamedOperand(MI, OperandName: OpName);
4533 return Mods && Mods->getImm();
4534}
4535
4536bool SIInstrInfo::hasAnyModifiersSet(const MachineInstr &MI) const {
4537 return any_of(Range: ModifierOpNames,
4538 P: [&](AMDGPU::OpName Name) { return hasModifiersSet(MI, OpName: Name); });
4539}
4540
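// For example, V_ADD_F32_e64 can only be shrunk to V_ADD_F32_e32 when src1 is
// a VGPR and no abs/neg/clamp/omod modifiers are set, since the 32-bit VOP2
// encoding cannot express source/output modifiers or a scalar src1.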
4541bool SIInstrInfo::canShrink(const MachineInstr &MI,
4542 const MachineRegisterInfo &MRI) const {
4543 const MachineOperand *Src2 = getNamedOperand(MI, OperandName: AMDGPU::OpName::src2);
4544 // Can't shrink instruction with three operands.
4545 if (Src2) {
4546 switch (MI.getOpcode()) {
4547 default: return false;
4548
4549 case AMDGPU::V_ADDC_U32_e64:
4550 case AMDGPU::V_SUBB_U32_e64:
4551 case AMDGPU::V_SUBBREV_U32_e64: {
4552 const MachineOperand *Src1
4553 = getNamedOperand(MI, OperandName: AMDGPU::OpName::src1);
4554 if (!Src1->isReg() || !RI.isVGPR(MRI, Reg: Src1->getReg()))
4555 return false;
4556 // Additional verification is needed for sdst/src2.
4557 return true;
4558 }
4559 case AMDGPU::V_MAC_F16_e64:
4560 case AMDGPU::V_MAC_F32_e64:
4561 case AMDGPU::V_MAC_LEGACY_F32_e64:
4562 case AMDGPU::V_FMAC_F16_e64:
4563 case AMDGPU::V_FMAC_F16_t16_e64:
4564 case AMDGPU::V_FMAC_F16_fake16_e64:
4565 case AMDGPU::V_FMAC_F32_e64:
4566 case AMDGPU::V_FMAC_F64_e64:
4567 case AMDGPU::V_FMAC_LEGACY_F32_e64:
4568 if (!Src2->isReg() || !RI.isVGPR(MRI, Reg: Src2->getReg()) ||
4569 hasModifiersSet(MI, OpName: AMDGPU::OpName::src2_modifiers))
4570 return false;
4571 break;
4572
4573 case AMDGPU::V_CNDMASK_B32_e64:
4574 break;
4575 }
4576 }
4577
4578 const MachineOperand *Src1 = getNamedOperand(MI, OperandName: AMDGPU::OpName::src1);
4579 if (Src1 && (!Src1->isReg() || !RI.isVGPR(MRI, Reg: Src1->getReg()) ||
4580 hasModifiersSet(MI, OpName: AMDGPU::OpName::src1_modifiers)))
4581 return false;
4582
4583 // We don't need to check src0 since all input types are legal there; just
4584 // make sure src0 isn't using any modifiers.
4585 if (hasModifiersSet(MI, OpName: AMDGPU::OpName::src0_modifiers))
4586 return false;
4587
4588 // Can it be shrunk to a valid 32 bit opcode?
4589 if (!hasVALU32BitEncoding(Opcode: MI.getOpcode()))
4590 return false;
4591
4592 // Check output modifiers
4593 return !hasModifiersSet(MI, OpName: AMDGPU::OpName::omod) &&
4594 !hasModifiersSet(MI, OpName: AMDGPU::OpName::clamp) &&
4595 !hasModifiersSet(MI, OpName: AMDGPU::OpName::byte_sel) &&
4596 // TODO: Can we avoid checking bound_ctrl/fi here?
4597 // They are only used by permlane*_swap special case.
4598 !hasModifiersSet(MI, OpName: AMDGPU::OpName::bound_ctrl) &&
4599 !hasModifiersSet(MI, OpName: AMDGPU::OpName::fi);
4600}
4601
4602 // Copy the undef and kill flags from \p Orig onto \p MI's implicit use of VCC
4603 // (or VCC_LO); the operand itself stays implicit.
4604static void copyFlagsToImplicitVCC(MachineInstr &MI,
4605 const MachineOperand &Orig) {
4606
4607 for (MachineOperand &Use : MI.implicit_operands()) {
4608 if (Use.isUse() &&
4609 (Use.getReg() == AMDGPU::VCC || Use.getReg() == AMDGPU::VCC_LO)) {
4610 Use.setIsUndef(Orig.isUndef());
4611 Use.setIsKill(Orig.isKill());
4612 return;
4613 }
4614 }
4615}
4616
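// Builds the 32-bit encoded form of \p MI. For example, when shrinking
// V_CNDMASK_B32_e64 the explicit src2 carry operand is not copied; the 32-bit
// form reads it through an implicit use of vcc/vcc_lo, so only its flags are
// transferred onto that implicit operand.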
4617MachineInstr *SIInstrInfo::buildShrunkInst(MachineInstr &MI,
4618 unsigned Op32) const {
4619 MachineBasicBlock *MBB = MI.getParent();
4620
4621 const MCInstrDesc &Op32Desc = get(Opcode: Op32);
4622 MachineInstrBuilder Inst32 =
4623 BuildMI(BB&: *MBB, I&: MI, MIMD: MI.getDebugLoc(), MCID: Op32Desc)
4624 .setMIFlags(MI.getFlags());
4625
4626 // Add the dst operand if the 32-bit encoding also has an explicit $vdst.
4627 // For VOPC instructions, this is replaced by an implicit def of vcc.
4628
4629 // We assume the defs of the shrunk opcode are in the same order, and the
4630 // shrunk opcode loses the last def (SGPR def, in the VOP3->VOPC case).
4631 for (int I = 0, E = Op32Desc.getNumDefs(); I != E; ++I)
4632 Inst32.add(MO: MI.getOperand(i: I));
4633
4634 const MachineOperand *Src2 = getNamedOperand(MI, OperandName: AMDGPU::OpName::src2);
4635
4636 int Idx = MI.getNumExplicitDefs();
4637 for (const MachineOperand &Use : MI.explicit_uses()) {
4638 int OpTy = MI.getDesc().operands()[Idx++].OperandType;
4639 if (OpTy == AMDGPU::OPERAND_INPUT_MODS || OpTy == MCOI::OPERAND_IMMEDIATE)
4640 continue;
4641
4642 if (&Use == Src2) {
4643 if (AMDGPU::getNamedOperandIdx(Opcode: Op32, Name: AMDGPU::OpName::src2) == -1) {
4644 // In the case of V_CNDMASK_B32_e32, the explicit operand src2 is
4645 // replaced with an implicit read of vcc or vcc_lo. The implicit read
4646 // of vcc was already added during the initial BuildMI, but we
4647 // 1) may need to change vcc to vcc_lo to preserve the original register
4648 // 2) have to preserve the original flags.
4649 copyFlagsToImplicitVCC(MI&: *Inst32, Orig: *Src2);
4650 continue;
4651 }
4652 }
4653
4654 Inst32.add(MO: Use);
4655 }
4656
4657 // FIXME: Losing implicit operands
4658 fixImplicitOperands(MI&: *Inst32);
4659 return Inst32;
4660}
4661
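// Note: the "constant bus" is the scalar read path shared by SGPR sources,
// literal constants, and implicit scalar reads such as m0 and vcc. The number
// of reads a VALU instruction may perform is subtarget and opcode dependent
// (see GCNSubtarget::getConstantBusLimit()); this helper only classifies a
// single operand as using the bus or not.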
4662bool SIInstrInfo::usesConstantBus(const MachineRegisterInfo &MRI,
4663 const MachineOperand &MO,
4664 const MCOperandInfo &OpInfo) const {
4665 // Literal constants use the constant bus.
4666 if (!MO.isReg())
4667 return !isInlineConstant(MO, OpInfo);
4668
4669 if (!MO.isUse())
4670 return false;
4671
4672 if (MO.getReg().isVirtual())
4673 return RI.isSGPRClass(RC: MRI.getRegClass(Reg: MO.getReg()));
4674
4675 // Null is free
4676 if (MO.getReg() == AMDGPU::SGPR_NULL || MO.getReg() == AMDGPU::SGPR_NULL64)
4677 return false;
4678
4679 // SGPRs use the constant bus
4680 if (MO.isImplicit()) {
4681 return MO.getReg() == AMDGPU::M0 || MO.getReg() == AMDGPU::VCC ||
4682 MO.getReg() == AMDGPU::VCC_LO;
4683 }
4684 return AMDGPU::SReg_32RegClass.contains(Reg: MO.getReg()) ||
4685 AMDGPU::SReg_64RegClass.contains(Reg: MO.getReg());
4686}
4687
4688static Register findImplicitSGPRRead(const MachineInstr &MI) {
4689 for (const MachineOperand &MO : MI.implicit_operands()) {
4690 // We only care about reads.
4691 if (MO.isDef())
4692 continue;
4693
4694 switch (MO.getReg()) {
4695 case AMDGPU::VCC:
4696 case AMDGPU::VCC_LO:
4697 case AMDGPU::VCC_HI:
4698 case AMDGPU::M0:
4699 case AMDGPU::FLAT_SCR:
4700 return MO.getReg();
4701
4702 default:
4703 break;
4704 }
4705 }
4706
4707 return Register();
4708}
4709
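// The verifier expects an implicit exec read on VALU instructions, except for
// the lane access ones (v_readlane/v_writelane and the SGPR<->VGPR spill
// pseudos). SALU, SMRD, and pre-ISel generic opcodes are exempt.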
4710static bool shouldReadExec(const MachineInstr &MI) {
4711 if (SIInstrInfo::isVALU(MI)) {
4712 switch (MI.getOpcode()) {
4713 case AMDGPU::V_READLANE_B32:
4714 case AMDGPU::SI_RESTORE_S32_FROM_VGPR:
4715 case AMDGPU::V_WRITELANE_B32:
4716 case AMDGPU::SI_SPILL_S32_TO_VGPR:
4717 return false;
4718 }
4719
4720 return true;
4721 }
4722
4723 if (MI.isPreISelOpcode() ||
4724 SIInstrInfo::isGenericOpcode(Opc: MI.getOpcode()) ||
4725 SIInstrInfo::isSALU(MI) ||
4726 SIInstrInfo::isSMRD(MI))
4727 return false;
4728
4729 return true;
4730}
4731
4732static bool isRegOrFI(const MachineOperand &MO) {
4733 return MO.isReg() || MO.isFI();
4734}
4735
4736static bool isSubRegOf(const SIRegisterInfo &TRI,
4737 const MachineOperand &SuperVec,
4738 const MachineOperand &SubReg) {
4739 if (SubReg.getReg().isPhysical())
4740 return TRI.isSubRegister(RegA: SuperVec.getReg(), RegB: SubReg.getReg());
4741
4742 return SubReg.getSubReg() != AMDGPU::NoSubRegister &&
4743 SubReg.getReg() == SuperVec.getReg();
4744}
4745
4746 // Verify that a generic COPY opcode does not copy from a vector register to an SGPR.
4747bool SIInstrInfo::verifyCopy(const MachineInstr &MI,
4748 const MachineRegisterInfo &MRI,
4749 StringRef &ErrInfo) const {
4750 Register DstReg = MI.getOperand(i: 0).getReg();
4751 Register SrcReg = MI.getOperand(i: 1).getReg();
4752 // This is a check for copy from vector register to SGPR
4753 if (RI.isVectorRegister(MRI, Reg: SrcReg) && RI.isSGPRReg(MRI, Reg: DstReg)) {
4754 ErrInfo = "illegal copy from vector register to SGPR";
4755 return false;
4756 }
4757 return true;
4758}
4759
4760bool SIInstrInfo::verifyInstruction(const MachineInstr &MI,
4761 StringRef &ErrInfo) const {
4762 uint16_t Opcode = MI.getOpcode();
4763 const MachineFunction *MF = MI.getParent()->getParent();
4764 const MachineRegisterInfo &MRI = MF->getRegInfo();
4765
4766 // FIXME: At this point the COPY check is done only for non-SSA forms. Find a
4767 // better property to recognize the point where instruction selection has just
4768 // finished.
4769 // We can only enforce this check after the SIFixSGPRCopies pass, so that the
4770 // illegal copies have been legalized and no later pass is expected to insert
4771 // similar copies.
4772 if (!MRI.isSSA() && MI.isCopy())
4773 return verifyCopy(MI, MRI, ErrInfo);
4774
4775 if (SIInstrInfo::isGenericOpcode(Opc: Opcode))
4776 return true;
4777
4778 int Src0Idx = AMDGPU::getNamedOperandIdx(Opcode, Name: AMDGPU::OpName::src0);
4779 int Src1Idx = AMDGPU::getNamedOperandIdx(Opcode, Name: AMDGPU::OpName::src1);
4780 int Src2Idx = AMDGPU::getNamedOperandIdx(Opcode, Name: AMDGPU::OpName::src2);
4781 int Src3Idx = -1;
4782 if (Src0Idx == -1) {
4783 // VOPD V_DUAL_* instructions use different operand names.
4784 Src0Idx = AMDGPU::getNamedOperandIdx(Opcode, Name: AMDGPU::OpName::src0X);
4785 Src1Idx = AMDGPU::getNamedOperandIdx(Opcode, Name: AMDGPU::OpName::vsrc1X);
4786 Src2Idx = AMDGPU::getNamedOperandIdx(Opcode, Name: AMDGPU::OpName::src0Y);
4787 Src3Idx = AMDGPU::getNamedOperandIdx(Opcode, Name: AMDGPU::OpName::vsrc1Y);
4788 }
4789
4790 // Make sure the number of operands is correct.
4791 const MCInstrDesc &Desc = get(Opcode);
4792 if (!Desc.isVariadic() &&
4793 Desc.getNumOperands() != MI.getNumExplicitOperands()) {
4794 ErrInfo = "Instruction has wrong number of operands.";
4795 return false;
4796 }
4797
4798 if (MI.isInlineAsm()) {
4799 // Verify register classes for inlineasm constraints.
4800 for (unsigned I = InlineAsm::MIOp_FirstOperand, E = MI.getNumOperands();
4801 I != E; ++I) {
4802 const TargetRegisterClass *RC = MI.getRegClassConstraint(OpIdx: I, TII: this, TRI: &RI);
4803 if (!RC)
4804 continue;
4805
4806 const MachineOperand &Op = MI.getOperand(i: I);
4807 if (!Op.isReg())
4808 continue;
4809
4810 Register Reg = Op.getReg();
4811 if (!Reg.isVirtual() && !RC->contains(Reg)) {
4812 ErrInfo = "inlineasm operand has incorrect register class.";
4813 return false;
4814 }
4815 }
4816
4817 return true;
4818 }
4819
4820 if (isImage(MI) && MI.memoperands_empty() && MI.mayLoadOrStore()) {
4821 ErrInfo = "missing memory operand from image instruction.";
4822 return false;
4823 }
4824
4825 // Make sure the register classes are correct.
4826 for (int i = 0, e = Desc.getNumOperands(); i != e; ++i) {
4827 const MachineOperand &MO = MI.getOperand(i);
4828 if (MO.isFPImm()) {
4829 ErrInfo = "FPImm Machine Operands are not supported. ISel should bitcast "
4830 "all fp values to integers.";
4831 return false;
4832 }
4833
4834 int RegClass = Desc.operands()[i].RegClass;
4835
4836 switch (Desc.operands()[i].OperandType) {
4837 case MCOI::OPERAND_REGISTER:
4838 if (MI.getOperand(i).isImm() || MI.getOperand(i).isGlobal()) {
4839 ErrInfo = "Illegal immediate value for operand.";
4840 return false;
4841 }
4842 break;
4843 case AMDGPU::OPERAND_REG_IMM_INT32:
4844 case AMDGPU::OPERAND_REG_IMM_FP32:
4845 case AMDGPU::OPERAND_REG_IMM_V2FP32:
4846 break;
4847 case AMDGPU::OPERAND_REG_INLINE_C_INT32:
4848 case AMDGPU::OPERAND_REG_INLINE_C_FP32:
4849 case AMDGPU::OPERAND_REG_INLINE_C_INT64:
4850 case AMDGPU::OPERAND_REG_INLINE_C_FP64:
4851 case AMDGPU::OPERAND_REG_INLINE_C_INT16:
4852 case AMDGPU::OPERAND_REG_INLINE_C_FP16:
4853 case AMDGPU::OPERAND_REG_INLINE_AC_INT32:
4854 case AMDGPU::OPERAND_REG_INLINE_AC_FP32:
4855 case AMDGPU::OPERAND_REG_INLINE_AC_FP64: {
4856 if (!MO.isReg() && (!MO.isImm() || !isInlineConstant(MI, OpIdx: i))) {
4857 ErrInfo = "Illegal immediate value for operand.";
4858 return false;
4859 }
4860 break;
4861 }
4862 case AMDGPU::OPERAND_INLINE_SPLIT_BARRIER_INT32:
4863 if (!MI.getOperand(i).isImm() || !isInlineConstant(MI, OpIdx: i)) {
4864 ErrInfo = "Expected inline constant for operand.";
4865 return false;
4866 }
4867 break;
4868 case MCOI::OPERAND_IMMEDIATE:
4869 case AMDGPU::OPERAND_KIMM32:
4870 // Check if this operand is an immediate.
4871 // FrameIndex operands will be replaced by immediates, so they are
4872 // allowed.
4873 if (!MI.getOperand(i).isImm() && !MI.getOperand(i).isFI()) {
4874 ErrInfo = "Expected immediate, but got non-immediate";
4875 return false;
4876 }
4877 [[fallthrough]];
4878 default:
4879 continue;
4880 }
4881
4882 if (!MO.isReg())
4883 continue;
4884 Register Reg = MO.getReg();
4885 if (!Reg)
4886 continue;
4887
4888 // FIXME: Ideally we would have separate instruction definitions with the
4889 // aligned register constraint.
4890 // FIXME: We do not verify inline asm operands, but custom inline asm
4891 // verification is broken anyway
4892 if (ST.needsAlignedVGPRs()) {
4893 const TargetRegisterClass *RC = RI.getRegClassForReg(MRI, Reg);
4894 if (RI.hasVectorRegisters(RC) && MO.getSubReg()) {
4895 if (const TargetRegisterClass *SubRC =
4896 RI.getSubRegisterClass(RC, MO.getSubReg())) {
4897 RC = RI.getCompatibleSubRegClass(SuperRC: RC, SubRC, SubIdx: MO.getSubReg());
4898 if (RC)
4899 RC = SubRC;
4900 }
4901 }
4902
4903 // Check that this is the aligned version of the class.
4904 if (!RC || !RI.isProperlyAlignedRC(RC: *RC)) {
4905 ErrInfo = "Subtarget requires even aligned vector registers";
4906 return false;
4907 }
4908 }
4909
4910 if (RegClass != -1) {
4911 if (Reg.isVirtual())
4912 continue;
4913
4914 const TargetRegisterClass *RC = RI.getRegClass(RCID: RegClass);
4915 if (!RC->contains(Reg)) {
4916 ErrInfo = "Operand has incorrect register class.";
4917 return false;
4918 }
4919 }
4920 }
4921
4922 // Verify SDWA
4923 if (isSDWA(MI)) {
4924 if (!ST.hasSDWA()) {
4925 ErrInfo = "SDWA is not supported on this target";
4926 return false;
4927 }
4928
4929 for (auto Op : {AMDGPU::OpName::src0_sel, AMDGPU::OpName::src1_sel,
4930 AMDGPU::OpName::dst_sel}) {
4931 const MachineOperand *MO = getNamedOperand(MI, OperandName: Op);
4932 if (!MO)
4933 continue;
4934 int64_t Imm = MO->getImm();
4935 if (Imm < 0 || Imm > AMDGPU::SDWA::SdwaSel::DWORD) {
4936 ErrInfo = "Invalid SDWA selection";
4937 return false;
4938 }
4939 }
4940
4941 int DstIdx = AMDGPU::getNamedOperandIdx(Opcode, Name: AMDGPU::OpName::vdst);
4942
4943 for (int OpIdx : {DstIdx, Src0Idx, Src1Idx, Src2Idx}) {
4944 if (OpIdx == -1)
4945 continue;
4946 const MachineOperand &MO = MI.getOperand(i: OpIdx);
4947
4948 if (!ST.hasSDWAScalar()) {
4949 // Only VGPRs on VI
4950 if (!MO.isReg() || !RI.hasVGPRs(RC: RI.getRegClassForReg(MRI, Reg: MO.getReg()))) {
4951 ErrInfo = "Only VGPRs allowed as operands in SDWA instructions on VI";
4952 return false;
4953 }
4954 } else {
4955 // No immediates on GFX9
4956 if (!MO.isReg()) {
4957 ErrInfo =
4958 "Only reg allowed as operands in SDWA instructions on GFX9+";
4959 return false;
4960 }
4961 }
4962 }
4963
4964 if (!ST.hasSDWAOmod()) {
4965 // No omod allowed on VI
4966 const MachineOperand *OMod = getNamedOperand(MI, OperandName: AMDGPU::OpName::omod);
4967 if (OMod != nullptr &&
4968 (!OMod->isImm() || OMod->getImm() != 0)) {
4969 ErrInfo = "OMod not allowed in SDWA instructions on VI";
4970 return false;
4971 }
4972 }
4973
4974 if (Opcode == AMDGPU::V_CVT_F32_FP8_sdwa ||
4975 Opcode == AMDGPU::V_CVT_F32_BF8_sdwa ||
4976 Opcode == AMDGPU::V_CVT_PK_F32_FP8_sdwa ||
4977 Opcode == AMDGPU::V_CVT_PK_F32_BF8_sdwa) {
4978 const MachineOperand *Src0ModsMO =
4979 getNamedOperand(MI, OperandName: AMDGPU::OpName::src0_modifiers);
4980 unsigned Mods = Src0ModsMO->getImm();
4981 if (Mods & SISrcMods::ABS || Mods & SISrcMods::NEG ||
4982 Mods & SISrcMods::SEXT) {
4983 ErrInfo = "sext, abs and neg are not allowed on this instruction";
4984 return false;
4985 }
4986 }
4987
4988 uint16_t BasicOpcode = AMDGPU::getBasicFromSDWAOp(Opcode);
4989 if (isVOPC(Opcode: BasicOpcode)) {
4990 if (!ST.hasSDWASdst() && DstIdx != -1) {
4991 // Only vcc allowed as dst on VI for VOPC
4992 const MachineOperand &Dst = MI.getOperand(i: DstIdx);
4993 if (!Dst.isReg() || Dst.getReg() != AMDGPU::VCC) {
4994 ErrInfo = "Only VCC allowed as dst in SDWA instructions on VI";
4995 return false;
4996 }
4997 } else if (!ST.hasSDWAOutModsVOPC()) {
4998 // No clamp allowed on GFX9 for VOPC
4999 const MachineOperand *Clamp = getNamedOperand(MI, OperandName: AMDGPU::OpName::clamp);
5000 if (Clamp && (!Clamp->isImm() || Clamp->getImm() != 0)) {
5001 ErrInfo = "Clamp not allowed in VOPC SDWA instructions on VI";
5002 return false;
5003 }
5004
5005 // No omod allowed on GFX9 for VOPC
5006 const MachineOperand *OMod = getNamedOperand(MI, OperandName: AMDGPU::OpName::omod);
5007 if (OMod && (!OMod->isImm() || OMod->getImm() != 0)) {
5008 ErrInfo = "OMod not allowed in VOPC SDWA instructions on VI";
5009 return false;
5010 }
5011 }
5012 }
5013
5014 const MachineOperand *DstUnused = getNamedOperand(MI, OperandName: AMDGPU::OpName::dst_unused);
5015 if (DstUnused && DstUnused->isImm() &&
5016 DstUnused->getImm() == AMDGPU::SDWA::UNUSED_PRESERVE) {
5017 const MachineOperand &Dst = MI.getOperand(i: DstIdx);
5018 if (!Dst.isReg() || !Dst.isTied()) {
5019 ErrInfo = "Dst register should have tied register";
5020 return false;
5021 }
5022
5023 const MachineOperand &TiedMO =
5024 MI.getOperand(i: MI.findTiedOperandIdx(OpIdx: DstIdx));
5025 if (!TiedMO.isReg() || !TiedMO.isImplicit() || !TiedMO.isUse()) {
5026 ErrInfo =
5027 "Dst register should be tied to implicit use of preserved register";
5028 return false;
5029 }
5030 if (TiedMO.getReg().isPhysical() && Dst.getReg() != TiedMO.getReg()) {
5031 ErrInfo = "Dst register should use same physical register as preserved";
5032 return false;
5033 }
5034 }
5035 }
5036
5037 // Verify MIMG / VIMAGE / VSAMPLE
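  // For example, an image load with dmask=0b0111 returns three dwords; if tfe
  // or lwe is also set it needs one more, so vdata must be at least a 128-bit
  // register class. With packed d16 the dword count is halved first.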
5038 if (isImage(Opcode) && !MI.mayStore()) {
5039 // Ensure that the return type used is large enough for all the options
5040 // being used. TFE/LWE require an extra result register.
5041 const MachineOperand *DMask = getNamedOperand(MI, OperandName: AMDGPU::OpName::dmask);
5042 if (DMask) {
5043 uint64_t DMaskImm = DMask->getImm();
5044 uint32_t RegCount = isGather4(Opcode) ? 4 : llvm::popcount(Value: DMaskImm);
5045 const MachineOperand *TFE = getNamedOperand(MI, OperandName: AMDGPU::OpName::tfe);
5046 const MachineOperand *LWE = getNamedOperand(MI, OperandName: AMDGPU::OpName::lwe);
5047 const MachineOperand *D16 = getNamedOperand(MI, OperandName: AMDGPU::OpName::d16);
5048
5049 // Adjust for packed 16 bit values
5050 if (D16 && D16->getImm() && !ST.hasUnpackedD16VMem())
5051 RegCount = divideCeil(Numerator: RegCount, Denominator: 2);
5052
5053 // Adjust if using LWE or TFE
5054 if ((LWE && LWE->getImm()) || (TFE && TFE->getImm()))
5055 RegCount += 1;
5056
5057 const uint32_t DstIdx =
5058 AMDGPU::getNamedOperandIdx(Opcode, Name: AMDGPU::OpName::vdata);
5059 const MachineOperand &Dst = MI.getOperand(i: DstIdx);
5060 if (Dst.isReg()) {
5061 const TargetRegisterClass *DstRC = getOpRegClass(MI, OpNo: DstIdx);
5062 uint32_t DstSize = RI.getRegSizeInBits(RC: *DstRC) / 32;
5063 if (RegCount > DstSize) {
5064 ErrInfo = "Image instruction returns too many registers for dst "
5065 "register class";
5066 return false;
5067 }
5068 }
5069 }
5070 }
5071
5072 // Verify VOP*. Ignore multiple sgpr operands on writelane.
5073 if (isVALU(MI) && Desc.getOpcode() != AMDGPU::V_WRITELANE_B32) {
5074 unsigned ConstantBusCount = 0;
5075 bool UsesLiteral = false;
5076 const MachineOperand *LiteralVal = nullptr;
5077
5078 int ImmIdx = AMDGPU::getNamedOperandIdx(Opcode, Name: AMDGPU::OpName::imm);
5079 if (ImmIdx != -1) {
5080 ++ConstantBusCount;
5081 UsesLiteral = true;
5082 LiteralVal = &MI.getOperand(i: ImmIdx);
5083 }
5084
5085 SmallVector<Register, 2> SGPRsUsed;
5086 Register SGPRUsed;
5087
5088 // Only look at the true operands. Only a real operand can use the constant
5089 // bus, and we don't want to check pseudo-operands like the source modifier
5090 // flags.
5091 for (int OpIdx : {Src0Idx, Src1Idx, Src2Idx, Src3Idx}) {
5092 if (OpIdx == -1)
5093 continue;
5094 const MachineOperand &MO = MI.getOperand(i: OpIdx);
5095 if (usesConstantBus(MRI, MO, OpInfo: MI.getDesc().operands()[OpIdx])) {
5096 if (MO.isReg()) {
5097 SGPRUsed = MO.getReg();
5098 if (!llvm::is_contained(Range&: SGPRsUsed, Element: SGPRUsed)) {
5099 ++ConstantBusCount;
5100 SGPRsUsed.push_back(Elt: SGPRUsed);
5101 }
5102 } else if (!MO.isFI()) { // Treat FI like a register.
5103 if (!UsesLiteral) {
5104 ++ConstantBusCount;
5105 UsesLiteral = true;
5106 LiteralVal = &MO;
5107 } else if (!MO.isIdenticalTo(Other: *LiteralVal)) {
5108 assert(isVOP2(MI) || isVOP3(MI));
5109 ErrInfo = "VOP2/VOP3 instruction uses more than one literal";
5110 return false;
5111 }
5112 }
5113 }
5114 }
5115
5116 SGPRUsed = findImplicitSGPRRead(MI);
5117 if (SGPRUsed) {
5118 // Implicit uses may safely overlap true operands
5119 if (llvm::all_of(Range&: SGPRsUsed, P: [this, SGPRUsed](unsigned SGPR) {
5120 return !RI.regsOverlap(RegA: SGPRUsed, RegB: SGPR);
5121 })) {
5122 ++ConstantBusCount;
5123 SGPRsUsed.push_back(Elt: SGPRUsed);
5124 }
5125 }
5126
5127 // v_writelane_b32 is an exception to the constant bus restriction: vsrc0 may
5128 // be an SGPR, a constant or m0; the lane select an SGPR, m0 or inline constant.
5129 if (ConstantBusCount > ST.getConstantBusLimit(Opcode) &&
5130 Opcode != AMDGPU::V_WRITELANE_B32) {
5131 ErrInfo = "VOP* instruction violates constant bus restriction";
5132 return false;
5133 }
5134
5135 if (isVOP3(MI) && UsesLiteral && !ST.hasVOP3Literal()) {
5136 ErrInfo = "VOP3 instruction uses literal";
5137 return false;
5138 }
5139 }
5140
5141 // Special case for writelane: it is exempt from the generic constant bus check
5142 // above, but it still can't use more than one SGPR register.
5143 if (Desc.getOpcode() == AMDGPU::V_WRITELANE_B32) {
5144 unsigned SGPRCount = 0;
5145 Register SGPRUsed;
5146
5147 for (int OpIdx : {Src0Idx, Src1Idx}) {
5148 if (OpIdx == -1)
5149 break;
5150
5151 const MachineOperand &MO = MI.getOperand(i: OpIdx);
5152
5153 if (usesConstantBus(MRI, MO, OpInfo: MI.getDesc().operands()[OpIdx])) {
5154 if (MO.isReg() && MO.getReg() != AMDGPU::M0) {
5155 if (MO.getReg() != SGPRUsed)
5156 ++SGPRCount;
5157 SGPRUsed = MO.getReg();
5158 }
5159 }
5160 if (SGPRCount > ST.getConstantBusLimit(Opcode)) {
5161 ErrInfo = "WRITELANE instruction violates constant bus restriction";
5162 return false;
5163 }
5164 }
5165 }
5166
5167 // Verify misc. restrictions on specific instructions.
5168 if (Desc.getOpcode() == AMDGPU::V_DIV_SCALE_F32_e64 ||
5169 Desc.getOpcode() == AMDGPU::V_DIV_SCALE_F64_e64) {
5170 const MachineOperand &Src0 = MI.getOperand(i: Src0Idx);
5171 const MachineOperand &Src1 = MI.getOperand(i: Src1Idx);
5172 const MachineOperand &Src2 = MI.getOperand(i: Src2Idx);
5173 if (Src0.isReg() && Src1.isReg() && Src2.isReg()) {
5174 if (!compareMachineOp(Op0: Src0, Op1: Src1) &&
5175 !compareMachineOp(Op0: Src0, Op1: Src2)) {
5176 ErrInfo = "v_div_scale_{f32|f64} require src0 = src1 or src2";
5177 return false;
5178 }
5179 }
5180 if ((getNamedOperand(MI, OperandName: AMDGPU::OpName::src0_modifiers)->getImm() &
5181 SISrcMods::ABS) ||
5182 (getNamedOperand(MI, OperandName: AMDGPU::OpName::src1_modifiers)->getImm() &
5183 SISrcMods::ABS) ||
5184 (getNamedOperand(MI, OperandName: AMDGPU::OpName::src2_modifiers)->getImm() &
5185 SISrcMods::ABS)) {
5186 ErrInfo = "ABS not allowed in VOP3B instructions";
5187 return false;
5188 }
5189 }
5190
5191 if (isSOP2(MI) || isSOPC(MI)) {
5192 const MachineOperand &Src0 = MI.getOperand(i: Src0Idx);
5193 const MachineOperand &Src1 = MI.getOperand(i: Src1Idx);
5194
5195 if (!isRegOrFI(MO: Src0) && !isRegOrFI(MO: Src1) &&
5196 !isInlineConstant(MO: Src0, OpInfo: Desc.operands()[Src0Idx]) &&
5197 !isInlineConstant(MO: Src1, OpInfo: Desc.operands()[Src1Idx]) &&
5198 !Src0.isIdenticalTo(Other: Src1)) {
5199 ErrInfo = "SOP2/SOPC instruction requires too many immediate constants";
5200 return false;
5201 }
5202 }
5203
5204 if (isSOPK(MI)) {
5205 const auto *Op = getNamedOperand(MI, OperandName: AMDGPU::OpName::simm16);
5206 if (Desc.isBranch()) {
5207 if (!Op->isMBB()) {
5208 ErrInfo = "invalid branch target for SOPK instruction";
5209 return false;
5210 }
5211 } else {
5212 uint64_t Imm = Op->getImm();
5213 if (sopkIsZext(Opcode)) {
5214 if (!isUInt<16>(x: Imm)) {
5215 ErrInfo = "invalid immediate for SOPK instruction";
5216 return false;
5217 }
5218 } else {
5219 if (!isInt<16>(x: Imm)) {
5220 ErrInfo = "invalid immediate for SOPK instruction";
5221 return false;
5222 }
5223 }
5224 }
5225 }
5226
5227 if (Desc.getOpcode() == AMDGPU::V_MOVRELS_B32_e32 ||
5228 Desc.getOpcode() == AMDGPU::V_MOVRELS_B32_e64 ||
5229 Desc.getOpcode() == AMDGPU::V_MOVRELD_B32_e32 ||
5230 Desc.getOpcode() == AMDGPU::V_MOVRELD_B32_e64) {
5231 const bool IsDst = Desc.getOpcode() == AMDGPU::V_MOVRELD_B32_e32 ||
5232 Desc.getOpcode() == AMDGPU::V_MOVRELD_B32_e64;
5233
5234 const unsigned StaticNumOps =
5235 Desc.getNumOperands() + Desc.implicit_uses().size();
5236 const unsigned NumImplicitOps = IsDst ? 2 : 1;
5237
5238 // Allow additional implicit operands. This allows a fixup done by the post
5239 // RA scheduler where the main implicit operand is killed and implicit-defs
5240 // are added for sub-registers that remain live after this instruction.
5241 if (MI.getNumOperands() < StaticNumOps + NumImplicitOps) {
5242 ErrInfo = "missing implicit register operands";
5243 return false;
5244 }
5245
5246 const MachineOperand *Dst = getNamedOperand(MI, OperandName: AMDGPU::OpName::vdst);
5247 if (IsDst) {
5248 if (!Dst->isUse()) {
5249 ErrInfo = "v_movreld_b32 vdst should be a use operand";
5250 return false;
5251 }
5252
5253 unsigned UseOpIdx;
5254 if (!MI.isRegTiedToUseOperand(DefOpIdx: StaticNumOps, UseOpIdx: &UseOpIdx) ||
5255 UseOpIdx != StaticNumOps + 1) {
5256 ErrInfo = "movrel implicit operands should be tied";
5257 return false;
5258 }
5259 }
5260
5261 const MachineOperand &Src0 = MI.getOperand(i: Src0Idx);
5262 const MachineOperand &ImpUse
5263 = MI.getOperand(i: StaticNumOps + NumImplicitOps - 1);
5264 if (!ImpUse.isReg() || !ImpUse.isUse() ||
5265 !isSubRegOf(TRI: RI, SuperVec: ImpUse, SubReg: IsDst ? *Dst : Src0)) {
5266 ErrInfo = "src0 should be subreg of implicit vector use";
5267 return false;
5268 }
5269 }
5270
5271 // Make sure we aren't losing exec uses in the td files. This mostly requires
5272 // being careful with 'let Uses' when adding other use registers.
5273 if (shouldReadExec(MI)) {
5274 if (!MI.hasRegisterImplicitUseOperand(Reg: AMDGPU::EXEC)) {
5275 ErrInfo = "VALU instruction does not implicitly read exec mask";
5276 return false;
5277 }
5278 }
5279
5280 if (isSMRD(MI)) {
5281 if (MI.mayStore() &&
5282 ST.getGeneration() == AMDGPUSubtarget::VOLCANIC_ISLANDS) {
5283 // The register offset form of scalar stores may only use m0 as the
5284 // soffset register.
5285 const MachineOperand *Soff = getNamedOperand(MI, OperandName: AMDGPU::OpName::soffset);
5286 if (Soff && Soff->getReg() != AMDGPU::M0) {
5287 ErrInfo = "scalar stores must use m0 as offset register";
5288 return false;
5289 }
5290 }
5291 }
5292
5293 if (isFLAT(MI) && !ST.hasFlatInstOffsets()) {
5294 const MachineOperand *Offset = getNamedOperand(MI, OperandName: AMDGPU::OpName::offset);
5295 if (Offset->getImm() != 0) {
5296 ErrInfo = "subtarget does not support offsets in flat instructions";
5297 return false;
5298 }
5299 }
5300
5301 if (isDS(MI) && !ST.hasGDS()) {
5302 const MachineOperand *GDSOp = getNamedOperand(MI, OperandName: AMDGPU::OpName::gds);
5303 if (GDSOp && GDSOp->getImm() != 0) {
5304 ErrInfo = "GDS is not supported on this subtarget";
5305 return false;
5306 }
5307 }
5308
5309 if (isImage(MI)) {
5310 const MachineOperand *DimOp = getNamedOperand(MI, OperandName: AMDGPU::OpName::dim);
5311 if (DimOp) {
5312 int VAddr0Idx = AMDGPU::getNamedOperandIdx(Opcode,
5313 Name: AMDGPU::OpName::vaddr0);
5314 AMDGPU::OpName RSrcOpName =
5315 isMIMG(MI) ? AMDGPU::OpName::srsrc : AMDGPU::OpName::rsrc;
5316 int RsrcIdx = AMDGPU::getNamedOperandIdx(Opcode, Name: RSrcOpName);
5317 const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(Opc: Opcode);
5318 const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
5319 AMDGPU::getMIMGBaseOpcodeInfo(BaseOpcode: Info->BaseOpcode);
5320 const AMDGPU::MIMGDimInfo *Dim =
5321 AMDGPU::getMIMGDimInfoByEncoding(DimEnc: DimOp->getImm());
5322
5323 if (!Dim) {
5324 ErrInfo = "dim is out of range";
5325 return false;
5326 }
5327
5328 bool IsA16 = false;
5329 if (ST.hasR128A16()) {
5330 const MachineOperand *R128A16 = getNamedOperand(MI, OperandName: AMDGPU::OpName::r128);
5331 IsA16 = R128A16->getImm() != 0;
5332 } else if (ST.hasA16()) {
5333 const MachineOperand *A16 = getNamedOperand(MI, OperandName: AMDGPU::OpName::a16);
5334 IsA16 = A16->getImm() != 0;
5335 }
5336
5337 bool IsNSA = RsrcIdx - VAddr0Idx > 1;
5338
5339 unsigned AddrWords =
5340 AMDGPU::getAddrSizeMIMGOp(BaseOpcode, Dim, IsA16, IsG16Supported: ST.hasG16());
5341
5342 unsigned VAddrWords;
5343 if (IsNSA) {
5344 VAddrWords = RsrcIdx - VAddr0Idx;
5345 if (ST.hasPartialNSAEncoding() &&
5346 AddrWords > ST.getNSAMaxSize(HasSampler: isVSAMPLE(MI))) {
5347 unsigned LastVAddrIdx = RsrcIdx - 1;
5348 VAddrWords += getOpSize(MI, OpNo: LastVAddrIdx) / 4 - 1;
5349 }
5350 } else {
5351 VAddrWords = getOpSize(MI, OpNo: VAddr0Idx) / 4;
5352 if (AddrWords > 12)
5353 AddrWords = 16;
5354 }
5355
5356 if (VAddrWords != AddrWords) {
5357 LLVM_DEBUG(dbgs() << "bad vaddr size, expected " << AddrWords
5358 << " but got " << VAddrWords << "\n");
5359 ErrInfo = "bad vaddr size";
5360 return false;
5361 }
5362 }
5363 }
5364
5365 const MachineOperand *DppCt = getNamedOperand(MI, OperandName: AMDGPU::OpName::dpp_ctrl);
5366 if (DppCt) {
5367 using namespace AMDGPU::DPP;
5368
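  // dpp_ctrl is an immediate with several disjoint valid ranges (quad_perm,
  // row shifts/rotates, wavefront ops, broadcasts, row_share/row_xmask).
  // Values in the reserved gaps or beyond DPP_LAST are rejected here, and some
  // ranges are further restricted per generation below.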
5369 unsigned DC = DppCt->getImm();
5370 if (DC == DppCtrl::DPP_UNUSED1 || DC == DppCtrl::DPP_UNUSED2 ||
5371 DC == DppCtrl::DPP_UNUSED3 || DC > DppCtrl::DPP_LAST ||
5372 (DC >= DppCtrl::DPP_UNUSED4_FIRST && DC <= DppCtrl::DPP_UNUSED4_LAST) ||
5373 (DC >= DppCtrl::DPP_UNUSED5_FIRST && DC <= DppCtrl::DPP_UNUSED5_LAST) ||
5374 (DC >= DppCtrl::DPP_UNUSED6_FIRST && DC <= DppCtrl::DPP_UNUSED6_LAST) ||
5375 (DC >= DppCtrl::DPP_UNUSED7_FIRST && DC <= DppCtrl::DPP_UNUSED7_LAST) ||
5376 (DC >= DppCtrl::DPP_UNUSED8_FIRST && DC <= DppCtrl::DPP_UNUSED8_LAST)) {
5377 ErrInfo = "Invalid dpp_ctrl value";
5378 return false;
5379 }
5380 if (DC >= DppCtrl::WAVE_SHL1 && DC <= DppCtrl::WAVE_ROR1 &&
5381 ST.getGeneration() >= AMDGPUSubtarget::GFX10) {
5382 ErrInfo = "Invalid dpp_ctrl value: "
5383 "wavefront shifts are not supported on GFX10+";
5384 return false;
5385 }
5386 if (DC >= DppCtrl::BCAST15 && DC <= DppCtrl::BCAST31 &&
5387 ST.getGeneration() >= AMDGPUSubtarget::GFX10) {
5388 ErrInfo = "Invalid dpp_ctrl value: "
5389 "broadcasts are not supported on GFX10+";
5390 return false;
5391 }
5392 if (DC >= DppCtrl::ROW_SHARE_FIRST && DC <= DppCtrl::ROW_XMASK_LAST &&
5393 ST.getGeneration() < AMDGPUSubtarget::GFX10) {
5394 if (DC >= DppCtrl::ROW_NEWBCAST_FIRST &&
5395 DC <= DppCtrl::ROW_NEWBCAST_LAST &&
5396 !ST.hasGFX90AInsts()) {
5397 ErrInfo = "Invalid dpp_ctrl value: "
5398 "row_newbroadcast/row_share is not supported before "
5399 "GFX90A/GFX10";
5400 return false;
5401 }
5402 if (DC > DppCtrl::ROW_NEWBCAST_LAST || !ST.hasGFX90AInsts()) {
5403 ErrInfo = "Invalid dpp_ctrl value: "
5404 "row_share and row_xmask are not supported before GFX10";
5405 return false;
5406 }
5407 }
5408
5409 if (Opcode != AMDGPU::V_MOV_B64_DPP_PSEUDO &&
5410 !AMDGPU::isLegalDPALU_DPPControl(DC) && AMDGPU::isDPALU_DPP(OpDesc: Desc)) {
5411 ErrInfo = "Invalid dpp_ctrl value: "
5412 "DP ALU dpp only support row_newbcast";
5413 return false;
5414 }
5415 }
5416
5417 if ((MI.mayStore() || MI.mayLoad()) && !isVGPRSpill(MI)) {
5418 const MachineOperand *Dst = getNamedOperand(MI, OperandName: AMDGPU::OpName::vdst);
5419 AMDGPU::OpName DataName =
5420 isDS(Opcode) ? AMDGPU::OpName::data0 : AMDGPU::OpName::vdata;
5421 const MachineOperand *Data = getNamedOperand(MI, OperandName: DataName);
5422 const MachineOperand *Data2 = getNamedOperand(MI, OperandName: AMDGPU::OpName::data1);
5423 if (Data && !Data->isReg())
5424 Data = nullptr;
5425
5426 if (ST.hasGFX90AInsts()) {
5427 if (Dst && Data &&
5428 (RI.isAGPR(MRI, Reg: Dst->getReg()) != RI.isAGPR(MRI, Reg: Data->getReg()))) {
5429 ErrInfo = "Invalid register class: "
5430 "vdata and vdst should be both VGPR or AGPR";
5431 return false;
5432 }
5433 if (Data && Data2 &&
5434 (RI.isAGPR(MRI, Reg: Data->getReg()) != RI.isAGPR(MRI, Reg: Data2->getReg()))) {
5435 ErrInfo = "Invalid register class: "
5436 "both data operands should be VGPR or AGPR";
5437 return false;
5438 }
5439 } else {
5440 if ((Dst && RI.isAGPR(MRI, Reg: Dst->getReg())) ||
5441 (Data && RI.isAGPR(MRI, Reg: Data->getReg())) ||
5442 (Data2 && RI.isAGPR(MRI, Reg: Data2->getReg()))) {
5443 ErrInfo = "Invalid register class: "
5444 "agpr loads and stores not supported on this GPU";
5445 return false;
5446 }
5447 }
5448 }
5449
5450 if (ST.needsAlignedVGPRs()) {
5451 const auto isAlignedReg = [&MI, &MRI, this](AMDGPU::OpName OpName) -> bool {
5452 const MachineOperand *Op = getNamedOperand(MI, OperandName: OpName);
5453 if (!Op)
5454 return true;
5455 Register Reg = Op->getReg();
5456 if (Reg.isPhysical())
5457 return !(RI.getHWRegIndex(Reg) & 1);
5458 const TargetRegisterClass &RC = *MRI.getRegClass(Reg);
5459 return RI.getRegSizeInBits(RC) > 32 && RI.isProperlyAlignedRC(RC) &&
5460 !(RI.getChannelFromSubReg(SubReg: Op->getSubReg()) & 1);
5461 };
5462
5463 if (Opcode == AMDGPU::DS_GWS_INIT || Opcode == AMDGPU::DS_GWS_SEMA_BR ||
5464 Opcode == AMDGPU::DS_GWS_BARRIER) {
5465
5466 if (!isAlignedReg(AMDGPU::OpName::data0)) {
5467 ErrInfo = "Subtarget requires even aligned vector registers "
5468 "for DS_GWS instructions";
5469 return false;
5470 }
5471 }
5472
5473 if (isMIMG(MI)) {
5474 if (!isAlignedReg(AMDGPU::OpName::vaddr)) {
5475 ErrInfo = "Subtarget requires even aligned vector registers "
5476 "for vaddr operand of image instructions";
5477 return false;
5478 }
5479 }
5480 }
5481
5482 if (Opcode == AMDGPU::V_ACCVGPR_WRITE_B32_e64 && !ST.hasGFX90AInsts()) {
5483 const MachineOperand *Src = getNamedOperand(MI, OperandName: AMDGPU::OpName::src0);
5484 if (Src->isReg() && RI.isSGPRReg(MRI, Reg: Src->getReg())) {
5485 ErrInfo = "Invalid register class: "
5486 "v_accvgpr_write with an SGPR is not supported on this GPU";
5487 return false;
5488 }
5489 }
5490
5491 if (Desc.getOpcode() == AMDGPU::G_AMDGPU_WAVE_ADDRESS) {
5492 const MachineOperand &SrcOp = MI.getOperand(i: 1);
5493 if (!SrcOp.isReg() || SrcOp.getReg().isVirtual()) {
5494 ErrInfo = "pseudo expects only physical SGPRs";
5495 return false;
5496 }
5497 }
5498
5499 return true;
5500}
5501
5502// It is more readable to list mapped opcodes on the same line.
5503// clang-format off
5504
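// Returns the VALU opcode to use when moving a scalar instruction to the
// VALU, e.g. S_AND_B32 -> V_AND_B32_e64, or S_ADD_I32 -> V_ADD_U32_e64 on
// subtargets with add-no-carry. INSTRUCTION_LIST_END means there is no direct
// equivalent.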
5505unsigned SIInstrInfo::getVALUOp(const MachineInstr &MI) const {
5506 switch (MI.getOpcode()) {
5507 default: return AMDGPU::INSTRUCTION_LIST_END;
5508 case AMDGPU::REG_SEQUENCE: return AMDGPU::REG_SEQUENCE;
5509 case AMDGPU::COPY: return AMDGPU::COPY;
5510 case AMDGPU::PHI: return AMDGPU::PHI;
5511 case AMDGPU::INSERT_SUBREG: return AMDGPU::INSERT_SUBREG;
5512 case AMDGPU::WQM: return AMDGPU::WQM;
5513 case AMDGPU::SOFT_WQM: return AMDGPU::SOFT_WQM;
5514 case AMDGPU::STRICT_WWM: return AMDGPU::STRICT_WWM;
5515 case AMDGPU::STRICT_WQM: return AMDGPU::STRICT_WQM;
5516 case AMDGPU::S_MOV_B32: {
5517 const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
5518 return MI.getOperand(i: 1).isReg() ||
5519 RI.isAGPR(MRI, Reg: MI.getOperand(i: 0).getReg()) ?
5520 AMDGPU::COPY : AMDGPU::V_MOV_B32_e32;
5521 }
5522 case AMDGPU::S_ADD_I32:
5523 return ST.hasAddNoCarry() ? AMDGPU::V_ADD_U32_e64 : AMDGPU::V_ADD_CO_U32_e32;
5524 case AMDGPU::S_ADDC_U32:
5525 return AMDGPU::V_ADDC_U32_e32;
5526 case AMDGPU::S_SUB_I32:
5527 return ST.hasAddNoCarry() ? AMDGPU::V_SUB_U32_e64 : AMDGPU::V_SUB_CO_U32_e32;
5528 // FIXME: These are not consistently handled, and selected when the carry is
5529 // used.
5530 case AMDGPU::S_ADD_U32:
5531 return AMDGPU::V_ADD_CO_U32_e32;
5532 case AMDGPU::S_SUB_U32:
5533 return AMDGPU::V_SUB_CO_U32_e32;
5534 case AMDGPU::S_ADD_U64_PSEUDO:
5535 return AMDGPU::V_ADD_U64_PSEUDO;
5536 case AMDGPU::S_SUB_U64_PSEUDO:
5537 return AMDGPU::V_SUB_U64_PSEUDO;
5538 case AMDGPU::S_SUBB_U32: return AMDGPU::V_SUBB_U32_e32;
5539 case AMDGPU::S_MUL_I32: return AMDGPU::V_MUL_LO_U32_e64;
5540 case AMDGPU::S_MUL_HI_U32: return AMDGPU::V_MUL_HI_U32_e64;
5541 case AMDGPU::S_MUL_HI_I32: return AMDGPU::V_MUL_HI_I32_e64;
5542 case AMDGPU::S_AND_B32: return AMDGPU::V_AND_B32_e64;
5543 case AMDGPU::S_OR_B32: return AMDGPU::V_OR_B32_e64;
5544 case AMDGPU::S_XOR_B32: return AMDGPU::V_XOR_B32_e64;
5545 case AMDGPU::S_XNOR_B32:
5546 return ST.hasDLInsts() ? AMDGPU::V_XNOR_B32_e64 : AMDGPU::INSTRUCTION_LIST_END;
5547 case AMDGPU::S_MIN_I32: return AMDGPU::V_MIN_I32_e64;
5548 case AMDGPU::S_MIN_U32: return AMDGPU::V_MIN_U32_e64;
5549 case AMDGPU::S_MAX_I32: return AMDGPU::V_MAX_I32_e64;
5550 case AMDGPU::S_MAX_U32: return AMDGPU::V_MAX_U32_e64;
5551 case AMDGPU::S_ASHR_I32: return AMDGPU::V_ASHR_I32_e32;
5552 case AMDGPU::S_ASHR_I64: return AMDGPU::V_ASHR_I64_e64;
5553 case AMDGPU::S_LSHL_B32: return AMDGPU::V_LSHL_B32_e32;
5554 case AMDGPU::S_LSHL_B64: return AMDGPU::V_LSHL_B64_e64;
5555 case AMDGPU::S_LSHR_B32: return AMDGPU::V_LSHR_B32_e32;
5556 case AMDGPU::S_LSHR_B64: return AMDGPU::V_LSHR_B64_e64;
5557 case AMDGPU::S_SEXT_I32_I8: return AMDGPU::V_BFE_I32_e64;
5558 case AMDGPU::S_SEXT_I32_I16: return AMDGPU::V_BFE_I32_e64;
5559 case AMDGPU::S_BFE_U32: return AMDGPU::V_BFE_U32_e64;
5560 case AMDGPU::S_BFE_I32: return AMDGPU::V_BFE_I32_e64;
5561 case AMDGPU::S_BFM_B32: return AMDGPU::V_BFM_B32_e64;
5562 case AMDGPU::S_BREV_B32: return AMDGPU::V_BFREV_B32_e32;
5563 case AMDGPU::S_NOT_B32: return AMDGPU::V_NOT_B32_e32;
5564 case AMDGPU::S_NOT_B64: return AMDGPU::V_NOT_B32_e32;
5565 case AMDGPU::S_CMP_EQ_I32: return AMDGPU::V_CMP_EQ_I32_e64;
5566 case AMDGPU::S_CMP_LG_I32: return AMDGPU::V_CMP_NE_I32_e64;
5567 case AMDGPU::S_CMP_GT_I32: return AMDGPU::V_CMP_GT_I32_e64;
5568 case AMDGPU::S_CMP_GE_I32: return AMDGPU::V_CMP_GE_I32_e64;
5569 case AMDGPU::S_CMP_LT_I32: return AMDGPU::V_CMP_LT_I32_e64;
5570 case AMDGPU::S_CMP_LE_I32: return AMDGPU::V_CMP_LE_I32_e64;
5571 case AMDGPU::S_CMP_EQ_U32: return AMDGPU::V_CMP_EQ_U32_e64;
5572 case AMDGPU::S_CMP_LG_U32: return AMDGPU::V_CMP_NE_U32_e64;
5573 case AMDGPU::S_CMP_GT_U32: return AMDGPU::V_CMP_GT_U32_e64;
5574 case AMDGPU::S_CMP_GE_U32: return AMDGPU::V_CMP_GE_U32_e64;
5575 case AMDGPU::S_CMP_LT_U32: return AMDGPU::V_CMP_LT_U32_e64;
5576 case AMDGPU::S_CMP_LE_U32: return AMDGPU::V_CMP_LE_U32_e64;
5577 case AMDGPU::S_CMP_EQ_U64: return AMDGPU::V_CMP_EQ_U64_e64;
5578 case AMDGPU::S_CMP_LG_U64: return AMDGPU::V_CMP_NE_U64_e64;
5579 case AMDGPU::S_BCNT1_I32_B32: return AMDGPU::V_BCNT_U32_B32_e64;
5580 case AMDGPU::S_FF1_I32_B32: return AMDGPU::V_FFBL_B32_e32;
5581 case AMDGPU::S_FLBIT_I32_B32: return AMDGPU::V_FFBH_U32_e32;
5582 case AMDGPU::S_FLBIT_I32: return AMDGPU::V_FFBH_I32_e64;
5583 case AMDGPU::S_CBRANCH_SCC0: return AMDGPU::S_CBRANCH_VCCZ;
5584 case AMDGPU::S_CBRANCH_SCC1: return AMDGPU::S_CBRANCH_VCCNZ;
5585 case AMDGPU::S_CVT_F32_I32: return AMDGPU::V_CVT_F32_I32_e64;
5586 case AMDGPU::S_CVT_F32_U32: return AMDGPU::V_CVT_F32_U32_e64;
5587 case AMDGPU::S_CVT_I32_F32: return AMDGPU::V_CVT_I32_F32_e64;
5588 case AMDGPU::S_CVT_U32_F32: return AMDGPU::V_CVT_U32_F32_e64;
5589 case AMDGPU::S_CVT_F32_F16:
5590 case AMDGPU::S_CVT_HI_F32_F16:
5591 return ST.useRealTrue16Insts() ? AMDGPU::V_CVT_F32_F16_t16_e64
5592 : AMDGPU::V_CVT_F32_F16_fake16_e64;
5593 case AMDGPU::S_CVT_F16_F32:
5594 return ST.useRealTrue16Insts() ? AMDGPU::V_CVT_F16_F32_t16_e64
5595 : AMDGPU::V_CVT_F16_F32_fake16_e64;
5596 case AMDGPU::S_CEIL_F32: return AMDGPU::V_CEIL_F32_e64;
5597 case AMDGPU::S_FLOOR_F32: return AMDGPU::V_FLOOR_F32_e64;
5598 case AMDGPU::S_TRUNC_F32: return AMDGPU::V_TRUNC_F32_e64;
5599 case AMDGPU::S_RNDNE_F32: return AMDGPU::V_RNDNE_F32_e64;
5600 case AMDGPU::S_CEIL_F16:
5601 return ST.useRealTrue16Insts() ? AMDGPU::V_CEIL_F16_t16_e64
5602 : AMDGPU::V_CEIL_F16_fake16_e64;
5603 case AMDGPU::S_FLOOR_F16:
5604 return ST.useRealTrue16Insts() ? AMDGPU::V_FLOOR_F16_t16_e64
5605 : AMDGPU::V_FLOOR_F16_fake16_e64;
5606 case AMDGPU::S_TRUNC_F16:
5607 return ST.useRealTrue16Insts() ? AMDGPU::V_TRUNC_F16_t16_e64
5608 : AMDGPU::V_TRUNC_F16_fake16_e64;
5609 case AMDGPU::S_RNDNE_F16:
5610 return ST.useRealTrue16Insts() ? AMDGPU::V_RNDNE_F16_t16_e64
5611 : AMDGPU::V_RNDNE_F16_fake16_e64;
5612 case AMDGPU::S_ADD_F32: return AMDGPU::V_ADD_F32_e64;
5613 case AMDGPU::S_SUB_F32: return AMDGPU::V_SUB_F32_e64;
5614 case AMDGPU::S_MIN_F32: return AMDGPU::V_MIN_F32_e64;
5615 case AMDGPU::S_MAX_F32: return AMDGPU::V_MAX_F32_e64;
5616 case AMDGPU::S_MINIMUM_F32: return AMDGPU::V_MINIMUM_F32_e64;
5617 case AMDGPU::S_MAXIMUM_F32: return AMDGPU::V_MAXIMUM_F32_e64;
5618 case AMDGPU::S_MUL_F32: return AMDGPU::V_MUL_F32_e64;
5619 case AMDGPU::S_ADD_F16:
5620 return ST.useRealTrue16Insts() ? AMDGPU::V_ADD_F16_t16_e64
5621 : AMDGPU::V_ADD_F16_fake16_e64;
5622 case AMDGPU::S_SUB_F16:
5623 return ST.useRealTrue16Insts() ? AMDGPU::V_SUB_F16_t16_e64
5624 : AMDGPU::V_SUB_F16_fake16_e64;
5625 case AMDGPU::S_MIN_F16:
5626 return ST.useRealTrue16Insts() ? AMDGPU::V_MIN_F16_t16_e64
5627 : AMDGPU::V_MIN_F16_fake16_e64;
5628 case AMDGPU::S_MAX_F16:
5629 return ST.useRealTrue16Insts() ? AMDGPU::V_MAX_F16_t16_e64
5630 : AMDGPU::V_MAX_F16_fake16_e64;
5631 case AMDGPU::S_MINIMUM_F16:
5632 return ST.useRealTrue16Insts() ? AMDGPU::V_MINIMUM_F16_t16_e64
5633 : AMDGPU::V_MINIMUM_F16_fake16_e64;
5634 case AMDGPU::S_MAXIMUM_F16:
5635 return ST.useRealTrue16Insts() ? AMDGPU::V_MAXIMUM_F16_t16_e64
5636 : AMDGPU::V_MAXIMUM_F16_fake16_e64;
5637 case AMDGPU::S_MUL_F16:
5638 return ST.useRealTrue16Insts() ? AMDGPU::V_MUL_F16_t16_e64
5639 : AMDGPU::V_MUL_F16_fake16_e64;
5640 case AMDGPU::S_CVT_PK_RTZ_F16_F32: return AMDGPU::V_CVT_PKRTZ_F16_F32_e64;
5641 case AMDGPU::S_FMAC_F32: return AMDGPU::V_FMAC_F32_e64;
5642 case AMDGPU::S_FMAC_F16:
5643 return ST.useRealTrue16Insts() ? AMDGPU::V_FMAC_F16_t16_e64
5644 : AMDGPU::V_FMAC_F16_fake16_e64;
5645 case AMDGPU::S_FMAMK_F32: return AMDGPU::V_FMAMK_F32;
5646 case AMDGPU::S_FMAAK_F32: return AMDGPU::V_FMAAK_F32;
5647 case AMDGPU::S_CMP_LT_F32: return AMDGPU::V_CMP_LT_F32_e64;
5648 case AMDGPU::S_CMP_EQ_F32: return AMDGPU::V_CMP_EQ_F32_e64;
5649 case AMDGPU::S_CMP_LE_F32: return AMDGPU::V_CMP_LE_F32_e64;
5650 case AMDGPU::S_CMP_GT_F32: return AMDGPU::V_CMP_GT_F32_e64;
5651 case AMDGPU::S_CMP_LG_F32: return AMDGPU::V_CMP_LG_F32_e64;
5652 case AMDGPU::S_CMP_GE_F32: return AMDGPU::V_CMP_GE_F32_e64;
5653 case AMDGPU::S_CMP_O_F32: return AMDGPU::V_CMP_O_F32_e64;
5654 case AMDGPU::S_CMP_U_F32: return AMDGPU::V_CMP_U_F32_e64;
5655 case AMDGPU::S_CMP_NGE_F32: return AMDGPU::V_CMP_NGE_F32_e64;
5656 case AMDGPU::S_CMP_NLG_F32: return AMDGPU::V_CMP_NLG_F32_e64;
5657 case AMDGPU::S_CMP_NGT_F32: return AMDGPU::V_CMP_NGT_F32_e64;
5658 case AMDGPU::S_CMP_NLE_F32: return AMDGPU::V_CMP_NLE_F32_e64;
5659 case AMDGPU::S_CMP_NEQ_F32: return AMDGPU::V_CMP_NEQ_F32_e64;
5660 case AMDGPU::S_CMP_NLT_F32: return AMDGPU::V_CMP_NLT_F32_e64;
5661 case AMDGPU::S_CMP_LT_F16:
5662 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_LT_F16_t16_e64
5663 : AMDGPU::V_CMP_LT_F16_fake16_e64;
5664 case AMDGPU::S_CMP_EQ_F16:
5665 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_EQ_F16_t16_e64
5666 : AMDGPU::V_CMP_EQ_F16_fake16_e64;
5667 case AMDGPU::S_CMP_LE_F16:
5668 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_LE_F16_t16_e64
5669 : AMDGPU::V_CMP_LE_F16_fake16_e64;
5670 case AMDGPU::S_CMP_GT_F16:
5671 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_GT_F16_t16_e64
5672 : AMDGPU::V_CMP_GT_F16_fake16_e64;
5673 case AMDGPU::S_CMP_LG_F16:
5674 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_LG_F16_t16_e64
5675 : AMDGPU::V_CMP_LG_F16_fake16_e64;
5676 case AMDGPU::S_CMP_GE_F16:
5677 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_GE_F16_t16_e64
5678 : AMDGPU::V_CMP_GE_F16_fake16_e64;
5679 case AMDGPU::S_CMP_O_F16:
5680 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_O_F16_t16_e64
5681 : AMDGPU::V_CMP_O_F16_fake16_e64;
5682 case AMDGPU::S_CMP_U_F16:
5683 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_U_F16_t16_e64
5684 : AMDGPU::V_CMP_U_F16_fake16_e64;
5685 case AMDGPU::S_CMP_NGE_F16:
5686 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_NGE_F16_t16_e64
5687 : AMDGPU::V_CMP_NGE_F16_fake16_e64;
5688 case AMDGPU::S_CMP_NLG_F16:
5689 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_NLG_F16_t16_e64
5690 : AMDGPU::V_CMP_NLG_F16_fake16_e64;
5691 case AMDGPU::S_CMP_NGT_F16:
5692 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_NGT_F16_t16_e64
5693 : AMDGPU::V_CMP_NGT_F16_fake16_e64;
5694 case AMDGPU::S_CMP_NLE_F16:
5695 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_NLE_F16_t16_e64
5696 : AMDGPU::V_CMP_NLE_F16_fake16_e64;
5697 case AMDGPU::S_CMP_NEQ_F16:
5698 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_NEQ_F16_t16_e64
5699 : AMDGPU::V_CMP_NEQ_F16_fake16_e64;
5700 case AMDGPU::S_CMP_NLT_F16:
5701 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_NLT_F16_t16_e64
5702 : AMDGPU::V_CMP_NLT_F16_fake16_e64;
5703 case AMDGPU::V_S_EXP_F32_e64: return AMDGPU::V_EXP_F32_e64;
5704 case AMDGPU::V_S_EXP_F16_e64:
5705 return ST.useRealTrue16Insts() ? AMDGPU::V_EXP_F16_t16_e64
5706 : AMDGPU::V_EXP_F16_fake16_e64;
5707 case AMDGPU::V_S_LOG_F32_e64: return AMDGPU::V_LOG_F32_e64;
5708 case AMDGPU::V_S_LOG_F16_e64:
5709 return ST.useRealTrue16Insts() ? AMDGPU::V_LOG_F16_t16_e64
5710 : AMDGPU::V_LOG_F16_fake16_e64;
5711 case AMDGPU::V_S_RCP_F32_e64: return AMDGPU::V_RCP_F32_e64;
5712 case AMDGPU::V_S_RCP_F16_e64:
5713 return ST.useRealTrue16Insts() ? AMDGPU::V_RCP_F16_t16_e64
5714 : AMDGPU::V_RCP_F16_fake16_e64;
5715 case AMDGPU::V_S_RSQ_F32_e64: return AMDGPU::V_RSQ_F32_e64;
5716 case AMDGPU::V_S_RSQ_F16_e64:
5717 return ST.useRealTrue16Insts() ? AMDGPU::V_RSQ_F16_t16_e64
5718 : AMDGPU::V_RSQ_F16_fake16_e64;
5719 case AMDGPU::V_S_SQRT_F32_e64: return AMDGPU::V_SQRT_F32_e64;
5720 case AMDGPU::V_S_SQRT_F16_e64:
5721 return ST.useRealTrue16Insts() ? AMDGPU::V_SQRT_F16_t16_e64
5722 : AMDGPU::V_SQRT_F16_fake16_e64;
5723 }
5724 llvm_unreachable(
5725 "Unexpected scalar opcode without corresponding vector one!");
5726}
5727
5728// clang-format on
5729
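// Saves the current exec mask into \p Reg and then enables all lanes. With SCC
// live this is done with two moves, roughly:
//   s_mov_b64 %Reg, exec
//   s_mov_b64 exec, -1
// otherwise a single s_or_saveexec (which clobbers SCC) is used.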
5730void SIInstrInfo::insertScratchExecCopy(MachineFunction &MF,
5731 MachineBasicBlock &MBB,
5732 MachineBasicBlock::iterator MBBI,
5733 const DebugLoc &DL, Register Reg,
5734 bool IsSCCLive,
5735 SlotIndexes *Indexes) const {
5736 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
5737 const SIInstrInfo *TII = ST.getInstrInfo();
5738 bool IsWave32 = ST.isWave32();
5739 if (IsSCCLive) {
5740 // Insert two move instructions, one to save the original value of EXEC and
5741 // the other to turn on all bits in EXEC. This is required because the single
5742 // S_OR_SAVEEXEC instruction would clobber SCC, which is live here.
5743 unsigned MovOpc = IsWave32 ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
5744 MCRegister Exec = IsWave32 ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
5745 auto StoreExecMI = BuildMI(BB&: MBB, I: MBBI, MIMD: DL, MCID: TII->get(Opcode: MovOpc), DestReg: Reg)
5746 .addReg(RegNo: Exec, flags: RegState::Kill);
5747 auto FlipExecMI = BuildMI(BB&: MBB, I: MBBI, MIMD: DL, MCID: TII->get(Opcode: MovOpc), DestReg: Exec).addImm(Val: -1);
5748 if (Indexes) {
5749 Indexes->insertMachineInstrInMaps(MI&: *StoreExecMI);
5750 Indexes->insertMachineInstrInMaps(MI&: *FlipExecMI);
5751 }
5752 } else {
5753 const unsigned OrSaveExec =
5754 IsWave32 ? AMDGPU::S_OR_SAVEEXEC_B32 : AMDGPU::S_OR_SAVEEXEC_B64;
5755 auto SaveExec =
5756 BuildMI(BB&: MBB, I: MBBI, MIMD: DL, MCID: TII->get(Opcode: OrSaveExec), DestReg: Reg).addImm(Val: -1);
5757 SaveExec->getOperand(i: 3).setIsDead(); // Mark SCC as dead.
5758 if (Indexes)
5759 Indexes->insertMachineInstrInMaps(MI&: *SaveExec);
5760 }
5761}
5762
5763void SIInstrInfo::restoreExec(MachineFunction &MF, MachineBasicBlock &MBB,
5764 MachineBasicBlock::iterator MBBI,
5765 const DebugLoc &DL, Register Reg,
5766 SlotIndexes *Indexes) const {
5767 unsigned ExecMov = isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
5768 MCRegister Exec = isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
5769 auto ExecRestoreMI =
5770 BuildMI(BB&: MBB, I: MBBI, MIMD: DL, MCID: get(Opcode: ExecMov), DestReg: Exec).addReg(RegNo: Reg, flags: RegState::Kill);
5771 if (Indexes)
5772 Indexes->insertMachineInstrInMaps(MI&: *ExecRestoreMI);
5773}
5774
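// For loads/stores and DS/MIMG instructions the combined AV_* operand classes
// may be narrowed to their VGPR-only counterparts, depending on whether AGPR
// data operands are actually usable on this subtarget and whether the caller
// marked the operand as allocatable. The result is always the properly aligned
// variant of the chosen class.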
5775static const TargetRegisterClass *
5776adjustAllocatableRegClass(const GCNSubtarget &ST, const SIRegisterInfo &RI,
5777 const MachineRegisterInfo &MRI,
5778 const MCInstrDesc &TID, unsigned RCID,
5779 bool IsAllocatable) {
5780 if ((IsAllocatable || !ST.hasGFX90AInsts() || !MRI.reservedRegsFrozen()) &&
5781 (((TID.mayLoad() || TID.mayStore()) &&
5782 !(TID.TSFlags & SIInstrFlags::Spill)) ||
5783 (TID.TSFlags & (SIInstrFlags::DS | SIInstrFlags::MIMG)))) {
5784 switch (RCID) {
5785 case AMDGPU::AV_32RegClassID:
5786 RCID = AMDGPU::VGPR_32RegClassID;
5787 break;
5788 case AMDGPU::AV_64RegClassID:
5789 RCID = AMDGPU::VReg_64RegClassID;
5790 break;
5791 case AMDGPU::AV_96RegClassID:
5792 RCID = AMDGPU::VReg_96RegClassID;
5793 break;
5794 case AMDGPU::AV_128RegClassID:
5795 RCID = AMDGPU::VReg_128RegClassID;
5796 break;
5797 case AMDGPU::AV_160RegClassID:
5798 RCID = AMDGPU::VReg_160RegClassID;
5799 break;
5800 case AMDGPU::AV_512RegClassID:
5801 RCID = AMDGPU::VReg_512RegClassID;
5802 break;
5803 default:
5804 break;
5805 }
5806 }
5807
5808 return RI.getProperlyAlignedRC(RC: RI.getRegClass(RCID));
5809}
5810
5811const TargetRegisterClass *SIInstrInfo::getRegClass(const MCInstrDesc &TID,
5812 unsigned OpNum, const TargetRegisterInfo *TRI,
5813 const MachineFunction &MF)
5814 const {
5815 if (OpNum >= TID.getNumOperands())
5816 return nullptr;
5817 auto RegClass = TID.operands()[OpNum].RegClass;
5818 bool IsAllocatable = false;
5819 if (TID.TSFlags & (SIInstrFlags::DS | SIInstrFlags::FLAT)) {
5820 // vdst and vdata should both be VGPR or AGPR; the same holds for DS
5821 // instructions with two data operands. Request a register class constrained
5822 // to VGPR only if both operands are present, as Machine Copy Propagation
5823 // cannot check this constraint (and possibly other passes cannot either).
5824 //
5825 // The check is limited to FLAT and DS because atomics in non-flat encoding
5826 // have their vdst and vdata tied to be the same register.
5827 const int VDstIdx = AMDGPU::getNamedOperandIdx(Opcode: TID.Opcode,
5828 Name: AMDGPU::OpName::vdst);
5829 const int DataIdx = AMDGPU::getNamedOperandIdx(Opcode: TID.Opcode,
5830 Name: (TID.TSFlags & SIInstrFlags::DS) ? AMDGPU::OpName::data0
5831 : AMDGPU::OpName::vdata);
5832 if (DataIdx != -1) {
5833 IsAllocatable = VDstIdx != -1 || AMDGPU::hasNamedOperand(
5834 Opcode: TID.Opcode, NamedIdx: AMDGPU::OpName::data1);
5835 }
5836 }
5837 return adjustAllocatableRegClass(ST, RI, MRI: MF.getRegInfo(), TID, RCID: RegClass,
5838 IsAllocatable);
5839}
5840
5841const TargetRegisterClass *SIInstrInfo::getOpRegClass(const MachineInstr &MI,
5842 unsigned OpNo) const {
5843 const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
5844 const MCInstrDesc &Desc = get(Opcode: MI.getOpcode());
5845 if (MI.isVariadic() || OpNo >= Desc.getNumOperands() ||
5846 Desc.operands()[OpNo].RegClass == -1) {
5847 Register Reg = MI.getOperand(i: OpNo).getReg();
5848
5849 if (Reg.isVirtual())
5850 return MRI.getRegClass(Reg);
5851 return RI.getPhysRegBaseClass(Reg);
5852 }
5853
5854 unsigned RCID = Desc.operands()[OpNo].RegClass;
5855 return adjustAllocatableRegClass(ST, RI, MRI, TID: Desc, RCID, IsAllocatable: true);
5856}
5857
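// Legalizes operand \p OpIdx by materializing it into a newly created virtual
// register (a COPY for register operands, a suitably sized V_MOV/S_MOV for
// immediates) inserted before \p MI, then rewriting the operand to use that
// register.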
5858void SIInstrInfo::legalizeOpWithMove(MachineInstr &MI, unsigned OpIdx) const {
5859 MachineBasicBlock::iterator I = MI;
5860 MachineBasicBlock *MBB = MI.getParent();
5861 MachineOperand &MO = MI.getOperand(i: OpIdx);
5862 MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
5863 unsigned RCID = get(Opcode: MI.getOpcode()).operands()[OpIdx].RegClass;
5864 const TargetRegisterClass *RC = RI.getRegClass(RCID);
5865 unsigned Size = RI.getRegSizeInBits(RC: *RC);
5866 unsigned Opcode = (Size == 64) ? AMDGPU::V_MOV_B64_PSEUDO
5867 : Size == 16 ? AMDGPU::V_MOV_B16_t16_e64
5868 : AMDGPU::V_MOV_B32_e32;
5869 if (MO.isReg())
5870 Opcode = AMDGPU::COPY;
5871 else if (RI.isSGPRClass(RC))
5872 Opcode = (Size == 64) ? AMDGPU::S_MOV_B64 : AMDGPU::S_MOV_B32;
5873
5874 const TargetRegisterClass *VRC = RI.getEquivalentVGPRClass(SRC: RC);
5875 Register Reg = MRI.createVirtualRegister(RegClass: VRC);
5876 DebugLoc DL = MBB->findDebugLoc(MBBI: I);
5877 BuildMI(BB&: *MI.getParent(), I, MIMD: DL, MCID: get(Opcode), DestReg: Reg).add(MO);
5878 MO.ChangeToRegister(Reg, isDef: false);
5879}
5880
5881unsigned SIInstrInfo::buildExtractSubReg(
5882 MachineBasicBlock::iterator MI, MachineRegisterInfo &MRI,
5883 const MachineOperand &SuperReg, const TargetRegisterClass *SuperRC,
5884 unsigned SubIdx, const TargetRegisterClass *SubRC) const {
5885 if (!SuperReg.getReg().isVirtual())
5886 return RI.getSubReg(Reg: SuperReg.getReg(), Idx: SubIdx);
5887
5888 MachineBasicBlock *MBB = MI->getParent();
5889 const DebugLoc &DL = MI->getDebugLoc();
5890 Register SubReg = MRI.createVirtualRegister(RegClass: SubRC);
5891
5892 unsigned NewSubIdx = RI.composeSubRegIndices(a: SuperReg.getSubReg(), b: SubIdx);
5893 BuildMI(BB&: *MBB, I: MI, MIMD: DL, MCID: get(Opcode: TargetOpcode::COPY), DestReg: SubReg)
5894 .addReg(RegNo: SuperReg.getReg(), flags: 0, SubReg: NewSubIdx);
5895 return SubReg;
5896}
5897
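// For example, extracting sub1 from the 64-bit immediate 0x100000002 yields
// the immediate 1 and extracting sub0 yields 2, while register operands are
// handled by building a COPY of the requested subregister.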
5898MachineOperand SIInstrInfo::buildExtractSubRegOrImm(
5899 MachineBasicBlock::iterator MII, MachineRegisterInfo &MRI,
5900 const MachineOperand &Op, const TargetRegisterClass *SuperRC,
5901 unsigned SubIdx, const TargetRegisterClass *SubRC) const {
5902 if (Op.isImm()) {
5903 if (SubIdx == AMDGPU::sub0)
5904 return MachineOperand::CreateImm(Val: static_cast<int32_t>(Op.getImm()));
5905 if (SubIdx == AMDGPU::sub1)
5906 return MachineOperand::CreateImm(Val: static_cast<int32_t>(Op.getImm() >> 32));
5907
5908 llvm_unreachable("Unhandled register index for immediate");
5909 }
5910
5911 unsigned SubReg = buildExtractSubReg(MI: MII, MRI, SuperReg: Op, SuperRC,
5912 SubIdx, SubRC);
5913 return MachineOperand::CreateReg(Reg: SubReg, isDef: false);
5914}
5915
5916// Change the order of operands from (0, 1, 2) to (0, 2, 1)
5917void SIInstrInfo::swapOperands(MachineInstr &Inst) const {
5918 assert(Inst.getNumExplicitOperands() == 3);
5919 MachineOperand Op1 = Inst.getOperand(i: 1);
5920 Inst.removeOperand(OpNo: 1);
5921 Inst.addOperand(Op: Op1);
5922}
5923
5924bool SIInstrInfo::isLegalRegOperand(const MachineRegisterInfo &MRI,
5925 const MCOperandInfo &OpInfo,
5926 const MachineOperand &MO) const {
5927 if (!MO.isReg())
5928 return false;
5929
5930 Register Reg = MO.getReg();
5931
5932 const TargetRegisterClass *DRC = RI.getRegClass(RCID: OpInfo.RegClass);
5933 if (Reg.isPhysical())
5934 return DRC->contains(Reg);
5935
5936 const TargetRegisterClass *RC = MRI.getRegClass(Reg);
5937
5938 if (MO.getSubReg()) {
5939 const MachineFunction *MF = MO.getParent()->getParent()->getParent();
5940 const TargetRegisterClass *SuperRC = RI.getLargestLegalSuperClass(RC, MF: *MF);
5941 if (!SuperRC)
5942 return false;
5943
5944 DRC = RI.getMatchingSuperRegClass(A: SuperRC, B: DRC, Idx: MO.getSubReg());
5945 if (!DRC)
5946 return false;
5947 }
5948 return RC->hasSuperClassEq(RC: DRC);
5949}
5950
5951bool SIInstrInfo::isLegalRegOperand(const MachineInstr &MI, unsigned OpIdx,
5952 const MachineOperand &MO) const {
5953 const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
5954 const MCOperandInfo OpInfo = MI.getDesc().operands()[OpIdx];
5955 unsigned Opc = MI.getOpcode();
5956
5957 if (!isLegalRegOperand(MRI, OpInfo, MO))
5958 return false;
5959
5960 // Check the accumulator GPR (AGPR) operand.
5961 bool IsAGPR = RI.isAGPR(MRI, Reg: MO.getReg());
5962 if (IsAGPR && !ST.hasMAIInsts())
5963 return false;
5964 if (IsAGPR && (!ST.hasGFX90AInsts() || !MRI.reservedRegsFrozen()) &&
5965 (MI.mayLoad() || MI.mayStore() || isDS(Opcode: Opc) || isMIMG(Opcode: Opc)))
5966 return false;
5967 // Atomics should have both vdst and vdata either vgpr or agpr.
5968 const int VDstIdx = AMDGPU::getNamedOperandIdx(Opcode: Opc, Name: AMDGPU::OpName::vdst);
5969 const int DataIdx = AMDGPU::getNamedOperandIdx(
5970 Opcode: Opc, Name: isDS(Opcode: Opc) ? AMDGPU::OpName::data0 : AMDGPU::OpName::vdata);
5971 if ((int)OpIdx == VDstIdx && DataIdx != -1 &&
5972 MI.getOperand(i: DataIdx).isReg() &&
5973 RI.isAGPR(MRI, Reg: MI.getOperand(i: DataIdx).getReg()) != IsAGPR)
5974 return false;
5975 if ((int)OpIdx == DataIdx) {
5976 if (VDstIdx != -1 &&
5977 RI.isAGPR(MRI, Reg: MI.getOperand(i: VDstIdx).getReg()) != IsAGPR)
5978 return false;
5979    // DS instructions with 2 data operands must also have matching AGPR/VGPR classes.
5980 const int Data1Idx = AMDGPU::getNamedOperandIdx(Opcode: Opc, Name: AMDGPU::OpName::data1);
5981 if (Data1Idx != -1 && MI.getOperand(i: Data1Idx).isReg() &&
5982 RI.isAGPR(MRI, Reg: MI.getOperand(i: Data1Idx).getReg()) != IsAGPR)
5983 return false;
5984 }
5985
5986 // Check V_ACCVGPR_WRITE_B32_e64
5987 if (Opc == AMDGPU::V_ACCVGPR_WRITE_B32_e64 && !ST.hasGFX90AInsts() &&
5988 (int)OpIdx == AMDGPU::getNamedOperandIdx(Opcode: Opc, Name: AMDGPU::OpName::src0) &&
5989 RI.isSGPRReg(MRI, Reg: MO.getReg()))
5990 return false;
5991 return true;
5992}
5993
5994bool SIInstrInfo::isLegalVSrcOperand(const MachineRegisterInfo &MRI,
5995 const MCOperandInfo &OpInfo,
5996 const MachineOperand &MO) const {
5997 if (MO.isReg())
5998 return isLegalRegOperand(MRI, OpInfo, MO);
5999
6000 // Handle non-register types that are treated like immediates.
6001 assert(MO.isImm() || MO.isTargetIndex() || MO.isFI() || MO.isGlobal());
6002 return true;
6003}
6004
6005bool SIInstrInfo::isOperandLegal(const MachineInstr &MI, unsigned OpIdx,
6006 const MachineOperand *MO) const {
6007 const MachineFunction &MF = *MI.getParent()->getParent();
6008 const MachineRegisterInfo &MRI = MF.getRegInfo();
6009 const MCInstrDesc &InstDesc = MI.getDesc();
6010 const MCOperandInfo &OpInfo = InstDesc.operands()[OpIdx];
6011 const TargetRegisterClass *DefinedRC =
6012 OpInfo.RegClass != -1 ? RI.getRegClass(RCID: OpInfo.RegClass) : nullptr;
6013 if (!MO)
6014 MO = &MI.getOperand(i: OpIdx);
6015
6016 const bool IsInlineConst = !MO->isReg() && isInlineConstant(MO: *MO, OpInfo);
6017
6018 if (isVALU(MI) && !IsInlineConst && usesConstantBus(MRI, MO: *MO, OpInfo)) {
6019 const MachineOperand *UsedLiteral = nullptr;
6020
6021 int ConstantBusLimit = ST.getConstantBusLimit(Opcode: MI.getOpcode());
6022 int LiteralLimit = !isVOP3(MI) || ST.hasVOP3Literal() ? 1 : 0;
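    // Non-VOP3 encodings can always encode one literal; VOP3 can only do so on
    // subtargets that support VOP3 literals.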
6023
6024 // TODO: Be more permissive with frame indexes.
6025 if (!MO->isReg() && !isInlineConstant(MO: *MO, OpInfo)) {
6026 if (!LiteralLimit--)
6027 return false;
6028
6029 UsedLiteral = MO;
6030 }
6031
6032 SmallDenseSet<RegSubRegPair> SGPRsUsed;
6033 if (MO->isReg())
6034 SGPRsUsed.insert(V: RegSubRegPair(MO->getReg(), MO->getSubReg()));
6035
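    // Scan the other operands to account for constant bus and literal uses.
    // Each distinct SGPR counts once against the constant bus limit, and any
    // additional non-inline literal counts against both limits.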
6036 for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
6037 if (i == OpIdx)
6038 continue;
6039 const MachineOperand &Op = MI.getOperand(i);
6040 if (Op.isReg()) {
6041 RegSubRegPair SGPR(Op.getReg(), Op.getSubReg());
6042 if (!SGPRsUsed.count(V: SGPR) &&
6043 // FIXME: This can access off the end of the operands() array.
6044 usesConstantBus(MRI, MO: Op, OpInfo: InstDesc.operands().begin()[i])) {
6045 if (--ConstantBusLimit <= 0)
6046 return false;
6047 SGPRsUsed.insert(V: SGPR);
6048 }
6049 } else if (AMDGPU::isSISrcOperand(Desc: InstDesc, OpNo: i) &&
6050 !isInlineConstant(MO: Op, OpInfo: InstDesc.operands()[i])) {
6051 // The same literal may be used multiple times.
6052 if (!UsedLiteral)
6053 UsedLiteral = &Op;
6054 else if (UsedLiteral->isIdenticalTo(Other: Op))
6055 continue;
6056
6057 if (!LiteralLimit--)
6058 return false;
6059 if (--ConstantBusLimit <= 0)
6060 return false;
6061 }
6062 }
6063 } else if (!IsInlineConst && !MO->isReg() && isSALU(MI)) {
6064 // There can be at most one literal operand, but it can be repeated.
6065 for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
6066 if (i == OpIdx)
6067 continue;
6068 const MachineOperand &Op = MI.getOperand(i);
6069 if (!Op.isReg() && !Op.isFI() && !Op.isRegMask() &&
6070 !isInlineConstant(MO: Op, OpInfo: InstDesc.operands()[i]) &&
6071 !Op.isIdenticalTo(Other: *MO))
6072 return false;
6073
6074 // Do not fold a frame index into an instruction that already has a frame
6075 // index. The frame index handling code doesn't handle fixing up operand
6076 // constraints if there are multiple indexes.
6077 if (Op.isFI() && MO->isFI())
6078 return false;
6079 }
6080 } else if (IsInlineConst && ST.hasNoF16PseudoScalarTransInlineConstants() &&
6081 isF16PseudoScalarTrans(Opcode: MI.getOpcode())) {
6082 return false;
6083 }
6084
6085 if (MO->isReg()) {
6086 if (!DefinedRC)
6087 return OpInfo.OperandType == MCOI::OPERAND_UNKNOWN;
6088 return isLegalRegOperand(MI, OpIdx, MO: *MO);
6089 }
6090
6091 if (MO->isImm()) {
6092 uint64_t Imm = MO->getImm();
6093 bool Is64BitFPOp = OpInfo.OperandType == AMDGPU::OPERAND_REG_IMM_FP64;
6094 bool Is64BitOp = Is64BitFPOp ||
6095 OpInfo.OperandType == AMDGPU::OPERAND_REG_IMM_INT64 ||
6096 OpInfo.OperandType == AMDGPU::OPERAND_REG_IMM_V2INT32 ||
6097 OpInfo.OperandType == AMDGPU::OPERAND_REG_IMM_V2FP32;
6098 if (Is64BitOp &&
6099 !AMDGPU::isInlinableLiteral64(Literal: Imm, HasInv2Pi: ST.hasInv2PiInlineImm())) {
6100 if (!AMDGPU::isValid32BitLiteral(Val: Imm, IsFP64: Is64BitFPOp))
6101 return false;
6102
6103 // FIXME: We can use sign extended 64-bit literals, but only for signed
6104 // operands. At the moment we do not know if an operand is signed.
6105 // Such operand will be encoded as its low 32 bits and then either
6106 // correctly sign extended or incorrectly zero extended by HW.
6107 if (!Is64BitFPOp && (int32_t)Imm < 0)
6108 return false;
6109 }
6110 }
6111
6112 // Handle non-register types that are treated like immediates.
6113 assert(MO->isImm() || MO->isTargetIndex() || MO->isFI() || MO->isGlobal());
6114
6115 if (!DefinedRC) {
6116 // This operand expects an immediate.
6117 return true;
6118 }
6119
6120 return isImmOperandLegal(MI, OpNo: OpIdx, MO: *MO);
6121}
6122
6123void SIInstrInfo::legalizeOperandsVOP2(MachineRegisterInfo &MRI,
6124 MachineInstr &MI) const {
6125 unsigned Opc = MI.getOpcode();
6126 const MCInstrDesc &InstrDesc = get(Opcode: Opc);
6127
6128 int Src0Idx = AMDGPU::getNamedOperandIdx(Opcode: Opc, Name: AMDGPU::OpName::src0);
6129 MachineOperand &Src0 = MI.getOperand(i: Src0Idx);
6130
6131 int Src1Idx = AMDGPU::getNamedOperandIdx(Opcode: Opc, Name: AMDGPU::OpName::src1);
6132 MachineOperand &Src1 = MI.getOperand(i: Src1Idx);
6133
6134  // If there is an implicit SGPR use such as the VCC use of v_addc_u32/v_subb_u32,
6135  // only one constant bus use is allowed before GFX10.
6136 bool HasImplicitSGPR = findImplicitSGPRRead(MI);
6137 if (HasImplicitSGPR && ST.getConstantBusLimit(Opcode: Opc) <= 1 && Src0.isReg() &&
6138 RI.isSGPRReg(MRI, Reg: Src0.getReg()))
6139 legalizeOpWithMove(MI, OpIdx: Src0Idx);
6140
6141 // Special case: V_WRITELANE_B32 accepts only immediate or SGPR operands for
6142 // both the value to write (src0) and lane select (src1). Fix up non-SGPR
6143 // src0/src1 with V_READFIRSTLANE.
6144 if (Opc == AMDGPU::V_WRITELANE_B32) {
6145 const DebugLoc &DL = MI.getDebugLoc();
6146 if (Src0.isReg() && RI.isVGPR(MRI, Reg: Src0.getReg())) {
6147 Register Reg = MRI.createVirtualRegister(RegClass: &AMDGPU::SReg_32_XM0RegClass);
6148 BuildMI(BB&: *MI.getParent(), I&: MI, MIMD: DL, MCID: get(Opcode: AMDGPU::V_READFIRSTLANE_B32), DestReg: Reg)
6149 .add(MO: Src0);
6150 Src0.ChangeToRegister(Reg, isDef: false);
6151 }
6152 if (Src1.isReg() && RI.isVGPR(MRI, Reg: Src1.getReg())) {
6153 Register Reg = MRI.createVirtualRegister(RegClass: &AMDGPU::SReg_32_XM0RegClass);
6154 const DebugLoc &DL = MI.getDebugLoc();
6155 BuildMI(BB&: *MI.getParent(), I&: MI, MIMD: DL, MCID: get(Opcode: AMDGPU::V_READFIRSTLANE_B32), DestReg: Reg)
6156 .add(MO: Src1);
6157 Src1.ChangeToRegister(Reg, isDef: false);
6158 }
6159 return;
6160 }
6161
6162 // No VOP2 instructions support AGPRs.
6163 if (Src0.isReg() && RI.isAGPR(MRI, Reg: Src0.getReg()))
6164 legalizeOpWithMove(MI, OpIdx: Src0Idx);
6165
6166 if (Src1.isReg() && RI.isAGPR(MRI, Reg: Src1.getReg()))
6167 legalizeOpWithMove(MI, OpIdx: Src1Idx);
6168
6169 // Special case: V_FMAC_F32 and V_FMAC_F16 have src2.
6170 if (Opc == AMDGPU::V_FMAC_F32_e32 || Opc == AMDGPU::V_FMAC_F16_e32) {
6171 int Src2Idx = AMDGPU::getNamedOperandIdx(Opcode: Opc, Name: AMDGPU::OpName::src2);
6172 if (!RI.isVGPR(MRI, Reg: MI.getOperand(i: Src2Idx).getReg()))
6173 legalizeOpWithMove(MI, OpIdx: Src2Idx);
6174 }
6175
6176  // VOP2 src0 operands support all operand types, so src0 never needs a legality
6177  // check here. If src1 is already legal, we don't need to do anything.
6178 if (isLegalRegOperand(MRI, OpInfo: InstrDesc.operands()[Src1Idx], MO: Src1))
6179 return;
6180
6181 // Special case: V_READLANE_B32 accepts only immediate or SGPR operands for
6182 // lane select. Fix up using V_READFIRSTLANE, since we assume that the lane
6183 // select is uniform.
6184 if (Opc == AMDGPU::V_READLANE_B32 && Src1.isReg() &&
6185 RI.isVGPR(MRI, Reg: Src1.getReg())) {
6186 Register Reg = MRI.createVirtualRegister(RegClass: &AMDGPU::SReg_32_XM0RegClass);
6187 const DebugLoc &DL = MI.getDebugLoc();
6188 BuildMI(BB&: *MI.getParent(), I&: MI, MIMD: DL, MCID: get(Opcode: AMDGPU::V_READFIRSTLANE_B32), DestReg: Reg)
6189 .add(MO: Src1);
6190 Src1.ChangeToRegister(Reg, isDef: false);
6191 return;
6192 }
6193
6194 // We do not use commuteInstruction here because it is too aggressive and will
6195 // commute if it is possible. We only want to commute here if it improves
6196 // legality. This can be called a fairly large number of times so don't waste
6197 // compile time pointlessly swapping and checking legality again.
6198 if (HasImplicitSGPR || !MI.isCommutable()) {
6199 legalizeOpWithMove(MI, OpIdx: Src1Idx);
6200 return;
6201 }
6202
6203 // If src0 can be used as src1, commuting will make the operands legal.
6204 // Otherwise we have to give up and insert a move.
6205 //
6206 // TODO: Other immediate-like operand kinds could be commuted if there was a
6207 // MachineOperand::ChangeTo* for them.
6208 if ((!Src1.isImm() && !Src1.isReg()) ||
6209 !isLegalRegOperand(MRI, OpInfo: InstrDesc.operands()[Src1Idx], MO: Src0)) {
6210 legalizeOpWithMove(MI, OpIdx: Src1Idx);
6211 return;
6212 }
6213
6214 int CommutedOpc = commuteOpcode(MI);
6215 if (CommutedOpc == -1) {
6216 legalizeOpWithMove(MI, OpIdx: Src1Idx);
6217 return;
6218 }
6219
6220 MI.setDesc(get(Opcode: CommutedOpc));
6221
6222 Register Src0Reg = Src0.getReg();
6223 unsigned Src0SubReg = Src0.getSubReg();
6224 bool Src0Kill = Src0.isKill();
6225
6226 if (Src1.isImm())
6227 Src0.ChangeToImmediate(ImmVal: Src1.getImm());
6228 else if (Src1.isReg()) {
6229 Src0.ChangeToRegister(Reg: Src1.getReg(), isDef: false, isImp: false, isKill: Src1.isKill());
6230 Src0.setSubReg(Src1.getSubReg());
6231 } else
6232 llvm_unreachable("Should only have register or immediate operands");
6233
6234 Src1.ChangeToRegister(Reg: Src0Reg, isDef: false, isImp: false, isKill: Src0Kill);
6235 Src1.setSubReg(Src0SubReg);
6236 fixImplicitOperands(MI);
6237}
6238
6239// Legalize VOP3 operands. Any operand type is supported for any operand, but at
6240// most one literal constant is allowed, and only starting from GFX10.
6241void SIInstrInfo::legalizeOperandsVOP3(MachineRegisterInfo &MRI,
6242 MachineInstr &MI) const {
6243 unsigned Opc = MI.getOpcode();
6244
6245 int VOP3Idx[3] = {
6246 AMDGPU::getNamedOperandIdx(Opcode: Opc, Name: AMDGPU::OpName::src0),
6247 AMDGPU::getNamedOperandIdx(Opcode: Opc, Name: AMDGPU::OpName::src1),
6248 AMDGPU::getNamedOperandIdx(Opcode: Opc, Name: AMDGPU::OpName::src2)
6249 };
6250
6251 if (Opc == AMDGPU::V_PERMLANE16_B32_e64 ||
6252 Opc == AMDGPU::V_PERMLANEX16_B32_e64) {
6253 // src1 and src2 must be scalar
6254 MachineOperand &Src1 = MI.getOperand(i: VOP3Idx[1]);
6255 MachineOperand &Src2 = MI.getOperand(i: VOP3Idx[2]);
6256 const DebugLoc &DL = MI.getDebugLoc();
6257 if (Src1.isReg() && !RI.isSGPRClass(RC: MRI.getRegClass(Reg: Src1.getReg()))) {
6258 Register Reg = MRI.createVirtualRegister(RegClass: &AMDGPU::SReg_32_XM0RegClass);
6259 BuildMI(BB&: *MI.getParent(), I&: MI, MIMD: DL, MCID: get(Opcode: AMDGPU::V_READFIRSTLANE_B32), DestReg: Reg)
6260 .add(MO: Src1);
6261 Src1.ChangeToRegister(Reg, isDef: false);
6262 }
6263 if (Src2.isReg() && !RI.isSGPRClass(RC: MRI.getRegClass(Reg: Src2.getReg()))) {
6264 Register Reg = MRI.createVirtualRegister(RegClass: &AMDGPU::SReg_32_XM0RegClass);
6265 BuildMI(BB&: *MI.getParent(), I&: MI, MIMD: DL, MCID: get(Opcode: AMDGPU::V_READFIRSTLANE_B32), DestReg: Reg)
6266 .add(MO: Src2);
6267 Src2.ChangeToRegister(Reg, isDef: false);
6268 }
6269 }
6270
6271 // Find the one SGPR operand we are allowed to use.
6272 int ConstantBusLimit = ST.getConstantBusLimit(Opcode: Opc);
6273 int LiteralLimit = ST.hasVOP3Literal() ? 1 : 0;
6274 SmallDenseSet<unsigned> SGPRsUsed;
6275 Register SGPRReg = findUsedSGPR(MI, OpIndices: VOP3Idx);
6276 if (SGPRReg) {
6277 SGPRsUsed.insert(V: SGPRReg);
6278 --ConstantBusLimit;
6279 }
6280
6281 for (int Idx : VOP3Idx) {
6282 if (Idx == -1)
6283 break;
6284 MachineOperand &MO = MI.getOperand(i: Idx);
6285
6286 if (!MO.isReg()) {
6287 if (isInlineConstant(MO, OpInfo: get(Opcode: Opc).operands()[Idx]))
6288 continue;
6289
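      // A non-inline literal consumes both a literal slot and a constant bus
      // slot; if either budget is exhausted, materialize the value in a VGPR.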
6290 if (LiteralLimit > 0 && ConstantBusLimit > 0) {
6291 --LiteralLimit;
6292 --ConstantBusLimit;
6293 continue;
6294 }
6295
6296 --LiteralLimit;
6297 --ConstantBusLimit;
6298 legalizeOpWithMove(MI, OpIdx: Idx);
6299 continue;
6300 }
6301
6302 if (RI.hasAGPRs(RC: RI.getRegClassForReg(MRI, Reg: MO.getReg())) &&
6303 !isOperandLegal(MI, OpIdx: Idx, MO: &MO)) {
6304 legalizeOpWithMove(MI, OpIdx: Idx);
6305 continue;
6306 }
6307
6308 if (!RI.isSGPRClass(RC: RI.getRegClassForReg(MRI, Reg: MO.getReg())))
6309 continue; // VGPRs are legal
6310
6311 // We can use one SGPR in each VOP3 instruction prior to GFX10
6312 // and two starting from GFX10.
6313 if (SGPRsUsed.count(V: MO.getReg()))
6314 continue;
6315 if (ConstantBusLimit > 0) {
6316 SGPRsUsed.insert(V: MO.getReg());
6317 --ConstantBusLimit;
6318 continue;
6319 }
6320
6321 // If we make it this far, then the operand is not legal and we must
6322 // legalize it.
6323 legalizeOpWithMove(MI, OpIdx: Idx);
6324 }
6325
6326 // Special case: V_FMAC_F32 and V_FMAC_F16 have src2 tied to vdst.
6327 if ((Opc == AMDGPU::V_FMAC_F32_e64 || Opc == AMDGPU::V_FMAC_F16_e64) &&
6328 !RI.isVGPR(MRI, Reg: MI.getOperand(i: VOP3Idx[2]).getReg()))
6329 legalizeOpWithMove(MI, OpIdx: VOP3Idx[2]);
6330}
6331
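// Copy the value of VGPR \p SrcReg into a new SGPR of an equivalent scalar
// class by emitting V_READFIRSTLANE_B32 for each 32-bit piece and reassembling
// multi-dword values with a REG_SEQUENCE. Callers assume \p SrcReg is uniform
// across the wave.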
6332Register SIInstrInfo::readlaneVGPRToSGPR(
6333 Register SrcReg, MachineInstr &UseMI, MachineRegisterInfo &MRI,
6334 const TargetRegisterClass *DstRC /*=nullptr*/) const {
6335 const TargetRegisterClass *VRC = MRI.getRegClass(Reg: SrcReg);
6336 const TargetRegisterClass *SRC = RI.getEquivalentSGPRClass(VRC);
6337 if (DstRC)
6338 SRC = RI.getCommonSubClass(A: SRC, B: DstRC);
6339
6340 Register DstReg = MRI.createVirtualRegister(RegClass: SRC);
6341 unsigned SubRegs = RI.getRegSizeInBits(RC: *VRC) / 32;
6342
6343 if (RI.hasAGPRs(RC: VRC)) {
6344 VRC = RI.getEquivalentVGPRClass(SRC: VRC);
6345 Register NewSrcReg = MRI.createVirtualRegister(RegClass: VRC);
6346 BuildMI(BB&: *UseMI.getParent(), I&: UseMI, MIMD: UseMI.getDebugLoc(),
6347 MCID: get(Opcode: TargetOpcode::COPY), DestReg: NewSrcReg)
6348 .addReg(RegNo: SrcReg);
6349 SrcReg = NewSrcReg;
6350 }
6351
6352 if (SubRegs == 1) {
6353 BuildMI(BB&: *UseMI.getParent(), I&: UseMI, MIMD: UseMI.getDebugLoc(),
6354 MCID: get(Opcode: AMDGPU::V_READFIRSTLANE_B32), DestReg: DstReg)
6355 .addReg(RegNo: SrcReg);
6356 return DstReg;
6357 }
6358
6359 SmallVector<Register, 8> SRegs;
6360 for (unsigned i = 0; i < SubRegs; ++i) {
6361 Register SGPR = MRI.createVirtualRegister(RegClass: &AMDGPU::SGPR_32RegClass);
6362 BuildMI(BB&: *UseMI.getParent(), I&: UseMI, MIMD: UseMI.getDebugLoc(),
6363 MCID: get(Opcode: AMDGPU::V_READFIRSTLANE_B32), DestReg: SGPR)
6364 .addReg(RegNo: SrcReg, flags: 0, SubReg: RI.getSubRegFromChannel(Channel: i));
6365 SRegs.push_back(Elt: SGPR);
6366 }
6367
6368 MachineInstrBuilder MIB =
6369 BuildMI(BB&: *UseMI.getParent(), I&: UseMI, MIMD: UseMI.getDebugLoc(),
6370 MCID: get(Opcode: AMDGPU::REG_SEQUENCE), DestReg: DstReg);
6371 for (unsigned i = 0; i < SubRegs; ++i) {
6372 MIB.addReg(RegNo: SRegs[i]);
6373 MIB.addImm(Val: RI.getSubRegFromChannel(Channel: i));
6374 }
6375 return DstReg;
6376}
6377
6378void SIInstrInfo::legalizeOperandsSMRD(MachineRegisterInfo &MRI,
6379 MachineInstr &MI) const {
6380
6381  // If the pointer is stored in VGPRs, then we need to move it to
6382  // SGPRs using v_readfirstlane. This is safe because we only select
6383  // loads with uniform pointers to SMRD instructions, so we know the
6384  // pointer value is uniform.
6385 MachineOperand *SBase = getNamedOperand(MI, OperandName: AMDGPU::OpName::sbase);
6386 if (SBase && !RI.isSGPRClass(RC: MRI.getRegClass(Reg: SBase->getReg()))) {
6387 Register SGPR = readlaneVGPRToSGPR(SrcReg: SBase->getReg(), UseMI&: MI, MRI);
6388 SBase->setReg(SGPR);
6389 }
6390 MachineOperand *SOff = getNamedOperand(MI, OperandName: AMDGPU::OpName::soffset);
6391 if (SOff && !RI.isSGPRReg(MRI, Reg: SOff->getReg())) {
6392 Register SGPR = readlaneVGPRToSGPR(SrcReg: SOff->getReg(), UseMI&: MI, MRI);
6393 SOff->setReg(SGPR);
6394 }
6395}
6396
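// If the saddr operand of a segment-specific FLAT instruction actually holds a
// VGPR, try to switch the instruction to its vaddr form (the global VADDR
// variant or the scratch SV variant) so the address can stay in a VGPR.
// Returns true if the instruction was rewritten.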
6397bool SIInstrInfo::moveFlatAddrToVGPR(MachineInstr &Inst) const {
6398 unsigned Opc = Inst.getOpcode();
6399 int OldSAddrIdx = AMDGPU::getNamedOperandIdx(Opcode: Opc, Name: AMDGPU::OpName::saddr);
6400 if (OldSAddrIdx < 0)
6401 return false;
6402
6403 assert(isSegmentSpecificFLAT(Inst));
6404
6405 int NewOpc = AMDGPU::getGlobalVaddrOp(Opcode: Opc);
6406 if (NewOpc < 0)
6407 NewOpc = AMDGPU::getFlatScratchInstSVfromSS(Opcode: Opc);
6408 if (NewOpc < 0)
6409 return false;
6410
6411 MachineRegisterInfo &MRI = Inst.getMF()->getRegInfo();
6412 MachineOperand &SAddr = Inst.getOperand(i: OldSAddrIdx);
6413 if (RI.isSGPRReg(MRI, Reg: SAddr.getReg()))
6414 return false;
6415
6416 int NewVAddrIdx = AMDGPU::getNamedOperandIdx(Opcode: NewOpc, Name: AMDGPU::OpName::vaddr);
6417 if (NewVAddrIdx < 0)
6418 return false;
6419
6420 int OldVAddrIdx = AMDGPU::getNamedOperandIdx(Opcode: Opc, Name: AMDGPU::OpName::vaddr);
6421
6422  // Check vaddr; it must be zero or absent.
6423 MachineInstr *VAddrDef = nullptr;
6424 if (OldVAddrIdx >= 0) {
6425 MachineOperand &VAddr = Inst.getOperand(i: OldVAddrIdx);
6426 VAddrDef = MRI.getUniqueVRegDef(Reg: VAddr.getReg());
6427 if (!VAddrDef || VAddrDef->getOpcode() != AMDGPU::V_MOV_B32_e32 ||
6428 !VAddrDef->getOperand(i: 1).isImm() ||
6429 VAddrDef->getOperand(i: 1).getImm() != 0)
6430 return false;
6431 }
6432
6433 const MCInstrDesc &NewDesc = get(Opcode: NewOpc);
6434 Inst.setDesc(NewDesc);
6435
6436 // Callers expect iterator to be valid after this call, so modify the
6437 // instruction in place.
6438 if (OldVAddrIdx == NewVAddrIdx) {
6439 MachineOperand &NewVAddr = Inst.getOperand(i: NewVAddrIdx);
6440 // Clear use list from the old vaddr holding a zero register.
6441 MRI.removeRegOperandFromUseList(MO: &NewVAddr);
6442 MRI.moveOperands(Dst: &NewVAddr, Src: &SAddr, NumOps: 1);
6443 Inst.removeOperand(OpNo: OldSAddrIdx);
6444 // Update the use list with the pointer we have just moved from vaddr to
6445 // saddr position. Otherwise new vaddr will be missing from the use list.
6446 MRI.removeRegOperandFromUseList(MO: &NewVAddr);
6447 MRI.addRegOperandToUseList(MO: &NewVAddr);
6448 } else {
6449 assert(OldSAddrIdx == NewVAddrIdx);
6450
6451 if (OldVAddrIdx >= 0) {
6452 int NewVDstIn = AMDGPU::getNamedOperandIdx(Opcode: NewOpc,
6453 Name: AMDGPU::OpName::vdst_in);
6454
6455      // removeOperand doesn't try to fix up tied operand indexes as it goes, so
6456      // it asserts. Untie the operands for now and retie them afterwards.
6457 if (NewVDstIn != -1) {
6458 int OldVDstIn = AMDGPU::getNamedOperandIdx(Opcode: Opc, Name: AMDGPU::OpName::vdst_in);
6459 Inst.untieRegOperand(OpIdx: OldVDstIn);
6460 }
6461
6462 Inst.removeOperand(OpNo: OldVAddrIdx);
6463
6464 if (NewVDstIn != -1) {
6465 int NewVDst = AMDGPU::getNamedOperandIdx(Opcode: NewOpc, Name: AMDGPU::OpName::vdst);
6466 Inst.tieOperands(DefIdx: NewVDst, UseIdx: NewVDstIn);
6467 }
6468 }
6469 }
6470
6471 if (VAddrDef && MRI.use_nodbg_empty(RegNo: VAddrDef->getOperand(i: 0).getReg()))
6472 VAddrDef->eraseFromParent();
6473
6474 return true;
6475}
6476
6477// FIXME: Remove this when SelectionDAG is obsoleted.
6478void SIInstrInfo::legalizeOperandsFLAT(MachineRegisterInfo &MRI,
6479 MachineInstr &MI) const {
6480 if (!isSegmentSpecificFLAT(MI))
6481 return;
6482
6483  // Fix up SGPR operands that ended up in VGPRs. We only select these when the
6484  // DAG divergence analysis thinks they are uniform, so a readfirstlane should be valid.
6485 MachineOperand *SAddr = getNamedOperand(MI, OperandName: AMDGPU::OpName::saddr);
6486 if (!SAddr || RI.isSGPRClass(RC: MRI.getRegClass(Reg: SAddr->getReg())))
6487 return;
6488
6489 if (moveFlatAddrToVGPR(Inst&: MI))
6490 return;
6491
6492 const TargetRegisterClass *DeclaredRC = getRegClass(
6493 TID: MI.getDesc(), OpNum: SAddr->getOperandNo(), TRI: &RI, MF: *MI.getParent()->getParent());
6494
6495 Register ToSGPR = readlaneVGPRToSGPR(SrcReg: SAddr->getReg(), UseMI&: MI, MRI, DstRC: DeclaredRC);
6496 SAddr->setReg(ToSGPR);
6497}
6498
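// Make \p Op compatible with register class \p DstRC by inserting a COPY into a
// new virtual register of that class at \p I and rewriting the operand to use
// it. Immediate-defined sources may be folded into the copy, and copies into
// non-SGPR classes are marked with an implicit EXEC read when needed.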
6499void SIInstrInfo::legalizeGenericOperand(MachineBasicBlock &InsertMBB,
6500 MachineBasicBlock::iterator I,
6501 const TargetRegisterClass *DstRC,
6502 MachineOperand &Op,
6503 MachineRegisterInfo &MRI,
6504 const DebugLoc &DL) const {
6505 Register OpReg = Op.getReg();
6506 unsigned OpSubReg = Op.getSubReg();
6507
6508 const TargetRegisterClass *OpRC = RI.getSubClassWithSubReg(
6509 RI.getRegClassForReg(MRI, Reg: OpReg), OpSubReg);
6510
6511 // Check if operand is already the correct register class.
6512 if (DstRC == OpRC)
6513 return;
6514
6515 Register DstReg = MRI.createVirtualRegister(RegClass: DstRC);
6516 auto Copy =
6517 BuildMI(BB&: InsertMBB, I, MIMD: DL, MCID: get(Opcode: AMDGPU::COPY), DestReg: DstReg).addReg(RegNo: OpReg);
6518 Op.setReg(DstReg);
6519
6520 MachineInstr *Def = MRI.getVRegDef(Reg: OpReg);
6521 if (!Def)
6522 return;
6523
6524 // Try to eliminate the copy if it is copying an immediate value.
6525 if (Def->isMoveImmediate() && DstRC != &AMDGPU::VReg_1RegClass)
6526 foldImmediate(UseMI&: *Copy, DefMI&: *Def, Reg: OpReg, MRI: &MRI);
6527
6528 bool ImpDef = Def->isImplicitDef();
6529 while (!ImpDef && Def && Def->isCopy()) {
6530 if (Def->getOperand(i: 1).getReg().isPhysical())
6531 break;
6532 Def = MRI.getUniqueVRegDef(Reg: Def->getOperand(i: 1).getReg());
6533 ImpDef = Def && Def->isImplicitDef();
6534 }
6535 if (!RI.isSGPRClass(RC: DstRC) && !Copy->readsRegister(Reg: AMDGPU::EXEC, TRI: &RI) &&
6536 !ImpDef)
6537 Copy.addReg(RegNo: AMDGPU::EXEC, flags: RegState::Implicit);
6538}
6539
6540// Emit the actual waterfall loop, executing the wrapped instruction for each
6541// unique value of \p ScalarOps across all lanes. In the best case we execute 1
6542// iteration, in the worst case one iteration per lane (e.g. 64 for wave64).
6543static void
6544emitLoadScalarOpsFromVGPRLoop(const SIInstrInfo &TII,
6545 MachineRegisterInfo &MRI,
6546 MachineBasicBlock &LoopBB,
6547 MachineBasicBlock &BodyBB,
6548 const DebugLoc &DL,
6549 ArrayRef<MachineOperand *> ScalarOps) {
6550 MachineFunction &MF = *LoopBB.getParent();
6551 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
6552 const SIRegisterInfo *TRI = ST.getRegisterInfo();
6553 unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
6554 unsigned SaveExecOpc =
6555 ST.isWave32() ? AMDGPU::S_AND_SAVEEXEC_B32 : AMDGPU::S_AND_SAVEEXEC_B64;
6556 unsigned XorTermOpc =
6557 ST.isWave32() ? AMDGPU::S_XOR_B32_term : AMDGPU::S_XOR_B64_term;
6558 unsigned AndOpc =
6559 ST.isWave32() ? AMDGPU::S_AND_B32 : AMDGPU::S_AND_B64;
6560 const auto *BoolXExecRC = TRI->getWaveMaskRegClass();
6561
6562 MachineBasicBlock::iterator I = LoopBB.begin();
6563 Register CondReg;
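  // CondReg accumulates the AND of the per-operand comparisons: the lanes whose
  // values match all of the readfirstlane results execute this iteration.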
6564
6565 for (MachineOperand *ScalarOp : ScalarOps) {
6566 unsigned RegSize = TRI->getRegSizeInBits(Reg: ScalarOp->getReg(), MRI);
6567 unsigned NumSubRegs = RegSize / 32;
6568 Register VScalarOp = ScalarOp->getReg();
6569
6570 if (NumSubRegs == 1) {
6571 Register CurReg = MRI.createVirtualRegister(RegClass: &AMDGPU::SReg_32_XM0RegClass);
6572
6573 BuildMI(BB&: LoopBB, I, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::V_READFIRSTLANE_B32), DestReg: CurReg)
6574 .addReg(RegNo: VScalarOp);
6575
6576 Register NewCondReg = MRI.createVirtualRegister(RegClass: BoolXExecRC);
6577
6578 BuildMI(BB&: LoopBB, I, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::V_CMP_EQ_U32_e64), DestReg: NewCondReg)
6579 .addReg(RegNo: CurReg)
6580 .addReg(RegNo: VScalarOp);
6581
6582 // Combine the comparison results with AND.
6583 if (!CondReg) // First.
6584 CondReg = NewCondReg;
6585 else { // If not the first, we create an AND.
6586 Register AndReg = MRI.createVirtualRegister(RegClass: BoolXExecRC);
6587 BuildMI(BB&: LoopBB, I, MIMD: DL, MCID: TII.get(Opcode: AndOpc), DestReg: AndReg)
6588 .addReg(RegNo: CondReg)
6589 .addReg(RegNo: NewCondReg);
6590 CondReg = AndReg;
6591 }
6592
6593 // Update ScalarOp operand to use the SGPR ScalarOp.
6594 ScalarOp->setReg(CurReg);
6595 ScalarOp->setIsKill();
6596 } else {
6597 SmallVector<Register, 8> ReadlanePieces;
6598 unsigned VScalarOpUndef = getUndefRegState(B: ScalarOp->isUndef());
6599 assert(NumSubRegs % 2 == 0 && NumSubRegs <= 32 &&
6600 "Unhandled register size");
6601
6602 for (unsigned Idx = 0; Idx < NumSubRegs; Idx += 2) {
6603 Register CurRegLo =
6604 MRI.createVirtualRegister(RegClass: &AMDGPU::SReg_32_XM0RegClass);
6605 Register CurRegHi =
6606 MRI.createVirtualRegister(RegClass: &AMDGPU::SReg_32_XM0RegClass);
6607
6608 // Read the next variant <- also loop target.
6609 BuildMI(BB&: LoopBB, I, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::V_READFIRSTLANE_B32), DestReg: CurRegLo)
6610 .addReg(RegNo: VScalarOp, flags: VScalarOpUndef, SubReg: TRI->getSubRegFromChannel(Channel: Idx));
6611
6612        // Read the high half of the same 64-bit piece.
6613 BuildMI(BB&: LoopBB, I, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::V_READFIRSTLANE_B32), DestReg: CurRegHi)
6614 .addReg(RegNo: VScalarOp, flags: VScalarOpUndef,
6615 SubReg: TRI->getSubRegFromChannel(Channel: Idx + 1));
6616
6617 ReadlanePieces.push_back(Elt: CurRegLo);
6618 ReadlanePieces.push_back(Elt: CurRegHi);
6619
6620 // Comparison is to be done as 64-bit.
6621 Register CurReg = MRI.createVirtualRegister(RegClass: &AMDGPU::SGPR_64RegClass);
6622 BuildMI(BB&: LoopBB, I, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::REG_SEQUENCE), DestReg: CurReg)
6623 .addReg(RegNo: CurRegLo)
6624 .addImm(Val: AMDGPU::sub0)
6625 .addReg(RegNo: CurRegHi)
6626 .addImm(Val: AMDGPU::sub1);
6627
6628 Register NewCondReg = MRI.createVirtualRegister(RegClass: BoolXExecRC);
6629 auto Cmp = BuildMI(BB&: LoopBB, I, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::V_CMP_EQ_U64_e64),
6630 DestReg: NewCondReg)
6631 .addReg(RegNo: CurReg);
6632 if (NumSubRegs <= 2)
6633 Cmp.addReg(RegNo: VScalarOp);
6634 else
6635 Cmp.addReg(RegNo: VScalarOp, flags: VScalarOpUndef,
6636 SubReg: TRI->getSubRegFromChannel(Channel: Idx, NumRegs: 2));
6637
6638 // Combine the comparison results with AND.
6639 if (!CondReg) // First.
6640 CondReg = NewCondReg;
6641 else { // If not the first, we create an AND.
6642 Register AndReg = MRI.createVirtualRegister(RegClass: BoolXExecRC);
6643 BuildMI(BB&: LoopBB, I, MIMD: DL, MCID: TII.get(Opcode: AndOpc), DestReg: AndReg)
6644 .addReg(RegNo: CondReg)
6645 .addReg(RegNo: NewCondReg);
6646 CondReg = AndReg;
6647 }
6648 } // End for loop.
6649
6650 const auto *SScalarOpRC =
6651 TRI->getEquivalentSGPRClass(VRC: MRI.getRegClass(Reg: VScalarOp));
6652 Register SScalarOp = MRI.createVirtualRegister(RegClass: SScalarOpRC);
6653
6654 // Build scalar ScalarOp.
6655 auto Merge =
6656 BuildMI(BB&: LoopBB, I, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::REG_SEQUENCE), DestReg: SScalarOp);
6657 unsigned Channel = 0;
6658 for (Register Piece : ReadlanePieces) {
6659 Merge.addReg(RegNo: Piece).addImm(Val: TRI->getSubRegFromChannel(Channel: Channel++));
6660 }
6661
6662 // Update ScalarOp operand to use the SGPR ScalarOp.
6663 ScalarOp->setReg(SScalarOp);
6664 ScalarOp->setIsKill();
6665 }
6666 }
6667
6668 Register SaveExec = MRI.createVirtualRegister(RegClass: BoolXExecRC);
6669 MRI.setSimpleHint(VReg: SaveExec, PrefReg: CondReg);
6670
6671 // Update EXEC to matching lanes, saving original to SaveExec.
6672 BuildMI(BB&: LoopBB, I, MIMD: DL, MCID: TII.get(Opcode: SaveExecOpc), DestReg: SaveExec)
6673 .addReg(RegNo: CondReg, flags: RegState::Kill);
6674
6675 // The original instruction is here; we insert the terminators after it.
6676 I = BodyBB.end();
6677
6678 // Update EXEC, switch all done bits to 0 and all todo bits to 1.
6679 BuildMI(BB&: BodyBB, I, MIMD: DL, MCID: TII.get(Opcode: XorTermOpc), DestReg: Exec)
6680 .addReg(RegNo: Exec)
6681 .addReg(RegNo: SaveExec);
6682
6683 BuildMI(BB&: BodyBB, I, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::SI_WATERFALL_LOOP)).addMBB(MBB: &LoopBB);
6684}
6685
6686// Build a waterfall loop around \p MI, replacing the VGPR \p ScalarOps registers
6687// with SGPRs by iterating over all unique values across all lanes.
6688// Returns the loop basic block that now contains \p MI.
6689static MachineBasicBlock *
6690loadMBUFScalarOperandsFromVGPR(const SIInstrInfo &TII, MachineInstr &MI,
6691 ArrayRef<MachineOperand *> ScalarOps,
6692 MachineDominatorTree *MDT,
6693 MachineBasicBlock::iterator Begin = nullptr,
6694 MachineBasicBlock::iterator End = nullptr) {
6695 MachineBasicBlock &MBB = *MI.getParent();
6696 MachineFunction &MF = *MBB.getParent();
6697 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
6698 const SIRegisterInfo *TRI = ST.getRegisterInfo();
6699 MachineRegisterInfo &MRI = MF.getRegInfo();
6700 if (!Begin.isValid())
6701 Begin = &MI;
6702 if (!End.isValid()) {
6703 End = &MI;
6704 ++End;
6705 }
6706 const DebugLoc &DL = MI.getDebugLoc();
6707 unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
6708 unsigned MovExecOpc = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
6709 const auto *BoolXExecRC = TRI->getWaveMaskRegClass();
6710
6711 // Save SCC. Waterfall Loop may overwrite SCC.
6712 Register SaveSCCReg;
6713
6714  // FIXME: We should maintain SCC liveness while doing the FixSGPRCopies walk
6715  // rather than doing an unlimited scan everywhere.
6716 bool SCCNotDead =
6717 MBB.computeRegisterLiveness(TRI, Reg: AMDGPU::SCC, Before: MI,
6718 Neighborhood: std::numeric_limits<unsigned>::max()) !=
6719 MachineBasicBlock::LQR_Dead;
6720 if (SCCNotDead) {
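    // Materialize the current SCC value as 0/1 so it can be re-created with
    // S_CMP_LG_U32 after the waterfall loop.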
6721 SaveSCCReg = MRI.createVirtualRegister(RegClass: &AMDGPU::SReg_32RegClass);
6722 BuildMI(BB&: MBB, I: Begin, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::S_CSELECT_B32), DestReg: SaveSCCReg)
6723 .addImm(Val: 1)
6724 .addImm(Val: 0);
6725 }
6726
6727 Register SaveExec = MRI.createVirtualRegister(RegClass: BoolXExecRC);
6728
6729 // Save the EXEC mask
6730 BuildMI(BB&: MBB, I: Begin, MIMD: DL, MCID: TII.get(Opcode: MovExecOpc), DestReg: SaveExec).addReg(RegNo: Exec);
6731
6732 // Killed uses in the instruction we are waterfalling around will be
6733 // incorrect due to the added control-flow.
6734 MachineBasicBlock::iterator AfterMI = MI;
6735 ++AfterMI;
6736 for (auto I = Begin; I != AfterMI; I++) {
6737 for (auto &MO : I->all_uses())
6738 MRI.clearKillFlags(Reg: MO.getReg());
6739 }
6740
6741 // To insert the loop we need to split the block. Move everything after this
6742 // point to a new block, and insert a new empty block between the two.
6743 MachineBasicBlock *LoopBB = MF.CreateMachineBasicBlock();
6744 MachineBasicBlock *BodyBB = MF.CreateMachineBasicBlock();
6745 MachineBasicBlock *RemainderBB = MF.CreateMachineBasicBlock();
6746 MachineFunction::iterator MBBI(MBB);
6747 ++MBBI;
6748
6749 MF.insert(MBBI, MBB: LoopBB);
6750 MF.insert(MBBI, MBB: BodyBB);
6751 MF.insert(MBBI, MBB: RemainderBB);
6752
6753 LoopBB->addSuccessor(Succ: BodyBB);
6754 BodyBB->addSuccessor(Succ: LoopBB);
6755 BodyBB->addSuccessor(Succ: RemainderBB);
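  // The waterfall iterates LoopBB -> BodyBB until every unique value has been
  // processed, then control falls through from BodyBB to RemainderBB.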
6756
6757  // Move the instructions in [Begin, End) into BodyBB, and the remainder of the
6758  // block into RemainderBB.
6759 RemainderBB->transferSuccessorsAndUpdatePHIs(FromMBB: &MBB);
6760 RemainderBB->splice(Where: RemainderBB->begin(), Other: &MBB, From: End, To: MBB.end());
6761 BodyBB->splice(Where: BodyBB->begin(), Other: &MBB, From: Begin, To: MBB.end());
6762
6763 MBB.addSuccessor(Succ: LoopBB);
6764
6765 // Update dominators. We know that MBB immediately dominates LoopBB, that
6766 // LoopBB immediately dominates BodyBB, and BodyBB immediately dominates
6767 // RemainderBB. RemainderBB immediately dominates all of the successors
6768 // transferred to it from MBB that MBB used to properly dominate.
6769 if (MDT) {
6770 MDT->addNewBlock(BB: LoopBB, DomBB: &MBB);
6771 MDT->addNewBlock(BB: BodyBB, DomBB: LoopBB);
6772 MDT->addNewBlock(BB: RemainderBB, DomBB: BodyBB);
6773 for (auto &Succ : RemainderBB->successors()) {
6774 if (MDT->properlyDominates(A: &MBB, B: Succ)) {
6775 MDT->changeImmediateDominator(BB: Succ, NewBB: RemainderBB);
6776 }
6777 }
6778 }
6779
6780 emitLoadScalarOpsFromVGPRLoop(TII, MRI, LoopBB&: *LoopBB, BodyBB&: *BodyBB, DL, ScalarOps);
6781
6782 MachineBasicBlock::iterator First = RemainderBB->begin();
6783 // Restore SCC
6784 if (SCCNotDead) {
6785 BuildMI(BB&: *RemainderBB, I: First, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::S_CMP_LG_U32))
6786 .addReg(RegNo: SaveSCCReg, flags: RegState::Kill)
6787 .addImm(Val: 0);
6788 }
6789
6790 // Restore the EXEC mask
6791 BuildMI(BB&: *RemainderBB, I: First, MIMD: DL, MCID: TII.get(Opcode: MovExecOpc), DestReg: Exec).addReg(RegNo: SaveExec);
6792 return BodyBB;
6793}
6794
6795// Extract pointer from Rsrc and return a zero-value Rsrc replacement.
6796static std::tuple<unsigned, unsigned>
6797extractRsrcPtr(const SIInstrInfo &TII, MachineInstr &MI, MachineOperand &Rsrc) {
6798 MachineBasicBlock &MBB = *MI.getParent();
6799 MachineFunction &MF = *MBB.getParent();
6800 MachineRegisterInfo &MRI = MF.getRegInfo();
6801
6802 // Extract the ptr from the resource descriptor.
6803 unsigned RsrcPtr =
6804 TII.buildExtractSubReg(MI, MRI, SuperReg: Rsrc, SuperRC: &AMDGPU::VReg_128RegClass,
6805 SubIdx: AMDGPU::sub0_sub1, SubRC: &AMDGPU::VReg_64RegClass);
6806
6807 // Create an empty resource descriptor
6808 Register Zero64 = MRI.createVirtualRegister(RegClass: &AMDGPU::SReg_64RegClass);
6809 Register SRsrcFormatLo = MRI.createVirtualRegister(RegClass: &AMDGPU::SGPR_32RegClass);
6810 Register SRsrcFormatHi = MRI.createVirtualRegister(RegClass: &AMDGPU::SGPR_32RegClass);
6811 Register NewSRsrc = MRI.createVirtualRegister(RegClass: &AMDGPU::SGPR_128RegClass);
6812 uint64_t RsrcDataFormat = TII.getDefaultRsrcDataFormat();
6813
6814 // Zero64 = 0
6815 BuildMI(BB&: MBB, I&: MI, MIMD: MI.getDebugLoc(), MCID: TII.get(Opcode: AMDGPU::S_MOV_B64), DestReg: Zero64)
6816 .addImm(Val: 0);
6817
6818 // SRsrcFormatLo = RSRC_DATA_FORMAT{31-0}
6819 BuildMI(BB&: MBB, I&: MI, MIMD: MI.getDebugLoc(), MCID: TII.get(Opcode: AMDGPU::S_MOV_B32), DestReg: SRsrcFormatLo)
6820 .addImm(Val: Lo_32(Value: RsrcDataFormat));
6821
6822 // SRsrcFormatHi = RSRC_DATA_FORMAT{63-32}
6823 BuildMI(BB&: MBB, I&: MI, MIMD: MI.getDebugLoc(), MCID: TII.get(Opcode: AMDGPU::S_MOV_B32), DestReg: SRsrcFormatHi)
6824 .addImm(Val: Hi_32(Value: RsrcDataFormat));
6825
6826 // NewSRsrc = {Zero64, SRsrcFormat}
6827 BuildMI(BB&: MBB, I&: MI, MIMD: MI.getDebugLoc(), MCID: TII.get(Opcode: AMDGPU::REG_SEQUENCE), DestReg: NewSRsrc)
6828 .addReg(RegNo: Zero64)
6829 .addImm(Val: AMDGPU::sub0_sub1)
6830 .addReg(RegNo: SRsrcFormatLo)
6831 .addImm(Val: AMDGPU::sub2)
6832 .addReg(RegNo: SRsrcFormatHi)
6833 .addImm(Val: AMDGPU::sub3);
6834
6835 return std::tuple(RsrcPtr, NewSRsrc);
6836}
6837
6838MachineBasicBlock *
6839SIInstrInfo::legalizeOperands(MachineInstr &MI,
6840 MachineDominatorTree *MDT) const {
6841 MachineFunction &MF = *MI.getParent()->getParent();
6842 MachineRegisterInfo &MRI = MF.getRegInfo();
6843 MachineBasicBlock *CreatedBB = nullptr;
6844
6845 // Legalize VOP2
6846 if (isVOP2(MI) || isVOPC(MI)) {
6847 legalizeOperandsVOP2(MRI, MI);
6848 return CreatedBB;
6849 }
6850
6851 // Legalize VOP3
6852 if (isVOP3(MI)) {
6853 legalizeOperandsVOP3(MRI, MI);
6854 return CreatedBB;
6855 }
6856
6857 // Legalize SMRD
6858 if (isSMRD(MI)) {
6859 legalizeOperandsSMRD(MRI, MI);
6860 return CreatedBB;
6861 }
6862
6863 // Legalize FLAT
6864 if (isFLAT(MI)) {
6865 legalizeOperandsFLAT(MRI, MI);
6866 return CreatedBB;
6867 }
6868
6869 // Legalize REG_SEQUENCE and PHI
6870  // The register class of the operands must be the same type as the register
6871  // class of the output.
6872 if (MI.getOpcode() == AMDGPU::PHI) {
6873 const TargetRegisterClass *RC = nullptr, *SRC = nullptr, *VRC = nullptr;
6874 for (unsigned i = 1, e = MI.getNumOperands(); i != e; i += 2) {
6875 if (!MI.getOperand(i).isReg() || !MI.getOperand(i).getReg().isVirtual())
6876 continue;
6877 const TargetRegisterClass *OpRC =
6878 MRI.getRegClass(Reg: MI.getOperand(i).getReg());
6879 if (RI.hasVectorRegisters(RC: OpRC)) {
6880 VRC = OpRC;
6881 } else {
6882 SRC = OpRC;
6883 }
6884 }
6885
6886    // If any of the operands are VGPR registers, then they all must be VGPRs;
6887    // otherwise we will create illegal VGPR->SGPR copies when legalizing
6888    // them.
6889 if (VRC || !RI.isSGPRClass(RC: getOpRegClass(MI, OpNo: 0))) {
6890 if (!VRC) {
6891 assert(SRC);
6892 if (getOpRegClass(MI, OpNo: 0) == &AMDGPU::VReg_1RegClass) {
6893 VRC = &AMDGPU::VReg_1RegClass;
6894 } else
6895 VRC = RI.isAGPRClass(RC: getOpRegClass(MI, OpNo: 0))
6896 ? RI.getEquivalentAGPRClass(SRC)
6897 : RI.getEquivalentVGPRClass(SRC);
6898 } else {
6899 VRC = RI.isAGPRClass(RC: getOpRegClass(MI, OpNo: 0))
6900 ? RI.getEquivalentAGPRClass(SRC: VRC)
6901 : RI.getEquivalentVGPRClass(SRC: VRC);
6902 }
6903 RC = VRC;
6904 } else {
6905 RC = SRC;
6906 }
6907
6908 // Update all the operands so they have the same type.
6909 for (unsigned I = 1, E = MI.getNumOperands(); I != E; I += 2) {
6910 MachineOperand &Op = MI.getOperand(i: I);
6911 if (!Op.isReg() || !Op.getReg().isVirtual())
6912 continue;
6913
6914 // MI is a PHI instruction.
6915 MachineBasicBlock *InsertBB = MI.getOperand(i: I + 1).getMBB();
6916 MachineBasicBlock::iterator Insert = InsertBB->getFirstTerminator();
6917
6918 // Avoid creating no-op copies with the same src and dst reg class. These
6919 // confuse some of the machine passes.
6920 legalizeGenericOperand(InsertMBB&: *InsertBB, I: Insert, DstRC: RC, Op, MRI, DL: MI.getDebugLoc());
6921 }
6922 }
6923
6924 // REG_SEQUENCE doesn't really require operand legalization, but if one has a
6925 // VGPR dest type and SGPR sources, insert copies so all operands are
6926 // VGPRs. This seems to help operand folding / the register coalescer.
6927 if (MI.getOpcode() == AMDGPU::REG_SEQUENCE) {
6928 MachineBasicBlock *MBB = MI.getParent();
6929 const TargetRegisterClass *DstRC = getOpRegClass(MI, OpNo: 0);
6930 if (RI.hasVGPRs(RC: DstRC)) {
6931 // Update all the operands so they are VGPR register classes. These may
6932 // not be the same register class because REG_SEQUENCE supports mixing
6933 // subregister index types e.g. sub0_sub1 + sub2 + sub3
6934 for (unsigned I = 1, E = MI.getNumOperands(); I != E; I += 2) {
6935 MachineOperand &Op = MI.getOperand(i: I);
6936 if (!Op.isReg() || !Op.getReg().isVirtual())
6937 continue;
6938
6939 const TargetRegisterClass *OpRC = MRI.getRegClass(Reg: Op.getReg());
6940 const TargetRegisterClass *VRC = RI.getEquivalentVGPRClass(SRC: OpRC);
6941 if (VRC == OpRC)
6942 continue;
6943
6944 legalizeGenericOperand(InsertMBB&: *MBB, I: MI, DstRC: VRC, Op, MRI, DL: MI.getDebugLoc());
6945 Op.setIsKill();
6946 }
6947 }
6948
6949 return CreatedBB;
6950 }
6951
6952 // Legalize INSERT_SUBREG
6953 // src0 must have the same register class as dst
6954 if (MI.getOpcode() == AMDGPU::INSERT_SUBREG) {
6955 Register Dst = MI.getOperand(i: 0).getReg();
6956 Register Src0 = MI.getOperand(i: 1).getReg();
6957 const TargetRegisterClass *DstRC = MRI.getRegClass(Reg: Dst);
6958 const TargetRegisterClass *Src0RC = MRI.getRegClass(Reg: Src0);
6959 if (DstRC != Src0RC) {
6960 MachineBasicBlock *MBB = MI.getParent();
6961 MachineOperand &Op = MI.getOperand(i: 1);
6962 legalizeGenericOperand(InsertMBB&: *MBB, I: MI, DstRC, Op, MRI, DL: MI.getDebugLoc());
6963 }
6964 return CreatedBB;
6965 }
6966
6967 // Legalize SI_INIT_M0
6968 if (MI.getOpcode() == AMDGPU::SI_INIT_M0) {
6969 MachineOperand &Src = MI.getOperand(i: 0);
6970 if (Src.isReg() && RI.hasVectorRegisters(RC: MRI.getRegClass(Reg: Src.getReg())))
6971 Src.setReg(readlaneVGPRToSGPR(SrcReg: Src.getReg(), UseMI&: MI, MRI));
6972 return CreatedBB;
6973 }
6974
6975 // Legalize S_BITREPLICATE, S_QUADMASK and S_WQM
6976 if (MI.getOpcode() == AMDGPU::S_BITREPLICATE_B64_B32 ||
6977 MI.getOpcode() == AMDGPU::S_QUADMASK_B32 ||
6978 MI.getOpcode() == AMDGPU::S_QUADMASK_B64 ||
6979 MI.getOpcode() == AMDGPU::S_WQM_B32 ||
6980 MI.getOpcode() == AMDGPU::S_WQM_B64 ||
6981 MI.getOpcode() == AMDGPU::S_INVERSE_BALLOT_U32 ||
6982 MI.getOpcode() == AMDGPU::S_INVERSE_BALLOT_U64) {
6983 MachineOperand &Src = MI.getOperand(i: 1);
6984 if (Src.isReg() && RI.hasVectorRegisters(RC: MRI.getRegClass(Reg: Src.getReg())))
6985 Src.setReg(readlaneVGPRToSGPR(SrcReg: Src.getReg(), UseMI&: MI, MRI));
6986 return CreatedBB;
6987 }
6988
6989 // Legalize MIMG/VIMAGE/VSAMPLE and MUBUF/MTBUF for shaders.
6990 //
6991 // Shaders only generate MUBUF/MTBUF instructions via intrinsics or via
6992 // scratch memory access. In both cases, the legalization never involves
6993 // conversion to the addr64 form.
6994 if (isImage(MI) || (AMDGPU::isGraphics(CC: MF.getFunction().getCallingConv()) &&
6995 (isMUBUF(MI) || isMTBUF(MI)))) {
6996 AMDGPU::OpName RSrcOpName = (isVIMAGE(MI) || isVSAMPLE(MI))
6997 ? AMDGPU::OpName::rsrc
6998 : AMDGPU::OpName::srsrc;
6999 MachineOperand *SRsrc = getNamedOperand(MI, OperandName: RSrcOpName);
7000 if (SRsrc && !RI.isSGPRClass(RC: MRI.getRegClass(Reg: SRsrc->getReg())))
7001 CreatedBB = loadMBUFScalarOperandsFromVGPR(TII: *this, MI, ScalarOps: {SRsrc}, MDT);
7002
7003 AMDGPU::OpName SampOpName =
7004 isMIMG(MI) ? AMDGPU::OpName::ssamp : AMDGPU::OpName::samp;
7005 MachineOperand *SSamp = getNamedOperand(MI, OperandName: SampOpName);
7006 if (SSamp && !RI.isSGPRClass(RC: MRI.getRegClass(Reg: SSamp->getReg())))
7007 CreatedBB = loadMBUFScalarOperandsFromVGPR(TII: *this, MI, ScalarOps: {SSamp}, MDT);
7008
7009 return CreatedBB;
7010 }
7011
7012 // Legalize SI_CALL
7013 if (MI.getOpcode() == AMDGPU::SI_CALL_ISEL) {
7014 MachineOperand *Dest = &MI.getOperand(i: 0);
7015 if (!RI.isSGPRClass(RC: MRI.getRegClass(Reg: Dest->getReg()))) {
7016      // Move everything between ADJCALLSTACKUP and ADJCALLSTACKDOWN and the
7017      // following copies into the loop block; copies to and from physical
7018      // registers need to move along with the call.
7019 unsigned FrameSetupOpcode = getCallFrameSetupOpcode();
7020 unsigned FrameDestroyOpcode = getCallFrameDestroyOpcode();
7021
7022 // Also move the copies to physical registers into the loop block
7023 MachineBasicBlock &MBB = *MI.getParent();
7024 MachineBasicBlock::iterator Start(&MI);
7025 while (Start->getOpcode() != FrameSetupOpcode)
7026 --Start;
7027 MachineBasicBlock::iterator End(&MI);
7028 while (End->getOpcode() != FrameDestroyOpcode)
7029 ++End;
7030 // Also include following copies of the return value
7031 ++End;
7032 while (End != MBB.end() && End->isCopy() && End->getOperand(i: 1).isReg() &&
7033 MI.definesRegister(Reg: End->getOperand(i: 1).getReg(), /*TRI=*/nullptr))
7034 ++End;
7035 CreatedBB =
7036 loadMBUFScalarOperandsFromVGPR(TII: *this, MI, ScalarOps: {Dest}, MDT, Begin: Start, End);
7037 }
7038 }
7039
7040 // Legalize s_sleep_var.
7041 if (MI.getOpcode() == AMDGPU::S_SLEEP_VAR) {
7042 const DebugLoc &DL = MI.getDebugLoc();
7043 Register Reg = MRI.createVirtualRegister(RegClass: &AMDGPU::SReg_32_XM0RegClass);
7044 int Src0Idx =
7045 AMDGPU::getNamedOperandIdx(Opcode: MI.getOpcode(), Name: AMDGPU::OpName::src0);
7046 MachineOperand &Src0 = MI.getOperand(i: Src0Idx);
7047 BuildMI(BB&: *MI.getParent(), I&: MI, MIMD: DL, MCID: get(Opcode: AMDGPU::V_READFIRSTLANE_B32), DestReg: Reg)
7048 .add(MO: Src0);
7049 Src0.ChangeToRegister(Reg, isDef: false);
7050 return nullptr;
7051 }
7052
7053 // Legalize TENSOR_LOAD_TO_LDS, TENSOR_LOAD_TO_LDS_D2, TENSOR_STORE_FROM_LDS,
7054 // TENSOR_STORE_FROM_LDS_D2. All their operands are scalar.
7055 if (MI.getOpcode() == AMDGPU::TENSOR_LOAD_TO_LDS ||
7056 MI.getOpcode() == AMDGPU::TENSOR_LOAD_TO_LDS_D2 ||
7057 MI.getOpcode() == AMDGPU::TENSOR_STORE_FROM_LDS ||
7058 MI.getOpcode() == AMDGPU::TENSOR_STORE_FROM_LDS_D2) {
7059 for (MachineOperand &Src : MI.explicit_operands()) {
7060 if (Src.isReg() && RI.hasVectorRegisters(RC: MRI.getRegClass(Reg: Src.getReg())))
7061 Src.setReg(readlaneVGPRToSGPR(SrcReg: Src.getReg(), UseMI&: MI, MRI));
7062 }
7063 return CreatedBB;
7064 }
7065
7066 // Legalize MUBUF instructions.
7067 bool isSoffsetLegal = true;
7068 int SoffsetIdx =
7069 AMDGPU::getNamedOperandIdx(Opcode: MI.getOpcode(), Name: AMDGPU::OpName::soffset);
7070 if (SoffsetIdx != -1) {
7071 MachineOperand *Soffset = &MI.getOperand(i: SoffsetIdx);
7072 if (Soffset->isReg() && Soffset->getReg().isVirtual() &&
7073 !RI.isSGPRClass(RC: MRI.getRegClass(Reg: Soffset->getReg()))) {
7074 isSoffsetLegal = false;
7075 }
7076 }
7077
7078 bool isRsrcLegal = true;
7079 int RsrcIdx =
7080 AMDGPU::getNamedOperandIdx(Opcode: MI.getOpcode(), Name: AMDGPU::OpName::srsrc);
7081 if (RsrcIdx != -1) {
7082 MachineOperand *Rsrc = &MI.getOperand(i: RsrcIdx);
7083 if (Rsrc->isReg() && !RI.isSGPRReg(MRI, Reg: Rsrc->getReg()))
7084 isRsrcLegal = false;
7085 }
7086
7087 // The operands are legal.
7088 if (isRsrcLegal && isSoffsetLegal)
7089 return CreatedBB;
7090
7091 if (!isRsrcLegal) {
7092 // Legalize a VGPR Rsrc
7093 //
7094 // If the instruction is _ADDR64, we can avoid a waterfall by extracting
7095 // the base pointer from the VGPR Rsrc, adding it to the VAddr, then using
7096 // a zero-value SRsrc.
7097 //
7098 // If the instruction is _OFFSET (both idxen and offen disabled), and we
7099 // support ADDR64 instructions, we can convert to ADDR64 and do the same as
7100 // above.
7101 //
7102 // Otherwise we are on non-ADDR64 hardware, and/or we have
7103 // idxen/offen/bothen and we fall back to a waterfall loop.
7104
7105 MachineOperand *Rsrc = &MI.getOperand(i: RsrcIdx);
7106 MachineBasicBlock &MBB = *MI.getParent();
7107
7108 MachineOperand *VAddr = getNamedOperand(MI, OperandName: AMDGPU::OpName::vaddr);
7109 if (VAddr && AMDGPU::getIfAddr64Inst(Opcode: MI.getOpcode()) != -1) {
7110 // This is already an ADDR64 instruction so we need to add the pointer
7111 // extracted from the resource descriptor to the current value of VAddr.
7112 Register NewVAddrLo = MRI.createVirtualRegister(RegClass: &AMDGPU::VGPR_32RegClass);
7113 Register NewVAddrHi = MRI.createVirtualRegister(RegClass: &AMDGPU::VGPR_32RegClass);
7114 Register NewVAddr = MRI.createVirtualRegister(RegClass: &AMDGPU::VReg_64RegClass);
7115
7116 const auto *BoolXExecRC = RI.getWaveMaskRegClass();
7117 Register CondReg0 = MRI.createVirtualRegister(RegClass: BoolXExecRC);
7118 Register CondReg1 = MRI.createVirtualRegister(RegClass: BoolXExecRC);
7119
7120 unsigned RsrcPtr, NewSRsrc;
7121 std::tie(args&: RsrcPtr, args&: NewSRsrc) = extractRsrcPtr(TII: *this, MI, Rsrc&: *Rsrc);
7122
7123 // NewVaddrLo = RsrcPtr:sub0 + VAddr:sub0
7124 const DebugLoc &DL = MI.getDebugLoc();
7125 BuildMI(BB&: MBB, I&: MI, MIMD: DL, MCID: get(Opcode: AMDGPU::V_ADD_CO_U32_e64), DestReg: NewVAddrLo)
7126 .addDef(RegNo: CondReg0)
7127 .addReg(RegNo: RsrcPtr, flags: 0, SubReg: AMDGPU::sub0)
7128 .addReg(RegNo: VAddr->getReg(), flags: 0, SubReg: AMDGPU::sub0)
7129 .addImm(Val: 0);
7130
7131 // NewVaddrHi = RsrcPtr:sub1 + VAddr:sub1
7132 BuildMI(BB&: MBB, I&: MI, MIMD: DL, MCID: get(Opcode: AMDGPU::V_ADDC_U32_e64), DestReg: NewVAddrHi)
7133 .addDef(RegNo: CondReg1, Flags: RegState::Dead)
7134 .addReg(RegNo: RsrcPtr, flags: 0, SubReg: AMDGPU::sub1)
7135 .addReg(RegNo: VAddr->getReg(), flags: 0, SubReg: AMDGPU::sub1)
7136 .addReg(RegNo: CondReg0, flags: RegState::Kill)
7137 .addImm(Val: 0);
7138
7139 // NewVaddr = {NewVaddrHi, NewVaddrLo}
7140 BuildMI(BB&: MBB, I&: MI, MIMD: MI.getDebugLoc(), MCID: get(Opcode: AMDGPU::REG_SEQUENCE), DestReg: NewVAddr)
7141 .addReg(RegNo: NewVAddrLo)
7142 .addImm(Val: AMDGPU::sub0)
7143 .addReg(RegNo: NewVAddrHi)
7144 .addImm(Val: AMDGPU::sub1);
7145
7146 VAddr->setReg(NewVAddr);
7147 Rsrc->setReg(NewSRsrc);
7148 } else if (!VAddr && ST.hasAddr64()) {
7149      // This instruction is the _OFFSET variant, so we need to convert it to
7150 // ADDR64.
7151 assert(ST.getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS &&
7152 "FIXME: Need to emit flat atomics here");
7153
7154 unsigned RsrcPtr, NewSRsrc;
7155 std::tie(args&: RsrcPtr, args&: NewSRsrc) = extractRsrcPtr(TII: *this, MI, Rsrc&: *Rsrc);
7156
7157 Register NewVAddr = MRI.createVirtualRegister(RegClass: &AMDGPU::VReg_64RegClass);
7158 MachineOperand *VData = getNamedOperand(MI, OperandName: AMDGPU::OpName::vdata);
7159 MachineOperand *Offset = getNamedOperand(MI, OperandName: AMDGPU::OpName::offset);
7160 MachineOperand *SOffset = getNamedOperand(MI, OperandName: AMDGPU::OpName::soffset);
7161 unsigned Addr64Opcode = AMDGPU::getAddr64Inst(Opcode: MI.getOpcode());
7162
7163 // Atomics with return have an additional tied operand and are
7164 // missing some of the special bits.
7165 MachineOperand *VDataIn = getNamedOperand(MI, OperandName: AMDGPU::OpName::vdata_in);
7166 MachineInstr *Addr64;
7167
7168 if (!VDataIn) {
7169 // Regular buffer load / store.
7170 MachineInstrBuilder MIB =
7171 BuildMI(BB&: MBB, I&: MI, MIMD: MI.getDebugLoc(), MCID: get(Opcode: Addr64Opcode))
7172 .add(MO: *VData)
7173 .addReg(RegNo: NewVAddr)
7174 .addReg(RegNo: NewSRsrc)
7175 .add(MO: *SOffset)
7176 .add(MO: *Offset);
7177
7178 if (const MachineOperand *CPol =
7179 getNamedOperand(MI, OperandName: AMDGPU::OpName::cpol)) {
7180 MIB.addImm(Val: CPol->getImm());
7181 }
7182
7183 if (const MachineOperand *TFE =
7184 getNamedOperand(MI, OperandName: AMDGPU::OpName::tfe)) {
7185 MIB.addImm(Val: TFE->getImm());
7186 }
7187
7188 MIB.addImm(Val: getNamedImmOperand(MI, OperandName: AMDGPU::OpName::swz));
7189
7190 MIB.cloneMemRefs(OtherMI: MI);
7191 Addr64 = MIB;
7192 } else {
7193 // Atomics with return.
7194 Addr64 = BuildMI(BB&: MBB, I&: MI, MIMD: MI.getDebugLoc(), MCID: get(Opcode: Addr64Opcode))
7195 .add(MO: *VData)
7196 .add(MO: *VDataIn)
7197 .addReg(RegNo: NewVAddr)
7198 .addReg(RegNo: NewSRsrc)
7199 .add(MO: *SOffset)
7200 .add(MO: *Offset)
7201 .addImm(Val: getNamedImmOperand(MI, OperandName: AMDGPU::OpName::cpol))
7202 .cloneMemRefs(OtherMI: MI);
7203 }
7204
7205 MI.removeFromParent();
7206
7207 // NewVaddr = {NewVaddrHi, NewVaddrLo}
7208 BuildMI(BB&: MBB, I: Addr64, MIMD: Addr64->getDebugLoc(), MCID: get(Opcode: AMDGPU::REG_SEQUENCE),
7209 DestReg: NewVAddr)
7210 .addReg(RegNo: RsrcPtr, flags: 0, SubReg: AMDGPU::sub0)
7211 .addImm(Val: AMDGPU::sub0)
7212 .addReg(RegNo: RsrcPtr, flags: 0, SubReg: AMDGPU::sub1)
7213 .addImm(Val: AMDGPU::sub1);
7214 } else {
7215 // Legalize a VGPR Rsrc and soffset together.
7216 if (!isSoffsetLegal) {
7217 MachineOperand *Soffset = getNamedOperand(MI, OperandName: AMDGPU::OpName::soffset);
7218 CreatedBB =
7219 loadMBUFScalarOperandsFromVGPR(TII: *this, MI, ScalarOps: {Rsrc, Soffset}, MDT);
7220 return CreatedBB;
7221 }
7222 CreatedBB = loadMBUFScalarOperandsFromVGPR(TII: *this, MI, ScalarOps: {Rsrc}, MDT);
7223 return CreatedBB;
7224 }
7225 }
7226
7227 // Legalize a VGPR soffset.
7228 if (!isSoffsetLegal) {
7229 MachineOperand *Soffset = getNamedOperand(MI, OperandName: AMDGPU::OpName::soffset);
7230 CreatedBB = loadMBUFScalarOperandsFromVGPR(TII: *this, MI, ScalarOps: {Soffset}, MDT);
7231 return CreatedBB;
7232 }
7233 return CreatedBB;
7234}
7235
7236void SIInstrWorklist::insert(MachineInstr *MI) {
7237 InstrList.insert(X: MI);
7238  // Add instructions with an srsrc operand (MUBUF etc.) to the deferred list.
7239 int RsrcIdx =
7240 AMDGPU::getNamedOperandIdx(Opcode: MI->getOpcode(), Name: AMDGPU::OpName::srsrc);
7241 if (RsrcIdx != -1) {
7242 DeferredList.insert(X: MI);
7243 }
7244}
7245
7246bool SIInstrWorklist::isDeferred(MachineInstr *MI) {
7247 return DeferredList.contains(key: MI);
7248}
7249
7250// Legalize size mismatches between 16-bit and 32-bit registers in v2s copy
7251// lowering (changing sgpr to vgpr).
7252// This is mainly caused by 16-bit SALU and 16-bit VALU instructions using
7253// registers of different sizes. The operand sizes need to be legalized during
7254// the vgpr lowering chain. This can be removed once sgpr16 is in place.
7255void SIInstrInfo::legalizeOperandsVALUt16(MachineInstr &MI, unsigned OpIdx,
7256 MachineRegisterInfo &MRI) const {
7257 if (!ST.useRealTrue16Insts())
7258 return;
7259
7260 unsigned Opcode = MI.getOpcode();
7261 MachineBasicBlock *MBB = MI.getParent();
7262 // Legalize operands and check for size mismatch
7263 if (!OpIdx || OpIdx >= MI.getNumExplicitOperands() ||
7264 OpIdx >= get(Opcode).getNumOperands())
7265 return;
7266
7267 MachineOperand &Op = MI.getOperand(i: OpIdx);
7268 if (!Op.isReg() || !Op.getReg().isVirtual())
7269 return;
7270
7271 const TargetRegisterClass *CurrRC = MRI.getRegClass(Reg: Op.getReg());
7272 if (!RI.isVGPRClass(RC: CurrRC))
7273 return;
7274
7275 unsigned RCID = get(Opcode).operands()[OpIdx].RegClass;
7276 const TargetRegisterClass *ExpectedRC = RI.getRegClass(RCID);
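  // Two mismatch cases: if the 32-bit value can be viewed as 16 bits through
  // lo16, just narrow the use with a subregister index; if the instruction
  // expects 32 bits but the value is 16-bit, widen it with a REG_SEQUENCE whose
  // high half is undefined.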
7277 if (RI.getMatchingSuperRegClass(A: CurrRC, B: ExpectedRC, Idx: AMDGPU::lo16)) {
7278 Op.setSubReg(AMDGPU::lo16);
7279 } else if (RI.getMatchingSuperRegClass(A: ExpectedRC, B: CurrRC, Idx: AMDGPU::lo16)) {
7280 const DebugLoc &DL = MI.getDebugLoc();
7281 Register NewDstReg = MRI.createVirtualRegister(RegClass: &AMDGPU::VGPR_32RegClass);
7282 Register Undef = MRI.createVirtualRegister(RegClass: &AMDGPU::VGPR_16RegClass);
7283 BuildMI(BB&: *MBB, I&: MI, MIMD: DL, MCID: get(Opcode: AMDGPU::IMPLICIT_DEF), DestReg: Undef);
7284 BuildMI(BB&: *MBB, I&: MI, MIMD: DL, MCID: get(Opcode: AMDGPU::REG_SEQUENCE), DestReg: NewDstReg)
7285 .addReg(RegNo: Op.getReg())
7286 .addImm(Val: AMDGPU::lo16)
7287 .addReg(RegNo: Undef)
7288 .addImm(Val: AMDGPU::hi16);
7289 Op.setReg(NewDstReg);
7290 }
7291}
7292void SIInstrInfo::legalizeOperandsVALUt16(MachineInstr &MI,
7293 MachineRegisterInfo &MRI) const {
7294 for (unsigned OpIdx = 1; OpIdx < MI.getNumExplicitOperands(); OpIdx++)
7295 legalizeOperandsVALUt16(MI, OpIdx, MRI);
7296}
7297
7298void SIInstrInfo::moveToVALU(SIInstrWorklist &Worklist,
7299 MachineDominatorTree *MDT) const {
7300
7301 while (!Worklist.empty()) {
7302 MachineInstr &Inst = *Worklist.top();
7303 Worklist.erase_top();
7304 // Skip MachineInstr in the deferred list.
7305 if (Worklist.isDeferred(MI: &Inst))
7306 continue;
7307 moveToVALUImpl(Worklist, MDT, Inst);
7308 }
7309
7310  // The deferred list of instructions is processed once all the MachineInstrs
7311  // in the worklist are done.
7312 for (MachineInstr *Inst : Worklist.getDeferredList()) {
7313 moveToVALUImpl(Worklist, MDT, Inst&: *Inst);
7314 assert(Worklist.empty() &&
7315 "Deferred MachineInstr are not supposed to re-populate worklist");
7316 }
7317}
7318
7319void SIInstrInfo::moveToVALUImpl(SIInstrWorklist &Worklist,
7320 MachineDominatorTree *MDT,
7321 MachineInstr &Inst) const {
7322
7323 MachineBasicBlock *MBB = Inst.getParent();
7324 if (!MBB)
7325 return;
7326 MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
7327 unsigned Opcode = Inst.getOpcode();
7328 unsigned NewOpcode = getVALUOp(MI: Inst);
7329 // Handle some special cases
7330 switch (Opcode) {
7331 default:
7332 break;
7333 case AMDGPU::S_ADD_I32:
7334 case AMDGPU::S_SUB_I32: {
7335 // FIXME: The u32 versions currently selected use the carry.
7336 bool Changed;
7337 MachineBasicBlock *CreatedBBTmp = nullptr;
7338 std::tie(args&: Changed, args&: CreatedBBTmp) = moveScalarAddSub(Worklist, Inst, MDT);
7339 if (Changed)
7340 return;
7341
7342 // Default handling
7343 break;
7344 }
7345
7346 case AMDGPU::S_MUL_U64:
7347 // Split s_mul_u64 in 32-bit vector multiplications.
7348 splitScalarSMulU64(Worklist, Inst, MDT);
7349 Inst.eraseFromParent();
7350 return;
7351
7352 case AMDGPU::S_MUL_U64_U32_PSEUDO:
7353 case AMDGPU::S_MUL_I64_I32_PSEUDO:
7354 // This is a special case of s_mul_u64 where all the operands are either
7355 // zero extended or sign extended.
7356 splitScalarSMulPseudo(Worklist, Inst, MDT);
7357 Inst.eraseFromParent();
7358 return;
7359
7360 case AMDGPU::S_AND_B64:
7361 splitScalar64BitBinaryOp(Worklist, Inst, Opcode: AMDGPU::S_AND_B32, MDT);
7362 Inst.eraseFromParent();
7363 return;
7364
7365 case AMDGPU::S_OR_B64:
7366 splitScalar64BitBinaryOp(Worklist, Inst, Opcode: AMDGPU::S_OR_B32, MDT);
7367 Inst.eraseFromParent();
7368 return;
7369
7370 case AMDGPU::S_XOR_B64:
7371 splitScalar64BitBinaryOp(Worklist, Inst, Opcode: AMDGPU::S_XOR_B32, MDT);
7372 Inst.eraseFromParent();
7373 return;
7374
7375 case AMDGPU::S_NAND_B64:
7376 splitScalar64BitBinaryOp(Worklist, Inst, Opcode: AMDGPU::S_NAND_B32, MDT);
7377 Inst.eraseFromParent();
7378 return;
7379
7380 case AMDGPU::S_NOR_B64:
7381 splitScalar64BitBinaryOp(Worklist, Inst, Opcode: AMDGPU::S_NOR_B32, MDT);
7382 Inst.eraseFromParent();
7383 return;
7384
7385 case AMDGPU::S_XNOR_B64:
7386 if (ST.hasDLInsts())
7387 splitScalar64BitBinaryOp(Worklist, Inst, Opcode: AMDGPU::S_XNOR_B32, MDT);
7388 else
7389 splitScalar64BitXnor(Worklist, Inst, MDT);
7390 Inst.eraseFromParent();
7391 return;
7392
7393 case AMDGPU::S_ANDN2_B64:
7394 splitScalar64BitBinaryOp(Worklist, Inst, Opcode: AMDGPU::S_ANDN2_B32, MDT);
7395 Inst.eraseFromParent();
7396 return;
7397
7398 case AMDGPU::S_ORN2_B64:
7399 splitScalar64BitBinaryOp(Worklist, Inst, Opcode: AMDGPU::S_ORN2_B32, MDT);
7400 Inst.eraseFromParent();
7401 return;
7402
7403 case AMDGPU::S_BREV_B64:
7404 splitScalar64BitUnaryOp(Worklist, Inst, Opcode: AMDGPU::S_BREV_B32, Swap: true);
7405 Inst.eraseFromParent();
7406 return;
7407
7408 case AMDGPU::S_NOT_B64:
7409 splitScalar64BitUnaryOp(Worklist, Inst, Opcode: AMDGPU::S_NOT_B32);
7410 Inst.eraseFromParent();
7411 return;
7412
7413 case AMDGPU::S_BCNT1_I32_B64:
7414 splitScalar64BitBCNT(Worklist, Inst);
7415 Inst.eraseFromParent();
7416 return;
7417
7418 case AMDGPU::S_BFE_I64:
7419 splitScalar64BitBFE(Worklist, Inst);
7420 Inst.eraseFromParent();
7421 return;
7422
7423 case AMDGPU::S_FLBIT_I32_B64:
7424 splitScalar64BitCountOp(Worklist, Inst, Opcode: AMDGPU::V_FFBH_U32_e32);
7425 Inst.eraseFromParent();
7426 return;
7427 case AMDGPU::S_FF1_I32_B64:
7428 splitScalar64BitCountOp(Worklist, Inst, Opcode: AMDGPU::V_FFBL_B32_e32);
7429 Inst.eraseFromParent();
7430 return;
7431
7432 case AMDGPU::S_LSHL_B32:
7433 if (ST.hasOnlyRevVALUShifts()) {
7434 NewOpcode = AMDGPU::V_LSHLREV_B32_e64;
7435 swapOperands(Inst);
7436 }
7437 break;
7438 case AMDGPU::S_ASHR_I32:
7439 if (ST.hasOnlyRevVALUShifts()) {
7440 NewOpcode = AMDGPU::V_ASHRREV_I32_e64;
7441 swapOperands(Inst);
7442 }
7443 break;
7444 case AMDGPU::S_LSHR_B32:
7445 if (ST.hasOnlyRevVALUShifts()) {
7446 NewOpcode = AMDGPU::V_LSHRREV_B32_e64;
7447 swapOperands(Inst);
7448 }
7449 break;
7450 case AMDGPU::S_LSHL_B64:
7451 if (ST.hasOnlyRevVALUShifts()) {
7452 NewOpcode = ST.getGeneration() >= AMDGPUSubtarget::GFX12
7453 ? AMDGPU::V_LSHLREV_B64_pseudo_e64
7454 : AMDGPU::V_LSHLREV_B64_e64;
7455 swapOperands(Inst);
7456 }
7457 break;
7458 case AMDGPU::S_ASHR_I64:
7459 if (ST.hasOnlyRevVALUShifts()) {
7460 NewOpcode = AMDGPU::V_ASHRREV_I64_e64;
7461 swapOperands(Inst);
7462 }
7463 break;
7464 case AMDGPU::S_LSHR_B64:
7465 if (ST.hasOnlyRevVALUShifts()) {
7466 NewOpcode = AMDGPU::V_LSHRREV_B64_e64;
7467 swapOperands(Inst);
7468 }
7469 break;
7470
7471 case AMDGPU::S_ABS_I32:
7472 lowerScalarAbs(Worklist, Inst);
7473 Inst.eraseFromParent();
7474 return;
7475
7476 case AMDGPU::S_CBRANCH_SCC0:
7477 case AMDGPU::S_CBRANCH_SCC1: {
7478 // Clear unused bits of vcc
7479 Register CondReg = Inst.getOperand(i: 1).getReg();
7480 bool IsSCC = CondReg == AMDGPU::SCC;
7481 Register VCC = RI.getVCC();
7482 Register EXEC = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
7483 unsigned Opc = ST.isWave32() ? AMDGPU::S_AND_B32 : AMDGPU::S_AND_B64;
7484 BuildMI(BB&: *MBB, I&: Inst, MIMD: Inst.getDebugLoc(), MCID: get(Opcode: Opc), DestReg: VCC)
7485 .addReg(RegNo: EXEC)
7486 .addReg(RegNo: IsSCC ? VCC : CondReg);
7487 Inst.removeOperand(OpNo: 1);
7488 } break;
7489
7490 case AMDGPU::S_BFE_U64:
7491 case AMDGPU::S_BFM_B64:
7492 llvm_unreachable("Moving this op to VALU not implemented");
7493
7494 case AMDGPU::S_PACK_LL_B32_B16:
7495 case AMDGPU::S_PACK_LH_B32_B16:
7496 case AMDGPU::S_PACK_HL_B32_B16:
7497 case AMDGPU::S_PACK_HH_B32_B16:
7498 movePackToVALU(Worklist, MRI, Inst);
7499 Inst.eraseFromParent();
7500 return;
7501
7502 case AMDGPU::S_XNOR_B32:
7503 lowerScalarXnor(Worklist, Inst);
7504 Inst.eraseFromParent();
7505 return;
7506
7507 case AMDGPU::S_NAND_B32:
7508 splitScalarNotBinop(Worklist, Inst, Opcode: AMDGPU::S_AND_B32);
7509 Inst.eraseFromParent();
7510 return;
7511
7512 case AMDGPU::S_NOR_B32:
7513 splitScalarNotBinop(Worklist, Inst, Opcode: AMDGPU::S_OR_B32);
7514 Inst.eraseFromParent();
7515 return;
7516
7517 case AMDGPU::S_ANDN2_B32:
7518 splitScalarBinOpN2(Worklist, Inst, Opcode: AMDGPU::S_AND_B32);
7519 Inst.eraseFromParent();
7520 return;
7521
7522 case AMDGPU::S_ORN2_B32:
7523 splitScalarBinOpN2(Worklist, Inst, Opcode: AMDGPU::S_OR_B32);
7524 Inst.eraseFromParent();
7525 return;
7526
7527 // TODO: Remove this as soon as everything is ready to replace
7528 // VGPR-to-SGPR copies with V_READFIRSTLANEs.
7529 // S_ADD/SUB_CO_PSEUDO as well as S_UADDO/USUBO_PSEUDO
7530 // can only be selected from a uniform SDNode.
7531 case AMDGPU::S_ADD_CO_PSEUDO:
7532 case AMDGPU::S_SUB_CO_PSEUDO: {
7533 unsigned Opc = (Inst.getOpcode() == AMDGPU::S_ADD_CO_PSEUDO)
7534 ? AMDGPU::V_ADDC_U32_e64
7535 : AMDGPU::V_SUBB_U32_e64;
7536 const auto *CarryRC = RI.getWaveMaskRegClass();
7537
7538 Register CarryInReg = Inst.getOperand(i: 4).getReg();
7539 if (!MRI.constrainRegClass(Reg: CarryInReg, RC: CarryRC)) {
7540 Register NewCarryReg = MRI.createVirtualRegister(RegClass: CarryRC);
7541 BuildMI(BB&: *MBB, I&: Inst, MIMD: Inst.getDebugLoc(), MCID: get(Opcode: AMDGPU::COPY), DestReg: NewCarryReg)
7542 .addReg(RegNo: CarryInReg);
 CarryInReg = NewCarryReg;
7543 }
7544
7545 Register CarryOutReg = Inst.getOperand(i: 1).getReg();
7546
7547 Register DestReg = MRI.createVirtualRegister(RegClass: RI.getEquivalentVGPRClass(
7548 SRC: MRI.getRegClass(Reg: Inst.getOperand(i: 0).getReg())));
7549 MachineInstr *CarryOp =
7550 BuildMI(BB&: *MBB, I: &Inst, MIMD: Inst.getDebugLoc(), MCID: get(Opcode: Opc), DestReg)
7551 .addReg(RegNo: CarryOutReg, flags: RegState::Define)
7552 .add(MO: Inst.getOperand(i: 2))
7553 .add(MO: Inst.getOperand(i: 3))
7554 .addReg(RegNo: CarryInReg)
7555 .addImm(Val: 0);
7556 legalizeOperands(MI&: *CarryOp);
7557 MRI.replaceRegWith(FromReg: Inst.getOperand(i: 0).getReg(), ToReg: DestReg);
7558 addUsersToMoveToVALUWorklist(Reg: DestReg, MRI, Worklist);
7559 Inst.eraseFromParent();
7560 }
7561 return;
7562 case AMDGPU::S_UADDO_PSEUDO:
7563 case AMDGPU::S_USUBO_PSEUDO: {
7564 const DebugLoc &DL = Inst.getDebugLoc();
7565 MachineOperand &Dest0 = Inst.getOperand(i: 0);
7566 MachineOperand &Dest1 = Inst.getOperand(i: 1);
7567 MachineOperand &Src0 = Inst.getOperand(i: 2);
7568 MachineOperand &Src1 = Inst.getOperand(i: 3);
7569
7570 unsigned Opc = (Inst.getOpcode() == AMDGPU::S_UADDO_PSEUDO)
7571 ? AMDGPU::V_ADD_CO_U32_e64
7572 : AMDGPU::V_SUB_CO_U32_e64;
7573 const TargetRegisterClass *NewRC =
7574 RI.getEquivalentVGPRClass(SRC: MRI.getRegClass(Reg: Dest0.getReg()));
7575 Register DestReg = MRI.createVirtualRegister(RegClass: NewRC);
7576 MachineInstr *NewInstr = BuildMI(BB&: *MBB, I: &Inst, MIMD: DL, MCID: get(Opcode: Opc), DestReg)
7577 .addReg(RegNo: Dest1.getReg(), flags: RegState::Define)
7578 .add(MO: Src0)
7579 .add(MO: Src1)
7580 .addImm(Val: 0); // clamp bit
7581
7582 legalizeOperands(MI&: *NewInstr, MDT);
7583 MRI.replaceRegWith(FromReg: Dest0.getReg(), ToReg: DestReg);
7584 addUsersToMoveToVALUWorklist(Reg: NewInstr->getOperand(i: 0).getReg(), MRI,
7585 Worklist);
7586 Inst.eraseFromParent();
7587 }
7588 return;
7589
7590 case AMDGPU::S_CSELECT_B32:
7591 case AMDGPU::S_CSELECT_B64:
7592 lowerSelect(Worklist, Inst, MDT);
7593 Inst.eraseFromParent();
7594 return;
7595 case AMDGPU::S_CMP_EQ_I32:
7596 case AMDGPU::S_CMP_LG_I32:
7597 case AMDGPU::S_CMP_GT_I32:
7598 case AMDGPU::S_CMP_GE_I32:
7599 case AMDGPU::S_CMP_LT_I32:
7600 case AMDGPU::S_CMP_LE_I32:
7601 case AMDGPU::S_CMP_EQ_U32:
7602 case AMDGPU::S_CMP_LG_U32:
7603 case AMDGPU::S_CMP_GT_U32:
7604 case AMDGPU::S_CMP_GE_U32:
7605 case AMDGPU::S_CMP_LT_U32:
7606 case AMDGPU::S_CMP_LE_U32:
7607 case AMDGPU::S_CMP_EQ_U64:
7608 case AMDGPU::S_CMP_LG_U64:
7609 case AMDGPU::S_CMP_LT_F32:
7610 case AMDGPU::S_CMP_EQ_F32:
7611 case AMDGPU::S_CMP_LE_F32:
7612 case AMDGPU::S_CMP_GT_F32:
7613 case AMDGPU::S_CMP_LG_F32:
7614 case AMDGPU::S_CMP_GE_F32:
7615 case AMDGPU::S_CMP_O_F32:
7616 case AMDGPU::S_CMP_U_F32:
7617 case AMDGPU::S_CMP_NGE_F32:
7618 case AMDGPU::S_CMP_NLG_F32:
7619 case AMDGPU::S_CMP_NGT_F32:
7620 case AMDGPU::S_CMP_NLE_F32:
7621 case AMDGPU::S_CMP_NEQ_F32:
7622 case AMDGPU::S_CMP_NLT_F32: {
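    // The VALU comparison writes a wave-sized condition mask rather than SCC,
    // so SCC users are rewritten below to read the new condition register.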
7623 Register CondReg = MRI.createVirtualRegister(RegClass: RI.getWaveMaskRegClass());
7624 auto NewInstr =
7625 BuildMI(BB&: *MBB, I&: Inst, MIMD: Inst.getDebugLoc(), MCID: get(Opcode: NewOpcode), DestReg: CondReg)
7626 .setMIFlags(Inst.getFlags());
7627 if (AMDGPU::getNamedOperandIdx(Opcode: NewOpcode, Name: AMDGPU::OpName::src0_modifiers) >=
7628 0) {
7629 NewInstr
7630 .addImm(Val: 0) // src0_modifiers
7631 .add(MO: Inst.getOperand(i: 0)) // src0
7632 .addImm(Val: 0) // src1_modifiers
7633 .add(MO: Inst.getOperand(i: 1)) // src1
7634 .addImm(Val: 0); // clamp
7635 } else {
7636 NewInstr.add(MO: Inst.getOperand(i: 0)).add(MO: Inst.getOperand(i: 1));
7637 }
7638 legalizeOperands(MI&: *NewInstr, MDT);
7639 int SCCIdx = Inst.findRegisterDefOperandIdx(Reg: AMDGPU::SCC, /*TRI=*/nullptr);
7640 MachineOperand SCCOp = Inst.getOperand(i: SCCIdx);
7641 addSCCDefUsersToVALUWorklist(Op&: SCCOp, SCCDefInst&: Inst, Worklist, NewCond: CondReg);
7642 Inst.eraseFromParent();
7643 return;
7644 }
7645 case AMDGPU::S_CMP_LT_F16:
7646 case AMDGPU::S_CMP_EQ_F16:
7647 case AMDGPU::S_CMP_LE_F16:
7648 case AMDGPU::S_CMP_GT_F16:
7649 case AMDGPU::S_CMP_LG_F16:
7650 case AMDGPU::S_CMP_GE_F16:
7651 case AMDGPU::S_CMP_O_F16:
7652 case AMDGPU::S_CMP_U_F16:
7653 case AMDGPU::S_CMP_NGE_F16:
7654 case AMDGPU::S_CMP_NLG_F16:
7655 case AMDGPU::S_CMP_NGT_F16:
7656 case AMDGPU::S_CMP_NLE_F16:
7657 case AMDGPU::S_CMP_NEQ_F16:
7658 case AMDGPU::S_CMP_NLT_F16: {
7659 Register CondReg = MRI.createVirtualRegister(RegClass: RI.getWaveMaskRegClass());
7660 auto NewInstr =
7661 BuildMI(BB&: *MBB, I&: Inst, MIMD: Inst.getDebugLoc(), MCID: get(Opcode: NewOpcode), DestReg: CondReg)
7662 .setMIFlags(Inst.getFlags());
7663 if (AMDGPU::hasNamedOperand(Opcode: NewOpcode, NamedIdx: AMDGPU::OpName::src0_modifiers)) {
7664 NewInstr
7665 .addImm(Val: 0) // src0_modifiers
7666 .add(MO: Inst.getOperand(i: 0)) // src0
7667 .addImm(Val: 0) // src1_modifiers
7668 .add(MO: Inst.getOperand(i: 1)) // src1
7669 .addImm(Val: 0); // clamp
7670 if (AMDGPU::hasNamedOperand(Opcode: NewOpcode, NamedIdx: AMDGPU::OpName::op_sel))
7671 NewInstr.addImm(Val: 0); // op_sel0
7672 } else {
7673 NewInstr
7674 .add(MO: Inst.getOperand(i: 0))
7675 .add(MO: Inst.getOperand(i: 1));
7676 }
7677 legalizeOperandsVALUt16(MI&: *NewInstr, MRI);
7678 legalizeOperands(MI&: *NewInstr, MDT);
7679 int SCCIdx = Inst.findRegisterDefOperandIdx(Reg: AMDGPU::SCC, /*TRI=*/nullptr);
7680 MachineOperand SCCOp = Inst.getOperand(i: SCCIdx);
7681 addSCCDefUsersToVALUWorklist(Op&: SCCOp, SCCDefInst&: Inst, Worklist, NewCond: CondReg);
7682 Inst.eraseFromParent();
7683 return;
7684 }
7685 case AMDGPU::S_CVT_HI_F32_F16: {
7686 const DebugLoc &DL = Inst.getDebugLoc();
7687 Register TmpReg = MRI.createVirtualRegister(RegClass: &AMDGPU::VGPR_32RegClass);
7688 Register NewDst = MRI.createVirtualRegister(RegClass: &AMDGPU::VGPR_32RegClass);
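    // With true16 instructions the high half can be read directly through the
    // hi16 subregister; otherwise shift it down into the low 16 bits first.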
7689 if (ST.useRealTrue16Insts()) {
7690 BuildMI(BB&: *MBB, I&: Inst, MIMD: DL, MCID: get(Opcode: AMDGPU::COPY), DestReg: TmpReg)
7691 .add(MO: Inst.getOperand(i: 1));
7692 BuildMI(BB&: *MBB, I&: Inst, MIMD: DL, MCID: get(Opcode: NewOpcode), DestReg: NewDst)
7693 .addImm(Val: 0) // src0_modifiers
7694 .addReg(RegNo: TmpReg, flags: 0, SubReg: AMDGPU::hi16)
7695 .addImm(Val: 0) // clamp
7696 .addImm(Val: 0) // omod
7697 .addImm(Val: 0); // op_sel0
7698 } else {
7699 BuildMI(BB&: *MBB, I&: Inst, MIMD: DL, MCID: get(Opcode: AMDGPU::V_LSHRREV_B32_e64), DestReg: TmpReg)
7700 .addImm(Val: 16)
7701 .add(MO: Inst.getOperand(i: 1));
7702 BuildMI(BB&: *MBB, I&: Inst, MIMD: DL, MCID: get(Opcode: NewOpcode), DestReg: NewDst)
7703 .addImm(Val: 0) // src0_modifiers
7704 .addReg(RegNo: TmpReg)
7705 .addImm(Val: 0) // clamp
7706 .addImm(Val: 0); // omod
7707 }
7708
7709 MRI.replaceRegWith(FromReg: Inst.getOperand(i: 0).getReg(), ToReg: NewDst);
7710 addUsersToMoveToVALUWorklist(Reg: NewDst, MRI, Worklist);
7711 Inst.eraseFromParent();
7712 return;
7713 }
7714 case AMDGPU::S_MINIMUM_F32:
7715 case AMDGPU::S_MAXIMUM_F32: {
7716 const DebugLoc &DL = Inst.getDebugLoc();
7717 Register NewDst = MRI.createVirtualRegister(RegClass: &AMDGPU::VGPR_32RegClass);
7718 MachineInstr *NewInstr = BuildMI(BB&: *MBB, I&: Inst, MIMD: DL, MCID: get(Opcode: NewOpcode), DestReg: NewDst)
7719 .addImm(Val: 0) // src0_modifiers
7720 .add(MO: Inst.getOperand(i: 1))
7721 .addImm(Val: 0) // src1_modifiers
7722 .add(MO: Inst.getOperand(i: 2))
7723 .addImm(Val: 0) // clamp
7724 .addImm(Val: 0); // omod
7725 MRI.replaceRegWith(FromReg: Inst.getOperand(i: 0).getReg(), ToReg: NewDst);
7726
7727 legalizeOperands(MI&: *NewInstr, MDT);
7728 addUsersToMoveToVALUWorklist(Reg: NewDst, MRI, Worklist);
7729 Inst.eraseFromParent();
7730 return;
7731 }
7732 case AMDGPU::S_MINIMUM_F16:
7733 case AMDGPU::S_MAXIMUM_F16: {
7734 const DebugLoc &DL = Inst.getDebugLoc();
7735 Register NewDst = MRI.createVirtualRegister(RegClass: ST.useRealTrue16Insts()
7736 ? &AMDGPU::VGPR_16RegClass
7737 : &AMDGPU::VGPR_32RegClass);
7738 MachineInstr *NewInstr = BuildMI(BB&: *MBB, I&: Inst, MIMD: DL, MCID: get(Opcode: NewOpcode), DestReg: NewDst)
7739 .addImm(Val: 0) // src0_modifiers
7740 .add(MO: Inst.getOperand(i: 1))
7741 .addImm(Val: 0) // src1_modifiers
7742 .add(MO: Inst.getOperand(i: 2))
7743 .addImm(Val: 0) // clamp
7744 .addImm(Val: 0) // omod
7745 .addImm(Val: 0); // opsel0
7746 MRI.replaceRegWith(FromReg: Inst.getOperand(i: 0).getReg(), ToReg: NewDst);
7747 legalizeOperandsVALUt16(MI&: *NewInstr, MRI);
7748 legalizeOperands(MI&: *NewInstr, MDT);
7749 addUsersToMoveToVALUWorklist(Reg: NewDst, MRI, Worklist);
7750 Inst.eraseFromParent();
7751 return;
7752 }
7753 case AMDGPU::V_S_EXP_F16_e64:
7754 case AMDGPU::V_S_LOG_F16_e64:
7755 case AMDGPU::V_S_RCP_F16_e64:
7756 case AMDGPU::V_S_RSQ_F16_e64:
7757 case AMDGPU::V_S_SQRT_F16_e64: {
7758 const DebugLoc &DL = Inst.getDebugLoc();
7759 Register NewDst = MRI.createVirtualRegister(RegClass: ST.useRealTrue16Insts()
7760 ? &AMDGPU::VGPR_16RegClass
7761 : &AMDGPU::VGPR_32RegClass);
7762 auto NewInstr = BuildMI(BB&: *MBB, I&: Inst, MIMD: DL, MCID: get(Opcode: NewOpcode), DestReg: NewDst)
7763 .add(MO: Inst.getOperand(i: 1)) // src0_modifiers
7764 .add(MO: Inst.getOperand(i: 2))
7765 .add(MO: Inst.getOperand(i: 3)) // clamp
7766 .add(MO: Inst.getOperand(i: 4)) // omod
7767 .setMIFlags(Inst.getFlags());
7768 if (AMDGPU::hasNamedOperand(Opcode: NewOpcode, NamedIdx: AMDGPU::OpName::op_sel))
7769 NewInstr.addImm(Val: 0); // opsel0
7770 MRI.replaceRegWith(FromReg: Inst.getOperand(i: 0).getReg(), ToReg: NewDst);
7771 legalizeOperandsVALUt16(MI&: *NewInstr, MRI);
7772 legalizeOperands(MI&: *NewInstr, MDT);
7773 addUsersToMoveToVALUWorklist(Reg: NewDst, MRI, Worklist);
7774 Inst.eraseFromParent();
7775 return;
7776 }
7777 }
7778
7779 if (NewOpcode == AMDGPU::INSTRUCTION_LIST_END) {
7780 // We cannot move this instruction to the VALU, so we should try to
7781 // legalize its operands instead.
7782 legalizeOperands(MI&: Inst, MDT);
7783 return;
7784 }
7785 // Handle converting generic instructions like COPY-to-SGPR into
7786 // COPY-to-VGPR.
7787 if (NewOpcode == Opcode) {
7788 Register DstReg = Inst.getOperand(i: 0).getReg();
7789 const TargetRegisterClass *NewDstRC = getDestEquivalentVGPRClass(Inst);
7790
7791 // If it's a copy of a VGPR to a physical SGPR, insert a V_READFIRSTLANE and
7792 // hope for the best.
7793 if (Inst.isCopy() && DstReg.isPhysical() &&
7794 RI.isVGPR(MRI, Reg: Inst.getOperand(i: 1).getReg())) {
7795 // TODO: Only works for 32 bit registers.
7796 if (MRI.constrainRegClass(Reg: DstReg, RC: &AMDGPU::SReg_32_XM0RegClass)) {
7797 BuildMI(BB&: *Inst.getParent(), I: &Inst, MIMD: Inst.getDebugLoc(),
7798 MCID: get(Opcode: AMDGPU::V_READFIRSTLANE_B32), DestReg: DstReg)
7799 .add(MO: Inst.getOperand(i: 1));
7800 } else {
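      // The physical SGPR can't be constrained, so read the lane into a fresh
      // virtual SGPR and copy that into the physical register.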
7801 Register NewDst =
7802 MRI.createVirtualRegister(RegClass: &AMDGPU::SReg_32_XM0RegClass);
7803 BuildMI(BB&: *Inst.getParent(), I: &Inst, MIMD: Inst.getDebugLoc(),
7804 MCID: get(Opcode: AMDGPU::V_READFIRSTLANE_B32), DestReg: NewDst)
7805 .add(MO: Inst.getOperand(i: 1));
7806 BuildMI(BB&: *Inst.getParent(), I: &Inst, MIMD: Inst.getDebugLoc(), MCID: get(Opcode: AMDGPU::COPY),
7807 DestReg: DstReg)
7808 .addReg(RegNo: NewDst);
7809 }
7810 Inst.eraseFromParent();
7811 return;
7812 }
7813
7814 if (Inst.isCopy() && Inst.getOperand(i: 1).getReg().isVirtual() &&
7815 NewDstRC == RI.getRegClassForReg(MRI, Reg: Inst.getOperand(i: 1).getReg())) {
7816 // Instead of creating a copy where src and dst are the same register
7817 // class, we just replace all uses of dst with src. These kinds of
7818 // copies interfere with the heuristics MachineSink uses to decide
7819 // whether or not to split a critical edge, since the pass assumes
7820 // that copies will end up as machine instructions and not be
7821 // eliminated.
7822 addUsersToMoveToVALUWorklist(Reg: DstReg, MRI, Worklist);
7823 MRI.replaceRegWith(FromReg: DstReg, ToReg: Inst.getOperand(i: 1).getReg());
7824 MRI.clearKillFlags(Reg: Inst.getOperand(i: 1).getReg());
7825 Inst.getOperand(i: 0).setReg(DstReg);
7826 // Make sure we don't leave around a dead VGPR->SGPR copy. Normally
7827 // these are deleted later, but at -O0 it would leave a suspicious
7828 // looking illegal copy of an undef register.
7829 for (unsigned I = Inst.getNumOperands() - 1; I != 0; --I)
7830 Inst.removeOperand(OpNo: I);
7831 Inst.setDesc(get(Opcode: AMDGPU::IMPLICIT_DEF));
7832 return;
7833 }
7834
7835 // If this is a v2s copy between a 16-bit and a 32-bit register, replace the
7836 // vgpr copy with a reg_sequence/extract_subreg.
7837 // This can be removed once sgpr16 is in place.
7838 if (ST.useRealTrue16Insts() && Inst.isCopy() &&
7839 Inst.getOperand(i: 1).getReg().isVirtual() &&
7840 RI.isVGPR(MRI, Reg: Inst.getOperand(i: 1).getReg())) {
7841 const TargetRegisterClass *SrcRegRC = getOpRegClass(MI: Inst, OpNo: 1);
7842 if (RI.getMatchingSuperRegClass(A: NewDstRC, B: SrcRegRC, Idx: AMDGPU::lo16)) {
7843 Register NewDstReg = MRI.createVirtualRegister(RegClass: NewDstRC);
7844 Register Undef = MRI.createVirtualRegister(RegClass: &AMDGPU::VGPR_16RegClass);
7845 BuildMI(BB&: *Inst.getParent(), I: &Inst, MIMD: Inst.getDebugLoc(),
7846 MCID: get(Opcode: AMDGPU::IMPLICIT_DEF), DestReg: Undef);
7847 BuildMI(BB&: *Inst.getParent(), I: &Inst, MIMD: Inst.getDebugLoc(),
7848 MCID: get(Opcode: AMDGPU::REG_SEQUENCE), DestReg: NewDstReg)
7849 .addReg(RegNo: Inst.getOperand(i: 1).getReg())
7850 .addImm(Val: AMDGPU::lo16)
7851 .addReg(RegNo: Undef)
7852 .addImm(Val: AMDGPU::hi16);
7853 Inst.eraseFromParent();
7854 MRI.replaceRegWith(FromReg: DstReg, ToReg: NewDstReg);
7855 addUsersToMoveToVALUWorklist(Reg: NewDstReg, MRI, Worklist);
7856 return;
7857 } else if (RI.getMatchingSuperRegClass(A: SrcRegRC, B: NewDstRC,
7858 Idx: AMDGPU::lo16)) {
7859 Inst.getOperand(i: 1).setSubReg(AMDGPU::lo16);
7860 Register NewDstReg = MRI.createVirtualRegister(RegClass: NewDstRC);
7861 MRI.replaceRegWith(FromReg: DstReg, ToReg: NewDstReg);
7862 addUsersToMoveToVALUWorklist(Reg: NewDstReg, MRI, Worklist);
7863 return;
7864 }
7865 }
7866
7867 Register NewDstReg = MRI.createVirtualRegister(RegClass: NewDstRC);
7868 MRI.replaceRegWith(FromReg: DstReg, ToReg: NewDstReg);
7869 legalizeOperands(MI&: Inst, MDT);
7870 addUsersToMoveToVALUWorklist(Reg: NewDstReg, MRI, Worklist);
7871 return;
7872 }
7873
7874 // Use the new VALU Opcode.
7875 auto NewInstr = BuildMI(BB&: *MBB, I&: Inst, MIMD: Inst.getDebugLoc(), MCID: get(Opcode: NewOpcode))
7876 .setMIFlags(Inst.getFlags());
7877 if (isVOP3(Opcode: NewOpcode) && !isVOP3(Opcode)) {
7878 // Intersperse VOP3 modifiers among the SALU operands.
7879 NewInstr->addOperand(Op: Inst.getOperand(i: 0));
7880 if (AMDGPU::getNamedOperandIdx(Opcode: NewOpcode,
7881 Name: AMDGPU::OpName::src0_modifiers) >= 0)
7882 NewInstr.addImm(Val: 0);
7883 if (AMDGPU::hasNamedOperand(Opcode: NewOpcode, NamedIdx: AMDGPU::OpName::src0)) {
7884 MachineOperand Src = Inst.getOperand(i: 1);
7885 NewInstr->addOperand(Op: Src);
7886 }
7887
7888 if (Opcode == AMDGPU::S_SEXT_I32_I8 || Opcode == AMDGPU::S_SEXT_I32_I16) {
7889 // We are converting these to a BFE, so we need to add the missing
7890 // operands for the size and offset.
7891 unsigned Size = (Opcode == AMDGPU::S_SEXT_I32_I8) ? 8 : 16;
7892 NewInstr.addImm(Val: 0);
7893 NewInstr.addImm(Val: Size);
7894 } else if (Opcode == AMDGPU::S_BCNT1_I32_B32) {
7895 // The VALU version adds the second operand to the result, so insert an
7896 // extra 0 operand.
7897 NewInstr.addImm(Val: 0);
7898 } else if (Opcode == AMDGPU::S_BFE_I32 || Opcode == AMDGPU::S_BFE_U32) {
7899 const MachineOperand &OffsetWidthOp = Inst.getOperand(i: 2);
7900 // If we need to move this to VGPRs, we need to unpack the second
7901 // operand back into the 2 separate ones for bit offset and width.
7902 assert(OffsetWidthOp.isImm() &&
7903 "Scalar BFE is only implemented for constant width and offset");
7904 uint32_t Imm = OffsetWidthOp.getImm();
7905
7906 uint32_t Offset = Imm & 0x3f; // Extract bits [5:0].
7907 uint32_t BitWidth = (Imm & 0x7f0000) >> 16; // Extract bits [22:16].
7908 NewInstr.addImm(Val: Offset);
7909 NewInstr.addImm(Val: BitWidth);
7910 } else {
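      // Copy the remaining source operands, inserting zero source modifiers,
      // clamp, omod, and op_sel operands wherever the VOP3 encoding expects them.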
7911 if (AMDGPU::getNamedOperandIdx(Opcode: NewOpcode,
7912 Name: AMDGPU::OpName::src1_modifiers) >= 0)
7913 NewInstr.addImm(Val: 0);
7914 if (AMDGPU::getNamedOperandIdx(Opcode: NewOpcode, Name: AMDGPU::OpName::src1) >= 0)
7915 NewInstr->addOperand(Op: Inst.getOperand(i: 2));
7916 if (AMDGPU::getNamedOperandIdx(Opcode: NewOpcode,
7917 Name: AMDGPU::OpName::src2_modifiers) >= 0)
7918 NewInstr.addImm(Val: 0);
7919 if (AMDGPU::getNamedOperandIdx(Opcode: NewOpcode, Name: AMDGPU::OpName::src2) >= 0)
7920 NewInstr->addOperand(Op: Inst.getOperand(i: 3));
7921 if (AMDGPU::getNamedOperandIdx(Opcode: NewOpcode, Name: AMDGPU::OpName::clamp) >= 0)
7922 NewInstr.addImm(Val: 0);
7923 if (AMDGPU::getNamedOperandIdx(Opcode: NewOpcode, Name: AMDGPU::OpName::omod) >= 0)
7924 NewInstr.addImm(Val: 0);
7925 if (AMDGPU::getNamedOperandIdx(Opcode: NewOpcode, Name: AMDGPU::OpName::op_sel) >= 0)
7926 NewInstr.addImm(Val: 0);
7927 }
7928 } else {
7929 // Just copy the SALU operands.
7930 for (const MachineOperand &Op : Inst.explicit_operands())
7931 NewInstr->addOperand(Op);
7932 }
7933
7934 // Remove any references to SCC. Vector instructions can't read from it, and
7935 // we're just about to add the implicit use/defs of VCC, and we don't want
7936 // both.
7937 for (MachineOperand &Op : Inst.implicit_operands()) {
7938 if (Op.getReg() == AMDGPU::SCC) {
7939 // Only propagate through live-def of SCC.
7940 if (Op.isDef() && !Op.isDead())
7941 addSCCDefUsersToVALUWorklist(Op, SCCDefInst&: Inst, Worklist);
7942 if (Op.isUse())
7943 addSCCDefsToVALUWorklist(SCCUseInst: NewInstr, Worklist);
7944 }
7945 }
7946 Inst.eraseFromParent();
7947 Register NewDstReg;
7948 if (NewInstr->getOperand(i: 0).isReg() && NewInstr->getOperand(i: 0).isDef()) {
7949 Register DstReg = NewInstr->getOperand(i: 0).getReg();
7950 assert(DstReg.isVirtual());
7951 // Update the destination register class.
7952 const TargetRegisterClass *NewDstRC = getDestEquivalentVGPRClass(Inst: *NewInstr);
7953 assert(NewDstRC);
7954 NewDstReg = MRI.createVirtualRegister(RegClass: NewDstRC);
7955 MRI.replaceRegWith(FromReg: DstReg, ToReg: NewDstReg);
7956 }
7957 fixImplicitOperands(MI&: *NewInstr);
7958
7959 legalizeOperandsVALUt16(MI&: *NewInstr, MRI);
7960
7961 // Legalize the operands
7962 legalizeOperands(MI&: *NewInstr, MDT);
7963 if (NewDstReg)
7964 addUsersToMoveToVALUWorklist(Reg: NewDstReg, MRI, Worklist);
7965}
7966
7967// Add/sub require special handling to deal with carry outs.
7968std::pair<bool, MachineBasicBlock *>
7969SIInstrInfo::moveScalarAddSub(SIInstrWorklist &Worklist, MachineInstr &Inst,
7970 MachineDominatorTree *MDT) const {
7971 if (ST.hasAddNoCarry()) {
7972 // Assume there is no user of scc since we don't select this in that case.
7973 // Since scc isn't used, it doesn't really matter if the i32 or u32 variant
7974 // is used.
7975
7976 MachineBasicBlock &MBB = *Inst.getParent();
7977 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
7978
7979 Register OldDstReg = Inst.getOperand(i: 0).getReg();
7980 Register ResultReg = MRI.createVirtualRegister(RegClass: &AMDGPU::VGPR_32RegClass);
7981
7982 unsigned Opc = Inst.getOpcode();
7983 assert(Opc == AMDGPU::S_ADD_I32 || Opc == AMDGPU::S_SUB_I32);
7984
7985 unsigned NewOpc = Opc == AMDGPU::S_ADD_I32 ?
7986 AMDGPU::V_ADD_U32_e64 : AMDGPU::V_SUB_U32_e64;
7987
7988 assert(Inst.getOperand(3).getReg() == AMDGPU::SCC);
7989 Inst.removeOperand(OpNo: 3);
7990
7991 Inst.setDesc(get(Opcode: NewOpc));
7992 Inst.addOperand(Op: MachineOperand::CreateImm(Val: 0)); // clamp bit
7993 Inst.addImplicitDefUseOperands(MF&: *MBB.getParent());
7994 MRI.replaceRegWith(FromReg: OldDstReg, ToReg: ResultReg);
7995 MachineBasicBlock *NewBB = legalizeOperands(MI&: Inst, MDT);
7996
7997 addUsersToMoveToVALUWorklist(Reg: ResultReg, MRI, Worklist);
7998 return std::pair(true, NewBB);
7999 }
8000
8001 return std::pair(false, nullptr);
8002}
8003
8004void SIInstrInfo::lowerSelect(SIInstrWorklist &Worklist, MachineInstr &Inst,
8005 MachineDominatorTree *MDT) const {
8006
8007 MachineBasicBlock &MBB = *Inst.getParent();
8008 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
8009 MachineBasicBlock::iterator MII = Inst;
8010 DebugLoc DL = Inst.getDebugLoc();
8011
8012 MachineOperand &Dest = Inst.getOperand(i: 0);
8013 MachineOperand &Src0 = Inst.getOperand(i: 1);
8014 MachineOperand &Src1 = Inst.getOperand(i: 2);
8015 MachineOperand &Cond = Inst.getOperand(i: 3);
8016
8017 Register CondReg = Cond.getReg();
8018 bool IsSCC = (CondReg == AMDGPU::SCC);
8019
8020 // If this is a trivial select where the condition is effectively not SCC
8021 // (CondReg is a source of a copy to SCC), then the select is semantically
8022 // equivalent to copying CondReg. Hence, there is no need to create a
8023 // V_CNDMASK; we can just use CondReg and bail out.
8024 if (!IsSCC && Src0.isImm() && (Src0.getImm() == -1) && Src1.isImm() &&
8025 (Src1.getImm() == 0)) {
8026 MRI.replaceRegWith(FromReg: Dest.getReg(), ToReg: CondReg);
8027 return;
8028 }
8029
8030 Register NewCondReg = CondReg;
8031 if (IsSCC) {
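    // The condition lives in SCC; materialize it into a wave-sized mask that
    // V_CNDMASK can consume.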
8032 const TargetRegisterClass *TC = RI.getWaveMaskRegClass();
8033 NewCondReg = MRI.createVirtualRegister(RegClass: TC);
8034
8035 // Now look for the closest SCC def if it is a copy
8036 // replacing the CondReg with the COPY source register
8037 bool CopyFound = false;
8038 for (MachineInstr &CandI :
8039 make_range(x: std::next(x: MachineBasicBlock::reverse_iterator(Inst)),
8040 y: Inst.getParent()->rend())) {
8041 if (CandI.findRegisterDefOperandIdx(Reg: AMDGPU::SCC, TRI: &RI, isDead: false, Overlap: false) !=
8042 -1) {
8043 if (CandI.isCopy() && CandI.getOperand(i: 0).getReg() == AMDGPU::SCC) {
8044 BuildMI(BB&: MBB, I: MII, MIMD: DL, MCID: get(Opcode: AMDGPU::COPY), DestReg: NewCondReg)
8045 .addReg(RegNo: CandI.getOperand(i: 1).getReg());
8046 CopyFound = true;
8047 }
8048 break;
8049 }
8050 }
8051 if (!CopyFound) {
8052 // SCC def is not a copy
8053 // Insert a trivial select instead of creating a copy, because a copy from
8054 // SCC would semantically mean just copying a single bit, but we may need
8055 // the result to be a vector condition mask that needs preserving.
8056 unsigned Opcode =
8057 ST.isWave64() ? AMDGPU::S_CSELECT_B64 : AMDGPU::S_CSELECT_B32;
8058 auto NewSelect =
8059 BuildMI(BB&: MBB, I: MII, MIMD: DL, MCID: get(Opcode), DestReg: NewCondReg).addImm(Val: -1).addImm(Val: 0);
8060 NewSelect->getOperand(i: 3).setIsUndef(Cond.isUndef());
8061 }
8062 }
8063
8064 Register NewDestReg = MRI.createVirtualRegister(
8065 RegClass: RI.getEquivalentVGPRClass(SRC: MRI.getRegClass(Reg: Dest.getReg())));
8066 MachineInstr *NewInst;
8067 if (Inst.getOpcode() == AMDGPU::S_CSELECT_B32) {
8068 NewInst = BuildMI(BB&: MBB, I: MII, MIMD: DL, MCID: get(Opcode: AMDGPU::V_CNDMASK_B32_e64), DestReg: NewDestReg)
8069 .addImm(Val: 0)
8070 .add(MO: Src1) // False
8071 .addImm(Val: 0)
8072 .add(MO: Src0) // True
8073 .addReg(RegNo: NewCondReg);
8074 } else {
8075 NewInst =
8076 BuildMI(BB&: MBB, I: MII, MIMD: DL, MCID: get(Opcode: AMDGPU::V_CNDMASK_B64_PSEUDO), DestReg: NewDestReg)
8077 .add(MO: Src1) // False
8078 .add(MO: Src0) // True
8079 .addReg(RegNo: NewCondReg);
8080 }
8081 MRI.replaceRegWith(FromReg: Dest.getReg(), ToReg: NewDestReg);
8082 legalizeOperands(MI&: *NewInst, MDT);
8083 addUsersToMoveToVALUWorklist(Reg: NewDestReg, MRI, Worklist);
8084}
8085
8086void SIInstrInfo::lowerScalarAbs(SIInstrWorklist &Worklist,
8087 MachineInstr &Inst) const {
8088 MachineBasicBlock &MBB = *Inst.getParent();
8089 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
8090 MachineBasicBlock::iterator MII = Inst;
8091 DebugLoc DL = Inst.getDebugLoc();
8092
8093 MachineOperand &Dest = Inst.getOperand(i: 0);
8094 MachineOperand &Src = Inst.getOperand(i: 1);
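  // abs(x) is lowered as max(x, 0 - x) on the VALU.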
8095 Register TmpReg = MRI.createVirtualRegister(RegClass: &AMDGPU::VGPR_32RegClass);
8096 Register ResultReg = MRI.createVirtualRegister(RegClass: &AMDGPU::VGPR_32RegClass);
8097
8098 unsigned SubOp = ST.hasAddNoCarry() ?
8099 AMDGPU::V_SUB_U32_e32 : AMDGPU::V_SUB_CO_U32_e32;
8100
8101 BuildMI(BB&: MBB, I: MII, MIMD: DL, MCID: get(Opcode: SubOp), DestReg: TmpReg)
8102 .addImm(Val: 0)
8103 .addReg(RegNo: Src.getReg());
8104
8105 BuildMI(BB&: MBB, I: MII, MIMD: DL, MCID: get(Opcode: AMDGPU::V_MAX_I32_e64), DestReg: ResultReg)
8106 .addReg(RegNo: Src.getReg())
8107 .addReg(RegNo: TmpReg);
8108
8109 MRI.replaceRegWith(FromReg: Dest.getReg(), ToReg: ResultReg);
8110 addUsersToMoveToVALUWorklist(Reg: ResultReg, MRI, Worklist);
8111}
8112
8113void SIInstrInfo::lowerScalarXnor(SIInstrWorklist &Worklist,
8114 MachineInstr &Inst) const {
8115 MachineBasicBlock &MBB = *Inst.getParent();
8116 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
8117 MachineBasicBlock::iterator MII = Inst;
8118 const DebugLoc &DL = Inst.getDebugLoc();
8119
8120 MachineOperand &Dest = Inst.getOperand(i: 0);
8121 MachineOperand &Src0 = Inst.getOperand(i: 1);
8122 MachineOperand &Src1 = Inst.getOperand(i: 2);
8123
8124 if (ST.hasDLInsts()) {
8125 Register NewDest = MRI.createVirtualRegister(RegClass: &AMDGPU::VGPR_32RegClass);
8126 legalizeGenericOperand(InsertMBB&: MBB, I: MII, DstRC: &AMDGPU::VGPR_32RegClass, Op&: Src0, MRI, DL);
8127 legalizeGenericOperand(InsertMBB&: MBB, I: MII, DstRC: &AMDGPU::VGPR_32RegClass, Op&: Src1, MRI, DL);
8128
8129 BuildMI(BB&: MBB, I: MII, MIMD: DL, MCID: get(Opcode: AMDGPU::V_XNOR_B32_e64), DestReg: NewDest)
8130 .add(MO: Src0)
8131 .add(MO: Src1);
8132
8133 MRI.replaceRegWith(FromReg: Dest.getReg(), ToReg: NewDest);
8134 addUsersToMoveToVALUWorklist(Reg: NewDest, MRI, Worklist);
8135 } else {
8136 // Using the identity !(x ^ y) == (!x ^ y) == (x ^ !y), we can
8137 // invert either source and then perform the XOR. If either source is a
8138 // scalar register, then we can leave the inversion on the scalar unit to
8139 // achieve a better distribution of scalar and vector instructions.
8140 bool Src0IsSGPR = Src0.isReg() &&
8141 RI.isSGPRClass(RC: MRI.getRegClass(Reg: Src0.getReg()));
8142 bool Src1IsSGPR = Src1.isReg() &&
8143 RI.isSGPRClass(RC: MRI.getRegClass(Reg: Src1.getReg()));
8144 MachineInstr *Xor;
8145 Register Temp = MRI.createVirtualRegister(RegClass: &AMDGPU::SReg_32RegClass);
8146 Register NewDest = MRI.createVirtualRegister(RegClass: &AMDGPU::SReg_32RegClass);
8147
8148 // Build a pair of scalar instructions and add them to the work list.
8149 // The next iteration over the work list will lower these to the vector
8150 // unit as necessary.
8151 if (Src0IsSGPR) {
8152 BuildMI(BB&: MBB, I: MII, MIMD: DL, MCID: get(Opcode: AMDGPU::S_NOT_B32), DestReg: Temp).add(MO: Src0);
8153 Xor = BuildMI(BB&: MBB, I: MII, MIMD: DL, MCID: get(Opcode: AMDGPU::S_XOR_B32), DestReg: NewDest)
8154 .addReg(RegNo: Temp)
8155 .add(MO: Src1);
8156 } else if (Src1IsSGPR) {
8157 BuildMI(BB&: MBB, I: MII, MIMD: DL, MCID: get(Opcode: AMDGPU::S_NOT_B32), DestReg: Temp).add(MO: Src1);
8158 Xor = BuildMI(BB&: MBB, I: MII, MIMD: DL, MCID: get(Opcode: AMDGPU::S_XOR_B32), DestReg: NewDest)
8159 .add(MO: Src0)
8160 .addReg(RegNo: Temp);
8161 } else {
8162 Xor = BuildMI(BB&: MBB, I: MII, MIMD: DL, MCID: get(Opcode: AMDGPU::S_XOR_B32), DestReg: Temp)
8163 .add(MO: Src0)
8164 .add(MO: Src1);
8165 MachineInstr *Not =
8166 BuildMI(BB&: MBB, I: MII, MIMD: DL, MCID: get(Opcode: AMDGPU::S_NOT_B32), DestReg: NewDest).addReg(RegNo: Temp);
8167 Worklist.insert(MI: Not);
8168 }
8169
8170 MRI.replaceRegWith(FromReg: Dest.getReg(), ToReg: NewDest);
8171
8172 Worklist.insert(MI: Xor);
8173
8174 addUsersToMoveToVALUWorklist(Reg: NewDest, MRI, Worklist);
8175 }
8176}
8177
8178void SIInstrInfo::splitScalarNotBinop(SIInstrWorklist &Worklist,
8179 MachineInstr &Inst,
8180 unsigned Opcode) const {
8181 MachineBasicBlock &MBB = *Inst.getParent();
8182 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
8183 MachineBasicBlock::iterator MII = Inst;
8184 const DebugLoc &DL = Inst.getDebugLoc();
8185
8186 MachineOperand &Dest = Inst.getOperand(i: 0);
8187 MachineOperand &Src0 = Inst.getOperand(i: 1);
8188 MachineOperand &Src1 = Inst.getOperand(i: 2);
8189
8190 Register NewDest = MRI.createVirtualRegister(RegClass: &AMDGPU::SReg_32RegClass);
8191 Register Interm = MRI.createVirtualRegister(RegClass: &AMDGPU::SReg_32RegClass);
8192
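  // Expand s_nand/s_nor as the corresponding binary op followed by s_not;
  // both new scalar instructions are queued for further VALU lowering.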
8193 MachineInstr &Op = *BuildMI(BB&: MBB, I: MII, MIMD: DL, MCID: get(Opcode), DestReg: Interm)
8194 .add(MO: Src0)
8195 .add(MO: Src1);
8196
8197 MachineInstr &Not = *BuildMI(BB&: MBB, I: MII, MIMD: DL, MCID: get(Opcode: AMDGPU::S_NOT_B32), DestReg: NewDest)
8198 .addReg(RegNo: Interm);
8199
8200 Worklist.insert(MI: &Op);
8201 Worklist.insert(MI: &Not);
8202
8203 MRI.replaceRegWith(FromReg: Dest.getReg(), ToReg: NewDest);
8204 addUsersToMoveToVALUWorklist(Reg: NewDest, MRI, Worklist);
8205}
8206
8207void SIInstrInfo::splitScalarBinOpN2(SIInstrWorklist &Worklist,
8208 MachineInstr &Inst,
8209 unsigned Opcode) const {
8210 MachineBasicBlock &MBB = *Inst.getParent();
8211 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
8212 MachineBasicBlock::iterator MII = Inst;
8213 const DebugLoc &DL = Inst.getDebugLoc();
8214
8215 MachineOperand &Dest = Inst.getOperand(i: 0);
8216 MachineOperand &Src0 = Inst.getOperand(i: 1);
8217 MachineOperand &Src1 = Inst.getOperand(i: 2);
8218
8219 Register NewDest = MRI.createVirtualRegister(RegClass: &AMDGPU::SReg_32_XM0RegClass);
8220 Register Interm = MRI.createVirtualRegister(RegClass: &AMDGPU::SReg_32_XM0RegClass);
8221
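  // Expand s_andn2/s_orn2 as (src0 op ~src1); the s_not and the binary op are
  // queued for further VALU lowering.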
8222 MachineInstr &Not = *BuildMI(BB&: MBB, I: MII, MIMD: DL, MCID: get(Opcode: AMDGPU::S_NOT_B32), DestReg: Interm)
8223 .add(MO: Src1);
8224
8225 MachineInstr &Op = *BuildMI(BB&: MBB, I: MII, MIMD: DL, MCID: get(Opcode), DestReg: NewDest)
8226 .add(MO: Src0)
8227 .addReg(RegNo: Interm);
8228
8229 Worklist.insert(MI: &Not);
8230 Worklist.insert(MI: &Op);
8231
8232 MRI.replaceRegWith(FromReg: Dest.getReg(), ToReg: NewDest);
8233 addUsersToMoveToVALUWorklist(Reg: NewDest, MRI, Worklist);
8234}
8235
8236void SIInstrInfo::splitScalar64BitUnaryOp(SIInstrWorklist &Worklist,
8237 MachineInstr &Inst, unsigned Opcode,
8238 bool Swap) const {
8239 MachineBasicBlock &MBB = *Inst.getParent();
8240 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
8241
8242 MachineOperand &Dest = Inst.getOperand(i: 0);
8243 MachineOperand &Src0 = Inst.getOperand(i: 1);
8244 DebugLoc DL = Inst.getDebugLoc();
8245
8246 MachineBasicBlock::iterator MII = Inst;
8247
8248 const MCInstrDesc &InstDesc = get(Opcode);
8249 const TargetRegisterClass *Src0RC = Src0.isReg() ?
8250 MRI.getRegClass(Reg: Src0.getReg()) :
8251 &AMDGPU::SGPR_32RegClass;
8252
8253 const TargetRegisterClass *Src0SubRC =
8254 RI.getSubRegisterClass(Src0RC, AMDGPU::sub0);
8255
8256 MachineOperand SrcReg0Sub0 = buildExtractSubRegOrImm(MII, MRI, Op: Src0, SuperRC: Src0RC,
8257 SubIdx: AMDGPU::sub0, SubRC: Src0SubRC);
8258
8259 const TargetRegisterClass *DestRC = MRI.getRegClass(Reg: Dest.getReg());
8260 const TargetRegisterClass *NewDestRC = RI.getEquivalentVGPRClass(SRC: DestRC);
8261 const TargetRegisterClass *NewDestSubRC =
8262 RI.getSubRegisterClass(NewDestRC, AMDGPU::sub0);
8263
8264 Register DestSub0 = MRI.createVirtualRegister(RegClass: NewDestSubRC);
8265 MachineInstr &LoHalf = *BuildMI(BB&: MBB, I: MII, MIMD: DL, MCID: InstDesc, DestReg: DestSub0).add(MO: SrcReg0Sub0);
8266
8267 MachineOperand SrcReg0Sub1 = buildExtractSubRegOrImm(MII, MRI, Op: Src0, SuperRC: Src0RC,
8268 SubIdx: AMDGPU::sub1, SubRC: Src0SubRC);
8269
8270 Register DestSub1 = MRI.createVirtualRegister(RegClass: NewDestSubRC);
8271 MachineInstr &HiHalf = *BuildMI(BB&: MBB, I: MII, MIMD: DL, MCID: InstDesc, DestReg: DestSub1).add(MO: SrcReg0Sub1);
8272
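  // For S_BREV_B64 the halves are swapped: reversing each 32-bit half and
  // exchanging their positions reverses the full 64-bit value.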
8273 if (Swap)
8274 std::swap(a&: DestSub0, b&: DestSub1);
8275
8276 Register FullDestReg = MRI.createVirtualRegister(RegClass: NewDestRC);
8277 BuildMI(BB&: MBB, I: MII, MIMD: DL, MCID: get(Opcode: TargetOpcode::REG_SEQUENCE), DestReg: FullDestReg)
8278 .addReg(RegNo: DestSub0)
8279 .addImm(Val: AMDGPU::sub0)
8280 .addReg(RegNo: DestSub1)
8281 .addImm(Val: AMDGPU::sub1);
8282
8283 MRI.replaceRegWith(FromReg: Dest.getReg(), ToReg: FullDestReg);
8284
8285 Worklist.insert(MI: &LoHalf);
8286 Worklist.insert(MI: &HiHalf);
8287
8288 // We don't need to legalizeOperands here because for a single operand, src0
8289 // will support any kind of input.
8290
8291 // Move all users of this moved value.
8292 addUsersToMoveToVALUWorklist(Reg: FullDestReg, MRI, Worklist);
8293}
8294
8295// There is no vector equivalent of s_mul_u64. For this reason, we need to
8296// split the s_mul_u64 into 32-bit vector multiplications.
8297void SIInstrInfo::splitScalarSMulU64(SIInstrWorklist &Worklist,
8298 MachineInstr &Inst,
8299 MachineDominatorTree *MDT) const {
8300 MachineBasicBlock &MBB = *Inst.getParent();
8301 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
8302
8303 Register FullDestReg = MRI.createVirtualRegister(RegClass: &AMDGPU::VReg_64RegClass);
8304 Register DestSub0 = MRI.createVirtualRegister(RegClass: &AMDGPU::VGPR_32RegClass);
8305 Register DestSub1 = MRI.createVirtualRegister(RegClass: &AMDGPU::VGPR_32RegClass);
8306
8307 MachineOperand &Dest = Inst.getOperand(i: 0);
8308 MachineOperand &Src0 = Inst.getOperand(i: 1);
8309 MachineOperand &Src1 = Inst.getOperand(i: 2);
8310 const DebugLoc &DL = Inst.getDebugLoc();
8311 MachineBasicBlock::iterator MII = Inst;
8312
8313 const TargetRegisterClass *Src0RC = MRI.getRegClass(Reg: Src0.getReg());
8314 const TargetRegisterClass *Src1RC = MRI.getRegClass(Reg: Src1.getReg());
8315 const TargetRegisterClass *Src0SubRC =
8316 RI.getSubRegisterClass(Src0RC, AMDGPU::sub0);
8317 if (RI.isSGPRClass(RC: Src0SubRC))
8318 Src0SubRC = RI.getEquivalentVGPRClass(SRC: Src0SubRC);
8319 const TargetRegisterClass *Src1SubRC =
8320 RI.getSubRegisterClass(Src1RC, AMDGPU::sub0);
8321 if (RI.isSGPRClass(RC: Src1SubRC))
8322 Src1SubRC = RI.getEquivalentVGPRClass(SRC: Src1SubRC);
8323
8324 // First, we extract the low 32-bit and high 32-bit values from each of the
8325 // operands.
8326 MachineOperand Op0L =
8327 buildExtractSubRegOrImm(MII, MRI, Op: Src0, SuperRC: Src0RC, SubIdx: AMDGPU::sub0, SubRC: Src0SubRC);
8328 MachineOperand Op1L =
8329 buildExtractSubRegOrImm(MII, MRI, Op: Src1, SuperRC: Src1RC, SubIdx: AMDGPU::sub0, SubRC: Src1SubRC);
8330 MachineOperand Op0H =
8331 buildExtractSubRegOrImm(MII, MRI, Op: Src0, SuperRC: Src0RC, SubIdx: AMDGPU::sub1, SubRC: Src0SubRC);
8332 MachineOperand Op1H =
8333 buildExtractSubRegOrImm(MII, MRI, Op: Src1, SuperRC: Src1RC, SubIdx: AMDGPU::sub1, SubRC: Src1SubRC);
8334
8335 // The multiplication is done as follows:
8336 //
8337 // Op1H Op1L
8338 // * Op0H Op0L
8339 // --------------------
8340 // Op1H*Op0L Op1L*Op0L
8341 // + Op1H*Op0H Op1L*Op0H
8342 // -----------------------------------------
8343 // (Op1H*Op0L + Op1L*Op0H + carry) Op1L*Op0L
8344 //
8345 // We drop Op1H*Op0H because its contribution starts at bit 64 and is
8346 // therefore truncated away from the 64-bit result.
8347 // The low 32-bit value is Op1L*Op0L.
8348 // The high 32-bit value is Op1H*Op0L + Op1L*Op0H + carry (from Op1L*Op0L).
8349
8350 Register Op1L_Op0H_Reg = MRI.createVirtualRegister(RegClass: &AMDGPU::VGPR_32RegClass);
8351 MachineInstr *Op1L_Op0H =
8352 BuildMI(BB&: MBB, I: MII, MIMD: DL, MCID: get(Opcode: AMDGPU::V_MUL_LO_U32_e64), DestReg: Op1L_Op0H_Reg)
8353 .add(MO: Op1L)
8354 .add(MO: Op0H);
8355
8356 Register Op1H_Op0L_Reg = MRI.createVirtualRegister(RegClass: &AMDGPU::VGPR_32RegClass);
8357 MachineInstr *Op1H_Op0L =
8358 BuildMI(BB&: MBB, I: MII, MIMD: DL, MCID: get(Opcode: AMDGPU::V_MUL_LO_U32_e64), DestReg: Op1H_Op0L_Reg)
8359 .add(MO: Op1H)
8360 .add(MO: Op0L);
8361
8362 Register CarryReg = MRI.createVirtualRegister(RegClass: &AMDGPU::VGPR_32RegClass);
8363 MachineInstr *Carry =
8364 BuildMI(BB&: MBB, I: MII, MIMD: DL, MCID: get(Opcode: AMDGPU::V_MUL_HI_U32_e64), DestReg: CarryReg)
8365 .add(MO: Op1L)
8366 .add(MO: Op0L);
8367
8368 MachineInstr *LoHalf =
8369 BuildMI(BB&: MBB, I: MII, MIMD: DL, MCID: get(Opcode: AMDGPU::V_MUL_LO_U32_e64), DestReg: DestSub0)
8370 .add(MO: Op1L)
8371 .add(MO: Op0L);
8372
8373 Register AddReg = MRI.createVirtualRegister(RegClass: &AMDGPU::VGPR_32RegClass);
8374 MachineInstr *Add = BuildMI(BB&: MBB, I: MII, MIMD: DL, MCID: get(Opcode: AMDGPU::V_ADD_U32_e32), DestReg: AddReg)
8375 .addReg(RegNo: Op1L_Op0H_Reg)
8376 .addReg(RegNo: Op1H_Op0L_Reg);
8377
8378 MachineInstr *HiHalf =
8379 BuildMI(BB&: MBB, I: MII, MIMD: DL, MCID: get(Opcode: AMDGPU::V_ADD_U32_e32), DestReg: DestSub1)
8380 .addReg(RegNo: AddReg)
8381 .addReg(RegNo: CarryReg);
8382
8383 BuildMI(BB&: MBB, I: MII, MIMD: DL, MCID: get(Opcode: TargetOpcode::REG_SEQUENCE), DestReg: FullDestReg)
8384 .addReg(RegNo: DestSub0)
8385 .addImm(Val: AMDGPU::sub0)
8386 .addReg(RegNo: DestSub1)
8387 .addImm(Val: AMDGPU::sub1);
8388
8389 MRI.replaceRegWith(FromReg: Dest.getReg(), ToReg: FullDestReg);
8390
8391 // Try to legalize the operands in case we need to swap the order to keep it
8392 // valid.
8393 legalizeOperands(MI&: *Op1L_Op0H, MDT);
8394 legalizeOperands(MI&: *Op1H_Op0L, MDT);
8395 legalizeOperands(MI&: *Carry, MDT);
8396 legalizeOperands(MI&: *LoHalf, MDT);
8397 legalizeOperands(MI&: *Add, MDT);
8398 legalizeOperands(MI&: *HiHalf, MDT);
8399
8400 // Move all users of this moved value.
8401 addUsersToMoveToVALUWorklist(Reg: FullDestReg, MRI, Worklist);
8402}
8403
8404// Lower S_MUL_U64_U32_PSEUDO/S_MUL_I64_I32_PSEUDO into two 32-bit vector
8405// multiplications.
8406void SIInstrInfo::splitScalarSMulPseudo(SIInstrWorklist &Worklist,
8407 MachineInstr &Inst,
8408 MachineDominatorTree *MDT) const {
8409 MachineBasicBlock &MBB = *Inst.getParent();
8410 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
8411
8412 Register FullDestReg = MRI.createVirtualRegister(RegClass: &AMDGPU::VReg_64RegClass);
8413 Register DestSub0 = MRI.createVirtualRegister(RegClass: &AMDGPU::VGPR_32RegClass);
8414 Register DestSub1 = MRI.createVirtualRegister(RegClass: &AMDGPU::VGPR_32RegClass);
8415
8416 MachineOperand &Dest = Inst.getOperand(i: 0);
8417 MachineOperand &Src0 = Inst.getOperand(i: 1);
8418 MachineOperand &Src1 = Inst.getOperand(i: 2);
8419 const DebugLoc &DL = Inst.getDebugLoc();
8420 MachineBasicBlock::iterator MII = Inst;
8421
8422 const TargetRegisterClass *Src0RC = MRI.getRegClass(Reg: Src0.getReg());
8423 const TargetRegisterClass *Src1RC = MRI.getRegClass(Reg: Src1.getReg());
8424 const TargetRegisterClass *Src0SubRC =
8425 RI.getSubRegisterClass(Src0RC, AMDGPU::sub0);
8426 if (RI.isSGPRClass(RC: Src0SubRC))
8427 Src0SubRC = RI.getEquivalentVGPRClass(SRC: Src0SubRC);
8428 const TargetRegisterClass *Src1SubRC =
8429 RI.getSubRegisterClass(Src1RC, AMDGPU::sub0);
8430 if (RI.isSGPRClass(RC: Src1SubRC))
8431 Src1SubRC = RI.getEquivalentVGPRClass(SRC: Src1SubRC);
8432
8433 // First, we extract the low 32-bit and high 32-bit values from each of the
8434 // operands.
8435 MachineOperand Op0L =
8436 buildExtractSubRegOrImm(MII, MRI, Op: Src0, SuperRC: Src0RC, SubIdx: AMDGPU::sub0, SubRC: Src0SubRC);
8437 MachineOperand Op1L =
8438 buildExtractSubRegOrImm(MII, MRI, Op: Src1, SuperRC: Src1RC, SubIdx: AMDGPU::sub0, SubRC: Src1SubRC);
8439
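  // The pseudo records whether both sources were zero- or sign-extended from
  // 32 bits, which selects an unsigned or signed 32x32 high multiply.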
8440 unsigned Opc = Inst.getOpcode();
8441 unsigned NewOpc = Opc == AMDGPU::S_MUL_U64_U32_PSEUDO
8442 ? AMDGPU::V_MUL_HI_U32_e64
8443 : AMDGPU::V_MUL_HI_I32_e64;
8444 MachineInstr *HiHalf =
8445 BuildMI(BB&: MBB, I: MII, MIMD: DL, MCID: get(Opcode: NewOpc), DestReg: DestSub1).add(MO: Op1L).add(MO: Op0L);
8446
8447 MachineInstr *LoHalf =
8448 BuildMI(BB&: MBB, I: MII, MIMD: DL, MCID: get(Opcode: AMDGPU::V_MUL_LO_U32_e64), DestReg: DestSub0)
8449 .add(MO: Op1L)
8450 .add(MO: Op0L);
8451
8452 BuildMI(BB&: MBB, I: MII, MIMD: DL, MCID: get(Opcode: TargetOpcode::REG_SEQUENCE), DestReg: FullDestReg)
8453 .addReg(RegNo: DestSub0)
8454 .addImm(Val: AMDGPU::sub0)
8455 .addReg(RegNo: DestSub1)
8456 .addImm(Val: AMDGPU::sub1);
8457
8458 MRI.replaceRegWith(FromReg: Dest.getReg(), ToReg: FullDestReg);
8459
8460 // Try to legalize the operands in case we need to swap the order to keep it
8461 // valid.
8462 legalizeOperands(MI&: *HiHalf, MDT);
8463 legalizeOperands(MI&: *LoHalf, MDT);
8464
8465 // Move all users of this moved value.
8466 addUsersToMoveToVALUWorklist(Reg: FullDestReg, MRI, Worklist);
8467}
8468
8469void SIInstrInfo::splitScalar64BitBinaryOp(SIInstrWorklist &Worklist,
8470 MachineInstr &Inst, unsigned Opcode,
8471 MachineDominatorTree *MDT) const {
8472 MachineBasicBlock &MBB = *Inst.getParent();
8473 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
8474
8475 MachineOperand &Dest = Inst.getOperand(i: 0);
8476 MachineOperand &Src0 = Inst.getOperand(i: 1);
8477 MachineOperand &Src1 = Inst.getOperand(i: 2);
8478 DebugLoc DL = Inst.getDebugLoc();
8479
8480 MachineBasicBlock::iterator MII = Inst;
8481
8482 const MCInstrDesc &InstDesc = get(Opcode);
8483 const TargetRegisterClass *Src0RC = Src0.isReg() ?
8484 MRI.getRegClass(Reg: Src0.getReg()) :
8485 &AMDGPU::SGPR_32RegClass;
8486
8487 const TargetRegisterClass *Src0SubRC =
8488 RI.getSubRegisterClass(Src0RC, AMDGPU::sub0);
8489 const TargetRegisterClass *Src1RC = Src1.isReg() ?
8490 MRI.getRegClass(Reg: Src1.getReg()) :
8491 &AMDGPU::SGPR_32RegClass;
8492
8493 const TargetRegisterClass *Src1SubRC =
8494 RI.getSubRegisterClass(Src1RC, AMDGPU::sub0);
8495
8496 MachineOperand SrcReg0Sub0 = buildExtractSubRegOrImm(MII, MRI, Op: Src0, SuperRC: Src0RC,
8497 SubIdx: AMDGPU::sub0, SubRC: Src0SubRC);
8498 MachineOperand SrcReg1Sub0 = buildExtractSubRegOrImm(MII, MRI, Op: Src1, SuperRC: Src1RC,
8499 SubIdx: AMDGPU::sub0, SubRC: Src1SubRC);
8500 MachineOperand SrcReg0Sub1 = buildExtractSubRegOrImm(MII, MRI, Op: Src0, SuperRC: Src0RC,
8501 SubIdx: AMDGPU::sub1, SubRC: Src0SubRC);
8502 MachineOperand SrcReg1Sub1 = buildExtractSubRegOrImm(MII, MRI, Op: Src1, SuperRC: Src1RC,
8503 SubIdx: AMDGPU::sub1, SubRC: Src1SubRC);
8504
8505 const TargetRegisterClass *DestRC = MRI.getRegClass(Reg: Dest.getReg());
8506 const TargetRegisterClass *NewDestRC = RI.getEquivalentVGPRClass(SRC: DestRC);
8507 const TargetRegisterClass *NewDestSubRC =
8508 RI.getSubRegisterClass(NewDestRC, AMDGPU::sub0);
8509
8510 Register DestSub0 = MRI.createVirtualRegister(RegClass: NewDestSubRC);
8511 MachineInstr &LoHalf = *BuildMI(BB&: MBB, I: MII, MIMD: DL, MCID: InstDesc, DestReg: DestSub0)
8512 .add(MO: SrcReg0Sub0)
8513 .add(MO: SrcReg1Sub0);
8514
8515 Register DestSub1 = MRI.createVirtualRegister(RegClass: NewDestSubRC);
8516 MachineInstr &HiHalf = *BuildMI(BB&: MBB, I: MII, MIMD: DL, MCID: InstDesc, DestReg: DestSub1)
8517 .add(MO: SrcReg0Sub1)
8518 .add(MO: SrcReg1Sub1);
8519
8520 Register FullDestReg = MRI.createVirtualRegister(RegClass: NewDestRC);
8521 BuildMI(BB&: MBB, I: MII, MIMD: DL, MCID: get(Opcode: TargetOpcode::REG_SEQUENCE), DestReg: FullDestReg)
8522 .addReg(RegNo: DestSub0)
8523 .addImm(Val: AMDGPU::sub0)
8524 .addReg(RegNo: DestSub1)
8525 .addImm(Val: AMDGPU::sub1);
8526
8527 MRI.replaceRegWith(FromReg: Dest.getReg(), ToReg: FullDestReg);
8528
8529 Worklist.insert(MI: &LoHalf);
8530 Worklist.insert(MI: &HiHalf);
8531
8532 // Move all users of this moved value.
8533 addUsersToMoveToVALUWorklist(Reg: FullDestReg, MRI, Worklist);
8534}
8535
8536void SIInstrInfo::splitScalar64BitXnor(SIInstrWorklist &Worklist,
8537 MachineInstr &Inst,
8538 MachineDominatorTree *MDT) const {
8539 MachineBasicBlock &MBB = *Inst.getParent();
8540 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
8541
8542 MachineOperand &Dest = Inst.getOperand(i: 0);
8543 MachineOperand &Src0 = Inst.getOperand(i: 1);
8544 MachineOperand &Src1 = Inst.getOperand(i: 2);
8545 const DebugLoc &DL = Inst.getDebugLoc();
8546
8547 MachineBasicBlock::iterator MII = Inst;
8548
8549 const TargetRegisterClass *DestRC = MRI.getRegClass(Reg: Dest.getReg());
8550
8551 Register Interm = MRI.createVirtualRegister(RegClass: &AMDGPU::SReg_64RegClass);
8552
8553 MachineOperand* Op0;
8554 MachineOperand* Op1;
8555
8556 if (Src0.isReg() && RI.isSGPRReg(MRI, Reg: Src0.getReg())) {
8557 Op0 = &Src0;
8558 Op1 = &Src1;
8559 } else {
8560 Op0 = &Src1;
8561 Op1 = &Src0;
8562 }
8563
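  // Prefer to apply the s_not to an SGPR operand so the inversion can stay on
  // the scalar unit; the s_xor is added to the worklist for further lowering.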
8564 BuildMI(BB&: MBB, I: MII, MIMD: DL, MCID: get(Opcode: AMDGPU::S_NOT_B64), DestReg: Interm)
8565 .add(MO: *Op0);
8566
8567 Register NewDest = MRI.createVirtualRegister(RegClass: DestRC);
8568
8569 MachineInstr &Xor = *BuildMI(BB&: MBB, I: MII, MIMD: DL, MCID: get(Opcode: AMDGPU::S_XOR_B64), DestReg: NewDest)
8570 .addReg(RegNo: Interm)
8571 .add(MO: *Op1);
8572
8573 MRI.replaceRegWith(FromReg: Dest.getReg(), ToReg: NewDest);
8574
8575 Worklist.insert(MI: &Xor);
8576}
8577
8578void SIInstrInfo::splitScalar64BitBCNT(SIInstrWorklist &Worklist,
8579 MachineInstr &Inst) const {
8580 MachineBasicBlock &MBB = *Inst.getParent();
8581 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
8582
8583 MachineBasicBlock::iterator MII = Inst;
8584 const DebugLoc &DL = Inst.getDebugLoc();
8585
8586 MachineOperand &Dest = Inst.getOperand(i: 0);
8587 MachineOperand &Src = Inst.getOperand(i: 1);
8588
8589 const MCInstrDesc &InstDesc = get(Opcode: AMDGPU::V_BCNT_U32_B32_e64);
8590 const TargetRegisterClass *SrcRC = Src.isReg() ?
8591 MRI.getRegClass(Reg: Src.getReg()) :
8592 &AMDGPU::SGPR_32RegClass;
8593
8594 Register MidReg = MRI.createVirtualRegister(RegClass: &AMDGPU::VGPR_32RegClass);
8595 Register ResultReg = MRI.createVirtualRegister(RegClass: &AMDGPU::VGPR_32RegClass);
8596
8597 const TargetRegisterClass *SrcSubRC =
8598 RI.getSubRegisterClass(SrcRC, AMDGPU::sub0);
8599
8600 MachineOperand SrcRegSub0 = buildExtractSubRegOrImm(MII, MRI, Op: Src, SuperRC: SrcRC,
8601 SubIdx: AMDGPU::sub0, SubRC: SrcSubRC);
8602 MachineOperand SrcRegSub1 = buildExtractSubRegOrImm(MII, MRI, Op: Src, SuperRC: SrcRC,
8603 SubIdx: AMDGPU::sub1, SubRC: SrcSubRC);
8604
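  // V_BCNT_U32_B32 computes popcount(src0) + src1, so chain the two halves:
  // count the low half starting from 0, then accumulate the high half's count.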
8605 BuildMI(BB&: MBB, I: MII, MIMD: DL, MCID: InstDesc, DestReg: MidReg).add(MO: SrcRegSub0).addImm(Val: 0);
8606
8607 BuildMI(BB&: MBB, I: MII, MIMD: DL, MCID: InstDesc, DestReg: ResultReg).add(MO: SrcRegSub1).addReg(RegNo: MidReg);
8608
8609 MRI.replaceRegWith(FromReg: Dest.getReg(), ToReg: ResultReg);
8610
8611 // We don't need to legalize operands here. src0 for either instruction can be
8612 // an SGPR, and the second input is unused or determined here.
8613 addUsersToMoveToVALUWorklist(Reg: ResultReg, MRI, Worklist);
8614}
8615
8616void SIInstrInfo::splitScalar64BitBFE(SIInstrWorklist &Worklist,
8617 MachineInstr &Inst) const {
8618 MachineBasicBlock &MBB = *Inst.getParent();
8619 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
8620 MachineBasicBlock::iterator MII = Inst;
8621 const DebugLoc &DL = Inst.getDebugLoc();
8622
8623 MachineOperand &Dest = Inst.getOperand(i: 0);
8624 uint32_t Imm = Inst.getOperand(i: 2).getImm();
8625 uint32_t Offset = Imm & 0x3f; // Extract bits [5:0].
8626 uint32_t BitWidth = (Imm & 0x7f0000) >> 16; // Extract bits [22:16].
8627
8628 (void) Offset;
8629
8630 // Only sext_inreg cases handled.
8631 assert(Inst.getOpcode() == AMDGPU::S_BFE_I64 && BitWidth <= 32 &&
8632 Offset == 0 && "Not implemented");
8633
8634 if (BitWidth < 32) {
8635 Register MidRegLo = MRI.createVirtualRegister(RegClass: &AMDGPU::VGPR_32RegClass);
8636 Register MidRegHi = MRI.createVirtualRegister(RegClass: &AMDGPU::VGPR_32RegClass);
8637 Register ResultReg = MRI.createVirtualRegister(RegClass: &AMDGPU::VReg_64RegClass);
8638
8639 BuildMI(BB&: MBB, I: MII, MIMD: DL, MCID: get(Opcode: AMDGPU::V_BFE_I32_e64), DestReg: MidRegLo)
8640 .addReg(RegNo: Inst.getOperand(i: 1).getReg(), flags: 0, SubReg: AMDGPU::sub0)
8641 .addImm(Val: 0)
8642 .addImm(Val: BitWidth);
8643
8644 BuildMI(BB&: MBB, I: MII, MIMD: DL, MCID: get(Opcode: AMDGPU::V_ASHRREV_I32_e32), DestReg: MidRegHi)
8645 .addImm(Val: 31)
8646 .addReg(RegNo: MidRegLo);
8647
8648 BuildMI(BB&: MBB, I: MII, MIMD: DL, MCID: get(Opcode: TargetOpcode::REG_SEQUENCE), DestReg: ResultReg)
8649 .addReg(RegNo: MidRegLo)
8650 .addImm(Val: AMDGPU::sub0)
8651 .addReg(RegNo: MidRegHi)
8652 .addImm(Val: AMDGPU::sub1);
8653
8654 MRI.replaceRegWith(FromReg: Dest.getReg(), ToReg: ResultReg);
8655 addUsersToMoveToVALUWorklist(Reg: ResultReg, MRI, Worklist);
8656 return;
8657 }
8658
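  // BitWidth == 32: the low half is the source's low 32 bits unchanged and the
  // high half is filled with its sign bit.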
8659 MachineOperand &Src = Inst.getOperand(i: 1);
8660 Register TmpReg = MRI.createVirtualRegister(RegClass: &AMDGPU::VGPR_32RegClass);
8661 Register ResultReg = MRI.createVirtualRegister(RegClass: &AMDGPU::VReg_64RegClass);
8662
8663 BuildMI(BB&: MBB, I: MII, MIMD: DL, MCID: get(Opcode: AMDGPU::V_ASHRREV_I32_e64), DestReg: TmpReg)
8664 .addImm(Val: 31)
8665 .addReg(RegNo: Src.getReg(), flags: 0, SubReg: AMDGPU::sub0);
8666
8667 BuildMI(BB&: MBB, I: MII, MIMD: DL, MCID: get(Opcode: TargetOpcode::REG_SEQUENCE), DestReg: ResultReg)
8668 .addReg(RegNo: Src.getReg(), flags: 0, SubReg: AMDGPU::sub0)
8669 .addImm(Val: AMDGPU::sub0)
8670 .addReg(RegNo: TmpReg)
8671 .addImm(Val: AMDGPU::sub1);
8672
8673 MRI.replaceRegWith(FromReg: Dest.getReg(), ToReg: ResultReg);
8674 addUsersToMoveToVALUWorklist(Reg: ResultReg, MRI, Worklist);
8675}
8676
8677void SIInstrInfo::splitScalar64BitCountOp(SIInstrWorklist &Worklist,
8678 MachineInstr &Inst, unsigned Opcode,
8679 MachineDominatorTree *MDT) const {
8680 // (S_FLBIT_I32_B64 hi:lo) ->
8681 //   (umin (V_FFBH_U32_e32 hi), (uaddsat (V_FFBH_U32_e32 lo), 32))
8682 // (S_FF1_I32_B64 hi:lo) ->
8683 //   (umin (uaddsat (V_FFBL_B32_e32 hi), 32), (V_FFBL_B32_e32 lo))
8684
8685 MachineBasicBlock &MBB = *Inst.getParent();
8686 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
8687 MachineBasicBlock::iterator MII = Inst;
8688 const DebugLoc &DL = Inst.getDebugLoc();
8689
8690 MachineOperand &Dest = Inst.getOperand(i: 0);
8691 MachineOperand &Src = Inst.getOperand(i: 1);
8692
8693 const MCInstrDesc &InstDesc = get(Opcode);
8694
8695 bool IsCtlz = Opcode == AMDGPU::V_FFBH_U32_e32;
8696 unsigned OpcodeAdd =
8697 ST.hasAddNoCarry() ? AMDGPU::V_ADD_U32_e64 : AMDGPU::V_ADD_CO_U32_e32;
8698
8699 const TargetRegisterClass *SrcRC =
8700 Src.isReg() ? MRI.getRegClass(Reg: Src.getReg()) : &AMDGPU::SGPR_32RegClass;
8701 const TargetRegisterClass *SrcSubRC =
8702 RI.getSubRegisterClass(SrcRC, AMDGPU::sub0);
8703
8704 MachineOperand SrcRegSub0 =
8705 buildExtractSubRegOrImm(MII, MRI, Op: Src, SuperRC: SrcRC, SubIdx: AMDGPU::sub0, SubRC: SrcSubRC);
8706 MachineOperand SrcRegSub1 =
8707 buildExtractSubRegOrImm(MII, MRI, Op: Src, SuperRC: SrcRC, SubIdx: AMDGPU::sub1, SubRC: SrcSubRC);
8708
8709 Register MidReg1 = MRI.createVirtualRegister(RegClass: &AMDGPU::VGPR_32RegClass);
8710 Register MidReg2 = MRI.createVirtualRegister(RegClass: &AMDGPU::VGPR_32RegClass);
8711 Register MidReg3 = MRI.createVirtualRegister(RegClass: &AMDGPU::VGPR_32RegClass);
8712 Register MidReg4 = MRI.createVirtualRegister(RegClass: &AMDGPU::VGPR_32RegClass);
8713
8714 BuildMI(BB&: MBB, I: MII, MIMD: DL, MCID: InstDesc, DestReg: MidReg1).add(MO: SrcRegSub0);
8715
8716 BuildMI(BB&: MBB, I: MII, MIMD: DL, MCID: InstDesc, DestReg: MidReg2).add(MO: SrcRegSub1);
8717
8718 BuildMI(BB&: MBB, I: MII, MIMD: DL, MCID: get(Opcode: OpcodeAdd), DestReg: MidReg3)
8719 .addReg(RegNo: IsCtlz ? MidReg1 : MidReg2)
8720 .addImm(Val: 32)
8721 .addImm(Val: 1); // enable clamp
8722
8723 BuildMI(BB&: MBB, I: MII, MIMD: DL, MCID: get(Opcode: AMDGPU::V_MIN_U32_e64), DestReg: MidReg4)
8724 .addReg(RegNo: MidReg3)
8725 .addReg(RegNo: IsCtlz ? MidReg2 : MidReg1);
8726
8727 MRI.replaceRegWith(FromReg: Dest.getReg(), ToReg: MidReg4);
8728
8729 addUsersToMoveToVALUWorklist(Reg: MidReg4, MRI, Worklist);
8730}
8731
8732void SIInstrInfo::addUsersToMoveToVALUWorklist(
8733 Register DstReg, MachineRegisterInfo &MRI,
8734 SIInstrWorklist &Worklist) const {
8735 for (MachineOperand &MO : make_early_inc_range(Range: MRI.use_operands(Reg: DstReg))) {
8736 MachineInstr &UseMI = *MO.getParent();
8737
8738 unsigned OpNo = 0;
8739
8740 switch (UseMI.getOpcode()) {
8741 case AMDGPU::COPY:
8742 case AMDGPU::WQM:
8743 case AMDGPU::SOFT_WQM:
8744 case AMDGPU::STRICT_WWM:
8745 case AMDGPU::STRICT_WQM:
8746 case AMDGPU::REG_SEQUENCE:
8747 case AMDGPU::PHI:
8748 case AMDGPU::INSERT_SUBREG:
8749 break;
8750 default:
8751 OpNo = MO.getOperandNo();
8752 break;
8753 }
8754
8755 if (!RI.hasVectorRegisters(RC: getOpRegClass(MI: UseMI, OpNo)))
8756 Worklist.insert(MI: &UseMI);
8757 else
8758 // Legalization could change user list.
8759 legalizeOperandsVALUt16(MI&: UseMI, OpIdx: OpNo, MRI);
8760 }
8761}
8762
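// Expand an S_PACK_*_B32_B16 pseudo into VALU bit manipulation, producing the
// packed 32-bit result in a fresh VGPR and rewriting all users of the original
// destination to use it.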
8763void SIInstrInfo::movePackToVALU(SIInstrWorklist &Worklist,
8764 MachineRegisterInfo &MRI,
8765 MachineInstr &Inst) const {
8766 Register ResultReg = MRI.createVirtualRegister(RegClass: &AMDGPU::VGPR_32RegClass);
8767 MachineBasicBlock *MBB = Inst.getParent();
8768 MachineOperand &Src0 = Inst.getOperand(i: 1);
8769 MachineOperand &Src1 = Inst.getOperand(i: 2);
8770 const DebugLoc &DL = Inst.getDebugLoc();
8771
8772 switch (Inst.getOpcode()) {
8773 case AMDGPU::S_PACK_LL_B32_B16: {
8774 Register ImmReg = MRI.createVirtualRegister(RegClass: &AMDGPU::VGPR_32RegClass);
8775 Register TmpReg = MRI.createVirtualRegister(RegClass: &AMDGPU::VGPR_32RegClass);
8776
8777 // FIXME: Can do a lot better if we know the high bits of src0 or src1 are
8778 // 0.
8779 BuildMI(BB&: *MBB, I&: Inst, MIMD: DL, MCID: get(Opcode: AMDGPU::V_MOV_B32_e32), DestReg: ImmReg)
8780 .addImm(Val: 0xffff);
8781
8782 BuildMI(BB&: *MBB, I&: Inst, MIMD: DL, MCID: get(Opcode: AMDGPU::V_AND_B32_e64), DestReg: TmpReg)
8783 .addReg(RegNo: ImmReg, flags: RegState::Kill)
8784 .add(MO: Src0);
8785
8786 BuildMI(BB&: *MBB, I&: Inst, MIMD: DL, MCID: get(Opcode: AMDGPU::V_LSHL_OR_B32_e64), DestReg: ResultReg)
8787 .add(MO: Src1)
8788 .addImm(Val: 16)
8789 .addReg(RegNo: TmpReg, flags: RegState::Kill);
8790 break;
8791 }
8792 case AMDGPU::S_PACK_LH_B32_B16: {
8793 Register ImmReg = MRI.createVirtualRegister(RegClass: &AMDGPU::VGPR_32RegClass);
8794 BuildMI(BB&: *MBB, I&: Inst, MIMD: DL, MCID: get(Opcode: AMDGPU::V_MOV_B32_e32), DestReg: ImmReg)
8795 .addImm(Val: 0xffff);
8796 BuildMI(BB&: *MBB, I&: Inst, MIMD: DL, MCID: get(Opcode: AMDGPU::V_BFI_B32_e64), DestReg: ResultReg)
8797 .addReg(RegNo: ImmReg, flags: RegState::Kill)
8798 .add(MO: Src0)
8799 .add(MO: Src1);
8800 break;
8801 }
8802 case AMDGPU::S_PACK_HL_B32_B16: {
8803 Register TmpReg = MRI.createVirtualRegister(RegClass: &AMDGPU::VGPR_32RegClass);
8804 BuildMI(BB&: *MBB, I&: Inst, MIMD: DL, MCID: get(Opcode: AMDGPU::V_LSHRREV_B32_e64), DestReg: TmpReg)
8805 .addImm(Val: 16)
8806 .add(MO: Src0);
8807 BuildMI(BB&: *MBB, I&: Inst, MIMD: DL, MCID: get(Opcode: AMDGPU::V_LSHL_OR_B32_e64), DestReg: ResultReg)
8808 .add(MO: Src1)
8809 .addImm(Val: 16)
8810 .addReg(RegNo: TmpReg, flags: RegState::Kill);
8811 break;
8812 }
8813 case AMDGPU::S_PACK_HH_B32_B16: {
8814 Register ImmReg = MRI.createVirtualRegister(RegClass: &AMDGPU::VGPR_32RegClass);
8815 Register TmpReg = MRI.createVirtualRegister(RegClass: &AMDGPU::VGPR_32RegClass);
8816 BuildMI(BB&: *MBB, I&: Inst, MIMD: DL, MCID: get(Opcode: AMDGPU::V_LSHRREV_B32_e64), DestReg: TmpReg)
8817 .addImm(Val: 16)
8818 .add(MO: Src0);
8819 BuildMI(BB&: *MBB, I&: Inst, MIMD: DL, MCID: get(Opcode: AMDGPU::V_MOV_B32_e32), DestReg: ImmReg)
8820 .addImm(Val: 0xffff0000);
8821 BuildMI(BB&: *MBB, I&: Inst, MIMD: DL, MCID: get(Opcode: AMDGPU::V_AND_OR_B32_e64), DestReg: ResultReg)
8822 .add(MO: Src1)
8823 .addReg(RegNo: ImmReg, flags: RegState::Kill)
8824 .addReg(RegNo: TmpReg, flags: RegState::Kill);
8825 break;
8826 }
8827 default:
8828 llvm_unreachable("unhandled s_pack_* instruction");
8829 }
8830
8831 MachineOperand &Dest = Inst.getOperand(i: 0);
8832 MRI.replaceRegWith(FromReg: Dest.getReg(), ToReg: ResultReg);
8833 addUsersToMoveToVALUWorklist(DstReg: ResultReg, MRI, Worklist);
8834}
8835
8836void SIInstrInfo::addSCCDefUsersToVALUWorklist(MachineOperand &Op,
8837 MachineInstr &SCCDefInst,
8838 SIInstrWorklist &Worklist,
8839 Register NewCond) const {
8840
8841 // Ensure that def inst defines SCC, which is still live.
8842 assert(Op.isReg() && Op.getReg() == AMDGPU::SCC && Op.isDef() &&
8843 !Op.isDead() && Op.getParent() == &SCCDefInst);
8844 SmallVector<MachineInstr *, 4> CopyToDelete;
8845 // This assumes that all the users of SCC are in the same block
8846 // as the SCC def.
8847 for (MachineInstr &MI : // Skip the def inst itself.
8848 make_range(x: std::next(x: MachineBasicBlock::iterator(SCCDefInst)),
8849 y: SCCDefInst.getParent()->end())) {
8850 // Check if SCC is used first.
8851 int SCCIdx = MI.findRegisterUseOperandIdx(Reg: AMDGPU::SCC, TRI: &RI, isKill: false);
8852 if (SCCIdx != -1) {
8853 if (MI.isCopy()) {
8854 MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
8855 Register DestReg = MI.getOperand(i: 0).getReg();
8856
8857 MRI.replaceRegWith(FromReg: DestReg, ToReg: NewCond);
8858 CopyToDelete.push_back(Elt: &MI);
8859 } else {
8860
8861 if (NewCond.isValid())
8862 MI.getOperand(i: SCCIdx).setReg(NewCond);
8863
8864 Worklist.insert(MI: &MI);
8865 }
8866 }
8867 // Exit if we find another SCC def.
8868 if (MI.findRegisterDefOperandIdx(Reg: AMDGPU::SCC, TRI: &RI, isDead: false, Overlap: false) != -1)
8869 break;
8870 }
8871 for (auto &Copy : CopyToDelete)
8872 Copy->eraseFromParent();
8873}
8874
8875// Instructions that use SCC may be converted to VALU instructions. When that
8876// happens, the SCC register is changed to VCC_LO. The instruction that defines
8877// SCC must be changed to an instruction that defines VCC. This function makes
8878// sure that the instruction that defines SCC is added to the moveToVALU
8879// worklist.
8880void SIInstrInfo::addSCCDefsToVALUWorklist(MachineInstr *SCCUseInst,
8881 SIInstrWorklist &Worklist) const {
8882 // Look for a preceding instruction that either defines VCC or SCC. If VCC
8883 // then there is nothing to do because the defining instruction has been
8884 // converted to a VALU already. If SCC then that instruction needs to be
8885 // converted to a VALU.
8886 for (MachineInstr &MI :
8887 make_range(x: std::next(x: MachineBasicBlock::reverse_iterator(SCCUseInst)),
8888 y: SCCUseInst->getParent()->rend())) {
8889 if (MI.modifiesRegister(Reg: AMDGPU::VCC, TRI: &RI))
8890 break;
8891 if (MI.definesRegister(Reg: AMDGPU::SCC, TRI: &RI)) {
8892 Worklist.insert(MI: &MI);
8893 break;
8894 }
8895 }
8896}
8897
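// Compute the register class the destination of \p Inst should use once the
// instruction is moved to the VALU. For copy-like pseudos this maps the
// current class to an equivalent VGPR (or, for AGPR sources, AGPR) class; a
// null result means the destination either needs no rewrite or cannot be
// rewritten.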
8898const TargetRegisterClass *SIInstrInfo::getDestEquivalentVGPRClass(
8899 const MachineInstr &Inst) const {
8900 const TargetRegisterClass *NewDstRC = getOpRegClass(MI: Inst, OpNo: 0);
8901
8902 switch (Inst.getOpcode()) {
8903 // For target instructions, getOpRegClass just returns the virtual register
8904 // class associated with the operand, so we need to find an equivalent VGPR
8905 // register class in order to move the instruction to the VALU.
8906 case AMDGPU::COPY:
8907 case AMDGPU::PHI:
8908 case AMDGPU::REG_SEQUENCE:
8909 case AMDGPU::INSERT_SUBREG:
8910 case AMDGPU::WQM:
8911 case AMDGPU::SOFT_WQM:
8912 case AMDGPU::STRICT_WWM:
8913 case AMDGPU::STRICT_WQM: {
8914 const TargetRegisterClass *SrcRC = getOpRegClass(MI: Inst, OpNo: 1);
8915 if (RI.isAGPRClass(RC: SrcRC)) {
8916 if (RI.isAGPRClass(RC: NewDstRC))
8917 return nullptr;
8918
8919 switch (Inst.getOpcode()) {
8920 case AMDGPU::PHI:
8921 case AMDGPU::REG_SEQUENCE:
8922 case AMDGPU::INSERT_SUBREG:
8923 NewDstRC = RI.getEquivalentAGPRClass(SRC: NewDstRC);
8924 break;
8925 default:
8926 NewDstRC = RI.getEquivalentVGPRClass(SRC: NewDstRC);
8927 }
8928
8929 if (!NewDstRC)
8930 return nullptr;
8931 } else {
8932 if (RI.isVGPRClass(RC: NewDstRC) || NewDstRC == &AMDGPU::VReg_1RegClass)
8933 return nullptr;
8934
8935 NewDstRC = RI.getEquivalentVGPRClass(SRC: NewDstRC);
8936 if (!NewDstRC)
8937 return nullptr;
8938 }
8939
8940 return NewDstRC;
8941 }
8942 default:
8943 return NewDstRC;
8944 }
8945}
8946
8947// Find the one SGPR operand we are allowed to use.
8948Register SIInstrInfo::findUsedSGPR(const MachineInstr &MI,
8949 int OpIndices[3]) const {
8950 const MCInstrDesc &Desc = MI.getDesc();
8951
8952 // Find the one SGPR operand we are allowed to use.
8953 //
8954 // First we need to consider the instruction's operand requirements before
8955 // legalizing. Some operands are required to be SGPRs, such as implicit uses
8956 // of VCC, but we are still bound by the constant bus requirement to only use
8957 // one.
8958 //
8959 // If the operand's class is an SGPR, we can never move it.
8960
8961 Register SGPRReg = findImplicitSGPRRead(MI);
8962 if (SGPRReg)
8963 return SGPRReg;
8964
8965 Register UsedSGPRs[3] = {Register()};
8966 const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
8967
8968 for (unsigned i = 0; i < 3; ++i) {
8969 int Idx = OpIndices[i];
8970 if (Idx == -1)
8971 break;
8972
8973 const MachineOperand &MO = MI.getOperand(i: Idx);
8974 if (!MO.isReg())
8975 continue;
8976
8977 // Is this operand statically required to be an SGPR based on the operand
8978 // constraints?
8979 const TargetRegisterClass *OpRC =
8980 RI.getRegClass(RCID: Desc.operands()[Idx].RegClass);
8981 bool IsRequiredSGPR = RI.isSGPRClass(RC: OpRC);
8982 if (IsRequiredSGPR)
8983 return MO.getReg();
8984
8985 // If this could be a VGPR or an SGPR, check the dynamic register class.
8986 Register Reg = MO.getReg();
8987 const TargetRegisterClass *RegRC = MRI.getRegClass(Reg);
8988 if (RI.isSGPRClass(RC: RegRC))
8989 UsedSGPRs[i] = Reg;
8990 }
8991
8992 // We don't have a required SGPR operand, so we have a bit more freedom in
8993 // selecting operands to move.
8994
8995 // Try to select the most used SGPR. If an SGPR is equal to one of the
8996 // others, we choose that.
8997 //
8998 // e.g.
8999 // V_FMA_F32 v0, s0, s0, s0 -> No moves
9000 // V_FMA_F32 v0, s0, s1, s0 -> Move s1
9001
9002 // TODO: If some of the operands are 64-bit SGPRs and some 32, we should
9003 // prefer those.
9004
9005 if (UsedSGPRs[0]) {
9006 if (UsedSGPRs[0] == UsedSGPRs[1] || UsedSGPRs[0] == UsedSGPRs[2])
9007 SGPRReg = UsedSGPRs[0];
9008 }
9009
9010 if (!SGPRReg && UsedSGPRs[1]) {
9011 if (UsedSGPRs[1] == UsedSGPRs[2])
9012 SGPRReg = UsedSGPRs[1];
9013 }
9014
9015 return SGPRReg;
9016}
9017
9018MachineOperand *SIInstrInfo::getNamedOperand(MachineInstr &MI,
9019 AMDGPU::OpName OperandName) const {
9020 int Idx = AMDGPU::getNamedOperandIdx(Opcode: MI.getOpcode(), Name: OperandName);
9021 if (Idx == -1)
9022 return nullptr;
9023
9024 return &MI.getOperand(i: Idx);
9025}
9026
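// Default format and control bits for the upper two dwords of a buffer
// resource descriptor; the exact fields encoded differ per generation.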
9027uint64_t SIInstrInfo::getDefaultRsrcDataFormat() const {
9028 if (ST.getGeneration() >= AMDGPUSubtarget::GFX10) {
9029 int64_t Format = ST.getGeneration() >= AMDGPUSubtarget::GFX11
9030 ? (int64_t)AMDGPU::UfmtGFX11::UFMT_32_FLOAT
9031 : (int64_t)AMDGPU::UfmtGFX10::UFMT_32_FLOAT;
9032 return (Format << 44) |
9033 (1ULL << 56) | // RESOURCE_LEVEL = 1
9034 (3ULL << 60); // OOB_SELECT = 3
9035 }
9036
9037 uint64_t RsrcDataFormat = AMDGPU::RSRC_DATA_FORMAT;
9038 if (ST.isAmdHsaOS()) {
9039 // Set ATC = 1. GFX9 doesn't have this bit.
9040 if (ST.getGeneration() <= AMDGPUSubtarget::VOLCANIC_ISLANDS)
9041 RsrcDataFormat |= (1ULL << 56);
9042
9043 // Set MTYPE = 2 (MTYPE_UC = uncached). GFX9 doesn't have this field.
9044 // Note that this disables the TC L2 cache and therefore decreases performance.
9045 if (ST.getGeneration() == AMDGPUSubtarget::VOLCANIC_ISLANDS)
9046 RsrcDataFormat |= (2ULL << 59);
9047 }
9048
9049 return RsrcDataFormat;
9050}
9051
9052uint64_t SIInstrInfo::getScratchRsrcWords23() const {
9053 uint64_t Rsrc23 = getDefaultRsrcDataFormat() |
9054 AMDGPU::RSRC_TID_ENABLE |
9055 0xffffffff; // Size;
9056
9057 // GFX9 doesn't have ELEMENT_SIZE.
9058 if (ST.getGeneration() <= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
9059 uint64_t EltSizeValue = Log2_32(Value: ST.getMaxPrivateElementSize(ForBufferRSrc: true)) - 1;
9060 Rsrc23 |= EltSizeValue << AMDGPU::RSRC_ELEMENT_SIZE_SHIFT;
9061 }
9062
9063 // IndexStride encoding: 3 selects a 64-lane stride (wave64), 2 selects 32 (wave32).
9064 uint64_t IndexStride = ST.isWave64() ? 3 : 2;
9065 Rsrc23 |= IndexStride << AMDGPU::RSRC_INDEX_STRIDE_SHIFT;
9066
9067 // If TID_ENABLE is set, DATA_FORMAT specifies stride bits [14:17].
9068 // Clear them unless we want a huge stride.
9069 if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS &&
9070 ST.getGeneration() <= AMDGPUSubtarget::GFX9)
9071 Rsrc23 &= ~AMDGPU::RSRC_DATA_FORMAT;
9072
9073 return Rsrc23;
9074}
9075
9076bool SIInstrInfo::isLowLatencyInstruction(const MachineInstr &MI) const {
9077 unsigned Opc = MI.getOpcode();
9078
9079 return isSMRD(Opcode: Opc);
9080}
9081
9082bool SIInstrInfo::isHighLatencyDef(int Opc) const {
9083 return get(Opcode: Opc).mayLoad() &&
9084 (isMUBUF(Opcode: Opc) || isMTBUF(Opcode: Opc) || isMIMG(Opcode: Opc) || isFLAT(Opcode: Opc));
9085}
9086
9087unsigned SIInstrInfo::isStackAccess(const MachineInstr &MI,
9088 int &FrameIndex) const {
9089 const MachineOperand *Addr = getNamedOperand(MI, OperandName: AMDGPU::OpName::vaddr);
9090 if (!Addr || !Addr->isFI())
9091 return Register();
9092
9093 assert(!MI.memoperands_empty() &&
9094 (*MI.memoperands_begin())->getAddrSpace() == AMDGPUAS::PRIVATE_ADDRESS);
9095
9096 FrameIndex = Addr->getIndex();
9097 return getNamedOperand(MI, OperandName: AMDGPU::OpName::vdata)->getReg();
9098}
9099
9100unsigned SIInstrInfo::isSGPRStackAccess(const MachineInstr &MI,
9101 int &FrameIndex) const {
9102 const MachineOperand *Addr = getNamedOperand(MI, OperandName: AMDGPU::OpName::addr);
9103 assert(Addr && Addr->isFI());
9104 FrameIndex = Addr->getIndex();
9105 return getNamedOperand(MI, OperandName: AMDGPU::OpName::data)->getReg();
9106}
9107
9108Register SIInstrInfo::isLoadFromStackSlot(const MachineInstr &MI,
9109 int &FrameIndex) const {
9110 if (!MI.mayLoad())
9111 return Register();
9112
9113 if (isMUBUF(MI) || isVGPRSpill(MI))
9114 return isStackAccess(MI, FrameIndex);
9115
9116 if (isSGPRSpill(MI))
9117 return isSGPRStackAccess(MI, FrameIndex);
9118
9119 return Register();
9120}
9121
9122Register SIInstrInfo::isStoreToStackSlot(const MachineInstr &MI,
9123 int &FrameIndex) const {
9124 if (!MI.mayStore())
9125 return Register();
9126
9127 if (isMUBUF(MI) || isVGPRSpill(MI))
9128 return isStackAccess(MI, FrameIndex);
9129
9130 if (isSGPRSpill(MI))
9131 return isSGPRStackAccess(MI, FrameIndex);
9132
9133 return Register();
9134}
9135
9136unsigned SIInstrInfo::getInstBundleSize(const MachineInstr &MI) const {
9137 unsigned Size = 0;
9138 MachineBasicBlock::const_instr_iterator I = MI.getIterator();
9139 MachineBasicBlock::const_instr_iterator E = MI.getParent()->instr_end();
9140 while (++I != E && I->isInsideBundle()) {
9141 assert(!I->isBundle() && "No nested bundle!");
9142 Size += getInstSizeInBytes(MI: *I);
9143 }
9144
9145 return Size;
9146}
9147
9148unsigned SIInstrInfo::getInstSizeInBytes(const MachineInstr &MI) const {
9149 unsigned Opc = MI.getOpcode();
9150 const MCInstrDesc &Desc = getMCOpcodeFromPseudo(Opcode: Opc);
9151 unsigned DescSize = Desc.getSize();
9152
9153 // If we have a definitive size, we can use it. Otherwise we need to inspect
9154 // the operands to know the size.
9155 if (isFixedSize(MI)) {
9156 unsigned Size = DescSize;
9157
9158 // If we hit the buggy offset, an extra nop will be inserted in MC, so
9159 // estimate the worst case.
9160 if (MI.isBranch() && ST.hasOffset3fBug())
9161 Size += 4;
9162
9163 return Size;
9164 }
9165
9166 // Instructions may have a 32-bit literal encoded after them. Check
9167 // operands that could ever be literals.
9168 if (isVALU(MI) || isSALU(MI)) {
9169 if (isDPP(MI))
9170 return DescSize;
9171 bool HasLiteral = false;
9172 for (int I = 0, E = MI.getNumExplicitOperands(); I != E; ++I) {
9173 const MachineOperand &Op = MI.getOperand(i: I);
9174 const MCOperandInfo &OpInfo = Desc.operands()[I];
9175 if (!Op.isReg() && !isInlineConstant(MO: Op, OpInfo)) {
9176 HasLiteral = true;
9177 break;
9178 }
9179 }
9180 return HasLiteral ? DescSize + 4 : DescSize;
9181 }
9182
9183 // Check whether we have extra NSA words.
9184 if (isMIMG(MI)) {
9185 int VAddr0Idx = AMDGPU::getNamedOperandIdx(Opcode: Opc, Name: AMDGPU::OpName::vaddr0);
9186 if (VAddr0Idx < 0)
9187 return 8;
9188
9189 int RSrcIdx = AMDGPU::getNamedOperandIdx(Opcode: Opc, Name: AMDGPU::OpName::srsrc);
9190 return 8 + 4 * ((RSrcIdx - VAddr0Idx + 2) / 4);
9191 }
9192
9193 switch (Opc) {
9194 case TargetOpcode::BUNDLE:
9195 return getInstBundleSize(MI);
9196 case TargetOpcode::INLINEASM:
9197 case TargetOpcode::INLINEASM_BR: {
9198 const MachineFunction *MF = MI.getParent()->getParent();
9199 const char *AsmStr = MI.getOperand(i: 0).getSymbolName();
9200 return getInlineAsmLength(Str: AsmStr, MAI: *MF->getTarget().getMCAsmInfo(), STI: &ST);
9201 }
9202 default:
9203 if (MI.isMetaInstruction())
9204 return 0;
9205 return DescSize;
9206 }
9207}
9208
9209bool SIInstrInfo::mayAccessFlatAddressSpace(const MachineInstr &MI) const {
9210 if (!isFLAT(MI))
9211 return false;
9212
9213 if (MI.memoperands_empty())
9214 return true;
9215
9216 for (const MachineMemOperand *MMO : MI.memoperands()) {
9217 if (MMO->getAddrSpace() == AMDGPUAS::FLAT_ADDRESS)
9218 return true;
9219 }
9220 return false;
9221}
9222
9223ArrayRef<std::pair<int, const char *>>
9224SIInstrInfo::getSerializableTargetIndices() const {
9225 static const std::pair<int, const char *> TargetIndices[] = {
9226 {AMDGPU::TI_CONSTDATA_START, "amdgpu-constdata-start"},
9227 {AMDGPU::TI_SCRATCH_RSRC_DWORD0, "amdgpu-scratch-rsrc-dword0"},
9228 {AMDGPU::TI_SCRATCH_RSRC_DWORD1, "amdgpu-scratch-rsrc-dword1"},
9229 {AMDGPU::TI_SCRATCH_RSRC_DWORD2, "amdgpu-scratch-rsrc-dword2"},
9230 {AMDGPU::TI_SCRATCH_RSRC_DWORD3, "amdgpu-scratch-rsrc-dword3"}};
9231 return ArrayRef(TargetIndices);
9232}
9233
9234/// This is used by the post-RA scheduler (PostRASchedulerList.cpp). The
9235/// post-RA version of misched uses CreateTargetMIHazardRecognizer.
9236ScheduleHazardRecognizer *
9237SIInstrInfo::CreateTargetPostRAHazardRecognizer(const InstrItineraryData *II,
9238 const ScheduleDAG *DAG) const {
9239 return new GCNHazardRecognizer(DAG->MF);
9240}
9241
9242/// This is the hazard recognizer used at -O0 by the PostRAHazardRecognizer
9243/// pass.
9244ScheduleHazardRecognizer *
9245SIInstrInfo::CreateTargetPostRAHazardRecognizer(const MachineFunction &MF) const {
9246 return new GCNHazardRecognizer(MF);
9247}
9248
9249// Called during:
9250// - pre-RA scheduling and post-RA scheduling
9251ScheduleHazardRecognizer *
9252SIInstrInfo::CreateTargetMIHazardRecognizer(const InstrItineraryData *II,
9253 const ScheduleDAGMI *DAG) const {
9254 // Borrowed from the ARM target.
9255 // We would like to restrict this hazard recognizer to only
9256 // post-RA scheduling; we can tell that we're post-RA because we don't
9257 // track VRegLiveness.
9258 if (!DAG->hasVRegLiveness())
9259 return new GCNHazardRecognizer(DAG->MF);
9260 return TargetInstrInfo::CreateTargetMIHazardRecognizer(II, DAG);
9261}
9262
9263std::pair<unsigned, unsigned>
9264SIInstrInfo::decomposeMachineOperandsTargetFlags(unsigned TF) const {
9265 return std::pair(TF & MO_MASK, TF & ~MO_MASK);
9266}
9267
9268ArrayRef<std::pair<unsigned, const char *>>
9269SIInstrInfo::getSerializableDirectMachineOperandTargetFlags() const {
9270 static const std::pair<unsigned, const char *> TargetFlags[] = {
9271 { MO_GOTPCREL, "amdgpu-gotprel" },
9272 { MO_GOTPCREL32_LO, "amdgpu-gotprel32-lo" },
9273 { MO_GOTPCREL32_HI, "amdgpu-gotprel32-hi" },
9274 { MO_REL32_LO, "amdgpu-rel32-lo" },
9275 { MO_REL32_HI, "amdgpu-rel32-hi" },
9276 { MO_ABS32_LO, "amdgpu-abs32-lo" },
9277 { MO_ABS32_HI, "amdgpu-abs32-hi" },
9278 };
9279
9280 return ArrayRef(TargetFlags);
9281}
9282
9283ArrayRef<std::pair<MachineMemOperand::Flags, const char *>>
9284SIInstrInfo::getSerializableMachineMemOperandTargetFlags() const {
9285 static const std::pair<MachineMemOperand::Flags, const char *> TargetFlags[] =
9286 {
9287 {MONoClobber, "amdgpu-noclobber"},
9288 {MOLastUse, "amdgpu-last-use"},
9289 };
9290
9291 return ArrayRef(TargetFlags);
9292}
9293
9294unsigned SIInstrInfo::getLiveRangeSplitOpcode(Register SrcReg,
9295 const MachineFunction &MF) const {
9296 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
9297 assert(SrcReg.isVirtual());
9298 if (MFI->checkFlag(Reg: SrcReg, Flag: AMDGPU::VirtRegFlag::WWM_REG))
9299 return AMDGPU::WWM_COPY;
9300
9301 return AMDGPU::COPY;
9302}
9303
9304bool SIInstrInfo::isBasicBlockPrologue(const MachineInstr &MI,
9305 Register Reg) const {
9306 // We need to handle instructions which may be inserted during register
9307 // allocation to handle the prolog. The initial prolog instruction may have
9308 // been separated from the start of the block by spills and copies inserted
9309 // for the prolog. However, the insertions for scalar registers can
9310 // always be placed at the BB top as they are independent of the exec mask
9311 // value.
9312 const MachineFunction *MF = MI.getParent()->getParent();
9313 bool IsNullOrVectorRegister = true;
9314 if (Reg) {
9315 const MachineRegisterInfo &MRI = MF->getRegInfo();
9316 IsNullOrVectorRegister = !RI.isSGPRClass(RC: RI.getRegClassForReg(MRI, Reg));
9317 }
9318
9319 uint16_t Opcode = MI.getOpcode();
9320 const SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
9321 return IsNullOrVectorRegister &&
9322 (isSGPRSpill(Opcode) || isWWMRegSpillOpcode(Opcode) ||
9323 (Opcode == AMDGPU::IMPLICIT_DEF &&
9324 MFI->isWWMReg(Reg: MI.getOperand(i: 0).getReg())) ||
9325 (!MI.isTerminator() && Opcode != AMDGPU::COPY &&
9326 MI.modifiesRegister(Reg: AMDGPU::EXEC, TRI: &RI)));
9327}
9328
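// Build a 32-bit VGPR add with no usable carry-out: V_ADD_U32 on subtargets
// with carry-less adds, otherwise V_ADD_CO_U32 writing a dead carry into a
// fresh virtual bool register hinted towards VCC.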
9329MachineInstrBuilder
9330SIInstrInfo::getAddNoCarry(MachineBasicBlock &MBB,
9331 MachineBasicBlock::iterator I,
9332 const DebugLoc &DL,
9333 Register DestReg) const {
9334 if (ST.hasAddNoCarry())
9335 return BuildMI(BB&: MBB, I, MIMD: DL, MCID: get(Opcode: AMDGPU::V_ADD_U32_e64), DestReg);
9336
9337 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
9338 Register UnusedCarry = MRI.createVirtualRegister(RegClass: RI.getBoolRC());
9339 MRI.setRegAllocationHint(VReg: UnusedCarry, Type: 0, PrefReg: RI.getVCC());
9340
9341 return BuildMI(BB&: MBB, I, MIMD: DL, MCID: get(Opcode: AMDGPU::V_ADD_CO_U32_e64), DestReg)
9342 .addReg(RegNo: UnusedCarry, flags: RegState::Define | RegState::Dead);
9343}
9344
9345MachineInstrBuilder SIInstrInfo::getAddNoCarry(MachineBasicBlock &MBB,
9346 MachineBasicBlock::iterator I,
9347 const DebugLoc &DL,
9348 Register DestReg,
9349 RegScavenger &RS) const {
9350 if (ST.hasAddNoCarry())
9351 return BuildMI(BB&: MBB, I, MIMD: DL, MCID: get(Opcode: AMDGPU::V_ADD_U32_e32), DestReg);
9352
9353 // If available, prefer to use vcc.
9354 Register UnusedCarry = !RS.isRegUsed(Reg: AMDGPU::VCC)
9355 ? Register(RI.getVCC())
9356 : RS.scavengeRegisterBackwards(
9357 RC: *RI.getBoolRC(), To: I, /* RestoreAfter */ false,
9358 SPAdj: 0, /* AllowSpill */ false);
9359
9360 // TODO: Users need to deal with this.
9361 if (!UnusedCarry.isValid())
9362 return MachineInstrBuilder();
9363
9364 return BuildMI(BB&: MBB, I, MIMD: DL, MCID: get(Opcode: AMDGPU::V_ADD_CO_U32_e64), DestReg)
9365 .addReg(RegNo: UnusedCarry, flags: RegState::Define | RegState::Dead);
9366}
9367
9368bool SIInstrInfo::isKillTerminator(unsigned Opcode) {
9369 switch (Opcode) {
9370 case AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR:
9371 case AMDGPU::SI_KILL_I1_TERMINATOR:
9372 return true;
9373 default:
9374 return false;
9375 }
9376}
9377
9378const MCInstrDesc &SIInstrInfo::getKillTerminatorFromPseudo(unsigned Opcode) const {
9379 switch (Opcode) {
9380 case AMDGPU::SI_KILL_F32_COND_IMM_PSEUDO:
9381 return get(Opcode: AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR);
9382 case AMDGPU::SI_KILL_I1_PSEUDO:
9383 return get(Opcode: AMDGPU::SI_KILL_I1_TERMINATOR);
9384 default:
9385 llvm_unreachable("invalid opcode, expected SI_KILL_*_PSEUDO");
9386 }
9387}
9388
9389bool SIInstrInfo::isLegalMUBUFImmOffset(unsigned Imm) const {
9390 return Imm <= getMaxMUBUFImmOffset(ST);
9391}
9392
9393unsigned SIInstrInfo::getMaxMUBUFImmOffset(const GCNSubtarget &ST) {
9394 // The GFX12 field is a 24-bit signed byte offset, but only non-negative values are legal here, so 23 bits are usable; older targets have a 12-bit unsigned field.
9395 const unsigned OffsetBits =
9396 ST.getGeneration() >= AMDGPUSubtarget::GFX12 ? 23 : 12;
9397 return (1 << OffsetBits) - 1;
9398}
9399
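// On wave32 subtargets, rewrite implicit operands referring to the 64-bit VCC
// register to VCC_LO. Inline asm is left untouched.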
9400void SIInstrInfo::fixImplicitOperands(MachineInstr &MI) const {
9401 if (!ST.isWave32())
9402 return;
9403
9404 if (MI.isInlineAsm())
9405 return;
9406
9407 for (auto &Op : MI.implicit_operands()) {
9408 if (Op.isReg() && Op.getReg() == AMDGPU::VCC)
9409 Op.setReg(AMDGPU::VCC_LO);
9410 }
9411}
9412
9413bool SIInstrInfo::isBufferSMRD(const MachineInstr &MI) const {
9414 if (!isSMRD(MI))
9415 return false;
9416
9417 // Check that it is using a buffer resource.
9418 int Idx = AMDGPU::getNamedOperandIdx(Opcode: MI.getOpcode(), Name: AMDGPU::OpName::sbase);
9419 if (Idx == -1) // e.g. s_memtime
9420 return false;
9421
9422 const auto RCID = MI.getDesc().operands()[Idx].RegClass;
9423 return RI.getRegClass(RCID)->hasSubClassEq(RC: &AMDGPU::SGPR_128RegClass);
9424}
9425
9426// Given Imm, split it into the values to put into the SOffset and ImmOffset
9427// fields in an MUBUF instruction. Return false if it is not possible (due to a
9428// hardware bug needing a workaround).
9429//
9430// The required alignment ensures that individual address components remain
9431// aligned if they are aligned to begin with. It also ensures that additional
9432// offsets within the given alignment can be added to the resulting ImmOffset.
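//
// A worked example (illustrative), assuming a maximum immediate of 4095 and
// Alignment = 4, i.e. MaxImm = 4092:
//   Imm = 4100   -> ImmOffset = 4092, SOffset = 8 (an inline constant).
//   Imm = 100000 -> ImmOffset = 1700, SOffset = 98300; the SOffset has all
//                   low bits set apart from the alignment bits, and
//                   98300 + 1700 == 100000.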
9433bool SIInstrInfo::splitMUBUFOffset(uint32_t Imm, uint32_t &SOffset,
9434 uint32_t &ImmOffset, Align Alignment) const {
9435 const uint32_t MaxOffset = SIInstrInfo::getMaxMUBUFImmOffset(ST);
9436 const uint32_t MaxImm = alignDown(Value: MaxOffset, Align: Alignment.value());
9437 uint32_t Overflow = 0;
9438
9439 if (Imm > MaxImm) {
9440 if (Imm <= MaxImm + 64) {
9441 // Use an SOffset inline constant for 4..64
9442 Overflow = Imm - MaxImm;
9443 Imm = MaxImm;
9444 } else {
9445 // Try to keep the same value in SOffset for adjacent loads, so that
9446 // the corresponding register contents can be re-used.
9447 //
9448 // Load values with all low-bits (except for alignment bits) set into
9449 // SOffset, so that a larger range of values can be covered using
9450 // s_movk_i32.
9451 //
9452 // Atomic operations fail to work correctly when individual address
9453 // components are unaligned, even if their sum is aligned.
9454 uint32_t High = (Imm + Alignment.value()) & ~MaxOffset;
9455 uint32_t Low = (Imm + Alignment.value()) & MaxOffset;
9456 Imm = Low;
9457 Overflow = High - Alignment.value();
9458 }
9459 }
9460
9461 if (Overflow > 0) {
9462 // There is a hardware bug in SI and CI which prevents address clamping in
9463 // MUBUF instructions from working correctly with SOffsets. The immediate
9464 // offset is unaffected.
9465 if (ST.getGeneration() <= AMDGPUSubtarget::SEA_ISLANDS)
9466 return false;
9467
9468 // It is not possible to set immediate in SOffset field on some targets.
9469 if (ST.hasRestrictedSOffset())
9470 return false;
9471 }
9472
9473 ImmOffset = Imm;
9474 SOffset = Overflow;
9475 return true;
9476}
9477
9478// Depending on the used address space and instructions, some immediate offsets
9479// are allowed and some are not.
9480// Pre-GFX12, flat instruction offsets can only be non-negative, global and
9481// scratch instruction offsets can also be negative. On GFX12, offsets can be
9482// negative for all variants.
9483//
9484// There are several bugs related to these offsets:
9485// On gfx10.1, flat instructions that go into the global address space cannot
9486// use an offset.
9487//
9488// For scratch instructions, the address can be either an SGPR or a VGPR.
9489// The following offsets can be used, depending on the architecture (x means
9490// cannot be used):
9491// +----------------------------+------+------+
9492// | Address-Mode | SGPR | VGPR |
9493// +----------------------------+------+------+
9494// | gfx9 | | |
9495// | negative, 4-aligned offset | x | ok |
9496// | negative, unaligned offset | x | ok |
9497// +----------------------------+------+------+
9498// | gfx10 | | |
9499// | negative, 4-aligned offset | ok | ok |
9500// | negative, unaligned offset | ok | x |
9501// +----------------------------+------+------+
9502// | gfx10.3 | | |
9503// | negative, 4-aligned offset | ok | ok |
9504// | negative, unaligned offset | ok | ok |
9505// +----------------------------+------+------+
9506//
9507// This function ignores the addressing mode, so if an offset cannot be used in
9508// one addressing mode, it is considered illegal.
9509bool SIInstrInfo::isLegalFLATOffset(int64_t Offset, unsigned AddrSpace,
9510 uint64_t FlatVariant) const {
9511 // TODO: Should 0 be special cased?
9512 if (!ST.hasFlatInstOffsets())
9513 return false;
9514
9515 if (ST.hasFlatSegmentOffsetBug() && FlatVariant == SIInstrFlags::FLAT &&
9516 (AddrSpace == AMDGPUAS::FLAT_ADDRESS ||
9517 AddrSpace == AMDGPUAS::GLOBAL_ADDRESS))
9518 return false;
9519
9520 if (ST.hasNegativeUnalignedScratchOffsetBug() &&
9521 FlatVariant == SIInstrFlags::FlatScratch && Offset < 0 &&
9522 (Offset % 4) != 0) {
9523 return false;
9524 }
9525
9526 bool AllowNegative = allowNegativeFlatOffset(FlatVariant);
9527 unsigned N = AMDGPU::getNumFlatOffsetBits(ST);
9528 return isIntN(N, x: Offset) && (AllowNegative || Offset >= 0);
9529}
9530
9531// See comment on SIInstrInfo::isLegalFLATOffset for what is legal and what not.
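//
// A worked example (illustrative), assuming a 13-bit signed offset field (so
// NumBits = 12 below): for COffsetVal = -5000, the signed division truncates
// towards zero, giving RemainderOffset = -4096 and ImmField = -904; the
// immediate fits in the field and the remainder is materialized separately.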
9532std::pair<int64_t, int64_t>
9533SIInstrInfo::splitFlatOffset(int64_t COffsetVal, unsigned AddrSpace,
9534 uint64_t FlatVariant) const {
9535 int64_t RemainderOffset = COffsetVal;
9536 int64_t ImmField = 0;
9537
9538 bool AllowNegative = allowNegativeFlatOffset(FlatVariant);
9539 const unsigned NumBits = AMDGPU::getNumFlatOffsetBits(ST) - 1;
9540
9541 if (AllowNegative) {
9542 // Use signed division by a power of two to truncate towards 0.
9543 int64_t D = 1LL << NumBits;
9544 RemainderOffset = (COffsetVal / D) * D;
9545 ImmField = COffsetVal - RemainderOffset;
9546
9547 if (ST.hasNegativeUnalignedScratchOffsetBug() &&
9548 FlatVariant == SIInstrFlags::FlatScratch && ImmField < 0 &&
9549 (ImmField % 4) != 0) {
9550 // Make ImmField a multiple of 4
9551 RemainderOffset += ImmField % 4;
9552 ImmField -= ImmField % 4;
9553 }
9554 } else if (COffsetVal >= 0) {
9555 ImmField = COffsetVal & maskTrailingOnes<uint64_t>(N: NumBits);
9556 RemainderOffset = COffsetVal - ImmField;
9557 }
9558
9559 assert(isLegalFLATOffset(ImmField, AddrSpace, FlatVariant));
9560 assert(RemainderOffset + ImmField == COffsetVal);
9561 return {ImmField, RemainderOffset};
9562}
9563
9564bool SIInstrInfo::allowNegativeFlatOffset(uint64_t FlatVariant) const {
9565 if (ST.hasNegativeScratchOffsetBug() &&
9566 FlatVariant == SIInstrFlags::FlatScratch)
9567 return false;
9568
9569 return FlatVariant != SIInstrFlags::FLAT || AMDGPU::isGFX12Plus(STI: ST);
9570}
9571
9572static unsigned subtargetEncodingFamily(const GCNSubtarget &ST) {
9573 switch (ST.getGeneration()) {
9574 default:
9575 break;
9576 case AMDGPUSubtarget::SOUTHERN_ISLANDS:
9577 case AMDGPUSubtarget::SEA_ISLANDS:
9578 return SIEncodingFamily::SI;
9579 case AMDGPUSubtarget::VOLCANIC_ISLANDS:
9580 case AMDGPUSubtarget::GFX9:
9581 return SIEncodingFamily::VI;
9582 case AMDGPUSubtarget::GFX10:
9583 return SIEncodingFamily::GFX10;
9584 case AMDGPUSubtarget::GFX11:
9585 return SIEncodingFamily::GFX11;
9586 case AMDGPUSubtarget::GFX12:
9587 return ST.hasGFX1250Insts() ? SIEncodingFamily::GFX1250
9588 : SIEncodingFamily::GFX12;
9589 }
9590 llvm_unreachable("Unknown subtarget generation!");
9591}
9592
9593bool SIInstrInfo::isAsmOnlyOpcode(int MCOp) const {
9594 switch(MCOp) {
9595 // These opcodes use indirect register addressing so
9596 // they need special handling by codegen (currently missing).
9597 // Therefore it is too risky to allow these opcodes
9598 // to be selected by the DPP combiner or the SDWA peephole pass.
9599 case AMDGPU::V_MOVRELS_B32_dpp_gfx10:
9600 case AMDGPU::V_MOVRELS_B32_sdwa_gfx10:
9601 case AMDGPU::V_MOVRELD_B32_dpp_gfx10:
9602 case AMDGPU::V_MOVRELD_B32_sdwa_gfx10:
9603 case AMDGPU::V_MOVRELSD_B32_dpp_gfx10:
9604 case AMDGPU::V_MOVRELSD_B32_sdwa_gfx10:
9605 case AMDGPU::V_MOVRELSD_2_B32_dpp_gfx10:
9606 case AMDGPU::V_MOVRELSD_2_B32_sdwa_gfx10:
9607 return true;
9608 default:
9609 return false;
9610 }
9611}
9612
9613#define GENERATE_RENAMED_GFX9_CASES(OPCODE) \
9614 case OPCODE##_dpp: \
9615 case OPCODE##_e32: \
9616 case OPCODE##_e64: \
9617 case OPCODE##_e64_dpp: \
9618 case OPCODE##_sdwa:
9619
9620static bool isRenamedInGFX9(int Opcode) {
9621 switch (Opcode) {
9622 GENERATE_RENAMED_GFX9_CASES(AMDGPU::V_ADDC_U32)
9623 GENERATE_RENAMED_GFX9_CASES(AMDGPU::V_ADD_CO_U32)
9624 GENERATE_RENAMED_GFX9_CASES(AMDGPU::V_ADD_U32)
9625 GENERATE_RENAMED_GFX9_CASES(AMDGPU::V_SUBBREV_U32)
9626 GENERATE_RENAMED_GFX9_CASES(AMDGPU::V_SUBB_U32)
9627 GENERATE_RENAMED_GFX9_CASES(AMDGPU::V_SUBREV_CO_U32)
9628 GENERATE_RENAMED_GFX9_CASES(AMDGPU::V_SUBREV_U32)
9629 GENERATE_RENAMED_GFX9_CASES(AMDGPU::V_SUB_CO_U32)
9630 GENERATE_RENAMED_GFX9_CASES(AMDGPU::V_SUB_U32)
9631 //
9632 case AMDGPU::V_DIV_FIXUP_F16_gfx9_e64:
9633 case AMDGPU::V_DIV_FIXUP_F16_gfx9_fake16_e64:
9634 case AMDGPU::V_FMA_F16_gfx9_e64:
9635 case AMDGPU::V_FMA_F16_gfx9_fake16_e64:
9636 case AMDGPU::V_INTERP_P2_F16:
9637 case AMDGPU::V_MAD_F16_e64:
9638 case AMDGPU::V_MAD_U16_e64:
9639 case AMDGPU::V_MAD_I16_e64:
9640 return true;
9641 default:
9642 return false;
9643 }
9644}
9645
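// Map a pseudo opcode to its real MC opcode for the current subtarget's
// encoding family. Returns the input opcode if it is already a native
// instruction, or -1 if the pseudo has no encoding on this subtarget.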
9646int SIInstrInfo::pseudoToMCOpcode(int Opcode) const {
9647 Opcode = SIInstrInfo::getNonSoftWaitcntOpcode(Opcode);
9648
9649 unsigned Gen = subtargetEncodingFamily(ST);
9650
9651 if (ST.getGeneration() == AMDGPUSubtarget::GFX9 && isRenamedInGFX9(Opcode))
9652 Gen = SIEncodingFamily::GFX9;
9653
9654 // Adjust the encoding family to GFX80 for D16 buffer instructions when the
9655 // subtarget has UnpackedD16VMem feature.
9656 // TODO: remove this when we discard GFX80 encoding.
9657 if (ST.hasUnpackedD16VMem() && (get(Opcode).TSFlags & SIInstrFlags::D16Buf))
9658 Gen = SIEncodingFamily::GFX80;
9659
9660 if (get(Opcode).TSFlags & SIInstrFlags::SDWA) {
9661 switch (ST.getGeneration()) {
9662 default:
9663 Gen = SIEncodingFamily::SDWA;
9664 break;
9665 case AMDGPUSubtarget::GFX9:
9666 Gen = SIEncodingFamily::SDWA9;
9667 break;
9668 case AMDGPUSubtarget::GFX10:
9669 Gen = SIEncodingFamily::SDWA10;
9670 break;
9671 }
9672 }
9673
9674 if (isMAI(Opcode)) {
9675 int MFMAOp = AMDGPU::getMFMAEarlyClobberOp(Opcode);
9676 if (MFMAOp != -1)
9677 Opcode = MFMAOp;
9678 }
9679
9680 int MCOp = AMDGPU::getMCOpcode(Opcode, Gen);
9681
9682 if (MCOp == (uint16_t)-1 && ST.hasGFX1250Insts())
9683 MCOp = AMDGPU::getMCOpcode(Opcode, Gen: SIEncodingFamily::GFX12);
9684
9685 // -1 means that Opcode is already a native instruction.
9686 if (MCOp == -1)
9687 return Opcode;
9688
9689 if (ST.hasGFX90AInsts()) {
9690 uint16_t NMCOp = (uint16_t)-1;
9691 if (ST.hasGFX940Insts())
9692 NMCOp = AMDGPU::getMCOpcode(Opcode, Gen: SIEncodingFamily::GFX940);
9693 if (NMCOp == (uint16_t)-1)
9694 NMCOp = AMDGPU::getMCOpcode(Opcode, Gen: SIEncodingFamily::GFX90A);
9695 if (NMCOp == (uint16_t)-1)
9696 NMCOp = AMDGPU::getMCOpcode(Opcode, Gen: SIEncodingFamily::GFX9);
9697 if (NMCOp != (uint16_t)-1)
9698 MCOp = NMCOp;
9699 }
9700
9701 // (uint16_t)-1 means that Opcode is a pseudo instruction that has
9702 // no encoding in the given subtarget generation.
9703 if (MCOp == (uint16_t)-1)
9704 return -1;
9705
9706 if (isAsmOnlyOpcode(MCOp))
9707 return -1;
9708
9709 return MCOp;
9710}
9711
9712static
9713TargetInstrInfo::RegSubRegPair getRegOrUndef(const MachineOperand &RegOpnd) {
9714 assert(RegOpnd.isReg());
9715 return RegOpnd.isUndef() ? TargetInstrInfo::RegSubRegPair() :
9716 getRegSubRegPair(O: RegOpnd);
9717}
9718
9719TargetInstrInfo::RegSubRegPair
9720llvm::getRegSequenceSubReg(MachineInstr &MI, unsigned SubReg) {
9721 assert(MI.isRegSequence());
9722 for (unsigned I = 0, E = (MI.getNumOperands() - 1)/ 2; I < E; ++I)
9723 if (MI.getOperand(i: 1 + 2 * I + 1).getImm() == SubReg) {
9724 auto &RegOp = MI.getOperand(i: 1 + 2 * I);
9725 return getRegOrUndef(RegOpnd: RegOp);
9726 }
9727 return TargetInstrInfo::RegSubRegPair();
9728}
9729
9730// Try to find the definition of reg:subreg in subreg-manipulation pseudos
9731// Following a subreg of reg:subreg isn't supported
9732static bool followSubRegDef(MachineInstr &MI,
9733 TargetInstrInfo::RegSubRegPair &RSR) {
9734 if (!RSR.SubReg)
9735 return false;
9736 switch (MI.getOpcode()) {
9737 default: break;
9738 case AMDGPU::REG_SEQUENCE:
9739 RSR = getRegSequenceSubReg(MI, SubReg: RSR.SubReg);
9740 return true;
9741 // EXTRACT_SUBREG isn't supported as this would follow a subreg of a subreg
9742 case AMDGPU::INSERT_SUBREG:
9743 if (RSR.SubReg == (unsigned)MI.getOperand(i: 3).getImm())
9744 // inserted the subreg we're looking for
9745 RSR = getRegOrUndef(RegOpnd: MI.getOperand(i: 2));
9746 else { // the subreg in the rest of the reg
9747 auto R1 = getRegOrUndef(RegOpnd: MI.getOperand(i: 1));
9748 if (R1.SubReg) // subreg of subreg isn't supported
9749 return false;
9750 RSR.Reg = R1.Reg;
9751 }
9752 return true;
9753 }
9754 return false;
9755}
9756
9757MachineInstr *llvm::getVRegSubRegDef(const TargetInstrInfo::RegSubRegPair &P,
9758 MachineRegisterInfo &MRI) {
9759 assert(MRI.isSSA());
9760 if (!P.Reg.isVirtual())
9761 return nullptr;
9762
9763 auto RSR = P;
9764 auto *DefInst = MRI.getVRegDef(Reg: RSR.Reg);
9765 while (auto *MI = DefInst) {
9766 DefInst = nullptr;
9767 switch (MI->getOpcode()) {
9768 case AMDGPU::COPY:
9769 case AMDGPU::V_MOV_B32_e32: {
9770 auto &Op1 = MI->getOperand(i: 1);
9771 if (Op1.isReg() && Op1.getReg().isVirtual()) {
9772 if (Op1.isUndef())
9773 return nullptr;
9774 RSR = getRegSubRegPair(O: Op1);
9775 DefInst = MRI.getVRegDef(Reg: RSR.Reg);
9776 }
9777 break;
9778 }
9779 default:
9780 if (followSubRegDef(MI&: *MI, RSR)) {
9781 if (!RSR.Reg)
9782 return nullptr;
9783 DefInst = MRI.getVRegDef(Reg: RSR.Reg);
9784 }
9785 }
9786 if (!DefInst)
9787 return MI;
9788 }
9789 return nullptr;
9790}
9791
9792bool llvm::execMayBeModifiedBeforeUse(const MachineRegisterInfo &MRI,
9793 Register VReg,
9794 const MachineInstr &DefMI,
9795 const MachineInstr &UseMI) {
9796 assert(MRI.isSSA() && "Must be run on SSA");
9797
9798 auto *TRI = MRI.getTargetRegisterInfo();
9799 auto *DefBB = DefMI.getParent();
9800
9801 // Don't bother searching between blocks, although it is possible this block
9802 // doesn't modify exec.
9803 if (UseMI.getParent() != DefBB)
9804 return true;
9805
9806 const int MaxInstScan = 20;
9807 int NumInst = 0;
9808
9809 // Stop scan at the use.
9810 auto E = UseMI.getIterator();
9811 for (auto I = std::next(x: DefMI.getIterator()); I != E; ++I) {
9812 if (I->isDebugInstr())
9813 continue;
9814
9815 if (++NumInst > MaxInstScan)
9816 return true;
9817
9818 if (I->modifiesRegister(Reg: AMDGPU::EXEC, TRI))
9819 return true;
9820 }
9821
9822 return false;
9823}
9824
9825bool llvm::execMayBeModifiedBeforeAnyUse(const MachineRegisterInfo &MRI,
9826 Register VReg,
9827 const MachineInstr &DefMI) {
9828 assert(MRI.isSSA() && "Must be run on SSA");
9829
9830 auto *TRI = MRI.getTargetRegisterInfo();
9831 auto *DefBB = DefMI.getParent();
9832
9833 const int MaxUseScan = 10;
9834 int NumUse = 0;
9835
9836 for (auto &Use : MRI.use_nodbg_operands(Reg: VReg)) {
9837 auto &UseInst = *Use.getParent();
9838 // Don't bother searching between blocks, although it is possible this block
9839 // doesn't modify exec.
9840 if (UseInst.getParent() != DefBB || UseInst.isPHI())
9841 return true;
9842
9843 if (++NumUse > MaxUseScan)
9844 return true;
9845 }
9846
9847 if (NumUse == 0)
9848 return false;
9849
9850 const int MaxInstScan = 20;
9851 int NumInst = 0;
9852
9853 // Stop scan when we have seen all the uses.
9854 for (auto I = std::next(x: DefMI.getIterator()); ; ++I) {
9855 assert(I != DefBB->end());
9856
9857 if (I->isDebugInstr())
9858 continue;
9859
9860 if (++NumInst > MaxInstScan)
9861 return true;
9862
9863 for (const MachineOperand &Op : I->operands()) {
9864 // We don't check reg masks here as they're used only on calls:
9865 // 1. EXEC is only considered const within one BB
9866 // 2. Call should be a terminator instruction if present in a BB
9867
9868 if (!Op.isReg())
9869 continue;
9870
9871 Register Reg = Op.getReg();
9872 if (Op.isUse()) {
9873 if (Reg == VReg && --NumUse == 0)
9874 return false;
9875 } else if (TRI->regsOverlap(RegA: Reg, RegB: AMDGPU::EXEC))
9876 return true;
9877 }
9878 }
9879}
9880
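// Place the PHI destination copy immediately before the first non-PHI
// instruction that reads \p Dst, if one exists before \p LastPHIIt; otherwise
// fall back to the generic placement at the last PHI.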
9881MachineInstr *SIInstrInfo::createPHIDestinationCopy(
9882 MachineBasicBlock &MBB, MachineBasicBlock::iterator LastPHIIt,
9883 const DebugLoc &DL, Register Src, Register Dst) const {
9884 auto Cur = MBB.begin();
9885 if (Cur != MBB.end())
9886 do {
9887 if (!Cur->isPHI() && Cur->readsRegister(Reg: Dst, /*TRI=*/nullptr))
9888 return BuildMI(BB&: MBB, I: Cur, MIMD: DL, MCID: get(Opcode: TargetOpcode::COPY), DestReg: Dst).addReg(RegNo: Src);
9889 ++Cur;
9890 } while (Cur != MBB.end() && Cur != LastPHIIt);
9891
9892 return TargetInstrInfo::createPHIDestinationCopy(MBB, InsPt: LastPHIIt, DL, Src,
9893 Dst);
9894}
9895
9896MachineInstr *SIInstrInfo::createPHISourceCopy(
9897 MachineBasicBlock &MBB, MachineBasicBlock::iterator InsPt,
9898 const DebugLoc &DL, Register Src, unsigned SrcSubReg, Register Dst) const {
9899 if (InsPt != MBB.end() &&
9900 (InsPt->getOpcode() == AMDGPU::SI_IF ||
9901 InsPt->getOpcode() == AMDGPU::SI_ELSE ||
9902 InsPt->getOpcode() == AMDGPU::SI_IF_BREAK) &&
9903 InsPt->definesRegister(Reg: Src, /*TRI=*/nullptr)) {
9904 InsPt++;
9905 return BuildMI(BB&: MBB, I: InsPt, MIMD: DL,
9906 MCID: get(Opcode: ST.isWave32() ? AMDGPU::S_MOV_B32_term
9907 : AMDGPU::S_MOV_B64_term),
9908 DestReg: Dst)
9909 .addReg(RegNo: Src, flags: 0, SubReg: SrcSubReg)
9910 .addReg(RegNo: AMDGPU::EXEC, flags: RegState::Implicit);
9911 }
9912 return TargetInstrInfo::createPHISourceCopy(MBB, InsPt, DL, Src, SrcSubReg,
9913 Dst);
9914}
9915
9916bool llvm::SIInstrInfo::isWave32() const { return ST.isWave32(); }
9917
9918MachineInstr *SIInstrInfo::foldMemoryOperandImpl(
9919 MachineFunction &MF, MachineInstr &MI, ArrayRef<unsigned> Ops,
9920 MachineBasicBlock::iterator InsertPt, int FrameIndex, LiveIntervals *LIS,
9921 VirtRegMap *VRM) const {
9922 // This is a bit of a hack (copied from AArch64). Consider this instruction:
9923 //
9924 // %0:sreg_32 = COPY $m0
9925 //
9926 // We explicitly chose SReg_32 for the virtual register so such a copy might
9927 // be eliminated by RegisterCoalescer. However, that may not be possible, and
9928 // %0 may even spill. We can't spill $m0 normally (it would require copying to
9929 // a numbered SGPR anyway), and since it is in the SReg_32 register class,
9930 // TargetInstrInfo::foldMemoryOperand() is going to try.
9931 // A similar issue also exists with spilling and reloading $exec registers.
9932 //
9933 // To prevent that, constrain the %0 register class here.
9934 if (isFullCopyInstr(MI)) {
9935 Register DstReg = MI.getOperand(i: 0).getReg();
9936 Register SrcReg = MI.getOperand(i: 1).getReg();
9937 if ((DstReg.isVirtual() || SrcReg.isVirtual()) &&
9938 (DstReg.isVirtual() != SrcReg.isVirtual())) {
9939 MachineRegisterInfo &MRI = MF.getRegInfo();
9940 Register VirtReg = DstReg.isVirtual() ? DstReg : SrcReg;
9941 const TargetRegisterClass *RC = MRI.getRegClass(Reg: VirtReg);
9942 if (RC->hasSuperClassEq(RC: &AMDGPU::SReg_32RegClass)) {
9943 MRI.constrainRegClass(Reg: VirtReg, RC: &AMDGPU::SReg_32_XM0_XEXECRegClass);
9944 return nullptr;
9945 }
9946 if (RC->hasSuperClassEq(RC: &AMDGPU::SReg_64RegClass)) {
9947 MRI.constrainRegClass(Reg: VirtReg, RC: &AMDGPU::SReg_64_XEXECRegClass);
9948 return nullptr;
9949 }
9950 }
9951 }
9952
9953 return nullptr;
9954}
9955
9956unsigned SIInstrInfo::getInstrLatency(const InstrItineraryData *ItinData,
9957 const MachineInstr &MI,
9958 unsigned *PredCost) const {
9959 if (MI.isBundle()) {
9960 MachineBasicBlock::const_instr_iterator I(MI.getIterator());
9961 MachineBasicBlock::const_instr_iterator E(MI.getParent()->instr_end());
9962 unsigned Lat = 0, Count = 0;
9963 for (++I; I != E && I->isBundledWithPred(); ++I) {
9964 ++Count;
9965 Lat = std::max(a: Lat, b: SchedModel.computeInstrLatency(MI: &*I));
9966 }
9967 return Lat + Count - 1;
9968 }
9969
9970 return SchedModel.computeInstrLatency(MI: &MI);
9971}
9972
9973InstructionUniformity
9974SIInstrInfo::getGenericInstructionUniformity(const MachineInstr &MI) const {
9975 unsigned opcode = MI.getOpcode();
9976 if (auto *GI = dyn_cast<GIntrinsic>(Val: &MI)) {
9977 auto IID = GI->getIntrinsicID();
9978 if (AMDGPU::isIntrinsicSourceOfDivergence(IntrID: IID))
9979 return InstructionUniformity::NeverUniform;
9980 if (AMDGPU::isIntrinsicAlwaysUniform(IntrID: IID))
9981 return InstructionUniformity::AlwaysUniform;
9982
9983 switch (IID) {
9984 case Intrinsic::amdgcn_if:
9985 case Intrinsic::amdgcn_else:
9986 // FIXME: Uniform if second result
9987 break;
9988 }
9989
9990 return InstructionUniformity::Default;
9991 }
9992
9993 // Loads from the private and flat address spaces are divergent, because
9994 // threads can execute the load instruction with the same inputs and get
9995 // different results.
9996 //
9997 // All other loads are not divergent, because if threads issue loads with the
9998 // same arguments, they will always get the same result.
9999 if (opcode == AMDGPU::G_LOAD) {
10000 if (MI.memoperands_empty())
10001 return InstructionUniformity::NeverUniform; // conservative assumption
10002
10003 if (llvm::any_of(Range: MI.memoperands(), P: [](const MachineMemOperand *mmo) {
10004 return mmo->getAddrSpace() == AMDGPUAS::PRIVATE_ADDRESS ||
10005 mmo->getAddrSpace() == AMDGPUAS::FLAT_ADDRESS;
10006 })) {
10007 // At least one MMO in a non-global address space.
10008 return InstructionUniformity::NeverUniform;
10009 }
10010 return InstructionUniformity::Default;
10011 }
10012
10013 if (SIInstrInfo::isGenericAtomicRMWOpcode(Opc: opcode) ||
10014 opcode == AMDGPU::G_ATOMIC_CMPXCHG ||
10015 opcode == AMDGPU::G_ATOMIC_CMPXCHG_WITH_SUCCESS ||
10016 AMDGPU::isGenericAtomic(Opc: opcode)) {
10017 return InstructionUniformity::NeverUniform;
10018 }
10019 return InstructionUniformity::Default;
10020}
10021
10022InstructionUniformity
10023SIInstrInfo::getInstructionUniformity(const MachineInstr &MI) const {
10024
10025 if (isNeverUniform(MI))
10026 return InstructionUniformity::NeverUniform;
10027
10028 unsigned opcode = MI.getOpcode();
10029 if (opcode == AMDGPU::V_READLANE_B32 ||
10030 opcode == AMDGPU::V_READFIRSTLANE_B32 ||
10031 opcode == AMDGPU::SI_RESTORE_S32_FROM_VGPR)
10032 return InstructionUniformity::AlwaysUniform;
10033
10034 if (isCopyInstr(MI)) {
10035 const MachineOperand &srcOp = MI.getOperand(i: 1);
10036 if (srcOp.isReg() && srcOp.getReg().isPhysical()) {
10037 const TargetRegisterClass *regClass =
10038 RI.getPhysRegBaseClass(Reg: srcOp.getReg());
10039 return RI.isSGPRClass(RC: regClass) ? InstructionUniformity::AlwaysUniform
10040 : InstructionUniformity::NeverUniform;
10041 }
10042 return InstructionUniformity::Default;
10043 }
10044
10045 // GMIR handling
10046 if (MI.isPreISelOpcode())
10047 return SIInstrInfo::getGenericInstructionUniformity(MI);
10048
10049 // Atomics are divergent because they are executed sequentially: when an
10050 // atomic operation refers to the same address in each thread, then each
10051 // thread after the first sees the value written by the previous thread as
10052 // its original value.
10053
10054 if (isAtomic(MI))
10055 return InstructionUniformity::NeverUniform;
10056
10057 // Loads from the private and flat address spaces are divergent, because
10058 // threads can execute the load instruction with the same inputs and get
10059 // different results.
10060 if (isFLAT(MI) && MI.mayLoad()) {
10061 if (MI.memoperands_empty())
10062 return InstructionUniformity::NeverUniform; // conservative assumption
10063
10064 if (llvm::any_of(Range: MI.memoperands(), P: [](const MachineMemOperand *mmo) {
10065 return mmo->getAddrSpace() == AMDGPUAS::PRIVATE_ADDRESS ||
10066 mmo->getAddrSpace() == AMDGPUAS::FLAT_ADDRESS;
10067 })) {
10068 // At least one MMO in a non-global address space.
10069 return InstructionUniformity::NeverUniform;
10070 }
10071
10072 return InstructionUniformity::Default;
10073 }
10074
10075 const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
10076 const AMDGPURegisterBankInfo *RBI = ST.getRegBankInfo();
10077
10078 // FIXME: It's conceptually broken to report this for an instruction, and not
10079 // a specific def operand. For inline asm in particular, there could be mixed
10080 // uniform and divergent results.
10081 for (unsigned I = 0, E = MI.getNumOperands(); I != E; ++I) {
10082 const MachineOperand &SrcOp = MI.getOperand(i: I);
10083 if (!SrcOp.isReg())
10084 continue;
10085
10086 Register Reg = SrcOp.getReg();
10087 if (!Reg || !SrcOp.readsReg())
10088 continue;
10089
10090 // If RegBank is null, this is unassigned or an unallocatable special
10091 // register, which are all scalars.
10092 const RegisterBank *RegBank = RBI->getRegBank(Reg, MRI, TRI: RI);
10093 if (RegBank && RegBank->getID() != AMDGPU::SGPRRegBankID)
10094 return InstructionUniformity::NeverUniform;
10095 }
10096
10097 // TODO: The uniformity check conditions above can be rearranged for more
10098 // readability.
10099
10100 // TODO: amdgcn.{ballot, [if]cmp} should be AlwaysUniform, but they are
10101 // currently turned into no-op COPYs by SelectionDAG ISel and are
10102 // therefore no longer recognizable.
10103
10104 return InstructionUniformity::Default;
10105}
10106
10107unsigned SIInstrInfo::getDSShaderTypeValue(const MachineFunction &MF) {
10108 switch (MF.getFunction().getCallingConv()) {
10109 case CallingConv::AMDGPU_PS:
10110 return 1;
10111 case CallingConv::AMDGPU_VS:
10112 return 2;
10113 case CallingConv::AMDGPU_GS:
10114 return 3;
10115 case CallingConv::AMDGPU_HS:
10116 case CallingConv::AMDGPU_LS:
10117 case CallingConv::AMDGPU_ES: {
10118 const Function &F = MF.getFunction();
10119 F.getContext().diagnose(DI: DiagnosticInfoUnsupported(
10120 F, "ds_ordered_count unsupported for this calling conv"));
10121 [[fallthrough]];
10122 }
10123 case CallingConv::AMDGPU_CS:
10124 case CallingConv::AMDGPU_KERNEL:
10125 case CallingConv::C:
10126 case CallingConv::Fast:
10127 default:
10128 // Assume other calling conventions are various compute callable functions
10129 return 0;
10130 }
10131}
10132
10133bool SIInstrInfo::analyzeCompare(const MachineInstr &MI, Register &SrcReg,
10134 Register &SrcReg2, int64_t &CmpMask,
10135 int64_t &CmpValue) const {
10136 if (!MI.getOperand(i: 0).isReg() || MI.getOperand(i: 0).getSubReg())
10137 return false;
10138
10139 switch (MI.getOpcode()) {
10140 default:
10141 break;
10142 case AMDGPU::S_CMP_EQ_U32:
10143 case AMDGPU::S_CMP_EQ_I32:
10144 case AMDGPU::S_CMP_LG_U32:
10145 case AMDGPU::S_CMP_LG_I32:
10146 case AMDGPU::S_CMP_LT_U32:
10147 case AMDGPU::S_CMP_LT_I32:
10148 case AMDGPU::S_CMP_GT_U32:
10149 case AMDGPU::S_CMP_GT_I32:
10150 case AMDGPU::S_CMP_LE_U32:
10151 case AMDGPU::S_CMP_LE_I32:
10152 case AMDGPU::S_CMP_GE_U32:
10153 case AMDGPU::S_CMP_GE_I32:
10154 case AMDGPU::S_CMP_EQ_U64:
10155 case AMDGPU::S_CMP_LG_U64:
10156 SrcReg = MI.getOperand(i: 0).getReg();
10157 if (MI.getOperand(i: 1).isReg()) {
10158 if (MI.getOperand(i: 1).getSubReg())
10159 return false;
10160 SrcReg2 = MI.getOperand(i: 1).getReg();
10161 CmpValue = 0;
10162 } else if (MI.getOperand(i: 1).isImm()) {
10163 SrcReg2 = Register();
10164 CmpValue = MI.getOperand(i: 1).getImm();
10165 } else {
10166 return false;
10167 }
10168 CmpMask = ~0;
10169 return true;
10170 case AMDGPU::S_CMPK_EQ_U32:
10171 case AMDGPU::S_CMPK_EQ_I32:
10172 case AMDGPU::S_CMPK_LG_U32:
10173 case AMDGPU::S_CMPK_LG_I32:
10174 case AMDGPU::S_CMPK_LT_U32:
10175 case AMDGPU::S_CMPK_LT_I32:
10176 case AMDGPU::S_CMPK_GT_U32:
10177 case AMDGPU::S_CMPK_GT_I32:
10178 case AMDGPU::S_CMPK_LE_U32:
10179 case AMDGPU::S_CMPK_LE_I32:
10180 case AMDGPU::S_CMPK_GE_U32:
10181 case AMDGPU::S_CMPK_GE_I32:
10182 SrcReg = MI.getOperand(i: 0).getReg();
10183 SrcReg2 = Register();
10184 CmpValue = MI.getOperand(i: 1).getImm();
10185 CmpMask = ~0;
10186 return true;
10187 }
10188
10189 return false;
10190}
10191
10192bool SIInstrInfo::optimizeCompareInstr(MachineInstr &CmpInstr, Register SrcReg,
10193 Register SrcReg2, int64_t CmpMask,
10194 int64_t CmpValue,
10195 const MachineRegisterInfo *MRI) const {
10196 if (!SrcReg || SrcReg.isPhysical())
10197 return false;
10198
10199 if (SrcReg2 && !getFoldableImm(Reg: SrcReg2, MRI: *MRI, Imm&: CmpValue))
10200 return false;
10201
10202 const auto optimizeCmpAnd = [&CmpInstr, SrcReg, CmpValue, MRI,
10203 this](int64_t ExpectedValue, unsigned SrcSize,
10204 bool IsReversible, bool IsSigned) -> bool {
10205 // s_cmp_eq_u32 (s_and_b32 $src, 1 << n), 1 << n => s_and_b32 $src, 1 << n
10206 // s_cmp_eq_i32 (s_and_b32 $src, 1 << n), 1 << n => s_and_b32 $src, 1 << n
10207 // s_cmp_ge_u32 (s_and_b32 $src, 1 << n), 1 << n => s_and_b32 $src, 1 << n
10208 // s_cmp_ge_i32 (s_and_b32 $src, 1 << n), 1 << n => s_and_b32 $src, 1 << n
10209 // s_cmp_eq_u64 (s_and_b64 $src, 1 << n), 1 << n => s_and_b64 $src, 1 << n
10210 // s_cmp_lg_u32 (s_and_b32 $src, 1 << n), 0 => s_and_b32 $src, 1 << n
10211 // s_cmp_lg_i32 (s_and_b32 $src, 1 << n), 0 => s_and_b32 $src, 1 << n
10212 // s_cmp_gt_u32 (s_and_b32 $src, 1 << n), 0 => s_and_b32 $src, 1 << n
10213 // s_cmp_gt_i32 (s_and_b32 $src, 1 << n), 0 => s_and_b32 $src, 1 << n
10214 // s_cmp_lg_u64 (s_and_b64 $src, 1 << n), 0 => s_and_b64 $src, 1 << n
10215 //
10216 // Signed ge/gt are not used for the sign bit.
10217 //
10218 // If result of the AND is unused except in the compare:
10219 // s_and_b(32|64) $src, 1 << n => s_bitcmp1_b(32|64) $src, n
10220 //
10221 // s_cmp_eq_u32 (s_and_b32 $src, 1 << n), 0 => s_bitcmp0_b32 $src, n
10222 // s_cmp_eq_i32 (s_and_b32 $src, 1 << n), 0 => s_bitcmp0_b32 $src, n
10223 // s_cmp_eq_u64 (s_and_b64 $src, 1 << n), 0 => s_bitcmp0_b64 $src, n
10224 // s_cmp_lg_u32 (s_and_b32 $src, 1 << n), 1 << n => s_bitcmp0_b32 $src, n
10225 // s_cmp_lg_i32 (s_and_b32 $src, 1 << n), 1 << n => s_bitcmp0_b32 $src, n
10226 // s_cmp_lg_u64 (s_and_b64 $src, 1 << n), 1 << n => s_bitcmp0_b64 $src, n
10227
10228 MachineInstr *Def = MRI->getUniqueVRegDef(Reg: SrcReg);
10229 if (!Def || Def->getParent() != CmpInstr.getParent())
10230 return false;
10231
10232 if (Def->getOpcode() != AMDGPU::S_AND_B32 &&
10233 Def->getOpcode() != AMDGPU::S_AND_B64)
10234 return false;
10235
10236 int64_t Mask;
10237 const auto isMask = [&Mask, SrcSize](const MachineOperand *MO) -> bool {
10238 if (MO->isImm())
10239 Mask = MO->getImm();
10240 else if (!getFoldableImm(MO, Imm&: Mask))
10241 return false;
10242 Mask &= maxUIntN(N: SrcSize);
10243 return isPowerOf2_64(Value: Mask);
10244 };
10245
10246 MachineOperand *SrcOp = &Def->getOperand(i: 1);
10247 if (isMask(SrcOp))
10248 SrcOp = &Def->getOperand(i: 2);
10249 else if (isMask(&Def->getOperand(i: 2)))
10250 SrcOp = &Def->getOperand(i: 1);
10251 else
10252 return false;
10253
10254 // A valid Mask is required to have a single bit set, hence a non-zero and
10255 // power-of-two value. This ensures the shift below is by fewer than 64 bits and thus well defined.
10256 assert(llvm::has_single_bit<uint64_t>(Mask) && "Invalid mask.");
10257 unsigned BitNo = llvm::countr_zero(Val: (uint64_t)Mask);
10258 if (IsSigned && BitNo == SrcSize - 1)
10259 return false;
10260
10261 ExpectedValue <<= BitNo;
10262
10263 bool IsReversedCC = false;
10264 if (CmpValue != ExpectedValue) {
10265 if (!IsReversible)
10266 return false;
10267 IsReversedCC = CmpValue == (ExpectedValue ^ Mask);
10268 if (!IsReversedCC)
10269 return false;
10270 }
10271
10272 Register DefReg = Def->getOperand(i: 0).getReg();
10273 if (IsReversedCC && !MRI->hasOneNonDBGUse(RegNo: DefReg))
10274 return false;
10275
10276 for (auto I = std::next(x: Def->getIterator()), E = CmpInstr.getIterator();
10277 I != E; ++I) {
10278 if (I->modifiesRegister(Reg: AMDGPU::SCC, TRI: &RI) ||
10279 I->killsRegister(Reg: AMDGPU::SCC, TRI: &RI))
10280 return false;
10281 }
10282
10283 MachineOperand *SccDef =
10284 Def->findRegisterDefOperand(Reg: AMDGPU::SCC, /*TRI=*/nullptr);
10285 SccDef->setIsDead(false);
10286 CmpInstr.eraseFromParent();
10287
10288 if (!MRI->use_nodbg_empty(RegNo: DefReg)) {
10289 assert(!IsReversedCC);
10290 return true;
10291 }
10292
10293 // Replace AND with unused result with a S_BITCMP.
10294 MachineBasicBlock *MBB = Def->getParent();
10295
10296 unsigned NewOpc = (SrcSize == 32) ? IsReversedCC ? AMDGPU::S_BITCMP0_B32
10297 : AMDGPU::S_BITCMP1_B32
10298 : IsReversedCC ? AMDGPU::S_BITCMP0_B64
10299 : AMDGPU::S_BITCMP1_B64;
10300
10301 BuildMI(BB&: *MBB, I: Def, MIMD: Def->getDebugLoc(), MCID: get(Opcode: NewOpc))
10302 .add(MO: *SrcOp)
10303 .addImm(Val: BitNo);
10304 Def->eraseFromParent();
10305
10306 return true;
10307 };
10308
10309 switch (CmpInstr.getOpcode()) {
10310 default:
10311 break;
10312 case AMDGPU::S_CMP_EQ_U32:
10313 case AMDGPU::S_CMP_EQ_I32:
10314 case AMDGPU::S_CMPK_EQ_U32:
10315 case AMDGPU::S_CMPK_EQ_I32:
10316 return optimizeCmpAnd(1, 32, true, false);
10317 case AMDGPU::S_CMP_GE_U32:
10318 case AMDGPU::S_CMPK_GE_U32:
10319 return optimizeCmpAnd(1, 32, false, false);
10320 case AMDGPU::S_CMP_GE_I32:
10321 case AMDGPU::S_CMPK_GE_I32:
10322 return optimizeCmpAnd(1, 32, false, true);
10323 case AMDGPU::S_CMP_EQ_U64:
10324 return optimizeCmpAnd(1, 64, true, false);
10325 case AMDGPU::S_CMP_LG_U32:
10326 case AMDGPU::S_CMP_LG_I32:
10327 case AMDGPU::S_CMPK_LG_U32:
10328 case AMDGPU::S_CMPK_LG_I32:
10329 return optimizeCmpAnd(0, 32, true, false);
10330 case AMDGPU::S_CMP_GT_U32:
10331 case AMDGPU::S_CMPK_GT_U32:
10332 return optimizeCmpAnd(0, 32, false, false);
10333 case AMDGPU::S_CMP_GT_I32:
10334 case AMDGPU::S_CMPK_GT_I32:
10335 return optimizeCmpAnd(0, 32, false, true);
10336 case AMDGPU::S_CMP_LG_U64:
10337 return optimizeCmpAnd(0, 64, true, false);
10338 }
10339
10340 return false;
10341}
10342
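// On subtargets that require even-aligned VGPR/AGPR tuples, force alignment of
// a 32-bit data operand by wrapping it together with an implicit-def scratch
// register into an aligned 64-bit REG_SEQUENCE and using its sub0.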
10343void SIInstrInfo::enforceOperandRCAlignment(MachineInstr &MI,
10344 AMDGPU::OpName OpName) const {
10345 if (!ST.needsAlignedVGPRs())
10346 return;
10347
10348 int OpNo = AMDGPU::getNamedOperandIdx(Opcode: MI.getOpcode(), Name: OpName);
10349 if (OpNo < 0)
10350 return;
10351 MachineOperand &Op = MI.getOperand(i: OpNo);
10352 if (getOpSize(MI, OpNo) > 4)
10353 return;
10354
10355 // Add implicit aligned super-reg to force alignment on the data operand.
10356 const DebugLoc &DL = MI.getDebugLoc();
10357 MachineBasicBlock *BB = MI.getParent();
10358 MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
10359 Register DataReg = Op.getReg();
10360 bool IsAGPR = RI.isAGPR(MRI, Reg: DataReg);
10361 Register Undef = MRI.createVirtualRegister(
10362 RegClass: IsAGPR ? &AMDGPU::AGPR_32RegClass : &AMDGPU::VGPR_32RegClass);
10363 BuildMI(BB&: *BB, I&: MI, MIMD: DL, MCID: get(Opcode: AMDGPU::IMPLICIT_DEF), DestReg: Undef);
10364 Register NewVR =
10365 MRI.createVirtualRegister(RegClass: IsAGPR ? &AMDGPU::AReg_64_Align2RegClass
10366 : &AMDGPU::VReg_64_Align2RegClass);
10367 BuildMI(BB&: *BB, I&: MI, MIMD: DL, MCID: get(Opcode: AMDGPU::REG_SEQUENCE), DestReg: NewVR)
10368 .addReg(RegNo: DataReg, flags: 0, SubReg: Op.getSubReg())
10369 .addImm(Val: AMDGPU::sub0)
10370 .addReg(RegNo: Undef)
10371 .addImm(Val: AMDGPU::sub1);
10372 Op.setReg(NewVR);
10373 Op.setSubReg(AMDGPU::sub0);
10374 MI.addOperand(Op: MachineOperand::CreateReg(Reg: NewVR, isDef: false, isImp: true));
10375}
10376
10377bool SIInstrInfo::isGlobalMemoryObject(const MachineInstr *MI) const {
10378 if (isIGLP(MI: *MI))
10379 return false;
10380
10381 return TargetInstrInfo::isGlobalMemoryObject(MI);
10382}
10383
10384bool SIInstrInfo::isXDL(const MachineInstr &MI) const {
10385 unsigned Opcode = MI.getOpcode();
10386
10387 if (!SIInstrInfo::isMAI(MI) || isDGEMM(Opcode) ||
10388 Opcode == AMDGPU::V_ACCVGPR_WRITE_B32_e64 ||
10389 Opcode == AMDGPU::V_ACCVGPR_READ_B32_e64)
10390 return false;
10391
10392 if (!ST.hasGFX940Insts())
10393 return true;
10394
10395 return AMDGPU::getMAIIsGFX940XDL(Opc: Opcode);
10396}
10397