1//===-- AMDGPUISelDAGToDAG.cpp - A dag to dag inst selector for AMDGPU ----===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//==-----------------------------------------------------------------------===//
8//
9/// \file
10/// Defines an instruction selector for the AMDGPU target.
11//
12//===----------------------------------------------------------------------===//
13
14#include "AMDGPUISelDAGToDAG.h"
15#include "AMDGPU.h"
16#include "AMDGPUInstrInfo.h"
17#include "AMDGPUSubtarget.h"
18#include "AMDGPUTargetMachine.h"
19#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
20#include "MCTargetDesc/R600MCTargetDesc.h"
21#include "R600RegisterInfo.h"
22#include "SIISelLowering.h"
23#include "SIMachineFunctionInfo.h"
24#include "llvm/Analysis/UniformityAnalysis.h"
25#include "llvm/CodeGen/FunctionLoweringInfo.h"
26#include "llvm/CodeGen/SelectionDAG.h"
27#include "llvm/CodeGen/SelectionDAGISel.h"
28#include "llvm/CodeGen/SelectionDAGNodes.h"
29#include "llvm/IR/IntrinsicsAMDGPU.h"
30#include "llvm/InitializePasses.h"
31#include "llvm/Support/ErrorHandling.h"
32
33#ifdef EXPENSIVE_CHECKS
34#include "llvm/Analysis/LoopInfo.h"
35#include "llvm/IR/Dominators.h"
36#endif
37
38#define DEBUG_TYPE "amdgpu-isel"
39
40using namespace llvm;
41
42//===----------------------------------------------------------------------===//
43// Instruction Selector Implementation
44//===----------------------------------------------------------------------===//
45
46namespace {
47static SDValue stripBitcast(SDValue Val) {
48 return Val.getOpcode() == ISD::BITCAST ? Val.getOperand(i: 0) : Val;
49}
50
51// Figure out if this is really an extract of the high 16-bits of a dword.
52static bool isExtractHiElt(SDValue In, SDValue &Out) {
53 In = stripBitcast(Val: In);
54
55 if (In.getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
56 if (ConstantSDNode *Idx = dyn_cast<ConstantSDNode>(Val: In.getOperand(i: 1))) {
57 if (!Idx->isOne())
58 return false;
59 Out = In.getOperand(i: 0);
60 return true;
61 }
62 }
63
64 if (In.getOpcode() != ISD::TRUNCATE)
65 return false;
66
67 SDValue Srl = In.getOperand(i: 0);
68 if (Srl.getOpcode() == ISD::SRL) {
69 if (ConstantSDNode *ShiftAmt = dyn_cast<ConstantSDNode>(Val: Srl.getOperand(i: 1))) {
70 if (ShiftAmt->getZExtValue() == 16) {
71 Out = stripBitcast(Val: Srl.getOperand(i: 0));
72 return true;
73 }
74 }
75 }
76
77 return false;
78}
79
// Wrap the 16-bit value \p Lo into a 32-bit source operand suitable for a
// VOP3P instruction (matching the type of \p Src). On subtargets without real
// true16 instructions, \p Lo is usable directly.
static SDValue createVOP3PSrc32FromLo16(SDValue Lo, SDValue Src,
                                        llvm::SelectionDAG *CurDAG,
                                        const GCNSubtarget *Subtarget) {
  if (!Subtarget->useRealTrue16Insts()) {
    return Lo;
  }

  SDValue NewSrc;
  SDLoc SL(Lo);

  if (Lo->isDivergent()) {
    // Divergent case: build a 32-bit VGPR value whose lo16 half is Lo and
    // whose hi16 half is left undefined.
    SDValue Undef = SDValue(CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF,
                                                   SL, Lo.getValueType()),
                            0);
    const SDValue Ops[] = {
        CurDAG->getTargetConstant(AMDGPU::VGPR_32RegClassID, SL, MVT::i32), Lo,
        CurDAG->getTargetConstant(AMDGPU::lo16, SL, MVT::i16), Undef,
        CurDAG->getTargetConstant(AMDGPU::hi16, SL, MVT::i16)};

    NewSrc = SDValue(CurDAG->getMachineNode(TargetOpcode::REG_SEQUENCE, SL,
                                            Src.getValueType(), Ops),
                     0);
  } else {
    // The S_MOV is needed since the Lo could still be a VGPR16.
    // With S_MOV, isel inserts a "sgpr32 = copy vgpr16" and we rely on
    // the fix-vgpr2sgpr-copy pass to legalize it.
    NewSrc = SDValue(
        CurDAG->getMachineNode(AMDGPU::S_MOV_B32, SL, Src.getValueType(), Lo),
        0);
  }

  return NewSrc;
}
113
114// Look through operations that obscure just looking at the low 16-bits of the
115// same register.
116static SDValue stripExtractLoElt(SDValue In) {
117 if (In.getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
118 SDValue Idx = In.getOperand(i: 1);
119 if (isNullConstant(V: Idx) && In.getValueSizeInBits() <= 32)
120 return In.getOperand(i: 0);
121 }
122
123 if (In.getOpcode() == ISD::TRUNCATE) {
124 SDValue Src = In.getOperand(i: 0);
125 if (Src.getValueType().getSizeInBits() == 32)
126 return stripBitcast(Val: Src);
127 }
128
129 return In;
130}
131
132} // end anonymous namespace
133
// Register the legacy-PM instruction-selection pass and the analyses it
// depends on with the pass registry.
INITIALIZE_PASS_BEGIN(AMDGPUDAGToDAGISelLegacy, "amdgpu-isel",
                      "AMDGPU DAG->DAG Pattern Instruction Selection", false,
                      false)
INITIALIZE_PASS_DEPENDENCY(AMDGPUPerfHintAnalysisLegacy)
INITIALIZE_PASS_DEPENDENCY(UniformityInfoWrapperPass)
#ifdef EXPENSIVE_CHECKS
INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
#endif
INITIALIZE_PASS_END(AMDGPUDAGToDAGISelLegacy, "amdgpu-isel",
                    "AMDGPU DAG->DAG Pattern Instruction Selection", false,
                    false)
146
/// This pass converts a legalized DAG into a AMDGPU-specific
/// DAG, ready for instruction scheduling.
FunctionPass *llvm::createAMDGPUISelDag(TargetMachine &TM,
                                        CodeGenOptLevel OptLevel) {
  return new AMDGPUDAGToDAGISelLegacy(TM, OptLevel);
}
153
// Construct the selector. Per-function state (Subtarget, Mode) is filled in
// later by runOnMachineFunction.
AMDGPUDAGToDAGISel::AMDGPUDAGToDAGISel(TargetMachine &TM,
                                       CodeGenOptLevel OptLevel)
    : SelectionDAGISel(TM, OptLevel) {}
157
bool AMDGPUDAGToDAGISel::runOnMachineFunction(MachineFunction &MF) {
  // Cache the subtarget and the function's FP mode defaults before handing
  // control to the generic SelectionDAG driver.
  Subtarget = &MF.getSubtarget<GCNSubtarget>();
  Subtarget->checkSubtargetFeatures(MF.getFunction());
  Mode = SIModeRegisterDefaults(MF.getFunction(), *Subtarget);
  return SelectionDAGISel::runOnMachineFunction(MF);
}
164
/// \returns true if an fp16 operation with opcode \p Opc is known to write
/// zeros to the high 16 bits of its 32-bit destination register on the
/// current subtarget.
bool AMDGPUDAGToDAGISel::fp16SrcZerosHighBits(unsigned Opc) const {
  // XXX - only need to list legal operations.
  switch (Opc) {
  case ISD::FADD:
  case ISD::FSUB:
  case ISD::FMUL:
  case ISD::FDIV:
  case ISD::FREM:
  case ISD::FCANONICALIZE:
  case ISD::UINT_TO_FP:
  case ISD::SINT_TO_FP:
  case ISD::FABS:
    // Fabs is lowered to a bit operation, but it's an and which will clear the
    // high bits anyway.
  case ISD::FSQRT:
  case ISD::FSIN:
  case ISD::FCOS:
  case ISD::FPOWI:
  case ISD::FPOW:
  case ISD::FLOG:
  case ISD::FLOG2:
  case ISD::FLOG10:
  case ISD::FEXP:
  case ISD::FEXP2:
  case ISD::FCEIL:
  case ISD::FTRUNC:
  case ISD::FRINT:
  case ISD::FNEARBYINT:
  case ISD::FROUNDEVEN:
  case ISD::FROUND:
  case ISD::FFLOOR:
  case ISD::FMINNUM:
  case ISD::FMAXNUM:
  case ISD::FLDEXP:
  case AMDGPUISD::FRACT:
  case AMDGPUISD::CLAMP:
  case AMDGPUISD::COS_HW:
  case AMDGPUISD::SIN_HW:
  case AMDGPUISD::FMIN3:
  case AMDGPUISD::FMAX3:
  case AMDGPUISD::FMED3:
  case AMDGPUISD::FMAD_FTZ:
  case AMDGPUISD::RCP:
  case AMDGPUISD::RSQ:
  case AMDGPUISD::RCP_IFLAG:
    // On gfx10, all 16-bit instructions preserve the high bits.
    return Subtarget->getGeneration() <= AMDGPUSubtarget::GFX9;
  case ISD::FP_ROUND:
    // We may select fptrunc (fma/mad) to mad_mixlo, which does not zero the
    // high bits on gfx9.
    // TODO: If we had the source node we could see if the source was fma/mad
    return Subtarget->getGeneration() == AMDGPUSubtarget::VOLCANIC_ISLANDS;
  case ISD::FMA:
  case ISD::FMAD:
  case AMDGPUISD::DIV_FIXUP:
    // Only gfx8 (VOLCANIC_ISLANDS) guarantees zeroed high bits for these.
    return Subtarget->getGeneration() == AMDGPUSubtarget::VOLCANIC_ISLANDS;
  default:
    // fcopysign, select and others may be lowered to 32-bit bit operations
    // which don't zero the high bits.
    return false;
  }
}
227
bool AMDGPUDAGToDAGISelLegacy::runOnMachineFunction(MachineFunction &MF) {
#ifdef EXPENSIVE_CHECKS
  // Sanity check: every loop must be in LCSSA form before instruction
  // selection runs.
  DominatorTree &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
  LoopInfo *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
  for (auto &L : LI->getLoopsInPreorder()) {
    assert(L->isLCSSAForm(DT));
  }
#endif
  return SelectionDAGISelLegacy::runOnMachineFunction(MF);
}
238
void AMDGPUDAGToDAGISelLegacy::getAnalysisUsage(AnalysisUsage &AU) const {
  // Uniformity information drives SGPR vs. VGPR selection decisions.
  AU.addRequired<UniformityInfoWrapperPass>();
#ifdef EXPENSIVE_CHECKS
  // Required for the LCSSA verification in runOnMachineFunction.
  AU.addRequired<DominatorTreeWrapperPass>();
  AU.addRequired<LoopInfoWrapperPass>();
#endif
  SelectionDAGISelLegacy::getAnalysisUsage(AU);
}
247
// Try to fold a v2i16/v2f16 build_vector of a 16-bit (or extending 8-bit)
// load and another 16-bit value into a single d16 load that writes just one
// half of the destination register, with the other half supplied as a
// tied-in operand. \returns true and rewrites the DAG if a fold was made.
bool AMDGPUDAGToDAGISel::matchLoadD16FromBuildVector(SDNode *N) const {
  assert(Subtarget->d16PreservesUnusedBits());
  MVT VT = N->getValueType(0).getSimpleVT();
  if (VT != MVT::v2i16 && VT != MVT::v2f16)
    return false;

  SDValue Lo = N->getOperand(0);
  SDValue Hi = N->getOperand(1);

  LoadSDNode *LdHi = dyn_cast<LoadSDNode>(stripBitcast(Hi));

  // build_vector lo, (load ptr) -> load_d16_hi ptr, lo
  // build_vector lo, (zextload ptr from i8) -> load_d16_hi_u8 ptr, lo
  // build_vector lo, (sextload ptr from i8) -> load_d16_hi_i8 ptr, lo

  // Need to check for possible indirect dependencies on the other half of the
  // vector to avoid introducing a cycle.
  if (LdHi && Hi.hasOneUse() && !LdHi->isPredecessorOf(Lo.getNode())) {
    SDVTList VTList = CurDAG->getVTList(VT, MVT::Other);

    // The existing low half becomes the tied-in source of the d16 load.
    SDValue TiedIn = CurDAG->getNode(ISD::SCALAR_TO_VECTOR, SDLoc(N), VT, Lo);
    SDValue Ops[] = {
      LdHi->getChain(), LdHi->getBasePtr(), TiedIn
    };

    // Pick the d16 opcode matching the load's memory width and extension.
    unsigned LoadOp = AMDGPUISD::LOAD_D16_HI;
    if (LdHi->getMemoryVT() == MVT::i8) {
      LoadOp = LdHi->getExtensionType() == ISD::SEXTLOAD ?
        AMDGPUISD::LOAD_D16_HI_I8 : AMDGPUISD::LOAD_D16_HI_U8;
    } else {
      assert(LdHi->getMemoryVT() == MVT::i16);
    }

    SDValue NewLoadHi =
      CurDAG->getMemIntrinsicNode(LoadOp, SDLoc(LdHi), VTList,
                                  Ops, LdHi->getMemoryVT(),
                                  LdHi->getMemOperand());

    // Replace both the vector result and the original load's chain.
    CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), NewLoadHi);
    CurDAG->ReplaceAllUsesOfValueWith(SDValue(LdHi, 1), NewLoadHi.getValue(1));
    return true;
  }

  // build_vector (load ptr), hi -> load_d16_lo ptr, hi
  // build_vector (zextload ptr from i8), hi -> load_d16_lo_u8 ptr, hi
  // build_vector (sextload ptr from i8), hi -> load_d16_lo_i8 ptr, hi
  LoadSDNode *LdLo = dyn_cast<LoadSDNode>(stripBitcast(Lo));
  if (LdLo && Lo.hasOneUse()) {
    SDValue TiedIn = getHi16Elt(Hi);
    if (!TiedIn || LdLo->isPredecessorOf(TiedIn.getNode()))
      return false;

    SDVTList VTList = CurDAG->getVTList(VT, MVT::Other);
    unsigned LoadOp = AMDGPUISD::LOAD_D16_LO;
    if (LdLo->getMemoryVT() == MVT::i8) {
      LoadOp = LdLo->getExtensionType() == ISD::SEXTLOAD ?
        AMDGPUISD::LOAD_D16_LO_I8 : AMDGPUISD::LOAD_D16_LO_U8;
    } else {
      assert(LdLo->getMemoryVT() == MVT::i16);
    }

    TiedIn = CurDAG->getNode(ISD::BITCAST, SDLoc(N), VT, TiedIn);

    SDValue Ops[] = {
      LdLo->getChain(), LdLo->getBasePtr(), TiedIn
    };

    SDValue NewLoadLo =
      CurDAG->getMemIntrinsicNode(LoadOp, SDLoc(LdLo), VTList,
                                  Ops, LdLo->getMemoryVT(),
                                  LdLo->getMemOperand());

    CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), NewLoadLo);
    CurDAG->ReplaceAllUsesOfValueWith(SDValue(LdLo, 1), NewLoadLo.getValue(1));
    return true;
  }

  return false;
}
327
// Pre-isel DAG rewriting: on subtargets where d16 loads preserve the unused
// half of the destination, fold build_vectors of 16-bit loads into d16 loads.
void AMDGPUDAGToDAGISel::PreprocessISelDAG() {
  if (!Subtarget->d16PreservesUnusedBits())
    return;

  SelectionDAG::allnodes_iterator Position = CurDAG->allnodes_end();

  bool MadeChange = false;
  // Walk the node list in reverse.
  while (Position != CurDAG->allnodes_begin()) {
    SDNode *N = &*--Position;
    if (N->use_empty())
      continue;

    switch (N->getOpcode()) {
    case ISD::BUILD_VECTOR:
      // TODO: Match load d16 from shl (extload:i16), 16
      MadeChange |= matchLoadD16FromBuildVector(N);
      break;
    default:
      break;
    }
  }

  if (MadeChange) {
    CurDAG->RemoveDeadNodes();
    LLVM_DEBUG(dbgs() << "After PreProcess:\n";
               CurDAG->dump(););
  }
}
356
357bool AMDGPUDAGToDAGISel::isInlineImmediate(const SDNode *N) const {
358 if (N->isUndef())
359 return true;
360
361 const SIInstrInfo *TII = Subtarget->getInstrInfo();
362 if (const ConstantSDNode *C = dyn_cast<ConstantSDNode>(Val: N))
363 return TII->isInlineConstant(Imm: C->getAPIntValue());
364
365 if (const ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(Val: N))
366 return TII->isInlineConstant(Imm: C->getValueAPF());
367
368 return false;
369}
370
/// Determine the register class for \p OpNo
/// \returns The register class of the virtual register that will be used for
/// the given operand number \OpNo or NULL if the register class cannot be
/// determined.
const TargetRegisterClass *AMDGPUDAGToDAGISel::getOperandRegClass(SDNode *N,
                                                                  unsigned OpNo) const {
  if (!N->isMachineOpcode()) {
    if (N->getOpcode() == ISD::CopyToReg) {
      // Virtual registers carry their class in MRI; physical registers fall
      // back to their base class.
      Register Reg = cast<RegisterSDNode>(N->getOperand(1))->getReg();
      if (Reg.isVirtual()) {
        MachineRegisterInfo &MRI = CurDAG->getMachineFunction().getRegInfo();
        return MRI.getRegClass(Reg);
      }

      const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
      return TRI->getPhysRegBaseClass(Reg);
    }

    return nullptr;
  }

  switch (N->getMachineOpcode()) {
  default: {
    // Ordinary machine instruction: look up the class in the instruction
    // description, skipping past the defs to reach use operand \p OpNo.
    const SIInstrInfo *TII = Subtarget->getInstrInfo();
    const MCInstrDesc &Desc = TII->get(N->getMachineOpcode());
    unsigned OpIdx = Desc.getNumDefs() + OpNo;
    if (OpIdx >= Desc.getNumOperands())
      return nullptr;

    int16_t RegClass = TII->getOpRegClassID(Desc.operands()[OpIdx]);
    if (RegClass == -1)
      return nullptr;

    return Subtarget->getRegisterInfo()->getRegClass(RegClass);
  }
  case AMDGPU::REG_SEQUENCE: {
    // REG_SEQUENCE stores its register class ID in operand 0 and pairs each
    // value operand with a subregister index.
    unsigned RCID = N->getConstantOperandVal(0);
    const TargetRegisterClass *SuperRC =
        Subtarget->getRegisterInfo()->getRegClass(RCID);

    SDValue SubRegOp = N->getOperand(OpNo + 1);
    unsigned SubRegIdx = SubRegOp->getAsZExtVal();
    return Subtarget->getRegisterInfo()->getSubClassWithSubReg(SuperRC,
                                                               SubRegIdx);
  }
  }
}
418
419SDNode *AMDGPUDAGToDAGISel::glueCopyToOp(SDNode *N, SDValue NewChain,
420 SDValue Glue) const {
421 SmallVector <SDValue, 8> Ops;
422 Ops.push_back(Elt: NewChain); // Replace the chain.
423 for (unsigned i = 1, e = N->getNumOperands(); i != e; ++i)
424 Ops.push_back(Elt: N->getOperand(Num: i));
425
426 Ops.push_back(Elt: Glue);
427 return CurDAG->MorphNodeTo(N, Opc: N->getOpcode(), VTs: N->getVTList(), Ops);
428}
429
// Insert a copy of \p Val into M0 on \p N's chain and glue it to \p N so the
// two stay together through scheduling.
SDNode *AMDGPUDAGToDAGISel::glueCopyToM0(SDNode *N, SDValue Val) const {
  const SITargetLowering& Lowering =
    *static_cast<const SITargetLowering*>(getTargetLowering());

  assert(N->getOperand(0).getValueType() == MVT::Other && "Expected chain");

  SDValue M0 = Lowering.copyToM0(*CurDAG, N->getOperand(0), SDLoc(N), Val);
  return glueCopyToOp(N, M0, M0.getValue(1));
}
439
440SDNode *AMDGPUDAGToDAGISel::glueCopyToM0LDSInit(SDNode *N) const {
441 unsigned AS = cast<MemSDNode>(Val: N)->getAddressSpace();
442 if (AS == AMDGPUAS::LOCAL_ADDRESS) {
443 if (Subtarget->ldsRequiresM0Init())
444 return glueCopyToM0(
445 N, Val: CurDAG->getSignedTargetConstant(Val: -1, DL: SDLoc(N), VT: MVT::i32));
446 } else if (AS == AMDGPUAS::REGION_ADDRESS) {
447 MachineFunction &MF = CurDAG->getMachineFunction();
448 unsigned Value = MF.getInfo<SIMachineFunctionInfo>()->getGDSSize();
449 return
450 glueCopyToM0(N, Val: CurDAG->getTargetConstant(Val: Value, DL: SDLoc(N), VT: MVT::i32));
451 }
452 return N;
453}
454
455MachineSDNode *AMDGPUDAGToDAGISel::buildSMovImm64(SDLoc &DL, uint64_t Imm,
456 EVT VT) const {
457 SDNode *Lo = CurDAG->getMachineNode(
458 Opcode: AMDGPU::S_MOV_B32, dl: DL, VT: MVT::i32,
459 Op1: CurDAG->getTargetConstant(Val: Lo_32(Value: Imm), DL, VT: MVT::i32));
460 SDNode *Hi = CurDAG->getMachineNode(
461 Opcode: AMDGPU::S_MOV_B32, dl: DL, VT: MVT::i32,
462 Op1: CurDAG->getTargetConstant(Val: Hi_32(Value: Imm), DL, VT: MVT::i32));
463 const SDValue Ops[] = {
464 CurDAG->getTargetConstant(Val: AMDGPU::SReg_64RegClassID, DL, VT: MVT::i32),
465 SDValue(Lo, 0), CurDAG->getTargetConstant(Val: AMDGPU::sub0, DL, VT: MVT::i32),
466 SDValue(Hi, 0), CurDAG->getTargetConstant(Val: AMDGPU::sub1, DL, VT: MVT::i32)};
467
468 return CurDAG->getMachineNode(Opcode: TargetOpcode::REG_SEQUENCE, dl: DL, VT, Ops);
469}
470
// If \p N is a two-element build_vector of constants, select it as a single
// 32-bit move of the packed immediate. \returns the new node, or nullptr if
// either element is not a constant.
SDNode *AMDGPUDAGToDAGISel::packConstantV2I16(const SDNode *N,
                                              SelectionDAG &DAG) const {
  // TODO: Handle undef as zero

  assert(N->getOpcode() == ISD::BUILD_VECTOR && N->getNumOperands() == 2);
  uint32_t LHSVal, RHSVal;
  if (getConstantValue(N->getOperand(0), LHSVal) &&
      getConstantValue(N->getOperand(1), RHSVal)) {
    SDLoc SL(N);
    // Element 0 fills the low 16 bits, element 1 the high 16 bits.
    uint32_t K = (LHSVal & 0xffff) | (RHSVal << 16);
    return DAG.getMachineNode(
        isVGPRImm(N) ? AMDGPU::V_MOV_B32_e32 : AMDGPU::S_MOV_B32, SL,
        N->getValueType(0), DAG.getTargetConstant(K, SL, MVT::i32));
  }

  return nullptr;
}
488
489void AMDGPUDAGToDAGISel::SelectBuildVector(SDNode *N, unsigned RegClassID) {
490 EVT VT = N->getValueType(ResNo: 0);
491 unsigned NumVectorElts = VT.getVectorNumElements();
492 EVT EltVT = VT.getVectorElementType();
493 SDLoc DL(N);
494 SDValue RegClass = CurDAG->getTargetConstant(Val: RegClassID, DL, VT: MVT::i32);
495
496 if (NumVectorElts == 1) {
497 CurDAG->SelectNodeTo(N, MachineOpc: AMDGPU::COPY_TO_REGCLASS, VT: EltVT, Op1: N->getOperand(Num: 0),
498 Op2: RegClass);
499 return;
500 }
501
502 bool IsGCN = CurDAG->getSubtarget().getTargetTriple().isAMDGCN();
503 if (IsGCN && Subtarget->has64BitLiterals() && VT.getSizeInBits() == 64 &&
504 CurDAG->isConstantValueOfAnyType(N: SDValue(N, 0))) {
505 uint64_t C = 0;
506 bool AllConst = true;
507 unsigned EltSize = EltVT.getSizeInBits();
508 for (unsigned I = 0; I < NumVectorElts; ++I) {
509 SDValue Op = N->getOperand(Num: I);
510 if (Op.isUndef()) {
511 AllConst = false;
512 break;
513 }
514 uint64_t Val;
515 if (ConstantFPSDNode *CF = dyn_cast<ConstantFPSDNode>(Val&: Op)) {
516 Val = CF->getValueAPF().bitcastToAPInt().getZExtValue();
517 } else
518 Val = cast<ConstantSDNode>(Val&: Op)->getZExtValue();
519 C |= Val << (EltSize * I);
520 }
521 if (AllConst) {
522 SDValue CV = CurDAG->getTargetConstant(Val: C, DL, VT: MVT::i64);
523 MachineSDNode *Copy =
524 CurDAG->getMachineNode(Opcode: AMDGPU::S_MOV_B64_IMM_PSEUDO, dl: DL, VT, Op1: CV);
525 CurDAG->SelectNodeTo(N, MachineOpc: AMDGPU::COPY_TO_REGCLASS, VT, Op1: SDValue(Copy, 0),
526 Op2: RegClass);
527 return;
528 }
529 }
530
531 assert(NumVectorElts <= 32 && "Vectors with more than 32 elements not "
532 "supported yet");
533 // 32 = Max Num Vector Elements
534 // 2 = 2 REG_SEQUENCE operands per element (value, subreg index)
535 // 1 = Vector Register Class
536 SmallVector<SDValue, 32 * 2 + 1> RegSeqArgs(NumVectorElts * 2 + 1);
537
538 RegSeqArgs[0] = CurDAG->getTargetConstant(Val: RegClassID, DL, VT: MVT::i32);
539 bool IsRegSeq = true;
540 unsigned NOps = N->getNumOperands();
541 for (unsigned i = 0; i < NOps; i++) {
542 // XXX: Why is this here?
543 if (isa<RegisterSDNode>(Val: N->getOperand(Num: i))) {
544 IsRegSeq = false;
545 break;
546 }
547 unsigned Sub = IsGCN ? SIRegisterInfo::getSubRegFromChannel(Channel: i)
548 : R600RegisterInfo::getSubRegFromChannel(Channel: i);
549 RegSeqArgs[1 + (2 * i)] = N->getOperand(Num: i);
550 RegSeqArgs[1 + (2 * i) + 1] = CurDAG->getTargetConstant(Val: Sub, DL, VT: MVT::i32);
551 }
552 if (NOps != NumVectorElts) {
553 // Fill in the missing undef elements if this was a scalar_to_vector.
554 assert(N->getOpcode() == ISD::SCALAR_TO_VECTOR && NOps < NumVectorElts);
555 MachineSDNode *ImpDef = CurDAG->getMachineNode(Opcode: TargetOpcode::IMPLICIT_DEF,
556 dl: DL, VT: EltVT);
557 for (unsigned i = NOps; i < NumVectorElts; ++i) {
558 unsigned Sub = IsGCN ? SIRegisterInfo::getSubRegFromChannel(Channel: i)
559 : R600RegisterInfo::getSubRegFromChannel(Channel: i);
560 RegSeqArgs[1 + (2 * i)] = SDValue(ImpDef, 0);
561 RegSeqArgs[1 + (2 * i) + 1] =
562 CurDAG->getTargetConstant(Val: Sub, DL, VT: MVT::i32);
563 }
564 }
565
566 if (!IsRegSeq)
567 SelectCode(N);
568 CurDAG->SelectNodeTo(N, MachineOpc: AMDGPU::REG_SEQUENCE, VTs: N->getVTList(), Ops: RegSeqArgs);
569}
570
// Select a two-element, 32-bit-element VECTOR_SHUFFLE either as a
// V_PK_MOV_B32 (divergent cross-half blend) or as a pair of subregister
// copies combined with REG_SEQUENCE. Everything else goes to SelectCode.
void AMDGPUDAGToDAGISel::SelectVectorShuffle(SDNode *N) {
  EVT VT = N->getValueType(0);
  EVT EltVT = VT.getVectorElementType();

  // TODO: Handle 16-bit element vectors with even aligned masks.
  if (!Subtarget->hasPkMovB32() || !EltVT.bitsEq(MVT::i32) ||
      VT.getVectorNumElements() != 2) {
    SelectCode(N);
    return;
  }

  auto *SVN = cast<ShuffleVectorSDNode>(N);

  SDValue Src0 = SVN->getOperand(0);
  SDValue Src1 = SVN->getOperand(1);
  ArrayRef<int> Mask = SVN->getMask();
  SDLoc DL(N);

  assert(Src0.getValueType().getVectorNumElements() == 2 && Mask.size() == 2 &&
         Mask[0] < 4 && Mask[1] < 4);

  // Mask values 0-1 index into Src0, 2-3 into Src1; the low bit picks the
  // half within the chosen source.
  SDValue VSrc0 = Mask[0] < 2 ? Src0 : Src1;
  SDValue VSrc1 = Mask[1] < 2 ? Src0 : Src1;
  unsigned Src0SubReg = Mask[0] & 1 ? AMDGPU::sub1 : AMDGPU::sub0;
  unsigned Src1SubReg = Mask[1] & 1 ? AMDGPU::sub1 : AMDGPU::sub0;

  // Negative mask entries mean "don't care": substitute an undef source.
  if (Mask[0] < 0) {
    Src0SubReg = Src1SubReg;
    MachineSDNode *ImpDef =
        CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF, DL, VT);
    VSrc0 = SDValue(ImpDef, 0);
  }

  if (Mask[1] < 0) {
    Src1SubReg = Src0SubReg;
    MachineSDNode *ImpDef =
        CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF, DL, VT);
    VSrc1 = SDValue(ImpDef, 0);
  }

  // SGPR case needs to lower to copies.
  //
  // Also use subregister extract when we can directly blend the registers with
  // a simple subregister copy.
  //
  // TODO: Maybe we should fold this out earlier
  if (N->isDivergent() && Src0SubReg == AMDGPU::sub1 &&
      Src1SubReg == AMDGPU::sub0) {
    // The low element of the result always comes from src0.
    // The high element of the result always comes from src1.
    // op_sel selects the high half of src0.
    // op_sel_hi selects the high half of src1.

    unsigned Src0OpSel =
        Src0SubReg == AMDGPU::sub1 ? SISrcMods::OP_SEL_0 : SISrcMods::NONE;
    unsigned Src1OpSel =
        Src1SubReg == AMDGPU::sub1 ? SISrcMods::OP_SEL_0 : SISrcMods::NONE;

    // Enable op_sel_hi to avoid printing it. This should have no effect on the
    // result.
    Src0OpSel |= SISrcMods::OP_SEL_1;
    Src1OpSel |= SISrcMods::OP_SEL_1;

    SDValue Src0OpSelVal = CurDAG->getTargetConstant(Src0OpSel, DL, MVT::i32);
    SDValue Src1OpSelVal = CurDAG->getTargetConstant(Src1OpSel, DL, MVT::i32);
    SDValue ZeroMods = CurDAG->getTargetConstant(0, DL, MVT::i32);

    CurDAG->SelectNodeTo(N, AMDGPU::V_PK_MOV_B32, N->getVTList(),
                         {Src0OpSelVal, VSrc0, Src1OpSelVal, VSrc1,
                          ZeroMods, // clamp
                          ZeroMods, // op_sel
                          ZeroMods, // op_sel_hi
                          ZeroMods, // neg_lo
                          ZeroMods}); // neg_hi
    return;
  }

  // Otherwise extract the two halves and reassemble them with REG_SEQUENCE.
  SDValue ResultElt0 =
      CurDAG->getTargetExtractSubreg(Src0SubReg, DL, EltVT, VSrc0);
  SDValue ResultElt1 =
      CurDAG->getTargetExtractSubreg(Src1SubReg, DL, EltVT, VSrc1);

  const SDValue Ops[] = {
      CurDAG->getTargetConstant(AMDGPU::SReg_64RegClassID, DL, MVT::i32),
      ResultElt0, CurDAG->getTargetConstant(AMDGPU::sub0, DL, MVT::i32),
      ResultElt1, CurDAG->getTargetConstant(AMDGPU::sub1, DL, MVT::i32)};
  CurDAG->SelectNodeTo(N, TargetOpcode::REG_SEQUENCE, VT, Ops);
}
659
// Top-level instruction-selection dispatch. Handles AMDGPU-specific opcodes
// and special cases directly; everything not returned from here falls through
// to the TableGen-generated matcher via SelectCode at the bottom.
void AMDGPUDAGToDAGISel::Select(SDNode *N) {
  unsigned int Opc = N->getOpcode();
  if (N->isMachineOpcode()) {
    N->setNodeId(-1);
    return; // Already selected.
  }

  // isa<MemSDNode> almost works but is slightly too permissive for some DS
  // intrinsics.
  if (Opc == ISD::LOAD || Opc == ISD::STORE || isa<AtomicSDNode>(N)) {
    // Memory nodes may need an M0 initialization glued in first.
    N = glueCopyToM0LDSInit(N);
    SelectCode(N);
    return;
  }

  switch (Opc) {
  default:
    break;
  // We are selecting i64 ADD here instead of custom lower it during
  // DAG legalization, so we can fold some i64 ADDs used for address
  // calculation into the LOAD and STORE instructions.
  case ISD::ADDC:
  case ISD::ADDE:
  case ISD::SUBC:
  case ISD::SUBE: {
    if (N->getValueType(0) != MVT::i64)
      break;

    SelectADD_SUB_I64(N);
    return;
  }
  case ISD::UADDO_CARRY:
  case ISD::USUBO_CARRY:
    if (N->getValueType(0) != MVT::i32)
      break;

    SelectAddcSubb(N);
    return;
  case ISD::UADDO:
  case ISD::USUBO: {
    SelectUADDO_USUBO(N);
    return;
  }
  case AMDGPUISD::FMUL_W_CHAIN: {
    SelectFMUL_W_CHAIN(N);
    return;
  }
  case AMDGPUISD::FMA_W_CHAIN: {
    SelectFMA_W_CHAIN(N);
    return;
  }

  case ISD::SCALAR_TO_VECTOR:
  case ISD::BUILD_VECTOR: {
    EVT VT = N->getValueType(0);
    unsigned NumVectorElts = VT.getVectorNumElements();
    if (VT.getScalarSizeInBits() == 16) {
      // v2x16 of constants can be packed into one 32-bit move; other 16-bit
      // vectors go to the generated matcher.
      if (Opc == ISD::BUILD_VECTOR && NumVectorElts == 2) {
        if (SDNode *Packed = packConstantV2I16(N, *CurDAG)) {
          ReplaceNode(N, Packed);
          return;
        }
      }

      break;
    }

    // 32-bit elements: pick SGPR vs VGPR class by divergence and build a
    // REG_SEQUENCE.
    const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
    assert(VT.getVectorElementType().bitsEq(MVT::i32));
    const TargetRegisterClass *RegClass =
        N->isDivergent()
            ? TRI->getDefaultVectorSuperClassForBitWidth(NumVectorElts * 32)
            : SIRegisterInfo::getSGPRClassForBitWidth(NumVectorElts * 32);

    SelectBuildVector(N, RegClass->getID());
    return;
  }
  case ISD::VECTOR_SHUFFLE:
    SelectVectorShuffle(N);
    return;
  case ISD::BUILD_PAIR: {
    // Assemble the pair with a REG_SEQUENCE over the matching subreg pair.
    SDValue RC, SubReg0, SubReg1;
    SDLoc DL(N);
    if (N->getValueType(0) == MVT::i128) {
      RC = CurDAG->getTargetConstant(AMDGPU::SGPR_128RegClassID, DL, MVT::i32);
      SubReg0 = CurDAG->getTargetConstant(AMDGPU::sub0_sub1, DL, MVT::i32);
      SubReg1 = CurDAG->getTargetConstant(AMDGPU::sub2_sub3, DL, MVT::i32);
    } else if (N->getValueType(0) == MVT::i64) {
      RC = CurDAG->getTargetConstant(AMDGPU::SReg_64RegClassID, DL, MVT::i32);
      SubReg0 = CurDAG->getTargetConstant(AMDGPU::sub0, DL, MVT::i32);
      SubReg1 = CurDAG->getTargetConstant(AMDGPU::sub1, DL, MVT::i32);
    } else {
      llvm_unreachable("Unhandled value type for BUILD_PAIR");
    }
    const SDValue Ops[] = { RC, N->getOperand(0), SubReg0,
                            N->getOperand(1), SubReg1 };
    ReplaceNode(N, CurDAG->getMachineNode(TargetOpcode::REG_SEQUENCE, DL,
                                          N->getValueType(0), Ops));
    return;
  }

  case ISD::Constant:
  case ISD::ConstantFP: {
    // Only split 64-bit constants that can't be encoded as an inline
    // immediate or a single (32-bit or 64-bit) literal.
    if (N->getValueType(0).getSizeInBits() != 64 || isInlineImmediate(N) ||
        Subtarget->has64BitLiterals())
      break;

    uint64_t Imm;
    if (ConstantFPSDNode *FP = dyn_cast<ConstantFPSDNode>(N)) {
      Imm = FP->getValueAPF().bitcastToAPInt().getZExtValue();
      if (AMDGPU::isValid32BitLiteral(Imm, true))
        break;
    } else {
      ConstantSDNode *C = cast<ConstantSDNode>(N);
      Imm = C->getZExtValue();
      if (AMDGPU::isValid32BitLiteral(Imm, false))
        break;
    }

    SDLoc DL(N);
    ReplaceNode(N, buildSMovImm64(DL, Imm, N->getValueType(0)));
    return;
  }
  case AMDGPUISD::BFE_I32:
  case AMDGPUISD::BFE_U32: {
    // There is a scalar version available, but unlike the vector version which
    // has a separate operand for the offset and width, the scalar version packs
    // the width and offset into a single operand. Try to move to the scalar
    // version if the offsets are constant, so that we can try to keep extended
    // loads of kernel arguments in SGPRs.

    // TODO: Technically we could try to pattern match scalar bitshifts of
    // dynamic values, but it's probably not useful.
    ConstantSDNode *Offset = dyn_cast<ConstantSDNode>(N->getOperand(1));
    if (!Offset)
      break;

    ConstantSDNode *Width = dyn_cast<ConstantSDNode>(N->getOperand(2));
    if (!Width)
      break;

    bool Signed = Opc == AMDGPUISD::BFE_I32;

    uint32_t OffsetVal = Offset->getZExtValue();
    uint32_t WidthVal = Width->getZExtValue();

    ReplaceNode(N, getBFE32(Signed, SDLoc(N), N->getOperand(0), OffsetVal,
                            WidthVal));
    return;
  }
  case AMDGPUISD::DIV_SCALE: {
    SelectDIV_SCALE(N);
    return;
  }
  case AMDGPUISD::MAD_I64_I32:
  case AMDGPUISD::MAD_U64_U32: {
    SelectMAD_64_32(N);
    return;
  }
  case ISD::SMUL_LOHI:
  case ISD::UMUL_LOHI:
    return SelectMUL_LOHI(N);
  case ISD::CopyToReg: {
    const SITargetLowering& Lowering =
      *static_cast<const SITargetLowering*>(getTargetLowering());
    N = Lowering.legalizeTargetIndependentNode(N, *CurDAG);
    break;
  }
  case ISD::AND:
  case ISD::SRL:
  case ISD::SRA:
  case ISD::SIGN_EXTEND_INREG:
    // These may match the S_BFE bitfield-extract patterns.
    if (N->getValueType(0) != MVT::i32)
      break;

    SelectS_BFE(N);
    return;
  case ISD::BRCOND:
    SelectBRCOND(N);
    return;
  case ISD::FP_EXTEND:
    SelectFP_EXTEND(N);
    return;
  case AMDGPUISD::CVT_PKRTZ_F16_F32:
  case AMDGPUISD::CVT_PKNORM_I16_F32:
  case AMDGPUISD::CVT_PKNORM_U16_F32:
  case AMDGPUISD::CVT_PK_U16_U32:
  case AMDGPUISD::CVT_PK_I16_I32: {
    // Hack around using a legal type if f16 is illegal.
    if (N->getValueType(0) == MVT::i32) {
      MVT NewVT = Opc == AMDGPUISD::CVT_PKRTZ_F16_F32 ? MVT::v2f16 : MVT::v2i16;
      N = CurDAG->MorphNodeTo(N, N->getOpcode(), CurDAG->getVTList(NewVT),
                              { N->getOperand(0), N->getOperand(1) });
      SelectCode(N);
      return;
    }

    break;
  }
  case ISD::INTRINSIC_W_CHAIN: {
    SelectINTRINSIC_W_CHAIN(N);
    return;
  }
  case ISD::INTRINSIC_WO_CHAIN: {
    SelectINTRINSIC_WO_CHAIN(N);
    return;
  }
  case ISD::INTRINSIC_VOID: {
    SelectINTRINSIC_VOID(N);
    return;
  }
  case AMDGPUISD::WAVE_ADDRESS: {
    SelectWAVE_ADDRESS(N);
    return;
  }
  case ISD::STACKRESTORE: {
    SelectSTACKRESTORE(N);
    return;
  }
  }

  SelectCode(N);
}
883
884bool AMDGPUDAGToDAGISel::isUniformBr(const SDNode *N) const {
885 const BasicBlock *BB = FuncInfo->MBB->getBasicBlock();
886 const Instruction *Term = BB->getTerminator();
887 return Term->getMetadata(Kind: "amdgpu.uniform") ||
888 Term->getMetadata(Kind: "structurizecfg.uniform");
889}
890
891bool AMDGPUDAGToDAGISel::isUnneededShiftMask(const SDNode *N,
892 unsigned ShAmtBits) const {
893 assert(N->getOpcode() == ISD::AND);
894
895 const APInt &RHS = N->getConstantOperandAPInt(Num: 1);
896 if (RHS.countr_one() >= ShAmtBits)
897 return true;
898
899 const APInt &LHSKnownZeros = CurDAG->computeKnownBits(Op: N->getOperand(Num: 0)).Zero;
900 return (LHSKnownZeros | RHS).countr_one() >= ShAmtBits;
901}
902
// Recognize a base+offset address that was obscured by splitting a 64-bit
// `or` into vector halves; on success returns the base in \p N0 and the
// constant offset in \p N1.
static bool getBaseWithOffsetUsingSplitOR(SelectionDAG &DAG, SDValue Addr,
                                          SDValue &N0, SDValue &N1) {
  if (Addr.getValueType() == MVT::i64 && Addr.getOpcode() == ISD::BITCAST &&
      Addr.getOperand(0).getOpcode() == ISD::BUILD_VECTOR) {
    // As we split 64-bit `or` earlier, it's complicated pattern to match, i.e.
    // (i64 (bitcast (v2i32 (build_vector
    //                        (or (extract_vector_elt V, 0), OFFSET),
    //                        (extract_vector_elt V, 1)))))
    SDValue Lo = Addr.getOperand(0).getOperand(0);
    if (Lo.getOpcode() == ISD::OR && DAG.isBaseWithConstantOffset(Lo)) {
      SDValue BaseLo = Lo.getOperand(0);
      SDValue BaseHi = Addr.getOperand(0).getOperand(1);
      // Check that split base (Lo and Hi) are extracted from the same one.
      if (BaseLo.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
          BaseHi.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
          BaseLo.getOperand(0) == BaseHi.getOperand(0) &&
          // Lo is statically extracted from index 0.
          isa<ConstantSDNode>(BaseLo.getOperand(1)) &&
          BaseLo.getConstantOperandVal(1) == 0 &&
          // Hi is statically extracted from index 1.
          isa<ConstantSDNode>(BaseHi.getOperand(1)) &&
          BaseHi.getConstantOperandVal(1) == 1) {
        N0 = BaseLo.getOperand(0).getOperand(0);
        N1 = Lo.getOperand(1);
        return true;
      }
    }
  }
  return false;
}
933
// Split a 64-bit address into (LHS base, RHS constant offset). Handles both
// the generic base+offset form and the split-`or` pattern recognized by
// getBaseWithOffsetUsingSplitOR.
bool AMDGPUDAGToDAGISel::isBaseWithConstantOffset64(SDValue Addr, SDValue &LHS,
                                                    SDValue &RHS) const {
  if (CurDAG->isBaseWithConstantOffset(Op: Addr)) {
    LHS = Addr.getOperand(i: 0);
    RHS = Addr.getOperand(i: 1);
    return true;
  }

  if (getBaseWithOffsetUsingSplitOR(DAG&: *CurDAG, Addr, N0&: LHS, N1&: RHS)) {
    assert(LHS && RHS && isa<ConstantSDNode>(RHS));
    return true;
  }

  return false;
}
949
// Human-readable pass name reported by the legacy pass manager.
StringRef AMDGPUDAGToDAGISelLegacy::getPassName() const {
  return "AMDGPU DAG->DAG Pattern Instruction Selection";
}
953
// New-pass-manager wrapper: constructs the shared AMDGPUDAGToDAGISel
// implementation at the target machine's configured optimization level.
AMDGPUISelDAGToDAGPass::AMDGPUISelDAGToDAGPass(TargetMachine &TM)
    : SelectionDAGISelPass(
          std::make_unique<AMDGPUDAGToDAGISel>(args&: TM, args: TM.getOptLevel())) {}
957
// Run instruction selection under the new pass manager, delegating to the
// base SelectionDAGISelPass.
PreservedAnalyses
AMDGPUISelDAGToDAGPass::run(MachineFunction &MF,
                            MachineFunctionAnalysisManager &MFAM) {
#ifdef EXPENSIVE_CHECKS
  // With expensive checks enabled, verify up front that every loop in the
  // function is in LCSSA form before selecting.
  auto &FAM = MFAM.getResult<FunctionAnalysisManagerMachineFunctionProxy>(MF)
                  .getManager();
  auto &F = MF.getFunction();
  DominatorTree &DT = FAM.getResult<DominatorTreeAnalysis>(F);
  LoopInfo &LI = FAM.getResult<LoopAnalysis>(F);
  for (auto &L : LI.getLoopsInPreorder())
    assert(L->isLCSSAForm(DT) && "Loop is not in LCSSA form!");
#endif
  return SelectionDAGISelPass::run(MF, MFAM);
}
972
973//===----------------------------------------------------------------------===//
974// Complex Patterns
975//===----------------------------------------------------------------------===//
976
// Complex-pattern hook that never matches in this selector; always fails so
// the pattern falls through to other addressing modes.
bool AMDGPUDAGToDAGISel::SelectADDRVTX_READ(SDValue Addr, SDValue &Base,
                                            SDValue &Offset) {
  return false;
}
981
// Match an address for indirect (register-indexed) access, splitting Addr
// into a Base register and a constant Offset. Always succeeds; defaults to
// (Addr, 0).
bool AMDGPUDAGToDAGISel::SelectADDRIndirect(SDValue Addr, SDValue &Base,
                                            SDValue &Offset) {
  ConstantSDNode *C;
  SDLoc DL(Addr);

  if ((C = dyn_cast<ConstantSDNode>(Val&: Addr))) {
    // Constant address: indirect base register plus the constant as offset.
    Base = CurDAG->getRegister(Reg: R600::INDIRECT_BASE_ADDR, VT: MVT::i32);
    Offset = CurDAG->getTargetConstant(Val: C->getZExtValue(), DL, VT: MVT::i32);
  } else if ((Addr.getOpcode() == AMDGPUISD::DWORDADDR) &&
             (C = dyn_cast<ConstantSDNode>(Val: Addr.getOperand(i: 0)))) {
    // DWORDADDR wrapping a constant: same as the constant case.
    Base = CurDAG->getRegister(Reg: R600::INDIRECT_BASE_ADDR, VT: MVT::i32);
    Offset = CurDAG->getTargetConstant(Val: C->getZExtValue(), DL, VT: MVT::i32);
  } else if ((Addr.getOpcode() == ISD::ADD || Addr.getOpcode() == ISD::OR) &&
             (C = dyn_cast<ConstantSDNode>(Val: Addr.getOperand(i: 1)))) {
    // (add/or base, const): peel the constant off into the offset.
    Base = Addr.getOperand(i: 0);
    Offset = CurDAG->getTargetConstant(Val: C->getZExtValue(), DL, VT: MVT::i32);
  } else {
    // Fallback: whole address as base, zero offset.
    Base = Addr;
    Offset = CurDAG->getTargetConstant(Val: 0, DL, VT: MVT::i32);
  }

  return true;
}
1005
1006SDValue AMDGPUDAGToDAGISel::getMaterializedScalarImm32(int64_t Val,
1007 const SDLoc &DL) const {
1008 SDNode *Mov = CurDAG->getMachineNode(
1009 Opcode: AMDGPU::S_MOV_B32, dl: DL, VT: MVT::i32,
1010 Op1: CurDAG->getTargetConstant(Val, DL, VT: MVT::i32));
1011 return SDValue(Mov, 0);
1012}
1013
// FIXME: Should only handle uaddo_carry/usubo_carry
// Lower a 64-bit add/sub (optionally consuming/producing a carry) into a
// pair of chained 32-bit ops on the sub0/sub1 halves, then recombine the
// halves with a REG_SEQUENCE. SALU vs. VALU opcodes are chosen from the
// node's divergence.
void AMDGPUDAGToDAGISel::SelectADD_SUB_I64(SDNode *N) {
  SDLoc DL(N);
  SDValue LHS = N->getOperand(Num: 0);
  SDValue RHS = N->getOperand(Num: 1);

  unsigned Opcode = N->getOpcode();
  bool ConsumeCarry = (Opcode == ISD::ADDE || Opcode == ISD::SUBE);
  bool ProduceCarry =
      ConsumeCarry || Opcode == ISD::ADDC || Opcode == ISD::SUBC;
  bool IsAdd = Opcode == ISD::ADD || Opcode == ISD::ADDC || Opcode == ISD::ADDE;

  SDValue Sub0 = CurDAG->getTargetConstant(Val: AMDGPU::sub0, DL, VT: MVT::i32);
  SDValue Sub1 = CurDAG->getTargetConstant(Val: AMDGPU::sub1, DL, VT: MVT::i32);

  // Split both operands into their low and high 32-bit halves.
  SDNode *Lo0 = CurDAG->getMachineNode(Opcode: TargetOpcode::EXTRACT_SUBREG,
                                       dl: DL, VT: MVT::i32, Op1: LHS, Op2: Sub0);
  SDNode *Hi0 = CurDAG->getMachineNode(Opcode: TargetOpcode::EXTRACT_SUBREG,
                                       dl: DL, VT: MVT::i32, Op1: LHS, Op2: Sub1);

  SDNode *Lo1 = CurDAG->getMachineNode(Opcode: TargetOpcode::EXTRACT_SUBREG,
                                       dl: DL, VT: MVT::i32, Op1: RHS, Op2: Sub0);
  SDNode *Hi1 = CurDAG->getMachineNode(Opcode: TargetOpcode::EXTRACT_SUBREG,
                                       dl: DL, VT: MVT::i32, Op1: RHS, Op2: Sub1);

  SDVTList VTList = CurDAG->getVTList(VT1: MVT::i32, VT2: MVT::Glue);

  // Indexed as [carry-in variant?][divergent?][is add?].
  static const unsigned OpcMap[2][2][2] = {
      {{AMDGPU::S_SUB_U32, AMDGPU::S_ADD_U32},
       {AMDGPU::V_SUB_CO_U32_e32, AMDGPU::V_ADD_CO_U32_e32}},
      {{AMDGPU::S_SUBB_U32, AMDGPU::S_ADDC_U32},
       {AMDGPU::V_SUBB_U32_e32, AMDGPU::V_ADDC_U32_e32}}};

  unsigned Opc = OpcMap[0][N->isDivergent()][IsAdd];
  unsigned CarryOpc = OpcMap[1][N->isDivergent()][IsAdd];

  // Low half: plain op, or the carry-consuming variant when a carry-in
  // (operand 2) is present.
  SDNode *AddLo;
  if (!ConsumeCarry) {
    SDValue Args[] = { SDValue(Lo0, 0), SDValue(Lo1, 0) };
    AddLo = CurDAG->getMachineNode(Opcode: Opc, dl: DL, VTs: VTList, Ops: Args);
  } else {
    SDValue Args[] = { SDValue(Lo0, 0), SDValue(Lo1, 0), N->getOperand(Num: 2) };
    AddLo = CurDAG->getMachineNode(Opcode: CarryOpc, dl: DL, VTs: VTList, Ops: Args);
  }
  // High half always consumes the glue (result 1) produced by the low half.
  SDValue AddHiArgs[] = {
    SDValue(Hi0, 0),
    SDValue(Hi1, 0),
    SDValue(AddLo, 1)
  };
  SDNode *AddHi = CurDAG->getMachineNode(Opcode: CarryOpc, dl: DL, VTs: VTList, Ops: AddHiArgs);

  // Recombine the two 32-bit halves into a single 64-bit value.
  SDValue RegSequenceArgs[] = {
    CurDAG->getTargetConstant(Val: AMDGPU::SReg_64RegClassID, DL, VT: MVT::i32),
    SDValue(AddLo,0),
    Sub0,
    SDValue(AddHi,0),
    Sub1,
  };
  SDNode *RegSequence = CurDAG->getMachineNode(Opcode: AMDGPU::REG_SEQUENCE, dl: DL,
                                               VT: MVT::i64, Ops: RegSequenceArgs);

  if (ProduceCarry) {
    // Replace the carry-use
    ReplaceUses(F: SDValue(N, 1), T: SDValue(AddHi, 1));
  }

  // Replace the remaining uses.
  ReplaceNode(F: N, T: RegSequence);
}
1083
// Select UADDO_CARRY/USUBO_CARRY: divergent nodes use the VOP3 VALU carry
// ops, uniform nodes use the scalar carry pseudos.
void AMDGPUDAGToDAGISel::SelectAddcSubb(SDNode *N) {
  SDValue LHS = N->getOperand(Num: 0);
  SDValue RHS = N->getOperand(Num: 1);
  SDValue CI = N->getOperand(Num: 2); // carry-in

  if (N->isDivergent()) {
    unsigned Opc = N->getOpcode() == ISD::UADDO_CARRY ? AMDGPU::V_ADDC_U32_e64
                                                      : AMDGPU::V_SUBB_U32_e64;
    CurDAG->SelectNodeTo(
        N, MachineOpc: Opc, VTs: N->getVTList(),
        Ops: {LHS, RHS, CI,
             CurDAG->getTargetConstant(Val: 0, DL: {}, VT: MVT::i1) /*clamp bit*/});
  } else {
    unsigned Opc = N->getOpcode() == ISD::UADDO_CARRY ? AMDGPU::S_ADD_CO_PSEUDO
                                                      : AMDGPU::S_SUB_CO_PSEUDO;
    CurDAG->SelectNodeTo(N, MachineOpc: Opc, VTs: N->getVTList(), Ops: {LHS, RHS, CI});
  }
}
1102
// Select UADDO/USUBO. Falls back to the VALU form if the node is divergent,
// or if any user of the carry-out (result 1) is not the matching scalar
// carry-consuming node.
void AMDGPUDAGToDAGISel::SelectUADDO_USUBO(SDNode *N) {
  // The name of the opcodes are misleading. v_add_i32/v_sub_i32 have unsigned
  // carry out despite the _i32 name. These were renamed in VI to _U32.
  // FIXME: We should probably rename the opcodes here.
  bool IsAdd = N->getOpcode() == ISD::UADDO;
  bool IsVALU = N->isDivergent();

  // Scan all users of the carry result; any consumer other than the
  // corresponding carry op (pre- or post-selection) forces VALU selection.
  for (SDNode::user_iterator UI = N->user_begin(), E = N->user_end(); UI != E;
       ++UI)
    if (UI.getUse().getResNo() == 1) {
      if (UI->isMachineOpcode()) {
        if (UI->getMachineOpcode() !=
            (IsAdd ? AMDGPU::S_ADD_CO_PSEUDO : AMDGPU::S_SUB_CO_PSEUDO)) {
          IsVALU = true;
          break;
        }
      } else {
        if (UI->getOpcode() != (IsAdd ? ISD::UADDO_CARRY : ISD::USUBO_CARRY)) {
          IsVALU = true;
          break;
        }
      }
    }

  if (IsVALU) {
    unsigned Opc = IsAdd ? AMDGPU::V_ADD_CO_U32_e64 : AMDGPU::V_SUB_CO_U32_e64;

    CurDAG->SelectNodeTo(
        N, MachineOpc: Opc, VTs: N->getVTList(),
        Ops: {N->getOperand(Num: 0), N->getOperand(Num: 1),
             CurDAG->getTargetConstant(Val: 0, DL: {}, VT: MVT::i1) /*clamp bit*/});
  } else {
    unsigned Opc = IsAdd ? AMDGPU::S_UADDO_PSEUDO : AMDGPU::S_USUBO_PSEUDO;

    CurDAG->SelectNodeTo(N, MachineOpc: Opc, VTs: N->getVTList(),
                         Ops: {N->getOperand(Num: 0), N->getOperand(Num: 1)});
  }
}
1141
// Select FMA_W_CHAIN to V_FMA_F32, or to V_FMAC_F32 when no source
// modifiers are set (FMAC can use the smaller VOP2 encoding).
void AMDGPUDAGToDAGISel::SelectFMA_W_CHAIN(SDNode *N) {
  // src0_modifiers, src0, src1_modifiers, src1, src2_modifiers, src2, clamp, omod
  SDValue Ops[10];

  SelectVOP3Mods0(In: N->getOperand(Num: 1), Src&: Ops[1], SrcMods&: Ops[0], Clamp&: Ops[6], Omod&: Ops[7]);
  SelectVOP3Mods(In: N->getOperand(Num: 2), Src&: Ops[3], SrcMods&: Ops[2]);
  SelectVOP3Mods(In: N->getOperand(Num: 3), Src&: Ops[5], SrcMods&: Ops[4]);
  // Ops[8] carries node operand 0 (the chain); Ops[9] carries operand 4.
  Ops[8] = N->getOperand(Num: 0);
  Ops[9] = N->getOperand(Num: 4);

  // If there are no source modifiers, prefer fmac over fma because it can use
  // the smaller VOP2 encoding.
  bool UseFMAC = Subtarget->hasDLInsts() &&
                 cast<ConstantSDNode>(Val&: Ops[0])->isZero() &&
                 cast<ConstantSDNode>(Val&: Ops[2])->isZero() &&
                 cast<ConstantSDNode>(Val&: Ops[4])->isZero();
  unsigned Opcode = UseFMAC ? AMDGPU::V_FMAC_F32_e64 : AMDGPU::V_FMA_F32_e64;
  CurDAG->SelectNodeTo(N, MachineOpc: Opcode, VTs: N->getVTList(), Ops);
}
1161
// Select FMUL_W_CHAIN to V_MUL_F32 with VOP3 source modifiers.
void AMDGPUDAGToDAGISel::SelectFMUL_W_CHAIN(SDNode *N) {
  // src0_modifiers, src0, src1_modifiers, src1, clamp, omod
  SDValue Ops[8];

  SelectVOP3Mods0(In: N->getOperand(Num: 1), Src&: Ops[1], SrcMods&: Ops[0], Clamp&: Ops[4], Omod&: Ops[5]);
  SelectVOP3Mods(In: N->getOperand(Num: 2), Src&: Ops[3], SrcMods&: Ops[2]);
  // Ops[6] carries node operand 0 (the chain); Ops[7] carries operand 3.
  Ops[6] = N->getOperand(Num: 0);
  Ops[7] = N->getOperand(Num: 3);

  CurDAG->SelectNodeTo(N, MachineOpc: AMDGPU::V_MUL_F32_e64, VTs: N->getVTList(), Ops);
}
1173
// We need to handle this here because tablegen doesn't support matching
// instructions with multiple outputs.
// Select DIV_SCALE to V_DIV_SCALE_F32/F64 with VOP3B source modifiers.
void AMDGPUDAGToDAGISel::SelectDIV_SCALE(SDNode *N) {
  EVT VT = N->getValueType(ResNo: 0);

  assert(VT == MVT::f32 || VT == MVT::f64);

  unsigned Opc
    = (VT == MVT::f64) ? AMDGPU::V_DIV_SCALE_F64_e64 : AMDGPU::V_DIV_SCALE_F32_e64;

  // src0_modifiers, src0, src1_modifiers, src1, src2_modifiers, src2, clamp,
  // omod
  SDValue Ops[8];
  SelectVOP3BMods0(In: N->getOperand(Num: 0), Src&: Ops[1], SrcMods&: Ops[0], Clamp&: Ops[6], Omod&: Ops[7]);
  SelectVOP3BMods(In: N->getOperand(Num: 1), Src&: Ops[3], SrcMods&: Ops[2]);
  SelectVOP3BMods(In: N->getOperand(Num: 2), Src&: Ops[5], SrcMods&: Ops[4]);
  CurDAG->SelectNodeTo(N, MachineOpc: Opc, VTs: N->getVTList(), Ops);
}
1192
// We need to handle this here because tablegen doesn't support matching
// instructions with multiple outputs.
// Select MAD_I64_I32/MAD_U64_U32. Uses the no-carry variant when the
// subtarget has it and the carry result is unused; uses the gfx11 opcode on
// subtargets with the MAD intra-forwarding bug.
void AMDGPUDAGToDAGISel::SelectMAD_64_32(SDNode *N) {
  SDLoc SL(N);
  bool Signed = N->getOpcode() == AMDGPUISD::MAD_I64_I32;
  unsigned Opc;
  bool UseNoCarry = Subtarget->hasMadU64U32NoCarry() && !N->hasAnyUseOfValue(Value: 1);
  if (Subtarget->hasMADIntraFwdBug())
    Opc = Signed ? AMDGPU::V_MAD_I64_I32_gfx11_e64
                 : AMDGPU::V_MAD_U64_U32_gfx11_e64;
  else if (UseNoCarry)
    Opc = Signed ? AMDGPU::V_MAD_NC_I64_I32_e64 : AMDGPU::V_MAD_NC_U64_U32_e64;
  else
    Opc = Signed ? AMDGPU::V_MAD_I64_I32_e64 : AMDGPU::V_MAD_U64_U32_e64;

  SDValue Clamp = CurDAG->getTargetConstant(Val: 0, DL: SL, VT: MVT::i1);
  SDValue Ops[] = { N->getOperand(Num: 0), N->getOperand(Num: 1), N->getOperand(Num: 2),
                    Clamp };

  if (UseNoCarry) {
    // The no-carry variant only produces the i64 result; drop the dead
    // carry output by replacing result 0 and removing the original node.
    MachineSDNode *Mad = CurDAG->getMachineNode(Opcode: Opc, dl: SL, VT: MVT::i64, Ops);
    ReplaceUses(F: SDValue(N, 0), T: SDValue(Mad, 0));
    CurDAG->RemoveDeadNode(N);
    return;
  }

  CurDAG->SelectNodeTo(N, MachineOpc: Opc, VTs: N->getVTList(), Ops);
}
1221
// We need to handle this here because tablegen doesn't support matching
// instructions with multiple outputs.
// Select S/UMUL_LOHI as a 64-bit MAD with a zero addend, then split the
// 64-bit product into the lo/hi 32-bit results with subregister extracts.
void AMDGPUDAGToDAGISel::SelectMUL_LOHI(SDNode *N) {
  SDLoc SL(N);
  bool Signed = N->getOpcode() == ISD::SMUL_LOHI;
  SDVTList VTList;
  unsigned Opc;
  if (Subtarget->hasMadU64U32NoCarry()) {
    VTList = CurDAG->getVTList(VT: MVT::i64);
    Opc = Signed ? AMDGPU::V_MAD_NC_I64_I32_e64 : AMDGPU::V_MAD_NC_U64_U32_e64;
  } else {
    VTList = CurDAG->getVTList(VT1: MVT::i64, VT2: MVT::i1);
    if (Subtarget->hasMADIntraFwdBug()) {
      Opc = Signed ? AMDGPU::V_MAD_I64_I32_gfx11_e64
                   : AMDGPU::V_MAD_U64_U32_gfx11_e64;
    } else {
      Opc = Signed ? AMDGPU::V_MAD_I64_I32_e64 : AMDGPU::V_MAD_U64_U32_e64;
    }
  }

  SDValue Zero = CurDAG->getTargetConstant(Val: 0, DL: SL, VT: MVT::i64);
  SDValue Clamp = CurDAG->getTargetConstant(Val: 0, DL: SL, VT: MVT::i1);
  SDValue Ops[] = {N->getOperand(Num: 0), N->getOperand(Num: 1), Zero, Clamp};
  SDNode *Mad = CurDAG->getMachineNode(Opcode: Opc, dl: SL, VTs: VTList, Ops);
  // Low 32 bits of the product (only if used).
  if (!SDValue(N, 0).use_empty()) {
    SDValue Sub0 = CurDAG->getTargetConstant(Val: AMDGPU::sub0, DL: SL, VT: MVT::i32);
    SDNode *Lo = CurDAG->getMachineNode(Opcode: TargetOpcode::EXTRACT_SUBREG, dl: SL,
                                        VT: MVT::i32, Op1: SDValue(Mad, 0), Op2: Sub0);
    ReplaceUses(F: SDValue(N, 0), T: SDValue(Lo, 0));
  }
  // High 32 bits of the product (only if used).
  if (!SDValue(N, 1).use_empty()) {
    SDValue Sub1 = CurDAG->getTargetConstant(Val: AMDGPU::sub1, DL: SL, VT: MVT::i32);
    SDNode *Hi = CurDAG->getMachineNode(Opcode: TargetOpcode::EXTRACT_SUBREG, dl: SL,
                                        VT: MVT::i32, Op1: SDValue(Mad, 0), Op2: Sub1);
    ReplaceUses(F: SDValue(N, 1), T: SDValue(Hi, 0));
  }
  CurDAG->RemoveDeadNode(N);
}
1260
1261bool AMDGPUDAGToDAGISel::isDSOffsetLegal(SDValue Base, unsigned Offset) const {
1262 if (!isUInt<16>(x: Offset))
1263 return false;
1264
1265 if (!Base || Subtarget->hasUsableDSOffset() ||
1266 Subtarget->unsafeDSOffsetFoldingEnabled())
1267 return true;
1268
1269 // On Southern Islands instruction with a negative base value and an offset
1270 // don't seem to work.
1271 return CurDAG->SignBitIsZero(Op: Base);
1272}
1273
// Match a DS address as (Base, 16-bit immediate Offset). Always succeeds;
// the default case emits (Addr, 0).
bool AMDGPUDAGToDAGISel::SelectDS1Addr1Offset(SDValue Addr, SDValue &Base,
                                              SDValue &Offset) const {
  SDLoc DL(Addr);
  if (CurDAG->isBaseWithConstantOffset(Op: Addr)) {
    SDValue N0 = Addr.getOperand(i: 0);
    SDValue N1 = Addr.getOperand(i: 1);
    ConstantSDNode *C1 = cast<ConstantSDNode>(Val&: N1);
    if (isDSOffsetLegal(Base: N0, Offset: C1->getSExtValue())) {
      // (add n0, c0)
      Base = N0;
      Offset = CurDAG->getTargetConstant(Val: C1->getZExtValue(), DL, VT: MVT::i16);
      return true;
    }
  } else if (Addr.getOpcode() == ISD::SUB) {
    // sub C, x -> add (sub 0, x), C
    if (const ConstantSDNode *C = dyn_cast<ConstantSDNode>(Val: Addr.getOperand(i: 0))) {
      int64_t ByteOffset = C->getSExtValue();
      if (isDSOffsetLegal(Base: SDValue(), Offset: ByteOffset)) {
        SDValue Zero = CurDAG->getTargetConstant(Val: 0, DL, VT: MVT::i32);

        // XXX - This is kind of hacky. Create a dummy sub node so we can check
        // the known bits in isDSOffsetLegal. We need to emit the selected node
        // here, so this is thrown away.
        SDValue Sub = CurDAG->getNode(Opcode: ISD::SUB, DL, VT: MVT::i32,
                                      N1: Zero, N2: Addr.getOperand(i: 1));

        if (isDSOffsetLegal(Base: Sub, Offset: ByteOffset)) {
          SmallVector<SDValue, 3> Opnds;
          Opnds.push_back(Elt: Zero);
          Opnds.push_back(Elt: Addr.getOperand(i: 1));

          // FIXME: Select to VOP3 version for with-carry.
          unsigned SubOp = AMDGPU::V_SUB_CO_U32_e32;
          if (Subtarget->hasAddNoCarryInsts()) {
            SubOp = AMDGPU::V_SUB_U32_e64;
            Opnds.push_back(
                Elt: CurDAG->getTargetConstant(Val: 0, DL: {}, VT: MVT::i1)); // clamp bit
          }

          MachineSDNode *MachineSub =
              CurDAG->getMachineNode(Opcode: SubOp, dl: DL, VT: MVT::i32, Ops: Opnds);

          Base = SDValue(MachineSub, 0);
          Offset = CurDAG->getTargetConstant(Val: ByteOffset, DL, VT: MVT::i16);
          return true;
        }
      }
    }
  } else if (const ConstantSDNode *CAddr = dyn_cast<ConstantSDNode>(Val&: Addr)) {
    // If we have a constant address, prefer to put the constant into the
    // offset. This can save moves to load the constant address since multiple
    // operations can share the zero base address register, and enables merging
    // into read2 / write2 instructions.

    SDLoc DL(Addr);

    if (isDSOffsetLegal(Base: SDValue(), Offset: CAddr->getZExtValue())) {
      SDValue Zero = CurDAG->getTargetConstant(Val: 0, DL, VT: MVT::i32);
      MachineSDNode *MovZero = CurDAG->getMachineNode(Opcode: AMDGPU::V_MOV_B32_e32,
                                                      dl: DL, VT: MVT::i32, Op1: Zero);
      Base = SDValue(MovZero, 0);
      Offset = CurDAG->getTargetConstant(Val: CAddr->getZExtValue(), DL, VT: MVT::i16);
      return true;
    }
  }

  // default case
  Base = Addr;
  Offset = CurDAG->getTargetConstant(Val: 0, DL: SDLoc(Addr), VT: MVT::i16);
  return true;
}
1345
// Check that a pair of offsets is legal for a two-address DS instruction
// (read2/write2): both must be Size-aligned and each must fit in 8 bits
// after dividing by Size.
bool AMDGPUDAGToDAGISel::isDSOffset2Legal(SDValue Base, unsigned Offset0,
                                          unsigned Offset1,
                                          unsigned Size) const {
  if (Offset0 % Size != 0 || Offset1 % Size != 0)
    return false;
  if (!isUInt<8>(x: Offset0 / Size) || !isUInt<8>(x: Offset1 / Size))
    return false;

  if (!Base || Subtarget->hasUsableDSOffset() ||
      Subtarget->unsafeDSOffsetFoldingEnabled())
    return true;

  // On Southern Islands instruction with a negative base value and an offset
  // don't seem to work.
  return CurDAG->SignBitIsZero(Op: Base);
}
1362
1363// Return whether the operation has NoUnsignedWrap property.
1364static bool isNoUnsignedWrap(SDValue Addr) {
1365 return (Addr.getOpcode() == ISD::ADD &&
1366 Addr->getFlags().hasNoUnsignedWrap()) ||
1367 Addr->getOpcode() == ISD::OR;
1368}
1369
// Check that the base address of flat scratch load/store in the form of `base +
// offset` is legal to be put in SGPR/VGPR (i.e. unsigned per hardware
// requirement). We always treat the first operand as the base address here.
bool AMDGPUDAGToDAGISel::isFlatScratchBaseLegal(SDValue Addr) const {
  // nuw addition guarantees the base itself cannot wrap into the sign bit.
  if (isNoUnsignedWrap(Addr))
    return true;

  // Starting with GFX12, VADDR and SADDR fields in VSCRATCH can use negative
  // values.
  if (Subtarget->hasSignedScratchOffsets())
    return true;

  auto LHS = Addr.getOperand(i: 0);
  auto RHS = Addr.getOperand(i: 1);

  // If the immediate offset is negative and within certain range, the base
  // address cannot also be negative. If the base is also negative, the sum
  // would be either negative or much larger than the valid range of scratch
  // memory a thread can access.
  ConstantSDNode *ImmOp = nullptr;
  if (Addr.getOpcode() == ISD::ADD && (ImmOp = dyn_cast<ConstantSDNode>(Val&: RHS))) {
    if (ImmOp->getSExtValue() < 0 && ImmOp->getSExtValue() > -0x40000000)
      return true;
  }

  // Otherwise require a provably non-negative base.
  return CurDAG->SignBitIsZero(Op: LHS);
}
1397
1398// Check address value in SGPR/VGPR are legal for flat scratch in the form
1399// of: SGPR + VGPR.
1400bool AMDGPUDAGToDAGISel::isFlatScratchBaseLegalSV(SDValue Addr) const {
1401 if (isNoUnsignedWrap(Addr))
1402 return true;
1403
1404 // Starting with GFX12, VADDR and SADDR fields in VSCRATCH can use negative
1405 // values.
1406 if (Subtarget->hasSignedScratchOffsets())
1407 return true;
1408
1409 auto LHS = Addr.getOperand(i: 0);
1410 auto RHS = Addr.getOperand(i: 1);
1411 return CurDAG->SignBitIsZero(Op: RHS) && CurDAG->SignBitIsZero(Op: LHS);
1412}
1413
// Check address value in SGPR/VGPR are legal for flat scratch in the form
// of: SGPR + VGPR + Imm.
bool AMDGPUDAGToDAGISel::isFlatScratchBaseLegalSVImm(SDValue Addr) const {
  // Starting with GFX12, VADDR and SADDR fields in VSCRATCH can use negative
  // values.
  if (AMDGPU::isGFX12Plus(STI: *Subtarget))
    return true;

  // Addr is (Base + Imm); Base is itself assumed to be a two-operand
  // SGPR + VGPR node.
  auto Base = Addr.getOperand(i: 0);
  auto *RHSImm = cast<ConstantSDNode>(Val: Addr.getOperand(i: 1));
  // If the immediate offset is negative and within certain range, the base
  // address cannot also be negative. If the base is also negative, the sum
  // would be either negative or much larger than the valid range of scratch
  // memory a thread can access.
  if (isNoUnsignedWrap(Addr: Base) &&
      (isNoUnsignedWrap(Addr) ||
       (RHSImm->getSExtValue() < 0 && RHSImm->getSExtValue() > -0x40000000)))
    return true;

  // Otherwise both components of the base must be provably non-negative.
  auto LHS = Base.getOperand(i: 0);
  auto RHS = Base.getOperand(i: 1);
  return CurDAG->SignBitIsZero(Op: RHS) && CurDAG->SignBitIsZero(Op: LHS);
}
1437
// TODO: If offset is too big, put low 16-bit into offset.
// Match a DS read2/write2 b32 address: base plus two dword-scaled offsets.
bool AMDGPUDAGToDAGISel::SelectDS64Bit4ByteAligned(SDValue Addr, SDValue &Base,
                                                   SDValue &Offset0,
                                                   SDValue &Offset1) const {
  return SelectDSReadWrite2(Ptr: Addr, Base, Offset0, Offset1, Size: 4);
}
1444
// Match a DS read2/write2 b64 address: base plus two qword-scaled offsets.
bool AMDGPUDAGToDAGISel::SelectDS128Bit8ByteAligned(SDValue Addr, SDValue &Base,
                                                    SDValue &Offset0,
                                                    SDValue &Offset1) const {
  return SelectDSReadWrite2(Ptr: Addr, Base, Offset0, Offset1, Size: 8);
}
1450
// Match a DS read2/write2 address: Base plus two element-scaled 8-bit
// offsets (Offset1 = Offset0 + Size). Always succeeds; the default case
// emits (Addr, 0, 1).
bool AMDGPUDAGToDAGISel::SelectDSReadWrite2(SDValue Addr, SDValue &Base,
                                            SDValue &Offset0, SDValue &Offset1,
                                            unsigned Size) const {
  SDLoc DL(Addr);

  if (CurDAG->isBaseWithConstantOffset(Op: Addr)) {
    SDValue N0 = Addr.getOperand(i: 0);
    SDValue N1 = Addr.getOperand(i: 1);
    ConstantSDNode *C1 = cast<ConstantSDNode>(Val&: N1);
    unsigned OffsetValue0 = C1->getZExtValue();
    unsigned OffsetValue1 = OffsetValue0 + Size;

    // (add n0, c0)
    if (isDSOffset2Legal(Base: N0, Offset0: OffsetValue0, Offset1: OffsetValue1, Size)) {
      Base = N0;
      Offset0 = CurDAG->getTargetConstant(Val: OffsetValue0 / Size, DL, VT: MVT::i32);
      Offset1 = CurDAG->getTargetConstant(Val: OffsetValue1 / Size, DL, VT: MVT::i32);
      return true;
    }
  } else if (Addr.getOpcode() == ISD::SUB) {
    // sub C, x -> add (sub 0, x), C
    if (const ConstantSDNode *C =
            dyn_cast<ConstantSDNode>(Val: Addr.getOperand(i: 0))) {
      unsigned OffsetValue0 = C->getZExtValue();
      unsigned OffsetValue1 = OffsetValue0 + Size;

      if (isDSOffset2Legal(Base: SDValue(), Offset0: OffsetValue0, Offset1: OffsetValue1, Size)) {
        SDLoc DL(Addr);
        SDValue Zero = CurDAG->getTargetConstant(Val: 0, DL, VT: MVT::i32);

        // XXX - This is kind of hacky. Create a dummy sub node so we can check
        // the known bits in isDSOffset2Legal. We need to emit the selected node
        // here, so this is thrown away.
        SDValue Sub =
            CurDAG->getNode(Opcode: ISD::SUB, DL, VT: MVT::i32, N1: Zero, N2: Addr.getOperand(i: 1));

        if (isDSOffset2Legal(Base: Sub, Offset0: OffsetValue0, Offset1: OffsetValue1, Size)) {
          SmallVector<SDValue, 3> Opnds;
          Opnds.push_back(Elt: Zero);
          Opnds.push_back(Elt: Addr.getOperand(i: 1));
          unsigned SubOp = AMDGPU::V_SUB_CO_U32_e32;
          if (Subtarget->hasAddNoCarryInsts()) {
            SubOp = AMDGPU::V_SUB_U32_e64;
            Opnds.push_back(
                Elt: CurDAG->getTargetConstant(Val: 0, DL: {}, VT: MVT::i1)); // clamp bit
          }

          // NOTE(review): the result VT here is getIntegerVT(Size * 8) — i64
          // for the 8-byte case — even though V_SUB is a 32-bit op; the
          // analogous path in SelectDS1Addr1Offset uses MVT::i32. Confirm
          // this is intended.
          MachineSDNode *MachineSub = CurDAG->getMachineNode(
              Opcode: SubOp, dl: DL, VT: MVT::getIntegerVT(BitWidth: Size * 8), Ops: Opnds);

          Base = SDValue(MachineSub, 0);
          Offset0 =
              CurDAG->getTargetConstant(Val: OffsetValue0 / Size, DL, VT: MVT::i32);
          Offset1 =
              CurDAG->getTargetConstant(Val: OffsetValue1 / Size, DL, VT: MVT::i32);
          return true;
        }
      }
    }
  } else if (const ConstantSDNode *CAddr = dyn_cast<ConstantSDNode>(Val&: Addr)) {
    // Constant address: fold it entirely into the offsets against a
    // materialized zero base.
    unsigned OffsetValue0 = CAddr->getZExtValue();
    unsigned OffsetValue1 = OffsetValue0 + Size;

    if (isDSOffset2Legal(Base: SDValue(), Offset0: OffsetValue0, Offset1: OffsetValue1, Size)) {
      SDValue Zero = CurDAG->getTargetConstant(Val: 0, DL, VT: MVT::i32);
      MachineSDNode *MovZero =
          CurDAG->getMachineNode(Opcode: AMDGPU::V_MOV_B32_e32, dl: DL, VT: MVT::i32, Op1: Zero);
      Base = SDValue(MovZero, 0);
      Offset0 = CurDAG->getTargetConstant(Val: OffsetValue0 / Size, DL, VT: MVT::i32);
      Offset1 = CurDAG->getTargetConstant(Val: OffsetValue1 / Size, DL, VT: MVT::i32);
      return true;
    }
  }

  // default case

  Base = Addr;
  Offset0 = CurDAG->getTargetConstant(Val: 0, DL, VT: MVT::i32);
  Offset1 = CurDAG->getTargetConstant(Val: 1, DL, VT: MVT::i32);
  return true;
}
1532
// Decompose Addr for a MUBUF access into a scalar pointer (Ptr), VGPR
// address (VAddr), scalar offset (SOffset) and immediate offset (Offset),
// plus the offen/idxen/addr64 mode flags. The addr64 form is used whenever
// any component of the address is divergent.
bool AMDGPUDAGToDAGISel::SelectMUBUF(SDValue Addr, SDValue &Ptr, SDValue &VAddr,
                                     SDValue &SOffset, SDValue &Offset,
                                     SDValue &Offen, SDValue &Idxen,
                                     SDValue &Addr64) const {
  // Subtarget prefers to use flat instruction
  // FIXME: This should be a pattern predicate and not reach here
  if (Subtarget->useFlatForGlobal())
    return false;

  SDLoc DL(Addr);

  Idxen = CurDAG->getTargetConstant(Val: 0, DL, VT: MVT::i1);
  Offen = CurDAG->getTargetConstant(Val: 0, DL, VT: MVT::i1);
  Addr64 = CurDAG->getTargetConstant(Val: 0, DL, VT: MVT::i1);
  SOffset = Subtarget->hasRestrictedSOffset()
                ? CurDAG->getRegister(Reg: AMDGPU::SGPR_NULL, VT: MVT::i32)
                : CurDAG->getTargetConstant(Val: 0, DL, VT: MVT::i32);

  // Peel a 32-bit constant offset off the address if present.
  ConstantSDNode *C1 = nullptr;
  SDValue N0 = Addr;
  if (CurDAG->isBaseWithConstantOffset(Op: Addr)) {
    C1 = cast<ConstantSDNode>(Val: Addr.getOperand(i: 1));
    if (isUInt<32>(x: C1->getZExtValue()))
      N0 = Addr.getOperand(i: 0);
    else
      C1 = nullptr;
  }

  if (N0->isAnyAdd()) {
    // (add N2, N3) -> addr64, or
    // (add (add N2, N3), C1) -> addr64
    SDValue N2 = N0.getOperand(i: 0);
    SDValue N3 = N0.getOperand(i: 1);
    Addr64 = CurDAG->getTargetConstant(Val: 1, DL, VT: MVT::i1);

    if (N2->isDivergent()) {
      if (N3->isDivergent()) {
        // Both N2 and N3 are divergent. Use N0 (the result of the add) as the
        // addr64, and construct the resource from a 0 address.
        Ptr = SDValue(buildSMovImm64(DL, Imm: 0, VT: MVT::v2i32), 0);
        VAddr = N0;
      } else {
        // N2 is divergent, N3 is not.
        Ptr = N3;
        VAddr = N2;
      }
    } else {
      // N2 is not divergent.
      Ptr = N2;
      VAddr = N3;
    }
    Offset = CurDAG->getTargetConstant(Val: 0, DL, VT: MVT::i32);
  } else if (N0->isDivergent()) {
    // N0 is divergent. Use it as the addr64, and construct the resource from a
    // 0 address.
    Ptr = SDValue(buildSMovImm64(DL, Imm: 0, VT: MVT::v2i32), 0);
    VAddr = N0;
    Addr64 = CurDAG->getTargetConstant(Val: 1, DL, VT: MVT::i1);
  } else {
    // N0 -> offset, or
    // (N0 + C1) -> offset
    VAddr = CurDAG->getTargetConstant(Val: 0, DL, VT: MVT::i32);
    Ptr = N0;
  }

  if (!C1) {
    // No offset.
    Offset = CurDAG->getTargetConstant(Val: 0, DL, VT: MVT::i32);
    return true;
  }

  const SIInstrInfo *TII = Subtarget->getInstrInfo();
  if (TII->isLegalMUBUFImmOffset(Imm: C1->getZExtValue())) {
    // Legal offset for instruction.
    Offset = CurDAG->getTargetConstant(Val: C1->getZExtValue(), DL, VT: MVT::i32);
    return true;
  }

  // Illegal offset, store it in soffset.
  Offset = CurDAG->getTargetConstant(Val: 0, DL, VT: MVT::i32);
  SOffset =
      SDValue(CurDAG->getMachineNode(
                  Opcode: AMDGPU::S_MOV_B32, dl: DL, VT: MVT::i32,
                  Op1: CurDAG->getTargetConstant(Val: C1->getZExtValue(), DL, VT: MVT::i32)),
              0);
  return true;
}
1620
// Match a MUBUF addr64 access. Only succeeds on subtargets that still have
// the addr64 bit and when SelectMUBUF chose the addr64 form; the pointer is
// then wrapped into a full 128-bit resource descriptor.
bool AMDGPUDAGToDAGISel::SelectMUBUFAddr64(SDValue Addr, SDValue &SRsrc,
                                           SDValue &VAddr, SDValue &SOffset,
                                           SDValue &Offset) const {
  SDValue Ptr, Offen, Idxen, Addr64;

  // addr64 bit was removed for volcanic islands.
  // FIXME: This should be a pattern predicate and not reach here
  if (!Subtarget->hasAddr64())
    return false;

  if (!SelectMUBUF(Addr, Ptr, VAddr, SOffset, Offset, Offen, Idxen, Addr64))
    return false;

  ConstantSDNode *C = cast<ConstantSDNode>(Val&: Addr64);
  if (C->getSExtValue()) {
    SDLoc DL(Addr);

    const SITargetLowering& Lowering =
      *static_cast<const SITargetLowering*>(getTargetLowering());

    SRsrc = SDValue(Lowering.wrapAddr64Rsrc(DAG&: *CurDAG, DL, Ptr), 0);
    return true;
  }

  return false;
}
1647
// Return the (vaddr, soffset) pair for a scratch access: a frame index is
// converted to its target form, anything else is passed through unchanged.
std::pair<SDValue, SDValue> AMDGPUDAGToDAGISel::foldFrameIndex(SDValue N) const {
  SDLoc DL(N);

  auto *FI = dyn_cast<FrameIndexSDNode>(Val&: N);
  SDValue TFI =
      FI ? CurDAG->getTargetFrameIndex(FI: FI->getIndex(), VT: FI->getValueType(ResNo: 0)) : N;

  // We rebase the base address into an absolute stack address and hence
  // use constant 0 for soffset. This value must be retained until
  // frame elimination and eliminateFrameIndex will choose the appropriate
  // frame register if need be.
  return std::pair(TFI, CurDAG->getTargetConstant(Val: 0, DL, VT: MVT::i32));
}
1661
// Match a private (scratch) address for MUBUF offen mode: a VGPR address
// plus immediate offset against the scratch resource descriptor. Always
// succeeds (the fallback uses Addr itself with a zero immediate).
bool AMDGPUDAGToDAGISel::SelectMUBUFScratchOffen(SDNode *Parent,
                                                 SDValue Addr, SDValue &Rsrc,
                                                 SDValue &VAddr, SDValue &SOffset,
                                                 SDValue &ImmOffset) const {

  SDLoc DL(Addr);
  MachineFunction &MF = CurDAG->getMachineFunction();
  const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();

  Rsrc = CurDAG->getRegister(Reg: Info->getScratchRSrcReg(), VT: MVT::v4i32);

  if (ConstantSDNode *CAddr = dyn_cast<ConstantSDNode>(Val&: Addr)) {
    int64_t Imm = CAddr->getSExtValue();
    const int64_t NullPtr =
        AMDGPUTargetMachine::getNullPointerValue(AddrSpace: AMDGPUAS::PRIVATE_ADDRESS);
    // Don't fold null pointer.
    if (Imm != NullPtr) {
      // Split the constant: the bits above the max immediate offset are
      // materialized into a VGPR, the rest goes in the immediate field.
      const uint32_t MaxOffset = SIInstrInfo::getMaxMUBUFImmOffset(ST: *Subtarget);
      SDValue HighBits =
          CurDAG->getTargetConstant(Val: Imm & ~MaxOffset, DL, VT: MVT::i32);
      MachineSDNode *MovHighBits = CurDAG->getMachineNode(
        Opcode: AMDGPU::V_MOV_B32_e32, dl: DL, VT: MVT::i32, Op1: HighBits);
      VAddr = SDValue(MovHighBits, 0);

      SOffset = CurDAG->getTargetConstant(Val: 0, DL, VT: MVT::i32);
      ImmOffset = CurDAG->getTargetConstant(Val: Imm & MaxOffset, DL, VT: MVT::i32);
      return true;
    }
  }

  if (CurDAG->isBaseWithConstantOffset(Op: Addr)) {
    // (add n0, c1)

    SDValue N0 = Addr.getOperand(i: 0);
    uint64_t C1 = Addr.getConstantOperandVal(i: 1);

    // Offsets in vaddr must be positive if range checking is enabled.
    //
    // The total computation of vaddr + soffset + offset must not overflow. If
    // vaddr is negative, even if offset is 0 the sgpr offset add will end up
    // overflowing.
    //
    // Prior to gfx9, MUBUF instructions with the vaddr offset enabled would
    // always perform a range check. If a negative vaddr base index was used,
    // this would fail the range check. The overall address computation would
    // compute a valid address, but this doesn't happen due to the range
    // check. For out-of-bounds MUBUF loads, a 0 is returned.
    //
    // Therefore it should be safe to fold any VGPR offset on gfx9 into the
    // MUBUF vaddr, but not on older subtargets which can only do this if the
    // sign bit is known 0.
    const SIInstrInfo *TII = Subtarget->getInstrInfo();
    if (TII->isLegalMUBUFImmOffset(Imm: C1) &&
        (!Subtarget->privateMemoryResourceIsRangeChecked() ||
         CurDAG->SignBitIsZero(Op: N0))) {
      std::tie(args&: VAddr, args&: SOffset) = foldFrameIndex(N: N0);
      ImmOffset = CurDAG->getTargetConstant(Val: C1, DL, VT: MVT::i32);
      return true;
    }
  }

  // (node)
  std::tie(args&: VAddr, args&: SOffset) = foldFrameIndex(N: Addr);
  ImmOffset = CurDAG->getTargetConstant(Val: 0, DL, VT: MVT::i32);
  return true;
}
1728
1729static bool IsCopyFromSGPR(const SIRegisterInfo &TRI, SDValue Val) {
1730 if (Val.getOpcode() != ISD::CopyFromReg)
1731 return false;
1732 auto Reg = cast<RegisterSDNode>(Val: Val.getOperand(i: 1))->getReg();
1733 if (!Reg.isPhysical())
1734 return false;
1735 const auto *RC = TRI.getPhysRegBaseClass(Reg);
1736 return RC && TRI.isSGPRClass(RC);
1737}
1738
// Match a scratch (private) address that needs no VGPR address component:
// either a uniform SGPR address, (SGPR + legal constant), or a plain legal
// constant. Produces the scratch resource descriptor, an SGPR offset and an
// immediate offset.
bool AMDGPUDAGToDAGISel::SelectMUBUFScratchOffset(SDNode *Parent,
                                                  SDValue Addr,
                                                  SDValue &SRsrc,
                                                  SDValue &SOffset,
                                                  SDValue &Offset) const {
  const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
  const SIInstrInfo *TII = Subtarget->getInstrInfo();
  MachineFunction &MF = CurDAG->getMachineFunction();
  const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
  SDLoc DL(Addr);

  // CopyFromReg <sgpr>
  if (IsCopyFromSGPR(TRI: *TRI, Val: Addr)) {
    // Whole address is uniform: use it directly as soffset with a zero
    // immediate offset.
    SRsrc = CurDAG->getRegister(Reg: Info->getScratchRSrcReg(), VT: MVT::v4i32);
    SOffset = Addr;
    Offset = CurDAG->getTargetConstant(Val: 0, DL, VT: MVT::i32);
    return true;
  }

  ConstantSDNode *CAddr;
  if (Addr.getOpcode() == ISD::ADD) {
    // Add (CopyFromReg <sgpr>) <constant>
    // Both the constant must be encodable and the base must be uniform.
    CAddr = dyn_cast<ConstantSDNode>(Val: Addr.getOperand(i: 1));
    if (!CAddr || !TII->isLegalMUBUFImmOffset(Imm: CAddr->getZExtValue()))
      return false;
    if (!IsCopyFromSGPR(TRI: *TRI, Val: Addr.getOperand(i: 0)))
      return false;

    SOffset = Addr.getOperand(i: 0);
  } else if ((CAddr = dyn_cast<ConstantSDNode>(Val&: Addr)) &&
             TII->isLegalMUBUFImmOffset(Imm: CAddr->getZExtValue())) {
    // <constant>
    SOffset = CurDAG->getTargetConstant(Val: 0, DL, VT: MVT::i32);
  } else {
    return false;
  }

  SRsrc = CurDAG->getRegister(Reg: Info->getScratchRSrcReg(), VT: MVT::v4i32);

  // In both remaining cases CAddr holds the (legal) constant part.
  Offset = CurDAG->getTargetConstant(Val: CAddr->getZExtValue(), DL, VT: MVT::i32);
  return true;
}
1781
// Match a MUBUF access that uses neither offen, idxen nor addr64 addressing,
// synthesizing a default resource descriptor from the pointer returned by
// SelectMUBUF. Fails if any VGPR addressing mode was selected.
bool AMDGPUDAGToDAGISel::SelectMUBUFOffset(SDValue Addr, SDValue &SRsrc,
                                           SDValue &SOffset, SDValue &Offset
                                           ) const {
  SDValue Ptr, VAddr, Offen, Idxen, Addr64;
  const SIInstrInfo *TII = Subtarget->getInstrInfo();

  if (!SelectMUBUF(Addr, Ptr, VAddr, SOffset, Offset, Offen, Idxen, Addr64))
    return false;

  // Only usable when none of the VGPR addressing modes were requested.
  if (!cast<ConstantSDNode>(Val&: Offen)->getSExtValue() &&
      !cast<ConstantSDNode>(Val&: Idxen)->getSExtValue() &&
      !cast<ConstantSDNode>(Val&: Addr64)->getSExtValue()) {
    // Resource words 2-3: default data format plus a maximal (2^32 - 1)
    // buffer size.
    uint64_t Rsrc = TII->getDefaultRsrcDataFormat() |
                    maskTrailingOnes<uint64_t>(N: 32); // Size
    SDLoc DL(Addr);

    const SITargetLowering& Lowering =
        *static_cast<const SITargetLowering*>(getTargetLowering());

    // Build the full 128-bit rsrc from the base pointer plus the constants.
    SRsrc = SDValue(Lowering.buildRSRC(DAG&: *CurDAG, DL, Ptr, RsrcDword1: 0, RsrcDword2And3: Rsrc), 0);
    return true;
  }
  return false;
}
1806
1807bool AMDGPUDAGToDAGISel::SelectBUFSOffset(SDValue ByteOffsetNode,
1808 SDValue &SOffset) const {
1809 if (Subtarget->hasRestrictedSOffset() && isNullConstant(V: ByteOffsetNode)) {
1810 SOffset = CurDAG->getRegister(Reg: AMDGPU::SGPR_NULL, VT: MVT::i32);
1811 return true;
1812 }
1813
1814 SOffset = ByteOffsetNode;
1815 return true;
1816}
1817
1818// Find a load or store from corresponding pattern root.
1819// Roots may be build_vector, bitconvert or their combinations.
1820static MemSDNode* findMemSDNode(SDNode *N) {
1821 N = AMDGPUTargetLowering::stripBitcast(Val: SDValue(N,0)).getNode();
1822 if (MemSDNode *MN = dyn_cast<MemSDNode>(Val: N))
1823 return MN;
1824 assert(isa<BuildVectorSDNode>(N));
1825 for (SDValue V : N->op_values())
1826 if (MemSDNode *MN =
1827 dyn_cast<MemSDNode>(Val: AMDGPUTargetLowering::stripBitcast(Val: V)))
1828 return MN;
1829 llvm_unreachable("cannot find MemSDNode in the pattern!");
1830}
1831
// Match a flat-family (flat/global/scratch) address as vaddr plus a legal
// signed immediate offset. Never fails: at worst the whole address becomes
// vaddr with a zero offset. \p FlatVariant selects which instruction
// family's offset-encoding rules apply.
bool AMDGPUDAGToDAGISel::SelectFlatOffsetImpl(SDNode *N, SDValue Addr,
                                              SDValue &VAddr, SDValue &Offset,
                                              uint64_t FlatVariant) const {
  int64_t OffsetVal = 0;

  unsigned AS = findMemSDNode(N)->getAddressSpace();

  // On affected subtargets, plain FLAT accesses to flat/global address
  // spaces cannot use the instruction offset field at all.
  bool CanHaveFlatSegmentOffsetBug =
      Subtarget->hasFlatSegmentOffsetBug() &&
      FlatVariant == SIInstrFlags::FLAT &&
      (AS == AMDGPUAS::FLAT_ADDRESS || AS == AMDGPUAS::GLOBAL_ADDRESS);

  if (Subtarget->hasFlatInstOffsets() && !CanHaveFlatSegmentOffsetBug) {
    SDValue N0, N1;
    if (isBaseWithConstantOffset64(Addr, LHS&: N0, RHS&: N1) &&
        (FlatVariant != SIInstrFlags::FlatScratch ||
         isFlatScratchBaseLegal(Addr))) {
      int64_t COffsetVal = cast<ConstantSDNode>(Val&: N1)->getSExtValue();

      // Adding the offset to the base address in a FLAT instruction must not
      // change the memory aperture in which the address falls. Therefore we can
      // only fold offsets from inbounds GEPs into FLAT instructions.
      bool IsInBounds =
          Addr.getOpcode() == ISD::PTRADD && Addr->getFlags().hasInBounds();
      if (COffsetVal == 0 || FlatVariant != SIInstrFlags::FLAT || IsInBounds) {
        const SIInstrInfo *TII = Subtarget->getInstrInfo();
        if (TII->isLegalFLATOffset(Offset: COffsetVal, AddrSpace: AS, FlatVariant)) {
          // The whole constant fits the encoded offset field.
          Addr = N0;
          OffsetVal = COffsetVal;
        } else {
          // If the offset doesn't fit, put the low bits into the offset field
          // and add the rest.
          //
          // For a FLAT instruction the hardware decides whether to access
          // global/scratch/shared memory based on the high bits of vaddr,
          // ignoring the offset field, so we have to ensure that when we add
          // remainder to vaddr it still points into the same underlying object.
          // The easiest way to do that is to make sure that we split the offset
          // into two pieces that are both >= 0 or both <= 0.

          SDLoc DL(N);
          uint64_t RemainderOffset;

          std::tie(args&: OffsetVal, args&: RemainderOffset) =
              TII->splitFlatOffset(COffsetVal, AddrSpace: AS, FlatVariant);

          SDValue AddOffsetLo =
              getMaterializedScalarImm32(Val: Lo_32(Value: RemainderOffset), DL);
          SDValue Clamp = CurDAG->getTargetConstant(Val: 0, DL, VT: MVT::i1);

          if (Addr.getValueType().getSizeInBits() == 32) {
            // 32-bit (scratch) address: one VALU add folds in the remainder.
            SmallVector<SDValue, 3> Opnds;
            Opnds.push_back(Elt: N0);
            Opnds.push_back(Elt: AddOffsetLo);
            unsigned AddOp = AMDGPU::V_ADD_CO_U32_e32;
            if (Subtarget->hasAddNoCarryInsts()) {
              AddOp = AMDGPU::V_ADD_U32_e64;
              Opnds.push_back(Elt: Clamp);
            }
            Addr =
                SDValue(CurDAG->getMachineNode(Opcode: AddOp, dl: DL, VT: MVT::i32, Ops: Opnds), 0);
          } else {
            // TODO: Should this try to use a scalar add pseudo if the base
            // address is uniform and saddr is usable?
            // 64-bit address: add the remainder via a low add-with-carry-out
            // plus a high add-with-carry-in, then reassemble the 64-bit vaddr
            // with REG_SEQUENCE.
            SDValue Sub0 =
                CurDAG->getTargetConstant(Val: AMDGPU::sub0, DL, VT: MVT::i32);
            SDValue Sub1 =
                CurDAG->getTargetConstant(Val: AMDGPU::sub1, DL, VT: MVT::i32);

            SDNode *N0Lo = CurDAG->getMachineNode(Opcode: TargetOpcode::EXTRACT_SUBREG,
                                                  dl: DL, VT: MVT::i32, Op1: N0, Op2: Sub0);
            SDNode *N0Hi = CurDAG->getMachineNode(Opcode: TargetOpcode::EXTRACT_SUBREG,
                                                  dl: DL, VT: MVT::i32, Op1: N0, Op2: Sub1);

            SDValue AddOffsetHi =
                getMaterializedScalarImm32(Val: Hi_32(Value: RemainderOffset), DL);

            SDVTList VTs = CurDAG->getVTList(VT1: MVT::i32, VT2: MVT::i1);

            SDNode *Add =
                CurDAG->getMachineNode(Opcode: AMDGPU::V_ADD_CO_U32_e64, dl: DL, VTs,
                                       Ops: {AddOffsetLo, SDValue(N0Lo, 0), Clamp});

            SDNode *Addc = CurDAG->getMachineNode(
                Opcode: AMDGPU::V_ADDC_U32_e64, dl: DL, VTs,
                Ops: {AddOffsetHi, SDValue(N0Hi, 0), SDValue(Add, 1), Clamp});

            SDValue RegSequenceArgs[] = {
                CurDAG->getTargetConstant(Val: AMDGPU::VReg_64RegClassID, DL,
                                          VT: MVT::i32),
                SDValue(Add, 0), Sub0, SDValue(Addc, 0), Sub1};

            Addr = SDValue(CurDAG->getMachineNode(Opcode: AMDGPU::REG_SEQUENCE, dl: DL,
                                                  VT: MVT::i64, Ops: RegSequenceArgs),
                           0);
          }
        }
      }
    }
  }

  VAddr = Addr;
  Offset = CurDAG->getSignedTargetConstant(Val: OffsetVal, DL: SDLoc(), VT: MVT::i32);
  return true;
}
1937
1938bool AMDGPUDAGToDAGISel::SelectFlatOffset(SDNode *N, SDValue Addr,
1939 SDValue &VAddr,
1940 SDValue &Offset) const {
1941 return SelectFlatOffsetImpl(N, Addr, VAddr, Offset, FlatVariant: SIInstrFlags::FLAT);
1942}
1943
1944bool AMDGPUDAGToDAGISel::SelectGlobalOffset(SDNode *N, SDValue Addr,
1945 SDValue &VAddr,
1946 SDValue &Offset) const {
1947 return SelectFlatOffsetImpl(N, Addr, VAddr, Offset, FlatVariant: SIInstrFlags::FlatGlobal);
1948}
1949
1950bool AMDGPUDAGToDAGISel::SelectScratchOffset(SDNode *N, SDValue Addr,
1951 SDValue &VAddr,
1952 SDValue &Offset) const {
1953 return SelectFlatOffsetImpl(N, Addr, VAddr, Offset,
1954 FlatVariant: SIInstrFlags::FlatScratch);
1955}
1956
1957// If this matches *_extend i32:x, return x
1958// Otherwise if the value is I32 returns x.
1959static SDValue matchExtFromI32orI32(SDValue Op, bool IsSigned,
1960 const SelectionDAG *DAG) {
1961 if (Op.getValueType() == MVT::i32)
1962 return Op;
1963
1964 if (Op.getOpcode() != (IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND) &&
1965 Op.getOpcode() != ISD::ANY_EXTEND &&
1966 !(DAG->SignBitIsZero(Op) &&
1967 Op.getOpcode() == (IsSigned ? ISD::ZERO_EXTEND : ISD::SIGN_EXTEND)))
1968 return SDValue();
1969
1970 SDValue ExtSrc = Op.getOperand(i: 0);
1971 return (ExtSrc.getValueType() == MVT::i32) ? ExtSrc : SDValue();
1972}
1973
// Match (64-bit SGPR base) + (zext vgpr offset) + sext(imm offset)
// or (64-bit SGPR base) + (sext vgpr offset) + sext(imm offset)
// Outputs: SAddr (uniform 64-bit base), VOffset (32-bit VGPR offset),
// Offset (immediate), and ScaleOffset (whether the hardware should scale
// VOffset by the access size).
bool AMDGPUDAGToDAGISel::SelectGlobalSAddr(SDNode *N, SDValue Addr,
                                           SDValue &SAddr, SDValue &VOffset,
                                           SDValue &Offset, bool &ScaleOffset,
                                           bool NeedIOffset) const {
  int64_t ImmOffset = 0;
  ScaleOffset = false;

  // Match the immediate offset first, which canonically is moved as low as
  // possible.

  SDValue LHS, RHS;
  if (isBaseWithConstantOffset64(Addr, LHS, RHS)) {
    int64_t COffsetVal = cast<ConstantSDNode>(Val&: RHS)->getSExtValue();
    const SIInstrInfo *TII = Subtarget->getInstrInfo();

    if (NeedIOffset &&
        TII->isLegalFLATOffset(Offset: COffsetVal, AddrSpace: AMDGPUAS::GLOBAL_ADDRESS,
                               FlatVariant: SIInstrFlags::FlatGlobal)) {
      // Constant fits the instruction's immediate field: strip it off.
      Addr = LHS;
      ImmOffset = COffsetVal;
    } else if (!LHS->isDivergent()) {
      if (COffsetVal > 0) {
        SDLoc SL(N);
        // saddr + large_offset -> saddr +
        //                         (voffset = large_offset & ~MaxOffset) +
        //                         (large_offset & MaxOffset);
        int64_t SplitImmOffset = 0, RemainderOffset = COffsetVal;
        if (NeedIOffset) {
          std::tie(args&: SplitImmOffset, args&: RemainderOffset) = TII->splitFlatOffset(
              COffsetVal, AddrSpace: AMDGPUAS::GLOBAL_ADDRESS, FlatVariant: SIInstrFlags::FlatGlobal);
        }

        if (Subtarget->hasSignedGVSOffset() ? isInt<32>(x: RemainderOffset)
                                            : isUInt<32>(x: RemainderOffset)) {
          // Materialize the remainder into a VGPR as the variable offset.
          SDNode *VMov = CurDAG->getMachineNode(
              Opcode: AMDGPU::V_MOV_B32_e32, dl: SL, VT: MVT::i32,
              Op1: CurDAG->getTargetConstant(Val: RemainderOffset, DL: SDLoc(), VT: MVT::i32));
          VOffset = SDValue(VMov, 0);
          SAddr = LHS;
          Offset = CurDAG->getTargetConstant(Val: SplitImmOffset, DL: SDLoc(), VT: MVT::i32);
          return true;
        }
      }

      // We are adding a 64 bit SGPR and a constant. If constant bus limit
      // is 1 we would need to perform 1 or 2 extra moves for each half of
      // the constant and it is better to do a scalar add and then issue a
      // single VALU instruction to materialize zero. Otherwise it is less
      // instructions to perform VALU adds with immediates or inline literals.
      unsigned NumLiterals =
          !TII->isInlineConstant(Imm: APInt(32, Lo_32(Value: COffsetVal))) +
          !TII->isInlineConstant(Imm: APInt(32, Hi_32(Value: COffsetVal)));
      if (Subtarget->getConstantBusLimit(Opcode: AMDGPU::V_ADD_U32_e64) > NumLiterals)
        return false;
    }
  }

  // Match the variable offset.
  if (Addr->isAnyAdd()) {
    LHS = Addr.getOperand(i: 0);

    if (!LHS->isDivergent()) {
      // add (i64 sgpr), (*_extend (i32 vgpr))
      RHS = Addr.getOperand(i: 1);
      // NOTE(review): if this sets ScaleOffset (and pre-scales RHS) but the
      // ext match below fails, ScaleOffset may remain true on the fallthrough
      // paths — presumably benign for current callers; confirm.
      ScaleOffset = SelectScaleOffset(N, Offset&: RHS, IsSigned: Subtarget->hasSignedGVSOffset());
      if (SDValue ExtRHS = matchExtFromI32orI32(
              Op: RHS, IsSigned: Subtarget->hasSignedGVSOffset(), DAG: CurDAG)) {
        SAddr = LHS;
        VOffset = ExtRHS;
      }
    }

    RHS = Addr.getOperand(i: 1);
    if (!SAddr && !RHS->isDivergent()) {
      // add (*_extend (i32 vgpr)), (i64 sgpr)
      ScaleOffset = SelectScaleOffset(N, Offset&: LHS, IsSigned: Subtarget->hasSignedGVSOffset());
      if (SDValue ExtLHS = matchExtFromI32orI32(
              Op: LHS, IsSigned: Subtarget->hasSignedGVSOffset(), DAG: CurDAG)) {
        SAddr = RHS;
        VOffset = ExtLHS;
      }
    }

    if (SAddr) {
      Offset = CurDAG->getSignedTargetConstant(Val: ImmOffset, DL: SDLoc(), VT: MVT::i32);
      return true;
    }
  }

  // A mad of a divergent i32 by the access size plus a uniform i64 can fold
  // the multiply into the hardware's scale-offset.
  if (Subtarget->hasScaleOffset() &&
      (Addr.getOpcode() == (Subtarget->hasSignedGVSOffset()
                                ? AMDGPUISD::MAD_I64_I32
                                : AMDGPUISD::MAD_U64_U32) ||
       (Addr.getOpcode() == AMDGPUISD::MAD_U64_U32 &&
        CurDAG->SignBitIsZero(Op: Addr.getOperand(i: 0)))) &&
      Addr.getOperand(i: 0)->isDivergent() &&
      isa<ConstantSDNode>(Val: Addr.getOperand(i: 1)) &&
      !Addr.getOperand(i: 2)->isDivergent()) {
    // mad_u64_u32 (i32 vgpr), (i32 c), (i64 sgpr)
    unsigned Size =
        (unsigned)cast<MemSDNode>(Val: N)->getMemoryVT().getFixedSizeInBits() / 8;
    ScaleOffset = Addr.getConstantOperandVal(i: 1) == Size;
    if (ScaleOffset) {
      SAddr = Addr.getOperand(i: 2);
      VOffset = Addr.getOperand(i: 0);
      Offset = CurDAG->getTargetConstant(Val: ImmOffset, DL: SDLoc(), VT: MVT::i32);
      return true;
    }
  }

  if (Addr->isDivergent() || Addr.getOpcode() == ISD::UNDEF ||
      isa<ConstantSDNode>(Val: Addr))
    return false;

  // It's cheaper to materialize a single 32-bit zero for vaddr than the two
  // moves required to copy a 64-bit SGPR to VGPR.
  SAddr = Addr;
  SDNode *VMov =
      CurDAG->getMachineNode(Opcode: AMDGPU::V_MOV_B32_e32, dl: SDLoc(Addr), VT: MVT::i32,
                             Op1: CurDAG->getTargetConstant(Val: 0, DL: SDLoc(), VT: MVT::i32));
  VOffset = SDValue(VMov, 0);
  Offset = CurDAG->getSignedTargetConstant(Val: ImmOffset, DL: SDLoc(), VT: MVT::i32);
  return true;
}
2100
2101bool AMDGPUDAGToDAGISel::SelectGlobalSAddr(SDNode *N, SDValue Addr,
2102 SDValue &SAddr, SDValue &VOffset,
2103 SDValue &Offset,
2104 SDValue &CPol) const {
2105 bool ScaleOffset;
2106 if (!SelectGlobalSAddr(N, Addr, SAddr, VOffset, Offset, ScaleOffset))
2107 return false;
2108
2109 CPol = CurDAG->getTargetConstant(Val: ScaleOffset ? AMDGPU::CPol::SCAL : 0,
2110 DL: SDLoc(), VT: MVT::i32);
2111 return true;
2112}
2113
2114bool AMDGPUDAGToDAGISel::SelectGlobalSAddrCPol(SDNode *N, SDValue Addr,
2115 SDValue &SAddr, SDValue &VOffset,
2116 SDValue &Offset,
2117 SDValue &CPol) const {
2118 bool ScaleOffset;
2119 if (!SelectGlobalSAddr(N, Addr, SAddr, VOffset, Offset, ScaleOffset))
2120 return false;
2121
2122 // We are assuming CPol is always the last operand of the intrinsic.
2123 auto PassedCPol =
2124 N->getConstantOperandVal(Num: N->getNumOperands() - 1) & ~AMDGPU::CPol::SCAL;
2125 CPol = CurDAG->getTargetConstant(
2126 Val: (ScaleOffset ? AMDGPU::CPol::SCAL : 0) | PassedCPol, DL: SDLoc(), VT: MVT::i32);
2127 return true;
2128}
2129
2130bool AMDGPUDAGToDAGISel::SelectGlobalSAddrCPolM0(SDNode *N, SDValue Addr,
2131 SDValue &SAddr,
2132 SDValue &VOffset,
2133 SDValue &Offset,
2134 SDValue &CPol) const {
2135 bool ScaleOffset;
2136 if (!SelectGlobalSAddr(N, Addr, SAddr, VOffset, Offset, ScaleOffset))
2137 return false;
2138
2139 // We are assuming CPol is second from last operand of the intrinsic.
2140 auto PassedCPol =
2141 N->getConstantOperandVal(Num: N->getNumOperands() - 2) & ~AMDGPU::CPol::SCAL;
2142 CPol = CurDAG->getTargetConstant(
2143 Val: (ScaleOffset ? AMDGPU::CPol::SCAL : 0) | PassedCPol, DL: SDLoc(), VT: MVT::i32);
2144 return true;
2145}
2146
2147bool AMDGPUDAGToDAGISel::SelectGlobalSAddrGLC(SDNode *N, SDValue Addr,
2148 SDValue &SAddr, SDValue &VOffset,
2149 SDValue &Offset,
2150 SDValue &CPol) const {
2151 bool ScaleOffset;
2152 if (!SelectGlobalSAddr(N, Addr, SAddr, VOffset, Offset, ScaleOffset))
2153 return false;
2154
2155 unsigned CPolVal = (ScaleOffset ? AMDGPU::CPol::SCAL : 0) | AMDGPU::CPol::GLC;
2156 CPol = CurDAG->getTargetConstant(Val: CPolVal, DL: SDLoc(), VT: MVT::i32);
2157 return true;
2158}
2159
2160bool AMDGPUDAGToDAGISel::SelectGlobalSAddrNoIOffset(SDNode *N, SDValue Addr,
2161 SDValue &SAddr,
2162 SDValue &VOffset,
2163 SDValue &CPol) const {
2164 bool ScaleOffset;
2165 SDValue DummyOffset;
2166 if (!SelectGlobalSAddr(N, Addr, SAddr, VOffset, Offset&: DummyOffset, ScaleOffset,
2167 NeedIOffset: false))
2168 return false;
2169
2170 // We are assuming CPol is always the last operand of the intrinsic.
2171 auto PassedCPol =
2172 N->getConstantOperandVal(Num: N->getNumOperands() - 1) & ~AMDGPU::CPol::SCAL;
2173 CPol = CurDAG->getTargetConstant(
2174 Val: (ScaleOffset ? AMDGPU::CPol::SCAL : 0) | PassedCPol, DL: SDLoc(), VT: MVT::i32);
2175 return true;
2176}
2177
2178bool AMDGPUDAGToDAGISel::SelectGlobalSAddrNoIOffsetM0(SDNode *N, SDValue Addr,
2179 SDValue &SAddr,
2180 SDValue &VOffset,
2181 SDValue &CPol) const {
2182 bool ScaleOffset;
2183 SDValue DummyOffset;
2184 if (!SelectGlobalSAddr(N, Addr, SAddr, VOffset, Offset&: DummyOffset, ScaleOffset,
2185 NeedIOffset: false))
2186 return false;
2187
2188 // We are assuming CPol is second from last operand of the intrinsic.
2189 auto PassedCPol =
2190 N->getConstantOperandVal(Num: N->getNumOperands() - 2) & ~AMDGPU::CPol::SCAL;
2191 CPol = CurDAG->getTargetConstant(
2192 Val: (ScaleOffset ? AMDGPU::CPol::SCAL : 0) | PassedCPol, DL: SDLoc(), VT: MVT::i32);
2193 return true;
2194}
2195
2196static SDValue SelectSAddrFI(SelectionDAG *CurDAG, SDValue SAddr) {
2197 if (auto *FI = dyn_cast<FrameIndexSDNode>(Val&: SAddr)) {
2198 SAddr = CurDAG->getTargetFrameIndex(FI: FI->getIndex(), VT: FI->getValueType(ResNo: 0));
2199 } else if (SAddr.getOpcode() == ISD::ADD &&
2200 isa<FrameIndexSDNode>(Val: SAddr.getOperand(i: 0))) {
2201 // Materialize this into a scalar move for scalar address to avoid
2202 // readfirstlane.
2203 auto *FI = cast<FrameIndexSDNode>(Val: SAddr.getOperand(i: 0));
2204 SDValue TFI = CurDAG->getTargetFrameIndex(FI: FI->getIndex(),
2205 VT: FI->getValueType(ResNo: 0));
2206 SAddr = SDValue(CurDAG->getMachineNode(Opcode: AMDGPU::S_ADD_I32, dl: SDLoc(SAddr),
2207 VT: MVT::i32, Op1: TFI, Op2: SAddr.getOperand(i: 1)),
2208 0);
2209 }
2210
2211 return SAddr;
2212}
2213
// Match (32-bit SGPR base) + sext(imm offset)
// Used for scratch accesses addressed purely through an SGPR. Never fails
// for a uniform address: an unencodable offset is split and folded into the
// base with a scalar add.
bool AMDGPUDAGToDAGISel::SelectScratchSAddr(SDNode *Parent, SDValue Addr,
                                            SDValue &SAddr,
                                            SDValue &Offset) const {
  // SADDR scratch addressing requires a uniform address.
  if (Addr->isDivergent())
    return false;

  SDLoc DL(Addr);

  int64_t COffsetVal = 0;

  // Peel off a constant offset when the base is known legal for scratch.
  if (CurDAG->isBaseWithConstantOffset(Op: Addr) && isFlatScratchBaseLegal(Addr)) {
    COffsetVal = cast<ConstantSDNode>(Val: Addr.getOperand(i: 1))->getSExtValue();
    SAddr = Addr.getOperand(i: 0);
  } else {
    SAddr = Addr;
  }

  // Fold a frame index (or FI + x) into target-frame-index form.
  SAddr = SelectSAddrFI(CurDAG, SAddr);

  const SIInstrInfo *TII = Subtarget->getInstrInfo();

  if (!TII->isLegalFLATOffset(Offset: COffsetVal, AddrSpace: AMDGPUAS::PRIVATE_ADDRESS,
                              FlatVariant: SIInstrFlags::FlatScratch)) {
    // Offset does not fit the encoding: keep only the encodable low part and
    // fold the remainder into the base with a scalar add.
    int64_t SplitImmOffset, RemainderOffset;
    std::tie(args&: SplitImmOffset, args&: RemainderOffset) = TII->splitFlatOffset(
        COffsetVal, AddrSpace: AMDGPUAS::PRIVATE_ADDRESS, FlatVariant: SIInstrFlags::FlatScratch);

    COffsetVal = SplitImmOffset;

    SDValue AddOffset =
        SAddr.getOpcode() == ISD::TargetFrameIndex
            ? getMaterializedScalarImm32(Val: Lo_32(Value: RemainderOffset), DL)
            : CurDAG->getSignedTargetConstant(Val: RemainderOffset, DL, VT: MVT::i32);
    SAddr = SDValue(CurDAG->getMachineNode(Opcode: AMDGPU::S_ADD_I32, dl: DL, VT: MVT::i32,
                                           Op1: SAddr, Op2: AddOffset),
                    0);
  }

  Offset = CurDAG->getSignedTargetConstant(Val: COffsetVal, DL, VT: MVT::i32);

  return true;
}
2257
// Check whether the flat scratch SVS swizzle bug affects this access.
// Returns true (access is affected) conservatively: if known-bits analysis
// cannot rule out a carry from bit 1 into bit 2, the fold is rejected.
bool AMDGPUDAGToDAGISel::checkFlatScratchSVSSwizzleBug(
    SDValue VAddr, SDValue SAddr, uint64_t ImmOffset) const {
  if (!Subtarget->hasFlatScratchSVSSwizzleBug())
    return false;

  // The bug affects the swizzling of SVS accesses if there is any carry out
  // from the two low order bits (i.e. from bit 1 into bit 2) when adding
  // voffset to (soffset + inst_offset).
  KnownBits VKnown = CurDAG->computeKnownBits(Op: VAddr);
  KnownBits SKnown =
      KnownBits::add(LHS: CurDAG->computeKnownBits(Op: SAddr),
                     RHS: KnownBits::makeConstant(C: APInt(32, ImmOffset,
                                                     /*isSigned=*/true)));
  uint64_t VMax = VKnown.getMaxValue().getZExtValue();
  uint64_t SMax = SKnown.getMaxValue().getZExtValue();
  // If the maximum possible low two bits of each side can sum past 4, a
  // carry into bit 2 is possible.
  return (VMax & 3) + (SMax & 3) >= 4;
}
2276
// Match a scratch address as (divergent VGPR offset) + (uniform SGPR base)
// + imm offset for flat scratch SVS addressing. Also sets CPol with the
// scale-offset bit when the VGPR offset is pre-scaled by the access size.
bool AMDGPUDAGToDAGISel::SelectScratchSVAddr(SDNode *N, SDValue Addr,
                                             SDValue &VAddr, SDValue &SAddr,
                                             SDValue &Offset,
                                             SDValue &CPol) const {
  int64_t ImmOffset = 0;

  SDValue LHS, RHS;
  SDValue OrigAddr = Addr;
  if (isBaseWithConstantOffset64(Addr, LHS, RHS)) {
    int64_t COffsetVal = cast<ConstantSDNode>(Val&: RHS)->getSExtValue();
    const SIInstrInfo *TII = Subtarget->getInstrInfo();

    if (TII->isLegalFLATOffset(Offset: COffsetVal, AddrSpace: AMDGPUAS::PRIVATE_ADDRESS,
                               FlatVariant: SIInstrFlags::FlatScratch)) {
      // The constant fits the instruction's offset field; strip it off.
      Addr = LHS;
      ImmOffset = COffsetVal;
    } else if (!LHS->isDivergent() && COffsetVal > 0) {
      SDLoc SL(N);
      // saddr + large_offset -> saddr + (vaddr = large_offset & ~MaxOffset) +
      //                         (large_offset & MaxOffset);
      int64_t SplitImmOffset, RemainderOffset;
      std::tie(args&: SplitImmOffset, args&: RemainderOffset) = TII->splitFlatOffset(
          COffsetVal, AddrSpace: AMDGPUAS::PRIVATE_ADDRESS, FlatVariant: SIInstrFlags::FlatScratch);

      if (isUInt<32>(x: RemainderOffset)) {
        // Materialize the remainder into a VGPR as the variable offset.
        SDNode *VMov = CurDAG->getMachineNode(
            Opcode: AMDGPU::V_MOV_B32_e32, dl: SL, VT: MVT::i32,
            Op1: CurDAG->getTargetConstant(Val: RemainderOffset, DL: SDLoc(), VT: MVT::i32));
        VAddr = SDValue(VMov, 0);
        SAddr = LHS;
        if (!isFlatScratchBaseLegal(Addr))
          return false;
        if (checkFlatScratchSVSSwizzleBug(VAddr, SAddr, ImmOffset: SplitImmOffset))
          return false;
        Offset = CurDAG->getTargetConstant(Val: SplitImmOffset, DL: SDLoc(), VT: MVT::i32);
        CPol = CurDAG->getTargetConstant(Val: 0, DL: SDLoc(), VT: MVT::i32);
        return true;
      }
    }
  }

  // The remaining address must be an add of a uniform and a divergent part.
  if (Addr.getOpcode() != ISD::ADD)
    return false;

  LHS = Addr.getOperand(i: 0);
  RHS = Addr.getOperand(i: 1);

  if (!LHS->isDivergent() && RHS->isDivergent()) {
    SAddr = LHS;
    VAddr = RHS;
  } else if (!RHS->isDivergent() && LHS->isDivergent()) {
    SAddr = RHS;
    VAddr = LHS;
  } else {
    return false;
  }

  // Legality of the base depends on whether an immediate was peeled off
  // above (SV + imm form) or not (pure SV form).
  if (OrigAddr != Addr) {
    if (!isFlatScratchBaseLegalSVImm(Addr: OrigAddr))
      return false;
  } else {
    if (!isFlatScratchBaseLegalSV(Addr: OrigAddr))
      return false;
  }

  if (checkFlatScratchSVSSwizzleBug(VAddr, SAddr, ImmOffset))
    return false;
  SAddr = SelectSAddrFI(CurDAG, SAddr);
  Offset = CurDAG->getSignedTargetConstant(Val: ImmOffset, DL: SDLoc(), VT: MVT::i32);

  // Fold a multiply/shift by the access size into the scale-offset bit.
  bool ScaleOffset = SelectScaleOffset(N, Offset&: VAddr, IsSigned: true /* IsSigned */);
  CPol = CurDAG->getTargetConstant(Val: ScaleOffset ? AMDGPU::CPol::SCAL : 0,
                                   DL: SDLoc(), VT: MVT::i32);
  return true;
}
2352
2353// For unbuffered smem loads, it is illegal for the Immediate Offset to be
2354// negative if the resulting (Offset + (M0 or SOffset or zero) is negative.
2355// Handle the case where the Immediate Offset + SOffset is negative.
2356bool AMDGPUDAGToDAGISel::isSOffsetLegalWithImmOffset(SDValue *SOffset,
2357 bool Imm32Only,
2358 bool IsBuffer,
2359 int64_t ImmOffset) const {
2360 if (!IsBuffer && !Imm32Only && ImmOffset < 0 &&
2361 AMDGPU::hasSMRDSignedImmOffset(ST: *Subtarget)) {
2362 KnownBits SKnown = CurDAG->computeKnownBits(Op: *SOffset);
2363 if (ImmOffset + SKnown.getMinValue().getSExtValue() < 0)
2364 return false;
2365 }
2366
2367 return true;
2368}
2369
// Given \p Offset and load node \p N check if an \p Offset is a multiple of
// the load byte size. If it is update \p Offset to a pre-scaled value and
// return true.
bool AMDGPUDAGToDAGISel::SelectScaleOffset(SDNode *N, SDValue &Offset,
                                           bool IsSigned) const {
  bool ScaleOffset = false;
  if (!Subtarget->hasScaleOffset() || !Offset)
    return false;

  // Access size in bytes, from the memory operand of the load/store.
  unsigned Size =
      (unsigned)cast<MemSDNode>(Val: N)->getMemoryVT().getFixedSizeInBits() / 8;

  // Look through a matching sign/zero extension to find the shift.
  SDValue Off = Offset;
  if (SDValue Ext = matchExtFromI32orI32(Op: Offset, IsSigned, DAG: CurDAG))
    Off = Ext;

  if (isPowerOf2_32(Value: Size) && Off.getOpcode() == ISD::SHL) {
    // (shl x, log2(Size)) scales x by the access size.
    if (auto *C = dyn_cast<ConstantSDNode>(Val: Off.getOperand(i: 1)))
      ScaleOffset = C->getZExtValue() == Log2_32(Value: Size);
  } else if (Offset.getOpcode() == ISD::MUL ||
             (IsSigned && Offset.getOpcode() == AMDGPUISD::MUL_I24) ||
             Offset.getOpcode() == AMDGPUISD::MUL_U24 ||
             (Offset.isMachineOpcode() &&
              Offset.getMachineOpcode() ==
                  (IsSigned ? AMDGPU::S_MUL_I64_I32_PSEUDO
                            : AMDGPU::S_MUL_U64_U32_PSEUDO))) {
    // (mul x, Size) also scales x by the access size. Note this checks the
    // original (unpeeled) Offset node, not Off.
    if (auto *C = dyn_cast<ConstantSDNode>(Val: Offset.getOperand(i: 1)))
      ScaleOffset = C->getZExtValue() == Size;
  }

  if (ScaleOffset)
    Offset = Off.getOperand(i: 0);

  return ScaleOffset;
}
2405
// Match an immediate (if Offset is not null) or an SGPR (if SOffset is
// not null) offset. If Imm32Only is true, match only 32-bit immediate
// offsets available on CI.
bool AMDGPUDAGToDAGISel::SelectSMRDOffset(SDNode *N, SDValue ByteOffsetNode,
                                          SDValue *SOffset, SDValue *Offset,
                                          bool Imm32Only, bool IsBuffer,
                                          bool HasSOffset, int64_t ImmOffset,
                                          bool *ScaleOffset) const {
  assert((!SOffset || !Offset) &&
         "Cannot match both soffset and offset at the same time!");

  if (ScaleOffset) {
    assert(N && SOffset);

    // Try to strip a multiply/shift by the access size from the offset.
    *ScaleOffset = SelectScaleOffset(N, Offset&: ByteOffsetNode, IsSigned: false /* IsSigned */);
  }

  ConstantSDNode *C = dyn_cast<ConstantSDNode>(Val&: ByteOffsetNode);
  if (!C) {
    // Non-constant: only matchable as an SGPR soffset.
    if (!SOffset)
      return false;

    if (ByteOffsetNode.getValueType().isScalarInteger() &&
        ByteOffsetNode.getValueType().getSizeInBits() == 32) {
      *SOffset = ByteOffsetNode;
      return isSOffsetLegalWithImmOffset(SOffset, Imm32Only, IsBuffer,
                                         ImmOffset);
    }
    // Look through a zero-extension of a 32-bit value.
    if (ByteOffsetNode.getOpcode() == ISD::ZERO_EXTEND) {
      if (ByteOffsetNode.getOperand(i: 0).getValueType().getSizeInBits() == 32) {
        *SOffset = ByteOffsetNode.getOperand(i: 0);
        return isSOffsetLegalWithImmOffset(SOffset, Imm32Only, IsBuffer,
                                           ImmOffset);
      }
    }
    return false;
  }

  SDLoc SL(ByteOffsetNode);

  // GFX9 and GFX10 have signed byte immediate offsets. The immediate
  // offset for S_BUFFER instructions is unsigned.
  int64_t ByteOffset = IsBuffer ? C->getZExtValue() : C->getSExtValue();
  std::optional<int64_t> EncodedOffset = AMDGPU::getSMRDEncodedOffset(
      ST: *Subtarget, ByteOffset, IsBuffer, HasSOffset);
  if (EncodedOffset && Offset && !Imm32Only) {
    *Offset = CurDAG->getSignedTargetConstant(Val: *EncodedOffset, DL: SL, VT: MVT::i32);
    return true;
  }

  // SGPR and literal offsets are unsigned.
  if (ByteOffset < 0)
    return false;

  // Try the CI-only 32-bit literal encoding.
  EncodedOffset = AMDGPU::getSMRDEncodedLiteralOffset32(ST: *Subtarget, ByteOffset);
  if (EncodedOffset && Offset && Imm32Only) {
    *Offset = CurDAG->getTargetConstant(Val: *EncodedOffset, DL: SL, VT: MVT::i32);
    return true;
  }

  if (!isUInt<32>(x: ByteOffset) && !isInt<32>(x: ByteOffset))
    return false;

  if (SOffset) {
    // Last resort: materialize the constant into an SGPR for soffset.
    SDValue C32Bit = CurDAG->getTargetConstant(Val: ByteOffset, DL: SL, VT: MVT::i32);
    *SOffset = SDValue(
        CurDAG->getMachineNode(Opcode: AMDGPU::S_MOV_B32, dl: SL, VT: MVT::i32, Op1: C32Bit), 0);
    return true;
  }

  return false;
}
2478
// Widen a 32-bit SMRD address to 64 bits by pairing it with the function's
// configured high address bits via REG_SEQUENCE. Addresses that are not
// i32 are returned unchanged.
SDValue AMDGPUDAGToDAGISel::Expand32BitAddress(SDValue Addr) const {
  if (Addr.getValueType() != MVT::i32)
    return Addr;

  // Zero-extend a 32-bit address.
  SDLoc SL(Addr);

  const MachineFunction &MF = CurDAG->getMachineFunction();
  const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
  unsigned AddrHiVal = Info->get32BitAddressHighBits();
  SDValue AddrHi = CurDAG->getTargetConstant(Val: AddrHiVal, DL: SL, VT: MVT::i32);

  // REG_SEQUENCE operands: class id, low half (Addr), sub0, high half
  // (materialized constant), sub1.
  const SDValue Ops[] = {
    CurDAG->getTargetConstant(Val: AMDGPU::SReg_64_XEXECRegClassID, DL: SL, VT: MVT::i32),
    Addr,
    CurDAG->getTargetConstant(Val: AMDGPU::sub0, DL: SL, VT: MVT::i32),
    SDValue(CurDAG->getMachineNode(Opcode: AMDGPU::S_MOV_B32, dl: SL, VT: MVT::i32, Op1: AddrHi),
            0),
    CurDAG->getTargetConstant(Val: AMDGPU::sub1, DL: SL, VT: MVT::i32),
  };

  return SDValue(CurDAG->getMachineNode(Opcode: AMDGPU::REG_SEQUENCE, dl: SL, VT: MVT::i64,
                                        Ops), 0);
}
2503
// Match a base and an immediate (if Offset is not null) or an SGPR (if
// SOffset is not null) or an immediate+SGPR offset. If Imm32Only is
// true, match only 32-bit immediate offsets available on CI.
bool AMDGPUDAGToDAGISel::SelectSMRDBaseOffset(SDNode *N, SDValue Addr,
                                              SDValue &SBase, SDValue *SOffset,
                                              SDValue *Offset, bool Imm32Only,
                                              bool IsBuffer, bool HasSOffset,
                                              int64_t ImmOffset,
                                              bool *ScaleOffset) const {
  if (SOffset && Offset) {
    assert(!Imm32Only && !IsBuffer);
    SDValue B;

    // Matching both: first peel off the immediate, then recurse to match the
    // SGPR part of the remaining base.
    if (!SelectSMRDBaseOffset(N, Addr, SBase&: B, SOffset: nullptr, Offset, Imm32Only: false, IsBuffer: false, HasSOffset: true))
      return false;

    int64_t ImmOff = 0;
    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Val&: *Offset))
      ImmOff = C->getSExtValue();

    return SelectSMRDBaseOffset(N, Addr: B, SBase, SOffset, Offset: nullptr, Imm32Only: false, IsBuffer: false,
                                HasSOffset: true, ImmOffset: ImmOff, ScaleOffset);
  }

  // A 32-bit (address + offset) should not cause unsigned 32-bit integer
  // wraparound, because s_load instructions perform the addition in 64 bits.
  if (Addr.getValueType() == MVT::i32 && Addr.getOpcode() == ISD::ADD &&
      !Addr->getFlags().hasNoUnsignedWrap())
    return false;

  SDValue N0, N1;
  // Extract the base and offset if possible.
  if (Addr->isAnyAdd() || CurDAG->isADDLike(Op: Addr)) {
    N0 = Addr.getOperand(i: 0);
    N1 = Addr.getOperand(i: 1);
  } else if (getBaseWithOffsetUsingSplitOR(DAG&: *CurDAG, Addr, N0, N1)) {
    assert(N0 && N1 && isa<ConstantSDNode>(N1));
  }
  if (!N0 || !N1)
    return false;

  // Try the offset in either operand position.
  if (SelectSMRDOffset(N, ByteOffsetNode: N1, SOffset, Offset, Imm32Only, IsBuffer, HasSOffset,
                       ImmOffset, ScaleOffset)) {
    SBase = N0;
    return true;
  }
  if (SelectSMRDOffset(N, ByteOffsetNode: N0, SOffset, Offset, Imm32Only, IsBuffer, HasSOffset,
                       ImmOffset, ScaleOffset)) {
    SBase = N1;
    return true;
  }
  return false;
}
2557
// Match Addr for a scalar-memory (SMRD) access as a base plus optional
// register (SOffset) and/or immediate (Offset) offset components. On success
// a 32-bit base is expanded to the 64-bit form via Expand32BitAddress.
bool AMDGPUDAGToDAGISel::SelectSMRD(SDNode *N, SDValue Addr, SDValue &SBase,
                                    SDValue *SOffset, SDValue *Offset,
                                    bool Imm32Only, bool *ScaleOffset) const {
  if (SelectSMRDBaseOffset(N, Addr, SBase, SOffset, Offset, Imm32Only,
                           /* IsBuffer */ false, /* HasSOffset */ false,
                           /* ImmOffset */ 0, ScaleOffset)) {
    SBase = Expand32BitAddress(Addr: SBase);
    return true;
  }

  // Fallback: treat a bare 32-bit address as base + zero immediate offset.
  // Only valid for the forms that take an immediate offset operand and no
  // soffset register.
  if (Addr.getValueType() == MVT::i32 && Offset && !SOffset) {
    SBase = Expand32BitAddress(Addr);
    *Offset = CurDAG->getTargetConstant(Val: 0, DL: SDLoc(Addr), VT: MVT::i32);
    return true;
  }

  return false;
}
2576
// Match Addr as an SMRD base register plus an immediate offset (no soffset
// register form). No node context is required for this pattern.
bool AMDGPUDAGToDAGISel::SelectSMRDImm(SDValue Addr, SDValue &SBase,
                                       SDValue &Offset) const {
  return SelectSMRD(/* N */ nullptr, Addr, SBase, /* SOffset */ nullptr,
                    Offset: &Offset);
}
2582
// Match Addr as an SMRD base plus a 32-bit immediate offset (Imm32Only form).
// The assert documents that this addressing form is only used on Sea Islands.
bool AMDGPUDAGToDAGISel::SelectSMRDImm32(SDValue Addr, SDValue &SBase,
                                         SDValue &Offset) const {
  assert(Subtarget->getGeneration() == AMDGPUSubtarget::SEA_ISLANDS);
  return SelectSMRD(/* N */ nullptr, Addr, SBase, /* SOffset */ nullptr,
                    Offset: &Offset, /* Imm32Only */ true);
}
2589
// Match Addr as an SMRD base plus a register (SGPR) offset. On success, CPol
// carries the SCAL bit when SelectSMRD matched a scaled-offset form.
bool AMDGPUDAGToDAGISel::SelectSMRDSgpr(SDNode *N, SDValue Addr, SDValue &SBase,
                                        SDValue &SOffset, SDValue &CPol) const {
  bool ScaleOffset;
  if (!SelectSMRD(N, Addr, SBase, SOffset: &SOffset, /* Offset */ nullptr,
                  /* Imm32Only */ false, ScaleOffset: &ScaleOffset))
    return false;

  // Encode the offset-scaling decision in the cache-policy operand.
  CPol = CurDAG->getTargetConstant(Val: ScaleOffset ? AMDGPU::CPol::SCAL : 0,
                                   DL: SDLoc(N), VT: MVT::i32);
  return true;
}
2601
// Match Addr as an SMRD base plus both a register (SGPR) offset and an
// immediate offset. CPol carries the SCAL bit when a scaled offset matched.
bool AMDGPUDAGToDAGISel::SelectSMRDSgprImm(SDNode *N, SDValue Addr,
                                           SDValue &SBase, SDValue &SOffset,
                                           SDValue &Offset,
                                           SDValue &CPol) const {
  bool ScaleOffset;
  if (!SelectSMRD(N, Addr, SBase, SOffset: &SOffset, Offset: &Offset, Imm32Only: false, ScaleOffset: &ScaleOffset))
    return false;

  // Encode the offset-scaling decision in the cache-policy operand.
  CPol = CurDAG->getTargetConstant(Val: ScaleOffset ? AMDGPU::CPol::SCAL : 0,
                                   DL: SDLoc(N), VT: MVT::i32);
  return true;
}
2614
// Match a buffer SMRD offset operand N as an immediate offset.
bool AMDGPUDAGToDAGISel::SelectSMRDBufferImm(SDValue N, SDValue &Offset) const {
  return SelectSMRDOffset(/* N */ nullptr, ByteOffsetNode: N, /* SOffset */ nullptr, Offset: &Offset,
                          /* Imm32Only */ false, /* IsBuffer */ true);
}
2619
// Match a buffer SMRD offset operand N as a 32-bit immediate (Imm32Only
// form). The assert documents that this form is only used on Sea Islands.
bool AMDGPUDAGToDAGISel::SelectSMRDBufferImm32(SDValue N,
                                               SDValue &Offset) const {
  assert(Subtarget->getGeneration() == AMDGPUSubtarget::SEA_ISLANDS);
  return SelectSMRDOffset(/* N */ nullptr, ByteOffsetNode: N, /* SOffset */ nullptr, Offset: &Offset,
                          /* Imm32Only */ true, /* IsBuffer */ true);
}
2626
bool AMDGPUDAGToDAGISel::SelectSMRDBufferSgprImm(SDValue N, SDValue &SOffset,
                                                 SDValue &Offset) const {
  // Match the (soffset + offset) pair as a 32-bit register base and
  // an immediate offset. Note that SOffset is written through the SBase
  // out-parameter slot of SelectSMRDBaseOffset.
  return N.getValueType() == MVT::i32 &&
         SelectSMRDBaseOffset(/* N */ nullptr, Addr: N, /* SBase */ SOffset,
                              /* SOffset*/ nullptr, Offset: &Offset,
                              /* Imm32Only */ false, /* IsBuffer */ true);
}
2636
2637bool AMDGPUDAGToDAGISel::SelectMOVRELOffset(SDValue Index,
2638 SDValue &Base,
2639 SDValue &Offset) const {
2640 SDLoc DL(Index);
2641
2642 if (CurDAG->isBaseWithConstantOffset(Op: Index)) {
2643 SDValue N0 = Index.getOperand(i: 0);
2644 SDValue N1 = Index.getOperand(i: 1);
2645 ConstantSDNode *C1 = cast<ConstantSDNode>(Val&: N1);
2646
2647 // (add n0, c0)
2648 // Don't peel off the offset (c0) if doing so could possibly lead
2649 // the base (n0) to be negative.
2650 // (or n0, |c0|) can never change a sign given isBaseWithConstantOffset.
2651 if (C1->getSExtValue() <= 0 || CurDAG->SignBitIsZero(Op: N0) ||
2652 (Index->getOpcode() == ISD::OR && C1->getSExtValue() >= 0)) {
2653 Base = N0;
2654 Offset = CurDAG->getTargetConstant(Val: C1->getZExtValue(), DL, VT: MVT::i32);
2655 return true;
2656 }
2657 }
2658
2659 if (isa<ConstantSDNode>(Val: Index))
2660 return false;
2661
2662 Base = Index;
2663 Offset = CurDAG->getTargetConstant(Val: 0, DL, VT: MVT::i32);
2664 return true;
2665}
2666
// Build a 32-bit bitfield extract of Val starting at bit Offset with Width
// bits, picking the VALU form for divergent values and the SALU form
// otherwise.
SDNode *AMDGPUDAGToDAGISel::getBFE32(bool IsSigned, const SDLoc &DL,
                                     SDValue Val, uint32_t Offset,
                                     uint32_t Width) {
  if (Val->isDivergent()) {
    // The VALU BFE takes offset and width as two separate operands.
    unsigned Opcode = IsSigned ? AMDGPU::V_BFE_I32_e64 : AMDGPU::V_BFE_U32_e64;
    SDValue Off = CurDAG->getTargetConstant(Val: Offset, DL, VT: MVT::i32);
    SDValue W = CurDAG->getTargetConstant(Val: Width, DL, VT: MVT::i32);

    return CurDAG->getMachineNode(Opcode, dl: DL, VT: MVT::i32, Op1: Val, Op2: Off, Op3: W);
  }
  unsigned Opcode = IsSigned ? AMDGPU::S_BFE_I32 : AMDGPU::S_BFE_U32;
  // Transformation function, pack the offset and width of a BFE into
  // the format expected by the S_BFE_I32 / S_BFE_U32. In the second
  // source, bits [5:0] contain the offset and bits [22:16] the width.
  uint32_t PackedVal = Offset | (Width << 16);
  SDValue PackedConst = CurDAG->getTargetConstant(Val: PackedVal, DL, VT: MVT::i32);

  return CurDAG->getMachineNode(Opcode, dl: DL, VT: MVT::i32, Op1: Val, Op2: PackedConst);
}
2686
// Select a (shl; srl/sra) pair with constant shift amounts as a single
// bitfield extract when the shift amounts satisfy the predicate below.
void AMDGPUDAGToDAGISel::SelectS_BFEFromShifts(SDNode *N) {
  // "(a << b) srl c)" ---> "BFE_U32 a, (c-b), (32-c)
  // "(a << b) sra c)" ---> "BFE_I32 a, (c-b), (32-c)
  // Predicate: 0 < b <= c < 32

  const SDValue &Shl = N->getOperand(Num: 0);
  ConstantSDNode *B = dyn_cast<ConstantSDNode>(Val: Shl->getOperand(Num: 1));
  ConstantSDNode *C = dyn_cast<ConstantSDNode>(Val: N->getOperand(Num: 1));

  if (B && C) {
    uint32_t BVal = B->getZExtValue();
    uint32_t CVal = C->getZExtValue();

    if (0 < BVal && BVal <= CVal && CVal < 32) {
      // SRA demands a signed (I32) extract; SRL an unsigned (U32) one.
      bool Signed = N->getOpcode() == ISD::SRA;
      ReplaceNode(F: N, T: getBFE32(IsSigned: Signed, DL: SDLoc(N), Val: Shl.getOperand(i: 0), Offset: CVal - BVal,
                               Width: 32 - CVal));
      return;
    }
  }
  // No BFE match; fall back to the generated matcher.
  SelectCode(N);
}
2709
// Try to select AND/SRL/SRA/SIGN_EXTEND_INREG combinations as a single
// bitfield-extract instruction; otherwise defer to the generated matcher.
void AMDGPUDAGToDAGISel::SelectS_BFE(SDNode *N) {
  switch (N->getOpcode()) {
  case ISD::AND:
    if (N->getOperand(Num: 0).getOpcode() == ISD::SRL) {
      // "(a srl b) & mask" ---> "BFE_U32 a, b, popcount(mask)"
      // Predicate: isMask(mask)
      const SDValue &Srl = N->getOperand(Num: 0);
      ConstantSDNode *Shift = dyn_cast<ConstantSDNode>(Val: Srl.getOperand(i: 1));
      ConstantSDNode *Mask = dyn_cast<ConstantSDNode>(Val: N->getOperand(Num: 1));

      if (Shift && Mask) {
        uint32_t ShiftVal = Shift->getZExtValue();
        uint32_t MaskVal = Mask->getZExtValue();

        if (isMask_32(Value: MaskVal)) {
          // A contiguous low mask: its popcount is the extract width.
          uint32_t WidthVal = llvm::popcount(Value: MaskVal);
          ReplaceNode(F: N, T: getBFE32(IsSigned: false, DL: SDLoc(N), Val: Srl.getOperand(i: 0), Offset: ShiftVal,
                                   Width: WidthVal));
          return;
        }
      }
    }
    break;
  case ISD::SRL:
    if (N->getOperand(Num: 0).getOpcode() == ISD::AND) {
      // "(a & mask) srl b)" ---> "BFE_U32 a, b, popcount(mask >> b)"
      // Predicate: isMask(mask >> b)
      const SDValue &And = N->getOperand(Num: 0);
      ConstantSDNode *Shift = dyn_cast<ConstantSDNode>(Val: N->getOperand(Num: 1));
      ConstantSDNode *Mask = dyn_cast<ConstantSDNode>(Val: And->getOperand(Num: 1));

      if (Shift && Mask) {
        uint32_t ShiftVal = Shift->getZExtValue();
        // Mask is applied before the shift, so shift it down first.
        uint32_t MaskVal = Mask->getZExtValue() >> ShiftVal;

        if (isMask_32(Value: MaskVal)) {
          uint32_t WidthVal = llvm::popcount(Value: MaskVal);
          ReplaceNode(F: N, T: getBFE32(IsSigned: false, DL: SDLoc(N), Val: And.getOperand(i: 0), Offset: ShiftVal,
                                   Width: WidthVal));
          return;
        }
      }
    } else if (N->getOperand(Num: 0).getOpcode() == ISD::SHL) {
      SelectS_BFEFromShifts(N);
      return;
    }
    break;
  case ISD::SRA:
    if (N->getOperand(Num: 0).getOpcode() == ISD::SHL) {
      SelectS_BFEFromShifts(N);
      return;
    }
    break;

  case ISD::SIGN_EXTEND_INREG: {
    // sext_inreg (srl x, 16), i8 -> bfe_i32 x, 16, 8
    SDValue Src = N->getOperand(Num: 0);
    if (Src.getOpcode() != ISD::SRL)
      break;

    const ConstantSDNode *Amt = dyn_cast<ConstantSDNode>(Val: Src.getOperand(i: 1));
    if (!Amt)
      break;

    // The sext-in-reg VT gives the extract width; the shift amount gives the
    // extract offset.
    unsigned Width = cast<VTSDNode>(Val: N->getOperand(Num: 1))->getVT().getSizeInBits();
    ReplaceNode(F: N, T: getBFE32(IsSigned: true, DL: SDLoc(N), Val: Src.getOperand(i: 0),
                             Offset: Amt->getZExtValue(), Width));
    return;
  }
  }

  SelectCode(N);
}
2783
2784bool AMDGPUDAGToDAGISel::isCBranchSCC(const SDNode *N) const {
2785 assert(N->getOpcode() == ISD::BRCOND);
2786 if (!N->hasOneUse())
2787 return false;
2788
2789 SDValue Cond = N->getOperand(Num: 1);
2790 if (Cond.getOpcode() == ISD::CopyToReg)
2791 Cond = Cond.getOperand(i: 2);
2792
2793 if (Cond.getOpcode() != ISD::SETCC || !Cond.hasOneUse())
2794 return false;
2795
2796 MVT VT = Cond.getOperand(i: 0).getSimpleValueType();
2797 if (VT == MVT::i32)
2798 return true;
2799
2800 if (VT == MVT::i64) {
2801 ISD::CondCode CC = cast<CondCodeSDNode>(Val: Cond.getOperand(i: 2))->get();
2802 return (CC == ISD::SETEQ || CC == ISD::SETNE) &&
2803 Subtarget->hasScalarCompareEq64();
2804 }
2805
2806 if ((VT == MVT::f16 || VT == MVT::f32) && Subtarget->hasSALUFloatInsts())
2807 return true;
2808
2809 return false;
2810}
2811
// If VCMP matches the amdgcn.ballot pattern described below, return the i1
// condition feeding it; Negate is set when the compare used SETEQ (i.e. the
// ballot result is logically inverted). Returns an empty SDValue otherwise.
static SDValue combineBallotPattern(SDValue VCMP, bool &Negate) {
  assert(VCMP->getOpcode() == AMDGPUISD::SETCC);
  // Special case for amdgcn.ballot:
  // %Cond = i1 (and/or combination of i1 ISD::SETCCs)
  // %VCMP = i(WaveSize) AMDGPUISD::SETCC (ext %Cond), 0, setne/seteq
  // =>
  // Use i1 %Cond value instead of i(WaveSize) %VCMP.
  // This is possible because divergent ISD::SETCC is selected as V_CMP and
  // Cond becomes a i(WaveSize) full mask value.
  // Note that ballot doesn't use SETEQ condition but its easy to support it
  // here for completeness, so in this case Negate is set true on return.
  auto VCMP_CC = cast<CondCodeSDNode>(Val: VCMP.getOperand(i: 2))->get();
  if ((VCMP_CC == ISD::SETEQ || VCMP_CC == ISD::SETNE) &&
      isNullConstant(V: VCMP.getOperand(i: 1))) {

    auto Cond = VCMP.getOperand(i: 0);
    if (ISD::isExtOpcode(Opcode: Cond->getOpcode())) // Skip extension.
      Cond = Cond.getOperand(i: 0);

    if (isBoolSGPR(V: Cond)) {
      Negate = VCMP_CC == ISD::SETEQ;
      return Cond;
    }
  }
  return SDValue();
}
2838
// Select BRCOND to either a scalar (SCC) or vector (VCC) conditional branch,
// recognizing the ballot pattern and masking VCC by EXEC where required.
void AMDGPUDAGToDAGISel::SelectBRCOND(SDNode *N) {
  SDValue Cond = N->getOperand(Num: 1);

  // An undef condition still needs a branch so control flow stays structured.
  if (Cond.isUndef()) {
    CurDAG->SelectNodeTo(N, MachineOpc: AMDGPU::SI_BR_UNDEF, VT: MVT::Other,
                         Op1: N->getOperand(Num: 2), Op2: N->getOperand(Num: 0));
    return;
  }

  const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();

  bool UseSCCBr = isCBranchSCC(N) && isUniformBr(N);
  bool AndExec = !UseSCCBr;
  bool Negate = false;

  if (Cond.getOpcode() == ISD::SETCC &&
      Cond->getOperand(Num: 0)->getOpcode() == AMDGPUISD::SETCC) {
    SDValue VCMP = Cond->getOperand(Num: 0);
    auto CC = cast<CondCodeSDNode>(Val: Cond->getOperand(Num: 2))->get();
    if ((CC == ISD::SETEQ || CC == ISD::SETNE) &&
        isNullConstant(V: Cond->getOperand(Num: 1)) &&
        // We may encounter ballot.i64 in wave32 mode on -O0.
        VCMP.getValueType().getSizeInBits() == Subtarget->getWavefrontSize()) {
      // %VCMP = i(WaveSize) AMDGPUISD::SETCC ...
      // %C = i1 ISD::SETCC %VCMP, 0, setne/seteq
      // BRCOND i1 %C, %BB
      // =>
      // %VCMP = i(WaveSize) AMDGPUISD::SETCC ...
      // VCC = COPY i(WaveSize) %VCMP
      // S_CBRANCH_VCCNZ/VCCZ %BB
      Negate = CC == ISD::SETEQ;
      bool NegatedBallot = false;
      if (auto BallotCond = combineBallotPattern(VCMP, Negate&: NegatedBallot)) {
        Cond = BallotCond;
        UseSCCBr = !BallotCond->isDivergent();
        Negate = Negate ^ NegatedBallot;
      } else {
        // TODO: don't use SCC here assuming that AMDGPUISD::SETCC is always
        // selected as V_CMP, but this may change for uniform condition.
        Cond = VCMP;
        UseSCCBr = false;
      }
    }
    // Cond is either V_CMP resulted from AMDGPUISD::SETCC or a combination of
    // V_CMPs resulted from ballot or ballot has uniform condition and SCC is
    // used.
    AndExec = false;
  }

  unsigned BrOp =
      UseSCCBr ? (Negate ? AMDGPU::S_CBRANCH_SCC0 : AMDGPU::S_CBRANCH_SCC1)
               : (Negate ? AMDGPU::S_CBRANCH_VCCZ : AMDGPU::S_CBRANCH_VCCNZ);
  Register CondReg = UseSCCBr ? AMDGPU::SCC : TRI->getVCC();
  SDLoc SL(N);

  if (AndExec) {
    // This is the case that we are selecting to S_CBRANCH_VCCNZ. We have not
    // analyzed what generates the vcc value, so we do not know whether vcc
    // bits for disabled lanes are 0. Thus we need to mask out bits for
    // disabled lanes.
    //
    // For the case that we select S_CBRANCH_SCC1 and it gets
    // changed to S_CBRANCH_VCCNZ in SIFixSGPRCopies, SIFixSGPRCopies calls
    // SIInstrInfo::moveToVALU which inserts the S_AND).
    //
    // We could add an analysis of what generates the vcc value here and omit
    // the S_AND when is unnecessary. But it would be better to add a separate
    // pass after SIFixSGPRCopies to do the unnecessary S_AND removal, so it
    // catches both cases.
    Cond = SDValue(
        CurDAG->getMachineNode(
            Opcode: Subtarget->isWave32() ? AMDGPU::S_AND_B32 : AMDGPU::S_AND_B64, dl: SL,
            VT: MVT::i1,
            Op1: CurDAG->getRegister(Reg: Subtarget->isWave32() ? AMDGPU::EXEC_LO
                                                          : AMDGPU::EXEC,
                                 VT: MVT::i1),
            Op2: Cond),
        0);
  }

  // Copy the (possibly masked) condition into SCC or VCC, then branch on it.
  SDValue VCC = CurDAG->getCopyToReg(Chain: N->getOperand(Num: 0), dl: SL, Reg: CondReg, N: Cond);
  CurDAG->SelectNodeTo(N, MachineOpc: BrOp, VT: MVT::Other,
                       Op1: N->getOperand(Num: 2), // Basic Block
                       Op2: VCC.getValue(R: 0));
}
2924
2925void AMDGPUDAGToDAGISel::SelectFP_EXTEND(SDNode *N) {
2926 if (Subtarget->hasSALUFloatInsts() && N->getValueType(ResNo: 0) == MVT::f32 &&
2927 !N->isDivergent()) {
2928 SDValue Src = N->getOperand(Num: 0);
2929 if (Src.getValueType() == MVT::f16) {
2930 if (isExtractHiElt(In: Src, Out&: Src)) {
2931 CurDAG->SelectNodeTo(N, MachineOpc: AMDGPU::S_CVT_HI_F32_F16, VTs: N->getVTList(),
2932 Ops: {Src});
2933 return;
2934 }
2935 }
2936 }
2937
2938 SelectCode(N);
2939}
2940
// Select ds_append/ds_consume: the pointer base goes to M0 via glue, and a
// legal constant offset is folded into the instruction's offset field.
void AMDGPUDAGToDAGISel::SelectDSAppendConsume(SDNode *N, unsigned IntrID) {
  // The address is assumed to be uniform, so if it ends up in a VGPR, it will
  // be copied to an SGPR with readfirstlane.
  unsigned Opc = IntrID == Intrinsic::amdgcn_ds_append ?
    AMDGPU::DS_APPEND : AMDGPU::DS_CONSUME;

  SDValue Chain = N->getOperand(Num: 0);
  SDValue Ptr = N->getOperand(Num: 2);
  MemIntrinsicSDNode *M = cast<MemIntrinsicSDNode>(Val: N);
  MachineMemOperand *MMO = M->getMemOperand();
  // GDS (region address space) uses the same opcodes with a gds operand bit.
  bool IsGDS = M->getAddressSpace() == AMDGPUAS::REGION_ADDRESS;

  SDValue Offset;
  if (CurDAG->isBaseWithConstantOffset(Op: Ptr)) {
    SDValue PtrBase = Ptr.getOperand(i: 0);
    SDValue PtrOffset = Ptr.getOperand(i: 1);

    const APInt &OffsetVal = PtrOffset->getAsAPIntVal();
    if (isDSOffsetLegal(Base: PtrBase, Offset: OffsetVal.getZExtValue())) {
      // Base goes to M0; the constant becomes the offset field.
      N = glueCopyToM0(N, Val: PtrBase);
      Offset = CurDAG->getTargetConstant(Val: OffsetVal, DL: SDLoc(), VT: MVT::i32);
    }
  }

  if (!Offset) {
    // No foldable offset: the whole pointer goes to M0 with a zero offset.
    N = glueCopyToM0(N, Val: Ptr);
    Offset = CurDAG->getTargetConstant(Val: 0, DL: SDLoc(), VT: MVT::i32);
  }

  SDValue Ops[] = {
    Offset,
    CurDAG->getTargetConstant(Val: IsGDS, DL: SDLoc(), VT: MVT::i32),
    Chain,
    N->getOperand(Num: N->getNumOperands() - 1) // New glue
  };

  SDNode *Selected = CurDAG->SelectNodeTo(N, MachineOpc: Opc, VTs: N->getVTList(), Ops);
  CurDAG->setNodeMemRefs(N: cast<MachineSDNode>(Val: Selected), NewMemRefs: {MMO});
}
2980
2981// We need to handle this here because tablegen doesn't support matching
2982// instructions with multiple outputs.
2983void AMDGPUDAGToDAGISel::SelectDSBvhStackIntrinsic(SDNode *N, unsigned IntrID) {
2984 unsigned Opc;
2985 switch (IntrID) {
2986 case Intrinsic::amdgcn_ds_bvh_stack_rtn:
2987 case Intrinsic::amdgcn_ds_bvh_stack_push4_pop1_rtn:
2988 Opc = AMDGPU::DS_BVH_STACK_RTN_B32;
2989 break;
2990 case Intrinsic::amdgcn_ds_bvh_stack_push8_pop1_rtn:
2991 Opc = AMDGPU::DS_BVH_STACK_PUSH8_POP1_RTN_B32;
2992 break;
2993 case Intrinsic::amdgcn_ds_bvh_stack_push8_pop2_rtn:
2994 Opc = AMDGPU::DS_BVH_STACK_PUSH8_POP2_RTN_B64;
2995 break;
2996 }
2997 SDValue Ops[] = {N->getOperand(Num: 2), N->getOperand(Num: 3), N->getOperand(Num: 4),
2998 N->getOperand(Num: 5), N->getOperand(Num: 0)};
2999
3000 MemIntrinsicSDNode *M = cast<MemIntrinsicSDNode>(Val: N);
3001 MachineMemOperand *MMO = M->getMemOperand();
3002 SDNode *Selected = CurDAG->SelectNodeTo(N, MachineOpc: Opc, VTs: N->getVTList(), Ops);
3003 CurDAG->setNodeMemRefs(N: cast<MachineSDNode>(Val: Selected), NewMemRefs: {MMO});
3004}
3005
// Select tensor_load_to_lds / tensor_store_from_lds, dropping the trailing
// two descriptor groups (and switching to the _D2 opcode) when they are
// all-zero build_vectors.
void AMDGPUDAGToDAGISel::SelectTensorLoadStore(SDNode *N, unsigned IntrID) {
  bool IsLoad = IntrID == Intrinsic::amdgcn_tensor_load_to_lds;
  unsigned Opc =
      IsLoad ? AMDGPU::TENSOR_LOAD_TO_LDS : AMDGPU::TENSOR_STORE_FROM_LDS;

  SmallVector<SDValue, 7> TensorOps;
  // First two groups
  TensorOps.push_back(Elt: N->getOperand(Num: 2)); // D# group 0
  TensorOps.push_back(Elt: N->getOperand(Num: 3)); // D# group 1

  // Use _D2 version if both group 2 and 3 are zero-initialized.
  SDValue Group2 = N->getOperand(Num: 4);
  SDValue Group3 = N->getOperand(Num: 5);
  if (ISD::isBuildVectorAllZeros(N: Group2.getNode()) &&
      ISD::isBuildVectorAllZeros(N: Group3.getNode())) {
    Opc = IsLoad ? AMDGPU::TENSOR_LOAD_TO_LDS_D2
                 : AMDGPU::TENSOR_STORE_FROM_LDS_D2;
  } else { // Has at least 4 groups
    TensorOps.push_back(Elt: Group2); // D# group 2
    TensorOps.push_back(Elt: Group3); // D# group 3
  }

  // TODO: Handle the fifth group: N->getOperand(6), which is silently ignored
  // for now because all existing targets only support up to 4 groups.
  TensorOps.push_back(Elt: CurDAG->getTargetConstant(Val: 0, DL: SDLoc(N), VT: MVT::i1)); // r128
  TensorOps.push_back(Elt: N->getOperand(Num: 7)); // cache policy
  TensorOps.push_back(Elt: N->getOperand(Num: 0)); // chain

  (void)CurDAG->SelectNodeTo(N, MachineOpc: Opc, VT: MVT::Other, Ops: TensorOps);
}
3036
// Map a ds_gws_* intrinsic ID to the corresponding DS_GWS_* machine opcode.
static unsigned gwsIntrinToOpcode(unsigned IntrID) {
  switch (IntrID) {
  case Intrinsic::amdgcn_ds_gws_init:
    return AMDGPU::DS_GWS_INIT;
  case Intrinsic::amdgcn_ds_gws_barrier:
    return AMDGPU::DS_GWS_BARRIER;
  case Intrinsic::amdgcn_ds_gws_sema_v:
    return AMDGPU::DS_GWS_SEMA_V;
  case Intrinsic::amdgcn_ds_gws_sema_br:
    return AMDGPU::DS_GWS_SEMA_BR;
  case Intrinsic::amdgcn_ds_gws_sema_p:
    return AMDGPU::DS_GWS_SEMA_P;
  case Intrinsic::amdgcn_ds_gws_sema_release_all:
    return AMDGPU::DS_GWS_SEMA_RELEASE_ALL;
  default:
    llvm_unreachable("not a gws intrinsic");
  }
}
3055
// Select a ds_gws_* intrinsic: the resource-id base is routed through M0 and
// any constant part of the offset goes into the instruction's offset field.
void AMDGPUDAGToDAGISel::SelectDS_GWS(SDNode *N, unsigned IntrID) {
  if (!Subtarget->hasGWS() ||
      (IntrID == Intrinsic::amdgcn_ds_gws_sema_release_all &&
       !Subtarget->hasGWSSemaReleaseAll())) {
    // Let this error.
    SelectCode(N);
    return;
  }

  // Chain, intrinsic ID, vsrc, offset
  const bool HasVSrc = N->getNumOperands() == 4;
  assert(HasVSrc || N->getNumOperands() == 3);

  SDLoc SL(N);
  SDValue BaseOffset = N->getOperand(Num: HasVSrc ? 3 : 2);
  int ImmOffset = 0;
  MemIntrinsicSDNode *M = cast<MemIntrinsicSDNode>(Val: N);
  MachineMemOperand *MMO = M->getMemOperand();

  // Don't worry if the offset ends up in a VGPR. Only one lane will have
  // effect, so SIFixSGPRCopies will validly insert readfirstlane.

  // The resource id offset is computed as (<isa opaque base> + M0[21:16] +
  // offset field) % 64. Some versions of the programming guide omit the m0
  // part, or claim it's from offset 0.
  if (ConstantSDNode *ConstOffset = dyn_cast<ConstantSDNode>(Val&: BaseOffset)) {
    // If we have a constant offset, try to use the 0 in m0 as the base.
    // TODO: Look into changing the default m0 initialization value. If the
    // default -1 only set the low 16-bits, we could leave it as-is and add 1 to
    // the immediate offset.
    glueCopyToM0(N, Val: CurDAG->getTargetConstant(Val: 0, DL: SL, VT: MVT::i32));
    ImmOffset = ConstOffset->getZExtValue();
  } else {
    // Peel off a constant part of the offset into the offset field.
    if (CurDAG->isBaseWithConstantOffset(Op: BaseOffset)) {
      ImmOffset = BaseOffset.getConstantOperandVal(i: 1);
      BaseOffset = BaseOffset.getOperand(i: 0);
    }

    // Prefer to do the shift in an SGPR since it should be possible to use m0
    // as the result directly. If it's already an SGPR, it will be eliminated
    // later.
    SDNode *SGPROffset
      = CurDAG->getMachineNode(Opcode: AMDGPU::V_READFIRSTLANE_B32, dl: SL, VT: MVT::i32,
                               Op1: BaseOffset);
    // Shift to offset in m0
    SDNode *M0Base
      = CurDAG->getMachineNode(Opcode: AMDGPU::S_LSHL_B32, dl: SL, VT: MVT::i32,
                               Op1: SDValue(SGPROffset, 0),
                               Op2: CurDAG->getTargetConstant(Val: 16, DL: SL, VT: MVT::i32));
    glueCopyToM0(N, Val: SDValue(M0Base, 0));
  }

  SDValue Chain = N->getOperand(Num: 0);
  SDValue OffsetField = CurDAG->getTargetConstant(Val: ImmOffset, DL: SL, VT: MVT::i32);

  const unsigned Opc = gwsIntrinToOpcode(IntrID);

  const MCInstrDesc &InstrDesc = TII->get(Opcode: Opc);
  int Data0Idx = AMDGPU::getNamedOperandIdx(Opcode: Opc, Name: AMDGPU::OpName::data0);

  const TargetRegisterClass *DataRC = TII->getRegClass(MCID: InstrDesc, OpNum: Data0Idx);

  SmallVector<SDValue, 5> Ops;
  if (HasVSrc) {
    const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();

    SDValue Data = N->getOperand(Num: 2);
    MVT DataVT = Data.getValueType().getSimpleVT();
    if (TRI->isTypeLegalForClass(RC: *DataRC, T: DataVT)) {
      // Normal 32-bit case.
      Ops.push_back(Elt: N->getOperand(Num: 2));
    } else {
      // Operand is really 32-bits, but requires 64-bit alignment, so use the
      // even aligned 64-bit register class.
      const SDValue RegSeqOps[] = {
          CurDAG->getTargetConstant(Val: DataRC->getID(), DL: SL, VT: MVT::i32), Data,
          CurDAG->getTargetConstant(Val: AMDGPU::sub0, DL: SL, VT: MVT::i32),
          SDValue(
              CurDAG->getMachineNode(Opcode: TargetOpcode::IMPLICIT_DEF, dl: SL, VT: MVT::i32),
              0),
          CurDAG->getTargetConstant(Val: AMDGPU::sub1, DL: SL, VT: MVT::i32)};

      Ops.push_back(Elt: SDValue(CurDAG->getMachineNode(Opcode: TargetOpcode::REG_SEQUENCE,
                                                    dl: SL, VT: MVT::v2i32, Ops: RegSeqOps),
                             0));
    }
  }

  Ops.push_back(Elt: OffsetField);
  Ops.push_back(Elt: Chain);

  SDNode *Selected = CurDAG->SelectNodeTo(N, MachineOpc: Opc, VTs: N->getVTList(), Ops);
  CurDAG->setNodeMemRefs(N: cast<MachineSDNode>(Val: Selected), NewMemRefs: {MMO});
}
3150
// Select amdgcn.interp.p1.f16. The 16-bank-LDS case needs a two-instruction
// sequence sharing an M0 copy, which tablegen cannot express correctly.
void AMDGPUDAGToDAGISel::SelectInterpP1F16(SDNode *N) {
  if (Subtarget->getLDSBankCount() != 16) {
    // This is a single instruction with a pattern.
    SelectCode(N);
    return;
  }

  SDLoc DL(N);

  // This requires 2 instructions. It is possible to write a pattern to support
  // this, but the generated isel emitter doesn't correctly deal with multiple
  // output instructions using the same physical register input. The copy to m0
  // is incorrectly placed before the second instruction.
  //
  // TODO: Match source modifiers.
  //
  // def : Pat <
  //   (int_amdgcn_interp_p1_f16
  //    (VOP3Mods f32:$src0, i32:$src0_modifiers),
  //                             (i32 timm:$attrchan), (i32 timm:$attr),
  //                             (i1 timm:$high), M0),
  //   (V_INTERP_P1LV_F16 $src0_modifiers, VGPR_32:$src0, timm:$attr,
  //       timm:$attrchan, 0,
  //       (V_INTERP_MOV_F32 2, timm:$attr, timm:$attrchan), timm:$high)> {
  //   let Predicates = [has16BankLDS];
  // }

  // 16 bank LDS
  SDValue ToM0 = CurDAG->getCopyToReg(Chain: CurDAG->getEntryNode(), dl: DL, Reg: AMDGPU::M0,
                                      N: N->getOperand(Num: 5), Glue: SDValue());

  SDVTList VTs = CurDAG->getVTList(VT1: MVT::f32, VT2: MVT::Other);

  SDNode *InterpMov =
    CurDAG->getMachineNode(Opcode: AMDGPU::V_INTERP_MOV_F32, dl: DL, VTs, Ops: {
        CurDAG->getTargetConstant(Val: 2, DL, VT: MVT::i32), // P0
        N->getOperand(Num: 3),  // Attr
        N->getOperand(Num: 2),  // Attrchan
        ToM0.getValue(R: 1) // In glue
  });

  // The second instruction consumes both the interp result and the glue so
  // the M0 copy is ordered before it.
  SDNode *InterpP1LV =
    CurDAG->getMachineNode(Opcode: AMDGPU::V_INTERP_P1LV_F16, dl: DL, VT: MVT::f32, Ops: {
        CurDAG->getTargetConstant(Val: 0, DL, VT: MVT::i32), // $src0_modifiers
        N->getOperand(Num: 1), // Src0
        N->getOperand(Num: 3), // Attr
        N->getOperand(Num: 2), // Attrchan
        CurDAG->getTargetConstant(Val: 0, DL, VT: MVT::i32), // $src2_modifiers
        SDValue(InterpMov, 0), // Src2 - holds two f16 values selected by high
        N->getOperand(Num: 4), // high
        CurDAG->getTargetConstant(Val: 0, DL, VT: MVT::i1), // $clamp
        CurDAG->getTargetConstant(Val: 0, DL, VT: MVT::i32), // $omod
        SDValue(InterpMov, 1)
  });

  CurDAG->ReplaceAllUsesOfValueWith(From: SDValue(N, 0), To: SDValue(InterpP1LV, 0));
}
3208
// Dispatch chained intrinsics that need manual selection; everything else
// falls through to the generated matcher.
void AMDGPUDAGToDAGISel::SelectINTRINSIC_W_CHAIN(SDNode *N) {
  unsigned IntrID = N->getConstantOperandVal(Num: 1);
  switch (IntrID) {
  case Intrinsic::amdgcn_ds_append:
  case Intrinsic::amdgcn_ds_consume: {
    // Only the i32 result form is handled manually.
    if (N->getValueType(ResNo: 0) != MVT::i32)
      break;
    SelectDSAppendConsume(N, IntrID);
    return;
  }
  case Intrinsic::amdgcn_ds_bvh_stack_rtn:
  case Intrinsic::amdgcn_ds_bvh_stack_push4_pop1_rtn:
  case Intrinsic::amdgcn_ds_bvh_stack_push8_pop1_rtn:
  case Intrinsic::amdgcn_ds_bvh_stack_push8_pop2_rtn:
    SelectDSBvhStackIntrinsic(N, IntrID);
    return;
  case Intrinsic::amdgcn_init_whole_wave:
    // Record the use on the function info, then still select normally.
    CurDAG->getMachineFunction()
        .getInfo<SIMachineFunctionInfo>()
        ->setInitWholeWave();
    break;
  }

  SelectCode(N);
}
3234
// Select chainless intrinsics that map to single pseudos (WQM/WWM variants,
// permlane swaps, interp.p1.f16), re-attaching any convergence-control glue
// to the selected node.
void AMDGPUDAGToDAGISel::SelectINTRINSIC_WO_CHAIN(SDNode *N) {
  unsigned IntrID = N->getConstantOperandVal(Num: 0);
  unsigned Opcode = AMDGPU::INSTRUCTION_LIST_END;
  SDNode *ConvGlueNode = N->getGluedNode();
  if (ConvGlueNode) {
    // FIXME: Possibly iterate over multiple glue nodes?
    assert(ConvGlueNode->getOpcode() == ISD::CONVERGENCECTRL_GLUE);
    ConvGlueNode = ConvGlueNode->getOperand(Num: 0).getNode();
    // Re-create the glue as a machine node so it survives selection.
    ConvGlueNode =
        CurDAG->getMachineNode(Opcode: TargetOpcode::CONVERGENCECTRL_GLUE, dl: {},
                               VT: MVT::Glue, Op1: SDValue(ConvGlueNode, 0));
  } else {
    ConvGlueNode = nullptr;
  }
  switch (IntrID) {
  case Intrinsic::amdgcn_wqm:
    Opcode = AMDGPU::WQM;
    break;
  case Intrinsic::amdgcn_softwqm:
    Opcode = AMDGPU::SOFT_WQM;
    break;
  case Intrinsic::amdgcn_wwm:
  case Intrinsic::amdgcn_strict_wwm:
    Opcode = AMDGPU::STRICT_WWM;
    break;
  case Intrinsic::amdgcn_strict_wqm:
    Opcode = AMDGPU::STRICT_WQM;
    break;
  case Intrinsic::amdgcn_interp_p1_f16:
    SelectInterpP1F16(N);
    return;
  case Intrinsic::amdgcn_permlane16_swap:
  case Intrinsic::amdgcn_permlane32_swap: {
    if ((IntrID == Intrinsic::amdgcn_permlane16_swap &&
         !Subtarget->hasPermlane16Swap()) ||
        (IntrID == Intrinsic::amdgcn_permlane32_swap &&
         !Subtarget->hasPermlane32Swap())) {
      SelectCode(N); // Hit the default error
      return;
    }

    Opcode = IntrID == Intrinsic::amdgcn_permlane16_swap
                 ? AMDGPU::V_PERMLANE16_SWAP_B32_e64
                 : AMDGPU::V_PERMLANE32_SWAP_B32_e64;

    // Drop the intrinsic-ID operand; append the glue if present.
    SmallVector<SDValue, 4> NewOps(N->op_begin() + 1, N->op_end());
    if (ConvGlueNode)
      NewOps.push_back(Elt: SDValue(ConvGlueNode, 0));

    // Translate the boolean fi operand to the DPP FI immediate encoding.
    bool FI = N->getConstantOperandVal(Num: 3);
    NewOps[2] = CurDAG->getTargetConstant(
        Val: FI ? AMDGPU::DPP::DPP_FI_1 : AMDGPU::DPP::DPP_FI_0, DL: SDLoc(), VT: MVT::i32);

    CurDAG->SelectNodeTo(N, MachineOpc: Opcode, VTs: N->getVTList(), Ops: NewOps);
    return;
  }
  default:
    SelectCode(N);
    break;
  }

  if (Opcode != AMDGPU::INSTRUCTION_LIST_END) {
    SDValue Src = N->getOperand(Num: 1);
    CurDAG->SelectNodeTo(N, MachineOpc: Opcode, VTs: N->getVTList(), Ops: {Src});
  }

  // Attach the convergence glue to whatever node N was morphed into above.
  if (ConvGlueNode) {
    SmallVector<SDValue, 4> NewOps(N->ops());
    NewOps.push_back(Elt: SDValue(ConvGlueNode, 0));
    CurDAG->MorphNodeTo(N, Opc: N->getOpcode(), VTs: N->getVTList(), Ops: NewOps);
  }
}
3307
// Dispatch void intrinsics (GWS and tensor LDS transfers) that need manual
// selection; everything else falls through to the generated matcher.
void AMDGPUDAGToDAGISel::SelectINTRINSIC_VOID(SDNode *N) {
  unsigned IntrID = N->getConstantOperandVal(Num: 1);
  switch (IntrID) {
  case Intrinsic::amdgcn_ds_gws_init:
  case Intrinsic::amdgcn_ds_gws_barrier:
  case Intrinsic::amdgcn_ds_gws_sema_v:
  case Intrinsic::amdgcn_ds_gws_sema_br:
  case Intrinsic::amdgcn_ds_gws_sema_p:
  case Intrinsic::amdgcn_ds_gws_sema_release_all:
    SelectDS_GWS(N, IntrID);
    return;
  case Intrinsic::amdgcn_tensor_load_to_lds:
  case Intrinsic::amdgcn_tensor_store_from_lds:
    SelectTensorLoadStore(N, IntrID);
    return;
  default:
    break;
  }

  SelectCode(N);
}
3329
// Select AMDGPUISD::WAVE_ADDRESS as a scalar right shift of the operand by
// log2(wavefront size).
void AMDGPUDAGToDAGISel::SelectWAVE_ADDRESS(SDNode *N) {
  SDValue Log2WaveSize =
    CurDAG->getTargetConstant(Val: Subtarget->getWavefrontSizeLog2(), DL: SDLoc(N), VT: MVT::i32);
  CurDAG->SelectNodeTo(N, MachineOpc: AMDGPU::S_LSHR_B32, VTs: N->getVTList(),
                       Ops: {N->getOperand(Num: 0), Log2WaveSize});
}
3336
// Select STACKRESTORE: copy the restored value into the stack pointer
// register. A WAVE_ADDRESS source is unwrapped directly; any other i32 value
// is shifted left by log2(wavefront size) first (the inverse of
// SelectWAVE_ADDRESS), with a readfirstlane inserted for divergent sources.
void AMDGPUDAGToDAGISel::SelectSTACKRESTORE(SDNode *N) {
  SDValue SrcVal = N->getOperand(Num: 1);
  if (SrcVal.getValueType() != MVT::i32) {
    SelectCode(N); // Emit default error
    return;
  }

  SDValue CopyVal;
  Register SP = TLI->getStackPointerRegisterToSaveRestore();
  SDLoc SL(N);

  if (SrcVal.getOpcode() == AMDGPUISD::WAVE_ADDRESS) {
    // The wrapped operand is already in SP's representation.
    CopyVal = SrcVal.getOperand(i: 0);
  } else {
    SDValue Log2WaveSize = CurDAG->getTargetConstant(
        Val: Subtarget->getWavefrontSizeLog2(), DL: SL, VT: MVT::i32);

    if (N->isDivergent()) {
      // SP is a scalar register, so uniformize the value first.
      SrcVal = SDValue(CurDAG->getMachineNode(Opcode: AMDGPU::V_READFIRSTLANE_B32, dl: SL,
                                              VT: MVT::i32, Op1: SrcVal),
                       0);
    }

    CopyVal = SDValue(CurDAG->getMachineNode(Opcode: AMDGPU::S_LSHL_B32, dl: SL, VT: MVT::i32,
                                             Ops: {SrcVal, Log2WaveSize}),
                      0);
  }

  SDValue CopyToSP = CurDAG->getCopyToReg(Chain: N->getOperand(Num: 0), dl: SL, Reg: SP, N: CopyVal);
  CurDAG->ReplaceAllUsesOfValueWith(From: SDValue(N, 0), To: CopyToSP);
}
3368
bool AMDGPUDAGToDAGISel::SelectVOP3ModsImpl(SDValue In, SDValue &Src,
                                            unsigned &Mods,
                                            bool IsCanonicalizing,
                                            bool AllowAbs) const {
  // Strip fneg/fabs (and equivalent integer bit patterns) off In, returning
  // the bare source in Src and the matched modifier bits in Mods. Always
  // returns true: "no modifiers" is a valid match.
  Mods = SISrcMods::NONE;
  Src = In;

  if (Src.getOpcode() == ISD::FNEG) {
    Mods |= SISrcMods::NEG;
    Src = Src.getOperand(i: 0);
  } else if (Src.getOpcode() == ISD::FSUB && IsCanonicalizing) {
    // Fold fsub [+-]0 into fneg. This may not have folded depending on the
    // denormal mode, but we're implicitly canonicalizing in a source operand.
    auto *LHS = dyn_cast<ConstantFPSDNode>(Val: Src.getOperand(i: 0));
    if (LHS && LHS->isZero()) {
      Mods |= SISrcMods::NEG;
      Src = Src.getOperand(i: 1);
    }
  }

  if (AllowAbs && Src.getOpcode() == ISD::FABS) {
    Mods |= SISrcMods::ABS;
    Src = Src.getOperand(i: 0);
  }

  if (Mods != SISrcMods::NONE)
    return true;

  // Convert various sign-bit masks on integers to src mods. Currently disabled
  // for 16-bit types as the codegen replaces the operand without adding a
  // srcmod. This is intentionally finding the cases where we are performing
  // float neg and abs on int types; the goal is not to obtain two's complement
  // neg or abs. Limit conversion to select operands via the non-canonicalizing
  // pattern.
  // TODO: Add 16-bit support.
  if (IsCanonicalizing)
    return true;

  // v2i32 xor/or/and are legal. A vselect using these instructions as operands
  // is scalarised into two selects with EXTRACT_VECTOR_ELT operands. Peek
  // through the extract to the bitwise op.
  SDValue PeekSrc =
      Src->getOpcode() == ISD::EXTRACT_VECTOR_ELT ? Src->getOperand(Num: 0) : Src;
  // Only scalar i32/i64 and v2i32 and/or/xor with a constant (or constant
  // splat) RHS are candidates for the sign-bit-mask patterns below.
  unsigned Opc = PeekSrc.getOpcode();
  EVT VT = Src.getValueType();
  if ((Opc != ISD::AND && Opc != ISD::OR && Opc != ISD::XOR) ||
      (VT != MVT::i32 && VT != MVT::v2i32 && VT != MVT::i64))
    return true;

  ConstantSDNode *CRHS = isConstOrConstSplat(N: PeekSrc->getOperand(Num: 1));
  if (!CRHS)
    return true;

  // Rebuild Src without the matched bitwise op: either take the op's LHS
  // directly, or re-extract the same lane from the op's vector LHS when we
  // peeked through an EXTRACT_VECTOR_ELT above.
  auto ReplaceSrc = [&]() -> SDValue {
    if (Src->getOpcode() != ISD::EXTRACT_VECTOR_ELT)
      return Src.getOperand(i: 0);

    SDValue LHS = PeekSrc->getOperand(Num: 0);
    SDValue Index = Src->getOperand(Num: 1);
    return CurDAG->getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL: SDLoc(Src),
                           VT: Src.getValueType(), N1: LHS, N2: Index);
  };

  // Recognise Srcmods:
  // (xor a, 0x80000000) or v2i32 (xor a, {0x80000000,0x80000000}) as NEG.
  // (and a, 0x7fffffff) or v2i32 (and a, {0x7fffffff,0x7fffffff}) as ABS.
  // (or a, 0x80000000) or v2i32 (or a, {0x80000000,0x80000000}) as NEG+ABS
  // SrcModifiers.
  if (Opc == ISD::XOR && CRHS->getAPIntValue().isSignMask()) {
    Mods |= SISrcMods::NEG;
    Src = ReplaceSrc();
  } else if (Opc == ISD::AND && AllowAbs &&
             CRHS->getAPIntValue().isMaxSignedValue()) {
    Mods |= SISrcMods::ABS;
    Src = ReplaceSrc();
  } else if (Opc == ISD::OR && AllowAbs && CRHS->getAPIntValue().isSignMask()) {
    Mods |= SISrcMods::ABS | SISrcMods::NEG;
    Src = ReplaceSrc();
  }

  return true;
}
3457
3458bool AMDGPUDAGToDAGISel::SelectVOP3Mods(SDValue In, SDValue &Src,
3459 SDValue &SrcMods) const {
3460 unsigned Mods;
3461 if (SelectVOP3ModsImpl(In, Src, Mods, /*IsCanonicalizing=*/true,
3462 /*AllowAbs=*/true)) {
3463 SrcMods = CurDAG->getTargetConstant(Val: Mods, DL: SDLoc(In), VT: MVT::i32);
3464 return true;
3465 }
3466
3467 return false;
3468}
3469
3470bool AMDGPUDAGToDAGISel::SelectVOP3ModsNonCanonicalizing(
3471 SDValue In, SDValue &Src, SDValue &SrcMods) const {
3472 unsigned Mods;
3473 if (SelectVOP3ModsImpl(In, Src, Mods, /*IsCanonicalizing=*/false,
3474 /*AllowAbs=*/true)) {
3475 SrcMods = CurDAG->getTargetConstant(Val: Mods, DL: SDLoc(In), VT: MVT::i32);
3476 return true;
3477 }
3478
3479 return false;
3480}
3481
3482bool AMDGPUDAGToDAGISel::SelectVOP3BMods(SDValue In, SDValue &Src,
3483 SDValue &SrcMods) const {
3484 unsigned Mods;
3485 if (SelectVOP3ModsImpl(In, Src, Mods,
3486 /*IsCanonicalizing=*/true,
3487 /*AllowAbs=*/false)) {
3488 SrcMods = CurDAG->getTargetConstant(Val: Mods, DL: SDLoc(In), VT: MVT::i32);
3489 return true;
3490 }
3491
3492 return false;
3493}
3494
3495bool AMDGPUDAGToDAGISel::SelectVOP3NoMods(SDValue In, SDValue &Src) const {
3496 if (In.getOpcode() == ISD::FABS || In.getOpcode() == ISD::FNEG)
3497 return false;
3498
3499 Src = In;
3500 return true;
3501}
3502
3503bool AMDGPUDAGToDAGISel::SelectVINTERPModsImpl(SDValue In, SDValue &Src,
3504 SDValue &SrcMods,
3505 bool OpSel) const {
3506 unsigned Mods;
3507 if (SelectVOP3ModsImpl(In, Src, Mods,
3508 /*IsCanonicalizing=*/true,
3509 /*AllowAbs=*/false)) {
3510 if (OpSel)
3511 Mods |= SISrcMods::OP_SEL_0;
3512 SrcMods = CurDAG->getTargetConstant(Val: Mods, DL: SDLoc(In), VT: MVT::i32);
3513 return true;
3514 }
3515
3516 return false;
3517}
3518
3519bool AMDGPUDAGToDAGISel::SelectVINTERPMods(SDValue In, SDValue &Src,
3520 SDValue &SrcMods) const {
3521 return SelectVINTERPModsImpl(In, Src, SrcMods, /* OpSel */ false);
3522}
3523
3524bool AMDGPUDAGToDAGISel::SelectVINTERPModsHi(SDValue In, SDValue &Src,
3525 SDValue &SrcMods) const {
3526 return SelectVINTERPModsImpl(In, Src, SrcMods, /* OpSel */ true);
3527}
3528
3529bool AMDGPUDAGToDAGISel::SelectVOP3Mods0(SDValue In, SDValue &Src,
3530 SDValue &SrcMods, SDValue &Clamp,
3531 SDValue &Omod) const {
3532 SDLoc DL(In);
3533 Clamp = CurDAG->getTargetConstant(Val: 0, DL, VT: MVT::i1);
3534 Omod = CurDAG->getTargetConstant(Val: 0, DL, VT: MVT::i1);
3535
3536 return SelectVOP3Mods(In, Src, SrcMods);
3537}
3538
3539bool AMDGPUDAGToDAGISel::SelectVOP3BMods0(SDValue In, SDValue &Src,
3540 SDValue &SrcMods, SDValue &Clamp,
3541 SDValue &Omod) const {
3542 SDLoc DL(In);
3543 Clamp = CurDAG->getTargetConstant(Val: 0, DL, VT: MVT::i1);
3544 Omod = CurDAG->getTargetConstant(Val: 0, DL, VT: MVT::i1);
3545
3546 return SelectVOP3BMods(In, Src, SrcMods);
3547}
3548
3549bool AMDGPUDAGToDAGISel::SelectVOP3OMods(SDValue In, SDValue &Src,
3550 SDValue &Clamp, SDValue &Omod) const {
3551 Src = In;
3552
3553 SDLoc DL(In);
3554 Clamp = CurDAG->getTargetConstant(Val: 0, DL, VT: MVT::i1);
3555 Omod = CurDAG->getTargetConstant(Val: 0, DL, VT: MVT::i1);
3556
3557 return true;
3558}
3559
bool AMDGPUDAGToDAGISel::SelectVOP3PMods(SDValue In, SDValue &Src,
                                         SDValue &SrcMods, bool IsDOT) const {
  // Match packed (VOP3P) source modifiers: per-half neg/neg_hi and op_sel
  // bits. Always returns true; Src/SrcMods describe whatever was matched.
  unsigned Mods = SISrcMods::NONE;
  Src = In;

  // TODO: Handle G_FSUB 0 as fneg
  if (Src.getOpcode() == ISD::FNEG) {
    // Whole-vector fneg negates both halves.
    Mods ^= (SISrcMods::NEG | SISrcMods::NEG_HI);
    Src = Src.getOperand(i: 0);
  }

  if (Src.getOpcode() == ISD::BUILD_VECTOR && Src.getNumOperands() == 2 &&
      (!IsDOT || !Subtarget->hasDOTOpSelHazard())) {
    // Remember the mods matched so far in case the per-element match below
    // does not pan out.
    unsigned VecMods = Mods;

    SDValue Lo = stripBitcast(Val: Src.getOperand(i: 0));
    SDValue Hi = stripBitcast(Val: Src.getOperand(i: 1));

    if (Lo.getOpcode() == ISD::FNEG) {
      Lo = stripBitcast(Val: Lo.getOperand(i: 0));
      Mods ^= SISrcMods::NEG;
    }

    if (Hi.getOpcode() == ISD::FNEG) {
      Hi = stripBitcast(Val: Hi.getOperand(i: 0));
      Mods ^= SISrcMods::NEG_HI;
    }

    // When a half is really the high 16 bits of a dword, select it via
    // op_sel instead of materializing an extract.
    if (isExtractHiElt(In: Lo, Out&: Lo))
      Mods |= SISrcMods::OP_SEL_0;

    if (isExtractHiElt(In: Hi, Out&: Hi))
      Mods |= SISrcMods::OP_SEL_1;

    unsigned VecSize = Src.getValueSizeInBits();
    Lo = stripExtractLoElt(In: Lo);
    Hi = stripExtractLoElt(In: Hi);

    // Stripping extracts can leave a value wider than the packed vector;
    // take the low subregister to restore the expected width.
    if (Lo.getValueSizeInBits() > VecSize) {
      Lo = CurDAG->getTargetExtractSubreg(
          SRIdx: (VecSize > 32) ? AMDGPU::sub0_sub1 : AMDGPU::sub0, DL: SDLoc(In),
          VT: MVT::getIntegerVT(BitWidth: VecSize), Operand: Lo);
    }

    if (Hi.getValueSizeInBits() > VecSize) {
      Hi = CurDAG->getTargetExtractSubreg(
          SRIdx: (VecSize > 32) ? AMDGPU::sub0_sub1 : AMDGPU::sub0, DL: SDLoc(In),
          VT: MVT::getIntegerVT(BitWidth: VecSize), Operand: Hi);
    }

    assert(Lo.getValueSizeInBits() <= VecSize &&
           Hi.getValueSizeInBits() <= VecSize);

    if (Lo == Hi && !isInlineImmediate(N: Lo.getNode())) {
      // Really a scalar input. Just select from the low half of the register to
      // avoid packing.

      if (VecSize == Lo.getValueSizeInBits()) {
        Src = Lo;
      } else if (VecSize == 32) {
        Src = createVOP3PSrc32FromLo16(Lo, Src, CurDAG, Subtarget);
      } else {
        assert(Lo.getValueSizeInBits() == 32 && VecSize == 64);

        // Widen the 32-bit scalar to 64 bits with an undef high half.
        SDLoc SL(In);
        SDValue Undef = SDValue(
            CurDAG->getMachineNode(Opcode: TargetOpcode::IMPLICIT_DEF, dl: SL,
                                   VT: Lo.getValueType()), 0);
        // Match the register bank of the scalar input.
        auto RC = Lo->isDivergent() ? AMDGPU::VReg_64RegClassID
                                    : AMDGPU::SReg_64RegClassID;
        const SDValue Ops[] = {
          CurDAG->getTargetConstant(Val: RC, DL: SL, VT: MVT::i32),
          Lo, CurDAG->getTargetConstant(Val: AMDGPU::sub0, DL: SL, VT: MVT::i32),
          Undef, CurDAG->getTargetConstant(Val: AMDGPU::sub1, DL: SL, VT: MVT::i32) };

        Src = SDValue(CurDAG->getMachineNode(Opcode: TargetOpcode::REG_SEQUENCE, dl: SL,
                                             VT: Src.getValueType(), Ops), 0);
      }
      SrcMods = CurDAG->getTargetConstant(Val: Mods, DL: SDLoc(In), VT: MVT::i32);
      return true;
    }

    // A 64-bit splat of a 32-bit inlinable FP constant can be encoded as a
    // literal directly.
    if (VecSize == 64 && Lo == Hi && isa<ConstantFPSDNode>(Val: Lo)) {
      uint64_t Lit = cast<ConstantFPSDNode>(Val&: Lo)->getValueAPF()
                      .bitcastToAPInt().getZExtValue();
      if (AMDGPU::isInlinableLiteral32(Literal: Lit, HasInv2Pi: Subtarget->hasInv2PiInlineImm())) {
        Src = CurDAG->getTargetConstant(Val: Lit, DL: SDLoc(In), VT: MVT::i64);
        SrcMods = CurDAG->getTargetConstant(Val: Mods, DL: SDLoc(In), VT: MVT::i32);
        return true;
      }
    }

    // Per-element match failed; fall through with only the whole-vector mods.
    Mods = VecMods;
  } else if (Src.getOpcode() == ISD::VECTOR_SHUFFLE &&
             Src.getNumOperands() == 2) {

    // TODO: We should repeat the build_vector source check above for the
    // vector_shuffle for negates and casts of individual elements.

    auto *SVN = cast<ShuffleVectorSDNode>(Val&: Src);
    ArrayRef<int> Mask = SVN->getMask();

    if (Mask[0] < 2 && Mask[1] < 2) {
      // src1 should be undef.
      SDValue ShuffleSrc = SVN->getOperand(Num: 0);

      if (ShuffleSrc.getOpcode() == ISD::FNEG) {
        ShuffleSrc = ShuffleSrc.getOperand(i: 0);
        Mods ^= (SISrcMods::NEG | SISrcMods::NEG_HI);
      }

      // Map the shuffle lanes onto the op_sel bits.
      if (Mask[0] == 1)
        Mods |= SISrcMods::OP_SEL_0;
      if (Mask[1] == 1)
        Mods |= SISrcMods::OP_SEL_1;

      Src = ShuffleSrc;
      SrcMods = CurDAG->getTargetConstant(Val: Mods, DL: SDLoc(In), VT: MVT::i32);
      return true;
    }
  }

  // Packed instructions do not have abs modifiers.
  Mods |= SISrcMods::OP_SEL_1;

  SrcMods = CurDAG->getTargetConstant(Val: Mods, DL: SDLoc(In), VT: MVT::i32);
  return true;
}
3688
3689bool AMDGPUDAGToDAGISel::SelectVOP3PModsDOT(SDValue In, SDValue &Src,
3690 SDValue &SrcMods) const {
3691 return SelectVOP3PMods(In, Src, SrcMods, IsDOT: true);
3692}
3693
3694bool AMDGPUDAGToDAGISel::SelectWMMAOpSelVOP3PMods(SDValue In,
3695 SDValue &Src) const {
3696 const ConstantSDNode *C = cast<ConstantSDNode>(Val&: In);
3697 assert(C->getAPIntValue().getBitWidth() == 1 && "expected i1 value");
3698
3699 unsigned Mods = SISrcMods::OP_SEL_1;
3700 unsigned SrcVal = C->getZExtValue();
3701 if (SrcVal == 1)
3702 Mods |= SISrcMods::OP_SEL_0;
3703
3704 Src = CurDAG->getTargetConstant(Val: Mods, DL: SDLoc(In), VT: MVT::i32);
3705 return true;
3706}
3707
3708static MachineSDNode *buildRegSequence32(SmallVectorImpl<SDValue> &Elts,
3709 llvm::SelectionDAG *CurDAG,
3710 const SDLoc &DL) {
3711 unsigned DstRegClass;
3712 EVT DstTy;
3713 switch (Elts.size()) {
3714 case 8:
3715 DstRegClass = AMDGPU::VReg_256RegClassID;
3716 DstTy = MVT::v8i32;
3717 break;
3718 case 4:
3719 DstRegClass = AMDGPU::VReg_128RegClassID;
3720 DstTy = MVT::v4i32;
3721 break;
3722 case 2:
3723 DstRegClass = AMDGPU::VReg_64RegClassID;
3724 DstTy = MVT::v2i32;
3725 break;
3726 default:
3727 llvm_unreachable("unhandled Reg sequence size");
3728 }
3729
3730 SmallVector<SDValue, 17> Ops;
3731 Ops.push_back(Elt: CurDAG->getTargetConstant(Val: DstRegClass, DL, VT: MVT::i32));
3732 for (unsigned i = 0; i < Elts.size(); ++i) {
3733 Ops.push_back(Elt: Elts[i]);
3734 Ops.push_back(Elt: CurDAG->getTargetConstant(
3735 Val: SIRegisterInfo::getSubRegFromChannel(Channel: i), DL, VT: MVT::i32));
3736 }
3737 return CurDAG->getMachineNode(Opcode: TargetOpcode::REG_SEQUENCE, dl: DL, VT: DstTy, Ops);
3738}
3739
3740static MachineSDNode *buildRegSequence16(SmallVectorImpl<SDValue> &Elts,
3741 llvm::SelectionDAG *CurDAG,
3742 const SDLoc &DL) {
3743 SmallVector<SDValue, 8> PackedElts;
3744 assert("unhandled Reg sequence size" &&
3745 (Elts.size() == 8 || Elts.size() == 16));
3746
3747 // Pack 16-bit elements in pairs into 32-bit register. If both elements are
3748 // unpacked from 32-bit source use it, otherwise pack them using v_perm.
3749 for (unsigned i = 0; i < Elts.size(); i += 2) {
3750 SDValue LoSrc = stripExtractLoElt(In: stripBitcast(Val: Elts[i]));
3751 SDValue HiSrc;
3752 if (isExtractHiElt(In: Elts[i + 1], Out&: HiSrc) && LoSrc == HiSrc) {
3753 PackedElts.push_back(Elt: HiSrc);
3754 } else {
3755 SDValue PackLoLo = CurDAG->getTargetConstant(Val: 0x05040100, DL, VT: MVT::i32);
3756 MachineSDNode *Packed =
3757 CurDAG->getMachineNode(Opcode: AMDGPU::V_PERM_B32_e64, dl: DL, VT: MVT::i32,
3758 Ops: {Elts[i + 1], Elts[i], PackLoLo});
3759 PackedElts.push_back(Elt: SDValue(Packed, 0));
3760 }
3761 }
3762
3763 return buildRegSequence32(Elts&: PackedElts, CurDAG, DL);
3764}
3765
3766static MachineSDNode *buildRegSequence(SmallVectorImpl<SDValue> &Elts,
3767 llvm::SelectionDAG *CurDAG,
3768 const SDLoc &DL, unsigned ElementSize) {
3769 if (ElementSize == 16)
3770 return buildRegSequence16(Elts, CurDAG, DL);
3771 if (ElementSize == 32)
3772 return buildRegSequence32(Elts, CurDAG, DL);
3773 llvm_unreachable("Unhandled element size");
3774}
3775
3776static void selectWMMAModsNegAbs(unsigned ModOpcode, unsigned &Mods,
3777 SmallVectorImpl<SDValue> &Elts, SDValue &Src,
3778 llvm::SelectionDAG *CurDAG, const SDLoc &DL,
3779 unsigned ElementSize) {
3780 if (ModOpcode == ISD::FNEG) {
3781 Mods |= SISrcMods::NEG;
3782 // Check if all elements also have abs modifier
3783 SmallVector<SDValue, 8> NegAbsElts;
3784 for (auto El : Elts) {
3785 if (El.getOpcode() != ISD::FABS)
3786 break;
3787 NegAbsElts.push_back(Elt: El->getOperand(Num: 0));
3788 }
3789 if (Elts.size() != NegAbsElts.size()) {
3790 // Neg
3791 Src = SDValue(buildRegSequence(Elts, CurDAG, DL, ElementSize), 0);
3792 } else {
3793 // Neg and Abs
3794 Mods |= SISrcMods::NEG_HI;
3795 Src = SDValue(buildRegSequence(Elts&: NegAbsElts, CurDAG, DL, ElementSize), 0);
3796 }
3797 } else {
3798 assert(ModOpcode == ISD::FABS);
3799 // Abs
3800 Mods |= SISrcMods::NEG_HI;
3801 Src = SDValue(buildRegSequence(Elts, CurDAG, DL, ElementSize), 0);
3802 }
3803}
3804
3805// Check all f16 elements for modifiers while looking through b32 and v2b16
3806// build vector, stop if element does not satisfy ModifierCheck.
3807static void
3808checkWMMAElementsModifiersF16(BuildVectorSDNode *BV,
3809 std::function<bool(SDValue)> ModifierCheck) {
3810 for (unsigned i = 0; i < BV->getNumOperands(); ++i) {
3811 if (auto *F16Pair =
3812 dyn_cast<BuildVectorSDNode>(Val: stripBitcast(Val: BV->getOperand(Num: i)))) {
3813 for (unsigned i = 0; i < F16Pair->getNumOperands(); ++i) {
3814 SDValue ElF16 = stripBitcast(Val: F16Pair->getOperand(Num: i));
3815 if (!ModifierCheck(ElF16))
3816 break;
3817 }
3818 }
3819 }
3820}
3821
bool AMDGPUDAGToDAGISel::SelectWMMAModsF16Neg(SDValue In, SDValue &Src,
                                              SDValue &SrcMods) const {
  // Fold a uniform fneg across all f16 (or v2f16) elements of a WMMA operand
  // into NEG/NEG_HI modifier bits. Always returns true; Src is rewritten to
  // the stripped elements only when every element carried the modifier.
  Src = In;
  unsigned Mods = SISrcMods::OP_SEL_1;

  // mods are on f16 elements
  if (auto *BV = dyn_cast<BuildVectorSDNode>(Val: stripBitcast(Val: In))) {
    SmallVector<SDValue, 8> EltsF16;

    checkWMMAElementsModifiersF16(BV, ModifierCheck: [&](SDValue Element) -> bool {
      if (Element.getOpcode() != ISD::FNEG)
        return false;
      EltsF16.push_back(Elt: Element.getOperand(i: 0));
      return true;
    });

    // All elements have neg modifier (each b32 operand holds two f16s, hence
    // the factor of two).
    if (BV->getNumOperands() * 2 == EltsF16.size()) {
      Src = SDValue(buildRegSequence16(Elts&: EltsF16, CurDAG, DL: SDLoc(In)), 0);
      Mods |= SISrcMods::NEG;
      Mods |= SISrcMods::NEG_HI;
    }
  }

  // mods are on v2f16 elements
  if (auto *BV = dyn_cast<BuildVectorSDNode>(Val: stripBitcast(Val: In))) {
    SmallVector<SDValue, 8> EltsV2F16;
    for (unsigned i = 0; i < BV->getNumOperands(); ++i) {
      SDValue ElV2f16 = stripBitcast(Val: BV->getOperand(Num: i));
      // Only fneg is matched here; bail at the first element without it.
      if (ElV2f16.getOpcode() != ISD::FNEG)
        break;
      EltsV2F16.push_back(Elt: ElV2f16.getOperand(i: 0));
    }

    // All pairs of elements have neg modifier
    if (BV->getNumOperands() == EltsV2F16.size()) {
      Src = SDValue(buildRegSequence32(Elts&: EltsV2F16, CurDAG, DL: SDLoc(In)), 0);
      Mods |= SISrcMods::NEG;
      Mods |= SISrcMods::NEG_HI;
    }
  }

  SrcMods = CurDAG->getTargetConstant(Val: Mods, DL: SDLoc(In), VT: MVT::i32);
  return true;
}
3868
bool AMDGPUDAGToDAGISel::SelectWMMAModsF16NegAbs(SDValue In, SDValue &Src,
                                                 SDValue &SrcMods) const {
  // Like SelectWMMAModsF16Neg, but matches either a uniform fneg or a
  // uniform fabs (chosen from the first element) across all f16/v2f16
  // elements of the operand. Always returns true.
  Src = In;
  unsigned Mods = SISrcMods::OP_SEL_1;
  // NOTE(review): ModOpcode is only assigned inside the scan loops below.
  // The element-count guards make an uninitialized use unlikely, but a
  // zero-operand build_vector would reach selectWMMAModsNegAbs with it
  // unset — presumably impossible here; confirm.
  unsigned ModOpcode;

  // mods are on f16 elements
  if (auto *BV = dyn_cast<BuildVectorSDNode>(Val: stripBitcast(Val: In))) {
    SmallVector<SDValue, 8> EltsF16;
    checkWMMAElementsModifiersF16(BV, ModifierCheck: [&](SDValue ElF16) -> bool {
      // Based on first element decide which mod we match, neg or abs
      if (EltsF16.empty())
        ModOpcode = (ElF16.getOpcode() == ISD::FNEG) ? ISD::FNEG : ISD::FABS;
      if (ElF16.getOpcode() != ModOpcode)
        return false;
      EltsF16.push_back(Elt: ElF16.getOperand(i: 0));
      return true;
    });

    // All elements have ModOpcode modifier
    if (BV->getNumOperands() * 2 == EltsF16.size())
      selectWMMAModsNegAbs(ModOpcode, Mods, Elts&: EltsF16, Src, CurDAG, DL: SDLoc(In),
                           ElementSize: 16);
  }

  // mods are on v2f16 elements
  if (auto *BV = dyn_cast<BuildVectorSDNode>(Val: stripBitcast(Val: In))) {
    SmallVector<SDValue, 8> EltsV2F16;

    for (unsigned i = 0; i < BV->getNumOperands(); ++i) {
      SDValue ElV2f16 = stripBitcast(Val: BV->getOperand(Num: i));
      // Based on first element decide which mod we match, neg or abs
      if (EltsV2F16.empty())
        ModOpcode = (ElV2f16.getOpcode() == ISD::FNEG) ? ISD::FNEG : ISD::FABS;
      if (ElV2f16->getOpcode() != ModOpcode)
        break;
      EltsV2F16.push_back(Elt: ElV2f16->getOperand(Num: 0));
    }

    // All elements have ModOpcode modifier
    if (BV->getNumOperands() == EltsV2F16.size())
      selectWMMAModsNegAbs(ModOpcode, Mods, Elts&: EltsV2F16, Src, CurDAG, DL: SDLoc(In),
                           ElementSize: 32);
  }

  SrcMods = CurDAG->getTargetConstant(Val: Mods, DL: SDLoc(In), VT: MVT::i32);
  return true;
}
3917
3918bool AMDGPUDAGToDAGISel::SelectWMMAModsF32NegAbs(SDValue In, SDValue &Src,
3919 SDValue &SrcMods) const {
3920 Src = In;
3921 unsigned Mods = SISrcMods::OP_SEL_1;
3922 SmallVector<SDValue, 8> EltsF32;
3923
3924 if (auto *BV = dyn_cast<BuildVectorSDNode>(Val: stripBitcast(Val: In))) {
3925 assert(BV->getNumOperands() > 0);
3926 // Based on first element decide which mod we match, neg or abs
3927 SDValue ElF32 = stripBitcast(Val: BV->getOperand(Num: 0));
3928 unsigned ModOpcode =
3929 (ElF32.getOpcode() == ISD::FNEG) ? ISD::FNEG : ISD::FABS;
3930 for (unsigned i = 0; i < BV->getNumOperands(); ++i) {
3931 SDValue ElF32 = stripBitcast(Val: BV->getOperand(Num: i));
3932 if (ElF32.getOpcode() != ModOpcode)
3933 break;
3934 EltsF32.push_back(Elt: ElF32.getOperand(i: 0));
3935 }
3936
3937 // All elements had ModOpcode modifier
3938 if (BV->getNumOperands() == EltsF32.size())
3939 selectWMMAModsNegAbs(ModOpcode, Mods, Elts&: EltsF32, Src, CurDAG, DL: SDLoc(In),
3940 ElementSize: 32);
3941 }
3942
3943 SrcMods = CurDAG->getTargetConstant(Val: Mods, DL: SDLoc(In), VT: MVT::i32);
3944 return true;
3945}
3946
bool AMDGPUDAGToDAGISel::SelectWMMAVISrc(SDValue In, SDValue &Src) const {
  // Match a WMMA operand that is a splat of an inline-immediate constant,
  // so it can be encoded directly instead of occupying a register.
  if (auto *BV = dyn_cast<BuildVectorSDNode>(Val&: In)) {
    BitVector UndefElements;
    if (SDValue Splat = BV->getSplatValue(UndefElements: &UndefElements))
      if (isInlineImmediate(N: Splat.getNode())) {
        // Emit the splatted scalar as a 32-bit immediate, whether it is an
        // integer or an FP constant (FP is bitcast to its integer pattern).
        if (const ConstantSDNode *C = dyn_cast<ConstantSDNode>(Val&: Splat)) {
          unsigned Imm = C->getAPIntValue().getSExtValue();
          Src = CurDAG->getTargetConstant(Val: Imm, DL: SDLoc(In), VT: MVT::i32);
          return true;
        }
        if (const ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(Val&: Splat)) {
          unsigned Imm = C->getValueAPF().bitcastToAPInt().getSExtValue();
          Src = CurDAG->getTargetConstant(Val: Imm, DL: SDLoc(In), VT: MVT::i32);
          return true;
        }
        llvm_unreachable("unhandled Constant node");
      }
  }

  // 16 bit splat: look through a 32-bit splat whose element is itself a
  // splat of two 16-bit values.
  SDValue SplatSrc32 = stripBitcast(Val: In);
  if (auto *SplatSrc32BV = dyn_cast<BuildVectorSDNode>(Val&: SplatSrc32))
    if (SDValue Splat32 = SplatSrc32BV->getSplatValue()) {
      SDValue SplatSrc16 = stripBitcast(Val: Splat32);
      if (auto *SplatSrc16BV = dyn_cast<BuildVectorSDNode>(Val&: SplatSrc16))
        if (SDValue Splat = SplatSrc16BV->getSplatValue()) {
          const SIInstrInfo *TII = Subtarget->getInstrInfo();
          // Extract the raw 16-bit pattern from either constant kind.
          std::optional<APInt> RawValue;
          if (const ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(Val&: Splat))
            RawValue = C->getValueAPF().bitcastToAPInt();
          else if (const ConstantSDNode *C = dyn_cast<ConstantSDNode>(Val&: Splat))
            RawValue = C->getAPIntValue();

          if (RawValue.has_value()) {
            EVT VT = In.getValueType().getScalarType();
            if (VT.getSimpleVT() == MVT::f16 || VT.getSimpleVT() == MVT::bf16) {
              // Re-interpret the bits in the scalar's FP semantics before
              // asking whether it is an inline constant.
              APFloat FloatVal(VT.getSimpleVT() == MVT::f16
                                   ? APFloatBase::IEEEhalf()
                                   : APFloatBase::BFloat(),
                               RawValue.value());
              if (TII->isInlineConstant(Imm: FloatVal)) {
                Src = CurDAG->getTargetConstant(Val: RawValue.value(), DL: SDLoc(In),
                                                VT: MVT::i16);
                return true;
              }
            } else if (VT.getSimpleVT() == MVT::i16) {
              if (TII->isInlineConstant(Imm: RawValue.value())) {
                Src = CurDAG->getTargetConstant(Val: RawValue.value(), DL: SDLoc(In),
                                                VT: MVT::i16);
                return true;
              }
            } else
              llvm_unreachable("unknown 16-bit type");
          }
        }
    }

  return false;
}
4006
4007bool AMDGPUDAGToDAGISel::SelectSWMMACIndex8(SDValue In, SDValue &Src,
4008 SDValue &IndexKey) const {
4009 unsigned Key = 0;
4010 Src = In;
4011
4012 if (In.getOpcode() == ISD::SRL) {
4013 const llvm::SDValue &ShiftSrc = In.getOperand(i: 0);
4014 ConstantSDNode *ShiftAmt = dyn_cast<ConstantSDNode>(Val: In.getOperand(i: 1));
4015 if (ShiftSrc.getValueType().getSizeInBits() == 32 && ShiftAmt &&
4016 ShiftAmt->getZExtValue() % 8 == 0) {
4017 Key = ShiftAmt->getZExtValue() / 8;
4018 Src = ShiftSrc;
4019 }
4020 }
4021
4022 IndexKey = CurDAG->getTargetConstant(Val: Key, DL: SDLoc(In), VT: MVT::i32);
4023 return true;
4024}
4025
4026bool AMDGPUDAGToDAGISel::SelectSWMMACIndex16(SDValue In, SDValue &Src,
4027 SDValue &IndexKey) const {
4028 unsigned Key = 0;
4029 Src = In;
4030
4031 if (In.getOpcode() == ISD::SRL) {
4032 const llvm::SDValue &ShiftSrc = In.getOperand(i: 0);
4033 ConstantSDNode *ShiftAmt = dyn_cast<ConstantSDNode>(Val: In.getOperand(i: 1));
4034 if (ShiftSrc.getValueType().getSizeInBits() == 32 && ShiftAmt &&
4035 ShiftAmt->getZExtValue() == 16) {
4036 Key = 1;
4037 Src = ShiftSrc;
4038 }
4039 }
4040
4041 IndexKey = CurDAG->getTargetConstant(Val: Key, DL: SDLoc(In), VT: MVT::i32);
4042 return true;
4043}
4044
bool AMDGPUDAGToDAGISel::SelectSWMMACIndex32(SDValue In, SDValue &Src,
                                             SDValue &IndexKey) const {
  // Match an index operand that selects the high 32-bit lane of a 64-bit
  // vector, encoding it as index_key = 1 instead of an explicit extract.
  // Always returns true; falls back to the value as-is with key 0.
  unsigned Key = 0;
  Src = In;

  SDValue InI32;

  // Peel off the i32 value feeding the index: either a zero/any-extend of a
  // 32-bit value, or a bitcast of (build_vector x, 0).
  if (In.getOpcode() == ISD::ANY_EXTEND || In.getOpcode() == ISD::ZERO_EXTEND) {
    const SDValue &ExtendSrc = In.getOperand(i: 0);
    if (ExtendSrc.getValueSizeInBits() == 32)
      InI32 = ExtendSrc;
  } else if (In->getOpcode() == ISD::BITCAST) {
    const SDValue &CastSrc = In.getOperand(i: 0);
    if (CastSrc.getOpcode() == ISD::BUILD_VECTOR &&
        CastSrc.getOperand(i: 0).getValueSizeInBits() == 32) {
      ConstantSDNode *Zero = dyn_cast<ConstantSDNode>(Val: CastSrc.getOperand(i: 1));
      if (Zero && Zero->getZExtValue() == 0)
        InI32 = CastSrc.getOperand(i: 0);
    }
  }

  // If that i32 is lane 1 of a 64-bit vector, use the vector with key 1.
  if (InI32 && InI32.getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
    const SDValue &ExtractVecEltSrc = InI32.getOperand(i: 0);
    ConstantSDNode *EltIdx = dyn_cast<ConstantSDNode>(Val: InI32.getOperand(i: 1));
    if (ExtractVecEltSrc.getValueSizeInBits() == 64 && EltIdx &&
        EltIdx->getZExtValue() == 1) {
      Key = 1;
      Src = ExtractVecEltSrc;
    }
  }

  IndexKey = CurDAG->getTargetConstant(Val: Key, DL: SDLoc(In), VT: MVT::i32);
  return true;
}
4079
4080bool AMDGPUDAGToDAGISel::SelectVOP3OpSel(SDValue In, SDValue &Src,
4081 SDValue &SrcMods) const {
4082 Src = In;
4083 // FIXME: Handle op_sel
4084 SrcMods = CurDAG->getTargetConstant(Val: 0, DL: SDLoc(In), VT: MVT::i32);
4085 return true;
4086}
4087
4088bool AMDGPUDAGToDAGISel::SelectVOP3OpSelMods(SDValue In, SDValue &Src,
4089 SDValue &SrcMods) const {
4090 // FIXME: Handle op_sel
4091 return SelectVOP3Mods(In, Src, SrcMods);
4092}
4093
// Match lowered fpext from bf16 to f32. This is a bit operation extending
// a 16-bit value with 16-bit of zeroes at LSB:
//
// 1. (f32 (bitcast (build_vector (i16 0), (i16 (bitcast bf16:val)))))
// 2. (f32 (bitcast (and i32:val, 0xffff0000))) -> IsExtractHigh = true
// 3. (f32 (bitcast (shl i32:val, 16))) -> IsExtractHigh = false
//
// Returns the bf16/i32 source value on a match, or an empty SDValue();
// IsExtractHigh tells the caller whether op_sel must pick the high half.
static SDValue matchBF16FPExtendLike(SDValue Op, bool &IsExtractHigh) {
  if (Op.getValueType() != MVT::f32 || Op.getOpcode() != ISD::BITCAST)
    return SDValue();
  Op = Op.getOperand(i: 0);

  IsExtractHigh = false;
  // Case 1: v2i16 build_vector with a zero in the low element.
  if (Op.getValueType() == MVT::v2i16 && Op.getOpcode() == ISD::BUILD_VECTOR) {
    auto Low16 = dyn_cast<ConstantSDNode>(Val: Op.getOperand(i: 0));
    if (!Low16 || !Low16->isZero())
      return SDValue();
    Op = stripBitcast(Val: Op.getOperand(i: 1));
    if (Op.getValueType() != MVT::bf16)
      return SDValue();
    return Op;
  }

  if (Op.getValueType() != MVT::i32)
    return SDValue();

  // Case 2: masking off the low 16 bits keeps the value in the high half.
  if (Op.getOpcode() == ISD::AND) {
    if (auto Mask = dyn_cast<ConstantSDNode>(Val: Op.getOperand(i: 1))) {
      if (Mask->getZExtValue() == 0xffff0000) {
        IsExtractHigh = true;
        return Op.getOperand(i: 0);
      }
    }
    return SDValue();
  }

  // Case 3: shifting left by 16 moves the low half into the high half.
  if (Op.getOpcode() == ISD::SHL) {
    if (auto Amt = dyn_cast<ConstantSDNode>(Val: Op.getOperand(i: 1))) {
      if (Amt->getZExtValue() == 16)
        return Op.getOperand(i: 0);
    }
  }

  return SDValue();
}
4138
// The return value is not whether the match is possible (which it always is),
// but whether or not a conversion is really used.
bool AMDGPUDAGToDAGISel::SelectVOP3PMadMixModsImpl(SDValue In, SDValue &Src,
                                                   unsigned &Mods,
                                                   MVT VT) const {
  Mods = 0;
  // First strip any outer fneg/fabs into Mods.
  SelectVOP3ModsImpl(In, Src, Mods);

  bool IsExtractHigh = false;
  if (Src.getOpcode() == ISD::FP_EXTEND) {
    Src = Src.getOperand(i: 0);
  } else if (VT == MVT::bf16) {
    // bf16 extends are lowered to bit operations; recognize those shapes.
    SDValue B16 = matchBF16FPExtendLike(Op: Src, IsExtractHigh);
    if (!B16)
      return false;
    Src = B16;
  } else
    return false;

  // The extend source must have the requested element type (bf16 may also
  // arrive as a raw i32 from the bit-pattern match above).
  if (Src.getValueType() != VT &&
      (VT != MVT::bf16 || Src.getValueType() != MVT::i32))
    return false;

  Src = stripBitcast(Val: Src);

  // Be careful about folding modifiers if we already have an abs. fneg is
  // applied last, so we don't want to apply an earlier fneg.
  if ((Mods & SISrcMods::ABS) == 0) {
    unsigned ModsTmp;
    SelectVOP3ModsImpl(In: Src, Src, Mods&: ModsTmp);

    // An inner fneg composes (XOR) with any outer fneg already matched.
    if ((ModsTmp & SISrcMods::NEG) != 0)
      Mods ^= SISrcMods::NEG;

    if ((ModsTmp & SISrcMods::ABS) != 0)
      Mods |= SISrcMods::ABS;
  }

  // op_sel/op_sel_hi decide the source type and source.
  // If the source's op_sel_hi is set, it indicates to do a conversion from
  // fp16. If the sources's op_sel is set, it picks the high half of the source
  // register.

  Mods |= SISrcMods::OP_SEL_1;
  if (Src.getValueSizeInBits() == 16) {
    if (isExtractHiElt(In: Src, Out&: Src)) {
      Mods |= SISrcMods::OP_SEL_0;

      // TODO: Should we try to look for neg/abs here?
      return true;
    }

    // A truncate of an i32 can use the low half of the wider value directly.
    if (Src.getOpcode() == ISD::TRUNCATE &&
        Src.getOperand(i: 0).getValueType() == MVT::i32) {
      Src = Src.getOperand(i: 0);
      return true;
    }

    if (Subtarget->useRealTrue16Insts())
      // In true16 mode, pack src to a 32bit
      Src = createVOP3PSrc32FromLo16(Lo: Src, Src: In, CurDAG, Subtarget);
  } else if (IsExtractHigh)
    Mods |= SISrcMods::OP_SEL_0;

  return true;
}
4205
4206bool AMDGPUDAGToDAGISel::SelectVOP3PMadMixModsExt(SDValue In, SDValue &Src,
4207 SDValue &SrcMods) const {
4208 unsigned Mods = 0;
4209 if (!SelectVOP3PMadMixModsImpl(In, Src, Mods, VT: MVT::f16))
4210 return false;
4211 SrcMods = CurDAG->getTargetConstant(Val: Mods, DL: SDLoc(In), VT: MVT::i32);
4212 return true;
4213}
4214
4215bool AMDGPUDAGToDAGISel::SelectVOP3PMadMixMods(SDValue In, SDValue &Src,
4216 SDValue &SrcMods) const {
4217 unsigned Mods = 0;
4218 SelectVOP3PMadMixModsImpl(In, Src, Mods, VT: MVT::f16);
4219 SrcMods = CurDAG->getTargetConstant(Val: Mods, DL: SDLoc(In), VT: MVT::i32);
4220 return true;
4221}
4222
4223bool AMDGPUDAGToDAGISel::SelectVOP3PMadMixBF16ModsExt(SDValue In, SDValue &Src,
4224 SDValue &SrcMods) const {
4225 unsigned Mods = 0;
4226 if (!SelectVOP3PMadMixModsImpl(In, Src, Mods, VT: MVT::bf16))
4227 return false;
4228 SrcMods = CurDAG->getTargetConstant(Val: Mods, DL: SDLoc(In), VT: MVT::i32);
4229 return true;
4230}
4231
4232bool AMDGPUDAGToDAGISel::SelectVOP3PMadMixBF16Mods(SDValue In, SDValue &Src,
4233 SDValue &SrcMods) const {
4234 unsigned Mods = 0;
4235 SelectVOP3PMadMixModsImpl(In, Src, Mods, VT: MVT::bf16);
4236 SrcMods = CurDAG->getTargetConstant(Val: Mods, DL: SDLoc(In), VT: MVT::i32);
4237 return true;
4238}
4239
// Match BITOP3 operation and return a number of matched instructions plus
// truth table.
static std::pair<unsigned, uint8_t> BitOp3_Op(SDValue In,
                                              SmallVectorImpl<SDValue> &Src) {
  // Recursively walks a tree of and/or/xor nodes, collecting up to three
  // distinct leaf operands into Src and computing the 8-entry truth table
  // over those operands. Returns {0, 0} when the tree does not fit.
  unsigned NumOpcodes = 0;
  uint8_t LHSBits, RHSBits;

  // Assign (or look up) the truth-table bit pattern for one operand.
  auto getOperandBits = [&Src, In](SDValue Op, uint8_t &Bits) -> bool {
    // Define truth table given Src0, Src1, Src2 bits permutations:
    // 0 0 0
    // 0 0 1
    // 0 1 0
    // 0 1 1
    // 1 0 0
    // 1 0 1
    // 1 1 0
    // 1 1 1
    const uint8_t SrcBits[3] = { 0xf0, 0xcc, 0xaa };

    // Constant all-ones/all-zeros operands need no source slot.
    if (auto *C = dyn_cast<ConstantSDNode>(Val&: Op)) {
      if (C->isAllOnes()) {
        Bits = 0xff;
        return true;
      }
      if (C->isZero()) {
        Bits = 0;
        return true;
      }
    }

    for (unsigned I = 0; I < Src.size(); ++I) {
      // Try to find existing reused operand
      if (Src[I] == Op) {
        Bits = SrcBits[I];
        return true;
      }
      // Try to replace parent operator
      if (Src[I] == In) {
        Bits = SrcBits[I];
        Src[I] = Op;
        return true;
      }
    }

    if (Src.size() == 3) {
      // No room left for operands. Try one last time, there can be a 'not' of
      // one of our source operands. In this case we can compute the bits
      // without growing Src vector.
      if (Op.getOpcode() == ISD::XOR) {
        if (auto *C = dyn_cast<ConstantSDNode>(Val: Op.getOperand(i: 1))) {
          if (C->isAllOnes()) {
            SDValue LHS = Op.getOperand(i: 0);
            for (unsigned I = 0; I < Src.size(); ++I) {
              if (Src[I] == LHS) {
                Bits = ~SrcBits[I];
                return true;
              }
            }
          }
        }
      }

      return false;
    }

    // Claim a fresh source slot for this operand.
    Bits = SrcBits[Src.size()];
    Src.push_back(Elt: Op);
    return true;
  };

  switch (In.getOpcode()) {
  case ISD::AND:
  case ISD::OR:
  case ISD::XOR: {
    SDValue LHS = In.getOperand(i: 0);
    SDValue RHS = In.getOperand(i: 1);

    // Save Src so it can be restored if either operand fails to fit.
    SmallVector<SDValue, 3> Backup(Src.begin(), Src.end());
    if (!getOperandBits(LHS, LHSBits) ||
        !getOperandBits(RHS, RHSBits)) {
      Src = std::move(Backup);
      return std::make_pair(x: 0, y: 0);
    }

    // Recursion is naturally limited by the size of the operand vector.
    auto Op = BitOp3_Op(In: LHS, Src);
    if (Op.first) {
      NumOpcodes += Op.first;
      LHSBits = Op.second;
    }

    Op = BitOp3_Op(In: RHS, Src);
    if (Op.first) {
      NumOpcodes += Op.first;
      RHSBits = Op.second;
    }
    break;
  }
  default:
    return std::make_pair(x: 0, y: 0);
  }

  // Combine the operand tables according to this node's opcode.
  uint8_t TTbl;
  switch (In.getOpcode()) {
  case ISD::AND:
    TTbl = LHSBits & RHSBits;
    break;
  case ISD::OR:
    TTbl = LHSBits | RHSBits;
    break;
  case ISD::XOR:
    TTbl = LHSBits ^ RHSBits;
    break;
  default:
    break;
  }

  return std::make_pair(x: NumOpcodes + 1, y&: TTbl);
}
4359
// Try to select \p In as a single BITOP3 instruction: match a tree of
// AND/OR/XOR over at most three distinct source values (computed by
// BitOp3_Op), returning the three sources and the 8-bit truth-table
// immediate. Returns false if the match is not profitable or not possible.
bool AMDGPUDAGToDAGISel::SelectBITOP3(SDValue In, SDValue &Src0, SDValue &Src1,
                                      SDValue &Src2, SDValue &Tbl) const {
  SmallVector<SDValue, 3> Src;
  uint8_t TTbl;
  unsigned NumOpcodes;

  std::tie(args&: NumOpcodes, args&: TTbl) = BitOp3_Op(In, Src);

  // Src.empty() case can happen if all operands are all zero or all ones.
  // Normally it shall be optimized out before reaching this.
  // A single matched opcode (NumOpcodes < 2) is not worth a BITOP3 either.
  if (NumOpcodes < 2 || Src.empty())
    return false;

  // For a uniform case threshold should be higher to account for moves between
  // VGPRs and SGPRs. It needs one operand in a VGPR, rest two can be in SGPRs
  // and a readfirstlane after.
  if (NumOpcodes < 4 && !In->isDivergent())
    return false;

  if (NumOpcodes == 2 && In.getValueType() == MVT::i32) {
    // Avoid using BITOP3 for OR3, XOR3, AND_OR. This is not faster but makes
    // asm more readable. This cannot be modeled with AddedComplexity because
    // selector does not know how many operations did we match.
    if ((In.getOpcode() == ISD::XOR || In.getOpcode() == ISD::OR) &&
        (In.getOperand(i: 0).getOpcode() == In.getOpcode() ||
         In.getOperand(i: 1).getOpcode() == In.getOpcode()))
      return false;

    if (In.getOpcode() == ISD::OR &&
        (In.getOperand(i: 0).getOpcode() == ISD::AND ||
         In.getOperand(i: 1).getOpcode() == ISD::AND))
      return false;
  }

  // Last operand can be ignored, turning a ternary operation into a binary.
  // For example: (~a & b & c) | (~a & b & ~c) -> (~a & b). We can replace
  // 'c' with 'a' here without changing the answer. In some pathological
  // cases it should be possible to get an operation with a single operand
  // too if optimizer would not catch it.
  while (Src.size() < 3)
    Src.push_back(Elt: Src[0]);

  Src0 = Src[0];
  Src1 = Src[1];
  Src2 = Src[2];

  // The truth table goes out as an i32 target constant operand.
  Tbl = CurDAG->getTargetConstant(Val: TTbl, DL: SDLoc(In), VT: MVT::i32);
  return true;
}
4409
4410SDValue AMDGPUDAGToDAGISel::getHi16Elt(SDValue In) const {
4411 if (In.isUndef())
4412 return CurDAG->getUNDEF(VT: MVT::i32);
4413
4414 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Val&: In)) {
4415 SDLoc SL(In);
4416 return CurDAG->getConstant(Val: C->getZExtValue() << 16, DL: SL, VT: MVT::i32);
4417 }
4418
4419 if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(Val&: In)) {
4420 SDLoc SL(In);
4421 return CurDAG->getConstant(
4422 Val: C->getValueAPF().bitcastToAPInt().getZExtValue() << 16, DL: SL, VT: MVT::i32);
4423 }
4424
4425 SDValue Src;
4426 if (isExtractHiElt(In, Out&: Src))
4427 return Src;
4428
4429 return SDValue();
4430}
4431
// Decide whether the immediate \p N should be materialized into a VGPR
// rather than an SGPR: returns true only if at least one (non-commutable-away)
// use strictly requires a VGPR, and we inspected at most 10 uses. With more
// than 10 uses we conservatively return false.
bool AMDGPUDAGToDAGISel::isVGPRImm(const SDNode * N) const {
  assert(CurDAG->getTarget().getTargetTriple().isAMDGCN());

  const SIRegisterInfo *SIRI = Subtarget->getRegisterInfo();
  const SIInstrInfo *SII = Subtarget->getInstrInfo();

  unsigned Limit = 0;
  bool AllUsesAcceptSReg = true;
  // Walk the uses, bailing out after 10 to bound compile time.
  for (SDNode::use_iterator U = N->use_begin(), E = SDNode::use_end();
       Limit < 10 && U != E; ++U, ++Limit) {
    const TargetRegisterClass *RC =
        getOperandRegClass(N: U->getUser(), OpNo: U->getOperandNo());

    // If the register class is unknown, it could be an unknown
    // register class that needs to be an SGPR, e.g. an inline asm
    // constraint
    if (!RC || SIRI->isSGPRClass(RC))
      return false;

    // VS_* classes accept either an SGPR or a VGPR; anything else means this
    // operand position cannot take an SGPR directly.
    if (RC != &AMDGPU::VS_32RegClass && RC != &AMDGPU::VS_64RegClass &&
        RC != &AMDGPU::VS_64_Align2RegClass) {
      AllUsesAcceptSReg = false;
      SDNode *User = U->getUser();
      if (User->isMachineOpcode()) {
        unsigned Opc = User->getMachineOpcode();
        const MCInstrDesc &Desc = SII->get(Opcode: Opc);
        if (Desc.isCommutable()) {
          // The instruction is commutable: check whether swapping this
          // operand into its commuted position would land it in a VS_*
          // (SGPR-accepting) class after all.
          unsigned OpIdx = Desc.getNumDefs() + U->getOperandNo();
          unsigned CommuteIdx1 = TargetInstrInfo::CommuteAnyOperandIndex;
          if (SII->findCommutedOpIndices(Desc, SrcOpIdx0&: OpIdx, SrcOpIdx1&: CommuteIdx1)) {
            unsigned CommutedOpNo = CommuteIdx1 - Desc.getNumDefs();
            const TargetRegisterClass *CommutedRC =
                getOperandRegClass(N: U->getUser(), OpNo: CommutedOpNo);
            if (CommutedRC == &AMDGPU::VS_32RegClass ||
                CommutedRC == &AMDGPU::VS_64RegClass ||
                CommutedRC == &AMDGPU::VS_64_Align2RegClass)
              AllUsesAcceptSReg = true;
          }
        }
      }
      // If "AllUsesAcceptSReg == false" so far we haven't succeeded
      // commuting current user. This means have at least one use
      // that strictly require VGPR. Thus, we will not attempt to commute
      // other user instructions.
      if (!AllUsesAcceptSReg)
        break;
    }
  }
  // Only claim a VGPR immediate when a strict-VGPR use was found within the
  // 10-use inspection limit.
  return !AllUsesAcceptSReg && (Limit < 10);
}
4482
// Return true if load \p N is uniform and safe to select as a scalar
// (SMEM-style) load: it must be non-divergent (or have a uniform MMO), have a
// known size, sufficient alignment, and come from invariant or constant
// memory (or, optionally, a simple global load proven unclobbered).
bool AMDGPUDAGToDAGISel::isUniformLoad(const SDNode *N) const {
  const auto *Ld = cast<LoadSDNode>(Val: N);
  const MachineMemOperand *MMO = Ld->getMemOperand();

  // FIXME: We ought to be able to take the direct isDivergent result. We
  // cannot rely on the MMO for a uniformity check, and should stop using
  // it. This is a hack for 2 ways that the IR divergence analysis is superior
  // to the DAG divergence: Recognizing shift-of-workitem-id as always
  // uniform, and isSingleLaneExecution. These should be handled in the DAG
  // version, and then this can be dropped.
  if (Ld->isDivergent() && !AMDGPU::isUniformMMO(MMO))
    return false;

  // Require a known size, alignment of at least min(size, 4) bytes, and a
  // memory location that cannot change underneath the load: invariant memory,
  // a constant address space, or (when scalarizing globals is enabled) a
  // simple global-address-space load with no clobbering mem operand.
  return MMO->getSize().hasValue() &&
         Ld->getAlign() >=
             Align(std::min(a: MMO->getSize().getValue().getKnownMinValue(),
                            b: uint64_t(4))) &&
         (MMO->isInvariant() ||
          (Ld->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS ||
           Ld->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS_32BIT) ||
          (Subtarget->getScalarizeGlobalBehavior() &&
           Ld->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS &&
           Ld->isSimple() &&
           static_cast<const SITargetLowering *>(getTargetLowering())
               ->isMemOpHasNoClobberedMemOperand(N)));
}
4509
4510void AMDGPUDAGToDAGISel::PostprocessISelDAG() {
4511 const AMDGPUTargetLowering& Lowering =
4512 *static_cast<const AMDGPUTargetLowering*>(getTargetLowering());
4513 bool IsModified = false;
4514 do {
4515 IsModified = false;
4516
4517 // Go over all selected nodes and try to fold them a bit more
4518 SelectionDAG::allnodes_iterator Position = CurDAG->allnodes_begin();
4519 while (Position != CurDAG->allnodes_end()) {
4520 SDNode *Node = &*Position++;
4521 MachineSDNode *MachineNode = dyn_cast<MachineSDNode>(Val: Node);
4522 if (!MachineNode)
4523 continue;
4524
4525 SDNode *ResNode = Lowering.PostISelFolding(N: MachineNode, DAG&: *CurDAG);
4526 if (ResNode != Node) {
4527 if (ResNode)
4528 ReplaceUses(F: Node, T: ResNode);
4529 IsModified = true;
4530 }
4531 }
4532 CurDAG->RemoveDeadNodes();
4533 } while (IsModified);
4534}
4535
// Legacy pass-manager wrapper: constructs the underlying AMDGPUDAGToDAGISel
// and hands ownership to the SelectionDAGISelLegacy base.
AMDGPUDAGToDAGISelLegacy::AMDGPUDAGToDAGISelLegacy(TargetMachine &TM,
                                                   CodeGenOptLevel OptLevel)
    : SelectionDAGISelLegacy(
          ID, std::make_unique<AMDGPUDAGToDAGISel>(args&: TM, args&: OptLevel)) {}

// Pass identification anchor for the legacy pass manager.
char AMDGPUDAGToDAGISelLegacy::ID = 0;
4542