//===-- AMDGPUISelDAGToDAG.cpp - A dag to dag inst selector for AMDGPU ----===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//==-----------------------------------------------------------------------===//
//
/// \file
/// Defines an instruction selector for the AMDGPU target.
//
//===----------------------------------------------------------------------===//

#include "AMDGPUISelDAGToDAG.h"
#include "AMDGPU.h"
#include "AMDGPUInstrInfo.h"
#include "AMDGPUSubtarget.h"
#include "AMDGPUTargetMachine.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "MCTargetDesc/R600MCTargetDesc.h"
#include "R600RegisterInfo.h"
#include "SIISelLowering.h"
#include "SIMachineFunctionInfo.h"
#include "llvm/Analysis/UniformityAnalysis.h"
#include "llvm/CodeGen/FunctionLoweringInfo.h"
#include "llvm/CodeGen/SelectionDAG.h"
#include "llvm/CodeGen/SelectionDAGISel.h"
#include "llvm/CodeGen/SelectionDAGNodes.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/InitializePasses.h"
#include "llvm/Support/ErrorHandling.h"

#ifdef EXPENSIVE_CHECKS
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/IR/Dominators.h"
#endif

38#define DEBUG_TYPE "amdgpu-isel"
39
40using namespace llvm;
41
42//===----------------------------------------------------------------------===//
43// Instruction Selector Implementation
44//===----------------------------------------------------------------------===//
45
46namespace {
47static SDValue stripBitcast(SDValue Val) {
48 return Val.getOpcode() == ISD::BITCAST ? Val.getOperand(i: 0) : Val;
49}
50
51// Figure out if this is really an extract of the high 16-bits of a dword.
52static bool isExtractHiElt(SDValue In, SDValue &Out) {
53 In = stripBitcast(Val: In);
54
55 if (In.getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
56 if (ConstantSDNode *Idx = dyn_cast<ConstantSDNode>(Val: In.getOperand(i: 1))) {
57 if (!Idx->isOne())
58 return false;
59 Out = In.getOperand(i: 0);
60 return true;
61 }
62 }
63
64 if (In.getOpcode() != ISD::TRUNCATE)
65 return false;
66
67 SDValue Srl = In.getOperand(i: 0);
68 if (Srl.getOpcode() == ISD::SRL) {
69 if (ConstantSDNode *ShiftAmt = dyn_cast<ConstantSDNode>(Val: Srl.getOperand(i: 1))) {
70 if (ShiftAmt->getZExtValue() == 16) {
71 Out = stripBitcast(Val: Srl.getOperand(i: 0));
72 return true;
73 }
74 }
75 }
76
77 return false;
78}
79
80static SDValue createVOP3PSrc32FromLo16(SDValue Lo, SDValue Src,
81 llvm::SelectionDAG *CurDAG,
82 const GCNSubtarget *Subtarget) {
83 if (!Subtarget->useRealTrue16Insts()) {
84 return Lo;
85 }
86
87 SDValue NewSrc;
88 SDLoc SL(Lo);
89
90 if (Lo->isDivergent()) {
91 SDValue Undef = SDValue(CurDAG->getMachineNode(Opcode: TargetOpcode::IMPLICIT_DEF,
92 dl: SL, VT: Lo.getValueType()),
93 0);
94 const SDValue Ops[] = {
95 CurDAG->getTargetConstant(Val: AMDGPU::VGPR_32RegClassID, DL: SL, VT: MVT::i32), Lo,
96 CurDAG->getTargetConstant(Val: AMDGPU::lo16, DL: SL, VT: MVT::i16), Undef,
97 CurDAG->getTargetConstant(Val: AMDGPU::hi16, DL: SL, VT: MVT::i16)};
98
99 NewSrc = SDValue(CurDAG->getMachineNode(Opcode: TargetOpcode::REG_SEQUENCE, dl: SL,
100 VT: Src.getValueType(), Ops),
101 0);
102 } else {
103 // the S_MOV is needed since the Lo could still be a VGPR16.
104 // With S_MOV, isel insert a "sgpr32 = copy vgpr16" and we reply on
105 // the fixvgpr2sgprcopy pass to legalize it
106 NewSrc = SDValue(
107 CurDAG->getMachineNode(Opcode: AMDGPU::S_MOV_B32, dl: SL, VT: Src.getValueType(), Op1: Lo),
108 0);
109 }
110
111 return NewSrc;
112}
113
114// Look through operations that obscure just looking at the low 16-bits of the
115// same register.
116static SDValue stripExtractLoElt(SDValue In) {
117 if (In.getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
118 SDValue Idx = In.getOperand(i: 1);
119 if (isNullConstant(V: Idx) && In.getValueSizeInBits() <= 32)
120 return In.getOperand(i: 0);
121 }
122
123 if (In.getOpcode() == ISD::TRUNCATE) {
124 SDValue Src = In.getOperand(i: 0);
125 if (Src.getValueType().getSizeInBits() == 32)
126 return stripBitcast(Val: Src);
127 }
128
129 return In;
130}
131
132static SDValue emitRegSequence(llvm::SelectionDAG &CurDAG, unsigned DstRegClass,
133 EVT DstTy, ArrayRef<SDValue> Elts,
134 ArrayRef<unsigned> SubRegClass,
135 const SDLoc &DL) {
136 assert(Elts.size() == SubRegClass.size() && "array size mismatch");
137 unsigned NumElts = Elts.size();
138 SmallVector<SDValue, 17> Ops(2 * NumElts + 1);
139 Ops[0] = (CurDAG.getTargetConstant(Val: DstRegClass, DL, VT: MVT::i32));
140 for (unsigned i = 0; i < NumElts; ++i) {
141 Ops[2 * i + 1] = Elts[i];
142 Ops[2 * i + 2] = CurDAG.getTargetConstant(Val: SubRegClass[i], DL, VT: MVT::i32);
143 }
144 return SDValue(
145 CurDAG.getMachineNode(Opcode: TargetOpcode::REG_SEQUENCE, dl: DL, VT: DstTy, Ops), 0);
146}
147
148} // end anonymous namespace
149
150INITIALIZE_PASS_BEGIN(AMDGPUDAGToDAGISelLegacy, "amdgpu-isel",
151 "AMDGPU DAG->DAG Pattern Instruction Selection", false,
152 false)
153INITIALIZE_PASS_DEPENDENCY(AMDGPUPerfHintAnalysisLegacy)
154INITIALIZE_PASS_DEPENDENCY(UniformityInfoWrapperPass)
155#ifdef EXPENSIVE_CHECKS
156INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
157INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
158#endif
159INITIALIZE_PASS_END(AMDGPUDAGToDAGISelLegacy, "amdgpu-isel",
160 "AMDGPU DAG->DAG Pattern Instruction Selection", false,
161 false)
162
163/// This pass converts a legalized DAG into a AMDGPU-specific
164// DAG, ready for instruction scheduling.
165FunctionPass *llvm::createAMDGPUISelDag(TargetMachine &TM,
166 CodeGenOptLevel OptLevel) {
167 return new AMDGPUDAGToDAGISelLegacy(TM, OptLevel);
168}
169
170AMDGPUDAGToDAGISel::AMDGPUDAGToDAGISel(TargetMachine &TM,
171 CodeGenOptLevel OptLevel)
172 : SelectionDAGISel(TM, OptLevel) {}
173
174bool AMDGPUDAGToDAGISel::runOnMachineFunction(MachineFunction &MF) {
175 Subtarget = &MF.getSubtarget<GCNSubtarget>();
176 Subtarget->checkSubtargetFeatures(F: MF.getFunction());
177 Mode = SIModeRegisterDefaults(MF.getFunction(), *Subtarget);
178 return SelectionDAGISel::runOnMachineFunction(mf&: MF);
179}
180
181bool AMDGPUDAGToDAGISel::fp16SrcZerosHighBits(unsigned Opc) const {
182 // XXX - only need to list legal operations.
183 switch (Opc) {
184 case ISD::FADD:
185 case ISD::FSUB:
186 case ISD::FMUL:
187 case ISD::FDIV:
188 case ISD::FREM:
189 case ISD::FCANONICALIZE:
190 case ISD::UINT_TO_FP:
191 case ISD::SINT_TO_FP:
192 case ISD::FABS:
193 // Fabs is lowered to a bit operation, but it's an and which will clear the
194 // high bits anyway.
195 case ISD::FSQRT:
196 case ISD::FSIN:
197 case ISD::FCOS:
198 case ISD::FPOWI:
199 case ISD::FPOW:
200 case ISD::FLOG:
201 case ISD::FLOG2:
202 case ISD::FLOG10:
203 case ISD::FEXP:
204 case ISD::FEXP2:
205 case ISD::FCEIL:
206 case ISD::FTRUNC:
207 case ISD::FRINT:
208 case ISD::FNEARBYINT:
209 case ISD::FROUNDEVEN:
210 case ISD::FROUND:
211 case ISD::FFLOOR:
212 case ISD::FMINNUM:
213 case ISD::FMAXNUM:
214 case ISD::FLDEXP:
215 case AMDGPUISD::FRACT:
216 case AMDGPUISD::CLAMP:
217 case AMDGPUISD::COS_HW:
218 case AMDGPUISD::SIN_HW:
219 case AMDGPUISD::FMIN3:
220 case AMDGPUISD::FMAX3:
221 case AMDGPUISD::FMED3:
222 case AMDGPUISD::FMAD_FTZ:
223 case AMDGPUISD::RCP:
224 case AMDGPUISD::RSQ:
225 case AMDGPUISD::RCP_IFLAG:
226 // On gfx10, all 16-bit instructions preserve the high bits.
227 return Subtarget->getGeneration() <= AMDGPUSubtarget::GFX9;
228 case ISD::FP_ROUND:
229 // We may select fptrunc (fma/mad) to mad_mixlo, which does not zero the
230 // high bits on gfx9.
231 // TODO: If we had the source node we could see if the source was fma/mad
232 return Subtarget->getGeneration() == AMDGPUSubtarget::VOLCANIC_ISLANDS;
233 case ISD::FMA:
234 case ISD::FMAD:
235 case AMDGPUISD::DIV_FIXUP:
236 return Subtarget->getGeneration() == AMDGPUSubtarget::VOLCANIC_ISLANDS;
237 default:
238 // fcopysign, select and others may be lowered to 32-bit bit operations
239 // which don't zero the high bits.
240 return false;
241 }
242}
243
244bool AMDGPUDAGToDAGISelLegacy::runOnMachineFunction(MachineFunction &MF) {
245#ifdef EXPENSIVE_CHECKS
246 DominatorTree &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
247 LoopInfo *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
248 for (auto &L : LI->getLoopsInPreorder()) {
249 assert(L->isLCSSAForm(DT));
250 }
251#endif
252 return SelectionDAGISelLegacy::runOnMachineFunction(MF);
253}
254
255void AMDGPUDAGToDAGISelLegacy::getAnalysisUsage(AnalysisUsage &AU) const {
256 AU.addRequired<UniformityInfoWrapperPass>();
257#ifdef EXPENSIVE_CHECKS
258 AU.addRequired<DominatorTreeWrapperPass>();
259 AU.addRequired<LoopInfoWrapperPass>();
260#endif
261 SelectionDAGISelLegacy::getAnalysisUsage(AU);
262}
263
264bool AMDGPUDAGToDAGISel::matchLoadD16FromBuildVector(SDNode *N) const {
265 assert(Subtarget->d16PreservesUnusedBits());
266 MVT VT = N->getValueType(ResNo: 0).getSimpleVT();
267 if (VT != MVT::v2i16 && VT != MVT::v2f16)
268 return false;
269
270 SDValue Lo = N->getOperand(Num: 0);
271 SDValue Hi = N->getOperand(Num: 1);
272
273 LoadSDNode *LdHi = dyn_cast<LoadSDNode>(Val: stripBitcast(Val: Hi));
274
275 // build_vector lo, (load ptr) -> load_d16_hi ptr, lo
276 // build_vector lo, (zextload ptr from i8) -> load_d16_hi_u8 ptr, lo
277 // build_vector lo, (sextload ptr from i8) -> load_d16_hi_i8 ptr, lo
278
279 // Need to check for possible indirect dependencies on the other half of the
280 // vector to avoid introducing a cycle.
281 if (LdHi && Hi.hasOneUse() && !LdHi->isPredecessorOf(N: Lo.getNode())) {
282 SDVTList VTList = CurDAG->getVTList(VT1: VT, VT2: MVT::Other);
283
284 SDValue TiedIn = CurDAG->getNode(Opcode: ISD::SCALAR_TO_VECTOR, DL: SDLoc(N), VT, Operand: Lo);
285 SDValue Ops[] = {
286 LdHi->getChain(), LdHi->getBasePtr(), TiedIn
287 };
288
289 unsigned LoadOp = AMDGPUISD::LOAD_D16_HI;
290 if (LdHi->getMemoryVT() == MVT::i8) {
291 LoadOp = LdHi->getExtensionType() == ISD::SEXTLOAD ?
292 AMDGPUISD::LOAD_D16_HI_I8 : AMDGPUISD::LOAD_D16_HI_U8;
293 } else {
294 assert(LdHi->getMemoryVT() == MVT::i16);
295 }
296
297 SDValue NewLoadHi =
298 CurDAG->getMemIntrinsicNode(Opcode: LoadOp, dl: SDLoc(LdHi), VTList,
299 Ops, MemVT: LdHi->getMemoryVT(),
300 MMO: LdHi->getMemOperand());
301
302 CurDAG->ReplaceAllUsesOfValueWith(From: SDValue(N, 0), To: NewLoadHi);
303 CurDAG->ReplaceAllUsesOfValueWith(From: SDValue(LdHi, 1), To: NewLoadHi.getValue(R: 1));
304 return true;
305 }
306
307 // build_vector (load ptr), hi -> load_d16_lo ptr, hi
308 // build_vector (zextload ptr from i8), hi -> load_d16_lo_u8 ptr, hi
309 // build_vector (sextload ptr from i8), hi -> load_d16_lo_i8 ptr, hi
310 LoadSDNode *LdLo = dyn_cast<LoadSDNode>(Val: stripBitcast(Val: Lo));
311 if (LdLo && Lo.hasOneUse()) {
312 SDValue TiedIn = getHi16Elt(In: Hi);
313 if (!TiedIn || LdLo->isPredecessorOf(N: TiedIn.getNode()))
314 return false;
315
316 SDVTList VTList = CurDAG->getVTList(VT1: VT, VT2: MVT::Other);
317 unsigned LoadOp = AMDGPUISD::LOAD_D16_LO;
318 if (LdLo->getMemoryVT() == MVT::i8) {
319 LoadOp = LdLo->getExtensionType() == ISD::SEXTLOAD ?
320 AMDGPUISD::LOAD_D16_LO_I8 : AMDGPUISD::LOAD_D16_LO_U8;
321 } else {
322 assert(LdLo->getMemoryVT() == MVT::i16);
323 }
324
325 TiedIn = CurDAG->getNode(Opcode: ISD::BITCAST, DL: SDLoc(N), VT, Operand: TiedIn);
326
327 SDValue Ops[] = {
328 LdLo->getChain(), LdLo->getBasePtr(), TiedIn
329 };
330
331 SDValue NewLoadLo =
332 CurDAG->getMemIntrinsicNode(Opcode: LoadOp, dl: SDLoc(LdLo), VTList,
333 Ops, MemVT: LdLo->getMemoryVT(),
334 MMO: LdLo->getMemOperand());
335
336 CurDAG->ReplaceAllUsesOfValueWith(From: SDValue(N, 0), To: NewLoadLo);
337 CurDAG->ReplaceAllUsesOfValueWith(From: SDValue(LdLo, 1), To: NewLoadLo.getValue(R: 1));
338 return true;
339 }
340
341 return false;
342}
343
344void AMDGPUDAGToDAGISel::PreprocessISelDAG() {
345 if (!Subtarget->d16PreservesUnusedBits())
346 return;
347
348 SelectionDAG::allnodes_iterator Position = CurDAG->allnodes_end();
349
350 bool MadeChange = false;
351 while (Position != CurDAG->allnodes_begin()) {
352 SDNode *N = &*--Position;
353 if (N->use_empty())
354 continue;
355
356 switch (N->getOpcode()) {
357 case ISD::BUILD_VECTOR:
358 // TODO: Match load d16 from shl (extload:i16), 16
359 MadeChange |= matchLoadD16FromBuildVector(N);
360 break;
361 default:
362 break;
363 }
364 }
365
366 if (MadeChange) {
367 CurDAG->RemoveDeadNodes();
368 LLVM_DEBUG(dbgs() << "After PreProcess:\n";
369 CurDAG->dump(););
370 }
371}
372
373bool AMDGPUDAGToDAGISel::isInlineImmediate(const SDNode *N) const {
374 if (N->isUndef())
375 return true;
376
377 const SIInstrInfo *TII = Subtarget->getInstrInfo();
378 if (const ConstantSDNode *C = dyn_cast<ConstantSDNode>(Val: N))
379 return TII->isInlineConstant(Imm: C->getAPIntValue());
380
381 if (const ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(Val: N))
382 return TII->isInlineConstant(Imm: C->getValueAPF());
383
384 return false;
385}
386
387/// Determine the register class for \p OpNo
388/// \returns The register class of the virtual register that will be used for
389/// the given operand number \OpNo or NULL if the register class cannot be
390/// determined.
391const TargetRegisterClass *AMDGPUDAGToDAGISel::getOperandRegClass(SDNode *N,
392 unsigned OpNo) const {
393 if (!N->isMachineOpcode()) {
394 if (N->getOpcode() == ISD::CopyToReg) {
395 Register Reg = cast<RegisterSDNode>(Val: N->getOperand(Num: 1))->getReg();
396 if (Reg.isVirtual()) {
397 MachineRegisterInfo &MRI = CurDAG->getMachineFunction().getRegInfo();
398 return MRI.getRegClass(Reg);
399 }
400
401 const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
402 return TRI->getPhysRegBaseClass(Reg);
403 }
404
405 return nullptr;
406 }
407
408 switch (N->getMachineOpcode()) {
409 default: {
410 const SIInstrInfo *TII = Subtarget->getInstrInfo();
411 const MCInstrDesc &Desc = TII->get(Opcode: N->getMachineOpcode());
412 unsigned OpIdx = Desc.getNumDefs() + OpNo;
413 if (OpIdx >= Desc.getNumOperands())
414 return nullptr;
415
416 int16_t RegClass = TII->getOpRegClassID(OpInfo: Desc.operands()[OpIdx]);
417 if (RegClass == -1)
418 return nullptr;
419
420 return Subtarget->getRegisterInfo()->getRegClass(i: RegClass);
421 }
422 case AMDGPU::REG_SEQUENCE: {
423 unsigned RCID = N->getConstantOperandVal(Num: 0);
424 const TargetRegisterClass *SuperRC =
425 Subtarget->getRegisterInfo()->getRegClass(i: RCID);
426
427 SDValue SubRegOp = N->getOperand(Num: OpNo + 1);
428 unsigned SubRegIdx = SubRegOp->getAsZExtVal();
429 return Subtarget->getRegisterInfo()->getSubClassWithSubReg(SuperRC,
430 SubRegIdx);
431 }
432 }
433}
434
435SDNode *AMDGPUDAGToDAGISel::glueCopyToOp(SDNode *N, SDValue NewChain,
436 SDValue Glue) const {
437 SmallVector <SDValue, 8> Ops;
438 Ops.push_back(Elt: NewChain); // Replace the chain.
439 for (unsigned i = 1, e = N->getNumOperands(); i != e; ++i)
440 Ops.push_back(Elt: N->getOperand(Num: i));
441
442 Ops.push_back(Elt: Glue);
443 return CurDAG->MorphNodeTo(N, Opc: N->getOpcode(), VTs: N->getVTList(), Ops);
444}
445
446SDNode *AMDGPUDAGToDAGISel::glueCopyToM0(SDNode *N, SDValue Val) const {
447 const SITargetLowering& Lowering =
448 *static_cast<const SITargetLowering*>(getTargetLowering());
449
450 assert(N->getOperand(0).getValueType() == MVT::Other && "Expected chain");
451
452 SDValue M0 = Lowering.copyToM0(DAG&: *CurDAG, Chain: N->getOperand(Num: 0), DL: SDLoc(N), V: Val);
453 return glueCopyToOp(N, NewChain: M0, Glue: M0.getValue(R: 1));
454}
455
456SDNode *AMDGPUDAGToDAGISel::glueCopyToM0LDSInit(SDNode *N) const {
457 unsigned AS = cast<MemSDNode>(Val: N)->getAddressSpace();
458 if (AS == AMDGPUAS::LOCAL_ADDRESS) {
459 if (Subtarget->ldsRequiresM0Init())
460 return glueCopyToM0(
461 N, Val: CurDAG->getSignedTargetConstant(Val: -1, DL: SDLoc(N), VT: MVT::i32));
462 } else if (AS == AMDGPUAS::REGION_ADDRESS) {
463 MachineFunction &MF = CurDAG->getMachineFunction();
464 unsigned Value = MF.getInfo<SIMachineFunctionInfo>()->getGDSSize();
465 return
466 glueCopyToM0(N, Val: CurDAG->getTargetConstant(Val: Value, DL: SDLoc(N), VT: MVT::i32));
467 }
468 return N;
469}
470
471MachineSDNode *AMDGPUDAGToDAGISel::buildSMovImm64(SDLoc &DL, uint64_t Imm,
472 EVT VT) const {
473 SDNode *Lo = CurDAG->getMachineNode(
474 Opcode: AMDGPU::S_MOV_B32, dl: DL, VT: MVT::i32,
475 Op1: CurDAG->getTargetConstant(Val: Lo_32(Value: Imm), DL, VT: MVT::i32));
476 SDNode *Hi = CurDAG->getMachineNode(
477 Opcode: AMDGPU::S_MOV_B32, dl: DL, VT: MVT::i32,
478 Op1: CurDAG->getTargetConstant(Val: Hi_32(Value: Imm), DL, VT: MVT::i32));
479 const SDValue Ops[] = {
480 CurDAG->getTargetConstant(Val: AMDGPU::SReg_64RegClassID, DL, VT: MVT::i32),
481 SDValue(Lo, 0), CurDAG->getTargetConstant(Val: AMDGPU::sub0, DL, VT: MVT::i32),
482 SDValue(Hi, 0), CurDAG->getTargetConstant(Val: AMDGPU::sub1, DL, VT: MVT::i32)};
483
484 return CurDAG->getMachineNode(Opcode: TargetOpcode::REG_SEQUENCE, dl: DL, VT, Ops);
485}
486
487SDNode *AMDGPUDAGToDAGISel::packConstantV2I16(const SDNode *N,
488 SelectionDAG &DAG) const {
489 // TODO: Handle undef as zero
490
491 assert(N->getOpcode() == ISD::BUILD_VECTOR && N->getNumOperands() == 2);
492 uint32_t LHSVal, RHSVal;
493 if (getConstantValue(N: N->getOperand(Num: 0), Out&: LHSVal) &&
494 getConstantValue(N: N->getOperand(Num: 1), Out&: RHSVal)) {
495 SDLoc SL(N);
496 uint32_t K = (LHSVal & 0xffff) | (RHSVal << 16);
497 return DAG.getMachineNode(
498 Opcode: isVGPRImm(N) ? AMDGPU::V_MOV_B32_e32 : AMDGPU::S_MOV_B32, dl: SL,
499 VT: N->getValueType(ResNo: 0), Op1: DAG.getTargetConstant(Val: K, DL: SL, VT: MVT::i32));
500 }
501
502 return nullptr;
503}
504
505void AMDGPUDAGToDAGISel::SelectBuildVector(SDNode *N, unsigned RegClassID) {
506 EVT VT = N->getValueType(ResNo: 0);
507 unsigned NumVectorElts = VT.getVectorNumElements();
508 EVT EltVT = VT.getVectorElementType();
509 SDLoc DL(N);
510 SDValue RegClass = CurDAG->getTargetConstant(Val: RegClassID, DL, VT: MVT::i32);
511
512 if (NumVectorElts == 1) {
513 CurDAG->SelectNodeTo(N, MachineOpc: AMDGPU::COPY_TO_REGCLASS, VT: EltVT, Op1: N->getOperand(Num: 0),
514 Op2: RegClass);
515 return;
516 }
517
518 bool IsGCN = CurDAG->getSubtarget().getTargetTriple().isAMDGCN();
519 if (IsGCN && Subtarget->has64BitLiterals() && VT.getSizeInBits() == 64 &&
520 CurDAG->isConstantValueOfAnyType(N: SDValue(N, 0))) {
521 uint64_t C = 0;
522 bool AllConst = true;
523 unsigned EltSize = EltVT.getSizeInBits();
524 for (unsigned I = 0; I < NumVectorElts; ++I) {
525 SDValue Op = N->getOperand(Num: I);
526 if (Op.isUndef()) {
527 AllConst = false;
528 break;
529 }
530 uint64_t Val;
531 if (ConstantFPSDNode *CF = dyn_cast<ConstantFPSDNode>(Val&: Op)) {
532 Val = CF->getValueAPF().bitcastToAPInt().getZExtValue();
533 } else
534 Val = cast<ConstantSDNode>(Val&: Op)->getZExtValue();
535 C |= Val << (EltSize * I);
536 }
537 if (AllConst) {
538 SDValue CV = CurDAG->getTargetConstant(Val: C, DL, VT: MVT::i64);
539 MachineSDNode *Copy =
540 CurDAG->getMachineNode(Opcode: AMDGPU::S_MOV_B64_IMM_PSEUDO, dl: DL, VT, Op1: CV);
541 CurDAG->SelectNodeTo(N, MachineOpc: AMDGPU::COPY_TO_REGCLASS, VT, Op1: SDValue(Copy, 0),
542 Op2: RegClass);
543 return;
544 }
545 }
546
547 assert(NumVectorElts <= 32 && "Vectors with more than 32 elements not "
548 "supported yet");
549 // 32 = Max Num Vector Elements
550 // 2 = 2 REG_SEQUENCE operands per element (value, subreg index)
551 // 1 = Vector Register Class
552 SmallVector<SDValue, 32 * 2 + 1> RegSeqArgs(NumVectorElts * 2 + 1);
553
554 RegSeqArgs[0] = CurDAG->getTargetConstant(Val: RegClassID, DL, VT: MVT::i32);
555 bool IsRegSeq = true;
556 unsigned NOps = N->getNumOperands();
557 for (unsigned i = 0; i < NOps; i++) {
558 // XXX: Why is this here?
559 if (isa<RegisterSDNode>(Val: N->getOperand(Num: i))) {
560 IsRegSeq = false;
561 break;
562 }
563 unsigned Sub = IsGCN ? SIRegisterInfo::getSubRegFromChannel(Channel: i)
564 : R600RegisterInfo::getSubRegFromChannel(Channel: i);
565 RegSeqArgs[1 + (2 * i)] = N->getOperand(Num: i);
566 RegSeqArgs[1 + (2 * i) + 1] = CurDAG->getTargetConstant(Val: Sub, DL, VT: MVT::i32);
567 }
568 if (NOps != NumVectorElts) {
569 // Fill in the missing undef elements if this was a scalar_to_vector.
570 assert(N->getOpcode() == ISD::SCALAR_TO_VECTOR && NOps < NumVectorElts);
571 MachineSDNode *ImpDef = CurDAG->getMachineNode(Opcode: TargetOpcode::IMPLICIT_DEF,
572 dl: DL, VT: EltVT);
573 for (unsigned i = NOps; i < NumVectorElts; ++i) {
574 unsigned Sub = IsGCN ? SIRegisterInfo::getSubRegFromChannel(Channel: i)
575 : R600RegisterInfo::getSubRegFromChannel(Channel: i);
576 RegSeqArgs[1 + (2 * i)] = SDValue(ImpDef, 0);
577 RegSeqArgs[1 + (2 * i) + 1] =
578 CurDAG->getTargetConstant(Val: Sub, DL, VT: MVT::i32);
579 }
580 }
581
582 if (!IsRegSeq)
583 SelectCode(N);
584 CurDAG->SelectNodeTo(N, MachineOpc: AMDGPU::REG_SEQUENCE, VTs: N->getVTList(), Ops: RegSeqArgs);
585}
586
587void AMDGPUDAGToDAGISel::SelectVectorShuffle(SDNode *N) {
588 EVT VT = N->getValueType(ResNo: 0);
589 EVT EltVT = VT.getVectorElementType();
590
591 // TODO: Handle 16-bit element vectors with even aligned masks.
592 if (!Subtarget->hasPkMovB32() || !EltVT.bitsEq(VT: MVT::i32) ||
593 VT.getVectorNumElements() != 2) {
594 SelectCode(N);
595 return;
596 }
597
598 auto *SVN = cast<ShuffleVectorSDNode>(Val: N);
599
600 SDValue Src0 = SVN->getOperand(Num: 0);
601 SDValue Src1 = SVN->getOperand(Num: 1);
602 ArrayRef<int> Mask = SVN->getMask();
603 SDLoc DL(N);
604
605 assert(Src0.getValueType().getVectorNumElements() == 2 && Mask.size() == 2 &&
606 Mask[0] < 4 && Mask[1] < 4);
607
608 SDValue VSrc0 = Mask[0] < 2 ? Src0 : Src1;
609 SDValue VSrc1 = Mask[1] < 2 ? Src0 : Src1;
610 unsigned Src0SubReg = Mask[0] & 1 ? AMDGPU::sub1 : AMDGPU::sub0;
611 unsigned Src1SubReg = Mask[1] & 1 ? AMDGPU::sub1 : AMDGPU::sub0;
612
613 if (Mask[0] < 0) {
614 Src0SubReg = Src1SubReg;
615 MachineSDNode *ImpDef =
616 CurDAG->getMachineNode(Opcode: TargetOpcode::IMPLICIT_DEF, dl: DL, VT);
617 VSrc0 = SDValue(ImpDef, 0);
618 }
619
620 if (Mask[1] < 0) {
621 Src1SubReg = Src0SubReg;
622 MachineSDNode *ImpDef =
623 CurDAG->getMachineNode(Opcode: TargetOpcode::IMPLICIT_DEF, dl: DL, VT);
624 VSrc1 = SDValue(ImpDef, 0);
625 }
626
627 // SGPR case needs to lower to copies.
628 //
629 // Also use subregister extract when we can directly blend the registers with
630 // a simple subregister copy.
631 //
632 // TODO: Maybe we should fold this out earlier
633 if (N->isDivergent() && Src0SubReg == AMDGPU::sub1 &&
634 Src1SubReg == AMDGPU::sub0) {
635 // The low element of the result always comes from src0.
636 // The high element of the result always comes from src1.
637 // op_sel selects the high half of src0.
638 // op_sel_hi selects the high half of src1.
639
640 unsigned Src0OpSel =
641 Src0SubReg == AMDGPU::sub1 ? SISrcMods::OP_SEL_0 : SISrcMods::NONE;
642 unsigned Src1OpSel =
643 Src1SubReg == AMDGPU::sub1 ? SISrcMods::OP_SEL_0 : SISrcMods::NONE;
644
645 // Enable op_sel_hi to avoid printing it. This should have no effect on the
646 // result.
647 Src0OpSel |= SISrcMods::OP_SEL_1;
648 Src1OpSel |= SISrcMods::OP_SEL_1;
649
650 SDValue Src0OpSelVal = CurDAG->getTargetConstant(Val: Src0OpSel, DL, VT: MVT::i32);
651 SDValue Src1OpSelVal = CurDAG->getTargetConstant(Val: Src1OpSel, DL, VT: MVT::i32);
652 SDValue ZeroMods = CurDAG->getTargetConstant(Val: 0, DL, VT: MVT::i32);
653
654 CurDAG->SelectNodeTo(N, MachineOpc: AMDGPU::V_PK_MOV_B32, VTs: N->getVTList(),
655 Ops: {Src0OpSelVal, VSrc0, Src1OpSelVal, VSrc1,
656 ZeroMods, // clamp
657 ZeroMods, // op_sel
658 ZeroMods, // op_sel_hi
659 ZeroMods, // neg_lo
660 ZeroMods}); // neg_hi
661 return;
662 }
663
664 SDValue ResultElt0 =
665 CurDAG->getTargetExtractSubreg(SRIdx: Src0SubReg, DL, VT: EltVT, Operand: VSrc0);
666 SDValue ResultElt1 =
667 CurDAG->getTargetExtractSubreg(SRIdx: Src1SubReg, DL, VT: EltVT, Operand: VSrc1);
668
669 const SDValue Ops[] = {
670 CurDAG->getTargetConstant(Val: AMDGPU::SReg_64RegClassID, DL, VT: MVT::i32),
671 ResultElt0, CurDAG->getTargetConstant(Val: AMDGPU::sub0, DL, VT: MVT::i32),
672 ResultElt1, CurDAG->getTargetConstant(Val: AMDGPU::sub1, DL, VT: MVT::i32)};
673 CurDAG->SelectNodeTo(N, MachineOpc: TargetOpcode::REG_SEQUENCE, VT, Ops);
674}
675
676void AMDGPUDAGToDAGISel::Select(SDNode *N) {
677 unsigned int Opc = N->getOpcode();
678 if (N->isMachineOpcode()) {
679 N->setNodeId(-1);
680 return; // Already selected.
681 }
682
683 // isa<MemSDNode> almost works but is slightly too permissive for some DS
684 // intrinsics.
685 if (Opc == ISD::LOAD || Opc == ISD::STORE || isa<AtomicSDNode>(Val: N)) {
686 N = glueCopyToM0LDSInit(N);
687 SelectCode(N);
688 return;
689 }
690
691 switch (Opc) {
692 default:
693 break;
694 // We are selecting i64 ADD here instead of custom lower it during
695 // DAG legalization, so we can fold some i64 ADDs used for address
696 // calculation into the LOAD and STORE instructions.
697 case ISD::ADDC:
698 case ISD::ADDE:
699 case ISD::SUBC:
700 case ISD::SUBE: {
701 if (N->getValueType(ResNo: 0) != MVT::i64)
702 break;
703
704 SelectADD_SUB_I64(N);
705 return;
706 }
707 case ISD::UADDO_CARRY:
708 case ISD::USUBO_CARRY:
709 if (N->getValueType(ResNo: 0) != MVT::i32)
710 break;
711
712 SelectAddcSubb(N);
713 return;
714 case ISD::UADDO:
715 case ISD::USUBO: {
716 SelectUADDO_USUBO(N);
717 return;
718 }
719 case AMDGPUISD::FMUL_W_CHAIN: {
720 SelectFMUL_W_CHAIN(N);
721 return;
722 }
723 case AMDGPUISD::FMA_W_CHAIN: {
724 SelectFMA_W_CHAIN(N);
725 return;
726 }
727
728 case ISD::SCALAR_TO_VECTOR:
729 case ISD::BUILD_VECTOR: {
730 EVT VT = N->getValueType(ResNo: 0);
731 unsigned NumVectorElts = VT.getVectorNumElements();
732 if (VT.getScalarSizeInBits() == 16) {
733 if (Opc == ISD::BUILD_VECTOR && NumVectorElts == 2) {
734 if (SDNode *Packed = packConstantV2I16(N, DAG&: *CurDAG)) {
735 ReplaceNode(F: N, T: Packed);
736 return;
737 }
738 }
739
740 break;
741 }
742
743 const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
744 assert(VT.getVectorElementType().bitsEq(MVT::i32));
745 const TargetRegisterClass *RegClass =
746 N->isDivergent()
747 ? TRI->getDefaultVectorSuperClassForBitWidth(BitWidth: NumVectorElts * 32)
748 : SIRegisterInfo::getSGPRClassForBitWidth(BitWidth: NumVectorElts * 32);
749
750 SelectBuildVector(N, RegClassID: RegClass->getID());
751 return;
752 }
753 case ISD::VECTOR_SHUFFLE:
754 SelectVectorShuffle(N);
755 return;
756 case ISD::BUILD_PAIR: {
757 SDValue RC, SubReg0, SubReg1;
758 SDLoc DL(N);
759 if (N->getValueType(ResNo: 0) == MVT::i128) {
760 RC = CurDAG->getTargetConstant(Val: AMDGPU::SGPR_128RegClassID, DL, VT: MVT::i32);
761 SubReg0 = CurDAG->getTargetConstant(Val: AMDGPU::sub0_sub1, DL, VT: MVT::i32);
762 SubReg1 = CurDAG->getTargetConstant(Val: AMDGPU::sub2_sub3, DL, VT: MVT::i32);
763 } else if (N->getValueType(ResNo: 0) == MVT::i64) {
764 RC = CurDAG->getTargetConstant(Val: AMDGPU::SReg_64RegClassID, DL, VT: MVT::i32);
765 SubReg0 = CurDAG->getTargetConstant(Val: AMDGPU::sub0, DL, VT: MVT::i32);
766 SubReg1 = CurDAG->getTargetConstant(Val: AMDGPU::sub1, DL, VT: MVT::i32);
767 } else {
768 llvm_unreachable("Unhandled value type for BUILD_PAIR");
769 }
770 const SDValue Ops[] = { RC, N->getOperand(Num: 0), SubReg0,
771 N->getOperand(Num: 1), SubReg1 };
772 ReplaceNode(F: N, T: CurDAG->getMachineNode(Opcode: TargetOpcode::REG_SEQUENCE, dl: DL,
773 VT: N->getValueType(ResNo: 0), Ops));
774 return;
775 }
776
777 case ISD::Constant:
778 case ISD::ConstantFP: {
779 if (N->getValueType(ResNo: 0).getSizeInBits() != 64 || isInlineImmediate(N) ||
780 Subtarget->has64BitLiterals())
781 break;
782
783 uint64_t Imm;
784 if (ConstantFPSDNode *FP = dyn_cast<ConstantFPSDNode>(Val: N)) {
785 Imm = FP->getValueAPF().bitcastToAPInt().getZExtValue();
786 if (AMDGPU::isValid32BitLiteral(Val: Imm, IsFP64: true))
787 break;
788 } else {
789 ConstantSDNode *C = cast<ConstantSDNode>(Val: N);
790 Imm = C->getZExtValue();
791 if (AMDGPU::isValid32BitLiteral(Val: Imm, IsFP64: false))
792 break;
793 }
794
795 SDLoc DL(N);
796 ReplaceNode(F: N, T: buildSMovImm64(DL, Imm, VT: N->getValueType(ResNo: 0)));
797 return;
798 }
799 case AMDGPUISD::BFE_I32:
800 case AMDGPUISD::BFE_U32: {
801 // There is a scalar version available, but unlike the vector version which
802 // has a separate operand for the offset and width, the scalar version packs
803 // the width and offset into a single operand. Try to move to the scalar
804 // version if the offsets are constant, so that we can try to keep extended
805 // loads of kernel arguments in SGPRs.
806
807 // TODO: Technically we could try to pattern match scalar bitshifts of
808 // dynamic values, but it's probably not useful.
809 ConstantSDNode *Offset = dyn_cast<ConstantSDNode>(Val: N->getOperand(Num: 1));
810 if (!Offset)
811 break;
812
813 ConstantSDNode *Width = dyn_cast<ConstantSDNode>(Val: N->getOperand(Num: 2));
814 if (!Width)
815 break;
816
817 bool Signed = Opc == AMDGPUISD::BFE_I32;
818
819 uint32_t OffsetVal = Offset->getZExtValue();
820 uint32_t WidthVal = Width->getZExtValue();
821
822 ReplaceNode(F: N, T: getBFE32(IsSigned: Signed, DL: SDLoc(N), Val: N->getOperand(Num: 0), Offset: OffsetVal,
823 Width: WidthVal));
824 return;
825 }
826 case AMDGPUISD::DIV_SCALE: {
827 SelectDIV_SCALE(N);
828 return;
829 }
830 case AMDGPUISD::MAD_I64_I32:
831 case AMDGPUISD::MAD_U64_U32: {
832 SelectMAD_64_32(N);
833 return;
834 }
835 case ISD::SMUL_LOHI:
836 case ISD::UMUL_LOHI:
837 return SelectMUL_LOHI(N);
838 case ISD::CopyToReg: {
839 const SITargetLowering& Lowering =
840 *static_cast<const SITargetLowering*>(getTargetLowering());
841 N = Lowering.legalizeTargetIndependentNode(Node: N, DAG&: *CurDAG);
842 break;
843 }
844 case ISD::AND:
845 case ISD::SRL:
846 case ISD::SRA:
847 case ISD::SIGN_EXTEND_INREG:
848 if (N->getValueType(ResNo: 0) != MVT::i32)
849 break;
850
851 SelectS_BFE(N);
852 return;
853 case ISD::BRCOND:
854 SelectBRCOND(N);
855 return;
856 case ISD::FP_EXTEND:
857 SelectFP_EXTEND(N);
858 return;
859 case AMDGPUISD::CVT_PKRTZ_F16_F32:
860 case AMDGPUISD::CVT_PKNORM_I16_F32:
861 case AMDGPUISD::CVT_PKNORM_U16_F32:
862 case AMDGPUISD::CVT_PK_U16_U32:
863 case AMDGPUISD::CVT_PK_I16_I32: {
864 // Hack around using a legal type if f16 is illegal.
865 if (N->getValueType(ResNo: 0) == MVT::i32) {
866 MVT NewVT = Opc == AMDGPUISD::CVT_PKRTZ_F16_F32 ? MVT::v2f16 : MVT::v2i16;
867 N = CurDAG->MorphNodeTo(N, Opc: N->getOpcode(), VTs: CurDAG->getVTList(VT: NewVT),
868 Ops: { N->getOperand(Num: 0), N->getOperand(Num: 1) });
869 SelectCode(N);
870 return;
871 }
872
873 break;
874 }
875 case ISD::INTRINSIC_W_CHAIN: {
876 SelectINTRINSIC_W_CHAIN(N);
877 return;
878 }
879 case ISD::INTRINSIC_WO_CHAIN: {
880 SelectINTRINSIC_WO_CHAIN(N);
881 return;
882 }
883 case ISD::INTRINSIC_VOID: {
884 SelectINTRINSIC_VOID(N);
885 return;
886 }
887 case AMDGPUISD::WAVE_ADDRESS: {
888 SelectWAVE_ADDRESS(N);
889 return;
890 }
891 case ISD::STACKRESTORE: {
892 SelectSTACKRESTORE(N);
893 return;
894 }
895 }
896
897 SelectCode(N);
898}
899
900bool AMDGPUDAGToDAGISel::isUniformBr(const SDNode *N) const {
901 const BasicBlock *BB = FuncInfo->MBB->getBasicBlock();
902 const Instruction *Term = BB->getTerminator();
903 return Term->getMetadata(Kind: "amdgpu.uniform") ||
904 Term->getMetadata(Kind: "structurizecfg.uniform");
905}
906
907bool AMDGPUDAGToDAGISel::isUnneededShiftMask(const SDNode *N,
908 unsigned ShAmtBits) const {
909 assert(N->getOpcode() == ISD::AND);
910
911 const APInt &RHS = N->getConstantOperandAPInt(Num: 1);
912 if (RHS.countr_one() >= ShAmtBits)
913 return true;
914
915 const APInt &LHSKnownZeros = CurDAG->computeKnownBits(Op: N->getOperand(Num: 0)).Zero;
916 return (LHSKnownZeros | RHS).countr_one() >= ShAmtBits;
917}
918
// Recover a (base, constant-offset) pair from the pattern produced when a
// 64-bit `or` of an address and an offset was split into two 32-bit halves.
// On success, N0 receives the base and N1 the offset operand of the low `or`.
static bool getBaseWithOffsetUsingSplitOR(SelectionDAG &DAG, SDValue Addr,
                                          SDValue &N0, SDValue &N1) {
  if (Addr.getValueType() == MVT::i64 && Addr.getOpcode() == ISD::BITCAST &&
      Addr.getOperand(i: 0).getOpcode() == ISD::BUILD_VECTOR) {
    // As we split 64-bit `or` earlier, it's complicated pattern to match, i.e.
    // (i64 (bitcast (v2i32 (build_vector
    //                        (or (extract_vector_elt V, 0), OFFSET),
    //                        (extract_vector_elt V, 1)))))
    SDValue Lo = Addr.getOperand(i: 0).getOperand(i: 0);
    if (Lo.getOpcode() == ISD::OR && DAG.isBaseWithConstantOffset(Op: Lo)) {
      SDValue BaseLo = Lo.getOperand(i: 0);
      SDValue BaseHi = Addr.getOperand(i: 0).getOperand(i: 1);
      // Check that split base (Lo and Hi) are extracted from the same one.
      if (BaseLo.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
          BaseHi.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
          BaseLo.getOperand(i: 0) == BaseHi.getOperand(i: 0) &&
          // Lo is statically extracted from index 0.
          isa<ConstantSDNode>(Val: BaseLo.getOperand(i: 1)) &&
          BaseLo.getConstantOperandVal(i: 1) == 0 &&
          // Hi is statically extracted from index 1.
          isa<ConstantSDNode>(Val: BaseHi.getOperand(i: 1)) &&
          BaseHi.getConstantOperandVal(i: 1) == 1) {
        N0 = BaseLo.getOperand(i: 0).getOperand(i: 0);
        N1 = Lo.getOperand(i: 1);
        return true;
      }
    }
  }
  return false;
}
949
950bool AMDGPUDAGToDAGISel::isBaseWithConstantOffset64(SDValue Addr, SDValue &LHS,
951 SDValue &RHS) const {
952 if (CurDAG->isBaseWithConstantOffset(Op: Addr)) {
953 LHS = Addr.getOperand(i: 0);
954 RHS = Addr.getOperand(i: 1);
955 return true;
956 }
957
958 if (getBaseWithOffsetUsingSplitOR(DAG&: *CurDAG, Addr, N0&: LHS, N1&: RHS)) {
959 assert(LHS && RHS && isa<ConstantSDNode>(RHS));
960 return true;
961 }
962
963 return false;
964}
965
// Human-readable pass name reported by the legacy pass manager.
StringRef AMDGPUDAGToDAGISelLegacy::getPassName() const {
  return "AMDGPU DAG->DAG Pattern Instruction Selection";
}
969
// New-pass-manager wrapper: owns the AMDGPU DAG ISel implementation,
// configured with the target machine's optimization level.
AMDGPUISelDAGToDAGPass::AMDGPUISelDAGToDAGPass(TargetMachine &TM)
    : SelectionDAGISelPass(
          std::make_unique<AMDGPUDAGToDAGISel>(args&: TM, args: TM.getOptLevel())) {}
973
// Run DAG instruction selection on MF under the new pass manager. With
// EXPENSIVE_CHECKS enabled, first assert that every loop in the function is
// in LCSSA form before delegating to the base SelectionDAGISelPass.
PreservedAnalyses
AMDGPUISelDAGToDAGPass::run(MachineFunction &MF,
                            MachineFunctionAnalysisManager &MFAM) {
#ifdef EXPENSIVE_CHECKS
  // The IR-level analyses are reached through the machine-function proxy.
  auto &FAM = MFAM.getResult<FunctionAnalysisManagerMachineFunctionProxy>(MF)
                  .getManager();
  auto &F = MF.getFunction();
  DominatorTree &DT = FAM.getResult<DominatorTreeAnalysis>(F);
  LoopInfo &LI = FAM.getResult<LoopAnalysis>(F);
  for (auto &L : LI.getLoopsInPreorder())
    assert(L->isLCSSAForm(DT) && "Loop is not in LCSSA form!");
#endif
  return SelectionDAGISelPass::run(MF, MFAM);
}
988
989//===----------------------------------------------------------------------===//
990// Complex Patterns
991//===----------------------------------------------------------------------===//
992
// ComplexPattern hook for VTX_READ addressing; this implementation never
// matches (always returns false), leaving Base/Offset untouched.
bool AMDGPUDAGToDAGISel::SelectADDRVTX_READ(SDValue Addr, SDValue &Base,
                                            SDValue &Offset) {
  return false;
}
997
998bool AMDGPUDAGToDAGISel::SelectADDRIndirect(SDValue Addr, SDValue &Base,
999 SDValue &Offset) {
1000 ConstantSDNode *C;
1001 SDLoc DL(Addr);
1002
1003 if ((C = dyn_cast<ConstantSDNode>(Val&: Addr))) {
1004 Base = CurDAG->getRegister(Reg: R600::INDIRECT_BASE_ADDR, VT: MVT::i32);
1005 Offset = CurDAG->getTargetConstant(Val: C->getZExtValue(), DL, VT: MVT::i32);
1006 } else if ((Addr.getOpcode() == AMDGPUISD::DWORDADDR) &&
1007 (C = dyn_cast<ConstantSDNode>(Val: Addr.getOperand(i: 0)))) {
1008 Base = CurDAG->getRegister(Reg: R600::INDIRECT_BASE_ADDR, VT: MVT::i32);
1009 Offset = CurDAG->getTargetConstant(Val: C->getZExtValue(), DL, VT: MVT::i32);
1010 } else if ((Addr.getOpcode() == ISD::ADD || Addr.getOpcode() == ISD::OR) &&
1011 (C = dyn_cast<ConstantSDNode>(Val: Addr.getOperand(i: 1)))) {
1012 Base = Addr.getOperand(i: 0);
1013 Offset = CurDAG->getTargetConstant(Val: C->getZExtValue(), DL, VT: MVT::i32);
1014 } else {
1015 Base = Addr;
1016 Offset = CurDAG->getTargetConstant(Val: 0, DL, VT: MVT::i32);
1017 }
1018
1019 return true;
1020}
1021
1022SDValue AMDGPUDAGToDAGISel::getMaterializedScalarImm32(int64_t Val,
1023 const SDLoc &DL) const {
1024 SDNode *Mov = CurDAG->getMachineNode(
1025 Opcode: AMDGPU::S_MOV_B32, dl: DL, VT: MVT::i32,
1026 Op1: CurDAG->getTargetConstant(Val, DL, VT: MVT::i32));
1027 return SDValue(Mov, 0);
1028}
1029
// FIXME: Should only handle uaddo_carry/usubo_carry
// Select a 64-bit add/sub by splitting it into two 32-bit halves: a low-half
// op producing carry, a high-half carry-consuming op, and a REG_SEQUENCE
// recombining the halves into the i64 result.
void AMDGPUDAGToDAGISel::SelectADD_SUB_I64(SDNode *N) {
  SDLoc DL(N);
  SDValue LHS = N->getOperand(Num: 0);
  SDValue RHS = N->getOperand(Num: 1);

  unsigned Opcode = N->getOpcode();
  // ADDE/SUBE consume an incoming carry (operand 2); those and ADDC/SUBC
  // also produce a carry on result 1.
  bool ConsumeCarry = (Opcode == ISD::ADDE || Opcode == ISD::SUBE);
  bool ProduceCarry =
      ConsumeCarry || Opcode == ISD::ADDC || Opcode == ISD::SUBC;
  bool IsAdd = Opcode == ISD::ADD || Opcode == ISD::ADDC || Opcode == ISD::ADDE;

  SDValue Sub0 = CurDAG->getTargetConstant(Val: AMDGPU::sub0, DL, VT: MVT::i32);
  SDValue Sub1 = CurDAG->getTargetConstant(Val: AMDGPU::sub1, DL, VT: MVT::i32);

  // Split both operands into their low and high 32-bit subregisters.
  SDNode *Lo0 = CurDAG->getMachineNode(Opcode: TargetOpcode::EXTRACT_SUBREG,
                                       dl: DL, VT: MVT::i32, Op1: LHS, Op2: Sub0);
  SDNode *Hi0 = CurDAG->getMachineNode(Opcode: TargetOpcode::EXTRACT_SUBREG,
                                       dl: DL, VT: MVT::i32, Op1: LHS, Op2: Sub1);

  SDNode *Lo1 = CurDAG->getMachineNode(Opcode: TargetOpcode::EXTRACT_SUBREG,
                                       dl: DL, VT: MVT::i32, Op1: RHS, Op2: Sub0);
  SDNode *Hi1 = CurDAG->getMachineNode(Opcode: TargetOpcode::EXTRACT_SUBREG,
                                       dl: DL, VT: MVT::i32, Op1: RHS, Op2: Sub1);

  SDVTList VTList = CurDAG->getVTList(VT1: MVT::i32, VT2: MVT::Glue);

  // Opcode table indexed by [carry-chained?][divergent?][is-add?].
  static const unsigned OpcMap[2][2][2] = {
      {{AMDGPU::S_SUB_U32, AMDGPU::S_ADD_U32},
       {AMDGPU::V_SUB_CO_U32_e32, AMDGPU::V_ADD_CO_U32_e32}},
      {{AMDGPU::S_SUBB_U32, AMDGPU::S_ADDC_U32},
       {AMDGPU::V_SUBB_U32_e32, AMDGPU::V_ADDC_U32_e32}}};

  unsigned Opc = OpcMap[0][N->isDivergent()][IsAdd];
  unsigned CarryOpc = OpcMap[1][N->isDivergent()][IsAdd];

  // Low half: plain op, or the carry-consuming form fed by operand 2.
  SDNode *AddLo;
  if (!ConsumeCarry) {
    SDValue Args[] = { SDValue(Lo0, 0), SDValue(Lo1, 0) };
    AddLo = CurDAG->getMachineNode(Opcode: Opc, dl: DL, VTs: VTList, Ops: Args);
  } else {
    SDValue Args[] = { SDValue(Lo0, 0), SDValue(Lo1, 0), N->getOperand(Num: 2) };
    AddLo = CurDAG->getMachineNode(Opcode: CarryOpc, dl: DL, VTs: VTList, Ops: Args);
  }
  // High half always consumes the low half's carry (glue result 1 of AddLo).
  SDValue AddHiArgs[] = {
    SDValue(Hi0, 0),
    SDValue(Hi1, 0),
    SDValue(AddLo, 1)
  };
  SDNode *AddHi = CurDAG->getMachineNode(Opcode: CarryOpc, dl: DL, VTs: VTList, Ops: AddHiArgs);

  // Rebuild the 64-bit result from the two 32-bit halves.
  SDValue RegSequenceArgs[] = {
    CurDAG->getTargetConstant(Val: AMDGPU::SReg_64RegClassID, DL, VT: MVT::i32),
    SDValue(AddLo,0),
    Sub0,
    SDValue(AddHi,0),
    Sub1,
  };
  SDNode *RegSequence = CurDAG->getMachineNode(Opcode: AMDGPU::REG_SEQUENCE, dl: DL,
                                               VT: MVT::i64, Ops: RegSequenceArgs);

  if (ProduceCarry) {
    // Replace the carry-use
    ReplaceUses(F: SDValue(N, 1), T: SDValue(AddHi, 1));
  }

  // Replace the remaining uses.
  ReplaceNode(F: N, T: RegSequence);
}
1099
1100void AMDGPUDAGToDAGISel::SelectAddcSubb(SDNode *N) {
1101 SDValue LHS = N->getOperand(Num: 0);
1102 SDValue RHS = N->getOperand(Num: 1);
1103 SDValue CI = N->getOperand(Num: 2);
1104
1105 if (N->isDivergent()) {
1106 unsigned Opc = N->getOpcode() == ISD::UADDO_CARRY ? AMDGPU::V_ADDC_U32_e64
1107 : AMDGPU::V_SUBB_U32_e64;
1108 CurDAG->SelectNodeTo(
1109 N, MachineOpc: Opc, VTs: N->getVTList(),
1110 Ops: {LHS, RHS, CI,
1111 CurDAG->getTargetConstant(Val: 0, DL: {}, VT: MVT::i1) /*clamp bit*/});
1112 } else {
1113 unsigned Opc = N->getOpcode() == ISD::UADDO_CARRY ? AMDGPU::S_ADD_CO_PSEUDO
1114 : AMDGPU::S_SUB_CO_PSEUDO;
1115 CurDAG->SelectNodeTo(N, MachineOpc: Opc, VTs: N->getVTList(), Ops: {LHS, RHS, CI});
1116 }
1117}
1118
1119void AMDGPUDAGToDAGISel::SelectUADDO_USUBO(SDNode *N) {
1120 // The name of the opcodes are misleading. v_add_i32/v_sub_i32 have unsigned
1121 // carry out despite the _i32 name. These were renamed in VI to _U32.
1122 // FIXME: We should probably rename the opcodes here.
1123 bool IsAdd = N->getOpcode() == ISD::UADDO;
1124 bool IsVALU = N->isDivergent();
1125
1126 for (SDNode::user_iterator UI = N->user_begin(), E = N->user_end(); UI != E;
1127 ++UI)
1128 if (UI.getUse().getResNo() == 1) {
1129 if (UI->isMachineOpcode()) {
1130 if (UI->getMachineOpcode() !=
1131 (IsAdd ? AMDGPU::S_ADD_CO_PSEUDO : AMDGPU::S_SUB_CO_PSEUDO)) {
1132 IsVALU = true;
1133 break;
1134 }
1135 } else {
1136 if (UI->getOpcode() != (IsAdd ? ISD::UADDO_CARRY : ISD::USUBO_CARRY)) {
1137 IsVALU = true;
1138 break;
1139 }
1140 }
1141 }
1142
1143 if (IsVALU) {
1144 unsigned Opc = IsAdd ? AMDGPU::V_ADD_CO_U32_e64 : AMDGPU::V_SUB_CO_U32_e64;
1145
1146 CurDAG->SelectNodeTo(
1147 N, MachineOpc: Opc, VTs: N->getVTList(),
1148 Ops: {N->getOperand(Num: 0), N->getOperand(Num: 1),
1149 CurDAG->getTargetConstant(Val: 0, DL: {}, VT: MVT::i1) /*clamp bit*/});
1150 } else {
1151 unsigned Opc = IsAdd ? AMDGPU::S_UADDO_PSEUDO : AMDGPU::S_USUBO_PSEUDO;
1152
1153 CurDAG->SelectNodeTo(N, MachineOpc: Opc, VTs: N->getVTList(),
1154 Ops: {N->getOperand(Num: 0), N->getOperand(Num: 1)});
1155 }
1156}
1157
// Select the chained FMA node to V_FMA_F32, or V_FMAC_F32 when no source
// modifiers are present (the FMAC form can use the smaller VOP2 encoding).
void AMDGPUDAGToDAGISel::SelectFMA_W_CHAIN(SDNode *N) {
  // src0_modifiers, src0, src1_modifiers, src1, src2_modifiers, src2, clamp,
  // omod — followed by the incoming chain (Ops[8]) and the node's final
  // input operand (Ops[9]).
  SDValue Ops[10];

  SelectVOP3Mods0(In: N->getOperand(Num: 1), Src&: Ops[1], SrcMods&: Ops[0], Clamp&: Ops[6], Omod&: Ops[7]);
  SelectVOP3Mods(In: N->getOperand(Num: 2), Src&: Ops[3], SrcMods&: Ops[2]);
  SelectVOP3Mods(In: N->getOperand(Num: 3), Src&: Ops[5], SrcMods&: Ops[4]);
  Ops[8] = N->getOperand(Num: 0);
  Ops[9] = N->getOperand(Num: 4);

  // If there are no source modifiers, prefer fmac over fma because it can use
  // the smaller VOP2 encoding.
  bool UseFMAC = Subtarget->hasDLInsts() &&
                 cast<ConstantSDNode>(Val&: Ops[0])->isZero() &&
                 cast<ConstantSDNode>(Val&: Ops[2])->isZero() &&
                 cast<ConstantSDNode>(Val&: Ops[4])->isZero();
  unsigned Opcode = UseFMAC ? AMDGPU::V_FMAC_F32_e64 : AMDGPU::V_FMA_F32_e64;
  CurDAG->SelectNodeTo(N, MachineOpc: Opcode, VTs: N->getVTList(), Ops);
}
1177
// Select the chained FMUL node to V_MUL_F32 with VOP3 source modifiers.
void AMDGPUDAGToDAGISel::SelectFMUL_W_CHAIN(SDNode *N) {
  // src0_modifiers, src0, src1_modifiers, src1, clamp, omod — followed by
  // the incoming chain (Ops[6]) and the node's final input operand (Ops[7]).
  SDValue Ops[8];

  SelectVOP3Mods0(In: N->getOperand(Num: 1), Src&: Ops[1], SrcMods&: Ops[0], Clamp&: Ops[4], Omod&: Ops[5]);
  SelectVOP3Mods(In: N->getOperand(Num: 2), Src&: Ops[3], SrcMods&: Ops[2]);
  Ops[6] = N->getOperand(Num: 0);
  Ops[7] = N->getOperand(Num: 3);

  CurDAG->SelectNodeTo(N, MachineOpc: AMDGPU::V_MUL_F32_e64, VTs: N->getVTList(), Ops);
}
1189
// We need to handle this here because tablegen doesn't support matching
// instructions with multiple outputs.
// Selects AMDGPUISD::DIV_SCALE to V_DIV_SCALE_F32/F64 with VOP3B modifiers.
void AMDGPUDAGToDAGISel::SelectDIV_SCALE(SDNode *N) {
  EVT VT = N->getValueType(ResNo: 0);

  assert(VT == MVT::f32 || VT == MVT::f64);

  unsigned Opc
    = (VT == MVT::f64) ? AMDGPU::V_DIV_SCALE_F64_e64 : AMDGPU::V_DIV_SCALE_F32_e64;

  // src0_modifiers, src0, src1_modifiers, src1, src2_modifiers, src2, clamp,
  // omod
  SDValue Ops[8];
  SelectVOP3BMods0(In: N->getOperand(Num: 0), Src&: Ops[1], SrcMods&: Ops[0], Clamp&: Ops[6], Omod&: Ops[7]);
  SelectVOP3BMods(In: N->getOperand(Num: 1), Src&: Ops[3], SrcMods&: Ops[2]);
  SelectVOP3BMods(In: N->getOperand(Num: 2), Src&: Ops[5], SrcMods&: Ops[4]);
  CurDAG->SelectNodeTo(N, MachineOpc: Opc, VTs: N->getVTList(), Ops);
}
1208
// We need to handle this here because tablegen doesn't support matching
// instructions with multiple outputs.
// Selects MAD_I64_I32/MAD_U64_U32. Prefers the no-carry variant when the
// subtarget provides it and the carry result is unused; falls back to the
// gfx11 encodings on subtargets with the MAD intra-forwarding bug.
void AMDGPUDAGToDAGISel::SelectMAD_64_32(SDNode *N) {
  SDLoc SL(N);
  bool Signed = N->getOpcode() == AMDGPUISD::MAD_I64_I32;
  unsigned Opc;
  // The no-carry form is only usable when nothing consumes result 1.
  bool UseNoCarry = Subtarget->hasMadU64U32NoCarry() && !N->hasAnyUseOfValue(Value: 1);
  if (Subtarget->hasMADIntraFwdBug())
    Opc = Signed ? AMDGPU::V_MAD_I64_I32_gfx11_e64
                 : AMDGPU::V_MAD_U64_U32_gfx11_e64;
  else if (UseNoCarry)
    Opc = Signed ? AMDGPU::V_MAD_NC_I64_I32_e64 : AMDGPU::V_MAD_NC_U64_U32_e64;
  else
    Opc = Signed ? AMDGPU::V_MAD_I64_I32_e64 : AMDGPU::V_MAD_U64_U32_e64;

  SDValue Clamp = CurDAG->getTargetConstant(Val: 0, DL: SL, VT: MVT::i1);
  SDValue Ops[] = { N->getOperand(Num: 0), N->getOperand(Num: 1), N->getOperand(Num: 2),
                    Clamp };

  if (UseNoCarry) {
    // The no-carry variant has a single i64 result; rewire result 0 and
    // discard the dead original node (its carry result is unused).
    MachineSDNode *Mad = CurDAG->getMachineNode(Opcode: Opc, dl: SL, VT: MVT::i64, Ops);
    ReplaceUses(F: SDValue(N, 0), T: SDValue(Mad, 0));
    CurDAG->RemoveDeadNode(N);
    return;
  }

  CurDAG->SelectNodeTo(N, MachineOpc: Opc, VTs: N->getVTList(), Ops);
}
1237
// We need to handle this here because tablegen doesn't support matching
// instructions with multiple outputs.
// Selects [SU]MUL_LOHI as a 64-bit MAD with a zero addend, then splits the
// 64-bit product into the lo/hi 32-bit results via subregister extracts.
void AMDGPUDAGToDAGISel::SelectMUL_LOHI(SDNode *N) {
  SDLoc SL(N);
  bool Signed = N->getOpcode() == ISD::SMUL_LOHI;
  SDVTList VTList;
  unsigned Opc;
  if (Subtarget->hasMadU64U32NoCarry()) {
    VTList = CurDAG->getVTList(VT: MVT::i64);
    Opc = Signed ? AMDGPU::V_MAD_NC_I64_I32_e64 : AMDGPU::V_MAD_NC_U64_U32_e64;
  } else {
    // Carry-producing forms also define an i1 carry result, ignored here.
    VTList = CurDAG->getVTList(VT1: MVT::i64, VT2: MVT::i1);
    if (Subtarget->hasMADIntraFwdBug()) {
      Opc = Signed ? AMDGPU::V_MAD_I64_I32_gfx11_e64
                   : AMDGPU::V_MAD_U64_U32_gfx11_e64;
    } else {
      Opc = Signed ? AMDGPU::V_MAD_I64_I32_e64 : AMDGPU::V_MAD_U64_U32_e64;
    }
  }

  // mul_lohi(a, b) == mad(a, b, 0) viewed as a single 64-bit value.
  SDValue Zero = CurDAG->getTargetConstant(Val: 0, DL: SL, VT: MVT::i64);
  SDValue Clamp = CurDAG->getTargetConstant(Val: 0, DL: SL, VT: MVT::i1);
  SDValue Ops[] = {N->getOperand(Num: 0), N->getOperand(Num: 1), Zero, Clamp};
  SDNode *Mad = CurDAG->getMachineNode(Opcode: Opc, dl: SL, VTs: VTList, Ops);
  // Only materialize the subregister copies for results that are used.
  if (!SDValue(N, 0).use_empty()) {
    SDValue Sub0 = CurDAG->getTargetConstant(Val: AMDGPU::sub0, DL: SL, VT: MVT::i32);
    SDNode *Lo = CurDAG->getMachineNode(Opcode: TargetOpcode::EXTRACT_SUBREG, dl: SL,
                                        VT: MVT::i32, Op1: SDValue(Mad, 0), Op2: Sub0);
    ReplaceUses(F: SDValue(N, 0), T: SDValue(Lo, 0));
  }
  if (!SDValue(N, 1).use_empty()) {
    SDValue Sub1 = CurDAG->getTargetConstant(Val: AMDGPU::sub1, DL: SL, VT: MVT::i32);
    SDNode *Hi = CurDAG->getMachineNode(Opcode: TargetOpcode::EXTRACT_SUBREG, dl: SL,
                                        VT: MVT::i32, Op1: SDValue(Mad, 0), Op2: Sub1);
    ReplaceUses(F: SDValue(N, 1), T: SDValue(Hi, 0));
  }
  CurDAG->RemoveDeadNode(N);
}
1276
1277bool AMDGPUDAGToDAGISel::isDSOffsetLegal(SDValue Base, unsigned Offset) const {
1278 if (!isUInt<16>(x: Offset))
1279 return false;
1280
1281 if (!Base || Subtarget->hasUsableDSOffset() ||
1282 Subtarget->unsafeDSOffsetFoldingEnabled())
1283 return true;
1284
1285 // On Southern Islands instruction with a negative base value and an offset
1286 // don't seem to work.
1287 return CurDAG->SignBitIsZero(Op: Base);
1288}
1289
// Match a DS address as (base, uimm16 byte offset). Handles (add base, C),
// (sub C, x) rewritten as (add (sub 0, x), C), a bare constant address
// (materialized over a shared zero base to enable read2/write2 merging),
// and the trivial pass-through case. Always succeeds.
bool AMDGPUDAGToDAGISel::SelectDS1Addr1Offset(SDValue Addr, SDValue &Base,
                                              SDValue &Offset) const {
  SDLoc DL(Addr);
  if (CurDAG->isBaseWithConstantOffset(Op: Addr)) {
    SDValue N0 = Addr.getOperand(i: 0);
    SDValue N1 = Addr.getOperand(i: 1);
    ConstantSDNode *C1 = cast<ConstantSDNode>(Val&: N1);
    if (isDSOffsetLegal(Base: N0, Offset: C1->getSExtValue())) {
      // (add n0, c0)
      Base = N0;
      Offset = CurDAG->getTargetConstant(Val: C1->getZExtValue(), DL, VT: MVT::i16);
      return true;
    }
  } else if (Addr.getOpcode() == ISD::SUB) {
    // sub C, x -> add (sub 0, x), C
    if (const ConstantSDNode *C = dyn_cast<ConstantSDNode>(Val: Addr.getOperand(i: 0))) {
      int64_t ByteOffset = C->getSExtValue();
      if (isDSOffsetLegal(Base: SDValue(), Offset: ByteOffset)) {
        SDValue Zero = CurDAG->getTargetConstant(Val: 0, DL, VT: MVT::i32);

        // XXX - This is kind of hacky. Create a dummy sub node so we can check
        // the known bits in isDSOffsetLegal. We need to emit the selected node
        // here, so this is thrown away.
        SDValue Sub = CurDAG->getNode(Opcode: ISD::SUB, DL, VT: MVT::i32,
                                      N1: Zero, N2: Addr.getOperand(i: 1));

        if (isDSOffsetLegal(Base: Sub, Offset: ByteOffset)) {
          SmallVector<SDValue, 3> Opnds;
          Opnds.push_back(Elt: Zero);
          Opnds.push_back(Elt: Addr.getOperand(i: 1));

          // FIXME: Select to VOP3 version for with-carry.
          unsigned SubOp = AMDGPU::V_SUB_CO_U32_e32;
          if (Subtarget->hasAddNoCarryInsts()) {
            SubOp = AMDGPU::V_SUB_U32_e64;
            Opnds.push_back(
                Elt: CurDAG->getTargetConstant(Val: 0, DL: {}, VT: MVT::i1)); // clamp bit
          }

          // Emit the negated base directly as a machine node.
          MachineSDNode *MachineSub =
              CurDAG->getMachineNode(Opcode: SubOp, dl: DL, VT: MVT::i32, Ops: Opnds);

          Base = SDValue(MachineSub, 0);
          Offset = CurDAG->getTargetConstant(Val: ByteOffset, DL, VT: MVT::i16);
          return true;
        }
      }
    }
  } else if (const ConstantSDNode *CAddr = dyn_cast<ConstantSDNode>(Val&: Addr)) {
    // If we have a constant address, prefer to put the constant into the
    // offset. This can save moves to load the constant address since multiple
    // operations can share the zero base address register, and enables merging
    // into read2 / write2 instructions.

    SDLoc DL(Addr);

    if (isDSOffsetLegal(Base: SDValue(), Offset: CAddr->getZExtValue())) {
      SDValue Zero = CurDAG->getTargetConstant(Val: 0, DL, VT: MVT::i32);
      MachineSDNode *MovZero = CurDAG->getMachineNode(Opcode: AMDGPU::V_MOV_B32_e32,
                                                      dl: DL, VT: MVT::i32, Op1: Zero);
      Base = SDValue(MovZero, 0);
      Offset = CurDAG->getTargetConstant(Val: CAddr->getZExtValue(), DL, VT: MVT::i16);
      return true;
    }
  }

  // default case
  Base = Addr;
  Offset = CurDAG->getTargetConstant(Val: 0, DL: SDLoc(Addr), VT: MVT::i16);
  return true;
}
1361
1362bool AMDGPUDAGToDAGISel::isDSOffset2Legal(SDValue Base, unsigned Offset0,
1363 unsigned Offset1,
1364 unsigned Size) const {
1365 if (Offset0 % Size != 0 || Offset1 % Size != 0)
1366 return false;
1367 if (!isUInt<8>(x: Offset0 / Size) || !isUInt<8>(x: Offset1 / Size))
1368 return false;
1369
1370 if (!Base || Subtarget->hasUsableDSOffset() ||
1371 Subtarget->unsafeDSOffsetFoldingEnabled())
1372 return true;
1373
1374 // On Southern Islands instruction with a negative base value and an offset
1375 // don't seem to work.
1376 return CurDAG->SignBitIsZero(Op: Base);
1377}
1378
1379// Return whether the operation has NoUnsignedWrap property.
1380static bool isNoUnsignedWrap(SDValue Addr) {
1381 return (Addr.getOpcode() == ISD::ADD &&
1382 Addr->getFlags().hasNoUnsignedWrap()) ||
1383 Addr->getOpcode() == ISD::OR;
1384}
1385
// Check that the base address of flat scratch load/store in the form of `base +
// offset` is legal to be put in SGPR/VGPR (i.e. unsigned per hardware
// requirement). We always treat the first operand as the base address here.
bool AMDGPUDAGToDAGISel::isFlatScratchBaseLegal(SDValue Addr) const {
  // A nuw add (or an `or`, which cannot carry) cannot wrap into a negative
  // apparent base.
  if (isNoUnsignedWrap(Addr))
    return true;

  // Starting with GFX12, VADDR and SADDR fields in VSCRATCH can use negative
  // values.
  if (Subtarget->hasSignedScratchOffsets())
    return true;

  auto LHS = Addr.getOperand(i: 0);
  auto RHS = Addr.getOperand(i: 1);

  // If the immediate offset is negative and within certain range, the base
  // address cannot also be negative. If the base is also negative, the sum
  // would be either negative or much larger than the valid range of scratch
  // memory a thread can access.
  ConstantSDNode *ImmOp = nullptr;
  if (Addr.getOpcode() == ISD::ADD && (ImmOp = dyn_cast<ConstantSDNode>(Val&: RHS))) {
    if (ImmOp->getSExtValue() < 0 && ImmOp->getSExtValue() > -0x40000000)
      return true;
  }

  // Otherwise the base must be provably non-negative.
  return CurDAG->SignBitIsZero(Op: LHS);
}
1413
1414// Check address value in SGPR/VGPR are legal for flat scratch in the form
1415// of: SGPR + VGPR.
1416bool AMDGPUDAGToDAGISel::isFlatScratchBaseLegalSV(SDValue Addr) const {
1417 if (isNoUnsignedWrap(Addr))
1418 return true;
1419
1420 // Starting with GFX12, VADDR and SADDR fields in VSCRATCH can use negative
1421 // values.
1422 if (Subtarget->hasSignedScratchOffsets())
1423 return true;
1424
1425 auto LHS = Addr.getOperand(i: 0);
1426 auto RHS = Addr.getOperand(i: 1);
1427 return CurDAG->SignBitIsZero(Op: RHS) && CurDAG->SignBitIsZero(Op: LHS);
1428}
1429
// Check address value in SGPR/VGPR are legal for flat scratch in the form
// of: SGPR + VGPR + Imm.
bool AMDGPUDAGToDAGISel::isFlatScratchBaseLegalSVImm(SDValue Addr) const {
  // Starting with GFX12, VADDR and SADDR fields in VSCRATCH can use negative
  // values.
  if (AMDGPU::isGFX12Plus(STI: *Subtarget))
    return true;

  // Addr is ((LHS + RHS) + Imm); Base is the inner SGPR+VGPR add.
  auto Base = Addr.getOperand(i: 0);
  auto *RHSImm = cast<ConstantSDNode>(Val: Addr.getOperand(i: 1));
  // If the immediate offset is negative and within certain range, the base
  // address cannot also be negative. If the base is also negative, the sum
  // would be either negative or much larger than the valid range of scratch
  // memory a thread can access.
  if (isNoUnsignedWrap(Addr: Base) &&
      (isNoUnsignedWrap(Addr) ||
       (RHSImm->getSExtValue() < 0 && RHSImm->getSExtValue() > -0x40000000)))
    return true;

  // Otherwise both inner addends must be provably non-negative.
  auto LHS = Base.getOperand(i: 0);
  auto RHS = Base.getOperand(i: 1);
  return CurDAG->SignBitIsZero(Op: RHS) && CurDAG->SignBitIsZero(Op: LHS);
}
1453
// TODO: If offset is too big, put low 16-bit into offset.
// Match a 4-byte-element DS read2/write2 address (e.g. ds_read2_b32).
bool AMDGPUDAGToDAGISel::SelectDS64Bit4ByteAligned(SDValue Addr, SDValue &Base,
                                                   SDValue &Offset0,
                                                   SDValue &Offset1) const {
  return SelectDSReadWrite2(Ptr: Addr, Base, Offset0, Offset1, Size: 4);
}
1460
// Match an 8-byte-element DS read2/write2 address (e.g. ds_read2_b64).
bool AMDGPUDAGToDAGISel::SelectDS128Bit8ByteAligned(SDValue Addr, SDValue &Base,
                                                    SDValue &Offset0,
                                                    SDValue &Offset1) const {
  return SelectDSReadWrite2(Ptr: Addr, Base, Offset0, Offset1, Size: 8);
}
1466
// Shared matcher for DS read2/write2 addressing: produces a base plus two
// element-scaled 8-bit offsets (offset0 = C/Size, offset1 = offset0 + 1).
// Mirrors the cases of SelectDS1Addr1Offset and always succeeds.
bool AMDGPUDAGToDAGISel::SelectDSReadWrite2(SDValue Addr, SDValue &Base,
                                            SDValue &Offset0, SDValue &Offset1,
                                            unsigned Size) const {
  SDLoc DL(Addr);

  if (CurDAG->isBaseWithConstantOffset(Op: Addr)) {
    SDValue N0 = Addr.getOperand(i: 0);
    SDValue N1 = Addr.getOperand(i: 1);
    ConstantSDNode *C1 = cast<ConstantSDNode>(Val&: N1);
    // Second element sits one Size stride above the first.
    unsigned OffsetValue0 = C1->getZExtValue();
    unsigned OffsetValue1 = OffsetValue0 + Size;

    // (add n0, c0)
    if (isDSOffset2Legal(Base: N0, Offset0: OffsetValue0, Offset1: OffsetValue1, Size)) {
      Base = N0;
      Offset0 = CurDAG->getTargetConstant(Val: OffsetValue0 / Size, DL, VT: MVT::i32);
      Offset1 = CurDAG->getTargetConstant(Val: OffsetValue1 / Size, DL, VT: MVT::i32);
      return true;
    }
  } else if (Addr.getOpcode() == ISD::SUB) {
    // sub C, x -> add (sub 0, x), C
    if (const ConstantSDNode *C =
            dyn_cast<ConstantSDNode>(Val: Addr.getOperand(i: 0))) {
      unsigned OffsetValue0 = C->getZExtValue();
      unsigned OffsetValue1 = OffsetValue0 + Size;

      if (isDSOffset2Legal(Base: SDValue(), Offset0: OffsetValue0, Offset1: OffsetValue1, Size)) {
        SDLoc DL(Addr);
        SDValue Zero = CurDAG->getTargetConstant(Val: 0, DL, VT: MVT::i32);

        // XXX - This is kind of hacky. Create a dummy sub node so we can check
        // the known bits in isDSOffsetLegal. We need to emit the selected node
        // here, so this is thrown away.
        SDValue Sub =
            CurDAG->getNode(Opcode: ISD::SUB, DL, VT: MVT::i32, N1: Zero, N2: Addr.getOperand(i: 1));

        if (isDSOffset2Legal(Base: Sub, Offset0: OffsetValue0, Offset1: OffsetValue1, Size)) {
          SmallVector<SDValue, 3> Opnds;
          Opnds.push_back(Elt: Zero);
          Opnds.push_back(Elt: Addr.getOperand(i: 1));
          unsigned SubOp = AMDGPU::V_SUB_CO_U32_e32;
          if (Subtarget->hasAddNoCarryInsts()) {
            SubOp = AMDGPU::V_SUB_U32_e64;
            Opnds.push_back(
                Elt: CurDAG->getTargetConstant(Val: 0, DL: {}, VT: MVT::i1)); // clamp bit
          }

          // NOTE(review): the result VT here is sized by the access
          // (i32/i64 via Size * 8), unlike the MVT::i32 used for the same
          // trick in SelectDS1Addr1Offset — confirm this is intentional.
          MachineSDNode *MachineSub = CurDAG->getMachineNode(
              Opcode: SubOp, dl: DL, VT: MVT::getIntegerVT(BitWidth: Size * 8), Ops: Opnds);

          Base = SDValue(MachineSub, 0);
          Offset0 =
              CurDAG->getTargetConstant(Val: OffsetValue0 / Size, DL, VT: MVT::i32);
          Offset1 =
              CurDAG->getTargetConstant(Val: OffsetValue1 / Size, DL, VT: MVT::i32);
          return true;
        }
      }
    }
  } else if (const ConstantSDNode *CAddr = dyn_cast<ConstantSDNode>(Val&: Addr)) {
    // Constant address: fold it entirely into the offsets over a shared
    // materialized zero base.
    unsigned OffsetValue0 = CAddr->getZExtValue();
    unsigned OffsetValue1 = OffsetValue0 + Size;

    if (isDSOffset2Legal(Base: SDValue(), Offset0: OffsetValue0, Offset1: OffsetValue1, Size)) {
      SDValue Zero = CurDAG->getTargetConstant(Val: 0, DL, VT: MVT::i32);
      MachineSDNode *MovZero =
          CurDAG->getMachineNode(Opcode: AMDGPU::V_MOV_B32_e32, dl: DL, VT: MVT::i32, Op1: Zero);
      Base = SDValue(MovZero, 0);
      Offset0 = CurDAG->getTargetConstant(Val: OffsetValue0 / Size, DL, VT: MVT::i32);
      Offset1 = CurDAG->getTargetConstant(Val: OffsetValue1 / Size, DL, VT: MVT::i32);
      return true;
    }
  }

  // default case

  Base = Addr;
  Offset0 = CurDAG->getTargetConstant(Val: 0, DL, VT: MVT::i32);
  Offset1 = CurDAG->getTargetConstant(Val: 1, DL, VT: MVT::i32);
  return true;
}
1548
// Decompose a MUBUF address into pointer, vaddr, soffset, immediate offset
// and the offen/idxen/addr64 mode bits. The divergent portion of the address
// (if any) becomes vaddr with addr64 set; the uniform portion becomes the
// resource pointer; an immediate too large for the offset field is
// materialized into soffset.
bool AMDGPUDAGToDAGISel::SelectMUBUF(SDValue Addr, SDValue &Ptr, SDValue &VAddr,
                                     SDValue &SOffset, SDValue &Offset,
                                     SDValue &Offen, SDValue &Idxen,
                                     SDValue &Addr64) const {
  // Subtarget prefers to use flat instruction
  // FIXME: This should be a pattern predicate and not reach here
  if (Subtarget->useFlatForGlobal())
    return false;

  SDLoc DL(Addr);

  Idxen = CurDAG->getTargetConstant(Val: 0, DL, VT: MVT::i1);
  Offen = CurDAG->getTargetConstant(Val: 0, DL, VT: MVT::i1);
  Addr64 = CurDAG->getTargetConstant(Val: 0, DL, VT: MVT::i1);
  // Subtargets with restricted soffset need the null SGPR, not a literal 0.
  SOffset = Subtarget->hasRestrictedSOffset()
                ? CurDAG->getRegister(Reg: AMDGPU::SGPR_NULL, VT: MVT::i32)
                : CurDAG->getTargetConstant(Val: 0, DL, VT: MVT::i32);

  // Peel off a constant offset if it fits in 32 unsigned bits.
  ConstantSDNode *C1 = nullptr;
  SDValue N0 = Addr;
  if (CurDAG->isBaseWithConstantOffset(Op: Addr)) {
    C1 = cast<ConstantSDNode>(Val: Addr.getOperand(i: 1));
    if (isUInt<32>(x: C1->getZExtValue()))
      N0 = Addr.getOperand(i: 0);
    else
      C1 = nullptr;
  }

  if (N0->isAnyAdd()) {
    // (add N2, N3) -> addr64, or
    // (add (add N2, N3), C1) -> addr64
    SDValue N2 = N0.getOperand(i: 0);
    SDValue N3 = N0.getOperand(i: 1);
    Addr64 = CurDAG->getTargetConstant(Val: 1, DL, VT: MVT::i1);

    if (N2->isDivergent()) {
      if (N3->isDivergent()) {
        // Both N2 and N3 are divergent. Use N0 (the result of the add) as the
        // addr64, and construct the resource from a 0 address.
        Ptr = SDValue(buildSMovImm64(DL, Imm: 0, VT: MVT::v2i32), 0);
        VAddr = N0;
      } else {
        // N2 is divergent, N3 is not.
        Ptr = N3;
        VAddr = N2;
      }
    } else {
      // N2 is not divergent.
      Ptr = N2;
      VAddr = N3;
    }
    Offset = CurDAG->getTargetConstant(Val: 0, DL, VT: MVT::i32);
  } else if (N0->isDivergent()) {
    // N0 is divergent. Use it as the addr64, and construct the resource from a
    // 0 address.
    Ptr = SDValue(buildSMovImm64(DL, Imm: 0, VT: MVT::v2i32), 0);
    VAddr = N0;
    Addr64 = CurDAG->getTargetConstant(Val: 1, DL, VT: MVT::i1);
  } else {
    // N0 -> offset, or
    // (N0 + C1) -> offset
    VAddr = CurDAG->getTargetConstant(Val: 0, DL, VT: MVT::i32);
    Ptr = N0;
  }

  if (!C1) {
    // No offset.
    Offset = CurDAG->getTargetConstant(Val: 0, DL, VT: MVT::i32);
    return true;
  }

  const SIInstrInfo *TII = Subtarget->getInstrInfo();
  if (TII->isLegalMUBUFImmOffset(Imm: C1->getZExtValue())) {
    // Legal offset for instruction.
    Offset = CurDAG->getTargetConstant(Val: C1->getZExtValue(), DL, VT: MVT::i32);
    return true;
  }

  // Illegal offset, store it in soffset.
  Offset = CurDAG->getTargetConstant(Val: 0, DL, VT: MVT::i32);
  SOffset =
      SDValue(CurDAG->getMachineNode(
                  Opcode: AMDGPU::S_MOV_B32, dl: DL, VT: MVT::i32,
                  Op1: CurDAG->getTargetConstant(Val: C1->getZExtValue(), DL, VT: MVT::i32)),
              0);
  return true;
}
1636
// Match a MUBUF addr64-mode address. Only succeeds when SelectMUBUF chose
// addr64 mode, in which case the resource descriptor is built by wrapping
// the pointer via SITargetLowering::wrapAddr64Rsrc.
bool AMDGPUDAGToDAGISel::SelectMUBUFAddr64(SDValue Addr, SDValue &SRsrc,
                                           SDValue &VAddr, SDValue &SOffset,
                                           SDValue &Offset) const {
  SDValue Ptr, Offen, Idxen, Addr64;

  // addr64 bit was removed for volcanic islands.
  // FIXME: This should be a pattern predicate and not reach here
  if (!Subtarget->hasAddr64())
    return false;

  if (!SelectMUBUF(Addr, Ptr, VAddr, SOffset, Offset, Offen, Idxen, Addr64))
    return false;

  ConstantSDNode *C = cast<ConstantSDNode>(Val&: Addr64);
  if (C->getSExtValue()) {
    SDLoc DL(Addr);

    const SITargetLowering& Lowering =
      *static_cast<const SITargetLowering*>(getTargetLowering());

    // Build a full resource descriptor around the 64-bit pointer.
    SRsrc = SDValue(Lowering.wrapAddr64Rsrc(DAG&: *CurDAG, DL, Ptr), 0);
    return true;
  }

  return false;
}
1663
1664std::pair<SDValue, SDValue> AMDGPUDAGToDAGISel::foldFrameIndex(SDValue N) const {
1665 SDLoc DL(N);
1666
1667 auto *FI = dyn_cast<FrameIndexSDNode>(Val&: N);
1668 SDValue TFI =
1669 FI ? CurDAG->getTargetFrameIndex(FI: FI->getIndex(), VT: FI->getValueType(ResNo: 0)) : N;
1670
1671 // We rebase the base address into an absolute stack address and hence
1672 // use constant 0 for soffset. This value must be retained until
1673 // frame elimination and eliminateFrameIndex will choose the appropriate
1674 // frame register if need be.
1675 return std::pair(TFI, CurDAG->getTargetConstant(Val: 0, DL, VT: MVT::i32));
1676}
1677
// Match a private (scratch) MUBUF address in offen form: scratch rsrc, a
// VGPR address, an SGPR soffset and an immediate offset. Always succeeds;
// the fallback puts the whole address into vaddr with a zero offset.
bool AMDGPUDAGToDAGISel::SelectMUBUFScratchOffen(SDNode *Parent,
                                                 SDValue Addr, SDValue &Rsrc,
                                                 SDValue &VAddr, SDValue &SOffset,
                                                 SDValue &ImmOffset) const {

  SDLoc DL(Addr);
  MachineFunction &MF = CurDAG->getMachineFunction();
  const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();

  Rsrc = CurDAG->getRegister(Reg: Info->getScratchRSrcReg(), VT: MVT::v4i32);

  if (ConstantSDNode *CAddr = dyn_cast<ConstantSDNode>(Val&: Addr)) {
    int64_t Imm = CAddr->getSExtValue();
    const int64_t NullPtr =
        AMDGPU::getNullPointerValue(AS: AMDGPUAS::PRIVATE_ADDRESS);
    // Don't fold null pointer.
    if (Imm != NullPtr) {
      // Split the constant: high bits go into a materialized vaddr, the low
      // bits (within the legal immediate range) into the offset field.
      const uint32_t MaxOffset = SIInstrInfo::getMaxMUBUFImmOffset(ST: *Subtarget);
      SDValue HighBits =
          CurDAG->getTargetConstant(Val: Imm & ~MaxOffset, DL, VT: MVT::i32);
      MachineSDNode *MovHighBits = CurDAG->getMachineNode(
          Opcode: AMDGPU::V_MOV_B32_e32, dl: DL, VT: MVT::i32, Op1: HighBits);
      VAddr = SDValue(MovHighBits, 0);

      SOffset = CurDAG->getTargetConstant(Val: 0, DL, VT: MVT::i32);
      ImmOffset = CurDAG->getTargetConstant(Val: Imm & MaxOffset, DL, VT: MVT::i32);
      return true;
    }
  }

  if (CurDAG->isBaseWithConstantOffset(Op: Addr)) {
    // (add n0, c1)

    SDValue N0 = Addr.getOperand(i: 0);
    uint64_t C1 = Addr.getConstantOperandVal(i: 1);

    // Offsets in vaddr must be positive if range checking is enabled.
    //
    // The total computation of vaddr + soffset + offset must not overflow. If
    // vaddr is negative, even if offset is 0 the sgpr offset add will end up
    // overflowing.
    //
    // Prior to gfx9, MUBUF instructions with the vaddr offset enabled would
    // always perform a range check. If a negative vaddr base index was used,
    // this would fail the range check. The overall address computation would
    // compute a valid address, but this doesn't happen due to the range
    // check. For out-of-bounds MUBUF loads, a 0 is returned.
    //
    // Therefore it should be safe to fold any VGPR offset on gfx9 into the
    // MUBUF vaddr, but not on older subtargets which can only do this if the
    // sign bit is known 0.
    const SIInstrInfo *TII = Subtarget->getInstrInfo();
    if (TII->isLegalMUBUFImmOffset(Imm: C1) &&
        (!Subtarget->privateMemoryResourceIsRangeChecked() ||
         CurDAG->SignBitIsZero(Op: N0))) {
      std::tie(args&: VAddr, args&: SOffset) = foldFrameIndex(N: N0);
      ImmOffset = CurDAG->getTargetConstant(Val: C1, DL, VT: MVT::i32);
      return true;
    }
  }

  // (node)
  std::tie(args&: VAddr, args&: SOffset) = foldFrameIndex(N: Addr);
  ImmOffset = CurDAG->getTargetConstant(Val: 0, DL, VT: MVT::i32);
  return true;
}
1744
1745static bool IsCopyFromSGPR(const SIRegisterInfo &TRI, SDValue Val) {
1746 if (Val.getOpcode() != ISD::CopyFromReg)
1747 return false;
1748 auto Reg = cast<RegisterSDNode>(Val: Val.getOperand(i: 1))->getReg();
1749 if (!Reg.isPhysical())
1750 return false;
1751 const auto *RC = TRI.getPhysRegBaseClass(Reg);
1752 return RC && TRI.isSGPRClass(RC);
1753}
1754
// Match a MUBUF scratch access that needs no VGPR address component: the
// address is either a plain SGPR, SGPR + legal immediate, or a bare legal
// immediate. SRsrc is always the function's scratch resource descriptor.
bool AMDGPUDAGToDAGISel::SelectMUBUFScratchOffset(SDNode *Parent,
                                                  SDValue Addr,
                                                  SDValue &SRsrc,
                                                  SDValue &SOffset,
                                                  SDValue &Offset) const {
  const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
  const SIInstrInfo *TII = Subtarget->getInstrInfo();
  MachineFunction &MF = CurDAG->getMachineFunction();
  const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
  SDLoc DL(Addr);

  // CopyFromReg <sgpr>
  // The whole address is an SGPR: use it directly as SOffset with a zero
  // immediate offset.
  if (IsCopyFromSGPR(TRI: *TRI, Val: Addr)) {
    SRsrc = CurDAG->getRegister(Reg: Info->getScratchRSrcReg(), VT: MVT::v4i32);
    SOffset = Addr;
    Offset = CurDAG->getTargetConstant(Val: 0, DL, VT: MVT::i32);
    return true;
  }

  ConstantSDNode *CAddr;
  if (Addr.getOpcode() == ISD::ADD) {
    // Add (CopyFromReg <sgpr>) <constant>
    // The constant must be a legal MUBUF immediate and the other operand an
    // SGPR copy; otherwise this form does not apply.
    CAddr = dyn_cast<ConstantSDNode>(Val: Addr.getOperand(i: 1));
    if (!CAddr || !TII->isLegalMUBUFImmOffset(Imm: CAddr->getZExtValue()))
      return false;
    if (!IsCopyFromSGPR(TRI: *TRI, Val: Addr.getOperand(i: 0)))
      return false;

    SOffset = Addr.getOperand(i: 0);
  } else if ((CAddr = dyn_cast<ConstantSDNode>(Val&: Addr)) &&
             TII->isLegalMUBUFImmOffset(Imm: CAddr->getZExtValue())) {
    // <constant>
    // A bare constant address: zero SGPR offset, immediate carries the value.
    SOffset = CurDAG->getTargetConstant(Val: 0, DL, VT: MVT::i32);
  } else {
    return false;
  }

  SRsrc = CurDAG->getRegister(Reg: Info->getScratchRSrcReg(), VT: MVT::v4i32);

  Offset = CurDAG->getTargetConstant(Val: CAddr->getZExtValue(), DL, VT: MVT::i32);
  return true;
}
1797
// Match a MUBUF access in the pure-offset addressing mode (Offen, Idxen and
// Addr64 all clear) and build a full resource descriptor from the pointer
// returned by SelectMUBUF.
bool AMDGPUDAGToDAGISel::SelectMUBUFOffset(SDValue Addr, SDValue &SRsrc,
                                           SDValue &SOffset, SDValue &Offset
                                           ) const {
  SDValue Ptr, VAddr, Offen, Idxen, Addr64;
  const SIInstrInfo *TII = Subtarget->getInstrInfo();

  if (!SelectMUBUF(Addr, Ptr, VAddr, SOffset, Offset, Offen, Idxen, Addr64))
    return false;

  // Only accept the match when no VGPR-based address component is present;
  // other addressing modes are handled by different patterns.
  if (!cast<ConstantSDNode>(Val&: Offen)->getSExtValue() &&
      !cast<ConstantSDNode>(Val&: Idxen)->getSExtValue() &&
      !cast<ConstantSDNode>(Val&: Addr64)->getSExtValue()) {
    // Rsrc words 2-3: default data format plus an all-ones 32-bit size field.
    uint64_t Rsrc = TII->getDefaultRsrcDataFormat() |
                    maskTrailingOnes<uint64_t>(N: 32); // Size
    SDLoc DL(Addr);

    const SITargetLowering& Lowering =
        *static_cast<const SITargetLowering*>(getTargetLowering());

    SRsrc = SDValue(Lowering.buildRSRC(DAG&: *CurDAG, DL, Ptr, RsrcDword1: 0, RsrcDword2And3: Rsrc), 0);
    return true;
  }
  return false;
}
1822
1823bool AMDGPUDAGToDAGISel::SelectBUFSOffset(SDValue ByteOffsetNode,
1824 SDValue &SOffset) const {
1825 if (Subtarget->hasRestrictedSOffset() && isNullConstant(V: ByteOffsetNode)) {
1826 SOffset = CurDAG->getRegister(Reg: AMDGPU::SGPR_NULL, VT: MVT::i32);
1827 return true;
1828 }
1829
1830 SOffset = ByteOffsetNode;
1831 return true;
1832}
1833
1834// Find a load or store from corresponding pattern root.
1835// Roots may be build_vector, bitconvert or their combinations.
1836static MemSDNode* findMemSDNode(SDNode *N) {
1837 N = AMDGPUTargetLowering::stripBitcast(Val: SDValue(N,0)).getNode();
1838 if (MemSDNode *MN = dyn_cast<MemSDNode>(Val: N))
1839 return MN;
1840 assert(isa<BuildVectorSDNode>(N));
1841 for (SDValue V : N->op_values())
1842 if (MemSDNode *MN =
1843 dyn_cast<MemSDNode>(Val: AMDGPUTargetLowering::stripBitcast(Val: V)))
1844 return MN;
1845 llvm_unreachable("cannot find MemSDNode in the pattern!");
1846}
1847
// Select the (vaddr, immediate offset) operands for a flat-family access of
// kind \p FlatVariant. Splits a constant offset off \p Addr when the target
// encoding allows it; if the offset does not fit, the encodable low part is
// kept as the immediate and the remainder is re-added to the address with
// VALU instructions. Always returns true.
bool AMDGPUDAGToDAGISel::SelectFlatOffsetImpl(SDNode *N, SDValue Addr,
                                              SDValue &VAddr, SDValue &Offset,
                                              uint64_t FlatVariant) const {
  int64_t OffsetVal = 0;

  unsigned AS = findMemSDNode(N)->getAddressSpace();

  // On affected subtargets, plain FLAT accesses to flat/global address
  // spaces must not use the offset field at all.
  bool CanHaveFlatSegmentOffsetBug =
      Subtarget->hasFlatSegmentOffsetBug() &&
      FlatVariant == SIInstrFlags::FLAT &&
      (AS == AMDGPUAS::FLAT_ADDRESS || AS == AMDGPUAS::GLOBAL_ADDRESS);

  if (Subtarget->hasFlatInstOffsets() && !CanHaveFlatSegmentOffsetBug) {
    SDValue N0, N1;
    if (isBaseWithConstantOffset64(Addr, LHS&: N0, RHS&: N1) &&
        (FlatVariant != SIInstrFlags::FlatScratch ||
         isFlatScratchBaseLegal(Addr))) {
      int64_t COffsetVal = cast<ConstantSDNode>(Val&: N1)->getSExtValue();

      // Adding the offset to the base address in a FLAT instruction must not
      // change the memory aperture in which the address falls. Therefore we can
      // only fold offsets from inbounds GEPs into FLAT instructions.
      bool IsInBounds =
          Addr.getOpcode() == ISD::PTRADD && Addr->getFlags().hasInBounds();
      if (COffsetVal == 0 || FlatVariant != SIInstrFlags::FLAT || IsInBounds) {
        const SIInstrInfo *TII = Subtarget->getInstrInfo();
        if (TII->isLegalFLATOffset(Offset: COffsetVal, AddrSpace: AS, FlatVariant)) {
          // The whole constant fits the encoding: use the base directly.
          Addr = N0;
          OffsetVal = COffsetVal;
        } else {
          // If the offset doesn't fit, put the low bits into the offset field
          // and add the rest.
          //
          // For a FLAT instruction the hardware decides whether to access
          // global/scratch/shared memory based on the high bits of vaddr,
          // ignoring the offset field, so we have to ensure that when we add
          // remainder to vaddr it still points into the same underlying object.
          // The easiest way to do that is to make sure that we split the offset
          // into two pieces that are both >= 0 or both <= 0.

          SDLoc DL(N);
          uint64_t RemainderOffset;

          std::tie(args&: OffsetVal, args&: RemainderOffset) =
              TII->splitFlatOffset(COffsetVal, AddrSpace: AS, FlatVariant);

          SDValue AddOffsetLo =
              getMaterializedScalarImm32(Val: Lo_32(Value: RemainderOffset), DL);
          SDValue Clamp = CurDAG->getTargetConstant(Val: 0, DL, VT: MVT::i1);

          if (Addr.getValueType().getSizeInBits() == 32) {
            // 32-bit address (scratch): a single VALU add is enough.
            SmallVector<SDValue, 3> Opnds;
            Opnds.push_back(Elt: N0);
            Opnds.push_back(Elt: AddOffsetLo);
            unsigned AddOp = AMDGPU::V_ADD_CO_U32_e32;
            if (Subtarget->hasAddNoCarryInsts()) {
              AddOp = AMDGPU::V_ADD_U32_e64;
              Opnds.push_back(Elt: Clamp);
            }
            Addr =
                SDValue(CurDAG->getMachineNode(Opcode: AddOp, dl: DL, VT: MVT::i32, Ops: Opnds), 0);
          } else {
            // TODO: Should this try to use a scalar add pseudo if the base
            // address is uniform and saddr is usable?
            // 64-bit address: split into halves, do an add with carry, and
            // reassemble the result with a REG_SEQUENCE.
            SDValue Sub0 =
                CurDAG->getTargetConstant(Val: AMDGPU::sub0, DL, VT: MVT::i32);
            SDValue Sub1 =
                CurDAG->getTargetConstant(Val: AMDGPU::sub1, DL, VT: MVT::i32);

            SDNode *N0Lo = CurDAG->getMachineNode(Opcode: TargetOpcode::EXTRACT_SUBREG,
                                                  dl: DL, VT: MVT::i32, Op1: N0, Op2: Sub0);
            SDNode *N0Hi = CurDAG->getMachineNode(Opcode: TargetOpcode::EXTRACT_SUBREG,
                                                  dl: DL, VT: MVT::i32, Op1: N0, Op2: Sub1);

            SDValue AddOffsetHi =
                getMaterializedScalarImm32(Val: Hi_32(Value: RemainderOffset), DL);

            SDVTList VTs = CurDAG->getVTList(VT1: MVT::i32, VT2: MVT::i1);

            SDNode *Add =
                CurDAG->getMachineNode(Opcode: AMDGPU::V_ADD_CO_U32_e64, dl: DL, VTs,
                                       Ops: {AddOffsetLo, SDValue(N0Lo, 0), Clamp});

            SDNode *Addc = CurDAG->getMachineNode(
                Opcode: AMDGPU::V_ADDC_U32_e64, dl: DL, VTs,
                Ops: {AddOffsetHi, SDValue(N0Hi, 0), SDValue(Add, 1), Clamp});

            SDValue RegSequenceArgs[] = {
                CurDAG->getTargetConstant(Val: AMDGPU::VReg_64RegClassID, DL,
                                          VT: MVT::i32),
                SDValue(Add, 0), Sub0, SDValue(Addc, 0), Sub1};

            Addr = SDValue(CurDAG->getMachineNode(Opcode: AMDGPU::REG_SEQUENCE, dl: DL,
                                                  VT: MVT::i64, Ops: RegSequenceArgs),
                           0);
          }
        }
      }
    }
  }

  VAddr = Addr;
  Offset = CurDAG->getSignedTargetConstant(Val: OffsetVal, DL: SDLoc(), VT: MVT::i32);
  return true;
}
1953
1954bool AMDGPUDAGToDAGISel::SelectFlatOffset(SDNode *N, SDValue Addr,
1955 SDValue &VAddr,
1956 SDValue &Offset) const {
1957 return SelectFlatOffsetImpl(N, Addr, VAddr, Offset, FlatVariant: SIInstrFlags::FLAT);
1958}
1959
1960bool AMDGPUDAGToDAGISel::SelectGlobalOffset(SDNode *N, SDValue Addr,
1961 SDValue &VAddr,
1962 SDValue &Offset) const {
1963 return SelectFlatOffsetImpl(N, Addr, VAddr, Offset, FlatVariant: SIInstrFlags::FlatGlobal);
1964}
1965
1966bool AMDGPUDAGToDAGISel::SelectScratchOffset(SDNode *N, SDValue Addr,
1967 SDValue &VAddr,
1968 SDValue &Offset) const {
1969 return SelectFlatOffsetImpl(N, Addr, VAddr, Offset,
1970 FlatVariant: SIInstrFlags::FlatScratch);
1971}
1972
1973// If this matches *_extend i32:x, return x
1974// Otherwise if the value is I32 returns x.
1975static SDValue matchExtFromI32orI32(SDValue Op, bool IsSigned,
1976 const SelectionDAG *DAG) {
1977 if (Op.getValueType() == MVT::i32)
1978 return Op;
1979
1980 if (Op.getOpcode() != (IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND) &&
1981 Op.getOpcode() != ISD::ANY_EXTEND &&
1982 !(DAG->SignBitIsZero(Op) &&
1983 Op.getOpcode() == (IsSigned ? ISD::ZERO_EXTEND : ISD::SIGN_EXTEND)))
1984 return SDValue();
1985
1986 SDValue ExtSrc = Op.getOperand(i: 0);
1987 return (ExtSrc.getValueType() == MVT::i32) ? ExtSrc : SDValue();
1988}
1989
// Match (64-bit SGPR base) + (zext vgpr offset) + sext(imm offset)
// or (64-bit SGPR base) + (sext vgpr offset) + sext(imm offset)
//
// On success fills in SAddr (scalar base), VOffset (32-bit VGPR offset) and,
// when \p NeedIOffset, Offset (immediate). \p ScaleOffset reports whether the
// VGPR offset was pre-scaled by the access size (see SelectScaleOffset).
bool AMDGPUDAGToDAGISel::SelectGlobalSAddr(SDNode *N, SDValue Addr,
                                           SDValue &SAddr, SDValue &VOffset,
                                           SDValue &Offset, bool &ScaleOffset,
                                           bool NeedIOffset) const {
  int64_t ImmOffset = 0;
  ScaleOffset = false;

  // Match the immediate offset first, which canonically is moved as low as
  // possible.

  SDValue LHS, RHS;
  if (isBaseWithConstantOffset64(Addr, LHS, RHS)) {
    int64_t COffsetVal = cast<ConstantSDNode>(Val&: RHS)->getSExtValue();
    const SIInstrInfo *TII = Subtarget->getInstrInfo();

    if (NeedIOffset &&
        TII->isLegalFLATOffset(Offset: COffsetVal, AddrSpace: AMDGPUAS::GLOBAL_ADDRESS,
                               FlatVariant: SIInstrFlags::FlatGlobal)) {
      // Encodable immediate: strip it off and keep matching on the base.
      Addr = LHS;
      ImmOffset = COffsetVal;
    } else if (!LHS->isDivergent()) {
      if (COffsetVal > 0) {
        SDLoc SL(N);
        // saddr + large_offset -> saddr +
        //                         (voffset = large_offset & ~MaxOffset) +
        //                         (large_offset & MaxOffset);
        int64_t SplitImmOffset = 0, RemainderOffset = COffsetVal;
        if (NeedIOffset) {
          std::tie(args&: SplitImmOffset, args&: RemainderOffset) = TII->splitFlatOffset(
              COffsetVal, AddrSpace: AMDGPUAS::GLOBAL_ADDRESS, FlatVariant: SIInstrFlags::FlatGlobal);
        }

        // The remainder must fit a 32-bit VGPR (signed or unsigned depending
        // on how the subtarget interprets the VGPR offset).
        if (Subtarget->hasSignedGVSOffset() ? isInt<32>(x: RemainderOffset)
                                            : isUInt<32>(x: RemainderOffset)) {
          SDNode *VMov = CurDAG->getMachineNode(
              Opcode: AMDGPU::V_MOV_B32_e32, dl: SL, VT: MVT::i32,
              Op1: CurDAG->getTargetConstant(Val: RemainderOffset, DL: SDLoc(), VT: MVT::i32));
          VOffset = SDValue(VMov, 0);
          SAddr = LHS;
          Offset = CurDAG->getTargetConstant(Val: SplitImmOffset, DL: SDLoc(), VT: MVT::i32);
          return true;
        }
      }

      // We are adding a 64 bit SGPR and a constant. If constant bus limit
      // is 1 we would need to perform 1 or 2 extra moves for each half of
      // the constant and it is better to do a scalar add and then issue a
      // single VALU instruction to materialize zero. Otherwise it is less
      // instructions to perform VALU adds with immediates or inline literals.
      unsigned NumLiterals =
          !TII->isInlineConstant(Imm: APInt(32, Lo_32(Value: COffsetVal))) +
          !TII->isInlineConstant(Imm: APInt(32, Hi_32(Value: COffsetVal)));
      if (Subtarget->getConstantBusLimit(Opcode: AMDGPU::V_ADD_U32_e64) > NumLiterals)
        return false;
    }
  }

  // Match the variable offset.
  if (Addr->isAnyAdd()) {
    LHS = Addr.getOperand(i: 0);

    if (!LHS->isDivergent()) {
      // add (i64 sgpr), (*_extend (i32 vgpr))
      RHS = Addr.getOperand(i: 1);
      ScaleOffset = SelectScaleOffset(N, Offset&: RHS, IsSigned: Subtarget->hasSignedGVSOffset());
      if (SDValue ExtRHS = matchExtFromI32orI32(
              Op: RHS, IsSigned: Subtarget->hasSignedGVSOffset(), DAG: CurDAG)) {
        SAddr = LHS;
        VOffset = ExtRHS;
      }
    }

    // Try the commuted form if the first attempt did not set SAddr.
    RHS = Addr.getOperand(i: 1);
    if (!SAddr && !RHS->isDivergent()) {
      // add (*_extend (i32 vgpr)), (i64 sgpr)
      ScaleOffset = SelectScaleOffset(N, Offset&: LHS, IsSigned: Subtarget->hasSignedGVSOffset());
      if (SDValue ExtLHS = matchExtFromI32orI32(
              Op: LHS, IsSigned: Subtarget->hasSignedGVSOffset(), DAG: CurDAG)) {
        SAddr = RHS;
        VOffset = ExtLHS;
      }
    }

    if (SAddr) {
      Offset = CurDAG->getSignedTargetConstant(Val: ImmOffset, DL: SDLoc(), VT: MVT::i32);
      return true;
    }
  }

  // Recognize a mad of a VGPR by the access size added to an SGPR base as a
  // pre-scaled VGPR offset (SCAL form).
  if (Subtarget->hasScaleOffset() &&
      (Addr.getOpcode() == (Subtarget->hasSignedGVSOffset()
                                ? AMDGPUISD::MAD_I64_I32
                                : AMDGPUISD::MAD_U64_U32) ||
       (Addr.getOpcode() == AMDGPUISD::MAD_U64_U32 &&
        CurDAG->SignBitIsZero(Op: Addr.getOperand(i: 0)))) &&
      Addr.getOperand(i: 0)->isDivergent() &&
      isa<ConstantSDNode>(Val: Addr.getOperand(i: 1)) &&
      !Addr.getOperand(i: 2)->isDivergent()) {
    // mad_u64_u32 (i32 vgpr), (i32 c), (i64 sgpr)
    unsigned Size =
        (unsigned)cast<MemSDNode>(Val: N)->getMemoryVT().getFixedSizeInBits() / 8;
    ScaleOffset = Addr.getConstantOperandVal(i: 1) == Size;
    if (ScaleOffset) {
      SAddr = Addr.getOperand(i: 2);
      VOffset = Addr.getOperand(i: 0);
      Offset = CurDAG->getTargetConstant(Val: ImmOffset, DL: SDLoc(), VT: MVT::i32);
      return true;
    }
  }

  // A divergent, undef, or constant address cannot serve as the SGPR base.
  if (Addr->isDivergent() || Addr.getOpcode() == ISD::UNDEF ||
      isa<ConstantSDNode>(Val: Addr))
    return false;

  // It's cheaper to materialize a single 32-bit zero for vaddr than the two
  // moves required to copy a 64-bit SGPR to VGPR.
  SAddr = Addr;
  SDNode *VMov =
      CurDAG->getMachineNode(Opcode: AMDGPU::V_MOV_B32_e32, dl: SDLoc(Addr), VT: MVT::i32,
                             Op1: CurDAG->getTargetConstant(Val: 0, DL: SDLoc(), VT: MVT::i32));
  VOffset = SDValue(VMov, 0);
  Offset = CurDAG->getSignedTargetConstant(Val: ImmOffset, DL: SDLoc(), VT: MVT::i32);
  return true;
}
2116
2117bool AMDGPUDAGToDAGISel::SelectGlobalSAddr(SDNode *N, SDValue Addr,
2118 SDValue &SAddr, SDValue &VOffset,
2119 SDValue &Offset,
2120 SDValue &CPol) const {
2121 bool ScaleOffset;
2122 if (!SelectGlobalSAddr(N, Addr, SAddr, VOffset, Offset, ScaleOffset))
2123 return false;
2124
2125 CPol = CurDAG->getTargetConstant(Val: ScaleOffset ? AMDGPU::CPol::SCAL : 0,
2126 DL: SDLoc(), VT: MVT::i32);
2127 return true;
2128}
2129
2130bool AMDGPUDAGToDAGISel::SelectGlobalSAddrCPol(SDNode *N, SDValue Addr,
2131 SDValue &SAddr, SDValue &VOffset,
2132 SDValue &Offset,
2133 SDValue &CPol) const {
2134 bool ScaleOffset;
2135 if (!SelectGlobalSAddr(N, Addr, SAddr, VOffset, Offset, ScaleOffset))
2136 return false;
2137
2138 // We are assuming CPol is always the last operand of the intrinsic.
2139 auto PassedCPol =
2140 N->getConstantOperandVal(Num: N->getNumOperands() - 1) & ~AMDGPU::CPol::SCAL;
2141 CPol = CurDAG->getTargetConstant(
2142 Val: (ScaleOffset ? AMDGPU::CPol::SCAL : 0) | PassedCPol, DL: SDLoc(), VT: MVT::i32);
2143 return true;
2144}
2145
2146bool AMDGPUDAGToDAGISel::SelectGlobalSAddrCPolM0(SDNode *N, SDValue Addr,
2147 SDValue &SAddr,
2148 SDValue &VOffset,
2149 SDValue &Offset,
2150 SDValue &CPol) const {
2151 bool ScaleOffset;
2152 if (!SelectGlobalSAddr(N, Addr, SAddr, VOffset, Offset, ScaleOffset))
2153 return false;
2154
2155 // We are assuming CPol is second from last operand of the intrinsic.
2156 auto PassedCPol =
2157 N->getConstantOperandVal(Num: N->getNumOperands() - 2) & ~AMDGPU::CPol::SCAL;
2158 CPol = CurDAG->getTargetConstant(
2159 Val: (ScaleOffset ? AMDGPU::CPol::SCAL : 0) | PassedCPol, DL: SDLoc(), VT: MVT::i32);
2160 return true;
2161}
2162
2163bool AMDGPUDAGToDAGISel::SelectGlobalSAddrGLC(SDNode *N, SDValue Addr,
2164 SDValue &SAddr, SDValue &VOffset,
2165 SDValue &Offset,
2166 SDValue &CPol) const {
2167 bool ScaleOffset;
2168 if (!SelectGlobalSAddr(N, Addr, SAddr, VOffset, Offset, ScaleOffset))
2169 return false;
2170
2171 unsigned CPolVal = (ScaleOffset ? AMDGPU::CPol::SCAL : 0) | AMDGPU::CPol::GLC;
2172 CPol = CurDAG->getTargetConstant(Val: CPolVal, DL: SDLoc(), VT: MVT::i32);
2173 return true;
2174}
2175
2176bool AMDGPUDAGToDAGISel::SelectGlobalSAddrNoIOffset(SDNode *N, SDValue Addr,
2177 SDValue &SAddr,
2178 SDValue &VOffset,
2179 SDValue &CPol) const {
2180 bool ScaleOffset;
2181 SDValue DummyOffset;
2182 if (!SelectGlobalSAddr(N, Addr, SAddr, VOffset, Offset&: DummyOffset, ScaleOffset,
2183 NeedIOffset: false))
2184 return false;
2185
2186 // We are assuming CPol is always the last operand of the intrinsic.
2187 auto PassedCPol =
2188 N->getConstantOperandVal(Num: N->getNumOperands() - 1) & ~AMDGPU::CPol::SCAL;
2189 CPol = CurDAG->getTargetConstant(
2190 Val: (ScaleOffset ? AMDGPU::CPol::SCAL : 0) | PassedCPol, DL: SDLoc(), VT: MVT::i32);
2191 return true;
2192}
2193
2194bool AMDGPUDAGToDAGISel::SelectGlobalSAddrNoIOffsetM0(SDNode *N, SDValue Addr,
2195 SDValue &SAddr,
2196 SDValue &VOffset,
2197 SDValue &CPol) const {
2198 bool ScaleOffset;
2199 SDValue DummyOffset;
2200 if (!SelectGlobalSAddr(N, Addr, SAddr, VOffset, Offset&: DummyOffset, ScaleOffset,
2201 NeedIOffset: false))
2202 return false;
2203
2204 // We are assuming CPol is second from last operand of the intrinsic.
2205 auto PassedCPol =
2206 N->getConstantOperandVal(Num: N->getNumOperands() - 2) & ~AMDGPU::CPol::SCAL;
2207 CPol = CurDAG->getTargetConstant(
2208 Val: (ScaleOffset ? AMDGPU::CPol::SCAL : 0) | PassedCPol, DL: SDLoc(), VT: MVT::i32);
2209 return true;
2210}
2211
2212static SDValue SelectSAddrFI(SelectionDAG *CurDAG, SDValue SAddr) {
2213 if (auto *FI = dyn_cast<FrameIndexSDNode>(Val&: SAddr)) {
2214 SAddr = CurDAG->getTargetFrameIndex(FI: FI->getIndex(), VT: FI->getValueType(ResNo: 0));
2215 } else if (SAddr.getOpcode() == ISD::ADD &&
2216 isa<FrameIndexSDNode>(Val: SAddr.getOperand(i: 0))) {
2217 // Materialize this into a scalar move for scalar address to avoid
2218 // readfirstlane.
2219 auto *FI = cast<FrameIndexSDNode>(Val: SAddr.getOperand(i: 0));
2220 SDValue TFI = CurDAG->getTargetFrameIndex(FI: FI->getIndex(),
2221 VT: FI->getValueType(ResNo: 0));
2222 SAddr = SDValue(CurDAG->getMachineNode(Opcode: AMDGPU::S_ADD_I32, dl: SDLoc(SAddr),
2223 VT: MVT::i32, Op1: TFI, Op2: SAddr.getOperand(i: 1)),
2224 0);
2225 }
2226
2227 return SAddr;
2228}
2229
// Match (32-bit SGPR base) + sext(imm offset)
// Used for scratch accesses addressed entirely by a scalar value. If the
// immediate does not fit the flat-scratch encoding, the remainder is folded
// into SAddr with a scalar add.
bool AMDGPUDAGToDAGISel::SelectScratchSAddr(SDNode *Parent, SDValue Addr,
                                            SDValue &SAddr,
                                            SDValue &Offset) const {
  // A divergent address cannot be used as a scalar base.
  if (Addr->isDivergent())
    return false;

  SDLoc DL(Addr);

  int64_t COffsetVal = 0;

  if (CurDAG->isBaseWithConstantOffset(Op: Addr) && isFlatScratchBaseLegal(Addr)) {
    COffsetVal = cast<ConstantSDNode>(Val: Addr.getOperand(i: 1))->getSExtValue();
    SAddr = Addr.getOperand(i: 0);
  } else {
    SAddr = Addr;
  }

  // Rewrite any frame index in the base into a target frame index (possibly
  // plus a scalar add).
  SAddr = SelectSAddrFI(CurDAG, SAddr);

  const SIInstrInfo *TII = Subtarget->getInstrInfo();

  if (!TII->isLegalFLATOffset(Offset: COffsetVal, AddrSpace: AMDGPUAS::PRIVATE_ADDRESS,
                              FlatVariant: SIInstrFlags::FlatScratch)) {
    // The offset does not fit the encoding: keep the encodable low part as
    // the immediate and add the remainder to SAddr with S_ADD_I32.
    int64_t SplitImmOffset, RemainderOffset;
    std::tie(args&: SplitImmOffset, args&: RemainderOffset) = TII->splitFlatOffset(
        COffsetVal, AddrSpace: AMDGPUAS::PRIVATE_ADDRESS, FlatVariant: SIInstrFlags::FlatScratch);

    COffsetVal = SplitImmOffset;

    SDValue AddOffset =
        SAddr.getOpcode() == ISD::TargetFrameIndex
            ? getMaterializedScalarImm32(Val: Lo_32(Value: RemainderOffset), DL)
            : CurDAG->getSignedTargetConstant(Val: RemainderOffset, DL, VT: MVT::i32);
    SAddr = SDValue(CurDAG->getMachineNode(Opcode: AMDGPU::S_ADD_I32, dl: DL, VT: MVT::i32,
                                           Op1: SAddr, Op2: AddOffset),
                    0);
  }

  Offset = CurDAG->getSignedTargetConstant(Val: COffsetVal, DL, VT: MVT::i32);

  return true;
}
2273
2274// Check whether the flat scratch SVS swizzle bug affects this access.
2275bool AMDGPUDAGToDAGISel::checkFlatScratchSVSSwizzleBug(
2276 SDValue VAddr, SDValue SAddr, uint64_t ImmOffset) const {
2277 if (!Subtarget->hasFlatScratchSVSSwizzleBug())
2278 return false;
2279
2280 // The bug affects the swizzling of SVS accesses if there is any carry out
2281 // from the two low order bits (i.e. from bit 1 into bit 2) when adding
2282 // voffset to (soffset + inst_offset).
2283 KnownBits VKnown = CurDAG->computeKnownBits(Op: VAddr);
2284 KnownBits SKnown =
2285 KnownBits::add(LHS: CurDAG->computeKnownBits(Op: SAddr),
2286 RHS: KnownBits::makeConstant(C: APInt(32, ImmOffset,
2287 /*isSigned=*/true)));
2288 uint64_t VMax = VKnown.getMaxValue().getZExtValue();
2289 uint64_t SMax = SKnown.getMaxValue().getZExtValue();
2290 return (VMax & 3) + (SMax & 3) >= 4;
2291}
2292
// Match the SVS form of a scratch access: a divergent VGPR offset plus a
// uniform SGPR base plus an immediate offset, with a cache-policy operand
// (SCAL only). Fails if the SVS swizzle bug would be triggered or the base
// is not legal for flat scratch.
bool AMDGPUDAGToDAGISel::SelectScratchSVAddr(SDNode *N, SDValue Addr,
                                             SDValue &VAddr, SDValue &SAddr,
                                             SDValue &Offset,
                                             SDValue &CPol)  const {
  int64_t ImmOffset = 0;

  SDValue LHS, RHS;
  SDValue OrigAddr = Addr;
  if (isBaseWithConstantOffset64(Addr, LHS, RHS)) {
    int64_t COffsetVal = cast<ConstantSDNode>(Val&: RHS)->getSExtValue();
    const SIInstrInfo *TII = Subtarget->getInstrInfo();

    if (TII->isLegalFLATOffset(Offset: COffsetVal, AddrSpace: AMDGPUAS::PRIVATE_ADDRESS,
                               FlatVariant: SIInstrFlags::FlatScratch)) {
      // Encodable immediate: strip it and keep matching the base.
      Addr = LHS;
      ImmOffset = COffsetVal;
    } else if (!LHS->isDivergent() && COffsetVal > 0) {
      SDLoc SL(N);
      // saddr + large_offset -> saddr + (vaddr = large_offset & ~MaxOffset) +
      //                         (large_offset & MaxOffset);
      int64_t SplitImmOffset, RemainderOffset;
      std::tie(args&: SplitImmOffset, args&: RemainderOffset) = TII->splitFlatOffset(
          COffsetVal, AddrSpace: AMDGPUAS::PRIVATE_ADDRESS, FlatVariant: SIInstrFlags::FlatScratch);

      if (isUInt<32>(x: RemainderOffset)) {
        SDNode *VMov = CurDAG->getMachineNode(
            Opcode: AMDGPU::V_MOV_B32_e32, dl: SL, VT: MVT::i32,
            Op1: CurDAG->getTargetConstant(Val: RemainderOffset, DL: SDLoc(), VT: MVT::i32));
        VAddr = SDValue(VMov, 0);
        SAddr = LHS;
        if (!isFlatScratchBaseLegal(Addr))
          return false;
        if (checkFlatScratchSVSSwizzleBug(VAddr, SAddr, ImmOffset: SplitImmOffset))
          return false;
        Offset = CurDAG->getTargetConstant(Val: SplitImmOffset, DL: SDLoc(), VT: MVT::i32);
        CPol = CurDAG->getTargetConstant(Val: 0, DL: SDLoc(), VT: MVT::i32);
        return true;
      }
    }
  }

  // The remaining base must be a plain add of a scalar and a vector part.
  if (Addr.getOpcode() != ISD::ADD)
    return false;

  LHS = Addr.getOperand(i: 0);
  RHS = Addr.getOperand(i: 1);

  // Assign the uniform operand to SAddr and the divergent one to VAddr; if
  // both or neither are divergent, this form does not apply.
  if (!LHS->isDivergent() && RHS->isDivergent()) {
    SAddr = LHS;
    VAddr = RHS;
  } else if (!RHS->isDivergent() && LHS->isDivergent()) {
    SAddr = RHS;
    VAddr = LHS;
  } else {
    return false;
  }

  // Legality of the base depends on whether an immediate was peeled off.
  if (OrigAddr != Addr) {
    if (!isFlatScratchBaseLegalSVImm(Addr: OrigAddr))
      return false;
  } else {
    if (!isFlatScratchBaseLegalSV(Addr: OrigAddr))
      return false;
  }

  if (checkFlatScratchSVSSwizzleBug(VAddr, SAddr, ImmOffset))
    return false;
  SAddr = SelectSAddrFI(CurDAG, SAddr);
  Offset = CurDAG->getSignedTargetConstant(Val: ImmOffset, DL: SDLoc(), VT: MVT::i32);

  bool ScaleOffset = SelectScaleOffset(N, Offset&: VAddr, IsSigned: true /* IsSigned */);
  CPol = CurDAG->getTargetConstant(Val: ScaleOffset ? AMDGPU::CPol::SCAL : 0,
                                   DL: SDLoc(), VT: MVT::i32);
  return true;
}
2368
2369// For unbuffered smem loads, it is illegal for the Immediate Offset to be
2370// negative if the resulting (Offset + (M0 or SOffset or zero) is negative.
2371// Handle the case where the Immediate Offset + SOffset is negative.
2372bool AMDGPUDAGToDAGISel::isSOffsetLegalWithImmOffset(SDValue *SOffset,
2373 bool Imm32Only,
2374 bool IsBuffer,
2375 int64_t ImmOffset) const {
2376 if (!IsBuffer && !Imm32Only && ImmOffset < 0 &&
2377 AMDGPU::hasSMRDSignedImmOffset(ST: *Subtarget)) {
2378 KnownBits SKnown = CurDAG->computeKnownBits(Op: *SOffset);
2379 if (ImmOffset + SKnown.getMinValue().getSExtValue() < 0)
2380 return false;
2381 }
2382
2383 return true;
2384}
2385
// Given \p Offset and load node \p N check if an \p Offset is a multiple of
// the load byte size. If it is update \p Offset to a pre-scaled value and
// return true.
bool AMDGPUDAGToDAGISel::SelectScaleOffset(SDNode *N, SDValue &Offset,
                                           bool IsSigned) const {
  bool ScaleOffset = false;
  if (!Subtarget->hasScaleOffset() || !Offset)
    return false;

  // Byte size of the accessed memory — the factor the hardware scales by.
  unsigned Size =
      (unsigned)cast<MemSDNode>(Val: N)->getMemoryVT().getFixedSizeInBits() / 8;

  // Look through a 32->64-bit extension when matching the shift form.
  SDValue Off = Offset;
  if (SDValue Ext = matchExtFromI32orI32(Op: Offset, IsSigned, DAG: CurDAG))
    Off = Ext;

  // Recognize either shl x, log2(Size) or one of the multiply forms
  // mul/mul_i24/mul_u24/s_mul_*_pseudo x, Size.
  if (isPowerOf2_32(Value: Size) && Off.getOpcode() == ISD::SHL) {
    if (auto *C = dyn_cast<ConstantSDNode>(Val: Off.getOperand(i: 1)))
      ScaleOffset = C->getZExtValue() == Log2_32(Value: Size);
  } else if (Offset.getOpcode() == ISD::MUL ||
             (IsSigned && Offset.getOpcode() == AMDGPUISD::MUL_I24) ||
             Offset.getOpcode() == AMDGPUISD::MUL_U24 ||
             (Offset.isMachineOpcode() &&
              Offset.getMachineOpcode() ==
                  (IsSigned ? AMDGPU::S_MUL_I64_I32_PSEUDO
                            : AMDGPU::S_MUL_U64_U32_PSEUDO))) {
    if (auto *C = dyn_cast<ConstantSDNode>(Val: Offset.getOperand(i: 1)))
      ScaleOffset = C->getZExtValue() == Size;
  }

  // Strip the scaling operation so the hardware applies it instead.
  if (ScaleOffset)
    Offset = Off.getOperand(i: 0);

  return ScaleOffset;
}
2421
// Match an immediate (if Offset is not null) or an SGPR (if SOffset is
// not null) offset. If Imm32Only is true, match only 32-bit immediate
// offsets available on CI.
bool AMDGPUDAGToDAGISel::SelectSMRDOffset(SDNode *N, SDValue ByteOffsetNode,
                                          SDValue *SOffset, SDValue *Offset,
                                          bool Imm32Only, bool IsBuffer,
                                          bool HasSOffset, int64_t ImmOffset,
                                          bool *ScaleOffset) const {
  assert((!SOffset || !Offset) &&
         "Cannot match both soffset and offset at the same time!");

  if (ScaleOffset) {
    assert(N && SOffset);

    *ScaleOffset = SelectScaleOffset(N, Offset&: ByteOffsetNode, IsSigned: false /* IsSigned */);
  }

  ConstantSDNode *C = dyn_cast<ConstantSDNode>(Val&: ByteOffsetNode);
  if (!C) {
    // Non-constant offsets can only be matched as an SGPR offset.
    if (!SOffset)
      return false;

    if (ByteOffsetNode.getValueType().isScalarInteger() &&
        ByteOffsetNode.getValueType().getSizeInBits() == 32) {
      *SOffset = ByteOffsetNode;
      return isSOffsetLegalWithImmOffset(SOffset, Imm32Only, IsBuffer,
                                         ImmOffset);
    }
    // Look through a zero-extend of a 32-bit value.
    if (ByteOffsetNode.getOpcode() == ISD::ZERO_EXTEND) {
      if (ByteOffsetNode.getOperand(i: 0).getValueType().getSizeInBits() == 32) {
        *SOffset = ByteOffsetNode.getOperand(i: 0);
        return isSOffsetLegalWithImmOffset(SOffset, Imm32Only, IsBuffer,
                                           ImmOffset);
      }
    }
    return false;
  }

  SDLoc SL(ByteOffsetNode);

  // GFX9 and GFX10 have signed byte immediate offsets. The immediate
  // offset for S_BUFFER instructions is unsigned.
  int64_t ByteOffset = IsBuffer ? C->getZExtValue() : C->getSExtValue();
  std::optional<int64_t> EncodedOffset = AMDGPU::getSMRDEncodedOffset(
      ST: *Subtarget, ByteOffset, IsBuffer, HasSOffset);
  if (EncodedOffset && Offset && !Imm32Only) {
    *Offset = CurDAG->getSignedTargetConstant(Val: *EncodedOffset, DL: SL, VT: MVT::i32);
    return true;
  }

  // SGPR and literal offsets are unsigned.
  if (ByteOffset < 0)
    return false;

  // CI-only 32-bit literal offset encoding.
  EncodedOffset = AMDGPU::getSMRDEncodedLiteralOffset32(ST: *Subtarget, ByteOffset);
  if (EncodedOffset && Offset && Imm32Only) {
    *Offset = CurDAG->getTargetConstant(Val: *EncodedOffset, DL: SL, VT: MVT::i32);
    return true;
  }

  if (!isUInt<32>(x: ByteOffset) && !isInt<32>(x: ByteOffset))
    return false;

  // As a last resort, materialize the constant into an SGPR and use it as
  // the scalar offset.
  if (SOffset) {
    SDValue C32Bit = CurDAG->getTargetConstant(Val: ByteOffset, DL: SL, VT: MVT::i32);
    *SOffset = SDValue(
        CurDAG->getMachineNode(Opcode: AMDGPU::S_MOV_B32, dl: SL, VT: MVT::i32, Op1: C32Bit), 0);
    return true;
  }

  return false;
}
2494
2495SDValue AMDGPUDAGToDAGISel::Expand32BitAddress(SDValue Addr) const {
2496 if (Addr.getValueType() != MVT::i32)
2497 return Addr;
2498
2499 // Zero-extend a 32-bit address.
2500 SDLoc SL(Addr);
2501
2502 const MachineFunction &MF = CurDAG->getMachineFunction();
2503 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
2504 unsigned AddrHiVal = Info->get32BitAddressHighBits();
2505 SDValue AddrHi = CurDAG->getTargetConstant(Val: AddrHiVal, DL: SL, VT: MVT::i32);
2506
2507 const SDValue Ops[] = {
2508 CurDAG->getTargetConstant(Val: AMDGPU::SReg_64_XEXECRegClassID, DL: SL, VT: MVT::i32),
2509 Addr,
2510 CurDAG->getTargetConstant(Val: AMDGPU::sub0, DL: SL, VT: MVT::i32),
2511 SDValue(CurDAG->getMachineNode(Opcode: AMDGPU::S_MOV_B32, dl: SL, VT: MVT::i32, Op1: AddrHi),
2512 0),
2513 CurDAG->getTargetConstant(Val: AMDGPU::sub1, DL: SL, VT: MVT::i32),
2514 };
2515
2516 return SDValue(CurDAG->getMachineNode(Opcode: AMDGPU::REG_SEQUENCE, dl: SL, VT: MVT::i64,
2517 Ops), 0);
2518}
2519
// Match a base and an immediate (if Offset is not null) or an SGPR (if
// SOffset is not null) or an immediate+SGPR offset. If Imm32Only is
// true, match only 32-bit immediate offsets available on CI.
bool AMDGPUDAGToDAGISel::SelectSMRDBaseOffset(SDNode *N, SDValue Addr,
                                              SDValue &SBase, SDValue *SOffset,
                                              SDValue *Offset, bool Imm32Only,
                                              bool IsBuffer, bool HasSOffset,
                                              int64_t ImmOffset,
                                              bool *ScaleOffset) const {
  if (SOffset && Offset) {
    assert(!Imm32Only && !IsBuffer);
    SDValue B;

    // Combined imm+SGPR form: first peel the immediate off, then match the
    // SGPR offset on the remaining base.
    if (!SelectSMRDBaseOffset(N, Addr, SBase&: B, SOffset: nullptr, Offset, Imm32Only: false, IsBuffer: false, HasSOffset: true))
      return false;

    int64_t ImmOff = 0;
    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Val&: *Offset))
      ImmOff = C->getSExtValue();

    return SelectSMRDBaseOffset(N, Addr: B, SBase, SOffset, Offset: nullptr, Imm32Only: false, IsBuffer: false,
                                HasSOffset: true, ImmOffset: ImmOff, ScaleOffset);
  }

  // A 32-bit (address + offset) should not cause unsigned 32-bit integer
  // wraparound, because s_load instructions perform the addition in 64 bits.
  if (Addr.getValueType() == MVT::i32 && Addr.getOpcode() == ISD::ADD &&
      !Addr->getFlags().hasNoUnsignedWrap())
    return false;

  SDValue N0, N1;
  // Extract the base and offset if possible.
  if (Addr->isAnyAdd() || CurDAG->isADDLike(Op: Addr)) {
    N0 = Addr.getOperand(i: 0);
    N1 = Addr.getOperand(i: 1);
  } else if (getBaseWithOffsetUsingSplitOR(DAG&: *CurDAG, Addr, N0, N1)) {
    assert(N0 && N1 && isa<ConstantSDNode>(N1));
  }
  if (!N0 || !N1)
    return false;

  // Addition is commutative: try either operand as the offset.
  if (SelectSMRDOffset(N, ByteOffsetNode: N1, SOffset, Offset, Imm32Only, IsBuffer, HasSOffset,
                       ImmOffset, ScaleOffset)) {
    SBase = N0;
    return true;
  }
  if (SelectSMRDOffset(N, ByteOffsetNode: N0, SOffset, Offset, Imm32Only, IsBuffer, HasSOffset,
                       ImmOffset, ScaleOffset)) {
    SBase = N1;
    return true;
  }
  return false;
}
2573
/// Match an SMRD address, widening a 32-bit base to the 64 bits the
/// instruction requires.
///
/// First tries a full base+offset decomposition; failing that, a bare 32-bit
/// address is accepted with a zero immediate offset when an immediate form
/// was requested.
bool AMDGPUDAGToDAGISel::SelectSMRD(SDNode *N, SDValue Addr, SDValue &SBase,
                                    SDValue *SOffset, SDValue *Offset,
                                    bool Imm32Only, bool *ScaleOffset) const {
  if (SelectSMRDBaseOffset(N, Addr, SBase, SOffset, Offset, Imm32Only,
                           /* IsBuffer */ false, /* HasSOffset */ false,
                           /* ImmOffset */ 0, ScaleOffset)) {
    // The matched base may still be 32-bit; zero-extend it to 64 bits.
    SBase = Expand32BitAddress(Addr: SBase);
    return true;
  }

  // Fallback: use the whole 32-bit address as the base with offset 0.
  if (Addr.getValueType() == MVT::i32 && Offset && !SOffset) {
    SBase = Expand32BitAddress(Addr);
    *Offset = CurDAG->getTargetConstant(Val: 0, DL: SDLoc(Addr), VT: MVT::i32);
    return true;
  }

  return false;
}
2592
2593bool AMDGPUDAGToDAGISel::SelectSMRDImm(SDValue Addr, SDValue &SBase,
2594 SDValue &Offset) const {
2595 return SelectSMRD(/* N */ nullptr, Addr, SBase, /* SOffset */ nullptr,
2596 Offset: &Offset);
2597}
2598
2599bool AMDGPUDAGToDAGISel::SelectSMRDImm32(SDValue Addr, SDValue &SBase,
2600 SDValue &Offset) const {
2601 assert(Subtarget->getGeneration() == AMDGPUSubtarget::SEA_ISLANDS);
2602 return SelectSMRD(/* N */ nullptr, Addr, SBase, /* SOffset */ nullptr,
2603 Offset: &Offset, /* Imm32Only */ true);
2604}
2605
2606bool AMDGPUDAGToDAGISel::SelectSMRDSgpr(SDNode *N, SDValue Addr, SDValue &SBase,
2607 SDValue &SOffset, SDValue &CPol) const {
2608 bool ScaleOffset;
2609 if (!SelectSMRD(N, Addr, SBase, SOffset: &SOffset, /* Offset */ nullptr,
2610 /* Imm32Only */ false, ScaleOffset: &ScaleOffset))
2611 return false;
2612
2613 CPol = CurDAG->getTargetConstant(Val: ScaleOffset ? AMDGPU::CPol::SCAL : 0,
2614 DL: SDLoc(N), VT: MVT::i32);
2615 return true;
2616}
2617
2618bool AMDGPUDAGToDAGISel::SelectSMRDSgprImm(SDNode *N, SDValue Addr,
2619 SDValue &SBase, SDValue &SOffset,
2620 SDValue &Offset,
2621 SDValue &CPol) const {
2622 bool ScaleOffset;
2623 if (!SelectSMRD(N, Addr, SBase, SOffset: &SOffset, Offset: &Offset, Imm32Only: false, ScaleOffset: &ScaleOffset))
2624 return false;
2625
2626 CPol = CurDAG->getTargetConstant(Val: ScaleOffset ? AMDGPU::CPol::SCAL : 0,
2627 DL: SDLoc(N), VT: MVT::i32);
2628 return true;
2629}
2630
2631bool AMDGPUDAGToDAGISel::SelectSMRDBufferImm(SDValue N, SDValue &Offset) const {
2632 return SelectSMRDOffset(/* N */ nullptr, ByteOffsetNode: N, /* SOffset */ nullptr, Offset: &Offset,
2633 /* Imm32Only */ false, /* IsBuffer */ true);
2634}
2635
2636bool AMDGPUDAGToDAGISel::SelectSMRDBufferImm32(SDValue N,
2637 SDValue &Offset) const {
2638 assert(Subtarget->getGeneration() == AMDGPUSubtarget::SEA_ISLANDS);
2639 return SelectSMRDOffset(/* N */ nullptr, ByteOffsetNode: N, /* SOffset */ nullptr, Offset: &Offset,
2640 /* Imm32Only */ true, /* IsBuffer */ true);
2641}
2642
2643bool AMDGPUDAGToDAGISel::SelectSMRDBufferSgprImm(SDValue N, SDValue &SOffset,
2644 SDValue &Offset) const {
2645 // Match the (soffset + offset) pair as a 32-bit register base and
2646 // an immediate offset.
2647 return N.getValueType() == MVT::i32 &&
2648 SelectSMRDBaseOffset(/* N */ nullptr, Addr: N, /* SBase */ SOffset,
2649 /* SOffset*/ nullptr, Offset: &Offset,
2650 /* Imm32Only */ false, /* IsBuffer */ true);
2651}
2652
/// Split an index expression into a base register and a constant offset for
/// MOVREL (indirect indexing) instructions.
///
/// \param Index  The index expression.
/// \param Base   [out] Variable base part.
/// \param Offset [out] Constant offset (0 if none was peeled off).
/// \return false only when the index is a plain constant (handled elsewhere).
bool AMDGPUDAGToDAGISel::SelectMOVRELOffset(SDValue Index,
                                            SDValue &Base,
                                            SDValue &Offset) const {
  SDLoc DL(Index);

  if (CurDAG->isBaseWithConstantOffset(Op: Index)) {
    SDValue N0 = Index.getOperand(i: 0);
    SDValue N1 = Index.getOperand(i: 1);
    ConstantSDNode *C1 = cast<ConstantSDNode>(Val&: N1);

    // (add n0, c0)
    // Don't peel off the offset (c0) if doing so could possibly lead
    // the base (n0) to be negative.
    // (or n0, |c0|) can never change a sign given isBaseWithConstantOffset.
    if (C1->getSExtValue() <= 0 || CurDAG->SignBitIsZero(Op: N0) ||
        (Index->getOpcode() == ISD::OR && C1->getSExtValue() >= 0)) {
      Base = N0;
      Offset = CurDAG->getTargetConstant(Val: C1->getZExtValue(), DL, VT: MVT::i32);
      return true;
    }
  }

  // A fully-constant index has no variable base to return.
  if (isa<ConstantSDNode>(Val: Index))
    return false;

  // No constant part could be peeled: use the whole index with offset 0.
  Base = Index;
  Offset = CurDAG->getTargetConstant(Val: 0, DL, VT: MVT::i32);
  return true;
}
2682
2683SDNode *AMDGPUDAGToDAGISel::getBFE32(bool IsSigned, const SDLoc &DL,
2684 SDValue Val, uint32_t Offset,
2685 uint32_t Width) {
2686 if (Val->isDivergent()) {
2687 unsigned Opcode = IsSigned ? AMDGPU::V_BFE_I32_e64 : AMDGPU::V_BFE_U32_e64;
2688 SDValue Off = CurDAG->getTargetConstant(Val: Offset, DL, VT: MVT::i32);
2689 SDValue W = CurDAG->getTargetConstant(Val: Width, DL, VT: MVT::i32);
2690
2691 return CurDAG->getMachineNode(Opcode, dl: DL, VT: MVT::i32, Op1: Val, Op2: Off, Op3: W);
2692 }
2693 unsigned Opcode = IsSigned ? AMDGPU::S_BFE_I32 : AMDGPU::S_BFE_U32;
2694 // Transformation function, pack the offset and width of a BFE into
2695 // the format expected by the S_BFE_I32 / S_BFE_U32. In the second
2696 // source, bits [5:0] contain the offset and bits [22:16] the width.
2697 uint32_t PackedVal = Offset | (Width << 16);
2698 SDValue PackedConst = CurDAG->getTargetConstant(Val: PackedVal, DL, VT: MVT::i32);
2699
2700 return CurDAG->getMachineNode(Opcode, dl: DL, VT: MVT::i32, Op1: Val, Op2: PackedConst);
2701}
2702
/// Turn a shift-left/shift-right pair into a bit-field extract when the shift
/// amounts satisfy the BFE predicate; otherwise fall through to tablegen.
void AMDGPUDAGToDAGISel::SelectS_BFEFromShifts(SDNode *N) {
  // "(a << b) srl c)" ---> "BFE_U32 a, (c-b), (32-c)
  // "(a << b) sra c)" ---> "BFE_I32 a, (c-b), (32-c)
  // Predicate: 0 < b <= c < 32

  const SDValue &Shl = N->getOperand(Num: 0);
  ConstantSDNode *B = dyn_cast<ConstantSDNode>(Val: Shl->getOperand(Num: 1));
  ConstantSDNode *C = dyn_cast<ConstantSDNode>(Val: N->getOperand(Num: 1));

  // Both shift amounts must be constants for the transform to apply.
  if (B && C) {
    uint32_t BVal = B->getZExtValue();
    uint32_t CVal = C->getZExtValue();

    if (0 < BVal && BVal <= CVal && CVal < 32) {
      // SRA produces a signed extract; SRL an unsigned one.
      bool Signed = N->getOpcode() == ISD::SRA;
      ReplaceNode(F: N, T: getBFE32(IsSigned: Signed, DL: SDLoc(N), Val: Shl.getOperand(i: 0), Offset: CVal - BVal,
                              Width: 32 - CVal));
      return;
    }
  }
  // Predicate not met: let the generated matcher handle it.
  SelectCode(N);
}
2725
/// Custom-select several shift/mask idioms as bit-field extracts (BFE).
/// Falls through to the generated matcher when no idiom applies.
void AMDGPUDAGToDAGISel::SelectS_BFE(SDNode *N) {
  switch (N->getOpcode()) {
  case ISD::AND:
    if (N->getOperand(Num: 0).getOpcode() == ISD::SRL) {
      // "(a srl b) & mask" ---> "BFE_U32 a, b, popcount(mask)"
      // Predicate: isMask(mask)
      const SDValue &Srl = N->getOperand(Num: 0);
      ConstantSDNode *Shift = dyn_cast<ConstantSDNode>(Val: Srl.getOperand(i: 1));
      ConstantSDNode *Mask = dyn_cast<ConstantSDNode>(Val: N->getOperand(Num: 1));

      if (Shift && Mask) {
        uint32_t ShiftVal = Shift->getZExtValue();
        uint32_t MaskVal = Mask->getZExtValue();

        // Only a contiguous low mask maps onto an extract width.
        if (isMask_32(Value: MaskVal)) {
          uint32_t WidthVal = llvm::popcount(Value: MaskVal);
          ReplaceNode(F: N, T: getBFE32(IsSigned: false, DL: SDLoc(N), Val: Srl.getOperand(i: 0), Offset: ShiftVal,
                                  Width: WidthVal));
          return;
        }
      }
    }
    break;
  case ISD::SRL:
    if (N->getOperand(Num: 0).getOpcode() == ISD::AND) {
      // "(a & mask) srl b)" ---> "BFE_U32 a, b, popcount(mask >> b)"
      // Predicate: isMask(mask >> b)
      const SDValue &And = N->getOperand(Num: 0);
      ConstantSDNode *Shift = dyn_cast<ConstantSDNode>(Val: N->getOperand(Num: 1));
      ConstantSDNode *Mask = dyn_cast<ConstantSDNode>(Val: And->getOperand(Num: 1));

      if (Shift && Mask) {
        uint32_t ShiftVal = Shift->getZExtValue();
        // Shift the mask down so the width check sees the surviving bits.
        uint32_t MaskVal = Mask->getZExtValue() >> ShiftVal;

        if (isMask_32(Value: MaskVal)) {
          uint32_t WidthVal = llvm::popcount(Value: MaskVal);
          ReplaceNode(F: N, T: getBFE32(IsSigned: false, DL: SDLoc(N), Val: And.getOperand(i: 0), Offset: ShiftVal,
                                  Width: WidthVal));
          return;
        }
      }
    } else if (N->getOperand(Num: 0).getOpcode() == ISD::SHL) {
      // (shl, srl) pair: handled by the shifts helper.
      SelectS_BFEFromShifts(N);
      return;
    }
    break;
  case ISD::SRA:
    if (N->getOperand(Num: 0).getOpcode() == ISD::SHL) {
      // (shl, sra) pair: signed variant of the shifts helper.
      SelectS_BFEFromShifts(N);
      return;
    }
    break;

  case ISD::SIGN_EXTEND_INREG: {
    // sext_inreg (srl x, 16), i8 -> bfe_i32 x, 16, 8
    SDValue Src = N->getOperand(Num: 0);
    if (Src.getOpcode() != ISD::SRL)
      break;

    // The shift amount must be a constant to become the extract offset.
    const ConstantSDNode *Amt = dyn_cast<ConstantSDNode>(Val: Src.getOperand(i: 1));
    if (!Amt)
      break;

    // The sext_inreg type gives the extract width.
    unsigned Width = cast<VTSDNode>(Val: N->getOperand(Num: 1))->getVT().getSizeInBits();
    ReplaceNode(F: N, T: getBFE32(IsSigned: true, DL: SDLoc(N), Val: Src.getOperand(i: 0),
                            Offset: Amt->getZExtValue(), Width));
    return;
  }
  }

  // No BFE idiom matched: use the generated matcher.
  SelectCode(N);
}
2799
2800bool AMDGPUDAGToDAGISel::isCBranchSCC(const SDNode *N) const {
2801 assert(N->getOpcode() == ISD::BRCOND);
2802 if (!N->hasOneUse())
2803 return false;
2804
2805 SDValue Cond = N->getOperand(Num: 1);
2806 if (Cond.getOpcode() == ISD::CopyToReg)
2807 Cond = Cond.getOperand(i: 2);
2808
2809 if (Cond.getOpcode() != ISD::SETCC || !Cond.hasOneUse())
2810 return false;
2811
2812 MVT VT = Cond.getOperand(i: 0).getSimpleValueType();
2813 if (VT == MVT::i32)
2814 return true;
2815
2816 if (VT == MVT::i64) {
2817 ISD::CondCode CC = cast<CondCodeSDNode>(Val: Cond.getOperand(i: 2))->get();
2818 return (CC == ISD::SETEQ || CC == ISD::SETNE) &&
2819 Subtarget->hasScalarCompareEq64();
2820 }
2821
2822 if ((VT == MVT::f16 || VT == MVT::f32) && Subtarget->hasSALUFloatInsts())
2823 return true;
2824
2825 return false;
2826}
2827
/// Try to replace an amdgcn.ballot-style AMDGPUISD::SETCC with the underlying
/// i1 condition it tests.
///
/// \param VCMP   The AMDGPUISD::SETCC node.
/// \param Negate [out] Set to true when the matched compare was SETEQ, meaning
///               the caller must invert the branch sense.
/// \return The i1 condition when the pattern matches, otherwise an empty
///         SDValue.
static SDValue combineBallotPattern(SDValue VCMP, bool &Negate) {
  assert(VCMP->getOpcode() == AMDGPUISD::SETCC);
  // Special case for amdgcn.ballot:
  // %Cond = i1 (and/or combination of i1 ISD::SETCCs)
  // %VCMP = i(WaveSize) AMDGPUISD::SETCC (ext %Cond), 0, setne/seteq
  // =>
  // Use i1 %Cond value instead of i(WaveSize) %VCMP.
  // This is possible because divergent ISD::SETCC is selected as V_CMP and
  // Cond becomes a i(WaveSize) full mask value.
  // Note that ballot doesn't use SETEQ condition but its easy to support it
  // here for completeness, so in this case Negate is set true on return.
  auto VCMP_CC = cast<CondCodeSDNode>(Val: VCMP.getOperand(i: 2))->get();
  if ((VCMP_CC == ISD::SETEQ || VCMP_CC == ISD::SETNE) &&
      isNullConstant(V: VCMP.getOperand(i: 1))) {

    auto Cond = VCMP.getOperand(i: 0);
    if (ISD::isExtOpcode(Opcode: Cond->getOpcode())) // Skip extension.
      Cond = Cond.getOperand(i: 0);

    // Only safe when the condition is known to be a wave-wide boolean SGPR.
    if (isBoolSGPR(V: Cond)) {
      Negate = VCMP_CC == ISD::SETEQ;
      return Cond;
    }
  }
  return SDValue();
}
2854
/// Select BRCOND into an SCC- or VCC-based conditional branch, including the
/// special handling needed for amdgcn.ballot feedback patterns.
void AMDGPUDAGToDAGISel::SelectBRCOND(SDNode *N) {
  SDValue Cond = N->getOperand(Num: 1);

  // An undef condition selects a pseudo whose behavior is unconstrained.
  if (Cond.isUndef()) {
    CurDAG->SelectNodeTo(N, MachineOpc: AMDGPU::SI_BR_UNDEF, VT: MVT::Other,
                         Op1: N->getOperand(Num: 2), Op2: N->getOperand(Num: 0));
    return;
  }

  const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();

  // Prefer the scalar branch only when both the compare and the branch
  // itself are uniform.
  bool UseSCCBr = isCBranchSCC(N) && isUniformBr(N);
  bool AndExec = !UseSCCBr;
  bool Negate = false;

  if (Cond.getOpcode() == ISD::SETCC &&
      Cond->getOperand(Num: 0)->getOpcode() == AMDGPUISD::SETCC) {
    SDValue VCMP = Cond->getOperand(Num: 0);
    auto CC = cast<CondCodeSDNode>(Val: Cond->getOperand(Num: 2))->get();
    if ((CC == ISD::SETEQ || CC == ISD::SETNE) &&
        isNullConstant(V: Cond->getOperand(Num: 1)) &&
        // We may encounter ballot.i64 in wave32 mode on -O0.
        VCMP.getValueType().getSizeInBits() == Subtarget->getWavefrontSize()) {
      // %VCMP = i(WaveSize) AMDGPUISD::SETCC ...
      // %C = i1 ISD::SETCC %VCMP, 0, setne/seteq
      // BRCOND i1 %C, %BB
      // =>
      // %VCMP = i(WaveSize) AMDGPUISD::SETCC ...
      // VCC = COPY i(WaveSize) %VCMP
      // S_CBRANCH_VCCNZ/VCCZ %BB
      Negate = CC == ISD::SETEQ;
      bool NegatedBallot = false;
      if (auto BallotCond = combineBallotPattern(VCMP, Negate&: NegatedBallot)) {
        Cond = BallotCond;
        UseSCCBr = !BallotCond->isDivergent();
        // Two independent negations may stack; XOR combines them.
        Negate = Negate ^ NegatedBallot;
      } else {
        // TODO: don't use SCC here assuming that AMDGPUISD::SETCC is always
        // selected as V_CMP, but this may change for uniform condition.
        Cond = VCMP;
        UseSCCBr = false;
      }
    }
    // Cond is either V_CMP resulted from AMDGPUISD::SETCC or a combination of
    // V_CMPs resulted from ballot or ballot has uniform condition and SCC is
    // used.
    AndExec = false;
  }

  unsigned BrOp =
      UseSCCBr ? (Negate ? AMDGPU::S_CBRANCH_SCC0 : AMDGPU::S_CBRANCH_SCC1)
               : (Negate ? AMDGPU::S_CBRANCH_VCCZ : AMDGPU::S_CBRANCH_VCCNZ);
  Register CondReg = UseSCCBr ? AMDGPU::SCC : TRI->getVCC();
  SDLoc SL(N);

  if (AndExec) {
    // This is the case that we are selecting to S_CBRANCH_VCCNZ. We have not
    // analyzed what generates the vcc value, so we do not know whether vcc
    // bits for disabled lanes are 0. Thus we need to mask out bits for
    // disabled lanes.
    //
    // For the case that we select S_CBRANCH_SCC1 and it gets
    // changed to S_CBRANCH_VCCNZ in SIFixSGPRCopies, SIFixSGPRCopies calls
    // SIInstrInfo::moveToVALU which inserts the S_AND).
    //
    // We could add an analysis of what generates the vcc value here and omit
    // the S_AND when is unnecessary. But it would be better to add a separate
    // pass after SIFixSGPRCopies to do the unnecessary S_AND removal, so it
    // catches both cases.
    Cond = SDValue(
        CurDAG->getMachineNode(
            Opcode: Subtarget->isWave32() ? AMDGPU::S_AND_B32 : AMDGPU::S_AND_B64, dl: SL,
            VT: MVT::i1,
            Op1: CurDAG->getRegister(Reg: Subtarget->isWave32() ? AMDGPU::EXEC_LO
                                                         : AMDGPU::EXEC,
                                 VT: MVT::i1),
            Op2: Cond),
        0);
  }

  // Copy the condition into SCC/VCC, then branch on that register.
  SDValue VCC = CurDAG->getCopyToReg(Chain: N->getOperand(Num: 0), dl: SL, Reg: CondReg, N: Cond);
  CurDAG->SelectNodeTo(N, MachineOpc: BrOp, VT: MVT::Other,
                       Op1: N->getOperand(Num: 2), // Basic Block
                       Op2: VCC.getValue(R: 0));
}
2940
/// Select a uniform f16->f32 extend of a dword's high half directly as
/// S_CVT_HI_F32_F16 when the subtarget has scalar float instructions;
/// otherwise defer to the generated matcher.
void AMDGPUDAGToDAGISel::SelectFP_EXTEND(SDNode *N) {
  if (Subtarget->hasSALUFloatInsts() && N->getValueType(ResNo: 0) == MVT::f32 &&
      !N->isDivergent()) {
    SDValue Src = N->getOperand(Num: 0);
    if (Src.getValueType() == MVT::f16) {
      // If the source is the high 16 bits of a dword, isExtractHiElt rewrites
      // Src to the containing 32-bit value in place.
      if (isExtractHiElt(In: Src, Out&: Src)) {
        CurDAG->SelectNodeTo(N, MachineOpc: AMDGPU::S_CVT_HI_F32_F16, VTs: N->getVTList(),
                             Ops: {Src});
        return;
      }
    }
  }

  SelectCode(N);
}
2956
/// Select ds_append/ds_consume intrinsics, splitting the pointer into an M0
/// base (copied via glue) and an immediate instruction offset.
void AMDGPUDAGToDAGISel::SelectDSAppendConsume(SDNode *N, unsigned IntrID) {
  // The address is assumed to be uniform, so if it ends up in a VGPR, it will
  // be copied to an SGPR with readfirstlane.
  unsigned Opc = IntrID == Intrinsic::amdgcn_ds_append ?
    AMDGPU::DS_APPEND : AMDGPU::DS_CONSUME;

  SDValue Chain = N->getOperand(Num: 0);
  SDValue Ptr = N->getOperand(Num: 2);
  MemIntrinsicSDNode *M = cast<MemIntrinsicSDNode>(Val: N);
  MachineMemOperand *MMO = M->getMemOperand();
  // GDS (region address space) accesses set the instruction's gds bit.
  bool IsGDS = M->getAddressSpace() == AMDGPUAS::REGION_ADDRESS;

  SDValue Offset;
  // Try to fold a constant part of the pointer into the offset field,
  // leaving only the base in M0.
  if (CurDAG->isBaseWithConstantOffset(Op: Ptr)) {
    SDValue PtrBase = Ptr.getOperand(i: 0);
    SDValue PtrOffset = Ptr.getOperand(i: 1);

    const APInt &OffsetVal = PtrOffset->getAsAPIntVal();
    if (isDSOffsetLegal(Base: PtrBase, Offset: OffsetVal.getZExtValue())) {
      N = glueCopyToM0(N, Val: PtrBase);
      Offset = CurDAG->getTargetConstant(Val: OffsetVal, DL: SDLoc(), VT: MVT::i32);
    }
  }

  // No foldable offset: the whole pointer goes into M0, offset is 0.
  if (!Offset) {
    N = glueCopyToM0(N, Val: Ptr);
    Offset = CurDAG->getTargetConstant(Val: 0, DL: SDLoc(), VT: MVT::i32);
  }

  SDValue Ops[] = {
    Offset,
    CurDAG->getTargetConstant(Val: IsGDS, DL: SDLoc(), VT: MVT::i32),
    Chain,
    N->getOperand(Num: N->getNumOperands() - 1) // New glue
  };

  SDNode *Selected = CurDAG->SelectNodeTo(N, MachineOpc: Opc, VTs: N->getVTList(), Ops);
  // Preserve the memory operand so later passes see the correct aliasing info.
  CurDAG->setNodeMemRefs(N: cast<MachineSDNode>(Val: Selected), NewMemRefs: {MMO});
}
2996
2997// We need to handle this here because tablegen doesn't support matching
2998// instructions with multiple outputs.
2999void AMDGPUDAGToDAGISel::SelectDSBvhStackIntrinsic(SDNode *N, unsigned IntrID) {
3000 unsigned Opc;
3001 switch (IntrID) {
3002 case Intrinsic::amdgcn_ds_bvh_stack_rtn:
3003 case Intrinsic::amdgcn_ds_bvh_stack_push4_pop1_rtn:
3004 Opc = AMDGPU::DS_BVH_STACK_RTN_B32;
3005 break;
3006 case Intrinsic::amdgcn_ds_bvh_stack_push8_pop1_rtn:
3007 Opc = AMDGPU::DS_BVH_STACK_PUSH8_POP1_RTN_B32;
3008 break;
3009 case Intrinsic::amdgcn_ds_bvh_stack_push8_pop2_rtn:
3010 Opc = AMDGPU::DS_BVH_STACK_PUSH8_POP2_RTN_B64;
3011 break;
3012 }
3013 SDValue Ops[] = {N->getOperand(Num: 2), N->getOperand(Num: 3), N->getOperand(Num: 4),
3014 N->getOperand(Num: 5), N->getOperand(Num: 0)};
3015
3016 MemIntrinsicSDNode *M = cast<MemIntrinsicSDNode>(Val: N);
3017 MachineMemOperand *MMO = M->getMemOperand();
3018 SDNode *Selected = CurDAG->SelectNodeTo(N, MachineOpc: Opc, VTs: N->getVTList(), Ops);
3019 CurDAG->setNodeMemRefs(N: cast<MachineSDNode>(Val: Selected), NewMemRefs: {MMO});
3020}
3021
/// Select tensor_load_to_lds / tensor_store_from_lds intrinsics, choosing the
/// two-group (_d2) instruction form when descriptor groups 2 and 3 are all
/// zeros.
void AMDGPUDAGToDAGISel::SelectTensorLoadStore(SDNode *N, unsigned IntrID) {
  bool IsLoad = IntrID == Intrinsic::amdgcn_tensor_load_to_lds;
  unsigned Opc =
      IsLoad ? AMDGPU::TENSOR_LOAD_TO_LDS_d4 : AMDGPU::TENSOR_STORE_FROM_LDS_d4;

  SmallVector<SDValue, 7> TensorOps;
  // First two groups
  TensorOps.push_back(Elt: N->getOperand(Num: 2)); // D# group 0
  TensorOps.push_back(Elt: N->getOperand(Num: 3)); // D# group 1

  // Use _D2 version if both group 2 and 3 are zero-initialized.
  SDValue Group2 = N->getOperand(Num: 4);
  SDValue Group3 = N->getOperand(Num: 5);
  if (ISD::isBuildVectorAllZeros(N: Group2.getNode()) &&
      ISD::isBuildVectorAllZeros(N: Group3.getNode())) {
    Opc = IsLoad ? AMDGPU::TENSOR_LOAD_TO_LDS_d2
                 : AMDGPU::TENSOR_STORE_FROM_LDS_d2;
  } else { // Has at least 4 groups
    TensorOps.push_back(Elt: Group2); // D# group 2
    TensorOps.push_back(Elt: Group3); // D# group 3
  }

  // TODO: Handle the fifth group: N->getOperand(6), which is silently ignored
  // for now because all existing targets only support up to 4 groups.
  TensorOps.push_back(Elt: CurDAG->getTargetConstant(Val: 0, DL: SDLoc(N), VT: MVT::i1)); // r128
  TensorOps.push_back(Elt: N->getOperand(Num: 7)); // cache policy
  TensorOps.push_back(Elt: N->getOperand(Num: 0)); // chain

  (void)CurDAG->SelectNodeTo(N, MachineOpc: Opc, VT: MVT::Other, Ops: TensorOps);
}
3052
3053static unsigned gwsIntrinToOpcode(unsigned IntrID) {
3054 switch (IntrID) {
3055 case Intrinsic::amdgcn_ds_gws_init:
3056 return AMDGPU::DS_GWS_INIT;
3057 case Intrinsic::amdgcn_ds_gws_barrier:
3058 return AMDGPU::DS_GWS_BARRIER;
3059 case Intrinsic::amdgcn_ds_gws_sema_v:
3060 return AMDGPU::DS_GWS_SEMA_V;
3061 case Intrinsic::amdgcn_ds_gws_sema_br:
3062 return AMDGPU::DS_GWS_SEMA_BR;
3063 case Intrinsic::amdgcn_ds_gws_sema_p:
3064 return AMDGPU::DS_GWS_SEMA_P;
3065 case Intrinsic::amdgcn_ds_gws_sema_release_all:
3066 return AMDGPU::DS_GWS_SEMA_RELEASE_ALL;
3067 default:
3068 llvm_unreachable("not a gws intrinsic");
3069 }
3070}
3071
/// Select the ds_gws_* intrinsics: split the offset into an M0 base and an
/// immediate field, and adjust the data operand's register class if needed.
void AMDGPUDAGToDAGISel::SelectDS_GWS(SDNode *N, unsigned IntrID) {
  // Reject when the subtarget lacks GWS (or sema_release_all support).
  if (!Subtarget->hasGWS() ||
      (IntrID == Intrinsic::amdgcn_ds_gws_sema_release_all &&
       !Subtarget->hasGWSSemaReleaseAll())) {
    // Let this error.
    SelectCode(N);
    return;
  }

  // Chain, intrinsic ID, vsrc, offset
  const bool HasVSrc = N->getNumOperands() == 4;
  assert(HasVSrc || N->getNumOperands() == 3);

  SDLoc SL(N);
  SDValue BaseOffset = N->getOperand(Num: HasVSrc ? 3 : 2);
  int ImmOffset = 0;
  MemIntrinsicSDNode *M = cast<MemIntrinsicSDNode>(Val: N);
  MachineMemOperand *MMO = M->getMemOperand();

  // Don't worry if the offset ends up in a VGPR. Only one lane will have
  // effect, so SIFixSGPRCopies will validly insert readfirstlane.

  // The resource id offset is computed as (<isa opaque base> + M0[21:16] +
  // offset field) % 64. Some versions of the programming guide omit the m0
  // part, or claim it's from offset 0.
  if (ConstantSDNode *ConstOffset = dyn_cast<ConstantSDNode>(Val&: BaseOffset)) {
    // If we have a constant offset, try to use the 0 in m0 as the base.
    // TODO: Look into changing the default m0 initialization value. If the
    // default -1 only set the low 16-bits, we could leave it as-is and add 1 to
    // the immediate offset.
    glueCopyToM0(N, Val: CurDAG->getTargetConstant(Val: 0, DL: SL, VT: MVT::i32));
    ImmOffset = ConstOffset->getZExtValue();
  } else {
    // Peel a constant addend off the variable offset into the immediate field.
    if (CurDAG->isBaseWithConstantOffset(Op: BaseOffset)) {
      ImmOffset = BaseOffset.getConstantOperandVal(i: 1);
      BaseOffset = BaseOffset.getOperand(i: 0);
    }

    // Prefer to do the shift in an SGPR since it should be possible to use m0
    // as the result directly. If it's already an SGPR, it will be eliminated
    // later.
    SDNode *SGPROffset
      = CurDAG->getMachineNode(Opcode: AMDGPU::V_READFIRSTLANE_B32, dl: SL, VT: MVT::i32,
                               Op1: BaseOffset);
    // Shift to offset in m0
    SDNode *M0Base
      = CurDAG->getMachineNode(Opcode: AMDGPU::S_LSHL_B32, dl: SL, VT: MVT::i32,
                               Op1: SDValue(SGPROffset, 0),
                               Op2: CurDAG->getTargetConstant(Val: 16, DL: SL, VT: MVT::i32));
    glueCopyToM0(N, Val: SDValue(M0Base, 0));
  }

  SDValue Chain = N->getOperand(Num: 0);
  SDValue OffsetField = CurDAG->getTargetConstant(Val: ImmOffset, DL: SL, VT: MVT::i32);

  const unsigned Opc = gwsIntrinToOpcode(IntrID);

  const MCInstrDesc &InstrDesc = TII->get(Opcode: Opc);
  int Data0Idx = AMDGPU::getNamedOperandIdx(Opcode: Opc, Name: AMDGPU::OpName::data0);

  // The register class the instruction expects for its data operand.
  const TargetRegisterClass *DataRC = TII->getRegClass(MCID: InstrDesc, OpNum: Data0Idx);

  SmallVector<SDValue, 5> Ops;
  if (HasVSrc) {
    const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();

    SDValue Data = N->getOperand(Num: 2);
    MVT DataVT = Data.getValueType().getSimpleVT();
    if (TRI->isTypeLegalForClass(RC: *DataRC, T: DataVT)) {
      // Normal 32-bit case.
      Ops.push_back(Elt: N->getOperand(Num: 2));
    } else {
      // Operand is really 32-bits, but requires 64-bit alignment, so use the
      // even aligned 64-bit register class.
      const SDValue RegSeqOps[] = {
          CurDAG->getTargetConstant(Val: DataRC->getID(), DL: SL, VT: MVT::i32), Data,
          CurDAG->getTargetConstant(Val: AMDGPU::sub0, DL: SL, VT: MVT::i32),
          SDValue(
              CurDAG->getMachineNode(Opcode: TargetOpcode::IMPLICIT_DEF, dl: SL, VT: MVT::i32),
              0),
          CurDAG->getTargetConstant(Val: AMDGPU::sub1, DL: SL, VT: MVT::i32)};

      Ops.push_back(Elt: SDValue(CurDAG->getMachineNode(Opcode: TargetOpcode::REG_SEQUENCE,
                                                    dl: SL, VT: MVT::v2i32, Ops: RegSeqOps),
                             0));
    }
  }

  Ops.push_back(Elt: OffsetField);
  Ops.push_back(Elt: Chain);

  SDNode *Selected = CurDAG->SelectNodeTo(N, MachineOpc: Opc, VTs: N->getVTList(), Ops);
  // Preserve the memory operand on the selected machine node.
  CurDAG->setNodeMemRefs(N: cast<MachineSDNode>(Val: Selected), NewMemRefs: {MMO});
}
3166
/// Select amdgcn.interp.p1.f16 for 16-bank-LDS subtargets, which need a
/// two-instruction sequence sharing M0 via explicit glue.
void AMDGPUDAGToDAGISel::SelectInterpP1F16(SDNode *N) {
  if (Subtarget->getLDSBankCount() != 16) {
    // This is a single instruction with a pattern.
    SelectCode(N);
    return;
  }

  SDLoc DL(N);

  // This requires 2 instructions. It is possible to write a pattern to support
  // this, but the generated isel emitter doesn't correctly deal with multiple
  // output instructions using the same physical register input. The copy to m0
  // is incorrectly placed before the second instruction.
  //
  // TODO: Match source modifiers.
  //
  // def : Pat <
  //   (int_amdgcn_interp_p1_f16
  //                             (VOP3Mods f32:$src0, i32:$src0_modifiers),
  //                                       (i32 timm:$attrchan), (i32 timm:$attr),
  //                                       (i1 timm:$high), M0),
  //   (V_INTERP_P1LV_F16 $src0_modifiers, VGPR_32:$src0, timm:$attr,
  //       timm:$attrchan, 0,
  //       (V_INTERP_MOV_F32 2, timm:$attr, timm:$attrchan), timm:$high)> {
  //   let Predicates = [has16BankLDS];
  // }

  // 16 bank LDS
  // Copy operand 5 into M0; the glue result threads M0 through both
  // interpolation instructions below.
  SDValue ToM0 = CurDAG->getCopyToReg(Chain: CurDAG->getEntryNode(), dl: DL, Reg: AMDGPU::M0,
                                      N: N->getOperand(Num: 5), Glue: SDValue());

  SDVTList VTs = CurDAG->getVTList(VT1: MVT::f32, VT2: MVT::Other);

  SDNode *InterpMov =
    CurDAG->getMachineNode(Opcode: AMDGPU::V_INTERP_MOV_F32, dl: DL, VTs, Ops: {
        CurDAG->getTargetConstant(Val: 2, DL, VT: MVT::i32), // P0
        N->getOperand(Num: 3),  // Attr
        N->getOperand(Num: 2),  // Attrchan
        ToM0.getValue(R: 1) // In glue
  });

  SDNode *InterpP1LV =
    CurDAG->getMachineNode(Opcode: AMDGPU::V_INTERP_P1LV_F16, dl: DL, VT: MVT::f32, Ops: {
        CurDAG->getTargetConstant(Val: 0, DL, VT: MVT::i32), // $src0_modifiers
        N->getOperand(Num: 1), // Src0
        N->getOperand(Num: 3), // Attr
        N->getOperand(Num: 2), // Attrchan
        CurDAG->getTargetConstant(Val: 0, DL, VT: MVT::i32), // $src2_modifiers
        SDValue(InterpMov, 0), // Src2 - holds two f16 values selected by high
        N->getOperand(Num: 4), // high
        CurDAG->getTargetConstant(Val: 0, DL, VT: MVT::i1), // $clamp
        CurDAG->getTargetConstant(Val: 0, DL, VT: MVT::i32), // $omod
        SDValue(InterpMov, 1)
  });

  CurDAG->ReplaceAllUsesOfValueWith(From: SDValue(N, 0), To: SDValue(InterpP1LV, 0));
}
3224
3225void AMDGPUDAGToDAGISel::SelectINTRINSIC_W_CHAIN(SDNode *N) {
3226 unsigned IntrID = N->getConstantOperandVal(Num: 1);
3227 switch (IntrID) {
3228 case Intrinsic::amdgcn_ds_append:
3229 case Intrinsic::amdgcn_ds_consume: {
3230 if (N->getValueType(ResNo: 0) != MVT::i32)
3231 break;
3232 SelectDSAppendConsume(N, IntrID);
3233 return;
3234 }
3235 case Intrinsic::amdgcn_ds_bvh_stack_rtn:
3236 case Intrinsic::amdgcn_ds_bvh_stack_push4_pop1_rtn:
3237 case Intrinsic::amdgcn_ds_bvh_stack_push8_pop1_rtn:
3238 case Intrinsic::amdgcn_ds_bvh_stack_push8_pop2_rtn:
3239 SelectDSBvhStackIntrinsic(N, IntrID);
3240 return;
3241 case Intrinsic::amdgcn_init_whole_wave:
3242 CurDAG->getMachineFunction()
3243 .getInfo<SIMachineFunctionInfo>()
3244 ->setInitWholeWave();
3245 break;
3246 }
3247
3248 SelectCode(N);
3249}
3250
/// Select chainless intrinsics, handling the convergence-control glue operand
/// and a handful of wave-mode / permlane intrinsics manually.
void AMDGPUDAGToDAGISel::SelectINTRINSIC_WO_CHAIN(SDNode *N) {
  unsigned IntrID = N->getConstantOperandVal(Num: 0);
  unsigned Opcode = AMDGPU::INSTRUCTION_LIST_END;
  SDNode *ConvGlueNode = N->getGluedNode();
  if (ConvGlueNode) {
    // FIXME: Possibly iterate over multiple glue nodes?
    assert(ConvGlueNode->getOpcode() == ISD::CONVERGENCECTRL_GLUE);
    // Re-wrap the glued token in a machine-level CONVERGENCECTRL_GLUE so it
    // survives selection.
    ConvGlueNode = ConvGlueNode->getOperand(Num: 0).getNode();
    ConvGlueNode =
        CurDAG->getMachineNode(Opcode: TargetOpcode::CONVERGENCECTRL_GLUE, dl: {},
                               VT: MVT::Glue, Op1: SDValue(ConvGlueNode, 0));
  } else {
    ConvGlueNode = nullptr;
  }
  switch (IntrID) {
  case Intrinsic::amdgcn_wqm:
    Opcode = AMDGPU::WQM;
    break;
  case Intrinsic::amdgcn_softwqm:
    Opcode = AMDGPU::SOFT_WQM;
    break;
  case Intrinsic::amdgcn_wwm:
  case Intrinsic::amdgcn_strict_wwm:
    Opcode = AMDGPU::STRICT_WWM;
    break;
  case Intrinsic::amdgcn_strict_wqm:
    Opcode = AMDGPU::STRICT_WQM;
    break;
  case Intrinsic::amdgcn_interp_p1_f16:
    SelectInterpP1F16(N);
    return;
  case Intrinsic::amdgcn_permlane16_swap:
  case Intrinsic::amdgcn_permlane32_swap: {
    // Reject on subtargets that lack the corresponding instruction.
    if ((IntrID == Intrinsic::amdgcn_permlane16_swap &&
         !Subtarget->hasPermlane16Swap()) ||
        (IntrID == Intrinsic::amdgcn_permlane32_swap &&
         !Subtarget->hasPermlane32Swap())) {
      SelectCode(N); // Hit the default error
      return;
    }

    Opcode = IntrID == Intrinsic::amdgcn_permlane16_swap
                 ? AMDGPU::V_PERMLANE16_SWAP_B32_e64
                 : AMDGPU::V_PERMLANE32_SWAP_B32_e64;

    // Drop the intrinsic-ID operand; keep the remaining operands as-is.
    SmallVector<SDValue, 4> NewOps(N->op_begin() + 1, N->op_end());
    if (ConvGlueNode)
      NewOps.push_back(Elt: SDValue(ConvGlueNode, 0));

    // Rewrite the boolean "fi" operand into the DPP fetch-invalid encoding.
    bool FI = N->getConstantOperandVal(Num: 3);
    NewOps[2] = CurDAG->getTargetConstant(
        Val: FI ? AMDGPU::DPP::DPP_FI_1 : AMDGPU::DPP::DPP_FI_0, DL: SDLoc(), VT: MVT::i32);

    CurDAG->SelectNodeTo(N, MachineOpc: Opcode, VTs: N->getVTList(), Ops: NewOps);
    return;
  }
  default:
    SelectCode(N);
    break;
  }

  // WQM/WWM-style pseudos: single source operand.
  if (Opcode != AMDGPU::INSTRUCTION_LIST_END) {
    SDValue Src = N->getOperand(Num: 1);
    CurDAG->SelectNodeTo(N, MachineOpc: Opcode, VTs: N->getVTList(), Ops: {Src});
  }

  // Re-attach the convergence glue to whatever node N became.
  if (ConvGlueNode) {
    SmallVector<SDValue, 4> NewOps(N->ops());
    NewOps.push_back(Elt: SDValue(ConvGlueNode, 0));
    CurDAG->MorphNodeTo(N, Opc: N->getOpcode(), VTs: N->getVTList(), Ops: NewOps);
  }
}
3323
3324void AMDGPUDAGToDAGISel::SelectINTRINSIC_VOID(SDNode *N) {
3325 unsigned IntrID = N->getConstantOperandVal(Num: 1);
3326 switch (IntrID) {
3327 case Intrinsic::amdgcn_ds_gws_init:
3328 case Intrinsic::amdgcn_ds_gws_barrier:
3329 case Intrinsic::amdgcn_ds_gws_sema_v:
3330 case Intrinsic::amdgcn_ds_gws_sema_br:
3331 case Intrinsic::amdgcn_ds_gws_sema_p:
3332 case Intrinsic::amdgcn_ds_gws_sema_release_all:
3333 SelectDS_GWS(N, IntrID);
3334 return;
3335 case Intrinsic::amdgcn_tensor_load_to_lds:
3336 case Intrinsic::amdgcn_tensor_store_from_lds:
3337 SelectTensorLoadStore(N, IntrID);
3338 return;
3339 default:
3340 break;
3341 }
3342
3343 SelectCode(N);
3344}
3345
3346void AMDGPUDAGToDAGISel::SelectWAVE_ADDRESS(SDNode *N) {
3347 SDValue Log2WaveSize =
3348 CurDAG->getTargetConstant(Val: Subtarget->getWavefrontSizeLog2(), DL: SDLoc(N), VT: MVT::i32);
3349 CurDAG->SelectNodeTo(N, MachineOpc: AMDGPU::S_LSHR_B32, VTs: N->getVTList(),
3350 Ops: {N->getOperand(Num: 0), Log2WaveSize});
3351}
3352
/// Select STACKRESTORE: convert the saved value back into the wave-scaled
/// form the stack pointer register expects, then copy it into SP.
void AMDGPUDAGToDAGISel::SelectSTACKRESTORE(SDNode *N) {
  SDValue SrcVal = N->getOperand(Num: 1);
  if (SrcVal.getValueType() != MVT::i32) {
    SelectCode(N); // Emit default error
    return;
  }

  SDValue CopyVal;
  Register SP = TLI->getStackPointerRegisterToSaveRestore();
  SDLoc SL(N);

  if (SrcVal.getOpcode() == AMDGPUISD::WAVE_ADDRESS) {
    // The value is already the unscaled SP: strip the WAVE_ADDRESS wrapper
    // instead of shifting twice.
    CopyVal = SrcVal.getOperand(i: 0);
  } else {
    SDValue Log2WaveSize = CurDAG->getTargetConstant(
        Val: Subtarget->getWavefrontSizeLog2(), DL: SL, VT: MVT::i32);

    // SP is scalar; a divergent source must be made uniform first.
    if (N->isDivergent()) {
      SrcVal = SDValue(CurDAG->getMachineNode(Opcode: AMDGPU::V_READFIRSTLANE_B32, dl: SL,
                                              VT: MVT::i32, Op1: SrcVal),
                       0);
    }

    // Scale the byte value up to the wave-sized SP representation.
    CopyVal = SDValue(CurDAG->getMachineNode(Opcode: AMDGPU::S_LSHL_B32, dl: SL, VT: MVT::i32,
                                             Ops: {SrcVal, Log2WaveSize}),
                      0);
  }

  SDValue CopyToSP = CurDAG->getCopyToReg(Chain: N->getOperand(Num: 0), dl: SL, Reg: SP, N: CopyVal);
  CurDAG->ReplaceAllUsesOfValueWith(From: SDValue(N, 0), To: CopyToSP);
}
3384
// Match VOP3 source modifiers on \p In, returning the stripped source in
// \p Src and the accumulated SISrcMods bits in \p Mods. \p IsCanonicalizing
// means the consuming instruction implicitly canonicalizes its source, which
// permits folding fsub-from-[+-]0 as fneg; \p AllowAbs permits matching the
// abs modifier. Always returns true: the fallback is Mods == NONE with
// Src == In.
bool AMDGPUDAGToDAGISel::SelectVOP3ModsImpl(SDValue In, SDValue &Src,
                                            unsigned &Mods,
                                            bool IsCanonicalizing,
                                            bool AllowAbs) const {
  Mods = SISrcMods::NONE;
  Src = In;

  if (Src.getOpcode() == ISD::FNEG) {
    Mods |= SISrcMods::NEG;
    Src = Src.getOperand(i: 0);
  } else if (Src.getOpcode() == ISD::FSUB && IsCanonicalizing) {
    // Fold fsub [+-]0 into fneg. This may not have folded depending on the
    // denormal mode, but we're implicitly canonicalizing in a source operand.
    auto *LHS = dyn_cast<ConstantFPSDNode>(Val: Src.getOperand(i: 0));
    if (LHS && LHS->isZero()) {
      Mods |= SISrcMods::NEG;
      Src = Src.getOperand(i: 1);
    }
  }

  if (AllowAbs && Src.getOpcode() == ISD::FABS) {
    Mods |= SISrcMods::ABS;
    Src = Src.getOperand(i: 0);
  }

  if (Mods != SISrcMods::NONE)
    return true;

  // Convert various sign-bit masks on integers to src mods. Currently disabled
  // for 16-bit types as the codegen replaces the operand without adding a
  // srcmod. This is intentionally finding the cases where we are performing
  // float neg and abs on int types, the goal is not to obtain two's complement
  // neg or abs. Limit converison to select operands via the nonCanonalizing
  // pattern.
  // TODO: Add 16-bit support.
  if (IsCanonicalizing)
    return true;

  // v2i32 xor/or/and are legal. A vselect using these instructions as operands
  // is scalarised into two selects with EXTRACT_VECTOR_ELT operands. Peek
  // through the extract to the bitwise op.
  SDValue PeekSrc =
      Src->getOpcode() == ISD::EXTRACT_VECTOR_ELT ? Src->getOperand(Num: 0) : Src;
  // Convert various sign-bit masks to src mods. Currently disabled for 16-bit
  // types as the codegen replaces the operand without adding a srcmod.
  // This is intentionally finding the cases where we are performing float neg
  // and abs on int types, the goal is not to obtain two's complement neg or
  // abs.
  // TODO: Add 16-bit support.
  unsigned Opc = PeekSrc.getOpcode();
  EVT VT = Src.getValueType();
  if ((Opc != ISD::AND && Opc != ISD::OR && Opc != ISD::XOR) ||
      (VT != MVT::i32 && VT != MVT::v2i32 && VT != MVT::i64))
    return true;

  ConstantSDNode *CRHS = isConstOrConstSplat(N: PeekSrc->getOperand(Num: 1));
  if (!CRHS)
    return true;

  // Drop the bitwise op, re-wrapping in the original extract_vector_elt if we
  // peeked through one above.
  auto ReplaceSrc = [&]() -> SDValue {
    if (Src->getOpcode() != ISD::EXTRACT_VECTOR_ELT)
      return Src.getOperand(i: 0);

    SDValue LHS = PeekSrc->getOperand(Num: 0);
    SDValue Index = Src->getOperand(Num: 1);
    return CurDAG->getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL: SDLoc(Src),
                           VT: Src.getValueType(), N1: LHS, N2: Index);
  };

  // Recognise Srcmods:
  // (xor a, 0x80000000) or v2i32 (xor a, {0x80000000,0x80000000}) as NEG.
  // (and a, 0x7fffffff) or v2i32 (and a, {0x7fffffff,0x7fffffff}) as ABS.
  // (or a, 0x80000000) or v2i32 (or a, {0x80000000,0x80000000}) as NEG+ABS
  // SrcModifiers.
  if (Opc == ISD::XOR && CRHS->getAPIntValue().isSignMask()) {
    Mods |= SISrcMods::NEG;
    Src = ReplaceSrc();
  } else if (Opc == ISD::AND && AllowAbs &&
             CRHS->getAPIntValue().isMaxSignedValue()) {
    Mods |= SISrcMods::ABS;
    Src = ReplaceSrc();
  } else if (Opc == ISD::OR && AllowAbs && CRHS->getAPIntValue().isSignMask()) {
    Mods |= SISrcMods::ABS | SISrcMods::NEG;
    Src = ReplaceSrc();
  }

  return true;
}
3473
3474bool AMDGPUDAGToDAGISel::SelectVOP3Mods(SDValue In, SDValue &Src,
3475 SDValue &SrcMods) const {
3476 unsigned Mods;
3477 if (SelectVOP3ModsImpl(In, Src, Mods, /*IsCanonicalizing=*/true,
3478 /*AllowAbs=*/true)) {
3479 SrcMods = CurDAG->getTargetConstant(Val: Mods, DL: SDLoc(In), VT: MVT::i32);
3480 return true;
3481 }
3482
3483 return false;
3484}
3485
3486bool AMDGPUDAGToDAGISel::SelectVOP3ModsNonCanonicalizing(
3487 SDValue In, SDValue &Src, SDValue &SrcMods) const {
3488 unsigned Mods;
3489 if (SelectVOP3ModsImpl(In, Src, Mods, /*IsCanonicalizing=*/false,
3490 /*AllowAbs=*/true)) {
3491 SrcMods = CurDAG->getTargetConstant(Val: Mods, DL: SDLoc(In), VT: MVT::i32);
3492 return true;
3493 }
3494
3495 return false;
3496}
3497
3498bool AMDGPUDAGToDAGISel::SelectVOP3BMods(SDValue In, SDValue &Src,
3499 SDValue &SrcMods) const {
3500 unsigned Mods;
3501 if (SelectVOP3ModsImpl(In, Src, Mods,
3502 /*IsCanonicalizing=*/true,
3503 /*AllowAbs=*/false)) {
3504 SrcMods = CurDAG->getTargetConstant(Val: Mods, DL: SDLoc(In), VT: MVT::i32);
3505 return true;
3506 }
3507
3508 return false;
3509}
3510
3511bool AMDGPUDAGToDAGISel::SelectVOP3NoMods(SDValue In, SDValue &Src) const {
3512 if (In.getOpcode() == ISD::FABS || In.getOpcode() == ISD::FNEG)
3513 return false;
3514
3515 Src = In;
3516 return true;
3517}
3518
3519bool AMDGPUDAGToDAGISel::SelectVINTERPModsImpl(SDValue In, SDValue &Src,
3520 SDValue &SrcMods,
3521 bool OpSel) const {
3522 unsigned Mods;
3523 if (SelectVOP3ModsImpl(In, Src, Mods,
3524 /*IsCanonicalizing=*/true,
3525 /*AllowAbs=*/false)) {
3526 if (OpSel)
3527 Mods |= SISrcMods::OP_SEL_0;
3528 SrcMods = CurDAG->getTargetConstant(Val: Mods, DL: SDLoc(In), VT: MVT::i32);
3529 return true;
3530 }
3531
3532 return false;
3533}
3534
3535bool AMDGPUDAGToDAGISel::SelectVINTERPMods(SDValue In, SDValue &Src,
3536 SDValue &SrcMods) const {
3537 return SelectVINTERPModsImpl(In, Src, SrcMods, /* OpSel */ false);
3538}
3539
3540bool AMDGPUDAGToDAGISel::SelectVINTERPModsHi(SDValue In, SDValue &Src,
3541 SDValue &SrcMods) const {
3542 return SelectVINTERPModsImpl(In, Src, SrcMods, /* OpSel */ true);
3543}
3544
3545bool AMDGPUDAGToDAGISel::SelectVOP3Mods0(SDValue In, SDValue &Src,
3546 SDValue &SrcMods, SDValue &Clamp,
3547 SDValue &Omod) const {
3548 SDLoc DL(In);
3549 Clamp = CurDAG->getTargetConstant(Val: 0, DL, VT: MVT::i1);
3550 Omod = CurDAG->getTargetConstant(Val: 0, DL, VT: MVT::i1);
3551
3552 return SelectVOP3Mods(In, Src, SrcMods);
3553}
3554
3555bool AMDGPUDAGToDAGISel::SelectVOP3BMods0(SDValue In, SDValue &Src,
3556 SDValue &SrcMods, SDValue &Clamp,
3557 SDValue &Omod) const {
3558 SDLoc DL(In);
3559 Clamp = CurDAG->getTargetConstant(Val: 0, DL, VT: MVT::i1);
3560 Omod = CurDAG->getTargetConstant(Val: 0, DL, VT: MVT::i1);
3561
3562 return SelectVOP3BMods(In, Src, SrcMods);
3563}
3564
3565bool AMDGPUDAGToDAGISel::SelectVOP3OMods(SDValue In, SDValue &Src,
3566 SDValue &Clamp, SDValue &Omod) const {
3567 Src = In;
3568
3569 SDLoc DL(In);
3570 Clamp = CurDAG->getTargetConstant(Val: 0, DL, VT: MVT::i1);
3571 Omod = CurDAG->getTargetConstant(Val: 0, DL, VT: MVT::i1);
3572
3573 return true;
3574}
3575
// Match VOP3P (packed) source modifiers for \p In: whole-vector fneg,
// per-half negates (NEG/NEG_HI) and op_sel bits derived from build_vector or
// vector_shuffle sources, and scalar/splat-immediate simplifications.
// Always returns true; the fallback sets only OP_SEL_1.
bool AMDGPUDAGToDAGISel::SelectVOP3PMods(SDValue In, SDValue &Src,
                                         SDValue &SrcMods, bool IsDOT) const {
  unsigned Mods = SISrcMods::NONE;
  Src = In;

  // TODO: Handle G_FSUB 0 as fneg
  if (Src.getOpcode() == ISD::FNEG) {
    // Whole-vector negate toggles both halves.
    Mods ^= (SISrcMods::NEG | SISrcMods::NEG_HI);
    Src = Src.getOperand(i: 0);
  }

  if (Src.getOpcode() == ISD::BUILD_VECTOR && Src.getNumOperands() == 2 &&
      (!IsDOT || !Subtarget->hasDOTOpSelHazard())) {
    unsigned VecMods = Mods;

    SDValue Lo = stripBitcast(Val: Src.getOperand(i: 0));
    SDValue Hi = stripBitcast(Val: Src.getOperand(i: 1));

    if (Lo.getOpcode() == ISD::FNEG) {
      Lo = stripBitcast(Val: Lo.getOperand(i: 0));
      Mods ^= SISrcMods::NEG;
    }

    if (Hi.getOpcode() == ISD::FNEG) {
      Hi = stripBitcast(Val: Hi.getOperand(i: 0));
      Mods ^= SISrcMods::NEG_HI;
    }

    // Halves sourced from the high 16 bits of a dword become op_sel bits.
    if (isExtractHiElt(In: Lo, Out&: Lo))
      Mods |= SISrcMods::OP_SEL_0;

    if (isExtractHiElt(In: Hi, Out&: Hi))
      Mods |= SISrcMods::OP_SEL_1;

    unsigned VecSize = Src.getValueSizeInBits();
    Lo = stripExtractLoElt(In: Lo);
    Hi = stripExtractLoElt(In: Hi);

    // If stripping extracts exposed a wider register, narrow it back down
    // with a subregister copy.
    if (Lo.getValueSizeInBits() > VecSize) {
      Lo = CurDAG->getTargetExtractSubreg(
          SRIdx: (VecSize > 32) ? AMDGPU::sub0_sub1 : AMDGPU::sub0, DL: SDLoc(In),
          VT: MVT::getIntegerVT(BitWidth: VecSize), Operand: Lo);
    }

    if (Hi.getValueSizeInBits() > VecSize) {
      Hi = CurDAG->getTargetExtractSubreg(
          SRIdx: (VecSize > 32) ? AMDGPU::sub0_sub1 : AMDGPU::sub0, DL: SDLoc(In),
          VT: MVT::getIntegerVT(BitWidth: VecSize), Operand: Hi);
    }

    assert(Lo.getValueSizeInBits() <= VecSize &&
           Hi.getValueSizeInBits() <= VecSize);

    if (Lo == Hi && !isInlineImmediate(N: Lo.getNode())) {
      // Really a scalar input. Just select from the low half of the register to
      // avoid packing.

      if (VecSize == Lo.getValueSizeInBits()) {
        Src = Lo;
      } else if (VecSize == 32) {
        Src = createVOP3PSrc32FromLo16(Lo, Src, CurDAG, Subtarget);
      } else {
        assert(Lo.getValueSizeInBits() == 32 && VecSize == 64);

        // Widen the 32-bit scalar to 64 bits with an undef high half.
        SDLoc SL(In);
        SDValue Undef = SDValue(
            CurDAG->getMachineNode(Opcode: TargetOpcode::IMPLICIT_DEF, dl: SL,
                                   VT: Lo.getValueType()), 0);
        auto RC = Lo->isDivergent() ? AMDGPU::VReg_64RegClassID
                                    : AMDGPU::SReg_64RegClassID;
        const SDValue Ops[] = {
          CurDAG->getTargetConstant(Val: RC, DL: SL, VT: MVT::i32),
          Lo, CurDAG->getTargetConstant(Val: AMDGPU::sub0, DL: SL, VT: MVT::i32),
          Undef, CurDAG->getTargetConstant(Val: AMDGPU::sub1, DL: SL, VT: MVT::i32) };

        Src = SDValue(CurDAG->getMachineNode(Opcode: TargetOpcode::REG_SEQUENCE, dl: SL,
                                             VT: Src.getValueType(), Ops), 0);
      }
      SrcMods = CurDAG->getTargetConstant(Val: Mods, DL: SDLoc(In), VT: MVT::i32);
      return true;
    }

    if (VecSize == 64 && Lo == Hi && isa<ConstantFPSDNode>(Val: Lo)) {
      // Splat of a 32-bit FP constant: fold to an immediate when inlinable.
      uint64_t Lit = cast<ConstantFPSDNode>(Val&: Lo)->getValueAPF()
                      .bitcastToAPInt().getZExtValue();
      if (AMDGPU::isInlinableLiteral32(Literal: Lit, HasInv2Pi: Subtarget->hasInv2PiInlineImm())) {
        Src = CurDAG->getTargetConstant(Val: Lit, DL: SDLoc(In), VT: MVT::i64);
        SrcMods = CurDAG->getTargetConstant(Val: Mods, DL: SDLoc(In), VT: MVT::i32);
        return true;
      }
    }

    // build_vector folding failed; restore the pre-vector mods.
    Mods = VecMods;
  } else if (Src.getOpcode() == ISD::VECTOR_SHUFFLE &&
             Src.getNumOperands() == 2) {

    // TODO: We should repeat the build_vector source check above for the
    // vector_shuffle for negates and casts of individual elements.

    auto *SVN = cast<ShuffleVectorSDNode>(Val&: Src);
    ArrayRef<int> Mask = SVN->getMask();

    if (Mask[0] < 2 && Mask[1] < 2) {
      // src1 should be undef.
      SDValue ShuffleSrc = SVN->getOperand(Num: 0);

      if (ShuffleSrc.getOpcode() == ISD::FNEG) {
        ShuffleSrc = ShuffleSrc.getOperand(i: 0);
        Mods ^= (SISrcMods::NEG | SISrcMods::NEG_HI);
      }

      // Shuffle lane selection maps straight onto op_sel bits.
      if (Mask[0] == 1)
        Mods |= SISrcMods::OP_SEL_0;
      if (Mask[1] == 1)
        Mods |= SISrcMods::OP_SEL_1;

      Src = ShuffleSrc;
      SrcMods = CurDAG->getTargetConstant(Val: Mods, DL: SDLoc(In), VT: MVT::i32);
      return true;
    }
  }

  // Packed instructions do not have abs modifiers.
  Mods |= SISrcMods::OP_SEL_1;

  SrcMods = CurDAG->getTargetConstant(Val: Mods, DL: SDLoc(In), VT: MVT::i32);
  return true;
}
3704
3705bool AMDGPUDAGToDAGISel::SelectVOP3PModsDOT(SDValue In, SDValue &Src,
3706 SDValue &SrcMods) const {
3707 return SelectVOP3PMods(In, Src, SrcMods, IsDOT: true);
3708}
3709
3710bool AMDGPUDAGToDAGISel::SelectVOP3PNoModsDOT(SDValue In, SDValue &Src) const {
3711 SDValue SrcTmp, SrcModsTmp;
3712 SelectVOP3PMods(In, Src&: SrcTmp, SrcMods&: SrcModsTmp, IsDOT: true);
3713 if (cast<ConstantSDNode>(Val&: SrcModsTmp)->getZExtValue() == SISrcMods::OP_SEL_1) {
3714 Src = SrcTmp;
3715 return true;
3716 }
3717
3718 return false;
3719}
3720
3721bool AMDGPUDAGToDAGISel::SelectVOP3PModsF32(SDValue In, SDValue &Src,
3722 SDValue &SrcMods) const {
3723 SelectVOP3Mods(In, Src, SrcMods);
3724 unsigned Mods = SISrcMods::OP_SEL_1;
3725 Mods |= cast<ConstantSDNode>(Val&: SrcMods)->getZExtValue();
3726 SrcMods = CurDAG->getTargetConstant(Val: Mods, DL: SDLoc(In), VT: MVT::i32);
3727 return true;
3728}
3729
3730bool AMDGPUDAGToDAGISel::SelectVOP3PNoModsF32(SDValue In, SDValue &Src) const {
3731 SDValue SrcTmp, SrcModsTmp;
3732 SelectVOP3PModsF32(In, Src&: SrcTmp, SrcMods&: SrcModsTmp);
3733 if (cast<ConstantSDNode>(Val&: SrcModsTmp)->getZExtValue() == SISrcMods::OP_SEL_1) {
3734 Src = SrcTmp;
3735 return true;
3736 }
3737
3738 return false;
3739}
3740
3741bool AMDGPUDAGToDAGISel::SelectWMMAOpSelVOP3PMods(SDValue In,
3742 SDValue &Src) const {
3743 const ConstantSDNode *C = cast<ConstantSDNode>(Val&: In);
3744 assert(C->getAPIntValue().getBitWidth() == 1 && "expected i1 value");
3745
3746 unsigned Mods = SISrcMods::OP_SEL_1;
3747 unsigned SrcVal = C->getZExtValue();
3748 if (SrcVal == 1)
3749 Mods |= SISrcMods::OP_SEL_0;
3750
3751 Src = CurDAG->getTargetConstant(Val: Mods, DL: SDLoc(In), VT: MVT::i32);
3752 return true;
3753}
3754
3755MachineSDNode *
3756AMDGPUDAGToDAGISel::buildRegSequence32(SmallVectorImpl<SDValue> &Elts,
3757 const SDLoc &DL) const {
3758 unsigned DstRegClass;
3759 EVT DstTy;
3760 switch (Elts.size()) {
3761 case 8:
3762 DstRegClass = AMDGPU::VReg_256RegClassID;
3763 DstTy = MVT::v8i32;
3764 break;
3765 case 4:
3766 DstRegClass = AMDGPU::VReg_128RegClassID;
3767 DstTy = MVT::v4i32;
3768 break;
3769 case 2:
3770 DstRegClass = AMDGPU::VReg_64RegClassID;
3771 DstTy = MVT::v2i32;
3772 break;
3773 default:
3774 llvm_unreachable("unhandled Reg sequence size");
3775 }
3776
3777 SmallVector<SDValue, 17> Ops;
3778 Ops.push_back(Elt: CurDAG->getTargetConstant(Val: DstRegClass, DL, VT: MVT::i32));
3779 for (unsigned i = 0; i < Elts.size(); ++i) {
3780 Ops.push_back(Elt: Elts[i]);
3781 Ops.push_back(Elt: CurDAG->getTargetConstant(
3782 Val: SIRegisterInfo::getSubRegFromChannel(Channel: i), DL, VT: MVT::i32));
3783 }
3784 return CurDAG->getMachineNode(Opcode: TargetOpcode::REG_SEQUENCE, dl: DL, VT: DstTy, Ops);
3785}
3786
// Build a 32-bit REG_SEQUENCE from a list of 16-bit elements by first packing
// adjacent element pairs into 32-bit registers, reusing an existing 32-bit
// source when both halves already come from it.
MachineSDNode *
AMDGPUDAGToDAGISel::buildRegSequence16(SmallVectorImpl<SDValue> &Elts,
                                       const SDLoc &DL) const {
  SmallVector<SDValue, 8> PackedElts;
  assert("unhandled Reg sequence size" &&
         (Elts.size() == 8 || Elts.size() == 16));

  // Pack 16-bit elements in pairs into 32-bit register. If both elements are
  // unpacked from 32-bit source use it, otherwise pack them using v_perm.
  for (unsigned i = 0; i < Elts.size(); i += 2) {
    SDValue LoSrc = stripExtractLoElt(In: stripBitcast(Val: Elts[i]));
    SDValue HiSrc;
    if (isExtractHiElt(In: Elts[i + 1], Out&: HiSrc) && LoSrc == HiSrc) {
      // Both halves originate from the same dword; use it directly.
      PackedElts.push_back(Elt: HiSrc);
    } else {
      if (Subtarget->useRealTrue16Insts()) {
        // FIXME-TRUE16. For now pack VGPR_32 for 16-bit source before
        // passing to v_perm_b32. Eventually we should use replace v_perm_b32
        // by reg_sequence.
        SDValue Undef = SDValue(
            CurDAG->getMachineNode(Opcode: TargetOpcode::IMPLICIT_DEF, dl: DL, VT: MVT::i16),
            0);
        Elts[i] =
            emitRegSequence(CurDAG&: *CurDAG, DstRegClass: AMDGPU::VGPR_32RegClassID, DstTy: MVT::i32,
                            Elts: {Elts[i], Undef}, SubRegClass: {AMDGPU::lo16, AMDGPU::hi16}, DL);
        Elts[i + 1] = emitRegSequence(CurDAG&: *CurDAG, DstRegClass: AMDGPU::VGPR_32RegClassID,
                                      DstTy: MVT::i32, Elts: {Elts[i + 1], Undef},
                                      SubRegClass: {AMDGPU::lo16, AMDGPU::hi16}, DL);
      }
      // v_perm selector packing the low halves of the two sources.
      SDValue PackLoLo = CurDAG->getTargetConstant(Val: 0x05040100, DL, VT: MVT::i32);
      MachineSDNode *Packed =
          CurDAG->getMachineNode(Opcode: AMDGPU::V_PERM_B32_e64, dl: DL, VT: MVT::i32,
                                 Ops: {Elts[i + 1], Elts[i], PackLoLo});
      PackedElts.push_back(Elt: SDValue(Packed, 0));
    }
  }
  return buildRegSequence32(Elts&: PackedElts, DL);
}
3825
3826MachineSDNode *
3827AMDGPUDAGToDAGISel::buildRegSequence(SmallVectorImpl<SDValue> &Elts,
3828 const SDLoc &DL,
3829 unsigned ElementSize) const {
3830 if (ElementSize == 16)
3831 return buildRegSequence16(Elts, DL);
3832 if (ElementSize == 32)
3833 return buildRegSequence32(Elts, DL);
3834 llvm_unreachable("Unhandled element size");
3835}
3836
// Materialize a WMMA source whose elements all carry the modifier
// \p ModOpcode (ISD::FNEG or ISD::FABS) into a REG_SEQUENCE, accumulating the
// corresponding mod bits into \p Mods. For FNEG, additionally checks whether
// every element wraps an inner FABS so both bits can be set at once.
void AMDGPUDAGToDAGISel::selectWMMAModsNegAbs(unsigned ModOpcode,
                                              unsigned &Mods,
                                              SmallVectorImpl<SDValue> &Elts,
                                              SDValue &Src, const SDLoc &DL,
                                              unsigned ElementSize) const {
  if (ModOpcode == ISD::FNEG) {
    Mods |= SISrcMods::NEG;
    // Check if all elements also have abs modifier
    SmallVector<SDValue, 8> NegAbsElts;
    for (auto El : Elts) {
      if (El.getOpcode() != ISD::FABS)
        break;
      NegAbsElts.push_back(Elt: El->getOperand(Num: 0));
    }
    if (Elts.size() != NegAbsElts.size()) {
      // Neg
      Src = SDValue(buildRegSequence(Elts, DL, ElementSize), 0);
    } else {
      // Neg and Abs
      Mods |= SISrcMods::NEG_HI;
      Src = SDValue(buildRegSequence(Elts&: NegAbsElts, DL, ElementSize), 0);
    }
  } else {
    assert(ModOpcode == ISD::FABS);
    // Abs
    Mods |= SISrcMods::NEG_HI;
    Src = SDValue(buildRegSequence(Elts, DL, ElementSize), 0);
  }
}
3866
3867// Check all f16 elements for modifiers while looking through b32 and v2b16
3868// build vector, stop if element does not satisfy ModifierCheck.
3869static void
3870checkWMMAElementsModifiersF16(BuildVectorSDNode *BV,
3871 std::function<bool(SDValue)> ModifierCheck) {
3872 for (unsigned i = 0; i < BV->getNumOperands(); ++i) {
3873 if (auto *F16Pair =
3874 dyn_cast<BuildVectorSDNode>(Val: stripBitcast(Val: BV->getOperand(Num: i)))) {
3875 for (unsigned i = 0; i < F16Pair->getNumOperands(); ++i) {
3876 SDValue ElF16 = stripBitcast(Val: F16Pair->getOperand(Num: i));
3877 if (!ModifierCheck(ElF16))
3878 break;
3879 }
3880 }
3881 }
3882}
3883
// Match a WMMA f16 source where every f16 element (first path) or every v2f16
// element (second path) is negated; strip the fnegs, rebuild the source as a
// REG_SEQUENCE and set NEG/NEG_HI. Always returns true (falls back to the
// plain source with only OP_SEL_1 set).
bool AMDGPUDAGToDAGISel::SelectWMMAModsF16Neg(SDValue In, SDValue &Src,
                                              SDValue &SrcMods) const {
  Src = In;
  unsigned Mods = SISrcMods::OP_SEL_1;

  // mods are on f16 elements
  if (auto *BV = dyn_cast<BuildVectorSDNode>(Val: stripBitcast(Val: In))) {
    SmallVector<SDValue, 8> EltsF16;

    checkWMMAElementsModifiersF16(BV, ModifierCheck: [&](SDValue Element) -> bool {
      if (Element.getOpcode() != ISD::FNEG)
        return false;
      EltsF16.push_back(Elt: Element.getOperand(i: 0));
      return true;
    });

    // All elements have neg modifier
    if (BV->getNumOperands() * 2 == EltsF16.size()) {
      Src = SDValue(buildRegSequence16(Elts&: EltsF16, DL: SDLoc(In)), 0);
      Mods |= SISrcMods::NEG;
      Mods |= SISrcMods::NEG_HI;
    }
  }

  // mods are on v2f16 elements
  if (auto *BV = dyn_cast<BuildVectorSDNode>(Val: stripBitcast(Val: In))) {
    SmallVector<SDValue, 8> EltsV2F16;
    for (unsigned i = 0; i < BV->getNumOperands(); ++i) {
      SDValue ElV2f16 = stripBitcast(Val: BV->getOperand(Num: i));
      // Based on first element decide which mod we match, neg or abs
      if (ElV2f16.getOpcode() != ISD::FNEG)
        break;
      EltsV2F16.push_back(Elt: ElV2f16.getOperand(i: 0));
    }

    // All pairs of elements have neg modifier
    if (BV->getNumOperands() == EltsV2F16.size()) {
      Src = SDValue(buildRegSequence32(Elts&: EltsV2F16, DL: SDLoc(In)), 0);
      Mods |= SISrcMods::NEG;
      Mods |= SISrcMods::NEG_HI;
    }
  }

  SrcMods = CurDAG->getTargetConstant(Val: Mods, DL: SDLoc(In), VT: MVT::i32);
  return true;
}
3930
// Match a WMMA f16 source where every element carries the same modifier (all
// fneg or all fabs), at either f16 or v2f16 granularity, and fold it into
// NEG/NEG_HI bits via selectWMMAModsNegAbs. Always returns true.
// NOTE(review): ModOpcode is assigned only once a first element is inspected;
// an operand-less build_vector would leave it uninitialized — presumably that
// cannot occur here, but worth confirming.
bool AMDGPUDAGToDAGISel::SelectWMMAModsF16NegAbs(SDValue In, SDValue &Src,
                                                 SDValue &SrcMods) const {
  Src = In;
  unsigned Mods = SISrcMods::OP_SEL_1;
  unsigned ModOpcode;

  // mods are on f16 elements
  if (auto *BV = dyn_cast<BuildVectorSDNode>(Val: stripBitcast(Val: In))) {
    SmallVector<SDValue, 8> EltsF16;
    checkWMMAElementsModifiersF16(BV, ModifierCheck: [&](SDValue ElF16) -> bool {
      // Based on first element decide which mod we match, neg or abs
      if (EltsF16.empty())
        ModOpcode = (ElF16.getOpcode() == ISD::FNEG) ? ISD::FNEG : ISD::FABS;
      if (ElF16.getOpcode() != ModOpcode)
        return false;
      EltsF16.push_back(Elt: ElF16.getOperand(i: 0));
      return true;
    });

    // All elements have ModOpcode modifier
    if (BV->getNumOperands() * 2 == EltsF16.size())
      selectWMMAModsNegAbs(ModOpcode, Mods, Elts&: EltsF16, Src, DL: SDLoc(In), ElementSize: 16);
  }

  // mods are on v2f16 elements
  if (auto *BV = dyn_cast<BuildVectorSDNode>(Val: stripBitcast(Val: In))) {
    SmallVector<SDValue, 8> EltsV2F16;

    for (unsigned i = 0; i < BV->getNumOperands(); ++i) {
      SDValue ElV2f16 = stripBitcast(Val: BV->getOperand(Num: i));
      // Based on first element decide which mod we match, neg or abs
      if (EltsV2F16.empty())
        ModOpcode = (ElV2f16.getOpcode() == ISD::FNEG) ? ISD::FNEG : ISD::FABS;
      if (ElV2f16->getOpcode() != ModOpcode)
        break;
      EltsV2F16.push_back(Elt: ElV2f16->getOperand(Num: 0));
    }

    // All elements have ModOpcode modifier
    if (BV->getNumOperands() == EltsV2F16.size())
      selectWMMAModsNegAbs(ModOpcode, Mods, Elts&: EltsV2F16, Src, DL: SDLoc(In), ElementSize: 32);
  }

  SrcMods = CurDAG->getTargetConstant(Val: Mods, DL: SDLoc(In), VT: MVT::i32);
  return true;
}
3977
// Match a WMMA f32 source where every f32 element carries the same modifier
// (all fneg or all fabs) and fold it into NEG/NEG_HI bits via
// selectWMMAModsNegAbs. Always returns true (falls back to the plain source
// with only OP_SEL_1 set).
bool AMDGPUDAGToDAGISel::SelectWMMAModsF32NegAbs(SDValue In, SDValue &Src,
                                                 SDValue &SrcMods) const {
  Src = In;
  unsigned Mods = SISrcMods::OP_SEL_1;
  SmallVector<SDValue, 8> EltsF32;

  if (auto *BV = dyn_cast<BuildVectorSDNode>(Val: stripBitcast(Val: In))) {
    assert(BV->getNumOperands() > 0);
    // Based on first element decide which mod we match, neg or abs
    SDValue ElF32 = stripBitcast(Val: BV->getOperand(Num: 0));
    unsigned ModOpcode =
        (ElF32.getOpcode() == ISD::FNEG) ? ISD::FNEG : ISD::FABS;
    for (unsigned i = 0; i < BV->getNumOperands(); ++i) {
      SDValue ElF32 = stripBitcast(Val: BV->getOperand(Num: i));
      if (ElF32.getOpcode() != ModOpcode)
        break;
      EltsF32.push_back(Elt: ElF32.getOperand(i: 0));
    }

    // All elements had ModOpcode modifier
    if (BV->getNumOperands() == EltsF32.size())
      selectWMMAModsNegAbs(ModOpcode, Mods, Elts&: EltsF32, Src, DL: SDLoc(In), ElementSize: 32);
  }

  SrcMods = CurDAG->getTargetConstant(Val: Mods, DL: SDLoc(In), VT: MVT::i32);
  return true;
}
4005
// Try to select a WMMA scalar-immediate source: either a splat build_vector
// of an inline immediate (selected as a 32-bit target constant), or a nested
// 16-bit splat whose value is an inline constant for f16/bf16/i16 (selected
// as a 16-bit target constant). Returns false when no such splat is found.
bool AMDGPUDAGToDAGISel::SelectWMMAVISrc(SDValue In, SDValue &Src) const {
  if (auto *BV = dyn_cast<BuildVectorSDNode>(Val&: In)) {
    BitVector UndefElements;
    if (SDValue Splat = BV->getSplatValue(UndefElements: &UndefElements))
      if (isInlineImmediate(N: Splat.getNode())) {
        if (const ConstantSDNode *C = dyn_cast<ConstantSDNode>(Val&: Splat)) {
          unsigned Imm = C->getAPIntValue().getSExtValue();
          Src = CurDAG->getTargetConstant(Val: Imm, DL: SDLoc(In), VT: MVT::i32);
          return true;
        }
        if (const ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(Val&: Splat)) {
          unsigned Imm = C->getValueAPF().bitcastToAPInt().getSExtValue();
          Src = CurDAG->getTargetConstant(Val: Imm, DL: SDLoc(In), VT: MVT::i32);
          return true;
        }
        llvm_unreachable("unhandled Constant node");
      }
  }

  // 16 bit splat
  SDValue SplatSrc32 = stripBitcast(Val: In);
  if (auto *SplatSrc32BV = dyn_cast<BuildVectorSDNode>(Val&: SplatSrc32))
    if (SDValue Splat32 = SplatSrc32BV->getSplatValue()) {
      SDValue SplatSrc16 = stripBitcast(Val: Splat32);
      if (auto *SplatSrc16BV = dyn_cast<BuildVectorSDNode>(Val&: SplatSrc16))
        if (SDValue Splat = SplatSrc16BV->getSplatValue()) {
          const SIInstrInfo *TII = Subtarget->getInstrInfo();
          // Read the raw bits of either an FP or integer splat constant.
          std::optional<APInt> RawValue;
          if (const ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(Val&: Splat))
            RawValue = C->getValueAPF().bitcastToAPInt();
          else if (const ConstantSDNode *C = dyn_cast<ConstantSDNode>(Val&: Splat))
            RawValue = C->getAPIntValue();

          if (RawValue.has_value()) {
            EVT VT = In.getValueType().getScalarType();
            if (VT.getSimpleVT() == MVT::f16 || VT.getSimpleVT() == MVT::bf16) {
              // Reinterpret the bits in the element's FP semantics to test
              // inline-constant eligibility.
              APFloat FloatVal(VT.getSimpleVT() == MVT::f16
                                   ? APFloatBase::IEEEhalf()
                                   : APFloatBase::BFloat(),
                               RawValue.value());
              if (TII->isInlineConstant(Imm: FloatVal)) {
                Src = CurDAG->getTargetConstant(Val: RawValue.value(), DL: SDLoc(In),
                                                VT: MVT::i16);
                return true;
              }
            } else if (VT.getSimpleVT() == MVT::i16) {
              if (TII->isInlineConstant(Imm: RawValue.value())) {
                Src = CurDAG->getTargetConstant(Val: RawValue.value(), DL: SDLoc(In),
                                                VT: MVT::i16);
                return true;
              }
            } else
              llvm_unreachable("unknown 16-bit type");
          }
        }
    }

  return false;
}
4065
4066bool AMDGPUDAGToDAGISel::SelectSWMMACIndex8(SDValue In, SDValue &Src,
4067 SDValue &IndexKey) const {
4068 unsigned Key = 0;
4069 Src = In;
4070
4071 if (In.getOpcode() == ISD::SRL) {
4072 const llvm::SDValue &ShiftSrc = In.getOperand(i: 0);
4073 ConstantSDNode *ShiftAmt = dyn_cast<ConstantSDNode>(Val: In.getOperand(i: 1));
4074 if (ShiftSrc.getValueType().getSizeInBits() == 32 && ShiftAmt &&
4075 ShiftAmt->getZExtValue() % 8 == 0) {
4076 Key = ShiftAmt->getZExtValue() / 8;
4077 Src = ShiftSrc;
4078 }
4079 }
4080
4081 IndexKey = CurDAG->getTargetConstant(Val: Key, DL: SDLoc(In), VT: MVT::i32);
4082 return true;
4083}
4084
4085bool AMDGPUDAGToDAGISel::SelectSWMMACIndex16(SDValue In, SDValue &Src,
4086 SDValue &IndexKey) const {
4087 unsigned Key = 0;
4088 Src = In;
4089
4090 if (In.getOpcode() == ISD::SRL) {
4091 const llvm::SDValue &ShiftSrc = In.getOperand(i: 0);
4092 ConstantSDNode *ShiftAmt = dyn_cast<ConstantSDNode>(Val: In.getOperand(i: 1));
4093 if (ShiftSrc.getValueType().getSizeInBits() == 32 && ShiftAmt &&
4094 ShiftAmt->getZExtValue() == 16) {
4095 Key = 1;
4096 Src = ShiftSrc;
4097 }
4098 }
4099
4100 IndexKey = CurDAG->getTargetConstant(Val: Key, DL: SDLoc(In), VT: MVT::i32);
4101 return true;
4102}
4103
// Match index_key = 1 for a 32-bit SWMMAC index: the index is lane 1 of a
// 64-bit vector, reached either through a zext/anyext of a 32-bit value or
// through a bitcast of (build_vector x, 0). Always returns true; falls back
// to index_key = 0 with the input unchanged.
bool AMDGPUDAGToDAGISel::SelectSWMMACIndex32(SDValue In, SDValue &Src,
                                             SDValue &IndexKey) const {
  unsigned Key = 0;
  Src = In;

  // The 32-bit value feeding the index, once the extension wrapper is peeled.
  SDValue InI32;

  if (In.getOpcode() == ISD::ANY_EXTEND || In.getOpcode() == ISD::ZERO_EXTEND) {
    const SDValue &ExtendSrc = In.getOperand(i: 0);
    if (ExtendSrc.getValueSizeInBits() == 32)
      InI32 = ExtendSrc;
  } else if (In->getOpcode() == ISD::BITCAST) {
    // (bitcast (build_vector x, 0)) is an equivalent zero-extension form.
    const SDValue &CastSrc = In.getOperand(i: 0);
    if (CastSrc.getOpcode() == ISD::BUILD_VECTOR &&
        CastSrc.getOperand(i: 0).getValueSizeInBits() == 32) {
      ConstantSDNode *Zero = dyn_cast<ConstantSDNode>(Val: CastSrc.getOperand(i: 1));
      if (Zero && Zero->getZExtValue() == 0)
        InI32 = CastSrc.getOperand(i: 0);
    }
  }

  if (InI32 && InI32.getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
    // Extracting element 1 of a 64-bit vector selects the high index lane.
    const SDValue &ExtractVecEltSrc = InI32.getOperand(i: 0);
    ConstantSDNode *EltIdx = dyn_cast<ConstantSDNode>(Val: InI32.getOperand(i: 1));
    if (ExtractVecEltSrc.getValueSizeInBits() == 64 && EltIdx &&
        EltIdx->getZExtValue() == 1) {
      Key = 1;
      Src = ExtractVecEltSrc;
    }
  }

  IndexKey = CurDAG->getTargetConstant(Val: Key, DL: SDLoc(In), VT: MVT::i32);
  return true;
}
4138
4139bool AMDGPUDAGToDAGISel::SelectVOP3OpSel(SDValue In, SDValue &Src,
4140 SDValue &SrcMods) const {
4141 Src = In;
4142 // FIXME: Handle op_sel
4143 SrcMods = CurDAG->getTargetConstant(Val: 0, DL: SDLoc(In), VT: MVT::i32);
4144 return true;
4145}
4146
4147bool AMDGPUDAGToDAGISel::SelectVOP3OpSelMods(SDValue In, SDValue &Src,
4148 SDValue &SrcMods) const {
4149 // FIXME: Handle op_sel
4150 return SelectVOP3Mods(In, Src, SrcMods);
4151}
4152
4153// Match lowered fpext from bf16 to f32. This is a bit operation extending
4154// a 16-bit value with 16-bit of zeroes at LSB:
4155//
4156// 1. (f32 (bitcast (build_vector (i16 0), (i16 (bitcast bf16:val)))))
4157// 2. (f32 (bitcast (and i32:val, 0xffff0000))) -> IsExtractHigh = true
4158// 3. (f32 (bitcast (shl i32:va, 16) -> IsExtractHigh = false
4159static SDValue matchBF16FPExtendLike(SDValue Op, bool &IsExtractHigh) {
4160 if (Op.getValueType() != MVT::f32 || Op.getOpcode() != ISD::BITCAST)
4161 return SDValue();
4162 Op = Op.getOperand(i: 0);
4163
4164 IsExtractHigh = false;
4165 if (Op.getValueType() == MVT::v2i16 && Op.getOpcode() == ISD::BUILD_VECTOR) {
4166 auto Low16 = dyn_cast<ConstantSDNode>(Val: Op.getOperand(i: 0));
4167 if (!Low16 || !Low16->isZero())
4168 return SDValue();
4169 Op = stripBitcast(Val: Op.getOperand(i: 1));
4170 if (Op.getValueType() != MVT::bf16)
4171 return SDValue();
4172 return Op;
4173 }
4174
4175 if (Op.getValueType() != MVT::i32)
4176 return SDValue();
4177
4178 if (Op.getOpcode() == ISD::AND) {
4179 if (auto Mask = dyn_cast<ConstantSDNode>(Val: Op.getOperand(i: 1))) {
4180 if (Mask->getZExtValue() == 0xffff0000) {
4181 IsExtractHigh = true;
4182 return Op.getOperand(i: 0);
4183 }
4184 }
4185 return SDValue();
4186 }
4187
4188 if (Op.getOpcode() == ISD::SHL) {
4189 if (auto Amt = dyn_cast<ConstantSDNode>(Val: Op.getOperand(i: 1))) {
4190 if (Amt->getZExtValue() == 16)
4191 return Op.getOperand(i: 0);
4192 }
4193 }
4194
4195 return SDValue();
4196}
4197
// The return value is not whether the match is possible (which it always is),
// but whether or not it a conversion is really used.
// Folds an f16/bf16 -> f32 extension (plus neg/abs modifiers) into a mad-mix
// source: \p Src gets the narrow source, \p Mods the SISrcMods bits, and
// \p VT selects which narrow type (f16 or bf16) is being matched.
bool AMDGPUDAGToDAGISel::SelectVOP3PMadMixModsImpl(SDValue In, SDValue &Src,
                                                   unsigned &Mods,
                                                   MVT VT) const {
  Mods = 0;
  SelectVOP3ModsImpl(In, Src, Mods);

  bool IsExtractHigh = false;
  if (Src.getOpcode() == ISD::FP_EXTEND) {
    Src = Src.getOperand(i: 0);
  } else if (VT == MVT::bf16) {
    // bf16 extends are lowered to bit operations; match those forms instead.
    SDValue B16 = matchBF16FPExtendLike(Op: Src, IsExtractHigh);
    if (!B16)
      return false;
    Src = B16;
  } else
    return false;

  if (Src.getValueType() != VT &&
      (VT != MVT::bf16 || Src.getValueType() != MVT::i32))
    return false;

  Src = stripBitcast(Val: Src);

  // Be careful about folding modifiers if we already have an abs. fneg is
  // applied last, so we don't want to apply an earlier fneg.
  if ((Mods & SISrcMods::ABS) == 0) {
    unsigned ModsTmp;
    SelectVOP3ModsImpl(In: Src, Src, Mods&: ModsTmp);

    // Inner neg toggles (composes with) an outer neg; abs just accumulates.
    if ((ModsTmp & SISrcMods::NEG) != 0)
      Mods ^= SISrcMods::NEG;

    if ((ModsTmp & SISrcMods::ABS) != 0)
      Mods |= SISrcMods::ABS;
  }

  // op_sel/op_sel_hi decide the source type and source.
  // If the source's op_sel_hi is set, it indicates to do a conversion from
  // fp16. If the sources's op_sel is set, it picks the high half of the source
  // register.

  Mods |= SISrcMods::OP_SEL_1;
  if (Src.getValueSizeInBits() == 16) {
    if (isExtractHiElt(In: Src, Out&: Src)) {
      Mods |= SISrcMods::OP_SEL_0;

      // TODO: Should we try to look for neg/abs here?
      return true;
    }

    if (Src.getOpcode() == ISD::TRUNCATE &&
        Src.getOperand(i: 0).getValueType() == MVT::i32) {
      // Truncate from i32: just read the low half of the full register.
      Src = Src.getOperand(i: 0);
      return true;
    }

    if (Subtarget->useRealTrue16Insts())
      // In true16 mode, pack src to a 32bit
      Src = createVOP3PSrc32FromLo16(Lo: Src, Src: In, CurDAG, Subtarget);
  } else if (IsExtractHigh)
    Mods |= SISrcMods::OP_SEL_0;

  return true;
}
4264
4265bool AMDGPUDAGToDAGISel::SelectVOP3PMadMixModsExt(SDValue In, SDValue &Src,
4266 SDValue &SrcMods) const {
4267 unsigned Mods = 0;
4268 if (!SelectVOP3PMadMixModsImpl(In, Src, Mods, VT: MVT::f16))
4269 return false;
4270 SrcMods = CurDAG->getTargetConstant(Val: Mods, DL: SDLoc(In), VT: MVT::i32);
4271 return true;
4272}
4273
4274bool AMDGPUDAGToDAGISel::SelectVOP3PMadMixMods(SDValue In, SDValue &Src,
4275 SDValue &SrcMods) const {
4276 unsigned Mods = 0;
4277 SelectVOP3PMadMixModsImpl(In, Src, Mods, VT: MVT::f16);
4278 SrcMods = CurDAG->getTargetConstant(Val: Mods, DL: SDLoc(In), VT: MVT::i32);
4279 return true;
4280}
4281
4282bool AMDGPUDAGToDAGISel::SelectVOP3PMadMixBF16ModsExt(SDValue In, SDValue &Src,
4283 SDValue &SrcMods) const {
4284 unsigned Mods = 0;
4285 if (!SelectVOP3PMadMixModsImpl(In, Src, Mods, VT: MVT::bf16))
4286 return false;
4287 SrcMods = CurDAG->getTargetConstant(Val: Mods, DL: SDLoc(In), VT: MVT::i32);
4288 return true;
4289}
4290
4291bool AMDGPUDAGToDAGISel::SelectVOP3PMadMixBF16Mods(SDValue In, SDValue &Src,
4292 SDValue &SrcMods) const {
4293 unsigned Mods = 0;
4294 SelectVOP3PMadMixModsImpl(In, Src, Mods, VT: MVT::bf16);
4295 SrcMods = CurDAG->getTargetConstant(Val: Mods, DL: SDLoc(In), VT: MVT::i32);
4296 return true;
4297}
4298
// Match BITOP3 operation and return a number of matched instructions plus
// truth table.
//
// Recursively walks a tree of AND/OR/XOR nodes rooted at \p In, collecting
// at most three distinct leaf operands into \p Src. Returns a pair of
// (count of logic opcodes folded, 8-bit truth table over Src0/Src1/Src2).
// A count of 0 means no match; on that path \p Src is either untouched or
// restored from the backup taken below.
static std::pair<unsigned, uint8_t> BitOp3_Op(SDValue In,
                                              SmallVectorImpl<SDValue> &Src) {
  unsigned NumOpcodes = 0;
  uint8_t LHSBits, RHSBits;

  // Computes in \p Bits the truth-table contribution of \p Op, registering
  // \p Op as a new source operand if a slot in \p Src is still free.
  auto getOperandBits = [&Src, In](SDValue Op, uint8_t &Bits) -> bool {
    // Define truth table given Src0, Src1, Src2 bits permutations:
    // 0 0 0
    // 0 0 1
    // 0 1 0
    // 0 1 1
    // 1 0 0
    // 1 0 1
    // 1 1 0
    // 1 1 1
    const uint8_t SrcBits[3] = { 0xf0, 0xcc, 0xaa };

    // Constant all-ones / all-zeros contribute fixed bits and do not
    // consume a source slot.
    if (auto *C = dyn_cast<ConstantSDNode>(Val&: Op)) {
      if (C->isAllOnes()) {
        Bits = 0xff;
        return true;
      }
      if (C->isZero()) {
        Bits = 0;
        return true;
      }
    }

    for (unsigned I = 0; I < Src.size(); ++I) {
      // Try to find existing reused operand
      if (Src[I] == Op) {
        Bits = SrcBits[I];
        return true;
      }
      // Try to replace parent operator
      if (Src[I] == In) {
        Bits = SrcBits[I];
        Src[I] = Op;
        return true;
      }
    }

    if (Src.size() == 3) {
      // No room left for operands. Try one last time, there can be a 'not' of
      // one of our source operands. In this case we can compute the bits
      // without growing Src vector.
      if (Op.getOpcode() == ISD::XOR) {
        if (auto *C = dyn_cast<ConstantSDNode>(Val: Op.getOperand(i: 1))) {
          if (C->isAllOnes()) {
            SDValue LHS = Op.getOperand(i: 0);
            for (unsigned I = 0; I < Src.size(); ++I) {
              if (Src[I] == LHS) {
                // 'not' of a known source: invert its truth-table column.
                Bits = ~SrcBits[I];
                return true;
              }
            }
          }
        }
      }

      return false;
    }

    // Allocate the next free source slot to this operand.
    Bits = SrcBits[Src.size()];
    Src.push_back(Elt: Op);
    return true;
  };

  switch (In.getOpcode()) {
  case ISD::AND:
  case ISD::OR:
  case ISD::XOR: {
    SDValue LHS = In.getOperand(i: 0);
    SDValue RHS = In.getOperand(i: 1);

    // Snapshot Src so it can be restored if either operand fails to match.
    SmallVector<SDValue, 3> Backup(Src.begin(), Src.end());
    if (!getOperandBits(LHS, LHSBits) ||
        !getOperandBits(RHS, RHSBits)) {
      Src = std::move(Backup);
      return std::make_pair(x: 0, y: 0);
    }

    // Recursion is naturally limited by the size of the operand vector.
    // A successful recursive match folds more opcodes and refines the bits
    // computed for that side above.
    auto Op = BitOp3_Op(In: LHS, Src);
    if (Op.first) {
      NumOpcodes += Op.first;
      LHSBits = Op.second;
    }

    Op = BitOp3_Op(In: RHS, Src);
    if (Op.first) {
      NumOpcodes += Op.first;
      RHSBits = Op.second;
    }
    break;
  }
  default:
    return std::make_pair(x: 0, y: 0);
  }

  // Combine the children's truth tables with this node's boolean operator.
  uint8_t TTbl;
  switch (In.getOpcode()) {
  case ISD::AND:
    TTbl = LHSBits & RHSBits;
    break;
  case ISD::OR:
    TTbl = LHSBits | RHSBits;
    break;
  case ISD::XOR:
    TTbl = LHSBits ^ RHSBits;
    break;
  default:
    break;
  }

  // +1 accounts for the opcode of In itself.
  return std::make_pair(x: NumOpcodes + 1, y&: TTbl);
}
4418
4419bool AMDGPUDAGToDAGISel::SelectBITOP3(SDValue In, SDValue &Src0, SDValue &Src1,
4420 SDValue &Src2, SDValue &Tbl) const {
4421 SmallVector<SDValue, 3> Src;
4422 uint8_t TTbl;
4423 unsigned NumOpcodes;
4424
4425 std::tie(args&: NumOpcodes, args&: TTbl) = BitOp3_Op(In, Src);
4426
4427 // Src.empty() case can happen if all operands are all zero or all ones.
4428 // Normally it shall be optimized out before reaching this.
4429 if (NumOpcodes < 2 || Src.empty())
4430 return false;
4431
4432 // For a uniform case threshold should be higher to account for moves between
4433 // VGPRs and SGPRs. It needs one operand in a VGPR, rest two can be in SGPRs
4434 // and a readtfirstlane after.
4435 if (NumOpcodes < 4 && !In->isDivergent())
4436 return false;
4437
4438 if (NumOpcodes == 2 && In.getValueType() == MVT::i32) {
4439 // Avoid using BITOP3 for OR3, XOR3, AND_OR. This is not faster but makes
4440 // asm more readable. This cannot be modeled with AddedComplexity because
4441 // selector does not know how many operations did we match.
4442 if ((In.getOpcode() == ISD::XOR || In.getOpcode() == ISD::OR) &&
4443 (In.getOperand(i: 0).getOpcode() == In.getOpcode() ||
4444 In.getOperand(i: 1).getOpcode() == In.getOpcode()))
4445 return false;
4446
4447 if (In.getOpcode() == ISD::OR &&
4448 (In.getOperand(i: 0).getOpcode() == ISD::AND ||
4449 In.getOperand(i: 1).getOpcode() == ISD::AND))
4450 return false;
4451 }
4452
4453 // Last operand can be ignored, turning a ternary operation into a binary.
4454 // For example: (~a & b & c) | (~a & b & ~c) -> (~a & b). We can replace
4455 // 'c' with 'a' here without changing the answer. In some pathological
4456 // cases it should be possible to get an operation with a single operand
4457 // too if optimizer would not catch it.
4458 while (Src.size() < 3)
4459 Src.push_back(Elt: Src[0]);
4460
4461 Src0 = Src[0];
4462 Src1 = Src[1];
4463 Src2 = Src[2];
4464
4465 Tbl = CurDAG->getTargetConstant(Val: TTbl, DL: SDLoc(In), VT: MVT::i32);
4466 return true;
4467}
4468
4469SDValue AMDGPUDAGToDAGISel::getHi16Elt(SDValue In) const {
4470 if (In.isUndef())
4471 return CurDAG->getUNDEF(VT: MVT::i32);
4472
4473 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Val&: In)) {
4474 SDLoc SL(In);
4475 return CurDAG->getConstant(Val: C->getZExtValue() << 16, DL: SL, VT: MVT::i32);
4476 }
4477
4478 if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(Val&: In)) {
4479 SDLoc SL(In);
4480 return CurDAG->getConstant(
4481 Val: C->getValueAPF().bitcastToAPInt().getZExtValue() << 16, DL: SL, VT: MVT::i32);
4482 }
4483
4484 SDValue Src;
4485 if (isExtractHiElt(In, Out&: Src))
4486 return Src;
4487
4488 return SDValue();
4489}
4490
// Decide whether immediate node \p N should be materialized in a VGPR.
// Returns true iff at least one of the first (up to 10) uses strictly
// requires a VGPR operand and cannot be commuted into a VS-class
// (SGPR-accepting) slot; returns false as soon as any use demands an SGPR
// or has an unknown register class.
bool AMDGPUDAGToDAGISel::isVGPRImm(const SDNode * N) const {
  assert(CurDAG->getTarget().getTargetTriple().isAMDGCN());

  const SIRegisterInfo *SIRI = Subtarget->getRegisterInfo();
  const SIInstrInfo *SII = Subtarget->getInstrInfo();

  // Scan is capped at 10 uses; hitting the cap conservatively answers
  // false via the (Limit < 10) check at the end.
  unsigned Limit = 0;
  bool AllUsesAcceptSReg = true;
  for (SDNode::use_iterator U = N->use_begin(), E = SDNode::use_end();
       Limit < 10 && U != E; ++U, ++Limit) {
    const TargetRegisterClass *RC =
        getOperandRegClass(N: U->getUser(), OpNo: U->getOperandNo());

    // If the register class is unknown, it could be an unknown
    // register class that needs to be an SGPR, e.g. an inline asm
    // constraint
    if (!RC || SIRI->isSGPRClass(RC))
      return false;

    if (RC != &AMDGPU::VS_32RegClass && RC != &AMDGPU::VS_64RegClass &&
        RC != &AMDGPU::VS_64_Align2RegClass) {
      // This operand slot does not accept an SGPR directly, but if the user
      // is commutable the value may be movable into a VS-class slot.
      AllUsesAcceptSReg = false;
      SDNode *User = U->getUser();
      if (User->isMachineOpcode()) {
        unsigned Opc = User->getMachineOpcode();
        const MCInstrDesc &Desc = SII->get(Opcode: Opc);
        if (Desc.isCommutable()) {
          // Operand indices in the MCInstrDesc include the defs, hence the
          // getNumDefs() adjustments in both directions.
          unsigned OpIdx = Desc.getNumDefs() + U->getOperandNo();
          unsigned CommuteIdx1 = TargetInstrInfo::CommuteAnyOperandIndex;
          if (SII->findCommutedOpIndices(Desc, SrcOpIdx0&: OpIdx, SrcOpIdx1&: CommuteIdx1)) {
            unsigned CommutedOpNo = CommuteIdx1 - Desc.getNumDefs();
            const TargetRegisterClass *CommutedRC =
                getOperandRegClass(N: U->getUser(), OpNo: CommutedOpNo);
            if (CommutedRC == &AMDGPU::VS_32RegClass ||
                CommutedRC == &AMDGPU::VS_64RegClass ||
                CommutedRC == &AMDGPU::VS_64_Align2RegClass)
              AllUsesAcceptSReg = true;
          }
        }
      }
      // If "AllUsesAcceptSReg == false" so far we haven't succeeded
      // commuting current user. This means have at least one use
      // that strictly require VGPR. Thus, we will not attempt to commute
      // other user instructions.
      if (!AllUsesAcceptSReg)
        break;
    }
  }
  return !AllUsesAcceptSReg && (Limit < 10);
}
4541
4542bool AMDGPUDAGToDAGISel::isUniformLoad(const SDNode *N) const {
4543 const auto *Ld = cast<LoadSDNode>(Val: N);
4544 const MachineMemOperand *MMO = Ld->getMemOperand();
4545
4546 // FIXME: We ought to able able to take the direct isDivergent result. We
4547 // cannot rely on the MMO for a uniformity check, and should stop using
4548 // it. This is a hack for 2 ways that the IR divergence analysis is superior
4549 // to the DAG divergence: Recognizing shift-of-workitem-id as always
4550 // uniform, and isSingleLaneExecution. These should be handled in the DAG
4551 // version, and then this can be dropped.
4552 if (Ld->isDivergent() && !AMDGPU::isUniformMMO(MMO))
4553 return false;
4554
4555 return MMO->getSize().hasValue() &&
4556 Ld->getAlign() >=
4557 Align(std::min(a: MMO->getSize().getValue().getKnownMinValue(),
4558 b: uint64_t(4))) &&
4559 (MMO->isInvariant() ||
4560 (Ld->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS ||
4561 Ld->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS_32BIT) ||
4562 (Subtarget->getScalarizeGlobalBehavior() &&
4563 Ld->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS &&
4564 Ld->isSimple() &&
4565 static_cast<const SITargetLowering *>(getTargetLowering())
4566 ->isMemOpHasNoClobberedMemOperand(N)));
4567}
4568
4569void AMDGPUDAGToDAGISel::PostprocessISelDAG() {
4570 const AMDGPUTargetLowering& Lowering =
4571 *static_cast<const AMDGPUTargetLowering*>(getTargetLowering());
4572 bool IsModified = false;
4573 do {
4574 IsModified = false;
4575
4576 // Go over all selected nodes and try to fold them a bit more
4577 SelectionDAG::allnodes_iterator Position = CurDAG->allnodes_begin();
4578 while (Position != CurDAG->allnodes_end()) {
4579 SDNode *Node = &*Position++;
4580 MachineSDNode *MachineNode = dyn_cast<MachineSDNode>(Val: Node);
4581 if (!MachineNode)
4582 continue;
4583
4584 SDNode *ResNode = Lowering.PostISelFolding(N: MachineNode, DAG&: *CurDAG);
4585 if (ResNode != Node) {
4586 if (ResNode)
4587 ReplaceUses(F: Node, T: ResNode);
4588 IsModified = true;
4589 }
4590 }
4591 CurDAG->RemoveDeadNodes();
4592 } while (IsModified);
4593}
4594
// Legacy pass-manager wrapper: constructs the underlying AMDGPUDAGToDAGISel
// and hands it to the common SelectionDAGISelLegacy driver.
AMDGPUDAGToDAGISelLegacy::AMDGPUDAGToDAGISelLegacy(TargetMachine &TM,
                                                   CodeGenOptLevel OptLevel)
    : SelectionDAGISelLegacy(
          ID, std::make_unique<AMDGPUDAGToDAGISel>(args&: TM, args&: OptLevel)) {}

// Pass identification: the address of ID uniquely identifies this pass.
char AMDGPUDAGToDAGISelLegacy::ID = 0;
4601