//===-- AMDGPUISelDAGToDAG.cpp - A dag to dag inst selector for AMDGPU ----===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//==-----------------------------------------------------------------------===//
//
/// \file
/// Defines an instruction selector for the AMDGPU target.
//
//===----------------------------------------------------------------------===//

#include "AMDGPUISelDAGToDAG.h"
#include "AMDGPU.h"
#include "AMDGPUInstrInfo.h"
#include "AMDGPUSubtarget.h"
#include "AMDGPUTargetMachine.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "MCTargetDesc/R600MCTargetDesc.h"
#include "R600RegisterInfo.h"
#include "SIISelLowering.h"
#include "SIMachineFunctionInfo.h"
#include "llvm/Analysis/UniformityAnalysis.h"
#include "llvm/CodeGen/FunctionLoweringInfo.h"
#include "llvm/CodeGen/SelectionDAG.h"
#include "llvm/CodeGen/SelectionDAGISel.h"
#include "llvm/CodeGen/SelectionDAGNodes.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/InitializePasses.h"
#include "llvm/Support/ErrorHandling.h"

#ifdef EXPENSIVE_CHECKS
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/IR/Dominators.h"
#endif

38#define DEBUG_TYPE "amdgpu-isel"
39
40using namespace llvm;
41
42//===----------------------------------------------------------------------===//
43// Instruction Selector Implementation
44//===----------------------------------------------------------------------===//
45
46namespace {
47static SDValue stripBitcast(SDValue Val) {
48 return Val.getOpcode() == ISD::BITCAST ? Val.getOperand(i: 0) : Val;
49}
50
51// Figure out if this is really an extract of the high 16-bits of a dword.
52static bool isExtractHiElt(SDValue In, SDValue &Out) {
53 In = stripBitcast(Val: In);
54
55 if (In.getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
56 if (ConstantSDNode *Idx = dyn_cast<ConstantSDNode>(Val: In.getOperand(i: 1))) {
57 if (!Idx->isOne())
58 return false;
59 Out = In.getOperand(i: 0);
60 return true;
61 }
62 }
63
64 if (In.getOpcode() != ISD::TRUNCATE)
65 return false;
66
67 SDValue Srl = In.getOperand(i: 0);
68 if (Srl.getOpcode() == ISD::SRL) {
69 if (ConstantSDNode *ShiftAmt = dyn_cast<ConstantSDNode>(Val: Srl.getOperand(i: 1))) {
70 if (ShiftAmt->getZExtValue() == 16) {
71 Out = stripBitcast(Val: Srl.getOperand(i: 0));
72 return true;
73 }
74 }
75 }
76
77 return false;
78}
79
80static SDValue createVOP3PSrc32FromLo16(SDValue Lo, SDValue Src,
81 llvm::SelectionDAG *CurDAG,
82 const GCNSubtarget *Subtarget) {
83 if (!Subtarget->useRealTrue16Insts()) {
84 return Lo;
85 }
86
87 SDValue NewSrc;
88 SDLoc SL(Lo);
89
90 if (Lo->isDivergent()) {
91 SDValue Undef = SDValue(CurDAG->getMachineNode(Opcode: TargetOpcode::IMPLICIT_DEF,
92 dl: SL, VT: Lo.getValueType()),
93 0);
94 const SDValue Ops[] = {
95 CurDAG->getTargetConstant(Val: AMDGPU::VGPR_32RegClassID, DL: SL, VT: MVT::i32), Lo,
96 CurDAG->getTargetConstant(Val: AMDGPU::lo16, DL: SL, VT: MVT::i16), Undef,
97 CurDAG->getTargetConstant(Val: AMDGPU::hi16, DL: SL, VT: MVT::i16)};
98
99 NewSrc = SDValue(CurDAG->getMachineNode(Opcode: TargetOpcode::REG_SEQUENCE, dl: SL,
100 VT: Src.getValueType(), Ops),
101 0);
102 } else {
103 // the S_MOV is needed since the Lo could still be a VGPR16.
104 // With S_MOV, isel insert a "sgpr32 = copy vgpr16" and we reply on
105 // the fixvgpr2sgprcopy pass to legalize it
106 NewSrc = SDValue(
107 CurDAG->getMachineNode(Opcode: AMDGPU::S_MOV_B32, dl: SL, VT: Src.getValueType(), Op1: Lo),
108 0);
109 }
110
111 return NewSrc;
112}
113
114// Look through operations that obscure just looking at the low 16-bits of the
115// same register.
116static SDValue stripExtractLoElt(SDValue In) {
117 if (In.getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
118 SDValue Idx = In.getOperand(i: 1);
119 if (isNullConstant(V: Idx) && In.getValueSizeInBits() <= 32)
120 return In.getOperand(i: 0);
121 }
122
123 if (In.getOpcode() == ISD::TRUNCATE) {
124 SDValue Src = In.getOperand(i: 0);
125 if (Src.getValueType().getSizeInBits() == 32)
126 return stripBitcast(Val: Src);
127 }
128
129 return In;
130}
131
132static SDValue emitRegSequence(llvm::SelectionDAG &CurDAG, unsigned DstRegClass,
133 EVT DstTy, ArrayRef<SDValue> Elts,
134 ArrayRef<unsigned> SubRegClass,
135 const SDLoc &DL) {
136 assert(Elts.size() == SubRegClass.size() && "array size mismatch");
137 unsigned NumElts = Elts.size();
138 SmallVector<SDValue, 17> Ops(2 * NumElts + 1);
139 Ops[0] = (CurDAG.getTargetConstant(Val: DstRegClass, DL, VT: MVT::i32));
140 for (unsigned i = 0; i < NumElts; ++i) {
141 Ops[2 * i + 1] = Elts[i];
142 Ops[2 * i + 2] = CurDAG.getTargetConstant(Val: SubRegClass[i], DL, VT: MVT::i32);
143 }
144 return SDValue(
145 CurDAG.getMachineNode(Opcode: TargetOpcode::REG_SEQUENCE, dl: DL, VT: DstTy, Ops), 0);
146}
147
148} // end anonymous namespace
149
150INITIALIZE_PASS_BEGIN(AMDGPUDAGToDAGISelLegacy, "amdgpu-isel",
151 "AMDGPU DAG->DAG Pattern Instruction Selection", false,
152 false)
153INITIALIZE_PASS_DEPENDENCY(AMDGPUPerfHintAnalysisLegacy)
154INITIALIZE_PASS_DEPENDENCY(UniformityInfoWrapperPass)
155#ifdef EXPENSIVE_CHECKS
156INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
157INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
158#endif
159INITIALIZE_PASS_END(AMDGPUDAGToDAGISelLegacy, "amdgpu-isel",
160 "AMDGPU DAG->DAG Pattern Instruction Selection", false,
161 false)
162
163/// This pass converts a legalized DAG into a AMDGPU-specific
164// DAG, ready for instruction scheduling.
165FunctionPass *llvm::createAMDGPUISelDag(TargetMachine &TM,
166 CodeGenOptLevel OptLevel) {
167 return new AMDGPUDAGToDAGISelLegacy(TM, OptLevel);
168}
169
170AMDGPUDAGToDAGISel::AMDGPUDAGToDAGISel(TargetMachine &TM,
171 CodeGenOptLevel OptLevel)
172 : SelectionDAGISel(TM, OptLevel) {}
173
174bool AMDGPUDAGToDAGISel::runOnMachineFunction(MachineFunction &MF) {
175 Subtarget = &MF.getSubtarget<GCNSubtarget>();
176 Subtarget->checkSubtargetFeatures(F: MF.getFunction());
177 Mode = SIModeRegisterDefaults(MF.getFunction(), *Subtarget);
178 return SelectionDAGISel::runOnMachineFunction(mf&: MF);
179}
180
181bool AMDGPUDAGToDAGISel::fp16SrcZerosHighBits(unsigned Opc) const {
182 // XXX - only need to list legal operations.
183 switch (Opc) {
184 case ISD::FADD:
185 case ISD::FSUB:
186 case ISD::FMUL:
187 case ISD::FDIV:
188 case ISD::FREM:
189 case ISD::FCANONICALIZE:
190 case ISD::UINT_TO_FP:
191 case ISD::SINT_TO_FP:
192 case ISD::FABS:
193 // Fabs is lowered to a bit operation, but it's an and which will clear the
194 // high bits anyway.
195 case ISD::FSQRT:
196 case ISD::FSIN:
197 case ISD::FCOS:
198 case ISD::FPOWI:
199 case ISD::FPOW:
200 case ISD::FLOG:
201 case ISD::FLOG2:
202 case ISD::FLOG10:
203 case ISD::FEXP:
204 case ISD::FEXP2:
205 case ISD::FCEIL:
206 case ISD::FTRUNC:
207 case ISD::FRINT:
208 case ISD::FNEARBYINT:
209 case ISD::FROUNDEVEN:
210 case ISD::FROUND:
211 case ISD::FFLOOR:
212 case ISD::FMINNUM:
213 case ISD::FMAXNUM:
214 case ISD::FLDEXP:
215 case AMDGPUISD::FRACT:
216 case AMDGPUISD::CLAMP:
217 case AMDGPUISD::COS_HW:
218 case AMDGPUISD::SIN_HW:
219 case AMDGPUISD::FMIN3:
220 case AMDGPUISD::FMAX3:
221 case AMDGPUISD::FMED3:
222 case AMDGPUISD::FMAD_FTZ:
223 case AMDGPUISD::RCP:
224 case AMDGPUISD::RSQ:
225 case AMDGPUISD::RCP_IFLAG:
226 // On gfx10, all 16-bit instructions preserve the high bits.
227 return Subtarget->getGeneration() <= AMDGPUSubtarget::GFX9;
228 case ISD::FP_ROUND:
229 // We may select fptrunc (fma/mad) to mad_mixlo, which does not zero the
230 // high bits on gfx9.
231 // TODO: If we had the source node we could see if the source was fma/mad
232 return Subtarget->getGeneration() == AMDGPUSubtarget::VOLCANIC_ISLANDS;
233 case ISD::FMA:
234 case ISD::FMAD:
235 case AMDGPUISD::DIV_FIXUP:
236 return Subtarget->getGeneration() == AMDGPUSubtarget::VOLCANIC_ISLANDS;
237 default:
238 // fcopysign, select and others may be lowered to 32-bit bit operations
239 // which don't zero the high bits.
240 return false;
241 }
242}
243
244bool AMDGPUDAGToDAGISelLegacy::runOnMachineFunction(MachineFunction &MF) {
245#ifdef EXPENSIVE_CHECKS
246 DominatorTree &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
247 LoopInfo *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
248 for (auto &L : LI->getLoopsInPreorder()) {
249 assert(L->isLCSSAForm(DT));
250 }
251#endif
252 return SelectionDAGISelLegacy::runOnMachineFunction(MF);
253}
254
255void AMDGPUDAGToDAGISelLegacy::getAnalysisUsage(AnalysisUsage &AU) const {
256 AU.addRequired<UniformityInfoWrapperPass>();
257#ifdef EXPENSIVE_CHECKS
258 AU.addRequired<DominatorTreeWrapperPass>();
259 AU.addRequired<LoopInfoWrapperPass>();
260#endif
261 SelectionDAGISelLegacy::getAnalysisUsage(AU);
262}
263
264bool AMDGPUDAGToDAGISel::matchLoadD16FromBuildVector(SDNode *N) const {
265 assert(Subtarget->d16PreservesUnusedBits());
266 MVT VT = N->getValueType(ResNo: 0).getSimpleVT();
267 if (VT != MVT::v2i16 && VT != MVT::v2f16)
268 return false;
269
270 SDValue Lo = N->getOperand(Num: 0);
271 SDValue Hi = N->getOperand(Num: 1);
272
273 LoadSDNode *LdHi = dyn_cast<LoadSDNode>(Val: stripBitcast(Val: Hi));
274
275 // build_vector lo, (load ptr) -> load_d16_hi ptr, lo
276 // build_vector lo, (zextload ptr from i8) -> load_d16_hi_u8 ptr, lo
277 // build_vector lo, (sextload ptr from i8) -> load_d16_hi_i8 ptr, lo
278
279 // Need to check for possible indirect dependencies on the other half of the
280 // vector to avoid introducing a cycle.
281 if (LdHi && Hi.hasOneUse() && !LdHi->isPredecessorOf(N: Lo.getNode())) {
282 SDVTList VTList = CurDAG->getVTList(VT1: VT, VT2: MVT::Other);
283
284 SDValue TiedIn = CurDAG->getNode(Opcode: ISD::SCALAR_TO_VECTOR, DL: SDLoc(N), VT, Operand: Lo);
285 SDValue Ops[] = {
286 LdHi->getChain(), LdHi->getBasePtr(), TiedIn
287 };
288
289 unsigned LoadOp = AMDGPUISD::LOAD_D16_HI;
290 if (LdHi->getMemoryVT() == MVT::i8) {
291 LoadOp = LdHi->getExtensionType() == ISD::SEXTLOAD ?
292 AMDGPUISD::LOAD_D16_HI_I8 : AMDGPUISD::LOAD_D16_HI_U8;
293 } else {
294 assert(LdHi->getMemoryVT() == MVT::i16);
295 }
296
297 SDValue NewLoadHi =
298 CurDAG->getMemIntrinsicNode(Opcode: LoadOp, dl: SDLoc(LdHi), VTList,
299 Ops, MemVT: LdHi->getMemoryVT(),
300 MMO: LdHi->getMemOperand());
301
302 CurDAG->ReplaceAllUsesOfValueWith(From: SDValue(N, 0), To: NewLoadHi);
303 CurDAG->ReplaceAllUsesOfValueWith(From: SDValue(LdHi, 1), To: NewLoadHi.getValue(R: 1));
304 return true;
305 }
306
307 // build_vector (load ptr), hi -> load_d16_lo ptr, hi
308 // build_vector (zextload ptr from i8), hi -> load_d16_lo_u8 ptr, hi
309 // build_vector (sextload ptr from i8), hi -> load_d16_lo_i8 ptr, hi
310 LoadSDNode *LdLo = dyn_cast<LoadSDNode>(Val: stripBitcast(Val: Lo));
311 if (LdLo && Lo.hasOneUse()) {
312 SDValue TiedIn = getHi16Elt(In: Hi);
313 if (!TiedIn || LdLo->isPredecessorOf(N: TiedIn.getNode()))
314 return false;
315
316 SDVTList VTList = CurDAG->getVTList(VT1: VT, VT2: MVT::Other);
317 unsigned LoadOp = AMDGPUISD::LOAD_D16_LO;
318 if (LdLo->getMemoryVT() == MVT::i8) {
319 LoadOp = LdLo->getExtensionType() == ISD::SEXTLOAD ?
320 AMDGPUISD::LOAD_D16_LO_I8 : AMDGPUISD::LOAD_D16_LO_U8;
321 } else {
322 assert(LdLo->getMemoryVT() == MVT::i16);
323 }
324
325 TiedIn = CurDAG->getNode(Opcode: ISD::BITCAST, DL: SDLoc(N), VT, Operand: TiedIn);
326
327 SDValue Ops[] = {
328 LdLo->getChain(), LdLo->getBasePtr(), TiedIn
329 };
330
331 SDValue NewLoadLo =
332 CurDAG->getMemIntrinsicNode(Opcode: LoadOp, dl: SDLoc(LdLo), VTList,
333 Ops, MemVT: LdLo->getMemoryVT(),
334 MMO: LdLo->getMemOperand());
335
336 CurDAG->ReplaceAllUsesOfValueWith(From: SDValue(N, 0), To: NewLoadLo);
337 CurDAG->ReplaceAllUsesOfValueWith(From: SDValue(LdLo, 1), To: NewLoadLo.getValue(R: 1));
338 return true;
339 }
340
341 return false;
342}
343
344void AMDGPUDAGToDAGISel::PreprocessISelDAG() {
345 if (!Subtarget->d16PreservesUnusedBits())
346 return;
347
348 SelectionDAG::allnodes_iterator Position = CurDAG->allnodes_end();
349
350 bool MadeChange = false;
351 while (Position != CurDAG->allnodes_begin()) {
352 SDNode *N = &*--Position;
353 if (N->use_empty())
354 continue;
355
356 switch (N->getOpcode()) {
357 case ISD::BUILD_VECTOR:
358 // TODO: Match load d16 from shl (extload:i16), 16
359 MadeChange |= matchLoadD16FromBuildVector(N);
360 break;
361 default:
362 break;
363 }
364 }
365
366 if (MadeChange) {
367 CurDAG->RemoveDeadNodes();
368 LLVM_DEBUG(dbgs() << "After PreProcess:\n";
369 CurDAG->dump(););
370 }
371}
372
373bool AMDGPUDAGToDAGISel::isInlineImmediate(const SDNode *N) const {
374 if (N->isUndef())
375 return true;
376
377 const SIInstrInfo *TII = Subtarget->getInstrInfo();
378 if (const ConstantSDNode *C = dyn_cast<ConstantSDNode>(Val: N))
379 return TII->isInlineConstant(Imm: C->getAPIntValue());
380
381 if (const ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(Val: N))
382 return TII->isInlineConstant(Imm: C->getValueAPF());
383
384 return false;
385}
386
387/// Determine the register class for \p OpNo
388/// \returns The register class of the virtual register that will be used for
389/// the given operand number \OpNo or NULL if the register class cannot be
390/// determined.
391const TargetRegisterClass *AMDGPUDAGToDAGISel::getOperandRegClass(SDNode *N,
392 unsigned OpNo) const {
393 if (!N->isMachineOpcode()) {
394 if (N->getOpcode() == ISD::CopyToReg) {
395 Register Reg = cast<RegisterSDNode>(Val: N->getOperand(Num: 1))->getReg();
396 if (Reg.isVirtual()) {
397 MachineRegisterInfo &MRI = CurDAG->getMachineFunction().getRegInfo();
398 return MRI.getRegClass(Reg);
399 }
400
401 const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
402 return TRI->getPhysRegBaseClass(Reg);
403 }
404
405 return nullptr;
406 }
407
408 switch (N->getMachineOpcode()) {
409 default: {
410 const SIInstrInfo *TII = Subtarget->getInstrInfo();
411 const MCInstrDesc &Desc = TII->get(Opcode: N->getMachineOpcode());
412 unsigned OpIdx = Desc.getNumDefs() + OpNo;
413 if (OpIdx >= Desc.getNumOperands())
414 return nullptr;
415
416 int16_t RegClass = TII->getOpRegClassID(OpInfo: Desc.operands()[OpIdx]);
417 if (RegClass == -1)
418 return nullptr;
419
420 return Subtarget->getRegisterInfo()->getRegClass(i: RegClass);
421 }
422 case AMDGPU::REG_SEQUENCE: {
423 unsigned RCID = N->getConstantOperandVal(Num: 0);
424 const TargetRegisterClass *SuperRC =
425 Subtarget->getRegisterInfo()->getRegClass(i: RCID);
426
427 SDValue SubRegOp = N->getOperand(Num: OpNo + 1);
428 unsigned SubRegIdx = SubRegOp->getAsZExtVal();
429 return Subtarget->getRegisterInfo()->getSubClassWithSubReg(SuperRC,
430 SubRegIdx);
431 }
432 }
433}
434
435SDNode *AMDGPUDAGToDAGISel::glueCopyToOp(SDNode *N, SDValue NewChain,
436 SDValue Glue) const {
437 SmallVector <SDValue, 8> Ops;
438 Ops.push_back(Elt: NewChain); // Replace the chain.
439 for (unsigned i = 1, e = N->getNumOperands(); i != e; ++i)
440 Ops.push_back(Elt: N->getOperand(Num: i));
441
442 Ops.push_back(Elt: Glue);
443 return CurDAG->MorphNodeTo(N, Opc: N->getOpcode(), VTs: N->getVTList(), Ops);
444}
445
446SDNode *AMDGPUDAGToDAGISel::glueCopyToM0(SDNode *N, SDValue Val) const {
447 const SITargetLowering& Lowering =
448 *static_cast<const SITargetLowering*>(getTargetLowering());
449
450 assert(N->getOperand(0).getValueType() == MVT::Other && "Expected chain");
451
452 SDValue M0 = Lowering.copyToM0(DAG&: *CurDAG, Chain: N->getOperand(Num: 0), DL: SDLoc(N), V: Val);
453 return glueCopyToOp(N, NewChain: M0, Glue: M0.getValue(R: 1));
454}
455
456SDNode *AMDGPUDAGToDAGISel::glueCopyToM0LDSInit(SDNode *N) const {
457 unsigned AS = cast<MemSDNode>(Val: N)->getAddressSpace();
458 if (AS == AMDGPUAS::LOCAL_ADDRESS) {
459 if (Subtarget->ldsRequiresM0Init())
460 return glueCopyToM0(
461 N, Val: CurDAG->getSignedTargetConstant(Val: -1, DL: SDLoc(N), VT: MVT::i32));
462 } else if (AS == AMDGPUAS::REGION_ADDRESS) {
463 MachineFunction &MF = CurDAG->getMachineFunction();
464 unsigned Value = MF.getInfo<SIMachineFunctionInfo>()->getGDSSize();
465 return
466 glueCopyToM0(N, Val: CurDAG->getTargetConstant(Val: Value, DL: SDLoc(N), VT: MVT::i32));
467 }
468 return N;
469}
470
471MachineSDNode *AMDGPUDAGToDAGISel::buildSMovImm64(SDLoc &DL, uint64_t Imm,
472 EVT VT) const {
473 SDNode *Lo = CurDAG->getMachineNode(
474 Opcode: AMDGPU::S_MOV_B32, dl: DL, VT: MVT::i32,
475 Op1: CurDAG->getTargetConstant(Val: Lo_32(Value: Imm), DL, VT: MVT::i32));
476 SDNode *Hi = CurDAG->getMachineNode(
477 Opcode: AMDGPU::S_MOV_B32, dl: DL, VT: MVT::i32,
478 Op1: CurDAG->getTargetConstant(Val: Hi_32(Value: Imm), DL, VT: MVT::i32));
479 const SDValue Ops[] = {
480 CurDAG->getTargetConstant(Val: AMDGPU::SReg_64RegClassID, DL, VT: MVT::i32),
481 SDValue(Lo, 0), CurDAG->getTargetConstant(Val: AMDGPU::sub0, DL, VT: MVT::i32),
482 SDValue(Hi, 0), CurDAG->getTargetConstant(Val: AMDGPU::sub1, DL, VT: MVT::i32)};
483
484 return CurDAG->getMachineNode(Opcode: TargetOpcode::REG_SEQUENCE, dl: DL, VT, Ops);
485}
486
487SDNode *AMDGPUDAGToDAGISel::packConstantV2I16(const SDNode *N,
488 SelectionDAG &DAG) const {
489 // TODO: Handle undef as zero
490
491 assert(N->getOpcode() == ISD::BUILD_VECTOR && N->getNumOperands() == 2);
492 uint32_t LHSVal, RHSVal;
493 if (getConstantValue(N: N->getOperand(Num: 0), Out&: LHSVal) &&
494 getConstantValue(N: N->getOperand(Num: 1), Out&: RHSVal)) {
495 SDLoc SL(N);
496 uint32_t K = (LHSVal & 0xffff) | (RHSVal << 16);
497 return DAG.getMachineNode(
498 Opcode: isVGPRImm(N) ? AMDGPU::V_MOV_B32_e32 : AMDGPU::S_MOV_B32, dl: SL,
499 VT: N->getValueType(ResNo: 0), Op1: DAG.getTargetConstant(Val: K, DL: SL, VT: MVT::i32));
500 }
501
502 return nullptr;
503}
504
505void AMDGPUDAGToDAGISel::SelectBuildVector(SDNode *N, unsigned RegClassID) {
506 EVT VT = N->getValueType(ResNo: 0);
507 unsigned NumVectorElts = VT.getVectorNumElements();
508 EVT EltVT = VT.getVectorElementType();
509 SDLoc DL(N);
510 SDValue RegClass = CurDAG->getTargetConstant(Val: RegClassID, DL, VT: MVT::i32);
511
512 if (NumVectorElts == 1) {
513 CurDAG->SelectNodeTo(N, MachineOpc: AMDGPU::COPY_TO_REGCLASS, VT: EltVT, Op1: N->getOperand(Num: 0),
514 Op2: RegClass);
515 return;
516 }
517
518 bool IsGCN = CurDAG->getSubtarget().getTargetTriple().isAMDGCN();
519 if (IsGCN && Subtarget->has64BitLiterals() && VT.getSizeInBits() == 64 &&
520 CurDAG->isConstantValueOfAnyType(N: SDValue(N, 0))) {
521 uint64_t C = 0;
522 bool AllConst = true;
523 unsigned EltSize = EltVT.getSizeInBits();
524 for (unsigned I = 0; I < NumVectorElts; ++I) {
525 SDValue Op = N->getOperand(Num: I);
526 if (Op.isUndef()) {
527 AllConst = false;
528 break;
529 }
530 uint64_t Val;
531 if (ConstantFPSDNode *CF = dyn_cast<ConstantFPSDNode>(Val&: Op)) {
532 Val = CF->getValueAPF().bitcastToAPInt().getZExtValue();
533 } else
534 Val = cast<ConstantSDNode>(Val&: Op)->getZExtValue();
535 C |= Val << (EltSize * I);
536 }
537 if (AllConst) {
538 SDValue CV = CurDAG->getTargetConstant(Val: C, DL, VT: MVT::i64);
539 MachineSDNode *Copy =
540 CurDAG->getMachineNode(Opcode: AMDGPU::S_MOV_B64_IMM_PSEUDO, dl: DL, VT, Op1: CV);
541 CurDAG->SelectNodeTo(N, MachineOpc: AMDGPU::COPY_TO_REGCLASS, VT, Op1: SDValue(Copy, 0),
542 Op2: RegClass);
543 return;
544 }
545 }
546
547 assert(NumVectorElts <= 32 && "Vectors with more than 32 elements not "
548 "supported yet");
549 // 32 = Max Num Vector Elements
550 // 2 = 2 REG_SEQUENCE operands per element (value, subreg index)
551 // 1 = Vector Register Class
552 SmallVector<SDValue, 32 * 2 + 1> RegSeqArgs(NumVectorElts * 2 + 1);
553
554 RegSeqArgs[0] = CurDAG->getTargetConstant(Val: RegClassID, DL, VT: MVT::i32);
555 bool IsRegSeq = true;
556 unsigned NOps = N->getNumOperands();
557 for (unsigned i = 0; i < NOps; i++) {
558 // XXX: Why is this here?
559 if (isa<RegisterSDNode>(Val: N->getOperand(Num: i))) {
560 IsRegSeq = false;
561 break;
562 }
563 unsigned Sub = IsGCN ? SIRegisterInfo::getSubRegFromChannel(Channel: i)
564 : R600RegisterInfo::getSubRegFromChannel(Channel: i);
565 RegSeqArgs[1 + (2 * i)] = N->getOperand(Num: i);
566 RegSeqArgs[1 + (2 * i) + 1] = CurDAG->getTargetConstant(Val: Sub, DL, VT: MVT::i32);
567 }
568 if (NOps != NumVectorElts) {
569 // Fill in the missing undef elements if this was a scalar_to_vector.
570 assert(N->getOpcode() == ISD::SCALAR_TO_VECTOR && NOps < NumVectorElts);
571 MachineSDNode *ImpDef = CurDAG->getMachineNode(Opcode: TargetOpcode::IMPLICIT_DEF,
572 dl: DL, VT: EltVT);
573 for (unsigned i = NOps; i < NumVectorElts; ++i) {
574 unsigned Sub = IsGCN ? SIRegisterInfo::getSubRegFromChannel(Channel: i)
575 : R600RegisterInfo::getSubRegFromChannel(Channel: i);
576 RegSeqArgs[1 + (2 * i)] = SDValue(ImpDef, 0);
577 RegSeqArgs[1 + (2 * i) + 1] =
578 CurDAG->getTargetConstant(Val: Sub, DL, VT: MVT::i32);
579 }
580 }
581
582 if (!IsRegSeq)
583 SelectCode(N);
584 CurDAG->SelectNodeTo(N, MachineOpc: AMDGPU::REG_SEQUENCE, VTs: N->getVTList(), Ops: RegSeqArgs);
585}
586
587void AMDGPUDAGToDAGISel::SelectVectorShuffle(SDNode *N) {
588 EVT VT = N->getValueType(ResNo: 0);
589 EVT EltVT = VT.getVectorElementType();
590
591 // TODO: Handle 16-bit element vectors with even aligned masks.
592 if (!Subtarget->hasPkMovB32() || !EltVT.bitsEq(VT: MVT::i32) ||
593 VT.getVectorNumElements() != 2) {
594 SelectCode(N);
595 return;
596 }
597
598 auto *SVN = cast<ShuffleVectorSDNode>(Val: N);
599
600 SDValue Src0 = SVN->getOperand(Num: 0);
601 SDValue Src1 = SVN->getOperand(Num: 1);
602 ArrayRef<int> Mask = SVN->getMask();
603 SDLoc DL(N);
604
605 assert(Src0.getValueType().getVectorNumElements() == 2 && Mask.size() == 2 &&
606 Mask[0] < 4 && Mask[1] < 4);
607
608 SDValue VSrc0 = Mask[0] < 2 ? Src0 : Src1;
609 SDValue VSrc1 = Mask[1] < 2 ? Src0 : Src1;
610 unsigned Src0SubReg = Mask[0] & 1 ? AMDGPU::sub1 : AMDGPU::sub0;
611 unsigned Src1SubReg = Mask[1] & 1 ? AMDGPU::sub1 : AMDGPU::sub0;
612
613 if (Mask[0] < 0) {
614 Src0SubReg = Src1SubReg;
615 MachineSDNode *ImpDef =
616 CurDAG->getMachineNode(Opcode: TargetOpcode::IMPLICIT_DEF, dl: DL, VT);
617 VSrc0 = SDValue(ImpDef, 0);
618 }
619
620 if (Mask[1] < 0) {
621 Src1SubReg = Src0SubReg;
622 MachineSDNode *ImpDef =
623 CurDAG->getMachineNode(Opcode: TargetOpcode::IMPLICIT_DEF, dl: DL, VT);
624 VSrc1 = SDValue(ImpDef, 0);
625 }
626
627 // SGPR case needs to lower to copies.
628 //
629 // Also use subregister extract when we can directly blend the registers with
630 // a simple subregister copy.
631 //
632 // TODO: Maybe we should fold this out earlier
633 if (N->isDivergent() && Src0SubReg == AMDGPU::sub1 &&
634 Src1SubReg == AMDGPU::sub0) {
635 // The low element of the result always comes from src0.
636 // The high element of the result always comes from src1.
637 // op_sel selects the high half of src0.
638 // op_sel_hi selects the high half of src1.
639
640 unsigned Src0OpSel =
641 Src0SubReg == AMDGPU::sub1 ? SISrcMods::OP_SEL_0 : SISrcMods::NONE;
642 unsigned Src1OpSel =
643 Src1SubReg == AMDGPU::sub1 ? SISrcMods::OP_SEL_0 : SISrcMods::NONE;
644
645 // Enable op_sel_hi to avoid printing it. This should have no effect on the
646 // result.
647 Src0OpSel |= SISrcMods::OP_SEL_1;
648 Src1OpSel |= SISrcMods::OP_SEL_1;
649
650 SDValue Src0OpSelVal = CurDAG->getTargetConstant(Val: Src0OpSel, DL, VT: MVT::i32);
651 SDValue Src1OpSelVal = CurDAG->getTargetConstant(Val: Src1OpSel, DL, VT: MVT::i32);
652 SDValue ZeroMods = CurDAG->getTargetConstant(Val: 0, DL, VT: MVT::i32);
653
654 CurDAG->SelectNodeTo(N, MachineOpc: AMDGPU::V_PK_MOV_B32, VTs: N->getVTList(),
655 Ops: {Src0OpSelVal, VSrc0, Src1OpSelVal, VSrc1,
656 ZeroMods, // clamp
657 ZeroMods, // op_sel
658 ZeroMods, // op_sel_hi
659 ZeroMods, // neg_lo
660 ZeroMods}); // neg_hi
661 return;
662 }
663
664 SDValue ResultElt0 =
665 CurDAG->getTargetExtractSubreg(SRIdx: Src0SubReg, DL, VT: EltVT, Operand: VSrc0);
666 SDValue ResultElt1 =
667 CurDAG->getTargetExtractSubreg(SRIdx: Src1SubReg, DL, VT: EltVT, Operand: VSrc1);
668
669 const SDValue Ops[] = {
670 CurDAG->getTargetConstant(Val: AMDGPU::SReg_64RegClassID, DL, VT: MVT::i32),
671 ResultElt0, CurDAG->getTargetConstant(Val: AMDGPU::sub0, DL, VT: MVT::i32),
672 ResultElt1, CurDAG->getTargetConstant(Val: AMDGPU::sub1, DL, VT: MVT::i32)};
673 CurDAG->SelectNodeTo(N, MachineOpc: TargetOpcode::REG_SEQUENCE, VT, Ops);
674}
675
676void AMDGPUDAGToDAGISel::Select(SDNode *N) {
677 unsigned int Opc = N->getOpcode();
678 if (N->isMachineOpcode()) {
679 N->setNodeId(-1);
680 return; // Already selected.
681 }
682
683 // isa<MemSDNode> almost works but is slightly too permissive for some DS
684 // intrinsics.
685 if (Opc == ISD::LOAD || Opc == ISD::STORE || isa<AtomicSDNode>(Val: N)) {
686 N = glueCopyToM0LDSInit(N);
687 SelectCode(N);
688 return;
689 }
690
691 switch (Opc) {
692 default:
693 break;
694 // We are selecting i64 ADD here instead of custom lower it during
695 // DAG legalization, so we can fold some i64 ADDs used for address
696 // calculation into the LOAD and STORE instructions.
697 case ISD::ADDC:
698 case ISD::ADDE:
699 case ISD::SUBC:
700 case ISD::SUBE: {
701 if (N->getValueType(ResNo: 0) != MVT::i64)
702 break;
703
704 SelectADD_SUB_I64(N);
705 return;
706 }
707 case ISD::UADDO_CARRY:
708 case ISD::USUBO_CARRY:
709 if (N->getValueType(ResNo: 0) != MVT::i32)
710 break;
711
712 SelectAddcSubb(N);
713 return;
714 case ISD::UADDO:
715 case ISD::USUBO: {
716 SelectUADDO_USUBO(N);
717 return;
718 }
719 case AMDGPUISD::FMUL_W_CHAIN: {
720 SelectFMUL_W_CHAIN(N);
721 return;
722 }
723 case AMDGPUISD::FMA_W_CHAIN: {
724 SelectFMA_W_CHAIN(N);
725 return;
726 }
727
728 case ISD::SCALAR_TO_VECTOR:
729 case ISD::BUILD_VECTOR: {
730 EVT VT = N->getValueType(ResNo: 0);
731 unsigned NumVectorElts = VT.getVectorNumElements();
732 if (VT.getScalarSizeInBits() == 16) {
733 if (Opc == ISD::BUILD_VECTOR && NumVectorElts == 2) {
734 if (SDNode *Packed = packConstantV2I16(N, DAG&: *CurDAG)) {
735 ReplaceNode(F: N, T: Packed);
736 return;
737 }
738 }
739
740 break;
741 }
742
743 const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
744 assert(VT.getVectorElementType().bitsEq(MVT::i32));
745 const TargetRegisterClass *RegClass =
746 N->isDivergent()
747 ? TRI->getDefaultVectorSuperClassForBitWidth(BitWidth: NumVectorElts * 32)
748 : SIRegisterInfo::getSGPRClassForBitWidth(BitWidth: NumVectorElts * 32);
749
750 SelectBuildVector(N, RegClassID: RegClass->getID());
751 return;
752 }
753 case ISD::VECTOR_SHUFFLE:
754 SelectVectorShuffle(N);
755 return;
756 case ISD::BUILD_PAIR: {
757 SDValue RC, SubReg0, SubReg1;
758 SDLoc DL(N);
759 if (N->getValueType(ResNo: 0) == MVT::i128) {
760 RC = CurDAG->getTargetConstant(Val: AMDGPU::SGPR_128RegClassID, DL, VT: MVT::i32);
761 SubReg0 = CurDAG->getTargetConstant(Val: AMDGPU::sub0_sub1, DL, VT: MVT::i32);
762 SubReg1 = CurDAG->getTargetConstant(Val: AMDGPU::sub2_sub3, DL, VT: MVT::i32);
763 } else if (N->getValueType(ResNo: 0) == MVT::i64) {
764 RC = CurDAG->getTargetConstant(Val: AMDGPU::SReg_64RegClassID, DL, VT: MVT::i32);
765 SubReg0 = CurDAG->getTargetConstant(Val: AMDGPU::sub0, DL, VT: MVT::i32);
766 SubReg1 = CurDAG->getTargetConstant(Val: AMDGPU::sub1, DL, VT: MVT::i32);
767 } else {
768 llvm_unreachable("Unhandled value type for BUILD_PAIR");
769 }
770 const SDValue Ops[] = { RC, N->getOperand(Num: 0), SubReg0,
771 N->getOperand(Num: 1), SubReg1 };
772 ReplaceNode(F: N, T: CurDAG->getMachineNode(Opcode: TargetOpcode::REG_SEQUENCE, dl: DL,
773 VT: N->getValueType(ResNo: 0), Ops));
774 return;
775 }
776
777 case ISD::Constant:
778 case ISD::ConstantFP: {
779 if (N->getValueType(ResNo: 0).getSizeInBits() != 64 || isInlineImmediate(N) ||
780 Subtarget->has64BitLiterals())
781 break;
782
783 uint64_t Imm;
784 if (ConstantFPSDNode *FP = dyn_cast<ConstantFPSDNode>(Val: N)) {
785 Imm = FP->getValueAPF().bitcastToAPInt().getZExtValue();
786 if (AMDGPU::isValid32BitLiteral(Val: Imm, IsFP64: true))
787 break;
788 } else {
789 ConstantSDNode *C = cast<ConstantSDNode>(Val: N);
790 Imm = C->getZExtValue();
791 if (AMDGPU::isValid32BitLiteral(Val: Imm, IsFP64: false))
792 break;
793 }
794
795 SDLoc DL(N);
796 ReplaceNode(F: N, T: buildSMovImm64(DL, Imm, VT: N->getValueType(ResNo: 0)));
797 return;
798 }
799 case AMDGPUISD::BFE_I32:
800 case AMDGPUISD::BFE_U32: {
801 // There is a scalar version available, but unlike the vector version which
802 // has a separate operand for the offset and width, the scalar version packs
803 // the width and offset into a single operand. Try to move to the scalar
804 // version if the offsets are constant, so that we can try to keep extended
805 // loads of kernel arguments in SGPRs.
806
807 // TODO: Technically we could try to pattern match scalar bitshifts of
808 // dynamic values, but it's probably not useful.
809 ConstantSDNode *Offset = dyn_cast<ConstantSDNode>(Val: N->getOperand(Num: 1));
810 if (!Offset)
811 break;
812
813 ConstantSDNode *Width = dyn_cast<ConstantSDNode>(Val: N->getOperand(Num: 2));
814 if (!Width)
815 break;
816
817 bool Signed = Opc == AMDGPUISD::BFE_I32;
818
819 uint32_t OffsetVal = Offset->getZExtValue();
820 uint32_t WidthVal = Width->getZExtValue();
821
822 ReplaceNode(F: N, T: getBFE32(IsSigned: Signed, DL: SDLoc(N), Val: N->getOperand(Num: 0), Offset: OffsetVal,
823 Width: WidthVal));
824 return;
825 }
826 case AMDGPUISD::DIV_SCALE: {
827 SelectDIV_SCALE(N);
828 return;
829 }
830 case AMDGPUISD::MAD_I64_I32:
831 case AMDGPUISD::MAD_U64_U32: {
832 SelectMAD_64_32(N);
833 return;
834 }
835 case ISD::SMUL_LOHI:
836 case ISD::UMUL_LOHI:
837 return SelectMUL_LOHI(N);
838 case ISD::CopyToReg: {
839 const SITargetLowering& Lowering =
840 *static_cast<const SITargetLowering*>(getTargetLowering());
841 N = Lowering.legalizeTargetIndependentNode(Node: N, DAG&: *CurDAG);
842 break;
843 }
844 case ISD::AND:
845 case ISD::SRL:
846 case ISD::SRA:
847 case ISD::SIGN_EXTEND_INREG:
848 if (N->getValueType(ResNo: 0) != MVT::i32)
849 break;
850
851 SelectS_BFE(N);
852 return;
853 case ISD::BRCOND:
854 SelectBRCOND(N);
855 return;
856 case ISD::FP_EXTEND:
857 SelectFP_EXTEND(N);
858 return;
859 case AMDGPUISD::CVT_PKRTZ_F16_F32:
860 case AMDGPUISD::CVT_PKNORM_I16_F32:
861 case AMDGPUISD::CVT_PKNORM_U16_F32:
862 case AMDGPUISD::CVT_PK_U16_U32:
863 case AMDGPUISD::CVT_PK_I16_I32: {
864 // Hack around using a legal type if f16 is illegal.
865 if (N->getValueType(ResNo: 0) == MVT::i32) {
866 MVT NewVT = Opc == AMDGPUISD::CVT_PKRTZ_F16_F32 ? MVT::v2f16 : MVT::v2i16;
867 N = CurDAG->MorphNodeTo(N, Opc: N->getOpcode(), VTs: CurDAG->getVTList(VT: NewVT),
868 Ops: { N->getOperand(Num: 0), N->getOperand(Num: 1) });
869 SelectCode(N);
870 return;
871 }
872
873 break;
874 }
875 case ISD::INTRINSIC_W_CHAIN: {
876 SelectINTRINSIC_W_CHAIN(N);
877 return;
878 }
879 case ISD::INTRINSIC_WO_CHAIN: {
880 SelectINTRINSIC_WO_CHAIN(N);
881 return;
882 }
883 case ISD::INTRINSIC_VOID: {
884 SelectINTRINSIC_VOID(N);
885 return;
886 }
887 case AMDGPUISD::WAVE_ADDRESS: {
888 SelectWAVE_ADDRESS(N);
889 return;
890 }
891 case ISD::STACKRESTORE: {
892 SelectSTACKRESTORE(N);
893 return;
894 }
895 }
896
897 SelectCode(N);
898}
899
900bool AMDGPUDAGToDAGISel::isUniformBr(const SDNode *N) const {
901 const BasicBlock *BB = FuncInfo->MBB->getBasicBlock();
902 const Instruction *Term = BB->getTerminator();
903 return Term->getMetadata(Kind: "amdgpu.uniform") ||
904 Term->getMetadata(Kind: "structurizecfg.uniform");
905}
906
907bool AMDGPUDAGToDAGISel::isUnneededShiftMask(const SDNode *N,
908 unsigned ShAmtBits) const {
909 assert(N->getOpcode() == ISD::AND);
910
911 const APInt &RHS = N->getConstantOperandAPInt(Num: 1);
912 if (RHS.countr_one() >= ShAmtBits)
913 return true;
914
915 const APInt &LHSKnownZeros = CurDAG->computeKnownBits(Op: N->getOperand(Num: 0)).Zero;
916 return (LHSKnownZeros | RHS).countr_one() >= ShAmtBits;
917}
918
// Recover a (base, constant-offset) pair from the pattern produced when a
// 64-bit `or` of an address and an offset was split into two 32-bit halves.
// On success, N0 receives the base and N1 the offset operand of the low `or`.
static bool getBaseWithOffsetUsingSplitOR(SelectionDAG &DAG, SDValue Addr,
                                          SDValue &N0, SDValue &N1) {
  if (Addr.getValueType() == MVT::i64 && Addr.getOpcode() == ISD::BITCAST &&
      Addr.getOperand(i: 0).getOpcode() == ISD::BUILD_VECTOR) {
    // As we split 64-bit `or` earlier, it's complicated pattern to match, i.e.
    // (i64 (bitcast (v2i32 (build_vector
    //                        (or (extract_vector_elt V, 0), OFFSET),
    //                        (extract_vector_elt V, 1)))))
    SDValue Lo = Addr.getOperand(i: 0).getOperand(i: 0);
    if (Lo.getOpcode() == ISD::OR && DAG.isBaseWithConstantOffset(Op: Lo)) {
      SDValue BaseLo = Lo.getOperand(i: 0);
      SDValue BaseHi = Addr.getOperand(i: 0).getOperand(i: 1);
      // Check that split base (Lo and Hi) are extracted from the same one.
      if (BaseLo.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
          BaseHi.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
          BaseLo.getOperand(i: 0) == BaseHi.getOperand(i: 0) &&
          // Lo is statically extracted from index 0.
          isa<ConstantSDNode>(Val: BaseLo.getOperand(i: 1)) &&
          BaseLo.getConstantOperandVal(i: 1) == 0 &&
          // Hi is statically extracted from index 1.
          isa<ConstantSDNode>(Val: BaseHi.getOperand(i: 1)) &&
          BaseHi.getConstantOperandVal(i: 1) == 1) {
        N0 = BaseLo.getOperand(i: 0).getOperand(i: 0);
        N1 = Lo.getOperand(i: 1);
        return true;
      }
    }
  }
  return false;
}
949
950bool AMDGPUDAGToDAGISel::isBaseWithConstantOffset64(SDValue Addr, SDValue &LHS,
951 SDValue &RHS) const {
952 if (CurDAG->isBaseWithConstantOffset(Op: Addr)) {
953 LHS = Addr.getOperand(i: 0);
954 RHS = Addr.getOperand(i: 1);
955 return true;
956 }
957
958 if (getBaseWithOffsetUsingSplitOR(DAG&: *CurDAG, Addr, N0&: LHS, N1&: RHS)) {
959 assert(LHS && RHS && isa<ConstantSDNode>(RHS));
960 return true;
961 }
962
963 return false;
964}
965
// Human-readable pass name reported by the legacy pass manager.
StringRef AMDGPUDAGToDAGISelLegacy::getPassName() const {
  return "AMDGPU DAG->DAG Pattern Instruction Selection";
}
969
// New-pass-manager wrapper: owns the AMDGPU DAG ISel implementation,
// configured with the target machine's optimization level.
AMDGPUISelDAGToDAGPass::AMDGPUISelDAGToDAGPass(TargetMachine &TM)
    : SelectionDAGISelPass(
          std::make_unique<AMDGPUDAGToDAGISel>(args&: TM, args: TM.getOptLevel())) {}
973
// Run DAG instruction selection on MF under the new pass manager. With
// EXPENSIVE_CHECKS enabled, first assert that every loop in the function is
// in LCSSA form before delegating to the base SelectionDAGISelPass.
PreservedAnalyses
AMDGPUISelDAGToDAGPass::run(MachineFunction &MF,
                            MachineFunctionAnalysisManager &MFAM) {
#ifdef EXPENSIVE_CHECKS
  // The IR-level analyses are reached through the machine-function proxy.
  auto &FAM = MFAM.getResult<FunctionAnalysisManagerMachineFunctionProxy>(MF)
                  .getManager();
  auto &F = MF.getFunction();
  DominatorTree &DT = FAM.getResult<DominatorTreeAnalysis>(F);
  LoopInfo &LI = FAM.getResult<LoopAnalysis>(F);
  for (auto &L : LI.getLoopsInPreorder())
    assert(L->isLCSSAForm(DT) && "Loop is not in LCSSA form!");
#endif
  return SelectionDAGISelPass::run(MF, MFAM);
}
988
989//===----------------------------------------------------------------------===//
990// Complex Patterns
991//===----------------------------------------------------------------------===//
992
// ComplexPattern hook for VTX_READ addressing; this implementation never
// matches (always returns false), leaving Base/Offset untouched.
bool AMDGPUDAGToDAGISel::SelectADDRVTX_READ(SDValue Addr, SDValue &Base,
                                            SDValue &Offset) {
  return false;
}
997
998bool AMDGPUDAGToDAGISel::SelectADDRIndirect(SDValue Addr, SDValue &Base,
999 SDValue &Offset) {
1000 ConstantSDNode *C;
1001 SDLoc DL(Addr);
1002
1003 if ((C = dyn_cast<ConstantSDNode>(Val&: Addr))) {
1004 Base = CurDAG->getRegister(Reg: R600::INDIRECT_BASE_ADDR, VT: MVT::i32);
1005 Offset = CurDAG->getTargetConstant(Val: C->getZExtValue(), DL, VT: MVT::i32);
1006 } else if ((Addr.getOpcode() == AMDGPUISD::DWORDADDR) &&
1007 (C = dyn_cast<ConstantSDNode>(Val: Addr.getOperand(i: 0)))) {
1008 Base = CurDAG->getRegister(Reg: R600::INDIRECT_BASE_ADDR, VT: MVT::i32);
1009 Offset = CurDAG->getTargetConstant(Val: C->getZExtValue(), DL, VT: MVT::i32);
1010 } else if ((Addr.getOpcode() == ISD::ADD || Addr.getOpcode() == ISD::OR) &&
1011 (C = dyn_cast<ConstantSDNode>(Val: Addr.getOperand(i: 1)))) {
1012 Base = Addr.getOperand(i: 0);
1013 Offset = CurDAG->getTargetConstant(Val: C->getZExtValue(), DL, VT: MVT::i32);
1014 } else {
1015 Base = Addr;
1016 Offset = CurDAG->getTargetConstant(Val: 0, DL, VT: MVT::i32);
1017 }
1018
1019 return true;
1020}
1021
1022SDValue AMDGPUDAGToDAGISel::getMaterializedScalarImm32(int64_t Val,
1023 const SDLoc &DL) const {
1024 SDNode *Mov = CurDAG->getMachineNode(
1025 Opcode: AMDGPU::S_MOV_B32, dl: DL, VT: MVT::i32,
1026 Op1: CurDAG->getTargetConstant(Val, DL, VT: MVT::i32));
1027 return SDValue(Mov, 0);
1028}
1029
// FIXME: Should only handle uaddo_carry/usubo_carry
// Select a 64-bit add/sub by splitting it into two 32-bit halves: a low-half
// op producing carry, a high-half carry-consuming op, and a REG_SEQUENCE
// recombining the halves into the i64 result.
void AMDGPUDAGToDAGISel::SelectADD_SUB_I64(SDNode *N) {
  SDLoc DL(N);
  SDValue LHS = N->getOperand(Num: 0);
  SDValue RHS = N->getOperand(Num: 1);

  unsigned Opcode = N->getOpcode();
  // ADDE/SUBE consume an incoming carry (operand 2); those and ADDC/SUBC
  // also produce a carry on result 1.
  bool ConsumeCarry = (Opcode == ISD::ADDE || Opcode == ISD::SUBE);
  bool ProduceCarry =
      ConsumeCarry || Opcode == ISD::ADDC || Opcode == ISD::SUBC;
  bool IsAdd = Opcode == ISD::ADD || Opcode == ISD::ADDC || Opcode == ISD::ADDE;

  SDValue Sub0 = CurDAG->getTargetConstant(Val: AMDGPU::sub0, DL, VT: MVT::i32);
  SDValue Sub1 = CurDAG->getTargetConstant(Val: AMDGPU::sub1, DL, VT: MVT::i32);

  // Split both operands into their low and high 32-bit subregisters.
  SDNode *Lo0 = CurDAG->getMachineNode(Opcode: TargetOpcode::EXTRACT_SUBREG,
                                       dl: DL, VT: MVT::i32, Op1: LHS, Op2: Sub0);
  SDNode *Hi0 = CurDAG->getMachineNode(Opcode: TargetOpcode::EXTRACT_SUBREG,
                                       dl: DL, VT: MVT::i32, Op1: LHS, Op2: Sub1);

  SDNode *Lo1 = CurDAG->getMachineNode(Opcode: TargetOpcode::EXTRACT_SUBREG,
                                       dl: DL, VT: MVT::i32, Op1: RHS, Op2: Sub0);
  SDNode *Hi1 = CurDAG->getMachineNode(Opcode: TargetOpcode::EXTRACT_SUBREG,
                                       dl: DL, VT: MVT::i32, Op1: RHS, Op2: Sub1);

  SDVTList VTList = CurDAG->getVTList(VT1: MVT::i32, VT2: MVT::Glue);

  // Opcode table indexed by [carry-chained?][divergent?][is-add?].
  static const unsigned OpcMap[2][2][2] = {
      {{AMDGPU::S_SUB_U32, AMDGPU::S_ADD_U32},
       {AMDGPU::V_SUB_CO_U32_e32, AMDGPU::V_ADD_CO_U32_e32}},
      {{AMDGPU::S_SUBB_U32, AMDGPU::S_ADDC_U32},
       {AMDGPU::V_SUBB_U32_e32, AMDGPU::V_ADDC_U32_e32}}};

  unsigned Opc = OpcMap[0][N->isDivergent()][IsAdd];
  unsigned CarryOpc = OpcMap[1][N->isDivergent()][IsAdd];

  // Low half: plain op, or the carry-consuming form fed by operand 2.
  SDNode *AddLo;
  if (!ConsumeCarry) {
    SDValue Args[] = { SDValue(Lo0, 0), SDValue(Lo1, 0) };
    AddLo = CurDAG->getMachineNode(Opcode: Opc, dl: DL, VTs: VTList, Ops: Args);
  } else {
    SDValue Args[] = { SDValue(Lo0, 0), SDValue(Lo1, 0), N->getOperand(Num: 2) };
    AddLo = CurDAG->getMachineNode(Opcode: CarryOpc, dl: DL, VTs: VTList, Ops: Args);
  }
  // High half always consumes the low half's carry (glue result 1 of AddLo).
  SDValue AddHiArgs[] = {
    SDValue(Hi0, 0),
    SDValue(Hi1, 0),
    SDValue(AddLo, 1)
  };
  SDNode *AddHi = CurDAG->getMachineNode(Opcode: CarryOpc, dl: DL, VTs: VTList, Ops: AddHiArgs);

  // Rebuild the 64-bit result from the two 32-bit halves.
  SDValue RegSequenceArgs[] = {
    CurDAG->getTargetConstant(Val: AMDGPU::SReg_64RegClassID, DL, VT: MVT::i32),
    SDValue(AddLo,0),
    Sub0,
    SDValue(AddHi,0),
    Sub1,
  };
  SDNode *RegSequence = CurDAG->getMachineNode(Opcode: AMDGPU::REG_SEQUENCE, dl: DL,
                                               VT: MVT::i64, Ops: RegSequenceArgs);

  if (ProduceCarry) {
    // Replace the carry-use
    ReplaceUses(F: SDValue(N, 1), T: SDValue(AddHi, 1));
  }

  // Replace the remaining uses.
  ReplaceNode(F: N, T: RegSequence);
}
1099
1100void AMDGPUDAGToDAGISel::SelectAddcSubb(SDNode *N) {
1101 SDValue LHS = N->getOperand(Num: 0);
1102 SDValue RHS = N->getOperand(Num: 1);
1103 SDValue CI = N->getOperand(Num: 2);
1104
1105 if (N->isDivergent()) {
1106 unsigned Opc = N->getOpcode() == ISD::UADDO_CARRY ? AMDGPU::V_ADDC_U32_e64
1107 : AMDGPU::V_SUBB_U32_e64;
1108 CurDAG->SelectNodeTo(
1109 N, MachineOpc: Opc, VTs: N->getVTList(),
1110 Ops: {LHS, RHS, CI,
1111 CurDAG->getTargetConstant(Val: 0, DL: {}, VT: MVT::i1) /*clamp bit*/});
1112 } else {
1113 unsigned Opc = N->getOpcode() == ISD::UADDO_CARRY ? AMDGPU::S_ADD_CO_PSEUDO
1114 : AMDGPU::S_SUB_CO_PSEUDO;
1115 CurDAG->SelectNodeTo(N, MachineOpc: Opc, VTs: N->getVTList(), Ops: {LHS, RHS, CI});
1116 }
1117}
1118
1119void AMDGPUDAGToDAGISel::SelectUADDO_USUBO(SDNode *N) {
1120 // The name of the opcodes are misleading. v_add_i32/v_sub_i32 have unsigned
1121 // carry out despite the _i32 name. These were renamed in VI to _U32.
1122 // FIXME: We should probably rename the opcodes here.
1123 bool IsAdd = N->getOpcode() == ISD::UADDO;
1124 bool IsVALU = N->isDivergent();
1125
1126 for (SDNode::user_iterator UI = N->user_begin(), E = N->user_end(); UI != E;
1127 ++UI)
1128 if (UI.getUse().getResNo() == 1) {
1129 if (UI->isMachineOpcode()) {
1130 if (UI->getMachineOpcode() !=
1131 (IsAdd ? AMDGPU::S_ADD_CO_PSEUDO : AMDGPU::S_SUB_CO_PSEUDO)) {
1132 IsVALU = true;
1133 break;
1134 }
1135 } else {
1136 if (UI->getOpcode() != (IsAdd ? ISD::UADDO_CARRY : ISD::USUBO_CARRY)) {
1137 IsVALU = true;
1138 break;
1139 }
1140 }
1141 }
1142
1143 if (IsVALU) {
1144 unsigned Opc = IsAdd ? AMDGPU::V_ADD_CO_U32_e64 : AMDGPU::V_SUB_CO_U32_e64;
1145
1146 CurDAG->SelectNodeTo(
1147 N, MachineOpc: Opc, VTs: N->getVTList(),
1148 Ops: {N->getOperand(Num: 0), N->getOperand(Num: 1),
1149 CurDAG->getTargetConstant(Val: 0, DL: {}, VT: MVT::i1) /*clamp bit*/});
1150 } else {
1151 unsigned Opc = IsAdd ? AMDGPU::S_UADDO_PSEUDO : AMDGPU::S_USUBO_PSEUDO;
1152
1153 CurDAG->SelectNodeTo(N, MachineOpc: Opc, VTs: N->getVTList(),
1154 Ops: {N->getOperand(Num: 0), N->getOperand(Num: 1)});
1155 }
1156}
1157
// Select the chained FMA node to V_FMA_F32, or V_FMAC_F32 when no source
// modifiers are present (the FMAC form can use the smaller VOP2 encoding).
void AMDGPUDAGToDAGISel::SelectFMA_W_CHAIN(SDNode *N) {
  // src0_modifiers, src0, src1_modifiers, src1, src2_modifiers, src2, clamp,
  // omod — followed by the incoming chain (Ops[8]) and the node's final
  // input operand (Ops[9]).
  SDValue Ops[10];

  SelectVOP3Mods0(In: N->getOperand(Num: 1), Src&: Ops[1], SrcMods&: Ops[0], Clamp&: Ops[6], Omod&: Ops[7]);
  SelectVOP3Mods(In: N->getOperand(Num: 2), Src&: Ops[3], SrcMods&: Ops[2]);
  SelectVOP3Mods(In: N->getOperand(Num: 3), Src&: Ops[5], SrcMods&: Ops[4]);
  Ops[8] = N->getOperand(Num: 0);
  Ops[9] = N->getOperand(Num: 4);

  // If there are no source modifiers, prefer fmac over fma because it can use
  // the smaller VOP2 encoding.
  bool UseFMAC = Subtarget->hasDLInsts() &&
                 cast<ConstantSDNode>(Val&: Ops[0])->isZero() &&
                 cast<ConstantSDNode>(Val&: Ops[2])->isZero() &&
                 cast<ConstantSDNode>(Val&: Ops[4])->isZero();
  unsigned Opcode = UseFMAC ? AMDGPU::V_FMAC_F32_e64 : AMDGPU::V_FMA_F32_e64;
  CurDAG->SelectNodeTo(N, MachineOpc: Opcode, VTs: N->getVTList(), Ops);
}
1177
// Select the chained FMUL node to V_MUL_F32 with VOP3 source modifiers.
void AMDGPUDAGToDAGISel::SelectFMUL_W_CHAIN(SDNode *N) {
  // src0_modifiers, src0, src1_modifiers, src1, clamp, omod — followed by
  // the incoming chain (Ops[6]) and the node's final input operand (Ops[7]).
  SDValue Ops[8];

  SelectVOP3Mods0(In: N->getOperand(Num: 1), Src&: Ops[1], SrcMods&: Ops[0], Clamp&: Ops[4], Omod&: Ops[5]);
  SelectVOP3Mods(In: N->getOperand(Num: 2), Src&: Ops[3], SrcMods&: Ops[2]);
  Ops[6] = N->getOperand(Num: 0);
  Ops[7] = N->getOperand(Num: 3);

  CurDAG->SelectNodeTo(N, MachineOpc: AMDGPU::V_MUL_F32_e64, VTs: N->getVTList(), Ops);
}
1189
// We need to handle this here because tablegen doesn't support matching
// instructions with multiple outputs.
// Selects AMDGPUISD::DIV_SCALE to V_DIV_SCALE_F32/F64 with VOP3B modifiers.
void AMDGPUDAGToDAGISel::SelectDIV_SCALE(SDNode *N) {
  EVT VT = N->getValueType(ResNo: 0);

  assert(VT == MVT::f32 || VT == MVT::f64);

  unsigned Opc
    = (VT == MVT::f64) ? AMDGPU::V_DIV_SCALE_F64_e64 : AMDGPU::V_DIV_SCALE_F32_e64;

  // src0_modifiers, src0, src1_modifiers, src1, src2_modifiers, src2, clamp,
  // omod
  SDValue Ops[8];
  SelectVOP3BMods0(In: N->getOperand(Num: 0), Src&: Ops[1], SrcMods&: Ops[0], Clamp&: Ops[6], Omod&: Ops[7]);
  SelectVOP3BMods(In: N->getOperand(Num: 1), Src&: Ops[3], SrcMods&: Ops[2]);
  SelectVOP3BMods(In: N->getOperand(Num: 2), Src&: Ops[5], SrcMods&: Ops[4]);
  CurDAG->SelectNodeTo(N, MachineOpc: Opc, VTs: N->getVTList(), Ops);
}
1208
// We need to handle this here because tablegen doesn't support matching
// instructions with multiple outputs.
// Selects MAD_I64_I32/MAD_U64_U32. Prefers the no-carry variant when the
// subtarget provides it and the carry result is unused; falls back to the
// gfx11 encodings on subtargets with the MAD intra-forwarding bug.
void AMDGPUDAGToDAGISel::SelectMAD_64_32(SDNode *N) {
  SDLoc SL(N);
  bool Signed = N->getOpcode() == AMDGPUISD::MAD_I64_I32;
  unsigned Opc;
  // The no-carry form is only usable when nothing consumes result 1.
  bool UseNoCarry = Subtarget->hasMadU64U32NoCarry() && !N->hasAnyUseOfValue(Value: 1);
  if (Subtarget->hasMADIntraFwdBug())
    Opc = Signed ? AMDGPU::V_MAD_I64_I32_gfx11_e64
                 : AMDGPU::V_MAD_U64_U32_gfx11_e64;
  else if (UseNoCarry)
    Opc = Signed ? AMDGPU::V_MAD_NC_I64_I32_e64 : AMDGPU::V_MAD_NC_U64_U32_e64;
  else
    Opc = Signed ? AMDGPU::V_MAD_I64_I32_e64 : AMDGPU::V_MAD_U64_U32_e64;

  SDValue Clamp = CurDAG->getTargetConstant(Val: 0, DL: SL, VT: MVT::i1);
  SDValue Ops[] = { N->getOperand(Num: 0), N->getOperand(Num: 1), N->getOperand(Num: 2),
                    Clamp };

  if (UseNoCarry) {
    // The no-carry variant has a single i64 result; rewire result 0 and
    // discard the dead original node (its carry result is unused).
    MachineSDNode *Mad = CurDAG->getMachineNode(Opcode: Opc, dl: SL, VT: MVT::i64, Ops);
    ReplaceUses(F: SDValue(N, 0), T: SDValue(Mad, 0));
    CurDAG->RemoveDeadNode(N);
    return;
  }

  CurDAG->SelectNodeTo(N, MachineOpc: Opc, VTs: N->getVTList(), Ops);
}
1237
// We need to handle this here because tablegen doesn't support matching
// instructions with multiple outputs.
// Selects [SU]MUL_LOHI as a 64-bit MAD with a zero addend, then splits the
// 64-bit product into the lo/hi 32-bit results via subregister extracts.
void AMDGPUDAGToDAGISel::SelectMUL_LOHI(SDNode *N) {
  SDLoc SL(N);
  bool Signed = N->getOpcode() == ISD::SMUL_LOHI;
  SDVTList VTList;
  unsigned Opc;
  if (Subtarget->hasMadU64U32NoCarry()) {
    VTList = CurDAG->getVTList(VT: MVT::i64);
    Opc = Signed ? AMDGPU::V_MAD_NC_I64_I32_e64 : AMDGPU::V_MAD_NC_U64_U32_e64;
  } else {
    // Carry-producing forms also define an i1 carry result, ignored here.
    VTList = CurDAG->getVTList(VT1: MVT::i64, VT2: MVT::i1);
    if (Subtarget->hasMADIntraFwdBug()) {
      Opc = Signed ? AMDGPU::V_MAD_I64_I32_gfx11_e64
                   : AMDGPU::V_MAD_U64_U32_gfx11_e64;
    } else {
      Opc = Signed ? AMDGPU::V_MAD_I64_I32_e64 : AMDGPU::V_MAD_U64_U32_e64;
    }
  }

  // mul_lohi(a, b) == mad(a, b, 0) viewed as a single 64-bit value.
  SDValue Zero = CurDAG->getTargetConstant(Val: 0, DL: SL, VT: MVT::i64);
  SDValue Clamp = CurDAG->getTargetConstant(Val: 0, DL: SL, VT: MVT::i1);
  SDValue Ops[] = {N->getOperand(Num: 0), N->getOperand(Num: 1), Zero, Clamp};
  SDNode *Mad = CurDAG->getMachineNode(Opcode: Opc, dl: SL, VTs: VTList, Ops);
  // Only materialize the subregister copies for results that are used.
  if (!SDValue(N, 0).use_empty()) {
    SDValue Sub0 = CurDAG->getTargetConstant(Val: AMDGPU::sub0, DL: SL, VT: MVT::i32);
    SDNode *Lo = CurDAG->getMachineNode(Opcode: TargetOpcode::EXTRACT_SUBREG, dl: SL,
                                        VT: MVT::i32, Op1: SDValue(Mad, 0), Op2: Sub0);
    ReplaceUses(F: SDValue(N, 0), T: SDValue(Lo, 0));
  }
  if (!SDValue(N, 1).use_empty()) {
    SDValue Sub1 = CurDAG->getTargetConstant(Val: AMDGPU::sub1, DL: SL, VT: MVT::i32);
    SDNode *Hi = CurDAG->getMachineNode(Opcode: TargetOpcode::EXTRACT_SUBREG, dl: SL,
                                        VT: MVT::i32, Op1: SDValue(Mad, 0), Op2: Sub1);
    ReplaceUses(F: SDValue(N, 1), T: SDValue(Hi, 0));
  }
  CurDAG->RemoveDeadNode(N);
}
1276
1277bool AMDGPUDAGToDAGISel::isDSOffsetLegal(SDValue Base, unsigned Offset) const {
1278 if (!isUInt<16>(x: Offset))
1279 return false;
1280
1281 if (!Base || Subtarget->hasUsableDSOffset() ||
1282 Subtarget->unsafeDSOffsetFoldingEnabled())
1283 return true;
1284
1285 // On Southern Islands instruction with a negative base value and an offset
1286 // don't seem to work.
1287 return CurDAG->SignBitIsZero(Op: Base);
1288}
1289
// Match a DS address as (base, uimm16 byte offset). Handles (add base, C),
// (sub C, x) rewritten as (add (sub 0, x), C), a bare constant address
// (materialized over a shared zero base to enable read2/write2 merging),
// and the trivial pass-through case. Always succeeds.
bool AMDGPUDAGToDAGISel::SelectDS1Addr1Offset(SDValue Addr, SDValue &Base,
                                              SDValue &Offset) const {
  SDLoc DL(Addr);
  if (CurDAG->isBaseWithConstantOffset(Op: Addr)) {
    SDValue N0 = Addr.getOperand(i: 0);
    SDValue N1 = Addr.getOperand(i: 1);
    ConstantSDNode *C1 = cast<ConstantSDNode>(Val&: N1);
    if (isDSOffsetLegal(Base: N0, Offset: C1->getSExtValue())) {
      // (add n0, c0)
      Base = N0;
      Offset = CurDAG->getTargetConstant(Val: C1->getZExtValue(), DL, VT: MVT::i16);
      return true;
    }
  } else if (Addr.getOpcode() == ISD::SUB) {
    // sub C, x -> add (sub 0, x), C
    if (const ConstantSDNode *C = dyn_cast<ConstantSDNode>(Val: Addr.getOperand(i: 0))) {
      int64_t ByteOffset = C->getSExtValue();
      if (isDSOffsetLegal(Base: SDValue(), Offset: ByteOffset)) {
        SDValue Zero = CurDAG->getTargetConstant(Val: 0, DL, VT: MVT::i32);

        // XXX - This is kind of hacky. Create a dummy sub node so we can check
        // the known bits in isDSOffsetLegal. We need to emit the selected node
        // here, so this is thrown away.
        SDValue Sub = CurDAG->getNode(Opcode: ISD::SUB, DL, VT: MVT::i32,
                                      N1: Zero, N2: Addr.getOperand(i: 1));

        if (isDSOffsetLegal(Base: Sub, Offset: ByteOffset)) {
          SmallVector<SDValue, 3> Opnds;
          Opnds.push_back(Elt: Zero);
          Opnds.push_back(Elt: Addr.getOperand(i: 1));

          // FIXME: Select to VOP3 version for with-carry.
          unsigned SubOp = AMDGPU::V_SUB_CO_U32_e32;
          if (Subtarget->hasAddNoCarryInsts()) {
            SubOp = AMDGPU::V_SUB_U32_e64;
            Opnds.push_back(
                Elt: CurDAG->getTargetConstant(Val: 0, DL: {}, VT: MVT::i1)); // clamp bit
          }

          // Emit the negated base directly as a machine node.
          MachineSDNode *MachineSub =
              CurDAG->getMachineNode(Opcode: SubOp, dl: DL, VT: MVT::i32, Ops: Opnds);

          Base = SDValue(MachineSub, 0);
          Offset = CurDAG->getTargetConstant(Val: ByteOffset, DL, VT: MVT::i16);
          return true;
        }
      }
    }
  } else if (const ConstantSDNode *CAddr = dyn_cast<ConstantSDNode>(Val&: Addr)) {
    // If we have a constant address, prefer to put the constant into the
    // offset. This can save moves to load the constant address since multiple
    // operations can share the zero base address register, and enables merging
    // into read2 / write2 instructions.

    SDLoc DL(Addr);

    if (isDSOffsetLegal(Base: SDValue(), Offset: CAddr->getZExtValue())) {
      SDValue Zero = CurDAG->getTargetConstant(Val: 0, DL, VT: MVT::i32);
      MachineSDNode *MovZero = CurDAG->getMachineNode(Opcode: AMDGPU::V_MOV_B32_e32,
                                                      dl: DL, VT: MVT::i32, Op1: Zero);
      Base = SDValue(MovZero, 0);
      Offset = CurDAG->getTargetConstant(Val: CAddr->getZExtValue(), DL, VT: MVT::i16);
      return true;
    }
  }

  // default case
  Base = Addr;
  Offset = CurDAG->getTargetConstant(Val: 0, DL: SDLoc(Addr), VT: MVT::i16);
  return true;
}
1361
1362bool AMDGPUDAGToDAGISel::isDSOffset2Legal(SDValue Base, unsigned Offset0,
1363 unsigned Offset1,
1364 unsigned Size) const {
1365 if (Offset0 % Size != 0 || Offset1 % Size != 0)
1366 return false;
1367 if (!isUInt<8>(x: Offset0 / Size) || !isUInt<8>(x: Offset1 / Size))
1368 return false;
1369
1370 if (!Base || Subtarget->hasUsableDSOffset() ||
1371 Subtarget->unsafeDSOffsetFoldingEnabled())
1372 return true;
1373
1374 // On Southern Islands instruction with a negative base value and an offset
1375 // don't seem to work.
1376 return CurDAG->SignBitIsZero(Op: Base);
1377}
1378
1379// Return whether the operation has NoUnsignedWrap property.
1380static bool isNoUnsignedWrap(SDValue Addr) {
1381 return (Addr.getOpcode() == ISD::ADD &&
1382 Addr->getFlags().hasNoUnsignedWrap()) ||
1383 Addr->getOpcode() == ISD::OR;
1384}
1385
// Check that the base address of flat scratch load/store in the form of `base +
// offset` is legal to be put in SGPR/VGPR (i.e. unsigned per hardware
// requirement). We always treat the first operand as the base address here.
bool AMDGPUDAGToDAGISel::isFlatScratchBaseLegal(SDValue Addr) const {
  // A nuw add (or an `or`, which cannot carry) cannot wrap into a negative
  // apparent base.
  if (isNoUnsignedWrap(Addr))
    return true;

  // Starting with GFX12, VADDR and SADDR fields in VSCRATCH can use negative
  // values.
  if (Subtarget->hasSignedScratchOffsets())
    return true;

  auto LHS = Addr.getOperand(i: 0);
  auto RHS = Addr.getOperand(i: 1);

  // If the immediate offset is negative and within certain range, the base
  // address cannot also be negative. If the base is also negative, the sum
  // would be either negative or much larger than the valid range of scratch
  // memory a thread can access.
  ConstantSDNode *ImmOp = nullptr;
  if (Addr.getOpcode() == ISD::ADD && (ImmOp = dyn_cast<ConstantSDNode>(Val&: RHS))) {
    if (ImmOp->getSExtValue() < 0 && ImmOp->getSExtValue() > -0x40000000)
      return true;
  }

  // Otherwise the base must be provably non-negative.
  return CurDAG->SignBitIsZero(Op: LHS);
}
1413
1414// Check address value in SGPR/VGPR are legal for flat scratch in the form
1415// of: SGPR + VGPR.
1416bool AMDGPUDAGToDAGISel::isFlatScratchBaseLegalSV(SDValue Addr) const {
1417 if (isNoUnsignedWrap(Addr))
1418 return true;
1419
1420 // Starting with GFX12, VADDR and SADDR fields in VSCRATCH can use negative
1421 // values.
1422 if (Subtarget->hasSignedScratchOffsets())
1423 return true;
1424
1425 auto LHS = Addr.getOperand(i: 0);
1426 auto RHS = Addr.getOperand(i: 1);
1427 return CurDAG->SignBitIsZero(Op: RHS) && CurDAG->SignBitIsZero(Op: LHS);
1428}
1429
// Check address value in SGPR/VGPR are legal for flat scratch in the form
// of: SGPR + VGPR + Imm.
bool AMDGPUDAGToDAGISel::isFlatScratchBaseLegalSVImm(SDValue Addr) const {
  // Starting with GFX12, VADDR and SADDR fields in VSCRATCH can use negative
  // values.
  if (AMDGPU::isGFX12Plus(STI: *Subtarget))
    return true;

  // Addr is ((LHS + RHS) + Imm); Base is the inner SGPR+VGPR add.
  auto Base = Addr.getOperand(i: 0);
  auto *RHSImm = cast<ConstantSDNode>(Val: Addr.getOperand(i: 1));
  // If the immediate offset is negative and within certain range, the base
  // address cannot also be negative. If the base is also negative, the sum
  // would be either negative or much larger than the valid range of scratch
  // memory a thread can access.
  if (isNoUnsignedWrap(Addr: Base) &&
      (isNoUnsignedWrap(Addr) ||
       (RHSImm->getSExtValue() < 0 && RHSImm->getSExtValue() > -0x40000000)))
    return true;

  // Otherwise both inner addends must be provably non-negative.
  auto LHS = Base.getOperand(i: 0);
  auto RHS = Base.getOperand(i: 1);
  return CurDAG->SignBitIsZero(Op: RHS) && CurDAG->SignBitIsZero(Op: LHS);
}
1453
// TODO: If offset is too big, put low 16-bit into offset.
// Match a 4-byte-element DS read2/write2 address (e.g. ds_read2_b32).
bool AMDGPUDAGToDAGISel::SelectDS64Bit4ByteAligned(SDValue Addr, SDValue &Base,
                                                   SDValue &Offset0,
                                                   SDValue &Offset1) const {
  return SelectDSReadWrite2(Ptr: Addr, Base, Offset0, Offset1, Size: 4);
}
1460
// Match an 8-byte-element DS read2/write2 address (e.g. ds_read2_b64).
bool AMDGPUDAGToDAGISel::SelectDS128Bit8ByteAligned(SDValue Addr, SDValue &Base,
                                                    SDValue &Offset0,
                                                    SDValue &Offset1) const {
  return SelectDSReadWrite2(Ptr: Addr, Base, Offset0, Offset1, Size: 8);
}
1466
// Shared matcher for DS read2/write2 addressing: produces a base plus two
// element-scaled 8-bit offsets (offset0 = C/Size, offset1 = offset0 + 1).
// Mirrors the cases of SelectDS1Addr1Offset and always succeeds.
bool AMDGPUDAGToDAGISel::SelectDSReadWrite2(SDValue Addr, SDValue &Base,
                                            SDValue &Offset0, SDValue &Offset1,
                                            unsigned Size) const {
  SDLoc DL(Addr);

  if (CurDAG->isBaseWithConstantOffset(Op: Addr)) {
    SDValue N0 = Addr.getOperand(i: 0);
    SDValue N1 = Addr.getOperand(i: 1);
    ConstantSDNode *C1 = cast<ConstantSDNode>(Val&: N1);
    // Second element sits one Size stride above the first.
    unsigned OffsetValue0 = C1->getZExtValue();
    unsigned OffsetValue1 = OffsetValue0 + Size;

    // (add n0, c0)
    if (isDSOffset2Legal(Base: N0, Offset0: OffsetValue0, Offset1: OffsetValue1, Size)) {
      Base = N0;
      Offset0 = CurDAG->getTargetConstant(Val: OffsetValue0 / Size, DL, VT: MVT::i32);
      Offset1 = CurDAG->getTargetConstant(Val: OffsetValue1 / Size, DL, VT: MVT::i32);
      return true;
    }
  } else if (Addr.getOpcode() == ISD::SUB) {
    // sub C, x -> add (sub 0, x), C
    if (const ConstantSDNode *C =
            dyn_cast<ConstantSDNode>(Val: Addr.getOperand(i: 0))) {
      unsigned OffsetValue0 = C->getZExtValue();
      unsigned OffsetValue1 = OffsetValue0 + Size;

      if (isDSOffset2Legal(Base: SDValue(), Offset0: OffsetValue0, Offset1: OffsetValue1, Size)) {
        SDLoc DL(Addr);
        SDValue Zero = CurDAG->getTargetConstant(Val: 0, DL, VT: MVT::i32);

        // XXX - This is kind of hacky. Create a dummy sub node so we can check
        // the known bits in isDSOffsetLegal. We need to emit the selected node
        // here, so this is thrown away.
        SDValue Sub =
            CurDAG->getNode(Opcode: ISD::SUB, DL, VT: MVT::i32, N1: Zero, N2: Addr.getOperand(i: 1));

        if (isDSOffset2Legal(Base: Sub, Offset0: OffsetValue0, Offset1: OffsetValue1, Size)) {
          SmallVector<SDValue, 3> Opnds;
          Opnds.push_back(Elt: Zero);
          Opnds.push_back(Elt: Addr.getOperand(i: 1));
          unsigned SubOp = AMDGPU::V_SUB_CO_U32_e32;
          if (Subtarget->hasAddNoCarryInsts()) {
            SubOp = AMDGPU::V_SUB_U32_e64;
            Opnds.push_back(
                Elt: CurDAG->getTargetConstant(Val: 0, DL: {}, VT: MVT::i1)); // clamp bit
          }

          // NOTE(review): the result VT here is sized by the access
          // (i32/i64 via Size * 8), unlike the MVT::i32 used for the same
          // trick in SelectDS1Addr1Offset — confirm this is intentional.
          MachineSDNode *MachineSub = CurDAG->getMachineNode(
              Opcode: SubOp, dl: DL, VT: MVT::getIntegerVT(BitWidth: Size * 8), Ops: Opnds);

          Base = SDValue(MachineSub, 0);
          Offset0 =
              CurDAG->getTargetConstant(Val: OffsetValue0 / Size, DL, VT: MVT::i32);
          Offset1 =
              CurDAG->getTargetConstant(Val: OffsetValue1 / Size, DL, VT: MVT::i32);
          return true;
        }
      }
    }
  } else if (const ConstantSDNode *CAddr = dyn_cast<ConstantSDNode>(Val&: Addr)) {
    // Constant address: fold it entirely into the offsets over a shared
    // materialized zero base.
    unsigned OffsetValue0 = CAddr->getZExtValue();
    unsigned OffsetValue1 = OffsetValue0 + Size;

    if (isDSOffset2Legal(Base: SDValue(), Offset0: OffsetValue0, Offset1: OffsetValue1, Size)) {
      SDValue Zero = CurDAG->getTargetConstant(Val: 0, DL, VT: MVT::i32);
      MachineSDNode *MovZero =
          CurDAG->getMachineNode(Opcode: AMDGPU::V_MOV_B32_e32, dl: DL, VT: MVT::i32, Op1: Zero);
      Base = SDValue(MovZero, 0);
      Offset0 = CurDAG->getTargetConstant(Val: OffsetValue0 / Size, DL, VT: MVT::i32);
      Offset1 = CurDAG->getTargetConstant(Val: OffsetValue1 / Size, DL, VT: MVT::i32);
      return true;
    }
  }

  // default case

  Base = Addr;
  Offset0 = CurDAG->getTargetConstant(Val: 0, DL, VT: MVT::i32);
  Offset1 = CurDAG->getTargetConstant(Val: 1, DL, VT: MVT::i32);
  return true;
}
1548
// Decompose a MUBUF address into pointer, vaddr, soffset, immediate offset
// and the offen/idxen/addr64 mode bits. The divergent portion of the address
// (if any) becomes vaddr with addr64 set; the uniform portion becomes the
// resource pointer; an immediate too large for the offset field is
// materialized into soffset.
bool AMDGPUDAGToDAGISel::SelectMUBUF(SDValue Addr, SDValue &Ptr, SDValue &VAddr,
                                     SDValue &SOffset, SDValue &Offset,
                                     SDValue &Offen, SDValue &Idxen,
                                     SDValue &Addr64) const {
  // Subtarget prefers to use flat instruction
  // FIXME: This should be a pattern predicate and not reach here
  if (Subtarget->useFlatForGlobal())
    return false;

  SDLoc DL(Addr);

  Idxen = CurDAG->getTargetConstant(Val: 0, DL, VT: MVT::i1);
  Offen = CurDAG->getTargetConstant(Val: 0, DL, VT: MVT::i1);
  Addr64 = CurDAG->getTargetConstant(Val: 0, DL, VT: MVT::i1);
  // Subtargets with restricted soffset need the null SGPR, not a literal 0.
  SOffset = Subtarget->hasRestrictedSOffset()
                ? CurDAG->getRegister(Reg: AMDGPU::SGPR_NULL, VT: MVT::i32)
                : CurDAG->getTargetConstant(Val: 0, DL, VT: MVT::i32);

  // Peel off a constant offset if it fits in 32 unsigned bits.
  ConstantSDNode *C1 = nullptr;
  SDValue N0 = Addr;
  if (CurDAG->isBaseWithConstantOffset(Op: Addr)) {
    C1 = cast<ConstantSDNode>(Val: Addr.getOperand(i: 1));
    if (isUInt<32>(x: C1->getZExtValue()))
      N0 = Addr.getOperand(i: 0);
    else
      C1 = nullptr;
  }

  if (N0->isAnyAdd()) {
    // (add N2, N3) -> addr64, or
    // (add (add N2, N3), C1) -> addr64
    SDValue N2 = N0.getOperand(i: 0);
    SDValue N3 = N0.getOperand(i: 1);
    Addr64 = CurDAG->getTargetConstant(Val: 1, DL, VT: MVT::i1);

    if (N2->isDivergent()) {
      if (N3->isDivergent()) {
        // Both N2 and N3 are divergent. Use N0 (the result of the add) as the
        // addr64, and construct the resource from a 0 address.
        Ptr = SDValue(buildSMovImm64(DL, Imm: 0, VT: MVT::v2i32), 0);
        VAddr = N0;
      } else {
        // N2 is divergent, N3 is not.
        Ptr = N3;
        VAddr = N2;
      }
    } else {
      // N2 is not divergent.
      Ptr = N2;
      VAddr = N3;
    }
    Offset = CurDAG->getTargetConstant(Val: 0, DL, VT: MVT::i32);
  } else if (N0->isDivergent()) {
    // N0 is divergent. Use it as the addr64, and construct the resource from a
    // 0 address.
    Ptr = SDValue(buildSMovImm64(DL, Imm: 0, VT: MVT::v2i32), 0);
    VAddr = N0;
    Addr64 = CurDAG->getTargetConstant(Val: 1, DL, VT: MVT::i1);
  } else {
    // N0 -> offset, or
    // (N0 + C1) -> offset
    VAddr = CurDAG->getTargetConstant(Val: 0, DL, VT: MVT::i32);
    Ptr = N0;
  }

  if (!C1) {
    // No offset.
    Offset = CurDAG->getTargetConstant(Val: 0, DL, VT: MVT::i32);
    return true;
  }

  const SIInstrInfo *TII = Subtarget->getInstrInfo();
  if (TII->isLegalMUBUFImmOffset(Imm: C1->getZExtValue())) {
    // Legal offset for instruction.
    Offset = CurDAG->getTargetConstant(Val: C1->getZExtValue(), DL, VT: MVT::i32);
    return true;
  }

  // Illegal offset, store it in soffset.
  Offset = CurDAG->getTargetConstant(Val: 0, DL, VT: MVT::i32);
  SOffset =
      SDValue(CurDAG->getMachineNode(
                  Opcode: AMDGPU::S_MOV_B32, dl: DL, VT: MVT::i32,
                  Op1: CurDAG->getTargetConstant(Val: C1->getZExtValue(), DL, VT: MVT::i32)),
              0);
  return true;
}
1636
// Match a MUBUF addr64-mode address. Only succeeds when SelectMUBUF chose
// addr64 mode, in which case the resource descriptor is built by wrapping
// the pointer via SITargetLowering::wrapAddr64Rsrc.
bool AMDGPUDAGToDAGISel::SelectMUBUFAddr64(SDValue Addr, SDValue &SRsrc,
                                           SDValue &VAddr, SDValue &SOffset,
                                           SDValue &Offset) const {
  SDValue Ptr, Offen, Idxen, Addr64;

  // addr64 bit was removed for volcanic islands.
  // FIXME: This should be a pattern predicate and not reach here
  if (!Subtarget->hasAddr64())
    return false;

  if (!SelectMUBUF(Addr, Ptr, VAddr, SOffset, Offset, Offen, Idxen, Addr64))
    return false;

  ConstantSDNode *C = cast<ConstantSDNode>(Val&: Addr64);
  if (C->getSExtValue()) {
    SDLoc DL(Addr);

    const SITargetLowering& Lowering =
      *static_cast<const SITargetLowering*>(getTargetLowering());

    // Build a full resource descriptor around the 64-bit pointer.
    SRsrc = SDValue(Lowering.wrapAddr64Rsrc(DAG&: *CurDAG, DL, Ptr), 0);
    return true;
  }

  return false;
}
1663
1664std::pair<SDValue, SDValue> AMDGPUDAGToDAGISel::foldFrameIndex(SDValue N) const {
1665 SDLoc DL(N);
1666
1667 auto *FI = dyn_cast<FrameIndexSDNode>(Val&: N);
1668 SDValue TFI =
1669 FI ? CurDAG->getTargetFrameIndex(FI: FI->getIndex(), VT: FI->getValueType(ResNo: 0)) : N;
1670
1671 // We rebase the base address into an absolute stack address and hence
1672 // use constant 0 for soffset. This value must be retained until
1673 // frame elimination and eliminateFrameIndex will choose the appropriate
1674 // frame register if need be.
1675 return std::pair(TFI, CurDAG->getTargetConstant(Val: 0, DL, VT: MVT::i32));
1676}
1677
// Match a private (scratch) MUBUF address in offen form: scratch rsrc, a
// VGPR address, an SGPR soffset and an immediate offset. Always succeeds;
// the fallback puts the whole address into vaddr with a zero offset.
bool AMDGPUDAGToDAGISel::SelectMUBUFScratchOffen(SDNode *Parent,
                                                 SDValue Addr, SDValue &Rsrc,
                                                 SDValue &VAddr, SDValue &SOffset,
                                                 SDValue &ImmOffset) const {

  SDLoc DL(Addr);
  MachineFunction &MF = CurDAG->getMachineFunction();
  const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();

  Rsrc = CurDAG->getRegister(Reg: Info->getScratchRSrcReg(), VT: MVT::v4i32);

  if (ConstantSDNode *CAddr = dyn_cast<ConstantSDNode>(Val&: Addr)) {
    int64_t Imm = CAddr->getSExtValue();
    const int64_t NullPtr =
        AMDGPU::getNullPointerValue(AS: AMDGPUAS::PRIVATE_ADDRESS);
    // Don't fold null pointer.
    if (Imm != NullPtr) {
      // Split the constant: high bits go into a materialized vaddr, the low
      // bits (within the legal immediate range) into the offset field.
      const uint32_t MaxOffset = SIInstrInfo::getMaxMUBUFImmOffset(ST: *Subtarget);
      SDValue HighBits =
          CurDAG->getTargetConstant(Val: Imm & ~MaxOffset, DL, VT: MVT::i32);
      MachineSDNode *MovHighBits = CurDAG->getMachineNode(
          Opcode: AMDGPU::V_MOV_B32_e32, dl: DL, VT: MVT::i32, Op1: HighBits);
      VAddr = SDValue(MovHighBits, 0);

      SOffset = CurDAG->getTargetConstant(Val: 0, DL, VT: MVT::i32);
      ImmOffset = CurDAG->getTargetConstant(Val: Imm & MaxOffset, DL, VT: MVT::i32);
      return true;
    }
  }

  if (CurDAG->isBaseWithConstantOffset(Op: Addr)) {
    // (add n0, c1)

    SDValue N0 = Addr.getOperand(i: 0);
    uint64_t C1 = Addr.getConstantOperandVal(i: 1);

    // Offsets in vaddr must be positive if range checking is enabled.
    //
    // The total computation of vaddr + soffset + offset must not overflow. If
    // vaddr is negative, even if offset is 0 the sgpr offset add will end up
    // overflowing.
    //
    // Prior to gfx9, MUBUF instructions with the vaddr offset enabled would
    // always perform a range check. If a negative vaddr base index was used,
    // this would fail the range check. The overall address computation would
    // compute a valid address, but this doesn't happen due to the range
    // check. For out-of-bounds MUBUF loads, a 0 is returned.
    //
    // Therefore it should be safe to fold any VGPR offset on gfx9 into the
    // MUBUF vaddr, but not on older subtargets which can only do this if the
    // sign bit is known 0.
    const SIInstrInfo *TII = Subtarget->getInstrInfo();
    if (TII->isLegalMUBUFImmOffset(Imm: C1) &&
        (!Subtarget->privateMemoryResourceIsRangeChecked() ||
         CurDAG->SignBitIsZero(Op: N0))) {
      std::tie(args&: VAddr, args&: SOffset) = foldFrameIndex(N: N0);
      ImmOffset = CurDAG->getTargetConstant(Val: C1, DL, VT: MVT::i32);
      return true;
    }
  }

  // (node)
  std::tie(args&: VAddr, args&: SOffset) = foldFrameIndex(N: Addr);
  ImmOffset = CurDAG->getTargetConstant(Val: 0, DL, VT: MVT::i32);
  return true;
}
1744
1745static bool IsCopyFromSGPR(const SIRegisterInfo &TRI, SDValue Val) {
1746 if (Val.getOpcode() != ISD::CopyFromReg)
1747 return false;
1748 auto Reg = cast<RegisterSDNode>(Val: Val.getOperand(i: 1))->getReg();
1749 if (!Reg.isPhysical())
1750 return false;
1751 const auto *RC = TRI.getPhysRegBaseClass(Reg);
1752 return RC && TRI.isSGPRClass(RC);
1753}
1754
// Match a MUBUF scratch access that needs no VGPR address component: the
// address is either a plain SGPR, SGPR + legal immediate, or a bare legal
// immediate. SRsrc is always the function's scratch resource descriptor.
bool AMDGPUDAGToDAGISel::SelectMUBUFScratchOffset(SDNode *Parent,
                                                  SDValue Addr,
                                                  SDValue &SRsrc,
                                                  SDValue &SOffset,
                                                  SDValue &Offset) const {
  const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
  const SIInstrInfo *TII = Subtarget->getInstrInfo();
  MachineFunction &MF = CurDAG->getMachineFunction();
  const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
  SDLoc DL(Addr);

  // CopyFromReg <sgpr>
  // The whole address is an SGPR: use it directly as SOffset with a zero
  // immediate offset.
  if (IsCopyFromSGPR(TRI: *TRI, Val: Addr)) {
    SRsrc = CurDAG->getRegister(Reg: Info->getScratchRSrcReg(), VT: MVT::v4i32);
    SOffset = Addr;
    Offset = CurDAG->getTargetConstant(Val: 0, DL, VT: MVT::i32);
    return true;
  }

  ConstantSDNode *CAddr;
  if (Addr.getOpcode() == ISD::ADD) {
    // Add (CopyFromReg <sgpr>) <constant>
    // The constant must be a legal MUBUF immediate and the other operand an
    // SGPR copy; otherwise this form does not apply.
    CAddr = dyn_cast<ConstantSDNode>(Val: Addr.getOperand(i: 1));
    if (!CAddr || !TII->isLegalMUBUFImmOffset(Imm: CAddr->getZExtValue()))
      return false;
    if (!IsCopyFromSGPR(TRI: *TRI, Val: Addr.getOperand(i: 0)))
      return false;

    SOffset = Addr.getOperand(i: 0);
  } else if ((CAddr = dyn_cast<ConstantSDNode>(Val&: Addr)) &&
             TII->isLegalMUBUFImmOffset(Imm: CAddr->getZExtValue())) {
    // <constant>
    // A bare constant address: zero SGPR offset, immediate carries the value.
    SOffset = CurDAG->getTargetConstant(Val: 0, DL, VT: MVT::i32);
  } else {
    return false;
  }

  SRsrc = CurDAG->getRegister(Reg: Info->getScratchRSrcReg(), VT: MVT::v4i32);

  Offset = CurDAG->getTargetConstant(Val: CAddr->getZExtValue(), DL, VT: MVT::i32);
  return true;
}
1797
// Match a MUBUF access in the pure-offset addressing mode (Offen, Idxen and
// Addr64 all clear) and build a full resource descriptor from the pointer
// returned by SelectMUBUF.
bool AMDGPUDAGToDAGISel::SelectMUBUFOffset(SDValue Addr, SDValue &SRsrc,
                                           SDValue &SOffset, SDValue &Offset
                                           ) const {
  SDValue Ptr, VAddr, Offen, Idxen, Addr64;
  const SIInstrInfo *TII = Subtarget->getInstrInfo();

  if (!SelectMUBUF(Addr, Ptr, VAddr, SOffset, Offset, Offen, Idxen, Addr64))
    return false;

  // Only accept the match when no VGPR-based address component is present;
  // other addressing modes are handled by different patterns.
  if (!cast<ConstantSDNode>(Val&: Offen)->getSExtValue() &&
      !cast<ConstantSDNode>(Val&: Idxen)->getSExtValue() &&
      !cast<ConstantSDNode>(Val&: Addr64)->getSExtValue()) {
    // Rsrc words 2-3: default data format plus an all-ones 32-bit size field.
    uint64_t Rsrc = TII->getDefaultRsrcDataFormat() |
                    maskTrailingOnes<uint64_t>(N: 32); // Size
    SDLoc DL(Addr);

    const SITargetLowering& Lowering =
        *static_cast<const SITargetLowering*>(getTargetLowering());

    SRsrc = SDValue(Lowering.buildRSRC(DAG&: *CurDAG, DL, Ptr, RsrcDword1: 0, RsrcDword2And3: Rsrc), 0);
    return true;
  }
  return false;
}
1822
1823bool AMDGPUDAGToDAGISel::SelectBUFSOffset(SDValue ByteOffsetNode,
1824 SDValue &SOffset) const {
1825 if (Subtarget->hasRestrictedSOffset() && isNullConstant(V: ByteOffsetNode)) {
1826 SOffset = CurDAG->getRegister(Reg: AMDGPU::SGPR_NULL, VT: MVT::i32);
1827 return true;
1828 }
1829
1830 SOffset = ByteOffsetNode;
1831 return true;
1832}
1833
1834// Find a load or store from corresponding pattern root.
1835// Roots may be build_vector, bitconvert or their combinations.
1836static MemSDNode* findMemSDNode(SDNode *N) {
1837 N = AMDGPUTargetLowering::stripBitcast(Val: SDValue(N,0)).getNode();
1838 if (MemSDNode *MN = dyn_cast<MemSDNode>(Val: N))
1839 return MN;
1840 assert(isa<BuildVectorSDNode>(N));
1841 for (SDValue V : N->op_values())
1842 if (MemSDNode *MN =
1843 dyn_cast<MemSDNode>(Val: AMDGPUTargetLowering::stripBitcast(Val: V)))
1844 return MN;
1845 llvm_unreachable("cannot find MemSDNode in the pattern!");
1846}
1847
// Select the (vaddr, immediate offset) operands for a flat-family access of
// kind \p FlatVariant. Splits a constant offset off \p Addr when the target
// encoding allows it; if the offset does not fit, the encodable low part is
// kept as the immediate and the remainder is re-added to the address with
// VALU instructions. Always returns true.
bool AMDGPUDAGToDAGISel::SelectFlatOffsetImpl(SDNode *N, SDValue Addr,
                                              SDValue &VAddr, SDValue &Offset,
                                              uint64_t FlatVariant) const {
  int64_t OffsetVal = 0;

  unsigned AS = findMemSDNode(N)->getAddressSpace();

  // On affected subtargets, plain FLAT accesses to flat/global address
  // spaces must not use the offset field at all.
  bool CanHaveFlatSegmentOffsetBug =
      Subtarget->hasFlatSegmentOffsetBug() &&
      FlatVariant == SIInstrFlags::FLAT &&
      (AS == AMDGPUAS::FLAT_ADDRESS || AS == AMDGPUAS::GLOBAL_ADDRESS);

  if (Subtarget->hasFlatInstOffsets() && !CanHaveFlatSegmentOffsetBug) {
    SDValue N0, N1;
    if (isBaseWithConstantOffset64(Addr, LHS&: N0, RHS&: N1) &&
        (FlatVariant != SIInstrFlags::FlatScratch ||
         isFlatScratchBaseLegal(Addr))) {
      int64_t COffsetVal = cast<ConstantSDNode>(Val&: N1)->getSExtValue();

      // Adding the offset to the base address in a FLAT instruction must not
      // change the memory aperture in which the address falls. Therefore we can
      // only fold offsets from inbounds GEPs into FLAT instructions.
      bool IsInBounds =
          Addr.getOpcode() == ISD::PTRADD && Addr->getFlags().hasInBounds();
      if (COffsetVal == 0 || FlatVariant != SIInstrFlags::FLAT || IsInBounds) {
        const SIInstrInfo *TII = Subtarget->getInstrInfo();
        if (TII->isLegalFLATOffset(Offset: COffsetVal, AddrSpace: AS, FlatVariant)) {
          // The whole constant fits the encoding: use the base directly.
          Addr = N0;
          OffsetVal = COffsetVal;
        } else {
          // If the offset doesn't fit, put the low bits into the offset field
          // and add the rest.
          //
          // For a FLAT instruction the hardware decides whether to access
          // global/scratch/shared memory based on the high bits of vaddr,
          // ignoring the offset field, so we have to ensure that when we add
          // remainder to vaddr it still points into the same underlying object.
          // The easiest way to do that is to make sure that we split the offset
          // into two pieces that are both >= 0 or both <= 0.

          SDLoc DL(N);
          uint64_t RemainderOffset;

          std::tie(args&: OffsetVal, args&: RemainderOffset) =
              TII->splitFlatOffset(COffsetVal, AddrSpace: AS, FlatVariant);

          SDValue AddOffsetLo =
              getMaterializedScalarImm32(Val: Lo_32(Value: RemainderOffset), DL);
          SDValue Clamp = CurDAG->getTargetConstant(Val: 0, DL, VT: MVT::i1);

          if (Addr.getValueType().getSizeInBits() == 32) {
            // 32-bit address (scratch): a single VALU add is enough.
            SmallVector<SDValue, 3> Opnds;
            Opnds.push_back(Elt: N0);
            Opnds.push_back(Elt: AddOffsetLo);
            unsigned AddOp = AMDGPU::V_ADD_CO_U32_e32;
            if (Subtarget->hasAddNoCarryInsts()) {
              AddOp = AMDGPU::V_ADD_U32_e64;
              Opnds.push_back(Elt: Clamp);
            }
            Addr =
                SDValue(CurDAG->getMachineNode(Opcode: AddOp, dl: DL, VT: MVT::i32, Ops: Opnds), 0);
          } else {
            // TODO: Should this try to use a scalar add pseudo if the base
            // address is uniform and saddr is usable?
            // 64-bit address: split into halves, do an add with carry, and
            // reassemble the result with a REG_SEQUENCE.
            SDValue Sub0 =
                CurDAG->getTargetConstant(Val: AMDGPU::sub0, DL, VT: MVT::i32);
            SDValue Sub1 =
                CurDAG->getTargetConstant(Val: AMDGPU::sub1, DL, VT: MVT::i32);

            SDNode *N0Lo = CurDAG->getMachineNode(Opcode: TargetOpcode::EXTRACT_SUBREG,
                                                  dl: DL, VT: MVT::i32, Op1: N0, Op2: Sub0);
            SDNode *N0Hi = CurDAG->getMachineNode(Opcode: TargetOpcode::EXTRACT_SUBREG,
                                                  dl: DL, VT: MVT::i32, Op1: N0, Op2: Sub1);

            SDValue AddOffsetHi =
                getMaterializedScalarImm32(Val: Hi_32(Value: RemainderOffset), DL);

            SDVTList VTs = CurDAG->getVTList(VT1: MVT::i32, VT2: MVT::i1);

            SDNode *Add =
                CurDAG->getMachineNode(Opcode: AMDGPU::V_ADD_CO_U32_e64, dl: DL, VTs,
                                       Ops: {AddOffsetLo, SDValue(N0Lo, 0), Clamp});

            SDNode *Addc = CurDAG->getMachineNode(
                Opcode: AMDGPU::V_ADDC_U32_e64, dl: DL, VTs,
                Ops: {AddOffsetHi, SDValue(N0Hi, 0), SDValue(Add, 1), Clamp});

            SDValue RegSequenceArgs[] = {
                CurDAG->getTargetConstant(Val: AMDGPU::VReg_64RegClassID, DL,
                                          VT: MVT::i32),
                SDValue(Add, 0), Sub0, SDValue(Addc, 0), Sub1};

            Addr = SDValue(CurDAG->getMachineNode(Opcode: AMDGPU::REG_SEQUENCE, dl: DL,
                                                  VT: MVT::i64, Ops: RegSequenceArgs),
                           0);
          }
        }
      }
    }
  }

  VAddr = Addr;
  Offset = CurDAG->getSignedTargetConstant(Val: OffsetVal, DL: SDLoc(), VT: MVT::i32);
  return true;
}
1953
1954bool AMDGPUDAGToDAGISel::SelectFlatOffset(SDNode *N, SDValue Addr,
1955 SDValue &VAddr,
1956 SDValue &Offset) const {
1957 return SelectFlatOffsetImpl(N, Addr, VAddr, Offset, FlatVariant: SIInstrFlags::FLAT);
1958}
1959
1960bool AMDGPUDAGToDAGISel::SelectGlobalOffset(SDNode *N, SDValue Addr,
1961 SDValue &VAddr,
1962 SDValue &Offset) const {
1963 return SelectFlatOffsetImpl(N, Addr, VAddr, Offset, FlatVariant: SIInstrFlags::FlatGlobal);
1964}
1965
1966bool AMDGPUDAGToDAGISel::SelectScratchOffset(SDNode *N, SDValue Addr,
1967 SDValue &VAddr,
1968 SDValue &Offset) const {
1969 return SelectFlatOffsetImpl(N, Addr, VAddr, Offset,
1970 FlatVariant: SIInstrFlags::FlatScratch);
1971}
1972
1973// If this matches *_extend i32:x, return x
1974// Otherwise if the value is I32 returns x.
1975static SDValue matchExtFromI32orI32(SDValue Op, bool IsSigned,
1976 const SelectionDAG *DAG) {
1977 if (Op.getValueType() == MVT::i32)
1978 return Op;
1979
1980 if (Op.getOpcode() != (IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND) &&
1981 Op.getOpcode() != ISD::ANY_EXTEND &&
1982 !(DAG->SignBitIsZero(Op) &&
1983 Op.getOpcode() == (IsSigned ? ISD::ZERO_EXTEND : ISD::SIGN_EXTEND)))
1984 return SDValue();
1985
1986 SDValue ExtSrc = Op.getOperand(i: 0);
1987 return (ExtSrc.getValueType() == MVT::i32) ? ExtSrc : SDValue();
1988}
1989
// Match (64-bit SGPR base) + (zext vgpr offset) + sext(imm offset)
// or (64-bit SGPR base) + (sext vgpr offset) + sext(imm offset)
//
// On success fills in SAddr (scalar base), VOffset (32-bit VGPR offset) and,
// when \p NeedIOffset, Offset (immediate). \p ScaleOffset reports whether the
// VGPR offset was pre-scaled by the access size (see SelectScaleOffset).
bool AMDGPUDAGToDAGISel::SelectGlobalSAddr(SDNode *N, SDValue Addr,
                                           SDValue &SAddr, SDValue &VOffset,
                                           SDValue &Offset, bool &ScaleOffset,
                                           bool NeedIOffset) const {
  int64_t ImmOffset = 0;
  ScaleOffset = false;

  // Match the immediate offset first, which canonically is moved as low as
  // possible.

  SDValue LHS, RHS;
  if (isBaseWithConstantOffset64(Addr, LHS, RHS)) {
    int64_t COffsetVal = cast<ConstantSDNode>(Val&: RHS)->getSExtValue();
    const SIInstrInfo *TII = Subtarget->getInstrInfo();

    if (NeedIOffset &&
        TII->isLegalFLATOffset(Offset: COffsetVal, AddrSpace: AMDGPUAS::GLOBAL_ADDRESS,
                               FlatVariant: SIInstrFlags::FlatGlobal)) {
      // Encodable immediate: strip it off and keep matching on the base.
      Addr = LHS;
      ImmOffset = COffsetVal;
    } else if (!LHS->isDivergent()) {
      if (COffsetVal > 0) {
        SDLoc SL(N);
        // saddr + large_offset -> saddr +
        //                         (voffset = large_offset & ~MaxOffset) +
        //                         (large_offset & MaxOffset);
        int64_t SplitImmOffset = 0, RemainderOffset = COffsetVal;
        if (NeedIOffset) {
          std::tie(args&: SplitImmOffset, args&: RemainderOffset) = TII->splitFlatOffset(
              COffsetVal, AddrSpace: AMDGPUAS::GLOBAL_ADDRESS, FlatVariant: SIInstrFlags::FlatGlobal);
        }

        // The remainder must fit a 32-bit VGPR (signed or unsigned depending
        // on how the subtarget interprets the VGPR offset).
        if (Subtarget->hasSignedGVSOffset() ? isInt<32>(x: RemainderOffset)
                                            : isUInt<32>(x: RemainderOffset)) {
          SDNode *VMov = CurDAG->getMachineNode(
              Opcode: AMDGPU::V_MOV_B32_e32, dl: SL, VT: MVT::i32,
              Op1: CurDAG->getTargetConstant(Val: RemainderOffset, DL: SDLoc(), VT: MVT::i32));
          VOffset = SDValue(VMov, 0);
          SAddr = LHS;
          Offset = CurDAG->getTargetConstant(Val: SplitImmOffset, DL: SDLoc(), VT: MVT::i32);
          return true;
        }
      }

      // We are adding a 64 bit SGPR and a constant. If constant bus limit
      // is 1 we would need to perform 1 or 2 extra moves for each half of
      // the constant and it is better to do a scalar add and then issue a
      // single VALU instruction to materialize zero. Otherwise it is less
      // instructions to perform VALU adds with immediates or inline literals.
      unsigned NumLiterals =
          !TII->isInlineConstant(Imm: APInt(32, Lo_32(Value: COffsetVal))) +
          !TII->isInlineConstant(Imm: APInt(32, Hi_32(Value: COffsetVal)));
      if (Subtarget->getConstantBusLimit(Opcode: AMDGPU::V_ADD_U32_e64) > NumLiterals)
        return false;
    }
  }

  // Match the variable offset.
  if (Addr->isAnyAdd()) {
    LHS = Addr.getOperand(i: 0);

    if (!LHS->isDivergent()) {
      // add (i64 sgpr), (*_extend (i32 vgpr))
      RHS = Addr.getOperand(i: 1);
      ScaleOffset = SelectScaleOffset(N, Offset&: RHS, IsSigned: Subtarget->hasSignedGVSOffset());
      if (SDValue ExtRHS = matchExtFromI32orI32(
              Op: RHS, IsSigned: Subtarget->hasSignedGVSOffset(), DAG: CurDAG)) {
        SAddr = LHS;
        VOffset = ExtRHS;
      }
    }

    // Try the commuted form if the first attempt did not set SAddr.
    RHS = Addr.getOperand(i: 1);
    if (!SAddr && !RHS->isDivergent()) {
      // add (*_extend (i32 vgpr)), (i64 sgpr)
      ScaleOffset = SelectScaleOffset(N, Offset&: LHS, IsSigned: Subtarget->hasSignedGVSOffset());
      if (SDValue ExtLHS = matchExtFromI32orI32(
              Op: LHS, IsSigned: Subtarget->hasSignedGVSOffset(), DAG: CurDAG)) {
        SAddr = RHS;
        VOffset = ExtLHS;
      }
    }

    if (SAddr) {
      Offset = CurDAG->getSignedTargetConstant(Val: ImmOffset, DL: SDLoc(), VT: MVT::i32);
      return true;
    }
  }

  // Recognize a mad of a VGPR by the access size added to an SGPR base as a
  // pre-scaled VGPR offset (SCAL form).
  if (Subtarget->hasScaleOffset() &&
      (Addr.getOpcode() == (Subtarget->hasSignedGVSOffset()
                                ? AMDGPUISD::MAD_I64_I32
                                : AMDGPUISD::MAD_U64_U32) ||
       (Addr.getOpcode() == AMDGPUISD::MAD_U64_U32 &&
        CurDAG->SignBitIsZero(Op: Addr.getOperand(i: 0)))) &&
      Addr.getOperand(i: 0)->isDivergent() &&
      isa<ConstantSDNode>(Val: Addr.getOperand(i: 1)) &&
      !Addr.getOperand(i: 2)->isDivergent()) {
    // mad_u64_u32 (i32 vgpr), (i32 c), (i64 sgpr)
    unsigned Size =
        (unsigned)cast<MemSDNode>(Val: N)->getMemoryVT().getFixedSizeInBits() / 8;
    ScaleOffset = Addr.getConstantOperandVal(i: 1) == Size;
    if (ScaleOffset) {
      SAddr = Addr.getOperand(i: 2);
      VOffset = Addr.getOperand(i: 0);
      Offset = CurDAG->getTargetConstant(Val: ImmOffset, DL: SDLoc(), VT: MVT::i32);
      return true;
    }
  }

  // A divergent, undef, or constant address cannot serve as the SGPR base.
  if (Addr->isDivergent() || Addr.getOpcode() == ISD::UNDEF ||
      isa<ConstantSDNode>(Val: Addr))
    return false;

  // It's cheaper to materialize a single 32-bit zero for vaddr than the two
  // moves required to copy a 64-bit SGPR to VGPR.
  SAddr = Addr;
  SDNode *VMov =
      CurDAG->getMachineNode(Opcode: AMDGPU::V_MOV_B32_e32, dl: SDLoc(Addr), VT: MVT::i32,
                             Op1: CurDAG->getTargetConstant(Val: 0, DL: SDLoc(), VT: MVT::i32));
  VOffset = SDValue(VMov, 0);
  Offset = CurDAG->getSignedTargetConstant(Val: ImmOffset, DL: SDLoc(), VT: MVT::i32);
  return true;
}
2116
2117bool AMDGPUDAGToDAGISel::SelectGlobalSAddr(SDNode *N, SDValue Addr,
2118 SDValue &SAddr, SDValue &VOffset,
2119 SDValue &Offset,
2120 SDValue &CPol) const {
2121 bool ScaleOffset;
2122 if (!SelectGlobalSAddr(N, Addr, SAddr, VOffset, Offset, ScaleOffset))
2123 return false;
2124
2125 CPol = CurDAG->getTargetConstant(Val: ScaleOffset ? AMDGPU::CPol::SCAL : 0,
2126 DL: SDLoc(), VT: MVT::i32);
2127 return true;
2128}
2129
2130bool AMDGPUDAGToDAGISel::SelectGlobalSAddrCPol(SDNode *N, SDValue Addr,
2131 SDValue &SAddr, SDValue &VOffset,
2132 SDValue &Offset,
2133 SDValue &CPol) const {
2134 bool ScaleOffset;
2135 if (!SelectGlobalSAddr(N, Addr, SAddr, VOffset, Offset, ScaleOffset))
2136 return false;
2137
2138 // We are assuming CPol is always the last operand of the intrinsic.
2139 auto PassedCPol =
2140 N->getConstantOperandVal(Num: N->getNumOperands() - 1) & ~AMDGPU::CPol::SCAL;
2141 CPol = CurDAG->getTargetConstant(
2142 Val: (ScaleOffset ? AMDGPU::CPol::SCAL : 0) | PassedCPol, DL: SDLoc(), VT: MVT::i32);
2143 return true;
2144}
2145
2146bool AMDGPUDAGToDAGISel::SelectGlobalSAddrCPolM0(SDNode *N, SDValue Addr,
2147 SDValue &SAddr,
2148 SDValue &VOffset,
2149 SDValue &Offset,
2150 SDValue &CPol) const {
2151 bool ScaleOffset;
2152 if (!SelectGlobalSAddr(N, Addr, SAddr, VOffset, Offset, ScaleOffset))
2153 return false;
2154
2155 // We are assuming CPol is second from last operand of the intrinsic.
2156 auto PassedCPol =
2157 N->getConstantOperandVal(Num: N->getNumOperands() - 2) & ~AMDGPU::CPol::SCAL;
2158 CPol = CurDAG->getTargetConstant(
2159 Val: (ScaleOffset ? AMDGPU::CPol::SCAL : 0) | PassedCPol, DL: SDLoc(), VT: MVT::i32);
2160 return true;
2161}
2162
2163bool AMDGPUDAGToDAGISel::SelectGlobalSAddrGLC(SDNode *N, SDValue Addr,
2164 SDValue &SAddr, SDValue &VOffset,
2165 SDValue &Offset,
2166 SDValue &CPol) const {
2167 bool ScaleOffset;
2168 if (!SelectGlobalSAddr(N, Addr, SAddr, VOffset, Offset, ScaleOffset))
2169 return false;
2170
2171 unsigned CPolVal = (ScaleOffset ? AMDGPU::CPol::SCAL : 0) | AMDGPU::CPol::GLC;
2172 CPol = CurDAG->getTargetConstant(Val: CPolVal, DL: SDLoc(), VT: MVT::i32);
2173 return true;
2174}
2175
2176bool AMDGPUDAGToDAGISel::SelectGlobalSAddrNoIOffset(SDNode *N, SDValue Addr,
2177 SDValue &SAddr,
2178 SDValue &VOffset,
2179 SDValue &CPol) const {
2180 bool ScaleOffset;
2181 SDValue DummyOffset;
2182 if (!SelectGlobalSAddr(N, Addr, SAddr, VOffset, Offset&: DummyOffset, ScaleOffset,
2183 NeedIOffset: false))
2184 return false;
2185
2186 // We are assuming CPol is always the last operand of the intrinsic.
2187 auto PassedCPol =
2188 N->getConstantOperandVal(Num: N->getNumOperands() - 1) & ~AMDGPU::CPol::SCAL;
2189 CPol = CurDAG->getTargetConstant(
2190 Val: (ScaleOffset ? AMDGPU::CPol::SCAL : 0) | PassedCPol, DL: SDLoc(), VT: MVT::i32);
2191 return true;
2192}
2193
2194bool AMDGPUDAGToDAGISel::SelectGlobalSAddrNoIOffsetM0(SDNode *N, SDValue Addr,
2195 SDValue &SAddr,
2196 SDValue &VOffset,
2197 SDValue &CPol) const {
2198 bool ScaleOffset;
2199 SDValue DummyOffset;
2200 if (!SelectGlobalSAddr(N, Addr, SAddr, VOffset, Offset&: DummyOffset, ScaleOffset,
2201 NeedIOffset: false))
2202 return false;
2203
2204 // We are assuming CPol is second from last operand of the intrinsic.
2205 auto PassedCPol =
2206 N->getConstantOperandVal(Num: N->getNumOperands() - 2) & ~AMDGPU::CPol::SCAL;
2207 CPol = CurDAG->getTargetConstant(
2208 Val: (ScaleOffset ? AMDGPU::CPol::SCAL : 0) | PassedCPol, DL: SDLoc(), VT: MVT::i32);
2209 return true;
2210}
2211
2212static SDValue SelectSAddrFI(SelectionDAG *CurDAG, SDValue SAddr) {
2213 if (auto *FI = dyn_cast<FrameIndexSDNode>(Val&: SAddr)) {
2214 SAddr = CurDAG->getTargetFrameIndex(FI: FI->getIndex(), VT: FI->getValueType(ResNo: 0));
2215 } else if (SAddr.getOpcode() == ISD::ADD &&
2216 isa<FrameIndexSDNode>(Val: SAddr.getOperand(i: 0))) {
2217 // Materialize this into a scalar move for scalar address to avoid
2218 // readfirstlane.
2219 auto *FI = cast<FrameIndexSDNode>(Val: SAddr.getOperand(i: 0));
2220 SDValue TFI = CurDAG->getTargetFrameIndex(FI: FI->getIndex(),
2221 VT: FI->getValueType(ResNo: 0));
2222 SAddr = SDValue(CurDAG->getMachineNode(Opcode: AMDGPU::S_ADD_I32, dl: SDLoc(SAddr),
2223 VT: MVT::i32, Op1: TFI, Op2: SAddr.getOperand(i: 1)),
2224 0);
2225 }
2226
2227 return SAddr;
2228}
2229
// Match (32-bit SGPR base) + sext(imm offset)
// Used for scratch accesses addressed entirely by a scalar value. If the
// immediate does not fit the flat-scratch encoding, the remainder is folded
// into SAddr with a scalar add.
bool AMDGPUDAGToDAGISel::SelectScratchSAddr(SDNode *Parent, SDValue Addr,
                                            SDValue &SAddr,
                                            SDValue &Offset) const {
  // A divergent address cannot be used as a scalar base.
  if (Addr->isDivergent())
    return false;

  SDLoc DL(Addr);

  int64_t COffsetVal = 0;

  if (CurDAG->isBaseWithConstantOffset(Op: Addr) && isFlatScratchBaseLegal(Addr)) {
    COffsetVal = cast<ConstantSDNode>(Val: Addr.getOperand(i: 1))->getSExtValue();
    SAddr = Addr.getOperand(i: 0);
  } else {
    SAddr = Addr;
  }

  // Rewrite any frame index in the base into a target frame index (possibly
  // plus a scalar add).
  SAddr = SelectSAddrFI(CurDAG, SAddr);

  const SIInstrInfo *TII = Subtarget->getInstrInfo();

  if (!TII->isLegalFLATOffset(Offset: COffsetVal, AddrSpace: AMDGPUAS::PRIVATE_ADDRESS,
                              FlatVariant: SIInstrFlags::FlatScratch)) {
    // The offset does not fit the encoding: keep the encodable low part as
    // the immediate and add the remainder to SAddr with S_ADD_I32.
    int64_t SplitImmOffset, RemainderOffset;
    std::tie(args&: SplitImmOffset, args&: RemainderOffset) = TII->splitFlatOffset(
        COffsetVal, AddrSpace: AMDGPUAS::PRIVATE_ADDRESS, FlatVariant: SIInstrFlags::FlatScratch);

    COffsetVal = SplitImmOffset;

    SDValue AddOffset =
        SAddr.getOpcode() == ISD::TargetFrameIndex
            ? getMaterializedScalarImm32(Val: Lo_32(Value: RemainderOffset), DL)
            : CurDAG->getSignedTargetConstant(Val: RemainderOffset, DL, VT: MVT::i32);
    SAddr = SDValue(CurDAG->getMachineNode(Opcode: AMDGPU::S_ADD_I32, dl: DL, VT: MVT::i32,
                                           Op1: SAddr, Op2: AddOffset),
                    0);
  }

  Offset = CurDAG->getSignedTargetConstant(Val: COffsetVal, DL, VT: MVT::i32);

  return true;
}
2273
2274// Check whether the flat scratch SVS swizzle bug affects this access.
2275bool AMDGPUDAGToDAGISel::checkFlatScratchSVSSwizzleBug(
2276 SDValue VAddr, SDValue SAddr, uint64_t ImmOffset) const {
2277 if (!Subtarget->hasFlatScratchSVSSwizzleBug())
2278 return false;
2279
2280 // The bug affects the swizzling of SVS accesses if there is any carry out
2281 // from the two low order bits (i.e. from bit 1 into bit 2) when adding
2282 // voffset to (soffset + inst_offset).
2283 KnownBits VKnown = CurDAG->computeKnownBits(Op: VAddr);
2284 KnownBits SKnown =
2285 KnownBits::add(LHS: CurDAG->computeKnownBits(Op: SAddr),
2286 RHS: KnownBits::makeConstant(C: APInt(32, ImmOffset,
2287 /*isSigned=*/true)));
2288 uint64_t VMax = VKnown.getMaxValue().getZExtValue();
2289 uint64_t SMax = SKnown.getMaxValue().getZExtValue();
2290 return (VMax & 3) + (SMax & 3) >= 4;
2291}
2292
// Match the SVS form of a scratch access: a divergent VGPR offset plus a
// uniform SGPR base plus an immediate offset, with a cache-policy operand
// (SCAL only). Fails if the SVS swizzle bug would be triggered or the base
// is not legal for flat scratch.
bool AMDGPUDAGToDAGISel::SelectScratchSVAddr(SDNode *N, SDValue Addr,
                                             SDValue &VAddr, SDValue &SAddr,
                                             SDValue &Offset,
                                             SDValue &CPol)  const {
  int64_t ImmOffset = 0;

  SDValue LHS, RHS;
  SDValue OrigAddr = Addr;
  if (isBaseWithConstantOffset64(Addr, LHS, RHS)) {
    int64_t COffsetVal = cast<ConstantSDNode>(Val&: RHS)->getSExtValue();
    const SIInstrInfo *TII = Subtarget->getInstrInfo();

    if (TII->isLegalFLATOffset(Offset: COffsetVal, AddrSpace: AMDGPUAS::PRIVATE_ADDRESS,
                               FlatVariant: SIInstrFlags::FlatScratch)) {
      // Encodable immediate: strip it and keep matching the base.
      Addr = LHS;
      ImmOffset = COffsetVal;
    } else if (!LHS->isDivergent() && COffsetVal > 0) {
      SDLoc SL(N);
      // saddr + large_offset -> saddr + (vaddr = large_offset & ~MaxOffset) +
      //                         (large_offset & MaxOffset);
      int64_t SplitImmOffset, RemainderOffset;
      std::tie(args&: SplitImmOffset, args&: RemainderOffset) = TII->splitFlatOffset(
          COffsetVal, AddrSpace: AMDGPUAS::PRIVATE_ADDRESS, FlatVariant: SIInstrFlags::FlatScratch);

      if (isUInt<32>(x: RemainderOffset)) {
        SDNode *VMov = CurDAG->getMachineNode(
            Opcode: AMDGPU::V_MOV_B32_e32, dl: SL, VT: MVT::i32,
            Op1: CurDAG->getTargetConstant(Val: RemainderOffset, DL: SDLoc(), VT: MVT::i32));
        VAddr = SDValue(VMov, 0);
        SAddr = LHS;
        if (!isFlatScratchBaseLegal(Addr))
          return false;
        if (checkFlatScratchSVSSwizzleBug(VAddr, SAddr, ImmOffset: SplitImmOffset))
          return false;
        Offset = CurDAG->getTargetConstant(Val: SplitImmOffset, DL: SDLoc(), VT: MVT::i32);
        CPol = CurDAG->getTargetConstant(Val: 0, DL: SDLoc(), VT: MVT::i32);
        return true;
      }
    }
  }

  // The remaining base must be a plain add of a scalar and a vector part.
  if (Addr.getOpcode() != ISD::ADD)
    return false;

  LHS = Addr.getOperand(i: 0);
  RHS = Addr.getOperand(i: 1);

  // Assign the uniform operand to SAddr and the divergent one to VAddr; if
  // both or neither are divergent, this form does not apply.
  if (!LHS->isDivergent() && RHS->isDivergent()) {
    SAddr = LHS;
    VAddr = RHS;
  } else if (!RHS->isDivergent() && LHS->isDivergent()) {
    SAddr = RHS;
    VAddr = LHS;
  } else {
    return false;
  }

  // Legality of the base depends on whether an immediate was peeled off.
  if (OrigAddr != Addr) {
    if (!isFlatScratchBaseLegalSVImm(Addr: OrigAddr))
      return false;
  } else {
    if (!isFlatScratchBaseLegalSV(Addr: OrigAddr))
      return false;
  }

  if (checkFlatScratchSVSSwizzleBug(VAddr, SAddr, ImmOffset))
    return false;
  SAddr = SelectSAddrFI(CurDAG, SAddr);
  Offset = CurDAG->getSignedTargetConstant(Val: ImmOffset, DL: SDLoc(), VT: MVT::i32);

  bool ScaleOffset = SelectScaleOffset(N, Offset&: VAddr, IsSigned: true /* IsSigned */);
  CPol = CurDAG->getTargetConstant(Val: ScaleOffset ? AMDGPU::CPol::SCAL : 0,
                                   DL: SDLoc(), VT: MVT::i32);
  return true;
}
2368
2369// For unbuffered smem loads, it is illegal for the Immediate Offset to be
2370// negative if the resulting (Offset + (M0 or SOffset or zero) is negative.
2371// Handle the case where the Immediate Offset + SOffset is negative.
2372bool AMDGPUDAGToDAGISel::isSOffsetLegalWithImmOffset(SDValue *SOffset,
2373 bool Imm32Only,
2374 bool IsBuffer,
2375 int64_t ImmOffset) const {
2376 if (!IsBuffer && !Imm32Only && ImmOffset < 0 &&
2377 AMDGPU::hasSMRDSignedImmOffset(ST: *Subtarget)) {
2378 KnownBits SKnown = CurDAG->computeKnownBits(Op: *SOffset);
2379 if (ImmOffset + SKnown.getMinValue().getSExtValue() < 0)
2380 return false;
2381 }
2382
2383 return true;
2384}
2385
// Given \p Offset and load node \p N check if an \p Offset is a multiple of
// the load byte size. If it is update \p Offset to a pre-scaled value and
// return true.
bool AMDGPUDAGToDAGISel::SelectScaleOffset(SDNode *N, SDValue &Offset,
                                           bool IsSigned) const {
  bool ScaleOffset = false;
  if (!Subtarget->hasScaleOffset() || !Offset)
    return false;

  // Byte size of the accessed memory — the factor the hardware scales by.
  unsigned Size =
      (unsigned)cast<MemSDNode>(Val: N)->getMemoryVT().getFixedSizeInBits() / 8;

  // Look through a 32->64-bit extension when matching the shift form.
  SDValue Off = Offset;
  if (SDValue Ext = matchExtFromI32orI32(Op: Offset, IsSigned, DAG: CurDAG))
    Off = Ext;

  // Recognize either shl x, log2(Size) or one of the multiply forms
  // mul/mul_i24/mul_u24/s_mul_*_pseudo x, Size.
  if (isPowerOf2_32(Value: Size) && Off.getOpcode() == ISD::SHL) {
    if (auto *C = dyn_cast<ConstantSDNode>(Val: Off.getOperand(i: 1)))
      ScaleOffset = C->getZExtValue() == Log2_32(Value: Size);
  } else if (Offset.getOpcode() == ISD::MUL ||
             (IsSigned && Offset.getOpcode() == AMDGPUISD::MUL_I24) ||
             Offset.getOpcode() == AMDGPUISD::MUL_U24 ||
             (Offset.isMachineOpcode() &&
              Offset.getMachineOpcode() ==
                  (IsSigned ? AMDGPU::S_MUL_I64_I32_PSEUDO
                            : AMDGPU::S_MUL_U64_U32_PSEUDO))) {
    if (auto *C = dyn_cast<ConstantSDNode>(Val: Offset.getOperand(i: 1)))
      ScaleOffset = C->getZExtValue() == Size;
  }

  // Strip the scaling operation so the hardware applies it instead.
  if (ScaleOffset)
    Offset = Off.getOperand(i: 0);

  return ScaleOffset;
}
2421
// Match an immediate (if Offset is not null) or an SGPR (if SOffset is
// not null) offset. If Imm32Only is true, match only 32-bit immediate
// offsets available on CI.
bool AMDGPUDAGToDAGISel::SelectSMRDOffset(SDNode *N, SDValue ByteOffsetNode,
                                          SDValue *SOffset, SDValue *Offset,
                                          bool Imm32Only, bool IsBuffer,
                                          bool HasSOffset, int64_t ImmOffset,
                                          bool *ScaleOffset) const {
  assert((!SOffset || !Offset) &&
         "Cannot match both soffset and offset at the same time!");

  if (ScaleOffset) {
    assert(N && SOffset);

    *ScaleOffset = SelectScaleOffset(N, Offset&: ByteOffsetNode, IsSigned: false /* IsSigned */);
  }

  ConstantSDNode *C = dyn_cast<ConstantSDNode>(Val&: ByteOffsetNode);
  if (!C) {
    // Non-constant offsets can only be matched as an SGPR offset.
    if (!SOffset)
      return false;

    if (ByteOffsetNode.getValueType().isScalarInteger() &&
        ByteOffsetNode.getValueType().getSizeInBits() == 32) {
      *SOffset = ByteOffsetNode;
      return isSOffsetLegalWithImmOffset(SOffset, Imm32Only, IsBuffer,
                                         ImmOffset);
    }
    // Look through a zero-extend of a 32-bit value.
    if (ByteOffsetNode.getOpcode() == ISD::ZERO_EXTEND) {
      if (ByteOffsetNode.getOperand(i: 0).getValueType().getSizeInBits() == 32) {
        *SOffset = ByteOffsetNode.getOperand(i: 0);
        return isSOffsetLegalWithImmOffset(SOffset, Imm32Only, IsBuffer,
                                           ImmOffset);
      }
    }
    return false;
  }

  SDLoc SL(ByteOffsetNode);

  // GFX9 and GFX10 have signed byte immediate offsets. The immediate
  // offset for S_BUFFER instructions is unsigned.
  int64_t ByteOffset = IsBuffer ? C->getZExtValue() : C->getSExtValue();
  std::optional<int64_t> EncodedOffset = AMDGPU::getSMRDEncodedOffset(
      ST: *Subtarget, ByteOffset, IsBuffer, HasSOffset);
  if (EncodedOffset && Offset && !Imm32Only) {
    *Offset = CurDAG->getSignedTargetConstant(Val: *EncodedOffset, DL: SL, VT: MVT::i32);
    return true;
  }

  // SGPR and literal offsets are unsigned.
  if (ByteOffset < 0)
    return false;

  // CI-only 32-bit literal offset encoding.
  EncodedOffset = AMDGPU::getSMRDEncodedLiteralOffset32(ST: *Subtarget, ByteOffset);
  if (EncodedOffset && Offset && Imm32Only) {
    *Offset = CurDAG->getTargetConstant(Val: *EncodedOffset, DL: SL, VT: MVT::i32);
    return true;
  }

  if (!isUInt<32>(x: ByteOffset) && !isInt<32>(x: ByteOffset))
    return false;

  // As a last resort, materialize the constant into an SGPR and use it as
  // the scalar offset.
  if (SOffset) {
    SDValue C32Bit = CurDAG->getTargetConstant(Val: ByteOffset, DL: SL, VT: MVT::i32);
    *SOffset = SDValue(
        CurDAG->getMachineNode(Opcode: AMDGPU::S_MOV_B32, dl: SL, VT: MVT::i32, Op1: C32Bit), 0);
    return true;
  }

  return false;
}
2494
2495SDValue AMDGPUDAGToDAGISel::Expand32BitAddress(SDValue Addr) const {
2496 if (Addr.getValueType() != MVT::i32)
2497 return Addr;
2498
2499 // Zero-extend a 32-bit address.
2500 SDLoc SL(Addr);
2501
2502 const MachineFunction &MF = CurDAG->getMachineFunction();
2503 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
2504 unsigned AddrHiVal = Info->get32BitAddressHighBits();
2505 SDValue AddrHi = CurDAG->getTargetConstant(Val: AddrHiVal, DL: SL, VT: MVT::i32);
2506
2507 const SDValue Ops[] = {
2508 CurDAG->getTargetConstant(Val: AMDGPU::SReg_64_XEXECRegClassID, DL: SL, VT: MVT::i32),
2509 Addr,
2510 CurDAG->getTargetConstant(Val: AMDGPU::sub0, DL: SL, VT: MVT::i32),
2511 SDValue(CurDAG->getMachineNode(Opcode: AMDGPU::S_MOV_B32, dl: SL, VT: MVT::i32, Op1: AddrHi),
2512 0),
2513 CurDAG->getTargetConstant(Val: AMDGPU::sub1, DL: SL, VT: MVT::i32),
2514 };
2515
2516 return SDValue(CurDAG->getMachineNode(Opcode: AMDGPU::REG_SEQUENCE, dl: SL, VT: MVT::i64,
2517 Ops), 0);
2518}
2519
// Match a base and an immediate (if Offset is not null) or an SGPR (if
// SOffset is not null) or an immediate+SGPR offset. If Imm32Only is
// true, match only 32-bit immediate offsets available on CI.
bool AMDGPUDAGToDAGISel::SelectSMRDBaseOffset(SDNode *N, SDValue Addr,
                                              SDValue &SBase, SDValue *SOffset,
                                              SDValue *Offset, bool Imm32Only,
                                              bool IsBuffer, bool HasSOffset,
                                              int64_t ImmOffset,
                                              bool *ScaleOffset) const {
  if (SOffset && Offset) {
    assert(!Imm32Only && !IsBuffer);
    SDValue B;

    // Combined imm+SGPR form: first peel the immediate off, then match the
    // SGPR offset on the remaining base.
    if (!SelectSMRDBaseOffset(N, Addr, SBase&: B, SOffset: nullptr, Offset, Imm32Only: false, IsBuffer: false, HasSOffset: true))
      return false;

    int64_t ImmOff = 0;
    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Val&: *Offset))
      ImmOff = C->getSExtValue();

    return SelectSMRDBaseOffset(N, Addr: B, SBase, SOffset, Offset: nullptr, Imm32Only: false, IsBuffer: false,
                                HasSOffset: true, ImmOffset: ImmOff, ScaleOffset);
  }

  // A 32-bit (address + offset) should not cause unsigned 32-bit integer
  // wraparound, because s_load instructions perform the addition in 64 bits.
  if (Addr.getValueType() == MVT::i32 && Addr.getOpcode() == ISD::ADD &&
      !Addr->getFlags().hasNoUnsignedWrap())
    return false;

  SDValue N0, N1;
  // Extract the base and offset if possible.
  if (Addr->isAnyAdd() || CurDAG->isADDLike(Op: Addr)) {
    N0 = Addr.getOperand(i: 0);
    N1 = Addr.getOperand(i: 1);
  } else if (getBaseWithOffsetUsingSplitOR(DAG&: *CurDAG, Addr, N0, N1)) {
    assert(N0 && N1 && isa<ConstantSDNode>(N1));
  }
  if (!N0 || !N1)
    return false;

  // Addition is commutative: try either operand as the offset.
  if (SelectSMRDOffset(N, ByteOffsetNode: N1, SOffset, Offset, Imm32Only, IsBuffer, HasSOffset,
                       ImmOffset, ScaleOffset)) {
    SBase = N0;
    return true;
  }
  if (SelectSMRDOffset(N, ByteOffsetNode: N0, SOffset, Offset, Imm32Only, IsBuffer, HasSOffset,
                       ImmOffset, ScaleOffset)) {
    SBase = N1;
    return true;
  }
  return false;
}
2573
/// Match an SMRD address, widening a 32-bit base to the 64 bits the
/// instruction requires.
///
/// First tries a full base+offset decomposition; failing that, a bare 32-bit
/// address is accepted with a zero immediate offset when an immediate form
/// was requested.
bool AMDGPUDAGToDAGISel::SelectSMRD(SDNode *N, SDValue Addr, SDValue &SBase,
                                    SDValue *SOffset, SDValue *Offset,
                                    bool Imm32Only, bool *ScaleOffset) const {
  if (SelectSMRDBaseOffset(N, Addr, SBase, SOffset, Offset, Imm32Only,
                           /* IsBuffer */ false, /* HasSOffset */ false,
                           /* ImmOffset */ 0, ScaleOffset)) {
    // The matched base may still be 32-bit; zero-extend it to 64 bits.
    SBase = Expand32BitAddress(Addr: SBase);
    return true;
  }

  // Fallback: use the whole 32-bit address as the base with offset 0.
  if (Addr.getValueType() == MVT::i32 && Offset && !SOffset) {
    SBase = Expand32BitAddress(Addr);
    *Offset = CurDAG->getTargetConstant(Val: 0, DL: SDLoc(Addr), VT: MVT::i32);
    return true;
  }

  return false;
}
2592
2593bool AMDGPUDAGToDAGISel::SelectSMRDImm(SDValue Addr, SDValue &SBase,
2594 SDValue &Offset) const {
2595 return SelectSMRD(/* N */ nullptr, Addr, SBase, /* SOffset */ nullptr,
2596 Offset: &Offset);
2597}
2598
2599bool AMDGPUDAGToDAGISel::SelectSMRDImm32(SDValue Addr, SDValue &SBase,
2600 SDValue &Offset) const {
2601 assert(Subtarget->getGeneration() == AMDGPUSubtarget::SEA_ISLANDS);
2602 return SelectSMRD(/* N */ nullptr, Addr, SBase, /* SOffset */ nullptr,
2603 Offset: &Offset, /* Imm32Only */ true);
2604}
2605
2606bool AMDGPUDAGToDAGISel::SelectSMRDSgpr(SDNode *N, SDValue Addr, SDValue &SBase,
2607 SDValue &SOffset, SDValue &CPol) const {
2608 bool ScaleOffset;
2609 if (!SelectSMRD(N, Addr, SBase, SOffset: &SOffset, /* Offset */ nullptr,
2610 /* Imm32Only */ false, ScaleOffset: &ScaleOffset))
2611 return false;
2612
2613 CPol = CurDAG->getTargetConstant(Val: ScaleOffset ? AMDGPU::CPol::SCAL : 0,
2614 DL: SDLoc(N), VT: MVT::i32);
2615 return true;
2616}
2617
2618bool AMDGPUDAGToDAGISel::SelectSMRDSgprImm(SDNode *N, SDValue Addr,
2619 SDValue &SBase, SDValue &SOffset,
2620 SDValue &Offset,
2621 SDValue &CPol) const {
2622 bool ScaleOffset;
2623 if (!SelectSMRD(N, Addr, SBase, SOffset: &SOffset, Offset: &Offset, Imm32Only: false, ScaleOffset: &ScaleOffset))
2624 return false;
2625
2626 CPol = CurDAG->getTargetConstant(Val: ScaleOffset ? AMDGPU::CPol::SCAL : 0,
2627 DL: SDLoc(N), VT: MVT::i32);
2628 return true;
2629}
2630
2631bool AMDGPUDAGToDAGISel::SelectSMRDBufferImm(SDValue N, SDValue &Offset) const {
2632 return SelectSMRDOffset(/* N */ nullptr, ByteOffsetNode: N, /* SOffset */ nullptr, Offset: &Offset,
2633 /* Imm32Only */ false, /* IsBuffer */ true);
2634}
2635
2636bool AMDGPUDAGToDAGISel::SelectSMRDBufferImm32(SDValue N,
2637 SDValue &Offset) const {
2638 assert(Subtarget->getGeneration() == AMDGPUSubtarget::SEA_ISLANDS);
2639 return SelectSMRDOffset(/* N */ nullptr, ByteOffsetNode: N, /* SOffset */ nullptr, Offset: &Offset,
2640 /* Imm32Only */ true, /* IsBuffer */ true);
2641}
2642
2643bool AMDGPUDAGToDAGISel::SelectSMRDBufferSgprImm(SDValue N, SDValue &SOffset,
2644 SDValue &Offset) const {
2645 // Match the (soffset + offset) pair as a 32-bit register base and
2646 // an immediate offset.
2647 return N.getValueType() == MVT::i32 &&
2648 SelectSMRDBaseOffset(/* N */ nullptr, Addr: N, /* SBase */ SOffset,
2649 /* SOffset*/ nullptr, Offset: &Offset,
2650 /* Imm32Only */ false, /* IsBuffer */ true);
2651}
2652
/// Split an index expression into a base register and a constant offset for
/// MOVREL (indirect indexing) instructions.
///
/// \param Index  The index expression.
/// \param Base   [out] Variable base part.
/// \param Offset [out] Constant offset (0 if none was peeled off).
/// \return false only when the index is a plain constant (handled elsewhere).
bool AMDGPUDAGToDAGISel::SelectMOVRELOffset(SDValue Index,
                                            SDValue &Base,
                                            SDValue &Offset) const {
  SDLoc DL(Index);

  if (CurDAG->isBaseWithConstantOffset(Op: Index)) {
    SDValue N0 = Index.getOperand(i: 0);
    SDValue N1 = Index.getOperand(i: 1);
    ConstantSDNode *C1 = cast<ConstantSDNode>(Val&: N1);

    // (add n0, c0)
    // Don't peel off the offset (c0) if doing so could possibly lead
    // the base (n0) to be negative.
    // (or n0, |c0|) can never change a sign given isBaseWithConstantOffset.
    if (C1->getSExtValue() <= 0 || CurDAG->SignBitIsZero(Op: N0) ||
        (Index->getOpcode() == ISD::OR && C1->getSExtValue() >= 0)) {
      Base = N0;
      Offset = CurDAG->getTargetConstant(Val: C1->getZExtValue(), DL, VT: MVT::i32);
      return true;
    }
  }

  // A fully-constant index has no variable base to return.
  if (isa<ConstantSDNode>(Val: Index))
    return false;

  // No constant part could be peeled: use the whole index with offset 0.
  Base = Index;
  Offset = CurDAG->getTargetConstant(Val: 0, DL, VT: MVT::i32);
  return true;
}
2682
2683SDNode *AMDGPUDAGToDAGISel::getBFE32(bool IsSigned, const SDLoc &DL,
2684 SDValue Val, uint32_t Offset,
2685 uint32_t Width) {
2686 if (Val->isDivergent()) {
2687 unsigned Opcode = IsSigned ? AMDGPU::V_BFE_I32_e64 : AMDGPU::V_BFE_U32_e64;
2688 SDValue Off = CurDAG->getTargetConstant(Val: Offset, DL, VT: MVT::i32);
2689 SDValue W = CurDAG->getTargetConstant(Val: Width, DL, VT: MVT::i32);
2690
2691 return CurDAG->getMachineNode(Opcode, dl: DL, VT: MVT::i32, Op1: Val, Op2: Off, Op3: W);
2692 }
2693 unsigned Opcode = IsSigned ? AMDGPU::S_BFE_I32 : AMDGPU::S_BFE_U32;
2694 // Transformation function, pack the offset and width of a BFE into
2695 // the format expected by the S_BFE_I32 / S_BFE_U32. In the second
2696 // source, bits [5:0] contain the offset and bits [22:16] the width.
2697 uint32_t PackedVal = Offset | (Width << 16);
2698 SDValue PackedConst = CurDAG->getTargetConstant(Val: PackedVal, DL, VT: MVT::i32);
2699
2700 return CurDAG->getMachineNode(Opcode, dl: DL, VT: MVT::i32, Op1: Val, Op2: PackedConst);
2701}
2702
/// Turn a shift-left/shift-right pair into a bit-field extract when the shift
/// amounts satisfy the BFE predicate; otherwise fall through to tablegen.
void AMDGPUDAGToDAGISel::SelectS_BFEFromShifts(SDNode *N) {
  // "(a << b) srl c)" ---> "BFE_U32 a, (c-b), (32-c)
  // "(a << b) sra c)" ---> "BFE_I32 a, (c-b), (32-c)
  // Predicate: 0 < b <= c < 32

  const SDValue &Shl = N->getOperand(Num: 0);
  ConstantSDNode *B = dyn_cast<ConstantSDNode>(Val: Shl->getOperand(Num: 1));
  ConstantSDNode *C = dyn_cast<ConstantSDNode>(Val: N->getOperand(Num: 1));

  // Both shift amounts must be constants for the transform to apply.
  if (B && C) {
    uint32_t BVal = B->getZExtValue();
    uint32_t CVal = C->getZExtValue();

    if (0 < BVal && BVal <= CVal && CVal < 32) {
      // SRA produces a signed extract; SRL an unsigned one.
      bool Signed = N->getOpcode() == ISD::SRA;
      ReplaceNode(F: N, T: getBFE32(IsSigned: Signed, DL: SDLoc(N), Val: Shl.getOperand(i: 0), Offset: CVal - BVal,
                              Width: 32 - CVal));
      return;
    }
  }
  // Predicate not met: let the generated matcher handle it.
  SelectCode(N);
}
2725
/// Custom-select several shift/mask idioms as bit-field extracts (BFE).
/// Falls through to the generated matcher when no idiom applies.
void AMDGPUDAGToDAGISel::SelectS_BFE(SDNode *N) {
  switch (N->getOpcode()) {
  case ISD::AND:
    if (N->getOperand(Num: 0).getOpcode() == ISD::SRL) {
      // "(a srl b) & mask" ---> "BFE_U32 a, b, popcount(mask)"
      // Predicate: isMask(mask)
      const SDValue &Srl = N->getOperand(Num: 0);
      ConstantSDNode *Shift = dyn_cast<ConstantSDNode>(Val: Srl.getOperand(i: 1));
      ConstantSDNode *Mask = dyn_cast<ConstantSDNode>(Val: N->getOperand(Num: 1));

      if (Shift && Mask) {
        uint32_t ShiftVal = Shift->getZExtValue();
        uint32_t MaskVal = Mask->getZExtValue();

        // Only a contiguous low mask maps onto an extract width.
        if (isMask_32(Value: MaskVal)) {
          uint32_t WidthVal = llvm::popcount(Value: MaskVal);
          ReplaceNode(F: N, T: getBFE32(IsSigned: false, DL: SDLoc(N), Val: Srl.getOperand(i: 0), Offset: ShiftVal,
                                  Width: WidthVal));
          return;
        }
      }
    }
    break;
  case ISD::SRL:
    if (N->getOperand(Num: 0).getOpcode() == ISD::AND) {
      // "(a & mask) srl b)" ---> "BFE_U32 a, b, popcount(mask >> b)"
      // Predicate: isMask(mask >> b)
      const SDValue &And = N->getOperand(Num: 0);
      ConstantSDNode *Shift = dyn_cast<ConstantSDNode>(Val: N->getOperand(Num: 1));
      ConstantSDNode *Mask = dyn_cast<ConstantSDNode>(Val: And->getOperand(Num: 1));

      if (Shift && Mask) {
        uint32_t ShiftVal = Shift->getZExtValue();
        // Shift the mask down so the width check sees the surviving bits.
        uint32_t MaskVal = Mask->getZExtValue() >> ShiftVal;

        if (isMask_32(Value: MaskVal)) {
          uint32_t WidthVal = llvm::popcount(Value: MaskVal);
          ReplaceNode(F: N, T: getBFE32(IsSigned: false, DL: SDLoc(N), Val: And.getOperand(i: 0), Offset: ShiftVal,
                                  Width: WidthVal));
          return;
        }
      }
    } else if (N->getOperand(Num: 0).getOpcode() == ISD::SHL) {
      // (shl, srl) pair: handled by the shifts helper.
      SelectS_BFEFromShifts(N);
      return;
    }
    break;
  case ISD::SRA:
    if (N->getOperand(Num: 0).getOpcode() == ISD::SHL) {
      // (shl, sra) pair: signed variant of the shifts helper.
      SelectS_BFEFromShifts(N);
      return;
    }
    break;

  case ISD::SIGN_EXTEND_INREG: {
    // sext_inreg (srl x, 16), i8 -> bfe_i32 x, 16, 8
    SDValue Src = N->getOperand(Num: 0);
    if (Src.getOpcode() != ISD::SRL)
      break;

    // The shift amount must be a constant to become the extract offset.
    const ConstantSDNode *Amt = dyn_cast<ConstantSDNode>(Val: Src.getOperand(i: 1));
    if (!Amt)
      break;

    // The sext_inreg type gives the extract width.
    unsigned Width = cast<VTSDNode>(Val: N->getOperand(Num: 1))->getVT().getSizeInBits();
    ReplaceNode(F: N, T: getBFE32(IsSigned: true, DL: SDLoc(N), Val: Src.getOperand(i: 0),
                            Offset: Amt->getZExtValue(), Width));
    return;
  }
  }

  // No BFE idiom matched: use the generated matcher.
  SelectCode(N);
}
2799
2800bool AMDGPUDAGToDAGISel::isCBranchSCC(const SDNode *N) const {
2801 assert(N->getOpcode() == ISD::BRCOND);
2802 if (!N->hasOneUse())
2803 return false;
2804
2805 SDValue Cond = N->getOperand(Num: 1);
2806 if (Cond.getOpcode() == ISD::CopyToReg)
2807 Cond = Cond.getOperand(i: 2);
2808
2809 if (Cond.getOpcode() != ISD::SETCC || !Cond.hasOneUse())
2810 return false;
2811
2812 MVT VT = Cond.getOperand(i: 0).getSimpleValueType();
2813 if (VT == MVT::i32)
2814 return true;
2815
2816 if (VT == MVT::i64) {
2817 ISD::CondCode CC = cast<CondCodeSDNode>(Val: Cond.getOperand(i: 2))->get();
2818 return (CC == ISD::SETEQ || CC == ISD::SETNE) &&
2819 Subtarget->hasScalarCompareEq64();
2820 }
2821
2822 if ((VT == MVT::f16 || VT == MVT::f32) && Subtarget->hasSALUFloatInsts())
2823 return true;
2824
2825 return false;
2826}
2827
/// Try to replace an amdgcn.ballot-style AMDGPUISD::SETCC with the underlying
/// i1 condition it tests.
///
/// \param VCMP   The AMDGPUISD::SETCC node.
/// \param Negate [out] Set to true when the matched compare was SETEQ, meaning
///               the caller must invert the branch sense.
/// \return The i1 condition when the pattern matches, otherwise an empty
///         SDValue.
static SDValue combineBallotPattern(SDValue VCMP, bool &Negate) {
  assert(VCMP->getOpcode() == AMDGPUISD::SETCC);
  // Special case for amdgcn.ballot:
  // %Cond = i1 (and/or combination of i1 ISD::SETCCs)
  // %VCMP = i(WaveSize) AMDGPUISD::SETCC (ext %Cond), 0, setne/seteq
  // =>
  // Use i1 %Cond value instead of i(WaveSize) %VCMP.
  // This is possible because divergent ISD::SETCC is selected as V_CMP and
  // Cond becomes a i(WaveSize) full mask value.
  // Note that ballot doesn't use SETEQ condition but its easy to support it
  // here for completeness, so in this case Negate is set true on return.
  auto VCMP_CC = cast<CondCodeSDNode>(Val: VCMP.getOperand(i: 2))->get();
  if ((VCMP_CC == ISD::SETEQ || VCMP_CC == ISD::SETNE) &&
      isNullConstant(V: VCMP.getOperand(i: 1))) {

    auto Cond = VCMP.getOperand(i: 0);
    if (ISD::isExtOpcode(Opcode: Cond->getOpcode())) // Skip extension.
      Cond = Cond.getOperand(i: 0);

    // Only safe when the condition is known to be a wave-wide boolean SGPR.
    if (isBoolSGPR(V: Cond)) {
      Negate = VCMP_CC == ISD::SETEQ;
      return Cond;
    }
  }
  return SDValue();
}
2854
/// Select BRCOND into an SCC- or VCC-based conditional branch, including the
/// special handling needed for amdgcn.ballot feedback patterns.
void AMDGPUDAGToDAGISel::SelectBRCOND(SDNode *N) {
  SDValue Cond = N->getOperand(Num: 1);

  // An undef condition selects a pseudo whose behavior is unconstrained.
  if (Cond.isUndef()) {
    CurDAG->SelectNodeTo(N, MachineOpc: AMDGPU::SI_BR_UNDEF, VT: MVT::Other,
                         Op1: N->getOperand(Num: 2), Op2: N->getOperand(Num: 0));
    return;
  }

  const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();

  // Prefer the scalar branch only when both the compare and the branch
  // itself are uniform.
  bool UseSCCBr = isCBranchSCC(N) && isUniformBr(N);
  bool AndExec = !UseSCCBr;
  bool Negate = false;

  if (Cond.getOpcode() == ISD::SETCC &&
      Cond->getOperand(Num: 0)->getOpcode() == AMDGPUISD::SETCC) {
    SDValue VCMP = Cond->getOperand(Num: 0);
    auto CC = cast<CondCodeSDNode>(Val: Cond->getOperand(Num: 2))->get();
    if ((CC == ISD::SETEQ || CC == ISD::SETNE) &&
        isNullConstant(V: Cond->getOperand(Num: 1)) &&
        // We may encounter ballot.i64 in wave32 mode on -O0.
        VCMP.getValueType().getSizeInBits() == Subtarget->getWavefrontSize()) {
      // %VCMP = i(WaveSize) AMDGPUISD::SETCC ...
      // %C = i1 ISD::SETCC %VCMP, 0, setne/seteq
      // BRCOND i1 %C, %BB
      // =>
      // %VCMP = i(WaveSize) AMDGPUISD::SETCC ...
      // VCC = COPY i(WaveSize) %VCMP
      // S_CBRANCH_VCCNZ/VCCZ %BB
      Negate = CC == ISD::SETEQ;
      bool NegatedBallot = false;
      if (auto BallotCond = combineBallotPattern(VCMP, Negate&: NegatedBallot)) {
        Cond = BallotCond;
        UseSCCBr = !BallotCond->isDivergent();
        // Two independent negations may stack; XOR combines them.
        Negate = Negate ^ NegatedBallot;
      } else {
        // TODO: don't use SCC here assuming that AMDGPUISD::SETCC is always
        // selected as V_CMP, but this may change for uniform condition.
        Cond = VCMP;
        UseSCCBr = false;
      }
    }
    // Cond is either V_CMP resulted from AMDGPUISD::SETCC or a combination of
    // V_CMPs resulted from ballot or ballot has uniform condition and SCC is
    // used.
    AndExec = false;
  }

  unsigned BrOp =
      UseSCCBr ? (Negate ? AMDGPU::S_CBRANCH_SCC0 : AMDGPU::S_CBRANCH_SCC1)
               : (Negate ? AMDGPU::S_CBRANCH_VCCZ : AMDGPU::S_CBRANCH_VCCNZ);
  Register CondReg = UseSCCBr ? AMDGPU::SCC : TRI->getVCC();
  SDLoc SL(N);

  if (AndExec) {
    // This is the case that we are selecting to S_CBRANCH_VCCNZ. We have not
    // analyzed what generates the vcc value, so we do not know whether vcc
    // bits for disabled lanes are 0. Thus we need to mask out bits for
    // disabled lanes.
    //
    // For the case that we select S_CBRANCH_SCC1 and it gets
    // changed to S_CBRANCH_VCCNZ in SIFixSGPRCopies, SIFixSGPRCopies calls
    // SIInstrInfo::moveToVALU which inserts the S_AND).
    //
    // We could add an analysis of what generates the vcc value here and omit
    // the S_AND when is unnecessary. But it would be better to add a separate
    // pass after SIFixSGPRCopies to do the unnecessary S_AND removal, so it
    // catches both cases.
    Cond = SDValue(
        CurDAG->getMachineNode(
            Opcode: Subtarget->isWave32() ? AMDGPU::S_AND_B32 : AMDGPU::S_AND_B64, dl: SL,
            VT: MVT::i1,
            Op1: CurDAG->getRegister(Reg: Subtarget->isWave32() ? AMDGPU::EXEC_LO
                                                         : AMDGPU::EXEC,
                                 VT: MVT::i1),
            Op2: Cond),
        0);
  }

  // Copy the condition into SCC/VCC, then branch on that register.
  SDValue VCC = CurDAG->getCopyToReg(Chain: N->getOperand(Num: 0), dl: SL, Reg: CondReg, N: Cond);
  CurDAG->SelectNodeTo(N, MachineOpc: BrOp, VT: MVT::Other,
                       Op1: N->getOperand(Num: 2), // Basic Block
                       Op2: VCC.getValue(R: 0));
}
2940
/// Select a uniform f16->f32 extend of a dword's high half directly as
/// S_CVT_HI_F32_F16 when the subtarget has scalar float instructions;
/// otherwise defer to the generated matcher.
void AMDGPUDAGToDAGISel::SelectFP_EXTEND(SDNode *N) {
  if (Subtarget->hasSALUFloatInsts() && N->getValueType(ResNo: 0) == MVT::f32 &&
      !N->isDivergent()) {
    SDValue Src = N->getOperand(Num: 0);
    if (Src.getValueType() == MVT::f16) {
      // If the source is the high 16 bits of a dword, isExtractHiElt rewrites
      // Src to the containing 32-bit value in place.
      if (isExtractHiElt(In: Src, Out&: Src)) {
        CurDAG->SelectNodeTo(N, MachineOpc: AMDGPU::S_CVT_HI_F32_F16, VTs: N->getVTList(),
                             Ops: {Src});
        return;
      }
    }
  }

  SelectCode(N);
}
2956
/// Select ds_append/ds_consume intrinsics, splitting the pointer into an M0
/// base (copied via glue) and an immediate instruction offset.
void AMDGPUDAGToDAGISel::SelectDSAppendConsume(SDNode *N, unsigned IntrID) {
  // The address is assumed to be uniform, so if it ends up in a VGPR, it will
  // be copied to an SGPR with readfirstlane.
  unsigned Opc = IntrID == Intrinsic::amdgcn_ds_append ?
    AMDGPU::DS_APPEND : AMDGPU::DS_CONSUME;

  SDValue Chain = N->getOperand(Num: 0);
  SDValue Ptr = N->getOperand(Num: 2);
  MemIntrinsicSDNode *M = cast<MemIntrinsicSDNode>(Val: N);
  MachineMemOperand *MMO = M->getMemOperand();
  // GDS (region address space) accesses set the instruction's gds bit.
  bool IsGDS = M->getAddressSpace() == AMDGPUAS::REGION_ADDRESS;

  SDValue Offset;
  // Try to fold a constant part of the pointer into the offset field,
  // leaving only the base in M0.
  if (CurDAG->isBaseWithConstantOffset(Op: Ptr)) {
    SDValue PtrBase = Ptr.getOperand(i: 0);
    SDValue PtrOffset = Ptr.getOperand(i: 1);

    const APInt &OffsetVal = PtrOffset->getAsAPIntVal();
    if (isDSOffsetLegal(Base: PtrBase, Offset: OffsetVal.getZExtValue())) {
      N = glueCopyToM0(N, Val: PtrBase);
      Offset = CurDAG->getTargetConstant(Val: OffsetVal, DL: SDLoc(), VT: MVT::i32);
    }
  }

  // No foldable offset: the whole pointer goes into M0, offset is 0.
  if (!Offset) {
    N = glueCopyToM0(N, Val: Ptr);
    Offset = CurDAG->getTargetConstant(Val: 0, DL: SDLoc(), VT: MVT::i32);
  }

  SDValue Ops[] = {
    Offset,
    CurDAG->getTargetConstant(Val: IsGDS, DL: SDLoc(), VT: MVT::i32),
    Chain,
    N->getOperand(Num: N->getNumOperands() - 1) // New glue
  };

  SDNode *Selected = CurDAG->SelectNodeTo(N, MachineOpc: Opc, VTs: N->getVTList(), Ops);
  // Preserve the memory operand so later passes see the correct aliasing info.
  CurDAG->setNodeMemRefs(N: cast<MachineSDNode>(Val: Selected), NewMemRefs: {MMO});
}
2996
2997// We need to handle this here because tablegen doesn't support matching
2998// instructions with multiple outputs.
2999void AMDGPUDAGToDAGISel::SelectDSBvhStackIntrinsic(SDNode *N, unsigned IntrID) {
3000 unsigned Opc;
3001 switch (IntrID) {
3002 case Intrinsic::amdgcn_ds_bvh_stack_rtn:
3003 case Intrinsic::amdgcn_ds_bvh_stack_push4_pop1_rtn:
3004 Opc = AMDGPU::DS_BVH_STACK_RTN_B32;
3005 break;
3006 case Intrinsic::amdgcn_ds_bvh_stack_push8_pop1_rtn:
3007 Opc = AMDGPU::DS_BVH_STACK_PUSH8_POP1_RTN_B32;
3008 break;
3009 case Intrinsic::amdgcn_ds_bvh_stack_push8_pop2_rtn:
3010 Opc = AMDGPU::DS_BVH_STACK_PUSH8_POP2_RTN_B64;
3011 break;
3012 }
3013 SDValue Ops[] = {N->getOperand(Num: 2), N->getOperand(Num: 3), N->getOperand(Num: 4),
3014 N->getOperand(Num: 5), N->getOperand(Num: 0)};
3015
3016 MemIntrinsicSDNode *M = cast<MemIntrinsicSDNode>(Val: N);
3017 MachineMemOperand *MMO = M->getMemOperand();
3018 SDNode *Selected = CurDAG->SelectNodeTo(N, MachineOpc: Opc, VTs: N->getVTList(), Ops);
3019 CurDAG->setNodeMemRefs(N: cast<MachineSDNode>(Val: Selected), NewMemRefs: {MMO});
3020}
3021
/// Select tensor_load_to_lds / tensor_store_from_lds intrinsics, choosing the
/// two-group (_d2) instruction form when descriptor groups 2 and 3 are all
/// zeros.
void AMDGPUDAGToDAGISel::SelectTensorLoadStore(SDNode *N, unsigned IntrID) {
  bool IsLoad = IntrID == Intrinsic::amdgcn_tensor_load_to_lds;
  unsigned Opc =
      IsLoad ? AMDGPU::TENSOR_LOAD_TO_LDS_d4 : AMDGPU::TENSOR_STORE_FROM_LDS_d4;

  SmallVector<SDValue, 7> TensorOps;
  // First two groups
  TensorOps.push_back(Elt: N->getOperand(Num: 2)); // D# group 0
  TensorOps.push_back(Elt: N->getOperand(Num: 3)); // D# group 1

  // Use _D2 version if both group 2 and 3 are zero-initialized.
  SDValue Group2 = N->getOperand(Num: 4);
  SDValue Group3 = N->getOperand(Num: 5);
  if (ISD::isBuildVectorAllZeros(N: Group2.getNode()) &&
      ISD::isBuildVectorAllZeros(N: Group3.getNode())) {
    Opc = IsLoad ? AMDGPU::TENSOR_LOAD_TO_LDS_d2
                 : AMDGPU::TENSOR_STORE_FROM_LDS_d2;
  } else { // Has at least 4 groups
    TensorOps.push_back(Elt: Group2); // D# group 2
    TensorOps.push_back(Elt: Group3); // D# group 3
  }

  // TODO: Handle the fifth group: N->getOperand(6), which is silently ignored
  // for now because all existing targets only support up to 4 groups.
  TensorOps.push_back(Elt: CurDAG->getTargetConstant(Val: 0, DL: SDLoc(N), VT: MVT::i1)); // r128
  TensorOps.push_back(Elt: N->getOperand(Num: 7)); // cache policy
  TensorOps.push_back(Elt: N->getOperand(Num: 0)); // chain

  (void)CurDAG->SelectNodeTo(N, MachineOpc: Opc, VT: MVT::Other, Ops: TensorOps);
}
3052
3053static unsigned gwsIntrinToOpcode(unsigned IntrID) {
3054 switch (IntrID) {
3055 case Intrinsic::amdgcn_ds_gws_init:
3056 return AMDGPU::DS_GWS_INIT;
3057 case Intrinsic::amdgcn_ds_gws_barrier:
3058 return AMDGPU::DS_GWS_BARRIER;
3059 case Intrinsic::amdgcn_ds_gws_sema_v:
3060 return AMDGPU::DS_GWS_SEMA_V;
3061 case Intrinsic::amdgcn_ds_gws_sema_br:
3062 return AMDGPU::DS_GWS_SEMA_BR;
3063 case Intrinsic::amdgcn_ds_gws_sema_p:
3064 return AMDGPU::DS_GWS_SEMA_P;
3065 case Intrinsic::amdgcn_ds_gws_sema_release_all:
3066 return AMDGPU::DS_GWS_SEMA_RELEASE_ALL;
3067 default:
3068 llvm_unreachable("not a gws intrinsic");
3069 }
3070}
3071
/// Select the ds_gws_* intrinsics: split the offset into an M0 base and an
/// immediate field, and adjust the data operand's register class if needed.
void AMDGPUDAGToDAGISel::SelectDS_GWS(SDNode *N, unsigned IntrID) {
  // Reject when the subtarget lacks GWS (or sema_release_all support).
  if (!Subtarget->hasGWS() ||
      (IntrID == Intrinsic::amdgcn_ds_gws_sema_release_all &&
       !Subtarget->hasGWSSemaReleaseAll())) {
    // Let this error.
    SelectCode(N);
    return;
  }

  // Chain, intrinsic ID, vsrc, offset
  const bool HasVSrc = N->getNumOperands() == 4;
  assert(HasVSrc || N->getNumOperands() == 3);

  SDLoc SL(N);
  SDValue BaseOffset = N->getOperand(Num: HasVSrc ? 3 : 2);
  int ImmOffset = 0;
  MemIntrinsicSDNode *M = cast<MemIntrinsicSDNode>(Val: N);
  MachineMemOperand *MMO = M->getMemOperand();

  // Don't worry if the offset ends up in a VGPR. Only one lane will have
  // effect, so SIFixSGPRCopies will validly insert readfirstlane.

  // The resource id offset is computed as (<isa opaque base> + M0[21:16] +
  // offset field) % 64. Some versions of the programming guide omit the m0
  // part, or claim it's from offset 0.
  if (ConstantSDNode *ConstOffset = dyn_cast<ConstantSDNode>(Val&: BaseOffset)) {
    // If we have a constant offset, try to use the 0 in m0 as the base.
    // TODO: Look into changing the default m0 initialization value. If the
    // default -1 only set the low 16-bits, we could leave it as-is and add 1 to
    // the immediate offset.
    glueCopyToM0(N, Val: CurDAG->getTargetConstant(Val: 0, DL: SL, VT: MVT::i32));
    ImmOffset = ConstOffset->getZExtValue();
  } else {
    // Peel a constant addend off the variable offset into the immediate field.
    if (CurDAG->isBaseWithConstantOffset(Op: BaseOffset)) {
      ImmOffset = BaseOffset.getConstantOperandVal(i: 1);
      BaseOffset = BaseOffset.getOperand(i: 0);
    }

    // Prefer to do the shift in an SGPR since it should be possible to use m0
    // as the result directly. If it's already an SGPR, it will be eliminated
    // later.
    SDNode *SGPROffset
      = CurDAG->getMachineNode(Opcode: AMDGPU::V_READFIRSTLANE_B32, dl: SL, VT: MVT::i32,
                               Op1: BaseOffset);
    // Shift to offset in m0
    SDNode *M0Base
      = CurDAG->getMachineNode(Opcode: AMDGPU::S_LSHL_B32, dl: SL, VT: MVT::i32,
                               Op1: SDValue(SGPROffset, 0),
                               Op2: CurDAG->getTargetConstant(Val: 16, DL: SL, VT: MVT::i32));
    glueCopyToM0(N, Val: SDValue(M0Base, 0));
  }

  SDValue Chain = N->getOperand(Num: 0);
  SDValue OffsetField = CurDAG->getTargetConstant(Val: ImmOffset, DL: SL, VT: MVT::i32);

  const unsigned Opc = gwsIntrinToOpcode(IntrID);

  const MCInstrDesc &InstrDesc = TII->get(Opcode: Opc);
  int Data0Idx = AMDGPU::getNamedOperandIdx(Opcode: Opc, Name: AMDGPU::OpName::data0);

  // The register class the instruction expects for its data operand.
  const TargetRegisterClass *DataRC = TII->getRegClass(MCID: InstrDesc, OpNum: Data0Idx);

  SmallVector<SDValue, 5> Ops;
  if (HasVSrc) {
    const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();

    SDValue Data = N->getOperand(Num: 2);
    MVT DataVT = Data.getValueType().getSimpleVT();
    if (TRI->isTypeLegalForClass(RC: *DataRC, T: DataVT)) {
      // Normal 32-bit case.
      Ops.push_back(Elt: N->getOperand(Num: 2));
    } else {
      // Operand is really 32-bits, but requires 64-bit alignment, so use the
      // even aligned 64-bit register class.
      const SDValue RegSeqOps[] = {
          CurDAG->getTargetConstant(Val: DataRC->getID(), DL: SL, VT: MVT::i32), Data,
          CurDAG->getTargetConstant(Val: AMDGPU::sub0, DL: SL, VT: MVT::i32),
          SDValue(
              CurDAG->getMachineNode(Opcode: TargetOpcode::IMPLICIT_DEF, dl: SL, VT: MVT::i32),
              0),
          CurDAG->getTargetConstant(Val: AMDGPU::sub1, DL: SL, VT: MVT::i32)};

      Ops.push_back(Elt: SDValue(CurDAG->getMachineNode(Opcode: TargetOpcode::REG_SEQUENCE,
                                                    dl: SL, VT: MVT::v2i32, Ops: RegSeqOps),
                             0));
    }
  }

  Ops.push_back(Elt: OffsetField);
  Ops.push_back(Elt: Chain);

  SDNode *Selected = CurDAG->SelectNodeTo(N, MachineOpc: Opc, VTs: N->getVTList(), Ops);
  // Preserve the memory operand on the selected machine node.
  CurDAG->setNodeMemRefs(N: cast<MachineSDNode>(Val: Selected), NewMemRefs: {MMO});
}
3166
/// Select amdgcn.interp.p1.f16 for 16-bank-LDS subtargets, which need a
/// two-instruction sequence sharing M0 via explicit glue.
void AMDGPUDAGToDAGISel::SelectInterpP1F16(SDNode *N) {
  if (Subtarget->getLDSBankCount() != 16) {
    // This is a single instruction with a pattern.
    SelectCode(N);
    return;
  }

  SDLoc DL(N);

  // This requires 2 instructions. It is possible to write a pattern to support
  // this, but the generated isel emitter doesn't correctly deal with multiple
  // output instructions using the same physical register input. The copy to m0
  // is incorrectly placed before the second instruction.
  //
  // TODO: Match source modifiers.
  //
  // def : Pat <
  //   (int_amdgcn_interp_p1_f16
  //                             (VOP3Mods f32:$src0, i32:$src0_modifiers),
  //                                       (i32 timm:$attrchan), (i32 timm:$attr),
  //                                       (i1 timm:$high), M0),
  //   (V_INTERP_P1LV_F16 $src0_modifiers, VGPR_32:$src0, timm:$attr,
  //       timm:$attrchan, 0,
  //       (V_INTERP_MOV_F32 2, timm:$attr, timm:$attrchan), timm:$high)> {
  //   let Predicates = [has16BankLDS];
  // }

  // 16 bank LDS
  // Copy operand 5 into M0; the glue result threads M0 through both
  // interpolation instructions below.
  SDValue ToM0 = CurDAG->getCopyToReg(Chain: CurDAG->getEntryNode(), dl: DL, Reg: AMDGPU::M0,
                                      N: N->getOperand(Num: 5), Glue: SDValue());

  SDVTList VTs = CurDAG->getVTList(VT1: MVT::f32, VT2: MVT::Other);

  SDNode *InterpMov =
    CurDAG->getMachineNode(Opcode: AMDGPU::V_INTERP_MOV_F32, dl: DL, VTs, Ops: {
        CurDAG->getTargetConstant(Val: 2, DL, VT: MVT::i32), // P0
        N->getOperand(Num: 3),  // Attr
        N->getOperand(Num: 2),  // Attrchan
        ToM0.getValue(R: 1) // In glue
  });

  SDNode *InterpP1LV =
    CurDAG->getMachineNode(Opcode: AMDGPU::V_INTERP_P1LV_F16, dl: DL, VT: MVT::f32, Ops: {
        CurDAG->getTargetConstant(Val: 0, DL, VT: MVT::i32), // $src0_modifiers
        N->getOperand(Num: 1), // Src0
        N->getOperand(Num: 3), // Attr
        N->getOperand(Num: 2), // Attrchan
        CurDAG->getTargetConstant(Val: 0, DL, VT: MVT::i32), // $src2_modifiers
        SDValue(InterpMov, 0), // Src2 - holds two f16 values selected by high
        N->getOperand(Num: 4), // high
        CurDAG->getTargetConstant(Val: 0, DL, VT: MVT::i1), // $clamp
        CurDAG->getTargetConstant(Val: 0, DL, VT: MVT::i32), // $omod
        SDValue(InterpMov, 1)
  });

  CurDAG->ReplaceAllUsesOfValueWith(From: SDValue(N, 0), To: SDValue(InterpP1LV, 0));
}
3224
3225void AMDGPUDAGToDAGISel::SelectINTRINSIC_W_CHAIN(SDNode *N) {
3226 unsigned IntrID = N->getConstantOperandVal(Num: 1);
3227 switch (IntrID) {
3228 case Intrinsic::amdgcn_ds_append:
3229 case Intrinsic::amdgcn_ds_consume: {
3230 if (N->getValueType(ResNo: 0) != MVT::i32)
3231 break;
3232 SelectDSAppendConsume(N, IntrID);
3233 return;
3234 }
3235 case Intrinsic::amdgcn_ds_bvh_stack_rtn:
3236 case Intrinsic::amdgcn_ds_bvh_stack_push4_pop1_rtn:
3237 case Intrinsic::amdgcn_ds_bvh_stack_push8_pop1_rtn:
3238 case Intrinsic::amdgcn_ds_bvh_stack_push8_pop2_rtn:
3239 SelectDSBvhStackIntrinsic(N, IntrID);
3240 return;
3241 case Intrinsic::amdgcn_init_whole_wave:
3242 CurDAG->getMachineFunction()
3243 .getInfo<SIMachineFunctionInfo>()
3244 ->setInitWholeWave();
3245 break;
3246 }
3247
3248 SelectCode(N);
3249}
3250
/// Select chainless intrinsics, handling the convergence-control glue operand
/// and a handful of wave-mode / permlane intrinsics manually.
void AMDGPUDAGToDAGISel::SelectINTRINSIC_WO_CHAIN(SDNode *N) {
  unsigned IntrID = N->getConstantOperandVal(Num: 0);
  unsigned Opcode = AMDGPU::INSTRUCTION_LIST_END;
  SDNode *ConvGlueNode = N->getGluedNode();
  if (ConvGlueNode) {
    // FIXME: Possibly iterate over multiple glue nodes?
    assert(ConvGlueNode->getOpcode() == ISD::CONVERGENCECTRL_GLUE);
    // Re-wrap the glued token in a machine-level CONVERGENCECTRL_GLUE so it
    // survives selection.
    ConvGlueNode = ConvGlueNode->getOperand(Num: 0).getNode();
    ConvGlueNode =
        CurDAG->getMachineNode(Opcode: TargetOpcode::CONVERGENCECTRL_GLUE, dl: {},
                               VT: MVT::Glue, Op1: SDValue(ConvGlueNode, 0));
  } else {
    ConvGlueNode = nullptr;
  }
  switch (IntrID) {
  case Intrinsic::amdgcn_wqm:
    Opcode = AMDGPU::WQM;
    break;
  case Intrinsic::amdgcn_softwqm:
    Opcode = AMDGPU::SOFT_WQM;
    break;
  case Intrinsic::amdgcn_wwm:
  case Intrinsic::amdgcn_strict_wwm:
    Opcode = AMDGPU::STRICT_WWM;
    break;
  case Intrinsic::amdgcn_strict_wqm:
    Opcode = AMDGPU::STRICT_WQM;
    break;
  case Intrinsic::amdgcn_interp_p1_f16:
    SelectInterpP1F16(N);
    return;
  case Intrinsic::amdgcn_permlane16_swap:
  case Intrinsic::amdgcn_permlane32_swap: {
    // Reject on subtargets that lack the corresponding instruction.
    if ((IntrID == Intrinsic::amdgcn_permlane16_swap &&
         !Subtarget->hasPermlane16Swap()) ||
        (IntrID == Intrinsic::amdgcn_permlane32_swap &&
         !Subtarget->hasPermlane32Swap())) {
      SelectCode(N); // Hit the default error
      return;
    }

    Opcode = IntrID == Intrinsic::amdgcn_permlane16_swap
                 ? AMDGPU::V_PERMLANE16_SWAP_B32_e64
                 : AMDGPU::V_PERMLANE32_SWAP_B32_e64;

    // Drop the intrinsic-ID operand; keep the remaining operands as-is.
    SmallVector<SDValue, 4> NewOps(N->op_begin() + 1, N->op_end());
    if (ConvGlueNode)
      NewOps.push_back(Elt: SDValue(ConvGlueNode, 0));

    // Rewrite the boolean "fi" operand into the DPP fetch-invalid encoding.
    bool FI = N->getConstantOperandVal(Num: 3);
    NewOps[2] = CurDAG->getTargetConstant(
        Val: FI ? AMDGPU::DPP::DPP_FI_1 : AMDGPU::DPP::DPP_FI_0, DL: SDLoc(), VT: MVT::i32);

    CurDAG->SelectNodeTo(N, MachineOpc: Opcode, VTs: N->getVTList(), Ops: NewOps);
    return;
  }
  default:
    SelectCode(N);
    break;
  }

  // WQM/WWM-style pseudos: single source operand.
  if (Opcode != AMDGPU::INSTRUCTION_LIST_END) {
    SDValue Src = N->getOperand(Num: 1);
    CurDAG->SelectNodeTo(N, MachineOpc: Opcode, VTs: N->getVTList(), Ops: {Src});
  }

  // Re-attach the convergence glue to whatever node N became.
  if (ConvGlueNode) {
    SmallVector<SDValue, 4> NewOps(N->ops());
    NewOps.push_back(Elt: SDValue(ConvGlueNode, 0));
    CurDAG->MorphNodeTo(N, Opc: N->getOpcode(), VTs: N->getVTList(), Ops: NewOps);
  }
}
3323
3324void AMDGPUDAGToDAGISel::SelectINTRINSIC_VOID(SDNode *N) {
3325 unsigned IntrID = N->getConstantOperandVal(Num: 1);
3326 switch (IntrID) {
3327 case Intrinsic::amdgcn_ds_gws_init:
3328 case Intrinsic::amdgcn_ds_gws_barrier:
3329 case Intrinsic::amdgcn_ds_gws_sema_v:
3330 case Intrinsic::amdgcn_ds_gws_sema_br:
3331 case Intrinsic::amdgcn_ds_gws_sema_p:
3332 case Intrinsic::amdgcn_ds_gws_sema_release_all:
3333 SelectDS_GWS(N, IntrID);
3334 return;
3335 case Intrinsic::amdgcn_tensor_load_to_lds:
3336 case Intrinsic::amdgcn_tensor_store_from_lds:
3337 SelectTensorLoadStore(N, IntrID);
3338 return;
3339 default:
3340 break;
3341 }
3342
3343 SelectCode(N);
3344}
3345
3346void AMDGPUDAGToDAGISel::SelectWAVE_ADDRESS(SDNode *N) {
3347 SDValue Log2WaveSize =
3348 CurDAG->getTargetConstant(Val: Subtarget->getWavefrontSizeLog2(), DL: SDLoc(N), VT: MVT::i32);
3349 CurDAG->SelectNodeTo(N, MachineOpc: AMDGPU::S_LSHR_B32, VTs: N->getVTList(),
3350 Ops: {N->getOperand(Num: 0), Log2WaveSize});
3351}
3352
/// Select STACKRESTORE: convert the saved value back into the wave-scaled
/// form the stack pointer register expects, then copy it into SP.
void AMDGPUDAGToDAGISel::SelectSTACKRESTORE(SDNode *N) {
  SDValue SrcVal = N->getOperand(Num: 1);
  if (SrcVal.getValueType() != MVT::i32) {
    SelectCode(N); // Emit default error
    return;
  }

  SDValue CopyVal;
  Register SP = TLI->getStackPointerRegisterToSaveRestore();
  SDLoc SL(N);

  if (SrcVal.getOpcode() == AMDGPUISD::WAVE_ADDRESS) {
    // The value is already the unscaled SP: strip the WAVE_ADDRESS wrapper
    // instead of shifting twice.
    CopyVal = SrcVal.getOperand(i: 0);
  } else {
    SDValue Log2WaveSize = CurDAG->getTargetConstant(
        Val: Subtarget->getWavefrontSizeLog2(), DL: SL, VT: MVT::i32);

    // SP is scalar; a divergent source must be made uniform first.
    if (N->isDivergent()) {
      SrcVal = SDValue(CurDAG->getMachineNode(Opcode: AMDGPU::V_READFIRSTLANE_B32, dl: SL,
                                              VT: MVT::i32, Op1: SrcVal),
                       0);
    }

    // Scale the byte value up to the wave-sized SP representation.
    CopyVal = SDValue(CurDAG->getMachineNode(Opcode: AMDGPU::S_LSHL_B32, dl: SL, VT: MVT::i32,
                                             Ops: {SrcVal, Log2WaveSize}),
                      0);
  }

  SDValue CopyToSP = CurDAG->getCopyToReg(Chain: N->getOperand(Num: 0), dl: SL, Reg: SP, N: CopyVal);
  CurDAG->ReplaceAllUsesOfValueWith(From: SDValue(N, 0), To: CopyToSP);
}
3384
// Match VOP3 source modifiers on \p In, returning the stripped source in
// \p Src and the accumulated SISrcMods bits in \p Mods. \p IsCanonicalizing
// means the consuming instruction implicitly canonicalizes its source, which
// permits folding fsub-from-[+-]0 as fneg; \p AllowAbs permits matching the
// abs modifier. Always returns true: the fallback is Mods == NONE with
// Src == In.
bool AMDGPUDAGToDAGISel::SelectVOP3ModsImpl(SDValue In, SDValue &Src,
                                            unsigned &Mods,
                                            bool IsCanonicalizing,
                                            bool AllowAbs) const {
  Mods = SISrcMods::NONE;
  Src = In;

  if (Src.getOpcode() == ISD::FNEG) {
    Mods |= SISrcMods::NEG;
    Src = Src.getOperand(i: 0);
  } else if (Src.getOpcode() == ISD::FSUB && IsCanonicalizing) {
    // Fold fsub [+-]0 into fneg. This may not have folded depending on the
    // denormal mode, but we're implicitly canonicalizing in a source operand.
    auto *LHS = dyn_cast<ConstantFPSDNode>(Val: Src.getOperand(i: 0));
    if (LHS && LHS->isZero()) {
      Mods |= SISrcMods::NEG;
      Src = Src.getOperand(i: 1);
    }
  }

  if (AllowAbs && Src.getOpcode() == ISD::FABS) {
    Mods |= SISrcMods::ABS;
    Src = Src.getOperand(i: 0);
  }

  if (Mods != SISrcMods::NONE)
    return true;

  // Convert various sign-bit masks on integers to src mods. Currently disabled
  // for 16-bit types as the codegen replaces the operand without adding a
  // srcmod. This is intentionally finding the cases where we are performing
  // float neg and abs on int types, the goal is not to obtain two's complement
  // neg or abs. Limit converison to select operands via the nonCanonalizing
  // pattern.
  // TODO: Add 16-bit support.
  if (IsCanonicalizing)
    return true;

  // v2i32 xor/or/and are legal. A vselect using these instructions as operands
  // is scalarised into two selects with EXTRACT_VECTOR_ELT operands. Peek
  // through the extract to the bitwise op.
  SDValue PeekSrc =
      Src->getOpcode() == ISD::EXTRACT_VECTOR_ELT ? Src->getOperand(Num: 0) : Src;
  // Convert various sign-bit masks to src mods. Currently disabled for 16-bit
  // types as the codegen replaces the operand without adding a srcmod.
  // This is intentionally finding the cases where we are performing float neg
  // and abs on int types, the goal is not to obtain two's complement neg or
  // abs.
  // TODO: Add 16-bit support.
  unsigned Opc = PeekSrc.getOpcode();
  EVT VT = Src.getValueType();
  if ((Opc != ISD::AND && Opc != ISD::OR && Opc != ISD::XOR) ||
      (VT != MVT::i32 && VT != MVT::v2i32 && VT != MVT::i64))
    return true;

  ConstantSDNode *CRHS = isConstOrConstSplat(N: PeekSrc->getOperand(Num: 1));
  if (!CRHS)
    return true;

  // Drop the bitwise op, re-wrapping in the original extract_vector_elt if we
  // peeked through one above.
  auto ReplaceSrc = [&]() -> SDValue {
    if (Src->getOpcode() != ISD::EXTRACT_VECTOR_ELT)
      return Src.getOperand(i: 0);

    SDValue LHS = PeekSrc->getOperand(Num: 0);
    SDValue Index = Src->getOperand(Num: 1);
    return CurDAG->getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL: SDLoc(Src),
                           VT: Src.getValueType(), N1: LHS, N2: Index);
  };

  // Recognise Srcmods:
  // (xor a, 0x80000000) or v2i32 (xor a, {0x80000000,0x80000000}) as NEG.
  // (and a, 0x7fffffff) or v2i32 (and a, {0x7fffffff,0x7fffffff}) as ABS.
  // (or a, 0x80000000) or v2i32 (or a, {0x80000000,0x80000000}) as NEG+ABS
  // SrcModifiers.
  if (Opc == ISD::XOR && CRHS->getAPIntValue().isSignMask()) {
    Mods |= SISrcMods::NEG;
    Src = ReplaceSrc();
  } else if (Opc == ISD::AND && AllowAbs &&
             CRHS->getAPIntValue().isMaxSignedValue()) {
    Mods |= SISrcMods::ABS;
    Src = ReplaceSrc();
  } else if (Opc == ISD::OR && AllowAbs && CRHS->getAPIntValue().isSignMask()) {
    Mods |= SISrcMods::ABS | SISrcMods::NEG;
    Src = ReplaceSrc();
  }

  return true;
}
3473
3474bool AMDGPUDAGToDAGISel::SelectVOP3Mods(SDValue In, SDValue &Src,
3475 SDValue &SrcMods) const {
3476 unsigned Mods;
3477 if (SelectVOP3ModsImpl(In, Src, Mods, /*IsCanonicalizing=*/true,
3478 /*AllowAbs=*/true)) {
3479 SrcMods = CurDAG->getTargetConstant(Val: Mods, DL: SDLoc(In), VT: MVT::i32);
3480 return true;
3481 }
3482
3483 return false;
3484}
3485
3486bool AMDGPUDAGToDAGISel::SelectVOP3ModsNonCanonicalizing(
3487 SDValue In, SDValue &Src, SDValue &SrcMods) const {
3488 unsigned Mods;
3489 if (SelectVOP3ModsImpl(In, Src, Mods, /*IsCanonicalizing=*/false,
3490 /*AllowAbs=*/true)) {
3491 SrcMods = CurDAG->getTargetConstant(Val: Mods, DL: SDLoc(In), VT: MVT::i32);
3492 return true;
3493 }
3494
3495 return false;
3496}
3497
3498bool AMDGPUDAGToDAGISel::SelectVOP3BMods(SDValue In, SDValue &Src,
3499 SDValue &SrcMods) const {
3500 unsigned Mods;
3501 if (SelectVOP3ModsImpl(In, Src, Mods,
3502 /*IsCanonicalizing=*/true,
3503 /*AllowAbs=*/false)) {
3504 SrcMods = CurDAG->getTargetConstant(Val: Mods, DL: SDLoc(In), VT: MVT::i32);
3505 return true;
3506 }
3507
3508 return false;
3509}
3510
3511bool AMDGPUDAGToDAGISel::SelectVOP3NoMods(SDValue In, SDValue &Src) const {
3512 if (In.getOpcode() == ISD::FABS || In.getOpcode() == ISD::FNEG)
3513 return false;
3514
3515 Src = In;
3516 return true;
3517}
3518
3519bool AMDGPUDAGToDAGISel::SelectVINTERPModsImpl(SDValue In, SDValue &Src,
3520 SDValue &SrcMods,
3521 bool OpSel) const {
3522 unsigned Mods;
3523 if (SelectVOP3ModsImpl(In, Src, Mods,
3524 /*IsCanonicalizing=*/true,
3525 /*AllowAbs=*/false)) {
3526 if (OpSel)
3527 Mods |= SISrcMods::OP_SEL_0;
3528 SrcMods = CurDAG->getTargetConstant(Val: Mods, DL: SDLoc(In), VT: MVT::i32);
3529 return true;
3530 }
3531
3532 return false;
3533}
3534
3535bool AMDGPUDAGToDAGISel::SelectVINTERPMods(SDValue In, SDValue &Src,
3536 SDValue &SrcMods) const {
3537 return SelectVINTERPModsImpl(In, Src, SrcMods, /* OpSel */ false);
3538}
3539
3540bool AMDGPUDAGToDAGISel::SelectVINTERPModsHi(SDValue In, SDValue &Src,
3541 SDValue &SrcMods) const {
3542 return SelectVINTERPModsImpl(In, Src, SrcMods, /* OpSel */ true);
3543}
3544
3545bool AMDGPUDAGToDAGISel::SelectVOP3Mods0(SDValue In, SDValue &Src,
3546 SDValue &SrcMods, SDValue &Clamp,
3547 SDValue &Omod) const {
3548 SDLoc DL(In);
3549 Clamp = CurDAG->getTargetConstant(Val: 0, DL, VT: MVT::i1);
3550 Omod = CurDAG->getTargetConstant(Val: 0, DL, VT: MVT::i1);
3551
3552 return SelectVOP3Mods(In, Src, SrcMods);
3553}
3554
3555bool AMDGPUDAGToDAGISel::SelectVOP3BMods0(SDValue In, SDValue &Src,
3556 SDValue &SrcMods, SDValue &Clamp,
3557 SDValue &Omod) const {
3558 SDLoc DL(In);
3559 Clamp = CurDAG->getTargetConstant(Val: 0, DL, VT: MVT::i1);
3560 Omod = CurDAG->getTargetConstant(Val: 0, DL, VT: MVT::i1);
3561
3562 return SelectVOP3BMods(In, Src, SrcMods);
3563}
3564
3565bool AMDGPUDAGToDAGISel::SelectVOP3OMods(SDValue In, SDValue &Src,
3566 SDValue &Clamp, SDValue &Omod) const {
3567 Src = In;
3568
3569 SDLoc DL(In);
3570 Clamp = CurDAG->getTargetConstant(Val: 0, DL, VT: MVT::i1);
3571 Omod = CurDAG->getTargetConstant(Val: 0, DL, VT: MVT::i1);
3572
3573 return true;
3574}
3575
// Match VOP3P (packed) source modifiers for \p In: whole-vector fneg,
// per-half negates (NEG/NEG_HI) and op_sel bits derived from build_vector or
// vector_shuffle sources, and scalar/splat-immediate simplifications.
// Always returns true; the fallback sets only OP_SEL_1.
bool AMDGPUDAGToDAGISel::SelectVOP3PMods(SDValue In, SDValue &Src,
                                         SDValue &SrcMods, bool IsDOT) const {
  unsigned Mods = SISrcMods::NONE;
  Src = In;

  // TODO: Handle G_FSUB 0 as fneg
  if (Src.getOpcode() == ISD::FNEG) {
    // Whole-vector negate toggles both halves.
    Mods ^= (SISrcMods::NEG | SISrcMods::NEG_HI);
    Src = Src.getOperand(i: 0);
  }

  if (Src.getOpcode() == ISD::BUILD_VECTOR && Src.getNumOperands() == 2 &&
      (!IsDOT || !Subtarget->hasDOTOpSelHazard())) {
    unsigned VecMods = Mods;

    SDValue Lo = stripBitcast(Val: Src.getOperand(i: 0));
    SDValue Hi = stripBitcast(Val: Src.getOperand(i: 1));

    if (Lo.getOpcode() == ISD::FNEG) {
      Lo = stripBitcast(Val: Lo.getOperand(i: 0));
      Mods ^= SISrcMods::NEG;
    }

    if (Hi.getOpcode() == ISD::FNEG) {
      Hi = stripBitcast(Val: Hi.getOperand(i: 0));
      Mods ^= SISrcMods::NEG_HI;
    }

    // Halves sourced from the high 16 bits of a dword become op_sel bits.
    if (isExtractHiElt(In: Lo, Out&: Lo))
      Mods |= SISrcMods::OP_SEL_0;

    if (isExtractHiElt(In: Hi, Out&: Hi))
      Mods |= SISrcMods::OP_SEL_1;

    unsigned VecSize = Src.getValueSizeInBits();
    Lo = stripExtractLoElt(In: Lo);
    Hi = stripExtractLoElt(In: Hi);

    // If stripping extracts exposed a wider register, narrow it back down
    // with a subregister copy.
    if (Lo.getValueSizeInBits() > VecSize) {
      Lo = CurDAG->getTargetExtractSubreg(
          SRIdx: (VecSize > 32) ? AMDGPU::sub0_sub1 : AMDGPU::sub0, DL: SDLoc(In),
          VT: MVT::getIntegerVT(BitWidth: VecSize), Operand: Lo);
    }

    if (Hi.getValueSizeInBits() > VecSize) {
      Hi = CurDAG->getTargetExtractSubreg(
          SRIdx: (VecSize > 32) ? AMDGPU::sub0_sub1 : AMDGPU::sub0, DL: SDLoc(In),
          VT: MVT::getIntegerVT(BitWidth: VecSize), Operand: Hi);
    }

    assert(Lo.getValueSizeInBits() <= VecSize &&
           Hi.getValueSizeInBits() <= VecSize);

    if (Lo == Hi && !isInlineImmediate(N: Lo.getNode())) {
      // Really a scalar input. Just select from the low half of the register to
      // avoid packing.

      if (VecSize == Lo.getValueSizeInBits()) {
        Src = Lo;
      } else if (VecSize == 32) {
        Src = createVOP3PSrc32FromLo16(Lo, Src, CurDAG, Subtarget);
      } else {
        assert(Lo.getValueSizeInBits() == 32 && VecSize == 64);

        // Widen the 32-bit scalar to 64 bits with an undef high half.
        SDLoc SL(In);
        SDValue Undef = SDValue(
            CurDAG->getMachineNode(Opcode: TargetOpcode::IMPLICIT_DEF, dl: SL,
                                   VT: Lo.getValueType()), 0);
        auto RC = Lo->isDivergent() ? AMDGPU::VReg_64RegClassID
                                    : AMDGPU::SReg_64RegClassID;
        const SDValue Ops[] = {
          CurDAG->getTargetConstant(Val: RC, DL: SL, VT: MVT::i32),
          Lo, CurDAG->getTargetConstant(Val: AMDGPU::sub0, DL: SL, VT: MVT::i32),
          Undef, CurDAG->getTargetConstant(Val: AMDGPU::sub1, DL: SL, VT: MVT::i32) };

        Src = SDValue(CurDAG->getMachineNode(Opcode: TargetOpcode::REG_SEQUENCE, dl: SL,
                                             VT: Src.getValueType(), Ops), 0);
      }
      SrcMods = CurDAG->getTargetConstant(Val: Mods, DL: SDLoc(In), VT: MVT::i32);
      return true;
    }

    if (VecSize == 64 && Lo == Hi && isa<ConstantFPSDNode>(Val: Lo)) {
      // Splat of a 32-bit FP constant: fold to an immediate when inlinable.
      uint64_t Lit = cast<ConstantFPSDNode>(Val&: Lo)->getValueAPF()
                      .bitcastToAPInt().getZExtValue();
      if (AMDGPU::isInlinableLiteral32(Literal: Lit, HasInv2Pi: Subtarget->hasInv2PiInlineImm())) {
        Src = CurDAG->getTargetConstant(Val: Lit, DL: SDLoc(In), VT: MVT::i64);
        SrcMods = CurDAG->getTargetConstant(Val: Mods, DL: SDLoc(In), VT: MVT::i32);
        return true;
      }
    }

    // build_vector folding failed; restore the pre-vector mods.
    Mods = VecMods;
  } else if (Src.getOpcode() == ISD::VECTOR_SHUFFLE &&
             Src.getNumOperands() == 2) {

    // TODO: We should repeat the build_vector source check above for the
    // vector_shuffle for negates and casts of individual elements.

    auto *SVN = cast<ShuffleVectorSDNode>(Val&: Src);
    ArrayRef<int> Mask = SVN->getMask();

    if (Mask[0] < 2 && Mask[1] < 2) {
      // src1 should be undef.
      SDValue ShuffleSrc = SVN->getOperand(Num: 0);

      if (ShuffleSrc.getOpcode() == ISD::FNEG) {
        ShuffleSrc = ShuffleSrc.getOperand(i: 0);
        Mods ^= (SISrcMods::NEG | SISrcMods::NEG_HI);
      }

      // Shuffle lane selection maps straight onto op_sel bits.
      if (Mask[0] == 1)
        Mods |= SISrcMods::OP_SEL_0;
      if (Mask[1] == 1)
        Mods |= SISrcMods::OP_SEL_1;

      Src = ShuffleSrc;
      SrcMods = CurDAG->getTargetConstant(Val: Mods, DL: SDLoc(In), VT: MVT::i32);
      return true;
    }
  }

  // Packed instructions do not have abs modifiers.
  Mods |= SISrcMods::OP_SEL_1;

  SrcMods = CurDAG->getTargetConstant(Val: Mods, DL: SDLoc(In), VT: MVT::i32);
  return true;
}
3704
3705bool AMDGPUDAGToDAGISel::SelectVOP3PModsDOT(SDValue In, SDValue &Src,
3706 SDValue &SrcMods) const {
3707 return SelectVOP3PMods(In, Src, SrcMods, IsDOT: true);
3708}
3709
3710bool AMDGPUDAGToDAGISel::SelectVOP3PNoModsDOT(SDValue In, SDValue &Src) const {
3711 SDValue SrcTmp, SrcModsTmp;
3712 SelectVOP3PMods(In, Src&: SrcTmp, SrcMods&: SrcModsTmp, IsDOT: true);
3713 if (cast<ConstantSDNode>(Val&: SrcModsTmp)->getZExtValue() == SISrcMods::OP_SEL_1) {
3714 Src = SrcTmp;
3715 return true;
3716 }
3717
3718 return false;
3719}
3720
3721bool AMDGPUDAGToDAGISel::SelectVOP3PModsF32(SDValue In, SDValue &Src,
3722 SDValue &SrcMods) const {
3723 SelectVOP3Mods(In, Src, SrcMods);
3724 unsigned Mods = SISrcMods::OP_SEL_1;
3725 Mods |= cast<ConstantSDNode>(Val&: SrcMods)->getZExtValue();
3726 SrcMods = CurDAG->getTargetConstant(Val: Mods, DL: SDLoc(In), VT: MVT::i32);
3727 return true;
3728}
3729
3730bool AMDGPUDAGToDAGISel::SelectVOP3PNoModsF32(SDValue In, SDValue &Src) const {
3731 SDValue SrcTmp, SrcModsTmp;
3732 SelectVOP3PModsF32(In, Src&: SrcTmp, SrcMods&: SrcModsTmp);
3733 if (cast<ConstantSDNode>(Val&: SrcModsTmp)->getZExtValue() == SISrcMods::OP_SEL_1) {
3734 Src = SrcTmp;
3735 return true;
3736 }
3737
3738 return false;
3739}
3740
3741bool AMDGPUDAGToDAGISel::SelectWMMAOpSelVOP3PMods(SDValue In,
3742 SDValue &Src) const {
3743 const ConstantSDNode *C = cast<ConstantSDNode>(Val&: In);
3744 assert(C->getAPIntValue().getBitWidth() == 1 && "expected i1 value");
3745
3746 unsigned Mods = SISrcMods::OP_SEL_1;
3747 unsigned SrcVal = C->getZExtValue();
3748 if (SrcVal == 1)
3749 Mods |= SISrcMods::OP_SEL_0;
3750
3751 Src = CurDAG->getTargetConstant(Val: Mods, DL: SDLoc(In), VT: MVT::i32);
3752 return true;
3753}
3754
3755MachineSDNode *
3756AMDGPUDAGToDAGISel::buildRegSequence32(SmallVectorImpl<SDValue> &Elts,
3757 const SDLoc &DL) const {
3758 unsigned DstRegClass;
3759 EVT DstTy;
3760 switch (Elts.size()) {
3761 case 8:
3762 DstRegClass = AMDGPU::VReg_256RegClassID;
3763 DstTy = MVT::v8i32;
3764 break;
3765 case 4:
3766 DstRegClass = AMDGPU::VReg_128RegClassID;
3767 DstTy = MVT::v4i32;
3768 break;
3769 case 2:
3770 DstRegClass = AMDGPU::VReg_64RegClassID;
3771 DstTy = MVT::v2i32;
3772 break;
3773 default:
3774 llvm_unreachable("unhandled Reg sequence size");
3775 }
3776
3777 SmallVector<SDValue, 17> Ops;
3778 Ops.push_back(Elt: CurDAG->getTargetConstant(Val: DstRegClass, DL, VT: MVT::i32));
3779 for (unsigned i = 0; i < Elts.size(); ++i) {
3780 Ops.push_back(Elt: Elts[i]);
3781 Ops.push_back(Elt: CurDAG->getTargetConstant(
3782 Val: SIRegisterInfo::getSubRegFromChannel(Channel: i), DL, VT: MVT::i32));
3783 }
3784 return CurDAG->getMachineNode(Opcode: TargetOpcode::REG_SEQUENCE, dl: DL, VT: DstTy, Ops);
3785}
3786
// Build a 32-bit REG_SEQUENCE from a list of 16-bit elements by first packing
// adjacent element pairs into 32-bit registers, reusing an existing 32-bit
// source when both halves already come from it.
MachineSDNode *
AMDGPUDAGToDAGISel::buildRegSequence16(SmallVectorImpl<SDValue> &Elts,
                                       const SDLoc &DL) const {
  SmallVector<SDValue, 8> PackedElts;
  assert("unhandled Reg sequence size" &&
         (Elts.size() == 8 || Elts.size() == 16));

  // Pack 16-bit elements in pairs into 32-bit register. If both elements are
  // unpacked from 32-bit source use it, otherwise pack them using v_perm.
  for (unsigned i = 0; i < Elts.size(); i += 2) {
    SDValue LoSrc = stripExtractLoElt(In: stripBitcast(Val: Elts[i]));
    SDValue HiSrc;
    if (isExtractHiElt(In: Elts[i + 1], Out&: HiSrc) && LoSrc == HiSrc) {
      // Both halves originate from the same dword; use it directly.
      PackedElts.push_back(Elt: HiSrc);
    } else {
      if (Subtarget->useRealTrue16Insts()) {
        // FIXME-TRUE16. For now pack VGPR_32 for 16-bit source before
        // passing to v_perm_b32. Eventually we should use replace v_perm_b32
        // by reg_sequence.
        SDValue Undef = SDValue(
            CurDAG->getMachineNode(Opcode: TargetOpcode::IMPLICIT_DEF, dl: DL, VT: MVT::i16),
            0);
        Elts[i] =
            emitRegSequence(CurDAG&: *CurDAG, DstRegClass: AMDGPU::VGPR_32RegClassID, DstTy: MVT::i32,
                            Elts: {Elts[i], Undef}, SubRegClass: {AMDGPU::lo16, AMDGPU::hi16}, DL);
        Elts[i + 1] = emitRegSequence(CurDAG&: *CurDAG, DstRegClass: AMDGPU::VGPR_32RegClassID,
                                      DstTy: MVT::i32, Elts: {Elts[i + 1], Undef},
                                      SubRegClass: {AMDGPU::lo16, AMDGPU::hi16}, DL);
      }
      // v_perm selector packing the low halves of the two sources.
      SDValue PackLoLo = CurDAG->getTargetConstant(Val: 0x05040100, DL, VT: MVT::i32);
      MachineSDNode *Packed =
          CurDAG->getMachineNode(Opcode: AMDGPU::V_PERM_B32_e64, dl: DL, VT: MVT::i32,
                                 Ops: {Elts[i + 1], Elts[i], PackLoLo});
      PackedElts.push_back(Elt: SDValue(Packed, 0));
    }
  }
  return buildRegSequence32(Elts&: PackedElts, DL);
}
3825
3826MachineSDNode *
3827AMDGPUDAGToDAGISel::buildRegSequence(SmallVectorImpl<SDValue> &Elts,
3828 const SDLoc &DL,
3829 unsigned ElementSize) const {
3830 if (ElementSize == 16)
3831 return buildRegSequence16(Elts, DL);
3832 if (ElementSize == 32)
3833 return buildRegSequence32(Elts, DL);
3834 llvm_unreachable("Unhandled element size");
3835}
3836
// Materialize a WMMA source whose elements all carry the modifier
// \p ModOpcode (ISD::FNEG or ISD::FABS) into a REG_SEQUENCE, accumulating the
// corresponding mod bits into \p Mods. For FNEG, additionally checks whether
// every element wraps an inner FABS so both bits can be set at once.
void AMDGPUDAGToDAGISel::selectWMMAModsNegAbs(unsigned ModOpcode,
                                              unsigned &Mods,
                                              SmallVectorImpl<SDValue> &Elts,
                                              SDValue &Src, const SDLoc &DL,
                                              unsigned ElementSize) const {
  if (ModOpcode == ISD::FNEG) {
    Mods |= SISrcMods::NEG;
    // Check if all elements also have abs modifier
    SmallVector<SDValue, 8> NegAbsElts;
    for (auto El : Elts) {
      if (El.getOpcode() != ISD::FABS)
        break;
      NegAbsElts.push_back(Elt: El->getOperand(Num: 0));
    }
    if (Elts.size() != NegAbsElts.size()) {
      // Neg
      Src = SDValue(buildRegSequence(Elts, DL, ElementSize), 0);
    } else {
      // Neg and Abs
      Mods |= SISrcMods::NEG_HI;
      Src = SDValue(buildRegSequence(Elts&: NegAbsElts, DL, ElementSize), 0);
    }
  } else {
    assert(ModOpcode == ISD::FABS);
    // Abs
    Mods |= SISrcMods::NEG_HI;
    Src = SDValue(buildRegSequence(Elts, DL, ElementSize), 0);
  }
}
3866
3867// Check all f16 elements for modifiers while looking through b32 and v2b16
3868// build vector, stop if element does not satisfy ModifierCheck.
3869static void
3870checkWMMAElementsModifiersF16(BuildVectorSDNode *BV,
3871 std::function<bool(SDValue)> ModifierCheck) {
3872 for (unsigned i = 0; i < BV->getNumOperands(); ++i) {
3873 if (auto *F16Pair =
3874 dyn_cast<BuildVectorSDNode>(Val: stripBitcast(Val: BV->getOperand(Num: i)))) {
3875 for (unsigned i = 0; i < F16Pair->getNumOperands(); ++i) {
3876 SDValue ElF16 = stripBitcast(Val: F16Pair->getOperand(Num: i));
3877 if (!ModifierCheck(ElF16))
3878 break;
3879 }
3880 }
3881 }
3882}
3883
// Match a WMMA f16 source where every f16 element (first path) or every v2f16
// element (second path) is negated; strip the fnegs, rebuild the source as a
// REG_SEQUENCE and set NEG/NEG_HI. Always returns true (falls back to the
// plain source with only OP_SEL_1 set).
bool AMDGPUDAGToDAGISel::SelectWMMAModsF16Neg(SDValue In, SDValue &Src,
                                              SDValue &SrcMods) const {
  Src = In;
  unsigned Mods = SISrcMods::OP_SEL_1;

  // mods are on f16 elements
  if (auto *BV = dyn_cast<BuildVectorSDNode>(Val: stripBitcast(Val: In))) {
    SmallVector<SDValue, 8> EltsF16;

    checkWMMAElementsModifiersF16(BV, ModifierCheck: [&](SDValue Element) -> bool {
      if (Element.getOpcode() != ISD::FNEG)
        return false;
      EltsF16.push_back(Elt: Element.getOperand(i: 0));
      return true;
    });

    // All elements have neg modifier
    if (BV->getNumOperands() * 2 == EltsF16.size()) {
      Src = SDValue(buildRegSequence16(Elts&: EltsF16, DL: SDLoc(In)), 0);
      Mods |= SISrcMods::NEG;
      Mods |= SISrcMods::NEG_HI;
    }
  }

  // mods are on v2f16 elements
  if (auto *BV = dyn_cast<BuildVectorSDNode>(Val: stripBitcast(Val: In))) {
    SmallVector<SDValue, 8> EltsV2F16;
    for (unsigned i = 0; i < BV->getNumOperands(); ++i) {
      SDValue ElV2f16 = stripBitcast(Val: BV->getOperand(Num: i));
      // Based on first element decide which mod we match, neg or abs
      if (ElV2f16.getOpcode() != ISD::FNEG)
        break;
      EltsV2F16.push_back(Elt: ElV2f16.getOperand(i: 0));
    }

    // All pairs of elements have neg modifier
    if (BV->getNumOperands() == EltsV2F16.size()) {
      Src = SDValue(buildRegSequence32(Elts&: EltsV2F16, DL: SDLoc(In)), 0);
      Mods |= SISrcMods::NEG;
      Mods |= SISrcMods::NEG_HI;
    }
  }

  SrcMods = CurDAG->getTargetConstant(Val: Mods, DL: SDLoc(In), VT: MVT::i32);
  return true;
}
3930
// Match a WMMA f16 source where every element carries the same modifier (all
// fneg or all fabs), at either f16 or v2f16 granularity, and fold it into
// NEG/NEG_HI bits via selectWMMAModsNegAbs. Always returns true.
// NOTE(review): ModOpcode is assigned only once a first element is inspected;
// an operand-less build_vector would leave it uninitialized — presumably that
// cannot occur here, but worth confirming.
bool AMDGPUDAGToDAGISel::SelectWMMAModsF16NegAbs(SDValue In, SDValue &Src,
                                                 SDValue &SrcMods) const {
  Src = In;
  unsigned Mods = SISrcMods::OP_SEL_1;
  unsigned ModOpcode;

  // mods are on f16 elements
  if (auto *BV = dyn_cast<BuildVectorSDNode>(Val: stripBitcast(Val: In))) {
    SmallVector<SDValue, 8> EltsF16;
    checkWMMAElementsModifiersF16(BV, ModifierCheck: [&](SDValue ElF16) -> bool {
      // Based on first element decide which mod we match, neg or abs
      if (EltsF16.empty())
        ModOpcode = (ElF16.getOpcode() == ISD::FNEG) ? ISD::FNEG : ISD::FABS;
      if (ElF16.getOpcode() != ModOpcode)
        return false;
      EltsF16.push_back(Elt: ElF16.getOperand(i: 0));
      return true;
    });

    // All elements have ModOpcode modifier
    if (BV->getNumOperands() * 2 == EltsF16.size())
      selectWMMAModsNegAbs(ModOpcode, Mods, Elts&: EltsF16, Src, DL: SDLoc(In), ElementSize: 16);
  }

  // mods are on v2f16 elements
  if (auto *BV = dyn_cast<BuildVectorSDNode>(Val: stripBitcast(Val: In))) {
    SmallVector<SDValue, 8> EltsV2F16;

    for (unsigned i = 0; i < BV->getNumOperands(); ++i) {
      SDValue ElV2f16 = stripBitcast(Val: BV->getOperand(Num: i));
      // Based on first element decide which mod we match, neg or abs
      if (EltsV2F16.empty())
        ModOpcode = (ElV2f16.getOpcode() == ISD::FNEG) ? ISD::FNEG : ISD::FABS;
      if (ElV2f16->getOpcode() != ModOpcode)
        break;
      EltsV2F16.push_back(Elt: ElV2f16->getOperand(Num: 0));
    }

    // All elements have ModOpcode modifier
    if (BV->getNumOperands() == EltsV2F16.size())
      selectWMMAModsNegAbs(ModOpcode, Mods, Elts&: EltsV2F16, Src, DL: SDLoc(In), ElementSize: 32);
  }

  SrcMods = CurDAG->getTargetConstant(Val: Mods, DL: SDLoc(In), VT: MVT::i32);
  return true;
}
3977
// Match a WMMA f32 source where every f32 element carries the same modifier
// (all fneg or all fabs) and fold it into NEG/NEG_HI bits via
// selectWMMAModsNegAbs. Always returns true (falls back to the plain source
// with only OP_SEL_1 set).
bool AMDGPUDAGToDAGISel::SelectWMMAModsF32NegAbs(SDValue In, SDValue &Src,
                                                 SDValue &SrcMods) const {
  Src = In;
  unsigned Mods = SISrcMods::OP_SEL_1;
  SmallVector<SDValue, 8> EltsF32;

  if (auto *BV = dyn_cast<BuildVectorSDNode>(Val: stripBitcast(Val: In))) {
    assert(BV->getNumOperands() > 0);
    // Based on first element decide which mod we match, neg or abs
    SDValue ElF32 = stripBitcast(Val: BV->getOperand(Num: 0));
    unsigned ModOpcode =
        (ElF32.getOpcode() == ISD::FNEG) ? ISD::FNEG : ISD::FABS;
    for (unsigned i = 0; i < BV->getNumOperands(); ++i) {
      SDValue ElF32 = stripBitcast(Val: BV->getOperand(Num: i));
      if (ElF32.getOpcode() != ModOpcode)
        break;
      EltsF32.push_back(Elt: ElF32.getOperand(i: 0));
    }

    // All elements had ModOpcode modifier
    if (BV->getNumOperands() == EltsF32.size())
      selectWMMAModsNegAbs(ModOpcode, Mods, Elts&: EltsF32, Src, DL: SDLoc(In), ElementSize: 32);
  }

  SrcMods = CurDAG->getTargetConstant(Val: Mods, DL: SDLoc(In), VT: MVT::i32);
  return true;
}
4005
// Try to select a WMMA scalar-immediate source: either a splat build_vector
// of an inline immediate (selected as a 32-bit target constant), or a nested
// 16-bit splat whose value is an inline constant for f16/bf16/i16 (selected
// as a 16-bit target constant). Returns false when no such splat is found.
bool AMDGPUDAGToDAGISel::SelectWMMAVISrc(SDValue In, SDValue &Src) const {
  if (auto *BV = dyn_cast<BuildVectorSDNode>(Val&: In)) {
    BitVector UndefElements;
    if (SDValue Splat = BV->getSplatValue(UndefElements: &UndefElements))
      if (isInlineImmediate(N: Splat.getNode())) {
        if (const ConstantSDNode *C = dyn_cast<ConstantSDNode>(Val&: Splat)) {
          unsigned Imm = C->getAPIntValue().getSExtValue();
          Src = CurDAG->getTargetConstant(Val: Imm, DL: SDLoc(In), VT: MVT::i32);
          return true;
        }
        if (const ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(Val&: Splat)) {
          unsigned Imm = C->getValueAPF().bitcastToAPInt().getSExtValue();
          Src = CurDAG->getTargetConstant(Val: Imm, DL: SDLoc(In), VT: MVT::i32);
          return true;
        }
        llvm_unreachable("unhandled Constant node");
      }
  }

  // 16 bit splat
  SDValue SplatSrc32 = stripBitcast(Val: In);
  if (auto *SplatSrc32BV = dyn_cast<BuildVectorSDNode>(Val&: SplatSrc32))
    if (SDValue Splat32 = SplatSrc32BV->getSplatValue()) {
      SDValue SplatSrc16 = stripBitcast(Val: Splat32);
      if (auto *SplatSrc16BV = dyn_cast<BuildVectorSDNode>(Val&: SplatSrc16))
        if (SDValue Splat = SplatSrc16BV->getSplatValue()) {
          const SIInstrInfo *TII = Subtarget->getInstrInfo();
          // Read the raw bits of either an FP or integer splat constant.
          std::optional<APInt> RawValue;
          if (const ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(Val&: Splat))
            RawValue = C->getValueAPF().bitcastToAPInt();
          else if (const ConstantSDNode *C = dyn_cast<ConstantSDNode>(Val&: Splat))
            RawValue = C->getAPIntValue();

          if (RawValue.has_value()) {
            EVT VT = In.getValueType().getScalarType();
            if (VT.getSimpleVT() == MVT::f16 || VT.getSimpleVT() == MVT::bf16) {
              // Reinterpret the bits in the element's FP semantics to test
              // inline-constant eligibility.
              APFloat FloatVal(VT.getSimpleVT() == MVT::f16
                                   ? APFloatBase::IEEEhalf()
                                   : APFloatBase::BFloat(),
                               RawValue.value());
              if (TII->isInlineConstant(Imm: FloatVal)) {
                Src = CurDAG->getTargetConstant(Val: RawValue.value(), DL: SDLoc(In),
                                                VT: MVT::i16);
                return true;
              }
            } else if (VT.getSimpleVT() == MVT::i16) {
              if (TII->isInlineConstant(Imm: RawValue.value())) {
                Src = CurDAG->getTargetConstant(Val: RawValue.value(), DL: SDLoc(In),
                                                VT: MVT::i16);
                return true;
              }
            } else
              llvm_unreachable("unknown 16-bit type");
          }
        }
    }

  return false;
}
4065
4066bool AMDGPUDAGToDAGISel::SelectSWMMACIndex8(SDValue In, SDValue &Src,
4067 SDValue &IndexKey) const {
4068 unsigned Key = 0;
4069 Src = In;
4070
4071 if (In.getOpcode() == ISD::SRL) {
4072 const llvm::SDValue &ShiftSrc = In.getOperand(i: 0);
4073 ConstantSDNode *ShiftAmt = dyn_cast<ConstantSDNode>(Val: In.getOperand(i: 1));
4074 if (ShiftSrc.getValueType().getSizeInBits() == 32 && ShiftAmt &&
4075 ShiftAmt->getZExtValue() % 8 == 0) {
4076 Key = ShiftAmt->getZExtValue() / 8;
4077 Src = ShiftSrc;
4078 }
4079 }
4080
4081 IndexKey = CurDAG->getTargetConstant(Val: Key, DL: SDLoc(In), VT: MVT::i32);
4082 return true;
4083}
4084
4085bool AMDGPUDAGToDAGISel::SelectSWMMACIndex16(SDValue In, SDValue &Src,
4086 SDValue &IndexKey) const {
4087 unsigned Key = 0;
4088 Src = In;
4089
4090 if (In.getOpcode() == ISD::SRL) {
4091 const llvm::SDValue &ShiftSrc = In.getOperand(i: 0);
4092 ConstantSDNode *ShiftAmt = dyn_cast<ConstantSDNode>(Val: In.getOperand(i: 1));
4093 if (ShiftSrc.getValueType().getSizeInBits() == 32 && ShiftAmt &&
4094 ShiftAmt->getZExtValue() == 16) {
4095 Key = 1;
4096 Src = ShiftSrc;
4097 }
4098 }
4099
4100 IndexKey = CurDAG->getTargetConstant(Val: Key, DL: SDLoc(In), VT: MVT::i32);
4101 return true;
4102}
4103
// Match index_key = 1 for a 32-bit SWMMAC index: the index is lane 1 of a
// 64-bit vector, reached either through a zext/anyext of a 32-bit value or
// through a bitcast of (build_vector x, 0). Always returns true; falls back
// to index_key = 0 with the input unchanged.
bool AMDGPUDAGToDAGISel::SelectSWMMACIndex32(SDValue In, SDValue &Src,
                                             SDValue &IndexKey) const {
  unsigned Key = 0;
  Src = In;

  // The 32-bit value feeding the index, once the extension wrapper is peeled.
  SDValue InI32;

  if (In.getOpcode() == ISD::ANY_EXTEND || In.getOpcode() == ISD::ZERO_EXTEND) {
    const SDValue &ExtendSrc = In.getOperand(i: 0);
    if (ExtendSrc.getValueSizeInBits() == 32)
      InI32 = ExtendSrc;
  } else if (In->getOpcode() == ISD::BITCAST) {
    // (bitcast (build_vector x, 0)) is an equivalent zero-extension form.
    const SDValue &CastSrc = In.getOperand(i: 0);
    if (CastSrc.getOpcode() == ISD::BUILD_VECTOR &&
        CastSrc.getOperand(i: 0).getValueSizeInBits() == 32) {
      ConstantSDNode *Zero = dyn_cast<ConstantSDNode>(Val: CastSrc.getOperand(i: 1));
      if (Zero && Zero->getZExtValue() == 0)
        InI32 = CastSrc.getOperand(i: 0);
    }
  }

  if (InI32 && InI32.getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
    // Extracting element 1 of a 64-bit vector selects the high index lane.
    const SDValue &ExtractVecEltSrc = InI32.getOperand(i: 0);
    ConstantSDNode *EltIdx = dyn_cast<ConstantSDNode>(Val: InI32.getOperand(i: 1));
    if (ExtractVecEltSrc.getValueSizeInBits() == 64 && EltIdx &&
        EltIdx->getZExtValue() == 1) {
      Key = 1;
      Src = ExtractVecEltSrc;
    }
  }

  IndexKey = CurDAG->getTargetConstant(Val: Key, DL: SDLoc(In), VT: MVT::i32);
  return true;
}
4138
4139bool AMDGPUDAGToDAGISel::SelectVOP3OpSel(SDValue In, SDValue &Src,
4140 SDValue &SrcMods) const {
4141 Src = In;
4142 // FIXME: Handle op_sel
4143 SrcMods = CurDAG->getTargetConstant(Val: 0, DL: SDLoc(In), VT: MVT::i32);
4144 return true;
4145}
4146
4147bool AMDGPUDAGToDAGISel::SelectVOP3OpSelMods(SDValue In, SDValue &Src,
4148 SDValue &SrcMods) const {
4149 // FIXME: Handle op_sel
4150 return SelectVOP3Mods(In, Src, SrcMods);
4151}
4152
4153// Match lowered fpext from bf16 to f32. This is a bit operation extending
4154// a 16-bit value with 16-bit of zeroes at LSB:
4155//
4156// 1. (f32 (bitcast (build_vector (i16 0), (i16 (bitcast bf16:val)))))
4157// 2. (f32 (bitcast (and i32:val, 0xffff0000))) -> IsExtractHigh = true
4158// 3. (f32 (bitcast (shl i32:va, 16) -> IsExtractHigh = false
4159static SDValue matchBF16FPExtendLike(SDValue Op, bool &IsExtractHigh) {
4160 if (Op.getValueType() != MVT::f32 || Op.getOpcode() != ISD::BITCAST)
4161 return SDValue();
4162 Op = Op.getOperand(i: 0);
4163
4164 IsExtractHigh = false;
4165 if (Op.getValueType() == MVT::v2i16 && Op.getOpcode() == ISD::BUILD_VECTOR) {
4166 auto Low16 = dyn_cast<ConstantSDNode>(Val: Op.getOperand(i: 0));
4167 if (!Low16 || !Low16->isZero())
4168 return SDValue();
4169 Op = stripBitcast(Val: Op.getOperand(i: 1));
4170 if (Op.getValueType() != MVT::bf16)
4171 return SDValue();
4172 return Op;
4173 }
4174
4175 if (Op.getValueType() != MVT::i32)
4176 return SDValue();
4177
4178 if (Op.getOpcode() == ISD::AND) {
4179 if (auto Mask = dyn_cast<ConstantSDNode>(Val: Op.getOperand(i: 1))) {
4180 if (Mask->getZExtValue() == 0xffff0000) {
4181 IsExtractHigh = true;
4182 return Op.getOperand(i: 0);
4183 }
4184 }
4185 return SDValue();
4186 }
4187
4188 if (Op.getOpcode() == ISD::SHL) {
4189 if (auto Amt = dyn_cast<ConstantSDNode>(Val: Op.getOperand(i: 1))) {
4190 if (Amt->getZExtValue() == 16)
4191 return Op.getOperand(i: 0);
4192 }
4193 }
4194
4195 return SDValue();
4196}
4197
// The return value is not whether the match is possible (which it always is),
// but whether or not it a conversion is really used.
// Folds an f16/bf16 -> f32 extension (plus neg/abs modifiers) into a mad-mix
// source: \p Src gets the narrow source, \p Mods the SISrcMods bits, and
// \p VT selects which narrow type (f16 or bf16) is being matched.
bool AMDGPUDAGToDAGISel::SelectVOP3PMadMixModsImpl(SDValue In, SDValue &Src,
                                                   unsigned &Mods,
                                                   MVT VT) const {
  Mods = 0;
  SelectVOP3ModsImpl(In, Src, Mods);

  bool IsExtractHigh = false;
  if (Src.getOpcode() == ISD::FP_EXTEND) {
    Src = Src.getOperand(i: 0);
  } else if (VT == MVT::bf16) {
    // bf16 extends are lowered to bit operations; match those forms instead.
    SDValue B16 = matchBF16FPExtendLike(Op: Src, IsExtractHigh);
    if (!B16)
      return false;
    Src = B16;
  } else
    return false;

  if (Src.getValueType() != VT &&
      (VT != MVT::bf16 || Src.getValueType() != MVT::i32))
    return false;

  Src = stripBitcast(Val: Src);

  // Be careful about folding modifiers if we already have an abs. fneg is
  // applied last, so we don't want to apply an earlier fneg.
  if ((Mods & SISrcMods::ABS) == 0) {
    unsigned ModsTmp;
    SelectVOP3ModsImpl(In: Src, Src, Mods&: ModsTmp);

    // Inner neg toggles (composes with) an outer neg; abs just accumulates.
    if ((ModsTmp & SISrcMods::NEG) != 0)
      Mods ^= SISrcMods::NEG;

    if ((ModsTmp & SISrcMods::ABS) != 0)
      Mods |= SISrcMods::ABS;
  }

  // op_sel/op_sel_hi decide the source type and source.
  // If the source's op_sel_hi is set, it indicates to do a conversion from
  // fp16. If the sources's op_sel is set, it picks the high half of the source
  // register.

  Mods |= SISrcMods::OP_SEL_1;
  if (Src.getValueSizeInBits() == 16) {
    if (isExtractHiElt(In: Src, Out&: Src)) {
      Mods |= SISrcMods::OP_SEL_0;

      // TODO: Should we try to look for neg/abs here?
      return true;
    }

    if (Src.getOpcode() == ISD::TRUNCATE &&
        Src.getOperand(i: 0).getValueType() == MVT::i32) {
      // Truncate from i32: just read the low half of the full register.
      Src = Src.getOperand(i: 0);
      return true;
    }

    if (Subtarget->useRealTrue16Insts())
      // In true16 mode, pack src to a 32bit
      Src = createVOP3PSrc32FromLo16(Lo: Src, Src: In, CurDAG, Subtarget);
  } else if (IsExtractHigh)
    Mods |= SISrcMods::OP_SEL_0;

  return true;
}
4264
4265bool AMDGPUDAGToDAGISel::SelectVOP3PMadMixModsExt(SDValue In, SDValue &Src,
4266 SDValue &SrcMods) const {
4267 unsigned Mods = 0;
4268 if (!SelectVOP3PMadMixModsImpl(In, Src, Mods, VT: MVT::f16))
4269 return false;
4270 SrcMods = CurDAG->getTargetConstant(Val: Mods, DL: SDLoc(In), VT: MVT::i32);
4271 return true;
4272}
4273
4274bool AMDGPUDAGToDAGISel::SelectVOP3PMadMixMods(SDValue In, SDValue &Src,
4275 SDValue &SrcMods) const {
4276 unsigned Mods = 0;
4277 SelectVOP3PMadMixModsImpl(In, Src, Mods, VT: MVT::f16);
4278 SrcMods = CurDAG->getTargetConstant(Val: Mods, DL: SDLoc(In), VT: MVT::i32);
4279 return true;
4280}
4281
4282bool AMDGPUDAGToDAGISel::SelectVOP3PMadMixBF16ModsExt(SDValue In, SDValue &Src,
4283 SDValue &SrcMods) const {
4284 unsigned Mods = 0;
4285 if (!SelectVOP3PMadMixModsImpl(In, Src, Mods, VT: MVT::bf16))
4286 return false;
4287 SrcMods = CurDAG->getTargetConstant(Val: Mods, DL: SDLoc(In), VT: MVT::i32);
4288 return true;
4289}
4290
4291bool AMDGPUDAGToDAGISel::SelectVOP3PMadMixBF16Mods(SDValue In, SDValue &Src,
4292 SDValue &SrcMods) const {
4293 unsigned Mods = 0;
4294 SelectVOP3PMadMixModsImpl(In, Src, Mods, VT: MVT::bf16);
4295 SrcMods = CurDAG->getTargetConstant(Val: Mods, DL: SDLoc(In), VT: MVT::i32);
4296 return true;
4297}
4298
// Match BITOP3 operation and return a number of matched instructions plus
// truth table.
//
// Recursively walks a tree of AND/OR/XOR nodes rooted at \p In, collecting
// at most three distinct leaf operands into \p Src. Returns a pair of
// (count of logic opcodes folded, 8-bit truth table over Src0/Src1/Src2).
// A count of 0 means no match; on that path \p Src is either untouched or
// restored from the backup taken below.
static std::pair<unsigned, uint8_t> BitOp3_Op(SDValue In,
                                              SmallVectorImpl<SDValue> &Src) {
  unsigned NumOpcodes = 0;
  uint8_t LHSBits, RHSBits;

  // Computes in \p Bits the truth-table contribution of \p Op, registering
  // \p Op as a new source operand if a slot in \p Src is still free.
  auto getOperandBits = [&Src, In](SDValue Op, uint8_t &Bits) -> bool {
    // Define truth table given Src0, Src1, Src2 bits permutations:
    // 0 0 0
    // 0 0 1
    // 0 1 0
    // 0 1 1
    // 1 0 0
    // 1 0 1
    // 1 1 0
    // 1 1 1
    const uint8_t SrcBits[3] = { 0xf0, 0xcc, 0xaa };

    // Constant all-ones / all-zeros contribute fixed bits and do not
    // consume a source slot.
    if (auto *C = dyn_cast<ConstantSDNode>(Val&: Op)) {
      if (C->isAllOnes()) {
        Bits = 0xff;
        return true;
      }
      if (C->isZero()) {
        Bits = 0;
        return true;
      }
    }

    for (unsigned I = 0; I < Src.size(); ++I) {
      // Try to find existing reused operand
      if (Src[I] == Op) {
        Bits = SrcBits[I];
        return true;
      }
      // Try to replace parent operator
      if (Src[I] == In) {
        Bits = SrcBits[I];
        Src[I] = Op;
        return true;
      }
    }

    if (Src.size() == 3) {
      // No room left for operands. Try one last time, there can be a 'not' of
      // one of our source operands. In this case we can compute the bits
      // without growing Src vector.
      if (Op.getOpcode() == ISD::XOR) {
        if (auto *C = dyn_cast<ConstantSDNode>(Val: Op.getOperand(i: 1))) {
          if (C->isAllOnes()) {
            SDValue LHS = Op.getOperand(i: 0);
            for (unsigned I = 0; I < Src.size(); ++I) {
              if (Src[I] == LHS) {
                // 'not' of a known source: invert its truth-table column.
                Bits = ~SrcBits[I];
                return true;
              }
            }
          }
        }
      }

      return false;
    }

    // Allocate the next free source slot to this operand.
    Bits = SrcBits[Src.size()];
    Src.push_back(Elt: Op);
    return true;
  };

  switch (In.getOpcode()) {
  case ISD::AND:
  case ISD::OR:
  case ISD::XOR: {
    SDValue LHS = In.getOperand(i: 0);
    SDValue RHS = In.getOperand(i: 1);

    // Snapshot Src so it can be restored if either operand fails to match.
    SmallVector<SDValue, 3> Backup(Src.begin(), Src.end());
    if (!getOperandBits(LHS, LHSBits) ||
        !getOperandBits(RHS, RHSBits)) {
      Src = std::move(Backup);
      return std::make_pair(x: 0, y: 0);
    }

    // Recursion is naturally limited by the size of the operand vector.
    // A successful recursive match folds more opcodes and refines the bits
    // computed for that side above.
    auto Op = BitOp3_Op(In: LHS, Src);
    if (Op.first) {
      NumOpcodes += Op.first;
      LHSBits = Op.second;
    }

    Op = BitOp3_Op(In: RHS, Src);
    if (Op.first) {
      NumOpcodes += Op.first;
      RHSBits = Op.second;
    }
    break;
  }
  default:
    return std::make_pair(x: 0, y: 0);
  }

  // Combine the children's truth tables with this node's boolean operator.
  uint8_t TTbl;
  switch (In.getOpcode()) {
  case ISD::AND:
    TTbl = LHSBits & RHSBits;
    break;
  case ISD::OR:
    TTbl = LHSBits | RHSBits;
    break;
  case ISD::XOR:
    TTbl = LHSBits ^ RHSBits;
    break;
  default:
    break;
  }

  // +1 accounts for the opcode of In itself.
  return std::make_pair(x: NumOpcodes + 1, y&: TTbl);
}
4418
4419bool AMDGPUDAGToDAGISel::SelectBITOP3(SDValue In, SDValue &Src0, SDValue &Src1,
4420 SDValue &Src2, SDValue &Tbl) const {
4421 SmallVector<SDValue, 3> Src;
4422 uint8_t TTbl;
4423 unsigned NumOpcodes;
4424
4425 std::tie(args&: NumOpcodes, args&: TTbl) = BitOp3_Op(In, Src);
4426
4427 // Src.empty() case can happen if all operands are all zero or all ones.
4428 // Normally it shall be optimized out before reaching this.
4429 if (NumOpcodes < 2 || Src.empty())
4430 return false;
4431
4432 // For a uniform case threshold should be higher to account for moves between
4433 // VGPRs and SGPRs. It needs one operand in a VGPR, rest two can be in SGPRs
4434 // and a readtfirstlane after.
4435 if (NumOpcodes < 4 && !In->isDivergent())
4436 return false;
4437
4438 if (NumOpcodes == 2 && In.getValueType() == MVT::i32) {
4439 // Avoid using BITOP3 for OR3, XOR3, AND_OR. This is not faster but makes
4440 // asm more readable. This cannot be modeled with AddedComplexity because
4441 // selector does not know how many operations did we match.
4442 if ((In.getOpcode() == ISD::XOR || In.getOpcode() == ISD::OR) &&
4443 (In.getOperand(i: 0).getOpcode() == In.getOpcode() ||
4444 In.getOperand(i: 1).getOpcode() == In.getOpcode()))
4445 return false;
4446
4447 if (In.getOpcode() == ISD::OR &&
4448 (In.getOperand(i: 0).getOpcode() == ISD::AND ||
4449 In.getOperand(i: 1).getOpcode() == ISD::AND))
4450 return false;
4451 }
4452
4453 // Last operand can be ignored, turning a ternary operation into a binary.
4454 // For example: (~a & b & c) | (~a & b & ~c) -> (~a & b). We can replace
4455 // 'c' with 'a' here without changing the answer. In some pathological
4456 // cases it should be possible to get an operation with a single operand
4457 // too if optimizer would not catch it.
4458 while (Src.size() < 3)
4459 Src.push_back(Elt: Src[0]);
4460
4461 Src0 = Src[0];
4462 Src1 = Src[1];
4463 Src2 = Src[2];
4464
4465 Tbl = CurDAG->getTargetConstant(Val: TTbl, DL: SDLoc(In), VT: MVT::i32);
4466 return true;
4467}
4468
4469SDValue AMDGPUDAGToDAGISel::getHi16Elt(SDValue In) const {
4470 if (In.isUndef())
4471 return CurDAG->getUNDEF(VT: MVT::i32);
4472
4473 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Val&: In)) {
4474 SDLoc SL(In);
4475 return CurDAG->getConstant(Val: C->getZExtValue() << 16, DL: SL, VT: MVT::i32);
4476 }
4477
4478 if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(Val&: In)) {
4479 SDLoc SL(In);
4480 return CurDAG->getConstant(
4481 Val: C->getValueAPF().bitcastToAPInt().getZExtValue() << 16, DL: SL, VT: MVT::i32);
4482 }
4483
4484 SDValue Src;
4485 if (isExtractHiElt(In, Out&: Src))
4486 return Src;
4487
4488 return SDValue();
4489}
4490
// Decide whether immediate node \p N should be materialized in a VGPR.
// Returns true iff at least one of the first (up to 10) uses strictly
// requires a VGPR operand and cannot be commuted into a VS-class
// (SGPR-accepting) slot; returns false as soon as any use demands an SGPR
// or has an unknown register class.
bool AMDGPUDAGToDAGISel::isVGPRImm(const SDNode * N) const {
  assert(CurDAG->getTarget().getTargetTriple().isAMDGCN());

  const SIRegisterInfo *SIRI = Subtarget->getRegisterInfo();
  const SIInstrInfo *SII = Subtarget->getInstrInfo();

  // Scan is capped at 10 uses; hitting the cap conservatively answers
  // false via the (Limit < 10) check at the end.
  unsigned Limit = 0;
  bool AllUsesAcceptSReg = true;
  for (SDNode::use_iterator U = N->use_begin(), E = SDNode::use_end();
       Limit < 10 && U != E; ++U, ++Limit) {
    const TargetRegisterClass *RC =
        getOperandRegClass(N: U->getUser(), OpNo: U->getOperandNo());

    // If the register class is unknown, it could be an unknown
    // register class that needs to be an SGPR, e.g. an inline asm
    // constraint
    if (!RC || SIRI->isSGPRClass(RC))
      return false;

    if (RC != &AMDGPU::VS_32RegClass && RC != &AMDGPU::VS_64RegClass &&
        RC != &AMDGPU::VS_64_Align2RegClass) {
      // This operand slot does not accept an SGPR directly, but if the user
      // is commutable the value may be movable into a VS-class slot.
      AllUsesAcceptSReg = false;
      SDNode *User = U->getUser();
      if (User->isMachineOpcode()) {
        unsigned Opc = User->getMachineOpcode();
        const MCInstrDesc &Desc = SII->get(Opcode: Opc);
        if (Desc.isCommutable()) {
          // Operand indices in the MCInstrDesc include the defs, hence the
          // getNumDefs() adjustments in both directions.
          unsigned OpIdx = Desc.getNumDefs() + U->getOperandNo();
          unsigned CommuteIdx1 = TargetInstrInfo::CommuteAnyOperandIndex;
          if (SII->findCommutedOpIndices(Desc, SrcOpIdx0&: OpIdx, SrcOpIdx1&: CommuteIdx1)) {
            unsigned CommutedOpNo = CommuteIdx1 - Desc.getNumDefs();
            const TargetRegisterClass *CommutedRC =
                getOperandRegClass(N: U->getUser(), OpNo: CommutedOpNo);
            if (CommutedRC == &AMDGPU::VS_32RegClass ||
                CommutedRC == &AMDGPU::VS_64RegClass ||
                CommutedRC == &AMDGPU::VS_64_Align2RegClass)
              AllUsesAcceptSReg = true;
          }
        }
      }
      // If "AllUsesAcceptSReg == false" so far we haven't succeeded
      // commuting current user. This means have at least one use
      // that strictly require VGPR. Thus, we will not attempt to commute
      // other user instructions.
      if (!AllUsesAcceptSReg)
        break;
    }
  }
  return !AllUsesAcceptSReg && (Limit < 10);
}
4541
4542bool AMDGPUDAGToDAGISel::isUniformLoad(const SDNode *N) const {
4543 const auto *Ld = cast<LoadSDNode>(Val: N);
4544 const MachineMemOperand *MMO = Ld->getMemOperand();
4545
4546 // FIXME: We ought to able able to take the direct isDivergent result. We
4547 // cannot rely on the MMO for a uniformity check, and should stop using
4548 // it. This is a hack for 2 ways that the IR divergence analysis is superior
4549 // to the DAG divergence: Recognizing shift-of-workitem-id as always
4550 // uniform, and isSingleLaneExecution. These should be handled in the DAG
4551 // version, and then this can be dropped.
4552 if (Ld->isDivergent() && !AMDGPU::isUniformMMO(MMO))
4553 return false;
4554
4555 return MMO->getSize().hasValue() &&
4556 Ld->getAlign() >=
4557 Align(std::min(a: MMO->getSize().getValue().getKnownMinValue(),
4558 b: uint64_t(4))) &&
4559 (MMO->isInvariant() ||
4560 (Ld->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS ||
4561 Ld->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS_32BIT) ||
4562 (Subtarget->getScalarizeGlobalBehavior() &&
4563 Ld->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS &&
4564 Ld->isSimple() &&
4565 static_cast<const SITargetLowering *>(getTargetLowering())
4566 ->isMemOpHasNoClobberedMemOperand(N)));
4567}
4568
4569void AMDGPUDAGToDAGISel::PostprocessISelDAG() {
4570 const AMDGPUTargetLowering& Lowering =
4571 *static_cast<const AMDGPUTargetLowering*>(getTargetLowering());
4572 bool IsModified = false;
4573 do {
4574 IsModified = false;
4575
4576 // Go over all selected nodes and try to fold them a bit more
4577 SelectionDAG::allnodes_iterator Position = CurDAG->allnodes_begin();
4578 while (Position != CurDAG->allnodes_end()) {
4579 SDNode *Node = &*Position++;
4580 MachineSDNode *MachineNode = dyn_cast<MachineSDNode>(Val: Node);
4581 if (!MachineNode)
4582 continue;
4583
4584 SDNode *ResNode = Lowering.PostISelFolding(N: MachineNode, DAG&: *CurDAG);
4585 if (ResNode != Node) {
4586 if (ResNode)
4587 ReplaceUses(F: Node, T: ResNode);
4588 IsModified = true;
4589 }
4590 }
4591 CurDAG->RemoveDeadNodes();
4592 } while (IsModified);
4593}
4594
// Legacy pass-manager wrapper: constructs the underlying AMDGPUDAGToDAGISel
// and hands it to the common SelectionDAGISelLegacy driver.
AMDGPUDAGToDAGISelLegacy::AMDGPUDAGToDAGISelLegacy(TargetMachine &TM,
                                                   CodeGenOptLevel OptLevel)
    : SelectionDAGISelLegacy(
          ID, std::make_unique<AMDGPUDAGToDAGISel>(args&: TM, args&: OptLevel)) {}

// Pass identification: the address of ID uniquely identifies this pass.
char AMDGPUDAGToDAGISelLegacy::ID = 0;
4601