AMDGPUISelDAGToDAG.cpp source code [llvm_projects/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp]

1	//===-- AMDGPUISelDAGToDAG.cpp - A dag to dag inst selector for AMDGPU ----===//
2	//
3	// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4	// See https://llvm.org/LICENSE.txt for license information.
5	// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6	//
7	//==-----------------------------------------------------------------------===//
8	//
9	/// \file
10	/// Defines an instruction selector for the AMDGPU target.
11	//
12	//===----------------------------------------------------------------------===//
13
14	#include "AMDGPUISelDAGToDAG.h"
15	#include "AMDGPU.h"
16	#include "AMDGPUInstrInfo.h"
17	#include "AMDGPUSubtarget.h"
18	#include "AMDGPUTargetMachine.h"
19	#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
20	#include "MCTargetDesc/R600MCTargetDesc.h"
21	#include "R600RegisterInfo.h"
22	#include "SIISelLowering.h"
23	#include "SIMachineFunctionInfo.h"
24	#include "llvm/Analysis/UniformityAnalysis.h"
25	#include "llvm/Analysis/ValueTracking.h"
26	#include "llvm/CodeGen/FunctionLoweringInfo.h"
27	#include "llvm/CodeGen/SelectionDAG.h"
28	#include "llvm/CodeGen/SelectionDAGISel.h"
29	#include "llvm/CodeGen/SelectionDAGNodes.h"
30	#include "llvm/IR/IntrinsicsAMDGPU.h"
31	#include "llvm/InitializePasses.h"
32	#include "llvm/Support/ErrorHandling.h"
33
34	#ifdef EXPENSIVE_CHECKS
35	#include "llvm/Analysis/LoopInfo.h"
36	#include "llvm/IR/Dominators.h"
37	#endif
38
39	#define DEBUG_TYPE "amdgpu-isel"
40
41	using namespace llvm;
42
43	//===----------------------------------------------------------------------===//
44	// Instruction Selector Implementation
45	//===----------------------------------------------------------------------===//
46
47	namespace {
48	static SDValue stripBitcast(SDValue Val) {
49	return Val.getOpcode() == ISD::BITCAST ? Val.getOperand(i: `0`) : Val;
50	}
51
52	// Figure out if this is really an extract of the high 16-bits of a dword.
53	static bool isExtractHiElt(SDValue In, SDValue &Out) {
54	In = stripBitcast(Val: In);
55
56	if (In.getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
57	if (ConstantSDNode *Idx = dyn_cast<ConstantSDNode>(Val: In.getOperand(i: `1`))) {
58	if (!Idx->isOne())
59	return false;
60	Out = In.getOperand(i: `0`);
61	return true;
62	}
63	}
64
65	if (In.getOpcode() != ISD::TRUNCATE)
66	return false;
67
68	SDValue Srl = In.getOperand(i: `0`);
69	if (Srl.getOpcode() == ISD::SRL) {
70	if (ConstantSDNode *ShiftAmt = dyn_cast<ConstantSDNode>(Val: Srl.getOperand(i: `1`))) {
71	if (ShiftAmt->getZExtValue() == `16`) {
72	Out = stripBitcast(Val: Srl.getOperand(i: `0`));
73	return true;
74	}
75	}
76	}
77
78	return false;
79	}
80
81	// Look through operations that obscure just looking at the low 16-bits of the
82	// same register.
83	static SDValue stripExtractLoElt(SDValue In) {
84	if (In.getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
85	SDValue Idx = In.getOperand(i: `1`);
86	if (isNullConstant(V: Idx) && In.getValueSizeInBits() <= `32`)
87	return In.getOperand(i: `0`);
88	}
89
90	if (In.getOpcode() == ISD::TRUNCATE) {
91	SDValue Src = In.getOperand(i: `0`);
92	if (Src.getValueType().getSizeInBits() == `32`)
93	return stripBitcast(Val: Src);
94	}
95
96	return In;
97	}
98
99	} // end anonymous namespace
100
// Legacy pass-manager registration for AMDGPUDAGToDAGISelLegacy under the
// command-line name "amdgpu-isel". The INITIALIZE_PASS_DEPENDENCY entries list
// the passes initialized alongside it; DominatorTreeWrapperPass and
// LoopInfoWrapperPass are only listed when EXPENSIVE_CHECKS is defined.
101	INITIALIZE_PASS_BEGIN(AMDGPUDAGToDAGISelLegacy, "amdgpu-isel",
102	"AMDGPU DAG->DAG Pattern Instruction Selection", false,
103	false)
104	INITIALIZE_PASS_DEPENDENCY(AMDGPUArgumentUsageInfo)
105	INITIALIZE_PASS_DEPENDENCY(AMDGPUPerfHintAnalysis)
106	INITIALIZE_PASS_DEPENDENCY(UniformityInfoWrapperPass)
107	#ifdef EXPENSIVE_CHECKS
108	INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
109	INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
110	#endif
111	INITIALIZE_PASS_END(AMDGPUDAGToDAGISelLegacy, "amdgpu-isel",
112	"AMDGPU DAG->DAG Pattern Instruction Selection", false,
113	false)
114
115	/// This pass converts a legalized DAG into a AMDGPU-specific
116	// DAG, ready for instruction scheduling.
117	FunctionPass *llvm::createAMDGPUISelDag(TargetMachine &TM,
118	CodeGenOptLevel OptLevel) {
119	return new AMDGPUDAGToDAGISelLegacy (TM, OptLevel);
120	}
121
122	AMDGPUDAGToDAGISel::AMDGPUDAGToDAGISel(TargetMachine &TM,
123	CodeGenOptLevel OptLevel)
124	: SelectionDAGISel (TM, OptLevel) {
125	EnableLateStructurizeCFG = AMDGPUTargetMachine::EnableLateStructurizeCFG;
126	}
127
128	bool AMDGPUDAGToDAGISel::runOnMachineFunction(MachineFunction &MF) {
129	Subtarget = &MF.getSubtarget<GCNSubtarget>();
130	Subtarget->checkSubtargetFeatures(F: MF.getFunction());
131	Mode = SIModeRegisterDefaults (MF.getFunction(), *Subtarget);
132	return SelectionDAGISel::runOnMachineFunction(mf&: MF);
133	}
134
135	bool AMDGPUDAGToDAGISel::fp16SrcZerosHighBits(unsigned Opc) const {
136	// XXX - only need to list legal operations.
137	switch (Opc) {
138	case ISD::FADD:
139	case ISD::FSUB:
140	case ISD::FMUL:
141	case ISD::FDIV:
142	case ISD::FREM:
143	case ISD::FCANONICALIZE:
144	case ISD::UINT_TO_FP:
145	case ISD::SINT_TO_FP:
146	case ISD::FABS:
147	// Fabs is lowered to a bit operation, but it's an and which will clear the
148	// high bits anyway.
149	case ISD::FSQRT:
150	case ISD::FSIN:
151	case ISD::FCOS:
152	case ISD::FPOWI:
153	case ISD::FPOW:
154	case ISD::FLOG:
155	case ISD::FLOG2:
156	case ISD::FLOG10:
157	case ISD::FEXP:
158	case ISD::FEXP2:
159	case ISD::FCEIL:
160	case ISD::FTRUNC:
161	case ISD::FRINT:
162	case ISD::FNEARBYINT:
163	case ISD::FROUNDEVEN:
164	case ISD::FROUND:
165	case ISD::FFLOOR:
166	case ISD::FMINNUM:
167	case ISD::FMAXNUM:
168	case ISD::FLDEXP:
169	case AMDGPUISD::FRACT:
170	case AMDGPUISD::CLAMP:
171	case AMDGPUISD::COS_HW:
172	case AMDGPUISD::SIN_HW:
173	case AMDGPUISD::FMIN3:
174	case AMDGPUISD::FMAX3:
175	case AMDGPUISD::FMED3:
176	case AMDGPUISD::FMAD_FTZ:
177	case AMDGPUISD::RCP:
178	case AMDGPUISD::RSQ:
179	case AMDGPUISD::RCP_IFLAG:
180	// On gfx10, all 16-bit instructions preserve the high bits.
181	return Subtarget->getGeneration() <= AMDGPUSubtarget::GFX9;
182	case ISD::FP_ROUND:
183	// We may select fptrunc (fma/mad) to mad_mixlo, which does not zero the
184	// high bits on gfx9.
185	// TODO: If we had the source node we could see if the source was fma/mad
186	return Subtarget->getGeneration() == AMDGPUSubtarget::VOLCANIC_ISLANDS;
187	case ISD::FMA:
188	case ISD::FMAD:
189	case AMDGPUISD::DIV_FIXUP:
190	return Subtarget->getGeneration() == AMDGPUSubtarget::VOLCANIC_ISLANDS;
191	default:
192	// fcopysign, select and others may be lowered to 32-bit bit operations
193	// which don't zero the high bits.
194	return false;
195	}
196	}
197
198	bool AMDGPUDAGToDAGISelLegacy::runOnMachineFunction(MachineFunction &MF) {
199	#ifdef EXPENSIVE_CHECKS
200	DominatorTree &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
201	LoopInfo *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
202	for (auto &L : LI->getLoopsInPreorder()) {
203	assert(L->isLCSSAForm(DT));
204	}
205	#endif
206	return SelectionDAGISelLegacy::runOnMachineFunction(MF);
207	}
208
209	void AMDGPUDAGToDAGISelLegacy::getAnalysisUsage(AnalysisUsage &AU) const {
210	AU.addRequired<AMDGPUArgumentUsageInfo>();
211	AU.addRequired<UniformityInfoWrapperPass>();
212	#ifdef EXPENSIVE_CHECKS
213	AU.addRequired<DominatorTreeWrapperPass>();
214	AU.addRequired<LoopInfoWrapperPass>();
215	#endif
216	SelectionDAGISelLegacy::getAnalysisUsage(AU);
217	}
218
219	bool AMDGPUDAGToDAGISel::matchLoadD16FromBuildVector(SDNode N) const* {
220	assert(Subtarget->d16PreservesUnusedBits());
221	MVT VT = N->getValueType(ResNo: `0`).getSimpleVT();
222	if (VT != MVT::v2i16 && VT != MVT::v2f16)
223	return false;
224
225	SDValue Lo = N->getOperand(Num: `0`);
226	SDValue Hi = N->getOperand(Num: `1`);
227
228	LoadSDNode *LdHi = dyn_cast<LoadSDNode>(Val: stripBitcast(Val: Hi));
229
230	// build_vector lo, (load ptr) -> load_d16_hi ptr, lo
231	// build_vector lo, (zextload ptr from i8) -> load_d16_hi_u8 ptr, lo
232	// build_vector lo, (sextload ptr from i8) -> load_d16_hi_i8 ptr, lo
233
234	// Need to check for possible indirect dependencies on the other half of the
235	// vector to avoid introducing a cycle.
236	if (LdHi && Hi.hasOneUse() && !LdHi->isPredecessorOf(N: Lo.getNode())) {
237	SDVTList VTList = CurDAG->getVTList(VT1: VT, VT2: MVT::Other);
238
239	SDValue TiedIn = CurDAG->getNode(Opcode: ISD::SCALAR_TO_VECTOR, DL: SDLoc (N), VT, Operand: Lo);
240	SDValue Ops[] = {
241	LdHi->getChain(), LdHi->getBasePtr(), TiedIn
242	};
243
244	unsigned LoadOp = AMDGPUISD::LOAD_D16_HI;
245	if (LdHi->getMemoryVT() == MVT::i8) {
246	LoadOp = LdHi->getExtensionType() == ISD::SEXTLOAD ?
247	AMDGPUISD::LOAD_D16_HI_I8 : AMDGPUISD::LOAD_D16_HI_U8;
248	} else {
249	assert(LdHi->getMemoryVT() == MVT::i16);
250	}
251
252	SDValue NewLoadHi =
253	CurDAG->getMemIntrinsicNode(Opcode: LoadOp, dl: SDLoc (LdHi), VTList,
254	Ops, MemVT: LdHi->getMemoryVT(),
255	MMO: LdHi->getMemOperand());
256
257	CurDAG->ReplaceAllUsesOfValueWith(From: SDValue (N, `0`), To: NewLoadHi);
258	CurDAG->ReplaceAllUsesOfValueWith(From: SDValue (LdHi, `1`), To: NewLoadHi.getValue(R: `1`));
259	return true;
260	}
261
262	// build_vector (load ptr), hi -> load_d16_lo ptr, hi
263	// build_vector (zextload ptr from i8), hi -> load_d16_lo_u8 ptr, hi
264	// build_vector (sextload ptr from i8), hi -> load_d16_lo_i8 ptr, hi
265	LoadSDNode *LdLo = dyn_cast<LoadSDNode>(Val: stripBitcast(Val: Lo));
266	if (LdLo && Lo.hasOneUse()) {
267	SDValue TiedIn = getHi16Elt(In: Hi);
268	if (!TiedIn \|\| LdLo->isPredecessorOf(N: TiedIn.getNode()))
269	return false;
270
271	SDVTList VTList = CurDAG->getVTList(VT1: VT, VT2: MVT::Other);
272	unsigned LoadOp = AMDGPUISD::LOAD_D16_LO;
273	if (LdLo->getMemoryVT() == MVT::i8) {
274	LoadOp = LdLo->getExtensionType() == ISD::SEXTLOAD ?
275	AMDGPUISD::LOAD_D16_LO_I8 : AMDGPUISD::LOAD_D16_LO_U8;
276	} else {
277	assert(LdLo->getMemoryVT() == MVT::i16);
278	}
279
280	TiedIn = CurDAG->getNode(Opcode: ISD::BITCAST, DL: SDLoc (N), VT, Operand: TiedIn);
281
282	SDValue Ops[] = {
283	LdLo->getChain(), LdLo->getBasePtr(), TiedIn
284	};
285
286	SDValue NewLoadLo =
287	CurDAG->getMemIntrinsicNode(Opcode: LoadOp, dl: SDLoc (LdLo), VTList,
288	Ops, MemVT: LdLo->getMemoryVT(),
289	MMO: LdLo->getMemOperand());
290
291	CurDAG->ReplaceAllUsesOfValueWith(From: SDValue (N, `0`), To: NewLoadLo);
292	CurDAG->ReplaceAllUsesOfValueWith(From: SDValue (LdLo, `1`), To: NewLoadLo.getValue(R: `1`));
293	return true;
294	}
295
296	return false;
297	}
298
299	void AMDGPUDAGToDAGISel::PreprocessISelDAG() {
300	if (!Subtarget->d16PreservesUnusedBits())
301	return;
302
303	SelectionDAG::allnodes_iterator Position = CurDAG->allnodes_end();
304
305	bool MadeChange = false;
306	while (Position != CurDAG->allnodes_begin()) {
307	SDNode N = &--Position;
308	if (N->use_empty())
309	continue;
310
311	switch (N->getOpcode()) {
312	case ISD::BUILD_VECTOR:
313	// TODO: Match load d16 from shl (extload:i16), 16
314	MadeChange \|= matchLoadD16FromBuildVector(N);
315	break;
316	default:
317	break;
318	}
319	}
320
321	if (MadeChange) {
322	CurDAG->RemoveDeadNodes();
323	LLVM_DEBUG(dbgs() << "After PreProcess:\n";
324	CurDAG->dump(););
325	}
326	}
327
328	bool AMDGPUDAGToDAGISel::isInlineImmediate(const SDNode N) const* {
329	if (N->isUndef())
330	return true;
331
332	const SIInstrInfo *TII = Subtarget->getInstrInfo();
333	if (const ConstantSDNode *C = dyn_cast<ConstantSDNode>(Val: N))
334	return TII->isInlineConstant(Imm: C->getAPIntValue());
335
336	if (const ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(Val: N))
337	return TII->isInlineConstant(Imm: C->getValueAPF());
338
339	return false;
340	}
341
342	/// Determine the register class for \p OpNo
343	/// \returns The register class of the virtual register that will be used for
344	/// the given operand number \OpNo or NULL if the register class cannot be
345	/// determined.
346	const TargetRegisterClass AMDGPUDAGToDAGISel::getOperandRegClass(SDNode N,
347	unsigned OpNo) const {
348	if (!N->isMachineOpcode()) {
349	if (N->getOpcode() == ISD::CopyToReg) {
350	Register Reg = cast<RegisterSDNode>(Val: N->getOperand(Num: `1`))->getReg();
351	if (Reg.isVirtual()) {
352	MachineRegisterInfo &MRI = CurDAG->getMachineFunction().getRegInfo();
353	return MRI.getRegClass(Reg);
354	}
355
356	const SIRegisterInfo *TRI
357	= static_cast<const GCNSubtarget *>(Subtarget)->getRegisterInfo();
358	return TRI->getPhysRegBaseClass(Reg);
359	}
360
361	return nullptr;
362	}
363
364	switch (N->getMachineOpcode()) {
365	default: {
366	const MCInstrDesc &Desc =
367	Subtarget->getInstrInfo()->get(Opcode: N->getMachineOpcode());
368	unsigned OpIdx = Desc.getNumDefs() + OpNo;
369	if (OpIdx >= Desc.getNumOperands())
370	return nullptr;
371	int RegClass = Desc.operands()[OpIdx].RegClass;
372	if (RegClass == -`1`)
373	return nullptr;
374
375	return Subtarget->getRegisterInfo()->getRegClass(RCID: RegClass);
376	}
377	case AMDGPU::REG_SEQUENCE: {
378	unsigned RCID = N->getConstantOperandVal(Num: `0`);
379	const TargetRegisterClass *SuperRC =
380	Subtarget->getRegisterInfo()->getRegClass(RCID);
381
382	SDValue SubRegOp = N->getOperand(Num: OpNo + `1`);
383	unsigned SubRegIdx = SubRegOp ->getAsZExtVal();
384	return Subtarget->getRegisterInfo()->getSubClassWithSubReg(SuperRC,
385	SubRegIdx);
386	}
387	}
388	}
389
390	SDNode AMDGPUDAGToDAGISel::glueCopyToOp(SDNode N, SDValue NewChain,
391	SDValue Glue) const {
392	SmallVector <SDValue, `8`> Ops;
393	Ops.push_back(Elt: NewChain); // Replace the chain.
394	for (unsigned i = `1`, e = N->getNumOperands(); i != e; ++i)
395	Ops.push_back(Elt: N->getOperand(Num: i));
396
397	Ops.push_back(Elt: Glue);
398	return CurDAG->MorphNodeTo(N, Opc: N->getOpcode(), VTs: N->getVTList(), Ops);
399	}
400
401	SDNode AMDGPUDAGToDAGISel::glueCopyToM0(SDNode N, SDValue Val) const {
402	const SITargetLowering& Lowering =
403	*static_cast<const SITargetLowering*>(getTargetLowering());
404
405	assert(N->getOperand(`0`).getValueType() == MVT::Other && "Expected chain");
406
407	SDValue M0 = Lowering.copyToM0(DAG&: *CurDAG, Chain: N->getOperand(Num: `0`), DL: SDLoc (N), V: Val);
408	return glueCopyToOp(N, NewChain: M0, Glue: M0.getValue(R: `1`));
409	}
410
411	SDNode AMDGPUDAGToDAGISel::glueCopyToM0LDSInit(SDNode N) const {
412	unsigned AS = cast<MemSDNode>(Val: N)->getAddressSpace();
413	if (AS == AMDGPUAS::LOCAL_ADDRESS) {
414	if (Subtarget->ldsRequiresM0Init())
415	return glueCopyToM0(N, Val: CurDAG->getTargetConstant(Val: -`1`, DL: SDLoc (N), VT: MVT::i32));
416	} else if (AS == AMDGPUAS::REGION_ADDRESS) {
417	MachineFunction &MF = CurDAG->getMachineFunction();
418	unsigned Value = MF.getInfo<SIMachineFunctionInfo>()->getGDSSize();
419	return
420	glueCopyToM0(N, Val: CurDAG->getTargetConstant(Val: Value, DL: SDLoc (N), VT: MVT::i32));
421	}
422	return N;
423	}
424
425	MachineSDNode *AMDGPUDAGToDAGISel::buildSMovImm64(SDLoc &DL, uint64_t Imm,
426	EVT VT) const {
427	SDNode *Lo = CurDAG->getMachineNode(
428	Opcode: AMDGPU::S_MOV_B32, dl: DL, VT: MVT::i32,
429	Op1: CurDAG->getTargetConstant(Val: Imm & `0xFFFFFFFF`, DL, VT: MVT::i32));
430	SDNode *Hi =
431	CurDAG->getMachineNode(Opcode: AMDGPU::S_MOV_B32, dl: DL, VT: MVT::i32,
432	Op1: CurDAG->getTargetConstant(Val: Imm >> `32`, DL, VT: MVT::i32));
433	const SDValue Ops[] = {
434	CurDAG->getTargetConstant(Val: AMDGPU::SReg_64RegClassID, DL, VT: MVT::i32),
435	SDValue (Lo, `0`), CurDAG->getTargetConstant(Val: AMDGPU::sub0, DL, VT: MVT::i32),
436	SDValue (Hi, `0`), CurDAG->getTargetConstant(Val: AMDGPU::sub1, DL, VT: MVT::i32)};
437
438	return CurDAG->getMachineNode(Opcode: TargetOpcode::REG_SEQUENCE, dl: DL, VT, Ops);
439	}
440
441	void AMDGPUDAGToDAGISel::SelectBuildVector(SDNode N, unsigned* RegClassID) {
442	EVT VT = N->getValueType(ResNo: `0`);
443	unsigned NumVectorElts = VT.getVectorNumElements();
444	EVT EltVT = VT.getVectorElementType();
445	SDLoc DL(N);
446	SDValue RegClass = CurDAG->getTargetConstant(Val: RegClassID, DL, VT: MVT::i32);
447
448	if (NumVectorElts == `1`) {
449	CurDAG->SelectNodeTo(N, MachineOpc: AMDGPU::COPY_TO_REGCLASS, VT: EltVT, Op1: N->getOperand(Num: `0`),
450	Op2: RegClass);
451	return;
452	}
453
454	assert(NumVectorElts <= `32` && "Vectors with more than 32 elements not "
455	"supported yet");
456	// 32 = Max Num Vector Elements
457	// 2 = 2 REG_SEQUENCE operands per element (value, subreg index)
458	// 1 = Vector Register Class
459	SmallVector<SDValue, `32` * `2` + `1`> RegSeqArgs(NumVectorElts * `2` + `1`);
460
461	bool IsGCN = CurDAG->getSubtarget().getTargetTriple().getArch() ==
462	Triple::amdgcn;
463	RegSeqArgs [`0`] = CurDAG->getTargetConstant(Val: RegClassID, DL, VT: MVT::i32);
464	bool IsRegSeq = true;
465	unsigned NOps = N->getNumOperands();
466	for (unsigned i = `0`; i < NOps; i++) {
467	// XXX: Why is this here?
468	if (isa<RegisterSDNode>(Val: N->getOperand(Num: i))) {
469	IsRegSeq = false;
470	break;
471	}
472	unsigned Sub = IsGCN ? SIRegisterInfo::getSubRegFromChannel(Channel: i)
473	: R600RegisterInfo::getSubRegFromChannel(Channel: i);
474	RegSeqArgs [`1` + (`2` * i)] = N->getOperand(Num: i);
475	RegSeqArgs [`1` + (`2` * i) + `1`] = CurDAG->getTargetConstant(Val: Sub, DL, VT: MVT::i32);
476	}
477	if (NOps != NumVectorElts) {
478	// Fill in the missing undef elements if this was a scalar_to_vector.
479	assert(N->getOpcode() == ISD::SCALAR_TO_VECTOR && NOps < NumVectorElts);
480	MachineSDNode *ImpDef = CurDAG->getMachineNode(Opcode: TargetOpcode::IMPLICIT_DEF,
481	dl: DL, VT: EltVT);
482	for (unsigned i = NOps; i < NumVectorElts; ++i) {
483	unsigned Sub = IsGCN ? SIRegisterInfo::getSubRegFromChannel(Channel: i)
484	: R600RegisterInfo::getSubRegFromChannel(Channel: i);
485	RegSeqArgs [`1` + (`2` * i)] = SDValue (ImpDef, `0`);
486	RegSeqArgs [`1` + (`2` * i) + `1`] =
487	CurDAG->getTargetConstant(Val: Sub, DL, VT: MVT::i32);
488	}
489	}
490
491	if (!IsRegSeq)
492	SelectCode(N);
493	CurDAG->SelectNodeTo(N, MachineOpc: AMDGPU::REG_SEQUENCE, VTs: N->getVTList(), Ops: RegSeqArgs);
494	}
495
496	void AMDGPUDAGToDAGISel::Select(SDNode *N) {
497	unsigned int Opc = N->getOpcode();
498	if (N->isMachineOpcode()) {
499	N->setNodeId(-`1`);
500	return; // Already selected.
501	}
502
503	// isa<MemSDNode> almost works but is slightly too permissive for some DS
504	// intrinsics.
505	if (Opc == ISD::LOAD \|\| Opc == ISD::STORE \|\| isa<AtomicSDNode>(Val: N)) {
506	N = glueCopyToM0LDSInit(N);
507	SelectCode(N);
508	return;
509	}
510
511	switch (Opc) {
512	default:
513	break;
514	// We are selecting i64 ADD here instead of custom lower it during
515	// DAG legalization, so we can fold some i64 ADDs used for address
516	// calculation into the LOAD and STORE instructions.
517	case ISD::ADDC:
518	case ISD::ADDE:
519	case ISD::SUBC:
520	case ISD::SUBE: {
521	if (N->getValueType(ResNo: `0`) != MVT::i64)
522	break;
523
524	SelectADD_SUB_I64(N);
525	return;
526	}
527	case ISD::UADDO_CARRY:
528	case ISD::USUBO_CARRY:
529	if (N->getValueType(ResNo: `0`) != MVT::i32)
530	break;
531
532	SelectAddcSubb(N);
533	return;
534	case ISD::UADDO:
535	case ISD::USUBO: {
536	SelectUADDO_USUBO(N);
537	return;
538	}
539	case AMDGPUISD::FMUL_W_CHAIN: {
540	SelectFMUL_W_CHAIN(N);
541	return;
542	}
543	case AMDGPUISD::FMA_W_CHAIN: {
544	SelectFMA_W_CHAIN(N);
545	return;
546	}
547
548	case ISD::SCALAR_TO_VECTOR:
549	case ISD::BUILD_VECTOR: {
550	EVT VT = N->getValueType(ResNo: `0`);
551	unsigned NumVectorElts = VT.getVectorNumElements();
552	if (VT.getScalarSizeInBits() == `16`) {
553	if (Opc == ISD::BUILD_VECTOR && NumVectorElts == `2`) {
554	if (SDNode Packed = packConstantV2I16(N, DAG&: CurDAG)) {
555	ReplaceNode(F: N, T: Packed);
556	return;
557	}
558	}
559
560	break;
561	}
562
563	assert(VT.getVectorElementType().bitsEq(MVT::i32));
564	unsigned RegClassID =
565	SIRegisterInfo::getSGPRClassForBitWidth(BitWidth: NumVectorElts * `32`)->getID();
566	SelectBuildVector(N, RegClassID);
567	return;
568	}
569	case ISD::BUILD_PAIR: {
570	SDValue RC, SubReg0, SubReg1;
571	SDLoc DL(N);
572	if (N->getValueType(ResNo: `0`) == MVT::i128) {
573	RC = CurDAG->getTargetConstant(Val: AMDGPU::SGPR_128RegClassID, DL, VT: MVT::i32);
574	SubReg0 = CurDAG->getTargetConstant(Val: AMDGPU::sub0_sub1, DL, VT: MVT::i32);
575	SubReg1 = CurDAG->getTargetConstant(Val: AMDGPU::sub2_sub3, DL, VT: MVT::i32);
576	} else if (N->getValueType(ResNo: `0`) == MVT::i64) {
577	RC = CurDAG->getTargetConstant(Val: AMDGPU::SReg_64RegClassID, DL, VT: MVT::i32);
578	SubReg0 = CurDAG->getTargetConstant(Val: AMDGPU::sub0, DL, VT: MVT::i32);
579	SubReg1 = CurDAG->getTargetConstant(Val: AMDGPU::sub1, DL, VT: MVT::i32);
580	} else {
581	llvm_unreachable("Unhandled value type for BUILD_PAIR");
582	}
583	const SDValue Ops[] = { RC, N->getOperand(Num: `0`), SubReg0,
584	N->getOperand(Num: `1`), SubReg1 };
585	ReplaceNode(F: N, T: CurDAG->getMachineNode(Opcode: TargetOpcode::REG_SEQUENCE, dl: DL,
586	VT: N->getValueType(ResNo: `0`), Ops));
587	return;
588	}
589
590	case ISD::Constant:
591	case ISD::ConstantFP: {
592	if (N->getValueType(ResNo: `0`).getSizeInBits() != `64` \|\| isInlineImmediate(N))
593	break;
594
595	uint64_t Imm;
596	if (ConstantFPSDNode *FP = dyn_cast<ConstantFPSDNode>(Val: N)) {
597	Imm = FP->getValueAPF().bitcastToAPInt().getZExtValue();
598	if (AMDGPU::isValid32BitLiteral(Val: Imm, IsFP64: true))
599	break;
600	} else {
601	ConstantSDNode *C = cast<ConstantSDNode>(Val: N);
602	Imm = C->getZExtValue();
603	if (AMDGPU::isValid32BitLiteral(Val: Imm, IsFP64: false))
604	break;
605	}
606
607	SDLoc DL(N);
608	ReplaceNode(F: N, T: buildSMovImm64(DL, Imm, VT: N->getValueType(ResNo: `0`)));
609	return;
610	}
611	case AMDGPUISD::BFE_I32:
612	case AMDGPUISD::BFE_U32: {
613	// There is a scalar version available, but unlike the vector version which
614	// has a separate operand for the offset and width, the scalar version packs
615	// the width and offset into a single operand. Try to move to the scalar
616	// version if the offsets are constant, so that we can try to keep extended
617	// loads of kernel arguments in SGPRs.
618
619	// TODO: Technically we could try to pattern match scalar bitshifts of
620	// dynamic values, but it's probably not useful.
621	ConstantSDNode *Offset = dyn_cast<ConstantSDNode>(Val: N->getOperand(Num: `1`));
622	if (!Offset)
623	break;
624
625	ConstantSDNode *Width = dyn_cast<ConstantSDNode>(Val: N->getOperand(Num: `2`));
626	if (!Width)
627	break;
628
629	bool Signed = Opc == AMDGPUISD::BFE_I32;
630
631	uint32_t OffsetVal = Offset->getZExtValue();
632	uint32_t WidthVal = Width->getZExtValue();
633
634	ReplaceNode(F: N, T: getBFE32(IsSigned: Signed, DL: SDLoc (N), Val: N->getOperand(Num: `0`), Offset: OffsetVal,
635	Width: WidthVal));
636	return;
637	}
638	case AMDGPUISD::DIV_SCALE: {
639	SelectDIV_SCALE(N);
640	return;
641	}
642	case AMDGPUISD::MAD_I64_I32:
643	case AMDGPUISD::MAD_U64_U32: {
644	SelectMAD_64_32(N);
645	return;
646	}
647	case ISD::SMUL_LOHI:
648	case ISD::UMUL_LOHI:
649	return SelectMUL_LOHI(N);
650	case ISD::CopyToReg: {
651	const SITargetLowering& Lowering =
652	*static_cast<const SITargetLowering*>(getTargetLowering());
653	N = Lowering.legalizeTargetIndependentNode(Node: N, DAG&: *CurDAG);
654	break;
655	}
656	case ISD::AND:
657	case ISD::SRL:
658	case ISD::SRA:
659	case ISD::SIGN_EXTEND_INREG:
660	if (N->getValueType(ResNo: `0`) != MVT::i32)
661	break;
662
663	SelectS_BFE(N);
664	return;
665	case ISD::BRCOND:
666	SelectBRCOND(N);
667	return;
668	case ISD::FP_EXTEND:
669	SelectFP_EXTEND(N);
670	return;
671	case AMDGPUISD::CVT_PKRTZ_F16_F32:
672	case AMDGPUISD::CVT_PKNORM_I16_F32:
673	case AMDGPUISD::CVT_PKNORM_U16_F32:
674	case AMDGPUISD::CVT_PK_U16_U32:
675	case AMDGPUISD::CVT_PK_I16_I32: {
676	// Hack around using a legal type if f16 is illegal.
677	if (N->getValueType(ResNo: `0`) == MVT::i32) {
678	MVT NewVT = Opc == AMDGPUISD::CVT_PKRTZ_F16_F32 ? MVT::v2f16 : MVT::v2i16;
679	N = CurDAG->MorphNodeTo(N, Opc: N->getOpcode(), VTs: CurDAG->getVTList(VT: NewVT),
680	Ops: { N->getOperand(Num: `0`), N->getOperand(Num: `1`) });
681	SelectCode(N);
682	return;
683	}
684
685	break;
686	}
687	case ISD::INTRINSIC_W_CHAIN: {
688	SelectINTRINSIC_W_CHAIN(N);
689	return;
690	}
691	case ISD::INTRINSIC_WO_CHAIN: {
692	SelectINTRINSIC_WO_CHAIN(N);
693	return;
694	}
695	case ISD::INTRINSIC_VOID: {
696	SelectINTRINSIC_VOID(N);
697	return;
698	}
699	case AMDGPUISD::WAVE_ADDRESS: {
700	SelectWAVE_ADDRESS(N);
701	return;
702	}
703	case ISD::STACKRESTORE: {
704	SelectSTACKRESTORE(N);
705	return;
706	}
707	}
708
709	SelectCode(N);
710	}
711
712	bool AMDGPUDAGToDAGISel::isUniformBr(const SDNode N) const* {
713	const BasicBlock *BB = FuncInfo ->MBB->getBasicBlock();
714	const Instruction *Term = BB->getTerminator();
715	return Term->getMetadata(Kind: "amdgpu.uniform") \|\|
716	Term->getMetadata(Kind: "structurizecfg.uniform");
717	}
718
719	bool AMDGPUDAGToDAGISel::isUnneededShiftMask(const SDNode *N,
720	unsigned ShAmtBits) const {
721	assert(N->getOpcode() == ISD::AND);
722
723	const APInt &RHS = N->getConstantOperandAPInt(Num: `1`);
724	if (RHS.countr_one() >= ShAmtBits)
725	return true;
726
727	const APInt &LHSKnownZeros = CurDAG->computeKnownBits(Op: N->getOperand(Num: `0`)).Zero;
728	return (LHSKnownZeros \| RHS).countr_one() >= ShAmtBits;
729	}
730
731	static bool getBaseWithOffsetUsingSplitOR(SelectionDAG &DAG, SDValue Addr,
732	SDValue &N0, SDValue &N1) {
733	if (Addr.getValueType() == MVT::i64 && Addr.getOpcode() == ISD::BITCAST &&
734	Addr.getOperand(i: `0`).getOpcode() == ISD::BUILD_VECTOR) {
735	// As we split 64-bit `or` earlier, it's complicated pattern to match, i.e.
736	// (i64 (bitcast (v2i32 (build_vector
737	// (or (extract_vector_elt V, 0), OFFSET),
738	// (extract_vector_elt V, 1)))))
739	SDValue Lo = Addr.getOperand(i: `0`).getOperand(i: `0`);
740	if (Lo.getOpcode() == ISD::OR && DAG.isBaseWithConstantOffset(Op: Lo)) {
741	SDValue BaseLo = Lo.getOperand(i: `0`);
742	SDValue BaseHi = Addr.getOperand(i: `0`).getOperand(i: `1`);
743	// Check that split base (Lo and Hi) are extracted from the same one.
744	if (BaseLo.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
745	BaseHi.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
746	BaseLo.getOperand(i: `0`) == BaseHi.getOperand(i: `0`) &&
747	// Lo is statically extracted from index 0.
748	isa<ConstantSDNode>(Val: BaseLo.getOperand(i: `1`)) &&
749	BaseLo.getConstantOperandVal(i: `1`) == `0` &&
750	// Hi is statically extracted from index 0.
751	isa<ConstantSDNode>(Val: BaseHi.getOperand(i: `1`)) &&
752	BaseHi.getConstantOperandVal(i: `1`) == `1`) {
753	N0 = BaseLo.getOperand(i: `0`).getOperand(i: `0`);
754	N1 = Lo.getOperand(i: `1`);
755	return true;
756	}
757	}
758	}
759	return false;
760	}
761
762	bool AMDGPUDAGToDAGISel::isBaseWithConstantOffset64(SDValue Addr, SDValue &LHS,
763	SDValue &RHS) const {
764	if (CurDAG->isBaseWithConstantOffset(Op: Addr)) {
765	LHS = Addr.getOperand(i: `0`);
766	RHS = Addr.getOperand(i: `1`);
767	return true;
768	}
769
770	if (getBaseWithOffsetUsingSplitOR(DAG&: *CurDAG, Addr, N0&: LHS, N1&: RHS)) {
771	assert(LHS && RHS && isa<ConstantSDNode>(RHS));
772	return true;
773	}
774
775	return false;
776	}
777
778	StringRef AMDGPUDAGToDAGISelLegacy::getPassName() const {
779	return "AMDGPU DAG->DAG Pattern Instruction Selection";
780	}
781
782	AMDGPUISelDAGToDAGPass::AMDGPUISelDAGToDAGPass(TargetMachine &TM)
783	: SelectionDAGISelPass (
784	std::make_unique<AMDGPUDAGToDAGISel>(args&: TM, args: TM.getOptLevel())) {}
785
786	PreservedAnalyses
787	AMDGPUISelDAGToDAGPass::run(MachineFunction &MF,
788	MachineFunctionAnalysisManager &MFAM) {
789	#ifdef EXPENSIVE_CHECKS
790	auto &FAM = MFAM.getResult<FunctionAnalysisManagerMachineFunctionProxy>(MF)
791	.getManager();
792	auto &F = MF.getFunction();
793	DominatorTree &DT = FAM.getResult<DominatorTreeAnalysis>(F);
794	LoopInfo &LI = FAM.getResult<LoopAnalysis>(F);
795	for (auto &L : LI.getLoopsInPreorder())
796	assert(L->isLCSSAForm(DT) && "Loop is not in LCSSA form!");
797	#endif
798	return SelectionDAGISelPass::run(MF, MFAM);
799	}
800
801	//===----------------------------------------------------------------------===//
802	// Complex Patterns
803	//===----------------------------------------------------------------------===//
804
805	bool AMDGPUDAGToDAGISel::SelectADDRVTX_READ(SDValue Addr, SDValue &Base,
806	SDValue &Offset) {
807	return false;
808	}
809
810	bool AMDGPUDAGToDAGISel::SelectADDRIndirect(SDValue Addr, SDValue &Base,
811	SDValue &Offset) {
812	ConstantSDNode *C;
813	SDLoc DL(Addr);
814
815	if ((C = dyn_cast<ConstantSDNode>(Val&: Addr))) {
816	Base = CurDAG->getRegister(Reg: R600::INDIRECT_BASE_ADDR, VT: MVT::i32);
817	Offset = CurDAG->getTargetConstant(Val: C->getZExtValue(), DL, VT: MVT::i32);
818	} else if ((Addr.getOpcode() == AMDGPUISD::DWORDADDR) &&
819	(C = dyn_cast<ConstantSDNode>(Val: Addr.getOperand(i: `0`)))) {
820	Base = CurDAG->getRegister(Reg: R600::INDIRECT_BASE_ADDR, VT: MVT::i32);
821	Offset = CurDAG->getTargetConstant(Val: C->getZExtValue(), DL, VT: MVT::i32);
822	} else if ((Addr.getOpcode() == ISD::ADD \|\| Addr.getOpcode() == ISD::OR) &&
823	(C = dyn_cast<ConstantSDNode>(Val: Addr.getOperand(i: `1`)))) {
824	Base = Addr.getOperand(i: `0`);
825	Offset = CurDAG->getTargetConstant(Val: C->getZExtValue(), DL, VT: MVT::i32);
826	} else {
827	Base = Addr;
828	Offset = CurDAG->getTargetConstant(Val: `0`, DL, VT: MVT::i32);
829	}
830
831	return true;
832	}
833
834	SDValue AMDGPUDAGToDAGISel::getMaterializedScalarImm32(int64_t Val,
835	const SDLoc &DL) const {
836	SDNode *Mov = CurDAG->getMachineNode(
837	Opcode: AMDGPU::S_MOV_B32, dl: DL, VT: MVT::i32,
838	Op1: CurDAG->getTargetConstant(Val, DL, VT: MVT::i32));
839	return SDValue (Mov, `0`);
840	}
841
842	// FIXME: Should only handle uaddo_carry/usubo_carry
843	void AMDGPUDAGToDAGISel::SelectADD_SUB_I64(SDNode *N) {
844	SDLoc DL(N);
845	SDValue LHS = N->getOperand(Num: `0`);
846	SDValue RHS = N->getOperand(Num: `1`);
847
848	unsigned Opcode = N->getOpcode();
849	bool ConsumeCarry = (Opcode == ISD::ADDE \|\| Opcode == ISD::SUBE);
850	bool ProduceCarry =
851	ConsumeCarry \|\| Opcode == ISD::ADDC \|\| Opcode == ISD::SUBC;
852	bool IsAdd = Opcode == ISD::ADD \|\| Opcode == ISD::ADDC \|\| Opcode == ISD::ADDE;
853
854	SDValue Sub0 = CurDAG->getTargetConstant(Val: AMDGPU::sub0, DL, VT: MVT::i32);
855	SDValue Sub1 = CurDAG->getTargetConstant(Val: AMDGPU::sub1, DL, VT: MVT::i32);
856
857	SDNode *Lo0 = CurDAG->getMachineNode(Opcode: TargetOpcode::EXTRACT_SUBREG,
858	dl: DL, VT: MVT::i32, Op1: LHS, Op2: Sub0);
859	SDNode *Hi0 = CurDAG->getMachineNode(Opcode: TargetOpcode::EXTRACT_SUBREG,
860	dl: DL, VT: MVT::i32, Op1: LHS, Op2: Sub1);
861
862	SDNode *Lo1 = CurDAG->getMachineNode(Opcode: TargetOpcode::EXTRACT_SUBREG,
863	dl: DL, VT: MVT::i32, Op1: RHS, Op2: Sub0);
864	SDNode *Hi1 = CurDAG->getMachineNode(Opcode: TargetOpcode::EXTRACT_SUBREG,
865	dl: DL, VT: MVT::i32, Op1: RHS, Op2: Sub1);
866
867	SDVTList VTList = CurDAG->getVTList(VT1: MVT::i32, VT2: MVT::Glue);
868
869	static const unsigned OpcMap[`2`][`2`][`2`] = {
870	{{AMDGPU::S_SUB_U32, AMDGPU::S_ADD_U32},
871	{AMDGPU::V_SUB_CO_U32_e32, AMDGPU::V_ADD_CO_U32_e32}},
872	{{AMDGPU::S_SUBB_U32, AMDGPU::S_ADDC_U32},
873	{AMDGPU::V_SUBB_U32_e32, AMDGPU::V_ADDC_U32_e32}}};
874
875	unsigned Opc = OpcMap[`0`][N->isDivergent()][IsAdd];
876	unsigned CarryOpc = OpcMap[`1`][N->isDivergent()][IsAdd];
877
878	SDNode *AddLo;
879	if (!ConsumeCarry) {
880	SDValue Args[] = { SDValue (Lo0, `0`), SDValue (Lo1, `0`) };
881	AddLo = CurDAG->getMachineNode(Opcode: Opc, dl: DL, VTs: VTList, Ops: Args);
882	} else {
883	SDValue Args[] = { SDValue (Lo0, `0`), SDValue (Lo1, `0`), N->getOperand(Num: `2`) };
884	AddLo = CurDAG->getMachineNode(Opcode: CarryOpc, dl: DL, VTs: VTList, Ops: Args);
885	}
886	SDValue AddHiArgs[] = {
887	SDValue (Hi0, `0`),
888	SDValue (Hi1, `0`),
889	SDValue (AddLo, `1`)
890	};
891	SDNode *AddHi = CurDAG->getMachineNode(Opcode: CarryOpc, dl: DL, VTs: VTList, Ops: AddHiArgs);
892
893	SDValue RegSequenceArgs[] = {
894	CurDAG->getTargetConstant(Val: AMDGPU::SReg_64RegClassID, DL, VT: MVT::i32),
895	SDValue (AddLo,`0`),
896	Sub0,
897	SDValue (AddHi,`0`),
898	Sub1,
899	};
900	SDNode *RegSequence = CurDAG->getMachineNode(Opcode: AMDGPU::REG_SEQUENCE, dl: DL,
901	VT: MVT::i64, Ops: RegSequenceArgs);
902
903	if (ProduceCarry) {
904	// Replace the carry-use
905	ReplaceUses(F: SDValue (N, `1`), T: SDValue (AddHi, `1`));
906	}
907
908	// Replace the remaining uses.
909	ReplaceNode(F: N, T: RegSequence);
910	}
911
912	void AMDGPUDAGToDAGISel::SelectAddcSubb(SDNode *N) {
913	SDLoc DL(N);
914	SDValue LHS = N->getOperand(Num: `0`);
915	SDValue RHS = N->getOperand(Num: `1`);
916	SDValue CI = N->getOperand(Num: `2`);
917
918	if (N->isDivergent()) {
919	unsigned Opc = N->getOpcode() == ISD::UADDO_CARRY ? AMDGPU::V_ADDC_U32_e64
920	: AMDGPU::V_SUBB_U32_e64;
921	CurDAG->SelectNodeTo(
922	N, MachineOpc: Opc, VTs: N->getVTList(),
923	Ops: {LHS, RHS, CI,
924	CurDAG->getTargetConstant(Val: `0`, DL: {}, VT: MVT::i1) /clamp bit/});
925	} else {
926	unsigned Opc = N->getOpcode() == ISD::UADDO_CARRY ? AMDGPU::S_ADD_CO_PSEUDO
927	: AMDGPU::S_SUB_CO_PSEUDO;
928	CurDAG->SelectNodeTo(N, MachineOpc: Opc, VTs: N->getVTList(), Ops: {LHS, RHS, CI});
929	}
930	}
931
932	void AMDGPUDAGToDAGISel::SelectUADDO_USUBO(SDNode *N) {
933	// The name of the opcodes are misleading. v_add_i32/v_sub_i32 have unsigned
934	// carry out despite the _i32 name. These were renamed in VI to _U32.
935	// FIXME: We should probably rename the opcodes here.
936	bool IsAdd = N->getOpcode() == ISD::UADDO;
937	bool IsVALU = N->isDivergent();
938
939	for (SDNode::use_iterator UI = N->use_begin(), E = N->use_end(); UI != E;
940	++UI)
941	if (UI.getUse().getResNo() == `1`) {
942	if ((IsAdd && (UI ->getOpcode() != ISD::UADDO_CARRY)) \|\|
943	(!IsAdd && (UI ->getOpcode() != ISD::USUBO_CARRY))) {
944	IsVALU = true;
945	break;
946	}
947	}
948
949	if (IsVALU) {
950	unsigned Opc = IsAdd ? AMDGPU::V_ADD_CO_U32_e64 : AMDGPU::V_SUB_CO_U32_e64;
951
952	CurDAG->SelectNodeTo(
953	N, MachineOpc: Opc, VTs: N->getVTList(),
954	Ops: {N->getOperand(Num: `0`), N->getOperand(Num: `1`),
955	CurDAG->getTargetConstant(Val: `0`, DL: {}, VT: MVT::i1) /clamp bit/});
956	} else {
957	unsigned Opc = N->getOpcode() == ISD::UADDO ? AMDGPU::S_UADDO_PSEUDO
958	: AMDGPU::S_USUBO_PSEUDO;
959
960	CurDAG->SelectNodeTo(N, MachineOpc: Opc, VTs: N->getVTList(),
961	Ops: {N->getOperand(Num: `0`), N->getOperand(Num: `1`)});
962	}
963	}
964
965	void AMDGPUDAGToDAGISel::SelectFMA_W_CHAIN(SDNode *N) {
966	SDLoc SL(N);
967	// src0_modifiers, src0, src1_modifiers, src1, src2_modifiers, src2, clamp, omod
968	SDValue Ops[`10`];
969
970	SelectVOP3Mods0(In: N->getOperand(Num: `1`), Src&: Ops[`1`], SrcMods&: Ops[`0`], Clamp&: Ops[`6`], Omod&: Ops[`7`]);
971	SelectVOP3Mods(In: N->getOperand(Num: `2`), Src&: Ops[`3`], SrcMods&: Ops[`2`]);
972	SelectVOP3Mods(In: N->getOperand(Num: `3`), Src&: Ops[`5`], SrcMods&: Ops[`4`]);
973	Ops[`8`] = N->getOperand(Num: `0`);
974	Ops[`9`] = N->getOperand(Num: `4`);
975
976	// If there are no source modifiers, prefer fmac over fma because it can use
977	// the smaller VOP2 encoding.
978	bool UseFMAC = Subtarget->hasDLInsts() &&
979	cast<ConstantSDNode>(Val&: Ops[`0`])->isZero() &&
980	cast<ConstantSDNode>(Val&: Ops[`2`])->isZero() &&
981	cast<ConstantSDNode>(Val&: Ops[`4`])->isZero();
982	unsigned Opcode = UseFMAC ? AMDGPU::V_FMAC_F32_e64 : AMDGPU::V_FMA_F32_e64;
983	CurDAG->SelectNodeTo(N, MachineOpc: Opcode, VTs: N->getVTList(), Ops);
984	}
985
986	void AMDGPUDAGToDAGISel::SelectFMUL_W_CHAIN(SDNode *N) {
987	SDLoc SL(N);
988	// src0_modifiers, src0, src1_modifiers, src1, clamp, omod
989	SDValue Ops[`8`];
990
991	SelectVOP3Mods0(In: N->getOperand(Num: `1`), Src&: Ops[`1`], SrcMods&: Ops[`0`], Clamp&: Ops[`4`], Omod&: Ops[`5`]);
992	SelectVOP3Mods(In: N->getOperand(Num: `2`), Src&: Ops[`3`], SrcMods&: Ops[`2`]);
993	Ops[`6`] = N->getOperand(Num: `0`);
994	Ops[`7`] = N->getOperand(Num: `3`);
995
996	CurDAG->SelectNodeTo(N, MachineOpc: AMDGPU::V_MUL_F32_e64, VTs: N->getVTList(), Ops);
997	}
998
999	// We need to handle this here because tablegen doesn't support matching
1000	// instructions with multiple outputs.
1001	void AMDGPUDAGToDAGISel::SelectDIV_SCALE(SDNode *N) {
1002	SDLoc SL(N);
1003	EVT VT = N->getValueType(ResNo: `0`);
1004
1005	assert(VT == MVT::f32 \|\| VT == MVT::f64);
1006
1007	unsigned Opc
1008	= (VT == MVT::f64) ? AMDGPU::V_DIV_SCALE_F64_e64 : AMDGPU::V_DIV_SCALE_F32_e64;
1009
1010	// src0_modifiers, src0, src1_modifiers, src1, src2_modifiers, src2, clamp,
1011	// omod
1012	SDValue Ops[`8`];
1013	SelectVOP3BMods0(In: N->getOperand(Num: `0`), Src&: Ops[`1`], SrcMods&: Ops[`0`], Clamp&: Ops[`6`], Omod&: Ops[`7`]);
1014	SelectVOP3BMods(In: N->getOperand(Num: `1`), Src&: Ops[`3`], SrcMods&: Ops[`2`]);
1015	SelectVOP3BMods(In: N->getOperand(Num: `2`), Src&: Ops[`5`], SrcMods&: Ops[`4`]);
1016	CurDAG->SelectNodeTo(N, MachineOpc: Opc, VTs: N->getVTList(), Ops);
1017	}
1018
1019	// We need to handle this here because tablegen doesn't support matching
1020	// instructions with multiple outputs.
1021	void AMDGPUDAGToDAGISel::SelectMAD_64_32(SDNode *N) {
1022	SDLoc SL(N);
1023	bool Signed = N->getOpcode() == AMDGPUISD::MAD_I64_I32;
1024	unsigned Opc;
1025	if (Subtarget->hasMADIntraFwdBug())
1026	Opc = Signed ? AMDGPU::V_MAD_I64_I32_gfx11_e64
1027	: AMDGPU::V_MAD_U64_U32_gfx11_e64;
1028	else
1029	Opc = Signed ? AMDGPU::V_MAD_I64_I32_e64 : AMDGPU::V_MAD_U64_U32_e64;
1030
1031	SDValue Clamp = CurDAG->getTargetConstant(Val: `0`, DL: SL, VT: MVT::i1);
1032	SDValue Ops[] = { N->getOperand(Num: `0`), N->getOperand(Num: `1`), N->getOperand(Num: `2`),
1033	Clamp };
1034	CurDAG->SelectNodeTo(N, MachineOpc: Opc, VTs: N->getVTList(), Ops);
1035	}
1036
1037	// We need to handle this here because tablegen doesn't support matching
1038	// instructions with multiple outputs.
1039	void AMDGPUDAGToDAGISel::SelectMUL_LOHI(SDNode *N) {
1040	SDLoc SL(N);
1041	bool Signed = N->getOpcode() == ISD::SMUL_LOHI;
1042	unsigned Opc;
1043	if (Subtarget->hasMADIntraFwdBug())
1044	Opc = Signed ? AMDGPU::V_MAD_I64_I32_gfx11_e64
1045	: AMDGPU::V_MAD_U64_U32_gfx11_e64;
1046	else
1047	Opc = Signed ? AMDGPU::V_MAD_I64_I32_e64 : AMDGPU::V_MAD_U64_U32_e64;
1048
1049	SDValue Zero = CurDAG->getTargetConstant(Val: `0`, DL: SL, VT: MVT::i64);
1050	SDValue Clamp = CurDAG->getTargetConstant(Val: `0`, DL: SL, VT: MVT::i1);
1051	SDValue Ops[] = {N->getOperand(Num: `0`), N->getOperand(Num: `1`), Zero, Clamp};
1052	SDNode *Mad = CurDAG->getMachineNode(Opcode: Opc, dl: SL, VTs: N->getVTList(), Ops);
1053	if (!SDValue (N, `0`).use_empty()) {
1054	SDValue Sub0 = CurDAG->getTargetConstant(Val: AMDGPU::sub0, DL: SL, VT: MVT::i32);
1055	SDNode *Lo = CurDAG->getMachineNode(Opcode: TargetOpcode::EXTRACT_SUBREG, dl: SL,
1056	VT: MVT::i32, Op1: SDValue (Mad, `0`), Op2: Sub0);
1057	ReplaceUses(F: SDValue (N, `0`), T: SDValue (Lo, `0`));
1058	}
1059	if (!SDValue (N, `1`).use_empty()) {
1060	SDValue Sub1 = CurDAG->getTargetConstant(Val: AMDGPU::sub1, DL: SL, VT: MVT::i32);
1061	SDNode *Hi = CurDAG->getMachineNode(Opcode: TargetOpcode::EXTRACT_SUBREG, dl: SL,
1062	VT: MVT::i32, Op1: SDValue (Mad, `0`), Op2: Sub1);
1063	ReplaceUses(F: SDValue (N, `1`), T: SDValue (Hi, `0`));
1064	}
1065	CurDAG->RemoveDeadNode(N);
1066	}
1067
1068	bool AMDGPUDAGToDAGISel::isDSOffsetLegal(SDValue Base, unsigned Offset) const {
1069	if (!isUInt<`16`>(x: Offset))
1070	return false;
1071
1072	if (!Base \|\| Subtarget->hasUsableDSOffset() \|\|
1073	Subtarget->unsafeDSOffsetFoldingEnabled())
1074	return true;
1075
1076	// On Southern Islands instruction with a negative base value and an offset
1077	// don't seem to work.
1078	return CurDAG->SignBitIsZero(Op: Base);
1079	}
1080
1081	bool AMDGPUDAGToDAGISel::SelectDS1Addr1Offset(SDValue Addr, SDValue &Base,
1082	SDValue &Offset) const {
1083	SDLoc DL(Addr);
1084	if (CurDAG->isBaseWithConstantOffset(Op: Addr)) {
1085	SDValue N0 = Addr.getOperand(i: `0`);
1086	SDValue N1 = Addr.getOperand(i: `1`);
1087	ConstantSDNode *C1 = cast<ConstantSDNode>(Val&: N1);
1088	if (isDSOffsetLegal(Base: N0, Offset: C1->getSExtValue())) {
1089	// (add n0, c0)
1090	Base = N0;
1091	Offset = CurDAG->getTargetConstant(Val: C1->getZExtValue(), DL, VT: MVT::i16);
1092	return true;
1093	}
1094	} else if (Addr.getOpcode() == ISD::SUB) {
1095	// sub C, x -> add (sub 0, x), C
1096	if (const ConstantSDNode *C = dyn_cast<ConstantSDNode>(Val: Addr.getOperand(i: `0`))) {
1097	int64_t ByteOffset = C->getSExtValue();
1098	if (isDSOffsetLegal(Base: SDValue (), Offset: ByteOffset)) {
1099	SDValue Zero = CurDAG->getTargetConstant(Val: `0`, DL, VT: MVT::i32);
1100
1101	// XXX - This is kind of hacky. Create a dummy sub node so we can check
1102	// the known bits in isDSOffsetLegal. We need to emit the selected node
1103	// here, so this is thrown away.
1104	SDValue Sub = CurDAG->getNode(Opcode: ISD::SUB, DL, VT: MVT::i32,
1105	N1: Zero, N2: Addr.getOperand(i: `1`));
1106
1107	if (isDSOffsetLegal(Base: Sub, Offset: ByteOffset)) {
1108	SmallVector<SDValue, `3`> Opnds;
1109	Opnds.push_back(Elt: Zero);
1110	Opnds.push_back(Elt: Addr.getOperand(i: `1`));
1111
1112	// FIXME: Select to VOP3 version for with-carry.
1113	unsigned SubOp = AMDGPU::V_SUB_CO_U32_e32;
1114	if (Subtarget->hasAddNoCarry()) {
1115	SubOp = AMDGPU::V_SUB_U32_e64;
1116	Opnds.push_back(
1117	Elt: CurDAG->getTargetConstant(Val: `0`, DL: {}, VT: MVT::i1)); // clamp bit
1118	}
1119
1120	MachineSDNode *MachineSub =
1121	CurDAG->getMachineNode(Opcode: SubOp, dl: DL, VT: MVT::i32, Ops: Opnds);
1122
1123	Base = SDValue (MachineSub, `0`);
1124	Offset = CurDAG->getTargetConstant(Val: ByteOffset, DL, VT: MVT::i16);
1125	return true;
1126	}
1127	}
1128	}
1129	} else if (const ConstantSDNode *CAddr = dyn_cast<ConstantSDNode>(Val&: Addr)) {
1130	// If we have a constant address, prefer to put the constant into the
1131	// offset. This can save moves to load the constant address since multiple
1132	// operations can share the zero base address register, and enables merging
1133	// into read2 / write2 instructions.
1134
1135	SDLoc DL(Addr);
1136
1137	if (isDSOffsetLegal(Base: SDValue (), Offset: CAddr->getZExtValue())) {
1138	SDValue Zero = CurDAG->getTargetConstant(Val: `0`, DL, VT: MVT::i32);
1139	MachineSDNode *MovZero = CurDAG->getMachineNode(Opcode: AMDGPU::V_MOV_B32_e32,
1140	dl: DL, VT: MVT::i32, Op1: Zero);
1141	Base = SDValue (MovZero, `0`);
1142	Offset = CurDAG->getTargetConstant(Val: CAddr->getZExtValue(), DL, VT: MVT::i16);
1143	return true;
1144	}
1145	}
1146
1147	// default case
1148	Base = Addr;
1149	Offset = CurDAG->getTargetConstant(Val: `0`, DL: SDLoc (Addr), VT: MVT::i16);
1150	return true;
1151	}
1152
1153	bool AMDGPUDAGToDAGISel::isDSOffset2Legal(SDValue Base, unsigned Offset0,
1154	unsigned Offset1,
1155	unsigned Size) const {
1156	if (Offset0 % Size != `0` \|\| Offset1 % Size != `0`)
1157	return false;
1158	if (!isUInt<`8`>(x: Offset0 / Size) \|\| !isUInt<`8`>(x: Offset1 / Size))
1159	return false;
1160
1161	if (!Base \|\| Subtarget->hasUsableDSOffset() \|\|
1162	Subtarget->unsafeDSOffsetFoldingEnabled())
1163	return true;
1164
1165	// On Southern Islands instruction with a negative base value and an offset
1166	// don't seem to work.
1167	return CurDAG->SignBitIsZero(Op: Base);
1168	}
1169
1170	// Return whether the operation has NoUnsignedWrap property.
1171	static bool isNoUnsignedWrap(SDValue Addr) {
1172	return (Addr.getOpcode() == ISD::ADD &&
1173	Addr ->getFlags().hasNoUnsignedWrap()) \|\|
1174	Addr ->getOpcode() == ISD::OR;
1175	}
1176
1177	// Check that the base address of flat scratch load/store in the form of `base +
1178	// offset` is legal to be put in SGPR/VGPR (i.e. unsigned per hardware
1179	// requirement). We always treat the first operand as the base address here.
1180	bool AMDGPUDAGToDAGISel::isFlatScratchBaseLegal(SDValue Addr) const {
1181	if (isNoUnsignedWrap(Addr))
1182	return true;
1183
1184	// Starting with GFX12, VADDR and SADDR fields in VSCRATCH can use negative
1185	// values.
1186	if (Subtarget->hasSignedScratchOffsets())
1187	return true;
1188
1189	auto LHS = Addr.getOperand(i: `0`);
1190	auto RHS = Addr.getOperand(i: `1`);
1191
1192	// If the immediate offset is negative and within certain range, the base
1193	// address cannot also be negative. If the base is also negative, the sum
1194	// would be either negative or much larger than the valid range of scratch
1195	// memory a thread can access.
1196	ConstantSDNode ImmOp = nullptr*;
1197	if (Addr.getOpcode() == ISD::ADD && (ImmOp = dyn_cast<ConstantSDNode>(Val&: RHS))) {
1198	if (ImmOp->getSExtValue() < `0` && ImmOp->getSExtValue() > -`0x40000000`)
1199	return true;
1200	}
1201
1202	return CurDAG->SignBitIsZero(Op: LHS);
1203	}
1204
1205	// Check address value in SGPR/VGPR are legal for flat scratch in the form
1206	// of: SGPR + VGPR.
1207	bool AMDGPUDAGToDAGISel::isFlatScratchBaseLegalSV(SDValue Addr) const {
1208	if (isNoUnsignedWrap(Addr))
1209	return true;
1210
1211	// Starting with GFX12, VADDR and SADDR fields in VSCRATCH can use negative
1212	// values.
1213	if (Subtarget->hasSignedScratchOffsets())
1214	return true;
1215
1216	auto LHS = Addr.getOperand(i: `0`);
1217	auto RHS = Addr.getOperand(i: `1`);
1218	return CurDAG->SignBitIsZero(Op: RHS) && CurDAG->SignBitIsZero(Op: LHS);
1219	}
1220
1221	// Check address value in SGPR/VGPR are legal for flat scratch in the form
1222	// of: SGPR + VGPR + Imm.
1223	bool AMDGPUDAGToDAGISel::isFlatScratchBaseLegalSVImm(SDValue Addr) const {
1224	// Starting with GFX12, VADDR and SADDR fields in VSCRATCH can use negative
1225	// values.
1226	if (AMDGPU::isGFX12Plus(STI: *Subtarget))
1227	return true;
1228
1229	auto Base = Addr.getOperand(i: `0`);
1230	auto *RHSImm = cast<ConstantSDNode>(Val: Addr.getOperand(i: `1`));
1231	// If the immediate offset is negative and within certain range, the base
1232	// address cannot also be negative. If the base is also negative, the sum
1233	// would be either negative or much larger than the valid range of scratch
1234	// memory a thread can access.
1235	if (isNoUnsignedWrap(Addr: Base) &&
1236	(isNoUnsignedWrap(Addr) \|\|
1237	(RHSImm->getSExtValue() < `0` && RHSImm->getSExtValue() > -`0x40000000`)))
1238	return true;
1239
1240	auto LHS = Base.getOperand(i: `0`);
1241	auto RHS = Base.getOperand(i: `1`);
1242	return CurDAG->SignBitIsZero(Op: RHS) && CurDAG->SignBitIsZero(Op: LHS);
1243	}
1244
1245	// TODO: If offset is too big, put low 16-bit into offset.
1246	bool AMDGPUDAGToDAGISel::SelectDS64Bit4ByteAligned(SDValue Addr, SDValue &Base,
1247	SDValue &Offset0,
1248	SDValue &Offset1) const {
1249	return SelectDSReadWrite2(Ptr: Addr, Base, Offset0, Offset1, Size: `4`);
1250	}
1251
1252	bool AMDGPUDAGToDAGISel::SelectDS128Bit8ByteAligned(SDValue Addr, SDValue &Base,
1253	SDValue &Offset0,
1254	SDValue &Offset1) const {
1255	return SelectDSReadWrite2(Ptr: Addr, Base, Offset0, Offset1, Size: `8`);
1256	}
1257
1258	bool AMDGPUDAGToDAGISel::SelectDSReadWrite2(SDValue Addr, SDValue &Base,
1259	SDValue &Offset0, SDValue &Offset1,
1260	unsigned Size) const {
1261	SDLoc DL(Addr);
1262
1263	if (CurDAG->isBaseWithConstantOffset(Op: Addr)) {
1264	SDValue N0 = Addr.getOperand(i: `0`);
1265	SDValue N1 = Addr.getOperand(i: `1`);
1266	ConstantSDNode *C1 = cast<ConstantSDNode>(Val&: N1);
1267	unsigned OffsetValue0 = C1->getZExtValue();
1268	unsigned OffsetValue1 = OffsetValue0 + Size;
1269
1270	// (add n0, c0)
1271	if (isDSOffset2Legal(Base: N0, Offset0: OffsetValue0, Offset1: OffsetValue1, Size)) {
1272	Base = N0;
1273	Offset0 = CurDAG->getTargetConstant(Val: OffsetValue0 / Size, DL, VT: MVT::i8);
1274	Offset1 = CurDAG->getTargetConstant(Val: OffsetValue1 / Size, DL, VT: MVT::i8);
1275	return true;
1276	}
1277	} else if (Addr.getOpcode() == ISD::SUB) {
1278	// sub C, x -> add (sub 0, x), C
1279	if (const ConstantSDNode *C =
1280	dyn_cast<ConstantSDNode>(Val: Addr.getOperand(i: `0`))) {
1281	unsigned OffsetValue0 = C->getZExtValue();
1282	unsigned OffsetValue1 = OffsetValue0 + Size;
1283
1284	if (isDSOffset2Legal(Base: SDValue (), Offset0: OffsetValue0, Offset1: OffsetValue1, Size)) {
1285	SDLoc DL(Addr);
1286	SDValue Zero = CurDAG->getTargetConstant(Val: `0`, DL, VT: MVT::i32);
1287
1288	// XXX - This is kind of hacky. Create a dummy sub node so we can check
1289	// the known bits in isDSOffsetLegal. We need to emit the selected node
1290	// here, so this is thrown away.
1291	SDValue Sub =
1292	CurDAG->getNode(Opcode: ISD::SUB, DL, VT: MVT::i32, N1: Zero, N2: Addr.getOperand(i: `1`));
1293
1294	if (isDSOffset2Legal(Base: Sub, Offset0: OffsetValue0, Offset1: OffsetValue1, Size)) {
1295	SmallVector<SDValue, `3`> Opnds;
1296	Opnds.push_back(Elt: Zero);
1297	Opnds.push_back(Elt: Addr.getOperand(i: `1`));
1298	unsigned SubOp = AMDGPU::V_SUB_CO_U32_e32;
1299	if (Subtarget->hasAddNoCarry()) {
1300	SubOp = AMDGPU::V_SUB_U32_e64;
1301	Opnds.push_back(
1302	Elt: CurDAG->getTargetConstant(Val: `0`, DL: {}, VT: MVT::i1)); // clamp bit
1303	}
1304
1305	MachineSDNode *MachineSub = CurDAG->getMachineNode(
1306	Opcode: SubOp, dl: DL, VT: MVT::getIntegerVT(BitWidth: Size * `8`), Ops: Opnds);
1307
1308	Base = SDValue (MachineSub, `0`);
1309	Offset0 = CurDAG->getTargetConstant(Val: OffsetValue0 / Size, DL, VT: MVT::i8);
1310	Offset1 = CurDAG->getTargetConstant(Val: OffsetValue1 / Size, DL, VT: MVT::i8);
1311	return true;
1312	}
1313	}
1314	}
1315	} else if (const ConstantSDNode *CAddr = dyn_cast<ConstantSDNode>(Val&: Addr)) {
1316	unsigned OffsetValue0 = CAddr->getZExtValue();
1317	unsigned OffsetValue1 = OffsetValue0 + Size;
1318
1319	if (isDSOffset2Legal(Base: SDValue (), Offset0: OffsetValue0, Offset1: OffsetValue1, Size)) {
1320	SDValue Zero = CurDAG->getTargetConstant(Val: `0`, DL, VT: MVT::i32);
1321	MachineSDNode *MovZero =
1322	CurDAG->getMachineNode(Opcode: AMDGPU::V_MOV_B32_e32, dl: DL, VT: MVT::i32, Op1: Zero);
1323	Base = SDValue (MovZero, `0`);
1324	Offset0 = CurDAG->getTargetConstant(Val: OffsetValue0 / Size, DL, VT: MVT::i8);
1325	Offset1 = CurDAG->getTargetConstant(Val: OffsetValue1 / Size, DL, VT: MVT::i8);
1326	return true;
1327	}
1328	}
1329
1330	// default case
1331
1332	Base = Addr;
1333	Offset0 = CurDAG->getTargetConstant(Val: `0`, DL, VT: MVT::i8);
1334	Offset1 = CurDAG->getTargetConstant(Val: `1`, DL, VT: MVT::i8);
1335	return true;
1336	}
1337
1338	bool AMDGPUDAGToDAGISel::SelectMUBUF(SDValue Addr, SDValue &Ptr, SDValue &VAddr,
1339	SDValue &SOffset, SDValue &Offset,
1340	SDValue &Offen, SDValue &Idxen,
1341	SDValue &Addr64) const {
1342	// Subtarget prefers to use flat instruction
1343	// FIXME: This should be a pattern predicate and not reach here
1344	if (Subtarget->useFlatForGlobal())
1345	return false;
1346
1347	SDLoc DL(Addr);
1348
1349	Idxen = CurDAG->getTargetConstant(Val: `0`, DL, VT: MVT::i1);
1350	Offen = CurDAG->getTargetConstant(Val: `0`, DL, VT: MVT::i1);
1351	Addr64 = CurDAG->getTargetConstant(Val: `0`, DL, VT: MVT::i1);
1352	SOffset = Subtarget->hasRestrictedSOffset()
1353	? CurDAG->getRegister(Reg: AMDGPU::SGPR_NULL, VT: MVT::i32)
1354	: CurDAG->getTargetConstant(Val: `0`, DL, VT: MVT::i32);
1355
1356	ConstantSDNode C1 = nullptr*;
1357	SDValue N0 = Addr;
1358	if (CurDAG->isBaseWithConstantOffset(Op: Addr)) {
1359	C1 = cast<ConstantSDNode>(Val: Addr.getOperand(i: `1`));
1360	if (isUInt<`32`>(x: C1->getZExtValue()))
1361	N0 = Addr.getOperand(i: `0`);
1362	else
1363	C1 = nullptr;
1364	}
1365
1366	if (N0.getOpcode() == ISD::ADD) {
1367	// (add N2, N3) -> addr64, or
1368	// (add (add N2, N3), C1) -> addr64
1369	SDValue N2 = N0.getOperand(i: `0`);
1370	SDValue N3 = N0.getOperand(i: `1`);
1371	Addr64 = CurDAG->getTargetConstant(Val: `1`, DL, VT: MVT::i1);
1372
1373	if (N2 ->isDivergent()) {
1374	if (N3 ->isDivergent()) {
1375	// Both N2 and N3 are divergent. Use N0 (the result of the add) as the
1376	// addr64, and construct the resource from a 0 address.
1377	Ptr = SDValue (buildSMovImm64(DL, Imm: `0`, VT: MVT::v2i32), `0`);
1378	VAddr = N0;
1379	} else {
1380	// N2 is divergent, N3 is not.
1381	Ptr = N3;
1382	VAddr = N2;
1383	}
1384	} else {
1385	// N2 is not divergent.
1386	Ptr = N2;
1387	VAddr = N3;
1388	}
1389	Offset = CurDAG->getTargetConstant(Val: `0`, DL, VT: MVT::i32);
1390	} else if (N0 ->isDivergent()) {
1391	// N0 is divergent. Use it as the addr64, and construct the resource from a
1392	// 0 address.
1393	Ptr = SDValue (buildSMovImm64(DL, Imm: `0`, VT: MVT::v2i32), `0`);
1394	VAddr = N0;
1395	Addr64 = CurDAG->getTargetConstant(Val: `1`, DL, VT: MVT::i1);
1396	} else {
1397	// N0 -> offset, or
1398	// (N0 + C1) -> offset
1399	VAddr = CurDAG->getTargetConstant(Val: `0`, DL, VT: MVT::i32);
1400	Ptr = N0;
1401	}
1402
1403	if (!C1) {
1404	// No offset.
1405	Offset = CurDAG->getTargetConstant(Val: `0`, DL, VT: MVT::i32);
1406	return true;
1407	}
1408
1409	const SIInstrInfo *TII = Subtarget->getInstrInfo();
1410	if (TII->isLegalMUBUFImmOffset(Imm: C1->getZExtValue())) {
1411	// Legal offset for instruction.
1412	Offset = CurDAG->getTargetConstant(Val: C1->getZExtValue(), DL, VT: MVT::i32);
1413	return true;
1414	}
1415
1416	// Illegal offset, store it in soffset.
1417	Offset = CurDAG->getTargetConstant(Val: `0`, DL, VT: MVT::i32);
1418	SOffset =
1419	SDValue (CurDAG->getMachineNode(
1420	Opcode: AMDGPU::S_MOV_B32, dl: DL, VT: MVT::i32,
1421	Op1: CurDAG->getTargetConstant(Val: C1->getZExtValue(), DL, VT: MVT::i32)),
1422	`0`);
1423	return true;
1424	}
1425
1426	bool AMDGPUDAGToDAGISel::SelectMUBUFAddr64(SDValue Addr, SDValue &SRsrc,
1427	SDValue &VAddr, SDValue &SOffset,
1428	SDValue &Offset) const {
1429	SDValue Ptr, Offen, Idxen, Addr64;
1430
1431	// addr64 bit was removed for volcanic islands.
1432	// FIXME: This should be a pattern predicate and not reach here
1433	if (!Subtarget->hasAddr64())
1434	return false;
1435
1436	if (!SelectMUBUF(Addr, Ptr, VAddr, SOffset, Offset, Offen, Idxen, Addr64))
1437	return false;
1438
1439	ConstantSDNode *C = cast<ConstantSDNode>(Val&: Addr64);
1440	if (C->getSExtValue()) {
1441	SDLoc DL(Addr);
1442
1443	const SITargetLowering& Lowering =
1444	*static_cast<const SITargetLowering*>(getTargetLowering());
1445
1446	SRsrc = SDValue (Lowering.wrapAddr64Rsrc(DAG&: *CurDAG, DL, Ptr), `0`);
1447	return true;
1448	}
1449
1450	return false;
1451	}
1452
1453	std::pair<SDValue, SDValue> AMDGPUDAGToDAGISel::foldFrameIndex(SDValue N) const {
1454	SDLoc DL(N);
1455
1456	auto *FI = dyn_cast<FrameIndexSDNode>(Val&: N);
1457	SDValue TFI =
1458	FI ? CurDAG->getTargetFrameIndex(FI: FI->getIndex(), VT: FI->getValueType(ResNo: `0`)) : N;
1459
1460	// We rebase the base address into an absolute stack address and hence
1461	// use constant 0 for soffset. This value must be retained until
1462	// frame elimination and eliminateFrameIndex will choose the appropriate
1463	// frame register if need be.
1464	return std::pair(TFI, CurDAG->getTargetConstant(Val: `0`, DL, VT: MVT::i32));
1465	}
1466
1467	bool AMDGPUDAGToDAGISel::SelectMUBUFScratchOffen(SDNode *Parent,
1468	SDValue Addr, SDValue &Rsrc,
1469	SDValue &VAddr, SDValue &SOffset,
1470	SDValue &ImmOffset) const {
1471
1472	SDLoc DL(Addr);
1473	MachineFunction &MF = CurDAG->getMachineFunction();
1474	const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
1475
1476	Rsrc = CurDAG->getRegister(Reg: Info->getScratchRSrcReg(), VT: MVT::v4i32);
1477
1478	if (ConstantSDNode *CAddr = dyn_cast<ConstantSDNode>(Val&: Addr)) {
1479	int64_t Imm = CAddr->getSExtValue();
1480	const int64_t NullPtr =
1481	AMDGPUTargetMachine::getNullPointerValue(AddrSpace: AMDGPUAS::PRIVATE_ADDRESS);
1482	// Don't fold null pointer.
1483	if (Imm != NullPtr) {
1484	const uint32_t MaxOffset = SIInstrInfo::getMaxMUBUFImmOffset(ST: *Subtarget);
1485	SDValue HighBits =
1486	CurDAG->getTargetConstant(Val: Imm & ~MaxOffset, DL, VT: MVT::i32);
1487	MachineSDNode *MovHighBits = CurDAG->getMachineNode(
1488	Opcode: AMDGPU::V_MOV_B32_e32, dl: DL, VT: MVT::i32, Op1: HighBits);
1489	VAddr = SDValue (MovHighBits, `0`);
1490
1491	SOffset = CurDAG->getTargetConstant(Val: `0`, DL, VT: MVT::i32);
1492	ImmOffset = CurDAG->getTargetConstant(Val: Imm & MaxOffset, DL, VT: MVT::i32);
1493	return true;
1494	}
1495	}
1496
1497	if (CurDAG->isBaseWithConstantOffset(Op: Addr)) {
1498	// (add n0, c1)
1499
1500	SDValue N0 = Addr.getOperand(i: `0`);
1501	uint64_t C1 = Addr.getConstantOperandVal(i: `1`);
1502
1503	// Offsets in vaddr must be positive if range checking is enabled.
1504	//
1505	// The total computation of vaddr + soffset + offset must not overflow. If
1506	// vaddr is negative, even if offset is 0 the sgpr offset add will end up
1507	// overflowing.
1508	//
1509	// Prior to gfx9, MUBUF instructions with the vaddr offset enabled would
1510	// always perform a range check. If a negative vaddr base index was used,
1511	// this would fail the range check. The overall address computation would
1512	// compute a valid address, but this doesn't happen due to the range
1513	// check. For out-of-bounds MUBUF loads, a 0 is returned.
1514	//
1515	// Therefore it should be safe to fold any VGPR offset on gfx9 into the
1516	// MUBUF vaddr, but not on older subtargets which can only do this if the
1517	// sign bit is known 0.
1518	const SIInstrInfo *TII = Subtarget->getInstrInfo();
1519	if (TII->isLegalMUBUFImmOffset(Imm: C1) &&
1520	(!Subtarget->privateMemoryResourceIsRangeChecked() \|\|
1521	CurDAG->SignBitIsZero(Op: N0))) {
1522	std::tie(args&: VAddr, args&: SOffset) = foldFrameIndex(N: N0);
1523	ImmOffset = CurDAG->getTargetConstant(Val: C1, DL, VT: MVT::i32);
1524	return true;
1525	}
1526	}
1527
1528	// (node)
1529	std::tie(args&: VAddr, args&: SOffset) = foldFrameIndex(N: Addr);
1530	ImmOffset = CurDAG->getTargetConstant(Val: `0`, DL, VT: MVT::i32);
1531	return true;
1532	}
1533
1534	static bool IsCopyFromSGPR(const SIRegisterInfo &TRI, SDValue Val) {
1535	if (Val.getOpcode() != ISD::CopyFromReg)
1536	return false;
1537	auto Reg = cast<RegisterSDNode>(Val: Val.getOperand(i: `1`))->getReg();
1538	if (!Reg.isPhysical())
1539	return false;
1540	auto RC = TRI.getPhysRegBaseClass(Reg);
1541	return RC && TRI.isSGPRClass(RC);
1542	}
1543
1544	bool AMDGPUDAGToDAGISel::SelectMUBUFScratchOffset(SDNode *Parent,
1545	SDValue Addr,
1546	SDValue &SRsrc,
1547	SDValue &SOffset,
1548	SDValue &Offset) const {
1549	const SIRegisterInfo *TRI =
1550	static_cast<const SIRegisterInfo *>(Subtarget->getRegisterInfo());
1551	const SIInstrInfo *TII = Subtarget->getInstrInfo();
1552	MachineFunction &MF = CurDAG->getMachineFunction();
1553	const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
1554	SDLoc DL(Addr);
1555
1556	// CopyFromReg <sgpr>
1557	if (IsCopyFromSGPR(TRI: *TRI, Val: Addr)) {
1558	SRsrc = CurDAG->getRegister(Reg: Info->getScratchRSrcReg(), VT: MVT::v4i32);
1559	SOffset = Addr;
1560	Offset = CurDAG->getTargetConstant(Val: `0`, DL, VT: MVT::i32);
1561	return true;
1562	}
1563
1564	ConstantSDNode *CAddr;
1565	if (Addr.getOpcode() == ISD::ADD) {
1566	// Add (CopyFromReg <sgpr>) <constant>
1567	CAddr = dyn_cast<ConstantSDNode>(Val: Addr.getOperand(i: `1`));
1568	if (!CAddr \|\| !TII->isLegalMUBUFImmOffset(Imm: CAddr->getZExtValue()))
1569	return false;
1570	if (!IsCopyFromSGPR(TRI: *TRI, Val: Addr.getOperand(i: `0`)))
1571	return false;
1572
1573	SOffset = Addr.getOperand(i: `0`);
1574	} else if ((CAddr = dyn_cast<ConstantSDNode>(Val&: Addr)) &&
1575	TII->isLegalMUBUFImmOffset(Imm: CAddr->getZExtValue())) {
1576	// <constant>
1577	SOffset = CurDAG->getTargetConstant(Val: `0`, DL, VT: MVT::i32);
1578	} else {
1579	return false;
1580	}
1581
1582	SRsrc = CurDAG->getRegister(Reg: Info->getScratchRSrcReg(), VT: MVT::v4i32);
1583
1584	Offset = CurDAG->getTargetConstant(Val: CAddr->getZExtValue(), DL, VT: MVT::i32);
1585	return true;
1586	}
1587
1588	bool AMDGPUDAGToDAGISel::SelectMUBUFOffset(SDValue Addr, SDValue &SRsrc,
1589	SDValue &SOffset, SDValue &Offset
1590	) const {
1591	SDValue Ptr, VAddr, Offen, Idxen, Addr64;
1592	const SIInstrInfo *TII = Subtarget->getInstrInfo();
1593
1594	if (!SelectMUBUF(Addr, Ptr, VAddr, SOffset, Offset, Offen, Idxen, Addr64))
1595	return false;
1596
1597	if (!cast<ConstantSDNode>(Val&: Offen)->getSExtValue() &&
1598	!cast<ConstantSDNode>(Val&: Idxen)->getSExtValue() &&
1599	!cast<ConstantSDNode>(Val&: Addr64)->getSExtValue()) {
1600	uint64_t Rsrc = TII->getDefaultRsrcDataFormat() \|
1601	APInt::getAllOnes(numBits: `32`).getZExtValue(); // Size
1602	SDLoc DL(Addr);
1603
1604	const SITargetLowering& Lowering =
1605	*static_cast<const SITargetLowering*>(getTargetLowering());
1606
1607	SRsrc = SDValue (Lowering.buildRSRC(DAG&: *CurDAG, DL, Ptr, RsrcDword1: `0`, RsrcDword2And3: Rsrc), `0`);
1608	return true;
1609	}
1610	return false;
1611	}
1612
1613	bool AMDGPUDAGToDAGISel::SelectBUFSOffset(SDValue ByteOffsetNode,
1614	SDValue &SOffset) const {
1615	if (Subtarget->hasRestrictedSOffset() && isNullConstant(V: ByteOffsetNode)) {
1616	SOffset = CurDAG->getRegister(Reg: AMDGPU::SGPR_NULL, VT: MVT::i32);
1617	return true;
1618	}
1619
1620	SOffset = ByteOffsetNode;
1621	return true;
1622	}
1623
1624	// Find a load or store from corresponding pattern root.
1625	// Roots may be build_vector, bitconvert or their combinations.
1626	static MemSDNode* findMemSDNode(SDNode *N) {
1627	N = AMDGPUTargetLowering::stripBitcast(Val: SDValue (N,`0`)).getNode();
1628	if (MemSDNode *MN = dyn_cast<MemSDNode>(Val: N))
1629	return MN;
1630	assert(isa<BuildVectorSDNode>(N));
1631	for (SDValue V : N->op_values())
1632	if (MemSDNode *MN =
1633	dyn_cast<MemSDNode>(Val: AMDGPUTargetLowering::stripBitcast(Val: V)))
1634	return MN;
1635	llvm_unreachable("cannot find MemSDNode in the pattern!");
1636	}
1637
1638	bool AMDGPUDAGToDAGISel::SelectFlatOffsetImpl(SDNode *N, SDValue Addr,
1639	SDValue &VAddr, SDValue &Offset,
1640	uint64_t FlatVariant) const {
1641	int64_t OffsetVal = `0`;
1642
1643	unsigned AS = findMemSDNode(N)->getAddressSpace();
1644
1645	bool CanHaveFlatSegmentOffsetBug =
1646	Subtarget->hasFlatSegmentOffsetBug() &&
1647	FlatVariant == SIInstrFlags::FLAT &&
1648	(AS == AMDGPUAS::FLAT_ADDRESS \|\| AS == AMDGPUAS::GLOBAL_ADDRESS);
1649
1650	if (Subtarget->hasFlatInstOffsets() && !CanHaveFlatSegmentOffsetBug) {
1651	SDValue N0, N1;
1652	if (isBaseWithConstantOffset64(Addr, LHS&: N0, RHS&: N1) &&
1653	(FlatVariant != SIInstrFlags::FlatScratch \|\|
1654	isFlatScratchBaseLegal(Addr))) {
1655	int64_t COffsetVal = cast<ConstantSDNode>(Val&: N1)->getSExtValue();
1656
1657	const SIInstrInfo *TII = Subtarget->getInstrInfo();
1658	if (TII->isLegalFLATOffset(Offset: COffsetVal, AddrSpace: AS, FlatVariant)) {
1659	Addr = N0;
1660	OffsetVal = COffsetVal;
1661	} else {
1662	// If the offset doesn't fit, put the low bits into the offset field and
1663	// add the rest.
1664	//
1665	// For a FLAT instruction the hardware decides whether to access
1666	// global/scratch/shared memory based on the high bits of vaddr,
1667	// ignoring the offset field, so we have to ensure that when we add
1668	// remainder to vaddr it still points into the same underlying object.
1669	// The easiest way to do that is to make sure that we split the offset
1670	// into two pieces that are both >= 0 or both <= 0.
1671
1672	SDLoc DL(N);
1673	uint64_t RemainderOffset;
1674
1675	std::tie(args&: OffsetVal, args&: RemainderOffset) =
1676	TII->splitFlatOffset(COffsetVal, AddrSpace: AS, FlatVariant);
1677
1678	SDValue AddOffsetLo =
1679	getMaterializedScalarImm32(Val: Lo_32(Value: RemainderOffset), DL);
1680	SDValue Clamp = CurDAG->getTargetConstant(Val: `0`, DL, VT: MVT::i1);
1681
1682	if (Addr.getValueType().getSizeInBits() == `32`) {
1683	SmallVector<SDValue, `3`> Opnds;
1684	Opnds.push_back(Elt: N0);
1685	Opnds.push_back(Elt: AddOffsetLo);
1686	unsigned AddOp = AMDGPU::V_ADD_CO_U32_e32;
1687	if (Subtarget->hasAddNoCarry()) {
1688	AddOp = AMDGPU::V_ADD_U32_e64;
1689	Opnds.push_back(Elt: Clamp);
1690	}
1691	Addr = SDValue (CurDAG->getMachineNode(Opcode: AddOp, dl: DL, VT: MVT::i32, Ops: Opnds), `0`);
1692	} else {
1693	// TODO: Should this try to use a scalar add pseudo if the base address
1694	// is uniform and saddr is usable?
1695	SDValue Sub0 = CurDAG->getTargetConstant(Val: AMDGPU::sub0, DL, VT: MVT::i32);
1696	SDValue Sub1 = CurDAG->getTargetConstant(Val: AMDGPU::sub1, DL, VT: MVT::i32);
1697
1698	SDNode *N0Lo = CurDAG->getMachineNode(Opcode: TargetOpcode::EXTRACT_SUBREG,
1699	dl: DL, VT: MVT::i32, Op1: N0, Op2: Sub0);
1700	SDNode *N0Hi = CurDAG->getMachineNode(Opcode: TargetOpcode::EXTRACT_SUBREG,
1701	dl: DL, VT: MVT::i32, Op1: N0, Op2: Sub1);
1702
1703	SDValue AddOffsetHi =
1704	getMaterializedScalarImm32(Val: Hi_32(Value: RemainderOffset), DL);
1705
1706	SDVTList VTs = CurDAG->getVTList(VT1: MVT::i32, VT2: MVT::i1);
1707
1708	SDNode *Add =
1709	CurDAG->getMachineNode(Opcode: AMDGPU::V_ADD_CO_U32_e64, dl: DL, VTs,
1710	Ops: {AddOffsetLo, SDValue (N0Lo, `0`), Clamp});
1711
1712	SDNode *Addc = CurDAG->getMachineNode(
1713	Opcode: AMDGPU::V_ADDC_U32_e64, dl: DL, VTs,
1714	Ops: {AddOffsetHi, SDValue (N0Hi, `0`), SDValue (Add, `1`), Clamp});
1715
1716	SDValue RegSequenceArgs[] = {
1717	CurDAG->getTargetConstant(Val: AMDGPU::VReg_64RegClassID, DL, VT: MVT::i32),
1718	SDValue (Add, `0`), Sub0, SDValue (Addc, `0`), Sub1};
1719
1720	Addr = SDValue (CurDAG->getMachineNode(Opcode: AMDGPU::REG_SEQUENCE, dl: DL,
1721	VT: MVT::i64, Ops: RegSequenceArgs),
1722	`0`);
1723	}
1724	}
1725	}
1726	}
1727
1728	VAddr = Addr;
1729	Offset = CurDAG->getTargetConstant(Val: OffsetVal, DL: SDLoc (), VT: MVT::i32);
1730	return true;
1731	}
1732
1733	bool AMDGPUDAGToDAGISel::SelectFlatOffset(SDNode *N, SDValue Addr,
1734	SDValue &VAddr,
1735	SDValue &Offset) const {
1736	return SelectFlatOffsetImpl(N, Addr, VAddr, Offset, FlatVariant: SIInstrFlags::FLAT);
1737	}
1738
1739	bool AMDGPUDAGToDAGISel::SelectGlobalOffset(SDNode *N, SDValue Addr,
1740	SDValue &VAddr,
1741	SDValue &Offset) const {
1742	return SelectFlatOffsetImpl(N, Addr, VAddr, Offset, FlatVariant: SIInstrFlags::FlatGlobal);
1743	}
1744
1745	bool AMDGPUDAGToDAGISel::SelectScratchOffset(SDNode *N, SDValue Addr,
1746	SDValue &VAddr,
1747	SDValue &Offset) const {
1748	return SelectFlatOffsetImpl(N, Addr, VAddr, Offset,
1749	FlatVariant: SIInstrFlags::FlatScratch);
1750	}
1751
1752	// If this matches zero_extend i32:x, return x
1753	static SDValue matchZExtFromI32(SDValue Op) {
1754	if (Op.getOpcode() != ISD::ZERO_EXTEND)
1755	return SDValue ();
1756
1757	SDValue ExtSrc = Op.getOperand(i: `0`);
1758	return (ExtSrc.getValueType() == MVT::i32) ? ExtSrc : SDValue ();
1759	}
1760
1761	// Match (64-bit SGPR base) + (zext vgpr offset) + sext(imm offset)
1762	bool AMDGPUDAGToDAGISel::SelectGlobalSAddr(SDNode *N,
1763	SDValue Addr,
1764	SDValue &SAddr,
1765	SDValue &VOffset,
1766	SDValue &Offset) const {
1767	int64_t ImmOffset = `0`;
1768
1769	// Match the immediate offset first, which canonically is moved as low as
1770	// possible.
1771
1772	SDValue LHS, RHS;
1773	if (isBaseWithConstantOffset64(Addr, LHS, RHS)) {
1774	int64_t COffsetVal = cast<ConstantSDNode>(Val&: RHS)->getSExtValue();
1775	const SIInstrInfo *TII = Subtarget->getInstrInfo();
1776
1777	if (TII->isLegalFLATOffset(Offset: COffsetVal, AddrSpace: AMDGPUAS::GLOBAL_ADDRESS,
1778	FlatVariant: SIInstrFlags::FlatGlobal)) {
1779	Addr = LHS;
1780	ImmOffset = COffsetVal;
1781	} else if (!LHS ->isDivergent()) {
1782	if (COffsetVal > `0`) {
1783	SDLoc SL(N);
1784	// saddr + large_offset -> saddr +
1785	// (voffset = large_offset & ~MaxOffset) +
1786	// (large_offset & MaxOffset);
1787	int64_t SplitImmOffset, RemainderOffset;
1788	std::tie(args&: SplitImmOffset, args&: RemainderOffset) = TII->splitFlatOffset(
1789	COffsetVal, AddrSpace: AMDGPUAS::GLOBAL_ADDRESS, FlatVariant: SIInstrFlags::FlatGlobal);
1790
1791	if (isUInt<`32`>(x: RemainderOffset)) {
1792	SDNode *VMov = CurDAG->getMachineNode(
1793	Opcode: AMDGPU::V_MOV_B32_e32, dl: SL, VT: MVT::i32,
1794	Op1: CurDAG->getTargetConstant(Val: RemainderOffset, DL: SDLoc (), VT: MVT::i32));
1795	VOffset = SDValue (VMov, `0`);
1796	SAddr = LHS;
1797	Offset = CurDAG->getTargetConstant(Val: SplitImmOffset, DL: SDLoc (), VT: MVT::i32);
1798	return true;
1799	}
1800	}
1801
1802	// We are adding a 64 bit SGPR and a constant. If constant bus limit
1803	// is 1 we would need to perform 1 or 2 extra moves for each half of
1804	// the constant and it is better to do a scalar add and then issue a
1805	// single VALU instruction to materialize zero. Otherwise it is less
1806	// instructions to perform VALU adds with immediates or inline literals.
1807	unsigned NumLiterals =
1808	!TII->isInlineConstant(Imm: APInt (`32`, COffsetVal & `0xffffffff`)) +
1809	!TII->isInlineConstant(Imm: APInt (`32`, COffsetVal >> `32`));
1810	if (Subtarget->getConstantBusLimit(Opcode: AMDGPU::V_ADD_U32_e64) > NumLiterals)
1811	return false;
1812	}
1813	}
1814
1815	// Match the variable offset.
1816	if (Addr.getOpcode() == ISD::ADD) {
1817	LHS = Addr.getOperand(i: `0`);
1818	RHS = Addr.getOperand(i: `1`);
1819
1820	if (!LHS ->isDivergent()) {
1821	// add (i64 sgpr), (zero_extend (i32 vgpr))
1822	if (SDValue ZextRHS = matchZExtFromI32(Op: RHS)) {
1823	SAddr = LHS;
1824	VOffset = ZextRHS;
1825	}
1826	}
1827
1828	if (!SAddr && !RHS ->isDivergent()) {
1829	// add (zero_extend (i32 vgpr)), (i64 sgpr)
1830	if (SDValue ZextLHS = matchZExtFromI32(Op: LHS)) {
1831	SAddr = RHS;
1832	VOffset = ZextLHS;
1833	}
1834	}
1835
1836	if (SAddr) {
1837	Offset = CurDAG->getTargetConstant(Val: ImmOffset, DL: SDLoc (), VT: MVT::i32);
1838	return true;
1839	}
1840	}
1841
1842	if (Addr ->isDivergent() \|\| Addr.getOpcode() == ISD::UNDEF \|\|
1843	isa<ConstantSDNode>(Val: Addr))
1844	return false;
1845
1846	// It's cheaper to materialize a single 32-bit zero for vaddr than the two
1847	// moves required to copy a 64-bit SGPR to VGPR.
1848	SAddr = Addr;
1849	SDNode *VMov =
1850	CurDAG->getMachineNode(Opcode: AMDGPU::V_MOV_B32_e32, dl: SDLoc (Addr), VT: MVT::i32,
1851	Op1: CurDAG->getTargetConstant(Val: `0`, DL: SDLoc (), VT: MVT::i32));
1852	VOffset = SDValue (VMov, `0`);
1853	Offset = CurDAG->getTargetConstant(Val: ImmOffset, DL: SDLoc (), VT: MVT::i32);
1854	return true;
1855	}
1856
1857	static SDValue SelectSAddrFI(SelectionDAG *CurDAG, SDValue SAddr) {
1858	if (auto FI = dyn_cast<FrameIndexSDNode>(Val&: SAddr)) {
1859	SAddr = CurDAG->getTargetFrameIndex(FI: FI->getIndex(), VT: FI->getValueType(ResNo: `0`));
1860	} else if (SAddr.getOpcode() == ISD::ADD &&
1861	isa<FrameIndexSDNode>(Val: SAddr.getOperand(i: `0`))) {
1862	// Materialize this into a scalar move for scalar address to avoid
1863	// readfirstlane.
1864	auto FI = cast<FrameIndexSDNode>(Val: SAddr.getOperand(i: `0`));
1865	SDValue TFI = CurDAG->getTargetFrameIndex(FI: FI->getIndex(),
1866	VT: FI->getValueType(ResNo: `0`));
1867	SAddr = SDValue (CurDAG->getMachineNode(Opcode: AMDGPU::S_ADD_I32, dl: SDLoc (SAddr),
1868	VT: MVT::i32, Op1: TFI, Op2: SAddr.getOperand(i: `1`)),
1869	`0`);
1870	}
1871
1872	return SAddr;
1873	}
1874
1875	// Match (32-bit SGPR base) + sext(imm offset)
1876	bool AMDGPUDAGToDAGISel::SelectScratchSAddr(SDNode *Parent, SDValue Addr,
1877	SDValue &SAddr,
1878	SDValue &Offset) const {
1879	if (Addr ->isDivergent())
1880	return false;
1881
1882	SDLoc DL(Addr);
1883
1884	int64_t COffsetVal = `0`;
1885
1886	if (CurDAG->isBaseWithConstantOffset(Op: Addr) && isFlatScratchBaseLegal(Addr)) {
1887	COffsetVal = cast<ConstantSDNode>(Val: Addr.getOperand(i: `1`))->getSExtValue();
1888	SAddr = Addr.getOperand(i: `0`);
1889	} else {
1890	SAddr = Addr;
1891	}
1892
1893	SAddr = SelectSAddrFI(CurDAG, SAddr);
1894
1895	const SIInstrInfo *TII = Subtarget->getInstrInfo();
1896
1897	if (!TII->isLegalFLATOffset(Offset: COffsetVal, AddrSpace: AMDGPUAS::PRIVATE_ADDRESS,
1898	FlatVariant: SIInstrFlags::FlatScratch)) {
1899	int64_t SplitImmOffset, RemainderOffset;
1900	std::tie(args&: SplitImmOffset, args&: RemainderOffset) = TII->splitFlatOffset(
1901	COffsetVal, AddrSpace: AMDGPUAS::PRIVATE_ADDRESS, FlatVariant: SIInstrFlags::FlatScratch);
1902
1903	COffsetVal = SplitImmOffset;
1904
1905	SDValue AddOffset =
1906	SAddr.getOpcode() == ISD::TargetFrameIndex
1907	? getMaterializedScalarImm32(Val: Lo_32(Value: RemainderOffset), DL)
1908	: CurDAG->getTargetConstant(Val: RemainderOffset, DL, VT: MVT::i32);
1909	SAddr = SDValue (CurDAG->getMachineNode(Opcode: AMDGPU::S_ADD_I32, dl: DL, VT: MVT::i32,
1910	Op1: SAddr, Op2: AddOffset),
1911	`0`);
1912	}
1913
1914	Offset = CurDAG->getTargetConstant(Val: COffsetVal, DL, VT: MVT::i16);
1915
1916	return true;
1917	}
1918
1919	// Check whether the flat scratch SVS swizzle bug affects this access.
1920	bool AMDGPUDAGToDAGISel::checkFlatScratchSVSSwizzleBug(
1921	SDValue VAddr, SDValue SAddr, uint64_t ImmOffset) const {
1922	if (!Subtarget->hasFlatScratchSVSSwizzleBug())
1923	return false;
1924
1925	// The bug affects the swizzling of SVS accesses if there is any carry out
1926	// from the two low order bits (i.e. from bit 1 into bit 2) when adding
1927	// voffset to (soffset + inst_offset).
1928	KnownBits VKnown = CurDAG->computeKnownBits(Op: VAddr);
1929	KnownBits SKnown = KnownBits::computeForAddSub(
1930	/Add=/true, /NSW=/false, /NUW=/false,
1931	LHS: CurDAG->computeKnownBits(Op: SAddr),
1932	RHS: KnownBits::makeConstant(C: APInt (`32`, ImmOffset)));
1933	uint64_t VMax = VKnown.getMaxValue().getZExtValue();
1934	uint64_t SMax = SKnown.getMaxValue().getZExtValue();
1935	return (VMax & `3`) + (SMax & `3`) >= `4`;
1936	}
1937
1938	bool AMDGPUDAGToDAGISel::SelectScratchSVAddr(SDNode *N, SDValue Addr,
1939	SDValue &VAddr, SDValue &SAddr,
1940	SDValue &Offset) const {
1941	int64_t ImmOffset = `0`;
1942
1943	SDValue LHS, RHS;
1944	SDValue OrigAddr = Addr;
1945	if (isBaseWithConstantOffset64(Addr, LHS, RHS)) {
1946	int64_t COffsetVal = cast<ConstantSDNode>(Val&: RHS)->getSExtValue();
1947	const SIInstrInfo *TII = Subtarget->getInstrInfo();
1948
1949	if (TII->isLegalFLATOffset(Offset: COffsetVal, AddrSpace: AMDGPUAS::PRIVATE_ADDRESS, FlatVariant: true)) {
1950	Addr = LHS;
1951	ImmOffset = COffsetVal;
1952	} else if (!LHS ->isDivergent() && COffsetVal > `0`) {
1953	SDLoc SL(N);
1954	// saddr + large_offset -> saddr + (vaddr = large_offset & ~MaxOffset) +
1955	// (large_offset & MaxOffset);
1956	int64_t SplitImmOffset, RemainderOffset;
1957	std::tie(args&: SplitImmOffset, args&: RemainderOffset)
1958	= TII->splitFlatOffset(COffsetVal, AddrSpace: AMDGPUAS::PRIVATE_ADDRESS, FlatVariant: true);
1959
1960	if (isUInt<`32`>(x: RemainderOffset)) {
1961	SDNode *VMov = CurDAG->getMachineNode(
1962	Opcode: AMDGPU::V_MOV_B32_e32, dl: SL, VT: MVT::i32,
1963	Op1: CurDAG->getTargetConstant(Val: RemainderOffset, DL: SDLoc (), VT: MVT::i32));
1964	VAddr = SDValue (VMov, `0`);
1965	SAddr = LHS;
1966	if (!isFlatScratchBaseLegal(Addr))
1967	return false;
1968	if (checkFlatScratchSVSSwizzleBug(VAddr, SAddr, ImmOffset: SplitImmOffset))
1969	return false;
1970	Offset = CurDAG->getTargetConstant(Val: SplitImmOffset, DL: SDLoc (), VT: MVT::i16);
1971	return true;
1972	}
1973	}
1974	}
1975
1976	if (Addr.getOpcode() != ISD::ADD)
1977	return false;
1978
1979	LHS = Addr.getOperand(i: `0`);
1980	RHS = Addr.getOperand(i: `1`);
1981
1982	if (!LHS ->isDivergent() && RHS ->isDivergent()) {
1983	SAddr = LHS;
1984	VAddr = RHS;
1985	} else if (!RHS ->isDivergent() && LHS ->isDivergent()) {
1986	SAddr = RHS;
1987	VAddr = LHS;
1988	} else {
1989	return false;
1990	}
1991
1992	if (OrigAddr != Addr) {
1993	if (!isFlatScratchBaseLegalSVImm(Addr: OrigAddr))
1994	return false;
1995	} else {
1996	if (!isFlatScratchBaseLegalSV(Addr: OrigAddr))
1997	return false;
1998	}
1999
2000	if (checkFlatScratchSVSSwizzleBug(VAddr, SAddr, ImmOffset))
2001	return false;
2002	SAddr = SelectSAddrFI(CurDAG, SAddr);
2003	Offset = CurDAG->getTargetConstant(Val: ImmOffset, DL: SDLoc (), VT: MVT::i16);
2004	return true;
2005	}
2006
2007	// For unbuffered smem loads, it is illegal for the Immediate Offset to be
2008	// negative if the resulting (Offset + (M0 or SOffset or zero) is negative.
2009	// Handle the case where the Immediate Offset + SOffset is negative.
2010	bool AMDGPUDAGToDAGISel::isSOffsetLegalWithImmOffset(SDValue *SOffset,
2011	bool Imm32Only,
2012	bool IsBuffer,
2013	int64_t ImmOffset) const {
2014	if (!IsBuffer && !Imm32Only && ImmOffset < `0` &&
2015	AMDGPU::hasSMRDSignedImmOffset(ST: *Subtarget)) {
2016	KnownBits SKnown = CurDAG->computeKnownBits(Op: *SOffset);
2017	if (ImmOffset + SKnown.getMinValue().getSExtValue() < `0`)
2018	return false;
2019	}
2020
2021	return true;
2022	}
2023
2024	// Match an immediate (if Offset is not null) or an SGPR (if SOffset is
2025	// not null) offset. If Imm32Only is true, match only 32-bit immediate
2026	// offsets available on CI.
2027	bool AMDGPUDAGToDAGISel::SelectSMRDOffset(SDValue ByteOffsetNode,
2028	SDValue SOffset, SDValue Offset,
2029	bool Imm32Only, bool IsBuffer,
2030	bool HasSOffset,
2031	int64_t ImmOffset) const {
2032	assert((!SOffset \|\| !Offset) &&
2033	"Cannot match both soffset and offset at the same time!");
2034
2035	ConstantSDNode *C = dyn_cast<ConstantSDNode>(Val&: ByteOffsetNode);
2036	if (!C) {
2037	if (!SOffset)
2038	return false;
2039
2040	if (ByteOffsetNode.getValueType().isScalarInteger() &&
2041	ByteOffsetNode.getValueType().getSizeInBits() == `32`) {
2042	*SOffset = ByteOffsetNode;
2043	return isSOffsetLegalWithImmOffset(SOffset, Imm32Only, IsBuffer,
2044	ImmOffset);
2045	}
2046	if (ByteOffsetNode.getOpcode() == ISD::ZERO_EXTEND) {
2047	if (ByteOffsetNode.getOperand(i: `0`).getValueType().getSizeInBits() == `32`) {
2048	*SOffset = ByteOffsetNode.getOperand(i: `0`);
2049	return isSOffsetLegalWithImmOffset(SOffset, Imm32Only, IsBuffer,
2050	ImmOffset);
2051	}
2052	}
2053	return false;
2054	}
2055
2056	SDLoc SL(ByteOffsetNode);
2057
2058	// GFX9 and GFX10 have signed byte immediate offsets. The immediate
2059	// offset for S_BUFFER instructions is unsigned.
2060	int64_t ByteOffset = IsBuffer ? C->getZExtValue() : C->getSExtValue();
2061	std::optional<int64_t> EncodedOffset = AMDGPU::getSMRDEncodedOffset(
2062	ST: *Subtarget, ByteOffset, IsBuffer, HasSOffset);
2063	if (EncodedOffset && Offset && !Imm32Only) {
2064	Offset = CurDAG->getTargetConstant(Val: EncodedOffset, DL: SL, VT: MVT::i32);
2065	return true;
2066	}
2067
2068	// SGPR and literal offsets are unsigned.
2069	if (ByteOffset < `0`)
2070	return false;
2071
2072	EncodedOffset = AMDGPU::getSMRDEncodedLiteralOffset32(ST: *Subtarget, ByteOffset);
2073	if (EncodedOffset && Offset && Imm32Only) {
2074	Offset = CurDAG->getTargetConstant(Val: EncodedOffset, DL: SL, VT: MVT::i32);
2075	return true;
2076	}
2077
2078	if (!isUInt<`32`>(x: ByteOffset) && !isInt<`32`>(x: ByteOffset))
2079	return false;
2080
2081	if (SOffset) {
2082	SDValue C32Bit = CurDAG->getTargetConstant(Val: ByteOffset, DL: SL, VT: MVT::i32);
2083	*SOffset = SDValue (
2084	CurDAG->getMachineNode(Opcode: AMDGPU::S_MOV_B32, dl: SL, VT: MVT::i32, Op1: C32Bit), `0`);
2085	return true;
2086	}
2087
2088	return false;
2089	}
2090
2091	SDValue AMDGPUDAGToDAGISel::Expand32BitAddress(SDValue Addr) const {
2092	if (Addr.getValueType() != MVT::i32)
2093	return Addr;
2094
2095	// Zero-extend a 32-bit address.
2096	SDLoc SL(Addr);
2097
2098	const MachineFunction &MF = CurDAG->getMachineFunction();
2099	const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
2100	unsigned AddrHiVal = Info->get32BitAddressHighBits();
2101	SDValue AddrHi = CurDAG->getTargetConstant(Val: AddrHiVal, DL: SL, VT: MVT::i32);
2102
2103	const SDValue Ops[] = {
2104	CurDAG->getTargetConstant(Val: AMDGPU::SReg_64_XEXECRegClassID, DL: SL, VT: MVT::i32),
2105	Addr,
2106	CurDAG->getTargetConstant(Val: AMDGPU::sub0, DL: SL, VT: MVT::i32),
2107	SDValue (CurDAG->getMachineNode(Opcode: AMDGPU::S_MOV_B32, dl: SL, VT: MVT::i32, Op1: AddrHi),
2108	`0`),
2109	CurDAG->getTargetConstant(Val: AMDGPU::sub1, DL: SL, VT: MVT::i32),
2110	};
2111
2112	return SDValue (CurDAG->getMachineNode(Opcode: AMDGPU::REG_SEQUENCE, dl: SL, VT: MVT::i64,
2113	Ops), `0`);
2114	}
2115
2116	// Match a base and an immediate (if Offset is not null) or an SGPR (if
2117	// SOffset is not null) or an immediate+SGPR offset. If Imm32Only is
2118	// true, match only 32-bit immediate offsets available on CI.
2119	bool AMDGPUDAGToDAGISel::SelectSMRDBaseOffset(SDValue Addr, SDValue &SBase,
2120	SDValue SOffset, SDValue Offset,
2121	bool Imm32Only, bool IsBuffer,
2122	bool HasSOffset,
2123	int64_t ImmOffset) const {
2124	if (SOffset && Offset) {
2125	assert(!Imm32Only && !IsBuffer);
2126	SDValue B;
2127
2128	if (!SelectSMRDBaseOffset(Addr, SBase&: B, SOffset: nullptr, Offset, Imm32Only: false, IsBuffer: false, HasSOffset: true))
2129	return false;
2130
2131	int64_t ImmOff = `0`;
2132	if (ConstantSDNode C = dyn_cast<ConstantSDNode>(Val&: Offset))
2133	ImmOff = C->getSExtValue();
2134
2135	return SelectSMRDBaseOffset(Addr: B, SBase, SOffset, Offset: nullptr, Imm32Only: false, IsBuffer: false, HasSOffset: true,
2136	ImmOffset: ImmOff);
2137	}
2138
2139	// A 32-bit (address + offset) should not cause unsigned 32-bit integer
2140	// wraparound, because s_load instructions perform the addition in 64 bits.
2141	if (Addr.getValueType() == MVT::i32 && Addr.getOpcode() == ISD::ADD &&
2142	!Addr ->getFlags().hasNoUnsignedWrap())
2143	return false;
2144
2145	SDValue N0, N1;
2146	// Extract the base and offset if possible.
2147	if (CurDAG->isBaseWithConstantOffset(Op: Addr) \|\| Addr.getOpcode() == ISD::ADD) {
2148	N0 = Addr.getOperand(i: `0`);
2149	N1 = Addr.getOperand(i: `1`);
2150	} else if (getBaseWithOffsetUsingSplitOR(DAG&: *CurDAG, Addr, N0, N1)) {
2151	assert(N0 && N1 && isa<ConstantSDNode>(N1));
2152	}
2153	if (!N0 \|\| !N1)
2154	return false;
2155
2156	if (SelectSMRDOffset(ByteOffsetNode: N1, SOffset, Offset, Imm32Only, IsBuffer, HasSOffset,
2157	ImmOffset)) {
2158	SBase = N0;
2159	return true;
2160	}
2161	if (SelectSMRDOffset(ByteOffsetNode: N0, SOffset, Offset, Imm32Only, IsBuffer, HasSOffset,
2162	ImmOffset)) {
2163	SBase = N1;
2164	return true;
2165	}
2166	return false;
2167	}
2168
2169	bool AMDGPUDAGToDAGISel::SelectSMRD(SDValue Addr, SDValue &SBase,
2170	SDValue SOffset, SDValue Offset,
2171	bool Imm32Only) const {
2172	if (SelectSMRDBaseOffset(Addr, SBase, SOffset, Offset, Imm32Only)) {
2173	SBase = Expand32BitAddress(Addr: SBase);
2174	return true;
2175	}
2176
2177	if (Addr.getValueType() == MVT::i32 && Offset && !SOffset) {
2178	SBase = Expand32BitAddress(Addr);
2179	*Offset = CurDAG->getTargetConstant(Val: `0`, DL: SDLoc (Addr), VT: MVT::i32);
2180	return true;
2181	}
2182
2183	return false;
2184	}
2185
2186	bool AMDGPUDAGToDAGISel::SelectSMRDImm(SDValue Addr, SDValue &SBase,
2187	SDValue &Offset) const {
2188	return SelectSMRD(Addr, SBase, / SOffset / nullptr, Offset: &Offset);
2189	}
2190
2191	bool AMDGPUDAGToDAGISel::SelectSMRDImm32(SDValue Addr, SDValue &SBase,
2192	SDValue &Offset) const {
2193	assert(Subtarget->getGeneration() == AMDGPUSubtarget::SEA_ISLANDS);
2194	return SelectSMRD(Addr, SBase, / SOffset / nullptr, Offset: &Offset,
2195	/ Imm32Only / true);
2196	}
2197
2198	bool AMDGPUDAGToDAGISel::SelectSMRDSgpr(SDValue Addr, SDValue &SBase,
2199	SDValue &SOffset) const {
2200	return SelectSMRD(Addr, SBase, SOffset: &SOffset, / Offset / nullptr);
2201	}
2202
2203	bool AMDGPUDAGToDAGISel::SelectSMRDSgprImm(SDValue Addr, SDValue &SBase,
2204	SDValue &SOffset,
2205	SDValue &Offset) const {
2206	return SelectSMRD(Addr, SBase, SOffset: &SOffset, Offset: &Offset);
2207	}
2208
2209	bool AMDGPUDAGToDAGISel::SelectSMRDBufferImm(SDValue N, SDValue &Offset) const {
2210	return SelectSMRDOffset(ByteOffsetNode: N, / SOffset / nullptr, Offset: &Offset,
2211	/ Imm32Only / false, / IsBuffer / true);
2212	}
2213
2214	bool AMDGPUDAGToDAGISel::SelectSMRDBufferImm32(SDValue N,
2215	SDValue &Offset) const {
2216	assert(Subtarget->getGeneration() == AMDGPUSubtarget::SEA_ISLANDS);
2217	return SelectSMRDOffset(ByteOffsetNode: N, / SOffset / nullptr, Offset: &Offset,
2218	/ Imm32Only / true, / IsBuffer / true);
2219	}
2220
2221	bool AMDGPUDAGToDAGISel::SelectSMRDBufferSgprImm(SDValue N, SDValue &SOffset,
2222	SDValue &Offset) const {
2223	// Match the (soffset + offset) pair as a 32-bit register base and
2224	// an immediate offset.
2225	return N.getValueType() == MVT::i32 &&
2226	SelectSMRDBaseOffset(Addr: N, / SBase / SOffset, / SOffset/ nullptr,
2227	Offset: &Offset, / Imm32Only / false,
2228	/ IsBuffer / true);
2229	}
2230
2231	bool AMDGPUDAGToDAGISel::SelectMOVRELOffset(SDValue Index,
2232	SDValue &Base,
2233	SDValue &Offset) const {
2234	SDLoc DL(Index);
2235
2236	if (CurDAG->isBaseWithConstantOffset(Op: Index)) {
2237	SDValue N0 = Index.getOperand(i: `0`);
2238	SDValue N1 = Index.getOperand(i: `1`);
2239	ConstantSDNode *C1 = cast<ConstantSDNode>(Val&: N1);
2240
2241	// (add n0, c0)
2242	// Don't peel off the offset (c0) if doing so could possibly lead
2243	// the base (n0) to be negative.
2244	// (or n0, \|c0\|) can never change a sign given isBaseWithConstantOffset.
2245	if (C1->getSExtValue() <= `0` \|\| CurDAG->SignBitIsZero(Op: N0) \|\|
2246	(Index ->getOpcode() == ISD::OR && C1->getSExtValue() >= `0`)) {
2247	Base = N0;
2248	Offset = CurDAG->getTargetConstant(Val: C1->getZExtValue(), DL, VT: MVT::i32);
2249	return true;
2250	}
2251	}
2252
2253	if (isa<ConstantSDNode>(Val: Index))
2254	return false;
2255
2256	Base = Index;
2257	Offset = CurDAG->getTargetConstant(Val: `0`, DL, VT: MVT::i32);
2258	return true;
2259	}
2260
2261	SDNode AMDGPUDAGToDAGISel::getBFE32(bool* IsSigned, const SDLoc &DL,
2262	SDValue Val, uint32_t Offset,
2263	uint32_t Width) {
2264	if (Val ->isDivergent()) {
2265	unsigned Opcode = IsSigned ? AMDGPU::V_BFE_I32_e64 : AMDGPU::V_BFE_U32_e64;
2266	SDValue Off = CurDAG->getTargetConstant(Val: Offset, DL, VT: MVT::i32);
2267	SDValue W = CurDAG->getTargetConstant(Val: Width, DL, VT: MVT::i32);
2268
2269	return CurDAG->getMachineNode(Opcode, dl: DL, VT: MVT::i32, Op1: Val, Op2: Off, Op3: W);
2270	}
2271	unsigned Opcode = IsSigned ? AMDGPU::S_BFE_I32 : AMDGPU::S_BFE_U32;
2272	// Transformation function, pack the offset and width of a BFE into
2273	// the format expected by the S_BFE_I32 / S_BFE_U32. In the second
2274	// source, bits [5:0] contain the offset and bits [22:16] the width.
2275	uint32_t PackedVal = Offset \| (Width << `16`);
2276	SDValue PackedConst = CurDAG->getTargetConstant(Val: PackedVal, DL, VT: MVT::i32);
2277
2278	return CurDAG->getMachineNode(Opcode, dl: DL, VT: MVT::i32, Op1: Val, Op2: PackedConst);
2279	}
2280
2281	void AMDGPUDAGToDAGISel::SelectS_BFEFromShifts(SDNode *N) {
2282	// "(a << b) srl c)" ---> "BFE_U32 a, (c-b), (32-c)
2283	// "(a << b) sra c)" ---> "BFE_I32 a, (c-b), (32-c)
2284	// Predicate: 0 < b <= c < 32
2285
2286	const SDValue &Shl = N->getOperand(Num: `0`);
2287	ConstantSDNode *B = dyn_cast<ConstantSDNode>(Val: Shl ->getOperand(Num: `1`));
2288	ConstantSDNode *C = dyn_cast<ConstantSDNode>(Val: N->getOperand(Num: `1`));
2289
2290	if (B && C) {
2291	uint32_t BVal = B->getZExtValue();
2292	uint32_t CVal = C->getZExtValue();
2293
2294	if (`0` < BVal && BVal <= CVal && CVal < `32`) {
2295	bool Signed = N->getOpcode() == ISD::SRA;
2296	ReplaceNode(F: N, T: getBFE32(IsSigned: Signed, DL: SDLoc (N), Val: Shl.getOperand(i: `0`), Offset: CVal - BVal,
2297	Width: `32` - CVal));
2298	return;
2299	}
2300	}
2301	SelectCode(N);
2302	}
2303
2304	void AMDGPUDAGToDAGISel::SelectS_BFE(SDNode *N) {
2305	switch (N->getOpcode()) {
2306	case ISD::AND:
2307	if (N->getOperand(Num: `0`).getOpcode() == ISD::SRL) {
2308	// "(a srl b) & mask" ---> "BFE_U32 a, b, popcount(mask)"
2309	// Predicate: isMask(mask)
2310	const SDValue &Srl = N->getOperand(Num: `0`);
2311	ConstantSDNode *Shift = dyn_cast<ConstantSDNode>(Val: Srl.getOperand(i: `1`));
2312	ConstantSDNode *Mask = dyn_cast<ConstantSDNode>(Val: N->getOperand(Num: `1`));
2313
2314	if (Shift && Mask) {
2315	uint32_t ShiftVal = Shift->getZExtValue();
2316	uint32_t MaskVal = Mask->getZExtValue();
2317
2318	if (isMask_32(Value: MaskVal)) {
2319	uint32_t WidthVal = llvm::popcount(Value: MaskVal);
2320	ReplaceNode(F: N, T: getBFE32(IsSigned: false, DL: SDLoc (N), Val: Srl.getOperand(i: `0`), Offset: ShiftVal,
2321	Width: WidthVal));
2322	return;
2323	}
2324	}
2325	}
2326	break;
2327	case ISD::SRL:
2328	if (N->getOperand(Num: `0`).getOpcode() == ISD::AND) {
2329	// "(a & mask) srl b)" ---> "BFE_U32 a, b, popcount(mask >> b)"
2330	// Predicate: isMask(mask >> b)
2331	const SDValue &And = N->getOperand(Num: `0`);
2332	ConstantSDNode *Shift = dyn_cast<ConstantSDNode>(Val: N->getOperand(Num: `1`));
2333	ConstantSDNode *Mask = dyn_cast<ConstantSDNode>(Val: And ->getOperand(Num: `1`));
2334
2335	if (Shift && Mask) {
2336	uint32_t ShiftVal = Shift->getZExtValue();
2337	uint32_t MaskVal = Mask->getZExtValue() >> ShiftVal;
2338
2339	if (isMask_32(Value: MaskVal)) {
2340	uint32_t WidthVal = llvm::popcount(Value: MaskVal);
2341	ReplaceNode(F: N, T: getBFE32(IsSigned: false, DL: SDLoc (N), Val: And.getOperand(i: `0`), Offset: ShiftVal,
2342	Width: WidthVal));
2343	return;
2344	}
2345	}
2346	} else if (N->getOperand(Num: `0`).getOpcode() == ISD::SHL) {
2347	SelectS_BFEFromShifts(N);
2348	return;
2349	}
2350	break;
2351	case ISD::SRA:
2352	if (N->getOperand(Num: `0`).getOpcode() == ISD::SHL) {
2353	SelectS_BFEFromShifts(N);
2354	return;
2355	}
2356	break;
2357
2358	case ISD::SIGN_EXTEND_INREG: {
2359	// sext_inreg (srl x, 16), i8 -> bfe_i32 x, 16, 8
2360	SDValue Src = N->getOperand(Num: `0`);
2361	if (Src.getOpcode() != ISD::SRL)
2362	break;
2363
2364	const ConstantSDNode *Amt = dyn_cast<ConstantSDNode>(Val: Src.getOperand(i: `1`));
2365	if (!Amt)
2366	break;
2367
2368	unsigned Width = cast<VTSDNode>(Val: N->getOperand(Num: `1`))->getVT().getSizeInBits();
2369	ReplaceNode(F: N, T: getBFE32(IsSigned: true, DL: SDLoc (N), Val: Src.getOperand(i: `0`),
2370	Offset: Amt->getZExtValue(), Width));
2371	return;
2372	}
2373	}
2374
2375	SelectCode(N);
2376	}
2377
2378	bool AMDGPUDAGToDAGISel::isCBranchSCC(const SDNode N) const* {
2379	assert(N->getOpcode() == ISD::BRCOND);
2380	if (!N->hasOneUse())
2381	return false;
2382
2383	SDValue Cond = N->getOperand(Num: `1`);
2384	if (Cond.getOpcode() == ISD::CopyToReg)
2385	Cond = Cond.getOperand(i: `2`);
2386
2387	if (Cond.getOpcode() != ISD::SETCC \|\| !Cond.hasOneUse())
2388	return false;
2389
2390	MVT VT = Cond.getOperand(i: `0`).getSimpleValueType();
2391	if (VT == MVT::i32)
2392	return true;
2393
2394	if (VT == MVT::i64) {
2395	auto ST = static_cast<const GCNSubtarget *>(Subtarget);
2396
2397	ISD::CondCode CC = cast<CondCodeSDNode>(Val: Cond.getOperand(i: `2`))->get();
2398	return (CC == ISD::SETEQ \|\| CC == ISD::SETNE) && ST->hasScalarCompareEq64();
2399	}
2400
2401	return false;
2402	}
2403
2404	static SDValue combineBallotPattern(SDValue VCMP, bool &Negate) {
2405	assert(VCMP->getOpcode() == AMDGPUISD::SETCC);
2406	// Special case for amdgcn.ballot:
2407	// %Cond = i1 (and/or combination of i1 ISD::SETCCs)
2408	// %VCMP = i(WaveSize) AMDGPUISD::SETCC (ext %Cond), 0, setne/seteq
2409	// =>
2410	// Use i1 %Cond value instead of i(WaveSize) %VCMP.
2411	// This is possible because divergent ISD::SETCC is selected as V_CMP and
2412	// Cond becomes a i(WaveSize) full mask value.
2413	// Note that ballot doesn't use SETEQ condition but its easy to support it
2414	// here for completeness, so in this case Negate is set true on return.
2415	auto VCMP_CC = cast<CondCodeSDNode>(Val: VCMP.getOperand(i: `2`))->get();
2416	if ((VCMP_CC == ISD::SETEQ \|\| VCMP_CC == ISD::SETNE) &&
2417	isNullConstant(V: VCMP.getOperand(i: `1`))) {
2418
2419	auto Cond = VCMP.getOperand(i: `0`);
2420	if (ISD::isExtOpcode(Opcode: Cond ->getOpcode())) // Skip extension.
2421	Cond = Cond.getOperand(i: `0`);
2422
2423	if (isBoolSGPR(V: Cond)) {
2424	Negate = VCMP_CC == ISD::SETEQ;
2425	return Cond;
2426	}
2427	}
2428	return SDValue ();
2429	}
2430
2431	void AMDGPUDAGToDAGISel::SelectBRCOND(SDNode *N) {
2432	SDValue Cond = N->getOperand(Num: `1`);
2433
2434	if (Cond.isUndef()) {
2435	CurDAG->SelectNodeTo(N, MachineOpc: AMDGPU::SI_BR_UNDEF, VT: MVT::Other,
2436	Op1: N->getOperand(Num: `2`), Op2: N->getOperand(Num: `0`));
2437	return;
2438	}
2439
2440	const GCNSubtarget ST = static_cast<const* GCNSubtarget *>(Subtarget);
2441	const SIRegisterInfo *TRI = ST->getRegisterInfo();
2442
2443	bool UseSCCBr = isCBranchSCC(N) && isUniformBr(N);
2444	bool AndExec = !UseSCCBr;
2445	bool Negate = false;
2446
2447	if (Cond.getOpcode() == ISD::SETCC &&
2448	Cond ->getOperand(Num: `0`)->getOpcode() == AMDGPUISD::SETCC) {
2449	SDValue VCMP = Cond ->getOperand(Num: `0`);
2450	auto CC = cast<CondCodeSDNode>(Val: Cond ->getOperand(Num: `2`))->get();
2451	if ((CC == ISD::SETEQ \|\| CC == ISD::SETNE) &&
2452	isNullConstant(V: Cond ->getOperand(Num: `1`)) &&
2453	// We may encounter ballot.i64 in wave32 mode on -O0.
2454	VCMP.getValueType().getSizeInBits() == ST->getWavefrontSize()) {
2455	// %VCMP = i(WaveSize) AMDGPUISD::SETCC ...
2456	// %C = i1 ISD::SETCC %VCMP, 0, setne/seteq
2457	// BRCOND i1 %C, %BB
2458	// =>
2459	// %VCMP = i(WaveSize) AMDGPUISD::SETCC ...
2460	// VCC = COPY i(WaveSize) %VCMP
2461	// S_CBRANCH_VCCNZ/VCCZ %BB
2462	Negate = CC == ISD::SETEQ;
2463	bool NegatedBallot = false;
2464	if (auto BallotCond = combineBallotPattern(VCMP, Negate&: NegatedBallot)) {
2465	Cond = BallotCond;
2466	UseSCCBr = !BallotCond ->isDivergent();
2467	Negate = Negate ^ NegatedBallot;
2468	} else {
2469	// TODO: don't use SCC here assuming that AMDGPUISD::SETCC is always
2470	// selected as V_CMP, but this may change for uniform condition.
2471	Cond = VCMP;
2472	UseSCCBr = false;
2473	}
2474	}
2475	// Cond is either V_CMP resulted from AMDGPUISD::SETCC or a combination of
2476	// V_CMPs resulted from ballot or ballot has uniform condition and SCC is
2477	// used.
2478	AndExec = false;
2479	}
2480
2481	unsigned BrOp =
2482	UseSCCBr ? (Negate ? AMDGPU::S_CBRANCH_SCC0 : AMDGPU::S_CBRANCH_SCC1)
2483	: (Negate ? AMDGPU::S_CBRANCH_VCCZ : AMDGPU::S_CBRANCH_VCCNZ);
2484	Register CondReg = UseSCCBr ? AMDGPU::SCC : TRI->getVCC();
2485	SDLoc SL(N);
2486
2487	if (AndExec) {
2488	// This is the case that we are selecting to S_CBRANCH_VCCNZ. We have not
2489	// analyzed what generates the vcc value, so we do not know whether vcc
2490	// bits for disabled lanes are 0. Thus we need to mask out bits for
2491	// disabled lanes.
2492	//
2493	// For the case that we select S_CBRANCH_SCC1 and it gets
2494	// changed to S_CBRANCH_VCCNZ in SIFixSGPRCopies, SIFixSGPRCopies calls
2495	// SIInstrInfo::moveToVALU which inserts the S_AND).
2496	//
2497	// We could add an analysis of what generates the vcc value here and omit
2498	// the S_AND when is unnecessary. But it would be better to add a separate
2499	// pass after SIFixSGPRCopies to do the unnecessary S_AND removal, so it
2500	// catches both cases.
2501	Cond = SDValue (CurDAG->getMachineNode(Opcode: ST->isWave32() ? AMDGPU::S_AND_B32
2502	: AMDGPU::S_AND_B64,
2503	dl: SL, VT: MVT::i1,
2504	Op1: CurDAG->getRegister(Reg: ST->isWave32() ? AMDGPU::EXEC_LO
2505	: AMDGPU::EXEC,
2506	VT: MVT::i1),
2507	Op2: Cond),
2508	`0`);
2509	}
2510
2511	SDValue VCC = CurDAG->getCopyToReg(Chain: N->getOperand(Num: `0`), dl: SL, Reg: CondReg, N: Cond);
2512	CurDAG->SelectNodeTo(N, MachineOpc: BrOp, VT: MVT::Other,
2513	Op1: N->getOperand(Num: `2`), // Basic Block
2514	Op2: VCC.getValue(R: `0`));
2515	}
2516
2517	void AMDGPUDAGToDAGISel::SelectFP_EXTEND(SDNode *N) {
2518	if (Subtarget->hasSALUFloatInsts() && N->getValueType(ResNo: `0`) == MVT::f32 &&
2519	!N->isDivergent()) {
2520	SDValue Src = N->getOperand(Num: `0`);
2521	if (Src.getValueType() == MVT::f16) {
2522	if (isExtractHiElt(In: Src, Out&: Src)) {
2523	CurDAG->SelectNodeTo(N, MachineOpc: AMDGPU::S_CVT_HI_F32_F16, VTs: N->getVTList(),
2524	Ops: {Src});
2525	return;
2526	}
2527	}
2528	}
2529
2530	SelectCode(N);
2531	}
2532
2533	void AMDGPUDAGToDAGISel::SelectDSAppendConsume(SDNode N, unsigned* IntrID) {
2534	// The address is assumed to be uniform, so if it ends up in a VGPR, it will
2535	// be copied to an SGPR with readfirstlane.
2536	unsigned Opc = IntrID == Intrinsic::amdgcn_ds_append ?
2537	AMDGPU::DS_APPEND : AMDGPU::DS_CONSUME;
2538
2539	SDValue Chain = N->getOperand(Num: `0`);
2540	SDValue Ptr = N->getOperand(Num: `2`);
2541	MemIntrinsicSDNode *M = cast<MemIntrinsicSDNode>(Val: N);
2542	MachineMemOperand *MMO = M->getMemOperand();
2543	bool IsGDS = M->getAddressSpace() == AMDGPUAS::REGION_ADDRESS;
2544
2545	SDValue Offset;
2546	if (CurDAG->isBaseWithConstantOffset(Op: Ptr)) {
2547	SDValue PtrBase = Ptr.getOperand(i: `0`);
2548	SDValue PtrOffset = Ptr.getOperand(i: `1`);
2549
2550	const APInt &OffsetVal = PtrOffset ->getAsAPIntVal();
2551	if (isDSOffsetLegal(Base: PtrBase, Offset: OffsetVal.getZExtValue())) {
2552	N = glueCopyToM0(N, Val: PtrBase);
2553	Offset = CurDAG->getTargetConstant(Val: OffsetVal, DL: SDLoc (), VT: MVT::i32);
2554	}
2555	}
2556
2557	if (!Offset) {
2558	N = glueCopyToM0(N, Val: Ptr);
2559	Offset = CurDAG->getTargetConstant(Val: `0`, DL: SDLoc (), VT: MVT::i32);
2560	}
2561
2562	SDValue Ops[] = {
2563	Offset,
2564	CurDAG->getTargetConstant(Val: IsGDS, DL: SDLoc (), VT: MVT::i32),
2565	Chain,
2566	N->getOperand(Num: N->getNumOperands() - `1`) // New glue
2567	};
2568
2569	SDNode *Selected = CurDAG->SelectNodeTo(N, MachineOpc: Opc, VTs: N->getVTList(), Ops);
2570	CurDAG->setNodeMemRefs(N: cast<MachineSDNode>(Val: Selected), NewMemRefs: {MMO});
2571	}
2572
2573	// We need to handle this here because tablegen doesn't support matching
2574	// instructions with multiple outputs.
2575	void AMDGPUDAGToDAGISel::SelectDSBvhStackIntrinsic(SDNode *N) {
2576	unsigned Opc = AMDGPU::DS_BVH_STACK_RTN_B32;
2577	SDValue Ops[] = {N->getOperand(Num: `2`), N->getOperand(Num: `3`), N->getOperand(Num: `4`),
2578	N->getOperand(Num: `5`), N->getOperand(Num: `0`)};
2579
2580	MemIntrinsicSDNode *M = cast<MemIntrinsicSDNode>(Val: N);
2581	MachineMemOperand *MMO = M->getMemOperand();
2582	SDNode *Selected = CurDAG->SelectNodeTo(N, MachineOpc: Opc, VTs: N->getVTList(), Ops);
2583	CurDAG->setNodeMemRefs(N: cast<MachineSDNode>(Val: Selected), NewMemRefs: {MMO});
2584	}
2585
2586	static unsigned gwsIntrinToOpcode(unsigned IntrID) {
2587	switch (IntrID) {
2588	case Intrinsic::amdgcn_ds_gws_init:
2589	return AMDGPU::DS_GWS_INIT;
2590	case Intrinsic::amdgcn_ds_gws_barrier:
2591	return AMDGPU::DS_GWS_BARRIER;
2592	case Intrinsic::amdgcn_ds_gws_sema_v:
2593	return AMDGPU::DS_GWS_SEMA_V;
2594	case Intrinsic::amdgcn_ds_gws_sema_br:
2595	return AMDGPU::DS_GWS_SEMA_BR;
2596	case Intrinsic::amdgcn_ds_gws_sema_p:
2597	return AMDGPU::DS_GWS_SEMA_P;
2598	case Intrinsic::amdgcn_ds_gws_sema_release_all:
2599	return AMDGPU::DS_GWS_SEMA_RELEASE_ALL;
2600	default:
2601	llvm_unreachable("not a gws intrinsic");
2602	}
2603	}
2604
2605	void AMDGPUDAGToDAGISel::SelectDS_GWS(SDNode N, unsigned* IntrID) {
2606	if (!Subtarget->hasGWS() \|\|
2607	(IntrID == Intrinsic::amdgcn_ds_gws_sema_release_all &&
2608	!Subtarget->hasGWSSemaReleaseAll())) {
2609	// Let this error.
2610	SelectCode(N);
2611	return;
2612	}
2613
2614	// Chain, intrinsic ID, vsrc, offset
2615	const bool HasVSrc = N->getNumOperands() == `4`;
2616	assert(HasVSrc \|\| N->getNumOperands() == `3`);
2617
2618	SDLoc SL(N);
2619	SDValue BaseOffset = N->getOperand(Num: HasVSrc ? `3` : `2`);
2620	int ImmOffset = `0`;
2621	MemIntrinsicSDNode *M = cast<MemIntrinsicSDNode>(Val: N);
2622	MachineMemOperand *MMO = M->getMemOperand();
2623
2624	// Don't worry if the offset ends up in a VGPR. Only one lane will have
2625	// effect, so SIFixSGPRCopies will validly insert readfirstlane.
2626
2627	// The resource id offset is computed as (<isa opaque base> + M0[21:16] +
2628	// offset field) % 64. Some versions of the programming guide omit the m0
2629	// part, or claim it's from offset 0.
2630	if (ConstantSDNode *ConstOffset = dyn_cast<ConstantSDNode>(Val&: BaseOffset)) {
2631	// If we have a constant offset, try to use the 0 in m0 as the base.
2632	// TODO: Look into changing the default m0 initialization value. If the
2633	// default -1 only set the low 16-bits, we could leave it as-is and add 1 to
2634	// the immediate offset.
2635	glueCopyToM0(N, Val: CurDAG->getTargetConstant(Val: `0`, DL: SL, VT: MVT::i32));
2636	ImmOffset = ConstOffset->getZExtValue();
2637	} else {
2638	if (CurDAG->isBaseWithConstantOffset(Op: BaseOffset)) {
2639	ImmOffset = BaseOffset.getConstantOperandVal(i: `1`);
2640	BaseOffset = BaseOffset.getOperand(i: `0`);
2641	}
2642
2643	// Prefer to do the shift in an SGPR since it should be possible to use m0
2644	// as the result directly. If it's already an SGPR, it will be eliminated
2645	// later.
2646	SDNode *SGPROffset
2647	= CurDAG->getMachineNode(Opcode: AMDGPU::V_READFIRSTLANE_B32, dl: SL, VT: MVT::i32,
2648	Op1: BaseOffset);
2649	// Shift to offset in m0
2650	SDNode *M0Base
2651	= CurDAG->getMachineNode(Opcode: AMDGPU::S_LSHL_B32, dl: SL, VT: MVT::i32,
2652	Op1: SDValue (SGPROffset, `0`),
2653	Op2: CurDAG->getTargetConstant(Val: `16`, DL: SL, VT: MVT::i32));
2654	glueCopyToM0(N, Val: SDValue (M0Base, `0`));
2655	}
2656
2657	SDValue Chain = N->getOperand(Num: `0`);
2658	SDValue OffsetField = CurDAG->getTargetConstant(Val: ImmOffset, DL: SL, VT: MVT::i32);
2659
2660	const unsigned Opc = gwsIntrinToOpcode(IntrID);
2661	SmallVector<SDValue, `5`> Ops;
2662	if (HasVSrc)
2663	Ops.push_back(Elt: N->getOperand(Num: `2`));
2664	Ops.push_back(Elt: OffsetField);
2665	Ops.push_back(Elt: Chain);
2666
2667	SDNode *Selected = CurDAG->SelectNodeTo(N, MachineOpc: Opc, VTs: N->getVTList(), Ops);
2668	CurDAG->setNodeMemRefs(N: cast<MachineSDNode>(Val: Selected), NewMemRefs: {MMO});
2669	}
2670
2671	void AMDGPUDAGToDAGISel::SelectInterpP1F16(SDNode *N) {
2672	if (Subtarget->getLDSBankCount() != `16`) {
2673	// This is a single instruction with a pattern.
2674	SelectCode(N);
2675	return;
2676	}
2677
2678	SDLoc DL(N);
2679
2680	// This requires 2 instructions. It is possible to write a pattern to support
2681	// this, but the generated isel emitter doesn't correctly deal with multiple
2682	// output instructions using the same physical register input. The copy to m0
2683	// is incorrectly placed before the second instruction.
2684	//
2685	// TODO: Match source modifiers.
2686	//
2687	// def : Pat <
2688	// (int_amdgcn_interp_p1_f16
2689	// (VOP3Mods f32:$src0, i32:$src0_modifiers),
2690	// (i32 timm:$attrchan), (i32 timm:$attr),
2691	// (i1 timm:$high), M0),
2692	// (V_INTERP_P1LV_F16 $src0_modifiers, VGPR_32:$src0, timm:$attr,
2693	// timm:$attrchan, 0,
2694	// (V_INTERP_MOV_F32 2, timm:$attr, timm:$attrchan), timm:$high)> {
2695	// let Predicates = [has16BankLDS];
2696	// }
2697
2698	// 16 bank LDS
2699	SDValue ToM0 = CurDAG->getCopyToReg(Chain: CurDAG->getEntryNode(), dl: DL, Reg: AMDGPU::M0,
2700	N: N->getOperand(Num: `5`), Glue: SDValue ());
2701
2702	SDVTList VTs = CurDAG->getVTList(VT1: MVT::f32, VT2: MVT::Other);
2703
2704	SDNode *InterpMov =
2705	CurDAG->getMachineNode(Opcode: AMDGPU::V_INTERP_MOV_F32, dl: DL, VTs, Ops: {
2706	CurDAG->getTargetConstant(Val: `2`, DL, VT: MVT::i32), // P0
2707	N->getOperand(Num: `3`), // Attr
2708	N->getOperand(Num: `2`), // Attrchan
2709	ToM0.getValue(R: `1`) // In glue
2710	});
2711
2712	SDNode *InterpP1LV =
2713	CurDAG->getMachineNode(Opcode: AMDGPU::V_INTERP_P1LV_F16, dl: DL, VT: MVT::f32, Ops: {
2714	CurDAG->getTargetConstant(Val: `0`, DL, VT: MVT::i32), // $src0_modifiers
2715	N->getOperand(Num: `1`), // Src0
2716	N->getOperand(Num: `3`), // Attr
2717	N->getOperand(Num: `2`), // Attrchan
2718	CurDAG->getTargetConstant(Val: `0`, DL, VT: MVT::i32), // $src2_modifiers
2719	SDValue (InterpMov, `0`), // Src2 - holds two f16 values selected by high
2720	N->getOperand(Num: `4`), // high
2721	CurDAG->getTargetConstant(Val: `0`, DL, VT: MVT::i1), // $clamp
2722	CurDAG->getTargetConstant(Val: `0`, DL, VT: MVT::i32), // $omod
2723	SDValue (InterpMov, `1`)
2724	});
2725
2726	CurDAG->ReplaceAllUsesOfValueWith(From: SDValue (N, `0`), To: SDValue (InterpP1LV, `0`));
2727	}
2728
2729	void AMDGPUDAGToDAGISel::SelectINTRINSIC_W_CHAIN(SDNode *N) {
2730	unsigned IntrID = N->getConstantOperandVal(Num: `1`);
2731	switch (IntrID) {
2732	case Intrinsic::amdgcn_ds_append:
2733	case Intrinsic::amdgcn_ds_consume: {
2734	if (N->getValueType(ResNo: `0`) != MVT::i32)
2735	break;
2736	SelectDSAppendConsume(N, IntrID);
2737	return;
2738	}
2739	case Intrinsic::amdgcn_ds_bvh_stack_rtn:
2740	SelectDSBvhStackIntrinsic(N);
2741	return;
2742	}
2743
2744	SelectCode(N);
2745	}
2746
2747	void AMDGPUDAGToDAGISel::SelectINTRINSIC_WO_CHAIN(SDNode *N) {
2748	unsigned IntrID = N->getConstantOperandVal(Num: `0`);
2749	unsigned Opcode = AMDGPU::INSTRUCTION_LIST_END;
2750	SDNode *ConvGlueNode = N->getGluedNode();
2751	if (ConvGlueNode) {
2752	// FIXME: Possibly iterate over multiple glue nodes?
2753	assert(ConvGlueNode->getOpcode() == ISD::CONVERGENCECTRL_GLUE);
2754	ConvGlueNode = ConvGlueNode->getOperand(Num: `0`).getNode();
2755	ConvGlueNode =
2756	CurDAG->getMachineNode(Opcode: TargetOpcode::CONVERGENCECTRL_GLUE, dl: {},
2757	VT: MVT::Glue, Op1: SDValue (ConvGlueNode, `0`));
2758	} else {
2759	ConvGlueNode = nullptr;
2760	}
2761	switch (IntrID) {
2762	case Intrinsic::amdgcn_wqm:
2763	Opcode = AMDGPU::WQM;
2764	break;
2765	case Intrinsic::amdgcn_softwqm:
2766	Opcode = AMDGPU::SOFT_WQM;
2767	break;
2768	case Intrinsic::amdgcn_wwm:
2769	case Intrinsic::amdgcn_strict_wwm:
2770	Opcode = AMDGPU::STRICT_WWM;
2771	break;
2772	case Intrinsic::amdgcn_strict_wqm:
2773	Opcode = AMDGPU::STRICT_WQM;
2774	break;
2775	case Intrinsic::amdgcn_interp_p1_f16:
2776	SelectInterpP1F16(N);
2777	return;
2778	default:
2779	SelectCode(N);
2780	break;
2781	}
2782
2783	if (Opcode != AMDGPU::INSTRUCTION_LIST_END) {
2784	SDValue Src = N->getOperand(Num: `1`);
2785	CurDAG->SelectNodeTo(N, MachineOpc: Opcode, VTs: N->getVTList(), Ops: {Src});
2786	}
2787
2788	if (ConvGlueNode) {
2789	SmallVector<SDValue, `4`> NewOps(N->op_begin(), N->op_end());
2790	NewOps.push_back(Elt: SDValue (ConvGlueNode, `0`));
2791	CurDAG->MorphNodeTo(N, Opc: N->getOpcode(), VTs: N->getVTList(), Ops: NewOps);
2792	}
2793	}
2794
2795	void AMDGPUDAGToDAGISel::SelectINTRINSIC_VOID(SDNode *N) {
2796	unsigned IntrID = N->getConstantOperandVal(Num: `1`);
2797	switch (IntrID) {
2798	case Intrinsic::amdgcn_ds_gws_init:
2799	case Intrinsic::amdgcn_ds_gws_barrier:
2800	case Intrinsic::amdgcn_ds_gws_sema_v:
2801	case Intrinsic::amdgcn_ds_gws_sema_br:
2802	case Intrinsic::amdgcn_ds_gws_sema_p:
2803	case Intrinsic::amdgcn_ds_gws_sema_release_all:
2804	SelectDS_GWS(N, IntrID);
2805	return;
2806	default:
2807	break;
2808	}
2809
2810	SelectCode(N);
2811	}
2812
2813	void AMDGPUDAGToDAGISel::SelectWAVE_ADDRESS(SDNode *N) {
2814	SDValue Log2WaveSize =
2815	CurDAG->getTargetConstant(Val: Subtarget->getWavefrontSizeLog2(), DL: SDLoc (N), VT: MVT::i32);
2816	CurDAG->SelectNodeTo(N, MachineOpc: AMDGPU::S_LSHR_B32, VTs: N->getVTList(),
2817	Ops: {N->getOperand(Num: `0`), Log2WaveSize});
2818	}
2819
2820	void AMDGPUDAGToDAGISel::SelectSTACKRESTORE(SDNode *N) {
2821	SDValue SrcVal = N->getOperand(Num: `1`);
2822	if (SrcVal.getValueType() != MVT::i32) {
2823	SelectCode(N); // Emit default error
2824	return;
2825	}
2826
2827	SDValue CopyVal;
2828	Register SP = TLI->getStackPointerRegisterToSaveRestore();
2829	SDLoc SL(N);
2830
2831	if (SrcVal.getOpcode() == AMDGPUISD::WAVE_ADDRESS) {
2832	CopyVal = SrcVal.getOperand(i: `0`);
2833	} else {
2834	SDValue Log2WaveSize = CurDAG->getTargetConstant(
2835	Val: Subtarget->getWavefrontSizeLog2(), DL: SL, VT: MVT::i32);
2836
2837	if (N->isDivergent()) {
2838	SrcVal = SDValue (CurDAG->getMachineNode(Opcode: AMDGPU::V_READFIRSTLANE_B32, dl: SL,
2839	VT: MVT::i32, Op1: SrcVal),
2840	`0`);
2841	}
2842
2843	CopyVal = SDValue (CurDAG->getMachineNode(Opcode: AMDGPU::S_LSHL_B32, dl: SL, VT: MVT::i32,
2844	Ops: {SrcVal, Log2WaveSize}),
2845	`0`);
2846	}
2847
2848	SDValue CopyToSP = CurDAG->getCopyToReg(Chain: N->getOperand(Num: `0`), dl: SL, Reg: SP, N: CopyVal);
2849	CurDAG->ReplaceAllUsesOfValueWith(From: SDValue (N, `0`), To: CopyToSP);
2850	}
2851
2852	bool AMDGPUDAGToDAGISel::SelectVOP3ModsImpl(SDValue In, SDValue &Src,
2853	unsigned &Mods,
2854	bool IsCanonicalizing,
2855	bool AllowAbs) const {
2856	Mods = SISrcMods::NONE;
2857	Src = In;
2858
2859	if (Src.getOpcode() == ISD::FNEG) {
2860	Mods \|= SISrcMods::NEG;
2861	Src = Src.getOperand(i: `0`);
2862	} else if (Src.getOpcode() == ISD::FSUB && IsCanonicalizing) {
2863	// Fold fsub [+-]0 into fneg. This may not have folded depending on the
2864	// denormal mode, but we're implicitly canonicalizing in a source operand.
2865	auto *LHS = dyn_cast<ConstantFPSDNode>(Val: Src.getOperand(i: `0`));
2866	if (LHS && LHS->isZero()) {
2867	Mods \|= SISrcMods::NEG;
2868	Src = Src.getOperand(i: `1`);
2869	}
2870	}
2871
2872	if (AllowAbs && Src.getOpcode() == ISD::FABS) {
2873	Mods \|= SISrcMods::ABS;
2874	Src = Src.getOperand(i: `0`);
2875	}
2876
2877	return true;
2878	}
2879
2880	bool AMDGPUDAGToDAGISel::SelectVOP3Mods(SDValue In, SDValue &Src,
2881	SDValue &SrcMods) const {
2882	unsigned Mods;
2883	if (SelectVOP3ModsImpl(In, Src, Mods, /IsCanonicalizing=/true,
2884	/AllowAbs=/true)) {
2885	SrcMods = CurDAG->getTargetConstant(Val: Mods, DL: SDLoc (In), VT: MVT::i32);
2886	return true;
2887	}
2888
2889	return false;
2890	}
2891
2892	bool AMDGPUDAGToDAGISel::SelectVOP3ModsNonCanonicalizing(
2893	SDValue In, SDValue &Src, SDValue &SrcMods) const {
2894	unsigned Mods;
2895	if (SelectVOP3ModsImpl(In, Src, Mods, /IsCanonicalizing=/false,
2896	/AllowAbs=/true)) {
2897	SrcMods = CurDAG->getTargetConstant(Val: Mods, DL: SDLoc (In), VT: MVT::i32);
2898	return true;
2899	}
2900
2901	return false;
2902	}
2903
2904	bool AMDGPUDAGToDAGISel::SelectVOP3BMods(SDValue In, SDValue &Src,
2905	SDValue &SrcMods) const {
2906	unsigned Mods;
2907	if (SelectVOP3ModsImpl(In, Src, Mods,
2908	/IsCanonicalizing=/true,
2909	/AllowAbs=/false)) {
2910	SrcMods = CurDAG->getTargetConstant(Val: Mods, DL: SDLoc (In), VT: MVT::i32);
2911	return true;
2912	}
2913
2914	return false;
2915	}
2916
2917	bool AMDGPUDAGToDAGISel::SelectVOP3NoMods(SDValue In, SDValue &Src) const {
2918	if (In.getOpcode() == ISD::FABS \|\| In.getOpcode() == ISD::FNEG)
2919	return false;
2920
2921	Src = In;
2922	return true;
2923	}
2924
2925	bool AMDGPUDAGToDAGISel::SelectVINTERPModsImpl(SDValue In, SDValue &Src,
2926	SDValue &SrcMods,
2927	bool OpSel) const {
2928	unsigned Mods;
2929	if (SelectVOP3ModsImpl(In, Src, Mods,
2930	/IsCanonicalizing=/true,
2931	/AllowAbs=/false)) {
2932	if (OpSel)
2933	Mods \|= SISrcMods::OP_SEL_0;
2934	SrcMods = CurDAG->getTargetConstant(Val: Mods, DL: SDLoc (In), VT: MVT::i32);
2935	return true;
2936	}
2937
2938	return false;
2939	}
2940
2941	bool AMDGPUDAGToDAGISel::SelectVINTERPMods(SDValue In, SDValue &Src,
2942	SDValue &SrcMods) const {
2943	return SelectVINTERPModsImpl(In, Src, SrcMods, / OpSel / false);
2944	}
2945
2946	bool AMDGPUDAGToDAGISel::SelectVINTERPModsHi(SDValue In, SDValue &Src,
2947	SDValue &SrcMods) const {
2948	return SelectVINTERPModsImpl(In, Src, SrcMods, / OpSel / true);
2949	}
2950
2951	bool AMDGPUDAGToDAGISel::SelectVOP3Mods0(SDValue In, SDValue &Src,
2952	SDValue &SrcMods, SDValue &Clamp,
2953	SDValue &Omod) const {
2954	SDLoc DL(In);
2955	Clamp = CurDAG->getTargetConstant(Val: `0`, DL, VT: MVT::i1);
2956	Omod = CurDAG->getTargetConstant(Val: `0`, DL, VT: MVT::i1);
2957
2958	return SelectVOP3Mods(In, Src, SrcMods);
2959	}
2960
2961	bool AMDGPUDAGToDAGISel::SelectVOP3BMods0(SDValue In, SDValue &Src,
2962	SDValue &SrcMods, SDValue &Clamp,
2963	SDValue &Omod) const {
2964	SDLoc DL(In);
2965	Clamp = CurDAG->getTargetConstant(Val: `0`, DL, VT: MVT::i1);
2966	Omod = CurDAG->getTargetConstant(Val: `0`, DL, VT: MVT::i1);
2967
2968	return SelectVOP3BMods(In, Src, SrcMods);
2969	}
2970
2971	bool AMDGPUDAGToDAGISel::SelectVOP3OMods(SDValue In, SDValue &Src,
2972	SDValue &Clamp, SDValue &Omod) const {
2973	Src = In;
2974
2975	SDLoc DL(In);
2976	Clamp = CurDAG->getTargetConstant(Val: `0`, DL, VT: MVT::i1);
2977	Omod = CurDAG->getTargetConstant(Val: `0`, DL, VT: MVT::i1);
2978
2979	return true;
2980	}
2981
2982	bool AMDGPUDAGToDAGISel::SelectVOP3PMods(SDValue In, SDValue &Src,
2983	SDValue &SrcMods, bool IsDOT) const {
2984	unsigned Mods = SISrcMods::NONE;
2985	Src = In;
2986
2987	// TODO: Handle G_FSUB 0 as fneg
2988	if (Src.getOpcode() == ISD::FNEG) {
2989	Mods ^= (SISrcMods::NEG \| SISrcMods::NEG_HI);
2990	Src = Src.getOperand(i: `0`);
2991	}
2992
2993	if (Src.getOpcode() == ISD::BUILD_VECTOR && Src.getNumOperands() == `2` &&
2994	(!IsDOT \|\| !Subtarget->hasDOTOpSelHazard())) {
2995	unsigned VecMods = Mods;
2996
2997	SDValue Lo = stripBitcast(Val: Src.getOperand(i: `0`));
2998	SDValue Hi = stripBitcast(Val: Src.getOperand(i: `1`));
2999
3000	if (Lo.getOpcode() == ISD::FNEG) {
3001	Lo = stripBitcast(Val: Lo.getOperand(i: `0`));
3002	Mods ^= SISrcMods::NEG;
3003	}
3004
3005	if (Hi.getOpcode() == ISD::FNEG) {
3006	Hi = stripBitcast(Val: Hi.getOperand(i: `0`));
3007	Mods ^= SISrcMods::NEG_HI;
3008	}
3009
3010	if (isExtractHiElt(In: Lo, Out&: Lo))
3011	Mods \|= SISrcMods::OP_SEL_0;
3012
3013	if (isExtractHiElt(In: Hi, Out&: Hi))
3014	Mods \|= SISrcMods::OP_SEL_1;
3015
3016	unsigned VecSize = Src.getValueSizeInBits();
3017	Lo = stripExtractLoElt(In: Lo);
3018	Hi = stripExtractLoElt(In: Hi);
3019
3020	if (Lo.getValueSizeInBits() > VecSize) {
3021	Lo = CurDAG->getTargetExtractSubreg(
3022	SRIdx: (VecSize > `32`) ? AMDGPU::sub0_sub1 : AMDGPU::sub0, DL: SDLoc (In),
3023	VT: MVT::getIntegerVT(BitWidth: VecSize), Operand: Lo);
3024	}
3025
3026	if (Hi.getValueSizeInBits() > VecSize) {
3027	Hi = CurDAG->getTargetExtractSubreg(
3028	SRIdx: (VecSize > `32`) ? AMDGPU::sub0_sub1 : AMDGPU::sub0, DL: SDLoc (In),
3029	VT: MVT::getIntegerVT(BitWidth: VecSize), Operand: Hi);
3030	}
3031
3032	assert(Lo.getValueSizeInBits() <= VecSize &&
3033	Hi.getValueSizeInBits() <= VecSize);
3034
3035	if (Lo == Hi && !isInlineImmediate(N: Lo.getNode())) {
3036	// Really a scalar input. Just select from the low half of the register to
3037	// avoid packing.
3038
3039	if (VecSize == `32` \|\| VecSize == Lo.getValueSizeInBits()) {
3040	Src = Lo;
3041	} else {
3042	assert(Lo.getValueSizeInBits() == `32` && VecSize == `64`);
3043
3044	SDLoc SL(In);
3045	SDValue Undef = SDValue (
3046	CurDAG->getMachineNode(Opcode: TargetOpcode::IMPLICIT_DEF, dl: SL,
3047	VT: Lo.getValueType()), `0`);
3048	auto RC = Lo ->isDivergent() ? AMDGPU::VReg_64RegClassID
3049	: AMDGPU::SReg_64RegClassID;
3050	const SDValue Ops[] = {
3051	CurDAG->getTargetConstant(Val: RC, DL: SL, VT: MVT::i32),
3052	Lo, CurDAG->getTargetConstant(Val: AMDGPU::sub0, DL: SL, VT: MVT::i32),
3053	Undef, CurDAG->getTargetConstant(Val: AMDGPU::sub1, DL: SL, VT: MVT::i32) };
3054
3055	Src = SDValue (CurDAG->getMachineNode(Opcode: TargetOpcode::REG_SEQUENCE, dl: SL,
3056	VT: Src.getValueType(), Ops), `0`);
3057	}
3058	SrcMods = CurDAG->getTargetConstant(Val: Mods, DL: SDLoc (In), VT: MVT::i32);
3059	return true;
3060	}
3061
3062	if (VecSize == `64` && Lo == Hi && isa<ConstantFPSDNode>(Val: Lo)) {
3063	uint64_t Lit = cast<ConstantFPSDNode>(Val&: Lo)->getValueAPF()
3064	.bitcastToAPInt().getZExtValue();
3065	if (AMDGPU::isInlinableLiteral32(Literal: Lit, HasInv2Pi: Subtarget->hasInv2PiInlineImm())) {
3066	Src = CurDAG->getTargetConstant(Val: Lit, DL: SDLoc (In), VT: MVT::i64);
3067	SrcMods = CurDAG->getTargetConstant(Val: Mods, DL: SDLoc (In), VT: MVT::i32);
3068	return true;
3069	}
3070	}
3071
3072	Mods = VecMods;
3073	}
3074
3075	// Packed instructions do not have abs modifiers.
3076	Mods \|= SISrcMods::OP_SEL_1;
3077
3078	SrcMods = CurDAG->getTargetConstant(Val: Mods, DL: SDLoc (In), VT: MVT::i32);
3079	return true;
3080	}
3081
3082	bool AMDGPUDAGToDAGISel::SelectVOP3PModsDOT(SDValue In, SDValue &Src,
3083	SDValue &SrcMods) const {
3084	return SelectVOP3PMods(In, Src, SrcMods, IsDOT: true);
3085	}
3086
3087	bool AMDGPUDAGToDAGISel::SelectVOP3PModsNeg(SDValue In, SDValue &Src) const {
3088	const ConstantSDNode *C = cast<ConstantSDNode>(Val&: In);
3089	// Literal i1 value set in intrinsic, represents SrcMods for the next operand.
3090	// 1 promotes packed values to signed, 0 treats them as unsigned.
3091	assert(C->getAPIntValue().getBitWidth() == `1` && "expected i1 value");
3092
3093	unsigned Mods = SISrcMods::OP_SEL_1;
3094	unsigned SrcSign = C->getZExtValue();
3095	if (SrcSign == `1`)
3096	Mods ^= SISrcMods::NEG;
3097
3098	Src = CurDAG->getTargetConstant(Val: Mods, DL: SDLoc (In), VT: MVT::i32);
3099	return true;
3100	}
3101
3102	bool AMDGPUDAGToDAGISel::SelectWMMAOpSelVOP3PMods(SDValue In,
3103	SDValue &Src) const {
3104	const ConstantSDNode *C = cast<ConstantSDNode>(Val&: In);
3105	assert(C->getAPIntValue().getBitWidth() == `1` && "expected i1 value");
3106
3107	unsigned Mods = SISrcMods::OP_SEL_1;
3108	unsigned SrcVal = C->getZExtValue();
3109	if (SrcVal == `1`)
3110	Mods \|= SISrcMods::OP_SEL_0;
3111
3112	Src = CurDAG->getTargetConstant(Val: Mods, DL: SDLoc (In), VT: MVT::i32);
3113	return true;
3114	}
3115
3116	static MachineSDNode *buildRegSequence32(SmallVectorImpl<SDValue> &Elts,
3117	llvm::SelectionDAG *CurDAG,
3118	const SDLoc &DL) {
3119	unsigned DstRegClass;
3120	EVT DstTy;
3121	switch (Elts.size()) {
3122	case `8`:
3123	DstRegClass = AMDGPU::VReg_256RegClassID;
3124	DstTy = MVT::v8i32;
3125	break;
3126	case `4`:
3127	DstRegClass = AMDGPU::VReg_128RegClassID;
3128	DstTy = MVT::v4i32;
3129	break;
3130	case `2`:
3131	DstRegClass = AMDGPU::VReg_64RegClassID;
3132	DstTy = MVT::v2i32;
3133	break;
3134	default:
3135	llvm_unreachable("unhandled Reg sequence size");
3136	}
3137
3138	SmallVector<SDValue, `17`> Ops;
3139	Ops.push_back(Elt: CurDAG->getTargetConstant(Val: DstRegClass, DL, VT: MVT::i32));
3140	for (unsigned i = `0`; i < Elts.size(); ++i) {
3141	Ops.push_back(Elt: Elts [i]);
3142	Ops.push_back(Elt: CurDAG->getTargetConstant(
3143	Val: SIRegisterInfo::getSubRegFromChannel(Channel: i), DL, VT: MVT::i32));
3144	}
3145	return CurDAG->getMachineNode(Opcode: TargetOpcode::REG_SEQUENCE, dl: DL, VT: DstTy, Ops);
3146	}
3147
3148	static MachineSDNode *buildRegSequence16(SmallVectorImpl<SDValue> &Elts,
3149	llvm::SelectionDAG *CurDAG,
3150	const SDLoc &DL) {
3151	SmallVector<SDValue, `8`> PackedElts;
3152	assert("unhandled Reg sequence size" &&
3153	(Elts.size() == `8` \|\| Elts.size() == `16`));
3154
3155	// Pack 16-bit elements in pairs into 32-bit register. If both elements are
3156	// unpacked from 32-bit source use it, otherwise pack them using v_perm.
3157	for (unsigned i = `0`; i < Elts.size(); i += `2`) {
3158	SDValue LoSrc = stripExtractLoElt(In: stripBitcast(Val: Elts [i]));
3159	SDValue HiSrc;
3160	if (isExtractHiElt(In: Elts [i + `1`], Out&: HiSrc) && LoSrc == HiSrc) {
3161	PackedElts.push_back(Elt: HiSrc);
3162	} else {
3163	SDValue PackLoLo = CurDAG->getTargetConstant(Val: `0x05040100`, DL, VT: MVT::i32);
3164	MachineSDNode *Packed =
3165	CurDAG->getMachineNode(Opcode: AMDGPU::V_PERM_B32_e64, dl: DL, VT: MVT::i32,
3166	Ops: {Elts [i + `1`], Elts [i], PackLoLo});
3167	PackedElts.push_back(Elt: SDValue (Packed, `0`));
3168	}
3169	}
3170
3171	return buildRegSequence32(Elts&: PackedElts, CurDAG, DL);
3172	}
3173
3174	static MachineSDNode *buildRegSequence(SmallVectorImpl<SDValue> &Elts,
3175	llvm::SelectionDAG *CurDAG,
3176	const SDLoc &DL, unsigned ElementSize) {
3177	if (ElementSize == `16`)
3178	return buildRegSequence16(Elts, CurDAG, DL);
3179	if (ElementSize == `32`)
3180	return buildRegSequence32(Elts, CurDAG, DL);
3181	llvm_unreachable("Unhandled element size");
3182	}
3183
3184	static void selectWMMAModsNegAbs(unsigned ModOpcode, unsigned &Mods,
3185	SmallVectorImpl<SDValue> &Elts, SDValue &Src,
3186	llvm::SelectionDAG CurDAG, const* SDLoc &DL,
3187	unsigned ElementSize) {
3188	if (ModOpcode == ISD::FNEG) {
3189	Mods \|= SISrcMods::NEG;
3190	// Check if all elements also have abs modifier
3191	SmallVector<SDValue, `8`> NegAbsElts;
3192	for (auto El : Elts) {
3193	if (El.getOpcode() != ISD::FABS)
3194	break;
3195	NegAbsElts.push_back(Elt: El ->getOperand(Num: `0`));
3196	}
3197	if (Elts.size() != NegAbsElts.size()) {
3198	// Neg
3199	Src = SDValue (buildRegSequence(Elts, CurDAG, DL, ElementSize), `0`);
3200	} else {
3201	// Neg and Abs
3202	Mods \|= SISrcMods::NEG_HI;
3203	Src = SDValue (buildRegSequence(Elts&: NegAbsElts, CurDAG, DL, ElementSize), `0`);
3204	}
3205	} else {
3206	assert(ModOpcode == ISD::FABS);
3207	// Abs
3208	Mods \|= SISrcMods::NEG_HI;
3209	Src = SDValue (buildRegSequence(Elts, CurDAG, DL, ElementSize), `0`);
3210	}
3211	}
3212
3213	// Check all f16 elements for modifiers while looking through b32 and v2b16
3214	// build vector, stop if element does not satisfy ModifierCheck.
3215	static void
3216	checkWMMAElementsModifiersF16(BuildVectorSDNode *BV,
3217	std::function<bool(SDValue)> ModifierCheck) {
3218	for (unsigned i = `0`; i < BV->getNumOperands(); ++i) {
3219	if (auto *F16Pair =
3220	dyn_cast<BuildVectorSDNode>(Val: stripBitcast(Val: BV->getOperand(Num: i)))) {
3221	for (unsigned i = `0`; i < F16Pair->getNumOperands(); ++i) {
3222	SDValue ElF16 = stripBitcast(Val: F16Pair->getOperand(Num: i));
3223	if (!ModifierCheck (ElF16))
3224	break;
3225	}
3226	}
3227	}
3228	}
3229
3230	bool AMDGPUDAGToDAGISel::SelectWMMAModsF16Neg(SDValue In, SDValue &Src,
3231	SDValue &SrcMods) const {
3232	Src = In;
3233	unsigned Mods = SISrcMods::OP_SEL_1;
3234
3235	// mods are on f16 elements
3236	if (auto *BV = dyn_cast<BuildVectorSDNode>(Val: stripBitcast(Val: In))) {
3237	SmallVector<SDValue, `8`> EltsF16;
3238
3239	checkWMMAElementsModifiersF16(BV, ModifierCheck: [&](SDValue Element) -> bool {
3240	if (Element.getOpcode() != ISD::FNEG)
3241	return false;
3242	EltsF16.push_back(Elt: Element.getOperand(i: `0`));
3243	return true;
3244	});
3245
3246	// All elements have neg modifier
3247	if (BV->getNumOperands() * `2` == EltsF16.size()) {
3248	Src = SDValue (buildRegSequence16(Elts&: EltsF16, CurDAG, DL: SDLoc (In)), `0`);
3249	Mods \|= SISrcMods::NEG;
3250	Mods \|= SISrcMods::NEG_HI;
3251	}
3252	}
3253
3254	// mods are on v2f16 elements
3255	if (auto *BV = dyn_cast<BuildVectorSDNode>(Val: stripBitcast(Val: In))) {
3256	SmallVector<SDValue, `8`> EltsV2F16;
3257	for (unsigned i = `0`; i < BV->getNumOperands(); ++i) {
3258	SDValue ElV2f16 = stripBitcast(Val: BV->getOperand(Num: i));
3259	// Based on first element decide which mod we match, neg or abs
3260	if (ElV2f16.getOpcode() != ISD::FNEG)
3261	break;
3262	EltsV2F16.push_back(Elt: ElV2f16.getOperand(i: `0`));
3263	}
3264
3265	// All pairs of elements have neg modifier
3266	if (BV->getNumOperands() == EltsV2F16.size()) {
3267	Src = SDValue (buildRegSequence32(Elts&: EltsV2F16, CurDAG, DL: SDLoc (In)), `0`);
3268	Mods \|= SISrcMods::NEG;
3269	Mods \|= SISrcMods::NEG_HI;
3270	}
3271	}
3272
3273	SrcMods = CurDAG->getTargetConstant(Val: Mods, DL: SDLoc (In), VT: MVT::i32);
3274	return true;
3275	}
3276
3277	bool AMDGPUDAGToDAGISel::SelectWMMAModsF16NegAbs(SDValue In, SDValue &Src,
3278	SDValue &SrcMods) const {
3279	Src = In;
3280	unsigned Mods = SISrcMods::OP_SEL_1;
3281	unsigned ModOpcode;
3282
3283	// mods are on f16 elements
3284	if (auto *BV = dyn_cast<BuildVectorSDNode>(Val: stripBitcast(Val: In))) {
3285	SmallVector<SDValue, `8`> EltsF16;
3286	checkWMMAElementsModifiersF16(BV, ModifierCheck: [&](SDValue ElF16) -> bool {
3287	// Based on first element decide which mod we match, neg or abs
3288	if (EltsF16.empty())
3289	ModOpcode = (ElF16.getOpcode() == ISD::FNEG) ? ISD::FNEG : ISD::FABS;
3290	if (ElF16.getOpcode() != ModOpcode)
3291	return false;
3292	EltsF16.push_back(Elt: ElF16.getOperand(i: `0`));
3293	return true;
3294	});
3295
3296	// All elements have ModOpcode modifier
3297	if (BV->getNumOperands() * `2` == EltsF16.size())
3298	selectWMMAModsNegAbs(ModOpcode, Mods, Elts&: EltsF16, Src, CurDAG, DL: SDLoc (In),
3299	ElementSize: `16`);
3300	}
3301
3302	// mods are on v2f16 elements
3303	if (auto *BV = dyn_cast<BuildVectorSDNode>(Val: stripBitcast(Val: In))) {
3304	SmallVector<SDValue, `8`> EltsV2F16;
3305
3306	for (unsigned i = `0`; i < BV->getNumOperands(); ++i) {
3307	SDValue ElV2f16 = stripBitcast(Val: BV->getOperand(Num: i));
3308	// Based on first element decide which mod we match, neg or abs
3309	if (EltsV2F16.empty())
3310	ModOpcode = (ElV2f16.getOpcode() == ISD::FNEG) ? ISD::FNEG : ISD::FABS;
3311	if (ElV2f16 ->getOpcode() != ModOpcode)
3312	break;
3313	EltsV2F16.push_back(Elt: ElV2f16 ->getOperand(Num: `0`));
3314	}
3315
3316	// All elements have ModOpcode modifier
3317	if (BV->getNumOperands() == EltsV2F16.size())
3318	selectWMMAModsNegAbs(ModOpcode, Mods, Elts&: EltsV2F16, Src, CurDAG, DL: SDLoc (In),
3319	ElementSize: `32`);
3320	}
3321
3322	SrcMods = CurDAG->getTargetConstant(Val: Mods, DL: SDLoc (In), VT: MVT::i32);
3323	return true;
3324	}
3325
3326	bool AMDGPUDAGToDAGISel::SelectWMMAModsF32NegAbs(SDValue In, SDValue &Src,
3327	SDValue &SrcMods) const {
3328	Src = In;
3329	unsigned Mods = SISrcMods::OP_SEL_1;
3330	SmallVector<SDValue, `8`> EltsF32;
3331
3332	if (auto *BV = dyn_cast<BuildVectorSDNode>(Val: stripBitcast(Val: In))) {
3333	assert(BV->getNumOperands() > `0`);
3334	// Based on first element decide which mod we match, neg or abs
3335	SDValue ElF32 = stripBitcast(Val: BV->getOperand(Num: `0`));
3336	unsigned ModOpcode =
3337	(ElF32.getOpcode() == ISD::FNEG) ? ISD::FNEG : ISD::FABS;
3338	for (unsigned i = `0`; i < BV->getNumOperands(); ++i) {
3339	SDValue ElF32 = stripBitcast(Val: BV->getOperand(Num: i));
3340	if (ElF32.getOpcode() != ModOpcode)
3341	break;
3342	EltsF32.push_back(Elt: ElF32.getOperand(i: `0`));
3343	}
3344
3345	// All elements had ModOpcode modifier
3346	if (BV->getNumOperands() == EltsF32.size())
3347	selectWMMAModsNegAbs(ModOpcode, Mods, Elts&: EltsF32, Src, CurDAG, DL: SDLoc (In),
3348	ElementSize: `32`);
3349	}
3350
3351	SrcMods = CurDAG->getTargetConstant(Val: Mods, DL: SDLoc (In), VT: MVT::i32);
3352	return true;
3353	}
3354
3355	bool AMDGPUDAGToDAGISel::SelectWMMAVISrc(SDValue In, SDValue &Src) const {
3356	if (auto *BV = dyn_cast<BuildVectorSDNode>(Val&: In)) {
3357	BitVector UndefElements;
3358	if (SDValue Splat = BV->getSplatValue(UndefElements: &UndefElements))
3359	if (isInlineImmediate(N: Splat.getNode())) {
3360	if (const ConstantSDNode *C = dyn_cast<ConstantSDNode>(Val&: Splat)) {
3361	unsigned Imm = C->getAPIntValue().getSExtValue();
3362	Src = CurDAG->getTargetConstant(Val: Imm, DL: SDLoc (In), VT: MVT::i32);
3363	return true;
3364	}
3365	if (const ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(Val&: Splat)) {
3366	unsigned Imm = C->getValueAPF().bitcastToAPInt().getSExtValue();
3367	Src = CurDAG->getTargetConstant(Val: Imm, DL: SDLoc (In), VT: MVT::i32);
3368	return true;
3369	}
3370	llvm_unreachable("unhandled Constant node");
3371	}
3372	}
3373
3374	// 16 bit splat
3375	SDValue SplatSrc32 = stripBitcast(Val: In);
3376	if (auto *SplatSrc32BV = dyn_cast<BuildVectorSDNode>(Val&: SplatSrc32))
3377	if (SDValue Splat32 = SplatSrc32BV->getSplatValue()) {
3378	SDValue SplatSrc16 = stripBitcast(Val: Splat32);
3379	if (auto *SplatSrc16BV = dyn_cast<BuildVectorSDNode>(Val&: SplatSrc16))
3380	if (SDValue Splat = SplatSrc16BV->getSplatValue()) {
3381	const SIInstrInfo *TII = Subtarget->getInstrInfo();
3382	std::optional<APInt> RawValue;
3383	if (const ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(Val&: Splat))
3384	RawValue = C->getValueAPF().bitcastToAPInt();
3385	else if (const ConstantSDNode *C = dyn_cast<ConstantSDNode>(Val&: Splat))
3386	RawValue = C->getAPIntValue();
3387
3388	if (RawValue.has_value()) {
3389	EVT VT = In.getValueType().getScalarType();
3390	if (VT.getSimpleVT() == MVT::f16 \|\| VT.getSimpleVT() == MVT::bf16) {
3391	APFloat FloatVal(VT.getSimpleVT() == MVT::f16
3392	? APFloatBase::IEEEhalf()
3393	: APFloatBase::BFloat(),
3394	RawValue.value());
3395	if (TII->isInlineConstant(Imm: FloatVal)) {
3396	Src = CurDAG->getTargetConstant(Val: RawValue.value(), DL: SDLoc (In),
3397	VT: MVT::i16);
3398	return true;
3399	}
3400	} else if (VT.getSimpleVT() == MVT::i16) {
3401	if (TII->isInlineConstant(Imm: RawValue.value())) {
3402	Src = CurDAG->getTargetConstant(Val: RawValue.value(), DL: SDLoc (In),
3403	VT: MVT::i16);
3404	return true;
3405	}
3406	} else
3407	llvm_unreachable("unknown 16-bit type");
3408	}
3409	}
3410	}
3411
3412	return false;
3413	}
3414
3415	bool AMDGPUDAGToDAGISel::SelectSWMMACIndex8(SDValue In, SDValue &Src,
3416	SDValue &IndexKey) const {
3417	unsigned Key = `0`;
3418	Src = In;
3419
3420	if (In.getOpcode() == ISD::SRL) {
3421	const llvm::SDValue &ShiftSrc = In.getOperand(i: `0`);
3422	ConstantSDNode *ShiftAmt = dyn_cast<ConstantSDNode>(Val: In.getOperand(i: `1`));
3423	if (ShiftSrc.getValueType().getSizeInBits() == `32` && ShiftAmt &&
3424	ShiftAmt->getZExtValue() % `8` == `0`) {
3425	Key = ShiftAmt->getZExtValue() / `8`;
3426	Src = ShiftSrc;
3427	}
3428	}
3429
3430	IndexKey = CurDAG->getTargetConstant(Val: Key, DL: SDLoc (In), VT: MVT::i32);
3431	return true;
3432	}
3433
3434	bool AMDGPUDAGToDAGISel::SelectSWMMACIndex16(SDValue In, SDValue &Src,
3435	SDValue &IndexKey) const {
3436	unsigned Key = `0`;
3437	Src = In;
3438
3439	if (In.getOpcode() == ISD::SRL) {
3440	const llvm::SDValue &ShiftSrc = In.getOperand(i: `0`);
3441	ConstantSDNode *ShiftAmt = dyn_cast<ConstantSDNode>(Val: In.getOperand(i: `1`));
3442	if (ShiftSrc.getValueType().getSizeInBits() == `32` && ShiftAmt &&
3443	ShiftAmt->getZExtValue() == `16`) {
3444	Key = `1`;
3445	Src = ShiftSrc;
3446	}
3447	}
3448
3449	IndexKey = CurDAG->getTargetConstant(Val: Key, DL: SDLoc (In), VT: MVT::i32);
3450	return true;
3451	}
3452
3453	bool AMDGPUDAGToDAGISel::SelectVOP3OpSel(SDValue In, SDValue &Src,
3454	SDValue &SrcMods) const {
3455	Src = In;
3456	// FIXME: Handle op_sel
3457	SrcMods = CurDAG->getTargetConstant(Val: `0`, DL: SDLoc (In), VT: MVT::i32);
3458	return true;
3459	}
3460
3461	bool AMDGPUDAGToDAGISel::SelectVOP3OpSelMods(SDValue In, SDValue &Src,
3462	SDValue &SrcMods) const {
3463	// FIXME: Handle op_sel
3464	return SelectVOP3Mods(In, Src, SrcMods);
3465	}
3466
3467	// The return value is not whether the match is possible (which it always is),
3468	// but whether or not it a conversion is really used.
3469	bool AMDGPUDAGToDAGISel::SelectVOP3PMadMixModsImpl(SDValue In, SDValue &Src,
3470	unsigned &Mods) const {
3471	Mods = `0`;
3472	SelectVOP3ModsImpl(In, Src, Mods);
3473
3474	if (Src.getOpcode() == ISD::FP_EXTEND) {
3475	Src = Src.getOperand(i: `0`);
3476	assert(Src.getValueType() == MVT::f16);
3477	Src = stripBitcast(Val: Src);
3478
3479	// Be careful about folding modifiers if we already have an abs. fneg is
3480	// applied last, so we don't want to apply an earlier fneg.
3481	if ((Mods & SISrcMods::ABS) == `0`) {
3482	unsigned ModsTmp;
3483	SelectVOP3ModsImpl(In: Src, Src, Mods&: ModsTmp);
3484
3485	if ((ModsTmp & SISrcMods::NEG) != `0`)
3486	Mods ^= SISrcMods::NEG;
3487
3488	if ((ModsTmp & SISrcMods::ABS) != `0`)
3489	Mods \|= SISrcMods::ABS;
3490	}
3491
3492	// op_sel/op_sel_hi decide the source type and source.
3493	// If the source's op_sel_hi is set, it indicates to do a conversion from fp16.
3494	// If the sources's op_sel is set, it picks the high half of the source
3495	// register.
3496
3497	Mods \|= SISrcMods::OP_SEL_1;
3498	if (isExtractHiElt(In: Src, Out&: Src)) {
3499	Mods \|= SISrcMods::OP_SEL_0;
3500
3501	// TODO: Should we try to look for neg/abs here?
3502	}
3503
3504	return true;
3505	}
3506
3507	return false;
3508	}
3509
3510	bool AMDGPUDAGToDAGISel::SelectVOP3PMadMixModsExt(SDValue In, SDValue &Src,
3511	SDValue &SrcMods) const {
3512	unsigned Mods = `0`;
3513	if (!SelectVOP3PMadMixModsImpl(In, Src, Mods))
3514	return false;
3515	SrcMods = CurDAG->getTargetConstant(Val: Mods, DL: SDLoc (In), VT: MVT::i32);
3516	return true;
3517	}
3518
3519	bool AMDGPUDAGToDAGISel::SelectVOP3PMadMixMods(SDValue In, SDValue &Src,
3520	SDValue &SrcMods) const {
3521	unsigned Mods = `0`;
3522	SelectVOP3PMadMixModsImpl(In, Src, Mods);
3523	SrcMods = CurDAG->getTargetConstant(Val: Mods, DL: SDLoc (In), VT: MVT::i32);
3524	return true;
3525	}
3526
3527	SDValue AMDGPUDAGToDAGISel::getHi16Elt(SDValue In) const {
3528	if (In.isUndef())
3529	return CurDAG->getUNDEF(VT: MVT::i32);
3530
3531	if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Val&: In)) {
3532	SDLoc SL(In);
3533	return CurDAG->getConstant(Val: C->getZExtValue() << `16`, DL: SL, VT: MVT::i32);
3534	}
3535
3536	if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(Val&: In)) {
3537	SDLoc SL(In);
3538	return CurDAG->getConstant(
3539	Val: C->getValueAPF().bitcastToAPInt().getZExtValue() << `16`, DL: SL, VT: MVT::i32);
3540	}
3541
3542	SDValue Src;
3543	if (isExtractHiElt(In, Out&: Src))
3544	return Src;
3545
3546	return SDValue ();
3547	}
3548
3549	bool AMDGPUDAGToDAGISel::isVGPRImm(const SDNode * N) const {
3550	assert(CurDAG->getTarget().getTargetTriple().getArch() == Triple::amdgcn);
3551
3552	const SIRegisterInfo *SIRI =
3553	static_cast<const SIRegisterInfo *>(Subtarget->getRegisterInfo());
3554	const SIInstrInfo * SII =
3555	static_cast<const SIInstrInfo *>(Subtarget->getInstrInfo());
3556
3557	unsigned Limit = `0`;
3558	bool AllUsesAcceptSReg = true;
3559	for (SDNode::use_iterator U = N->use_begin(), E = SDNode::use_end();
3560	Limit < `10` && U != E; ++U, ++Limit) {
3561	const TargetRegisterClass RC = getOperandRegClass(N: U, OpNo: U.getOperandNo());
3562
3563	// If the register class is unknown, it could be an unknown
3564	// register class that needs to be an SGPR, e.g. an inline asm
3565	// constraint
3566	if (!RC \|\| SIRI->isSGPRClass(RC))
3567	return false;
3568
3569	if (RC != &AMDGPU::VS_32RegClass && RC != &AMDGPU::VS_64RegClass) {
3570	AllUsesAcceptSReg = false;
3571	SDNode * User = *U;
3572	if (User->isMachineOpcode()) {
3573	unsigned Opc = User->getMachineOpcode();
3574	const MCInstrDesc &Desc = SII->get(Opcode: Opc);
3575	if (Desc.isCommutable()) {
3576	unsigned OpIdx = Desc.getNumDefs() + U.getOperandNo();
3577	unsigned CommuteIdx1 = TargetInstrInfo::CommuteAnyOperandIndex;
3578	if (SII->findCommutedOpIndices(Desc, SrcOpIdx0&: OpIdx, SrcOpIdx1&: CommuteIdx1)) {
3579	unsigned CommutedOpNo = CommuteIdx1 - Desc.getNumDefs();
3580	const TargetRegisterClass CommutedRC = getOperandRegClass(N: U, OpNo: CommutedOpNo);
3581	if (CommutedRC == &AMDGPU::VS_32RegClass \|\|
3582	CommutedRC == &AMDGPU::VS_64RegClass)
3583	AllUsesAcceptSReg = true;
3584	}
3585	}
3586	}
3587	// If "AllUsesAcceptSReg == false" so far we haven't succeeded
3588	// commuting current user. This means have at least one use
3589	// that strictly require VGPR. Thus, we will not attempt to commute
3590	// other user instructions.
3591	if (!AllUsesAcceptSReg)
3592	break;
3593	}
3594	}
3595	return !AllUsesAcceptSReg && (Limit < `10`);
3596	}
3597
3598	bool AMDGPUDAGToDAGISel::isUniformLoad(const SDNode N) const* {
3599	auto Ld = cast<LoadSDNode>(Val: N);
3600
3601	const MachineMemOperand *MMO = Ld->getMemOperand();
3602	if (N->isDivergent() && !AMDGPUInstrInfo::isUniformMMO(MMO))
3603	return false;
3604
3605	return MMO->getSize().hasValue() &&
3606	Ld->getAlign() >=
3607	Align (std::min(a: MMO->getSize().getValue().getKnownMinValue(),
3608	b: uint64_t(`4`))) &&
3609	((Ld->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS \|\|
3610	Ld->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS_32BIT) \|\|
3611	(Subtarget->getScalarizeGlobalBehavior() &&
3612	Ld->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS &&
3613	Ld->isSimple() &&
3614	static_cast<const SITargetLowering *>(getTargetLowering())
3615	->isMemOpHasNoClobberedMemOperand(N)));
3616	}
3617
3618	void AMDGPUDAGToDAGISel::PostprocessISelDAG() {
3619	const AMDGPUTargetLowering& Lowering =
3620	*static_cast<const AMDGPUTargetLowering*>(getTargetLowering());
3621	bool IsModified = false;
3622	do {
3623	IsModified = false;
3624
3625	// Go over all selected nodes and try to fold them a bit more
3626	SelectionDAG::allnodes_iterator Position = CurDAG->allnodes_begin();
3627	while (Position != CurDAG->allnodes_end()) {
3628	SDNode Node = &Position ++;
3629	MachineSDNode *MachineNode = dyn_cast<MachineSDNode>(Val: Node);
3630	if (!MachineNode)
3631	continue;
3632
3633	SDNode ResNode = Lowering.PostISelFolding(N: MachineNode, DAG&: CurDAG);
3634	if (ResNode != Node) {
3635	if (ResNode)
3636	ReplaceUses(F: Node, T: ResNode);
3637	IsModified = true;
3638	}
3639	}
3640	CurDAG->RemoveDeadNodes();
3641	} while (IsModified);
3642	}
3643
3644	AMDGPUDAGToDAGISelLegacy::AMDGPUDAGToDAGISelLegacy(TargetMachine &TM,
3645	CodeGenOptLevel OptLevel)
3646	: SelectionDAGISelLegacy (
3647	ID, std::make_unique<AMDGPUDAGToDAGISel>(args&: TM, args&: OptLevel)) {}
3648
3649	char AMDGPUDAGToDAGISelLegacy::ID = `0`;
3650

// Source: llvm_projects/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp