AMDGPUISelDAGToDAG.cpp source code [llvm_projects/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp]

1	//===-- AMDGPUISelDAGToDAG.cpp - A dag to dag inst selector for AMDGPU ----===//
2	//
3	// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4	// See https://llvm.org/LICENSE.txt for license information.
5	// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6	//
7	//==-----------------------------------------------------------------------===//
8	//
9	/// \file
10	/// Defines an instruction selector for the AMDGPU target.
11	//
12	//===----------------------------------------------------------------------===//
13
14	#include "AMDGPUISelDAGToDAG.h"
15	#include "AMDGPU.h"
16	#include "AMDGPUInstrInfo.h"
17	#include "AMDGPUSubtarget.h"
18	#include "AMDGPUTargetMachine.h"
19	#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
20	#include "MCTargetDesc/R600MCTargetDesc.h"
21	#include "R600RegisterInfo.h"
22	#include "SIISelLowering.h"
23	#include "SIMachineFunctionInfo.h"
24	#include "llvm/Analysis/UniformityAnalysis.h"
25	#include "llvm/CodeGen/FunctionLoweringInfo.h"
26	#include "llvm/CodeGen/SelectionDAG.h"
27	#include "llvm/CodeGen/SelectionDAGISel.h"
28	#include "llvm/CodeGen/SelectionDAGNodes.h"
29	#include "llvm/IR/IntrinsicsAMDGPU.h"
30	#include "llvm/Support/ErrorHandling.h"
31
32	#ifdef EXPENSIVE_CHECKS
33	#include "llvm/Analysis/LoopInfo.h"
34	#include "llvm/IR/Dominators.h"
35	#endif
36
37	#define DEBUG_TYPE "amdgpu-isel"
38
39	using namespace llvm;
40
41	//===----------------------------------------------------------------------===//
42	// Instruction Selector Implementation
43	//===----------------------------------------------------------------------===//
44
45	namespace {
46	static SDValue stripBitcast(SDValue Val) {
47	return Val.getOpcode() == ISD::BITCAST ? Val.getOperand(i: `0`) : Val;
48	}
49
50	// Figure out if this is really an extract of the high 16-bits of a dword.
51	static bool isExtractHiElt(SDValue In, SDValue &Out) {
52	In = stripBitcast(Val: In);
53
54	if (In.getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
55	if (ConstantSDNode *Idx = dyn_cast<ConstantSDNode>(Val: In.getOperand(i: `1`))) {
56	if (!Idx->isOne())
57	return false;
58	Out = In.getOperand(i: `0`);
59	return true;
60	}
61	}
62
63	if (In.getOpcode() != ISD::TRUNCATE)
64	return false;
65
66	SDValue Srl = In.getOperand(i: `0`);
67	if (Srl.getOpcode() == ISD::SRL) {
68	if (ConstantSDNode *ShiftAmt = dyn_cast<ConstantSDNode>(Val: Srl.getOperand(i: `1`))) {
69	if (ShiftAmt->getZExtValue() == `16`) {
70	Out = stripBitcast(Val: Srl.getOperand(i: `0`));
71	return true;
72	}
73	}
74	}
75
76	return false;
77	}
78
79	// Look through operations that obscure just looking at the low 16-bits of the
80	// same register.
81	static SDValue stripExtractLoElt(SDValue In) {
82	if (In.getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
83	SDValue Idx = In.getOperand(i: `1`);
84	if (isNullConstant(V: Idx) && In.getValueSizeInBits() <= `32`)
85	return In.getOperand(i: `0`);
86	}
87
88	if (In.getOpcode() == ISD::TRUNCATE) {
89	SDValue Src = In.getOperand(i: `0`);
90	if (Src.getValueType().getSizeInBits() == `32`)
91	return stripBitcast(Val: Src);
92	}
93
94	return In;
95	}
96
97	} // end anonymous namespace
98
99	INITIALIZE_PASS_BEGIN(AMDGPUDAGToDAGISelLegacy, "amdgpu-isel",
100	"AMDGPU DAG->DAG Pattern Instruction Selection", false,
101	false)
102	INITIALIZE_PASS_DEPENDENCY(AMDGPUArgumentUsageInfo)
103	INITIALIZE_PASS_DEPENDENCY(AMDGPUPerfHintAnalysisLegacy)
104	INITIALIZE_PASS_DEPENDENCY(UniformityInfoWrapperPass)
105	#ifdef EXPENSIVE_CHECKS
106	INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
107	INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
108	#endif
109	INITIALIZE_PASS_END(AMDGPUDAGToDAGISelLegacy, "amdgpu-isel",
110	"AMDGPU DAG->DAG Pattern Instruction Selection", false,
111	false)
112
113	/// This pass converts a legalized DAG into a AMDGPU-specific
114	// DAG, ready for instruction scheduling.
115	FunctionPass *llvm::createAMDGPUISelDag(TargetMachine &TM,
116	CodeGenOptLevel OptLevel) {
117	return new AMDGPUDAGToDAGISelLegacy (TM, OptLevel);
118	}
119
120	AMDGPUDAGToDAGISel::AMDGPUDAGToDAGISel(TargetMachine &TM,
121	CodeGenOptLevel OptLevel)
122	: SelectionDAGISel (TM, OptLevel) {}
123
124	bool AMDGPUDAGToDAGISel::runOnMachineFunction(MachineFunction &MF) {
125	Subtarget = &MF.getSubtarget<GCNSubtarget>();
126	Subtarget->checkSubtargetFeatures(F: MF.getFunction());
127	Mode = SIModeRegisterDefaults (MF.getFunction(), *Subtarget);
128	return SelectionDAGISel::runOnMachineFunction(mf&: MF);
129	}
130
131	bool AMDGPUDAGToDAGISel::fp16SrcZerosHighBits(unsigned Opc) const {
132	// XXX - only need to list legal operations.
133	switch (Opc) {
134	case ISD::FADD:
135	case ISD::FSUB:
136	case ISD::FMUL:
137	case ISD::FDIV:
138	case ISD::FREM:
139	case ISD::FCANONICALIZE:
140	case ISD::UINT_TO_FP:
141	case ISD::SINT_TO_FP:
142	case ISD::FABS:
143	// Fabs is lowered to a bit operation, but it's an and which will clear the
144	// high bits anyway.
145	case ISD::FSQRT:
146	case ISD::FSIN:
147	case ISD::FCOS:
148	case ISD::FPOWI:
149	case ISD::FPOW:
150	case ISD::FLOG:
151	case ISD::FLOG2:
152	case ISD::FLOG10:
153	case ISD::FEXP:
154	case ISD::FEXP2:
155	case ISD::FCEIL:
156	case ISD::FTRUNC:
157	case ISD::FRINT:
158	case ISD::FNEARBYINT:
159	case ISD::FROUNDEVEN:
160	case ISD::FROUND:
161	case ISD::FFLOOR:
162	case ISD::FMINNUM:
163	case ISD::FMAXNUM:
164	case ISD::FLDEXP:
165	case AMDGPUISD::FRACT:
166	case AMDGPUISD::CLAMP:
167	case AMDGPUISD::COS_HW:
168	case AMDGPUISD::SIN_HW:
169	case AMDGPUISD::FMIN3:
170	case AMDGPUISD::FMAX3:
171	case AMDGPUISD::FMED3:
172	case AMDGPUISD::FMAD_FTZ:
173	case AMDGPUISD::RCP:
174	case AMDGPUISD::RSQ:
175	case AMDGPUISD::RCP_IFLAG:
176	// On gfx10, all 16-bit instructions preserve the high bits.
177	return Subtarget->getGeneration() <= AMDGPUSubtarget::GFX9;
178	case ISD::FP_ROUND:
179	// We may select fptrunc (fma/mad) to mad_mixlo, which does not zero the
180	// high bits on gfx9.
181	// TODO: If we had the source node we could see if the source was fma/mad
182	return Subtarget->getGeneration() == AMDGPUSubtarget::VOLCANIC_ISLANDS;
183	case ISD::FMA:
184	case ISD::FMAD:
185	case AMDGPUISD::DIV_FIXUP:
186	return Subtarget->getGeneration() == AMDGPUSubtarget::VOLCANIC_ISLANDS;
187	default:
188	// fcopysign, select and others may be lowered to 32-bit bit operations
189	// which don't zero the high bits.
190	return false;
191	}
192	}
193
194	bool AMDGPUDAGToDAGISelLegacy::runOnMachineFunction(MachineFunction &MF) {
195	#ifdef EXPENSIVE_CHECKS
196	DominatorTree &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
197	LoopInfo *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
198	for (auto &L : LI->getLoopsInPreorder()) {
199	assert(L->isLCSSAForm(DT));
200	}
201	#endif
202	return SelectionDAGISelLegacy::runOnMachineFunction(MF);
203	}
204
205	void AMDGPUDAGToDAGISelLegacy::getAnalysisUsage(AnalysisUsage &AU) const {
206	AU.addRequired<AMDGPUArgumentUsageInfo>();
207	AU.addRequired<UniformityInfoWrapperPass>();
208	#ifdef EXPENSIVE_CHECKS
209	AU.addRequired<DominatorTreeWrapperPass>();
210	AU.addRequired<LoopInfoWrapperPass>();
211	#endif
212	SelectionDAGISelLegacy::getAnalysisUsage(AU);
213	}
214
215	bool AMDGPUDAGToDAGISel::matchLoadD16FromBuildVector(SDNode N) const* {
216	assert(Subtarget->d16PreservesUnusedBits());
217	MVT VT = N->getValueType(ResNo: `0`).getSimpleVT();
218	if (VT != MVT::v2i16 && VT != MVT::v2f16)
219	return false;
220
221	SDValue Lo = N->getOperand(Num: `0`);
222	SDValue Hi = N->getOperand(Num: `1`);
223
224	LoadSDNode *LdHi = dyn_cast<LoadSDNode>(Val: stripBitcast(Val: Hi));
225
226	// build_vector lo, (load ptr) -> load_d16_hi ptr, lo
227	// build_vector lo, (zextload ptr from i8) -> load_d16_hi_u8 ptr, lo
228	// build_vector lo, (sextload ptr from i8) -> load_d16_hi_i8 ptr, lo
229
230	// Need to check for possible indirect dependencies on the other half of the
231	// vector to avoid introducing a cycle.
232	if (LdHi && Hi.hasOneUse() && !LdHi->isPredecessorOf(N: Lo.getNode())) {
233	SDVTList VTList = CurDAG->getVTList(VT1: VT, VT2: MVT::Other);
234
235	SDValue TiedIn = CurDAG->getNode(Opcode: ISD::SCALAR_TO_VECTOR, DL: SDLoc (N), VT, Operand: Lo);
236	SDValue Ops[] = {
237	LdHi->getChain(), LdHi->getBasePtr(), TiedIn
238	};
239
240	unsigned LoadOp = AMDGPUISD::LOAD_D16_HI;
241	if (LdHi->getMemoryVT() == MVT::i8) {
242	LoadOp = LdHi->getExtensionType() == ISD::SEXTLOAD ?
243	AMDGPUISD::LOAD_D16_HI_I8 : AMDGPUISD::LOAD_D16_HI_U8;
244	} else {
245	assert(LdHi->getMemoryVT() == MVT::i16);
246	}
247
248	SDValue NewLoadHi =
249	CurDAG->getMemIntrinsicNode(Opcode: LoadOp, dl: SDLoc (LdHi), VTList,
250	Ops, MemVT: LdHi->getMemoryVT(),
251	MMO: LdHi->getMemOperand());
252
253	CurDAG->ReplaceAllUsesOfValueWith(From: SDValue (N, `0`), To: NewLoadHi);
254	CurDAG->ReplaceAllUsesOfValueWith(From: SDValue (LdHi, `1`), To: NewLoadHi.getValue(R: `1`));
255	return true;
256	}
257
258	// build_vector (load ptr), hi -> load_d16_lo ptr, hi
259	// build_vector (zextload ptr from i8), hi -> load_d16_lo_u8 ptr, hi
260	// build_vector (sextload ptr from i8), hi -> load_d16_lo_i8 ptr, hi
261	LoadSDNode *LdLo = dyn_cast<LoadSDNode>(Val: stripBitcast(Val: Lo));
262	if (LdLo && Lo.hasOneUse()) {
263	SDValue TiedIn = getHi16Elt(In: Hi);
264	if (!TiedIn \|\| LdLo->isPredecessorOf(N: TiedIn.getNode()))
265	return false;
266
267	SDVTList VTList = CurDAG->getVTList(VT1: VT, VT2: MVT::Other);
268	unsigned LoadOp = AMDGPUISD::LOAD_D16_LO;
269	if (LdLo->getMemoryVT() == MVT::i8) {
270	LoadOp = LdLo->getExtensionType() == ISD::SEXTLOAD ?
271	AMDGPUISD::LOAD_D16_LO_I8 : AMDGPUISD::LOAD_D16_LO_U8;
272	} else {
273	assert(LdLo->getMemoryVT() == MVT::i16);
274	}
275
276	TiedIn = CurDAG->getNode(Opcode: ISD::BITCAST, DL: SDLoc (N), VT, Operand: TiedIn);
277
278	SDValue Ops[] = {
279	LdLo->getChain(), LdLo->getBasePtr(), TiedIn
280	};
281
282	SDValue NewLoadLo =
283	CurDAG->getMemIntrinsicNode(Opcode: LoadOp, dl: SDLoc (LdLo), VTList,
284	Ops, MemVT: LdLo->getMemoryVT(),
285	MMO: LdLo->getMemOperand());
286
287	CurDAG->ReplaceAllUsesOfValueWith(From: SDValue (N, `0`), To: NewLoadLo);
288	CurDAG->ReplaceAllUsesOfValueWith(From: SDValue (LdLo, `1`), To: NewLoadLo.getValue(R: `1`));
289	return true;
290	}
291
292	return false;
293	}
294
295	void AMDGPUDAGToDAGISel::PreprocessISelDAG() {
296	if (!Subtarget->d16PreservesUnusedBits())
297	return;
298
299	SelectionDAG::allnodes_iterator Position = CurDAG->allnodes_end();
300
301	bool MadeChange = false;
302	while (Position != CurDAG->allnodes_begin()) {
303	SDNode N = &--Position;
304	if (N->use_empty())
305	continue;
306
307	switch (N->getOpcode()) {
308	case ISD::BUILD_VECTOR:
309	// TODO: Match load d16 from shl (extload:i16), 16
310	MadeChange \|= matchLoadD16FromBuildVector(N);
311	break;
312	default:
313	break;
314	}
315	}
316
317	if (MadeChange) {
318	CurDAG->RemoveDeadNodes();
319	LLVM_DEBUG(dbgs() << "After PreProcess:\n";
320	CurDAG->dump(););
321	}
322	}
323
324	bool AMDGPUDAGToDAGISel::isInlineImmediate(const SDNode N) const* {
325	if (N->isUndef())
326	return true;
327
328	const SIInstrInfo *TII = Subtarget->getInstrInfo();
329	if (const ConstantSDNode *C = dyn_cast<ConstantSDNode>(Val: N))
330	return TII->isInlineConstant(Imm: C->getAPIntValue());
331
332	if (const ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(Val: N))
333	return TII->isInlineConstant(Imm: C->getValueAPF());
334
335	return false;
336	}
337
338	/// Determine the register class for \p OpNo
339	/// \returns The register class of the virtual register that will be used for
340	/// the given operand number \OpNo or NULL if the register class cannot be
341	/// determined.
342	const TargetRegisterClass AMDGPUDAGToDAGISel::getOperandRegClass(SDNode N,
343	unsigned OpNo) const {
344	if (!N->isMachineOpcode()) {
345	if (N->getOpcode() == ISD::CopyToReg) {
346	Register Reg = cast<RegisterSDNode>(Val: N->getOperand(Num: `1`))->getReg();
347	if (Reg.isVirtual()) {
348	MachineRegisterInfo &MRI = CurDAG->getMachineFunction().getRegInfo();
349	return MRI.getRegClass(Reg);
350	}
351
352	const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
353	return TRI->getPhysRegBaseClass(Reg);
354	}
355
356	return nullptr;
357	}
358
359	switch (N->getMachineOpcode()) {
360	default: {
361	const MCInstrDesc &Desc =
362	Subtarget->getInstrInfo()->get(Opcode: N->getMachineOpcode());
363	unsigned OpIdx = Desc.getNumDefs() + OpNo;
364	if (OpIdx >= Desc.getNumOperands())
365	return nullptr;
366	int RegClass = Desc.operands()[OpIdx].RegClass;
367	if (RegClass == -`1`)
368	return nullptr;
369
370	return Subtarget->getRegisterInfo()->getRegClass(RCID: RegClass);
371	}
372	case AMDGPU::REG_SEQUENCE: {
373	unsigned RCID = N->getConstantOperandVal(Num: `0`);
374	const TargetRegisterClass *SuperRC =
375	Subtarget->getRegisterInfo()->getRegClass(RCID);
376
377	SDValue SubRegOp = N->getOperand(Num: OpNo + `1`);
378	unsigned SubRegIdx = SubRegOp ->getAsZExtVal();
379	return Subtarget->getRegisterInfo()->getSubClassWithSubReg(SuperRC,
380	SubRegIdx);
381	}
382	}
383	}
384
385	SDNode AMDGPUDAGToDAGISel::glueCopyToOp(SDNode N, SDValue NewChain,
386	SDValue Glue) const {
387	SmallVector <SDValue, `8`> Ops;
388	Ops.push_back(Elt: NewChain); // Replace the chain.
389	for (unsigned i = `1`, e = N->getNumOperands(); i != e; ++i)
390	Ops.push_back(Elt: N->getOperand(Num: i));
391
392	Ops.push_back(Elt: Glue);
393	return CurDAG->MorphNodeTo(N, Opc: N->getOpcode(), VTs: N->getVTList(), Ops);
394	}
395
396	SDNode AMDGPUDAGToDAGISel::glueCopyToM0(SDNode N, SDValue Val) const {
397	const SITargetLowering& Lowering =
398	*static_cast<const SITargetLowering*>(getTargetLowering());
399
400	assert(N->getOperand(`0`).getValueType() == MVT::Other && "Expected chain");
401
402	SDValue M0 = Lowering.copyToM0(DAG&: *CurDAG, Chain: N->getOperand(Num: `0`), DL: SDLoc (N), V: Val);
403	return glueCopyToOp(N, NewChain: M0, Glue: M0.getValue(R: `1`));
404	}
405
406	SDNode AMDGPUDAGToDAGISel::glueCopyToM0LDSInit(SDNode N) const {
407	unsigned AS = cast<MemSDNode>(Val: N)->getAddressSpace();
408	if (AS == AMDGPUAS::LOCAL_ADDRESS) {
409	if (Subtarget->ldsRequiresM0Init())
410	return glueCopyToM0(
411	N, Val: CurDAG->getSignedTargetConstant(Val: -`1`, DL: SDLoc (N), VT: MVT::i32));
412	} else if (AS == AMDGPUAS::REGION_ADDRESS) {
413	MachineFunction &MF = CurDAG->getMachineFunction();
414	unsigned Value = MF.getInfo<SIMachineFunctionInfo>()->getGDSSize();
415	return
416	glueCopyToM0(N, Val: CurDAG->getTargetConstant(Val: Value, DL: SDLoc (N), VT: MVT::i32));
417	}
418	return N;
419	}
420
421	MachineSDNode *AMDGPUDAGToDAGISel::buildSMovImm64(SDLoc &DL, uint64_t Imm,
422	EVT VT) const {
423	SDNode *Lo = CurDAG->getMachineNode(
424	Opcode: AMDGPU::S_MOV_B32, dl: DL, VT: MVT::i32,
425	Op1: CurDAG->getTargetConstant(Val: Lo_32(Value: Imm), DL, VT: MVT::i32));
426	SDNode *Hi = CurDAG->getMachineNode(
427	Opcode: AMDGPU::S_MOV_B32, dl: DL, VT: MVT::i32,
428	Op1: CurDAG->getTargetConstant(Val: Hi_32(Value: Imm), DL, VT: MVT::i32));
429	const SDValue Ops[] = {
430	CurDAG->getTargetConstant(Val: AMDGPU::SReg_64RegClassID, DL, VT: MVT::i32),
431	SDValue (Lo, `0`), CurDAG->getTargetConstant(Val: AMDGPU::sub0, DL, VT: MVT::i32),
432	SDValue (Hi, `0`), CurDAG->getTargetConstant(Val: AMDGPU::sub1, DL, VT: MVT::i32)};
433
434	return CurDAG->getMachineNode(Opcode: TargetOpcode::REG_SEQUENCE, dl: DL, VT, Ops);
435	}
436
437	void AMDGPUDAGToDAGISel::SelectBuildVector(SDNode N, unsigned* RegClassID) {
438	EVT VT = N->getValueType(ResNo: `0`);
439	unsigned NumVectorElts = VT.getVectorNumElements();
440	EVT EltVT = VT.getVectorElementType();
441	SDLoc DL(N);
442	SDValue RegClass = CurDAG->getTargetConstant(Val: RegClassID, DL, VT: MVT::i32);
443
444	if (NumVectorElts == `1`) {
445	CurDAG->SelectNodeTo(N, MachineOpc: AMDGPU::COPY_TO_REGCLASS, VT: EltVT, Op1: N->getOperand(Num: `0`),
446	Op2: RegClass);
447	return;
448	}
449
450	assert(NumVectorElts <= `32` && "Vectors with more than 32 elements not "
451	"supported yet");
452	// 32 = Max Num Vector Elements
453	// 2 = 2 REG_SEQUENCE operands per element (value, subreg index)
454	// 1 = Vector Register Class
455	SmallVector<SDValue, `32` * `2` + `1`> RegSeqArgs(NumVectorElts * `2` + `1`);
456
457	bool IsGCN = CurDAG->getSubtarget().getTargetTriple().isAMDGCN();
458	RegSeqArgs [`0`] = CurDAG->getTargetConstant(Val: RegClassID, DL, VT: MVT::i32);
459	bool IsRegSeq = true;
460	unsigned NOps = N->getNumOperands();
461	for (unsigned i = `0`; i < NOps; i++) {
462	// XXX: Why is this here?
463	if (isa<RegisterSDNode>(Val: N->getOperand(Num: i))) {
464	IsRegSeq = false;
465	break;
466	}
467	unsigned Sub = IsGCN ? SIRegisterInfo::getSubRegFromChannel(Channel: i)
468	: R600RegisterInfo::getSubRegFromChannel(Channel: i);
469	RegSeqArgs [`1` + (`2` * i)] = N->getOperand(Num: i);
470	RegSeqArgs [`1` + (`2` * i) + `1`] = CurDAG->getTargetConstant(Val: Sub, DL, VT: MVT::i32);
471	}
472	if (NOps != NumVectorElts) {
473	// Fill in the missing undef elements if this was a scalar_to_vector.
474	assert(N->getOpcode() == ISD::SCALAR_TO_VECTOR && NOps < NumVectorElts);
475	MachineSDNode *ImpDef = CurDAG->getMachineNode(Opcode: TargetOpcode::IMPLICIT_DEF,
476	dl: DL, VT: EltVT);
477	for (unsigned i = NOps; i < NumVectorElts; ++i) {
478	unsigned Sub = IsGCN ? SIRegisterInfo::getSubRegFromChannel(Channel: i)
479	: R600RegisterInfo::getSubRegFromChannel(Channel: i);
480	RegSeqArgs [`1` + (`2` * i)] = SDValue (ImpDef, `0`);
481	RegSeqArgs [`1` + (`2` * i) + `1`] =
482	CurDAG->getTargetConstant(Val: Sub, DL, VT: MVT::i32);
483	}
484	}
485
486	if (!IsRegSeq)
487	SelectCode(N);
488	CurDAG->SelectNodeTo(N, MachineOpc: AMDGPU::REG_SEQUENCE, VTs: N->getVTList(), Ops: RegSeqArgs);
489	}
490
491	void AMDGPUDAGToDAGISel::SelectVectorShuffle(SDNode *N) {
492	EVT VT = N->getValueType(ResNo: `0`);
493	EVT EltVT = VT.getVectorElementType();
494
495	// TODO: Handle 16-bit element vectors with even aligned masks.
496	if (!Subtarget->hasPkMovB32() \|\| !EltVT.bitsEq(VT: MVT::i32) \|\|
497	VT.getVectorNumElements() != `2`) {
498	SelectCode(N);
499	return;
500	}
501
502	auto *SVN = cast<ShuffleVectorSDNode>(Val: N);
503
504	SDValue Src0 = SVN->getOperand(Num: `0`);
505	SDValue Src1 = SVN->getOperand(Num: `1`);
506	ArrayRef<int> Mask = SVN->getMask();
507	SDLoc DL(N);
508
509	assert(Src0.getValueType().getVectorNumElements() == `2` && Mask.size() == `2` &&
510	Mask[`0`] < `4` && Mask[`1`] < `4`);
511
512	SDValue VSrc0 = Mask [`0`] < `2` ? Src0 : Src1;
513	SDValue VSrc1 = Mask [`1`] < `2` ? Src0 : Src1;
514	unsigned Src0SubReg = Mask [`0`] & `1` ? AMDGPU::sub1 : AMDGPU::sub0;
515	unsigned Src1SubReg = Mask [`1`] & `1` ? AMDGPU::sub1 : AMDGPU::sub0;
516
517	if (Mask [`0`] < `0`) {
518	Src0SubReg = Src1SubReg;
519	MachineSDNode *ImpDef =
520	CurDAG->getMachineNode(Opcode: TargetOpcode::IMPLICIT_DEF, dl: DL, VT);
521	VSrc0 = SDValue (ImpDef, `0`);
522	}
523
524	if (Mask [`1`] < `0`) {
525	Src1SubReg = Src0SubReg;
526	MachineSDNode *ImpDef =
527	CurDAG->getMachineNode(Opcode: TargetOpcode::IMPLICIT_DEF, dl: DL, VT);
528	VSrc1 = SDValue (ImpDef, `0`);
529	}
530
531	// SGPR case needs to lower to copies.
532	//
533	// Also use subregister extract when we can directly blend the registers with
534	// a simple subregister copy.
535	//
536	// TODO: Maybe we should fold this out earlier
537	if (N->isDivergent() && Src0SubReg == AMDGPU::sub1 &&
538	Src1SubReg == AMDGPU::sub0) {
539	// The low element of the result always comes from src0.
540	// The high element of the result always comes from src1.
541	// op_sel selects the high half of src0.
542	// op_sel_hi selects the high half of src1.
543
544	unsigned Src0OpSel =
545	Src0SubReg == AMDGPU::sub1 ? SISrcMods::OP_SEL_0 : SISrcMods::NONE;
546	unsigned Src1OpSel =
547	Src1SubReg == AMDGPU::sub1 ? SISrcMods::OP_SEL_0 : SISrcMods::NONE;
548
549	// Enable op_sel_hi to avoid printing it. This should have no effect on the
550	// result.
551	Src0OpSel \|= SISrcMods::OP_SEL_1;
552	Src1OpSel \|= SISrcMods::OP_SEL_1;
553
554	SDValue Src0OpSelVal = CurDAG->getTargetConstant(Val: Src0OpSel, DL, VT: MVT::i32);
555	SDValue Src1OpSelVal = CurDAG->getTargetConstant(Val: Src1OpSel, DL, VT: MVT::i32);
556	SDValue ZeroMods = CurDAG->getTargetConstant(Val: `0`, DL, VT: MVT::i32);
557
558	CurDAG->SelectNodeTo(N, MachineOpc: AMDGPU::V_PK_MOV_B32, VTs: N->getVTList(),
559	Ops: {Src0OpSelVal, VSrc0, Src1OpSelVal, VSrc1,
560	ZeroMods, // clamp
561	ZeroMods, // op_sel
562	ZeroMods, // op_sel_hi
563	ZeroMods, // neg_lo
564	ZeroMods}); // neg_hi
565	return;
566	}
567
568	SDValue ResultElt0 =
569	CurDAG->getTargetExtractSubreg(SRIdx: Src0SubReg, DL, VT: EltVT, Operand: VSrc0);
570	SDValue ResultElt1 =
571	CurDAG->getTargetExtractSubreg(SRIdx: Src1SubReg, DL, VT: EltVT, Operand: VSrc1);
572
573	const SDValue Ops[] = {
574	CurDAG->getTargetConstant(Val: AMDGPU::SReg_64RegClassID, DL, VT: MVT::i32),
575	ResultElt0, CurDAG->getTargetConstant(Val: AMDGPU::sub0, DL, VT: MVT::i32),
576	ResultElt1, CurDAG->getTargetConstant(Val: AMDGPU::sub1, DL, VT: MVT::i32)};
577	CurDAG->SelectNodeTo(N, MachineOpc: TargetOpcode::REG_SEQUENCE, VT, Ops);
578	}
579
580	void AMDGPUDAGToDAGISel::Select(SDNode *N) {
581	unsigned int Opc = N->getOpcode();
582	if (N->isMachineOpcode()) {
583	N->setNodeId(-`1`);
584	return; // Already selected.
585	}
586
587	// isa<MemSDNode> almost works but is slightly too permissive for some DS
588	// intrinsics.
589	if (Opc == ISD::LOAD \|\| Opc == ISD::STORE \|\| isa<AtomicSDNode>(Val: N)) {
590	N = glueCopyToM0LDSInit(N);
591	SelectCode(N);
592	return;
593	}
594
595	switch (Opc) {
596	default:
597	break;
598	// We are selecting i64 ADD here instead of custom lower it during
599	// DAG legalization, so we can fold some i64 ADDs used for address
600	// calculation into the LOAD and STORE instructions.
601	case ISD::ADDC:
602	case ISD::ADDE:
603	case ISD::SUBC:
604	case ISD::SUBE: {
605	if (N->getValueType(ResNo: `0`) != MVT::i64)
606	break;
607
608	SelectADD_SUB_I64(N);
609	return;
610	}
611	case ISD::UADDO_CARRY:
612	case ISD::USUBO_CARRY:
613	if (N->getValueType(ResNo: `0`) != MVT::i32)
614	break;
615
616	SelectAddcSubb(N);
617	return;
618	case ISD::UADDO:
619	case ISD::USUBO: {
620	SelectUADDO_USUBO(N);
621	return;
622	}
623	case AMDGPUISD::FMUL_W_CHAIN: {
624	SelectFMUL_W_CHAIN(N);
625	return;
626	}
627	case AMDGPUISD::FMA_W_CHAIN: {
628	SelectFMA_W_CHAIN(N);
629	return;
630	}
631
632	case ISD::SCALAR_TO_VECTOR:
633	case ISD::BUILD_VECTOR: {
634	EVT VT = N->getValueType(ResNo: `0`);
635	unsigned NumVectorElts = VT.getVectorNumElements();
636	if (VT.getScalarSizeInBits() == `16`) {
637	if (Opc == ISD::BUILD_VECTOR && NumVectorElts == `2`) {
638	if (SDNode Packed = packConstantV2I16(N, DAG&: CurDAG)) {
639	ReplaceNode(F: N, T: Packed);
640	return;
641	}
642	}
643
644	break;
645	}
646
647	assert(VT.getVectorElementType().bitsEq(MVT::i32));
648	unsigned RegClassID =
649	SIRegisterInfo::getSGPRClassForBitWidth(BitWidth: NumVectorElts * `32`)->getID();
650	SelectBuildVector(N, RegClassID);
651	return;
652	}
653	case ISD::VECTOR_SHUFFLE:
654	SelectVectorShuffle(N);
655	return;
656	case ISD::BUILD_PAIR: {
657	SDValue RC, SubReg0, SubReg1;
658	SDLoc DL(N);
659	if (N->getValueType(ResNo: `0`) == MVT::i128) {
660	RC = CurDAG->getTargetConstant(Val: AMDGPU::SGPR_128RegClassID, DL, VT: MVT::i32);
661	SubReg0 = CurDAG->getTargetConstant(Val: AMDGPU::sub0_sub1, DL, VT: MVT::i32);
662	SubReg1 = CurDAG->getTargetConstant(Val: AMDGPU::sub2_sub3, DL, VT: MVT::i32);
663	} else if (N->getValueType(ResNo: `0`) == MVT::i64) {
664	RC = CurDAG->getTargetConstant(Val: AMDGPU::SReg_64RegClassID, DL, VT: MVT::i32);
665	SubReg0 = CurDAG->getTargetConstant(Val: AMDGPU::sub0, DL, VT: MVT::i32);
666	SubReg1 = CurDAG->getTargetConstant(Val: AMDGPU::sub1, DL, VT: MVT::i32);
667	} else {
668	llvm_unreachable("Unhandled value type for BUILD_PAIR");
669	}
670	const SDValue Ops[] = { RC, N->getOperand(Num: `0`), SubReg0,
671	N->getOperand(Num: `1`), SubReg1 };
672	ReplaceNode(F: N, T: CurDAG->getMachineNode(Opcode: TargetOpcode::REG_SEQUENCE, dl: DL,
673	VT: N->getValueType(ResNo: `0`), Ops));
674	return;
675	}
676
677	case ISD::Constant:
678	case ISD::ConstantFP: {
679	if (N->getValueType(ResNo: `0`).getSizeInBits() != `64` \|\| isInlineImmediate(N))
680	break;
681
682	uint64_t Imm;
683	if (ConstantFPSDNode *FP = dyn_cast<ConstantFPSDNode>(Val: N)) {
684	Imm = FP->getValueAPF().bitcastToAPInt().getZExtValue();
685	if (AMDGPU::isValid32BitLiteral(Val: Imm, IsFP64: true))
686	break;
687	} else {
688	ConstantSDNode *C = cast<ConstantSDNode>(Val: N);
689	Imm = C->getZExtValue();
690	if (AMDGPU::isValid32BitLiteral(Val: Imm, IsFP64: false))
691	break;
692	}
693
694	SDLoc DL(N);
695	ReplaceNode(F: N, T: buildSMovImm64(DL, Imm, VT: N->getValueType(ResNo: `0`)));
696	return;
697	}
698	case AMDGPUISD::BFE_I32:
699	case AMDGPUISD::BFE_U32: {
700	// There is a scalar version available, but unlike the vector version which
701	// has a separate operand for the offset and width, the scalar version packs
702	// the width and offset into a single operand. Try to move to the scalar
703	// version if the offsets are constant, so that we can try to keep extended
704	// loads of kernel arguments in SGPRs.
705
706	// TODO: Technically we could try to pattern match scalar bitshifts of
707	// dynamic values, but it's probably not useful.
708	ConstantSDNode *Offset = dyn_cast<ConstantSDNode>(Val: N->getOperand(Num: `1`));
709	if (!Offset)
710	break;
711
712	ConstantSDNode *Width = dyn_cast<ConstantSDNode>(Val: N->getOperand(Num: `2`));
713	if (!Width)
714	break;
715
716	bool Signed = Opc == AMDGPUISD::BFE_I32;
717
718	uint32_t OffsetVal = Offset->getZExtValue();
719	uint32_t WidthVal = Width->getZExtValue();
720
721	ReplaceNode(F: N, T: getBFE32(IsSigned: Signed, DL: SDLoc (N), Val: N->getOperand(Num: `0`), Offset: OffsetVal,
722	Width: WidthVal));
723	return;
724	}
725	case AMDGPUISD::DIV_SCALE: {
726	SelectDIV_SCALE(N);
727	return;
728	}
729	case AMDGPUISD::MAD_I64_I32:
730	case AMDGPUISD::MAD_U64_U32: {
731	SelectMAD_64_32(N);
732	return;
733	}
734	case ISD::SMUL_LOHI:
735	case ISD::UMUL_LOHI:
736	return SelectMUL_LOHI(N);
737	case ISD::CopyToReg: {
738	const SITargetLowering& Lowering =
739	*static_cast<const SITargetLowering*>(getTargetLowering());
740	N = Lowering.legalizeTargetIndependentNode(Node: N, DAG&: *CurDAG);
741	break;
742	}
743	case ISD::AND:
744	case ISD::SRL:
745	case ISD::SRA:
746	case ISD::SIGN_EXTEND_INREG:
747	if (N->getValueType(ResNo: `0`) != MVT::i32)
748	break;
749
750	SelectS_BFE(N);
751	return;
752	case ISD::BRCOND:
753	SelectBRCOND(N);
754	return;
755	case ISD::FP_EXTEND:
756	SelectFP_EXTEND(N);
757	return;
758	case AMDGPUISD::CVT_PKRTZ_F16_F32:
759	case AMDGPUISD::CVT_PKNORM_I16_F32:
760	case AMDGPUISD::CVT_PKNORM_U16_F32:
761	case AMDGPUISD::CVT_PK_U16_U32:
762	case AMDGPUISD::CVT_PK_I16_I32: {
763	// Hack around using a legal type if f16 is illegal.
764	if (N->getValueType(ResNo: `0`) == MVT::i32) {
765	MVT NewVT = Opc == AMDGPUISD::CVT_PKRTZ_F16_F32 ? MVT::v2f16 : MVT::v2i16;
766	N = CurDAG->MorphNodeTo(N, Opc: N->getOpcode(), VTs: CurDAG->getVTList(VT: NewVT),
767	Ops: { N->getOperand(Num: `0`), N->getOperand(Num: `1`) });
768	SelectCode(N);
769	return;
770	}
771
772	break;
773	}
774	case ISD::INTRINSIC_W_CHAIN: {
775	SelectINTRINSIC_W_CHAIN(N);
776	return;
777	}
778	case ISD::INTRINSIC_WO_CHAIN: {
779	SelectINTRINSIC_WO_CHAIN(N);
780	return;
781	}
782	case ISD::INTRINSIC_VOID: {
783	SelectINTRINSIC_VOID(N);
784	return;
785	}
786	case AMDGPUISD::WAVE_ADDRESS: {
787	SelectWAVE_ADDRESS(N);
788	return;
789	}
790	case ISD::STACKRESTORE: {
791	SelectSTACKRESTORE(N);
792	return;
793	}
794	}
795
796	SelectCode(N);
797	}
798
799	bool AMDGPUDAGToDAGISel::isUniformBr(const SDNode N) const* {
800	const BasicBlock *BB = FuncInfo ->MBB->getBasicBlock();
801	const Instruction *Term = BB->getTerminator();
802	return Term->getMetadata(Kind: "amdgpu.uniform") \|\|
803	Term->getMetadata(Kind: "structurizecfg.uniform");
804	}
805
806	bool AMDGPUDAGToDAGISel::isUnneededShiftMask(const SDNode *N,
807	unsigned ShAmtBits) const {
808	assert(N->getOpcode() == ISD::AND);
809
810	const APInt &RHS = N->getConstantOperandAPInt(Num: `1`);
811	if (RHS.countr_one() >= ShAmtBits)
812	return true;
813
814	const APInt &LHSKnownZeros = CurDAG->computeKnownBits(Op: N->getOperand(Num: `0`)).Zero;
815	return (LHSKnownZeros \| RHS).countr_one() >= ShAmtBits;
816	}
817
818	static bool getBaseWithOffsetUsingSplitOR(SelectionDAG &DAG, SDValue Addr,
819	SDValue &N0, SDValue &N1) {
820	if (Addr.getValueType() == MVT::i64 && Addr.getOpcode() == ISD::BITCAST &&
821	Addr.getOperand(i: `0`).getOpcode() == ISD::BUILD_VECTOR) {
822	// As we split 64-bit `or` earlier, it's complicated pattern to match, i.e.
823	// (i64 (bitcast (v2i32 (build_vector
824	// (or (extract_vector_elt V, 0), OFFSET),
825	// (extract_vector_elt V, 1)))))
826	SDValue Lo = Addr.getOperand(i: `0`).getOperand(i: `0`);
827	if (Lo.getOpcode() == ISD::OR && DAG.isBaseWithConstantOffset(Op: Lo)) {
828	SDValue BaseLo = Lo.getOperand(i: `0`);
829	SDValue BaseHi = Addr.getOperand(i: `0`).getOperand(i: `1`);
830	// Check that split base (Lo and Hi) are extracted from the same one.
831	if (BaseLo.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
832	BaseHi.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
833	BaseLo.getOperand(i: `0`) == BaseHi.getOperand(i: `0`) &&
834	// Lo is statically extracted from index 0.
835	isa<ConstantSDNode>(Val: BaseLo.getOperand(i: `1`)) &&
836	BaseLo.getConstantOperandVal(i: `1`) == `0` &&
837	// Hi is statically extracted from index 0.
838	isa<ConstantSDNode>(Val: BaseHi.getOperand(i: `1`)) &&
839	BaseHi.getConstantOperandVal(i: `1`) == `1`) {
840	N0 = BaseLo.getOperand(i: `0`).getOperand(i: `0`);
841	N1 = Lo.getOperand(i: `1`);
842	return true;
843	}
844	}
845	}
846	return false;
847	}
848
849	bool AMDGPUDAGToDAGISel::isBaseWithConstantOffset64(SDValue Addr, SDValue &LHS,
850	SDValue &RHS) const {
851	if (CurDAG->isBaseWithConstantOffset(Op: Addr)) {
852	LHS = Addr.getOperand(i: `0`);
853	RHS = Addr.getOperand(i: `1`);
854	return true;
855	}
856
857	if (getBaseWithOffsetUsingSplitOR(DAG&: *CurDAG, Addr, N0&: LHS, N1&: RHS)) {
858	assert(LHS && RHS && isa<ConstantSDNode>(RHS));
859	return true;
860	}
861
862	return false;
863	}
864
865	StringRef AMDGPUDAGToDAGISelLegacy::getPassName() const {
866	return "AMDGPU DAG->DAG Pattern Instruction Selection";
867	}
868
869	AMDGPUISelDAGToDAGPass::AMDGPUISelDAGToDAGPass(TargetMachine &TM)
870	: SelectionDAGISelPass (
871	std::make_unique<AMDGPUDAGToDAGISel>(args&: TM, args: TM.getOptLevel())) {}
872
873	PreservedAnalyses
874	AMDGPUISelDAGToDAGPass::run(MachineFunction &MF,
875	MachineFunctionAnalysisManager &MFAM) {
876	#ifdef EXPENSIVE_CHECKS
877	auto &FAM = MFAM.getResult<FunctionAnalysisManagerMachineFunctionProxy>(MF)
878	.getManager();
879	auto &F = MF.getFunction();
880	DominatorTree &DT = FAM.getResult<DominatorTreeAnalysis>(F);
881	LoopInfo &LI = FAM.getResult<LoopAnalysis>(F);
882	for (auto &L : LI.getLoopsInPreorder())
883	assert(L->isLCSSAForm(DT) && "Loop is not in LCSSA form!");
884	#endif
885	return SelectionDAGISelPass::run(MF, MFAM);
886	}
887
888	//===----------------------------------------------------------------------===//
889	// Complex Patterns
890	//===----------------------------------------------------------------------===//
891
892	bool AMDGPUDAGToDAGISel::SelectADDRVTX_READ(SDValue Addr, SDValue &Base,
893	SDValue &Offset) {
894	return false;
895	}
896
897	bool AMDGPUDAGToDAGISel::SelectADDRIndirect(SDValue Addr, SDValue &Base,
898	SDValue &Offset) {
899	ConstantSDNode *C;
900	SDLoc DL(Addr);
901
902	if ((C = dyn_cast<ConstantSDNode>(Val&: Addr))) {
903	Base = CurDAG->getRegister(Reg: R600::INDIRECT_BASE_ADDR, VT: MVT::i32);
904	Offset = CurDAG->getTargetConstant(Val: C->getZExtValue(), DL, VT: MVT::i32);
905	} else if ((Addr.getOpcode() == AMDGPUISD::DWORDADDR) &&
906	(C = dyn_cast<ConstantSDNode>(Val: Addr.getOperand(i: `0`)))) {
907	Base = CurDAG->getRegister(Reg: R600::INDIRECT_BASE_ADDR, VT: MVT::i32);
908	Offset = CurDAG->getTargetConstant(Val: C->getZExtValue(), DL, VT: MVT::i32);
909	} else if ((Addr.getOpcode() == ISD::ADD \|\| Addr.getOpcode() == ISD::OR) &&
910	(C = dyn_cast<ConstantSDNode>(Val: Addr.getOperand(i: `1`)))) {
911	Base = Addr.getOperand(i: `0`);
912	Offset = CurDAG->getTargetConstant(Val: C->getZExtValue(), DL, VT: MVT::i32);
913	} else {
914	Base = Addr;
915	Offset = CurDAG->getTargetConstant(Val: `0`, DL, VT: MVT::i32);
916	}
917
918	return true;
919	}
920
921	SDValue AMDGPUDAGToDAGISel::getMaterializedScalarImm32(int64_t Val,
922	const SDLoc &DL) const {
923	SDNode *Mov = CurDAG->getMachineNode(
924	Opcode: AMDGPU::S_MOV_B32, dl: DL, VT: MVT::i32,
925	Op1: CurDAG->getTargetConstant(Val, DL, VT: MVT::i32));
926	return SDValue (Mov, `0`);
927	}
928
929	// FIXME: Should only handle uaddo_carry/usubo_carry
930	void AMDGPUDAGToDAGISel::SelectADD_SUB_I64(SDNode *N) {
931	SDLoc DL(N);
932	SDValue LHS = N->getOperand(Num: `0`);
933	SDValue RHS = N->getOperand(Num: `1`);
934
935	unsigned Opcode = N->getOpcode();
936	bool ConsumeCarry = (Opcode == ISD::ADDE \|\| Opcode == ISD::SUBE);
937	bool ProduceCarry =
938	ConsumeCarry \|\| Opcode == ISD::ADDC \|\| Opcode == ISD::SUBC;
939	bool IsAdd = Opcode == ISD::ADD \|\| Opcode == ISD::ADDC \|\| Opcode == ISD::ADDE;
940
941	SDValue Sub0 = CurDAG->getTargetConstant(Val: AMDGPU::sub0, DL, VT: MVT::i32);
942	SDValue Sub1 = CurDAG->getTargetConstant(Val: AMDGPU::sub1, DL, VT: MVT::i32);
943
944	SDNode *Lo0 = CurDAG->getMachineNode(Opcode: TargetOpcode::EXTRACT_SUBREG,
945	dl: DL, VT: MVT::i32, Op1: LHS, Op2: Sub0);
946	SDNode *Hi0 = CurDAG->getMachineNode(Opcode: TargetOpcode::EXTRACT_SUBREG,
947	dl: DL, VT: MVT::i32, Op1: LHS, Op2: Sub1);
948
949	SDNode *Lo1 = CurDAG->getMachineNode(Opcode: TargetOpcode::EXTRACT_SUBREG,
950	dl: DL, VT: MVT::i32, Op1: RHS, Op2: Sub0);
951	SDNode *Hi1 = CurDAG->getMachineNode(Opcode: TargetOpcode::EXTRACT_SUBREG,
952	dl: DL, VT: MVT::i32, Op1: RHS, Op2: Sub1);
953
954	SDVTList VTList = CurDAG->getVTList(VT1: MVT::i32, VT2: MVT::Glue);
955
956	static const unsigned OpcMap[`2`][`2`][`2`] = {
957	{{AMDGPU::S_SUB_U32, AMDGPU::S_ADD_U32},
958	{AMDGPU::V_SUB_CO_U32_e32, AMDGPU::V_ADD_CO_U32_e32}},
959	{{AMDGPU::S_SUBB_U32, AMDGPU::S_ADDC_U32},
960	{AMDGPU::V_SUBB_U32_e32, AMDGPU::V_ADDC_U32_e32}}};
961
962	unsigned Opc = OpcMap[`0`][N->isDivergent()][IsAdd];
963	unsigned CarryOpc = OpcMap[`1`][N->isDivergent()][IsAdd];
964
965	SDNode *AddLo;
966	if (!ConsumeCarry) {
967	SDValue Args[] = { SDValue (Lo0, `0`), SDValue (Lo1, `0`) };
968	AddLo = CurDAG->getMachineNode(Opcode: Opc, dl: DL, VTs: VTList, Ops: Args);
969	} else {
970	SDValue Args[] = { SDValue (Lo0, `0`), SDValue (Lo1, `0`), N->getOperand(Num: `2`) };
971	AddLo = CurDAG->getMachineNode(Opcode: CarryOpc, dl: DL, VTs: VTList, Ops: Args);
972	}
973	SDValue AddHiArgs[] = {
974	SDValue (Hi0, `0`),
975	SDValue (Hi1, `0`),
976	SDValue (AddLo, `1`)
977	};
978	SDNode *AddHi = CurDAG->getMachineNode(Opcode: CarryOpc, dl: DL, VTs: VTList, Ops: AddHiArgs);
979
980	SDValue RegSequenceArgs[] = {
981	CurDAG->getTargetConstant(Val: AMDGPU::SReg_64RegClassID, DL, VT: MVT::i32),
982	SDValue (AddLo,`0`),
983	Sub0,
984	SDValue (AddHi,`0`),
985	Sub1,
986	};
987	SDNode *RegSequence = CurDAG->getMachineNode(Opcode: AMDGPU::REG_SEQUENCE, dl: DL,
988	VT: MVT::i64, Ops: RegSequenceArgs);
989
990	if (ProduceCarry) {
991	// Replace the carry-use
992	ReplaceUses(F: SDValue (N, `1`), T: SDValue (AddHi, `1`));
993	}
994
995	// Replace the remaining uses.
996	ReplaceNode(F: N, T: RegSequence);
997	}
998
999	void AMDGPUDAGToDAGISel::SelectAddcSubb(SDNode *N) {
1000	SDValue LHS = N->getOperand(Num: `0`);
1001	SDValue RHS = N->getOperand(Num: `1`);
1002	SDValue CI = N->getOperand(Num: `2`);
1003
1004	if (N->isDivergent()) {
1005	unsigned Opc = N->getOpcode() == ISD::UADDO_CARRY ? AMDGPU::V_ADDC_U32_e64
1006	: AMDGPU::V_SUBB_U32_e64;
1007	CurDAG->SelectNodeTo(
1008	N, MachineOpc: Opc, VTs: N->getVTList(),
1009	Ops: {LHS, RHS, CI,
1010	CurDAG->getTargetConstant(Val: `0`, DL: {}, VT: MVT::i1) /clamp bit/});
1011	} else {
1012	unsigned Opc = N->getOpcode() == ISD::UADDO_CARRY ? AMDGPU::S_ADD_CO_PSEUDO
1013	: AMDGPU::S_SUB_CO_PSEUDO;
1014	CurDAG->SelectNodeTo(N, MachineOpc: Opc, VTs: N->getVTList(), Ops: {LHS, RHS, CI});
1015	}
1016	}
1017
1018	void AMDGPUDAGToDAGISel::SelectUADDO_USUBO(SDNode *N) {
1019	// The name of the opcodes are misleading. v_add_i32/v_sub_i32 have unsigned
1020	// carry out despite the _i32 name. These were renamed in VI to _U32.
1021	// FIXME: We should probably rename the opcodes here.
1022	bool IsAdd = N->getOpcode() == ISD::UADDO;
1023	bool IsVALU = N->isDivergent();
1024
1025	for (SDNode::user_iterator UI = N->user_begin(), E = N->user_end(); UI != E;
1026	++UI)
1027	if (UI.getUse().getResNo() == `1`) {
1028	if ((IsAdd && (UI ->getOpcode() != ISD::UADDO_CARRY)) \|\|
1029	(!IsAdd && (UI ->getOpcode() != ISD::USUBO_CARRY))) {
1030	IsVALU = true;
1031	break;
1032	}
1033	}
1034
1035	if (IsVALU) {
1036	unsigned Opc = IsAdd ? AMDGPU::V_ADD_CO_U32_e64 : AMDGPU::V_SUB_CO_U32_e64;
1037
1038	CurDAG->SelectNodeTo(
1039	N, MachineOpc: Opc, VTs: N->getVTList(),
1040	Ops: {N->getOperand(Num: `0`), N->getOperand(Num: `1`),
1041	CurDAG->getTargetConstant(Val: `0`, DL: {}, VT: MVT::i1) /clamp bit/});
1042	} else {
1043	unsigned Opc = N->getOpcode() == ISD::UADDO ? AMDGPU::S_UADDO_PSEUDO
1044	: AMDGPU::S_USUBO_PSEUDO;
1045
1046	CurDAG->SelectNodeTo(N, MachineOpc: Opc, VTs: N->getVTList(),
1047	Ops: {N->getOperand(Num: `0`), N->getOperand(Num: `1`)});
1048	}
1049	}
1050
1051	void AMDGPUDAGToDAGISel::SelectFMA_W_CHAIN(SDNode *N) {
1052	// src0_modifiers, src0, src1_modifiers, src1, src2_modifiers, src2, clamp, omod
1053	SDValue Ops[`10`];
1054
1055	SelectVOP3Mods0(In: N->getOperand(Num: `1`), Src&: Ops[`1`], SrcMods&: Ops[`0`], Clamp&: Ops[`6`], Omod&: Ops[`7`]);
1056	SelectVOP3Mods(In: N->getOperand(Num: `2`), Src&: Ops[`3`], SrcMods&: Ops[`2`]);
1057	SelectVOP3Mods(In: N->getOperand(Num: `3`), Src&: Ops[`5`], SrcMods&: Ops[`4`]);
1058	Ops[`8`] = N->getOperand(Num: `0`);
1059	Ops[`9`] = N->getOperand(Num: `4`);
1060
1061	// If there are no source modifiers, prefer fmac over fma because it can use
1062	// the smaller VOP2 encoding.
1063	bool UseFMAC = Subtarget->hasDLInsts() &&
1064	cast<ConstantSDNode>(Val&: Ops[`0`])->isZero() &&
1065	cast<ConstantSDNode>(Val&: Ops[`2`])->isZero() &&
1066	cast<ConstantSDNode>(Val&: Ops[`4`])->isZero();
1067	unsigned Opcode = UseFMAC ? AMDGPU::V_FMAC_F32_e64 : AMDGPU::V_FMA_F32_e64;
1068	CurDAG->SelectNodeTo(N, MachineOpc: Opcode, VTs: N->getVTList(), Ops);
1069	}
1070
1071	void AMDGPUDAGToDAGISel::SelectFMUL_W_CHAIN(SDNode *N) {
1072	// src0_modifiers, src0, src1_modifiers, src1, clamp, omod
1073	SDValue Ops[`8`];
1074
1075	SelectVOP3Mods0(In: N->getOperand(Num: `1`), Src&: Ops[`1`], SrcMods&: Ops[`0`], Clamp&: Ops[`4`], Omod&: Ops[`5`]);
1076	SelectVOP3Mods(In: N->getOperand(Num: `2`), Src&: Ops[`3`], SrcMods&: Ops[`2`]);
1077	Ops[`6`] = N->getOperand(Num: `0`);
1078	Ops[`7`] = N->getOperand(Num: `3`);
1079
1080	CurDAG->SelectNodeTo(N, MachineOpc: AMDGPU::V_MUL_F32_e64, VTs: N->getVTList(), Ops);
1081	}
1082
1083	// We need to handle this here because tablegen doesn't support matching
1084	// instructions with multiple outputs.
1085	void AMDGPUDAGToDAGISel::SelectDIV_SCALE(SDNode *N) {
1086	EVT VT = N->getValueType(ResNo: `0`);
1087
1088	assert(VT == MVT::f32 \|\| VT == MVT::f64);
1089
1090	unsigned Opc
1091	= (VT == MVT::f64) ? AMDGPU::V_DIV_SCALE_F64_e64 : AMDGPU::V_DIV_SCALE_F32_e64;
1092
1093	// src0_modifiers, src0, src1_modifiers, src1, src2_modifiers, src2, clamp,
1094	// omod
1095	SDValue Ops[`8`];
1096	SelectVOP3BMods0(In: N->getOperand(Num: `0`), Src&: Ops[`1`], SrcMods&: Ops[`0`], Clamp&: Ops[`6`], Omod&: Ops[`7`]);
1097	SelectVOP3BMods(In: N->getOperand(Num: `1`), Src&: Ops[`3`], SrcMods&: Ops[`2`]);
1098	SelectVOP3BMods(In: N->getOperand(Num: `2`), Src&: Ops[`5`], SrcMods&: Ops[`4`]);
1099	CurDAG->SelectNodeTo(N, MachineOpc: Opc, VTs: N->getVTList(), Ops);
1100	}
1101
1102	// We need to handle this here because tablegen doesn't support matching
1103	// instructions with multiple outputs.
1104	void AMDGPUDAGToDAGISel::SelectMAD_64_32(SDNode *N) {
1105	SDLoc SL(N);
1106	bool Signed = N->getOpcode() == AMDGPUISD::MAD_I64_I32;
1107	unsigned Opc;
1108	if (Subtarget->hasMADIntraFwdBug())
1109	Opc = Signed ? AMDGPU::V_MAD_I64_I32_gfx11_e64
1110	: AMDGPU::V_MAD_U64_U32_gfx11_e64;
1111	else
1112	Opc = Signed ? AMDGPU::V_MAD_I64_I32_e64 : AMDGPU::V_MAD_U64_U32_e64;
1113
1114	SDValue Clamp = CurDAG->getTargetConstant(Val: `0`, DL: SL, VT: MVT::i1);
1115	SDValue Ops[] = { N->getOperand(Num: `0`), N->getOperand(Num: `1`), N->getOperand(Num: `2`),
1116	Clamp };
1117	CurDAG->SelectNodeTo(N, MachineOpc: Opc, VTs: N->getVTList(), Ops);
1118	}
1119
1120	// We need to handle this here because tablegen doesn't support matching
1121	// instructions with multiple outputs.
1122	void AMDGPUDAGToDAGISel::SelectMUL_LOHI(SDNode *N) {
1123	SDLoc SL(N);
1124	bool Signed = N->getOpcode() == ISD::SMUL_LOHI;
1125	unsigned Opc;
1126	if (Subtarget->hasMADIntraFwdBug())
1127	Opc = Signed ? AMDGPU::V_MAD_I64_I32_gfx11_e64
1128	: AMDGPU::V_MAD_U64_U32_gfx11_e64;
1129	else
1130	Opc = Signed ? AMDGPU::V_MAD_I64_I32_e64 : AMDGPU::V_MAD_U64_U32_e64;
1131
1132	SDValue Zero = CurDAG->getTargetConstant(Val: `0`, DL: SL, VT: MVT::i64);
1133	SDValue Clamp = CurDAG->getTargetConstant(Val: `0`, DL: SL, VT: MVT::i1);
1134	SDValue Ops[] = {N->getOperand(Num: `0`), N->getOperand(Num: `1`), Zero, Clamp};
1135	SDNode *Mad = CurDAG->getMachineNode(
1136	Opcode: Opc, dl: SL, VTs: CurDAG->getVTList(VT1: MVT::i64, VT2: MVT::i1), Ops);
1137	if (!SDValue (N, `0`).use_empty()) {
1138	SDValue Sub0 = CurDAG->getTargetConstant(Val: AMDGPU::sub0, DL: SL, VT: MVT::i32);
1139	SDNode *Lo = CurDAG->getMachineNode(Opcode: TargetOpcode::EXTRACT_SUBREG, dl: SL,
1140	VT: MVT::i32, Op1: SDValue (Mad, `0`), Op2: Sub0);
1141	ReplaceUses(F: SDValue (N, `0`), T: SDValue (Lo, `0`));
1142	}
1143	if (!SDValue (N, `1`).use_empty()) {
1144	SDValue Sub1 = CurDAG->getTargetConstant(Val: AMDGPU::sub1, DL: SL, VT: MVT::i32);
1145	SDNode *Hi = CurDAG->getMachineNode(Opcode: TargetOpcode::EXTRACT_SUBREG, dl: SL,
1146	VT: MVT::i32, Op1: SDValue (Mad, `0`), Op2: Sub1);
1147	ReplaceUses(F: SDValue (N, `1`), T: SDValue (Hi, `0`));
1148	}
1149	CurDAG->RemoveDeadNode(N);
1150	}
1151
1152	bool AMDGPUDAGToDAGISel::isDSOffsetLegal(SDValue Base, unsigned Offset) const {
1153	if (!isUInt<`16`>(x: Offset))
1154	return false;
1155
1156	if (!Base \|\| Subtarget->hasUsableDSOffset() \|\|
1157	Subtarget->unsafeDSOffsetFoldingEnabled())
1158	return true;
1159
1160	// On Southern Islands instruction with a negative base value and an offset
1161	// don't seem to work.
1162	return CurDAG->SignBitIsZero(Op: Base);
1163	}
1164
1165	bool AMDGPUDAGToDAGISel::SelectDS1Addr1Offset(SDValue Addr, SDValue &Base,
1166	SDValue &Offset) const {
1167	SDLoc DL(Addr);
1168	if (CurDAG->isBaseWithConstantOffset(Op: Addr)) {
1169	SDValue N0 = Addr.getOperand(i: `0`);
1170	SDValue N1 = Addr.getOperand(i: `1`);
1171	ConstantSDNode *C1 = cast<ConstantSDNode>(Val&: N1);
1172	if (isDSOffsetLegal(Base: N0, Offset: C1->getSExtValue())) {
1173	// (add n0, c0)
1174	Base = N0;
1175	Offset = CurDAG->getTargetConstant(Val: C1->getZExtValue(), DL, VT: MVT::i16);
1176	return true;
1177	}
1178	} else if (Addr.getOpcode() == ISD::SUB) {
1179	// sub C, x -> add (sub 0, x), C
1180	if (const ConstantSDNode *C = dyn_cast<ConstantSDNode>(Val: Addr.getOperand(i: `0`))) {
1181	int64_t ByteOffset = C->getSExtValue();
1182	if (isDSOffsetLegal(Base: SDValue (), Offset: ByteOffset)) {
1183	SDValue Zero = CurDAG->getTargetConstant(Val: `0`, DL, VT: MVT::i32);
1184
1185	// XXX - This is kind of hacky. Create a dummy sub node so we can check
1186	// the known bits in isDSOffsetLegal. We need to emit the selected node
1187	// here, so this is thrown away.
1188	SDValue Sub = CurDAG->getNode(Opcode: ISD::SUB, DL, VT: MVT::i32,
1189	N1: Zero, N2: Addr.getOperand(i: `1`));
1190
1191	if (isDSOffsetLegal(Base: Sub, Offset: ByteOffset)) {
1192	SmallVector<SDValue, `3`> Opnds;
1193	Opnds.push_back(Elt: Zero);
1194	Opnds.push_back(Elt: Addr.getOperand(i: `1`));
1195
1196	// FIXME: Select to VOP3 version for with-carry.
1197	unsigned SubOp = AMDGPU::V_SUB_CO_U32_e32;
1198	if (Subtarget->hasAddNoCarry()) {
1199	SubOp = AMDGPU::V_SUB_U32_e64;
1200	Opnds.push_back(
1201	Elt: CurDAG->getTargetConstant(Val: `0`, DL: {}, VT: MVT::i1)); // clamp bit
1202	}
1203
1204	MachineSDNode *MachineSub =
1205	CurDAG->getMachineNode(Opcode: SubOp, dl: DL, VT: MVT::i32, Ops: Opnds);
1206
1207	Base = SDValue (MachineSub, `0`);
1208	Offset = CurDAG->getTargetConstant(Val: ByteOffset, DL, VT: MVT::i16);
1209	return true;
1210	}
1211	}
1212	}
1213	} else if (const ConstantSDNode *CAddr = dyn_cast<ConstantSDNode>(Val&: Addr)) {
1214	// If we have a constant address, prefer to put the constant into the
1215	// offset. This can save moves to load the constant address since multiple
1216	// operations can share the zero base address register, and enables merging
1217	// into read2 / write2 instructions.
1218
1219	SDLoc DL(Addr);
1220
1221	if (isDSOffsetLegal(Base: SDValue (), Offset: CAddr->getZExtValue())) {
1222	SDValue Zero = CurDAG->getTargetConstant(Val: `0`, DL, VT: MVT::i32);
1223	MachineSDNode *MovZero = CurDAG->getMachineNode(Opcode: AMDGPU::V_MOV_B32_e32,
1224	dl: DL, VT: MVT::i32, Op1: Zero);
1225	Base = SDValue (MovZero, `0`);
1226	Offset = CurDAG->getTargetConstant(Val: CAddr->getZExtValue(), DL, VT: MVT::i16);
1227	return true;
1228	}
1229	}
1230
1231	// default case
1232	Base = Addr;
1233	Offset = CurDAG->getTargetConstant(Val: `0`, DL: SDLoc (Addr), VT: MVT::i16);
1234	return true;
1235	}
1236
1237	bool AMDGPUDAGToDAGISel::isDSOffset2Legal(SDValue Base, unsigned Offset0,
1238	unsigned Offset1,
1239	unsigned Size) const {
1240	if (Offset0 % Size != `0` \|\| Offset1 % Size != `0`)
1241	return false;
1242	if (!isUInt<`8`>(x: Offset0 / Size) \|\| !isUInt<`8`>(x: Offset1 / Size))
1243	return false;
1244
1245	if (!Base \|\| Subtarget->hasUsableDSOffset() \|\|
1246	Subtarget->unsafeDSOffsetFoldingEnabled())
1247	return true;
1248
1249	// On Southern Islands instruction with a negative base value and an offset
1250	// don't seem to work.
1251	return CurDAG->SignBitIsZero(Op: Base);
1252	}
1253
1254	// Return whether the operation has NoUnsignedWrap property.
1255	static bool isNoUnsignedWrap(SDValue Addr) {
1256	return (Addr.getOpcode() == ISD::ADD &&
1257	Addr ->getFlags().hasNoUnsignedWrap()) \|\|
1258	Addr ->getOpcode() == ISD::OR;
1259	}
1260
1261	// Check that the base address of flat scratch load/store in the form of `base +
1262	// offset` is legal to be put in SGPR/VGPR (i.e. unsigned per hardware
1263	// requirement). We always treat the first operand as the base address here.
1264	bool AMDGPUDAGToDAGISel::isFlatScratchBaseLegal(SDValue Addr) const {
1265	if (isNoUnsignedWrap(Addr))
1266	return true;
1267
1268	// Starting with GFX12, VADDR and SADDR fields in VSCRATCH can use negative
1269	// values.
1270	if (Subtarget->hasSignedScratchOffsets())
1271	return true;
1272
1273	auto LHS = Addr.getOperand(i: `0`);
1274	auto RHS = Addr.getOperand(i: `1`);
1275
1276	// If the immediate offset is negative and within certain range, the base
1277	// address cannot also be negative. If the base is also negative, the sum
1278	// would be either negative or much larger than the valid range of scratch
1279	// memory a thread can access.
1280	ConstantSDNode ImmOp = nullptr*;
1281	if (Addr.getOpcode() == ISD::ADD && (ImmOp = dyn_cast<ConstantSDNode>(Val&: RHS))) {
1282	if (ImmOp->getSExtValue() < `0` && ImmOp->getSExtValue() > -`0x40000000`)
1283	return true;
1284	}
1285
1286	return CurDAG->SignBitIsZero(Op: LHS);
1287	}
1288
1289	// Check address value in SGPR/VGPR are legal for flat scratch in the form
1290	// of: SGPR + VGPR.
1291	bool AMDGPUDAGToDAGISel::isFlatScratchBaseLegalSV(SDValue Addr) const {
1292	if (isNoUnsignedWrap(Addr))
1293	return true;
1294
1295	// Starting with GFX12, VADDR and SADDR fields in VSCRATCH can use negative
1296	// values.
1297	if (Subtarget->hasSignedScratchOffsets())
1298	return true;
1299
1300	auto LHS = Addr.getOperand(i: `0`);
1301	auto RHS = Addr.getOperand(i: `1`);
1302	return CurDAG->SignBitIsZero(Op: RHS) && CurDAG->SignBitIsZero(Op: LHS);
1303	}
1304
1305	// Check address value in SGPR/VGPR are legal for flat scratch in the form
1306	// of: SGPR + VGPR + Imm.
1307	bool AMDGPUDAGToDAGISel::isFlatScratchBaseLegalSVImm(SDValue Addr) const {
1308	// Starting with GFX12, VADDR and SADDR fields in VSCRATCH can use negative
1309	// values.
1310	if (AMDGPU::isGFX12Plus(STI: *Subtarget))
1311	return true;
1312
1313	auto Base = Addr.getOperand(i: `0`);
1314	auto *RHSImm = cast<ConstantSDNode>(Val: Addr.getOperand(i: `1`));
1315	// If the immediate offset is negative and within certain range, the base
1316	// address cannot also be negative. If the base is also negative, the sum
1317	// would be either negative or much larger than the valid range of scratch
1318	// memory a thread can access.
1319	if (isNoUnsignedWrap(Addr: Base) &&
1320	(isNoUnsignedWrap(Addr) \|\|
1321	(RHSImm->getSExtValue() < `0` && RHSImm->getSExtValue() > -`0x40000000`)))
1322	return true;
1323
1324	auto LHS = Base.getOperand(i: `0`);
1325	auto RHS = Base.getOperand(i: `1`);
1326	return CurDAG->SignBitIsZero(Op: RHS) && CurDAG->SignBitIsZero(Op: LHS);
1327	}
1328
1329	// TODO: If offset is too big, put low 16-bit into offset.
1330	bool AMDGPUDAGToDAGISel::SelectDS64Bit4ByteAligned(SDValue Addr, SDValue &Base,
1331	SDValue &Offset0,
1332	SDValue &Offset1) const {
1333	return SelectDSReadWrite2(Ptr: Addr, Base, Offset0, Offset1, Size: `4`);
1334	}
1335
1336	bool AMDGPUDAGToDAGISel::SelectDS128Bit8ByteAligned(SDValue Addr, SDValue &Base,
1337	SDValue &Offset0,
1338	SDValue &Offset1) const {
1339	return SelectDSReadWrite2(Ptr: Addr, Base, Offset0, Offset1, Size: `8`);
1340	}
1341
1342	bool AMDGPUDAGToDAGISel::SelectDSReadWrite2(SDValue Addr, SDValue &Base,
1343	SDValue &Offset0, SDValue &Offset1,
1344	unsigned Size) const {
1345	SDLoc DL(Addr);
1346
1347	if (CurDAG->isBaseWithConstantOffset(Op: Addr)) {
1348	SDValue N0 = Addr.getOperand(i: `0`);
1349	SDValue N1 = Addr.getOperand(i: `1`);
1350	ConstantSDNode *C1 = cast<ConstantSDNode>(Val&: N1);
1351	unsigned OffsetValue0 = C1->getZExtValue();
1352	unsigned OffsetValue1 = OffsetValue0 + Size;
1353
1354	// (add n0, c0)
1355	if (isDSOffset2Legal(Base: N0, Offset0: OffsetValue0, Offset1: OffsetValue1, Size)) {
1356	Base = N0;
1357	Offset0 = CurDAG->getTargetConstant(Val: OffsetValue0 / Size, DL, VT: MVT::i32);
1358	Offset1 = CurDAG->getTargetConstant(Val: OffsetValue1 / Size, DL, VT: MVT::i32);
1359	return true;
1360	}
1361	} else if (Addr.getOpcode() == ISD::SUB) {
1362	// sub C, x -> add (sub 0, x), C
1363	if (const ConstantSDNode *C =
1364	dyn_cast<ConstantSDNode>(Val: Addr.getOperand(i: `0`))) {
1365	unsigned OffsetValue0 = C->getZExtValue();
1366	unsigned OffsetValue1 = OffsetValue0 + Size;
1367
1368	if (isDSOffset2Legal(Base: SDValue (), Offset0: OffsetValue0, Offset1: OffsetValue1, Size)) {
1369	SDLoc DL(Addr);
1370	SDValue Zero = CurDAG->getTargetConstant(Val: `0`, DL, VT: MVT::i32);
1371
1372	// XXX - This is kind of hacky. Create a dummy sub node so we can check
1373	// the known bits in isDSOffsetLegal. We need to emit the selected node
1374	// here, so this is thrown away.
1375	SDValue Sub =
1376	CurDAG->getNode(Opcode: ISD::SUB, DL, VT: MVT::i32, N1: Zero, N2: Addr.getOperand(i: `1`));
1377
1378	if (isDSOffset2Legal(Base: Sub, Offset0: OffsetValue0, Offset1: OffsetValue1, Size)) {
1379	SmallVector<SDValue, `3`> Opnds;
1380	Opnds.push_back(Elt: Zero);
1381	Opnds.push_back(Elt: Addr.getOperand(i: `1`));
1382	unsigned SubOp = AMDGPU::V_SUB_CO_U32_e32;
1383	if (Subtarget->hasAddNoCarry()) {
1384	SubOp = AMDGPU::V_SUB_U32_e64;
1385	Opnds.push_back(
1386	Elt: CurDAG->getTargetConstant(Val: `0`, DL: {}, VT: MVT::i1)); // clamp bit
1387	}
1388
1389	MachineSDNode *MachineSub = CurDAG->getMachineNode(
1390	Opcode: SubOp, dl: DL, VT: MVT::getIntegerVT(BitWidth: Size * `8`), Ops: Opnds);
1391
1392	Base = SDValue (MachineSub, `0`);
1393	Offset0 =
1394	CurDAG->getTargetConstant(Val: OffsetValue0 / Size, DL, VT: MVT::i32);
1395	Offset1 =
1396	CurDAG->getTargetConstant(Val: OffsetValue1 / Size, DL, VT: MVT::i32);
1397	return true;
1398	}
1399	}
1400	}
1401	} else if (const ConstantSDNode *CAddr = dyn_cast<ConstantSDNode>(Val&: Addr)) {
1402	unsigned OffsetValue0 = CAddr->getZExtValue();
1403	unsigned OffsetValue1 = OffsetValue0 + Size;
1404
1405	if (isDSOffset2Legal(Base: SDValue (), Offset0: OffsetValue0, Offset1: OffsetValue1, Size)) {
1406	SDValue Zero = CurDAG->getTargetConstant(Val: `0`, DL, VT: MVT::i32);
1407	MachineSDNode *MovZero =
1408	CurDAG->getMachineNode(Opcode: AMDGPU::V_MOV_B32_e32, dl: DL, VT: MVT::i32, Op1: Zero);
1409	Base = SDValue (MovZero, `0`);
1410	Offset0 = CurDAG->getTargetConstant(Val: OffsetValue0 / Size, DL, VT: MVT::i32);
1411	Offset1 = CurDAG->getTargetConstant(Val: OffsetValue1 / Size, DL, VT: MVT::i32);
1412	return true;
1413	}
1414	}
1415
1416	// default case
1417
1418	Base = Addr;
1419	Offset0 = CurDAG->getTargetConstant(Val: `0`, DL, VT: MVT::i32);
1420	Offset1 = CurDAG->getTargetConstant(Val: `1`, DL, VT: MVT::i32);
1421	return true;
1422	}
1423
1424	bool AMDGPUDAGToDAGISel::SelectMUBUF(SDValue Addr, SDValue &Ptr, SDValue &VAddr,
1425	SDValue &SOffset, SDValue &Offset,
1426	SDValue &Offen, SDValue &Idxen,
1427	SDValue &Addr64) const {
1428	// Subtarget prefers to use flat instruction
1429	// FIXME: This should be a pattern predicate and not reach here
1430	if (Subtarget->useFlatForGlobal())
1431	return false;
1432
1433	SDLoc DL(Addr);
1434
1435	Idxen = CurDAG->getTargetConstant(Val: `0`, DL, VT: MVT::i1);
1436	Offen = CurDAG->getTargetConstant(Val: `0`, DL, VT: MVT::i1);
1437	Addr64 = CurDAG->getTargetConstant(Val: `0`, DL, VT: MVT::i1);
1438	SOffset = Subtarget->hasRestrictedSOffset()
1439	? CurDAG->getRegister(Reg: AMDGPU::SGPR_NULL, VT: MVT::i32)
1440	: CurDAG->getTargetConstant(Val: `0`, DL, VT: MVT::i32);
1441
1442	ConstantSDNode C1 = nullptr*;
1443	SDValue N0 = Addr;
1444	if (CurDAG->isBaseWithConstantOffset(Op: Addr)) {
1445	C1 = cast<ConstantSDNode>(Val: Addr.getOperand(i: `1`));
1446	if (isUInt<`32`>(x: C1->getZExtValue()))
1447	N0 = Addr.getOperand(i: `0`);
1448	else
1449	C1 = nullptr;
1450	}
1451
1452	if (N0.getOpcode() == ISD::ADD) {
1453	// (add N2, N3) -> addr64, or
1454	// (add (add N2, N3), C1) -> addr64
1455	SDValue N2 = N0.getOperand(i: `0`);
1456	SDValue N3 = N0.getOperand(i: `1`);
1457	Addr64 = CurDAG->getTargetConstant(Val: `1`, DL, VT: MVT::i1);
1458
1459	if (N2 ->isDivergent()) {
1460	if (N3 ->isDivergent()) {
1461	// Both N2 and N3 are divergent. Use N0 (the result of the add) as the
1462	// addr64, and construct the resource from a 0 address.
1463	Ptr = SDValue (buildSMovImm64(DL, Imm: `0`, VT: MVT::v2i32), `0`);
1464	VAddr = N0;
1465	} else {
1466	// N2 is divergent, N3 is not.
1467	Ptr = N3;
1468	VAddr = N2;
1469	}
1470	} else {
1471	// N2 is not divergent.
1472	Ptr = N2;
1473	VAddr = N3;
1474	}
1475	Offset = CurDAG->getTargetConstant(Val: `0`, DL, VT: MVT::i32);
1476	} else if (N0 ->isDivergent()) {
1477	// N0 is divergent. Use it as the addr64, and construct the resource from a
1478	// 0 address.
1479	Ptr = SDValue (buildSMovImm64(DL, Imm: `0`, VT: MVT::v2i32), `0`);
1480	VAddr = N0;
1481	Addr64 = CurDAG->getTargetConstant(Val: `1`, DL, VT: MVT::i1);
1482	} else {
1483	// N0 -> offset, or
1484	// (N0 + C1) -> offset
1485	VAddr = CurDAG->getTargetConstant(Val: `0`, DL, VT: MVT::i32);
1486	Ptr = N0;
1487	}
1488
1489	if (!C1) {
1490	// No offset.
1491	Offset = CurDAG->getTargetConstant(Val: `0`, DL, VT: MVT::i32);
1492	return true;
1493	}
1494
1495	const SIInstrInfo *TII = Subtarget->getInstrInfo();
1496	if (TII->isLegalMUBUFImmOffset(Imm: C1->getZExtValue())) {
1497	// Legal offset for instruction.
1498	Offset = CurDAG->getTargetConstant(Val: C1->getZExtValue(), DL, VT: MVT::i32);
1499	return true;
1500	}
1501
1502	// Illegal offset, store it in soffset.
1503	Offset = CurDAG->getTargetConstant(Val: `0`, DL, VT: MVT::i32);
1504	SOffset =
1505	SDValue (CurDAG->getMachineNode(
1506	Opcode: AMDGPU::S_MOV_B32, dl: DL, VT: MVT::i32,
1507	Op1: CurDAG->getTargetConstant(Val: C1->getZExtValue(), DL, VT: MVT::i32)),
1508	`0`);
1509	return true;
1510	}
1511
1512	bool AMDGPUDAGToDAGISel::SelectMUBUFAddr64(SDValue Addr, SDValue &SRsrc,
1513	SDValue &VAddr, SDValue &SOffset,
1514	SDValue &Offset) const {
1515	SDValue Ptr, Offen, Idxen, Addr64;
1516
1517	// addr64 bit was removed for volcanic islands.
1518	// FIXME: This should be a pattern predicate and not reach here
1519	if (!Subtarget->hasAddr64())
1520	return false;
1521
1522	if (!SelectMUBUF(Addr, Ptr, VAddr, SOffset, Offset, Offen, Idxen, Addr64))
1523	return false;
1524
1525	ConstantSDNode *C = cast<ConstantSDNode>(Val&: Addr64);
1526	if (C->getSExtValue()) {
1527	SDLoc DL(Addr);
1528
1529	const SITargetLowering& Lowering =
1530	*static_cast<const SITargetLowering*>(getTargetLowering());
1531
1532	SRsrc = SDValue (Lowering.wrapAddr64Rsrc(DAG&: *CurDAG, DL, Ptr), `0`);
1533	return true;
1534	}
1535
1536	return false;
1537	}
1538
1539	std::pair<SDValue, SDValue> AMDGPUDAGToDAGISel::foldFrameIndex(SDValue N) const {
1540	SDLoc DL(N);
1541
1542	auto *FI = dyn_cast<FrameIndexSDNode>(Val&: N);
1543	SDValue TFI =
1544	FI ? CurDAG->getTargetFrameIndex(FI: FI->getIndex(), VT: FI->getValueType(ResNo: `0`)) : N;
1545
1546	// We rebase the base address into an absolute stack address and hence
1547	// use constant 0 for soffset. This value must be retained until
1548	// frame elimination and eliminateFrameIndex will choose the appropriate
1549	// frame register if need be.
1550	return std::pair(TFI, CurDAG->getTargetConstant(Val: `0`, DL, VT: MVT::i32));
1551	}
1552
1553	bool AMDGPUDAGToDAGISel::SelectMUBUFScratchOffen(SDNode *Parent,
1554	SDValue Addr, SDValue &Rsrc,
1555	SDValue &VAddr, SDValue &SOffset,
1556	SDValue &ImmOffset) const {
1557
1558	SDLoc DL(Addr);
1559	MachineFunction &MF = CurDAG->getMachineFunction();
1560	const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
1561
1562	Rsrc = CurDAG->getRegister(Reg: Info->getScratchRSrcReg(), VT: MVT::v4i32);
1563
1564	if (ConstantSDNode *CAddr = dyn_cast<ConstantSDNode>(Val&: Addr)) {
1565	int64_t Imm = CAddr->getSExtValue();
1566	const int64_t NullPtr =
1567	AMDGPUTargetMachine::getNullPointerValue(AddrSpace: AMDGPUAS::PRIVATE_ADDRESS);
1568	// Don't fold null pointer.
1569	if (Imm != NullPtr) {
1570	const uint32_t MaxOffset = SIInstrInfo::getMaxMUBUFImmOffset(ST: *Subtarget);
1571	SDValue HighBits =
1572	CurDAG->getTargetConstant(Val: Imm & ~MaxOffset, DL, VT: MVT::i32);
1573	MachineSDNode *MovHighBits = CurDAG->getMachineNode(
1574	Opcode: AMDGPU::V_MOV_B32_e32, dl: DL, VT: MVT::i32, Op1: HighBits);
1575	VAddr = SDValue (MovHighBits, `0`);
1576
1577	SOffset = CurDAG->getTargetConstant(Val: `0`, DL, VT: MVT::i32);
1578	ImmOffset = CurDAG->getTargetConstant(Val: Imm & MaxOffset, DL, VT: MVT::i32);
1579	return true;
1580	}
1581	}
1582
1583	if (CurDAG->isBaseWithConstantOffset(Op: Addr)) {
1584	// (add n0, c1)
1585
1586	SDValue N0 = Addr.getOperand(i: `0`);
1587	uint64_t C1 = Addr.getConstantOperandVal(i: `1`);
1588
1589	// Offsets in vaddr must be positive if range checking is enabled.
1590	//
1591	// The total computation of vaddr + soffset + offset must not overflow. If
1592	// vaddr is negative, even if offset is 0 the sgpr offset add will end up
1593	// overflowing.
1594	//
1595	// Prior to gfx9, MUBUF instructions with the vaddr offset enabled would
1596	// always perform a range check. If a negative vaddr base index was used,
1597	// this would fail the range check. The overall address computation would
1598	// compute a valid address, but this doesn't happen due to the range
1599	// check. For out-of-bounds MUBUF loads, a 0 is returned.
1600	//
1601	// Therefore it should be safe to fold any VGPR offset on gfx9 into the
1602	// MUBUF vaddr, but not on older subtargets which can only do this if the
1603	// sign bit is known 0.
1604	const SIInstrInfo *TII = Subtarget->getInstrInfo();
1605	if (TII->isLegalMUBUFImmOffset(Imm: C1) &&
1606	(!Subtarget->privateMemoryResourceIsRangeChecked() \|\|
1607	CurDAG->SignBitIsZero(Op: N0))) {
1608	std::tie(args&: VAddr, args&: SOffset) = foldFrameIndex(N: N0);
1609	ImmOffset = CurDAG->getTargetConstant(Val: C1, DL, VT: MVT::i32);
1610	return true;
1611	}
1612	}
1613
1614	// (node)
1615	std::tie(args&: VAddr, args&: SOffset) = foldFrameIndex(N: Addr);
1616	ImmOffset = CurDAG->getTargetConstant(Val: `0`, DL, VT: MVT::i32);
1617	return true;
1618	}
1619
1620	static bool IsCopyFromSGPR(const SIRegisterInfo &TRI, SDValue Val) {
1621	if (Val.getOpcode() != ISD::CopyFromReg)
1622	return false;
1623	auto Reg = cast<RegisterSDNode>(Val: Val.getOperand(i: `1`))->getReg();
1624	if (!Reg.isPhysical())
1625	return false;
1626	const auto *RC = TRI.getPhysRegBaseClass(Reg);
1627	return RC && TRI.isSGPRClass(RC);
1628	}
1629
1630	bool AMDGPUDAGToDAGISel::SelectMUBUFScratchOffset(SDNode *Parent,
1631	SDValue Addr,
1632	SDValue &SRsrc,
1633	SDValue &SOffset,
1634	SDValue &Offset) const {
1635	const SIRegisterInfo *TRI =
1636	static_cast<const SIRegisterInfo *>(Subtarget->getRegisterInfo());
1637	const SIInstrInfo *TII = Subtarget->getInstrInfo();
1638	MachineFunction &MF = CurDAG->getMachineFunction();
1639	const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
1640	SDLoc DL(Addr);
1641
1642	// CopyFromReg <sgpr>
1643	if (IsCopyFromSGPR(TRI: *TRI, Val: Addr)) {
1644	SRsrc = CurDAG->getRegister(Reg: Info->getScratchRSrcReg(), VT: MVT::v4i32);
1645	SOffset = Addr;
1646	Offset = CurDAG->getTargetConstant(Val: `0`, DL, VT: MVT::i32);
1647	return true;
1648	}
1649
1650	ConstantSDNode *CAddr;
1651	if (Addr.getOpcode() == ISD::ADD) {
1652	// Add (CopyFromReg <sgpr>) <constant>
1653	CAddr = dyn_cast<ConstantSDNode>(Val: Addr.getOperand(i: `1`));
1654	if (!CAddr \|\| !TII->isLegalMUBUFImmOffset(Imm: CAddr->getZExtValue()))
1655	return false;
1656	if (!IsCopyFromSGPR(TRI: *TRI, Val: Addr.getOperand(i: `0`)))
1657	return false;
1658
1659	SOffset = Addr.getOperand(i: `0`);
1660	} else if ((CAddr = dyn_cast<ConstantSDNode>(Val&: Addr)) &&
1661	TII->isLegalMUBUFImmOffset(Imm: CAddr->getZExtValue())) {
1662	// <constant>
1663	SOffset = CurDAG->getTargetConstant(Val: `0`, DL, VT: MVT::i32);
1664	} else {
1665	return false;
1666	}
1667
1668	SRsrc = CurDAG->getRegister(Reg: Info->getScratchRSrcReg(), VT: MVT::v4i32);
1669
1670	Offset = CurDAG->getTargetConstant(Val: CAddr->getZExtValue(), DL, VT: MVT::i32);
1671	return true;
1672	}
1673
1674	bool AMDGPUDAGToDAGISel::SelectMUBUFOffset(SDValue Addr, SDValue &SRsrc,
1675	SDValue &SOffset, SDValue &Offset
1676	) const {
1677	SDValue Ptr, VAddr, Offen, Idxen, Addr64;
1678	const SIInstrInfo *TII = Subtarget->getInstrInfo();
1679
1680	if (!SelectMUBUF(Addr, Ptr, VAddr, SOffset, Offset, Offen, Idxen, Addr64))
1681	return false;
1682
1683	if (!cast<ConstantSDNode>(Val&: Offen)->getSExtValue() &&
1684	!cast<ConstantSDNode>(Val&: Idxen)->getSExtValue() &&
1685	!cast<ConstantSDNode>(Val&: Addr64)->getSExtValue()) {
1686	uint64_t Rsrc = TII->getDefaultRsrcDataFormat() \|
1687	maskTrailingOnes<uint64_t>(N: `32`); // Size
1688	SDLoc DL(Addr);
1689
1690	const SITargetLowering& Lowering =
1691	*static_cast<const SITargetLowering*>(getTargetLowering());
1692
1693	SRsrc = SDValue (Lowering.buildRSRC(DAG&: *CurDAG, DL, Ptr, RsrcDword1: `0`, RsrcDword2And3: Rsrc), `0`);
1694	return true;
1695	}
1696	return false;
1697	}
1698
1699	bool AMDGPUDAGToDAGISel::SelectBUFSOffset(SDValue ByteOffsetNode,
1700	SDValue &SOffset) const {
1701	if (Subtarget->hasRestrictedSOffset() && isNullConstant(V: ByteOffsetNode)) {
1702	SOffset = CurDAG->getRegister(Reg: AMDGPU::SGPR_NULL, VT: MVT::i32);
1703	return true;
1704	}
1705
1706	SOffset = ByteOffsetNode;
1707	return true;
1708	}
1709
1710	// Find a load or store from corresponding pattern root.
1711	// Roots may be build_vector, bitconvert or their combinations.
1712	static MemSDNode* findMemSDNode(SDNode *N) {
1713	N = AMDGPUTargetLowering::stripBitcast(Val: SDValue (N,`0`)).getNode();
1714	if (MemSDNode *MN = dyn_cast<MemSDNode>(Val: N))
1715	return MN;
1716	assert(isa<BuildVectorSDNode>(N));
1717	for (SDValue V : N->op_values())
1718	if (MemSDNode *MN =
1719	dyn_cast<MemSDNode>(Val: AMDGPUTargetLowering::stripBitcast(Val: V)))
1720	return MN;
1721	llvm_unreachable("cannot find MemSDNode in the pattern!");
1722	}
1723
1724	bool AMDGPUDAGToDAGISel::SelectFlatOffsetImpl(SDNode *N, SDValue Addr,
1725	SDValue &VAddr, SDValue &Offset,
1726	uint64_t FlatVariant) const {
1727	int64_t OffsetVal = `0`;
1728
1729	unsigned AS = findMemSDNode(N)->getAddressSpace();
1730
1731	bool CanHaveFlatSegmentOffsetBug =
1732	Subtarget->hasFlatSegmentOffsetBug() &&
1733	FlatVariant == SIInstrFlags::FLAT &&
1734	(AS == AMDGPUAS::FLAT_ADDRESS \|\| AS == AMDGPUAS::GLOBAL_ADDRESS);
1735
1736	if (Subtarget->hasFlatInstOffsets() && !CanHaveFlatSegmentOffsetBug) {
1737	SDValue N0, N1;
1738	if (isBaseWithConstantOffset64(Addr, LHS&: N0, RHS&: N1) &&
1739	(FlatVariant != SIInstrFlags::FlatScratch \|\|
1740	isFlatScratchBaseLegal(Addr))) {
1741	int64_t COffsetVal = cast<ConstantSDNode>(Val&: N1)->getSExtValue();
1742
1743	const SIInstrInfo *TII = Subtarget->getInstrInfo();
1744	if (TII->isLegalFLATOffset(Offset: COffsetVal, AddrSpace: AS, FlatVariant)) {
1745	Addr = N0;
1746	OffsetVal = COffsetVal;
1747	} else {
1748	// If the offset doesn't fit, put the low bits into the offset field and
1749	// add the rest.
1750	//
1751	// For a FLAT instruction the hardware decides whether to access
1752	// global/scratch/shared memory based on the high bits of vaddr,
1753	// ignoring the offset field, so we have to ensure that when we add
1754	// remainder to vaddr it still points into the same underlying object.
1755	// The easiest way to do that is to make sure that we split the offset
1756	// into two pieces that are both >= 0 or both <= 0.
1757
1758	SDLoc DL(N);
1759	uint64_t RemainderOffset;
1760
1761	std::tie(args&: OffsetVal, args&: RemainderOffset) =
1762	TII->splitFlatOffset(COffsetVal, AddrSpace: AS, FlatVariant);
1763
1764	SDValue AddOffsetLo =
1765	getMaterializedScalarImm32(Val: Lo_32(Value: RemainderOffset), DL);
1766	SDValue Clamp = CurDAG->getTargetConstant(Val: `0`, DL, VT: MVT::i1);
1767
1768	if (Addr.getValueType().getSizeInBits() == `32`) {
1769	SmallVector<SDValue, `3`> Opnds;
1770	Opnds.push_back(Elt: N0);
1771	Opnds.push_back(Elt: AddOffsetLo);
1772	unsigned AddOp = AMDGPU::V_ADD_CO_U32_e32;
1773	if (Subtarget->hasAddNoCarry()) {
1774	AddOp = AMDGPU::V_ADD_U32_e64;
1775	Opnds.push_back(Elt: Clamp);
1776	}
1777	Addr = SDValue (CurDAG->getMachineNode(Opcode: AddOp, dl: DL, VT: MVT::i32, Ops: Opnds), `0`);
1778	} else {
1779	// TODO: Should this try to use a scalar add pseudo if the base address
1780	// is uniform and saddr is usable?
1781	SDValue Sub0 = CurDAG->getTargetConstant(Val: AMDGPU::sub0, DL, VT: MVT::i32);
1782	SDValue Sub1 = CurDAG->getTargetConstant(Val: AMDGPU::sub1, DL, VT: MVT::i32);
1783
1784	SDNode *N0Lo = CurDAG->getMachineNode(Opcode: TargetOpcode::EXTRACT_SUBREG,
1785	dl: DL, VT: MVT::i32, Op1: N0, Op2: Sub0);
1786	SDNode *N0Hi = CurDAG->getMachineNode(Opcode: TargetOpcode::EXTRACT_SUBREG,
1787	dl: DL, VT: MVT::i32, Op1: N0, Op2: Sub1);
1788
1789	SDValue AddOffsetHi =
1790	getMaterializedScalarImm32(Val: Hi_32(Value: RemainderOffset), DL);
1791
1792	SDVTList VTs = CurDAG->getVTList(VT1: MVT::i32, VT2: MVT::i1);
1793
1794	SDNode *Add =
1795	CurDAG->getMachineNode(Opcode: AMDGPU::V_ADD_CO_U32_e64, dl: DL, VTs,
1796	Ops: {AddOffsetLo, SDValue (N0Lo, `0`), Clamp});
1797
1798	SDNode *Addc = CurDAG->getMachineNode(
1799	Opcode: AMDGPU::V_ADDC_U32_e64, dl: DL, VTs,
1800	Ops: {AddOffsetHi, SDValue (N0Hi, `0`), SDValue (Add, `1`), Clamp});
1801
1802	SDValue RegSequenceArgs[] = {
1803	CurDAG->getTargetConstant(Val: AMDGPU::VReg_64RegClassID, DL, VT: MVT::i32),
1804	SDValue (Add, `0`), Sub0, SDValue (Addc, `0`), Sub1};
1805
1806	Addr = SDValue (CurDAG->getMachineNode(Opcode: AMDGPU::REG_SEQUENCE, dl: DL,
1807	VT: MVT::i64, Ops: RegSequenceArgs),
1808	`0`);
1809	}
1810	}
1811	}
1812	}
1813
1814	VAddr = Addr;
1815	Offset = CurDAG->getSignedTargetConstant(Val: OffsetVal, DL: SDLoc (), VT: MVT::i32);
1816	return true;
1817	}
1818
1819	bool AMDGPUDAGToDAGISel::SelectFlatOffset(SDNode *N, SDValue Addr,
1820	SDValue &VAddr,
1821	SDValue &Offset) const {
1822	return SelectFlatOffsetImpl(N, Addr, VAddr, Offset, FlatVariant: SIInstrFlags::FLAT);
1823	}
1824
1825	bool AMDGPUDAGToDAGISel::SelectGlobalOffset(SDNode *N, SDValue Addr,
1826	SDValue &VAddr,
1827	SDValue &Offset) const {
1828	return SelectFlatOffsetImpl(N, Addr, VAddr, Offset, FlatVariant: SIInstrFlags::FlatGlobal);
1829	}
1830
1831	bool AMDGPUDAGToDAGISel::SelectScratchOffset(SDNode *N, SDValue Addr,
1832	SDValue &VAddr,
1833	SDValue &Offset) const {
1834	return SelectFlatOffsetImpl(N, Addr, VAddr, Offset,
1835	FlatVariant: SIInstrFlags::FlatScratch);
1836	}
1837
1838	// If this matches zero_extend i32:x, return x
1839	static SDValue matchZExtFromI32(SDValue Op) {
1840	if (Op.getOpcode() != ISD::ZERO_EXTEND)
1841	return SDValue ();
1842
1843	SDValue ExtSrc = Op.getOperand(i: `0`);
1844	return (ExtSrc.getValueType() == MVT::i32) ? ExtSrc : SDValue ();
1845	}
1846
1847	// Match (64-bit SGPR base) + (zext vgpr offset) + sext(imm offset)
1848	bool AMDGPUDAGToDAGISel::SelectGlobalSAddr(SDNode *N,
1849	SDValue Addr,
1850	SDValue &SAddr,
1851	SDValue &VOffset,
1852	SDValue &Offset) const {
1853	int64_t ImmOffset = `0`;
1854
1855	// Match the immediate offset first, which canonically is moved as low as
1856	// possible.
1857
1858	SDValue LHS, RHS;
1859	if (isBaseWithConstantOffset64(Addr, LHS, RHS)) {
1860	int64_t COffsetVal = cast<ConstantSDNode>(Val&: RHS)->getSExtValue();
1861	const SIInstrInfo *TII = Subtarget->getInstrInfo();
1862
1863	if (TII->isLegalFLATOffset(Offset: COffsetVal, AddrSpace: AMDGPUAS::GLOBAL_ADDRESS,
1864	FlatVariant: SIInstrFlags::FlatGlobal)) {
1865	Addr = LHS;
1866	ImmOffset = COffsetVal;
1867	} else if (!LHS ->isDivergent()) {
1868	if (COffsetVal > `0`) {
1869	SDLoc SL(N);
1870	// saddr + large_offset -> saddr +
1871	// (voffset = large_offset & ~MaxOffset) +
1872	// (large_offset & MaxOffset);
1873	int64_t SplitImmOffset, RemainderOffset;
1874	std::tie(args&: SplitImmOffset, args&: RemainderOffset) = TII->splitFlatOffset(
1875	COffsetVal, AddrSpace: AMDGPUAS::GLOBAL_ADDRESS, FlatVariant: SIInstrFlags::FlatGlobal);
1876
1877	if (isUInt<`32`>(x: RemainderOffset)) {
1878	SDNode *VMov = CurDAG->getMachineNode(
1879	Opcode: AMDGPU::V_MOV_B32_e32, dl: SL, VT: MVT::i32,
1880	Op1: CurDAG->getTargetConstant(Val: RemainderOffset, DL: SDLoc (), VT: MVT::i32));
1881	VOffset = SDValue (VMov, `0`);
1882	SAddr = LHS;
1883	Offset = CurDAG->getTargetConstant(Val: SplitImmOffset, DL: SDLoc (), VT: MVT::i32);
1884	return true;
1885	}
1886	}
1887
1888	// We are adding a 64 bit SGPR and a constant. If constant bus limit
1889	// is 1 we would need to perform 1 or 2 extra moves for each half of
1890	// the constant and it is better to do a scalar add and then issue a
1891	// single VALU instruction to materialize zero. Otherwise it is less
1892	// instructions to perform VALU adds with immediates or inline literals.
1893	unsigned NumLiterals =
1894	!TII->isInlineConstant(Imm: APInt (`32`, Lo_32(Value: COffsetVal))) +
1895	!TII->isInlineConstant(Imm: APInt (`32`, Hi_32(Value: COffsetVal)));
1896	if (Subtarget->getConstantBusLimit(Opcode: AMDGPU::V_ADD_U32_e64) > NumLiterals)
1897	return false;
1898	}
1899	}
1900
1901	// Match the variable offset.
1902	if (Addr.getOpcode() == ISD::ADD) {
1903	LHS = Addr.getOperand(i: `0`);
1904	RHS = Addr.getOperand(i: `1`);
1905
1906	if (!LHS ->isDivergent()) {
1907	// add (i64 sgpr), (zero_extend (i32 vgpr))
1908	if (SDValue ZextRHS = matchZExtFromI32(Op: RHS)) {
1909	SAddr = LHS;
1910	VOffset = ZextRHS;
1911	}
1912	}
1913
1914	if (!SAddr && !RHS ->isDivergent()) {
1915	// add (zero_extend (i32 vgpr)), (i64 sgpr)
1916	if (SDValue ZextLHS = matchZExtFromI32(Op: LHS)) {
1917	SAddr = RHS;
1918	VOffset = ZextLHS;
1919	}
1920	}
1921
1922	if (SAddr) {
1923	Offset = CurDAG->getSignedTargetConstant(Val: ImmOffset, DL: SDLoc (), VT: MVT::i32);
1924	return true;
1925	}
1926	}
1927
1928	if (Addr ->isDivergent() \|\| Addr.getOpcode() == ISD::UNDEF \|\|
1929	isa<ConstantSDNode>(Val: Addr))
1930	return false;
1931
1932	// It's cheaper to materialize a single 32-bit zero for vaddr than the two
1933	// moves required to copy a 64-bit SGPR to VGPR.
1934	SAddr = Addr;
1935	SDNode *VMov =
1936	CurDAG->getMachineNode(Opcode: AMDGPU::V_MOV_B32_e32, dl: SDLoc (Addr), VT: MVT::i32,
1937	Op1: CurDAG->getTargetConstant(Val: `0`, DL: SDLoc (), VT: MVT::i32));
1938	VOffset = SDValue (VMov, `0`);
1939	Offset = CurDAG->getSignedTargetConstant(Val: ImmOffset, DL: SDLoc (), VT: MVT::i32);
1940	return true;
1941	}
1942
1943	static SDValue SelectSAddrFI(SelectionDAG *CurDAG, SDValue SAddr) {
1944	if (auto *FI = dyn_cast<FrameIndexSDNode>(Val&: SAddr)) {
1945	SAddr = CurDAG->getTargetFrameIndex(FI: FI->getIndex(), VT: FI->getValueType(ResNo: `0`));
1946	} else if (SAddr.getOpcode() == ISD::ADD &&
1947	isa<FrameIndexSDNode>(Val: SAddr.getOperand(i: `0`))) {
1948	// Materialize this into a scalar move for scalar address to avoid
1949	// readfirstlane.
1950	auto *FI = cast<FrameIndexSDNode>(Val: SAddr.getOperand(i: `0`));
1951	SDValue TFI = CurDAG->getTargetFrameIndex(FI: FI->getIndex(),
1952	VT: FI->getValueType(ResNo: `0`));
1953	SAddr = SDValue (CurDAG->getMachineNode(Opcode: AMDGPU::S_ADD_I32, dl: SDLoc (SAddr),
1954	VT: MVT::i32, Op1: TFI, Op2: SAddr.getOperand(i: `1`)),
1955	`0`);
1956	}
1957
1958	return SAddr;
1959	}
1960
1961	// Match (32-bit SGPR base) + sext(imm offset)
1962	bool AMDGPUDAGToDAGISel::SelectScratchSAddr(SDNode *Parent, SDValue Addr,
1963	SDValue &SAddr,
1964	SDValue &Offset) const {
1965	if (Addr ->isDivergent())
1966	return false;
1967
1968	SDLoc DL(Addr);
1969
1970	int64_t COffsetVal = `0`;
1971
1972	if (CurDAG->isBaseWithConstantOffset(Op: Addr) && isFlatScratchBaseLegal(Addr)) {
1973	COffsetVal = cast<ConstantSDNode>(Val: Addr.getOperand(i: `1`))->getSExtValue();
1974	SAddr = Addr.getOperand(i: `0`);
1975	} else {
1976	SAddr = Addr;
1977	}
1978
1979	SAddr = SelectSAddrFI(CurDAG, SAddr);
1980
1981	const SIInstrInfo *TII = Subtarget->getInstrInfo();
1982
1983	if (!TII->isLegalFLATOffset(Offset: COffsetVal, AddrSpace: AMDGPUAS::PRIVATE_ADDRESS,
1984	FlatVariant: SIInstrFlags::FlatScratch)) {
1985	int64_t SplitImmOffset, RemainderOffset;
1986	std::tie(args&: SplitImmOffset, args&: RemainderOffset) = TII->splitFlatOffset(
1987	COffsetVal, AddrSpace: AMDGPUAS::PRIVATE_ADDRESS, FlatVariant: SIInstrFlags::FlatScratch);
1988
1989	COffsetVal = SplitImmOffset;
1990
1991	SDValue AddOffset =
1992	SAddr.getOpcode() == ISD::TargetFrameIndex
1993	? getMaterializedScalarImm32(Val: Lo_32(Value: RemainderOffset), DL)
1994	: CurDAG->getSignedTargetConstant(Val: RemainderOffset, DL, VT: MVT::i32);
1995	SAddr = SDValue (CurDAG->getMachineNode(Opcode: AMDGPU::S_ADD_I32, dl: DL, VT: MVT::i32,
1996	Op1: SAddr, Op2: AddOffset),
1997	`0`);
1998	}
1999
2000	Offset = CurDAG->getSignedTargetConstant(Val: COffsetVal, DL, VT: MVT::i32);
2001
2002	return true;
2003	}
2004
2005	// Check whether the flat scratch SVS swizzle bug affects this access.
2006	bool AMDGPUDAGToDAGISel::checkFlatScratchSVSSwizzleBug(
2007	SDValue VAddr, SDValue SAddr, uint64_t ImmOffset) const {
2008	if (!Subtarget->hasFlatScratchSVSSwizzleBug())
2009	return false;
2010
2011	// The bug affects the swizzling of SVS accesses if there is any carry out
2012	// from the two low order bits (i.e. from bit 1 into bit 2) when adding
2013	// voffset to (soffset + inst_offset).
2014	KnownBits VKnown = CurDAG->computeKnownBits(Op: VAddr);
2015	KnownBits SKnown =
2016	KnownBits::add(LHS: CurDAG->computeKnownBits(Op: SAddr),
2017	RHS: KnownBits::makeConstant(C: APInt (`32`, ImmOffset,
2018	/isSigned=/true)));
2019	uint64_t VMax = VKnown.getMaxValue().getZExtValue();
2020	uint64_t SMax = SKnown.getMaxValue().getZExtValue();
2021	return (VMax & `3`) + (SMax & `3`) >= `4`;
2022	}
2023
2024	bool AMDGPUDAGToDAGISel::SelectScratchSVAddr(SDNode *N, SDValue Addr,
2025	SDValue &VAddr, SDValue &SAddr,
2026	SDValue &Offset) const {
2027	int64_t ImmOffset = `0`;
2028
2029	SDValue LHS, RHS;
2030	SDValue OrigAddr = Addr;
2031	if (isBaseWithConstantOffset64(Addr, LHS, RHS)) {
2032	int64_t COffsetVal = cast<ConstantSDNode>(Val&: RHS)->getSExtValue();
2033	const SIInstrInfo *TII = Subtarget->getInstrInfo();
2034
2035	if (TII->isLegalFLATOffset(Offset: COffsetVal, AddrSpace: AMDGPUAS::PRIVATE_ADDRESS, FlatVariant: true)) {
2036	Addr = LHS;
2037	ImmOffset = COffsetVal;
2038	} else if (!LHS ->isDivergent() && COffsetVal > `0`) {
2039	SDLoc SL(N);
2040	// saddr + large_offset -> saddr + (vaddr = large_offset & ~MaxOffset) +
2041	// (large_offset & MaxOffset);
2042	int64_t SplitImmOffset, RemainderOffset;
2043	std::tie(args&: SplitImmOffset, args&: RemainderOffset)
2044	= TII->splitFlatOffset(COffsetVal, AddrSpace: AMDGPUAS::PRIVATE_ADDRESS, FlatVariant: true);
2045
2046	if (isUInt<`32`>(x: RemainderOffset)) {
2047	SDNode *VMov = CurDAG->getMachineNode(
2048	Opcode: AMDGPU::V_MOV_B32_e32, dl: SL, VT: MVT::i32,
2049	Op1: CurDAG->getTargetConstant(Val: RemainderOffset, DL: SDLoc (), VT: MVT::i32));
2050	VAddr = SDValue (VMov, `0`);
2051	SAddr = LHS;
2052	if (!isFlatScratchBaseLegal(Addr))
2053	return false;
2054	if (checkFlatScratchSVSSwizzleBug(VAddr, SAddr, ImmOffset: SplitImmOffset))
2055	return false;
2056	Offset = CurDAG->getTargetConstant(Val: SplitImmOffset, DL: SDLoc (), VT: MVT::i32);
2057	return true;
2058	}
2059	}
2060	}
2061
2062	if (Addr.getOpcode() != ISD::ADD)
2063	return false;
2064
2065	LHS = Addr.getOperand(i: `0`);
2066	RHS = Addr.getOperand(i: `1`);
2067
2068	if (!LHS ->isDivergent() && RHS ->isDivergent()) {
2069	SAddr = LHS;
2070	VAddr = RHS;
2071	} else if (!RHS ->isDivergent() && LHS ->isDivergent()) {
2072	SAddr = RHS;
2073	VAddr = LHS;
2074	} else {
2075	return false;
2076	}
2077
2078	if (OrigAddr != Addr) {
2079	if (!isFlatScratchBaseLegalSVImm(Addr: OrigAddr))
2080	return false;
2081	} else {
2082	if (!isFlatScratchBaseLegalSV(Addr: OrigAddr))
2083	return false;
2084	}
2085
2086	if (checkFlatScratchSVSSwizzleBug(VAddr, SAddr, ImmOffset))
2087	return false;
2088	SAddr = SelectSAddrFI(CurDAG, SAddr);
2089	Offset = CurDAG->getSignedTargetConstant(Val: ImmOffset, DL: SDLoc (), VT: MVT::i32);
2090	return true;
2091	}
2092
2093	// For unbuffered smem loads, it is illegal for the Immediate Offset to be
2094	// negative if the resulting (Offset + (M0 or SOffset or zero) is negative.
2095	// Handle the case where the Immediate Offset + SOffset is negative.
2096	bool AMDGPUDAGToDAGISel::isSOffsetLegalWithImmOffset(SDValue *SOffset,
2097	bool Imm32Only,
2098	bool IsBuffer,
2099	int64_t ImmOffset) const {
2100	if (!IsBuffer && !Imm32Only && ImmOffset < `0` &&
2101	AMDGPU::hasSMRDSignedImmOffset(ST: *Subtarget)) {
2102	KnownBits SKnown = CurDAG->computeKnownBits(Op: *SOffset);
2103	if (ImmOffset + SKnown.getMinValue().getSExtValue() < `0`)
2104	return false;
2105	}
2106
2107	return true;
2108	}
2109
2110	// Match an immediate (if Offset is not null) or an SGPR (if SOffset is
2111	// not null) offset. If Imm32Only is true, match only 32-bit immediate
2112	// offsets available on CI.
2113	bool AMDGPUDAGToDAGISel::SelectSMRDOffset(SDValue ByteOffsetNode,
2114	SDValue SOffset, SDValue Offset,
2115	bool Imm32Only, bool IsBuffer,
2116	bool HasSOffset,
2117	int64_t ImmOffset) const {
2118	assert((!SOffset \|\| !Offset) &&
2119	"Cannot match both soffset and offset at the same time!");
2120
2121	ConstantSDNode *C = dyn_cast<ConstantSDNode>(Val&: ByteOffsetNode);
2122	if (!C) {
2123	if (!SOffset)
2124	return false;
2125
2126	if (ByteOffsetNode.getValueType().isScalarInteger() &&
2127	ByteOffsetNode.getValueType().getSizeInBits() == `32`) {
2128	*SOffset = ByteOffsetNode;
2129	return isSOffsetLegalWithImmOffset(SOffset, Imm32Only, IsBuffer,
2130	ImmOffset);
2131	}
2132	if (ByteOffsetNode.getOpcode() == ISD::ZERO_EXTEND) {
2133	if (ByteOffsetNode.getOperand(i: `0`).getValueType().getSizeInBits() == `32`) {
2134	*SOffset = ByteOffsetNode.getOperand(i: `0`);
2135	return isSOffsetLegalWithImmOffset(SOffset, Imm32Only, IsBuffer,
2136	ImmOffset);
2137	}
2138	}
2139	return false;
2140	}
2141
2142	SDLoc SL(ByteOffsetNode);
2143
2144	// GFX9 and GFX10 have signed byte immediate offsets. The immediate
2145	// offset for S_BUFFER instructions is unsigned.
2146	int64_t ByteOffset = IsBuffer ? C->getZExtValue() : C->getSExtValue();
2147	std::optional<int64_t> EncodedOffset = AMDGPU::getSMRDEncodedOffset(
2148	ST: *Subtarget, ByteOffset, IsBuffer, HasSOffset);
2149	if (EncodedOffset && Offset && !Imm32Only) {
2150	Offset = CurDAG->getSignedTargetConstant(Val: EncodedOffset, DL: SL, VT: MVT::i32);
2151	return true;
2152	}
2153
2154	// SGPR and literal offsets are unsigned.
2155	if (ByteOffset < `0`)
2156	return false;
2157
2158	EncodedOffset = AMDGPU::getSMRDEncodedLiteralOffset32(ST: *Subtarget, ByteOffset);
2159	if (EncodedOffset && Offset && Imm32Only) {
2160	Offset = CurDAG->getTargetConstant(Val: EncodedOffset, DL: SL, VT: MVT::i32);
2161	return true;
2162	}
2163
2164	if (!isUInt<`32`>(x: ByteOffset) && !isInt<`32`>(x: ByteOffset))
2165	return false;
2166
2167	if (SOffset) {
2168	SDValue C32Bit = CurDAG->getTargetConstant(Val: ByteOffset, DL: SL, VT: MVT::i32);
2169	*SOffset = SDValue (
2170	CurDAG->getMachineNode(Opcode: AMDGPU::S_MOV_B32, dl: SL, VT: MVT::i32, Op1: C32Bit), `0`);
2171	return true;
2172	}
2173
2174	return false;
2175	}
2176
2177	SDValue AMDGPUDAGToDAGISel::Expand32BitAddress(SDValue Addr) const {
2178	if (Addr.getValueType() != MVT::i32)
2179	return Addr;
2180
2181	// Zero-extend a 32-bit address.
2182	SDLoc SL(Addr);
2183
2184	const MachineFunction &MF = CurDAG->getMachineFunction();
2185	const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
2186	unsigned AddrHiVal = Info->get32BitAddressHighBits();
2187	SDValue AddrHi = CurDAG->getTargetConstant(Val: AddrHiVal, DL: SL, VT: MVT::i32);
2188
2189	const SDValue Ops[] = {
2190	CurDAG->getTargetConstant(Val: AMDGPU::SReg_64_XEXECRegClassID, DL: SL, VT: MVT::i32),
2191	Addr,
2192	CurDAG->getTargetConstant(Val: AMDGPU::sub0, DL: SL, VT: MVT::i32),
2193	SDValue (CurDAG->getMachineNode(Opcode: AMDGPU::S_MOV_B32, dl: SL, VT: MVT::i32, Op1: AddrHi),
2194	`0`),
2195	CurDAG->getTargetConstant(Val: AMDGPU::sub1, DL: SL, VT: MVT::i32),
2196	};
2197
2198	return SDValue (CurDAG->getMachineNode(Opcode: AMDGPU::REG_SEQUENCE, dl: SL, VT: MVT::i64,
2199	Ops), `0`);
2200	}
2201
2202	// Match a base and an immediate (if Offset is not null) or an SGPR (if
2203	// SOffset is not null) or an immediate+SGPR offset. If Imm32Only is
2204	// true, match only 32-bit immediate offsets available on CI.
2205	bool AMDGPUDAGToDAGISel::SelectSMRDBaseOffset(SDValue Addr, SDValue &SBase,
2206	SDValue SOffset, SDValue Offset,
2207	bool Imm32Only, bool IsBuffer,
2208	bool HasSOffset,
2209	int64_t ImmOffset) const {
2210	if (SOffset && Offset) {
2211	assert(!Imm32Only && !IsBuffer);
2212	SDValue B;
2213
2214	if (!SelectSMRDBaseOffset(Addr, SBase&: B, SOffset: nullptr, Offset, Imm32Only: false, IsBuffer: false, HasSOffset: true))
2215	return false;
2216
2217	int64_t ImmOff = `0`;
2218	if (ConstantSDNode C = dyn_cast<ConstantSDNode>(Val&: Offset))
2219	ImmOff = C->getSExtValue();
2220
2221	return SelectSMRDBaseOffset(Addr: B, SBase, SOffset, Offset: nullptr, Imm32Only: false, IsBuffer: false, HasSOffset: true,
2222	ImmOffset: ImmOff);
2223	}
2224
2225	// A 32-bit (address + offset) should not cause unsigned 32-bit integer
2226	// wraparound, because s_load instructions perform the addition in 64 bits.
2227	if (Addr.getValueType() == MVT::i32 && Addr.getOpcode() == ISD::ADD &&
2228	!Addr ->getFlags().hasNoUnsignedWrap())
2229	return false;
2230
2231	SDValue N0, N1;
2232	// Extract the base and offset if possible.
2233	if (CurDAG->isBaseWithConstantOffset(Op: Addr) \|\| Addr.getOpcode() == ISD::ADD) {
2234	N0 = Addr.getOperand(i: `0`);
2235	N1 = Addr.getOperand(i: `1`);
2236	} else if (getBaseWithOffsetUsingSplitOR(DAG&: *CurDAG, Addr, N0, N1)) {
2237	assert(N0 && N1 && isa<ConstantSDNode>(N1));
2238	}
2239	if (!N0 \|\| !N1)
2240	return false;
2241
2242	if (SelectSMRDOffset(ByteOffsetNode: N1, SOffset, Offset, Imm32Only, IsBuffer, HasSOffset,
2243	ImmOffset)) {
2244	SBase = N0;
2245	return true;
2246	}
2247	if (SelectSMRDOffset(ByteOffsetNode: N0, SOffset, Offset, Imm32Only, IsBuffer, HasSOffset,
2248	ImmOffset)) {
2249	SBase = N1;
2250	return true;
2251	}
2252	return false;
2253	}
2254
2255	bool AMDGPUDAGToDAGISel::SelectSMRD(SDValue Addr, SDValue &SBase,
2256	SDValue SOffset, SDValue Offset,
2257	bool Imm32Only) const {
2258	if (SelectSMRDBaseOffset(Addr, SBase, SOffset, Offset, Imm32Only)) {
2259	SBase = Expand32BitAddress(Addr: SBase);
2260	return true;
2261	}
2262
2263	if (Addr.getValueType() == MVT::i32 && Offset && !SOffset) {
2264	SBase = Expand32BitAddress(Addr);
2265	*Offset = CurDAG->getTargetConstant(Val: `0`, DL: SDLoc (Addr), VT: MVT::i32);
2266	return true;
2267	}
2268
2269	return false;
2270	}
2271
2272	bool AMDGPUDAGToDAGISel::SelectSMRDImm(SDValue Addr, SDValue &SBase,
2273	SDValue &Offset) const {
2274	return SelectSMRD(Addr, SBase, / SOffset / nullptr, Offset: &Offset);
2275	}
2276
2277	bool AMDGPUDAGToDAGISel::SelectSMRDImm32(SDValue Addr, SDValue &SBase,
2278	SDValue &Offset) const {
2279	assert(Subtarget->getGeneration() == AMDGPUSubtarget::SEA_ISLANDS);
2280	return SelectSMRD(Addr, SBase, / SOffset / nullptr, Offset: &Offset,
2281	/ Imm32Only / true);
2282	}
2283
2284	bool AMDGPUDAGToDAGISel::SelectSMRDSgpr(SDValue Addr, SDValue &SBase,
2285	SDValue &SOffset) const {
2286	return SelectSMRD(Addr, SBase, SOffset: &SOffset, / Offset / nullptr);
2287	}
2288
2289	bool AMDGPUDAGToDAGISel::SelectSMRDSgprImm(SDValue Addr, SDValue &SBase,
2290	SDValue &SOffset,
2291	SDValue &Offset) const {
2292	return SelectSMRD(Addr, SBase, SOffset: &SOffset, Offset: &Offset);
2293	}
2294
2295	bool AMDGPUDAGToDAGISel::SelectSMRDBufferImm(SDValue N, SDValue &Offset) const {
2296	return SelectSMRDOffset(ByteOffsetNode: N, / SOffset / nullptr, Offset: &Offset,
2297	/ Imm32Only / false, / IsBuffer / true);
2298	}
2299
2300	bool AMDGPUDAGToDAGISel::SelectSMRDBufferImm32(SDValue N,
2301	SDValue &Offset) const {
2302	assert(Subtarget->getGeneration() == AMDGPUSubtarget::SEA_ISLANDS);
2303	return SelectSMRDOffset(ByteOffsetNode: N, / SOffset / nullptr, Offset: &Offset,
2304	/ Imm32Only / true, / IsBuffer / true);
2305	}
2306
2307	bool AMDGPUDAGToDAGISel::SelectSMRDBufferSgprImm(SDValue N, SDValue &SOffset,
2308	SDValue &Offset) const {
2309	// Match the (soffset + offset) pair as a 32-bit register base and
2310	// an immediate offset.
2311	return N.getValueType() == MVT::i32 &&
2312	SelectSMRDBaseOffset(Addr: N, / SBase / SOffset, / SOffset/ nullptr,
2313	Offset: &Offset, / Imm32Only / false,
2314	/ IsBuffer / true);
2315	}
2316
2317	bool AMDGPUDAGToDAGISel::SelectMOVRELOffset(SDValue Index,
2318	SDValue &Base,
2319	SDValue &Offset) const {
2320	SDLoc DL(Index);
2321
2322	if (CurDAG->isBaseWithConstantOffset(Op: Index)) {
2323	SDValue N0 = Index.getOperand(i: `0`);
2324	SDValue N1 = Index.getOperand(i: `1`);
2325	ConstantSDNode *C1 = cast<ConstantSDNode>(Val&: N1);
2326
2327	// (add n0, c0)
2328	// Don't peel off the offset (c0) if doing so could possibly lead
2329	// the base (n0) to be negative.
2330	// (or n0, \|c0\|) can never change a sign given isBaseWithConstantOffset.
2331	if (C1->getSExtValue() <= `0` \|\| CurDAG->SignBitIsZero(Op: N0) \|\|
2332	(Index ->getOpcode() == ISD::OR && C1->getSExtValue() >= `0`)) {
2333	Base = N0;
2334	Offset = CurDAG->getTargetConstant(Val: C1->getZExtValue(), DL, VT: MVT::i32);
2335	return true;
2336	}
2337	}
2338
2339	if (isa<ConstantSDNode>(Val: Index))
2340	return false;
2341
2342	Base = Index;
2343	Offset = CurDAG->getTargetConstant(Val: `0`, DL, VT: MVT::i32);
2344	return true;
2345	}
2346
2347	SDNode AMDGPUDAGToDAGISel::getBFE32(bool* IsSigned, const SDLoc &DL,
2348	SDValue Val, uint32_t Offset,
2349	uint32_t Width) {
2350	if (Val ->isDivergent()) {
2351	unsigned Opcode = IsSigned ? AMDGPU::V_BFE_I32_e64 : AMDGPU::V_BFE_U32_e64;
2352	SDValue Off = CurDAG->getTargetConstant(Val: Offset, DL, VT: MVT::i32);
2353	SDValue W = CurDAG->getTargetConstant(Val: Width, DL, VT: MVT::i32);
2354
2355	return CurDAG->getMachineNode(Opcode, dl: DL, VT: MVT::i32, Op1: Val, Op2: Off, Op3: W);
2356	}
2357	unsigned Opcode = IsSigned ? AMDGPU::S_BFE_I32 : AMDGPU::S_BFE_U32;
2358	// Transformation function, pack the offset and width of a BFE into
2359	// the format expected by the S_BFE_I32 / S_BFE_U32. In the second
2360	// source, bits [5:0] contain the offset and bits [22:16] the width.
2361	uint32_t PackedVal = Offset \| (Width << `16`);
2362	SDValue PackedConst = CurDAG->getTargetConstant(Val: PackedVal, DL, VT: MVT::i32);
2363
2364	return CurDAG->getMachineNode(Opcode, dl: DL, VT: MVT::i32, Op1: Val, Op2: PackedConst);
2365	}
2366
2367	void AMDGPUDAGToDAGISel::SelectS_BFEFromShifts(SDNode *N) {
2368	// "(a << b) srl c)" ---> "BFE_U32 a, (c-b), (32-c)
2369	// "(a << b) sra c)" ---> "BFE_I32 a, (c-b), (32-c)
2370	// Predicate: 0 < b <= c < 32
2371
2372	const SDValue &Shl = N->getOperand(Num: `0`);
2373	ConstantSDNode *B = dyn_cast<ConstantSDNode>(Val: Shl ->getOperand(Num: `1`));
2374	ConstantSDNode *C = dyn_cast<ConstantSDNode>(Val: N->getOperand(Num: `1`));
2375
2376	if (B && C) {
2377	uint32_t BVal = B->getZExtValue();
2378	uint32_t CVal = C->getZExtValue();
2379
2380	if (`0` < BVal && BVal <= CVal && CVal < `32`) {
2381	bool Signed = N->getOpcode() == ISD::SRA;
2382	ReplaceNode(F: N, T: getBFE32(IsSigned: Signed, DL: SDLoc (N), Val: Shl.getOperand(i: `0`), Offset: CVal - BVal,
2383	Width: `32` - CVal));
2384	return;
2385	}
2386	}
2387	SelectCode(N);
2388	}
2389
2390	void AMDGPUDAGToDAGISel::SelectS_BFE(SDNode *N) {
2391	switch (N->getOpcode()) {
2392	case ISD::AND:
2393	if (N->getOperand(Num: `0`).getOpcode() == ISD::SRL) {
2394	// "(a srl b) & mask" ---> "BFE_U32 a, b, popcount(mask)"
2395	// Predicate: isMask(mask)
2396	const SDValue &Srl = N->getOperand(Num: `0`);
2397	ConstantSDNode *Shift = dyn_cast<ConstantSDNode>(Val: Srl.getOperand(i: `1`));
2398	ConstantSDNode *Mask = dyn_cast<ConstantSDNode>(Val: N->getOperand(Num: `1`));
2399
2400	if (Shift && Mask) {
2401	uint32_t ShiftVal = Shift->getZExtValue();
2402	uint32_t MaskVal = Mask->getZExtValue();
2403
2404	if (isMask_32(Value: MaskVal)) {
2405	uint32_t WidthVal = llvm::popcount(Value: MaskVal);
2406	ReplaceNode(F: N, T: getBFE32(IsSigned: false, DL: SDLoc (N), Val: Srl.getOperand(i: `0`), Offset: ShiftVal,
2407	Width: WidthVal));
2408	return;
2409	}
2410	}
2411	}
2412	break;
2413	case ISD::SRL:
2414	if (N->getOperand(Num: `0`).getOpcode() == ISD::AND) {
2415	// "(a & mask) srl b)" ---> "BFE_U32 a, b, popcount(mask >> b)"
2416	// Predicate: isMask(mask >> b)
2417	const SDValue &And = N->getOperand(Num: `0`);
2418	ConstantSDNode *Shift = dyn_cast<ConstantSDNode>(Val: N->getOperand(Num: `1`));
2419	ConstantSDNode *Mask = dyn_cast<ConstantSDNode>(Val: And ->getOperand(Num: `1`));
2420
2421	if (Shift && Mask) {
2422	uint32_t ShiftVal = Shift->getZExtValue();
2423	uint32_t MaskVal = Mask->getZExtValue() >> ShiftVal;
2424
2425	if (isMask_32(Value: MaskVal)) {
2426	uint32_t WidthVal = llvm::popcount(Value: MaskVal);
2427	ReplaceNode(F: N, T: getBFE32(IsSigned: false, DL: SDLoc (N), Val: And.getOperand(i: `0`), Offset: ShiftVal,
2428	Width: WidthVal));
2429	return;
2430	}
2431	}
2432	} else if (N->getOperand(Num: `0`).getOpcode() == ISD::SHL) {
2433	SelectS_BFEFromShifts(N);
2434	return;
2435	}
2436	break;
2437	case ISD::SRA:
2438	if (N->getOperand(Num: `0`).getOpcode() == ISD::SHL) {
2439	SelectS_BFEFromShifts(N);
2440	return;
2441	}
2442	break;
2443
2444	case ISD::SIGN_EXTEND_INREG: {
2445	// sext_inreg (srl x, 16), i8 -> bfe_i32 x, 16, 8
2446	SDValue Src = N->getOperand(Num: `0`);
2447	if (Src.getOpcode() != ISD::SRL)
2448	break;
2449
2450	const ConstantSDNode *Amt = dyn_cast<ConstantSDNode>(Val: Src.getOperand(i: `1`));
2451	if (!Amt)
2452	break;
2453
2454	unsigned Width = cast<VTSDNode>(Val: N->getOperand(Num: `1`))->getVT().getSizeInBits();
2455	ReplaceNode(F: N, T: getBFE32(IsSigned: true, DL: SDLoc (N), Val: Src.getOperand(i: `0`),
2456	Offset: Amt->getZExtValue(), Width));
2457	return;
2458	}
2459	}
2460
2461	SelectCode(N);
2462	}
2463
2464	bool AMDGPUDAGToDAGISel::isCBranchSCC(const SDNode N) const* {
2465	assert(N->getOpcode() == ISD::BRCOND);
2466	if (!N->hasOneUse())
2467	return false;
2468
2469	SDValue Cond = N->getOperand(Num: `1`);
2470	if (Cond.getOpcode() == ISD::CopyToReg)
2471	Cond = Cond.getOperand(i: `2`);
2472
2473	if (Cond.getOpcode() != ISD::SETCC \|\| !Cond.hasOneUse())
2474	return false;
2475
2476	MVT VT = Cond.getOperand(i: `0`).getSimpleValueType();
2477	if (VT == MVT::i32)
2478	return true;
2479
2480	if (VT == MVT::i64) {
2481	ISD::CondCode CC = cast<CondCodeSDNode>(Val: Cond.getOperand(i: `2`))->get();
2482	return (CC == ISD::SETEQ \|\| CC == ISD::SETNE) &&
2483	Subtarget->hasScalarCompareEq64();
2484	}
2485
2486	if ((VT == MVT::f16 \|\| VT == MVT::f32) && Subtarget->hasSALUFloatInsts())
2487	return true;
2488
2489	return false;
2490	}
2491
2492	static SDValue combineBallotPattern(SDValue VCMP, bool &Negate) {
2493	assert(VCMP->getOpcode() == AMDGPUISD::SETCC);
2494	// Special case for amdgcn.ballot:
2495	// %Cond = i1 (and/or combination of i1 ISD::SETCCs)
2496	// %VCMP = i(WaveSize) AMDGPUISD::SETCC (ext %Cond), 0, setne/seteq
2497	// =>
2498	// Use i1 %Cond value instead of i(WaveSize) %VCMP.
2499	// This is possible because divergent ISD::SETCC is selected as V_CMP and
2500	// Cond becomes a i(WaveSize) full mask value.
2501	// Note that ballot doesn't use SETEQ condition but its easy to support it
2502	// here for completeness, so in this case Negate is set true on return.
2503	auto VCMP_CC = cast<CondCodeSDNode>(Val: VCMP.getOperand(i: `2`))->get();
2504	if ((VCMP_CC == ISD::SETEQ \|\| VCMP_CC == ISD::SETNE) &&
2505	isNullConstant(V: VCMP.getOperand(i: `1`))) {
2506
2507	auto Cond = VCMP.getOperand(i: `0`);
2508	if (ISD::isExtOpcode(Opcode: Cond ->getOpcode())) // Skip extension.
2509	Cond = Cond.getOperand(i: `0`);
2510
2511	if (isBoolSGPR(V: Cond)) {
2512	Negate = VCMP_CC == ISD::SETEQ;
2513	return Cond;
2514	}
2515	}
2516	return SDValue ();
2517	}
2518
2519	void AMDGPUDAGToDAGISel::SelectBRCOND(SDNode *N) {
2520	SDValue Cond = N->getOperand(Num: `1`);
2521
2522	if (Cond.isUndef()) {
2523	CurDAG->SelectNodeTo(N, MachineOpc: AMDGPU::SI_BR_UNDEF, VT: MVT::Other,
2524	Op1: N->getOperand(Num: `2`), Op2: N->getOperand(Num: `0`));
2525	return;
2526	}
2527
2528	const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
2529
2530	bool UseSCCBr = isCBranchSCC(N) && isUniformBr(N);
2531	bool AndExec = !UseSCCBr;
2532	bool Negate = false;
2533
2534	if (Cond.getOpcode() == ISD::SETCC &&
2535	Cond ->getOperand(Num: `0`)->getOpcode() == AMDGPUISD::SETCC) {
2536	SDValue VCMP = Cond ->getOperand(Num: `0`);
2537	auto CC = cast<CondCodeSDNode>(Val: Cond ->getOperand(Num: `2`))->get();
2538	if ((CC == ISD::SETEQ \|\| CC == ISD::SETNE) &&
2539	isNullConstant(V: Cond ->getOperand(Num: `1`)) &&
2540	// We may encounter ballot.i64 in wave32 mode on -O0.
2541	VCMP.getValueType().getSizeInBits() == Subtarget->getWavefrontSize()) {
2542	// %VCMP = i(WaveSize) AMDGPUISD::SETCC ...
2543	// %C = i1 ISD::SETCC %VCMP, 0, setne/seteq
2544	// BRCOND i1 %C, %BB
2545	// =>
2546	// %VCMP = i(WaveSize) AMDGPUISD::SETCC ...
2547	// VCC = COPY i(WaveSize) %VCMP
2548	// S_CBRANCH_VCCNZ/VCCZ %BB
2549	Negate = CC == ISD::SETEQ;
2550	bool NegatedBallot = false;
2551	if (auto BallotCond = combineBallotPattern(VCMP, Negate&: NegatedBallot)) {
2552	Cond = BallotCond;
2553	UseSCCBr = !BallotCond ->isDivergent();
2554	Negate = Negate ^ NegatedBallot;
2555	} else {
2556	// TODO: don't use SCC here assuming that AMDGPUISD::SETCC is always
2557	// selected as V_CMP, but this may change for uniform condition.
2558	Cond = VCMP;
2559	UseSCCBr = false;
2560	}
2561	}
2562	// Cond is either V_CMP resulted from AMDGPUISD::SETCC or a combination of
2563	// V_CMPs resulted from ballot or ballot has uniform condition and SCC is
2564	// used.
2565	AndExec = false;
2566	}
2567
2568	unsigned BrOp =
2569	UseSCCBr ? (Negate ? AMDGPU::S_CBRANCH_SCC0 : AMDGPU::S_CBRANCH_SCC1)
2570	: (Negate ? AMDGPU::S_CBRANCH_VCCZ : AMDGPU::S_CBRANCH_VCCNZ);
2571	Register CondReg = UseSCCBr ? AMDGPU::SCC : TRI->getVCC();
2572	SDLoc SL(N);
2573
2574	if (AndExec) {
2575	// This is the case that we are selecting to S_CBRANCH_VCCNZ. We have not
2576	// analyzed what generates the vcc value, so we do not know whether vcc
2577	// bits for disabled lanes are 0. Thus we need to mask out bits for
2578	// disabled lanes.
2579	//
2580	// For the case that we select S_CBRANCH_SCC1 and it gets
2581	// changed to S_CBRANCH_VCCNZ in SIFixSGPRCopies, SIFixSGPRCopies calls
2582	// SIInstrInfo::moveToVALU which inserts the S_AND).
2583	//
2584	// We could add an analysis of what generates the vcc value here and omit
2585	// the S_AND when is unnecessary. But it would be better to add a separate
2586	// pass after SIFixSGPRCopies to do the unnecessary S_AND removal, so it
2587	// catches both cases.
2588	Cond = SDValue (
2589	CurDAG->getMachineNode(
2590	Opcode: Subtarget->isWave32() ? AMDGPU::S_AND_B32 : AMDGPU::S_AND_B64, dl: SL,
2591	VT: MVT::i1,
2592	Op1: CurDAG->getRegister(Reg: Subtarget->isWave32() ? AMDGPU::EXEC_LO
2593	: AMDGPU::EXEC,
2594	VT: MVT::i1),
2595	Op2: Cond),
2596	`0`);
2597	}
2598
2599	SDValue VCC = CurDAG->getCopyToReg(Chain: N->getOperand(Num: `0`), dl: SL, Reg: CondReg, N: Cond);
2600	CurDAG->SelectNodeTo(N, MachineOpc: BrOp, VT: MVT::Other,
2601	Op1: N->getOperand(Num: `2`), // Basic Block
2602	Op2: VCC.getValue(R: `0`));
2603	}
2604
2605	void AMDGPUDAGToDAGISel::SelectFP_EXTEND(SDNode *N) {
2606	if (Subtarget->hasSALUFloatInsts() && N->getValueType(ResNo: `0`) == MVT::f32 &&
2607	!N->isDivergent()) {
2608	SDValue Src = N->getOperand(Num: `0`);
2609	if (Src.getValueType() == MVT::f16) {
2610	if (isExtractHiElt(In: Src, Out&: Src)) {
2611	CurDAG->SelectNodeTo(N, MachineOpc: AMDGPU::S_CVT_HI_F32_F16, VTs: N->getVTList(),
2612	Ops: {Src});
2613	return;
2614	}
2615	}
2616	}
2617
2618	SelectCode(N);
2619	}
2620
2621	void AMDGPUDAGToDAGISel::SelectDSAppendConsume(SDNode N, unsigned* IntrID) {
2622	// The address is assumed to be uniform, so if it ends up in a VGPR, it will
2623	// be copied to an SGPR with readfirstlane.
2624	unsigned Opc = IntrID == Intrinsic::amdgcn_ds_append ?
2625	AMDGPU::DS_APPEND : AMDGPU::DS_CONSUME;
2626
2627	SDValue Chain = N->getOperand(Num: `0`);
2628	SDValue Ptr = N->getOperand(Num: `2`);
2629	MemIntrinsicSDNode *M = cast<MemIntrinsicSDNode>(Val: N);
2630	MachineMemOperand *MMO = M->getMemOperand();
2631	bool IsGDS = M->getAddressSpace() == AMDGPUAS::REGION_ADDRESS;
2632
2633	SDValue Offset;
2634	if (CurDAG->isBaseWithConstantOffset(Op: Ptr)) {
2635	SDValue PtrBase = Ptr.getOperand(i: `0`);
2636	SDValue PtrOffset = Ptr.getOperand(i: `1`);
2637
2638	const APInt &OffsetVal = PtrOffset ->getAsAPIntVal();
2639	if (isDSOffsetLegal(Base: PtrBase, Offset: OffsetVal.getZExtValue())) {
2640	N = glueCopyToM0(N, Val: PtrBase);
2641	Offset = CurDAG->getTargetConstant(Val: OffsetVal, DL: SDLoc (), VT: MVT::i32);
2642	}
2643	}
2644
2645	if (!Offset) {
2646	N = glueCopyToM0(N, Val: Ptr);
2647	Offset = CurDAG->getTargetConstant(Val: `0`, DL: SDLoc (), VT: MVT::i32);
2648	}
2649
2650	SDValue Ops[] = {
2651	Offset,
2652	CurDAG->getTargetConstant(Val: IsGDS, DL: SDLoc (), VT: MVT::i32),
2653	Chain,
2654	N->getOperand(Num: N->getNumOperands() - `1`) // New glue
2655	};
2656
2657	SDNode *Selected = CurDAG->SelectNodeTo(N, MachineOpc: Opc, VTs: N->getVTList(), Ops);
2658	CurDAG->setNodeMemRefs(N: cast<MachineSDNode>(Val: Selected), NewMemRefs: {MMO});
2659	}
2660
2661	// We need to handle this here because tablegen doesn't support matching
2662	// instructions with multiple outputs.
2663	void AMDGPUDAGToDAGISel::SelectDSBvhStackIntrinsic(SDNode N, unsigned* IntrID) {
2664	unsigned Opc;
2665	switch (IntrID) {
2666	case Intrinsic::amdgcn_ds_bvh_stack_rtn:
2667	case Intrinsic::amdgcn_ds_bvh_stack_push4_pop1_rtn:
2668	Opc = AMDGPU::DS_BVH_STACK_RTN_B32;
2669	break;
2670	case Intrinsic::amdgcn_ds_bvh_stack_push8_pop1_rtn:
2671	Opc = AMDGPU::DS_BVH_STACK_PUSH8_POP1_RTN_B32;
2672	break;
2673	case Intrinsic::amdgcn_ds_bvh_stack_push8_pop2_rtn:
2674	Opc = AMDGPU::DS_BVH_STACK_PUSH8_POP2_RTN_B64;
2675	break;
2676	}
2677	SDValue Ops[] = {N->getOperand(Num: `2`), N->getOperand(Num: `3`), N->getOperand(Num: `4`),
2678	N->getOperand(Num: `5`), N->getOperand(Num: `0`)};
2679
2680	MemIntrinsicSDNode *M = cast<MemIntrinsicSDNode>(Val: N);
2681	MachineMemOperand *MMO = M->getMemOperand();
2682	SDNode *Selected = CurDAG->SelectNodeTo(N, MachineOpc: Opc, VTs: N->getVTList(), Ops);
2683	CurDAG->setNodeMemRefs(N: cast<MachineSDNode>(Val: Selected), NewMemRefs: {MMO});
2684	}
2685
2686	static unsigned gwsIntrinToOpcode(unsigned IntrID) {
2687	switch (IntrID) {
2688	case Intrinsic::amdgcn_ds_gws_init:
2689	return AMDGPU::DS_GWS_INIT;
2690	case Intrinsic::amdgcn_ds_gws_barrier:
2691	return AMDGPU::DS_GWS_BARRIER;
2692	case Intrinsic::amdgcn_ds_gws_sema_v:
2693	return AMDGPU::DS_GWS_SEMA_V;
2694	case Intrinsic::amdgcn_ds_gws_sema_br:
2695	return AMDGPU::DS_GWS_SEMA_BR;
2696	case Intrinsic::amdgcn_ds_gws_sema_p:
2697	return AMDGPU::DS_GWS_SEMA_P;
2698	case Intrinsic::amdgcn_ds_gws_sema_release_all:
2699	return AMDGPU::DS_GWS_SEMA_RELEASE_ALL;
2700	default:
2701	llvm_unreachable("not a gws intrinsic");
2702	}
2703	}
2704
2705	void AMDGPUDAGToDAGISel::SelectDS_GWS(SDNode N, unsigned* IntrID) {
2706	if (!Subtarget->hasGWS() \|\|
2707	(IntrID == Intrinsic::amdgcn_ds_gws_sema_release_all &&
2708	!Subtarget->hasGWSSemaReleaseAll())) {
2709	// Let this error.
2710	SelectCode(N);
2711	return;
2712	}
2713
2714	// Chain, intrinsic ID, vsrc, offset
2715	const bool HasVSrc = N->getNumOperands() == `4`;
2716	assert(HasVSrc \|\| N->getNumOperands() == `3`);
2717
2718	SDLoc SL(N);
2719	SDValue BaseOffset = N->getOperand(Num: HasVSrc ? `3` : `2`);
2720	int ImmOffset = `0`;
2721	MemIntrinsicSDNode *M = cast<MemIntrinsicSDNode>(Val: N);
2722	MachineMemOperand *MMO = M->getMemOperand();
2723
2724	// Don't worry if the offset ends up in a VGPR. Only one lane will have
2725	// effect, so SIFixSGPRCopies will validly insert readfirstlane.
2726
2727	// The resource id offset is computed as (<isa opaque base> + M0[21:16] +
2728	// offset field) % 64. Some versions of the programming guide omit the m0
2729	// part, or claim it's from offset 0.
2730	if (ConstantSDNode *ConstOffset = dyn_cast<ConstantSDNode>(Val&: BaseOffset)) {
2731	// If we have a constant offset, try to use the 0 in m0 as the base.
2732	// TODO: Look into changing the default m0 initialization value. If the
2733	// default -1 only set the low 16-bits, we could leave it as-is and add 1 to
2734	// the immediate offset.
2735	glueCopyToM0(N, Val: CurDAG->getTargetConstant(Val: `0`, DL: SL, VT: MVT::i32));
2736	ImmOffset = ConstOffset->getZExtValue();
2737	} else {
2738	if (CurDAG->isBaseWithConstantOffset(Op: BaseOffset)) {
2739	ImmOffset = BaseOffset.getConstantOperandVal(i: `1`);
2740	BaseOffset = BaseOffset.getOperand(i: `0`);
2741	}
2742
2743	// Prefer to do the shift in an SGPR since it should be possible to use m0
2744	// as the result directly. If it's already an SGPR, it will be eliminated
2745	// later.
2746	SDNode *SGPROffset
2747	= CurDAG->getMachineNode(Opcode: AMDGPU::V_READFIRSTLANE_B32, dl: SL, VT: MVT::i32,
2748	Op1: BaseOffset);
2749	// Shift to offset in m0
2750	SDNode *M0Base
2751	= CurDAG->getMachineNode(Opcode: AMDGPU::S_LSHL_B32, dl: SL, VT: MVT::i32,
2752	Op1: SDValue (SGPROffset, `0`),
2753	Op2: CurDAG->getTargetConstant(Val: `16`, DL: SL, VT: MVT::i32));
2754	glueCopyToM0(N, Val: SDValue (M0Base, `0`));
2755	}
2756
2757	SDValue Chain = N->getOperand(Num: `0`);
2758	SDValue OffsetField = CurDAG->getTargetConstant(Val: ImmOffset, DL: SL, VT: MVT::i32);
2759
2760	const unsigned Opc = gwsIntrinToOpcode(IntrID);
2761	SmallVector<SDValue, `5`> Ops;
2762	if (HasVSrc)
2763	Ops.push_back(Elt: N->getOperand(Num: `2`));
2764	Ops.push_back(Elt: OffsetField);
2765	Ops.push_back(Elt: Chain);
2766
2767	SDNode *Selected = CurDAG->SelectNodeTo(N, MachineOpc: Opc, VTs: N->getVTList(), Ops);
2768	CurDAG->setNodeMemRefs(N: cast<MachineSDNode>(Val: Selected), NewMemRefs: {MMO});
2769	}
2770
2771	void AMDGPUDAGToDAGISel::SelectInterpP1F16(SDNode *N) {
2772	if (Subtarget->getLDSBankCount() != `16`) {
2773	// This is a single instruction with a pattern.
2774	SelectCode(N);
2775	return;
2776	}
2777
2778	SDLoc DL(N);
2779
2780	// This requires 2 instructions. It is possible to write a pattern to support
2781	// this, but the generated isel emitter doesn't correctly deal with multiple
2782	// output instructions using the same physical register input. The copy to m0
2783	// is incorrectly placed before the second instruction.
2784	//
2785	// TODO: Match source modifiers.
2786	//
2787	// def : Pat <
2788	// (int_amdgcn_interp_p1_f16
2789	// (VOP3Mods f32:$src0, i32:$src0_modifiers),
2790	// (i32 timm:$attrchan), (i32 timm:$attr),
2791	// (i1 timm:$high), M0),
2792	// (V_INTERP_P1LV_F16 $src0_modifiers, VGPR_32:$src0, timm:$attr,
2793	// timm:$attrchan, 0,
2794	// (V_INTERP_MOV_F32 2, timm:$attr, timm:$attrchan), timm:$high)> {
2795	// let Predicates = [has16BankLDS];
2796	// }
2797
2798	// 16 bank LDS
2799	SDValue ToM0 = CurDAG->getCopyToReg(Chain: CurDAG->getEntryNode(), dl: DL, Reg: AMDGPU::M0,
2800	N: N->getOperand(Num: `5`), Glue: SDValue ());
2801
2802	SDVTList VTs = CurDAG->getVTList(VT1: MVT::f32, VT2: MVT::Other);
2803
2804	SDNode *InterpMov =
2805	CurDAG->getMachineNode(Opcode: AMDGPU::V_INTERP_MOV_F32, dl: DL, VTs, Ops: {
2806	CurDAG->getTargetConstant(Val: `2`, DL, VT: MVT::i32), // P0
2807	N->getOperand(Num: `3`), // Attr
2808	N->getOperand(Num: `2`), // Attrchan
2809	ToM0.getValue(R: `1`) // In glue
2810	});
2811
2812	SDNode *InterpP1LV =
2813	CurDAG->getMachineNode(Opcode: AMDGPU::V_INTERP_P1LV_F16, dl: DL, VT: MVT::f32, Ops: {
2814	CurDAG->getTargetConstant(Val: `0`, DL, VT: MVT::i32), // $src0_modifiers
2815	N->getOperand(Num: `1`), // Src0
2816	N->getOperand(Num: `3`), // Attr
2817	N->getOperand(Num: `2`), // Attrchan
2818	CurDAG->getTargetConstant(Val: `0`, DL, VT: MVT::i32), // $src2_modifiers
2819	SDValue (InterpMov, `0`), // Src2 - holds two f16 values selected by high
2820	N->getOperand(Num: `4`), // high
2821	CurDAG->getTargetConstant(Val: `0`, DL, VT: MVT::i1), // $clamp
2822	CurDAG->getTargetConstant(Val: `0`, DL, VT: MVT::i32), // $omod
2823	SDValue (InterpMov, `1`)
2824	});
2825
2826	CurDAG->ReplaceAllUsesOfValueWith(From: SDValue (N, `0`), To: SDValue (InterpP1LV, `0`));
2827	}
2828
2829	void AMDGPUDAGToDAGISel::SelectINTRINSIC_W_CHAIN(SDNode *N) {
2830	unsigned IntrID = N->getConstantOperandVal(Num: `1`);
2831	switch (IntrID) {
2832	case Intrinsic::amdgcn_ds_append:
2833	case Intrinsic::amdgcn_ds_consume: {
2834	if (N->getValueType(ResNo: `0`) != MVT::i32)
2835	break;
2836	SelectDSAppendConsume(N, IntrID);
2837	return;
2838	}
2839	case Intrinsic::amdgcn_ds_bvh_stack_rtn:
2840	case Intrinsic::amdgcn_ds_bvh_stack_push4_pop1_rtn:
2841	case Intrinsic::amdgcn_ds_bvh_stack_push8_pop1_rtn:
2842	case Intrinsic::amdgcn_ds_bvh_stack_push8_pop2_rtn:
2843	SelectDSBvhStackIntrinsic(N, IntrID);
2844	return;
2845	case Intrinsic::amdgcn_init_whole_wave:
2846	CurDAG->getMachineFunction()
2847	.getInfo<SIMachineFunctionInfo>()
2848	->setInitWholeWave();
2849	break;
2850	}
2851
2852	SelectCode(N);
2853	}
2854
2855	void AMDGPUDAGToDAGISel::SelectINTRINSIC_WO_CHAIN(SDNode *N) {
2856	unsigned IntrID = N->getConstantOperandVal(Num: `0`);
2857	unsigned Opcode = AMDGPU::INSTRUCTION_LIST_END;
2858	SDNode *ConvGlueNode = N->getGluedNode();
2859	if (ConvGlueNode) {
2860	// FIXME: Possibly iterate over multiple glue nodes?
2861	assert(ConvGlueNode->getOpcode() == ISD::CONVERGENCECTRL_GLUE);
2862	ConvGlueNode = ConvGlueNode->getOperand(Num: `0`).getNode();
2863	ConvGlueNode =
2864	CurDAG->getMachineNode(Opcode: TargetOpcode::CONVERGENCECTRL_GLUE, dl: {},
2865	VT: MVT::Glue, Op1: SDValue (ConvGlueNode, `0`));
2866	} else {
2867	ConvGlueNode = nullptr;
2868	}
2869	switch (IntrID) {
2870	case Intrinsic::amdgcn_wqm:
2871	Opcode = AMDGPU::WQM;
2872	break;
2873	case Intrinsic::amdgcn_softwqm:
2874	Opcode = AMDGPU::SOFT_WQM;
2875	break;
2876	case Intrinsic::amdgcn_wwm:
2877	case Intrinsic::amdgcn_strict_wwm:
2878	Opcode = AMDGPU::STRICT_WWM;
2879	break;
2880	case Intrinsic::amdgcn_strict_wqm:
2881	Opcode = AMDGPU::STRICT_WQM;
2882	break;
2883	case Intrinsic::amdgcn_interp_p1_f16:
2884	SelectInterpP1F16(N);
2885	return;
2886	case Intrinsic::amdgcn_permlane16_swap:
2887	case Intrinsic::amdgcn_permlane32_swap: {
2888	if ((IntrID == Intrinsic::amdgcn_permlane16_swap &&
2889	!Subtarget->hasPermlane16Swap()) \|\|
2890	(IntrID == Intrinsic::amdgcn_permlane32_swap &&
2891	!Subtarget->hasPermlane32Swap())) {
2892	SelectCode(N); // Hit the default error
2893	return;
2894	}
2895
2896	Opcode = IntrID == Intrinsic::amdgcn_permlane16_swap
2897	? AMDGPU::V_PERMLANE16_SWAP_B32_e64
2898	: AMDGPU::V_PERMLANE32_SWAP_B32_e64;
2899
2900	SmallVector<SDValue, `4`> NewOps(N->op_begin() + `1`, N->op_end());
2901	if (ConvGlueNode)
2902	NewOps.push_back(Elt: SDValue (ConvGlueNode, `0`));
2903
2904	bool FI = N->getConstantOperandVal(Num: `3`);
2905	NewOps [`2`] = CurDAG->getTargetConstant(
2906	Val: FI ? AMDGPU::DPP::DPP_FI_1 : AMDGPU::DPP::DPP_FI_0, DL: SDLoc (), VT: MVT::i32);
2907
2908	CurDAG->SelectNodeTo(N, MachineOpc: Opcode, VTs: N->getVTList(), Ops: NewOps);
2909	return;
2910	}
2911	default:
2912	SelectCode(N);
2913	break;
2914	}
2915
2916	if (Opcode != AMDGPU::INSTRUCTION_LIST_END) {
2917	SDValue Src = N->getOperand(Num: `1`);
2918	CurDAG->SelectNodeTo(N, MachineOpc: Opcode, VTs: N->getVTList(), Ops: {Src});
2919	}
2920
2921	if (ConvGlueNode) {
2922	SmallVector<SDValue, `4`> NewOps(N->ops());
2923	NewOps.push_back(Elt: SDValue (ConvGlueNode, `0`));
2924	CurDAG->MorphNodeTo(N, Opc: N->getOpcode(), VTs: N->getVTList(), Ops: NewOps);
2925	}
2926	}
2927
2928	void AMDGPUDAGToDAGISel::SelectINTRINSIC_VOID(SDNode *N) {
2929	unsigned IntrID = N->getConstantOperandVal(Num: `1`);
2930	switch (IntrID) {
2931	case Intrinsic::amdgcn_ds_gws_init:
2932	case Intrinsic::amdgcn_ds_gws_barrier:
2933	case Intrinsic::amdgcn_ds_gws_sema_v:
2934	case Intrinsic::amdgcn_ds_gws_sema_br:
2935	case Intrinsic::amdgcn_ds_gws_sema_p:
2936	case Intrinsic::amdgcn_ds_gws_sema_release_all:
2937	SelectDS_GWS(N, IntrID);
2938	return;
2939	default:
2940	break;
2941	}
2942
2943	SelectCode(N);
2944	}
2945
2946	void AMDGPUDAGToDAGISel::SelectWAVE_ADDRESS(SDNode *N) {
2947	SDValue Log2WaveSize =
2948	CurDAG->getTargetConstant(Val: Subtarget->getWavefrontSizeLog2(), DL: SDLoc (N), VT: MVT::i32);
2949	CurDAG->SelectNodeTo(N, MachineOpc: AMDGPU::S_LSHR_B32, VTs: N->getVTList(),
2950	Ops: {N->getOperand(Num: `0`), Log2WaveSize});
2951	}
2952
2953	void AMDGPUDAGToDAGISel::SelectSTACKRESTORE(SDNode *N) {
2954	SDValue SrcVal = N->getOperand(Num: `1`);
2955	if (SrcVal.getValueType() != MVT::i32) {
2956	SelectCode(N); // Emit default error
2957	return;
2958	}
2959
2960	SDValue CopyVal;
2961	Register SP = TLI->getStackPointerRegisterToSaveRestore();
2962	SDLoc SL(N);
2963
2964	if (SrcVal.getOpcode() == AMDGPUISD::WAVE_ADDRESS) {
2965	CopyVal = SrcVal.getOperand(i: `0`);
2966	} else {
2967	SDValue Log2WaveSize = CurDAG->getTargetConstant(
2968	Val: Subtarget->getWavefrontSizeLog2(), DL: SL, VT: MVT::i32);
2969
2970	if (N->isDivergent()) {
2971	SrcVal = SDValue (CurDAG->getMachineNode(Opcode: AMDGPU::V_READFIRSTLANE_B32, dl: SL,
2972	VT: MVT::i32, Op1: SrcVal),
2973	`0`);
2974	}
2975
2976	CopyVal = SDValue (CurDAG->getMachineNode(Opcode: AMDGPU::S_LSHL_B32, dl: SL, VT: MVT::i32,
2977	Ops: {SrcVal, Log2WaveSize}),
2978	`0`);
2979	}
2980
2981	SDValue CopyToSP = CurDAG->getCopyToReg(Chain: N->getOperand(Num: `0`), dl: SL, Reg: SP, N: CopyVal);
2982	CurDAG->ReplaceAllUsesOfValueWith(From: SDValue (N, `0`), To: CopyToSP);
2983	}
2984
2985	bool AMDGPUDAGToDAGISel::SelectVOP3ModsImpl(SDValue In, SDValue &Src,
2986	unsigned &Mods,
2987	bool IsCanonicalizing,
2988	bool AllowAbs) const {
2989	Mods = SISrcMods::NONE;
2990	Src = In;
2991
2992	if (Src.getOpcode() == ISD::FNEG) {
2993	Mods \|= SISrcMods::NEG;
2994	Src = Src.getOperand(i: `0`);
2995	} else if (Src.getOpcode() == ISD::FSUB && IsCanonicalizing) {
2996	// Fold fsub [+-]0 into fneg. This may not have folded depending on the
2997	// denormal mode, but we're implicitly canonicalizing in a source operand.
2998	auto *LHS = dyn_cast<ConstantFPSDNode>(Val: Src.getOperand(i: `0`));
2999	if (LHS && LHS->isZero()) {
3000	Mods \|= SISrcMods::NEG;
3001	Src = Src.getOperand(i: `1`);
3002	}
3003	}
3004
3005	if (AllowAbs && Src.getOpcode() == ISD::FABS) {
3006	Mods \|= SISrcMods::ABS;
3007	Src = Src.getOperand(i: `0`);
3008	}
3009
3010	return true;
3011	}
3012
3013	bool AMDGPUDAGToDAGISel::SelectVOP3Mods(SDValue In, SDValue &Src,
3014	SDValue &SrcMods) const {
3015	unsigned Mods;
3016	if (SelectVOP3ModsImpl(In, Src, Mods, /IsCanonicalizing=/true,
3017	/AllowAbs=/true)) {
3018	SrcMods = CurDAG->getTargetConstant(Val: Mods, DL: SDLoc (In), VT: MVT::i32);
3019	return true;
3020	}
3021
3022	return false;
3023	}
3024
3025	bool AMDGPUDAGToDAGISel::SelectVOP3ModsNonCanonicalizing(
3026	SDValue In, SDValue &Src, SDValue &SrcMods) const {
3027	unsigned Mods;
3028	if (SelectVOP3ModsImpl(In, Src, Mods, /IsCanonicalizing=/false,
3029	/AllowAbs=/true)) {
3030	SrcMods = CurDAG->getTargetConstant(Val: Mods, DL: SDLoc (In), VT: MVT::i32);
3031	return true;
3032	}
3033
3034	return false;
3035	}
3036
3037	bool AMDGPUDAGToDAGISel::SelectVOP3BMods(SDValue In, SDValue &Src,
3038	SDValue &SrcMods) const {
3039	unsigned Mods;
3040	if (SelectVOP3ModsImpl(In, Src, Mods,
3041	/IsCanonicalizing=/true,
3042	/AllowAbs=/false)) {
3043	SrcMods = CurDAG->getTargetConstant(Val: Mods, DL: SDLoc (In), VT: MVT::i32);
3044	return true;
3045	}
3046
3047	return false;
3048	}
3049
3050	bool AMDGPUDAGToDAGISel::SelectVOP3NoMods(SDValue In, SDValue &Src) const {
3051	if (In.getOpcode() == ISD::FABS \|\| In.getOpcode() == ISD::FNEG)
3052	return false;
3053
3054	Src = In;
3055	return true;
3056	}
3057
3058	bool AMDGPUDAGToDAGISel::SelectVINTERPModsImpl(SDValue In, SDValue &Src,
3059	SDValue &SrcMods,
3060	bool OpSel) const {
3061	unsigned Mods;
3062	if (SelectVOP3ModsImpl(In, Src, Mods,
3063	/IsCanonicalizing=/true,
3064	/AllowAbs=/false)) {
3065	if (OpSel)
3066	Mods \|= SISrcMods::OP_SEL_0;
3067	SrcMods = CurDAG->getTargetConstant(Val: Mods, DL: SDLoc (In), VT: MVT::i32);
3068	return true;
3069	}
3070
3071	return false;
3072	}
3073
3074	bool AMDGPUDAGToDAGISel::SelectVINTERPMods(SDValue In, SDValue &Src,
3075	SDValue &SrcMods) const {
3076	return SelectVINTERPModsImpl(In, Src, SrcMods, / OpSel / false);
3077	}
3078
3079	bool AMDGPUDAGToDAGISel::SelectVINTERPModsHi(SDValue In, SDValue &Src,
3080	SDValue &SrcMods) const {
3081	return SelectVINTERPModsImpl(In, Src, SrcMods, / OpSel / true);
3082	}
3083
3084	bool AMDGPUDAGToDAGISel::SelectVOP3Mods0(SDValue In, SDValue &Src,
3085	SDValue &SrcMods, SDValue &Clamp,
3086	SDValue &Omod) const {
3087	SDLoc DL(In);
3088	Clamp = CurDAG->getTargetConstant(Val: `0`, DL, VT: MVT::i1);
3089	Omod = CurDAG->getTargetConstant(Val: `0`, DL, VT: MVT::i1);
3090
3091	return SelectVOP3Mods(In, Src, SrcMods);
3092	}
3093
3094	bool AMDGPUDAGToDAGISel::SelectVOP3BMods0(SDValue In, SDValue &Src,
3095	SDValue &SrcMods, SDValue &Clamp,
3096	SDValue &Omod) const {
3097	SDLoc DL(In);
3098	Clamp = CurDAG->getTargetConstant(Val: `0`, DL, VT: MVT::i1);
3099	Omod = CurDAG->getTargetConstant(Val: `0`, DL, VT: MVT::i1);
3100
3101	return SelectVOP3BMods(In, Src, SrcMods);
3102	}
3103
3104	bool AMDGPUDAGToDAGISel::SelectVOP3OMods(SDValue In, SDValue &Src,
3105	SDValue &Clamp, SDValue &Omod) const {
3106	Src = In;
3107
3108	SDLoc DL(In);
3109	Clamp = CurDAG->getTargetConstant(Val: `0`, DL, VT: MVT::i1);
3110	Omod = CurDAG->getTargetConstant(Val: `0`, DL, VT: MVT::i1);
3111
3112	return true;
3113	}
3114
3115	bool AMDGPUDAGToDAGISel::SelectVOP3PMods(SDValue In, SDValue &Src,
3116	SDValue &SrcMods, bool IsDOT) const {
3117	unsigned Mods = SISrcMods::NONE;
3118	Src = In;
3119
3120	// TODO: Handle G_FSUB 0 as fneg
3121	if (Src.getOpcode() == ISD::FNEG) {
3122	Mods ^= (SISrcMods::NEG \| SISrcMods::NEG_HI);
3123	Src = Src.getOperand(i: `0`);
3124	}
3125
3126	if (Src.getOpcode() == ISD::BUILD_VECTOR && Src.getNumOperands() == `2` &&
3127	(!IsDOT \|\| !Subtarget->hasDOTOpSelHazard())) {
3128	unsigned VecMods = Mods;
3129
3130	SDValue Lo = stripBitcast(Val: Src.getOperand(i: `0`));
3131	SDValue Hi = stripBitcast(Val: Src.getOperand(i: `1`));
3132
3133	if (Lo.getOpcode() == ISD::FNEG) {
3134	Lo = stripBitcast(Val: Lo.getOperand(i: `0`));
3135	Mods ^= SISrcMods::NEG;
3136	}
3137
3138	if (Hi.getOpcode() == ISD::FNEG) {
3139	Hi = stripBitcast(Val: Hi.getOperand(i: `0`));
3140	Mods ^= SISrcMods::NEG_HI;
3141	}
3142
3143	if (isExtractHiElt(In: Lo, Out&: Lo))
3144	Mods \|= SISrcMods::OP_SEL_0;
3145
3146	if (isExtractHiElt(In: Hi, Out&: Hi))
3147	Mods \|= SISrcMods::OP_SEL_1;
3148
3149	unsigned VecSize = Src.getValueSizeInBits();
3150	Lo = stripExtractLoElt(In: Lo);
3151	Hi = stripExtractLoElt(In: Hi);
3152
3153	if (Lo.getValueSizeInBits() > VecSize) {
3154	Lo = CurDAG->getTargetExtractSubreg(
3155	SRIdx: (VecSize > `32`) ? AMDGPU::sub0_sub1 : AMDGPU::sub0, DL: SDLoc (In),
3156	VT: MVT::getIntegerVT(BitWidth: VecSize), Operand: Lo);
3157	}
3158
3159	if (Hi.getValueSizeInBits() > VecSize) {
3160	Hi = CurDAG->getTargetExtractSubreg(
3161	SRIdx: (VecSize > `32`) ? AMDGPU::sub0_sub1 : AMDGPU::sub0, DL: SDLoc (In),
3162	VT: MVT::getIntegerVT(BitWidth: VecSize), Operand: Hi);
3163	}
3164
3165	assert(Lo.getValueSizeInBits() <= VecSize &&
3166	Hi.getValueSizeInBits() <= VecSize);
3167
3168	if (Lo == Hi && !isInlineImmediate(N: Lo.getNode())) {
3169	// Really a scalar input. Just select from the low half of the register to
3170	// avoid packing.
3171
3172	if (VecSize == `32` \|\| VecSize == Lo.getValueSizeInBits()) {
3173	Src = Lo;
3174	} else {
3175	assert(Lo.getValueSizeInBits() == `32` && VecSize == `64`);
3176
3177	SDLoc SL(In);
3178	SDValue Undef = SDValue (
3179	CurDAG->getMachineNode(Opcode: TargetOpcode::IMPLICIT_DEF, dl: SL,
3180	VT: Lo.getValueType()), `0`);
3181	auto RC = Lo ->isDivergent() ? AMDGPU::VReg_64RegClassID
3182	: AMDGPU::SReg_64RegClassID;
3183	const SDValue Ops[] = {
3184	CurDAG->getTargetConstant(Val: RC, DL: SL, VT: MVT::i32),
3185	Lo, CurDAG->getTargetConstant(Val: AMDGPU::sub0, DL: SL, VT: MVT::i32),
3186	Undef, CurDAG->getTargetConstant(Val: AMDGPU::sub1, DL: SL, VT: MVT::i32) };
3187
3188	Src = SDValue (CurDAG->getMachineNode(Opcode: TargetOpcode::REG_SEQUENCE, dl: SL,
3189	VT: Src.getValueType(), Ops), `0`);
3190	}
3191	SrcMods = CurDAG->getTargetConstant(Val: Mods, DL: SDLoc (In), VT: MVT::i32);
3192	return true;
3193	}
3194
3195	if (VecSize == `64` && Lo == Hi && isa<ConstantFPSDNode>(Val: Lo)) {
3196	uint64_t Lit = cast<ConstantFPSDNode>(Val&: Lo)->getValueAPF()
3197	.bitcastToAPInt().getZExtValue();
3198	if (AMDGPU::isInlinableLiteral32(Literal: Lit, HasInv2Pi: Subtarget->hasInv2PiInlineImm())) {
3199	Src = CurDAG->getTargetConstant(Val: Lit, DL: SDLoc (In), VT: MVT::i64);
3200	SrcMods = CurDAG->getTargetConstant(Val: Mods, DL: SDLoc (In), VT: MVT::i32);
3201	return true;
3202	}
3203	}
3204
3205	Mods = VecMods;
3206	} else if (Src.getOpcode() == ISD::VECTOR_SHUFFLE &&
3207	Src.getNumOperands() == `2`) {
3208
3209	// TODO: We should repeat the build_vector source check above for the
3210	// vector_shuffle for negates and casts of individual elements.
3211
3212	auto *SVN = cast<ShuffleVectorSDNode>(Val&: Src);
3213	ArrayRef<int> Mask = SVN->getMask();
3214
3215	if (Mask [`0`] < `2` && Mask [`1`] < `2`) {
3216	// src1 should be undef.
3217	SDValue ShuffleSrc = SVN->getOperand(Num: `0`);
3218
3219	if (ShuffleSrc.getOpcode() == ISD::FNEG) {
3220	ShuffleSrc = ShuffleSrc.getOperand(i: `0`);
3221	Mods ^= (SISrcMods::NEG \| SISrcMods::NEG_HI);
3222	}
3223
3224	if (Mask [`0`] == `1`)
3225	Mods \|= SISrcMods::OP_SEL_0;
3226	if (Mask [`1`] == `1`)
3227	Mods \|= SISrcMods::OP_SEL_1;
3228
3229	Src = ShuffleSrc;
3230	SrcMods = CurDAG->getTargetConstant(Val: Mods, DL: SDLoc (In), VT: MVT::i32);
3231	return true;
3232	}
3233	}
3234
3235	// Packed instructions do not have abs modifiers.
3236	Mods \|= SISrcMods::OP_SEL_1;
3237
3238	SrcMods = CurDAG->getTargetConstant(Val: Mods, DL: SDLoc (In), VT: MVT::i32);
3239	return true;
3240	}
3241
3242	bool AMDGPUDAGToDAGISel::SelectVOP3PModsDOT(SDValue In, SDValue &Src,
3243	SDValue &SrcMods) const {
3244	return SelectVOP3PMods(In, Src, SrcMods, IsDOT: true);
3245	}
3246
3247	bool AMDGPUDAGToDAGISel::SelectVOP3PModsNeg(SDValue In, SDValue &Src) const {
3248	const ConstantSDNode *C = cast<ConstantSDNode>(Val&: In);
3249	// Literal i1 value set in intrinsic, represents SrcMods for the next operand.
3250	// 1 promotes packed values to signed, 0 treats them as unsigned.
3251	assert(C->getAPIntValue().getBitWidth() == `1` && "expected i1 value");
3252
3253	unsigned Mods = SISrcMods::OP_SEL_1;
3254	unsigned SrcSign = C->getZExtValue();
3255	if (SrcSign == `1`)
3256	Mods ^= SISrcMods::NEG;
3257
3258	Src = CurDAG->getTargetConstant(Val: Mods, DL: SDLoc (In), VT: MVT::i32);
3259	return true;
3260	}
3261
3262	bool AMDGPUDAGToDAGISel::SelectWMMAOpSelVOP3PMods(SDValue In,
3263	SDValue &Src) const {
3264	const ConstantSDNode *C = cast<ConstantSDNode>(Val&: In);
3265	assert(C->getAPIntValue().getBitWidth() == `1` && "expected i1 value");
3266
3267	unsigned Mods = SISrcMods::OP_SEL_1;
3268	unsigned SrcVal = C->getZExtValue();
3269	if (SrcVal == `1`)
3270	Mods \|= SISrcMods::OP_SEL_0;
3271
3272	Src = CurDAG->getTargetConstant(Val: Mods, DL: SDLoc (In), VT: MVT::i32);
3273	return true;
3274	}
3275
3276	static MachineSDNode *buildRegSequence32(SmallVectorImpl<SDValue> &Elts,
3277	llvm::SelectionDAG *CurDAG,
3278	const SDLoc &DL) {
3279	unsigned DstRegClass;
3280	EVT DstTy;
3281	switch (Elts.size()) {
3282	case `8`:
3283	DstRegClass = AMDGPU::VReg_256RegClassID;
3284	DstTy = MVT::v8i32;
3285	break;
3286	case `4`:
3287	DstRegClass = AMDGPU::VReg_128RegClassID;
3288	DstTy = MVT::v4i32;
3289	break;
3290	case `2`:
3291	DstRegClass = AMDGPU::VReg_64RegClassID;
3292	DstTy = MVT::v2i32;
3293	break;
3294	default:
3295	llvm_unreachable("unhandled Reg sequence size");
3296	}
3297
3298	SmallVector<SDValue, `17`> Ops;
3299	Ops.push_back(Elt: CurDAG->getTargetConstant(Val: DstRegClass, DL, VT: MVT::i32));
3300	for (unsigned i = `0`; i < Elts.size(); ++i) {
3301	Ops.push_back(Elt: Elts [i]);
3302	Ops.push_back(Elt: CurDAG->getTargetConstant(
3303	Val: SIRegisterInfo::getSubRegFromChannel(Channel: i), DL, VT: MVT::i32));
3304	}
3305	return CurDAG->getMachineNode(Opcode: TargetOpcode::REG_SEQUENCE, dl: DL, VT: DstTy, Ops);
3306	}
3307
3308	static MachineSDNode *buildRegSequence16(SmallVectorImpl<SDValue> &Elts,
3309	llvm::SelectionDAG *CurDAG,
3310	const SDLoc &DL) {
3311	SmallVector<SDValue, `8`> PackedElts;
3312	assert("unhandled Reg sequence size" &&
3313	(Elts.size() == `8` \|\| Elts.size() == `16`));
3314
3315	// Pack 16-bit elements in pairs into 32-bit register. If both elements are
3316	// unpacked from 32-bit source use it, otherwise pack them using v_perm.
3317	for (unsigned i = `0`; i < Elts.size(); i += `2`) {
3318	SDValue LoSrc = stripExtractLoElt(In: stripBitcast(Val: Elts [i]));
3319	SDValue HiSrc;
3320	if (isExtractHiElt(In: Elts [i + `1`], Out&: HiSrc) && LoSrc == HiSrc) {
3321	PackedElts.push_back(Elt: HiSrc);
3322	} else {
3323	SDValue PackLoLo = CurDAG->getTargetConstant(Val: `0x05040100`, DL, VT: MVT::i32);
3324	MachineSDNode *Packed =
3325	CurDAG->getMachineNode(Opcode: AMDGPU::V_PERM_B32_e64, dl: DL, VT: MVT::i32,
3326	Ops: {Elts [i + `1`], Elts [i], PackLoLo});
3327	PackedElts.push_back(Elt: SDValue (Packed, `0`));
3328	}
3329	}
3330
3331	return buildRegSequence32(Elts&: PackedElts, CurDAG, DL);
3332	}
3333
3334	static MachineSDNode *buildRegSequence(SmallVectorImpl<SDValue> &Elts,
3335	llvm::SelectionDAG *CurDAG,
3336	const SDLoc &DL, unsigned ElementSize) {
3337	if (ElementSize == `16`)
3338	return buildRegSequence16(Elts, CurDAG, DL);
3339	if (ElementSize == `32`)
3340	return buildRegSequence32(Elts, CurDAG, DL);
3341	llvm_unreachable("Unhandled element size");
3342	}
3343
3344	static void selectWMMAModsNegAbs(unsigned ModOpcode, unsigned &Mods,
3345	SmallVectorImpl<SDValue> &Elts, SDValue &Src,
3346	llvm::SelectionDAG CurDAG, const* SDLoc &DL,
3347	unsigned ElementSize) {
3348	if (ModOpcode == ISD::FNEG) {
3349	Mods \|= SISrcMods::NEG;
3350	// Check if all elements also have abs modifier
3351	SmallVector<SDValue, `8`> NegAbsElts;
3352	for (auto El : Elts) {
3353	if (El.getOpcode() != ISD::FABS)
3354	break;
3355	NegAbsElts.push_back(Elt: El ->getOperand(Num: `0`));
3356	}
3357	if (Elts.size() != NegAbsElts.size()) {
3358	// Neg
3359	Src = SDValue (buildRegSequence(Elts, CurDAG, DL, ElementSize), `0`);
3360	} else {
3361	// Neg and Abs
3362	Mods \|= SISrcMods::NEG_HI;
3363	Src = SDValue (buildRegSequence(Elts&: NegAbsElts, CurDAG, DL, ElementSize), `0`);
3364	}
3365	} else {
3366	assert(ModOpcode == ISD::FABS);
3367	// Abs
3368	Mods \|= SISrcMods::NEG_HI;
3369	Src = SDValue (buildRegSequence(Elts, CurDAG, DL, ElementSize), `0`);
3370	}
3371	}
3372
3373	// Check all f16 elements for modifiers while looking through b32 and v2b16
3374	// build vector, stop if element does not satisfy ModifierCheck.
3375	static void
3376	checkWMMAElementsModifiersF16(BuildVectorSDNode *BV,
3377	std::function<bool(SDValue)> ModifierCheck) {
3378	for (unsigned i = `0`; i < BV->getNumOperands(); ++i) {
3379	if (auto *F16Pair =
3380	dyn_cast<BuildVectorSDNode>(Val: stripBitcast(Val: BV->getOperand(Num: i)))) {
3381	for (unsigned i = `0`; i < F16Pair->getNumOperands(); ++i) {
3382	SDValue ElF16 = stripBitcast(Val: F16Pair->getOperand(Num: i));
3383	if (!ModifierCheck (ElF16))
3384	break;
3385	}
3386	}
3387	}
3388	}
3389
3390	bool AMDGPUDAGToDAGISel::SelectWMMAModsF16Neg(SDValue In, SDValue &Src,
3391	SDValue &SrcMods) const {
3392	Src = In;
3393	unsigned Mods = SISrcMods::OP_SEL_1;
3394
3395	// mods are on f16 elements
3396	if (auto *BV = dyn_cast<BuildVectorSDNode>(Val: stripBitcast(Val: In))) {
3397	SmallVector<SDValue, `8`> EltsF16;
3398
3399	checkWMMAElementsModifiersF16(BV, ModifierCheck: [&](SDValue Element) -> bool {
3400	if (Element.getOpcode() != ISD::FNEG)
3401	return false;
3402	EltsF16.push_back(Elt: Element.getOperand(i: `0`));
3403	return true;
3404	});
3405
3406	// All elements have neg modifier
3407	if (BV->getNumOperands() * `2` == EltsF16.size()) {
3408	Src = SDValue (buildRegSequence16(Elts&: EltsF16, CurDAG, DL: SDLoc (In)), `0`);
3409	Mods \|= SISrcMods::NEG;
3410	Mods \|= SISrcMods::NEG_HI;
3411	}
3412	}
3413
3414	// mods are on v2f16 elements
3415	if (auto *BV = dyn_cast<BuildVectorSDNode>(Val: stripBitcast(Val: In))) {
3416	SmallVector<SDValue, `8`> EltsV2F16;
3417	for (unsigned i = `0`; i < BV->getNumOperands(); ++i) {
3418	SDValue ElV2f16 = stripBitcast(Val: BV->getOperand(Num: i));
3419	// Based on first element decide which mod we match, neg or abs
3420	if (ElV2f16.getOpcode() != ISD::FNEG)
3421	break;
3422	EltsV2F16.push_back(Elt: ElV2f16.getOperand(i: `0`));
3423	}
3424
3425	// All pairs of elements have neg modifier
3426	if (BV->getNumOperands() == EltsV2F16.size()) {
3427	Src = SDValue (buildRegSequence32(Elts&: EltsV2F16, CurDAG, DL: SDLoc (In)), `0`);
3428	Mods \|= SISrcMods::NEG;
3429	Mods \|= SISrcMods::NEG_HI;
3430	}
3431	}
3432
3433	SrcMods = CurDAG->getTargetConstant(Val: Mods, DL: SDLoc (In), VT: MVT::i32);
3434	return true;
3435	}
3436
3437	bool AMDGPUDAGToDAGISel::SelectWMMAModsF16NegAbs(SDValue In, SDValue &Src,
3438	SDValue &SrcMods) const {
3439	Src = In;
3440	unsigned Mods = SISrcMods::OP_SEL_1;
3441	unsigned ModOpcode;
3442
3443	// mods are on f16 elements
3444	if (auto *BV = dyn_cast<BuildVectorSDNode>(Val: stripBitcast(Val: In))) {
3445	SmallVector<SDValue, `8`> EltsF16;
3446	checkWMMAElementsModifiersF16(BV, ModifierCheck: [&](SDValue ElF16) -> bool {
3447	// Based on first element decide which mod we match, neg or abs
3448	if (EltsF16.empty())
3449	ModOpcode = (ElF16.getOpcode() == ISD::FNEG) ? ISD::FNEG : ISD::FABS;
3450	if (ElF16.getOpcode() != ModOpcode)
3451	return false;
3452	EltsF16.push_back(Elt: ElF16.getOperand(i: `0`));
3453	return true;
3454	});
3455
3456	// All elements have ModOpcode modifier
3457	if (BV->getNumOperands() * `2` == EltsF16.size())
3458	selectWMMAModsNegAbs(ModOpcode, Mods, Elts&: EltsF16, Src, CurDAG, DL: SDLoc (In),
3459	ElementSize: `16`);
3460	}
3461
3462	// mods are on v2f16 elements
3463	if (auto *BV = dyn_cast<BuildVectorSDNode>(Val: stripBitcast(Val: In))) {
3464	SmallVector<SDValue, `8`> EltsV2F16;
3465
3466	for (unsigned i = `0`; i < BV->getNumOperands(); ++i) {
3467	SDValue ElV2f16 = stripBitcast(Val: BV->getOperand(Num: i));
3468	// Based on first element decide which mod we match, neg or abs
3469	if (EltsV2F16.empty())
3470	ModOpcode = (ElV2f16.getOpcode() == ISD::FNEG) ? ISD::FNEG : ISD::FABS;
3471	if (ElV2f16 ->getOpcode() != ModOpcode)
3472	break;
3473	EltsV2F16.push_back(Elt: ElV2f16 ->getOperand(Num: `0`));
3474	}
3475
3476	// All elements have ModOpcode modifier
3477	if (BV->getNumOperands() == EltsV2F16.size())
3478	selectWMMAModsNegAbs(ModOpcode, Mods, Elts&: EltsV2F16, Src, CurDAG, DL: SDLoc (In),
3479	ElementSize: `32`);
3480	}
3481
3482	SrcMods = CurDAG->getTargetConstant(Val: Mods, DL: SDLoc (In), VT: MVT::i32);
3483	return true;
3484	}
3485
3486	bool AMDGPUDAGToDAGISel::SelectWMMAModsF32NegAbs(SDValue In, SDValue &Src,
3487	SDValue &SrcMods) const {
3488	Src = In;
3489	unsigned Mods = SISrcMods::OP_SEL_1;
3490	SmallVector<SDValue, `8`> EltsF32;
3491
3492	if (auto *BV = dyn_cast<BuildVectorSDNode>(Val: stripBitcast(Val: In))) {
3493	assert(BV->getNumOperands() > `0`);
3494	// Based on first element decide which mod we match, neg or abs
3495	SDValue ElF32 = stripBitcast(Val: BV->getOperand(Num: `0`));
3496	unsigned ModOpcode =
3497	(ElF32.getOpcode() == ISD::FNEG) ? ISD::FNEG : ISD::FABS;
3498	for (unsigned i = `0`; i < BV->getNumOperands(); ++i) {
3499	SDValue ElF32 = stripBitcast(Val: BV->getOperand(Num: i));
3500	if (ElF32.getOpcode() != ModOpcode)
3501	break;
3502	EltsF32.push_back(Elt: ElF32.getOperand(i: `0`));
3503	}
3504
3505	// All elements had ModOpcode modifier
3506	if (BV->getNumOperands() == EltsF32.size())
3507	selectWMMAModsNegAbs(ModOpcode, Mods, Elts&: EltsF32, Src, CurDAG, DL: SDLoc (In),
3508	ElementSize: `32`);
3509	}
3510
3511	SrcMods = CurDAG->getTargetConstant(Val: Mods, DL: SDLoc (In), VT: MVT::i32);
3512	return true;
3513	}
3514
3515	bool AMDGPUDAGToDAGISel::SelectWMMAVISrc(SDValue In, SDValue &Src) const {
3516	if (auto *BV = dyn_cast<BuildVectorSDNode>(Val&: In)) {
3517	BitVector UndefElements;
3518	if (SDValue Splat = BV->getSplatValue(UndefElements: &UndefElements))
3519	if (isInlineImmediate(N: Splat.getNode())) {
3520	if (const ConstantSDNode *C = dyn_cast<ConstantSDNode>(Val&: Splat)) {
3521	unsigned Imm = C->getAPIntValue().getSExtValue();
3522	Src = CurDAG->getTargetConstant(Val: Imm, DL: SDLoc (In), VT: MVT::i32);
3523	return true;
3524	}
3525	if (const ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(Val&: Splat)) {
3526	unsigned Imm = C->getValueAPF().bitcastToAPInt().getSExtValue();
3527	Src = CurDAG->getTargetConstant(Val: Imm, DL: SDLoc (In), VT: MVT::i32);
3528	return true;
3529	}
3530	llvm_unreachable("unhandled Constant node");
3531	}
3532	}
3533
3534	// 16 bit splat
3535	SDValue SplatSrc32 = stripBitcast(Val: In);
3536	if (auto *SplatSrc32BV = dyn_cast<BuildVectorSDNode>(Val&: SplatSrc32))
3537	if (SDValue Splat32 = SplatSrc32BV->getSplatValue()) {
3538	SDValue SplatSrc16 = stripBitcast(Val: Splat32);
3539	if (auto *SplatSrc16BV = dyn_cast<BuildVectorSDNode>(Val&: SplatSrc16))
3540	if (SDValue Splat = SplatSrc16BV->getSplatValue()) {
3541	const SIInstrInfo *TII = Subtarget->getInstrInfo();
3542	std::optional<APInt> RawValue;
3543	if (const ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(Val&: Splat))
3544	RawValue = C->getValueAPF().bitcastToAPInt();
3545	else if (const ConstantSDNode *C = dyn_cast<ConstantSDNode>(Val&: Splat))
3546	RawValue = C->getAPIntValue();
3547
3548	if (RawValue.has_value()) {
3549	EVT VT = In.getValueType().getScalarType();
3550	if (VT.getSimpleVT() == MVT::f16 \|\| VT.getSimpleVT() == MVT::bf16) {
3551	APFloat FloatVal(VT.getSimpleVT() == MVT::f16
3552	? APFloatBase::IEEEhalf()
3553	: APFloatBase::BFloat(),
3554	RawValue.value());
3555	if (TII->isInlineConstant(Imm: FloatVal)) {
3556	Src = CurDAG->getTargetConstant(Val: RawValue.value(), DL: SDLoc (In),
3557	VT: MVT::i16);
3558	return true;
3559	}
3560	} else if (VT.getSimpleVT() == MVT::i16) {
3561	if (TII->isInlineConstant(Imm: RawValue.value())) {
3562	Src = CurDAG->getTargetConstant(Val: RawValue.value(), DL: SDLoc (In),
3563	VT: MVT::i16);
3564	return true;
3565	}
3566	} else
3567	llvm_unreachable("unknown 16-bit type");
3568	}
3569	}
3570	}
3571
3572	return false;
3573	}
3574
3575	bool AMDGPUDAGToDAGISel::SelectSWMMACIndex8(SDValue In, SDValue &Src,
3576	SDValue &IndexKey) const {
3577	unsigned Key = `0`;
3578	Src = In;
3579
3580	if (In.getOpcode() == ISD::SRL) {
3581	const llvm::SDValue &ShiftSrc = In.getOperand(i: `0`);
3582	ConstantSDNode *ShiftAmt = dyn_cast<ConstantSDNode>(Val: In.getOperand(i: `1`));
3583	if (ShiftSrc.getValueType().getSizeInBits() == `32` && ShiftAmt &&
3584	ShiftAmt->getZExtValue() % `8` == `0`) {
3585	Key = ShiftAmt->getZExtValue() / `8`;
3586	Src = ShiftSrc;
3587	}
3588	}
3589
3590	IndexKey = CurDAG->getTargetConstant(Val: Key, DL: SDLoc (In), VT: MVT::i32);
3591	return true;
3592	}
3593
3594	bool AMDGPUDAGToDAGISel::SelectSWMMACIndex16(SDValue In, SDValue &Src,
3595	SDValue &IndexKey) const {
3596	unsigned Key = `0`;
3597	Src = In;
3598
3599	if (In.getOpcode() == ISD::SRL) {
3600	const llvm::SDValue &ShiftSrc = In.getOperand(i: `0`);
3601	ConstantSDNode *ShiftAmt = dyn_cast<ConstantSDNode>(Val: In.getOperand(i: `1`));
3602	if (ShiftSrc.getValueType().getSizeInBits() == `32` && ShiftAmt &&
3603	ShiftAmt->getZExtValue() == `16`) {
3604	Key = `1`;
3605	Src = ShiftSrc;
3606	}
3607	}
3608
3609	IndexKey = CurDAG->getTargetConstant(Val: Key, DL: SDLoc (In), VT: MVT::i32);
3610	return true;
3611	}
3612
3613	bool AMDGPUDAGToDAGISel::SelectVOP3OpSel(SDValue In, SDValue &Src,
3614	SDValue &SrcMods) const {
3615	Src = In;
3616	// FIXME: Handle op_sel
3617	SrcMods = CurDAG->getTargetConstant(Val: `0`, DL: SDLoc (In), VT: MVT::i32);
3618	return true;
3619	}
3620
3621	bool AMDGPUDAGToDAGISel::SelectVOP3OpSelMods(SDValue In, SDValue &Src,
3622	SDValue &SrcMods) const {
3623	// FIXME: Handle op_sel
3624	return SelectVOP3Mods(In, Src, SrcMods);
3625	}
3626
3627	// The return value is not whether the match is possible (which it always is),
3628	// but whether or not it a conversion is really used.
3629	bool AMDGPUDAGToDAGISel::SelectVOP3PMadMixModsImpl(SDValue In, SDValue &Src,
3630	unsigned &Mods) const {
3631	Mods = `0`;
3632	SelectVOP3ModsImpl(In, Src, Mods);
3633
3634	if (Src.getOpcode() == ISD::FP_EXTEND) {
3635	Src = Src.getOperand(i: `0`);
3636	assert(Src.getValueType() == MVT::f16);
3637	Src = stripBitcast(Val: Src);
3638
3639	// Be careful about folding modifiers if we already have an abs. fneg is
3640	// applied last, so we don't want to apply an earlier fneg.
3641	if ((Mods & SISrcMods::ABS) == `0`) {
3642	unsigned ModsTmp;
3643	SelectVOP3ModsImpl(In: Src, Src, Mods&: ModsTmp);
3644
3645	if ((ModsTmp & SISrcMods::NEG) != `0`)
3646	Mods ^= SISrcMods::NEG;
3647
3648	if ((ModsTmp & SISrcMods::ABS) != `0`)
3649	Mods \|= SISrcMods::ABS;
3650	}
3651
3652	// op_sel/op_sel_hi decide the source type and source.
3653	// If the source's op_sel_hi is set, it indicates to do a conversion from fp16.
3654	// If the sources's op_sel is set, it picks the high half of the source
3655	// register.
3656
3657	Mods \|= SISrcMods::OP_SEL_1;
3658	if (isExtractHiElt(In: Src, Out&: Src)) {
3659	Mods \|= SISrcMods::OP_SEL_0;
3660
3661	// TODO: Should we try to look for neg/abs here?
3662	}
3663
3664	// Prevent unnecessary subreg COPY to VGPR_16
3665	if (Src.getOpcode() == ISD::TRUNCATE &&
3666	Src.getOperand(i: `0`).getValueType() == MVT::i32) {
3667	Src = Src.getOperand(i: `0`);
3668	}
3669	return true;
3670	}
3671
3672	return false;
3673	}
3674
3675	bool AMDGPUDAGToDAGISel::SelectVOP3PMadMixModsExt(SDValue In, SDValue &Src,
3676	SDValue &SrcMods) const {
3677	unsigned Mods = `0`;
3678	if (!SelectVOP3PMadMixModsImpl(In, Src, Mods))
3679	return false;
3680	SrcMods = CurDAG->getTargetConstant(Val: Mods, DL: SDLoc (In), VT: MVT::i32);
3681	return true;
3682	}
3683
3684	bool AMDGPUDAGToDAGISel::SelectVOP3PMadMixMods(SDValue In, SDValue &Src,
3685	SDValue &SrcMods) const {
3686	unsigned Mods = `0`;
3687	SelectVOP3PMadMixModsImpl(In, Src, Mods);
3688	SrcMods = CurDAG->getTargetConstant(Val: Mods, DL: SDLoc (In), VT: MVT::i32);
3689	return true;
3690	}
3691
3692	// Match BITOP3 operation and return a number of matched instructions plus
3693	// truth table.
3694	static std::pair<unsigned, uint8_t> BitOp3_Op(SDValue In,
3695	SmallVectorImpl<SDValue> &Src) {
3696	unsigned NumOpcodes = `0`;
3697	uint8_t LHSBits, RHSBits;
3698
3699	auto getOperandBits = [&Src, In](SDValue Op, uint8_t &Bits) -> bool {
3700	// Define truth table given Src0, Src1, Src2 bits permutations:
3701	// 0 0 0
3702	// 0 0 1
3703	// 0 1 0
3704	// 0 1 1
3705	// 1 0 0
3706	// 1 0 1
3707	// 1 1 0
3708	// 1 1 1
3709	const uint8_t SrcBits[`3`] = { `0xf0`, `0xcc`, `0xaa` };
3710
3711	if (auto *C = dyn_cast<ConstantSDNode>(Val&: Op)) {
3712	if (C->isAllOnes()) {
3713	Bits = `0xff`;
3714	return true;
3715	}
3716	if (C->isZero()) {
3717	Bits = `0`;
3718	return true;
3719	}
3720	}
3721
3722	for (unsigned I = `0`; I < Src.size(); ++I) {
3723	// Try to find existing reused operand
3724	if (Src [I] == Op) {
3725	Bits = SrcBits[I];
3726	return true;
3727	}
3728	// Try to replace parent operator
3729	if (Src [I] == In) {
3730	Bits = SrcBits[I];
3731	Src [I] = Op;
3732	return true;
3733	}
3734	}
3735
3736	if (Src.size() == `3`) {
3737	// No room left for operands. Try one last time, there can be a 'not' of
3738	// one of our source operands. In this case we can compute the bits
3739	// without growing Src vector.
3740	if (Op.getOpcode() == ISD::XOR) {
3741	if (auto *C = dyn_cast<ConstantSDNode>(Val: Op.getOperand(i: `1`))) {
3742	if (C->isAllOnes()) {
3743	SDValue LHS = Op.getOperand(i: `0`);
3744	for (unsigned I = `0`; I < Src.size(); ++I) {
3745	if (Src [I] == LHS) {
3746	Bits = ~SrcBits[I];
3747	return true;
3748	}
3749	}
3750	}
3751	}
3752	}
3753
3754	return false;
3755	}
3756
3757	Bits = SrcBits[Src.size()];
3758	Src.push_back(Elt: Op);
3759	return true;
3760	};
3761
3762	switch (In.getOpcode()) {
3763	case ISD::AND:
3764	case ISD::OR:
3765	case ISD::XOR: {
3766	SDValue LHS = In.getOperand(i: `0`);
3767	SDValue RHS = In.getOperand(i: `1`);
3768
3769	SmallVector<SDValue, `3`> Backup(Src.begin(), Src.end());
3770	if (!getOperandBits (LHS, LHSBits) \|\|
3771	!getOperandBits (RHS, RHSBits)) {
3772	Src = Backup;
3773	return std::make_pair(x: `0`, y: `0`);
3774	}
3775
3776	// Recursion is naturally limited by the size of the operand vector.
3777	auto Op = BitOp3_Op(In: LHS, Src);
3778	if (Op.first) {
3779	NumOpcodes += Op.first;
3780	LHSBits = Op.second;
3781	}
3782
3783	Op = BitOp3_Op(In: RHS, Src);
3784	if (Op.first) {
3785	NumOpcodes += Op.first;
3786	RHSBits = Op.second;
3787	}
3788	break;
3789	}
3790	default:
3791	return std::make_pair(x: `0`, y: `0`);
3792	}
3793
3794	uint8_t TTbl;
3795	switch (In.getOpcode()) {
3796	case ISD::AND:
3797	TTbl = LHSBits & RHSBits;
3798	break;
3799	case ISD::OR:
3800	TTbl = LHSBits \| RHSBits;
3801	break;
3802	case ISD::XOR:
3803	TTbl = LHSBits ^ RHSBits;
3804	break;
3805	default:
3806	break;
3807	}
3808
3809	return std::make_pair(x: NumOpcodes + `1`, y&: TTbl);
3810	}
3811
3812	bool AMDGPUDAGToDAGISel::SelectBITOP3(SDValue In, SDValue &Src0, SDValue &Src1,
3813	SDValue &Src2, SDValue &Tbl) const {
3814	SmallVector<SDValue, `3`> Src;
3815	uint8_t TTbl;
3816	unsigned NumOpcodes;
3817
3818	std::tie(args&: NumOpcodes, args&: TTbl) = BitOp3_Op(In, Src);
3819
3820	// Src.empty() case can happen if all operands are all zero or all ones.
3821	// Normally it shall be optimized out before reaching this.
3822	if (NumOpcodes < `2` \|\| Src.empty())
3823	return false;
3824
3825	// For a uniform case threshold should be higher to account for moves between
3826	// VGPRs and SGPRs. It needs one operand in a VGPR, rest two can be in SGPRs
3827	// and a readtfirstlane after.
3828	if (NumOpcodes < `4` && !In ->isDivergent())
3829	return false;
3830
3831	if (NumOpcodes == `2` && In.getValueType() == MVT::i32) {
3832	// Avoid using BITOP3 for OR3, XOR3, AND_OR. This is not faster but makes
3833	// asm more readable. This cannot be modeled with AddedComplexity because
3834	// selector does not know how many operations did we match.
3835	if ((In.getOpcode() == ISD::XOR \|\| In.getOpcode() == ISD::OR) &&
3836	(In.getOperand(i: `0`).getOpcode() == In.getOpcode() \|\|
3837	In.getOperand(i: `1`).getOpcode() == In.getOpcode()))
3838	return false;
3839
3840	if (In.getOpcode() == ISD::OR &&
3841	(In.getOperand(i: `0`).getOpcode() == ISD::AND \|\|
3842	In.getOperand(i: `1`).getOpcode() == ISD::AND))
3843	return false;
3844	}
3845
3846	// Last operand can be ignored, turning a ternary operation into a binary.
3847	// For example: (~a & b & c) \| (~a & b & ~c) -> (~a & b). We can replace
3848	// 'c' with 'a' here without changing the answer. In some pathological
3849	// cases it should be possible to get an operation with a single operand
3850	// too if optimizer would not catch it.
3851	while (Src.size() < `3`)
3852	Src.push_back(Elt: Src [`0`]);
3853
3854	Src0 = Src [`0`];
3855	Src1 = Src [`1`];
3856	Src2 = Src [`2`];
3857
3858	Tbl = CurDAG->getTargetConstant(Val: TTbl, DL: SDLoc (In), VT: MVT::i32);
3859	return true;
3860	}
3861
3862	SDValue AMDGPUDAGToDAGISel::getHi16Elt(SDValue In) const {
3863	if (In.isUndef())
3864	return CurDAG->getUNDEF(VT: MVT::i32);
3865
3866	if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Val&: In)) {
3867	SDLoc SL(In);
3868	return CurDAG->getConstant(Val: C->getZExtValue() << `16`, DL: SL, VT: MVT::i32);
3869	}
3870
3871	if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(Val&: In)) {
3872	SDLoc SL(In);
3873	return CurDAG->getConstant(
3874	Val: C->getValueAPF().bitcastToAPInt().getZExtValue() << `16`, DL: SL, VT: MVT::i32);
3875	}
3876
3877	SDValue Src;
3878	if (isExtractHiElt(In, Out&: Src))
3879	return Src;
3880
3881	return SDValue ();
3882	}
3883
3884	bool AMDGPUDAGToDAGISel::isVGPRImm(const SDNode * N) const {
3885	assert(CurDAG->getTarget().getTargetTriple().isAMDGCN());
3886
3887	const SIRegisterInfo *SIRI =
3888	static_cast<const SIRegisterInfo *>(Subtarget->getRegisterInfo());
3889	const SIInstrInfo * SII =
3890	static_cast<const SIInstrInfo *>(Subtarget->getInstrInfo());
3891
3892	unsigned Limit = `0`;
3893	bool AllUsesAcceptSReg = true;
3894	for (SDNode::use_iterator U = N->use_begin(), E = SDNode::use_end();
3895	Limit < `10` && U != E; ++U, ++Limit) {
3896	const TargetRegisterClass *RC =
3897	getOperandRegClass(N: U ->getUser(), OpNo: U ->getOperandNo());
3898
3899	// If the register class is unknown, it could be an unknown
3900	// register class that needs to be an SGPR, e.g. an inline asm
3901	// constraint
3902	if (!RC \|\| SIRI->isSGPRClass(RC))
3903	return false;
3904
3905	if (RC != &AMDGPU::VS_32RegClass && RC != &AMDGPU::VS_64RegClass) {
3906	AllUsesAcceptSReg = false;
3907	SDNode *User = U ->getUser();
3908	if (User->isMachineOpcode()) {
3909	unsigned Opc = User->getMachineOpcode();
3910	const MCInstrDesc &Desc = SII->get(Opcode: Opc);
3911	if (Desc.isCommutable()) {
3912	unsigned OpIdx = Desc.getNumDefs() + U ->getOperandNo();
3913	unsigned CommuteIdx1 = TargetInstrInfo::CommuteAnyOperandIndex;
3914	if (SII->findCommutedOpIndices(Desc, SrcOpIdx0&: OpIdx, SrcOpIdx1&: CommuteIdx1)) {
3915	unsigned CommutedOpNo = CommuteIdx1 - Desc.getNumDefs();
3916	const TargetRegisterClass *CommutedRC =
3917	getOperandRegClass(N: U ->getUser(), OpNo: CommutedOpNo);
3918	if (CommutedRC == &AMDGPU::VS_32RegClass \|\|
3919	CommutedRC == &AMDGPU::VS_64RegClass)
3920	AllUsesAcceptSReg = true;
3921	}
3922	}
3923	}
3924	// If "AllUsesAcceptSReg == false" so far we haven't succeeded
3925	// commuting current user. This means have at least one use
3926	// that strictly require VGPR. Thus, we will not attempt to commute
3927	// other user instructions.
3928	if (!AllUsesAcceptSReg)
3929	break;
3930	}
3931	}
3932	return !AllUsesAcceptSReg && (Limit < `10`);
3933	}
3934
3935	bool AMDGPUDAGToDAGISel::isUniformLoad(const SDNode N) const* {
3936	const auto *Ld = cast<LoadSDNode>(Val: N);
3937
3938	const MachineMemOperand *MMO = Ld->getMemOperand();
3939	if (N->isDivergent() && !AMDGPU::isUniformMMO(MMO))
3940	return false;
3941
3942	return MMO->getSize().hasValue() &&
3943	Ld->getAlign() >=
3944	Align (std::min(a: MMO->getSize().getValue().getKnownMinValue(),
3945	b: uint64_t(`4`))) &&
3946	((Ld->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS \|\|
3947	Ld->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS_32BIT) \|\|
3948	(Subtarget->getScalarizeGlobalBehavior() &&
3949	Ld->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS &&
3950	Ld->isSimple() &&
3951	static_cast<const SITargetLowering *>(getTargetLowering())
3952	->isMemOpHasNoClobberedMemOperand(N)));
3953	}
3954
3955	void AMDGPUDAGToDAGISel::PostprocessISelDAG() {
3956	const AMDGPUTargetLowering& Lowering =
3957	*static_cast<const AMDGPUTargetLowering*>(getTargetLowering());
3958	bool IsModified = false;
3959	do {
3960	IsModified = false;
3961
3962	// Go over all selected nodes and try to fold them a bit more
3963	SelectionDAG::allnodes_iterator Position = CurDAG->allnodes_begin();
3964	while (Position != CurDAG->allnodes_end()) {
3965	SDNode Node = &Position ++;
3966	MachineSDNode *MachineNode = dyn_cast<MachineSDNode>(Val: Node);
3967	if (!MachineNode)
3968	continue;
3969
3970	SDNode ResNode = Lowering.PostISelFolding(N: MachineNode, DAG&: CurDAG);
3971	if (ResNode != Node) {
3972	if (ResNode)
3973	ReplaceUses(F: Node, T: ResNode);
3974	IsModified = true;
3975	}
3976	}
3977	CurDAG->RemoveDeadNodes();
3978	} while (IsModified);
3979	}
3980
3981	AMDGPUDAGToDAGISelLegacy::AMDGPUDAGToDAGISelLegacy(TargetMachine &TM,
3982	CodeGenOptLevel OptLevel)
3983	: SelectionDAGISelLegacy (
3984	ID, std::make_unique<AMDGPUDAGToDAGISel>(args&: TM, args&: OptLevel)) {}
3985
3986	char AMDGPUDAGToDAGISelLegacy::ID = `0`;
3987

Browse the source code of llvm_projects/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp